src/gromacs/mdlib/clincs.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /* This file is completely threadsafe - keep it that way! */
  38 #include "gmxpre.h"
  39
  40 #include "config.h"
  41
  42 #include <assert.h>
  43 #include <stdlib.h>
  44
  45 #include <cmath>
  46
  47 #include <algorithm>
  48
  49 #include "gromacs/domdec/domdec.h"
  50 #include "gromacs/fileio/copyrite.h"
  51 #include "gromacs/gmxlib/gmx_omp_nthreads.h"
  52 #include "gromacs/legacyheaders/nrnb.h"
  53 #include "gromacs/legacyheaders/types/commrec.h"
  54 #include "gromacs/math/units.h"
  55 #include "gromacs/math/vec.h"
  56 #include "gromacs/mdlib/constr.h"
  57 #include "gromacs/mdlib/mdrun.h"
  58 #include "gromacs/pbcutil/pbc.h"
  59 #include "gromacs/pbcutil/pbc-simd.h"
  60 #include "gromacs/simd/simd.h"
  61 #include "gromacs/simd/simd_math.h"
  62 #include "gromacs/simd/vector_operations.h"
  63 #include "gromacs/topology/block.h"
  64 #include "gromacs/topology/mtop_util.h"
  65 #include "gromacs/utility/bitmask.h"
  66 #include "gromacs/utility/cstringutil.h"
  67 #include "gromacs/utility/exceptions.h"
  68 #include "gromacs/utility/fatalerror.h"
  69 #include "gromacs/utility/gmxomp.h"
  70 #include "gromacs/utility/smalloc.h"
  71
  72 /* MSVC 2010 produces buggy SIMD PBC code, disable SIMD for MSVC <= 2010 */
  73 #if GMX_SIMD_HAVE_REAL && !(defined _MSC_VER && _MSC_VER < 1700) && !defined(__ICL)
  74 #    define LINCS_SIMD
  75 #endif
  76
  77
  78 #if GMX_SIMD_X86_AVX_256 || GMX_SIMD_X86_AVX2_256
  79
  80 // This was originally work-in-progress for augmenting the SIMD module with
  81 // masked load/store operations. Instead, that turned into and extended SIMD
  82 // interface that supports gather/scatter in all platforms, which will be
  83 // part of a future Gromacs version. However, since the code for bonded
  84 // interactions and LINCS was already written it would be a pity not to get
  85 // the performance gains in Gromacs-5.1. For this reason we have added it as
  86 // a bit of a hack in the two files that use it. It will be replaced with the
  87 // new generic functionality after version 5.1
  88
  89 #    ifdef GMX_DOUBLE
  90 static gmx_inline void gmx_simdcall
  91 gmx_hack_simd_transpose4_r(gmx_simd_double_t *row0,
  92                            gmx_simd_double_t *row1,
  93                            gmx_simd_double_t *row2,
  94                            gmx_simd_double_t *row3)
  95 {
  96     __m256d tmp0, tmp1, tmp2, tmp3;
  97
  98     tmp0  = _mm256_unpacklo_pd(*row0, *row1);
  99     tmp2  = _mm256_unpacklo_pd(*row2, *row3);
 100     tmp1  = _mm256_unpackhi_pd(*row0, *row1);
 101     tmp3  = _mm256_unpackhi_pd(*row2, *row3);
 102     *row0 = _mm256_permute2f128_pd(tmp0, tmp2, 0x20);
 103     *row1 = _mm256_permute2f128_pd(tmp1, tmp3, 0x20);
 104     *row2 = _mm256_permute2f128_pd(tmp0, tmp2, 0x31);
 105     *row3 = _mm256_permute2f128_pd(tmp1, tmp3, 0x31);
 106 }
 107
 108 static gmx_inline void gmx_simdcall
 109 gmx_hack_simd4_transpose_to_simd_r(const gmx_simd4_double_t *a,
 110                                    gmx_simd_double_t        *row0,
 111                                    gmx_simd_double_t        *row1,
 112                                    gmx_simd_double_t        *row2,
 113                                    gmx_simd_double_t        *row3)
 114 {
 115     *row0 = a[0];
 116     *row1 = a[1];
 117     *row2 = a[2];
 118     *row3 = a[3];
 119
 120     gmx_hack_simd_transpose4_r(row0, row1, row2, row3);
 121 }
 122
 123 static gmx_inline void gmx_simdcall
 124 gmx_hack_simd_transpose_to_simd4_r(gmx_simd_double_t   row0,
 125                                    gmx_simd_double_t   row1,
 126                                    gmx_simd_double_t   row2,
 127                                    gmx_simd_double_t   row3,
 128                                    gmx_simd4_double_t *a)
 129 {
 130     a[0] = row0;
 131     a[1] = row1;
 132     a[2] = row2;
 133     a[3] = row3;
 134
 135     gmx_hack_simd_transpose4_r(&a[0], &a[1], &a[2], &a[3]);
 136 }
 137
 138
 139 #    if GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
 140 #        define gmx_hack_simd4_load3_r(mem)      _mm256_maskload_pd((mem), _mm_castsi128_ps(_mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1)))
 141 #        define gmx_hack_simd4_store3_r(mem, x)   _mm256_maskstore_pd((mem), _mm_castsi128_ps(_mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1)), (x))
 142 #    else
 143 #        define gmx_hack_simd4_load3_r(mem)      _mm256_maskload_pd((mem), _mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1))
 144 #        define gmx_hack_simd4_store3_r(mem, x)   _mm256_maskstore_pd((mem), _mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1), (x))
 145 #    endif
 146
 147 #    else /* single instead of double */
 148 static gmx_inline void gmx_simdcall
 149 gmx_hack_simd_transpose4_r(gmx_simd_float_t *row0,
 150                            gmx_simd_float_t *row1,
 151                            gmx_simd_float_t *row2,
 152                            gmx_simd_float_t *row3)
 153 {
 154     __m256 tmp0, tmp1, tmp2, tmp3;
 155
 156     tmp0  = _mm256_unpacklo_ps(*row0, *row1);
 157     tmp2  = _mm256_unpacklo_ps(*row2, *row3);
 158     tmp1  = _mm256_unpackhi_ps(*row0, *row1);
 159     tmp3  = _mm256_unpackhi_ps(*row2, *row3);
 160     *row0 = _mm256_shuffle_ps(tmp0, tmp2, 0x44);
 161     *row1 = _mm256_shuffle_ps(tmp0, tmp2, 0xEE);
 162     *row2 = _mm256_shuffle_ps(tmp1, tmp3, 0x44);
 163     *row3 = _mm256_shuffle_ps(tmp1, tmp3, 0xEE);
 164 }
 165
 166 static gmx_inline void gmx_simdcall
 167 gmx_hack_simd4_transpose_to_simd_r(const gmx_simd4_float_t *a,
 168                                    gmx_simd_float_t        *row0,
 169                                    gmx_simd_float_t        *row1,
 170                                    gmx_simd_float_t        *row2,
 171                                    gmx_simd_float_t        *row3)
 172 {
 173     *row0 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[0]), a[4], 1);
 174     *row1 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[1]), a[5], 1);
 175     *row2 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[2]), a[6], 1);
 176     *row3 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[3]), a[7], 1);
 177
 178     gmx_hack_simd_transpose4_r(row0, row1, row2, row3);
 179 }
 180
 181 static gmx_inline void gmx_simdcall
 182 gmx_hack_simd_transpose_to_simd4_r(gmx_simd_float_t   row0,
 183                                    gmx_simd_float_t   row1,
 184                                    gmx_simd_float_t   row2,
 185                                    gmx_simd_float_t   row3,
 186                                    gmx_simd4_float_t *a)
 187 {
 188     gmx_hack_simd_transpose4_r(&row0, &row1, &row2, &row3);
 189
 190     a[0] = _mm256_extractf128_ps(row0, 0);
 191     a[1] = _mm256_extractf128_ps(row1, 0);
 192     a[2] = _mm256_extractf128_ps(row2, 0);
 193     a[3] = _mm256_extractf128_ps(row3, 0);
 194     a[4] = _mm256_extractf128_ps(row0, 1);
 195     a[5] = _mm256_extractf128_ps(row1, 1);
 196     a[6] = _mm256_extractf128_ps(row2, 1);
 197     a[7] = _mm256_extractf128_ps(row3, 1);
 198 }
 199 #if GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
 200 #        define gmx_hack_simd4_load3_r(mem)      _mm_maskload_ps((mem), _mm_castsi256_pd(_mm_set_epi32(0, -1, -1, -1)))
 201 #        define gmx_hack_simd4_store3_r(mem, x)   _mm_maskstore_ps((mem), _mm_castsi256_pd(_mm_set_epi32(0, -1, -1, -1)), (x))
 202 #else
 203 #        define gmx_hack_simd4_load3_r(mem)      _mm_maskload_ps((mem), _mm_set_epi32(0, -1, -1, -1))
 204 #        define gmx_hack_simd4_store3_r(mem, x)   _mm_maskstore_ps((mem), _mm_set_epi32(0, -1, -1, -1), (x))
 205 #endif
 206
 207 #endif /* double */
 208
 209 #endif /* AVX */
 210
 211 #if GMX_SIMD_HAVE_REAL
 212 /*! \brief Store differences between indexed rvecs in SIMD registers.
 213  *
 214  * Returns SIMD register with the difference vectors:
 215  *     v[pair_index[i*2]] - v[pair_index[i*2 + 1]]
 216  *
 217  * \param[in]     v           Array of rvecs
 218  * \param[in]     pair_index  Index pairs for GMX_SIMD_REAL_WIDTH vector pairs
 219  * \param[in,out] buf         Aligned tmp buffer of size 3*GMX_SIMD_REAL_WIDTH
 220  * \param[out]    dx          SIMD register with x difference
 221  * \param[out]    dy          SIMD register with y difference
 222  * \param[out]    dz          SIMD register with z difference
 223  */
 224 static gmx_inline void gmx_simdcall
 225 gmx_hack_simd_gather_rvec_dist_pair_index(const rvec      *v,
 226                                           const int       *pair_index,
 227                                           real gmx_unused *buf,
 228                                           gmx_simd_real_t *dx,
 229                                           gmx_simd_real_t *dy,
 230                                           gmx_simd_real_t *dz)
 231 {
 232 #if GMX_SIMD_X86_AVX_256 || GMX_SIMD_X86_AVX2_256
 233     int              i;
 234     gmx_simd4_real_t d[GMX_SIMD_REAL_WIDTH];
 235     gmx_simd_real_t  tmp;
 236
 237     for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 238     {
 239         d[i] = gmx_simd4_sub_r(gmx_hack_simd4_load3_r(&(v[pair_index[i*2 + 0]][0])),
 240                                gmx_hack_simd4_load3_r(&(v[pair_index[i*2 + 1]][0])));
 241     }
 242
 243     gmx_hack_simd4_transpose_to_simd_r(d, dx, dy, dz, &tmp);
 244 #else
 245 #if GMX_ALIGNMENT
 246     GMX_ALIGNED(real, GMX_SIMD_REAL_WIDTH) buf_aligned[3*GMX_SIMD_REAL_WIDTH];
 247 #else
 248     real* buf_aligned = buf;
 249 #endif
 250
 251     int i, m;
 252
 253     for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 254     {
 255         /* Store the distances packed and aligned */
 256         for (m = 0; m < DIM; m++)
 257         {
 258             buf_aligned[m*GMX_SIMD_REAL_WIDTH + i] =
 259                 v[pair_index[i*2]][m] - v[pair_index[i*2 + 1]][m];
 260         }
 261     }
 262     *dx = gmx_simd_load_r(buf_aligned + 0*GMX_SIMD_REAL_WIDTH);
 263     *dy = gmx_simd_load_r(buf_aligned + 1*GMX_SIMD_REAL_WIDTH);
 264     *dz = gmx_simd_load_r(buf_aligned + 2*GMX_SIMD_REAL_WIDTH);
 265 #endif
 266 }
 267
 268
 269 /*! \brief Stores SIMD vector into multiple rvecs.
 270  *
 271  * \param[in]     x           SIMD register with x-components of the vectors
 272  * \param[in]     y           SIMD register with y-components of the vectors
 273  * \param[in]     z           SIMD register with z-components of the vectors
 274  * \param[in,out] buf         Aligned tmp buffer of size 3*GMX_SIMD_REAL_WIDTH
 275  * \param[out]    v           Array of GMX_SIMD_REAL_WIDTH rvecs
 276  */
 277 static gmx_inline void gmx_simdcall
 278 gmx_simd_store_vec_to_rvec(gmx_simd_real_t  x,
 279                            gmx_simd_real_t  y,
 280                            gmx_simd_real_t  z,
 281                            real gmx_unused *buf,
 282                            rvec            *v)
 283 {
 284 #if GMX_SIMD_X86_AVX_256 || GMX_SIMD_X86_AVX2_256
 285     int              i;
 286     gmx_simd4_real_t s4[GMX_SIMD_REAL_WIDTH];
 287     gmx_simd_real_t  zero = gmx_simd_setzero_r();
 288
 289     gmx_hack_simd_transpose_to_simd4_r(x, y, z, zero, s4);
 290
 291     for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 292     {
 293         gmx_hack_simd4_store3_r(v[i], s4[i]);
 294     }
 295 #else
 296 #if GMX_ALIGNMENT
 297     GMX_ALIGNED(real, GMX_SIMD_REAL_WIDTH) buf_aligned[3*GMX_SIMD_REAL_WIDTH];
 298 #else
 299     real* buf_aligned = buf;
 300 #endif
 301
 302     int i, m;
 303
 304     gmx_simd_store_r(buf_aligned + 0*GMX_SIMD_REAL_WIDTH, x);
 305     gmx_simd_store_r(buf_aligned + 1*GMX_SIMD_REAL_WIDTH, y);
 306     gmx_simd_store_r(buf_aligned + 2*GMX_SIMD_REAL_WIDTH, z);
 307
 308     for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 309     {
 310         for (m = 0; m < DIM; m++)
 311         {
 312             v[i][m] = buf_aligned[m*GMX_SIMD_REAL_WIDTH + i];
 313         }
 314     }
 315 #endif
 316 }
 317 #endif /* GMX_SIMD_HAVE_REAL */
 318
 319
 320 typedef struct {
 321     int    b0;         /* first constraint for this task */
 322     int    b1;         /* b1-1 is the last constraint for this task */
 323     int    ntriangle;  /* the number of constraints in triangles */
 324     int   *triangle;   /* the list of triangle constraints */
 325     int   *tri_bits;   /* the bits tell if the matrix element should be used */
 326     int    tri_alloc;  /* allocation size of triangle and tri_bits */
 327     int    nind;       /* number of indices */
 328     int   *ind;        /* constraint index for updating atom data */
 329     int    nind_r;     /* number of indices */
 330     int   *ind_r;      /* constraint index for updating atom data */
 331     int    ind_nalloc; /* allocation size of ind and ind_r */
 332     tensor vir_r_m_dr; /* temporary variable for virial calculation */
 333     real   dhdlambda;  /* temporary variable for lambda derivative */
 334     real  *simd_buf;   /* Aligned work array for SIMD */
 335 } lincs_task_t;
 336
 337 typedef struct gmx_lincsdata {
 338     int             ncg;          /* the global number of constraints */
 339     int             ncg_flex;     /* the global number of flexible constraints */
 340     int             ncg_triangle; /* the global number of constraints in triangles */
 341     int             nIter;        /* the number of iterations */
 342     int             nOrder;       /* the order of the matrix expansion */
 343     int             max_connect;  /* the maximum number of constrains connected to a single atom */
 344
 345     int             nc_real;      /* the number of real constraints */
 346     int             nc;           /* the number of constraints including padding for SIMD */
 347     int             nc_alloc;     /* the number we allocated memory for */
 348     int             ncc;          /* the number of constraint connections */
 349     int             ncc_alloc;    /* the number we allocated memory for */
 350     real            matlam;       /* the FE lambda value used for filling blc and blmf */
 351     int            *con_index;    /* mapping from topology to LINCS constraints */
 352     real           *bllen0;       /* the reference distance in topology A */
 353     real           *ddist;        /* the reference distance in top B - the r.d. in top A */
 354     int            *bla;          /* the atom pairs involved in the constraints */
 355     real           *blc;          /* 1/sqrt(invmass1 + invmass2) */
 356     real           *blc1;         /* as blc, but with all masses 1 */
 357     int            *blnr;         /* index into blbnb and blmf */
 358     int            *blbnb;        /* list of constraint connections */
 359     int             ntriangle;    /* the local number of constraints in triangles */
 360     int             ncc_triangle; /* the number of constraint connections in triangles */
 361     gmx_bool        bCommIter;    /* communicate before each LINCS interation */
 362     real           *blmf;         /* matrix of mass factors for constraint connections */
 363     real           *blmf1;        /* as blmf, but with all masses 1 */
 364     real           *bllen;        /* the reference bond length */
 365     int            *nlocat;       /* the local atom count per constraint, can be NULL */
 366
 367     int             ntask;        /* The number of tasks = #threads for LINCS */
 368     lincs_task_t   *task;         /* LINCS thread division */
 369     gmx_bitmask_t  *atf;          /* atom flags for thread parallelization */
 370     int             atf_nalloc;   /* allocation size of atf */
 371     gmx_bool        bTaskDep;     /* are the LINCS tasks interdependent? */
 372     /* arrays for temporary storage in the LINCS algorithm */
 373     rvec           *tmpv;
 374     real           *tmpncc;
 375     real           *tmp1;
 376     real           *tmp2;
 377     real           *tmp3;
 378     real           *tmp4;
 379     real           *mlambda; /* the Lagrange multipliers * -1 */
 380     /* storage for the constraint RMS relative deviation output */
 381     real            rmsd_data[3];
 382 } t_gmx_lincsdata;
 383
 384 /* Define simd_width for memory allocation used for SIMD code */
 385 #ifdef LINCS_SIMD
 386 static const int simd_width = GMX_SIMD_REAL_WIDTH;
 387 #else
 388 static const int simd_width = 1;
 389 #endif
 390 /* Align to 128 bytes, consistent with the current implementation of
 391    AlignedAllocator, which currently forces 128 byte alignment. */
 392 static const int align_bytes = 128;
 393
 394 real *lincs_rmsd_data(struct gmx_lincsdata *lincsd)
 395 {
 396     return lincsd->rmsd_data;
 397 }
 398
 399 real lincs_rmsd(struct gmx_lincsdata *lincsd, gmx_bool bSD2)
 400 {
 401     if (lincsd->rmsd_data[0] > 0)
 402     {
 403         return sqrt(lincsd->rmsd_data[bSD2 ? 2 : 1]/lincsd->rmsd_data[0]);
 404     }
 405     else
 406     {
 407         return 0;
 408     }
 409 }
 410
 411 /* Do a set of nrec LINCS matrix multiplications.
 412  * This function will return with up to date thread-local
 413  * constraint data, without an OpenMP barrier.
 414  */
 415 static void lincs_matrix_expand(const struct gmx_lincsdata *lincsd,
 416                                 const lincs_task_t *li_task,
 417                                 const real *blcc,
 418                                 real *rhs1, real *rhs2, real *sol)
 419 {
 420     int        b0, b1, nrec, rec;
 421     const int *blnr  = lincsd->blnr;
 422     const int *blbnb = lincsd->blbnb;
 423
 424     b0   = li_task->b0;
 425     b1   = li_task->b1;
 426     nrec = lincsd->nOrder;
 427
 428     for (rec = 0; rec < nrec; rec++)
 429     {
 430         int b;
 431
 432         if (lincsd->bTaskDep)
 433         {
 434 #pragma omp barrier
 435         }
 436         for (b = b0; b < b1; b++)
 437         {
 438             real mvb;
 439             int  n;
 440
 441             mvb = 0;
 442             for (n = blnr[b]; n < blnr[b+1]; n++)
 443             {
 444                 mvb = mvb + blcc[n]*rhs1[blbnb[n]];
 445             }
 446             rhs2[b] = mvb;
 447             sol[b]  = sol[b] + mvb;
 448         }
 449
 450         real *swap;
 451
 452         swap = rhs1;
 453         rhs1 = rhs2;
 454         rhs2 = swap;
 455     } /* nrec*(ncons+2*nrtot) flops */
 456
 457     if (lincsd->ntriangle > 0)
 458     {
 459         /* Perform an extra nrec recursions for only the constraints
 460          * involved in rigid triangles.
 461          * In this way their accuracy should come close to those of the other
 462          * constraints, since traingles of constraints can produce eigenvalues
 463          * around 0.7, while the effective eigenvalue for bond constraints
 464          * is around 0.4 (and 0.7*0.7=0.5).
 465          */
 466
 467         if (lincsd->bTaskDep)
 468         {
 469             /* We need a barrier here, since other threads might still be
 470              * reading the contents of rhs1 and/o rhs2.
 471              * We could avoid this barrier by introducing two extra rhs
 472              * arrays for the triangle constraints only.
 473              */
 474 #pragma omp barrier
 475         }
 476
 477         /* Constraints involved in a triangle are ensured to be in the same
 478          * LINCS task. This means no barriers are required during the extra
 479          * iterations for the triangle constraints.
 480          */
 481         const int *triangle = li_task->triangle;
 482         const int *tri_bits = li_task->tri_bits;
 483
 484         for (rec = 0; rec < nrec; rec++)
 485         {
 486             int tb;
 487
 488             for (tb = 0; tb < li_task->ntriangle; tb++)
 489             {
 490                 int  b, bits, nr0, nr1, n;
 491                 real mvb;
 492
 493                 b    = triangle[tb];
 494                 bits = tri_bits[tb];
 495                 mvb  = 0;
 496                 nr0  = blnr[b];
 497                 nr1  = blnr[b+1];
 498                 for (n = nr0; n < nr1; n++)
 499                 {
 500                     if (bits & (1 << (n - nr0)))
 501                     {
 502                         mvb = mvb + blcc[n]*rhs1[blbnb[n]];
 503                     }
 504                 }
 505                 rhs2[b] = mvb;
 506                 sol[b]  = sol[b] + mvb;
 507             }
 508
 509             real *swap;
 510
 511             swap = rhs1;
 512             rhs1 = rhs2;
 513             rhs2 = swap;
 514         } /* nrec*(ntriangle + ncc_triangle*2) flops */
 515     }
 516 }
 517
 518 static void lincs_update_atoms_noind(int ncons, const int *bla,
 519                                      real prefac,
 520                                      const real *fac, rvec *r,
 521                                      const real *invmass,
 522                                      rvec *x)
 523 {
 524     int  b, i, j;
 525     real mvb, im1, im2, tmp0, tmp1, tmp2;
 526
 527     if (invmass != NULL)
 528     {
 529         for (b = 0; b < ncons; b++)
 530         {
 531             i        = bla[2*b];
 532             j        = bla[2*b+1];
 533             mvb      = prefac*fac[b];
 534             im1      = invmass[i];
 535             im2      = invmass[j];
 536             tmp0     = r[b][0]*mvb;
 537             tmp1     = r[b][1]*mvb;
 538             tmp2     = r[b][2]*mvb;
 539             x[i][0] -= tmp0*im1;
 540             x[i][1] -= tmp1*im1;
 541             x[i][2] -= tmp2*im1;
 542             x[j][0] += tmp0*im2;
 543             x[j][1] += tmp1*im2;
 544             x[j][2] += tmp2*im2;
 545         } /* 16 ncons flops */
 546     }
 547     else
 548     {
 549         for (b = 0; b < ncons; b++)
 550         {
 551             i        = bla[2*b];
 552             j        = bla[2*b+1];
 553             mvb      = prefac*fac[b];
 554             tmp0     = r[b][0]*mvb;
 555             tmp1     = r[b][1]*mvb;
 556             tmp2     = r[b][2]*mvb;
 557             x[i][0] -= tmp0;
 558             x[i][1] -= tmp1;
 559             x[i][2] -= tmp2;
 560             x[j][0] += tmp0;
 561             x[j][1] += tmp1;
 562             x[j][2] += tmp2;
 563         }
 564     }
 565 }
 566
 567 static void lincs_update_atoms_ind(int ncons, const int *ind, const int *bla,
 568                                    real prefac,
 569                                    const real *fac, rvec *r,
 570                                    const real *invmass,
 571                                    rvec *x)
 572 {
 573     int  bi, b, i, j;
 574     real mvb, im1, im2, tmp0, tmp1, tmp2;
 575
 576     if (invmass != NULL)
 577     {
 578         for (bi = 0; bi < ncons; bi++)
 579         {
 580             b        = ind[bi];
 581             i        = bla[2*b];
 582             j        = bla[2*b+1];
 583             mvb      = prefac*fac[b];
 584             im1      = invmass[i];
 585             im2      = invmass[j];
 586             tmp0     = r[b][0]*mvb;
 587             tmp1     = r[b][1]*mvb;
 588             tmp2     = r[b][2]*mvb;
 589             x[i][0] -= tmp0*im1;
 590             x[i][1] -= tmp1*im1;
 591             x[i][2] -= tmp2*im1;
 592             x[j][0] += tmp0*im2;
 593             x[j][1] += tmp1*im2;
 594             x[j][2] += tmp2*im2;
 595         } /* 16 ncons flops */
 596     }
 597     else
 598     {
 599         for (bi = 0; bi < ncons; bi++)
 600         {
 601             b        = ind[bi];
 602             i        = bla[2*b];
 603             j        = bla[2*b+1];
 604             mvb      = prefac*fac[b];
 605             tmp0     = r[b][0]*mvb;
 606             tmp1     = r[b][1]*mvb;
 607             tmp2     = r[b][2]*mvb;
 608             x[i][0] -= tmp0;
 609             x[i][1] -= tmp1;
 610             x[i][2] -= tmp2;
 611             x[j][0] += tmp0;
 612             x[j][1] += tmp1;
 613             x[j][2] += tmp2;
 614         } /* 16 ncons flops */
 615     }
 616 }
 617
 618 static void lincs_update_atoms(struct gmx_lincsdata *li, int th,
 619                                real prefac,
 620                                const real *fac, rvec *r,
 621                                const real *invmass,
 622                                rvec *x)
 623 {
 624     if (li->ntask == 1)
 625     {
 626         /* Single thread, we simply update for all constraints */
 627         lincs_update_atoms_noind(li->nc_real,
 628                                  li->bla, prefac, fac, r, invmass, x);
 629     }
 630     else
 631     {
 632         /* Update the atom vector components for our thread local
 633          * constraints that only access our local atom range.
 634          * This can be done without a barrier.
 635          */
 636         lincs_update_atoms_ind(li->task[th].nind, li->task[th].ind,
 637                                li->bla, prefac, fac, r, invmass, x);
 638
 639         if (li->task[li->ntask].nind > 0)
 640         {
 641             /* Update the constraints that operate on atoms
 642              * in multiple thread atom blocks on the master thread.
 643              */
 644 #pragma omp barrier
 645 #pragma omp master
 646             {
 647                 lincs_update_atoms_ind(li->task[li->ntask].nind,
 648                                        li->task[li->ntask].ind,
 649                                        li->bla, prefac, fac, r, invmass, x);
 650             }
 651         }
 652     }
 653 }
 654
 655 #ifdef LINCS_SIMD
 656 /* Calculate the constraint distance vectors r to project on from x.
 657  * Determine the right-hand side of the matrix equation using quantity f.
 658  * This function only differs from calc_dr_x_xp_simd below in that
 659  * no constraint length is subtracted and no PBC is used for f.
 660  */
 661 static void gmx_simdcall
 662 calc_dr_x_f_simd(int                       b0,
 663                  int                       b1,
 664                  const int *               bla,
 665                  const rvec * gmx_restrict x,
 666                  const rvec * gmx_restrict f,
 667                  const real * gmx_restrict blc,
 668                  const pbc_simd_t *        pbc_simd,
 669                  real * gmx_restrict       vbuf1,
 670                  real * gmx_restrict       vbuf2,
 671                  rvec * gmx_restrict       r,
 672                  real * gmx_restrict       rhs,
 673                  real * gmx_restrict       sol)
 674 {
 675     int bs;
 676
 677     assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
 678
 679     for (bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
 680     {
 681         gmx_simd_real_t rx_S, ry_S, rz_S, n2_S, il_S;
 682         gmx_simd_real_t fx_S, fy_S, fz_S, ip_S, rhs_S;
 683
 684         gmx_hack_simd_gather_rvec_dist_pair_index(x, bla + bs*2, vbuf1,
 685                                                   &rx_S, &ry_S, &rz_S);
 686
 687         pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
 688
 689         n2_S  = gmx_simd_norm2_r(rx_S, ry_S, rz_S);
 690         il_S  = gmx_simd_invsqrt_r(n2_S);
 691
 692         rx_S  = gmx_simd_mul_r(rx_S, il_S);
 693         ry_S  = gmx_simd_mul_r(ry_S, il_S);
 694         rz_S  = gmx_simd_mul_r(rz_S, il_S);
 695
 696         gmx_simd_store_vec_to_rvec(rx_S, ry_S, rz_S, vbuf1, r + bs);
 697
 698         gmx_hack_simd_gather_rvec_dist_pair_index(f, bla + bs*2, vbuf2,
 699                                                   &fx_S, &fy_S, &fz_S);
 700
 701         ip_S  = gmx_simd_iprod_r(rx_S, ry_S, rz_S,
 702                                  fx_S, fy_S, fz_S);
 703
 704         rhs_S = gmx_simd_mul_r(gmx_simd_load_r(blc + bs), ip_S);
 705
 706         gmx_simd_store_r(rhs + bs, rhs_S);
 707         gmx_simd_store_r(sol + bs, rhs_S);
 708     }
 709 }
 710 #endif /* LINCS_SIMD */
 711
 712 /* LINCS projection, works on derivatives of the coordinates */
 713 static void do_lincsp(rvec *x, rvec *f, rvec *fp, t_pbc *pbc,
 714                       struct gmx_lincsdata *lincsd, int th,
 715                       real *invmass,
 716                       int econq, gmx_bool bCalcDHDL,
 717                       gmx_bool bCalcVir, tensor rmdf)
 718 {
 719     int      b0, b1, b;
 720     int     *bla, *blnr, *blbnb;
 721     rvec    *r;
 722     real    *blc, *blmf, *blcc, *rhs1, *rhs2, *sol;
 723
 724     b0 = lincsd->task[th].b0;
 725     b1 = lincsd->task[th].b1;
 726
 727     bla    = lincsd->bla;
 728     r      = lincsd->tmpv;
 729     blnr   = lincsd->blnr;
 730     blbnb  = lincsd->blbnb;
 731     if (econq != econqForce)
 732     {
 733         /* Use mass-weighted parameters */
 734         blc  = lincsd->blc;
 735         blmf = lincsd->blmf;
 736     }
 737     else
 738     {
 739         /* Use non mass-weighted parameters */
 740         blc  = lincsd->blc1;
 741         blmf = lincsd->blmf1;
 742     }
 743     blcc   = lincsd->tmpncc;
 744     rhs1   = lincsd->tmp1;
 745     rhs2   = lincsd->tmp2;
 746     sol    = lincsd->tmp3;
 747
 748 #ifdef LINCS_SIMD
 749
 750     /* This SIMD code does the same as the plain-C code after the #else.
 751      * The only difference is that we always call pbc code, as with SIMD
 752      * the overhead of pbc computation (when not needed) is small.
 753      */
 754     pbc_simd_t pbc_simd;
 755
 756     /* Convert the pbc struct for SIMD */
 757     set_pbc_simd(pbc, &pbc_simd);
 758
 759     /* Compute normalized x i-j vectors, store in r.
 760      * Compute the inner product of r and xp i-j and store in rhs1.
 761      */
 762     calc_dr_x_f_simd(b0, b1, bla, x, f, blc,
 763                      &pbc_simd,
 764                      lincsd->task[th].simd_buf,
 765                      lincsd->task[th].simd_buf + GMX_SIMD_REAL_WIDTH*DIM,
 766                      r, rhs1, sol);
 767
 768 #else /* LINCS_SIMD */
 769
 770     /* Compute normalized i-j vectors */
 771     if (pbc)
 772     {
 773         for (b = b0; b < b1; b++)
 774         {
 775             rvec dx;
 776
 777             pbc_dx_aiuc(pbc, x[bla[2*b]], x[bla[2*b+1]], dx);
 778             unitv(dx, r[b]);
 779         }
 780     }
 781     else
 782     {
 783         for (b = b0; b < b1; b++)
 784         {
 785             rvec dx;
 786
 787             rvec_sub(x[bla[2*b]], x[bla[2*b+1]], dx);
 788             unitv(dx, r[b]);
 789         } /* 16 ncons flops */
 790     }
 791
 792     for (b = b0; b < b1; b++)
 793     {
 794         int  i, j;
 795         real mvb;
 796
 797         i       = bla[2*b];
 798         j       = bla[2*b+1];
 799         mvb     = blc[b]*(r[b][0]*(f[i][0] - f[j][0]) +
 800                           r[b][1]*(f[i][1] - f[j][1]) +
 801                           r[b][2]*(f[i][2] - f[j][2]));
 802         rhs1[b] = mvb;
 803         sol[b]  = mvb;
 804         /* 7 flops */
 805     }
 806
 807 #endif /* LINCS_SIMD */
 808
 809     if (lincsd->bTaskDep)
 810     {
 811         /* We need a barrier, since the matrix construction below
 812          * can access entries in r of other threads.
 813          */
 814 #pragma omp barrier
 815     }
 816
 817     /* Construct the (sparse) LINCS matrix */
 818     for (b = b0; b < b1; b++)
 819     {
 820         int n;
 821
 822         for (n = blnr[b]; n < blnr[b+1]; n++)
 823         {
 824             blcc[n] = blmf[n]*iprod(r[b], r[blbnb[n]]);
 825         } /* 6 nr flops */
 826     }
 827     /* Together: 23*ncons + 6*nrtot flops */
 828
 829     lincs_matrix_expand(lincsd, &lincsd->task[th], blcc, rhs1, rhs2, sol);
 830     /* nrec*(ncons+2*nrtot) flops */
 831
 832     if (econq == econqDeriv_FlexCon)
 833     {
 834         /* We only want to constraint the flexible constraints,
 835          * so we mask out the normal ones by setting sol to 0.
 836          */
 837         for (b = b0; b < b1; b++)
 838         {
 839             if (!(lincsd->bllen0[b] == 0 && lincsd->ddist[b] == 0))
 840             {
 841                 sol[b] = 0;
 842             }
 843         }
 844     }
 845
 846     /* We multiply sol by blc, so we can use lincs_update_atoms for OpenMP */
 847     for (b = b0; b < b1; b++)
 848     {
 849         sol[b] *= blc[b];
 850     }
 851
 852     /* When constraining forces, we should not use mass weighting,
 853      * so we pass invmass=NULL, which results in the use of 1 for all atoms.
 854      */
 855     lincs_update_atoms(lincsd, th, 1.0, sol, r,
 856                        (econq != econqForce) ? invmass : NULL, fp);
 857
 858     if (bCalcDHDL)
 859     {
 860         real dhdlambda;
 861
 862         dhdlambda = 0;
 863         for (b = b0; b < b1; b++)
 864         {
 865             dhdlambda -= sol[b]*lincsd->ddist[b];
 866         }
 867
 868         lincsd->task[th].dhdlambda = dhdlambda;
 869     }
 870
 871     if (bCalcVir)
 872     {
 873         /* Constraint virial,
 874          * determines sum r_bond x delta f,
 875          * where delta f is the constraint correction
 876          * of the quantity that is being constrained.
 877          */
 878         for (b = b0; b < b1; b++)
 879         {
 880             real mvb, tmp1;
 881             int  i, j;
 882
 883             mvb = lincsd->bllen[b]*sol[b];
 884             for (i = 0; i < DIM; i++)
 885             {
 886                 tmp1 = mvb*r[b][i];
 887                 for (j = 0; j < DIM; j++)
 888                 {
 889                     rmdf[i][j] += tmp1*r[b][j];
 890                 }
 891             }
 892         } /* 23 ncons flops */
 893     }
 894 }
 895
 896 #ifdef LINCS_SIMD
 897 /* Calculate the constraint distance vectors r to project on from x.
 898  * Determine the right-hand side of the matrix equation using coordinates xp.
 899  */
 900 static void gmx_simdcall
 901 calc_dr_x_xp_simd(int                       b0,
 902                   int                       b1,
 903                   const int *               bla,
 904                   const rvec * gmx_restrict x,
 905                   const rvec * gmx_restrict xp,
 906                   const real * gmx_restrict bllen,
 907                   const real * gmx_restrict blc,
 908                   const pbc_simd_t *        pbc_simd,
 909                   real * gmx_restrict       vbuf1,
 910                   real * gmx_restrict       vbuf2,
 911                   rvec * gmx_restrict       r,
 912                   real * gmx_restrict       rhs,
 913                   real * gmx_restrict       sol)
 914 {
 915     int bs;
 916
 917     assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
 918
 919     for (bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
 920     {
 921         gmx_simd_real_t rx_S, ry_S, rz_S, n2_S, il_S;
 922         gmx_simd_real_t rxp_S, ryp_S, rzp_S, ip_S, rhs_S;
 923
 924         gmx_hack_simd_gather_rvec_dist_pair_index(x, bla + bs*2, vbuf1,
 925                                                   &rx_S, &ry_S, &rz_S);
 926
 927         pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
 928
 929         n2_S  = gmx_simd_norm2_r(rx_S, ry_S, rz_S);
 930         il_S  = gmx_simd_invsqrt_r(n2_S);
 931
 932         rx_S  = gmx_simd_mul_r(rx_S, il_S);
 933         ry_S  = gmx_simd_mul_r(ry_S, il_S);
 934         rz_S  = gmx_simd_mul_r(rz_S, il_S);
 935
 936         gmx_simd_store_vec_to_rvec(rx_S, ry_S, rz_S, vbuf1, r + bs);
 937
 938         gmx_hack_simd_gather_rvec_dist_pair_index(xp, bla + bs*2, vbuf2,
 939                                                   &rxp_S, &ryp_S, &rzp_S);
 940
 941         pbc_correct_dx_simd(&rxp_S, &ryp_S, &rzp_S, pbc_simd);
 942
 943         ip_S  = gmx_simd_iprod_r(rx_S, ry_S, rz_S,
 944                                  rxp_S, ryp_S, rzp_S);
 945
 946         rhs_S = gmx_simd_mul_r(gmx_simd_load_r(blc + bs),
 947                                gmx_simd_sub_r(ip_S, gmx_simd_load_r(bllen + bs)));
 948
 949         gmx_simd_store_r(rhs + bs, rhs_S);
 950         gmx_simd_store_r(sol + bs, rhs_S);
 951     }
 952 }
 953 #endif /* LINCS_SIMD */
 954
 955 /* Determine the distances and right-hand side for the next iteration */
 956 static void calc_dist_iter(int                       b0,
 957                            int                       b1,
 958                            const int *               bla,
 959                            const rvec * gmx_restrict xp,
 960                            const real * gmx_restrict bllen,
 961                            const real * gmx_restrict blc,
 962                            const t_pbc *             pbc,
 963                            real                      wfac,
 964                            real * gmx_restrict       rhs,
 965                            real * gmx_restrict       sol,
 966                            gmx_bool *                bWarn)
 967 {
 968     int b;
 969
 970     for (b = b0; b < b1; b++)
 971     {
 972         real len, len2, dlen2, mvb;
 973         rvec dx;
 974
 975         len = bllen[b];
 976         if (pbc)
 977         {
 978             pbc_dx_aiuc(pbc, xp[bla[2*b]], xp[bla[2*b+1]], dx);
 979         }
 980         else
 981         {
 982             rvec_sub(xp[bla[2*b]], xp[bla[2*b+1]], dx);
 983         }
 984         len2  = len*len;
 985         dlen2 = 2*len2 - norm2(dx);
 986         if (dlen2 < wfac*len2)
 987         {
 988             /* not race free - see detailed comment in caller */
 989             *bWarn = TRUE;
 990         }
 991         if (dlen2 > 0)
 992         {
 993             mvb = blc[b]*(len - dlen2*gmx_invsqrt(dlen2));
 994         }
 995         else
 996         {
 997             mvb = blc[b]*len;
 998         }
 999         rhs[b]  = mvb;
1000         sol[b]  = mvb;
1001     } /* 20*ncons flops */
1002 }
1003
1004 #ifdef LINCS_SIMD
1005 /* As the function above, but using SIMD intrinsics */
1006 static void gmx_simdcall
1007 calc_dist_iter_simd(int                       b0,
1008                     int                       b1,
1009                     const int *               bla,
1010                     const rvec * gmx_restrict x,
1011                     const real * gmx_restrict bllen,
1012                     const real * gmx_restrict blc,
1013                     const pbc_simd_t *        pbc_simd,
1014                     real                      wfac,
1015                     real * gmx_restrict       vbuf,
1016                     real * gmx_restrict       rhs,
1017                     real * gmx_restrict       sol,
1018                     gmx_bool *                bWarn)
1019 {
1020     gmx_simd_real_t min_S  = gmx_simd_set1_r(GMX_REAL_MIN);
1021     gmx_simd_real_t two_S  = gmx_simd_set1_r(2.0);
1022     gmx_simd_real_t wfac_S = gmx_simd_set1_r(wfac);
1023     gmx_simd_bool_t warn_S;
1024
1025     int             bs;
1026
1027     assert(b0 % GMX_SIMD_REAL_WIDTH == 0);
1028
1029     /* Initialize all to FALSE */
1030     warn_S = gmx_simd_cmplt_r(two_S, gmx_simd_setzero_r());
1031
1032     for (bs = b0; bs < b1; bs += GMX_SIMD_REAL_WIDTH)
1033     {
1034         gmx_simd_real_t rx_S, ry_S, rz_S, n2_S;
1035         gmx_simd_real_t len_S, len2_S, dlen2_S, lc_S, blc_S;
1036
1037         gmx_hack_simd_gather_rvec_dist_pair_index(x, bla + bs*2, vbuf,
1038                                                   &rx_S, &ry_S, &rz_S);
1039
1040         pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
1041
1042         n2_S    = gmx_simd_norm2_r(rx_S, ry_S, rz_S);
1043
1044         len_S   = gmx_simd_load_r(bllen + bs);
1045         len2_S  = gmx_simd_mul_r(len_S, len_S);
1046
1047         dlen2_S = gmx_simd_fmsub_r(two_S, len2_S, n2_S);
1048
1049         warn_S  = gmx_simd_or_b(warn_S,
1050                                 gmx_simd_cmplt_r(dlen2_S,
1051                                                  gmx_simd_mul_r(wfac_S, len2_S)));
1052
1053         /* Avoid 1/0 by taking the max with REAL_MIN.
1054          * Note: when dlen2 is close to zero (90 degree constraint rotation),
1055          * the accuracy of the algorithm is no longer relevant.
1056          */
1057         dlen2_S = gmx_simd_max_r(dlen2_S, min_S);
1058
1059         lc_S    = gmx_simd_fnmadd_r(dlen2_S, gmx_simd_invsqrt_r(dlen2_S), len_S);
1060
1061         blc_S   = gmx_simd_load_r(blc + bs);
1062
1063         lc_S    = gmx_simd_mul_r(blc_S, lc_S);
1064
1065         gmx_simd_store_r(rhs + bs, lc_S);
1066         gmx_simd_store_r(sol + bs, lc_S);
1067     }
1068
1069     if (gmx_simd_anytrue_b(warn_S))
1070     {
1071         *bWarn = TRUE;
1072     }
1073 }
1074 #endif /* LINCS_SIMD */
1075
1076 static void do_lincs(rvec *x, rvec *xp, matrix box, t_pbc *pbc,
1077                      struct gmx_lincsdata *lincsd, int th,
1078                      const real *invmass,
1079                      t_commrec *cr,
1080                      gmx_bool bCalcDHDL,
1081                      real wangle, gmx_bool *bWarn,
1082                      real invdt, rvec * gmx_restrict v,
1083                      gmx_bool bCalcVir, tensor vir_r_m_dr)
1084 {
1085     int      b0, b1, b, i, j, n, iter;
1086     int     *bla, *blnr, *blbnb;
1087     rvec    *r;
1088     real    *blc, *blmf, *bllen, *blcc, *rhs1, *rhs2, *sol, *blc_sol, *mlambda;
1089     int     *nlocat;
1090
1091     b0 = lincsd->task[th].b0;
1092     b1 = lincsd->task[th].b1;
1093
1094     bla     = lincsd->bla;
1095     r       = lincsd->tmpv;
1096     blnr    = lincsd->blnr;
1097     blbnb   = lincsd->blbnb;
1098     blc     = lincsd->blc;
1099     blmf    = lincsd->blmf;
1100     bllen   = lincsd->bllen;
1101     blcc    = lincsd->tmpncc;
1102     rhs1    = lincsd->tmp1;
1103     rhs2    = lincsd->tmp2;
1104     sol     = lincsd->tmp3;
1105     blc_sol = lincsd->tmp4;
1106     mlambda = lincsd->mlambda;
1107     nlocat  = lincsd->nlocat;
1108
1109 #ifdef LINCS_SIMD
1110
1111     /* This SIMD code does the same as the plain-C code after the #else.
1112      * The only difference is that we always call pbc code, as with SIMD
1113      * the overhead of pbc computation (when not needed) is small.
1114      */
1115     pbc_simd_t pbc_simd;
1116
1117     /* Convert the pbc struct for SIMD */
1118     set_pbc_simd(pbc, &pbc_simd);
1119
1120     /* Compute normalized x i-j vectors, store in r.
1121      * Compute the inner product of r and xp i-j and store in rhs1.
1122      */
1123     calc_dr_x_xp_simd(b0, b1, bla, x, xp, bllen, blc,
1124                       &pbc_simd,
1125                       lincsd->task[th].simd_buf,
1126                       lincsd->task[th].simd_buf + GMX_SIMD_REAL_WIDTH*DIM,
1127                       r, rhs1, sol);
1128
1129 #else /* LINCS_SIMD */
1130
1131     if (pbc)
1132     {
1133         /* Compute normalized i-j vectors */
1134         for (b = b0; b < b1; b++)
1135         {
1136             rvec dx;
1137             real mvb;
1138
1139             pbc_dx_aiuc(pbc, x[bla[2*b]], x[bla[2*b+1]], dx);
1140             unitv(dx, r[b]);
1141
1142             pbc_dx_aiuc(pbc, xp[bla[2*b]], xp[bla[2*b+1]], dx);
1143             mvb     = blc[b]*(iprod(r[b], dx) - bllen[b]);
1144             rhs1[b] = mvb;
1145             sol[b]  = mvb;
1146         }
1147     }
1148     else
1149     {
1150         /* Compute normalized i-j vectors */
1151         for (b = b0; b < b1; b++)
1152         {
1153             real tmp0, tmp1, tmp2, rlen, mvb;
1154
1155             i       = bla[2*b];
1156             j       = bla[2*b+1];
1157             tmp0    = x[i][0] - x[j][0];
1158             tmp1    = x[i][1] - x[j][1];
1159             tmp2    = x[i][2] - x[j][2];
1160             rlen    = gmx_invsqrt(tmp0*tmp0 + tmp1*tmp1 + tmp2*tmp2);
1161             r[b][0] = rlen*tmp0;
1162             r[b][1] = rlen*tmp1;
1163             r[b][2] = rlen*tmp2;
1164             /* 16 ncons flops */
1165
1166             i       = bla[2*b];
1167             j       = bla[2*b+1];
1168             mvb     = blc[b]*(r[b][0]*(xp[i][0] - xp[j][0]) +
1169                               r[b][1]*(xp[i][1] - xp[j][1]) +
1170                               r[b][2]*(xp[i][2] - xp[j][2]) - bllen[b]);
1171             rhs1[b] = mvb;
1172             sol[b]  = mvb;
1173             /* 10 flops */
1174         }
1175         /* Together: 26*ncons + 6*nrtot flops */
1176     }
1177
1178 #endif /* LINCS_SIMD */
1179
1180     if (lincsd->bTaskDep)
1181     {
1182         /* We need a barrier, since the matrix construction below
1183          * can access entries in r of other threads.
1184          */
1185 #pragma omp barrier
1186     }
1187
1188     /* Construct the (sparse) LINCS matrix */
1189     for (b = b0; b < b1; b++)
1190     {
1191         for (n = blnr[b]; n < blnr[b+1]; n++)
1192         {
1193             blcc[n] = blmf[n]*iprod(r[b], r[blbnb[n]]);
1194         }
1195     }
1196     /* Together: 26*ncons + 6*nrtot flops */
1197
1198     lincs_matrix_expand(lincsd, &lincsd->task[th], blcc, rhs1, rhs2, sol);
1199     /* nrec*(ncons+2*nrtot) flops */
1200
1201 #ifndef LINCS_SIMD
1202     for (b = b0; b < b1; b++)
1203     {
1204         mlambda[b] = blc[b]*sol[b];
1205     }
1206 #else
1207     for (b = b0; b < b1; b += GMX_SIMD_REAL_WIDTH)
1208     {
1209         gmx_simd_store_r(mlambda + b,
1210                          gmx_simd_mul_r(gmx_simd_load_r(blc + b),
1211                                         gmx_simd_load_r(sol + b)));
1212     }
1213 #endif
1214
1215     /* Update the coordinates */
1216     lincs_update_atoms(lincsd, th, 1.0, mlambda, r, invmass, xp);
1217
1218     /*
1219      ********  Correction for centripetal effects  ********
1220      */
1221
1222     real wfac;
1223
1224     wfac = cos(DEG2RAD*wangle);
1225     wfac = wfac*wfac;
1226
1227     for (iter = 0; iter < lincsd->nIter; iter++)
1228     {
1229         if ((lincsd->bCommIter && DOMAINDECOMP(cr) && cr->dd->constraints))
1230         {
1231 #pragma omp barrier
1232 #pragma omp master
1233             {
1234                 /* Communicate the corrected non-local coordinates */
1235                 if (DOMAINDECOMP(cr))
1236                 {
1237                     dd_move_x_constraints(cr->dd, box, xp, NULL, FALSE);
1238                 }
1239             }
1240 #pragma omp barrier
1241         }
1242         else if (lincsd->bTaskDep)
1243         {
1244 #pragma omp barrier
1245         }
1246
1247 #ifdef LINCS_SIMD
1248         calc_dist_iter_simd(b0, b1, bla, xp, bllen, blc, &pbc_simd, wfac,
1249                             lincsd->task[th].simd_buf, rhs1, sol, bWarn);
1250 #else
1251         calc_dist_iter(b0, b1, bla, xp, bllen, blc, pbc, wfac,
1252                        rhs1, sol, bWarn);
1253         /* 20*ncons flops */
1254 #endif
1255
1256         lincs_matrix_expand(lincsd, &lincsd->task[th], blcc, rhs1, rhs2, sol);
1257         /* nrec*(ncons+2*nrtot) flops */
1258
1259 #ifndef LINCS_SIMD
1260         for (b = b0; b < b1; b++)
1261         {
1262             real mvb;
1263
1264             mvb         = blc[b]*sol[b];
1265             blc_sol[b]  = mvb;
1266             mlambda[b] += mvb;
1267         }
1268 #else
1269         for (b = b0; b < b1; b += GMX_SIMD_REAL_WIDTH)
1270         {
1271             gmx_simd_real_t mvb;
1272
1273             mvb = gmx_simd_mul_r(gmx_simd_load_r(blc + b),
1274                                  gmx_simd_load_r(sol + b));
1275             gmx_simd_store_r(blc_sol + b, mvb);
1276             gmx_simd_store_r(mlambda + b,
1277                              gmx_simd_add_r(gmx_simd_load_r(mlambda + b), mvb));
1278         }
1279 #endif
1280
1281         /* Update the coordinates */
1282         lincs_update_atoms(lincsd, th, 1.0, blc_sol, r, invmass, xp);
1283     }
1284     /* nit*ncons*(37+9*nrec) flops */
1285
1286     if (v != NULL)
1287     {
1288         /* Update the velocities */
1289         lincs_update_atoms(lincsd, th, invdt, mlambda, r, invmass, v);
1290         /* 16 ncons flops */
1291     }
1292
1293     if (nlocat != NULL && (bCalcDHDL || bCalcVir))
1294     {
1295         if (lincsd->bTaskDep)
1296         {
1297             /* In lincs_update_atoms threads might cross-read mlambda */
1298 #pragma omp barrier
1299         }
1300
1301         /* Only account for local atoms */
1302         for (b = b0; b < b1; b++)
1303         {
1304             mlambda[b] *= 0.5*nlocat[b];
1305         }
1306     }
1307
1308     if (bCalcDHDL)
1309     {
1310         real dhdl;
1311
1312         dhdl = 0;
1313         for (b = b0; b < b1; b++)
1314         {
1315             /* Note that this this is dhdl*dt^2, the dt^2 factor is corrected
1316              * later after the contributions are reduced over the threads.
1317              */
1318             dhdl -= lincsd->mlambda[b]*lincsd->ddist[b];
1319         }
1320         lincsd->task[th].dhdlambda = dhdl;
1321     }
1322
1323     if (bCalcVir)
1324     {
1325         /* Constraint virial */
1326         for (b = b0; b < b1; b++)
1327         {
1328             real tmp0, tmp1;
1329
1330             tmp0 = -bllen[b]*mlambda[b];
1331             for (i = 0; i < DIM; i++)
1332             {
1333                 tmp1 = tmp0*r[b][i];
1334                 for (j = 0; j < DIM; j++)
1335                 {
1336                     vir_r_m_dr[i][j] -= tmp1*r[b][j];
1337                 }
1338             }
1339         } /* 22 ncons flops */
1340     }
1341
1342     /* Total:
1343      * 26*ncons + 6*nrtot + nrec*(ncons+2*nrtot)
1344      * + nit * (20*ncons + nrec*(ncons+2*nrtot) + 17 ncons)
1345      *
1346      * (26+nrec)*ncons + (6+2*nrec)*nrtot
1347      * + nit * ((37+nrec)*ncons + 2*nrec*nrtot)
1348      * if nit=1
1349      * (63+nrec)*ncons + (6+4*nrec)*nrtot
1350      */
1351 }
1352
1353 /* Sets the elements in the LINCS matrix for task li_task */
1354 static void set_lincs_matrix_task(struct gmx_lincsdata *li,
1355                                   lincs_task_t         *li_task,
1356                                   const real           *invmass,
1357                                   int                  *ncc_triangle)
1358 {
1359     int        i;
1360
1361     /* Construct the coupling coefficient matrix blmf */
1362     li_task->ntriangle = 0;
1363     *ncc_triangle      = 0;
1364     for (i = li_task->b0; i < li_task->b1; i++)
1365     {
1366         int a1, a2, n;
1367
1368         a1 = li->bla[2*i];
1369         a2 = li->bla[2*i+1];
1370         for (n = li->blnr[i]; (n < li->blnr[i+1]); n++)
1371         {
1372             int k, sign, center, end;
1373
1374             k = li->blbnb[n];
1375
1376             /* If we are using multiple, independent tasks for LINCS,
1377              * the calls to check_assign_connected should have
1378              * put all connected constraints in our task.
1379              */
1380             assert(li->bTaskDep || (k >= li_task->b0 && k < li_task->b1));
1381
1382             if (a1 == li->bla[2*k] || a2 == li->bla[2*k+1])
1383             {
1384                 sign = -1;
1385             }
1386             else
1387             {
1388                 sign = 1;
1389             }
1390             if (a1 == li->bla[2*k] || a1 == li->bla[2*k+1])
1391             {
1392                 center = a1;
1393                 end    = a2;
1394             }
1395             else
1396             {
1397                 center = a2;
1398                 end    = a1;
1399             }
1400             li->blmf[n]  = sign*invmass[center]*li->blc[i]*li->blc[k];
1401             li->blmf1[n] = sign*0.5;
1402             if (li->ncg_triangle > 0)
1403             {
1404                 int nk, kk;
1405
1406                 /* Look for constraint triangles */
1407                 for (nk = li->blnr[k]; (nk < li->blnr[k+1]); nk++)
1408                 {
1409                     kk = li->blbnb[nk];
1410                     if (kk != i && kk != k &&
1411                         (li->bla[2*kk] == end || li->bla[2*kk+1] == end))
1412                     {
1413                         /* If we are using multiple tasks for LINCS,
1414                          * the calls to check_assign_triangle should have
1415                          * put all constraints in the triangle in our task.
1416                          */
1417                         assert(k  >= li_task->b0 && k  < li_task->b1);
1418                         assert(kk >= li_task->b0 && kk < li_task->b1);
1419
1420                         if (li_task->ntriangle == 0 ||
1421                             li_task->triangle[li_task->ntriangle - 1] < i)
1422                         {
1423                             /* Add this constraint to the triangle list */
1424                             li_task->triangle[li_task->ntriangle] = i;
1425                             li_task->tri_bits[li_task->ntriangle] = 0;
1426                             li_task->ntriangle++;
1427                             if (li->blnr[i+1] - li->blnr[i] > static_cast<int>(sizeof(li_task->tri_bits[0])*8 - 1))
1428                             {
1429                                 gmx_fatal(FARGS, "A constraint is connected to %d constraints, this is more than the %d allowed for constraints participating in triangles",
1430                                           li->blnr[i+1] - li->blnr[i],
1431                                           sizeof(li_task->tri_bits[0])*8-1);
1432                             }
1433                         }
1434                         li_task->tri_bits[li_task->ntriangle-1] |= (1 << (n - li->blnr[i]));
1435                         (*ncc_triangle)++;
1436                     }
1437                 }
1438             }
1439         }
1440     }
1441 }
1442
1443 /* Sets the elements in the LINCS matrix */
1444 void set_lincs_matrix(struct gmx_lincsdata *li, real *invmass, real lambda)
1445 {
1446     int        i;
1447     const real invsqrt2 = 0.7071067811865475244;
1448
1449     for (i = 0; (i < li->nc); i++)
1450     {
1451         int a1, a2;
1452
1453         a1          = li->bla[2*i];
1454         a2          = li->bla[2*i+1];
1455         li->blc[i]  = gmx_invsqrt(invmass[a1] + invmass[a2]);
1456         li->blc1[i] = invsqrt2;
1457     }
1458
1459     /* Construct the coupling coefficient matrix blmf */
1460     int th, ntriangle = 0, ncc_triangle = 0;
1461 #pragma omp parallel for reduction(+: ntriangle, ncc_triangle) num_threads(li->ntask) schedule(static)
1462     for (th = 0; th < li->ntask; th++)
1463     {
1464         try
1465         {
1466             set_lincs_matrix_task(li, &li->task[th], invmass, &ncc_triangle);
1467             ntriangle = li->task[th].ntriangle;
1468         }
1469         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1470     }
1471     li->ntriangle    = ntriangle;
1472     li->ncc_triangle = ncc_triangle;
1473
1474     if (debug)
1475     {
1476         fprintf(debug, "Of the %d constraints %d participate in triangles\n",
1477                 li->nc, li->ntriangle);
1478         fprintf(debug, "There are %d couplings of which %d in triangles\n",
1479                 li->ncc, li->ncc_triangle);
1480     }
1481
1482     /* Set matlam,
1483      * so we know with which lambda value the masses have been set.
1484      */
1485     li->matlam = lambda;
1486 }
1487
1488 static int count_triangle_constraints(const t_ilist  *ilist,
1489                                       const t_blocka *at2con)
1490 {
1491     int      ncon1, ncon_tot;
1492     int      c0, a00, a01, n1, c1, a10, a11, ac1, n2, c2, a20, a21;
1493     int      ncon_triangle;
1494     gmx_bool bTriangle;
1495     t_iatom *ia1, *ia2, *iap;
1496
1497     ncon1    = ilist[F_CONSTR].nr/3;
1498     ncon_tot = ncon1 + ilist[F_CONSTRNC].nr/3;
1499
1500     ia1 = ilist[F_CONSTR].iatoms;
1501     ia2 = ilist[F_CONSTRNC].iatoms;
1502
1503     ncon_triangle = 0;
1504     for (c0 = 0; c0 < ncon_tot; c0++)
1505     {
1506         bTriangle = FALSE;
1507         iap       = constr_iatomptr(ncon1, ia1, ia2, c0);
1508         a00       = iap[1];
1509         a01       = iap[2];
1510         for (n1 = at2con->index[a01]; n1 < at2con->index[a01+1]; n1++)
1511         {
1512             c1 = at2con->a[n1];
1513             if (c1 != c0)
1514             {
1515                 iap = constr_iatomptr(ncon1, ia1, ia2, c1);
1516                 a10 = iap[1];
1517                 a11 = iap[2];
1518                 if (a10 == a01)
1519                 {
1520                     ac1 = a11;
1521                 }
1522                 else
1523                 {
1524                     ac1 = a10;
1525                 }
1526                 for (n2 = at2con->index[ac1]; n2 < at2con->index[ac1+1]; n2++)
1527                 {
1528                     c2 = at2con->a[n2];
1529                     if (c2 != c0 && c2 != c1)
1530                     {
1531                         iap = constr_iatomptr(ncon1, ia1, ia2, c2);
1532                         a20 = iap[1];
1533                         a21 = iap[2];
1534                         if (a20 == a00 || a21 == a00)
1535                         {
1536                             bTriangle = TRUE;
1537                         }
1538                     }
1539                 }
1540             }
1541         }
1542         if (bTriangle)
1543         {
1544             ncon_triangle++;
1545         }
1546     }
1547
1548     return ncon_triangle;
1549 }
1550
1551 static gmx_bool more_than_two_sequential_constraints(const t_ilist  *ilist,
1552                                                      const t_blocka *at2con)
1553 {
1554     t_iatom  *ia1, *ia2, *iap;
1555     int       ncon1, ncon_tot, c;
1556     int       a1, a2;
1557     gmx_bool  bMoreThanTwoSequentialConstraints;
1558
1559     ncon1    = ilist[F_CONSTR].nr/3;
1560     ncon_tot = ncon1 + ilist[F_CONSTRNC].nr/3;
1561
1562     ia1 = ilist[F_CONSTR].iatoms;
1563     ia2 = ilist[F_CONSTRNC].iatoms;
1564
1565     bMoreThanTwoSequentialConstraints = FALSE;
1566     for (c = 0; c < ncon_tot && !bMoreThanTwoSequentialConstraints; c++)
1567     {
1568         iap = constr_iatomptr(ncon1, ia1, ia2, c);
1569         a1  = iap[1];
1570         a2  = iap[2];
1571         /* Check if this constraint has constraints connected at both atoms */
1572         if (at2con->index[a1+1] - at2con->index[a1] > 1 &&
1573             at2con->index[a2+1] - at2con->index[a2] > 1)
1574         {
1575             bMoreThanTwoSequentialConstraints = TRUE;
1576         }
1577     }
1578
1579     return bMoreThanTwoSequentialConstraints;
1580 }
1581
1582 static int int_comp(const void *a, const void *b)
1583 {
1584     return (*(int *)a) - (*(int *)b);
1585 }
1586
1587 gmx_lincsdata_t init_lincs(FILE *fplog, const gmx_mtop_t *mtop,
1588                            int nflexcon_global, const t_blocka *at2con,
1589                            gmx_bool bPLINCS, int nIter, int nProjOrder)
1590 {
1591     struct gmx_lincsdata *li;
1592     int                   mt, mb;
1593     gmx_moltype_t        *molt;
1594     gmx_bool              bMoreThanTwoSeq;
1595
1596     if (fplog)
1597     {
1598         fprintf(fplog, "\nInitializing%s LINear Constraint Solver\n",
1599                 bPLINCS ? " Parallel" : "");
1600     }
1601
1602     snew(li, 1);
1603
1604     li->ncg      =
1605         gmx_mtop_ftype_count(mtop, F_CONSTR) +
1606         gmx_mtop_ftype_count(mtop, F_CONSTRNC);
1607     li->ncg_flex = nflexcon_global;
1608
1609     li->nIter  = nIter;
1610     li->nOrder = nProjOrder;
1611
1612     li->max_connect = 0;
1613     for (mt = 0; mt < mtop->nmoltype; mt++)
1614     {
1615         int a;
1616
1617         molt = &mtop->moltype[mt];
1618         for (a = 0; a < molt->atoms.nr; a++)
1619         {
1620             li->max_connect = std::max(li->max_connect,
1621                                        at2con[mt].index[a + 1] - at2con[mt].index[a]);
1622         }
1623     }
1624
1625     li->ncg_triangle = 0;
1626     bMoreThanTwoSeq  = FALSE;
1627     for (mb = 0; mb < mtop->nmolblock; mb++)
1628     {
1629         molt              = &mtop->moltype[mtop->molblock[mb].type];
1630
1631         li->ncg_triangle +=
1632             mtop->molblock[mb].nmol*
1633             count_triangle_constraints(molt->ilist,
1634                                        &at2con[mtop->molblock[mb].type]);
1635
1636         if (!bMoreThanTwoSeq &&
1637             more_than_two_sequential_constraints(molt->ilist, &at2con[mtop->molblock[mb].type]))
1638         {
1639             bMoreThanTwoSeq = TRUE;
1640         }
1641     }
1642
1643     /* Check if we need to communicate not only before LINCS,
1644      * but also before each iteration.
1645      * The check for only two sequential constraints is only
1646      * useful for the common case of H-bond only constraints.
1647      * With more effort we could also make it useful for small
1648      * molecules with nr. sequential constraints <= nOrder-1.
1649      */
1650     li->bCommIter = (bPLINCS && (li->nOrder < 1 || bMoreThanTwoSeq));
1651
1652     if (debug && bPLINCS)
1653     {
1654         fprintf(debug, "PLINCS communication before each iteration: %d\n",
1655                 li->bCommIter);
1656     }
1657
1658     /* LINCS can run on any number of threads.
1659      * Currently the number is fixed for the whole simulation,
1660      * but it could be set in set_lincs().
1661      * The current constraint to task assignment code can create independent
1662      * tasks only when not more than two constraints are connected sequentially.
1663      */
1664     li->ntask    = gmx_omp_nthreads_get(emntLINCS);
1665     li->bTaskDep = (li->ntask > 1 && bMoreThanTwoSeq);
1666     if (debug)
1667     {
1668         fprintf(debug, "LINCS: using %d threads, tasks are %sdependent\n",
1669                 li->ntask, li->bTaskDep ? "" : "in");
1670     }
1671     if (li->ntask == 1)
1672     {
1673         snew(li->task, 1);
1674     }
1675     else
1676     {
1677         /* Allocate an extra elements for "task-overlap" constraints */
1678         snew(li->task, li->ntask + 1);
1679     }
1680     int th;
1681 #pragma omp parallel for num_threads(li->ntask)
1682     for (th = 0; th < li->ntask; th++)
1683     {
1684         try
1685         {
1686             /* Per thread SIMD load buffer for loading 2 simd_width rvecs */
1687             snew_aligned(li->task[th].simd_buf, 2*simd_width*DIM,
1688                          align_bytes);
1689         }
1690         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1691     }
1692
1693     if (bPLINCS || li->ncg_triangle > 0)
1694     {
1695         please_cite(fplog, "Hess2008a");
1696     }
1697     else
1698     {
1699         please_cite(fplog, "Hess97a");
1700     }
1701
1702     if (fplog)
1703     {
1704         fprintf(fplog, "The number of constraints is %d\n", li->ncg);
1705         if (bPLINCS)
1706         {
1707             fprintf(fplog, "There are inter charge-group constraints,\n"
1708                     "will communicate selected coordinates each lincs iteration\n");
1709         }
1710         if (li->ncg_triangle > 0)
1711         {
1712             fprintf(fplog,
1713                     "%d constraints are involved in constraint triangles,\n"
1714                     "will apply an additional matrix expansion of order %d for couplings\n"
1715                     "between constraints inside triangles\n",
1716                     li->ncg_triangle, li->nOrder);
1717         }
1718     }
1719
1720     return li;
1721 }
1722
1723 /* Sets up the work division over the threads */
1724 static void lincs_thread_setup(struct gmx_lincsdata *li, int natoms)
1725 {
1726     lincs_task_t   *li_m;
1727     int             th;
1728     gmx_bitmask_t  *atf;
1729     int             a;
1730
1731     if (natoms > li->atf_nalloc)
1732     {
1733         li->atf_nalloc = over_alloc_large(natoms);
1734         srenew(li->atf, li->atf_nalloc);
1735     }
1736
1737     atf = li->atf;
1738     /* Clear the atom flags */
1739     for (a = 0; a < natoms; a++)
1740     {
1741         bitmask_clear(&atf[a]);
1742     }
1743
1744     if (li->ntask > BITMASK_SIZE)
1745     {
1746         gmx_fatal(FARGS, "More than %d threads is not supported for LINCS.", BITMASK_SIZE);
1747     }
1748
1749     for (th = 0; th < li->ntask; th++)
1750     {
1751         lincs_task_t *li_task;
1752         int           b;
1753
1754         li_task = &li->task[th];
1755
1756         /* For each atom set a flag for constraints from each */
1757         for (b = li_task->b0; b < li_task->b1; b++)
1758         {
1759             bitmask_set_bit(&atf[li->bla[b*2    ]], th);
1760             bitmask_set_bit(&atf[li->bla[b*2 + 1]], th);
1761         }
1762     }
1763
1764 #pragma omp parallel for num_threads(li->ntask) schedule(static)
1765     for (th = 0; th < li->ntask; th++)
1766     {
1767         try
1768         {
1769             lincs_task_t  *li_task;
1770             gmx_bitmask_t  mask;
1771             int            b;
1772
1773             li_task = &li->task[th];
1774
1775             if (li_task->b1 - li_task->b0 > li_task->ind_nalloc)
1776             {
1777                 li_task->ind_nalloc = over_alloc_large(li_task->b1-li_task->b0);
1778                 srenew(li_task->ind, li_task->ind_nalloc);
1779                 srenew(li_task->ind_r, li_task->ind_nalloc);
1780             }
1781
1782             bitmask_init_low_bits(&mask, th);
1783
1784             li_task->nind   = 0;
1785             li_task->nind_r = 0;
1786             for (b = li_task->b0; b < li_task->b1; b++)
1787             {
1788                 /* We let the constraint with the lowest thread index
1789                  * operate on atoms with constraints from multiple threads.
1790                  */
1791                 if (bitmask_is_disjoint(atf[li->bla[b*2]], mask) &&
1792                     bitmask_is_disjoint(atf[li->bla[b*2+1]], mask))
1793                 {
1794                     /* Add the constraint to the local atom update index */
1795                     li_task->ind[li_task->nind++] = b;
1796                 }
1797                 else
1798                 {
1799                     /* Add the constraint to the rest block */
1800                     li_task->ind_r[li_task->nind_r++] = b;
1801                 }
1802             }
1803         }
1804         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1805     }
1806
1807     /* We need to copy all constraints which have not be assigned
1808      * to a thread to a separate list which will be handled by one thread.
1809      */
1810     li_m = &li->task[li->ntask];
1811
1812     li_m->nind = 0;
1813     for (th = 0; th < li->ntask; th++)
1814     {
1815         lincs_task_t *li_task;
1816         int           b;
1817
1818         li_task   = &li->task[th];
1819
1820         if (li_m->nind + li_task->nind_r > li_m->ind_nalloc)
1821         {
1822             li_m->ind_nalloc = over_alloc_large(li_m->nind+li_task->nind_r);
1823             srenew(li_m->ind, li_m->ind_nalloc);
1824         }
1825
1826         for (b = 0; b < li_task->nind_r; b++)
1827         {
1828             li_m->ind[li_m->nind++] = li_task->ind_r[b];
1829         }
1830
1831         if (debug)
1832         {
1833             fprintf(debug, "LINCS thread %d: %d constraints\n",
1834                     th, li_task->nind);
1835         }
1836     }
1837
1838     if (debug)
1839     {
1840         fprintf(debug, "LINCS thread r: %d constraints\n",
1841                 li_m->nind);
1842     }
1843 }
1844
1845 /* There is no realloc with alignment, so here we make one for reals.
1846  * Note that this function does not preserve the contents of the memory.
1847  */
1848 static void resize_real_aligned(real **ptr, int nelem)
1849 {
1850     sfree_aligned(*ptr);
1851     snew_aligned(*ptr, nelem, align_bytes);
1852 }
1853
1854 static void assign_constraint(struct gmx_lincsdata *li,
1855                               int constraint_index,
1856                               int a1, int a2,
1857                               real lenA, real lenB,
1858                               const t_blocka *at2con)
1859 {
1860     int con;
1861
1862     con = li->nc;
1863
1864     /* Make an mapping of local topology constraint index to LINCS index */
1865     li->con_index[constraint_index] = con;
1866
1867     li->bllen0[con]  = lenA;
1868     li->ddist[con]   = lenB - lenA;
1869     /* Set the length to the topology A length */
1870     li->bllen[con]   = lenA;
1871     li->bla[2*con]   = a1;
1872     li->bla[2*con+1] = a2;
1873
1874     /* Make space in the constraint connection matrix for constraints
1875      * connected to both end of the current constraint.
1876      */
1877     li->ncc +=
1878         at2con->index[a1 + 1] - at2con->index[a1] - 1 +
1879         at2con->index[a2 + 1] - at2con->index[a2] - 1;
1880
1881     li->blnr[con + 1] = li->ncc;
1882
1883     /* Increase the constraint count */
1884     li->nc++;
1885 }
1886
1887 /* Check if constraint with topology index constraint_index is connected
1888  * to other constraints, and if so add those connected constraints to our task.
1889  */
1890 static void check_assign_connected(struct gmx_lincsdata *li,
1891                                    const t_iatom *iatom,
1892                                    const t_idef *idef,
1893                                    int bDynamics,
1894                                    int a1, int a2,
1895                                    const t_blocka *at2con)
1896 {
1897     /* Currently this function only supports constraint groups
1898      * in which all constraints share at least one atom
1899      * (e.g. H-bond constraints).
1900      * Check both ends of the current constraint for
1901      * connected constraints. We need to assign those
1902      * to the same task.
1903      */
1904     int end;
1905
1906     for (end = 0; end < 2; end++)
1907     {
1908         int a, k;
1909
1910         a = (end == 0 ? a1 : a2);
1911
1912         for (k = at2con->index[a]; k < at2con->index[a + 1]; k++)
1913         {
1914             int cc;
1915
1916             cc = at2con->a[k];
1917             /* Check if constraint cc has not yet been assigned */
1918             if (li->con_index[cc] == -1)
1919             {
1920                 int  type;
1921                 real lenA, lenB;
1922
1923                 type = iatom[cc*3];
1924                 lenA = idef->iparams[type].constr.dA;
1925                 lenB = idef->iparams[type].constr.dB;
1926
1927                 if (bDynamics || lenA != 0 || lenB != 0)
1928                 {
1929                     assign_constraint(li, cc, iatom[3*cc + 1], iatom[3*cc + 2], lenA, lenB, at2con);
1930                 }
1931             }
1932         }
1933     }
1934 }
1935
1936 /* Check if constraint with topology index constraint_index is involved
1937  * in a constraint triangle, and if so add the other two constraints
1938  * in the triangle to our task.
1939  */
1940 static void check_assign_triangle(struct gmx_lincsdata *li,
1941                                   const t_iatom *iatom,
1942                                   const t_idef *idef,
1943                                   int bDynamics,
1944                                   int constraint_index,
1945                                   int a1, int a2,
1946                                   const t_blocka *at2con)
1947 {
1948     int nca, cc[32], ca[32], k;
1949     int c_triangle[2] = { -1, -1 };
1950
1951     nca = 0;
1952     for (k = at2con->index[a1]; k < at2con->index[a1 + 1]; k++)
1953     {
1954         int c;
1955
1956         c = at2con->a[k];
1957         if (c != constraint_index)
1958         {
1959             int aa1, aa2;
1960
1961             aa1 = iatom[c*3 + 1];
1962             aa2 = iatom[c*3 + 2];
1963             if (aa1 != a1)
1964             {
1965                 cc[nca] = c;
1966                 ca[nca] = aa1;
1967                 nca++;
1968             }
1969             if (aa2 != a1)
1970             {
1971                 cc[nca] = c;
1972                 ca[nca] = aa2;
1973                 nca++;
1974             }
1975         }
1976     }
1977
1978     for (k = at2con->index[a2]; k < at2con->index[a2 + 1]; k++)
1979     {
1980         int c;
1981
1982         c = at2con->a[k];
1983         if (c != constraint_index)
1984         {
1985             int aa1, aa2, i;
1986
1987             aa1 = iatom[c*3 + 1];
1988             aa2 = iatom[c*3 + 2];
1989             if (aa1 != a2)
1990             {
1991                 for (i = 0; i < nca; i++)
1992                 {
1993                     if (aa1 == ca[i])
1994                     {
1995                         c_triangle[0] = cc[i];
1996                         c_triangle[1] = c;
1997                     }
1998                 }
1999             }
2000             if (aa2 != a2)
2001             {
2002                 for (i = 0; i < nca; i++)
2003                 {
2004                     if (aa2 == ca[i])
2005                     {
2006                         c_triangle[0] = cc[i];
2007                         c_triangle[1] = c;
2008                     }
2009                 }
2010             }
2011         }
2012     }
2013
2014     if (c_triangle[0] >= 0)
2015     {
2016         int end;
2017
2018         for (end = 0; end < 2; end++)
2019         {
2020             /* Check if constraint c_triangle[end] has not yet been assigned */
2021             if (li->con_index[c_triangle[end]] == -1)
2022             {
2023                 int  i, type;
2024                 real lenA, lenB;
2025
2026                 i    = c_triangle[end]*3;
2027                 type = iatom[i];
2028                 lenA = idef->iparams[type].constr.dA;
2029                 lenB = idef->iparams[type].constr.dB;
2030
2031                 if (bDynamics || lenA != 0 || lenB != 0)
2032                 {
2033                     assign_constraint(li, c_triangle[end], iatom[i + 1], iatom[i + 2], lenA, lenB, at2con);
2034                 }
2035             }
2036         }
2037     }
2038 }
2039
2040 static void set_matrix_indices(struct gmx_lincsdata *li,
2041                                const lincs_task_t   *li_task,
2042                                const t_blocka       *at2con,
2043                                gmx_bool              bSortMatrix)
2044 {
2045     int b;
2046
2047     for (b = li_task->b0; b < li_task->b1; b++)
2048     {
2049         int a1, a2, i, k;
2050
2051         a1 = li->bla[b*2];
2052         a2 = li->bla[b*2 + 1];
2053
2054         i = li->blnr[b];
2055         for (k = at2con->index[a1]; k < at2con->index[a1 + 1]; k++)
2056         {
2057             int concon;
2058
2059             concon = li->con_index[at2con->a[k]];
2060             if (concon != b)
2061             {
2062                 li->blbnb[i++] = concon;
2063             }
2064         }
2065         for (k = at2con->index[a2]; k < at2con->index[a2 + 1]; k++)
2066         {
2067             int concon;
2068
2069             concon = li->con_index[at2con->a[k]];
2070             if (concon != b)
2071             {
2072                 li->blbnb[i++] = concon;
2073             }
2074         }
2075
2076         if (bSortMatrix)
2077         {
2078             /* Order the blbnb matrix to optimize memory access */
2079             qsort(&(li->blbnb[li->blnr[b]]), li->blnr[b + 1] - li->blnr[b],
2080                   sizeof(li->blbnb[0]), int_comp);
2081         }
2082     }
2083 }
2084
2085 void set_lincs(const t_idef         *idef,
2086                const t_mdatoms      *md,
2087                gmx_bool              bDynamics,
2088                t_commrec            *cr,
2089                struct gmx_lincsdata *li)
2090 {
2091     int          natoms, nflexcon;
2092     t_blocka     at2con;
2093     t_iatom     *iatom;
2094     int          i, ncc_alloc_old, ncon_tot;
2095
2096     li->nc_real = 0;
2097     li->nc      = 0;
2098     li->ncc     = 0;
2099     /* Zero the thread index ranges.
2100      * Otherwise without local constraints we could return with old ranges.
2101      */
2102     for (i = 0; i < li->ntask; i++)
2103     {
2104         li->task[i].b0   = 0;
2105         li->task[i].b1   = 0;
2106         li->task[i].nind = 0;
2107     }
2108     if (li->ntask > 1)
2109     {
2110         li->task[li->ntask].nind = 0;
2111     }
2112
2113     /* This is the local topology, so there are only F_CONSTR constraints */
2114     if (idef->il[F_CONSTR].nr == 0)
2115     {
2116         /* There are no constraints,
2117          * we do not need to fill any data structures.
2118          */
2119         return;
2120     }
2121
2122     if (debug)
2123     {
2124         fprintf(debug, "Building the LINCS connectivity\n");
2125     }
2126
2127     if (DOMAINDECOMP(cr))
2128     {
2129         if (cr->dd->constraints)
2130         {
2131             int start;
2132
2133             dd_get_constraint_range(cr->dd, &start, &natoms);
2134         }
2135         else
2136         {
2137             natoms = cr->dd->nat_home;
2138         }
2139     }
2140     else
2141     {
2142         natoms = md->homenr;
2143     }
2144     at2con = make_at2con(0, natoms, idef->il, idef->iparams, bDynamics,
2145                          &nflexcon);
2146
2147     ncon_tot = idef->il[F_CONSTR].nr/3;
2148
2149     /* Ensure we have enough padding for aligned loads for each thread */
2150     if (ncon_tot + li->ntask*simd_width > li->nc_alloc || li->nc_alloc == 0)
2151     {
2152         li->nc_alloc = over_alloc_dd(ncon_tot + li->ntask*simd_width);
2153         srenew(li->con_index, li->nc_alloc);
2154         resize_real_aligned(&li->bllen0, li->nc_alloc);
2155         resize_real_aligned(&li->ddist, li->nc_alloc);
2156         srenew(li->bla, 2*li->nc_alloc);
2157         resize_real_aligned(&li->blc, li->nc_alloc);
2158         resize_real_aligned(&li->blc1, li->nc_alloc);
2159         srenew(li->blnr, li->nc_alloc + 1);
2160         resize_real_aligned(&li->bllen, li->nc_alloc);
2161         srenew(li->tmpv, li->nc_alloc);
2162         if (DOMAINDECOMP(cr))
2163         {
2164             srenew(li->nlocat, li->nc_alloc);
2165         }
2166         resize_real_aligned(&li->tmp1, li->nc_alloc);
2167         resize_real_aligned(&li->tmp2, li->nc_alloc);
2168         resize_real_aligned(&li->tmp3, li->nc_alloc);
2169         resize_real_aligned(&li->tmp4, li->nc_alloc);
2170         resize_real_aligned(&li->mlambda, li->nc_alloc);
2171     }
2172
2173     iatom = idef->il[F_CONSTR].iatoms;
2174
2175     ncc_alloc_old = li->ncc_alloc;
2176     li->blnr[0]   = li->ncc;
2177
2178     /* Assign the constraints for li->ntask LINCS tasks.
2179      * We target a uniform distribution of constraints over the tasks.
2180      * Note that when flexible constraints are present, but are removed here
2181      * (e.g. because we are doing EM) we get imbalance, but since that doesn't
2182      * happen during normal MD, that's ok.
2183      */
2184     int ncon_assign, ncon_target, con, th;
2185
2186     /* Determine the number of constraints we need to assign here */
2187     ncon_assign      = ncon_tot;
2188     if (!bDynamics)
2189     {
2190         /* With energy minimization, flexible constraints are ignored
2191          * (and thus minimized, as they should be).
2192          */
2193         ncon_assign -= nflexcon;
2194     }
2195
2196     /* Set the target constraint count per task to exactly uniform,
2197      * this might be overridden below.
2198      */
2199     ncon_target = (ncon_assign + li->ntask - 1)/li->ntask;
2200
2201     /* Mark all constraints as unassigned by setting their index to -1 */
2202     for (con = 0; con < ncon_tot; con++)
2203     {
2204         li->con_index[con] = -1;
2205     }
2206
2207     con = 0;
2208     for (th = 0; th < li->ntask; th++)
2209     {
2210         lincs_task_t *li_task;
2211
2212         li_task = &li->task[th];
2213
2214 #ifdef LINCS_SIMD
2215         /* With indepedent tasks we likely have H-bond constraints or constraint
2216          * pairs. The connected constraints will be pulled into the task, so the
2217          * constraints per task will often exceed ncon_target.
2218          * Triangle constraints can also increase the count, but there are
2219          * relatively few of those, so we usually expect to get ncon_target.
2220          */
2221         if (li->bTaskDep)
2222         {
2223             /* We round ncon_target to a multiple of GMX_SIMD_WIDTH,
2224              * since otherwise a lot of operations can be wasted.
2225              * There are several ways to round here, we choose the one
2226              * that alternates block sizes, which helps with Intel HT.
2227              */
2228             ncon_target = ((ncon_assign*(th + 1))/li->ntask - li->nc_real + GMX_SIMD_REAL_WIDTH - 1) & ~(GMX_SIMD_REAL_WIDTH - 1);
2229         }
2230 #endif
2231
2232         /* Continue filling the arrays where we left off with the previous task,
2233          * including padding for SIMD.
2234          */
2235         li_task->b0 = li->nc;
2236
2237         while (con < ncon_tot && li->nc - li_task->b0 < ncon_target)
2238         {
2239             if (li->con_index[con] == -1)
2240             {
2241                 int  type, a1, a2;
2242                 real lenA, lenB;
2243
2244                 type   = iatom[3*con];
2245                 a1     = iatom[3*con + 1];
2246                 a2     = iatom[3*con + 2];
2247                 lenA   = idef->iparams[type].constr.dA;
2248                 lenB   = idef->iparams[type].constr.dB;
2249                 /* Skip the flexible constraints when not doing dynamics */
2250                 if (bDynamics || lenA != 0 || lenB != 0)
2251                 {
2252                     assign_constraint(li, con, a1, a2, lenA, lenB, &at2con);
2253
2254                     if (li->ntask > 1 && !li->bTaskDep)
2255                     {
2256                         /* We can generate independent tasks. Check if we
2257                          * need to assign connected constraints to our task.
2258                          */
2259                         check_assign_connected(li, iatom, idef, bDynamics,
2260                                                a1, a2, &at2con);
2261                     }
2262                     if (li->ntask > 1 && li->ncg_triangle > 0)
2263                     {
2264                         /* Ensure constraints in one triangle are assigned
2265                          * to the same task.
2266                          */
2267                         check_assign_triangle(li, iatom, idef, bDynamics,
2268                                               con, a1, a2, &at2con);
2269                     }
2270                 }
2271             }
2272
2273             con++;
2274         }
2275
2276         li_task->b1 = li->nc;
2277
2278         if (simd_width > 1)
2279         {
2280             /* Copy the last atom pair indices and lengths for constraints
2281              * up to a multiple of simd_width, such that we can do all
2282              * SIMD operations without having to worry about end effects.
2283              */
2284             int i, last;
2285
2286             li->nc = ((li_task->b1 + simd_width - 1)/simd_width)*simd_width;
2287             last   = li_task->b1 - 1;
2288             for (i = li_task->b1; i < li->nc; i++)
2289             {
2290                 li->bla[i*2    ] = li->bla[last*2    ];
2291                 li->bla[i*2 + 1] = li->bla[last*2 + 1];
2292                 li->bllen0[i]    = li->bllen0[last];
2293                 li->ddist[i]     = li->ddist[last];
2294                 li->bllen[i]     = li->bllen[last];
2295                 li->blnr[i + 1]  = li->blnr[last + 1];
2296             }
2297         }
2298
2299         /* Keep track of how many constraints we assigned */
2300         li->nc_real += li_task->b1 - li_task->b0;
2301
2302         if (debug)
2303         {
2304             fprintf(debug, "LINCS task %d constraints %d - %d\n",
2305                     th, li_task->b0, li_task->b1);
2306         }
2307     }
2308
2309     assert(li->nc_real == ncon_assign);
2310
2311     gmx_bool bSortMatrix;
2312
2313     /* Without DD we order the blbnb matrix to optimize memory access.
2314      * With DD the overhead of sorting is more than the gain during access.
2315      */
2316     bSortMatrix = !DOMAINDECOMP(cr);
2317
2318     if (li->ncc > li->ncc_alloc)
2319     {
2320         li->ncc_alloc = over_alloc_small(li->ncc);
2321         srenew(li->blbnb, li->ncc_alloc);
2322     }
2323
2324 #pragma omp parallel for num_threads(li->ntask) schedule(static)
2325     for (th = 0; th < li->ntask; th++)
2326     {
2327         try
2328         {
2329             lincs_task_t *li_task;
2330
2331             li_task = &li->task[th];
2332
2333             if (li->ncg_triangle > 0 &&
2334                 li_task->b1 - li_task->b0 > li_task->tri_alloc)
2335             {
2336                 /* This is allocating too much, but it is difficult to improve */
2337                 li_task->tri_alloc = over_alloc_dd(li_task->b1 - li_task->b0);
2338                 srenew(li_task->triangle, li_task->tri_alloc);
2339                 srenew(li_task->tri_bits, li_task->tri_alloc);
2340             }
2341
2342             set_matrix_indices(li, li_task, &at2con, bSortMatrix);
2343         }
2344         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2345     }
2346
2347     done_blocka(&at2con);
2348
2349     if (cr->dd == NULL)
2350     {
2351         /* Since the matrix is static, we should free some memory */
2352         li->ncc_alloc = li->ncc;
2353         srenew(li->blbnb, li->ncc_alloc);
2354     }
2355
2356     if (li->ncc_alloc > ncc_alloc_old)
2357     {
2358         srenew(li->blmf, li->ncc_alloc);
2359         srenew(li->blmf1, li->ncc_alloc);
2360         srenew(li->tmpncc, li->ncc_alloc);
2361     }
2362
2363     if (DOMAINDECOMP(cr) && dd_constraints_nlocalatoms(cr->dd) != NULL)
2364     {
2365         int *nlocat_dd;
2366
2367         nlocat_dd = dd_constraints_nlocalatoms(cr->dd);
2368
2369         /* Convert nlocat from local topology to LINCS constraint indexing */
2370         for (con = 0; con < ncon_tot; con++)
2371         {
2372             li->nlocat[li->con_index[con]] = nlocat_dd[con];
2373         }
2374     }
2375     else
2376     {
2377         li->nlocat = NULL;
2378     }
2379
2380     if (debug)
2381     {
2382         fprintf(debug, "Number of constraints is %d, padded %d, couplings %d\n",
2383                 li->nc_real, li->nc, li->ncc);
2384     }
2385
2386     if (li->ntask > 1)
2387     {
2388         lincs_thread_setup(li, md->nr);
2389     }
2390
2391     set_lincs_matrix(li, md->invmass, md->lambda);
2392 }
2393
2394 static void lincs_warning(FILE *fplog,
2395                           gmx_domdec_t *dd, rvec *x, rvec *xprime, t_pbc *pbc,
2396                           int ncons, int *bla, real *bllen, real wangle,
2397                           int maxwarn, int *warncount)
2398 {
2399     int  b, i, j;
2400     rvec v0, v1;
2401     real wfac, d0, d1, cosine;
2402     char buf[STRLEN];
2403
2404     wfac = cos(DEG2RAD*wangle);
2405
2406     sprintf(buf, "bonds that rotated more than %g degrees:\n"
2407             " atom 1 atom 2  angle  previous, current, constraint length\n",
2408             wangle);
2409     fprintf(stderr, "%s", buf);
2410     if (fplog)
2411     {
2412         fprintf(fplog, "%s", buf);
2413     }
2414
2415     for (b = 0; b < ncons; b++)
2416     {
2417         i = bla[2*b];
2418         j = bla[2*b+1];
2419         if (pbc)
2420         {
2421             pbc_dx_aiuc(pbc, x[i], x[j], v0);
2422             pbc_dx_aiuc(pbc, xprime[i], xprime[j], v1);
2423         }
2424         else
2425         {
2426             rvec_sub(x[i], x[j], v0);
2427             rvec_sub(xprime[i], xprime[j], v1);
2428         }
2429         d0     = norm(v0);
2430         d1     = norm(v1);
2431         cosine = iprod(v0, v1)/(d0*d1);
2432         if (cosine < wfac)
2433         {
2434             sprintf(buf, " %6d %6d  %5.1f  %8.4f %8.4f    %8.4f\n",
2435                     ddglatnr(dd, i), ddglatnr(dd, j),
2436                     RAD2DEG*acos(cosine), d0, d1, bllen[b]);
2437             fprintf(stderr, "%s", buf);
2438             if (fplog)
2439             {
2440                 fprintf(fplog, "%s", buf);
2441             }
2442             if (!std::isfinite(d1))
2443             {
2444                 gmx_fatal(FARGS, "Bond length not finite.");
2445             }
2446
2447             (*warncount)++;
2448         }
2449     }
2450     if (*warncount > maxwarn)
2451     {
2452         too_many_constraint_warnings(econtLINCS, *warncount);
2453     }
2454 }
2455
2456 static void cconerr(const struct gmx_lincsdata *lincsd,
2457                     rvec *x, t_pbc *pbc,
2458                     real *ncons_loc, real *ssd, real *max, int *imax)
2459 {
2460     const int  *bla, *nlocat;
2461     const real *bllen;
2462     real        ma, ssd2;
2463     int         count, im, task;
2464
2465     bla    = lincsd->bla;
2466     bllen  = lincsd->bllen;
2467     nlocat = lincsd->nlocat;
2468
2469     ma    = 0;
2470     ssd2  = 0;
2471     im    = 0;
2472     count = 0;
2473     for (task = 0; task < lincsd->ntask; task++)
2474     {
2475         int b;
2476
2477         for (b = lincsd->task[task].b0; b < lincsd->task[task].b1; b++)
2478         {
2479             real len, d, r2;
2480             rvec dx;
2481
2482             if (pbc)
2483             {
2484                 pbc_dx_aiuc(pbc, x[bla[2*b]], x[bla[2*b+1]], dx);
2485             }
2486             else
2487             {
2488                 rvec_sub(x[bla[2*b]], x[bla[2*b+1]], dx);
2489             }
2490             r2  = norm2(dx);
2491             len = r2*gmx_invsqrt(r2);
2492             d   = fabs(len/bllen[b]-1);
2493             if (d > ma && (nlocat == NULL || nlocat[b]))
2494             {
2495                 ma = d;
2496                 im = b;
2497             }
2498             if (nlocat == NULL)
2499             {
2500                 ssd2 += d*d;
2501                 count++;
2502             }
2503             else
2504             {
2505                 ssd2  += nlocat[b]*d*d;
2506                 count += nlocat[b];
2507             }
2508         }
2509     }
2510
2511     *ncons_loc = (nlocat ? 0.5 : 1)*count;
2512     *ssd       = (nlocat ? 0.5 : 1)*ssd2;
2513     *max       = ma;
2514     *imax      = im;
2515 }
2516
2517 gmx_bool constrain_lincs(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
2518                          t_inputrec *ir,
2519                          gmx_int64_t step,
2520                          struct gmx_lincsdata *lincsd, t_mdatoms *md,
2521                          t_commrec *cr,
2522                          rvec *x, rvec *xprime, rvec *min_proj,
2523                          matrix box, t_pbc *pbc,
2524                          real lambda, real *dvdlambda,
2525                          real invdt, rvec *v,
2526                          gmx_bool bCalcVir, tensor vir_r_m_dr,
2527                          int econq,
2528                          t_nrnb *nrnb,
2529                          int maxwarn, int *warncount)
2530 {
2531     gmx_bool  bCalcDHDL;
2532     char      buf[STRLEN], buf2[22], buf3[STRLEN];
2533     int       i, p_imax;
2534     real      ncons_loc, p_ssd, p_max = 0;
2535     rvec      dx;
2536     gmx_bool  bOK, bWarn;
2537
2538     bOK = TRUE;
2539
2540     /* This boolean should be set by a flag passed to this routine.
2541      * We can also easily check if any constraint length is changed,
2542      * if not dH/dlambda=0 and we can also set the boolean to FALSE.
2543      */
2544     bCalcDHDL = (ir->efep != efepNO && dvdlambda != NULL);
2545
2546     if (lincsd->nc == 0 && cr->dd == NULL)
2547     {
2548         if (bLog || bEner)
2549         {
2550             lincsd->rmsd_data[0] = 0;
2551             if (ir->eI == eiSD2 && v == NULL)
2552             {
2553                 i = 2;
2554             }
2555             else
2556             {
2557                 i = 1;
2558             }
2559             lincsd->rmsd_data[i] = 0;
2560         }
2561
2562         return bOK;
2563     }
2564
2565     if (econq == econqCoord)
2566     {
2567         /* We can't use bCalcDHDL here, since NULL can be passed for dvdlambda
2568          * also with efep!=fepNO.
2569          */
2570         if (ir->efep != efepNO)
2571         {
2572             if (md->nMassPerturbed && lincsd->matlam != md->lambda)
2573             {
2574                 set_lincs_matrix(lincsd, md->invmass, md->lambda);
2575             }
2576
2577             for (i = 0; i < lincsd->nc; i++)
2578             {
2579                 lincsd->bllen[i] = lincsd->bllen0[i] + lambda*lincsd->ddist[i];
2580             }
2581         }
2582
2583         if (lincsd->ncg_flex)
2584         {
2585             /* Set the flexible constraint lengths to the old lengths */
2586             if (pbc != NULL)
2587             {
2588                 for (i = 0; i < lincsd->nc; i++)
2589                 {
2590                     if (lincsd->bllen[i] == 0)
2591                     {
2592                         pbc_dx_aiuc(pbc, x[lincsd->bla[2*i]], x[lincsd->bla[2*i+1]], dx);
2593                         lincsd->bllen[i] = norm(dx);
2594                     }
2595                 }
2596             }
2597             else
2598             {
2599                 for (i = 0; i < lincsd->nc; i++)
2600                 {
2601                     if (lincsd->bllen[i] == 0)
2602                     {
2603                         lincsd->bllen[i] =
2604                             sqrt(distance2(x[lincsd->bla[2*i]],
2605                                            x[lincsd->bla[2*i+1]]));
2606                     }
2607                 }
2608             }
2609         }
2610
2611         if (bLog && fplog)
2612         {
2613             cconerr(lincsd, xprime, pbc,
2614                     &ncons_loc, &p_ssd, &p_max, &p_imax);
2615         }
2616
2617         /* This bWarn var can be updated by multiple threads
2618          * at the same time. But as we only need to detect
2619          * if a warning occured or not, this is not an issue.
2620          */
2621         bWarn = FALSE;
2622
2623         /* The OpenMP parallel region of constrain_lincs for coords */
2624 #pragma omp parallel num_threads(lincsd->ntask)
2625         {
2626             try
2627             {
2628                 int th = gmx_omp_get_thread_num();
2629
2630                 clear_mat(lincsd->task[th].vir_r_m_dr);
2631
2632                 do_lincs(x, xprime, box, pbc, lincsd, th,
2633                          md->invmass, cr,
2634                          bCalcDHDL,
2635                          ir->LincsWarnAngle, &bWarn,
2636                          invdt, v, bCalcVir,
2637                          th == 0 ? vir_r_m_dr : lincsd->task[th].vir_r_m_dr);
2638             }
2639             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2640         }
2641
2642         if (bLog && fplog && lincsd->nc > 0)
2643         {
2644             fprintf(fplog, "   Rel. Constraint Deviation:  RMS         MAX     between atoms\n");
2645             fprintf(fplog, "       Before LINCS          %.6f    %.6f %6d %6d\n",
2646                     sqrt(p_ssd/ncons_loc), p_max,
2647                     ddglatnr(cr->dd, lincsd->bla[2*p_imax]),
2648                     ddglatnr(cr->dd, lincsd->bla[2*p_imax+1]));
2649         }
2650         if (bLog || bEner)
2651         {
2652             cconerr(lincsd, xprime, pbc,
2653                     &ncons_loc, &p_ssd, &p_max, &p_imax);
2654             /* Check if we are doing the second part of SD */
2655             if (ir->eI == eiSD2 && v == NULL)
2656             {
2657                 i = 2;
2658             }
2659             else
2660             {
2661                 i = 1;
2662             }
2663             lincsd->rmsd_data[0] = ncons_loc;
2664             lincsd->rmsd_data[i] = p_ssd;
2665         }
2666         else
2667         {
2668             lincsd->rmsd_data[0] = 0;
2669             lincsd->rmsd_data[1] = 0;
2670             lincsd->rmsd_data[2] = 0;
2671         }
2672         if (bLog && fplog && lincsd->nc > 0)
2673         {
2674             fprintf(fplog,
2675                     "        After LINCS          %.6f    %.6f %6d %6d\n\n",
2676                     sqrt(p_ssd/ncons_loc), p_max,
2677                     ddglatnr(cr->dd, lincsd->bla[2*p_imax]),
2678                     ddglatnr(cr->dd, lincsd->bla[2*p_imax+1]));
2679         }
2680
2681         if (bWarn)
2682         {
2683             if (maxwarn >= 0)
2684             {
2685                 cconerr(lincsd, xprime, pbc,
2686                         &ncons_loc, &p_ssd, &p_max, &p_imax);
2687                 if (MULTISIM(cr))
2688                 {
2689                     sprintf(buf3, " in simulation %d", cr->ms->sim);
2690                 }
2691                 else
2692                 {
2693                     buf3[0] = 0;
2694                 }
2695                 sprintf(buf, "\nStep %s, time %g (ps)  LINCS WARNING%s\n"
2696                         "relative constraint deviation after LINCS:\n"
2697                         "rms %.6f, max %.6f (between atoms %d and %d)\n",
2698                         gmx_step_str(step, buf2), ir->init_t+step*ir->delta_t,
2699                         buf3,
2700                         sqrt(p_ssd/ncons_loc), p_max,
2701                         ddglatnr(cr->dd, lincsd->bla[2*p_imax]),
2702                         ddglatnr(cr->dd, lincsd->bla[2*p_imax+1]));
2703                 if (fplog)
2704                 {
2705                     fprintf(fplog, "%s", buf);
2706                 }
2707                 fprintf(stderr, "%s", buf);
2708                 lincs_warning(fplog, cr->dd, x, xprime, pbc,
2709                               lincsd->nc, lincsd->bla, lincsd->bllen,
2710                               ir->LincsWarnAngle, maxwarn, warncount);
2711             }
2712             bOK = (p_max < 0.5);
2713         }
2714
2715         if (lincsd->ncg_flex)
2716         {
2717             for (i = 0; (i < lincsd->nc); i++)
2718             {
2719                 if (lincsd->bllen0[i] == 0 && lincsd->ddist[i] == 0)
2720                 {
2721                     lincsd->bllen[i] = 0;
2722                 }
2723             }
2724         }
2725     }
2726     else
2727     {
2728         /* The OpenMP parallel region of constrain_lincs for derivatives */
2729 #pragma omp parallel num_threads(lincsd->ntask)
2730         {
2731             try
2732             {
2733                 int th = gmx_omp_get_thread_num();
2734
2735                 do_lincsp(x, xprime, min_proj, pbc, lincsd, th,
2736                           md->invmass, econq, bCalcDHDL,
2737                           bCalcVir, th == 0 ? vir_r_m_dr : lincsd->task[th].vir_r_m_dr);
2738             }
2739             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2740         }
2741     }
2742
2743     if (bCalcDHDL)
2744     {
2745         /* Reduce the dH/dlambda contributions over the threads */
2746         real dhdlambda;
2747         int  th;
2748
2749         dhdlambda = 0;
2750         for (th = 0; th < lincsd->ntask; th++)
2751         {
2752             dhdlambda += lincsd->task[th].dhdlambda;
2753         }
2754         if (econq == econqCoord)
2755         {
2756             /* dhdlambda contains dH/dlambda*dt^2, correct for this */
2757             /* TODO This should probably use invdt, so that sd integrator scaling works properly */
2758             dhdlambda /= ir->delta_t*ir->delta_t;
2759         }
2760         *dvdlambda += dhdlambda;
2761     }
2762
2763     if (bCalcVir && lincsd->ntask > 1)
2764     {
2765         for (i = 1; i < lincsd->ntask; i++)
2766         {
2767             m_add(vir_r_m_dr, lincsd->task[i].vir_r_m_dr, vir_r_m_dr);
2768         }
2769     }
2770
2771     /* count assuming nit=1 */
2772     inc_nrnb(nrnb, eNR_LINCS, lincsd->nc_real);
2773     inc_nrnb(nrnb, eNR_LINCSMAT, (2+lincsd->nOrder)*lincsd->ncc);
2774     if (lincsd->ntriangle > 0)
2775     {
2776         inc_nrnb(nrnb, eNR_LINCSMAT, lincsd->nOrder*lincsd->ncc_triangle);
2777     }
2778     if (v)
2779     {
2780         inc_nrnb(nrnb, eNR_CONSTR_V, lincsd->nc_real*2);
2781     }
2782     if (bCalcVir)
2783     {
2784         inc_nrnb(nrnb, eNR_CONSTR_VIR, lincsd->nc_real);
2785     }
2786
2787     return bOK;
2788 }