From b2226136c5a2ec0ffc36ea3dc4fb1e42028a11f1 Mon Sep 17 00:00:00 2001
From: Berk Hess <hess@kth.se>
Date: Tue, 2 Feb 2016 10:38:11 +0100
Subject: [PATCH] Converted csettle() to use real/SimdReal template

The csettle() routine for constraining coordinates has been converted
to call a template function that is instantiated with either real or
SimdReal. On Haswell this makes settle a factor 5 faster.
Added reorganized indices to the SETTLE data structure, these are set
by a new function settle_set_constraints analogous to LINCS.
Reorganized the SETTLE initialization.
The settle fatal error no longer report the atom index of (one of the)
problematic water molecule, but the pdb dumps are more useful anyhow.

Change-Id: I61d81ad8a0add6fe234f8c7b5b44dc8c7084ace9
---
 src/gromacs/mdlib/constr.cpp  | 180 ++++-----
 src/gromacs/mdlib/constr.h    |  33 +-
 src/gromacs/mdlib/csettle.cpp | 876 +++++++++++++++++++++++++++---------------
 3 files changed, 659 insertions(+), 430 deletions(-)

diff --git a/src/gromacs/mdlib/constr.cpp b/src/gromacs/mdlib/constr.cpp
index 5b850d0f56..d926204813 100644
--- a/src/gromacs/mdlib/constr.cpp
+++ b/src/gromacs/mdlib/constr.cpp
@@ -90,12 +90,14 @@ typedef struct gmx_constr {
     int                maxwarn;        /* The maximum number of warnings     */
     int                warncount_lincs;
     int                warncount_settle;
-    gmx_edsam_t        ed;            /* The essential dynamics data        */
+    gmx_edsam_t        ed;             /* The essential dynamics data        */
 
-    tensor            *vir_r_m_dr_th; /* Thread local working data          */
-    int               *settle_error;  /* Thread local working data          */
+    /* Thread local working data */
+    tensor            *vir_r_m_dr_th;           /* Thread virial contribution */
+    bool              *bSettleErrorHasOccurred; /* Did a settle error occur?  */
 
-    const gmx_mtop_t  *warn_mtop;     /* Only used for printing warnings    */
+    /* Only used for printing warnings */
+    const gmx_mtop_t  *warn_mtop;     /* Pointer to the global topology     */
 } t_gmx_constr;
 
 typedef struct {
@@ -281,8 +283,6 @@ gmx_bool constrain(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
 {
     gmx_bool    bOK, bDump;
     int         start, homenr;
-    int         i, j;
-    int         settle_error;
     tensor      vir_r_m_dr;
     real        scaled_delta_t;
     real        invdt, vir_fac = 0, t;
@@ -343,14 +343,6 @@ gmx_bool constrain(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
         nth = 1;
     }
 
-    if (nth > 1 && constr->vir_r_m_dr_th == NULL)
-    {
-        snew(constr->vir_r_m_dr_th, nth);
-        snew(constr->settle_error, nth);
-    }
-
-    settle_error = -1;
-
     /* We do not need full pbc when constraints do not cross charge groups,
      * i.e. when dd->constraint_comm==NULL.
      * Note that PBC for constraints is different from PBC for bondeds.
@@ -446,16 +438,7 @@ gmx_bool constrain(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
 
     if (nsettle > 0)
     {
-        int calcvir_atom_end;
-
-        if (vir == NULL)
-        {
-            calcvir_atom_end = 0;
-        }
-        else
-        {
-            calcvir_atom_end = md->homenr;
-        }
+        bool bSettleErrorHasOccurred = false;
 
         switch (econq)
         {
@@ -465,28 +448,19 @@ gmx_bool constrain(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
                 {
                     try
                     {
-                        int start_th, end_th;
-
                         if (th > 0)
                         {
                             clear_mat(constr->vir_r_m_dr_th[th]);
-
-                            constr->settle_error[th] = -1;
                         }
 
-                        start_th = (nsettle* th   )/nth;
-                        end_th   = (nsettle*(th+1))/nth;
-                        if (start_th >= 0 && end_th - start_th > 0)
-                        {
-                            csettle(constr->settled,
-                                    end_th-start_th,
-                                    settle->iatoms+start_th*(1+NRAL(F_SETTLE)),
-                                    pbc_null,
-                                    x[0], xprime[0],
-                                    invdt, v ? v[0] : NULL, calcvir_atom_end,
-                                    th == 0 ? vir_r_m_dr : constr->vir_r_m_dr_th[th],
-                                    th == 0 ? &settle_error : &constr->settle_error[th]);
-                        }
+                        csettle(constr->settled,
+                                nth, th,
+                                pbc_null,
+                                x[0], xprime[0],
+                                invdt, v ? v[0] : NULL,
+                                vir != NULL,
+                                th == 0 ? vir_r_m_dr : constr->vir_r_m_dr_th[th],
+                                th == 0 ? &bSettleErrorHasOccurred : &constr->bSettleErrorHasOccurred[th]);
                     }
                     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
                 }
@@ -509,15 +483,24 @@ gmx_bool constrain(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
                 {
                     try
                     {
-                        int start_th, end_th;
+                        int calcvir_atom_end;
+
+                        if (vir == NULL)
+                        {
+                            calcvir_atom_end = 0;
+                        }
+                        else
+                        {
+                            calcvir_atom_end = md->homenr;
+                        }
 
                         if (th > 0)
                         {
                             clear_mat(constr->vir_r_m_dr_th[th]);
                         }
 
-                        start_th = (nsettle* th   )/nth;
-                        end_th   = (nsettle*(th+1))/nth;
+                        int start_th = (nsettle* th   )/nth;
+                        int end_th   = (nsettle*(th+1))/nth;
 
                         if (start_th >= 0 && end_th - start_th > 0)
                         {
@@ -541,34 +524,31 @@ gmx_bool constrain(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
             default:
                 gmx_incons("Unknown constraint quantity for settle");
         }
-    }
 
-    if (settle->nr > 0)
-    {
         if (vir != NULL)
         {
             /* Reduce the virial contributions over the threads */
-            for (i = 1; i < nth; i++)
+            for (int th = 1; th < nth; th++)
             {
-                m_add(vir_r_m_dr, constr->vir_r_m_dr_th[i], vir_r_m_dr);
+                m_add(vir_r_m_dr, constr->vir_r_m_dr_th[th], vir_r_m_dr);
             }
         }
 
         if (econq == econqCoord)
         {
-            for (i = 1; i < nth; i++)
+            for (int th = 1; th < nth; th++)
             {
-                settle_error = std::max(settle_error, constr->settle_error[i]);
+                bSettleErrorHasOccurred = bSettleErrorHasOccurred || constr->bSettleErrorHasOccurred[th];
             }
 
-            if (settle_error >= 0)
+            if (bSettleErrorHasOccurred)
             {
                 dump_confs(fplog, step, constr->warn_mtop, start, homenr, cr, x, xprime, box);
 
                 gmx_fatal(FARGS,
-                          "\nstep " "%" GMX_PRId64 ": Water molecule starting at atom %d can not be "
-                          "settled.\nCheck for bad contacts and/or reduce the timestep if appropriate.\n",
-                          step, ddglatnr(cr->dd, settle->iatoms[settle_error*(1+NRAL(F_SETTLE))+1]));
+                          "\nstep " "%" GMX_PRId64 ": One or more water molecules can not be settled.\n"
+                          "Check for bad contacts and/or reduce the timestep if appropriate.\n",
+                          step);
             }
         }
     }
@@ -603,9 +583,9 @@ gmx_bool constrain(FILE *fplog, gmx_bool bLog, gmx_bool bEner,
         {
             vir_fac *= 2;  /* only constraining over half the distance here */
         }
-        for (i = 0; i < DIM; i++)
+        for (int i = 0; i < DIM; i++)
         {
-            for (j = 0; j < DIM; j++)
+            for (int j = 0; j < DIM; j++)
             {
                 (*vir)[i][j] = vir_fac*vir_r_m_dr[i][j];
             }
@@ -918,19 +898,14 @@ void set_constraints(struct gmx_constr *constr,
                      gmx_localtop_t *top, const t_inputrec *ir,
                      const t_mdatoms *md, t_commrec *cr)
 {
-    t_idef        *idef;
-    int            ncons;
-    const t_ilist *settle;
-    int            iO, iH;
-
-    idef = &top->idef;
+    t_idef *idef = &top->idef;
 
     if (constr->ncon_tot > 0)
     {
         /* We are using the local topology,
          * so there are only F_CONSTR constraints.
          */
-        ncons = idef->il[F_CONSTR].nr/3;
+        int ncons = idef->il[F_CONSTR].nr/3;
 
         /* With DD we might also need to call LINCS with ncons=0 for
          * communicating coordinates to other nodes that do have constraints.
@@ -957,16 +932,10 @@ void set_constraints(struct gmx_constr *constr,
         }
     }
 
-    if (idef->il[F_SETTLE].nr > 0 && constr->settled == NULL)
+    if (constr->settled)
     {
-        settle          = &idef->il[F_SETTLE];
-        iO              = settle->iatoms[1];
-        iH              = settle->iatoms[2];
-        constr->settled =
-            settle_init(md->massT[iO], md->massT[iH],
-                        md->invmass[iO], md->invmass[iH],
-                        idef->iparams[settle->iatoms[0]].settle.doh,
-                        idef->iparams[settle->iatoms[0]].settle.dhh);
+        settle_set_constraints(constr->settled,
+                               &idef->il[F_SETTLE], md);
     }
 
     /* Make a selection of the local atoms for essential dynamics */
@@ -1167,39 +1136,37 @@ gmx_constr_t init_constraints(FILE *fplog,
                               gmx_edsam_t ed, t_state *state,
                               t_commrec *cr)
 {
-    int                  ncon, nset, nmol, settle_type, i, mt, nflexcon;
-    struct gmx_constr   *constr;
-    char                *env;
-    t_ilist             *ilist;
-    gmx_mtop_ilistloop_t iloop;
-
-    ncon =
+    int nconstraints =
         gmx_mtop_ftype_count(mtop, F_CONSTR) +
         gmx_mtop_ftype_count(mtop, F_CONSTRNC);
-    nset = gmx_mtop_ftype_count(mtop, F_SETTLE);
+    int nsettles =
+        gmx_mtop_ftype_count(mtop, F_SETTLE);
 
-    if (ncon+nset == 0 &&
+    if (nconstraints + nsettles == 0 &&
         !(ir->bPull && pull_have_constraint(ir->pull_work)) &&
         ed == NULL)
     {
         return NULL;
     }
 
+    struct gmx_constr *constr;
+
     snew(constr, 1);
 
-    constr->ncon_tot = ncon;
+    constr->ncon_tot = nconstraints;
     constr->nflexcon = 0;
-    if (ncon > 0)
+    if (nconstraints > 0)
     {
         constr->n_at2con_mt = mtop->nmoltype;
         snew(constr->at2con_mt, constr->n_at2con_mt);
-        for (mt = 0; mt < mtop->nmoltype; mt++)
+        for (int mt = 0; mt < mtop->nmoltype; mt++)
         {
+            int nflexcon;
             constr->at2con_mt[mt] = make_at2con(0, mtop->moltype[mt].atoms.nr,
                                                 mtop->moltype[mt].ilist,
                                                 mtop->ffparams.iparams,
                                                 EI_DYNAMICS(ir->eI), &nflexcon);
-            for (i = 0; i < mtop->nmolblock; i++)
+            for (int i = 0; i < mtop->nmolblock; i++)
             {
                 if (mtop->molblock[i].type == mt)
                 {
@@ -1258,53 +1225,40 @@ gmx_constr_t init_constraints(FILE *fplog,
         }
     }
 
-    if (nset > 0)
+    if (nsettles > 0)
     {
         please_cite(fplog, "Miyamoto92a");
 
         constr->bInterCGsettles = inter_charge_group_settles(mtop);
 
-        /* Check that we have only one settle type */
-        settle_type = -1;
-        iloop       = gmx_mtop_ilistloop_init(mtop);
-        while (gmx_mtop_ilistloop_next(iloop, &ilist, &nmol))
-        {
-            for (i = 0; i < ilist[F_SETTLE].nr; i += 4)
-            {
-                if (settle_type == -1)
-                {
-                    settle_type = ilist[F_SETTLE].iatoms[i];
-                }
-                else if (ilist[F_SETTLE].iatoms[i] != settle_type)
-                {
-                    gmx_fatal(FARGS,
-                              "The [molecules] section of your topology specifies more than one block of\n"
-                              "a [moleculetype] with a [settles] block. Only one such is allowed. If you\n"
-                              "are trying to partition your solvent into different *groups* (e.g. for\n"
-                              "freezing, T-coupling, etc.) then you are using the wrong approach. Index\n"
-                              "files specify groups. Otherwise, you may wish to change the least-used\n"
-                              "block of molecules with SETTLE constraints into 3 normal constraints.");
-                }
-            }
-        }
+        constr->settled         = settle_init(mtop);
 
+        /* Make an atom to settle index for use in domain decomposition */
         constr->n_at2settle_mt = mtop->nmoltype;
         snew(constr->at2settle_mt, constr->n_at2settle_mt);
-        for (mt = 0; mt < mtop->nmoltype; mt++)
+        for (int mt = 0; mt < mtop->nmoltype; mt++)
         {
             constr->at2settle_mt[mt] =
                 make_at2settle(mtop->moltype[mt].atoms.nr,
                                &mtop->moltype[mt].ilist[F_SETTLE]);
         }
+
+        /* Allocate thread-local work arrays */
+        int nthreads = gmx_omp_nthreads_get(emntSETTLE);
+        if (nthreads > 1 && constr->vir_r_m_dr_th == NULL)
+        {
+            snew(constr->vir_r_m_dr_th, nthreads);
+            snew(constr->bSettleErrorHasOccurred, nthreads);
+        }
     }
 
-    if ((ncon + nset) > 0 && ir->epc == epcMTTK)
+    if (nconstraints + nsettles > 0 && ir->epc == epcMTTK)
     {
         gmx_fatal(FARGS, "Constraints are not implemented with MTTK pressure control.");
     }
 
     constr->maxwarn = 999;
-    env             = getenv("GMX_MAXCONSTRWARN");
+    char *env       = getenv("GMX_MAXCONSTRWARN");
     if (env)
     {
         constr->maxwarn = 0;
diff --git a/src/gromacs/mdlib/constr.h b/src/gromacs/mdlib/constr.h
index ee0558030a..50c9a1b463 100644
--- a/src/gromacs/mdlib/constr.h
+++ b/src/gromacs/mdlib/constr.h
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015,2016, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -115,22 +115,29 @@ gmx_bool bshakef(FILE           *log,          /* Log file			*/
  * Return TRUE when OK, FALSE when shake-error
  */
 
-gmx_settledata_t settle_init(real mO, real mH, real invmO, real invmH,
-                             real dOH, real dHH);
+gmx_settledata_t settle_init(const gmx_mtop_t *mtop);
 /* Initializes and returns a structure with SETTLE parameters */
 
-void csettle(gmx_settledata_t    settled,
-             int                 nsettle,          /* Number of settles            */
-             t_iatom             iatoms[],         /* The settle iatom list        */
+void settle_set_constraints(gmx_settledata_t  settled,
+                            const t_ilist    *il_settle,
+                            const t_mdatoms  *mdatoms);
+/* Set up the indices for the settle constraints */
+
+void csettle(gmx_settledata_t    settled,          /* The SETTLE structure */
+             int                 nthread,          /* The number of threads used */
+             int                 thread,           /* Our thread index */
              const struct t_pbc *pbc,              /* PBC data pointer, can be NULL */
-             real                b4[],             /* Old coordinates              */
-             real                after[],          /* New coords, to be settled    */
-             real                invdt,            /* 1/delta_t                    */
-             real               *v,                /* Also constrain v if v!=NULL  */
-             int                 calcvir_atom_end, /* Calculate r x m delta_r up to this atom */
-             tensor              vir_r_m_dr,       /* sum r x m delta_r            */
-             int                *xerror
+             const real          x[],              /* Reference coordinates */
+             real                xprime[],         /* New coords, to be settled */
+             real                invdt,            /* 1/delta_t */
+             real               *v,                /* Also constrain v if v!=NULL */
+             bool                bCalcVirial,      /* Calculate the virial contribution */
+             tensor              vir_r_m_dr,       /* sum r x m delta_r */
+             bool               *bErrorHasOccurred /* True if a settle error occurred */
              );
+/* Constrain coordinates using SETTLE.
+ * Can be called on any number of threads.
+ */
 
 void settle_proj(gmx_settledata_t settled, int econq,
                  int nsettle, t_iatom iatoms[],
diff --git a/src/gromacs/mdlib/csettle.cpp b/src/gromacs/mdlib/csettle.cpp
index 8a2cebb018..fd4bd7e3c4 100644
--- a/src/gromacs/mdlib/csettle.cpp
+++ b/src/gromacs/mdlib/csettle.cpp
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015,2016, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -36,18 +36,31 @@
  */
 #include "gmxpre.h"
 
+#include "config.h"
+
+#include <assert.h>
 #include <math.h>
 #include <stdio.h>
 
+#include <algorithm>
+
 #include "gromacs/math/functions.h"
 #include "gromacs/math/invertmatrix.h"
 #include "gromacs/math/vec.h"
 #include "gromacs/mdlib/constr.h"
+#include "gromacs/mdtypes/mdatom.h"
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/pbcutil/pbc.h"
+#include "gromacs/pbcutil/pbc-simd.h"
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
+#include "gromacs/topology/mtop_util.h"
 #include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/gmxassert.h"
 #include "gromacs/utility/smalloc.h"
 
+using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
+
 typedef struct
 {
     real   mO;
@@ -69,8 +82,17 @@ typedef struct
 
 typedef struct gmx_settledata
 {
-    settleparam_t massw;
-    settleparam_t mass1;
+    settleparam_t massw;    /* Parameters for SETTLE for coordinates */
+    settleparam_t mass1;    /* Parameters with all masses 1, for forces */
+
+    int           nsettle;  /* The number of settles on our rank */
+    int          *ow1;      /* Index to OW1 atoms, size nsettle + SIMD padding */
+    int          *hw2;      /* Index to HW2 atoms, size nsettle + SIMD padding */
+    int          *hw3;      /* Index to HW3 atoms, size nsettle + SIMD padding */
+    real         *virfac;   /* Virial factor 0 or 1, size nsettle + SIMD pad. */
+    int           nalloc;   /* Allocation size of ow1, hw2, hw3, virfac */
+
+    bool          bUseSimd; /* Use SIMD intrinsics code, if possible */
 } t_gmx_settledata;
 
 
@@ -137,37 +159,132 @@ static void settleparam_init(settleparam_t *p,
     }
 }
 
-gmx_settledata_t settle_init(real mO, real mH, real invmO, real invmH,
-                             real dOH, real dHH)
+gmx_settledata_t settle_init(const gmx_mtop_t *mtop)
 {
+    /* Check that we have only one settle type */
+    int                   settle_type = -1;
+    gmx_mtop_ilistloop_t  iloop       = gmx_mtop_ilistloop_init(mtop);
+    t_ilist              *ilist;
+    int                   nmol;
+    const int             nral1       = 1 + NRAL(F_SETTLE);
+    while (gmx_mtop_ilistloop_next(iloop, &ilist, &nmol))
+    {
+        for (int i = 0; i < ilist[F_SETTLE].nr; i += nral1)
+        {
+            if (settle_type == -1)
+            {
+                settle_type = ilist[F_SETTLE].iatoms[i];
+            }
+            else if (ilist[F_SETTLE].iatoms[i] != settle_type)
+            {
+                gmx_fatal(FARGS,
+                          "The [molecules] section of your topology specifies more than one block of\n"
+                          "a [moleculetype] with a [settles] block. Only one such is allowed.\n"
+                          "If you are trying to partition your solvent into different *groups*\n"
+                          "(e.g. for freezing, T-coupling, etc.), you are using the wrong approach. Index\n"
+                          "files specify groups. Otherwise, you may wish to change the least-used\n"
+                          "block of molecules with SETTLE constraints into 3 normal constraints.");
+            }
+        }
+    }
+    GMX_RELEASE_ASSERT(settle_type >= 0, "settle_init called without settles");
+
     gmx_settledata_t settled;
 
     snew(settled, 1);
 
-    settleparam_init(&settled->massw, mO, mH, invmO, invmH, dOH, dHH);
+    /* We will not initialize the normal SETTLE parameters here yet,
+     * since the atom (inv)masses can depend on the integrator and
+     * free-energy perturbation. We set mO=-1 to trigger later initialization.
+     */
+    settled->massw.mO = -1;
 
+    real dOH = mtop->ffparams.iparams[settle_type].settle.doh;
+    real dHH = mtop->ffparams.iparams[settle_type].settle.dhh;
     settleparam_init(&settled->mass1, 1.0, 1.0, 1.0, 1.0, dOH, dHH);
 
+    settled->ow1    = NULL;
+    settled->hw2    = NULL;
+    settled->hw3    = NULL;
+    settled->virfac = NULL;
+    settled->nalloc = 0;
+
+    /* Without SIMD configured, this bool is not used */
+    settled->bUseSimd = (getenv("GMX_DISABLE_SIMD_KERNELS") == NULL);
+
     return settled;
 }
 
-#ifdef DEBUG
-static void check_cons(FILE *fp, char *title, real x[], int OW1, int HW2, int HW3)
+void settle_set_constraints(gmx_settledata_t  settled,
+                            const t_ilist    *il_settle,
+                            const t_mdatoms  *mdatoms)
 {
-    rvec dOH1, dOH2, dHH;
-    int  m;
+#if GMX_SIMD_HAVE_REAL
+    const int pack_size = GMX_SIMD_REAL_WIDTH;
+#else
+    const int pack_size = 1;
+#endif
+
+    const int nral1     = 1 + NRAL(F_SETTLE);
+    int       nsettle   = il_settle->nr/nral1;
+    settled->nsettle = nsettle;
 
-    for (m = 0; (m < DIM); m++)
+    if (nsettle > 0)
     {
-        dOH1[m] = x[OW1+m]-x[HW2+m];
-        dOH2[m] = x[OW1+m]-x[HW3+m];
-        dHH[m]  = x[HW2+m]-x[HW3+m];
+        const t_iatom *iatoms = il_settle->iatoms;
+
+        /* Here we initialize the normal SETTLE parameters */
+        if (settled->massw.mO < 0)
+        {
+            int firstO = iatoms[1];
+            int firstH = iatoms[2];
+            settleparam_init(&settled->massw,
+                             mdatoms->massT[firstO],
+                             mdatoms->massT[firstH],
+                             mdatoms->invmass[firstO],
+                             mdatoms->invmass[firstH],
+                             settled->mass1.dOH,
+                             settled->mass1.dHH);
+        }
+
+        if (nsettle + pack_size > settled->nalloc)
+        {
+            settled->nalloc = over_alloc_dd(nsettle + pack_size);
+            sfree_aligned(settled->ow1);
+            sfree_aligned(settled->hw2);
+            sfree_aligned(settled->hw3);
+            sfree_aligned(settled->virfac);
+            snew_aligned(settled->ow1, settled->nalloc, 64);
+            snew_aligned(settled->hw2, settled->nalloc, 64);
+            snew_aligned(settled->hw3, settled->nalloc, 64);
+            snew_aligned(settled->virfac, settled->nalloc, 64);
+        }
+
+        for (int i = 0; i < nsettle; i++)
+        {
+            settled->ow1[i]    = iatoms[i*nral1 + 1];
+            settled->hw2[i]    = iatoms[i*nral1 + 2];
+            settled->hw3[i]    = iatoms[i*nral1 + 3];
+            /* We should avoid double counting of virial contributions for
+             * SETTLEs that appear in multiple DD domains, so we only count
+             * the contribution on the home range of the oxygen atom.
+             */
+            settled->virfac[i] = (iatoms[i*nral1 + 1] < mdatoms->homenr ? 1 : 0);
+        }
+
+        /* Pack the index array to the full SIMD width with copies from
+         * the last normal entry, but with no virial contribution.
+         */
+        int end_packed = ((nsettle + pack_size - 1)/pack_size)*pack_size;
+        for (int i = nsettle; i < end_packed; i++)
+        {
+            settled->ow1[i]    = settled->ow1[nsettle - 1];
+            settled->hw2[i]    = settled->hw2[nsettle - 1];
+            settled->hw3[i]    = settled->hw3[nsettle - 1];
+            settled->virfac[i] = 0;
+        }
     }
-    fprintf(fp, "%10s, OW1=%3d, HW2=%3d, HW3=%3d,  dOH1: %8.3f, dOH2: %8.3f, dHH: %8.3f\n",
-            title, OW1/DIM, HW2/DIM, HW3/DIM, norm(dOH1), norm(dOH2), norm(dHH));
 }
-#endif
-
 
 void settle_proj(gmx_settledata_t settled, int econq,
                  int nsettle, t_iatom iatoms[],
@@ -209,11 +326,13 @@ void settle_proj(gmx_settledata_t settled, int econq,
 #pragma ivdep
 #endif
 
+    const int nral1 = 1 + NRAL(F_SETTLE);
+
     for (i = 0; i < nsettle; i++)
     {
-        ow1 = iatoms[i*4+1];
-        hw2 = iatoms[i*4+2];
-        hw3 = iatoms[i*4+3];
+        ow1 = iatoms[i*nral1 + 1];
+        hw2 = iatoms[i*nral1 + 2];
+        hw3 = iatoms[i*nral1 + 3];
 
         if (pbc == NULL)
         {
@@ -277,126 +396,105 @@ void settle_proj(gmx_settledata_t settled, int econq,
 }
 
 
-void csettle(gmx_settledata_t settled,
-             int nsettle, t_iatom iatoms[],
-             const t_pbc *pbc,
-             real b4[], real after[],
-             real invdt, real *v, int CalcVirAtomEnd,
-             tensor vir_r_m_dr,
-             int *error)
+/* The actual settle code, templated for real/SimdReal and for optimization */
+template<typename T, typename TypeBool, int packSize,
+         typename TypePbc,
+         bool bCorrectVelocity,
+         bool bCalcVirial>
+static void settleTemplate(const gmx_settledata_t settled,
+                           int settleStart, int settleEnd,
+                           const TypePbc pbc,
+                           const real *x, real *xprime,
+                           real invdt, real * gmx_restrict v,
+                           tensor vir_r_m_dr,
+                           bool *bErrorHasOccurred)
 {
-    /* ***************************************************************** */
-    /*                                                               ** */
-    /*    Subroutine : setlep - reset positions of TIP3P waters      ** */
-    /*    Author : Shuichi Miyamoto                                  ** */
-    /*    Date of last update : Oct. 1, 1992                         ** */
-    /*                                                               ** */
-    /*    Reference for the SETTLE algorithm                         ** */
-    /*           S. Miyamoto et al., J. Comp. Chem., 13, 952 (1992). ** */
-    /*                                                               ** */
-    /* ***************************************************************** */
-
-    /* Initialized data */
-    settleparam_t *p;
-    real           wh, ra, rb, rc, irc2;
-    real           mO, mH;
-
-    /* Local variables */
-    real     gama, beta, alpa, xcom, ycom, zcom, al2be2, tmp, tmp2;
-    real     axlng, aylng, azlng, trns11, trns21, trns31, trns12, trns22,
-             trns32, trns13, trns23, trns33, cosphi, costhe, sinphi, sinthe,
-             cospsi, xaksxd, yaksxd, xakszd, yakszd, zakszd, zaksxd, xaksyd,
-             xb0, yb0, zb0, xc0, yc0, zc0, xa1;
-    real     ya1, za1, xb1, yb1;
-    real     zb1, xc1, yc1, zc1, yaksyd, zaksyd, sinpsi, xa3, ya3, za3,
-             xb3, yb3, zb3, xc3, yc3, zc3, xb0d, yb0d, xc0d, yc0d,
-             za1d, xb1d, yb1d, zb1d, xc1d, yc1d, zc1d, ya2d, xb2d, yb2d, yc2d,
-             xa3d, ya3d, za3d, xb3d, yb3d, zb3d, xc3d, yc3d, zc3d;
-    real     t1, t2;
-    real     dax, day, daz, dbx, dby, dbz, dcx, dcy, dcz;
-    real     mdax, mday, mdaz, mdbx, mdby, mdbz, mdcx, mdcy, mdcz;
-
-    gmx_bool bOK;
-
-    int      i, ow1, hw2, hw3;
-
-    rvec     dx, sh_hw2 = {0, 0, 0}, sh_hw3 = {0, 0, 0};
-    rvec     doh2, doh3;
-    int      is;
-
-    *error = -1;
-
-    CalcVirAtomEnd *= 3;
-
-    p     = &settled->massw;
-    wh    = p->wh;
-    rc    = p->rc;
-    ra    = p->ra;
-    rb    = p->rb;
-    irc2  = p->irc2;
-    mO    = p->mO;
-    mH    = p->mH;
+    /* ******************************************************************* */
+    /*                                                                  ** */
+    /*    Original code by Shuichi Miyamoto, last update Oct. 1, 1992   ** */
+    /*                                                                  ** */
+    /*    Algorithm changes by Berk Hess:                               ** */
+    /*    2004-07-15 Convert COM to double precision to avoid drift     ** */
+    /*    2006-10-16 Changed velocity update to use differences         ** */
+    /*    2012-09-24 Use oxygen as reference instead of COM             ** */
+    /*    2016-02    Complete rewrite of the code for SIMD              ** */
+    /*                                                                  ** */
+    /*    Reference for the SETTLE algorithm                            ** */
+    /*           S. Miyamoto et al., J. Comp. Chem., 13, 952 (1992).    ** */
+    /*                                                                  ** */
+    /* ******************************************************************* */
+
+    assert(settleStart % packSize == 0);
+    assert(settleEnd   % packSize == 0);
+
+    TypeBool       bError = TypeBool(false);
+
+    settleparam_t *p    = &settled->massw;
+    T              wh   = T(p->wh);
+    T              rc   = T(p->rc);
+    T              ra   = T(p->ra);
+    T              rb   = T(p->rb);
+    T              irc2 = T(p->irc2);
+    T              mO   = T(p->mO);
+    T              mH   = T(p->mH);
+
+    T              almost_zero = T(1e-12);
+
+    T              sum_r_m_dr[DIM][DIM];
+
+    if (bCalcVirial)
+    {
+        for (int d2 = 0; d2 < DIM; d2++)
+        {
+            for (int d = 0; d < DIM; d++)
+            {
+                sum_r_m_dr[d2][d] = T(0);
+            }
+        }
+    }
 
-#ifdef PRAGMAS
-#pragma ivdep
-#endif
-    for (i = 0; i < nsettle; ++i)
+    for (int i = settleStart; i < settleEnd; i += packSize)
     {
-        bOK = TRUE;
-        /*    --- Step1  A1' ---      */
-        ow1 = iatoms[i*4+1] * 3;
-        hw2 = iatoms[i*4+2] * 3;
-        hw3 = iatoms[i*4+3] * 3;
-        if (pbc == NULL)
+        /* Here we pad up to packSize with copies from the last valid entry.
+         * This gives correct results, since we store (not increment) all
+         * output, so we store the same output multiple times.
+         */
+        const int *ow1 = settled->ow1 + i;
+        const int *hw2 = settled->hw2 + i;
+        const int *hw3 = settled->hw3 + i;
+
+        T          x_ow1[DIM], x_hw2[DIM], x_hw3[DIM];
+
+        gatherLoadUTranspose<3>(x, ow1, &x_ow1[XX], &x_ow1[YY], &x_ow1[ZZ]);
+        gatherLoadUTranspose<3>(x, hw2, &x_hw2[XX], &x_hw2[YY], &x_hw2[ZZ]);
+        gatherLoadUTranspose<3>(x, hw3, &x_hw3[XX], &x_hw3[YY], &x_hw3[ZZ]);
+
+        T xprime_ow1[DIM], xprime_hw2[DIM], xprime_hw3[DIM];
+
+        gatherLoadUTranspose<3>(xprime, ow1, &xprime_ow1[XX], &xprime_ow1[YY], &xprime_ow1[ZZ]);
+        gatherLoadUTranspose<3>(xprime, hw2, &xprime_hw2[XX], &xprime_hw2[YY], &xprime_hw2[ZZ]);
+        gatherLoadUTranspose<3>(xprime, hw3, &xprime_hw3[XX], &xprime_hw3[YY], &xprime_hw3[ZZ]);
+
+        T dist21[DIM], dist31[DIM];
+        T doh2[DIM], doh3[DIM];
+        T sh_hw2[DIM], sh_hw3[DIM];
+
+        pbc_dx_aiuc(pbc, x_hw2, x_ow1, dist21);
+
+        pbc_dx_aiuc(pbc, x_hw3, x_ow1, dist31);
+
+        /* Tedious way of doing pbc */
+        pbc_dx_aiuc(pbc, xprime_hw2, xprime_ow1, doh2);
+        for (int d = 0; d < DIM; d++)
         {
-            xb0 = b4[hw2 + XX] - b4[ow1 + XX];
-            yb0 = b4[hw2 + YY] - b4[ow1 + YY];
-            zb0 = b4[hw2 + ZZ] - b4[ow1 + ZZ];
-            xc0 = b4[hw3 + XX] - b4[ow1 + XX];
-            yc0 = b4[hw3 + YY] - b4[ow1 + YY];
-            zc0 = b4[hw3 + ZZ] - b4[ow1 + ZZ];
-            /* 6 flops */
-
-            rvec_sub(after+hw2, after+ow1, doh2);
-            rvec_sub(after+hw3, after+ow1, doh3);
-            /* 6 flops */
+            sh_hw2[d]     = xprime_hw2[d] - (xprime_ow1[d] + doh2[d]);
+            xprime_hw2[d] = xprime_hw2[d] - sh_hw2[d];
         }
-        else
+        pbc_dx_aiuc(pbc, xprime_hw3, xprime_ow1, doh3);
+        for (int d = 0; d < DIM; d++)
         {
-            pbc_dx_aiuc(pbc, b4+hw2, b4+ow1, dx);
-            xb0 = dx[XX];
-            yb0 = dx[YY];
-            zb0 = dx[ZZ];
-            pbc_dx_aiuc(pbc, b4+hw3, b4+ow1, dx);
-            xc0 = dx[XX];
-            yc0 = dx[YY];
-            zc0 = dx[ZZ];
-
-            /* Tedious way of doing pbc */
-            is = pbc_dx_aiuc(pbc, after+hw2, after+ow1, doh2);
-            if (is == CENTRAL)
-            {
-                clear_rvec(sh_hw2);
-            }
-            else
-            {
-                sh_hw2[XX] = after[hw2 + XX] - (after[ow1 + XX] + doh2[XX]);
-                sh_hw2[YY] = after[hw2 + YY] - (after[ow1 + YY] + doh2[YY]);
-                sh_hw2[ZZ] = after[hw2 + ZZ] - (after[ow1 + ZZ] + doh2[ZZ]);
-                rvec_dec(after+hw2, sh_hw2);
-            }
-            is = pbc_dx_aiuc(pbc, after+hw3, after+ow1, doh3);
-            if (is == CENTRAL)
-            {
-                clear_rvec(sh_hw3);
-            }
-            else
-            {
-                sh_hw3[XX] = after[hw3 + XX] - (after[ow1 + XX] + doh3[XX]);
-                sh_hw3[YY] = after[hw3 + YY] - (after[ow1 + YY] + doh3[YY]);
-                sh_hw3[ZZ] = after[hw3 + ZZ] - (after[ow1 + ZZ] + doh3[ZZ]);
-                rvec_dec(after+hw3, sh_hw3);
-            }
+            sh_hw3[d]     = xprime_hw3[d] - (xprime_ow1[d] + doh3[d]);
+            xprime_hw3[d] = xprime_hw3[d] - sh_hw3[d];
         }
 
         /* Not calculating the center of mass using the oxygen position
@@ -405,206 +503,376 @@ void csettle(gmx_settledata_t settled,
          * as then the oxygen coordinate is multiplied by 0.89 at every step,
          * which can then transfer a systematic rounding to the oxygen velocity.
          */
-        xa1 = -(doh2[XX] + doh3[XX]) * wh;
-        ya1 = -(doh2[YY] + doh3[YY]) * wh;
-        za1 = -(doh2[ZZ] + doh3[ZZ]) * wh;
-
-        xcom = after[ow1 + XX] - xa1;
-        ycom = after[ow1 + YY] - ya1;
-        zcom = after[ow1 + ZZ] - za1;
-
-        xb1 = after[hw2 + XX] - xcom;
-        yb1 = after[hw2 + YY] - ycom;
-        zb1 = after[hw2 + ZZ] - zcom;
-        xc1 = after[hw3 + XX] - xcom;
-        yc1 = after[hw3 + YY] - ycom;
-        zc1 = after[hw3 + ZZ] - zcom;
+        T a1[DIM], com[DIM];
+        for (int d = 0; d < DIM; d++)
+        {
+            a1[d]  = -(doh2[d] + doh3[d]) * wh;
+            com[d] = xprime_ow1[d] - a1[d];
+        }
+        T b1[DIM];
+        for (int d = 0; d < DIM; d++)
+        {
+            b1[d]  = xprime_hw2[d] - com[d];
+        }
+        T c1[DIM];
+        for (int d = 0; d < DIM; d++)
+        {
+            c1[d]  = xprime_hw3[d] - com[d];
+        }
         /* 15 flops */
 
-        xakszd = yb0 * zc0 - zb0 * yc0;
-        yakszd = zb0 * xc0 - xb0 * zc0;
-        zakszd = xb0 * yc0 - yb0 * xc0;
-        xaksxd = ya1 * zakszd - za1 * yakszd;
-        yaksxd = za1 * xakszd - xa1 * zakszd;
-        zaksxd = xa1 * yakszd - ya1 * xakszd;
-        xaksyd = yakszd * zaksxd - zakszd * yaksxd;
-        yaksyd = zakszd * xaksxd - xakszd * zaksxd;
-        zaksyd = xakszd * yaksxd - yakszd * xaksxd;
+        T xakszd = dist21[YY] * dist31[ZZ] - dist21[ZZ] * dist31[YY];
+        T yakszd = dist21[ZZ] * dist31[XX] - dist21[XX] * dist31[ZZ];
+        T zakszd = dist21[XX] * dist31[YY] - dist21[YY] * dist31[XX];
+        T xaksxd = a1[YY] * zakszd - a1[ZZ] * yakszd;
+        T yaksxd = a1[ZZ] * xakszd - a1[XX] * zakszd;
+        T zaksxd = a1[XX] * yakszd - a1[YY] * xakszd;
+        T xaksyd = yakszd * zaksxd - zakszd * yaksxd;
+        T yaksyd = zakszd * xaksxd - xakszd * zaksxd;
+        T zaksyd = xakszd * yaksxd - yakszd * xaksxd;
         /* 27 flops */
 
-        axlng = gmx::invsqrt(xaksxd * xaksxd + yaksxd * yaksxd + zaksxd * zaksxd);
-        aylng = gmx::invsqrt(xaksyd * xaksyd + yaksyd * yaksyd + zaksyd * zaksyd);
-        azlng = gmx::invsqrt(xakszd * xakszd + yakszd * yakszd + zakszd * zakszd);
-
-        trns11 = xaksxd * axlng;
-        trns21 = yaksxd * axlng;
-        trns31 = zaksxd * axlng;
-        trns12 = xaksyd * aylng;
-        trns22 = yaksyd * aylng;
-        trns32 = zaksyd * aylng;
-        trns13 = xakszd * azlng;
-        trns23 = yakszd * azlng;
-        trns33 = zakszd * azlng;
+        T axlng = gmx::invsqrt(xaksxd * xaksxd + yaksxd * yaksxd + zaksxd * zaksxd);
+        T aylng = gmx::invsqrt(xaksyd * xaksyd + yaksyd * yaksyd + zaksyd * zaksyd);
+        T azlng = gmx::invsqrt(xakszd * xakszd + yakszd * yakszd + zakszd * zakszd);
+
+        T trns1[DIM], trns2[DIM], trns3[DIM];
+
+        trns1[XX] = xaksxd * axlng;
+        trns2[XX] = yaksxd * axlng;
+        trns3[XX] = zaksxd * axlng;
+        trns1[YY] = xaksyd * aylng;
+        trns2[YY] = yaksyd * aylng;
+        trns3[YY] = zaksyd * aylng;
+        trns1[ZZ] = xakszd * azlng;
+        trns2[ZZ] = yakszd * azlng;
+        trns3[ZZ] = zakszd * azlng;
         /* 24 flops */
 
-        xb0d = trns11 * xb0 + trns21 * yb0 + trns31 * zb0;
-        yb0d = trns12 * xb0 + trns22 * yb0 + trns32 * zb0;
-        xc0d = trns11 * xc0 + trns21 * yc0 + trns31 * zc0;
-        yc0d = trns12 * xc0 + trns22 * yc0 + trns32 * zc0;
-        /*
-           xa1d = trns11 * xa1 + trns21 * ya1 + trns31 * za1;
-           ya1d = trns12 * xa1 + trns22 * ya1 + trns32 * za1;
-         */
-        za1d = trns13 * xa1 + trns23 * ya1 + trns33 * za1;
-        xb1d = trns11 * xb1 + trns21 * yb1 + trns31 * zb1;
-        yb1d = trns12 * xb1 + trns22 * yb1 + trns32 * zb1;
-        zb1d = trns13 * xb1 + trns23 * yb1 + trns33 * zb1;
-        xc1d = trns11 * xc1 + trns21 * yc1 + trns31 * zc1;
-        yc1d = trns12 * xc1 + trns22 * yc1 + trns32 * zc1;
-        zc1d = trns13 * xc1 + trns23 * yc1 + trns33 * zc1;
+        T b0d[2], c0d[2];
+
+        for (int d = 0; d < 2; d++)
+        {
+            b0d[d] = trns1[d] * dist21[XX] + trns2[d] * dist21[YY] + trns3[d] * dist21[ZZ];
+            c0d[d] = trns1[d] * dist31[XX] + trns2[d] * dist31[YY] + trns3[d] * dist31[ZZ];
+        }
+
+        T a1d_z, b1d[DIM], c1d[DIM];
+
+        a1d_z = trns1[ZZ] * a1[XX] + trns2[ZZ] * a1[YY] + trns3[ZZ] * a1[ZZ];
+        for (int d = 0; d < DIM; d++)
+        {
+            b1d[d] = trns1[d] * b1[XX] + trns2[d] * b1[YY] + trns3[d] * b1[ZZ];
+            c1d[d] = trns1[d] * c1[XX] + trns2[d] * c1[YY] + trns3[d] * c1[ZZ];
+        }
         /* 65 flops */
 
-        sinphi = za1d * gmx::invsqrt(ra*ra);
-        tmp    = 1.0 - sinphi * sinphi;
-        if (tmp <= 0)
+        T tmp, tmp2;
+
+        T sinphi = a1d_z * gmx::invsqrt(ra*ra);
+        tmp2     = 1.0 - sinphi * sinphi;
+
+        /* If tmp2 gets close to or beyond zero we have severly distorted
+         * water molecules and we should terminate the simulation.
+         * Below we take the max with almost_zero to continue the loop.
+         */
+        bError   = bError || (tmp2 <= almost_zero);
+
+        tmp2     = max(tmp2, almost_zero);
+        tmp      = gmx::invsqrt(tmp2);
+        T cosphi = tmp2*tmp;
+        T sinpsi = (b1d[ZZ] - c1d[ZZ]) * irc2 * tmp;
+        tmp2     = 1.0 - sinpsi * sinpsi;
+
+        T cospsi = tmp2*gmx::invsqrt(tmp2);
+        /* 46 flops */
+
+        T a2d_y  =  ra * cosphi;
+        T b2d_x  = -rc * cospsi;
+        T t1     = -rb * cosphi;
+        T t2     =  rc * sinpsi * sinphi;
+        T b2d_y  =  t1 - t2;
+        T c2d_y  =  t1 + t2;
+        /* 7 flops */
+
+        /*     --- Step3  al,be,ga            --- */
+        T alpha  = b2d_x * (b0d[XX] - c0d[XX]) + b0d[YY] * b2d_y + c0d[YY] * c2d_y;
+        T beta   = b2d_x * (c0d[YY] - b0d[YY]) + b0d[XX] * b2d_y + c0d[XX] * c2d_y;
+        T gamma  = b0d[XX] * b1d[YY] - b1d[XX] * b0d[YY] + c0d[XX] * c1d[YY] - c1d[XX] * c0d[YY];
+        T al2be2 = alpha * alpha + beta * beta;
+        tmp2     = (al2be2 - gamma * gamma);
+        T sinthe = (alpha * gamma - beta * tmp2*gmx::invsqrt(tmp2)) * gmx::invsqrt(al2be2*al2be2);
+        /* 47 flops */
+
+        /*  --- Step4  A3' --- */
+        tmp2     = 1.0 - sinthe * sinthe;
+        T costhe = tmp2*gmx::invsqrt(tmp2);
+
+        T a3d[DIM], b3d[DIM], c3d[DIM];
+
+        a3d[XX]  = -a2d_y * sinthe;
+        a3d[YY]  = a2d_y * costhe;
+        a3d[ZZ]  = a1d_z;
+        b3d[XX]  = b2d_x * costhe - b2d_y * sinthe;
+        b3d[YY]  = b2d_x * sinthe + b2d_y * costhe;
+        b3d[ZZ]  = b1d[ZZ];
+        c3d[XX]  = -b2d_x * costhe - c2d_y * sinthe;
+        c3d[YY]  = -b2d_x * sinthe + c2d_y * costhe;
+        c3d[ZZ]  = c1d[ZZ];
+        /* 26 flops */
+
+        /*    --- Step5  A3 --- */
+        T a3[DIM], b3[DIM], c3[DIM];
+
+        a3[XX] = trns1[XX]*a3d[XX] + trns1[YY]*a3d[YY] + trns1[ZZ]*a3d[ZZ];
+        a3[YY] = trns2[XX]*a3d[XX] + trns2[YY]*a3d[YY] + trns2[ZZ]*a3d[ZZ];
+        a3[ZZ] = trns3[XX]*a3d[XX] + trns3[YY]*a3d[YY] + trns3[ZZ]*a3d[ZZ];
+        b3[XX] = trns1[XX]*b3d[XX] + trns1[YY]*b3d[YY] + trns1[ZZ]*b3d[ZZ];
+        b3[YY] = trns2[XX]*b3d[XX] + trns2[YY]*b3d[YY] + trns2[ZZ]*b3d[ZZ];
+        b3[ZZ] = trns3[XX]*b3d[XX] + trns3[YY]*b3d[YY] + trns3[ZZ]*b3d[ZZ];
+        c3[XX] = trns1[XX]*c3d[XX] + trns1[YY]*c3d[YY] + trns1[ZZ]*c3d[ZZ];
+        c3[YY] = trns2[XX]*c3d[XX] + trns2[YY]*c3d[YY] + trns2[ZZ]*c3d[ZZ];
+        c3[ZZ] = trns3[XX]*c3d[XX] + trns3[YY]*c3d[YY] + trns3[ZZ]*c3d[ZZ];
+        /* 45 flops */
+
+        /* Compute and store the corrected new coordinate */
+        for (int d = 0; d < DIM; d++)
         {
-            bOK = FALSE;
+            xprime_ow1[d] = com[d] + a3[d];
         }
-        else
+        for (int d = 0; d < DIM; d++)
+        {
+            xprime_hw2[d] = com[d] + b3[d] + sh_hw2[d];;
+        }
+        for (int d = 0; d < DIM; d++)
+        {
+            xprime_hw3[d] = com[d] + c3[d] + sh_hw3[d];
+        }
+        /* 9 flops + 6 pbc flops */
+
+        transposeScatterStoreU<3>(xprime, ow1, xprime_ow1[XX], xprime_ow1[YY], xprime_ow1[ZZ]);
+        transposeScatterStoreU<3>(xprime, hw2, xprime_hw2[XX], xprime_hw2[YY], xprime_hw2[ZZ]);
+        transposeScatterStoreU<3>(xprime, hw3, xprime_hw3[XX], xprime_hw3[YY], xprime_hw3[ZZ]);
+
+        // cppcheck-suppress duplicateExpression
+        if (bCorrectVelocity || bCalcVirial)
         {
-            tmp2   = gmx::invsqrt(tmp);
-            cosphi = tmp*tmp2;
-            sinpsi = (zb1d - zc1d) * irc2 * tmp2;
-            tmp2   = 1.0 - sinpsi * sinpsi;
-            if (tmp2 <= 0)
+            T da[DIM], db[DIM], dc[DIM];
+            for (int d = 0; d < DIM; d++)
             {
-                bOK = FALSE;
+                da[d] = a3[d] - a1[d];
             }
-            else
+            for (int d = 0; d < DIM; d++)
             {
-                cospsi = tmp2*gmx::invsqrt(tmp2);
+                db[d] = b3[d] - b1[d];
+            }
+            for (int d = 0; d < DIM; d++)
+            {
+                dc[d] = c3[d] - c1[d];
             }
-        }
-        /* 46 flops */
-
-        if (bOK)
-        {
-            ya2d =  ra * cosphi;
-            xb2d = -rc * cospsi;
-            t1   = -rb * cosphi;
-            t2   =  rc * sinpsi * sinphi;
-            yb2d =  t1 - t2;
-            yc2d =  t1 + t2;
-            /* 7 flops */
-
-            /*     --- Step3  al,be,ga            --- */
-            alpa   = xb2d * (xb0d - xc0d) + yb0d * yb2d + yc0d * yc2d;
-            beta   = xb2d * (yc0d - yb0d) + xb0d * yb2d + xc0d * yc2d;
-            gama   = xb0d * yb1d - xb1d * yb0d + xc0d * yc1d - xc1d * yc0d;
-            al2be2 = alpa * alpa + beta * beta;
-            tmp2   = (al2be2 - gama * gama);
-            sinthe = (alpa * gama - beta * tmp2*gmx::invsqrt(tmp2)) * gmx::invsqrt(al2be2*al2be2);
-            /* 47 flops */
-
-            /*  --- Step4  A3' --- */
-            tmp2   = 1.0 - sinthe * sinthe;
-            costhe = tmp2*gmx::invsqrt(tmp2);
-            xa3d   = -ya2d * sinthe;
-            ya3d   = ya2d * costhe;
-            za3d   = za1d;
-            xb3d   = xb2d * costhe - yb2d * sinthe;
-            yb3d   = xb2d * sinthe + yb2d * costhe;
-            zb3d   = zb1d;
-            xc3d   = -xb2d * costhe - yc2d * sinthe;
-            yc3d   = -xb2d * sinthe + yc2d * costhe;
-            zc3d   = zc1d;
-            /* 26 flops */
-
-            /*    --- Step5  A3 --- */
-            xa3 = trns11 * xa3d + trns12 * ya3d + trns13 * za3d;
-            ya3 = trns21 * xa3d + trns22 * ya3d + trns23 * za3d;
-            za3 = trns31 * xa3d + trns32 * ya3d + trns33 * za3d;
-            xb3 = trns11 * xb3d + trns12 * yb3d + trns13 * zb3d;
-            yb3 = trns21 * xb3d + trns22 * yb3d + trns23 * zb3d;
-            zb3 = trns31 * xb3d + trns32 * yb3d + trns33 * zb3d;
-            xc3 = trns11 * xc3d + trns12 * yc3d + trns13 * zc3d;
-            yc3 = trns21 * xc3d + trns22 * yc3d + trns23 * zc3d;
-            zc3 = trns31 * xc3d + trns32 * yc3d + trns33 * zc3d;
-            /* 45 flops */
-            after[ow1]     = xcom + xa3;
-            after[ow1 + 1] = ycom + ya3;
-            after[ow1 + 2] = zcom + za3;
-            after[hw2]     = xcom + xb3;
-            after[hw2 + 1] = ycom + yb3;
-            after[hw2 + 2] = zcom + zb3;
-            after[hw3]     = xcom + xc3;
-            after[hw3 + 1] = ycom + yc3;
-            after[hw3 + 2] = zcom + zc3;
             /* 9 flops */
 
-            if (pbc != NULL)
+            if (bCorrectVelocity)
             {
-                rvec_inc(after+hw2, sh_hw2);
-                rvec_inc(after+hw3, sh_hw3);
+                T v_ow1[DIM], v_hw2[DIM], v_hw3[DIM];
+
+                gatherLoadUTranspose<3>(v, ow1, &v_ow1[XX], &v_ow1[YY], &v_ow1[ZZ]);
+                gatherLoadUTranspose<3>(v, hw2, &v_hw2[XX], &v_hw2[YY], &v_hw2[ZZ]);
+                gatherLoadUTranspose<3>(v, hw3, &v_hw3[XX], &v_hw3[YY], &v_hw3[ZZ]);
+
+                /* Add the position correction divided by dt to the velocity */
+                for (int d = 0; d < DIM; d++)
+                {
+                    v_ow1[d] = gmx::fma(da[d], invdt, v_ow1[d]);
+                }
+                for (int d = 0; d < DIM; d++)
+                {
+                    v_hw2[d] = gmx::fma(db[d], invdt, v_hw2[d]);
+                }
+                for (int d = 0; d < DIM; d++)
+                {
+                    v_hw3[d] = gmx::fma(dc[d], invdt, v_hw3[d]);
+                }
+                /* 3*6 flops */
+
+                transposeScatterStoreU<3>(v, ow1, v_ow1[XX], v_ow1[YY], v_ow1[ZZ]);
+                transposeScatterStoreU<3>(v, hw2, v_hw2[XX], v_hw2[YY], v_hw2[ZZ]);
+                transposeScatterStoreU<3>(v, hw3, v_hw3[XX], v_hw3[YY], v_hw3[ZZ]);
             }
 
-            dax = xa3 - xa1;
-            day = ya3 - ya1;
-            daz = za3 - za1;
-            dbx = xb3 - xb1;
-            dby = yb3 - yb1;
-            dbz = zb3 - zb1;
-            dcx = xc3 - xc1;
-            dcy = yc3 - yc1;
-            dcz = zc3 - zc1;
-            /* 9 flops, counted with the virial */
-
-            if (v != NULL)
+            if (bCalcVirial)
             {
-                v[ow1]     += dax*invdt;
-                v[ow1 + 1] += day*invdt;
-                v[ow1 + 2] += daz*invdt;
-                v[hw2]     += dbx*invdt;
-                v[hw2 + 1] += dby*invdt;
-                v[hw2 + 2] += dbz*invdt;
-                v[hw3]     += dcx*invdt;
-                v[hw3 + 1] += dcy*invdt;
-                v[hw3 + 2] += dcz*invdt;
-                /* 3*6 flops */
+                /* Filter out the non-local settles */
+                T filter = load(settled->virfac + i);
+                T mOf    = filter*mO;
+                T mHf    = filter*mH;
+
+                T mdo[DIM], mdb[DIM], mdc[DIM];
+
+                for (int d = 0; d < DIM; d++)
+                {
+                    mdb[d] = mHf*db[d];
+                    mdc[d] = mHf*dc[d];
+                    mdo[d] = mOf*da[d] + mdb[d] + mdc[d];
+                }
+
+                for (int d2 = 0; d2 < DIM; d2++)
+                {
+                    for (int d = 0; d < DIM; d++)
+                    {
+                        sum_r_m_dr[d2][d] = sum_r_m_dr[d2][d] -
+                            (x_ow1[d2]*mdo[d] +
+                             dist21[d2]*mdb[d] +
+                             dist31[d2]*mdc[d]);
+                    }
+                }
+                /* 71 flops */
             }
+        }
+    }
 
-            if (ow1 < CalcVirAtomEnd)
+    if (bCalcVirial)
+    {
+        for (int d2 = 0; d2 < DIM; d2++)
+        {
+            for (int d = 0; d < DIM; d++)
             {
-                mdax                = mO*dax;
-                mday                = mO*day;
-                mdaz                = mO*daz;
-                mdbx                = mH*dbx;
-                mdby                = mH*dby;
-                mdbz                = mH*dbz;
-                mdcx                = mH*dcx;
-                mdcy                = mH*dcy;
-                mdcz                = mH*dcz;
-                vir_r_m_dr[XX][XX] -= b4[ow1  ]*mdax + (b4[ow1  ]+xb0)*mdbx + (b4[ow1  ]+xc0)*mdcx;
-                vir_r_m_dr[XX][YY] -= b4[ow1  ]*mday + (b4[ow1  ]+xb0)*mdby + (b4[ow1  ]+xc0)*mdcy;
-                vir_r_m_dr[XX][ZZ] -= b4[ow1  ]*mdaz + (b4[ow1  ]+xb0)*mdbz + (b4[ow1  ]+xc0)*mdcz;
-                vir_r_m_dr[YY][XX] -= b4[ow1+1]*mdax + (b4[ow1+1]+yb0)*mdbx + (b4[ow1+1]+yc0)*mdcx;
-                vir_r_m_dr[YY][YY] -= b4[ow1+1]*mday + (b4[ow1+1]+yb0)*mdby + (b4[ow1+1]+yc0)*mdcy;
-                vir_r_m_dr[YY][ZZ] -= b4[ow1+1]*mdaz + (b4[ow1+1]+yb0)*mdbz + (b4[ow1+1]+yc0)*mdcz;
-                vir_r_m_dr[ZZ][XX] -= b4[ow1+2]*mdax + (b4[ow1+2]+zb0)*mdbx + (b4[ow1+2]+zc0)*mdcx;
-                vir_r_m_dr[ZZ][YY] -= b4[ow1+2]*mday + (b4[ow1+2]+zb0)*mdby + (b4[ow1+2]+zc0)*mdcy;
-                vir_r_m_dr[ZZ][ZZ] -= b4[ow1+2]*mdaz + (b4[ow1+2]+zb0)*mdbz + (b4[ow1+2]+zc0)*mdcz;
-                /* 3*24 - 9 flops */
+                vir_r_m_dr[d2][d] += reduce(sum_r_m_dr[d2][d]);
             }
         }
+    }
+
+    *bErrorHasOccurred = anyTrue(bError);
+}
+
+/* Wrapper template function that divides the settles over threads
+ * and instantiates the core template with instantiated booleans.
+ */
+template<typename T, typename TypeBool, int packSize, typename TypePbc>
+static void settleTemplateWrapper(gmx_settledata_t settled,
+                                  int nthread, int thread,
+                                  TypePbc pbc,
+                                  const real x[], real xprime[],
+                                  real invdt, real *v,
+                                  bool bCalcVirial, tensor vir_r_m_dr,
+                                  bool *bErrorHasOccurred)
+{
+    /* We need to assign settles to threads in groups of pack_size */
+    int numSettlePacks = (settled->nsettle + packSize - 1)/packSize;
+    /* Round the end value up to give thread 0 more work */
+    int settleStart    = ((numSettlePacks* thread      + nthread - 1)/nthread)*packSize;
+    int settleEnd      = ((numSettlePacks*(thread + 1) + nthread - 1)/nthread)*packSize;
+
+    if (v != NULL)
+    {
+        if (!bCalcVirial)
+        {
+            settleTemplate<T, TypeBool, packSize,
+                           TypePbc,
+                           true,
+                           false>
+                (settled, settleStart, settleEnd,
+                pbc,
+                x, xprime,
+                invdt, v,
+                NULL,
+                bErrorHasOccurred);
+        }
         else
         {
-            *error = i;
+            settleTemplate<T, TypeBool, packSize,
+                           TypePbc,
+                           true,
+                           true>
+                (settled, settleStart, settleEnd,
+                pbc,
+                x, xprime,
+                invdt, v,
+                vir_r_m_dr,
+                bErrorHasOccurred);
         }
-#ifdef DEBUG
-        if (debug)
+    }
+    else
+    {
+        if (!bCalcVirial)
         {
-            check_cons(debug, "settle", after, ow1, hw2, hw3);
+            settleTemplate<T, TypeBool, packSize,
+                           TypePbc,
+                           false,
+                           false>
+                (settled, settleStart, settleEnd,
+                pbc,
+                x, xprime,
+                invdt, v,
+                NULL,
+                bErrorHasOccurred);
         }
+        else
+        {
+            settleTemplate<T, TypeBool, packSize,
+                           TypePbc,
+                           false,
+                           true>
+                (settled, settleStart, settleEnd,
+                pbc,
+                x, xprime,
+                invdt, v,
+                vir_r_m_dr,
+                bErrorHasOccurred);
+        }
+    }
+}
+
+void csettle(gmx_settledata_t settled,
+             int nthread, int thread,
+             const t_pbc *pbc,
+             const real x[], real xprime[],
+             real invdt, real *v,
+             bool bCalcVirial, tensor vir_r_m_dr,
+             bool *bErrorHasOccurred)
+{
+#if GMX_SIMD_HAVE_REAL
+    if (settled->bUseSimd)
+    {
+        /* Convert the pbc struct for SIMD */
+        GMX_ALIGNED(real, GMX_SIMD_REAL_WIDTH) pbcSimd[9*GMX_SIMD_REAL_WIDTH];
+        set_pbc_simd(pbc, pbcSimd);
+
+        settleTemplateWrapper<SimdReal, SimdBool, GMX_SIMD_REAL_WIDTH,
+                              const real *>(settled,
+                                            nthread, thread,
+                                            pbcSimd,
+                                            x, xprime,
+                                            invdt,
+                                            v,
+                                            bCalcVirial, vir_r_m_dr,
+                                            bErrorHasOccurred);
+    }
+    else
 #endif
+    {
+        /* This construct is needed because pbc_dx_aiuc doesn't accept pbc=NULL */
+        t_pbc        pbcNo;
+        const t_pbc *pbcNonNull;
+
+        if (pbc != NULL)
+        {
+            pbcNonNull = pbc;
+        }
+        else
+        {
+            set_pbc(&pbcNo, epbcNONE, NULL);
+            pbcNonNull = &pbcNo;
+        }
+
+        settleTemplateWrapper<real, bool, 1,
+                              const t_pbc *>(settled,
+                                             nthread, thread,
+                                             pbcNonNull,
+                                             x, xprime,
+                                             invdt,
+                                             v,
+                                             bCalcVirial, vir_r_m_dr,
+                                             bErrorHasOccurred);
     }
 }
-- 
2.11.4.GIT