From 0f26a48719fe046a3aeab2048be302721cd7bad0 Mon Sep 17 00:00:00 2001
From: Berk Hess
Date: Mon, 10 Dec 2012 18:20:24 +0100
Subject: [PATCH] implemented nbnxn AVX-256 2x(N+N) kernels

With AVX-256, the nbnxn kernels are faster with a 2x(4+4), i.e. 4x4,
setup than with 4x8. To reduce the number of nbnxn kernels, there are no
longer separate simd128 and simd256 kernels; instead there are simd_4xn
kernels and, for AVX-256, additionally simd_2xnn kernels. The nbnxn
search code is simplified significantly, as it no longer has to handle
128-bit and 256-bit SIMD simultaneously. The SIMD pair-search and kernel
configuration is now separated from the SSE SIMD bounding-box code
configuration. Additionally, the label x86_simd in all nbnxn files is
replaced by simd only, in preparation for non-x86 SIMD acceleration.

Change-Id: If03cf569cf918dcec675fbcb43e1ff3a30528f74
---
 include/gmx_x86_avx_256.h | 17 +
 include/gmx_x86_simd_macros.h | 21 +-
 include/types/nb_verlet.h | 80 +++--
 src/kernel/calc_verletbuf.c | 13 +-
 src/kernel/pme_loadbal.c | 3 +-
 src/mdlib/forcerec.c | 81 ++---
 src/mdlib/nbnxn_atomdata.c | 30 +-
 src/mdlib/nbnxn_internal.h | 48 +--
 ...rnel_x86_simd128.c => nbnxn_kernel_simd_2xnn.c} | 69 ++--
 ...rnel_x86_simd256.h => nbnxn_kernel_simd_2xnn.h} | 29 +-
 ...ncludes.h => nbnxn_kernel_simd_2xnn_includes.h} | 18 +-
 ...simd_inner.h => nbnxn_kernel_simd_2xnn_inner.h} | 369 +++------------------
 ...simd_outer.h => nbnxn_kernel_simd_2xnn_outer.h} | 198 ++++-------
 ...ernel_x86_simd256.c => nbnxn_kernel_simd_4xn.c} | 69 ++--
 ...ernel_x86_simd128.h => nbnxn_kernel_simd_4xn.h} | 28 +-
 ...includes.h => nbnxn_kernel_simd_4xn_includes.h} | 18 +-
 ..._simd_inner.h => nbnxn_kernel_simd_4xn_inner.h} | 11 +-
 ..._simd_outer.h => nbnxn_kernel_simd_4xn_outer.h} | 52 ++-
 ..._x86_simd_utils.h => nbnxn_kernel_simd_utils.h} | 61 +++-
 src/mdlib/nbnxn_search.c | 217 ++++------
 src/mdlib/nbnxn_search_simd_2xnn.h | 262 +++++++++++++++
 ...n_search_x86_simd.h => nbnxn_search_simd_4xn.h} | 122 +++---
 src/mdlib/sim_util.c | 66 ++--
 23 files changed, 960 insertions(+), 922 deletions(-)
 rename src/mdlib/nbnxn_kernels/{nbnxn_kernel_x86_simd128.c => nbnxn_kernel_simd_2xnn.c} (83%)
 rename src/mdlib/nbnxn_kernels/{nbnxn_kernel_x86_simd256.h => nbnxn_kernel_simd_2xnn.h} (69%)
 copy src/mdlib/nbnxn_kernels/{nbnxn_kernel_x86_simd_includes.h => nbnxn_kernel_simd_2xnn_includes.h} (86%)
 copy src/mdlib/nbnxn_kernels/{nbnxn_kernel_x86_simd_inner.h => nbnxn_kernel_simd_2xnn_inner.h} (58%)
 copy src/mdlib/nbnxn_kernels/{nbnxn_kernel_x86_simd_outer.h => nbnxn_kernel_simd_2xnn_outer.h} (66%)
 rename src/mdlib/nbnxn_kernels/{nbnxn_kernel_x86_simd256.c => nbnxn_kernel_simd_4xn.c} (84%)
 rename src/mdlib/nbnxn_kernels/{nbnxn_kernel_x86_simd128.h => nbnxn_kernel_simd_4xn.h} (71%)
 rename src/mdlib/nbnxn_kernels/{nbnxn_kernel_x86_simd_includes.h => nbnxn_kernel_simd_4xn_includes.h} (86%)
 rename src/mdlib/nbnxn_kernels/{nbnxn_kernel_x86_simd_inner.h => nbnxn_kernel_simd_4xn_inner.h} (99%)
 rename src/mdlib/nbnxn_kernels/{nbnxn_kernel_x86_simd_outer.h => nbnxn_kernel_simd_4xn_outer.h} (95%)
 rename src/mdlib/nbnxn_kernels/{nbnxn_kernel_x86_simd_utils.h => nbnxn_kernel_simd_utils.h} (90%)
 create mode 100644 src/mdlib/nbnxn_search_simd_2xnn.h
 rename src/mdlib/{nbnxn_search_x86_simd.h => nbnxn_search_simd_4xn.h} (77%)

diff --git a/include/gmx_x86_avx_256.h b/include/gmx_x86_avx_256.h
index 1e444f46e9..9f266e834f 100644
--- a/include/gmx_x86_avx_256.h
+++ b/include/gmx_x86_avx_256.h
@@ -128,6 +128,16 @@ gmx_mm256_set_m128(__m128 hi, __m128 lo)
 }
+static gmx_inline __m256
+gmx_mm256_load4_ps(float const * p)
+{
+    __m128 
a; + + a = _mm_load_ps(p); + return _mm256_insertf128_ps(_mm256_castps128_ps256(a), a, 0x1); +} + + static __m256d gmx_mm256_unpack128lo_pd(__m256d xmm1, __m256d xmm2) { @@ -147,6 +157,13 @@ gmx_mm256_set_m128d(__m128d hi, __m128d lo) } +static __m128 gmx_mm256_sum4h_m128(__m256 x, __m256 y) +{ + __m256 sum; + + sum = _mm256_add_ps(x,y); + return _mm_add_ps(_mm256_castps256_ps128(sum),_mm256_extractf128_ps(sum,0x1)); +} static void diff --git a/include/gmx_x86_simd_macros.h b/include/gmx_x86_simd_macros.h index fd85c6e5e6..e1b1403c24 100644 --- a/include/gmx_x86_simd_macros.h +++ b/include/gmx_x86_simd_macros.h @@ -91,11 +91,11 @@ */ #if !defined GMX_MM128_HERE && !defined GMX_MM256_HERE -"You should define GMX_MM128_HERE or GMX_MM256_HERE" +#error "You should define GMX_MM128_HERE or GMX_MM256_HERE" #endif #if defined GMX_MM128_HERE && defined GMX_MM256_HERE -"You should not define both GMX_MM128_HERE and GMX_MM256_HERE" +#error "You should not define both GMX_MM128_HERE and GMX_MM256_HERE" #endif #ifdef GMX_MM128_HERE @@ -235,6 +235,23 @@ #define gmx_pmecorrF_pr gmx_mm256_pmecorrF_ps #define gmx_pmecorrV_pr gmx_mm256_pmecorrV_ps +#define gmx_loaddh_pr gmx_mm256_load4_ps + +/* Half SIMD-width type */ +#define gmx_mm_hpr __m128 + +/* Half SIMD-width macros */ +#define gmx_load_hpr _mm_load_ps +#define gmx_load1_hpr(x) _mm_set1_ps((x)[0]) +#define gmx_store_hpr _mm_store_ps +#define gmx_add_hpr _mm_add_ps +#define gmx_sub_hpr _mm_sub_ps + +#define gmx_sum4_hpr gmx_mm256_sum4h_m128 + +/* Conversion between half and full SIMD-width */ +#define gmx_2hpr_to_pr gmx_mm256_set_m128 + #else #include "gmx_x86_simd_double.h" diff --git a/include/types/nb_verlet.h b/include/types/nb_verlet.h index 32dd9432d7..8a33375c03 100644 --- a/include/types/nb_verlet.h +++ b/include/types/nb_verlet.h @@ -46,48 +46,66 @@ extern "C" { #endif -/*! Nonbonded NxN kernel types: plain C, SSE/AVX, GPU CUDA, GPU emulation, etc */ -enum { nbkNotSet = 0, - nbk4x4_PlainC, - nbk4xN_X86_SIMD128, - nbk4xN_X86_SIMD256, - nbk8x8x8_CUDA, - nbk8x8x8_PlainC }; +#ifdef GMX_X86_SSE2 +/* Use SIMD accelerated nbnxn search and kernels */ +#define GMX_NBNXN_SIMD + +#ifdef GMX_X86_AVX_256 +/* Comment out this define to use AVX-128 kernels with AVX-256 acceleration */ +#define GMX_NBNXN_SIMD_BITWIDTH 256 +#else +#define GMX_NBNXN_SIMD_BITWIDTH 128 +#endif + +/* The nbnxn SIMD 4xN and 2x(N+N) kernels can be added independently. + * Currently the 2xNN SIMD kernels only make sense and are only implemented + * with AVX-256 in single precision using a 4x4 cluster setup instead of 4x8. + */ +#define GMX_NBNXN_SIMD_4XN +#if GMX_NBNXN_SIMD_BITWIDTH == 256 && !defined GMX_DOUBLE +#define GMX_NBNXN_SIMD_2XNN +#endif + +#endif + + +/*! Nonbonded NxN kernel types: plain C, CPU SIMD, GPU CUDA, GPU emulation */ +typedef enum +{ + nbnxnkNotSet = 0, + nbnxnk4x4_PlainC, + nbnxnk4xN_SIMD_4xN, + nbnxnk4xN_SIMD_2xNN, + nbnxnk8x8x8_CUDA, + nbnxnk8x8x8_PlainC, + nbnxnkNR +} nbnxn_kernel_type; /* Note that _mm_... intrinsics can be converted to either SSE or AVX * depending on compiler flags. 
* For gcc we check for __AVX__ * At least a check for icc should be added (if there is a macro) */ -static const char *nbk_name[] = - { "not set", "plain C 4x4", -#if !(defined GMX_X86_AVX_256 || defined GMX_X86_AVX128_FMA || defined __AVX__) +static const char *nbnxn_kernel_name[nbnxnkNR] = + { "not set", "plain C", +#if !(defined GMX_X86_SSE2) + "not available", "not available", +#else +#if GMX_NBNXN_SIMD_BITWIDTH == 128 +#if !(defined GMX_X86_AVX_128_FMA || defined __AVX__) #ifndef GMX_X86_SSE4_1 -#ifndef GMX_DOUBLE - "SSE2 4x4", + "SSE2", "SSE2", #else - "SSE2 4x2", + "SSE4.1", "SSE4.1", #endif #else -#ifndef GMX_DOUBLE - "SSE4.1 4x4", -#else - "SSE4.1 4x2", + "AVX-128", "AVX-128", #endif -#endif -#else -#ifndef GMX_DOUBLE - "AVX-128 4x4", #else - "AVX-128 4x2", + "AVX-256", "AVX-256", #endif #endif -#ifndef GMX_DOUBLE - "AVX-256 4x8", -#else - "AVX-256 4x4", -#endif - "CUDA 8x8x8", "plain C 8x8x8" }; + "CUDA", "plain C" }; enum { ewaldexclTable, ewaldexclAnalytical }; @@ -119,9 +137,9 @@ typedef struct { /* non-bonded data structure with Verlet-type cut-off */ typedef struct { - nbnxn_search_t nbs; /* n vs n atom pair searching data */ - int ngrp; /* number of interaction groups */ - nonbonded_verlet_group_t grp[2];/* local and non-local interaction group */ + nbnxn_search_t nbs; /* n vs n atom pair searching data */ + int ngrp; /* number of interaction groups */ + nonbonded_verlet_group_t grp[2];/* local and non-local interaction group */ gmx_bool bUseGPU; /* TRUE when GPU acceleration is used */ nbnxn_cuda_ptr_t cu_nbv; /* pointer to CUDA nb verlet data */ diff --git a/src/kernel/calc_verletbuf.c b/src/kernel/calc_verletbuf.c index 8a1f6ef00d..68e3cf628c 100644 --- a/src/kernel/calc_verletbuf.c +++ b/src/kernel/calc_verletbuf.c @@ -78,17 +78,14 @@ void verletbuf_get_list_setup(gmx_bool bGPU, } else { -#ifndef GMX_X86_SSE2 +#ifndef GMX_NBNXN_SIMD list_setup->cluster_size_j = NBNXN_CPU_CLUSTER_I_SIZE; #else - int simd_width; - -#ifdef GMX_X86_AVX_256 - simd_width = 256; -#else - simd_width = 128; + list_setup->cluster_size_j = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8); +#ifdef GMX_NBNXN_SIMD_2XNN + /* We assume the smallest cluster size to be on the safe side */ + list_setup->cluster_size_j /= 2; #endif - list_setup->cluster_size_j = simd_width/(sizeof(real)*8); #endif } } diff --git a/src/kernel/pme_loadbal.c b/src/kernel/pme_loadbal.c index 6367ac25bd..a208f08c4f 100644 --- a/src/kernel/pme_loadbal.c +++ b/src/kernel/pme_loadbal.c @@ -624,7 +624,8 @@ gmx_bool pme_load_balance(pme_load_balancing_t pme_lb, ic->ewaldcoeff = set->ewaldcoeff; bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0); - if (pme_lb->cutoff_scheme == ecutsVERLET && nbv->grp[0].kernel_type == nbk8x8x8_CUDA) + if (pme_lb->cutoff_scheme == ecutsVERLET && + nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA) { nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv,ic); } diff --git a/src/mdlib/forcerec.c b/src/mdlib/forcerec.c index b773dff15f..ce27dda2b7 100644 --- a/src/mdlib/forcerec.c +++ b/src/mdlib/forcerec.c @@ -1403,41 +1403,48 @@ static void init_forcerec_f_threads(t_forcerec *fr,int nenergrp) static void pick_nbnxn_kernel_cpu(FILE *fp, const t_commrec *cr, const gmx_cpuid_t cpuid_info, + const t_inputrec *ir, int *kernel_type, int *ewald_excl) { - *kernel_type = nbk4x4_PlainC; + *kernel_type = nbnxnk4x4_PlainC; *ewald_excl = ewaldexclTable; -#ifdef GMX_X86_SSE2 +#ifdef GMX_NBNXN_SIMD { - /* On Intel Sandy-Bridge AVX-256 kernels are always faster. - * On AMD Bulldozer AVX-256 is much slower than AVX-128. 
- */ - if(gmx_cpuid_feature(cpuid_info, GMX_CPUID_FEATURE_X86_AVX) == 1 && - gmx_cpuid_vendor(cpuid_info) != GMX_CPUID_VENDOR_AMD) - { -#ifdef GMX_X86_AVX_256 - *kernel_type = nbk4xN_X86_SIMD256; -#else - *kernel_type = nbk4xN_X86_SIMD128; +#ifdef GMX_NBNXN_SIMD_4XN + *kernel_type = nbnxnk4xN_SIMD_4xN; #endif - } - else +#ifdef GMX_NBNXN_SIMD_2XNN + /* We expect the 2xNN kernels to be faster in most cases */ + *kernel_type = nbnxnk4xN_SIMD_2xNN; +#endif + +#if defined GMX_NBNXN_SIMD_4XN && defined GMX_X86_AVX_256 + if (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT) { - *kernel_type = nbk4xN_X86_SIMD128; + /* The raw pair rate of the 4x8 kernel is higher than 2x(4+4), + * 10% with HT, 50% without HT, but extra zeros interactions + * can compensate. As we currently don't detect the actual use + * of HT, switch to 4x8 to avoid a potential performance hit. + */ + *kernel_type = nbnxnk4xN_SIMD_4xN; } - - if (getenv("GMX_NBNXN_AVX128") != NULL) +#endif + if (getenv("GMX_NBNXN_SIMD_4XN") != NULL) { - *kernel_type = nbk4xN_X86_SIMD128; +#ifdef GMX_NBNXN_SIMD_2XNN + *kernel_type = nbnxnk4xN_SIMD_4xN; +#else + gmx_fatal(FARGS,"SIMD 4xN kernels requested, but Gromacs has been compiled without support for these kernels"); +#endif } - if (getenv("GMX_NBNXN_AVX256") != NULL) + if (getenv("GMX_NBNXN_SIMD_2XNN") != NULL) { -#ifdef GMX_X86_AVX_256 - *kernel_type = nbk4xN_X86_SIMD256; +#ifdef GMX_NBNXN_SIMD_2XNN + *kernel_type = nbnxnk4xN_SIMD_2xNN; #else - gmx_fatal(FARGS,"You requested AVX-256 nbnxn kernels, but GROMACS was built without AVX support"); + gmx_fatal(FARGS,"SIMD 2x(N+N) kernels requested, but Gromacs has been compiled without support for these kernels"); #endif } @@ -1466,6 +1473,7 @@ static void pick_nbnxn_kernel(FILE *fp, const gmx_hw_info_t *hwinfo, gmx_bool use_cpu_acceleration, gmx_bool *bUseGPU, + const t_inputrec *ir, int *kernel_type, int *ewald_excl, gmx_bool bDoNonbonded) @@ -1475,7 +1483,7 @@ static void pick_nbnxn_kernel(FILE *fp, assert(kernel_type); - *kernel_type = nbkNotSet; + *kernel_type = nbnxnkNotSet; *ewald_excl = ewaldexclTable; bEmulateGPUEnvVarSet = (getenv("GMX_EMULATE_GPU") != NULL); @@ -1521,7 +1529,7 @@ static void pick_nbnxn_kernel(FILE *fp, if (bEmulateGPU) { - *kernel_type = nbk8x8x8_PlainC; + *kernel_type = nbnxnk8x8x8_PlainC; if (bDoNonbonded) { @@ -1530,31 +1538,28 @@ static void pick_nbnxn_kernel(FILE *fp, } else if (bGPU) { - *kernel_type = nbk8x8x8_CUDA; + *kernel_type = nbnxnk8x8x8_CUDA; } - if (*kernel_type == nbkNotSet) + if (*kernel_type == nbnxnkNotSet) { if (use_cpu_acceleration) { - pick_nbnxn_kernel_cpu(fp,cr,hwinfo->cpuid_info, + pick_nbnxn_kernel_cpu(fp,cr,hwinfo->cpuid_info,ir, kernel_type,ewald_excl); } else { - *kernel_type = nbk4x4_PlainC; + *kernel_type = nbnxnk4x4_PlainC; } } if (bDoNonbonded && fp != NULL) { - if (MASTER(cr)) - { - fprintf(stderr,"Using %s non-bonded kernels\n", - nbk_name[*kernel_type]); - } - fprintf(fp,"\nUsing %s non-bonded kernels\n\n", - nbk_name[*kernel_type]); + fprintf(fp,"\nUsing %s %dx%d non-bonded kernels\n\n", + nbnxn_kernel_name[*kernel_type], + nbnxn_kernel_pairlist_simple(*kernel_type) ? 
NBNXN_CPU_CLUSTER_I_SIZE : NBNXN_GPU_CLUSTER_SIZE, + nbnxn_kernel_to_cj_size(*kernel_type)); } } @@ -1754,12 +1759,13 @@ static void init_nb_verlet(FILE *fp, { nbv->grp[i].nbl_lists.nnbl = 0; nbv->grp[i].nbat = NULL; - nbv->grp[i].kernel_type = nbkNotSet; + nbv->grp[i].kernel_type = nbnxnkNotSet; if (i == 0) /* local */ { pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration, &nbv->bUseGPU, + ir, &nbv->grp[i].kernel_type, &nbv->grp[i].ewald_excl, fr->bNonbonded); @@ -1771,6 +1777,7 @@ static void init_nb_verlet(FILE *fp, /* Use GPU for local, select a CPU kernel for non-local */ pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration, NULL, + ir, &nbv->grp[i].kernel_type, &nbv->grp[i].ewald_excl, fr->bNonbonded); @@ -1834,7 +1841,7 @@ static void init_nb_verlet(FILE *fp, for(i=0; ingrp; i++) { - if (nbv->grp[0].kernel_type == nbk8x8x8_CUDA) + if (nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA) { nb_alloc = &pmalloc; nb_free = &pfree; diff --git a/src/mdlib/nbnxn_atomdata.c b/src/mdlib/nbnxn_atomdata.c index aaa3f22c8e..ad231287cf 100644 --- a/src/mdlib/nbnxn_atomdata.c +++ b/src/mdlib/nbnxn_atomdata.c @@ -151,8 +151,8 @@ static void nbnxn_atomdata_output_init(nbnxn_atomdata_output_t *out, ma((void **)&out->Vvdw,out->nV*sizeof(*out->Vvdw)); ma((void **)&out->Vc ,out->nV*sizeof(*out->Vc )); - if (nb_kernel_type == nbk4xN_X86_SIMD128 || - nb_kernel_type == nbk4xN_X86_SIMD256) + if (nb_kernel_type == nbnxnk4xN_SIMD_4xN || + nb_kernel_type == nbnxnk4xN_SIMD_2xNN) { cj_size = nbnxn_kernel_to_cj_size(nb_kernel_type); out->nVS = nenergrp*nenergrp*stride*(cj_size>>1)*cj_size; @@ -598,17 +598,25 @@ void nbnxn_atomdata_init(FILE *fp, nbat->lj_comb = NULL; if (simple) { + int pack_x; + switch (nb_kernel_type) { - case nbk4xN_X86_SIMD128: - nbat->XFormat = nbatX4; - break; - case nbk4xN_X86_SIMD256: -#ifndef GMX_DOUBLE - nbat->XFormat = nbatX8; -#else - nbat->XFormat = nbatX4; -#endif + case nbnxnk4xN_SIMD_4xN: + case nbnxnk4xN_SIMD_2xNN: + pack_x = max(NBNXN_CPU_CLUSTER_I_SIZE, + nbnxn_kernel_to_cj_size(nb_kernel_type)); + switch (pack_x) + { + case 4: + nbat->XFormat = nbatX4; + break; + case 8: + nbat->XFormat = nbatX8; + break; + default: + gmx_incons("Unsupported packing width"); + } break; default: nbat->XFormat = nbatXYZ; diff --git a/src/mdlib/nbnxn_internal.h b/src/mdlib/nbnxn_internal.h index 9d0be66e30..8e9f005b7c 100644 --- a/src/mdlib/nbnxn_internal.h +++ b/src/mdlib/nbnxn_internal.h @@ -95,29 +95,32 @@ typedef struct { int nsubc_tot; /* Total number of subcell, used for printing */ } nbnxn_grid_t; -#ifdef NBNXN_SEARCH_SSE +#ifdef GMX_NBNXN_SIMD +#if GMX_NBNXN_SIMD_BITWIDTH == 128 #define GMX_MM128_HERE +#else +#if GMX_NBNXN_SIMD_BITWIDTH == 256 +#define GMX_MM256_HERE +#else +#error "unsupported GMX_NBNXN_SIMD_BITWIDTH" +#endif +#endif #include "gmx_x86_simd_macros.h" -typedef struct nbnxn_x_ci_x86_simd128 { + +typedef struct nbnxn_x_ci_simd_4xn { /* The i-cluster coordinates for simple search */ gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0; gmx_mm_pr ix_SSE1,iy_SSE1,iz_SSE1; gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2; gmx_mm_pr ix_SSE3,iy_SSE3,iz_SSE3; -} nbnxn_x_ci_x86_simd128_t; -#undef GMX_MM128_HERE -#ifdef GMX_X86_AVX_256 -#define GMX_MM256_HERE -#include "gmx_x86_simd_macros.h" -typedef struct nbnxn_x_ci_x86_simd256 { +} nbnxn_x_ci_simd_4xn_t; + +typedef struct nbnxn_x_ci_simd_2xnn { /* The i-cluster coordinates for simple search */ gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0; - gmx_mm_pr ix_SSE1,iy_SSE1,iz_SSE1; gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2; - gmx_mm_pr ix_SSE3,iy_SSE3,iz_SSE3; -} 
nbnxn_x_ci_x86_simd256_t; -#undef GMX_MM256_HERE -#endif +} nbnxn_x_ci_simd_2xnn_t; + #endif /* Working data for the actual i-supercell during pair search */ @@ -126,11 +129,9 @@ typedef struct nbnxn_list_work { float *bb_ci; /* The bounding boxes, pbc shifted, for each cluster */ real *x_ci; /* The coordinates, pbc shifted, for each atom */ -#ifdef NBNXN_SEARCH_SSE - nbnxn_x_ci_x86_simd128_t *x_ci_x86_simd128; -#ifdef GMX_X86_AVX_256 - nbnxn_x_ci_x86_simd256_t *x_ci_x86_simd256; -#endif +#ifdef GMX_NBNXN_SIMD + nbnxn_x_ci_simd_4xn_t *x_ci_simd_4xn; + nbnxn_x_ci_simd_2xnn_t *x_ci_simd_2xnn; #endif int cj_ind; /* The current cj_ind index for the current list */ int cj4_init; /* The first unitialized cj4 block */ @@ -155,17 +156,18 @@ gmx_icell_set_x_t(int ci, nbnxn_list_work_t *work); static gmx_icell_set_x_t icell_set_x_simple; -#ifdef NBNXN_SEARCH_SSE -static gmx_icell_set_x_t icell_set_x_simple_x86_simd128; -#ifdef GMX_X86_AVX_256 -static gmx_icell_set_x_t icell_set_x_simple_x86_simd256; -#endif +#ifdef GMX_NBNXN_SIMD +static gmx_icell_set_x_t icell_set_x_simple_simd_4xn; +static gmx_icell_set_x_t icell_set_x_simple_simd_2xnn; #endif static gmx_icell_set_x_t icell_set_x_supersub; #ifdef NBNXN_SEARCH_SSE static gmx_icell_set_x_t icell_set_x_supersub_sse8; #endif +#undef GMX_MM128_HERE +#undef GMX_MM256_HERE + /* Local cycle count struct for profiling */ typedef struct { int count; diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.c b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn.c similarity index 83% rename from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.c rename to src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn.c index 8018f65f5e..a294bad9c0 100644 --- a/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.c +++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn.c @@ -49,18 +49,26 @@ #include "../nbnxn_consts.h" #include "nbnxn_kernel_common.h" -#ifdef GMX_X86_SSE2 +#ifdef GMX_NBNXN_SIMD_2XNN -#include "nbnxn_kernel_x86_simd128.h" +#include "nbnxn_kernel_simd_2xnn.h" -/* Include all flavors of the 128-bit SSE or AVX kernel loops */ +/* Include all flavors of the SSE or AVX 2x(N+N) kernel loops */ +#if GMX_NBNXN_SIMD_BITWIDTH == 128 #define GMX_MM128_HERE +#else +#if GMX_NBNXN_SIMD_BITWIDTH == 256 +#define GMX_MM256_HERE +#else +#error "unsupported GMX_NBNXN_SIMD_BITWIDTH" +#endif +#endif /* Analytical reaction-field kernels */ #define CALC_COUL_RF -#include "nbnxn_kernel_x86_simd_includes.h" +#include "nbnxn_kernel_simd_2xnn_includes.h" #undef CALC_COUL_RF @@ -68,11 +76,11 @@ #define CALC_COUL_TAB /* Single cut-off: rcoulomb = rvdw */ -#include "nbnxn_kernel_x86_simd_includes.h" +#include "nbnxn_kernel_simd_2xnn_includes.h" /* Twin cut-off: rcoulomb >= rvdw */ #define VDW_CUTOFF_CHECK -#include "nbnxn_kernel_x86_simd_includes.h" +#include "nbnxn_kernel_simd_2xnn_includes.h" #undef VDW_CUTOFF_CHECK #undef CALC_COUL_TAB @@ -81,11 +89,11 @@ #define CALC_COUL_EWALD /* Single cut-off: rcoulomb = rvdw */ -#include "nbnxn_kernel_x86_simd_includes.h" +#include "nbnxn_kernel_simd_2xnn_includes.h" /* Twin cut-off: rcoulomb >= rvdw */ #define VDW_CUTOFF_CHECK -#include "nbnxn_kernel_x86_simd_includes.h" +#include "nbnxn_kernel_simd_2xnn_includes.h" #undef VDW_CUTOFF_CHECK #undef CALC_COUL_EWALD @@ -109,7 +117,7 @@ typedef void (*p_nbk_func_noener)(const nbnxn_pairlist_t *nbl, enum { coultRF, coultTAB, coultTAB_TWIN, coultEWALD, coultEWALD_TWIN, coultNR }; -#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_ener +#define NBK_FN(elec,ljcomb) 
nbnxn_kernel_simd_2xnn_##elec##_comb_##ljcomb##_ener static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] = { { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) }, { NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) }, @@ -118,7 +126,7 @@ static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] = { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } }; #undef NBK_FN -#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_energrp +#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_2xnn_##elec##_comb_##ljcomb##_energrp static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] = { { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) }, { NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) }, @@ -127,7 +135,7 @@ static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] = { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } }; #undef NBK_FN -#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_noener +#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_2xnn_##elec##_comb_##ljcomb##_noener static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] = { { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) }, { NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) }, @@ -141,15 +149,14 @@ static void reduce_group_energies(int ng,int ng_2log, const real *VSvdw,const real *VSc, real *Vvdw,real *Vc) { + const int simd_width = GMX_X86_SIMD_WIDTH_HERE; + const int unrollj_half = GMX_X86_SIMD_WIDTH_HERE/4; int ng_p2,i,j,j0,j1,c,s; -#define SIMD_WIDTH (GMX_X86_SIMD_WIDTH_HERE) -#define SIMD_WIDTH_HALF (GMX_X86_SIMD_WIDTH_HERE/2) - ng_p2 = (1<>(2*UNROLLJ))); /* Intel Compiler version 12.1.3 20120130 is buggy: use cast. * With gcc we don't need the cast, but it's faster. */ #define cast_cvt(x) _mm256_cvtepi32_ps(_mm256_castps_si256(x)) int_SSE0 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr,mask0)),zero_SSE); - int_SSE1 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr,mask1)),zero_SSE); - int_SSE2 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(masksh_pr,mask0)),zero_SSE); - int_SSE3 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(masksh_pr,mask1)),zero_SSE); -#undef cast_cvt -#else - /* Load integer interaction mask */ - /* With AVX there are no integer operations, - * and there is no int to double conversion, so cast to float - */ - __m256 mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(l_cj[cjind].excl)); -#define cast_cvt(x) _mm256_castps_pd(_mm256_cvtepi32_ps(_mm256_castps_si256(x))) - int_SSE0 = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps,mask0)),zero_SSE); - int_SSE1 = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps,mask1)),zero_SSE); - int_SSE2 = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps,mask2)),zero_SSE); - int_SSE3 = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps,mask3)),zero_SSE); + int_SSE2 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr,mask2)),zero_SSE); #undef cast_cvt -#endif } #endif -#endif /* load j atom coordinates */ - jxSSE = gmx_load_pr(x+ajx); - jySSE = gmx_load_pr(x+ajy); - jzSSE = gmx_load_pr(x+ajz); - + jxSSE = gmx_loaddh_pr(x+ajx); + jySSE = gmx_loaddh_pr(x+ajy); + jzSSE = gmx_loaddh_pr(x+ajz); + /* Calculate distance */ dx_SSE0 = gmx_sub_pr(ix_SSE0,jxSSE); dy_SSE0 = gmx_sub_pr(iy_SSE0,jySSE); dz_SSE0 = gmx_sub_pr(iz_SSE0,jzSSE); - dx_SSE1 = gmx_sub_pr(ix_SSE1,jxSSE); - dy_SSE1 = gmx_sub_pr(iy_SSE1,jySSE); - dz_SSE1 = gmx_sub_pr(iz_SSE1,jzSSE); dx_SSE2 = gmx_sub_pr(ix_SSE2,jxSSE); dy_SSE2 = gmx_sub_pr(iy_SSE2,jySSE); dz_SSE2 = gmx_sub_pr(iz_SSE2,jzSSE); - dx_SSE3 = gmx_sub_pr(ix_SSE3,jxSSE); - dy_SSE3 = gmx_sub_pr(iy_SSE3,jySSE); - dz_SSE3 = 
gmx_sub_pr(iz_SSE3,jzSSE); - + /* rsq = dx*dx+dy*dy+dz*dz */ rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0); - rsq_SSE1 = gmx_calc_rsq_pr(dx_SSE1,dy_SSE1,dz_SSE1); rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2,dy_SSE2,dz_SSE2); - rsq_SSE3 = gmx_calc_rsq_pr(dx_SSE3,dy_SSE3,dz_SSE3); #ifndef CUTOFF_BLENDV wco_SSE0 = gmx_cmplt_pr(rsq_SSE0,rc2_SSE); - wco_SSE1 = gmx_cmplt_pr(rsq_SSE1,rc2_SSE); wco_SSE2 = gmx_cmplt_pr(rsq_SSE2,rc2_SSE); - wco_SSE3 = gmx_cmplt_pr(rsq_SSE3,rc2_SSE); #endif #ifdef CHECK_EXCLS @@ -368,49 +295,15 @@ if (cj == ci_sh) { wco_SSE0 = gmx_and_pr(wco_SSE0,diag_SSE0); - wco_SSE1 = gmx_and_pr(wco_SSE1,diag_SSE1); wco_SSE2 = gmx_and_pr(wco_SSE2,diag_SSE2); - wco_SSE3 = gmx_and_pr(wco_SSE3,diag_SSE3); } #else -#if UNROLLJ < UNROLLI - if (cj == ci_sh*2) - { - wco_SSE0 = gmx_and_pr(wco_SSE0,diag0_SSE0); - wco_SSE1 = gmx_and_pr(wco_SSE1,diag0_SSE1); - wco_SSE2 = gmx_and_pr(wco_SSE2,diag0_SSE2); - wco_SSE3 = gmx_and_pr(wco_SSE3,diag0_SSE3); - } - if (cj == ci_sh*2 + 1) - { - wco_SSE0 = gmx_and_pr(wco_SSE0,diag1_SSE0); - wco_SSE1 = gmx_and_pr(wco_SSE1,diag1_SSE1); - wco_SSE2 = gmx_and_pr(wco_SSE2,diag1_SSE2); - wco_SSE3 = gmx_and_pr(wco_SSE3,diag1_SSE3); - } -#else - if (cj*2 == ci_sh) - { - wco_SSE0 = gmx_and_pr(wco_SSE0,diag0_SSE0); - wco_SSE1 = gmx_and_pr(wco_SSE1,diag0_SSE1); - wco_SSE2 = gmx_and_pr(wco_SSE2,diag0_SSE2); - wco_SSE3 = gmx_and_pr(wco_SSE3,diag0_SSE3); - } - else if (cj*2 + 1 == ci_sh) - { - wco_SSE0 = gmx_and_pr(wco_SSE0,diag1_SSE0); - wco_SSE1 = gmx_and_pr(wco_SSE1,diag1_SSE1); - wco_SSE2 = gmx_and_pr(wco_SSE2,diag1_SSE2); - wco_SSE3 = gmx_and_pr(wco_SSE3,diag1_SSE3); - } -#endif +#error "only UNROLLJ == UNROLLI currently supported in the joined kernels" #endif #else /* EXCL_FORCES */ /* Remove all excluded atom pairs from the list */ wco_SSE0 = gmx_and_pr(wco_SSE0,int_SSE0); - wco_SSE1 = gmx_and_pr(wco_SSE1,int_SSE1); wco_SSE2 = gmx_and_pr(wco_SSE2,int_SSE2); - wco_SSE3 = gmx_and_pr(wco_SSE3,int_SSE3); #endif #endif @@ -435,72 +328,51 @@ #ifdef CHECK_EXCLS /* For excluded pairs add a small number to avoid r^-6 = NaN */ rsq_SSE0 = gmx_add_pr(rsq_SSE0,gmx_andnot_pr(int_SSE0,avoid_sing_SSE)); - rsq_SSE1 = gmx_add_pr(rsq_SSE1,gmx_andnot_pr(int_SSE1,avoid_sing_SSE)); rsq_SSE2 = gmx_add_pr(rsq_SSE2,gmx_andnot_pr(int_SSE2,avoid_sing_SSE)); - rsq_SSE3 = gmx_add_pr(rsq_SSE3,gmx_andnot_pr(int_SSE3,avoid_sing_SSE)); #endif /* Calculate 1/r */ -#ifndef GMX_DOUBLE rinv_SSE0 = gmx_invsqrt_pr(rsq_SSE0); - rinv_SSE1 = gmx_invsqrt_pr(rsq_SSE1); rinv_SSE2 = gmx_invsqrt_pr(rsq_SSE2); - rinv_SSE3 = gmx_invsqrt_pr(rsq_SSE3); -#else - GMX_MM_INVSQRT2_PD(rsq_SSE0,rsq_SSE1,rinv_SSE0,rinv_SSE1); - GMX_MM_INVSQRT2_PD(rsq_SSE2,rsq_SSE3,rinv_SSE2,rinv_SSE3); -#endif #ifdef CALC_COULOMB /* Load parameters for j atom */ - jq_SSE = gmx_load_pr(q+aj); + jq_SSE = gmx_loaddh_pr(q+aj); qq_SSE0 = gmx_mul_pr(iq_SSE0,jq_SSE); - qq_SSE1 = gmx_mul_pr(iq_SSE1,jq_SSE); qq_SSE2 = gmx_mul_pr(iq_SSE2,jq_SSE); - qq_SSE3 = gmx_mul_pr(iq_SSE3,jq_SSE); #endif #ifdef CALC_LJ #if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C - load_lj_pair_params(nbfp0,type,aj,c6_SSE0,c12_SSE0); - load_lj_pair_params(nbfp1,type,aj,c6_SSE1,c12_SSE1); + load_lj_pair_params2(nbfp0,type,aj,c6_SSE0,c12_SSE0); #ifndef HALF_LJ - load_lj_pair_params(nbfp2,type,aj,c6_SSE2,c12_SSE2); - load_lj_pair_params(nbfp3,type,aj,c6_SSE3,c12_SSE3); + load_lj_pair_params2(nbfp2,type,aj,c6_SSE2,c12_SSE2); #endif #endif /* not defined any LJ rule */ #ifdef LJ_COMB_GEOM - c6s_j_SSE = gmx_load_pr(ljc+aj2+0); - c12s_j_SSE = 
gmx_load_pr(ljc+aj2+STRIDE); + c6s_j_SSE = gmx_loaddh_pr(ljc+aj2+0); + c12s_j_SSE = gmx_loaddh_pr(ljc+aj2+STRIDE); c6_SSE0 = gmx_mul_pr(c6s_SSE0 ,c6s_j_SSE ); - c6_SSE1 = gmx_mul_pr(c6s_SSE1 ,c6s_j_SSE ); #ifndef HALF_LJ c6_SSE2 = gmx_mul_pr(c6s_SSE2 ,c6s_j_SSE ); - c6_SSE3 = gmx_mul_pr(c6s_SSE3 ,c6s_j_SSE ); #endif c12_SSE0 = gmx_mul_pr(c12s_SSE0,c12s_j_SSE); - c12_SSE1 = gmx_mul_pr(c12s_SSE1,c12s_j_SSE); #ifndef HALF_LJ c12_SSE2 = gmx_mul_pr(c12s_SSE2,c12s_j_SSE); - c12_SSE3 = gmx_mul_pr(c12s_SSE3,c12s_j_SSE); #endif #endif /* LJ_COMB_GEOM */ #ifdef LJ_COMB_LB - hsig_j_SSE = gmx_load_pr(ljc+aj2+0); - seps_j_SSE = gmx_load_pr(ljc+aj2+STRIDE); + hsig_j_SSE = gmx_loaddh_pr(ljc+aj2+0); + seps_j_SSE = gmx_loaddh_pr(ljc+aj2+STRIDE); sig_SSE0 = gmx_add_pr(hsig_i_SSE0,hsig_j_SSE); - sig_SSE1 = gmx_add_pr(hsig_i_SSE1,hsig_j_SSE); eps_SSE0 = gmx_mul_pr(seps_i_SSE0,seps_j_SSE); - eps_SSE1 = gmx_mul_pr(seps_i_SSE1,seps_j_SSE); #ifndef HALF_LJ sig_SSE2 = gmx_add_pr(hsig_i_SSE2,hsig_j_SSE); - sig_SSE3 = gmx_add_pr(hsig_i_SSE3,hsig_j_SSE); eps_SSE2 = gmx_mul_pr(seps_i_SSE2,seps_j_SSE); - eps_SSE3 = gmx_mul_pr(seps_i_SSE3,seps_j_SSE); #endif #endif /* LJ_COMB_LB */ @@ -508,21 +380,15 @@ #ifndef CUTOFF_BLENDV rinv_SSE0 = gmx_and_pr(rinv_SSE0,wco_SSE0); - rinv_SSE1 = gmx_and_pr(rinv_SSE1,wco_SSE1); rinv_SSE2 = gmx_and_pr(rinv_SSE2,wco_SSE2); - rinv_SSE3 = gmx_and_pr(rinv_SSE3,wco_SSE3); #else /* We only need to mask for the cut-off: blendv is faster */ rinv_SSE0 = gmx_blendv_pr(rinv_SSE0,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE0)); - rinv_SSE1 = gmx_blendv_pr(rinv_SSE1,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE1)); rinv_SSE2 = gmx_blendv_pr(rinv_SSE2,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE2)); - rinv_SSE3 = gmx_blendv_pr(rinv_SSE3,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE3)); #endif rinvsq_SSE0 = gmx_mul_pr(rinv_SSE0,rinv_SSE0); - rinvsq_SSE1 = gmx_mul_pr(rinv_SSE1,rinv_SSE1); rinvsq_SSE2 = gmx_mul_pr(rinv_SSE2,rinv_SSE2); - rinvsq_SSE3 = gmx_mul_pr(rinv_SSE3,rinv_SSE3); #ifdef CALC_COULOMB /* Note that here we calculate force*r, not the usual force/r. 
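[Annotation, not part of the patch] The gmx_loaddh_pr loads above are the heart of the 2x(N+N) scheme: one 256-bit register holds the four j-atom values duplicated into both 128-bit halves (gmx_mm256_load4_ps from the first hunk), while the i-atom values were loaded in the outer loop with one i-atom broadcast per 128-bit half (the gmx_load2_hpr macro further down in the outer-loop file). The sketch below is illustrative only: the helper names are made up, unaligned loads are used for simplicity, and it assumes AVX-256 in single precision, whereas the kernel itself works on aligned, padded arrays through the gmx_*_pr macros.

/* Hypothetical helpers showing the 2x(4+4) data layout with plain AVX. */
#include <immintrin.h>

/* Duplicate four j-atom values into both 128-bit halves,
 * as gmx_loaddh_pr / gmx_mm256_load4_ps do. */
static __m256 load_j4_dup(const float *j)
{
    __m128 a = _mm_loadu_ps(j);                               /* j0 j1 j2 j3 */
    return _mm256_insertf128_ps(_mm256_castps128_ps256(a), a, 0x1);
}

/* Broadcast i-atom 0 into the low half and i-atom 1 into the high half,
 * as the gmx_load2_hpr macro does in the outer loop. */
static __m256 load_i2_bcast(const float *i)
{
    __m256 lo = _mm256_castps128_ps256(_mm_set1_ps(i[0]));
    return _mm256_insertf128_ps(lo, _mm_set1_ps(i[1]), 0x1);
}

/* A single 256-bit subtraction then yields the x-distances of
 * 2 i-atoms x 4 j-atoms = 8 pairs, i.e. one 2x(4+4) block. */
static __m256 dx_2x4(const float *ix, const float *jx)
{
    return _mm256_sub_ps(load_i2_bcast(ix), load_j4_dup(jx));
}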
@@ -534,29 +400,21 @@ #ifdef EXCL_FORCES /* Only add 1/r for non-excluded atom pairs */ rinv_ex_SSE0 = gmx_and_pr(rinv_SSE0,int_SSE0); - rinv_ex_SSE1 = gmx_and_pr(rinv_SSE1,int_SSE1); rinv_ex_SSE2 = gmx_and_pr(rinv_SSE2,int_SSE2); - rinv_ex_SSE3 = gmx_and_pr(rinv_SSE3,int_SSE3); #else /* No exclusion forces, we always need 1/r */ #define rinv_ex_SSE0 rinv_SSE0 -#define rinv_ex_SSE1 rinv_SSE1 #define rinv_ex_SSE2 rinv_SSE2 -#define rinv_ex_SSE3 rinv_SSE3 #endif #ifdef CALC_COUL_RF /* Electrostatic interactions */ frcoul_SSE0 = gmx_mul_pr(qq_SSE0,gmx_add_pr(rinv_ex_SSE0,gmx_mul_pr(rsq_SSE0,mrc_3_SSE))); - frcoul_SSE1 = gmx_mul_pr(qq_SSE1,gmx_add_pr(rinv_ex_SSE1,gmx_mul_pr(rsq_SSE1,mrc_3_SSE))); frcoul_SSE2 = gmx_mul_pr(qq_SSE2,gmx_add_pr(rinv_ex_SSE2,gmx_mul_pr(rsq_SSE2,mrc_3_SSE))); - frcoul_SSE3 = gmx_mul_pr(qq_SSE3,gmx_add_pr(rinv_ex_SSE3,gmx_mul_pr(rsq_SSE3,mrc_3_SSE))); #ifdef CALC_ENERGIES vcoul_SSE0 = gmx_mul_pr(qq_SSE0,gmx_add_pr(rinv_ex_SSE0,gmx_add_pr(gmx_mul_pr(rsq_SSE0,hrc_3_SSE),moh_rc_SSE))); - vcoul_SSE1 = gmx_mul_pr(qq_SSE1,gmx_add_pr(rinv_ex_SSE1,gmx_add_pr(gmx_mul_pr(rsq_SSE1,hrc_3_SSE),moh_rc_SSE))); vcoul_SSE2 = gmx_mul_pr(qq_SSE2,gmx_add_pr(rinv_ex_SSE2,gmx_add_pr(gmx_mul_pr(rsq_SSE2,hrc_3_SSE),moh_rc_SSE))); - vcoul_SSE3 = gmx_mul_pr(qq_SSE3,gmx_add_pr(rinv_ex_SSE3,gmx_add_pr(gmx_mul_pr(rsq_SSE3,hrc_3_SSE),moh_rc_SSE))); #endif #endif @@ -566,30 +424,20 @@ */ #ifndef CUTOFF_BLENDV brsq_SSE0 = gmx_mul_pr(beta2_SSE,gmx_and_pr(rsq_SSE0,wco_SSE0)); - brsq_SSE1 = gmx_mul_pr(beta2_SSE,gmx_and_pr(rsq_SSE1,wco_SSE1)); brsq_SSE2 = gmx_mul_pr(beta2_SSE,gmx_and_pr(rsq_SSE2,wco_SSE2)); - brsq_SSE3 = gmx_mul_pr(beta2_SSE,gmx_and_pr(rsq_SSE3,wco_SSE3)); #else /* Strangely, putting mul on a separate line is slower (icc 13) */ brsq_SSE0 = gmx_mul_pr(beta2_SSE,gmx_blendv_pr(rsq_SSE0,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE0))); - brsq_SSE1 = gmx_mul_pr(beta2_SSE,gmx_blendv_pr(rsq_SSE1,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE1))); brsq_SSE2 = gmx_mul_pr(beta2_SSE,gmx_blendv_pr(rsq_SSE2,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE2))); - brsq_SSE3 = gmx_mul_pr(beta2_SSE,gmx_blendv_pr(rsq_SSE3,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE3))); #endif ewcorr_SSE0 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE0),beta_SSE); - ewcorr_SSE1 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE1),beta_SSE); ewcorr_SSE2 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE2),beta_SSE); - ewcorr_SSE3 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE3),beta_SSE); frcoul_SSE0 = gmx_mul_pr(qq_SSE0,gmx_add_pr(rinv_ex_SSE0,gmx_mul_pr(ewcorr_SSE0,brsq_SSE0))); - frcoul_SSE1 = gmx_mul_pr(qq_SSE1,gmx_add_pr(rinv_ex_SSE1,gmx_mul_pr(ewcorr_SSE1,brsq_SSE1))); frcoul_SSE2 = gmx_mul_pr(qq_SSE2,gmx_add_pr(rinv_ex_SSE2,gmx_mul_pr(ewcorr_SSE2,brsq_SSE2))); - frcoul_SSE3 = gmx_mul_pr(qq_SSE3,gmx_add_pr(rinv_ex_SSE3,gmx_mul_pr(ewcorr_SSE3,brsq_SSE3))); #ifdef CALC_ENERGIES vc_sub_SSE0 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE0),beta_SSE); - vc_sub_SSE1 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE1),beta_SSE); vc_sub_SSE2 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE2),beta_SSE); - vc_sub_SSE3 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE3),beta_SSE); #endif #endif /* CALC_COUL_EWALD */ @@ -597,35 +445,23 @@ #ifdef CALC_COUL_TAB /* Electrostatic interactions */ r_SSE0 = gmx_mul_pr(rsq_SSE0,rinv_SSE0); - r_SSE1 = gmx_mul_pr(rsq_SSE1,rinv_SSE1); r_SSE2 = gmx_mul_pr(rsq_SSE2,rinv_SSE2); - r_SSE3 = gmx_mul_pr(rsq_SSE3,rinv_SSE3); /* Convert r to scaled table units */ rs_SSE0 = gmx_mul_pr(r_SSE0,invtsp_SSE); - rs_SSE1 = gmx_mul_pr(r_SSE1,invtsp_SSE); rs_SSE2 = gmx_mul_pr(r_SSE2,invtsp_SSE); - rs_SSE3 = 
gmx_mul_pr(r_SSE3,invtsp_SSE); /* Truncate scaled r to an int */ ti_SSE0 = gmx_cvttpr_epi32(rs_SSE0); - ti_SSE1 = gmx_cvttpr_epi32(rs_SSE1); ti_SSE2 = gmx_cvttpr_epi32(rs_SSE2); - ti_SSE3 = gmx_cvttpr_epi32(rs_SSE3); #ifdef GMX_X86_SSE4_1 /* SSE4.1 floor is faster than gmx_cvtepi32_ps int->float cast */ rf_SSE0 = gmx_floor_pr(rs_SSE0); - rf_SSE1 = gmx_floor_pr(rs_SSE1); rf_SSE2 = gmx_floor_pr(rs_SSE2); - rf_SSE3 = gmx_floor_pr(rs_SSE3); #else rf_SSE0 = gmx_cvtepi32_pr(ti_SSE0); - rf_SSE1 = gmx_cvtepi32_pr(ti_SSE1); rf_SSE2 = gmx_cvtepi32_pr(ti_SSE2); - rf_SSE3 = gmx_cvtepi32_pr(ti_SSE3); #endif frac_SSE0 = gmx_sub_pr(rs_SSE0,rf_SSE0); - frac_SSE1 = gmx_sub_pr(rs_SSE1,rf_SSE1); frac_SSE2 = gmx_sub_pr(rs_SSE2,rf_SSE2); - frac_SSE3 = gmx_sub_pr(rs_SSE3,rf_SSE3); /* Load and interpolate table forces and possibly energies. * Force and energy can be combined in one table, stride 4: FDV0 @@ -634,36 +470,24 @@ */ #ifndef CALC_ENERGIES load_table_f(tab_coul_F,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0); - load_table_f(tab_coul_F,ti_SSE1,ti1,ctab0_SSE1,ctab1_SSE1); load_table_f(tab_coul_F,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2); - load_table_f(tab_coul_F,ti_SSE3,ti3,ctab0_SSE3,ctab1_SSE3); #else #ifdef TAB_FDV0 load_table_f_v(tab_coul_F,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0,ctabv_SSE0); - load_table_f_v(tab_coul_F,ti_SSE1,ti1,ctab0_SSE1,ctab1_SSE1,ctabv_SSE1); load_table_f_v(tab_coul_F,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2,ctabv_SSE2); - load_table_f_v(tab_coul_F,ti_SSE3,ti3,ctab0_SSE3,ctab1_SSE3,ctabv_SSE3); #else load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0,ctabv_SSE0); - load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE1,ti1,ctab0_SSE1,ctab1_SSE1,ctabv_SSE1); load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2,ctabv_SSE2); - load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE3,ti3,ctab0_SSE3,ctab1_SSE3,ctabv_SSE3); #endif #endif fsub_SSE0 = gmx_add_pr(ctab0_SSE0,gmx_mul_pr(frac_SSE0,ctab1_SSE0)); - fsub_SSE1 = gmx_add_pr(ctab0_SSE1,gmx_mul_pr(frac_SSE1,ctab1_SSE1)); fsub_SSE2 = gmx_add_pr(ctab0_SSE2,gmx_mul_pr(frac_SSE2,ctab1_SSE2)); - fsub_SSE3 = gmx_add_pr(ctab0_SSE3,gmx_mul_pr(frac_SSE3,ctab1_SSE3)); frcoul_SSE0 = gmx_mul_pr(qq_SSE0,gmx_sub_pr(rinv_ex_SSE0,gmx_mul_pr(fsub_SSE0,r_SSE0))); - frcoul_SSE1 = gmx_mul_pr(qq_SSE1,gmx_sub_pr(rinv_ex_SSE1,gmx_mul_pr(fsub_SSE1,r_SSE1))); frcoul_SSE2 = gmx_mul_pr(qq_SSE2,gmx_sub_pr(rinv_ex_SSE2,gmx_mul_pr(fsub_SSE2,r_SSE2))); - frcoul_SSE3 = gmx_mul_pr(qq_SSE3,gmx_sub_pr(rinv_ex_SSE3,gmx_mul_pr(fsub_SSE3,r_SSE3))); #ifdef CALC_ENERGIES vc_sub_SSE0 = gmx_add_pr(ctabv_SSE0,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE0),gmx_add_pr(ctab0_SSE0,fsub_SSE0))); - vc_sub_SSE1 = gmx_add_pr(ctabv_SSE1,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE1),gmx_add_pr(ctab0_SSE1,fsub_SSE1))); vc_sub_SSE2 = gmx_add_pr(ctabv_SSE2,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE2),gmx_add_pr(ctab0_SSE2,fsub_SSE2))); - vc_sub_SSE3 = gmx_add_pr(ctabv_SSE3,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE3),gmx_add_pr(ctab0_SSE3,fsub_SSE3))); #endif #endif /* CALC_COUL_TAB */ @@ -672,30 +496,21 @@ /* Add Ewald potential shift to vc_sub for convenience */ #ifdef CHECK_EXCLS vc_sub_SSE0 = gmx_add_pr(vc_sub_SSE0,gmx_and_pr(sh_ewald_SSE,int_SSE0)); - vc_sub_SSE1 = gmx_add_pr(vc_sub_SSE1,gmx_and_pr(sh_ewald_SSE,int_SSE1)); vc_sub_SSE2 = gmx_add_pr(vc_sub_SSE2,gmx_and_pr(sh_ewald_SSE,int_SSE2)); - vc_sub_SSE3 = gmx_add_pr(vc_sub_SSE3,gmx_and_pr(sh_ewald_SSE,int_SSE3)); #else vc_sub_SSE0 = gmx_add_pr(vc_sub_SSE0,sh_ewald_SSE); - vc_sub_SSE1 = gmx_add_pr(vc_sub_SSE1,sh_ewald_SSE); vc_sub_SSE2 = 
gmx_add_pr(vc_sub_SSE2,sh_ewald_SSE); - vc_sub_SSE3 = gmx_add_pr(vc_sub_SSE3,sh_ewald_SSE); #endif #endif vcoul_SSE0 = gmx_mul_pr(qq_SSE0,gmx_sub_pr(rinv_ex_SSE0,vc_sub_SSE0)); - vcoul_SSE1 = gmx_mul_pr(qq_SSE1,gmx_sub_pr(rinv_ex_SSE1,vc_sub_SSE1)); vcoul_SSE2 = gmx_mul_pr(qq_SSE2,gmx_sub_pr(rinv_ex_SSE2,vc_sub_SSE2)); - vcoul_SSE3 = gmx_mul_pr(qq_SSE3,gmx_sub_pr(rinv_ex_SSE3,vc_sub_SSE3)); - #endif #ifdef CALC_ENERGIES /* Mask energy for cut-off and diagonal */ vcoul_SSE0 = gmx_and_pr(vcoul_SSE0,wco_SSE0); - vcoul_SSE1 = gmx_and_pr(vcoul_SSE1,wco_SSE1); vcoul_SSE2 = gmx_and_pr(vcoul_SSE2,wco_SSE2); - vcoul_SSE3 = gmx_and_pr(vcoul_SSE3,wco_SSE3); #endif #endif /* CALC_COULOMB */ @@ -705,128 +520,92 @@ #ifdef VDW_CUTOFF_CHECK wco_vdw_SSE0 = gmx_cmplt_pr(rsq_SSE0,rcvdw2_SSE); - wco_vdw_SSE1 = gmx_cmplt_pr(rsq_SSE1,rcvdw2_SSE); #ifndef HALF_LJ wco_vdw_SSE2 = gmx_cmplt_pr(rsq_SSE2,rcvdw2_SSE); - wco_vdw_SSE3 = gmx_cmplt_pr(rsq_SSE3,rcvdw2_SSE); #endif #else /* Same cut-off for Coulomb and VdW, reuse the registers */ #define wco_vdw_SSE0 wco_SSE0 -#define wco_vdw_SSE1 wco_SSE1 #define wco_vdw_SSE2 wco_SSE2 -#define wco_vdw_SSE3 wco_SSE3 #endif #ifndef LJ_COMB_LB rinvsix_SSE0 = gmx_mul_pr(rinvsq_SSE0,gmx_mul_pr(rinvsq_SSE0,rinvsq_SSE0)); - rinvsix_SSE1 = gmx_mul_pr(rinvsq_SSE1,gmx_mul_pr(rinvsq_SSE1,rinvsq_SSE1)); #ifdef EXCL_FORCES rinvsix_SSE0 = gmx_and_pr(rinvsix_SSE0,int_SSE0); - rinvsix_SSE1 = gmx_and_pr(rinvsix_SSE1,int_SSE1); #endif #ifndef HALF_LJ rinvsix_SSE2 = gmx_mul_pr(rinvsq_SSE2,gmx_mul_pr(rinvsq_SSE2,rinvsq_SSE2)); - rinvsix_SSE3 = gmx_mul_pr(rinvsq_SSE3,gmx_mul_pr(rinvsq_SSE3,rinvsq_SSE3)); #ifdef EXCL_FORCES rinvsix_SSE2 = gmx_and_pr(rinvsix_SSE2,int_SSE2); - rinvsix_SSE3 = gmx_and_pr(rinvsix_SSE3,int_SSE3); #endif #endif #ifdef VDW_CUTOFF_CHECK rinvsix_SSE0 = gmx_and_pr(rinvsix_SSE0,wco_vdw_SSE0); - rinvsix_SSE1 = gmx_and_pr(rinvsix_SSE1,wco_vdw_SSE1); #ifndef HALF_LJ rinvsix_SSE2 = gmx_and_pr(rinvsix_SSE2,wco_vdw_SSE2); - rinvsix_SSE3 = gmx_and_pr(rinvsix_SSE3,wco_vdw_SSE3); #endif #endif FrLJ6_SSE0 = gmx_mul_pr(c6_SSE0,rinvsix_SSE0); - FrLJ6_SSE1 = gmx_mul_pr(c6_SSE1,rinvsix_SSE1); #ifndef HALF_LJ FrLJ6_SSE2 = gmx_mul_pr(c6_SSE2,rinvsix_SSE2); - FrLJ6_SSE3 = gmx_mul_pr(c6_SSE3,rinvsix_SSE3); #endif FrLJ12_SSE0 = gmx_mul_pr(c12_SSE0,gmx_mul_pr(rinvsix_SSE0,rinvsix_SSE0)); - FrLJ12_SSE1 = gmx_mul_pr(c12_SSE1,gmx_mul_pr(rinvsix_SSE1,rinvsix_SSE1)); #ifndef HALF_LJ FrLJ12_SSE2 = gmx_mul_pr(c12_SSE2,gmx_mul_pr(rinvsix_SSE2,rinvsix_SSE2)); - FrLJ12_SSE3 = gmx_mul_pr(c12_SSE3,gmx_mul_pr(rinvsix_SSE3,rinvsix_SSE3)); #endif #endif /* not LJ_COMB_LB */ #ifdef LJ_COMB_LB sir_SSE0 = gmx_mul_pr(sig_SSE0,rinv_SSE0); - sir_SSE1 = gmx_mul_pr(sig_SSE1,rinv_SSE1); #ifndef HALF_LJ sir_SSE2 = gmx_mul_pr(sig_SSE2,rinv_SSE2); - sir_SSE3 = gmx_mul_pr(sig_SSE3,rinv_SSE3); #endif sir2_SSE0 = gmx_mul_pr(sir_SSE0,sir_SSE0); - sir2_SSE1 = gmx_mul_pr(sir_SSE1,sir_SSE1); #ifndef HALF_LJ sir2_SSE2 = gmx_mul_pr(sir_SSE2,sir_SSE2); - sir2_SSE3 = gmx_mul_pr(sir_SSE3,sir_SSE3); #endif sir6_SSE0 = gmx_mul_pr(sir2_SSE0,gmx_mul_pr(sir2_SSE0,sir2_SSE0)); - sir6_SSE1 = gmx_mul_pr(sir2_SSE1,gmx_mul_pr(sir2_SSE1,sir2_SSE1)); #ifdef EXCL_FORCES sir6_SSE0 = gmx_and_pr(sir6_SSE0,int_SSE0); - sir6_SSE1 = gmx_and_pr(sir6_SSE1,int_SSE1); #endif #ifndef HALF_LJ sir6_SSE2 = gmx_mul_pr(sir2_SSE2,gmx_mul_pr(sir2_SSE2,sir2_SSE2)); - sir6_SSE3 = gmx_mul_pr(sir2_SSE3,gmx_mul_pr(sir2_SSE3,sir2_SSE3)); #ifdef EXCL_FORCES sir6_SSE2 = gmx_and_pr(sir6_SSE2,int_SSE2); - sir6_SSE3 = gmx_and_pr(sir6_SSE3,int_SSE3); #endif #endif #ifdef 
VDW_CUTOFF_CHECK sir6_SSE0 = gmx_and_pr(sir6_SSE0,wco_vdw_SSE0); - sir6_SSE1 = gmx_and_pr(sir6_SSE1,wco_vdw_SSE1); #ifndef HALF_LJ sir6_SSE2 = gmx_and_pr(sir6_SSE2,wco_vdw_SSE2); - sir6_SSE3 = gmx_and_pr(sir6_SSE3,wco_vdw_SSE3); #endif #endif FrLJ6_SSE0 = gmx_mul_pr(eps_SSE0,sir6_SSE0); - FrLJ6_SSE1 = gmx_mul_pr(eps_SSE1,sir6_SSE1); #ifndef HALF_LJ FrLJ6_SSE2 = gmx_mul_pr(eps_SSE2,sir6_SSE2); - FrLJ6_SSE3 = gmx_mul_pr(eps_SSE3,sir6_SSE3); #endif FrLJ12_SSE0 = gmx_mul_pr(FrLJ6_SSE0,sir6_SSE0); - FrLJ12_SSE1 = gmx_mul_pr(FrLJ6_SSE1,sir6_SSE1); #ifndef HALF_LJ FrLJ12_SSE2 = gmx_mul_pr(FrLJ6_SSE2,sir6_SSE2); - FrLJ12_SSE3 = gmx_mul_pr(FrLJ6_SSE3,sir6_SSE3); #endif #if defined CALC_ENERGIES /* We need C6 and C12 to calculate the LJ potential shift */ sig2_SSE0 = gmx_mul_pr(sig_SSE0,sig_SSE0); - sig2_SSE1 = gmx_mul_pr(sig_SSE1,sig_SSE1); #ifndef HALF_LJ sig2_SSE2 = gmx_mul_pr(sig_SSE2,sig_SSE2); - sig2_SSE3 = gmx_mul_pr(sig_SSE3,sig_SSE3); #endif sig6_SSE0 = gmx_mul_pr(sig2_SSE0,gmx_mul_pr(sig2_SSE0,sig2_SSE0)); - sig6_SSE1 = gmx_mul_pr(sig2_SSE1,gmx_mul_pr(sig2_SSE1,sig2_SSE1)); #ifndef HALF_LJ sig6_SSE2 = gmx_mul_pr(sig2_SSE2,gmx_mul_pr(sig2_SSE2,sig2_SSE2)); - sig6_SSE3 = gmx_mul_pr(sig2_SSE3,gmx_mul_pr(sig2_SSE3,sig2_SSE3)); #endif c6_SSE0 = gmx_mul_pr(eps_SSE0,sig6_SSE0); - c6_SSE1 = gmx_mul_pr(eps_SSE1,sig6_SSE1); #ifndef HALF_LJ c6_SSE2 = gmx_mul_pr(eps_SSE2,sig6_SSE2); - c6_SSE3 = gmx_mul_pr(eps_SSE3,sig6_SSE3); #endif c12_SSE0 = gmx_mul_pr(c6_SSE0,sig6_SSE0); - c12_SSE1 = gmx_mul_pr(c6_SSE1,sig6_SSE1); #ifndef HALF_LJ c12_SSE2 = gmx_mul_pr(c6_SSE2,sig6_SSE2); - c12_SSE3 = gmx_mul_pr(c6_SSE3,sig6_SSE3); #endif #endif #endif /* LJ_COMB_LB */ @@ -862,66 +641,52 @@ #ifdef CALC_COULOMB #ifndef ENERGY_GROUPS - vctotSSE = gmx_add_pr(vctotSSE, gmx_sum4_pr(vcoul_SSE0,vcoul_SSE1,vcoul_SSE2,vcoul_SSE3)); + vctotSSE = gmx_add_pr(vctotSSE, gmx_add_pr(vcoul_SSE0,vcoul_SSE2)); #else - add_ener_grp(vcoul_SSE0,vctp[0],egp_jj); - add_ener_grp(vcoul_SSE1,vctp[1],egp_jj); - add_ener_grp(vcoul_SSE2,vctp[2],egp_jj); - add_ener_grp(vcoul_SSE3,vctp[3],egp_jj); + add_ener_grp_halves(vcoul_SSE0,vctp[0],vctp[1],egp_jj); + add_ener_grp_halves(vcoul_SSE2,vctp[2],vctp[3],egp_jj); #endif #endif #ifdef CALC_LJ /* Calculate the LJ energies */ VLJ6_SSE0 = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE0,gmx_mul_pr(c6_SSE0,sh_invrc6_SSE))); - VLJ6_SSE1 = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE1,gmx_mul_pr(c6_SSE1,sh_invrc6_SSE))); #ifndef HALF_LJ VLJ6_SSE2 = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE2,gmx_mul_pr(c6_SSE2,sh_invrc6_SSE))); - VLJ6_SSE3 = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE3,gmx_mul_pr(c6_SSE3,sh_invrc6_SSE))); #endif VLJ12_SSE0 = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE0,gmx_mul_pr(c12_SSE0,sh_invrc12_SSE))); - VLJ12_SSE1 = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE1,gmx_mul_pr(c12_SSE1,sh_invrc12_SSE))); #ifndef HALF_LJ VLJ12_SSE2 = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE2,gmx_mul_pr(c12_SSE2,sh_invrc12_SSE))); - VLJ12_SSE3 = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE3,gmx_mul_pr(c12_SSE3,sh_invrc12_SSE))); #endif VLJ_SSE0 = gmx_sub_pr(VLJ12_SSE0,VLJ6_SSE0); - VLJ_SSE1 = gmx_sub_pr(VLJ12_SSE1,VLJ6_SSE1); #ifndef HALF_LJ VLJ_SSE2 = gmx_sub_pr(VLJ12_SSE2,VLJ6_SSE2); - VLJ_SSE3 = gmx_sub_pr(VLJ12_SSE3,VLJ6_SSE3); #endif /* The potential shift should be removed for pairs beyond cut-off */ VLJ_SSE0 = gmx_and_pr(VLJ_SSE0,wco_vdw_SSE0); - VLJ_SSE1 = gmx_and_pr(VLJ_SSE1,wco_vdw_SSE1); #ifndef HALF_LJ VLJ_SSE2 = gmx_and_pr(VLJ_SSE2,wco_vdw_SSE2); - VLJ_SSE3 = gmx_and_pr(VLJ_SSE3,wco_vdw_SSE3); #endif 
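/* [Annotation, not part of the patch] With two i-atoms sharing each register,
 * the per-energy-group accumulation in this energy section goes through
 * add_ener_grp_halves() instead of add_ener_grp(): presumably the low 128-bit
 * half is binned into the first i-atom's group buffer (vctp[0]/vvdwtp[0], ...)
 * and the high half into the second's, so energies land in the right groups. */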
#ifdef CHECK_EXCLS /* The potential shift should be removed for excluded pairs */ VLJ_SSE0 = gmx_and_pr(VLJ_SSE0,int_SSE0); - VLJ_SSE1 = gmx_and_pr(VLJ_SSE1,int_SSE1); #ifndef HALF_LJ VLJ_SSE2 = gmx_and_pr(VLJ_SSE2,int_SSE2); - VLJ_SSE3 = gmx_and_pr(VLJ_SSE3,int_SSE3); #endif #endif #ifndef ENERGY_GROUPS VvdwtotSSE = gmx_add_pr(VvdwtotSSE, #ifndef HALF_LJ - gmx_sum4_pr(VLJ_SSE0,VLJ_SSE1,VLJ_SSE2,VLJ_SSE3) + gmx_add_pr(VLJ_SSE0,VLJ_SSE2) #else - gmx_add_pr(VLJ_SSE0,VLJ_SSE1) + VLJ_SSE0 #endif ); #else - add_ener_grp(VLJ_SSE0,vvdwtp[0],egp_jj); - add_ener_grp(VLJ_SSE1,vvdwtp[1],egp_jj); + add_ener_grp_halves(VLJ_SSE0,vvdwtp[0],vvdwtp[1],egp_jj); #ifndef HALF_LJ - add_ener_grp(VLJ_SSE2,vvdwtp[2],egp_jj); - add_ener_grp(VLJ_SSE3,vvdwtp[3],egp_jj); + add_ener_grp_halves(VLJ_SSE2,vvdwtp[2],vvdwtp[3],egp_jj); #endif #endif #endif /* CALC_LJ */ @@ -935,16 +700,8 @@ ( #endif gmx_sub_pr(FrLJ12_SSE0,FrLJ6_SSE0))); - fscal_SSE1 = gmx_mul_pr(rinvsq_SSE1, -#ifdef CALC_COULOMB - gmx_add_pr(frcoul_SSE1, -#else - ( -#endif - gmx_sub_pr(FrLJ12_SSE1,FrLJ6_SSE1))); #else fscal_SSE0 = gmx_mul_pr(rinvsq_SSE0,frcoul_SSE0); - fscal_SSE1 = gmx_mul_pr(rinvsq_SSE1,frcoul_SSE1); #endif /* CALC_LJ */ #if defined CALC_LJ && !defined HALF_LJ fscal_SSE2 = gmx_mul_pr(rinvsq_SSE2, @@ -954,65 +711,41 @@ ( #endif gmx_sub_pr(FrLJ12_SSE2,FrLJ6_SSE2))); - fscal_SSE3 = gmx_mul_pr(rinvsq_SSE3, -#ifdef CALC_COULOMB - gmx_add_pr(frcoul_SSE3, -#else - ( -#endif - gmx_sub_pr(FrLJ12_SSE3,FrLJ6_SSE3))); #else /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */ fscal_SSE2 = gmx_mul_pr(rinvsq_SSE2,frcoul_SSE2); - fscal_SSE3 = gmx_mul_pr(rinvsq_SSE3,frcoul_SSE3); #endif - + /* Calculate temporary vectorial force */ tx_SSE0 = gmx_mul_pr(fscal_SSE0,dx_SSE0); - tx_SSE1 = gmx_mul_pr(fscal_SSE1,dx_SSE1); tx_SSE2 = gmx_mul_pr(fscal_SSE2,dx_SSE2); - tx_SSE3 = gmx_mul_pr(fscal_SSE3,dx_SSE3); ty_SSE0 = gmx_mul_pr(fscal_SSE0,dy_SSE0); - ty_SSE1 = gmx_mul_pr(fscal_SSE1,dy_SSE1); ty_SSE2 = gmx_mul_pr(fscal_SSE2,dy_SSE2); - ty_SSE3 = gmx_mul_pr(fscal_SSE3,dy_SSE3); tz_SSE0 = gmx_mul_pr(fscal_SSE0,dz_SSE0); - tz_SSE1 = gmx_mul_pr(fscal_SSE1,dz_SSE1); tz_SSE2 = gmx_mul_pr(fscal_SSE2,dz_SSE2); - tz_SSE3 = gmx_mul_pr(fscal_SSE3,dz_SSE3); - + /* Increment i atom force */ fix_SSE0 = gmx_add_pr(fix_SSE0,tx_SSE0); - fix_SSE1 = gmx_add_pr(fix_SSE1,tx_SSE1); fix_SSE2 = gmx_add_pr(fix_SSE2,tx_SSE2); - fix_SSE3 = gmx_add_pr(fix_SSE3,tx_SSE3); fiy_SSE0 = gmx_add_pr(fiy_SSE0,ty_SSE0); - fiy_SSE1 = gmx_add_pr(fiy_SSE1,ty_SSE1); fiy_SSE2 = gmx_add_pr(fiy_SSE2,ty_SSE2); - fiy_SSE3 = gmx_add_pr(fiy_SSE3,ty_SSE3); fiz_SSE0 = gmx_add_pr(fiz_SSE0,tz_SSE0); - fiz_SSE1 = gmx_add_pr(fiz_SSE1,tz_SSE1); fiz_SSE2 = gmx_add_pr(fiz_SSE2,tz_SSE2); - fiz_SSE3 = gmx_add_pr(fiz_SSE3,tz_SSE3); - + /* Decrement j atom force */ - gmx_store_pr(f+ajx, - gmx_sub_pr( gmx_load_pr(f+ajx), gmx_sum4_pr(tx_SSE0,tx_SSE1,tx_SSE2,tx_SSE3) )); - gmx_store_pr(f+ajy, - gmx_sub_pr( gmx_load_pr(f+ajy), gmx_sum4_pr(ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3) )); - gmx_store_pr(f+ajz, - gmx_sub_pr( gmx_load_pr(f+ajz), gmx_sum4_pr(tz_SSE0,tz_SSE1,tz_SSE2,tz_SSE3) )); + gmx_store_hpr(f+ajx, + gmx_sub_hpr( gmx_load_hpr(f+ajx), gmx_sum4_hpr(tx_SSE0,tx_SSE2) )); + gmx_store_hpr(f+ajy, + gmx_sub_hpr( gmx_load_hpr(f+ajy), gmx_sum4_hpr(ty_SSE0,ty_SSE2) )); + gmx_store_hpr(f+ajz, + gmx_sub_hpr( gmx_load_hpr(f+ajz), gmx_sum4_hpr(tz_SSE0,tz_SSE2) )); } #undef rinv_ex_SSE0 -#undef rinv_ex_SSE1 #undef rinv_ex_SSE2 -#undef rinv_ex_SSE3 #undef wco_vdw_SSE0 -#undef wco_vdw_SSE1 #undef wco_vdw_SSE2 -#undef wco_vdw_SSE3 
#undef CUTOFF_BLENDV diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_outer.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h similarity index 66% copy from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_outer.h copy to src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h index 0644f9776c..d36a99f15c 100644 --- a/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_outer.h +++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h @@ -41,13 +41,13 @@ #define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3]) #define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE -#define UNROLLJ GMX_X86_SIMD_WIDTH_HERE +#define UNROLLJ (GMX_X86_SIMD_WIDTH_HERE/2) #if defined GMX_MM128_HERE || defined GMX_DOUBLE #define STRIDE 4 #endif #if defined GMX_MM256_HERE && !defined GMX_DOUBLE -#define STRIDE 8 +#define STRIDE 4 #endif #ifdef GMX_MM128_HERE @@ -74,7 +74,7 @@ #define SIMD_MASK_ALL 0xffffffff -#include "nbnxn_kernel_x86_simd_utils.h" +#include "nbnxn_kernel_simd_utils.h" /* All functionality defines are set here, except for: * CALC_ENERGIES, ENERGY_GROUPS which are defined before. @@ -91,51 +91,49 @@ /* Assumes all LJ parameters are identical */ /* #define FIX_LJ_C */ -#define NBK_FUNC_NAME_C_LJC(b,s,c,ljc,e) b##_##s##_##c##_comb_##ljc##_##e +/* The NBK_FUNC_NAME... macros below generate the whole zoo of kernels names + * with all combinations off electrostatics (coul), LJ combination rules (ljc) + * and energy calculations (ene), depending on the defines set. + */ + +#define NBK_FUNC_NAME_C_LJC(base,coul,ljc,ene) base##_##coul##_comb_##ljc##_##ene #if defined LJ_COMB_GEOM -#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,geom,e) +#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,geom,ene) #else #if defined LJ_COMB_LB -#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,lb,e) +#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,lb,ene) #else -#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,none,e) +#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,none,ene) #endif #endif #ifdef CALC_COUL_RF -#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,rf,e) +#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,rf,ene) #endif #ifdef CALC_COUL_TAB #ifndef VDW_CUTOFF_CHECK -#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab,e) +#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,tab,ene) #else -#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab_twin,e) +#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,tab_twin,ene) #endif #endif #ifdef CALC_COUL_EWALD #ifndef VDW_CUTOFF_CHECK -#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,ewald,e) +#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,ewald,ene) #else -#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,ewald_twin,e) -#endif -#endif - -#ifdef GMX_MM128_HERE -#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd128,e) +#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,ewald_twin,ene) #endif -#ifdef GMX_MM256_HERE -#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd256,e) #endif static void #ifndef CALC_ENERGIES -NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,noener) +NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,noener) #else #ifndef ENERGY_GROUPS -NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,ener) +NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,ener) #else -NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp) +NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp) #endif #endif #undef NBK_FUNC_NAME @@ -187,13 +185,9 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp) gmx_mm_pr shY_SSE; gmx_mm_pr shZ_SSE; 
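/* [Annotation, not part of the patch] In the 2x(N+N) outer loop each
 * gmx_mm_pr register pairs two i-atoms, one per 128-bit half, so only the
 * *_SSE0 (i-atoms 0 and 1) and *_SSE2 (i-atoms 2 and 3) register sets are
 * kept; the *_SSE1 and *_SSE3 sets of the 4xN kernel are removed just below. */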
gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0; - gmx_mm_pr ix_SSE1,iy_SSE1,iz_SSE1; gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2; - gmx_mm_pr ix_SSE3,iy_SSE3,iz_SSE3; gmx_mm_pr fix_SSE0,fiy_SSE0,fiz_SSE0; - gmx_mm_pr fix_SSE1,fiy_SSE1,fiz_SSE1; gmx_mm_pr fix_SSE2,fiy_SSE2,fiz_SSE2; - gmx_mm_pr fix_SSE3,fiy_SSE3,fiz_SSE3; #if UNROLLJ >= 4 #ifndef GMX_DOUBLE __m128 fix_SSE,fiy_SSE,fiz_SSE; @@ -205,66 +199,12 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp) __m128d fix2_SSE,fiy2_SSE,fiz2_SSE; #endif -#ifndef GMX_MM256_HERE -#ifndef GMX_DOUBLE - __m128i mask0 = _mm_set_epi32( 0x0008, 0x0004, 0x0002, 0x0001 ); - __m128i mask1 = _mm_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010 ); - __m128i mask2 = _mm_set_epi32( 0x0800, 0x0400, 0x0200, 0x0100 ); - __m128i mask3 = _mm_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000 ); -#else - /* For double precision we need to set two 32bit ints for one double */ - __m128i mask0 = _mm_set_epi32( 0x0002, 0x0002, 0x0001, 0x0001 ); - __m128i mask1 = _mm_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004 ); - __m128i mask2 = _mm_set_epi32( 0x0020, 0x0020, 0x0010, 0x0010 ); - __m128i mask3 = _mm_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040 ); -#endif -#else /* AVX: use floating point masks, as there are no integer instructions */ -#ifndef GMX_DOUBLE gmx_mm_pr mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 )); - gmx_mm_pr mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 )); -#else - /* There is no 256-bit int to double conversion, so we use float here */ - __m256 mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004, 0x0002, 0x0002, 0x0001, 0x0001 )); - __m256 mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040, 0x0020, 0x0020, 0x0010, 0x0010 )); - __m256 mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0800, 0x0800, 0x0400, 0x0400, 0x0200, 0x0200, 0x0100, 0x0100 )); - __m256 mask3 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x8000, 0x4000, 0x4000, 0x2000, 0x2000, 0x1000, 0x1000 )); -#endif -#endif + gmx_mm_pr mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 )); -#ifndef GMX_MM256_HERE -#ifndef GMX_DOUBLE - __m128 diag_SSE0 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 )); - __m128 diag_SSE1 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 )); - __m128 diag_SSE2 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000 )); - __m128 diag_SSE3 = gmx_mm_castsi128_pr( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); -#else - __m128d diag0_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 )); - __m128d diag0_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); - __m128d diag0_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); - __m128d diag0_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); - __m128d diag1_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff )); - __m128d diag1_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff )); - __m128d diag1_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 )); - __m128d diag1_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 
0x00000000, 0x00000000 )); -#endif -#else /* GMX_MM256_HERE */ -#ifndef GMX_DOUBLE - gmx_mm_pr diag0_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 )); - gmx_mm_pr diag0_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 )); - gmx_mm_pr diag0_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 )); - gmx_mm_pr diag0_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); - gmx_mm_pr diag1_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); - gmx_mm_pr diag1_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); - gmx_mm_pr diag1_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); - gmx_mm_pr diag1_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); -#else - gmx_mm_pr diag_SSE0 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 )); - gmx_mm_pr diag_SSE1 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); - gmx_mm_pr diag_SSE2 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); - gmx_mm_pr diag_SSE3 = _mm256_castsi256_pd( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); -#endif -#endif + gmx_mm_pr diag_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 )); + gmx_mm_pr diag_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 )); #ifndef GMX_MM256_HERE __m128i zeroi_SSE = _mm_setzero_si128(); @@ -275,9 +215,7 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp) gmx_mm_pr one_SSE=gmx_set1_pr(1.0); gmx_mm_pr iq_SSE0=gmx_setzero_pr(); - gmx_mm_pr iq_SSE1=gmx_setzero_pr(); gmx_mm_pr iq_SSE2=gmx_setzero_pr(); - gmx_mm_pr iq_SSE3=gmx_setzero_pr(); gmx_mm_pr mrc_3_SSE; #ifdef CALC_ENERGIES gmx_mm_pr hrc_3_SSE,moh_rc_SSE; @@ -292,9 +230,7 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp) #endif #ifdef GMX_MM256_HERE int ti0_array[2*UNROLLJ-1],*ti0; - int ti1_array[2*UNROLLJ-1],*ti1; int ti2_array[2*UNROLLJ-1],*ti2; - int ti3_array[2*UNROLLJ-1],*ti3; #endif #ifdef CALC_ENERGIES gmx_mm_pr mhalfsp_SSE; @@ -313,17 +249,13 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp) const real *ljc; gmx_mm_pr hsig_i_SSE0,seps_i_SSE0; - gmx_mm_pr hsig_i_SSE1,seps_i_SSE1; gmx_mm_pr hsig_i_SSE2,seps_i_SSE2; - gmx_mm_pr hsig_i_SSE3,seps_i_SSE3; #else #ifdef FIX_LJ_C real pvdw_array[2*UNROLLI*UNROLLJ+3]; real *pvdw_c6,*pvdw_c12; gmx_mm_pr c6_SSE0,c12_SSE0; - gmx_mm_pr c6_SSE1,c12_SSE1; gmx_mm_pr c6_SSE2,c12_SSE2; - gmx_mm_pr c6_SSE3,c12_SSE3; #endif #ifdef LJ_COMB_GEOM @@ -380,9 +312,7 @@ 
NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp) #ifdef GMX_MM256_HERE /* Generate aligned table pointers */ ti0 = (int *)(((size_t)(ti0_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1)))); - ti1 = (int *)(((size_t)(ti1_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1)))); ti2 = (int *)(((size_t)(ti2_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1)))); - ti3 = (int *)(((size_t)(ti3_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1)))); #endif invtsp_SSE = gmx_set1_pr(ic->tabq_scale); @@ -571,55 +501,45 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp) } #endif - /* Load i atom data */ +#define gmx_load2_hpr(x) _mm256_insertf128_ps(gmx_load1_pr(x),gmx_load1_hpr(x+1),1) + + /* Load i atom data */ sciy = scix + STRIDE; sciz = sciy + STRIDE; - ix_SSE0 = gmx_add_pr(gmx_load1_pr(x+scix) ,shX_SSE); - ix_SSE1 = gmx_add_pr(gmx_load1_pr(x+scix+1),shX_SSE); - ix_SSE2 = gmx_add_pr(gmx_load1_pr(x+scix+2),shX_SSE); - ix_SSE3 = gmx_add_pr(gmx_load1_pr(x+scix+3),shX_SSE); - iy_SSE0 = gmx_add_pr(gmx_load1_pr(x+sciy) ,shY_SSE); - iy_SSE1 = gmx_add_pr(gmx_load1_pr(x+sciy+1),shY_SSE); - iy_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciy+2),shY_SSE); - iy_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciy+3),shY_SSE); - iz_SSE0 = gmx_add_pr(gmx_load1_pr(x+sciz) ,shZ_SSE); - iz_SSE1 = gmx_add_pr(gmx_load1_pr(x+sciz+1),shZ_SSE); - iz_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciz+2),shZ_SSE); - iz_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciz+3),shZ_SSE); + ix_SSE0 = gmx_add_pr(gmx_load2_hpr(x+scix) ,shX_SSE); + ix_SSE2 = gmx_add_pr(gmx_load2_hpr(x+scix+2),shX_SSE); + iy_SSE0 = gmx_add_pr(gmx_load2_hpr(x+sciy) ,shY_SSE); + iy_SSE2 = gmx_add_pr(gmx_load2_hpr(x+sciy+2),shY_SSE); + iz_SSE0 = gmx_add_pr(gmx_load2_hpr(x+sciz) ,shZ_SSE); + iz_SSE2 = gmx_add_pr(gmx_load2_hpr(x+sciz+2),shZ_SSE); /* With half_LJ we currently always calculate Coulomb interactions */ if (do_coul || half_LJ) { - iq_SSE0 = gmx_set1_pr(facel*q[sci]); - iq_SSE1 = gmx_set1_pr(facel*q[sci+1]); - iq_SSE2 = gmx_set1_pr(facel*q[sci+2]); - iq_SSE3 = gmx_set1_pr(facel*q[sci+3]); + gmx_mm_pr facel_SSE; + + facel_SSE = gmx_set1_pr(facel); + + iq_SSE0 = gmx_mul_pr(facel_SSE,gmx_load2_hpr(q+sci)); + iq_SSE2 = gmx_mul_pr(facel_SSE,gmx_load2_hpr(q+sci+2)); } #ifdef LJ_COMB_LB - hsig_i_SSE0 = gmx_load1_pr(ljc+sci2+0); - hsig_i_SSE1 = gmx_load1_pr(ljc+sci2+1); - hsig_i_SSE2 = gmx_load1_pr(ljc+sci2+2); - hsig_i_SSE3 = gmx_load1_pr(ljc+sci2+3); - seps_i_SSE0 = gmx_load1_pr(ljc+sci2+STRIDE+0); - seps_i_SSE1 = gmx_load1_pr(ljc+sci2+STRIDE+1); - seps_i_SSE2 = gmx_load1_pr(ljc+sci2+STRIDE+2); - seps_i_SSE3 = gmx_load1_pr(ljc+sci2+STRIDE+3); + hsig_i_SSE0 = gmx_load2_hpr(ljc+sci2+0); + hsig_i_SSE2 = gmx_load2_hpr(ljc+sci2+2); + seps_i_SSE0 = gmx_load2_hpr(ljc+sci2+STRIDE+0); + seps_i_SSE2 = gmx_load2_hpr(ljc+sci2+STRIDE+2); #else #ifdef LJ_COMB_GEOM - c6s_SSE0 = gmx_load1_pr(ljc+sci2+0); - c6s_SSE1 = gmx_load1_pr(ljc+sci2+1); + c6s_SSE0 = gmx_load2_hpr(ljc+sci2+0); if (!half_LJ) { - c6s_SSE2 = gmx_load1_pr(ljc+sci2+2); - c6s_SSE3 = gmx_load1_pr(ljc+sci2+3); + c6s_SSE2 = gmx_load2_hpr(ljc+sci2+2); } - c12s_SSE0 = gmx_load1_pr(ljc+sci2+STRIDE+0); - c12s_SSE1 = gmx_load1_pr(ljc+sci2+STRIDE+1); + c12s_SSE0 = gmx_load2_hpr(ljc+sci2+STRIDE+0); if (!half_LJ) { - c12s_SSE2 = gmx_load1_pr(ljc+sci2+STRIDE+2); - c12s_SSE3 = gmx_load1_pr(ljc+sci2+STRIDE+3); + c12s_SSE2 = gmx_load2_hpr(ljc+sci2+STRIDE+2); } #else nbfp0 = nbfp_ptr + type[sci ]*nbat->ntype*nbfp_stride; @@ -638,17 +558,11 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp) /* Clear i atom forces */ fix_SSE0 = gmx_setzero_pr(); 
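The hunk above replaces four single-atom gmx_load1_pr broadcasts per coordinate with two gmx_load2_hpr loads, so that i-atoms 0 and 1 share one 256-bit register and i-atoms 2 and 3 share another, each atom's value broadcast across one 128-bit half. The following standalone sketch, which is not part of the patch, shows that lane layout with plain AVX intrinsics, assuming that gmx_load1_pr and gmx_load1_hpr broadcast a single real as their names suggest (the helper name load2_broadcast is invented for illustration); compile with -mavx.

#include <immintrin.h>
#include <stdio.h>

/* Broadcast x[0] across the low 128-bit lane and x[1] across the high lane,
 * mirroring the gmx_load2_hpr pattern used for the 2x(N+N) i-atom data. */
static __m256 load2_broadcast(const float *x)
{
    __m128 lo = _mm_set1_ps(x[0]);
    __m128 hi = _mm_set1_ps(x[1]);

    return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
}

int main(void)
{
    float xi[2] = { 1.0f, 2.0f };
    float out[8];
    int   i;

    _mm256_storeu_ps(out, load2_broadcast(xi));
    for (i = 0; i < 8; i++)
    {
        printf("%g ", out[i]);   /* expected: 1 1 1 1 2 2 2 2 */
    }
    printf("\n");

    return 0;
}

With this layout a single 256-bit operation processes the interactions of two i-atoms with the same four j-atoms, which is why only the _SSE0 and _SSE2 variables survive in the 2xnn kernel while the _SSE1 and _SSE3 ones are removed.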
- fix_SSE1 = gmx_setzero_pr(); fix_SSE2 = gmx_setzero_pr(); - fix_SSE3 = gmx_setzero_pr(); fiy_SSE0 = gmx_setzero_pr(); - fiy_SSE1 = gmx_setzero_pr(); fiy_SSE2 = gmx_setzero_pr(); - fiy_SSE3 = gmx_setzero_pr(); fiz_SSE0 = gmx_setzero_pr(); - fiz_SSE1 = gmx_setzero_pr(); fiz_SSE2 = gmx_setzero_pr(); - fiz_SSE3 = gmx_setzero_pr(); cjind = cjind0; @@ -661,13 +575,13 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp) #define CHECK_EXCLS while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL) { -#include "nbnxn_kernel_x86_simd_inner.h" +#include "nbnxn_kernel_simd_2xnn_inner.h" cjind++; } #undef CHECK_EXCLS for(; (cjindcj[cjind].excl != SIMD_MASK_ALL) { -#include "nbnxn_kernel_x86_simd_inner.h" +#include "nbnxn_kernel_simd_2xnn_inner.h" cjind++; } #undef CHECK_EXCLS for(; (cjindcj[cjind].excl != SIMD_MASK_ALL) { -#include "nbnxn_kernel_x86_simd_inner.h" +#include "nbnxn_kernel_simd_2xnn_inner.h" cjind++; } #undef CHECK_EXCLS for(; (cjind= rvdw */ #define VDW_CUTOFF_CHECK -#include "nbnxn_kernel_x86_simd_includes.h" +#include "nbnxn_kernel_simd_4xn_includes.h" #undef VDW_CUTOFF_CHECK #undef CALC_COUL_TAB @@ -81,11 +89,11 @@ #define CALC_COUL_EWALD /* Single cut-off: rcoulomb = rvdw */ -#include "nbnxn_kernel_x86_simd_includes.h" +#include "nbnxn_kernel_simd_4xn_includes.h" /* Twin cut-off: rcoulomb >= rvdw */ #define VDW_CUTOFF_CHECK -#include "nbnxn_kernel_x86_simd_includes.h" +#include "nbnxn_kernel_simd_4xn_includes.h" #undef VDW_CUTOFF_CHECK #undef CALC_COUL_EWALD @@ -109,7 +117,7 @@ typedef void (*p_nbk_func_noener)(const nbnxn_pairlist_t *nbl, enum { coultRF, coultTAB, coultTAB_TWIN, coultEWALD, coultEWALD_TWIN, coultNR }; -#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_ener +#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_4xn_##elec##_comb_##ljcomb##_ener static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] = { { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) }, { NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) }, @@ -118,7 +126,7 @@ static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] = { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } }; #undef NBK_FN -#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_energrp +#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_4xn_##elec##_comb_##ljcomb##_energrp static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] = { { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) }, { NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) }, @@ -127,7 +135,7 @@ static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] = { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } }; #undef NBK_FN -#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_noener +#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_4xn_##elec##_comb_##ljcomb##_noener static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] = { { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) }, { NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) }, @@ -141,15 +149,14 @@ static void reduce_group_energies(int ng,int ng_2log, const real *VSvdw,const real *VSc, real *Vvdw,real *Vc) { + const int simd_width = GMX_X86_SIMD_WIDTH_HERE; + const int unrollj_half = GMX_X86_SIMD_WIDTH_HERE/2; int ng_p2,i,j,j0,j1,c,s; -#define SIMD_WIDTH (GMX_X86_SIMD_WIDTH_HERE) -#define SIMD_WIDTH_HALF (GMX_X86_SIMD_WIDTH_HERE/2) - ng_p2 = (1<cj[cjind].excl != SIMD_MASK_ALL) { -#include "nbnxn_kernel_x86_simd_inner.h" +#include "nbnxn_kernel_simd_4xn_inner.h" cjind++; } #undef CHECK_EXCLS for(; 
(cjindcj[cjind].excl != SIMD_MASK_ALL) { -#include "nbnxn_kernel_x86_simd_inner.h" +#include "nbnxn_kernel_simd_4xn_inner.h" cjind++; } #undef CHECK_EXCLS for(; (cjindcj[cjind].excl != SIMD_MASK_ALL) { -#include "nbnxn_kernel_x86_simd_inner.h" +#include "nbnxn_kernel_simd_4xn_inner.h" cjind++; } #undef CHECK_EXCLS for(; (cjind + #if defined NBNXN_SEARCH_SSE_SINGLE && GPU_NSUBCELL == 8 #define NBNXN_8BB_SSE #endif @@ -94,6 +94,9 @@ #define STRIDE_8BB 4 #define STRIDE_8BB_2LOG 2 +#endif /* NBNXN_SEARCH_SSE */ + +#ifdef GMX_NBNXN_SIMD /* The functions below are macros as they are performance sensitive */ @@ -119,27 +122,37 @@ #define X_IND_CJ_J8(cj) ((cj)*STRIDE_P8) /* The j-cluster size is matched to the SIMD width */ -#ifndef GMX_DOUBLE -/* 128 bits can hold 4 floats */ -#define CI_TO_CJ_S128(ci) CI_TO_CJ_J4(ci) -#define X_IND_CI_S128(ci) X_IND_CI_J4(ci) -#define X_IND_CJ_S128(cj) X_IND_CJ_J4(cj) -/* 256 bits can hold 8 floats */ -#define CI_TO_CJ_S256(ci) CI_TO_CJ_J8(ci) -#define X_IND_CI_S256(ci) X_IND_CI_J8(ci) -#define X_IND_CJ_S256(cj) X_IND_CJ_J8(cj) +#if GMX_NBNXN_SIMD_BITWIDTH == 128 +#ifdef GMX_DOUBLE +#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J2(ci) +#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J2(ci) +#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J2(cj) +#else +#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J4(ci) +#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J4(ci) +#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J4(cj) +#endif +#else +#if GMX_NBNXN_SIMD_BITWIDTH == 256 +#ifdef GMX_DOUBLE +#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J4(ci) +#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J4(ci) +#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J4(cj) +#else +#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J8(ci) +#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J8(ci) +#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J8(cj) +/* Half SIMD with j-cluster size */ +#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J4(ci) +#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J4(ci) +#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J4(cj) +#endif #else -/* 128 bits can hold 2 doubles */ -#define CI_TO_CJ_S128(ci) CI_TO_CJ_J2(ci) -#define X_IND_CI_S128(ci) X_IND_CI_J2(ci) -#define X_IND_CJ_S128(cj) X_IND_CJ_J2(cj) -/* 256 bits can hold 4 doubles */ -#define CI_TO_CJ_S256(ci) CI_TO_CJ_J4(ci) -#define X_IND_CI_S256(ci) X_IND_CI_J4(ci) -#define X_IND_CJ_S256(cj) X_IND_CJ_J4(cj) +#error "unsupported GMX_NBNXN_SIMD_WIDTH" +#endif #endif -#endif /* NBNXN_SEARCH_SSE */ +#endif /* GMX_NBNXN_SIMD */ /* Interaction masks for 4xN atom interactions. @@ -253,12 +266,12 @@ static int nbnxn_kernel_to_ci_size(int nb_kernel_type) { switch (nb_kernel_type) { - case nbk4x4_PlainC: - case nbk4xN_X86_SIMD128: - case nbk4xN_X86_SIMD256: + case nbnxnk4x4_PlainC: + case nbnxnk4xN_SIMD_4xN: + case nbnxnk4xN_SIMD_2xNN: return NBNXN_CPU_CLUSTER_I_SIZE; - case nbk8x8x8_CUDA: - case nbk8x8x8_PlainC: + case nbnxnk8x8x8_CUDA: + case nbnxnk8x8x8_PlainC: /* The cluster size for super/sub lists is only set here. * Any value should work for the pair-search and atomdata code. * The kernels, of course, might require a particular value. 
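The j-cluster size follows directly from the SIMD register width and the chosen kernel layout: the 4xN kernels use one full register of reals per j-cluster, while the 2x(N+N) kernels pack two i-atoms per register and therefore use half that j-cluster size, as computed by nbnxn_kernel_to_cj_size() in the next hunk. Below is a small illustrative sketch of that arithmetic, assuming the width-in-reals convention GMX_NBNXN_SIMD_BITWIDTH/(8*sizeof(real)); the function names cj_size_4xn and cj_size_2xnn are invented for illustration.

#include <stdio.h>

/* Number of reals per SIMD register for a given register width in bits */
static int simd_width_reals(int simd_bits, int sizeof_real)
{
    return simd_bits / (8 * sizeof_real);
}

/* 4xN: one j-cluster per full register */
static int cj_size_4xn(int simd_bits, int sizeof_real)
{
    return simd_width_reals(simd_bits, sizeof_real);
}

/* 2x(N+N): two i-atoms per register, so half the j-cluster size */
static int cj_size_2xnn(int simd_bits, int sizeof_real)
{
    return simd_width_reals(simd_bits, sizeof_real) / 2;
}

int main(void)
{
    /* AVX-256, single precision: 4xN gives a 4x8 kernel,
     * 2x(N+N) gives 2x(4+4), i.e. an effective 4x4 setup */
    printf("AVX-256 single: 4xN cj=%d, 2xNN cj=%d\n",
           cj_size_4xn(256, 4), cj_size_2xnn(256, 4));

    /* 128-bit SIMD, double precision: 4xN gives 4x2 */
    printf("128-bit double: 4xN cj=%d\n", cj_size_4xn(128, 8));

    return 0;
}

These values match the CI_TO_CJ_SIMD_4XN/_2XNN macro choices above: J2 for 128-bit double, J4 for 128-bit single and 256-bit double, J8 for 256-bit single, and J4 for the half-width 2xNN case.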
@@ -273,24 +286,33 @@ static int nbnxn_kernel_to_ci_size(int nb_kernel_type) int nbnxn_kernel_to_cj_size(int nb_kernel_type) { + int nbnxn_simd_width=0; + int cj_size=0; + +#ifdef GMX_NBNXN_SIMD + nbnxn_simd_width = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8); +#endif + switch (nb_kernel_type) { - case nbk4x4_PlainC: - return NBNXN_CPU_CLUSTER_I_SIZE; - case nbk4xN_X86_SIMD128: - /* Number of reals that fit in SIMD (128 bits = 16 bytes) */ - return 16/sizeof(real); - case nbk4xN_X86_SIMD256: - /* Number of reals that fit in SIMD (256 bits = 32 bytes) */ - return 32/sizeof(real); - case nbk8x8x8_CUDA: - case nbk8x8x8_PlainC: - return nbnxn_kernel_to_ci_size(nb_kernel_type); + case nbnxnk4x4_PlainC: + cj_size = NBNXN_CPU_CLUSTER_I_SIZE; + break; + case nbnxnk4xN_SIMD_4xN: + cj_size = nbnxn_simd_width; + break; + case nbnxnk4xN_SIMD_2xNN: + cj_size = nbnxn_simd_width/2; + break; + case nbnxnk8x8x8_CUDA: + case nbnxnk8x8x8_PlainC: + cj_size = nbnxn_kernel_to_ci_size(nb_kernel_type); + break; default: gmx_incons("unknown kernel type"); } - return 0; + return cj_size; } static int ci_to_cj(int na_cj_2log,int ci) @@ -307,20 +329,20 @@ static int ci_to_cj(int na_cj_2log,int ci) gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type) { - if (nb_kernel_type == nbkNotSet) + if (nb_kernel_type == nbnxnkNotSet) { gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list."); } switch (nb_kernel_type) { - case nbk8x8x8_CUDA: - case nbk8x8x8_PlainC: + case nbnxnk8x8x8_CUDA: + case nbnxnk8x8x8_PlainC: return FALSE; - case nbk4x4_PlainC: - case nbk4xN_X86_SIMD128: - case nbk4xN_X86_SIMD256: + case nbnxnk4x4_PlainC: + case nbnxnk4xN_SIMD_4xN: + case nbnxnk4xN_SIMD_2xNN: return TRUE; default: @@ -2360,18 +2382,16 @@ static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl, snew(nbl->work,1); #ifdef NBNXN_BBXXXX - snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,16); + snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,32); #else - snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,16); -#endif - snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,16); -#ifdef NBNXN_SEARCH_SSE - snew_aligned(nbl->work->x_ci_x86_simd128,1,16); -#ifdef GMX_X86_AVX_256 - snew_aligned(nbl->work->x_ci_x86_simd256,1,32); + snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,32); #endif + snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,32); +#ifdef GMX_NBNXN_SIMD + snew_aligned(nbl->work->x_ci_simd_4xn,1,32); + snew_aligned(nbl->work->x_ci_simd_2xnn,1,32); #endif - snew_aligned(nbl->work->d2,GPU_NSUBCELL,16); + snew_aligned(nbl->work->d2,GPU_NSUBCELL,32); } void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list, @@ -2626,7 +2646,6 @@ static unsigned int get_imask(gmx_bool rdiag,int ci,int cj) return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL); } -#ifdef NBNXN_SEARCH_SSE /* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists */ static unsigned int get_imask_x86_simd128(gmx_bool rdiag,int ci,int cj) { @@ -2639,7 +2658,6 @@ static unsigned int get_imask_x86_simd128(gmx_bool rdiag,int ci,int cj) #endif } -#ifdef GMX_X86_AVX_256 /* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists */ static unsigned int get_imask_x86_simd256(gmx_bool rdiag,int ci,int cj) { @@ -2647,12 +2665,23 @@ static unsigned int get_imask_x86_simd256(gmx_bool rdiag,int ci,int cj) return (rdiag && ci == cj*2 ? NBNXN_INT_MASK_DIAG_J8_0 : (rdiag && ci == cj*2+1 ? 
NBNXN_INT_MASK_DIAG_J8_1 : NBNXN_INT_MASK_ALL)); -#else /* cj-size = 2 */ +#else /* cj-size = 4 */ return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL); #endif } + +#ifdef GMX_NBNXN_SIMD +#if GMX_NBNXN_SIMD_BITWIDTH == 128 +#define get_imask_x86_simd_4xn get_imask_x86_simd128 +#else +#if GMX_NBNXN_SIMD_BITWIDTH == 256 +#define get_imask_x86_simd_4xn get_imask_x86_simd256 +#define get_imask_x86_simd_2xnn get_imask_x86_simd128 +#else +#error "unsupported GMX_NBNXN_SIMD_BITWIDTH" +#endif +#endif #endif -#endif /* NBNXN_SEARCH_SSE */ /* Plain C code for making a pair list of cell ci vs cell cjf-cjl. * Checks bounding box distances and possibly atom pair distances. @@ -2773,23 +2802,11 @@ static void make_cluster_list_simple(const nbnxn_grid_t *gridj, } } -#ifdef NBNXN_SEARCH_SSE -/* Include make_cluster_list_x86_simd128/256 */ -#define GMX_MM128_HERE -#include "gmx_x86_simd_macros.h" -#define STRIDE_S PACK_X4 -#include "nbnxn_search_x86_simd.h" -#undef STRIDE_S -#undef GMX_MM128_HERE -#ifdef GMX_X86_AVX_256 -/* Include make_cluster_list_x86_simd128/256 */ -#define GMX_MM256_HERE -#include "gmx_x86_simd_macros.h" -#define STRIDE_S GMX_X86_SIMD_WIDTH_HERE -#include "nbnxn_search_x86_simd.h" -#undef STRIDE_S -#undef GMX_MM256_HERE +#ifdef GMX_NBNXN_SIMD_4XN +#include "nbnxn_search_simd_4xn.h" #endif +#ifdef GMX_NBNXN_SIMD_2XNN +#include "nbnxn_search_simd_2xnn.h" #endif /* Plain C or SSE code for making a pair list of super-cell sci vs scj. @@ -4495,7 +4512,7 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs, switch (nb_kernel_type) { - case nbk4x4_PlainC: + case nbnxnk4x4_PlainC: check_subcell_list_space_simple(nbl,cl-cf+1); make_cluster_list_simple(gridj, @@ -4505,30 +4522,30 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs, rl2,rbb2, &ndistc); break; -#ifdef NBNXN_SEARCH_SSE - case nbk4xN_X86_SIMD128: +#ifdef GMX_NBNXN_SIMD_4XN + case nbnxnk4xN_SIMD_4xN: check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2); - make_cluster_list_x86_simd128(gridj, - nbl,ci,cf,cl, - (gridi == gridj && shift == CENTRAL), - nbat->x, - rl2,rbb2, - &ndistc); + make_cluster_list_simd_4xn(gridj, + nbl,ci,cf,cl, + (gridi == gridj && shift == CENTRAL), + nbat->x, + rl2,rbb2, + &ndistc); break; -#ifdef GMX_X86_AVX_256 - case nbk4xN_X86_SIMD256: +#endif +#ifdef GMX_NBNXN_SIMD_2XNN + case nbnxnk4xN_SIMD_2xNN: check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2); - make_cluster_list_x86_simd256(gridj, - nbl,ci,cf,cl, - (gridi == gridj && shift == CENTRAL), - nbat->x, - rl2,rbb2, - &ndistc); + make_cluster_list_simd_2xnn(gridj, + nbl,ci,cf,cl, + (gridi == gridj && shift == CENTRAL), + nbat->x, + rl2,rbb2, + &ndistc); break; #endif -#endif - case nbk8x8x8_PlainC: - case nbk8x8x8_CUDA: + case nbnxnk8x8x8_PlainC: + case nbnxnk8x8x8_CUDA: check_subcell_list_space_supersub(nbl,cl-cf+1); for(cj=cf; cj<=cl; cj++) { @@ -4728,15 +4745,15 @@ void nbnxn_make_pairlist(const nbnxn_search_t nbs, { switch (nb_kernel_type) { -#ifdef NBNXN_SEARCH_SSE - case nbk4xN_X86_SIMD128: - nbs->icell_set_x = icell_set_x_x86_simd128; - break; -#ifdef GMX_X86_AVX_256 - case nbk4xN_X86_SIMD256: - nbs->icell_set_x = icell_set_x_x86_simd256; +#ifdef GMX_NBNXN_SIMD_4XN + case nbnxnk4xN_SIMD_4xN: + nbs->icell_set_x = icell_set_x_simd_4xn; break; #endif +#ifdef GMX_NBNXN_SIMD_2XNN + case nbnxnk4xN_SIMD_2xNN: + nbs->icell_set_x = icell_set_x_simd_2xnn; + break; #endif default: nbs->icell_set_x = icell_set_x_simple; diff --git a/src/mdlib/nbnxn_search_simd_2xnn.h 
b/src/mdlib/nbnxn_search_simd_2xnn.h new file mode 100644 index 0000000000..baf6f7bbb9 --- /dev/null +++ b/src/mdlib/nbnxn_search_simd_2xnn.h @@ -0,0 +1,262 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS development team, + * check out http://www.gromacs.org for more information. + * Copyright (c) 2012, by the GROMACS development team, led by + * David van der Spoel, Berk Hess, Erik Lindahl, and including many + * others, as listed in the AUTHORS file in the top-level source + * directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. 
+ */ + +#if GMX_NBNXN_SIMD_BITWIDTH == 128 +#define GMX_MM128_HERE +#else +#if GMX_NBNXN_SIMD_BITWIDTH == 256 +#define GMX_MM256_HERE +#else +#error "unsupported GMX_NBNXN_SIMD_BITWIDTH" +#endif +#endif +#include "gmx_x86_simd_macros.h" + +#if GMX_X86_SIMD_WIDTH_HERE >= 2*NBNXN_CPU_CLUSTER_I_SIZE +#define STRIDE_S (GMX_X86_SIMD_WIDTH_HERE/2) +#else +#define STRIDE_S NBNXN_CPU_CLUSTER_I_SIZE +#endif + +static gmx_inline gmx_mm_pr gmx_load_hpr_hilo_pr(const real *a) +{ + gmx_mm_hpr a_SSE; + + a_SSE = _mm_load_ps(a); + + return gmx_2hpr_to_pr(a_SSE,a_SSE); +} + +static gmx_inline gmx_mm_pr gmx_set_2real_shift_pr(const real *a,real shift) +{ + gmx_mm_hpr a0,a1; + + a0 = _mm_set1_ps(a[0] + shift); + a1 = _mm_set1_ps(a[1] + shift); + + return gmx_2hpr_to_pr(a1,a0); +} + +/* Copies PBC shifted i-cell packed atom coordinates to working array */ +static gmx_inline void +icell_set_x_simd_2xnn(int ci, + real shx,real shy,real shz, + int na_c, + int stride,const real *x, + nbnxn_list_work_t *work) +{ + int ia; + nbnxn_x_ci_simd_2xnn_t *x_ci; + + x_ci = work->x_ci_simd_2xnn; + + ia = X_IND_CI_SIMD_2XNN(ci); + + x_ci->ix_SSE0 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 0, shx); + x_ci->iy_SSE0 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 0, shy); + x_ci->iz_SSE0 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 0, shz); + x_ci->ix_SSE2 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 2, shx); + x_ci->iy_SSE2 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 2, shy); + x_ci->iz_SSE2 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 2, shz); +} + +/* SIMD code for making a pair list of cell ci vs cell cjf-cjl + * for coordinates in packed format. + * Checks bouding box distances and possibly atom pair distances. + * This is an accelerated version of make_cluster_list_simple. + */ +static gmx_inline void +make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj, + nbnxn_pairlist_t *nbl, + int ci,int cjf,int cjl, + gmx_bool remove_sub_diag, + const real *x_j, + real rl2,float rbb2, + int *ndistc) +{ + const nbnxn_x_ci_simd_2xnn_t *work; + const float *bb_ci; + + gmx_mm_pr jx_SSE,jy_SSE,jz_SSE; + + gmx_mm_pr dx_SSE0,dy_SSE0,dz_SSE0; + gmx_mm_pr dx_SSE2,dy_SSE2,dz_SSE2; + + gmx_mm_pr rsq_SSE0; + gmx_mm_pr rsq_SSE2; + + gmx_mm_pr wco_SSE0; + gmx_mm_pr wco_SSE2; + gmx_mm_pr wco_any_SSE; + + gmx_mm_pr rc2_SSE; + + gmx_bool InRange; + float d2; + int xind_f,xind_l,cj; + + cjf = CI_TO_CJ_SIMD_2XNN(cjf); + cjl = CI_TO_CJ_SIMD_2XNN(cjl+1) - 1; + + work = nbl->work->x_ci_simd_2xnn; + + bb_ci = nbl->work->bb_ci; + + rc2_SSE = gmx_set1_pr(rl2); + + InRange = FALSE; + while (!InRange && cjf <= cjl) + { + d2 = subc_bb_dist2_sse(4,0,bb_ci,cjf,gridj->bbj); + *ndistc += 2; + + /* Check if the distance is within the distance where + * we use only the bounding box distance rbb, + * or within the cut-off and there is at least one atom pair + * within the cut-off. 
+ */ + if (d2 < rbb2) + { + InRange = TRUE; + } + else if (d2 < rl2) + { + xind_f = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjf); + + jx_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+0*STRIDE_S); + jy_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+1*STRIDE_S); + jz_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+2*STRIDE_S); + + /* Calculate distance */ + dx_SSE0 = gmx_sub_pr(work->ix_SSE0,jx_SSE); + dy_SSE0 = gmx_sub_pr(work->iy_SSE0,jy_SSE); + dz_SSE0 = gmx_sub_pr(work->iz_SSE0,jz_SSE); + dx_SSE2 = gmx_sub_pr(work->ix_SSE2,jx_SSE); + dy_SSE2 = gmx_sub_pr(work->iy_SSE2,jy_SSE); + dz_SSE2 = gmx_sub_pr(work->iz_SSE2,jz_SSE); + + /* rsq = dx*dx+dy*dy+dz*dz */ + rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0); + rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2,dy_SSE2,dz_SSE2); + + wco_SSE0 = gmx_cmplt_pr(rsq_SSE0,rc2_SSE); + wco_SSE2 = gmx_cmplt_pr(rsq_SSE2,rc2_SSE); + + wco_any_SSE = gmx_or_pr(wco_SSE0,wco_SSE2); + + InRange = gmx_movemask_pr(wco_any_SSE); + + *ndistc += 2*GMX_X86_SIMD_WIDTH_HERE; + } + if (!InRange) + { + cjf++; + } + } + if (!InRange) + { + return; + } + + InRange = FALSE; + while (!InRange && cjl > cjf) + { + d2 = subc_bb_dist2_sse(4,0,bb_ci,cjl,gridj->bbj); + *ndistc += 2; + + /* Check if the distance is within the distance where + * we use only the bounding box distance rbb, + * or within the cut-off and there is at least one atom pair + * within the cut-off. + */ + if (d2 < rbb2) + { + InRange = TRUE; + } + else if (d2 < rl2) + { + xind_l = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjl); + + jx_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+0*STRIDE_S); + jy_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+1*STRIDE_S); + jz_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+2*STRIDE_S); + + /* Calculate distance */ + dx_SSE0 = gmx_sub_pr(work->ix_SSE0,jx_SSE); + dy_SSE0 = gmx_sub_pr(work->iy_SSE0,jy_SSE); + dz_SSE0 = gmx_sub_pr(work->iz_SSE0,jz_SSE); + dx_SSE2 = gmx_sub_pr(work->ix_SSE2,jx_SSE); + dy_SSE2 = gmx_sub_pr(work->iy_SSE2,jy_SSE); + dz_SSE2 = gmx_sub_pr(work->iz_SSE2,jz_SSE); + + /* rsq = dx*dx+dy*dy+dz*dz */ + rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0); + rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2,dy_SSE2,dz_SSE2); + + wco_SSE0 = gmx_cmplt_pr(rsq_SSE0,rc2_SSE); + wco_SSE2 = gmx_cmplt_pr(rsq_SSE2,rc2_SSE); + + wco_any_SSE = gmx_or_pr(wco_SSE0,wco_SSE2); + + InRange = gmx_movemask_pr(wco_any_SSE); + + *ndistc += 2*GMX_X86_SIMD_WIDTH_HERE; + } + if (!InRange) + { + cjl--; + } + } + + if (cjf <= cjl) + { + for(cj=cjf; cj<=cjl; cj++) + { + /* Store cj and the interaction mask */ + nbl->cj[nbl->ncj].cj = CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cj; + nbl->cj[nbl->ncj].excl = get_imask_x86_simd_2xnn(remove_sub_diag,ci,cj); + nbl->ncj++; + } + /* Increase the closing index in i super-cell list */ + nbl->ci[nbl->nci].cj_ind_end = nbl->ncj; + } +} + +#undef STRIDE_S +#undef GMX_MM128_HERE +#undef GMX_MM256_HERE diff --git a/src/mdlib/nbnxn_search_x86_simd.h b/src/mdlib/nbnxn_search_simd_4xn.h similarity index 77% rename from src/mdlib/nbnxn_search_x86_simd.h rename to src/mdlib/nbnxn_search_simd_4xn.h index a6af973d8a..d754d07bc0 100644 --- a/src/mdlib/nbnxn_search_x86_simd.h +++ b/src/mdlib/nbnxn_search_simd_4xn.h @@ -36,40 +36,37 @@ * the research papers on the package. Check out http://www.gromacs.org. */ -/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file. - * gmx_sse_or_avh.h should be included before including this file. 
- */ - -/* Copies PBC shifted i-cell packed atom coordinates to working array */ -#ifdef GMX_MM128_HERE -static void icell_set_x_x86_simd128 +#if GMX_NBNXN_SIMD_BITWIDTH == 128 +#define GMX_MM128_HERE #else -#ifdef GMX_MM256_HERE -static void icell_set_x_x86_simd256 +#if GMX_NBNXN_SIMD_BITWIDTH == 256 +#define GMX_MM256_HERE #else -"error: GMX_MM128_HERE or GMX_MM256_HERE not defined" +#error "unsupported GMX_NBNXN_SIMD_BITWIDTH" #endif #endif - (int ci, - real shx,real shy,real shz, - int na_c, - int stride,const real *x, - nbnxn_list_work_t *work) -{ - int ia; -#ifdef GMX_MM128_HERE - nbnxn_x_ci_x86_simd128_t *x_ci; - - x_ci = work->x_ci_x86_simd128; +#include "gmx_x86_simd_macros.h" - ia = X_IND_CI_S128(ci); +#if GMX_X86_SIMD_WIDTH_HERE >= NBNXN_CPU_CLUSTER_I_SIZE +#define STRIDE_S (GMX_X86_SIMD_WIDTH_HERE) #else - nbnxn_x_ci_x86_simd256_t *x_ci; +#define STRIDE_S NBNXN_CPU_CLUSTER_I_SIZE +#endif + +/* Copies PBC shifted i-cell packed atom coordinates to working array */ +static gmx_inline void +icell_set_x_simd_4xn(int ci, + real shx,real shy,real shz, + int na_c, + int stride,const real *x, + nbnxn_list_work_t *work) +{ + int ia; + nbnxn_x_ci_simd_4xn_t *x_ci; - x_ci = work->x_ci_x86_simd256; + x_ci = work->x_ci_simd_4xn; - ia = X_IND_CI_S256(ci); -#endif + ia = X_IND_CI_SIMD_4XN(ci); x_ci->ix_SSE0 = gmx_set1_pr(x[ia + 0*STRIDE_S ] + shx); x_ci->iy_SSE0 = gmx_set1_pr(x[ia + 1*STRIDE_S ] + shy); @@ -85,34 +82,21 @@ static void icell_set_x_x86_simd256 x_ci->iz_SSE3 = gmx_set1_pr(x[ia + 2*STRIDE_S + 3] + shz); } -/* SSE or AVX code for making a pair list of cell ci vs cell cjf-cjl +/* SIMD code for making a pair list of cell ci vs cell cjf-cjl * for coordinates in packed format. * Checks bouding box distances and possibly atom pair distances. * This is an accelerated version of make_cluster_list_simple. 
*/ -#ifdef GMX_MM128_HERE -static void make_cluster_list_x86_simd128 -#else -#ifdef GMX_MM256_HERE -static void make_cluster_list_x86_simd256 -#else -"error: GMX_MM128_HERE or GMX_MM256_HERE not defined" -#endif -#endif - (const nbnxn_grid_t *gridj, - nbnxn_pairlist_t *nbl, - int ci,int cjf,int cjl, - gmx_bool remove_sub_diag, - const real *x_j, - real rl2,float rbb2, - int *ndistc) +static gmx_inline void +make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj, + nbnxn_pairlist_t *nbl, + int ci,int cjf,int cjl, + gmx_bool remove_sub_diag, + const real *x_j, + real rl2,float rbb2, + int *ndistc) { -#ifdef GMX_MM128_HERE - const nbnxn_x_ci_x86_simd128_t *work; -#else - const nbnxn_x_ci_x86_simd256_t *work; -#endif - + const nbnxn_x_ci_simd_4xn_t *work; const float *bb_ci; gmx_mm_pr jx_SSE,jy_SSE,jz_SSE; @@ -139,17 +123,10 @@ static void make_cluster_list_x86_simd256 float d2; int xind_f,xind_l,cj; -#ifdef GMX_MM128_HERE - cjf = CI_TO_CJ_S128(cjf); - cjl = CI_TO_CJ_S128(cjl+1) - 1; - - work = nbl->work->x_ci_x86_simd128; -#else - cjf = CI_TO_CJ_S256(cjf); - cjl = CI_TO_CJ_S256(cjl+1) - 1; + cjf = CI_TO_CJ_SIMD_4XN(cjf); + cjl = CI_TO_CJ_SIMD_4XN(cjl+1) - 1; - work = nbl->work->x_ci_x86_simd256; -#endif + work = nbl->work->x_ci_simd_4xn; bb_ci = nbl->work->bb_ci; @@ -172,11 +149,8 @@ static void make_cluster_list_x86_simd256 } else if (d2 < rl2) { -#ifdef GMX_MM128_HERE - xind_f = X_IND_CJ_S128(CI_TO_CJ_S128(gridj->cell0) + cjf); -#else - xind_f = X_IND_CJ_S256(CI_TO_CJ_S256(gridj->cell0) + cjf); -#endif + xind_f = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjf); + jx_SSE = gmx_load_pr(x_j+xind_f+0*STRIDE_S); jy_SSE = gmx_load_pr(x_j+xind_f+1*STRIDE_S); jz_SSE = gmx_load_pr(x_j+xind_f+2*STRIDE_S); @@ -242,11 +216,8 @@ static void make_cluster_list_x86_simd256 } else if (d2 < rl2) { -#ifdef GMX_MM128_HERE - xind_l = X_IND_CJ_S128(CI_TO_CJ_S128(gridj->cell0) + cjl); -#else - xind_l = X_IND_CJ_S256(CI_TO_CJ_S256(gridj->cell0) + cjl); -#endif + xind_l = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjl); + jx_SSE = gmx_load_pr(x_j+xind_l+0*STRIDE_S); jy_SSE = gmx_load_pr(x_j+xind_l+1*STRIDE_S); jz_SSE = gmx_load_pr(x_j+xind_l+2*STRIDE_S); @@ -295,16 +266,15 @@ static void make_cluster_list_x86_simd256 for(cj=cjf; cj<=cjl; cj++) { /* Store cj and the interaction mask */ -#ifdef GMX_MM128_HERE - nbl->cj[nbl->ncj].cj = CI_TO_CJ_S128(gridj->cell0) + cj; - nbl->cj[nbl->ncj].excl = get_imask_x86_simd128(remove_sub_diag,ci,cj); -#else - nbl->cj[nbl->ncj].cj = CI_TO_CJ_S256(gridj->cell0) + cj; - nbl->cj[nbl->ncj].excl = get_imask_x86_simd256(remove_sub_diag,ci,cj); -#endif + nbl->cj[nbl->ncj].cj = CI_TO_CJ_SIMD_4XN(gridj->cell0) + cj; + nbl->cj[nbl->ncj].excl = get_imask_x86_simd_4xn(remove_sub_diag,ci,cj); nbl->ncj++; } /* Increase the closing index in i super-cell list */ nbl->ci[nbl->nci].cj_ind_end = nbl->ncj; } } + +#undef STRIDE_S +#undef GMX_MM128_HERE +#undef GMX_MM256_HERE diff --git a/src/mdlib/sim_util.c b/src/mdlib/sim_util.c index b57d35cf4d..eb9f636140 100644 --- a/src/mdlib/sim_util.c +++ b/src/mdlib/sim_util.c @@ -93,8 +93,8 @@ #include "nbnxn_atomdata.h" #include "nbnxn_search.h" #include "nbnxn_kernels/nbnxn_kernel_ref.h" -#include "nbnxn_kernels/nbnxn_kernel_x86_simd128.h" -#include "nbnxn_kernels/nbnxn_kernel_x86_simd256.h" +#include "nbnxn_kernels/nbnxn_kernel_simd_4xn.h" +#include "nbnxn_kernels/nbnxn_kernel_simd_2xnn.h" #include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h" #ifdef GMX_LIB_MPI @@ -620,13 +620,13 @@ static void do_nb_verlet(t_forcerec *fr, gmx_incons("Invalid 
cut-off scheme passed!"); } - if (nbvg->kernel_type != nbk8x8x8_CUDA) + if (nbvg->kernel_type != nbnxnk8x8x8_CUDA) { wallcycle_sub_start(wcycle, ewcsNONBONDED); } switch (nbvg->kernel_type) { - case nbk4x4_PlainC: + case nbnxnk4x4_PlainC: nbnxn_kernel_ref(&nbvg->nbl_lists, nbvg->nbat, ic, fr->shift_vec, @@ -639,38 +639,38 @@ static void do_nb_verlet(t_forcerec *fr, enerd->grpp.ener[egLJSR]); break; - case nbk4xN_X86_SIMD128: - nbnxn_kernel_x86_simd128(&nbvg->nbl_lists, - nbvg->nbat, ic, - nbvg->ewald_excl, - fr->shift_vec, - flags, - clearF, - fr->fshift[0], - enerd->grpp.ener[egCOULSR], - fr->bBHAM ? - enerd->grpp.ener[egBHAMSR] : - enerd->grpp.ener[egLJSR]); + case nbnxnk4xN_SIMD_4xN: + nbnxn_kernel_simd_4xn(&nbvg->nbl_lists, + nbvg->nbat, ic, + nbvg->ewald_excl, + fr->shift_vec, + flags, + clearF, + fr->fshift[0], + enerd->grpp.ener[egCOULSR], + fr->bBHAM ? + enerd->grpp.ener[egBHAMSR] : + enerd->grpp.ener[egLJSR]); break; - case nbk4xN_X86_SIMD256: - nbnxn_kernel_x86_simd256(&nbvg->nbl_lists, - nbvg->nbat, ic, - nbvg->ewald_excl, - fr->shift_vec, - flags, - clearF, - fr->fshift[0], - enerd->grpp.ener[egCOULSR], - fr->bBHAM ? - enerd->grpp.ener[egBHAMSR] : - enerd->grpp.ener[egLJSR]); + case nbnxnk4xN_SIMD_2xNN: + nbnxn_kernel_simd_2xnn(&nbvg->nbl_lists, + nbvg->nbat, ic, + nbvg->ewald_excl, + fr->shift_vec, + flags, + clearF, + fr->fshift[0], + enerd->grpp.ener[egCOULSR], + fr->bBHAM ? + enerd->grpp.ener[egBHAMSR] : + enerd->grpp.ener[egLJSR]); break; - case nbk8x8x8_CUDA: + case nbnxnk8x8x8_CUDA: nbnxn_cuda_launch_kernel(fr->nbv->cu_nbv, nbvg->nbat, flags, ilocality); break; - case nbk8x8x8_PlainC: + case nbnxnk8x8x8_PlainC: nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0], nbvg->nbat, ic, fr->shift_vec, @@ -688,7 +688,7 @@ static void do_nb_verlet(t_forcerec *fr, gmx_incons("Invalid nonbonded kernel type passed!"); } - if (nbvg->kernel_type != nbk8x8x8_CUDA) + if (nbvg->kernel_type != nbnxnk8x8x8_CUDA) { wallcycle_sub_stop(wcycle, ewcsNONBONDED); } @@ -785,7 +785,7 @@ void do_force_cutsVERLET(FILE *fplog,t_commrec *cr, bDoForces = (flags & GMX_FORCE_FORCES); bSepLRF = (bDoLongRange && bDoForces && (flags & GMX_FORCE_SEPLRF)); bUseGPU = fr->nbv->bUseGPU; - bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbk8x8x8_PlainC); + bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbnxnk8x8x8_PlainC); if (bStateChanged) { @@ -1000,7 +1000,7 @@ void do_force_cutsVERLET(FILE *fplog,t_commrec *cr, wallcycle_sub_stop(wcycle,ewcsNBS_SEARCH_NONLOCAL); - if (nbv->grp[eintNonlocal].kernel_type == nbk8x8x8_CUDA) + if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_CUDA) { /* initialize non-local pair-list on the GPU */ nbnxn_cuda_init_pairlist(nbv->cu_nbv, -- 2.11.4.GIT
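Both new search headers, nbnxn_search_simd_2xnn.h and nbnxn_search_simd_4xn.h, build the cluster pair list the same way: the candidate j-cluster range [cjf, cjl] is first trimmed from the front and then from the back using only the cheap bounding-box distance from subc_bb_dist2_sse(), and the SIMD atom-pair distance test is consulted only when a box distance falls between rbb2 and rl2; every j-cluster left in between is then stored with its interaction mask. A scalar sketch of the front-trimming idea, using made-up 1D bounding boxes and a stand-in for the SIMD distance check (bb_dist2, pair_in_range and the data are illustrative, not GROMACS code):

#include <stdio.h>
#include <stdbool.h>

#define NCLUSTER 8

/* Squared gap between two 1D bounding boxes bb[i]=[lo,hi] and bb[j]=[lo,hi] */
static double bb_dist2(const double bb[][2], int i, int j)
{
    double d = 0.0;

    if (bb[j][0] > bb[i][1]) d = bb[j][0] - bb[i][1];
    if (bb[i][0] > bb[j][1]) d = bb[i][0] - bb[j][1];

    return d*d;
}

/* Stand-in for the SIMD atom-pair check (any rsq < rl2 in the cluster pair) */
static bool pair_in_range(const double bb[][2], int i, int j, double rl2)
{
    return bb_dist2(bb, i, j) < rl2;
}

int main(void)
{
    /* 1D bounding boxes [lo, hi]; index 4 plays the role of the i-cluster */
    double bb[NCLUSTER][2] = {
        {0.0, 1.0}, {0.5, 1.5}, {1.2, 2.2}, {2.0, 3.0},
        {3.5, 4.5}, {5.0, 6.0}, {6.5, 7.5}, {8.0, 9.0}
    };
    double rbb2 = 0.25, rl2 = 1.0;
    int    ci = 4, cjf = 0, cjl = NCLUSTER - 1;
    bool   in_range = false;

    /* Trim from the front; the real code repeats this from the back (cjl--) */
    while (!in_range && cjf <= cjl)
    {
        double d2 = bb_dist2(bb, ci, cjf);

        if (d2 < rbb2)
        {
            in_range = true;                             /* certainly in range */
        }
        else if (d2 < rl2)
        {
            in_range = pair_in_range(bb, ci, cjf, rl2);  /* expensive check */
        }
        if (!in_range)
        {
            cjf++;                                       /* prune this j-cluster */
        }
    }
    printf("first j-cluster kept: %d\n", cjf);           /* prints 3 here */

    return 0;
}

The same structure, with gmx_sub_pr/gmx_calc_rsq_pr/gmx_cmplt_pr/gmx_movemask_pr in place of the scalar check, is what make_cluster_list_simd_2xnn and make_cluster_list_simd_4xn use above, the 2xnn variant testing only two distance registers (_SSE0 and _SSE2) per j-cluster instead of four.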