From 7b6508e8318d1d045826616317f8edbaf4658e3b Mon Sep 17 00:00:00 2001 From: Szilard Pall Date: Tue, 2 Oct 2012 12:27:00 +0200 Subject: [PATCH] added Verlet scheme and NxN non-bonded functionality This commit implements a new "Verlet" cutoff scheme which uses exact cut-offs and standard Verlet lists with an automatically calculated buffer. The Verlet code-path supports full multi-level heterogeneous parallelization using MPI/thread-MPI, OpenMP multi-threading, and GPU acceleration for the non-bonded calculations. The non-bonded calculations with the Verlet scheme support highly optimized CPU SIMD acceleration using SSE/AVX and GPU acceleration using NVIDIA CUDA. The CPU kernels have been tested on and optimized for most x86 architectures including recent ones like Intel Sandy/Ivy-Bridge and AMD Bulldozer. The CUDA GPU kernels support hardware of compute capability 2.0 and above and are optimized for both Fermi and Kepler architectures. The new search code has been added in nbnxn_search.c, new non-bonded kernels in nbnxn_kernels and nbnxn_cuda: - plain-C kernels: reference CPU implementation and reference GPU (emulation) - x86 128- and 256-bit SIMD kernels (SSE2, SSE4.1, AVX_128, AVX_256 intrinsics) - CUDA (two versions: for recent and legacy toolkit/drivers) This commit also implements some additional optimizations targeting performance: - SSE acceleration for dihedrals; - automated PP/PME load balancing called "PME tuning" which optimizes the electrostatics cut-off to improve load balance between CPU and GPU or separate PP and PME processes; - hardware detection and automated run-configuration selection. Change-Id: I3e1a15331c174265ec086565b978ffd079df2aaa --- CMakeLists.txt | 80 +- cmake/ThreadMPI.cmake | 18 +- cmake/gmxDetectAcceleration.cmake | 22 +- cmake/gmxGCC44O3BugWorkaround.cmake | 44 + cmake/gmxGetCompilerVersion.cmake | 46 + cmake/gmxManageNvccConfig.cmake | 83 + cmake/gmxSetBuildInformation.cmake | 48 +- include/bondf.h | 8 + include/constr.h | 78 +- include/coulomb.h | 30 +- include/domdec.h | 41 +- include/domdec_network.h | 1 + include/force.h | 53 +- include/futil.h | 1 + include/genborn.h | 1 + include/gmx_avx_double.h | 61 + include/gmx_avx_single.h | 57 + include/gmx_cpuid.h | 257 + include/gmx_detect_hardware.h | 50 + include/gmx_detectcpu.h | 162 - include/gmx_fatal.h | 21 +- .../membed.h => include/gmx_fatal_collective.h | 47 +- include/gmx_hash.h | 318 + include/gmx_math_x86_avx_128_fma_double.h | 4 +- include/gmx_math_x86_avx_128_fma_single.h | 4 +- include/gmx_math_x86_avx_256_double.h | 15 +- include/gmx_math_x86_avx_256_single.h | 9 +- include/gmx_math_x86_sse2_double.h | 10 +- include/gmx_math_x86_sse4_1_double.h | 12 +- include/gmx_math_x86_sse4_1_single.h | 2 +- include/gmx_omp.h | 20 +- .../gmx_gpu_utils.h => include/gmx_omp_nthreads.h | 61 +- include/gmx_wallcycle.h | 46 +- include/gmx_x86_avx_128_fma.h | 2 +- include/gmx_x86_simd_double.h | 70 + include/gmx_x86_simd_macros.h | 261 + include/gmx_x86_simd_single.h | 71 + include/gpu_utils.h | 105 + include/main.h | 8 +- include/maths.h | 13 +- include/{types/graph.h => md_logging.h} | 39 +- include/md_support.h | 149 + include/mdebin.h | 4 +- include/mdrun.h | 671 +-- include/mtop_util.h | 28 +- include/mvdata.h | 1 + include/names.h | 2 + include/nbnxn_cuda_data_mgmt.h | 131 + include/nbnxn_search.h | 177 + include/network.h | 3 + include/nrnb.h | 4 +- include/nsgrid.h | 11 +- include/pbc.h | 11 +- include/physics.h | 21 + .../gmx_gpu_utils.h => include/pmalloc_cuda.h | 49 +- include/pme.h | 33 +-
include/sim_util.h | 202 + include/smalloc.h | 4 +- src/kernel/membed.h => include/tables.h | 43 +- include/thread_mpi/atomic/gcc_intrinsics.h | 2 +- include/thread_mpi/mpi_bindings.h | 1 - include/typedefs.h | 9 +- include/types/commrec.h | 26 +- include/types/enums.h | 15 +- include/types/fcdata.h | 6 +- src/kernel/membed.h => include/types/force_flags.h | 56 +- include/types/forcerec.h | 66 +- include/types/graph.h | 4 + include/types/group.h | 2 + include/types/hw_info.h | 83 + include/types/idef.h | 6 + include/types/ifunc.h | 6 +- include/types/inputrec.h | 5 +- .../types/interaction_const.h | 76 +- include/types/nb_verlet.h | 130 + include/types/nbnxn_cuda_types_ext.h | 77 + include/types/nbnxn_pairlist.h | 204 + include/types/nrnb.h | 6 + include/types/simple.h | 4 + include/update.h | 2 + include/vsite.h | 1 + share/html/online/mdp_opt.html | 81 +- src/config.h.cmakein | 19 +- src/gmxlib/CMakeLists.txt | 19 +- src/gmxlib/bondfree.c | 1092 +++- src/gmxlib/calcgrid.c | 2 +- src/gmxlib/checkpoint.c | 1 - src/gmxlib/cuda_tools/CMakeLists.txt | 7 + src/gmxlib/cuda_tools/cudautils.cu | 266 + src/gmxlib/cuda_tools/cudautils.cuh | 168 + src/gmxlib/cuda_tools/pmalloc_cuda.cu | 106 + src/gmxlib/cuda_tools/vectype_ops.cuh | 156 + src/gmxlib/disre.c | 16 +- src/gmxlib/ewald_util.c | 123 +- src/gmxlib/gmx_cpuid.c | 858 +++ src/gmxlib/gmx_detect_hardware.c | 594 ++ src/gmxlib/gmx_detectcpu.c | 615 -- src/gmxlib/gmx_fatal.c | 2 +- src/gmxlib/gmx_omp.c | 8 + src/gmxlib/gmx_omp_nthreads.c | 445 ++ .../gpu_utils}/CMakeLists.txt | 17 +- .../gpu_utils/gpu_utils.cu} | 546 +- .../gpu_utils}/memtestG80_core.cu | 0 .../gpu_utils}/memtestG80_core.h | 0 src/gmxlib/main.c | 41 +- src/gmxlib/maths.c | 206 +- .../gmx_gpu_utils.h => gmxlib/md_logging.c} | 66 +- src/gmxlib/mtop_util.c | 241 +- src/gmxlib/names.c | 4 + src/gmxlib/network.c | 191 +- src/gmxlib/nonbonded/nonbonded.c | 10 +- src/gmxlib/nrnb.c | 106 +- src/gmxlib/pbc.c | 81 +- src/gmxlib/smalloc.c | 14 +- src/gmxlib/tpxio.c | 64 +- src/gmxlib/txtdump.c | 6 +- src/kernel/CMakeLists.txt | 14 +- src/kernel/calc_verletbuf.c | 716 +++ src/kernel/{repl_ex.h => calc_verletbuf.h} | 54 +- src/kernel/grompp.c | 109 +- src/kernel/md.c | 329 +- src/kernel/md_openmm.c | 1 + src/kernel/mdrun.c | 131 +- src/kernel/membed.c | 6 +- src/kernel/membed.h | 1 + src/kernel/openmm_wrapper.cpp | 4 +- src/kernel/pme_switch.c | 529 ++ .../gmx_gpu_utils.h => pme_switch.h} | 44 +- src/kernel/readir.c | 311 +- src/kernel/readir.h | 1 - src/kernel/repl_ex.h | 1 + src/kernel/runner.c | 1048 +++- src/kernel/tpbcmp.c | 3 + src/mdlib/CMakeLists.txt | 18 +- src/mdlib/calcmu.c | 90 +- src/mdlib/clincs.c | 608 +- src/mdlib/constr.c | 383 +- src/mdlib/csettle.c | 270 +- src/mdlib/domdec.c | 2339 +++++--- src/mdlib/domdec_con.c | 616 +- src/mdlib/domdec_top.c | 1145 ++-- src/mdlib/edsam.c | 9 +- src/mdlib/fft5d.c | 40 +- src/mdlib/force.c | 213 +- src/mdlib/forcerec.c | 808 ++- src/mdlib/gmx_wallcycle.c | 471 +- src/mdlib/groupcoord.h | 1 + src/mdlib/iteratedconstraints.c | 7 +- src/mdlib/md_support.c | 66 +- src/mdlib/mdatom.c | 29 +- src/mdlib/mdebin.c | 20 +- src/mdlib/minimize.c | 209 +- src/mdlib/nbnxn_consts.h | 86 + src/mdlib/nbnxn_cuda/CMakeLists.txt | 8 + src/mdlib/nbnxn_cuda/nbnxn_cuda.cu | 671 +++ src/mdlib/nbnxn_cuda/nbnxn_cuda.h | 87 + src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu | 884 +++ src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh | 421 ++ src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh | 375 ++ src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh | 296 + 
.../nbnxn_cuda/nbnxn_cuda_kernels.cuh} | 75 +- src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h | 187 + src/mdlib/nbnxn_kernels/nbnxn_kernel_common.c | 59 + .../nbnxn_kernels/nbnxn_kernel_common.h} | 39 +- src/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.c | 377 ++ .../nbnxn_kernels/nbnxn_kernel_gpu_ref.h} | 44 +- src/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c | 265 + .../nbnxn_kernels/nbnxn_kernel_ref.h} | 43 +- src/mdlib/nbnxn_kernels/nbnxn_kernel_ref_inner.h | 274 + src/mdlib/nbnxn_kernels/nbnxn_kernel_ref_outer.h | 365 ++ src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.c | 316 + .../nbnxn_kernels/nbnxn_kernel_x86_simd128.h} | 54 +- src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.c | 316 + .../nbnxn_kernels/nbnxn_kernel_x86_simd256.h} | 54 +- .../nbnxn_kernels/nbnxn_kernel_x86_simd_includes.h | 69 + .../nbnxn_kernels/nbnxn_kernel_x86_simd_inner.h | 949 +++ .../nbnxn_kernels/nbnxn_kernel_x86_simd_outer.h | 760 +++ .../nbnxn_kernels/nbnxn_kernel_x86_simd_utils.h | 489 ++ src/mdlib/nbnxn_search.c | 6053 ++++++++++++++++++++ src/mdlib/nbnxn_search_x86_simd.h | 307 + src/mdlib/nlistheuristics.c | 1 + src/mdlib/ns.c | 2 +- src/mdlib/nsgrid.c | 11 +- src/mdlib/partdec.c | 5 + src/mdlib/perf_est.c | 351 +- src/mdlib/pme.c | 396 +- src/mdlib/pme_pp.c | 63 +- src/mdlib/pme_sse_single.h | 160 +- src/mdlib/pull.c | 7 +- src/mdlib/pull_rotation.c | 12 +- src/mdlib/qmmm.c | 22 +- src/mdlib/shakef.c | 2 +- src/mdlib/shellfc.c | 41 +- src/mdlib/sim_util.c | 1641 ++++-- src/mdlib/stat.c | 4 +- src/mdlib/tables.c | 190 +- src/mdlib/tgroup.c | 17 + src/mdlib/tpi.c | 2 +- src/mdlib/update.c | 370 +- src/tools/addconf.c | 6 +- src/tools/calcpot.c | 5 +- src/tools/gmx_clustsize.c | 8 +- src/tools/gmx_disre.c | 4 +- src/tools/gmx_pme_error.c | 10 +- src/tools/gmx_trjconv.c | 6 +- src/tools/gmx_tune_pme.c | 48 +- 206 files changed, 32960 insertions(+), 5772 deletions(-) create mode 100644 cmake/gmxGCC44O3BugWorkaround.cmake create mode 100644 cmake/gmxGetCompilerVersion.cmake create mode 100644 cmake/gmxManageNvccConfig.cmake create mode 100644 include/gmx_avx_double.h create mode 100644 include/gmx_avx_single.h create mode 100644 include/gmx_cpuid.h create mode 100644 include/gmx_detect_hardware.h delete mode 100644 include/gmx_detectcpu.h copy src/kernel/membed.h => include/gmx_fatal_collective.h (61%) create mode 100644 include/gmx_hash.h copy src/kernel/gmx_gpu_utils/gmx_gpu_utils.h => include/gmx_omp_nthreads.h (51%) create mode 100644 include/gmx_x86_simd_double.h create mode 100644 include/gmx_x86_simd_macros.h create mode 100644 include/gmx_x86_simd_single.h create mode 100644 include/gpu_utils.h copy include/{types/graph.h => md_logging.h} (61%) create mode 100644 include/md_support.h rewrite include/mdrun.h (62%) create mode 100644 include/nbnxn_cuda_data_mgmt.h create mode 100644 include/nbnxn_search.h copy src/kernel/gmx_gpu_utils/gmx_gpu_utils.h => include/pmalloc_cuda.h (71%) create mode 100644 include/sim_util.h copy src/kernel/membed.h => include/tables.h (61%) copy src/kernel/membed.h => include/types/force_flags.h (53%) create mode 100644 include/types/hw_info.h copy src/kernel/gmx_gpu_utils/gmx_gpu_utils.h => include/types/interaction_const.h (50%) create mode 100644 include/types/nb_verlet.h create mode 100644 include/types/nbnxn_cuda_types_ext.h create mode 100644 include/types/nbnxn_pairlist.h create mode 100644 src/gmxlib/cuda_tools/CMakeLists.txt create mode 100644 src/gmxlib/cuda_tools/cudautils.cu create mode 100644 src/gmxlib/cuda_tools/cudautils.cuh create mode 100644 
src/gmxlib/cuda_tools/pmalloc_cuda.cu create mode 100644 src/gmxlib/cuda_tools/vectype_ops.cuh create mode 100644 src/gmxlib/gmx_cpuid.c create mode 100644 src/gmxlib/gmx_detect_hardware.c delete mode 100644 src/gmxlib/gmx_detectcpu.c create mode 100644 src/gmxlib/gmx_omp_nthreads.c rename src/{kernel/gmx_gpu_utils => gmxlib/gpu_utils}/CMakeLists.txt (63%) rename src/{kernel/gmx_gpu_utils/gmx_gpu_utils.cu => gmxlib/gpu_utils/gpu_utils.cu} (54%) rename src/{kernel/gmx_gpu_utils => gmxlib/gpu_utils}/memtestG80_core.cu (100%) rename src/{kernel/gmx_gpu_utils => gmxlib/gpu_utils}/memtestG80_core.h (100%) copy src/{kernel/gmx_gpu_utils/gmx_gpu_utils.h => gmxlib/md_logging.c} (57%) create mode 100644 src/kernel/calc_verletbuf.c copy src/kernel/{repl_ex.h => calc_verletbuf.h} (52%) create mode 100644 src/kernel/pme_switch.c copy src/kernel/{gmx_gpu_utils/gmx_gpu_utils.h => pme_switch.h} (56%) create mode 100644 src/mdlib/nbnxn_consts.h create mode 100644 src/mdlib/nbnxn_cuda/CMakeLists.txt create mode 100644 src/mdlib/nbnxn_cuda/nbnxn_cuda.cu create mode 100644 src/mdlib/nbnxn_cuda/nbnxn_cuda.h create mode 100644 src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu create mode 100644 src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh create mode 100644 src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh create mode 100644 src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh copy src/{kernel/gmx_gpu_utils/gmx_gpu_utils.h => mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh} (50%) create mode 100644 src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h create mode 100644 src/mdlib/nbnxn_kernels/nbnxn_kernel_common.c copy src/{kernel/gmx_gpu_utils/gmx_gpu_utils.h => mdlib/nbnxn_kernels/nbnxn_kernel_common.h} (74%) create mode 100644 src/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.c copy src/{kernel/gmx_gpu_utils/gmx_gpu_utils.h => mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h} (61%) create mode 100644 src/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c copy src/{kernel/gmx_gpu_utils/gmx_gpu_utils.h => mdlib/nbnxn_kernels/nbnxn_kernel_ref.h} (64%) create mode 100644 src/mdlib/nbnxn_kernels/nbnxn_kernel_ref_inner.h create mode 100644 src/mdlib/nbnxn_kernels/nbnxn_kernel_ref_outer.h create mode 100644 src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.c copy src/{kernel/gmx_gpu_utils/gmx_gpu_utils.h => mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.h} (63%) create mode 100644 src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.c rename src/{kernel/gmx_gpu_utils/gmx_gpu_utils.h => mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.h} (63%) create mode 100644 src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_includes.h create mode 100644 src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_inner.h create mode 100644 src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_outer.h create mode 100644 src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_utils.h create mode 100644 src/mdlib/nbnxn_search.c create mode 100644 src/mdlib/nbnxn_search_x86_simd.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 4d16ee059b..f1efb48232 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -123,6 +123,7 @@ IF( WIN32 AND NOT CYGWIN) ENDIF() ENDIF() + ######################################################################## # User input options # ######################################################################## @@ -163,7 +164,8 @@ mark_as_advanced(GMX_MPI_IN_PLACE) option(GMX_LOAD_PLUGINS "Compile with plugin support, needed to read VMD supported file formats" ON) mark_as_advanced(GMX_LOAD_PLUGINS) -option(GMX_OPENMP "Enable OpenMP-based mutithreading. 
" ON) +option(GMX_GPU "Enable GPU acceleration" ON) +option(GMX_OPENMP "Enable OpenMP-based multithreading" ON) option(USE_VERSION_H "Generate development version string/information" ON) mark_as_advanced(USE_VERSION_H) @@ -175,6 +177,8 @@ if(UNIX AND NOT APPLE) mark_as_advanced(GMX_PREFER_STATIC_LIBS) endif() +option(GMX_CYCLE_SUBCOUNTERS "Enable cycle subcounters to get a more detailed cycle timings" OFF) +mark_as_advanced(GMX_CYCLE_SUBCOUNTERS) ###################################################################### # compiler tests @@ -222,6 +226,15 @@ endif() include(gmxCFlags) gmx_c_flags() +include(gmxGetCompilerVersion) +get_compiler_version() + +# gcc 4.4.x is buggy and crashes when compiling some files with O3 and OpenMP on. +# Detect here whether applying a workaround is needed and will apply it later +# on the affected files. +include(gmxGCC44O3BugWorkaround) +gmx_check_gcc44_bug_workaround_needed(GMX_USE_GCC44_BUG_WORKAROUND) + ######################################################################## # Set up binary and library suffixing ######################################################################## @@ -322,8 +335,6 @@ else(GMX_OPENMM) endif(GMX_OPENMM) - - ######################################################################## # Basic system tests (standard libraries, headers, functions, types) # ######################################################################## @@ -347,7 +358,7 @@ check_include_files(sys/time.h HAVE_SYS_TIME_H) check_include_files(rpc/rpc.h HAVE_RPC_RPC_H) check_include_files("rpc/rpc.h;rpc/xdr.h" HAVE_RPC_XDR_H) check_include_files(io.h HAVE_IO_H) - +check_include_files(sched.h HAVE_SCHED_H) include(CheckFunctionExists) check_function_exists(strcasecmp HAVE_STRCASECMP) @@ -366,6 +377,8 @@ check_function_exists(fileno HAVE_FILENO) check_function_exists(_commit HAVE__COMMIT) check_function_exists(lstat HAVE_LSTAT) check_function_exists(sigaction HAVE_SIGACTION) +check_function_exists(sysconf HAVE_SYSCONF) +check_function_exists(sched_setaffinity HAVE_SCHED_SETAFFINITY) include(CheckLibraryExists) check_library_exists(m sqrt "" HAVE_LIBM) @@ -491,6 +504,60 @@ if(GMX_OPENMM) endif(GMX_OPENMM) +if(GMX_GPU) + if(GMX_DOUBLE) + message(WARNING "GPU acceleration is not available in double precision, disabled!") + set(GMX_GPU OFF CACHE BOOL "Enable GPU acceleration" FORCE) + endif() + + # We support CUDA >=v3.2 on *nix, but <= v4.1 doesn't work with MSVC + if(MSVC) + find_package(CUDA 4.1) + else() + find_package(CUDA 3.2) + endif() + + if (NOT EXISTS ${CUDA_TOOLKIT_ROOT_DIR}) + message(FATAL_ERROR " + mdrun supports native GPU acceleration on NVIDIA hardware with compute + capability >=2.0. This requires the NVIDIA CUDA library, which was not + found; the location can be hinted by setting CUDA_TOOLKIT_ROOT_DIR. + + CPU or GPU acceleration can be selected at runtime, but if you are + sure you can not make use of GPU acceleration, disable it by setting + the CMake variable GMX_GPU=OFF.") + endif() + + if(NOT GMX_OPENMP) + message(WARNING " + In order to use GPU acceleration efficiently, mdrun requires OpenMP multithreding. + Without OpenMP only a single CPU core per GPU can be used which is suboptimal. + Note that with MPI multiple processes can be forced to use a single GPU, but this + typically inefficient.") + endif() + + include(gmxManageNvccConfig) + + # Check whether we can use atomic operations needed for polling wait for GPU + # (to avoid the cudaStreamSynchronize + ECC bug). 
+ # With thread-MPI the atomics test has already been carried out, but without + # thread-MPI we need to invoke the atomics test independently. + if (NOT GMX_THREAD_MPI) + set(TEST_TMPI_ATOMICS_ONLY ON CACHE INTERNAL + "Test only the atomic operations of thread-MPI.") + include(ThreadMPI) + endif() + + # we need this linker flag in case we have ld >= 2.22 (typically with gcc 4.5+), + # but it's too cumbersome to check the ld version and the flag should not hurt + if(CMAKE_COMPILER_IS_GNUCC) + set(GROMACS_LINKER_FLAGS "-Wl,--add-needed ${GROMACS_LINKER_FLAGS}") + endif() + + # annoyingly enough, FindCUDA leaves a few variables behind as non-advanced + mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_SDK_ROOT_DIR CUDA_VERBOSE_BUILD) +endif() + if(APPLE) find_library(ACCELERATE_FRAMEWORK Accelerate) list(APPEND GMX_EXTRA_LIBRARIES ${ACCELERATE_FRAMEWORK}) @@ -838,6 +905,11 @@ else(${GMX_FFT_LIBRARY} STREQUAL "FFTW3") MESSAGE(FATAL_ERROR "Invalid FFT library setting: ${GMX_FFT_LIBRARY}. Choose one of: fftw3, mkl, fftpack") endif(${GMX_FFT_LIBRARY} STREQUAL "FFTW3") +# enable threaded fftw3 if we've found it +if(FFTW3_THREADS OR FFTW3F_THREADS) + add_definitions(-DFFT5D_FFTW_THREADS) +endif() + set(GMX_EXTERNAL_BLAS TRUE CACHE BOOL "Use external BLAS instead of built-in") set(GMX_EXTERNAL_LAPACK TRUE CACHE BOOL "Use external LAPACK instead of built-in") # MKL has BLAS/LAPACK routines diff --git a/cmake/ThreadMPI.cmake b/cmake/ThreadMPI.cmake index 68a44f38a9..30e2767b77 100644 --- a/cmake/ThreadMPI.cmake +++ b/cmake/ThreadMPI.cmake @@ -13,10 +13,15 @@ MACRO(TEST_TMPI_ATOMICS VARIABLE) if (TEST_ATOMICS) message(STATUS "Atomics found") - set(${VARIABLE} CACHE INTERNAL 1) + set(${VARIABLE} TRUE CACHE INTERNAL "Whether atomic operations for thread-MPI were found") else (TEST_ATOMICS) - message(WARNING "Atomics not found for this compiler+cpu combination. Thread support will be unbearably slow: disable threads. Atomics should work on all but the most obscure CPU+compiler combinations; if your system is not obscure -- like, for example, x86 with gcc -- please contact the developers.") - set(${VARIABLE} CACHE INTERNAL 0) + if (TEST_TMPI_ATOMICS_ONLY) + message(WARNING "Atomic operations not found for this CPU+compiler combination. Atomic operations should work on all but the most obscure CPU+compiler combinations; if your system is not obscure -- like, for example, x86 with gcc -- please contact the developers.") + else (TEST_TMPI_ATOMICS_ONLY) + message(WARNING "Atomic operations not found for this + CPU+compiler combination. Thread support will be unbearably slow: disable threads.
Atomic operations should work on all but the most obscure CPU+compiler combinations; if your system is not obscure -- like, for example, x86 with gcc -- please contact the developers.") + endif (TEST_TMPI_ATOMICS_ONLY) + set(${VARIABLE} FALSE CACHE INTERNAL "Whether atomic operations for thread-MPI were found") endif(TEST_ATOMICS) endif(NOT DEFINED TMPI_ATOMICS) ENDMACRO(TEST_TMPI_ATOMICS VARIABLE) @@ -28,6 +33,10 @@ MACRO(TMPI_MAKE_CXX_LIB) thread_mpi/system_error.cpp ) ENDMACRO(TMPI_MAKE_CXX_LIB) +test_tmpi_atomics(TMPI_ATOMICS) + +# do we want to test only the atomics of tMPI (with GPU + MPI) +if(NOT TEST_TMPI_ATOMICS_ONLY) include(FindThreads) if (CMAKE_USE_PTHREADS_INIT) check_include_files(pthread.h HAVE_PTHREAD_H) @@ -144,5 +153,4 @@ check_function_exists(sysconf HAVE_SYSCONF) # this runs on windows #check_include_files(windows.h HAVE_WINDOWS_H) - -test_tmpi_atomics(TMPI_ATOMICS) +endif(NOT TEST_TMPI_ATOMICS_ONLY) diff --git a/cmake/gmxDetectAcceleration.cmake b/cmake/gmxDetectAcceleration.cmake index 9e719a92fe..154847f4fb 100644 --- a/cmake/gmxDetectAcceleration.cmake +++ b/cmake/gmxDetectAcceleration.cmake @@ -25,29 +25,29 @@ macro(gmx_detect_acceleration GMX_SUGGESTED_ACCELERATION) message(STATUS "Detecting best acceleration for this CPU") # Get CPU acceleration information - try_run(GMX_DETECTCPU_RUN_ACC GMX_DETECTCPU_COMPILED + try_run(GMX_CPUID_RUN_ACC GMX_CPUID_COMPILED ${CMAKE_BINARY_DIR} - ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_detectcpu.c - COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_DETECTCPU_STANDALONE" + ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_cpuid.c + COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE" RUN_OUTPUT_VARIABLE OUTPUT_TMP - COMPILE_OUTPUT_VARIABLE GMX_DETECTCPU_COMPILE_OUTPUT + COMPILE_OUTPUT_VARIABLE GMX_CPUID_COMPILE_OUTPUT ARGS "-acceleration") - if(NOT GMX_DETECTCPU_COMPILED) - message(WARNING "Cannot compile CPU detection code, which means no optimization.") - message(STATUS "Compile output: ${GMX_DETECTCPU_COMPILE_OUTPUT}") + if(NOT GMX_CPUID_COMPILED) + message(WARNING "Cannot compile CPUID code, which means no CPU-specific acceleration.") + message(STATUS "Compile output: ${GMX_CPUID_COMPILE_OUTPUT}") set(OUTPUT_TMP "None") - elseif(NOT GMX_DETECTCPU_RUN_ACC EQUAL 0) - message(WARNING "Cannot run CPU detection code, which means no optimization.") + elseif(NOT GMX_CPUID_RUN_ACC EQUAL 0) + message(WARNING "Cannot run CPUID code, which means no CPU-specific optimization.") message(STATUS "Run output: ${OUTPUT_TMP}") set(OUTPUT_TMP "None") - endif(NOT GMX_DETECTCPU_COMPILED) + endif(NOT GMX_CPUID_COMPILED) string(STRIP "@OUTPUT_TMP@" OUTPUT_ACC) message(STATUS "Detecting best acceleration for this CPU - @OUTPUT_ACC@") - set(${GMX_SUGGESTED_ACCELERATION} "@OUTPUT_ACC@" CACHE INTERNAL "Gromacs CPU Acceleration") + set(${GMX_SUGGESTED_ACCELERATION} "@OUTPUT_ACC@" CACHE INTERNAL "GROMACS CPU-specific acceleration") ENDIF(NOT DEFINED ${GMX_SUGGESTED_ACCELERATION}) endmacro(gmx_detect_acceleration GMX_SUGGESTED_ACCELERATION) diff --git a/cmake/gmxGCC44O3BugWorkaround.cmake b/cmake/gmxGCC44O3BugWorkaround.cmake new file mode 100644 index 0000000000..fc7433e4d9 --- /dev/null +++ b/cmake/gmxGCC44O3BugWorkaround.cmake @@ -0,0 +1,44 @@ +# Due to a bug, gcc 4.4.x crashes when compiling bondfree.c with -O3 and +# -fopenmp, but strangely it does not crash with -O2 + all additional options +# -O3 uses.
Therefore, for the affected files, when compiling in release mode, +# we override -O3 with -O2 and add the additional options. +# + +# Considering compiler version and build configuration, check if the workaround +# is needed to avoid the gcc crash. +macro(gmx_check_gcc44_bug_workaround_needed OUT_VAR) + if(CMAKE_COMPILER_IS_GNUCC AND + C_COMPILER_VERSION VERSION_GREATER "4.3.999" AND C_COMPILER_VERSION VERSION_LESS "4.4.999") + + set(_gcc44_workaround FALSE) + + # only apply the workaround if we are actually using -O3 + string(TOUPPER ${CMAKE_BUILD_TYPE} _build_type) + if ("${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${_build_type}}" MATCHES ".*-O3.*" AND + GMX_OPENMP) + if(GMX_DISABLE_GCC44_BUG_WORKAROUND) + set(_msg "gcc ${C_COMPILER_VERSION} detected, using -O3, but workaround for optimization bug is disabled") + else() + set(_msg "gcc ${C_COMPILER_VERSION} detected, using -O3, will apply workaround for optimization bug (disable with GMX_DISABLE_GCC44_BUG_WORKAROUND)") + set(_gcc44_workaround TRUE) + endif() + # only issue a message if the value has changed + if((NOT _gcc44_workaround AND ${OUT_VAR}) OR (_gcc44_workaround AND NOT ${OUT_VAR})) + message(STATUS "${_msg}") + endif() + endif() + + set(${OUT_VAR} ${_gcc44_workaround} CACHE INTERNAL "Use gcc 4.4.x O3 optimization bug workaround" FORCE) + endif() +endmacro() + +# Apply the workaround to the specified source file. +# +# This workaround does not seem to affect the performance in a measurable way. +macro(gmx_apply_gcc44_bug_workaround FILE_NAME) + set_source_files_properties( + ${FILE_NAME} + PROPERTIES + COMPILE_FLAGS "-O2 -finline-functions -funswitch-loops -fpredictive-commoning -fgcse-after-reload -ftree-vectorize -fipa-cp-clone" + ) +endmacro() diff --git a/cmake/gmxGetCompilerVersion.cmake b/cmake/gmxGetCompilerVersion.cmake new file mode 100644 index 0000000000..529577aebc --- /dev/null +++ b/cmake/gmxGetCompilerVersion.cmake @@ -0,0 +1,46 @@ +# This macro attempts to parse the version string of the C compiler in use. +# Currently supported are only compilers that accept the "-dumpversion" argument: +# gcc, Intel Compiler (on Linux and Mac OS), Open64, EkoPath.
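+# (For reference: gcc of this era prints a plain version string such as "4.4.7" for -dumpversion, which the VERSION_GREATER/VERSION_LESS checks in gmxGCC44O3BugWorkaround.cmake compare against directly.)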
+# +# C_COMPILER_VERSION - version string of the current C compiler (CMAKE_C_COMPILER) +# CXX_COMPILER_VERSION - version string of the current C++ compiler (CMAKE_CXX_COMPILER) +# +macro(get_compiler_version) + if(NOT C_COMPILER_VERSION) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion + RESULT_VARIABLE _cc_dumpversion_res + OUTPUT_VARIABLE _cc_dumpversion_out + ERROR_VARIABLE _cc_dumpversion_err + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if (${_cc_dumpversion_res} EQUAL 0) + SET(C_COMPILER_VERSION ${_cc_dumpversion_out} + CACHE STRING "C compiler version string" FORCE) + else () + SET(C_COMPILER_VERSION "" + CACHE STRING "C compiler version string not available" FORCE) + endif () + endif() + + if(NOT CXX_COMPILER_VERSION) + execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpversion + RESULT_VARIABLE _cxx_dumpversion_res + OUTPUT_VARIABLE _cxx_dumpversion_out + ERROR_VARIABLE _cxx_dumpversion_err + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if (${_cxx_dumpversion_res} EQUAL 0) + SET(CXX_COMPILER_VERSION ${_cxx_dumpversion_out} + CACHE STRING "C++ compiler version string" FORCE) + else () + SET(CXX_COMPILER_VERSION "" + CACHE STRING "C++ compiler version string not available" FORCE) + endif () + endif () + + if (NOT "${C_COMPILER_VERSION}" STREQUAL "${CXX_COMPILER_VERSION}") + message(WARNING "The version strings of the C and C++ compilers do not match!") + endif () + + mark_as_advanced(C_COMPILER_VERSION CXX_COMPILER_VERSION) +endmacro() diff --git a/cmake/gmxManageNvccConfig.cmake b/cmake/gmxManageNvccConfig.cmake new file mode 100644 index 0000000000..a80029f287 --- /dev/null +++ b/cmake/gmxManageNvccConfig.cmake @@ -0,0 +1,83 @@ +# Manage CUDA nvcc compilation configuration, try to be smart to ease the users' +# pain as much as possible: +# - use the CUDA_NVCC_HOST_COMPILER if defined by the user, otherwise +# - auto-detect compatible nvcc host compiler and set nvcc -ccbin (if not MPI wrapper) +# - set icc compatibility mode to gcc 4.4 (CUDA 4.0 is not compatible with gcc >v4.4.x) +# - (advanced) variables set: +# * CUDA_NVCC_HOST_COMPILER - the compiler nvcc is forced to use (via -ccbin) +# * CUDA_NVCC_HOST_COMPILER_OPTIONS - the full host-compiler related option list passed to nvcc if (NOT DEFINED CUDA_NVCC_FLAGS_SET) + set(CUDA_NVCC_FLAGS_SET TRUE CACHE INTERNAL "True if NVCC flags have been set" FORCE) + + # Set the host compiler for nvcc explicitly if the current compiler is + # supported, otherwise warn if the host compiler is not supported. + # Note that with MSVC nvcc sets the -compiler-bindir option behind the + # scenes; to avoid conflicts we shouldn't set -ccbin automatically. + if (NOT DEFINED CUDA_NVCC_HOST_COMPILER AND NOT MSVC) + if (NOT CMAKE_COMPILER_IS_GNUCC AND + NOT (CMAKE_C_COMPILER_ID MATCHES "Intel" AND UNIX AND NOT APPLE)) + message(WARNING " + Will not set the nvcc host compiler because the current C compiler (ID: ${CMAKE_C_COMPILER_ID}): + ${CMAKE_C_COMPILER} + is not compatible with nvcc. Compatible compilers are: gcc on Linux and Mac OS X, + Intel Compiler on 64-bit Linux and MSVC on Windows. nvcc will pick the platform + default; however, note that mixing compilers might lead to errors.
+ To set the nvcc host compiler, edit CUDA_NVCC_FLAGS or re-configure with the + CUDA_NVCC_HOST_COMPILER variable.") + else() + # the MPI wrappers might not work for compilation + if (GMX_MPI AND NOT GMX_THREAD_MPI) + message(WARNING " + Will not set the nvcc host compiler because the current C compiler is an MPI + compiler wrapper: ${CMAKE_C_COMPILER} + which is prone to not work with nvcc, but you might get lucky. + To set the nvcc host compiler, edit CUDA_NVCC_FLAGS or re-configure with the + CUDA_NVCC_HOST_COMPILER variable.") + else() + set(CUDA_NVCC_HOST_COMPILER "${CMAKE_C_COMPILER}") + set(CUDA_NVCC_HOST_COMPILER_AUTOSET TRUE CACHE INTERNAL + "True if CUDA_NVCC_HOST_COMPILER is automatically set" FORCE) + endif() + endif() + endif() + + if(DEFINED CUDA_NVCC_HOST_COMPILER) + message(STATUS "Setting the nvcc host compiler to: ${CUDA_NVCC_HOST_COMPILER}") + set(CUDA_NVCC_HOST_COMPILER ${CUDA_NVCC_HOST_COMPILER} + CACHE PATH "Host compiler for nvcc (do not edit!)" FORCE) + + set(CUDA_NVCC_HOST_COMPILER_OPTIONS "-ccbin=${CUDA_NVCC_HOST_COMPILER}") + # force icc in gcc 4.4 compatibility mode on *NIX to make nvcc 3.2/4.0 happy + if (UNIX AND CMAKE_C_COMPILER_ID MATCHES "Intel" AND + CUDA_NVCC_HOST_COMPILER_AUTOSET) + message(STATUS "Setting Intel Compiler compatibility mode to gcc 4.4 for nvcc host compilation") + set(CUDA_NVCC_HOST_COMPILER_OPTIONS "${CUDA_NVCC_HOST_COMPILER_OPTIONS};-Xcompiler;-gcc-version=440;") + endif() + set(CUDA_NVCC_HOST_COMPILER_OPTIONS "${CUDA_NVCC_HOST_COMPILER_OPTIONS}" + CACHE STRING "Host-side compiler and options for nvcc (do not edit!)." FORCE) + + mark_as_advanced(CUDA_NVCC_HOST_COMPILER CUDA_NVCC_HOST_COMPILER_OPTIONS) + endif() + + # on Linux we need to add -fPIC when building shared gmx libs + # Note: will add -fPIC for any compiler that supports it as it shouldn't hurt + if(BUILD_SHARED_LIBS) + GMX_TEST_CXXFLAG(CXXFLAG_FPIC "-fPIC" _FPIC_NVCC_FLAG) + if(_FPIC_NVCC_FLAG) + set(_FPIC_NVCC_FLAG "-Xcompiler;${_FPIC_NVCC_FLAG};") + endif() + endif() + + # set the CUDA architectures to compile for + # with CUDA <v4.2 compute capability 3.0 is not supported + if(CUDA_VERSION VERSION_LESS "4.2.0") + set(_CUDA_ARCH_STR "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_20,code=compute_20") + else() + set(_CUDA_ARCH_STR "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_30,code=compute_30") + endif() + + # finally set the damn flags + set(CUDA_NVCC_FLAGS + "${_FPIC_NVCC_FLAG}${_CUDA_ARCH_STR};-use_fast_math;${CUDA_NVCC_HOST_COMPILER_OPTIONS}" + CACHE STRING "Compiler flags for nvcc."
FORCE) +endif() diff --git a/cmake/gmxSetBuildInformation.cmake b/cmake/gmxSetBuildInformation.cmake index 639035335b..8b04ef2d44 100644 --- a/cmake/gmxSetBuildInformation.cmake +++ b/cmake/gmxSetBuildInformation.cmake @@ -50,35 +50,35 @@ macro(gmx_set_build_information) if(NOT CMAKE_CROSSCOMPILING) # Get CPU acceleration information - try_run(GMX_DETECTCPU_RUN_VENDOR GMX_DETECTCPU_COMPILED + try_run(GMX_CPUID_RUN_VENDOR GMX_CPUID_COMPILED ${CMAKE_BINARY_DIR} - ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_detectcpu.c - COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_DETECTCPU_STANDALONE" + ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_cpuid.c + COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE" RUN_OUTPUT_VARIABLE OUTPUT_CPU_VENDOR ARGS "-vendor") - try_run(GMX_DETECTCPU_RUN_BRAND GMX_DETECTCPU_COMPILED + try_run(GMX_CPUID_RUN_BRAND GMX_CPUID_COMPILED ${CMAKE_BINARY_DIR} - ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_detectcpu.c - COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_DETECTCPU_STANDALONE" + ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_cpuid.c + COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE" RUN_OUTPUT_VARIABLE OUTPUT_CPU_BRAND ARGS "-brand") - try_run(GMX_DETECTCPU_RUN_FAMILY GMX_DETECTCPU_COMPILED + try_run(GMX_CPUID_RUN_FAMILY GMX_CPUID_COMPILED ${CMAKE_BINARY_DIR} - ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_detectcpu.c - COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_DETECTCPU_STANDALONE" + ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_cpuid.c + COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE" RUN_OUTPUT_VARIABLE OUTPUT_CPU_FAMILY ARGS "-family") - try_run(GMX_DETECTCPU_RUN_MODEL GMX_DETECTCPU_COMPILED + try_run(GMX_CPUID_RUN_MODEL GMX_CPUID_COMPILED ${CMAKE_BINARY_DIR} - ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_detectcpu.c - COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_DETECTCPU_STANDALONE" + ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_cpuid.c + COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE" RUN_OUTPUT_VARIABLE OUTPUT_CPU_MODEL ARGS "-model") - try_run(GMX_DETECTCPU_RUN_STEPPING GMX_DETECTCPU_COMPILED + try_run(GMX_CPUID_RUN_STEPPING GMX_CPUID_COMPILED ${CMAKE_BINARY_DIR} - ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_detectcpu.c - COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_DETECTCPU_STANDALONE" + ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_cpuid.c + COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE" RUN_OUTPUT_VARIABLE OUTPUT_CPU_STEPPING ARGS "-stepping") - try_run(GMX_DETECTCPU_RUN_FEATURES GMX_DETECTCPU_COMPILED + try_run(GMX_CPUID_RUN_FEATURES GMX_CPUID_COMPILED ${CMAKE_BINARY_DIR} - ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_detectcpu.c - COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_DETECTCPU_STANDALONE" + ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_cpuid.c + COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE" RUN_OUTPUT_VARIABLE OUTPUT_CPU_FEATURES ARGS "-features") string(STRIP "@OUTPUT_CPU_VENDOR@" OUTPUT_CPU_VENDOR) @@ -88,32 +88,32 @@ macro(gmx_set_build_information) string(STRIP "@OUTPUT_CPU_STEPPING@" OUTPUT_CPU_STEPPING) string(STRIP "@OUTPUT_CPU_FEATURES@" OUTPUT_CPU_FEATURES) - if(GMX_DETECTCPU_RUN_VENDOR EQUAL 0) + if(GMX_CPUID_RUN_VENDOR EQUAL 0) set(BUILD_CPU_VENDOR 
"@OUTPUT_CPU_VENDOR@" CACHE INTERNAL "Build CPU vendor") else() set(BUILD_CPU_VENDOR "Unknown, detect failed" CACHE INTERNAL "Build CPU vendor") endif() - if(GMX_DETECTCPU_RUN_BRAND EQUAL 0) + if(GMX_CPUID_RUN_BRAND EQUAL 0) set(BUILD_CPU_BRAND "@OUTPUT_CPU_BRAND@" CACHE INTERNAL "Build CPU brand") else() set(BUILD_CPU_BRAND "Unknown, detect failed" CACHE INTERNAL "Build CPU brand") endif() - if(GMX_DETECTCPU_RUN_FAMILY EQUAL 0) + if(GMX_CPUID_RUN_FAMILY EQUAL 0) set(BUILD_CPU_FAMILY "@OUTPUT_CPU_FAMILY@" CACHE INTERNAL "Build CPU family") else() set(BUILD_CPU_FAMILY "0" CACHE INTERNAL "Build CPU family") endif() - if(GMX_DETECTCPU_RUN_MODEL EQUAL 0) + if(GMX_CPUID_RUN_MODEL EQUAL 0) set(BUILD_CPU_MODEL "@OUTPUT_CPU_MODEL@" CACHE INTERNAL "Build CPU model") else() set(BUILD_CPU_MODEL "0" CACHE INTERNAL "Build CPU model") endif() - if(GMX_DETECTCPU_RUN_STEPPING EQUAL 0) + if(GMX_CPUID_RUN_STEPPING EQUAL 0) set(BUILD_CPU_STEPPING "@OUTPUT_CPU_STEPPING@" CACHE INTERNAL "Build CPU stepping") else() set(BUILD_CPU_STEPPING "0" CACHE INTERNAL "Build CPU stepping") endif() - if(GMX_DETECTCPU_RUN_FEATURES EQUAL 0) + if(GMX_CPUID_RUN_FEATURES EQUAL 0) set(BUILD_CPU_FEATURES "@OUTPUT_CPU_FEATURES@" CACHE INTERNAL "Build CPU features") else() set(BUILD_CPU_FEATURES "" CACHE INTERNAL "Build CPU features") diff --git a/include/bondf.h b/include/bondf.h index 5ea6cd26a6..9853f1c5aa 100644 --- a/include/bondf.h +++ b/include/bondf.h @@ -63,6 +63,7 @@ void calc_bonds(FILE *fplog,const gmx_multisim_t *ms, const t_mdatoms *md, t_fcdata *fcd,int *ddgatindex, t_atomtypes *atype, gmx_genborn_t *born, + int force_flags, gmx_bool bPrintSepPot,gmx_large_int_t step); /* * The function calc_bonds() calculates all bonded force interactions. @@ -147,6 +148,13 @@ void make_dp_periodic(real *dp); t_ifunc polarize,anharm_polarize,water_pol,thole_pol,angres,angresz,dihres,unimplemented; +/* Initialize the setup for the bonded force buffer reduction + * over threads. This should be called each time the bonded setup + * changes; i.e. at start-up without domain decomposition and at DD. 
+ */ +void init_bonded_thread_force_reduction(t_forcerec *fr, + const t_idef *idef); + #ifdef __cplusplus } #endif diff --git a/include/constr.h b/include/constr.h index a45b58d25f..e516ab8ad0 100644 --- a/include/constr.h +++ b/include/constr.h @@ -37,6 +37,7 @@ #ifndef _constr_h #define _constr_h #include "typedefs.h" +#include "types/commrec.h" #ifdef __cplusplus extern "C" { @@ -71,7 +72,6 @@ gmx_bool bshakef(FILE *log, /* Log file */ int sblock[], /* The shake blocks */ t_idef *idef, /* The interaction def */ t_inputrec *ir, /* Input record */ - matrix box, /* The box */ rvec x_s[], /* Coords before update */ rvec prime[], /* Output coords */ t_nrnb *nrnb, /* Performance measure */ @@ -99,23 +99,26 @@ gmx_settledata_t settle_init(real mO,real mH,real invmO,real invmH, /* Initializes and returns a structure with SETTLE parameters */ void csettle(gmx_settledata_t settled, - int nsettle, /* Number of settles */ - t_iatom iatoms[], /* The settle iatom list */ - real b4[], /* Old coordinates */ - real after[], /* New coords, to be settled */ - real invdt, /* 1/delta_t */ - real *v, /* Also constrain v if v!=NULL */ - gmx_bool bCalcVir, /* Calculate r x m delta_r */ - tensor rmdr, /* sum r x m delta_r */ - int *xerror, - t_vetavars *vetavar /* variables for pressure control */ + int nsettle, /* Number of settles */ + t_iatom iatoms[], /* The settle iatom list */ + const t_pbc *pbc, /* PBC data pointer, can be NULL */ + real b4[], /* Old coordinates */ + real after[], /* New coords, to be settled */ + real invdt, /* 1/delta_t */ + real *v, /* Also constrain v if v!=NULL */ + int calcvir_atom_end, /* Calculate r x m delta_r up to this atom */ + tensor rmdr, /* sum r x m delta_r */ + int *xerror, + t_vetavars *vetavar /* variables for pressure control */ ); void settle_proj(FILE *fp, - gmx_settledata_t settled,int econq, - int nsettle, t_iatom iatoms[],rvec x[], - rvec *der,rvec *derp, - gmx_bool bCalcVir,tensor rmdder, t_vetavars *vetavar); + gmx_settledata_t settled,int econq, + int nsettle, t_iatom iatoms[], + const t_pbc *pbc, /* PBC data pointer, can be NULL */ + rvec x[], + rvec *der,rvec *derp, + int CalcVirAtomEnd,tensor rmdder, t_vetavars *vetavar); /* Analytical algorithm to subtract the components of derivatives * of coordinates working on settle type constraint. */ @@ -130,17 +133,18 @@ void crattle(atom_id iatom[],int ncon,int *nnit,int maxnit, real invmass[],real tt[],real lagr[],int *nerror,real invdt,t_vetavars *vetavar); gmx_bool constrain(FILE *log,gmx_bool bLog,gmx_bool bEner, - gmx_constr_t constr, - t_idef *idef, - t_inputrec *ir, - gmx_ekindata_t *ekind, - t_commrec *cr, - gmx_large_int_t step,int delta_step, - t_mdatoms *md, - rvec *x,rvec *xprime,rvec *min_proj,matrix box, - real lambda,real *dvdlambda, - rvec *v,tensor *vir, - t_nrnb *nrnb,int econq, gmx_bool bPscal, real veta, real vetanew); + gmx_constr_t constr, + t_idef *idef, + t_inputrec *ir, + gmx_ekindata_t *ekind, + t_commrec *cr, + gmx_large_int_t step,int delta_step, + t_mdatoms *md, + rvec *x,rvec *xprime,rvec *min_proj, + gmx_bool bMolPBC,matrix box, + real lambda,real *dvdlambda, + rvec *v,tensor *vir, + t_nrnb *nrnb,int econq, gmx_bool bPscal, real veta, real vetanew); /* * When econq=econqCoord constrains coordinates xprime using the * directions in x, min_proj is not used. @@ -152,6 +156,8 @@ gmx_bool constrain(FILE *log,gmx_bool bLog,gmx_bool bEner, * When econq=econqDeriv_FlexCon, the same is done as with econqDeriv, * but only the components of the flexible constraints are stored.
* + * When bMolPBC=TRUE, assume that molecules might be broken: correct PBC. + * * delta_step is used for determining the constraint reference lengths * when lenA != lenB or with the pull code with a pulling rate. * step + delta_step is the step at which the final configuration @@ -194,8 +200,11 @@ t_blocka make_at2con(int start,int natoms, gmx_bool bDynamics,int *nflexiblecons); /* Returns a block struct to go from atoms to constraints */ -t_blocka *atom2constraints_moltype(gmx_constr_t constr); -/* Returns the an arry of atom to constraints lists for the moltypes */ +const t_blocka *atom2constraints_moltype(gmx_constr_t constr); +/* Returns an array of atom to constraints lists for the moltypes */ + +const int **atom2settle_moltype(gmx_constr_t constr); +/* Returns an array of atom to settle for the moltypes */ #define constr_iatomptr(nconstr,iatom_constr,iatom_constrnc,con) ((con) < (nconstr) ? (iatom_constr)+(con)*3 : (iatom_constrnc)+(con-nconstr)*3) /* Macro for getting the constraint iatoms for a constraint number con @@ -203,9 +212,12 @@ t_blocka *atom2constraints_moltype(gmx_constr_t constr); * are concatenated. */ -gmx_bool inter_charge_group_constraints(gmx_mtop_t *mtop); +gmx_bool inter_charge_group_constraints(const gmx_mtop_t *mtop); /* Returns if there are inter charge group constraints */ +gmx_bool inter_charge_group_settles(const gmx_mtop_t *mtop); +/* Returns if there are inter charge group settles */ + real *constr_rmsd_data(gmx_constr_t constr); /* Return the data for determining constraint RMS relative deviations. * Returns NULL when LINCS is not used. @@ -238,12 +250,14 @@ real constr_r_max(FILE *fplog,gmx_mtop_t *mtop,t_inputrec *ir); * required for LINCS. */ -gmx_bool constrain_lincs(FILE *log,gmx_bool bLog,gmx_bool bEner, +gmx_bool +constrain_lincs(FILE *log,gmx_bool bLog,gmx_bool bEner, t_inputrec *ir, gmx_large_int_t step, gmx_lincsdata_t lincsd,t_mdatoms *md, - t_commrec *cr, - rvec *x,rvec *xprime,rvec *min_proj,matrix box, + t_commrec *cr, + rvec *x,rvec *xprime,rvec *min_proj, + matrix box,t_pbc *pbc, real lambda,real *dvdlambda, real invdt,rvec *v, gmx_bool bCalcVir,tensor rmdr, diff --git a/include/coulomb.h b/include/coulomb.h index 6fb810d283..0bd28f3f81 100644 --- a/include/coulomb.h +++ b/include/coulomb.h @@ -38,6 +38,7 @@ #include #include "typedefs.h" +#include "types/commrec.h" #ifdef __cplusplus extern "C" { @@ -69,16 +70,25 @@ do_ewald(FILE *log, gmx_bool bVerbose, real ewald_LRcorrection(FILE *fp, - int start,int end, - t_commrec *cr,t_forcerec *fr, - real *chargeA,real *chargeB, - t_blocka *excl,rvec x[], - matrix box,rvec mu_tot[], - int ewald_geometry,real epsilon_surface, - real lambda,real *dvdlambda, - real *vdip,real *vcharge); -/* Calculate the Long range correction to ewald, due to - * 1-4 interactions, surface dipole term and charge terms + int start,int end, + t_commrec *cr,int thread,t_forcerec *fr, + real *chargeA,real *chargeB, + gmx_bool calc_excl_corr, + t_blocka *excl,rvec x[], + matrix box,rvec mu_tot[], + int ewald_geometry,real epsilon_surface, + rvec *f,tensor vir, + real lambda,real *dvdlambda); /* Calculate the Long range correction to the Ewald sum, + * due to excluded pairs and/or surface dipole terms. + */ + +real +ewald_charge_correction(t_commrec *cr,t_forcerec *fr,real lambda,matrix box, + real *dvdlambda,tensor vir); +/* Calculate the Long range correction to the Ewald sum, + * due to a net system charge. + * Should only be called on one thread.
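+ * (The standard neutralizing-background term: with Ewald splitting + * parameter beta and box volume V, the energy is + * E_q = -pi * q_tot^2 / (2 * beta^2 * V), times the Coulomb prefactor.)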
*/ /* Routines to set global constants for speeding up the calculation diff --git a/include/domdec.h b/include/domdec.h index 673b54f02e..6e785fe840 100644 --- a/include/domdec.h +++ b/include/domdec.h @@ -20,6 +20,7 @@ #define _domdec_h #include "typedefs.h" +#include "types/commrec.h" #include "vsite.h" #include "genborn.h" @@ -93,6 +94,9 @@ void dd_init_bondeds(FILE *fplog, t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb); /* Initialize data structures for bonded interactions */ +gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd,int ePBC); +/* Returns if we need to do pbc for calculating bonded interactions */ + void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale, t_inputrec *ir,t_forcerec *fr, gmx_ddbox_t *ddbox); @@ -100,6 +104,13 @@ void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale, * should be called after calling dd_init_bondeds. */ +gmx_bool change_dd_cutoff(t_commrec *cr,t_state *state,t_inputrec *ir, + real cutoff_req ); +/* Change the DD non-bonded communication cut-off. + * This could fail when trying to increase the cut-off, + * then FALSE will be returned and the cut-off is not modified. + */ + void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd); void dd_collect_vec(gmx_domdec_t *dd, @@ -119,6 +130,11 @@ void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb); void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb); /* Stop the force flop count */ +float dd_pme_f_ratio(gmx_domdec_t *dd); +/* Return the PME/PP force load ratio, or -1 if nothing was measured. + * Should only be called on the DD master node. + */ + void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[]); /* Communicate the coordinates to the neighboring cells and do pbc. */ @@ -186,15 +202,16 @@ void dd_clear_local_vsite_indices(gmx_domdec_t *dd); int dd_make_local_vsites(gmx_domdec_t *dd,int at_start,t_ilist *lil); int dd_make_local_constraints(gmx_domdec_t *dd,int at_start, - gmx_mtop_t *mtop, - gmx_constr_t constr,int nrec, - t_ilist *il_local); + const gmx_mtop_t *mtop, + const int *cginfo, + gmx_constr_t constr,int nrec, + t_ilist *il_local); void init_domdec_constraints(gmx_domdec_t *dd, - int natoms,gmx_mtop_t *mtop, - gmx_constr_t constr); + gmx_mtop_t *mtop, + gmx_constr_t constr); -void init_domdec_vsites(gmx_domdec_t *dd,int natoms); +void init_domdec_vsites(gmx_domdec_t *dd,int n_intercg_vsite); /* In domdec_top.c */ @@ -210,11 +227,13 @@ void dd_make_reverse_top(FILE *fplog, void dd_make_local_cgs(gmx_domdec_t *dd,t_block *lcgs); void dd_make_local_top(FILE *fplog, - gmx_domdec_t *dd,gmx_domdec_zones_t *zones, - int npbcdim,matrix box, - rvec cellsize_min,ivec npulse, - t_forcerec *fr,gmx_vsite_t *vsite, - gmx_mtop_t *top,gmx_localtop_t *ltop); + gmx_domdec_t *dd,gmx_domdec_zones_t *zones, + int npbcdim,matrix box, + rvec cellsize_min,ivec npulse, + t_forcerec *fr, + rvec *cgcm_or_x, + gmx_vsite_t *vsite, + gmx_mtop_t *top,gmx_localtop_t *ltop); void dd_sort_local_top(gmx_domdec_t *dd,t_mdatoms *mdatoms, gmx_localtop_t *ltop); diff --git a/include/domdec_network.h b/include/domdec_network.h index 400c50bf9a..5a709d599c 100644 --- a/include/domdec_network.h +++ b/include/domdec_network.h @@ -20,6 +20,7 @@ #define _domdec_network_h #include "typedefs.h" +#include "types/commrec.h" #ifdef __cplusplus extern "C" { diff --git a/include/force.h b/include/force.h index 05d585bc29..758b748dbf 100644 --- a/include/force.h +++ b/include/force.h @@ -38,12 +38,14 @@ #include "typedefs.h" +#include "types/force_flags.h" #include "pbc.h" #include "network.h" #include "tgroup.h" #include 
"vsite.h" #include "genborn.h" + #ifdef __cplusplus extern "C" { #endif @@ -134,6 +136,19 @@ gmx_bool can_use_allvsall(const t_inputrec *ir, const gmx_mtop_t *mtop, * and fp (if !=NULL) on the master node. */ +void init_interaction_const_tables(FILE *fp, + interaction_const_t *ic, + int verlet_kernel_type); +/* Initializes the tables in the interaction constant data structure. + */ + +void init_interaction_const(FILE *fp, + interaction_const_t **interaction_const, + const t_forcerec *fr); +/* Initializes the interaction constant data structure. Currently it + * uses forcerec as input. + */ + void init_forcerec(FILE *fplog, const output_env_t oenv, t_forcerec *fr, @@ -147,7 +162,8 @@ void init_forcerec(FILE *fplog, const char *tabafn, const char *tabpfn, const char *tabbfn, - gmx_bool bNoSolvOpt, + const char *nbpu_opt, + gmx_bool bNoSolvOpt, real print_force); /* The Force rec struct must be created with mk_forcerec * The gmx_booleans have the following meaning: @@ -156,6 +172,10 @@ void init_forcerec(FILE *fplog, * print_force >= 0: print forces for atoms with force >= print_force */ +void forcerec_set_excl_load(t_forcerec *fr, + const gmx_localtop_t *top,const t_commrec *cr); + /* Set the exclusion load for the local exclusions and possibly threads */ + void init_enerdata(int ngener,int n_lambda,gmx_enerdata_t *enerd); /* Intializes the energy storage struct */ @@ -181,30 +201,7 @@ void update_forcerec(FILE *fplog,t_forcerec *fr,matrix box); void set_avcsixtwelve(FILE *fplog,t_forcerec *fr, const gmx_mtop_t *mtop); -/* The state has changed */ -#define GMX_FORCE_STATECHANGED (1<<0) -/* The box might have changed */ -#define GMX_FORCE_DYNAMICBOX (1<<1) -/* Do neighbor searching */ -#define GMX_FORCE_NS (1<<2) -/* Calculate bonded energies/forces */ -#define GMX_FORCE_DOLR (1<<3) -/* Calculate long-range energies/forces */ -#define GMX_FORCE_BONDED (1<<4) -/* Store long-range forces in a separate array */ -#define GMX_FORCE_SEPLRF (1<<5) -/* Calculate non-bonded energies/forces */ -#define GMX_FORCE_NONBONDED (1<<6) -/* Calculate forces (not only energies) */ -#define GMX_FORCE_FORCES (1<<7) -/* Calculate the virial */ -#define GMX_FORCE_VIRIAL (1<<8) -/* Calculate dHdl */ -#define GMX_FORCE_DHDL (1<<9) -/* Normally one want all energy terms and forces */ -#define GMX_FORCE_ALLFORCES (GMX_FORCE_BONDED | GMX_FORCE_NONBONDED | GMX_FORCE_FORCES) - -void do_force(FILE *log,t_commrec *cr, +extern void do_force(FILE *log,t_commrec *cr, t_inputrec *inputrec, gmx_large_int_t step,t_nrnb *nrnb,gmx_wallcycle_t wcycle, gmx_localtop_t *top, @@ -216,10 +213,12 @@ void do_force(FILE *log,t_commrec *cr, t_mdatoms *mdatoms, gmx_enerdata_t *enerd,t_fcdata *fcd, real *lambda,t_graph *graph, - t_forcerec *fr,gmx_vsite_t *vsite,rvec mu_tot, + t_forcerec *fr, + gmx_vsite_t *vsite,rvec mu_tot, double t,FILE *field,gmx_edsam_t ed, gmx_bool bBornRadii, int flags); + /* Communicate coordinates (if parallel). * Do neighbor searching (if necessary). * Calculate forces. 
@@ -248,7 +247,7 @@ void ns(FILE *fplog, rvec *f); /* Call the neighborsearcher */ -void do_force_lowlevel(FILE *fplog, +extern void do_force_lowlevel(FILE *fplog, gmx_large_int_t step, t_forcerec *fr, t_inputrec *ir, diff --git a/include/futil.h b/include/futil.h index 8ca4fd7fd9..8d21435341 100644 --- a/include/futil.h +++ b/include/futil.h @@ -38,6 +38,7 @@ #include #include "typedefs.h" +#include "types/commrec.h" #ifdef __cplusplus extern "C" { diff --git a/include/genborn.h b/include/genborn.h index a76fab7653..f6e9961dda 100644 --- a/include/genborn.h +++ b/include/genborn.h @@ -37,6 +37,7 @@ #define _genborn_h #include "typedefs.h" +#include "types/commrec.h" #include "grompp.h" #ifdef __cplusplus diff --git a/include/gmx_avx_double.h b/include/gmx_avx_double.h new file mode 100644 index 0000000000..190b4ccc0d --- /dev/null +++ b/include/gmx_avx_double.h @@ -0,0 +1,61 @@ +/* + * This source code is part of + * + * G R O M A C S + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS Development Team + * + * Gromacs is a library for molecular simulation and trajectory analysis, + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for + * a full list of developers and information, check out http://www.gromacs.org + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) any + * later version. + * As a special exception, you may use this file as part of a free software + * library without restriction. Specifically, if other files instantiate + * templates or use macros or inline functions from this file, or you compile + * this file and link it with other files to produce an executable, this + * file does not by itself cause the resulting executable to be covered by + * the GNU Lesser General Public License. + * + * In plain-speak: do not worry about classes/macros/templates either - only + * changes to the library have to be LGPL, not an application linking with it. + * + * To help fund GROMACS development, we humbly ask that you cite + * the papers people have written on it - you can find them on the website! + */ +#ifndef _gmx_avx_double_h_ +#define _gmx_avx_double_h_ + +/* We require AVX now! */ + +#include /* AVX */ + +static inline __m256d +gmx_mm256_invsqrt_pd(__m256d x) +{ + /* There is no double precision AVX rsqrt instruction. + * But using a single precision rsqrt still gives the full precision. 
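+ * Each Newton-Raphson step below, lu <- 0.5*lu*(3 - x*lu*lu), roughly + * doubles the number of correct bits: ~11 bits from _mm_rsqrt_ps, ~22 + * after the first iteration, ~44 after the second.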
+ */ + const __m256d half = _mm256_set_pd(0.5,0.5,0.5,0.5); + const __m256d three = _mm256_set_pd(3.0,3.0,3.0,3.0); + + __m256d lu = _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(x))); + + lu = _mm256_mul_pd(half,_mm256_mul_pd(_mm256_sub_pd(three,_mm256_mul_pd(_mm256_mul_pd(lu,lu),x)),lu)); + return _mm256_mul_pd(half,_mm256_mul_pd(_mm256_sub_pd(three,_mm256_mul_pd(_mm256_mul_pd(lu,lu),x)),lu)); +} + +static inline __m256d +gmx_mm256_calc_rsq_pd(__m256d dx, __m256d dy, __m256d dz) +{ + return _mm256_add_pd( _mm256_add_pd( _mm256_mul_pd(dx,dx), _mm256_mul_pd(dy,dy) ), _mm256_mul_pd(dz,dz) ); +} + +/* Normal sum of four xmm registers */ +#define gmx_mm256_sum4_pd(t0,t1,t2,t3) _mm256_add_pd(_mm256_add_pd(t0,t1),_mm256_add_pd(t2,t3)) + +#endif /* gmx_avx_double_h_ */ diff --git a/include/gmx_avx_single.h b/include/gmx_avx_single.h new file mode 100644 index 0000000000..f0697e1021 --- /dev/null +++ b/include/gmx_avx_single.h @@ -0,0 +1,57 @@ +/* + * This source code is part of + * + * G R O M A C S + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS Development Team + * + * Gromacs is a library for molecular simulation and trajectory analysis, + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for + * a full list of developers and information, check out http://www.gromacs.org + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) any + * later version. + * As a special exception, you may use this file as part of a free software + * library without restriction. Specifically, if other files instantiate + * templates or use macros or inline functions from this file, or you compile + * this file and link it with other files to produce an executable, this + * file does not by itself cause the resulting executable to be covered by + * the GNU Lesser General Public License. + * + * In plain-speak: do not worry about classes/macros/templates either - only + * changes to the library have to be LGPL, not an application linking with it. + * + * To help fund GROMACS development, we humbly ask that you cite + * the papers people have written on it - you can find them on the website! + */ +#ifndef _gmx_avx_single_h_ +#define _gmx_avx_single_h_ + +/* We require AVX now! */ + +#include /* AVX */ + +static inline __m256 +gmx_mm256_invsqrt_ps(__m256 x) +{ + const __m256 half = _mm256_set_ps(0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5); + const __m256 three = _mm256_set_ps(3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0); + + __m256 lu = _mm256_rsqrt_ps(x); + + return _mm256_mul_ps(half,_mm256_mul_ps(_mm256_sub_ps(three,_mm256_mul_ps(_mm256_mul_ps(lu,lu),x)),lu)); +} + +static inline __m256 +gmx_mm256_calc_rsq_ps(__m256 dx, __m256 dy, __m256 dz) +{ + return _mm256_add_ps( _mm256_add_ps( _mm256_mul_ps(dx,dx), _mm256_mul_ps(dy,dy) ), _mm256_mul_ps(dz,dz) ); +} + +/* Normal sum of four xmm registers */ +#define gmx_mm256_sum4_ps(t0,t1,t2,t3) _mm256_add_ps(_mm256_add_ps(t0,t1),_mm256_add_ps(t2,t3)) + +#endif /* gmx_avx_single_h_ */ diff --git a/include/gmx_cpuid.h b/include/gmx_cpuid.h new file mode 100644 index 0000000000..24578fafb7 --- /dev/null +++ b/include/gmx_cpuid.h @@ -0,0 +1,257 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This file is part of GROMACS. 
+ * Copyright (c) 2012- + * + * Written by the Gromacs development team under coordination of + * David van der Spoel, Berk Hess, and Erik Lindahl. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org + * + * And Hey: + * Gnomes, ROck Monsters And Chili Sauce + */ +#ifndef GMX_CPUID_H_ +#define GMX_CPUID_H_ + +#ifdef __cplusplus +extern "C" { +#endif +#if 0 +} /* fixes auto-indentation problems */ +#endif + + +/* Currently identifiable CPU Vendors */ +enum gmx_cpuid_vendor +{ + GMX_CPUID_VENDOR_CANNOTDETECT, /* Should only be used if something fails */ + GMX_CPUID_VENDOR_UNKNOWN, + GMX_CPUID_VENDOR_INTEL, + GMX_CPUID_VENDOR_AMD, + GMX_CPUID_NVENDORS +}; + + +/* CPU feature/property list, to be used as indices into the feature array of the + * gmxcpuid_t data structure. + * + * To facilitate looking things up, we keep this list alphabetical. + * The list is NOT exhaustive - we have basically added stuff that might be + * useful in an application like Gromacs. + * + * AMD and Intel tend to share most architectural elements, and even if the + * flags might have to be detected in different ways (different cpuid registers), + * once the flag is present the functions should be identical. Unfortunately the + * trend right now (2012) seems to be that they are diverging. This means that + * we need to use specific flags to the compiler to maximize performance, and + * then the binaries might not be portable between Intel and AMD as they were + * before when we only needed to check for SSE and/or SSE2 support in Gromacs. + */ +enum gmx_cpuid_feature +{ + GMX_CPUID_FEATURE_CANNOTDETECT, /* Flag set if we could not detect on this CPU */ + GMX_CPUID_FEATURE_X86_AES, /* x86 advanced encryption standard accel. 
*/ + GMX_CPUID_FEATURE_X86_APIC, /* APIC support */ + GMX_CPUID_FEATURE_X86_AVX, /* Advanced vector extensions */ + GMX_CPUID_FEATURE_X86_AVX2, /* AVX2 including gather support (not used yet) */ + GMX_CPUID_FEATURE_X86_CLFSH, /* Supports CLFLUSH instruction */ + GMX_CPUID_FEATURE_X86_CMOV, /* Conditional move insn support */ + GMX_CPUID_FEATURE_X86_CX8, /* Supports CMPXCHG8B (8-byte compare-exchange) */ + GMX_CPUID_FEATURE_X86_CX16, /* Supports CMPXCHG16B (16-byte compare-exchg) */ + GMX_CPUID_FEATURE_X86_F16C, /* Supports 16-bit FP conversion instructions */ + GMX_CPUID_FEATURE_X86_FMA, /* Fused-multiply add support (mainly for AVX) */ + GMX_CPUID_FEATURE_X86_FMA4, /* 4-operand FMA, only on AMD for now */ + GMX_CPUID_FEATURE_X86_HTT, /* Hyper-Threading supported */ + GMX_CPUID_FEATURE_X86_LAHF_LM, /* LAHF/SAHF support in 64 bits */ + GMX_CPUID_FEATURE_X86_MISALIGNSSE, /* Support for misaligned SSE data instructions */ + GMX_CPUID_FEATURE_X86_MMX, /* MMX registers and instructions */ + GMX_CPUID_FEATURE_X86_MSR, /* Supports Intel model-specific-registers */ + GMX_CPUID_FEATURE_X86_NONSTOP_TSC, /* Invariant TSC (constant rate in ACPI states) */ + GMX_CPUID_FEATURE_X86_PCID, /* Process context identifier support */ + GMX_CPUID_FEATURE_X86_PCLMULDQ, /* Carry-less 64-bit multiplication supported */ + GMX_CPUID_FEATURE_X86_PDCM, /* Perfmon and Debug Capability */ + GMX_CPUID_FEATURE_X86_PDPE1GB, /* Support for 1GB pages */ + GMX_CPUID_FEATURE_X86_POPCNT, /* Supports the POPCNT (population count) insn */ + GMX_CPUID_FEATURE_X86_PSE, /* Supports 4MB-pages (page size extension) */ + GMX_CPUID_FEATURE_X86_RDRND, /* RDRAND high-quality hardware random numbers */ + GMX_CPUID_FEATURE_X86_RDTSCP, /* Serializing rdtscp instruction available */ + GMX_CPUID_FEATURE_X86_SSE2, /* SSE 2 */ + GMX_CPUID_FEATURE_X86_SSE3, /* SSE 3 */ + GMX_CPUID_FEATURE_X86_SSE4A, /* SSE 4A */ + GMX_CPUID_FEATURE_X86_SSE4_1, /* SSE 4.1 */ + GMX_CPUID_FEATURE_X86_SSE4_2, /* SSE 4.2 */ + GMX_CPUID_FEATURE_X86_SSSE3, /* Supplemental SSE3 */ + GMX_CPUID_FEATURE_X86_TDT, /* TSC deadline timer */ + GMX_CPUID_FEATURE_X86_X2APIC, /* Extended xAPIC Support */ + GMX_CPUID_FEATURE_X86_XOP, /* AMD extended instructions, only AMD for now */ + GMX_CPUID_NFEATURES +}; + + +/* Currently supported acceleration instruction sets, intrinsics or other similar combinations + * in Gromacs. There is not always a 1-to-1 correspondence with feature flags; on some AMD + * hardware we prefer to use 128bit AVX instructions (although 256-bit ones could be executed), + * and we still haven't written the AVX2 kernels. + */ +enum gmx_cpuid_acceleration +{ + GMX_CPUID_ACCELERATION_CANNOTDETECT, /* Should only be used if something fails */ + GMX_CPUID_ACCELERATION_NONE, + GMX_CPUID_ACCELERATION_X86_SSE2, + GMX_CPUID_ACCELERATION_X86_SSE4_1, + GMX_CPUID_ACCELERATION_X86_AVX_128_FMA, + GMX_CPUID_ACCELERATION_X86_AVX_256, + GMX_CPUID_NACCELERATIONS +}; + +/* Text strings corresponding to CPU vendors */ +extern const char * +gmx_cpuid_vendor_string[GMX_CPUID_NVENDORS]; + +/* Text strings for CPU feature indices */ +extern const char * +gmx_cpuid_feature_string[GMX_CPUID_NFEATURES]; + +/* Text strings for Gromacs acceleration/instruction sets */ +extern const char * +gmx_cpuid_acceleration_string[GMX_CPUID_NACCELERATIONS]; + + +/* Abstract data type with CPU detection information. Set by gmx_cpuid_init(). */ +typedef struct gmx_cpuid * +gmx_cpuid_t; + + +/* Fill the data structure by using CPU detection instructions. 
+ * Return 0 on success, 1 if something bad happened.
+ */
+int
+gmx_cpuid_init (gmx_cpuid_t * cpuid);
+
+
+/* Return the vendor id as enumerated type. Use gmx_cpuid_vendor_string[]
+ * to get the corresponding text string.
+ */
+enum gmx_cpuid_vendor
+gmx_cpuid_vendor (gmx_cpuid_t cpuid);
+
+
+/* Return a constant pointer to the processor brand string. */
+const char *
+gmx_cpuid_brand (gmx_cpuid_t cpuid);
+
+
+/* Return processor family version. For a chip of version 1.2.3, this is 1 */
+int
+gmx_cpuid_family (gmx_cpuid_t cpuid);
+
+/* Return processor model version. For a chip of version 1.2.3, this is 2. */
+int
+gmx_cpuid_model (gmx_cpuid_t cpuid);
+
+/* Return processor stepping version. For a chip of version 1.2.3, this is 3. */
+int
+gmx_cpuid_stepping (gmx_cpuid_t cpuid);
+
+
+/* Check whether a particular CPUID feature is set.
+ * Returns 0 if flag "feature" is not set, 1 if the flag is set. We cannot use
+ * gmx_bool here since this file must be possible to compile without simple.h.
+ */
+int
+gmx_cpuid_feature (gmx_cpuid_t cpuid,
+ enum gmx_cpuid_feature feature);
+
+
+/* Enumerated values for x86 SMT enabled-status. Note that this does not refer
+ * to Hyper-Threading support (that is the flag GMX_CPUID_FEATURE_X86_HTT), but
+ * whether Hyper-Threading is _enabled_ and _used_ in the BIOS right now.
+ */
+enum gmx_cpuid_x86_smt
+{
+ GMX_CPUID_X86_SMT_CANNOTDETECT,
+ GMX_CPUID_X86_SMT_DISABLED,
+ GMX_CPUID_X86_SMT_ENABLED
+};
+
+/* Returns the status of x86 SMT support. IMPORTANT: There are non-zero
+ * return values for this routine that still do not indicate supported and
+ * enabled smt/Hyper-Threading. You need to carefully check the return value
+ * against the enumerated type values to see what you are getting.
+ *
+ * Long-term, this functionality will move to a new hardware topology detection
+ * layer, but that will require a lot of new code and a working interface to the
+ * hwloc library. Surprisingly, there is no simple way to find out that
+ * Hyper-Threading is actually turned on without fully enumerating and checking
+ * all the cores, which we presently can only do on Linux. This means a couple
+ * of things:
+ *
+ * 1) If you want to know whether your CPU _supports_ Hyper-Threading in the
+ *    first place, check the GMX_CPUID_FEATURE_X86_HTT flag instead!
+ * 2) There are several scenarios where this routine will say that it cannot
+ *    detect whether SMT is enabled and used right now.
+ * 3) If you need support on non-Linux x86, you have to write it :-)
+ * 4) Don't invest too much effort, since this will be replaced with
+ *    full hardware topology detection in the future.
+ * 5) Don't worry if the detection does not work. It is not a catastrophe;
+ *    we just get slightly better performance on x86 if we use Hyper-Threading
+ *    cores in direct space, but not reciprocal space.
+ *
+ * Since this routine presently only supports Hyper-Threading we say X86_SMT
+ * in order not to give the impression we can detect any SMT. We haven't
+ * even tested the performance on other SMT implementations, so it is not
+ * obvious we shouldn't use SMT there.
+ */
+enum gmx_cpuid_x86_smt
+gmx_cpuid_x86_smt(gmx_cpuid_t cpuid);
+
+
+
+/* Formats a text string (up to n characters) from the data structure.
+ * The output will have max 80 chars between newline characters.
+ */
+int
+gmx_cpuid_formatstring (gmx_cpuid_t cpuid,
+ char * s,
+ int n);
+
+
+/* Suggests a suitable gromacs acceleration based on the support in the
+ * hardware.
+ */ +enum gmx_cpuid_acceleration +gmx_cpuid_acceleration_suggest (gmx_cpuid_t cpuid); + + +/* Check if this binary was compiled with the same acceleration as we + * would suggest for the current hardware. Always print stats to the log file + * if it is non-NULL, and print a warning in stdout if we don't have a match. + */ +int +gmx_cpuid_acceleration_check (gmx_cpuid_t cpuid, + FILE * log); + + +/* Release resources used by data structure. Note that the pointer to the + * CPU brand string will no longer be valid once this routine has been called. + */ +void +gmx_cpuid_done (gmx_cpuid_t cpuid); + + + + +#ifdef __cplusplus +} +#endif + + +#endif /* GMX_CPUID_H_ */ diff --git a/include/gmx_detect_hardware.h b/include/gmx_detect_hardware.h new file mode 100644 index 0000000000..a3acf0b7b4 --- /dev/null +++ b/include/gmx_detect_hardware.h @@ -0,0 +1,50 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This file is part of GROMACS. + * Copyright (c) 2012- + * + * Written by the Gromacs development team under coordination of + * David van der Spoel, Berk Hess, and Erik Lindahl. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org + * + * And Hey: + * GROup of MAchos and Cynical Suckers + */ + +#ifndef GMX_HARDWARE_DETECT_H +#define GMX_HARDWARE_DETECT_H + +#include "types/hw_info.h" + +#ifdef __cplusplus +extern "C" { +#endif +#if 0 +} /* fixes auto-indentation problems */ +#endif + +void gmx_detect_hardware(FILE *fplog, gmx_hw_info_t *hwinfo, + const t_commrec *cr, + gmx_bool bForceUseGPU, gmx_bool bTryUseGPU, + const char *gpu_id); + +void gmx_hardware_info_free(gmx_hw_info_t *hwinfo); + +void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo, + const t_commrec *cr, int ntmpi_requsted, + gmx_bool bUseGPU); + +#ifdef __cplusplus +} +#endif + + +#endif /* GMX_HARDWARE_DETECT_H */ diff --git a/include/gmx_detectcpu.h b/include/gmx_detectcpu.h deleted file mode 100644 index fc001c2335..0000000000 --- a/include/gmx_detectcpu.h +++ /dev/null @@ -1,162 +0,0 @@ -/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- - * - * - * This file is part of GROMACS. - * Copyright (c) 2012- - * - * Written by the Gromacs development team under coordination of - * David van der Spoel, Berk Hess, and Erik Lindahl. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * To help us fund GROMACS development, we humbly ask that you cite - * the research papers on the package. 
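Putting the pieces of gmx_cpuid.h together, the intended call sequence is roughly the following sketch (the report_cpu helper is hypothetical; the functions, enums and string tables are the ones declared above):

#include <stdio.h>
#include "gmx_cpuid.h"

static int report_cpu(FILE *log)
{
    gmx_cpuid_t cpuid;
    char        buf[1024];

    if (gmx_cpuid_init(&cpuid) != 0)
    {
        return 1;  /* detection failed */
    }
    gmx_cpuid_formatstring(cpuid, buf, (int)sizeof(buf));
    fprintf(log, "%s", buf);

    if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_AVX))
    {
        fprintf(log, "Suggested acceleration: %s\n",
                gmx_cpuid_acceleration_string[gmx_cpuid_acceleration_suggest(cpuid)]);
    }
    /* the brand string becomes invalid after this call */
    gmx_cpuid_done(cpuid);

    return 0;
}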
Check out http://www.gromacs.org - * - * And Hey: - * Gnomes, ROck Monsters And Chili Sauce - */ -#ifndef _GMX_detectcpu_H_ -#define _GMX_detectcpu_H_ - -#ifdef __cplusplus -extern "C" { -#endif -#if 0 -} /* fixes auto-indentation problems */ -#endif - - -/* Currently identifiable CPU Vendors */ -typedef enum -{ - GMX_DETECTCPU_VENDOR_UNKNOWN = 0, - GMX_DETECTCPU_VENDOR_INTEL, - GMX_DETECTCPU_VENDOR_AMD, - GMX_DETECTCPU_NVENDORS -} -gmx_detectcpu_vendorid_t; - -/* Text strings corresponding to CPU vendors */ -extern const char * -gmx_detectcpu_vendorid_string[GMX_DETECTCPU_NVENDORS]; - - - - -/* CPU feature/property list, to be used as indices into the feature array of the - * gmxDetectCpu_t data structure. - * - * Always add entries to the end of this list, just before the last NFEATURES line. - * To keep the length of this list reasonable, we only add flags referring to - * features that we actually might have to check/use in Gromacs - feel free to add more. - * - * AMD and Intel are unfortunately gradually diverging, so while we can use the - * same type of intrinsic instruction functions in the source, the resulting binary - * is frequently not compatible starting from AVX. - */ -typedef enum -{ - GMX_DETECTCPU_FEATURE_CANNOTDETECT = 0, /* Flag set if we could not detect on this CPU */ - GMX_DETECTCPU_FEATURE_X86_HTT, /* Hyperthreading technology */ - GMX_DETECTCPU_FEATURE_X86_SSE2, /* SSE 2 */ - GMX_DETECTCPU_FEATURE_X86_SSE4_1, /* SSE 4.1 */ - GMX_DETECTCPU_FEATURE_X86_RDRAND, /* RDRAND high-quality hardware random numbers */ - GMX_DETECTCPU_FEATURE_X86_AES, /* x86 advanced encryption standard accel. */ - GMX_DETECTCPU_FEATURE_X86_AVX, /* Advanced vector extensions */ - GMX_DETECTCPU_FEATURE_X86_FMA, /* Fused-multiply add support (mainly for AVX) */ - GMX_DETECTCPU_FEATURE_X86_FMA4, /* 4-operand FMA, only on AMD for now */ - GMX_DETECTCPU_FEATURE_X86_XOP, /* AMD extended instructions, only AMD for now */ - GMX_DETECTCPU_FEATURE_X86_AVX2, /* AVX2 including gather support (not used yet) */ - GMX_DETECTCPU_FEATURE_X86_RDTSCP, /* Serializing rdtscp instruction available */ - GMX_DETECTCPU_NFEATURES -} -gmx_detectcpu_feature_t; - -/* Text strings for CPU feature indices */ -extern const char * -gmx_detectcpu_feature_string[GMX_DETECTCPU_NFEATURES]; - - -/* Currently supported acceleration instruction sets, intrinsics or other similar combinations - * in Gromacs. There is not always a 1-to-1 correspondence with feature flags; on some AMD - * hardware we prefer to use 128bit AVX instructions (although 256-bit ones could be executed), - * and we still havent written the AVX2 kernels. - */ -typedef enum -{ - GMX_DETECTCPU_ACCELERATION_NONE = 0, - GMX_DETECTCPU_ACCELERATION_X86_SSE2, - GMX_DETECTCPU_ACCELERATION_X86_SSE4_1, - GMX_DETECTCPU_ACCELERATION_X86_AVX_128_FMA, - GMX_DETECTCPU_ACCELERATION_X86_AVX_256, - GMX_DETECTCPU_NACCELERATIONS -} -gmx_detectcpu_acceleration_t; - -/* Text strings for Gromacs acceleration/instruction sets */ -extern const char * -gmx_detectcpu_acceleration_string[GMX_DETECTCPU_NACCELERATIONS]; - - - -#define GMX_DETECTCPU_STRLEN 64 - -/* Data structure with CPU detection information. Set by gmxDetectCpu(). - * This is listed in the header for now, since we might want to access it in - * performance-sensitive part of the code where we don't want function calls. 
- */ -typedef struct -{ - gmx_detectcpu_vendorid_t vendorid; - char brand[GMX_DETECTCPU_STRLEN]; - int family; - int model; - int stepping; - - char feature[GMX_DETECTCPU_NFEATURES]; -} -gmx_detectcpu_t; - - - -/* Fill the data structure by using CPU detection instructions. - * Return 0 on success, 1 if something bad happened. - */ -int -gmx_detectcpu (gmx_detectcpu_t * data); - - -/* Formats a text string (up to n characters) from the data structure. - * The output will have max 80 chars between newline characters. - */ -int -gmx_detectcpu_formatstring (gmx_detectcpu_t data, - char * s, - int n); - - -/* Suggests a suitable gromacs acceleration based on the support in the - * hardware. - */ -int -gmx_detectcpu_suggest_acceleration (gmx_detectcpu_t data, - gmx_detectcpu_acceleration_t * acc); - -/* Check if this binary was compiled with the same acceleration as we - * would suggest for the current hardware. Always print stats to the log file - * if it is non-NULL, and print a warning in stdout if we don't have a match. - */ -int -gmx_detectcpu_check_acceleration (gmx_detectcpu_t data, - FILE * log); - - -#ifdef __cplusplus -} -#endif - - -#endif /* _GMX_DETECTCPU_H_ */ diff --git a/include/gmx_fatal.h b/include/gmx_fatal.h index df1bda65b5..6dfd887196 100644 --- a/include/gmx_fatal.h +++ b/include/gmx_fatal.h @@ -40,7 +40,7 @@ #include #include #include -#include "typedefs.h" +#include "types/simple.h" #ifdef __cplusplus extern "C" { @@ -94,22 +94,14 @@ gmx_fatal(int fatal_errno,const char *file,int line,const char *fmt,...) GMX_ATT * The format of fmt is that like printf etc, only %d, %x, %c, %f, %g and %s * are allowed as format specifiers. * + * In case all MPI processes want to stop with the same fatal error, + * use gmx_fatal_collective, declared in gmx_fatal_collective.h, + * to avoid having as many error messages as processes. + * * Tip of the week: * call this function using the FARGS macro: * gmx_fatal(FARGS,fmt,...) - */ - -void -gmx_fatal_collective(int f_errno,const char *file,int line, - t_commrec *cr,gmx_domdec_t *dd, - const char *fmt,...) GMX_ATTRIBUTE_NORETURN; -/* As gmx_fatal, but only the master process prints the error message. - * This should only be called one of the following two situations: - * 1) On all nodes in cr->mpi_comm_mysim, with cr!=NULL,dd==NULL. - * 2) On all nodes in dd->mpi_comm_all, with cr==NULL,dd!=NULL. - * This will call MPI_Finalize instead of MPI_Abort when possible, - * This is useful for handling errors in code that is executed identically - * for all processes. + * */ void @@ -203,6 +195,7 @@ void gmx_warning(const char *fmt,...); * and should NOT end with a newline. */ + #ifdef __cplusplus } #endif diff --git a/src/kernel/membed.h b/include/gmx_fatal_collective.h similarity index 61% copy from src/kernel/membed.h copy to include/gmx_fatal_collective.h index 6e03136367..96f5e055b3 100644 --- a/src/kernel/membed.h +++ b/include/gmx_fatal_collective.h @@ -1,11 +1,12 @@ -/* +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- * + * * This source code is part of - * + * * G R O M A C S - * + * * GROningen MAchine for Chemical Simulations - * + * * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. * Copyright (c) 1991-2000, University of Groningen, The Netherlands. 
* Copyright (c) 2001-2012, The GROMACS development team, @@ -15,41 +16,51 @@ * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. - * + * * If you want to redistribute modifications, please consider that * scientific software is very special. Version control is crucial - * bugs must be traceable. We will be happy to consider code for * inclusion in the official distribution, but derived work must not * be called official GROMACS. Details are found in the README & COPYING * files - if they are missing, get the official version at www.gromacs.org. - * + * * To help us fund GROMACS development, we humbly ask that you cite * the papers on the package - you can find them in the top README file. - * + * * For more info, check our website at http://www.gromacs.org - * + * * And Hey: * Gromacs Runs On Most of All Computer Systems */ -#ifndef _gmx_membed_h -#define _gmx_membed_h +#ifndef _fatal_collective_h +#define _fatal_collective_h -#include "typedefs.h" +#include "types/simple.h" +#include "types/commrec.h" #ifdef __cplusplus extern "C" { #endif + -/* initialisation of membed code */ -gmx_membed_t init_membed(FILE *fplog, int nfile, const t_filenm fnm[], gmx_mtop_t *mtop, - t_inputrec *inputrec, t_state *state, t_commrec *cr, real *cpt); +void +gmx_fatal_collective(int f_errno,const char *file,int line, + const t_commrec *cr,gmx_domdec_t *dd, + const char *fmt,...); +/* As gmx_fatal declared in gmx_fatal.h, + * but only the master process prints the error message. + * This should only be called one of the following two situations: + * 1) On all nodes in cr->mpi_comm_mysim, with cr!=NULL,dd==NULL. + * 2) On all nodes in dd->mpi_comm_all, with cr==NULL,dd!=NULL. + * This will call MPI_Finalize instead of MPI_Abort when possible, + * This is useful for handling errors in code that is executed identically + * for all processes. + */ -/* rescaling the coordinates voor de membed code */ -void rescale_membed(int step_rel, gmx_membed_t membed, rvec *x); #ifdef __cplusplus -} + } #endif -#endif +#endif /* _fatal_collective_h */ diff --git a/include/gmx_hash.h b/include/gmx_hash.h new file mode 100644 index 0000000000..8d0ca4ed95 --- /dev/null +++ b/include/gmx_hash.h @@ -0,0 +1,318 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS development team, + * check out http://www.gromacs.org for more information. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. 
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gromacs Runs On Most of All Computer Systems
+ */
+#ifndef _gmx_hash_h
+#define _gmx_hash_h
+
+#include "typedefs.h"
+#include "smalloc.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* This include file implements the simplest hash table possible.
+ * It is limited to integer keys and integer values.
+ * The purpose is highest efficiency and lowest memory usage possible.
+ *
+ * The type definition is placed in types/commrec.h, as it is used there:
+ * typedef struct gmx_hash *gmx_hash_t
+ */
+
+typedef struct {
+    int  key;
+    int  val;
+    int  next;
+} gmx_hash_e_t;
+
+typedef struct gmx_hash {
+    int           mod;
+    int           mask;
+    int           nalloc;
+    int          *direct;
+    gmx_hash_e_t *hash;
+    int           nkey;
+    int           start_space_search;
+} t_gmx_hash;
+
+/* Clear all the entries in the hash table */
+static void gmx_hash_clear(gmx_hash_t hash)
+{
+    int i;
+
+    for(i=0; i<hash->nalloc; i++)
+    {
+        hash->hash[i].key  = -1;
+        hash->hash[i].next = -1;
+    }
+    hash->start_space_search = hash->mod;
+
+    hash->nkey = 0;
+}
+
+static void gmx_hash_realloc(gmx_hash_t hash,int nkey_used_estimate)
+{
+    /* Memory requirements:
+     * nkey_used_est*(2+1-2(1-e^-1/2))*3 ints
+     * where nkey_used_est is the local number of keys used.
+     *
+     * Make the direct list twice as long as the number of local keys.
+     * The fraction of entries in the list with:
+     * 0   size lists: e^-f
+     * >=1 size lists: 1 - e^-f
+     * where f is: the #keys / mod
+     * The fraction of keys not in the direct list is: 1-1/f(1-e^-f).
+     * The optimal table size is roughly double the number of keys.
+     */
+    /* Make the hash table a power of 2 and at least double the number of keys */
+    hash->mod = 4;
+    while (2*nkey_used_estimate > hash->mod)
+    {
+        hash->mod *= 2;
+    }
+    hash->mask   = hash->mod - 1;
+    hash->nalloc = over_alloc_dd(hash->mod);
+    srenew(hash->hash,hash->nalloc);
+
+    if (debug != NULL)
+    {
+        fprintf(debug,"Hash table mod %d nalloc %d\n",hash->mod,hash->nalloc);
+    }
+}
+
+/* Clear all the entries in the hash table.
+ * With the current number of keys, check if the table size is still good;
+ * if not, optimize it for the current number of keys.
+ */
+static void gmx_hash_clear_and_optimize(gmx_hash_t hash)
+{
+    /* Resize the hash table when the occupation is < 1/4 or > 2/3 */
+    if (hash->nkey > 0 &&
+        (4*hash->nkey < hash->mod || 3*hash->nkey > 2*hash->mod))
+    {
+        if (debug != NULL)
+        {
+            fprintf(debug,"Hash table size %d #key %d: resizing\n",
+                    hash->mod,hash->nkey);
+        }
+        gmx_hash_realloc(hash,hash->nkey);
+    }
+
+    gmx_hash_clear(hash);
+}
+
+static gmx_hash_t gmx_hash_init(int nkey_used_estimate)
+{
+    gmx_hash_t hash;
+
+    snew(hash,1);
+    hash->hash = NULL;
+
+    gmx_hash_realloc(hash,nkey_used_estimate);
+
+    gmx_hash_clear(hash);
+
+    return hash;
+}
+
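Before the per-entry operations defined next, here is a sketch of the intended life cycle of this table (illustrative only; example_lookup and the key/value numbers are made up, the calls are the ones in this header):

#include "gmx_hash.h"

static void example_lookup(void)
{
    /* size the table from an estimate of the number of keys */
    gmx_hash_t h = gmx_hash_init(1000);
    int        value;

    gmx_hash_set(h, 123456, 17);           /* e.g. global atom -> local index */
    if (gmx_hash_get(h, 123456, &value))
    {
        /* value == 17 here */
    }
    gmx_hash_change_or_set(h, 123456, 18); /* update in place */
    gmx_hash_del(h, 123456);

    /* between uses: drop all entries, resize the table if occupation is poor */
    gmx_hash_clear_and_optimize(h);
}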
+/* Set the hash entry for key to value; used e.g. to map global atom a_gl
+ * to local atom a_loc and cell. */
+static void gmx_hash_set(gmx_hash_t hash,int key,int value)
+{
+    int ind,ind_prev,i;
+
+    ind = key & hash->mask;
+
+    if (hash->hash[ind].key >= 0)
+    {
+        /* Search the last entry in the linked list for this index */
+        ind_prev = ind;
+        while(hash->hash[ind_prev].next >= 0)
+        {
+            ind_prev = hash->hash[ind_prev].next;
+        }
+        /* Search for space in the array */
+        ind = hash->start_space_search;
+        while (ind < hash->nalloc && hash->hash[ind].key >= 0)
+        {
+            ind++;
+        }
+        /* If we are at the end of the list we need to increase the size */
+        if (ind == hash->nalloc)
+        {
+            hash->nalloc = over_alloc_dd(ind+1);
+            srenew(hash->hash,hash->nalloc);
+            for(i=ind; i<hash->nalloc; i++)
+            {
+                hash->hash[i].key  = -1;
+                hash->hash[i].next = -1;
+            }
+        }
+        hash->hash[ind_prev].next = ind;
+
+        hash->start_space_search = ind + 1;
+    }
+    hash->hash[ind].key = key;
+    hash->hash[ind].val = value;
+
+    hash->nkey++;
+}
+
+/* Delete the hash entry for key */
+static void gmx_hash_del(gmx_hash_t hash,int key)
+{
+    int ind,ind_prev;
+
+    ind_prev = -1;
+    ind = key & hash->mask;
+    do
+    {
+        if (hash->hash[ind].key == key)
+        {
+            if (ind_prev >= 0)
+            {
+                hash->hash[ind_prev].next = hash->hash[ind].next;
+
+                /* This index is a linked entry, so we free an entry.
+                 * Check if we are creating the first empty space.
+                 */
+                if (ind < hash->start_space_search)
+                {
+                    hash->start_space_search = ind;
+                }
+            }
+            hash->hash[ind].key  = -1;
+            hash->hash[ind].val  = -1;
+            hash->hash[ind].next = -1;
+
+            hash->nkey--;
+
+            return;
+        }
+        ind_prev = ind;
+        ind = hash->hash[ind].next;
+    }
+    while (ind >= 0);
+
+    return;
+}
+
+/* Change the value of the existing hash entry for key */
+static void gmx_hash_change_value(gmx_hash_t hash,int key,int value)
+{
+    int ind;
+
+    ind = key & hash->mask;
+    do
+    {
+        if (hash->hash[ind].key == key)
+        {
+            hash->hash[ind].val = value;
+
+            return;
+        }
+        ind = hash->hash[ind].next;
+    }
+    while (ind >= 0);
+
+    return;
+}
+
+/* Change the hash value if already set, otherwise set the hash value */
+static void gmx_hash_change_or_set(gmx_hash_t hash,int key,int value)
+{
+    int ind;
+
+    ind = key & hash->mask;
+    do
+    {
+        if (hash->hash[ind].key == key)
+        {
+            hash->hash[ind].val = value;
+
+            return;
+        }
+        ind = hash->hash[ind].next;
+    }
+    while (ind >= 0);
+
+    gmx_hash_set(hash,key,value);
+
+    return;
+}
+
+/* Returns TRUE if the key is present; if so, *value is set */
+static gmx_bool gmx_hash_get(const gmx_hash_t hash,int key,int *value)
+{
+    int ind;
+
+    ind = key & hash->mask;
+    do
+    {
+        if (hash->hash[ind].key == key)
+        {
+            *value = hash->hash[ind].val;
+
+            return TRUE;
+        }
+        ind = hash->hash[ind].next;
+    }
+    while (ind >= 0);
+
+    return FALSE;
+}
+
+/* Returns the value or -1 if the key is not present */
+static int gmx_hash_get_minone(const gmx_hash_t hash,int key)
+{
+    int ind;
+
+    ind = key & hash->mask;
+    do
+    {
+        if (hash->hash[ind].key == key)
+        {
+            return hash->hash[ind].val;
+        }
+        ind = hash->hash[ind].next;
+    }
+    while (ind >= 0);
+
+    return -1;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _gmx_hash_h */
diff --git a/include/gmx_math_x86_avx_128_fma_double.h b/include/gmx_math_x86_avx_128_fma_double.h
index 3189a407e8..07087fd599 100644
--- a/include/gmx_math_x86_avx_128_fma_double.h
+++ b/include/gmx_math_x86_avx_128_fma_double.h
@@ -64,13 +64,13 @@ gmx_mm_invsqrt_pair_pd(__m128d x1, __m128d x2, __m128d *invsqrt1, __m128d *invsq
     __m128d lu1,lu2;
 
     /* Do first N-R step in float for 2x throughput */
-    xf  = _mm_shuffle_ps(_mm_cvtpd_ps(x1),_mm_cvtpd_ps(x2),MM_SHUFFLE(1,0,1,0));
+    xf  = 
_mm_shuffle_ps(_mm_cvtpd_ps(x1),_mm_cvtpd_ps(x2),_MM_SHUFFLE(1,0,1,0)); luf = _mm_rsqrt_ps(xf); luf = _mm_mul_ps(_mm_mul_ps(halff,luf),_mm_nmacc_ps(_mm_mul_ps(luf,luf),xf,threef)); - lu2 = _mm_cvtps_pd(_mm_shuffle_ps(luf,luf,MM_SHUFFLE(3,2,3,2))); + lu2 = _mm_cvtps_pd(_mm_shuffle_ps(luf,luf,_MM_SHUFFLE(3,2,3,2))); lu1 = _mm_cvtps_pd(luf); *invsqrt1 = _mm_mul_pd(_mm_mul_pd(half,lu1),_mm_nmacc_pd(_mm_mul_pd(lu1,lu1),x1,three)); diff --git a/include/gmx_math_x86_avx_128_fma_single.h b/include/gmx_math_x86_avx_128_fma_single.h index d9c61a46db..8d48f2cbca 100644 --- a/include/gmx_math_x86_avx_128_fma_single.h +++ b/include/gmx_math_x86_avx_128_fma_single.h @@ -636,7 +636,7 @@ gmx_mm_erfc_ps(__m128 x) * vectorial force to add to the particles. * */ -__m128 +static __m128 gmx_mm_pmecorrF_ps(__m128 z2) { const __m128 FN6 = _mm_set1_ps(-1.7357322914161492954e-8f); @@ -706,7 +706,7 @@ gmx_mm_pmecorrF_ps(__m128 z2) * 6. Add the result to 1/r, multiply by the product of the charges, * and you have your potential. */ -__m128 +static __m128 gmx_mm_pmecorrV_ps(__m128 z2) { const __m128 VN6 = _mm_set1_ps(1.9296833005951166339e-8f); diff --git a/include/gmx_math_x86_avx_256_double.h b/include/gmx_math_x86_avx_256_double.h index aa6f4d7fb5..f84e92ea0e 100644 --- a/include/gmx_math_x86_avx_256_double.h +++ b/include/gmx_math_x86_avx_256_double.h @@ -102,11 +102,11 @@ gmx_mm_invsqrt_pair_pd(__m128d x1, __m128d x2, __m128d *invsqrt1, __m128d *invsq __m128d lu1,lu2; /* Do first N-R step in float for 2x throughput */ - xf = _mm_shuffle_ps(_mm_cvtpd_ps(x1),_mm_cvtpd_ps(x2),MM_SHUFFLE(1,0,1,0)); + xf = _mm_shuffle_ps(_mm_cvtpd_ps(x1),_mm_cvtpd_ps(x2),_MM_SHUFFLE(1,0,1,0)); luf = _mm_rsqrt_ps(xf); luf = _mm_mul_ps(halff,_mm_mul_ps(_mm_sub_ps(threef,_mm_mul_ps(_mm_mul_ps(luf,luf),xf)),luf)); - lu2 = _mm_cvtps_pd(_mm_shuffle_ps(luf,luf,MM_SHUFFLE(3,2,3,2))); + lu2 = _mm_cvtps_pd(_mm_shuffle_ps(luf,luf,_MM_SHUFFLE(3,2,3,2))); lu1 = _mm_cvtps_pd(luf); *invsqrt1 = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu1,lu1),x1)),lu1)); @@ -1583,7 +1583,7 @@ gmx_mm_erfc_pd(__m128d x) * vectorial force to add to the particles. * */ -__m256d +static __m256d gmx_mm256_pmecorrF_pd(__m256d z2) { const __m256d FN10 = _mm256_set1_pd(-8.0072854618360083154e-14); @@ -1648,8 +1648,7 @@ gmx_mm256_pmecorrF_pd(__m256d z2) } - -__m128d +static __m128d gmx_mm_pmecorrF_pd(__m128d z2) { const __m128d FN10 = _mm_set1_pd(-8.0072854618360083154e-14); @@ -1745,7 +1744,7 @@ gmx_mm_pmecorrF_pd(__m128d z2) * and you have your potential. * */ -__m256d +static __m256d gmx_mm256_pmecorrV_pd(__m256d z2) { const __m256d VN9 = _mm256_set1_pd(-9.3723776169321855475e-13); @@ -1807,8 +1806,8 @@ gmx_mm256_pmecorrV_pd(__m256d z2) } -__m128d -gmx_mm_pmecorrV_pd(__m256d z2) +static __m128d +gmx_mm_pmecorrV_pd(__m128d z2) { const __m128d VN9 = _mm_set1_pd(-9.3723776169321855475e-13); const __m128d VN8 = _mm_set1_pd(1.2280156762674215741e-10); diff --git a/include/gmx_math_x86_avx_256_single.h b/include/gmx_math_x86_avx_256_single.h index 0eb653934c..ada0db8cba 100644 --- a/include/gmx_math_x86_avx_256_single.h +++ b/include/gmx_math_x86_avx_256_single.h @@ -1305,7 +1305,7 @@ gmx_mm_erfc_ps(__m128 x) * vectorial force to add to the particles. 
* */ -__m256 +static __m256 gmx_mm256_pmecorrF_ps(__m256 z2) { const __m256 FN6 = _mm256_set1_ps(-1.7357322914161492954e-8f); @@ -1355,8 +1355,7 @@ gmx_mm256_pmecorrF_ps(__m256 z2) } - -__m128 +static __m128 gmx_mm_pmecorrF_ps(__m128 z2) { const __m128 FN6 = _mm_set1_ps(-1.7357322914161492954e-8f); @@ -1435,7 +1434,7 @@ gmx_mm_pmecorrF_ps(__m128 z2) * 6. Add the result to 1/r, multiply by the product of the charges, * and you have your potential. */ -__m256 +static __m256 gmx_mm256_pmecorrV_ps(__m256 z2) { const __m256 VN6 = _mm256_set1_ps(1.9296833005951166339e-8f); @@ -1482,7 +1481,7 @@ gmx_mm256_pmecorrV_ps(__m256 z2) } -__m128 +static __m128 gmx_mm_pmecorrV_ps(__m128 z2) { const __m128 VN6 = _mm_set1_ps(1.9296833005951166339e-8f); diff --git a/include/gmx_math_x86_sse2_double.h b/include/gmx_math_x86_sse2_double.h index 303eb7e3bc..2233d963a2 100644 --- a/include/gmx_math_x86_sse2_double.h +++ b/include/gmx_math_x86_sse2_double.h @@ -67,11 +67,11 @@ gmx_mm_invsqrt_pair_pd(__m128d x1, __m128d x2, __m128d *invsqrt1, __m128d *invsq __m128d lu1,lu2; /* Do first N-R step in float for 2x throughput */ - xf = _mm_shuffle_ps(_mm_cvtpd_ps(x1),_mm_cvtpd_ps(x2),MM_SHUFFLE(1,0,1,0)); + xf = _mm_shuffle_ps(_mm_cvtpd_ps(x1),_mm_cvtpd_ps(x2),_MM_SHUFFLE(1,0,1,0)); luf = _mm_rsqrt_ps(xf); luf = _mm_mul_ps(halff,_mm_mul_ps(_mm_sub_ps(threef,_mm_mul_ps(_mm_mul_ps(luf,luf),xf)),luf)); - lu2 = _mm_cvtps_pd(_mm_shuffle_ps(luf,luf,MM_SHUFFLE(3,2,3,2))); + lu2 = _mm_cvtps_pd(_mm_shuffle_ps(luf,luf,_MM_SHUFFLE(3,2,3,2))); lu1 = _mm_cvtps_pd(luf); *invsqrt1 = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu1,lu1),x1)),lu1)); @@ -843,7 +843,7 @@ gmx_mm_erfc_pd(__m128d x) * vectorial force to add to the particles. * */ -__m128d +static __m128d gmx_mm_pmecorrF_pd(__m128d z2) { const __m128d FN10 = _mm_set1_pd(-8.0072854618360083154e-14); @@ -939,8 +939,8 @@ gmx_mm_pmecorrF_pd(__m128d z2) * and you have your potential. * */ -__m128d -gmx_mm_pmecorrV_pd(__m256d z2) +static __m128d +gmx_mm_pmecorrV_pd(__m128d z2) { const __m128d VN9 = _mm_set1_pd(-9.3723776169321855475e-13); const __m128d VN8 = _mm_set1_pd(1.2280156762674215741e-10); diff --git a/include/gmx_math_x86_sse4_1_double.h b/include/gmx_math_x86_sse4_1_double.h index 37a9cac29b..e742f95285 100644 --- a/include/gmx_math_x86_sse4_1_double.h +++ b/include/gmx_math_x86_sse4_1_double.h @@ -24,7 +24,7 @@ #include #include -#include "gmx_x86_sse4.h" +#include "gmx_x86_sse4_1.h" @@ -65,11 +65,11 @@ gmx_mm_invsqrt_pair_pd(__m128d x1, __m128d x2, __m128d *invsqrt1, __m128d *invsq __m128d lu1,lu2; /* Do first N-R step in float for 2x throughput */ - xf = _mm_shuffle_ps(_mm_cvtpd_ps(x1),_mm_cvtpd_ps(x2),MM_SHUFFLE(1,0,1,0)); + xf = _mm_shuffle_ps(_mm_cvtpd_ps(x1),_mm_cvtpd_ps(x2),_MM_SHUFFLE(1,0,1,0)); luf = _mm_rsqrt_ps(xf); luf = _mm_mul_ps(halff,_mm_mul_ps(_mm_sub_ps(threef,_mm_mul_ps(_mm_mul_ps(luf,luf),xf)),luf)); - lu2 = _mm_cvtps_pd(_mm_shuffle_ps(luf,luf,MM_SHUFFLE(3,2,3,2))); + lu2 = _mm_cvtps_pd(_mm_shuffle_ps(luf,luf,_MM_SHUFFLE(3,2,3,2))); lu1 = _mm_cvtps_pd(luf); *invsqrt1 = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu1,lu1),x1)),lu1)); @@ -839,7 +839,7 @@ gmx_mm_erfc_pd(__m128d x) * vectorial force to add to the particles. * */ -__m128d +static __m128d gmx_mm_pmecorrF_pd(__m128d z2) { const __m128d FN10 = _mm_set1_pd(-8.0072854618360083154e-14); @@ -935,8 +935,8 @@ gmx_mm_pmecorrF_pd(__m128d z2) * and you have your potential. 
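A pattern that recurs in these math-header hunks: functions defined in headers gain a static qualifier (and the gmx_mm_pmecorrV_pd variants get their argument corrected from __m256d to the intended __m128d). Without static, every translation unit that includes the header emits an external definition and linking fails with duplicate symbols. A minimal illustration with a hypothetical header function:

/* in_some_header.h -- must be static (and preferably inline); otherwise two
 * .c files including this header produce a duplicate-symbol link error */
static inline float clamp_nonneg(float x)
{
    return (x < 0.0f) ? 0.0f : x;
}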
* */ -__m128d -gmx_mm_pmecorrV_pd(__m256d z2) +static __m128d +gmx_mm_pmecorrV_pd(__m128d z2) { const __m128d VN9 = _mm_set1_pd(-9.3723776169321855475e-13); const __m128d VN8 = _mm_set1_pd(1.2280156762674215741e-10); diff --git a/include/gmx_math_x86_sse4_1_single.h b/include/gmx_math_x86_sse4_1_single.h index 3a430edba6..df051161f6 100644 --- a/include/gmx_math_x86_sse4_1_single.h +++ b/include/gmx_math_x86_sse4_1_single.h @@ -24,7 +24,7 @@ #include #include -#include "gmx_x86_sse4.h" +#include "gmx_x86_sse4_1.h" diff --git a/include/gmx_omp.h b/include/gmx_omp.h index 3fe53b00a7..de55a43bf5 100644 --- a/include/gmx_omp.h +++ b/include/gmx_omp.h @@ -27,22 +27,26 @@ /* This module defines wrappers for OpenMP API functions and enables compiling * code even when OpenMP is turned off in the build system. - * Therfore, OpenMP API functions should always be used through these wrappers + * Therefore, OpenMP API functions should always be used through these wrappers * and omp.h should never be directly included. Instead, this header should be - * used whnever OpenMP API functions are needed. + * used whenever OpenMP API functions are needed. */ -/*! Sets the number of threads in subsequent parallel regions, unless overridden - * by a num_threads clause. Acts as a wrapper for omp_get_max_threads(void). */ +/*! Returns an integer equal to or greater than the number of threads + * that would be available if a parallel region without num_threads were + * defined at that point in the code. Acts as a wrapper for omp_set_num_threads(void). */ int gmx_omp_get_max_threads(void); +/*! Returns the number of processors available when the function is called. + * Acts as a wrapper around omp_get_num_procs() */ +int gmx_omp_get_num_procs(void); + /*! Returns the thread number of the thread executing within its thread team. - * Acts as a warpper for omp_get_thread_num(void). */ + * Acts as a wrapper for omp_get_thread_num(void). */ int gmx_omp_get_thread_num(void); -/*! Returns an integer that is equal to or greater than the number of threads - * that would be available if a parallel region without num_threads were - * defined at that point in the code. Acts as a wapepr for omp_set_num_threads(void). */ +/*! Sets the number of threads in subsequent parallel regions, unless overridden + * by a num_threads clause. Acts as a wrapper for omp_get_max_threads(void). */ void gmx_omp_set_num_threads(int num_threads); #endif /* GMX_OMP_H */ diff --git a/src/kernel/gmx_gpu_utils/gmx_gpu_utils.h b/include/gmx_omp_nthreads.h similarity index 51% copy from src/kernel/gmx_gpu_utils/gmx_gpu_utils.h copy to include/gmx_omp_nthreads.h index 76070804ea..5ab8252ffd 100644 --- a/src/kernel/gmx_gpu_utils/gmx_gpu_utils.h +++ b/include/gmx_omp_nthreads.h @@ -1,56 +1,65 @@ /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- * - * + * * This source code is part of - * + * * G R O M A C S - * + * * GROningen MAchine for Chemical Simulations - * + * * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. * Copyright (c) 1991-2000, University of Groningen, The Netherlands. - * Copyright (c) 2001-2010, The GROMACS development team, + * Copyright (c) 2001-2012, The GROMACS development team, * check out http://www.gromacs.org for more information. 
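The gmx_omp.h wrappers above exist so that callers never include omp.h directly and still compile when OpenMP is disabled. A sketch of how such a wrapper can be implemented (the patch's gmx_omp.c is not shown in this excerpt; GMX_OPENMP is assumed to be the build-system define for OpenMP support):

#ifdef GMX_OPENMP
#include <omp.h>
#endif

int gmx_omp_get_max_threads(void)
{
#ifdef GMX_OPENMP
    return omp_get_max_threads();
#else
    return 1;   /* serial build: behave like a single-threaded runtime */
#endif
}

int gmx_omp_get_thread_num(void)
{
#ifdef GMX_OPENMP
    return omp_get_thread_num();
#else
    return 0;
#endif
}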
- + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. - * + * * If you want to redistribute modifications, please consider that * scientific software is very special. Version control is crucial - * bugs must be traceable. We will be happy to consider code for * inclusion in the official distribution, but derived work must not * be called official GROMACS. Details are found in the README & COPYING * files - if they are missing, get the official version at www.gromacs.org. - * + * * To help us fund GROMACS development, we humbly ask that you cite * the papers on the package - you can find them in the top README file. - * + * * For more info, check our website at http://www.gromacs.org - * + * * And Hey: * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon */ -#ifndef _GMX_GPU_UTILS_H_ -#define _GMX_GPU_UTILS_H_ - -#ifndef __cplusplus -extern "C" { -#endif - -int do_quick_memtest(int /*dev_id*/); - -int do_full_memtest(int /*dev_id*/); +#ifndef GMX_OMP_NTHREADS +#define GMX_OMP_NTHREADS -int do_timed_memtest(int /*dev_id*/, int /*time_limit*/); +/*! Enum values corresponding to multithreaded algorithmic modules. */ +typedef enum module_nth +{ + /* Default is meant to be used in OMP regions outside the named + * algorithmic modules listed below. */ + emntDefault, emntDomdec, emntPairsearch, emntNonbonded, + emntBonded, emntPME, emntUpdate, emntLINCS, emntSETTLE, + emntNR +} module_nth_t; -int is_supported_cuda_gpu(int /*dev_id*/, char* /*gpu_name*/); +/*! Initializes the per-module thread count. It is compatible with tMPI, + * thread-safety is ensured (for the features available with tMPI). + * This function should caled only once during the initialization of mdrun. */ +void gmx_omp_nthreads_init(FILE *fplog, t_commrec *cr, + int nthreads_hw_avail, + int omp_nthreads_req, + int omp_nthreads_pme_req, + gmx_bool bCurrNodePMEOnly, + gmx_bool bFullOmpSupport); -#ifndef __cplusplus -} /* extern "C" */ -#endif +/*! Returns the number of threads to be used in the given module m. */ +int gmx_omp_nthreads_get(int mod); -#endif // _GMX_GPU_UTILS_H_ +/*! Read the OMP_NUM_THREADS env. var. and check against the value set on the command line. 
*/ +void gmx_omp_nthreads_read_env(int *nthreads_omp); +#endif /* GMX_OMP_NTHREADS */ diff --git a/include/gmx_wallcycle.h b/include/gmx_wallcycle.h index 1bcd6b79c7..cd1083bec4 100644 --- a/include/gmx_wallcycle.h +++ b/include/gmx_wallcycle.h @@ -38,23 +38,42 @@ #include #include "typedefs.h" +#include "types/commrec.h" #ifdef __cplusplus extern "C" { #endif - enum { ewcRUN, ewcSTEP, ewcPPDURINGPME, ewcDOMDEC, ewcDDCOMMLOAD, ewcDDCOMMBOUND, ewcVSITECONSTR, ewcPP_PMESENDX, ewcMOVEX, ewcNS, ewcGB, ewcFORCE, ewcMOVEF, ewcPMEMESH, ewcPME_REDISTXF, ewcPME_SPREADGATHER, ewcPME_FFT, ewcPME_FFTCOMM, ewcPME_SOLVE, ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcVSITESPREAD, ewcTRAJ, ewcUPDATE, ewcCONSTR, ewcMoveE, ewcROT, ewcROTadd, ewcTEST, ewcNR }; +enum { ewcRUN, ewcSTEP, ewcPPDURINGPME, ewcDOMDEC, ewcDDCOMMLOAD, + ewcDDCOMMBOUND, ewcVSITECONSTR, ewcPP_PMESENDX, ewcNS, ewcLAUNCH_GPU_NB, + ewcMOVEX, ewcGB, ewcFORCE, ewcMOVEF, ewcPMEMESH, + ewcPME_REDISTXF, ewcPME_SPREADGATHER, ewcPME_FFT, ewcPME_FFTCOMM, ewcPME_SOLVE, + ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcNB_XF_BUF_OPS, + ewcVSITESPREAD, ewcTRAJ, ewcUPDATE, ewcCONSTR, ewcMoveE, ewcROT, ewcROTadd, + ewcTEST, ewcNR }; + +enum { ewcsDD_REDIST, ewcsDD_GRID, ewcsDD_SETUPCOMM, + ewcsDD_MAKETOP, ewcsDD_MAKECONSTR, ewcsDD_TOPOTHER, + ewcsNBS_GRID_LOCAL, ewcsNBS_GRID_NONLOCAL, + ewcsNBS_SEARCH_LOCAL, ewcsNBS_SEARCH_NONLOCAL, + ewcsBONDED, ewcsNONBONDED, ewcsEWALD_CORRECTION, + ewcsNB_X_BUF_OPS, ewcsNB_F_BUF_OPS, + ewcsNR }; gmx_bool wallcycle_have_counter(void); /* Returns if cycle counting is supported */ -gmx_wallcycle_t wallcycle_init(FILE *fplog, int resetstep, t_commrec *cr, int omp_nthreads); +gmx_wallcycle_t wallcycle_init(FILE *fplog, int resetstep, t_commrec *cr, + int nthreads_pp, int nthreads_pme); /* Returns the wall cycle structure. * Returns NULL when cycle counting is not supported. 
*/ void wallcycle_start(gmx_wallcycle_t wc, int ewc); -/* Set the start cycle count for ewc */ +/* Starts the cycle counter (and increases the call count) */ + +void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc); +/* Starts the cycle counter without increasing the call count */ double wallcycle_stop(gmx_wallcycle_t wc, int ewc); /* Stop the cycle count for ewc, returns the last cycle count */ @@ -62,11 +81,11 @@ double wallcycle_stop(gmx_wallcycle_t wc, int ewc); void wallcycle_reset_all(gmx_wallcycle_t wc); /* Resets all cycle counters to zero */ -void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc,double cycles[]); +void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc); /* Sum the cycles over the nodes in cr->mpi_comm_mysim */ void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime, - gmx_wallcycle_t wc, double cycles[]); + gmx_wallcycle_t wc, wallclock_gpu_t *gpu_t); /* Print the cycle and time accounting */ gmx_large_int_t wcycle_get_reset_counters(gmx_wallcycle_t wc); @@ -75,6 +94,23 @@ gmx_large_int_t wcycle_get_reset_counters(gmx_wallcycle_t wc); void wcycle_set_reset_counters(gmx_wallcycle_t wc, gmx_large_int_t reset_counters); /* Set reset_counters */ +/* Uncomment the next line to get extra cycle counters of sub parts */ +/* #define GMX_CYCLE_SUBCOUNTERS */ + +#ifdef GMX_CYCLE_SUBCOUNTERS + +void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs); +/* Set the start sub cycle count for ewcs */ + +void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs); +/* Stop the sub cycle count for ewcs */ + +#else +/* Define the counter call to nothing to avoid any effect on performance */ +#define wallcycle_sub_start(wc, ewcs) +#define wallcycle_sub_stop(wc, ewcs) +#endif + #ifdef __cplusplus } #endif diff --git a/include/gmx_x86_avx_128_fma.h b/include/gmx_x86_avx_128_fma.h index 260a317147..821924db56 100644 --- a/include/gmx_x86_avx_128_fma.h +++ b/include/gmx_x86_avx_128_fma.h @@ -84,7 +84,7 @@ static __m128i gmx_mm_castpd_si128(__m128d a) /* The warning directive is not supported by MSVC, and that compiler * does not support overriding built-in functions anyway... */ -#if !defined(HAVE_x86INTRIN_H) || !defined(__FMA4__) +#if !defined(HAVE_X86INTRIN_H) || !defined(__FMA4__) #warning Emulating FMA instructions - this is probably not what you want! /* Wrapper routines so we can do test builds on non-FMA hardware */ static __m128 diff --git a/include/gmx_x86_simd_double.h b/include/gmx_x86_simd_double.h new file mode 100644 index 0000000000..42c4d1cf40 --- /dev/null +++ b/include/gmx_x86_simd_double.h @@ -0,0 +1,70 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This file is part of GROMACS. + * Copyright (c) 2012- + * + * Written by the Gromacs development team under coordination of + * David van der Spoel, Berk Hess, and Erik Lindahl. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. 
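The counter discipline implied by gmx_wallcycle.h: bracket a timed region with wallcycle_start/wallcycle_stop on one of the ewc* counters, and nest ewcs* sub-counters inside it; the sub-counter macros compile to nothing unless GMX_CYCLE_SUBCOUNTERS is defined. An illustrative sketch (do_force_region is a made-up caller):

#include "gmx_wallcycle.h"

static void do_force_region(gmx_wallcycle_t wc)
{
    wallcycle_start(wc, ewcFORCE);

    wallcycle_sub_start(wc, ewcsNONBONDED);
    /* ... compute non-bonded forces ... */
    wallcycle_sub_stop(wc, ewcsNONBONDED);

    wallcycle_stop(wc, ewcFORCE);
}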
Check out http://www.gromacs.org + * + * And Hey: + * Gnomes, ROck Monsters And Chili Sauce + */ +#ifndef _gmx_x86_simd_double_h_ +#define _gmx_x86_simd_double_h_ + +/* This file includes the highest possible level of x86 (math) acceleration */ + +#ifdef GMX_X86_AVX_256 +#include "gmx_x86_avx_256.h" +#include "gmx_math_x86_avx_256_double.h" +#else +#ifdef GMX_X86_AVX_128_FMA +#include "gmx_x86_avx_128_fma.h" +#include "gmx_math_x86_avx_128_fma_double.h" +#else +#ifdef GMX_X86_SSE4_1 +#include "gmx_x86_sse4_1.h" +#include "gmx_math_x86_sse4_1_double.h" +#else +#ifdef GMX_X86_SSE2 +#include "gmx_x86_sse2.h" +#include "gmx_math_x86_sse2_double.h" +#else +#error No x86 acceleration defined +#endif +#endif +#endif +#endif + +static inline __m128d +gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz) +{ + return _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx), _mm_mul_pd(dy,dy) ), _mm_mul_pd(dz,dz) ); +} + +/* Normal sum of four __m128d registers */ +#define gmx_mm_sum4_pd(t0,t1,t2,t3) _mm_add_pd(_mm_add_pd(t0,t1),_mm_add_pd(t2,t3)) + +#ifdef GMX_X86_AVX_256 + +static inline __m256d +gmx_mm256_calc_rsq_pd(__m256d dx, __m256d dy, __m256d dz) +{ + return _mm256_add_pd( _mm256_add_pd( _mm256_mul_pd(dx,dx), _mm256_mul_pd(dy,dy) ), _mm256_mul_pd(dz,dz) ); +} + +/* Normal sum of four xmm registers */ +#define gmx_mm256_sum4_pd(t0,t1,t2,t3) _mm256_add_pd(_mm256_add_pd(t0,t1),_mm256_add_pd(t2,t3)) + +#endif + +#endif /* _gmx_x86_simd_double_h_ */ diff --git a/include/gmx_x86_simd_macros.h b/include/gmx_x86_simd_macros.h new file mode 100644 index 0000000000..b896d396ba --- /dev/null +++ b/include/gmx_x86_simd_macros.h @@ -0,0 +1,261 @@ +/* + * This source code is part of + * + * G R O M A C S + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS Development Team + * + * Gromacs is a library for molecular simulation and trajectory analysis, + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for + * a full list of developers and information, check out http://www.gromacs.org + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) any + * later version. + * As a special exception, you may use this file as part of a free software + * library without restriction. Specifically, if other files instantiate + * templates or use macros or inline functions from this file, or you compile + * this file and link it with other files to produce an executable, this + * file does not by itself cause the resulting executable to be covered by + * the GNU Lesser General Public License. + * + * In plain-speak: do not worry about classes/macros/templates either - only + * changes to the library have to be LGPL, not an application linking with it. + * + * To help fund GROMACS development, we humbly ask that you cite + * the papers people have written on it - you can find them on the website! + */ + +/* Undefine all defines used below so we can include this file multiple times + * with different settings from the same source file. 
+ */ + +/* NOTE: floor and blend are NOT available with SSE2 only acceleration */ + +#undef GMX_X86_SIMD_WIDTH_HERE + +#undef gmx_epi32 + +#undef gmx_mm_pr + +#undef gmx_load_pr +#undef gmx_load1_pr +#undef gmx_set1_pr +#undef gmx_setzero_pr +#undef gmx_store_pr +#undef gmx_storeu_pr + +#undef gmx_add_pr +#undef gmx_sub_pr +#undef gmx_mul_pr +#undef gmx_max_pr +#undef gmx_cmplt_pr +#undef gmx_and_pr +#undef gmx_or_pr +#undef gmx_andnot_pr + +#undef gmx_floor_pr +#undef gmx_blendv_pr + +#undef gmx_movemask_pr + +#undef gmx_mm_castsi128_pr + +#undef gmx_cvttpr_epi32 +#undef gmx_cvtepi32_pr + +#undef gmx_invsqrt_pr +#undef gmx_calc_rsq_pr +#undef gmx_sum4_pr + + +/* By defining GMX_MM128_HERE or GMX_MM256_HERE before including this file + * the same intrinsics, with defines, can be compiled for either 128 or 256 + * bit wide SSE or AVX instructions. + * The gmx_ prefix is replaced by _mm_ or _mm256_ (SSE or AVX). + * The _pr suffix is replaced by _ps or _pd (single or double precision). + * Note that compiler settings will decide if 128-bit intrinsics will + * be translated into SSE or AVX instructions. + */ + +#if !defined GMX_MM128_HERE && !defined GMX_MM256_HERE +"You should define GMX_MM128_HERE or GMX_MM256_HERE" +#endif + +#if defined GMX_MM128_HERE && defined GMX_MM256_HERE +"You should not define both GMX_MM128_HERE and GMX_MM256_HERE" +#endif + +#ifdef GMX_MM128_HERE + +#define gmx_epi32 __m128i + +#ifndef GMX_DOUBLE + +#include "gmx_x86_simd_single.h" + +#define GMX_X86_SIMD_WIDTH_HERE 4 + +#define gmx_mm_pr __m128 + +#define gmx_load_pr _mm_load_ps +#define gmx_load1_pr _mm_load1_ps +#define gmx_set1_pr _mm_set1_ps +#define gmx_setzero_pr _mm_setzero_ps +#define gmx_store_pr _mm_store_ps +#define gmx_storeu_pr _mm_storeu_ps + +#define gmx_add_pr _mm_add_ps +#define gmx_sub_pr _mm_sub_ps +#define gmx_mul_pr _mm_mul_ps +#define gmx_max_pr _mm_max_ps +#define gmx_cmplt_pr _mm_cmplt_ps +#define gmx_and_pr _mm_and_ps +#define gmx_or_pr _mm_or_ps +#define gmx_andnot_pr _mm_andnot_ps + +#define gmx_floor_pr _mm_floor_ps +#define gmx_blendv_pr _mm_blendv_ps + +#define gmx_movemask_pr _mm_movemask_ps + +#define gmx_mm_castsi128_pr gmx_mm_castsi128_ps + +#define gmx_cvttpr_epi32 _mm_cvttps_epi32 +#define gmx_cvtepi32_pr _mm_cvtepi32_ps + +#define gmx_invsqrt_pr gmx_mm_invsqrt_ps +#define gmx_calc_rsq_pr gmx_mm_calc_rsq_ps +#define gmx_sum4_pr gmx_mm_sum4_ps + +#else /* ifndef GMX_DOUBLE */ + +#include "gmx_x86_simd_double.h" + +#define GMX_X86_SIMD_WIDTH_HERE 2 + +#define gmx_mm_pr __m128d + +#define gmx_load_pr _mm_load_pd +#define gmx_load1_pr _mm_load1_pd +#define gmx_set1_pr _mm_set1_pd +#define gmx_setzero_pr _mm_setzero_pd +#define gmx_store_pr _mm_store_pd +#define gmx_storeu_pr _mm_storeu_pd + +#define gmx_add_pr _mm_add_pd +#define gmx_sub_pr _mm_sub_pd +#define gmx_mul_pr _mm_mul_pd +#define gmx_max_pr _mm_max_pd +#define gmx_cmplt_pr _mm_cmplt_pd +#define gmx_and_pr _mm_and_pd +#define gmx_or_pr _mm_or_pd +#define gmx_andnot_pr _mm_andnot_pd + +#define gmx_floor_pr _mm_floor_pd +#define gmx_blendv_pr _mm_blendv_pd + +#define gmx_movemask_pr _mm_movemask_pd + +#define gmx_mm_castsi128_pr gmx_mm_castsi128_pd + +#define gmx_cvttpr_epi32 _mm_cvttpd_epi32 +#define gmx_cvtepi32_pr _mm_cvtepi32_pd + +#define gmx_invsqrt_pr gmx_mm_invsqrt_pd +#define gmx_calc_rsq_pr gmx_mm_calc_rsq_pd +#define gmx_sum4_pr gmx_mm_sum4_pd + +#endif /* ifndef GMX_DOUBLE */ + +#endif /* GMX_MM128_HERE */ + +#ifdef GMX_MM256_HERE + +#define gmx_epi32 __m256i + +#ifndef GMX_DOUBLE + +#include 
"gmx_x86_simd_single.h" + +#define GMX_X86_SIMD_WIDTH_HERE 8 + +#define gmx_mm_pr __m256 + +#define gmx_load_pr _mm256_load_ps +#define gmx_load1_pr(x) _mm256_set1_ps((x)[0]) +#define gmx_set1_pr _mm256_set1_ps +#define gmx_setzero_pr _mm256_setzero_ps +#define gmx_store_pr _mm256_store_ps +#define gmx_storeu_pr _mm256_storeu_ps + +#define gmx_add_pr _mm256_add_ps +#define gmx_sub_pr _mm256_sub_ps +#define gmx_mul_pr _mm256_mul_ps +#define gmx_max_pr _mm256_max_ps +/* Not-equal (ordered, non-signaling) */ +#define gmx_cmpneq_pr(x,y) _mm256_cmp_ps(x,y,0x0c) +/* Less-than (ordered, non-signaling) */ +#define gmx_cmplt_pr(x,y) _mm256_cmp_ps(x,y,0x11) +#define gmx_and_pr _mm256_and_ps +#define gmx_or_pr _mm256_or_ps +#define gmx_andnot_pr _mm256_andnot_ps + +#define gmx_floor_pr _mm256_floor_ps +#define gmx_blendv_pr _mm256_blendv_ps + +#define gmx_movemask_pr _mm256_movemask_ps + +#define gmx_mm_castsi256_pr _mm256_castsi256_ps + +#define gmx_cvttpr_epi32 _mm256_cvttps_epi32 + +#define gmx_invsqrt_pr gmx_mm256_invsqrt_ps +#define gmx_calc_rsq_pr gmx_mm256_calc_rsq_ps +#define gmx_sum4_pr gmx_mm256_sum4_ps + +#else + +#include "gmx_x86_simd_double.h" + +#define GMX_X86_SIMD_WIDTH_HERE 4 + +#define gmx_mm_pr __m256d + +#define gmx_load_pr _mm256_load_pd +#define gmx_load1_pr(x) _mm256_set1_pd((x)[0]) +#define gmx_set1_pr _mm256_set1_pd +#define gmx_setzero_pr _mm256_setzero_pd +#define gmx_store_pr _mm256_store_pd +#define gmx_storeu_pr _mm256_storeu_pd + +#define gmx_add_pr _mm256_add_pd +#define gmx_sub_pr _mm256_sub_pd +#define gmx_mul_pr _mm256_mul_pd +#define gmx_max_pr _mm256_max_pd +/* Not-equal (ordered, non-signaling) */ +#define gmx_cmpneq_pr(x,y) _mm256_cmp_pd(x,y,0x0c) +/* Less-than (ordered, non-signaling) */ +#define gmx_cmplt_pr(x,y) _mm256_cmp_pd(x,y,0x11) +#define gmx_and_pr _mm256_and_pd +#define gmx_or_pr _mm256_or_pd +#define gmx_andnot_pr _mm256_andnot_pd + +#define gmx_floor_pr _mm256_floor_pd +#define gmx_blendv_pr _mm256_blendv_pd + +#define gmx_movemask_pr _mm256_movemask_pd + +#define gmx_mm_castsi256_pr _mm256_castsi256_pd + +#define gmx_cvttpr_epi32 _mm256_cvttpd_epi32 + +#define gmx_invsqrt_pr gmx_mm256_invsqrt_pd +#define gmx_calc_rsq_pr gmx_mm256_calc_rsq_pd +#define gmx_sum4_pr gmx_mm256_sum4_pd + +#endif + +#endif /* GMX_MM256_HERE */ diff --git a/include/gmx_x86_simd_single.h b/include/gmx_x86_simd_single.h new file mode 100644 index 0000000000..10e8836a75 --- /dev/null +++ b/include/gmx_x86_simd_single.h @@ -0,0 +1,71 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This file is part of GROMACS. + * Copyright (c) 2012- + * + * Written by the Gromacs development team under coordination of + * David van der Spoel, Berk Hess, and Erik Lindahl. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. 
Check out http://www.gromacs.org + * + * And Hey: + * Gnomes, ROck Monsters And Chili Sauce + */ +#ifndef _gmx_x86_simd256_single_h_ +#define _gmx_x86_simd256_single_h_ + +/* This file includes the highest possible level of x86 (math) acceleration */ + +#ifdef GMX_X86_AVX_256 +#include "gmx_x86_avx_256.h" +#include "gmx_math_x86_avx_256_single.h" +#else +#ifdef GMX_X86_AVX_128_FMA +#include "gmx_x86_avx_128_fma.h" +#include "gmx_math_x86_avx_128_fma_single.h" +#else +#ifdef GMX_X86_SSE4_1 +#include "gmx_x86_sse4_1.h" +#include "gmx_math_x86_sse4_1_single.h" +#else +#ifdef GMX_X86_SSE2 +#include "gmx_x86_sse2.h" +#include "gmx_math_x86_sse2_single.h" +#else +#error No x86 acceleration defined +#endif +#endif +#endif +#endif + + +static inline __m128 +gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz) +{ + return _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx,dx), _mm_mul_ps(dy,dy) ), _mm_mul_ps(dz,dz) ); +} + +/* Normal sum of four __m128 registers */ +#define gmx_mm_sum4_ps(t0,t1,t2,t3) _mm_add_ps(_mm_add_ps(t0,t1),_mm_add_ps(t2,t3)) + +#ifdef GMX_X86_AVX_256 + +static inline __m256 +gmx_mm256_calc_rsq_ps(__m256 dx, __m256 dy, __m256 dz) +{ + return _mm256_add_ps( _mm256_add_ps( _mm256_mul_ps(dx,dx), _mm256_mul_ps(dy,dy) ), _mm256_mul_ps(dz,dz) ); +} + +/* Normal sum of four __m256 registers */ +#define gmx_mm256_sum4_ps(t0,t1,t2,t3) _mm256_add_ps(_mm256_add_ps(t0,t1),_mm256_add_ps(t2,t3)) + +#endif + +#endif /* _gmx_x86_simd256_single_h_ */ diff --git a/include/gpu_utils.h b/include/gpu_utils.h new file mode 100644 index 0000000000..751936a4fb --- /dev/null +++ b/include/gpu_utils.h @@ -0,0 +1,105 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2010, The GROMACS development team, + * check out http://www.gromacs.org for more information. + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. 
+ * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon + */ + +#ifndef _GPU_UTILS_H_ +#define _GPU_UTILS_H_ + +#include "types/simple.h" +#include "types/hw_info.h" + +#ifdef GMX_GPU +#define FUNC_TERM_INT ; +#define FUNC_TERM_VOID ; +#define FUNC_QUALIFIER +#else +#define FUNC_TERM_INT {return -1;} +#define FUNC_TERM_VOID {} +#define FUNC_QUALIFIER static +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +FUNC_QUALIFIER +int do_quick_memtest(int dev_id) FUNC_TERM_INT + +FUNC_QUALIFIER +int do_full_memtest(int dev_id) FUNC_TERM_INT + +FUNC_QUALIFIER +int do_timed_memtest(int dev_id, int time_limit) FUNC_TERM_INT + +FUNC_QUALIFIER +gmx_bool is_gmx_openmm_supported_gpu(int dev_id, char *gpu_name) FUNC_TERM_INT + +FUNC_QUALIFIER +void detect_cuda_gpus(gmx_gpu_info_t *gpu_info) FUNC_TERM_VOID + +FUNC_QUALIFIER +void pick_compatible_gpus(gmx_gpu_info_t *gpu_info) FUNC_TERM_VOID + +FUNC_QUALIFIER +gmx_bool check_select_cuda_gpus(int *checkres, gmx_gpu_info_t *gpu_info, + const int *requested_devs, int count) FUNC_TERM_INT + +FUNC_QUALIFIER +void free_gpu_info(const gmx_gpu_info_t *gpu_info) FUNC_TERM_VOID + +FUNC_QUALIFIER +gmx_bool init_gpu(int mygpu, char *result_str, const gmx_gpu_info_t *gpu_info) FUNC_TERM_INT + +FUNC_QUALIFIER +gmx_bool free_gpu(char *result_str) FUNC_TERM_INT + +/*! \brief Returns the device ID of the GPU currently in use.*/ +FUNC_QUALIFIER +int get_current_gpu_device_id(void) FUNC_TERM_INT + +FUNC_QUALIFIER +int get_gpu_device_id(const gmx_gpu_info_t *gpu_info, int index) FUNC_TERM_INT + +FUNC_QUALIFIER +void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index) FUNC_TERM_VOID + +#ifdef __cplusplus +} +#endif + +#undef FUNC_TERM_INT +#undef FUNC_TERM_VOID +#undef FUNC_QUALIFIER + +#endif /* _GPU_UTILS_H_ */ diff --git a/include/main.h b/include/main.h index 5b10a1b6e4..c6a08cc1d9 100644 --- a/include/main.h +++ b/include/main.h @@ -44,8 +44,14 @@ extern "C" { #endif +char *gmx_gethostname(char *name, size_t len); +/* Sets the hostname to the value given by gethostname, if available, + * and to "unknown" otherwise. name should have at least size len. + * Returns name. + */ + void gmx_log_open(const char *fn,const t_commrec *cr, - gmx_bool bMasterOnly, unsigned long Flags, FILE**); + gmx_bool bMasterOnly, gmx_bool bAppendFiles, FILE**); /* Open the log file, if necessary (nprocs > 1) the logfile name is * communicated around the ring. 
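The FUNC_QUALIFIER/FUNC_TERM_* pattern in gpu_utils.h above keeps callers free of GMX_GPU conditionals: in GPU builds each entry is a plain declaration implemented in CUDA code, while in CPU-only builds it turns into a static stub with a harmless failure return. Expanded by hand, the first declaration behaves as follows (a preprocessor sketch, not code from the patch):

    /* With GMX_GPU defined:
     *     FUNC_QUALIFIER int do_quick_memtest(int dev_id) FUNC_TERM_INT
     * becomes an ordinary declaration: */
    int do_quick_memtest(int dev_id);

    /* Without GMX_GPU it becomes a static stub that reports failure,
     * so callers can test the result without any #ifdefs of their own: */
    static int do_quick_memtest(int dev_id) {return -1;}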
 */
diff --git a/include/maths.h b/include/maths.h
index 5d29e53487..4a82797d3c 100644
--- a/include/maths.h
+++ b/include/maths.h
@@ -102,8 +102,17 @@ real sign(real x,real y);
 int gmx_nint(real a);
 real sign(real x,real y);
 real cuberoot (real a);
-real gmx_erf(real x);
-real gmx_erfc(real x);
+double gmx_erfd(double x);
+double gmx_erfcd(double x);
+float gmx_erff(float x);
+float gmx_erfcf(float x);
+#ifdef GMX_DOUBLE
+#define gmx_erf(x) gmx_erfd(x)
+#define gmx_erfc(x) gmx_erfcd(x)
+#else
+#define gmx_erf(x) gmx_erff(x)
+#define gmx_erfc(x) gmx_erfcf(x)
+#endif
 
 gmx_bool gmx_isfinite(real x);
diff --git a/include/types/graph.h b/include/md_logging.h
similarity index 61%
copy from include/types/graph.h
copy to include/md_logging.h
index 5d80ee4a60..e197520b31 100644
--- a/include/types/graph.h
+++ b/include/md_logging.h
@@ -30,36 +30,35 @@
  * For more info, check our website at http://www.gromacs.org
  *
  * And Hey:
- * GRoups of Organic Molecules in ACtion for Science
+ * Gromacs Runs On Most of All Computer Systems
  */
-#include "idef.h"
+#ifndef _md_logging_h
+#define _md_logging_h
+
+#include "types/commrec.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+void md_print_info(const t_commrec *cr, FILE *fplog,
+                   const char *fmt, ...);
+/* Print a general information message to stderr on the master node
+ * and to fplog if fplog!=NULL.
+ * fmt is a standard printf formatting string which should end in \n,
+ * the arguments after that contain the values to be printed, as in printf.
+ */
-typedef enum { egcolWhite, egcolGrey, egcolBlack, egcolNR } egCol;
-
-typedef struct {
-  int      nnodes;     /* The number of nodes, nnodes=at_end-at_start  */
-  int      nbound;     /* The number of nodes with edges              */
-  int      natoms;     /* Total range for this graph: 0 to natoms      */
-  int      at_start;   /* The first connected atom in this graph      */
-  int      at_end;     /* The last+1 connected atom in this graph      */
-  int      *nedge;     /* For each node the number of edges            */
-  atom_id  **edge;     /* For each node, the actual edges (bidirect.)  */
-  gmx_bool bScrewPBC;  /* Screw boundary conditions                    */
-  ivec     *ishift;    /* Shift for each particle                      */
-  int      negc;
-  egCol    *egc;       /* color of each node */
-} t_graph;
-
-
-#define SHIFT_IVEC(g,i) ((g)->ishift[i])
+void md_print_warn(const t_commrec *cr, FILE *fplog,
+                   const char *fmt, ...);
+/* As md_print_info above, but for important notices or warnings.
+ * The only difference from md_print_info is that a newline is printed
+ * before and after the message such that it stands out.
+ */
 
 #ifdef __cplusplus
 }
 #endif
+
+#endif /* _md_logging_h */
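A sketch of how the two helpers are meant to be called (the variables and messages are hypothetical; both functions forward printf-style arguments):

    /* Informational message: stderr on the master node and, when
     * fplog != NULL, the log file as well. */
    md_print_info(cr, fplog, "Using %d OpenMP threads per rank\n", nthreads);

    /* Warning: same destinations, but set off by surrounding newlines. */
    md_print_warn(cr, fplog,
                  "NOTE: %d%% of the run time was spent waiting for the GPU\n",
                  wait_pct);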
diff --git a/include/md_support.h b/include/md_support.h
new file mode 100644
index 0000000000..7543ceaeba
--- /dev/null
+++ b/include/md_support.h
@@ -0,0 +1,149 @@
+/*
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.0
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gromacs Runs On Most of All Computer Systems
+ */
+
+#ifndef _md_support_h
+#define _md_support_h
+
+#include "typedefs.h"
+#include "types/globsig.h"
+#include "sim_util.h"
+#include "vcm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Define a number of flags to better control the information
+ * passed to compute_globals in md.c and global_stat.
+ */
+
+/* We are rerunning the simulation */
+#define CGLO_RERUNMD        (1<<1)
+/* we are computing the kinetic energy from average velocities */
+#define CGLO_EKINAVEVEL     (1<<2)
+/* we are removing the center of mass momenta */
+#define CGLO_STOPCM         (1<<3)
+/* bGStat is defined in do_md */
+#define CGLO_GSTAT          (1<<4)
+/* Sum the energy terms in global computation */
+#define CGLO_ENERGY         (1<<6)
+/* Sum the kinetic energy terms in global computation */
+#define CGLO_TEMPERATURE    (1<<7)
+/* Sum the kinetic energy terms in global computation */
+#define CGLO_PRESSURE       (1<<8)
+/* Sum the constraint term in global computation */
+#define CGLO_CONSTRAINT     (1<<9)
+/* we are using an integrator that requires iteration over some steps - currently not used */
+#define CGLO_ITERATE        (1<<10)
+/* it is the first time we are iterating (or only one pass through is required) */
+#define CGLO_FIRSTITERATE   (1<<11)
+/* Reading ekin from the trajectory */
+#define CGLO_READEKIN       (1<<12)
+/* we need to reset the ekin rescaling factor here */
+#define CGLO_SCALEEKIN      (1<<13)
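The CGLO_* bits are combined with bitwise OR into the flags argument of compute_globals(), declared further down. A hypothetical flag word for a step on which energies, temperature and pressure are all summed globally (the exact combination used in do_md varies from step to step):

    int cglo_flags = CGLO_GSTAT | CGLO_ENERGY | CGLO_TEMPERATURE | CGLO_PRESSURE;

    if (bStopCM)                   /* assumed caller-side condition */
    {
        cglo_flags |= CGLO_STOPCM; /* also remove center-of-mass momentum */
    }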
+
+
+/* return the number of steps between global communications */
+int check_nstglobalcomm(FILE *fplog,t_commrec *cr,
+                        int nstglobalcomm,t_inputrec *ir);
+
+/* check whether an 'nst'-style parameter p is a multiple of nst, and
+   set it to be one if not, with a warning. */
+void check_nst_param(FILE *fplog,t_commrec *cr,
+                     const char *desc_nst,int nst,
+                     const char *desc_p,int *p);
+
+/* check which of the multisim simulations has the shortest number of
+   steps and return that number of nsteps */
+gmx_large_int_t get_multisim_nsteps(const t_commrec *cr,
+                                    gmx_large_int_t nsteps);
+
+void rerun_parallel_comm(t_commrec *cr,t_trxframe *fr,
+                         gmx_bool *bNotLastFrame);
+
+/* get the conserved energy associated with the ensemble type*/
+real compute_conserved_from_auxiliary(t_inputrec *ir, t_state *state,
+                                      t_extmass *MassQ);
+
+/* set the lambda values at each step of mdrun when they change */
+void set_current_lambdas(gmx_large_int_t step, t_lambda *fepvals, gmx_bool bRerunMD,
+                         t_trxframe *rerun_fr, t_state *state_global, t_state *state, double lam0[]);
+
+int multisim_min(const gmx_multisim_t *ms,int nmin,int n);
+/* Set an appropriate value for n across the whole multi-simulation */
+
+int multisim_nstsimsync(const t_commrec *cr,
+                        const t_inputrec *ir,int repl_ex_nst);
+/* Determine the interval for inter-simulation communication */
+
+void init_global_signals(globsig_t *gs,const t_commrec *cr,
+                         const t_inputrec *ir,int repl_ex_nst);
+/* Constructor for globsig_t */
+
+void copy_coupling_state(t_state *statea,t_state *stateb,
+                         gmx_ekindata_t *ekinda,gmx_ekindata_t *ekindb, t_grpopts* opts);
+/* Copy stuff from state A to state B */
+
+void compute_globals(FILE *fplog, gmx_global_stat_t gstat, t_commrec *cr, t_inputrec *ir,
+                     t_forcerec *fr, gmx_ekindata_t *ekind,
+                     t_state *state, t_state *state_global, t_mdatoms *mdatoms,
+                     t_nrnb *nrnb, t_vcm *vcm, gmx_wallcycle_t wcycle,
+                     gmx_enerdata_t *enerd,tensor force_vir, tensor shake_vir, tensor total_vir,
+                     tensor pres, rvec mu_tot, gmx_constr_t constr,
+                     globsig_t *gs,gmx_bool bInterSimGS,
+                     matrix box, gmx_mtop_t *top_global, real *pcurr,
+                     int natoms, gmx_bool *bSumEkinhOld, int flags);
+/* Compute global variables during integration */
+
+void md_print_info(const t_commrec *cr, FILE *fplog,
+                   const char *fmt, ...);
+/* Print a general information message to stderr on the master node
+ * and to fplog if fplog!=NULL.
+ * fmt is a standard printf formatting string which should end in \n,
+ * the arguments after that contain the values to be printed, as in printf.
+ */
+
+void md_print_warn(const t_commrec *cr, FILE *fplog,
+                   const char *fmt, ...);
+/* As md_print_info above, but for important notices or warnings.
+ * The only difference from md_print_info is that a newline is printed
+ * before and after the message such that it stands out.
+ */ + +#ifdef __cplusplus +} +#endif + +#endif /* _md_support_h */ diff --git a/include/mdebin.h b/include/mdebin.h index 0235184487..3471fd8f64 100644 --- a/include/mdebin.h +++ b/include/mdebin.h @@ -67,18 +67,18 @@ typedef struct { int mde_n,mdeb_n; real *tmp_r; rvec *tmp_v; - gmx_bool bConstr; + gmx_bool bConstr; gmx_bool bConstrVir; gmx_bool bTricl; gmx_bool bDynBox; gmx_bool bNHC_trotter; gmx_bool bPrintNHChains; gmx_bool bMTTK; + gmx_bool bMu; /* true if dipole is calculated */ gmx_bool bDiagPres; gmx_bool bVir; gmx_bool bPress; gmx_bool bSurft; - gmx_bool bMu; int f_nre; int epc; real ref_p; diff --git a/include/mdrun.h b/include/mdrun.h dissimilarity index 62% index db7923a571..5914c054d2 100644 --- a/include/mdrun.h +++ b/include/mdrun.h @@ -1,462 +1,209 @@ -/* - * - * This source code is part of - * - * G R O M A C S - * - * GROningen MAchine for Chemical Simulations - * - * VERSION 3.2.0 - * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. - * Copyright (c) 1991-2000, University of Groningen, The Netherlands. - * Copyright (c) 2001-2004, The GROMACS development team, - * check out http://www.gromacs.org for more information. - - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * If you want to redistribute modifications, please consider that - * scientific software is very special. Version control is crucial - - * bugs must be traceable. We will be happy to consider code for - * inclusion in the official distribution, but derived work must not - * be called official GROMACS. Details are found in the README & COPYING - * files - if they are missing, get the official version at www.gromacs.org. - * - * To help us fund GROMACS development, we humbly ask that you cite - * the papers on the package - you can find them in the top README file. - * - * For more info, check our website at http://www.gromacs.org - * - * And Hey: - * Gromacs Runs On Most of All Computer Systems - */ - -#ifndef _mdrun_h -#define _mdrun_h - -#include -#include -#include "typedefs.h" -#include "network.h" -#include "tgroup.h" -#include "filenm.h" -#include "mshift.h" -#include "force.h" -#include "edsam.h" -#include "mdebin.h" -#include "vcm.h" -#include "vsite.h" -#include "pull.h" -#include "update.h" - - -#ifdef GMX_THREAD_MPI -#include "thread_mpi/threads.h" -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -#define MD_POLARISE (1<<2) -#define MD_IONIZE (1<<3) -#define MD_RERUN (1<<4) -#define MD_RERUN_VSITE (1<<5) -#define MD_FFSCAN (1<<6) -#define MD_SEPPOT (1<<7) -#define MD_PARTDEC (1<<9) -#define MD_DDBONDCHECK (1<<10) -#define MD_DDBONDCOMM (1<<11) -#define MD_CONFOUT (1<<12) -#define MD_REPRODUCIBLE (1<<13) -#define MD_READ_RNG (1<<14) -#define MD_APPENDFILES (1<<15) -#define MD_APPENDFILESSET (1<<21) -#define MD_KEEPANDNUMCPT (1<<16) -#define MD_READ_EKIN (1<<17) -#define MD_STARTFROMCPT (1<<18) -#define MD_RESETCOUNTERSHALFWAY (1<<19) - -/* Define a number of flags to better control the information - * passed to compute_globals in md.c and global_stat. 
- */ - -/* We are rerunning the simulation */ -#define CGLO_RERUNMD (1<<1) -/* we are computing the kinetic energy from average velocities */ -#define CGLO_EKINAVEVEL (1<<2) -/* we are removing the center of mass momenta */ -#define CGLO_STOPCM (1<<3) -/* bGStat is defined in do_md */ -#define CGLO_GSTAT (1<<4) -/* Sum the energy terms in global computation */ -#define CGLO_ENERGY (1<<6) -/* Sum the kinetic energy terms in global computation */ -#define CGLO_TEMPERATURE (1<<7) -/* Sum the kinetic energy terms in global computation */ -#define CGLO_PRESSURE (1<<8) -/* Sum the constraint term in global computation */ -#define CGLO_CONSTRAINT (1<<9) -/* we are using an integrator that requires iteration over some steps - currently not used*/ -#define CGLO_ITERATE (1<<10) -/* it is the first time we are iterating (or, only once through is required */ -#define CGLO_FIRSTITERATE (1<<11) -/* Reading ekin from the trajectory */ -#define CGLO_READEKIN (1<<12) -/* we need to reset the ekin rescaling factor here */ -#define CGLO_SCALEEKIN (1<<13) - -enum { - ddnoSEL, ddnoINTERLEAVE, ddnoPP_PME, ddnoCARTESIAN, ddnoNR -}; - -typedef struct { - double real; -#ifdef GMX_CRAY_XT3 - double proc; -#else - clock_t proc; -#endif - double realtime; - double proctime; - double time_per_step; - double last; - gmx_large_int_t nsteps_done; -} gmx_runtime_t; - -typedef struct { - t_fileio *fp_trn; - t_fileio *fp_xtc; - int xtc_prec; - ener_file_t fp_ene; - const char *fn_cpt; - gmx_bool bKeepAndNumCPT; - int eIntegrator; - gmx_bool bExpanded; - int elamstats; - int simulation_part; - FILE *fp_dhdl; - FILE *fp_field; -} gmx_mdoutf_t; - -/* Variables for temporary use with the deform option, - * used in runner.c and md.c. - * (These variables should be stored in the tpx file.) - */ -extern gmx_large_int_t deform_init_init_step_tpx; -extern matrix deform_init_box_tpx; -#ifdef GMX_THREAD_MPI -extern tMPI_Thread_mutex_t deform_init_box_mutex; - -/* The minimum number of atoms per thread. With fewer atoms than this, - * the number of threads will get lowered. - */ -#define MIN_ATOMS_PER_THREAD 90 -#endif - - -typedef double gmx_integrator_t(FILE *log,t_commrec *cr, - int nfile,const t_filenm fnm[], - const output_env_t oenv, gmx_bool bVerbose, - gmx_bool bCompact, int nstglobalcomm, - gmx_vsite_t *vsite,gmx_constr_t constr, - int stepout, - t_inputrec *inputrec, - gmx_mtop_t *mtop,t_fcdata *fcd, - t_state *state, - t_mdatoms *mdatoms, - t_nrnb *nrnb,gmx_wallcycle_t wcycle, - gmx_edsam_t ed, - t_forcerec *fr, - int repl_ex_nst, int repl_ex_nex, int repl_ex_seed, - gmx_membed_t membed, - real cpt_period,real max_hours, - const char *deviceOptions, - unsigned long Flags, - gmx_runtime_t *runtime); - -typedef struct gmx_global_stat *gmx_global_stat_t; - -/* ROUTINES from md.c */ - -gmx_integrator_t do_md; - -gmx_integrator_t do_md_openmm; - - - -/* ROUTINES from minimize.c */ - -gmx_integrator_t do_steep; -/* Do steepest descents EM */ - -gmx_integrator_t do_cg; -/* Do conjugate gradient EM */ - -gmx_integrator_t do_lbfgs; -/* Do conjugate gradient L-BFGS */ - -gmx_integrator_t do_nm; -/* Do normal mode analysis */ - -/* ROUTINES from tpi.c */ - -gmx_integrator_t do_tpi; -/* Do test particle insertion */ - - -/* ROUTINES from md_support.c */ - -/* return the number of steps between global communcations */ -int check_nstglobalcomm(FILE *fplog,t_commrec *cr, - int nstglobalcomm,t_inputrec *ir); - -/* check whether an 'nst'-style parameter p is a multiple of nst, and - set it to be one if not, with a warning. 
*/ -void check_nst_param(FILE *fplog,t_commrec *cr, - const char *desc_nst,int nst, - const char *desc_p,int *p); - -/* check which of the multisim simulations has the shortest number of - steps and return that number of nsteps */ -gmx_large_int_t get_multisim_nsteps(const t_commrec *cr, - gmx_large_int_t nsteps); - -void rerun_parallel_comm(t_commrec *cr,t_trxframe *fr, - gmx_bool *bNotLastFrame); - -/* get the conserved energy associated with the ensemble type*/ -real compute_conserved_from_auxiliary(t_inputrec *ir, t_state *state, - t_extmass *MassQ); - -/* set the lambda values at each step of mdrun when they change */ -void set_current_lambdas(gmx_large_int_t step, t_lambda *fepvals, gmx_bool bRerunMD, - t_trxframe *rerun_fr, t_state *state_global, t_state *state, double lam0[]); - -/* reset all cycle and time counters. */ -void reset_all_counters(FILE *fplog,t_commrec *cr, - gmx_large_int_t step, - gmx_large_int_t *step_rel,t_inputrec *ir, - gmx_wallcycle_t wcycle,t_nrnb *nrnb, - gmx_runtime_t *runtime); - - - -/* ROUTINES from sim_util.c */ -void do_pbc_first(FILE *log,matrix box,t_forcerec *fr, - t_graph *graph,rvec x[]); - -void do_pbc_first_mtop(FILE *fplog,int ePBC,matrix box, - gmx_mtop_t *mtop,rvec x[]); - -void do_pbc_mtop(FILE *fplog,int ePBC,matrix box, - gmx_mtop_t *mtop,rvec x[]); - - - -/* ROUTINES from stat.c */ -gmx_global_stat_t global_stat_init(t_inputrec *ir); - -void global_stat_destroy(gmx_global_stat_t gs); - -void global_stat(FILE *log,gmx_global_stat_t gs, - t_commrec *cr,gmx_enerdata_t *enerd, - tensor fvir,tensor svir,rvec mu_tot, - t_inputrec *inputrec, - gmx_ekindata_t *ekind, - gmx_constr_t constr,t_vcm *vcm, - int nsig,real *sig, - gmx_mtop_t *top_global, t_state *state_local, - gmx_bool bSumEkinhOld, int flags); -/* Communicate statistics over cr->mpi_comm_mysim */ - -gmx_mdoutf_t *init_mdoutf(int nfile,const t_filenm fnm[], - int mdrun_flags, - const t_commrec *cr,const t_inputrec *ir, - const output_env_t oenv); -/* Returns a pointer to a data structure with all output file pointers - * and names required by mdrun. - */ - -void done_mdoutf(gmx_mdoutf_t *of); -/* Close all open output files and free the of pointer */ - -#define MDOF_X (1<<0) -#define MDOF_V (1<<1) -#define MDOF_F (1<<2) -#define MDOF_XTC (1<<3) -#define MDOF_CPT (1<<4) - -void write_traj(FILE *fplog,t_commrec *cr, - gmx_mdoutf_t *of, - int mdof_flags, - gmx_mtop_t *top_global, - gmx_large_int_t step,double t, - t_state *state_local,t_state *state_global, - rvec *f_local,rvec *f_global, - int *n_xtc,rvec **x_xtc); -/* Routine that writes frames to trn, xtc and/or checkpoint. - * What is written is determined by the mdof_flags defined above. - * Data is collected to the master node only when necessary. - */ - -int do_per_step(gmx_large_int_t step,gmx_large_int_t nstep); -/* Return TRUE if io should be done */ - -int do_any_io(int step, t_inputrec *ir); - -/* ROUTINES from sim_util.c */ - -double gmx_gettime(); - -void print_time(FILE *out, gmx_runtime_t *runtime, - gmx_large_int_t step,t_inputrec *ir, t_commrec *cr); - -void runtime_start(gmx_runtime_t *runtime); - -void runtime_end(gmx_runtime_t *runtime); - -void runtime_upd_proc(gmx_runtime_t *runtime); -/* The processor time should be updated every once in a while, - * since on 32-bit manchines it loops after 72 minutes. 
- */ - -void print_date_and_time(FILE *log,int pid,const char *title, - const gmx_runtime_t *runtime); - -void nstop_cm(FILE *log,t_commrec *cr, - int start,int nr_atoms,real mass[],rvec x[],rvec v[]); - -void finish_run(FILE *log,t_commrec *cr,const char *confout, - t_inputrec *inputrec, - t_nrnb nrnb[],gmx_wallcycle_t wcycle, - gmx_runtime_t *runtime, - gmx_bool bWriteStat); - -void calc_enervirdiff(FILE *fplog,int eDispCorr,t_forcerec *fr); - -void calc_dispcorr(FILE *fplog,t_inputrec *ir,t_forcerec *fr, - gmx_large_int_t step, int natoms, - matrix box,real lambda,tensor pres,tensor virial, - real *prescorr, real *enercorr, real *dvdlcorr); - -void initialize_lambdas(FILE *fplog,t_inputrec *ir,int *fep_state,real *lambda,double *lam0); - -void init_npt_masses(t_inputrec *ir, t_state *state, t_extmass *MassQ, gmx_bool bInit); - -int ExpandedEnsembleDynamics(FILE *log,t_inputrec *ir, gmx_enerdata_t *enerd, - t_state *state, t_extmass *MassQ, df_history_t *dfhist, - gmx_large_int_t step, gmx_rng_t mcrng, - rvec *v, t_mdatoms *mdatoms); - -void PrintFreeEnergyInfoToFile(FILE *outfile, t_lambda *fep, t_expanded *expand, t_simtemp *simtemp, df_history_t *dfhist, - int nlam, int frequency, gmx_large_int_t step); - -void get_mc_state(gmx_rng_t rng,t_state *state); - -void set_mc_state(gmx_rng_t rng,t_state *state); - - -typedef enum -{ - LIST_SCALARS =0001, - LIST_INPUTREC =0002, - LIST_TOP =0004, - LIST_X =0010, - LIST_V =0020, - LIST_F =0040, - LIST_LOAD =0100 -} t_listitem; - -void check_nnodes_top(char *fn,t_topology *top); -/* Reset the tpr file to work with one node if necessary */ - - -/* check the version */ -void check_ir_old_tpx_versions(t_commrec *cr,FILE *fplog, - t_inputrec *ir,gmx_mtop_t *mtop); - -/* Allocate and initialize node-local state entries. */ -void set_state_entries(t_state *state,const t_inputrec *ir,int nnodes); - -/* Broadcast the data for a simulation, and allocate node-specific settings - such as rng generators. */ -void init_parallel(FILE *log, t_commrec *cr, t_inputrec *inputrec, - gmx_mtop_t *mtop); - - -void do_constrain_first(FILE *log,gmx_constr_t constr, - t_inputrec *inputrec,t_mdatoms *md, - t_state *state,rvec *f, - t_graph *graph,t_commrec *cr,t_nrnb *nrnb, - t_forcerec *fr, gmx_localtop_t *top, tensor shake_vir); - -void dynamic_load_balancing(gmx_bool bVerbose,t_commrec *cr,real capacity[], - int dimension,t_mdatoms *md,t_topology *top, - rvec x[],rvec v[],matrix box); -/* Perform load balancing, i.e. split the particles over processors - * based on their coordinates in the "dimension" direction. 
- */ - -int multisim_min(const gmx_multisim_t *ms,int nmin,int n); -/* Set an appropriate value for n across the whole multi-simulation */ - -int multisim_nstsimsync(const t_commrec *cr, - const t_inputrec *ir,int repl_ex_nst); -/* Determine the interval for inter-simulation communication */ - -void init_global_signals(globsig_t *gs,const t_commrec *cr, - const t_inputrec *ir,int repl_ex_nst); -/* Constructor for globsig_t */ - -void copy_coupling_state(t_state *statea,t_state *stateb, - gmx_ekindata_t *ekinda,gmx_ekindata_t *ekindb, t_grpopts* opts); -/* Copy stuff from state A to state B */ - -void compute_globals(FILE *fplog, gmx_global_stat_t gstat, t_commrec *cr, t_inputrec *ir, - t_forcerec *fr, gmx_ekindata_t *ekind, - t_state *state, t_state *state_global, t_mdatoms *mdatoms, - t_nrnb *nrnb, t_vcm *vcm, gmx_wallcycle_t wcycle, - gmx_enerdata_t *enerd,tensor force_vir, tensor shake_vir, tensor total_vir, - tensor pres, rvec mu_tot, gmx_constr_t constr, - globsig_t *gs,gmx_bool bInterSimGS, - matrix box, gmx_mtop_t *top_global, real *pcurr, - int natoms, gmx_bool *bSumEkinhOld, int flags); -/* Compute global variables during integration */ - -int mdrunner(int nthreads_requested, FILE *fplog,t_commrec *cr,int nfile, - const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose, - gmx_bool bCompact, int nstglobalcomm, ivec ddxyz,int dd_node_order, - real rdd, real rconstr, const char *dddlb_opt,real dlb_scale, - const char *ddcsx,const char *ddcsy,const char *ddcsz, - int nstepout, int resetstep, int nmultisim, int repl_ex_nst, int repl_ex_nex, - int repl_ex_seed, real pforce,real cpt_period,real max_hours, - const char *deviceOptions, unsigned long Flags); -/* Driver routine, that calls the different methods */ - -void md_print_warning(const t_commrec *cr,FILE *fplog,const char *buf); -/* Print a warning message to stderr on the master node - * and to fplog if fplog!=NULL. - */ - -void init_md(FILE *fplog, - t_commrec *cr,t_inputrec *ir, const output_env_t oenv, - double *t,double *t0, - real *lambda,int *fep_state, double *lam0, - t_nrnb *nrnb,gmx_mtop_t *mtop, - gmx_update_t *upd, - int nfile,const t_filenm fnm[], - gmx_mdoutf_t **outf,t_mdebin **mdebin, - tensor force_vir,tensor shake_vir, - rvec mu_tot, - gmx_bool *bSimAnn,t_vcm **vcm, - t_state *state, unsigned long Flags); - /* Routine in sim_util.c */ - -#ifdef __cplusplus -} -#endif - -#endif /* _mdrun_h */ +/* + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 3.2.0 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. 
+ * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gromacs Runs On Most of All Computer Systems + */ + +#ifndef _mdrun_h +#define _mdrun_h + +#include +#include +#include "typedefs.h" +#include "network.h" +#include "sim_util.h" +#include "tgroup.h" +#include "filenm.h" +#include "mshift.h" +#include "force.h" +#include "edsam.h" +#include "mdebin.h" +#include "vcm.h" +#include "vsite.h" +#include "pull.h" +#include "update.h" +#include "types/membedt.h" +#include "types/globsig.h" + + +#ifdef GMX_THREAD_MPI +#include "thread_mpi/threads.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define MD_POLARISE (1<<2) +#define MD_IONIZE (1<<3) +#define MD_RERUN (1<<4) +#define MD_RERUN_VSITE (1<<5) +#define MD_FFSCAN (1<<6) +#define MD_SEPPOT (1<<7) +#define MD_PARTDEC (1<<9) +#define MD_DDBONDCHECK (1<<10) +#define MD_DDBONDCOMM (1<<11) +#define MD_CONFOUT (1<<12) +#define MD_REPRODUCIBLE (1<<13) +#define MD_READ_RNG (1<<14) +#define MD_APPENDFILES (1<<15) +#define MD_APPENDFILESSET (1<<21) +#define MD_KEEPANDNUMCPT (1<<16) +#define MD_READ_EKIN (1<<17) +#define MD_STARTFROMCPT (1<<18) +#define MD_RESETCOUNTERSHALFWAY (1<<19) +#define MD_TUNEPME (1<<20) +#define MD_TESTVERLET (1<<22) + +enum { + ddnoSEL, ddnoINTERLEAVE, ddnoPP_PME, ddnoCARTESIAN, ddnoNR +}; + +typedef struct { + int nthreads_tot; /* Total number of threads requested (TMPI) */ + int nthreads_tmpi; /* Number of TMPI threads requested */ + int nthreads_omp; /* Number of OpenMP threads requested */ + int nthreads_omp_pme; /* As nthreads_omp, but for PME only nodes */ + gmx_bool bThreadPinning; /* Pin OpenMP threads to cores? */ + gmx_bool bPinHyperthreading; /* Pin pairs of threads to physical cores */ + int core_pinning_offset; /* Physical core pinning offset */ + char *gpu_id; /* GPU id's to use, each specified as chars */ +} gmx_hw_opt_t; + +/* Variables for temporary use with the deform option, + * used in runner.c and md.c. + * (These variables should be stored in the tpx file.) + */ +extern gmx_large_int_t deform_init_init_step_tpx; +extern matrix deform_init_box_tpx; +#ifdef GMX_THREAD_MPI +extern tMPI_Thread_mutex_t deform_init_box_mutex; + +/* The minimum number of atoms per tMPI thread. With fewer atoms than this, + * the number of threads will get lowered. 
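The gmx_hw_opt_t struct above gathers all user-requested run-configuration options in one place. A sketch of filling it in for a run roughly corresponding to "mdrun -ntomp 4" with thread pinning (assumed: zeroed fields leave the corresponding choice to automatic detection):

    gmx_hw_opt_t hw_opt = {0};     /* all-zero: automatic configuration   */

    hw_opt.nthreads_omp   = 4;     /* 4 OpenMP threads per (t)MPI rank    */
    hw_opt.bThreadPinning = TRUE;  /* pin OpenMP threads to cores         */
    hw_opt.gpu_id         = NULL;  /* no explicit GPU list: use detection */

The filled-in struct is then handed to mdrunner(), whose new signature appears below.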
+ */ +#define MIN_ATOMS_PER_MPI_THREAD 90 +#define MIN_ATOMS_PER_GPU 900 +#endif + + +typedef double gmx_integrator_t(FILE *log,t_commrec *cr, + int nfile,const t_filenm fnm[], + const output_env_t oenv, gmx_bool bVerbose, + gmx_bool bCompact, int nstglobalcomm, + gmx_vsite_t *vsite,gmx_constr_t constr, + int stepout, + t_inputrec *inputrec, + gmx_mtop_t *mtop,t_fcdata *fcd, + t_state *state, + t_mdatoms *mdatoms, + t_nrnb *nrnb,gmx_wallcycle_t wcycle, + gmx_edsam_t ed, + t_forcerec *fr, + int repl_ex_nst, int repl_ex_nex, int repl_ex_seed, + gmx_membed_t membed, + real cpt_period,real max_hours, + const char *deviceOptions, + unsigned long Flags, + gmx_runtime_t *runtime); + +/* ROUTINES from md.c */ + +gmx_integrator_t do_md; + +gmx_integrator_t do_md_openmm; + + + +/* ROUTINES from minimize.c */ + +gmx_integrator_t do_steep; +/* Do steepest descents EM */ + +gmx_integrator_t do_cg; +/* Do conjugate gradient EM */ + +gmx_integrator_t do_lbfgs; +/* Do conjugate gradient L-BFGS */ + +gmx_integrator_t do_nm; +/* Do normal mode analysis */ + +/* ROUTINES from tpi.c */ + +gmx_integrator_t do_tpi; +/* Do test particle insertion */ + +void init_npt_masses(t_inputrec *ir, t_state *state, t_extmass *MassQ, gmx_bool bInit); + +int ExpandedEnsembleDynamics(FILE *log,t_inputrec *ir, gmx_enerdata_t *enerd, + t_state *state, t_extmass *MassQ, df_history_t *dfhist, + gmx_large_int_t step, gmx_rng_t mcrng, + rvec *v, t_mdatoms *mdatoms); + +void PrintFreeEnergyInfoToFile(FILE *outfile, t_lambda *fep, t_expanded *expand, t_simtemp *simtemp, df_history_t *dfhist, + int nlam, int frequency, gmx_large_int_t step); + +void get_mc_state(gmx_rng_t rng,t_state *state); + +void set_mc_state(gmx_rng_t rng,t_state *state); + +/* check the version */ +void check_ir_old_tpx_versions(t_commrec *cr,FILE *fplog, + t_inputrec *ir,gmx_mtop_t *mtop); + +/* Allocate and initialize node-local state entries. */ +void set_state_entries(t_state *state,const t_inputrec *ir,int nnodes); + +/* Broadcast the data for a simulation, and allocate node-specific settings + such as rng generators. */ +void init_parallel(FILE *log, t_commrec *cr, t_inputrec *inputrec, + gmx_mtop_t *mtop); + +int mdrunner(gmx_hw_opt_t *hw_opt, + FILE *fplog,t_commrec *cr,int nfile, + const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose, + gmx_bool bCompact, int nstglobalcomm, ivec ddxyz,int dd_node_order, + real rdd, real rconstr, const char *dddlb_opt,real dlb_scale, + const char *ddcsx,const char *ddcsy,const char *ddcsz, + const char *nbpu_opt, + int nsteps_cmdline, int nstepout, int resetstep, + int nmultisim, int repl_ex_nst, int repl_ex_nex, + int repl_ex_seed, real pforce,real cpt_period,real max_hours, + const char *deviceOptions, unsigned long Flags); +/* Driver routine, that calls the different methods */ + +#ifdef __cplusplus +} +#endif + +#endif /* _mdrun_h */ diff --git a/include/mtop_util.h b/include/mtop_util.h index a9a43b25ee..c43e335129 100644 --- a/include/mtop_util.h +++ b/include/mtop_util.h @@ -50,13 +50,33 @@ gmx_mtop_finalize(gmx_mtop_t *mtop); int ncg_mtop(const gmx_mtop_t *mtop); +/* Removes the charge groups, i.e. 
makes single atom charge groups, in mtop */ +void gmx_mtop_remove_chargegroups(gmx_mtop_t *mtop); + + +/* Abstract data type for looking up atoms by global atom number */ +typedef struct gmx_mtop_atomlookup *gmx_mtop_atomlookup_t; + +/* Initialize atom lookup by global atom number */ +gmx_mtop_atomlookup_t +gmx_mtop_atomlookup_init(const gmx_mtop_t *mtop); + +/* As gmx_mtop_atomlookup_init, but optimized for atoms involved in settle */ +gmx_mtop_atomlookup_t +gmx_mtop_atomlookup_settle_init(const gmx_mtop_t *mtop); + +/* Destroy a gmx_mtop_atomlookup_t data structure */ +void +gmx_mtop_atomlookup_destroy(gmx_mtop_atomlookup_t alook); + /* Returns a pointer to the t_atom struct belonging to atnr_global. * This can be an expensive operation, so if possible use * one of the atom loop constructs below. */ void -gmx_mtop_atomnr_to_atom(const gmx_mtop_t *mtop,int atnr_global, +gmx_mtop_atomnr_to_atom(const gmx_mtop_atomlookup_t alook, + int atnr_global, t_atom **atom); @@ -64,7 +84,8 @@ gmx_mtop_atomnr_to_atom(const gmx_mtop_t *mtop,int atnr_global, * and the local atom number in the molecule belonging to atnr_global. */ void -gmx_mtop_atomnr_to_ilist(const gmx_mtop_t *mtop,int atnr_global, +gmx_mtop_atomnr_to_ilist(const gmx_mtop_atomlookup_t alook, + int atnr_global, t_ilist **ilist_mol,int *atnr_offset); @@ -74,7 +95,8 @@ gmx_mtop_atomnr_to_ilist(const gmx_mtop_t *mtop,int atnr_global, * belonging to atnr_global. */ void -gmx_mtop_atomnr_to_molblock_ind(const gmx_mtop_t *mtop,int atnr_global, +gmx_mtop_atomnr_to_molblock_ind(const gmx_mtop_atomlookup_t alook, + int atnr_global, int *molb,int *molnr,int *atnr_mol); diff --git a/include/mvdata.h b/include/mvdata.h index 21767fac18..3f49a0822b 100644 --- a/include/mvdata.h +++ b/include/mvdata.h @@ -37,6 +37,7 @@ #define _mvdata_h #include "typedefs.h" +#include "types/commrec.h" #ifdef __cplusplus extern "C" { diff --git a/include/names.h b/include/names.h index 057995ea9c..934ee29f6a 100644 --- a/include/names.h +++ b/include/names.h @@ -52,6 +52,7 @@ extern const char *etcoupl_names[etcNR+1]; extern const char *epcoupl_names[epcNR+1]; extern const char *epcoupltype_names[epctNR+1]; extern const char *erefscaling_names[erscNR+1]; +extern const char *ecutscheme_names[ecutsNR+1]; extern const char *ens_names[ensNR+1]; extern const char *ei_names[eiNR+1]; extern const char *yesno_names[BOOL_NR+1]; @@ -101,6 +102,7 @@ extern const char *eAdressSITEtype_names[eAdressSITENR+1]; #define ENUM_NAME(e,max,names) ((((e)<0)||((e)>=(max)))?UNDEFINED:(names)[e]) #define EBOOL(e) ENUM_NAME(e,BOOL_NR,bool_names) +#define ECUTSCHEME(e) ENUM_NAME(e,ecutsNR,ecutscheme_names) #define ENS(e) ENUM_NAME(e,ensNR,ens_names) #define EI(e) ENUM_NAME(e,eiNR,ei_names) #define EPBC(e) ENUM_NAME(e,epbcNR,epbc_names) diff --git a/include/nbnxn_cuda_data_mgmt.h b/include/nbnxn_cuda_data_mgmt.h new file mode 100644 index 0000000000..48e54632f5 --- /dev/null +++ b/include/nbnxn_cuda_data_mgmt.h @@ -0,0 +1,131 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS development team, + * check out http://www.gromacs.org for more information. 
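The gmx_mtop_atomlookup_t interface introduced in mtop_util.h above replaces per-call searches through gmx_mtop_t with a reusable lookup object, which matters when many global atom numbers are resolved in a loop. The intended pattern, sketched from the declarations (mtop and atnr_global assumed to be in scope):

    gmx_mtop_atomlookup_t alook;
    t_atom               *atom;

    alook = gmx_mtop_atomlookup_init(mtop);  /* build once, reuse often */

    gmx_mtop_atomnr_to_atom(alook, atnr_global, &atom);
    /* ... use atom->m, atom->q, ... */

    gmx_mtop_atomlookup_destroy(alook);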
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+
+#ifndef NBNXN_CUDA_DATA_MGMT_H
+#define NBNXN_CUDA_DATA_MGMT_H
+
+#include "types/simple.h"
+#include "types/interaction_const.h"
+#include "types/nbnxn_cuda_types_ext.h"
+#include "types/hw_info.h"
+
+#ifdef GMX_GPU
+#define FUNC_TERM ;
+#define FUNC_QUALIFIER
+#else
+#define FUNC_TERM {}
+#define FUNC_QUALIFIER static
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! Initializes the data structures related to CUDA nonbonded calculations. */
+FUNC_QUALIFIER
+void nbnxn_cuda_init(FILE *fplog,
+                     nbnxn_cuda_ptr_t *p_cu_nb,
+                     gmx_gpu_info_t *gpu_info, int my_gpu_index,
+                     /* true if both local and non-local are done on the GPU */
+                     gmx_bool bLocalAndNonlocal) FUNC_TERM
+
+/*! Initializes simulation constant data. */
+FUNC_QUALIFIER
+void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t p_cu_nb,
+                           const interaction_const_t *ic,
+                           const nonbonded_verlet_t *nbv) FUNC_TERM
+
+/*! Initializes pair-list data for GPU, called at every pair search step. */
+FUNC_QUALIFIER
+void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t cu_nb,
+                              const nbnxn_pairlist_t *h_nblist,
+                              int iloc) FUNC_TERM
+
+/*! Initializes atom-data on the GPU, called at every pair search step. */
+FUNC_QUALIFIER
+void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t cu_nb,
+                              const nbnxn_atomdata_t *atomdata) FUNC_TERM
+
+/*! \brief Update parameters during PME auto-tuning. */
+FUNC_QUALIFIER
+void nbnxn_cuda_pmetune_update_param(nbnxn_cuda_ptr_t cu_nb,
+                                     const interaction_const_t *ic) FUNC_TERM
+
+/*! Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). */
+FUNC_QUALIFIER
+void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t cu_nb,
+                                const nbnxn_atomdata_t *nbatom) FUNC_TERM
+
+/*! Clears GPU outputs: nonbonded force, shift force and energy. */
+FUNC_QUALIFIER
+void nbnxn_cuda_clear_outputs(nbnxn_cuda_ptr_t cu_nb,
+                              int flags) FUNC_TERM
+
+/*! Frees all GPU resources used for the nonbonded calculations. */
+FUNC_QUALIFIER
+void nbnxn_cuda_free(FILE *fplog,
+                     nbnxn_cuda_ptr_t cu_nb) FUNC_TERM
+
+/*! Returns the GPU timings structure or NULL if GPU is not used or timing is off. */
+FUNC_QUALIFIER
+wallclock_gpu_t * nbnxn_cuda_get_timings(nbnxn_cuda_ptr_t cu_nb)
+#ifdef GMX_GPU
+;
+#else
+{ return NULL; }
+#endif
+
+/*! Resets nonbonded GPU timings. */
+FUNC_QUALIFIER
+void nbnxn_cuda_reset_timings(nbnxn_cuda_ptr_t cu_nb) FUNC_TERM
+
+/*! Calculates the minimum size of proximity lists to improve SM load balance
+    with CUDA non-bonded kernels.
 */
+FUNC_QUALIFIER
+int nbnxn_cuda_min_ci_balanced(nbnxn_cuda_ptr_t cu_nb)
+#ifdef GMX_GPU
+;
+#else
+{ return -1; }
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#undef FUNC_TERM
+#undef FUNC_QUALIFIER
+
+#endif /* NBNXN_CUDA_DATA_MGMT_H */
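Taken together, the declarations above imply a call sequence like the following: one-time setup, a refresh at every pair-search step, and a final teardown. This is a sketch of a hypothetical caller, not code from the patch; iloc = 0 is assumed to select the local pair-list here:

    /* Once, at setup: */
    nbnxn_cuda_init(fplog, &cu_nb, gpu_info, my_gpu_index,
                    FALSE /* only local interactions on the GPU */);
    nbnxn_cuda_init_const(cu_nb, ic, nbv);

    /* At every pair-search step: */
    nbnxn_cuda_init_pairlist(cu_nb, h_nblist, 0 /* iloc */);
    nbnxn_cuda_init_atomdata(cu_nb, nbat);
    nbnxn_cuda_upload_shiftvec(cu_nb, nbat);  /* no-op for a static box */

    /* At the end of the run: */
    nbnxn_cuda_free(fplog, cu_nb);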
diff --git a/include/nbnxn_search.h b/include/nbnxn_search.h
new file mode 100644
index 0000000000..9c078c90fb
--- /dev/null
+++ b/include/nbnxn_search.h
@@ -0,0 +1,177 @@
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+
+#ifndef _nbnxn_search_h
+#define _nbnxn_search_h
+
+#include "typedefs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Tells if the pair-list corresponding to nb_kernel_type is simple.
+ * Returns FALSE for super-sub type pair-list.
+ */
+gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type);
+
+/* Due to the cluster size the effective pair-list is longer than
+ * that of a simple atom pair-list. This function gives the extra distance.
+ */
+real nbnxn_get_rlist_effective_inc(int cluster_size,real atom_density);
+
+/* Allocates and initializes a pair search data structure */
+void nbnxn_init_search(nbnxn_search_t * nbs_ptr,
+                       ivec *n_dd_cells,
+                       gmx_domdec_zones_t *zones,
+                       int nthread_max);
+
+/* Put the atoms on the pair search grid.
+ * Only atoms a0 to a1 in x are put on the grid.
+ * The atom_density is used to determine the grid size.
+ * When atom_density=-1, the density is determined from a1-a0 and the corners.
+ * With domain decomposition part of the n particles might have migrated,
+ * but have not been removed yet. This count is given by nmoved.
+ * When move[i] < 0 particle i has migrated and will not be put on the grid.
+ * Without domain decomposition move will be NULL.
+ */
+void nbnxn_put_on_grid(nbnxn_search_t nbs,
+                       int ePBC,matrix box,
+                       int dd_zone,
+                       rvec corner0,rvec corner1,
+                       int a0,int a1,
+                       real atom_density,
+                       const int *atinfo,
+                       rvec *x,
+                       int nmoved,int *move,
+                       int nb_kernel_type,
+                       nbnxn_atomdata_t *nbat);
+
+/* As nbnxn_put_on_grid, but for the non-local atoms
+ * with domain decomposition. Should be called after calling
+ * nbnxn_search_put_on_grid for the local atoms / home zone.
+ */
+void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
+                                const gmx_domdec_zones_t *zones,
+                                const int *atinfo,
+                                rvec *x,
+                                int nb_kernel_type,
+                                nbnxn_atomdata_t *nbat);
+
+/* Add simple grid type information to the local super/sub grid */
+void nbnxn_grid_add_simple(nbnxn_search_t nbs,
+                           nbnxn_atomdata_t *nbat);
+
+/* Return the number of x and y cells in the local grid */
+void nbnxn_get_ncells(nbnxn_search_t nbs,int *ncx,int *ncy);
+
+/* Return the order indices *a of the atoms on the ns grid, size n */
+void nbnxn_get_atomorder(nbnxn_search_t nbs,int **a,int *n);
+
+/* Renumber the atom indices on the grid to consecutive order */
+void nbnxn_set_atomorder(nbnxn_search_t nbs);
+
+/* Initializes a set of pair lists stored in nbnxn_pairlist_set_t */
+void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
+                             gmx_bool simple, gmx_bool combined,
+                             gmx_nbat_alloc_t *alloc,
+                             gmx_nbat_free_t *free);
+
+/* Make a pair-list with radius rlist, store it in nbl.
+ * The parameter min_ci_balanced sets the minimum required
+ * number of roughly equally sized ci blocks in nbl.
+ * When set >0 ci lists will be chopped up when the estimate
+ * for the number of equally sized lists is below min_ci_balanced.
+ */
+void nbnxn_make_pairlist(const nbnxn_search_t nbs,
+                         const nbnxn_atomdata_t *nbat,
+                         const t_blocka *excl,
+                         real rlist,
+                         int min_ci_balanced,
+                         nbnxn_pairlist_set_t *nbl_list,
+                         int iloc,
+                         int nb_kernel_type,
+                         t_nrnb *nrnb);
+
+/* Initialize the non-bonded atom data structure.
+ * The enum for nbatXFormat is in the file defining nbnxn_atomdata_t.
+ * Copy the ntypes*ntypes*2 sized nbfp non-bonded parameter list
+ * to the atom data structure.
+ */
+void nbnxn_atomdata_init(FILE *fp,
+                         nbnxn_atomdata_t *nbat,
+                         int nb_kernel_type,
+                         int ntype,const real *nbfp,
+                         int n_energygroups,
+                         int nout,
+                         gmx_nbat_alloc_t *alloc,
+                         gmx_nbat_free_t *free);
+
+/* Copy the atom data to the non-bonded atom data structure */
+void nbnxn_atomdata_set(nbnxn_atomdata_t *nbat,
+                        int locality,
+                        const nbnxn_search_t nbs,
+                        const t_mdatoms *mdatoms,
+                        const int *atinfo);
+
+/* Copy the shift vectors to nbat */
+void nbnxn_atomdata_copy_shiftvec(gmx_bool dynamic_box,
+                                  rvec *shift_vec,
+                                  nbnxn_atomdata_t *nbat);
+
+/* Copy x to nbat->x.
+ * FillLocal tells if the local filler particle coordinates should be zeroed.
+ */
+void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search_t nbs,
+                                     int locality,
+                                     gmx_bool FillLocal,
+                                     rvec *x,
+                                     nbnxn_atomdata_t *nbat);
+
+/* Add the forces stored in nbat to f, zeros the forces in nbat */
+void nbnxn_atomdata_add_nbat_f_to_f(const nbnxn_search_t nbs,
+                                    int locality,
+                                    const nbnxn_atomdata_t *nbat,
+                                    rvec *f);
+
+/* Add the fshift force stored in nbat to fshift */
+void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
+                                              rvec *fshift);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
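The search interface above is used in two phases: the atoms are put on the grid, then the pair lists are built from the grid, and before each force evaluation the coordinates in the nbnxn atom data are refreshed. A compressed sketch for a run without domain decomposition (all variables assumed to be set up by the caller; the value 0 is assumed to denote the local list/locality):

    /* Grid the atoms; atom_density = -1 derives the density from the
     * atom count and the corners. */
    nbnxn_put_on_grid(nbs, ePBC, box,
                      0,              /* dd_zone 0: home zone            */
                      corner0, corner1,
                      0, natoms,      /* put atoms a0 = 0 .. a1 = natoms */
                      -1,             /* derive atom density             */
                      atinfo, x,
                      0, NULL,        /* no moved atoms without DD       */
                      nb_kernel_type, nbat);

    /* Build the pair lists with the buffered cut-off rlist. */
    nbnxn_make_pairlist(nbs, nbat, excl, rlist, min_ci_balanced,
                        nbl_list, 0 /* iloc */, nb_kernel_type, nrnb);

    /* Before each force evaluation, refresh the coordinates in nbat. */
    nbnxn_atomdata_copy_x_to_nbat_x(nbs, 0 /* locality */, TRUE, x, nbat);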
diff --git a/include/network.h b/include/network.h
index daa5968921..f7d6e4aff5 100644
--- a/include/network.h
+++ b/include/network.h
@@ -70,6 +70,9 @@ int gmx_hostname_num(void);
 void gmx_setup_nodecomm(FILE *fplog,t_commrec *cr);
 /* Sets up fast global communication for clusters with multi-core nodes */
 
+void gmx_init_intra_counters(t_commrec *cr);
+/* Initializes intra-node process counts and ID. */
+
 gmx_bool gmx_mpi_initialized(void);
 /* return TRUE when MPI_Init has been called.
  * return FALSE when MPI_Init has not been called OR
diff --git a/include/nrnb.h b/include/nrnb.h
index 9865fee2c5..2c4b15bbc1 100644
--- a/include/nrnb.h
+++ b/include/nrnb.h
@@ -37,6 +37,7 @@
 #define _nrnb_h
 
 #include "typedefs.h"
+#include "types/commrec.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -66,7 +67,8 @@ void print_flop(FILE *out,t_nrnb *nrnb,double *nbfs,double *mflop);
 
 void print_perf(FILE *out,double nodetime,double realtime,int nprocs,
                 gmx_large_int_t nsteps,real delta_t,
-                double nbfs,double mflop);
+                double nbfs,double mflop,
+                int omp_nth_pp);
 /* Prints the performance, nbfs and mflop come from print_flop */
 
 void pr_load(FILE *log,t_commrec *cr,t_nrnb nrnb[]);
diff --git a/include/nsgrid.h b/include/nsgrid.h
index 0c427cc560..f0785477cb 100644
--- a/include/nsgrid.h
+++ b/include/nsgrid.h
@@ -50,13 +50,18 @@ extern "C" {
  * to account for less dense regions at the edges of the system.
  */
 
+#define NSGRID_SIGNAL_MOVED_FAC 4
+/* A cell index of NSGRID_SIGNAL_MOVED_FAC*ncells signals
+ * that a charge group moved to another DD domain.
+ */
+
 t_grid *init_grid(FILE *fplog,t_forcerec *fr);
 
 void done_grid(t_grid *grid);
 
-void get_nsgrid_boundaries(t_grid *grid,
-                           gmx_domdec_t *dd,
-                           matrix box,gmx_ddbox_t *ddbox,
+void get_nsgrid_boundaries(int nboundeddim,matrix box,
+                           gmx_domdec_t *dd,
+                           gmx_ddbox_t *ddbox,
                            rvec *gr0,rvec *gr1,
                            int ncg,rvec *cgcm,
                            rvec grid_x0,rvec grid_x1,
diff --git a/include/pbc.h b/include/pbc.h
index 03bf49be03..e3c49ca249 100644
--- a/include/pbc.h
+++ b/include/pbc.h
@@ -38,6 +38,7 @@
 
 #include "sysstuff.h"
 #include "typedefs.h"
+#include "types/commrec.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -188,9 +189,15 @@ extern "C" {
  * The index consists of NCUCEDGE pairs of vertex indices.
  * The index does not change, so it needs to be retrieved only once.
  */
- void put_atom_in_box(matrix box,rvec x);
- void put_atoms_in_box(matrix box,int natoms,rvec x[]);
+ void put_atoms_in_box_omp(int ePBC,matrix box,int natoms,rvec x[]);
+ /* This wrapper function around put_atoms_in_box() with the ugly manual
+  * workload splitting is needed to avoid silently introducing multithreading
+  * in tools.
+  */
+
+
+ void put_atoms_in_box(int ePBC, matrix box,int natoms,rvec x[]);
 /* These routines puts ONE or ALL atoms in the box, not caring
  * about charge groups!
  * Also works for triclinic cells.
diff --git a/include/physics.h b/include/physics.h
index e72608c8b5..e707f0af20 100644
--- a/include/physics.h
+++ b/include/physics.h
@@ -136,6 +136,27 @@ extern "C" {
 #define unit_density_SI unit_mass_SI "/" unit_length_SI "^3"
 #define unit_invvisc_SI unit_length_SI " " unit_time_SI "/" unit_mass_SI
 
+  /* The routines below can be used for converting units from or to GROMACS
+     internal units. */
+  enum { eg2cAngstrom, eg2cNm, eg2cBohr, eg2cKcal_Mole,
+         eg2cHartree, eg2cHartree_e, eg2cAngstrom3, eg2cCoulomb,
+         eg2cDebye, eg2cElectron, eg2cBuckingham, eg2cNR };
+
+  /* Convert value x to GROMACS units. Energy -> Energy, Length -> Length etc.
+     The type of x is deduced from unit,
+     which should be taken from the enum above. */
+  extern double convert2gmx(double x,int unit);
+
+  /* Convert value x from GROMACS units to the desired one.
+     The type of return value is deduced from unit, see above */
+  extern double gmx2convert(double x,int unit);
+
+  /* Convert the string to one of the units supported. Returns -1 if not found. */
+  extern int string2unit(char *string);
+
+  /* Convert the unit to a string. Return NULL when unit is out of range.
*/ + extern const char *unit2string(int unit); + #ifdef __cplusplus } #endif diff --git a/src/kernel/gmx_gpu_utils/gmx_gpu_utils.h b/include/pmalloc_cuda.h similarity index 71% copy from src/kernel/gmx_gpu_utils/gmx_gpu_utils.h copy to include/pmalloc_cuda.h index 76070804ea..4cc2c76097 100644 --- a/src/kernel/gmx_gpu_utils/gmx_gpu_utils.h +++ b/include/pmalloc_cuda.h @@ -1,56 +1,63 @@ /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- * - * + * * This source code is part of - * + * * G R O M A C S - * + * * GROningen MAchine for Chemical Simulations - * + * * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. * Copyright (c) 1991-2000, University of Groningen, The Netherlands. - * Copyright (c) 2001-2010, The GROMACS development team, + * Copyright (c) 2001-2012, The GROMACS development team, * check out http://www.gromacs.org for more information. - + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. - * + * * If you want to redistribute modifications, please consider that * scientific software is very special. Version control is crucial - * bugs must be traceable. We will be happy to consider code for * inclusion in the official distribution, but derived work must not * be called official GROMACS. Details are found in the README & COPYING * files - if they are missing, get the official version at www.gromacs.org. - * + * * To help us fund GROMACS development, we humbly ask that you cite * the papers on the package - you can find them in the top README file. - * + * * For more info, check our website at http://www.gromacs.org - * + * * And Hey: * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon */ -#ifndef _GMX_GPU_UTILS_H_ -#define _GMX_GPU_UTILS_H_ +#ifndef PMALLOC_CUDA_H +#define PMALLOC_CUDA_H -#ifndef __cplusplus -extern "C" { +#ifdef GMX_GPU +#define FUNC_TERM ; +#else +#define FUNC_TERM {} #endif -int do_quick_memtest(int /*dev_id*/); +#ifdef __cplusplus +extern "C" { +#endif -int do_full_memtest(int /*dev_id*/); +/*! Allocates nbytes of page-locked memory. */ +void pmalloc(void **h_ptr, size_t nbytes) FUNC_TERM -int do_timed_memtest(int /*dev_id*/, int /*time_limit*/); +/*! Allocates nbytes of page-locked memory with write-combining. */ +void pmalloc_wc(void **h_ptr, size_t nbytes) FUNC_TERM -int is_supported_cuda_gpu(int /*dev_id*/, char* /*gpu_name*/); +/*! Frees page locked memory allocated with pmalloc. */ +void pfree(void *h_ptr) FUNC_TERM -#ifndef __cplusplus -} /* extern "C" */ +#ifdef __cplusplus +} #endif +#endif /* PMALLOC_CUDA_H */ -#endif // _GMX_GPU_UTILS_H_ diff --git a/include/pme.h b/include/pme.h index b2765b37a2..576b0a394c 100644 --- a/include/pme.h +++ b/include/pme.h @@ -53,9 +53,19 @@ int gmx_pme_init(gmx_pme_t *pmedata,t_commrec *cr, int nnodes_major,int nnodes_minor, t_inputrec *ir,int homenr, gmx_bool bFreeEnergy, gmx_bool bReproducible, int nthread); +/* Initialize the pme data structures resepectively. + * Return value 0 indicates all well, non zero is an error code. 
+ */
+int gmx_pme_reinit(gmx_pme_t *        pmedata,
+                   t_commrec *        cr,
+                   gmx_pme_t          pme_src,
+                   const t_inputrec * ir,
+                   ivec               grid_size);
+/* As gmx_pme_init, but takes most settings, except the grid, from pme_src */
+
 int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata);
-/* Initialize and destroy the pme data structures resepectively.
+/* Destroy the pme data structures.
  * Return value 0 indicates all well, non zero is an error code.
  */
@@ -117,9 +127,12 @@ void gmx_pme_send_x(t_commrec *cr, matrix box, rvec *x, gmx_large_int_t step);
 /* Send the coordinates to our PME-only node and request a PME calculation */
 
-void gmx_pme_finish(t_commrec *cr);
+void gmx_pme_send_finish(t_commrec *cr);
 /* Tell our PME-only node to finish */
 
+void gmx_pme_send_switch(t_commrec *cr, ivec grid_size, real ewaldcoeff);
+/* Tell our PME-only node to switch to a new grid size */
+
 void gmx_pme_receive_f(t_commrec *cr,
                        rvec f[], matrix vir,
                        real *energy, real *dvdlambda,
@@ -127,14 +140,18 @@ void gmx_pme_receive_f(t_commrec *cr,
 /* PP nodes receive the long range forces from the PME nodes */
 
 int gmx_pme_recv_q_x(gmx_pme_pp_t pme_pp,
-                     real **chargeA, real **chargeB,
-                     matrix box, rvec **x,rvec **f,
-                     int *maxshift_x,int *maxshift_y,
-                     gmx_bool *bFreeEnergy,real *lambda,
-                     gmx_bool *bEnerVir,
-                     gmx_large_int_t *step);
+                     real **chargeA, real **chargeB,
+                     matrix box, rvec **x,rvec **f,
+                     int *maxshift_x, int *maxshift_y,
+                     gmx_bool *bFreeEnergy, real *lambda,
+                     gmx_bool *bEnerVir,
+                     gmx_large_int_t *step,
+                     ivec grid_size, real *ewaldcoeff);
 /* Receive charges and/or coordinates from the PP-only nodes.
  * Returns the number of atoms, or -1 when the run is finished.
+ * In the special case of a PME grid size switch request, -2 is returned
+ * and grid_size and *ewaldcoeff are set, which are otherwise not set.
  */
 
 void gmx_pme_send_force_vir_ener(gmx_pme_pp_t pme_pp,
diff --git a/include/sim_util.h b/include/sim_util.h
new file mode 100644
index 0000000000..9282f25107
--- /dev/null
+++ b/include/sim_util.h
@@ -0,0 +1,202 @@
+/*
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.0
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gromacs Runs On Most of All Computer Systems + */ + +#ifndef _sim_util_h +#define _sim_util_h + +#include +#include "typedefs.h" +#include "enxio.h" +#include "mdebin.h" +#include "update.h" +#include "vcm.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + t_fileio *fp_trn; + t_fileio *fp_xtc; + int xtc_prec; + ener_file_t fp_ene; + const char *fn_cpt; + gmx_bool bKeepAndNumCPT; + int eIntegrator; + gmx_bool bExpanded; + int elamstats; + int simulation_part; + FILE *fp_dhdl; + FILE *fp_field; +} gmx_mdoutf_t; + +typedef struct gmx_global_stat *gmx_global_stat_t; + +typedef struct { + double real; +#ifdef GMX_CRAY_XT3 + double proc; +#else + clock_t proc; +#endif + double realtime; + double proctime; + double time_per_step; + double last; + gmx_large_int_t nsteps_done; +} gmx_runtime_t; + + +void do_pbc_first(FILE *log,matrix box,t_forcerec *fr, + t_graph *graph,rvec x[]); + +void do_pbc_first_mtop(FILE *fplog,int ePBC,matrix box, + gmx_mtop_t *mtop,rvec x[]); + +void do_pbc_mtop(FILE *fplog,int ePBC,matrix box, + gmx_mtop_t *mtop,rvec x[]); + + + +/* ROUTINES from stat.c */ +gmx_global_stat_t global_stat_init(t_inputrec *ir); + +void global_stat_destroy(gmx_global_stat_t gs); + +void global_stat(FILE *log,gmx_global_stat_t gs, + t_commrec *cr,gmx_enerdata_t *enerd, + tensor fvir,tensor svir,rvec mu_tot, + t_inputrec *inputrec, + gmx_ekindata_t *ekind, + gmx_constr_t constr,t_vcm *vcm, + int nsig,real *sig, + gmx_mtop_t *top_global, t_state *state_local, + gmx_bool bSumEkinhOld, int flags); +/* Communicate statistics over cr->mpi_comm_mysim */ + +gmx_mdoutf_t *init_mdoutf(int nfile,const t_filenm fnm[], + int mdrun_flags, + const t_commrec *cr,const t_inputrec *ir, + const output_env_t oenv); +/* Returns a pointer to a data structure with all output file pointers + * and names required by mdrun. + */ + +void done_mdoutf(gmx_mdoutf_t *of); +/* Close all open output files and free the of pointer */ + +#define MDOF_X (1<<0) +#define MDOF_V (1<<1) +#define MDOF_F (1<<2) +#define MDOF_XTC (1<<3) +#define MDOF_CPT (1<<4) + +void write_traj(FILE *fplog,t_commrec *cr, + gmx_mdoutf_t *of, + int mdof_flags, + gmx_mtop_t *top_global, + gmx_large_int_t step,double t, + t_state *state_local,t_state *state_global, + rvec *f_local,rvec *f_global, + int *n_xtc,rvec **x_xtc); +/* Routine that writes frames to trn, xtc and/or checkpoint. + * What is written is determined by the mdof_flags defined above. + * Data is collected to the master node only when necessary. + */ + +int do_per_step(gmx_large_int_t step,gmx_large_int_t nstep); +/* Return TRUE if io should be done */ + +/* ROUTINES from sim_util.c */ + +double gmx_gettime(); + +void print_time(FILE *out, gmx_runtime_t *runtime, + gmx_large_int_t step,t_inputrec *ir, t_commrec *cr); + +void runtime_start(gmx_runtime_t *runtime); + +void runtime_end(gmx_runtime_t *runtime); + +void runtime_upd_proc(gmx_runtime_t *runtime); +/* The processor time should be updated every once in a while, + * since on 32-bit manchines it loops after 72 minutes. 
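The MDOF_* bits select what write_traj() emits on a given step; combined with do_per_step() they give the usual output gating. A sketch of a hypothetical caller that mirrors, but is not copied from, the logic in do_md:

    int mdof_flags = 0;

    if (do_per_step(step, ir->nstxout))   { mdof_flags |= MDOF_X;   }
    if (do_per_step(step, ir->nstvout))   { mdof_flags |= MDOF_V;   }
    if (do_per_step(step, ir->nstfout))   { mdof_flags |= MDOF_F;   }
    if (do_per_step(step, ir->nstxtcout)) { mdof_flags |= MDOF_XTC; }
    if (bCPT)                             { mdof_flags |= MDOF_CPT; }

    if (mdof_flags != 0)
    {
        write_traj(fplog, cr, outf, mdof_flags, top_global, step, t,
                   state, state_global, f, f_global, &n_xtc, &x_xtc);
    }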
+ */ + +void print_date_and_time(FILE *log,int pid,const char *title, + const gmx_runtime_t *runtime); + +void finish_run(FILE *log,t_commrec *cr,const char *confout, + t_inputrec *inputrec, + t_nrnb nrnb[],gmx_wallcycle_t wcycle, + gmx_runtime_t *runtime, + wallclock_gpu_t *gputimes, + int omp_nth_pp, + gmx_bool bWriteStat); + +void calc_enervirdiff(FILE *fplog,int eDispCorr,t_forcerec *fr); + +void calc_dispcorr(FILE *fplog,t_inputrec *ir,t_forcerec *fr, + gmx_large_int_t step, int natoms, + matrix box,real lambda,tensor pres,tensor virial, + real *prescorr, real *enercorr, real *dvdlcorr); + +void initialize_lambdas(FILE *fplog,t_inputrec *ir,int *fep_state,real *lambda,double *lam0); + +void do_constrain_first(FILE *log,gmx_constr_t constr, + t_inputrec *inputrec,t_mdatoms *md, + t_state *state,rvec *f, + t_graph *graph,t_commrec *cr,t_nrnb *nrnb, + t_forcerec *fr, gmx_localtop_t *top, tensor shake_vir); + +void init_md(FILE *fplog, + t_commrec *cr,t_inputrec *ir, const output_env_t oenv, + double *t,double *t0, + real *lambda,int *fep_state, double *lam0, + t_nrnb *nrnb,gmx_mtop_t *mtop, + gmx_update_t *upd, + int nfile,const t_filenm fnm[], + gmx_mdoutf_t **outf,t_mdebin **mdebin, + tensor force_vir,tensor shake_vir, + rvec mu_tot, + gmx_bool *bSimAnn,t_vcm **vcm, + t_state *state, unsigned long Flags); + /* Routine in sim_util.c */ + +#ifdef __cplusplus +} +#endif + +#endif /* _sim_util_h */ diff --git a/include/smalloc.h b/include/smalloc.h index c032e703e1..88ad9f903a 100644 --- a/include/smalloc.h +++ b/include/smalloc.h @@ -141,8 +141,10 @@ size_t memavail(void); /* Aligned-memory counterparts */ +void *save_malloc_aligned(const char *name,const char *file,int line, + unsigned nelem,size_t elsize,size_t alignment); void *save_calloc_aligned(const char *name,const char *file,int line, - unsigned nelem,size_t elsize,size_t alignment); + unsigned nelem,size_t elsize,size_t alignment); void save_free_aligned(const char *name,const char *file,int line, void *ptr); #ifdef __cplusplus diff --git a/src/kernel/membed.h b/include/tables.h similarity index 61% copy from src/kernel/membed.h copy to include/tables.h index 6e03136367..ec942425a0 100644 --- a/src/kernel/membed.h +++ b/include/tables.h @@ -1,11 +1,12 @@ -/* +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- * + * * This source code is part of - * + * * G R O M A C S - * + * * GROningen MAchine for Chemical Simulations - * + * * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2012, The GROMACS development team, @@ -15,41 +16,49 @@ * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. - * + * * If you want to redistribute modifications, please consider that * scientific software is very special. Version control is crucial - * bugs must be traceable. We will be happy to consider code for * inclusion in the official distribution, but derived work must not * be called official GROMACS. Details are found in the README & COPYING * files - if they are missing, get the official version at www.gromacs.org. - * + * * To help us fund GROMACS development, we humbly ask that you cite * the papers on the package - you can find them in the top README file. 
- * + * For more info, check our website at http://www.gromacs.org - * + * * And Hey: * Gromacs Runs On Most of All Computer Systems */ -#ifndef _gmx_membed_h -#define _gmx_membed_h +#ifndef _tables_h +#define _tables_h -#include "typedefs.h" #ifdef __cplusplus extern "C" { #endif -/* initialisation of membed code */ -gmx_membed_t init_membed(FILE *fplog, int nfile, const t_filenm fnm[], gmx_mtop_t *mtop, - t_inputrec *inputrec, t_state *state, t_commrec *cr, real *cpt); -/* rescaling the coordinates voor de membed code */ -void rescale_membed(int step_rel, gmx_membed_t membed, rvec *x); +void table_spline3_fill_ewald_lr(real *tabf,real *tabv, + int ntab,int tableformat, + real dr,real beta); +/* Fill table tabf of size ntab with spacing dr with the ewald long-range + * (mesh) force and, with tableformatF and tabv!=NULL, also fill tabv with the energy. + * With tableformatFDV0 the size of the tabf array should be ntab*4, tabv=NULL. + * This function interpolates the Ewald mesh potential contribution + * with coefficient beta using a quadratic spline. + * The force can then be interpolated linearly. + */ + +real ewald_spline3_table_scale(real ewaldcoeff,real rc); +/* Return the scaling for the Ewald quadratic spline tables. */ + #ifdef __cplusplus } #endif -#endif +#endif /* _tables_h */ diff --git a/include/thread_mpi/atomic/gcc_intrinsics.h b/include/thread_mpi/atomic/gcc_intrinsics.h index 49db56b102..9d9f89b4d5 100644 --- a/include/thread_mpi/atomic/gcc_intrinsics.h +++ b/include/thread_mpi/atomic/gcc_intrinsics.h @@ -90,7 +90,7 @@ static inline void* tMPI_Atomic_ptr_swap(tMPI_Atomic_ptr_t *a, void *b) static inline int tMPI_Atomic_ptr_cas(tMPI_Atomic_ptr_t* a, void *oldval, void *newval) { -#if !defined(__INTEL_COMPILER) +#if !defined(__INTEL_COMPILER) && !defined(__CUDACC__) return __sync_bool_compare_and_swap( &(a->value), oldval, newval); #else /* the intel compilers need integer type arguments for compare_and_swap. diff --git a/include/thread_mpi/mpi_bindings.h b/include/thread_mpi/mpi_bindings.h index a3338c2ea5..4f464f976d 100644 --- a/include/thread_mpi/mpi_bindings.h +++ b/include/thread_mpi/mpi_bindings.h @@ -76,7 +76,6 @@ typedef struct tmpi_datatype_ *MPI_Datatype; typedef tMPI_Op MPI_Op; - #define MPI_CHAR TMPI_CHAR #define MPI_SHORT TMPI_SHORT #define MPI_INT TMPI_INT diff --git a/include/typedefs.h b/include/typedefs.h index 7b01ff14de..caac2ce977 100644 --- a/include/typedefs.h +++ b/include/typedefs.h @@ -37,9 +37,8 @@ #define _typedefs_h -#define STRLEN 4096 +/* DEPRECATED!
value for signaling uninitialized variables */ #define NOTSET -12345 -#define BIG_STRLEN 1048576 #include #include "sysstuff.h" @@ -57,8 +56,8 @@ #include "types/graph.h" #include "types/nrnb.h" #include "types/nblist.h" +#include "types/nbnxn_pairlist.h" #include "types/nsgrid.h" -#include "types/commrec.h" #include "types/forcerec.h" #include "types/fcdata.h" #include "types/mdatom.h" @@ -71,10 +70,6 @@ #include "types/constr.h" #include "types/matrix.h" #include "types/oenv.h" -#include "types/globsig.h" -#include "types/nlistheuristics.h" -#include "types/iteratedconstraints.h" -#include "types/membedt.h" #ifdef __cplusplus extern "C" { diff --git a/include/types/commrec.h b/include/types/commrec.h index 0d2fce9813..793bb6e0b6 100644 --- a/include/types/commrec.h +++ b/include/types/commrec.h @@ -61,8 +61,8 @@ extern "C" { typedef struct gmx_domdec_master *gmx_domdec_master_p_t; typedef struct { - int j0; /* j-cell start */ - int j1; /* j-cell end */ + int j0; /* j-zone start */ + int j1; /* j-zone end */ int cg1; /* i-charge-group end */ int jcg0; /* j-charge-group start */ int jcg1; /* j-charge-group end */ @@ -71,6 +71,13 @@ } gmx_domdec_ns_ranges_t; typedef struct { + rvec x0; /* Zone lower corner in triclinic coordinates */ + rvec x1; /* Zone upper corner in triclinic coordinates */ + rvec bb_x0; /* Zone bounding box lower corner in Cartesian coords */ + rvec bb_x1; /* Zone bounding box upper corner in Cartesian coords */ +} gmx_domdec_zone_size_t; + +typedef struct { /* The number of zones including the home zone */ int n; /* The shift of the zones with respect to the home zone */ @@ -81,10 +88,16 @@ int nizone; /* The neighbor search charge group ranges for each i-zone */ gmx_domdec_ns_ranges_t izone[DD_MAXIZONE]; + /* Boundaries of the zones */ + gmx_domdec_zone_size_t size[DD_MAXZONE]; + /* The cg density of the home zone */ + real dens_zone0; } gmx_domdec_zones_t; typedef struct gmx_ga2la *gmx_ga2la_t; +typedef struct gmx_hash *gmx_hash_t; + typedef struct gmx_reverse_top *gmx_reverse_top_p_t; typedef struct gmx_domdec_constraints *gmx_domdec_constraints_p_t; @@ -169,6 +182,7 @@ typedef struct { /* Are there inter charge group constraints */ gmx_bool bInterCGcons; + gmx_bool bInterCGsettles; /* Global atom number to interaction list */ gmx_reverse_top_p_t reverse_top; @@ -179,7 +193,7 @@ typedef struct { int n_intercg_excl; /* Vsite stuff */ - int *ga2la_vsite; + gmx_hash_t ga2la_vsite; gmx_domdec_specat_comm_p_t vsite_comm; /* Constraint stuff */ @@ -261,6 +275,12 @@ typedef struct { MPI_Comm mpi_comm_mysim; MPI_Comm mpi_comm_mygroup; + /* intra-node stuff */ + int nodeid_intra; /* ID over all intra nodes */ + int nodeid_group_intra; /* ID within my group (separate 0-n IDs for PP/PME-only nodes) */ + int nnodes_intra; /* total number of intra nodes */ + int nnodes_pp_intra; /* total number of PP intra nodes */ + #ifdef GMX_THREAD_SHM_FDECOMP gmx_commrec_thread_t thread; #endif diff --git a/include/types/enums.h b/include/types/enums.h index e081177f36..ed01ad6e5f 100644 --- a/include/types/enums.h +++ b/include/types/enums.h @@ -33,6 +33,9 @@ * GRoups of Organic Molecules in ACtion for Science */ +#ifndef ENUMS_H_ +#define ENUMS_H_ + #ifdef __cplusplus extern "C" { #endif @@ -73,6 +76,10 @@ enum { erscNO, erscALL, erscCOM, erscNR }; +enum { + ecutsGROUP, ecutsVERLET, ecutsNR +}; + /* * eelNOTUSED1 used to be GB, but to enable generalized born with different * forms of electrostatics (RF, switch, etc.)
in the future it is now selected @@ -96,6 +103,8 @@ enum { #define EEL_SWITCHED(e) ((e) == eelSWITCH || (e) == eelSHIFT || (e) == eelENCADSHIFT || (e) == eelPMESWITCH || (e) == eelPMEUSERSWITCH) +#define EEL_USER(e) ((e) == eelUSER || (e) == eelPMEUSER || (e) == (eelPMEUSERSWITCH)) + #define EEL_IS_ZERO_AT_CUTOFF(e) (EEL_SWITCHED(e) || (e) == eelRF_ZERO) #define EEL_MIGHT_BE_ZERO_AT_CUTOFF(e) (EEL_IS_ZERO_AT_CUTOFF(e) || (e) == eelUSER || (e) == eelPMEUSER) @@ -121,14 +130,15 @@ enum { eiMD, eiSteep, eiCG, eiBD, eiSD2, eiNM, eiLBFGS, eiTPI, eiTPIC, eiSD1, eiVV, eiVVAK, eiNR }; #define EI_VV(e) ((e) == eiVV || (e) == eiVVAK) +#define EI_MD(e) ((e) == eiMD || EI_VV(e)) #define EI_SD(e) ((e) == eiSD1 || (e) == eiSD2) #define EI_RANDOM(e) (EI_SD(e) || (e) == eiBD) /*above integrators may not conserve momenta*/ -#define EI_DYNAMICS(e) ((e) == eiMD || EI_SD(e) || (e) == eiBD || EI_VV(e)) +#define EI_DYNAMICS(e) (EI_MD(e) || EI_SD(e) || (e) == eiBD) #define EI_ENERGY_MINIMIZATION(e) ((e) == eiSteep || (e) == eiCG || (e) == eiLBFGS) #define EI_TPI(e) ((e) == eiTPI || (e) == eiTPIC) -#define EI_STATE_VELOCITY(e) ((e) == eiMD || EI_VV(e) || EI_SD(e)) +#define EI_STATE_VELOCITY(e) (EI_MD(e) || EI_SD(e)) enum { econtLINCS, econtSHAKE, econtNR @@ -332,3 +342,4 @@ } #endif +#endif /* ENUMS_H_ */ diff --git a/include/types/fcdata.h b/include/types/fcdata.h index 0c94ba67d8..014ad0b936 100644 --- a/include/types/fcdata.h +++ b/include/types/fcdata.h @@ -32,8 +32,8 @@ * And Hey: * GRoups of Organic Molecules in ACtion for Science */ - -#include "commrec.h" +#ifndef _fcdata_h +#define _fcdata_h #ifdef __cplusplus extern "C" { #endif @@ -62,7 +62,6 @@ typedef struct { real *Rt_6; /* The calculated inst. ens. averaged r^-6 (nr) */ real *Rtav_6; /* The calculated time and ens. averaged r^-6 (nr) */ int nsystems; /* The number of systems for ensemble averaging */ - MPI_Comm mpi_comm_ensemble; /* For ensemble averaging */ } t_disresdata; @@ -117,3 +116,4 @@ } #endif +#endif /* _fcdata_h */ diff --git a/src/kernel/membed.h b/include/types/force_flags.h similarity index 53% copy from src/kernel/membed.h copy to include/types/force_flags.h index 6e03136367..759b31587a 100644 --- a/src/kernel/membed.h +++ b/include/types/force_flags.h @@ -1,11 +1,12 @@ -/* +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- * + * * This source code is part of - * + * * G R O M A C S - * + * * GROningen MAchine for Chemical Simulations - * + * * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2012, The GROMACS development team, @@ -15,41 +16,62 @@ * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. - * + * * If you want to redistribute modifications, please consider that * scientific software is very special. Version control is crucial - * bugs must be traceable. We will be happy to consider code for * inclusion in the official distribution, but derived work must not * be called official GROMACS. Details are found in the README & COPYING * files - if they are missing, get the official version at www.gromacs.org. - * + * * To help us fund GROMACS development, we humbly ask that you cite * the papers on the package - you can find them in the top README file.
- * + * For more info, check our website at http://www.gromacs.org - * + * * And Hey: * Gromacs Runs On Most of All Computer Systems */ -#ifndef _gmx_membed_h -#define _gmx_membed_h +#ifndef _force_flags_h +#define _force_flags_h -#include "typedefs.h" #ifdef __cplusplus extern "C" { #endif -/* initialisation of membed code */ -gmx_membed_t init_membed(FILE *fplog, int nfile, const t_filenm fnm[], gmx_mtop_t *mtop, - t_inputrec *inputrec, t_state *state, t_commrec *cr, real *cpt); -/* rescaling the coordinates voor de membed code */ -void rescale_membed(int step_rel, gmx_membed_t membed, rvec *x); +/* Flags to tell the force calculation routines what (not) to do */ + +/* The state has changed */ +#define GMX_FORCE_STATECHANGED (1<<0) +/* The box might have changed */ +#define GMX_FORCE_DYNAMICBOX (1<<1) +/* Do neighbor searching */ +#define GMX_FORCE_NS (1<<2) +/* Calculate long-range energies/forces */ +#define GMX_FORCE_DOLR (1<<3) +/* Calculate bonded energies/forces */ +#define GMX_FORCE_BONDED (1<<4) +/* Store long-range forces in a separate array */ +#define GMX_FORCE_SEPLRF (1<<5) +/* Calculate non-bonded energies/forces */ +#define GMX_FORCE_NONBONDED (1<<6) +/* Calculate forces (not only energies) */ +#define GMX_FORCE_FORCES (1<<7) +/* Calculate the virial */ +#define GMX_FORCE_VIRIAL (1<<8) +/* Calculate energies */ +#define GMX_FORCE_ENERGY (1<<9) +/* Calculate dHdl */ +#define GMX_FORCE_DHDL (1<<10) +/* Normally one wants all energy terms and forces */ +#define GMX_FORCE_ALLFORCES (GMX_FORCE_BONDED | GMX_FORCE_NONBONDED | GMX_FORCE_FORCES) + #ifdef __cplusplus } #endif -#endif +#endif /* _force_flags_h */ diff --git a/include/types/forcerec.h b/include/types/forcerec.h index adbe9212e8..06210d2dd2 100644 --- a/include/types/forcerec.h +++ b/include/types/forcerec.h @@ -37,7 +37,9 @@ #include "genborn.h" #include "qmmmrec.h" #include "idef.h" -#include "../gmx_detectcpu.h" +#include "nb_verlet.h" +#include "interaction_const.h" +#include "hw_info.h" #ifdef __cplusplus extern "C" { @@ -69,8 +71,8 @@ typedef struct { } t_nblists; /* macros for the cginfo data in forcerec */ -/* The maximum cg size in cginfo is 255, - * because we only have space for 8 bits in cginfo, +/* The maximum cg size in cginfo is 63 + * because we only have space for 6 bits in cginfo, * this cg size entry is actually only read with domain decomposition. * But there is a smaller limit due to the t_excl data structure * which is defined in nblist.h.
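The GMX_FORCE_* defines above form a small per-step protocol: the caller ORs together the work a step needs and passes the mask down, and each force routine tests only the bits it cares about. The following minimal, self-contained C sketch is illustrative only and not part of this patch; do_force_step() and the chosen masks are hypothetical stand-ins for the real force entry points.

#include <stdio.h>

/* Flag values copied from include/types/force_flags.h above */
#define GMX_FORCE_NS         (1<<2)
#define GMX_FORCE_BONDED     (1<<4)
#define GMX_FORCE_NONBONDED  (1<<6)
#define GMX_FORCE_FORCES     (1<<7)
#define GMX_FORCE_VIRIAL     (1<<8)
#define GMX_FORCE_ENERGY     (1<<9)
#define GMX_FORCE_ALLFORCES  (GMX_FORCE_BONDED | GMX_FORCE_NONBONDED | GMX_FORCE_FORCES)

/* Hypothetical force routine: each stage runs only if its bit is set */
static void do_force_step(int flags)
{
    if (flags & GMX_FORCE_NS)
    {
        printf("  (re)build the pair list\n");
    }
    if (flags & GMX_FORCE_NONBONDED)
    {
        printf("  non-bonded forces%s\n",
               (flags & GMX_FORCE_ENERGY) ? " + energies" : "");
    }
    if (flags & GMX_FORCE_VIRIAL)
    {
        printf("  virial\n");
    }
}

int main(void)
{
    /* A neighbor-search step typically also computes energies and the virial */
    do_force_step(GMX_FORCE_ALLFORCES | GMX_FORCE_NS |
                  GMX_FORCE_ENERGY | GMX_FORCE_VIRIAL);

    /* A plain MD step computes forces only */
    do_force_step(GMX_FORCE_ALLFORCES);

    return 0;
}

Computing energies and the virial only on selected steps is what makes nstcalcenergy (see the mdp_opt.html changes below) a performance knob: the cheaper force-only mask can be used on all other steps.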
@@ -81,13 +83,21 @@ typedef struct { #define GET_CGINFO_EXCL_INTRA(cgi) ( (cgi) & (1<<16)) #define SET_CGINFO_EXCL_INTER(cgi) (cgi) = ((cgi) | (1<<17)) #define GET_CGINFO_EXCL_INTER(cgi) ( (cgi) & (1<<17)) -#define SET_CGINFO_SOLOPT(cgi,opt) (cgi) = (((cgi) & ~(15<<18)) | ((opt)<<18)) -#define GET_CGINFO_SOLOPT(cgi) (((cgi)>>18) & 15) +#define SET_CGINFO_SOLOPT(cgi,opt) (cgi) = (((cgi) & ~(3<<18)) | ((opt)<<18)) +#define GET_CGINFO_SOLOPT(cgi) (((cgi)>>18) & 3) +#define SET_CGINFO_CONSTR(cgi) (cgi) = ((cgi) | (1<<20)) +#define GET_CGINFO_CONSTR(cgi) ( (cgi) & (1<<20)) +#define SET_CGINFO_SETTLE(cgi) (cgi) = ((cgi) | (1<<21)) +#define GET_CGINFO_SETTLE(cgi) ( (cgi) & (1<<21)) /* This bit is only used with bBondComm in the domain decomposition */ #define SET_CGINFO_BOND_INTER(cgi) (cgi) = ((cgi) | (1<<22)) #define GET_CGINFO_BOND_INTER(cgi) ( (cgi) & (1<<22)) -#define SET_CGINFO_NATOMS(cgi,opt) (cgi) = (((cgi) & ~(255<<23)) | ((opt)<<23)) -#define GET_CGINFO_NATOMS(cgi) (((cgi)>>23) & 255) +#define SET_CGINFO_HAS_VDW(cgi) (cgi) = ((cgi) | (1<<23)) +#define GET_CGINFO_HAS_VDW(cgi) ( (cgi) & (1<<23)) +#define SET_CGINFO_HAS_Q(cgi) (cgi) = ((cgi) | (1<<24)) +#define GET_CGINFO_HAS_Q(cgi) ( (cgi) & (1<<24)) +#define SET_CGINFO_NATOMS(cgi,opt) (cgi) = (((cgi) & ~(63<<25)) | ((opt)<<25)) +#define GET_CGINFO_NATOMS(cgi) (((cgi)>>25) & 63) /* Value to be used in mdrun for an infinite cut-off. @@ -136,6 +146,20 @@ typedef struct { typedef struct ewald_tab *ewald_tab_t; typedef struct { + rvec *f; + int f_nalloc; + unsigned red_mask; /* Mask for marking which parts of f are filled */ + rvec *fshift; + real ener[F_NRE]; + gmx_grppairener_t grpp; + real Vcorr; + real dvdl[efptNR]; + tensor vir; +} f_thread_t; + +typedef struct { + interaction_const_t *ic; + /* Domain Decomposition */ gmx_bool bDomDec; @@ -146,8 +170,8 @@ typedef struct { rvec posres_com; rvec posres_comB; - gmx_detectcpu_t cpu_information; - gmx_bool use_acceleration; + gmx_hw_info_t *hwinfo; + gmx_bool use_cpu_acceleration; /* Use special N*N kernels? */ gmx_bool bAllvsAll; @@ -169,6 +193,7 @@ typedef struct { /* Charge sum and dipole for topology A/B ([0]/[1]) for Ewald corrections */ double qsum[2]; + double q2sum[2]; rvec mu_tot[2]; /* Dispersion correction stuff */ @@ -227,6 +252,7 @@ typedef struct { int solvent_opt; int nWatMol; gmx_bool bGrid; + gmx_bool bExcl_IntraCGAll_InterCGNone; cginfo_mb_t *cginfo_mb; int *cginfo; rvec *cg_cm; @@ -238,18 +264,14 @@ typedef struct { int *gid2nblists; t_nblists *nblists; + int cutoff_scheme; /* old- or Verlet-style cutoff */ + gmx_bool bNonbonded; /* true if nonbonded calculations are *not* turned off */ + nonbonded_verlet_t *nbv; + /* The wall tables (if used) */ int nwall; t_forcetable **wall_tab; - /* This mask array of length nn determines whether or not this bit of the - * neighbourlists should be computed. Usually all these are true of course, - * but not when shells are used. During minimisation all the forces that - * include shells are done, then after minimsation is converged the remaining - * forces are computed. 
- */ - /* gmx_bool *bMask; */ - /* The number of charge groups participating in do_force_lowlevel */ int ncg_force; /* The number of atoms participating in do_force_lowlevel */ @@ -391,6 +413,16 @@ typedef struct { real userreal2; real userreal3; real userreal4; + + /* Thread local force and energy data */ + /* FIXME move to bonded_thread_data_t */ + int nthreads; + int red_ashift; + int red_nblock; + f_thread_t *f_t; + + /* Exclusion load distribution over the threads */ + int *excl_load; } t_forcerec; #define C6(nbfp,ntp,ai,aj) (nbfp)[2*((ntp)*(ai)+(aj))] diff --git a/include/types/graph.h b/include/types/graph.h index 5d80ee4a60..24ecf8e2f1 100644 --- a/include/types/graph.h +++ b/include/types/graph.h @@ -33,6 +33,9 @@ * GRoups of Organic Molecules in ACtion for Science */ +#ifndef _types_graph_h +#define _types_graph_h + #include "idef.h" #ifdef __cplusplus @@ -63,3 +66,4 @@ typedef struct { } #endif +#endif /* _types_graph_h */ diff --git a/include/types/group.h b/include/types/group.h index 0b407bb1fd..08b1756702 100644 --- a/include/types/group.h +++ b/include/types/group.h @@ -71,6 +71,8 @@ typedef struct { gmx_bool bNEMD; int ngtc; /* The number of T-coupling groups */ t_grp_tcstat *tcstat; /* T-coupling data */ + tensor **ekin_work_alloc; /* Allocated locations of ekin_work */ + tensor **ekin_work; /* Work arrays for tcstat per thread */ int ngacc; /* The number of acceleration groups */ t_grp_acc *grpstat; /* Acceleration data */ tensor ekin; /* overall kinetic energy */ diff --git a/include/types/hw_info.h b/include/types/hw_info.h new file mode 100644 index 0000000000..1d04f60f1c --- /dev/null +++ b/include/types/hw_info.h @@ -0,0 +1,83 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This file is part of GROMACS. + * Copyright (c) 2012- + * + * Written by the Gromacs development team under coordination of + * David van der Spoel, Berk Hess, and Erik Lindahl. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org + * + * And Hey: + * Gromacs Runs On Most of All Computer Systems + */ + +#ifndef HWINFO_H +#define HWINFO_H + +#include "types/simple.h" +#include "types/nbnxn_cuda_types_ext.h" +#include "gmx_cpuid.h" + +#ifdef __cplusplus +extern "C" { +#endif +#if 0 +} /* fixes auto-indentation problems */ +#endif + +/* Possible results of the GPU detection/check. + * + * The egpuInsane value means that during the sanity checks an error + * occurred that indicates malfunctioning of the device, driver, or + * incompatible driver/runtime. */ +typedef enum +{ + egpuCompatible = 0, egpuNonexistent, egpuIncompatible, egpuInsane +} e_gpu_detect_res_t; + +/* Textual names of the GPU detection/check results (see e_gpu_detect_res_t). */ +static const char * const gpu_detect_res_str[] = +{ + "compatible", "inexistent", "incompatible", "insane" +}; + +/* GPU device information -- for now with only CUDA devices. + * The gmx_hardware_detect module initializes it. 
*/ +typedef struct +{ + gmx_bool bUserSet; /* true if the GPUs in cuda_dev_use are manually provided by the user */ + + int ncuda_dev_use; /* number of devices selected to be used */ + int *cuda_dev_use; /* index of the devices selected to be used */ + int ncuda_dev; /* total number of devices detected */ + cuda_dev_info_ptr_t cuda_dev; /* devices detected in the system (per node) */ +} gmx_gpu_info_t; + +/* Hardware information structure with CPU and GPU information. + * It is initialized by gmx_detect_hardware(). */ +typedef struct +{ + gmx_bool bCanUseGPU; /* True if compatible GPUs are detected during hardware detection */ + gmx_gpu_info_t gpu_info; /* Information about GPUs detected in the system */ + + gmx_cpuid_t cpuid_info; /* CPUID information about CPU detected; + NOTE: this will only detect the CPU thread 0 of the + current process runs on. */ + int nthreads_hw_avail; /* Number of hardware threads available; this number + is based on the number of CPUs reported as available + by the OS at the time of detection. */ +} gmx_hw_info_t; + +#ifdef __cplusplus +} +#endif + +#endif /* HWINFO_H */ diff --git a/include/types/idef.h b/include/types/idef.h index 0fa9a1b19f..7d44e4ccb2 100644 --- a/include/types/idef.h +++ b/include/types/idef.h @@ -146,6 +146,12 @@ enum { #define IS_RESTRAINT_TYPE(ifunc) (((ifunc==F_POSRES) || (ifunc==F_DISRES) || (ifunc==F_RESTRBONDS) || (ifunc==F_DISRESVIOL) || (ifunc==F_ORIRES) || (ifunc==F_ORIRESDEV) || (ifunc==F_ANGRES) || (ifunc == F_ANGRESZ) || (ifunc==F_DIHRES))) +/* A macro for checking if ftype is an explicit pair-listed LJ or COULOMB + * interaction type: + * bonded LJ (usually 1-4), or special listed non-bonded for FEP. + */ +#define IS_LISTED_LJ_C(ftype) ((ftype) >= F_LJ14 && (ftype) <= F_LJC_PAIRS_NB) + typedef union { /* Some parameters have A and B values for free energy calculations. diff --git a/include/types/ifunc.h b/include/types/ifunc.h index 72026c57c0..503e53d176 100644 --- a/include/types/ifunc.h +++ b/include/types/ifunc.h @@ -37,7 +37,11 @@ #ifndef _ifunc_h #define _ifunc_h -#include "../typedefs.h" +#include "types/idef.h" +#include "types/mdatom.h" +#include "types/fcdata.h" +#include "types/graph.h" +#include "types/pbc.h" #ifdef __cplusplus extern "C" { diff --git a/include/types/inputrec.h b/include/types/inputrec.h index c7deddf4f7..008b6a987b 100644 --- a/include/types/inputrec.h +++ b/include/types/inputrec.h @@ -269,7 +269,8 @@ typedef struct { gmx_large_int_t nsteps; /* number of steps to be taken */ int simulation_part; /* Used in checkpointing to separate chunks */ gmx_large_int_t init_step; /* start at a stepcount >0 (used w. tpbconv) */ - int nstcalcenergy; /* fequency of energy calc. and T/P coupl. upd. */ + int nstcalcenergy; /* frequency of energy calc. and T/P coupl. upd. */ + int cutoff_scheme; /* cut-off scheme: group or verlet */ int ns_type; /* which ns method should we use? */ int nstlist; /* number of steps before pairlist is generated */ int ndelta; /* number of cells per rlong */ @@ -286,6 +287,7 @@ typedef struct { double init_t; /* initial time (ps) */ double delta_t; /* time step (ps) */ real xtcprec; /* precision of xtc file */ + real fourier_spacing; /* requested fourier_spacing, when nk? 
not set */ int nkx,nky,nkz; /* number of k vectors in each spatial dimension*/ /* for fourier methods for long range electrost.*/ int pme_order; /* interpolation order for PME */ @@ -310,6 +312,7 @@ typedef struct { rvec posres_com; /* The COM of the posres atoms */ rvec posres_comB; /* The B-state COM of the posres atoms */ int andersen_seed; /* Random seed for Andersen thermostat (obsolete) */ + real verletbuf_drift; /* Max. drift (kJ/mol/ps/atom) for list buffer */ real rlist; /* short range pairlist cut-off (nm) */ real rlistlong; /* long range pairlist cut-off (nm) */ real rtpi; /* Radius for test particle insertion */ diff --git a/src/kernel/gmx_gpu_utils/gmx_gpu_utils.h b/include/types/interaction_const.h similarity index 50% copy from src/kernel/gmx_gpu_utils/gmx_gpu_utils.h copy to include/types/interaction_const.h index 76070804ea..842d25b622 100644 --- a/src/kernel/gmx_gpu_utils/gmx_gpu_utils.h +++ b/include/types/interaction_const.h @@ -1,56 +1,90 @@ /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- * - * + * * This source code is part of - * + * * G R O M A C S - * + * * GROningen MAchine for Chemical Simulations - * + * * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. * Copyright (c) 1991-2000, University of Groningen, The Netherlands. - * Copyright (c) 2001-2010, The GROMACS development team, + * Copyright (c) 2001-2012, The GROMACS development team, * check out http://www.gromacs.org for more information. - + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. - * + * * If you want to redistribute modifications, please consider that * scientific software is very special. Version control is crucial - * bugs must be traceable. We will be happy to consider code for * inclusion in the official distribution, but derived work must not * be called official GROMACS. Details are found in the README & COPYING * files - if they are missing, get the official version at www.gromacs.org. - * + * * To help us fund GROMACS development, we humbly ask that you cite * the papers on the package - you can find them in the top README file. - * + * * For more info, check our website at http://www.gromacs.org - * + * * And Hey: * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon */ -#ifndef _GMX_GPU_UTILS_H_ -#define _GMX_GPU_UTILS_H_ +#ifndef _INTERACTION_CONST_ +#define _INTERACTION_CONST_ -#ifndef __cplusplus +#ifdef __cplusplus extern "C" { #endif -int do_quick_memtest(int /*dev_id*/); +enum { tableformatNONE, tableformatF, tableformatFDV0 }; -int do_full_memtest(int /*dev_id*/); +typedef struct { + /* VdW */ + real rvdw; + real sh_invrc6; /* For shifting the LJ potential */ -int do_timed_memtest(int /*dev_id*/, int /*time_limit*/); + /* type of electrostatics (defined in enums.h) */ + int eeltype; -int is_supported_cuda_gpu(int /*dev_id*/, char* /*gpu_name*/); + /* Coulomb */ + real rcoulomb; -#ifndef __cplusplus -} /* extern "C" */ -#endif + /* Cut-off */ + real rlist; + + /* PME/Ewald */ + real ewaldcoeff; + real sh_ewald; /* For shifting the Ewald potential */ -#endif // _GMX_GPU_UTILS_H_ + /* Dielectric constant resp. 
multiplication factor for charges */ + real epsilon_r; + real epsfac; + + /* Constants for reaction-field or plain cut-off */ + real epsilon_rf; + real k_rf; + real c_rf; + + /* Force/energy interpolation tables, linear in force, quadratic in V */ + real tabq_scale; + int tabq_size; + int tabq_format; + /* Coulomb force table, size of array is tabq_size (when used) */ + real *tabq_coul_F; + /* Coulomb energy table, size of array is tabq_size (when used) */ + real *tabq_coul_V; + /* Coulomb force+energy table, size of array is tabq_size*4, + entry quadruplets are: F[i], F[i+1]-F[i], V[i], 0, + this is used with single precision x86 SIMD for aligned loads */ + real *tabq_coul_FDV0; +} interaction_const_t; + +#ifdef __cplusplus +} +#endif +#endif /* _INTERACTION_CONST_ */ diff --git a/include/types/nb_verlet.h b/include/types/nb_verlet.h new file mode 100644 index 0000000000..a6e7ecf67a --- /dev/null +++ b/include/types/nb_verlet.h @@ -0,0 +1,130 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS development team, + * check out http://www.gromacs.org for more information. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon + */ + +#ifndef NB_VERLET_H +#define NB_VERLET_H + +#include "nbnxn_pairlist.h" +#include "nbnxn_cuda_types_ext.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*! Nonbonded NxN kernel types: plain C, SSE/AVX, GPU CUDA, GPU emulation, etc */ +enum { nbkNotSet = 0, + nbk4x4_PlainC, + nbk4xN_X86_SIMD128, + nbk4xN_X86_SIMD256, + nbk8x8x8_CUDA, + nbk8x8x8_PlainC }; + +/* Note that _mm_... intrinsics can be converted to either SSE or AVX + * depending on compiler flags. 
+ * For gcc we check for __AVX__ + * At least a check for icc should be added (if there is a macro) + */ +static const char *nbk_name[] = + { "not set", "plain C 4x4", +#if !(defined GMX_X86_AVX_256 || defined GMX_X86_AVX128_FMA || defined __AVX__) +#ifndef GMX_X86_SSE4_1 +#ifndef GMX_DOUBLE + "SSE2 4x4", +#else + "SSE2 4x2", +#endif +#else +#ifndef GMX_DOUBLE + "SSE4.1 4x4", +#else + "SSE4.1 4x2", +#endif +#endif +#else +#ifndef GMX_DOUBLE + "AVX-128 4x4", +#else + "AVX-128 4x2", +#endif +#endif +#ifndef GMX_DOUBLE + "AVX-256 4x8", +#else + "AVX-256 4x4", +#endif + "CUDA 8x8x8", "plain C 8x8x8" }; + +/* Atom locality indicator: local, non-local, all, used for calls to: + gridding, pair-search, force calculation, x/f buffer operations */ +enum { eatLocal = 0, eatNonlocal = 1, eatAll }; + +#define LOCAL_A(x) ((x) == eatLocal) +#define NONLOCAL_A(x) ((x) == eatNonlocal) +#define LOCAL_OR_NONLOCAL_A(x) (LOCAL_A(x) || NONLOCAL_A(x)) + +/* Interaction locality indicator (used in pair-list search/calculations): + - local interactions require local atom data and affect local output only; + - non-local interactions require both local and non-local atom data and + affect both local- and non-local output. */ +enum { eintLocal = 0, eintNonlocal = 1 }; + +#define LOCAL_I(x) ((x) == eintLocal) +#define NONLOCAL_I(x) ((x) == eintNonlocal) + +enum { enbvClearFNo, enbvClearFYes }; + +typedef struct { + nbnxn_pairlist_set_t nbl_lists; /* pair list(s) */ + nbnxn_atomdata_t *nbat; /* atom data */ + int kernel_type; /* non-bonded kernel - see enum above */ +} nonbonded_verlet_group_t; + +/* non-bonded data structure with Verlet-type cut-off */ +typedef struct { + nbnxn_search_t nbs; /* n vs n atom pair searching data */ + int ngrp; /* number of interaction groups */ + nonbonded_verlet_group_t grp[2];/* local and non-local interaction group */ + + gmx_bool bUseGPU; /* TRUE when GPU acceleration is used */ + nbnxn_cuda_ptr_t cu_nbv; /* pointer to CUDA nb verlet data */ + int min_ci_balanced; /* pair list balancing parameter + used for the 8x8x8 CUDA kernels */ +} nonbonded_verlet_t; + +#ifdef __cplusplus +} +#endif + +#endif /* NB_VERLET_H */ diff --git a/include/types/nbnxn_cuda_types_ext.h b/include/types/nbnxn_cuda_types_ext.h new file mode 100644 index 0000000000..dd8c6206be --- /dev/null +++ b/include/types/nbnxn_cuda_types_ext.h @@ -0,0 +1,77 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS development team, + * check out http://www.gromacs.org for more information. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. 
+ * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon + */ + +#ifndef NBNXN_CUDA_TYPES_EXT_H +#define NBNXN_CUDA_TYPES_EXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* Abstract types */ +/* CUDA nonbonded structure */ +typedef struct nbnxn_cuda *nbnxn_cuda_ptr_t; +/* CUDA GPU device info */ +typedef struct cuda_dev_info *cuda_dev_info_ptr_t; + +/* Types defined for the structs below. */ +typedef struct wallclock_gpu wallclock_gpu_t; +typedef struct nbnxn_cuda_ktime nbnxn_cuda_ktime_t; + +/* Nonbonded kernel time and call count. */ +struct nbnxn_cuda_ktime +{ + double t; + int c; +}; + +/* GPU timings for kernels and H2d/D2H transfers. */ +struct wallclock_gpu +{ + nbnxn_cuda_ktime_t ktime[2][2]; /* table containing the timings of the four + version of the nonbonded kernels: force-only, + force+energy, force+pruning, and force+energy+pruning */ + double nb_h2d_t; /* host to device transfer time in nb calculation */ + double nb_d2h_t; /* device to host transfer time in nb calculation */ + int nb_c; /* total call count of the nonbonded gpu operations */ + double pl_h2d_t; /* pair search step host to device transfer time */ + int pl_h2d_c; /* pair search step host to device transfer call count */ +}; + +#ifdef __cplusplus +} +#endif + +#endif /* NBNXN_CUDA_TYPES_EXT_H */ diff --git a/include/types/nbnxn_pairlist.h b/include/types/nbnxn_pairlist.h new file mode 100644 index 0000000000..9490c16b6c --- /dev/null +++ b/include/types/nbnxn_pairlist.h @@ -0,0 +1,204 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS development team, + * check out http://www.gromacs.org for more information. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon + */ + +#ifndef _nbnxn_pairlist_h +#define _nbnxn_pairlist_h + +#ifdef __cplusplus +extern "C" { +#endif + +/* A buffer data structure of 64 bytes + * to be placed at the beginning and end of structs + * to avoid cache invalidation of the real contents + * of the struct by writes to neighboring memory. 
+ */ +typedef struct { + int dummy[16]; +} gmx_cache_protect_t; + +/* Abstract type for pair searching data */ +typedef struct nbnxn_search * nbnxn_search_t; + +/* Function that should return a pointer *ptr to memory + * of size nbytes. + * Error handling should be done within this function. + */ +typedef void gmx_nbat_alloc_t(void **ptr,size_t nbytes); + +/* Function that should free the memory pointed to by *ptr. + * NULL should not be passed to this function. + */ +typedef void gmx_nbat_free_t(void *ptr); + +typedef struct { + int cj; /* The j-cluster */ + unsigned excl; /* The exclusion (interaction) bits */ +} nbnxn_cj_t; + +#define NBNXN_CI_SHIFT 127 +#define NBNXN_CI_DO_LJ(subc) (1<<(7+3*(subc))) +#define NBNXN_CI_HALF_LJ(subc) (1<<(8+3*(subc))) +#define NBNXN_CI_DO_COUL(subc) (1<<(9+3*(subc))) + +/* Simple pair-list i-unit */ +typedef struct { + int ci; /* i-cluster */ + int shift; /* Shift vector index plus possible flags */ + int cj_ind_start; /* Start index into cj */ + int cj_ind_end; /* End index into cj */ +} nbnxn_ci_t; + +/* Grouped pair-list i-unit */ +typedef struct { + int sci; /* i-super-cluster */ + int shift; /* Shift vector index plus possible flags */ + int cj4_ind_start; /* Start index into cj4 */ + int cj4_ind_end; /* End index into cj4 */ +} nbnxn_sci_t; + +typedef struct { + unsigned imask; /* The i-cluster interactions mask for 1 warp */ + int excl_ind; /* Index into the exclusion array for 1 warp */ +} nbnxn_im_ei_t; + +typedef struct { + int cj[4]; /* The 4 j-clusters */ + nbnxn_im_ei_t imei[2]; /* The i-cluster mask data for 2 warps */ +} nbnxn_cj4_t; + +typedef struct { + unsigned pair[32]; /* Exclusion bits for one warp, * + * each unsigned has bit for 4*8 i clusters */ +} nbnxn_excl_t; + +typedef struct { + gmx_cache_protect_t cp0; + + gmx_nbat_alloc_t *alloc; + gmx_nbat_free_t *free; + + gmx_bool bSimple; /* Simple list has na_sc=na_s and uses cj * + * Complex list uses cj4 */ + + int na_ci; /* The number of atoms per i-cluster */ + int na_cj; /* The number of atoms per j-cluster */ + int na_sc; /* The number of atoms per super cluster */ + real rlist; /* The radius for constructing the list */ + int nci; /* The number of i-clusters in the list */ + nbnxn_ci_t *ci; /* The i-cluster list, size nci */ + int ci_nalloc; /* The allocation size of ci */ + int nsci; /* The number of i-super-clusters in the list */ + nbnxn_sci_t *sci; /* The i-super-cluster list */ + int sci_nalloc; /* The allocation size of sci */ + + int ncj; /* The number of j-clusters in the list */ + nbnxn_cj_t *cj; /* The j-cluster list, size ncj */ + int cj_nalloc; /* The allocation size of cj */ + + int ncj4; /* The total number of 4*j clusters */ + nbnxn_cj4_t *cj4; /* The 4*j cluster list, size ncj4 */ + int cj4_nalloc; /* The allocation size of cj4 */ + int nexcl; /* The count for excl */ + nbnxn_excl_t *excl; /* Atom interaction bits (non-exclusions) */ + int excl_nalloc; /* The allocation size for excl */ + int nci_tot; /* The total number of i clusters */ + + struct nbnxn_list_work *work; + + gmx_cache_protect_t cp1; +} nbnxn_pairlist_t; + +typedef struct { + int nnbl; /* number of lists */ + nbnxn_pairlist_t **nbl; /* lists */ + gmx_bool bCombined; /* TRUE if lists get combined into one (the 1st) */ + gmx_bool bSimple; /* TRUE if the list of of type "simple" + (na_sc=na_s, no super-clusters used) */ + int natpair_ljq; /* Total number of atom pairs for LJ+Q kernel */ + int natpair_lj; /* Total number of atom pairs for LJ kernel */ + int natpair_q; /* Total number of atom pairs for 
Q kernel */ +} nbnxn_pairlist_set_t; + +enum { nbatXYZ, nbatXYZQ, nbatX4, nbatX8 }; + +typedef struct { + real *f; /* f, size natoms*fstride */ + real *fshift; /* Shift force array, size SHIFTS*DIM */ + int nV; /* The size of *Vvdw and *Vc */ + real *Vvdw; /* Temporary Van der Waals group energy storage */ + real *Vc; /* Temporary Coulomb group energy storage */ + int nVS; /* The size of *VSvdw and *VSc */ + real *VSvdw; /* Temporary SIMD Van der Waals group energy storage */ + real *VSc; /* Temporary SIMD Coulomb group energy storage */ +} nbnxn_atomdata_output_t; + +/* LJ combination rules: geometric, Lorentz-Berthelot, none */ +enum { ljcrGEOM, ljcrLB, ljcrNONE, ljcrNR }; + +typedef struct { + gmx_nbat_alloc_t *alloc; + gmx_nbat_free_t *free; + int ntype; /* The number of different atom types */ + real *nbfp; /* Lennard-Jones 6*C6 and 12*C12 params, size ntype^2*2 */ + int comb_rule; /* Combination rule, see enum above */ + real *nbfp_comb; /* LJ parameter per atom type, size ntype*2 */ + real *nbfp_s4; /* As nbfp, but with stride 4, size ntype^2*4 */ + int natoms; /* Number of atoms */ + int natoms_local; /* Number of local atoms */ + int *type; /* Atom types */ + real *lj_comb; /* LJ parameters per atom for combining for pairs */ + int XFormat; /* The format of x (and q), enum */ + int FFormat; /* The format of f, enum */ + real *q; /* Charges, can be NULL if incorporated in x */ + int na_c; /* The number of atoms per cluster */ + int nenergrp; /* The number of energy groups */ + int neg_2log; /* Log2 of nenergrp */ + int *energrp; /* The energy groups per cluster, can be NULL */ + gmx_bool bDynamicBox; /* Do we need to update shift_vec every step? */ + rvec *shift_vec; /* Shift vectors, copied from t_forcerec */ + int xstride; /* stride for a coordinate in x (usually 3 or 4) */ + int fstride; /* stride for a coordinate in f (usually 3 or 4) */ + real *x; /* x and possibly q, size natoms*xstride */ + int nout; /* The number of force arrays */ + nbnxn_atomdata_output_t *out; /* Output data structures */ + int nalloc; /* Allocation size of all arrays (for x/f *x/fstride) */ +} nbnxn_atomdata_t; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/types/nrnb.h b/include/types/nrnb.h index 8f18e18b5d..aca5408c27 100644 --- a/include/types/nrnb.h +++ b/include/types/nrnb.h @@ -90,6 +90,12 @@ enum eNR_NBKERNEL_ALLVSALL, eNR_NBKERNEL_ALLVSALLGB, eNR_NBKERNEL_OUTER, + eNR_NBNXN_DIST2, + eNR_NBNXN_LJ_RF, eNR_NBNXN_LJ_RF_E, + eNR_NBNXN_LJ_TAB, eNR_NBNXN_LJ_TAB_E, + eNR_NBNXN_LJ, eNR_NBNXN_LJ_E, + eNR_NBNXN_RF, eNR_NBNXN_RF_E, + eNR_NBNXN_TAB, eNR_NBNXN_TAB_E, eNR_NB14, eNR_BORN_RADII_STILL, eNR_BORN_RADII_HCT_OBC, eNR_BORN_CHAINRULE, diff --git a/include/types/simple.h b/include/types/simple.h index 63031a98e0..c9be7a06c1 100644 --- a/include/types/simple.h +++ b/include/types/simple.h @@ -273,6 +273,10 @@ typedef int gmx_large_int_t; #endif +/* Standard sizes for char* string buffers */ +#define STRLEN 4096 +#define BIG_STRLEN 1048576 + #ifdef __cplusplus } diff --git a/include/update.h b/include/update.h index 4f8ae204a2..a83c71c20d 100644 --- a/include/update.h +++ b/include/update.h @@ -93,6 +93,7 @@ void update_coords(FILE *fplog, t_inputrec *inputrec, /* input record and box stuff */ t_mdatoms *md, t_state *state, + gmx_bool bMolPBC, rvec *f, /* forces on home particles */ gmx_bool bDoLR, rvec *f_lr, @@ -119,6 +120,7 @@ void update_constraints(FILE *fplog, gmx_ekindata_t *ekind, t_mdatoms *md, t_state *state, + gmx_bool bMolPBC, t_graph *graph, rvec force[], /* forces on home 
particles */ t_idef *idef, diff --git a/include/vsite.h b/include/vsite.h index 024b6af4fb..67dc820701 100644 --- a/include/vsite.h +++ b/include/vsite.h @@ -38,6 +38,7 @@ #include <stdio.h> #include "typedefs.h" +#include "types/commrec.h" #ifdef __cplusplus extern "C" { diff --git a/share/html/online/mdp_opt.html b/share/html/online/mdp_opt.html index 23c937332f..e528c80b83 100644 --- a/share/html/online/mdp_opt.html +++ b/share/html/online/mdp_opt.html @@ -35,7 +35,7 @@ IF YOU'RE NOT SURE ABOUT WHAT YOU'RE DOING, DON'T DO IT!
  • shell molecular dynamics(emtol,niter,fcstep)
  • test particle insertion(rtpi)
  • output control (nstxout, nstvout, nstfout, nstlog, nstcalcenergy, nstenergy, nstxtcout, xtc-precision, xtc-grps, energygrps) -
  • neighbor searching (nstlist, ns-type, pbc, periodic-molecules, rlist, rlistlong) +
  • neighbor searching (cutoff-scheme, nstlist, ns-type, pbc, periodic-molecules, verlet-buffer-drift, rlist, rlistlong)
  • electrostatics (coulombtype, rcoulomb-switch, rcoulomb, epsilon-r, epsilon-rf)
  • VdW (vdwtype, rvdw-switch, rvdw, DispCorr)
  • tables (table-extension, energygrp-table) @@ -259,7 +259,7 @@ set init-step to the step number of the restart frame.
    None
    No restriction on the center of mass motion
    -
    nstcomm: (10) [steps]
    +
    nstcomm: (100) [steps]
    frequency for center of mass motion removal
    comm-grps:
    group(s) for center of mass motion removal, default is the whole system
    @@ -350,7 +350,7 @@ the last velocities are always written
    nstlog: (1000) [steps]
    frequency to write energies to log file, the last energies are always written
    -
    nstcalcenergy: (-1)
    +
    nstcalcenergy: (100)
    frequency for calculating the energies, 0 is never. This option is only relevant with dynamics. With a twin-range cut-off setup nstcalcenergy should be equal to @@ -358,12 +358,8 @@ or a multiple of nstlist. This option affects the performance in parallel simulations, because calculating energies requires global communication between all processes which can become a bottleneck at high parallelization. -The default value of -1 sets nstcalcenergy equal to nstlist, -unless nstlist ≤0, then a value of 10 is used. -If nstenergy is smaller than the automatically generated value, -the lowest common denominator of nstenergy and nstlist is used.
    -
    nstenergy: (100) [steps]
    +
    nstenergy: (1000) [steps]
    frequency to write energies to energy file, the last energies are always written, should be a multiple of nstcalcenergy. @@ -386,6 +382,33 @@ energy averages and fluctuations also when nstenergy>1

    Neighbor searching

    +
    cutoff-scheme:
    +
    +
    group
    +
    Generate a pair list for groups of atoms. These groups correspond to the +charge groups in the topology. This was the only cut-off treatment scheme +before version 4.6. +There is no explicit buffering of the pair list. This enables efficient force +calculations, but energy is only conserved when a buffer is explicitly added. +For energy conservation, the Verlet option provides a more convenient +and efficient algorithm.
    + +
    Verlet
    +
    +Generate a pair list with buffering. The buffer size is automatically set +based on verlet-buffer-drift, unless this is set to -1, in which case +rlist will be used. This option has an explicit, exact cut-off at +rvdw=rcoulomb. Currently only cut-off, reaction-field, +PME electrostatics and plain LJ are supported. Some mdrun functionality +is not yet supported with the Verlet scheme, but grompp checks for this. +Native GPU acceleration is only supported with Verlet. With GPU-accelerated PME, +mdrun will automatically tune the CPU/GPU load balance by +scaling rcoulomb and the grid spacing. This can be turned off with +-notunepme. + +Verlet is somewhat faster than group when there is no water, or if group would use a pair-list buffer to conserve energy. (A minimal settings sketch follows this entry.) +
    +
    +
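To make the new option concrete, here is a minimal .mdp fragment selecting the Verlet scheme (an illustration, not part of the patch; the cut-off values are examples rather than recommendations):

cutoff-scheme       = Verlet
verlet-buffer-drift = 0.005   ; the default target drift, sets rlist automatically
nstlist             = 10      ; with GPU non-bonded kernels, 20 or more performs better
coulombtype         = PME
rcoulomb            = 1.0
rvdw                = 1.0     ; the Verlet scheme uses an exact cut-off at rvdw=rcoulomb

grompp derives the pair-list buffer, and hence rlist, from verlet-buffer-drift, as described under that option below.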
    nstlist: (10) [steps]
    >0
    @@ -393,13 +416,15 @@ energy averages and fluctuations also when nstenergy>1
    the long-range forces, when using twin-range cut-offs). When this is 0, the neighbor list is made only once. With energy minimization the neighborlist will be updated for every -energy evaluation when nstlist>0. +energy evaluation when nstlist>0. +With non-bonded force calculation on the GPU, a value of 20 or more gives +the best performance.
    0
    The neighbor list is only constructed once and never updated. This is mainly useful for vacuum simulations in which all particles see each other.
    -1
    -
    Automated update frequency. +
    Automated update frequency, only supported with cutoff-scheme=group. This can only be used with switched, shifted or user potentials where the cut-off can be smaller than rlist. One then has a buffer of size rlist minus the longest cut-off. @@ -458,8 +483,30 @@ the periodic boundary conditions, this requires a slower PBC algorithm and molecules are not made whole in the output
    -
    rlist: (-1) [nm]
    -
    cut-off distance for the short-range neighbor list, should be ≥ 0
    +
    verlet-buffer-drift: (0.005) [kJ/mol/ps]
    +
    Useful only with cutoff-scheme=Verlet. This sets the target energy drift +per particle caused by the Verlet buffer, which indirectly sets rlist. +As both nstlist and the Verlet buffer size are fixed +(for performance reasons), particle pairs not in the pair list can occasionally +get within the cut-off distance during nstlist-1 steps. This +generates energy drift. In a constant-temperature ensemble, the drift can be +estimated for a given cut-off and rlist. The estimate assumes a +homogeneous particle distribution, hence the drift might be slightly +underestimated for multi-phase systems. For longer pair-list life-times +(nstlist-1)*dt the drift is overestimated, because the interactions +between particles are ignored. Combined with cancellation of errors, +the actual energy drift is usually one to two orders of magnitude smaller. +Note that the generated buffer size takes into account that +the GROMACS pair-list setup leads to a reduction in the drift by +a factor of 10, compared to a simple particle-pair based list. +Without dynamics (energy minimization etc.), the buffer is 5% of the cut-off. +For dynamics without temperature coupling or to override the buffer size, +use verlet-buffer-drift=-1 and set rlist manually.
    + +
    rlist: (1) [nm]
    +
    Cut-off distance for the short-range neighbor list, should be ≥ 0. +With cutoff-scheme=Verlet, this is by default set by the +verlet-buffer-drift option and the value of rlist is ignored.
    rlistlong: (-1) [nm]
    Cut-off distance for the long-range neighbor list. @@ -527,7 +574,8 @@ The temperature for the GRF potential is set with ref-t [K].
    Reaction-Field-zero
    -
    In GROMACS normal reaction-field electrostatics leads to bad +
    In GROMACS, normal reaction-field electrostatics with +cutoff-scheme=group leads to bad energy conservation. Reaction-Field-zero solves this by making the potential zero beyond the cut-off. It can only be used with an infinite dielectric constant (epsilon-rf=0), @@ -594,9 +642,8 @@ in the printed manual.
    PME-Switch
    A combination of PME and a switch function for the direct-space part (see above). rcoulomb is allowed to be smaller than rlist. -This is mainly useful constant energy simulations. For constant temperature -simulations the advantage of improved energy conservation -is usually outweighed by the small loss in accuracy of the electrostatics. +This is mainly useful for constant energy simulations (note that using +PME with cutoff-scheme=Verlet will be more efficient).
    PME-User
    @@ -1863,6 +1910,7 @@ reals to your subroutine. Check the inputrec definition in couple-lambda0
    couple-lambda1
    couple-moltype
    +cutoff-scheme
    define
    deform
    delta-lambda
    @@ -1971,6 +2019,7 @@ reals to your subroutine. Check the inputrec definition in userreal3
    userreal4
    vdwtype
    +verlet-buffer-drift
    xtc-grps
    xtc-precision
    zero-temp-time
    diff --git a/src/config.h.cmakein b/src/config.h.cmakein index 81fd750a99..8277706b2c 100644 --- a/src/config.h.cmakein +++ b/src/config.h.cmakein @@ -161,6 +161,9 @@ /* Use the PowerPC hardware 1/sqrt(x) */ #cmakedefine GMX_POWERPC_INVSQRT +/* Use sub-counters */ +#cmakedefine GMX_CYCLE_SUBCOUNTERS + /* Compile with plugin support */ #cmakedefine GMX_USE_PLUGINS @@ -173,6 +176,9 @@ /* Define when Windows threads are used */ #cmakedefine THREAD_WINDOWS +/* Define when thread-MPI atomic operations are available */ +#cmakedefine TMPI_ATOMICS + /* Define for busy wait option */ #cmakedefine TMPI_WAIT_FOR_NO_ONE @@ -197,6 +203,9 @@ /* Enable x86 gcc inline assembly */ #cmakedefine GMX_X86_GCC_INLINE_ASM +/* Use GPU native acceleration */ +#cmakedefine GMX_GPU + /* Define to 1 if the system has the type gmx_bool. */ #cmakedefine HAVE_BOOL @@ -341,12 +350,18 @@ /* Define to 1 if you have the header file */ #cmakedefine HAVE_X86INTRIN_H -/* Define for sched.h (this is for thread_mpi)*/ -#define HAVE_SCHED_H +/* Define to 1 if you have the header */ +#cmakedefine HAVE_SCHED_H /* Define to 1 if you have the vprintf() function. */ #cmakedefine HAVE_VPRINTF +/* Define to 1 if you have the sysconf() function */ +#cmakedefine HAVE_SYSCONF + +/* Define to 1 if you have the sched_setaffinity() function */ +#cmakedefine HAVE_SCHED_SETAFFINITY + /* Bytes in IEEE fp word are in big-endian order if set, little-endian if not. Only relevant when FLOAT_FORMAT_IEEE754 is defined. */ #cmakedefine GMX_IEEE754_BIG_ENDIAN_BYTE_ORDER diff --git a/src/gmxlib/CMakeLists.txt b/src/gmxlib/CMakeLists.txt index fd6bcaa7e9..8f414010bc 100644 --- a/src/gmxlib/CMakeLists.txt +++ b/src/gmxlib/CMakeLists.txt @@ -57,6 +57,12 @@ endif(NOT GMX_EXTERNAL_LAPACK) #endif(GMX_THREAD_MPI) #target_link_libraries(gmx ${GMX_EXTRA_LIBRARIES} ${THREAD_MPI_LIB}) +# apply gcc 4.4.x bug workaround +if(GMX_USE_GCC44_BUG_WORKAROUND) + include(gmxGCC44O3BugWorkaround) + gmx_apply_gcc44_bug_workaround("bondfree.c") +endif() + # Files called xxx_test.c are test drivers with a main() function for module xxx.c, # so they should not be included in the library file(GLOB_RECURSE NOT_GMXLIB_SOURCES *_test.c *\#*) @@ -65,14 +71,21 @@ list(REMOVE_ITEM GMXLIB_SOURCES ${NOT_GMXLIB_SOURCES}) file(GLOB SELECTION_TEST selection/test*) list(REMOVE_ITEM GMXLIB_SOURCES ${SELECTION_TEST}) +# gpu utils + cuda tools module +if(GMX_GPU) + add_subdirectory(cuda_tools) + add_subdirectory(gpu_utils) + set(GMX_GPU_LIBRARIES gpu_utils cuda_tools) +endif() + # NONBONDED_SOURCES is imported from the nonbonded subdirectory. 
add_library(gmx ${GMXLIB_SOURCES} ${BLAS_SOURCES} ${LAPACK_SOURCES} ${THREAD_MPI_SRC} ${NONBONDED_SOURCES}) - -target_link_libraries(gmx ${GMX_EXTRA_LIBRARIES} ${THREAD_LIB} ${OpenMP_SHARED_LINKER_FLAGS}) +target_link_libraries(gmx ${GMX_GPU_LIBRARIES} ${GMX_EXTRA_LIBRARIES} ${THREAD_LIB} ${OpenMP_SHARED_LINKER_FLAGS}) if(USE_VERSION_H) add_dependencies(gmx gmx_version) endif() -set_target_properties(gmx PROPERTIES OUTPUT_NAME "gmx${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION} INSTALL_NAME_DIR "${LIB_INSTALL_DIR}") +set_target_properties(gmx PROPERTIES OUTPUT_NAME "gmx${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION} INSTALL_NAME_DIR "${LIB_INSTALL_DIR}" + COMPILE_FLAGS "${OpenMP_C_FLAGS}") install(TARGETS gmx DESTINATION ${LIB_INSTALL_DIR} COMPONENT libraries) diff --git a/src/gmxlib/bondfree.c b/src/gmxlib/bondfree.c index 0b1c6799d6..e66d15ce50 100644 --- a/src/gmxlib/bondfree.c +++ b/src/gmxlib/bondfree.c @@ -55,7 +55,11 @@ #include "orires.h" #include "force.h" #include "nonbonded.h" -#include "mdrun.h" + +#if !defined GMX_DOUBLE && defined GMX_X86_SSE2 +#include "gmx_x86_simd_single.h" +#define SSE_PROPER_DIHEDRALS +#endif /* Find a better place for this? */ const int cmap_coeff_matrix[] = { @@ -802,79 +806,88 @@ real bond_angle(const rvec xi,const rvec xj,const rvec xk,const t_pbc *pbc, } real angles(int nbonds, - const t_iatom forceatoms[],const t_iparams forceparams[], - const rvec x[],rvec f[],rvec fshift[], - const t_pbc *pbc,const t_graph *g, - real lambda,real *dvdlambda, - const t_mdatoms *md,t_fcdata *fcd, - int *global_atom_index) + const t_iatom forceatoms[],const t_iparams forceparams[], + const rvec x[],rvec f[],rvec fshift[], + const t_pbc *pbc,const t_graph *g, + real lambda,real *dvdlambda, + const t_mdatoms *md,t_fcdata *fcd, + int *global_atom_index) { - int i,ai,aj,ak,t1,t2,type; - rvec r_ij,r_kj; - real cos_theta,cos_theta2,theta,dVdt,va,vtot; - ivec jt,dt_ij,dt_kj; - - vtot = 0.0; - for(i=0; (i toler) && (iprn > toler)) { - nrkj = nrkj2*gmx_invsqrt(nrkj2); /* 10 */ + nrkj_1 = gmx_invsqrt(nrkj2); /* 10 */ + nrkj_2 = nrkj_1*nrkj_1; /* 1 */ + nrkj = nrkj2*nrkj_1; /* 1 */ a = -ddphi*nrkj/iprm; /* 11 */ svmul(a,m,f_i); /* 3 */ - a = ddphi*nrkj/iprn; /* 11 */ - svmul(a,n,f_l); /* 3 */ + b = ddphi*nrkj/iprn; /* 11 */ + svmul(b,n,f_l); /* 3 */ p = iprod(r_ij,r_kj); /* 5 */ - p /= nrkj2; /* 10 */ + p *= nrkj_2; /* 1 */ q = iprod(r_kl,r_kj); /* 5 */ - q /= nrkj2; /* 10 */ + q *= nrkj_2; /* 1 */ svmul(p,f_i,uvec); /* 3 */ svmul(q,f_l,vvec); /* 3 */ rvec_sub(uvec,vvec,svec); /* 3 */ @@ -1216,6 +1387,73 @@ void do_dih_fup(int i,int j,int k,int l,real ddphi, /* 112 TOTAL */ } +/* As do_dih_fup above, but without shift forces */ +static void +do_dih_fup_noshiftf(int i,int j,int k,int l,real ddphi, + rvec r_ij,rvec r_kj,rvec r_kl, + rvec m,rvec n,rvec f[]) +{ + rvec f_i,f_j,f_k,f_l; + rvec uvec,vvec,svec,dx_jl; + real iprm,iprn,nrkj,nrkj2,nrkj_1,nrkj_2; + real a,b,p,q,toler; + ivec jt,dt_ij,dt_kj,dt_lj; + + iprm = iprod(m,m); /* 5 */ + iprn = iprod(n,n); /* 5 */ + nrkj2 = iprod(r_kj,r_kj); /* 5 */ + toler = nrkj2*GMX_REAL_EPS; + if ((iprm > toler) && (iprn > toler)) { + nrkj_1 = gmx_invsqrt(nrkj2); /* 10 */ + nrkj_2 = nrkj_1*nrkj_1; /* 1 */ + nrkj = nrkj2*nrkj_1; /* 1 */ + a = -ddphi*nrkj/iprm; /* 11 */ + svmul(a,m,f_i); /* 3 */ + b = ddphi*nrkj/iprn; /* 11 */ + svmul(b,n,f_l); /* 3 */ + p = iprod(r_ij,r_kj); /* 5 */ + p *= nrkj_2; /* 1 */ + q = iprod(r_kl,r_kj); /* 5 */ + q *= nrkj_2; /* 1 */ + svmul(p,f_i,uvec); /* 3 */ + svmul(q,f_l,vvec); /* 3 */ + rvec_sub(uvec,vvec,svec); /* 3 */ + 
rvec_sub(f_i,svec,f_j); /* 3 */ + rvec_add(f_l,svec,f_k); /* 3 */ + rvec_inc(f[i],f_i); /* 3 */ + rvec_dec(f[j],f_j); /* 3 */ + rvec_dec(f[k],f_k); /* 3 */ + rvec_inc(f[l],f_l); /* 3 */ + } +} + +/* As do_dih_fup_noshiftf above, but with pre-calculated pre-factors */ +static void +do_dih_fup_noshiftf_precalc(int i,int j,int k,int l,real ddphi, + real nrkj_m2,real nrkj_n2, + real p,real q, + rvec m,rvec n,rvec f[]) +{ + rvec f_i,f_j,f_k,f_l; + rvec uvec,vvec,svec,dx_jl; + real a,b,toler; + ivec jt,dt_ij,dt_kj,dt_lj; + + a = -ddphi*nrkj_m2; + svmul(a,m,f_i); + b = ddphi*nrkj_n2; + svmul(b,n,f_l); + svmul(p,f_i,uvec); + svmul(q,f_l,vvec); + rvec_sub(uvec,vvec,svec); + rvec_sub(f_i,svec,f_j); + rvec_add(f_l,svec,f_k); + rvec_inc(f[i],f_i); + rvec_dec(f[j],f_j); + rvec_dec(f[k],f_k); + rvec_inc(f[l],f_l); +} + real dopdihs(real cpA,real cpB,real phiA,real phiB,int mult, real phi,real lambda,real *V,real *F) @@ -1242,6 +1480,36 @@ real dopdihs(real cpA,real cpB,real phiA,real phiB,int mult, /* That was 40 flops */ } +static void +dopdihs_noener(real cpA,real cpB,real phiA,real phiB,int mult, + real phi,real lambda,real *F) +{ + real mdphi,sdphi,ddphi; + real L1 = 1.0 - lambda; + real ph0 = (L1*phiA + lambda*phiB)*DEG2RAD; + real cp = L1*cpA + lambda*cpB; + + mdphi = mult*phi - ph0; + sdphi = sin(mdphi); + ddphi = -cp*mult*sdphi; + + *F = ddphi; + + /* That was 20 flops */ +} + +static void +dopdihs_mdphi(real cpA,real cpB,real phiA,real phiB,int mult, + real phi,real lambda,real *cp,real *mdphi) +{ + real L1 = 1.0 - lambda; + real ph0 = (L1*phiA + lambda*phiB)*DEG2RAD; + + *cp = L1*cpA + lambda*cpB; + + *mdphi = mult*phi - ph0; +} + static real dopdihs_min(real cpA,real cpB,real phiA,real phiB,int mult, real phi,real lambda,real *V,real *F) /* similar to dopdihs, except for a minus sign * @@ -1327,6 +1595,153 @@ void make_dp_periodic(real *dp) /* 1 flop? */ return; } +/* As pdihs above, but without calculating energies and shift forces */ +static void +pdihs_noener(int nbonds, + const t_iatom forceatoms[],const t_iparams forceparams[], + const rvec x[],rvec f[], + const t_pbc *pbc,const t_graph *g, + real lambda, + const t_mdatoms *md,t_fcdata *fcd, + int *global_atom_index) +{ + int i,type,ai,aj,ak,al; + int t1,t2,t3; + rvec r_ij,r_kj,r_kl,m,n; + real phi,sign,ddphi_tot,ddphi; + + for(i=0; (iF_GB14)) + { + nb = idef->il[ftype].nr; + if (nb > 0) + { + nat1 = interaction_function[ftype].nratoms + 1; + + /* Divide this interaction equally over the threads. + * This is not stored: should match division in calc_bonds. + */ + nb0 = (((nb/nat1)* t )/nt)*nat1; + nb1 = (((nb/nat1)*(t+1))/nt)*nat1; + + for(i=nb0; iil[ftype].iatoms[i+a]>>shift)); + } + } + } + } } - else + + return mask; +} + +void init_bonded_thread_force_reduction(t_forcerec *fr, + const t_idef *idef) +{ +#define MAX_BLOCK_BITS 32 + int t; + int ctot,c,b; + + if (fr->nthreads <= 1) { - efptFTYPE = efptBONDED; + fr->red_nblock = 0; + + return; } - if (ftypeF_GB14) + /* We divide the force array in a maximum of 32 blocks. + * Minimum force block reduction size is 2^6=64. + */ + fr->red_ashift = 6; + while (fr->natoms_force > (int)(MAX_BLOCK_BITS*(1U<red_ashift))) { - if (interaction_function[ftype].flags & IF_BOND && - !(ftype == F_CONNBONDS || ftype == F_POSRES)) + fr->red_ashift++; + } + if (debug) + { + fprintf(debug,"bonded force buffer block atom shift %d bits\n", + fr->red_ashift); + } + + /* Determine to which blocks each thread's bonded force calculation + * contributes. Store this is a mask for each thread. 
+ */ +#pragma omp parallel for num_threads(fr->nthreads) schedule(static) + for(t=1; tnthreads; t++) + { + fr->f_t[t].red_mask = + calc_bonded_reduction_mask(idef,fr->red_ashift,t,fr->nthreads); + } + + /* Determine the maximum number of blocks we need to reduce over */ + fr->red_nblock = 0; + ctot = 0; + for(t=0; tnthreads; t++) + { + c = 0; + for(b=0; bil[ftype].nr; - iatoms = idef->il[ftype].iatoms; - if (nbonds > 0) + if (fr->f_t[t].red_mask & (1U< F_LJC_PAIRS_NB) + fr->red_nblock = max(fr->red_nblock,b+1); + c++; + } + } + if (debug) + { + fprintf(debug,"thread %d flags %x count %d\n", + t,fr->f_t[t].red_mask,c); + } + ctot += c; + } + if (debug) + { + fprintf(debug,"Number of blocks to reduce: %d of size %d\n", + fr->red_nblock,1<red_ashift); + fprintf(debug,"Reduction density %.2f density/#thread %.2f\n", + ctot*(1<red_ashift)/(double)fr->natoms_force, + ctot*(1<red_ashift)/(double)(fr->natoms_force*fr->nthreads)); + } +} + +static void zero_thread_forces(f_thread_t *f_t,int n, + int nblock,int blocksize) +{ + int b,a0,a1,a,i,j; + + if (n > f_t->f_nalloc) + { + f_t->f_nalloc = over_alloc_large(n); + srenew(f_t->f,f_t->f_nalloc); + } + + if (f_t->red_mask != 0) + { + for(b=0; bred_mask && (1U<iparams,&idef->cmap_grid, - (const rvec*)x,f,fr->fshift, - pbc,g,lambda[efptFTYPE],&(dvdl[efptFTYPE]), - md,fcd,global_atom_index); - } - else - { - v = interaction_function[ftype].ifunc(nbonds,iatoms, - idef->iparams, - (const rvec*)x,f,fr->fshift, - pbc,g,lambda[efptFTYPE],&(dvdl[efptFTYPE]), - md,fcd,global_atom_index); - } - enerd->dvdl_nonlin[efptFTYPE] += dvdl[efptFTYPE]; - if (bPrintSepPot) - { - fprintf(fplog," %-23s #%4d V %12.5e dVdl %12.5e\n", - interaction_function[ftype].longname, - nbonds/nat1,v,lambda[efptFTYPE]); - } + clear_rvec(f_t->f[a]); } - else + } + } + } + for(i=0; ifshift[i]); + } + for(i=0; iener[i] = 0; + } + for(i=0; igrpp.nener; j++) + { + f_t->grpp.ener[i][j] = 0; + } + } + for(i=0; idvdl[i] = 0; + } +} + +static void reduce_thread_force_buffer(int n,rvec *f, + int nthreads,f_thread_t *f_t, + int nblock,int block_size) +{ + /* The max thread number is arbitrary, + * we used a fixed number to avoid memory management. + * Using more than 16 threads is probably never useful performance wise. + */ +#define MAX_BONDED_THREADS 256 + int b; + + if (nthreads > MAX_BONDED_THREADS) + { + gmx_fatal(FARGS,"Can not reduce bonded forces on more than %d threads", + MAX_BONDED_THREADS); + } + + /* This reduction can run on any number of threads, + * independently of nthreads. 
+ */ +#pragma omp parallel for num_threads(nthreads) schedule(static) + for(b=0; b 0) + { + /* Reduce force buffers for threads that contribute */ + a0 = b *block_size; + a1 = (b+1)*block_size; + a1 = min(a1,n); + for(a=a0; aiparams, - (const rvec*)x,f,fr->fshift, - pbc,g,lambda,dvdl, - md,fr,&enerd->grpp,global_atom_index); - enerd->dvdl_nonlin[efptCOUL] += dvdl[efptCOUL]; - enerd->dvdl_nonlin[efptVDW] += dvdl[efptVDW]; + rvec_inc(f[a],fp[fb][a]); + } + } + } + } +} - if (bPrintSepPot) - { - fprintf(fplog," %-5s + %-15s #%4d dVdl %12.5e\n", - interaction_function[ftype].longname, - interaction_function[F_LJ14].longname,nbonds/nat1,dvdl[efptVDW]); - fprintf(fplog," %-5s + %-15s #%4d dVdl %12.5e\n", - interaction_function[ftype].longname, - interaction_function[F_COUL14].longname,nbonds/nat1,dvdl[efptCOUL]); - } +static void reduce_thread_forces(int n,rvec *f,rvec *fshift, + real *ener,gmx_grppairener_t *grpp,real *dvdl, + int nthreads,f_thread_t *f_t, + int nblock,int block_size, + gmx_bool bCalcEnerVir, + gmx_bool bDHDL) +{ + if (nblock > 0) + { + /* Reduce the bonded force buffer */ + reduce_thread_force_buffer(n,f,nthreads,f_t,nblock,block_size); + } + + /* When necessary, reduce energy and virial using one thread only */ + if (bCalcEnerVir) + { + int t,i,j; + + for(i=0; iener[i][j] += f_t[t].grpp.ener[i][j]; } - if (ind != -1) + } + } + if (bDHDL) + { + for(i=0; iil[ftype].nr/nat1; + iatoms = idef->il[ftype].iatoms; + + nb0 = ((nbonds* thread )/(fr->nthreads))*nat1; + nbn = ((nbonds*(thread+1))/(fr->nthreads))*nat1 - nb0; + + if (!IS_LISTED_LJ_C(ftype)) + { + if(ftype==F_CMAP) + { + v = cmap_dihs(nbn,iatoms+nb0, + idef->iparams,&idef->cmap_grid, + (const rvec*)x,f,fshift, + pbc,g,lambda[efptFTYPE],&(dvdl[efptFTYPE]), + md,fcd,global_atom_index); + } + else if (ftype == F_PDIHS && + !bCalcEnerVir && fr->efep==efepNO) + { + /* No energies, shift forces, dvdl */ +#ifndef SSE_PROPER_DIHEDRALS + pdihs_noener +#else + pdihs_noener_sse +#endif + (nbn,idef->il[ftype].iatoms+nb0, + idef->iparams, + (const rvec*)x,f, + pbc,g,lambda[efptFTYPE],md,fcd, + global_atom_index); + v = 0; + dvdl[efptFTYPE] = 0; + } + else + { + v = interaction_function[ftype].ifunc(nbn,iatoms+nb0, + idef->iparams, + (const rvec*)x,f,fshift, + pbc,g,lambda[efptFTYPE],&(dvdl[efptFTYPE]), + md,fcd,global_atom_index); + } + enerd->dvdl_nonlin[efptFTYPE] += dvdl[efptFTYPE]; + if (bPrintSepPot) + { + fprintf(fplog," %-23s #%4d V %12.5e dVdl %12.5e\n", + interaction_function[ftype].longname, + nbonds/nat1,v,lambda[efptFTYPE]); + } + } + else + { + v = do_listed_vdw_q(ftype,nbn,iatoms+nb0, + idef->iparams, + (const rvec*)x,f,fshift, + pbc,g,lambda,dvdl, + md,fr,grpp,global_atom_index); + enerd->dvdl_nonlin[efptCOUL] += dvdl[efptCOUL]; + enerd->dvdl_nonlin[efptVDW] += dvdl[efptVDW]; + + if (bPrintSepPot) + { + fprintf(fplog," %-5s + %-15s #%4d dVdl %12.5e\n", + interaction_function[ftype].longname, + interaction_function[F_LJ14].longname,nbonds/nat1,dvdl[efptVDW]); + fprintf(fplog," %-5s + %-15s #%4d dVdl %12.5e\n", + interaction_function[ftype].longname, + interaction_function[F_COUL14].longname,nbonds/nat1,dvdl[efptCOUL]); + } + } + if (ind != -1 && thread == 0) + { + inc_nrnb(nrnb,ind,nbonds); + } + } + return v; } -/* WARNING! THIS FUNCTION MUST EXACTLY TRACK THE calc_one_bond +/* WARNING! THIS FUNCTION MUST EXACTLY TRACK THE calc function, or horrible things will happen when doing free energy calculations! 
In a good coding world, this would not be a different function, but for speed reasons, it needs to be made a @@ -2913,13 +3616,13 @@ real calc_one_bond(FILE *fplog,int ftype, const t_idef *idef, to reduce duplication. */ -real calc_one_bond_foreign(FILE *fplog,int ftype, const t_idef *idef, - rvec x[], rvec f[], t_forcerec *fr, - const t_pbc *pbc,const t_graph *g, - gmx_enerdata_t *enerd, t_nrnb *nrnb, - real *lambda, real *dvdl, - const t_mdatoms *md,t_fcdata *fcd, - int *global_atom_index, gmx_bool bPrintSepPot) +static real calc_one_bond_foreign(FILE *fplog,int ftype, const t_idef *idef, + rvec x[], rvec f[], t_forcerec *fr, + const t_pbc *pbc,const t_graph *g, + gmx_enerdata_t *enerd, t_nrnb *nrnb, + real *lambda, real *dvdl, + const t_mdatoms *md,t_fcdata *fcd, + int *global_atom_index, gmx_bool bPrintSepPot) { int ind,nat1,nbonds,efptFTYPE,nbonds_np; real v=0; @@ -2946,7 +3649,7 @@ real calc_one_bond_foreign(FILE *fplog,int ftype, const t_idef *idef, iatoms = idef->il[ftype].iatoms + nbonds_np; if (nbonds > 0) { - if (ftype < F_LJ14 || ftype > F_LJC_PAIRS_NB) + if (!IS_LISTED_LJ_C(ftype)) { if(ftype==F_CMAP) { @@ -2993,14 +3696,18 @@ void calc_bonds(FILE *fplog,const gmx_multisim_t *ms, const t_mdatoms *md, t_fcdata *fcd,int *global_atom_index, t_atomtypes *atype, gmx_genborn_t *born, + int force_flags, gmx_bool bPrintSepPot,gmx_large_int_t step) { - int i,ftype,nbonds,ind,nat; + gmx_bool bCalcEnerVir; + int i; real v,dvdl[efptNR],dvdl_dum[efptNR]; /* The dummy array is to have a place to store the dhdl at other values of lambda, which will be thrown away in the end*/ - real *epot; const t_pbc *pbc_null; char buf[22]; + int thread; + + bCalcEnerVir = (force_flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)); for (i=0;iterm; - /* Do pre force calculation stuff which might require communication */ - if (idef->il[F_ORIRES].nr) { - epot[F_ORIRESDEV] = calc_orires_dev(ms,idef->il[F_ORIRES].nr, - idef->il[F_ORIRES].iatoms, - idef->iparams,md,(const rvec*)x, - pbc_null,fcd,hist); + if (idef->il[F_ORIRES].nr) + { + enerd->term[F_ORIRESDEV] = + calc_orires_dev(ms,idef->il[F_ORIRES].nr, + idef->il[F_ORIRES].iatoms, + idef->iparams,md,(const rvec*)x, + pbc_null,fcd,hist); } - if (idef->il[F_DISRES].nr) { + if (idef->il[F_DISRES].nr) + { calc_disres_R_6(ms,idef->il[F_DISRES].nr, idef->il[F_DISRES].iatoms, idef->iparams,(const rvec*)x,pbc_null, fcd,hist); } - /* Loop over all bonded force types to calculate the bonded forces */ - for(ftype=0; (ftypenthreads) schedule(static) + for(thread=0; threadnthreads; thread++) { - v = calc_one_bond(fplog,ftype,idef,x, - f,fr,pbc_null,g,enerd,nrnb,lambda,dvdl, - md,fcd,global_atom_index,bPrintSepPot); - epot[ftype] += v; + int ftype,nbonds,ind,nat1; + real *epot,v; + /* thread stuff */ + rvec *ft,*fshift; + real *dvdlt; + gmx_grppairener_t *grpp; + int nb0,nbn; + + if (thread == 0) + { + ft = f; + fshift = fr->fshift; + epot = enerd->term; + grpp = &enerd->grpp; + dvdlt = dvdl; + } + else + { + zero_thread_forces(&fr->f_t[thread],fr->natoms_force, + fr->red_nblock,1<red_ashift); + + ft = fr->f_t[thread].f; + fshift = fr->f_t[thread].fshift; + epot = fr->f_t[thread].ener; + grpp = &fr->f_t[thread].grpp; + dvdlt = fr->f_t[thread].dvdl; + } + /* Loop over all bonded force types to calculate the bonded forces */ + for(ftype=0; (ftypeil[ftype].nr > 0 && + (interaction_function[ftype].flags & IF_BOND) && + (ftype < F_GB12 || ftype > F_GB14) && + !(ftype == F_CONNBONDS || ftype == F_POSRES)) + { + v = calc_one_bond(fplog,thread,ftype,idef,x, + 
ft,fshift,fr,pbc_null,g,enerd,grpp, + nrnb,lambda,dvdlt, + md,fcd,bCalcEnerVir, + global_atom_index,bPrintSepPot); + epot[ftype] += v; + } + } + } + if (fr->nthreads > 1) + { + reduce_thread_forces(fr->natoms_force,f,fr->fshift, + enerd->term,&enerd->grpp,dvdl, + fr->nthreads,fr->f_t, + fr->red_nblock,1<red_ashift, + bCalcEnerVir, + force_flags & GMX_FORCE_DHDL); } + /* Copy the sum of violations for the distance restraints from fcd */ if (fcd) { - epot[F_DISRESVIOL] = fcd->disres.sumviol; + enerd->term[F_DISRESVIOL] = fcd->disres.sumviol; + } } diff --git a/src/gmxlib/calcgrid.c b/src/gmxlib/calcgrid.c index 7d77c4a352..ede8b4dba9 100644 --- a/src/gmxlib/calcgrid.c +++ b/src/gmxlib/calcgrid.c @@ -69,7 +69,7 @@ real calc_grid(FILE *fp,matrix box,real gr_sp, rvec spacing; real max_spacing; - if (gr_sp <= 0) + if ((*nx <= 0 || *ny <= 0 || *nz <= 0) && gr_sp <= 0) { gmx_fatal(FARGS,"invalid fourier grid spacing: %g",gr_sp); } diff --git a/src/gmxlib/checkpoint.c b/src/gmxlib/checkpoint.c index 2b314ec75d..927f82fa38 100644 --- a/src/gmxlib/checkpoint.c +++ b/src/gmxlib/checkpoint.c @@ -49,7 +49,6 @@ #include "statutil.h" #include "txtdump.h" #include "vec.h" -#include "mdrun.h" #include "network.h" #include "gmx_random.h" #include "checkpoint.h" diff --git a/src/gmxlib/cuda_tools/CMakeLists.txt b/src/gmxlib/cuda_tools/CMakeLists.txt new file mode 100644 index 0000000000..e008c50795 --- /dev/null +++ b/src/gmxlib/cuda_tools/CMakeLists.txt @@ -0,0 +1,7 @@ +if(GMX_GPU) + file(GLOB CUDA_TOOLS_SOURCES *.cu) + CUDA_ADD_LIBRARY(cuda_tools STATIC ${CUDA_TOOLS_SOURCES} + OPTIONS + RELWITHDEBINFO -g + DEBUG -g -D_DEBUG_=1) +endif() diff --git a/src/gmxlib/cuda_tools/cudautils.cu b/src/gmxlib/cuda_tools/cudautils.cu new file mode 100644 index 0000000000..606a811692 --- /dev/null +++ b/src/gmxlib/cuda_tools/cudautils.cu @@ -0,0 +1,266 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS development team, + * check out http://www.gromacs.org for more information. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon + */ + +#include + +#include "gmx_fatal.h" +#include "smalloc.h" +#include "typedefs.h" +#include "cudautils.cuh" + +/*** Generic CUDA data operation wrappers ***/ + +/*! Launches synchronous or asynchronous host to device memory copy. 
+ *
+ * The copy is launched in stream s or if not specified, in stream 0.
+ */
+static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,
+                               bool bAsync = false, cudaStream_t s = 0)
+{
+    cudaError_t stat;
+
+    if (h_dest == NULL || d_src == NULL || bytes == 0)
+        return -1;
+
+    if (bAsync)
+    {
+        stat = cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s);
+        CU_RET_ERR(stat, "DtoH cudaMemcpyAsync failed");
+    }
+    else
+    {
+        stat = cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
+        CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
+    }
+
+    return 0;
+}
+
+int cu_copy_D2H(void * h_dest, void * d_src, size_t bytes)
+{
+    return cu_copy_D2H_generic(h_dest, d_src, bytes, false);
+}
+
+/*!
+ *  The copy is launched in stream s or if not specified, in stream 0.
+ */
+int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = 0)
+{
+    return cu_copy_D2H_generic(h_dest, d_src, bytes, true, s);
+}
+
+int cu_copy_D2H_alloc(void ** h_dest, void * d_src, size_t bytes)
+{
+    if (h_dest == NULL || d_src == NULL || bytes == 0)
+        return -1;
+
+    smalloc(*h_dest, bytes);
+
+    return cu_copy_D2H(*h_dest, d_src, bytes);
+}
+
+/*! Launches synchronous or asynchronous host to device memory copy.
+ *
+ *  The copy is launched in stream s or if not specified, in stream 0.
+ */
+static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
+                               bool bAsync = false, cudaStream_t s = 0)
+{
+    cudaError_t stat;
+
+    if (d_dest == NULL || h_src == NULL || bytes == 0)
+        return -1;
+
+    if (bAsync)
+    {
+        stat = cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice, s);
+        CU_RET_ERR(stat, "HtoD cudaMemcpyAsync failed");
+    }
+    else
+    {
+        stat = cudaMemcpy(d_dest, h_src, bytes, cudaMemcpyHostToDevice);
+        CU_RET_ERR(stat, "HtoD cudaMemcpy failed");
+    }
+
+    return 0;
+}
+
+int cu_copy_H2D(void * d_dest, void * h_src, size_t bytes)
+{
+    return cu_copy_H2D_generic(d_dest, h_src, bytes, false);
+}
+
+/*!
+ *  The copy is launched in stream s or if not specified, in stream 0.
+ */
+int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = 0)
+{
+    return cu_copy_H2D_generic(d_dest, h_src, bytes, true, s);
+}
+
+int cu_copy_H2D_alloc(void ** d_dest, void * h_src, size_t bytes)
+{
+    cudaError_t stat;
+
+    if (d_dest == NULL || h_src == NULL || bytes == 0)
+        return -1;
+
+    stat = cudaMalloc(d_dest, bytes);
+    CU_RET_ERR(stat, "cudaMalloc failed in cu_copy_H2D_alloc");
+
+    return cu_copy_H2D(*d_dest, h_src, bytes);
+}
+
+float cu_event_elapsed(cudaEvent_t start, cudaEvent_t end)
+{
+    float       t = 0.0;
+    cudaError_t stat;
+
+    stat = cudaEventElapsedTime(&t, start, end);
+    CU_RET_ERR(stat, "cudaEventElapsedTime failed in cu_event_elapsed");
+
+    return t;
+}
+
+int cu_wait_event(cudaEvent_t e)
+{
+    cudaError_t s;
+
+    s = cudaEventSynchronize(e);
+    CU_RET_ERR(s, "cudaEventSynchronize failed in cu_wait_event");
+
+    return 0;
+}
+
+/*!
+ *  If time != NULL it also calculates the time elapsed between start and end
+ *  and returns this in milliseconds.
+ */
+int cu_wait_event_time(cudaEvent_t end, cudaEvent_t start, float *time)
+{
+    cudaError_t s;
+
+    s = cudaEventSynchronize(end);
+    CU_RET_ERR(s, "cudaEventSynchronize failed in cu_wait_event_time");
+
+    if (time)
+    {
+        *time = cu_event_elapsed(start, end);
+    }
+
+    return 0;
+}
+
+/**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/
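+
+/* Illustrative usage sketch (not part of this commit): the buffered helpers
+ * below implement the device-side variant of the gmx over-allocation idiom:
+ * an allocation size of -1 means "never allocated", growth goes through
+ * over_alloc_large(), and a buffer is only reallocated when a request
+ * outgrows the current capacity. A hypothetical caller re-uploading n
+ * coordinates per step would keep the buffer and both sizes together:
+ *
+ *     static float3 *d_x    = NULL;
+ *     static int     nx     = -1;    // current size in elements
+ *     static int     nalloc = -1;    // capacity; -1 = unallocated
+ *
+ *     cu_realloc_buffered((void **)&d_x, h_x, sizeof(*d_x),
+ *                         &nx, &nalloc, n, stream, true);
+ *
+ * Later calls with n <= nalloc skip the cudaFree/cudaMalloc cycle and only
+ * re-upload the data.
+ */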
+/*!
+ *  If the pointers to the size variables are NULL no resetting happens.
+ */
+void cu_free_buffered(void *d_ptr, int *n, int *nalloc)
+{
+    cudaError_t stat;
+
+    if (d_ptr)
+    {
+        stat = cudaFree(d_ptr);
+        CU_RET_ERR(stat, "cudaFree failed");
+    }
+
+    if (n)
+    {
+        *n = -1;
+    }
+
+    if (nalloc)
+    {
+        *nalloc = -1;
+    }
+}
+
+/*!
+ *  Reallocates the device memory pointed to by d_dest and copies the data
+ *  from the location pointed to by the h_src host-side pointer. Allocation
+ *  is buffered, so freeing is only needed if the previously allocated
+ *  space is not enough.
+ *  The H2D copy is launched in stream s and can be done synchronously or
+ *  asynchronously (the default is the latter).
+ */
+void cu_realloc_buffered(void **d_dest, void *h_src,
+                         size_t type_size,
+                         int *curr_size, int *curr_alloc_size,
+                         int req_size,
+                         cudaStream_t s,
+                         bool bAsync = true)
+{
+    cudaError_t stat;
+
+    if (d_dest == NULL || req_size < 0)
+    {
+        return;
+    }
+
+    /* reallocate only if the data does not fit, i.e. if the allocation
+       size is smaller than the currently requested size */
+    if (req_size > *curr_alloc_size)
+    {
+        /* only free if the array has already been initialized */
+        if (*curr_alloc_size >= 0)
+        {
+            cu_free_buffered(*d_dest, curr_size, curr_alloc_size);
+        }
+
+        *curr_alloc_size = over_alloc_large(req_size);
+
+        stat = cudaMalloc(d_dest, *curr_alloc_size * type_size);
+        CU_RET_ERR(stat, "cudaMalloc failed in cu_realloc_buffered");
+    }
+
+    /* size could have changed without actual reallocation */
+    *curr_size = req_size;
+
+    /* upload to device */
+    if (h_src)
+    {
+        if (bAsync)
+        {
+            cu_copy_H2D_async(*d_dest, h_src, *curr_size * type_size, s);
+        }
+        else
+        {
+            cu_copy_H2D(*d_dest, h_src, *curr_size * type_size);
+        }
+    }
+}
diff --git a/src/gmxlib/cuda_tools/cudautils.cuh b/src/gmxlib/cuda_tools/cudautils.cuh
new file mode 100644
index 0000000000..fe1f47cba7
--- /dev/null
+++ b/src/gmxlib/cuda_tools/cudautils.cuh
@@ -0,0 +1,168 @@
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
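+
+/* Illustrative usage sketch (not part of this commit): the convention the
+ * macros in this header establish is to capture the status of every CUDA
+ * runtime call with CU_RET_ERR(), and to follow kernel launches -- which
+ * return no status -- with CU_LAUNCH_ERR(). Assuming a hypothetical kernel
+ * scale_kernel and device buffer d_buf:
+ *
+ *     stat = cudaMalloc((void **)&d_buf, 256*sizeof(float));
+ *     CU_RET_ERR(stat, "cudaMalloc of d_buf failed");
+ *     scale_kernel<<<1, 256>>>(d_buf);
+ *     CU_LAUNCH_ERR("scale_kernel");
+ */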
+
+#ifndef CUDAUTILS_CUH
+#define CUDAUTILS_CUH
+
+#include
+
+#include "gmx_fatal.h"
+
+/* CUDA library and hardware related defines */
+/* TODO list some constants instead that can be used for consistency checks to
+   detect future devices with features that make the current code incompatible
+   with them (e.g. expected warp size = 32, check against the dev_info->props.warpsize). */
+#define WARP_SIZE 32
+
+/* TODO error checking needs to be rewritten. We have 2 types of error checks needed
+   based on where they occur in the code:
+   - non performance-critical: these errors are unsafe to ignore and must be
+     _always_ checked for, e.g. initializations
+   - performance critical: handling errors might hurt performance, so care needs
+     to be taken when/if we should check for them at all, e.g. in cu_upload_X.
+     However, we should be able to turn the check for these errors on!
+
+   Probably we'll need two sets of the macros below...
+
+ */
+#define CHECK_CUDA_ERRORS
+
+#ifdef CHECK_CUDA_ERRORS
+
+/*! Check for CUDA error on the return status of a CUDA RT API call. */
+#define CU_RET_ERR(status, msg) \
+    do { \
+        if (status != cudaSuccess) \
+        { \
+            gmx_fatal(FARGS, "%s: %s\n", msg, cudaGetErrorString(status)); \
+        } \
+    } while (0)
+
+/*! Check for any previously occurred uncaught CUDA error. */
+#define CU_CHECK_PREV_ERR() \
+    do { \
+        cudaError_t _CU_CHECK_PREV_ERR_status = cudaGetLastError(); \
+        if (_CU_CHECK_PREV_ERR_status != cudaSuccess) { \
+            gmx_warning("Just caught a previously occurred CUDA error (%s), will try to continue.", cudaGetErrorString(_CU_CHECK_PREV_ERR_status)); \
+        } \
+    } while (0)
+
+/*! Check for any previously occurred uncaught CUDA error
+    -- aimed at use after kernel calls. */
+#define CU_LAUNCH_ERR(msg) \
+    do { \
+        cudaError_t _CU_LAUNCH_ERR_status = cudaGetLastError(); \
+        if (_CU_LAUNCH_ERR_status != cudaSuccess) { \
+            gmx_fatal(FARGS, "Error while launching kernel %s: %s\n", msg, cudaGetErrorString(_CU_LAUNCH_ERR_status)); \
+        } \
+    } while (0)
+
+/*! Synchronize with GPU and check for any previously occurred uncaught CUDA error
+    -- aimed at use after kernel calls. */
+#define CU_LAUNCH_ERR_SYNC(msg) \
+    do { \
+        cudaError_t _CU_SYNC_LAUNCH_ERR_status = cudaThreadSynchronize(); \
+        if (_CU_SYNC_LAUNCH_ERR_status != cudaSuccess) { \
+            gmx_fatal(FARGS, "Error while launching kernel %s: %s\n", msg, cudaGetErrorString(_CU_SYNC_LAUNCH_ERR_status)); \
+        } \
+    } while (0)
+
+#else
+
+#define CU_RET_ERR(status, msg) do { } while (0)
+#define CU_CHECK_PREV_ERR() do { } while (0)
+#define CU_LAUNCH_ERR(msg) do { } while (0)
+#define CU_LAUNCH_ERR_SYNC(msg) do { } while (0)
+
+#endif /* CHECK_CUDA_ERRORS */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! CUDA device information. */
+typedef struct cuda_dev_info cuda_dev_info_t;
+struct cuda_dev_info
+{
+    int            id;   /* id of the CUDA device */
+    cudaDeviceProp prop; /* CUDA device properties */
+    int            stat; /* result of the device check */
+};
+
+
+/*! Launches synchronous device to host memory copy. */
+int cu_copy_D2H(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/);
+
+/*! Launches asynchronous device to host memory copy in stream s. */
+int cu_copy_D2H_async(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/);
+
+/*! Allocates host memory and launches synchronous device to host memory copy. */
+int cu_copy_D2H_alloc(void ** /*h_dest*/, void * /*d_src*/, size_t /*bytes*/);
+
+
+/*! Launches synchronous host to device memory copy. */
+int cu_copy_H2D(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/);
+
+/*! Launches asynchronous host to device memory copy in stream s. */
+int cu_copy_H2D_async(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/);
+
+/*! Allocates device memory and launches synchronous host to device memory copy. */
+int cu_copy_H2D_alloc(void ** /*d_dest*/, void * /*h_src*/, size_t /*bytes*/);
+
+/*! Frees device memory and resets the size and allocation size to -1. */
+void cu_free_buffered(void *d_ptr, int *n = NULL, int *nalloc = NULL);
+
+/*! Reallocates the device memory and copies data from the host. */
+void cu_realloc_buffered(void **d_dest, void *h_src,
+                         size_t type_size,
+                         int *curr_size, int *curr_alloc_size,
+                         int req_size,
+                         cudaStream_t s,
+                         bool bAsync);
+
+/*! Waits for event e to complete. */
+int cu_wait_event(cudaEvent_t /*e*/);
+
+/*! Calculates and returns the time elapsed between event start and end. */
+float cu_event_elapsed(cudaEvent_t /*start*/, cudaEvent_t /*end*/);
+
+/*! Waits for event end to complete and calculates the time between start and end. */
+int cu_wait_event_time(cudaEvent_t /*end*/, cudaEvent_t /*begin*/, float * /*time*/);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CUDAUTILS_CUH */
diff --git a/src/gmxlib/cuda_tools/pmalloc_cuda.cu b/src/gmxlib/cuda_tools/pmalloc_cuda.cu
new file mode 100644
index 0000000000..ac80122ecd
--- /dev/null
+++ b/src/gmxlib/cuda_tools/pmalloc_cuda.cu
@@ -0,0 +1,106 @@
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+
+#include
+
+#include "gmx_fatal.h"
+
+#include "cudautils.cuh"
+#include "pmalloc_cuda.h"
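+
+/* Illustrative note (not part of this commit): cudaMemcpyAsync() can only
+ * overlap transfers with computation when the host buffer is page-locked,
+ * which is why host buffers used with the asynchronous copy routines are
+ * allocated with pmalloc()/pfree() rather than malloc()/free(). A minimal
+ * usage sketch (h_buf, d_buf, n and the stream s are hypothetical):
+ *
+ *     float *h_buf;
+ *     pmalloc((void **)&h_buf, n*sizeof(*h_buf));
+ *     // ... fill h_buf ...
+ *     cu_copy_H2D_async(d_buf, h_buf, n*sizeof(*h_buf), s);
+ *     cudaStreamSynchronize(s);   // ensure the copy finished before freeing
+ *     pfree(h_buf);
+ */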
+
+/*! Allocates nbytes of page-locked memory.
+ *  This memory should always be freed using pfree (or with the page-locked
+ *  free functions provided by the CUDA library).
+ */
+void pmalloc(void **h_ptr, size_t nbytes)
+{
+    cudaError_t stat;
+    char        strbuf[STRLEN];
+    int         flag = cudaHostAllocDefault;
+
+    if (nbytes == 0)
+    {
+        *h_ptr = NULL;
+        return;
+    }
+
+    CU_CHECK_PREV_ERR();
+
+    stat = cudaMallocHost(h_ptr, nbytes, flag);
+    sprintf(strbuf, "cudaMallocHost of size %d bytes failed", (int)nbytes);
+    CU_RET_ERR(stat, strbuf);
+}
+
+/*! Allocates nbytes of page-locked memory with write-combining.
+ *  This memory should always be freed using pfree (or with the page-locked
+ *  free functions provided by the CUDA library).
+ */
+void pmalloc_wc(void **h_ptr, size_t nbytes)
+{
+    cudaError_t stat;
+    char        strbuf[STRLEN];
+    /* bitwise OR: these are flag bits (a logical || would yield the wrong flag) */
+    int         flag = cudaHostAllocDefault | cudaHostAllocWriteCombined;
+
+    if (nbytes == 0)
+    {
+        *h_ptr = NULL;
+        return;
+    }
+
+    CU_CHECK_PREV_ERR();
+
+    stat = cudaMallocHost(h_ptr, nbytes, flag);
+    sprintf(strbuf, "cudaMallocHost of size %d bytes failed", (int)nbytes);
+    CU_RET_ERR(stat, strbuf);
+}
+
+/*! Frees page-locked memory allocated with pmalloc.
+ *  This function can also safely be called with a pointer to page-locked
+ *  memory that was allocated directly with CUDA API calls.
+ */
+void pfree(void *h_ptr)
+{
+    cudaError_t stat;
+
+    if (h_ptr == NULL)
+    {
+        return;
+    }
+
+    CU_CHECK_PREV_ERR();
+
+    stat = cudaFreeHost(h_ptr);
+    CU_RET_ERR(stat, "cudaFreeHost failed");
+}
diff --git a/src/gmxlib/cuda_tools/vectype_ops.cuh b/src/gmxlib/cuda_tools/vectype_ops.cuh
new file mode 100644
index 0000000000..a2657b240b
--- /dev/null
+++ b/src/gmxlib/cuda_tools/vectype_ops.cuh
@@ -0,0 +1,156 @@
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon + */ + +#ifndef VECTYPE_OPS_CUH +#define VECTYPE_OPS_CUH + +/**** float3 ****/ +inline __host__ __device__ float3 make_float3(float s) +{ + return make_float3(s, s, s); +} +inline __host__ __device__ float3 make_float3(float4 a) +{ + return make_float3(a.x, a.y, a.z); +} +inline __host__ __device__ float3 operator-(float3 &a) +{ + return make_float3(-a.x, -a.y, -a.z); +} +inline __host__ __device__ float3 operator+(float3 a, float3 b) +{ + return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ float3 operator-(float3 a, float3 b) +{ + return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ float3 operator*(float3 a, float k) +{ + return make_float3(k * a.x, k * a.y, k * a.z); +} +inline __host__ __device__ float3 operator*(float k, float3 a) +{ + return make_float3(k * a.x, k * a.y, k * a.z); +} +inline __host__ __device__ void operator+=(float3 &a, float3 b) +{ + a.x += b.x; a.y += b.y; a.z += b.z; +} +inline __host__ __device__ void operator+=(float3 &a, float4 b) +{ + a.x += b.x; a.y += b.y; a.z += b.z; +} +inline __host__ __device__ void operator-=(float3 &a, float3 b) +{ + a.x -= b.x; a.y -= b.y; a.z -= b.z; +} +inline __host__ __device__ float norm(float3 a) +{ + return sqrt(a.x * a.x + a.y * a.y + a.z * a.z); +} +inline __host__ __device__ float norm2(float3 a) +{ + return (a.x * a.x + a.y * a.y + a.z * a.z); +} +inline __host__ __device__ float dist3(float3 a, float3 b) +{ + return norm(b - a); +} +inline __host__ __device__ float3 operator*(float3 a, float3 b) +{ + return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(float3 &a, float3 b) +{ + a.x *= b.x; a.y *= b.y; a.z *= b.z; +} +inline __device__ void atomicAdd(float3 *addr, float3 val) +{ + atomicAdd(&addr->x, val.x); + atomicAdd(&addr->y, val.y); + atomicAdd(&addr->z, val.z); +} +/****************************************************************/ + +/**** float4 ****/ +inline __host__ __device__ float4 make_float4(float s) +{ + return make_float4(s, s, s, s); +} +inline __host__ __device__ float4 make_float4(float3 a) +{ + return make_float4(a.x, a.y, a.z, 0.0f); +} +inline __host__ __device__ float4 operator+(float4 a, float4 b) +{ + return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ float4 operator+(float4 a, float3 b) +{ + return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w); +} +inline __host__ __device__ float4 operator-(float4 a, float4 b) +{ + return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ float4 operator*(float4 a, float k) +{ + return make_float4(k * a.x, k * a.y, k * a.z, k * a.w); +} +inline __host__ __device__ void operator+=(float4 &a, float4 b) +{ + a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; +} +inline __host__ __device__ void operator+=(float4 &a, float3 b) +{ + a.x += b.x; a.y += b.y; a.z += b.z; +} +inline __host__ __device__ void operator-=(float4 &a, float3 b) +{ + a.x -= b.x; a.y -= b.y; a.z -= b.z; +} + +inline __host__ __device__ float norm(float4 a) +{ + return sqrt(a.x * a.x + a.y * a.y + a.z * a.z + a.w * a.w); +} + +inline __host__ __device__ float dist3(float4 a, float4 b) +{ + return norm(b - a); +} + +#endif /* VECTYPE_OPS_CUH */ diff --git a/src/gmxlib/disre.c b/src/gmxlib/disre.c index 0551026fd5..dc7bd69ca1 100644 --- a/src/gmxlib/disre.c +++ 
b/src/gmxlib/disre.c @@ -192,15 +192,15 @@ void init_disres(FILE *fplog,const gmx_mtop_t *mtop, } check_multi_int(fplog,cr->ms,dd->nsystems, "the number of systems per ensemble"); - if (dd->nsystems <= 0 || cr->ms->nsim % dd->nsystems != 0) + /* We use to allow any value of nsystems which was a divisor + * of ms->nsim. But this required an extra communicator which + * was stored in t_fcdata. This pulled in mpi.h in nearly all C files. + */ + if (!(cr->ms->nsim == 1 || cr->ms->nsim == dd->nsystems)) { - gmx_fatal(FARGS,"The number of systems %d is not divisible by the number of systems per ensemble %d\n",cr->ms->nsim,dd->nsystems); + gmx_fatal(FARGS,"GMX_DISRE_ENSEMBLE_SIZE (%d) is not equal to 1 or the number of systems (option -multi) %d",dd->nsystems,cr->ms->nsim); } - /* Split the inter-master communicator into different ensembles */ - MPI_Comm_split(cr->ms->mpi_comm_masters, - cr->ms->sim/dd->nsystems, - cr->ms->sim, - &dd->mpi_comm_ensemble); + if (fplog) { fprintf(fplog,"Our ensemble consists of systems:"); @@ -343,7 +343,7 @@ void calc_disres_R_6(const gmx_multisim_t *ms, #ifdef GMX_MPI if (dd->nsystems > 1) { - gmx_sum_comm(2*dd->nres,Rt_6,dd->mpi_comm_ensemble); + gmx_sum_sim(2*dd->nres,Rt_6,ms); } #endif } diff --git a/src/gmxlib/ewald_util.c b/src/gmxlib/ewald_util.c index cd5590347f..e688418f62 100644 --- a/src/gmxlib/ewald_util.c +++ b/src/gmxlib/ewald_util.c @@ -78,22 +78,22 @@ real calc_ewaldcoeff(real rc,real dtol) real ewald_LRcorrection(FILE *fplog, int start,int end, - t_commrec *cr,t_forcerec *fr, + t_commrec *cr,int thread,t_forcerec *fr, real *chargeA,real *chargeB, + gmx_bool calc_excl_corr, t_blocka *excl,rvec x[], matrix box,rvec mu_tot[], int ewald_geometry,real epsilon_surface, - real lambda,real *dvdlambda, - real *vdip,real *vcharge) + rvec *f,tensor vir, + real lambda,real *dvdlambda) { int i,i1,i2,j,k,m,iv,jv,q; atom_id *AA; double q2sumA,q2sumB,Vexcl,dvdl_excl; /* Necessary for precision */ real one_4pi_eps; real v,vc,qiA,qiB,dr,dr2,rinv,fscal,enercorr; - real VselfA,VselfB=0,Vcharge[2],Vdipole[2],rinv2,ewc=fr->ewaldcoeff,ewcdr; + real Vself[2],Vdipole[2],rinv2,ewc=fr->ewaldcoeff,ewcdr; rvec df,dx,mutot[2],dipcorrA,dipcorrB; - rvec *f=fr->f_novirsum; tensor dxdf; real vol = box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]; real L1,dipole_coeff,qqA,qqB,qqL,vr0; @@ -106,7 +106,6 @@ real ewald_LRcorrection(FILE *fplog, #else double isp=0.564189583547756; #endif - int niat; gmx_bool bFreeEnergy = (chargeB != NULL); gmx_bool bMolPBC = fr->bMolPBC; @@ -120,8 +119,6 @@ real ewald_LRcorrection(FILE *fplog, q2sumB = 0; Vdipole[0] = 0; Vdipole[1] = 0; - Vcharge[0] = 0; - Vcharge[1] = 0; L1 = 1.0-lambda; /* Note that we have to transform back to gromacs units, since @@ -160,21 +157,17 @@ real ewald_LRcorrection(FILE *fplog, fprintf(debug,"mutot = %8.3f %8.3f %8.3f\n", mutot[0][XX],mutot[0][YY],mutot[0][ZZ]); } - - if (DOMAINDECOMP(cr)) - niat = excl->nr; - else - niat = end; clear_mat(dxdf); - if (!bFreeEnergy) { - for(i=start; (iindex[i]; i2 = excl->index[i+1]; - if (i < end) - q2sumA += chargeA[i]*chargeA[i]; /* Loop over excluded neighbours */ for(j=i1; (jindex[i]; i2 = excl->index[i+1]; - if (i < end) { - q2sumA += chargeA[i]*chargeA[i]; - q2sumB += chargeB[i]*chargeB[i]; - } /* Loop over excluded neighbours */ for(j=i1; (jvir_el_recip[iv][jv] += 0.5*dxdf[iv][jv]; + vir[iv][jv] += 0.5*dxdf[iv][jv]; + + Vself[0] = 0; + Vself[1] = 0; /* Global corrections only on master process */ - if (MASTER(cr)) { + if (MASTER(cr) && thread == 0) { for(q=0; q<(bFreeEnergy ? 
2 : 1); q++) { - /* Apply charge correction */ - /* use vc as a dummy variable */ - vc = fr->qsum[q]*fr->qsum[q]*M_PI*one_4pi_eps/(2.0*vol*vol*ewc*ewc); - for(iv=0; (ivvir_el_recip[iv][iv] += - (bFreeEnergy ? (q==0 ? L1*vc : lambda*vc) : vc); - Vcharge[q] = -vol*vc; + if (calc_excl_corr) { + /* Self-energy correction */ + Vself[q] = ewc*one_4pi_eps*fr->q2sum[q]/sqrt(M_PI); + } /* Apply surface dipole correction: * correction = dipole_coeff * (dipole)^2 @@ -358,31 +352,24 @@ real ewald_LRcorrection(FILE *fplog, } } } - - VselfA = ewc*one_4pi_eps*q2sumA/sqrt(M_PI); if (!bFreeEnergy) { - *vcharge = Vcharge[0]; - *vdip = Vdipole[0]; - enercorr = *vcharge + *vdip - VselfA - Vexcl; + enercorr = Vdipole[0] - Vself[0] - Vexcl; } else { - VselfB = ewc*one_4pi_eps*q2sumB/sqrt(M_PI); - *vcharge = L1*Vcharge[0] + lambda*Vcharge[1]; - *vdip = L1*Vdipole[0] + lambda*Vdipole[1]; - enercorr = *vcharge + *vdip - (L1*VselfA + lambda*VselfB) - Vexcl; - *dvdlambda += Vdipole[1] + Vcharge[1] - VselfB - - (Vdipole[0] + Vcharge[0] - VselfA) - dvdl_excl; + enercorr = L1*(Vdipole[0] - Vself[0]) + + lambda*(Vdipole[1] - Vself[1]) + - Vexcl; + *dvdlambda += Vdipole[1] - Vself[1] + - (Vdipole[0] - Vself[0]) - dvdl_excl; } if (debug) { fprintf(debug,"Long Range corrections for Ewald interactions:\n"); fprintf(debug,"start=%d,natoms=%d\n",start,end-start); fprintf(debug,"q2sum = %g, Vself=%g\n", - L1*q2sumA+lambda*q2sumB,L1*VselfA+lambda*VselfB); + L1*q2sumA+lambda*q2sumB,L1*Vself[0]+lambda*Vself[1]); fprintf(debug,"Long Range correction: Vexcl=%g\n",Vexcl); - if (MASTER(cr)) { - fprintf(debug,"Total charge correction: Vcharge=%g\n", - L1*Vcharge[0]+lambda*Vcharge[1]); + if (MASTER(cr) && thread == 0) { if (epsilon_surface > 0 || ewald_geometry == eewg3DC) { fprintf(debug,"Total dipole correction: Vdipole=%g\n", L1*Vdipole[0]+lambda*Vdipole[1]); @@ -393,3 +380,45 @@ real ewald_LRcorrection(FILE *fplog, /* Return the correction to the energy */ return enercorr; } + +real ewald_charge_correction(t_commrec *cr,t_forcerec *fr,real lambda, + matrix box, + real *dvdlambda,tensor vir) + +{ + real vol,fac,qs2A,qs2B,vc,enercorr; + int d; + + if (MASTER(cr)) + { + /* Apply charge correction */ + vol = box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]; + + fac = M_PI*ONE_4PI_EPS0/(fr->epsilon_r*2.0*vol*vol*sqr(fr->ewaldcoeff)); + + qs2A = fr->qsum[0]*fr->qsum[0]; + qs2B = fr->qsum[1]*fr->qsum[1]; + + vc = (qs2A*(1 - lambda) + qs2B*lambda)*fac; + + enercorr = -vol*vc; + + *dvdlambda += -vol*(qs2B - qs2A)*fac; + + for(d=0; d +#endif + +#ifdef HAVE_SCHED_H +#define _GNU_SOURCE +#include +#endif + +#include +#include +#include +#include +#ifdef _MSC_VER +/* MSVC definition for __cpuid() */ +#include +#endif +#ifdef HAVE_UNISTD_H +/* sysconf() definition */ +#include +#endif + + + + +#include "gmx_cpuid.h" + + +/* Global constant character strings corresponding to our enumerated types */ +const char * +gmx_cpuid_vendor_string[GMX_CPUID_NVENDORS] = +{ + "CannotDetect", + "Unknown", + "GenuineIntel", + "AuthenticAMD" +}; + +const char * +gmx_cpuid_feature_string[GMX_CPUID_NFEATURES] = +{ + "CannotDetect", + "aes", + "apic", + "avx", + "avx2", + "clfsh", + "cmov", + "cx8", + "cx16", + "f16c", + "fma", + "fma4", + "htt", + "lahf_lm", + "misalignsse", + "mmx", + "msr", + "nonstop_tsc", + "pcid", + "pclmuldq", + "pdcm", + "pdpe1gb", + "popcnt", + "pse", + "rdrnd", + "rdtscp", + "sse2", + "sse3", + "sse4a", + "sse4.1", + "sse4.2", + "ssse3", + "tdt", + "x2apic", + "xop" +}; + +const char * +gmx_cpuid_acceleration_string[GMX_CPUID_NACCELERATIONS] = +{ + 
"CannotDetect", + "None", + "SSE2", + "SSE4.1", + "AVX_128_FMA", + "AVX_256" +}; + +/* Max length of brand string */ +#define GMX_CPUID_BRAND_MAXLEN 256 + + +/* Contents of the abstract datatype */ +struct gmx_cpuid +{ + enum gmx_cpuid_vendor vendor; + char brand[GMX_CPUID_BRAND_MAXLEN]; + int family; + int model; + int stepping; + /* Not using gmx_bool here, since this file must be possible to compile without simple.h */ + char feature[GMX_CPUID_NFEATURES]; +}; + + +/* Simple routines to access the data structure. The initialization routine is + * further down since that needs to call other static routines in this file. + */ +enum gmx_cpuid_vendor +gmx_cpuid_vendor (gmx_cpuid_t cpuid) +{ + return cpuid->vendor; +} + + +const char * +gmx_cpuid_brand (gmx_cpuid_t cpuid) +{ + return cpuid->brand; +} + +int +gmx_cpuid_family (gmx_cpuid_t cpuid) +{ + return cpuid->family; +} + +int +gmx_cpuid_model (gmx_cpuid_t cpuid) +{ + return cpuid->model; +} + +int +gmx_cpuid_stepping (gmx_cpuid_t cpuid) +{ + return cpuid->stepping; +} + +int +gmx_cpuid_feature (gmx_cpuid_t cpuid, + enum gmx_cpuid_feature feature) +{ + return (cpuid->feature[feature]!=0); +} + + + + +/* What type of acceleration was compiled in, if any? + * This is set from Cmake. Note that the SSE2 and SSE4_1 macros are set for + * AVX too, so it is important that they appear last in the list. + */ +#ifdef GMX_X86_AVX_256 +static const +enum gmx_cpuid_acceleration +compiled_acc = GMX_CPUID_ACCELERATION_X86_AVX_256; +#elif defined GMX_X86_AVX_128_FMA +static const +enum gmx_cpuid_acceleration +compiled_acc = GMX_CPUID_ACCELERATION_X86_AVX_128_FMA; +#elif defined GMX_X86_SSE4_1 +static const +enum gmx_cpuid_acceleration +compiled_acc = GMX_CPUID_ACCELERATION_X86_SSE4_1; +#elif defined GMX_X86_SSE2 +static const +enum gmx_cpuid_acceleration +compiled_acc = GMX_CPUID_ACCELERATION_X86_SSE2; +#else +static const +enum gmx_cpuid_acceleration +compiled_acc = GMX_CPUID_ACCELERATION_NONE; +#endif + + +/* Currently CPUID is only supported (1) if we can use an instruction on MSVC, or (2) + * if the compiler handles GNU-style inline assembly. + */ +#if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64) + +/* Execute CPUID on x86 class CPUs. level sets function to exec, and the + * contents of register output is returned. See Intel/AMD docs for details. + * + * This version supports extended information where we can also have an input + * value in the ecx register. This is ignored for most levels, but some of them + * (e.g. level 0xB on Intel) use it. + */ +static int +execute_x86cpuid(unsigned int level, + unsigned int ecxval, + unsigned int * eax, + unsigned int * ebx, + unsigned int * ecx, + unsigned int * edx) +{ + int rc = 0; + +#if (defined _MSC_VER) + int CPUInfo[4]; + +#if (_MSC_VER > 1500) || (_MSC_VER==1500 & _MSC_FULL_VER >= 150030729) + /* MSVC 9.0 SP1 or later */ + __cpuidex(CPUInfo,level,ecxval); + rc = 0; +#else + __cpuid(CPUInfo,level); + /* Set an error code if the user wanted a non-zero ecxval, since we did not have cpuidex */ + rc = (ecxval>0) ? -1 : 0; +#endif + *eax=CPUInfo[0]; + *ebx=CPUInfo[1]; + *ecx=CPUInfo[2]; + *edx=CPUInfo[3]; + +#elif (defined GMX_X86_GCC_INLINE_ASM) + /* for now this means GMX_X86_GCC_INLINE_ASM should be defined, + * but there might be more options added in the future. 
+ */ + *eax = level; + *ecx = ecxval; + *ebx = 0; + *edx = 0; +#if defined(__i386__) && defined(__PIC__) + /* Avoid clobbering the global offset table in 32-bit pic code (ebx register) */ + __asm__ __volatile__ ("xchgl %%ebx, %1 \n\t" + "cpuid \n\t" + "xchgl %%ebx, %1 \n\t" + : "+a"(*eax), "+r"(*ebx), "+c"(*ecx), "+d"(*edx)); +#else + /* i386 without PIC, or x86-64. Things are easy and we can clobber any reg we want :-) */ + __asm__ __volatile__ ("cpuid \n\t" + : "+a"(*eax), "+b"(*ebx), "+c"(*ecx), "+d"(*edx)); +#endif + rc = 0; +#else + /* Death and horror! + * Apparently this is an x86 platform where we don't know how to call cpuid. + * + * This is REALLY bad, since we will lose all Gromacs acceleration. + */ + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + + rc = -1; +#endif + return rc; +} +#endif /* architecture is x86 */ + + +/* Identify CPU features common to Intel & AMD - mainly brand string, + * version and some features. Vendor has already been detected outside this. + */ +static int +cpuid_check_common_x86(gmx_cpuid_t cpuid) +{ + int fn,max_stdfn,max_extfn; + unsigned int eax,ebx,ecx,edx; + char str[GMX_CPUID_BRAND_MAXLEN]; + char * p; + + /* Find largest standard/extended function input value */ + execute_x86cpuid(0x0,0,&eax,&ebx,&ecx,&edx); + max_stdfn = eax; + execute_x86cpuid(0x80000000,0,&eax,&ebx,&ecx,&edx); + max_extfn = eax; + + p = str; + if(max_extfn>=0x80000005) + { + /* Get CPU brand string */ + for(fn=0x80000002;fn<0x80000005;fn++) + { + execute_x86cpuid(fn,0,&eax,&ebx,&ecx,&edx); + memcpy(p,&eax,4); + memcpy(p+4,&ebx,4); + memcpy(p+8,&ecx,4); + memcpy(p+12,&edx,4); + p+=16; + } + *p='\0'; + + /* Remove empty initial space */ + p = str; + while(isspace(*(p))) + { + p++; + } + strncpy(cpuid->brand,p,GMX_CPUID_BRAND_MAXLEN); + } + else + { + strncpy(cpuid->brand,"Unknown CPU brand",GMX_CPUID_BRAND_MAXLEN); + } + + /* Find basic CPU properties */ + if(max_stdfn>=1) + { + execute_x86cpuid(0x1,0,&eax,&ebx,&ecx,&edx); + + cpuid->family = ((eax & 0x0FF00000) >> 20) + ((eax & 0x00000F00) >> 8); + /* Note that extended model should be shifted left 4, so only shift right 12 iso 16. 
*/ + cpuid->model = ((eax & 0x000F0000) >> 12) + ((eax & 0x000000F0) >> 4); + cpuid->stepping = (eax & 0x0000000F); + + /* Feature flags common to AMD and intel */ + cpuid->feature[GMX_CPUID_FEATURE_X86_SSE3] = (ecx & (1 << 0)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_PCLMULDQ] = (ecx & (1 << 1)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_SSSE3] = (ecx & (1 << 9)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_FMA] = (ecx & (1 << 12)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_CX16] = (ecx & (1 << 13)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_SSE4_1] = (ecx & (1 << 19)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_SSE4_2] = (ecx & (1 << 20)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_POPCNT] = (ecx & (1 << 23)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_AES] = (ecx & (1 << 25)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_AVX] = (ecx & (1 << 28)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_F16C] = (ecx & (1 << 29)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_RDRND] = (ecx & (1 << 30)) != 0; + + cpuid->feature[GMX_CPUID_FEATURE_X86_PSE] = (edx & (1 << 3)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_MSR] = (edx & (1 << 5)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_CX8] = (edx & (1 << 8)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_APIC] = (edx & (1 << 9)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_CMOV] = (edx & (1 << 15)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_CLFSH] = (edx & (1 << 19)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_MMX] = (edx & (1 << 23)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_SSE2] = (edx & (1 << 26)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_HTT] = (edx & (1 << 28)) != 0; + } + else + { + cpuid->family = -1; + cpuid->model = -1; + cpuid->stepping = -1; + } + + if(max_extfn>=0x80000001) + { + execute_x86cpuid(0x80000001,0,&eax,&ebx,&ecx,&edx); + cpuid->feature[GMX_CPUID_FEATURE_X86_LAHF_LM] = (ecx & (1 << 0)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_PDPE1GB] = (edx & (1 << 26)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_RDTSCP] = (edx & (1 << 27)) != 0; + } + + if(max_extfn>=0x80000007) + { + execute_x86cpuid(0x80000007,0,&eax,&ebx,&ecx,&edx); + cpuid->feature[GMX_CPUID_FEATURE_X86_NONSTOP_TSC] = (edx & (1 << 8)) != 0; + } + + return 0; +} + +/* Detection of AMD-specific CPU features */ +static int +cpuid_check_amd_x86(gmx_cpuid_t cpuid) +{ + int max_stdfn,max_extfn; + unsigned int eax,ebx,ecx,edx; + + cpuid_check_common_x86(cpuid); + + execute_x86cpuid(0x0,0,&eax,&ebx,&ecx,&edx); + max_stdfn = eax; + + execute_x86cpuid(0x80000000,0,&eax,&ebx,&ecx,&edx); + max_extfn = eax; + + if(max_extfn>=0x80000001) + { + execute_x86cpuid(0x80000001,0,&eax,&ebx,&ecx,&edx); + + cpuid->feature[GMX_CPUID_FEATURE_X86_SSE4A] = (ecx & (1 << 6)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_MISALIGNSSE] = (ecx & (1 << 7)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_XOP] = (ecx & (1 << 11)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_FMA4] = (ecx & (1 << 16)) != 0; + } + + return 0; +} + +/* Detection of Intel-specific CPU features */ +static int +cpuid_check_intel_x86(gmx_cpuid_t cpuid) +{ + unsigned int max_stdfn,max_extfn; + unsigned int eax,ebx,ecx,edx; + unsigned int i; + unsigned int max_logical_cores,max_physical_cores; + + cpuid_check_common_x86(cpuid); + + execute_x86cpuid(0x0,0,&eax,&ebx,&ecx,&edx); + max_stdfn = eax; + + execute_x86cpuid(0x80000000,0,&eax,&ebx,&ecx,&edx); + max_extfn = eax; + + if(max_stdfn>=1) + { + execute_x86cpuid(0x1,0,&eax,&ebx,&ecx,&edx); + cpuid->feature[GMX_CPUID_FEATURE_X86_PDCM] = (ecx & (1 << 15)) != 0; + 
cpuid->feature[GMX_CPUID_FEATURE_X86_PCID] = (ecx & (1 << 17)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_X2APIC] = (ecx & (1 << 21)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_TDT] = (ecx & (1 << 24)) != 0; + } + + if(max_stdfn>=7) + { + execute_x86cpuid(0x7,0,&eax,&ebx,&ecx,&edx); + cpuid->feature[GMX_CPUID_FEATURE_X86_AVX2] = (ebx & (1 << 5)) != 0; + } + + /* Check whether Hyper-Threading is enabled, not only supported */ + if(cpuid->feature[GMX_CPUID_FEATURE_X86_HTT] && max_stdfn>=4) + { + execute_x86cpuid(0x1,0,&eax,&ebx,&ecx,&edx); + max_logical_cores = (ebx >> 16) & 0x0FF; + execute_x86cpuid(0x4,0,&eax,&ebx,&ecx,&edx); + max_physical_cores = ((eax >> 26) & 0x3F) + 1; + + /* Clear HTT flag if we only have 1 logical core per physical */ + if(max_logical_cores/max_physical_cores < 2) + { + cpuid->feature[GMX_CPUID_FEATURE_X86_HTT] = 0; + } + } + return 0; +} + +/* Try to find the vendor of the current CPU, so we know what specific + * detection routine to call. + */ +static enum gmx_cpuid_vendor +cpuid_check_vendor(void) +{ + enum gmx_cpuid_vendor i,vendor; + /* Register data used on x86 */ + unsigned int eax,ebx,ecx,edx; + char vendorstring[13]; + + /* Set default first */ + vendor = GMX_CPUID_VENDOR_UNKNOWN; + + execute_x86cpuid(0x0,0,&eax,&ebx,&ecx,&edx); + + memcpy(vendorstring,&ebx,4); + memcpy(vendorstring+4,&edx,4); + memcpy(vendorstring+8,&ecx,4); + + vendorstring[12]='\0'; + + for(i=GMX_CPUID_VENDOR_UNKNOWN;ifeature[i]=0; + } + + cpuid->vendor = cpuid_check_vendor(); + + switch(cpuid->vendor) + { + case GMX_CPUID_VENDOR_INTEL: + cpuid_check_intel_x86(cpuid); + break; + case GMX_CPUID_VENDOR_AMD: + cpuid_check_amd_x86(cpuid); + break; + default: + /* Could not find vendor */ + strncpy(cpuid->brand,"Unknown CPU brand",GMX_CPUID_BRAND_MAXLEN); + cpuid->family = 0; + cpuid->model = 0; + cpuid->stepping = 0; + + for(i=0;ifeature[i]=0; + } + cpuid->feature[GMX_CPUID_FEATURE_CANNOTDETECT] = 1; + break; + } + + return 0; +} + + + +void +gmx_cpuid_done (gmx_cpuid_t cpuid) +{ + free(cpuid); +} + + +int +gmx_cpuid_formatstring (gmx_cpuid_t cpuid, + char * str, + int n) +{ + int c; + int i; + enum gmx_cpuid_feature feature; + +#ifdef _MSC_VER + _snprintf(str,n, + "Vendor: %s\n" + "Brand: %s\n" + "Family: %2d Model: %2d Stepping: %2d\n" + "Features:", + gmx_cpuid_vendor_string[gmx_cpuid_vendor(cpuid)], + gmx_cpuid_brand(cpuid), + gmx_cpuid_family(cpuid),gmx_cpuid_model(cpuid),gmx_cpuid_stepping(cpuid)); +#else + snprintf(str,n, + "Vendor: %s\n" + "Brand: %s\n" + "Family: %2d Model: %2d Stepping: %2d\n" + "Features:", + gmx_cpuid_vendor_string[gmx_cpuid_vendor(cpuid)], + gmx_cpuid_brand(cpuid), + gmx_cpuid_family(cpuid),gmx_cpuid_model(cpuid),gmx_cpuid_stepping(cpuid)); +#endif + + str[n-1] = '\0'; + c = strlen(str); + n -= c; + str += c; + + for(feature=GMX_CPUID_FEATURE_CANNOTDETECT;feature>core_shift_bits == apic_id[0] >> core_shift_bits); + } + + free(apic_id); + + if(smt_found==1) + { + return GMX_CPUID_X86_SMT_ENABLED; + } + else + { + return GMX_CPUID_X86_SMT_DISABLED; + } +#else + /* Do the trivial stuff first. If Hyper-Threading isn't even supported it + * cannot be enabled, no matter what OS detection we use! + */ + if(0==gmx_cpuid_feature(cpuid,GMX_CPUID_FEATURE_X86_HTT)) + { + return GMX_CPUID_X86_SMT_DISABLED; + } + else + { + return GMX_CPUID_X86_SMT_CANNOTDETECT; + } +#endif +} + + + + +#ifdef GMX_CPUID_STANDALONE +/* Stand-alone program to enable queries of CPU features from Cmake. 
+ * Note that you need to check inline ASM capabilities before compiling and set + * -DGMX_X86_GCC_INLINE_ASM for the cpuid instruction to work... + */ +int +main(int argc, char **argv) +{ + gmx_cpuid_t cpuid; + enum gmx_cpuid_acceleration acc; + int i,cnt; + + if(argc<2) + { + fprintf(stdout, + "Usage:\n\n%s [flags]\n\n" + "Available flags:\n" + "-vendor Print CPU vendor.\n" + "-brand Print CPU brand string.\n" + "-family Print CPU family version.\n" + "-model Print CPU model version.\n" + "-stepping Print CPU stepping version.\n" + "-features Print CPU feature flags.\n" + "-acceleration Print suggested GROMACS acceleration.\n" + ,argv[0]); + exit(0); + } + + gmx_cpuid_init(&cpuid); + + if(!strncmp(argv[1],"-vendor",3)) + { + printf("%s\n",gmx_cpuid_vendor_string[cpuid->vendor]); + } + else if(!strncmp(argv[1],"-brand",3)) + { + printf("%s\n",cpuid->brand); + } + else if(!strncmp(argv[1],"-family",3)) + { + printf("%d\n",cpuid->family); + } + else if(!strncmp(argv[1],"-model",3)) + { + printf("%d\n",cpuid->model); + } + else if(!strncmp(argv[1],"-stepping",3)) + { + printf("%d\n",cpuid->stepping); + } + else if(!strncmp(argv[1],"-features",3)) + { + cnt = 0; + for(i=0;ifeature[i]==1) + { + if(cnt++ > 0) + { + printf(" "); + } + printf("%s",gmx_cpuid_feature_string[i]); + } + } + printf("\n"); + } + else if(!strncmp(argv[1],"-acceleration",3)) + { + acc = gmx_cpuid_acceleration_suggest(cpuid); + fprintf(stdout,"%s\n",gmx_cpuid_acceleration_string[acc]); + } + + gmx_cpuid_done(cpuid); + + + return 0; +} + +#endif diff --git a/src/gmxlib/gmx_detect_hardware.c b/src/gmxlib/gmx_detect_hardware.c new file mode 100644 index 0000000000..6fe77d759e --- /dev/null +++ b/src/gmxlib/gmx_detect_hardware.c @@ -0,0 +1,594 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This file is part of GROMACS. + * Copyright (c) 2012- + * + * Written by the Gromacs development team under coordination of + * David van der Spoel, Berk Hess, and Erik Lindahl. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org + * + * And Hey: + * GROup of MAchos and Cynical Suckers + */ +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include + +#include "types/enums.h" +#include "types/hw_info.h" +#include "types/commrec.h" +#include "gmx_fatal.h" +#include "gmx_fatal_collective.h" +#include "smalloc.h" +#include "gpu_utils.h" +#include "statutil.h" +#include "gmx_detect_hardware.h" +#include "main.h" +#include "md_logging.h" + +#if ((defined(WIN32) || defined( _WIN32 ) || defined(WIN64) || defined( _WIN64 )) && !(defined (__CYGWIN__) || defined (__CYGWIN32__))) +#include "windows.h" +#endif + +/* Although we can't have more than 10 GPU different ID-s passed by the user as + * the id-s are assumed to be represented by single digits, as multiple + * processes can share a GPU, we can end up with more than 10 IDs. + * To account for potential extreme cases we'll set the limit to a pretty + * ridiculous number. */ +static unsigned int max_gpu_ids_user = 64; + +/* FW decl. 
*/ +void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count); + +static void sprint_gpus(char *sbuf, const gmx_gpu_info_t *gpu_info, gmx_bool bPrintAll) +{ + int i, ndev; + char stmp[STRLEN]; + + ndev = gpu_info->ncuda_dev; + + sbuf[0] = '\0'; + for (i = 0; i < ndev; i++) + { + get_gpu_device_info_string(stmp, gpu_info, i); + strcat(sbuf, " "); + strcat(sbuf, stmp); + if (i < ndev - 1) + { + strcat(sbuf, "\n"); + } + } +} + +static void print_gpu_detection_stats(FILE *fplog, + const gmx_gpu_info_t *gpu_info, + const t_commrec *cr) +{ + char onhost[266],stmp[STRLEN]; + int ngpu; + + ngpu = gpu_info->ncuda_dev; + +#if defined GMX_MPI && !defined GMX_THREAD_MPI + /* We only print the detection on one, of possibly multiple, nodes */ + strncpy(onhost," on host ",10); + gmx_gethostname(onhost+9,256); +#else + /* We detect all relevant GPUs */ + strncpy(onhost,"",1); +#endif + + if (ngpu > 0) + { + sprint_gpus(stmp, gpu_info, TRUE); + md_print_warn(cr, fplog, "%d GPU%s detected%s:\n%s\n", + ngpu, (ngpu > 1) ? "s" : "", onhost, stmp); + } + else + { + md_print_warn(cr, fplog, "No GPUs detected%s\n", onhost); + } +} + +static void print_gpu_use_stats(FILE *fplog, + const gmx_gpu_info_t *gpu_info, + const t_commrec *cr) +{ + char sbuf[STRLEN], stmp[STRLEN]; + int i, ngpu, ngpu_all; + + ngpu = gpu_info->ncuda_dev_use; + ngpu_all = gpu_info->ncuda_dev; + + /* Issue note if GPUs are available but not used */ + if (ngpu_all > 0 && ngpu < 1) + { + sprintf(sbuf, + "%d compatible GPU%s detected in the system, but none will be used.\n" + "Consider trying GPU acceleration with the Verlet scheme!", + ngpu_all, (ngpu_all > 1) ? "s" : ""); + } + else + { + sprintf(sbuf, "%d GPU%s %sselected to be used for this run: ", + ngpu, (ngpu > 1) ? "s" : "", + gpu_info->bUserSet ? "user-" : "auto-"); + for (i = 0; i < ngpu; i++) + { + sprintf(stmp, "#%d", get_gpu_device_id(gpu_info, i)); + if (i < ngpu - 1) + { + strcat(stmp, ", "); + } + strcat(sbuf, stmp); + } + } + md_print_info(cr, fplog, "%s\n\n", sbuf); +} + +/* Parse a "plain" GPU ID string which contains a sequence of digits corresponding + * to GPU IDs; the order will indicate the process/tMPI thread - GPU assignment. */ +static void parse_gpu_id_plain_string(const char *idstr, int *nid, int *idlist) +{ + int i; + size_t len_idstr; + + len_idstr = strlen(idstr); + + if (len_idstr > max_gpu_ids_user) + { + gmx_fatal(FARGS,"%d GPU IDs provided, but only at most %d are supported", + len_idstr, max_gpu_ids_user); + } + + *nid = len_idstr; + + for (i = 0; i < *nid; i++) + { + if (idstr[i] < '0' || idstr[i] > '9') + { + gmx_fatal(FARGS, "Invalid character in GPU ID string: '%c'\n", idstr[i]); + } + idlist[i] = idstr[i] - '0'; + } +} + +static void parse_gpu_id_csv_string(const char *idstr, int *nid, int *idlist) +{ + /* XXX implement cvs format to support more than 10 different GPUs in a box. 
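+ * One possible sketch (hypothetical, not part of this commit): copy the
+ * string, split it on commas with strtok() and convert each token with
+ * strtol(); bounds and error checks are omitted for brevity:
+ * \code
+ *   char tmp[STRLEN], *tok;
+ *   strncpy(tmp, idstr, STRLEN);
+ *   *nid = 0;
+ *   for (tok = strtok(tmp, ","); tok != NULL; tok = strtok(NULL, ","))
+ *   {
+ *       idlist[(*nid)++] = (int)strtol(tok, NULL, 10);
+ *   }
+ * \endcode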
*/
+    gmx_incons("Not implemented yet");
+}
+
+void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
+                                      const t_commrec *cr, int ntmpi_requested,
+                                      gmx_bool bUseGPU)
+{
+    int npppn, ntmpi_pp, ngpu;
+    char sbuf[STRLEN], th_or_proc[STRLEN], th_or_proc_plural[STRLEN], pernode[STRLEN];
+    char gpu_plural[2];
+    gmx_bool bGPUBin, btMPI, bMPI, bMaxMpiThreadsSet, bNthreadsAuto, bEmulateGPU;
+
+    assert(hwinfo);
+    assert(cr);
+
+    btMPI = bMPI = FALSE;
+    bNthreadsAuto = FALSE;
+#if defined(GMX_THREAD_MPI)
+    btMPI = TRUE;
+    bNthreadsAuto = (ntmpi_requested < 1);
+#elif defined(GMX_LIB_MPI)
+    bMPI = TRUE;
+#endif
+
+#ifdef GMX_GPU
+    bGPUBin = TRUE;
+#else
+    bGPUBin = FALSE;
+#endif
+
+    /* GPU emulation detection is done later, but we need it here as well
+     * -- uncool, but there's no elegant workaround */
+    bEmulateGPU = (getenv("GMX_EMULATE_GPU") != NULL);
+    bMaxMpiThreadsSet = (getenv("GMX_MAX_MPI_THREADS") != NULL);
+
+    if (SIMMASTER(cr))
+    {
+        /* check the acceleration mdrun is compiled with against hardware capabilities */
+        /* TODO: Here we assume homogeneous hardware which is not necessarily the case!
+         * Might not hurt to add an extra check over MPI. */
+        gmx_cpuid_acceleration_check(hwinfo->cpuid_info, fplog);
+    }
+
+    /* Below we only do consistency checks for PP and GPUs,
+     * this is irrelevant for PME only nodes, so in that case we return here.
+     */
+    if (!(cr->duty & DUTY_PP))
+    {
+        return;
+    }
+
+    /* Need to ensure that we have enough GPUs:
+     * - need one GPU per PP node
+     * - no GPU oversubscription with tMPI
+     * => keep GPU support on, otherwise turn it off (or bail if forced)
+     * */
+    /* number of PP processes per node */
+    npppn = cr->nnodes_pp_intra;
+
+    pernode[0] = '\0';
+    th_or_proc_plural[0] = '\0';
+    if (btMPI)
+    {
+        sprintf(th_or_proc, "thread-MPI thread");
+        if (npppn > 1)
+        {
+            sprintf(th_or_proc_plural, "s");
+        }
+    }
+    else if (bMPI)
+    {
+        sprintf(th_or_proc, "MPI process");
+        if (npppn > 1)
+        {
+            sprintf(th_or_proc_plural, "es");
+        }
+        sprintf(pernode, " per node");
+    }
+    else
+    {
+        /* neither MPI nor tMPI */
+        sprintf(th_or_proc, "process");
+    }
+
+    if (bGPUBin)
+    {
+        print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
+    }
+
+    if (bUseGPU && hwinfo->bCanUseGPU && !bEmulateGPU)
+    {
+        ngpu = hwinfo->gpu_info.ncuda_dev_use;
+        sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
+
+        /* number of tMPI threads auto-adjusted */
+        if (btMPI && bNthreadsAuto && SIMMASTER(cr))
+        {
+            if (npppn < ngpu)
+            {
+                if (hwinfo->gpu_info.bUserSet)
+                {
+                    /* The user manually provided more GPUs than threads we could
+                     * automatically start. */
+                    gmx_fatal(FARGS,
+                              "%d GPU%s provided, but only %d PP thread-MPI thread%s could be started.\n"
+                              "%s requires one PP thread-MPI thread per GPU; use fewer GPUs%s.",
+                              ngpu, gpu_plural, npppn, th_or_proc_plural,
+                              ShortProgram(), bMaxMpiThreadsSet ? "\nor allow more threads to be used" : "");
+                }
+                else
+                {
+                    /* There are more GPUs than tMPI threads; we have to limit the number of GPUs used. */
+                    md_print_warn(cr,fplog,
+                                  "NOTE: %d GPU%s were detected, but only %d PP thread-MPI thread%s can be started.\n"
+                                  "      %s can use one GPU per PP thread-MPI thread, so only %d GPU%s will be used.%s\n",
+                                  ngpu, gpu_plural, npppn, th_or_proc_plural,
+                                  ShortProgram(), npppn, npppn > 1 ? "s" : "",
+                                  bMaxMpiThreadsSet ? "\n      Also, you can allow more threads to be used by increasing GMX_MAX_MPI_THREADS" : "");
+
+                    if (cr->nodeid_intra == 0)
+                    {
+                        limit_num_gpus_used(hwinfo, npppn);
+                        ngpu = hwinfo->gpu_info.ncuda_dev_use;
+                        sprintf(gpu_plural, "%s", (ngpu > 1) ?
"s" : ""); + } + } + } + } + + if (ngpu != npppn) + { + if (hwinfo->gpu_info.bUserSet) + { + gmx_fatal(FARGS, + "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n" + "%s was started with %d PP %s%s%s, but you provided %d GPU%s.", + th_or_proc, btMPI ? "s" : "es" , pernode, + ShortProgram(), npppn, th_or_proc, th_or_proc_plural, pernode, ngpu, gpu_plural); + } + else + { + if (ngpu > npppn) + { + md_print_warn(cr,fplog, + "NOTE: potentially sub-optimal launch configuration, %s started with less\n" + " PP %s%s%s than GPU%s available.\n" + " Each PP %s can only use one GPU, so only %d GPU%s%s will be used.", + ShortProgram(), + th_or_proc, th_or_proc_plural, pernode, gpu_plural, + th_or_proc, npppn, gpu_plural, pernode); + + if (bMPI || (btMPI && cr->nodeid_intra == 0)) + { + limit_num_gpus_used(hwinfo, npppn); + ngpu = hwinfo->gpu_info.ncuda_dev_use; + sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : ""); + } + } + else + { + /* Avoid duplicate error messages. + * Unfortunately we can only do this at the physical node + * level, since the hardware setup and MPI process count + * might be differ over physical nodes. + */ + if (cr->nodeid_intra == 0) + { + gmx_fatal(FARGS, + "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n" + "%s was started with %d PP %s%s%s, but only %d GPU%s were detected.", + th_or_proc, btMPI ? "s" : "es" , pernode, + ShortProgram(), npppn, th_or_proc, th_or_proc_plural, pernode, ngpu, gpu_plural); + } +#ifdef GMX_MPI + else + { + /* Avoid other ranks to continue after inconsistency */ + MPI_Barrier(cr->mpi_comm_mygroup); + } +#endif + } + } + } + + if (hwinfo->gpu_info.bUserSet && (cr->nodeid_intra == 0)) + { + int i, j, same_count; + gmx_bool bSomeSame, bAllDifferent; + + same_count = 0; + bSomeSame = FALSE; + bAllDifferent = TRUE; + + for (i = 0; i < ngpu - 1; i++) + { + for (j = i + 1; j < ngpu; j++) + { + bSomeSame |= hwinfo->gpu_info.cuda_dev_use[i] == hwinfo->gpu_info.cuda_dev_use[j]; + bAllDifferent &= hwinfo->gpu_info.cuda_dev_use[i] != hwinfo->gpu_info.cuda_dev_use[j]; + same_count += hwinfo->gpu_info.cuda_dev_use[i] == hwinfo->gpu_info.cuda_dev_use[j]; + } + } + + if (btMPI && !bAllDifferent) + { + gmx_fatal(FARGS, + "Invalid GPU assignment: can't share a GPU among multiple thread-MPI threads.\n" + "Use MPI if you are sure that you want to assign GPU to multiple threads."); + } + + if (bSomeSame) + { + md_print_warn(cr,fplog, + "NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n" + " multiple %s%s; this should be avoided as it generally\n" + " causes performance loss.", + same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es"); + } + } + print_gpu_use_stats(fplog, &hwinfo->gpu_info, cr); + } +} + +/* Return the number of hardware threads supported by the current CPU. + * We assume that this is equal with the number of CPUs reported to be + * online by the OS at the time of the call. + */ +static int get_nthreads_hw_avail(FILE *fplog, const t_commrec *cr) +{ + int ret = 0; + +#if ((defined(WIN32) || defined( _WIN32 ) || defined(WIN64) || defined( _WIN64 )) && !(defined (__CYGWIN__) || defined (__CYGWIN32__))) + /* Windows */ + SYSTEM_INFO sysinfo; + GetSystemInfo( &sysinfo ); + ret = sysinfo.dwNumberOfProcessors; +#elif defined HAVE_SYSCONF + /* We are probably on Unix. 
+ * Now check if we have the argument to use before executing the call
+ */
+#if defined(_SC_NPROCESSORS_ONLN)
+    ret = sysconf(_SC_NPROCESSORS_ONLN);
+#elif defined(_SC_NPROC_ONLN)
+    ret = sysconf(_SC_NPROC_ONLN);
+#elif defined(_SC_NPROCESSORS_CONF)
+    ret = sysconf(_SC_NPROCESSORS_CONF);
+#elif defined(_SC_NPROC_CONF)
+    ret = sysconf(_SC_NPROC_CONF);
+#endif /* End of check for sysconf argument values */
+
+#else
+    /* Neither Windows nor Unix. No fscking idea how many CPUs we have! */
+    ret = -1;
+#endif
+
+    if (debug)
+    {
+        fprintf(debug, "Detected %d processors, will use this as the number "
+                "of supported hardware threads.\n", ret);
+    }
+
+#ifdef GMX_OPENMP
+    if (ret != gmx_omp_get_num_procs())
+    {
+        md_print_warn(cr, fplog,
+                      "Number of CPUs detected (%d) does not match the number reported by OpenMP (%d).\n"
+                      "Consider setting the launch configuration manually!",
+                      ret, gmx_omp_get_num_procs());
+    }
+#endif
+
+    return ret;
+}
+
+void gmx_detect_hardware(FILE *fplog, gmx_hw_info_t *hwinfo,
+                         const t_commrec *cr,
+                         gmx_bool bForceUseGPU, gmx_bool bTryUseGPU,
+                         const char *gpu_id)
+{
+    int i;
+    const char *env;
+    char sbuf[STRLEN], stmp[STRLEN];
+    gmx_hw_info_t *hw;
+    gmx_gpu_info_t gpuinfo_auto, gpuinfo_user;
+    gmx_bool bGPUBin;
+
+    assert(hwinfo);
+
+    /* detect CPUID info; no fuss, we don't detect system-wide
+     * -- sloppy, but that's it for now */
+    if (gmx_cpuid_init(&hwinfo->cpuid_info) != 0)
+    {
+        gmx_fatal_collective(FARGS, cr, NULL, "CPUID detection failed!");
+    }
+
+    /* detect number of hardware threads */
+    hwinfo->nthreads_hw_avail = get_nthreads_hw_avail(fplog, cr);
+
+    /* detect GPUs */
+    hwinfo->gpu_info.ncuda_dev_use = 0;
+    hwinfo->gpu_info.cuda_dev_use  = NULL;
+    hwinfo->gpu_info.ncuda_dev     = 0;
+    hwinfo->gpu_info.cuda_dev      = NULL;
+
+#ifdef GMX_GPU
+    bGPUBin = TRUE;
+#else
+    bGPUBin = FALSE;
+#endif
+
+    /* Bail if binary is not compiled with GPU on */
+    if (bForceUseGPU && !bGPUBin)
+    {
+        gmx_fatal_collective(FARGS, cr, NULL, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
+    }
+
+    /* run the detection if the binary was compiled with GPU support */
+    if (bGPUBin && getenv("GMX_DISABLE_GPU_DETECTION")==NULL)
+    {
+        detect_cuda_gpus(&hwinfo->gpu_info);
+    }
+
+    if (bForceUseGPU || bTryUseGPU)
+    {
+        env = getenv("GMX_GPU_ID");
+        if (env != NULL && gpu_id != NULL)
+        {
+            gmx_fatal(FARGS,"GMX_GPU_ID and -gpu_id cannot be used at the same time");
+        }
+        if (env == NULL)
+        {
+            env = gpu_id;
+        }
+
+        /* parse GPU IDs if the user passed any */
+        if (env != NULL)
+        {
+            int *gpuid, *checkres;
+            int nid, res;
+
+            snew(gpuid, max_gpu_ids_user);
+            snew(checkres, max_gpu_ids_user);
+
+            parse_gpu_id_plain_string(env, &nid, gpuid);
+
+            if (nid == 0)
+            {
+                gmx_fatal(FARGS, "Empty GPU ID string passed\n");
+            }
+
+            res = check_select_cuda_gpus(checkres, &hwinfo->gpu_info, gpuid, nid);
+
+            if (!res)
+            {
+                print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
+
+                sprintf(sbuf, "Some of the requested GPUs do not exist, behave strangely, or are not compatible:\n");
+                for (i = 0; i < nid; i++)
+                {
+                    if (checkres[i] != egpuCompatible)
+                    {
+                        sprintf(stmp, "    GPU #%d: %s\n",
+                                gpuid[i], gpu_detect_res_str[checkres[i]]);
+                        strcat(sbuf, stmp);
+                    }
+                }
+                gmx_fatal(FARGS, "%s", sbuf);
+            }
+
+            hwinfo->gpu_info.bUserSet = TRUE;
+
+            sfree(gpuid);
+            sfree(checkres);
+        }
+        else
+        {
+            pick_compatible_gpus(&hwinfo->gpu_info);
+            hwinfo->gpu_info.bUserSet = FALSE;
+        }
+
+        /* decide whether we can use GPU */
+        hwinfo->bCanUseGPU = (hwinfo->gpu_info.ncuda_dev_use > 0);
+        if
(!hwinfo->bCanUseGPU && bForceUseGPU) + { + gmx_fatal(FARGS, "GPU acceleration requested, but no compatible GPUs were detected."); + } + } +} + +void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count) +{ + int ndev_use; + + assert(hwinfo); + + ndev_use = hwinfo->gpu_info.ncuda_dev_use; + + if (count > ndev_use) + { + /* won't increase the # of GPUs */ + return; + } + + if (count < 1) + { + char sbuf[STRLEN]; + sprintf(sbuf, "Limiting the number of GPUs to <1 doesn't make sense (detected %d, %d requested)!", + ndev_use, count); + gmx_incons(sbuf); + } + + /* TODO: improve this implementation: either sort GPUs or remove the weakest here */ + hwinfo->gpu_info.ncuda_dev_use = count; +} + +void gmx_hardware_info_free(gmx_hw_info_t *hwinfo) +{ + if (hwinfo) + { + gmx_cpuid_done(hwinfo->cpuid_info); + free_gpu_info(&hwinfo->gpu_info); + sfree(hwinfo); + } +} diff --git a/src/gmxlib/gmx_detectcpu.c b/src/gmxlib/gmx_detectcpu.c deleted file mode 100644 index d58d34bd29..0000000000 --- a/src/gmxlib/gmx_detectcpu.c +++ /dev/null @@ -1,615 +0,0 @@ -/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- - * - * - * This file is part of GROMACS. - * Copyright (c) 2012- - * - * Written by the Gromacs development team under coordination of - * David van der Spoel, Berk Hess, and Erik Lindahl. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * To help us fund GROMACS development, we humbly ask that you cite - * the research papers on the package. Check out http://www.gromacs.org - * - * And Hey: - * Gnomes, ROck Monsters And Chili Sauce - */ -#ifdef HAVE_CONFIG_H -#include -#endif - -#include -#include -#include -#include -#ifdef _MSC_VER -/* MSVC definition for __cpuid() */ -#include -#endif - - -#include "gmx_detectcpu.h" - - -const char * -gmx_detectcpu_vendorid_string[GMX_DETECTCPU_NVENDORS] = -{ - "Unknown", - "GenuineIntel", - "AuthenticAMD" -}; - -const char * -gmx_detectcpu_feature_string[GMX_DETECTCPU_NFEATURES] = -{ - "CannotDetect", - "htt", - "sse2", - "sse4.1", - "rdrand", - "aes", - "avx", - "fma", - "fma4", - "xop", - "avx2", - "rdtscp" -}; - -const char * -gmx_detectcpu_acceleration_string[GMX_DETECTCPU_NACCELERATIONS] = -{ - "None", - "SSE2", - "SSE4.1", - "AVX_128_FMA", - "AVX_256" -}; - - - - - -/* What type of acceleration was compiled in, if any? - * This is set from Cmake. Note that the SSE2 and SSE4_1 macros are set for - * AVX too, so it is important that they appear last in the list. - */ -#ifdef GMX_X86_AVX_256 -static const -gmx_detectcpu_acceleration_t -compiled_acc = GMX_DETECTCPU_ACCELERATION_X86_AVX_256; -#elif defined GMX_X86_AVX_128_FMA -static const -gmx_detectcpu_acceleration_t -compiled_acc = GMX_DETECTCPU_ACCELERATION_X86_AVX_128_FMA; -#elif defined GMX_X86_SSE4_1 -static const -gmx_detectcpu_acceleration_t -compiled_acc = GMX_DETECTCPU_ACCELERATION_X86_SSE4_1; -#elif defined GMX_X86_SSE2 -static const -gmx_detectcpu_acceleration_t -compiled_acc = GMX_DETECTCPU_ACCELERATION_X86_SSE2; -#else -static const -gmx_detectcpu_acceleration_t -compiled_acc = GMX_DETECTCPU_ACCELERATION_NONE; -#endif - -/* Execute CPUID on x86 class CPUs. level sets function to exec, and the - * contents of register output is returned. See Intel/AMD docs for details. 
- */ -#if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64) -/* Currently CPUID is only supported (1) if we can use an instruction on MSVC, or (2) - * if the compiler handles GNU-style inline assembly. - */ -#if (defined GMX_X86_GCC_INLINE_ASM || defined _MSC_VER) -#define GMX_X86_HAVE_CPUID -static int -execute_cpuid_x86(unsigned int level, - unsigned int * eax, - unsigned int * ebx, - unsigned int * ecx, - unsigned int * edx) -{ - unsigned int _eax,_ebx,_ecx,_edx; - int rc; - -#ifdef _MSC_VER - int CPUInfo[4]; - - /* MSVC */ - __cpuid(CPUInfo,level); - - _eax=CPUInfo[0]; - _ebx=CPUInfo[1]; - _ecx=CPUInfo[2]; - _edx=CPUInfo[3]; - - rc = 0; - -#else - /* for now this means GMX_X86_GCC_INLINE_ASM should be defined, - * but there might be more options added in the future. - */ - /* tested on 32 & 64 GCC, and Intel icc. */ -#if defined (__x86_64__) || defined (_M_X64) - __asm__("push %%rbx \n\t" - "cpuid \n\t" - "movl %%ebx, %1 \n\t" - "pop %%rbx \n\t" - : "=a"(_eax), "=r"(_ebx), "=c"(_ecx), "=d"(_edx) : "0"(level)); -#else - __asm__("push %%ebx \n\t" - "cpuid \n\t" - "movl %%ebx, %1 \n\t" - "pop %%ebx \n\t" - : "=a"(_eax), "=r"(_ebx), "=c"(_ecx), "=d"(_edx) : "0"(level)); -#endif - - rc = 0; -#endif - /* If you end up having a compiler that really doesn't understand this and - * you can't fix it, create a separate ifdef and set the results to: - * - * _eax=_ebx=_ecx=_edx=0; - * rc = -1; - * - * However, this will lose you ALL Gromacs x86 acceleration, so you want to - * try really hard before giving up! - */ - - *eax = _eax; - *ebx = _ebx; - *ecx = _ecx; - *edx = _edx; - - return rc; -} -#endif /* GMX_X86_GCC_INLINE_ASM or _MSC_VER */ -#endif /* architecture is x86 */ - - -/* Identify CPU features common to Intel & AMD - mainly brand string, - * version and some features. Vendor has already been detected outside this. - */ -static int -detectcpu_common_x86(gmx_detectcpu_t * data) -{ - int fn,max_stdfn,max_extfn; - unsigned int eax,ebx,ecx,edx; - char str[GMX_DETECTCPU_STRLEN]; - char * p; - -#ifdef GMX_X86_HAVE_CPUID - /* Find largest standard/extended function input value */ - execute_cpuid_x86(0x0,&eax,&ebx,&ecx,&edx); - max_stdfn = eax; - execute_cpuid_x86(0x80000000,&eax,&ebx,&ecx,&edx); - max_extfn = eax; - - p = str; - if(max_extfn>=0x80000005) - { - /* Get CPU brand string */ - for(fn=0x80000002;fn<0x80000005;fn++) - { - execute_cpuid_x86(fn,&eax,&ebx,&ecx,&edx); - memcpy(p,&eax,4); - memcpy(p+4,&ebx,4); - memcpy(p+8,&ecx,4); - memcpy(p+12,&edx,4); - p+=16; - } - *p='\0'; - - /* Remove empty initial space */ - p = str; - while(isspace(*(p))) - { - p++; - } - } - else - { - *p='\0'; - } - strncpy(data->brand,p,GMX_DETECTCPU_STRLEN); - - /* Find basic CPU properties */ - if(max_stdfn>=1) - { - execute_cpuid_x86(1,&eax,&ebx,&ecx,&edx); - - data->family = ((eax & 0x0FF00000) >> 20) + ((eax & 0x00000F00) >> 8); - /* Note that extended model should be shifted left 4, so only shift right 12 iso 16. 
*/ - data->model = ((eax & 0x000F0000) >> 12) + ((eax & 0x000000F0) >> 4); - data->stepping = (eax & 0x0000000F); - - /* Feature flags common to AMD and intel */ - data->feature[GMX_DETECTCPU_FEATURE_X86_FMA] = (ecx & (1 << 12)) != 0; - data->feature[GMX_DETECTCPU_FEATURE_X86_SSE4_1] = (ecx & (1 << 19)) != 0; - data->feature[GMX_DETECTCPU_FEATURE_X86_AES] = (ecx & (1 << 25)) != 0; - data->feature[GMX_DETECTCPU_FEATURE_X86_AVX] = (ecx & (1 << 28)) != 0; - data->feature[GMX_DETECTCPU_FEATURE_X86_RDRAND] = (ecx & (1 << 30)) != 0; - - data->feature[GMX_DETECTCPU_FEATURE_X86_SSE2] = (edx & (1 << 26)) != 0; - data->feature[GMX_DETECTCPU_FEATURE_X86_HTT] = (edx & (1 << 28)) != 0; - } - - if(max_extfn>=0x80000001) - { - execute_cpuid_x86(0x80000001,&eax,&ebx,&ecx,&edx); - data->feature[GMX_DETECTCPU_FEATURE_X86_RDTSCP] = (edx & (1 << 27)) != 0; - } - -#else - /* No CPUID present */ - strncpy(data->brand,"Unknown CPU brand",GMX_DETECTCPU_STRLEN); - data->family = 0; - data->model = 0; - data->stepping = 0; -#endif - - return 0; -} - -/* Detection of AMD-specific CPU features */ -static int -detectcpu_amd(gmx_detectcpu_t * data) -{ - int max_stdfn,max_extfn; - unsigned int eax,ebx,ecx,edx; - - detectcpu_common_x86(data); - -#ifdef GMX_X86_HAVE_CPUID - execute_cpuid_x86(0x0,&eax,&ebx,&ecx,&edx); - max_stdfn = eax; - - execute_cpuid_x86(0x80000000,&eax,&ebx,&ecx,&edx); - max_extfn = eax; - - if(max_extfn>=0x80000001) - { - execute_cpuid_x86(0x80000001,&eax,&ebx,&ecx,&edx); - - data->feature[GMX_DETECTCPU_FEATURE_X86_XOP] = (ecx & (1 << 11)) != 0; - data->feature[GMX_DETECTCPU_FEATURE_X86_FMA4] = (ecx & (1 << 16)) != 0; - } -#endif - - return 0; -} - -/* Detection of Intel-specific CPU features */ -static int -detectcpu_intel(gmx_detectcpu_t * data) -{ - int max_stdfn; - unsigned int eax,ebx,ecx,edx; - - detectcpu_common_x86(data); - -#ifdef GMX_X86_HAVE_CPUID - execute_cpuid_x86(0x0,&eax,&ebx,&ecx,&edx); - max_stdfn = eax; - - if(max_stdfn>=7) - { - execute_cpuid_x86(0x7,&eax,&ebx,&ecx,&edx); - data->feature[GMX_DETECTCPU_FEATURE_X86_AVX2] = (ebx & (1 << 5)) != 0; - } - -#endif - - return 0; -} - -/* Try to find the vendor of the current CPU, so we know what specific - * detection routine to call. 
- */ -static gmx_detectcpu_vendorid_t -detectcpu_vendor(void) -{ - gmx_detectcpu_vendorid_t i,vendor; - /* Register data used on x86 */ - unsigned int eax,ebx,ecx,edx; - char vendorstring[13]; - - /* Set default first */ - vendor = GMX_DETECTCPU_VENDOR_UNKNOWN; - -#ifdef GMX_X86_HAVE_CPUID - execute_cpuid_x86(0,&eax,&ebx,&ecx,&edx); - - memcpy(vendorstring,&ebx,4); - memcpy(vendorstring+4,&edx,4); - memcpy(vendorstring+8,&ecx,4); - - vendorstring[12]='\0'; - - for(i=GMX_DETECTCPU_VENDOR_UNKNOWN;ifeature[i]=0; - } - - data->vendorid = detectcpu_vendor(); - - switch(data->vendorid) - { - case GMX_DETECTCPU_VENDOR_INTEL: - detectcpu_intel(data); - break; - case GMX_DETECTCPU_VENDOR_AMD: - detectcpu_amd(data); - break; - default: - /* Could not find vendor */ - strncpy(data->brand,"Unknown CPU brand",GMX_DETECTCPU_STRLEN); - data->family = 0; - data->model = 0; - data->stepping = 0; - - for(i=0;ifeature[i]=0; - } - data->feature[GMX_DETECTCPU_FEATURE_CANNOTDETECT] = 1; - break; - } - - return 0; -} - - - - -int -gmx_detectcpu_formatstring (gmx_detectcpu_t data, - char * str, - int n) -{ - int c; - int i; - -#ifdef _MSC_VER - _snprintf(str,n, - "Vendor: %s\n" - "Brand: %s\n" - "Family: %2d Model: %2d Stepping: %2d\n" - "Features:", - gmx_detectcpu_vendorid_string[data.vendorid], - data.brand, - data.family,data.model,data.stepping); -#else - snprintf(str,n, - "Vendor: %s\n" - "Brand: %s\n" - "Family: %2d Model: %2d Stepping: %2d\n" - "Features:", - gmx_detectcpu_vendorid_string[data.vendorid], - data.brand, - data.family,data.model,data.stepping); -#endif - - str[n-1] = '\0'; - c = strlen(str); - n -= c; - str += c; - - for(i=0;i 0) - { - printf(" "); - } - printf("%s",gmx_detectcpu_feature_string[i]); - } - } - printf("\n"); - } - else if(!strncmp(argv[1],"-acceleration",3)) - { - gmx_detectcpu_suggest_acceleration(data,&acc); - fprintf(stdout,"%s\n",gmx_detectcpu_acceleration_string[acc]); - } - - return 0; -} - -#endif diff --git a/src/gmxlib/gmx_fatal.c b/src/gmxlib/gmx_fatal.c index 510ff1aab7..f4131d6d15 100644 --- a/src/gmxlib/gmx_fatal.c +++ b/src/gmxlib/gmx_fatal.c @@ -450,7 +450,7 @@ void gmx_fatal(int f_errno,const char *file,int line,const char *fmt,...) } void gmx_fatal_collective(int f_errno,const char *file,int line, - t_commrec *cr,gmx_domdec_t *dd, + const t_commrec *cr,gmx_domdec_t *dd, const char *fmt,...) { gmx_bool bFinalize; diff --git a/src/gmxlib/gmx_omp.c b/src/gmxlib/gmx_omp.c index a5040dce0a..e5c1f4540b 100644 --- a/src/gmxlib/gmx_omp.c +++ b/src/gmxlib/gmx_omp.c @@ -41,6 +41,14 @@ int gmx_omp_get_max_threads(void) #endif } +int gmx_omp_get_num_procs(void) +{ +#ifdef GMX_OPENMP + return omp_get_num_procs(); +#else + return 1; +#endif +} int gmx_omp_get_thread_num(void) { diff --git a/src/gmxlib/gmx_omp_nthreads.c b/src/gmxlib/gmx_omp_nthreads.c new file mode 100644 index 0000000000..9aa167c845 --- /dev/null +++ b/src/gmxlib/gmx_omp_nthreads.c @@ -0,0 +1,445 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2010, The GROMACS development team, + * check out http://www.gromacs.org for more information. 
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "gmx_fatal.h"
+#include "typedefs.h"
+#include "macros.h"
+#include "network.h"
+#include "statutil.h"
+#include "gmx_omp.h"
+#include "gmx_omp_nthreads.h"
+#include "md_logging.h"
+
+/*! Structure with the number of threads for each OpenMP multi-threaded
+ *  algorithmic module in mdrun. */
+typedef struct
+{
+    int gnth;             /*! Global num. of threads per PP or PP+PME process/tMPI thread. */
+    int gnth_pme;         /*! Global num. of threads per PME only process/tMPI thread. */
+
+    int nth[emntNR];      /*! Number of threads for each module, indexed with module_nth_t */
+    gmx_bool initialized; /*! TRUE if the module has been initialized. */
+} omp_module_nthreads_t;
+
+/*! Names of environment variables to set the per module number of threads.
+ *
+ *  Indexed with the values of module_nth_t.
+ * */
+static const char *modth_env_var[emntNR] =
+{
+    "GMX_DEFAULT_NUM_THREADS should never be set",
+    "GMX_DOMDEC_NUM_THREADS", "GMX_PAIRSEARCH_NUM_THREADS",
+    "GMX_NONBONDED_NUM_THREADS", "GMX_BONDED_NUM_THREADS",
+    "GMX_PME_NUM_THREADS", "GMX_UPDATE_NUM_THREADS",
+    "GMX_LINCS_NUM_THREADS", "GMX_SETTLE_NUM_THREADS"
+};
+
+/*! Names of the modules. */
+static const char *mod_name[emntNR] =
+{
+    "default", "domain decomposition", "pair search", "non-bonded",
+    "bonded", "PME", "update", "LINCS", "SETTLE"
+};
+
+/*! Number of threads for each algorithmic module.
+ *
+ *  File-scope global variable that gets set once in \init_module_nthreads
+ *  and queried via gmx_omp_nthreads_get.
+ *
+ *  All fields are initialized to 0 which should result in errors if
+ *  the init call is omitted.
+ * */
+static omp_module_nthreads_t modth = { 0, 0, {0, 0, 0, 0, 0, 0, 0, 0}, FALSE};
+
+
+/*! Determine the number of threads for module \mod.
+ *
+ *  \mod takes values from the module_nth_t enum and maps these to the
+ *  corresponding value in modth_env_var.
+ *
+ *  Each number of threads per module takes the default value unless the
+ *  GMX_*_NUM_THREADS env var is set, in which case its value overrides
+ *  the default.
+ *
+ *  The "group" scheme supports OpenMP only in PME and in this case all but
+ *  the PME nthread values default to 1.
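+ *
+ * For example (hypothetical values, set before threading is initialized):
+ * \code
+ *   setenv("OMP_NUM_THREADS", "8", 1);        // default for all modules
+ *   setenv("GMX_BONDED_NUM_THREADS", "4", 1); // override for bonded only
+ * \endcode
+ * With these settings the bonded module runs 4 threads per process while
+ * all other modules use 8.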
*/
+static int pick_module_nthreads(FILE *fplog, int m,
+                                gmx_bool bSimMaster,
+                                gmx_bool bFullOmpSupport,
+                                gmx_bool bSepPME)
+{
+    char *env;
+    int nth;
+    char sbuf[STRLEN];
+    gmx_bool bOMP;
+
+#ifdef GMX_OPENMP
+    bOMP = TRUE;
+#else
+    bOMP = FALSE;
+#endif /* GMX_OPENMP */
+
+    /* The default should never be set through a GMX_*_NUM_THREADS env var
+     * as it's always equal to gnth. */
+    if (m == emntDefault)
+    {
+        return modth.nth[emntDefault];
+    }
+
+    /* check the environment variable */
+    if ((env = getenv(modth_env_var[m])) != NULL)
+    {
+        sscanf(env, "%d", &nth);
+
+        if (!bOMP)
+        {
+            gmx_warning("%s=%d is set, but %s is compiled without OpenMP!",
+                        modth_env_var[m], nth, ShortProgram());
+        }
+
+        /* with the Verlet code path, when any GMX_*_NUM_THREADS env var is set,
+         * OMP_NUM_THREADS also has to be set */
+        if (bFullOmpSupport && getenv("OMP_NUM_THREADS") == NULL)
+        {
+            gmx_fatal(FARGS, "%s=%d is set, the default number of threads also "
+                      "needs to be set with OMP_NUM_THREADS!",
+                      modth_env_var[m], nth);
+        }
+
+        /* with the group scheme warn if any env var except PME is set */
+        if (!bFullOmpSupport)
+        {
+            if (m != emntPME)
+            {
+                gmx_warning("%s=%d is set, but OpenMP multithreading is not "
+                            "supported in %s!",
+                            modth_env_var[m], nth, mod_name[m]);
+                nth = 1;
+            }
+        }
+
+        /* only babble if we are really overriding with a different value */
+        if ((bSepPME && m == emntPME && nth != modth.gnth_pme) || (nth != modth.gnth))
+        {
+            sprintf(sbuf, "%s=%d set, overriding the default number of %s threads",
+                    modth_env_var[m], nth, mod_name[m]);
+            if (bSimMaster)
+            {
+                fprintf(stderr, "\n%s\n", sbuf);
+            }
+            if (fplog)
+            {
+                fprintf(fplog, "%s\n", sbuf);
+            }
+        }
+    }
+    else
+    {
+        /* pick the global PME node nthreads if we are setting the number
+         * of threads in separate PME nodes */
+        nth = (bSepPME && m == emntPME) ? modth.gnth_pme : modth.gnth;
+    }
+
+    return modth.nth[m] = nth;
+}
+
+void gmx_omp_nthreads_read_env(int *nthreads_omp)
+{
+    char *env;
+
+    assert(nthreads_omp);
+
+    if ((env = getenv("OMP_NUM_THREADS")) != NULL)
+    {
+        int nt_omp;
+
+        sscanf(env,"%d",&nt_omp);
+        if (nt_omp <= 0)
+        {
+            gmx_fatal(FARGS,"OMP_NUM_THREADS is invalid: '%s'",env);
+        }
+
+        if (*nthreads_omp > 0 && nt_omp != *nthreads_omp)
+        {
+            gmx_fatal(FARGS,"OMP_NUM_THREADS (%d) and the number of threads requested on the command line (%d) have different values",nt_omp,*nthreads_omp);
+        }
+
+        /* Setting the number of OpenMP threads.
+         * NOTE: with tMPI this function is only called on the master node,
+         * but with MPI on all nodes, which means lots of messages on stderr.
+         */
+        fprintf(stderr,"Getting the number of OpenMP threads from OMP_NUM_THREADS: %d\n",nt_omp);
+        *nthreads_omp = nt_omp;
+    }
+}
+
+void gmx_omp_nthreads_init(FILE *fplog, t_commrec *cr,
+                           int nthreads_hw_avail,
+                           int omp_nthreads_req,
+                           int omp_nthreads_pme_req,
+                           gmx_bool bThisNodePMEOnly,
+                           gmx_bool bFullOmpSupport)
+{
+    int nth, nth_pmeonly, gmx_maxth, nppn;
+    char *env;
+    gmx_bool bSepPME, bOMP;
+
+#ifdef GMX_OPENMP
+    bOMP = TRUE;
+#else
+    bOMP = FALSE;
+#endif /* GMX_OPENMP */
+
+    /* number of processes per node */
+    nppn = cr->nnodes_intra;
+
+    bSepPME = ( (cr->duty & DUTY_PP) && !(cr->duty & DUTY_PME)) ||
+              (!(cr->duty & DUTY_PP) &&  (cr->duty & DUTY_PME));
+
+#ifdef GMX_THREAD_MPI
+    /* modth is shared among tMPI threads, so for thread safety the
+     * detection is done on the master only. It is not thread-safe with
+     * multiple simulations, but that is not supported by tMPI anyway.
*/ + if (SIMMASTER(cr)) +#endif + { + /* just return if the initialization has already been done */ + if (modth.initialized) + { + return; + } + + /* With full OpenMP support (verlet scheme) set the number of threads + * per process / default: + * - 1 if not compiled with OpenMP or + * - OMP_NUM_THREADS if the env. var is set, or + * - omp_nthreads_req = #of threads requested by the user on the mdrun + * command line, otherwise + * - take the max number of available threads and distribute them + * on the processes/tMPI threads. + * ~ The GMX_*_NUM_THREADS env var overrides the number of threads of + * the respective module and it has to be used in conjunction with + * OMP_NUM_THREADS. + * + * With the group scheme OpenMP multithreading is only supported in PME, + * for all other modules nthreads is set to 1. + * The number of PME threads is equal to: + * - 1 if not compiled with OpenMP or + * - GMX_PME_NUM_THREADS if defined, otherwise + * - OMP_NUM_THREADS if defined, otherwise + * - 1 + */ + nth = 1; + if ((env = getenv("OMP_NUM_THREADS")) != NULL) + { + if (!bOMP && (strncmp(env, "1", 1) != 0)) + { + gmx_warning("OMP_NUM_THREADS is set, but %s was compiled without OpenMP support!", + ShortProgram()); + } + else + { + nth = gmx_omp_get_max_threads(); + } + } + else if (omp_nthreads_req > 0) + { + nth = omp_nthreads_req; + } + else if (bFullOmpSupport && bOMP) + { + /* max available threads per node */ + nth = nthreads_hw_avail; + + /* divide the threads among the MPI processes/tMPI threads */ + if (nth >= nppn) + { + nth /= nppn; + } + else + { + nth = 1; + } + } + + /* now we have the global values, set them: + * - 1 if not compiled with OpenMP and for the group scheme + * - nth for the verlet scheme when compiled with OpenMP + */ + if (bFullOmpSupport && bOMP) + { + modth.gnth = nth; + } + else + { + modth.gnth = 1; + } + + if (bSepPME) + { + if (omp_nthreads_pme_req > 0) + { + modth.gnth_pme = omp_nthreads_pme_req; + } + else + { + modth.gnth_pme = nth; + } + } + else + { + modth.gnth_pme = 0; + } + + /* now set the per-module values */ + modth.nth[emntDefault] = modth.gnth; + pick_module_nthreads(fplog, emntDomdec, SIMMASTER(cr), bFullOmpSupport, bSepPME); + pick_module_nthreads(fplog, emntPairsearch, SIMMASTER(cr), bFullOmpSupport, bSepPME); + pick_module_nthreads(fplog, emntNonbonded, SIMMASTER(cr), bFullOmpSupport, bSepPME); + pick_module_nthreads(fplog, emntBonded, SIMMASTER(cr), bFullOmpSupport, bSepPME); + pick_module_nthreads(fplog, emntPME, SIMMASTER(cr), bFullOmpSupport, bSepPME); + pick_module_nthreads(fplog, emntUpdate, SIMMASTER(cr), bFullOmpSupport, bSepPME); + pick_module_nthreads(fplog, emntLINCS, SIMMASTER(cr), bFullOmpSupport, bSepPME); + pick_module_nthreads(fplog, emntSETTLE, SIMMASTER(cr), bFullOmpSupport, bSepPME); + + /* set the number of threads globally */ + if (bOMP) + { +#ifndef GMX_THREAD_MPI + if (bThisNodePMEOnly) + { + gmx_omp_set_num_threads(modth.gnth_pme); + } + else +#endif /* GMX_THREAD_MPI */ + { + if (bFullOmpSupport) + { + gmx_omp_set_num_threads(nth); + } + else + { + gmx_omp_set_num_threads(1); + } + } + } + + modth.initialized = TRUE; + } +#ifdef GMX_THREAD_MPI + /* Non-master threads have to wait for the detection to be done. 
*/ + if (PAR(cr)) + { + MPI_Barrier(cr->mpi_comm_mysim); + } +#endif + + /* inform the user about the settings */ + if (SIMMASTER(cr) && bOMP) + { +#ifdef GMX_THREAD_MPI + const char *mpi_str="per tMPI thread"; +#else + const char *mpi_str="per MPI process"; +#endif + + /* for group scheme we print PME threads info only */ + if (bFullOmpSupport) + { + fprintf(stderr, "Using %d OpenMP thread%s %s\n", + modth.gnth,modth.gnth > 1 ? "s" : "", + cr->nnodes > 1 ? mpi_str : ""); + } + if (bSepPME && modth.gnth_pme != modth.gnth) + { + fprintf(stderr, "Using %d OpenMP thread%s %s for PME\n", + modth.gnth_pme,modth.gnth_pme > 1 ? "s" : "", + cr->nnodes > 1 ? mpi_str : ""); + } + } + + /* detect and warn about oversubscription + * TODO: enable this for separate PME nodes as well! */ + if (!bSepPME && cr->nodeid_intra == 0) + { + char sbuf[STRLEN], sbuf1[STRLEN], sbuf2[STRLEN]; + + if (modth.gnth*nppn > nthreads_hw_avail) + { + sprintf(sbuf, "threads"); + sbuf1[0] = '\0'; + sprintf(sbuf2, "O"); +#ifdef GMX_MPI + if (modth.gnth == 1) + { +#ifdef GMX_THREAD_MPI + sprintf(sbuf, "thread-MPI threads"); +#else + sprintf(sbuf, "MPI processes"); + sprintf(sbuf1, " per node"); + sprintf(sbuf2, "On node %d: o", cr->sim_nodeid); +#endif + } +#endif + md_print_warn(cr, fplog, + "WARNING: %sversubscribing the available %d logical CPU cores%s with %d %s.\n" + " This will cause considerable performance loss!", + sbuf2, nthreads_hw_avail, sbuf1, nppn*modth.gnth, sbuf); + } + } +} + +int gmx_omp_nthreads_get(int mod) +{ + if (mod < 0 || mod >= emntNR) + { + /* invalid module queried */ + return -1; + } + else + { + return modth.nth[mod]; + } +} diff --git a/src/kernel/gmx_gpu_utils/CMakeLists.txt b/src/gmxlib/gpu_utils/CMakeLists.txt similarity index 63% rename from src/kernel/gmx_gpu_utils/CMakeLists.txt rename to src/gmxlib/gpu_utils/CMakeLists.txt index f06debc0b4..bf45130afe 100644 --- a/src/kernel/gmx_gpu_utils/CMakeLists.txt +++ b/src/gmxlib/gpu_utils/CMakeLists.txt @@ -1,4 +1,3 @@ - # (slightly sloppy) OS definitions required by memtestG80 set(_os_def) if(UNIX) @@ -17,14 +16,10 @@ endif() CUDA_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF) -if(CMAKE_BUILD_TYPE STREQUAL "DEBUG") - CUDA_ADD_LIBRARY(gmx_gpu_utils STATIC - gmx_gpu_utils.cu memtestG80_core.cu - OPTIONS ${_os_def} - DEBUG -g -D_DEBUG_=1 ) -else() - CUDA_ADD_LIBRARY(gmx_gpu_utils STATIC - gmx_gpu_utils.cu memtestG80_core.cu - OPTIONS ${_os_def} ) -endif() +file(GLOB GPU_UTILS_SOURCES *.cu) +CUDA_ADD_LIBRARY(gpu_utils STATIC ${GPU_UTILS_SOURCES} + OPTIONS ${_os_def} + RELWITHDEBINFO -g + DEBUG -g -D_DEBUG_=1 ) + CUDA_BUILD_CLEAN_TARGET() diff --git a/src/kernel/gmx_gpu_utils/gmx_gpu_utils.cu b/src/gmxlib/gpu_utils/gpu_utils.cu similarity index 54% rename from src/kernel/gmx_gpu_utils/gmx_gpu_utils.cu rename to src/gmxlib/gpu_utils/gpu_utils.cu index 6f786798c6..64d1e39e52 100644 --- a/src/kernel/gmx_gpu_utils/gmx_gpu_utils.cu +++ b/src/gmxlib/gpu_utils/gpu_utils.cu @@ -35,48 +35,37 @@ #include #include +#include -#include "cuda.h" -#include "cuda_runtime_api.h" +#include "smalloc.h" +#include "string2.h" +#include "types/hw_info.h" +#include "gpu_utils.h" +#include "../cuda_tools/cudautils.cuh" #include "memtestG80_core.h" -/*! \cond TEST */ -#ifdef _DEBUG_ -#undef _DEBUG_ -#endif -#define _DEBUG_ 0 - -#if _DEBUG_ >= 1 -#define debug stderr -#define DUPME(msg) printf("---> %s\n", msg); -#else -#define DUPME(msg) ; -#endif -/*! 
\endcond TEST*/ - -#if _DEBUG_ == 0/* no gromacs utils in debug mode */ -#include "gmx_fatal.h" -#include "string2.h" -#endif #define QUICK_MEM 250 /*!< Amount of memory to be used in quick memtest. */ -#define QUICK_TESTS MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS /*!< Bitflag with type of tests +#define QUICK_TESTS MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS /*!< Bit flag with type of tests to run in quick memtest. */ #define QUICK_ITER 3 /*!< Number of iterations in quick memtest. */ #define FULL_TESTS 0x3FFF /*!< Bitflag with all test set on for full memetest. */ #define FULL_ITER 25 /*!< Number of iterations in full memtest. */ -#define TIMED_TESTS MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS /*!< Bitflag with type of tests to +#define TIMED_TESTS MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS /*!< Bit flag with type of tests to run in time constrained memtest. */ /*! Number of supported GPUs */ #define NB_GPUS (sizeof(SupportedGPUs)/sizeof(SupportedGPUs[0])) -/* -TODO add proper gromacs logging? -*/ +static int cuda_max_device_count = 32; /*! Max number of devices supported by CUDA (for consistency checking). + In reality it 16 with CUDA <=v5.0, but let's stay on the safe side. */ + +/*! Dummy kernel used for sanity checking. */ +__global__ void k_dummy_test(){} + /*! Bit-flags which refer to memtestG80 test types and are used in do_memtest to specify which tests to run. */ enum memtest_G80_test_types { @@ -144,24 +133,18 @@ static const char * const SupportedGPUs[] = { "Quadro Plex 2100 D4" }; -/*! \cond TEST */ -#ifndef _string2_h -/* debug functions, see @the end */ -void ltrim (char *); -void rtrim (char *); -void trim (char *); -int gmx_strncasecmp(const char*, const char*, int); -#endif -/*! \endcond TEST */ - /*! * \brief Runs GPU sanity checks. - * Returnes properties of a device with given id or the one that has - * already been initialized earlier in the case if of dev_id == -1. * - * \param[in] dev_id the device id of the GPU or -1 if the device has laredy been selected + * Runs a series of checks to determine that the given GPU and underlying CUDA + * driver/runtime functions properly. + * Returns properties of a device with given ID or the one that has + * already been initialized earlier in the case if of \dev_id == -1. 
+ * + * \param[in] dev_id the device ID of the GPU or -1 if the device has already been initialized * \param[out] dev_prop pointer to the structure in which the device properties will be returned + * \returns 0 if the device looks OK */ static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop) { @@ -181,10 +164,10 @@ static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop) return -1; /* things might go horribly wrong if cudart is not compatible with the driver */ - if (dev_count < 0 || dev_count > 20) + if (dev_count < 0 || dev_count > cuda_max_device_count) return -1; - if (dev_id == -1) /* device already selected let's do not destroy the context */ + if (dev_id == -1) /* device already selected let's not destroy the context */ { cu_err = cudaGetDevice(&id); if (cu_err != cudaSuccess) @@ -221,23 +204,44 @@ static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop) if (dev_prop->major == 0) return -1; - if ((dev_id != -1) && (cu_err = cudaSetDevice(dev_id)) != cudaSuccess) + if (id != -1) { - fprintf(stderr, "Error %d while switching to device #%d: %s\n", cu_err, dev_id, - cudaGetErrorString(cu_err)); - return -1; + cu_err = cudaSetDevice(id); + if (cu_err != cudaSuccess) + { + fprintf(stderr, "Error %d while switching to device #%d: %s\n", + cu_err, id, cudaGetErrorString(cu_err)); + return -1; + } + } + + /* try to execute a dummy kernel */ + k_dummy_test<<<1, 512>>>(); + CU_LAUNCH_ERR_SYNC("dummy test kernel"); + + /* destroy context if we created one */ + if (id != -1) + { +#if CUDA_VERSION < 4000 + cu_err = cudaThreadExit(); + CU_RET_ERR(cu_err, "cudaThreadExit failed"); +#else + cu_err = cudaDeviceReset(); + CU_RET_ERR(cu_err, "cudaDeviceReset failed"); +#endif } return 0; } + /*! - * \brief Checks whether the GPU with the given name is supported. + * \brief Checks whether the GPU with the given name is supported in Gromacs-OpenMM. * * \param[in] gpu_name the name of the CUDA device - * \returns 1 if the device is supported, otherwise 0 + * \returns TRUE if the device is supported, otherwise FALSE */ -static int is_supported_gpu_n(char *gpuName) +static bool is_gmx_openmm_supported_gpu_name(char *gpuName) { size_t i; for (i = 0; i < NB_GPUS; i++) @@ -249,13 +253,14 @@ static int is_supported_gpu_n(char *gpuName) return 0; } -/*! \brief Checks whether the GPU with the given device id is supported. +/*! \brief Checks whether the GPU with the given device id is supported in Gromacs-OpenMM. * - * \param[in] dev_id the device id of the GPU or -1 if the device has laredy been selected + * \param[in] dev_id the device id of the GPU or -1 if the device has already been selected * \param[out] gpu_name Set to contain the name of the CUDA device, if NULL passed, no device name is set. - * \returns 1 if the device is supported, otherwise 0 + * \returns TRUE if the device is supported, otherwise FALSE + * */ -int is_supported_cuda_gpu(int dev_id, char *gpu_name) +gmx_bool is_gmx_openmm_supported_gpu(int dev_id, char *gpu_name) { cudaDeviceProp dev_prop; @@ -268,17 +273,17 @@ int is_supported_cuda_gpu(int dev_id, char *gpu_name) { strcpy(gpu_name, dev_prop.name); } - return is_supported_gpu_n(dev_prop.name); + return is_gmx_openmm_supported_gpu_name(dev_prop.name); } /*! * \brief Runs a set of memory tests specified by the given bit-flags. - * Tries to allocate and do the test on \p megs Mb memory or + * Tries to allocate and do the test on \p megs Mb memory or * the greatest amount that can be allocated (>10Mb). 
- * In case if an error is detected it stops without finishing the remainings - * steps/iterations and returns greater then zero value. - * In case of other errors (e.g. kernel launch errors, device querying erros) + * In case if an error is detected it stops without finishing the remaining + * steps/iterations and returns greater then zero value. + * In case of other errors (e.g. kernel launch errors, device querying errors) * -1 is returned. * * \param[in] which_tests variable with bit-flags of the requested tests @@ -426,12 +431,12 @@ static int do_memtest(unsigned int which_tests, int megs, int iter) return err_count; } -/*! \brief Runs a quick memory test and returns 0 in case if no error is detected. - * If an error is detected it stops before completing the test and returns a - * value greater then 0. In case of other errors (e.g. kernel launch errors, - * device querying erros) -1 is returned. +/*! \brief Runs a quick memory test and returns 0 in case if no error is detected. + * If an error is detected it stops before completing the test and returns a + * value greater then 0. In case of other errors (e.g. kernel launch errors, + * device querying errors) -1 is returned. * - * \param[in] dev_id the device id of the GPU or -1 if the device has laredy been selected + * \param[in] dev_id the device id of the GPU or -1 if the device has already been selected * \returns 0 if no error was detected, otherwise >0 */ int do_quick_memtest(int dev_id) @@ -467,12 +472,12 @@ int do_quick_memtest(int dev_id) return res; } -/*! \brief Runs a full memory test and returns 0 in case if no error is detected. - * If an error is detected it stops before completing the test and returns a - * value greater then 0. In case of other errors (e.g. kernel launch errors, - * device querying erros) -1 is returned. +/*! \brief Runs a full memory test and returns 0 in case if no error is detected. + * If an error is detected it stops before completing the test and returns a + * value greater then 0. In case of other errors (e.g. kernel launch errors, + * device querying errors) -1 is returned. * - * \param[in] dev_id the device id of the GPU or -1 if the device has laredy been selected + * \param[in] dev_id the device id of the GPU or -1 if the device has already been selected * \returns 0 if no error was detected, otherwise >0 */ @@ -512,9 +517,9 @@ int do_full_memtest(int dev_id) } /*! \brief Runs a time constrained memory test and returns 0 in case if no error is detected. - * If an error is detected it stops before completing the test and returns a value greater - * than zero. In case of other errors (e.g. kernel launch errors, device querying erros) -1 - * is returned. Note, that test iterations are not interrupted therefor the total runtime of + * If an error is detected it stops before completing the test and returns a value greater + * than zero. In case of other errors (e.g. kernel launch errors, device querying errors) -1 + * is returned. Note, that test iterations are not interrupted therefor the total runtime of * the test will always be multipple of one iteration's runtime. * * \param[in] dev_id the device id of the GPU or -1 if the device has laredy been selected @@ -564,126 +569,357 @@ int do_timed_memtest(int dev_id, int time_constr) return res; } -/*! \cond TEST */ - -/******************************************************* - * The code below is for testing purposes. */ -int do_custom_memtest(int dev_id) +/*! \brief Initializes the GPU with the given index. 
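+ *
+ * Usage sketch (hypothetical caller; rank_on_node and errbuf are
+ * illustrative names, not part of this commit):
+ * \code
+ *   char errbuf[STRLEN];
+ *   if (!init_gpu(rank_on_node, errbuf, &hwinfo->gpu_info))
+ *   {
+ *       gmx_fatal(FARGS, "Failed to initialize GPU: %s", errbuf);
+ *   }
+ * \endcode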
+ *
+ * The variable \mygpu is the index of the GPU to initialize in the
+ * gpu_info.cuda_dev array.
+ *
+ * \param[in]  mygpu       index of the GPU to initialize
+ * \param[out] result_str  the message related to the error that occurred
+ *                         during the initialization (if there was any).
+ * \param[in]  gpu_info    GPU info of all detected devices in the system.
+ * \returns                true if no error occurs during initialization.
+ */
+gmx_bool init_gpu(int mygpu, char *result_str, const gmx_gpu_info_t *gpu_info)
 {
-    cudaDeviceProp dev_prop;
-    int mem2test, /*devmem,*/ res;
-//    memtestState tester;
-//    double bandwidth;
+    cudaError_t stat;
+    char sbuf[STRLEN];
+    int gpuid;
-#if _DEBUG_ >= 1
-    int time = getTimeMilliseconds();
-#endif
+    assert(gpu_info);
+    assert(result_str);
-    if (do_sanity_checks(dev_id, &dev_prop) != 0)
-        return -1;
+    if (mygpu < 0 || mygpu >= gpu_info->ncuda_dev_use)
+    {
+        sprintf(sbuf, "Trying to initialize a non-existent GPU: "
+                "there are %d %s-selected GPU(s), but #%d was requested.",
+                gpu_info->ncuda_dev_use, gpu_info->bUserSet ? "user" : "auto", mygpu);
+        gmx_incons(sbuf);
+    }
-//    if ((res=tester.allocate(100))==0)
-//        printf("alloc failed\n");
-//    printf("alloc res = %d\n", res);
-//    res = tester.gpuMemoryBandwidth(bandwidth, tester.size(), 10);
-//    printf("Bandwidth on %d (res %d)= %5.2f\n", tester.size(), res, bandwidth);
-//    tester.deallocate();
+    gpuid = gpu_info->cuda_dev[gpu_info->cuda_dev_use[mygpu]].id;
-//    devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
-    mem2test = 80;
+    stat = cudaSetDevice(gpuid);
+    strncpy(result_str, cudaGetErrorString(stat), STRLEN);
-#if _DEBUG_ >= 1
-    printf(">> Running CUSTOM memtests [%x] on %d MiB, %d iterations\n",
-           QUICK_TESTS, mem2test, 1);
-#endif
+    if (debug)
+    {
+        fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->cuda_dev[gpuid].prop.name);
+    }
+
+    return (stat == cudaSuccess);
+}
+
+/*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
+ *
+ * The context is explicitly destroyed and therefore all data uploaded to the GPU
+ * is lost. This should only be called when none of this data is required anymore.
+ *
+ * \param[out] result_str  the message related to the error that occurred
+ *                         during the initialization (if there was any).
+ * \returns                true if no error occurs during the freeing.
+ */
+gmx_bool free_gpu(char *result_str)
+{
+    cudaError_t stat;
-    res = do_memtest(QUICK_TESTS, mem2test, 1);
-    cudaThreadExit();
+    assert(result_str);
-#if _DEBUG_ >= 1
-    printf("C-RES = %d\n", res);
-    printf("C-runtime: %d ms\n", getTimeMilliseconds() - time);
+    if (debug)
+    {
+        int gpuid;
+        stat = cudaGetDevice(&gpuid);
+        CU_RET_ERR(stat, "cudaGetDevice failed");
+        fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
+    }
+
+#if CUDA_VERSION < 4000
+    stat = cudaThreadExit();
+#else
+    stat = cudaDeviceReset();
 #endif
-    return res;
+    strncpy(result_str, cudaGetErrorString(stat), STRLEN);
+
+    return (stat == cudaSuccess);
 }
-#if _DEBUG_ > 1
-/*!
- * Only for debugging purposes, compile with:
- * nvcc -DLINUX -D_DEBUG_=2 -L -O -Xcompiler -Wall memtestG80_core.o gmx_gpu_utils.cu -o gmx_gpu_utils_test
+/*! \brief Returns true if the gpu characterized by the device properties is
+ * supported by the native gpu acceleration.
+ *
+ * \param[in] dev_prop  the CUDA device properties of the GPU to test.
+ * \returns             true if the GPU properties passed indicate a compatible
+ *                      GPU, otherwise false.
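+ *
+ * E.g. a Fermi-based Tesla C2075 (compute capability 2.0) passes this check,
+ * while a GT200-based Tesla C1060 (compute capability 1.3) does not.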
*/ -int main( int argc, char** argv) +static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop) { - int dev_id = 0; - char msg[100]; - sprintf(msg, "Device #%d supported: ", dev_id); - switch (is_supported_cuda_gpu(dev_id, NULL)) + return (dev_prop->major >= 2); +} + +/*! \brief Helper function that checks whether a given GPU status indicates compatible GPU. + * + * \param[in] stat GPU status. + * \returns true if the provided status is egpuCompatible, otherwise false. + */ +static bool is_compatible_gpu(int stat) +{ + return (stat == egpuCompatible); +} + +/*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration. + * + * Returns a status value which indicates compatibility or one of the following + * errors: incompatibility, insistence, or insanity (=unexpected behavior). + * It also returns the respective device's properties in \dev_prop (if applicable). + * + * \param[in] dev_id the ID of the GPU to check. + * \param[out] dev_prop the CUDA device properties of the device checked. + * \returns the status of the requested device + */ +static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop) +{ + cudaError_t stat; + int ndev; + + stat = cudaGetDeviceCount(&ndev); + CU_RET_ERR(stat, "cudaGetDeviceCount failed"); + + if (dev_id > ndev - 1) { - case -1: strcat(msg, "error occured"); break; - case 0: strcat(msg, "no"); break; - case 1: strcat(msg, "yes"); break; - default: strcat(msg, "\nhmmm, you should not see this!"); + return egpuNonexistent; } - printf("%s\n", msg); - printf("Doing memtest.\n"); - printf("quick memtest result: %d\n", do_quick_memtest(dev_id)); - printf("timed memtest result: %d\n", do_timed_memtest(dev_id, 15)); - printf("full memtest result: %d\n", do_full_memtest(dev_id)); - return 0; + if (do_sanity_checks(dev_id, dev_prop) == 0) + { + if (is_gmx_supported_gpu(dev_prop)) + { + return egpuCompatible; + } + else + { + return egpuIncompatible; + } + } + else + { + return egpuInsane; + } } -#endif -#ifndef _string2_h -#include -/* - Functions only used if this file is compiled in debug mode (_DEBUG_ > 0) - when the gromacs version are not available. - - string trimming function - duplicated from ~/src/gmxlib/string2.c - - case agnostic straing compare +/*! \brief Detect all NVIDIA GPUs in the system. + * + * Will detect every NVIDIA GPU supported by the device driver in use. Also + * check for the compatibility of each and fill the gpu_info->cuda_dev array + * with the required information on each the device: ID, device properties, + * status. + * + * \param[in] gpu_info pointer to structure holding GPU information. */ -static void ltrim (char *str) +void detect_cuda_gpus(gmx_gpu_info_t *gpu_info) { - char *tr; - int c; + int i, ndev, checkres; + cudaError_t stat; + cudaDeviceProp prop; + cuda_dev_info_t *devs; - if (!str) - return; + assert(gpu_info); - tr = strdup (str); - c = 0; - while ((tr[c] == ' ') || (tr[c] == '\t')) - c++; + stat = cudaGetDeviceCount(&ndev); + CU_RET_ERR(stat, "cudaGetDeviceCount failed"); - strcpy (str,tr+c); - free (tr); + snew(devs, ndev); + for (i = 0; i < ndev; i++) + { + checkres = is_gmx_supported_gpu_id(i, &prop); + + devs[i].id = i; + devs[i].prop = prop; + devs[i].stat = checkres; + } + + gpu_info->ncuda_dev = ndev; + gpu_info->cuda_dev = devs; } -static void rtrim (char *str) +/*! \brief Select the GPUs compatible with the native GROMACS acceleration. + * + * This function selects the compatible gpus and initializes + * gpu_info->cuda_dev_use and gpu_info->ncuda_dev_use. 
+ *
+ * Given the list of GPUs available in the system, it checks each GPU in
+ * gpu_info->cuda_dev and puts the indices (into gpu_info->cuda_dev) of
+ * the compatible ones into cuda_dev_use, thereby marking the respective
+ * GPUs as "available for use."
+ * Note that \detect_cuda_gpus must have been called before.
+ *
+ * \param[in] gpu_info  pointer to structure holding GPU information
+ */
+void pick_compatible_gpus(gmx_gpu_info_t *gpu_info)
+{
+    int i, ncompat;
+    int *compat;
+
+    assert(gpu_info);
+    /* cuda_dev/ncuda_dev have to be either NULL/0 or not (NULL/0) */
+    assert((gpu_info->ncuda_dev != 0 ? 0 : 1) ^ (gpu_info->cuda_dev == NULL ? 0 : 1));
+
+    snew(compat, gpu_info->ncuda_dev);
+    ncompat = 0;
+    for (i = 0; i < gpu_info->ncuda_dev; i++)
+    {
+        if (is_compatible_gpu(gpu_info->cuda_dev[i].stat))
+        {
+            ncompat++;
+            compat[ncompat - 1] = i;
+        }
+    }
+
+    gpu_info->ncuda_dev_use = ncompat;
+    snew(gpu_info->cuda_dev_use, ncompat);
+    memcpy(gpu_info->cuda_dev_use, compat, ncompat*sizeof(*compat));
+    sfree(compat);
+}
+
+/*! \brief Check the existence/compatibility of a set of GPUs specified by their device IDs.
+ *
+ * Given a list of GPU device IDs in \requested_devs, check for the
+ * existence and compatibility of the respective GPUs and fill in \gpu_info
+ * with the collected information. Also provide the caller with an array with
+ * the result of checks in \checkres.
+ *
+ * \param[out] checkres        check result for each ID passed in \requested_devs
+ * \param[in]  gpu_info        pointer to structure holding GPU information
+ * \param[in]  requested_devs  array of requested device IDs
+ * \param[in]  count           number of IDs in \requested_devs
+ * \returns                    TRUE if every requested GPU is compatible
+ */
+gmx_bool check_select_cuda_gpus(int *checkres, gmx_gpu_info_t *gpu_info,
+                                const int *requested_devs, int count)
+{
+    int i, id;
+    bool bAllOk;
+
+    assert(checkres);
+    assert(gpu_info);
+    assert(requested_devs);
+    assert(count >= 0);
+
+    if (count == 0)
+    {
+        return TRUE;
+    }
+
+    /* we will assume that all GPUs requested are valid IDs,
+       otherwise we'll bail anyways */
+    gpu_info->ncuda_dev_use = count;
+    snew(gpu_info->cuda_dev_use, count);
+
+    bAllOk = true;
+    for (i = 0; i < count; i++)
+    {
+        id = requested_devs[i];
+
+        /* devices are stored in increasing order of IDs in cuda_dev */
+        gpu_info->cuda_dev_use[i] = id;
+
+        checkres[i] = (id >= gpu_info->ncuda_dev) ?
+            egpuNonexistent : gpu_info->cuda_dev[id].stat;
-    if (!str)
-        return;
+        bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
+    }
-    nul = strlen(str)-1;
-    while ((nul > 0) && ((str[nul] == ' ') || (str[nul] == '\t')) ) {
-        str[nul] = '\0';
-        nul--;
-    }
+    return bAllOk;
 }
-static void trim (char *str)
+/*! \brief Frees the cuda_dev and cuda_dev_use array fields of \gpu_info.
+ *
+ * \param[in] gpu_info  pointer to structure holding GPU information
+ */
+void free_gpu_info(const gmx_gpu_info_t *gpu_info)
 {
-    ltrim (str);
-    rtrim (str);
+    if (gpu_info == NULL)
+    {
+        return;
+    }
+
+    sfree(gpu_info->cuda_dev_use);
+    sfree(gpu_info->cuda_dev);
 }
-static int gmx_strncasecmp(const char* s1, const char* s2, int len)
+/*! \brief Formats and returns a device information string for a given GPU.
+ *
+ * Given an index *directly* into the array of available GPUs (cuda_dev)
+ * returns a formatted info string for the respective GPU which includes
+ * ID, name, compute capability, and detection status.
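+ *
+ * Example output (hypothetical device):
+ *   #0: NVIDIA GeForce GTX 680, compute cap.: 3.0, ECC:  no, stat: compatible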
+ +/*! \brief Returns the device ID of the GPU with a given index into the array of used GPUs. + * + * Getter function which, given an index into the array of GPUs in use + * (cuda_dev_use) -- typically a tMPI/MPI rank --, returns the device ID of the + * respective CUDA GPU. + * + * \param[in] gpu_info pointer to structure holding GPU information + * \param[in] idx index into the array of used GPUs + * \returns device ID of the requested GPU + */ +int get_gpu_device_id(const gmx_gpu_info_t *gpu_info, int idx) +{ + assert(gpu_info); + if (idx < 0 || idx >= gpu_info->ncuda_dev_use) + { + return -1; + } + + return gpu_info->cuda_dev[gpu_info->cuda_dev_use[idx]].id; } -#endif -/*! \endcond TEST */ +/*! \brief Returns the device ID of the GPU currently in use. + * + * The GPU returned is the one that is active in the current context at the + * time of the call. + * + * \returns device ID of the GPU in use at the time of the call + */ +int get_current_gpu_device_id(void) +{ + int gpuid; + CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed"); + + return gpuid; +}
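Putting the detection pieces together, a caller sketch (editorial illustration, not part of the patch; the helper name and the one-GPU-per-rank policy are assumptions):

    #include <stdio.h>

    /* Hypothetical helper: look up the GPU mapped to a per-node rank and
     * print what was picked; assumes detect_cuda_gpus() and
     * pick_compatible_gpus() have already run on gpu_info. */
    static void report_gpu_for_rank(const gmx_gpu_info_t *gpu_info, int rank_local)
    {
        char buf[256];
        int  devid;

        devid = get_gpu_device_id(gpu_info, rank_local);  /* rank -> device */
        get_gpu_device_info_string(buf, gpu_info,
                                   gpu_info->cuda_dev_use[rank_local]);
        printf("rank %d uses device %d: %s\n", rank_local, devid, buf);
    }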
diff --git a/src/kernel/gmx_gpu_utils/memtestG80_core.cu b/src/gmxlib/gpu_utils/memtestG80_core.cu similarity index 100% rename from src/kernel/gmx_gpu_utils/memtestG80_core.cu rename to src/gmxlib/gpu_utils/memtestG80_core.cu diff --git a/src/kernel/gmx_gpu_utils/memtestG80_core.h b/src/gmxlib/gpu_utils/memtestG80_core.h similarity index 100% rename from src/kernel/gmx_gpu_utils/memtestG80_core.h rename to src/gmxlib/gpu_utils/memtestG80_core.h diff --git a/src/gmxlib/main.c b/src/gmxlib/main.c index 3101457bea..4bc2066a83 100644 --- a/src/gmxlib/main.c +++ b/src/gmxlib/main.c @@ -61,7 +61,6 @@ #include "macros.h" #include "futil.h" #include "filenm.h" -#include "mdrun.h" #include "gmxfio.h" #include "string2.h" @@ -212,8 +211,27 @@ void check_multi_large_int(FILE *log,const gmx_multisim_t *ms, } +char *gmx_gethostname(char *name, size_t len) +{ + if (len < 8) + { + gmx_incons("gmx_gethostname called with len<8"); + } +#ifdef HAVE_UNISTD_H + if (gethostname(name, len-1) != 0) + { + strncpy(name, "unknown",8); + } +#else + strncpy(name, "unknown",8); +#endif + + return name; +} + + void gmx_log_open(const char *lognm,const t_commrec *cr,gmx_bool bMasterOnly, - unsigned long Flags, FILE** fplog) + gmx_bool bAppendFiles, FILE** fplog) { int len,testlen,pid; char buf[256],host[256]; @@ -221,8 +239,6 @@ void gmx_log_open(const char *lognm,const t_commrec *cr,gmx_bool bMasterOnly, char timebuf[STRLEN]; FILE *fp=*fplog; char *tmpnm; - - gmx_bool bAppend = Flags & MD_APPENDFILES; debug_gmx(); @@ -262,11 +278,11 @@ void gmx_log_open(const char *lognm,const t_commrec *cr,gmx_bool bMasterOnly, { /* Since log always ends with '.log' let's use this info */ par_fn(tmpnm,efLOG,cr,FALSE,!bMasterOnly,buf,255); - fp = gmx_fio_fopen(buf, bAppend ? "a+" : "w+" ); + fp = gmx_fio_fopen(buf, bAppendFiles ? "a+" : "w+" ); } - else if (!bAppend) + else if (!bAppendFiles) { - fp = gmx_fio_fopen(tmpnm, bAppend ? "a+" : "w+" ); + fp = gmx_fio_fopen(tmpnm, bAppendFiles ?
"a+" : "w+" ); } sfree(tmpnm); @@ -274,14 +290,7 @@ void gmx_log_open(const char *lognm,const t_commrec *cr,gmx_bool bMasterOnly, gmx_fatal_set_log_file(fp); /* Get some machine parameters */ -#ifdef HAVE_UNISTD_H - if (gethostname(host,255) != 0) - { - sprintf(host,"unknown"); - } -#else - sprintf(host,"unknown"); -#endif + gmx_gethostname(host,256); time(&t); @@ -295,7 +304,7 @@ void gmx_log_open(const char *lognm,const t_commrec *cr,gmx_bool bMasterOnly, pid = 0; #endif - if (bAppend) + if (bAppendFiles) { fprintf(fp, "\n" diff --git a/src/gmxlib/maths.c b/src/gmxlib/maths.c index b327777992..13388c5d93 100644 --- a/src/gmxlib/maths.c +++ b/src/gmxlib/maths.c @@ -103,8 +103,6 @@ real sign(real x,real y) #endif -#ifdef GMX_DOUBLE - static const double tiny = 1e-300, half= 5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */ @@ -180,7 +178,7 @@ sb5 = 2.55305040643316442583e+03, /* 0x40A3F219, 0xCEDF3BE6 */ sb6 = 4.74528541206955367215e+02, /* 0x407DA874, 0xE79FE763 */ sb7 = -2.24409524465858183362e+01; /* 0xC03670E2, 0x42712D62 */ -double gmx_erf(double x) +double gmx_erfd(double x) { erf_int32_t hx,ix,i; @@ -275,7 +273,7 @@ double gmx_erf(double x) } -double gmx_erfc(double x) +double gmx_erfcd(double x) { erf_int32_t hx,ix; double R,S,P,Q,s,y,z,r; @@ -388,84 +386,81 @@ double gmx_erfc(double x) } } -#else /* single precision */ - - static const float -tiny = 1e-30, -half= 5.0000000000e-01, /* 0x3F000000 */ -one = 1.0000000000e+00, /* 0x3F800000 */ -two = 2.0000000000e+00, /* 0x40000000 */ +tinyf= 1e-30, +halff= 5.0000000000e-01, /* 0x3F000000 */ +onef = 1.0000000000e+00, /* 0x3F800000 */ +twof = 2.0000000000e+00, /* 0x40000000 */ /* c = (subfloat)0.84506291151 */ -erx = 8.4506291151e-01, /* 0x3f58560b */ +erxf = 8.4506291151e-01, /* 0x3f58560b */ /* * Coefficients for approximation to erf on [0,0.84375] */ -efx = 1.2837916613e-01, /* 0x3e0375d4 */ -efx8= 1.0270333290e+00, /* 0x3f8375d4 */ -pp0 = 1.2837916613e-01, /* 0x3e0375d4 */ -pp1 = -3.2504209876e-01, /* 0xbea66beb */ -pp2 = -2.8481749818e-02, /* 0xbce9528f */ -pp3 = -5.7702702470e-03, /* 0xbbbd1489 */ -pp4 = -2.3763017452e-05, /* 0xb7c756b1 */ -qq1 = 3.9791721106e-01, /* 0x3ecbbbce */ -qq2 = 6.5022252500e-02, /* 0x3d852a63 */ -qq3 = 5.0813062117e-03, /* 0x3ba68116 */ -qq4 = 1.3249473704e-04, /* 0x390aee49 */ -qq5 = -3.9602282413e-06, /* 0xb684e21a */ +efxf = 1.2837916613e-01, /* 0x3e0375d4 */ +efx8f= 1.0270333290e+00, /* 0x3f8375d4 */ +pp0f = 1.2837916613e-01, /* 0x3e0375d4 */ +pp1f = -3.2504209876e-01, /* 0xbea66beb */ +pp2f = -2.8481749818e-02, /* 0xbce9528f */ +pp3f = -5.7702702470e-03, /* 0xbbbd1489 */ +pp4f = -2.3763017452e-05, /* 0xb7c756b1 */ +qq1f = 3.9791721106e-01, /* 0x3ecbbbce */ +qq2f = 6.5022252500e-02, /* 0x3d852a63 */ +qq3f = 5.0813062117e-03, /* 0x3ba68116 */ +qq4f = 1.3249473704e-04, /* 0x390aee49 */ +qq5f = -3.9602282413e-06, /* 0xb684e21a */ /* * Coefficients for approximation to erf in [0.84375,1.25] */ -pa0 = -2.3621185683e-03, /* 0xbb1acdc6 */ -pa1 = 4.1485610604e-01, /* 0x3ed46805 */ -pa2 = -3.7220788002e-01, /* 0xbebe9208 */ -pa3 = 3.1834661961e-01, /* 0x3ea2fe54 */ -pa4 = -1.1089469492e-01, /* 0xbde31cc2 */ -pa5 = 3.5478305072e-02, /* 0x3d1151b3 */ -pa6 = -2.1663755178e-03, /* 0xbb0df9c0 */ -qa1 = 1.0642088205e-01, /* 0x3dd9f331 */ -qa2 = 5.4039794207e-01, /* 0x3f0a5785 */ -qa3 = 7.1828655899e-02, /* 0x3d931ae7 */ -qa4 = 1.2617121637e-01, /* 0x3e013307 */ -qa5 = 1.3637083583e-02, /* 0x3c5f6e13 */ -qa6 = 1.1984500103e-02, /* 0x3c445aa3 */ +pa0f = -2.3621185683e-03, /* 0xbb1acdc6 */ +pa1f = 
4.1485610604e-01, /* 0x3ed46805 */ +pa2f = -3.7220788002e-01, /* 0xbebe9208 */ +pa3f = 3.1834661961e-01, /* 0x3ea2fe54 */ +pa4f = -1.1089469492e-01, /* 0xbde31cc2 */ +pa5f = 3.5478305072e-02, /* 0x3d1151b3 */ +pa6f = -2.1663755178e-03, /* 0xbb0df9c0 */ +qa1f = 1.0642088205e-01, /* 0x3dd9f331 */ +qa2f = 5.4039794207e-01, /* 0x3f0a5785 */ +qa3f = 7.1828655899e-02, /* 0x3d931ae7 */ +qa4f = 1.2617121637e-01, /* 0x3e013307 */ +qa5f = 1.3637083583e-02, /* 0x3c5f6e13 */ +qa6f = 1.1984500103e-02, /* 0x3c445aa3 */ /* * Coefficients for approximation to erfc in [1.25,1/0.35] */ -ra0 = -9.8649440333e-03, /* 0xbc21a093 */ -ra1 = -6.9385856390e-01, /* 0xbf31a0b7 */ -ra2 = -1.0558626175e+01, /* 0xc128f022 */ -ra3 = -6.2375331879e+01, /* 0xc2798057 */ -ra4 = -1.6239666748e+02, /* 0xc322658c */ -ra5 = -1.8460508728e+02, /* 0xc3389ae7 */ -ra6 = -8.1287437439e+01, /* 0xc2a2932b */ -ra7 = -9.8143291473e+00, /* 0xc11d077e */ -sa1 = 1.9651271820e+01, /* 0x419d35ce */ -sa2 = 1.3765776062e+02, /* 0x4309a863 */ -sa3 = 4.3456588745e+02, /* 0x43d9486f */ -sa4 = 6.4538726807e+02, /* 0x442158c9 */ -sa5 = 4.2900814819e+02, /* 0x43d6810b */ -sa6 = 1.0863500214e+02, /* 0x42d9451f */ -sa7 = 6.5702495575e+00, /* 0x40d23f7c */ -sa8 = -6.0424413532e-02, /* 0xbd777f97 */ +ra0f = -9.8649440333e-03, /* 0xbc21a093 */ +ra1f = -6.9385856390e-01, /* 0xbf31a0b7 */ +ra2f = -1.0558626175e+01, /* 0xc128f022 */ +ra3f = -6.2375331879e+01, /* 0xc2798057 */ +ra4f = -1.6239666748e+02, /* 0xc322658c */ +ra5f = -1.8460508728e+02, /* 0xc3389ae7 */ +ra6f = -8.1287437439e+01, /* 0xc2a2932b */ +ra7f = -9.8143291473e+00, /* 0xc11d077e */ +sa1f = 1.9651271820e+01, /* 0x419d35ce */ +sa2f = 1.3765776062e+02, /* 0x4309a863 */ +sa3f = 4.3456588745e+02, /* 0x43d9486f */ +sa4f = 6.4538726807e+02, /* 0x442158c9 */ +sa5f = 4.2900814819e+02, /* 0x43d6810b */ +sa6f = 1.0863500214e+02, /* 0x42d9451f */ +sa7f = 6.5702495575e+00, /* 0x40d23f7c */ +sa8f = -6.0424413532e-02, /* 0xbd777f97 */ /* * Coefficients for approximation to erfc in [1/.35,28] */ -rb0 = -9.8649431020e-03, /* 0xbc21a092 */ -rb1 = -7.9928326607e-01, /* 0xbf4c9dd4 */ -rb2 = -1.7757955551e+01, /* 0xc18e104b */ -rb3 = -1.6063638306e+02, /* 0xc320a2ea */ -rb4 = -6.3756646729e+02, /* 0xc41f6441 */ -rb5 = -1.0250950928e+03, /* 0xc480230b */ -rb6 = -4.8351919556e+02, /* 0xc3f1c275 */ -sb1 = 3.0338060379e+01, /* 0x41f2b459 */ -sb2 = 3.2579251099e+02, /* 0x43a2e571 */ -sb3 = 1.5367296143e+03, /* 0x44c01759 */ -sb4 = 3.1998581543e+03, /* 0x4547fdbb */ -sb5 = 2.5530502930e+03, /* 0x451f90ce */ -sb6 = 4.7452853394e+02, /* 0x43ed43a7 */ -sb7 = -2.2440952301e+01; /* 0xc1b38712 */ +rb0f = -9.8649431020e-03, /* 0xbc21a092 */ +rb1f = -7.9928326607e-01, /* 0xbf4c9dd4 */ +rb2f = -1.7757955551e+01, /* 0xc18e104b */ +rb3f = -1.6063638306e+02, /* 0xc320a2ea */ +rb4f = -6.3756646729e+02, /* 0xc41f6441 */ +rb5f = -1.0250950928e+03, /* 0xc480230b */ +rb6f = -4.8351919556e+02, /* 0xc3f1c275 */ +sb1f = 3.0338060379e+01, /* 0x41f2b459 */ +sb2f = 3.2579251099e+02, /* 0x43a2e571 */ +sb3f = 1.5367296143e+03, /* 0x44c01759 */ +sb4f = 3.1998581543e+03, /* 0x4547fdbb */ +sb5f = 2.5530502930e+03, /* 0x451f90ce */ +sb6f = 4.7452853394e+02, /* 0x43ed43a7 */ +sb7f = -2.2440952301e+01; /* 0xc1b38712 */ typedef union @@ -490,7 +485,7 @@ do { \ } while (0) -float gmx_erf(float x) +float gmx_erff(float x) { erf_int32_t hx,ix,i; float R,S,P,Q,s,y,z,r; @@ -510,7 +505,7 @@ float gmx_erf(float x) { /* erf(nan)=nan */ i = ((erf_u_int32_t)hx>>31)<<1; - return (float)(1-i)+one/x; /* erf(+-inf)=+-1 */ + return (float)(1-i)+onef/x; /* 
erf(+-inf)=+-1 */ } if(ix < 0x3f580000) @@ -520,41 +515,41 @@ float gmx_erf(float x) { /* |x|<2**-28 */ if (ix < 0x04000000) - return (float)0.125*((float)8.0*x+efx8*x); /*avoid underflow */ - return x + efx*x; + return (float)0.125*((float)8.0*x+efx8f*x); /*avoid underflow */ + return x + efxf*x; } z = x*x; - r = pp0+z*(pp1+z*(pp2+z*(pp3+z*pp4))); - s = one+z*(qq1+z*(qq2+z*(qq3+z*(qq4+z*qq5)))); + r = pp0f+z*(pp1f+z*(pp2f+z*(pp3f+z*pp4f))); + s = onef+z*(qq1f+z*(qq2f+z*(qq3f+z*(qq4f+z*qq5f)))); y = r/s; return x + x*y; } if(ix < 0x3fa00000) { /* 0.84375 <= |x| < 1.25 */ - s = fabs(x)-one; - P = pa0+s*(pa1+s*(pa2+s*(pa3+s*(pa4+s*(pa5+s*pa6))))); - Q = one+s*(qa1+s*(qa2+s*(qa3+s*(qa4+s*(qa5+s*qa6))))); - if(hx>=0) return erx + P/Q; else return -erx - P/Q; + s = fabs(x)-onef; + P = pa0f+s*(pa1f+s*(pa2f+s*(pa3f+s*(pa4f+s*(pa5f+s*pa6f))))); + Q = onef+s*(qa1f+s*(qa2f+s*(qa3f+s*(qa4f+s*(qa5f+s*qa6f))))); + if(hx>=0) return erxf + P/Q; else return -erxf - P/Q; } if (ix >= 0x40c00000) { /* inf>|x|>=6 */ - if(hx>=0) return one-tiny; else return tiny-one; + if(hx>=0) return onef-tinyf; else return tinyf-onef; } x = fabs(x); - s = one/(x*x); + s = onef/(x*x); if(ix< 0x4036DB6E) { /* |x| < 1/0.35 */ - R=ra0+s*(ra1+s*(ra2+s*(ra3+s*(ra4+s*(ra5+s*(ra6+s*ra7)))))); - S=one+s*(sa1+s*(sa2+s*(sa3+s*(sa4+s*(sa5+s*(sa6+s*(sa7+s*sa8))))))); + R=ra0f+s*(ra1f+s*(ra2f+s*(ra3f+s*(ra4f+s*(ra5f+s*(ra6f+s*ra7f)))))); + S=onef+s*(sa1f+s*(sa2f+s*(sa3f+s*(sa4f+s*(sa5f+s*(sa6f+s*(sa7f+s*sa8f))))))); } else { /* |x| >= 1/0.35 */ - R=rb0+s*(rb1+s*(rb2+s*(rb3+s*(rb4+s*(rb5+s*rb6))))); - S=one+s*(sb1+s*(sb2+s*(sb3+s*(sb4+s*(sb5+s*(sb6+s*sb7)))))); + R=rb0f+s*(rb1f+s*(rb2f+s*(rb3f+s*(rb4f+s*(rb5f+s*rb6f))))); + S=onef+s*(sb1f+s*(sb2f+s*(sb3f+s*(sb4f+s*(sb5f+s*(sb6f+s*sb7f)))))); } conv.f = x; @@ -562,10 +557,10 @@ float gmx_erf(float x) z = conv.f; r = exp(-z*z-(float)0.5625)*exp((z-x)*(z+x)+R/S); - if(hx>=0) return one-r/x; else return r/x-one; + if(hx>=0) return onef-r/x; else return r/x-onef; } -float gmx_erfc(float x) +float gmx_erfcf(float x) { erf_int32_t hx,ix; float R,S,P,Q,s,y,z,r; @@ -585,55 +580,55 @@ float gmx_erfc(float x) { /* erfc(nan)=nan */ /* erfc(+-inf)=0,2 */ - return (float)(((erf_u_int32_t)hx>>31)<<1)+one/x; + return (float)(((erf_u_int32_t)hx>>31)<<1)+onef/x; } if(ix < 0x3f580000) { /* |x|<0.84375 */ if(ix < 0x23800000) - return one-x; /* |x|<2**-56 */ + return onef-x; /* |x|<2**-56 */ z = x*x; - r = pp0+z*(pp1+z*(pp2+z*(pp3+z*pp4))); - s = one+z*(qq1+z*(qq2+z*(qq3+z*(qq4+z*qq5)))); + r = pp0f+z*(pp1f+z*(pp2f+z*(pp3f+z*pp4f))); + s = onef+z*(qq1f+z*(qq2f+z*(qq3f+z*(qq4f+z*qq5f)))); y = r/s; if(hx < 0x3e800000) { /* x<1/4 */ - return one-(x+x*y); + return onef-(x+x*y); } else { r = x*y; - r += (x-half); - return half - r ; + r += (x-halff); + return halff - r ; } } if(ix < 0x3fa00000) { /* 0.84375 <= |x| < 1.25 */ - s = fabs(x)-one; - P = pa0+s*(pa1+s*(pa2+s*(pa3+s*(pa4+s*(pa5+s*pa6))))); - Q = one+s*(qa1+s*(qa2+s*(qa3+s*(qa4+s*(qa5+s*qa6))))); + s = fabs(x)-onef; + P = pa0f+s*(pa1f+s*(pa2f+s*(pa3f+s*(pa4f+s*(pa5f+s*pa6f))))); + Q = onef+s*(qa1f+s*(qa2f+s*(qa3f+s*(qa4f+s*(qa5f+s*qa6f))))); if(hx>=0) { - z = one-erx; return z - P/Q; + z = onef-erxf; return z - P/Q; } else { - z = erx+P/Q; return one+z; + z = erxf+P/Q; return onef+z; } } if (ix < 0x41e00000) { /* |x|<28 */ x = fabs(x); - s = one/(x*x); + s = onef/(x*x); if(ix< 0x4036DB6D) { /* |x| < 1/.35 ~ 2.857143*/ - R=ra0+s*(ra1+s*(ra2+s*(ra3+s*(ra4+s*(ra5+s*(ra6+s*ra7)))))); - S=one+s*(sa1+s*(sa2+s*(sa3+s*(sa4+s*(sa5+s*(sa6+s*(sa7+s*sa8))))))); + 
R=ra0f+s*(ra1f+s*(ra2f+s*(ra3f+s*(ra4f+s*(ra5f+s*(ra6f+s*ra7f)))))); + S=onef+s*(sa1f+s*(sa2f+s*(sa3f+s*(sa4f+s*(sa5f+s*(sa6f+s*(sa7f+s*sa8f))))))); } else { /* |x| >= 1/.35 ~ 2.857143 */ - if(hx<0&&ix>=0x40c00000) return two-tiny;/* x < -6 */ - R=rb0+s*(rb1+s*(rb2+s*(rb3+s*(rb4+s*(rb5+s*rb6))))); - S=one+s*(sb1+s*(sb2+s*(sb3+s*(sb4+s*(sb5+s*(sb6+s*sb7)))))); + if(hx<0&&ix>=0x40c00000) return twof-tinyf;/* x < -6 */ + R=rb0f+s*(rb1f+s*(rb2f+s*(rb3f+s*(rb4f+s*(rb5f+s*rb6f))))); + S=onef+s*(sb1f+s*(sb2f+s*(sb3f+s*(sb4f+s*(sb5f+s*(sb6f+s*sb7f)))))); } conv.f = x; @@ -641,13 +636,12 @@ float gmx_erfc(float x) z = conv.f; r = exp(-z*z-(float)0.5625)*exp((z-x)*(z+x)+R/S); - if(hx>0) return r/x; else return two-r/x; + if(hx>0) return r/x; else return twof-r/x; } else { - if(hx>0) return tiny*tiny; else return two-tiny; + if(hx>0) return tinyf*tinyf; else return twof-tinyf; } } -#endif gmx_bool gmx_isfinite(real x) { diff --git a/src/kernel/gmx_gpu_utils/gmx_gpu_utils.h b/src/gmxlib/md_logging.c similarity index 57% copy from src/kernel/gmx_gpu_utils/gmx_gpu_utils.h copy to src/gmxlib/md_logging.c index 76070804ea..a7a87fe5eb 100644 --- a/src/kernel/gmx_gpu_utils/gmx_gpu_utils.h +++ b/src/gmxlib/md_logging.c @@ -7,9 +7,10 @@ * * GROningen MAchine for Chemical Simulations * + * VERSION 3.2.0 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. * Copyright (c) 1991-2000, University of Groningen, The Netherlands. - * Copyright (c) 2001-2010, The GROMACS development team, + * Copyright (c) 2001-2004, The GROMACS development team, * check out http://www.gromacs.org for more information. * This program is free software; you can redistribute it and/or @@ -32,25 +33,62 @@ * And Hey: * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon */ +#ifdef HAVE_CONFIG_H +#include +#endif -#ifndef _GMX_GPU_UTILS_H_ -#define _GMX_GPU_UTILS_H_ +#include +#include +#include "types/commrec.h" +#include "md_logging.h" -#ifndef __cplusplus -extern "C" { -#endif -int do_quick_memtest(int /*dev_id*/); +void md_print_info(const t_commrec *cr, FILE *fplog, + const char *fmt, ...) +{ + va_list ap; -int do_full_memtest(int /*dev_id*/); + if (cr == NULL || SIMMASTER(cr)) + { + va_start(ap,fmt); -int do_timed_memtest(int /*dev_id*/, int /*time_limit*/); + vfprintf(stderr,fmt,ap); + + va_end(ap); + } + if (fplog != NULL) + { + va_start(ap,fmt); -int is_supported_cuda_gpu(int /*dev_id*/, char* /*gpu_name*/); + vfprintf(fplog,fmt,ap); -#ifndef __cplusplus -} /* extern "C" */ -#endif + va_end(ap); + } +} + +void md_print_warn(const t_commrec *cr, FILE *fplog, + const char *fmt, ...) 
+{ + va_list ap; + + if (cr == NULL || SIMMASTER(cr)) + { + va_start(ap,fmt); + + fprintf(stderr,"\n"); + vfprintf(stderr,fmt,ap); + fprintf(stderr,"\n"); + + va_end(ap); + } + if (fplog != NULL) + { + va_start(ap,fmt); -#endif // _GMX_GPU_UTILS_H_ + fprintf(fplog,"\n"); + vfprintf(fplog,fmt,ap); + fprintf(fplog,"\n"); + va_end(ap); + } +} diff --git a/src/gmxlib/mtop_util.c b/src/gmxlib/mtop_util.c index adc441209e..7ffce20878 100644 --- a/src/gmxlib/mtop_util.c +++ b/src/gmxlib/mtop_util.c @@ -89,85 +89,250 @@ int ncg_mtop(const gmx_mtop_t *mtop) return ncg; } -void gmx_mtop_atomnr_to_atom(const gmx_mtop_t *mtop,int atnr_global, +void gmx_mtop_remove_chargegroups(gmx_mtop_t *mtop) +{ + int mt; + t_block *cgs; + int i; + + for(mt=0; mtnmoltype; mt++) + { + cgs = &mtop->moltype[mt].cgs; + if (cgs->nr < mtop->moltype[mt].atoms.nr) + { + cgs->nr = mtop->moltype[mt].atoms.nr; + srenew(cgs->index,cgs->nr+1); + for(i=0; inr+1; i++) + { + cgs->index[i] = i; + } + } + } +} + + +typedef struct +{ + int a_start; + int a_end; + int na_mol; +} mb_at_t; + +typedef struct gmx_mtop_atomlookup +{ + const gmx_mtop_t *mtop; + int nmb; + int mb_start; + mb_at_t *mba; +} t_gmx_mtop_atomlookup; + + +gmx_mtop_atomlookup_t +gmx_mtop_atomlookup_init(const gmx_mtop_t *mtop) +{ + t_gmx_mtop_atomlookup *alook; + int mb; + int a_start,a_end,na,na_start=-1; + + snew(alook,1); + + alook->mtop = mtop; + alook->nmb = mtop->nmolblock; + alook->mb_start = 0; + snew(alook->mba,alook->nmb); + + a_start = 0; + for(mb=0; mbnmolblock; mb++) + { + na = mtop->molblock[mb].nmol*mtop->molblock[mb].natoms_mol; + a_end = a_start + na; + + alook->mba[mb].a_start = a_start; + alook->mba[mb].a_end = a_end; + alook->mba[mb].na_mol = mtop->molblock[mb].natoms_mol; + + /* We start the binary search with the largest block */ + if (mb == 0 || na > na_start) + { + alook->mb_start = mb; + na_start = na; + } + + a_start = a_end; + } + + return alook; +} + +gmx_mtop_atomlookup_t +gmx_mtop_atomlookup_settle_init(const gmx_mtop_t *mtop) +{ + t_gmx_mtop_atomlookup *alook; + int mb; + int na,na_start=-1; + + alook = gmx_mtop_atomlookup_init(mtop); + + /* Check if the starting molblock has settle */ + if (mtop->moltype[mtop->molblock[alook->mb_start].type].ilist[F_SETTLE].nr == 0) + { + /* Search the largest molblock with settle */ + alook->mb_start = -1; + for(mb=0; mbnmolblock; mb++) + { + if (mtop->moltype[mtop->molblock[mb].type].ilist[F_SETTLE].nr > 0) + { + na = alook->mba[mb].a_end - alook->mba[mb].a_start; + if (alook->mb_start == -1 || na > na_start) + { + alook->mb_start = mb; + na_start = na; + } + } + } + + if (alook->mb_start == -1) + { + gmx_incons("gmx_mtop_atomlookup_settle_init called without settles"); + } + } + + return alook; +} + +void +gmx_mtop_atomlookup_destroy(gmx_mtop_atomlookup_t alook) +{ + sfree(alook->mba); + sfree(alook); +} + +void gmx_mtop_atomnr_to_atom(const gmx_mtop_atomlookup_t alook, + int atnr_global, t_atom **atom) { - int mb,a_start,a_end,atnr_mol; + int mb0,mb1,mb; + int a_start,atnr_mol; +#ifdef DEBUG_MTOP if (atnr_global < 0 || atnr_global >= mtop->natoms) { - gmx_fatal(FARGS,"gmx_mtop_atomnr_to_atom was called with atnr_global=%d which is not in the atom range of this system (%d-%d)", + gmx_fatal(FARGS,"gmx_mtop_atomnr_to_moltype was called with atnr_global=%d which is not in the atom range of this system (%d-%d)", atnr_global,0,mtop->natoms-1); } - - mb = -1; - a_end = 0; - do +#endif + + mb0 = -1; + mb1 = alook->nmb; + mb = alook->mb_start; + + while (TRUE) { - mb++; - a_start = a_end; - a_end = a_start 
+ mtop->molblock[mb].nmol*mtop->molblock[mb].natoms_mol; + a_start = alook->mba[mb].a_start; + if (atnr_global < a_start) + { + mb1 = mb; + } + else if (atnr_global >= alook->mba[mb].a_end) + { + mb0 = mb; + } + else + { + break; + } + mb = ((mb0 + mb1 + 1)>>1); } - while (atnr_global >= a_end); - atnr_mol = (atnr_global - a_start) % mtop->molblock[mb].natoms_mol; + atnr_mol = (atnr_global - a_start) % alook->mba[mb].na_mol; - *atom = &mtop->moltype[mtop->molblock[mb].type].atoms.atom[atnr_mol]; + *atom = &alook->mtop->moltype[alook->mtop->molblock[mb].type].atoms.atom[atnr_mol]; } -void gmx_mtop_atomnr_to_ilist(const gmx_mtop_t *mtop,int atnr_global, +void gmx_mtop_atomnr_to_ilist(const gmx_mtop_atomlookup_t alook, + int atnr_global, t_ilist **ilist_mol,int *atnr_offset) { - int mb,a_start,a_end,atnr_local; + int mb0,mb1,mb; + int a_start,atnr_local; +#ifdef DEBUG_MTOP if (atnr_global < 0 || atnr_global >= mtop->natoms) { gmx_fatal(FARGS,"gmx_mtop_atomnr_to_moltype was called with atnr_global=%d which is not in the atom range of this system (%d-%d)", atnr_global,0,mtop->natoms-1); } - - mb = -1; - a_end = 0; - do +#endif + + mb0 = -1; + mb1 = alook->nmb; + mb = alook->mb_start; + + while (TRUE) { - mb++; - a_start = a_end; - a_end = a_start + mtop->molblock[mb].nmol*mtop->molblock[mb].natoms_mol; + a_start = alook->mba[mb].a_start; + if (atnr_global < a_start) + { + mb1 = mb; + } + else if (atnr_global >= alook->mba[mb].a_end) + { + mb0 = mb; + } + else + { + break; + } + mb = ((mb0 + mb1 + 1)>>1); } - while (atnr_global >= a_end); - *ilist_mol = mtop->moltype[mtop->molblock[mb].type].ilist; + *ilist_mol = alook->mtop->moltype[alook->mtop->molblock[mb].type].ilist; - atnr_local = (atnr_global - a_start) % mtop->molblock[mb].natoms_mol; + atnr_local = (atnr_global - a_start) % alook->mba[mb].na_mol; *atnr_offset = atnr_global - atnr_local; } -void gmx_mtop_atomnr_to_molblock_ind(const gmx_mtop_t *mtop,int atnr_global, +void gmx_mtop_atomnr_to_molblock_ind(const gmx_mtop_atomlookup_t alook, + int atnr_global, int *molb,int *molnr,int *atnr_mol) { - int mb,a_start,a_end; - t_atoms *atoms; + int mb0,mb1,mb; + int a_start; +#ifdef DEBUG_MTOP if (atnr_global < 0 || atnr_global >= mtop->natoms) { gmx_fatal(FARGS,"gmx_mtop_atomnr_to_moltype was called with atnr_global=%d which is not in the atom range of this system (%d-%d)", atnr_global,0,mtop->natoms-1); } - - mb = -1; - a_end = 0; - do +#endif + + mb0 = -1; + mb1 = alook->nmb; + mb = alook->mb_start; + + while (TRUE) { - mb++; - a_start = a_end; - a_end = a_start + mtop->molblock[mb].nmol*mtop->molblock[mb].natoms_mol; + a_start = alook->mba[mb].a_start; + if (atnr_global < a_start) + { + mb1 = mb; + } + else if (atnr_global >= alook->mba[mb].a_end) + { + mb0 = mb; + } + else + { + break; + } + mb = ((mb0 + mb1 + 1)>>1); } - while (atnr_global >= a_end); *molb = mb; - *molnr = (atnr_global - a_start) / mtop->molblock[mb].natoms_mol; - *atnr_mol = atnr_global - a_start - (*molnr)*mtop->molblock[mb].natoms_mol; + *molnr = (atnr_global - a_start) / alook->mba[mb].na_mol; + *atnr_mol = atnr_global - a_start - (*molnr)*alook->mba[mb].na_mol; } void gmx_mtop_atominfo_global(const gmx_mtop_t *mtop,int atnr_global, diff --git a/src/gmxlib/names.c b/src/gmxlib/names.c index c67faa60ea..f759537ba2 100644 --- a/src/gmxlib/names.c +++ b/src/gmxlib/names.c @@ -70,6 +70,10 @@ const char *ptype_str[eptNR+1] = { "Atom", "Nucleus", "Shell", "Bond", "VSite", NULL }; +const char *ecutscheme_names[ecutsNR+1] = { + "Group", "Verlet", NULL +}; + const char 
*eel_names[eelNR+1] = { "Cut-off", "Reaction-Field", "Generalized-Reaction-Field", "PME", "Ewald", "P3M-AD", "Poisson", "Switch", "Shift", "User", diff --git a/src/gmxlib/network.c b/src/gmxlib/network.c index 08f573b766..4c4afaec57 100644 --- a/src/gmxlib/network.c +++ b/src/gmxlib/network.c @@ -1,4 +1,4 @@ -/* +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- * * This source code is part of * @@ -260,6 +260,13 @@ int gmx_hostname_num() #ifndef GMX_MPI return 0; #else +#ifdef GMX_THREAD_MPI + /* thread-MPI currently puts the thread number in the process name, + * we might want to change this, as this is inconsistent with what + * most MPI implementations would do when running on a single node. + */ + return 0; +#else int resultlen,hostnum,i,j; char mpi_hostname[MPI_MAX_PROCESSOR_NAME],hostnum_str[MPI_MAX_PROCESSOR_NAME]; @@ -290,48 +297,48 @@ int gmx_hostname_num() } return hostnum; #endif +#endif } void gmx_setup_nodecomm(FILE *fplog,t_commrec *cr) { - gmx_nodecomm_t *nc; - int n,rank,hostnum,ng,ni; - - /* Many MPI implementations do not optimize MPI_Allreduce - * (and probably also other global communication calls) - * for multi-core nodes connected by a network. - * We can optimize such communication by using one MPI call - * within each node and one between the nodes. - * For MVAPICH2 and Intel MPI this reduces the time for - * the global_stat communication by 25% - * for 2x2-core 3 GHz Woodcrest connected by mixed DDR/SDR Infiniband. - * B. Hess, November 2007 - */ + gmx_nodecomm_t *nc; + int n,rank,hostnum,ng,ni; + + /* Many MPI implementations do not optimize MPI_Allreduce + * (and probably also other global communication calls) + * for multi-core nodes connected by a network. + * We can optimize such communication by using one MPI call + * within each node and one between the nodes. + * For MVAPICH2 and Intel MPI this reduces the time for + * the global_stat communication by 25% + * for 2x2-core 3 GHz Woodcrest connected by mixed DDR/SDR Infiniband. + * B. Hess, November 2007 + */ - nc = &cr->nc; + nc = &cr->nc; - nc->bUse = FALSE; + nc->bUse = FALSE; #ifndef GMX_THREAD_MPI - if (getenv("GMX_NO_NODECOMM") == NULL) { #ifdef GMX_MPI MPI_Comm_size(cr->mpi_comm_mygroup,&n); MPI_Comm_rank(cr->mpi_comm_mygroup,&rank); hostnum = gmx_hostname_num(); - if (debug) { - fprintf(debug, - "In gmx_setup_nodecomm: splitting communicator of size %d\n", - n); + if (debug) + { + fprintf(debug,"In gmx_setup_nodecomm: splitting communicator of size %d\n",n); } /* The intra-node communicator, split on node number */ MPI_Comm_split(cr->mpi_comm_mygroup,hostnum,rank,&nc->comm_intra); MPI_Comm_rank(nc->comm_intra,&nc->rank_intra); - if (debug) { - fprintf(debug,"In gmx_setup_nodecomm: node rank %d rank_intra %d\n", - rank,nc->rank_intra); + if (debug) + { + fprintf(debug,"In gmx_setup_nodecomm: node rank %d rank_intra %d\n", + rank,nc->rank_intra); } /* The inter-node communicator, split on rank_intra. 
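Editorial aside on this hunk: the node-communicator setup is a textbook two-level split. The same pattern in isolation (illustrative only; the patched code above additionally checks that the split actually produced useful groups before enabling two-step summing):

    #include <mpi.h>

    /* Group ranks by host, then connect the groups: all ranks on one host
     * share 'intra'; equal intra-ranks across hosts share 'inter'. */
    static void split_two_level(MPI_Comm comm, int hostnum,
                                MPI_Comm *intra, MPI_Comm *inter)
    {
        int rank, rank_intra;

        MPI_Comm_rank(comm, &rank);
        MPI_Comm_split(comm, hostnum,    rank, intra);
        MPI_Comm_rank(*intra, &rank_intra);
        MPI_Comm_split(comm, rank_intra, rank, inter);
    }

A global sum then becomes a sum inside each node followed by a sum over the node representatives, which is where the 25% global_stat saving mentioned above comes from.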
* We actually only need the one for rank=0, @@ -341,26 +348,134 @@ void gmx_setup_nodecomm(FILE *fplog,t_commrec *cr) /* Check if this really created two step communication */ MPI_Comm_size(nc->comm_inter,&ng); MPI_Comm_size(nc->comm_intra,&ni); - if (debug) { - fprintf(debug,"In gmx_setup_nodecomm: groups %d, my group size %d\n", - ng,ni); + if (debug) + { + fprintf(debug,"In gmx_setup_nodecomm: groups %d, my group size %d\n", + ng,ni); } - if ((ng > 1 && ng < n) || (ni > 1 && ni < n)) { - nc->bUse = TRUE; - if (fplog) - fprintf(fplog,"Using two step summing over %d groups of on average %.1f processes\n\n",ng,(real)n/(real)ng); - if (nc->rank_intra > 0) - MPI_Comm_free(&nc->comm_inter); - } else { - /* One group or all processes in a separate group, use normal summing */ - MPI_Comm_free(&nc->comm_inter); - MPI_Comm_free(&nc->comm_intra); + + if (getenv("GMX_NO_NODECOMM") == NULL && + ((ng > 1 && ng < n) || (ni > 1 && ni < n))) + { + nc->bUse = TRUE; + if (fplog) + { + fprintf(fplog,"Using two step summing over %d groups of on average %.1f processes\n\n", + ng,(real)n/(real)ng); + } + if (nc->rank_intra > 0) + { + MPI_Comm_free(&nc->comm_inter); + } + } + else + { + /* One group or all processes in a separate group, use normal summing */ + MPI_Comm_free(&nc->comm_inter); + MPI_Comm_free(&nc->comm_intra); + if (debug) + { + fprintf(debug,"In gmx_setup_nodecomm: not using separate inter- and intra-node communicators.\n"); + } } #endif - } +#else + /* tMPI runs only on a single node so just use the nodeid */ + nc->rank_intra = cr->nodeid; #endif } +void gmx_init_intra_counters(t_commrec *cr) +{ + /* counters for PP+PME and PP-only processes on my node */ + int nnodes, nnodes_pp, id_mynode=-1, id_mynode_group=-1, nproc_mynode, nproc_mynode_pp; +#if defined GMX_MPI && !defined GMX_THREAD_MPI + int i, mynum, *num, *num_s, *num_pp, *num_pp_s; +#endif + + nnodes = cr->nnodes; + nnodes_pp = nnodes - cr->npmenodes; + +#if defined GMX_MPI && !defined GMX_THREAD_MPI + /* We have MPI and can expect to have different compute nodes */ + mynum = gmx_hostname_num(); + + /* We can't rely on MPI_IN_PLACE, so we need send and receive buffers */ + snew(num, nnodes); + snew(num_s, nnodes); + snew(num_pp, nnodes_pp); + snew(num_pp_s, nnodes_pp); + + num_s[cr->sim_nodeid] = mynum; + if (cr->duty & DUTY_PP) + { + num_pp_s[cr->nodeid] = mynum; + } + + MPI_Allreduce(num_s, num, nnodes, MPI_INT, MPI_SUM, cr->mpi_comm_mysim); + MPI_Allreduce(num_pp_s, num_pp, nnodes_pp, MPI_INT, MPI_SUM, cr->mpi_comm_mygroup); + + id_mynode = 0; + id_mynode_group = 0; + nproc_mynode = 0; + nproc_mynode_pp = 0; + for(i=0; i<nnodes; i++) + { + if (num[i] == mynum) + { + nproc_mynode++; + if (i < cr->sim_nodeid) + { + id_mynode++; + } + if (i < cr->nodeid) + { + id_mynode_group++; + } + } + } + for(i=0; i<nnodes_pp; i++) + { + if (num_pp[i] == mynum) + { + nproc_mynode_pp++; + } + } + sfree(num); + sfree(num_s); + sfree(num_pp); + sfree(num_pp_s); +#else + /* Serial or thread-MPI code, we are running within one node */ + id_mynode = cr->sim_nodeid; + id_mynode_group = cr->nodeid; + nproc_mynode = cr->nnodes; + nproc_mynode_pp = cr->nnodes - cr->npmenodes; +#endif + + if (debug) + { + char sbuf[STRLEN]; + if (cr->duty & DUTY_PP && cr->duty & DUTY_PME) + { + sprintf(sbuf, "PP+PME"); + } + else + { + sprintf(sbuf, "%s", cr->duty & DUTY_PP ?
"PP" : "PME"); + } + fprintf(debug, "On %3s node %d: nodeid_intra=%d, nodeid_group_intra=%d, " + "nnodes_intra=%d, nnodes_pp_intra=%d\n", sbuf, cr->sim_nodeid, + id_mynode, id_mynode_group, nproc_mynode, nproc_mynode_pp); + } + + cr->nodeid_intra = id_mynode; + cr->nodeid_group_intra = id_mynode_group; + cr->nnodes_intra = nproc_mynode; + cr->nnodes_pp_intra = nproc_mynode_pp; +} + + void gmx_barrier(const t_commrec *cr) { #ifndef GMX_MPI diff --git a/src/gmxlib/nonbonded/nonbonded.c b/src/gmxlib/nonbonded/nonbonded.c index 39854e15e9..d0120c03fa 100644 --- a/src/gmxlib/nonbonded/nonbonded.c +++ b/src/gmxlib/nonbonded/nonbonded.c @@ -216,7 +216,7 @@ gmx_setup_kernels(FILE *fplog,t_forcerec *fr,gmx_bool bGenericKernelOnly) nb_kernel_setup(fplog,nb_kernel_list); - if(fr->use_acceleration==FALSE) + if(fr->use_cpu_acceleration==FALSE) { return; } @@ -312,7 +312,7 @@ void do_nonbonded(t_commrec *cr,t_forcerec *fr, { #if 0 && defined (GMX_X86_SSE2) # ifdef GMX_DOUBLE - if(fr->use_acceleration) + if(fr->use_cpu_acceleration) { nb_kernel_allvsallgb_sse2_double(fr,mdatoms,excl,x[0],f[0],egcoul,egnb,egpol, &outeriter,&inneriter,&fr->AllvsAll_work); @@ -323,7 +323,7 @@ void do_nonbonded(t_commrec *cr,t_forcerec *fr, &outeriter,&inneriter,&fr->AllvsAll_work); } # else /* not double */ - if(fr->use_acceleration) + if(fr->use_cpu_acceleration) { nb_kernel_allvsallgb_sse2_single(fr,mdatoms,excl,x[0],f[0],egcoul,egnb,egpol, &outeriter,&inneriter,&fr->AllvsAll_work); @@ -344,7 +344,7 @@ void do_nonbonded(t_commrec *cr,t_forcerec *fr, { #if 0 && defined (GMX_X86_SSE2) # ifdef GMX_DOUBLE - if(fr->use_acceleration) + if(fr->use_cpu_acceleration) { nb_kernel_allvsall_sse2_double(fr,mdatoms,excl,x[0],f[0],egcoul,egnb, &outeriter,&inneriter,&fr->AllvsAll_work); @@ -356,7 +356,7 @@ void do_nonbonded(t_commrec *cr,t_forcerec *fr, } # else /* not double */ - if(fr->use_acceleration) + if(fr->use_cpu_acceleration) { nb_kernel_allvsall_sse2_single(fr,mdatoms,excl,x[0],f[0],egcoul,egnb, &outeriter,&inneriter,&fr->AllvsAll_work); diff --git a/src/gmxlib/nrnb.c b/src/gmxlib/nrnb.c index fba9cb76ea..5bc7155ae7 100644 --- a/src/gmxlib/nrnb.c +++ b/src/gmxlib/nrnb.c @@ -37,6 +37,7 @@ #endif #include +#include "types/commrec.h" #include "sysstuff.h" #include "gmx_fatal.h" #include "names.h" @@ -189,6 +190,27 @@ static const t_nrnb_data nbdata[eNRNB] = { { "All-vs-All, Coul + LJ", 38 }, { "All-vs-All, GB + LJ", 61 }, { "Outer nonbonded loop", 10 }, + { "Pair Search distance check", 9 }, /* nbnxn pair dist. check */ + /* nbnxn kernel flops are based on inner-loops without exclusion checks. + * Plain Coulomb runs through the RF kernels, except with CUDA. + * invsqrt is counted as 6 flops: 1 for _mm_rsqt_ps + 5 for iteration. + * The flops are equal for plain-C, x86 SIMD and CUDA, except for: + * - plain-C kernel uses one flop more for Coulomb-only (F) than listed + * - x86 SIMD LJ geom-comb.rule kernels (fastest) use 2 more flops + * - x86 SIMD LJ LB-comb.rule kernels (fast) use 3 (8 for F+E) more flops + * - GPU always does exclusions, which requires 2-4 flops, but as invsqrt + * is always counted as 6 flops, this roughly compensates. 
+ */ + { "LJ + Coulomb RF (F)", 38 }, /* nbnxn kernel LJ+RF, no ener */ + { "LJ + Coulomb RF (F+E)", 54 }, + { "LJ + Coulomb tabulated (F)", 41 }, /* nbnxn kernel LJ+tab, no en */ + { "LJ + Coulomb tabulated (F+E)", 59 }, + { "LJ (F)", 33 }, /* nbnxn kernel LJ, no ener */ + { "LJ (F+E)", 43 }, + { "Coulomb RF (F)", 31 }, /* nbnxn kernel RF, no ener */ + { "Coulomb RF (F+E)", 36 }, + { "Coulomb tabulated (F)", 34 }, /* nbnxn kernel tab, no ener */ + { "Coulomb tabulated (F+E)", 41 }, { "1,4 nonbonded interactions", 90 }, { "Born radii (Still)", 47 }, { "Born radii (HCT/OBC)", 183 }, @@ -365,50 +387,64 @@ void print_flop(FILE *out,t_nrnb *nrnb,double *nbfs,double *mflop) void print_perf(FILE *out,double nodetime,double realtime,int nprocs, gmx_large_int_t nsteps,real delta_t, - double nbfs,double mflop) + double nbfs,double mflop, + int omp_nth_pp) { real runtime; fprintf(out,"\n"); - if (nodetime == 0.0) { - fprintf(out,"nodetime = 0! Infinite Giga flopses!\n"); - } -#ifdef GMX_OPENMM - nodetime = realtime; - fprintf(out,"\tOpenMM run - timing based on wallclock.\n\n"); -#else - if (nprocs > 1) + if (realtime > 0) { - nodetime = realtime; - fprintf(out,"\tParallel run - timing based on wallclock.\n\n"); - } -#endif - - if ((nodetime > 0) && (realtime > 0)) { - fprintf(out,"%12s %10s %10s %8s\n","","NODE (s)","Real (s)","(%)"); - fprintf(out,"%12s %10.3f %10.3f %8.1f\n","Time:", + fprintf(out,"%12s %12s %12s %10s\n","","Core t (s)","Wall t (s)","(%)"); + fprintf(out,"%12s %12.3f %12.3f %10.1f\n","Time:", nodetime, realtime, 100.0*nodetime/realtime); - if (nodetime > 60) { - fprintf(out,"%12s %10s","",""); - pr_difftime(out,nodetime); + /* only print day-hour-sec format if realtime is more than 30 min */ + if (realtime > 30*60) + { + fprintf(out,"%12s %12s","",""); + pr_difftime(out,realtime); } - if (delta_t > 0) { - mflop = mflop/nodetime; + if (delta_t > 0) + { + mflop = mflop/realtime; runtime = nsteps*delta_t; - fprintf(out,"%12s %10s %10s %10s %10s\n", - "","(Mnbf/s)",(mflop > 1000) ? "(GFlops)" : "(MFlops)", - "(ns/day)","(hour/ns)"); - fprintf(out,"%12s %10.3f %10.3f %10.3f %10.3f\n","Performance:", - nbfs/nodetime,(mflop > 1000) ? (mflop/1000) : mflop, - runtime*24*3.6/nodetime,1000*nodetime/(3600*runtime)); - } else { - fprintf(out,"%12s %10s %10s %14s\n", - "","(Mnbf/s)",(mflop > 1000) ? "(GFlops)" : "(MFlops)", - "(steps/hour)"); - fprintf(out,"%12s %10.3f %10.3f %14.1f\n","Performance:", - nbfs/nodetime,(mflop > 1000) ? (mflop/1000) : mflop, - nsteps*3600.0/nodetime); + + if (getenv("GMX_DETAILED_PERF_STATS") == NULL) + { + fprintf(out,"%12s %12s %12s\n", + "","(ns/day)","(hour/ns)"); + fprintf(out,"%12s %12.3f %12.3f\n","Performance:", + runtime*24*3.6/realtime,1000*realtime/(3600*runtime)); + } + else + { + fprintf(out,"%12s %12s %12s %12s %12s\n", + "","(Mnbf/s)",(mflop > 1000) ? "(GFlops)" : "(MFlops)", + "(ns/day)","(hour/ns)"); + fprintf(out,"%12s %12.3f %12.3f %12.3f %12.3f\n","Performance:", + nbfs/realtime,(mflop > 1000) ? (mflop/1000) : mflop, + runtime*24*3.6/realtime,1000*realtime/(3600*runtime)); + } + } + else + { + if (getenv("GMX_DETAILED_PERF_STATS") == NULL) + { + fprintf(out,"%12s %14s\n", + "","(steps/hour)"); + fprintf(out,"%12s %14.1f\n","Performance:", + nsteps*3600.0/realtime); + } + else + { + fprintf(out,"%12s %12s %12s %14s\n", + "","(Mnbf/s)",(mflop > 1000) ? "(GFlops)" : "(MFlops)", + "(steps/hour)"); + fprintf(out,"%12s %12.3f %12.3f %14.1f\n","Performance:", + nbfs/realtime,(mflop > 1000) ? 
(mflop/1000) : mflop, + nsteps*3600.0/realtime); + } } } } diff --git a/src/gmxlib/pbc.c b/src/gmxlib/pbc.c index e9cdd3fb9c..1a71f6ffea 100644 --- a/src/gmxlib/pbc.c +++ b/src/gmxlib/pbc.c @@ -48,6 +48,7 @@ #include "txtdump.h" #include "gmx_fatal.h" #include "names.h" +#include "gmx_omp_nthreads.h" /* Skip 0 so we have more chance of detecting if we forgot to call set_pbc. */ enum { epbcdxRECTANGULAR=1, epbcdxTRICLINIC, @@ -1190,26 +1191,78 @@ int *compact_unitcell_edges() return edge; } -void put_atom_in_box(matrix box,rvec x) +void put_atoms_in_box_omp(int ePBC,matrix box,int natoms,rvec x[]) { - int i,m,d; - - for(m=DIM-1; m>=0; m--) { - while (x[m] < 0) - for(d=0; d<=m; d++) - x[d] += box[m][d]; - while (x[m] >= box[m][m]) - for(d=0; d<=m; d++) - x[d] -= box[m][d]; + int t, nth; + nth = gmx_omp_nthreads_get(emntDefault); + +#pragma omp parallel for num_threads(nth) schedule(static) + for(t=0; t=0; m--) { + while (x[i][m] < 0) + { + for(d=0; d<=m; d++) + { + x[i][d] += box[m][d]; + } + } + while (x[i][m] >= box[m][m]) + { + for(d=0; d<=m; d++) + { + x[i][d] -= box[m][d]; + } + } + } + } + } + else + { + for(i=0; i= box[d][d]) + { + x[i][d] -= box[d][d]; + } + } + } + } } void put_atoms_in_triclinic_unitcell(int ecenter,matrix box, diff --git a/src/gmxlib/smalloc.c b/src/gmxlib/smalloc.c index 77069ab5da..8a78196c2c 100644 --- a/src/gmxlib/smalloc.c +++ b/src/gmxlib/smalloc.c @@ -284,7 +284,7 @@ size_t memavail(void) * on systems that lack posix_memalign() and memalign() when * freeing memory that needed to be adjusted to achieve * the necessary alignment. */ -void *save_calloc_aligned(const char *name,const char *file,int line, +void *save_malloc_aligned(const char *name,const char *file,int line, unsigned nelem,size_t elsize,size_t alignment) { void **aligned=NULL; @@ -345,11 +345,21 @@ void *save_calloc_aligned(const char *name,const char *file,int line, pointer we're going to return */ aligned[-1] = malloced; #endif - memset(aligned, 0,(size_t) (nelem * elsize)); } return (void*)aligned; } +void *save_calloc_aligned(const char *name,const char *file,int line, + unsigned nelem,size_t elsize,size_t alignment) +{ + void *aligned = save_malloc_aligned(name, file, line, nelem, elsize, alignment); + if (aligned != NULL) + { + memset(aligned, 0, (size_t)(nelem * elsize)); + } + return aligned; +} + /* This routine can NOT be called with any pointer */ void save_free_aligned(const char *name,const char *file,int line,void *ptr) { diff --git a/src/gmxlib/tpxio.c b/src/gmxlib/tpxio.c index 29bf75167a..96df62a493 100644 --- a/src/gmxlib/tpxio.c +++ b/src/gmxlib/tpxio.c @@ -73,12 +73,13 @@ static const char *tpx_tag = TPX_TAG_RELEASE; /* This number should be increased whenever the file format changes! */ -static const int tpx_version = 79; +static const int tpx_version = 80; /* This number should only be increased when you edit the TOPOLOGY section - * of the tpx format. This way we can maintain forward compatibility too - * for all analysis tools and/or external programs that only need to - * know the atom/residue names, charges, and bond connectivity. + * or the HEADER of the tpx format. + * This way we can maintain forward compatibility too for all analysis tools + * and/or external programs that only need to know the atom/residue names, + * charges, and bond connectivity. 
* * It first appeared in tpx version 26, when I also moved the inputrecord * to the end of the tpx file, so we can just skip it if we only @@ -615,6 +616,14 @@ static void do_inputrec(t_fileio *fio, t_inputrec *ir,gmx_bool bRead, } } } + if (file_version >= 80) + { + gmx_fio_do_int(fio,ir->cutoff_scheme); + } + else + { + ir->cutoff_scheme = ecutsGROUP; + } gmx_fio_do_int(fio,ir->ns_type); gmx_fio_do_int(fio,ir->nstlist); gmx_fio_do_int(fio,ir->ndelta); @@ -669,6 +678,11 @@ static void do_inputrec(t_fileio *fio, t_inputrec *ir,gmx_bool bRead, } if(file_version < 18) gmx_fio_do_int(fio,idum); + if (file_version >= 80) { + gmx_fio_do_real(fio,ir->verletbuf_drift); + } else { + ir->verletbuf_drift = 0; + } gmx_fio_do_real(fio,ir->rlist); if (file_version >= 67) { gmx_fio_do_real(fio,ir->rlistlong); @@ -756,7 +770,15 @@ static void do_inputrec(t_fileio *fio, t_inputrec *ir,gmx_bool bRead, ir->sa_surface_tension = 2.092; } - + + if (file_version >= 80) + { + gmx_fio_do_real(fio,ir->fourier_spacing); + } + else + { + ir->fourier_spacing = 0.0; + } gmx_fio_do_int(fio,ir->nkx); gmx_fio_do_int(fio,ir->nky); gmx_fio_do_int(fio,ir->nkz); @@ -2357,8 +2379,27 @@ static void do_tpxheader(t_fileio *fio,gmx_bool bRead,t_tpxheader *tpx, /* Check versions! */ gmx_fio_do_int(fio,fver); + + /* This is for backward compatibility with development versions 77-79 + * where the tag was, mistakenly, placed before the generation, + * which would cause a segv instead of a proper error message + * when reading the topology only from tpx with <77 code. + */ + if (fver >= 77 && fver <= 79) + { + gmx_fio_do_string(fio,file_tag); + } - if (fver >= 77) + if (fver >= 26) + { + gmx_fio_do_int(fio,fgen); + } + else + { + fgen = 0; + } + + if (fver >= 80) { gmx_fio_do_string(fio,file_tag); } @@ -2378,7 +2419,7 @@ static void do_tpxheader(t_fileio *fio,gmx_bool bRead,t_tpxheader *tpx, /* We only support reading tpx files with the same tag as the code * or tpx files with the release tag and with lower version number. 
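Editorial aside before the tag/version check that follows: the parentheses in that condition are load-bearing, because unary ! binds tighter than ==. A stand-alone illustration (not part of the patch):

    /* (!strcmp(a, b) == 0) parses as ((!strcmp(a, b)) == 0), which is
     * true exactly when the strings DIFFER -- not a negated comparison. */
    int tag_is_release = (strcmp(file_tag, TPX_TAG_RELEASE) == 0);

    if (!(tag_is_release && fver < tpx_version))
    {
        /* reject: neither the matching tag nor an older release file */
    }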
*/ - if (!(strcmp(file_tag,TPX_TAG_RELEASE) == 0 && fver < tpx_version)) + if (!(strcmp(file_tag,TPX_TAG_RELEASE) == 0 && fver < tpx_version)) { gmx_fatal(FARGS,"tpx tag/version mismatch: reading tpx file (%s) version %d, tag '%s' with program for tpx version %d, tag '%s'", gmx_fio_getname(fio),fver,file_tag, @@ -2387,15 +2428,6 @@ static void do_tpxheader(t_fileio *fio,gmx_bool bRead,t_tpxheader *tpx, } } - if (fver >= 26) - { - gmx_fio_do_int(fio,fgen); - } - else - { - fgen=0; - } - if (file_version != NULL) { *file_version = fver; diff --git a/src/gmxlib/txtdump.c b/src/gmxlib/txtdump.c index 4fe598129c..25da6e1069 100644 --- a/src/gmxlib/txtdump.c +++ b/src/gmxlib/txtdump.c @@ -645,7 +645,8 @@ void pr_inputrec(FILE *fp,int indent,const char *title,t_inputrec *ir, PS("integrator",EI(ir->eI)); PSTEP("nsteps",ir->nsteps); PSTEP("init-step",ir->init_step); - PS("ns-type",ENS(ir->ns_type)); + PS("cutoff-scheme",ECUTSCHEME(ir->cutoff_scheme)); + PS("ns_type",ENS(ir->ns_type)); PI("nstlist",ir->nstlist); PI("ndelta",ir->ndelta); PI("nstcomm",ir->nstcomm); @@ -661,6 +662,7 @@ void pr_inputrec(FILE *fp,int indent,const char *title,t_inputrec *ir, PR("delta-t",ir->delta_t); PR("xtcprec",ir->xtcprec); + PR("fourierspacing",ir->fourier_spacing); PI("nkx",ir->nkx); PI("nky",ir->nky); PI("nkz",ir->nkz); @@ -693,6 +695,7 @@ void pr_inputrec(FILE *fp,int indent,const char *title,t_inputrec *ir, ir->posres_comB[YY],ir->posres_comB[ZZ]); else pr_rvec(fp,indent,"posres-comB",ir->posres_comB,DIM,TRUE); + PR("verlet-buffer-drift",ir->verletbuf_drift); PR("rlist",ir->rlist); PR("rlistlong",ir->rlistlong); PR("rtpi",ir->rtpi); @@ -1511,6 +1514,7 @@ void pr_mtop(FILE *fp,int indent,const char *title,gmx_mtop_t *mtop, (void) pr_indent(fp,indent); (void) fprintf(fp,"name=\"%s\"\n",*(mtop->name)); pr_int(fp,indent,"#atoms",mtop->natoms); + pr_int(fp,indent,"#molblock",mtop->nmolblock); for(mb=0; mb<mtop->nmolblock; mb++) { pr_molblock(fp,indent,"molblock",&mtop->molblock[mb],mb, mtop->moltype,bShowNumbers); diff --git a/src/kernel/CMakeLists.txt b/src/kernel/CMakeLists.txt index 50f7747641..cb5d998dee 100644 --- a/src/kernel/CMakeLists.txt +++ b/src/kernel/CMakeLists.txt @@ -1,6 +1,7 @@ set(GMXPREPROCESS_SOURCES add_par.c + calc_verletbuf.c compute_io.c convparm.c gen_ad.c @@ -36,7 +37,7 @@ set(GMXPREPROCESS_SOURCES set(MDRUN_SOURCES gctio.c ionize.c runner.c - do_gct.c repl_ex.c xutils.c + do_gct.c repl_ex.c xutils.c pme_switch.c md.c mdrun.c genalg.c membed.c md_openmm.c) @@ -46,17 +47,20 @@ set_target_properties(gmxpreprocess PROPERTIES OUTPUT_NAME "gmxpreprocess${GMX_L COMPILE_FLAGS "${OpenMP_C_FLAGS}") +if(GMX_GPU OR GMX_OPENMM) + include_directories(${CMAKE_SOURCE_DIR}/src/gmxlib/gpu_utils) +endif() + if(GMX_OPENMM) - add_subdirectory(gmx_gpu_utils) - include_directories(./gmx_gpu_utils ${OpenMM_INCLUDE_DIR}) + include_directories(${OpenMM_INCLUDE_DIR}) link_directories(${OpenMM_LIBRARY_DIR}) # with this define no env. var.
is needed with OPENMM_PLUGIN_DIR # if the same OpenMM installation is used for running and building add_definitions( -DOPENMM_PLUGIN_DIR="${OpenMM_PLUGIN_DIR}" ) file(TO_CMAKE_PATH ${OpenMM_PLUGIN_DIR} _path) add_library(openmm_api_wrapper STATIC openmm_wrapper.cpp) - target_link_libraries(openmm_api_wrapper gmx_gpu_utils ${OpenMM_LIBRARIES}) - set(GMX_OPENMM_LIBRARIES openmm_api_wrapper gmx_gpu_utils ${OpenMM_LIBRARIES}) + target_link_libraries(openmm_api_wrapper ${OpenMM_LIBRARIES}) + set(GMX_OPENMM_LIBRARIES openmm_api_wrapper ${OpenMM_LIBRARIES}) endif(GMX_OPENMM) if(GMX_FAHCORE) diff --git a/src/kernel/calc_verletbuf.c b/src/kernel/calc_verletbuf.c new file mode 100644 index 0000000000..9dbacd2016 --- /dev/null +++ b/src/kernel/calc_verletbuf.c @@ -0,0 +1,716 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 3.2.03 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon + */ +#ifdef HAVE_CONFIG_H +#include +#endif + +#include "assert.h" + +#include +#include +#include "typedefs.h" +#include "physics.h" +#include "smalloc.h" +#include "gmx_fatal.h" +#include "macros.h" +#include "vec.h" +#include "coulomb.h" +#include "calc_verletbuf.h" +#include "../mdlib/nbnxn_consts.h" + +/* Struct for unique atom type for calculating the energy drift. + * The atom displacement depends on mass and constraints. + * The energy jump for given distance depend on LJ type and q. 
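The j-cluster size chosen by verletbuf_get_list_setup() further below follows simd_width/(8*sizeof(real)). A stand-alone check of that arithmetic (editorial illustration, not part of the patch):

    #include <stdio.h>

    /* Prints the j-cluster widths implied by 128-bit (SSE) and 256-bit
     * (AVX) registers: 4/2 and 8/4 atoms in single/double precision. */
    int main(void)
    {
        int widths[] = { 128, 256 };
        int i;

        for (i = 0; i < 2; i++)
        {
            printf("%3d-bit SIMD: %d single- / %d double-precision j-atoms\n",
                   widths[i],
                   widths[i] / (8 * (int)sizeof(float)),
                   widths[i] / (8 * (int)sizeof(double)));
        }
        return 0;
    }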
+ */ +typedef struct +{ + real mass; /* mass */ + int type; /* type (used for LJ parameters) */ + real q; /* charge */ + int con; /* constrained: 0, else 1, if 1, use #DOF=2 iso 3 */ + int n; /* total #atoms of this type in the system */ +} verletbuf_atomtype_t; + + +void verletbuf_get_list_setup(gmx_bool bGPU, + verletbuf_list_setup_t *list_setup) +{ + list_setup->cluster_size_i = NBNXN_CPU_CLUSTER_I_SIZE; + + if (bGPU) + { + list_setup->cluster_size_j = NBNXN_GPU_CLUSTER_SIZE; + } + else + { +#ifndef GMX_X86_SSE2 + list_setup->cluster_size_j = NBNXN_CPU_CLUSTER_I_SIZE; +#else + int simd_width; + +#ifdef GMX_X86_AVX_256 + simd_width = 256; +#else + simd_width = 128; +#endif + list_setup->cluster_size_j = simd_width/(sizeof(real)*8); +#endif + } +} + +static void add_at(verletbuf_atomtype_t **att_p,int *natt_p, + real mass,int type,real q,int con,int nmol) +{ + verletbuf_atomtype_t *att; + int natt,i; + + if (mass == 0) + { + /* Ignore massless particles */ + return; + } + + att = *att_p; + natt = *natt_p; + + i = 0; + while (i < natt && + !(mass == att[i].mass && + type == att[i].type && + q == att[i].q && + con == att[i].con)) + { + i++; + } + + if (i < natt) + { + att[i].n += nmol; + } + else + { + (*natt_p)++; + srenew(*att_p,*natt_p); + (*att_p)[i].mass = mass; + (*att_p)[i].type = type; + (*att_p)[i].q = q; + (*att_p)[i].con = con; + (*att_p)[i].n = nmol; + } +} + +static void get_verlet_buffer_atomtypes(const gmx_mtop_t *mtop, + verletbuf_atomtype_t **att_p, + int *natt_p, + int *n_nonlin_vsite) +{ + verletbuf_atomtype_t *att; + int natt; + int mb,nmol,ft,i,j,a1,a2,a3,a; + const t_atoms *atoms; + const t_ilist *il; + const t_atom *at; + const t_iparams *ip; + real *con_m,*vsite_m,cam[5]; + + att = NULL; + natt = 0; + + if (n_nonlin_vsite != NULL) + { + *n_nonlin_vsite = 0; + } + + for(mb=0; mbnmolblock; mb++) + { + nmol = mtop->molblock[mb].nmol; + + atoms = &mtop->moltype[mtop->molblock[mb].type].atoms; + + /* Check for constraints, as they affect the kinetic energy */ + snew(con_m,atoms->nr); + snew(vsite_m,atoms->nr); + + for(ft=F_CONSTR; ft<=F_CONSTRNC; ft++) + { + il = &mtop->moltype[mtop->molblock[mb].type].ilist[ft]; + + for(i=0; inr; i+=1+NRAL(ft)) + { + a1 = il->iatoms[i+1]; + a2 = il->iatoms[i+2]; + con_m[a1] += atoms->atom[a2].m; + con_m[a2] += atoms->atom[a1].m; + } + } + + il = &mtop->moltype[mtop->molblock[mb].type].ilist[F_SETTLE]; + + for(i=0; inr; i+=1+NRAL(F_SETTLE)) + { + a1 = il->iatoms[i+1]; + a2 = il->iatoms[i+2]; + a3 = il->iatoms[i+3]; + con_m[a1] += atoms->atom[a2].m + atoms->atom[a3].m; + con_m[a2] += atoms->atom[a1].m + atoms->atom[a3].m; + con_m[a3] += atoms->atom[a1].m + atoms->atom[a2].m; + } + + /* Check for virtual sites, determine mass from constructing atoms */ + for(ft=0; ftmoltype[mtop->molblock[mb].type].ilist[ft]; + + for(i=0; inr; i+=1+NRAL(ft)) + { + ip = &mtop->ffparams.iparams[il->iatoms[i]]; + + a1 = il->iatoms[i+1]; + + for(j=1; jatom[il->iatoms[i+1+j]].m; + if (cam[j] == 0) + { + cam[j] = vsite_m[il->iatoms[i+1+j]]; + } + if (cam[j] == 0) + { + gmx_fatal(FARGS,"In molecule type '%s' %s construction involves atom %d, which is a virtual site of equal or high complexity. 
This is not supported.", + *mtop->moltype[mtop->molblock[mb].type].name, + interaction_function[ft].longname, + il->iatoms[i+1+j]+1); + } + } + + switch(ft) + { + case F_VSITE2: + /* Exact except for ignoring constraints */ + vsite_m[a1] = (cam[2]*sqr(1-ip->vsite.a) + cam[1]*sqr(ip->vsite.a))/(cam[1]*cam[2]); + break; + case F_VSITE3: + /* Exact except for ignoring constraints */ + vsite_m[a1] = (cam[2]*cam[3]*sqr(1-ip->vsite.a-ip->vsite.b) + cam[1]*cam[3]*sqr(ip->vsite.a) + cam[1]*cam[2]*sqr(ip->vsite.b))/(cam[1]*cam[2]*cam[3]); + break; + default: + /* Use the mass of the lightest constructing atom. + * This is an approximation. + * If the distance of the virtual site to the + * constructing atom is less than all distances + * between constructing atoms, this is a safe + * over-estimate of the displacement of the vsite. + * This condition holds for all H mass replacement + * replacement vsite constructions, except for SP2/3 + * groups. In SP3 groups one H will have a F_VSITE3 + * construction, so even there the total drift + * estimation shouldn't be far off. + */ + assert(j>=1); + vsite_m[a1] = cam[1]; + for(j=2; jnr; a++) + { + at = &atoms->atom[a]; + /* We consider an atom constrained, #DOF=2, when it is + * connected with constraints to one or more atoms with + * total mass larger than 1.5 that of the atom itself. + */ + add_at(&att,&natt, + at->m,at->type,at->q,con_m[a] > 1.5*at->m,nmol); + } + + sfree(vsite_m); + sfree(con_m); + } + + if (gmx_debug_at) + { + for(a=0; a=0 at any s and x. + */ + real ex,er; + + ex = exp(-x*x/(2*s2)); + er = gmx_erfc(x/sqrt(2*s2)); + + *shift = -x + sqrt(2*s2/M_PI)*ex/er; + *scale = 0.5*M_PI*exp(ex*ex/(M_PI*er*er))*er; +} + +static real ener_drift(const verletbuf_atomtype_t *att,int natt, + const gmx_ffparams_t *ffp, + real kT_fac, + real md_ljd,real md_ljr,real md_el,real dd_el, + real r_buffer, + real rlist,real boxvol) +{ + double drift_tot,pot1,pot2,pot; + int i,j; + real s2i,s2j,s2,s; + int ti,tj; + real md,dd; + real sc_fac,rsh; + double c_exp,c_erfc; + + drift_tot = 0; + + /* Loop over the different atom type pairs */ + for(i=0; iiparams[ti*ffp->atnr+tj].lj.c6 + + md_ljr*ffp->iparams[ti*ffp->atnr+tj].lj.c12 + + md_el*att[i].q*att[j].q; + + /* d2V/dr2 at the cut-off for Coulomb, we neglect LJ */ + dd = dd_el*att[i].q*att[j].q; + + s2 = s2i + s2j; + + rsh = r_buffer; + sc_fac = 1.0; + /* For constraints: adapt r and scaling for the Gaussian */ + if (att[i].con) + { + real sh,sc; + approx_2dof(s2i,r_buffer*s2i/s2,&sh,&sc); + rsh += sh; + sc_fac *= sc; + } + if (att[j].con) + { + real sh,sc; + approx_2dof(s2j,r_buffer*s2j/s2,&sh,&sc); + rsh += sh; + sc_fac *= sc; + } + + /* Exact contribution of an atom pair with Gaussian displacement + * with sigma s to the energy drift for a potential with + * derivative -md and second derivative dd at the cut-off. + * The only catch is that for potentials that change sign + * near the cut-off there could be an unlucky compensation + * of positive and negative energy drift. + * Such potentials are extremely rare though. + * + * Note that pot has unit energy*length, as the linear + * atom density still needs to be put in. 
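Editorial note on the c_exp/c_erfc expressions just below: with a Gaussian displacement density \varphi_s(x) = e^{-x^2/(2s^2)}/\sqrt{2\pi s^2}, the truncated moments have closed forms, e.g.

    \int_b^{\infty} (x-b)^2\,\varphi_s(x)\,dx
        = (b^2 + s^2)\,\tfrac{1}{2}\operatorname{erfc}\!\left(\tfrac{b}{\sqrt{2s^2}}\right)
        - b\,s\,\tfrac{e^{-b^2/(2s^2)}}{\sqrt{2\pi}}

which, with b = rsh and s2 = s^2, is exactly (rsh*rsh + s2)*c_erfc - rsh*s*c_exp; pot1 is md/2 times this second moment, and pot2 is the analogous third-moment term for the second derivative dd.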
+ */ + c_exp = exp(-rsh*rsh/(2*s2))/sqrt(2*M_PI); + c_erfc = 0.5*gmx_erfc(rsh/(sqrt(2*s2))); + s = sqrt(s2); + + pot1 = sc_fac* + md/2*((rsh*rsh + s2)*c_erfc - rsh*s*c_exp); + pot2 = sc_fac* + dd/6*(s*(rsh*rsh + 2*s2)*c_exp - rsh*(rsh*rsh + 3*s2)*c_erfc); + pot = pot1 + pot2; + + if (gmx_debug_at) + { + fprintf(debug,"n %d %d d s %.3f %.3f con %d md %8.1e dd %8.1e pot1 %8.1e pot2 %8.1e pot %8.1e\n", + att[i].n,att[j].n,sqrt(s2i),sqrt(s2j), + att[i].con+att[j].con, + md,dd,pot1,pot2,pot); + } + + /* Multiply by the number of atom pairs */ + if (j == i) + { + pot *= (double)att[i].n*(att[i].n - 1)/2; + } + else + { + pot *= (double)att[i].n*att[j].n; + } + /* We need the line density to get the energy drift of the system. + * The effective average r^2 is close to (rlist+sigma)^2. + */ + pot *= 4*M_PI*sqr(rlist + s)/boxvol; + + /* Add the unsigned drift to avoid cancellation of errors */ + drift_tot += fabs(pot); + } + } + + return drift_tot; +} + +static real surface_frac(int cluster_size,real particle_distance,real rlist) +{ + real d,area_rel; + + if (rlist < 0.5*particle_distance) + { + /* We have non overlapping spheres */ + return 1.0; + } + + /* Half the inter-particle distance relative to rlist */ + d = 0.5*particle_distance/rlist; + + /* Determine the area of the surface at distance rlist to the closest + * particle, relative to surface of a sphere of radius rlist. + * The formulas below assume close to cubic cells for the pair search grid, + * which the pair search code tries to achieve. + * Note that in practice particle distances will not be delta distributed, + * but have some spread, often involving shorter distances, + * as e.g. O-H bonds in a water molecule. Thus the estimates below will + * usually be slightly too high and thus conservative. + */ + switch (cluster_size) + { + case 1: + /* One particle: trivial */ + area_rel = 1.0; + break; + case 2: + /* Two particles: two spheres at fractional distance 2*a */ + area_rel = 1.0 + d; + break; + case 4: + /* We assume a perfect, symmetric tetrahedron geometry. + * The surface around a tetrahedron is too complex for a full + * analytical solution, so we use a Taylor expansion. + */ + area_rel = (1.0 + 1/M_PI*(6*acos(1/sqrt(3))*d + + sqrt(3)*d*d*(1.0 + + 5.0/18.0*d*d + + 7.0/45.0*d*d*d*d + + 83.0/756.0*d*d*d*d*d*d))); + break; + default: + gmx_incons("surface_frac called with unsupported cluster_size"); + area_rel = 1.0; + } + + return area_rel/cluster_size; +} + +void calc_verlet_buffer_size(const gmx_mtop_t *mtop,real boxvol, + const t_inputrec *ir,real drift_target, + const verletbuf_list_setup_t *list_setup, + int *n_nonlin_vsite, + real *rlist) +{ + double resolution; + char *env; + + real particle_distance; + real nb_clust_frac_pairs_not_in_list_at_cutoff; + + verletbuf_atomtype_t *att=NULL; + int natt=-1,i; + double reppow; + real md_ljd,md_ljr,md_el,dd_el; + real elfac; + real kT_fac,mass_min; + int ib0,ib1,ib; + real rb,rl; + real drift; + + /* Resolution of the buffer size */ + resolution = 0.001; + + env = getenv("GMX_VERLET_BUFFER_RES"); + if (env != NULL) + { + sscanf(env,"%lf",&resolution); + } + + /* In an atom wise pair-list there would be no pairs in the list + * beyond the pair-list cut-off. + * However, we use a pair-list of groups vs groups of atoms. + * For groups of 4 atoms, the parallelism of SSE instructions, only + * 10% of the atoms pairs are not in the list just beyond the cut-off. 
+ + /* In an atom-wise pair-list there would be no pairs in the list + * beyond the pair-list cut-off. + * However, we use a pair-list of groups vs groups of atoms. + * For groups of 4 atoms, matching the parallelism of SSE instructions, + * only about 10% of the atom pairs are not in the list just beyond the cut-off. + * As this percentage increases slowly compared to the decrease of the + * Gaussian displacement distribution over this range, we can simply + * reduce the drift by this fraction. + * For larger groups, e.g. of 8 atoms, this fraction will be lower, + * so then the buffer size will be on the conservative (large) side. + * + * Note that the formulas used here do not take into account + * cancellation of errors which could occur by missing both + * attractive and repulsive interactions. + * + * The only major assumption is homogeneous particle distribution. + * For an inhomogeneous system, such as a liquid-vapor system, + * the buffer will be underestimated. The actual energy drift + * will be higher by the factor: local/homogeneous particle density. + * + * The results of this estimate have been checked against simulations. + * In most cases the real drift differs by less than a factor of 2. + */ + + /* Worst case assumption: HCP packing of particles gives largest distance */ + particle_distance = pow(boxvol*sqrt(2)/mtop->natoms,1.0/3.0); + + get_verlet_buffer_atomtypes(mtop,&att,&natt,n_nonlin_vsite); + assert(att != NULL && natt >= 0); + + if (debug) + { + fprintf(debug,"particle distance assuming HCP packing: %f nm\n", + particle_distance); + fprintf(debug,"energy drift atom types: %d\n",natt); + } + + reppow = mtop->ffparams.reppow; + md_ljd = 0; + md_ljr = 0; + if (ir->vdwtype == evdwCUT) + { + /* -dV/dr of -r^-6 and r^-reppow */ + md_ljd = -6*pow(ir->rvdw,-7.0); + md_ljr = reppow*pow(ir->rvdw,-(reppow+1)); + /* The contribution of the second derivative is negligible */ + } + else + { + gmx_fatal(FARGS,"Energy drift calculation is only implemented for plain cut-off Lennard-Jones interactions"); + } + + elfac = ONE_4PI_EPS0/ir->epsilon_r; + + /* Determine md=-dV/dr and dd=d^2V/dr^2 */ + md_el = 0; + dd_el = 0; + if (ir->coulombtype == eelCUT || EEL_RF(ir->coulombtype)) + { + real eps_rf,k_rf; + + if (ir->coulombtype == eelCUT) + { + eps_rf = 1; + k_rf = 0; + } + else + { + eps_rf = ir->epsilon_rf/ir->epsilon_r; + if (eps_rf != 0) + { + k_rf = pow(ir->rcoulomb,-3.0)*(eps_rf - ir->epsilon_r)/(2*eps_rf + ir->epsilon_r); + } + else + { + /* epsilon_rf = infinity */ + k_rf = 0.5*pow(ir->rcoulomb,-3.0); + } + } + + if (eps_rf > 0) + { + md_el = elfac*(pow(ir->rcoulomb,-2.0) - 2*k_rf*ir->rcoulomb); + } + dd_el = elfac*(2*pow(ir->rcoulomb,-3.0) + 2*k_rf); + } + else if (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) + { + real b,rc,br; + + b = calc_ewaldcoeff(ir->rcoulomb,ir->ewald_rtol); + rc = ir->rcoulomb; + br = b*rc; + md_el = elfac*(2*b*exp(-br*br)/(sqrt(M_PI)*rc) + gmx_erfc(br)/(rc*rc)); + dd_el = elfac/(rc*rc)*(4*b*(1 + br*br)*exp(-br*br)/sqrt(M_PI) + 2*gmx_erfc(br)/rc); + } + else + { + gmx_fatal(FARGS,"Energy drift calculation is only implemented for Reaction-Field and Ewald electrostatics"); + } + + /* Determine the variance of the atomic displacement + * over nstlist-1 steps: kT_fac + * For inertial dynamics (not Brownian dynamics) the mass factor + * is not included in kT_fac, it is added later. + */ + if (ir->eI == eiBD) + { + /* Get the displacement distribution from the random component only. + * With accurate integration the systematic (force) displacement + * should be negligible (unless nstlist is extremely large, which + * you wouldn't do anyhow). + */
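/* Hedged stand-alone sketch of the displacement width computed in the
 * branches below, for the common leap-frog case: sigma^2 per atom is
 * kT/m times the squared free-flight time (nstlist-1)*dt. The BOLTZ value
 * follows GROMACS physics.h (kJ mol^-1 K^-1); the numbers are illustrative
 * (oxygen mass, 300 K, 2 fs steps, nstlist=10), not from this patch.
 */
#include <math.h>
#include <stdio.h>

#define BOLTZ 0.0083144621 /* kJ/(mol K) */

int main(void)
{
    double ref_t   = 300;     /* K                         */
    double delta_t = 0.002;   /* ps                        */
    int    nstlist = 10;
    double mass    = 15.9994; /* u; kT/m is then (nm/ps)^2 */

    double kT_fac = BOLTZ*ref_t*pow((nstlist - 1)*delta_t, 2);
    printf("sigma = %.4f nm over %d steps\n", sqrt(kT_fac/mass), nstlist - 1);
    return 0;
}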
+ kT_fac = 2*BOLTZ*ir->opts.ref_t[0]*(ir->nstlist-1)*ir->delta_t; + if (ir->bd_fric > 0) + { + /* This is directly sigma^2 of the displacement */ + kT_fac /= ir->bd_fric; + + /* Set the masses to 1 as kT_fac is the full sigma^2, + * but we divide by m in ener_drift(). + */ + for(i=0; i<natt; i++) + { + att[i].mass = 1; + } + } + else + { + real tau_t; + + /* Per group tau_t is not implemented yet, use the maximum */ + tau_t = ir->opts.tau_t[0]; + for(i=1; i<ir->opts.ngtc; i++) + { + tau_t = max(tau_t,ir->opts.tau_t[i]); + } + + kT_fac *= tau_t; + /* This kT_fac needs to be divided by the mass to get sigma^2 */ + } + } + else + { + kT_fac = BOLTZ*ir->opts.ref_t[0]*sqr((ir->nstlist-1)*ir->delta_t); + } + + mass_min = att[0].mass; + for(i=1; i<natt; i++) + { + mass_min = min(mass_min,att[i].mass); + } + + if (debug) + { + fprintf(debug,"sqrt(kT_fac) %f mass_min %f\n",sqrt(kT_fac),mass_min); + } + + /* Search using bisection */ + ib0 = -1; + /* The drift will be negligible at 5 times the maximum sigma */ + ib1 = (int)(5*2*sqrt(kT_fac/mass_min)/resolution) + 1; + while (ib1 - ib0 > 1) + { + ib = (ib0 + ib1)/2; + rb = ib*resolution; + rl = max(ir->rvdw,ir->rcoulomb) + rb; + + /* Calculate the average energy drift at the last step + * of the nstlist steps at which the pair-list is used. + */ + drift = ener_drift(att,natt,&mtop->ffparams, + kT_fac, + md_ljd,md_ljr,md_el,dd_el,rb, + rl,boxvol); + + /* Correct for the fact that we are using a Ni x Nj particle pair list + * and not a 1 x 1 particle pair list. This reduces the drift. + */ + /* We don't have a formula for 8 (yet), use 4 which is conservative */ + nb_clust_frac_pairs_not_in_list_at_cutoff = + surface_frac(min(list_setup->cluster_size_i,4), + particle_distance,rl)* + surface_frac(min(list_setup->cluster_size_j,4), + particle_distance,rl); + drift *= nb_clust_frac_pairs_not_in_list_at_cutoff; + + /* Convert the drift to drift per unit time per atom */ + drift /= ir->nstlist*ir->delta_t*mtop->natoms; + + if (debug) + { + fprintf(debug,"ib %3d %3d %3d rb %.3f %dx%d fac %.3f drift %f\n", + ib0,ib,ib1,rb, + list_setup->cluster_size_i,list_setup->cluster_size_j, + nb_clust_frac_pairs_not_in_list_at_cutoff, + drift); + } + + if (fabs(drift) > drift_target) + { + ib0 = ib; + } + else + { + ib1 = ib; + } + } + + sfree(att); + + *rlist = max(ir->rvdw,ir->rcoulomb) + ib1*resolution; +} diff --git a/src/kernel/repl_ex.h b/src/kernel/calc_verletbuf.h similarity index 52% copy from src/kernel/repl_ex.h copy to src/kernel/calc_verletbuf.h index 7e4bf23d68..d9a85fcb58 100644 --- a/src/kernel/repl_ex.h +++ b/src/kernel/calc_verletbuf.h @@ -33,40 +33,38 @@ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon */ -#ifndef _repl_ex_h -#define _repl_ex_h +#ifndef _calc_verletbuf_h +#define _calc_verletbuf_h #include "typedefs.h" -/* Abstract type for replica exchange */ -typedef struct gmx_repl_ex *gmx_repl_ex_t; +typedef struct +{ + int cluster_size_i; /* Cluster pair-list i-cluster size (atom count) */ + int cluster_size_j; /* Cluster pair-list j-cluster size (atom count) */ +} verletbuf_list_setup_t; -extern gmx_repl_ex_t init_replica_exchange(FILE *fplog, - const gmx_multisim_t *ms, - const t_state *state, - const t_inputrec *ir, - int nst, int nmultiex, int init_seed); -/* Should only be called on the master nodes */ -extern gmx_bool replica_exchange(FILE *fplog, - const t_commrec *cr, - gmx_repl_ex_t re, - t_state *state,gmx_enerdata_t *enerd, - t_state *state_local, - gmx_large_int_t step,real time); -/* Attempts replica exchange, should be called on all nodes. - * Returns TRUE if this state has been exchanged. - * When running each replica in parallel, - * this routine collects the state on the master node before exchange. - * With particle the state is redistributed over the nodes after exchange. - * With domain decomposition the global state after exchanged in stored - * in state and still needs to be redistributed over the nodes.
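/* Minimal sketch (not part of this patch) of the bisection used in
 * calc_verlet_buffer_size() above: find the smallest buffer rb, in units
 * of `resolution`, whose estimated drift is at or below the target.
 * drift_estimate() is a hypothetical stand-in for ener_drift() plus the
 * cluster-pair correction; any monotonically decreasing function works.
 */
#include <stdio.h>

static double drift_estimate(double rb)
{
    return 1e-4/((rb + 0.01)*(rb + 0.01)); /* invented, decreasing in rb */
}

int main(void)
{
    double resolution = 0.001, drift_target = 0.01;
    int    ib0 = -1, ib1 = 1000, ib;

    while (ib1 - ib0 > 1)
    {
        ib = (ib0 + ib1)/2;
        if (drift_estimate(ib*resolution) > drift_target)
        {
            ib0 = ib; /* buffer too small: drift above target */
        }
        else
        {
            ib1 = ib; /* acceptable: tighten the upper bound  */
        }
    }
    printf("buffer = %.3f nm\n", ib1*resolution);
    return 0;
}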
+/* Sets the pair-list setup assumed for the current Gromacs configuration. + * The setup with smallest cluster sizes is returned, such that the Verlet + * buffer size estimated with this setup will be conservative. */ +void verletbuf_get_list_setup(gmx_bool bGPU, + verletbuf_list_setup_t *list_setup); -extern void print_replica_exchange_statistics(FILE *fplog,gmx_repl_ex_t re); -/* Should only be called on the master nodes */ -extern void pd_distribute_state(const t_commrec *cr,t_state *state); -/* Distributes the state after exchange for particle decomposition */ +/* Calculate the non-bonded pair-list buffer size for the Verlet list + * based on the particle masses, temperature, LJ types, charges + * and constraints as well as the non-bonded force behavior at the cut-off. + * The target is a maximum energy drift. + * Returns the number of non-linear virtual sites. For these it's difficult + * to determine their contribution to the drift exactly, so we approximate. + * Returns the pair-list cut-off. + */ +void calc_verlet_buffer_size(const gmx_mtop_t *mtop,real boxvol, + const t_inputrec *ir,real drift_target, + const verletbuf_list_setup_t *list_setup, + int *n_nonlin_vsite, + real *rlist); -#endif /* _repl_ex_h */ +#endif /* _calc_verletbuf_h */ diff --git a/src/kernel/grompp.c b/src/kernel/grompp.c index 1bf343125b..40caa60a70 100644 --- a/src/kernel/grompp.c +++ b/src/kernel/grompp.c @@ -80,6 +80,7 @@ #include "gpp_tomorse.h" #include "mtop_util.h" #include "genborn.h" +#include "calc_verletbuf.h" static int rm_interactions(int ifunc,int nrmols,t_molinfo mols[]) { @@ -1128,20 +1129,73 @@ static void check_gbsa_params(t_inputrec *ir,gpp_atomtype_t atype) } -static void check_settle(gmx_mtop_t *sys) +static void set_verlet_buffer(const gmx_mtop_t *mtop, + t_inputrec *ir, + matrix box, + real verletbuf_drift, + warninp_t wi) { - int i,j,cgj1,nra; - - nra = interaction_function[F_SETTLE].nratoms; - for(i=0; (i<sys->nmoltype); i++) + real ref_T; + int i; + verletbuf_list_setup_t ls; + real rlist_1x1; + int n_nonlin_vsite; + char warn_buf[STRLEN]; + + ref_T = 0; + for(i=0; i<ir->opts.ngtc; i++) + { + if (ir->opts.ref_t[i] < 0) + { + warning(wi,"Some atom groups do not use temperature coupling. This cannot be accounted for in the energy drift estimation for the Verlet buffer size. The energy drift and the Verlet buffer might be underestimated."); + } + else + { + ref_T = max(ref_T,ir->opts.ref_t[i]); + } + } + + printf("Determining Verlet buffer for an energy drift of %g kJ/mol/ps at %g K\n",verletbuf_drift,ref_T); + + for(i=0; i<ir->opts.ngtc; i++) { - for(j=0; (j<sys->moltype[i].ilist[F_SETTLE].nr); j+=nra+1) + if (ir->opts.ref_t[i] >= 0 && ir->opts.ref_t[i] != ref_T) { - cgj1 = sys->moltype[i].cgs.index[j+1]; - if (j+2 >= cgj1) - gmx_fatal(FARGS,"For SETTLE you need to have all atoms involved in one charge group. Please fix your topology."); + sprintf(warn_buf,"ref_T for group of %.1f DOFs is %g K, which is smaller than the maximum of %g K used for the buffer size calculation.
The buffer size might be on the conservative (large) side.", + ir->opts.nrdf[i],ir->opts.ref_t[i],ref_T); + warning_note(wi,warn_buf); } } + + /* Calculate the buffer size for simple atom vs atoms list */ + ls.cluster_size_i = 1; + ls.cluster_size_j = 1; + calc_verlet_buffer_size(mtop,det(box),ir,verletbuf_drift, + &ls,&n_nonlin_vsite,&rlist_1x1); + + /* Set the pair-list buffer size in ir */ + verletbuf_get_list_setup(FALSE,&ls); + calc_verlet_buffer_size(mtop,det(box),ir,verletbuf_drift, + &ls,&n_nonlin_vsite,&ir->rlist); + + if (n_nonlin_vsite > 0) + { + sprintf(warn_buf,"There are %d non-linear virtual site constructions. Their contribution to the energy drift is approximated. In most cases this does not affect the energy drift significantly.",n_nonlin_vsite); + warning_note(wi,warn_buf); + } + + printf("Calculated rlist for %dx%d atom pair-list as %.3f nm, buffer size %.3f nm\n", + 1,1,rlist_1x1,rlist_1x1-max(ir->rvdw,ir->rcoulomb)); + + ir->rlistlong = ir->rlist; + printf("Set rlist, assuming %dx%d atom pair-list, to %.3f nm, buffer size %.3f nm\n", + ls.cluster_size_i,ls.cluster_size_j, + ir->rlist,ir->rlist-max(ir->rvdw,ir->rcoulomb)); + + if (sqr(ir->rlistlong) >= max_cutoff2(ir->ePBC,box)) + { + gmx_fatal(FARGS,"The pair-list cut-off (%g nm) is longer than half the shortest box vector or longer than the smallest box diagonal element (%g nm). Increase the box size or decrease nstlist or increase verlet-buffer-drift.",ir->rlistlong,sqrt(max_cutoff2(ir->ePBC,box))); + } } int main (int argc, char *argv[]) @@ -1360,6 +1414,15 @@ int main (int argc, char *argv[]) if (debug) pr_symtab(debug,0,"After new_status",&sys->symtab); + + if (ir->cutoff_scheme == ecutsVERLET) + { + fprintf(stderr,"Removing all charge groups because cutoff-scheme=%s\n", + ecutscheme_names[ir->cutoff_scheme]); + + /* Remove all charge groups */ + gmx_mtop_remove_chargegroups(sys); + } if (count_constraints(sys,mi,wi) && (ir->eConstrAlg == econtSHAKE)) { if (ir->eI == eiCG || ir->eI == eiLBFGS) { @@ -1501,9 +1564,6 @@ int main (int argc, char *argv[]) check_vel(sys,state.v); } - /* check for charge groups in settles */ - check_settle(sys); - /* check masses */ check_mol(sys,wi); @@ -1530,6 +1590,17 @@ int main (int argc, char *argv[]) bGenVel ? state.v : NULL, wi); + if (ir->cutoff_scheme == ecutsVERLET && ir->verletbuf_drift > 0 && + ir->nstlist > 1) + { + if (EI_DYNAMICS(ir->eI) && + !(EI_MD(ir->eI) && ir->etc==etcNO) && + inputrec2nboundeddim(ir) == 3) + { + set_verlet_buffer(sys,ir,state.box,ir->verletbuf_drift,wi); + } + } + /* Init the temperature coupling state */ init_gtc_state(&state,ir->opts.ngtc,0,ir->opts.nhchainlength); /* need to add nnhpres here? 
*/ @@ -1566,7 +1637,7 @@ int main (int argc, char *argv[]) clear_rvec(state.box[ZZ]); } - if (ir->rlist > 0) + if (ir->cutoff_scheme != ecutsVERLET && ir->rlist > 0) { set_warning_line(wi,mdparin,-1); check_chargegroup_radii(sys,ir,state.x,wi); @@ -1577,7 +1648,17 @@ copy_mat(state.box,box); if (ir->ePBC==epbcXY && ir->nwall==2) svmul(ir->wall_ewald_zfac,box[ZZ],box[ZZ]); - max_spacing = calc_grid(stdout,box,opts->fourierspacing, + if (ir->nkx > 0 && ir->nky > 0 && ir->nkz > 0) + { + /* Mark fourier_spacing as not used */ + ir->fourier_spacing = 0; + } + else if (ir->nkx != 0 && ir->nky != 0 && ir->nkz != 0) + { + set_warning_line(wi,mdparin,-1); + warning_error(wi,"Some of the Fourier grid sizes are set, but all of them need to be set."); + } + max_spacing = calc_grid(stdout,box,ir->fourier_spacing, &(ir->nkx),&(ir->nky),&(ir->nkz)); } diff --git a/src/kernel/md.c b/src/kernel/md.c index 9e082c257b..a7f3f0b50e 100644 --- a/src/kernel/md.c +++ b/src/kernel/md.c @@ -53,6 +53,7 @@ #include "trnio.h" #include "xtcio.h" #include "mdrun.h" +#include "md_support.h" #include "confio.h" #include "network.h" #include "pull.h" @@ -69,6 +70,7 @@ #include "qmmm.h" #include "mpelogging.h" #include "domdec.h" +#include "domdec_network.h" #include "partdec.h" #include "topsort.h" #include "coulomb.h" @@ -81,7 +83,12 @@ #include "sighandler.h" #include "txtdump.h" #include "string2.h" +#include "pme_switch.h" +#include "bondf.h" #include "membed.h" +#include "types/nlistheuristics.h" +#include "types/iteratedconstraints.h" +#include "nbnxn_cuda_data_mgmt.h" #ifdef GMX_LIB_MPI #include <mpi.h> @@ -94,6 +101,39 @@ #include "corewrap.h" #endif +static void reset_all_counters(FILE *fplog,t_commrec *cr, + gmx_large_int_t step, + gmx_large_int_t *step_rel,t_inputrec *ir, + gmx_wallcycle_t wcycle,t_nrnb *nrnb, + gmx_runtime_t *runtime, + nbnxn_cuda_ptr_t cu_nbv) +{ + char sbuf[STEPSTRSIZE]; + + /* Reset all the counters related to performance over the run */ + md_print_warn(cr,fplog,"step %s: resetting all time and cycle counters\n", + gmx_step_str(step,sbuf)); + + if (cu_nbv) + { + nbnxn_cuda_reset_timings(cu_nbv); + } + + wallcycle_stop(wcycle,ewcRUN); + wallcycle_reset_all(wcycle); + if (DOMAINDECOMP(cr)) + { + reset_dd_statistics_counters(cr->dd); + } + init_nrnb(nrnb); + ir->init_step += *step_rel; + ir->nsteps -= *step_rel; + *step_rel = 0; + wallcycle_start(wcycle,ewcRUN); + runtime_start(runtime); + print_date_and_time(fplog,cr->nodeid,"Restarted time",runtime); +} + double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,gmx_bool bCompact, int nstglobalcomm, @@ -115,15 +155,15 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], gmx_large_int_t step,step_rel; double run_time; double t,t0,lam0[efptNR]; - gmx_bool bGStatEveryStep,bGStat,bNstEner,bCalcEnerPres,bEnergyHere; - gmx_bool bNS,bNStList,bSimAnn,bStopCM,bRerunMD,bNotLastFrame=FALSE, + gmx_bool bGStatEveryStep,bGStat,bCalcVir,bCalcEner; + gmx_bool bNS,bNStList,bSimAnn,bStopCM,bRerunMD,bNotLastFrame=FALSE, bFirstStep,bStateFromCP,bStateFromTPX,bInitStep,bLastStep, bBornRadii,bStartingFromCpt; gmx_bool bDoDHDL=FALSE,bDoFEP=FALSE,bDoExpanded=FALSE; - gmx_bool do_ene,do_log,do_verbose,bRerunWarnNoV=TRUE, + gmx_bool do_ene,do_log,do_verbose,bRerunWarnNoV=TRUE, bForceUpdate=FALSE,bCPT; int mdof_flags; - gmx_bool bMasterState; + gmx_bool bMasterState; int force_flags,cglo_flags; tensor force_vir,shake_vir,total_vir,tmp_vir,pres; int i,m; @@ -189,6
+229,10 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], gmx_large_int_t multisim_nsteps=-1; /* number of steps to do before first multisim simulation stops. If equal to zero, don't communicate any more between multisims.*/ + /* PME load balancing data for GPU kernels */ + pme_switch_t pme_switch=NULL; + double cycles_pmes; + gmx_bool bPMETuneTry=FALSE,bPMETuneRunning=FALSE; if(MASTER(cr)) { @@ -204,7 +248,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], /* Temporary addition for FAHCORE checkpointing */ int chkpt_ret; #endif - + /* Check for special mdrun options */ bRerunMD = (Flags & MD_RERUN); bIonize = (Flags & MD_IONIZE); @@ -360,6 +404,8 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], a1 = top_global->natoms; } + forcerec_set_excl_load(fr,top,cr); + state = partdec_init_local_state(cr,state_global); f_global = f; @@ -369,7 +415,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], set_vsite_top(vsite,top,mdatoms,cr); } - if (ir->ePBC != epbcNONE && !ir->bPeriodicMols) { + if (ir->ePBC != epbcNONE && !fr->bMolPBC) { graph = mk_graph(fplog,&(top->idef),0,top_global->natoms,FALSE,FALSE); } @@ -377,6 +423,8 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], make_local_shells(cr,mdatoms,shellfc); } + init_bonded_thread_force_reduction(fr,&top->idef); + if (ir->pull && PAR(cr)) { dd_make_local_pull_groups(NULL,ir->pull,mdatoms); } @@ -390,6 +438,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], state,&f,mdatoms,top,fr, vsite,shellfc,constr, nrnb,wcycle,FALSE); + } update_mdatoms(mdatoms,state->lambda[efptMASS]); @@ -467,8 +516,30 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], if (repl_ex_nst > 0 && MASTER(cr)) { repl_ex = init_replica_exchange(fplog,cr->ms,state_global,ir, - repl_ex_nst,repl_ex_nex,repl_ex_seed); + repl_ex_nst,repl_ex_nex,repl_ex_seed); + } + + /* PME tuning is only supported with GPUs or PME nodes and not with rerun */ + if ((Flags & MD_TUNEPME) && + EEL_PME(fr->eeltype) && + fr->cutoff_scheme == ecutsVERLET && + (fr->nbv->bUseGPU || !(cr->duty & DUTY_PME)) && + !bRerunMD) + { + switch_pme_init(&pme_switch,ir,state->box,fr->ic,fr->pmedata); + cycles_pmes = 0; + if (cr->duty & DUTY_PME) + { + /* Start tuning right away, as we can't measure the load */ + bPMETuneRunning = TRUE; + } + else + { + /* Separate PME nodes, we can measure the PP/PME load balance */ + bPMETuneTry = TRUE; + } } + if (!ir->bContinuation && !bRerunMD) { if (mdatoms->cFREEZE && (state->flags & (1<<estV))) { /* Set the velocities of frozen particles to zero */ } } /* Set the free energy calculation frequency */ nstfep = ir->fepvals->nstdhdl; if (ir->bExpanded && (nstfep > ir->expandedvals->nstexpanded)) @@ -754,7 +825,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], } if (ir->efep != efepNO || ir->bSimTemp) - { + { /* find and set the current lambdas. If rerunning, we either read in a state, or a lambda value, requiring different logic.
*/ @@ -932,7 +1003,8 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], state_global,top_global,ir, state,&f,mdatoms,top,fr, vsite,shellfc,constr, - nrnb,wcycle,do_verbose); + nrnb,wcycle, + do_verbose && !bPMETuneRunning); wallcycle_stop(wcycle,ewcDOMDEC); /* If using an iterative integrator, reallocate space to match the decomposition */ } @@ -948,7 +1020,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], update_mdatoms(mdatoms,state->lambda[efptMASS]); } - if (bRerunMD && rerun_fr.bV) + if ((bRerunMD && rerun_fr.bV) || bExchanged) { /* We need the kinetic energy at minus the half step for determining @@ -1000,18 +1072,20 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], /* Determine the energy and pressure: * at nstcalcenergy steps and at energy output steps (set below). */ - - if (EI_VV(ir->eI) && (!bInitStep)) { /* for vv, the first half actually corresponds to the last step */ - bNstEner = do_per_step(step-1,ir->nstcalcenergy); - } else { - bNstEner = do_per_step(step,ir->nstcalcenergy); + if (EI_VV(ir->eI) && (!bInitStep)) + { + /* for vv, the first half actually corresponds to the last step */ + bCalcEner = do_per_step(step-1,ir->nstcalcenergy); } - bCalcEnerPres = - (bNstEner || - (ir->epc > epcNO && do_per_step(step,ir->nstpcouple))); + else + { + bCalcEner = do_per_step(step,ir->nstcalcenergy); + } + bCalcVir = bCalcEner || + (ir->epc != epcNO && do_per_step(step,ir->nstpcouple)); /* Do we need global communication ? */ - bGStat = (bCalcEnerPres || bStopCM || + bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step,nstglobalcomm) || (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck)); @@ -1019,8 +1093,9 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], if (do_ene || do_log) { - bCalcEnerPres = TRUE; - bGStat = TRUE; + bCalcVir = TRUE; + bCalcEner = TRUE; + bGStat = TRUE; } /* these CGLO_ options remain the same throughout the iteration */ @@ -1033,7 +1108,8 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], GMX_FORCE_ALLFORCES | (bNStList ? GMX_FORCE_DOLR : 0) | GMX_FORCE_SEPLRF | - (bCalcEnerPres ? GMX_FORCE_VIRIAL : 0) | + (bCalcVir ? GMX_FORCE_VIRIAL : 0) | + (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0) ); @@ -1063,7 +1139,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], * This is parallellized as well, and does communication too. * Check comments in sim_util.c */ - do_force(fplog,cr,ir,step,nrnb,wcycle,top,top_global,groups, + do_force(fplog,cr,ir,step,nrnb,wcycle,top,top_global,groups, state->box,state->x,&state->hist, f,force_vir,mdatoms,enerd,fcd, state->lambda,graph, @@ -1103,7 +1179,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ1); } - update_coords(fplog,step,ir,mdatoms,state, + update_coords(fplog,step,ir,mdatoms,state,fr->bMolPBC, f,fr->bTwinRange && bNStList,fr->f_twin,fcd, ekind,M,wcycle,upd,bInitStep,etrtVELOCITY1, cr,nrnb,constr,&top->idef); @@ -1149,10 +1225,11 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], if ( !bRerunMD || rerun_fr.bV || bForceUpdate) { /* Why is rerun_fr.bV here? Unclear. 
*/ dvdl = 0; - update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,state,graph,f, + update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms, + state,fr->bMolPBC,graph,f, &top->idef,shake_vir,NULL, cr,nrnb,wcycle,upd,constr, - bInitStep,TRUE,bCalcEnerPres,vetanew); + bInitStep,TRUE,bCalcVir,vetanew); if (!bOK && !bFFscan) { @@ -1165,7 +1242,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], called in the previous step */ unshift_self(graph,state->box,state->x); } - + /* if VV, compute the pressure and constraints */ /* For VV2, we strictly only need this if using pressure @@ -1174,11 +1251,11 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], * Think about ways around this in the future? * For now, keep this choice in comments. */ - /* bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */ - /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/ + /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */ + /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/ bPres = TRUE; bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK)); - if (bNstEner && ir->eI==eiVVAK) /*MRS: 7/9/2010 -- this still doesn't fix it?*/ + if (bCalcEner && ir->eI==eiVVAK) /*MRS: 7/9/2010 -- this still doesn't fix it?*/ { bSumEkinhOld = TRUE; } @@ -1213,6 +1290,20 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], } else { + if (bExchanged) + { + + /* We need the kinetic energy at minus the half step for determining + * the full step kinetic energy and possibly for T-coupling.*/ + /* This may not be quite working correctly yet . . . . */ + compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm, + wcycle,enerd,NULL,NULL,NULL,NULL,mu_tot, + constr,NULL,FALSE,state->box, + top_global,&pcurr,top_global->natoms,&bSumEkinhOld, + CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE); + } + + update_tcouple(fplog,step,ir,state,ekind,wcycle,upd,&MassQ,mdatoms); } } @@ -1249,7 +1340,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], GMX_MPE_LOG(ev_timestep1); } - + /* MRS -- now done iterating -- compute the conserved quantity */ if (bVV) { saved_conserved_quantity = compute_conserved_from_auxiliary(ir,state,&MassQ); @@ -1272,7 +1363,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], actually move to the new state before outputting statistics, but if performing simulated tempering, we do update the velocities and the tau_t. */ - + lamnew = ExpandedEnsembleDynamics(fplog,ir,enerd,state,&MassQ,&df_history,step,mcrng,state->v,mdatoms); } /* ################## START TRAJECTORY OUTPUT ################# */ @@ -1359,8 +1450,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], * at the last step. */ fprintf(stderr,"\nWriting final coordinates.\n"); - if (ir->ePBC != epbcNONE && !ir->bPeriodicMols && - DOMAINDECOMP(cr)) + if (fr->bMolPBC) { /* Make molecules whole only for confout writing */ do_pbc_mtop(fplog,ir->ePBC,state->box,top_global,state_global->x); @@ -1383,30 +1473,9 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], } /* ################## END TRAJECTORY OUTPUT ################ */ - /* Determine the pressure: - * always when we want exact averages in the energy file, - * at ns steps when we have pressure coupling, - * otherwise only at energy output steps (set below). 
- */ - - - bNstEner = (bGStatEveryStep || do_per_step(step,ir->nstcalcenergy)); - bCalcEnerPres = bNstEner; - - /* Do we need global communication ? */ - bGStat = (bGStatEveryStep || bStopCM || bNS || - (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck)); - - do_ene = (do_per_step(step,ir->nstenergy) || bLastStep); - - if (do_ene || do_log) - { - bCalcEnerPres = TRUE; - bGStat = TRUE; - } - /* Determine the wallclock run time up till now */ run_time = gmx_gettime() - (double)runtime->real; + /* Check whether everything is still allright */ if (((int)gmx_get_stop_condition() > handled_stop_condition) #ifdef GMX_THREAD_MPI @@ -1493,7 +1562,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], { gs.sig[eglsCHKPT] = 1; } - + /* at the start of step, randomize the velocities */ if (ETC_ANDERSEN(ir->etc) && EI_VV(ir->eI)) @@ -1503,10 +1572,11 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */ if (bDoAndersenConstr) { - update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,state,graph,f, + update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms, + state,fr->bMolPBC,graph,f, &top->idef,tmp_vir,NULL, cr,nrnb,wcycle,upd,constr, - bInitStep,TRUE,FALSE,vetanew); + bInitStep,TRUE,bCalcVir,vetanew); } } @@ -1581,7 +1651,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], if (bVV) { /* velocity half-step update */ - update_coords(fplog,step,ir,mdatoms,state,f, + update_coords(fplog,step,ir,mdatoms,state,fr->bMolPBC,f, fr->bTwinRange && bNStList,fr->f_twin,fcd, ekind,M,wcycle,upd,FALSE,etrtVELOCITY2, cr,nrnb,constr,&top->idef); @@ -1597,16 +1667,18 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], copy_rvecn(state->x,cbuf,0,state->natoms); } - update_coords(fplog,step,ir,mdatoms,state,f,fr->bTwinRange && bNStList,fr->f_twin,fcd, + update_coords(fplog,step,ir,mdatoms,state,fr->bMolPBC,f, + fr->bTwinRange && bNStList,fr->f_twin,fcd, ekind,M,wcycle,upd,bInitStep,etrtPOSITION,cr,nrnb,constr,&top->idef); wallcycle_stop(wcycle,ewcUPDATE); - update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,state,graph,f, + update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,state, + fr->bMolPBC,graph,f, &top->idef,shake_vir,force_vir, cr,nrnb,wcycle,upd,constr, - bInitStep,FALSE,bCalcEnerPres,state->veta); + bInitStep,FALSE,bCalcVir,state->veta); - if (ir->eI==eiVVAK) + if (ir->eI==eiVVAK) { /* erase F_EKIN and F_TEMP here? 
*/ /* just compute the kinetic energy at the half step to perform a trotter step */ @@ -1614,14 +1686,15 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot, constr,NULL,FALSE,lastbox, top_global,&pcurr,top_global->natoms,&bSumEkinhOld, - cglo_flags | CGLO_TEMPERATURE + cglo_flags | CGLO_TEMPERATURE ); wallcycle_start(wcycle,ewcUPDATE); trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ4); /* now we know the scaling, we can compute the positions again again */ copy_rvecn(cbuf,state->x,0,state->natoms); - update_coords(fplog,step,ir,mdatoms,state,f,fr->bTwinRange && bNStList,fr->f_twin,fcd, + update_coords(fplog,step,ir,mdatoms,state,fr->bMolPBC,f, + fr->bTwinRange && bNStList,fr->f_twin,fcd, ekind,M,wcycle,upd,bInitStep,etrtPOSITION,cr,nrnb,constr,&top->idef); wallcycle_stop(wcycle,ewcUPDATE); @@ -1630,10 +1703,11 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], * to numerical errors, or are they important * physically? I'm thinking they are just errors, but not completely sure. * For now, will call without actually constraining, constr=NULL*/ - update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,state,graph,f, + update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms, + state,fr->bMolPBC,graph,f, &top->idef,tmp_vir,force_vir, + cr,nrnb,wcycle,upd,NULL, - bInitStep,FALSE,bCalcEnerPres, + bInitStep,FALSE,bCalcVir, + state->veta); } if (!bOK && !bFFscan) @@ -1675,32 +1749,39 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], } /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints ############ */ - if (ir->nstlist == -1 && bFirstIterate) + /* With Leap-Frog we can skip compute_globals at + * non-communication steps, but we need to calculate + * the kinetic energy one step before communication. + */ + if (bGStat || do_per_step(step+1,nstglobalcomm) || + EI_VV(ir->eI)) { - gs.sig[eglsNABNSB] = nlh.nabnsb; - } - bEnergyHere = (!EI_VV(ir->eI) || (EI_VV(ir->eI) && bRerunMD)); /* this is not quite working for vv and rerun! fails for running rerun on multiple threads. This is caught in runner.c. */ - compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm, - wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot, - constr, - bFirstIterate ? &gs : NULL, - (step_rel % gs.nstms == 0) && + if (ir->nstlist == -1 && bFirstIterate) + { + gs.sig[eglsNABNSB] = nlh.nabnsb; + } + compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm, + wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot, + constr, + bFirstIterate ? &gs : NULL, + (step_rel % gs.nstms == 0) && (multisim_nsteps<0 || (step_rel<multisim_nsteps)), - lastbox, - top_global,&pcurr,top_global->natoms,&bSumEkinhOld, - cglo_flags - | (!EI_VV(ir->eI) ? CGLO_ENERGY : 0) - | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0) - | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0) - | (bEnergyHere || bRerunMD ? CGLO_PRESSURE : 0) - | (bIterations && iterate.bIterate ? CGLO_ITERATE : 0) - | (bFirstIterate ? CGLO_FIRSTITERATE : 0) - | CGLO_CONSTRAINT - ); - if (ir->nstlist == -1 && bFirstIterate) - { - nlh.nabnsb = gs.set[eglsNABNSB]; - gs.set[eglsNABNSB] = 0; + lastbox, + top_global,&pcurr,top_global->natoms,&bSumEkinhOld, + cglo_flags + | (!EI_VV(ir->eI) ? CGLO_ENERGY : 0) + | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0) + | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0) + | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0) + | (bIterations && iterate.bIterate ? CGLO_ITERATE : 0) + | (bFirstIterate ?
CGLO_FIRSTITERATE : 0) + | CGLO_CONSTRAINT + ); + if (ir->nstlist == -1 && bFirstIterate) + { + nlh.nabnsb = gs.set[eglsNABNSB]; + gs.set[eglsNABNSB] = 0; + } } /* bIterate is set to keep it from eliminating the old ekin kinetic energy terms */ /* ############# END CALC EKIN AND PRESSURE ################# */ @@ -1774,7 +1855,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], } /* ######### BEGIN PREPARING EDR OUTPUT ########### */ - + /* use the directly determined last velocity, not actually the averaged half steps */ if (bTrotter && ir->eI==eiVV) { @@ -1825,9 +1906,9 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], } if (!(bStartingFromCpt && (EI_VV(ir->eI)))) { - if (bNstEner) + if (bCalcEner) { - upd_mdebin(mdebin,bDoDHDL,TRUE, + upd_mdebin(mdebin,bDoDHDL, TRUE, t,mdatoms->tmass,enerd,state, ir->fepvals,ir->expandedvals,lastbox, shake_vir,force_vir,total_vir,pres, @@ -1868,7 +1949,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], } } /* Remaining runtime */ - if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal() )) + if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning) { if (shellfc) { @@ -1947,12 +2028,58 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], { dd_cycles_add(cr->dd,cycles,ddCyclStep); } + + if (bPMETuneRunning || bPMETuneTry) + { + /* PME grid + cut-off optimization with GPUs or PME nodes */ + + /* Count the total cycles over the last steps */ + cycles_pmes += cycles; + + /* We can only switch cut-off at NS steps */ + if (step % ir->nstlist == 0) + { + /* PME grid + cut-off optimization with GPUs or PME nodes */ + if (bPMETuneTry) + { + if (DDMASTER(cr->dd)) + { + /* PME node load is too high, start tuning */ + bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05); + } + dd_bcast(cr->dd,sizeof(gmx_bool),&bPMETuneRunning); + + if (bPMETuneRunning || step_rel > ir->nstlist*50) + { + bPMETuneTry = FALSE; + } + } + if (bPMETuneRunning) + { + /* init_step might not be a multiple of nstlist, + * but the first cycle is always skipped anyhow. + */ + bPMETuneRunning = + switch_pme(pme_switch,cr, + (bVerbose && MASTER(cr)) ? stderr : NULL, + fplog, + ir,state,cycles_pmes, + fr->ic,fr->nbv,&fr->pmedata, + step); + + fr->ewaldcoeff = fr->ic->ewaldcoeff; + } + + cycles_pmes = 0; + } + } if (step_rel == wcycle_get_reset_counters(wcycle) || gs.set[eglsRESETCOUNTERS] != 0) { /* Reset all the counters related to performance over the run */ - reset_all_counters(fplog,cr,step,&step_rel,ir,wcycle,nrnb,runtime); + reset_all_counters(fplog,cr,step,&step_rel,ir,wcycle,nrnb,runtime, + fr->nbv != NULL && fr->nbv->bUseGPU ? 
fr->nbv->cu_nbv : NULL); wcycle_set_reset_counters(wcycle,-1); /* Correct max_hours for the elapsed time */ max_hours -= run_time/(60.0*60.0); @@ -1975,7 +2102,7 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], if (!(cr->duty & DUTY_PME)) { /* Tell the PME only node to finish */ - gmx_pme_finish(cr); + gmx_pme_send_finish(cr); } if (MASTER(cr)) @@ -2011,6 +2138,6 @@ double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[], } runtime->nsteps_done = step_rel; - - return 0; + + return 0; } diff --git a/src/kernel/md_openmm.c b/src/kernel/md_openmm.c index 30e2e0f3ec..80df1d1827 100644 --- a/src/kernel/md_openmm.c +++ b/src/kernel/md_openmm.c @@ -56,6 +56,7 @@ #include "trnio.h" #include "xtcio.h" #include "mdrun.h" +#include "md_support.h" #include "confio.h" #include "network.h" #include "pull.h" diff --git a/src/kernel/mdrun.c b/src/kernel/mdrun.c index 1d011082be..316b485143 100644 --- a/src/kernel/mdrun.c +++ b/src/kernel/mdrun.c @@ -156,9 +156,77 @@ int main(int argc,char *argv[]) "([TT]-x[tt]).[PAR]", "The option [TT]-dhdl[tt] is only used when free energy calculation is", "turned on.[PAR]", - "When [TT]mdrun[tt] is started using MPI with more than 1 node, parallelization", - "is used. By default domain decomposition is used, unless the [TT]-pd[tt]", - "option is set, which selects particle decomposition.[PAR]", + "A simulation can be run in parallel using two different parallelization", + "schemes: MPI parallelization and/or OpenMP thread parallelization.", + "The MPI parallelization uses multiple processes when [TT]mdrun[tt] is", + "compiled with a normal MPI library or threads when [TT]mdrun[tt] is", + "compiled with the GROMACS built-in thread-MPI library. OpenMP threads", + "are supported when mdrun is compiled with OpenMP. Full OpenMP support", + "is only available with the Verlet cut-off scheme; with the (older)", + "group scheme, only PME-only processes can use OpenMP parallelization.", + "In all cases [TT]mdrun[tt] will by default try to use all the available", + "hardware resources. With a normal MPI library only the options", + "[TT]-ntomp[tt] (with the Verlet cut-off scheme) and [TT]-ntomp_pme[tt],", + "for PME-only processes, can be used to control the number of threads.", + "With thread-MPI there are additional options [TT]-nt[tt], which sets", + "the total number of threads, and [TT]-ntmpi[tt], which sets the number", + "of thread-MPI threads.", + "Note that using combined MPI+OpenMP parallelization is almost always", + "slower than single parallelization, except at the scaling limit, where", + "especially OpenMP parallelization of PME reduces the communication cost.", + "[PAR]", + "To quickly test the performance of the new Verlet cut-off scheme", + "with old [TT].tpr[tt] files, either on CPUs or CPUs+GPUs, you can use", + "the [TT]-testverlet[tt] option. This should not be used for production,", + "since it can slightly modify potentials and it will remove charge groups,", + "making analysis difficult, as the [TT].tpr[tt] file will still contain", + "charge groups. For production simulations it is highly recommended", + "to specify [TT]cutoff-scheme = Verlet[tt] in the [TT].mdp[tt] file.", + "[PAR]", + "With GPUs (only supported with the Verlet cut-off scheme), the number", + "of GPUs should match the number of MPI processes or MPI threads,", + "excluding PME-only processes/threads.
With thread-MPI the number", + "of MPI threads will automatically be set to the number of GPUs detected.", + "When you want to use a subset of the available GPUs, you can use", + "the [TT]-gpu_id[tt] option, where GPU id's are passed as a string,", + "e.g. 02 for using GPUs 0 and 2. When you want different GPU id's", + "on different nodes of a compute cluster, use the GMX_GPU_ID environment", + "variable instead. The format for GMX_GPU_ID is identical to ", + "[TT]-gpu_id[tt], but an environment variable can have different values", + "on different nodes of a cluster.", + "[PAR]", + "When using PME with separate PME nodes or with a GPU, the two major", + "compute tasks, the non-bonded force calculation and the PME calculation", + "run on different compute resources. If this load is not balanced,", + "some of the resources will be idle part of the time. With the Verlet", + "cut-off scheme this load is automatically balanced when the PME load", + "is too high (but not when it is too low). This is done by scaling", + "the Coulomb cut-off and PME grid spacing by the same amount. In the first", + "few hundred steps different settings are tried and the fastest is chosen", + "for the rest of the simulation. This does not affect the accuracy of", + "the results, but it does affect the decomposition of the Coulomb energy", + "into particle and mesh contributions. The auto-tuning can be turned off", + "with the option [TT]-notunepme[tt].", + "[PAR]", + "When compiled with OpenMP on Linux, [TT]mdrun[tt] pins threads to cores,", + "as this usually results in significantly better performance.", + "If you don't want this, use [TT]-nopin[tt].", + "With Intel CPUs with hyper-threading enabled, you should pin", + "consecutive threads to the same physical core for optimal", + "performance when you use virtual cores. This is done automatically", + "when you use more than half of the virtual cores. It can also be set", + "manually with [TT]-pinht[tt], e.g. for running multiple simulations", + "on one compute node.", + "When running multiple mdrun (or other) simulations on the same physical", + "node, some simulations need to start pinning from a non-zero core", + "to avoid overloading cores; with [TT]-pinoffset[tt] you can specify", + "the offset in (physical) cores for pinning.", + "[PAR]", + "When [TT]mdrun[tt] is started using MPI with more than 1 process", + "or with thread-MPI with more than 1 thread, MPI parallelization is used.", + "By default domain decomposition is used, unless the [TT]-pd[tt]", + "option is set, which selects particle decomposition.", + "[PAR]", "With domain decomposition, the spatial decomposition can be set", "with option [TT]-dd[tt].
By default [TT]mdrun[tt] selects a good decomposition.", "The user only needs to change this when the system is very inhomogeneous.", @@ -401,6 +469,8 @@ int main(int argc,char *argv[]) gmx_bool bPartDec = FALSE; gmx_bool bDDBondCheck = TRUE; gmx_bool bDDBondComm = TRUE; + gmx_bool bTunePME = TRUE; + gmx_bool bTestVerlet = FALSE; gmx_bool bVerbose = FALSE; gmx_bool bCompact = TRUE; gmx_bool bSepPot = FALSE; @@ -416,14 +486,16 @@ int main(int argc,char *argv[]) int repl_ex_seed=-1; int repl_ex_nex=0; int nstepout=100; - int nthreads=0; /* set to determine # of threads automatically */ int resetstep=-1; + int nsteps=-2; /* the value -2 means that the mdp option will be used */ rvec realddxyz={0,0,0}; const char *ddno_opt[ddnoNR+1] = { NULL, "interleave", "pp_pme", "cartesian", NULL }; - const char *dddlb_opt[] = + const char *dddlb_opt[] = { NULL, "auto", "no", "yes", NULL }; + const char *nbpu_opt[] = + { NULL, "auto", "cpu", "gpu", "gpu_cpu", NULL }; real rdd=0.0,rconstr=0.0,dlb_scale=0.8,pforce=-1; char *ddcsx=NULL,*ddcsy=NULL,*ddcsz=NULL; real cpt_period=15.0,max_hours=-1; @@ -433,20 +505,34 @@ int main(int argc,char *argv[]) output_env_t oenv=NULL; const char *deviceOptions = ""; + gmx_hw_opt_t hw_opt={0,0,0,0,TRUE,FALSE,0,NULL}; + t_pargs pa[] = { { "-pd", FALSE, etBOOL,{&bPartDec}, "Use particle decompostion" }, { "-dd", FALSE, etRVEC,{&realddxyz}, "Domain decomposition grid, 0 is optimize" }, -#ifdef GMX_THREAD_MPI - { "-nt", FALSE, etINT, {&nthreads}, - "Number of threads to start (0 is guess)" }, -#endif - { "-npme", FALSE, etINT, {&npme}, - "Number of separate nodes to be used for PME, -1 is guess" }, { "-ddorder", FALSE, etENUM, {ddno_opt}, "DD node order" }, + { "-npme", FALSE, etINT, {&npme}, + "Number of separate nodes to be used for PME, -1 is guess" }, + { "-nt", FALSE, etINT, {&hw_opt.nthreads_tot}, + "Total number of threads to start (0 is guess)" }, + { "-ntmpi", FALSE, etINT, {&hw_opt.nthreads_tmpi}, + "Number of thread-MPI threads to start (0 is guess)" }, + { "-ntomp", FALSE, etINT, {&hw_opt.nthreads_omp}, + "Number of OpenMP threads to start (0 is guess)" }, + { "-ntomp_pme", FALSE, etINT, {&hw_opt.nthreads_omp_pme}, + "Number of OpenMP threads to start (0 is -ntomp)" }, + { "-pin", FALSE, etBOOL, {&hw_opt.bThreadPinning}, + "Pin OpenMP threads to cores" }, + { "-pinht", FALSE, etBOOL, {&hw_opt.bPinHyperthreading}, + "Always pin threads to Hyper-Threading cores" }, + { "-pinoffset", FALSE, etINT, {&hw_opt.core_pinning_offset}, + "Core offset for pinning (for running multiple mdrun processes on a single physical node)" }, + { "-gpu_id", FALSE, etSTR, {&hw_opt.gpu_id}, + "List of GPU id's to use" }, { "-ddcheck", FALSE, etBOOL, {&bDDBondCheck}, "Check for all bonded interactions with DD" }, { "-ddbondcomm", FALSE, etBOOL, {&bDDBondComm}, @@ -467,6 +553,12 @@ int main(int argc,char *argv[]) "HIDDENThe DD cell sizes in z" }, { "-gcom", FALSE, etINT,{&nstglobalcomm}, "Global communication frequency" }, + { "-nb", FALSE, etENUM, {&nbpu_opt}, + "Calculate non-bonded interactions on" }, + { "-tunepme", FALSE, etBOOL, {&bTunePME}, + "Optimize PME load between PP/PME nodes or GPU/CPU" }, + { "-testverlet", FALSE, etBOOL, {&bTestVerlet}, + "Test the Verlet non-bonded scheme" }, { "-v", FALSE, etBOOL,{&bVerbose}, "Be loud and noisy" }, { "-compact", FALSE, etBOOL,{&bCompact}, @@ -483,6 +575,8 @@ int main(int argc,char *argv[]) "Keep and number checkpoint files" }, { "-append", FALSE, etBOOL, {&bAppendFiles}, "Append to previous output files when continuing from checkpoint instead 
of adding the simulation part number to all file names" }, + { "-nsteps", FALSE, etINT, {&nsteps}, + "Run this number of steps, overrides .mdp file option" }, { "-maxh", FALSE, etREAL, {&max_hours}, "Terminate after 0.99 times this time (hours)" }, { "-multi", FALSE, etINT,{&nmultisim}, @@ -555,10 +649,6 @@ int main(int argc,char *argv[]) dd_node_order = nenum(ddno_opt); cr->npmenodes = npme; -#ifndef GMX_THREAD_MPI - nthreads=1; -#endif - /* now check the -multi and -multidir option */ if (opt2bSet("-multidir", NFILE, fnm)) { @@ -647,6 +737,8 @@ int main(int argc,char *argv[]) Flags = Flags | (bPartDec ? MD_PARTDEC : 0); Flags = Flags | (bDDBondCheck ? MD_DDBONDCHECK : 0); Flags = Flags | (bDDBondComm ? MD_DDBONDCOMM : 0); + Flags = Flags | (bTunePME ? MD_TUNEPME : 0); + Flags = Flags | (bTestVerlet ? MD_TESTVERLET : 0); Flags = Flags | (bConfout ? MD_CONFOUT : 0); Flags = Flags | (bRerunVSite ? MD_RERUN_VSITE : 0); Flags = Flags | (bReproducible ? MD_REPRODUCIBLE : 0); @@ -662,7 +754,8 @@ int main(int argc,char *argv[]) there instead. */ if ((MASTER(cr) || bSepPot) && !bAppendFiles) { - gmx_log_open(ftp2fn(efLOG,NFILE,fnm),cr,!bSepPot,Flags,&fplog); + gmx_log_open(ftp2fn(efLOG,NFILE,fnm),cr, + !bSepPot,Flags & MD_APPENDFILES,&fplog); CopyRight(fplog,argv[0]); please_cite(fplog,"Hess2008b"); please_cite(fplog,"Spoel2005a"); @@ -682,10 +775,12 @@ int main(int argc,char *argv[]) ddxyz[YY] = (int)(realddxyz[YY] + 0.5); ddxyz[ZZ] = (int)(realddxyz[ZZ] + 0.5); - rc = mdrunner(nthreads, fplog,cr,NFILE,fnm,oenv,bVerbose,bCompact, + rc = mdrunner(&hw_opt, fplog,cr,NFILE,fnm,oenv,bVerbose,bCompact, nstglobalcomm, ddxyz,dd_node_order,rdd,rconstr, dddlb_opt[0],dlb_scale,ddcsx,ddcsy,ddcsz, - nstepout,resetstep,nmultisim,repl_ex_nst,repl_ex_nex,repl_ex_seed, + nbpu_opt[0], + nsteps,nstepout,resetstep, + nmultisim,repl_ex_nst,repl_ex_nex,repl_ex_seed, pforce, cpt_period,max_hours,deviceOptions,Flags); gmx_finalize_par(); diff --git a/src/kernel/membed.c b/src/kernel/membed.c index 050dce7397..7037c7aa8e 100644 --- a/src/kernel/membed.c +++ b/src/kernel/membed.c @@ -108,14 +108,18 @@ static int get_mol_id(int at, gmx_mtop_t *mtop, int *type, int *block) int mol_id=0; int i; int atnr_mol; + gmx_mtop_atomlookup_t alook; - gmx_mtop_atomnr_to_molblock_ind(mtop,at,block,&mol_id,&atnr_mol); + alook = gmx_mtop_atomlookup_settle_init(mtop); + gmx_mtop_atomnr_to_molblock_ind(alook,at,block,&mol_id,&atnr_mol); for(i=0;i<*block;i++) { mol_id += mtop->molblock[i].nmol; } *type = mtop->molblock[*block].type; + gmx_mtop_atomlookup_destroy(alook); + return mol_id; } diff --git a/src/kernel/membed.h b/src/kernel/membed.h index 6e03136367..25133045aa 100644 --- a/src/kernel/membed.h +++ b/src/kernel/membed.h @@ -36,6 +36,7 @@ #define _gmx_membed_h #include "typedefs.h" +#include "types/membedt.h" #ifdef __cplusplus extern "C" { diff --git a/src/kernel/openmm_wrapper.cpp b/src/kernel/openmm_wrapper.cpp index 5cc4999c08..297bbc1a0b 100644 --- a/src/kernel/openmm_wrapper.cpp +++ b/src/kernel/openmm_wrapper.cpp @@ -63,7 +63,7 @@ using namespace std; #include "mdrun.h" #include "physics.h" #include "string2.h" -#include "gmx_gpu_utils.h" +#include "gpu_utils.h" #include "mtop_util.h" #include "openmm_wrapper.h" @@ -1309,7 +1309,7 @@ void* openmm_init(FILE *fplog, const char *platformOptStr, /* check GPU compatibility */ char gpuname[STRLEN]; devId = atoi(opt->getOptionValue("deviceid").c_str()); - if (!is_supported_cuda_gpu(-1, gpuname)) + if (!is_gmx_openmm_supported_gpu(-1, gpuname)) { if 
(!gmx_strcasecmp(opt->getOptionValue("force-device").c_str(), "yes")) { diff --git a/src/kernel/pme_switch.c b/src/kernel/pme_switch.c new file mode 100644 index 0000000000..bdae8b7d1c --- /dev/null +++ b/src/kernel/pme_switch.c @@ -0,0 +1,529 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 4.6.0 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2011, The GROMACS development team, + * check out http://www.gromacs.org for more information. + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon + */ +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include "smalloc.h" +#include "network.h" +#include "calcgrid.h" +#include "pme.h" +#include "vec.h" +#include "domdec.h" +#include "nbnxn_cuda_data_mgmt.h" +#include "force.h" +#include "pme_switch.h" + +typedef struct { + real rcut; + real rlist; + real spacing; + ivec grid; + real grid_eff; + real coeff; + gmx_pme_t pmedata; + + int count; + double cycles; +} pme_setup_t; + +/* In the initial scan, step to grids that are at least a factor 0.8 coarser */ +#define PMES_GRID_SCALE_FAC 0.8 +/* In the initial scan, try to skip grids with uneven x/y/z spacing, + * checking if the "efficiency" is more than 5% worse than the previous grid. + */ +#define PMES_GRID_EFF_FAC 1.05 +/* Rerun setups up to 12% slower than the fastest found so far */ +#define PMES_SLOW_FAC 1.12
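/* Hedged toy model (stand-alone, not part of this patch) of the stage-0
 * scan gated by PMES_SLOW_FAC above: keep trying coarser-grid setups until
 * one times in at more than 12% over the fastest seen so far, then stop
 * scanning. The per-setup cycle counts are invented for illustration.
 */
#include <stdio.h>

#define PMES_SLOW_FAC 1.12

int main(void)
{
    double cycles[] = { 100, 93, 88, 90, 102, 120 };
    int    n = (int)(sizeof(cycles)/sizeof(cycles[0]));
    int    cur, fastest = 0;

    for (cur = 1; cur < n; cur++)
    {
        if (cycles[cur] < cycles[fastest])
        {
            fastest = cur;
        }
        if (cycles[cur] > cycles[fastest]*PMES_SLOW_FAC)
        {
            break; /* too slow: stop scanning, move on to stage 1 */
        }
    }
    printf("stopped at setup %d; fastest is setup %d (%.0f cycles)\n",
           cur, fastest, cycles[fastest]);
    return 0;
}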
+/* If setups get more than 2% faster, do another round to avoid + * choosing a slower setup due to acceleration or fluctuations. + */ +#define PMES_ACCEL_TOL 1.02 + +typedef struct pme_switch { + int nstage; /* the current maximum number of stages */ + + real cut_spacing; /* the minimum cutoff / PME grid spacing ratio */ + real rbuf; /* the pairlist buffer size */ + matrix box_start; /* the initial simulation box */ + int n; /* the count of setups as well as the allocation size */ + pme_setup_t *setup; /* the PME+cutoff setups */ + int cur; /* the current setup */ + int fastest; /* fastest setup up till now */ + int start; /* start of setup range to consider in stage>0 */ + int end; /* end of setup range to consider in stage>0 */ + + int stage; /* the current stage */ +} t_pme_switch; + +void switch_pme_init(pme_switch_t *pmes_p, + const t_inputrec *ir,matrix box, + const interaction_const_t *ic, + gmx_pme_t pmedata) +{ + pme_switch_t pmes; + real spm,sp; + int d; + + snew(pmes,1); + + /* Any number of stages >= 2 is supported */ + pmes->nstage = 2; + + pmes->rbuf = ic->rlist - ic->rcoulomb; + + copy_mat(box,pmes->box_start); + if (ir->ePBC==epbcXY && ir->nwall==2) + { + svmul(ir->wall_ewald_zfac,pmes->box_start[ZZ],pmes->box_start[ZZ]); + } + + pmes->n = 1; + snew(pmes->setup,pmes->n); + + pmes->cur = 0; + pmes->setup[0].rcut = ic->rcoulomb; + pmes->setup[0].rlist = ic->rlist; + pmes->setup[0].grid[XX] = ir->nkx; + pmes->setup[0].grid[YY] = ir->nky; + pmes->setup[0].grid[ZZ] = ir->nkz; + pmes->setup[0].coeff = ic->ewaldcoeff; + + pmes->setup[0].pmedata = pmedata; + + spm = 0; + for(d=0; d<DIM; d++) + { + sp = norm(pmes->box_start[d])/pmes->setup[0].grid[d]; + if (sp > spm) + { + spm = sp; + } + } + pmes->setup[0].spacing = spm; + + if (ir->fourier_spacing > 0) + { + pmes->cut_spacing = ir->rcoulomb/ir->fourier_spacing; + } + else + { + pmes->cut_spacing = ir->rcoulomb/pmes->setup[0].spacing; + } + + pmes->stage = 0; + + pmes->fastest = 0; + pmes->start = 0; + + *pmes_p = pmes; +} + +static gmx_bool switch_pme_increase_cutoff(pme_switch_t pmes,int pme_order) +{ + pme_setup_t *set; + real fac,sp; + int d; + + /* Try to add a new setup with next larger cut-off to the list */ + pmes->n++; + srenew(pmes->setup,pmes->n); + set = &pmes->setup[pmes->n-1]; + set->pmedata = NULL; + + fac = 1; + do + { + fac *= 1.01; + clear_ivec(set->grid); + sp = calc_grid(NULL,pmes->box_start, + fac*pmes->setup[pmes->cur].spacing, + &set->grid[XX], + &set->grid[YY], + &set->grid[ZZ]); + + /* In parallel we can't have grids smaller than 2*pme_order, + * and we would anyhow not gain much speed at these grid sizes. + */
+ for(d=0; d<DIM; d++) + { + if (set->grid[d] <= 2*pme_order) + { + pmes->n--; + + return FALSE; + } + } + } + while (sp <= 1.001*pmes->setup[pmes->cur].spacing); + + set->rcut = pmes->cut_spacing*sp; + set->rlist = set->rcut + pmes->rbuf; + set->spacing = sp; + /* The grid efficiency is the size wrt a grid with uniform x/y/z spacing */ + set->grid_eff = 1; + for(d=0; d<DIM; d++) + { + set->grid_eff *= (set->grid[d]*sp)/norm(pmes->box_start[d]); + } + /* The Ewald coefficient is inversely proportional to the cut-off */ + set->coeff = pmes->setup[0].coeff*pmes->setup[0].rcut/set->rcut; + + set->count = 0; + set->cycles = 0; + + if (debug) + { + fprintf(debug,"PME switch grid %d %d %d, cutoff %f\n", + set->grid[XX],set->grid[YY],set->grid[ZZ],set->rcut); + } + + return TRUE; +} + +static void print_grid(FILE *fp_err,FILE *fp_log, + const char *pre, + const char *desc, + const pme_setup_t *set, + double cycles) +{ + char buf[STRLEN],buft[STRLEN]; + + if (cycles >= 0) + { + sprintf(buft,": %.1f M-cycles",cycles*1e-6); + } + else + { + buft[0] = '\0'; + } + sprintf(buf,"%-11s%10s pme grid %d %d %d, cutoff %.3f%s", + pre, + desc,set->grid[XX],set->grid[YY],set->grid[ZZ],set->rcut, + buft); + if (fp_err != NULL) + { + fprintf(fp_err,"%s\n",buf); + } + if (fp_log != NULL) + { + fprintf(fp_log,"%s\n",buf); + } +} + +static void switch_to_stage1(pme_switch_t pmes) +{ + pmes->start = 0; + while (pmes->start+1 < pmes->n && + (pmes->setup[pmes->start].count == 0 || + pmes->setup[pmes->start].cycles > + pmes->setup[pmes->fastest].cycles*PMES_SLOW_FAC)) + { + pmes->start++; + } + while (pmes->start > 0 && pmes->setup[pmes->start-1].cycles == 0) + { + pmes->start--; + } + + pmes->end = pmes->n; + if (pmes->setup[pmes->end-1].count > 0 && + pmes->setup[pmes->end-1].cycles > + pmes->setup[pmes->fastest].cycles*PMES_SLOW_FAC) + { + pmes->end--; + } + + pmes->stage = 1; + + /* Start at start; 1 will be added immediately after returning */ + pmes->cur = pmes->start - 1; +} + +gmx_bool switch_pme(pme_switch_t pmes, + t_commrec *cr, + FILE *fp_err, + FILE *fp_log, + t_inputrec *ir, + t_state *state, + double cycles, + interaction_const_t *ic, + nonbonded_verlet_t *nbv, + gmx_pme_t *pmedata, + int step) +{ + gmx_bool OK; + pme_setup_t *set; + double cycles_fast; + char buf[STRLEN]; + + if (pmes->stage == pmes->nstage) + { + return FALSE; + } + + if (PAR(cr)) + { + gmx_sumd(1,&cycles,cr); + cycles /= cr->nnodes; + } + + set = &pmes->setup[pmes->cur]; + + set->count++; + if (set->count % 2 == 1) + { + /* Skip the first cycle, because the first step after a switch + * is much slower due to allocation and/or caching effects. + */ + return TRUE; + } + + sprintf(buf, "step %4d: ", step); + print_grid(fp_err,fp_log,buf,"timed with",set,cycles);
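/* Hedged numerical aside (stand-alone, not part of this patch) on the
 * cut-off scaling applied in switch_pme_increase_cutoff() above: because
 * the Ewald coefficient is scaled by rcut0/rcut, the product b*rc and
 * hence the real-space error erfc(b*rc) ~ ewald_rtol stay constant across
 * setups. The starting values are typical but invented; the actual timing
 * and staging logic continues below.
 */
#include <math.h>
#include <stdio.h>

int main(void)
{
    double rcut0 = 0.9, coeff0 = 3.47;  /* nm, nm^-1 (illustrative)     */
    double rcut1 = 1.1;
    double coeff1 = coeff0*rcut0/rcut1; /* inverse-proportional scaling */

    printf("b*rc: %.4f -> %.4f (unchanged)\n", coeff0*rcut0, coeff1*rcut1);
    printf("erfc(b*rc) = %.1e\n", erfc(coeff0*rcut0));
    return 0;
}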
+ + if (set->count <= 2) + { + set->cycles = cycles; + } + else + { + if (cycles*PMES_ACCEL_TOL < set->cycles && + pmes->stage == pmes->nstage - 1) + { + /* The performance went up a lot (due to e.g. DD load balancing). + * Add a stage, keep the minima, but rescan all setups. + */ + pmes->nstage++; + + if (debug) + { + fprintf(debug,"The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this is more than %f\n" + "Increased the number of stages to %d" + " and ignoring the previous performance\n", + set->grid[XX],set->grid[YY],set->grid[ZZ], + cycles*1e-6,set->cycles*1e-6,PMES_ACCEL_TOL, + pmes->nstage); + } + } + set->cycles = min(set->cycles,cycles); + } + + if (set->cycles < pmes->setup[pmes->fastest].cycles) + { + pmes->fastest = pmes->cur; + } + cycles_fast = pmes->setup[pmes->fastest].cycles; + + /* Check in stage 0 if we should stop scanning grids. + * Stop when the time is more than PMES_SLOW_FAC longer than the fastest. + */ + if (pmes->stage == 0 && pmes->cur > 0 && + cycles > pmes->setup[pmes->fastest].cycles*PMES_SLOW_FAC) + { + pmes->n = pmes->cur + 1; + /* Done with scanning, go to stage 1 */ + switch_to_stage1(pmes); + } + + if (pmes->stage == 0) + { + int gridsize_start; + + gridsize_start = set->grid[XX]*set->grid[YY]*set->grid[ZZ]; + + do + { + if (pmes->cur+1 < pmes->n) + { + /* We had already generated the next setup */ + OK = TRUE; + } + else + { + /* Find the next setup */ + OK = switch_pme_increase_cutoff(pmes,ir->pme_order); + } + + if (OK && ir->ePBC != epbcNONE) + { + OK = (sqr(pmes->setup[pmes->cur+1].rlist) + <= max_cutoff2(ir->ePBC,state->box)); + } + + if (OK) + { + pmes->cur++; + + if (DOMAINDECOMP(cr)) + { + OK = change_dd_cutoff(cr,state,ir, + pmes->setup[pmes->cur].rlist); + if (!OK) + { + /* Failed: do not use this setup */ + pmes->cur--; + } + } + } + if (!OK) + { + /* We hit the upper limit for the cut-off, + * the setup should not go further than cur. + */ + pmes->n = pmes->cur + 1; + /* Switch to the next stage */ + switch_to_stage1(pmes); + } + } + while (OK && + !(pmes->setup[pmes->cur].grid[XX]* + pmes->setup[pmes->cur].grid[YY]* + pmes->setup[pmes->cur].grid[ZZ] < + gridsize_start*PMES_GRID_SCALE_FAC + && + pmes->setup[pmes->cur].grid_eff < + pmes->setup[pmes->cur-1].grid_eff*PMES_GRID_EFF_FAC)); + } + + if (pmes->stage > 0 && pmes->end == 1) + { + pmes->cur = 0; + pmes->stage = pmes->nstage; + } + else if (pmes->stage > 0 && pmes->end > 1) + { + /* If stage = nstage-1: + * scan over all setups, rerunning only those setups + * which are not much slower than the fastest + * else: + * use the next setup + */ + do + { + pmes->cur++; + if (pmes->cur == pmes->end) + { + pmes->stage++; + pmes->cur = pmes->start; + } + } + while (pmes->stage == pmes->nstage - 1 && + pmes->setup[pmes->cur].count > 0 && + pmes->setup[pmes->cur].cycles > cycles_fast*PMES_SLOW_FAC); + + if (pmes->stage == pmes->nstage) + { + /* We are done optimizing, use the fastest setup we found */ + pmes->cur = pmes->fastest; + } + } + + if (DOMAINDECOMP(cr) && pmes->stage > 0) + { + OK = change_dd_cutoff(cr,state,ir,pmes->setup[pmes->cur].rlist); + if (!OK) + { + /* Failsafe solution */ + if (pmes->cur > 1 && pmes->stage == pmes->nstage) + { + pmes->stage--; + } + pmes->fastest = 0; + pmes->start = 0; + pmes->end = pmes->cur; + pmes->cur = pmes->start; + } + } + + /* Change the Coulomb cut-off and the PME grid */ + + set = &pmes->setup[pmes->cur]; + + ic->rcoulomb = set->rcut; + ic->rlist = set->rlist; + ic->ewaldcoeff = set->coeff; + + if (nbv->grp[0].kernel_type == nbk8x8x8_CUDA) + { + nbnxn_cuda_pmetune_update_param(nbv->cu_nbv,ic); + } + else + { + init_interaction_const_tables(NULL,ic,nbv->grp[0].kernel_type); + } + + if (nbv->ngrp > 1) + { + init_interaction_const_tables(NULL,ic,nbv->grp[1].kernel_type); + } + + if (cr->duty & DUTY_PME) + { + if
+ + if (pmes->stage > 0 && pmes->end == 1) + { + pmes->cur = 0; + pmes->stage = pmes->nstage; + } + else if (pmes->stage > 0 && pmes->end > 1) + { + /* If stage = nstage-1: + * scan over all setups, rerunning only those setups + * which are not much slower than the fastest + * else: + * use the next setup + */ + do + { + pmes->cur++; + if (pmes->cur == pmes->end) + { + pmes->stage++; + pmes->cur = pmes->start; + } + } + while (pmes->stage == pmes->nstage - 1 && + pmes->setup[pmes->cur].count > 0 && + pmes->setup[pmes->cur].cycles > cycles_fast*PMES_SLOW_FAC); + + if (pmes->stage == pmes->nstage) + { + /* We are done optimizing, use the fastest setup we found */ + pmes->cur = pmes->fastest; + } + } + + if (DOMAINDECOMP(cr) && pmes->stage > 0) + { + OK = change_dd_cutoff(cr,state,ir,pmes->setup[pmes->cur].rlist); + if (!OK) + { + /* Failsafe solution */ + if (pmes->cur > 1 && pmes->stage == pmes->nstage) + { + pmes->stage--; + } + pmes->fastest = 0; + pmes->start = 0; + pmes->end = pmes->cur; + pmes->cur = pmes->start; + } + } + + /* Change the Coulomb cut-off and the PME grid */ + + set = &pmes->setup[pmes->cur]; + + ic->rcoulomb = set->rcut; + ic->rlist = set->rlist; + ic->ewaldcoeff = set->coeff; + + if (nbv->grp[0].kernel_type == nbk8x8x8_CUDA) + { + nbnxn_cuda_pmetune_update_param(nbv->cu_nbv,ic); + } + else + { + init_interaction_const_tables(NULL,ic,nbv->grp[0].kernel_type); + } + + if (nbv->ngrp > 1) + { + init_interaction_const_tables(NULL,ic,nbv->grp[1].kernel_type); + } + + if (cr->duty & DUTY_PME) + { + if (pmes->setup[pmes->cur].pmedata == NULL) + { + /* Generate a new PME data structure, + * copying part of the old pointers. + */ + gmx_pme_reinit(&set->pmedata, + cr,pmes->setup[0].pmedata,ir, + set->grid); + } + *pmedata = set->pmedata; + } + else + { + /* Tell our PME-only node to switch grid */ + gmx_pme_send_switch(cr, set->grid, set->coeff); + } + + if (debug) + { + print_grid(NULL,debug,"","switched to",set,-1); + } + + if (pmes->stage == pmes->nstage) + { + print_grid(fp_err,fp_log,"","optimal",set,-1); + } + + return TRUE; +} + +void restart_switch_pme(pme_switch_t pmes, int n) +{ + pmes->nstage += n; +} diff --git a/src/kernel/gmx_gpu_utils/gmx_gpu_utils.h b/src/kernel/pme_switch.h similarity index 56% copy from src/kernel/gmx_gpu_utils/gmx_gpu_utils.h copy to src/kernel/pme_switch.h index 76070804ea..76060a5a47 100644 --- a/src/kernel/gmx_gpu_utils/gmx_gpu_utils.h +++ b/src/kernel/pme_switch.h @@ -7,9 +7,10 @@ * * GROningen MAchine for Chemical Simulations * + * VERSION 4.6.0 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. * Copyright (c) 1991-2000, University of Groningen, The Netherlands. - * Copyright (c) 2001-2010, The GROMACS development team, + * Copyright (c) 2001-2011, The GROMACS development team, + * check out http://www.gromacs.org for more information. * This program is free software; you can redistribute it and/or @@ -33,24 +34,33 @@ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon */ -#ifndef _GMX_GPU_UTILS_H_ -#define _GMX_GPU_UTILS_H_ +#ifndef _pme_switch_h +#define _pme_switch_h -#ifndef __cplusplus -extern "C" { -#endif +typedef struct pme_switch *pme_switch_t; -int do_quick_memtest(int /*dev_id*/); +/* Initialize the PME grid tuning data and infrastructure */ +void switch_pme_init(pme_switch_t *pmes_p, + const t_inputrec *ir,matrix box, + const interaction_const_t *ic, + gmx_pme_t pmedata); -int do_full_memtest(int /*dev_id*/); - -int do_timed_memtest(int /*dev_id*/, int /*time_limit*/); - -int is_supported_cuda_gpu(int /*dev_id*/, char* /*gpu_name*/); - -#ifndef __cplusplus -} /* extern "C" */ -#endif +/* Adjust the PME grid and Coulomb cut-off. + * Returns TRUE if the tuning continues, FALSE if the tuning is done. + */ +gmx_bool switch_pme(pme_switch_t pmes, + t_commrec *cr, + FILE *fp_err, + FILE *fp_log, + t_inputrec *ir, + t_state *state, + double cycles, + interaction_const_t *ic, + nonbonded_verlet_t *nbv, + gmx_pme_t *pmedata, + int step); -#endif // _GMX_GPU_UTILS_H_ +/* Restart the PME tuning, discarding all timings gathered so far */ +void restart_switch_pme(pme_switch_t pmes, int n); +#endif /* _pme_switch_h */ diff --git a/src/kernel/readir.c b/src/kernel/readir.c index 24451eaaa8..81d428bf98 100644 --- a/src/kernel/readir.c +++ b/src/kernel/readir.c @@ -206,41 +206,145 @@ void check_ir(const char *mdparin,t_inputrec *ir, t_gromppopts *opts, set_warning_line(wi,mdparin,-1); - /* BASIC CUT-OFF STUFF */ - if (ir->rcoulomb < 0) - { - warning_error(wi,"rcoulomb should be >= 0"); - } - if (ir->rvdw < 0) - { - warning_error(wi,"rvdw should be >= 0"); - } - if (ir->rlist < 0) - { - warning_error(wi,"rlist should be >= 0"); - } - if (ir->rlist == 0 || - !((EEL_MIGHT_BE_ZERO_AT_CUTOFF(ir->coulombtype) && ir->rcoulomb > ir->rlist) || - (EVDW_MIGHT_BE_ZERO_AT_CUTOFF(ir->vdwtype) && ir->rvdw > ir->rlist))) { - /* No switched potential and/or no twin-range: - * we can set the long-range cut-off to the maximum of the other cut-offs.
- */ - ir->rlistlong = max_cutoff(ir->rlist,max_cutoff(ir->rvdw,ir->rcoulomb)); - } else if (ir->rlistlong < 0) { - ir->rlistlong = max_cutoff(ir->rlist,max_cutoff(ir->rvdw,ir->rcoulomb)); - sprintf(warn_buf,"rlistlong was not set, setting it to %g (no buffer)", - ir->rlistlong); - warning(wi,warn_buf); - } - if (ir->rlistlong == 0 && ir->ePBC != epbcNONE) { - warning_error(wi,"Can not have an infinite cut-off with PBC"); - } - if (ir->rlistlong > 0 && (ir->rlist == 0 || ir->rlistlong < ir->rlist)) { - warning_error(wi,"rlistlong can not be shorter than rlist"); - } - if (IR_TWINRANGE(*ir) && ir->nstlist <= 0) { - warning_error(wi,"Can not have nstlist<=0 with twin-range interactions"); - } + /* BASIC CUT-OFF STUFF */ + if (ir->rcoulomb < 0) + { + warning_error(wi,"rcoulomb should be >= 0"); + } + if (ir->rvdw < 0) + { + warning_error(wi,"rvdw should be >= 0"); + } + if (ir->rlist < 0 && + !(ir->cutoff_scheme == ecutsVERLET && ir->verletbuf_drift > 0)) + { + warning_error(wi,"rlist should be >= 0"); + } + + if (ir->cutoff_scheme == ecutsGROUP) + { + /* BASIC CUT-OFF STUFF */ + if (ir->rlist == 0 || + !((EEL_MIGHT_BE_ZERO_AT_CUTOFF(ir->coulombtype) && ir->rcoulomb > ir->rlist) || + (EVDW_MIGHT_BE_ZERO_AT_CUTOFF(ir->vdwtype) && ir->rvdw > ir->rlist))) { + /* No switched potential and/or no twin-range: + * we can set the long-range cut-off to the maximum of the other cut-offs. + */ + ir->rlistlong = max_cutoff(ir->rlist,max_cutoff(ir->rvdw,ir->rcoulomb)); + } + else if (ir->rlistlong < 0) + { + ir->rlistlong = max_cutoff(ir->rlist,max_cutoff(ir->rvdw,ir->rcoulomb)); + sprintf(warn_buf,"rlistlong was not set, setting it to %g (no buffer)", + ir->rlistlong); + warning(wi,warn_buf); + } + if (ir->rlistlong == 0 && ir->ePBC != epbcNONE) + { + warning_error(wi,"Can not have an infinite cut-off with PBC"); + } + if (ir->rlistlong > 0 && (ir->rlist == 0 || ir->rlistlong < ir->rlist)) + { + warning_error(wi,"rlistlong can not be shorter than rlist"); + } + if (IR_TWINRANGE(*ir) && ir->nstlist <= 0) + { + warning_error(wi,"Can not have nstlist<=0 with twin-range interactions"); + } + } + + if (ir->cutoff_scheme == ecutsVERLET) + { + real rc_max; + + /* Normal Verlet type neighbor-list, currently only limited feature support */ + if (inputrec2nboundeddim(ir) < 3) + { + warning_error(wi,"With Verlet lists only full pbc or pbc=xy with walls is supported"); + } + if (ir->rcoulomb != ir->rvdw) + { + warning_error(wi,"With Verlet lists rcoulomb!=rvdw is not supported"); + } + if (ir->vdwtype != evdwCUT) + { + warning_error(wi,"With Verlet lists only cut-off LJ interactions are supported"); + } + if (!(ir->coulombtype == eelCUT || + (EEL_RF(ir->coulombtype) && ir->coulombtype != eelRF_NEC) || + EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD)) + { + warning_error(wi,"With Verlet lists only cut-off, reaction-field, PME and Ewald electrostatics are supported"); + } + + if (ir->nstlist <= 0) + { + warning_error(wi,"With Verlet lists nstlist should be larger than 0"); + } + + if (ir->nstlist < 10) + { + warning_note(wi,"With Verlet lists the optimal nstlist is >= 10, with GPUs >= 20. 
Note that with the Verlet scheme, nstlist has no effect on the accuracy of your simulation."); + } + + rc_max = max(ir->rvdw,ir->rcoulomb); + + if (ir->verletbuf_drift <= 0) + { + if (ir->verletbuf_drift == 0) + { + warning_error(wi,"Can not have an energy drift of exactly 0"); + } + + if (ir->rlist < rc_max) + { + warning_error(wi,"With verlet lists rlist can not be smaller than rvdw or rcoulomb"); + } + + if (ir->rlist == rc_max && ir->nstlist > 1) + { + warning_note(wi,"rlist is equal to rvdw and/or rcoulomb: there is no explicit Verlet buffer. The cluster pair list does have a buffering effect, but choosing a larger rlist might be necessary for good energy conservation."); + } + } + else + { + if (ir->rlist > rc_max) + { + warning_note(wi,"You have set rlist larger than the interaction cut-off, but you also have verlet-buffer-drift > 0. Will set rlist using verlet-buffer-drift."); + } + + if (ir->nstlist == 1) + { + /* No buffer required */ + ir->rlist = rc_max; + } + else + { + if (EI_DYNAMICS(ir->eI)) + { + if (EI_MD(ir->eI) && ir->etc == etcNO) + { + warning_error(wi,"Temperature coupling is required for calculating rlist using the energy drift with verlet-buffer-drift > 0. Either use temperature coupling or set rlist yourself together with verlet-buffer-drift = -1."); + } + + if (inputrec2nboundeddim(ir) < 3) + { + warning_error(wi,"The box volume is required for calculating rlist from the energy drift with verlet-buffer-drift > 0. You are using at least one unbounded dimension, so no volume can be computed. Either use a finite box, or set rlist yourself together with verlet-buffer-drift = -1."); + } + /* Set rlist temporarily so we can continue processing */ + ir->rlist = rc_max; + } + else + { + /* Set the buffer to 5% of the cut-off */ + ir->rlist = 1.05*rc_max; + } + } + } + + /* No twin-range calculations with Verlet lists */ + ir->rlistlong = ir->rlist; + } /* GENERAL INTEGRATOR STUFF */ if (!(ir->eI == eiMD || EI_VV(ir->eI))) @@ -275,6 +379,13 @@ void check_ir(const char *mdparin,t_inputrec *ir, t_gromppopts *opts, } } } + else if (ir->nstenergy > 0 && ir->nstcalcenergy > ir->nstenergy) + { + /* If the user sets nstenergy small, we should respect that */ + sprintf(warn_buf,"Setting nstcalcenergy (%d) equal to nstenergy (%d)",ir->nstcalcenergy,ir->nstenergy); + ir->nstcalcenergy = ir->nstenergy; + } + if (ir->epc != epcNO) { if (ir->nstpcouple < 0) @@ -793,9 +904,11 @@ void check_ir(const char *mdparin,t_inputrec *ir, t_gromppopts *opts, CHECK(ir->rcoulomb_switch >= ir->rcoulomb); } } else if (ir->coulombtype == eelCUT || EEL_RF(ir->coulombtype)) { - sprintf(err_buf,"With coulombtype = %s, rcoulomb must be >= rlist", - eel_names[ir->coulombtype]); - CHECK(ir->rlist > ir->rcoulomb); + if (ir->cutoff_scheme == ecutsGROUP) { + sprintf(err_buf,"With coulombtype = %s, rcoulomb must be >= rlist", + eel_names[ir->coulombtype]); + CHECK(ir->rlist > ir->rcoulomb); + } } if (EEL_FULL(ir->coulombtype)) { @@ -804,7 +917,7 @@ void check_ir(const char *mdparin,t_inputrec *ir, t_gromppopts *opts, sprintf(err_buf,"With coulombtype = %s, rcoulomb must be <= rlist", eel_names[ir->coulombtype]); CHECK(ir->rcoulomb > ir->rlist); - } else { + } else if (ir->cutoff_scheme == ecutsGROUP) { if (ir->coulombtype == eelPME || ir->coulombtype == eelP3M_AD) { sprintf(err_buf, "With coulombtype = %s, rcoulomb must be equal to rlist\n" @@ -841,20 +954,27 @@ void check_ir(const char *mdparin,t_inputrec *ir, t_gromppopts *opts, evdw_names[ir->vdwtype]); CHECK(ir->rvdw_switch >= ir->rvdw); } else if 
(ir->vdwtype == evdwCUT) { - sprintf(err_buf,"With vdwtype = %s, rvdw must be >= rlist",evdw_names[ir->vdwtype]); - CHECK(ir->rlist > ir->rvdw); - } - if (EEL_IS_ZERO_AT_CUTOFF(ir->coulombtype) - && (ir->rlistlong <= ir->rcoulomb)) { - sprintf(warn_buf,"For energy conservation with switch/shift potentials, %s should be 0.1 to 0.3 nm larger than rcoulomb.", - IR_TWINRANGE(*ir) ? "rlistlong" : "rlist"); - warning_note(wi,warn_buf); - } - if (EVDW_SWITCHED(ir->vdwtype) && (ir->rlistlong <= ir->rvdw)) { - sprintf(warn_buf,"For energy conservation with switch/shift potentials, %s should be 0.1 to 0.3 nm larger than rvdw.", - IR_TWINRANGE(*ir) ? "rlistlong" : "rlist"); - warning_note(wi,warn_buf); + if (ir->cutoff_scheme == ecutsGROUP) { + sprintf(err_buf,"With vdwtype = %s, rvdw must be >= rlist",evdw_names[ir->vdwtype]); + CHECK(ir->rlist > ir->rvdw); + } } + if (ir->cutoff_scheme == ecutsGROUP) + { + if (EEL_IS_ZERO_AT_CUTOFF(ir->coulombtype) + && (ir->rlistlong <= ir->rcoulomb)) + { + sprintf(warn_buf,"For energy conservation with switch/shift potentials, %s should be 0.1 to 0.3 nm larger than rcoulomb.", + IR_TWINRANGE(*ir) ? "rlistlong" : "rlist"); + warning_note(wi,warn_buf); + } + if (EVDW_SWITCHED(ir->vdwtype) && (ir->rlistlong <= ir->rvdw)) + { + sprintf(warn_buf,"For energy conservation with switch/shift potentials, %s should be 0.1 to 0.3 nm larger than rvdw.", + IR_TWINRANGE(*ir) ? "rlistlong" : "rlist"); + warning_note(wi,warn_buf); + } + } if (ir->vdwtype == evdwUSER && ir->eDispCorr != edispcNO) { warning_note(wi,"You have selected user tables with dispersion correction, the dispersion will be corrected to -C6/r^6 beyond rvdw_switch (the tabulated interaction between rvdw_switch and rvdw will not be double counted). Make sure that you really want dispersion correction to -C6/r^6."); @@ -883,22 +1003,22 @@ void check_ir(const char *mdparin,t_inputrec *ir, t_gromppopts *opts, warning(wi,"Using L-BFGS with nbfgscorr<=0 just gets you steepest descent."); } - /* ENERGY CONSERVATION */ - if (ir_NVE(ir)) - { - if (!EVDW_MIGHT_BE_ZERO_AT_CUTOFF(ir->vdwtype) && ir->rvdw > 0) - { - sprintf(warn_buf,"You are using a cut-off for VdW interactions with NVE, for good energy conservation use vdwtype = %s (possibly with DispCorr)", - evdw_names[evdwSHIFT]); - warning_note(wi,warn_buf); - } - if (!EEL_MIGHT_BE_ZERO_AT_CUTOFF(ir->coulombtype) && ir->rcoulomb > 0) - { - sprintf(warn_buf,"You are using a cut-off for electrostatics with NVE, for good energy conservation use coulombtype = %s or %s", - eel_names[eelPMESWITCH],eel_names[eelRF_ZERO]); - warning_note(wi,warn_buf); - } - } + /* ENERGY CONSERVATION */ + if (ir_NVE(ir) && ir->cutoff_scheme == ecutsGROUP) + { + if (!EVDW_MIGHT_BE_ZERO_AT_CUTOFF(ir->vdwtype) && ir->rvdw > 0) + { + sprintf(warn_buf,"You are using a cut-off for VdW interactions with NVE, for good energy conservation use vdwtype = %s (possibly with DispCorr)", + evdw_names[evdwSHIFT]); + warning_note(wi,warn_buf); + } + if (!EEL_MIGHT_BE_ZERO_AT_CUTOFF(ir->coulombtype) && ir->rcoulomb > 0) + { + sprintf(warn_buf,"You are using a cut-off for electrostatics with NVE, for good energy conservation use coulombtype = %s or %s", + eel_names[eelPMESWITCH],eel_names[eelRF_ZERO]); + warning_note(wi,warn_buf); + } + } /* IMPLICIT SOLVENT */ if(ir->coulombtype==eelGB_NOTUSED) @@ -964,16 +1084,25 @@ void check_ir(const char *mdparin,t_inputrec *ir, t_gromppopts *opts, } - if (ir->bAdress && !EI_SD(ir->eI)){ - warning_error(wi,"AdresS simulation supports only stochastic dynamics"); - } - if 
(ir->bAdress && ir->epc != epcNO){ - warning_error(wi,"AdresS simulation does not support pressure coupling"); - } - if (ir->bAdress && (EEL_FULL(ir->coulombtype))){ - warning_error(wi,"AdresS simulation does not support long-range electrostatics"); - } - + if (ir->bAdress) + { + if (ir->cutoff_scheme != ecutsGROUP) + { + warning_error(wi,"AdresS simulation supports only cutoff-scheme=group"); + } + if (!EI_SD(ir->eI)) + { + warning_error(wi,"AdresS simulation supports only stochastic dynamics"); + } + if (ir->epc != epcNO) + { + warning_error(wi,"AdresS simulation does not support pressure coupling"); + } + if (EEL_FULL(ir->coulombtype)) + { + warning_error(wi,"AdresS simulation does not support long-range electrostatics"); + } + } } /* count the number of text elemets separated by whitespace in a string. @@ -1363,7 +1492,7 @@ void get_ir(const char *mdparin,const char *mdparout, CTYPE ("mode for center of mass motion removal"); EETYPE("comm-mode", ir->comm_mode, ecm_names); CTYPE ("number of steps for center of mass motion removal"); - ITYPE ("nstcomm", ir->nstcomm, 10); + ITYPE ("nstcomm", ir->nstcomm, 100); CTYPE ("group(s) for center of mass motion removal"); STYPE ("comm-grps", vcm, NULL); @@ -1397,8 +1526,8 @@ void get_ir(const char *mdparin,const char *mdparout, ir->nstcheckpoint = 1000; CTYPE ("Output frequency for energies to log file and energy file"); ITYPE ("nstlog", ir->nstlog, 1000); - ITYPE ("nstcalcenergy",ir->nstcalcenergy, -1); - ITYPE ("nstenergy", ir->nstenergy, 100); + ITYPE ("nstcalcenergy",ir->nstcalcenergy, 100); + ITYPE ("nstenergy", ir->nstenergy, 1000); CTYPE ("Output frequency and precision for .xtc file"); ITYPE ("nstxtcout", ir->nstxtcout, 0); RTYPE ("xtc-precision",ir->xtcprec, 1000.0); @@ -1410,6 +1539,8 @@ void get_ir(const char *mdparin,const char *mdparout, /* Neighbor searching */ CCTYPE ("NEIGHBORSEARCHING PARAMETERS"); + CTYPE ("cut-off scheme (group: using charge groups, Verlet: particle based cut-offs)"); + EETYPE("cutoff-scheme", ir->cutoff_scheme, ecutscheme_names); CTYPE ("nblist update frequency"); ITYPE ("nstlist", ir->nstlist, 10); CTYPE ("ns algorithm (simple or grid)"); @@ -1419,6 +1550,9 @@ void get_ir(const char *mdparin,const char *mdparout, CTYPE ("Periodic boundary conditions: xyz, no, xy"); EETYPE("pbc", ir->ePBC, epbc_names); EETYPE("periodic-molecules", ir->bPeriodicMols, yesno_names); + CTYPE ("Allowed energy drift due to the Verlet buffer in kJ/mol/ps per atom,"); + CTYPE ("a value of -1 means: use rlist"); + RTYPE("verlet-buffer-drift", ir->verletbuf_drift, 0.005); CTYPE ("nblist cut-off"); RTYPE ("rlist", ir->rlist, -1); CTYPE ("long-range cut-off for switched potentials"); @@ -1446,7 +1580,7 @@ void get_ir(const char *mdparin,const char *mdparout, CTYPE ("Seperate tables between energy group pairs"); STYPE ("energygrp-table", egptable, NULL); CTYPE ("Spacing for the PME/PPPM FFT grid"); - RTYPE ("fourierspacing", opts->fourierspacing,0.12); + RTYPE ("fourierspacing", ir->fourier_spacing,0.12); CTYPE ("FFT grid size, when a value is 0 fourierspacing will be used"); ITYPE ("fourier-nx", ir->nkx, 0); ITYPE ("fourier-ny", ir->nky, 0); @@ -2456,8 +2590,13 @@ void do_index(const char* mdparin, const char *ndx, sprintf(warn_buf,"With integrator %s tau-t should be larger than 0",ei_names[ir->eI]); warning_error(wi,warn_buf); } - if ((ir->etc == etcVRESCALE && ir->opts.tau_t[i] >= 0) || - (ir->etc != etcVRESCALE && ir->opts.tau_t[i] > 0)) + + if (ir->etc != etcVRESCALE && ir->opts.tau_t[i] == 0) + { + warning_note(wi,"tau-t = -1 is the 
new value to signal that a group should not have temperature coupling. Treating your use of tau-t = 0 as if you used -1."); + } + + if (ir->opts.tau_t[i] >= 0) { tau_min = min(tau_min,ir->opts.tau_t[i]); } @@ -2801,6 +2940,10 @@ void do_index(const char* mdparin, const char *ndx, snew(ir->opts.egp_flags,nr*nr); bExcl = do_egp_flag(ir,groups,"energygrp-excl",egpexcl,EGP_EXCL); + if (bExcl && ir->cutoff_scheme == ecutsVERLET) + { + warning_error(wi,"Energy group exclusions are not (yet) implemented for the Verlet scheme"); + } if (bExcl && EEL_FULL(ir->coulombtype)) warning(wi,"Can not exclude the lattice Coulomb energy between energy groups"); diff --git a/src/kernel/readir.h b/src/kernel/readir.h index 9e904a2b52..d84c2699fc 100644 --- a/src/kernel/readir.h +++ b/src/kernel/readir.h @@ -56,7 +56,6 @@ static const char *couple_lam[ecouplamNR+1] = { typedef struct { int warnings; int nshake; - real fourierspacing; char *include; char *define; gmx_bool bGenVel; diff --git a/src/kernel/repl_ex.h b/src/kernel/repl_ex.h index 7e4bf23d68..30a726157b 100644 --- a/src/kernel/repl_ex.h +++ b/src/kernel/repl_ex.h @@ -37,6 +37,7 @@ #define _repl_ex_h #include "typedefs.h" +#include "types/commrec.h" /* Abstract type for replica exchange */ typedef struct gmx_repl_ex *gmx_repl_ex_t; diff --git a/src/kernel/runner.c b/src/kernel/runner.c index 313221c579..9cab66fb00 100644 --- a/src/kernel/runner.c +++ b/src/kernel/runner.c @@ -43,12 +43,16 @@ #endif #include #include +#include +#include #include "typedefs.h" #include "smalloc.h" #include "sysstuff.h" #include "statutil.h" #include "mdrun.h" +#include "md_logging.h" +#include "md_support.h" #include "network.h" #include "pull.h" #include "names.h" @@ -69,10 +73,15 @@ #include "sighandler.h" #include "tpxio.h" #include "txtdump.h" +#include "gmx_detect_hardware.h" +#include "gmx_omp_nthreads.h" #include "pull_rotation.h" +#include "calc_verletbuf.h" +#include "nbnxn_search.h" +#include "../mdlib/nbnxn_consts.h" +#include "gmx_fatal_collective.h" #include "membed.h" #include "md_openmm.h" - #include "gmx_omp.h" #ifdef GMX_LIB_MPI @@ -90,6 +99,8 @@ #include "md_openmm.h" #endif +#include "gpu_utils.h" +#include "nbnxn_cuda_data_mgmt.h" typedef struct { gmx_integrator_t *func; @@ -112,6 +123,7 @@ tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER; #ifdef GMX_THREAD_MPI struct mdrunner_arglist { + gmx_hw_opt_t *hw_opt; FILE *fplog; t_commrec *cr; int nfile; @@ -129,6 +141,8 @@ struct mdrunner_arglist const char *ddcsx; const char *ddcsy; const char *ddcsz; + const char *nbpu_opt; + int nsteps_cmdline; int nstepout; int resetstep; int nmultisim; @@ -167,12 +181,14 @@ static void mdrunner_start_fn(void *arg) fplog=mc.fplog; } - mda->ret=mdrunner(cr->nnodes, fplog, cr, mc.nfile, fnm, mc.oenv, + mda->ret=mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv, mc.bVerbose, mc.bCompact, mc.nstglobalcomm, mc.ddxyz, mc.dd_node_order, mc.rdd, mc.rconstr, mc.dddlb_opt, mc.dlb_scale, - mc.ddcsx, mc.ddcsy, mc.ddcsz, mc.nstepout, mc.resetstep, - mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce, + mc.ddcsx, mc.ddcsy, mc.ddcsz, + mc.nbpu_opt, + mc.nsteps_cmdline, mc.nstepout, mc.resetstep, + mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce, mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags); } @@ -180,15 +196,17 @@ static void mdrunner_start_fn(void *arg) the main thread) for thread-parallel runs. This in turn calls mdrunner() for each thread. 
All options besides nthreads are the same as for mdrunner(). */ -static t_commrec *mdrunner_start_threads(int nthreads, +static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt, FILE *fplog,t_commrec *cr,int nfile, const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose, gmx_bool bCompact, int nstglobalcomm, ivec ddxyz,int dd_node_order,real rdd,real rconstr, const char *dddlb_opt,real dlb_scale, const char *ddcsx,const char *ddcsy,const char *ddcsz, - int nstepout,int resetstep,int nmultisim,int repl_ex_nst, - int repl_ex_nex, int repl_ex_seed, real pforce,real cpt_period, real max_hours, + const char *nbpu_opt, + int nsteps_cmdline, int nstepout,int resetstep, + int nmultisim,int repl_ex_nst,int repl_ex_nex, int repl_ex_seed, + real pforce,real cpt_period, real max_hours, const char *deviceOptions, unsigned long Flags) { int ret; @@ -197,14 +215,17 @@ static t_commrec *mdrunner_start_threads(int nthreads, t_filenm *fnmn; /* first check whether we even need to start tMPI */ - if (nthreads<2) + if (hw_opt->nthreads_tmpi < 2) + { return cr; + } /* a few small, one-time, almost unavoidable memory leaks: */ snew(mda,1); fnmn=dup_tfn(nfile, fnm); /* fill the data structure to pass as void pointer to thread start fn */ + mda->hw_opt=hw_opt; mda->fplog=fplog; mda->cr=cr; mda->nfile=nfile; @@ -224,6 +245,8 @@ static t_commrec *mdrunner_start_threads(int nthreads, mda->ddcsx=ddcsx; mda->ddcsy=ddcsy; mda->ddcsz=ddcsz; + mda->nbpu_opt=nbpu_opt; + mda->nsteps_cmdline=nsteps_cmdline; mda->nstepout=nstepout; mda->resetstep=resetstep; mda->nmultisim=nmultisim; @@ -236,11 +259,12 @@ static t_commrec *mdrunner_start_threads(int nthreads, mda->deviceOptions=deviceOptions; mda->Flags=Flags; - fprintf(stderr, "Starting %d threads\n",nthreads); + fprintf(stderr, "Starting %d tMPI threads\n",hw_opt->nthreads_tmpi); fflush(stderr); /* now spawn new threads that start mdrunner_start_fn(), while the main thread returns */ - ret=tMPI_Init_fn(TRUE, nthreads, mdrunner_start_fn, (void*)(mda) ); + ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi, + mdrunner_start_fn, (void*)(mda) ); if (ret!=TMPI_SUCCESS) return NULL; @@ -250,66 +274,133 @@ static t_commrec *mdrunner_start_threads(int nthreads, } +static int get_tmpi_omp_thread_distribution(const gmx_hw_opt_t *hw_opt, + int nthreads_tot, + int ngpu) +{ + int nthreads_tmpi; + + /* There are no separate PME nodes here, as we ensured in + * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes + * and a conditional ensures we would not have ended up here. + * Note that separate PME nodes might be switched on later. + */ + if (ngpu > 0) + { + nthreads_tmpi = ngpu; + if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi) + { + nthreads_tmpi = nthreads_tot; + } + } + else if (hw_opt->nthreads_omp > 0) + { + if (hw_opt->nthreads_omp > nthreads_tot) + { + gmx_fatal(FARGS,"More OpenMP threads requested (%d) than the total number of threads requested (%d)",hw_opt->nthreads_omp,nthreads_tot); + } + nthreads_tmpi = nthreads_tot/hw_opt->nthreads_omp; + } + else + { + /* TODO choose nthreads_omp based on hardware topology + when we have a hardware topology detection library */ + /* Don't use OpenMP parallelization */ + nthreads_tmpi = nthreads_tot; + } + + return nthreads_tmpi; +} + + /* Get the number of threads to use for thread-MPI based on how many * were requested, which algorithms we're using, * and how many particles there are. + * At the point we have already called check_and_update_hw_opt. 
+ * Thus all options should be internally consistent and consistent + * with the hardware, except that ntmpi could be larger than #GPU. */ -static int get_nthreads_mpi(int nthreads_requested, t_inputrec *inputrec, - gmx_mtop_t *mtop) +static int get_nthreads_mpi(gmx_hw_info_t *hwinfo, + gmx_hw_opt_t *hw_opt, + t_inputrec *inputrec, gmx_mtop_t *mtop, + const t_commrec *cr, + FILE *fplog) { - int nthreads,nthreads_new; - int min_atoms_per_thread; + int nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu; + int min_atoms_per_mpi_thread; char *env; + char sbuf[STRLEN]; + gmx_bool bCanUseGPU; - nthreads = nthreads_requested; + if (hw_opt->nthreads_tmpi > 0) + { + /* Trivial, return right away */ + return hw_opt->nthreads_tmpi; + } - /* determine # of hardware threads. */ - if (nthreads_requested < 1) + /* How many total (#tMPI*#OpenMP) threads can we start? */ + if (hw_opt->nthreads_tot > 0) { - if ((env = getenv("GMX_MAX_THREADS")) != NULL) - { - nthreads = 0; - sscanf(env,"%d",&nthreads); - if (nthreads < 1) - { - gmx_fatal(FARGS,"GMX_MAX_THREADS (%d) should be larger than 0", - nthreads); - } - } - else - { - nthreads = tMPI_Thread_get_hw_number(); - } + nthreads_tot_max = hw_opt->nthreads_tot; + } + else + { + nthreads_tot_max = tMPI_Thread_get_hw_number(); + } + + bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU); + if (bCanUseGPU) + { + ngpu = hwinfo->gpu_info.ncuda_dev_use; + } + else + { + ngpu = 0; } + nthreads_tmpi = + get_tmpi_omp_thread_distribution(hw_opt,nthreads_tot_max,ngpu); + if (inputrec->eI == eiNM || EI_TPI(inputrec->eI)) { /* Steps are divided over the nodes iso splitting the atoms */ - min_atoms_per_thread = 0; + min_atoms_per_mpi_thread = 0; } else { - min_atoms_per_thread = MIN_ATOMS_PER_THREAD; + if (bCanUseGPU) + { + min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU; + } + else + { + min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD; + } } /* Check if an algorithm does not support parallel simulation. */ - if (nthreads != 1 && + if (nthreads_tmpi != 1 && ( inputrec->eI == eiLBFGS || inputrec->coulombtype == eelEWALD ) ) { - fprintf(stderr,"\nThe integration or electrostatics algorithm doesn't support parallel runs. Not starting any threads.\n"); - nthreads = 1; + nthreads_tmpi = 1; + + md_print_warn(cr,fplog,"The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n"); + if (hw_opt->nthreads_tmpi > nthreads_tmpi) + { + gmx_fatal(FARGS,"You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that"); + } } - else if (nthreads_requested < 1 && - mtop->natoms/nthreads < min_atoms_per_thread) + else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread) { /* the thread number was chosen automatically, but there are too many threads (too few atoms per thread) */ - nthreads_new = max(1,mtop->natoms/min_atoms_per_thread); + nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread); - if (nthreads_new > 8 || (nthreads == 8 && nthreads_new > 4)) + if (nthreads_new > 8 || (nthreads_tmpi == 8 && nthreads_new > 4)) { - /* Use only multiples of 4 above 8 threads + /* TODO replace this once we have proper HT detection + * Use only multiples of 4 above 8 threads * or with an 8-core processor * (to avoid 6 threads on 8 core processors with 4 real cores). 
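+ * (with Hyper-Threading enabled, 8 hardware threads typically map to 4 + * physical cores, so an uneven thread count like 6 would load them unevenly)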
*/ @@ -321,28 +412,611 @@ static int get_nthreads_mpi(int nthreads_requested, t_inputrec *inputrec, nthreads_new = (nthreads_new/2)*2; } - nthreads = nthreads_new; + nthreads_tmpi = nthreads_new; fprintf(stderr,"\n"); fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n"); - fprintf(stderr," only starting %d threads.\n",nthreads); - fprintf(stderr," You can use the -nt option to optimize the number of threads.\n\n"); + fprintf(stderr," only starting %d thread-MPI threads.\n",nthreads_tmpi); + fprintf(stderr," You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n"); + } + + return nthreads_tmpi; +} +#endif /* GMX_THREAD_MPI */ + + +/* Environment variable for setting nstlist */ +static const char* NSTLIST_ENVVAR = "GMX_NSTLIST"; +/* Try to increase nstlist when using a GPU with nstlist less than this */ +static const int NSTLIST_GPU_ENOUGH = 20; +/* Increase nstlist until the non-bonded cost increases more than this factor */ +static const float NBNXN_GPU_LIST_OK_FAC = 1.25; +/* Don't increase nstlist beyond a non-bonded cost increases of this factor */ +static const float NBNXN_GPU_LIST_MAX_FAC = 1.40; + +/* Try to increase nstlist when running on a GPU */ +static void increase_nstlist(FILE *fp,t_commrec *cr, + t_inputrec *ir,const gmx_mtop_t *mtop,matrix box) +{ + char *env; + int nstlist_orig,nstlist_prev; + verletbuf_list_setup_t ls; + real rlist_inc,rlist_ok,rlist_max,rlist_new,rlist_prev; + int i; + t_state state_tmp; + gmx_bool bBox,bDD,bCont; + const char *nstl_fmt="\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n"; + const char *vbd_err="Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used"; + const char *box_err="Can not increase nstlist for GPU run because the box is too small"; + const char *dd_err ="Can not increase nstlist for GPU run because of domain decomposition limitations"; + char buf[STRLEN]; + + /* Number of + nstlist alternative values to try when switching */ + const int nstl[]={ 20, 25, 40, 50 }; +#define NNSTL sizeof(nstl)/sizeof(nstl[0]) + + env = getenv(NSTLIST_ENVVAR); + if (env == NULL) + { + if (fp != NULL) + { + fprintf(fp,nstl_fmt,ir->nstlist); + } + } + + if (ir->verletbuf_drift == 0) + { + gmx_fatal(FARGS,"You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp"); + } + + if (ir->verletbuf_drift < 0) + { + if (MASTER(cr)) + { + fprintf(stderr,"%s\n",vbd_err); + } + if (fp != NULL) + { + fprintf(fp,"%s\n",vbd_err); + } + + return; + } + + nstlist_orig = ir->nstlist; + if (env != NULL) + { + sprintf(buf,"Getting nstlist from environment variable GMX_NSTLIST=%s",env); + if (MASTER(cr)) + { + fprintf(stderr,"%s\n",buf); + } + if (fp != NULL) + { + fprintf(fp,"%s\n",buf); + } + sscanf(env,"%d",&ir->nstlist); + } + + verletbuf_get_list_setup(TRUE,&ls); + + /* Allow rlist to make the list double the size of the cut-off sphere */ + rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE,mtop->natoms/det(box)); + rlist_ok = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC,1.0/3.0) - rlist_inc; + rlist_max = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC,1.0/3.0) - rlist_inc; + if (debug) + { + fprintf(debug,"GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n", + rlist_inc,rlist_max); + } + + i = 0; + nstlist_prev = nstlist_orig; + rlist_prev = ir->rlist; + do 
+ { + if (env == NULL) + { + ir->nstlist = nstl[i]; + } + + /* Set the pair-list buffer size in ir */ + calc_verlet_buffer_size(mtop,det(box),ir,ir->verletbuf_drift,&ls, + NULL,&rlist_new); + + /* Does rlist fit in the box? */ + bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC,box)); + bDD = TRUE; + if (bBox && DOMAINDECOMP(cr)) + { + /* Check if rlist fits in the domain decomposition */ + if (inputrec2nboundeddim(ir) < DIM) + { + gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet"); + } + copy_mat(box,state_tmp.box); + bDD = change_dd_cutoff(cr,&state_tmp,ir,rlist_new); + } + + bCont = FALSE; + + if (env == NULL) + { + if (bBox && bDD && rlist_new <= rlist_max) + { + /* Increase nstlist */ + nstlist_prev = ir->nstlist; + rlist_prev = rlist_new; + bCont = (i+1 < NNSTL && rlist_new < rlist_ok); + } + else + { + /* Stick with the previous nstlist */ + ir->nstlist = nstlist_prev; + rlist_new = rlist_prev; + bBox = TRUE; + bDD = TRUE; + } + } + + i++; + } + while (bCont); + + if (!bBox || !bDD) + { + gmx_warning(!bBox ? box_err : dd_err); + if (fp != NULL) + { + fprintf(fp,"\n%s\n",bBox ? box_err : dd_err); + } + ir->nstlist = nstlist_orig; + } + else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist) + { + sprintf(buf,"Changing nstlist from %d to %d, rlist from %g to %g", + nstlist_orig,ir->nstlist, + ir->rlist,rlist_new); + if (MASTER(cr)) + { + fprintf(stderr,"%s\n\n",buf); + } + if (fp != NULL) + { + fprintf(fp,"%s\n\n",buf); + } + ir->rlist = rlist_new; + ir->rlistlong = rlist_new; + } +} + +static void prepare_verlet_scheme(FILE *fplog, + gmx_hw_info_t *hwinfo, + t_commrec *cr, + gmx_hw_opt_t *hw_opt, + const char *nbpu_opt, + t_inputrec *ir, + const gmx_mtop_t *mtop, + matrix box, + gmx_bool *bUseGPU) +{ + /* Here we only check for GPU usage on the MPI master process, + * as here we don't know how many GPUs we will use yet. + * We check for a GPU on all processes later. + */ + *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL); + + if (ir->verletbuf_drift > 0) + { + /* Update the Verlet buffer size for the current run setup */ + verletbuf_list_setup_t ls; + real rlist_new; + + /* Here we assume CPU acceleration is on. But as currently + * calc_verlet_buffer_size gives the same results for 4x8 and 4x4 + * and 4x2 gives a larger buffer than 4x4, this is ok. 
+ */ + verletbuf_get_list_setup(*bUseGPU,&ls); + + calc_verlet_buffer_size(mtop,det(box),ir, + ir->verletbuf_drift,&ls, + NULL,&rlist_new); + if (rlist_new != ir->rlist) + { + if (fplog != NULL) + { + fprintf(fplog,"\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n", + ir->rlist,rlist_new, + ls.cluster_size_i,ls.cluster_size_j); + } + ir->rlist = rlist_new; + ir->rlistlong = rlist_new; + } + } + + /* With GPU or emulation we should check nstlist for performance */ + if ((EI_DYNAMICS(ir->eI) && + *bUseGPU && + ir->nstlist < NSTLIST_GPU_ENOUGH) || + getenv(NSTLIST_ENVVAR) != NULL) + { + /* Choose a better nstlist */ + increase_nstlist(fplog,cr,ir,mtop,box); + } +} + +static void convert_to_verlet_scheme(FILE *fplog, + t_inputrec *ir, + gmx_mtop_t *mtop,real box_vol) +{ + char *conv_mesg="Converting input file with group cut-off scheme to the Verlet cut-off scheme"; + + md_print_warn(NULL,fplog,"%s\n",conv_mesg); + + ir->cutoff_scheme = ecutsVERLET; + ir->verletbuf_drift = 0.005; + + if (ir->rcoulomb != ir->rvdw) + { + gmx_fatal(FARGS,"The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs"); + } + + if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype)) + { + gmx_fatal(FARGS,"User non-bonded potentials are not (yet) supported with the Verlet scheme"); + } + else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype)) + { + md_print_warn(NULL,fplog,"Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials"); + + if (EVDW_SWITCHED(ir->vdwtype)) + { + ir->vdwtype = evdwCUT; + } + if (EEL_SWITCHED(ir->coulombtype)) + { + if (EEL_FULL(ir->coulombtype)) + { + /* With full electrostatic only PME can be switched */ + ir->coulombtype = eelPME; + } + else + { + md_print_warn(NULL,fplog,"NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n",eel_names[ir->coulombtype]); + ir->coulombtype = eelRF; + ir->epsilon_rf = 0.0; + } + } + + /* We set the target energy drift to a small number. + * Note that this is only for testing. For production the user + * should think about this and set the mdp options. + */ + ir->verletbuf_drift = 1e-4; + } + + if (inputrec2nboundeddim(ir) != 3) + { + gmx_fatal(FARGS,"Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc"); } - return nthreads; + + if (ir->efep != efepNO || ir->implicit_solvent != eisNO) + { + gmx_fatal(FARGS,"Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent"); + } + + if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO)) + { + verletbuf_list_setup_t ls; + + verletbuf_get_list_setup(FALSE,&ls); + calc_verlet_buffer_size(mtop,box_vol,ir,ir->verletbuf_drift,&ls, + NULL,&ir->rlist); + } + else + { + ir->verletbuf_drift = -1; + ir->rlist = 1.05*max(ir->rvdw,ir->rcoulomb); + } + + gmx_mtop_remove_chargegroups(mtop); } + + +/* Set CPU affinity. Can be important for performance. + On some systems (e.g. Cray) CPU Affinity is set by default. + But default assigning doesn't work (well) with only some ranks + having threads. This causes very low performance. + External tools have cumbersome syntax for setting affinity + in the case that only some ranks have threads. + Thus it is important that GROMACS sets the affinity internally + if only PME is using threads. 
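+ Pinning is only attempted when the user enables it (hw_opt->bThreadPinning + below); otherwise thread placement is left to the OS scheduler.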
+*/ +static void set_cpu_affinity(FILE *fplog, + const t_commrec *cr, + const gmx_hw_opt_t *hw_opt, + int nthreads_pme, + const gmx_hw_info_t *hwinfo, + const t_inputrec *inputrec) +{ +#ifdef GMX_OPENMP /* TODO: actually we could do this even without OpenMP?! */ +#ifdef __linux /* TODO: only linux? why not everywhere if sched_setaffinity is available */ + if (hw_opt->bThreadPinning) + { + int thread, nthread_local, nthread_node, nthread_hw_max, nphyscore; + int offset; + char *env; + + /* threads on this MPI process or TMPI thread */ + if (cr->duty & DUTY_PP) + { + nthread_local = gmx_omp_nthreads_get(emntNonbonded); + } + else + { + nthread_local = gmx_omp_nthreads_get(emntPME); + } + + /* map the current process to cores */ + thread = 0; + nthread_node = nthread_local; +#ifdef GMX_MPI + if (PAR(cr) || MULTISIM(cr)) + { + /* We need to determine a scan of the thread counts in this + * compute node. + */ + MPI_Comm comm_intra; + + MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->nodeid_intra, + &comm_intra); + MPI_Scan(&nthread_local,&thread,1,MPI_INT,MPI_SUM,comm_intra); + /* MPI_Scan is inclusive, but here we need exclusive */ + thread -= nthread_local; + /* Get the total number of threads on this physical node */ + MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra); + MPI_Comm_free(&comm_intra); + } #endif + offset = 0; + if (hw_opt->core_pinning_offset > 0) + { + offset = hw_opt->core_pinning_offset; + if (SIMMASTER(cr)) + { + fprintf(stderr, "Applying core pinning offset %d\n", offset); + } + if (fplog) + { + fprintf(fplog, "Applying core pinning offset %d\n", offset); + } + } + + /* With Intel Hyper-Threading enabled, we want to pin consecutive + * threads to physical cores when using more threads than physical + * cores or when the user requests so. + */ + nthread_hw_max = hwinfo->nthreads_hw_avail; + nphyscore = -1; + if (hw_opt->bPinHyperthreading || + (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED && + nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL)) + { + if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED) + { + /* We print to stderr on all processes, as we might have + * different settings on different physical nodes. 
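+ * (which is why the md_print_warn calls below pass a NULL commrec)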
+ */ + if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL) + { + md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, " + "but non-Intel CPU detected (vendor: %s)\n", + gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]); + } + else + { + md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, " + "but the CPU detected does not have Intel Hyper-Threading support " + "(or it is turned off)\n"); + } + } + nphyscore = nthread_hw_max/2; -int mdrunner(int nthreads_requested, FILE *fplog,t_commrec *cr,int nfile, + if (SIMMASTER(cr)) + { + fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n", + nphyscore); + } + if (fplog) + { + fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n", + nphyscore); + } + } + + /* set the per-thread affinity */ +#pragma omp parallel firstprivate(thread) num_threads(nthread_local) + { + cpu_set_t mask; + int core; + + CPU_ZERO(&mask); + thread += gmx_omp_get_thread_num(); + if (nphyscore <= 0) + { + core = offset + thread; + } + else + { + /* Lock pairs of threads to the same hyperthreaded core */ + core = offset + thread/2 + (thread % 2)*nphyscore; + } + CPU_SET(core, &mask); + sched_setaffinity((pid_t) syscall (SYS_gettid), sizeof(cpu_set_t), &mask); + } + } +#endif /* __linux */ +#endif /* GMX_OPENMP */ +} + + +static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt, + int cutoff_scheme) +{ + gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp); + +#ifndef GMX_THREAD_MPI + if (hw_opt->nthreads_tot > 0) + { + gmx_fatal(FARGS,"Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI"); + } + if (hw_opt->nthreads_tmpi > 0) + { + gmx_fatal(FARGS,"Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI"); + } +#endif + + if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0) + { + /* We have the same number of OpenMP threads for PP and PME processes, + * thus we can perform several consistency checks. 
+ */ + if (hw_opt->nthreads_tmpi > 0 && + hw_opt->nthreads_omp > 0 && + hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp) + { + gmx_fatal(FARGS,"The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested", + hw_opt->nthreads_tot,hw_opt->nthreads_tmpi,hw_opt->nthreads_omp); + } + + if (hw_opt->nthreads_tmpi > 0 && + hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0) + { + gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)", + hw_opt->nthreads_tot,hw_opt->nthreads_tmpi); + } + + if (hw_opt->nthreads_omp > 0 && + hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0) + { + gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)", + hw_opt->nthreads_tot,hw_opt->nthreads_omp); + } + + if (hw_opt->nthreads_tmpi > 0 && + hw_opt->nthreads_omp <= 0) + { + hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi; + } + } + +#ifndef GMX_OPENMP + if (hw_opt->nthreads_omp > 1) + { + gmx_fatal(FARGS,"OpenMP threads are requested, but Gromacs was compiled without OpenMP support"); + } +#endif + + if (cutoff_scheme == ecutsGROUP) + { + /* We only have OpenMP support for PME only nodes */ + if (hw_opt->nthreads_omp > 1) + { + gmx_fatal(FARGS,"OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s", + ecutscheme_names[cutoff_scheme], + ecutscheme_names[ecutsVERLET]); + } + hw_opt->nthreads_omp = 1; + } + + if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0) + { + gmx_fatal(FARGS,"You need to specify -ntomp in addition to -ntomp_pme"); + } + + if (hw_opt->nthreads_tot == 1) + { + hw_opt->nthreads_tmpi = 1; + + if (hw_opt->nthreads_omp > 1) + { + gmx_fatal(FARGS,"You requested %d OpenMP threads with %d total threads", + hw_opt->nthreads_tmpi,hw_opt->nthreads_tot); + } + hw_opt->nthreads_omp = 1; + } + + if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0) + { + hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp; + } + + if (debug) + { + fprintf(debug,"hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n", + hw_opt->nthreads_tot, + hw_opt->nthreads_tmpi, + hw_opt->nthreads_omp, + hw_opt->nthreads_omp_pme, + hw_opt->gpu_id!=NULL ? hw_opt->gpu_id : ""); + + } +} + + +/* Override the value in inputrec with value passed on the command line (if any) */ +static void override_nsteps_cmdline(FILE *fplog, + int nsteps_cmdline, + t_inputrec *ir, + const t_commrec *cr) +{ + assert(ir); + assert(cr); + + /* override with anything else than the default -2 */ + if (nsteps_cmdline > -2) + { + char stmp[STRLEN]; + + ir->nsteps = nsteps_cmdline; + if (EI_DYNAMICS(ir->eI)) + { + sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps", + nsteps_cmdline, nsteps_cmdline*ir->delta_t); + } + else + { + sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps", + nsteps_cmdline); + } + + md_print_warn(cr, fplog, "%s\n", stmp); + } +} + +/* Data structure set by SIMMASTER which needs to be passed to all nodes + * before the other nodes have read the tpx file and called gmx_detect_hardware. 
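+ * In particular, nthreads_tot must equal nthreads_tmpi*nthreads_omp + * whenever both factors were requested explicitly.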
+ */ +typedef struct { + int cutoff_scheme; /* The cutoff-scheme from inputrec_t */ + gmx_bool bUseGPU; /* Use GPU or GPU emulation */ +} master_inf_t; + +int mdrunner(gmx_hw_opt_t *hw_opt, + FILE *fplog,t_commrec *cr,int nfile, const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose, gmx_bool bCompact, int nstglobalcomm, ivec ddxyz,int dd_node_order,real rdd,real rconstr, const char *dddlb_opt,real dlb_scale, const char *ddcsx,const char *ddcsy,const char *ddcsz, - int nstepout,int resetstep,int nmultisim, int repl_ex_nst, int repl_ex_nex, + const char *nbpu_opt, + int nsteps_cmdline, int nstepout,int resetstep, + int nmultisim,int repl_ex_nst,int repl_ex_nex, int repl_ex_seed, real pforce,real cpt_period,real max_hours, const char *deviceOptions, unsigned long Flags) { + gmx_bool bForceUseGPU,bTryUseGPU; double nodetime=0,realtime; t_inputrec *inputrec; t_state *state=NULL; @@ -369,43 +1043,120 @@ int mdrunner(int nthreads_requested, FILE *fplog,t_commrec *cr,int nfile, gmx_large_int_t reset_counters; gmx_edsam_t ed=NULL; t_commrec *cr_old=cr; - int nthreads_mpi=1; int nthreads_pme=1; + int nthreads_pp=1; gmx_membed_t membed=NULL; + gmx_hw_info_t *hwinfo=NULL; + master_inf_t minf={-1,FALSE}; /* CAUTION: threads may be started later on in this function, so cr doesn't reflect the final parallel state right now */ snew(inputrec,1); snew(mtop,1); - - if (bVerbose && SIMMASTER(cr)) - { - fprintf(stderr,"Getting Loaded...\n"); - } if (Flags & MD_APPENDFILES) { fplog = NULL; } + bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0); + bTryUseGPU = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU; + snew(state,1); - if (MASTER(cr)) + if (SIMMASTER(cr)) { /* Read (nearly) all data required for the simulation */ read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop); - /* NOW the threads will be started: */ + if (inputrec->cutoff_scheme != ecutsVERLET && + ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL)) + { + convert_to_verlet_scheme(fplog,inputrec,mtop,det(state->box)); + } + + /* Detect hardware, gather information. With tMPI only thread 0 does it + * and after threads are started broadcasts hwinfo around. 
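+ * (with thread-MPI the hwinfo pointer itself is shared between the + * threads via the gmx_bcast further down)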
*/ + snew(hwinfo, 1); + gmx_detect_hardware(fplog, hwinfo, cr, + bForceUseGPU, bTryUseGPU, hw_opt->gpu_id); + + minf.cutoff_scheme = inputrec->cutoff_scheme; + minf.bUseGPU = FALSE; + + if (inputrec->cutoff_scheme == ecutsVERLET) + { + prepare_verlet_scheme(fplog,hwinfo,cr,hw_opt,nbpu_opt, + inputrec,mtop,state->box, + &minf.bUseGPU); + } + else if (hwinfo->bCanUseGPU) + { + md_print_warn(cr,fplog, + "NOTE: GPU(s) found, but the current simulation can not use GPUs\n" + " To use a GPU, set the mdp option: cutoff-scheme = Verlet\n" + " (for quick performance testing you can use the -testverlet option)\n"); + + if (bForceUseGPU) + { + gmx_fatal(FARGS,"GPU requested, but can't be used without cutoff-scheme=Verlet"); + } + } + } +#ifndef GMX_THREAD_MPI + if (PAR(cr)) + { + gmx_bcast_sim(sizeof(minf),&minf,cr); + } +#endif + if (minf.bUseGPU && cr->npmenodes == -1) + { + /* Don't automatically use PME-only nodes with GPUs */ + cr->npmenodes = 0; + } + #ifdef GMX_THREAD_MPI - nthreads_mpi = get_nthreads_mpi(nthreads_requested, inputrec, mtop); + /* With thread-MPI inputrec is only set here on the master thread */ + if (SIMMASTER(cr)) +#endif + { + check_and_update_hw_opt(hw_opt,minf.cutoff_scheme); - if (nthreads_mpi > 1) +#ifdef GMX_THREAD_MPI + if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0) + { + gmx_fatal(FARGS,"You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes"); + } +#endif + + if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp && + cr->npmenodes <= 0) + { + gmx_fatal(FARGS,"You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes"); + } + } + +#ifdef GMX_THREAD_MPI + if (SIMMASTER(cr)) + { + /* NOW the threads will be started: */ + hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo, + hw_opt, + inputrec, mtop, + cr, fplog); + if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0) + { + hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi; + } + + if (hw_opt->nthreads_tmpi > 1) { /* now start the threads. 
*/ - cr=mdrunner_start_threads(nthreads_mpi, fplog, cr_old, nfile, fnm, + cr=mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm, oenv, bVerbose, bCompact, nstglobalcomm, ddxyz, dd_node_order, rdd, rconstr, dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz, - nstepout, resetstep, nmultisim, + nbpu_opt, + nsteps_cmdline, nstepout, resetstep, nmultisim, repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce, cpt_period, max_hours, deviceOptions, Flags); @@ -416,8 +1167,8 @@ int mdrunner(int nthreads_requested, FILE *fplog,t_commrec *cr,int nfile, gmx_comm("Failed to spawn threads"); } } -#endif } +#endif /* END OF CAUTION: cr is now reliable */ /* g_membed initialisation * @@ -436,12 +1187,43 @@ int mdrunner(int nthreads_requested, FILE *fplog,t_commrec *cr,int nfile, { /* now broadcast everything to the non-master nodes/threads: */ init_parallel(fplog, cr, inputrec, mtop); + + /* This check needs to happen after get_nthreads_mpi() */ + if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC)) + { + gmx_fatal_collective(FARGS,cr,NULL, + "The Verlet cut-off scheme is not supported with particle decomposition.\n" + "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads."); + } } if (fplog != NULL) { pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE); } +#if defined GMX_THREAD_MPI + /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer + * to the other threads -- slightly uncool, but works fine, just need to + * make sure that the data doesn't get freed twice. */ + if (cr->nnodes > 1) + { + if (!SIMMASTER(cr)) + { + snew(hwinfo, 1); + } + gmx_bcast(sizeof(&hwinfo), &hwinfo, cr); + } +#else + if (PAR(cr) && !SIMMASTER(cr)) + { + /* now we have inputrec on all nodes, can run the detection */ + /* TODO: perhaps it's better to propagate within a node instead? 
*/ + snew(hwinfo, 1); + gmx_detect_hardware(fplog, hwinfo, cr, + bForceUseGPU, bTryUseGPU, hw_opt->gpu_id); + } +#endif + /* now make sure the state is initialized and propagated */ set_state_entries(state,inputrec,cr->nnodes); @@ -460,14 +1242,15 @@ int mdrunner(int nthreads_requested, FILE *fplog,t_commrec *cr,int nfile, gmx_fatal(FARGS, "The -dd or -npme option request a parallel simulation, " #ifndef GMX_MPI - "but mdrun was compiled without threads or MPI enabled" + "but %s was compiled without threads or MPI enabled" #else #ifdef GMX_THREAD_MPI "but the number of threads (option -nt) is 1" #else - "but mdrun was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec" + "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec" #endif #endif + , ShortProgram() ); } @@ -477,7 +1260,7 @@ int mdrunner(int nthreads_requested, FILE *fplog,t_commrec *cr,int nfile, gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun"); } - if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog)) + if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog) && PAR(cr)) { /* All-vs-all loops do not work with domain decomposition */ Flags |= MD_PARTDEC; @@ -596,6 +1379,9 @@ int mdrunner(int nthreads_requested, FILE *fplog,t_commrec *cr,int nfile, Flags,&fplog); } + /* override nsteps with value from cmdline */ + override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr); + if (SIMMASTER(cr)) { copy_mat(state->box,box); @@ -613,11 +1399,6 @@ int mdrunner(int nthreads_requested, FILE *fplog,t_commrec *cr,int nfile, ed = ed_open(nfile,fnm,Flags,cr); } - if (bVerbose && SIMMASTER(cr)) - { - fprintf(stderr,"Loaded with Money\n\n"); - } - if (PAR(cr) && !((Flags & MD_PARTDEC) || EI_TPI(inputrec->eI) || inputrec->eI == eiNM)) @@ -662,31 +1443,41 @@ int mdrunner(int nthreads_requested, FILE *fplog,t_commrec *cr,int nfile, gmx_setup_nodecomm(fplog,cr); } - /* get number of OpenMP/PME threads - * env variable should be read only on one node to make sure it is identical everywhere */ -#ifdef GMX_OPENMP - if (EEL_PME(inputrec->coulombtype)) - { - if (MASTER(cr)) - { - char *ptr; - if ((ptr=getenv("GMX_PME_NTHREADS")) != NULL) - { - sscanf(ptr,"%d",&nthreads_pme); - } - if (fplog != NULL && nthreads_pme > 1) - { - fprintf(fplog,"Using %d threads for PME\n",nthreads_pme); - } - } - if (PAR(cr)) - { - gmx_bcast_sim(sizeof(nthreads_pme),&nthreads_pme,cr); - } - } + /* Initialize per-node process ID and counters. */ + gmx_init_intra_counters(cr); + +#ifdef GMX_MPI + md_print_info(cr,fplog,"Using %d MPI %s\n", + cr->nnodes, +#ifdef GMX_THREAD_MPI + cr->nnodes==1 ? "thread" : "threads" +#else + cr->nnodes==1 ? "process" : "processes" +#endif + ); #endif - wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pme); + gmx_omp_nthreads_init(fplog, cr, + hwinfo->nthreads_hw_avail, + hw_opt->nthreads_omp, + hw_opt->nthreads_omp_pme, + (cr->duty & DUTY_PP) == 0, + inputrec->cutoff_scheme == ecutsVERLET); + + gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU); + + /* getting number of PP/PME threads + PME: env variable should be read only on one node to make sure it is + identical everywhere; + */ + /* TODO nthreads_pp is only used for pinning threads. + * This is a temporary solution until we have a hw topology library. 
+ */ + nthreads_pp = gmx_omp_nthreads_get(emntNonbonded); + nthreads_pme = gmx_omp_nthreads_get(emntPME); + + wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pp,nthreads_pme); + if (PAR(cr)) { /* Master synchronizes its value of reset_counters with all nodes @@ -696,7 +1487,6 @@ int mdrunner(int nthreads_requested, FILE *fplog,t_commrec *cr,int nfile, wcycle_set_reset_counters(wcycle, reset_counters); } - snew(nrnb,1); if (cr->duty & DUTY_PP) { @@ -717,11 +1507,14 @@ int mdrunner(int nthreads_requested, FILE *fplog,t_commrec *cr,int nfile, /* Initiate forcerecord */ fr = mk_forcerec(); + fr->hwinfo = hwinfo; init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE, opt2fn("-table",nfile,fnm), opt2fn("-tabletf",nfile,fnm), opt2fn("-tablep",nfile,fnm), - opt2fn("-tableb",nfile,fnm),FALSE,pforce); + opt2fn("-tableb",nfile,fnm), + nbpu_opt, + FALSE,pforce); /* version for PCA_NOT_READ_NODE (see md.c) */ /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE, @@ -788,6 +1581,17 @@ int mdrunner(int nthreads_requested, FILE *fplog,t_commrec *cr,int nfile, snew(pmedata,1); } +#if defined GMX_THREAD_MPI + /* With the number of TMPI threads equal to the number of cores + * we already pinned in thread-MPI, so don't pin again here. + */ + if (hw_opt->nthreads_tmpi != tMPI_Thread_get_hw_number()) +#endif + { + /* Set the CPU affinity */ + set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec); + } + /* Initiate PME if necessary, * either on all nodes or on dedicated PME nodes only. */ if (EEL_PME(inputrec->coulombtype)) @@ -802,40 +1606,6 @@ int mdrunner(int nthreads_requested, FILE *fplog,t_commrec *cr,int nfile, gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr); } - - /* Set CPU affinity. Can be important for performance. - On some systems (e.g. Cray) CPU Affinity is set by default. - But default assigning doesn't work (well) with only some ranks - having threads. This causes very low performance. - External tools have cumbersome syntax for setting affinity - in the case that only some ranks have threads. - Thus it is important that GROMACS sets the affinity internally at - if only PME is using threads. - */ - -#ifdef GMX_OPENMP -#ifdef __linux -#ifdef GMX_LIB_MPI - { - int core; - MPI_Comm comm_intra; /* intra communicator (but different to nc.comm_intra includes PME nodes) */ - MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),gmx_node_rank(),&comm_intra); - int local_omp_nthreads = (cr->duty & DUTY_PME) ? nthreads_pme : 1; /* threads on this node */ - MPI_Scan(&local_omp_nthreads,&core, 1, MPI_INT, MPI_SUM, comm_intra); - core-=local_omp_nthreads; /* make exclusive scan */ -#pragma omp parallel firstprivate(core) num_threads(local_omp_nthreads) - { - cpu_set_t mask; - CPU_ZERO(&mask); - core+=gmx_omp_get_thread_num(); - CPU_SET(core,&mask); - sched_setaffinity((pid_t) syscall (SYS_gettid),sizeof(cpu_set_t),&mask); - } - } -#endif /*GMX_MPI*/ -#endif /*__linux*/ -#endif /*GMX_OPENMP*/ - if (cr->duty & DUTY_PME) { status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec, @@ -947,13 +1717,37 @@ int mdrunner(int nthreads_requested, FILE *fplog,t_commrec *cr,int nfile, */ finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm), inputrec,nrnb,wcycle,&runtime, + fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ? 
+ nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL, + nthreads_pp, EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr)); + if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU) + { + char gpu_err_str[STRLEN]; + + /* free GPU memory and uninitialize GPU (by destroying the context) */ + nbnxn_cuda_free(fplog, fr->nbv->cu_nbv); + + if (!free_gpu(gpu_err_str)) + { + gmx_warning("On node %d failed to free GPU #%d: %s", + cr->nodeid, get_current_gpu_device_id(), gpu_err_str); + } + } + if (opt2bSet("-membed",nfile,fnm)) { sfree(membed); } +#ifdef GMX_THREAD_MPI + if (PAR(cr) && SIMMASTER(cr)) +#endif + { + gmx_hardware_info_free(hwinfo); + } + /* Does what it says */ print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime); diff --git a/src/kernel/tpbcmp.c b/src/kernel/tpbcmp.c index 60d80e1b66..6677703f0f 100644 --- a/src/kernel/tpbcmp.c +++ b/src/kernel/tpbcmp.c @@ -587,6 +587,7 @@ static void cmp_inputrec(FILE *fp,t_inputrec *ir1,t_inputrec *ir2,real ftol, rea cmp_int(fp,"inputrec->simulation_part",-1,ir1->simulation_part,ir2->simulation_part); cmp_int(fp,"inputrec->ePBC",-1,ir1->ePBC,ir2->ePBC); cmp_int(fp,"inputrec->bPeriodicMols",-1,ir1->bPeriodicMols,ir2->bPeriodicMols); + cmp_int(fp,"inputrec->cutoff_scheme",-1,ir1->cutoff_scheme,ir2->cutoff_scheme); cmp_int(fp,"inputrec->ns_type",-1,ir1->ns_type,ir2->ns_type); cmp_int(fp,"inputrec->nstlist",-1,ir1->nstlist,ir2->nstlist); cmp_int(fp,"inputrec->ndelta",-1,ir1->ndelta,ir2->ndelta); @@ -603,6 +604,7 @@ static void cmp_inputrec(FILE *fp,t_inputrec *ir1,t_inputrec *ir2,real ftol, rea cmp_double(fp,"inputrec->init_t",-1,ir1->init_t,ir2->init_t,ftol,abstol); cmp_double(fp,"inputrec->delta_t",-1,ir1->delta_t,ir2->delta_t,ftol,abstol); cmp_real(fp,"inputrec->xtcprec",-1,ir1->xtcprec,ir2->xtcprec,ftol,abstol); + cmp_real(fp,"inputrec->fourierspacing",-1,ir1->fourier_spacing,ir2->fourier_spacing,ftol,abstol); cmp_int(fp,"inputrec->nkx",-1,ir1->nkx,ir2->nkx); cmp_int(fp,"inputrec->nky",-1,ir1->nky,ir2->nky); cmp_int(fp,"inputrec->nkz",-1,ir1->nkz,ir2->nkz); @@ -627,6 +629,7 @@ static void cmp_inputrec(FILE *fp,t_inputrec *ir1,t_inputrec *ir2,real ftol, rea cmp_int(fp,"refcoord_scaling",-1,ir1->refcoord_scaling,ir2->refcoord_scaling); cmp_rvec(fp,"inputrec->posres_com",-1,ir1->posres_com,ir2->posres_com,ftol,abstol); cmp_rvec(fp,"inputrec->posres_comB",-1,ir1->posres_comB,ir2->posres_comB,ftol,abstol); + cmp_real(fp,"inputrec->verletbuf_drift",-1,ir1->verletbuf_drift,ir2->verletbuf_drift,ftol,abstol); cmp_real(fp,"inputrec->rlist",-1,ir1->rlist,ir2->rlist,ftol,abstol); cmp_real(fp,"inputrec->rlistlong",-1,ir1->rlistlong,ir2->rlistlong,ftol,abstol); cmp_real(fp,"inputrec->rtpi",-1,ir1->rtpi,ir2->rtpi,ftol,abstol); diff --git a/src/mdlib/CMakeLists.txt b/src/mdlib/CMakeLists.txt index 1b8e219fc3..bb4f80aaeb 100644 --- a/src/mdlib/CMakeLists.txt +++ b/src/mdlib/CMakeLists.txt @@ -1,5 +1,17 @@ -file(GLOB MDLIB_SOURCES *.c) +file(GLOB MDLIB_SOURCES *.c nbnxn_kernels/*.c) + +if(GMX_GPU) + add_subdirectory(nbnxn_cuda) + set(GMX_GPU_LIBRARIES nbnxn_cuda) +endif() + +# apply gcc 4.4.x bug workaround +if(GMX_USE_GCC44_BUG_WORKAROUND) + include(gmxGCC44O3BugWorkaround) + gmx_apply_gcc44_bug_workaround("force.c") + gmx_apply_gcc44_bug_workaround("constr.c") +endif() # Files called xxx_test.c are test drivers with a main() function for # module xxx.c, so they should not be included in the library @@ -9,7 +21,9 @@ list(REMOVE_ITEM MDLIB_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/fftpack.c) endif() add_library(md ${MDLIB_SOURCES}) -target_link_libraries(md gmx 
${GMX_EXTRA_LIBRARIES} ${FFT_LIBRARIES} ${XML_LIBRARIES} ${OpenMP_SHARED_LINKER_FLAGS}) + +target_link_libraries(md ${GMX_GPU_LIBRARIES} gmx ${GMX_EXTRA_LIBRARIES} ${FFT_LIBRARIES} ${XML_LIBRARIES} ${OpenMP_SHARED_LINKER_FLAGS}) + set_target_properties(md PROPERTIES OUTPUT_NAME "md${GMX_LIBS_SUFFIX}" SOVERSION ${SOVERSION} INSTALL_NAME_DIR "${LIB_INSTALL_DIR}" COMPILE_FLAGS "${OpenMP_C_FLAGS}") diff --git a/src/mdlib/calcmu.c b/src/mdlib/calcmu.c index bab3ad8ad9..c3de19f571 100644 --- a/src/mdlib/calcmu.c +++ b/src/mdlib/calcmu.c @@ -46,47 +46,69 @@ #include "physics.h" #include "main.h" #include "calcmu.h" +#include "gmx_omp_nthreads.h" void calc_mu(int start,int homenr,rvec x[],real q[],real qB[], - int nChargePerturbed, - dvec mu,dvec mu_B) + int nChargePerturbed, + dvec mu,dvec mu_B) { - int i,end,m; - - end = start + homenr; - - clear_dvec(mu); - for(i=start; (iblnr,*blbnb=lincsd->blbnb; const int *triangle=lincsd->triangle,*tri_bits=lincsd->tri_bits; - ncons = lincsd->nc; ntriangle = lincsd->ntriangle; nrec = lincsd->nOrder; for(rec=0; recnth == 1) + { + /* Single thread, we simply update for all constraints */ + lincs_update_atoms_noind(li->nc,li->bla,prefac,fac,r,invmass,x); + } + else + { + /* Update the atom vector components for our thread local + * constraints that only access our local atom range. + * This can be done without a barrier. + */ + lincs_update_atoms_ind(li->th[th].nind,li->th[th].ind, + li->bla,prefac,fac,r,invmass,x); + + if (li->th[li->nth].nind > 0) + { + /* Update the constraints that operate on atoms + * in multiple thread atom blocks on the master thread. + */ +#pragma omp barrier +#pragma omp master + { + lincs_update_atoms_ind(li->th[li->nth].nind, + li->th[li->nth].ind, + li->bla,prefac,fac,r,invmass,x); + } + } } } @@ -260,7 +385,7 @@ static void do_lincsp(rvec *x,rvec *f,rvec *fp,t_pbc *pbc, } /* Together: 23*ncons + 6*nrtot flops */ - lincs_matrix_expand(lincsd,blcc,rhs1,rhs2,sol); + lincs_matrix_expand(lincsd,0,ncons,blcc,rhs1,rhs2,sol); /* nrec*(ncons+2*nrtot) flops */ if (econq != econqForce) @@ -340,20 +465,25 @@ static void do_lincsp(rvec *x,rvec *f,rvec *fp,t_pbc *pbc, } static void do_lincs(rvec *x,rvec *xp,matrix box,t_pbc *pbc, - struct gmx_lincsdata *lincsd,real *invmass, + struct gmx_lincsdata *lincsd,int th, + real *invmass, t_commrec *cr, + gmx_bool bCalcLambda, real wangle,int *warn, real invdt,rvec *v, gmx_bool bCalcVir,tensor rmdr) { - int b,i,j,k,n,iter; - real tmp0,tmp1,tmp2,im1,im2,mvb,rlen,len,len2,dlen2,wfac,lam; + int b0,b1,b,i,j,k,n,iter; + real tmp0,tmp1,tmp2,im1,im2,mvb,rlen,len,len2,dlen2,wfac; rvec dx; int ncons,*bla,*blnr,*blbnb; rvec *r; - real *blc,*blmf,*bllen,*blcc,*rhs1,*rhs2,*sol,*lambda; + real *blc,*blmf,*bllen,*blcc,*rhs1,*rhs2,*sol,*blc_sol,*mlambda; int *nlocat; - + + b0 = lincsd->th[th].b0; + b1 = lincsd->th[th].b1; + ncons = lincsd->nc; bla = lincsd->bla; r = lincsd->tmpv; @@ -366,7 +496,8 @@ static void do_lincs(rvec *x,rvec *xp,matrix box,t_pbc *pbc, rhs1 = lincsd->tmp1; rhs2 = lincsd->tmp2; sol = lincsd->tmp3; - lambda = lincsd->lambda; + blc_sol= lincsd->tmp4; + mlambda= lincsd->mlambda; if (DOMAINDECOMP(cr) && cr->dd->constraints) { @@ -386,12 +517,13 @@ static void do_lincs(rvec *x,rvec *xp,matrix box,t_pbc *pbc, if (pbc) { /* Compute normalized i-j vectors */ - for(b=0; bnIter; iter++) { - if (DOMAINDECOMP(cr) && cr->dd->constraints) + if ((DOMAINDECOMP(cr) && cr->dd->constraints) || + PARTDECOMP(cr)) { - /* Communicate the corrected non-local coordinates */ - dd_move_x_constraints(cr->dd,box,xp,NULL); - 
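The do_lincs() rewrite that follows funnels the constraint-coordinate communication through a single thread: all threads synchronize at a barrier, the master thread performs the exchange, and since a master region has no implied barrier, another barrier is needed before the other threads may read the updated coordinates. A self-contained OpenMP sketch of the pattern (the communication is a stand-in):

    #include <omp.h>
    #include <stdio.h>

    /* Illustrative stand-in for the non-local coordinate update
     * that must be done by exactly one thread between iterations. */
    static void communicate_coords(int iter)
    {
        printf("iteration %d: thread %d communicates\n",
               iter, omp_get_thread_num());
    }

    int main(void)
    {
        int iter;

    #pragma omp parallel private(iter)
        {
            for (iter = 0; iter < 2; iter++)
            {
                /* ... threaded constraint work on the local range ... */

                /* All threads must be done writing before one communicates */
    #pragma omp barrier
    #pragma omp master
                communicate_coords(iter);
                /* master has no implied barrier; wait before reading */
    #pragma omp barrier
            }
        }
        return 0;
    }
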
} - else if (PARTDECOMP(cr)) - { - pd_move_x_constraints(cr,xp,NULL); - } +#pragma omp barrier +#pragma omp master + { + /* Communicate the corrected non-local coordinates */ + if (DOMAINDECOMP(cr)) + { + dd_move_x_constraints(cr->dd,box,xp,NULL); + } + else + { + pd_move_x_constraints(cr,xp,NULL); + } + } + } - for(b=0; bnIter = nIter; li->nOrder = nProjOrder; - + + /* LINCS can run on any number of threads. + * Currently the number is fixed for the whole simulation, + * but it could be set in set_lincs(). + */ + li->nth = gmx_omp_nthreads_get(emntLINCS); + if (li->nth == 1) + { + snew(li->th,1); + } + else + { + /* Allocate an extra element for "thread-overlap" constraints */ + snew(li->th,li->nth+1); + } + if (debug) + { + fprintf(debug,"LINCS: using %d threads\n",li->nth); + } + if (bPLINCS || li->ncg_triangle > 0) { please_cite(fplog,"Hess2008a"); @@ -812,6 +945,138 @@ gmx_lincsdata_t init_lincs(FILE *fplog,gmx_mtop_t *mtop, return li; } +/* Sets up the work division over the threads */ +static void lincs_thread_setup(struct gmx_lincsdata *li,int natoms) +{ + lincs_thread_t *li_m; + int th; + unsigned *atf; + int a; + + if (natoms > li->atf_nalloc) + { + li->atf_nalloc = over_alloc_large(natoms); + srenew(li->atf,li->atf_nalloc); + } + + atf = li->atf; + /* Clear the atom flags */ + for(a=0; a<natoms; a++) + { + atf[a] = 0; + } + + for(th=0; th<li->nth; th++) + { + lincs_thread_t *li_th; + int b; + + li_th = &li->th[th]; + + /* The constraints are divided equally over the threads */ + li_th->b0 = (li->nc* th )/li->nth; + li_th->b1 = (li->nc*(th+1))/li->nth; + + if (th < sizeof(*atf)*8) + { + /* For each atom set a flag for constraints from each thread */ + for(b=li_th->b0; b<li_th->b1; b++) + { + atf[li->bla[b*2] ] |= (1U<<th); + atf[li->bla[b*2+1]] |= (1U<<th); + } + } + } + +#pragma omp parallel for num_threads(li->nth) schedule(static) + for(th=0; th<li->nth; th++) + { + lincs_thread_t *li_th; + unsigned mask; + int b; + + li_th = &li->th[th]; + + if (li_th->b1 - li_th->b0 > li_th->ind_nalloc) + { + li_th->ind_nalloc = over_alloc_large(li_th->b1-li_th->b0); + srenew(li_th->ind,li_th->ind_nalloc); + srenew(li_th->ind_r,li_th->ind_nalloc); + } + + if (th < sizeof(*atf)*8) + { + mask = (1U<<th) - 1; + + li_th->nind = 0; + li_th->nind_r = 0; + for(b=li_th->b0; b<li_th->b1; b++) + { + /* We let the constraint with the lowest thread index + * operate on atoms with constraints from multiple threads. + */ + if (((atf[li->bla[b*2]] & mask) == 0) && + ((atf[li->bla[b*2+1]] & mask) == 0)) + { + /* Add the constraint to the local atom update index */ + li_th->ind[li_th->nind++] = b; + } + else + { + /* Add the constraint to the rest block */ + li_th->ind_r[li_th->nind_r++] = b; + } + } + } + else + { + /* We are out of bits, assign all constraints to rest */ + for(b=li_th->b0; b<li_th->b1; b++) + { + li_th->ind_r[li_th->nind_r++] = b; + } + } + } + + /* We need to copy all constraints which have not been assigned + * to a thread to a separate list which will be handled by one thread.
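The classification above uses one bit per thread in an unsigned per-atom flag: thread th marks both atoms of each of its constraints with bit th, and a constraint stays thread-local only when neither atom carries a bit from a lower-indexed thread; with 32-bit flags this works for up to 32 threads, beyond which everything lands in the rest list. A self-contained sketch of the same scheme, on a toy constraint list with two threads:

    #include <stdio.h>

    #define NATOMS 6
    #define NTH    2

    int main(void)
    {
        /* Constraints as atom pairs; constraint 2 couples the two thread ranges */
        int bla[] = { 0,1, 1,2, 2,3, 3,4, 4,5 };
        int nc = 5;
        unsigned atf[NATOMS] = { 0 };
        int th, b;

        for (th = 0; th < NTH; th++)
        {
            for (b = (nc*th)/NTH; b < (nc*(th+1))/NTH; b++)
            {
                atf[bla[b*2]]   |= (1U << th);
                atf[bla[b*2+1]] |= (1U << th);
            }
        }
        for (th = 0; th < NTH; th++)
        {
            unsigned mask = (1U << th) - 1; /* bits of lower-indexed threads */

            for (b = (nc*th)/NTH; b < (nc*(th+1))/NTH; b++)
            {
                int local = ((atf[bla[b*2]]   & mask) == 0 &&
                             (atf[bla[b*2+1]] & mask) == 0);
                printf("constraint %d -> %s (thread %d)\n",
                       b, local ? "local" : "rest", th);
            }
        }
        return 0;
    }

Running this marks constraint 2 as "rest": its atom 2 already carries thread 0's bit, so only the separate rest pass may update it.
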
+ */ + li_m = &li->th[li->nth]; + + li_m->nind = 0; + for(th=0; thnth; th++) + { + lincs_thread_t *li_th; + int b; + + li_th = &li->th[th]; + + if (li_m->nind + li_th->nind_r > li_m->ind_nalloc) + { + li_m->ind_nalloc = over_alloc_large(li_m->nind+li_th->nind_r); + srenew(li_m->ind,li_m->ind_nalloc); + } + + for(b=0; bnind_r; b++) + { + li_m->ind[li_m->nind++] = li_th->ind_r[b]; + } + + if (debug) + { + fprintf(debug,"LINCS thread %d: %d constraints\n", + th,li_th->nind); + } + } + + if (debug) + { + fprintf(debug,"LINCS thread r: %d constraints\n", + li_m->nind); + } +} + + void set_lincs(t_idef *idef,t_mdatoms *md, gmx_bool bDynamics,t_commrec *cr, struct gmx_lincsdata *li) @@ -826,6 +1091,19 @@ void set_lincs(t_idef *idef,t_mdatoms *md, li->nc = 0; li->ncc = 0; + /* Zero the thread index ranges. + * Otherwise without local constraints we could return with old ranges. + */ + for(i=0; inth; i++) + { + li->th[i].b0 = 0; + li->th[i].b1 = 0; + li->th[i].nind = 0; + } + if (li->nth > 1) + { + li->th[li->nth].nind = 0; + } /* This is the local topology, so there are only F_CONSTR constraints */ if (idef->il[F_CONSTR].nr == 0) @@ -880,7 +1158,8 @@ void set_lincs(t_idef *idef,t_mdatoms *md, srenew(li->tmp1,li->nc_alloc); srenew(li->tmp2,li->nc_alloc); srenew(li->tmp3,li->nc_alloc); - srenew(li->lambda,li->nc_alloc); + srenew(li->tmp4,li->nc_alloc); + srenew(li->mlambda,li->nc_alloc); if (li->ncg_triangle > 0) { /* This is allocating too much, but it is difficult to improve */ @@ -985,6 +1264,16 @@ void set_lincs(t_idef *idef,t_mdatoms *md, li->nc,li->ncc); } + if (li->nth == 1) + { + li->th[0].b0 = 0; + li->th[0].b1 = li->nc; + } + else + { + lincs_thread_setup(li,md->nr); + } + set_lincs_matrix(li,md->invmass,md->lambda); } @@ -1144,22 +1433,22 @@ static void dump_conf(gmx_domdec_t *dd,struct gmx_lincsdata *li, } gmx_bool constrain_lincs(FILE *fplog,gmx_bool bLog,gmx_bool bEner, - t_inputrec *ir, - gmx_large_int_t step, - struct gmx_lincsdata *lincsd,t_mdatoms *md, - t_commrec *cr, - rvec *x,rvec *xprime,rvec *min_proj,matrix box, - real lambda,real *dvdlambda, - real invdt,rvec *v, - gmx_bool bCalcVir,tensor rmdr, - int econq, - t_nrnb *nrnb, - int maxwarn,int *warncount) + t_inputrec *ir, + gmx_large_int_t step, + struct gmx_lincsdata *lincsd,t_mdatoms *md, + t_commrec *cr, + rvec *x,rvec *xprime,rvec *min_proj, + matrix box,t_pbc *pbc, + real lambda,real *dvdlambda, + real invdt,rvec *v, + gmx_bool bCalcVir,tensor rmdr, + int econq, + t_nrnb *nrnb, + int maxwarn,int *warncount) { char buf[STRLEN],buf2[22],buf3[STRLEN]; - int i,warn,p_imax,error; + int i,warn=0,p_imax,error; real ncons_loc,p_ssd,p_max=0; - t_pbc pbc,*pbc_null; rvec dx; gmx_bool bOK; @@ -1184,32 +1473,6 @@ gmx_bool constrain_lincs(FILE *fplog,gmx_bool bLog,gmx_bool bEner, return bOK; } - /* We do not need full pbc when constraints do not cross charge groups, - * i.e. when dd->constraint_comm==NULL - */ - if ((cr->dd || ir->bPeriodicMols) && !(cr->dd && cr->dd->constraint_comm==NULL)) - { - /* With pbc=screw the screw has been changed to a shift - * by the constraint coordinate communication routine, - * so that here we can use normal pbc. 
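Both LINCS here and SETTLE further down divide work with the same integer split: thread th owns the half-open range [(n*th)/nth, (n*(th+1))/nth). The ranges tile [0,n) exactly and differ in size by at most one, so no remainder handling is needed; a quick check:

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
        int n = 23, nth = 4, th, prev = 0;

        for (th = 0; th < nth; th++)
        {
            int b0 = (n*  th   )/nth;
            int b1 = (n* (th+1))/nth;

            assert(b0 == prev);          /* ranges tile [0,n) with no gaps */
            assert(b1 - b0 >= n/nth);    /* sizes differ by at most one */
            assert(b1 - b0 <= n/nth + 1);
            printf("thread %d: [%d,%d)\n", th, b0, b1);
            prev = b1;
        }
        assert(prev == n);
        return 0;
    }
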
- */ - pbc_null = set_pbc_dd(&pbc,ir->ePBC,cr->dd,FALSE,box); - } - else - { - pbc_null = NULL; - } - if (cr->dd) - { - /* Communicate the coordinates required for the non-local constraints */ - dd_move_x_constraints(cr->dd,box,x,xprime); - /* dump_conf(dd,lincsd,NULL,"con",TRUE,xprime,box); */ - } - else if (PARTDECOMP(cr)) - { - pd_move_x_constraints(cr,x,xprime); - } - if (econq == econqCoord) { if (ir->efep != efepNO) @@ -1228,12 +1491,12 @@ gmx_bool constrain_lincs(FILE *fplog,gmx_bool bLog,gmx_bool bEner, if (lincsd->ncg_flex) { /* Set the flexible constraint lengths to the old lengths */ - if (pbc_null) + if (pbc != NULL) { for(i=0; inc; i++) { if (lincsd->bllen[i] == 0) { - pbc_dx_aiuc(pbc_null,x[lincsd->bla[2*i]],x[lincsd->bla[2*i+1]],dx); + pbc_dx_aiuc(pbc,x[lincsd->bla[2*i]],x[lincsd->bla[2*i+1]],dx); lincsd->bllen[i] = norm(dx); } } @@ -1254,13 +1517,24 @@ gmx_bool constrain_lincs(FILE *fplog,gmx_bool bLog,gmx_bool bEner, if (bLog && fplog) { - cconerr(cr->dd,lincsd->nc,lincsd->bla,lincsd->bllen,xprime,pbc_null, + cconerr(cr->dd,lincsd->nc,lincsd->bla,lincsd->bllen,xprime,pbc, &ncons_loc,&p_ssd,&p_max,&p_imax); } - - do_lincs(x,xprime,box,pbc_null,lincsd,md->invmass,cr, - ir->LincsWarnAngle,&warn, - invdt,v,bCalcVir,rmdr); + + /* The (only) OpenMP parallel region of constrain_lincs */ + { + int th; + +#pragma omp parallel for num_threads(lincsd->nth) schedule(static) + for(th=0; thnth; th++) + { + do_lincs(x,xprime,box,pbc,lincsd,th, + md->invmass,cr, + bCalcVir || (ir->efep != efepNO), + ir->LincsWarnAngle,&warn, + invdt,v,bCalcVir,rmdr); + } + } if (ir->efep != efepNO) { @@ -1269,7 +1543,7 @@ gmx_bool constrain_lincs(FILE *fplog,gmx_bool bLog,gmx_bool bEner, dt_2 = 1.0/(ir->delta_t*ir->delta_t); for(i=0; (inc); i++) { - dvdl += lincsd->lambda[i]*dt_2*lincsd->ddist[i]; + dvdl -= lincsd->mlambda[i]*dt_2*lincsd->ddist[i]; } *dvdlambda += dvdl; } @@ -1284,7 +1558,7 @@ gmx_bool constrain_lincs(FILE *fplog,gmx_bool bLog,gmx_bool bEner, } if (bLog || bEner) { - cconerr(cr->dd,lincsd->nc,lincsd->bla,lincsd->bllen,xprime,pbc_null, + cconerr(cr->dd,lincsd->nc,lincsd->bla,lincsd->bllen,xprime,pbc, &ncons_loc,&p_ssd,&p_max,&p_imax); /* Check if we are doing the second part of SD */ if (ir->eI == eiSD2 && v == NULL) @@ -1317,7 +1591,7 @@ gmx_bool constrain_lincs(FILE *fplog,gmx_bool bLog,gmx_bool bEner, { if (maxwarn >= 0) { - cconerr(cr->dd,lincsd->nc,lincsd->bla,lincsd->bllen,xprime,pbc_null, + cconerr(cr->dd,lincsd->nc,lincsd->bla,lincsd->bllen,xprime,pbc, &ncons_loc,&p_ssd,&p_max,&p_imax); if (MULTISIM(cr)) { @@ -1340,7 +1614,7 @@ gmx_bool constrain_lincs(FILE *fplog,gmx_bool bLog,gmx_bool bEner, fprintf(fplog,"%s",buf); } fprintf(stderr,"%s",buf); - lincs_warning(fplog,cr->dd,x,xprime,pbc_null, + lincs_warning(fplog,cr->dd,x,xprime,pbc, lincsd->nc,lincsd->bla,lincsd->bllen, ir->LincsWarnAngle,maxwarn,warncount); } @@ -1355,7 +1629,7 @@ gmx_bool constrain_lincs(FILE *fplog,gmx_bool bLog,gmx_bool bEner, } else { - do_lincsp(x,xprime,min_proj,pbc_null,lincsd,md->invmass,econq,dvdlambda, + do_lincsp(x,xprime,min_proj,pbc,lincsd,md->invmass,econq,dvdlambda, bCalcVir,rmdr); } diff --git a/src/mdlib/constr.c b/src/mdlib/constr.c index f12c725dc9..f59b3b6996 100644 --- a/src/mdlib/constr.c +++ b/src/mdlib/constr.c @@ -55,12 +55,16 @@ #include "splitter.h" #include "mtop_util.h" #include "gmxfio.h" +#include "gmx_omp_nthreads.h" typedef struct gmx_constr { int ncon_tot; /* The total number of constraints */ int nflexcon; /* The number of flexible constraints */ int n_at2con_mt; /* The size of 
at2con = #moltypes */ t_blocka *at2con_mt; /* A list of atoms to constraints */ + int n_at2settle_mt; /* The size of at2settle = #moltypes */ + int **at2settle_mt; /* A list of atoms to settles */ + gmx_bool bInterCGsettles; gmx_lincsdata_t lincsd; /* LINCS data */ gmx_shakedata_t shaked; /* SHAKE data */ gmx_settledata_t settled; /* SETTLE data */ @@ -74,6 +78,9 @@ typedef struct gmx_constr { int warncount_settle; gmx_edsam_t ed; /* The essential dynamics data */ + tensor *rmdr_th; /* Thread local working data */ + int *settle_error; /* Thread local working data */ + gmx_mtop_t *warn_mtop; /* Only used for printing warnings */ } t_gmx_constr; @@ -280,28 +287,31 @@ static void pr_sortblock(FILE *fp,const char *title,int nsb,t_sortblock sb[]) } gmx_bool constrain(FILE *fplog,gmx_bool bLog,gmx_bool bEner, - struct gmx_constr *constr, - t_idef *idef,t_inputrec *ir,gmx_ekindata_t *ekind, - t_commrec *cr, - gmx_large_int_t step,int delta_step, - t_mdatoms *md, - rvec *x,rvec *xprime,rvec *min_proj,matrix box, - real lambda,real *dvdlambda, - rvec *v,tensor *vir, - t_nrnb *nrnb,int econq,gmx_bool bPscal,real veta, real vetanew) + struct gmx_constr *constr, + t_idef *idef,t_inputrec *ir,gmx_ekindata_t *ekind, + t_commrec *cr, + gmx_large_int_t step,int delta_step, + t_mdatoms *md, + rvec *x,rvec *xprime,rvec *min_proj, + gmx_bool bMolPBC,matrix box, + real lambda,real *dvdlambda, + rvec *v,tensor *vir, + t_nrnb *nrnb,int econq,gmx_bool bPscal, + real veta, real vetanew) { gmx_bool bOK,bDump; int start,homenr,nrend; int i,j,d; - int ncons,error; + int ncons,settle_error; tensor rmdr; rvec *vstor; real invdt,vir_fac,t; t_ilist *settle; int nsettle; - t_pbc pbc; + t_pbc pbc,*pbc_null; char buf[22]; t_vetavars vetavar; + int nth,th; if (econq == econqForceDispl && !EI_ENERGY_MINIMIZATION(ir->eI)) { @@ -342,10 +352,63 @@ gmx_bool constrain(FILE *fplog,gmx_bool bLog,gmx_bool bEner, } where(); - if (constr->lincsd) + + settle = &idef->il[F_SETTLE]; + nsettle = settle->nr/(1+NRAL(F_SETTLE)); + + if (nsettle > 0) + { + nth = gmx_omp_nthreads_get(emntSETTLE); + } + else + { + nth = 1; + } + + if (nth > 1 && constr->rmdr_th == NULL) + { + snew(constr->rmdr_th,nth); + snew(constr->settle_error,nth); + } + + settle_error = -1; + + /* We do not need full pbc when constraints do not cross charge groups, + * i.e. when dd->constraint_comm==NULL. + * Note that PBC for constraints is different from PBC for bondeds. + * For constraints there is both forward and backward communication. + */ + if (ir->ePBC != epbcNONE && + (cr->dd || bMolPBC) && !(cr->dd && cr->dd->constraint_comm==NULL)) + { + /* With pbc=screw the screw has been changed to a shift + * by the constraint coordinate communication routine, + * so that here we can use normal pbc. + */ + pbc_null = set_pbc_dd(&pbc,ir->ePBC,cr->dd,FALSE,box); + } + else + { + pbc_null = NULL; + } + + /* Communicate the coordinates required for the non-local constraints + * for LINCS and/or SETTLE. 
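Thread 0 accumulates directly into the caller's rmdr tensor while every other thread gets a private constr->rmdr_th[th], cleared before use and folded back in serially after the parallel loop (the combine step appears further down). That avoids atomics on the 3x3 virial accumulation; a minimal sketch of the pattern, assuming a plain double[3][3] tensor:

    #include <stdio.h>
    #include <string.h>

    typedef double tensor[3][3];

    static void t_add(tensor dest, tensor a)
    {
        int i, j;
        for (i = 0; i < 3; i++)
            for (j = 0; j < 3; j++)
                dest[i][j] += a[i][j];
    }

    int main(void)
    {
        enum { NTH = 4 };
        tensor rmdr = { { 0 } };
        tensor rmdr_th[NTH];
        int th;

    #pragma omp parallel for num_threads(NTH) schedule(static)
        for (th = 0; th < NTH; th++)
        {
            tensor *t = (th == 0) ? &rmdr : &rmdr_th[th];

            if (th > 0)
            {
                memset(rmdr_th[th], 0, sizeof(tensor)); /* clear_mat() */
            }
            (*t)[0][0] += 1.0; /* stand-in for the real virial contribution */
        }
        /* Serial reduction of the other threads' accumulators */
        for (th = 1; th < NTH; th++)
        {
            t_add(rmdr, rmdr_th[th]);
        }
        printf("rmdr[0][0] = %g\n", rmdr[0][0]);
        return 0;
    }
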
+ */ + if (cr->dd) + { + dd_move_x_constraints(cr->dd,box,x,xprime); + } + else if (PARTDECOMP(cr)) + { + pd_move_x_constraints(cr,x,xprime); + } + + if (constr->lincsd != NULL) { bOK = constrain_lincs(fplog,bLog,bEner,ir,step,constr->lincsd,md,cr, - x,xprime,min_proj,box,lambda,dvdlambda, + x,xprime,min_proj, + box,pbc_null,lambda,dvdlambda, invdt,v,vir!=NULL,rmdr, econq,nrnb, constr->maxwarn,&constr->warncount_lincs); @@ -366,24 +429,22 @@ gmx_bool constrain(FILE *fplog,gmx_bool bLog,gmx_bool bEner, case (econqCoord): bOK = bshakef(fplog,constr->shaked, homenr,md->invmass,constr->nblocks,constr->sblock, - idef,ir,box,x,xprime,nrnb, + idef,ir,x,xprime,nrnb, constr->lagr,lambda,dvdlambda, - invdt,v,vir!=NULL,rmdr,constr->maxwarn>=0,econq, - &vetavar); + invdt,v,vir!=NULL,rmdr,constr->maxwarn>=0,econq,&vetavar); break; case (econqVeloc): bOK = bshakef(fplog,constr->shaked, homenr,md->invmass,constr->nblocks,constr->sblock, - idef,ir,box,x,min_proj,nrnb, + idef,ir,x,min_proj,nrnb, constr->lagr,lambda,dvdlambda, - invdt,NULL,vir!=NULL,rmdr,constr->maxwarn>=0,econq, - &vetavar); + invdt,NULL,vir!=NULL,rmdr,constr->maxwarn>=0,econq,&vetavar); break; default: gmx_fatal(FARGS,"Internal error, SHAKE called for constraining something else than coordinates"); break; } - + if (!bOK && constr->maxwarn >= 0) { if (fplog != NULL) @@ -394,18 +455,48 @@ gmx_bool constrain(FILE *fplog,gmx_bool bLog,gmx_bool bEner, bDump = TRUE; } } - - settle = &idef->il[F_SETTLE]; - if (settle->nr > 0) + + if (nsettle > 0) { - nsettle = settle->nr/4; - + int calcvir_atom_end; + + if (vir == NULL) + { + calcvir_atom_end = 0; + } + else + { + calcvir_atom_end = md->start + md->homenr; + } + switch (econq) { case econqCoord: - csettle(constr->settled, - nsettle,settle->iatoms,x[0],xprime[0], - invdt,v?v[0]:NULL,vir!=NULL,rmdr,&error,&vetavar); +#pragma omp parallel for num_threads(nth) schedule(static) + for(th=0; th 0) + { + clear_mat(constr->rmdr_th[th]); + } + + start_th = (nsettle* th )/nth; + end_th = (nsettle*(th+1))/nth; + if (start_th >= 0 && end_th - start_th > 0) + { + csettle(constr->settled, + end_th-start_th, + settle->iatoms+start_th*(1+NRAL(F_SETTLE)), + pbc_null, + x[0],xprime[0], + invdt,v?v[0]:NULL,calcvir_atom_end, + th == 0 ? rmdr : constr->rmdr_th[th], + th == 0 ? &settle_error : &constr->settle_error[th], + &vetavar); + } + } inc_nrnb(nrnb,eNR_SETTLE,nsettle); if (v != NULL) { @@ -415,15 +506,66 @@ gmx_bool constrain(FILE *fplog,gmx_bool bLog,gmx_bool bEner, { inc_nrnb(nrnb,eNR_CONSTR_VIR,nsettle*3); } - - bOK = (error < 0); - if (!bOK && constr->maxwarn >= 0) + break; + case econqVeloc: + case econqDeriv: + case econqForce: + case econqForceDispl: +#pragma omp parallel for num_threads(nth) schedule(static) + for(th=0; th 0) + { + clear_mat(constr->rmdr_th[th]); + } + + start_th = (nsettle* th )/nth; + end_th = (nsettle*(th+1))/nth; + + if (start_th >= 0 && end_th - start_th > 0) + { + settle_proj(fplog,constr->settled,econq, + end_th-start_th, + settle->iatoms+start_th*(1+NRAL(F_SETTLE)), + pbc_null, + x, + xprime,min_proj,calcvir_atom_end, + th == 0 ? 
rmdr : constr->rmdr_th[th], + &vetavar); + } + } + /* This is an overestimate */ + inc_nrnb(nrnb,eNR_SETTLE,nsettle); + break; + case econqDeriv_FlexCon: + /* Nothing to do, since there are no flexible constraints in settles */ + break; + default: + gmx_incons("Unknown constraint quantity for settle"); + } + } + + if (settle->nr > 0) + { + /* Combine virial and error info of the other threads */ + for(i=1; i<nth; i++) + { + m_add(rmdr,constr->rmdr_th[i],rmdr); + settle_error = constr->settle_error[i]; + } + + if (econq == econqCoord && settle_error >= 0) + { + bOK = FALSE; + if (constr->maxwarn >= 0) { char buf[256]; sprintf(buf, "\nstep " gmx_large_int_pfmt ": Water molecule starting at atom %d can not be " "settled.\nCheck for bad contacts and/or reduce the timestep if appropriate.\n", - step,ddglatnr(cr->dd,settle->iatoms[error*4+1])); + step,ddglatnr(cr->dd,settle->iatoms[settle_error*(1+NRAL(F_SETTLE))+1])); if (fplog) { fprintf(fplog,"%s",buf); @@ -435,26 +577,10 @@ gmx_bool constrain(FILE *fplog,gmx_bool bLog,gmx_bool bEner, too_many_constraint_warnings(-1,constr->warncount_settle); } bDump = TRUE; - break; - case econqVeloc: - case econqDeriv: - case econqForce: - case econqForceDispl: - settle_proj(fplog,constr->settled,econq, - nsettle,settle->iatoms,x, - xprime,min_proj,vir!=NULL,rmdr,&vetavar); - /* This is an overestimate */ - inc_nrnb(nrnb,eNR_SETTLE,nsettle); - break; - case econqDeriv_FlexCon: - /* Nothing to do, since the are no flexible constraints in settles */ - break; - default: - gmx_incons("Unknown constraint quantity for settle"); - break; - } } } - + free_vetavars(&vetavar); if (vir != NULL) @@ -725,6 +851,30 @@ t_blocka make_at2con(int start,int natoms, return at2con; } +static int *make_at2settle(int natoms,const t_ilist *ilist) +{ + int *at2s; + int a,stride,s; + + snew(at2s,natoms); + /* Set all to no settle */ + for(a=0; a<natoms; a++) + { + at2s[a] = -1; + } + + stride = 1 + NRAL(F_SETTLE); + + for(s=0; s<ilist->nr; s+=stride) + { + at2s[ilist->iatoms[s+1]] = s/stride; + at2s[ilist->iatoms[s+2]] = s/stride; + at2s[ilist->iatoms[s+3]] = s/stride; + } + + return at2s; +} + void set_constraints(struct gmx_constr *constr, gmx_localtop_t *top,t_inputrec *ir, t_mdatoms *md,t_commrec *cr) @@ -1032,7 +1182,9 @@ gmx_constr_t init_constraints(FILE *fplog, if (nset > 0) { please_cite(fplog,"Miyamoto92a"); - + + constr->bInterCGsettles = inter_charge_group_settles(mtop); + /* Check that we have only one settle type */ settle_type = -1; iloop = gmx_mtop_ilistloop_init(mtop); @@ -1056,6 +1208,15 @@ gmx_constr_t init_constraints(FILE *fplog, } } } + + constr->n_at2settle_mt = mtop->nmoltype; + snew(constr->at2settle_mt,constr->n_at2settle_mt); + for(mt=0; mt<mtop->nmoltype; mt++) + { + constr->at2settle_mt[mt] = + make_at2settle(mtop->moltype[mt].atoms.nr, + &mtop->moltype[mt].ilist[F_SETTLE]); + } } constr->maxwarn = 999; @@ -1097,46 +1258,104 @@ gmx_constr_t init_constraints(FILE *fplog, return constr; } -t_blocka *atom2constraints_moltype(gmx_constr_t constr) +const t_blocka *atom2constraints_moltype(gmx_constr_t constr) { return constr->at2con_mt; } +const int **atom2settle_moltype(gmx_constr_t constr) +{ + return (const int **)constr->at2settle_mt; +} + -gmx_bool inter_charge_group_constraints(gmx_mtop_t *mtop) +gmx_bool inter_charge_group_constraints(const gmx_mtop_t *mtop) { - const gmx_moltype_t *molt; - const t_block *cgs; - const t_ilist *il; - int mb; - int nat,*at2cg,cg,a,ftype,i; - gmx_bool bInterCG; - - bInterCG = FALSE; - for(mb=0; mb<mtop->nmolblock && !bInterCG; mb++) { - molt = &mtop->moltype[mtop->molblock[mb].type]; - - if (molt->ilist[F_CONSTR].nr > 0 || - molt->ilist[F_CONSTRNC].nr > 0) { - cgs = 
&molt->cgs; - snew(at2cg,molt->atoms.nr); - for(cg=0; cgnr; cg++) { - for(a=cgs->index[cg]; aindex[cg+1]; a++) - at2cg[a] = cg; - } - - for(ftype=F_CONSTR; ftype<=F_CONSTRNC; ftype++) { - il = &molt->ilist[ftype]; - for(i=0; inr && !bInterCG; i+=3) { - if (at2cg[il->iatoms[i+1]] != at2cg[il->iatoms[i+2]]) - bInterCG = TRUE; - } - } - sfree(at2cg); + const gmx_moltype_t *molt; + const t_block *cgs; + const t_ilist *il; + int mb; + int nat,*at2cg,cg,a,ftype,i; + gmx_bool bInterCG; + + bInterCG = FALSE; + for(mb=0; mbnmolblock && !bInterCG; mb++) + { + molt = &mtop->moltype[mtop->molblock[mb].type]; + + if (molt->ilist[F_CONSTR].nr > 0 || + molt->ilist[F_CONSTRNC].nr > 0 || + molt->ilist[F_SETTLE].nr > 0) + { + cgs = &molt->cgs; + snew(at2cg,molt->atoms.nr); + for(cg=0; cgnr; cg++) + { + for(a=cgs->index[cg]; aindex[cg+1]; a++) + at2cg[a] = cg; + } + + for(ftype=F_CONSTR; ftype<=F_CONSTRNC; ftype++) + { + il = &molt->ilist[ftype]; + for(i=0; inr && !bInterCG; i+=1+NRAL(ftype)) + { + if (at2cg[il->iatoms[i+1]] != at2cg[il->iatoms[i+2]]) + { + bInterCG = TRUE; + } + } + } + + sfree(at2cg); + } + } + + return bInterCG; +} + +gmx_bool inter_charge_group_settles(const gmx_mtop_t *mtop) +{ + const gmx_moltype_t *molt; + const t_block *cgs; + const t_ilist *il; + int mb; + int nat,*at2cg,cg,a,ftype,i; + gmx_bool bInterCG; + + bInterCG = FALSE; + for(mb=0; mbnmolblock && !bInterCG; mb++) + { + molt = &mtop->moltype[mtop->molblock[mb].type]; + + if (molt->ilist[F_SETTLE].nr > 0) + { + cgs = &molt->cgs; + snew(at2cg,molt->atoms.nr); + for(cg=0; cgnr; cg++) + { + for(a=cgs->index[cg]; aindex[cg+1]; a++) + at2cg[a] = cg; + } + + for(ftype=F_SETTLE; ftype<=F_SETTLE; ftype++) + { + il = &molt->ilist[ftype]; + for(i=0; inr && !bInterCG; i+=1+NRAL(F_SETTLE)) + { + if (at2cg[il->iatoms[i+1]] != at2cg[il->iatoms[i+2]] || + at2cg[il->iatoms[i+1]] != at2cg[il->iatoms[i+3]]) + { + bInterCG = TRUE; + } + } + } + + sfree(at2cg); + } } - } - return bInterCG; + return bInterCG; } /* helper functions for andersen temperature control, because the diff --git a/src/mdlib/csettle.c b/src/mdlib/csettle.c index eea04e2e3c..54c98ec302 100644 --- a/src/mdlib/csettle.c +++ b/src/mdlib/csettle.c @@ -43,6 +43,7 @@ #include "constr.h" #include "gmx_fatal.h" #include "smalloc.h" +#include "pbc.h" typedef struct { @@ -56,7 +57,7 @@ typedef struct real ra; real rb; real rc; - real rc2; + real irc2; /* For projection */ real imO; real imH; @@ -120,7 +121,7 @@ static void settleparam_init(settleparam_t *p, p->rc = dHH/2.0; p->ra = 2.0*p->wh*sqrt(dOH*dOH - p->rc*p->rc)/p->wohh; p->rb = sqrt(dOH*dOH - p->rc*p->rc) - p->ra; - p->rc2 = dHH; + p->irc2 = 1.0/dHH; p->wo /= p->wohh; p->wh /= p->wohh; @@ -132,8 +133,8 @@ static void settleparam_init(settleparam_t *p, { fprintf(debug,"wo = %g, wh =%g, wohh = %g, rc = %g, ra = %g\n", p->wo,p->wh,p->wohh,p->rc,p->ra); - fprintf(debug,"rb = %g, rc2 = %g, dHH = %g, dOH = %g\n", - p->rb,p->rc2,p->dHH,p->dOH); + fprintf(debug,"rb = %g, irc2 = %g, dHH = %g, dOH = %g\n", + p->rb,p->irc2,p->dHH,p->dOH); } } @@ -170,9 +171,11 @@ static void check_cons(FILE *fp,char *title,real x[],int OW1,int HW2,int HW3) void settle_proj(FILE *fp, gmx_settledata_t settled,int econq, - int nsettle, t_iatom iatoms[],rvec x[], + int nsettle, t_iatom iatoms[], + const t_pbc *pbc, + rvec x[], rvec *der,rvec *derp, - gmx_bool bCalcVir,tensor rmdder,t_vetavars *vetavar) + int calcvir_atom_end,tensor rmdder,t_vetavars *vetavar) { /* Settle for projection out constraint components * of derivatives of the coordinates. 
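inter_charge_group_settles() above follows the same recipe as the constraints variant: invert the charge-group block structure into a per-atom group index, then flag any SETTLE whose three atoms do not all share one group. The inversion is the standard block-to-element expansion, sketched here with toy numbers:

    #include <stdio.h>

    int main(void)
    {
        /* Block index in GROMACS t_block style: group g owns
         * elements index[g] .. index[g+1]-1 */
        int index[] = { 0, 3, 5, 9 };
        int ngroups = 3, at2cg[9];
        int cg, a;

        for (cg = 0; cg < ngroups; cg++)
        {
            for (a = index[cg]; a < index[cg+1]; a++)
            {
                at2cg[a] = cg;
            }
        }
        /* A SETTLE over atoms (2,3,4) now shows up as inter-group,
         * since at2cg[2] != at2cg[3] */
        printf("at2cg[2]=%d at2cg[3]=%d at2cg[4]=%d\n",
               at2cg[2], at2cg[3], at2cg[4]);
        return 0;
    }
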
@@ -188,6 +191,8 @@ void settle_proj, real invvscale,vscale_nhc,veta; real kfacOH,kfacHH; + calcvir_atom_end *= DIM; + if (econq == econqForce) { p = &settled->mass1; @@ -230,12 +235,21 @@ void settle_proj, } /* 27 flops */ - for(m=0; mvscale,rmdder); -} - - -/* Our local shake routine to be used when settle breaks down due to a zero determinant */ -static int xshake(real b4[], real after[], real dOH, real dHH, real mO, real mH) -{ - real bondsq[3]; - real bond[9]; - real invmass[3]; - real M2[3]; - int iconv; - int iatom[3]={0,0,1}; - int jatom[3]={1,2,2}; - real rijx,rijy,rijz,tx,ty,tz,im,jm,acor,rp,diff; - int i,ll,ii,jj,l3,ix,iy,iz,jx,jy,jz,conv; - - invmass[0]=1.0/mO; - invmass[1]=1.0/mH; - invmass[2]=1.0/mH; - - bondsq[0]=dOH*dOH; - bondsq[1]=bondsq[0]; - bondsq[2]=dHH*dHH; - - M2[0]=1.0/(2.0*(invmass[0]+invmass[1])); - M2[1]=M2[0]; - M2[2]=1.0/(2.0*(invmass[1]+invmass[2])); - - for(ll=0;ll<3;ll++) { - l3=3*ll; - ix=3*iatom[ll]; - jx=3*jatom[ll]; - for(i=0;i<3;i++) - bond[l3+i]= b4[ix+i] - b4[jx+i]; - } - - for(i=0,iconv=0;i<1000 && iconv<3; i++) { - for(ll=0;ll<3;ll++) { - ii = iatom[ll]; - jj = jatom[ll]; - l3 = 3*ll; - ix = 3*ii; - jx = 3*jj; - iy = ix+1; - jy = jx+1; - iz = ix+2; - jz = jx+2; - - rijx = bond[l3]; - rijy = bond[l3+1]; - rijz = bond[l3+2]; - - - tx = after[ix]-after[jx]; - ty = after[iy]-after[jy]; - tz = after[iz]-after[jz]; - - rp = tx*tx+ty*ty+tz*tz; - diff = bondsq[ll] - rp; - if(fabs(diff)<1e-8) { - iconv++; - } else { - rp = rijx*tx+rijy*ty+rijz*tz; - if(rp<1e-8) { - return -1; - } - acor = diff*M2[ll]/rp; - im = invmass[ii]; - jm = invmass[jj]; - tx = rijx*acor; - ty = rijy*acor; - tz = rijz*acor; - after[ix] += tx*im; - after[iy] += ty*im; - after[iz] += tz*im; - after[jx] -= tx*jm; - after[jy] -= ty*jm; - after[jz] -= tz*jm; - } + if (calcvir_atom_end > 0) + { + /* Correct rmdder, which will be used to calculate the virial; + * we need to use the unscaled multipliers in the virial. + */ + msmul(rmdder,1.0/vetavar->vscale,rmdder); } - } - return 0; } void csettle(gmx_settledata_t settled, - int nsettle, t_iatom iatoms[],real b4[], real after[], - real invdt,real *v,gmx_bool bCalcVir,tensor rmdr,int *error,t_vetavars *vetavar) + int nsettle, t_iatom iatoms[], + const t_pbc *pbc, + real b4[], real after[], + real invdt,real *v,int CalcVirAtomEnd, + tensor rmdr,int *error,t_vetavars *vetavar) { /* ***************************************************************** */ /* ** */ @@ -395,7 +334,7 @@ void csettle(gmx_settledata_t settled, /* These three weights need have double precision. Using single precision * can result in huge velocity and pressure deviations.
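Replacing the gmx_bool bCalcVir argument with an atom-index limit lets the same kernel both skip virial accumulation entirely (limit 0) and restrict it to home atoms when contributions would otherwise be double-counted. Coordinates are stored flat, which is presumably why the limit is pre-multiplied by DIM and compared against the flattened oxygen index; an illustrative test with made-up data:

    #include <stdio.h>

    int main(void)
    {
        /* Flattened coordinates: atom a lives at x[3*a .. 3*a+2] */
        int iatoms[] = { 0, 5, 6, 7 };   /* one settle: type, OW, HW1, HW2 */
        int homenr = 6;                  /* atoms 0..5 are home atoms */
        int calcvir_atom_end = homenr * 3;
        int ow1 = iatoms[1] * 3;         /* flat index of the oxygen */

        if (ow1 < calcvir_atom_end)
        {
            printf("accumulate virial for settle at atom %d\n", iatoms[1]);
        }
        else
        {
            printf("skip virial: atom %d is not a home atom\n", iatoms[1]);
        }
        return 0;
    }
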
*/ double wo,wh,wohh; - real ra,rb,rc,rc2,dOH,dHH; + real ra,rb,rc,irc2,dOH,dHH; /* Local variables */ real gama, beta, alpa, xcom, ycom, zcom, al2be2, tmp, tmp2; @@ -412,12 +351,17 @@ void csettle(gmx_settledata_t settled, real dax, day, daz, dbx, dby, dbz, dcx, dcy, dcz; real mdax, mday, mdaz, mdbx, mdby, mdbz, mdcx, mdcy, mdcz; - int doshake; + gmx_bool bOK; - int i, shakeret, ow1, hw2, hw3; + int i, ow1, hw2, hw3; + + rvec dx,sh_hw2={0,0,0},sh_hw3={0,0,0}; + int is; *error = -1; + CalcVirAtomEnd *= 3; + p = &settled->massw; mO = p->mO; mH = p->mH; @@ -427,30 +371,70 @@ void csettle(gmx_settledata_t settled, rc = p->rc; ra = p->ra; rb = p->rb; - rc2 = p->rc2; + irc2 = p->irc2; dOH = p->dOH; dHH = p->dHH; mOs = mO / vetavar->rvscale; mHs = mH / vetavar->rvscale; - invdts = invdt/(vetavar->rscale); + invdts = invdt / vetavar->rscale; #ifdef PRAGMAS #pragma ivdep #endif for (i = 0; i < nsettle; ++i) { - doshake = 0; + bOK = TRUE; /* --- Step1 A1' --- */ ow1 = iatoms[i*4+1] * 3; hw2 = iatoms[i*4+2] * 3; hw3 = iatoms[i*4+3] * 3; - xb0 = b4[hw2 ] - b4[ow1]; - yb0 = b4[hw2 + 1] - b4[ow1 + 1]; - zb0 = b4[hw2 + 2] - b4[ow1 + 2]; - xc0 = b4[hw3 ] - b4[ow1]; - yc0 = b4[hw3 + 1] - b4[ow1 + 1]; - zc0 = b4[hw3 + 2] - b4[ow1 + 2]; - /* 6 flops */ + if (pbc == NULL) + { + xb0 = b4[hw2 ] - b4[ow1]; + yb0 = b4[hw2 + 1] - b4[ow1 + 1]; + zb0 = b4[hw2 + 2] - b4[ow1 + 2]; + xc0 = b4[hw3 ] - b4[ow1]; + yc0 = b4[hw3 + 1] - b4[ow1 + 1]; + zc0 = b4[hw3 + 2] - b4[ow1 + 2]; + /* 6 flops */ + } + else + { + pbc_dx_aiuc(pbc,b4+hw2,b4+ow1,dx); + xb0 = dx[XX]; + yb0 = dx[YY]; + zb0 = dx[ZZ]; + pbc_dx_aiuc(pbc,b4+hw3,b4+ow1,dx); + xc0 = dx[XX]; + yc0 = dx[YY]; + zc0 = dx[ZZ]; + + /* Tedious way of doing pbc */ + is = pbc_dx_aiuc(pbc,after+hw2,after+ow1,dx); + if (is == CENTRAL) + { + clear_rvec(sh_hw2); + } + else + { + sh_hw2[XX] = after[hw2 ] - (after[ow1 ] + dx[XX]); + sh_hw2[YY] = after[hw2 + 1] - (after[ow1 + 1] + dx[YY]); + sh_hw2[ZZ] = after[hw2 + 2] - (after[ow1 + 2] + dx[ZZ]); + rvec_dec(after+hw2,sh_hw2); + } + is = pbc_dx_aiuc(pbc,after+hw3,after+ow1,dx); + if (is == CENTRAL) + { + clear_rvec(sh_hw3); + } + else + { + sh_hw3[XX] = after[hw3 ] - (after[ow1 ] + dx[XX]); + sh_hw3[YY] = after[hw3 + 1] - (after[ow1 + 1] + dx[YY]); + sh_hw3[ZZ] = after[hw3 + 2] - (after[ow1 + 2] + dx[ZZ]); + rvec_dec(after+hw3,sh_hw3); + } + } xcom = (after[ow1 ] * wo + (after[hw2 ] + after[hw3 ]) * wh); ycom = (after[ow1 + 1] * wo + (after[hw2 + 1] + after[hw3 + 1]) * wh); @@ -511,27 +495,24 @@ void csettle(gmx_settledata_t settled, zc1d = trns13 * xc1 + trns23 * yc1 + trns33 * zc1; /* 65 flops */ - sinphi = za1d / ra; + sinphi = za1d * gmx_invsqrt(ra*ra); tmp = 1.0 - sinphi * sinphi; if (tmp <= 0) { - *error = i; - doshake = 1; - cosphi = 0; - } - else - cosphi = tmp*gmx_invsqrt(tmp); - sinpsi = (zb1d - zc1d) / (rc2 * cosphi); - tmp2 = 1.0 - sinpsi * sinpsi; - if (tmp2 <= 0) { - *error = i; - doshake = 1; - cospsi = 0; + bOK = FALSE; + } else { + tmp2 = gmx_invsqrt(tmp); + cosphi = tmp*tmp2; + sinpsi = (zb1d - zc1d) * irc2 * tmp2; + tmp2 = 1.0 - sinpsi * sinpsi; + if (tmp2 <= 0) { + bOK = FALSE; + } else { + cospsi = tmp2*gmx_invsqrt(tmp2); + } } - else - cospsi = tmp2*gmx_invsqrt(tmp2); /* 46 flops */ - if(!doshake) { + if (bOK) { ya2d = ra * cosphi; xb2d = -rc * cospsi; t1 = -rb * cosphi; @@ -546,11 +527,11 @@ void csettle(gmx_settledata_t settled, gama = xb0d * yb1d - xb1d * yb0d + xc0d * yc1d - xc1d * yc0d; al2be2 = alpa * alpa + beta * beta; tmp2 = (al2be2 - gama * gama); - sinthe = (alpa * gama - beta * 
tmp2*gmx_invsqrt(tmp2)) / al2be2; + sinthe = (alpa * gama - beta * tmp2*gmx_invsqrt(tmp2)) * gmx_invsqrt(al2be2*al2be2); /* 47 flops */ /* --- Step4 A3' --- */ - tmp2 = 1.0 - sinthe *sinthe; + tmp2 = 1.0 - sinthe * sinthe; costhe = tmp2*gmx_invsqrt(tmp2); xa3d = -ya2d * sinthe; ya3d = ya2d * costhe; @@ -585,6 +566,12 @@ void csettle(gmx_settledata_t settled, after[hw3 + 2] = zcom + zc3; /* 9 flops */ + if (pbc != NULL) + { + rvec_inc(after+hw2,sh_hw2); + rvec_inc(after+hw3,sh_hw3); + } + dax = xa3 - xa1; day = ya3 - ya1; daz = za3 - za1; @@ -596,7 +583,7 @@ void csettle(gmx_settledata_t settled, dcz = zc3 - zc1; /* 9 flops, counted with the virial */ - if (v) { + if (v != NULL) { v[ow1] += dax*invdts; v[ow1 + 1] += day*invdts; v[ow1 + 2] += daz*invdts; @@ -609,7 +596,7 @@ void csettle(gmx_settledata_t settled, /* 3*6 flops */ } - if (bCalcVir) { + if (ow1 < CalcVirAtomEnd) { mdax = mOs*dax; mday = mOs*day; mdaz = mOs*daz; @@ -619,22 +606,19 @@ void csettle(gmx_settledata_t settled, mdcx = mHs*dcx; mdcy = mHs*dcy; mdcz = mHs*dcz; - rmdr[XX][XX] -= b4[ow1]*mdax + b4[hw2]*mdbx + b4[hw3]*mdcx; - rmdr[XX][YY] -= b4[ow1]*mday + b4[hw2]*mdby + b4[hw3]*mdcy; - rmdr[XX][ZZ] -= b4[ow1]*mdaz + b4[hw2]*mdbz + b4[hw3]*mdcz; - rmdr[YY][XX] -= b4[ow1+1]*mdax + b4[hw2+1]*mdbx + b4[hw3+1]*mdcx; - rmdr[YY][YY] -= b4[ow1+1]*mday + b4[hw2+1]*mdby + b4[hw3+1]*mdcy; - rmdr[YY][ZZ] -= b4[ow1+1]*mdaz + b4[hw2+1]*mdbz + b4[hw3+1]*mdcz; - rmdr[ZZ][XX] -= b4[ow1+2]*mdax + b4[hw2+2]*mdbx + b4[hw3+2]*mdcx; - rmdr[ZZ][YY] -= b4[ow1+2]*mday + b4[hw2+2]*mdby + b4[hw3+2]*mdcy; - rmdr[ZZ][ZZ] -= b4[ow1+2]*mdaz + b4[hw2+2]*mdbz + b4[hw3+2]*mdcz; + rmdr[XX][XX] -= b4[ow1 ]*mdax + (b4[ow1 ]+xb0)*mdbx + (b4[ow1 ]+xc0)*mdcx; + rmdr[XX][YY] -= b4[ow1 ]*mday + (b4[ow1 ]+xb0)*mdby + (b4[ow1 ]+xc0)*mdcy; + rmdr[XX][ZZ] -= b4[ow1 ]*mdaz + (b4[ow1 ]+xb0)*mdbz + (b4[ow1 ]+xc0)*mdcz; + rmdr[YY][XX] -= b4[ow1+1]*mdax + (b4[ow1+1]+yb0)*mdbx + (b4[ow1+1]+yc0)*mdcx; + rmdr[YY][YY] -= b4[ow1+1]*mday + (b4[ow1+1]+yb0)*mdby + (b4[ow1+1]+yc0)*mdcy; + rmdr[YY][ZZ] -= b4[ow1+1]*mdaz + (b4[ow1+1]+yb0)*mdbz + (b4[ow1+1]+yc0)*mdcz; + rmdr[ZZ][XX] -= b4[ow1+2]*mdax + (b4[ow1+2]+zb0)*mdbx + (b4[ow1+2]+zc0)*mdcx; + rmdr[ZZ][YY] -= b4[ow1+2]*mday + (b4[ow1+2]+zb0)*mdby + (b4[ow1+2]+zc0)*mdcy; + rmdr[ZZ][ZZ] -= b4[ow1+2]*mdaz + (b4[ow1+2]+zb0)*mdbz + (b4[ow1+2]+zc0)*mdcz; /* 3*24 - 9 flops */ } } else { - /* If we couldn't settle this water, try a simplified iterative shake instead */ - /* no pressure control in here yet */ - if(xshake(b4+ow1,after+ow1,dOH,dHH,mO,mH)!=0) - *error=i; + *error = i; } #ifdef DEBUG if (debug) diff --git a/src/mdlib/domdec.c b/src/mdlib/domdec.c index 472778af04..ad1b8c8077 100644 --- a/src/mdlib/domdec.c +++ b/src/mdlib/domdec.c @@ -27,6 +27,8 @@ #include #include "typedefs.h" #include "smalloc.h" +#include "gmx_fatal.h" +#include "gmx_fatal_collective.h" #include "vec.h" #include "domdec.h" #include "domdec_network.h" @@ -50,6 +52,9 @@ #include "gmxfio.h" #include "gmx_ga2la.h" #include "gmx_sort.h" +#include "nbnxn_search.h" +#include "bondf.h" +#include "gmx_omp_nthreads.h" #ifdef GMX_LIB_MPI #include @@ -139,7 +144,8 @@ typedef struct typedef struct { - gmx_cgsort_t *sort1,*sort2; + gmx_cgsort_t *sort; + gmx_cgsort_t *sort2; int sort_nalloc; gmx_cgsort_t *sort_new; int sort_new_nalloc; @@ -177,12 +183,24 @@ typedef struct { real min0; /* The minimum bottom of this zone */ real max1; /* The maximum top of this zone */ + real min1; /* The minimum top of this zone */ real mch0; /* The maximum bottom communicaton height for 
this zone */ real mch1; /* The maximum top communicaton height for this zone */ real p1_0; /* The bottom value of the first cell in this zone */ real p1_1; /* The top value of the first cell in this zone */ } gmx_ddzone_t; +typedef struct +{ + gmx_domdec_ind_t ind; + int *ibuf; + int ibuf_nalloc; + vec_rvec_t vbuf; + int nsend; + int nat; + int nsend_zone; +} dd_comm_setup_work_t; + typedef struct gmx_domdec_comm { /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing), @@ -215,6 +233,9 @@ typedef struct gmx_domdec_comm int nstSortCG; gmx_domdec_sort_t *sort; + /* Are there charge groups? */ + gmx_bool bCGs; + /* Are there bonded and multi-body interactions between charge groups? */ gmx_bool bInterCGBondeds; gmx_bool bInterCGMultiBody; @@ -280,14 +301,22 @@ typedef struct gmx_domdec_comm /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */ int nat[ddnatNR]; + + /* Array for signalling if atoms have moved to another domain */ + int *moved; + int moved_nalloc; /* Communication buffer for general use */ int *buf_int; int nalloc_int; - /* Communication buffer for general use */ + /* Communication buffer for general use */ vec_rvec_t vbuf; - + + /* Temporary storage for thread parallel communication setup */ + int nth; + dd_comm_setup_work_t *dth; + /* Communication buffers only used with multiple grid pulses */ int *buf_int2; int nalloc_int2; @@ -342,7 +371,7 @@ typedef struct gmx_domdec_comm double load_pme; /* The last partition step */ - gmx_large_int_t globalcomm_step; + gmx_large_int_t partition_step; /* Debugging */ int nstDDDump; @@ -944,44 +973,59 @@ static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone) zone->p1_0,zone->p1_1); } + +#define DDZONECOMM_MAXZONE 5 +#define DDZONECOMM_BUFSIZE 3 + static void dd_sendrecv_ddzone(const gmx_domdec_t *dd, int ddimind,int direction, gmx_ddzone_t *buf_s,int n_s, gmx_ddzone_t *buf_r,int n_r) { - rvec vbuf_s[5*2],vbuf_r[5*2]; +#define ZBS DDZONECOMM_BUFSIZE + rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS]; + rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS]; int i; for(i=0; izone_d1[0] : &comm->zone_d2[0][0]; zp->min0 = cell_ns_x0[dim]; zp->max1 = cell_ns_x1[dim]; + zp->min1 = cell_ns_x1[dim]; zp->mch0 = cell_ns_x0[dim]; zp->mch1 = cell_ns_x1[dim]; zp->p1_0 = cell_ns_x0[dim]; @@ -1010,7 +1055,7 @@ static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox, /* Use an rvec to store two reals */ extr_s[d][0] = comm->cell_f0[d+1]; extr_s[d][1] = comm->cell_f1[d+1]; - extr_s[d][2] = 0; + extr_s[d][2] = comm->cell_f1[d+1]; pos = 0; /* Store the extremes in the backward sending buffer, @@ -1021,6 +1066,7 @@ static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox, /* We invert the order to be able to use the same loop for buf_e */ buf_s[pos].min0 = extr_s[d1][1]; buf_s[pos].max1 = extr_s[d1][0]; + buf_s[pos].min1 = extr_s[d1][2]; buf_s[pos].mch0 = 0; buf_s[pos].mch1 = 0; /* Store the cell corner of the dimension we communicate along */ @@ -1073,6 +1119,7 @@ static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox, { extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]); extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]); + extr_s[d1][2] = min(extr_s[d1][2],extr_r[d1][2]); } } } @@ -1135,6 +1182,7 @@ static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox, { buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0); buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1); + buf_e[i].min1 = min(buf_e[i].min1,buf_r[i].min1); } if (dd->ndim == 3 && d == 0 && i == buf_size - 1) @@ -1166,6 +1214,7 @@ static void 
dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox, { extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0); extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1); + extr_s[d1][2] = min(extr_s[d1][2],buf_e[pos].min1); pos++; } @@ -1543,17 +1592,6 @@ void dd_collect_state(gmx_domdec_t *dd, } } -static void dd_realloc_fr_cg(t_forcerec *fr,int nalloc) -{ - if (debug) - { - fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc)); - } - fr->cg_nalloc = over_alloc_dd(nalloc); - srenew(fr->cg_cm,fr->cg_nalloc); - srenew(fr->cginfo,fr->cg_nalloc); -} - static void dd_realloc_state(t_state *state,rvec **f,int nalloc) { int est; @@ -1602,6 +1640,31 @@ static void dd_realloc_state(t_state *state,rvec **f,int nalloc) } } +static void dd_check_alloc_ncg(t_forcerec *fr,t_state *state,rvec **f, + int nalloc) +{ + if (nalloc > fr->cg_nalloc) + { + if (debug) + { + fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc)); + } + fr->cg_nalloc = over_alloc_dd(nalloc); + srenew(fr->cginfo,fr->cg_nalloc); + if (fr->cutoff_scheme == ecutsGROUP) + { + srenew(fr->cg_cm,fr->cg_nalloc); + } + } + if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc) + { + /* We don't use charge groups, we use x in state to set up + * the atom communication. + */ + dd_realloc_state(state,f,nalloc); + } +} + static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs, rvec *v,rvec *lv) { @@ -2334,7 +2397,8 @@ static void set_zones_ncg_home(gmx_domdec_t *dd) } } -static void rebuild_cgindex(gmx_domdec_t *dd,int *gcgs_index,t_state *state) +static void rebuild_cgindex(gmx_domdec_t *dd, + const int *gcgs_index,t_state *state) { int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl; @@ -2395,12 +2459,14 @@ static void dd_set_cginfo(int *index_gl,int cg0,int cg1, } } -static void make_dd_indices(gmx_domdec_t *dd,int *gcgs_index,int cg_start) +static void make_dd_indices(gmx_domdec_t *dd, + const int *gcgs_index,int cg_start) { - int nzone,zone,zone1,cg0,cg,cg_gl,a,a_gl; + int nzone,zone,zone1,cg0,cg1,cg1_p1,cg,cg_gl,a,a_gl; int *zone2cg,*zone_ncg1,*index_gl,*gatindex; gmx_ga2la_t *ga2la; char *bLocalCG; + gmx_bool bCGs; bLocalCG = dd->comm->bLocalCG; @@ -2415,6 +2481,7 @@ static void make_dd_indices(gmx_domdec_t *dd,int *gcgs_index,int cg_start) zone_ncg1 = dd->comm->zone_ncg1; index_gl = dd->index_gl; gatindex = dd->gatindex; + bCGs = dd->comm->bCGs; if (zone2cg[1] != dd->ncg_home) { @@ -2433,19 +2500,31 @@ static void make_dd_indices(gmx_domdec_t *dd,int *gcgs_index,int cg_start) { cg0 = zone2cg[zone]; } - for(cg=cg0; cg= zone_ncg1[zone]) + if (cg >= cg1_p1) { - /* Signal that this cg is from more than one zone away */ + /* Signal that this cg is from more than one pulse away */ zone1 += nzone; } cg_gl = index_gl[cg]; - for(a_gl=gcgs_index[cg_gl]; a_glga2la,a_gl,a,zone1); + a++; + } + } + else { - gatindex[a] = a_gl; - ga2la_set(dd->ga2la,a_gl,a,zone1); + gatindex[a] = cg_gl; + ga2la_set(dd->ga2la,cg_gl,a,zone1); a++; } } @@ -2598,7 +2677,8 @@ static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start) } } -static real grid_jump_limit(gmx_domdec_comm_t *comm,int dim_ind) +static real grid_jump_limit(gmx_domdec_comm_t *comm,real cutoff, + int dim_ind) { real grid_jump_limit; @@ -2612,24 +2692,31 @@ static real grid_jump_limit(gmx_domdec_comm_t *comm,int dim_ind) if (!comm->bVacDLBNoLimit) { grid_jump_limit = max(grid_jump_limit, - comm->cutoff/comm->cd[dim_ind].np); + 
cutoff/comm->cd[dim_ind].np); } return grid_jump_limit; } -static void check_grid_jump(gmx_large_int_t step,gmx_domdec_t *dd,gmx_ddbox_t *ddbox) +static gmx_bool check_grid_jump(gmx_large_int_t step, + gmx_domdec_t *dd, + real cutoff, + gmx_ddbox_t *ddbox, + gmx_bool bFatal) { gmx_domdec_comm_t *comm; int d,dim; real limit,bfac; - + gmx_bool bInvalid; + + bInvalid = FALSE; + comm = dd->comm; for(d=1; d<dd->ndim; d++) { dim = dd->dim[d]; - limit = grid_jump_limit(comm,d); + limit = grid_jump_limit(comm,cutoff,d); bfac = ddbox->box_size[dim]; if (ddbox->tric_dir[dim]) { @@ -2638,12 +2725,23 @@ static void check_grid_jump(gmx_large_int_t step,gmx_domdec_t *dd,gmx_ddbox_t *d if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac < limit || (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit) { - char buf[22]; - gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d\n", - gmx_step_str(step,buf), - dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]); + bInvalid = TRUE; + + if (bFatal) + { + char buf[22]; + + /* This error should never be triggered under normal + * circumstances, but you never know ... + */ + gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer nodes might avoid this issue.", + gmx_step_str(step,buf), + dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]); + } } } + + return bInvalid; } static int dd_load_count(gmx_domdec_comm_t *comm) @@ -3255,8 +3353,8 @@ static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd, cellsize_limit_f = comm->cellsize_min[dim]/ddbox->box_size[dim]; cellsize_limit_f *= DD_CELL_MARGIN; - dist_min_f_hard = grid_jump_limit(comm,d)/ddbox->box_size[dim]; - dist_min_f = dist_min_f_hard * DD_CELL_MARGIN; + dist_min_f_hard = grid_jump_limit(comm,comm->cutoff,d)/ddbox->box_size[dim]; + dist_min_f = dist_min_f_hard * DD_CELL_MARGIN; if (ddbox->tric_dir[dim]) { cellsize_limit_f /= ddbox->skew_fac[dim]; @@ -3609,7 +3707,7 @@ static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd, dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1); if (dd->bGridJump && dd->ndim > 1) { - check_grid_jump(step,dd,ddbox); + check_grid_jump(step,dd,dd->comm->cutoff,ddbox,TRUE); } } } @@ -4070,8 +4168,8 @@ static void clear_and_mark_ind(int ncg,int *move, bLocalCG[index_gl[cg]] = FALSE; } /* Signal that this cg has moved using the ns cell index. - * Here we set it to -1. - * fill_grid will change it from -1 to 4*grid->ncells. + * Here we set it to -1. fill_grid will change it + * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
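grid_jump_limit() now takes the cut-off explicitly, so callers can check a tentative cut-off as well as the current one: the allowed boundary shift is the cell-size limit or, with multiple communication pulses, the distance each pulse must cover, cutoff/np, whichever is larger. A small numeric illustration (the values are made up):

    #include <stdio.h>

    static double max_d(double a, double b)
    {
        return (a > b) ? a : b;
    }

    int main(void)
    {
        double cellsize_limit = 0.9; /* nm, minimum cell size          */
        double cutoff         = 1.2; /* nm, pair-interaction cut-off   */
        int    np             = 2;   /* communication pulses along dim */
        double limit;

        /* Each pulse must cover cutoff/np, so a zone boundary may not
         * jump by more than the larger of the two length scales. */
        limit = max_d(cellsize_limit, cutoff/np);
        printf("grid jump limit: %.2f nm\n", limit);
        return 0;
    }
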
*/ cell_index[cg] = -1; } @@ -4173,121 +4271,41 @@ static void rotate_state_atom(t_state *state,int a) } } -static int dd_redistribute_cg(FILE *fplog,gmx_large_int_t step, - gmx_domdec_t *dd,ivec tric_dir, - t_state *state,rvec **f, - t_forcerec *fr,t_mdatoms *md, - gmx_bool bCompact, - t_nrnb *nrnb) +static int *get_moved(gmx_domdec_comm_t *comm,int natoms) +{ + if (natoms > comm->moved_nalloc) + { + /* Contents should be preserved here */ + comm->moved_nalloc = over_alloc_dd(natoms); + srenew(comm->moved,comm->moved_nalloc); + } + + return comm->moved; +} + +static void calc_cg_move(FILE *fplog,gmx_large_int_t step, + gmx_domdec_t *dd, + t_state *state, + ivec tric_dir,matrix tcm, + rvec cell_x0,rvec cell_x1, + rvec limitd,rvec limit0,rvec limit1, + const int *cgindex, + int cg_start,int cg_end, + rvec *cg_cm, + int *move) { - int *move; int npbcdim; - int ncg[DIM*2],nat[DIM*2]; int c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d; int mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec; - int sbuf[2],rbuf[2]; - int home_pos_cg,home_pos_at,ncg_stay_home,buf_pos; int flag; - gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE; gmx_bool bScrew; ivec dev; real inv_ncg,pos_d; - matrix tcm; - rvec *cg_cm,cell_x0,cell_x1,limitd,limit0,limit1,cm_new; - atom_id *cgindex; - cginfo_mb_t *cginfo_mb; - gmx_domdec_comm_t *comm; - - if (dd->bScrewPBC) - { - check_screw_box(state->box); - } - - comm = dd->comm; - cg_cm = fr->cg_cm; - - for(i=0; iflags & (1<flags & (1<flags & (1<ncg_tot > comm->nalloc_int) - { - comm->nalloc_int = over_alloc_dd(dd->ncg_tot); - srenew(comm->buf_int,comm->nalloc_int); - } - move = comm->buf_int; - - /* Clear the count */ - for(c=0; cndim*2; c++) - { - ncg[c] = 0; - nat[c] = 0; - } + rvec cm_new; npbcdim = dd->npbcdim; - for(d=0; (dcomm->cellsize_min[d]; - if (d >= npbcdim && dd->ci[d] == 0) - { - cell_x0[d] = -GMX_FLOAT_MAX; - } - else - { - cell_x0[d] = comm->cell_x0[d]; - } - if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1) - { - cell_x1[d] = GMX_FLOAT_MAX; - } - else - { - cell_x1[d] = comm->cell_x1[d]; - } - if (d < npbcdim) - { - limit0[d] = comm->old_cell_x0[d] - limitd[d]; - limit1[d] = comm->old_cell_x1[d] + limitd[d]; - } - else - { - /* We check after communication if a charge group moved - * more than one cell. Set the pre-comm check limit to float_max. - */ - limit0[d] = -GMX_FLOAT_MAX; - limit1[d] = GMX_FLOAT_MAX; - } - } - - make_tric_corr_matrix(npbcdim,state->box,tcm); - - cgindex = dd->cgindex; - - /* Compute the center of geometry for all home charge groups - * and put them in the box and determine where they should go. - */ - for(cg=0; cgncg_home; cg++) + for(cg=cg_start; cg= 0) - { - if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc]) - { - comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1); - srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS); - } - comm->cggl_flag[mc][ncg[mc]*DD_CGIBS ] = dd->index_gl[cg]; - /* We store the cg size in the lower 16 bits - * and the place where the charge group should go - * in the next 6 bits. This saves some communication volume. 
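get_moved() above follows the usual GROMACS growth idiom: reallocate only when the requested count exceeds the cached capacity, and then over-allocate so repeated small increases do not trigger a realloc every step. A generic sketch of the idiom (the real over_alloc_dd() growth factor is an internal constant; the 1.2 used here is illustrative):

    #include <stdlib.h>

    typedef struct {
        int *moved;
        int  moved_nalloc;
    } comm_t;

    /* Grow-on-demand with ~20% headroom */
    static int *get_moved(comm_t *comm, int natoms)
    {
        if (natoms > comm->moved_nalloc)
        {
            comm->moved_nalloc = (int)(1.2*natoms) + 100;
            comm->moved = realloc(comm->moved,
                                  comm->moved_nalloc*sizeof(*comm->moved));
        }
        return comm->moved;
    }

    int main(void)
    {
        comm_t comm = { NULL, 0 };
        int *moved = get_moved(&comm, 1000); /* allocates ~1300 ints */

        moved = get_moved(&comm, 1100);      /* fits, no realloc */
        free(comm.moved);
        return (moved != NULL) ? 0 : 1;
    }
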
- */ - comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag; - ncg[mc] += 1; - nat[mc] += nrcg; - } + /* Temporarily store the flag in move */ + move[cg] = mc + flag; } +} + +static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step, + gmx_domdec_t *dd,ivec tric_dir, + t_state *state,rvec **f, + t_forcerec *fr,t_mdatoms *md, + gmx_bool bCompact, + t_nrnb *nrnb, + int *ncg_stay_home, + int *ncg_moved) +{ + int *move; + int npbcdim; + int ncg[DIM*2],nat[DIM*2]; + int c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d; + int mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec; + int sbuf[2],rbuf[2]; + int home_pos_cg,home_pos_at,buf_pos; + int flag; + gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE; + gmx_bool bScrew; + ivec dev; + real inv_ncg,pos_d; + matrix tcm; + rvec *cg_cm=NULL,cell_x0,cell_x1,limitd,limit0,limit1,cm_new; + atom_id *cgindex; + cginfo_mb_t *cginfo_mb; + gmx_domdec_comm_t *comm; + int *moved; + int nthread,thread; - inc_nrnb(nrnb,eNR_CGCM,dd->nat_home); - inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home); - - nvec = 1; - if (bV) - { - nvec++; - } - if (bSDX) + if (dd->bScrewPBC) { - nvec++; + check_screw_box(state->box); } - if (bCGP) + + comm = dd->comm; + if (fr->cutoff_scheme == ecutsGROUP) { - nvec++; + cg_cm = fr->cg_cm; } - /* Make sure the communication buffers are large enough */ - for(mc=0; mcndim*2; mc++) + for(i=0; i comm->cgcm_state_nalloc[mc]) + if (EST_DISTR(i)) { - comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr); - srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]); + switch (i) + { + case estX: /* Always present */ break; + case estV: bV = (state->flags & (1<flags & (1<flags & (1<ncg_home,move,cgindex, - nvec,cg_cm,comm,bCompact); - - vec = 0; - home_pos_at = - compact_and_copy_vec_at(dd->ncg_home,move,cgindex, - nvec,vec++,state->x,comm,bCompact); - if (bV) - { - compact_and_copy_vec_at(dd->ncg_home,move,cgindex, - nvec,vec++,state->v,comm,bCompact); - } - if (bSDX) - { - compact_and_copy_vec_at(dd->ncg_home,move,cgindex, - nvec,vec++,state->sd_X,comm,bCompact); - } - if (bCGP) + if (dd->ncg_tot > comm->nalloc_int) { - compact_and_copy_vec_at(dd->ncg_home,move,cgindex, - nvec,vec++,state->cg_p,comm,bCompact); + comm->nalloc_int = over_alloc_dd(dd->ncg_tot); + srenew(comm->buf_int,comm->nalloc_int); } + move = comm->buf_int; - if (bCompact) + /* Clear the count */ + for(c=0; cndim*2; c++) { - compact_ind(dd->ncg_home,move, - dd->index_gl,dd->cgindex,dd->gatindex, - dd->ga2la,comm->bLocalCG, - fr->cginfo); + ncg[c] = 0; + nat[c] = 0; + } + + npbcdim = dd->npbcdim; + + for(d=0; (dcomm->cellsize_min[d]; + if (d >= npbcdim && dd->ci[d] == 0) + { + cell_x0[d] = -GMX_FLOAT_MAX; + } + else + { + cell_x0[d] = comm->cell_x0[d]; + } + if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1) + { + cell_x1[d] = GMX_FLOAT_MAX; + } + else + { + cell_x1[d] = comm->cell_x1[d]; + } + if (d < npbcdim) + { + limit0[d] = comm->old_cell_x0[d] - limitd[d]; + limit1[d] = comm->old_cell_x1[d] + limitd[d]; + } + else + { + /* We check after communication if a charge group moved + * more than one cell. Set the pre-comm check limit to float_max. + */ + limit0[d] = -GMX_FLOAT_MAX; + limit1[d] = GMX_FLOAT_MAX; + } + } + + make_tric_corr_matrix(npbcdim,state->box,tcm); + + cgindex = dd->cgindex; + + nthread = gmx_omp_nthreads_get(emntDomdec); + + /* Compute the center of geometry for all home charge groups + * and put them in the box and determine where they should go. 
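For multi-atom charge groups the move decision is based on the center of geometry, the plain average of the member coordinates (the surrounding code keeps inv_ncg = 1/nrcg for this); single-atom groups in the Verlet path skip the averaging. A minimal version of the reduction:

    #include <stdio.h>

    typedef double rvec[3];

    int main(void)
    {
        rvec x[] = { { 0.0, 0.0, 0.0 },
                     { 0.2, 0.0, 0.0 },
                     { 0.1, 0.3, 0.0 } };
        int nrcg = 3, k, d;
        double inv_ncg = 1.0/nrcg;
        rvec cm_new = { 0, 0, 0 };

        for (k = 0; k < nrcg; k++)
        {
            for (d = 0; d < 3; d++)
            {
                cm_new[d] += x[k][d];
            }
        }
        for (d = 0; d < 3; d++)
        {
            cm_new[d] *= inv_ncg; /* center of geometry, not of mass */
        }
        printf("cog = (%g, %g, %g)\n", cm_new[0], cm_new[1], cm_new[2]);
        return 0;
    }
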
+ */ +#pragma omp parallel for num_threads(nthread) schedule(static) + for(thread=0; thread<nthread; thread++) + { + calc_cg_move(fplog,step,dd,state,tric_dir,tcm, + cell_x0,cell_x1,limitd,limit0,limit1, + cgindex, + ( thread *dd->ncg_home)/nthread, + ((thread+1)*dd->ncg_home)/nthread, + fr->cutoff_scheme==ecutsGROUP ? cg_cm : state->x, + move); + } + + for(cg=0; cg<dd->ncg_home; cg++) + { + if (move[cg] >= 0) + { + mc = move[cg]; + flag = mc & ~DD_FLAG_NRCG; + mc = mc & DD_FLAG_NRCG; + move[cg] = mc; + + if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc]) + { + comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1); + srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS); + } + comm->cggl_flag[mc][ncg[mc]*DD_CGIBS ] = dd->index_gl[cg]; + /* We store the cg size in the lower 16 bits + * and the place where the charge group should go + * in the next 6 bits. This saves some communication volume. + */ + nrcg = cgindex[cg+1] - cgindex[cg]; + comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag; + ncg[mc] += 1; + nat[mc] += nrcg; + } + } + + inc_nrnb(nrnb,eNR_CGCM,dd->nat_home); + inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home); + + *ncg_moved = 0; + for(i=0; i<dd->ndim*2; i++) + { + *ncg_moved += ncg[i]; + } + + nvec = 1; + if (bV) + { + nvec++; + } + if (bSDX) + { + nvec++; + } + if (bCGP) + { + nvec++; + } + + /* Make sure the communication buffers are large enough */ + for(mc=0; mc<dd->ndim*2; mc++) + { + nvr = ncg[mc] + nat[mc]*nvec; + if (nvr > comm->cgcm_state_nalloc[mc]) + { + comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr); + srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]); + } + } + + switch (fr->cutoff_scheme) + { + case ecutsGROUP: + /* Recalculating cg_cm might be cheaper than communicating, + * but that could give rise to rounding issues. + */ + home_pos_cg = + compact_and_copy_vec_cg(dd->ncg_home,move,cgindex, + nvec,cg_cm,comm,bCompact); + break; + case ecutsVERLET: + /* Without charge groups we send the moved atom coordinates + * over twice. This is so the code below can be used without + * many conditionals for both with and without charge groups.
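As the comment above says, the communication buffer packs two values into one int: the charge-group size in the low 16 bits and the destination information in the bits above them, so one integer per group carries both. Decoding is a mask and a shift; the 0xFFFF mask below stands in for the internal DD_FLAG_NRCG constant under the layout described:

    #include <assert.h>
    #include <stdio.h>

    #define NRCG_MASK 0xFFFF /* low 16 bits: group size (illustrative) */

    int main(void)
    {
        int nrcg = 3;        /* atoms in this charge group */
        int dest = 5;        /* where the group should go  */
        int packed, flag;

        packed = nrcg | (dest << 16);      /* one int instead of two */

        flag = packed & ~NRCG_MASK;        /* the non-size bits */
        assert((packed & NRCG_MASK) == 3); /* size decodes back */
        assert((flag >> 16) == 5);         /* destination decodes back */
        printf("packed=0x%x nrcg=%d dest=%d\n",
               (unsigned)packed, packed & NRCG_MASK, flag >> 16);
        return 0;
    }
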
+ */ + home_pos_cg = + compact_and_copy_vec_cg(dd->ncg_home,move,cgindex, + nvec,state->x,comm,FALSE); + if (bCompact) + { + home_pos_cg -= *ncg_moved; + } + break; + default: + gmx_incons("unimplemented"); + home_pos_cg = 0; + } + + vec = 0; + home_pos_at = + compact_and_copy_vec_at(dd->ncg_home,move,cgindex, + nvec,vec++,state->x,comm,bCompact); + if (bV) + { + compact_and_copy_vec_at(dd->ncg_home,move,cgindex, + nvec,vec++,state->v,comm,bCompact); + } + if (bSDX) + { + compact_and_copy_vec_at(dd->ncg_home,move,cgindex, + nvec,vec++,state->sd_X,comm,bCompact); + } + if (bCGP) + { + compact_and_copy_vec_at(dd->ncg_home,move,cgindex, + nvec,vec++,state->cg_p,comm,bCompact); + } + + if (bCompact) + { + compact_ind(dd->ncg_home,move, + dd->index_gl,dd->cgindex,dd->gatindex, + dd->ga2la,comm->bLocalCG, + fr->cginfo); } else { + if (fr->cutoff_scheme == ecutsVERLET) + { + moved = get_moved(comm,dd->ncg_home); + + for(k=0; kncg_home; k++) + { + moved[k] = 0; + } + } + else + { + moved = fr->ns.grid->cell_index; + } + clear_and_mark_ind(dd->ncg_home,move, dd->index_gl,dd->cgindex,dd->gatindex, dd->ga2la,comm->bLocalCG, - fr->ns.grid->cell_index); + moved); } cginfo_mb = fr->cginfo_mb; - ncg_stay_home = home_pos_cg; + *ncg_stay_home = home_pos_cg; for(d=0; dndim; d++) { dim = dd->dim[d]; @@ -4681,12 +4887,14 @@ static int dd_redistribute_cg(FILE *fplog,gmx_large_int_t step, dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS]; dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg; /* Copy the state from the buffer */ - if (home_pos_cg >= fr->cg_nalloc) + dd_check_alloc_ncg(fr,state,f,home_pos_cg+1); + if (fr->cutoff_scheme == ecutsGROUP) { - dd_realloc_fr_cg(fr,home_pos_cg+1); cg_cm = fr->cg_cm; + copy_rvec(comm->vbuf.v[buf_pos],cg_cm[home_pos_cg]); } - copy_rvec(comm->vbuf.v[buf_pos++],cg_cm[home_pos_cg]); + buf_pos++; + /* Set the cginfo */ fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb, dd->index_gl[home_pos_cg]); @@ -4763,15 +4971,25 @@ static int dd_redistribute_cg(FILE *fplog,gmx_large_int_t step, * and ncg_home and nat_home are not the real count, since there are * "holes" in the arrays for the charge groups that moved to neighbors. 
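 *
 * [Editor's sketch -- not part of the original patch] What the
 * compact_and_copy_vec_* calls above amount to, in a simplified form
 * where move[cg] < 0 marks a cg that stays home:
 *
 *     int home = 0;
 *     for (cg = 0; cg < ncg; cg++)
 *     {
 *         if (move[cg] < 0)
 *         {
 *             copy_rvec(v[cg], v[home++]);  // slide survivors down
 *         }
 *         // moved cgs were already appended to the send buffers
 *     }
 *
 * With bCompact the holes are closed immediately as above; otherwise they
 * are only marked in moved[] and cleaned up by the later sorting step.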
     */
+    if (fr->cutoff_scheme == ecutsVERLET)
+    {
+        moved = get_moved(comm,home_pos_cg);
+
+        for(i=dd->ncg_home; i<home_pos_cg; i++)
+        {
+            moved[i] = 0;
+        }
+    }
+
     dd->ncg_home = home_pos_cg;
     dd->nat_home = home_pos_at;

     if (debug)
     {
-        fprintf(debug,"Finished repartitioning\n");
+        fprintf(debug,
+                "Finished repartitioning: cgs moved out %d, new home %d\n",
+                *ncg_moved,dd->ncg_home-*ncg_moved);
+    }
-
-    return ncg_stay_home;
 }

 void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
@@ -5154,9 +5372,16 @@ static float dd_f_imbal(gmx_domdec_t *dd)
     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
 }

-static float dd_pme_f_ratio(gmx_domdec_t *dd)
+float dd_pme_f_ratio(gmx_domdec_t *dd)
 {
-    return dd->comm->load[0].pme/dd->comm->load[0].mdf;
+    if (dd->comm->cycl_n[ddCyclPME] > 0)
+    {
+        return dd->comm->load[0].pme/dd->comm->load[0].mdf;
+    }
+    else
+    {
+        return -1.0;
+    }
 }

 static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
@@ -6264,6 +6489,8 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
             fprintf(fplog,"Will not sort the charge groups\n");
         }
     }
+
+    comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);

     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
     if (comm->bInterCGBondeds)
@@ -6275,7 +6502,8 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
         comm->bInterCGMultiBody = FALSE;
     }

-    dd->bInterCGcons = inter_charge_group_constraints(mtop);
+    dd->bInterCGcons    = inter_charge_group_constraints(mtop);
+    dd->bInterCGsettles = inter_charge_group_settles(mtop);

     if (ir->rlistlong == 0)
     {
@@ -6583,7 +6811,7 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
         check_dd_restrictions(cr,dd,ir,fplog);
     }

-    comm->globalcomm_step = INT_MIN;
+    comm->partition_step = INT_MIN;
     dd->ddp_count = 0;

     clear_dd_cycle_counts(dd);
@@ -6831,19 +7059,142 @@ static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
     fflush(fplog);
 }

+static void set_cell_limits_dlb(gmx_domdec_t *dd,
+                                real dlb_scale,
+                                const t_inputrec *ir,
+                                const gmx_ddbox_t *ddbox)
+{
+    gmx_domdec_comm_t *comm;
+    int  d,dim,npulse,npulse_d_max,npulse_d;
+    gmx_bool bNoCutOff;
+
+    comm = dd->comm;
+
+    bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
+
+    /* Determine the maximum number of comm. pulses in one dimension */
+
+    comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
+
+    /* Determine the maximum required number of grid pulses */
+    if (comm->cellsize_limit >= comm->cutoff)
+    {
+        /* Only a single pulse is required */
+        npulse = 1;
+    }
+    else if (!bNoCutOff && comm->cellsize_limit > 0)
+    {
+        /* We round down slightly here to avoid overhead due to the latency
+         * of extra communication calls when the cut-off
+         * would be only slightly longer than the cell size.
+         * Later cellsize_limit is redetermined,
+         * so we can not miss interactions due to this rounding.
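 *
 * [Editor's note -- not part of the original patch] Numerically, the
 * (int)(0.96 + cutoff/cellsize_limit) expression below acts as a ceiling
 * that forgives up to a 4% overshoot:
 *
 *     cutoff/cellsize_limit = 1.03  ->  (int)1.99 = 1 pulse
 *     cutoff/cellsize_limit = 1.05  ->  (int)2.01 = 2 pulses
 *     cutoff/cellsize_limit = 1.90  ->  (int)2.86 = 2 pulses
 *
 * so a cut-off at most about 4% longer than the cell size still uses a
 * single pulse, avoiding an extra communication call for a marginal gain.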
+ */ + npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit); + } + else + { + /* There is no cell size limit */ + npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1)); + } + + if (!bNoCutOff && npulse > 1) + { + /* See if we can do with less pulses, based on dlb_scale */ + npulse_d_max = 0; + for(d=0; dndim; d++) + { + dim = dd->dim[d]; + npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff + /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale)); + npulse_d_max = max(npulse_d_max,npulse_d); + } + npulse = min(npulse,npulse_d_max); + } + + /* This env var can override npulse */ + d = dd_nst_env(debug,"GMX_DD_NPULSE",0); + if (d > 0) + { + npulse = d; + } + + comm->maxpulse = 1; + comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE); + for(d=0; dndim; d++) + { + comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1); + comm->cd[d].np_nalloc = comm->cd[d].np_dlb; + snew(comm->cd[d].ind,comm->cd[d].np_nalloc); + comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb); + if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1) + { + comm->bVacDLBNoLimit = FALSE; + } + } + + /* cellsize_limit is set for LINCS in init_domain_decomposition */ + if (!comm->bVacDLBNoLimit) + { + comm->cellsize_limit = max(comm->cellsize_limit, + comm->cutoff/comm->maxpulse); + } + comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody); + /* Set the minimum cell size for each DD dimension */ + for(d=0; dndim; d++) + { + if (comm->bVacDLBNoLimit || + comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff) + { + comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit; + } + else + { + comm->cellsize_min_dlb[dd->dim[d]] = + comm->cutoff/comm->cd[d].np_dlb; + } + } + if (comm->cutoff_mbody <= 0) + { + comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit); + } + if (comm->bDynLoadBal) + { + set_dlb_limits(dd); + } +} + +gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd,int ePBC) +{ + /* If each molecule is a single charge group + * or we use domain decomposition for each periodic dimension, + * we do not need to take pbc into account for the bonded interactions. + */ + return (ePBC != epbcNONE && dd->comm->bInterCGBondeds && + !(dd->nc[XX]>1 && + dd->nc[YY]>1 && + (dd->nc[ZZ]>1 || ePBC==epbcXY))); +} + void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale, t_inputrec *ir,t_forcerec *fr, gmx_ddbox_t *ddbox) { gmx_domdec_comm_t *comm; - int d,dim,npulse,npulse_d_max,npulse_d; - gmx_bool bNoCutOff; int natoms_tot; real vol_frac; comm = dd->comm; - bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0); + /* Initialize the thread data. + * This can not be done in init_domain_decomposition, + * as the numbers of threads is determined later. + */ + comm->nth = gmx_omp_nthreads_get(emntDomdec); + if (comm->nth > 1) + { + snew(comm->dth,comm->nth); + } if (EEL_PME(ir->coulombtype)) { @@ -6862,20 +7213,6 @@ void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale, "Can not have separate PME nodes without PME electrostatics"); } } - - /* If each molecule is a single charge group - * or we use domain decomposition for each periodic dimension, - * we do not need to take pbc into account for the bonded interactions. - */ - if (fr->ePBC == epbcNONE || !comm->bInterCGBondeds || - (dd->nc[XX]>1 && dd->nc[YY]>1 && (dd->nc[ZZ]>1 || fr->ePBC==epbcXY))) - { - fr->bMolPBC = FALSE; - } - else - { - fr->bMolPBC = TRUE; - } if (debug) { @@ -6883,107 +7220,18 @@ void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale, } if (comm->eDLB != edlbNO) { - /* Determine the maximum number of comm. 
pulses in one dimension */ - - comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody); - - /* Determine the maximum required number of grid pulses */ - if (comm->cellsize_limit >= comm->cutoff) - { - /* Only a single pulse is required */ - npulse = 1; - } - else if (!bNoCutOff && comm->cellsize_limit > 0) - { - /* We round down slightly here to avoid overhead due to the latency - * of extra communication calls when the cut-off - * would be only slightly longer than the cell size. - * Later cellsize_limit is redetermined, - * so we can not miss interactions due to this rounding. - */ - npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit); - } - else + set_cell_limits_dlb(dd,dlb_scale,ir,ddbox); + } + + print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox); + if (comm->eDLB == edlbAUTO) + { + if (fplog) { - /* There is no cell size limit */ - npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1)); + fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n"); } - - if (!bNoCutOff && npulse > 1) - { - /* See if we can do with less pulses, based on dlb_scale */ - npulse_d_max = 0; - for(d=0; dndim; d++) - { - dim = dd->dim[d]; - npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff - /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale)); - npulse_d_max = max(npulse_d_max,npulse_d); - } - npulse = min(npulse,npulse_d_max); - } - - /* This env var can override npulse */ - d = dd_nst_env(fplog,"GMX_DD_NPULSE",0); - if (d > 0) - { - npulse = d; - } - - comm->maxpulse = 1; - comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE); - for(d=0; dndim; d++) - { - comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1); - comm->cd[d].np_nalloc = comm->cd[d].np_dlb; - snew(comm->cd[d].ind,comm->cd[d].np_nalloc); - comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb); - if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1) - { - comm->bVacDLBNoLimit = FALSE; - } - } - - /* cellsize_limit is set for LINCS in init_domain_decomposition */ - if (!comm->bVacDLBNoLimit) - { - comm->cellsize_limit = max(comm->cellsize_limit, - comm->cutoff/comm->maxpulse); - } - comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody); - /* Set the minimum cell size for each DD dimension */ - for(d=0; dndim; d++) - { - if (comm->bVacDLBNoLimit || - comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff) - { - comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit; - } - else - { - comm->cellsize_min_dlb[dd->dim[d]] = - comm->cutoff/comm->cd[d].np_dlb; - } - } - if (comm->cutoff_mbody <= 0) - { - comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit); - } - if (comm->bDynLoadBal) - { - set_dlb_limits(dd); - } - } - - print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox); - if (comm->eDLB == edlbAUTO) - { - if (fplog) - { - fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n"); - } - print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox); - } + print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox); + } if (ir->ePBC == epbcNONE) { @@ -7003,6 +7251,73 @@ void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale, dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot); } +gmx_bool change_dd_cutoff(t_commrec *cr,t_state *state,t_inputrec *ir, + real cutoff_req) +{ + gmx_domdec_t *dd; + gmx_ddbox_t ddbox; + int d,dim,np; + real inv_cell_size; + int LocallyLimited; + + dd = cr->dd; + + set_ddbox(dd,FALSE,cr,ir,state->box, + TRUE,&dd->comm->cgs_gl,state->x,&ddbox); + + LocallyLimited = 0; + + for(d=0; dndim; d++) + { + dim 
= dd->dim[d]; + + inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim]; + if (dynamic_dd_box(&ddbox,ir)) + { + inv_cell_size *= DD_PRES_SCALE_MARGIN; + } + + np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]); + + if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim && + dd->comm->cd[d].np_dlb > 0) + { + if (np > dd->comm->cd[d].np_dlb) + { + return FALSE; + } + + /* If a current local cell size is smaller than the requested + * cut-off, we could still fix it, but this gets very complicated. + * Without fixing here, we might actually need more checks. + */ + if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req) + { + LocallyLimited = 1; + } + } + } + + if (dd->comm->eDLB != edlbNO) + { + if (check_grid_jump(0,dd,cutoff_req,&ddbox,FALSE)) + { + LocallyLimited = 1; + } + + gmx_sumi(1,&LocallyLimited,cr); + + if (LocallyLimited > 0) + { + return FALSE; + } + } + + dd->comm->cutoff = cutoff_req; + + return TRUE; +} + static void merge_cg_buffers(int ncell, gmx_domdec_comm_dim_t *cd, int pulse, int *ncg_cell, @@ -7122,102 +7437,65 @@ static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG) return bMiss; } -static void setup_dd_communication(gmx_domdec_t *dd, - matrix box,gmx_ddbox_t *ddbox,t_forcerec *fr) -{ - int dim_ind,dim,dim0,dim1=-1,dim2=-1,dimd,p,nat_tot; - int nzone,nzone_send,zone,zonei,cg0,cg1; - int c,i,j,cg,cg_gl,nrcg; - int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i; - gmx_domdec_comm_t *comm; - gmx_domdec_zones_t *zones; - gmx_domdec_comm_dim_t *cd; - gmx_domdec_ind_t *ind; - cginfo_mb_t *cginfo_mb; - gmx_bool bBondComm,bDist2B,bDistMB,bDistMB_pulse,bDistBonded,bScrew; - real r_mb,r_comm2,r_scomm2,r_bcomm2,r,r_0,r_1,r2,rb2,r2inc,inv_ncg,tric_sh; - rvec rb,rn; - real corner[DIM][4],corner_round_0=0,corner_round_1[4]; - real bcorner[DIM],bcorner_round_1=0; - ivec tric_dist; - rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr; - real skew_fac2_d,skew_fac_01; - rvec sf2_round; - int nsend,nat; - - if (debug) - { - fprintf(debug,"Setting up DD communication\n"); - } - - comm = dd->comm; - cg_cm = fr->cg_cm; - - for(dim_ind=0; dim_indndim; dim_ind++) - { - dim = dd->dim[dim_ind]; - - /* Check if we need to use triclinic distances */ - tric_dist[dim_ind] = 0; - for(i=0; i<=dim_ind; i++) - { - if (ddbox->tric_dir[dd->dim[i]]) - { - tric_dist[dim_ind] = 1; - } - } - } +/* Domain corners for communication, a maximum of 4 i-zones see a j domain */ +typedef struct { + real c[DIM][4]; /* the corners for the non-bonded communication */ + real cr0; /* corner for rounding */ + real cr1[4]; /* corners for rounding */ + real bc[DIM]; /* corners for bounded communication */ + real bcr1; /* corner for rounding for bonded communication */ +} dd_corners_t; - bBondComm = comm->bBondComm; +/* Determine the corners of the domain(s) we are communicating with */ +static void +set_dd_corners(const gmx_domdec_t *dd, + int dim0, int dim1, int dim2, + gmx_bool bDistMB, + dd_corners_t *c) +{ + const gmx_domdec_comm_t *comm; + const gmx_domdec_zones_t *zones; + int i,j; - /* Do we need to determine extra distances for multi-body bondeds? */ - bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1); - - /* Do we need to determine extra distances for only two-body bondeds? 
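 *
 * [Editor's note -- not part of the original patch] The pulse-count test in
 * change_dd_cutoff above can be followed with a concrete example: with
 * 4 domains along x in an 8 nm cubic box (skew_fac = 1), inv_cell_size is
 * about 0.5/nm times the DD_CELL_MARGIN safety factor, so a requested
 * cut-off of 2.5 nm gives
 *
 *     np = 1 + (int)(2.5 * 0.5) = 2 pulses,
 *
 * which is accepted only if DLB currently allows np_dlb >= 2 pulses in
 * that dimension; otherwise FALSE is returned and the caller (e.g. the
 * PP/PME load balancing) has to stay below the old cut-off.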
*/ - bDist2B = (bBondComm && !bDistMB); + comm = dd->comm; - r_comm2 = sqr(comm->cutoff); - r_bcomm2 = sqr(comm->cutoff_mbody); + zones = &comm->zones; - if (debug) - { - fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2)); - } + /* Keep the compiler happy */ + c->cr0 = 0; + c->bcr1 = 0; - zones = &comm->zones; - - dim0 = dd->dim[0]; /* The first dimension is equal for all cells */ - corner[0][0] = comm->cell_x0[dim0]; + c->c[0][0] = comm->cell_x0[dim0]; if (bDistMB) { - bcorner[0] = corner[0][0]; + c->bc[0] = c->c[0][0]; } if (dd->ndim >= 2) { dim1 = dd->dim[1]; /* This cell row is only seen from the first row */ - corner[1][0] = comm->cell_x0[dim1]; + c->c[1][0] = comm->cell_x0[dim1]; /* All rows can see this row */ - corner[1][1] = comm->cell_x0[dim1]; + c->c[1][1] = comm->cell_x0[dim1]; if (dd->bGridJump) { - corner[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0); + c->c[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0); if (bDistMB) { /* For the multi-body distance we need the maximum */ - bcorner[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0); + c->bc[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0); } } /* Set the upper-right corner for rounding */ - corner_round_0 = comm->cell_x1[dim0]; + c->cr0 = comm->cell_x1[dim0]; if (dd->ndim >= 3) { dim2 = dd->dim[2]; for(j=0; j<4; j++) { - corner[2][j] = comm->cell_x0[dim2]; + c->c[2][j] = comm->cell_x0[dim2]; } if (dd->bGridJump) { @@ -7228,8 +7506,8 @@ static void setup_dd_communication(gmx_domdec_t *dd, { if (j >= 4) { - corner[2][j-4] = - max(corner[2][j-4], + c->c[2][j-4] = + max(c->c[2][j-4], comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0); } } @@ -7237,13 +7515,12 @@ static void setup_dd_communication(gmx_domdec_t *dd, if (bDistMB) { /* For the multi-body distance we need the maximum */ - bcorner[2] = comm->cell_x0[dim2]; + c->bc[2] = comm->cell_x0[dim2]; for(i=0; i<2; i++) { for(j=0; j<2; j++) { - bcorner[2] = max(bcorner[2], - comm->zone_d2[i][j].p1_0); + c->bc[2] = max(c->bc[2],comm->zone_d2[i][j].p1_0); } } } @@ -7253,21 +7530,369 @@ static void setup_dd_communication(gmx_domdec_t *dd, /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1) * Only cell (0,0,0) can see cell 7 (1,1,1) */ - corner_round_1[0] = comm->cell_x1[dim1]; - corner_round_1[3] = comm->cell_x1[dim1]; + c->cr1[0] = comm->cell_x1[dim1]; + c->cr1[3] = comm->cell_x1[dim1]; if (dd->bGridJump) { - corner_round_1[0] = max(comm->cell_x1[dim1], - comm->zone_d1[1].mch1); + c->cr1[0] = max(comm->cell_x1[dim1],comm->zone_d1[1].mch1); if (bDistMB) { /* For the multi-body distance we need the maximum */ - bcorner_round_1 = max(comm->cell_x1[dim1], - comm->zone_d1[1].p1_1); + c->bcr1 = max(comm->cell_x1[dim1],comm->zone_d1[1].p1_1); + } + } + } + } +} + +/* Determine which cg's we need to send in this pulse from this zone */ +static void +get_zone_pulse_cgs(gmx_domdec_t *dd, + int zonei, int zone, + int cg0, int cg1, + const int *index_gl, + const int *cgindex, + int dim, int dim_ind, + int dim0, int dim1, int dim2, + real r_comm2, real r_bcomm2, + matrix box, + ivec tric_dist, + rvec *normal, + real skew_fac2_d, real skew_fac_01, + rvec *v_d, rvec *v_0, rvec *v_1, + const dd_corners_t *c, + rvec sf2_round, + gmx_bool bDistBonded, + gmx_bool bBondComm, + gmx_bool bDist2B, + gmx_bool bDistMB, + rvec *cg_cm, + int *cginfo, + gmx_domdec_ind_t *ind, + int **ibuf, int *ibuf_nalloc, + vec_rvec_t *vbuf, + int *nsend_ptr, + int *nat_ptr, + int *nsend_z_ptr) +{ + gmx_domdec_comm_t *comm; + gmx_bool bScrew; + gmx_bool bDistMB_pulse; + 
int cg,i; + real r2,rb2,r,tric_sh; + rvec rn,rb; + int dimd; + int nsend_z,nsend,nat; + + comm = dd->comm; + + bScrew = (dd->bScrewPBC && dim == XX); + + bDistMB_pulse = (bDistMB && bDistBonded); + + nsend_z = 0; + nsend = *nsend_ptr; + nat = *nat_ptr; + + for(cg=cg0; cgc[dim_ind][zone]; + if (r > 0) + { + r2 += r*r; + } + if (bDistMB_pulse) + { + r = cg_cm[cg][dim] - c->bc[dim_ind]; + if (r > 0) + { + rb2 += r*r; + } + } + /* Rounding gives at most a 16% reduction + * in communicated atoms + */ + if (dim_ind >= 1 && (zonei == 1 || zonei == 2)) + { + r = cg_cm[cg][dim0] - c->cr0; + /* This is the first dimension, so always r >= 0 */ + r2 += r*r; + if (bDistMB_pulse) + { + rb2 += r*r; + } + } + if (dim_ind == 2 && (zonei == 2 || zonei == 3)) + { + r = cg_cm[cg][dim1] - c->cr1[zone]; + if (r > 0) + { + r2 += r*r; + } + if (bDistMB_pulse) + { + r = cg_cm[cg][dim1] - c->bcr1; + if (r > 0) + { + rb2 += r*r; + } + } + } + } + else + { + /* Triclinic direction, more complicated */ + clear_rvec(rn); + clear_rvec(rb); + /* Rounding, conservative as the skew_fac multiplication + * will slightly underestimate the distance. + */ + if (dim_ind >= 1 && (zonei == 1 || zonei == 2)) + { + rn[dim0] = cg_cm[cg][dim0] - c->cr0; + for(i=dim0+1; idim[i]; + if (normal[dim0][dimd] > 0) + { + rn[dimd] -= rn[dim0]*normal[dim0][dimd]; + if (bDistMB_pulse) + { + rb[dimd] -= rb[dim0]*normal[dim0][dimd]; + } + } + } + } + if (dim_ind == 2 && (zonei == 2 || zonei == 3)) + { + rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone]; + tric_sh = 0; + for(i=dim1+1; i 0) + { + r2 += rn[dim1]*rn[dim1]*sf2_round[dim1]; + /* Take care of coupling of the distances + * to the planes along dim0 and dim1 through dim2. + */ + r2 -= rn[dim0]*rn[dim1]*skew_fac_01; + /* Take care that the cell planes along dim1 + * might not be orthogonal to that along dim2. + */ + if (normal[dim1][dim2] > 0) + { + rn[dim2] -= rn[dim1]*normal[dim1][dim2]; + } + } + if (bDistMB_pulse) + { + rb[dim1] += + cg_cm[cg][dim1] - c->bcr1 + tric_sh; + if (rb[dim1] > 0) + { + rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1]; + /* Take care of coupling of the distances + * to the planes along dim0 and dim1 through dim2. + */ + rb2 -= rb[dim0]*rb[dim1]*skew_fac_01; + /* Take care that the cell planes along dim1 + * might not be orthogonal to that along dim2. + */ + if (normal[dim1][dim2] > 0) + { + rb[dim2] -= rb[dim1]*normal[dim1][dim2]; + } + } + } + } + /* The distance along the communication direction */ + rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone]; + tric_sh = 0; + for(i=dim+1; i 0) + { + r2 += rn[dim]*rn[dim]*skew_fac2_d; + /* Take care of coupling of the distances + * to the planes along dim0 and dim1 through dim2. + */ + if (dim_ind == 1 && zonei == 1) + { + r2 -= rn[dim0]*rn[dim]*skew_fac_01; + } + } + if (bDistMB_pulse) + { + clear_rvec(rb); + rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh; + if (rb[dim] > 0) + { + rb2 += rb[dim]*rb[dim]*skew_fac2_d; + /* Take care of coupling of the distances + * to the planes along dim0 and dim1 through dim2. 
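 *
 * [Editor's sketch -- not part of the original patch] In the rectangular
 * branch below, the test reduces to a clamped squared distance to the
 * zone corner; schematically, for two of the dimensions:
 *
 *     r2 = 0;
 *     r  = cg_cm[cg][dim]  - c->c[dim_ind][zone];
 *     if (r > 0) { r2 += r*r; }        // behind the plane: contributes 0
 *     r  = cg_cm[cg][dim0] - c->cr0;
 *     r2 += r*r;                       // first dimension: always r >= 0
 *     send = (r2 < r_comm2);
 *
 * Negative components mean the cg lies on the near side of that corner
 * plane, so only the protruding components enter the distance.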
+ */ + if (dim_ind == 1 && zonei == 1) + { + rb2 -= rb[dim0]*rb[dim]*skew_fac_01; + } + } + } + } + + if (r2 < r_comm2 || + (bDistBonded && + ((bDistMB && rb2 < r_bcomm2) || + (bDist2B && r2 < r_bcomm2)) && + (!bBondComm || + (GET_CGINFO_BOND_INTER(cginfo[cg]) && + missing_link(comm->cglink,index_gl[cg], + comm->bLocalCG))))) + { + /* Make an index to the local charge groups */ + if (nsend+1 > ind->nalloc) + { + ind->nalloc = over_alloc_large(nsend+1); + srenew(ind->index,ind->nalloc); + } + if (nsend+1 > *ibuf_nalloc) + { + *ibuf_nalloc = over_alloc_large(nsend+1); + srenew(*ibuf,*ibuf_nalloc); + } + ind->index[nsend] = cg; + (*ibuf)[nsend] = index_gl[cg]; + nsend_z++; + vec_rvec_check_alloc(vbuf,nsend+1); + + if (dd->ci[dim] == 0) + { + /* Correct cg_cm for pbc */ + rvec_add(cg_cm[cg],box[dim],vbuf->v[nsend]); + if (bScrew) + { + vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY]; + vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ]; } } + else + { + copy_rvec(cg_cm[cg],vbuf->v[nsend]); + } + nsend++; + nat += cgindex[cg+1] - cgindex[cg]; } } + + *nsend_ptr = nsend; + *nat_ptr = nat; + *nsend_z_ptr = nsend_z; +} + +static void setup_dd_communication(gmx_domdec_t *dd, + matrix box,gmx_ddbox_t *ddbox, + t_forcerec *fr,t_state *state,rvec **f) +{ + int dim_ind,dim,dim0,dim1,dim2,dimd,p,nat_tot; + int nzone,nzone_send,zone,zonei,cg0,cg1; + int c,i,j,cg,cg_gl,nrcg; + int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i; + gmx_domdec_comm_t *comm; + gmx_domdec_zones_t *zones; + gmx_domdec_comm_dim_t *cd; + gmx_domdec_ind_t *ind; + cginfo_mb_t *cginfo_mb; + gmx_bool bBondComm,bDist2B,bDistMB,bDistBonded; + real r_mb,r_comm2,r_scomm2,r_bcomm2,r_0,r_1,r2inc,inv_ncg; + dd_corners_t corners; + ivec tric_dist; + rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr; + real skew_fac2_d,skew_fac_01; + rvec sf2_round; + int nsend,nat; + int th; + + if (debug) + { + fprintf(debug,"Setting up DD communication\n"); + } + + comm = dd->comm; + + switch (fr->cutoff_scheme) + { + case ecutsGROUP: + cg_cm = fr->cg_cm; + break; + case ecutsVERLET: + cg_cm = state->x; + break; + default: + gmx_incons("unimplemented"); + cg_cm = NULL; + } + + for(dim_ind=0; dim_indndim; dim_ind++) + { + dim = dd->dim[dim_ind]; + + /* Check if we need to use triclinic distances */ + tric_dist[dim_ind] = 0; + for(i=0; i<=dim_ind; i++) + { + if (ddbox->tric_dir[dd->dim[i]]) + { + tric_dist[dim_ind] = 1; + } + } + } + + bBondComm = comm->bBondComm; + + /* Do we need to determine extra distances for multi-body bondeds? */ + bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1); + + /* Do we need to determine extra distances for only two-body bondeds? */ + bDist2B = (bBondComm && !bDistMB); + + r_comm2 = sqr(comm->cutoff); + r_bcomm2 = sqr(comm->cutoff_mbody); + + if (debug) + { + fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2)); + } + + zones = &comm->zones; + + dim0 = dd->dim[0]; + dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1); + dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1); + + set_dd_corners(dd,dim0,dim1,dim2,bDistMB,&corners); /* Triclinic stuff */ normal = ddbox->normal; @@ -7321,8 +7946,6 @@ static void setup_dd_communication(gmx_domdec_t *dd, nzone_send = nzone; } - bScrew = (dd->bScrewPBC && dim == XX); - v_d = ddbox->v[dim]; skew_fac2_d = sqr(ddbox->skew_fac[dim]); @@ -7332,8 +7955,7 @@ static void setup_dd_communication(gmx_domdec_t *dd, /* Only atoms communicated in the first pulse are used * for multi-body bonded interactions or for bBondComm. 
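 *
 * [Editor's note -- not part of the original patch] Collapsed into a single
 * predicate, the send decision used here reads:
 *
 *     send = (r2 < r_comm2) ||                   // non-bonded cut-off
 *            (bDistBonded &&                     // first pulse only
 *             ((bDistMB && rb2 < r_bcomm2) ||    // multi-body bondeds
 *              (bDist2B && r2  < r_bcomm2)) &&
 *             (!bBondComm ||                     // optional link filter
 *              missing_link_for(cg)));           // hypothetical shorthand
 *
 * where missing_link_for(cg) stands for the GET_CGINFO_BOND_INTER plus
 * missing_link() test spelled out in the source.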
*/ - bDistBonded = ((bDistMB || bDist2B) && p == 0); - bDistMB_pulse = (bDistMB && bDistBonded); + bDistBonded = ((bDistMB || bDist2B) && p == 0); ind = &cd->ind[p]; nsend = 0; @@ -7386,220 +8008,108 @@ static void setup_dd_communication(gmx_domdec_t *dd, cg1 = zone_cg_range[nzone+zone+1]; cg0 = cg1 - cd->ind[p-1].nrecv[zone]; } - ind->nsend[zone] = 0; - for(cg=cg0; cgnth) schedule(static) + for(th=0; thnth; th++) { - r2 = 0; - rb2 = 0; - if (tric_dist[dim_ind] == 0) + gmx_domdec_ind_t *ind_p; + int **ibuf_p,*ibuf_nalloc_p; + vec_rvec_t *vbuf_p; + int *nsend_p,*nat_p; + int *nsend_zone_p; + int cg0_th,cg1_th; + + if (th == 0) { - /* Rectangular direction, easy */ - r = cg_cm[cg][dim] - corner[dim_ind][zone]; - if (r > 0) - { - r2 += r*r; - } - if (bDistMB_pulse) - { - r = cg_cm[cg][dim] - bcorner[dim_ind]; - if (r > 0) - { - rb2 += r*r; - } - } - /* Rounding gives at most a 16% reduction - * in communicated atoms - */ - if (dim_ind >= 1 && (zonei == 1 || zonei == 2)) - { - r = cg_cm[cg][dim0] - corner_round_0; - /* This is the first dimension, so always r >= 0 */ - r2 += r*r; - if (bDistMB_pulse) - { - rb2 += r*r; - } - } - if (dim_ind == 2 && (zonei == 2 || zonei == 3)) - { - r = cg_cm[cg][dim1] - corner_round_1[zone]; - if (r > 0) - { - r2 += r*r; - } - if (bDistMB_pulse) - { - r = cg_cm[cg][dim1] - bcorner_round_1; - if (r > 0) - { - rb2 += r*r; - } - } - } + /* Thread 0 writes in the comm buffers */ + ind_p = ind; + ibuf_p = &comm->buf_int; + ibuf_nalloc_p = &comm->nalloc_int; + vbuf_p = &comm->vbuf; + nsend_p = &nsend; + nat_p = &nat; + nsend_zone_p = &ind->nsend[zone]; } else { - /* Triclinic direction, more complicated */ - clear_rvec(rn); - clear_rvec(rb); - /* Rounding, conservative as the skew_fac multiplication - * will slightly underestimate the distance. - */ - if (dim_ind >= 1 && (zonei == 1 || zonei == 2)) - { - rn[dim0] = cg_cm[cg][dim0] - corner_round_0; - for(i=dim0+1; idim[i]; - if (normal[dim0][dimd] > 0) - { - rn[dimd] -= rn[dim0]*normal[dim0][dimd]; - if (bDistMB_pulse) - { - rb[dimd] -= rb[dim0]*normal[dim0][dimd]; - } - } - } - } - if (dim_ind == 2 && (zonei == 2 || zonei == 3)) - { - rn[dim1] += cg_cm[cg][dim1] - corner_round_1[zone]; - tric_sh = 0; - for(i=dim1+1; i 0) - { - r2 += rn[dim1]*rn[dim1]*sf2_round[dim1]; - /* Take care of coupling of the distances - * to the planes along dim0 and dim1 through dim2. - */ - r2 -= rn[dim0]*rn[dim1]*skew_fac_01; - /* Take care that the cell planes along dim1 - * might not be orthogonal to that along dim2. - */ - if (normal[dim1][dim2] > 0) - { - rn[dim2] -= rn[dim1]*normal[dim1][dim2]; - } - } - if (bDistMB_pulse) - { - rb[dim1] += - cg_cm[cg][dim1] - bcorner_round_1 + tric_sh; - if (rb[dim1] > 0) - { - rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1]; - /* Take care of coupling of the distances - * to the planes along dim0 and dim1 through dim2. - */ - rb2 -= rb[dim0]*rb[dim1]*skew_fac_01; - /* Take care that the cell planes along dim1 - * might not be orthogonal to that along dim2. - */ - if (normal[dim1][dim2] > 0) - { - rb[dim2] -= rb[dim1]*normal[dim1][dim2]; - } - } - } - } - /* The distance along the communication direction */ - rn[dim] += cg_cm[cg][dim] - corner[dim_ind][zone]; - tric_sh = 0; - for(i=dim+1; i 0) - { - r2 += rn[dim]*rn[dim]*skew_fac2_d; - /* Take care of coupling of the distances - * to the planes along dim0 and dim1 through dim2. 
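 *
 * [Editor's note -- not part of the original patch] The OpenMP rewrite that
 * replaces this serial loop splits the cg range statically, matching the
 * cg0_th/cg1_th arithmetic introduced by this patch:
 *
 *     cg0_th = cg0 + ((cg1 - cg0) *  th     ) / nth;
 *     cg1_th = cg0 + ((cg1 - cg0) * (th + 1)) / nth;
 *
 * The mapping is monotone in th, so the sub-ranges tile [cg0,cg1) with no
 * gaps or overlaps. Thread 0 writes straight into the communication
 * buffers; threads >= 1 fill comm->dth[th] and are appended serially
 * afterwards, which keeps the send order deterministic.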
- */ - if (dim_ind == 1 && zonei == 1) - { - r2 -= rn[dim0]*rn[dim]*skew_fac_01; - } - } - if (bDistMB_pulse) - { - clear_rvec(rb); - rb[dim] += cg_cm[cg][dim] - bcorner[dim_ind] + tric_sh; - if (rb[dim] > 0) - { - rb2 += rb[dim]*rb[dim]*skew_fac2_d; - /* Take care of coupling of the distances - * to the planes along dim0 and dim1 through dim2. - */ - if (dim_ind == 1 && zonei == 1) - { - rb2 -= rb[dim0]*rb[dim]*skew_fac_01; - } - } - } + /* Other threads write into temp buffers */ + ind_p = &comm->dth[th].ind; + ibuf_p = &comm->dth[th].ibuf; + ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc; + vbuf_p = &comm->dth[th].vbuf; + nsend_p = &comm->dth[th].nsend; + nat_p = &comm->dth[th].nat; + nsend_zone_p = &comm->dth[th].nsend_zone; + + comm->dth[th].nsend = 0; + comm->dth[th].nat = 0; + comm->dth[th].nsend_zone = 0; + } + + if (comm->nth == 1) + { + cg0_th = cg0; + cg1_th = cg1; + } + else + { + cg0_th = cg0 + ((cg1 - cg0)* th )/comm->nth; + cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth; } - if (r2 < r_comm2 || - (bDistBonded && - ((bDistMB && rb2 < r_bcomm2) || - (bDist2B && r2 < r_bcomm2)) && - (!bBondComm || - (GET_CGINFO_BOND_INTER(fr->cginfo[cg]) && - missing_link(comm->cglink,index_gl[cg], - comm->bLocalCG))))) + /* Get the cg's for this pulse in this zone */ + get_zone_pulse_cgs(dd,zonei,zone,cg0_th,cg1_th, + index_gl,cgindex, + dim,dim_ind,dim0,dim1,dim2, + r_comm2,r_bcomm2, + box,tric_dist, + normal,skew_fac2_d,skew_fac_01, + v_d,v_0,v_1,&corners,sf2_round, + bDistBonded,bBondComm, + bDist2B,bDistMB, + cg_cm,fr->cginfo, + ind_p, + ibuf_p,ibuf_nalloc_p, + vbuf_p, + nsend_p,nat_p, + nsend_zone_p); + } + + /* Append data of threads>=1 to the communication buffers */ + for(th=1; thnth; th++) + { + dd_comm_setup_work_t *dth; + int i,ns1; + + dth = &comm->dth[th]; + + ns1 = nsend + dth->nsend_zone; + if (ns1 > ind->nalloc) { - /* Make an index to the local charge groups */ - if (nsend+1 > ind->nalloc) - { - ind->nalloc = over_alloc_large(nsend+1); - srenew(ind->index,ind->nalloc); - } - if (nsend+1 > comm->nalloc_int) - { - comm->nalloc_int = over_alloc_large(nsend+1); - srenew(comm->buf_int,comm->nalloc_int); - } - ind->index[nsend] = cg; - comm->buf_int[nsend] = index_gl[cg]; - ind->nsend[zone]++; - vec_rvec_check_alloc(&comm->vbuf,nsend+1); + ind->nalloc = over_alloc_dd(ns1); + srenew(ind->index,ind->nalloc); + } + if (ns1 > comm->nalloc_int) + { + comm->nalloc_int = over_alloc_dd(ns1); + srenew(comm->buf_int,comm->nalloc_int); + } + if (ns1 > comm->vbuf.nalloc) + { + comm->vbuf.nalloc = over_alloc_dd(ns1); + srenew(comm->vbuf.v,comm->vbuf.nalloc); + } - if (dd->ci[dim] == 0) - { - /* Correct cg_cm for pbc */ - rvec_add(cg_cm[cg],box[dim],comm->vbuf.v[nsend]); - if (bScrew) - { - comm->vbuf.v[nsend][YY] = - box[YY][YY]-comm->vbuf.v[nsend][YY]; - comm->vbuf.v[nsend][ZZ] = - box[ZZ][ZZ]-comm->vbuf.v[nsend][ZZ]; - } - } - else - { - copy_rvec(cg_cm[cg],comm->vbuf.v[nsend]); - } + for(i=0; insend_zone; i++) + { + ind->index[nsend] = dth->ind.index[i]; + comm->buf_int[nsend] = dth->ibuf[i]; + copy_rvec(dth->vbuf.v[i], + comm->vbuf.v[nsend]); nsend++; - nat += cgindex[cg+1] - cgindex[cg]; } + nat += dth->nat; + ind->nsend[zone] += dth->nsend_zone; } } /* Clear the counts in case we do not have pbc */ @@ -7667,11 +8177,15 @@ static void setup_dd_communication(gmx_domdec_t *dd, recv_i, ind->nrecv[nzone]); /* Make space for cg_cm */ - if (pos_cg + ind->nrecv[nzone] > fr->cg_nalloc) + dd_check_alloc_ncg(fr,state,f,pos_cg + ind->nrecv[nzone]); + if (fr->cutoff_scheme == ecutsGROUP) { - 
dd_realloc_fr_cg(fr,pos_cg + ind->nrecv[nzone]); cg_cm = fr->cg_cm; } + else + { + cg_cm = state->x; + } /* Communicate cg_cm */ if (cd->bInPlace) { @@ -7774,6 +8288,207 @@ static void set_cg_boundaries(gmx_domdec_zones_t *zones) } } +static void set_zones_size(gmx_domdec_t *dd, + matrix box,const gmx_ddbox_t *ddbox, + int zone_start,int zone_end) +{ + gmx_domdec_comm_t *comm; + gmx_domdec_zones_t *zones; + gmx_bool bDistMB; + int z,zi,zj0,zj1,d,dim; + real rcs,rcmbs; + int i,j; + real size_j,add_tric; + real vol; + + comm = dd->comm; + + zones = &comm->zones; + + /* Do we need to determine extra distances for multi-body bondeds? */ + bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1); + + for(z=zone_start; zcell_x0,zones->size[z].x0); + copy_rvec(comm->cell_x1,zones->size[z].x1); + } + + for(d=0; dndim; d++) + { + dim = dd->dim[d]; + + for(z=0; zn; z++) + { + /* With a staggered grid we have different sizes + * for non-shifted dimensions. + */ + if (dd->bGridJump && zones->shift[z][dim] == 0) + { + if (d == 1) + { + zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0; + zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1; + } + else if (d == 2) + { + zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0; + zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1; + } + } + } + + rcs = comm->cutoff; + rcmbs = comm->cutoff_mbody; + if (ddbox->tric_dir[dim]) + { + rcs /= ddbox->skew_fac[dim]; + rcmbs /= ddbox->skew_fac[dim]; + } + + /* Set the lower limit for the shifted zone dimensions */ + for(z=zone_start; zshift[z][dim] > 0) + { + dim = dd->dim[d]; + if (!dd->bGridJump || d == 0) + { + zones->size[z].x0[dim] = comm->cell_x1[dim]; + zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs; + } + else + { + /* Here we take the lower limit of the zone from + * the lowest domain of the zone below. + */ + if (z < 4) + { + zones->size[z].x0[dim] = + comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1; + } + else + { + if (d == 1) + { + zones->size[z].x0[dim] = + zones->size[zone_perm[2][z-4]].x0[dim]; + } + else + { + zones->size[z].x0[dim] = + comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1; + } + } + /* A temporary limit, is updated below */ + zones->size[z].x1[dim] = zones->size[z].x0[dim]; + + if (bDistMB) + { + for(zi=0; zinizone; zi++) + { + if (zones->shift[zi][dim] == 0) + { + /* This takes the whole zone into account. + * With multiple pulses this will lead + * to a larger zone then strictly necessary. + */ + zones->size[z].x1[dim] = max(zones->size[z].x1[dim], + zones->size[zi].x1[dim]+rcmbs); + } + } + } + } + } + } + + /* Loop over the i-zones to set the upper limit of each + * j-zone they see. + */ + for(zi=0; zinizone; zi++) + { + if (zones->shift[zi][dim] == 0) + { + for(z=zones->izone[zi].j0; zizone[zi].j1; z++) + { + if (zones->shift[z][dim] > 0) + { + zones->size[z].x1[dim] = max(zones->size[z].x1[dim], + zones->size[zi].x1[dim]+rcs); + } + } + } + } + } + + for(z=zone_start; zsize[z].bb_x0[i] = zones->size[z].x0[i]; + zones->size[z].bb_x1[i] = zones->size[z].x1[i]; + + for(j=i+1; jnpbcdim; j++) + { + /* With 1D domain decomposition the cg's are not in + * the triclinic box, but trilinic x-y and rectangular y-z. 
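 *
 * [Editor's note -- not part of the original patch] The bounding-box shift
 * computed below is simple proportionality. As a worked example, take
 * box[ZZ][XX] = 2 nm, box[ZZ][ZZ] = 6 nm and a zone spanning z in [3,6] nm:
 *
 *     lower-corner term:  add_tric = 3 * 2/6 = 1 nm on bb_x0 and bb_x1
 *     upper-corner term:  size_j = 6 - 3 = 3 nm -> 3 * 2/6 = 1 nm more
 *                         on bb_x1 (since box[ZZ][XX] > 0)
 *
 * so bb_x1[XX] ends up 2 nm past x1[XX] and the axis-aligned bounds fully
 * enclose the sheared zone.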
+ */ + if (box[j][i] != 0 && + !(dd->ndim == 1 && i == YY && j == ZZ)) + { + /* Correct for triclinic offset of the lower corner */ + add_tric = zones->size[z].x0[j]*box[j][i]/box[j][j]; + zones->size[z].bb_x0[i] += add_tric; + zones->size[z].bb_x1[i] += add_tric; + + /* Correct for triclinic offset of the upper corner */ + size_j = zones->size[z].x1[j] - zones->size[z].x0[j]; + add_tric = size_j*box[j][i]/box[j][j]; + + if (box[j][i] < 0) + { + zones->size[z].bb_x0[i] += add_tric; + } + else + { + zones->size[z].bb_x1[i] += add_tric; + } + } + } + } + } + + if (zone_start == 0) + { + vol = 1; + for(dim=0; dimsize[0].x1[dim] - zones->size[0].x0[dim]; + } + zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol; + } + + if (debug) + { + for(z=zone_start; zsize[z].x0[XX],zones->size[z].x1[XX], + zones->size[z].x0[YY],zones->size[z].x1[YY], + zones->size[z].x0[ZZ],zones->size[z].x1[ZZ]); + fprintf(debug,"zone %d bb %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n", + z, + zones->size[z].bb_x0[XX],zones->size[z].bb_x1[XX], + zones->size[z].bb_x0[YY],zones->size[z].bb_x1[YY], + zones->size[z].bb_x0[ZZ],zones->size[z].bb_x1[ZZ]); + } + } +} + static int comp_cgsort(const void *a,const void *b) { int comp; @@ -7791,7 +8506,7 @@ static int comp_cgsort(const void *a,const void *b) return comp; } -static void order_int_cg(int n,gmx_cgsort_t *sort, +static void order_int_cg(int n,const gmx_cgsort_t *sort, int *a,int *buf) { int i; @@ -7809,7 +8524,7 @@ static void order_int_cg(int n,gmx_cgsort_t *sort, } } -static void order_vec_cg(int n,gmx_cgsort_t *sort, +static void order_vec_cg(int n,const gmx_cgsort_t *sort, rvec *v,rvec *buf) { int i; @@ -7827,11 +8542,19 @@ static void order_vec_cg(int n,gmx_cgsort_t *sort, } } -static void order_vec_atom(int ncg,int *cgindex,gmx_cgsort_t *sort, +static void order_vec_atom(int ncg,const int *cgindex,const gmx_cgsort_t *sort, rvec *v,rvec *buf) { int a,atot,cg,cg0,cg1,i; + if (cgindex == NULL) + { + /* Avoid the useless loop of the atoms within a cg */ + order_vec_cg(ncg,sort,v,buf); + + return; + } + /* Order the data */ a = 0; for(cg=0; cgcomm->sort; - - if (dd->ncg_home > sort->sort_nalloc) - { - sort->sort_nalloc = over_alloc_dd(dd->ncg_home); - srenew(sort->sort1,sort->sort_nalloc); - srenew(sort->sort2,sort->sort_nalloc); - } - + + a = fr->ns.grid->cell_index; + + moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells; + if (ncg_home_old >= 0) { /* The charge groups that remained in the same ns grid cell @@ -7919,10 +8637,9 @@ static void dd_sort_state(gmx_domdec_t *dd,int ePBC, for(i=0; incg_home; i++) { /* Check if this cg did not move to another node */ - cell_index = fr->ns.grid->cell_index[i]; - if (cell_index != 4*fr->ns.grid->ncells) + if (a[i] < moved) { - if (i >= ncg_home_old || cell_index != sort->sort1[i].nsc) + if (i >= ncg_home_old || a[i] != sort->sort[i].nsc) { /* This cg is new on this node or moved ns grid cell */ if (nsort_new >= sort->sort_new_nalloc) @@ -7938,9 +8655,11 @@ static void dd_sort_state(gmx_domdec_t *dd,int ePBC, sort_i = &(sort->sort2[nsort2++]); } /* Sort on the ns grid cell indices - * and the global topology index + * and the global topology index. + * index_gl is irrelevant with cell ns, + * but we set it here anyhow to avoid a conditional. 
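 *
 * [Editor's sketch -- not part of the original patch] The two-level sort
 * key used throughout this code orders by ns grid cell first and global
 * index second, i.e. comp_cgsort behaves like:
 *
 *     static int comp_cgsort(const void *a, const void *b)
 *     {
 *         const gmx_cgsort_t *ca = (const gmx_cgsort_t *)a;
 *         const gmx_cgsort_t *cb = (const gmx_cgsort_t *)b;
 *         int d = ca->nsc - cb->nsc;       // grid cell, major key
 *         return (d != 0) ? d : ca->ind_gl - cb->ind_gl;
 *     }
 *
 * Tie-breaking on the global index is what makes the local atom order
 * reproducible, and hence restarts exact, independent of run history.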
*/ - sort_i->nsc = cell_index; + sort_i->nsc = a[i]; sort_i->ind_gl = dd->index_gl[i]; sort_i->ind = i; ncg_new++; @@ -7952,21 +8671,22 @@ static void dd_sort_state(gmx_domdec_t *dd,int ePBC, nsort2,nsort_new); } /* Sort efficiently */ - ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,sort->sort1); + ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new, + sort->sort); } else { - cgsort = sort->sort1; + cgsort = sort->sort; ncg_new = 0; for(i=0; incg_home; i++) { /* Sort on the ns grid cell indices * and the global topology index */ - cgsort[i].nsc = fr->ns.grid->cell_index[i]; + cgsort[i].nsc = a[i]; cgsort[i].ind_gl = dd->index_gl[i]; cgsort[i].ind = i; - if (cgsort[i].nsc != 4*fr->ns.grid->ncells) + if (cgsort[i].nsc < moved) { ncg_new++; } @@ -7978,14 +8698,85 @@ static void dd_sort_state(gmx_domdec_t *dd,int ePBC, /* Determine the order of the charge groups using qsort */ qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort); } - cgsort = sort->sort1; + + return ncg_new; +} + +static int dd_sort_order_nbnxn(gmx_domdec_t *dd,t_forcerec *fr) +{ + gmx_cgsort_t *sort; + int ncg_new,i,*a,na; + + sort = dd->comm->sort->sort; + + nbnxn_get_atomorder(fr->nbv->nbs,&a,&na); + + ncg_new = 0; + for(i=0; i= 0) + { + sort[ncg_new].ind = a[i]; + ncg_new++; + } + } + + return ncg_new; +} + +static void dd_sort_state(gmx_domdec_t *dd,int ePBC, + rvec *cgcm,t_forcerec *fr,t_state *state, + int ncg_home_old) +{ + gmx_domdec_sort_t *sort; + gmx_cgsort_t *cgsort,*sort_i; + int *cgindex; + int ncg_new,i,*ibuf,cgsize; + rvec *vbuf; + sort = dd->comm->sort; + + if (dd->ncg_home > sort->sort_nalloc) + { + sort->sort_nalloc = over_alloc_dd(dd->ncg_home); + srenew(sort->sort,sort->sort_nalloc); + srenew(sort->sort2,sort->sort_nalloc); + } + cgsort = sort->sort; + + switch (fr->cutoff_scheme) + { + case ecutsGROUP: + ncg_new = dd_sort_order(dd,fr,ncg_home_old); + break; + case ecutsVERLET: + ncg_new = dd_sort_order_nbnxn(dd,fr); + break; + default: + gmx_incons("unimplemented"); + ncg_new = 0; + } + /* We alloc with the old size, since cgindex is still old */ vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]); vbuf = dd->comm->vbuf.v; + if (dd->comm->bCGs) + { + cgindex = dd->cgindex; + } + else + { + cgindex = NULL; + } + /* Remove the charge groups which are no longer at home here */ dd->ncg_home = ncg_new; + if (debug) + { + fprintf(debug,"Set the new home charge group count to %d\n", + dd->ncg_home); + } /* Reorder the state */ for(i=0; incg_home,dd->cgindex,cgsort,state->x,vbuf); + order_vec_atom(dd->ncg_home,cgindex,cgsort,state->x,vbuf); break; case estV: - order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->v,vbuf); + order_vec_atom(dd->ncg_home,cgindex,cgsort,state->v,vbuf); break; case estSDX: - order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->sd_X,vbuf); + order_vec_atom(dd->ncg_home,cgindex,cgsort,state->sd_X,vbuf); break; case estCGP: - order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->cg_p,vbuf); + order_vec_atom(dd->ncg_home,cgindex,cgsort,state->cg_p,vbuf); break; case estLD_RNG: case estLD_RNGI: @@ -8020,8 +8811,11 @@ static void dd_sort_state(gmx_domdec_t *dd,int ePBC, } } } - /* Reorder cgcm */ - order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf); + if (fr->cutoff_scheme == ecutsGROUP) + { + /* Reorder cgcm */ + order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf); + } if (dd->ncg_home+1 > sort->ibuf_nalloc) { @@ -8034,25 +8828,43 @@ static void dd_sort_state(gmx_domdec_t *dd,int ePBC, /* Reorder the cginfo */ order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf); 
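
    /* [Editor's sketch -- not part of the original patch] ordered_sort above
     * exploits that both inputs are already sorted, so a linear merge
     * replaces a full qsort; with hypothetical names s2/snew/dest:
     *
     *     i = j = k = 0;
     *     while (i < n2 && j < nnew)
     *     {
     *         dest[k++] = (comp_cgsort(&s2[i], &snew[j]) <= 0) ?
     *                     s2[i++] : snew[j++];
     *     }
     *     while (i < n2)   { dest[k++] = s2[i++];   }
     *     while (j < nnew) { dest[k++] = snew[j++]; }
     *
     * This is O(n), which pays off because most cgs keep their ns cell
     * between partitionings, leaving only a small "new" list to merge.
     */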
/* Rebuild the local cg index */ - ibuf[0] = 0; - for(i=0; incg_home; i++) + if (dd->comm->bCGs) { - cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind]; - ibuf[i+1] = ibuf[i] + cgsize; + ibuf[0] = 0; + for(i=0; incg_home; i++) + { + cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind]; + ibuf[i+1] = ibuf[i] + cgsize; + } + for(i=0; incg_home+1; i++) + { + dd->cgindex[i] = ibuf[i]; + } } - for(i=0; incg_home+1; i++) + else { - dd->cgindex[i] = ibuf[i]; + for(i=0; incg_home+1; i++) + { + dd->cgindex[i] = i; + } } /* Set the home atom number */ dd->nat_home = dd->cgindex[dd->ncg_home]; - - /* Copy the sorted ns cell indices back to the ns grid struct */ - for(i=0; incg_home; i++) + + if (fr->cutoff_scheme == ecutsVERLET) + { + /* The atoms are now exactly in grid order, update the grid order */ + nbnxn_set_atomorder(fr->nbv->nbs); + } + else { - fr->ns.grid->cell_index[i] = cgsort[i].nsc; + /* Copy the sorted ns cell indices back to the ns grid struct */ + for(i=0; incg_home; i++) + { + fr->ns.grid->cell_index[i] = cgsort[i].nsc; + } + fr->ns.grid->nr = dd->ncg_home; } - fr->ns.grid->nr = dd->ncg_home; } static void add_dd_statistics(gmx_domdec_t *dd) @@ -8174,10 +8986,10 @@ void dd_partition_system(FILE *fplog, t_block *cgs_gl; gmx_large_int_t step_pcoupl; rvec cell_ns_x0,cell_ns_x1; - int i,j,n,cg0=0,ncg_home_old=-1,nat_f_novirsum; + int i,j,n,cg0=0,ncg_home_old=-1,ncg_moved,nat_f_novirsum; gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad; gmx_bool bRedist,bSortCG,bResortAll; - ivec ncells_old,np; + ivec ncells_old={0,0,0},ncells_new={0,0,0},np; real grid_density; char sbuf[22]; @@ -8205,13 +9017,13 @@ void dd_partition_system(FILE *fplog, { step_pcoupl = ((step - 1)/n)*n + 1; } - if (step_pcoupl >= comm->globalcomm_step) + if (step_pcoupl >= comm->partition_step) { bBoxChanged = TRUE; } } - bNStGlobalComm = (step >= comm->globalcomm_step + nstglobalcomm); + bNStGlobalComm = (step % nstglobalcomm == 0); if (!comm->bDynLoadBal) { @@ -8320,12 +9132,14 @@ void dd_partition_system(FILE *fplog, dd_make_local_cgs(dd,&top_local->cgs); - if (dd->ncg_home > fr->cg_nalloc) + /* Ensure that we have space for the new distribution */ + dd_check_alloc_ncg(fr,state_local,f,dd->ncg_home); + + if (fr->cutoff_scheme == ecutsGROUP) { - dd_realloc_fr_cg(fr,dd->ncg_home); + calc_cgcm(fplog,0,dd->ncg_home, + &top_local->cgs,state_local->x,fr->cg_cm); } - calc_cgcm(fplog,0,dd->ncg_home, - &top_local->cgs,state_local->x,fr->cg_cm); inc_nrnb(nrnb,eNR_CGCM,dd->nat_home); @@ -8351,10 +9165,13 @@ void dd_partition_system(FILE *fplog, /* Build the new indices */ rebuild_cgindex(dd,cgs_gl->index,state_local); make_dd_indices(dd,cgs_gl->index,0); - - /* Redetermine the cg COMs */ - calc_cgcm(fplog,0,dd->ncg_home, - &top_local->cgs,state_local->x,fr->cg_cm); + + if (fr->cutoff_scheme == ecutsGROUP) + { + /* Redetermine the cg COMs */ + calc_cgcm(fplog,0,dd->ncg_home, + &top_local->cgs,state_local->x,fr->cg_cm); + } inc_nrnb(nrnb,eNR_CGCM,dd->nat_home); @@ -8409,15 +9226,21 @@ void dd_partition_system(FILE *fplog, ncg_home_old = dd->ncg_home; + ncg_moved = 0; if (bRedist) { - cg0 = dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir, - state_local,f,fr,mdatoms, - !bSortCG,nrnb); + wallcycle_sub_start(wcycle,ewcsDD_REDIST); + + dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir, + state_local,f,fr,mdatoms, + !bSortCG,nrnb,&cg0,&ncg_moved); + + wallcycle_sub_stop(wcycle,ewcsDD_REDIST); } - get_nsgrid_boundaries(fr->ns.grid,dd, - 
state_local->box,&ddbox,&comm->cell_x0,&comm->cell_x1, + get_nsgrid_boundaries(ddbox.nboundeddim,state_local->box, + dd,&ddbox, + &comm->cell_x0,&comm->cell_x1, dd->ncg_home,fr->cg_cm, cell_ns_x0,cell_ns_x1,&grid_density); @@ -8426,15 +9249,27 @@ void dd_partition_system(FILE *fplog, comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step); } - copy_ivec(fr->ns.grid->n,ncells_old); - grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC, - state_local->box,cell_ns_x0,cell_ns_x1, - fr->rlistlong,grid_density); + switch (fr->cutoff_scheme) + { + case ecutsGROUP: + copy_ivec(fr->ns.grid->n,ncells_old); + grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC, + state_local->box,cell_ns_x0,cell_ns_x1, + fr->rlistlong,grid_density); + break; + case ecutsVERLET: + nbnxn_get_ncells(fr->nbv->nbs,&ncells_old[XX],&ncells_old[YY]); + break; + default: + gmx_incons("unimplemented"); + } /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */ copy_ivec(ddbox.tric_dir,comm->tric_dir); if (bSortCG) { + wallcycle_sub_start(wcycle,ewcsDD_GRID); + /* Sort the state on charge group position. * This enables exact restarts from this step. * It also improves performance by about 15% with larger numbers @@ -8445,16 +9280,47 @@ void dd_partition_system(FILE *fplog, * so we can sort with the indices. */ set_zones_ncg_home(dd); - fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home, - 0,dd->ncg_home,fr->cg_cm); - + + switch (fr->cutoff_scheme) + { + case ecutsVERLET: + set_zones_size(dd,state_local->box,&ddbox,0,1); + + nbnxn_put_on_grid(fr->nbv->nbs,fr->ePBC,state_local->box, + 0, + comm->zones.size[0].bb_x0, + comm->zones.size[0].bb_x1, + 0,dd->ncg_home, + comm->zones.dens_zone0, + fr->cginfo, + state_local->x, + ncg_moved,comm->moved, + fr->nbv->grp[eintLocal].kernel_type, + fr->nbv->grp[eintLocal].nbat); + + nbnxn_get_ncells(fr->nbv->nbs,&ncells_new[XX],&ncells_new[YY]); + break; + case ecutsGROUP: + fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home, + 0,dd->ncg_home,fr->cg_cm); + + copy_ivec(fr->ns.grid->n,ncells_new); + break; + default: + gmx_incons("unimplemented"); + } + + bResortAll = bMasterState; + /* Check if we can user the old order and ns grid cell indices * of the charge groups to sort the charge groups efficiently. */ - bResortAll = (bMasterState || - fr->ns.grid->n[XX] != ncells_old[XX] || - fr->ns.grid->n[YY] != ncells_old[YY] || - fr->ns.grid->n[ZZ] != ncells_old[ZZ]); + if (ncells_new[XX] != ncells_old[XX] || + ncells_new[YY] != ncells_old[YY] || + ncells_new[ZZ] != ncells_old[ZZ]) + { + bResortAll = TRUE; + } if (debug) { @@ -8466,21 +9332,35 @@ void dd_partition_system(FILE *fplog, /* Rebuild all the indices */ cg0 = 0; ga2la_clear(dd->ga2la); + + wallcycle_sub_stop(wcycle,ewcsDD_GRID); } + + wallcycle_sub_start(wcycle,ewcsDD_SETUPCOMM); /* Setup up the communication and communicate the coordinates */ - setup_dd_communication(dd,state_local->box,&ddbox,fr); + setup_dd_communication(dd,state_local->box,&ddbox,fr,state_local,f); /* Set the indices */ make_dd_indices(dd,cgs_gl->index,cg0); /* Set the charge group boundaries for neighbor searching */ set_cg_boundaries(&comm->zones); - + + if (fr->cutoff_scheme == ecutsVERLET) + { + set_zones_size(dd,state_local->box,&ddbox, + bSortCG ? 
1 : 0,comm->zones.n); + } + + wallcycle_sub_stop(wcycle,ewcsDD_SETUPCOMM); + /* write_dd_pdb("dd_home",step,"dump",top_global,cr, -1,state_local->x,state_local->box); */ + + wallcycle_sub_start(wcycle,ewcsDD_MAKETOP); /* Extract a local topology from the global topology */ for(i=0; indim; i++) @@ -8489,7 +9369,13 @@ void dd_partition_system(FILE *fplog, } dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box, comm->cellsize_min,np, - fr,vsite,top_global,top_local); + fr, + fr->cutoff_scheme==ecutsGROUP ? fr->cg_cm : state_local->x, + vsite,top_global,top_local); + + wallcycle_sub_stop(wcycle,ewcsDD_MAKETOP); + + wallcycle_sub_start(wcycle,ewcsDD_MAKECONSTR); /* Set up the special atom communication */ n = comm->nat[ddnatZONE]; @@ -8504,12 +9390,12 @@ void dd_partition_system(FILE *fplog, } break; case ddnatCON: - if (dd->bInterCGcons) + if (dd->bInterCGcons || dd->bInterCGsettles) { /* Only for inter-cg constraints we need special code */ - n = dd_make_local_constraints(dd,n,top_global, + n = dd_make_local_constraints(dd,n,top_global,fr->cginfo, constr,ir->nProjOrder, - &top_local->idef.il[F_CONSTR]); + top_local->idef.il); } break; default: @@ -8517,7 +9403,11 @@ void dd_partition_system(FILE *fplog, } comm->nat[i] = n; } - + + wallcycle_sub_stop(wcycle,ewcsDD_MAKECONSTR); + + wallcycle_sub_start(wcycle,ewcsDD_TOPOTHER); + /* Make space for the extra coordinates for virtual site * or constraint communication. */ @@ -8580,7 +9470,9 @@ void dd_partition_system(FILE *fplog, { make_local_gb(cr,fr->born,ir->gb_algorithm); } - + + init_bonded_thread_force_reduction(fr,&top_local->idef); + if (!(cr->duty & DUTY_PME)) { /* Send the charges to our PME only node */ @@ -8617,6 +9509,8 @@ void dd_partition_system(FILE *fplog, * atom coordinates again (for spreading the forces this MD step). */ dd_move_x_vsites(dd,state_local->box,state_local->x); + + wallcycle_sub_stop(wcycle,ewcsDD_TOPOTHER); if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0) { @@ -8625,11 +9519,8 @@ void dd_partition_system(FILE *fplog, -1,state_local->x,state_local->box); } - if (bNStGlobalComm) - { - /* Store the global communication step */ - comm->globalcomm_step = step; - } + /* Store the partitioning step */ + comm->partition_step = step; /* Increase the DD partitioning counter */ dd->ddp_count++; diff --git a/src/mdlib/domdec_con.c b/src/mdlib/domdec_con.c index 84e662c351..60d50916c2 100644 --- a/src/mdlib/domdec_con.c +++ b/src/mdlib/domdec_con.c @@ -28,6 +28,8 @@ #include "domdec_network.h" #include "mtop_util.h" #include "gmx_ga2la.h" +#include "gmx_hash.h" +#include "gmx_omp_nthreads.h" typedef struct { int nsend; @@ -36,11 +38,13 @@ typedef struct { int nrecv; } gmx_specatsend_t; +typedef struct { + int *ind; + int nalloc; + int n; +} ind_req_t; + typedef struct gmx_domdec_specat_comm { - /* The atom indices we need from the surrounding cells */ - int nind_req; - int *ind_req; - int ind_req_nalloc; /* The number of indices to receive during the setup */ int nreq[DIM][2][2]; /* The atoms to send */ @@ -57,6 +61,12 @@ typedef struct gmx_domdec_specat_comm { /* The range in the local buffer(s) for received atoms */ int at_start; int at_end; + + /* The atom indices we need from the surrounding cells. + * We can gather the indices over nthread threads. 
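 *
 * [Editor's note -- not part of the original patch] The move from a plain
 * int array to gmx_hash_t for the global-to-local tables below matters for
 * scaling: clearing an array costs O(natoms_global) every step, while the
 * hash only ever holds the few special (constraint/vsite) atoms. The usage
 * pattern visible in this patch is:
 *
 *     gmx_hash_set(dc->ga2la, a_gl, -2);           // request pending
 *     ind = gmx_hash_get_minone(dc->ga2la, a_gl);  // -1 when absent
 *     gmx_hash_clear_and_optimize(dc->ga2la);      // cheap per-step reset
 *
 * so "absent" (-1) and "requested but not yet received" (-2) remain
 * distinguishable states.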
+ */ + int nthread; + ind_req_t *ireq; } gmx_domdec_specat_comm_t; typedef struct gmx_domdec_constraints { @@ -71,7 +81,11 @@ typedef struct gmx_domdec_constraints { /* Boolean that tells if a global constraint index has been requested */ char *gc_req; /* Global to local communicated constraint atom only index */ - int *ga2la; + gmx_hash_t ga2la; + + /* Multi-threading stuff */ + int nthread; + t_ilist *ils; } gmx_domdec_constraints_t; @@ -424,10 +438,7 @@ void dd_clear_local_constraint_indices(gmx_domdec_t *dd) if (dd->constraint_comm) { - for(i=dd->constraint_comm->at_start; iconstraint_comm->at_end; i++) - { - dc->ga2la[dd->gatindex[i]] = -1; - } + gmx_hash_clear_and_optimize(dc->ga2la); } } @@ -437,23 +448,21 @@ void dd_clear_local_vsite_indices(gmx_domdec_t *dd) if (dd->vsite_comm) { - for(i=dd->vsite_comm->at_start; ivsite_comm->at_end; i++) - { - dd->ga2la_vsite[dd->gatindex[i]] = -1; - } + gmx_hash_clear_and_optimize(dd->ga2la_vsite); } } static int setup_specat_communication(gmx_domdec_t *dd, + ind_req_t *ireq, gmx_domdec_specat_comm_t *spac, - int *ga2la_specat, + gmx_hash_t ga2la_specat, int at_start, int vbuf_fac, const char *specat_type, const char *add_err) { int nsend[2],nlast,nsend_zero[2]={0,0},*nsend_ptr; - int d,dim,ndir,dir,nr,ns,i,nrecv_local,n0,start,ireq,ind,buf[2]; + int d,dim,ndir,dir,nr,ns,i,nrecv_local,n0,start,indr,ind,buf[2]; int nat_tot_specat,nat_tot_prev,nalloc_old; gmx_bool bPBC,bFirst; gmx_specatsend_t *spas; @@ -467,7 +476,7 @@ static int setup_specat_communication(gmx_domdec_t *dd, * we communicate this for more efficients checks * nsend[1]: the total number of requested atoms */ - nsend[0] = spac->nind_req; + nsend[0] = ireq->n; nsend[1] = nsend[0]; nlast = nsend[1]; for(d=dd->ndim-1; d>=0; d--) @@ -502,14 +511,14 @@ static int setup_specat_communication(gmx_domdec_t *dd, dd_sendrecv_int(dd,d,dir==0 ? dddirForward : dddirBackward, nsend_ptr,2,spac->nreq[d][dir],2); nr = spac->nreq[d][dir][1]; - if (nlast+nr > spac->ind_req_nalloc) + if (nlast+nr > ireq->nalloc) { - spac->ind_req_nalloc = over_alloc_dd(nlast+nr); - srenew(spac->ind_req,spac->ind_req_nalloc); + ireq->nalloc = over_alloc_dd(nlast+nr); + srenew(ireq->ind,ireq->nalloc); } /* Communicate the indices */ dd_sendrecv_int(dd,d,dir==0 ? 
dddirForward : dddirBackward, - spac->ind_req,nsend_ptr[1],spac->ind_req+nlast,nr); + ireq->ind,nsend_ptr[1],ireq->ind+nlast,nr); nlast += nr; } nsend[1] = nlast; @@ -560,13 +569,13 @@ static int setup_specat_communication(gmx_domdec_t *dd, nsend[0] = 0; for(i=0; iind_req[start+i]; + indr = ireq->ind[start+i]; ind = -1; /* Check if this is a home atom and if so ind will be set */ - if (!ga2la_get_home(dd->ga2la,ireq,&ind)) + if (!ga2la_get_home(dd->ga2la,indr,&ind)) { /* Search in the communicated atoms */ - ind = ga2la_specat[ireq]; + ind = gmx_hash_get_minone(ga2la_specat,indr); } if (ind >= 0) { @@ -588,7 +597,7 @@ static int setup_specat_communication(gmx_domdec_t *dd, srenew(spac->ibuf,spac->ibuf_nalloc); } /* Store the global index so we can send it now */ - spac->ibuf[spas->nsend] = ireq; + spac->ibuf[spas->nsend] = indr; if (i < n0) { nsend[0]++; @@ -659,41 +668,42 @@ static int setup_specat_communication(gmx_domdec_t *dd, /* Make a global to local index for the communication atoms */ for(i=nat_tot_prev; igatindex[i]] = i; + gmx_hash_change_or_set(ga2la_specat,dd->gatindex[i],i); } } /* Check that in the end we got the number of atoms we asked for */ - if (nrecv_local != spac->nind_req) + if (nrecv_local != ireq->n) { if (debug) { fprintf(debug,"Requested %d, received %d (tot recv %d)\n", - spac->nind_req,nrecv_local,nat_tot_specat-at_start); + ireq->n,nrecv_local,nat_tot_specat-at_start); if (gmx_debug_at) { - for(i=0; inind_req; i++) + for(i=0; in; i++) { + ind = gmx_hash_get_minone(ga2la_specat,ireq->ind[i]); fprintf(debug," %s%d", - ga2la_specat[spac->ind_req[i]]>=0 ? "" : "!", - spac->ind_req[i]+1); + (ind >= 0) ? "" : "!", + ireq->ind[i]+1); } fprintf(debug,"\n"); } } fprintf(stderr,"\nDD cell %d %d %d: Neighboring cells do not have atoms:", dd->ci[XX],dd->ci[YY],dd->ci[ZZ]); - for(i=0; inind_req; i++) + for(i=0; in; i++) { - if (ga2la_specat[spac->ind_req[i]] < 0) + if (gmx_hash_get_minone(ga2la_specat,ireq->ind[i]) < 0) { - fprintf(stderr," %d",spac->ind_req[i]+1); + fprintf(stderr," %d",ireq->ind[i]+1); } } fprintf(stderr,"\n"); gmx_fatal(FARGS,"DD cell %d %d %d could only obtain %d of the %d atoms that are connected via %ss from the neighboring cells. This probably means your %s lengths are too long compared to the domain decomposition cell size. Decrease the number of domain decomposition grid cells%s%s.", dd->ci[XX],dd->ci[YY],dd->ci[ZZ], - nrecv_local,spac->nind_req,specat_type, + nrecv_local,ireq->n,specat_type, specat_type,add_err, dd->bGridJump ? 
" or use the -rcon option of mdrun" : ""); } @@ -715,7 +725,8 @@ static void walk_out(int con,int con_offset,int a,int offset,int nrec, const gmx_ga2la_t ga2la,gmx_bool bHomeConnect, gmx_domdec_constraints_t *dc, gmx_domdec_specat_comm_t *dcc, - t_ilist *il_local) + t_ilist *il_local, + ind_req_t *ireq) { int a1_gl,a2_gl,a_loc,i,coni,b; const t_iatom *iap; @@ -763,18 +774,18 @@ static void walk_out(int con,int con_offset,int a,int offset,int nrec, dc->ncon++; } /* Check to not ask for the same atom more than once */ - if (dc->ga2la[offset+a] == -1) + if (gmx_hash_get_minone(dc->ga2la,offset+a) == -1) { assert(dcc); /* Add this non-home atom to the list */ - if (dcc->nind_req+1 > dcc->ind_req_nalloc) + if (ireq->n+1 > ireq->nalloc) { - dcc->ind_req_nalloc = over_alloc_large(dcc->nind_req+1); - srenew(dcc->ind_req,dcc->ind_req_nalloc); + ireq->nalloc = over_alloc_large(ireq->n+1); + srenew(ireq->ind,ireq->nalloc); } - dcc->ind_req[dcc->nind_req++] = offset + a; + ireq->ind[ireq->n++] = offset + a; /* Temporarily mark with -2, we get the index later */ - dc->ga2la[offset+a] = -2; + gmx_hash_set(dc->ga2la,offset+a,-2); } if (nrec > 0) @@ -798,141 +809,410 @@ static void walk_out(int con,int con_offset,int a,int offset,int nrec, { walk_out(coni,con_offset,b,offset,nrec-1, ncon1,ia1,ia2,at2con, - ga2la,FALSE,dc,dcc,il_local); + ga2la,FALSE,dc,dcc,il_local,ireq); } } } } } -int dd_make_local_constraints(gmx_domdec_t *dd,int at_start, - gmx_mtop_t *mtop, - gmx_constr_t constr,int nrec, - t_ilist *il_local) +static void atoms_to_settles(gmx_domdec_t *dd, + const gmx_mtop_t *mtop, + const int *cginfo, + const int **at2settle_mt, + int cg_start,int cg_end, + t_ilist *ils_local, + ind_req_t *ireq) { - t_blocka *at2con_mt,*at2con; gmx_ga2la_t ga2la; - int ncon1,ncon2; + gmx_mtop_atomlookup_t alook; + int settle; + int nral,sa; + int cg,a,a_gl,a_glsa,a_gls[3],a_locs[3]; + int mb,molnr,a_mol,offset; + const gmx_molblock_t *molb; + const t_iatom *ia1; + gmx_bool a_home[3]; + int nlocal; + gmx_bool bAssign; + + ga2la = dd->ga2la; + + alook = gmx_mtop_atomlookup_settle_init(mtop); + + nral = NRAL(F_SETTLE); + + for(cg=cg_start; cgcgindex[cg]; acgindex[cg+1]; a++) + { + a_gl = dd->gatindex[a]; + + gmx_mtop_atomnr_to_molblock_ind(alook,a_gl,&mb,&molnr,&a_mol); + molb = &mtop->molblock[mb]; + + settle = at2settle_mt[molb->type][a_mol]; + + if (settle >= 0) + { + offset = a_gl - a_mol; + + ia1 = mtop->moltype[molb->type].ilist[F_SETTLE].iatoms; + + bAssign = FALSE; + nlocal = 0; + for(sa=0; sanr+1+nral > ils_local->nalloc) + { + ils_local->nalloc = over_alloc_dd(ils_local->nr+1+nral); + srenew(ils_local->iatoms,ils_local->nalloc); + } + + ils_local->iatoms[ils_local->nr++] = ia1[settle*4]; + + for(sa=0; saiatoms[ils_local->nr++] = a_locs[sa]; + } + else + { + ils_local->iatoms[ils_local->nr++] = -a_gls[sa] - 1; + /* Add this non-home atom to the list */ + if (ireq->n+1 > ireq->nalloc) + { + ireq->nalloc = over_alloc_large(ireq->n+1); + srenew(ireq->ind,ireq->nalloc); + } + ireq->ind[ireq->n++] = a_gls[sa]; + /* A check on double atom requests is + * not required for settle. 
+ */ + } + } + } + } + } + } + } + + gmx_mtop_atomlookup_destroy(alook); +} + +static void atoms_to_constraints(gmx_domdec_t *dd, + const gmx_mtop_t *mtop, + const int *cginfo, + const t_blocka *at2con_mt,int nrec, + t_ilist *ilc_local, + ind_req_t *ireq) +{ + const t_blocka *at2con; + gmx_ga2la_t ga2la; + gmx_mtop_atomlookup_t alook; + int ncon1; gmx_molblock_t *molb; t_iatom *ia1,*ia2,*iap; - int nhome,a,a_gl,a_mol,a_loc,b_lo,offset,mb,molnr,b_mol,i,con,con_offset; + int nhome,cg,a,a_gl,a_mol,a_loc,b_lo,offset,mb,molnr,b_mol,i,con,con_offset; gmx_domdec_constraints_t *dc; - int at_end,*ga2la_specat,j; + gmx_domdec_specat_comm_t *dcc; - dc = dd->constraints; + dc = dd->constraints; + dcc = dd->constraint_comm; - at2con_mt = atom2constraints_moltype(constr); ga2la = dd->ga2la; - - dc->ncon = 0; - il_local->nr = 0; + + alook = gmx_mtop_atomlookup_init(mtop); + nhome = 0; - if (dd->constraint_comm) - { - dd->constraint_comm->nind_req = 0; - } - for(a=0; anat_home; a++) + for(cg=0; cgncg_home; cg++) { - a_gl = dd->gatindex[a]; + if (GET_CGINFO_CONSTR(cginfo[cg])) + { + for(a=dd->cgindex[cg]; acgindex[cg+1]; a++) + { + a_gl = dd->gatindex[a]; - gmx_mtop_atomnr_to_molblock_ind(mtop,a_gl,&mb,&molnr,&a_mol); - molb = &mtop->molblock[mb]; + gmx_mtop_atomnr_to_molblock_ind(alook,a_gl,&mb,&molnr,&a_mol); + molb = &mtop->molblock[mb]; - ncon1 = mtop->moltype[molb->type].ilist[F_CONSTR].nr/3; - ncon2 = mtop->moltype[molb->type].ilist[F_CONSTRNC].nr/3; - if (ncon1 > 0 || ncon2 > 0) - { - ia1 = mtop->moltype[molb->type].ilist[F_CONSTR].iatoms; - ia2 = mtop->moltype[molb->type].ilist[F_CONSTRNC].iatoms; - - /* Calculate the global constraint number offset for the molecule. - * This is only required for the global index to make sure - * that we use each constraint only once. - */ - con_offset = dc->molb_con_offset[mb] + molnr*dc->molb_ncon_mol[mb]; + ncon1 = mtop->moltype[molb->type].ilist[F_CONSTR].nr/NRAL(F_SETTLE); + + ia1 = mtop->moltype[molb->type].ilist[F_CONSTR].iatoms; + ia2 = mtop->moltype[molb->type].ilist[F_CONSTRNC].iatoms; + + /* Calculate the global constraint number offset for the molecule. + * This is only required for the global index to make sure + * that we use each constraint only once. 
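
/*
 * [Illustrative sketch added by the editor; not part of the patch.]
 * atoms_to_settles() above stores an atom that is not (yet) local directly
 * in the local ilist with the reversible encoding enc = -a_global - 1, and
 * queues a_global in ireq. The "- 1" keeps every encoded entry strictly
 * negative (global atom 0 maps to -1, not 0), so encoded entries can never
 * be confused with real local indices. After setup_specat_communication()
 * has assigned local indices, the fill-in loops in
 * dd_make_local_constraints() decode and resolve the entry.
 */
static int encode_nonlocal(int a_global) { return -a_global - 1; }
static int decode_nonlocal(int enc)      { return -enc - 1;      }

/* Mirrors the fill-in step for F_SETTLE/F_CONSTR ilist entries:
 *
 *     if (iatoms[j] < 0)
 *     {
 *         iatoms[j] = gmx_hash_get_minone(ga2la_specat,
 *                                         decode_nonlocal(iatoms[j]));
 *     }
 */
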
+ */ + con_offset = + dc->molb_con_offset[mb] + molnr*dc->molb_ncon_mol[mb]; - /* The global atom number offset for this molecule */ - offset = a_gl - a_mol; - at2con = &at2con_mt[molb->type]; - for(i=at2con->index[a_mol]; iindex[a_mol+1]; i++) - { - con = at2con->a[i]; - iap = constr_iatomptr(ncon1,ia1,ia2,con); - if (a_mol == iap[1]) - { - b_mol = iap[2]; - } - else - { - b_mol = iap[1]; - } - if (ga2la_get_home(ga2la,offset+b_mol,&a_loc)) + /* The global atom number offset for this molecule */ + offset = a_gl - a_mol; + at2con = &at2con_mt[molb->type]; + for(i=at2con->index[a_mol]; iindex[a_mol+1]; i++) { - /* Add this fully home constraint at the first atom */ - if (a_mol < b_mol) + con = at2con->a[i]; + iap = constr_iatomptr(ncon1,ia1,ia2,con); + if (a_mol == iap[1]) { - if (dc->ncon+1 > dc->con_nalloc) - { - dc->con_nalloc = over_alloc_large(dc->ncon+1); - srenew(dc->con_gl,dc->con_nalloc); - srenew(dc->con_nlocat,dc->con_nalloc); - } - dc->con_gl[dc->ncon] = con_offset + con; - dc->con_nlocat[dc->ncon] = 2; - if (il_local->nr + 3 > il_local->nalloc) + b_mol = iap[2]; + } + else + { + b_mol = iap[1]; + } + if (ga2la_get_home(ga2la,offset+b_mol,&a_loc)) + { + /* Add this fully home constraint at the first atom */ + if (a_mol < b_mol) { - il_local->nalloc = over_alloc_dd(il_local->nr + 3); - srenew(il_local->iatoms,il_local->nalloc); + if (dc->ncon+1 > dc->con_nalloc) + { + dc->con_nalloc = over_alloc_large(dc->ncon+1); + srenew(dc->con_gl,dc->con_nalloc); + srenew(dc->con_nlocat,dc->con_nalloc); + } + dc->con_gl[dc->ncon] = con_offset + con; + dc->con_nlocat[dc->ncon] = 2; + if (ilc_local->nr + 3 > ilc_local->nalloc) + { + ilc_local->nalloc = over_alloc_dd(ilc_local->nr + 3); + srenew(ilc_local->iatoms,ilc_local->nalloc); + } + b_lo = a_loc; + ilc_local->iatoms[ilc_local->nr++] = iap[0]; + ilc_local->iatoms[ilc_local->nr++] = (a_gl == iap[1] ? a : b_lo); + ilc_local->iatoms[ilc_local->nr++] = (a_gl == iap[1] ? b_lo : a ); + dc->ncon++; + nhome++; } - b_lo = a_loc; - il_local->iatoms[il_local->nr++] = iap[0]; - il_local->iatoms[il_local->nr++] = (a_gl == iap[1] ? a : b_lo); - il_local->iatoms[il_local->nr++] = (a_gl == iap[1] ? b_lo : a ); - dc->ncon++; - nhome++; } - } - else - { - /* We need the nrec constraints coupled to this constraint, - * so we need to walk out of the home cell by nrec+1 atoms, - * since already atom bg is not locally present. - * Therefore we call walk_out with nrec recursions to go - * after this first call. - */ - walk_out(con,con_offset,b_mol,offset,nrec, - ncon1,ia1,ia2,at2con, - dd->ga2la,TRUE,dc,dd->constraint_comm,il_local); + else + { + /* We need the nrec constraints coupled to this constraint, + * so we need to walk out of the home cell by nrec+1 atoms, + * since already atom bg is not locally present. + * Therefore we call walk_out with nrec recursions to go + * after this first call. + */ + walk_out(con,con_offset,b_mol,offset,nrec, + ncon1,ia1,ia2,at2con, + dd->ga2la,TRUE,dc,dcc,ilc_local,ireq); + } } } } } - + + gmx_mtop_atomlookup_destroy(alook); + if (debug) { fprintf(debug, "Constraints: home %3d border %3d atoms: %3d\n", nhome,dc->ncon-nhome, - dd->constraint_comm ? dd->constraint_comm->nind_req : 0); + dd->constraint_comm ? 
ireq->n : 0); + } +} + +int dd_make_local_constraints(gmx_domdec_t *dd,int at_start, + const gmx_mtop_t *mtop, + const int *cginfo, + gmx_constr_t constr,int nrec, + t_ilist *il_local) +{ + gmx_domdec_constraints_t *dc; + t_ilist *ilc_local,*ils_local; + ind_req_t *ireq; + const t_blocka *at2con_mt; + const int **at2settle_mt; + gmx_hash_t ga2la_specat; + int at_end,i,j; + t_iatom *iap; + + dc = dd->constraints; + + ilc_local = &il_local[F_CONSTR]; + ils_local = &il_local[F_SETTLE]; + + dc->ncon = 0; + ilc_local->nr = 0; + if (dd->constraint_comm) + { + at2con_mt = atom2constraints_moltype(constr); + ireq = &dd->constraint_comm->ireq[0]; + ireq->n = 0; + } + else + { + at2con_mt = NULL; + ireq = NULL; + } + + if (dd->bInterCGsettles) + { + at2settle_mt = atom2settle_moltype(constr); + ils_local->nr = 0; + } + else + { + /* Settle works inside charge groups, we assigned them already */ + at2settle_mt = NULL; + } + + if (at2settle_mt == NULL) + { + atoms_to_constraints(dd,mtop,cginfo,at2con_mt,nrec, + ilc_local,ireq); + } + else + { + int t0_set; + int thread; + + /* Do the constraints, if present, on the first thread. + * Do the settles on all other threads. + */ + t0_set = ((at2con_mt != NULL && dc->nthread > 1) ? 1 : 0); + +#pragma omp parallel for num_threads(dc->nthread) schedule(static) + for(thread=0; threadnthread; thread++) + { + if (at2con_mt && thread == 0) + { + atoms_to_constraints(dd,mtop,cginfo,at2con_mt,nrec, + ilc_local,ireq); + } + + if (thread >= t0_set) + { + int cg0,cg1; + t_ilist *ilst; + ind_req_t *ireqt; + + /* Distribute the settle check+assignments over + * dc->nthread or dc->nthread-1 threads. + */ + cg0 = (dd->ncg_home*(thread-t0_set ))/(dc->nthread-t0_set); + cg1 = (dd->ncg_home*(thread-t0_set+1))/(dc->nthread-t0_set); + + if (thread == t0_set) + { + ilst = ils_local; + } + else + { + ilst = &dc->ils[thread]; + } + ilst->nr = 0; + + ireqt = &dd->constraint_comm->ireq[thread]; + if (thread > 0) + { + ireqt->n = 0; + } + + atoms_to_settles(dd,mtop,cginfo,at2settle_mt, + cg0,cg1, + ilst,ireqt); + } + } + + /* Combine the generate settles and requested indices */ + for(thread=1; threadnthread; thread++) + { + t_ilist *ilst; + ind_req_t *ireqt; + int ia; + + if (thread > t0_set) + { + ilst = &dc->ils[thread]; + if (ils_local->nr + ilst->nr > ils_local->nalloc) + { + ils_local->nalloc = over_alloc_large(ils_local->nr + ilst->nr); + srenew(ils_local->iatoms,ils_local->nalloc); + } + for(ia=0; ianr; ia++) + { + ils_local->iatoms[ils_local->nr+ia] = ilst->iatoms[ia]; + } + ils_local->nr += ilst->nr; + } + + ireqt = &dd->constraint_comm->ireq[thread]; + if (ireq->n+ireqt->n > ireq->nalloc) + { + ireq->nalloc = over_alloc_large(ireq->n+ireqt->n); + srenew(ireq->ind,ireq->nalloc); + } + for(ia=0; ian; ia++) + { + ireq->ind[ireq->n+ia] = ireqt->ind[ia]; + } + ireq->n += ireqt->n; + } + + if (debug) + { + fprintf(debug,"Settles: total %3d\n",ils_local->nr/4); + } } if (dd->constraint_comm) { + int nral1; + at_end = - setup_specat_communication(dd,dd->constraint_comm, + setup_specat_communication(dd,ireq,dd->constraint_comm, dd->constraints->ga2la, at_start,2, "constraint"," or lincs-order"); /* Fill in the missing indices */ ga2la_specat = dd->constraints->ga2la; - for(i=0; inr; i+=3) + + nral1 = 1 + NRAL(F_CONSTR); + for(i=0; inr; i+=nral1) + { + iap = ilc_local->iatoms + i; + for(j=1; jnr; i+=nral1) { - iap = il_local->iatoms + i; - for(j=1; j<3; j++) + iap = ils_local->iatoms + i; + for(j=1; jvsite_comm; + ireq = &spac->ireq[0]; ga2la_specat = dd->ga2la_vsite; - 
spac->nind_req = 0; + ireq->n = 0; /* Loop over all the home vsites */ for(ftype=0; ftypega2la_vsite,a) == -1) { /* Add this non-home atom to the list */ - if (spac->nind_req+1 > spac->ind_req_nalloc) + if (ireq->n+1 > ireq->nalloc) { - spac->ind_req_nalloc = - over_alloc_small(spac->nind_req+1); - srenew(spac->ind_req,spac->ind_req_nalloc); + ireq->nalloc = over_alloc_large(ireq->n+1); + srenew(ireq->ind,ireq->nalloc); } - spac->ind_req[spac->nind_req++] = a; + ireq->ind[ireq->n++] = a; /* Temporarily mark with -2, * we get the index later. */ - ga2la_specat[a] = -2; + gmx_hash_set(ga2la_specat,a,-2); } } } @@ -998,7 +1279,7 @@ int dd_make_local_vsites(gmx_domdec_t *dd,int at_start,t_ilist *lil) } } - at_end = setup_specat_communication(dd,dd->vsite_comm,ga2la_specat, + at_end = setup_specat_communication(dd,ireq,dd->vsite_comm,ga2la_specat, at_start,1,"vsite",""); /* Fill in the missing indices */ @@ -1015,7 +1296,7 @@ int dd_make_local_vsites(gmx_domdec_t *dd,int at_start,t_ilist *lil) { if (iatoms[j] < 0) { - iatoms[j] = ga2la_specat[-iatoms[j]-1]; + iatoms[j] = gmx_hash_get_minone(ga2la_specat,-iatoms[j]-1); } } } @@ -1025,8 +1306,19 @@ int dd_make_local_vsites(gmx_domdec_t *dd,int at_start,t_ilist *lil) return at_end; } +static gmx_domdec_specat_comm_t *specat_comm_init(int nthread) +{ + gmx_domdec_specat_comm_t *spac; + + snew(spac,1); + spac->nthread = nthread; + snew(spac->ireq,spac->nthread); + + return spac; +} + void init_domdec_constraints(gmx_domdec_t *dd, - int natoms,gmx_mtop_t *mtop, + gmx_mtop_t *mtop, gmx_constr_t constr) { gmx_domdec_constraints_t *dc; @@ -1055,22 +1347,28 @@ void init_domdec_constraints(gmx_domdec_t *dd, ncon += molb->nmol*dc->molb_ncon_mol[mb]; } - snew(dc->gc_req,ncon); - for(c=0; cgc_req[c] = 0; - } - - snew(dc->ga2la,natoms); - for(a=0; a 0) { - dc->ga2la[a] = -1; + snew(dc->gc_req,ncon); + for(c=0; cgc_req[c] = 0; + } } - - snew(dd->constraint_comm,1); + + /* Use a hash table for the global to local index. + * The number of keys is a rough estimate, it will be optimized later. + */ + dc->ga2la = gmx_hash_init(min(mtop->natoms/20, + mtop->natoms/(2*dd->nnodes))); + + dc->nthread = gmx_omp_nthreads_get(emntDomdec); + snew(dc->ils,dc->nthread); + + dd->constraint_comm = specat_comm_init(dc->nthread); } -void init_domdec_vsites(gmx_domdec_t *dd,int natoms) +void init_domdec_vsites(gmx_domdec_t *dd,int n_intercg_vsite) { int i; gmx_domdec_constraints_t *dc; @@ -1080,11 +1378,11 @@ void init_domdec_vsites(gmx_domdec_t *dd,int natoms) fprintf(debug,"Begin init_domdec_vsites\n"); } - snew(dd->ga2la_vsite,natoms); - for(i=0; iga2la_vsite[i] = -1; - } + /* Use a hash table for the global to local index. + * The number of keys is a rough estimate, it will be optimized later. + */ + dd->ga2la_vsite = gmx_hash_init(min(n_intercg_vsite/20, + n_intercg_vsite/(2*dd->nnodes))); - snew(dd->vsite_comm,1); + dd->vsite_comm = specat_comm_init(1); } diff --git a/src/mdlib/domdec_top.c b/src/mdlib/domdec_top.c index e28a958c9f..65917df616 100644 --- a/src/mdlib/domdec_top.c +++ b/src/mdlib/domdec_top.c @@ -36,6 +36,8 @@ #include "mshift.h" #include "vsite.h" #include "gmx_ga2la.h" +#include "force.h" +#include "gmx_omp_nthreads.h" /* for dd_init_local_state */ #define NITEM_DD_INIT_LOCAL_STATE 5 @@ -55,12 +57,23 @@ typedef struct { typedef struct gmx_reverse_top { gmx_bool bExclRequired; /* Do we require all exclusions to be assigned? */ gmx_bool bConstr; /* Are there constraints in this revserse top? */ + gmx_bool bSettle; /* Are there settles in this revserse top? 
*/ gmx_bool bBCheck; /* All bonded interactions have to be assigned? */ gmx_bool bMultiCGmols; /* Are the multi charge-group molecules? */ gmx_reverse_ilist_t *ril_mt; /* Reverse ilist for all moltypes */ int ril_mt_tot_size; int ilsort; /* The sorting state of bondeds for free energy */ gmx_molblock_ind_t *mbi; + int nmolblock; + + /* Work data structures for multi-threading */ + int nthread; + t_idef *idef_thread; + int ***vsite_pbc; + int **vsite_pbc_nalloc; + int *nbonded_thread; + t_blocka *excl_thread; + int *excl_count_thread; /* Pointers only used for an error message */ gmx_mtop_t *err_top_global; @@ -84,13 +97,15 @@ static int nral_rt(int ftype) return nral; } -static gmx_bool dd_check_ftype(int ftype,gmx_bool bBCheck,gmx_bool bConstr) +/* This function tells which interactions need to be assigned exactly once */ +static gmx_bool dd_check_ftype(int ftype,gmx_bool bBCheck, + gmx_bool bConstr,gmx_bool bSettle) { return (((interaction_function[ftype].flags & IF_BOND) && !(interaction_function[ftype].flags & IF_VSITE) && (bBCheck || !(interaction_function[ftype].flags & IF_LIMZERO))) || - ftype == F_SETTLE || - (bConstr && (ftype == F_CONSTR || ftype == F_CONSTRNC))); + (bConstr && (ftype == F_CONSTR || ftype == F_CONSTRNC)) || + (bSettle && ftype == F_SETTLE)); } static void print_error_header(FILE *fplog,char *moltypename,int nprint) @@ -126,7 +141,7 @@ static void print_missing_interactions_mb(FILE *fplog,t_commrec *cr, gatindex = cr->dd->gatindex; for(ftype=0; ftypebBCheck,rt->bConstr)) + if (dd_check_ftype(ftype,rt->bBCheck,rt->bConstr,rt->bSettle)) { nral = NRAL(ftype); il = &idef->il[ftype]; @@ -312,8 +327,8 @@ void dd_print_missing_interactions(FILE *fplog,t_commrec *cr,int local_count, g if (((interaction_function[ftype].flags & IF_BOND) && (dd->reverse_top->bBCheck || !(interaction_function[ftype].flags & IF_LIMZERO))) - || ftype == F_SETTLE - || (dd->reverse_top->bConstr && ftype == F_CONSTR)) + || (dd->reverse_top->bConstr && ftype == F_CONSTR) + || (dd->reverse_top->bSettle && ftype == F_SETTLE)) { nral = NRAL(ftype); n = gmx_mtop_ftype_count(err_top_global,ftype); @@ -361,17 +376,37 @@ void dd_print_missing_interactions(FILE *fplog,t_commrec *cr,int local_count, g } } -static void global_atomnr_to_moltype_ind(gmx_molblock_ind_t *mbi,int i_gl, +static void global_atomnr_to_moltype_ind(gmx_reverse_top_t *rt,int i_gl, int *mb,int *mt,int *mol,int *i_mol) { int molb; - *mb = 0; - while (i_gl >= mbi->a_end) { - (*mb)++; - mbi++; + + gmx_molblock_ind_t *mbi = rt->mbi; + int start = 0; + int end = rt->nmolblock; /* exclusive */ + int mid; + + /* binary search for molblock_ind */ + while (TRUE) { + mid = (start+end)/2; + if (i_gl >= mbi[mid].a_end) + { + start = mid+1; + } + else if (i_gl < mbi[mid].a_start) + { + end = mid; + } + else + { + break; + } } + *mb = mid; + mbi += mid; + *mt = mbi->type; *mol = (i_gl - mbi->a_start) / mbi->natoms_mol; *i_mol = (i_gl - mbi->a_start) - (*mol)*mbi->natoms_mol; @@ -410,7 +445,8 @@ static int count_excls(t_block *cgs,t_blocka *excls,int *n_intercg_excl) static int low_make_reverse_ilist(t_ilist *il_mt,t_atom *atom, int **vsite_pbc, int *count, - gmx_bool bConstr,gmx_bool bBCheck, + gmx_bool bConstr,gmx_bool bSettle, + gmx_bool bBCheck, int *r_index,int *r_il, gmx_bool bLinkToAllAtoms, gmx_bool bAssign) @@ -426,8 +462,9 @@ static int low_make_reverse_ilist(t_ilist *il_mt,t_atom *atom, for(ftype=0; ftypeilist,molt->atoms.atom,vsite_pbc, count, - bConstr,bBCheck,NULL,NULL, + bConstr,bSettle,bBCheck,NULL,NULL, bLinkToAllAtoms,FALSE); 
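
/*
 * [Illustrative sketch added by the editor; not part of the patch.]
 * global_atomnr_to_moltype_ind() above replaces a linear molblock scan
 * with a binary search over the half-open atom ranges [a_start, a_end) of
 * the rt->nmolblock blocks. This matters because the lookup now runs once
 * per home atom inside the multi-threaded assignment loops. A standalone
 * version of the same search (block_t is a stand-in for
 * gmx_molblock_ind_t):
 */
typedef struct { int a_start, a_end; } block_t;

/* Returns the index of the block whose range contains atom i_gl.
 * Assumes the ranges are sorted and contiguous and that i_gl is covered,
 * which holds for any valid global atom number in the topology.
 */
static int find_block(const block_t *b, int nblock, int i_gl)
{
    int start = 0, end = nblock;        /* end is exclusive */

    while (1)
    {
        int mid = (start + end)/2;

        if      (i_gl >= b[mid].a_end)   { start = mid + 1; }
        else if (i_gl <  b[mid].a_start) { end   = mid;     }
        else                             { return mid;      }
    }
}
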
snew(ril_mt->index,nat_mt+1); @@ -537,7 +575,7 @@ static int make_reverse_ilist(gmx_moltype_t *molt, nint_mt = low_make_reverse_ilist(molt->ilist,molt->atoms.atom,vsite_pbc, count, - bConstr,bBCheck, + bConstr,bSettle,bBCheck, ril_mt->index,ril_mt->il, bLinkToAllAtoms,TRUE); @@ -554,18 +592,20 @@ static void destroy_reverse_ilist(gmx_reverse_ilist_t *ril) static gmx_reverse_top_t *make_reverse_top(gmx_mtop_t *mtop,gmx_bool bFE, int ***vsite_pbc_molt, - gmx_bool bConstr, + gmx_bool bConstr,gmx_bool bSettle, gmx_bool bBCheck,int *nint) { int mt,i,mb; gmx_reverse_top_t *rt; int *nint_mt; gmx_moltype_t *molt; + int thread; snew(rt,1); /* Should we include constraints (for SHAKE) in rt? */ rt->bConstr = bConstr; + rt->bSettle = bSettle; rt->bBCheck = bBCheck; rt->bMultiCGmols = FALSE; @@ -583,7 +623,7 @@ static gmx_reverse_top_t *make_reverse_top(gmx_mtop_t *mtop,gmx_bool bFE, /* Make the atom to interaction list for this molecule type */ nint_mt[mt] = make_reverse_ilist(molt,vsite_pbc_molt ? vsite_pbc_molt[mt] : NULL, - rt->bConstr,rt->bBCheck,FALSE, + rt->bConstr,rt->bSettle,rt->bBCheck,FALSE, &rt->ril_mt[mt]); rt->ril_mt_tot_size += rt->ril_mt[mt].index[molt->atoms.nr]; @@ -610,6 +650,7 @@ static gmx_reverse_top_t *make_reverse_top(gmx_mtop_t *mtop,gmx_bool bFE, /* Make a molblock index for fast searching */ snew(rt->mbi,mtop->nmolblock); + rt->nmolblock = mtop->nmolblock; i = 0; for(mb=0; mbnmolblock; mb++) { @@ -619,6 +660,22 @@ static gmx_reverse_top_t *make_reverse_top(gmx_mtop_t *mtop,gmx_bool bFE, rt->mbi[mb].natoms_mol = mtop->molblock[mb].natoms_mol; rt->mbi[mb].type = mtop->molblock[mb].type; } + + rt->nthread = gmx_omp_nthreads_get(emntDomdec); + snew(rt->idef_thread,rt->nthread); + if (vsite_pbc_molt != NULL) + { + snew(rt->vsite_pbc,rt->nthread); + snew(rt->vsite_pbc_nalloc,rt->nthread); + for(thread=0; threadnthread; thread++) + { + snew(rt->vsite_pbc[thread],F_VSITEN-F_VSITE2+1); + snew(rt->vsite_pbc_nalloc[thread],F_VSITEN-F_VSITE2+1); + } + } + snew(rt->nbonded_thread,rt->nthread); + snew(rt->excl_thread,rt->nthread); + snew(rt->excl_count_thread,rt->nthread); return rt; } @@ -628,7 +685,7 @@ void dd_make_reverse_top(FILE *fplog, gmx_vsite_t *vsite,gmx_constr_t constr, t_inputrec *ir,gmx_bool bBCheck) { - int mb,natoms,n_recursive_vsite,nexcl,nexcl_icg,a; + int mb,n_recursive_vsite,nexcl,nexcl_icg,a; gmx_molblock_t *molb; gmx_moltype_t *molt; @@ -636,10 +693,16 @@ void dd_make_reverse_top(FILE *fplog, { fprintf(fplog,"\nLinking all bonded interactions to atoms\n"); } + + /* If normal and/or settle constraints act only within charge groups, + * we can store them in the reverse top and simply assign them to domains. + * Otherwise we need to assign them to multiple domains and set up + * the parallel version constraint algoirthm(s). + */ dd->reverse_top = make_reverse_top(mtop,ir->efep!=efepNO, vsite ? 
vsite->vsite_pbc_molt : NULL, - !dd->bInterCGcons, + !dd->bInterCGcons,!dd->bInterCGsettles, bBCheck,&dd->nbonded_global); if (dd->reverse_top->ril_mt_tot_size >= 200000 && @@ -679,8 +742,6 @@ void dd_make_reverse_top(FILE *fplog, dd->n_intercg_excl,eel_names[ir->coulombtype]); } } - - natoms = mtop->natoms; if (vsite && vsite->n_intercg_vsite > 0) { @@ -690,12 +751,12 @@ void dd_make_reverse_top(FILE *fplog, "will an extra communication step for selected coordinates and forces\n", vsite->n_intercg_vsite); } - init_domdec_vsites(dd,natoms); + init_domdec_vsites(dd,vsite->n_intercg_vsite); } - if (dd->bInterCGcons) + if (dd->bInterCGcons || dd->bInterCGsettles) { - init_domdec_constraints(dd,natoms,mtop,constr); + init_domdec_constraints(dd,mtop,constr); } if (fplog) { @@ -710,7 +771,7 @@ static inline void add_ifunc(int nral,t_iatom *tiatoms,t_ilist *il) if (il->nr+1+nral > il->nalloc) { - il->nalloc += over_alloc_large(il->nr+1+nral); + il->nalloc = over_alloc_large(il->nr+1+nral); srenew(il->iatoms,il->nalloc); } liatoms = il->iatoms + il->nr; @@ -721,8 +782,9 @@ static inline void add_ifunc(int nral,t_iatom *tiatoms,t_ilist *il) il->nr += 1 + nral; } -static void add_posres(int mol,int a_mol,gmx_molblock_t *molb, - t_iatom *iatoms,t_idef *idef) +static void add_posres(int mol,int a_mol,const gmx_molblock_t *molb, + t_iatom *iatoms,const t_iparams *ip_in, + t_idef *idef) { int n,a_molb; t_iparams *ip; @@ -738,9 +800,9 @@ static void add_posres(int mol,int a_mol,gmx_molblock_t *molb, } ip = &idef->iparams_posres[n]; /* Copy the force constants */ - *ip = idef->iparams[iatoms[0]]; + *ip = ip_in[iatoms[0]]; - /* Get the position restriant coordinats from the molblock */ + /* Get the position restraint coordinates from the molblock */ a_molb = mol*molb->natoms_mol + a_mol; if (a_molb >= molb->nposres_xA) { @@ -918,397 +980,453 @@ static real dd_dist2(t_pbc *pbc_null,rvec *cg_cm,const int *la2lc,int i,int j) return norm2(dx); } -static int make_local_bondeds(gmx_domdec_t *dd,gmx_domdec_zones_t *zones, - gmx_molblock_t *molb, - gmx_bool bRCheckMB,ivec rcheck,gmx_bool bRCheck2B, - real rc, - int *la2lc,t_pbc *pbc_null,rvec *cg_cm, - t_idef *idef,gmx_vsite_t *vsite) +/* Append the nsrc t_blocka block structures in src to *dest */ +static void combine_blocka(t_blocka *dest,const t_blocka *src,int nsrc) +{ + int ni,na,s,i; + + ni = src[nsrc-1].nr; + na = 0; + for(s=0; s dest->nalloc_index) + { + dest->nalloc_index = over_alloc_large(ni+1); + srenew(dest->index,dest->nalloc_index); + } + if (dest->nra + na > dest->nalloc_a) + { + dest->nalloc_a = over_alloc_large(dest->nra+na); + srenew(dest->a,dest->nalloc_a); + } + for(s=0; snr+1; iindex[i] = dest->nra + src[s].index[i]; + } + for(i=0; ia[dest->nra+i] = src[s].a[i]; + } + dest->nr = src[s].nr; + dest->nra += src[s].nra; + } +} + +/* Append the nsrc t_idef structures in src to *dest, + * virtual sites need special attention, as pbc info differs per vsite. 
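
/*
 * [Illustrative sketch added by the editor; not part of the patch.]
 * combine_blocka() above concatenates per-thread t_blocka structures. The
 * one subtle point is that index[] entries are offsets into a[], so
 * everything appended must be shifted by the number of a[] entries the
 * destination already holds. The same idea for two plain CSR-style blocks
 * stored back to back (the real function differs in bookkeeping because
 * the per-thread blocks share one global atom numbering):
 */

/* Preconditions: src_index[0] == 0, dest_index[*dest_nrow] == *dest_na,
 * and the destination arrays have been grown beforehand (srenew above).
 */
static void append_rows(int *dest_index, int *dest_nrow,
                        int *dest_a,     int *dest_na,
                        const int *src_index, int src_nrow,
                        const int *src_a,     int src_na)
{
    int i;

    for (i = 1; i <= src_nrow; i++)
    {
        dest_index[*dest_nrow + i] = *dest_na + src_index[i];
    }
    for (i = 0; i < src_na; i++)
    {
        dest_a[*dest_na + i] = src_a[i];
    }
    *dest_nrow += src_nrow;
    *dest_na   += src_na;
}
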
+ */ +static void combine_idef(t_idef *dest,const t_idef *src,int nsrc, + gmx_vsite_t *vsite,int ***vsite_pbc_t) { - int nzone,nizone,ic,la0,la1,i,i_gl,mb,mt,mol,i_mol,j,ftype,nral,d,k; - int *index,*rtil,**vsite_pbc,*vsite_pbc_nalloc; + int ftype,n,s,i; + t_ilist *ild; + const t_ilist *ils; + gmx_bool vpbc; + int nral1=0,ftv=0; + + for(ftype=0; ftype 0) + { + ild = &dest->il[ftype]; + + if (ild->nr + n > ild->nalloc) + { + ild->nalloc = over_alloc_large(ild->nr+n); + srenew(ild->iatoms,ild->nalloc); + } + + vpbc = ((interaction_function[ftype].flags & IF_VSITE) && + vsite->vsite_pbc_loc != NULL); + if (vpbc) + { + nral1 = 1 + NRAL(ftype); + ftv = ftype - F_VSITE2; + if ((ild->nr + n)/nral1 > vsite->vsite_pbc_loc_nalloc[ftv]) + { + vsite->vsite_pbc_loc_nalloc[ftv] = + over_alloc_large((ild->nr + n)/nral1); + srenew(vsite->vsite_pbc_loc[ftv], + vsite->vsite_pbc_loc_nalloc[ftv]); + } + } + + for(s=0; snr; i++) + { + ild->iatoms[ild->nr+i] = ils->iatoms[i]; + } + if (vpbc) + { + for(i=0; inr; i+=nral1) + { + vsite->vsite_pbc_loc[ftv][(ild->nr+i)/nral1] = + vsite_pbc_t[s][ftv][i/nral1]; + } + } + + ild->nr += ils->nr; + } + } + } + + /* Position restraints need an additional treatment */ + if (dest->il[F_POSRES].nr > 0) + { + n = dest->il[F_POSRES].nr/2; + if (n > dest->iparams_posres_nalloc) + { + dest->iparams_posres_nalloc = over_alloc_large(n); + srenew(dest->iparams_posres,dest->iparams_posres_nalloc); + } + /* Set n to the number of original position restraints in dest */ + for(s=0; sil[F_POSRES].iatoms[n*2] = n; + /* Copy the position restraint force parameters */ + dest->iparams_posres[n] = src[s].iparams_posres[i]; + n++; + } + } + } +} + +/* This function looks up and assigns bonded interactions for zone iz. + * With thread parallelizing each thread acts on a different atom range: + * at_start to at_end. 
+ */ +static int make_bondeds_zone(gmx_domdec_t *dd, + const gmx_domdec_zones_t *zones, + const gmx_molblock_t *molb, + gmx_bool bRCheckMB,ivec rcheck,gmx_bool bRCheck2B, + real rc2, + int *la2lc,t_pbc *pbc_null,rvec *cg_cm, + const t_iparams *ip_in, + t_idef *idef,gmx_vsite_t *vsite, + int **vsite_pbc, + int *vsite_pbc_nalloc, + int iz,int nzone, + int at_start,int at_end) +{ + int i,i_gl,mb,mt,mol,i_mol,j,ftype,nral,d,k; + int *index,*rtil; t_iatom *iatoms,tiatoms[1+MAXATOMLIST]; gmx_bool bBCheck,bUse,bLocal; - real rc2; ivec k_zero,k_plus; gmx_ga2la_t ga2la; int a_loc; - int kc; - gmx_domdec_ns_ranges_t *izone; + int kz; + int nizone; + const gmx_domdec_ns_ranges_t *izone; gmx_reverse_top_t *rt; - gmx_molblock_ind_t *mbi; int nbonded_local; - - nzone = zones->n; + nizone = zones->nizone; izone = zones->izone; - rc2 = rc*rc; - - if (vsite && vsite->n_intercg_vsite > 0) - { - vsite_pbc = vsite->vsite_pbc_loc; - vsite_pbc_nalloc = vsite->vsite_pbc_loc_nalloc; - } - else - { - vsite_pbc = NULL; - vsite_pbc_nalloc = NULL; - } - rt = dd->reverse_top; bBCheck = rt->bBCheck; - /* Clear the counts */ - for(ftype=0; ftypeil[ftype].nr = 0; - } nbonded_local = 0; - mbi = rt->mbi; - ga2la = dd->ga2la; - - for(ic=0; iccgindex[zones->cg_range[ic]]; - la1 = dd->cgindex[zones->cg_range[ic+1]]; - for(i=la0; igatindex[i]; + global_atomnr_to_moltype_ind(rt,i_gl,&mb,&mt,&mol,&i_mol); + /* Check all interactions assigned to this atom */ + index = rt->ril_mt[mt].index; + rtil = rt->ril_mt[mt].il; + j = index[i_mol]; + while (j < index[i_mol+1]) { - /* Get the global atom number */ - i_gl = dd->gatindex[i]; - global_atomnr_to_moltype_ind(mbi,i_gl,&mb,&mt,&mol,&i_mol); - /* Check all interactions assigned to this atom */ - index = rt->ril_mt[mt].index; - rtil = rt->ril_mt[mt].il; - j = index[i_mol]; - while (j < index[i_mol+1]) + ftype = rtil[j++]; + iatoms = rtil + j; + nral = NRAL(ftype); + if (ftype == F_SETTLE) { - ftype = rtil[j++]; - iatoms = rtil + j; - nral = NRAL(ftype); - if (interaction_function[ftype].flags & IF_VSITE) + /* Settles are only in the reverse top when they + * operate within a charge group. So we can assign + * them without checks. We do this only for performance + * reasons; it could be handled by the code below. 
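
/*
 * [Illustrative sketch added by the editor; not part of the patch.]
 * The settle fast path that follows fills in local atom indices without
 * any global-to-local lookup: atoms of one charge group stay contiguous
 * and in molecular order locally, so the intra-molecular offsets
 * iatoms[k] - iatoms[1] carry over to the local index i of the first atom.
 */
static void settle_local_indices(int i, const int *iatoms, int *tiatoms)
{
    tiatoms[0] = iatoms[0];                  /* interaction type entry   */
    tiatoms[1] = i;                          /* first atom, local index  */
    tiatoms[2] = i + iatoms[2] - iatoms[1];  /* e.g. i+1 for water H1    */
    tiatoms[3] = i + iatoms[3] - iatoms[1];  /* e.g. i+2 for water H2    */
}
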
+ */ + if (iz == 0) { - /* The vsite construction goes where the vsite itself is */ - if (ic == 0) - { - add_vsite(dd->ga2la,index,rtil,ftype,nral, - TRUE,i,i_gl,i_mol, - iatoms,idef,vsite_pbc,vsite_pbc_nalloc); - } - j += 1 + nral + 2; + /* Home zone: add this settle to the local topology */ + tiatoms[0] = iatoms[0]; + tiatoms[1] = i; + tiatoms[2] = i + iatoms[2] - iatoms[1]; + tiatoms[3] = i + iatoms[3] - iatoms[1]; + add_ifunc(nral,tiatoms,&idef->il[ftype]); + nbonded_local++; } - else + j += 1 + nral; + } + else if (interaction_function[ftype].flags & IF_VSITE) + { + /* The vsite construction goes where the vsite itself is */ + if (iz == 0) { - /* Copy the type */ - tiatoms[0] = iatoms[0]; - - if (nral == 1) + add_vsite(dd->ga2la,index,rtil,ftype,nral, + TRUE,i,i_gl,i_mol, + iatoms,idef,vsite_pbc,vsite_pbc_nalloc); + } + j += 1 + nral + 2; + } + else + { + /* Copy the type */ + tiatoms[0] = iatoms[0]; + + if (nral == 1) + { + /* Assign single-body interactions to the home zone */ + if (iz == 0) { - /* Assign single-body interactions to the home zone */ - if (ic == 0) - { - bUse = TRUE; + bUse = TRUE; tiatoms[1] = i; if (ftype == F_POSRES) { - add_posres(mol,i_mol,&molb[mb],tiatoms,idef); + add_posres(mol,i_mol,&molb[mb],tiatoms,ip_in, + idef); } + } + else + { + bUse = FALSE; + } + } + else if (nral == 2) + { + /* This is a two-body interaction, we can assign + * analogous to the non-bonded assignments. + */ + if (!ga2la_get(ga2la,i_gl+iatoms[2]-i_mol,&a_loc,&kz)) + { + bUse = FALSE; + } + else + { + if (kz >= nzone) + { + kz -= nzone; } - else + /* Check zone interaction assignments */ + bUse = ((iz < nizone && iz <= kz && + izone[iz].j0 <= kz && kz < izone[iz].j1) || + (kz < nizone && iz > kz && + izone[kz].j0 <= iz && iz < izone[kz].j1)); + if (bUse) { - bUse = FALSE; + tiatoms[1] = i; + tiatoms[2] = a_loc; + /* If necessary check the cgcm distance */ + if (bRCheck2B && + dd_dist2(pbc_null,cg_cm,la2lc, + tiatoms[1],tiatoms[2]) >= rc2) + { + bUse = FALSE; + } } } - else if (nral == 2) + } + else + { + /* Assign this multi-body bonded interaction to + * the local node if we have all the atoms involved + * (local or communicated) and the minimum zone shift + * in each dimension is zero, for dimensions + * with 2 DD cells an extra check may be necessary. + */ + bUse = TRUE; + clear_ivec(k_zero); + clear_ivec(k_plus); + for(k=1; k<=nral && bUse; k++) { - /* This is a two-body interaction, we can assign - * analogous to the non-bonded assignments. - */ - if (!ga2la_get(ga2la,i_gl+iatoms[2]-i_mol,&a_loc,&kc)) + bLocal = ga2la_get(ga2la,i_gl+iatoms[k]-i_mol, + &a_loc,&kz); + if (!bLocal || kz >= zones->n) { + /* We do not have this atom of this interaction + * locally, or it comes from more than one cell + * away. + */ bUse = FALSE; } else { - if (kc >= nzone) - { - kc -= nzone; - } - /* Check zone interaction assignments */ - bUse = ((ic < nizone && ic <= kc && - izone[ic].j0 <= kc && kc < izone[ic].j1) || - (kc < nizone && ic > kc && - izone[kc].j0 <= ic && ic < izone[kc].j1)); - if (bUse) + tiatoms[k] = a_loc; + for(d=0; d= rc2) + if (zones->shift[kz][d] == 0) + { + k_zero[d] = k; + } + else { - bUse = FALSE; + k_plus[d] = k; } } } } - else + bUse = (bUse && + k_zero[XX] && k_zero[YY] && k_zero[ZZ]); + if (bRCheckMB) { - /* Assign this multi-body bonded interaction to - * the local node if we have all the atoms involved - * (local or communicated) and the minimum zone shift - * in each dimension is zero, for dimensions - * with 2 DD cells an extra check may be necessary. 
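
/*
 * [Illustrative sketch added by the editor; not part of the patch.]
 * The two-body branch above assigns an interaction to this domain only if
 * the (i-zone, j-zone) pair matches the neighbor-search zone pairing in
 * exactly one order, which is what guarantees that every bonded pair is
 * assigned once and only once across all domains. The test extracted as a
 * predicate (zone_range_t mirrors the j0/j1 fields of
 * gmx_domdec_ns_ranges_t):
 */
typedef struct { int j0, j1; } zone_range_t;

static int assign_two_body(const zone_range_t *izone, int nizone,
                           int iz, int kz)
{
    return (iz < nizone && iz <= kz &&
            izone[iz].j0 <= kz && kz < izone[iz].j1) ||
           (kz < nizone && iz > kz &&
            izone[kz].j0 <= iz && iz < izone[kz].j1);
}
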
- */ - bUse = TRUE; - clear_ivec(k_zero); - clear_ivec(k_plus); - for(k=1; k<=nral && bUse; k++) + for(d=0; (d= zones->n) + /* Check if the cg_cm distance falls within + * the cut-off to avoid possible multiple + * assignments of bonded interactions. + */ + if (rcheck[d] && + k_plus[d] && + dd_dist2(pbc_null,cg_cm,la2lc, + tiatoms[k_zero[d]],tiatoms[k_plus[d]]) >= rc2) { - /* We do not have this atom of this interaction - * locally, or it comes from more than one cell - * away. - */ bUse = FALSE; } - else - { - tiatoms[k] = a_loc; - for(d=0; dshift[kc][d] == 0) - { - k_zero[d] = k; - } - else - { - k_plus[d] = k; - } - } - } - } - bUse = (bUse && - k_zero[XX] && k_zero[YY] && k_zero[ZZ]); - if (bRCheckMB) - { - for(d=0; (d= rc2) - { - bUse = FALSE; - } - } } } - if (bUse) + } + if (bUse) + { + /* Add this interaction to the local topology */ + add_ifunc(nral,tiatoms,&idef->il[ftype]); + /* Sum so we can check in global_stat + * if we have everything. + */ + if (bBCheck || + !(interaction_function[ftype].flags & IF_LIMZERO)) { - /* Add this interaction to the local topology */ - add_ifunc(nral,tiatoms,&idef->il[ftype]); - /* Sum so we can check in global_stat - * if we have everything. - */ - if (bBCheck || - !(interaction_function[ftype].flags & IF_LIMZERO)) - { - nbonded_local++; - } + nbonded_local++; } - j += 1 + nral; } + j += 1 + nral; } } } - + return nbonded_local; } -static int make_local_bondeds_intracg(gmx_domdec_t *dd,gmx_molblock_t *molb, - t_idef *idef,gmx_vsite_t *vsite) +static void set_no_exclusions_zone(gmx_domdec_t *dd,gmx_domdec_zones_t *zones, + int iz,t_blocka *lexcls) { - int i,i_gl,mb,mt,mol,i_mol,j,ftype,nral,k; - int *index,*rtil,**vsite_pbc,*vsite_pbc_nalloc; - t_iatom *iatoms,tiatoms[1+MAXATOMLIST]; - gmx_reverse_top_t *rt; - gmx_molblock_ind_t *mbi; - int nbonded_local; + int a0,a1,a; - if (vsite && vsite->n_intercg_vsite > 0) - { - vsite_pbc = vsite->vsite_pbc_loc; - vsite_pbc_nalloc = vsite->vsite_pbc_loc_nalloc; - } - else - { - vsite_pbc = NULL; - vsite_pbc_nalloc = NULL; - } - - /* Clear the counts */ - for(ftype=0; ftypeil[ftype].nr = 0; - } - nbonded_local = 0; - - rt = dd->reverse_top; - - if (rt->ril_mt_tot_size == 0) + a0 = dd->cgindex[zones->cg_range[iz]]; + a1 = dd->cgindex[zones->cg_range[iz+1]]; + + for(a=a0+1; aindex[a] = lexcls->nra; } - - mbi = rt->mbi; - - for(i=0; inat_home; i++) - { - /* Get the global atom number */ - i_gl = dd->gatindex[i]; - global_atomnr_to_moltype_ind(mbi,i_gl,&mb,&mt,&mol,&i_mol); - /* Check all interactions assigned to this atom */ - index = rt->ril_mt[mt].index; - rtil = rt->ril_mt[mt].il; - /* Check all interactions assigned to this atom */ - j = index[i_mol]; - while (j < index[i_mol+1]) - { - ftype = rtil[j++]; - iatoms = rtil + j; - nral = NRAL(ftype); - if (interaction_function[ftype].flags & IF_VSITE) - { - /* The vsite construction goes where the vsite itself is */ - add_vsite(dd->ga2la,index,rtil,ftype,nral, - TRUE,i,i_gl,i_mol, - iatoms,idef,vsite_pbc,vsite_pbc_nalloc); - j += 1 + nral + 2; - } - else - { - /* Copy the type */ - tiatoms[0] = iatoms[0]; - tiatoms[1] = i; - for(k=2; k<=nral; k++) - { - tiatoms[k] = i + iatoms[k] - iatoms[1]; - } - if (ftype == F_POSRES) - { - add_posres(mol,i_mol,&molb[mb],tiatoms,idef); - } - /* Add this interaction to the local topology */ - add_ifunc(nral,tiatoms,&idef->il[ftype]); - /* Sum so we can check in global_stat if we have everything */ - nbonded_local++; - j += 1 + nral; - } - } - } - - return nbonded_local; } -static int make_local_exclusions(gmx_domdec_t 
*dd,gmx_domdec_zones_t *zones, - gmx_mtop_t *mtop, - gmx_bool bRCheck,real rc, - int *la2lc,t_pbc *pbc_null,rvec *cg_cm, - t_forcerec *fr, - t_blocka *lexcls) +static int make_exclusions_zone(gmx_domdec_t *dd,gmx_domdec_zones_t *zones, + const gmx_moltype_t *moltype, + gmx_bool bRCheck,real rc2, + int *la2lc,t_pbc *pbc_null,rvec *cg_cm, + const int *cginfo, + t_blocka *lexcls, + int iz, + int cg_start,int cg_end) { - int nizone,n,count,ic,jla0,jla1,jla; + int nizone,n,count,jla0,jla1,jla; int cg,la0,la1,la,a_gl,mb,mt,mol,a_mol,j,aj_mol; - t_blocka *excls; + const t_blocka *excls; gmx_ga2la_t ga2la; int a_loc; int cell; - gmx_molblock_ind_t *mbi; - real rc2; - - /* Since for RF and PME we need to loop over the exclusions - * we should store each exclusion only once. This is done - * using the same zone scheme as used for neighbor searching. - * The exclusions involving non-home atoms are stored only - * one way: atom j is in the excl list of i only for j > i, - * where i and j are local atom numbers. - */ - - lexcls->nr = dd->cgindex[zones->izone[zones->nizone-1].cg1]; - if (lexcls->nr+1 > lexcls->nalloc_index) - { - lexcls->nalloc_index = over_alloc_dd(lexcls->nr)+1; - srenew(lexcls->index,lexcls->nalloc_index); - } - - mbi = dd->reverse_top->mbi; - + ga2la = dd->ga2la; - rc2 = rc*rc; - - if (dd->n_intercg_excl) - { - nizone = zones->nizone; - } - else - { - nizone = 1; - } - n = 0; + jla0 = dd->cgindex[zones->izone[iz].jcg0]; + jla1 = dd->cgindex[zones->izone[iz].jcg1]; + + /* We set the end index, but note that we might not start at zero here */ + lexcls->nr = dd->cgindex[cg_end]; + + n = lexcls->nra; count = 0; - for(ic=0; iccgindex[zones->izone[ic].jcg0]; - jla1 = dd->cgindex[zones->izone[ic].jcg1]; - for(cg=zones->cg_range[ic]; cgcg_range[ic+1]; cg++) + /* Here we assume the number of exclusions in one charge group + * is never larger than 1000. + */ + if (n+1000 > lexcls->nalloc_a) { - /* Here we assume the number of exclusions in one charge group - * is never larger than 1000. - */ - if (n+1000 > lexcls->nalloc_a) - { - lexcls->nalloc_a = over_alloc_large(n+1000); - srenew(lexcls->a,lexcls->nalloc_a); - } - la0 = dd->cgindex[cg]; - la1 = dd->cgindex[cg+1]; - if (GET_CGINFO_EXCL_INTER(fr->cginfo[cg]) || - !GET_CGINFO_EXCL_INTRA(fr->cginfo[cg])) - { - /* Copy the exclusions from the global top */ - for(la=la0; laindex[la] = n; - a_gl = dd->gatindex[la]; - global_atomnr_to_moltype_ind(mbi,a_gl,&mb,&mt,&mol,&a_mol); - excls = &mtop->moltype[mt].excls; - for(j=excls->index[a_mol]; jindex[a_mol+1]; j++) + lexcls->nalloc_a = over_alloc_large(n+1000); + srenew(lexcls->a,lexcls->nalloc_a); + } + la0 = dd->cgindex[cg]; + la1 = dd->cgindex[cg+1]; + if (GET_CGINFO_EXCL_INTER(cginfo[cg]) || + !GET_CGINFO_EXCL_INTRA(cginfo[cg])) + { + /* Copy the exclusions from the global top */ + for(la=la0; laindex[la] = n; + a_gl = dd->gatindex[la]; + global_atomnr_to_moltype_ind(dd->reverse_top,a_gl,&mb,&mt,&mol,&a_mol); + excls = &moltype[mt].excls; + for(j=excls->index[a_mol]; jindex[a_mol+1]; j++) + { + aj_mol = excls->a[j]; + /* This computation of jla is only correct intra-cg */ + jla = la + aj_mol - a_mol; + if (jla >= la0 && jla < la1) { - aj_mol = excls->a[j]; - /* This computation of jla is only correct intra-cg */ - jla = la + aj_mol - a_mol; - if (jla >= la0 && jla < la1) + /* This is an intra-cg exclusion. We can skip + * the global indexing and distance checking. + */ + /* Intra-cg exclusions are only required + * for the home zone. + */ + if (iz == 0) { - /* This is an intra-cg exclusion. 
We can skip - * the global indexing and distance checking. - */ - /* Intra-cg exclusions are only required - * for the home zone. - */ - if (ic == 0) + lexcls->a[n++] = jla; + /* Check to avoid double counts */ + if (jla > la) + { + count++; + } + } + } + else + { + /* This is a inter-cg exclusion */ + /* Since exclusions are pair interactions, + * just like non-bonded interactions, + * they can be assigned properly up + * to the DD cutoff (not cutoff_min as + * for the other bonded interactions). + */ + if (ga2la_get(ga2la,a_gl+aj_mol-a_mol,&jla,&cell)) + { + if (iz == 0 && cell == 0) { lexcls->a[n++] = jla; /* Check to avoid double counts */ @@ -1317,69 +1435,86 @@ static int make_local_exclusions(gmx_domdec_t *dd,gmx_domdec_zones_t *zones, count++; } } - } - else - { - /* This is a inter-cg exclusion */ - /* Since exclusions are pair interactions, - * just like non-bonded interactions, - * they can be assigned properly up - * to the DD cutoff (not cutoff_min as - * for the other bonded interactions). - */ - if (ga2la_get(ga2la,a_gl+aj_mol-a_mol,&jla,&cell)) + else if (jla >= jla0 && jla < jla1 && + (!bRCheck || + dd_dist2(pbc_null,cg_cm,la2lc,la,jla) < rc2)) { - if (ic == 0 && cell == 0) - { - lexcls->a[n++] = jla; - /* Check to avoid double counts */ - if (jla > la) - { - count++; - } - } - else if (jla >= jla0 && jla < jla1 && - (!bRCheck || - dd_dist2(pbc_null,cg_cm,la2lc,la,jla) < rc2)) - { - /* jla > la, since jla0 > la */ - lexcls->a[n++] = jla; - count++; - } + /* jla > la, since jla0 > la */ + lexcls->a[n++] = jla; + count++; } } } } } - else + } + else + { + /* There are no inter-cg excls and this cg is self-excluded. + * These exclusions are only required for zone 0, + * since other zones do not see themselves. + */ + if (iz == 0) { - /* There are no inter-cg excls and this cg is self-excluded. - * These exclusions are only required for zone 0, - * since other zones do not see themselves. 
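
/*
 * [Illustrative sketch added by the editor; not part of the patch.]
 * Classification used by the exclusion loop above: within one charge
 * group the local numbering follows the molecular numbering, so the
 * candidate local index of an excluded partner is la + (aj_mol - a_mol);
 * only when that falls outside the group's local range [la0, la1) is the
 * ga2la lookup (and possibly a distance check) required.
 */
static int intra_cg_partner(int la, int a_mol, int aj_mol,
                            int la0, int la1, int *jla)
{
    *jla = la + (aj_mol - a_mol);   /* only meaningful intra-cg */

    return (*jla >= la0 && *jla < la1);
}
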
- */ - if (ic == 0) + for(la=la0; laindex[la] = n; + for(j=la0; jindex[la] = n; - for(j=la0; ja[n++] = j; - } + lexcls->a[n++] = j; } - count += ((la1 - la0)*(la1 - la0 - 1))/2; } - else + count += ((la1 - la0)*(la1 - la0 - 1))/2; + } + else + { + /* We don't need exclusions for this cg */ + for(la=la0; laindex[la] = n; - } + lexcls->index[la] = n; } } } } + + lexcls->index[lexcls->nr] = n; + lexcls->nra = n; + + return count; +} + +static void check_alloc_index(t_blocka *ba,int nindex_max) +{ + if (nindex_max+1 > ba->nalloc_index) + { + ba->nalloc_index = over_alloc_dd(nindex_max+1); + srenew(ba->index,ba->nalloc_index); + } +} + +static void check_exclusions_alloc(gmx_domdec_t *dd,gmx_domdec_zones_t *zones, + t_blocka *lexcls) +{ + int nr; + int thread; + + nr = dd->cgindex[zones->izone[zones->nizone-1].cg1]; + + check_alloc_index(lexcls,nr); + + for(thread=1; threadreverse_top->nthread; thread++) + { + check_alloc_index(&dd->reverse_top->excl_thread[thread],nr); + } +} + +static void finish_local_exclusions(gmx_domdec_t *dd,gmx_domdec_zones_t *zones, + t_blocka *lexcls) +{ + int la0,la; + + lexcls->nr = dd->cgindex[zones->izone[zones->nizone-1].cg1]; + if (dd->n_intercg_excl == 0) { /* There are no exclusions involving non-home charge groups, @@ -1388,25 +1523,202 @@ static int make_local_exclusions(gmx_domdec_t *dd,gmx_domdec_zones_t *zones, la0 = dd->cgindex[zones->izone[0].cg1]; for(la=la0; lanr; la++) { - lexcls->index[la] = n; + lexcls->index[la] = lexcls->nra; } - } - lexcls->index[lexcls->nr] = n; - lexcls->nra = n; - if (dd->n_intercg_excl == 0) - { + /* nr is only used to loop over the exclusions for Ewald and RF, * so we can set it to the number of home atoms for efficiency. */ lexcls->nr = dd->cgindex[zones->izone[0].cg1]; } +} + +static void clear_idef(t_idef *idef) +{ + int ftype; + + /* Clear the counts */ + for(ftype=0; ftypeil[ftype].nr = 0; + } +} + +static int make_local_bondeds_excls(gmx_domdec_t *dd, + gmx_domdec_zones_t *zones, + const gmx_mtop_t *mtop, + const int *cginfo, + gmx_bool bRCheckMB,ivec rcheck,gmx_bool bRCheck2B, + real rc, + int *la2lc,t_pbc *pbc_null,rvec *cg_cm, + t_idef *idef,gmx_vsite_t *vsite, + t_blocka *lexcls,int *excl_count) +{ + int nzone_bondeds,nzone_excl; + int iz,cg0,cg1; + real rc2; + int nbonded_local; + int thread; + gmx_reverse_top_t *rt; + + if (dd->reverse_top->bMultiCGmols) + { + nzone_bondeds = zones->n; + } + else + { + /* Only single charge group molecules, so interactions don't + * cross zone boundaries and we only need to assign in the home zone. 
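
/*
 * [Illustrative sketch added by the editor; not part of the patch.]
 * Every OpenMP loop in this file splits a charge-group or atom range with
 * the same integer arithmetic, shown below. Thread t's end equals thread
 * t+1's start, so the subranges tile [n0, n1) without gaps or overlap,
 * and their sizes differ by at most one.
 */
static void thread_range(int n0, int n1, int thread, int nthread,
                         int *t_start, int *t_end)
{
    *t_start = n0 + ((n1 - n0)* thread     )/nthread;
    *t_end   = n0 + ((n1 - n0)*(thread + 1))/nthread;
}
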
+ */ + nzone_bondeds = 1; + } + + if (dd->n_intercg_excl > 0) + { + /* We only use exclusions from i-zones to i- and j-zones */ + nzone_excl = zones->nizone; + } + else + { + /* There are no inter-cg exclusions and only zone 0 sees itself */ + nzone_excl = 1; + } + + check_exclusions_alloc(dd,zones,lexcls); + + rt = dd->reverse_top; + + rc2 = rc*rc; + + /* Clear the counts */ + clear_idef(idef); + nbonded_local = 0; + + lexcls->nr = 0; + lexcls->nra = 0; + *excl_count = 0; + + for(iz=0; izcg_range[iz]; + cg1 = zones->cg_range[iz+1]; + +#pragma omp parallel for num_threads(rt->nthread) schedule(static) + for(thread=0; threadnthread; thread++) + { + int cg0t,cg1t; + t_idef *idef_t; + int ftype; + int **vsite_pbc; + int *vsite_pbc_nalloc; + t_blocka *excl_t; + + cg0t = cg0 + ((cg1 - cg0)* thread )/rt->nthread; + cg1t = cg0 + ((cg1 - cg0)*(thread+1))/rt->nthread; + + if (thread == 0) + { + idef_t = idef; + } + else + { + idef_t = &rt->idef_thread[thread]; + clear_idef(idef_t); + } + + if (vsite && vsite->n_intercg_vsite > 0) + { + if (thread == 0) + { + vsite_pbc = vsite->vsite_pbc_loc; + vsite_pbc_nalloc = vsite->vsite_pbc_loc_nalloc; + } + else + { + vsite_pbc = rt->vsite_pbc[thread]; + vsite_pbc_nalloc = rt->vsite_pbc_nalloc[thread]; + } + } + else + { + vsite_pbc = NULL; + vsite_pbc_nalloc = NULL; + } + + rt->nbonded_thread[thread] = + make_bondeds_zone(dd,zones, + mtop->molblock, + bRCheckMB,rcheck,bRCheck2B,rc2, + la2lc,pbc_null,cg_cm,idef->iparams, + idef_t, + vsite,vsite_pbc,vsite_pbc_nalloc, + iz,zones->n, + dd->cgindex[cg0t],dd->cgindex[cg1t]); + + if (iz < nzone_excl) + { + if (thread == 0) + { + excl_t = lexcls; + } + else + { + excl_t = &rt->excl_thread[thread]; + excl_t->nr = 0; + excl_t->nra = 0; + } + + rt->excl_count_thread[thread] = + make_exclusions_zone(dd,zones, + mtop->moltype,bRCheck2B,rc2, + la2lc,pbc_null,cg_cm,cginfo, + excl_t, + iz, + cg0t,cg1t); + } + } + + if (rt->nthread > 1) + { + combine_idef(idef,rt->idef_thread+1,rt->nthread-1, + vsite,rt->vsite_pbc+1); + } + + for(thread=0; threadnthread; thread++) + { + nbonded_local += rt->nbonded_thread[thread]; + } + + if (iz < nzone_excl) + { + if (rt->nthread > 1) + { + combine_blocka(lexcls,rt->excl_thread+1,rt->nthread-1); + } + + for(thread=0; threadnthread; thread++) + { + *excl_count += rt->excl_count_thread[thread]; + } + } + } + + /* Some zones might not have exclusions, but some code still needs to + * loop over the index, so we set the indices here. 
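
/*
 * [Illustrative sketch added by the editor; not part of the patch.]
 * Skeleton of the threading idiom used by make_local_bondeds_excls()
 * above: thread 0 writes straight into the output structure, the other
 * threads fill pre-allocated per-thread scratch (rt->idef_thread,
 * rt->excl_thread), and the scratch is merged serially afterwards with
 * combine_idef()/combine_blocka(). No locks are needed, and with one
 * thread the code degenerates to the old serial path. work_t and the two
 * helpers are hypothetical; buffers are assumed large enough.
 */
typedef struct { int *vals; int n; } work_t;

static void process_range(work_t *w, int i0, int i1)
{
    int i;

    for (i = i0; i < i1; i++) { w->vals[w->n++] = i; }  /* placeholder */
}

static void merge_into(work_t *dest, const work_t *src)
{
    int i;

    for (i = 0; i < src->n; i++) { dest->vals[dest->n + i] = src->vals[i]; }
    dest->n += src->n;
}

static void process_threaded(work_t *out, work_t *scratch,
                             int nthread, int n0, int n1)
{
    int t;

#pragma omp parallel for num_threads(nthread) schedule(static)
    for (t = 0; t < nthread; t++)
    {
        work_t *w  = (t == 0 ? out : &scratch[t]);
        int     i0 = n0 + ((n1 - n0)* t     )/nthread;
        int     i1 = n0 + ((n1 - n0)*(t + 1))/nthread;

        w->n = (t == 0 ? w->n : 0);     /* scratch restarts empty      */
        process_range(w, i0, i1);       /* disjoint ranges, no sharing */
    }

    for (t = 1; t < nthread; t++)       /* serial merge, cheap vs work */
    {
        merge_into(out, &scratch[t]);
    }
}
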
+ */ + for(iz=nzone_excl; iznizone; iz++) + { + set_no_exclusions_zone(dd,zones,iz,lexcls); + } + + finish_local_exclusions(dd,zones,lexcls); if (debug) { fprintf(debug,"We have %d exclusions, check count %d\n", - lexcls->nra,count); + lexcls->nra,*excl_count); } - return count; + return nbonded_local; } void dd_make_local_cgs(gmx_domdec_t *dd,t_block *lcgs) @@ -1419,7 +1731,9 @@ void dd_make_local_top(FILE *fplog, gmx_domdec_t *dd,gmx_domdec_zones_t *zones, int npbcdim,matrix box, rvec cellsize_min,ivec npulse, - t_forcerec *fr,gmx_vsite_t *vsite, + t_forcerec *fr, + rvec *cgcm_or_x, + gmx_vsite_t *vsite, gmx_mtop_t *mtop,gmx_localtop_t *ltop) { gmx_bool bUniqueExcl,bRCheckMB,bRCheck2B,bRCheckExcl; @@ -1439,14 +1753,7 @@ void dd_make_local_top(FILE *fplog, bRCheck2B = FALSE; bRCheckExcl = FALSE; - if (!dd->reverse_top->bMultiCGmols) - { - /* We don't need checks, assign all interactions with local atoms */ - - dd->nbonded_local = make_local_bondeds_intracg(dd,mtop->molblock, - <op->idef,vsite); - } - else + if (dd->reverse_top->bMultiCGmols) { /* We need to check to which cell bondeds should be assigned */ rc = dd_cutoff_twobody(dd); @@ -1507,26 +1814,26 @@ void dd_make_local_top(FILE *fplog, pbc_null = NULL; } } - - dd->nbonded_local = make_local_bondeds(dd,zones,mtop->molblock, - bRCheckMB,rcheck,bRCheck2B,rc, - dd->la2lc, - pbc_null,fr->cg_cm, - <op->idef,vsite); } + + dd->nbonded_local = + make_local_bondeds_excls(dd,zones,mtop,fr->cginfo, + bRCheckMB,rcheck,bRCheck2B,rc, + dd->la2lc, + pbc_null,cgcm_or_x, + <op->idef,vsite, + <op->excls,&nexcl); /* The ilist is not sorted yet, * we can only do this when we have the charge arrays. */ ltop->idef.ilsort = ilsortUNKNOWN; - nexcl = make_local_exclusions(dd,zones,mtop,bRCheckExcl, - rc,dd->la2lc,pbc_null,fr->cg_cm, - fr,<op->excls); - if (dd->reverse_top->bExclRequired) { dd->nbonded_local += nexcl; + + forcerec_set_excl_load(fr,ltop,NULL); } ltop->atomtypes = mtop->atomtypes; @@ -1701,7 +2008,7 @@ t_blocka *make_charge_group_links(gmx_mtop_t *mtop,gmx_domdec_t *dd, * to all atoms, not only the first atom as in gmx_reverse_top. * The constraints are discarded here. 
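
/*
 * [Illustrative sketch added by the editor; not part of the patch.]
 * The growth pattern used throughout this patch, e.g. for ireq->ind,
 * dc->con_gl, il->iatoms and the exclusion arrays, is
 * "if (n_needed > nalloc) { nalloc = over_alloc_*(n_needed); srenew(); }".
 * Because over_alloc_* returns the request plus proportional headroom,
 * element-by-element appending reallocates only O(log n) times. Generic
 * form of the idiom (the exact headroom factor in GROMACS differs, and
 * error handling is omitted):
 */
#include <stdlib.h>

static void grow_int_list(int **list, int *nalloc, int n_needed)
{
    if (n_needed > *nalloc)
    {
        *nalloc = n_needed + n_needed/5 + 4;    /* ~20% headroom */
        *list   = realloc(*list, (*nalloc)*sizeof(**list));
    }
}
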
*/ - make_reverse_ilist(molt,NULL,FALSE,FALSE,TRUE,&ril); + make_reverse_ilist(molt,NULL,FALSE,FALSE,FALSE,TRUE,&ril); cgi_mb = &cginfo_mb[mb]; @@ -1809,7 +2116,7 @@ static void bonded_cg_distance_mol(gmx_moltype_t *molt,int *at2cg, r2_mb = 0; for(ftype=0; ftypeilist[ftype]; nral = NRAL(ftype); diff --git a/src/mdlib/edsam.c b/src/mdlib/edsam.c index 15fdd6873b..3101f68eb9 100644 --- a/src/mdlib/edsam.c +++ b/src/mdlib/edsam.c @@ -1189,6 +1189,7 @@ static void init_edi(gmx_mtop_t *mtop,t_inputrec *ir, int i; real totalmass = 0.0; rvec com; + gmx_mtop_atomlookup_t alook=NULL; t_atom *atom; /* NOTE Init_edi is executed on the master process only @@ -1202,13 +1203,15 @@ static void init_edi(gmx_mtop_t *mtop,t_inputrec *ir, || edi->vecs.radacc.neig || edi->vecs.radcon.neig; + alook = gmx_mtop_atomlookup_init(mtop); + /* evaluate masses (reference structure) */ snew(edi->sref.m, edi->sref.nr); for (i = 0; i < edi->sref.nr; i++) { if (edi->fitmas) { - gmx_mtop_atomnr_to_atom(mtop,edi->sref.anrs[i],&atom); + gmx_mtop_atomnr_to_atom(alook,edi->sref.anrs[i],&atom); edi->sref.m[i] = atom->m; } else @@ -1236,7 +1239,7 @@ static void init_edi(gmx_mtop_t *mtop,t_inputrec *ir, snew(edi->sav.m , edi->sav.nr ); for (i = 0; i < edi->sav.nr; i++) { - gmx_mtop_atomnr_to_atom(mtop,edi->sav.anrs[i],&atom); + gmx_mtop_atomnr_to_atom(alook,edi->sav.anrs[i],&atom); edi->sav.m[i] = atom->m; if (edi->pcamas) { @@ -1258,6 +1261,8 @@ static void init_edi(gmx_mtop_t *mtop,t_inputrec *ir, } } + gmx_mtop_atomlookup_destroy(alook); + /* put reference structure in origin */ get_center(edi->sref.x, edi->sref.m, edi->sref.nr, com); com[XX] = -com[XX]; diff --git a/src/mdlib/fft5d.c b/src/mdlib/fft5d.c index c606b6ad67..df8d310d2a 100644 --- a/src/mdlib/fft5d.c +++ b/src/mdlib/fft5d.c @@ -60,9 +60,6 @@ #ifdef GMX_OPENMP /* TODO: Do we still need this? Are we still planning ot use fftw + OpenMP? 
*/ #define FFT5D_THREADS -#endif -#ifdef FFT5D_THREADS -#include "gmx_omp.h" /* requires fftw compiled with openmp */ /* #define FFT5D_FFTW_THREADS (now set by cmake) */ #endif @@ -383,13 +380,31 @@ fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_ if (!(flags&FFT5D_NOMALLOC)) { snew_aligned(lin, lsize, 32); snew_aligned(lout, lsize, 32); - snew_aligned(lout2, lsize, 32); - snew_aligned(lout3, lsize, 32); + if (nthreads > 1) + { + /* We need extra transpose buffers to avoid OpenMP barriers */ + snew_aligned(lout2, lsize, 32); + snew_aligned(lout3, lsize, 32); + } + else + { + /* We can reuse the buffers to avoid cache misses */ + lout2 = lin; + lout3 = lout; + } } else { lin = *rlin; lout = *rlout; - lout2 = *rlout2; - lout3 = *rlout3; + if (nthreads > 1) + { + lout2 = *rlout2; + lout3 = *rlout3; + } + else + { + lout2 = lin; + lout3 = lout; + } } plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t)); @@ -509,6 +524,7 @@ fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_ */ #pragma omp parallel for num_threads(nthreads) schedule(static) ordered for(t=0; tnthreads == 1) { fftout = lout; } else @@ -1091,6 +1107,14 @@ llToAll void fft5d_destroy(fft5d_plan plan) { int s,t; + + /* Note that we expect plan->lin and plan->lout to be freed elsewhere */ + if (plan->nthreads > 1) + { + free(plan->lout2); + free(plan->lout3); + } + for (s=0;s<3;s++) { if (plan->p1d[s]) diff --git a/src/mdlib/force.c b/src/mdlib/force.c index 19be8f6fe6..17441e18a6 100644 --- a/src/mdlib/force.c +++ b/src/mdlib/force.c @@ -63,7 +63,7 @@ #include "partdec.h" #include "qmmm.h" #include "mpelogging.h" - +#include "gmx_omp_nthreads.h" void ns(FILE *fp, t_forcerec *fr, @@ -113,6 +113,31 @@ void ns(FILE *fp, GMX_MPE_LOG(ev_ns_finish); } +static void reduce_thread_forces(int n,rvec *f, + tensor vir, + real *Vcorr, + int efpt_ind,real *dvdl, + int nthreads,f_thread_t *f_t) +{ + int t,i; + + /* This reduction can run over any number of threads */ +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntBonded)) private(t) schedule(static) + for(i=0; inwall) { /* foreign lambda component for walls */ - dvdl_walls = do_walls(ir,fr,box,md,x,f,lambda[efptVDW], - enerd->grpp.ener[egLJSR],nrnb); - PRINT_SEPDVDL("Walls",0.0,dvdl_walls); - dvdlambda[efptVDW] += dvdl_walls; - enerd->dvdl_lin[efptVDW] += dvdl_walls; + dvdl = do_walls(ir,fr,box,md,x,f,lambda[efptVDW], + enerd->grpp.ener[egLJSR],nrnb); + PRINT_SEPDVDL("Walls",0.0,dvdl); + enerd->dvdl_lin[efptVDW] += dvdl; } /* If doing GB, reset dvda and calculate the Born radii */ if (ir->implicit_solvent) { - /* wallcycle_start(wcycle,ewcGB); */ + wallcycle_sub_start(wcycle, ewcsNONBONDED); for(i=0;inr;i++) { @@ -227,28 +251,35 @@ void do_force_lowlevel(FILE *fplog, gmx_large_int_t step, calc_gb_rad(cr,fr,ir,top,atype,x,&(fr->gblist),born,md,nrnb); } - /* wallcycle_stop(wcycle, ewcGB); */ + wallcycle_sub_stop(wcycle, ewcsNONBONDED); } where(); - donb_flags = 0; - if (flags & GMX_FORCE_FORCES) + if (flags & GMX_FORCE_NONBONDED) { - donb_flags |= GMX_DONB_FORCES; + donb_flags = 0; + if (flags & GMX_FORCE_FORCES) + { + donb_flags |= GMX_DONB_FORCES; + } + + wallcycle_sub_start(wcycle, ewcsNONBONDED); + do_nonbonded(cr,fr,x,f,md,excl, + fr->bBHAM ? + enerd->grpp.ener[egBHAMSR] : + enerd->grpp.ener[egLJSR], + enerd->grpp.ener[egCOULSR], + enerd->grpp.ener[egGB],box_size,nrnb, + lambda,dvdl_nb,-1,-1,donb_flags); + wallcycle_sub_stop(wcycle, ewcsNONBONDED); } - do_nonbonded(cr,fr,x,f,md,excl, - fr->bBHAM ? 
- enerd->grpp.ener[egBHAMSR] : - enerd->grpp.ener[egLJSR], - enerd->grpp.ener[egCOULSR], - enerd->grpp.ener[egGB],box_size,nrnb, - lambda,dvdlambda,-1,-1,donb_flags); /* If we do foreign lambda and we have soft-core interactions * we have to recalculate the (non-linear) energies contributions. */ if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0) { + wallcycle_sub_start(wcycle, ewcsNONBONDED); init_enerdata(mtop->groups.grps[egcENER].nr,fepvals->n_lambda,&ed_lam); for(i=0; in_lambda; i++) @@ -270,15 +301,18 @@ void do_force_lowlevel(FILE *fplog, gmx_large_int_t step, enerd->enerpart_lambda[i] += ed_lam.term[F_EPOT]; } destroy_enerdata(&ed_lam); + wallcycle_sub_stop(wcycle, ewcsNONBONDED); } where(); /* If we are doing GB, calculate bonded forces and apply corrections * to the solvation forces */ /* MRS: Eventually, many need to include free energy contribution here! */ - if (ir->implicit_solvent) { + if (ir->implicit_solvent) + { calc_gb_forces(cr,md,born,top,atype,x,f,fr,idef, ir->gb_algorithm,ir->sa_algorithm,nrnb,bBornRadii,&pbc,graph,enerd); + wallcycle_sub_stop(wcycle, ewcsBONDED); } #ifdef GMX_MPI @@ -291,11 +325,11 @@ void do_force_lowlevel(FILE *fplog, gmx_large_int_t step, if (fepvals->sc_alpha!=0) { - enerd->dvdl_nonlin[efptVDW] += dvdlambda[efptVDW]; + enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW]; } else { - enerd->dvdl_lin[efptVDW] += dvdlambda[efptVDW]; + enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW]; } if (fepvals->sc_alpha!=0) @@ -303,11 +337,11 @@ void do_force_lowlevel(FILE *fplog, gmx_large_int_t step, /* even though coulomb part is linear, we already added it, beacuse we need to go through the vdw calculation anyway */ { - enerd->dvdl_nonlin[efptCOUL] += dvdlambda[efptCOUL]; + enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL]; } else { - enerd->dvdl_lin[efptCOUL] += dvdlambda[efptCOUL]; + enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL]; } Vsr = 0; @@ -321,7 +355,7 @@ void do_force_lowlevel(FILE *fplog, gmx_large_int_t step, enerd->grpp.ener[egLJSR][i]) + enerd->grpp.ener[egCOULSR][i] + enerd->grpp.ener[egGB][i]; } - dvdlsum = dvdlambda[efptVDW]+dvdlambda[efptCOUL]; + dvdlsum = dvdl_nb[efptVDW] + dvdl_nb[efptCOUL]; PRINT_SEPDVDL("VdW and Coulomb SR particle-p.",Vsr,dvdlsum); } debug_gmx(); @@ -368,9 +402,12 @@ void do_force_lowlevel(FILE *fplog, gmx_large_int_t step, if (flags & GMX_FORCE_BONDED) { GMX_MPE_LOG(ev_calc_bonds_start); + + wallcycle_sub_start(wcycle, ewcsBONDED); calc_bonds(fplog,cr->ms, idef,x,hist,f,fr,&pbc,graph,enerd,nrnb,lambda,md,fcd, DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL, atype, born, + flags, fr->bSepDVDL && do_per_step(step,ir->nstlog),step); /* Check if we have to determine energy differences @@ -401,6 +438,7 @@ void do_force_lowlevel(FILE *fplog, gmx_large_int_t step, } debug_gmx(); GMX_MPE_LOG(ev_calc_bonds_finish); + wallcycle_sub_stop(wcycle, ewcsBONDED); } where(); @@ -420,36 +458,93 @@ void do_force_lowlevel(FILE *fplog, gmx_large_int_t step, if (fr->bEwald) { - if (fr->n_tpi == 0) + Vcorr = 0; + dvdl = 0; + + /* With the Verlet scheme exclusion forces are calculated + * in the non-bonded kernel. + */ + /* The TPI molecule does not have exclusions with the rest + * of the system and no intra-molecular PME grid contributions + * will be calculated in gmx_pme_calc_energy. 
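
/*
 * [Illustrative sketch added by the editor; not part of the patch; the
 * body of reduce_thread_forces() above is garbled in this copy of the
 * diff.]
 * In the threaded Ewald correction below, thread 0 accumulates into the
 * real arrays (fr->f_novirsum, fr->vir_el_recip) while threads t >= 1 use
 * cleared private buffers fr->f_t[t]; afterwards the private buffers are
 * reduced into the main force array. Parallelizing that reduction over
 * atoms (outer loop i, inner loop t) keeps every thread writing to
 * disjoint elements, which is why the reduction itself can run on any
 * number of threads. A sketch with buffer 0 being the output array:
 */
typedef double vec3[3];

static void reduce_thread_vectors(int natoms, vec3 *f,
                                  int nbuf, vec3 **f_buf)
{
    int i, t, d;

#pragma omp parallel for private(t, d) schedule(static)
    for (i = 0; i < natoms; i++)
    {
        for (t = 1; t < nbuf; t++)      /* buffer 0 is f itself here */
        {
            for (d = 0; d < 3; d++)
            {
                f[i][d] += f_buf[t][i][d];
            }
        }
    }
}
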
+ */ + if ((ir->cutoff_scheme == ecutsGROUP && fr->n_tpi == 0) || + ir->ewald_geometry != eewg3D || + ir->epsilon_surface != 0) { - dvdlambda[efptCOUL] = 0; - Vcorr = ewald_LRcorrection(fplog,md->start,md->start+md->homenr, - cr,fr, + int nthreads,t; + + wallcycle_sub_start(wcycle, ewcsEWALD_CORRECTION); + + if (fr->n_tpi > 0) + { + gmx_fatal(FARGS,"TPI with PME currently only works in a 3D geometry with tin-foil boundary conditions"); + } + + nthreads = gmx_omp_nthreads_get(emntBonded); +#pragma omp parallel for num_threads(nthreads) schedule(static) + for(t=0; tf_novirsum; + vir = &fr->vir_el_recip; + Vcorrt = &Vcorr; + dvdlt = &dvdl; + } + else + { + fnv = fr->f_t[t].f; + vir = &fr->f_t[t].vir; + Vcorrt = &fr->f_t[t].Vcorr; + dvdlt = &fr->f_t[t].dvdl[efptCOUL]; + for(i=0; inatoms_force; i++) + { + clear_rvec(fnv[i]); + } + clear_mat(*vir); + } + *dvdlt = 0; + *Vcorrt = + ewald_LRcorrection(fplog, + fr->excl_load[t],fr->excl_load[t+1], + cr,t,fr, md->chargeA, md->nChargePerturbed ? md->chargeB : NULL, + ir->cutoff_scheme != ecutsVERLET, excl,x,bSB ? boxs : box,mu_tot, ir->ewald_geometry, ir->epsilon_surface, - lambda[efptCOUL],&dvdlambda[efptCOUL],&vdip,&vcharge); - PRINT_SEPDVDL("Ewald excl./charge/dip. corr.",Vcorr,dvdlambda); - enerd->dvdl_lin[efptCOUL] += dvdlambda[efptCOUL]; - } - else - { - if (ir->ewald_geometry != eewg3D || ir->epsilon_surface != 0) + fnv,*vir, + lambda[efptCOUL],dvdlt); + } + if (nthreads > 1) { - gmx_fatal(FARGS,"TPI with PME currently only works in a 3D geometry with tin-foil boundary conditions"); + reduce_thread_forces(fr->natoms_force,fr->f_novirsum, + fr->vir_el_recip, + &Vcorr,efptCOUL,&dvdl, + nthreads,fr->f_t); } - /* The TPI molecule does not have exclusions with the rest - * of the system and no intra-molecular PME grid contributions - * will be calculated in gmx_pme_calc_energy. - */ - Vcorr = 0; + + wallcycle_sub_stop(wcycle, ewcsEWALD_CORRECTION); + } + + if (fr->n_tpi == 0) + { + Vcorr += ewald_charge_correction(cr,fr,lambda[efptCOUL],box, + &dvdl,fr->vir_el_recip); } + + PRINT_SEPDVDL("Ewald excl./charge/dip. corr.",Vcorr,dvdl); + enerd->dvdl_lin[efptCOUL] += dvdl; } - dvdlambda[efptCOUL] = 0; status = 0; + dvdl = 0; switch (fr->eeltype) { case eelPME: @@ -467,7 +562,7 @@ void do_force_lowlevel(FILE *fplog, gmx_large_int_t step, { pme_flags |= GMX_PME_CALC_F; } - if (flags & GMX_FORCE_VIRIAL) + if (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) { pme_flags |= GMX_PME_CALC_ENER_VIR; } @@ -486,7 +581,7 @@ void do_force_lowlevel(FILE *fplog, gmx_large_int_t step, DOMAINDECOMP(cr) ? 
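/* Aside: a sketch of the net-charge term that ewald_charge_correction
 * adds above, written in units where 1/(4*pi*eps0) = 1; the prefactor
 * convention is an assumption here, and the real routine additionally
 * handles the virial and dV/dlambda. beta is the Ewald splitting
 * parameter (fr->ewaldcoeff) and vol the box volume. */
#include <math.h>
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

static double net_charge_correction_sk(double qsum, double beta, double vol)
{
    /* Neutralizing-background energy: E = -pi*qsum^2/(2*beta^2*V) */
    return -M_PI*qsum*qsum/(2.0*beta*beta*vol);
}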
dd_pme_maxshift_y(cr->dd) : 0, nrnb,wcycle, fr->vir_el_recip,fr->ewaldcoeff, - &Vlr,lambda[efptCOUL],&dvdlambda[efptCOUL], + &Vlr,lambda[efptCOUL],&dvdl, pme_flags); *cycles_pme = wallcycle_stop(wcycle,ewcPMEMESH); @@ -508,7 +603,7 @@ void do_force_lowlevel(FILE *fplog, gmx_large_int_t step, md->chargeA + md->homenr - fr->n_tpi, &Vlr); } - PRINT_SEPDVDL("PME mesh",Vlr,dvdlambda[efptCOUL]); + PRINT_SEPDVDL("PME mesh",Vlr,dvdl); } else { @@ -522,8 +617,8 @@ void do_force_lowlevel(FILE *fplog, gmx_large_int_t step, md->chargeA,md->chargeB, box_size,cr,md->homenr, fr->vir_el_recip,fr->ewaldcoeff, - lambda[efptCOUL],&dvdlambda[efptCOUL],fr->ewald_table); - PRINT_SEPDVDL("Ewald long-range",Vlr,dvdlambda[efptCOUL]); + lambda[efptCOUL],&dvdl,fr->ewald_table); + PRINT_SEPDVDL("Ewald long-range",Vlr,dvdl); break; default: Vlr = 0; @@ -535,7 +630,7 @@ void do_force_lowlevel(FILE *fplog, gmx_large_int_t step, gmx_fatal(FARGS,"Error %d in long range electrostatics routine %s", status,EELTYPE(fr->eeltype)); } - enerd->dvdl_lin[efptCOUL] += dvdlambda[efptCOUL]; + enerd->dvdl_lin[efptCOUL] += dvdl; enerd->term[F_COUL_RECIP] = Vlr + Vcorr; if (debug) { @@ -549,18 +644,20 @@ void do_force_lowlevel(FILE *fplog, gmx_large_int_t step, { if (EEL_RF(fr->eeltype)) { - dvdlambda[efptCOUL] = 0; - - if (fr->eeltype != eelRF_NEC) + /* With the Verlet scheme exclusion forces are calculated + * in the non-bonded kernel. + */ + if (ir->cutoff_scheme != ecutsVERLET && fr->eeltype != eelRF_NEC) { + dvdl = 0; enerd->term[F_RF_EXCL] = RF_excl_correction(fplog,fr,graph,md,excl,x,f, - fr->fshift,&pbc,lambda[efptCOUL],&dvdlambda[efptCOUL]); + fr->fshift,&pbc,lambda[efptCOUL],&dvdl); } - enerd->dvdl_lin[efptCOUL] += dvdlambda[efptCOUL]; + enerd->dvdl_lin[efptCOUL] += dvdl; PRINT_SEPDVDL("RF exclusion correction", - enerd->term[F_RF_EXCL],dvdlambda[efptCOUL]); + enerd->term[F_RF_EXCL],dvdl); } } where(); diff --git a/src/mdlib/forcerec.c b/src/mdlib/forcerec.c index ee164d229c..82db34d700 100644 --- a/src/mdlib/forcerec.c +++ b/src/mdlib/forcerec.c @@ -39,13 +39,19 @@ #include #include +#include "assert.h" #include "sysstuff.h" #include "typedefs.h" +#include "vec.h" +#include "maths.h" #include "macros.h" #include "smalloc.h" #include "macros.h" +#include "gmx_fatal.h" +#include "gmx_fatal_collective.h" #include "physics.h" #include "force.h" +#include "tables.h" #include "nonbonded.h" #include "invblock.h" #include "names.h" @@ -55,20 +61,26 @@ #include "mshift.h" #include "txtdump.h" #include "coulomb.h" -#include "mdrun.h" +#include "md_support.h" #include "domdec.h" #include "partdec.h" #include "qmmm.h" #include "copyrite.h" #include "mtop_util.h" -#include "gmx_detectcpu.h" +#include "nbnxn_search.h" +#include "nbnxn_consts.h" +#include "statutil.h" +#include "gmx_omp_nthreads.h" #ifdef _MSC_VER /* MSVC definition for __cpuid() */ #include #endif - +#include "types/nbnxn_cuda_types_ext.h" +#include "gpu_utils.h" +#include "nbnxn_cuda_data_mgmt.h" +#include "pmalloc_cuda.h" t_forcerec *mk_forcerec(void) { @@ -517,22 +529,44 @@ check_solvent(FILE * fp, fr->solvent_opt = bestsol; } +enum { acNONE=0, acCONSTRAINT, acSETTLE }; + static cginfo_mb_t *init_cginfo_mb(FILE *fplog,const gmx_mtop_t *mtop, - t_forcerec *fr,gmx_bool bNoSolvOpt) + t_forcerec *fr,gmx_bool bNoSolvOpt, + gmx_bool *bExcl_IntraCGAll_InterCGNone) { const t_block *cgs; const t_blocka *excl; const gmx_moltype_t *molt; const gmx_molblock_t *molb; cginfo_mb_t *cginfo_mb; + gmx_bool *type_VDW; int *cginfo; int cg_offset,a_offset,cgm,am; int 
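/* Aside: the reaction-field constants behind RF_excl_correction here
 * and the k_rf/c_rf fields that init_interaction_const fills in further
 * down. With solvent dielectric eps_rf beyond the cut-off rc, the pair
 * energy is q_i*q_j*(1/r + k_rf*r^2 - c_rf), which vanishes at r = rc.
 * A sketch of the standard formulas: */
static void reaction_field_consts_sk(double rc, double eps_r, double eps_rf,
                                     double *k_rf, double *c_rf)
{
    *k_rf = (eps_rf - eps_r)/((2.0*eps_rf + eps_r)*rc*rc*rc);
    *c_rf = 1.0/rc + (*k_rf)*rc*rc;
}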
mb,m,ncg_tot,cg,a0,a1,gid,ai,j,aj,excl_nalloc; - gmx_bool bId,*bExcl,bExclIntraAll,bExclInter; + int *a_con; + int ftype; + int ia; + gmx_bool bId,*bExcl,bExclIntraAll,bExclInter,bHaveVDW,bHaveQ; ncg_tot = ncg_mtop(mtop); snew(cginfo_mb,mtop->nmolblock); + snew(type_VDW,fr->ntype); + for(ai=0; aintype; ai++) + { + type_VDW[ai] = FALSE; + for(j=0; jntype; j++) + { + type_VDW[ai] = type_VDW[ai] || + fr->bBHAM || + C6(fr->nbfp,fr->ntype,ai,j) != 0 || + C12(fr->nbfp,fr->ntype,ai,j) != 0; + } + } + + *bExcl_IntraCGAll_InterCGNone = TRUE; + excl_nalloc = 10; snew(bExcl,excl_nalloc); cg_offset = 0; @@ -582,6 +616,28 @@ static cginfo_mb_t *init_cginfo_mb(FILE *fplog,const gmx_mtop_t *mtop, snew(cginfo_mb[mb].cginfo,cginfo_mb[mb].cg_mod); cginfo = cginfo_mb[mb].cginfo; + /* Set constraints flags for constrained atoms */ + snew(a_con,molt->atoms.nr); + for(ftype=0; ftypeilist[ftype].nr; ia+=1+nral) + { + int a; + + for(a=0; ailist[ftype].iatoms[ia+1+a]] = + (ftype == F_SETTLE ? acSETTLE : acCONSTRAINT); + } + } + } + } + for(m=0; m<(bId ? 1 : molb->nmol); m++) { cgm = m*cgs->nr; @@ -605,9 +661,19 @@ static cginfo_mb_t *init_cginfo_mb(FILE *fplog,const gmx_mtop_t *mtop, */ bExclIntraAll = TRUE; bExclInter = FALSE; - for(ai=a0; aiatoms.atom[ai].type] || + type_VDW[molt->atoms.atom[ai].typeB]); + bHaveQ = bHaveQ || (molt->atoms.atom[ai].q != 0 || + molt->atoms.atom[ai].qB != 0); + /* Clear the exclusion list for atom ai */ - for(aj=a0; ajnmol*cgs->nr; a_offset += molb->nmol*cgs->index[cgs->nr]; } @@ -710,38 +805,47 @@ static int *cginfo_expand(int nmb,cginfo_mb_t *cgi_mb) static void set_chargesum(FILE *log,t_forcerec *fr,const gmx_mtop_t *mtop) { - double qsum; + double qsum,q2sum,q; int mb,nmol,i; const t_atoms *atoms; - qsum = 0; + qsum = 0; + q2sum = 0; for(mb=0; mbnmolblock; mb++) { nmol = mtop->molblock[mb].nmol; atoms = &mtop->moltype[mtop->molblock[mb].type].atoms; for(i=0; inr; i++) { - qsum += nmol*atoms->atom[i].q; + q = atoms->atom[i].q; + qsum += nmol*q; + q2sum += nmol*q*q; } } - fr->qsum[0] = qsum; + fr->qsum[0] = qsum; + fr->q2sum[0] = q2sum; if (fr->efep != efepNO) { - qsum = 0; + qsum = 0; + q2sum = 0; for(mb=0; mbnmolblock; mb++) { nmol = mtop->molblock[mb].nmol; atoms = &mtop->moltype[mtop->molblock[mb].type].atoms; for(i=0; inr; i++) { - qsum += nmol*atoms->atom[i].qB; + q = atoms->atom[i].qB; + qsum += nmol*q; + q2sum += nmol*q*q; } - fr->qsum[1] = qsum; + fr->qsum[1] = qsum; + fr->q2sum[1] = q2sum; } } else { - fr->qsum[1] = fr->qsum[0]; + fr->qsum[1] = fr->qsum[0]; + fr->q2sum[1] = fr->q2sum[0]; } if (log) { if (fr->efep == efepNO) @@ -1240,6 +1344,458 @@ gmx_bool can_use_allvsall(const t_inputrec *ir, const gmx_mtop_t *mtop, } +static void init_forcerec_f_threads(t_forcerec *fr,int grpp_nener) +{ + int t,i; + + fr->nthreads = gmx_omp_nthreads_get(emntBonded); + + if (fr->nthreads > 1) + { + snew(fr->f_t,fr->nthreads); + /* Thread 0 uses the global force and energy arrays */ + for(t=1; tnthreads; t++) + { + fr->f_t[t].f = NULL; + fr->f_t[t].f_nalloc = 0; + snew(fr->f_t[t].fshift,SHIFTS); + /* snew(fr->f_t[t].ener,F_NRE); */ + fr->f_t[t].grpp.nener = grpp_nener; + for(i=0; if_t[t].grpp.ener[i],grpp_nener); + } + } + } +} + + +static void pick_nbnxn_kernel_cpu(FILE *fp, + const t_commrec *cr, + const gmx_cpuid_t cpuid_info, + int *kernel_type) +{ + *kernel_type = nbk4x4_PlainC; + +#ifdef GMX_X86_SSE2 + { + /* On Intel Sandy-Bridge AVX-256 kernels are always faster. + * On AMD Bulldozer AVX-256 is much slower than AVX-128. 
+ */ + if(gmx_cpuid_feature(cpuid_info, GMX_CPUID_FEATURE_X86_AVX) == 1 && + gmx_cpuid_vendor(cpuid_info) != GMX_CPUID_VENDOR_AMD) + { +#ifdef GMX_X86_AVX_256 + *kernel_type = nbk4xN_X86_SIMD256; +#else + *kernel_type = nbk4xN_X86_SIMD128; +#endif + } + else + { + *kernel_type = nbk4xN_X86_SIMD128; + } + + if (getenv("GMX_NBNXN_AVX128") != NULL) + { + *kernel_type = nbk4xN_X86_SIMD128; + } + if (getenv("GMX_NBNXN_AVX256") != NULL) + { +#ifdef GMX_X86_AVX_256 + *kernel_type = nbk4xN_X86_SIMD256; +#else + gmx_fatal(FARGS,"You requested AVX-256 nbnxn kernels, but GROMACS was built without AVX support"); +#endif + } + } +#endif /* GMX_X86_SSE2 */ +} + +static void pick_nbnxn_kernel(FILE *fp, + const t_commrec *cr, + const gmx_hw_info_t *hwinfo, + gmx_bool use_cpu_acceleration, + gmx_bool *bUseGPU, + int *kernel_type) +{ + gmx_bool bEmulateGPU, bGPU; + char gpu_err_str[STRLEN]; + + assert(kernel_type); + + *kernel_type = nbkNotSet; + /* if bUseGPU == NULL we don't want a GPU (e.g. hybrid mode kernel selection) */ + bGPU = (bUseGPU != NULL) && hwinfo->bCanUseGPU; + + /* Run GPU emulation mode if GMX_EMULATE_GPU is defined or if non-bonded + calculations are turned off via GMX_NO_NONBONDED -- this is the simple way + to turn off GPU/CUDA initializations as well. */ + bEmulateGPU = ((getenv("GMX_EMULATE_GPU") != NULL) || + (getenv("GMX_NO_NONBONDED") != NULL)); + + if (bGPU) + { + if (bEmulateGPU) + { + bGPU = FALSE; + } + else + { + /* Each PP node will use the intra-node id-th device from the + * list of detected/selected GPUs. */ + if (!init_gpu(cr->nodeid_group_intra, gpu_err_str, &hwinfo->gpu_info)) + { + /* At this point the init should never fail as we made sure that + * we have all the GPUs we need. If it still does, we'll bail. */ + gmx_fatal(FARGS, "On node %d failed to initialize GPU #%d: %s", + cr->nodeid, + get_gpu_device_id(&hwinfo->gpu_info, cr->nodeid_group_intra), + gpu_err_str); + } + } + *bUseGPU = bGPU; + } + + if (bEmulateGPU) + { + *kernel_type = nbk8x8x8_PlainC; + + md_print_warn(cr, fp, "Emulating a GPU run on the CPU (slow)"); + } + else if (bGPU) + { + *kernel_type = nbk8x8x8_CUDA; + } + + if (*kernel_type == nbkNotSet) + { + if (use_cpu_acceleration) + { + pick_nbnxn_kernel_cpu(fp,cr,hwinfo->cpuid_info,kernel_type); + } + else + { + *kernel_type = nbk4x4_PlainC; + } + } + + if (fp != NULL) + { + if (MASTER(cr)) + { + fprintf(stderr,"Using %s non-bonded kernels\n", + nbk_name[*kernel_type]); + } + fprintf(fp,"\nUsing %s non-bonded kernels\n\n", + nbk_name[*kernel_type]); + } +} + + +static void init_verlet_ewald_f_table(interaction_const_t *ic, + int verlet_kernel_type) +{ + if (nbnxn_kernel_pairlist_simple(verlet_kernel_type)) + { + /* With a spacing of 0.0005 we are at the force summation accuracy + * for the SSE kernels for "normal" atomistic simulations.
+ */ + ic->tabq_scale = ewald_spline3_table_scale(ic->ewaldcoeff, + ic->rcoulomb); + ic->tabq_size = (int)(ic->rcoulomb*ic->tabq_scale) + 2; +#ifndef GMX_DOUBLE + ic->tabq_format = tableformatFDV0; +#else + ic->tabq_format = tableformatF; +#endif + } + else + { + ic->tabq_size = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE; + /* Subtract 2 iso 1 to avoid access out of range due to rounding */ + ic->tabq_scale = (ic->tabq_size - 2)/ic->rcoulomb; + if (verlet_kernel_type == nbk8x8x8_CUDA) + { + /* This case is handled in the nbnxn CUDA module */ + ic->tabq_format = tableformatNONE; + } + else + { + ic->tabq_format = tableformatF; + } + } + + switch (ic->tabq_format) + { + case tableformatNONE: + break; + case tableformatF: + sfree_aligned(ic->tabq_coul_F); + sfree_aligned(ic->tabq_coul_V); + snew_aligned(ic->tabq_coul_F,ic->tabq_size,16); + snew_aligned(ic->tabq_coul_V,ic->tabq_size,16); + table_spline3_fill_ewald_lr(ic->tabq_coul_F,ic->tabq_coul_V, + ic->tabq_size,ic->tabq_format, + 1/ic->tabq_scale,ic->ewaldcoeff); + break; + case tableformatFDV0: + sfree_aligned(ic->tabq_coul_F); + snew_aligned(ic->tabq_coul_FDV0,ic->tabq_size*4,16); + table_spline3_fill_ewald_lr(ic->tabq_coul_FDV0,NULL, + ic->tabq_size,ic->tabq_format, + 1/ic->tabq_scale,ic->ewaldcoeff); + break; + default: + gmx_incons("Unknown table format"); + } +} + +void init_interaction_const_tables(FILE *fp, + interaction_const_t *ic, + int verlet_kernel_type) +{ + real spacing; + + if (ic->eeltype == eelEWALD || EEL_PME(ic->eeltype)) + { + init_verlet_ewald_f_table(ic,verlet_kernel_type); + + if (fp != NULL) + { + fprintf(fp,"Initialized non-bonded Ewald correction tables, spacing: %.2e size: %d\n\n", + 1/ic->tabq_scale,ic->tabq_size); + } + } +} + +void init_interaction_const(FILE *fp, + interaction_const_t **interaction_const, + const t_forcerec *fr) +{ + interaction_const_t *ic; + gmx_bool shLJ,shCoul; + + shLJ = (getenv("GMX_NO_SHIFT_LJ") == NULL); + shCoul = (getenv("GMX_NO_SHIFT_COUL") == NULL); + + snew(ic, 1); + + ic->rlist = fr->rlist; + + /* Lennard-Jones */ + ic->rvdw = fr->rvdw; + if (shLJ) + { + ic->sh_invrc6 = pow(ic->rvdw,-6.0); + } + else + { + ic->sh_invrc6 = 0; + } + + /* Electrostatics */ + ic->eeltype = fr->eeltype; + ic->rcoulomb = fr->rcoulomb; + ic->epsilon_r = fr->epsilon_r; + ic->epsfac = fr->epsfac; + + /* Ewald */ + ic->ewaldcoeff = fr->ewaldcoeff; + if (shCoul) + { + ic->sh_ewald = gmx_erfc(ic->ewaldcoeff*ic->rcoulomb); + } + else + { + ic->sh_ewald = 0; + } + + /* Reaction-field */ + if (EEL_RF(ic->eeltype)) + { + ic->epsilon_rf = fr->epsilon_rf; + ic->k_rf = fr->k_rf; + ic->c_rf = fr->c_rf; + } + else + { + /* For plain cut-off we might use the reaction-field kernels */ + ic->epsilon_rf = ic->epsilon_r; + ic->k_rf = 0; + if (shCoul) + { + ic->c_rf = 1/ic->rcoulomb; + } + else + { + ic->c_rf = 0; + } + } + + if (fp != NULL) + { + fprintf(fp,"Potential shift: LJ r^-12: %.3f r^-6 %.3f", + sqr(ic->sh_invrc6),ic->sh_invrc6); + if (ic->eeltype == eelCUT) + { + fprintf(fp,", Coulomb %.3f",ic->c_rf); + } + else if (EEL_PME(ic->eeltype)) + { + fprintf(fp,", Ewald %.3e",ic->sh_ewald); + } + fprintf(fp,"\n"); + } + + *interaction_const = ic; + + if (fr->nbv != NULL && fr->nbv->bUseGPU) + { + nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv); + } + + if (fr->cutoff_scheme == ecutsVERLET) + { + assert(fr->nbv != NULL && fr->nbv->grp != NULL); + init_interaction_const_tables(fp,ic,fr->nbv->grp[fr->nbv->ngrp-1].kernel_type); + } +} + +static void init_nb_verlet(FILE *fp, + nonbonded_verlet_t **nb_verlet, + const t_inputrec 
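/* Aside: how a force table built with the tabq_scale set above is
 * consumed, as a minimal linear-interpolation sketch. The kernels in
 * this patch actually use cubic splines, and the FDV0 layout packs F,
 * the F-difference and V together so a single (SIMD) load fetches all
 * of them; tab_F here is a plain force-only table. */
static double table_force_sk(const double *tab_F, double tab_scale, double r)
{
    int    i;
    double rt, frac;

    rt   = r*tab_scale;       /* fractional table index */
    i    = (int)rt;
    frac = rt - (double)i;

    return (1.0 - frac)*tab_F[i] + frac*tab_F[i + 1];
}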
*ir, + const t_forcerec *fr, + const t_commrec *cr, + const char *nbpu_opt) +{ + nonbonded_verlet_t *nbv; + int i; + char *env; + gmx_bool bHybridGPURun = FALSE; + + gmx_nbat_alloc_t *nb_alloc; + gmx_nbat_free_t *nb_free; + + snew(nbv, 1); + + nbv->nbs = NULL; + + nbv->ngrp = (DOMAINDECOMP(cr) ? 2 : 1); + for(i=0; ingrp; i++) + { + nbv->grp[i].nbl_lists.nnbl = 0; + nbv->grp[i].nbat = NULL; + nbv->grp[i].kernel_type = nbkNotSet; + + if (i == 0) /* local */ + { + pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration, + &nbv->bUseGPU, + &nbv->grp[i].kernel_type); + } + else /* non-local */ + { + if (nbpu_opt != NULL && strcmp(nbpu_opt,"gpu_cpu") == 0) + { + /* Use GPU for local, select a CPU kernel for non-local */ + pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration, + NULL, + &nbv->grp[i].kernel_type); + + bHybridGPURun = TRUE; + } + else + { + /* Use the same kernel for local and non-local interactions */ + nbv->grp[i].kernel_type = nbv->grp[0].kernel_type; + } + } + } + + if (nbv->bUseGPU) + { + /* init the NxN GPU data; the last argument tells whether we'll have + * both local and non-local NB calculation on GPU */ + nbnxn_cuda_init(fp, &nbv->cu_nbv, + &fr->hwinfo->gpu_info, cr->nodeid_group_intra, + (nbv->ngrp > 1) && !bHybridGPURun); + + if ((env = getenv("GMX_NB_MIN_CI")) != NULL) + { + char *end; + + nbv->min_ci_balanced = strtol(env, &end, 10); + if (!end || (*end != 0) || nbv->min_ci_balanced <= 0) + { + gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, positive integer required", env); + } + + if (debug) + { + fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n", + nbv->min_ci_balanced); + } + } + else + { + nbv->min_ci_balanced = nbnxn_cuda_min_ci_balanced(nbv->cu_nbv); + if (debug) + { + fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n", + nbv->min_ci_balanced); + } + } + } + else + { + nbv->min_ci_balanced = 0; + } + + *nb_verlet = nbv; + + nbnxn_init_search(&nbv->nbs, + DOMAINDECOMP(cr) ? & cr->dd->nc : NULL, + DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : NULL, + gmx_omp_nthreads_get(emntNonbonded)); + + for(i=0; ingrp; i++) + { + if (nbv->grp[0].kernel_type == nbk8x8x8_CUDA) + { + nb_alloc = &pmalloc; + nb_free = &pfree; + } + else + { + nb_alloc = NULL; + nb_free = NULL; + } + + nbnxn_init_pairlist_set(&nbv->grp[i].nbl_lists, + nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type), + /* 8x8x8 "non-simple" lists are ATM always combined */ + !nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type), + nb_alloc, nb_free); + + if (i == 0 || + nbv->grp[0].kernel_type != nbv->grp[i].kernel_type) + { + snew(nbv->grp[i].nbat,1); + nbnxn_atomdata_init(fp, + nbv->grp[i].nbat, + nbv->grp[i].kernel_type, + fr->ntype,fr->nbfp, + ir->opts.ngener, + nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type) ? 
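/* Aside: the GMX_NB_MIN_CI parsing above follows a strtol-with-end-
 * pointer pattern so that trailing garbage and non-positive values are
 * rejected instead of silently truncated. A self-contained sketch,
 * with gmx_fatal replaced by stderr + exit: */
#include <stdio.h>
#include <stdlib.h>

static int parse_positive_env_sk(const char *name, int fallback)
{
    const char *val = getenv(name);
    char       *end;
    long        n;

    if (val == NULL)
    {
        return fallback;
    }
    n = strtol(val, &end, 10);
    if (*end != '\0' || n <= 0)
    {
        fprintf(stderr, "Invalid value in %s=%s, positive integer required\n",
                name, val);
        exit(1);
    }
    return (int)n;
}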
gmx_omp_nthreads_get(emntNonbonded) : 1, + nb_alloc, nb_free); + } + else + { + nbv->grp[i].nbat = nbv->grp[0].nbat; + } + } +} void init_forcerec(FILE *fp, const output_env_t oenv, @@ -1254,7 +1810,8 @@ void init_forcerec(FILE *fp, const char *tabafn, const char *tabpfn, const char *tabbfn, - gmx_bool bNoSolvOpt, + const char *nbpu_opt, + gmx_bool bNoSolvOpt, real print_force) { int i,j,m,natoms,ngrp,negp_pp,negptable,egi,egj; @@ -1268,15 +1825,8 @@ void init_forcerec(FILE *fp, t_nblists *nbl; int *nm_ind,egp_flags; - gmx_detectcpu(&fr->cpu_information); - if(MASTER(cr)) - { - /* Only print warnings from master */ - gmx_detectcpu_check_acceleration(fr->cpu_information,fp); - } - /* By default we turn acceleration on, but it might be turned off further down... */ - fr->use_acceleration = TRUE; + fr->use_cpu_acceleration = TRUE; fr->bDomDec = DOMAINDECOMP(cr); @@ -1372,6 +1922,16 @@ void init_forcerec(FILE *fp, } } + fr->bNonbonded = TRUE; + if (getenv("GMX_NO_NONBONDED") != NULL) + { + /* turn off non-bonded calculations */ + fr->bNonbonded = FALSE; + md_print_warn(cr,fp, + "Found environment variable GMX_NO_NONBONDED.\n" + "Disabling nonbonded calculations.\n"); + } + bGenericKernelOnly = FALSE; if (getenv("GMX_NB_GENERIC") != NULL) { @@ -1384,15 +1944,15 @@ void init_forcerec(FILE *fp, bGenericKernelOnly = TRUE; bNoSolvOpt = TRUE; } - - if (getenv("GMX_DISABLE_ACCELERATION") != NULL) + + if( (getenv("GMX_DISABLE_CPU_ACCELERATION") != NULL) || (getenv("GMX_NOOPTIMIZEDKERNELS") != NULL) ) { - fr->use_acceleration = FALSE; + fr->use_cpu_acceleration = FALSE; if (fp != NULL) { fprintf(fp, - "\nFound environment variable GMX_DISABLE_ACCELERATION.\n" - "Disabling all architecture-specific (e.g. SSE2/SSE4/AVX) routines.\n\n"); + "\nFound environment variable GMX_DISABLE_CPU_ACCELERATION.\n" + "Disabling all CPU architecture-specific (e.g. SSE2/SSE4/AVX) routines.\n\n"); } } @@ -1401,12 +1961,49 @@ void init_forcerec(FILE *fp, fr->AllvsAll_work = NULL; fr->AllvsAll_workgb = NULL; - - + /* Neighbour searching stuff */ - fr->bGrid = (ir->ns_type == ensGRID); - fr->ePBC = ir->ePBC; - fr->bMolPBC = ir->bPeriodicMols; + fr->cutoff_scheme = ir->cutoff_scheme; + fr->bGrid = (ir->ns_type == ensGRID); + fr->ePBC = ir->ePBC; + + /* Determine if we will do PBC for distances in bonded interactions */ + if (fr->ePBC == epbcNONE) + { + fr->bMolPBC = FALSE; + } + else + { + if (!DOMAINDECOMP(cr)) + { + /* The group cut-off scheme and SHAKE assume charge groups + * are whole, but not using molpbc is faster in most cases. 
+ */ + if (fr->cutoff_scheme == ecutsGROUP || + (ir->eConstrAlg == econtSHAKE && + (gmx_mtop_ftype_count(mtop,F_CONSTR) > 0 || + gmx_mtop_ftype_count(mtop,F_CONSTRNC) > 0))) + { + fr->bMolPBC = ir->bPeriodicMols; + } + else + { + fr->bMolPBC = TRUE; + if (getenv("GMX_USE_GRAPH") != NULL) + { + fr->bMolPBC = FALSE; + if (fp) + { + fprintf(fp,"\nGMX_USE_GRAPH is set, using the graph for bonded interactions\n\n"); + } + } + } + } + else + { + fr->bMolPBC = dd_bonded_molpbc(cr->dd,fr->ePBC); + } + } fr->rc_scaling = ir->refcoord_scaling; copy_rvec(ir->posres_com,fr->posres_com); copy_rvec(ir->posres_comB,fr->posres_comB); @@ -1419,20 +2016,35 @@ void init_forcerec(FILE *fp, fr->bEwald = (EEL_PME(fr->eeltype) || fr->eeltype==eelEWALD); fr->reppow = mtop->ffparams.reppow; - fr->bvdwtab = (fr->vdwtype != evdwCUT || - !gmx_within_tol(fr->reppow,12.0,10*GMX_DOUBLE_EPS)); - fr->bcoultab = (!(fr->eeltype == eelCUT || EEL_RF(fr->eeltype)) || - fr->eeltype == eelRF_ZERO); - - if (getenv("GMX_REQUIRE_TABLES")) + + if (ir->cutoff_scheme == ecutsGROUP) { - fr->bvdwtab = TRUE; - fr->bcoultab = TRUE; + fr->bvdwtab = (fr->vdwtype != evdwCUT || + !gmx_within_tol(fr->reppow,12.0,10*GMX_DOUBLE_EPS)); + fr->bcoultab = (!(fr->eeltype == eelCUT || EEL_RF(fr->eeltype)) || + fr->eeltype == eelRF_ZERO); + + if (getenv("GMX_REQUIRE_TABLES")) + { + fr->bvdwtab = TRUE; + fr->bcoultab = TRUE; + } + + if (fp) + { + fprintf(fp,"Table routines are used for coulomb: %s\n",bool_names[fr->bcoultab]); + fprintf(fp,"Table routines are used for vdw: %s\n",bool_names[fr->bvdwtab ]); + } } - - if (fp) { - fprintf(fp,"Table routines are used for coulomb: %s\n",bool_names[fr->bcoultab]); - fprintf(fp,"Table routines are used for vdw: %s\n",bool_names[fr->bvdwtab ]); + + if (ir->cutoff_scheme == ecutsVERLET) + { + if (!gmx_within_tol(fr->reppow,12.0,10*GMX_DOUBLE_EPS)) + { + gmx_fatal(FARGS,"Cut-off scheme %s only supports LJ repulsion power 12",ecutscheme_names[ir->cutoff_scheme]); + } + fr->bvdwtab = FALSE; + fr->bcoultab = FALSE; } /* Tables are used for direct ewald sum */ @@ -1500,16 +2112,8 @@ void init_forcerec(FILE *fp, (fr->adress_icor != eAdressICOff) ); - /* Mask that says whether or not this NBF list should be computed */ - /* if (fr->bMask == NULL) { - ngrp = ir->opts.ngener*ir->opts.ngener; - snew(fr->bMask,ngrp);*/ - /* Defaults to always */ - /* for(i=0; (ibMask[i] = TRUE; - }*/ - - if (ncg_mtop(mtop) > fr->cg_nalloc && !DOMAINDECOMP(cr)) { + if (fr->cutoff_scheme == ecutsGROUP && + ncg_mtop(mtop) > fr->cg_nalloc && !DOMAINDECOMP(cr)) { /* Count the total number of charge groups */ fr->cg_nalloc = ncg_mtop(mtop); srenew(fr->cg_cm,fr->cg_nalloc); @@ -1753,11 +2357,6 @@ void init_forcerec(FILE *fp, fprintf(debug,"No fcdata or table file name passed, can not read table, can not do bonded interactions\n"); } - if (ir->eDispCorr != edispcNO) - { - calc_enervirdiff(fp,ir->eDispCorr,fr); - } - /* QM/MM initialization if requested */ if (ir->bQMMM) { @@ -1769,8 +2368,8 @@ void init_forcerec(FILE *fp, fr->qr = mk_QMMMrec(); /* Set all the static charge group info */ - fr->cginfo_mb = init_cginfo_mb(fp,mtop,fr,bNoSolvOpt); - + fr->cginfo_mb = init_cginfo_mb(fp,mtop,fr,bNoSolvOpt, + &fr->bExcl_IntraCGAll_InterCGNone); if (DOMAINDECOMP(cr)) { fr->cginfo = NULL; } else { @@ -1797,10 +2396,38 @@ void init_forcerec(FILE *fp, /* Initialize neighbor search */ init_ns(fp,cr,&fr->ns,fr,mtop,box); - if (cr->duty & DUTY_PP){ + if (cr->duty & DUTY_PP) + { gmx_setup_kernels(fp,fr,bGenericKernelOnly); if (ir->bAdress) + {
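/* Aside: forcerec_set_excl_load, added at the end of this file's diff,
 * hands each OpenMP thread a contiguous range of exclusion-list entries
 * carrying roughly equal work. The core idea is a running sum checked
 * against integer targets, sketched here for generic per-item costs
 * (bound must have nthreads+1 entries): */
static void balance_ranges_sk(int nitems, const int *cost,
                              int nthreads, int *bound)
{
    int ntot = 0, i, t, n, ntarget;

    for (i = 0; i < nitems; i++)
    {
        ntot += cost[i];
    }

    bound[0] = 0;
    i = 0;
    n = 0;
    for (t = 1; t <= nthreads; t++)
    {
        /* (ntot*t)/nthreads spreads the rounding evenly over threads */
        ntarget = (ntot*t)/nthreads;
        while (i < nitems && n < ntarget)
        {
            n += cost[i++];
        }
        bound[t] = i;
    }
}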
gmx_setup_adress_kernels(fp,bGenericKernelOnly); + } + } + + /* Initialize the thread working data for bonded interactions */ + init_forcerec_f_threads(fr,mtop->groups.grps[egcENER].nr); + + snew(fr->excl_load,fr->nthreads+1); + + if (fr->cutoff_scheme == ecutsVERLET) + { + if (ir->rcoulomb != ir->rvdw) + { + gmx_fatal(FARGS,"With Verlet lists rcoulomb and rvdw should be identical"); + } + + init_nb_verlet(fp, &fr->nbv, ir, fr, cr, nbpu_opt); + + /* initialize interaction constants + * TODO should be moved out during modularization. + */ + init_interaction_const(fp, &fr->ic, fr); + } + + if (ir->eDispCorr != edispcNO) + { + calc_enervirdiff(fp,ir->eDispCorr,fr); } } @@ -1826,3 +2453,56 @@ void pr_forcerec(FILE *fp,t_forcerec *fr,t_commrec *cr) fflush(fp); } + +void forcerec_set_excl_load(t_forcerec *fr, + const gmx_localtop_t *top,const t_commrec *cr) +{ + const int *ind,*a; + int t,i,j,ntot,n,ntarget; + + if (cr != NULL && PARTDECOMP(cr)) + { + /* No OpenMP with particle decomposition */ + pd_at_range(cr, + &fr->excl_load[0], + &fr->excl_load[1]); + + return; + } + + ind = top->excls.index; + a = top->excls.a; + + ntot = 0; + for(i=0; iexcls.nr; i++) + { + for(j=ind[i]; j i) + { + ntot++; + } + } + } + + fr->excl_load[0] = 0; + n = 0; + i = 0; + for(t=1; t<=fr->nthreads; t++) + { + ntarget = (ntot*t)/fr->nthreads; + while(i < top->excls.nr && n < ntarget) + { + for(j=ind[i]; j i) + { + n++; + } + } + i++; + } + fr->excl_load[t] = i; + } +} + diff --git a/src/mdlib/gmx_wallcycle.c b/src/mdlib/gmx_wallcycle.c index b62f921c2a..129fe5fc95 100644 --- a/src/mdlib/gmx_wallcycle.c +++ b/src/mdlib/gmx_wallcycle.c @@ -51,6 +51,12 @@ #include "tmpi.h" #endif +/* DEBUG_WCYCLE adds consistency checking for the counters. + * It checks if you stop a counter different from the last + * one that was opened and if you do nest too deep. + */ +/* #define DEBUG_WCYCLE */ + typedef struct { int n; @@ -66,25 +72,50 @@ typedef struct gmx_wallcycle gmx_bool wc_barrier; wallcc_t *wcc_all; int wc_depth; +#ifdef DEBUG_WCYCLE +#define DEPTH_MAX 6 + int counterlist[DEPTH_MAX]; + int count_depth; +#endif int ewc_prev; gmx_cycles_t cycle_prev; gmx_large_int_t reset_counters; #ifdef GMX_MPI MPI_Comm mpi_comm_mygroup; #endif - int omp_nthreads; + int nthreads_pp; + int nthreads_pme; +#ifdef GMX_CYCLE_SUBCOUNTERS + wallcc_t *wcsc; +#endif + double *cycles_sum; } gmx_wallcycle_t_t; /* Each name should not exceed 19 characters */ static const char *wcn[ewcNR] = -{ "Run", "Step", "PP during PME", "Domain decomp.", "DD comm. load", "DD comm. bounds", "Vsite constr.", "Send X to PME", "Comm. coord.", "Neighbor search", "Born radii", "Force", "Wait + Comm. F", "PME mesh", "PME redist. X/F", "PME spread/gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve", "Wait + Comm. X/F", "Wait + Recv. PME F", "Vsite spread", "Write traj.", "Update", "Constraints", "Comm. energies", "Enforced rotation", "Add rot. forces", "Test" }; +{ "Run", "Step", "PP during PME", "Domain decomp.", "DD comm. load", + "DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.", + "Comm. coord.", "Born radii", "Force", "Wait + Comm. F", "PME mesh", + "PME redist. X/F", "PME spread/gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve", + "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "NB X/F buffer ops.", + "Vsite spread", "Write traj.", "Update", "Constraints", "Comm. energies", + "Enforced rotation", "Add rot. 
forces", "Test" }; + +static const char *wcsn[ewcsNR] = +{ "DD redist.", "DD NS grid + sort", "DD setup comm.", + "DD make top.", "DD make constr.", "DD top. other", + "NS grid local", "NS grid non-loc.", "NS search local", "NS search non-loc.", + "Bonded F", "Nonbonded F", "Ewald F correction", + "NB X buffer ops.", "NB F buffer ops." +}; gmx_bool wallcycle_have_counter(void) { return gmx_cycles_have_counter(); } -gmx_wallcycle_t wallcycle_init(FILE *fplog,int resetstep,t_commrec *cr, int omp_nthreads) +gmx_wallcycle_t wallcycle_init(FILE *fplog,int resetstep,t_commrec *cr, + int nthreads_pp, int nthreads_pme) { gmx_wallcycle_t wc; @@ -96,12 +127,14 @@ gmx_wallcycle_t wallcycle_init(FILE *fplog,int resetstep,t_commrec *cr, int omp_ snew(wc,1); - wc->wc_barrier = FALSE; - wc->wcc_all = NULL; - wc->wc_depth = 0; - wc->ewc_prev = -1; - wc->reset_counters = resetstep; - wc->omp_nthreads = omp_nthreads; + wc->wc_barrier = FALSE; + wc->wcc_all = NULL; + wc->wc_depth = 0; + wc->ewc_prev = -1; + wc->reset_counters = resetstep; + wc->nthreads_pp = nthreads_pp; + wc->nthreads_pme = nthreads_pme; + wc->cycles_sum = NULL; #ifdef GMX_MPI if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != NULL) @@ -118,20 +151,48 @@ gmx_wallcycle_t wallcycle_init(FILE *fplog,int resetstep,t_commrec *cr, int omp_ snew(wc->wcc,ewcNR); if (getenv("GMX_CYCLE_ALL") != NULL) { -/*#ifndef GMX_THREAD_MPI*/ if (fplog) { fprintf(fplog,"\nWill time all the code during the run\n\n"); } snew(wc->wcc_all,ewcNR*ewcNR); -/*#else*/ - gmx_fatal(FARGS, "GMX_CYCLE_ALL is incompatible with threaded code"); -/*#endif*/ } - + +#ifdef GMX_CYCLE_SUBCOUNTERS + snew(wc->wcsc,ewcsNR); +#endif + +#ifdef DEBUG_WCYCLE + wc->count_depth = 0; +#endif + return wc; } +void wallcycle_destroy(gmx_wallcycle_t wc) +{ + if (wc == NULL) + { + return; + } + + if (wc->wcc != NULL) + { + sfree(wc->wcc); + } + if (wc->wcc_all != NULL) + { + sfree(wc->wcc_all); + } +#ifdef GMX_CYCLE_SUBCOUNTERS + if (wc->wcsc != NULL) + { + sfree(wc->wcsc); + } +#endif + sfree(wc); +} + static void wallcycle_all_start(gmx_wallcycle_t wc,int ewc,gmx_cycles_t cycle) { wc->ewc_prev = ewc; @@ -144,6 +205,39 @@ static void wallcycle_all_stop(gmx_wallcycle_t wc,int ewc,gmx_cycles_t cycle) wc->wcc_all[wc->ewc_prev*ewcNR+ewc].c += cycle - wc->cycle_prev; } + +#ifdef DEBUG_WCYCLE +static void debug_start_check(gmx_wallcycle_t wc, int ewc) +{ + /* fprintf(stderr,"wcycle_start depth %d, %s\n",wc->count_depth,wcn[ewc]); */ + + if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX) + { + gmx_fatal(FARGS,"wallcycle counter depth out of range: %d", + wc->count_depth); + } + wc->counterlist[wc->count_depth] = ewc; + wc->count_depth++; +} + +static void debug_stop_check(gmx_wallcycle_t wc, int ewc) +{ + wc->count_depth--; + + /* fprintf(stderr,"wcycle_stop depth %d, %s\n",wc->count_depth,wcn[ewc]); */ + + if (wc->count_depth < 0) + { + gmx_fatal(FARGS,"wallcycle counter depth out of range when stopping %s: %d",wcn[ewc],wc->count_depth); + } + if (wc->counterlist[wc->count_depth] != ewc) + { + gmx_fatal(FARGS,"wallcycle mismatch at stop, start %s, stop %s", + wcn[wc->counterlist[wc->count_depth]],wcn[ewc]); + } +} +#endif + void wallcycle_start(gmx_wallcycle_t wc, int ewc) { gmx_cycles_t cycle; @@ -160,6 +254,10 @@ void wallcycle_start(gmx_wallcycle_t wc, int ewc) } #endif +#ifdef DEBUG_WCYCLE + debug_start_check(wc,ewc); +#endif + cycle = gmx_cycles_read(); wc->wcc[ewc].start = cycle; if (wc->wcc_all != NULL) @@ -176,6 +274,17 @@ void wallcycle_start(gmx_wallcycle_t wc, int ewc) } } +void 
wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc) +{ + if (wc == NULL) + { + return; + } + + wallcycle_start(wc, ewc); + wc->wcc[ewc].n--; +} + double wallcycle_stop(gmx_wallcycle_t wc, int ewc) { gmx_cycles_t cycle,last; @@ -191,6 +300,10 @@ double wallcycle_stop(gmx_wallcycle_t wc, int ewc) MPI_Barrier(wc->mpi_comm_mygroup); } #endif + +#ifdef DEBUG_WCYCLE + debug_stop_check(wc,ewc); +#endif cycle = gmx_cycles_read(); last = cycle - wc->wcc[ewc].start; @@ -225,36 +338,76 @@ void wallcycle_reset_all(gmx_wallcycle_t wc) { wc->wcc[i].n = 0; wc->wcc[i].c = 0; - wc->wcc[i].start = 0; - wc->wcc[i].last = 0; } + if (wc->wcc_all) + { + for(i=0; iwcc_all[i].n = 0; + wc->wcc_all[i].c = 0; + } + } +#ifdef GMX_CYCLE_SUBCOUNTERS + for (i=0; iwcsc[i].n = 0; + wc->wcsc[i].c = 0; + } +#endif +} + +static gmx_bool is_pme_counter(int ewc) +{ + return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM); } -static gmx_bool pme_subdivision(int ewc) +static gmx_bool is_pme_subcounter(int ewc) { - return (ewc >= ewcPME_REDISTXF && ewc <= ewcPME_SOLVE); + return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM); } -void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc,double cycles[]) +void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc) { wallcc_t *wcc; - double cycles_n[ewcNR],buf[ewcNR],*cyc_all,*buf_all; - int i; + double *cycles; + double cycles_n[ewcNR+ewcsNR],buf[ewcNR+ewcsNR],*cyc_all,*buf_all; + int i,j; + int nsum; if (wc == NULL) { return; } + snew(wc->cycles_sum,ewcNR+ewcsNR); + cycles = wc->cycles_sum; + wcc = wc->wcc; - if (wc->omp_nthreads>1) + for(i=0; iduty == DUTY_PME)) { - if (pme_subdivision(i) || i==ewcPMEMESH || (i==ewcRUN && cr->duty == DUTY_PME)) + wcc[i].c *= wc->nthreads_pme; + + if (wc->wcc_all) { - wcc[i].c *= wc->omp_nthreads; + for(j=0; jwcc_all[i*ewcNR+j].c *= wc->nthreads_pme; + } + } + } + else + { + wcc[i].c *= wc->nthreads_pp; + + if (wc->wcc_all) + { + for(j=0; jwcc_all[i*ewcNR+j].c *= wc->nthreads_pp; + } } } } @@ -296,19 +449,36 @@ void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc,double cycles[]) cycles_n[i] = (double)wcc[i].n; cycles[i] = (double)wcc[i].c; } + nsum = ewcNR; +#ifdef GMX_CYCLE_SUBCOUNTERS + for(i=0; iwcsc[i].c *= wc->nthreads_pp; + cycles_n[ewcNR+i] = (double)wc->wcsc[i].n; + cycles[ewcNR+i] = (double)wc->wcsc[i].c; + } + nsum += ewcsNR; +#endif #ifdef GMX_MPI if (cr->nnodes > 1) { - MPI_Allreduce(cycles_n,buf,ewcNR,MPI_DOUBLE,MPI_MAX, + MPI_Allreduce(cycles_n,buf,nsum,MPI_DOUBLE,MPI_MAX, cr->mpi_comm_mysim); for(i=0; iwcsc[i].n = (int)(buf[ewcNR+i] + 0.5); + } +#endif + + MPI_Allreduce(cycles,buf,nsum,MPI_DOUBLE,MPI_SUM, cr->mpi_comm_mysim); - for(i=0; i 0) { if (n > 0) { sprintf(num,"%10d",n); + if (nthreads < 0) + sprintf(thstr, "N/A"); + else + sprintf(thstr, "%4d", nthreads); } else { sprintf(num," "); + sprintf(thstr, " "); } - fprintf(fplog," %-19s %4d %10s %12.3f %10.1f %5.1f\n", - name,nnodes,num,c*1e-9,c*c2t,100*c/tot); + wallt = c*c2t*nnodes_tot/(double)nnodes; + fprintf(fplog," %-19s %4d %4s %10s %10.3f %12.3f %5.1f\n", + name,nnodes,thstr,num,wallt,c*1e-9,100*c/tot); } } +static void print_gputimes(FILE *fplog, const char *name, + int n, double t, double tot_t) +{ + char num[11]; + char avg_perf[11]; + + if (n > 0) + { + sprintf(num, "%10d", n); + sprintf(avg_perf, "%10.3f", t/n); + } + else + { + sprintf(num," "); + sprintf(avg_perf," "); + } + if (t != tot_t) + { + fprintf(fplog, " %-29s %10s%12.3f %s %5.1f\n", + name, num, t/1000, avg_perf, 100 * t/tot_t); + } + else + { + fprintf(fplog, " %-29s %10s%12.3f %s %5.1f\n", + name, "", t/1000, 
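/* Aside: the summation model in wallcycle_sum above, minimally. A
 * rank's counter holds the cycles of one thread, so it is first scaled
 * by the thread count of its rank class (PP or PME) and then summed
 * over ranks. Plain-MPI sketch with simplified bookkeeping and no
 * error handling: */
#include <assert.h>
#include <mpi.h>

static void sum_cycles_sk(double *cycles, int ncounters,
                          int nthreads_this_rank, MPI_Comm comm)
{
    double buf[64];
    int    i;

    assert(ncounters <= 64);
    for (i = 0; i < ncounters; i++)
    {
        cycles[i] *= nthreads_this_rank; /* count all threads on this rank */
    }
    MPI_Allreduce(cycles, buf, ncounters, MPI_DOUBLE, MPI_SUM, comm);
    for (i = 0; i < ncounters; i++)
    {
        cycles[i] = buf[i];
    }
}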
avg_perf, 100.0); + } +} void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime, - gmx_wallcycle_t wc, double cycles[]) + gmx_wallcycle_t wc, wallclock_gpu_t *gpu_t) { - double c2t,tot,sum; - int i,j,npp; + double *cycles; + double c2t,tot,tot_gpu,tot_cpu_overlap,gpu_cpu_ratio,sum,tot_k; + int i,j,npp,nth_pp,nth_pme; char buf[STRLEN]; - const char *myline = "-----------------------------------------------------------------------"; + const char *hline = "-----------------------------------------------------------------------------"; if (wc == NULL) { return; } + nth_pp = wc->nthreads_pp; + nth_pme = wc->nthreads_pme; + + cycles = wc->cycles_sum; + if (npme > 0) { npp = nnodes - npme; @@ -378,32 +590,29 @@ void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime, npme = nnodes; } tot = cycles[ewcRUN]; - /* PME part has to be multiplied with number of threads */ - if (npme == 0) - { - tot += cycles[ewcPMEMESH]*(wc->omp_nthreads-1); - } + /* Conversion factor from cycles to seconds */ if (tot > 0) { - c2t = (npp+npme*wc->omp_nthreads)*realtime/tot; + c2t = realtime/tot; } else { - c2t = 0; + c2t = 0; } fprintf(fplog,"\n R E A L C Y C L E A N D T I M E A C C O U N T I N G\n\n"); - fprintf(fplog," Computing: Nodes Number G-Cycles Seconds %c\n",'%'); - fprintf(fplog,"%s\n",myline); + fprintf(fplog," Computing: Nodes Th. Count Wall t (s) G-Cycles %c\n",'%'); + fprintf(fplog,"%s\n",hline); sum = 0; for(i=ewcPPDURINGPME+1; iwcc[i].n,cycles[i],tot); sum += cycles[i]; } @@ -418,32 +627,157 @@ void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime, buf[9] = ' '; sprintf(buf+10,"%-9s",wcn[j]); buf[19] = '\0'; - print_cycles(fplog,c2t,buf, - (i==ewcPMEMESH || i==ewcPMEWAITCOMM) ? npme : npp, + print_cycles(fplog,c2t,buf,nnodes, + is_pme_counter(i) ? npme : npp, + is_pme_counter(i) ? nth_pme : nth_pp, wc->wcc_all[i*ewcNR+j].n, wc->wcc_all[i*ewcNR+j].c, tot); } } } - print_cycles(fplog,c2t,"Rest",npp,0,tot-sum,tot); - fprintf(fplog,"%s\n",myline); - print_cycles(fplog,c2t,"Total",nnodes,0,tot,tot); - fprintf(fplog,"%s\n",myline); + print_cycles(fplog,c2t,"Rest",npp,npp,-1,0,tot-sum,tot); + fprintf(fplog,"%s\n",hline); + print_cycles(fplog,c2t,"Total",nnodes,nnodes,-1,0,tot,tot); + fprintf(fplog,"%s\n",hline); if (wc->wcc[ewcPMEMESH].n > 0) { - fprintf(fplog,"%s\n",myline); + fprintf(fplog,"%s\n",hline); for(i=ewcPPDURINGPME+1; i=ewcPMEMESH && i<=ewcPME_SOLVE) ? npme : npp, + print_cycles(fplog,c2t,wcn[i],nnodes, + is_pme_counter(i) ? npme : npp, + is_pme_counter(i) ? 
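/* Aside: the revised cycles-to-time conversion used by wallcycle_print
 * above. Since counters are now summed over all ranks and threads, one
 * factor c2t = realtime/total_cycles converts cycles to seconds, and
 * the wall time of a counter that ran on only nnodes of nnodes_tot
 * ranks is scaled back up. The same arithmetic, in isolation: */
static double counter_wall_time_sk(double cycles, double tot_cycles,
                                   double realtime,
                                   int nnodes, int nnodes_tot)
{
    double c2t = (tot_cycles > 0) ? realtime/tot_cycles : 0.0;

    return cycles*c2t*(double)nnodes_tot/(double)nnodes;
}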
nth_pme : nth_pp, wc->wcc[i].n,cycles[i],tot); } } - fprintf(fplog,"%s\n",myline); + fprintf(fplog,"%s\n",hline); + } + +#ifdef GMX_CYCLE_SUBCOUNTERS + fprintf(fplog,"%s\n",hline); + for(i=0; iwcsc[i].n,cycles[ewcNR+i],tot); + } + fprintf(fplog,"%s\n",hline); +#endif + + /* print GPU timing summary */ + if (gpu_t) + { + const char *k_log_str[2][2] = { + {"Nonbonded F kernel", "Nonbonded F+ene k."}, + {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}}; + + tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t; + + /* add up the kernel timings */ + tot_k = 0.0; + for (i = 0; i < 2; i++) + { + for(j = 0; j < 2; j++) + { + tot_k += gpu_t->ktime[i][j].t; + } + } + tot_gpu += tot_k; + + tot_cpu_overlap = wc->wcc[ewcFORCE].c; + if (wc->wcc[ewcPMEMESH].n > 0) + { + tot_cpu_overlap += wc->wcc[ewcPMEMESH].c; + } + tot_cpu_overlap *= c2t * 1000; /* convert s to ms */ + + fprintf(fplog, "\n GPU timings\n%s\n", hline); + fprintf(fplog," Computing: Count Wall t (s) ms/step %c\n",'%'); + fprintf(fplog, "%s\n", hline); + print_gputimes(fplog, "Pair list H2D", + gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu); + print_gputimes(fplog, "X / q H2D", + gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu); + + for (i = 0; i < 2; i++) + { + for(j = 0; j < 2; j++) + { + if (gpu_t->ktime[i][j].c) + { + print_gputimes(fplog, k_log_str[i][j], + gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu); + } + } + } + + print_gputimes(fplog, "F D2H", gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu); + fprintf(fplog, "%s\n", hline); + print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu); + fprintf(fplog, "%s\n", hline); + + gpu_cpu_ratio = tot_gpu/tot_cpu_overlap; + fprintf(fplog, "\n Force evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n", + tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n, + gpu_cpu_ratio); + + /* only print notes related to CPU-GPU load balance with PME */ + if (wc->wcc[ewcPMEMESH].n > 0) + { + fprintf(fplog, "For optimal performance this ratio should be close to 1!\n"); + + /* print note if the imbalance is high with PME case in which + * CPU-GPU load balancing is possible */ + if (gpu_cpu_ratio < 0.75 || gpu_cpu_ratio > 1.2) + { + if (gpu_cpu_ratio < 0.75) + { + sprintf(buf, "NOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n" + " performance loss, consider turning on PME tuning (-tunepme)."); + } + if (gpu_cpu_ratio > 1.2) + { + sprintf(buf, "NOTE: The GPU has >20%% more load than the CPU. 
This imbalance causes\n" + " performance loss, consider using a shorter cut-off."); + } + if (fplog) + { + fprintf(fplog,"\n%s\n",buf); + } + fprintf(stderr,"\n\n%s\n",buf); + } + } + } + + if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 && + (cycles[ewcDOMDEC] > tot*0.1 || + cycles[ewcNS] > tot*0.1)) + { + if (wc->wcc[ewcDOMDEC].n == 0) + { + sprintf(buf, + "NOTE: %d %% of the run time was spent in pair search,\n" + " you might want to increase nstlist (this has no effect on accuracy)\n", + (int)(100*cycles[ewcNS]/tot+0.5)); + } + else + { + sprintf(buf, + "NOTE: %d %% of the run time was spent in domain decomposition,\n" + " %d %% of the run time was spent in pair search,\n" + " you might want to increase nstlist (this has no effect on accuracy)\n", + (int)(100*cycles[ewcDOMDEC]/tot+0.5), + (int)(100*cycles[ewcNS]/tot+0.5)); + } + if (fplog) + { + fprintf(fplog,"\n%s\n",buf); + } + /* Only the sim master calls this function, so always print to stderr */ + fprintf(stderr,"\n%s\n",buf); } if (cycles[ewcMoveE] > tot*0.05) @@ -478,3 +812,24 @@ extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, gmx_large_int_t reset_ wc->reset_counters = reset_counters; } + +#ifdef GMX_CYCLE_SUBCOUNTERS + +void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs) +{ + if (wc != NULL) + { + wc->wcsc[ewcs].start = gmx_cycles_read(); + } +} + +void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs) +{ + if (wc != NULL) + { + wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start; + wc->wcsc[ewcs].n++; + } +} + +#endif /* GMX_CYCLE_SUBCOUNTERS */ diff --git a/src/mdlib/groupcoord.h b/src/mdlib/groupcoord.h index 79d107dcc3..989068a9f8 100644 --- a/src/mdlib/groupcoord.h +++ b/src/mdlib/groupcoord.h @@ -51,6 +51,7 @@ #include #include "typedefs.h" +#include "types/commrec.h" /*! \brief Select local atoms of a group. diff --git a/src/mdlib/iteratedconstraints.c b/src/mdlib/iteratedconstraints.c index 4fcf8a7e75..6a4a21532f 100644 --- a/src/mdlib/iteratedconstraints.c +++ b/src/mdlib/iteratedconstraints.c @@ -37,9 +37,12 @@ #include #endif +#include #include "typedefs.h" #include "gmx_fatal.h" #include "mdrun.h" +#include "md_support.h" +#include "types/iteratedconstraints.h" #ifdef GMX_DOUBLE #define CONVERGEITER 0.000000001 @@ -96,7 +99,6 @@ gmx_bool done_iterating(const t_commrec *cr,FILE *fplog, int nsteps, gmx_iterate practice. Generally, 3-5 iterations will be sufficient */ real relerr,err,xmin; - char buf[256]; int i; gmx_bool incycle; @@ -200,8 +202,7 @@ gmx_bool done_iterating(const t_commrec *cr,FILE *fplog, int nsteps, gmx_iterate /* how many close calls have we had? If less than a few, we're OK */ if (iterate->num_close < MAX_NUMBER_CLOSE) { - sprintf(buf,"Slight numerical convergence deviation with NPT at step %d, relative error only %10.5g, likely not a problem, continuing\n",nsteps,relerr); - md_print_warning(cr,fplog,buf); + md_print_warn(cr,fplog,"Slight numerical convergence deviation with NPT at step %d, relative error only %10.5g, likely not a problem, continuing\n",nsteps,relerr); iterate->num_close++; return TRUE; /* if more than a few, check the total fraction. If too high, die. */ diff --git a/src/mdlib/md_support.c b/src/mdlib/md_support.c index 4901aaf68c..5c098bbdfd 100644 --- a/src/mdlib/md_support.c +++ b/src/mdlib/md_support.c @@ -46,6 +46,8 @@ #include "gmx_wallcycle.h" #include "vcm.h" #include "nrnb.h" +#include "md_logging.h" +#include "md_support.h" /* Is the signal in one simulation independent of other simulations? 
*/ gmx_bool gs_simlocal[eglsNR] = { TRUE, FALSE, FALSE, TRUE }; @@ -499,14 +501,12 @@ void check_nst_param(FILE *fplog,t_commrec *cr, const char *desc_nst,int nst, const char *desc_p,int *p) { - char buf[STRLEN]; - if (*p > 0 && *p % nst != 0) { /* Round up to the next multiple of nst */ *p = ((*p)/nst + 1)*nst; - sprintf(buf,"NOTE: %s changes %s to %d\n",desc_nst,desc_p,*p); - md_print_warning(cr,fplog,buf); + md_print_warn(cr,fplog, + "NOTE: %s changes %s to %d\n",desc_nst,desc_p,*p); } } @@ -589,35 +589,7 @@ void set_current_lambdas(gmx_large_int_t step, t_lambda *fepvals, gmx_bool bReru } } -void reset_all_counters(FILE *fplog,t_commrec *cr, - gmx_large_int_t step, - gmx_large_int_t *step_rel,t_inputrec *ir, - gmx_wallcycle_t wcycle,t_nrnb *nrnb, - gmx_runtime_t *runtime) -{ - char buf[STRLEN],sbuf[STEPSTRSIZE]; - - /* Reset all the counters related to performance over the run */ - sprintf(buf,"Step %s: resetting all time and cycle counters\n", - gmx_step_str(step,sbuf)); - md_print_warning(cr,fplog,buf); - - wallcycle_stop(wcycle,ewcRUN); - wallcycle_reset_all(wcycle); - if (DOMAINDECOMP(cr)) - { - reset_dd_statistics_counters(cr->dd); - } - init_nrnb(nrnb); - ir->init_step += *step_rel; - ir->nsteps -= *step_rel; - *step_rel = 0; - wallcycle_start(wcycle,ewcRUN); - runtime_start(runtime); - print_date_and_time(fplog,cr->nodeid,"Restarted time",runtime); -} - -void min_zero(int *n,int i) +static void min_zero(int *n,int i) { if (i > 0 && (*n == 0 || i < *n)) { @@ -625,7 +597,7 @@ void min_zero(int *n,int i) } } -int lcd4(int i1,int i2,int i3,int i4) +static int lcd4(int i1,int i2,int i3,int i4) { int nst; @@ -653,8 +625,6 @@ int lcd4(int i1,int i2,int i3,int i4) int check_nstglobalcomm(FILE *fplog,t_commrec *cr, int nstglobalcomm,t_inputrec *ir) { - char buf[STRLEN]; - if (!EI_DYNAMICS(ir->eI)) { nstglobalcomm = 1; @@ -690,8 +660,7 @@ int check_nstglobalcomm(FILE *fplog,t_commrec *cr, nstglobalcomm > ir->nstlist && nstglobalcomm % ir->nstlist != 0) { nstglobalcomm = (nstglobalcomm / ir->nstlist)*ir->nstlist; - sprintf(buf,"WARNING: nstglobalcomm is larger than nstlist, but not a multiple, setting it to %d\n",nstglobalcomm); - md_print_warning(cr,fplog,buf); + md_print_warn(cr,fplog,"WARNING: nstglobalcomm is larger than nstlist, but not a multiple, setting it to %d\n",nstglobalcomm); } if (ir->nstcalcenergy > 0) { @@ -718,9 +687,8 @@ int check_nstglobalcomm(FILE *fplog,t_commrec *cr, if (ir->comm_mode != ecmNO && ir->nstcomm < nstglobalcomm) { - sprintf(buf,"WARNING: Changing nstcomm from %d to %d\n", - ir->nstcomm,nstglobalcomm); - md_print_warning(cr,fplog,buf); + md_print_warn(cr,fplog,"WARNING: Changing nstcomm from %d to %d\n", + ir->nstcomm,nstglobalcomm); ir->nstcomm = nstglobalcomm; } @@ -734,13 +702,13 @@ void check_ir_old_tpx_versions(t_commrec *cr,FILE *fplog, if (IR_TWINRANGE(*ir) && ir->nstlist > 1 && ir->nstcalcenergy % ir->nstlist != 0) { - md_print_warning(cr,fplog,"Old tpr file with twin-range settings: modifying energy calculation and/or T/P-coupling frequencies"); + md_print_warn(cr,fplog,"Old tpr file with twin-range settings: modifying energy calculation and/or T/P-coupling frequencies\n"); if (gmx_mtop_ftype_count(mtop,F_CONSTR) + gmx_mtop_ftype_count(mtop,F_CONSTRNC) > 0 && ir->eConstrAlg == econtSHAKE) { - md_print_warning(cr,fplog,"With twin-range cut-off's and SHAKE the virial and pressure are incorrect"); + md_print_warn(cr,fplog,"With twin-range cut-off's and SHAKE the virial and pressure are incorrect\n"); if (ir->epc != epcNO) { gmx_fatal(FARGS,"Can not 
do pressure coupling with twin-range cut-off's and SHAKE"); @@ -805,15 +773,3 @@ void rerun_parallel_comm(t_commrec *cr,t_trxframe *fr, } } } - -void md_print_warning(const t_commrec *cr,FILE *fplog,const char *buf) -{ - if (MASTER(cr)) - { - fprintf(stderr,"\n%s\n",buf); - } - if (fplog) - { - fprintf(fplog,"\n%s\n",buf); - } -} diff --git a/src/mdlib/mdatom.c b/src/mdlib/mdatom.c index a3ad311e22..826df3160a 100644 --- a/src/mdlib/mdatom.c +++ b/src/mdlib/mdatom.c @@ -43,6 +43,7 @@ #include "main.h" #include "qmmm.h" #include "mtop_util.h" +#include "gmx_omp_nthreads.h" #define ALMOST_ZERO 1e-30 @@ -97,10 +98,8 @@ void atoms2md(gmx_mtop_t *mtop,t_inputrec *ir, int start,int homenr, t_mdatoms *md) { - t_atoms *atoms_mol; - int i,g,ag,as,ae,molb; - real mA,mB,fac; - t_atom *atom; + gmx_mtop_atomlookup_t alook; + int i; t_grpopts *opts; gmx_groups_t *groups; gmx_molblock_t *molblock; @@ -177,22 +176,20 @@ void atoms2md(gmx_mtop_t *mtop,t_inputrec *ir, md->pureex = FALSE; } - for(i=0; (inr); i++) { + alook = gmx_mtop_atomlookup_init(mtop); + +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntDefault)) schedule(static) + for(i=0; inr; i++) { + int g,ag,molb; + real mA,mB,fac; + t_atom *atom; + if (index == NULL) { ag = i; - gmx_mtop_atomnr_to_atom(mtop,ag,&atom); } else { ag = index[i]; - molb = -1; - ae = 0; - do { - molb++; - as = ae; - ae = as + molblock[molb].nmol*molblock[molb].natoms_mol; - } while (ag >= ae); - atoms_mol = &mtop->moltype[molblock[molb].type].atoms; - atom = &atoms_mol->atom[(ag - as) % atoms_mol->nr]; } + gmx_mtop_atomnr_to_atom(alook,ag,&atom); if (md->cFREEZE) { md->cFREEZE[i] = ggrpnr(groups,egcFREEZE,ag); @@ -303,6 +300,8 @@ void atoms2md(gmx_mtop_t *mtop,t_inputrec *ir, } } + gmx_mtop_atomlookup_destroy(alook); + md->start = start; md->homenr = homenr; md->lambda = 0; diff --git a/src/mdlib/mdebin.c b/src/mdlib/mdebin.c index 7c13b9e147..8255c6ad8b 100644 --- a/src/mdlib/mdebin.c +++ b/src/mdlib/mdebin.c @@ -196,7 +196,7 @@ t_mdebin *init_mdebin(ener_file_t fp_ene, else if (i == F_BHAM_LR) md->bEner[i] = (bBHAM && ir->rvdw > ir->rlist); else if (i == F_RF_EXCL) - md->bEner[i] = (EEL_RF(ir->coulombtype) && ir->coulombtype != eelRF_NEC); + md->bEner[i] = (EEL_RF(ir->coulombtype) && ir->coulombtype != eelRF_NEC && ir->cutoff_scheme == ecutsGROUP); else if (i == F_COUL_RECIP) md->bEner[i] = EEL_FULL(ir->coulombtype); else if (i == F_LJ14) @@ -269,8 +269,6 @@ t_mdebin *init_mdebin(ener_file_t fp_ene, { if (md->bEner[i]) { - /* FIXME: The constness should not be cast away */ - /*ener_nm[f_nre]=(char *)interaction_function[i].longname;*/ ener_nm[md->f_nre]=interaction_function[i].longname; md->f_nre++; } @@ -285,6 +283,7 @@ t_mdebin *init_mdebin(ener_file_t fp_ene, md->bNHC_trotter = IR_NVT_TROTTER(ir); md->bPrintNHChains = ir-> bPrintNHChains; md->bMTTK = (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir)); + md->bMu = NEED_MUTOT(*ir); md->ebin = mk_ebin(); /* Pass NULL for unit to let get_ebin_space determine the units @@ -330,7 +329,9 @@ t_mdebin *init_mdebin(ener_file_t fp_ene, boxvel_nm,unit_vel); } if (md->bMu) + { md->imu = get_ebin_space(md->ebin,asize(mu_nm),mu_nm,unit_dipole_D); + } if (ir->cos_accel != 0) { md->ivcos = get_ebin_space(md->ebin,asize(vcos_nm),vcos_nm,unit_vel); @@ -880,8 +881,10 @@ void upd_mdebin(t_mdebin *md, tmp6[5] = state->boxv[ZZ][YY]; add_ebin(md->ebin,md->ipc,md->bTricl ? 
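/* Aside: the fixed-buffer md_print_warning() removed above is replaced
 * throughout by a printf-style md_print_warn() (added in md_logging.c,
 * outside this excerpt). A minimal sketch of such a varargs wrapper;
 * the real one also writes to stderr on the master rank: */
#include <stdarg.h>
#include <stdio.h>

static void print_warn_sk(FILE *fplog, const char *fmt, ...)
{
    va_list ap;

    if (fplog != NULL)
    {
        fprintf(fplog, "\n");
        va_start(ap, fmt);
        vfprintf(fplog, fmt, ap);
        va_end(ap);
        fprintf(fplog, "\n");
    }
}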
6 : 3,tmp6,bSum); } - if(md->bMu) + if (md->bMu) + { add_ebin(md->ebin,md->imu,3,mu_tot,bSum); + } if (ekind && ekind->cosacc.cos_accel != 0) { vol = box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]; @@ -1305,9 +1308,12 @@ void print_ebin(ener_file_t fp_ene,gmx_bool bEne,gmx_bool bDR,gmx_bool bOR, pr_ebin(log,md->ebin,md->ipres,9,3,mode,FALSE); fprintf(log,"\n"); } - fprintf(log," Total Dipole (%s)\n",unit_dipole_D); - pr_ebin(log,md->ebin,md->imu,3,3,mode,FALSE); - fprintf(log,"\n"); + if (md->bMu) + { + fprintf(log," Total Dipole (%s)\n",unit_dipole_D); + pr_ebin(log,md->ebin,md->imu,3,3,mode,FALSE); + fprintf(log,"\n"); + } if (md->nE > 1) { diff --git a/src/mdlib/minimize.c b/src/mdlib/minimize.c index 48fcd2523d..d14f434eed 100644 --- a/src/mdlib/minimize.c +++ b/src/mdlib/minimize.c @@ -64,6 +64,7 @@ #include "vsite.h" #include "force.h" #include "mdrun.h" +#include "md_support.h" #include "domdec.h" #include "partdec.h" #include "trnio.h" @@ -75,6 +76,9 @@ #include "mtop_util.h" #include "gmxfio.h" #include "pme.h" +#include "bondf.h" +#include "gmx_omp_nthreads.h" + typedef struct { t_state s; @@ -349,7 +353,11 @@ void init_em(FILE *fplog,const char *title, } *f_global = *f; - if (ir->ePBC != epbcNONE && !ir->bPeriodicMols) + forcerec_set_excl_load(fr,*top,cr); + + init_bonded_thread_force_reduction(fr,&(*top)->idef); + + if (ir->ePBC != epbcNONE && !fr->bMolPBC) { *graph = mk_graph(fplog,&((*top)->idef),0,top_global->natoms,FALSE,FALSE); } @@ -397,7 +405,7 @@ void init_em(FILE *fplog,const char *title, dvdlambda=0; constrain(PAR(cr) ? NULL : fplog,TRUE,TRUE,constr,&(*top)->idef, ir,NULL,cr,-1,0,mdatoms, - ems->s.x,ems->s.x,NULL,ems->s.box, + ems->s.x,ems->s.x,NULL,fr->bMolPBC,ems->s.box, ems->s.lambda[efptFEP],&dvdlambda, NULL,NULL,nrnb,econqCoord,FALSE,0,0); } @@ -429,7 +437,7 @@ static void finish_em(FILE *fplog,t_commrec *cr,gmx_mdoutf_t *outf, { if (!(cr->duty & DUTY_PME)) { /* Tell the PME only node to finish */ - gmx_pme_finish(cr); + gmx_pme_send_finish(cr); } done_mdoutf(outf); @@ -495,87 +503,123 @@ static void write_em_traj(FILE *fplog,t_commrec *cr, } static void do_em_step(t_commrec *cr,t_inputrec *ir,t_mdatoms *md, - em_state_t *ems1,real a,rvec *f,em_state_t *ems2, - gmx_constr_t constr,gmx_localtop_t *top, - t_nrnb *nrnb,gmx_wallcycle_t wcycle, - gmx_large_int_t count) + gmx_bool bMolPBC, + em_state_t *ems1,real a,rvec *f,em_state_t *ems2, + gmx_constr_t constr,gmx_localtop_t *top, + t_nrnb *nrnb,gmx_wallcycle_t wcycle, + gmx_large_int_t count) { - t_state *s1,*s2; - int start,end,gf,i,m; - rvec *x1,*x2; - real dvdlambda; - - s1 = &ems1->s; - s2 = &ems2->s; + t_state *s1,*s2; + int i; + int start,end; + rvec *x1,*x2; + real dvdlambda; - if (DOMAINDECOMP(cr) && s1->ddp_count != cr->dd->ddp_count) - gmx_incons("state mismatch in do_em_step"); + s1 = &ems1->s; + s2 = &ems2->s; - s2->flags = s1->flags; + if (DOMAINDECOMP(cr) && s1->ddp_count != cr->dd->ddp_count) + { + gmx_incons("state mismatch in do_em_step"); + } - if (s2->nalloc != s1->nalloc) { - s2->nalloc = s1->nalloc; - srenew(s2->x,s1->nalloc); - srenew(ems2->f, s1->nalloc); - if (s2->flags & (1<cg_p, s1->nalloc); - } + s2->flags = s1->flags; - s2->natoms = s1->natoms; - /* Copy free energy state -> is this necessary? 
*/ - for (i=0;ilambda[i] = s1->lambda[i]; - } - copy_mat(s1->box,s2->box); + if (s2->nalloc != s1->nalloc) + { + s2->nalloc = s1->nalloc; + srenew(s2->x,s1->nalloc); + srenew(ems2->f, s1->nalloc); + if (s2->flags & (1<cg_p, s1->nalloc); + } + } + + s2->natoms = s1->natoms; + copy_mat(s1->box,s2->box); + /* Copy free energy state */ + for (i=0;ilambda[i] = s1->lambda[i]; + } + copy_mat(s1->box,s2->box); - start = md->start; - end = md->start + md->homenr; + start = md->start; + end = md->start + md->homenr; - x1 = s1->x; - x2 = s2->x; - gf = 0; - for(i=start; icFREEZE) - gf = md->cFREEZE[i]; - for(m=0; mopts.nFreeze[gf][m]) - x2[i][m] = x1[i][m]; - else - x2[i][m] = x1[i][m] + a*f[i][m]; - } - } + x1 = s1->x; + x2 = s2->x; - if (s2->flags & (1<cg_p; - x2 = s2->cg_p; - for(i=start; iddp_count = s1->ddp_count; - if (s2->cg_gl_nalloc < s1->cg_gl_nalloc) { - s2->cg_gl_nalloc = s1->cg_gl_nalloc; - srenew(s2->cg_gl,s2->cg_gl_nalloc); - } - s2->ncg_gl = s1->ncg_gl; - for(i=0; incg_gl; i++) - s2->cg_gl[i] = s1->cg_gl[i]; - s2->ddp_count_cg_gl = s1->ddp_count_cg_gl; - } + gf = 0; +#pragma omp for schedule(static) nowait + for(i=start; icFREEZE) + { + gf = md->cFREEZE[i]; + } + for(m=0; mopts.nFreeze[gf][m]) + { + x2[i][m] = x1[i][m]; + } + else + { + x2[i][m] = x1[i][m] + a*f[i][m]; + } + } + } - if (constr) { - wallcycle_start(wcycle,ewcCONSTR); - dvdlambda = 0; - constrain(NULL,TRUE,TRUE,constr,&top->idef, - ir,NULL,cr,count,0,md, - s1->x,s2->x,NULL,s2->box,s2->lambda[efptBONDED], - &dvdlambda,NULL,NULL,nrnb,econqCoord,FALSE,0,0); - wallcycle_stop(wcycle,ewcCONSTR); - } + if (s2->flags & (1<cg_p; + x2 = s2->cg_p; +#pragma omp for schedule(static) nowait + for(i=start; iddp_count = s1->ddp_count; + if (s2->cg_gl_nalloc < s1->cg_gl_nalloc) + { +#pragma omp barrier + s2->cg_gl_nalloc = s1->cg_gl_nalloc; + srenew(s2->cg_gl,s2->cg_gl_nalloc); +#pragma omp barrier + } + s2->ncg_gl = s1->ncg_gl; +#pragma omp for schedule(static) nowait + for(i=0; incg_gl; i++) + { + s2->cg_gl[i] = s1->cg_gl[i]; + } + s2->ddp_count_cg_gl = s1->ddp_count_cg_gl; + } + } + + if (constr) + { + wallcycle_start(wcycle,ewcCONSTR); + dvdlambda = 0; + constrain(NULL,TRUE,TRUE,constr,&top->idef, + ir,NULL,cr,count,0,md, + s1->x,s2->x,NULL,bMolPBC,s2->box, + s2->lambda[efptBONDED],&dvdlambda, + NULL,NULL,nrnb,econqCoord,FALSE,0,0); + wallcycle_stop(wcycle,ewcCONSTR); + } } static void em_dd_partition_system(FILE *fplog,int step,t_commrec *cr, @@ -658,7 +702,8 @@ static void evaluate_energy(FILE *fplog,gmx_bool bVerbose,t_commrec *cr, ems->s.box,ems->s.x,&ems->s.hist, ems->f,force_vir,mdatoms,enerd,fcd, ems->s.lambda,graph,fr,vsite,mu_tot,t,NULL,NULL,TRUE, - GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES | GMX_FORCE_VIRIAL | + GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES | + GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY | (bNS ? 
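/* Aside: the synchronization pattern in the rewritten do_em_step above.
 * Independent loops inside one parallel region carry "nowait"; only
 * growing a buffer that all threads touch must be serialized, which the
 * patch brackets with barriers. The same constraint expressed with
 * "omp single" (implicit barrier on exit); every thread of the region
 * must call this helper: */
#include <stdlib.h>

static double *sk_shared_buf; /* grown by one thread, used by all */

static void sk_grow_shared(int *nalloc, int needed)
{
    /* All threads must stop using the old buffer before it moves... */
#pragma omp barrier
#pragma omp single
    {
        if (needed > *nalloc)
        {
            *nalloc       = needed;
            sk_shared_buf = realloc(sk_shared_buf,
                                    *nalloc*sizeof(*sk_shared_buf));
        }
    }   /* ...and may touch the new one only after the single's barrier */
}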
GMX_FORCE_NS | GMX_FORCE_DOLR : 0)); /* Clear the unused shake virial and pressure */ @@ -697,7 +742,8 @@ static void evaluate_energy(FILE *fplog,gmx_bool bVerbose,t_commrec *cr, dvdlambda = 0; constrain(NULL,FALSE,FALSE,constr,&top->idef, inputrec,NULL,cr,count,0,mdatoms, - ems->s.x,ems->f,ems->f,ems->s.box,ems->s.lambda[efptBONDED],&dvdlambda, + ems->s.x,ems->f,ems->f,fr->bMolPBC,ems->s.box, + ems->s.lambda[efptBONDED],&dvdlambda, NULL,&shake_vir,nrnb,econqForceDispl,FALSE,0,0); if (fr->bSepDVDL && fplog) fprintf(fplog,sepdvdlformat,"Constraints",t,dvdlambda); @@ -1051,8 +1097,8 @@ double do_cg(FILE *fplog,t_commrec *cr, } /* Take a trial step (new coords in s_c) */ - do_em_step(cr,inputrec,mdatoms,s_min,c,s_min->s.cg_p,s_c, - constr,top,nrnb,wcycle,-1); + do_em_step(cr,inputrec,mdatoms,fr->bMolPBC,s_min,c,s_min->s.cg_p,s_c, + constr,top,nrnb,wcycle,-1); neval++; /* Calculate energy for the trial step */ @@ -1138,8 +1184,8 @@ double do_cg(FILE *fplog,t_commrec *cr, } /* Take a trial step to this new point - new coords in s_b */ - do_em_step(cr,inputrec,mdatoms,s_min,b,s_min->s.cg_p,s_b, - constr,top,nrnb,wcycle,-1); + do_em_step(cr,inputrec,mdatoms,fr->bMolPBC,s_min,b,s_min->s.cg_p,s_b, + constr,top,nrnb,wcycle,-1); neval++; /* Calculate energy for the trial step */ @@ -2080,8 +2126,9 @@ double do_steep(FILE *fplog,t_commrec *cr, /* set new coordinates, except for first step */ if (count > 0) { - do_em_step(cr,inputrec,mdatoms,s_min,stepsize,s_min->f,s_try, - constr,top,nrnb,wcycle,count); + do_em_step(cr,inputrec,mdatoms,fr->bMolPBC, + s_min,stepsize,s_min->f,s_try, + constr,top,nrnb,wcycle,count); } evaluate_energy(fplog,bVerbose,cr, diff --git a/src/mdlib/nbnxn_consts.h b/src/mdlib/nbnxn_consts.h new file mode 100644 index 0000000000..c217ff1e30 --- /dev/null +++ b/src/mdlib/nbnxn_consts.h @@ -0,0 +1,86 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS development team, + * check out http://www.gromacs.org for more information. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. 
+ * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon + */ + +#ifndef _nbnxn_consts_h +#define _nbnxn_consts_h + +#ifdef __cplusplus +extern "C" { +#endif + + +/* The number of pair-search sub-cells per super-cell, used for GPU */ +#define GPU_NSUBCELL_Z 2 +#define GPU_NSUBCELL_Y 2 +#define GPU_NSUBCELL_X 2 +#define GPU_NSUBCELL (GPU_NSUBCELL_Z*GPU_NSUBCELL_Y*GPU_NSUBCELL_X) +/* In the non-bonded GPU kernel we operate on cluster-pairs, not cells. + * The number of clusters in a super-cluster matches the number of sub-cells + * in a pair-search cell, so we introduce a new name for the same value. + */ +#define NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER GPU_NSUBCELL + +/* With CPU kernels the i-cluster size is always 4 atoms. + * With x86 SIMD the j-cluster size can be 2, 4 or 8, otherwise 4. + */ +#define NBNXN_CPU_CLUSTER_I_SIZE 4 + +#define NBNXN_CPU_CLUSTER_I_SIZE_2LOG 2 + +/* With GPU kernels the cluster size is 8 atoms */ +#define NBNXN_GPU_CLUSTER_SIZE 8 + +/* With GPU kernels we group cluster pairs in sets of 4 to optimize memory usage */ +#define NBNXN_GPU_JGROUP_SIZE 4 + +/* To avoid NaN when excluded atoms are at zero distance, we add a small + * number to r^2. NBNXN_AVOID_SING_R2_INC^-3 should fit in real. + */ +#ifndef GMX_DOUBLE +#define NBNXN_AVOID_SING_R2_INC 1.0e-12f +#else +/* The double prec. x86 SIMD kernels use a single prec. invsqrt, so > 1e-38 */ +#define NBNXN_AVOID_SING_R2_INC 1.0e-36 +#endif + +/* Coulomb force table size chosen such that it fits alongside the non-bonded + parameters in the texture cache. */ +#define GPU_EWALD_COULOMB_FORCE_TABLE_SIZE 1536 + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/mdlib/nbnxn_cuda/CMakeLists.txt b/src/mdlib/nbnxn_cuda/CMakeLists.txt new file mode 100644 index 0000000000..b42694bf9f --- /dev/null +++ b/src/mdlib/nbnxn_cuda/CMakeLists.txt @@ -0,0 +1,8 @@ +if(GMX_GPU) + file(GLOB CUDA_NB_SOURCES *.cu) + CUDA_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + CUDA_ADD_LIBRARY(nbnxn_cuda STATIC ${CUDA_NB_SOURCES} + OPTIONS + RELWITHDEBINFO -g + DEBUG -g -D_DEBUG_=1) +endif() diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu new file mode 100644 index 0000000000..86f81aa3e7 --- /dev/null +++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu @@ -0,0 +1,671 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS development team, + * check out http://www.gromacs.org for more information. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org.
+ * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon + */ + +#include <stdlib.h> +#include <assert.h> + +#if defined(_MSVC) +#include <limits> +#endif + +#include "types/simple.h" +#include "types/nbnxn_pairlist.h" +#include "types/nb_verlet.h" +#include "types/ishift.h" +#include "types/force_flags.h" +#include "../nbnxn_consts.h" + +#ifdef TMPI_ATOMICS +#include "thread_mpi/atomic.h" +#endif + +#include "nbnxn_cuda_types.h" +#include "../../gmxlib/cuda_tools/cudautils.cuh" +#include "nbnxn_cuda.h" +#include "nbnxn_cuda_data_mgmt.h" + + +/*! Texture reference for nonbonded parameters; bound to cu_nbparam_t.nbfp*/ +texture<float, 1, cudaReadModeElementType> tex_nbfp; + +/*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */ +texture<float, 1, cudaReadModeElementType> tex_coulomb_tab; + +/* Convenience defines */ +#define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER) +#define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE) + +/***** The kernels come here *****/ +#include "nbnxn_cuda_kernel_utils.cuh" + +/* Generate all combinations of kernels through multiple inclusion: + F, F + E, F + prune, F + E + prune. */ +/** Force only **/ +#include "nbnxn_cuda_kernels.cuh" +/** Force & energy **/ +#define CALC_ENERGIES +#include "nbnxn_cuda_kernels.cuh" +#undef CALC_ENERGIES + +/*** Pair-list pruning kernels ***/ +/** Force only **/ +#define PRUNE_NBL +#include "nbnxn_cuda_kernels.cuh" +/** Force & energy **/ +#define CALC_ENERGIES +#include "nbnxn_cuda_kernels.cuh" +#undef CALC_ENERGIES +#undef PRUNE_NBL + +/*! Nonbonded kernel function pointer type */ +typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t, + const cu_nbparam_t, + const cu_plist_t, + bool); + +/*********************************/ + +/* XXX always/never run the energy/pruning kernels -- only for benchmarking purposes */ +static bool always_ener = (getenv("GMX_GPU_ALWAYS_ENER") != NULL); +static bool never_ener = (getenv("GMX_GPU_NEVER_ENER") != NULL); +static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL); + + +/* Bit-pattern used for polling-based GPU synchronization. It is used as a float + * and corresponds to having the exponent set to the maximum (127 -- single + * precision) and the mantissa to 0. + */ +static unsigned int poll_wait_pattern = (0x7FU << 23); + +/*! Returns the number of blocks to be used for the nonbonded GPU kernel. */ +static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo) +{ + int max_grid_x_size; + + assert(dinfo); + + max_grid_x_size = dinfo->prop.maxGridSize[0]; + + /* do we exceed the grid x dimension limit? */ + if (nwork_units > max_grid_x_size) + { + gmx_fatal(FARGS, "Watch out, system too large to simulate!\n" + "The number of nonbonded work units (=number of super-clusters) exceeds the " + "maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size); + } + + return nwork_units; +} + + +/* Constant arrays listing all kernel function pointers and enabling selection + of a kernel in an elegant manner.
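The multiple-inclusion block above is the standard preprocessor trick for stamping out several kernel flavors from one body; a minimal standalone sketch of the same pattern, with a toy kernel and hypothetical names (not the actual nbnxn code):

/* toy_kernel.cuh -- included once per flavor */
#ifdef CALC_ENERGIES
__global__ void k_toy_ener(float *f, float *e)
#else
__global__ void k_toy(float *f, float *e)
#endif
{
    float fval = 1.0f;           /* stand-in for the real force computation */
    f[threadIdx.x] += fval;
#ifdef CALC_ENERGIES
    atomicAdd(e, 0.5f*fval);     /* compiled only into the _ener flavor */
#endif
}

Each #include of nbnxn_cuda_kernels.cuh above re-expands the kernel body with the current CALC_ENERGIES/PRUNE_NBL combination, so four distinct __global__ symbols per electrostatics type come out of a single source file.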
*/ + +static const int nEnergyKernelTypes = 2; /* 0 - no energy, 1 - energy */ +static const int nPruneKernelTypes = 2; /* 0 - no prune, 1 - prune */ + +/* Default kernels */ +static const nbnxn_cu_kfunc_ptr_t +nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] = +{ + { { k_nbnxn_ewald, k_nbnxn_ewald_prune }, + { k_nbnxn_ewald_ener, k_nbnxn_ewald_ener_prune } }, + { { k_nbnxn_ewald_twin, k_nbnxn_ewald_twin_prune }, + { k_nbnxn_ewald_twin_ener, k_nbnxn_ewald_twin_ener_prune } }, + { { k_nbnxn_rf, k_nbnxn_rf_prune }, + { k_nbnxn_rf_ener, k_nbnxn_rf_ener_prune } }, + { { k_nbnxn_cutoff, k_nbnxn_cutoff_prune }, + { k_nbnxn_cutoff_ener, k_nbnxn_cutoff_ener_prune } }, +}; + +/* Legacy kernels */ +static const nbnxn_cu_kfunc_ptr_t +nb_legacy_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] = +{ + { { k_nbnxn_ewald_legacy, k_nbnxn_ewald_prune_legacy }, + { k_nbnxn_ewald_ener_legacy, k_nbnxn_ewald_ener_prune_legacy } }, + { { k_nbnxn_ewald_twin_legacy, k_nbnxn_ewald_twin_prune_legacy }, + { k_nbnxn_ewald_twin_ener_legacy, k_nbnxn_ewald_twin_ener_prune_legacy } }, + { { k_nbnxn_rf_legacy, k_nbnxn_rf_prune_legacy }, + { k_nbnxn_rf_ener_legacy, k_nbnxn_rf_ener_prune_legacy } }, + { { k_nbnxn_cutoff_legacy, k_nbnxn_cutoff_prune_legacy }, + { k_nbnxn_cutoff_ener_legacy, k_nbnxn_cutoff_ener_prune_legacy } }, +}; + +/*! Return a pointer to the kernel version to be executed at the current step. */ +static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int kver, int eeltype, + bool bDoEne, bool bDoPrune) +{ + assert(kver < eNbnxnCuKNR); + assert(eeltype < eelCuNR); + + if (NBNXN_KVER_LEGACY(kver)) + { + return nb_legacy_kfunc_ptr[eeltype][bDoEne][bDoPrune]; + } + else + { + return nb_default_kfunc_ptr[eeltype][bDoEne][bDoPrune]; + } +} + +/*! Calculates the amount of shared memory required for the kernel version in use. */ +static inline int calc_shmem_required(int kver) +{ + int shmem; + + /* size of shmem (force-buffers/xq/atom type preloading) */ + if (NBNXN_KVER_LEGACY(kver)) + { + /* i-atom x+q in shared memory */ + shmem = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4); + /* force reduction buffers in shared memory */ + shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float); + } + else + { + /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */ + /* i-atom x+q in shared memory */ + shmem = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4); +#ifdef IATYPE_SHMEM + /* i-atom types in shared memory */ + shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int); +#endif +#if __CUDA_ARCH__ < 300 + /* force reduction buffers in shared memory */ + shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float); +#endif + } + + return shmem; +} + +/*! As we execute nonbonded workload in separate streams, before launching + the kernel we need to make sure that the following operations have completed: + - atomdata allocation and related H2D transfers (every nstlist step); + - pair list H2D transfer (every nstlist step); + - shift vector H2D transfer (every nstlist step); + - force (+shift force and energy) output clearing (every step). + + These operations are issued in the local stream at the beginning of the step + and therefore always complete before the local kernel launch. The non-local + kernel is launched after the local on the same device/context, so this is + inherently scheduled after the operations in the local stream (including the + above "misc_ops").
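The record/wait mechanism this refers to is plain CUDA event semantics; a minimal sketch with hypothetical stream and event names (s_local, s_nonlocal, misc_done are illustrative only):

cudaEvent_t misc_done;   /* created once with cudaEventDisableTiming */
/* ... enqueue atomdata/pair-list/shift-vector H2D and output clearing in s_local ... */
cudaEventRecord(misc_done, s_local);            /* mark the completion point        */
cudaStreamWaitEvent(s_nonlocal, misc_done, 0);  /* work enqueued in s_nonlocal after
                                                   this call waits for the event    */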
+ However, for the sake of having a future-proof implementation, we use the + misc_ops_done event to record the point in time when the above operations + are finished and synchronize with this event in the non-local stream. +*/ +void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb, + const nbnxn_atomdata_t *nbatom, + int flags, + int iloc) +{ + cudaError_t stat; + int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ + /* CUDA kernel launch-related stuff */ + int shmem, nblock; + dim3 dim_block, dim_grid; + nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */ + + cu_atomdata_t *adat = cu_nb->atdat; + cu_nbparam_t *nbp = cu_nb->nbparam; + cu_plist_t *plist = cu_nb->plist[iloc]; + cu_timers_t *t = cu_nb->timers; + cudaStream_t stream = cu_nb->stream[iloc]; + + bool bCalcEner = flags & GMX_FORCE_VIRIAL; + bool bCalcFshift = flags & GMX_FORCE_VIRIAL; + bool bDoTime = cu_nb->bDoTime; + + /* turn energy calculation always on/off (for debugging/testing only) */ + bCalcEner = (bCalcEner || always_ener) && !never_ener; + + /* don't launch the kernel if there is no work to do */ + if (plist->nsci == 0) + { + return; + } + + /* calculate the atom data index range based on locality */ + if (LOCAL_I(iloc)) + { + adat_begin = 0; + adat_len = adat->natoms_local; + } + else + { + adat_begin = adat->natoms_local; + adat_len = adat->natoms - adat->natoms_local; + } + + /* When we get here all misc operations issued in the local stream are done, + so we record that in the local stream and wait for it in the nonlocal one. */ + if (cu_nb->bUseTwoStreams) + { + if (iloc == eintLocal) + { + stat = cudaEventRecord(cu_nb->misc_ops_done, stream); + CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed"); + } + else + { + stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0); + CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed"); + } + } + + /* beginning of timed HtoD section */ + if (bDoTime) + { + stat = cudaEventRecord(t->start_nb_h2d[iloc], stream); + CU_RET_ERR(stat, "cudaEventRecord failed"); + } + + /* HtoD x, q */ + cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4, + adat_len * sizeof(*adat->xq), stream); + + if (bDoTime) + { + stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream); + CU_RET_ERR(stat, "cudaEventRecord failed"); + } + + /* beginning of timed nonbonded calculation section */ + if (bDoTime) + { + stat = cudaEventRecord(t->start_nb_k[iloc], stream); + CU_RET_ERR(stat, "cudaEventRecord failed"); + } + + /* get the pointer to the kernel flavor we need to use */ + nb_kernel = select_nbnxn_kernel(cu_nb->kernel_ver, nbp->eeltype, bCalcEner, + plist->bDoPrune || always_prune); + + /* kernel launch config */ + nblock = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info); + dim_block = dim3(CL_SIZE, CL_SIZE, 1); + dim_grid = dim3(nblock, 1, 1); + shmem = calc_shmem_required(cu_nb->kernel_ver); + + if (debug) + { + fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t" + "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n", + dim_block.x, dim_block.y, dim_block.z, + dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL, + NCL_PER_SUPERCL, plist->na_c); + } + + nb_kernel<<<dim_grid, dim_block, shmem, stream>>>(*adat, *nbp, *plist, bCalcFshift); + CU_LAUNCH_ERR("k_calc_nb"); + + if (bDoTime) + { + stat = cudaEventRecord(t->stop_nb_k[iloc], stream); + CU_RET_ERR(stat, "cudaEventRecord failed"); + } +} + +void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb, + const nbnxn_atomdata_t *nbatom, + int flags, + int aloc) +{ +
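For a feel of the launch configured in nbnxn_cuda_launch_kernel() above, plugging in the nbnxn_consts.h values (CL_SIZE = 8, NCL_PER_SUPERCL = 8) gives the following; illustrative arithmetic only:

/* dim_block = 8 x 8    -> 64 threads (two warps) per block                */
/* dim_grid  = nsci x 1 -> one block per i super-cluster in the pair list */
/* legacy-kernel shared memory per block:                                 */
/*   xq preload:   8*8*sizeof(float4)  = 1024 B                           */
/*   f reduction:  8*8*3*sizeof(float) =  768 B  -> 1792 B in total       */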
cudaError_t stat; + int adat_begin, adat_len, adat_end; /* local/nonlocal offset and length used for xq and f */ + int iloc = -1; + + /* determine interaction locality from atom locality */ + if (LOCAL_A(aloc)) + { + iloc = eintLocal; + } + else if (NONLOCAL_A(aloc)) + { + iloc = eintNonlocal; + } + else + { + char stmp[STRLEN]; + sprintf(stmp, "Invalid atom locality passed (%d); valid here is only " + "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal); + gmx_incons(stmp); + } + + cu_atomdata_t *adat = cu_nb->atdat; + cu_timers_t *t = cu_nb->timers; + bool bDoTime = cu_nb->bDoTime; + cudaStream_t stream = cu_nb->stream[iloc]; + + bool bCalcEner = flags & GMX_FORCE_VIRIAL; + bool bCalcFshift = flags & GMX_FORCE_VIRIAL; + + /* don't launch copy-back if there was no work to do */ + if (cu_nb->plist[iloc]->nsci == 0) + { + return; + } + + /* calculate the atom data index range based on locality */ + if (LOCAL_A(aloc)) + { + adat_begin = 0; + adat_len = adat->natoms_local; + adat_end = cu_nb->atdat->natoms_local; + } + else + { + adat_begin = adat->natoms_local; + adat_len = adat->natoms - adat->natoms_local; + adat_end = cu_nb->atdat->natoms; + } + + /* beginning of timed D2H section */ + if (bDoTime) + { + stat = cudaEventRecord(t->start_nb_d2h[iloc], stream); + CU_RET_ERR(stat, "cudaEventRecord failed"); + } + + if (!cu_nb->bUseStreamSync) + { + /* For safety reasons set a few (5%) forces to NaN. This way even if the + polling "hack" fails with some future NVIDIA driver we'll get a crash. */ + for (int i = adat_begin; i < 3*adat_end + 2; i += adat_len/20) + { +#ifdef NAN + nbatom->out[0].f[i] = NAN; +#else +# ifdef _MSVC + if (numeric_limits<float>::has_quiet_NaN) + { + nbatom->out[0].f[i] = numeric_limits<float>::quiet_NaN(); + } + else +# endif + { + nbatom->out[0].f[i] = GMX_REAL_MAX; + } +#endif + } + + /* Set the last four bytes of the force array to a bit pattern + which can't be the result of the force calculation: + max exponent (127) and zero mantissa. */ + *(unsigned int*)&nbatom->out[0].f[adat_end*3 - 1] = poll_wait_pattern; + } + + /* With DD the local D2H transfer can only start after the non-local + has been launched. */ + if (iloc == eintLocal && cu_nb->bUseTwoStreams) + { + stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0); + CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed"); + } + + /* DtoH f */ + cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin, + (adat_len)*sizeof(*adat->f), stream); + + /* After the non-local D2H is launched the nonlocal_done event can be + recorded which signals that the local D2H can proceed. This event is not + placed after the non-local kernel because we need the non-local data + back first. */ + if (iloc == eintNonlocal) + { + stat = cudaEventRecord(cu_nb->nonlocal_done, stream); + CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed"); + } + + /* only transfer energies in the local stream */ + if (LOCAL_I(iloc)) + { + /* DtoH fshift */ + if (bCalcFshift) + { + cu_copy_D2H_async(cu_nb->nbst.fshift, adat->fshift, + SHIFTS * sizeof(*cu_nb->nbst.fshift), stream); + } + + /* DtoH energies */ + if (bCalcEner) + { + cu_copy_D2H_async(cu_nb->nbst.e_lj, adat->e_lj, + sizeof(*cu_nb->nbst.e_lj), stream); + cu_copy_D2H_async(cu_nb->nbst.e_el, adat->e_el, + sizeof(*cu_nb->nbst.e_el), stream); + } + } + + if (bDoTime) + { + stat = cudaEventRecord(t->stop_nb_d2h[iloc], stream); + CU_RET_ERR(stat, "cudaEventRecord failed"); + } +} + +/* Atomic compare-exchange operation on unsigned values.
It is used in + * polling wait for the GPU. + */ +static inline bool atomic_cas(volatile unsigned int *ptr, + unsigned int oldval, + unsigned int newval) +{ + assert(ptr); + +#ifdef TMPI_ATOMICS + return tMPI_Atomic_cas((tMPI_Atomic_t *)ptr, oldval, newval); +#else + gmx_incons("Atomic operations not available, atomic_cas() should not have been called!"); + return true; +#endif +} + +void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb, + const nbnxn_atomdata_t *nbatom, + int flags, int aloc, + float *e_lj, float *e_el, rvec *fshift) +{ + cudaError_t stat; + int i, adat_end, iloc = -1; + volatile unsigned int *poll_word; + + /* determine interaction locality from atom locality */ + if (LOCAL_A(aloc)) + { + iloc = eintLocal; + } + else if (NONLOCAL_A(aloc)) + { + iloc = eintNonlocal; + } + else + { + char stmp[STRLEN]; + sprintf(stmp, "Invalid atom locality passed (%d); valid here is only " + "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal); + gmx_incons(stmp); + } + + cu_plist_t *plist = cu_nb->plist[iloc]; + cu_timers_t *timers = cu_nb->timers; + wallclock_gpu_t *timings = cu_nb->timings; + nb_staging nbst = cu_nb->nbst; + + bool bCalcEner = flags & GMX_FORCE_VIRIAL; + bool bCalcFshift = flags & GMX_FORCE_VIRIAL; + + /* turn energy calculation always on/off (for debugging/testing only) */ + bCalcEner = (bCalcEner || always_ener) && !never_ener; + + /* don't launch wait/update timers & counters if there was no work to do + + NOTE: if timing with multiple GPUs (streams) becomes possible, the + counters could end up being inconsistent due to not being incremented + on some of the nodes! */ + if (cu_nb->plist[iloc]->nsci == 0) + { + return; + } + + /* calculate the atom data index range based on locality */ + if (LOCAL_A(aloc)) + { + adat_end = cu_nb->atdat->natoms_local; + } + else + { + adat_end = cu_nb->atdat->natoms; + } + + if (cu_nb->bUseStreamSync) + { + stat = cudaStreamSynchronize(cu_nb->stream[iloc]); + CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb"); + } + else + { + /* Busy-wait until we get the signal pattern set in last byte + * of the l/nl float vector. This pattern corresponds to a floating + * point number which can't be the result of the force calculation + * (maximum, 127 exponent and 0 mantissa). + * The polling uses atomic compare-exchange. + */ + poll_word = (volatile unsigned int*)&nbatom->out[0].f[adat_end*3 - 1]; + while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern)) {} + } + + /* timing data accumulation */ + if (cu_nb->bDoTime) + { + /* only increase counter once (at local F wait) */ + if (LOCAL_I(iloc)) + { + timings->nb_c++; + timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1; + } + + /* kernel timings */ + timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 
1 : 0].t += + cu_event_elapsed(timers->start_nb_k[iloc], timers->stop_nb_k[iloc]); + + /* X/q H2D and F D2H timings */ + timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc], + timers->stop_nb_h2d[iloc]); + timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc], + timers->stop_nb_d2h[iloc]); + + /* only count atdat and pair-list H2D at pair-search step */ + if (plist->bDoPrune) + { + /* atdat transfer timing (add only once, at local F wait) */ + if (LOCAL_A(aloc)) + { + timings->pl_h2d_c++; + timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat, + timers->stop_atdat); + } + + timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc], + timers->stop_pl_h2d[iloc]); + } + } + + /* add up energies and shift forces (only once at local F wait) */ + if (LOCAL_I(iloc)) + { + if (bCalcEner) + { + *e_lj += *nbst.e_lj; + *e_el += *nbst.e_el; + } + + if (bCalcFshift) + { + for (i = 0; i < SHIFTS; i++) + { + fshift[i][0] += nbst.fshift[i].x; + fshift[i][1] += nbst.fshift[i].y; + fshift[i][2] += nbst.fshift[i].z; + } + } + } + + /* turn off pruning (doesn't matter if this is pair-search step or not) */ + plist->bDoPrune = false; +} + +/*! Return the reference to the nbfp texture. */ +const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref() +{ + return tex_nbfp; +} + +/*! Return the reference to the coulomb_tab texture. */ +const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref() +{ + return tex_coulomb_tab; +} + +/*! Set up the cache configuration for the non-bonded kernels. + */ +void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo) +{ + cudaError_t stat; + + for (int i = 0; i < eelCuNR; i++) + for (int j = 0; j < nEnergyKernelTypes; j++) + for (int k = 0; k < nPruneKernelTypes; k++) + { + /* Legacy kernel 16/48 kB Shared/L1 */ + stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1); + CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed"); + + if (devinfo->prop.major >= 3) + { + /* Default kernel on sm 3.x 48/16 kB Shared/L1 */ + stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferShared); + } + else + { + /* On Fermi prefer L1 gives 2% higher performance */ + /* Default kernel on sm_2.x 16/48 kB Shared/L1 */ + stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferL1); + } + CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed"); + } +} diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda.h b/src/mdlib/nbnxn_cuda/nbnxn_cuda.h new file mode 100644 index 0000000000..6eb2d970e3 --- /dev/null +++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda.h @@ -0,0 +1,87 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS development team, + * check out http://www.gromacs.org for more information. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable.
We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon + */ + +#ifndef NBNXN_CUDA_H +#define NBNXN_CUDA_H + +#include "types/nbnxn_cuda_types_ext.h" + +#ifdef GMX_GPU +#define FUNC_TERM ; +#else +#define FUNC_TERM {} +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/*! Launch asynchronously the nonbonded force calculations. + * This consists of the following (async) steps launched: + * - upload x and q; + * - upload shift vector; + * - launch kernel; + * The local and non-local interaction calculations are launched in two + * separate streams. + */ +void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb, + const nbnxn_atomdata_t *nbdata, + int flags, + int iloc) FUNC_TERM + +/*! Launch asynchronously the download of nonbonded forces from the GPU + * (and energies/shift forces if required). + */ +void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb, + const nbnxn_atomdata_t *nbatom, + int flags, + int aloc) FUNC_TERM + +/*! Wait for the asynchronously launched nonbonded calculations and data + * transfers to finish. + */ +void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb, + const nbnxn_atomdata_t * nbatom, + int flags, int aloc, + real *e_lj, real *e_el, + rvec *fshift) FUNC_TERM + +#ifdef __cplusplus +} +#endif + +#undef FUNC_TERM + +#endif /* NBNXN_CUDA_H */ diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu new file mode 100644 index 0000000000..35f990db78 --- /dev/null +++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu @@ -0,0 +1,884 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS development team, + * check out http://www.gromacs.org for more information. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. 
+ * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon + */ +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <stdlib.h> +#include <stdio.h> +#include <assert.h> + +#include "gmx_fatal.h" +#include "smalloc.h" +#include "tables.h" +#include "typedefs.h" +#include "types/nb_verlet.h" +#include "types/interaction_const.h" +#include "types/force_flags.h" +#include "../nbnxn_consts.h" + +#include "nbnxn_cuda_types.h" +#include "../../gmxlib/cuda_tools/cudautils.cuh" +#include "nbnxn_cuda_data_mgmt.h" +#include "pmalloc_cuda.h" +#include "gpu_utils.h" + +static bool bUseCudaEventBlockingSync = false; /* makes the CPU thread block */ + +/* This is a heuristically determined parameter for the Fermi architecture for + * the minimum size of ci lists by multiplying this constant with the # of + * multiprocessors on the current device. + */ +static unsigned int gpu_min_ci_balanced_factor = 40; + +/* Functions from nbnxn_cuda.cu */ +extern void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo); +extern const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref(); +extern const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref(); + +/* Fw. decl. */ +static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb); + + +/*! Tabulates the Ewald Coulomb force and initializes the size/scale + and the table GPU array. If called with an already allocated table, + it just re-uploads the table. + */ +static void init_ewald_coulomb_force_table(cu_nbparam_t *nbp) +{ + float *ftmp, *coul_tab; + int tabsize; + double tabscale; + cudaError_t stat; + + tabsize = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE; + /* Subtract 2 instead of 1 to avoid access out of range due to rounding */ + tabscale = (tabsize - 2) / sqrt(nbp->rcoulomb_sq); + + pmalloc((void**)&ftmp, tabsize*sizeof(*ftmp)); + + table_spline3_fill_ewald_lr(ftmp, NULL, tabsize, tableformatF, + 1/tabscale, nbp->ewald_beta); + + /* If the table pointer == NULL the table is generated the first time => + the array pointer will be saved to nbparam and the texture is bound. + */ + coul_tab = nbp->coulomb_tab; + if (coul_tab == NULL) + { + stat = cudaMalloc((void **)&coul_tab, tabsize*sizeof(*coul_tab)); + CU_RET_ERR(stat, "cudaMalloc failed on coul_tab"); + + nbp->coulomb_tab = coul_tab; + + cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>(); + stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(), + coul_tab, &cd, tabsize*sizeof(*coul_tab)); + CU_RET_ERR(stat, "cudaBindTexture on coul_tab failed"); + } + + cu_copy_H2D(coul_tab, ftmp, tabsize*sizeof(*coul_tab)); + + nbp->coulomb_tab_size = tabsize; + nbp->coulomb_tab_scale = tabscale; + + pfree(ftmp); +}
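To make the table scale above concrete, take the 1536-point table with an illustrative cut-off of rcoulomb = 1.0 nm (example numbers only):

/* tabscale = (1536 - 2)/1.0 = 1534 table points per nm                   */
/* lookup: index = (int)(r*tabscale); for r <= rcoulomb this stays        */
/* <= 1534, so the spline read of entry index+1 (the reason 2 is         */
/* subtracted rather than 1) never runs past the last entry, 1535.       */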
+ +/*! Initializes the atomdata structure the first time; it only gets filled at + pair-search. + */ +static void init_atomdata_first(cu_atomdata_t *ad, int ntypes) +{ + cudaError_t stat; + + ad->ntypes = ntypes; + stat = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec)); + CU_RET_ERR(stat, "cudaMalloc failed on ad->shift_vec"); + ad->bShiftVecUploaded = false; + + stat = cudaMalloc((void**)&ad->fshift, SHIFTS*sizeof(*ad->fshift)); + CU_RET_ERR(stat, "cudaMalloc failed on ad->fshift"); + + stat = cudaMalloc((void**)&ad->e_lj, sizeof(*ad->e_lj)); + CU_RET_ERR(stat, "cudaMalloc failed on ad->e_lj"); + stat = cudaMalloc((void**)&ad->e_el, sizeof(*ad->e_el)); + CU_RET_ERR(stat, "cudaMalloc failed on ad->e_el"); + + /* initialize to NULL pointers to data that is not allocated here and will + need reallocation in nbnxn_cuda_init_atomdata */ + ad->xq = NULL; + ad->f = NULL; + + /* size -1 indicates that the respective array hasn't been initialized yet */ + ad->natoms = -1; + ad->nalloc = -1; +} + +/*! Initializes the nonbonded parameter data structure. */ +static void init_nbparam(cu_nbparam_t *nbp, + const interaction_const_t *ic, + const nonbonded_verlet_t *nbv) +{ + cudaError_t stat; + int ntypes, nnbfp; + + ntypes = nbv->grp[0].nbat->ntype; + + nbp->ewald_beta = ic->ewaldcoeff; + nbp->sh_ewald = ic->sh_ewald; + nbp->epsfac = ic->epsfac; + nbp->two_k_rf = 2.0 * ic->k_rf; + nbp->c_rf = ic->c_rf; + nbp->rvdw_sq = ic->rvdw * ic->rvdw; + nbp->rcoulomb_sq= ic->rcoulomb * ic->rcoulomb; + nbp->rlist_sq = ic->rlist * ic->rlist; + nbp->sh_invrc6 = ic->sh_invrc6; + + if (ic->eeltype == eelCUT) + { + nbp->eeltype = eelCuCUT; + } + else if (EEL_RF(ic->eeltype)) + { + nbp->eeltype = eelCuRF; + } + else if ((EEL_PME(ic->eeltype) || ic->eeltype==eelEWALD)) + { + /* Initially rcoulomb == rvdw, so it's surely not twin cut-off, unless + forced by the env. var. (used only for benchmarking). */ + if (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL) + { + nbp->eeltype = eelCuEWALD; + } + else + { + nbp->eeltype = eelCuEWALD_TWIN; + } + } + else + { + /* Shouldn't happen, as this is checked when choosing Verlet-scheme */ + gmx_incons("The requested electrostatics type is not implemented in the CUDA GPU accelerated kernels!"); + } + + /* generate table for PME */ + if (nbp->eeltype == eelCuEWALD) + { + nbp->coulomb_tab = NULL; + init_ewald_coulomb_force_table(nbp); + } + + nnbfp = 2*ntypes*ntypes; + stat = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp)); + CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp"); + cu_copy_H2D(nbp->nbfp, nbv->grp[0].nbat->nbfp, nnbfp*sizeof(*nbp->nbfp)); + + cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>(); + stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(), + nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp)); + CU_RET_ERR(stat, "cudaBindTexture on nbfp failed"); +} + +/*! Re-generates the GPU Ewald force table, resets rlist, and updates the + * electrostatics type, switching to twin cut-off (or back) if needed. */ +void nbnxn_cuda_pmetune_update_param(nbnxn_cuda_ptr_t cu_nb, + const interaction_const_t *ic) +{ + cu_nbparam_t *nbp = cu_nb->nbparam; + + nbp->rlist_sq = ic->rlist * ic->rlist; + nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb; + nbp->ewald_beta = ic->ewaldcoeff; + + /* When switching to/from twin cut-off, the electrostatics type needs updating. + (The env. var. that forces twin cut-off is for benchmarking only!) */ + if (ic->rcoulomb == ic->rvdw && + getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL) + { + nbp->eeltype = eelCuEWALD; + } + else + { + nbp->eeltype = eelCuEWALD_TWIN; + } + + init_ewald_coulomb_force_table(cu_nb->nbparam); +} + +/*!
Initializes the pair list data structure. */ +static void init_plist(cu_plist_t *pl) +{ + /* initialize to NULL pointers to data that is not allocated here and will + need reallocation in nbnxn_cuda_init_pairlist */ + pl->sci = NULL; + pl->cj4 = NULL; + pl->excl = NULL; + + /* size -1 indicates that the respective array hasn't been initialized yet */ + pl->na_c = -1; + pl->nsci = -1; + pl->sci_nalloc = -1; + pl->ncj4 = -1; + pl->cj4_nalloc = -1; + pl->nexcl = -1; + pl->excl_nalloc = -1; + pl->bDoPrune = false; +} + +/*! Initializes the timer data structure. */ +static void init_timers(cu_timers_t *t, bool bUseTwoStreams) +{ + cudaError_t stat; + int eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync: cudaEventDefault ); + + stat = cudaEventCreateWithFlags(&(t->start_atdat), eventflags); + CU_RET_ERR(stat, "cudaEventCreate on start_atdat failed"); + stat = cudaEventCreateWithFlags(&(t->stop_atdat), eventflags); + CU_RET_ERR(stat, "cudaEventCreate on stop_atdat failed"); + + /* The non-local counters/stream (second in the array) are needed only with DD. */ + for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++) + { + stat = cudaEventCreateWithFlags(&(t->start_nb_k[i]), eventflags); + CU_RET_ERR(stat, "cudaEventCreate on start_nb_k failed"); + stat = cudaEventCreateWithFlags(&(t->stop_nb_k[i]), eventflags); + CU_RET_ERR(stat, "cudaEventCreate on stop_nb_k failed"); + + + stat = cudaEventCreateWithFlags(&(t->start_pl_h2d[i]), eventflags); + CU_RET_ERR(stat, "cudaEventCreate on start_pl_h2d failed"); + stat = cudaEventCreateWithFlags(&(t->stop_pl_h2d[i]), eventflags); + CU_RET_ERR(stat, "cudaEventCreate on stop_pl_h2d failed"); + + stat = cudaEventCreateWithFlags(&(t->start_nb_h2d[i]), eventflags); + CU_RET_ERR(stat, "cudaEventCreate on start_nb_h2d failed"); + stat = cudaEventCreateWithFlags(&(t->stop_nb_h2d[i]), eventflags); + CU_RET_ERR(stat, "cudaEventCreate on stop_nb_h2d failed"); + + stat = cudaEventCreateWithFlags(&(t->start_nb_d2h[i]), eventflags); + CU_RET_ERR(stat, "cudaEventCreate on start_nb_d2h failed"); + stat = cudaEventCreateWithFlags(&(t->stop_nb_d2h[i]), eventflags); + CU_RET_ERR(stat, "cudaEventCreate on stop_nb_d2h failed"); + } +} + +/*! Initializes the timings data structure. */ +static void init_timings(wallclock_gpu_t *t) +{ + int i, j; + + t->nb_h2d_t = 0.0; + t->nb_d2h_t = 0.0; + t->nb_c = 0; + t->pl_h2d_t = 0.0; + t->pl_h2d_c = 0; + for (i = 0; i < 2; i++) + { + for(j = 0; j < 2; j++) + { + t->ktime[i][j].t = 0.0; + t->ktime[i][j].c = 0; + } + } +} + +/* Decide which kernel version to use (default or legacy) based on: + * - CUDA version + * - non-bonded kernel selector environment variables + * - GPU SM version TODO ??? + */ +static int pick_nbnxn_kernel_version() +{ + bool bLegacyKernel, bDefaultKernel, bCUDA40, bCUDA32; + char sbuf[STRLEN]; + int kver; + + /* legacy kernel (former k2), kept for now for backward compatibility, + faster than the default with CUDA 3.2/4.0 (TODO: on Kepler?). */ + bLegacyKernel = (getenv("GMX_CUDA_NB_LEGACY") != NULL); + /* default kernel (former k3). 
*/ + bDefaultKernel = (getenv("GMX_CUDA_NB_DEFAULT") != NULL); + + if ((unsigned)(bLegacyKernel + bDefaultKernel) > 1) + { + gmx_fatal(FARGS, "Multiple CUDA non-bonded kernels requested; to manually pick a kernel set only one \n" + "of the following environment variables: \n" + "GMX_CUDA_NB_DEFAULT, GMX_CUDA_NB_LEGACY"); + } + + bCUDA32 = bCUDA40 = false; +#if CUDA_VERSION == 3200 + bCUDA32 = true; + sprintf(sbuf, "3.2"); +#elif CUDA_VERSION == 4000 + bCUDA40 = true; + sprintf(sbuf, "4.0"); +#endif + + /* default is default ;) */ + kver = eNbnxnCuKDefault; + + if (bCUDA32 || bCUDA40) + { + /* use legacy kernel unless something else is forced by an env. var */ + if (bDefaultKernel) + { + fprintf(stderr, + "\nNOTE: CUDA %s compilation detected; with this compiler version the legacy\n" + " non-bonded kernels perform best. However, the default kernels were\n" + " selected by the GMX_CUDA_NB_DEFAULT environment variable.\n" + " For best performance upgrade your CUDA toolkit.", + sbuf); + } + else + { + kver = eNbnxnCuKLegacy; + } + } + else + { + /* issue a note if the non-default kernel is forced by an env. var */ + if (bLegacyKernel) + { + fprintf(stderr, + "\nNOTE: Legacy non-bonded CUDA kernels were selected by the GMX_CUDA_NB_LEGACY\n" + " env. var. Consider using the default kernels which should be faster!\n"); + + kver = eNbnxnCuKLegacy; + } + } + + return kver; +} + +void nbnxn_cuda_init(FILE *fplog, + nbnxn_cuda_ptr_t *p_cu_nb, + gmx_gpu_info_t *gpu_info, int my_gpu_index, + gmx_bool bLocalAndNonlocal) +{ + cudaError_t stat; + nbnxn_cuda_ptr_t nb; + char sbuf[STRLEN]; + bool bStreamSync, bNoStreamSync, bTMPIAtomics, bX86; + + assert(gpu_info); + + if (p_cu_nb == NULL) return; + + snew(nb, 1); + snew(nb->atdat, 1); + snew(nb->nbparam, 1); + snew(nb->plist[eintLocal], 1); + if (bLocalAndNonlocal) + { + snew(nb->plist[eintNonlocal], 1); + } + + nb->bUseTwoStreams = bLocalAndNonlocal; + + snew(nb->timers, 1); + snew(nb->timings, 1); + + /* init nbst */ + pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj)); + pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el)); + pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift)); + + init_plist(nb->plist[eintLocal]); + + /* local/non-local GPU streams */ + stat = cudaStreamCreate(&nb->stream[eintLocal]); + CU_RET_ERR(stat, "cudaStreamCreate on stream[eintLocal] failed"); + if (nb->bUseTwoStreams) + { + init_plist(nb->plist[eintNonlocal]); + stat = cudaStreamCreate(&nb->stream[eintNonlocal]); + CU_RET_ERR(stat, "cudaStreamCreate on stream[eintNonlocal] failed"); + } + + /* init events for synchronization (timing disabled for performance reasons!) */ + stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming); + CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed"); + stat = cudaEventCreateWithFlags(&nb->misc_ops_done, cudaEventDisableTiming); + CU_RET_ERR(stat, "cudaEventCreate on misc_ops_done failed"); + + /* set device info, just point it to the right GPU among the detected ones */ + nb->dev_info = &gpu_info->cuda_dev[get_gpu_device_id(gpu_info, my_gpu_index)]; + + /* On GPUs with ECC enabled, cudaStreamSynchronize shows a large overhead + * (which increases with shorter time/step) caused by a known CUDA driver bug. + * To work around the issue we'll use an (admittedly fragile) memory polling + * wait to preserve performance. This requires support for atomic + * operations and only works on x86/x86_64. + * With polling wait event-timing also needs to be disabled.
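The branches that follow reduce to a small decision table (a summary of the code below, not additional logic):

  ECC   x86 + TMPI atomics   env. var                 -> waiting method
  on    yes                  none                     -> polling wait
  on    no                   none                     -> cudaStreamSynchronize + warning
  on    any                  GMX_CUDA_STREAMSYNC      -> cudaStreamSynchronize + note
  off   any                  none                     -> cudaStreamSynchronize
  off   any                  GMX_NO_CUDA_STREAMSYNC   -> polling wait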
+ */ + + bStreamSync = getenv("GMX_CUDA_STREAMSYNC") != NULL; + bNoStreamSync = getenv("GMX_NO_CUDA_STREAMSYNC") != NULL; + +#ifdef TMPI_ATOMICS + bTMPIAtomics = true; +#else + bTMPIAtomics = false; +#endif + +#if defined(i386) || defined(__x86_64__) + bX86 = true; +#else + bX86 = false; +#endif + + if (bStreamSync && bNoStreamSync) + { + gmx_fatal(FARGS, "Conflicting environment variables: both GMX_CUDA_STREAMSYNC and GMX_NO_CUDA_STREAMSYNC defined"); + } + + if (nb->dev_info->prop.ECCEnabled == 1) + { + if (bStreamSync) + { + nb->bUseStreamSync = true; + + sprintf(sbuf, + "NOTE: Using a GPU with ECC enabled, but cudaStreamSynchronize-based waiting is\n" + " forced by the GMX_CUDA_STREAMSYNC env. var. Due to a CUDA bug, this \n" + " combination causes performance loss."); + fprintf(stderr, "\n%s\n", sbuf); + if (fplog) + { + fprintf(fplog, "\n%s\n", sbuf); + } + } + else + { + /* can use polling wait only on x86/x86_64 *if* atomics are available */ + nb->bUseStreamSync = ((bX86 && bTMPIAtomics) == false); + + if (!bX86) + { + sprintf(sbuf, + "Using a GPU with ECC on; the standard cudaStreamSynchronize waiting, due to a\n" + " CUDA bug, causes performance loss when used in combination with ECC.\n" + " However, the polling waiting workaround can not be used as it is only\n" + " supported on x86/x86_64, but not on the current architecture."); + gmx_warning("%s\n", sbuf); + if (fplog) + { + fprintf(fplog, "\n%s\n", sbuf); + } + + } + else if (bTMPIAtomics) + { + if (fplog) + { + fprintf(fplog, + "NOTE: Using a GPU with ECC enabled; will use polling waiting.\n"); + } + } + else + { + sprintf(sbuf, + "Using a GPU with ECC on; the standard cudaStreamSynchronize waiting, due to a\n" + " CUDA bug, causes performance loss when used in combination with ECC.\n" + " However, the polling waiting workaround can not be used as atomic\n" + " operations are not supported by the current CPU+compiler combination."); + gmx_warning("%s\n", sbuf); + if (fplog) + { + fprintf(fplog, "\n%s\n", sbuf); + } + } + } + } + else + { + if (bNoStreamSync) + { + nb->bUseStreamSync = false; + + sprintf(sbuf, + "NOTE: Using a GPU with no/disabled ECC, but cudaStreamSynchronize-based waiting\n" + " is turned off and polling turned on by the GMX_NO_CUDA_STREAMSYNC env. var."); + fprintf(stderr, "\n%s\n", sbuf); + if (fplog) + { + fprintf(fplog, "\n%s\n", sbuf); + } + } + else + { + /* no/off ECC, cudaStreamSynchronize not turned off by env. var. */ + nb->bUseStreamSync = true; + } + } + + /* CUDA timing disabled as event timers don't work: + - with multiple streams = domain-decomposition; + - with the polling waiting hack (without cudaStreamSynchronize); + - when turned off by GMX_DISABLE_CUDA_TIMING. 
+ */ + nb->bDoTime = (!nb->bUseTwoStreams && nb->bUseStreamSync && + (getenv("GMX_DISABLE_CUDA_TIMING") == NULL)); + + if (nb->bDoTime) + { + init_timers(nb->timers, nb->bUseTwoStreams); + init_timings(nb->timings); + } + + /* set the kernel type for the current GPU */ + nb->kernel_ver = pick_nbnxn_kernel_version(); + /* pick L1 cache configuration */ + nbnxn_cuda_set_cacheconfig(nb->dev_info); + + *p_cu_nb = nb; + + if (debug) + { + fprintf(debug, "Initialized CUDA data structures.\n"); + } +} + +void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t cu_nb, + const interaction_const_t *ic, + const nonbonded_verlet_t *nbv) +{ + init_atomdata_first(cu_nb->atdat, nbv->grp[0].nbat->ntype); + init_nbparam(cu_nb->nbparam, ic, nbv); + + /* clear energy and shift force outputs */ + nbnxn_cuda_clear_e_fshift(cu_nb); +} + +void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t cu_nb, + const nbnxn_pairlist_t *h_plist, + int iloc) +{ + char sbuf[STRLEN]; + cudaError_t stat; + bool bDoTime = cu_nb->bDoTime; + cudaStream_t stream = cu_nb->stream[iloc]; + cu_plist_t *d_plist = cu_nb->plist[iloc]; + + if (d_plist->na_c < 0) + { + d_plist->na_c = h_plist->na_ci; + } + else + { + if (d_plist->na_c != h_plist->na_ci) + { + sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)", + d_plist->na_c, h_plist->na_ci); + gmx_incons(sbuf); + } + } + + if (bDoTime) + { + stat = cudaEventRecord(cu_nb->timers->start_pl_h2d[iloc], stream); + CU_RET_ERR(stat, "cudaEventRecord failed"); + } + + cu_realloc_buffered((void **)&d_plist->sci, h_plist->sci, sizeof(*d_plist->sci), + &d_plist->nsci, &d_plist->sci_nalloc, + h_plist->nsci, + stream, true); + + cu_realloc_buffered((void **)&d_plist->cj4, h_plist->cj4, sizeof(*d_plist->cj4), + &d_plist->ncj4, &d_plist->cj4_nalloc, + h_plist->ncj4, + stream, true); + + cu_realloc_buffered((void **)&d_plist->excl, h_plist->excl, sizeof(*d_plist->excl), + &d_plist->nexcl, &d_plist->excl_nalloc, + h_plist->nexcl, + stream, true); + + if (bDoTime) + { + stat = cudaEventRecord(cu_nb->timers->stop_pl_h2d[iloc], stream); + CU_RET_ERR(stat, "cudaEventRecord failed"); + } + + /* need to prune the pair list during the next step */ + d_plist->bDoPrune = true; +} + +void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t cu_nb, + const nbnxn_atomdata_t *nbatom) +{ + cu_atomdata_t *adat = cu_nb->atdat; + cudaStream_t ls = cu_nb->stream[eintLocal]; + + /* only if we have a dynamic box */ + if (nbatom->bDynamicBox || !adat->bShiftVecUploaded) + { + cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec, + SHIFTS * sizeof(*adat->shift_vec), ls); + adat->bShiftVecUploaded = true; + } +} + +/*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */ +static void nbnxn_cuda_clear_f(nbnxn_cuda_ptr_t cu_nb, int natoms_clear) +{ + cudaError_t stat; + cu_atomdata_t *adat = cu_nb->atdat; + cudaStream_t ls = cu_nb->stream[eintLocal]; + + stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls); + CU_RET_ERR(stat, "cudaMemsetAsync on f failed"); +}
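cu_realloc_buffered(), used in nbnxn_cuda_init_pairlist() above, lives in cudautils.cuh; its body is not part of this hunk, but judging from the call sites it behaves roughly like this simplified sketch (hypothetical helper name, no timing or error handling):

static void realloc_and_upload(void **d_dest, const void *h_src, size_t elsize,
                               int *n, int *nalloc, int new_n, cudaStream_t s)
{
    if (new_n > *nalloc)
    {
        if (*nalloc > 0)
        {
            cudaFree(*d_dest);                 /* old contents are not preserved */
        }
        *nalloc = (int)(1.2*new_n) + 100;      /* grow with headroom */
        cudaMalloc(d_dest, (*nalloc)*elsize);
    }
    *n = new_n;
    cudaMemcpyAsync(*d_dest, h_src, new_n*elsize, cudaMemcpyHostToDevice, s);
}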
+ +/*! Clears nonbonded shift force output array and energy outputs on the GPU. + */ +static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb) +{ + cudaError_t stat; + cu_atomdata_t *adat = cu_nb->atdat; + cudaStream_t ls = cu_nb->stream[eintLocal]; + + stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls); + CU_RET_ERR(stat, "cudaMemsetAsync on fshift failed"); + stat = cudaMemsetAsync(adat->e_lj, 0, sizeof(*adat->e_lj), ls); + CU_RET_ERR(stat, "cudaMemsetAsync on e_lj failed"); + stat = cudaMemsetAsync(adat->e_el, 0, sizeof(*adat->e_el), ls); + CU_RET_ERR(stat, "cudaMemsetAsync on e_el failed"); +} + +void nbnxn_cuda_clear_outputs(nbnxn_cuda_ptr_t cu_nb, int flags) +{ + nbnxn_cuda_clear_f(cu_nb, cu_nb->atdat->natoms); + /* clear shift force array and energies if the outputs were + used in the current step */ + if (flags & GMX_FORCE_VIRIAL) + { + nbnxn_cuda_clear_e_fshift(cu_nb); + } +} + +void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t cu_nb, + const nbnxn_atomdata_t *nbat) +{ + cudaError_t stat; + int nalloc, natoms; + bool realloced; + bool bDoTime = cu_nb->bDoTime; + cu_timers_t *timers = cu_nb->timers; + cu_atomdata_t *d_atdat = cu_nb->atdat; + cudaStream_t ls = cu_nb->stream[eintLocal]; + + natoms = nbat->natoms; + realloced = false; + + if (bDoTime) + { + /* time async copy */ + stat = cudaEventRecord(timers->start_atdat, ls); + CU_RET_ERR(stat, "cudaEventRecord failed"); + } + + /* need to reallocate if we have to copy more atoms than the amount of space + available and only allocate if we haven't initialized yet, i.e. d_atdat->natoms == -1 */ + if (natoms > d_atdat->nalloc) + { + nalloc = over_alloc_small(natoms); + + /* free up first if the arrays have already been initialized */ + if (d_atdat->nalloc != -1) + { + cu_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc); + cu_free_buffered(d_atdat->xq); + cu_free_buffered(d_atdat->atom_types); + } + + stat = cudaMalloc((void **)&d_atdat->f, nalloc*sizeof(*d_atdat->f)); + CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->f"); + stat = cudaMalloc((void **)&d_atdat->xq, nalloc*sizeof(*d_atdat->xq)); + CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->xq"); + + stat = cudaMalloc((void **)&d_atdat->atom_types, nalloc*sizeof(*d_atdat->atom_types)); + CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types"); + + d_atdat->nalloc = nalloc; + realloced = true; + } + + d_atdat->natoms = natoms; + d_atdat->natoms_local = nbat->natoms_local; + + /* need to clear GPU f output if realloc happened */ + if (realloced) + { + nbnxn_cuda_clear_f(cu_nb, nalloc); + } + + cu_copy_H2D_async(d_atdat->atom_types, nbat->type, + natoms*sizeof(*d_atdat->atom_types), ls); + + if (bDoTime) + { + stat = cudaEventRecord(timers->stop_atdat, ls); + CU_RET_ERR(stat, "cudaEventRecord failed"); + } +} + +void nbnxn_cuda_free(FILE *fplog, nbnxn_cuda_ptr_t cu_nb) +{ + cudaError_t stat; + cu_atomdata_t *atdat; + cu_nbparam_t *nbparam; + cu_plist_t *plist, *plist_nl; + cu_timers_t *timers; + + if (cu_nb == NULL) return; + + atdat = cu_nb->atdat; + nbparam = cu_nb->nbparam; + plist = cu_nb->plist[eintLocal]; + plist_nl = cu_nb->plist[eintNonlocal]; + timers = cu_nb->timers; + + if (nbparam->eeltype == eelCuEWALD || nbparam->eeltype == eelCuEWALD_TWIN) + { + stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref()); + CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab failed"); + cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size); + } + + stat = cudaEventDestroy(cu_nb->nonlocal_done); + CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done"); + stat =
cudaEventDestroy(cu_nb->misc_ops_done); + CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_done"); + + if (cu_nb->bDoTime) + { + stat = cudaEventDestroy(timers->start_atdat); + CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_atdat"); + stat = cudaEventDestroy(timers->stop_atdat); + CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_atdat"); + + /* The non-local counters/stream (second in the array) are needed only with DD. */ + for (int i = 0; i <= (cu_nb->bUseTwoStreams ? 1 : 0); i++) + { + stat = cudaEventDestroy(timers->start_nb_k[i]); + CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_k"); + stat = cudaEventDestroy(timers->stop_nb_k[i]); + CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_k"); + + stat = cudaEventDestroy(timers->start_pl_h2d[i]); + CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_pl_h2d"); + stat = cudaEventDestroy(timers->stop_pl_h2d[i]); + CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_pl_h2d"); + + stat = cudaStreamDestroy(cu_nb->stream[i]); + CU_RET_ERR(stat, "cudaStreamDestroy failed on stream"); + + stat = cudaEventDestroy(timers->start_nb_h2d[i]); + CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_h2d"); + stat = cudaEventDestroy(timers->stop_nb_h2d[i]); + CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_h2d"); + + stat = cudaEventDestroy(timers->start_nb_d2h[i]); + CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_d2h"); + stat = cudaEventDestroy(timers->stop_nb_d2h[i]); + CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_d2h"); + } + } + + stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref()); + CU_RET_ERR(stat, "cudaUnbindTexture on nbfp failed"); + cu_free_buffered(nbparam->nbfp); + + stat = cudaFree(atdat->shift_vec); + CU_RET_ERR(stat, "cudaFree failed on atdat->shift_vec"); + stat = cudaFree(atdat->fshift); + CU_RET_ERR(stat, "cudaFree failed on atdat->fshift"); + + stat = cudaFree(atdat->e_lj); + CU_RET_ERR(stat, "cudaFree failed on atdat->e_lj"); + stat = cudaFree(atdat->e_el); + CU_RET_ERR(stat, "cudaFree failed on atdat->e_el"); + + cu_free_buffered(atdat->f, &atdat->natoms, &atdat->nalloc); + cu_free_buffered(atdat->xq); + cu_free_buffered(atdat->atom_types, &atdat->ntypes); + + cu_free_buffered(plist->sci, &plist->nsci, &plist->sci_nalloc); + cu_free_buffered(plist->cj4, &plist->ncj4, &plist->cj4_nalloc); + cu_free_buffered(plist->excl, &plist->nexcl, &plist->excl_nalloc); + if (cu_nb->bUseTwoStreams) + { + cu_free_buffered(plist_nl->sci, &plist_nl->nsci, &plist_nl->sci_nalloc); + cu_free_buffered(plist_nl->cj4, &plist_nl->ncj4, &plist_nl->cj4_nalloc); + cu_free_buffered(plist_nl->excl, &plist_nl->nexcl, &plist_nl->excl_nalloc); + } + + if (debug) + { + fprintf(debug, "Cleaned up CUDA data structures.\n"); + } +} + +void cu_synchstream_atdat(nbnxn_cuda_ptr_t cu_nb, int iloc) +{ + cudaError_t stat; + cudaStream_t stream = cu_nb->stream[iloc]; + + stat = cudaStreamWaitEvent(stream, cu_nb->timers->stop_atdat, 0); + CU_RET_ERR(stat, "cudaStreamWaitEvent failed"); +} + +wallclock_gpu_t * nbnxn_cuda_get_timings(nbnxn_cuda_ptr_t cu_nb) +{ + return (cu_nb != NULL && cu_nb->bDoTime) ? cu_nb->timings : NULL; +} + +void nbnxn_cuda_reset_timings(nbnxn_cuda_ptr_t cu_nb) +{ + if (cu_nb->bDoTime) + { + init_timings(cu_nb->timings); + } +} + +int nbnxn_cuda_min_ci_balanced(nbnxn_cuda_ptr_t cu_nb) +{ + return cu_nb != NULL ?
+ gpu_min_ci_balanced_factor*cu_nb->dev_info->prop.multiProcessorCount : 0; + +} diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh new file mode 100644 index 0000000000..19bc1374c9 --- /dev/null +++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh @@ -0,0 +1,421 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS development team, + * check out http://www.gromacs.org for more information. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon + */ + +#if __CUDA_ARCH__ >= 300 +#define REDUCE_SHUFFLE +/* On Kepler pre-loading i-atom types to shmem gives a few %, + but on Fermi it does not */ +#define IATYPE_SHMEM +#endif + +/* + Kernel launch parameters: + - #blocks = #pair lists, blockId = pair list Id + - #threads = CL_SIZE^2 + - shmem = CL_SIZE^2 * sizeof(float) + + Each thread calculates an i force-component taking one pair of i-j atoms. 
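Spelled out, the thread-to-pair mapping used throughout the kernel is as follows (a recap of the indexing below, with the nbnxn_consts.h values CL_SIZE = 8 and NCL_PER_SUPERCL = 8):

/* thread (tidxi,tidxj), tidxi,tidxj in [0,CL_SIZE):
   accumulates the force on i atom  ai = (sci*NCL_PER_SUPERCL + i)*CL_SIZE + tidxi
   from j atom                      aj = cj*CL_SIZE + tidxj,
   looping over the 8 i-clusters of super-cluster sci and over the
   4 j-clusters packed into each cj4 pair-list entry. */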
+ */ +#ifdef PRUNE_NBL +#ifdef CALC_ENERGIES +__global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener_prune) +#else +__global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _prune) +#endif +#else +#ifdef CALC_ENERGIES +__global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener) +#else +__global__ void NB_KERNEL_FUNC_NAME(k_nbnxn) +#endif +#endif + (const cu_atomdata_t atdat, + const cu_nbparam_t nbparam, + const cu_plist_t plist, + bool bCalcFshift) +{ + /* convenience variables */ + const nbnxn_sci_t *pl_sci = plist.sci; +#ifndef PRUNE_NBL + const +#endif + nbnxn_cj4_t *pl_cj4 = plist.cj4; + const nbnxn_excl_t *excl = plist.excl; + const int *atom_types = atdat.atom_types; + int ntypes = atdat.ntypes; + const float4 *xq = atdat.xq; + float3 *f = atdat.f; + const float3 *shift_vec = atdat.shift_vec; + float rcoulomb_sq = nbparam.rcoulomb_sq; +#ifdef VDW_CUTOFF_CHECK + float rvdw_sq = nbparam.rvdw_sq; + float vdw_in_range; +#endif +#ifdef EL_RF + float two_k_rf = nbparam.two_k_rf; +#endif +#ifdef EL_EWALD + float coulomb_tab_scale = nbparam.coulomb_tab_scale; +#endif +#ifdef PRUNE_NBL + float rlist_sq = nbparam.rlist_sq; +#endif + +#ifdef CALC_ENERGIES + float lj_shift = nbparam.sh_invrc6; +#ifdef EL_EWALD + float beta = nbparam.ewald_beta; + float ewald_shift = nbparam.sh_ewald; +#else + float c_rf = nbparam.c_rf; +#endif + float *e_lj = atdat.e_lj; + float *e_el = atdat.e_el; +#endif + + /* thread/block/warp id-s */ + unsigned int tidxi = threadIdx.x; + unsigned int tidxj = threadIdx.y; + unsigned int tidx = threadIdx.y * blockDim.x + threadIdx.x; + unsigned int bidx = blockIdx.x; + unsigned int widx = tidx / WARP_SIZE; /* warp index */ + + int sci, ci, cj, ci_offset, + ai, aj, + cij4_start, cij4_end, + typei, typej, + i, jm, j4, wexcl_idx; + float qi, qj_f, + r2, inv_r, inv_r2, inv_r6, + c6, c12, + int_bit, +#ifdef CALC_ENERGIES + E_lj, E_el, +#endif + F_invr; + unsigned int wexcl, imask, mask_ji; + float4 xqbuf; + float3 xi, xj, rv, f_ij, fcj_buf, fshift_buf; + float3 fci_buf[NCL_PER_SUPERCL]; /* i force buffer */ + nbnxn_sci_t nb_sci; + + /* shmem buffer for i x+q pre-loading */ + extern __shared__ float4 xqib[]; +#ifdef IATYPE_SHMEM + /* shmem buffer for i atom-type pre-loading */ + int *atib = (int *)(xqib + NCL_PER_SUPERCL * CL_SIZE); +#endif + +#ifndef REDUCE_SHUFFLE + /* shmem j force buffer */ +#ifdef IATYPE_SHMEM + float *f_buf = (float *)(atib + NCL_PER_SUPERCL * CL_SIZE); +#else + float *f_buf = (float *)(xqib + NCL_PER_SUPERCL * CL_SIZE); +#endif +#endif + + nb_sci = pl_sci[bidx]; /* my i super-cluster's index = current bidx */ + sci = nb_sci.sci; /* super-cluster */ + cij4_start = nb_sci.cj4_ind_start; /* first ...*/ + cij4_end = nb_sci.cj4_ind_end; /* and last index of j clusters */ + + /* Store the i-atom x and q in shared memory */ + /* Note: the thread indexing here is inverted with respect to the + inner-loop as this results in slightly higher performance */ + ci = sci * NCL_PER_SUPERCL + tidxi; + ai = ci * CL_SIZE + tidxj; + xqib[tidxi * CL_SIZE + tidxj] = xq[ai] + shift_vec[nb_sci.shift]; +#ifdef IATYPE_SHMEM + ci = sci * NCL_PER_SUPERCL + tidxj; + ai = ci * CL_SIZE + tidxi; + atib[tidxj * CL_SIZE + tidxi] = atom_types[ai]; +#endif + __syncthreads(); + + for(ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++) + { + fci_buf[ci_offset] = make_float3(0.0f); + } + +#ifdef CALC_ENERGIES + E_lj = 0.0f; + E_el = 0.0f; + +#if defined EL_EWALD || defined EL_RF + if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*NCL_PER_SUPERCL) + { + /* we have the diagonal: add the charge self 
interaction energy term */
+        for (i = 0; i < NCL_PER_SUPERCL; i++)
+        {
+            qi    = xqib[i * CL_SIZE + tidxi].w;
+            E_el += qi*qi;
+        }
+        /* divide the self term equally over the j-threads */
+        E_el /= CL_SIZE;
+#ifdef EL_RF
+        E_el *= -nbparam.epsfac*0.5f*c_rf;
+#else
+        E_el *= -nbparam.epsfac*beta*0.56418958f; /* last factor 1/sqrt(pi) */
+#endif
+    }
+#endif
+#endif
+
+    /* skip central shifts when summing shift forces */
+    if (nb_sci.shift == CENTRAL)
+    {
+        bCalcFshift = false;
+    }
+
+    fshift_buf = make_float3(0.0f);
+
+    /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
+    for (j4 = cij4_start; j4 < cij4_end; j4++)
+    {
+        wexcl_idx = pl_cj4[j4].imei[widx].excl_ind;
+        imask     = pl_cj4[j4].imei[widx].imask;
+        wexcl     = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)];
+
+#ifndef PRUNE_NBL
+        if (imask)
+#endif
+        {
+            /* Unrolling this loop
+               - with pruning leads to register spilling;
+               - on Kepler is much slower;
+               - doesn't work on CUDA <v4.1 */
+#if !defined PRUNE_NBL && __CUDA_ARCH__ < 300 && CUDA_VERSION >= 4010
+#pragma unroll 4
+#endif
+            for (jm = 0; jm < 4; jm++)
+            {
+                if (imask & (255U << (jm * NCL_PER_SUPERCL)))
+                {
+                    mask_ji = (1U << (jm * NCL_PER_SUPERCL));
+
+                    cj      = pl_cj4[j4].cj[jm];
+                    aj      = cj * CL_SIZE + tidxj;
+
+                    /* load j atom data */
+                    xqbuf   = xq[aj];
+                    xj      = make_float3(xqbuf.x, xqbuf.y, xqbuf.z);
+                    qj_f    = nbparam.epsfac * xqbuf.w;
+                    typej   = atom_types[aj];
+
+                    fcj_buf = make_float3(0.0f);
+
+                    /* The PME and RF kernels don't unroll with CUDA <v4.1 */
+#if !defined PRUNE_NBL && !(CUDA_VERSION < 4010 && (defined EL_EWALD || defined EL_RF))
+#pragma unroll 8
+#endif
+                    for (i = 0; i < NCL_PER_SUPERCL; i++)
+                    {
+                        if (imask & mask_ji)
+                        {
+                            ci_offset = i;                       /* i force buffer offset */
+
+                            ci = sci * NCL_PER_SUPERCL + i;      /* i cluster index */
+                            ai = ci * CL_SIZE + tidxi;           /* i atom index */
+
+                            /* all threads load an atom from i cluster ci into shmem! */
+                            xqbuf = xqib[i * CL_SIZE + tidxi];
+                            xi    = make_float3(xqbuf.x, xqbuf.y, xqbuf.z);
+
+                            /* distance between i and j atoms */
+                            rv = xi - xj;
+                            r2 = norm2(rv);
+
+#ifdef PRUNE_NBL
+                            /* If _none_ of the atoms pairs are in cutoff range,
+                               the bit corresponding to the current
+                               cluster-pair in imask gets set to 0. */
+                            if (!__any(r2 < rlist_sq))
+                            {
+                                imask &= ~mask_ji;
+                            }
+#endif
+
+                            int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
+
+                            /* cutoff & exclusion check */
+#if defined EL_EWALD || defined EL_RF
+                            if (r2 < rcoulomb_sq *
+                                (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
+#else
+                            if (r2 < rcoulomb_sq * int_bit)
+#endif
+                            {
+                                /* load the rest of the i-atom parameters */
+                                qi = xqbuf.w;
+#ifdef IATYPE_SHMEM
+                                typei = atib[i * CL_SIZE + tidxi];
+#else
+                                typei = atom_types[ai];
+#endif
+
+                                /* LJ 6*C6 and 12*C12 */
+                                c6  = tex1Dfetch(tex_nbfp, 2 * (ntypes * typei + typej));
+                                c12 = tex1Dfetch(tex_nbfp, 2 * (ntypes * typei + typej) + 1);
+
+                                /* avoid NaN for excluded pairs at r=0 */
+                                r2 += (1.0f - int_bit) * NBNXN_AVOID_SING_R2_INC;
+
+                                inv_r  = rsqrt(r2);
+                                inv_r2 = inv_r * inv_r;
+                                inv_r6 = inv_r2 * inv_r2 * inv_r2;
+#if defined EL_EWALD || defined EL_RF
+                                /* We could mask inv_r2, but with Ewald
+                                 * masking both inv_r6 and F_invr is faster */
+                                inv_r6 *= int_bit;
+#endif
+
+                                F_invr = inv_r6 * (c12 * inv_r6 - c6) * inv_r2;
+
+#ifdef CALC_ENERGIES
+                                E_lj += int_bit * (c12 * (inv_r6 * inv_r6 - lj_shift * lj_shift) * 0.08333333f - c6 * (inv_r6 - lj_shift) * 0.16666667f);
+#endif
+
+#ifdef VDW_CUTOFF_CHECK
+                                /* this enables twin-range cut-offs (rvdw < rcoulomb <= rlist) */
+                                vdw_in_range = (r2 < rvdw_sq) ?
1.0f : 0.0f; + F_invr *= vdw_in_range; +#ifdef CALC_ENERGIES + E_lj *= vdw_in_range; +#endif +#endif + +#ifdef EL_CUTOFF + F_invr += qi * qj_f * inv_r2 * inv_r; +#endif +#ifdef EL_RF + F_invr += qi * qj_f * (int_bit*inv_r2 * inv_r - two_k_rf); +#endif +#ifdef EL_EWALD + F_invr += qi * qj_f * (int_bit*inv_r2 - interpolate_coulomb_force_r(r2 * inv_r, coulomb_tab_scale)) * inv_r; +#endif + +#ifdef CALC_ENERGIES +#ifdef EL_CUTOFF + E_el += qi * qj_f * (inv_r - c_rf); +#endif +#ifdef EL_RF + E_el += qi * qj_f * (int_bit*inv_r + 0.5f * two_k_rf * r2 - c_rf); +#endif +#ifdef EL_EWALD + /* 1.0f - erff is faster than erfcf */ + E_el += qi * qj_f * (inv_r * (int_bit - erff(r2 * inv_r * beta)) - int_bit * ewald_shift); +#endif +#endif + f_ij = rv * F_invr; + + /* accumulate j forces in registers */ + fcj_buf -= f_ij; + + /* accumulate i forces in registers */ + fci_buf[ci_offset] += f_ij; + } + } + + /* shift the mask bit by 1 */ + mask_ji += mask_ji; + } + + /* reduce j forces */ +#ifdef REDUCE_SHUFFLE + reduce_force_j_warp_shfl(fcj_buf, f, tidxi, aj); +#else + /* store j forces in shmem */ + f_buf[ tidx] = fcj_buf.x; + f_buf[ FBUF_STRIDE + tidx] = fcj_buf.y; + f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z; + + reduce_force_j_generic(f_buf, f, tidxi, tidxj, aj); +#endif + } + } +#ifdef PRUNE_NBL + /* Update the imask with the new one which does not contain the + out of range clusters anymore. */ + pl_cj4[j4].imei[widx].imask = imask; +#endif + } + } + + /* reduce i forces */ + for(ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++) + { + ai = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi; +#ifdef REDUCE_SHUFFLE + reduce_force_i_warp_shfl(fci_buf[ci_offset], f, + &fshift_buf, bCalcFshift, + tidxj, ai); +#else + f_buf[ tidx] = fci_buf[ci_offset].x; + f_buf[ FBUF_STRIDE + tidx] = fci_buf[ci_offset].y; + f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z; + __syncthreads(); + reduce_force_i(f_buf, f, + &fshift_buf, bCalcFshift, + tidxi, tidxj, ai); + __syncthreads(); +#endif + } + + /* add up local shift forces into global mem */ +#ifdef REDUCE_SHUFFLE + if (bCalcFshift && (tidxj == 0 || tidxj == 4)) +#else + if (bCalcFshift && tidxj == 0) +#endif + { + atomicAdd(&atdat.fshift[nb_sci.shift].x, fshift_buf.x); + atomicAdd(&atdat.fshift[nb_sci.shift].y, fshift_buf.y); + atomicAdd(&atdat.fshift[nb_sci.shift].z, fshift_buf.z); + } + +#ifdef CALC_ENERGIES +#ifdef REDUCE_SHUFFLE + /* reduce the energies over warps and store into global memory */ + reduce_energy_warp_shfl(E_lj, E_el, e_lj, e_el, tidx); +#else + /* flush the energies to shmem and reduce them */ + f_buf[ tidx] = E_lj; + f_buf[FBUF_STRIDE + tidx] = E_el; + reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE); +#endif +#endif +} diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh new file mode 100644 index 0000000000..39eb9988c1 --- /dev/null +++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh @@ -0,0 +1,375 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS development team, + * check out http://www.gromacs.org for more information. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon + */ + +/* + Kernel launch parameters: + - #blocks = #pair lists, blockId = pair list Id + - #threads = CL_SIZE^2 + - shmem = CL_SIZE^2 * sizeof(float) + + Each thread calculates an i force-component taking one pair of i-j atoms. + */ +#ifdef PRUNE_NBL +#ifdef CALC_ENERGIES +__global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener_prune_legacy) +#else +__global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _prune_legacy) +#endif +#else +#ifdef CALC_ENERGIES +__global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener_legacy) +#else +__global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy) +#endif +#endif + (const cu_atomdata_t atdat, + const cu_nbparam_t nbparam, + const cu_plist_t plist, + bool bCalcFshift) +{ + /* convenience variables */ + const nbnxn_sci_t *pl_sci = plist.sci; +#ifndef PRUNE_NBL + const +#endif + nbnxn_cj4_t *pl_cj4 = plist.cj4; + const nbnxn_excl_t *excl = plist.excl; + const int *atom_types = atdat.atom_types; + int ntypes = atdat.ntypes; + const float4 *xq = atdat.xq; + float3 *f = atdat.f; + const float3 *shift_vec = atdat.shift_vec; + float rcoulomb_sq = nbparam.rcoulomb_sq; +#ifdef VDW_CUTOFF_CHECK + float rvdw_sq = nbparam.rvdw_sq; + float vdw_in_range; +#endif +#ifdef EL_RF + float two_k_rf = nbparam.two_k_rf; +#endif +#ifdef EL_EWALD + float coulomb_tab_scale = nbparam.coulomb_tab_scale; +#endif +#ifdef PRUNE_NBL + float rlist_sq = nbparam.rlist_sq; +#endif + +#ifdef CALC_ENERGIES + float lj_shift = nbparam.sh_invrc6; +#ifdef EL_EWALD + float beta = nbparam.ewald_beta; + float ewald_shift = nbparam.sh_ewald; +#else + float c_rf = nbparam.c_rf; +#endif + float *e_lj = atdat.e_lj; + float *e_el = atdat.e_el; +#endif + + /* thread/block/warp id-s */ + unsigned int tidxi = threadIdx.x; + unsigned int tidxj = threadIdx.y; + unsigned int tidx = threadIdx.y * blockDim.x + threadIdx.x; + unsigned int bidx = blockIdx.x; + unsigned int widx = tidx / WARP_SIZE; /* warp index */ + + int sci, ci, cj, ci_offset, + ai, aj, + cij4_start, cij4_end, + typei, typej, + i, cii, jm, j4, nsubi, wexcl_idx; + float qi, qj_f, + r2, inv_r, inv_r2, inv_r6, + c6, c12, +#ifdef CALC_ENERGIES + E_lj, E_el, +#endif + F_invr; + unsigned int wexcl, int_bit, imask, imask_j; +#ifdef PRUNE_NBL + unsigned int imask_prune; +#endif + float4 xqbuf; + float3 xi, xj, rv, f_ij, fcj_buf, fshift_buf; + float3 fci_buf[NCL_PER_SUPERCL]; /* i force buffer */ + nbnxn_sci_t nb_sci; + + /* shmem buffer for i x+q pre-loading */ + extern __shared__ float4 xqib[]; + /* shmem j force buffer */ + float *f_buf = (float *)(xqib + NCL_PER_SUPERCL * CL_SIZE); + + nb_sci = pl_sci[bidx]; 
/* my i super-cluster's index = current bidx */ + sci = nb_sci.sci; /* super-cluster */ + cij4_start = nb_sci.cj4_ind_start; /* first ...*/ + cij4_end = nb_sci.cj4_ind_end; /* and last index of j clusters */ + + /* Store the i-atom x and q in shared memory */ + /* Note: the thread indexing here is inverted with respect to the + inner-loop as this results in slightly higher performance */ + ci = sci * NCL_PER_SUPERCL + tidxi; + ai = ci * CL_SIZE + tidxj; + xqib[tidxi * CL_SIZE + tidxj] = xq[ai] + shift_vec[nb_sci.shift]; + __syncthreads(); + + for(ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++) + { + fci_buf[ci_offset] = make_float3(0.0f); + } + +#ifdef CALC_ENERGIES + E_lj = 0.0f; + E_el = 0.0f; + +#if defined EL_EWALD || defined EL_RF + if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*NCL_PER_SUPERCL) + { + /* we have the diagonal: add the charge self interaction energy term */ + for (i = 0; i < NCL_PER_SUPERCL; i++) + { + qi = xqib[i * CL_SIZE + tidxi].w; + E_el += qi*qi; + } + /* divide the self term equally over the j-threads */ + E_el /= CL_SIZE; +#ifdef EL_RF + E_el *= -nbparam.epsfac*0.5f*c_rf; +#else + E_el *= -nbparam.epsfac*beta*0.56418958f; /* last factor 1/sqrt(pi) */ +#endif + } +#endif +#endif + + /* skip central shifts when summing shift forces */ + if (nb_sci.shift == CENTRAL) + { + bCalcFshift = false; + } + + fshift_buf = make_float3(0.0f); + + /* loop over the j clusters = seen by any of the atoms in the current super-cluster */ + for (j4 = cij4_start; j4 < cij4_end; j4++) + { + wexcl_idx = pl_cj4[j4].imei[widx].excl_ind; + imask = pl_cj4[j4].imei[widx].imask; + wexcl = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)]; + +#ifndef PRUNE_NBL + if (imask) +#endif + { +#ifdef PRUNE_NBL + imask_prune = imask; +#endif + + /* nvcc >v4.1 doesn't like this loop, it refuses to unroll it */ +#if CUDA_VERSION >= 4010 + #pragma unroll 4 +#endif + for (jm = 0; jm < 4; jm++) + { + imask_j = (imask >> (jm * 8)) & 255U; + if (imask_j) + { + nsubi = __popc(imask_j); + + cj = pl_cj4[j4].cj[jm]; + aj = cj * CL_SIZE + tidxj; + + /* load j atom data */ + xqbuf = xq[aj]; + xj = make_float3(xqbuf.x, xqbuf.y, xqbuf.z); + qj_f = nbparam.epsfac * xqbuf.w; + typej = atom_types[aj]; + + fcj_buf = make_float3(0.0f); + + /* loop over the i-clusters in sci */ + /* #pragma unroll 8 + -- nvcc doesn't like my code, it refuses to unroll it + which is a pity because here unrolling could help. */ + for (cii = 0; cii < nsubi; cii++) + { + i = __ffs(imask_j) - 1; + imask_j &= ~(1U << i); + + ci_offset = i; /* i force buffer offset */ + + ci = sci * NCL_PER_SUPERCL + i; /* i cluster index */ + ai = ci * CL_SIZE + tidxi; /* i atom index */ + + /* all threads load an atom from i cluster ci into shmem! */ + xqbuf = xqib[i * CL_SIZE + tidxi]; + xi = make_float3(xqbuf.x, xqbuf.y, xqbuf.z); + + /* distance between i and j atoms */ + rv = xi - xj; + r2 = norm2(rv); + +#ifdef PRUNE_NBL + /* If _none_ of the atoms pairs are in cutoff range, + the bit corresponding to the current + cluster-pair in imask gets set to 0. 
*/ + if (!__any(r2 < rlist_sq)) + { + imask_prune &= ~(1U << (jm * NCL_PER_SUPERCL + i)); + } +#endif + + int_bit = ((wexcl >> (jm * NCL_PER_SUPERCL + i)) & 1); + + /* cutoff & exclusion check */ +#if defined EL_EWALD || defined EL_RF + if (r2 < rcoulomb_sq * + (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi)) +#else + if (r2 < rcoulomb_sq * int_bit) +#endif + { + /* load the rest of the i-atom parameters */ + qi = xqbuf.w; + typei = atom_types[ai]; + + /* LJ 6*C6 and 12*C12 */ + c6 = tex1Dfetch(tex_nbfp, 2 * (ntypes * typei + typej)); + c12 = tex1Dfetch(tex_nbfp, 2 * (ntypes * typei + typej) + 1); + + /* avoid NaN for excluded pairs at r=0 */ + r2 += (1.0f - int_bit) * NBNXN_AVOID_SING_R2_INC; + + inv_r = rsqrt(r2); + inv_r2 = inv_r * inv_r; + inv_r6 = inv_r2 * inv_r2 * inv_r2; +#if defined EL_EWALD || defined EL_RF + /* We could mask inv_r2, but with Ewald + * masking both inv_r6 and F_invr is faster */ + inv_r6 *= int_bit; +#endif + + F_invr = inv_r6 * (c12 * inv_r6 - c6) * inv_r2; + +#ifdef CALC_ENERGIES + E_lj += int_bit * (c12 * (inv_r6 * inv_r6 - lj_shift * lj_shift) * 0.08333333f - c6 * (inv_r6 - lj_shift) * 0.16666667f); +#endif + +#ifdef VDW_CUTOFF_CHECK + /* this enables twin-range cut-offs (rvdw < rcoulomb <= rlist) */ + vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f; + F_invr *= vdw_in_range; +#ifdef CALC_ENERGIES + E_lj *= vdw_in_range; +#endif +#endif + +#ifdef EL_CUTOFF + F_invr += qi * qj_f * inv_r2 * inv_r; +#endif +#ifdef EL_RF + F_invr += qi * qj_f * (int_bit*inv_r2 * inv_r - two_k_rf); +#endif +#ifdef EL_EWALD + F_invr += qi * qj_f * (int_bit*inv_r2 - interpolate_coulomb_force_r(r2 * inv_r, coulomb_tab_scale)) * inv_r; +#endif + +#ifdef CALC_ENERGIES +#ifdef EL_CUTOFF + E_el += qi * qj_f * (inv_r - c_rf); +#endif +#ifdef EL_RF + E_el += qi * qj_f * (int_bit*inv_r + 0.5f * two_k_rf * r2 - c_rf); +#endif +#ifdef EL_EWALD + /* 1.0f - erff is faster than erfcf */ + E_el += qi * qj_f * (inv_r * (int_bit - erff(r2 * inv_r * beta)) - int_bit * ewald_shift); +#endif +#endif + f_ij = rv * F_invr; + + /* accumulate j forces in registers */ + fcj_buf -= f_ij; + + /* accumulate i forces in registers */ + fci_buf[ci_offset] += f_ij; + } + } + + /* store j forces in shmem */ + f_buf[ tidx] = fcj_buf.x; + f_buf[ FBUF_STRIDE + tidx] = fcj_buf.y; + f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z; + + /* reduce j forces */ + reduce_force_j_generic(f_buf, f, tidxi, tidxj, aj); + } + } +#ifdef PRUNE_NBL + /* Update the imask with the new one which does not contain the + out of range clusters anymore. 
*/
+            pl_cj4[j4].imei[widx].imask = imask_prune;
+#endif
+        }
+    }
+
+    /* reduce i forces */
+    for(ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+    {
+        ai = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+        f_buf[                  tidx] = fci_buf[ci_offset].x;
+        f_buf[    FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
+        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
+        __syncthreads();
+        reduce_force_i(f_buf, f,
+                       &fshift_buf, bCalcFshift,
+                       tidxi, tidxj, ai);
+        __syncthreads();
+    }
+
+    /* add up local shift forces into global mem */
+    if (bCalcFshift && tidxj == 0)
+    {
+        atomicAdd(&atdat.fshift[nb_sci.shift].x, fshift_buf.x);
+        atomicAdd(&atdat.fshift[nb_sci.shift].y, fshift_buf.y);
+        atomicAdd(&atdat.fshift[nb_sci.shift].z, fshift_buf.z);
+    }
+
+#ifdef CALC_ENERGIES
+    /* flush the energies to shmem and reduce them */
+    f_buf[              tidx] = E_lj;
+    f_buf[FBUF_STRIDE + tidx] = E_el;
+    reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
+#endif
+}
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
new file mode 100644
index 0000000000..5233ddffc3
--- /dev/null
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
@@ -0,0 +1,296 @@
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+#include "../../gmxlib/cuda_tools/vectype_ops.cuh"
+
+#ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
+#define NBNXN_CUDA_KERNEL_UTILS_CUH
+
+#define WARP_SIZE_POW2_EXPONENT (5)
+#define CL_SIZE_POW2_EXPONENT   (3) /* change this together with GPU_NS_CLUSTER_SIZE! */
+#define CL_SIZE_SQ              (CL_SIZE * CL_SIZE)
+#define FBUF_STRIDE             (CL_SIZE_SQ)
+
+/*! Interpolate Ewald coulomb force using the table through the tex_coulomb_tab texture.
+ *  Original idea: OpenMM
+ */
+static inline __device__
+float interpolate_coulomb_force_r(float r, float scale)
+{
+    float normalized = scale * r;
+    int   index      = (int) normalized;
+    float fract2     = normalized - index;
+    float fract1     = 1.0f - fract2;
+
+    return fract1 * tex1Dfetch(tex_coulomb_tab, index)
+           + fract2 * tex1Dfetch(tex_coulomb_tab, index + 1);
+}
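+
+/* For reference, the same lookup written over a host-side copy of the table
+ * (coulomb_tab here is an illustrative array name, not the texture). The
+ * table stores the force at points r = i/scale and the fetch is a linear
+ * interpolation between the two neighboring entries -- a sketch only:
+ *
+ *     float normalized = scale * r;
+ *     int   index      = (int) normalized;
+ *     float fract2     = normalized - (float) index;
+ *
+ *     F = (1.0f - fract2)*coulomb_tab[index] + fract2*coulomb_tab[index + 1];
+ */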
+/*! Final j-force reduction; this generic implementation works with
+ *  arbitrary array sizes.
+ */
+static inline __device__
+void reduce_force_j_generic(float *f_buf, float3 *fout,
+                            int tidxi, int tidxj, int aidx)
+{
+    if (tidxi == 0)
+    {
+        float3 f = make_float3(0.0f);
+        for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
+        {
+            f.x += f_buf[                  j];
+            f.y += f_buf[    FBUF_STRIDE + j];
+            f.z += f_buf[2 * FBUF_STRIDE + j];
+        }
+
+        atomicAdd(&fout[aidx], f);
+    }
+}
+
+/*! Final j-force reduction; this implementation works only with power of two
+ *  array sizes and with sm >= 3.0
+ */
+#if __CUDA_ARCH__ >= 300
+static inline __device__
+void reduce_force_j_warp_shfl(float3 f, float3 *fout,
+                              int tidxi, int aidx)
+{
+    int i;
+
+#pragma unroll 3
+    for (i = 0; i < 3; i++)
+    {
+        f.x += __shfl_down(f.x, 1 << i);
+        f.y += __shfl_down(f.y, 1 << i);
+        f.z += __shfl_down(f.z, 1 << i);
+    }
+
+    /* the first thread of each row writes out the reduced j-force */
+    if (tidxi == 0)
+    {
+        atomicAdd(&fout[aidx], f);
+    }
+}
+#endif
+
+/*! Final i-force reduction; this generic implementation works with
+ *  arbitrary array sizes.
+ */
+static inline __device__
+void reduce_force_i_generic(float *f_buf, float3 *fout,
+                            float3 *fshift_buf, bool bCalcFshift,
+                            int tidxi, int tidxj, int aidx)
+{
+    if (tidxj == 0)
+    {
+        float3 f = make_float3(0.0f);
+        for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
+        {
+            f.x += f_buf[                  j];
+            f.y += f_buf[    FBUF_STRIDE + j];
+            f.z += f_buf[2 * FBUF_STRIDE + j];
+        }
+
+        atomicAdd(&fout[aidx], f);
+
+        if (bCalcFshift)
+        {
+            *fshift_buf += f;
+        }
+    }
+}
+
+/*! Final i-force reduction; this implementation works only with power of two
+ *  array sizes.
+ */
+static inline __device__
+void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
+                         float3 *fshift_buf, bool bCalcFshift,
+                         int tidxi, int tidxj, int aidx)
+{
+    int    i, j;
+    float3 f = make_float3(0.0f);
+
+    /* Reduce the initial CL_SIZE values for each i atom to half
+     * every step by using CL_SIZE * i threads.
+     * Can't just use i as loop variable because then nvcc refuses to unroll.
+     */
+    i = CL_SIZE/2;
+# pragma unroll 5
+    for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
+    {
+        if (tidxj < i)
+        {
+
+            f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
+            f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
+            f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
+        }
+        i >>= 1;
+    }
+
+    /* i == 1, last reduction step, writing to global mem */
+    if (tidxj == 0)
+    {
+        f.x = f_buf[                  tidxj * CL_SIZE + tidxi] + f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
+        f.y = f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
+        f.z = f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
+
+        atomicAdd(&fout[aidx], f);
+
+        if (bCalcFshift)
+        {
+            *fshift_buf += f;
+        }
+    }
+}
+
+/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
+ *  on whether the size of the array to be reduced is power of two or not.
+ */
+static inline __device__
+void reduce_force_i(float *f_buf, float3 *f,
+                    float3 *fshift_buf, bool bCalcFshift,
+                    int tidxi, int tidxj, int ai)
+{
+    if ((CL_SIZE & (CL_SIZE - 1)))
+    {
+        reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
+    }
+    else
+    {
+        reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
+    }
+}
+
+/*! Final i-force reduction; this implementation works only with power of two
+ *  array sizes and with sm >= 3.0
+ */
+#if __CUDA_ARCH__ >= 300
+static inline __device__
+void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
+                              float3 *fshift_buf, bool bCalcFshift,
+                              int tidxj, int aidx)
+{
+    int j;
+
+#pragma unroll 2
+    for (j = 0; j < 2; j++)
+    {
+        fin.x += __shfl_down(fin.x, CL_SIZE << j);
+        fin.y += __shfl_down(fin.y, CL_SIZE << j);
+        fin.z += __shfl_down(fin.z, CL_SIZE << j);
+    }
+
+    /* The first thread in the warp writes the reduced force */
+    if (tidxj == 0 || tidxj == 4)
+    {
+        atomicAdd(&fout[aidx], fin);
+
+        if (bCalcFshift)
+        {
+            fshift_buf->x += fin.x;
+            fshift_buf->y += fin.y;
+            fshift_buf->z += fin.z;
+        }
+    }
+}
+#endif
+
+/*! Energy reduction; this implementation works only with power of two
+ *  array sizes.
+ */
+static inline __device__
+void reduce_energy_pow2(volatile float *buf,
+                        float *e_lj, float *e_el,
+                        unsigned int tidx)
+{
+    int   i, j;
+    float e1, e2;
+
+    i = WARP_SIZE/2;
+
+    /* Can't just use i as loop variable because then nvcc refuses to unroll. */
+# pragma unroll 10
+    for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
+    {
+        if (tidx < i)
+        {
+            buf[              tidx] += buf[              tidx + i];
+            buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
+        }
+        i >>= 1;
+    }
+
+    /* last reduction step, writing to global mem */
+    if (tidx == 0)
+    {
+        e1 = buf[              tidx] + buf[              tidx + i];
+        e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
+
+        atomicAdd(e_lj, e1);
+        atomicAdd(e_el, e2);
+    }
+}
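+
+/* A host-side model of the tree reduction implemented by the pow2 variants
+ * above (a sketch only, assuming n is a power of two): each pass folds the
+ * upper half of the buffer onto the lower half, leaving the total in buf[0].
+ *
+ *     i = n/2;
+ *     while (i > 0)
+ *     {
+ *         for (t = 0; t < i; t++)
+ *         {
+ *             buf[t] += buf[t + i];
+ *         }
+ *         i >>= 1;
+ *     }
+ */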
+
+/*! Energy reduction; this implementation works only with power of two
+ *  array sizes and with sm >= 3.0
+ */
+#if __CUDA_ARCH__ >= 300
+static inline __device__
+void reduce_energy_warp_shfl(float E_lj, float E_el,
+                             float *e_lj, float *e_el,
+                             int tidx)
+{
+    int i, sh;
+
+    sh = 1;
+#pragma unroll 5
+    for (i = 0; i < 5; i++)
+    {
+        E_lj += __shfl_down(E_lj, sh);
+        E_el += __shfl_down(E_el, sh);
+        sh   += sh;
+    }
+
+    /* The first thread in the warp writes the reduced energies */
+    if (tidx == 0 || tidx == WARP_SIZE)
+    {
+        atomicAdd(e_lj, E_lj);
+        atomicAdd(e_el, E_el);
+    }
+}
+#endif /* __CUDA_ARCH__ */
+
+#endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */
diff --git a/src/kernel/gmx_gpu_utils/gmx_gpu_utils.h b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
similarity index 50%
copy from src/kernel/gmx_gpu_utils/gmx_gpu_utils.h
copy to src/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
index 76070804ea..a18f905bbf 100644
--- a/src/kernel/gmx_gpu_utils/gmx_gpu_utils.h
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
@@ -1,56 +1,77 @@
 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
  *
- *
+ *
  * This source code is part of
- *
+ *
  * G R O M A C S
- *
+ *
  * GROningen MAchine for Chemical Simulations
- *
+ *
  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2010, The GROMACS development team,
+ * Copyright (c) 2001-2012, The GROMACS development team,
  * check out http://www.gromacs.org for more information.
-
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version 2
  * of the License, or (at your option) any later version.
- *
+ *
  * If you want to redistribute modifications, please consider that
  * scientific software is very special. Version control is crucial -
  * bugs must be traceable. We will be happy to consider code for
  * inclusion in the official distribution, but derived work must not
  * be called official GROMACS. Details are found in the README & COPYING
  * files - if they are missing, get the official version at www.gromacs.org.
- *
+ *
  * To help us fund GROMACS development, we humbly ask that you cite
  * the papers on the package - you can find them in the top README file.
- *
+ *
  * For more info, check our website at http://www.gromacs.org
- *
+ *
  * And Hey:
  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
  */
-#ifndef _GMX_GPU_UTILS_H_
-#define _GMX_GPU_UTILS_H_
-
-#ifndef __cplusplus
-extern "C" {
-#endif
-
-int do_quick_memtest(int /*dev_id*/);
-
-int do_full_memtest(int /*dev_id*/);
-
-int do_timed_memtest(int /*dev_id*/, int /*time_limit*/);
+/*! \file
+ * This header has the sole purpose of generating kernels for the different
+ * types of electrostatics supported: Cut-off, Reaction-Field, and Ewald/PME;
+ * the latter has a twin-range cut-off version (rcoul!=rvdw) which enables
+ * PME tuning (otherwise in the Verlet scheme rcoul==rvdw).
+ *
+ * (No include fence as it is meant to be included multiple times.)
+ */
-int is_supported_cuda_gpu(int /*dev_id*/, char* /*gpu_name*/);
+/* Cut-Off */
+#define EL_CUTOFF
+#define NB_KERNEL_FUNC_NAME(x,...) x##_cutoff##__VA_ARGS__
+#include "nbnxn_cuda_kernel_legacy.cuh"
+#include "nbnxn_cuda_kernel.cuh"
+#undef EL_CUTOFF
+#undef NB_KERNEL_FUNC_NAME
-#ifndef __cplusplus
-} /* extern "C" */
-#endif
+/* Reaction-Field */
+#define EL_RF
+#define NB_KERNEL_FUNC_NAME(x,...) x##_rf##__VA_ARGS__
+#include "nbnxn_cuda_kernel_legacy.cuh"
+#include "nbnxn_cuda_kernel.cuh"
+#undef EL_RF
+#undef NB_KERNEL_FUNC_NAME
-#endif // _GMX_GPU_UTILS_H_
+/* Ewald */
+#define EL_EWALD
+#define NB_KERNEL_FUNC_NAME(x,...) x##_ewald##__VA_ARGS__
+#include "nbnxn_cuda_kernel_legacy.cuh"
+#include "nbnxn_cuda_kernel.cuh"
+#undef EL_EWALD
+#undef NB_KERNEL_FUNC_NAME
+
+/* Ewald with twin-range cut-off */
+#define EL_EWALD
+#define VDW_CUTOFF_CHECK
+#define NB_KERNEL_FUNC_NAME(x,...) x##_ewald_twin##__VA_ARGS__
+#include "nbnxn_cuda_kernel_legacy.cuh"
+#include "nbnxn_cuda_kernel.cuh"
+#undef EL_EWALD
+#undef VDW_CUTOFF_CHECK
+#undef NB_KERNEL_FUNC_NAME
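+
+/* To make the generation above concrete, a sketch of one expansion (the
+ * preprocessor's view, not additional code): with the Ewald block's
+ *     #define NB_KERNEL_FUNC_NAME(x,...) x##_ewald##__VA_ARGS__
+ * in effect, the declaration
+ *     __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener_prune)(...)
+ * in nbnxn_cuda_kernel.cuh becomes
+ *     __global__ void k_nbnxn_ewald_ener_prune(...)
+ * so each #include compiles the same kernel body under a unique,
+ * electrostatics-specific name.
+ */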
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
new file mode 100644
index 0000000000..63df03d1a6
--- /dev/null
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
@@ -0,0 +1,187 @@
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+
+#ifndef NBNXN_CUDA_TYPES_H
+#define NBNXN_CUDA_TYPES_H
+
+#include "types/nbnxn_pairlist.h"
+#include "types/nbnxn_cuda_types_ext.h"
+#include "../../gmxlib/cuda_tools/cudautils.cuh"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! Types of electrostatics available in the CUDA nonbonded force kernels. */
+enum { eelCuEWALD, eelCuEWALD_TWIN, eelCuRF, eelCuCUT, eelCuNR };
+
+/*! Non-bonded kernel versions. */
+enum { eNbnxnCuKDefault, eNbnxnCuKLegacy, eNbnxnCuKOld, eNbnxnCuKNR };
+
+#define NBNXN_KVER_OLD(k)     (k == eNbnxnCuKOld)
+#define NBNXN_KVER_LEGACY(k)  (k == eNbnxnCuKLegacy)
+#define NBNXN_KVER_DEFAULT(k) (k == eNbnxnCuKDefault)
+
+/* All structs prefixed with "cu_" hold data used in GPU calculations and
+ * are passed to the kernels, except cu_timers_t. */
+typedef struct cu_plist    cu_plist_t;
+typedef struct cu_atomdata cu_atomdata_t;
+typedef struct cu_nbparam  cu_nbparam_t;
+typedef struct cu_timers   cu_timers_t;
+typedef struct nb_staging  nb_staging_t;
+
+
+/*! Staging area for temporary data. The energies get downloaded here first,
+ *  before getting added to the CPU-side aggregate values.
+ */
+struct nb_staging
+{
+    float  *e_lj;    /* LJ energy            */
+    float  *e_el;    /* electrostatic energy */
+    float3 *fshift;  /* shift forces         */
+};
+
+/*! Nonbonded atom data -- both inputs and outputs. */
+struct cu_atomdata
+{
+    int    natoms;            /* number of atoms                            */
+    int    natoms_local;      /* number of local atoms                      */
+    int    nalloc;            /* allocation size for the atom data (xq, f)  */
+
+    float4 *xq;               /* atom coordinates + charges, size natoms    */
+    float3 *f;                /* force output array, size natoms            */
+    /* TODO: try float2 for the energies */
+    float  *e_lj,             /* LJ energy output, size 1                   */
+           *e_el;             /* Electrostatics energy output, size 1       */
+
+    float3 *fshift;           /* shift forces                               */
+
+    int    ntypes;            /* number of atom types                       */
+    int    *atom_types;       /* atom type indices, size natoms             */
+
+    float3 *shift_vec;        /* shifts                                     */
+    bool   bShiftVecUploaded; /* true if the shift vector has been uploaded */
+};
+
+/*! Parameters required for the CUDA nonbonded calculations. */
+struct cu_nbparam
+{
+    int   eeltype;        /* type of electrostatics                      */
+
+    float epsfac;         /* charge multiplication factor                */
+    float c_rf, two_k_rf; /* Reaction-Field constants                    */
+    float ewald_beta;     /* Ewald/PME parameter                         */
+    float sh_ewald;       /* Ewald/PME correction term                   */
+    float rvdw_sq;        /* VdW cut-off                                 */
+    float rcoulomb_sq;    /* Coulomb cut-off                             */
+    float rlist_sq;       /* pair-list cut-off                           */
+    float sh_invrc6;      /* LJ potential correction term                */
+
+    float *nbfp;          /* nonbonded parameter table with C6/C12 pairs */
+
+    /* Ewald Coulomb force table */
+    int   coulomb_tab_size;
+    float coulomb_tab_scale;
+    float *coulomb_tab;
+};
+
+/*! Pair list data */
+struct cu_plist
+{
+    int          na_c;        /* number of atoms per cluster                */
+
+    int          nsci;        /* size of sci, # of i clusters in the list   */
+    int          sci_nalloc;  /* allocation size of sci                     */
+    nbnxn_sci_t  *sci;        /* list of i-cluster ("super-clusters")       */
+
+    int          ncj4;        /* total # of 4*j clusters                    */
+    int          cj4_nalloc;  /* allocation size of cj4                     */
+    nbnxn_cj4_t  *cj4;        /* 4*j cluster list, contains j cluster number
+                                 and index into the i cluster list          */
+    nbnxn_excl_t *excl;       /* atom interaction bits                      */
+    int          nexcl;       /* count for excl                             */
+    int          excl_nalloc; /* allocation size of excl                    */
+
+    bool         bDoPrune;    /* true if pair-list pruning needs to be
+                                 done during the current step               */
+};
+
+/* CUDA events used for timing GPU kernels and H2D/D2H transfers.
+ * The arrays of size two hold the local and non-local values and should
+ * always be indexed with eintLocal/eintNonlocal.
+ */
+struct cu_timers
+{
+    cudaEvent_t start_atdat, stop_atdat;         /* atom data transfer (every PS step)     */
+    cudaEvent_t start_nb_h2d[2], stop_nb_h2d[2]; /* x/q H2D transfer (every step)          */
+    cudaEvent_t start_nb_d2h[2], stop_nb_d2h[2]; /* f D2H transfer (every step)            */
+    cudaEvent_t start_pl_h2d[2], stop_pl_h2d[2]; /* pair-list H2D transfer (every PS step) */
+    cudaEvent_t start_nb_k[2], stop_nb_k[2];     /* non-bonded kernels (every step)        */
+};
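+
+/* A sketch (not part of the data structures) of how one of these event pairs
+ * is meant to bracket work in a stream; t, stream and iloc are illustrative
+ * names, the real calls live in nbnxn_cuda.cu:
+ *
+ *     cudaEventRecord(t->start_nb_k[iloc], stream[iloc]);
+ *     ... launch the non-bonded kernel in stream[iloc] ...
+ *     cudaEventRecord(t->stop_nb_k[iloc], stream[iloc]);
+ *     ...
+ *     float ms;
+ *     cudaEventElapsedTime(&ms, t->start_nb_k[iloc], t->stop_nb_k[iloc]);
+ */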
+/* Main data structure for CUDA nonbonded force calculations. */
+struct nbnxn_cuda
+{
+    cuda_dev_info_t *dev_info;       /* CUDA device information                           */
+    int             kernel_ver;      /* The version of the kernel to be executed on the
+                                        device in use, possible values: eNbnxnCuK*        */
+    bool            bUseTwoStreams;  /* true if doing both local/non-local NB work on GPU */
+    bool            bUseStreamSync;  /* true if the standard cudaStreamSynchronize is used
+                                        and not memory polling-based waiting              */
+    cu_atomdata_t   *atdat;          /* atom data                                         */
+    cu_nbparam_t    *nbparam;        /* parameters required for the non-bonded calc.      */
+    cu_plist_t      *plist[2];       /* pair-list data structures (local and non-local)   */
+    nb_staging_t    nbst;            /* staging area where fshift/energies get downloaded */
+
+    cudaStream_t    stream[2];       /* local and non-local GPU streams                   */
+
+    /* events used for synchronization */
+    cudaEvent_t     nonlocal_done, misc_ops_done;
+
+    /* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple
+     * concurrent streams, so we won't time if both l/nl work is done on GPUs.
+     * Timer init/uninit is still done even with timing off so only the condition
+     * setting bDoTime needs to be changed if this CUDA "feature" gets fixed. */
+    bool            bDoTime;   /* True if event-based timing is enabled. */
+    cu_timers_t     *timers;   /* CUDA event-based timers.               */
+    wallclock_gpu_t *timings;  /* Timing data.                           */
+};
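+
+/* The two streams let local and non-local non-bonded work be queued
+ * independently, and an event lets one stream wait on the other without
+ * blocking the host. A sketch of the pattern (illustrative only; the real
+ * scheduling lives in nbnxn_cuda.cu, and nb is a pointer to this struct):
+ *
+ *     cudaEventRecord(nb->nonlocal_done, nb->stream[1]);
+ *     cudaStreamWaitEvent(nb->stream[0], nb->nonlocal_done, 0);
+ */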
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* NBNXN_CUDA_TYPES_H */
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_common.c b/src/mdlib/nbnxn_kernels/nbnxn_kernel_common.c
new file mode 100644
index 0000000000..8bdcfb4aa6
--- /dev/null
+++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_common.c
@@ -0,0 +1,59 @@
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2009, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ * As a special exception, you may use this file as part of a free software
+ * library without restriction. Specifically, if other files instantiate
+ * templates or use macros or inline functions from this file, or you compile
+ * this file and link it with other files to produce an executable, this
+ * file does not by itself cause the resulting executable to be covered by
+ * the GNU Lesser General Public License.
+ *
+ * In plain-speak: do not worry about classes/macros/templates either - only
+ * changes to the library have to be LGPL, not an application linking with it.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website!
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "nbnxn_kernel_common.h"
+
+void
+clear_f(const nbnxn_atomdata_t *nbat, real *f)
+{
+    int i;
+
+    for(i=0; i<nbat->natoms*nbat->fstride; i++)
+    {
+        f[i] = 0;
+    }
+}
+
+void
+clear_fshift(real *fshift)
+{
+    int i;
+
+    for(i=0; i<SHIFTS*DIM; i++)
+    {
+        fshift[i] = 0;
+    }
+}
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.c b/src/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.c
new file mode 100644
--- /dev/null
+++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.c
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2009, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ * As a special exception, you may use this file as part of a free software
+ * library without restriction. Specifically, if other files instantiate
+ * templates or use macros or inline functions from this file, or you compile
+ * this file and link it with other files to produce an executable, this
+ * file does not by itself cause the resulting executable to be covered by
+ * the GNU Lesser General Public License.
+ *
+ * In plain-speak: do not worry about classes/macros/templates either - only
+ * changes to the library have to be LGPL, not an application linking with it.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website!
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "types/simple.h"
+#include "maths.h"
+#include "vec.h"
+#include "typedefs.h"
+#include "force.h"
+#include "nbnxn_kernel_gpu_ref.h"
+#include "../nbnxn_consts.h"
+#include "nbnxn_kernel_common.h"
+
+#define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
+#define CL_SIZE         (NBNXN_GPU_CLUSTER_SIZE)
+
+void
+nbnxn_kernel_gpu_ref(const nbnxn_pairlist_t    *nbl,
+                     const nbnxn_atomdata_t    *nbat,
+                     const interaction_const_t *iconst,
+                     rvec                      *shift_vec,
+                     int                       force_flags,
+                     int                       clearF,
+                     real                      *f,
+                     real                      *fshift,
+                     real                      *Vc,
+                     real                      *Vvdw)
+{
+    const nbnxn_sci_t *nbln;
+    const real *x;
+    gmx_bool   bEner;
+    gmx_bool   bEwald;
+    const real *Ftab=NULL;
+    real       rcut2,rvdw2,rlist2;
+    int        ntype;
+    real       facel;
+    int        n;
+    int        ish3;
+    int        sci;
+    int        cj4_ind0,cj4_ind1,cj4_ind;
+    int        ci,cj;
+    int        ic,jc,ia,ja,is,ifs,js,jfs,im,jm;
+    int        n0;
+    int        ggid;
+    real       shX,shY,shZ;
+    real       fscal,tx,ty,tz;
+    real       rinvsq;
+    real       iq;
+    real       qq,vcoul=0,krsq,vctot;
+    int        nti;
+    int        tj;
+    real       rt,r,eps;
+    real       rinvsix;
+    real       Vvdwtot;
+    real       Vvdw_rep,Vvdw_disp;
+    real       ix,iy,iz,fix,fiy,fiz;
+    real       jx,jy,jz;
+    real       dx,dy,dz,rsq,rinv;
+    int        int_bit;
+    real       fexcl;
+    real       c6,c12,cexp1,cexp2,br;
+    const real *shiftvec;
+    real       *vdwparam;
+    int        *shift;
+    int        *type;
+    const nbnxn_excl_t *excl[2];
+
+    int        npair_tot,npair;
+    int        nhwu,nhwu_pruned;
+
+    if (nbl->na_ci != CL_SIZE)
+    {
+        gmx_fatal(FARGS,"The neighborlist cluster size in the GPU reference kernel is %d, expected it to be %d",nbl->na_ci,CL_SIZE);
+    }
+
+    if (clearF == enbvClearFYes)
+    {
+        clear_f(nbat, f);
+    }
+
+    bEner = (force_flags & GMX_FORCE_ENERGY);
+
+    bEwald = EEL_FULL(iconst->eeltype);
+    if (bEwald)
+    {
+        Ftab = iconst->tabq_coul_F;
+    }
+
+    rcut2 = iconst->rcoulomb*iconst->rcoulomb;
+    rvdw2 = iconst->rvdw*iconst->rvdw;
+
+    rlist2 = nbl->rlist*nbl->rlist;
+
+    type     = nbat->type;
+    facel    = iconst->epsfac;
+    shiftvec = shift_vec[0];
+    vdwparam = nbat->nbfp;
+    ntype    = nbat->ntype;
+
+    x = nbat->x;
+
+    npair_tot   = 0;
+    nhwu        = 0;
+    nhwu_pruned = 0;
+
+    for(n=0; n<nbl->nsci; n++)
+    {
+        nbln = &nbl->sci[n];
+
+        ish3     = 3*nbln->shift;
+        shX      = shiftvec[ish3];
+        shY      = shiftvec[ish3+1];
+        shZ      = shiftvec[ish3+2];
+        cj4_ind0 = nbln->cj4_ind_start;
+        cj4_ind1 = nbln->cj4_ind_end;
+        sci      = nbln->sci;
+        vctot    = 0;
+        Vvdwtot  = 0;
+
+        if (nbln->shift == CENTRAL &&
+            nbl->cj4[cj4_ind0].cj[0] == sci*NCL_PER_SUPERCL)
+        {
+            /* we have the diagonal:
+             * add the charge self interaction energy term
+             */
+            for(im=0; im<NCL_PER_SUPERCL; im++)
+            {
+                ci = sci*NCL_PER_SUPERCL + im;
+                for(ic=0; ic<CL_SIZE; ic++)
+                {
+                    ia     = ci*CL_SIZE + ic;
+                    iq     = x[ia*nbat->xstride+3];
+                    vctot += iq*iq;
+                }
+            }
+            if (!bEwald)
+            {
+                vctot *= -facel*0.5*iconst->c_rf;
+            }
+            else
+            {
+                /* last factor 1/sqrt(pi) */
+                vctot *= -facel*iconst->ewaldcoeff*0.564189583548;
+            }
+        }
+
+        for(cj4_ind=cj4_ind0; (cj4_ind<cj4_ind1); cj4_ind++)
+        {
+            excl[0] = &nbl->excl[nbl->cj4[cj4_ind].imei[0].excl_ind];
+            excl[1] = &nbl->excl[nbl->cj4[cj4_ind].imei[1].excl_ind];
+
+            for(jm=0; jm<4; jm++)
+            {
+                cj = nbl->cj4[cj4_ind].cj[jm];
+
+                for(im=0; im<NCL_PER_SUPERCL; im++)
+                {
+                    if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm*NCL_PER_SUPERCL+im)) & 1)
+                    {
+                        gmx_bool within_rlist;
+
+                        ci = sci*NCL_PER_SUPERCL + im;
+
+                        within_rlist = FALSE;
+                        npair        = 0;
+                        for(ic=0; ic<CL_SIZE; ic++)
+                        {
+                            ia  = ci*CL_SIZE + ic;
+                            is  = ia*nbat->xstride;
+                            ifs = ia*nbat->fstride;
+                            ix  = shX + x[is+0];
+                            iy  = shY + x[is+1];
+                            iz  = shZ + x[is+2];
+                            iq  = facel*x[is+3];
+                            nti = ntype*2*type[ia];
+
+                            fix = 0;
+                            fiy = 0;
+                            fiz = 0;
+
+                            for(jc=0; jc<CL_SIZE; jc++)
+                            {
+                                ja = cj*CL_SIZE + jc;
+
+                                if (nbln->shift
== CENTRAL && + ci == cj && ja <= ia) + { + continue; + } + + int_bit = ((excl[jc>>2]->pair[(jc & 3)*CL_SIZE+ic] >> (jm*NCL_PER_SUPERCL+im)) & 1); + + js = ja*nbat->xstride; + jfs = ja*nbat->fstride; + jx = x[js+0]; + jy = x[js+1]; + jz = x[js+2]; + dx = ix - jx; + dy = iy - jy; + dz = iz - jz; + rsq = dx*dx + dy*dy + dz*dz; + if (rsq < rlist2) + { + within_rlist = TRUE; + } + if (rsq >= rcut2) + { + continue; + } + + if (type[ia] != ntype-1 && type[ja] != ntype-1) + { + npair++; + } + + /* avoid NaN for excluded pairs at r=0 */ + rsq += (1.0 - int_bit)*NBNXN_AVOID_SING_R2_INC; + + rinv = gmx_invsqrt(rsq); + rinvsq = rinv*rinv; + fscal = 0; + + qq = iq*x[js+3]; + if (!bEwald) + { + /* Reaction-field */ + krsq = iconst->k_rf*rsq; + fscal = qq*(int_bit*rinv - 2*krsq)*rinvsq; + if (bEner) + { + vcoul = qq*(int_bit*rinv + krsq - iconst->c_rf); + } + } + else + { + r = rsq*rinv; + rt = r*iconst->tabq_scale; + n0 = rt; + eps = rt - n0; + + fexcl = (1 - eps)*Ftab[n0] + eps*Ftab[n0+1]; + + fscal = qq*(int_bit*rinvsq - fexcl)*rinv; + + if (bEner) + { + vcoul = qq*((int_bit - gmx_erf(iconst->ewaldcoeff*r))*rinv - int_bit*iconst->sh_ewald); + } + } + + if (rsq < rvdw2) + { + tj = nti + 2*type[ja]; + + /* Vanilla Lennard-Jones cutoff */ + c6 = vdwparam[tj]; + c12 = vdwparam[tj+1]; + + rinvsix = int_bit*rinvsq*rinvsq*rinvsq; + Vvdw_disp = c6*rinvsix; + Vvdw_rep = c12*rinvsix*rinvsix; + fscal += (Vvdw_rep - Vvdw_disp)*rinvsq; + + if (bEner) + { + vctot += vcoul; + + Vvdwtot += + (Vvdw_rep - int_bit*c12*iconst->sh_invrc6*iconst->sh_invrc6)/12 - + (Vvdw_disp - int_bit*c6*iconst->sh_invrc6)/6; + } + } + + tx = fscal*dx; + ty = fscal*dy; + tz = fscal*dz; + fix = fix + tx; + fiy = fiy + ty; + fiz = fiz + tz; + f[jfs+0] -= tx; + f[jfs+1] -= ty; + f[jfs+2] -= tz; + } + + f[ifs+0] += fix; + f[ifs+1] += fiy; + f[ifs+2] += fiz; + fshift[ish3] = fshift[ish3] + fix; + fshift[ish3+1] = fshift[ish3+1] + fiy; + fshift[ish3+2] = fshift[ish3+2] + fiz; + + /* Count in half work-units. + * In CUDA one work-unit is 2 warps. 
+ */ + if ((ic+1) % (CL_SIZE/2) == 0) + { + npair_tot += npair; + + nhwu++; + if (within_rlist) + { + nhwu_pruned++; + } + + within_rlist = FALSE; + npair = 0; + } + } + } + } + } + } + + if (bEner) + { + ggid = 0; + Vc[ggid] = Vc[ggid] + vctot; + Vvdw[ggid] = Vvdw[ggid] + Vvdwtot; + } + } + + if (debug) + { + fprintf(debug,"number of half %dx%d atom pairs: %d after pruning: %d fraction %4.2f\n", + nbl->na_ci,nbl->na_ci, + nhwu,nhwu_pruned,nhwu_pruned/(double)nhwu); + fprintf(debug,"generic kernel pair interactions: %d\n", + nhwu*nbl->na_ci/2*nbl->na_ci); + fprintf(debug,"generic kernel post-prune pair interactions: %d\n", + nhwu_pruned*nbl->na_ci/2*nbl->na_ci); + fprintf(debug,"generic kernel non-zero pair interactions: %d\n", + npair_tot); + fprintf(debug,"ratio non-zero/post-prune pair interactions: %4.2f\n", + npair_tot/(double)(nhwu_pruned*nbl->na_ci/2*nbl->na_ci)); + } +} diff --git a/src/kernel/gmx_gpu_utils/gmx_gpu_utils.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h similarity index 61% copy from src/kernel/gmx_gpu_utils/gmx_gpu_utils.h copy to src/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h index 76070804ea..0ac60cd8b3 100644 --- a/src/kernel/gmx_gpu_utils/gmx_gpu_utils.h +++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h @@ -1,15 +1,16 @@ /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- * - * + * * This source code is part of * * G R O M A C S * * GROningen MAchine for Chemical Simulations * + * VERSION 3.2.0 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. * Copyright (c) 1991-2000, University of Groningen, The Netherlands. - * Copyright (c) 2001-2010, The GROMACS development team, + * Copyright (c) 2001-2004, The GROMACS development team, * check out http://www.gromacs.org for more information. * This program is free software; you can redistribute it and/or @@ -28,29 +29,32 @@ * the papers on the package - you can find them in the top README file. * * For more info, check our website at http://www.gromacs.org - * - * And Hey: - * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon */ -#ifndef _GMX_GPU_UTILS_H_ -#define _GMX_GPU_UTILS_H_ +#ifndef _nbnxn_kernel_gpu_ref_h +#define _nbnxn_kernel_gpu_ref_h -#ifndef __cplusplus +#include "typedefs.h" + +#ifdef __cplusplus extern "C" { #endif -int do_quick_memtest(int /*dev_id*/); - -int do_full_memtest(int /*dev_id*/); - -int do_timed_memtest(int /*dev_id*/, int /*time_limit*/); - -int is_supported_cuda_gpu(int /*dev_id*/, char* /*gpu_name*/); - -#ifndef __cplusplus -} /* extern "C" */ +/* Reference (slow) kernel for nb n vs n GPU type pair lists */ +void +nbnxn_kernel_gpu_ref(const nbnxn_pairlist_t *nbl, + const nbnxn_atomdata_t *nbat, + const interaction_const_t *iconst, + rvec *shift_vec, + int force_flags, + int clearF, + real * f, + real * fshift, + real * Vc, + real * Vvdw); + +#ifdef __cplusplus +} #endif -#endif // _GMX_GPU_UTILS_H_ - +#endif diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c b/src/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c new file mode 100644 index 0000000000..c1f9e2e40a --- /dev/null +++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c @@ -0,0 +1,265 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. 
+ * Copyright (c) 2001-2009, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ * As a special exception, you may use this file as part of a free software
+ * library without restriction. Specifically, if other files instantiate
+ * templates or use macros or inline functions from this file, or you compile
+ * this file and link it with other files to produce an executable, this
+ * file does not by itself cause the resulting executable to be covered by
+ * the GNU Lesser General Public License.
+ *
+ * In plain-speak: do not worry about classes/macros/templates either - only
+ * changes to the library have to be LGPL, not an application linking with it.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website!
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "typedefs.h"
+#include "vec.h"
+#include "smalloc.h"
+#include "force.h"
+#include "gmx_omp_nthreads.h"
+#include "nbnxn_kernel_ref.h"
+#include "../nbnxn_consts.h"
+#include "nbnxn_kernel_common.h"
+
+/* Analytical reaction-field kernels */
+#define CALC_COUL_RF
+
+/* Include the force+energy kernels */
+#define CALC_ENERGIES
+#include "nbnxn_kernel_ref_outer.h"
+#undef CALC_ENERGIES
+
+/* Include the force+energygroups kernels */
+#define CALC_ENERGIES
+#define ENERGY_GROUPS
+#include "nbnxn_kernel_ref_outer.h"
+#undef ENERGY_GROUPS
+#undef CALC_ENERGIES
+
+/* Include the force only kernels */
+#include "nbnxn_kernel_ref_outer.h"
+
+#undef CALC_COUL_RF
+
+
+/* Tabulated exclusion interaction electrostatics kernels */
+#define CALC_COUL_TAB
+
+/* Include the force+energy kernels */
+#define CALC_ENERGIES
+#include "nbnxn_kernel_ref_outer.h"
+#undef CALC_ENERGIES
+
+/* Include the force+energygroups kernels */
+#define CALC_ENERGIES
+#define ENERGY_GROUPS
+#include "nbnxn_kernel_ref_outer.h"
+#undef ENERGY_GROUPS
+#undef CALC_ENERGIES
+
+/* Include the force only kernels */
+#include "nbnxn_kernel_ref_outer.h"
+
+/* Twin-range cut-off kernels */
+#define VDW_CUTOFF_CHECK
+
+/* Include the force+energy kernels */
+#define CALC_ENERGIES
+#include "nbnxn_kernel_ref_outer.h"
+#undef CALC_ENERGIES
+
+/* Include the force+energygroups kernels */
+#define CALC_ENERGIES
+#define ENERGY_GROUPS
+#include "nbnxn_kernel_ref_outer.h"
+#undef ENERGY_GROUPS
+#undef CALC_ENERGIES
+
+/* Include the force only kernels */
+#include "nbnxn_kernel_ref_outer.h"
+
+#undef VDW_CUTOFF_CHECK
+
+#undef CALC_COUL_TAB
+
+
+typedef void (*p_nbk_func_ener)(const nbnxn_pairlist_t    *nbl,
+                                const nbnxn_atomdata_t    *nbat,
+                                const interaction_const_t *ic,
+                                rvec                      *shift_vec,
+                                real                      *f,
+                                real                      *fshift,
+                                real                      *Vvdw,
+                                real                      *Vc);
+
+typedef void (*p_nbk_func_noener)(const nbnxn_pairlist_t    *nbl,
+                                  const nbnxn_atomdata_t    *nbat,
+                                  const interaction_const_t *ic,
+                                  rvec                      *shift_vec,
+                                  real                      *f,
+                                  real                      *fshift);
+
+enum { coultRF, coultTAB, coultTAB_TWIN, coultNR };
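+
+/* The three tables below map a coult* value to the matching generated
+ * kernel flavor. As a usage sketch (mirroring the dispatch further down in
+ * this file), a force+energy call without energy groups amounts to:
+ *
+ *     p_nbk_c_ener[coult](nbl[nb], nbat, ic, shift_vec,
+ *                         out->f, fshift_p, out->Vvdw, out->Vc);
+ */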
+
+p_nbk_func_ener p_nbk_c_ener[coultNR] =
+{ nbnxn_kernel_ref_rf_ener,
+  nbnxn_kernel_ref_tab_ener,
+  nbnxn_kernel_ref_tab_twin_ener };
+
+p_nbk_func_ener p_nbk_c_energrp[coultNR] =
+{ nbnxn_kernel_ref_rf_energrp,
+  nbnxn_kernel_ref_tab_energrp,
+  nbnxn_kernel_ref_tab_twin_energrp };
+
+p_nbk_func_noener p_nbk_c_noener[coultNR] =
+{ nbnxn_kernel_ref_rf_noener,
+  nbnxn_kernel_ref_tab_noener,
+  nbnxn_kernel_ref_tab_twin_noener };
+
+void
+nbnxn_kernel_ref(const nbnxn_pairlist_set_t *nbl_list,
+                 const nbnxn_atomdata_t     *nbat,
+                 const interaction_const_t  *ic,
+                 rvec                       *shift_vec,
+                 int                        force_flags,
+                 int                        clearF,
+                 real                       *fshift,
+                 real                       *Vc,
+                 real                       *Vvdw)
+{
+    int              nnbl;
+    nbnxn_pairlist_t **nbl;
+    int              coult;
+    int              nb;
+
+    nnbl = nbl_list->nnbl;
+    nbl  = nbl_list->nbl;
+
+    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
+    {
+        coult = coultRF;
+    }
+    else
+    {
+        if (ic->rcoulomb == ic->rvdw)
+        {
+            coult = coultTAB;
+        }
+        else
+        {
+            coult = coultTAB_TWIN;
+        }
+    }
+
+#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
+    for(nb=0; nb<nnbl; nb++)
+    {
+        nbnxn_atomdata_output_t *out;
+        real                    *fshift_p;
+
+        out = &nbat->out[nb];
+
+        if (clearF == enbvClearFYes)
+        {
+            clear_f(nbat, out->f);
+        }
+
+        if ((force_flags & GMX_FORCE_VIRIAL) && nnbl == 1)
+        {
+            fshift_p = fshift;
+        }
+        else
+        {
+            fshift_p = out->fshift;
+
+            if (clearF == enbvClearFYes)
+            {
+                clear_fshift(fshift_p);
+            }
+        }
+
+        if (!(force_flags & GMX_FORCE_ENERGY))
+        {
+            /* Don't calculate energies */
+            p_nbk_c_noener[coult](nbl[nb], nbat,
+                                  ic,
+                                  shift_vec,
+                                  out->f,
+                                  fshift_p);
+        }
+        else if (out->nV == 1)
+        {
+            /* No energy groups */
+            out->Vvdw[0] = 0;
+            out->Vc[0]   = 0;
+
+            p_nbk_c_ener[coult](nbl[nb], nbat,
+                                ic,
+                                shift_vec,
+                                out->f,
+                                fshift_p,
+                                out->Vvdw,
+                                out->Vc);
+        }
+        else
+        {
+            /* Calculate energy group contributions */
+            int i;
+
+            for(i=0; i<out->nV; i++)
+            {
+                out->Vvdw[i] = 0;
+            }
+            for(i=0; i<out->nV; i++)
+            {
+                out->Vc[i] = 0;
+            }
+
+            p_nbk_c_energrp[coult](nbl[nb], nbat,
+                                   ic,
+                                   shift_vec,
+                                   out->f,
+                                   fshift_p,
+                                   out->Vvdw,
+                                   out->Vc);
+        }
+    }
+
+    if (force_flags & GMX_FORCE_ENERGY)
+    {
+        /* Reduce the energies */
+        for(nb=0; nb<nnbl; nb++)
+        {
+            int i;
+
+            for(i=0; i<nbat->out[nb].nV; i++)
+            {
+                Vvdw[i] += nbat->out[nb].Vvdw[i];
+                Vc[i]   += nbat->out[nb].Vc[i];
+            }
+        }
+    }
+}
diff --git a/src/kernel/gmx_gpu_utils/gmx_gpu_utils.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h
similarity index 64%
copy from src/kernel/gmx_gpu_utils/gmx_gpu_utils.h
copy to src/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h
index 76070804ea..ab6a7ddd66 100644
--- a/src/kernel/gmx_gpu_utils/gmx_gpu_utils.h
+++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h
@@ -1,15 +1,16 @@
 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
  *
- *
+ *
  * This source code is part of
  *
  * G R O M A C S
  *
  * GROningen MAchine for Chemical Simulations
  *
+ * VERSION 3.2.0
  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2010, The GROMACS development team,
+ * Copyright (c) 2001-2004, The GROMACS development team,
  * check out http://www.gromacs.org for more information.
 
 * This program is free software; you can redistribute it and/or
@@ -28,29 +29,31 @@
 * the papers on the package - you can find them in the top README file.
* * For more info, check our website at http://www.gromacs.org - * - * And Hey: - * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon */ -#ifndef _GMX_GPU_UTILS_H_ -#define _GMX_GPU_UTILS_H_ +#ifndef _nbnxn_kernel_ref_h +#define _nbnxn_kernel_ref_h -#ifndef __cplusplus +#include "typedefs.h" + +#ifdef __cplusplus extern "C" { #endif -int do_quick_memtest(int /*dev_id*/); - -int do_full_memtest(int /*dev_id*/); - -int do_timed_memtest(int /*dev_id*/, int /*time_limit*/); - -int is_supported_cuda_gpu(int /*dev_id*/, char* /*gpu_name*/); - -#ifndef __cplusplus -} /* extern "C" */ +/* Wrapper call for the non-bonded n vs n reference kernels */ +void +nbnxn_kernel_ref(const nbnxn_pairlist_set_t *nbl_list, + const nbnxn_atomdata_t *nbat, + const interaction_const_t *ic, + rvec *shift_vec, + int force_flags, + int clearF, + real *fshift, + real *Vc, + real *Vvdw); + +#ifdef __cplusplus +} #endif -#endif // _GMX_GPU_UTILS_H_ - +#endif diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_ref_inner.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_ref_inner.h new file mode 100644 index 0000000000..534c07861e --- /dev/null +++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_ref_inner.h @@ -0,0 +1,274 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2009, The GROMACS Development Team + * + * Gromacs is a library for molecular simulation and trajectory analysis, + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for + * a full list of developers and information, check out http://www.gromacs.org + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) any + * later version. + * As a special exception, you may use this file as part of a free software + * library without restriction. Specifically, if other files instantiate + * templates or use macros or inline functions from this file, or you compile + * this file and link it with other files to produce an executable, this + * file does not by itself cause the resulting executable to be covered by + * the GNU Lesser General Public License. + * + * In plain-speak: do not worry about classes/macros/templates either - only + * changes to the library have to be LGPL, not an application linking with it. + * + * To help fund GROMACS development, we humbly ask that you cite + * the papers people have written on it - you can find them on the website! + */ + +/* When calculating RF or Ewald interactions we calculate the electrostatic + * forces and energies on excluded atom pairs here in the non-bonded loops. 
+ */
+#if defined CHECK_EXCLS && defined CALC_COULOMB
+#define EXCL_FORCES
+#endif
+
+        {
+            int cj;
+#ifdef ENERGY_GROUPS
+            int egp_cj;
+#endif
+            int i;
+
+            cj = l_cj[cjind].cj;
+
+#ifdef ENERGY_GROUPS
+            egp_cj = nbat->energrp[cj];
+#endif
+            for(i=0; i<UNROLLI; i++)
+            {
+                int ai;
+                int type_i_off;
+                int j;
+
+                ai = ci*UNROLLI + i;
+
+                type_i_off = type[ai]*ntype2;
+
+                for(j=0; j<UNROLLJ; j++)
+                {
+                    int  aj;
+                    real dx,dy,dz;
+                    real rsq,rinv;
+                    real rinvsq,rinvsix;
+                    real c6,c12;
+                    real FrLJ6=0,FrLJ12=0,VLJ=0;
+#ifdef CALC_COULOMB
+                    real qq;
+                    real fcoul;
+#ifdef CALC_COUL_TAB
+                    real rs,frac;
+                    int  ri;
+                    real fexcl;
+#endif
+#ifdef CALC_ENERGIES
+                    real vcoul;
+#endif
+#endif
+                    real fscal;
+                    real fx,fy,fz;
+
+                    /* A multiply mask used to zero an interaction
+                     * when the distance is beyond the cut-off.
+                     */
+                    real skipmask;
+#ifdef VDW_CUTOFF_CHECK
+                    real skipmask_rvdw;
+#endif
+
+#ifdef CHECK_EXCLS
+                    /* A multiply mask used to zero an excluded interaction */
+                    real interact;
+
+                    interact = ((l_cj[cjind].excl>>(i*UNROLLI + j)) & 1);
+#ifndef EXCL_FORCES
+                    skipmask = interact;
+#else
+                    skipmask = !(cj == ci_sh && j <= i);
+#endif
+#else
+#define interact 1.0
+                    skipmask = 1.0;
+#endif
+
+                    aj = cj*UNROLLJ + j;
+
+                    dx = xi[i*XI_STRIDE+XX] - x[aj*X_STRIDE+XX];
+                    dy = xi[i*XI_STRIDE+YY] - x[aj*X_STRIDE+YY];
+                    dz = xi[i*XI_STRIDE+ZZ] - x[aj*X_STRIDE+ZZ];
+
+                    rsq = dx*dx + dy*dy + dz*dz;
+
+                    /* Prepare to enforce the cut-off. */
+                    skipmask = (rsq >= rcut2) ? 0 : skipmask;
+                    /* 9 flops for r^2 + cut-off check */
+
+#ifdef CHECK_EXCLS
+                    /* Excluded atoms are allowed to be on top of each other.
+                     * To avoid overflow of rinv, rinvsq and rinvsix
+                     * we add a small number to rsq for excluded pairs only.
+                     */
+                    rsq += (1 - interact)*NBNXN_AVOID_SING_R2_INC;
+#endif
+
+#ifdef COUNT_PAIRS
+                    npair++;
+#endif
+
+                    rinv = gmx_invsqrt(rsq);
+                    /* 5 flops for invsqrt */
+
+                    /* Partially enforce the cut-off (and perhaps
+                     * exclusions) to avoid possible overflow of
+                     * rinvsix when computing LJ, and/or overflowing
+                     * the Coulomb table during lookup.
+                     */
+                    rinv = rinv * skipmask;
+
+                    rinvsq  = rinv*rinv;
+
+#ifdef HALF_LJ
+                    if (i < UNROLLI/2)
+#endif
+                    {
+                        rinvsix = interact*rinvsq*rinvsq*rinvsq;
+
+#ifdef VDW_CUTOFF_CHECK
+                        skipmask_rvdw = (rsq < rvdw2);
+                        rinvsix      *= skipmask_rvdw;
+#endif
+
+                        c6  = nbfp[type_i_off+type[aj]*2  ];
+                        c12 = nbfp[type_i_off+type[aj]*2+1];
+                        FrLJ6  = c6*rinvsix;
+                        FrLJ12 = c12*rinvsix*rinvsix;
+                        /* 6 flops for r^-2 + LJ force */
+#ifdef CALC_ENERGIES
+                        VLJ = (FrLJ12 - c12*sh_invrc6*sh_invrc6)/12 -
+                              (FrLJ6 - c6*sh_invrc6)/6;
+                        /* Need to zero the interaction if r >= rcut
+                         * or when there is an exclusion. */
+                        VLJ = VLJ * skipmask * interact;
+                        /* 9 flops for LJ energy */
+#ifdef VDW_CUTOFF_CHECK
+                        VLJ *= skipmask_rvdw;
+#endif
+#ifdef ENERGY_GROUPS
+                        Vvdw[egp_sh_i[i]+((egp_cj>>(nbat->neg_2log*j)) & egp_mask)] += VLJ;
+#else
+                        Vvdw_ci += VLJ;
+                        /* 1 flop for LJ energy addition */
+#endif
+#endif
+                    }
+
+#ifdef CALC_COULOMB
+                    /* Enforce the cut-off and perhaps exclusions. In
+                     * those cases, rinv is zero because of skipmask,
+                     * but fcoul and vcoul will later be non-zero (in
+                     * both RF and table cases) because of the
+                     * contributions that do not depend on rinv. These
+                     * contributions cannot be allowed to accumulate
+                     * to the force and potential, and the easiest way
+                     * to do this is to zero the charges in
+                     * advance.
*/ + qq = skipmask * qi[i] * q[aj]; + +#ifdef CALC_COUL_RF + fcoul = qq*(interact*rinv*rinvsq - k_rf2); + /* 4 flops for RF force */ +#ifdef CALC_ENERGIES + vcoul = qq*(interact*rinv + k_rf*rsq - c_rf); + /* 4 flops for RF energy */ +#endif +#endif + +#ifdef CALC_COUL_TAB + rs = rsq*rinv*ic->tabq_scale; + ri = (int)rs; + frac = rs - ri; +#ifndef GMX_DOUBLE + /* fexcl = F_i + frac * (F_(i+1)-F_i) */ + fexcl = tab_coul_FDV0[ri*4] + frac*tab_coul_FDV0[ri*4+1]; +#else + /* fexcl = (1-frac) * F_i + frac * F_(i+1) */ + fexcl = (1 - frac)*tab_coul_F[ri] + frac*tab_coul_F[ri+1]; +#endif + fcoul = interact*rinvsq - fexcl; + /* 7 flops for float 1/r-table force */ +#ifdef CALC_ENERGIES +#ifndef GMX_DOUBLE + vcoul = qq*(interact*(rinv - ic->sh_ewald) + -(tab_coul_FDV0[ri*4+2] + -halfsp*frac*(tab_coul_FDV0[ri*4] + fexcl))); + /* 7 flops for float 1/r-table energy (8 with excls) */ +#else + vcoul = qq*(interact*(rinv - ic->sh_ewald) + -(tab_coul_V[ri] + -halfsp*frac*(tab_coul_F[ri] + fexcl))); +#endif +#endif + fcoul *= qq*rinv; +#endif + +#ifdef CALC_ENERGIES +#ifdef ENERGY_GROUPS + Vc[egp_sh_i[i]+((egp_cj>>(nbat->neg_2log*j)) & egp_mask)] += vcoul; +#else + Vc_ci += vcoul; + /* 1 flop for Coulomb energy addition */ +#endif +#endif +#endif + +#ifdef CALC_COULOMB +#ifdef HALF_LJ + if (i < UNROLLI/2) +#endif + { + fscal = (FrLJ12 - FrLJ6)*rinvsq + fcoul; + /* 3 flops for scalar LJ+Coulomb force */ + } +#ifdef HALF_LJ + else + { + fscal = fcoul; + } +#endif +#else + fscal = (FrLJ12 - FrLJ6)*rinvsq; +#endif + fx = fscal*dx; + fy = fscal*dy; + fz = fscal*dz; + + /* Increment i-atom force */ + fi[i*FI_STRIDE+XX] += fx; + fi[i*FI_STRIDE+YY] += fy; + fi[i*FI_STRIDE+ZZ] += fz; + /* Decrement j-atom force */ + f[aj*F_STRIDE+XX] -= fx; + f[aj*F_STRIDE+YY] -= fy; + f[aj*F_STRIDE+ZZ] -= fz; + /* 9 flops for force addition */ + } + } + } + +#undef interact +#undef EXCL_FORCES diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_ref_outer.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_ref_outer.h new file mode 100644 index 0000000000..f248d3c121 --- /dev/null +++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_ref_outer.h @@ -0,0 +1,365 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2009, The GROMACS Development Team + * + * Gromacs is a library for molecular simulation and trajectory analysis, + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for + * a full list of developers and information, check out http://www.gromacs.org + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) any + * later version. + * As a special exception, you may use this file as part of a free software + * library without restriction. Specifically, if other files instantiate + * templates or use macros or inline functions from this file, or you compile + * this file and link it with other files to produce an executable, this + * file does not by itself cause the resulting executable to be covered by + * the GNU Lesser General Public License. + * + * In plain-speak: do not worry about classes/macros/templates either - only + * changes to the library have to be LGPL, not an application linking with it. 
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website!
+ */
+
+#define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
+#define UNROLLJ    NBNXN_CPU_CLUSTER_I_SIZE
+
+/* We could use nbat->xstride and nbat->fstride, but macros might be faster */
+#define X_STRIDE   3
+#define F_STRIDE   3
+/* Local i-atom buffer strides */
+#define XI_STRIDE  3
+#define FI_STRIDE  3
+
+
+/* All functionality defines are set here, except for:
+ * CALC_ENERGIES, ENERGY_GROUPS which are defined before.
+ * CHECK_EXCLS, which is set just before including the inner loop contents.
+ */
+
+/* We always calculate shift forces, because it's cheap anyhow */
+#define CALC_SHIFTFORCES
+
+#ifdef CALC_COUL_RF
+#define NBK_FUNC_NAME(x,y) x##_rf_##y
+#endif
+#ifdef CALC_COUL_TAB
+#ifndef VDW_CUTOFF_CHECK
+#define NBK_FUNC_NAME(x,y) x##_tab_##y
+#else
+#define NBK_FUNC_NAME(x,y) x##_tab_twin_##y
+#endif
+#endif
+
+static void
+#ifndef CALC_ENERGIES
+NBK_FUNC_NAME(nbnxn_kernel_ref,noener)
+#else
+#ifndef ENERGY_GROUPS
+NBK_FUNC_NAME(nbnxn_kernel_ref,ener)
+#else
+NBK_FUNC_NAME(nbnxn_kernel_ref,energrp)
+#endif
+#endif
+#undef NBK_FUNC_NAME
+            (const nbnxn_pairlist_t     *nbl,
+             const nbnxn_atomdata_t     *nbat,
+             const interaction_const_t  *ic,
+             rvec                       *shift_vec,
+             real                       *f
+#ifdef CALC_SHIFTFORCES
+             ,
+             real                       *fshift
+#endif
+#ifdef CALC_ENERGIES
+             ,
+             real                       *Vvdw,
+             real                       *Vc
+#endif
+            )
+{
+    const nbnxn_ci_t   *nbln;
+    const nbnxn_cj_t   *l_cj;
+    const int          *type;
+    const real         *q;
+    const real         *shiftvec;
+    const real         *x;
+    const real         *nbfp;
+    real       rcut2;
+#ifdef VDW_CUTOFF_CHECK
+    real       rvdw2;
+#endif
+    int        ntype2;
+    real       facel;
+    real       *nbfp_i;
+    int        n,ci,ci_sh;
+    int        ish,ishf;
+    gmx_bool   half_LJ,do_coul;
+    int        cjind0,cjind1,cjind;
+    int        ip,jp;
+
+    real       xi[UNROLLI*XI_STRIDE];
+    real       fi[UNROLLI*FI_STRIDE];
+    real       qi[UNROLLI];
+
+#ifdef CALC_ENERGIES
+#ifndef ENERGY_GROUPS
+    real       Vvdw_ci,Vc_ci;
+#else
+    int        egp_mask;
+    int        egp_sh_i[UNROLLI];
+#endif
+    real       sh_invrc6;
+#endif
+
+#ifdef CALC_COUL_RF
+    real       k_rf2;
+#ifdef CALC_ENERGIES
+    real       k_rf,c_rf;
+#endif
+#endif
+#ifdef CALC_COUL_TAB
+    real       tabscale;
+#ifdef CALC_ENERGIES
+    real       halfsp;
+#endif
+#ifndef GMX_DOUBLE
+    const real *tab_coul_FDV0;
+#else
+    const real *tab_coul_F;
+    const real *tab_coul_V;
+#endif
+#endif
+
+    int ninner;
+
+#ifdef COUNT_PAIRS
+    int npair=0;
+#endif
+
+#ifdef CALC_ENERGIES
+    sh_invrc6 = ic->sh_invrc6;
+#endif
+
+#ifdef CALC_COUL_RF
+    k_rf2 = 2*ic->k_rf;
+#ifdef CALC_ENERGIES
+    k_rf = ic->k_rf;
+    c_rf = ic->c_rf;
+#endif
+#endif
+#ifdef CALC_COUL_TAB
+    tabscale = ic->tabq_scale;
+#ifdef CALC_ENERGIES
+    halfsp = 0.5/ic->tabq_scale;
+#endif
+
+#ifndef GMX_DOUBLE
+    tab_coul_FDV0 = ic->tabq_coul_FDV0;
+#else
+    tab_coul_F    = ic->tabq_coul_F;
+    tab_coul_V    = ic->tabq_coul_V;
+#endif
+#endif
+
+#ifdef ENERGY_GROUPS
+    egp_mask = (1<<nbat->neg_2log) - 1;
+#endif
+
+
+    rcut2 = ic->rcoulomb*ic->rcoulomb;
+#ifdef VDW_CUTOFF_CHECK
+    rvdw2 = ic->rvdw*ic->rvdw;
+#endif
+
+    ntype2 = nbat->ntype*2;
+    nbfp   = nbat->nbfp;
+    q      = nbat->q;
+    type   = nbat->type;
+    facel  = ic->epsfac;
+    shiftvec = shift_vec[0];
+    x = nbat->x;
+
+    l_cj = nbl->cj;
+
+    ninner = 0;
+    for(n=0; n<nbl->nci; n++)
+    {
+        int i,d;
+
+        nbln = &nbl->ci[n];
+
+        ish              = (nbln->shift & NBNXN_CI_SHIFT);
+        /* x, f and fshift are assumed to be stored with stride 3 */
+        ishf             = ish*DIM;
+        cjind0           = nbln->cj_ind_start;
+        cjind1           = nbln->cj_ind_end;
+        /* Currently this only works for super-cells equal to sub-cells */
+        ci               = nbln->ci;
+        ci_sh            = (ish == CENTRAL ? ci : -1);
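+        /* ci_sh is the i-cluster index that cj is compared against to
+         * detect the self cluster pair: it is only valid within the home
+         * image (shift CENTRAL); for shifted images it is set to -1, so
+         * no j-cluster matches and diagonal pairs are counted only once.
+         */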
+
+        half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+        do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+
+#ifdef CALC_ENERGIES
+#ifndef ENERGY_GROUPS
+        Vvdw_ci = 0;
+        Vc_ci   = 0;
+#else
+        for(i=0; i<UNROLLI; i++)
+        {
+            egp_sh_i[i] = ((nbat->energrp[ci]>>(i*nbat->neg_2log)) & egp_mask)*nbat->nenergrp;
+        }
+#endif
+#endif
+
+        for(i=0; i<UNROLLI; i++)
+        {
+            for(d=0; d<DIM; d++)
+            {
+                xi[i*XI_STRIDE+d] = x[(ci*UNROLLI+i)*X_STRIDE+d] + shiftvec[ishf+d];
+                fi[i*FI_STRIDE+d] = 0;
+            }
+        }
+
+        if (do_coul)
+        {
+#ifdef CALC_ENERGIES
+            real Vc_sub_self;
+
+#ifdef CALC_COUL_RF
+            Vc_sub_self = 0.5*c_rf;
+#endif
+#ifdef CALC_COUL_TAB
+#ifdef GMX_DOUBLE
+            Vc_sub_self = 0.5*tab_coul_V[0];
+#else
+            Vc_sub_self = 0.5*tab_coul_FDV0[2];
+#endif
+#endif
+#endif
+
+            for(i=0; i<UNROLLI; i++)
+            {
+                qi[i] = facel*q[ci*UNROLLI+i];
+
+#ifdef CALC_ENERGIES
+                if (l_cj[nbln->cj_ind_start].cj == ci_sh)
+                {
+#ifdef ENERGY_GROUPS
+                    Vc[egp_sh_i[i]+((nbat->energrp[ci]>>(i*nbat->neg_2log)) & egp_mask)]
+#else
+                    Vc[0]
+#endif
+                        -= qi[i]*q[ci*UNROLLI+i]*Vc_sub_self;
+                }
+#endif
+            }
+        }
+
+        cjind = cjind0;
+        while (cjind < cjind1 && nbl->cj[cjind].excl != 0xffff)
+        {
+#define CHECK_EXCLS
+            if (half_LJ)
+            {
+#define CALC_COULOMB
+#define HALF_LJ
+#include "nbnxn_kernel_ref_inner.h"
+#undef HALF_LJ
+#undef CALC_COULOMB
+            }
+            /* cppcheck-suppress duplicateBranch */
+            else if (do_coul)
+            {
+#define CALC_COULOMB
+#include "nbnxn_kernel_ref_inner.h"
+#undef CALC_COULOMB
+            }
+            else
+            {
+#include "nbnxn_kernel_ref_inner.h"
+            }
+#undef CHECK_EXCLS
+            cjind++;
+        }
+
+        for(; (cjind<cjind1); cjind++)
+        {
+            if (half_LJ)
+            {
+#define CALC_COULOMB
+#define HALF_LJ
+#include "nbnxn_kernel_ref_inner.h"
+#undef HALF_LJ
+#undef CALC_COULOMB
+            }
+            else if (do_coul)
+            {
+#define CALC_COULOMB
+#include "nbnxn_kernel_ref_inner.h"
+#undef CALC_COULOMB
+            }
+            else
+            {
+#include "nbnxn_kernel_ref_inner.h"
+            }
+        }
+        ninner += cjind1 - cjind0;
+
+        /* Add accumulated i-forces to the force array */
+        for(i=0; i<UNROLLI; i++)
+        {
+            for(d=0; d<DIM; d++)
+            {
+                f[(ci*UNROLLI+i)*F_STRIDE+d] += fi[i*FI_STRIDE+d];
+            }
+        }
+#ifdef CALC_SHIFTFORCES
+        if (fshift != NULL)
+        {
+            /* Add i-forces to the shifted force list */
+            for(i=0; i<UNROLLI; i++)
+            {
+                for(d=0; d<DIM; d++)
+                {
+                    fshift[ishf+d] += fi[i*FI_STRIDE+d];
+                }
+            }
+        }
+#endif
+
+#ifdef CALC_ENERGIES
+#ifndef ENERGY_GROUPS
+        Vvdw[0] += Vvdw_ci;
+        Vc[0]   += Vc_ci;
+#endif
+#endif
+    }
+
+#ifdef COUNT_PAIRS
+    printf("atom pairs %d\n",npair);
+#endif
+}
+
+#undef CALC_SHIFTFORCES
+
+#undef X_STRIDE
+#undef F_STRIDE
+#undef XI_STRIDE
+#undef FI_STRIDE
+
+#undef UNROLLI
+#undef UNROLLJ
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.c b/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.c
new file mode 100644
--- /dev/null
+++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.c
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "typedefs.h"
+#include "vec.h"
+#include "smalloc.h"
+#include "force.h"
+#include "gmx_omp_nthreads.h"
+#include "../nbnxn_consts.h"
+#include "nbnxn_kernel_common.h"
+
+#ifdef GMX_X86_SSE2
+
+#include "nbnxn_kernel_x86_simd128.h"
+
+/* Include all flavors of the 128-bit SSE or AVX kernel loops */
+
+#define GMX_MM128_HERE
+
+/* Analytical reaction-field kernels */
+#define CALC_COUL_RF
+
+#include "nbnxn_kernel_x86_simd_includes.h"
+
+#undef CALC_COUL_RF
+
+/* Tabulated exclusion interaction electrostatics kernels */
+#define CALC_COUL_TAB
+
+/* Single cut-off: rcoulomb = rvdw */
+#include "nbnxn_kernel_x86_simd_includes.h"
+
+/* Twin cut-off: rcoulomb >= rvdw */
+#define VDW_CUTOFF_CHECK
+#include "nbnxn_kernel_x86_simd_includes.h"
+#undef VDW_CUTOFF_CHECK
+
+#undef CALC_COUL_TAB
+
+
+typedef void (*p_nbk_func_ener)(const nbnxn_pairlist_t     *nbl,
+                                const nbnxn_atomdata_t     *nbat,
+                                const interaction_const_t  *ic,
+                                rvec                       *shift_vec,
+                                real                       *f,
+                                real                       *fshift,
+                                real                       *Vvdw,
+                                real                       *Vc);
+
+typedef void (*p_nbk_func_noener)(const nbnxn_pairlist_t     *nbl,
+                                  const nbnxn_atomdata_t     *nbat,
+                                  const interaction_const_t  *ic,
+                                  rvec                       *shift_vec,
+                                  real                       *f,
+                                  real                       *fshift);
+
+enum { coultRF, coultTAB, coultTAB_TWIN, coultNR };
+
+
+static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] =
+{ { nbnxn_kernel_x86_simd128_rf_comb_geom_ener,
+    nbnxn_kernel_x86_simd128_rf_comb_lb_ener,
+    nbnxn_kernel_x86_simd128_rf_comb_none_ener },
+  { nbnxn_kernel_x86_simd128_tab_comb_geom_ener,
+    nbnxn_kernel_x86_simd128_tab_comb_lb_ener,
+    nbnxn_kernel_x86_simd128_tab_comb_none_ener },
+  { nbnxn_kernel_x86_simd128_tab_twin_comb_geom_ener,
+    nbnxn_kernel_x86_simd128_tab_twin_comb_lb_ener,
+    nbnxn_kernel_x86_simd128_tab_twin_comb_none_ener } };
+
+static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] =
+{ { nbnxn_kernel_x86_simd128_rf_comb_geom_energrp,
+    nbnxn_kernel_x86_simd128_rf_comb_lb_energrp,
+    nbnxn_kernel_x86_simd128_rf_comb_none_energrp },
+  { nbnxn_kernel_x86_simd128_tab_comb_geom_energrp,
+    nbnxn_kernel_x86_simd128_tab_comb_lb_energrp,
+    nbnxn_kernel_x86_simd128_tab_comb_none_energrp },
+  { nbnxn_kernel_x86_simd128_tab_twin_comb_geom_energrp,
+    nbnxn_kernel_x86_simd128_tab_twin_comb_lb_energrp,
+    nbnxn_kernel_x86_simd128_tab_twin_comb_none_energrp } };
+
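+/* The kernel tables above and below are indexed by the electrostatics
+ * type (coultRF, coultTAB, coultTAB_TWIN) and by the LJ combination rule
+ * of the atom data (nbat->comb_rule): geometric, Lorentz-Berthelot or
+ * no combination rule, in the order of the _comb_geom, _comb_lb and
+ * _comb_none entries.
+ */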
+static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] =
+{ { nbnxn_kernel_x86_simd128_rf_comb_geom_noener,
+    nbnxn_kernel_x86_simd128_rf_comb_lb_noener,
+    nbnxn_kernel_x86_simd128_rf_comb_none_noener },
+  { nbnxn_kernel_x86_simd128_tab_comb_geom_noener,
+    nbnxn_kernel_x86_simd128_tab_comb_lb_noener,
+    nbnxn_kernel_x86_simd128_tab_comb_none_noener },
+  { nbnxn_kernel_x86_simd128_tab_twin_comb_geom_noener,
+    nbnxn_kernel_x86_simd128_tab_twin_comb_lb_noener,
+    nbnxn_kernel_x86_simd128_tab_twin_comb_none_noener } };
+
+#endif /* SSE */
+
+
+static void reduce_group_energies(int ng,int ng_2log,
+                                  const real *VSvdw,const real *VSc,
+                                  real *Vvdw,real *Vc)
+{
+    int ng_p2,i,j,j0,j1,c,s;
+
+#ifndef GMX_DOUBLE
+#define SIMD_WIDTH   4
+#define SIMD_WIDTH_2 2
+#else
+#define SIMD_WIDTH   2
+#define SIMD_WIDTH_2 1
+#endif
+
+    ng_p2 = (1<<ng_2log);
+
+    /* The size of the SIMD energy group buffer array is:
+     * ng*ng*ng_p2*unrollj_half*simd_width
+     */
+    for(i=0; i<ng; i++)
+    {
+        for(j=0; j<ng; j++)
+        {
+            Vvdw[i*ng+j] = 0;
+            Vc[i*ng+j]   = 0;
+        }
+
+        for(j1=0; j1<ng; j1++)
+        {
+            for(j0=0; j0<ng; j0++)
+            {
+                c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_2*SIMD_WIDTH;
+                for(s=0; s<SIMD_WIDTH_2; s++)
+                {
+                    Vvdw[i*ng+j0] += VSvdw[c+0];
+                    Vvdw[i*ng+j1] += VSvdw[c+1];
+                    Vc  [i*ng+j0] += VSc  [c+0];
+                    Vc  [i*ng+j1] += VSc  [c+1];
+                    c             += SIMD_WIDTH + 2;
+                }
+            }
+        }
+    }
+}
+
+void
+nbnxn_kernel_x86_simd128(nbnxn_pairlist_set_t       *nbl_list,
+                         const nbnxn_atomdata_t     *nbat,
+                         const interaction_const_t  *ic,
+                         rvec                       *shift_vec,
+                         int                        force_flags,
+                         int                        clearF,
+                         real                       *fshift,
+                         real                       *Vc,
+                         real                       *Vvdw)
+#ifdef GMX_X86_SSE2
+{
+    int              nnbl;
+    nbnxn_pairlist_t **nbl;
+    int              coult;
+    int              nb;
+
+    nnbl = nbl_list->nnbl;
+    nbl  = nbl_list->nbl;
+
+    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
+    {
+        coult = coultRF;
+    }
+    else
+    {
+        if (ic->rcoulomb == ic->rvdw)
+        {
+            coult = coultTAB;
+        }
+        else
+        {
+            coult = coultTAB_TWIN;
+        }
+    }
+
+#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
+    for(nb=0; nb<nnbl; nb++)
+    {
+        nbnxn_atomdata_output_t *out;
+        real *fshift_p;
+
+        out = &nbat->out[nb];
+
+        if (clearF == enbvClearFYes)
+        {
+            clear_f(nbat,out->f);
+        }
+
+        if ((force_flags & GMX_FORCE_VIRIAL) && nnbl == 1)
+        {
+            fshift_p = fshift;
+        }
+        else
+        {
+            fshift_p = out->fshift;
+
+            if (clearF == enbvClearFYes)
+            {
+                clear_fshift(fshift_p);
+            }
+        }
+
+        /* With Ewald type electrostatics the forces for excluded atom pairs
+         * should not contribute to the virial sum. The exclusion forces
+         * are not calculated in the energy kernels, but they are in the
+         * _noener kernels, so with Ewald and a virial we cannot use
+         * a _noener kernel.
+         */
+        if (!((force_flags & GMX_FORCE_ENERGY) ||
+              (EEL_FULL(ic->eeltype) && (force_flags & GMX_FORCE_VIRIAL))))
+        {
+            /* Don't calculate energies */
+            p_nbk_noener[coult][nbat->comb_rule](nbl[nb],nbat,
+                                                 ic,
+                                                 shift_vec,
+                                                 out->f,
+                                                 fshift_p);
+        }
+        else if (out->nV == 1 || !(force_flags & GMX_FORCE_ENERGY))
+        {
+            /* No energy groups */
+            out->Vvdw[0] = 0;
+            out->Vc[0]   = 0;
+
+            p_nbk_ener[coult][nbat->comb_rule](nbl[nb],nbat,
+                                               ic,
+                                               shift_vec,
+                                               out->f,
+                                               fshift_p,
+                                               out->Vvdw,
+                                               out->Vc);
+        }
+        else
+        {
+            /* Calculate energy group contributions */
+            int i;
+
+            for(i=0; i<out->nVS; i++)
+            {
+                out->VSvdw[i] = 0;
+            }
+            for(i=0; i<out->nVS; i++)
+            {
+                out->VSc[i] = 0;
+            }
+
+            p_nbk_energrp[coult][nbat->comb_rule](nbl[nb],nbat,
+                                                  ic,
+                                                  shift_vec,
+                                                  out->f,
+                                                  fshift_p,
+                                                  out->VSvdw,
+                                                  out->VSc);
+
+            reduce_group_energies(nbat->nenergrp,nbat->neg_2log,
+                                  out->VSvdw,out->VSc,
+                                  out->Vvdw,out->Vc);
+        }
+    }
+
+    if (force_flags & GMX_FORCE_ENERGY)
+    {
+        /* Reduce the energies */
+        for(nb=0; nb<nnbl; nb++)
+        {
+            int i;
+
+            for(i=0; i<nbat->out[nb].nV; i++)
+            {
+                Vvdw[i] += nbat->out[nb].Vvdw[i];
+                Vc[i]   += nbat->out[nb].Vc[i];
+            }
+        }
+    }
+}
+#else
+{
+    gmx_incons("nbnxn_kernel_x86_simd128 called while GROMACS was configured without SSE enabled");
+}
+#endif
diff --git a/src/kernel/gmx_gpu_utils/gmx_gpu_utils.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.h
similarity index 63%
copy from src/kernel/gmx_gpu_utils/gmx_gpu_utils.h
copy to src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.h
index 76070804ea..5732f9e1ca 100644
--- a/src/kernel/gmx_gpu_utils/gmx_gpu_utils.h
+++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.h
@@ -1,56 +1,60 @@
 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
  *
- * 
+ *
  * This source code is part of
- * 
+ *
  * G R O M A C S
- * 
+ *
  * GROningen MAchine for Chemical Simulations
- * 
+ *
  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2010, The GROMACS development team, + * Copyright (c) 2001-2012, The GROMACS development team, * check out http://www.gromacs.org for more information. - + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. - * + * * If you want to redistribute modifications, please consider that * scientific software is very special. Version control is crucial - * bugs must be traceable. We will be happy to consider code for * inclusion in the official distribution, but derived work must not * be called official GROMACS. Details are found in the README & COPYING * files - if they are missing, get the official version at www.gromacs.org. - * + * * To help us fund GROMACS development, we humbly ask that you cite * the papers on the package - you can find them in the top README file. - * + * * For more info, check our website at http://www.gromacs.org - * + * * And Hey: * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon */ +#ifndef _nbnxn_kernel_x86_simd128_h +#define _nbnxn_kernel_x86_simd128_h -#ifndef _GMX_GPU_UTILS_H_ -#define _GMX_GPU_UTILS_H_ +#include "typedefs.h" -#ifndef __cplusplus +#ifdef __cplusplus extern "C" { #endif -int do_quick_memtest(int /*dev_id*/); - -int do_full_memtest(int /*dev_id*/); - -int do_timed_memtest(int /*dev_id*/, int /*time_limit*/); - -int is_supported_cuda_gpu(int /*dev_id*/, char* /*gpu_name*/); - -#ifndef __cplusplus -} /* extern "C" */ +/* Wrapper call for the non-bonded cluster vs cluster kernels */ +void +nbnxn_kernel_x86_simd128(nbnxn_pairlist_set_t *nbl_list, + const nbnxn_atomdata_t *nbat, + const interaction_const_t *ic, + rvec *shift_vec, + int force_flags, + int clearF, + real *fshift, + real *Vc, + real *Vvdw); + +#ifdef __cplusplus +} #endif -#endif // _GMX_GPU_UTILS_H_ - +#endif diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.c b/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.c new file mode 100644 index 0000000000..2396702b6e --- /dev/null +++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.c @@ -0,0 +1,316 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS development team, + * check out http://www.gromacs.org for more information. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. 
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "typedefs.h"
+#include "vec.h"
+#include "smalloc.h"
+#include "force.h"
+#include "gmx_omp_nthreads.h"
+#include "../nbnxn_consts.h"
+#include "nbnxn_kernel_common.h"
+
+#ifdef GMX_X86_AVX_256
+
+#include "nbnxn_kernel_x86_simd256.h"
+
+/* Include all flavors of the 256-bit AVX kernel loops */
+
+#define GMX_MM256_HERE
+
+/* Analytical reaction-field kernels */
+#define CALC_COUL_RF
+
+#include "nbnxn_kernel_x86_simd_includes.h"
+
+#undef CALC_COUL_RF
+
+/* Tabulated exclusion interaction electrostatics kernels */
+#define CALC_COUL_TAB
+
+/* Single cut-off: rcoulomb = rvdw */
+#include "nbnxn_kernel_x86_simd_includes.h"
+
+/* Twin cut-off: rcoulomb >= rvdw */
+#define VDW_CUTOFF_CHECK
+#include "nbnxn_kernel_x86_simd_includes.h"
+#undef VDW_CUTOFF_CHECK
+
+#undef CALC_COUL_TAB
+
+
+typedef void (*p_nbk_func_ener)(const nbnxn_pairlist_t     *nbl,
+                                const nbnxn_atomdata_t     *nbat,
+                                const interaction_const_t  *ic,
+                                rvec                       *shift_vec,
+                                real                       *f,
+                                real                       *fshift,
+                                real                       *Vvdw,
+                                real                       *Vc);
+
+typedef void (*p_nbk_func_noener)(const nbnxn_pairlist_t     *nbl,
+                                  const nbnxn_atomdata_t     *nbat,
+                                  const interaction_const_t  *ic,
+                                  rvec                       *shift_vec,
+                                  real                       *f,
+                                  real                       *fshift);
+
+enum { coultRF, coultTAB, coultTAB_TWIN, coultNR };
+
+
+static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] =
+{ { nbnxn_kernel_x86_simd256_rf_comb_geom_ener,
+    nbnxn_kernel_x86_simd256_rf_comb_lb_ener,
+    nbnxn_kernel_x86_simd256_rf_comb_none_ener },
+  { nbnxn_kernel_x86_simd256_tab_comb_geom_ener,
+    nbnxn_kernel_x86_simd256_tab_comb_lb_ener,
+    nbnxn_kernel_x86_simd256_tab_comb_none_ener },
+  { nbnxn_kernel_x86_simd256_tab_twin_comb_geom_ener,
+    nbnxn_kernel_x86_simd256_tab_twin_comb_lb_ener,
+    nbnxn_kernel_x86_simd256_tab_twin_comb_none_ener } };
+
+static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] =
+{ { nbnxn_kernel_x86_simd256_rf_comb_geom_energrp,
+    nbnxn_kernel_x86_simd256_rf_comb_lb_energrp,
+    nbnxn_kernel_x86_simd256_rf_comb_none_energrp },
+  { nbnxn_kernel_x86_simd256_tab_comb_geom_energrp,
+    nbnxn_kernel_x86_simd256_tab_comb_lb_energrp,
+    nbnxn_kernel_x86_simd256_tab_comb_none_energrp },
+  { nbnxn_kernel_x86_simd256_tab_twin_comb_geom_energrp,
+    nbnxn_kernel_x86_simd256_tab_twin_comb_lb_energrp,
+    nbnxn_kernel_x86_simd256_tab_twin_comb_none_energrp } };
+
+static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] =
+{ { nbnxn_kernel_x86_simd256_rf_comb_geom_noener,
+    nbnxn_kernel_x86_simd256_rf_comb_lb_noener,
+    nbnxn_kernel_x86_simd256_rf_comb_none_noener },
+  { nbnxn_kernel_x86_simd256_tab_comb_geom_noener,
+    nbnxn_kernel_x86_simd256_tab_comb_lb_noener,
+    nbnxn_kernel_x86_simd256_tab_comb_none_noener },
+  { nbnxn_kernel_x86_simd256_tab_twin_comb_geom_noener,
+    nbnxn_kernel_x86_simd256_tab_twin_comb_lb_noener,
+    nbnxn_kernel_x86_simd256_tab_twin_comb_none_noener } };
+
+#endif /* GMX_X86_AVX_256 */
+
+
+static void reduce_group_energies(int ng,int ng_2log,
+                                  const real *VSvdw,const real *VSc,
+                                  real *Vvdw,real *Vc)
+{
+    int ng_p2,i,j,j0,j1,c,s;
+
+#ifndef GMX_DOUBLE
+#define SIMD_WIDTH   4
+#define SIMD_WIDTH_2 2
+#else
+#define SIMD_WIDTH   2
+#define SIMD_WIDTH_2 1
+#endif
+
+    ng_p2 = (1<<ng_2log);
+
+    /* The size of the SIMD energy group buffer array is:
+     * ng*ng*ng_p2*unrollj_half*simd_width
+     */
+    for(i=0; i<ng; i++)
+    {
+        for(j=0; j<ng; j++)
+        {
+            Vvdw[i*ng+j] = 0;
+            Vc[i*ng+j]   = 0;
+        }
+
+        for(j1=0; j1<ng; j1++)
+        {
+            for(j0=0; j0<ng; j0++)
+            {
+                c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_2*SIMD_WIDTH;
+                for(s=0; s<SIMD_WIDTH_2; s++)
+                {
+                    Vvdw[i*ng+j0] += VSvdw[c+0];
+                    Vvdw[i*ng+j1] += VSvdw[c+1];
+                    Vc  [i*ng+j0] += VSc  [c+0];
+                    Vc  [i*ng+j1] += VSc  [c+1];
+                    c             += SIMD_WIDTH + 2;
+                }
+            }
+        }
+    }
+}
+
+void
+nbnxn_kernel_x86_simd256(nbnxn_pairlist_set_t       *nbl_list,
+                         const nbnxn_atomdata_t     *nbat,
+                         const interaction_const_t  *ic,
+                         rvec                       *shift_vec,
+                         int                        force_flags,
+                         int                        clearF,
+                         real                       *fshift,
+                         real                       *Vc,
+                         real                       *Vvdw)
+#ifdef GMX_X86_AVX_256
+{
+    int              nnbl;
+    nbnxn_pairlist_t **nbl;
+    int              coult;
+    int              nb;
+
+    nnbl = nbl_list->nnbl;
+    nbl  = nbl_list->nbl;
+
+    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
+    {
+        coult = coultRF;
+    }
+    else
+    {
+        if (ic->rcoulomb == ic->rvdw)
+        {
+            coult = coultTAB;
+        }
+        else
+        {
+            coult = coultTAB_TWIN;
+        }
+    }
+
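+    /* The pair-list set holds one list per thread; each OpenMP thread
+     * below processes its own list and accumulates forces and energies
+     * into its own nbat->out[nb] buffers, so no locking is needed.
+     * The per-thread energies are reduced after the parallel loop.
+     */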
+#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
+    for(nb=0; nb<nnbl; nb++)
+    {
+        nbnxn_atomdata_output_t *out;
+        real *fshift_p;
+
+        out = &nbat->out[nb];
+
+        if (clearF == enbvClearFYes)
+        {
+            clear_f(nbat,out->f);
+        }
+
+        if ((force_flags & GMX_FORCE_VIRIAL) && nnbl == 1)
+        {
+            fshift_p = fshift;
+        }
+        else
+        {
+            fshift_p = out->fshift;
+
+            if (clearF == enbvClearFYes)
+            {
+                clear_fshift(fshift_p);
+            }
+        }
+
+        /* With Ewald type electrostatics the forces for excluded atom pairs
+         * should not contribute to the virial sum. The exclusion forces
+         * are not calculated in the energy kernels, but they are in the
+         * _noener kernels, so with Ewald and a virial we cannot use
+         * a _noener kernel.
+         */
+        if (!((force_flags & GMX_FORCE_ENERGY) ||
+              (EEL_FULL(ic->eeltype) && (force_flags & GMX_FORCE_VIRIAL))))
+        {
+            /* Don't calculate energies */
+            p_nbk_noener[coult][nbat->comb_rule](nbl[nb],nbat,
+                                                 ic,
+                                                 shift_vec,
+                                                 out->f,
+                                                 fshift_p);
+        }
+        else if (out->nV == 1 || !(force_flags & GMX_FORCE_ENERGY))
+        {
+            /* No energy groups */
+            out->Vvdw[0] = 0;
+            out->Vc[0]   = 0;
+
+            p_nbk_ener[coult][nbat->comb_rule](nbl[nb],nbat,
+                                               ic,
+                                               shift_vec,
+                                               out->f,
+                                               fshift_p,
+                                               out->Vvdw,
+                                               out->Vc);
+        }
+        else
+        {
+            /* Calculate energy group contributions */
+            int i;
+
+            for(i=0; i<out->nVS; i++)
+            {
+                out->VSvdw[i] = 0;
+            }
+            for(i=0; i<out->nVS; i++)
+            {
+                out->VSc[i] = 0;
+            }
+
+            p_nbk_energrp[coult][nbat->comb_rule](nbl[nb],nbat,
+                                                  ic,
+                                                  shift_vec,
+                                                  out->f,
+                                                  fshift_p,
+                                                  out->VSvdw,
+                                                  out->VSc);
+
+            reduce_group_energies(nbat->nenergrp,nbat->neg_2log,
+                                  out->VSvdw,out->VSc,
+                                  out->Vvdw,out->Vc);
+        }
+    }
+
+    if (force_flags & GMX_FORCE_ENERGY)
+    {
+        /* Reduce the energies */
+        for(nb=0; nb<nnbl; nb++)
+        {
+            int i;
+
+            for(i=0; i<nbat->out[nb].nV; i++)
+            {
+                Vvdw[i] += nbat->out[nb].Vvdw[i];
+                Vc[i]   += nbat->out[nb].Vc[i];
+            }
+        }
+    }
+}
+#else
+{
+    gmx_incons("nbnxn_kernel_x86_simd256 called while GROMACS was configured without AVX enabled");
+}
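+/* Without GMX_X86_AVX_256 the wrapper body above compiles to this
+ * gmx_incons() call: reaching it means the run configuration requested
+ * the 256-bit kernels from a binary built without AVX support.
+ */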
+#endif
diff --git a/src/kernel/gmx_gpu_utils/gmx_gpu_utils.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.h
similarity index 63%
rename from src/kernel/gmx_gpu_utils/gmx_gpu_utils.h
rename to src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.h
index 76070804ea..c56754284e 100644
--- a/src/kernel/gmx_gpu_utils/gmx_gpu_utils.h
+++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.h
@@ -1,56 +1,60 @@
 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
  *
- * 
+ *
  * This source code is part of
- * 
+ *
  * G R O M A C S
- * 
+ *
  * GROningen MAchine for Chemical Simulations
- * 
+ *
  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2010, The GROMACS development team,
+ * Copyright (c) 2001-2012, The GROMACS development team,
  * check out http://www.gromacs.org for more information.
- 
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version 2
  * of the License, or (at your option) any later version.
- * 
+ *
  * If you want to redistribute modifications, please consider that
  * scientific software is very special. Version control is crucial -
  * bugs must be traceable. We will be happy to consider code for
  * inclusion in the official distribution, but derived work must not
  * be called official GROMACS. Details are found in the README & COPYING
  * files - if they are missing, get the official version at www.gromacs.org.
- * 
+ *
  * To help us fund GROMACS development, we humbly ask that you cite
  * the papers on the package - you can find them in the top README file.
- * 
+ *
  * For more info, check our website at http://www.gromacs.org
- * 
+ *
  * And Hey:
  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
  */
+#ifndef _nbnxn_kernel_x86_simd256_h
+#define _nbnxn_kernel_x86_simd256_h
 
-#ifndef _GMX_GPU_UTILS_H_
-#define _GMX_GPU_UTILS_H_
+#include "typedefs.h"
 
-#ifndef __cplusplus
+#ifdef __cplusplus
 extern "C" {
 #endif
 
-int do_quick_memtest(int /*dev_id*/);
-
-int do_full_memtest(int /*dev_id*/);
-
-int do_timed_memtest(int /*dev_id*/, int /*time_limit*/);
-
-int is_supported_cuda_gpu(int /*dev_id*/, char* /*gpu_name*/);
-
-#ifndef __cplusplus
-} /* extern "C" */
+/* Wrapper call for the non-bonded cluster vs cluster kernels */
+void
+nbnxn_kernel_x86_simd256(nbnxn_pairlist_set_t       *nbl_list,
+                         const nbnxn_atomdata_t     *nbat,
+                         const interaction_const_t  *ic,
+                         rvec                       *shift_vec,
+                         int                        force_flags,
+                         int                        clearF,
+                         real                       *fshift,
+                         real                       *Vc,
+                         real                       *Vvdw);
+
+#ifdef __cplusplus
+}
 #endif
 
-#endif // _GMX_GPU_UTILS_H_
-
+#endif
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_includes.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_includes.h
new file mode 100644
index 0000000000..936c8aa802
--- /dev/null
+++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_includes.h
@@ -0,0 +1,69 @@
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2009, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ * As a special exception, you may use this file as part of a free software
+ * library without restriction. Specifically, if other files instantiate
+ * templates or use macros or inline functions from this file, or you compile
+ * this file and link it with other files to produce an executable, this
+ * file does not by itself cause the resulting executable to be covered by
+ * the GNU Lesser General Public License.
+ *
+ * In plain-speak: do not worry about classes/macros/templates either - only
+ * changes to the library have to be LGPL, not an application linking with it.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website!
+ */
+
+/* This file includes all x86 SIMD kernel flavors.
+ * Only the electrostatics type and optionally the VdW cut-off check
+ * need to be set before including this file.
+ */
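+/* Each include of nbnxn_kernel_x86_simd_outer.h below instantiates one
+ * kernel flavor; for example, with CALC_COUL_RF, CALC_ENERGIES and
+ * LJ_COMB_GEOM defined, the NBK_FUNC_NAME macros in the outer header
+ * expand the kernel name to nbnxn_kernel_x86_simd128_rf_comb_geom_ener
+ * (or ..._simd256_... in the 256-bit build).
+ */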
+
+/* Include the force+energy kernels */
+#define CALC_ENERGIES
+#define LJ_COMB_GEOM
+#include "nbnxn_kernel_x86_simd_outer.h"
+#undef LJ_COMB_GEOM
+#define LJ_COMB_LB
+#include "nbnxn_kernel_x86_simd_outer.h"
+#undef LJ_COMB_LB
+#include "nbnxn_kernel_x86_simd_outer.h"
+#undef CALC_ENERGIES
+
+/* Include the force+energygroups kernels */
+#define CALC_ENERGIES
+#define ENERGY_GROUPS
+#define LJ_COMB_GEOM
+#include "nbnxn_kernel_x86_simd_outer.h"
+#undef LJ_COMB_GEOM
+#define LJ_COMB_LB
+#include "nbnxn_kernel_x86_simd_outer.h"
+#undef LJ_COMB_LB
+#include "nbnxn_kernel_x86_simd_outer.h"
+#undef ENERGY_GROUPS
+#undef CALC_ENERGIES
+
+/* Include the force only kernels */
+#define LJ_COMB_GEOM
+#include "nbnxn_kernel_x86_simd_outer.h"
+#undef LJ_COMB_GEOM
+#define LJ_COMB_LB
+#include "nbnxn_kernel_x86_simd_outer.h"
+#undef LJ_COMB_LB
+#include "nbnxn_kernel_x86_simd_outer.h"
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_inner.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_inner.h
new file mode 100644
index 0000000000..b9fdd34efc
--- /dev/null
+++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_inner.h
@@ -0,0 +1,949 @@
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2009, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ * As a special exception, you may use this file as part of a free software
+ * library without restriction. Specifically, if other files instantiate
+ * templates or use macros or inline functions from this file, or you compile
+ * this file and link it with other files to produce an executable, this
+ * file does not by itself cause the resulting executable to be covered by
+ * the GNU Lesser General Public License.
+ *
+ * In plain-speak: do not worry about classes/macros/templates either - only
+ * changes to the library have to be LGPL, not an application linking with it.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website!
+ */
+
+/* This is the innermost loop contents for the n vs n atom
+ * x86 SIMD kernels.
+ */
+
+
+/* When calculating RF or Ewald interactions we calculate the electrostatic
+ * forces on excluded atom pairs here in the non-bonded loops.
+ * But when energies and/or virial is required we calculate them
+ * separately, as it is then easier to separate the energy and virial
+ * contributions.
+ */
+#if defined CHECK_EXCLS && defined CALC_COULOMB
+#define EXCL_FORCES
+#endif
+
+#if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_X86_SSE4_1 && !defined COUNT_PAIRS && !(defined __GNUC__ && (defined CALC_COUL_TAB || (defined CALC_COUL_RF && defined GMX_MM128_HERE)))
+/* Without exclusions and energies we only need to mask the cut-off,
+ * this is faster with blendv (only available with SSE4.1 and later).
+ * With gcc and PME or RF in 128-bit, blendv is slower; + * tested with gcc 4.6.2, 4.6.3 and 4.7.1. + */ +#define CUTOFF_BLENDV +#endif + + { + int cj,aj,ajx,ajy,ajz; + +#ifdef ENERGY_GROUPS + int egps_j; + int egp_jj[UNROLLJ>>1]; + int jj; +#endif + +#ifdef CHECK_EXCLS + /* Interaction (non-exclusion) mask of all 1's or 0's */ + gmx_mm_pr int_SSE0; + gmx_mm_pr int_SSE1; + gmx_mm_pr int_SSE2; + gmx_mm_pr int_SSE3; +#endif + + gmx_mm_pr jxSSE,jySSE,jzSSE; + gmx_mm_pr dx_SSE0,dy_SSE0,dz_SSE0; + gmx_mm_pr dx_SSE1,dy_SSE1,dz_SSE1; + gmx_mm_pr dx_SSE2,dy_SSE2,dz_SSE2; + gmx_mm_pr dx_SSE3,dy_SSE3,dz_SSE3; + gmx_mm_pr tx_SSE0,ty_SSE0,tz_SSE0; + gmx_mm_pr tx_SSE1,ty_SSE1,tz_SSE1; + gmx_mm_pr tx_SSE2,ty_SSE2,tz_SSE2; + gmx_mm_pr tx_SSE3,ty_SSE3,tz_SSE3; + gmx_mm_pr rsq_SSE0,rinv_SSE0,rinvsq_SSE0; + gmx_mm_pr rsq_SSE1,rinv_SSE1,rinvsq_SSE1; + gmx_mm_pr rsq_SSE2,rinv_SSE2,rinvsq_SSE2; + gmx_mm_pr rsq_SSE3,rinv_SSE3,rinvsq_SSE3; +#ifndef CUTOFF_BLENDV + /* wco: within cut-off, mask of all 1's or 0's */ + gmx_mm_pr wco_SSE0; + gmx_mm_pr wco_SSE1; + gmx_mm_pr wco_SSE2; + gmx_mm_pr wco_SSE3; +#endif +#ifdef VDW_CUTOFF_CHECK + gmx_mm_pr wco_vdw_SSE0; + gmx_mm_pr wco_vdw_SSE1; +#ifndef HALF_LJ + gmx_mm_pr wco_vdw_SSE2; + gmx_mm_pr wco_vdw_SSE3; +#endif +#endif +#ifdef CALC_COULOMB +#ifdef CHECK_EXCLS + /* 1/r masked with the interaction mask */ + gmx_mm_pr rinv_ex_SSE0; + gmx_mm_pr rinv_ex_SSE1; + gmx_mm_pr rinv_ex_SSE2; + gmx_mm_pr rinv_ex_SSE3; +#endif + gmx_mm_pr jq_SSE; + gmx_mm_pr qq_SSE0; + gmx_mm_pr qq_SSE1; + gmx_mm_pr qq_SSE2; + gmx_mm_pr qq_SSE3; +#ifdef CALC_COUL_TAB + /* The force (PME mesh force) we need to subtract from 1/r^2 */ + gmx_mm_pr fsub_SSE0; + gmx_mm_pr fsub_SSE1; + gmx_mm_pr fsub_SSE2; + gmx_mm_pr fsub_SSE3; +#endif + /* frcoul = (1/r - fsub)*r */ + gmx_mm_pr frcoul_SSE0; + gmx_mm_pr frcoul_SSE1; + gmx_mm_pr frcoul_SSE2; + gmx_mm_pr frcoul_SSE3; +#ifdef CALC_COUL_TAB + /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */ + gmx_mm_pr r_SSE0,rs_SSE0,rf_SSE0,frac_SSE0; + gmx_mm_pr r_SSE1,rs_SSE1,rf_SSE1,frac_SSE1; + gmx_mm_pr r_SSE2,rs_SSE2,rf_SSE2,frac_SSE2; + gmx_mm_pr r_SSE3,rs_SSE3,rf_SSE3,frac_SSE3; + /* Table index: rs converted to an int */ +#if !(defined GMX_MM256_HERE && defined GMX_DOUBLE) + gmx_epi32 ti_SSE0,ti_SSE1,ti_SSE2,ti_SSE3; +#else + __m128i ti_SSE0,ti_SSE1,ti_SSE2,ti_SSE3; +#endif + /* Linear force table values */ + gmx_mm_pr ctab0_SSE0,ctab1_SSE0; + gmx_mm_pr ctab0_SSE1,ctab1_SSE1; + gmx_mm_pr ctab0_SSE2,ctab1_SSE2; + gmx_mm_pr ctab0_SSE3,ctab1_SSE3; +#ifdef CALC_ENERGIES + /* Quadratic energy table value */ + gmx_mm_pr ctabv_SSE0; + gmx_mm_pr ctabv_SSE1; + gmx_mm_pr ctabv_SSE2; + gmx_mm_pr ctabv_SSE3; + /* The potential (PME mesh) we need to subtract from 1/r */ + gmx_mm_pr vc_sub_SSE0; + gmx_mm_pr vc_sub_SSE1; + gmx_mm_pr vc_sub_SSE2; + gmx_mm_pr vc_sub_SSE3; +#endif +#endif +#ifdef CALC_ENERGIES + /* Electrostatic potential */ + gmx_mm_pr vcoul_SSE0; + gmx_mm_pr vcoul_SSE1; + gmx_mm_pr vcoul_SSE2; + gmx_mm_pr vcoul_SSE3; +#endif +#endif + /* The force times 1/r */ + gmx_mm_pr fscal_SSE0; + gmx_mm_pr fscal_SSE1; + gmx_mm_pr fscal_SSE2; + gmx_mm_pr fscal_SSE3; + +#ifdef CALC_LJ +#ifdef LJ_COMB_LB + /* LJ sigma_j/2 and sqrt(epsilon_j) */ + gmx_mm_pr hsig_j_SSE,seps_j_SSE; + /* LJ sigma_ij and epsilon_ij */ + gmx_mm_pr sig_SSE0,eps_SSE0; + gmx_mm_pr sig_SSE1,eps_SSE1; +#ifndef HALF_LJ + gmx_mm_pr sig_SSE2,eps_SSE2; + gmx_mm_pr sig_SSE3,eps_SSE3; +#endif +#ifdef CALC_ENERGIES + gmx_mm_pr sig2_SSE0,sig6_SSE0; + gmx_mm_pr sig2_SSE1,sig6_SSE1; +#ifndef HALF_LJ + 
+            gmx_mm_pr  sig2_SSE2,sig6_SSE2;
+            gmx_mm_pr  sig2_SSE3,sig6_SSE3;
+#endif
+#endif /* CALC_ENERGIES */
+#endif /* LJ_COMB_LB */
+
+#ifdef LJ_COMB_GEOM
+            gmx_mm_pr  c6s_j_SSE,c12s_j_SSE;
+#endif
+
+#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
+            /* Index for loading LJ parameters, complicated when interleaving */
+            int        aj2;
+#endif
+
+#ifndef FIX_LJ_C
+            /* LJ C6 and C12 parameters, used with geometric comb. rule */
+            gmx_mm_pr  c6_SSE0,c12_SSE0;
+            gmx_mm_pr  c6_SSE1,c12_SSE1;
+#ifndef HALF_LJ
+            gmx_mm_pr  c6_SSE2,c12_SSE2;
+            gmx_mm_pr  c6_SSE3,c12_SSE3;
+#endif
+#endif
+
+            /* Intermediate variables for LJ calculation */
+#ifndef LJ_COMB_LB
+            gmx_mm_pr  rinvsix_SSE0;
+            gmx_mm_pr  rinvsix_SSE1;
+#ifndef HALF_LJ
+            gmx_mm_pr  rinvsix_SSE2;
+            gmx_mm_pr  rinvsix_SSE3;
+#endif
+#endif
+#ifdef LJ_COMB_LB
+            gmx_mm_pr  sir_SSE0,sir2_SSE0,sir6_SSE0;
+            gmx_mm_pr  sir_SSE1,sir2_SSE1,sir6_SSE1;
+#ifndef HALF_LJ
+            gmx_mm_pr  sir_SSE2,sir2_SSE2,sir6_SSE2;
+            gmx_mm_pr  sir_SSE3,sir2_SSE3,sir6_SSE3;
+#endif
+#endif
+
+            gmx_mm_pr  FrLJ6_SSE0,FrLJ12_SSE0;
+            gmx_mm_pr  FrLJ6_SSE1,FrLJ12_SSE1;
+#ifndef HALF_LJ
+            gmx_mm_pr  FrLJ6_SSE2,FrLJ12_SSE2;
+            gmx_mm_pr  FrLJ6_SSE3,FrLJ12_SSE3;
+#endif
+#ifdef CALC_ENERGIES
+            gmx_mm_pr  VLJ6_SSE0,VLJ12_SSE0,VLJ_SSE0;
+            gmx_mm_pr  VLJ6_SSE1,VLJ12_SSE1,VLJ_SSE1;
+#ifndef HALF_LJ
+            gmx_mm_pr  VLJ6_SSE2,VLJ12_SSE2,VLJ_SSE2;
+            gmx_mm_pr  VLJ6_SSE3,VLJ12_SSE3,VLJ_SSE3;
+#endif
+#endif
+#endif /* CALC_LJ */
+
+            /* j-cluster index */
+            cj            = l_cj[cjind].cj;
+
+            /* Atom indices (of the first atom in the cluster) */
+            aj            = cj*UNROLLJ;
+#if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB)
+#if UNROLLJ == STRIDE
+            aj2           = aj*2;
+#else
+            aj2           = (cj>>1)*2*STRIDE + (cj & 1)*UNROLLJ;
+#endif
+#endif
+#if UNROLLJ == STRIDE
+            ajx           = aj*DIM;
+#else
+            ajx           = (cj>>1)*DIM*STRIDE + (cj & 1)*UNROLLJ;
+#endif
+            ajy           = ajx + STRIDE;
+            ajz           = ajy + STRIDE;
+
+#ifdef CHECK_EXCLS
+#ifndef GMX_MM256_HERE
+            {
+                /* Load integer interaction mask */
+                __m128i mask_int = _mm_set1_epi32(l_cj[cjind].excl);
+
+                /* There is no unequal sse instruction, so we need a not here */
+                int_SSE0  = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask0),zeroi_SSE));
+                int_SSE1  = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask1),zeroi_SSE));
+                int_SSE2  = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask2),zeroi_SSE));
+                int_SSE3  = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask3),zeroi_SSE));
+            }
+#else
+            {
+#ifndef GMX_DOUBLE
+                /* Load integer interaction mask */
+                /* With AVX there are no integer operations, so cast to real */
+                gmx_mm_pr mask_pr = gmx_mm_castsi256_pr(_mm256_set1_epi32(l_cj[cjind].excl));
+                /* We can't compare all 4*8=32 float bits: shift the mask */
+                gmx_mm_pr masksh_pr = gmx_mm_castsi256_pr(_mm256_set1_epi32(l_cj[cjind].excl>>(2*UNROLLJ)));
+                /* Intel Compiler version 12.1.3 20120130 is buggy: use cast.
+                 * With gcc we don't need the cast, but it's faster.
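+                 * The cvt+cmpneq construct below turns each per-pair
+                 * exclusion bit into a full-width interaction mask:
+                 * lanes whose bit survives the AND compare non-equal
+                 * to zero and so become all-ones.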
+ */ +#define cast_cvt(x) _mm256_cvtepi32_ps(_mm256_castps_si256(x)) + int_SSE0 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr,mask0)),zero_SSE); + int_SSE1 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr,mask1)),zero_SSE); + int_SSE2 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(masksh_pr,mask0)),zero_SSE); + int_SSE3 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(masksh_pr,mask1)),zero_SSE); +#undef cast_cvt +#else + /* Load integer interaction mask */ + /* With AVX there are no integer operations, + * and there is no int to double conversion, so cast to float + */ + __m256 mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(l_cj[cjind].excl)); +#define cast_cvt(x) _mm256_castps_pd(_mm256_cvtepi32_ps(_mm256_castps_si256(x))) + int_SSE0 = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps,mask0)),zero_SSE); + int_SSE1 = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps,mask1)),zero_SSE); + int_SSE2 = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps,mask2)),zero_SSE); + int_SSE3 = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps,mask3)),zero_SSE); +#undef cast_cvt +#endif + } +#endif +#endif + /* load j atom coordinates */ + jxSSE = gmx_load_pr(x+ajx); + jySSE = gmx_load_pr(x+ajy); + jzSSE = gmx_load_pr(x+ajz); + + /* Calculate distance */ + dx_SSE0 = gmx_sub_pr(ix_SSE0,jxSSE); + dy_SSE0 = gmx_sub_pr(iy_SSE0,jySSE); + dz_SSE0 = gmx_sub_pr(iz_SSE0,jzSSE); + dx_SSE1 = gmx_sub_pr(ix_SSE1,jxSSE); + dy_SSE1 = gmx_sub_pr(iy_SSE1,jySSE); + dz_SSE1 = gmx_sub_pr(iz_SSE1,jzSSE); + dx_SSE2 = gmx_sub_pr(ix_SSE2,jxSSE); + dy_SSE2 = gmx_sub_pr(iy_SSE2,jySSE); + dz_SSE2 = gmx_sub_pr(iz_SSE2,jzSSE); + dx_SSE3 = gmx_sub_pr(ix_SSE3,jxSSE); + dy_SSE3 = gmx_sub_pr(iy_SSE3,jySSE); + dz_SSE3 = gmx_sub_pr(iz_SSE3,jzSSE); + + /* rsq = dx*dx+dy*dy+dz*dz */ + rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0); + rsq_SSE1 = gmx_calc_rsq_pr(dx_SSE1,dy_SSE1,dz_SSE1); + rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2,dy_SSE2,dz_SSE2); + rsq_SSE3 = gmx_calc_rsq_pr(dx_SSE3,dy_SSE3,dz_SSE3); + +#ifndef CUTOFF_BLENDV + wco_SSE0 = gmx_cmplt_pr(rsq_SSE0,rc2_SSE); + wco_SSE1 = gmx_cmplt_pr(rsq_SSE1,rc2_SSE); + wco_SSE2 = gmx_cmplt_pr(rsq_SSE2,rc2_SSE); + wco_SSE3 = gmx_cmplt_pr(rsq_SSE3,rc2_SSE); +#endif + +#ifdef CHECK_EXCLS +#ifdef EXCL_FORCES + /* Only remove the (sub-)diagonal to avoid double counting */ +#if UNROLLJ == UNROLLI + if (cj == ci_sh) + { + wco_SSE0 = gmx_and_pr(wco_SSE0,diag_SSE0); + wco_SSE1 = gmx_and_pr(wco_SSE1,diag_SSE1); + wco_SSE2 = gmx_and_pr(wco_SSE2,diag_SSE2); + wco_SSE3 = gmx_and_pr(wco_SSE3,diag_SSE3); + } +#else +#if UNROLLJ < UNROLLI + if (cj == ci_sh*2) + { + wco_SSE0 = gmx_and_pr(wco_SSE0,diag0_SSE0); + wco_SSE1 = gmx_and_pr(wco_SSE1,diag0_SSE1); + wco_SSE2 = gmx_and_pr(wco_SSE2,diag0_SSE2); + wco_SSE3 = gmx_and_pr(wco_SSE3,diag0_SSE3); + } + if (cj == ci_sh*2 + 1) + { + wco_SSE0 = gmx_and_pr(wco_SSE0,diag1_SSE0); + wco_SSE1 = gmx_and_pr(wco_SSE1,diag1_SSE1); + wco_SSE2 = gmx_and_pr(wco_SSE2,diag1_SSE2); + wco_SSE3 = gmx_and_pr(wco_SSE3,diag1_SSE3); + } +#else + if (cj*2 == ci_sh) + { + wco_SSE0 = gmx_and_pr(wco_SSE0,diag0_SSE0); + wco_SSE1 = gmx_and_pr(wco_SSE1,diag0_SSE1); + wco_SSE2 = gmx_and_pr(wco_SSE2,diag0_SSE2); + wco_SSE3 = gmx_and_pr(wco_SSE3,diag0_SSE3); + } + else if (cj*2 + 1 == ci_sh) + { + wco_SSE0 = gmx_and_pr(wco_SSE0,diag1_SSE0); + wco_SSE1 = gmx_and_pr(wco_SSE1,diag1_SSE1); + wco_SSE2 = gmx_and_pr(wco_SSE2,diag1_SSE2); + wco_SSE3 = gmx_and_pr(wco_SSE3,diag1_SSE3); + } +#endif +#endif +#else /* EXCL_FORCES */ + /* Remove all excluded atom pairs from the list */ + wco_SSE0 = gmx_and_pr(wco_SSE0,int_SSE0); + wco_SSE1 = 
gmx_and_pr(wco_SSE1,int_SSE1); + wco_SSE2 = gmx_and_pr(wco_SSE2,int_SSE2); + wco_SSE3 = gmx_and_pr(wco_SSE3,int_SSE3); +#endif +#endif + +#ifdef COUNT_PAIRS + { + int i,j; + real tmp[UNROLLJ]; + for(i=0; ifloat cast */ + rf_SSE0 = gmx_floor_pr(rs_SSE0); + rf_SSE1 = gmx_floor_pr(rs_SSE1); + rf_SSE2 = gmx_floor_pr(rs_SSE2); + rf_SSE3 = gmx_floor_pr(rs_SSE3); +#else + rf_SSE0 = gmx_cvtepi32_pr(ti_SSE0); + rf_SSE1 = gmx_cvtepi32_pr(ti_SSE1); + rf_SSE2 = gmx_cvtepi32_pr(ti_SSE2); + rf_SSE3 = gmx_cvtepi32_pr(ti_SSE3); +#endif + frac_SSE0 = gmx_sub_pr(rs_SSE0,rf_SSE0); + frac_SSE1 = gmx_sub_pr(rs_SSE1,rf_SSE1); + frac_SSE2 = gmx_sub_pr(rs_SSE2,rf_SSE2); + frac_SSE3 = gmx_sub_pr(rs_SSE3,rf_SSE3); + + /* Load and interpolate table forces and possibly energies. + * Force and energy can be combined in one table, stride 4: FDV0 + * or in two separate tables with stride 1: F and V + * Currently single precision uses FDV0, double F and V. + */ +#ifndef CALC_ENERGIES + load_table_f(tab_coul_F,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0); + load_table_f(tab_coul_F,ti_SSE1,ti1,ctab0_SSE1,ctab1_SSE1); + load_table_f(tab_coul_F,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2); + load_table_f(tab_coul_F,ti_SSE3,ti3,ctab0_SSE3,ctab1_SSE3); +#else +#ifdef TAB_FDV0 + load_table_f_v(tab_coul_F,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0,ctabv_SSE0); + load_table_f_v(tab_coul_F,ti_SSE1,ti1,ctab0_SSE1,ctab1_SSE1,ctabv_SSE1); + load_table_f_v(tab_coul_F,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2,ctabv_SSE2); + load_table_f_v(tab_coul_F,ti_SSE3,ti3,ctab0_SSE3,ctab1_SSE3,ctabv_SSE3); +#else + load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0,ctabv_SSE0); + load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE1,ti1,ctab0_SSE1,ctab1_SSE1,ctabv_SSE1); + load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2,ctabv_SSE2); + load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE3,ti3,ctab0_SSE3,ctab1_SSE3,ctabv_SSE3); +#endif +#endif + fsub_SSE0 = gmx_add_pr(ctab0_SSE0,gmx_mul_pr(frac_SSE0,ctab1_SSE0)); + fsub_SSE1 = gmx_add_pr(ctab0_SSE1,gmx_mul_pr(frac_SSE1,ctab1_SSE1)); + fsub_SSE2 = gmx_add_pr(ctab0_SSE2,gmx_mul_pr(frac_SSE2,ctab1_SSE2)); + fsub_SSE3 = gmx_add_pr(ctab0_SSE3,gmx_mul_pr(frac_SSE3,ctab1_SSE3)); + frcoul_SSE0 = gmx_mul_pr(qq_SSE0,gmx_sub_pr(rinv_ex_SSE0,gmx_mul_pr(fsub_SSE0,r_SSE0))); + frcoul_SSE1 = gmx_mul_pr(qq_SSE1,gmx_sub_pr(rinv_ex_SSE1,gmx_mul_pr(fsub_SSE1,r_SSE1))); + frcoul_SSE2 = gmx_mul_pr(qq_SSE2,gmx_sub_pr(rinv_ex_SSE2,gmx_mul_pr(fsub_SSE2,r_SSE2))); + frcoul_SSE3 = gmx_mul_pr(qq_SSE3,gmx_sub_pr(rinv_ex_SSE3,gmx_mul_pr(fsub_SSE3,r_SSE3))); + +#ifdef CALC_ENERGIES + vc_sub_SSE0 = gmx_add_pr(ctabv_SSE0,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE0),gmx_add_pr(ctab0_SSE0,fsub_SSE0))); + vc_sub_SSE1 = gmx_add_pr(ctabv_SSE1,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE1),gmx_add_pr(ctab0_SSE1,fsub_SSE1))); + vc_sub_SSE2 = gmx_add_pr(ctabv_SSE2,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE2),gmx_add_pr(ctab0_SSE2,fsub_SSE2))); + vc_sub_SSE3 = gmx_add_pr(ctabv_SSE3,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE3),gmx_add_pr(ctab0_SSE3,fsub_SSE3))); + +#ifndef NO_SHIFT_EWALD + /* Add Ewald potential shift to vc_sub for convenience */ +#ifdef CHECK_EXCLS + vc_sub_SSE0 = gmx_add_pr(vc_sub_SSE0,gmx_and_pr(sh_ewald_SSE,int_SSE0)); + vc_sub_SSE1 = gmx_add_pr(vc_sub_SSE1,gmx_and_pr(sh_ewald_SSE,int_SSE1)); + vc_sub_SSE2 = gmx_add_pr(vc_sub_SSE2,gmx_and_pr(sh_ewald_SSE,int_SSE2)); + vc_sub_SSE3 = gmx_add_pr(vc_sub_SSE3,gmx_and_pr(sh_ewald_SSE,int_SSE3)); +#else + vc_sub_SSE0 = gmx_add_pr(vc_sub_SSE0,sh_ewald_SSE); + vc_sub_SSE1 = 
gmx_add_pr(vc_sub_SSE1,sh_ewald_SSE); + vc_sub_SSE2 = gmx_add_pr(vc_sub_SSE2,sh_ewald_SSE); + vc_sub_SSE3 = gmx_add_pr(vc_sub_SSE3,sh_ewald_SSE); +#endif +#endif + + vcoul_SSE0 = gmx_mul_pr(qq_SSE0,gmx_sub_pr(rinv_ex_SSE0,vc_sub_SSE0)); + vcoul_SSE1 = gmx_mul_pr(qq_SSE1,gmx_sub_pr(rinv_ex_SSE1,vc_sub_SSE1)); + vcoul_SSE2 = gmx_mul_pr(qq_SSE2,gmx_sub_pr(rinv_ex_SSE2,vc_sub_SSE2)); + vcoul_SSE3 = gmx_mul_pr(qq_SSE3,gmx_sub_pr(rinv_ex_SSE3,vc_sub_SSE3)); + +#endif +#endif + +#ifdef CALC_ENERGIES + /* Mask energy for cut-off and diagonal */ + vcoul_SSE0 = gmx_and_pr(vcoul_SSE0,wco_SSE0); + vcoul_SSE1 = gmx_and_pr(vcoul_SSE1,wco_SSE1); + vcoul_SSE2 = gmx_and_pr(vcoul_SSE2,wco_SSE2); + vcoul_SSE3 = gmx_and_pr(vcoul_SSE3,wco_SSE3); +#endif + +#endif /* CALC_COULOMB */ + +#ifdef CALC_LJ + /* Lennard-Jones interaction */ + +#ifdef VDW_CUTOFF_CHECK + wco_vdw_SSE0 = gmx_cmplt_pr(rsq_SSE0,rcvdw2_SSE); + wco_vdw_SSE1 = gmx_cmplt_pr(rsq_SSE1,rcvdw2_SSE); +#ifndef HALF_LJ + wco_vdw_SSE2 = gmx_cmplt_pr(rsq_SSE2,rcvdw2_SSE); + wco_vdw_SSE3 = gmx_cmplt_pr(rsq_SSE3,rcvdw2_SSE); +#endif +#else + /* Same cut-off for Coulomb and VdW, reuse the registers */ +#define wco_vdw_SSE0 wco_SSE0 +#define wco_vdw_SSE1 wco_SSE1 +#define wco_vdw_SSE2 wco_SSE2 +#define wco_vdw_SSE3 wco_SSE3 +#endif + +#ifndef LJ_COMB_LB + rinvsix_SSE0 = gmx_mul_pr(rinvsq_SSE0,gmx_mul_pr(rinvsq_SSE0,rinvsq_SSE0)); + rinvsix_SSE1 = gmx_mul_pr(rinvsq_SSE1,gmx_mul_pr(rinvsq_SSE1,rinvsq_SSE1)); +#ifdef EXCL_FORCES + rinvsix_SSE0 = gmx_and_pr(rinvsix_SSE0,int_SSE0); + rinvsix_SSE1 = gmx_and_pr(rinvsix_SSE1,int_SSE1); +#endif +#ifndef HALF_LJ + rinvsix_SSE2 = gmx_mul_pr(rinvsq_SSE2,gmx_mul_pr(rinvsq_SSE2,rinvsq_SSE2)); + rinvsix_SSE3 = gmx_mul_pr(rinvsq_SSE3,gmx_mul_pr(rinvsq_SSE3,rinvsq_SSE3)); +#ifdef EXCL_FORCES + rinvsix_SSE2 = gmx_and_pr(rinvsix_SSE2,int_SSE2); + rinvsix_SSE3 = gmx_and_pr(rinvsix_SSE3,int_SSE3); +#endif +#endif +#ifdef VDW_CUTOFF_CHECK + rinvsix_SSE0 = gmx_and_pr(rinvsix_SSE0,wco_vdw_SSE0); + rinvsix_SSE1 = gmx_and_pr(rinvsix_SSE1,wco_vdw_SSE1); +#ifndef HALF_LJ + rinvsix_SSE2 = gmx_and_pr(rinvsix_SSE2,wco_vdw_SSE2); + rinvsix_SSE3 = gmx_and_pr(rinvsix_SSE3,wco_vdw_SSE3); +#endif +#endif + FrLJ6_SSE0 = gmx_mul_pr(c6_SSE0,rinvsix_SSE0); + FrLJ6_SSE1 = gmx_mul_pr(c6_SSE1,rinvsix_SSE1); +#ifndef HALF_LJ + FrLJ6_SSE2 = gmx_mul_pr(c6_SSE2,rinvsix_SSE2); + FrLJ6_SSE3 = gmx_mul_pr(c6_SSE3,rinvsix_SSE3); +#endif + FrLJ12_SSE0 = gmx_mul_pr(c12_SSE0,gmx_mul_pr(rinvsix_SSE0,rinvsix_SSE0)); + FrLJ12_SSE1 = gmx_mul_pr(c12_SSE1,gmx_mul_pr(rinvsix_SSE1,rinvsix_SSE1)); +#ifndef HALF_LJ + FrLJ12_SSE2 = gmx_mul_pr(c12_SSE2,gmx_mul_pr(rinvsix_SSE2,rinvsix_SSE2)); + FrLJ12_SSE3 = gmx_mul_pr(c12_SSE3,gmx_mul_pr(rinvsix_SSE3,rinvsix_SSE3)); +#endif +#endif /* not LJ_COMB_LB */ + +#ifdef LJ_COMB_LB + sir_SSE0 = gmx_mul_pr(sig_SSE0,rinv_SSE0); + sir_SSE1 = gmx_mul_pr(sig_SSE1,rinv_SSE1); +#ifndef HALF_LJ + sir_SSE2 = gmx_mul_pr(sig_SSE2,rinv_SSE2); + sir_SSE3 = gmx_mul_pr(sig_SSE3,rinv_SSE3); +#endif + sir2_SSE0 = gmx_mul_pr(sir_SSE0,sir_SSE0); + sir2_SSE1 = gmx_mul_pr(sir_SSE1,sir_SSE1); +#ifndef HALF_LJ + sir2_SSE2 = gmx_mul_pr(sir_SSE2,sir_SSE2); + sir2_SSE3 = gmx_mul_pr(sir_SSE3,sir_SSE3); +#endif + sir6_SSE0 = gmx_mul_pr(sir2_SSE0,gmx_mul_pr(sir2_SSE0,sir2_SSE0)); + sir6_SSE1 = gmx_mul_pr(sir2_SSE1,gmx_mul_pr(sir2_SSE1,sir2_SSE1)); +#ifdef EXCL_FORCES + sir6_SSE0 = gmx_and_pr(sir6_SSE0,int_SSE0); + sir6_SSE1 = gmx_and_pr(sir6_SSE1,int_SSE1); +#endif +#ifndef HALF_LJ + sir6_SSE2 = gmx_mul_pr(sir2_SSE2,gmx_mul_pr(sir2_SSE2,sir2_SSE2)); + 
sir6_SSE3 = gmx_mul_pr(sir2_SSE3,gmx_mul_pr(sir2_SSE3,sir2_SSE3)); +#ifdef EXCL_FORCES + sir6_SSE2 = gmx_and_pr(sir6_SSE2,int_SSE2); + sir6_SSE3 = gmx_and_pr(sir6_SSE3,int_SSE3); +#endif +#endif +#ifdef VDW_CUTOFF_CHECK + sir6_SSE0 = gmx_and_pr(sir6_SSE0,wco_vdw_SSE0); + sir6_SSE1 = gmx_and_pr(sir6_SSE1,wco_vdw_SSE1); +#ifndef HALF_LJ + sir6_SSE2 = gmx_and_pr(sir6_SSE2,wco_vdw_SSE2); + sir6_SSE3 = gmx_and_pr(sir6_SSE3,wco_vdw_SSE3); +#endif +#endif + FrLJ6_SSE0 = gmx_mul_pr(eps_SSE0,sir6_SSE0); + FrLJ6_SSE1 = gmx_mul_pr(eps_SSE1,sir6_SSE1); +#ifndef HALF_LJ + FrLJ6_SSE2 = gmx_mul_pr(eps_SSE2,sir6_SSE2); + FrLJ6_SSE3 = gmx_mul_pr(eps_SSE3,sir6_SSE3); +#endif + FrLJ12_SSE0 = gmx_mul_pr(FrLJ6_SSE0,sir6_SSE0); + FrLJ12_SSE1 = gmx_mul_pr(FrLJ6_SSE1,sir6_SSE1); +#ifndef HALF_LJ + FrLJ12_SSE2 = gmx_mul_pr(FrLJ6_SSE2,sir6_SSE2); + FrLJ12_SSE3 = gmx_mul_pr(FrLJ6_SSE3,sir6_SSE3); +#endif +#if defined CALC_ENERGIES + /* We need C6 and C12 to calculate the LJ potential shift */ + sig2_SSE0 = gmx_mul_pr(sig_SSE0,sig_SSE0); + sig2_SSE1 = gmx_mul_pr(sig_SSE1,sig_SSE1); +#ifndef HALF_LJ + sig2_SSE2 = gmx_mul_pr(sig_SSE2,sig_SSE2); + sig2_SSE3 = gmx_mul_pr(sig_SSE3,sig_SSE3); +#endif + sig6_SSE0 = gmx_mul_pr(sig2_SSE0,gmx_mul_pr(sig2_SSE0,sig2_SSE0)); + sig6_SSE1 = gmx_mul_pr(sig2_SSE1,gmx_mul_pr(sig2_SSE1,sig2_SSE1)); +#ifndef HALF_LJ + sig6_SSE2 = gmx_mul_pr(sig2_SSE2,gmx_mul_pr(sig2_SSE2,sig2_SSE2)); + sig6_SSE3 = gmx_mul_pr(sig2_SSE3,gmx_mul_pr(sig2_SSE3,sig2_SSE3)); +#endif + c6_SSE0 = gmx_mul_pr(eps_SSE0,sig6_SSE0); + c6_SSE1 = gmx_mul_pr(eps_SSE1,sig6_SSE1); +#ifndef HALF_LJ + c6_SSE2 = gmx_mul_pr(eps_SSE2,sig6_SSE2); + c6_SSE3 = gmx_mul_pr(eps_SSE3,sig6_SSE3); +#endif + c12_SSE0 = gmx_mul_pr(c6_SSE0,sig6_SSE0); + c12_SSE1 = gmx_mul_pr(c6_SSE1,sig6_SSE1); +#ifndef HALF_LJ + c12_SSE2 = gmx_mul_pr(c6_SSE2,sig6_SSE2); + c12_SSE3 = gmx_mul_pr(c6_SSE3,sig6_SSE3); +#endif +#endif +#endif /* LJ_COMB_LB */ + +#endif /* CALC_LJ */ + +#ifdef CALC_ENERGIES +#ifdef ENERGY_GROUPS + /* Extract the group pair index per j pair */ +#if UNROLLJ == 2 + egps_j = nbat->energrp[cj>>1]; + egp_jj[0] = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride; +#else + egps_j = nbat->energrp[cj]; + for(jj=0; jj<(UNROLLJ>>1); jj++) + { + egp_jj[jj] = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride; + } +#endif +#endif + +#ifdef CALC_COULOMB +#ifndef ENERGY_GROUPS + vctotSSE = gmx_add_pr(vctotSSE, gmx_sum4_pr(vcoul_SSE0,vcoul_SSE1,vcoul_SSE2,vcoul_SSE3)); +#else + add_ener_grp(vcoul_SSE0,vctp[0],egp_jj); + add_ener_grp(vcoul_SSE1,vctp[1],egp_jj); + add_ener_grp(vcoul_SSE2,vctp[2],egp_jj); + add_ener_grp(vcoul_SSE3,vctp[3],egp_jj); +#endif +#endif + +#ifdef CALC_LJ + /* Calculate the LJ energies */ + VLJ6_SSE0 = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE0,gmx_mul_pr(c6_SSE0,sh_invrc6_SSE))); + VLJ6_SSE1 = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE1,gmx_mul_pr(c6_SSE1,sh_invrc6_SSE))); +#ifndef HALF_LJ + VLJ6_SSE2 = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE2,gmx_mul_pr(c6_SSE2,sh_invrc6_SSE))); + VLJ6_SSE3 = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE3,gmx_mul_pr(c6_SSE3,sh_invrc6_SSE))); +#endif + VLJ12_SSE0 = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE0,gmx_mul_pr(c12_SSE0,sh_invrc12_SSE))); + VLJ12_SSE1 = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE1,gmx_mul_pr(c12_SSE1,sh_invrc12_SSE))); +#ifndef HALF_LJ + VLJ12_SSE2 = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE2,gmx_mul_pr(c12_SSE2,sh_invrc12_SSE))); + VLJ12_SSE3 = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE3,gmx_mul_pr(c12_SSE3,sh_invrc12_SSE))); +#endif + + 
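+            /* The LJ energies below include the potential shift at the
+             * cut-off, as in the plain-C reference kernel:
+             * V = (FrLJ12 - c12/rc^12)/12 - (FrLJ6 - c6/rc^6)/6,
+             * with sh_invrc6 = 1/rc^6 and sh_invrc12 = sh_invrc6^2.
+             */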
VLJ_SSE0 = gmx_sub_pr(VLJ12_SSE0,VLJ6_SSE0); + VLJ_SSE1 = gmx_sub_pr(VLJ12_SSE1,VLJ6_SSE1); +#ifndef HALF_LJ + VLJ_SSE2 = gmx_sub_pr(VLJ12_SSE2,VLJ6_SSE2); + VLJ_SSE3 = gmx_sub_pr(VLJ12_SSE3,VLJ6_SSE3); +#endif + /* The potential shift should be removed for pairs beyond cut-off */ + VLJ_SSE0 = gmx_and_pr(VLJ_SSE0,wco_vdw_SSE0); + VLJ_SSE1 = gmx_and_pr(VLJ_SSE1,wco_vdw_SSE1); +#ifndef HALF_LJ + VLJ_SSE2 = gmx_and_pr(VLJ_SSE2,wco_vdw_SSE2); + VLJ_SSE3 = gmx_and_pr(VLJ_SSE3,wco_vdw_SSE3); +#endif +#ifdef CHECK_EXCLS + /* The potential shift should be removed for excluded pairs */ + VLJ_SSE0 = gmx_and_pr(VLJ_SSE0,int_SSE0); + VLJ_SSE1 = gmx_and_pr(VLJ_SSE1,int_SSE1); +#ifndef HALF_LJ + VLJ_SSE2 = gmx_and_pr(VLJ_SSE2,int_SSE2); + VLJ_SSE3 = gmx_and_pr(VLJ_SSE3,int_SSE3); +#endif +#endif +#ifndef ENERGY_GROUPS + VvdwtotSSE = gmx_add_pr(VvdwtotSSE, +#ifndef HALF_LJ + gmx_sum4_pr(VLJ_SSE0,VLJ_SSE1,VLJ_SSE2,VLJ_SSE3) +#else + gmx_add_pr(VLJ_SSE0,VLJ_SSE1) +#endif + ); +#else + add_ener_grp(VLJ_SSE0,vvdwtp[0],egp_jj); + add_ener_grp(VLJ_SSE1,vvdwtp[1],egp_jj); +#ifndef HALF_LJ + add_ener_grp(VLJ_SSE2,vvdwtp[2],egp_jj); + add_ener_grp(VLJ_SSE3,vvdwtp[3],egp_jj); +#endif +#endif +#endif /* CALC_LJ */ +#endif /* CALC_ENERGIES */ + +#ifdef CALC_LJ + fscal_SSE0 = gmx_mul_pr(rinvsq_SSE0, +#ifdef CALC_COULOMB + gmx_add_pr(frcoul_SSE0, +#else + ( +#endif + gmx_sub_pr(FrLJ12_SSE0,FrLJ6_SSE0))); + fscal_SSE1 = gmx_mul_pr(rinvsq_SSE1, +#ifdef CALC_COULOMB + gmx_add_pr(frcoul_SSE1, +#else + ( +#endif + gmx_sub_pr(FrLJ12_SSE1,FrLJ6_SSE1))); +#else + fscal_SSE0 = gmx_mul_pr(rinvsq_SSE0,frcoul_SSE0); + fscal_SSE1 = gmx_mul_pr(rinvsq_SSE1,frcoul_SSE1); +#endif /* CALC_LJ */ +#if defined CALC_LJ && !defined HALF_LJ + fscal_SSE2 = gmx_mul_pr(rinvsq_SSE2, +#ifdef CALC_COULOMB + gmx_add_pr(frcoul_SSE2, +#else + ( +#endif + gmx_sub_pr(FrLJ12_SSE2,FrLJ6_SSE2))); + fscal_SSE3 = gmx_mul_pr(rinvsq_SSE3, +#ifdef CALC_COULOMB + gmx_add_pr(frcoul_SSE3, +#else + ( +#endif + gmx_sub_pr(FrLJ12_SSE3,FrLJ6_SSE3))); +#else + /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */ + fscal_SSE2 = gmx_mul_pr(rinvsq_SSE2,frcoul_SSE2); + fscal_SSE3 = gmx_mul_pr(rinvsq_SSE3,frcoul_SSE3); +#endif + + /* Calculate temporary vectorial force */ + tx_SSE0 = gmx_mul_pr(fscal_SSE0,dx_SSE0); + tx_SSE1 = gmx_mul_pr(fscal_SSE1,dx_SSE1); + tx_SSE2 = gmx_mul_pr(fscal_SSE2,dx_SSE2); + tx_SSE3 = gmx_mul_pr(fscal_SSE3,dx_SSE3); + ty_SSE0 = gmx_mul_pr(fscal_SSE0,dy_SSE0); + ty_SSE1 = gmx_mul_pr(fscal_SSE1,dy_SSE1); + ty_SSE2 = gmx_mul_pr(fscal_SSE2,dy_SSE2); + ty_SSE3 = gmx_mul_pr(fscal_SSE3,dy_SSE3); + tz_SSE0 = gmx_mul_pr(fscal_SSE0,dz_SSE0); + tz_SSE1 = gmx_mul_pr(fscal_SSE1,dz_SSE1); + tz_SSE2 = gmx_mul_pr(fscal_SSE2,dz_SSE2); + tz_SSE3 = gmx_mul_pr(fscal_SSE3,dz_SSE3); + + /* Increment i atom force */ + fix_SSE0 = gmx_add_pr(fix_SSE0,tx_SSE0); + fix_SSE1 = gmx_add_pr(fix_SSE1,tx_SSE1); + fix_SSE2 = gmx_add_pr(fix_SSE2,tx_SSE2); + fix_SSE3 = gmx_add_pr(fix_SSE3,tx_SSE3); + fiy_SSE0 = gmx_add_pr(fiy_SSE0,ty_SSE0); + fiy_SSE1 = gmx_add_pr(fiy_SSE1,ty_SSE1); + fiy_SSE2 = gmx_add_pr(fiy_SSE2,ty_SSE2); + fiy_SSE3 = gmx_add_pr(fiy_SSE3,ty_SSE3); + fiz_SSE0 = gmx_add_pr(fiz_SSE0,tz_SSE0); + fiz_SSE1 = gmx_add_pr(fiz_SSE1,tz_SSE1); + fiz_SSE2 = gmx_add_pr(fiz_SSE2,tz_SSE2); + fiz_SSE3 = gmx_add_pr(fiz_SSE3,tz_SSE3); + + /* Decrement j atom force */ + gmx_store_pr(f+ajx, + gmx_sub_pr( gmx_load_pr(f+ajx), gmx_sum4_pr(tx_SSE0,tx_SSE1,tx_SSE2,tx_SSE3) )); + gmx_store_pr(f+ajy, + gmx_sub_pr( gmx_load_pr(f+ajy), gmx_sum4_pr(ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3) 
)); + gmx_store_pr(f+ajz, + gmx_sub_pr( gmx_load_pr(f+ajz), gmx_sum4_pr(tz_SSE0,tz_SSE1,tz_SSE2,tz_SSE3) )); + } + +#undef rinv_ex_SSE0 +#undef rinv_ex_SSE1 +#undef rinv_ex_SSE2 +#undef rinv_ex_SSE3 + +#undef wco_vdw_SSE0 +#undef wco_vdw_SSE1 +#undef wco_vdw_SSE2 +#undef wco_vdw_SSE3 + +#undef CUTOFF_BLENDV + +#undef EXCL_FORCES diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_outer.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_outer.h new file mode 100644 index 0000000000..960d783e5e --- /dev/null +++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_outer.h @@ -0,0 +1,760 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2009, The GROMACS Development Team + * + * Gromacs is a library for molecular simulation and trajectory analysis, + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for + * a full list of developers and information, check out http://www.gromacs.org + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) any + * later version. + * As a special exception, you may use this file as part of a free software + * library without restriction. Specifically, if other files instantiate + * templates or use macros or inline functions from this file, or you compile + * this file and link it with other files to produce an executable, this + * file does not by itself cause the resulting executable to be covered by + * the GNU Lesser General Public License. + * + * In plain-speak: do not worry about classes/macros/templates either - only + * changes to the library have to be LGPL, not an application linking with it. + * + * To help fund GROMACS development, we humbly ask that you cite + * the papers people have written on it - you can find them on the website! + */ + +/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */ +#include "gmx_x86_simd_macros.h" + +#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3]) + +#define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE +#define UNROLLJ GMX_X86_SIMD_WIDTH_HERE + +#if defined GMX_MM128_HERE || defined GMX_DOUBLE +#define STRIDE 4 +#endif +#if defined GMX_MM256_HERE && !defined GMX_DOUBLE +#define STRIDE 8 +#endif + +#ifdef GMX_MM128_HERE +#ifndef GMX_DOUBLE +/* SSE single precision 4x4 kernel */ +#define SUM_SIMD(x) SUM_SIMD4(x) +#define TAB_FDV0 +#else +/* SSE double precision 4x2 kernel */ +#define SUM_SIMD(x) (x[0]+x[1]) +#endif +#endif + +#ifdef GMX_MM256_HERE +#ifndef GMX_DOUBLE +/* AVX single precision 4x8 kernel */ +#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7]) +#define TAB_FDV0 +#else +/* AVX double precision 4x4 kernel */ +#define SUM_SIMD(x) SUM_SIMD4(x) +#endif +#endif + +#define SIMD_MASK_ALL 0xffffffff + +#include "nbnxn_kernel_x86_simd_utils.h" + +/* All functionality defines are set here, except for: + * CALC_ENERGIES, ENERGY_GROUPS which are defined before. + * CHECK_EXCLS, which is set just before including the inner loop contents. + * The combination rule defines, LJ_COMB_GEOM or LJ_COMB_LB are currently + * set before calling the kernel function. 
We might want to move that + * to inside the n-loop and have a different combination rule for different + * ci's, as no combination rule gives a 50% performance hit for LJ. + */ + +/* We always calculate shift forces, because it's cheap anyhow */ +#define CALC_SHIFTFORCES + +/* Assumes all LJ parameters are identical */ +/* #define FIX_LJ_C */ + +#define NBK_FUNC_NAME_C_LJC(b,s,c,ljc,e) b##_##s##_##c##_comb_##ljc##_##e + +#if defined LJ_COMB_GEOM +#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,geom,e) +#else +#if defined LJ_COMB_LB +#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,lb,e) +#else +#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,none,e) +#endif +#endif + +#ifdef CALC_COUL_RF +#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,rf,e) +#endif +#ifdef CALC_COUL_TAB +#ifndef VDW_CUTOFF_CHECK +#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab,e) +#else +#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab_twin,e) +#endif +#endif + +#ifdef GMX_MM128_HERE +#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd128,e) +#endif +#ifdef GMX_MM256_HERE +#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd256,e) +#endif + +static void +#ifndef CALC_ENERGIES +NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,noener) +#else +#ifndef ENERGY_GROUPS +NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,ener) +#else +NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp) +#endif +#endif +#undef NBK_FUNC_NAME +#undef NBK_FUNC_NAME_C +#undef NBK_FUNC_NAME_C_LJC + (const nbnxn_pairlist_t *nbl, + const nbnxn_atomdata_t *nbat, + const interaction_const_t *ic, + rvec *shift_vec, + real *f +#ifdef CALC_SHIFTFORCES + , + real *fshift +#endif +#ifdef CALC_ENERGIES + , + real *Vvdw, + real *Vc +#endif + ) +{ + const nbnxn_ci_t *nbln; + const nbnxn_cj_t *l_cj; + const int *type; + const real *q; + const real *shiftvec; + const real *x; + const real *nbfp0,*nbfp1,*nbfp2=NULL,*nbfp3=NULL; + real facel; + real *nbfp_ptr; + int nbfp_stride; + int n,ci,ci_sh; + int ish,ish3; + gmx_bool half_LJ,do_coul; + int sci,scix,sciy,sciz,sci2; + int cjind0,cjind1,cjind; + int ip,jp; + +#ifdef ENERGY_GROUPS + int Vstride_i; + int egps_ishift,egps_imask; + int egps_jshift,egps_jmask,egps_jstride; + int egps_i; + real *vvdwtp[UNROLLI]; + real *vctp[UNROLLI]; +#endif + + gmx_mm_pr shX_SSE; + gmx_mm_pr shY_SSE; + gmx_mm_pr shZ_SSE; + gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0; + gmx_mm_pr ix_SSE1,iy_SSE1,iz_SSE1; + gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2; + gmx_mm_pr ix_SSE3,iy_SSE3,iz_SSE3; + gmx_mm_pr fix_SSE0,fiy_SSE0,fiz_SSE0; + gmx_mm_pr fix_SSE1,fiy_SSE1,fiz_SSE1; + gmx_mm_pr fix_SSE2,fiy_SSE2,fiz_SSE2; + gmx_mm_pr fix_SSE3,fiy_SSE3,fiz_SSE3; +#if UNROLLJ >= 4 +#ifndef GMX_DOUBLE + __m128 fix_SSE,fiy_SSE,fiz_SSE; +#else + __m256d fix_SSE,fiy_SSE,fiz_SSE; +#endif +#else + __m128d fix0_SSE,fiy0_SSE,fiz0_SSE; + __m128d fix2_SSE,fiy2_SSE,fiz2_SSE; +#endif + +#ifndef GMX_MM256_HERE +#ifndef GMX_DOUBLE + __m128i mask0 = _mm_set_epi32( 0x0008, 0x0004, 0x0002, 0x0001 ); + __m128i mask1 = _mm_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010 ); + __m128i mask2 = _mm_set_epi32( 0x0800, 0x0400, 0x0200, 0x0100 ); + __m128i mask3 = _mm_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000 ); +#else + /* For double precision we need to set two 32bit ints for one double */ + __m128i mask0 = _mm_set_epi32( 0x0002, 0x0002, 0x0001, 0x0001 ); + __m128i mask1 = _mm_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004 ); + __m128i mask2 = _mm_set_epi32( 0x0020, 0x0020, 0x0010, 0x0010 ); + __m128i mask3 = _mm_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040 ); +#endif 
+#else + /* AVX: use floating point masks, as there are no integer instructions */ +#ifndef GMX_DOUBLE + gmx_mm_pr mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 )); + gmx_mm_pr mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 )); +#else + /* There is no 256-bit int to double conversion, so we use float here */ + __m256 mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004, 0x0002, 0x0002, 0x0001, 0x0001 )); + __m256 mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040, 0x0020, 0x0020, 0x0010, 0x0010 )); + __m256 mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0800, 0x0800, 0x0400, 0x0400, 0x0200, 0x0200, 0x0100, 0x0100 )); + __m256 mask3 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x8000, 0x4000, 0x4000, 0x2000, 0x2000, 0x1000, 0x1000 )); +#endif +#endif + +#ifndef GMX_MM256_HERE +#ifndef GMX_DOUBLE + __m128 diag_SSE0 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 )); + __m128 diag_SSE1 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 )); + __m128 diag_SSE2 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000 )); + __m128 diag_SSE3 = gmx_mm_castsi128_pr( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); +#else + __m128d diag0_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 )); + __m128d diag0_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); + __m128d diag0_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); + __m128d diag0_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); + __m128d diag1_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff )); + __m128d diag1_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff )); + __m128d diag1_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 )); + __m128d diag1_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); +#endif +#else /* GMX_MM256_HERE */ +#ifndef GMX_DOUBLE + gmx_mm_pr diag0_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 )); + gmx_mm_pr diag0_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 )); + gmx_mm_pr diag0_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 )); + gmx_mm_pr diag0_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); + gmx_mm_pr diag1_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); + gmx_mm_pr diag1_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); + gmx_mm_pr diag1_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); + gmx_mm_pr diag1_SSE3 = _mm256_castsi256_ps( 
_mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); +#else + gmx_mm_pr diag_SSE0 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 )); + gmx_mm_pr diag_SSE1 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); + gmx_mm_pr diag_SSE2 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); + gmx_mm_pr diag_SSE3 = _mm256_castsi256_pd( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); +#endif +#endif + +#ifndef GMX_MM256_HERE + __m128i zeroi_SSE = _mm_setzero_si128(); +#endif +#ifdef GMX_X86_SSE4_1 + gmx_mm_pr zero_SSE = gmx_set1_pr(0); +#endif + + gmx_mm_pr one_SSE=gmx_set1_pr(1.0); + gmx_mm_pr iq_SSE0=gmx_setzero_pr(); + gmx_mm_pr iq_SSE1=gmx_setzero_pr(); + gmx_mm_pr iq_SSE2=gmx_setzero_pr(); + gmx_mm_pr iq_SSE3=gmx_setzero_pr(); + gmx_mm_pr mrc_3_SSE; +#ifdef CALC_ENERGIES + gmx_mm_pr hrc_3_SSE,moh_rc_SSE; +#endif +#ifdef CALC_COUL_TAB + /* Coulomb table variables */ + gmx_mm_pr invtsp_SSE; + const real *tab_coul_F; +#ifndef TAB_FDV0 + const real *tab_coul_V; +#endif +#ifdef GMX_MM256_HERE + int ti0_array[2*UNROLLJ-1],*ti0; + int ti1_array[2*UNROLLJ-1],*ti1; + int ti2_array[2*UNROLLJ-1],*ti2; + int ti3_array[2*UNROLLJ-1],*ti3; +#endif +#ifdef CALC_ENERGIES + gmx_mm_pr mhalfsp_SSE; + gmx_mm_pr sh_ewald_SSE; +#endif +#endif + +#ifdef LJ_COMB_LB + const real *ljc; + + gmx_mm_pr hsig_i_SSE0,seps_i_SSE0; + gmx_mm_pr hsig_i_SSE1,seps_i_SSE1; + gmx_mm_pr hsig_i_SSE2,seps_i_SSE2; + gmx_mm_pr hsig_i_SSE3,seps_i_SSE3; +#else +#ifdef FIX_LJ_C + real pvdw_array[2*UNROLLI*UNROLLJ+3]; + real *pvdw_c6,*pvdw_c12; + gmx_mm_pr c6_SSE0,c12_SSE0; + gmx_mm_pr c6_SSE1,c12_SSE1; + gmx_mm_pr c6_SSE2,c12_SSE2; + gmx_mm_pr c6_SSE3,c12_SSE3; +#endif + +#ifdef LJ_COMB_GEOM + const real *ljc; + + gmx_mm_pr c6s_SSE0,c12s_SSE0; + gmx_mm_pr c6s_SSE1,c12s_SSE1; + gmx_mm_pr c6s_SSE2=gmx_setzero_pr(),c12s_SSE2=gmx_setzero_pr(); + gmx_mm_pr c6s_SSE3=gmx_setzero_pr(),c12s_SSE3=gmx_setzero_pr(); +#endif +#endif /* LJ_COMB_LB */ + + gmx_mm_pr vctotSSE,VvdwtotSSE; + gmx_mm_pr sixthSSE,twelvethSSE; + + gmx_mm_pr avoid_sing_SSE; + gmx_mm_pr rc2_SSE; +#ifdef VDW_CUTOFF_CHECK + gmx_mm_pr rcvdw2_SSE; +#endif + +#ifdef CALC_ENERGIES + gmx_mm_pr sh_invrc6_SSE,sh_invrc12_SSE; + + /* cppcheck-suppress unassignedVariable */ + real tmpsum_array[15],*tmpsum; +#endif +#ifdef CALC_SHIFTFORCES + /* cppcheck-suppress unassignedVariable */ + real shf_array[15],*shf; +#endif + + int ninner; + +#ifdef COUNT_PAIRS + int npair=0; +#endif + +#if defined LJ_COMB_GEOM || defined LJ_COMB_LB + ljc = nbat->lj_comb; +#else + /* No combination rule used */ +#ifndef GMX_DOUBLE + nbfp_ptr = nbat->nbfp_s4; +#define NBFP_STRIDE 4 +#else + nbfp_ptr = nbat->nbfp; +#define NBFP_STRIDE 2 +#endif + nbfp_stride = NBFP_STRIDE; +#endif + +#ifdef CALC_COUL_TAB +#ifdef GMX_MM256_HERE + /* Generate aligned table pointers */ + ti0 = (int *)(((size_t)(ti0_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1)))); + ti1 = (int *)(((size_t)(ti1_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1)))); + ti2 = (int *)(((size_t)(ti2_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1)))); + ti3 = (int *)(((size_t)(ti3_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1)))); +#endif + + 
invtsp_SSE = gmx_set1_pr(ic->tabq_scale);
+#ifdef CALC_ENERGIES
+    mhalfsp_SSE = gmx_set1_pr(-0.5/ic->tabq_scale);
+
+    sh_ewald_SSE = gmx_set1_pr(ic->sh_ewald);
+#endif
+
+#ifdef TAB_FDV0
+    tab_coul_F = ic->tabq_coul_FDV0;
+#else
+    tab_coul_F = ic->tabq_coul_F;
+    tab_coul_V = ic->tabq_coul_V;
+#endif
+#endif
+
+    q        = nbat->q;
+    type     = nbat->type;
+    facel    = ic->epsfac;
+    shiftvec = shift_vec[0];
+    x        = nbat->x;
+
+    avoid_sing_SSE = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
+
+    /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
+    rc2_SSE = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
+#ifdef VDW_CUTOFF_CHECK
+    rcvdw2_SSE = gmx_set1_pr(ic->rvdw*ic->rvdw);
+#endif
+
+#ifdef CALC_ENERGIES
+    sixthSSE    = gmx_set1_pr(1.0/6.0);
+    twelvethSSE = gmx_set1_pr(1.0/12.0);
+
+    sh_invrc6_SSE  = gmx_set1_pr(ic->sh_invrc6);
+    sh_invrc12_SSE = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
+#endif
+
+    mrc_3_SSE = gmx_set1_pr(-2*ic->k_rf);
+
+#ifdef CALC_ENERGIES
+    hrc_3_SSE = gmx_set1_pr(ic->k_rf);
+
+    moh_rc_SSE = gmx_set1_pr(-ic->c_rf);
+#endif
+
+#ifdef CALC_ENERGIES
+    tmpsum = (real *)(((size_t)(tmpsum_array+7)) & (~((size_t)31)));
+#endif
+#ifdef CALC_SHIFTFORCES
+    shf = (real *)(((size_t)(shf_array+7)) & (~((size_t)31)));
+#endif
+
+#ifdef FIX_LJ_C
+    pvdw_c6  = (real *)(((size_t)(pvdw_array+3)) & (~((size_t)15)));
+    pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
+
+    for(jp=0; jp<UNROLLJ; jp++)
+    {
+        pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
+        pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
+        pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
+        pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];
+
+        pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+        pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+        pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+        pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
+    }
+    c6_SSE0  = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
+    c6_SSE1  = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
+    c6_SSE2  = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
+    c6_SSE3  = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
+
+    c12_SSE0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
+    c12_SSE1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
+    c12_SSE2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
+    c12_SSE3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
+#endif /* FIX_LJ_C */
+
+#ifdef ENERGY_GROUPS
+    egps_ishift  = nbat->neg_2log;
+    egps_imask   = (1<<egps_ishift) - 1;
+    egps_jshift  = nbat->neg_2log;
+    egps_jmask   = (1<<egps_jshift) - 1;
+    egps_jstride = (UNROLLJ>>1)*UNROLLJ;
+    /* Major division is over i-particles: divide nVS by 4 for i-stride */
+    Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
+#endif
+
+    l_cj = nbl->cj;
+
+    ninner = 0;
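For readers following the ENERGY_GROUPS path: nbat->energrp packs one energy-group index per atom into a single integer, neg_2log bits each, and the egps_* shift/mask/stride values set up above unpack that integer per cluster. A minimal self-contained sketch of the same pack/unpack arithmetic; the constants and names here are hypothetical, not the GROMACS data layout:

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
        /* Assumed example: 4 energy groups, so 2 bits per index */
        const int neg_2log = 2;
        const int shift    = neg_2log;
        const int mask     = (1 << shift) - 1;
        /* Pack group indices 3,1,2,0 of four cluster atoms into one int */
        const int packed   = (3 << 0*shift) | (1 << 1*shift) |
                             (2 << 2*shift) | (0 << 3*shift);
        int jj;

        for (jj = 0; jj < 4; jj++)
        {
            /* Same expression shape as the egps_j/egp_jj decode above */
            printf("atom %d is in group %d\n", jj, (packed >> (jj*shift)) & mask);
        }
        assert(((packed >> 2*shift) & mask) == 2);
        return 0;
    }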
+    for(n=0; n<nbl->nci; n++)
+    {
+        nbln = &nbl->ci[n];
+
+        ish    = (nbln->shift & NBNXN_CI_SHIFT);
+        ish3   = ish*3;
+        cjind0 = nbln->cj_ind_start;
+        cjind1 = nbln->cj_ind_end;
+        /* Currently only works for super-cells equal to sub-cells */
+        ci     = nbln->ci;
+        ci_sh  = (ish == CENTRAL ? ci : -1);
+
+        shX_SSE = gmx_load1_pr(shiftvec+ish3);
+        shY_SSE = gmx_load1_pr(shiftvec+ish3+1);
+        shZ_SSE = gmx_load1_pr(shiftvec+ish3+2);
+
+#if UNROLLJ <= 4
+        sci   = ci*STRIDE;
+        scix  = sci*DIM;
+        sci2  = sci*2;
+#else
+        sci   = (ci>>1)*STRIDE;
+        scix  = sci*DIM + (ci & 1)*(STRIDE>>1);
+        sci2  = sci*2 + (ci & 1)*(STRIDE>>1);
+        sci  += (ci & 1)*(STRIDE>>1);
+#endif
+
+        half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+        do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+
+#ifdef ENERGY_GROUPS
+        egps_i = nbat->energrp[ci];
+        {
+            int ia,egp_ia;
+
+            for(ia=0; ia<4; ia++)
+            {
+                egp_ia     = (egps_i >> (ia*egps_ishift)) & egps_imask;
+                vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
+                vctp[ia]   = Vc   + egp_ia*Vstride_i;
+            }
+        }
+#endif
+#if defined CALC_ENERGIES
+#if UNROLLJ == 4
+        if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
+#endif
+#if UNROLLJ == 2
+        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
+#endif
+#if UNROLLJ == 8
+        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
+#endif
+        {
+            int  ia;
+            real Vc_sub_self;
+
+#ifdef CALC_COUL_RF
+            Vc_sub_self = 0.5*ic->c_rf;
+#endif
+#ifdef CALC_COUL_TAB
+#ifdef TAB_FDV0
+            Vc_sub_self = 0.5*tab_coul_F[2];
+#else
+            Vc_sub_self = 0.5*tab_coul_V[0];
+#endif
+#endif
+
+            for(ia=0; ia<UNROLLI; ia++)
+            {
+                real qi;
+
+                qi = q[sci+ia];
+#ifdef ENERGY_GROUPS
+                vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
+#else
+                Vc[0]
+#endif
+                    -= facel*qi*qi*Vc_sub_self;
+            }
+        }
+#endif
+
+        /* Load i atom data */
+        sciy = scix + STRIDE;
+        sciz = sciy + STRIDE;
+        ix_SSE0 = gmx_add_pr(gmx_load1_pr(x+scix)  ,shX_SSE);
+        ix_SSE1 = gmx_add_pr(gmx_load1_pr(x+scix+1),shX_SSE);
+        ix_SSE2 = gmx_add_pr(gmx_load1_pr(x+scix+2),shX_SSE);
+        ix_SSE3 = gmx_add_pr(gmx_load1_pr(x+scix+3),shX_SSE);
+        iy_SSE0 = gmx_add_pr(gmx_load1_pr(x+sciy)  ,shY_SSE);
+        iy_SSE1 = gmx_add_pr(gmx_load1_pr(x+sciy+1),shY_SSE);
+        iy_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciy+2),shY_SSE);
+        iy_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciy+3),shY_SSE);
+        iz_SSE0 = gmx_add_pr(gmx_load1_pr(x+sciz)  ,shZ_SSE);
+        iz_SSE1 = gmx_add_pr(gmx_load1_pr(x+sciz+1),shZ_SSE);
+        iz_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciz+2),shZ_SSE);
+        iz_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciz+3),shZ_SSE);
+
+        /* With half_LJ we currently always calculate Coulomb interactions */
+        if (do_coul || half_LJ)
+        {
+            iq_SSE0 = gmx_set1_pr(facel*q[sci]);
+            iq_SSE1 = gmx_set1_pr(facel*q[sci+1]);
+            iq_SSE2 = gmx_set1_pr(facel*q[sci+2]);
+            iq_SSE3 = gmx_set1_pr(facel*q[sci+3]);
+        }
+
+#ifdef LJ_COMB_LB
+        hsig_i_SSE0 = gmx_load1_pr(ljc+sci2+0);
+        hsig_i_SSE1 = gmx_load1_pr(ljc+sci2+1);
+        hsig_i_SSE2 = gmx_load1_pr(ljc+sci2+2);
+        hsig_i_SSE3 = gmx_load1_pr(ljc+sci2+3);
+        seps_i_SSE0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
+        seps_i_SSE1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
+        seps_i_SSE2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
+        seps_i_SSE3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
+#else
+#ifdef LJ_COMB_GEOM
+        c6s_SSE0 = gmx_load1_pr(ljc+sci2+0);
+        c6s_SSE1 = gmx_load1_pr(ljc+sci2+1);
+        if (!half_LJ)
+        {
+            c6s_SSE2 = gmx_load1_pr(ljc+sci2+2);
+            c6s_SSE3 = gmx_load1_pr(ljc+sci2+3);
+        }
+        c12s_SSE0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
+        c12s_SSE1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
+        if (!half_LJ)
+        {
+            c12s_SSE2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
+            c12s_SSE3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
+        }
+#else
+        nbfp0 = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
+        nbfp1 = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
+        if (!half_LJ)
+        {
+            nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
+            nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
+        }
+#endif
+#endif
+
+        /* Zero the potential energy for this list */
+        VvdwtotSSE = gmx_setzero_pr();
+        vctotSSE   = gmx_setzero_pr();
+
+        /* Clear i atom forces */
+        fix_SSE0 = gmx_setzero_pr();
+        fix_SSE1 = gmx_setzero_pr();
+        fix_SSE2 = gmx_setzero_pr();
+        fix_SSE3 = gmx_setzero_pr();
+        fiy_SSE0 = gmx_setzero_pr();
+        fiy_SSE1 = gmx_setzero_pr();
+        fiy_SSE2 = gmx_setzero_pr();
+        fiy_SSE3 = gmx_setzero_pr();
+        fiz_SSE0 = gmx_setzero_pr();
+        fiz_SSE1 = gmx_setzero_pr();
+        fiz_SSE2 = gmx_setzero_pr();
+        fiz_SSE3 = gmx_setzero_pr();
+
+        cjind = cjind0;
+
+        /* Currently all kernels use (at least half) LJ */
+#define CALC_LJ
+        if (half_LJ)
+        {
+#define CALC_COULOMB
+#define HALF_LJ
+#define CHECK_EXCLS
+            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
+            {
+#include "nbnxn_kernel_x86_simd_inner.h"
+                cjind++;
+            }
+#undef CHECK_EXCLS
+            for(; (cjind<cjind1); cjind++)
+            {
+#include "nbnxn_kernel_x86_simd_inner.h"
+            }
+#undef HALF_LJ
+#undef CALC_COULOMB
+        }
+        else if (do_coul)
+        {
+#define CALC_COULOMB
+#define CHECK_EXCLS
+            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
+            {
+#include "nbnxn_kernel_x86_simd_inner.h"
+                cjind++;
+            }
+#undef CHECK_EXCLS
+            for(; (cjind<cjind1); cjind++)
+            {
+#include "nbnxn_kernel_x86_simd_inner.h"
+            }
+#undef CALC_COULOMB
+        }
+        else
+        {
+#define CHECK_EXCLS
+            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
+            {
+#include "nbnxn_kernel_x86_simd_inner.h"
+                cjind++;
+            }
+#undef CHECK_EXCLS
+            for(; (cjind<cjind1); cjind++)
+            {
+#include "nbnxn_kernel_x86_simd_inner.h"
+            }
+        }
+#undef CALC_LJ
+        ninner += cjind1 - cjind0;
+
+        /* Add accumulated i forces to the force array */
+#if UNROLLJ >= 4
+#ifndef GMX_DOUBLE
+#define gmx_load_ps4  _mm_load_ps
+#define gmx_store_ps4 _mm_store_ps
+#define gmx_add_ps4   _mm_add_ps
+#else
+#define gmx_load_ps4  _mm256_load_pd
+#define gmx_store_ps4 _mm256_store_pd
+#define gmx_add_ps4   _mm256_add_pd
+#endif
+        GMX_MM_TRANSPOSE_SUM4_PR(fix_SSE0,fix_SSE1,fix_SSE2,fix_SSE3,fix_SSE);
+        gmx_store_ps4(f+scix, gmx_add_ps4(fix_SSE, gmx_load_ps4(f+scix)));
+
+        GMX_MM_TRANSPOSE_SUM4_PR(fiy_SSE0,fiy_SSE1,fiy_SSE2,fiy_SSE3,fiy_SSE);
+        gmx_store_ps4(f+sciy, gmx_add_ps4(fiy_SSE, gmx_load_ps4(f+sciy)));
+
+        GMX_MM_TRANSPOSE_SUM4_PR(fiz_SSE0,fiz_SSE1,fiz_SSE2,fiz_SSE3,fiz_SSE);
+        gmx_store_ps4(f+sciz, gmx_add_ps4(fiz_SSE, gmx_load_ps4(f+sciz)));
+
+#ifdef CALC_SHIFTFORCES
+        gmx_store_ps4(shf,fix_SSE);
+        fshift[ish3+0] += SUM_SIMD4(shf);
+        gmx_store_ps4(shf,fiy_SSE);
+        fshift[ish3+1] += SUM_SIMD4(shf);
+        gmx_store_ps4(shf,fiz_SSE);
+        fshift[ish3+2] += SUM_SIMD4(shf);
+#endif
+#else
+        GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE0,fix_SSE1,fix0_SSE);
+        _mm_store_pd(f+scix, _mm_add_pd(fix0_SSE, _mm_load_pd(f+scix)));
+        GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE2,fix_SSE3,fix2_SSE);
+        _mm_store_pd(f+scix+2, _mm_add_pd(fix2_SSE, _mm_load_pd(f+scix+2)));
+
+        GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE0,fiy_SSE1,fiy0_SSE);
+        _mm_store_pd(f+sciy, _mm_add_pd(fiy0_SSE, _mm_load_pd(f+sciy)));
+        GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE2,fiy_SSE3,fiy2_SSE);
+        _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_SSE, _mm_load_pd(f+sciy+2)));
+
+        GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE0,fiz_SSE1,fiz0_SSE);
+        _mm_store_pd(f+sciz, _mm_add_pd(fiz0_SSE, _mm_load_pd(f+sciz)));
+        GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE2,fiz_SSE3,fiz2_SSE);
+        _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_SSE, _mm_load_pd(f+sciz+2)));
+
+#ifdef CALC_SHIFTFORCES
+        _mm_store_pd(shf,_mm_add_pd(fix0_SSE,fix2_SSE));
+        fshift[ish3+0] += shf[0] + shf[1];
+        _mm_store_pd(shf,_mm_add_pd(fiy0_SSE,fiy2_SSE));
+        fshift[ish3+1] += shf[0] + shf[1];
+        _mm_store_pd(shf,_mm_add_pd(fiz0_SSE,fiz2_SSE));
+        fshift[ish3+2] += shf[0] + shf[1];
+#endif
+#endif
+
+#ifdef CALC_ENERGIES
+        if (do_coul)
+        {
+            gmx_store_pr(tmpsum,vctotSSE);
+            *Vc += SUM_SIMD(tmpsum);
+        }
+
+        gmx_store_pr(tmpsum,VvdwtotSSE);
+        *Vvdw += SUM_SIMD(tmpsum);
+#endif
+
+        /* Outer loop uses 6 flops/iteration */
+    }
+
+#ifdef COUNT_PAIRS
+    printf("atom pairs %d\n",npair);
+#endif
+}
+
+#undef gmx_load_ps4
+#undef gmx_store_ps4
+#undef gmx_add_ps4
+
+#undef CALC_SHIFTFORCES
+
+#undef UNROLLI
+#undef UNROLLJ
+#undef STRIDE
+#undef TAB_FDV0
+#undef NBFP_STRIDE
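The outer-loop header above generates one kernel function per electrostatics/combination-rule/energy variant: the NBK_FUNC_NAME* macros paste the function name together with the ## operator, and the inner loop is textually re-included under different flag combinations (CHECK_EXCLS, HALF_LJ, CALC_COULOMB). A minimal sketch of the name-pasting half of that technique; all macro and function names below are hypothetical stand-ins:

    #include <stdio.h>

    /* Deferred pasting: FUNC_NAME expands its arguments first,
     * then FUNC_NAME2 glues the expanded tokens together.
     */
    #define FUNC_NAME2(b,c,e) b##_##c##_##e
    #define FUNC_NAME(b,c,e)  FUNC_NAME2(b,c,e)

    #define COUL tab
    #define ENER noener

    /* Expands to: static void nbnxn_kernel_tab_noener(void) */
    static void FUNC_NAME(nbnxn_kernel,COUL,ENER)(void)
    {
        printf("one generated kernel variant\n");
    }

    int main(void)
    {
        nbnxn_kernel_tab_noener();  /* name assembled by the macros above */
        return 0;
    }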
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_utils.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_utils.h
new file mode 100644
index 0000000000..4ef4610922
--- /dev/null
+++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_utils.h
@@ -0,0 +1,489 @@
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ * As a special exception, you may use this file as part of a free software
+ * library without restriction. Specifically, if other files instantiate
+ * templates or use macros or inline functions from this file, or you compile
+ * this file and link it with other files to produce an executable, this
+ * file does not by itself cause the resulting executable to be covered by
+ * the GNU Lesser General Public License.
+ *
+ * In plain-speak: do not worry about classes/macros/templates either - only
+ * changes to the library have to be LGPL, not an application linking with it.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website!
+ */
+#ifndef _nbnxn_kernel_sse_utils_h_
+#define _nbnxn_kernel_sse_utils_h_
+
+/* This file contains all functions/macros for the SSE/AVX kernels
+ * which have explicit dependencies on the j-size / SIMD-width, which
+ * can be 2 (SSE-double), 4 (SSE-single,AVX-double) or 8 (AVX-single).
+ * The functionality which depends on the j-cluster size is: + * LJ-parameter lookup + * force table lookup + * energy group pair energy storage + */ + +#define GMX_MM_TRANSPOSE2_OP_PD(in0,in1,out0,out1) \ +{ \ + out0 = _mm_shuffle_pd(in0,in1,_MM_SHUFFLE2(0,0)); \ + out1 = _mm_shuffle_pd(in0,in1,_MM_SHUFFLE2(1,1)); \ +} + +#if defined GMX_MM128_HERE || !defined GMX_DOUBLE +#define GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(in0,in1,in2,in3,out0,out1) \ +{ \ + __m128 _c01,_c23; \ + _c01 = _mm_shuffle_ps(in0,in1,_MM_SHUFFLE(1,0,1,0)); \ + _c23 = _mm_shuffle_ps(in2,in3,_MM_SHUFFLE(1,0,1,0)); \ + out0 = _mm_shuffle_ps(_c01,_c23,_MM_SHUFFLE(2,0,2,0)); \ + out1 = _mm_shuffle_ps(_c01,_c23,_MM_SHUFFLE(3,1,3,1)); \ +} +#else +#define GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(in0,in1,in2,in3,out0,out1) \ +{ \ + __m256d _c01,_c23; \ + _c01 = _mm256_shuffle_pd(in0,in1,_MM_SHUFFLE(1,0,1,0)); \ + _c23 = _mm256_shuffle_pd(in2,in3,_MM_SHUFFLE(1,0,1,0)); \ + out0 = _mm256_shuffle_pd(_c01,_c23,_MM_SHUFFLE(2,0,2,0)); \ + out1 = _mm256_shuffle_pd(_c01,_c23,_MM_SHUFFLE(3,1,3,1)); \ +} +#endif + +#define GMX_MM_SHUFFLE_4_PS_FIL2_TO_1_PS(in0,in1,in2,in3,out) \ +{ \ + __m128 _c01,_c23; \ + _c01 = _mm_shuffle_ps(in0,in1,_MM_SHUFFLE(3,2,3,2)); \ + _c23 = _mm_shuffle_ps(in2,in3,_MM_SHUFFLE(3,2,3,2)); \ + out = _mm_shuffle_ps(_c01,_c23,_MM_SHUFFLE(2,0,2,0)); \ +} + +#ifndef GMX_MM256_HERE +#ifndef GMX_DOUBLE +#define GMX_MM_TRANSPOSE_SUM4_PR(i_SSE0,i_SSE1,i_SSE2,i_SSE3,o_SSE) \ +{ \ + _MM_TRANSPOSE4_PS(i_SSE0,i_SSE1,i_SSE2,i_SSE3); \ + i_SSE0 = _mm_add_ps(i_SSE0,i_SSE1); \ + i_SSE2 = _mm_add_ps(i_SSE2,i_SSE3); \ + o_SSE = _mm_add_ps(i_SSE0,i_SSE2); \ +} +#else +#define GMX_MM_TRANSPOSE_SUM2_PD(i_SSE0,i_SSE1,o_SSE) \ +{ \ + GMX_MM_TRANSPOSE2_PD(i_SSE0,i_SSE1); \ + o_SSE = _mm_add_pd(i_SSE0,i_SSE1); \ +} +#endif +#else +#ifndef GMX_DOUBLE +#define GMX_MM_TRANSPOSE_SUM4_PR(i_SSE0,i_SSE1,i_SSE2,i_SSE3,o_SSE) \ +{ \ + i_SSE0 = _mm256_hadd_ps(i_SSE0,i_SSE1); \ + i_SSE2 = _mm256_hadd_ps(i_SSE2,i_SSE3); \ + i_SSE1 = _mm256_hadd_ps(i_SSE0,i_SSE2); \ + o_SSE = _mm_add_ps(_mm256_castps256_ps128(i_SSE1),_mm256_extractf128_ps(i_SSE1,1)); \ +} +#else +#define GMX_MM_TRANSPOSE_SUM4_PR(i_SSE0,i_SSE1,i_SSE2,i_SSE3,o_SSE) \ +{ \ + i_SSE0 = _mm256_hadd_pd(i_SSE0,i_SSE1); \ + i_SSE2 = _mm256_hadd_pd(i_SSE2,i_SSE3); \ + o_SSE = _mm256_add_pd(_mm256_permute2f128_pd(i_SSE0,i_SSE2,0x20),_mm256_permute2f128_pd(i_SSE0,i_SSE2,0x31)); \ +} +#endif +#endif + +#ifdef GMX_MM128_HERE + +static inline __m128 +gmx_mm128_invsqrt_ps_single(__m128 x) +{ + const __m128 half = _mm_set_ps(0.5,0.5,0.5,0.5); + const __m128 three = _mm_set_ps(3.0,3.0,3.0,3.0); + + __m128 lu = _mm_rsqrt_ps(x); + + return _mm_mul_ps(half,_mm_mul_ps(_mm_sub_ps(three,_mm_mul_ps(_mm_mul_ps(lu,lu),x)),lu)); +} + +/* Do 2/4 double precision invsqrt operations. + * Doing the SSE rsqrt and the first Newton Raphson iteration + * in single precision gives full double precision accuracy. + * The speed is more than twice as fast as two gmx_mm_invsqrt_pd calls. 
+ */
+#define GMX_MM128_INVSQRT2_PD(i_SSE0,i_SSE1,o_SSE0,o_SSE1) \
+{ \
+    const __m128d half  = _mm_set1_pd(0.5); \
+    const __m128d three = _mm_set1_pd(3.0); \
+    __m128  s_SSE,ir_SSE; \
+    __m128d lu0,lu1; \
+ \
+    s_SSE  = _mm_movelh_ps(_mm_cvtpd_ps(i_SSE0),_mm_cvtpd_ps(i_SSE1)); \
+    ir_SSE = gmx_mm128_invsqrt_ps_single(s_SSE); \
+    lu0    = _mm_cvtps_pd(ir_SSE); \
+    lu1    = _mm_cvtps_pd(_mm_movehl_ps(ir_SSE,ir_SSE)); \
+    o_SSE0 = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu0,lu0),i_SSE0)),lu0)); \
+    o_SSE1 = _mm_mul_pd(half,_mm_mul_pd(_mm_sub_pd(three,_mm_mul_pd(_mm_mul_pd(lu1,lu1),i_SSE1)),lu1)); \
+}
+
+#define GMX_MM_INVSQRT2_PD GMX_MM128_INVSQRT2_PD
+
+#endif
+
+#ifdef GMX_MM256_HERE
+
+static inline __m256
+gmx_mm256_invsqrt_ps_single(__m256 x)
+{
+    const __m256 half  = _mm256_set_ps(0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5);
+    const __m256 three = _mm256_set_ps(3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0);
+
+    __m256 lu = _mm256_rsqrt_ps(x);
+
+    return _mm256_mul_ps(half,_mm256_mul_ps(_mm256_sub_ps(three,_mm256_mul_ps(_mm256_mul_ps(lu,lu),x)),lu));
+}
+
+#define GMX_MM256_INVSQRT2_PD(i_SSE0,i_SSE1,o_SSE0,o_SSE1) \
+{ \
+    const __m256d half  = _mm256_set1_pd(0.5); \
+    const __m256d three = _mm256_set1_pd(3.0); \
+    __m256  s_SSE,ir_SSE; \
+    __m256d lu0,lu1; \
+ \
+    s_SSE  = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(i_SSE0)),_mm256_cvtpd_ps(i_SSE1),1); \
+    ir_SSE = gmx_mm256_invsqrt_ps_single(s_SSE); \
+    lu0    = _mm256_cvtps_pd(_mm256_castps256_ps128(ir_SSE)); \
+    lu1    = _mm256_cvtps_pd(_mm256_extractf128_ps(ir_SSE,1)); \
+    o_SSE0 = _mm256_mul_pd(half,_mm256_mul_pd(_mm256_sub_pd(three,_mm256_mul_pd(_mm256_mul_pd(lu0,lu0),i_SSE0)),lu0)); \
+    o_SSE1 = _mm256_mul_pd(half,_mm256_mul_pd(_mm256_sub_pd(three,_mm256_mul_pd(_mm256_mul_pd(lu1,lu1),i_SSE1)),lu1)); \
+}
+
+#define GMX_MM_INVSQRT2_PD GMX_MM256_INVSQRT2_PD
+
+#endif
+
+/* Force and energy table load and interpolation routines */
+
+#if defined GMX_MM128_HERE && !defined GMX_DOUBLE
+
+#define load_lj_pair_params(nbfp,type,aj,c6_SSE,c12_SSE) \
+{ \
+    gmx_mm_pr clj_SSE[UNROLLJ]; \
+    int p; \
+ \
+    for(p=0; p<UNROLLJ; p++) \
+    { \
+        clj_SSE[p] = gmx_load_pr(nbfp+type[aj+p]*NBFP_STRIDE); \
+    } \
+    GMX_MM_SHUFFLE_4_PS_FIL01_TO_2_PS(clj_SSE[0],clj_SSE[1],clj_SSE[2],clj_SSE[3],c6_SSE,c12_SSE); \
+}
+
+#endif
+
+/* Energy group pair energy storage: add a SIMD register of energies
+ * to UNROLLJ/2 terms of the energy-group-pair energy buffers.
+ */
+static inline void add_ener_grp(gmx_mm_pr e_SSE,real *v,const int *offset_jj)
+{
+    int jj;
+
+    for(jj=0; jj<(UNROLLJ>>1); jj++)
+    {
+        gmx_mm_pr v_SSE;
+
+        v_SSE = gmx_load_pr(v+offset_jj[jj]+jj*UNROLLJ);
+        gmx_store_pr(v+offset_jj[jj]+jj*UNROLLJ,gmx_add_pr(v_SSE,e_SSE));
+    }
+}
+
+#endif /* _nbnxn_kernel_sse_utils_h_ */
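The GMX_MM*_INVSQRT2_PD macros above compute double-precision 1/sqrt(x) from a single-precision starting estimate that is then refined with Newton-Raphson steps, y <- 0.5*y*(3 - x*y*y), each of which roughly doubles the number of correct bits. A scalar sketch of that refinement, illustrative only and without intrinsics:

    #include <math.h>
    #include <stdio.h>

    /* One Newton-Raphson step for the inverse square root:
     * if y ~ 1/sqrt(x), then 0.5*y*(3 - x*y*y) is a better estimate.
     */
    static double refine_invsqrt(double x, double y)
    {
        return 0.5*y*(3.0 - x*y*y);
    }

    int main(void)
    {
        double x = 2.0;
        /* Single-precision seed, as the SIMD macros use _mm_rsqrt_ps */
        double y = (double)(1.0f/sqrtf((float)x));
        int    i;

        for (i = 0; i < 2; i++)
        {
            y = refine_invsqrt(x, y);
            printf("step %d: error %.3e\n", i+1, fabs(y - 1.0/sqrt(x)));
        }
        return 0;
    }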
diff --git a/src/mdlib/nbnxn_search.c b/src/mdlib/nbnxn_search.c
new file mode 100644
index 0000000000..477210fb6d
--- /dev/null
+++ b/src/mdlib/nbnxn_search.c
@@ -0,0 +1,6053 @@
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include <string.h>
+
+#include "sysstuff.h"
+#include "smalloc.h"
+#include "macros.h"
+#include "maths.h"
+#include "vec.h"
+#include "pbc.h"
+#include "nbnxn_search.h"
+#include "nbnxn_consts.h"
+#include "gmx_cyclecounter.h"
+#include "gmxfio.h"
+#include "gmx_omp_nthreads.h"
+#include "nrnb.h"
+
+#ifdef GMX_X86_SSE2
+#define NBNXN_SEARCH_SSE
+
+#ifndef GMX_DOUBLE
+#define NBNXN_SEARCH_SSE_SINGLE
+#include "gmx_x86_simd_single.h"
+#else
+#include "gmx_x86_simd_double.h"
+#endif
+
+#if defined NBNXN_SEARCH_SSE_SINGLE && GPU_NSUBCELL == 8
+#define NBNXN_8BB_SSE
+#endif
+
+/* The width of SSE with single precision, used for bounding boxes */
+#define SSE_F_WIDTH      4
+#define SSE_F_WIDTH_2LOG 2
+
+#endif /* NBNXN_SEARCH_SSE */
+
+/* Pair search box lower and upper corner in x,y,z.
+ * Store this in 4 instead of 3 reals, which is useful with SSE.
+ * To avoid complicating the code we also use 4 without SSE.
+ */
+#define NNBSBB_C  4
+#define NNBSBB_B  (2*NNBSBB_C)
+/* Pair search box lower and upper bound in z only. */
+#define NNBSBB_D  2
+/* Pair search box lower and upper corner x,y,z indices */
+#define BBL_X  0
+#define BBL_Y  1
+#define BBL_Z  2
+#define BBU_X  4
+#define BBU_Y  5
+#define BBU_Z  6
+
+/* Strides for x/f with xyz and xyzq coordinate (and charge) storage */
+#define STRIDE_XYZ   3
+#define STRIDE_XYZQ  4
+/* Size of packs of x, y or z with SSE/AVX packed coords/forces */
+#define PACK_X4      4
+#define PACK_X8      8
+/* Strides for a pack of 4 and 8 coordinates/forces */
+#define STRIDE_P4    (DIM*PACK_X4)
+#define STRIDE_P8    (DIM*PACK_X8)
+
+/* Index of atom a into the SSE/AVX coordinate/force array */
+#define X4_IND_A(a)  (STRIDE_P4*((a) >> 2) + ((a) & (PACK_X4 - 1)))
+#define X8_IND_A(a)  (STRIDE_P8*((a) >> 3) + ((a) & (PACK_X8 - 1)))
+
+
+#ifdef NBNXN_SEARCH_SSE
+
+/* The functions below are macros as they are performance sensitive */
+
+/* 4x4 list, pack=4: no complex conversion required */
+/* i-cluster to j-cluster conversion */
+#define CI_TO_CJ_J4(ci)  (ci)
+/* cluster index to coordinate array index conversion */
+#define X_IND_CI_J4(ci)  ((ci)*STRIDE_P4)
+#define X_IND_CJ_J4(cj)  ((cj)*STRIDE_P4)
+
+/* 4x2 list, pack=4: j-cluster size is half the packing width */
+/* i-cluster to j-cluster conversion */
+#define CI_TO_CJ_J2(ci)  ((ci)<<1)
+/* cluster index to coordinate array index conversion */
+#define X_IND_CI_J2(ci)  ((ci)*STRIDE_P4)
+#define X_IND_CJ_J2(cj)  (((cj)>>1)*STRIDE_P4 + ((cj) & 1)*(PACK_X4>>1))
+
+/* 4x8 list, pack=8: i-cluster size is half the packing width */
+/* i-cluster to j-cluster conversion */
+#define CI_TO_CJ_J8(ci)  ((ci)>>1)
+/* cluster index to coordinate array index conversion */
+#define X_IND_CI_J8(ci)  (((ci)>>1)*STRIDE_P8 + ((ci) & 1)*(PACK_X8>>1))
+#define X_IND_CJ_J8(cj)  ((cj)*STRIDE_P8)
+
+/* The j-cluster size is matched to the SIMD width */
+#ifndef GMX_DOUBLE
+/* 128 bits can hold 4 floats */
+#define CI_TO_CJ_S128(ci)  CI_TO_CJ_J4(ci)
+#define X_IND_CI_S128(ci)  X_IND_CI_J4(ci)
+#define X_IND_CJ_S128(cj)  X_IND_CJ_J4(cj)
+/* 256 bits can hold 8 floats */
+#define CI_TO_CJ_S256(ci)  CI_TO_CJ_J8(ci)
+#define X_IND_CI_S256(ci)  X_IND_CI_J8(ci)
+#define X_IND_CJ_S256(cj)  X_IND_CJ_J8(cj)
+#else
+/* 128 bits can hold 2 doubles */
+#define CI_TO_CJ_S128(ci)  CI_TO_CJ_J2(ci)
+#define X_IND_CI_S128(ci)  X_IND_CI_J2(ci)
+#define X_IND_CJ_S128(cj)  X_IND_CJ_J2(cj)
+/* 256 bits can hold 4 doubles */
+#define CI_TO_CJ_S256(ci)  CI_TO_CJ_J4(ci)
+#define X_IND_CI_S256(ci)  X_IND_CI_J4(ci)
+#define X_IND_CJ_S256(cj)  X_IND_CJ_J4(cj)
+#endif
+
+#endif /* NBNXN_SEARCH_SSE */
+
+
+/* Interaction masks for 4xN atom interactions.
+ * Bit i*CJ_SIZE + j tells if atom i and j interact.
+ */
+/* The "all" interaction mask is the same for all kernels */
+#define NBNXN_INT_MASK_ALL        0xffffffff
+/* 4x4 kernel diagonal mask */
+#define NBNXN_INT_MASK_DIAG       0x08ce
+/* 4x2 kernel diagonal masks */
+#define NBNXN_INT_MASK_DIAG_J2_0  0x0002
+#define NBNXN_INT_MASK_DIAG_J2_1  0x002F
+/* 4x8 kernel diagonal masks */
+#define NBNXN_INT_MASK_DIAG_J8_0  0xf0f8fcfe
+#define NBNXN_INT_MASK_DIAG_J8_1  0x0080c0e0
+
+
+#ifdef NBNXN_SEARCH_SSE
+/* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
+#define NBNXN_BBXXXX
+/* Size of bounding box corners quadruplet */
+#define NNBSBB_XXXX  (NNBSBB_D*DIM*SSE_F_WIDTH)
+#endif
+
+/* We shift the i-particles backward for PBC.
+ * This leads to more conditionals than shifting forward.
+ * We do this to get more balanced pair lists.
+ */
+#define NBNXN_SHIFT_BACKWARD
+
+
+/* This define is a lazy way to avoid interdependence of the grid
+ * and searching data structures.
+ */
+#define NBNXN_NA_SC_MAX  (GPU_NSUBCELL*NBNXN_GPU_CLUSTER_SIZE)
+
+#ifdef NBNXN_SEARCH_SSE
+#define GMX_MM128_HERE
+#include "gmx_x86_simd_macros.h"
+typedef struct nbnxn_x_ci_x86_simd128 {
+    /* The i-cluster coordinates for simple search */
+    gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0;
+    gmx_mm_pr ix_SSE1,iy_SSE1,iz_SSE1;
+    gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2;
+    gmx_mm_pr ix_SSE3,iy_SSE3,iz_SSE3;
+} nbnxn_x_ci_x86_simd128_t;
+#undef GMX_MM128_HERE
+#ifdef GMX_X86_AVX_256
+#define GMX_MM256_HERE
+#include "gmx_x86_simd_macros.h"
+typedef struct nbnxn_x_ci_x86_simd256 {
+    /* The i-cluster coordinates for simple search */
+    gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0;
+    gmx_mm_pr ix_SSE1,iy_SSE1,iz_SSE1;
+    gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2;
+    gmx_mm_pr ix_SSE3,iy_SSE3,iz_SSE3;
+} nbnxn_x_ci_x86_simd256_t;
+#undef GMX_MM256_HERE
+#endif
+#endif
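The diagonal masks above follow the rule stated in the comment: bit i*CJ_SIZE + j is set when atoms i and j of the diagonal cluster pair should interact, i.e. only for j > i. A small standalone check that this rule reproduces the 4x4 constant 0x08ce:

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int mask = 0;
        int i, j;

        /* Set bit i*4+j for every pair with j > i in a 4x4 cluster pair */
        for (i = 0; i < 4; i++)
        {
            for (j = i + 1; j < 4; j++)
            {
                mask |= 1u << (i*4 + j);
            }
        }
        printf("mask = 0x%04x\n", mask);
        assert(mask == 0x08ce);   /* matches NBNXN_INT_MASK_DIAG above */
        return 0;
    }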
+
+/* Working data for the actual i-supercell during pair search */
+typedef struct nbnxn_list_work {
+    gmx_cache_protect_t cp0;  /* Protect cache between threads */
+
+    float *bb_ci;    /* The bounding boxes, pbc shifted, for each cluster */
+    real  *x_ci;     /* The coordinates, pbc shifted, for each atom */
+#ifdef NBNXN_SEARCH_SSE
+    nbnxn_x_ci_x86_simd128_t *x_ci_x86_simd128;
+#ifdef GMX_X86_AVX_256
+    nbnxn_x_ci_x86_simd256_t *x_ci_x86_simd256;
+#endif
+#endif
+    int  cj_ind;     /* The current cj_ind index for the current list */
+    int  cj4_init;   /* The first uninitialized cj4 block */
+
+    float *d2;       /* Bounding box distance work array */
+
+    nbnxn_cj_t *cj;  /* The j-cell list */
+    int  cj_nalloc;  /* Allocation size of cj */
+
+    int ncj_noq;     /* Nr. of cluster pairs without Coul for flop count */
+    int ncj_hlj;     /* Nr. of cluster pairs with 1/2 LJ for flop count */
+
+    gmx_cache_protect_t cp1;  /* Protect cache between threads */
+} nbnxn_list_work_t;
+
+/* Function type for setting the i-atom coordinate working data */
+typedef void
+gmx_icell_set_x_t(int ci,
+                  real shx,real shy,real shz,
+                  int na_c,
+                  int stride,const real *x,
+                  nbnxn_list_work_t *work);
+
+static gmx_icell_set_x_t icell_set_x_simple;
+#ifdef NBNXN_SEARCH_SSE
+static gmx_icell_set_x_t icell_set_x_simple_x86_simd128;
+#ifdef GMX_X86_AVX_256
+static gmx_icell_set_x_t icell_set_x_simple_x86_simd256;
+#endif
+#endif
+static gmx_icell_set_x_t icell_set_x_supersub;
+#ifdef NBNXN_SEARCH_SSE
+static gmx_icell_set_x_t icell_set_x_supersub_sse8;
+#endif
+
+/* Function type for checking if sub-cells are within range */
+typedef gmx_bool
+gmx_subcell_in_range_t(int na_c,
+                       int si,const real *x_or_bb_i,
+                       int csj,int stride,const real *x_or_bb_j,
+                       real rl2);
+
+static gmx_subcell_in_range_t subc_in_range_x;
+static gmx_subcell_in_range_t subc_in_range_sse8;
+
+/* Local cycle count struct for profiling */
+typedef struct {
+    int          count;
+    gmx_cycles_t c;
+    gmx_cycles_t start;
+} nbnxn_cycle_t;
+
+/* Local cycle count enum for profiling */
+enum { enbsCCgrid, enbsCCsearch, enbsCCcombine, enbsCCreducef, enbsCCnr };
+
+/* A pair-search grid struct for one domain decomposition zone */
+typedef struct {
+    rvec c0;              /* The lower corner of the (local) grid */
+    rvec c1;              /* The upper corner of the (local) grid */
+    real atom_density;    /* The atom number density for the local grid */
+
+    gmx_bool bSimple;     /* Is this grid simple or super/sub */
+    int  na_c;            /* Number of atoms per cluster */
+    int  na_cj;           /* Number of atoms for list j-clusters */
+    int  na_sc;           /* Number of atoms per super-cluster */
+    int  na_c_2log;       /* 2log of na_c */
+
+    int  ncx;             /* Number of (super-)cells along x */
+    int  ncy;             /* Number of (super-)cells along y */
+    int  nc;              /* Total number of (super-)cells */
+
+    real sx;              /* x-size of a (super-)cell */
+    real sy;              /* y-size of a (super-)cell */
+    real inv_sx;          /* 1/sx */
+    real inv_sy;          /* 1/sy */
+
+    int  cell0;           /* Index in nbs->cell corresponding to cell 0 */
+
+    int  *cxy_na;         /* The number of atoms for each column in x,y */
+    int  *cxy_ind;        /* Grid (super)cell index, offset from cell0 */
+    int  cxy_nalloc;      /* Allocation size for cxy_na and cxy_ind */
+
+    int   *nsubc;         /* The number of sub cells for each super cell */
+    float *bbcz;          /* Bounding boxes in z for the super cells */
+    float *bb;            /* 3D bounding boxes for the sub cells */
+    float *bbj;           /* 3D j-b.boxes for SSE-double or AVX-single */
+    int   *flags;         /* Flag for the super cells */
+    int   nc_nalloc;      /* Allocation size for the pointers above */
+
+    float *bbcz_simple;   /* bbcz for simple grid converted from super */
+    float *bb_simple;     /* bb for simple grid converted from super */
+    int   *flags_simple;  /* flags for simple grid converted from super */
+    int   nc_nalloc_simple; /* Allocation size for the pointers above */
+
+    int  nsubc_tot;       /* Total number of subcells, used for printing */
+} nbnxn_grid_t;
+
+/* Thread-local work struct, contains part of nbnxn_grid_t */
+typedef struct {
+    gmx_cache_protect_t cp0;
+
+    int *cxy_na;
+    int cxy_na_nalloc;
+
+    int *sort_work;
+    int sort_work_nalloc;
+
+    int ndistc;           /* Number of distance checks for flop counting */
+
+    nbnxn_cycle_t cc[enbsCCnr];
+
+    gmx_cache_protect_t cp1;
+} nbnxn_search_work_t;
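Both work structs above are fenced by gmx_cache_protect_t members (cp0/cp1); the intent is that each thread's hot counters never share a cache line with another thread's data, which would otherwise cause false-sharing slowdowns. A sketch of the padding idea, assuming a hypothetical 64-byte line size (gmx_cache_protect_t itself is defined elsewhere in the tree):

    #include <stdio.h>

    /* Pad the per-thread data to at least one cache line on each side,
     * so concurrent updates by different threads touch distinct lines.
     */
    typedef struct {
        char pad0[64];   /* plays the role of gmx_cache_protect_t cp0 */
        int  ndistc;     /* hot per-thread counter, as in nbnxn_search_work_t */
        char pad1[64];   /* plays the role of cp1 */
    } thread_work_t;

    int main(void)
    {
        thread_work_t work[4];  /* one entry per thread, as in nbs->work */
        int t;

        for (t = 0; t < 4; t++)
        {
            work[t].ndistc = 0;  /* counters now live on separate lines */
        }
        printf("sizeof(thread_work_t) = %zu\n", sizeof(thread_work_t));
        return 0;
    }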
+
+/* Main pair-search struct, contains the grid(s), not the pair-list(s) */
+typedef struct nbnxn_search {
+    int  ePBC;             /* PBC type enum */
+    matrix box;            /* The periodic unit-cell */
+
+    gmx_bool DomDec;       /* Are we doing domain decomposition? */
+    ivec dd_dim;           /* Are we doing DD in x,y,z? */
+    gmx_domdec_zones_t *zones;  /* The domain decomposition zones */
+
+    int  ngrid;            /* The number of grids, equal to #DD-zones */
+    nbnxn_grid_t *grid;    /* Array of grids, size ngrid */
+    int  *cell;            /* Actual allocated cell array for all grids */
+    int  cell_nalloc;      /* Allocation size of cell */
+    int  *a;               /* Atom index for grid, the inverse of cell */
+    int  a_nalloc;         /* Allocation size of a */
+
+    int  natoms_local;     /* The local atoms run from 0 to natoms_local */
+    int  natoms_nonlocal;  /* The non-local atoms run from natoms_local
+                            * to natoms_nonlocal */
+
+    gmx_bool print_cycles;
+    int  search_count;
+    nbnxn_cycle_t cc[enbsCCnr];
+
+    gmx_icell_set_x_t *icell_set_x;  /* Function for setting i-coords */
+
+    gmx_subcell_in_range_t *subc_dc; /* Function for sub-cell range check */
+
+    int  nthread_max;      /* Maximum number of threads for pair-search */
+    nbnxn_search_work_t *work;  /* Work array, size nthread_max */
+} nbnxn_search_t_t;
+
+
+static void nbs_cycle_clear(nbnxn_cycle_t *cc)
+{
+    int i;
+
+    for(i=0; i<enbsCCnr; i++)
+    {
+        cc[i].count = 0;
+        cc[i].c     = 0;
+    }
+}
+
+static void nbs_cycle_start(nbnxn_cycle_t *cc)
+{
+    cc->start = gmx_cycles_read();
+}
+
+static void nbs_cycle_stop(nbnxn_cycle_t *cc)
+{
+    cc->c += gmx_cycles_read() - cc->start;
+    cc->count++;
+}
+
+static double Mcyc_av(const nbnxn_cycle_t *cc)
+{
+    return (double)cc->c*1e-6/cc->count;
+}
+
+static void nbs_cycle_print(FILE *fp,const nbnxn_search_t nbs)
+{
+    int n;
+    int t;
+
+    fprintf(fp,"\n");
+    fprintf(fp,"ns %4d grid %4.1f search %4.1f red.f %5.3f",
+            nbs->cc[enbsCCgrid].count,
+            Mcyc_av(&nbs->cc[enbsCCgrid]),
+            Mcyc_av(&nbs->cc[enbsCCsearch]),
+            Mcyc_av(&nbs->cc[enbsCCreducef]));
+
+    if (nbs->nthread_max > 1)
+    {
+        if (nbs->cc[enbsCCcombine].count > 0)
+        {
+            fprintf(fp," comb %5.2f",
+                    Mcyc_av(&nbs->cc[enbsCCcombine]));
+        }
+        fprintf(fp," s. th");
th"); + for(t=0; tnthread_max; t++) + { + fprintf(fp," %4.1f", + Mcyc_av(&nbs->work[t].cc[enbsCCsearch])); + } + } + fprintf(fp,"\n"); +} + +static void nbnxn_grid_init(nbnxn_grid_t * grid) +{ + grid->cxy_na = NULL; + grid->cxy_ind = NULL; + grid->cxy_nalloc = 0; + grid->bb = NULL; + grid->bbj = NULL; + grid->nc_nalloc = 0; +} + +static int get_2log(int n) +{ + int log2; + + log2 = 0; + while ((1<>1); break; + } + + return 0; +} + +gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type) +{ + if (nb_kernel_type == nbkNotSet) + { + gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list."); + } + + switch (nb_kernel_type) + { + case nbk8x8x8_CUDA: + case nbk8x8x8_PlainC: + return FALSE; + + case nbk4x4_PlainC: + case nbk4xN_X86_SIMD128: + case nbk4xN_X86_SIMD256: + return TRUE; + + default: + gmx_incons("Invalid nonbonded kernel type passed!"); + return FALSE; + } +} + +void nbnxn_init_search(nbnxn_search_t * nbs_ptr, + ivec *n_dd_cells, + gmx_domdec_zones_t *zones, + int nthread_max) +{ + nbnxn_search_t nbs; + int d,g,t; + + snew(nbs,1); + *nbs_ptr = nbs; + + nbs->DomDec = (n_dd_cells != NULL); + + clear_ivec(nbs->dd_dim); + nbs->ngrid = 1; + if (nbs->DomDec) + { + nbs->zones = zones; + + for(d=0; d 1) + { + nbs->dd_dim[d] = 1; + /* Each grid matches a DD zone */ + nbs->ngrid *= 2; + } + } + } + + snew(nbs->grid,nbs->ngrid); + for(g=0; gngrid; g++) + { + nbnxn_grid_init(&nbs->grid[g]); + } + nbs->cell = NULL; + nbs->cell_nalloc = 0; + nbs->a = NULL; + nbs->a_nalloc = 0; + + /* nbs->subc_dc is only used with super/sub setup */ +#ifdef NBNXN_8BB_SSE + nbs->subc_dc = subc_in_range_sse8; +#else + if (getenv("GMX_NBNXN_BB") != NULL) + { + /* Use only bounding box sub cell pair distances, + * fast, but produces slightly more sub cell pairs. + */ + nbs->subc_dc = NULL; + } + else + { + nbs->subc_dc = subc_in_range_x; + } +#endif + + nbs->nthread_max = nthread_max; + + /* Initialize the work data structures for each thread */ + snew(nbs->work,nbs->nthread_max); + for(t=0; tnthread_max; t++) + { + nbs->work[t].cxy_na = NULL; + nbs->work[t].cxy_na_nalloc = 0; + nbs->work[t].sort_work = NULL; + nbs->work[t].sort_work_nalloc = 0; + } + + /* Initialize detailed nbsearch cycle counting */ + nbs->print_cycles = (getenv("GMX_NBNXN_CYCLE") != 0); + nbs->search_count = 0; + nbs_cycle_clear(nbs->cc); + for(t=0; tnthread_max; t++) + { + nbs_cycle_clear(nbs->work[t].cc); + } +} + +static real grid_atom_density(int n,rvec corner0,rvec corner1) +{ + rvec size; + + rvec_sub(corner1,corner0,size); + + return n/(size[XX]*size[YY]*size[ZZ]); +} + +static int set_grid_size_xy(const nbnxn_search_t nbs, + nbnxn_grid_t *grid, + int n,rvec corner0,rvec corner1, + real atom_density, + int XFormat) +{ + rvec size; + int na_c; + real adens,tlen,tlen_x,tlen_y,nc_max; + int t; + + rvec_sub(corner1,corner0,size); + + if (n > grid->na_sc) + { + /* target cell length */ + if (grid->bSimple) + { + /* To minimize the zero interactions, we should make + * the largest of the i/j cell cubic. + */ + na_c = max(grid->na_c,grid->na_cj); + + /* Approximately cubic cells */ + tlen = pow(na_c/atom_density,1.0/3.0); + tlen_x = tlen; + tlen_y = tlen; + } + else + { + /* Approximately cubic sub cells */ + tlen = pow(grid->na_c/atom_density,1.0/3.0); + tlen_x = tlen*GPU_NSUBCELL_X; + tlen_y = tlen*GPU_NSUBCELL_Y; + } + /* We round ncx and ncy down, because we get less cell pairs + * in the nbsist when the fixed cell dimensions (x,y) are + * larger than the variable one (z) than the other way around. 
+         */
+        grid->ncx = max(1,(int)(size[XX]/tlen_x));
+        grid->ncy = max(1,(int)(size[YY]/tlen_y));
+    }
+    else
+    {
+        grid->ncx = 1;
+        grid->ncy = 1;
+    }
+
+    /* We need one additional cell entry for particles moved by DD */
+    if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
+    {
+        grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
+        srenew(grid->cxy_na,grid->cxy_nalloc);
+        srenew(grid->cxy_ind,grid->cxy_nalloc+1);
+    }
+    for(t=0; t<nbs->nthread_max; t++)
+    {
+        if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
+        {
+            nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
+            srenew(nbs->work[t].cxy_na,nbs->work[t].cxy_na_nalloc);
+        }
+    }
+
+    /* Worst case scenario of 1 atom in each last cell */
+    if (grid->na_cj <= grid->na_c)
+    {
+        nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
+    }
+    else
+    {
+        nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
+    }
+
+    if (nc_max > grid->nc_nalloc)
+    {
+        int bb_nalloc;
+
+        grid->nc_nalloc = over_alloc_large(nc_max);
+        srenew(grid->nsubc,grid->nc_nalloc);
+        srenew(grid->bbcz,grid->nc_nalloc*NNBSBB_D);
+#ifdef NBNXN_8BB_SSE
+        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/SSE_F_WIDTH*NNBSBB_XXXX;
+#else
+        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL*NNBSBB_B;
+#endif
+        sfree_aligned(grid->bb);
+        /* This snew also zeros the contents, this avoids possible
+         * floating exceptions in SSE with the unused bb elements.
+         */
+        snew_aligned(grid->bb,bb_nalloc,16);
+
+        if (grid->bSimple)
+        {
+            if (grid->na_cj == grid->na_c)
+            {
+                grid->bbj = grid->bb;
+            }
+            else
+            {
+                sfree_aligned(grid->bbj);
+                snew_aligned(grid->bbj,bb_nalloc*grid->na_c/grid->na_cj,16);
+            }
+        }
+
+        srenew(grid->flags,grid->nc_nalloc);
+    }
+
+    copy_rvec(corner0,grid->c0);
+    copy_rvec(corner1,grid->c1);
+    grid->sx     = size[XX]/grid->ncx;
+    grid->sy     = size[YY]/grid->ncy;
+    grid->inv_sx = 1/grid->sx;
+    grid->inv_sy = 1/grid->sy;
+
+    return nc_max;
+}
+
+#define SORT_GRID_OVERSIZE 2
+#define SGSF (SORT_GRID_OVERSIZE + 1)
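The sort_atoms routine that follows implements an index sort into a slot array oversized by SORT_GRID_OVERSIZE: each atom's coordinate maps to a slot, and occupied slots are resolved by moving up to the next free one. A simplified, self-contained sketch of that scheme; the real routine below additionally keeps colliding atoms ordered on their coordinate and supports backwards output:

    #include <stdio.h>

    #define OVERSIZE 2   /* plays the role of SORT_GRID_OVERSIZE */

    int main(void)
    {
        double x[]   = { 0.9, 0.1, 0.5, 0.45 };  /* coordinates in [0,1) */
        int    n     = 4;
        int    nsort = OVERSIZE*4 + 1;
        int    sort[OVERSIZE*4 + 1];
        int    i, zi, c = 0;

        for (i = 0; i < nsort; i++)
        {
            sort[i] = -1;                  /* clear the oversized slot array */
        }
        for (i = 0; i < n; i++)
        {
            zi = (int)(x[i]*OVERSIZE*n);   /* map coordinate to a slot */
            while (sort[zi] >= 0)
            {
                zi++;                      /* slot taken: probe upwards */
            }
            sort[zi] = i;
        }
        for (zi = 0; zi < nsort; zi++)     /* read back in slot order */
        {
            if (sort[zi] >= 0)
            {
                printf("%d: x = %.2f\n", c++, x[sort[zi]]);
            }
        }
        return 0;
    }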
+
+static void sort_atoms(int dim,gmx_bool Backwards,
+                       int *a,int n,rvec *x,
+                       real h0,real invh,int nsort,int *sort)
+{
+    int i,c;
+    int zi,zim;
+    int cp,tmp;
+
+    if (n <= 1)
+    {
+        /* Nothing to do */
+        return;
+    }
+
+    /* For small oversize factors clearing the whole area is fastest.
+     * For large oversize we should clear the used elements after use.
+     */
+    for(i=0; i<nsort; i++)
+    {
+        sort[i] = -1;
+    }
+
+    /* Sort the particles using a simple index sort */
+    for(i=0; i<n; i++)
+    {
+        /* The cast takes care of float-point rounding effects below zero.
+         * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
+         * times the box height out of the box.
+         */
+        zi = (int)((x[a[i]][dim] - h0)*invh);
+
+#ifdef DEBUG_NBNXN_GRIDDING
+        if (zi < 0 || zi >= nsort)
+        {
+            gmx_fatal(FARGS,"(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d\n",
+                      a[i],'x'+dim,x[a[i]][dim],h0,invh,zi,nsort);
+        }
+#endif
+
+        /* Ideally this particle should go in sort cell zi,
+         * but that might already be in use,
+         * in that case find the first empty cell higher up
+         */
+        if (sort[zi] < 0)
+        {
+            sort[zi] = a[i];
+        }
+        else
+        {
+            /* We have multiple atoms in the same sorting slot.
+             * Sort on real z for minimal bounding box size.
+             * There is an extra check for identical z to ensure
+             * well-defined output order, independent of input order,
+             * to ensure binary reproducibility after restarts.
+             */
+            while(sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
+                                    (x[a[i]][dim] == x[sort[zi]][dim] &&
+                                     a[i] > sort[zi])))
+            {
+                zi++;
+            }
+
+            if (sort[zi] >= 0)
+            {
+                /* Shift all elements by one slot until we find an empty slot */
+                cp  = sort[zi];
+                zim = zi + 1;
+                while (sort[zim] >= 0)
+                {
+                    tmp       = sort[zim];
+                    sort[zim] = cp;
+                    cp        = tmp;
+                    zim++;
+                }
+                sort[zim] = cp;
+            }
+            sort[zi] = a[i];
+        }
+    }
+
+    c = 0;
+    if (!Backwards)
+    {
+        for(zi=0; zi<nsort; zi++)
+        {
+            if (sort[zi] >= 0)
+            {
+                a[c++] = sort[zi];
+            }
+        }
+    }
+    else
+    {
+        for(zi=nsort-1; zi>=0; zi--)
+        {
+            if (sort[zi] >= 0)
+            {
+                a[c++] = sort[zi];
+            }
+        }
+    }
+    if (c < n)
+    {
+        gmx_incons("Lost particles while sorting");
+    }
+}
+
+#ifdef GMX_DOUBLE
+#define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
+#define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
+#else
+#define R2F_D(x) (x)
+#define R2F_U(x) (x)
+#endif
+
+/* Coordinate order x,y,z, bb order xyz0 */
+static void calc_bounding_box(int na,int stride,const real *x,float *bb)
+{
+    int  i,j;
+    real xl,xh,yl,yh,zl,zh;
+
+    i = 0;
+    xl = x[i+XX];
+    xh = x[i+XX];
+    yl = x[i+YY];
+    yh = x[i+YY];
+    zl = x[i+ZZ];
+    zh = x[i+ZZ];
+    i += stride;
+    for(j=1; j<na; j++)
+    {
+        xl = min(xl,x[i+XX]);
+        xh = max(xh,x[i+XX]);
+        yl = min(yl,x[i+YY]);
+        yh = max(yh,x[i+YY]);
+        zl = min(zl,x[i+ZZ]);
+        zh = max(zh,x[i+ZZ]);
+        i += stride;
+    }
+    /* Note: possible double to float conversion here */
+    bb[BBL_X] = R2F_D(xl);
+    bb[BBL_Y] = R2F_D(yl);
+    bb[BBL_Z] = R2F_D(zl);
+    bb[BBU_X] = R2F_U(xh);
+    bb[BBU_Y] = R2F_U(yh);
+    bb[BBU_Z] = R2F_U(zh);
+}
+
+/* Packed coordinates, bb order xyz0 */
+static void calc_bounding_box_x_x4(int na,const real *x,float *bb)
+{
+    int  j;
+    real xl,xh,yl,yh,zl,zh;
+
+    xl = x[XX*PACK_X4];
+    xh = x[XX*PACK_X4];
+    yl = x[YY*PACK_X4];
+    yh = x[YY*PACK_X4];
+    zl = x[ZZ*PACK_X4];
+    zh = x[ZZ*PACK_X4];
+    for(j=1; j<na; j++)
+    {
+        xl = min(xl,x[j+XX*PACK_X4]);
+        xh = max(xh,x[j+XX*PACK_X4]);
+        yl = min(yl,x[j+YY*PACK_X4]);
+        yh = max(yh,x[j+YY*PACK_X4]);
+        zl = min(zl,x[j+ZZ*PACK_X4]);
+        zh = max(zh,x[j+ZZ*PACK_X4]);
+    }
+    bb[BBL_X] = R2F_D(xl);
+    bb[BBL_Y] = R2F_D(yl);
+    bb[BBL_Z] = R2F_D(zl);
+    bb[BBU_X] = R2F_U(xh);
+    bb[BBU_Y] = R2F_U(yh);
+    bb[BBU_Z] = R2F_U(zh);
+}
+
+/* Packed coordinates, bb order xyz0 */
+static void calc_bounding_box_x_x8(int na,const real *x,float *bb)
+{
+    int  j;
+    real xl,xh,yl,yh,zl,zh;
+
+    xl = x[XX*PACK_X8];
+    xh = x[XX*PACK_X8];
+    yl = x[YY*PACK_X8];
+    yh = x[YY*PACK_X8];
+    zl = x[ZZ*PACK_X8];
+    zh = x[ZZ*PACK_X8];
+    for(j=1; j<na; j++)
+    {
+        xl = min(xl,x[j+XX*PACK_X8]);
+        xh = max(xh,x[j+XX*PACK_X8]);
+        yl = min(yl,x[j+YY*PACK_X8]);
+        yh = max(yh,x[j+YY*PACK_X8]);
+        zl = min(zl,x[j+ZZ*PACK_X8]);
+        zh = max(zh,x[j+ZZ*PACK_X8]);
+    }
+    bb[BBL_X] = R2F_D(xl);
+    bb[BBL_Y] = R2F_D(yl);
+    bb[BBL_Z] = R2F_D(zl);
+    bb[BBU_X] = R2F_U(xh);
+    bb[BBU_Y] = R2F_U(yh);
+    bb[BBU_Z] = R2F_U(zh);
+}
+
+#ifdef NBNXN_SEARCH_SSE
+
+/* Packed coordinates, bb order xyz0 */
+static void calc_bounding_box_x_x4_halves(int na,const real *x,
+                                          float *bb,float *bbj)
+{
+    calc_bounding_box_x_x4(min(na,2),x,bbj);
+
+    if (na > 2)
+    {
+        calc_bounding_box_x_x4(min(na-2,2),x+(PACK_X4>>1),bbj+NNBSBB_B);
+    }
+    else
+    {
+        /* Set the "empty" bounding box to the same as the first one,
+         * so we don't need to treat special cases in the rest of the code.
+         */
+        _mm_store_ps(bbj+NNBSBB_B         ,_mm_load_ps(bbj));
+        _mm_store_ps(bbj+NNBSBB_B+NNBSBB_C,_mm_load_ps(bbj+NNBSBB_C));
+    }
+
+    _mm_store_ps(bb         ,_mm_min_ps(_mm_load_ps(bbj),
+                                        _mm_load_ps(bbj+NNBSBB_B)));
+    _mm_store_ps(bb+NNBSBB_C,_mm_max_ps(_mm_load_ps(bbj+NNBSBB_C),
+                                        _mm_load_ps(bbj+NNBSBB_B+NNBSBB_C)));
+}
+
+/* Coordinate order xyz, bb order xxxxyyyyzzzz */
+static void calc_bounding_box_xxxx(int na,int stride,const real *x,float *bb)
+{
+    int  i,j;
+    real xl,xh,yl,yh,zl,zh;
+
+    i = 0;
+    xl = x[i+XX];
+    xh = x[i+XX];
+    yl = x[i+YY];
+    yh = x[i+YY];
+    zl = x[i+ZZ];
+    zh = x[i+ZZ];
+    i += stride;
+    for(j=1; j<na; j++)
+    {
+        xl = min(xl,x[i+XX]);
+        xh = max(xh,x[i+XX]);
+        yl = min(yl,x[i+YY]);
+        yh = max(yh,x[i+YY]);
+        zl = min(zl,x[i+ZZ]);
+        zh = max(zh,x[i+ZZ]);
+        i += stride;
+    }
+    /* Note: possible double to float conversion here */
+    bb[0*SSE_F_WIDTH] = R2F_D(xl);
+    bb[1*SSE_F_WIDTH] = R2F_D(yl);
+    bb[2*SSE_F_WIDTH] = R2F_D(zl);
+    bb[3*SSE_F_WIDTH] = R2F_U(xh);
+    bb[4*SSE_F_WIDTH] = R2F_U(yh);
+    bb[5*SSE_F_WIDTH] = R2F_U(zh);
+}
+
+#ifdef NBNXN_SEARCH_SSE_SINGLE
+
+/* Coordinate order xyz0, bb order xyz0 */
+static void calc_bounding_box_sse(int na,const float *x,float *bb)
+{
+    __m128 bb_0_SSE,bb_1_SSE;
+    __m128 x_SSE;
+
+    int i;
+
+    bb_0_SSE = _mm_load_ps(x);
+    bb_1_SSE = bb_0_SSE;
+
+    for(i=1; i<na; i++)
+    {
+        x_SSE    = _mm_load_ps(x+i*NNBSBB_C);
+        bb_0_SSE = _mm_min_ps(bb_0_SSE,x_SSE);
+        bb_1_SSE = _mm_max_ps(bb_1_SSE,x_SSE);
+    }
+
+    _mm_store_ps(bb         ,bb_0_SSE);
+    _mm_store_ps(bb+NNBSBB_C,bb_1_SSE);
+}
+
+/* Coordinate order xyz0, bb order xxxxyyyyzzzz */
+static void calc_bounding_box_xxxx_sse(int na,const float *x,
+                                       float *bb_work,
+                                       real *bb)
+{
+    calc_bounding_box_sse(na,x,bb_work);
+
+    bb[0*SSE_F_WIDTH] = bb_work[BBL_X];
+    bb[1*SSE_F_WIDTH] = bb_work[BBL_Y];
+    bb[2*SSE_F_WIDTH] = bb_work[BBL_Z];
+    bb[3*SSE_F_WIDTH] = bb_work[BBU_X];
+    bb[4*SSE_F_WIDTH] = bb_work[BBU_Y];
+    bb[5*SSE_F_WIDTH] = bb_work[BBU_Z];
+}
+
+#endif /* NBNXN_SEARCH_SSE_SINGLE */
+
+/* Combine pairs of consecutive bounding boxes into the j-cluster boxes bbj */
+static void combine_bounding_boxes(nbnxn_grid_t *grid,const float *bb)
+{
+    int    i,j,sc2,nc2,c2;
+    __m128 min_SSE,max_SSE;
+
+    for(i=0; i<grid->ncx*grid->ncy; i++)
+    {
+        /* Starting bb in a column is expected to be 2-aligned */
+        sc2 = grid->cxy_ind[i]>>1;
+        /* For odd numbers skip the last bb here */
+        nc2 = (grid->cxy_na[i]+3)>>(2+1);
+        for(c2=sc2; c2<sc2+nc2; c2++)
+        {
+            min_SSE = _mm_min_ps(_mm_load_ps(bb+(c2*4+0)*NNBSBB_C),
+                                 _mm_load_ps(bb+(c2*4+2)*NNBSBB_C));
+            max_SSE = _mm_max_ps(_mm_load_ps(bb+(c2*4+1)*NNBSBB_C),
+                                 _mm_load_ps(bb+(c2*4+3)*NNBSBB_C));
+            _mm_store_ps(grid->bbj+(c2*2+0)*NNBSBB_C,min_SSE);
+            _mm_store_ps(grid->bbj+(c2*2+1)*NNBSBB_C,max_SSE);
+        }
+        if (((grid->cxy_na[i]+3)>>2) & 1)
+        {
+            /* Copy the last bb for odd bb count in this column */
+            for(j=0; j<NNBSBB_C; j++)
+            {
+                grid->bbj[(c2*2+0)*NNBSBB_C+j] = bb[(c2*4+0)*NNBSBB_C+j];
+                grid->bbj[(c2*2+1)*NNBSBB_C+j] = bb[(c2*4+1)*NNBSBB_C+j];
+            }
+        }
+    }
+}
+
+#endif
+
+
+/* Prints the average bb size, used for debug output */
+static void print_bbsizes_simple(FILE *fp,
+                                 const nbnxn_search_t nbs,
+                                 const nbnxn_grid_t *grid)
+{
+    int  c,d;
+    dvec ba;
+
+    clear_dvec(ba);
+    for(c=0; c<grid->nc; c++)
+    {
+        for(d=0; d<DIM; d++)
+        {
+            ba[d] += grid->bb[c*NNBSBB_B+NNBSBB_C+d] - grid->bb[c*NNBSBB_B+d];
+        }
+    }
+    dsvmul(1.0/grid->nc,ba,ba);
+
+    fprintf(fp,"ns bb: %4.2f %4.2f %4.2f %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
+            nbs->box[XX][XX]/grid->ncx,
+            nbs->box[YY][YY]/grid->ncy,
+            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/grid->nc,
+            ba[XX],ba[YY],ba[ZZ],
+            ba[XX]*grid->ncx/nbs->box[XX][XX],
+            ba[YY]*grid->ncy/nbs->box[YY][YY],
+            ba[ZZ]*grid->nc/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
+}
+
+/* Prints the average bb size, used for debug output */
+static void print_bbsizes_supersub(FILE *fp,
+                                   const nbnxn_search_t nbs,
+                                   const nbnxn_grid_t *grid)
+{
+    int  ns,c,s;
+    dvec ba;
+
+    clear_dvec(ba);
+    ns = 0;
+    for(c=0; c<grid->nc; c++)
+    {
+#ifdef
NBNXN_BBXXXX
+        for(s=0; s<grid->nsubc[c]; s+=SSE_F_WIDTH)
+        {
+            int cs_w,i,d;
+
+            cs_w = (c*GPU_NSUBCELL + s)/SSE_F_WIDTH;
+            for(i=0; i<SSE_F_WIDTH; i++)
+            {
+                for(d=0; d<DIM; d++)
+                {
+                    ba[d] +=
+                        grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*SSE_F_WIDTH+i] -
+                        grid->bb[cs_w*NNBSBB_XXXX+ d   *SSE_F_WIDTH+i];
+                }
+            }
+        }
+#else
+        for(s=0; s<grid->nsubc[c]; s++)
+        {
+            int cs,d;
+
+            cs = c*GPU_NSUBCELL + s;
+            for(d=0; d<DIM; d++)
+            {
+                ba[d] +=
+                    grid->bb[cs*NNBSBB_B+NNBSBB_C+d] -
+                    grid->bb[cs*NNBSBB_B         +d];
+            }
+        }
+#endif
+        ns += grid->nsubc[c];
+    }
+    dsvmul(1.0/ns,ba,ba);
+
+    fprintf(fp,"ns bb: %4.2f %4.2f %4.2f %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
+            nbs->box[XX][XX]/(grid->ncx*GPU_NSUBCELL_X),
+            nbs->box[YY][YY]/(grid->ncy*GPU_NSUBCELL_Y),
+            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z),
+            ba[XX],ba[YY],ba[ZZ],
+            ba[XX]*grid->ncx*GPU_NSUBCELL_X/nbs->box[XX][XX],
+            ba[YY]*grid->ncy*GPU_NSUBCELL_Y/nbs->box[YY][YY],
+            ba[ZZ]*grid->nc*GPU_NSUBCELL_Z/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
+}
+
+static void copy_int_to_nbat_int(const int *a,int na,int na_round,
+                                 const int *in,int fill,int *innb)
+{
+    int i,j;
+
+    j = 0;
+    for(i=0; i<na; i++)
+    {
+        innb[j++] = in[a[i]];
+    }
+    /* Complete the partially filled last cell with fill */
+    for(; i<na_round; i++)
+    {
+        innb[j++] = fill;
+    }
+}
+
+/* Set the interaction flags for a cluster and sort atoms with LJ first */
+static void sort_on_lj(nbnxn_atomdata_t *nbat,int na_c,
+                       int a0,int a1,const int *atinfo,
+                       int *order,
+                       int *flags)
+{
+    int      subc,s,a,n1,n2,a_lj_max,i,j;
+    int      sort1[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
+    int      sort2[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
+    gmx_bool haveQ;
+
+    *flags = 0;
+
+    subc = 0;
+    for(s=a0; s<a1; s+=na_c)
+    {
+        /* Make lists for this (sub-)cell on atoms with and without LJ */
+        n1       = 0;
+        n2       = 0;
+        haveQ    = FALSE;
+        a_lj_max = -1;
+        for(a=s; a<min(s+na_c,a1); a++)
+        {
+            haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
+
+            if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
+            {
+                sort1[n1++] = order[a];
+                a_lj_max    = a;
+            }
+            else
+            {
+                sort2[n2++] = order[a];
+            }
+        }
+
+        /* If we don't have atoms with LJ, there's nothing to sort */
+        if (n1 > 0)
+        {
+            *flags |= NBNXN_CI_DO_LJ(subc);
+
+            if (2*n1 <= na_c)
+            {
+                /* Only sort when strictly necessary. Ordering particles
+                 * can lead to less accurate summation due to rounding,
+                 * both for LJ and Coulomb interactions.
+                 */
+                if (2*(a_lj_max - s) >= na_c)
+                {
+                    for(i=0; i<n1; i++)
+                    {
+                        order[s+i] = sort1[i];
+                    }
+                    for(j=0; j<n2; j++)
+                    {
+                        order[s+n1+j] = sort2[j];
+                    }
+                }
+
+                *flags |= NBNXN_CI_HALF_LJ(subc);
+            }
+        }
+        if (haveQ)
+        {
+            *flags |= NBNXN_CI_DO_COUL(subc);
+        }
+        subc++;
+    }
+}
+
+/* Fill a pair search cell with atoms.
+ * Potentially sorts atoms and sets the interaction flags.
+ */
+static void fill_cell(const nbnxn_search_t nbs,
+                      nbnxn_grid_t *grid,
+                      nbnxn_atomdata_t *nbat,
+                      int a0,int a1,
+                      const int *atinfo,
+                      rvec *x,
+                      int sx,int sy,int sz,
+                      float *bb_work)
+{
+    int    na,a;
+    size_t offset;
+    float  *bb_ptr;
+
+    na = a1 - a0;
+
+    if (grid->bSimple)
+    {
+        sort_on_lj(nbat,grid->na_c,a0,a1,atinfo,nbs->a,
+                   grid->flags+(a0>>grid->na_c_2log)-grid->cell0);
+    }
+
+    /* Now we have sorted the atoms, set the cell indices */
+    for(a=a0; a<a1; a++)
+    {
+        nbs->cell[nbs->a[a]] = a;
+    }
+
+    copy_rvec_to_nbat_real(nbs->a+a0,a1-a0,grid->na_c,x,
+                           nbat->XFormat,nbat->x,a0,
+                           sx,sy,sz);
+
+    if (nbat->XFormat == nbatX4)
+    {
+        /* Store the bounding boxes as xyz.xyz. */
+        offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
+        bb_ptr = grid->bb + offset;
+
+#if defined GMX_DOUBLE && defined NBNXN_SEARCH_SSE
+        if (2*grid->na_cj == grid->na_c)
+        {
+            calc_bounding_box_x_x4_halves(na,nbat->x+X4_IND_A(a0),bb_ptr,
+                                          grid->bbj+offset*2);
+        }
+        else
+#endif
+        {
+            calc_bounding_box_x_x4(na,nbat->x+X4_IND_A(a0),bb_ptr);
+        }
+    }
+    else if (nbat->XFormat == nbatX8)
+    {
+        /* Store the bounding boxes as xyz.xyz. */
+        offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
+        bb_ptr = grid->bb + offset;
+
+        calc_bounding_box_x_x8(na,nbat->x+X8_IND_A(a0),bb_ptr);
+    }
+#ifdef NBNXN_BBXXXX
+    else if (!grid->bSimple)
+    {
+        /* Store the bounding boxes in a format convenient
+         * for SSE calculations: xxxxyyyyzzzz...
+         */
+        bb_ptr =
+            grid->bb +
+            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+SSE_F_WIDTH_2LOG))*NNBSBB_XXXX +
+            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (SSE_F_WIDTH-1));
+
+#ifdef NBNXN_SEARCH_SSE_SINGLE
+        if (nbat->XFormat == nbatXYZQ)
+        {
+            calc_bounding_box_xxxx_sse(na,nbat->x+a0*nbat->xstride,
+                                       bb_work,bb_ptr);
+        }
+        else
+#endif
+        {
+            calc_bounding_box_xxxx(na,nbat->xstride,nbat->x+a0*nbat->xstride,
+                                   bb_ptr);
+        }
+        if (gmx_debug_at)
+        {
+            fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
+                    sx,sy,sz,
+                    bb_ptr[0],bb_ptr[12],
+                    bb_ptr[4],bb_ptr[16],
+                    bb_ptr[8],bb_ptr[20]);
+        }
+    }
+#endif
+    else
+    {
+        /* Store the bounding boxes as xyz.xyz. */
*/ + bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B; + + calc_bounding_box(na,nbat->xstride,nbat->x+a0*nbat->xstride, + bb_ptr); + + if (gmx_debug_at) + { + int bbo; + bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c; + fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n", + sx,sy,sz, + (grid->bb+bbo*NNBSBB_B)[BBL_X], + (grid->bb+bbo*NNBSBB_B)[BBU_X], + (grid->bb+bbo*NNBSBB_B)[BBL_Y], + (grid->bb+bbo*NNBSBB_B)[BBU_Y], + (grid->bb+bbo*NNBSBB_B)[BBL_Z], + (grid->bb+bbo*NNBSBB_B)[BBU_Z]); + } + } +} + +/* Spatially sort the atoms within one grid column */ +static void sort_columns_simple(const nbnxn_search_t nbs, + int dd_zone, + nbnxn_grid_t *grid, + int a0,int a1, + const int *atinfo, + rvec *x, + nbnxn_atomdata_t *nbat, + int cxy_start,int cxy_end, + int *sort_work) +{ + int cxy; + int cx,cy,cz,ncz,cfilled,c; + int na,ash,ind,a; + int na_c,ash_c; + + if (debug) + { + fprintf(debug,"cell0 %d sorting columns %d - %d, atoms %d - %d\n", + grid->cell0,cxy_start,cxy_end,a0,a1); + } + + /* Sort the atoms within each x,y column in 3 dimensions */ + for(cxy=cxy_start; cxyncy; + cy = cxy - cx*grid->ncy; + + na = grid->cxy_na[cxy]; + ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy]; + ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc; + + /* Sort the atoms within each x,y column on z coordinate */ + sort_atoms(ZZ,FALSE, + nbs->a+ash,na,x, + grid->c0[ZZ], + ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ], + ncz*grid->na_sc*SGSF,sort_work); + + /* Fill the ncz cells in this column */ + cfilled = grid->cxy_ind[cxy]; + for(cz=0; czcxy_ind[cxy] + cz ; + + ash_c = ash + cz*grid->na_sc; + na_c = min(grid->na_sc,na-(ash_c-ash)); + + fill_cell(nbs,grid,nbat, + ash_c,ash_c+na_c,atinfo,x, + grid->na_sc*cx + (dd_zone >> 2), + grid->na_sc*cy + (dd_zone & 3), + grid->na_sc*cz, + NULL); + + /* This copy to bbcz is not really necessary. + * But it allows to use the same grid search code + * for the simple and supersub cell setups. 
+ */ + if (na_c > 0) + { + cfilled = c; + } + grid->bbcz[c*NNBSBB_D ] = grid->bb[cfilled*NNBSBB_B+2]; + grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled*NNBSBB_B+6]; + } + + /* Set the unused atom indices to -1 */ + for(ind=na; indna_sc; ind++) + { + nbs->a[ash+ind] = -1; + } + } +} + +/* Spatially sort the atoms within one grid column */ +static void sort_columns_supersub(const nbnxn_search_t nbs, + int dd_zone, + nbnxn_grid_t *grid, + int a0,int a1, + const int *atinfo, + rvec *x, + nbnxn_atomdata_t *nbat, + int cxy_start,int cxy_end, + int *sort_work) +{ + int cxy; + int cx,cy,cz=-1,c=-1,ncz; + int na,ash,na_c,ind,a; + int subdiv_z,sub_z,na_z,ash_z; + int subdiv_y,sub_y,na_y,ash_y; + int subdiv_x,sub_x,na_x,ash_x; + + /* cppcheck-suppress unassignedVariable */ + float bb_work_array[NNBSBB_B+3],*bb_work_align; + + bb_work_align = (float *)(((size_t)(bb_work_array+3)) & (~((size_t)15))); + + if (debug) + { + fprintf(debug,"cell0 %d sorting columns %d - %d, atoms %d - %d\n", + grid->cell0,cxy_start,cxy_end,a0,a1); + } + + subdiv_x = grid->na_c; + subdiv_y = GPU_NSUBCELL_X*subdiv_x; + subdiv_z = GPU_NSUBCELL_Y*subdiv_y; + + /* Sort the atoms within each x,y column in 3 dimensions */ + for(cxy=cxy_start; cxyncy; + cy = cxy - cx*grid->ncy; + + na = grid->cxy_na[cxy]; + ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy]; + ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc; + + /* Sort the atoms within each x,y column on z coordinate */ + sort_atoms(ZZ,FALSE, + nbs->a+ash,na,x, + grid->c0[ZZ], + ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ], + ncz*grid->na_sc*SGSF,sort_work); + + /* This loop goes over the supercells and subcells along z at once */ + for(sub_z=0; sub_zcxy_ind[cxy] + cz ; + + /* The number of atoms in this supercell */ + na_c = min(grid->na_sc,na-(ash_z-ash)); + + grid->nsubc[c] = min(GPU_NSUBCELL,(na_c+grid->na_c-1)/grid->na_c); + + /* Store the z-boundaries of the super cell */ + grid->bbcz[c*NNBSBB_D ] = x[nbs->a[ash_z]][ZZ]; + grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z+na_c-1]][ZZ]; + } + +#if GPU_NSUBCELL_Y > 1 + /* Sort the atoms along y */ + sort_atoms(YY,(sub_z & 1), + nbs->a+ash_z,na_z,x, + grid->c0[YY]+cy*grid->sy,grid->inv_sy, + subdiv_y*SGSF,sort_work); +#endif + + for(sub_y=0; sub_y 1 + /* Sort the atoms along x */ + sort_atoms(XX,((cz*GPU_NSUBCELL_Y + sub_y) & 1), + nbs->a+ash_y,na_y,x, + grid->c0[XX]+cx*grid->sx,grid->inv_sx, + subdiv_x*SGSF,sort_work); +#endif + + for(sub_x=0; sub_xna_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2), + grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3), + grid->na_c*sub_z, + bb_work_align); + } + } + } + + /* Set the unused atom indices to -1 */ + for(ind=na; indna_sc; ind++) + { + nbs->a[ash+ind] = -1; + } + } +} + +/* Determine in which grid column atoms should go */ +static void calc_column_indices(nbnxn_grid_t *grid, + int a0,int a1, + rvec *x,const int *move, + int thread,int nthread, + int *cell, + int *cxy_na) +{ + int n0,n1,i; + int cx,cy; + + /* We add one extra cell for particles which moved during DD */ + for(i=0; incx*grid->ncy+1; i++) + { + cxy_na[i] = 0; + } + + n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread; + n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread; + for(i=n0; i= 0) + { + /* We need to be careful with rounding, + * particles might be a few bits outside the local box. + * The int cast takes care of the lower bound, + * we need to explicitly take care of the upper bound. 
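+             * For example, with c0[XX] = 0, inv_sx = 2.0 and ncx = 4,
+             * an atom at exactly x = 2.0 (the upper box edge) gives
+             * cx = (int)(2.0*2.0) = 4, one past the last column; the
+             * clamp below maps it back to cx = ncx-1 = 3.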
+ */ + cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx); + if (cx == grid->ncx) + { + cx = grid->ncx - 1; + } + cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy); + if (cy == grid->ncy) + { + cy = grid->ncy - 1; + } + /* For the moment cell contains only the, grid local, + * x and y indices, not z. + */ + cell[i] = cx*grid->ncy + cy; + +#ifdef DEBUG_NBNXN_GRIDDING + if (cell[i] < 0 || cell[i] >= grid->ncx*grid->ncy) + { + gmx_fatal(FARGS, + "grid cell cx %d cy %d out of range (max %d %d)", + cx,cy,grid->ncx,grid->ncy); + } +#endif + } + else + { + /* Put this moved particle after the end of the grid, + * so we can process it later without using conditionals. + */ + cell[i] = grid->ncx*grid->ncy; + } + + cxy_na[cell[i]]++; + } +} + +/* Determine in which grid cells the atoms should go */ +static void calc_cell_indices(const nbnxn_search_t nbs, + int dd_zone, + nbnxn_grid_t *grid, + int a0,int a1, + const int *atinfo, + rvec *x, + const int *move, + nbnxn_atomdata_t *nbat) +{ + int n0,n1,i; + int cx,cy,cxy,ncz_max,ncz; + int nthread,thread; + int *cxy_na,cxy_na_i; + + nthread = gmx_omp_nthreads_get(emntPairsearch); + +#pragma omp parallel for num_threads(nthread) schedule(static) + for(thread=0; threadcell,nbs->work[thread].cxy_na); + } + + /* Make the cell index as a function of x and y */ + ncz_max = 0; + ncz = 0; + grid->cxy_ind[0] = 0; + for(i=0; incx*grid->ncy+1; i++) + { + /* We set ncz_max at the beginning of the loop iso at the end + * to skip i=grid->ncx*grid->ncy which are moved particles + * that do not need to be ordered on the grid. + */ + if (ncz > ncz_max) + { + ncz_max = ncz; + } + cxy_na_i = nbs->work[0].cxy_na[i]; + for(thread=1; threadwork[thread].cxy_na[i]; + } + ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc; + if (nbat->XFormat == nbatX8) + { + /* Make the number of cell a multiple of 2 */ + ncz = (ncz + 1) & ~1; + } + grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz; + /* Clear cxy_na, so we can reuse the array below */ + grid->cxy_na[i] = 0; + } + grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0]; + + nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc; + + if (debug) + { + fprintf(debug,"ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n", + grid->na_sc,grid->na_c,grid->nc, + grid->ncx,grid->ncy,grid->nc/((double)(grid->ncx*grid->ncy)), + ncz_max); + if (gmx_debug_at) + { + i = 0; + for(cy=0; cyncy; cy++) + { + for(cx=0; cxncx; cx++) + { + fprintf(debug," %2d",grid->cxy_ind[i+1]-grid->cxy_ind[i]); + i++; + } + fprintf(debug,"\n"); + } + } + } + + /* Make sure the work array for sorting is large enough */ + if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc) + { + for(thread=0; threadnthread_max; thread++) + { + nbs->work[thread].sort_work_nalloc = + over_alloc_large(ncz_max*grid->na_sc*SGSF); + srenew(nbs->work[thread].sort_work, + nbs->work[thread].sort_work_nalloc); + } + } + + /* Now we know the dimensions we can fill the grid. + * This is the first, unsorted fill. We sort the columns after this. + */ + for(i=a0; icell contains the local grid x,y indices */ + cxy = nbs->cell[i]; + nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i; + } + + /* Set the cell indices for the moved particles */ + n0 = grid->nc*grid->na_sc; + n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy]; + for(i=n0; icell[nbs->a[i]] = i; + } + + /* Sort the super-cell columns along z into the sub-cells. 
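+     * Each thread sorts a contiguous block of columns, so no two
+     * threads ever write to the same cells.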
*/ +#pragma omp parallel for num_threads(nbs->nthread_max) schedule(static) + for(thread=0; threadnthread_max; thread++) + { + if (grid->bSimple) + { + sort_columns_simple(nbs,dd_zone,grid,a0,a1,atinfo,x,nbat, + ((thread+0)*grid->ncx*grid->ncy)/nthread, + ((thread+1)*grid->ncx*grid->ncy)/nthread, + nbs->work[thread].sort_work); + } + else + { + sort_columns_supersub(nbs,dd_zone,grid,a0,a1,atinfo,x,nbat, + ((thread+0)*grid->ncx*grid->ncy)/nthread, + ((thread+1)*grid->ncx*grid->ncy)/nthread, + nbs->work[thread].sort_work); + } + } + +#ifdef NBNXN_SEARCH_SSE + if (grid->bSimple && nbat->XFormat == nbatX8) + { + combine_bounding_box_pairs(grid,grid->bb); + } +#endif + + if (!grid->bSimple) + { + grid->nsubc_tot = 0; + for(i=0; inc; i++) + { + grid->nsubc_tot += grid->nsubc[i]; + } + } + + if (debug) + { + if (grid->bSimple) + { + print_bbsizes_simple(debug,nbs,grid); + } + else + { + fprintf(debug,"ns non-zero sub-cells: %d average atoms %.2f\n", + grid->nsubc_tot,(a1-a0)/(double)grid->nsubc_tot); + + print_bbsizes_supersub(debug,nbs,grid); + } + } +} + +/* Reallocation wrapper function for nbnxn data structures */ +static void nb_realloc_void(void **ptr, + int nbytes_copy,int nbytes_new, + gmx_nbat_alloc_t *ma, + gmx_nbat_free_t *mf) +{ + void *ptr_new; + + ma(&ptr_new,nbytes_new); + + if (nbytes_new > 0 && ptr_new == NULL) + { + gmx_fatal(FARGS, "Allocation of %d bytes failed", nbytes_new); + } + + if (nbytes_copy > 0) + { + if (nbytes_new < nbytes_copy) + { + gmx_incons("In nb_realloc_void: new size less than copy size"); + } + memcpy(ptr_new,*ptr,nbytes_copy); + } + if (*ptr != NULL) + { + mf(*ptr); + } + *ptr = ptr_new; +} + +/* NOTE: does not preserve the contents! */ +static void nb_realloc_int(int **ptr,int n, + gmx_nbat_alloc_t *ma, + gmx_nbat_free_t *mf) +{ + if (*ptr != NULL) + { + mf(*ptr); + } + ma((void **)ptr,n*sizeof(**ptr)); +} + +/* NOTE: does not preserve the contents! */ +static void nb_realloc_real(real **ptr,int n, + gmx_nbat_alloc_t *ma, + gmx_nbat_free_t *mf) +{ + if (*ptr != NULL) + { + mf(*ptr); + } + ma((void **)ptr,n*sizeof(**ptr)); +} + +/* Reallocate the nbnxn_atomdata_t for a size of n atoms */ +static void nbnxn_atomdata_realloc(nbnxn_atomdata_t *nbat,int n) +{ + int t; + + nb_realloc_void((void **)&nbat->type, + nbat->natoms*sizeof(*nbat->type), + n*sizeof(*nbat->type), + nbat->alloc,nbat->free); + nb_realloc_void((void **)&nbat->lj_comb, + nbat->natoms*2*sizeof(*nbat->lj_comb), + n*2*sizeof(*nbat->lj_comb), + nbat->alloc,nbat->free); + if (nbat->XFormat != nbatXYZQ) + { + nb_realloc_void((void **)&nbat->q, + nbat->natoms*sizeof(*nbat->q), + n*sizeof(*nbat->q), + nbat->alloc,nbat->free); + } + if (nbat->nenergrp > 1) + { + nb_realloc_void((void **)&nbat->energrp, + nbat->natoms/nbat->na_c*sizeof(*nbat->energrp), + n/nbat->na_c*sizeof(*nbat->energrp), + nbat->alloc,nbat->free); + } + nb_realloc_void((void **)&nbat->x, + nbat->natoms*nbat->xstride*sizeof(*nbat->x), + n*nbat->xstride*sizeof(*nbat->x), + nbat->alloc,nbat->free); + for(t=0; tnout; t++) + { + /* Allocate one element extra for possible signaling with CUDA */ + nb_realloc_void((void **)&nbat->out[t].f, + nbat->natoms*nbat->fstride*sizeof(*nbat->out[t].f), + n*nbat->fstride*sizeof(*nbat->out[t].f), + nbat->alloc,nbat->free); + } + nbat->nalloc = n; +} + +/* Sets up a grid and puts the atoms on the grid. + * This function only operates on one domain of the domain decompostion. + * Note that without domain decomposition there is only one domain. 
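+ */
+
+/* An illustrative usage sketch, not part of the patch: how a caller
+ * might put the home-zone atoms (dd_zone 0) on the grid. All objects
+ * are assumed to be set up elsewhere and epbcXYZ is just an example;
+ * atom_density = -1 requests that the density be computed here, and
+ * nmoved = 0 with move = NULL means no particles changed domain.
+ */
+static void put_local_atoms_on_grid_sketch(nbnxn_search_t nbs,
+                                           matrix box,
+                                           rvec corner0,rvec corner1,
+                                           int natoms,
+                                           const int *atinfo,rvec *x,
+                                           int nb_kernel_type,
+                                           nbnxn_atomdata_t *nbat)
+{
+    nbnxn_put_on_grid(nbs,epbcXYZ,box,
+                      0,              /* dd_zone 0: the local atoms    */
+                      corner0,corner1,
+                      0,natoms,       /* the atom range a0..a1         */
+                      -1,             /* compute the atom density here */
+                      atinfo,x,
+                      0,NULL,         /* no moved particles            */
+                      nb_kernel_type,nbat);
+}
+
+/* Sets up a grid and puts the atoms on the grid.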
+ */ +void nbnxn_put_on_grid(nbnxn_search_t nbs, + int ePBC,matrix box, + int dd_zone, + rvec corner0,rvec corner1, + int a0,int a1, + real atom_density, + const int *atinfo, + rvec *x, + int nmoved,int *move, + int nb_kernel_type, + nbnxn_atomdata_t *nbat) +{ + nbnxn_grid_t *grid; + int n; + int nc_max_grid,nc_max; + + grid = &nbs->grid[dd_zone]; + + nbs_cycle_start(&nbs->cc[enbsCCgrid]); + + grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type); + + grid->na_c = kernel_to_ci_size(nb_kernel_type); + grid->na_cj = kernel_to_cj_size(nb_kernel_type); + grid->na_sc = (grid->bSimple ? 1 : GPU_NSUBCELL)*grid->na_c; + grid->na_c_2log = get_2log(grid->na_c); + + nbat->na_c = grid->na_c; + + if (dd_zone == 0) + { + grid->cell0 = 0; + } + else + { + grid->cell0 = + (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)* + nbs->grid[dd_zone-1].na_sc/grid->na_sc; + } + + n = a1 - a0; + + if (dd_zone == 0) + { + nbs->ePBC = ePBC; + copy_mat(box,nbs->box); + + if (atom_density >= 0) + { + grid->atom_density = atom_density; + } + else + { + grid->atom_density = grid_atom_density(n-nmoved,corner0,corner1); + } + + grid->cell0 = 0; + + nbs->natoms_local = a1 - nmoved; + /* We assume that nbnxn_put_on_grid is called first + * for the local atoms (dd_zone=0). + */ + nbs->natoms_nonlocal = a1 - nmoved; + } + else + { + nbs->natoms_nonlocal = max(nbs->natoms_nonlocal,a1); + } + + nc_max_grid = set_grid_size_xy(nbs,grid,n-nmoved,corner0,corner1, + nbs->grid[0].atom_density, + nbat->XFormat); + + nc_max = grid->cell0 + nc_max_grid; + + if (a1 > nbs->cell_nalloc) + { + nbs->cell_nalloc = over_alloc_large(a1); + srenew(nbs->cell,nbs->cell_nalloc); + } + + /* To avoid conditionals we store the moved particles at the end of a, + * make sure we have enough space. + */ + if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc) + { + nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved); + srenew(nbs->a,nbs->a_nalloc); + } + + if (nc_max*grid->na_sc > nbat->nalloc) + { + nbnxn_atomdata_realloc(nbat,nc_max*grid->na_sc); + } + + calc_cell_indices(nbs,dd_zone,grid,a0,a1,atinfo,x,move,nbat); + + if (dd_zone == 0) + { + nbat->natoms_local = nbat->natoms; + } + + nbs_cycle_stop(&nbs->cc[enbsCCgrid]); +} + +/* Calls nbnxn_put_on_grid for all non-local domains */ +void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs, + const gmx_domdec_zones_t *zones, + const int *atinfo, + rvec *x, + int nb_kernel_type, + nbnxn_atomdata_t *nbat) +{ + int zone,d; + rvec c0,c1; + + for(zone=1; zonen; zone++) + { + for(d=0; dsize[zone].bb_x0[d]; + c1[d] = zones->size[zone].bb_x1[d]; + } + + nbnxn_put_on_grid(nbs,nbs->ePBC,NULL, + zone,c0,c1, + zones->cg_range[zone], + zones->cg_range[zone+1], + -1, + atinfo, + x, + 0,NULL, + nb_kernel_type, + nbat); + } +} + +/* Add simple grid type information to the local super/sub grid */ +void nbnxn_grid_add_simple(nbnxn_search_t nbs, + nbnxn_atomdata_t *nbat) +{ + nbnxn_grid_t *grid; + float *bbcz,*bb; + int ncd,sc; + + grid = &nbs->grid[0]; + + if (grid->bSimple) + { + gmx_incons("nbnxn_grid_simple called with a simple grid"); + } + + ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE; + + if (grid->nc*ncd > grid->nc_nalloc_simple) + { + grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd); + srenew(grid->bbcz_simple,grid->nc_nalloc_simple*NNBSBB_D); + srenew(grid->bb_simple,grid->nc_nalloc_simple*NNBSBB_B); + srenew(grid->flags_simple,grid->nc_nalloc_simple); + if (nbat->XFormat) + { + sfree_aligned(grid->bbj); + snew_aligned(grid->bbj,grid->nc_nalloc_simple/2,16); + } + } + + bbcz = grid->bbcz_simple; + 
bb = grid->bb_simple; + +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static) + for(sc=0; scnc; sc++) + { + int c,tx,na; + + for(c=0; c 0 && + nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE+na-1] == nbat->ntype-1) + { + na--; + } + + if (na > 0) + { + switch (nbat->XFormat) + { + case nbatX4: + /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */ + calc_bounding_box_x_x4(na,nbat->x+tx*STRIDE_P4, + bb+tx*NNBSBB_B); + break; + case nbatX8: + /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */ + calc_bounding_box_x_x8(na,nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE), + bb+tx*NNBSBB_B); + break; + default: + calc_bounding_box(na,nbat->xstride, + nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride, + bb+tx*NNBSBB_B); + break; + } + bbcz[tx*NNBSBB_D+0] = bb[tx*NNBSBB_B +ZZ]; + bbcz[tx*NNBSBB_D+1] = bb[tx*NNBSBB_B+NNBSBB_C+ZZ]; + + /* No interaction optimization yet here */ + grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0); + } + else + { + grid->flags_simple[tx] = 0; + } + } + } + +#ifdef NBNXN_SEARCH_SSE + if (grid->bSimple && nbat->XFormat == nbatX8) + { + combine_bounding_box_pairs(grid,grid->bb_simple); + } +#endif +} + +void nbnxn_get_ncells(nbnxn_search_t nbs,int *ncx,int *ncy) +{ + *ncx = nbs->grid[0].ncx; + *ncy = nbs->grid[0].ncy; +} + +void nbnxn_get_atomorder(nbnxn_search_t nbs,int **a,int *n) +{ + const nbnxn_grid_t *grid; + + grid = &nbs->grid[0]; + + /* Return the atom order for the home cell (index 0) */ + *a = nbs->a; + + *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc; +} + +void nbnxn_set_atomorder(nbnxn_search_t nbs) +{ + nbnxn_grid_t *grid; + int ao,cx,cy,cxy,cz,j; + + /* Set the atom order for the home cell (index 0) */ + grid = &nbs->grid[0]; + + ao = 0; + for(cx=0; cxncx; cx++) + { + for(cy=0; cyncy; cy++) + { + cxy = cx*grid->ncy + cy; + j = grid->cxy_ind[cxy]*grid->na_sc; + for(cz=0; czcxy_na[cxy]; cz++) + { + nbs->a[j] = ao; + nbs->cell[ao] = j; + ao++; + j++; + } + } + } +} + +/* Determines the cell range along one dimension that + * the bounding box b0 - b1 sees. 
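+ * For example, with s = 1.0 (invs = 1.0), c0 = 0, nc = 10, r2 = 1.0
+ * and d2 = 0, a box with b0 = 2.2 and b1 = 2.6 starts at cf = cl = 2;
+ * the loops below widen this to cf = 1 and cl = 3, since those cells
+ * still lie within the cut-off of the box edges.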
+ */ +static void get_cell_range(real b0,real b1, + int nc,real c0,real s,real invs, + real d2,real r2,int *cf,int *cl) +{ + *cf = max((int)((b0 - c0)*invs),0); + + while (*cf > 0 && d2 + sqr((b0 - c0) - (*cf-1+1)*s) < r2) + { + (*cf)--; + } + + *cl = min((int)((b1 - c0)*invs),nc-1); + while (*cl < nc-1 && d2 + sqr((*cl+1)*s - (b1 - c0)) < r2) + { + (*cl)++; + } +} + +/* Reference code calculating the distance^2 between two bounding boxes */ +static float box_dist2(float bx0,float bx1,float by0, + float by1,float bz0,float bz1, + const float *bb) +{ + float d2; + float dl,dh,dm,dm0; + + d2 = 0; + + dl = bx0 - bb[BBU_X]; + dh = bb[BBL_X] - bx1; + dm = max(dl,dh); + dm0 = max(dm,0); + d2 += dm0*dm0; + + dl = by0 - bb[BBU_Y]; + dh = bb[BBL_Y] - by1; + dm = max(dl,dh); + dm0 = max(dm,0); + d2 += dm0*dm0; + + dl = bz0 - bb[BBU_Z]; + dh = bb[BBL_Z] - bz1; + dm = max(dl,dh); + dm0 = max(dm,0); + d2 += dm0*dm0; + + return d2; +} + +/* Plain C code calculating the distance^2 between two bounding boxes */ +static float subc_bb_dist2(int si,const float *bb_i_ci, + int csj,const float *bb_j_all) +{ + const float *bb_i,*bb_j; + float d2; + float dl,dh,dm,dm0; + + bb_i = bb_i_ci + si*NNBSBB_B; + bb_j = bb_j_all + csj*NNBSBB_B; + + d2 = 0; + + dl = bb_i[BBL_X] - bb_j[BBU_X]; + dh = bb_j[BBL_X] - bb_i[BBU_X]; + dm = max(dl,dh); + dm0 = max(dm,0); + d2 += dm0*dm0; + + dl = bb_i[BBL_Y] - bb_j[BBU_Y]; + dh = bb_j[BBL_Y] - bb_i[BBU_Y]; + dm = max(dl,dh); + dm0 = max(dm,0); + d2 += dm0*dm0; + + dl = bb_i[BBL_Z] - bb_j[BBU_Z]; + dh = bb_j[BBL_Z] - bb_i[BBU_Z]; + dm = max(dl,dh); + dm0 = max(dm,0); + d2 += dm0*dm0; + + return d2; +} + +#ifdef NBNXN_SEARCH_SSE + +/* SSE code for bb distance for bb format xyz0 */ +static float subc_bb_dist2_sse(int na_c, + int si,const float *bb_i_ci, + int csj,const float *bb_j_all) +{ + const float *bb_i,*bb_j; + + __m128 bb_i_SSE0,bb_i_SSE1; + __m128 bb_j_SSE0,bb_j_SSE1; + __m128 dl_SSE; + __m128 dh_SSE; + __m128 dm_SSE; + __m128 dm0_SSE; + __m128 d2_SSE; +#ifndef GMX_X86_SSE4_1 + float d2_array[7],*d2_align; + + d2_align = (float *)(((size_t)(d2_array+3)) & (~((size_t)15))); +#else + float d2; +#endif + + bb_i = bb_i_ci + si*NNBSBB_B; + bb_j = bb_j_all + csj*NNBSBB_B; + + bb_i_SSE0 = _mm_load_ps(bb_i); + bb_i_SSE1 = _mm_load_ps(bb_i+NNBSBB_C); + bb_j_SSE0 = _mm_load_ps(bb_j); + bb_j_SSE1 = _mm_load_ps(bb_j+NNBSBB_C); + + dl_SSE = _mm_sub_ps(bb_i_SSE0,bb_j_SSE1); + dh_SSE = _mm_sub_ps(bb_j_SSE0,bb_i_SSE1); + + dm_SSE = _mm_max_ps(dl_SSE,dh_SSE); + dm0_SSE = _mm_max_ps(dm_SSE,_mm_setzero_ps()); +#ifndef GMX_X86_SSE4_1 + d2_SSE = _mm_mul_ps(dm0_SSE,dm0_SSE); + + _mm_store_ps(d2_align,d2_SSE); + + return d2_align[0] + d2_align[1] + d2_align[2]; +#else + /* SSE4.1 dot product of components 0,1,2 */ + d2_SSE = _mm_dp_ps(dm0_SSE,dm0_SSE,0x71); + + _mm_store_ss(&d2,d2_SSE); + + return d2; +#endif +} + +/* SSE code for nsi bb distances for bb format xxxxyyyyzzzz */ +static void subc_bb_dist2_sse_xxxx(const float *bb_j, + int nsi,const float *bb_i, + float *d2) +{ + int si; + int shi; + + __m128 xj_l,yj_l,zj_l; + __m128 xj_h,yj_h,zj_h; + __m128 xi_l,yi_l,zi_l; + __m128 xi_h,yi_h,zi_h; + + __m128 dx_0,dy_0,dz_0; + __m128 dx_1,dy_1,dz_1; + + __m128 mx,my,mz; + __m128 m0x,m0y,m0z; + + __m128 d2x,d2y,d2z; + __m128 d2s,d2t; + + __m128 zero; + + zero = _mm_setzero_ps(); + + xj_l = _mm_load1_ps(bb_j+0*SSE_F_WIDTH); + yj_l = _mm_load1_ps(bb_j+1*SSE_F_WIDTH); + zj_l = _mm_load1_ps(bb_j+2*SSE_F_WIDTH); + xj_h = _mm_load1_ps(bb_j+3*SSE_F_WIDTH); + yj_h = _mm_load1_ps(bb_j+4*SSE_F_WIDTH); + zj_h = 
_mm_load1_ps(bb_j+5*SSE_F_WIDTH); + + for(si=0; sicj4[cj_ind>>2].cj[cj_ind & 3]; +} + +/* Returns the i-interaction mask of the j sub-cell for index cj_ind */ +static unsigned nbl_imask0(const nbnxn_pairlist_t *nbl,int cj_ind) +{ + return nbl->cj4[cj_ind>>2].imei[0].imask; +} + +/* Ensures there is enough space for extra extra exclusion masks */ +static void check_excl_space(nbnxn_pairlist_t *nbl,int extra) +{ + if (nbl->nexcl+extra > nbl->excl_nalloc) + { + nbl->excl_nalloc = over_alloc_small(nbl->nexcl+extra); + nb_realloc_void((void **)&nbl->excl, + nbl->nexcl*sizeof(*nbl->excl), + nbl->excl_nalloc*sizeof(*nbl->excl), + nbl->alloc,nbl->free); + } +} + +/* Ensures there is enough space for ncell extra j-cells in the list */ +static void check_subcell_list_space_simple(nbnxn_pairlist_t *nbl, + int ncell) +{ + int cj_max; + + cj_max = nbl->ncj + ncell; + + if (cj_max > nbl->cj_nalloc) + { + nbl->cj_nalloc = over_alloc_small(cj_max); + nb_realloc_void((void **)&nbl->cj, + nbl->ncj*sizeof(*nbl->cj), + nbl->cj_nalloc*sizeof(*nbl->cj), + nbl->alloc,nbl->free); + } +} + +/* Ensures there is enough space for ncell extra j-subcells in the list */ +static void check_subcell_list_space_supersub(nbnxn_pairlist_t *nbl, + int nsupercell) +{ + int ncj4_max,j4,j,w,t; + +#define NWARP 2 +#define WARP_SIZE 32 + + /* We can have maximally nsupercell*GPU_NSUBCELL sj lists */ + /* We can store 4 j-subcell - i-supercell pairs in one struct. + * since we round down, we need one extra entry. + */ + ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + 4-1) >> 2); + + if (ncj4_max > nbl->cj4_nalloc) + { + nbl->cj4_nalloc = over_alloc_small(ncj4_max); + nb_realloc_void((void **)&nbl->cj4, + nbl->work->cj4_init*sizeof(*nbl->cj4), + nbl->cj4_nalloc*sizeof(*nbl->cj4), + nbl->alloc,nbl->free); + } + + if (ncj4_max > nbl->work->cj4_init) + { + for(j4=nbl->work->cj4_init; j4cj4[j4].imei[w].imask = 0U; + nbl->cj4[j4].imei[w].excl_ind = 0; + + } + } + nbl->work->cj4_init = ncj4_max; + } +} + +/* Default nbnxn allocation routine, allocates 32 byte aligned, + * which works for plain C and aligned SSE and AVX loads/stores. 
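+ */
+
+/* An illustrative, self-contained sketch, not part of the patch: the
+ * same idea written with the POSIX allocator from <stdlib.h> instead
+ * of the GROMACS helpers save_malloc_aligned/sfree_aligned used below.
+ */
+static void *alloc_aligned_32_sketch(size_t nbytes)
+{
+    void *p = NULL;
+
+    /* 32-byte alignment satisfies both 16-byte SSE and 32-byte AVX
+     * aligned loads/stores; posix_memalign returns non-zero on failure.
+     */
+    if (posix_memalign(&p,32,nbytes) != 0)
+    {
+        return NULL;
+    }
+    return p;
+}
+
+/* The default allocator used by the pair-search code: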
+ */ +static void nbnxn_alloc_aligned(void **ptr,size_t nbytes) +{ + *ptr = save_malloc_aligned("ptr",__FILE__,__LINE__,nbytes,1,32); +} + +/* Free function for memory allocated with nbnxn_alloc_aligned */ +static void nbnxn_free_aligned(void *ptr) +{ + sfree_aligned(ptr); +} + +/* Set all excl masks for one GPU warp no exclusions */ +static void set_no_excls(nbnxn_excl_t *excl) +{ + int t; + + for(t=0; tpair[t] = NBNXN_INT_MASK_ALL; + } +} + +/* Initializes a single nbnxn_pairlist_t data structure */ +static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl, + gmx_bool bSimple, + gmx_nbat_alloc_t *alloc, + gmx_nbat_free_t *free) +{ + if (alloc == NULL) + { + nbl->alloc = nbnxn_alloc_aligned; + } + else + { + nbl->alloc = alloc; + } + if (free == NULL) + { + nbl->free = nbnxn_free_aligned; + } + else + { + nbl->free = free; + } + + nbl->bSimple = bSimple; + nbl->na_sc = 0; + nbl->na_ci = 0; + nbl->na_cj = 0; + nbl->nci = 0; + nbl->ci = NULL; + nbl->ci_nalloc = 0; + nbl->ncj = 0; + nbl->cj = NULL; + nbl->cj_nalloc = 0; + nbl->ncj4 = 0; + /* We need one element extra in sj, so alloc initially with 1 */ + nbl->cj4_nalloc = 0; + nbl->cj4 = NULL; + nbl->nci_tot = 0; + + if (!nbl->bSimple) + { + nbl->excl = NULL; + nbl->excl_nalloc = 0; + nbl->nexcl = 0; + check_excl_space(nbl,1); + nbl->nexcl = 1; + set_no_excls(&nbl->excl[0]); + } + + snew(nbl->work,1); +#ifdef NBNXN_BBXXXX + snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/SSE_F_WIDTH*NNBSBB_XXXX,16); +#else + snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,16); +#endif + snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,16); +#ifdef NBNXN_SEARCH_SSE + snew_aligned(nbl->work->x_ci_x86_simd128,1,16); +#ifdef GMX_X86_AVX_256 + snew_aligned(nbl->work->x_ci_x86_simd256,1,32); +#endif +#endif + snew_aligned(nbl->work->d2,GPU_NSUBCELL,16); +} + +void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list, + gmx_bool bSimple, gmx_bool bCombined, + gmx_nbat_alloc_t *alloc, + gmx_nbat_free_t *free) +{ + int i; + + nbl_list->bSimple = bSimple; + nbl_list->bCombined = bCombined; + + nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded); + + snew(nbl_list->nbl,nbl_list->nnbl); + /* Execute in order to avoid memory interleaving between threads */ +#pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static) + for(i=0; innbl; i++) + { + /* Allocate the nblist data structure locally on each thread + * to optimize memory access for NUMA architectures. 
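+         * (with a first-touch NUMA policy, pages are committed on the
+         * node of the thread that first writes them, so letting each
+         * thread allocate and initialize its own list keeps that list
+         * in near memory).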
+ */
+        snew(nbl_list->nbl[i],1);
+
+        /* Only list 0 is used on the GPU, use normal allocation for i>0 */
+        if (i == 0)
+        {
+            nbnxn_init_pairlist(nbl_list->nbl[i],nbl_list->bSimple,alloc,free);
+        }
+        else
+        {
+            nbnxn_init_pairlist(nbl_list->nbl[i],nbl_list->bSimple,NULL,NULL);
+        }
+    }
+}
+
+/* Print statistics of a pair list, used for debug output */
+static void print_nblist_statistics_simple(FILE *fp,const nbnxn_pairlist_t *nbl,
+                                           const nbnxn_search_t nbs,real rl)
+{
+    const nbnxn_grid_t *grid;
+    int cs[SHIFTS];
+    int s,i,j;
+    int npexcl;
+
+    /* This code only produces correct statistics with domain decomposition */
+    grid = &nbs->grid[0];
+
+    fprintf(fp,"nbl nci %d ncj %d\n",
+            nbl->nci,nbl->ncj);
+    fprintf(fp,"nbl na_sc %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
+            nbl->na_sc,rl,nbl->ncj,nbl->ncj/(double)grid->nc,
+            nbl->ncj/(double)grid->nc*grid->na_sc,
+            nbl->ncj/(double)grid->nc*grid->na_sc/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nc*grid->na_sc/det(nbs->box)));
+
+    fprintf(fp,"nbl average j cell list length %.1f\n",
+            0.25*nbl->ncj/(double)nbl->nci);
+
+    for(s=0; s<SHIFTS; s++)
+    {
+        cs[s] = 0;
+    }
+    npexcl = 0;
+    for(i=0; i<nbl->nci; i++)
+    {
+        cs[nbl->ci[i].shift & NBNXN_CI_SHIFT] +=
+            nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start;
+
+        j = nbl->ci[i].cj_ind_start;
+        while (j < nbl->ci[i].cj_ind_end &&
+               nbl->cj[j].excl != NBNXN_INT_MASK_ALL)
+        {
+            npexcl++;
+            j++;
+        }
+    }
+    fprintf(fp,"nbl cell pairs, total: %d excl: %d %.1f%%\n",
+            nbl->ncj,npexcl,100*npexcl/(double)nbl->ncj);
+    for(s=0; s<SHIFTS; s++)
+    {
+        if (cs[s] > 0)
+        {
+            fprintf(fp,"nbl shift %2d ncj %3d\n",s,cs[s]);
+        }
+    }
+}
+
+/* Print statistics of a pair list, used for debug output */
+static void print_nblist_statistics_supersub(FILE *fp,const nbnxn_pairlist_t *nbl,
+                                             const nbnxn_search_t nbs,real rl)
+{
+    const nbnxn_grid_t *grid;
+    int i,j4,j,si,b;
+    int c[GPU_NSUBCELL+1];
+
+    /* This code only produces correct statistics with domain decomposition */
+    grid = &nbs->grid[0];
+
+    fprintf(fp,"nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
+            nbl->nsci,nbl->ncj4,nbl->nci_tot,nbl->nexcl);
+    fprintf(fp,"nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
+            nbl->na_ci,rl,nbl->nci_tot,nbl->nci_tot/(double)grid->nsubc_tot,
+            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c,
+            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nsubc_tot*grid->na_c/det(nbs->box)));
+
+    fprintf(fp,"nbl average j super cell list length %.1f\n",
+            0.25*nbl->ncj4/(double)nbl->nsci);
+    fprintf(fp,"nbl average i sub cell list length %.1f\n",
+            nbl->nci_tot/(0.25*nbl->ncj4));
+
+    for(si=0; si<=GPU_NSUBCELL; si++)
+    {
+        c[si] = 0;
+    }
+    for(i=0; i<nbl->nsci; i++)
+    {
+        for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
+        {
+            for(j=0; j<4; j++)
+            {
+                b = 0;
+                for(si=0; si<GPU_NSUBCELL; si++)
+                {
+                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
+                    {
+                        b++;
+                    }
+                }
+                c[b]++;
+            }
+        }
+    }
+    for(b=0; b<=GPU_NSUBCELL; b++)
+    {
+        fprintf(fp,"nbl j-list #i-subcell %d %7d %4.1f\n",
+                b,c[b],100.0*c[b]/(double)(nbl->ncj4*NBNXN_GPU_JGROUP_SIZE));
+    }
+}
+
+/* Print the full pair list, used for debug output */
+static void print_supersub_nsp(const char *fn,
+                               const nbnxn_pairlist_t *nbl,
+                               int iloc)
+{
+    char buf[STRLEN];
+    FILE *fp;
+    int i,nsp,j4,p;
+
+    sprintf(buf,"%s_%s.xvg",fn,NONLOCAL_I(iloc) ?
"nl" : "l"); + fp = ffopen(buf,"w"); + + for(i=0; inci; i++) + { + nsp = 0; + for(j4=nbl->sci[i].cj4_ind_start; j4sci[i].cj4_ind_end; j4++) + { + for(p=0; pcj4[j4].imei[0].imask >> p) & 1; + } + } + fprintf(fp,"%4d %3d %3d\n", + i, + nsp, + nbl->sci[i].cj4_ind_end-nbl->sci[i].cj4_ind_start); + } + + fclose(fp); +} + +/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp */ +static void low_get_nbl_exclusions(nbnxn_pairlist_t *nbl,int cj4, + int warp,nbnxn_excl_t **excl) +{ + if (nbl->cj4[cj4].imei[warp].excl_ind == 0) + { + /* No exclusions set, make a new list entry */ + nbl->cj4[cj4].imei[warp].excl_ind = nbl->nexcl; + nbl->nexcl++; + *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind]; + set_no_excls(*excl); + } + else + { + /* We already have some exclusions, new ones can be added to the list */ + *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind]; + } +} + +/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp, + * allocates extra memory, if necessary. + */ +static void get_nbl_exclusions_1(nbnxn_pairlist_t *nbl,int cj4, + int warp,nbnxn_excl_t **excl) +{ + if (nbl->cj4[cj4].imei[warp].excl_ind == 0) + { + /* We need to make a new list entry, check if we have space */ + check_excl_space(nbl,1); + } + low_get_nbl_exclusions(nbl,cj4,warp,excl); +} + +/* Returns pointers to the exclusion mask for cj4-unit cj4 for both warps, + * allocates extra memory, if necessary. + */ +static void get_nbl_exclusions_2(nbnxn_pairlist_t *nbl,int cj4, + nbnxn_excl_t **excl_w0, + nbnxn_excl_t **excl_w1) +{ + /* Check for space we might need */ + check_excl_space(nbl,2); + + low_get_nbl_exclusions(nbl,cj4,0,excl_w0); + low_get_nbl_exclusions(nbl,cj4,1,excl_w1); +} + +/* Sets the self exclusions i=j and pair exclusions i>j */ +static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl, + int cj4_ind,int sj_offset, + int si) +{ + nbnxn_excl_t *excl[2]; + int ei,ej,w; + + /* Here we only set the set self and double pair exclusions */ + + get_nbl_exclusions_2(nbl,cj4_ind,&excl[0],&excl[1]); + + /* Only minor < major bits set */ + for(ej=0; ejna_ci; ej++) + { + w = (ej>>2); + for(ei=ej; eina_ci; ei++) + { + excl[w]->pair[(ej&(4-1))*nbl->na_ci+ei] &= + ~(1U << (sj_offset*GPU_NSUBCELL+si)); + } + } +} + +/* Returns a diagonal or off-diagonal interaction mask for plain C lists */ +static unsigned int get_imask(gmx_bool rdiag,int ci,int cj) +{ + return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL); +} + +#ifdef NBNXN_SEARCH_SSE +/* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists */ +static unsigned int get_imask_x86_simd128(gmx_bool rdiag,int ci,int cj) +{ +#ifndef GMX_DOUBLE /* cj-size = 4 */ + return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL); +#else /* cj-size = 2 */ + return (rdiag && ci*2 == cj ? NBNXN_INT_MASK_DIAG_J2_0 : + (rdiag && ci*2+1 == cj ? NBNXN_INT_MASK_DIAG_J2_1 : + NBNXN_INT_MASK_ALL)); +#endif +} + +#ifdef GMX_X86_AVX_256 +/* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists */ +static unsigned int get_imask_x86_simd256(gmx_bool rdiag,int ci,int cj) +{ +#ifndef GMX_DOUBLE /* cj-size = 8 */ + return (rdiag && ci == cj*2 ? NBNXN_INT_MASK_DIAG_J8_0 : + (rdiag && ci == cj*2+1 ? NBNXN_INT_MASK_DIAG_J8_1 : + NBNXN_INT_MASK_ALL)); +#else /* cj-size = 2 */ + return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL); +#endif +} +#endif +#endif /* NBNXN_SEARCH_SSE */ + +/* Plain C code for making a pair list of cell ci vs cell cjf-cjl. 
+ * Checks bounding box distances and possibly atom pair distances. + */ +static void make_cluster_list_simple(const nbnxn_grid_t *gridj, + nbnxn_pairlist_t *nbl, + int ci,int cjf,int cjl, + gmx_bool remove_sub_diag, + const real *x_j, + real rl2,float rbb2, + int *ndistc) +{ + const nbnxn_list_work_t *work; + + const float *bb_ci; + const real *x_ci; + + gmx_bool InRange; + real d2; + int cjf_gl,cjl_gl,cj; + + work = nbl->work; + + bb_ci = nbl->work->bb_ci; + x_ci = nbl->work->x_ci; + + InRange = FALSE; + while (!InRange && cjf <= cjl) + { + d2 = subc_bb_dist2(0,bb_ci,cjf,gridj->bb); + *ndistc += 2; + + /* Check if the distance is within the distance where + * we use only the bounding box distance rbb, + * or within the cut-off and there is at least one atom pair + * within the cut-off. + */ + if (d2 < rbb2) + { + InRange = TRUE; + } + else if (d2 < rl2) + { + int i,j; + + cjf_gl = gridj->cell0 + cjf; + for(i=0; i cjf) + { + d2 = subc_bb_dist2(0,bb_ci,cjl,gridj->bb); + *ndistc += 2; + + /* Check if the distance is within the distance where + * we use only the bounding box distance rbb, + * or within the cut-off and there is at least one atom pair + * within the cut-off. + */ + if (d2 < rbb2) + { + InRange = TRUE; + } + else if (d2 < rl2) + { + int i,j; + + cjl_gl = gridj->cell0 + cjl; + for(i=0; icj[nbl->ncj].cj = gridj->cell0 + cj; + nbl->cj[nbl->ncj].excl = get_imask(remove_sub_diag,ci,cj); + nbl->ncj++; + } + /* Increase the closing index in i super-cell list */ + nbl->ci[nbl->nci].cj_ind_end = nbl->ncj; + } +} + +#ifdef NBNXN_SEARCH_SSE +/* Include make_cluster_list_x86_simd128/256 */ +#define GMX_MM128_HERE +#include "gmx_x86_simd_macros.h" +#define STRIDE_S PACK_X4 +#include "nbnxn_search_x86_simd.h" +#undef STRIDE_S +#undef GMX_MM128_HERE +#ifdef GMX_X86_AVX_256 +/* Include make_cluster_list_x86_simd128/256 */ +#define GMX_MM256_HERE +#include "gmx_x86_simd_macros.h" +#define STRIDE_S GMX_X86_SIMD_WIDTH_HERE +#include "nbnxn_search_x86_simd.h" +#undef STRIDE_S +#undef GMX_MM256_HERE +#endif +#endif + +/* Plain C or SSE code for making a pair list of super-cell sci vs scj. + * Checks bounding box distances and possibly atom pair distances. 
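+ */
+
+/* An illustrative sketch, not part of the patch: the two-level distance
+ * check used by the list-building functions below, reduced to two
+ * clusters of na atoms whose coordinates are already shifted into the
+ * same periodic image. All names are local to this sketch; d2_bb is the
+ * bounding-box distance^2 computed beforehand.
+ */
+static int clusters_in_range_sketch(float d2_bb,
+                                    const float (*xi)[3],
+                                    const float (*xj)[3],
+                                    int na,float rbb2,float rl2)
+{
+    int i,j;
+
+    if (d2_bb < rbb2)
+    {
+        /* Boxes so close that an atom pair within range is very likely */
+        return 1;
+    }
+    if (d2_bb >= rl2)
+    {
+        /* Boxes further apart than the pair-list cut-off */
+        return 0;
+    }
+    /* In between: pay for the na*na atom-pair distance check */
+    for(i=0; i<na; i++)
+    {
+        for(j=0; j<na; j++)
+        {
+            float dx = xi[i][0] - xj[j][0];
+            float dy = xi[i][1] - xj[j][1];
+            float dz = xi[i][2] - xj[j][2];
+
+            if (dx*dx + dy*dy + dz*dz < rl2)
+            {
+                return 1;
+            }
+        }
+    }
+    return 0;
+}
+
+/* Plain C or SSE code for making a pair list of super-cell sci vs scj.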
+ */ +static void make_cluster_list(const nbnxn_search_t nbs, + const nbnxn_grid_t *gridi, + const nbnxn_grid_t *gridj, + nbnxn_pairlist_t *nbl, + int sci,int scj, + gmx_bool sci_equals_scj, + int stride,const real *x, + real rl2,float rbb2, + int *ndistc) +{ + int na_c; + int npair; + int cjo,ci1,ci,cj,cj_gl; + int cj4_ind,cj_offset; + unsigned imask; + nbnxn_cj4_t *cj4; + const float *bb_ci; + const real *x_ci; + float *d2l,d2; + int w; +#define PRUNE_LIST_CPU_ONE +#ifdef PRUNE_LIST_CPU_ONE + int ci_last=-1; +#endif + + d2l = nbl->work->d2; + + bb_ci = nbl->work->bb_ci; + x_ci = nbl->work->x_ci; + + na_c = gridj->na_c; + + for(cjo=0; cjonsubc[scj]; cjo++) + { + cj4_ind = (nbl->work->cj_ind >> 2); + cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE; + cj4 = &nbl->cj4[cj4_ind]; + + cj = scj*GPU_NSUBCELL + cjo; + + cj_gl = gridj->cell0*GPU_NSUBCELL + cj; + + /* Initialize this j-subcell i-subcell list */ + cj4->cj[cj_offset] = cj_gl; + imask = 0; + + if (sci_equals_scj) + { + ci1 = cjo + 1; + } + else + { + ci1 = gridi->nsubc[sci]; + } + +#ifdef NBNXN_BBXXXX + /* Determine all ci1 bb distances in one call with SSE */ + subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>SSE_F_WIDTH_2LOG)*NNBSBB_XXXX+(cj & (SSE_F_WIDTH-1)), + ci1,bb_ci,d2l); + *ndistc += na_c*2; +#endif + + npair = 0; + for(ci=0; cibb); + *ndistc += 2; +#endif + d2 = d2l[ci]; + +#ifdef PRUNE_LIST_CPU_ALL + /* Check if the distance is within the distance where + * we use only the bounding box distance rbb, + * or within the cut-off and there is at least one atom pair + * within the cut-off. This check is very costly. + */ + *ndistc += na_c*na_c; + if (d2 < rbb2 || + (d2 < rl2 && nbs->subc_dc(na_c,ci,x_ci,cj_gl,stride,x,rl2))) +#else + /* Check if the distance between the two bounding boxes + * in within the pair-list cut-off. + */ + if (d2 < rl2) +#endif + { + /* Flag this i-subcell to be taken into account */ + imask |= (1U << (cj_offset*GPU_NSUBCELL+ci)); + +#ifdef PRUNE_LIST_CPU_ONE + ci_last = ci; +#endif + + npair++; + } + } + +#ifdef PRUNE_LIST_CPU_ONE + /* If we only found 1 pair, check if any atoms are actually + * within the cut-off, so we could get rid of it. + */ + if (npair == 1 && d2l[ci_last] >= rbb2) + { + if (!nbs->subc_dc(na_c,ci_last,x_ci,cj_gl,stride,x,rl2)) + { + imask &= ~(1U << (cj_offset*GPU_NSUBCELL+ci_last)); + npair--; + } + } +#endif + + if (npair > 0) + { + /* We have a useful sj entry, close it now */ + + /* Set the exclucions for the ci== sj entry. + * Here we don't bother to check if this entry is actually flagged, + * as it will nearly always be in the list. 
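+             * (each (cj_offset,ci) pair maps to one bit of the 32-bit
+             * imask: with GPU_NSUBCELL = 8 and 4 j-clusters per cj4,
+             * the pair cj_offset = 2, ci = 5 is bit 2*8 + 5 = 21).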
+ */ + if (sci_equals_scj) + { + set_self_and_newton_excls_supersub(nbl,cj4_ind,cj_offset,cjo); + } + + /* Copy the cluster interaction mask to the list */ + for(w=0; wimei[w].imask |= imask; + } + + nbl->work->cj_ind++; + + /* Keep the count */ + nbl->nci_tot += npair; + + /* Increase the closing index in i super-cell list */ + nbl->sci[nbl->nsci].cj4_ind_end = ((nbl->work->cj_ind+4-1)>>2); + } + } +} + +/* Set all atom-pair exclusions from the topology stored in excl + * as masks in the pair-list for simple list i-entry nbl_ci + */ +static void set_ci_top_excls(const nbnxn_search_t nbs, + nbnxn_pairlist_t *nbl, + gmx_bool diagRemoved, + int na_ci_2log, + int na_cj_2log, + const nbnxn_ci_t *nbl_ci, + const t_blocka *excl) +{ + const int *cell; + int ci; + int cj_ind_first,cj_ind_last; + int cj_first,cj_last; + int ndirect; + int i,ai,aj,si,eind,ge,se; + int found,cj_ind_0,cj_ind_1,cj_ind_m; + int cj_m; + gmx_bool Found_si; + int si_ind; + nbnxn_excl_t *nbl_excl; + int inner_i,inner_e; + + cell = nbs->cell; + + if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start) + { + /* Empty list */ + return; + } + + ci = nbl_ci->ci; + + cj_ind_first = nbl_ci->cj_ind_start; + cj_ind_last = nbl->ncj - 1; + + cj_first = nbl->cj[cj_ind_first].cj; + cj_last = nbl->cj[cj_ind_last].cj; + + /* Determine how many contiguous j-cells we have starting + * from the first i-cell. This number can be used to directly + * calculate j-cell indices for excluded atoms. + */ + ndirect = 0; + if (na_ci_2log == na_cj_2log) + { + while (cj_ind_first + ndirect <= cj_ind_last && + nbl->cj[cj_ind_first+ndirect].cj == ci + ndirect) + { + ndirect++; + } + } +#ifdef NBNXN_SEARCH_SSE + else + { + while (cj_ind_first + ndirect <= cj_ind_last && + nbl->cj[cj_ind_first+ndirect].cj == ci_to_cj(na_cj_2log,ci) + ndirect) + { + ndirect++; + } + } +#endif + + /* Loop over the atoms in the i super-cell */ + for(i=0; ina_sc; i++) + { + ai = nbs->a[ci*nbl->na_sc+i]; + if (ai >= 0) + { + si = (i>>na_ci_2log); + + /* Loop over the topology-based exclusions for this i-atom */ + for(eind=excl->index[ai]; eindindex[ai+1]; eind++) + { + aj = excl->a[eind]; + + if (aj == ai) + { + /* The self exclusion are already set, save some time */ + continue; + } + + ge = cell[aj]; + + /* Without shifts we only calculate interactions j>i + * for one-way pair-lists. + */ + if (diagRemoved && ge <= ci*nbl->na_sc + i) + { + continue; + } + + se = (ge >> na_cj_2log); + + /* Could the cluster se be in our list? 
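+                 * (the cj entries of an i-entry are stored in increasing
+                 * cluster order, so past the ndirect prefix a standard
+                 * bisection over cj_ind_0..cj_ind_1 locates se in
+                 * O(log n) steps).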
*/ + if (se >= cj_first && se <= cj_last) + { + if (se < cj_first + ndirect) + { + /* We can calculate cj_ind directly from se */ + found = cj_ind_first + se - cj_first; + } + else + { + /* Search for se using bisection */ + found = -1; + cj_ind_0 = cj_ind_first + ndirect; + cj_ind_1 = cj_ind_last + 1; + while (found == -1 && cj_ind_0 < cj_ind_1) + { + cj_ind_m = (cj_ind_0 + cj_ind_1)>>1; + + cj_m = nbl->cj[cj_ind_m].cj; + + if (se == cj_m) + { + found = cj_ind_m; + } + else if (se < cj_m) + { + cj_ind_1 = cj_ind_m; + } + else + { + cj_ind_0 = cj_ind_m + 1; + } + } + } + + if (found >= 0) + { + inner_i = i - (si << na_ci_2log); + inner_e = ge - (se << na_cj_2log); + + nbl->cj[found].excl &= ~(1U<<((inner_i<cell; + + na_c = nbl->na_ci; + + if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start) + { + /* Empty list */ + return; + } + + sci = nbl_sci->sci; + + cj_ind_first = nbl_sci->cj4_ind_start*NBNXN_GPU_JGROUP_SIZE; + cj_ind_last = nbl->work->cj_ind - 1; + + cj_first = nbl->cj4[nbl_sci->cj4_ind_start].cj[0]; + cj_last = nbl_cj(nbl,cj_ind_last); + + /* Determine how many contiguous j-clusters we have starting + * from the first i-cluster. This number can be used to directly + * calculate j-cluster indices for excluded atoms. + */ + ndirect = 0; + while (cj_ind_first + ndirect <= cj_ind_last && + nbl_cj(nbl,cj_ind_first+ndirect) == sci*GPU_NSUBCELL + ndirect) + { + ndirect++; + } + + /* Loop over the atoms in the i super-cell */ + for(i=0; ina_sc; i++) + { + ai = nbs->a[sci*nbl->na_sc+i]; + if (ai >= 0) + { + si = (i>>na_c_2log); + + /* Loop over the topology-based exclusions for this i-atom */ + for(eind=excl->index[ai]; eindindex[ai+1]; eind++) + { + aj = excl->a[eind]; + + if (aj == ai) + { + /* The self exclusion are already set, save some time */ + continue; + } + + ge = cell[aj]; + + /* Without shifts we only calculate interactions j>i + * for one-way pair-lists. + */ + if (diagRemoved && ge <= sci*nbl->na_sc + i) + { + continue; + } + + se = ge>>na_c_2log; + /* Could the cluster se be in our list? 
*/ + if (se >= cj_first && se <= cj_last) + { + if (se < cj_first + ndirect) + { + /* We can calculate cj_ind directly from se */ + found = cj_ind_first + se - cj_first; + } + else + { + /* Search for se using bisection */ + found = -1; + cj_ind_0 = cj_ind_first + ndirect; + cj_ind_1 = cj_ind_last + 1; + while (found == -1 && cj_ind_0 < cj_ind_1) + { + cj_ind_m = (cj_ind_0 + cj_ind_1)>>1; + + cj_m = nbl_cj(nbl,cj_ind_m); + + if (se == cj_m) + { + found = cj_ind_m; + } + else if (se < cj_m) + { + cj_ind_1 = cj_ind_m; + } + else + { + cj_ind_0 = cj_ind_m + 1; + } + } + } + + if (found >= 0) + { + inner_i = i - si*na_c; + inner_e = ge - se*na_c; + +/* Macro for getting the index of atom a within a cluster */ +#define AMODI(a) ((a) & (NBNXN_CPU_CLUSTER_I_SIZE - 1)) +/* Macro for converting an atom number to a cluster number */ +#define A2CI(a) ((a) >> NBNXN_CPU_CLUSTER_I_SIZE_2LOG) + + if (nbl_imask0(nbl,found) & (1U << (AMODI(found)*GPU_NSUBCELL + si))) + { + w = (inner_e >> 2); + + get_nbl_exclusions_1(nbl,A2CI(found),w,&nbl_excl); + + nbl_excl->pair[AMODI(inner_e)*nbl->na_ci+inner_i] &= + ~(1U << (AMODI(found)*GPU_NSUBCELL + si)); + } + +#undef AMODI +#undef A2CI + } + } + } + } + } +} + +/* Reallocate the simple ci list for at least n entries */ +static void nb_realloc_ci(nbnxn_pairlist_t *nbl,int n) +{ + nbl->ci_nalloc = over_alloc_small(n); + nb_realloc_void((void **)&nbl->ci, + nbl->nci*sizeof(*nbl->ci), + nbl->ci_nalloc*sizeof(*nbl->ci), + nbl->alloc,nbl->free); +} + +/* Reallocate the super-cell sci list for at least n entries */ +static void nb_realloc_sci(nbnxn_pairlist_t *nbl,int n) +{ + nbl->sci_nalloc = over_alloc_small(n); + nb_realloc_void((void **)&nbl->sci, + nbl->nsci*sizeof(*nbl->sci), + nbl->sci_nalloc*sizeof(*nbl->sci), + nbl->alloc,nbl->free); +} + +/* Make a new ci entry at index nbl->nci */ +static void new_ci_entry(nbnxn_pairlist_t *nbl,int ci,int shift,int flags, + nbnxn_list_work_t *work) +{ + if (nbl->nci + 1 > nbl->ci_nalloc) + { + nb_realloc_ci(nbl,nbl->nci+1); + } + nbl->ci[nbl->nci].ci = ci; + nbl->ci[nbl->nci].shift = shift; + /* Store the interaction flags along with the shift */ + nbl->ci[nbl->nci].shift |= flags; + nbl->ci[nbl->nci].cj_ind_start = nbl->ncj; + nbl->ci[nbl->nci].cj_ind_end = nbl->ncj; +} + +/* Make a new sci entry at index nbl->nsci */ +static void new_sci_entry(nbnxn_pairlist_t *nbl,int sci,int shift,int flags, + nbnxn_list_work_t *work) +{ + if (nbl->nsci + 1 > nbl->sci_nalloc) + { + nb_realloc_sci(nbl,nbl->nsci+1); + } + nbl->sci[nbl->nsci].sci = sci; + nbl->sci[nbl->nsci].shift = shift; + nbl->sci[nbl->nsci].cj4_ind_start = nbl->ncj4; + nbl->sci[nbl->nsci].cj4_ind_end = nbl->ncj4; +} + +/* Sort the simple j-list cj on exclusions. + * Entries with exclusions will all be sorted to the beginning of the list. 
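+ */
+
+/* An illustrative sketch, not part of the patch: the same two-pass
+ * stable partition on a plain array, where "has exclusions" is modeled
+ * as value != mask_all. All names are local to this sketch and tmp must
+ * hold at least n elements; both groups keep their relative order.
+ */
+static void partition_excl_first_sketch(unsigned int *v,int n,
+                                        unsigned int mask_all,
+                                        unsigned int *tmp)
+{
+    int k,j;
+
+    k = 0;
+    /* First pass: entries with exclusions go to the front */
+    for(j=0; j<n; j++)
+    {
+        if (v[j] != mask_all)
+        {
+            tmp[k++] = v[j];
+        }
+    }
+    /* Second pass: fully interacting entries follow */
+    for(j=0; j<n; j++)
+    {
+        if (v[j] == mask_all)
+        {
+            tmp[k++] = v[j];
+        }
+    }
+    for(j=0; j<n; j++)
+    {
+        v[j] = tmp[j];
+    }
+}
+
+/* sort_cj_excl, as used when closing a simple list i-entry: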
+ */
+static void sort_cj_excl(nbnxn_cj_t *cj,int ncj,
+                         nbnxn_list_work_t *work)
+{
+    int jnew,j;
+
+    if (ncj > work->cj_nalloc)
+    {
+        work->cj_nalloc = over_alloc_large(ncj);
+        srenew(work->cj,work->cj_nalloc);
+    }
+
+    /* Make a list of the j-cells involving exclusions */
+    jnew = 0;
+    for(j=0; j<ncj; j++)
+    {
+        if (cj[j].excl != NBNXN_INT_MASK_ALL)
+        {
+            work->cj[jnew++] = cj[j];
+        }
+    }
+    /* Check if there are exclusions at all or not just the first entry */
+    if (!((jnew == 0) ||
+          (jnew == 1 && cj[0].excl != NBNXN_INT_MASK_ALL)))
+    {
+        for(j=0; j<ncj; j++)
+        {
+            if (cj[j].excl == NBNXN_INT_MASK_ALL)
+            {
+                work->cj[jnew++] = cj[j];
+            }
+        }
+        for(j=0; j<ncj; j++)
+        {
+            cj[j] = work->cj[j];
+        }
+    }
+}
+
+/* Close this simple list i entry */
+static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
+{
+    int jlen;
+
+    /* All content of the new ci entry has already been filled correctly,
+     * we only need to increase the count here (for non-empty lists).
+     */
+    jlen = nbl->ci[nbl->nci].cj_ind_end - nbl->ci[nbl->nci].cj_ind_start;
+    if (jlen > 0)
+    {
+        sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start,jlen,nbl->work);
+
+        if (nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0))
+        {
+            nbl->work->ncj_hlj += jlen;
+        }
+        else if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
+        {
+            nbl->work->ncj_noq += jlen;
+        }
+
+        nbl->nci++;
+    }
+}
+
+/* Split sci entry for load balancing on the GPU.
+ * As we only know the current count on our own thread,
+ * we will need to estimate the current total amount of i-entries.
+ * As the lists get concatenated later, this estimate depends
+ * both on nthread and our own thread index thread.
+ */
+static void split_sci_entry(nbnxn_pairlist_t *nbl,
+                            int nsp_max_av,gmx_bool progBal,int nc_bal,
+                            int thread,int nthread)
+{
+    int nsci_est;
+    int nsp_max;
+    int cj4_start,cj4_end,j4len,cj4;
+    int sci;
+    int nsp,nsp_sci,nsp_cj4,nsp_cj4_e,nsp_cj4_p;
+    int p;
+
+    /* Estimate the total number of ci's of the nblist combined
+     * over all threads using the target number of ci's.
+     */
+    nsci_est = nc_bal*thread/nthread + nbl->nsci;
+    if (progBal)
+    {
+        /* The first ci blocks should be larger, to avoid overhead.
+         * The last ci blocks should be smaller, to improve load balancing.
+         */
+        nsp_max = max(1,
+                      nsp_max_av*nc_bal*3/(2*(nsci_est - 1 + nc_bal)));
+    }
+    else
+    {
+        nsp_max = nsp_max_av;
+    }
+
+    cj4_start = nbl->sci[nbl->nsci-1].cj4_ind_start;
+    cj4_end   = nbl->sci[nbl->nsci-1].cj4_ind_end;
+    j4len = cj4_end - cj4_start;
+
+    if (j4len > 1 && j4len*GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE > nsp_max)
+    {
+        /* Remove the last ci entry and process the cj4's again */
+        nbl->nsci -= 1;
+
+        sci       = nbl->nsci;
+        cj4       = cj4_start;
+        nsp       = 0;
+        nsp_sci   = 0;
+        nsp_cj4_e = 0;
+        nsp_cj4   = 0;
+        while (cj4 < cj4_end)
+        {
+            nsp_cj4_p = nsp_cj4;
+            nsp_cj4   = 0;
+            for(p=0; p<NBNXN_GPU_JGROUP_SIZE*GPU_NSUBCELL; p++)
+            {
+                nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
+            }
+            nsp += nsp_cj4;
+
+            if (nsp > nsp_max && nsp > nsp_cj4)
+            {
+                nbl->sci[sci].cj4_ind_end = cj4;
+                sci++;
+                nbl->nsci++;
+                if (nbl->nsci+1 > nbl->sci_nalloc)
+                {
+                    nb_realloc_sci(nbl,nbl->nsci+1);
+                }
+                nbl->sci[sci].sci           = nbl->sci[nbl->nsci-1].sci;
+                nbl->sci[sci].shift         = nbl->sci[nbl->nsci-1].shift;
+                nbl->sci[sci].cj4_ind_start = cj4;
+                nsp_sci   = nsp - nsp_cj4;
+                nsp_cj4_e = nsp_cj4_p;
+                nsp       = nsp_cj4;
+            }
+
+            cj4++;
+        }
+
+        /* Put the remaining cj4's in a new ci entry */
+        nbl->sci[sci].cj4_ind_end = cj4_end;
+
+        /* Possibly balance out the last two ci's
+         * by moving the last cj4 of the second last ci.
+ */ + if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e) + { + nbl->sci[sci-1].cj4_ind_end--; + nbl->sci[sci].cj4_ind_start--; + } + + sci++; + nbl->nsci++; + } +} + +/* Clost this super/sub list i entry */ +static void close_ci_entry_supersub(nbnxn_pairlist_t *nbl, + int nsp_max_av, + gmx_bool progBal,int nc_bal, + int thread,int nthread) +{ + int j4len,tlen; + int nb,b; + + /* All content of the new ci entry have already been filled correctly, + * we only need to increase the count here (for non empty lists). + */ + j4len = nbl->sci[nbl->nsci].cj4_ind_end - nbl->sci[nbl->nsci].cj4_ind_start; + if (j4len > 0) + { + /* We can only have complete blocks of 4 j-entries in a list, + * so round the count up before closing. + */ + nbl->ncj4 = ((nbl->work->cj_ind + 4-1) >> 2); + nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE; + + nbl->nsci++; + + if (nsp_max_av > 0) + { + split_sci_entry(nbl,nsp_max_av,progBal,nc_bal,thread,nthread); + } + } +} + +/* Syncs the working array before adding another grid pair to the list */ +static void sync_work(nbnxn_pairlist_t *nbl) +{ + if (!nbl->bSimple) + { + nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE; + nbl->work->cj4_init = nbl->ncj4; + } +} + +/* Clears an nbnxn_pairlist_t data structure */ +static void clear_pairlist(nbnxn_pairlist_t *nbl) +{ + nbl->nci = 0; + nbl->nsci = 0; + nbl->ncj = 0; + nbl->ncj4 = 0; + nbl->nci_tot = 0; + nbl->nexcl = 1; + + nbl->work->ncj_noq = 0; + nbl->work->ncj_hlj = 0; +} + +/* Sets a simple list i-cell bounding box, including PBC shift */ +static void set_icell_bb_simple(const float *bb,int ci, + real shx,real shy,real shz, + float *bb_ci) +{ + int ia; + + ia = ci*NNBSBB_B; + bb_ci[BBL_X] = bb[ia+BBL_X] + shx; + bb_ci[BBL_Y] = bb[ia+BBL_Y] + shy; + bb_ci[BBL_Z] = bb[ia+BBL_Z] + shz; + bb_ci[BBU_X] = bb[ia+BBU_X] + shx; + bb_ci[BBU_Y] = bb[ia+BBU_Y] + shy; + bb_ci[BBU_Z] = bb[ia+BBU_Z] + shz; +} + +/* Sets a super-cell and sub cell bounding boxes, including PBC shift */ +static void set_icell_bb_supersub(const float *bb,int ci, + real shx,real shy,real shz, + float *bb_ci) +{ + int ia,m,i; + +#ifdef NBNXN_BBXXXX + ia = ci*(GPU_NSUBCELL>>SSE_F_WIDTH_2LOG)*NNBSBB_XXXX; + for(m=0; m<(GPU_NSUBCELL>>SSE_F_WIDTH_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX) + { + for(i=0; ix_ci[i*STRIDE_XYZ+XX] = x[(ia+i)*stride+XX] + shx; + work->x_ci[i*STRIDE_XYZ+YY] = x[(ia+i)*stride+YY] + shy; + work->x_ci[i*STRIDE_XYZ+ZZ] = x[(ia+i)*stride+ZZ] + shz; + } +} + +/* Copies PBC shifted super-cell atom coordinates x,y,z to working array */ +static void icell_set_x_supersub(int ci, + real shx,real shy,real shz, + int na_c, + int stride,const real *x, + nbnxn_list_work_t *work) +{ + int ia,i; + real *x_ci; + + x_ci = work->x_ci; + + ia = ci*GPU_NSUBCELL*na_c; + for(i=0; ix_ci; + + for(si=0; sin; z++) + { + if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1) + { + cl = 0; + ca = 1; + za = 1; + for(d=0; dshift[z][d] == 0) + { + cl += 0.5*ls[d]; + ca *= ls[d]; + za *= zones->size[z].x1[d] - zones->size[z].x0[d]; + } + } + + /* 4 octants of a sphere */ + vold_est = 0.25*M_PI*r*r*r*r; + /* 4 quarter pie slices on the edges */ + vold_est += 4*cl*M_PI/6.0*r*r*r; + /* One rectangular volume on a face */ + vold_est += ca*0.5*r*r; + + vol2_est_tot += vold_est*za; + } + } + + return vol2_est_tot; +} + +/* Estimates the average size of a full j-list for super/sub setup */ +static int get_nsubpair_max(const nbnxn_search_t nbs, + int iloc, + real rlist, + int min_ci_balanced) +{ + const nbnxn_grid_t *grid; + rvec ls; + real 
xy_diag2,r_eff_sup,vol_est,nsp_est,nsp_est_nl; + int nsubpair_max; + + grid = &nbs->grid[0]; + + ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*GPU_NSUBCELL_X); + ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*GPU_NSUBCELL_Y); + ls[ZZ] = (grid->c1[ZZ] - grid->c0[ZZ])*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z); + + /* The average squared length of the diagonal of a sub cell */ + xy_diag2 = ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ]; + + /* The formulas below are a heuristic estimate of the average nsj per si*/ + r_eff_sup = rlist + nbnxn_rlist_inc_nonloc_fac*sqr((grid->na_c - 1.0)/grid->na_c)*sqrt(xy_diag2/3); + + if (!nbs->DomDec || nbs->zones->n == 1) + { + nsp_est_nl = 0; + } + else + { + nsp_est_nl = + sqr(grid->atom_density/grid->na_c)* + nonlocal_vol2(nbs->zones,ls,r_eff_sup); + } + + if (LOCAL_I(iloc)) + { + /* Sub-cell interacts with itself */ + vol_est = ls[XX]*ls[YY]*ls[ZZ]; + /* 6/2 rectangular volume on the faces */ + vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup; + /* 12/2 quarter pie slices on the edges */ + vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*sqr(r_eff_sup); + /* 4 octants of a sphere */ + vol_est += 0.5*4.0/3.0*M_PI*pow(r_eff_sup,3); + + nsp_est = grid->nsubc_tot*vol_est*grid->atom_density/grid->na_c; + + /* Subtract the non-local pair count */ + nsp_est -= nsp_est_nl; + + if (debug) + { + fprintf(debug,"nsp_est local %5.1f non-local %5.1f\n", + nsp_est,nsp_est_nl); + } + } + else + { + nsp_est = nsp_est_nl; + } + + if (min_ci_balanced <= 0 || grid->nc >= min_ci_balanced || grid->nc == 0) + { + /* We don't need to worry */ + nsubpair_max = -1; + } + else + { + /* Thus the (average) maximum j-list size should be as follows */ + nsubpair_max = max(1,(int)(nsp_est/min_ci_balanced+0.5)); + + /* Since the target value is a maximum (this avoid high outliers, + * which lead to load imbalance), not average, we get more lists + * than we ask for (to compensate we need to add GPU_NSUBCELL*4/4). + * But more importantly, the optimal GPU performance moves + * to lower number of block for very small blocks. + * To compensate we add the maximum pair count per cj4. 
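+         * For example, nsp_est = 20000 with min_ci_balanced = 500
+         * gives nsubpair_max = 40 + 8*4 = 72.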
+ */ + nsubpair_max += GPU_NSUBCELL*NBNXN_CPU_CLUSTER_I_SIZE; + } + + if (debug) + { + fprintf(debug,"nbl nsp estimate %.1f, nsubpair_max %d\n", + nsp_est,nsubpair_max); + } + + return nsubpair_max; +} + +/* Debug list print function */ +static void print_nblist_ci_cj(FILE *fp,const nbnxn_pairlist_t *nbl) +{ + int i,j; + + for(i=0; inci; i++) + { + fprintf(fp,"ci %4d shift %2d ncj %3d\n", + nbl->ci[i].ci,nbl->ci[i].shift, + nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start); + + for(j=nbl->ci[i].cj_ind_start; jci[i].cj_ind_end; j++) + { + fprintf(fp," cj %5d imask %x\n", + nbl->cj[j].cj, + nbl->cj[j].excl); + } + } +} + +/* Debug list print function */ +static void print_nblist_sci_cj(FILE *fp,const nbnxn_pairlist_t *nbl) +{ + int i,j4,j; + + for(i=0; insci; i++) + { + fprintf(fp,"ci %4d shift %2d ncj4 %2d\n", + nbl->sci[i].sci,nbl->sci[i].shift, + nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start); + + for(j4=nbl->sci[i].cj4_ind_start; j4sci[i].cj4_ind_end; j4++) + { + for(j=0; j<4; j++) + { + fprintf(fp," sj %5d imask %x\n", + nbl->cj4[j4].cj[j], + nbl->cj4[j4].imei[0].imask); + } + } + } +} + +/* Combine pair lists *nbl generated on multiple threads nblc */ +static void combine_nblists(int nnbl,nbnxn_pairlist_t **nbl, + nbnxn_pairlist_t *nblc) +{ + int nsci,ncj4,nexcl; + int n,i; + + if (nblc->bSimple) + { + gmx_incons("combine_nblists does not support simple lists"); + } + + nsci = nblc->nsci; + ncj4 = nblc->ncj4; + nexcl = nblc->nexcl; + for(i=0; insci; + ncj4 += nbl[i]->ncj4; + nexcl += nbl[i]->nexcl; + } + + if (nsci > nblc->sci_nalloc) + { + nb_realloc_sci(nblc,nsci); + } + if (ncj4 > nblc->cj4_nalloc) + { + nblc->cj4_nalloc = over_alloc_small(ncj4); + nb_realloc_void((void **)&nblc->cj4, + nblc->ncj4*sizeof(*nblc->cj4), + nblc->cj4_nalloc*sizeof(*nblc->cj4), + nblc->alloc,nblc->free); + } + if (nexcl > nblc->excl_nalloc) + { + nblc->excl_nalloc = over_alloc_small(nexcl); + nb_realloc_void((void **)&nblc->excl, + nblc->nexcl*sizeof(*nblc->excl), + nblc->excl_nalloc*sizeof(*nblc->excl), + nblc->alloc,nblc->free); + } + + /* Each thread should copy its own data to the combined arrays, + * as otherwise data will go back and forth between different caches. 
+ */
+#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
+    for(n=0; n<nnbl; n++)
+    {
+        int sci_offset;
+        int cj4_offset;
+        int ci_offset;
+        int excl_offset;
+        int i,j4;
+        const nbnxn_pairlist_t *nbli;
+
+        /* Determine the offset in the combined data for our thread */
+        sci_offset  = nblc->nsci;
+        cj4_offset  = nblc->ncj4;
+        ci_offset   = nblc->nci_tot;
+        excl_offset = nblc->nexcl;
+
+        for(i=0; i<n; i++)
+        {
+            sci_offset  += nbl[i]->nsci;
+            cj4_offset  += nbl[i]->ncj4;
+            ci_offset   += nbl[i]->nci_tot;
+            excl_offset += nbl[i]->nexcl;
+        }
+
+        nbli = nbl[n];
+
+        for(i=0; i<nbli->nsci; i++)
+        {
+            nblc->sci[sci_offset+i]                = nbli->sci[i];
+            nblc->sci[sci_offset+i].cj4_ind_start += cj4_offset;
+            nblc->sci[sci_offset+i].cj4_ind_end   += cj4_offset;
+        }
+
+        for(j4=0; j4<nbli->ncj4; j4++)
+        {
+            nblc->cj4[cj4_offset+j4]                   = nbli->cj4[j4];
+            nblc->cj4[cj4_offset+j4].imei[0].excl_ind += excl_offset;
+            nblc->cj4[cj4_offset+j4].imei[1].excl_ind += excl_offset;
+        }
+
+        for(j4=0; j4<nbli->nexcl; j4++)
+        {
+            nblc->excl[excl_offset+j4] = nbli->excl[j4];
+        }
+    }
+
+    for(n=0; n<nnbl; n++)
+    {
+        nblc->nsci    += nbl[n]->nsci;
+        nblc->ncj4    += nbl[n]->ncj4;
+        nblc->nci_tot += nbl[n]->nci_tot;
+        nblc->nexcl   += nbl[n]->nexcl;
+    }
+}
+
+/* Returns the next ci to be processed by our thread */
+static gmx_bool next_ci(const nbnxn_grid_t *grid,
+                        int conv,
+                        int nth,int ci_block,
+                        int *ci_x,int *ci_y,
+                        int *ci_b,int *ci)
+{
+    (*ci_b)++;
+    (*ci)++;
+
+    if (*ci_b == ci_block)
+    {
+        /* Jump to the next block assigned to this task */
+        *ci  += (nth - 1)*ci_block;
+        *ci_b = 0;
+    }
+
+    if (*ci >= grid->nc*conv)
+    {
+        return FALSE;
+    }
+
+    while (*ci >= grid->cxy_ind[*ci_x*grid->ncy + *ci_y + 1]*conv)
+    {
+        *ci_y += 1;
+        if (*ci_y == grid->ncy)
+        {
+            *ci_x += 1;
+            *ci_y  = 0;
+        }
+    }
+
+    return TRUE;
+}
+
+/* Returns the distance^2 for which we put cell pairs in the list
+ * without checking atom pair distances. This is usually < rlist^2.
+ */
+static float boundingbox_only_distance2(const nbnxn_grid_t *gridi,
+                                        const nbnxn_grid_t *gridj,
+                                        real rlist,
+                                        gmx_bool simple)
+{
+    /* If the distance between two sub-cell bounding boxes is less
+     * than this distance, do not check the distance between
+     * all particle pairs in the sub-cell, since then it is likely
+     * that the box pair has atom pairs within the cut-off.
+     * We use the nblist cut-off minus 0.5 times the average x/y diagonal
+     * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
+     * Using more than 0.5 gains at most 0.5%.
+     * If forces are calculated more than twice, the performance gain
+     * in the force calculation outweighs the cost of checking.
+     * Note that with sub-cell lists, the atom-pair distance check
+     * is only performed when only 1 out of 8 sub-cells is within range,
+     * this is because the GPU is much faster than the CPU.
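+     * For example, simple lists with rlist = 1.0 and cell sizes
+     * sx = sy = 0.6 give rbb2 = (1.0 - 0.5*sqrt(0.6^2 + 0.6^2))^2,
+     * about 0.33, so box pairs closer than sqrt(0.33) = 0.58 are
+     * accepted without any atom-pair check.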
+ real bbx,bby; + real rbb2; + + bbx = 0.5*(gridi->sx + gridj->sx); + bby = 0.5*(gridi->sy + gridj->sy); + if (!simple) + { + bbx /= GPU_NSUBCELL_X; + bby /= GPU_NSUBCELL_Y; + } + + rbb2 = sqr(max(0,rlist - 0.5*sqrt(bbx*bbx + bby*bby))); + +#ifndef GMX_DOUBLE + return rbb2; +#else + return (float)((1+GMX_FLOAT_EPS)*rbb2); +#endif +} + +/* Generates the part of pair-list nbl assigned to our thread */ +static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs, + const nbnxn_grid_t *gridi, + const nbnxn_grid_t *gridj, + nbnxn_search_work_t *work, + const nbnxn_atomdata_t *nbat, + const t_blocka *excl, + real rlist, + int nb_kernel_type, + int nsubpair_max, + gmx_bool progBal, + int min_ci_balanced, + int th,int nth, + nbnxn_pairlist_t *nbl) +{ + int na_cj_2log; + matrix box; + real rl2; + float rbb2; + int d; + int ci_block,ci_b,ci,ci_x,ci_y,ci_xy,cj; + ivec shp; + int tx,ty,tz; + int shift; + gmx_bool bMakeList; + real shx,shy,shz; + int conv_i,cell0_i; + const float *bb_i,*bbcz_i,*bbcz_j; + const int *flags_i; + real bx0,bx1,by0,by1,bz0,bz1; + real bz1_frac; + real d2cx,d2z,d2z_cx,d2z_cy,d2zx,d2zxy,d2xy; + int cxf,cxl,cyf,cyf_x,cyl; + int cx,cy; + int c0,c1,cs,cf,cl; + int ndistc; + int ncpcheck; + + nbs_cycle_start(&work->cc[enbsCCsearch]); + + if (gridj->bSimple != nbl->bSimple) + { + gmx_incons("Grid incompatible with pair-list"); + } + + sync_work(nbl); + + nbl->na_sc = gridj->na_sc; + nbl->na_ci = gridj->na_c; + nbl->na_cj = kernel_to_cj_size(nb_kernel_type); + na_cj_2log = get_2log(nbl->na_cj); + + nbl->rlist = rlist; + + copy_mat(nbs->box,box); + + rl2 = nbl->rlist*nbl->rlist; + + rbb2 = boundingbox_only_distance2(gridi,gridj,nbl->rlist,nbl->bSimple); + + if (debug) + { + fprintf(debug,"nbl bounding box only distance %f\n",sqrt(rbb2)); + } + + /* Set the shift range */ + for(d=0; d<DIM; d++) + { + /* Check if we need periodicity shifts. + * Without PBC or with domain decomposition we don't need them. + */ + if (d >= ePBC2npbcdim(nbs->ePBC) || nbs->dd_dim[d]) + { + shp[d] = 0; + } + else + { + if (d == XX && + box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < sqrt(rl2)) + { + shp[d] = 2; + } + else + { + shp[d] = 1; + } + } + } + + if (nbl->bSimple && !gridi->bSimple) + { + conv_i = gridi->na_sc/gridj->na_sc; + bb_i = gridi->bb_simple; + bbcz_i = gridi->bbcz_simple; + flags_i = gridi->flags_simple; + } + else + { + conv_i = 1; + bb_i = gridi->bb; + bbcz_i = gridi->bbcz; + flags_i = gridi->flags; + } + cell0_i = gridi->cell0*conv_i; + + bbcz_j = gridj->bbcz; + + if (conv_i == 1) + { +#define CI_BLOCK_ENUM 5 +#define CI_BLOCK_DENOM 11 + /* Here we decide how to distribute the blocks over the threads. + * We use prime numbers to try to avoid that the grid size becomes + * a multiple of the number of threads, which would lead to some + * threads getting "inner" pairs and others getting boundary pairs, + * which in turn will lead to load imbalance between threads. + * Set the block size as 5/11/ntask times the average number of cells + * in a y,z slab. This should ensure a quite uniform distribution + * of the grid parts of the different threads along all three grid + * zone boundaries with 3D domain decomposition. At the same time + * the blocks will not become too small. + */ + ci_block = (gridi->nc*CI_BLOCK_ENUM)/(CI_BLOCK_DENOM*gridi->ncx*nth); + + /* Ensure the blocks are not too small: avoids cache invalidation */ + if (ci_block*gridi->na_sc < 16) + { + ci_block = (16 + gridi->na_sc - 1)/gridi->na_sc; + } + + /* Without domain decomposition + * or with fewer than 3 blocks per task, divide into nth blocks.
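To make the block schedule concrete, here is a small standalone program that reproduces the next_ci()/ci_block round-robin iteration for made-up values of the thread count, block size and cell count:

#include <stdio.h>

/* Demo of the round-robin block schedule from next_ci(): thread th
 * starts at cell th*B, processes B consecutive cells, then jumps
 * ahead by (nth-1)*B to its next block, until the grid is exhausted.
 */
int main(void)
{
    const int nth = 3,B = 4,nc = 24; /* made-up values */
    int th,ci,ci_b;

    for(th=0; th<nth; th++)
    {
        printf("thread %d:",th);
        ci   = th*B - 1;
        ci_b = -1;
        for(;;)
        {
            ci_b++;
            ci++;
            if (ci_b == B)
            {
                ci  += (nth - 1)*B;
                ci_b = 0;
            }
            if (ci >= nc)
            {
                break;
            }
            printf(" %d",ci);
        }
        printf("\n");
    }

    return 0;
}

With these values thread 0 prints cells 0-3 and 12-15, thread 1 prints 4-7 and 16-19, and thread 2 prints 8-11 and 20-23, so every cell is visited exactly once.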
+ */ + if (!nbs->DomDec || ci_block*3*nth > gridi->nc) + { + ci_block = (gridi->nc + nth - 1)/nth; + } + } + else + { + /* Blocks of the conversion factor - 1 give a large repeat count + * combined with a small block size. This should result in good + * load balancing for both small and large domains. + */ + ci_block = conv_i - 1; + } + if (debug) + { + fprintf(debug,"nbl nc_i %d col.av. %.1f ci_block %d\n", + gridi->nc,gridi->nc/(double)(gridi->ncx*gridi->ncy),ci_block); + } + + ndistc = 0; + ncpcheck = 0; + + ci_b = -1; + ci = th*ci_block - 1; + ci_x = 0; + ci_y = 0; + while (next_ci(gridi,conv_i,nth,ci_block,&ci_x,&ci_y,&ci_b,&ci)) + { + if (nbl->bSimple && flags_i[ci] == 0) + { + continue; + } + + d2cx = 0; + if (gridj != gridi && shp[XX] == 0) + { + if (nbl->bSimple) + { + bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX]; + } + else + { + bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx; + } + if (bx1 < gridj->c0[XX]) + { + d2cx = sqr(gridj->c0[XX] - bx1); + + if (d2cx >= rl2) + { + continue; + } + } + } + + ci_xy = ci_x*gridi->ncy + ci_y; + + /* Loop over shift vectors in three dimensions */ + for (tz=-shp[ZZ]; tz<=shp[ZZ]; tz++) + { + shz = tz*box[ZZ][ZZ]; + + bz0 = bbcz_i[ci*NNBSBB_D ] + shz; + bz1 = bbcz_i[ci*NNBSBB_D+1] + shz; + + if (tz == 0) + { + d2z = 0; + } + else if (tz < 0) + { + d2z = sqr(bz1); + } + else + { + d2z = sqr(bz0 - box[ZZ][ZZ]); + } + + d2z_cx = d2z + d2cx; + + if (d2z_cx >= rl2) + { + continue; + } + + bz1_frac = + bz1/((real)(gridi->cxy_ind[ci_xy+1] - gridi->cxy_ind[ci_xy])); + if (bz1_frac < 0) + { + bz1_frac = 0; + } + /* The check with bz1_frac close to or larger than 1 comes later */ + + for (ty=-shp[YY]; ty<=shp[YY]; ty++) + { + shy = ty*box[YY][YY] + tz*box[ZZ][YY]; + + if (nbl->bSimple) + { + by0 = bb_i[ci*NNBSBB_B +YY] + shy; + by1 = bb_i[ci*NNBSBB_B+NNBSBB_C+YY] + shy; + } + else + { + by0 = gridi->c0[YY] + (ci_y )*gridi->sy + shy; + by1 = gridi->c0[YY] + (ci_y+1)*gridi->sy + shy; + } + + get_cell_range(by0,by1, + gridj->ncy,gridj->c0[YY],gridj->sy,gridj->inv_sy, + d2z_cx,rl2, + &cyf,&cyl); + + if (cyf > cyl) + { + continue; + } + + d2z_cy = d2z; + if (by1 < gridj->c0[YY]) + { + d2z_cy += sqr(gridj->c0[YY] - by1); + } + else if (by0 > gridj->c1[YY]) + { + d2z_cy += sqr(by0 - gridj->c1[YY]); + } + + for (tx=-shp[XX]; tx<=shp[XX]; tx++) + { + shift = XYZ2IS(tx,ty,tz); + +#ifdef NBNXN_SHIFT_BACKWARD + if (gridi == gridj && shift > CENTRAL) + { + continue; + } +#endif + + shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX]; + + if (nbl->bSimple) + { + bx0 = bb_i[ci*NNBSBB_B +XX] + shx; + bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX] + shx; + } + else + { + bx0 = gridi->c0[XX] + (ci_x )*gridi->sx + shx; + bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx + shx; + } + + get_cell_range(bx0,bx1, + gridj->ncx,gridj->c0[XX],gridj->sx,gridj->inv_sx, + d2z_cy,rl2, + &cxf,&cxl); + + if (cxf > cxl) + { + continue; + } + + if (nbl->bSimple) + { + new_ci_entry(nbl,cell0_i+ci,shift,flags_i[ci], + nbl->work); + } + else + { + new_sci_entry(nbl,cell0_i+ci,shift,flags_i[ci], + nbl->work); + } + +#ifndef NBNXN_SHIFT_BACKWARD + if (cxf < ci_x) +#else + if (shift == CENTRAL && gridi == gridj && + cxf < ci_x) +#endif + { + /* Leave the pairs with i > j. + * x is the major index, so skip half of it. 
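The d2z, d2z_cx and d2z_cy partial sums above build up a lower bound on the cell-pair distance one dimension at a time, so whole slabs can be rejected early. The idea in isolation, as a self-contained sketch over axis-aligned bounds (names made up):

/* Sketch of the dimension-wise distance pruning used above: grow a
 * lower bound on the pair distance^2 axis by axis and give up as
 * soon as it reaches the list cut-off.
 */
static int boxes_within_range(const float lo_i[3],const float hi_i[3],
                              const float lo_j[3],const float hi_j[3],
                              float rl2)
{
    float d2 = 0;
    int   d;

    for(d=0; d<3; d++)
    {
        float delta = 0;

        if (lo_j[d] > hi_i[d])
        {
            delta = lo_j[d] - hi_i[d];
        }
        else if (hi_j[d] < lo_i[d])
        {
            delta = lo_i[d] - hi_j[d];
        }
        d2 += delta*delta;
        if (d2 >= rl2)
        {
            return 0; /* all pairs in these boxes are out of range */
        }
    }

    return 1;
}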
+ */ + cxf = ci_x; + } + + if (nbl->bSimple) + { + set_icell_bb_simple(bb_i,ci,shx,shy,shz, + nbl->work->bb_ci); + } + else + { + set_icell_bb_supersub(bb_i,ci,shx,shy,shz, + nbl->work->bb_ci); + } + + nbs->icell_set_x(cell0_i+ci,shx,shy,shz, + gridi->na_c,nbat->xstride,nbat->x, + nbl->work); + + for(cx=cxf; cx<=cxl; cx++) + { + d2zx = d2z; + if (gridj->c0[XX] + cx*gridj->sx > bx1) + { + d2zx += sqr(gridj->c0[XX] + cx*gridj->sx - bx1); + } + else if (gridj->c0[XX] + (cx+1)*gridj->sx < bx0) + { + d2zx += sqr(gridj->c0[XX] + (cx+1)*gridj->sx - bx0); + } + +#ifndef NBNXN_SHIFT_BACKWARD + if (gridi == gridj && + cx == 0 && cyf < ci_y) +#else + if (gridi == gridj && + cx == 0 && shift == CENTRAL && cyf < ci_y) +#endif + { + /* Leave the pairs with i > j. + * Skip half of y when i and j have the same x. + */ + cyf_x = ci_y; + } + else + { + cyf_x = cyf; + } + + for(cy=cyf_x; cy<=cyl; cy++) + { + c0 = gridj->cxy_ind[cx*gridj->ncy+cy]; + c1 = gridj->cxy_ind[cx*gridj->ncy+cy+1]; +#ifdef NBNXN_SHIFT_BACKWARD + if (gridi == gridj && + shift == CENTRAL && c0 < ci) + { + c0 = ci; + } +#endif + + d2zxy = d2zx; + if (gridj->c0[YY] + cy*gridj->sy > by1) + { + d2zxy += sqr(gridj->c0[YY] + cy*gridj->sy - by1); + } + else if (gridj->c0[YY] + (cy+1)*gridj->sy < by0) + { + d2zxy += sqr(gridj->c0[YY] + (cy+1)*gridj->sy - by0); + } + if (c1 > c0 && d2zxy < rl2) + { + cs = c0 + (int)(bz1_frac*(c1 - c0)); + if (cs >= c1) + { + cs = c1 - 1; + } + + d2xy = d2zxy - d2z; + + /* Find the lowest cell that can possibly + * be within range. + */ + cf = cs; + while(cf > c0 && + (bbcz_j[cf*NNBSBB_D+1] >= bz0 || + d2xy + sqr(bbcz_j[cf*NNBSBB_D+1] - bz0) < rl2)) + { + cf--; + } + + /* Find the highest cell that can possibly + * be within range. + */ + cl = cs; + while(cl < c1-1 && + (bbcz_j[cl*NNBSBB_D] <= bz1 || + d2xy + sqr(bbcz_j[cl*NNBSBB_D] - bz1) < rl2)) + { + cl++; + } + +#ifdef NBNXN_REFCODE + { + /* Simple reference code */ + int k; + cf = c1; + cl = -1; + for(k=c0; k cl) + { + cl = k; + } + } + } +#endif + + if (gridi == gridj) + { + /* We want each atom/cell pair only once, + * only use cj >= ci. 
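Because the cells of one x,y grid column are stacked in z, the loop above seeds a guess cell cs from the fractional z position (bz1_frac) and widens the candidate range [cf,cl] by scanning outwards. In outline, assuming two z bounds per cell in bbcz (as NNBSBB_D suggests) and made-up helper names:

/* Outline of the seeded z-range scan above: starting from the guess
 * cell cs, lower cf and raise cl while the cells' z extents can
 * still be within the cut-off, given the x,y contribution d2xy.
 */
static float dist2_z(float z,float ref,float d2xy)
{
    float d = z - ref;

    return d2xy + d*d;
}

static void find_z_cell_range(const float *bbcz,int c0,int c1,int cs,
                              float bz0,float bz1,float d2xy,float rl2,
                              int *cf,int *cl)
{
    *cf = cs;
    while (*cf > c0 &&
           (bbcz[(*cf)*2+1] >= bz0 ||
            dist2_z(bbcz[(*cf)*2+1],bz0,d2xy) < rl2))
    {
        (*cf)--;
    }

    *cl = cs;
    while (*cl < c1-1 &&
           (bbcz[(*cl)*2] <= bz1 ||
            dist2_z(bbcz[(*cl)*2],bz1,d2xy) < rl2))
    {
        (*cl)++;
    }
}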
+ */ +#ifndef NBNXN_SHIFT_BACKWARD + cf = max(cf,ci); +#else + if (shift == CENTRAL) + { + cf = max(cf,ci); + } +#endif + } + + if (cf <= cl) + { + switch (nb_kernel_type) + { + case nbk4x4_PlainC: + check_subcell_list_space_simple(nbl,cl-cf+1); + + make_cluster_list_simple(gridj, + nbl,ci,cf,cl, + (gridi == gridj && shift == CENTRAL), + nbat->x, + rl2,rbb2, + &ndistc); + break; +#ifdef NBNXN_SEARCH_SSE + case nbk4xN_X86_SIMD128: + check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2); + make_cluster_list_x86_simd128(gridj, + nbl,ci,cf,cl, + (gridi == gridj && shift == CENTRAL), + nbat->x, + rl2,rbb2, + &ndistc); + break; +#ifdef GMX_X86_AVX_256 + case nbk4xN_X86_SIMD256: + check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2); + make_cluster_list_x86_simd256(gridj, + nbl,ci,cf,cl, + (gridi == gridj && shift == CENTRAL), + nbat->x, + rl2,rbb2, + &ndistc); + break; +#endif +#endif + case nbk8x8x8_PlainC: + case nbk8x8x8_CUDA: + check_subcell_list_space_supersub(nbl,cl-cf+1); + for(cj=cf; cj<=cl; cj++) + { + make_cluster_list(nbs,gridi,gridj, + nbl,ci,cj, + (gridi == gridj && shift == CENTRAL && ci == cj), + nbat->xstride,nbat->x, + rl2,rbb2, + &ndistc); + } + break; + } + ncpcheck += cl - cf + 1; + } + } + } + } + + /* Set the exclusions for this ci list */ + if (nbl->bSimple) + { + set_ci_top_excls(nbs, + nbl, + shift == CENTRAL && gridi == gridj, + gridj->na_c_2log, + na_cj_2log, + &(nbl->ci[nbl->nci]), + excl); + } + else + { + set_sci_top_excls(nbs, + nbl, + shift == CENTRAL && gridi == gridj, + gridj->na_c_2log, + &(nbl->sci[nbl->nsci]), + excl); + } + + /* Close this ci list */ + if (nbl->bSimple) + { + close_ci_entry_simple(nbl); + } + else + { + close_ci_entry_supersub(nbl, + nsubpair_max, + progBal,min_ci_balanced, + th,nth); + } + } + } + } + } + + work->ndistc = ndistc; + + nbs_cycle_stop(&work->cc[enbsCCsearch]); + + if (debug) + { + fprintf(debug,"number of distance checks %d\n",ndistc); + fprintf(debug,"ncpcheck %s %d\n",gridi==gridj ? 
"local" : "non-local", + ncpcheck); + + if (nbl->bSimple) + { + print_nblist_statistics_simple(debug,nbl,nbs,rlist); + } + else + { + print_nblist_statistics_supersub(debug,nbl,nbs,rlist); + } + + } +} + +/* Make a local or non-local pair-list, depending on iloc */ +void nbnxn_make_pairlist(const nbnxn_search_t nbs, + const nbnxn_atomdata_t *nbat, + const t_blocka *excl, + real rlist, + int min_ci_balanced, + nbnxn_pairlist_set_t *nbl_list, + int iloc, + int nb_kernel_type, + t_nrnb *nrnb) +{ + const nbnxn_grid_t *gridi,*gridj; + int nzi,zi,zj0,zj1,zj; + int nsubpair_max; + int nth,th; + int nnbl; + nbnxn_pairlist_t **nbl; + gmx_bool CombineNBLists; + int np_tot,np_noq,np_hlj,nap; + + nnbl = nbl_list->nnbl; + nbl = nbl_list->nbl; + CombineNBLists = nbl_list->bCombined; + + if (debug) + { + fprintf(debug,"ns making %d nblists\n", nnbl); + } + + if (nbl_list->bSimple) + { + switch (nb_kernel_type) + { +#ifdef NBNXN_SEARCH_SSE + case nbk4xN_X86_SIMD128: + nbs->icell_set_x = icell_set_x_x86_simd128; + break; +#ifdef GMX_X86_AVX_256 + case nbk4xN_X86_SIMD256: + nbs->icell_set_x = icell_set_x_x86_simd256; + break; +#endif +#endif + default: + nbs->icell_set_x = icell_set_x_simple; + break; + } + } + else + { +#ifdef NBNXN_SEARCH_SSE + nbs->icell_set_x = icell_set_x_supersub_sse8; +#else + nbs->icell_set_x = icell_set_x_supersub; +#endif + } + + if (LOCAL_I(iloc)) + { + /* Only zone (grid) 0 vs 0 */ + nzi = 1; + zj0 = 0; + zj1 = 1; + } + else + { + nzi = nbs->zones->nizone; + } + + if (!nbl_list->bSimple && min_ci_balanced > 0) + { + nsubpair_max = get_nsubpair_max(nbs,iloc,rlist,min_ci_balanced); + } + else + { + nsubpair_max = 0; + } + + /* Clear all pair-lists */ + for(th=0; thgrid[zi]; + + if (NONLOCAL_I(iloc)) + { + zj0 = nbs->zones->izone[zi].j0; + zj1 = nbs->zones->izone[zi].j1; + if (zi == 0) + { + zj0++; + } + } + for(zj=zj0; zjgrid[zj]; + + if (debug) + { + fprintf(debug,"ns search grid %d vs %d\n",zi,zj); + } + + nbs_cycle_start(&nbs->cc[enbsCCsearch]); + +#pragma omp parallel for num_threads(nnbl) schedule(static) + for(th=0; th 0) + { + clear_pairlist(nbl[th]); + } + + /* Divide the i super cell equally over the nblists */ + nbnxn_make_pairlist_part(nbs,gridi,gridj, + &nbs->work[th],nbat,excl, + rlist, + nb_kernel_type, + nsubpair_max, + (LOCAL_I(iloc) || nbs->zones->n <= 2), + min_ci_balanced, + th,nnbl, + nbl[th]); + } + nbs_cycle_stop(&nbs->cc[enbsCCsearch]); + + np_tot = 0; + np_noq = 0; + np_hlj = 0; + for(th=0; thwork[th].ndistc); + + if (nbl_list->bSimple) + { + np_tot += nbl[th]->ncj; + np_noq += nbl[th]->work->ncj_noq; + np_hlj += nbl[th]->work->ncj_hlj; + } + else + { + /* This count ignores potential subsequent pair pruning */ + np_tot += nbl[th]->nci_tot; + } + } + nap = nbl[0]->na_ci*nbl[0]->na_cj; + nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2; + nbl_list->natpair_lj = np_noq*nap; + nbl_list->natpair_q = np_hlj*nap/2; + + if (CombineNBLists && nnbl > 1) + { + nbs_cycle_start(&nbs->cc[enbsCCcombine]); + + combine_nblists(nnbl-1,nbl+1,nbl[0]); + + nbs_cycle_stop(&nbs->cc[enbsCCcombine]); + } + + } + } + + /* + print_supersub_nsp("nsubpair",nbl[0],iloc); + */ + + /* Special performance logging stuff (env.var. 
+ if (LOCAL_I(iloc)) + { + nbs->search_count++; + } + if (nbs->print_cycles && + (!nbs->DomDec || (nbs->DomDec && !LOCAL_I(iloc))) && + nbs->search_count % 100 == 0) + { + nbs_cycle_print(stderr,nbs); + } + + if (debug && (CombineNBLists && nnbl > 1)) + { + if (nbl[0]->bSimple) + { + print_nblist_statistics_simple(debug,nbl[0],nbs,rlist); + } + else + { + print_nblist_statistics_supersub(debug,nbl[0],nbs,rlist); + } + } + + if (gmx_debug_at) + { + if (nbl[0]->bSimple) + { + print_nblist_ci_cj(debug,nbl[0]); + } + else + { + print_nblist_sci_cj(debug,nbl[0]); + } + } +} + +/* Initializes an nbnxn_atomdata_output_t data structure */ +static void nbnxn_atomdata_output_init(nbnxn_atomdata_output_t *out, + int nb_kernel_type, + int nenergrp,int stride, + gmx_nbat_alloc_t *ma) +{ + int cj_size; + + out->f = NULL; + ma((void **)&out->fshift,SHIFTS*DIM*sizeof(*out->fshift)); + out->nV = nenergrp*nenergrp; + ma((void **)&out->Vvdw,out->nV*sizeof(*out->Vvdw)); + ma((void **)&out->Vc ,out->nV*sizeof(*out->Vc )); + + if (nb_kernel_type == nbk4xN_X86_SIMD128 || + nb_kernel_type == nbk4xN_X86_SIMD256) + { + cj_size = kernel_to_cj_size(nb_kernel_type); + out->nVS = nenergrp*nenergrp*stride*(cj_size>>1)*cj_size; + ma((void **)&out->VSvdw,out->nVS*sizeof(*out->VSvdw)); + ma((void **)&out->VSc ,out->nVS*sizeof(*out->VSc )); + } + else + { + out->nVS = 0; + } +} + +/* Determines the combination rule (or none) to be used, stores it, + * and sets the LJ parameters required with the rule. + */ +static void set_combination_rule_data(nbnxn_atomdata_t *nbat) +{ + int nt,i,j; + real c6,c12; + + nt = nbat->ntype; + + switch (nbat->comb_rule) + { + case ljcrGEOM: + nbat->comb_rule = ljcrGEOM; + + for(i=0; i<nt; i++) + { + nbat->nbfp_comb[i*2 ] = sqrt(nbat->nbfp[(i*nt+i)*2 ]); + nbat->nbfp_comb[i*2+1] = sqrt(nbat->nbfp[(i*nt+i)*2+1]); + } + break; + case ljcrLB: + for(i=0; i<nt; i++) + { + c6 = nbat->nbfp[(i*nt+i)*2 ]; + c12 = nbat->nbfp[(i*nt+i)*2+1]; + if (c6 > 0 && c12 > 0) + { + /* We store 0.5*2^1/6*sigma and sqrt(4*3*eps), + * so we get 6*C6 and 12*C12 after combining. + */
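The stored quantities can be verified directly: with a_i = 0.5*2^(1/6)*sigma_i and b_i = sqrt(12*eps_i), Lorentz-Berthelot combining gives (a_i+a_j)^6*b_i*b_j = 6*C6_ij and (a_i+a_j)^12*b_i*b_j = 12*C12_ij. A tiny standalone check with made-up sigma/epsilon values:

#include <math.h>
#include <stdio.h>

int main(void)
{
    double si = 0.31,ei = 0.65,sj = 0.25,ej = 0.19; /* made-up */
    double sij = 0.5*(si + sj),eij = sqrt(ei*ej);   /* LB rule  */
    double c6  = 4*eij*pow(sij,6),c12 = 4*eij*pow(sij,12);
    double ai  = 0.5*pow(2,1.0/6.0)*si,aj = 0.5*pow(2,1.0/6.0)*sj;
    double bi  = sqrt(12*ei),bj = sqrt(12*ej);

    /* both lines should print two identical numbers */
    printf("6*C6  : %g vs %g\n",6*c6,  pow(ai+aj,6)*bi*bj);
    printf("12*C12: %g vs %g\n",12*c12,pow(ai+aj,12)*bi*bj);

    return 0;
}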
+ nbat->nbfp_comb[i*2 ] = 0.5*pow(c12/c6,1.0/6.0); + nbat->nbfp_comb[i*2+1] = sqrt(c6*c6/c12); + } + else + { + nbat->nbfp_comb[i*2 ] = 0; + nbat->nbfp_comb[i*2+1] = 0; + } + } + break; + case ljcrNONE: + /* In nbfp_s4 we use a stride of 4 for storing two parameters */ + nbat->alloc((void **)&nbat->nbfp_s4,nt*nt*4*sizeof(*nbat->nbfp_s4)); + for(i=0; i<nt; i++) + { + for(j=0; j<nt; j++) + { + nbat->nbfp_s4[(i*nt+j)*4+0] = nbat->nbfp[(i*nt+j)*2+0]; + nbat->nbfp_s4[(i*nt+j)*4+1] = nbat->nbfp[(i*nt+j)*2+1]; + nbat->nbfp_s4[(i*nt+j)*4+2] = 0; + nbat->nbfp_s4[(i*nt+j)*4+3] = 0; + } + } + break; + default: + gmx_incons("Unknown combination rule"); + break; + } +} + +/* Initializes an nbnxn_atomdata_t data structure */ +void nbnxn_atomdata_init(FILE *fp, + nbnxn_atomdata_t *nbat, + int nb_kernel_type, + int ntype,const real *nbfp, + int n_energygroups, + int nout, + gmx_nbat_alloc_t *alloc, + gmx_nbat_free_t *free) +{ + int i,j; + real c6,c12,tol; + char *ptr; + gmx_bool simple,bCombGeom,bCombLB; + + if (alloc == NULL) + { + nbat->alloc = nbnxn_alloc_aligned; + } + else + { + nbat->alloc = alloc; + } + if (free == NULL) + { + nbat->free = nbnxn_free_aligned; + } + else + { + nbat->free = free; + } + + if (debug) + { + fprintf(debug,"There are %d atom types in the system, adding one for nbnxn_atomdata_t\n",ntype); + } + nbat->ntype = ntype + 1; + nbat->alloc((void **)&nbat->nbfp, + nbat->ntype*nbat->ntype*2*sizeof(*nbat->nbfp)); + nbat->alloc((void **)&nbat->nbfp_comb,nbat->ntype*2*sizeof(*nbat->nbfp_comb)); + + /* A tolerance of 1e-5 seems reasonable for (possibly hand-typed) + * force-field floating point parameters. + */ + tol = 1e-5; + ptr = getenv("GMX_LJCOMB_TOL"); + if (ptr != NULL) + { + double dbl; + + sscanf(ptr,"%lf",&dbl); + tol = dbl; + } + bCombGeom = TRUE; + bCombLB = TRUE; + + /* Temporarily fill nbat->nbfp_comb with sigma and epsilon + * to check for the LB rule. + */ + for(i=0; i<ntype; i++) + { + c6 = nbfp[(i*ntype+i)*2 ]; + c12 = nbfp[(i*ntype+i)*2+1]; + if (c6 > 0 && c12 > 0) + { + nbat->nbfp_comb[i*2 ] = pow(c12/c6,1.0/6.0); + nbat->nbfp_comb[i*2+1] = 0.25*c6*c6/c12; + } + else if (c6 == 0 && c12 == 0) + { + nbat->nbfp_comb[i*2 ] = 0; + nbat->nbfp_comb[i*2+1] = 0; + } + else + { + /* Can not use LB rule with only dispersion or repulsion */ + bCombLB = FALSE; + } + } + + for(i=0; i<nbat->ntype; i++) + { + for(j=0; j<nbat->ntype; j++) + { + if (i < ntype && j < ntype) + { + /* We store the prefactor in the derivative of the potential + * in the parameter to avoid multiplications in the inner loop. + */ + c6 = nbfp[(i*ntype+j)*2 ]; + c12 = nbfp[(i*ntype+j)*2+1]; + nbat->nbfp[(i*nbat->ntype+j)*2 ] = 6.0*c6; + nbat->nbfp[(i*nbat->ntype+j)*2+1] = 12.0*c12; + + bCombGeom = bCombGeom && + gmx_within_tol(c6*c6 ,nbfp[(i*ntype+i)*2 ]*nbfp[(j*ntype+j)*2 ],tol) && + gmx_within_tol(c12*c12,nbfp[(i*ntype+i)*2+1]*nbfp[(j*ntype+j)*2+1],tol); + + bCombLB = bCombLB && + ((c6 == 0 && c12 == 0 && + (nbat->nbfp_comb[i*2+1] == 0 || nbat->nbfp_comb[j*2+1] == 0)) || + (c6 > 0 && c12 > 0 && + gmx_within_tol(pow(c12/c6,1.0/6.0),0.5*(nbat->nbfp_comb[i*2]+nbat->nbfp_comb[j*2]),tol) && + gmx_within_tol(0.25*c6*c6/c12,sqrt(nbat->nbfp_comb[i*2+1]*nbat->nbfp_comb[j*2+1]),tol))); + } + else + { + /* Add zero parameters for the additional dummy atom type */ + nbat->nbfp[(i*nbat->ntype+j)*2 ] = 0; + nbat->nbfp[(i*nbat->ntype+j)*2+1] = 0; + } + } + } + if (debug) + { + fprintf(debug,"Combination rules: geometric %d Lorentz-Berthelot %d\n", + bCombGeom,bCombLB); + } + + simple = nbnxn_kernel_pairlist_simple(nb_kernel_type); + + if (simple) + { + /* We prefer the geometric combination rule, + * as that gives a slightly faster kernel than the LB rule. + */
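The bCombGeom test above reduces to the identity c6_ij^2 = c6_ii*c6_jj (and likewise for c12). A sketch of the same check on a bare C6 matrix, with a simple relative tolerance standing in for gmx_within_tol(), whose exact definition is not shown here:

#include <math.h>

/* Sketch of the geometric combination-rule test: the rule holds iff
 * c6_ij = sqrt(c6_ii*c6_jj), i.e. c6_ij^2 == c6_ii*c6_jj, for every
 * pair of atom types (within tolerance).
 */
static int matches_geometric_rule(const double *c6,int nt,double tol)
{
    int i,j;

    for(i=0; i<nt; i++)
    {
        for(j=0; j<nt; j++)
        {
            double lhs = c6[i*nt+j]*c6[i*nt+j];
            double rhs = c6[i*nt+i]*c6[j*nt+j];

            if (fabs(lhs - rhs) > tol*fmax(fabs(lhs),fabs(rhs)))
            {
                return 0;
            }
        }
    }

    return 1;
}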
+ if (bCombGeom) + { + nbat->comb_rule = ljcrGEOM; + } + else if (bCombLB) + { + nbat->comb_rule = ljcrLB; + } + else + { + nbat->comb_rule = ljcrNONE; + + nbat->free(nbat->nbfp_comb); + } + + if (fp) + { + if (nbat->comb_rule == ljcrNONE) + { + fprintf(fp,"Using full Lennard-Jones parameter combination matrix\n\n"); + } + else + { + fprintf(fp,"Using %s Lennard-Jones combination rule\n\n", + nbat->comb_rule==ljcrGEOM ? "geometric" : "Lorentz-Berthelot"); + } + } + + set_combination_rule_data(nbat); + } + else + { + nbat->comb_rule = ljcrNONE; + + nbat->free(nbat->nbfp_comb); + } + + nbat->natoms = 0; + nbat->type = NULL; + nbat->lj_comb = NULL; + if (simple) + { + switch (nb_kernel_type) + { + case nbk4xN_X86_SIMD128: + nbat->XFormat = nbatX4; + break; + case nbk4xN_X86_SIMD256: +#ifndef GMX_DOUBLE + nbat->XFormat = nbatX8; +#else + nbat->XFormat = nbatX4; +#endif + break; + default: + nbat->XFormat = nbatXYZ; + break; + } + + nbat->FFormat = nbat->XFormat; + } + else + { + nbat->XFormat = nbatXYZQ; + nbat->FFormat = nbatXYZ; + } + nbat->q = NULL; + nbat->nenergrp = n_energygroups; + if (!simple) + { + /* Energy groups not supported yet for super-sub lists */ + nbat->nenergrp = 1; + } + /* Temporary storage goes as #grp^3*8 reals, so limit to 64 */ + if (nbat->nenergrp > 64) + { + gmx_fatal(FARGS,"With NxN kernels not more than 64 energy groups are supported\n"); + } + nbat->neg_2log = 1; + while (nbat->nenergrp > (1<<nbat->neg_2log)) + { + nbat->neg_2log++; + } + nbat->energrp = NULL; + nbat->alloc((void **)&nbat->shift_vec,SHIFTS*sizeof(*nbat->shift_vec)); + nbat->xstride = (nbat->XFormat == nbatXYZQ ? STRIDE_XYZQ : DIM); + nbat->fstride = (nbat->FFormat == nbatXYZQ ? STRIDE_XYZQ : DIM); + nbat->x = NULL; + nbat->nout = nout; + snew(nbat->out,nbat->nout); + nbat->nalloc = 0; + for(i=0; i<nbat->nout; i++) + { + nbnxn_atomdata_output_init(&nbat->out[i], + nb_kernel_type, + nbat->nenergrp,1<<nbat->neg_2log, + nbat->alloc); + } +} + +static void copy_lj_to_nbat_lj_comb_x4(const real *ljparam_type, + const int *type,int na, + real *ljparam_at) +{ + int is,k,i; + + /* The LJ params follow the combination rule: + * copy the params for the type array to the atom array.
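In outline, the x4 variant stores the two per-atom parameters cluster-wise rather than atom-wise, so a SIMD kernel can fetch four like parameters with one aligned load. A sketch of that layout with an illustrative pack size of 4 (the real stride constants live in the nbnxn headers):

/* Sketch of cluster-packed parameter storage: per pack of 4 atoms,
 * first the four first-parameters, then the four second-parameters.
 * na is assumed to be padded to a multiple of PACK.
 */
#define PACK 4
static void pack_params_x4(const float *par_of_type, /* 2 per type */
                           const int *type,int na,float *packed)
{
    int is,k;

    for(is=0; is<na; is+=PACK)
    {
        for(k=0; k<PACK; k++)
        {
            packed[is*2+k]      = par_of_type[type[is+k]*2];
            packed[is*2+PACK+k] = par_of_type[type[is+k]*2+1];
        }
    }
}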
+ */ + for(is=0; isgrid[g]; + + /* Loop over all columns and copy and fill */ + for(i=0; incx*grid->ncy; i++) + { + ncz = grid->cxy_ind[i+1] - grid->cxy_ind[i]; + ash = (grid->cell0 + grid->cxy_ind[i])*grid->na_sc; + + copy_int_to_nbat_int(nbs->a+ash,grid->cxy_na[i],ncz*grid->na_sc, + type,nbat->ntype-1,nbat->type+ash); + + if (nbat->comb_rule != ljcrNONE) + { + if (nbat->XFormat == nbatX4) + { + copy_lj_to_nbat_lj_comb_x4(nbat->nbfp_comb, + nbat->type+ash,ncz*grid->na_sc, + nbat->lj_comb+ash*2); + } + else if (nbat->XFormat == nbatX8) + { + copy_lj_to_nbat_lj_comb_x8(nbat->nbfp_comb, + nbat->type+ash,ncz*grid->na_sc, + nbat->lj_comb+ash*2); + } + } + } + } +} + +/* Sets the charges in nbnxn_atomdata_t *nbat */ +static void nbnxn_atomdata_set_charges(nbnxn_atomdata_t *nbat, + int ngrid, + const nbnxn_search_t nbs, + const real *charge) +{ + int g,cxy,ncz,ash,na,na_round,i,j; + real *q; + const nbnxn_grid_t *grid; + + for(g=0; ggrid[g]; + + /* Loop over all columns and copy and fill */ + for(cxy=0; cxyncx*grid->ncy; cxy++) + { + ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc; + na = grid->cxy_na[cxy]; + na_round = (grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy])*grid->na_sc; + + if (nbat->XFormat == nbatXYZQ) + { + q = nbat->x + ash*STRIDE_XYZQ + ZZ + 1; + for(i=0; ia[ash+i]]; + q += STRIDE_XYZQ; + } + /* Complete the partially filled last cell with zeros */ + for(; iq + ash; + for(i=0; ia[ash+i]]; + q++; + } + /* Complete the partially filled last cell with zeros */ + for(; i= 0) + { + comb |= (GET_CGINFO_GID(in[at]) << (sa*bit_shift)); + } + } + innb[j++] = comb; + } + /* Complete the partially filled last cell with fill */ + for(; igrid[g]; + + /* Loop over all columns and copy and fill */ + for(i=0; incx*grid->ncy; i++) + { + ncz = grid->cxy_ind[i+1] - grid->cxy_ind[i]; + ash = (grid->cell0 + grid->cxy_ind[i])*grid->na_sc; + + copy_egp_to_nbat_egps(nbs->a+ash,grid->cxy_na[i],ncz*grid->na_sc, + nbat->na_c,nbat->neg_2log, + atinfo,nbat->energrp+(ash>>grid->na_c_2log)); + } + } +} + +/* Sets all required atom parameter data in nbnxn_atomdata_t */ +void nbnxn_atomdata_set(nbnxn_atomdata_t *nbat, + int locality, + const nbnxn_search_t nbs, + const t_mdatoms *mdatoms, + const int *atinfo) +{ + int ngrid; + + if (locality == eatLocal) + { + ngrid = 1; + } + else + { + ngrid = nbs->ngrid; + } + + nbnxn_atomdata_set_atomtypes(nbat,ngrid,nbs,mdatoms->typeA); + + nbnxn_atomdata_set_charges(nbat,ngrid,nbs,mdatoms->chargeA); + + if (nbat->nenergrp > 1) + { + nbnxn_atomdata_set_energygroups(nbat,ngrid,nbs,atinfo); + } +} + +/* Copies the shift vector array to nbnxn_atomdata_t */ +void nbnxn_atomdata_copy_shiftvec(gmx_bool bDynamicBox, + rvec *shift_vec, + nbnxn_atomdata_t *nbat) +{ + int i; + + nbat->bDynamicBox = bDynamicBox; + for(i=0; ishift_vec[i]); + } +} + +/* Copies (and reorders) the coordinates to nbnxn_atomdata_t */ +void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search_t nbs, + int locality, + gmx_bool FillLocal, + rvec *x, + nbnxn_atomdata_t *nbat) +{ + int g0=0,g1=0; + int nth,th; + + switch (locality) + { + case eatAll: + g0 = 0; + g1 = nbs->ngrid; + break; + case eatLocal: + g0 = 0; + g1 = 1; + break; + case eatNonlocal: + g0 = 1; + g1 = nbs->ngrid; + break; + } + + if (FillLocal) + { + nbat->natoms_local = nbs->grid[0].nc*nbs->grid[0].na_sc; + } + + nth = gmx_omp_nthreads_get(emntPairsearch); + +#pragma omp parallel for num_threads(nth) schedule(static) + for(th=0; thgrid[g]; + + cxy0 = (grid->ncx*grid->ncy* th +nth-1)/nth; + cxy1 = (grid->ncx*grid->ncy*(th+1)+nth-1)/nth; + + 
for(cxy=cxy0; cxycxy_na[cxy]; + ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc; + + if (g == 0 && FillLocal) + { + na_fill = + (grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy])*grid->na_sc; + } + else + { + /* We fill only the real particle locations. + * We assume the filling entries at the end have been + * properly set before during ns. + */ + na_fill = na; + } + copy_rvec_to_nbat_real(nbs->a+ash,na,na_fill,x, + nbat->XFormat,nbat->x,ash, + 0,0,0); + } + } + } +} + +/* Add part of the force array(s) from nbnxn_atomdata_t to f */ +static void +nbnxn_atomdata_add_nbat_f_to_f_part(const nbnxn_search_t nbs, + const nbnxn_atomdata_t *nbat, + nbnxn_atomdata_output_t *out, + int nfa, + int a0,int a1, + rvec *f) +{ + int a,i,fa; + const int *cell; + const real *fnb; + + cell = nbs->cell; + + /* Loop over all columns and copy and fill */ + switch (nbat->FFormat) + { + case nbatXYZ: + case nbatXYZQ: + if (nfa == 1) + { + fnb = out[0].f; + + for(a=a0; afstride; + + f[a][XX] += fnb[i]; + f[a][YY] += fnb[i+1]; + f[a][ZZ] += fnb[i+2]; + } + } + else + { + for(a=a0; afstride; + + for(fa=0; facc[enbsCCreducef]); + + switch (locality) + { + case eatAll: + a0 = 0; + na = nbs->natoms_nonlocal; + break; + case eatLocal: + a0 = 0; + na = nbs->natoms_local; + break; + case eatNonlocal: + a0 = nbs->natoms_local; + na = nbs->natoms_nonlocal - nbs->natoms_local; + break; + } + + nth = gmx_omp_nthreads_get(emntNonbonded); +#pragma omp parallel for num_threads(nth) schedule(static) + for(th=0; thout, + nbat->nout, + a0+((th+0)*na)/nth, + a0+((th+1)*na)/nth, + f); + } + + nbs_cycle_stop(&nbs->cc[enbsCCreducef]); +} + +/* Adds the shift forces from nbnxn_atomdata_t to fshift */ +void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat, + rvec *fshift) +{ + const nbnxn_atomdata_output_t *out; + int th; + int s; + rvec sum; + + out = nbat->out; + + for(s=0; snout; th++) + { + sum[XX] += out[th].fshift[s*DIM+XX]; + sum[YY] += out[th].fshift[s*DIM+YY]; + sum[ZZ] += out[th].fshift[s*DIM+ZZ]; + } + rvec_inc(fshift[s],sum); + } +} diff --git a/src/mdlib/nbnxn_search_x86_simd.h b/src/mdlib/nbnxn_search_x86_simd.h new file mode 100644 index 0000000000..eb962590e1 --- /dev/null +++ b/src/mdlib/nbnxn_search_x86_simd.h @@ -0,0 +1,307 @@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS development team, + * check out http://www.gromacs.org for more information. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. 
+ * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon + */ + +/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file. + * gmx_sse_or_avx.h should be included before including this file. + */ + +/* Copies PBC shifted i-cell packed atom coordinates to working array */ +#ifdef GMX_MM128_HERE +static void icell_set_x_x86_simd128 +#else +#ifdef GMX_MM256_HERE +static void icell_set_x_x86_simd256 +#else +"error: GMX_MM128_HERE or GMX_MM256_HERE not defined" +#endif +#endif + (int ci, + real shx,real shy,real shz, + int na_c, + int stride,const real *x, + nbnxn_list_work_t *work) +{ + int ia; +#ifdef GMX_MM128_HERE + nbnxn_x_ci_x86_simd128_t *x_ci; + + x_ci = work->x_ci_x86_simd128; + + ia = X_IND_CI_S128(ci); +#else + nbnxn_x_ci_x86_simd256_t *x_ci; + + x_ci = work->x_ci_x86_simd256; + + ia = X_IND_CI_S256(ci); +#endif + + x_ci->ix_SSE0 = gmx_set1_pr(x[ia + 0*STRIDE_S ] + shx); + x_ci->iy_SSE0 = gmx_set1_pr(x[ia + 1*STRIDE_S ] + shy); + x_ci->iz_SSE0 = gmx_set1_pr(x[ia + 2*STRIDE_S ] + shz); + x_ci->ix_SSE1 = gmx_set1_pr(x[ia + 0*STRIDE_S + 1] + shx); + x_ci->iy_SSE1 = gmx_set1_pr(x[ia + 1*STRIDE_S + 1] + shy); + x_ci->iz_SSE1 = gmx_set1_pr(x[ia + 2*STRIDE_S + 1] + shz); + x_ci->ix_SSE2 = gmx_set1_pr(x[ia + 0*STRIDE_S + 2] + shx); + x_ci->iy_SSE2 = gmx_set1_pr(x[ia + 1*STRIDE_S + 2] + shy); + x_ci->iz_SSE2 = gmx_set1_pr(x[ia + 2*STRIDE_S + 2] + shz); + x_ci->ix_SSE3 = gmx_set1_pr(x[ia + 0*STRIDE_S + 3] + shx); + x_ci->iy_SSE3 = gmx_set1_pr(x[ia + 1*STRIDE_S + 3] + shy); + x_ci->iz_SSE3 = gmx_set1_pr(x[ia + 2*STRIDE_S + 3] + shz); +} + +/* SSE or AVX code for making a pair list of cell ci vs cell cjf-cjl + * for coordinates in packed format. + * Checks bounding box distances and possibly atom pair distances. + * This is an accelerated version of make_cluster_list_simple.
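The broadcast pattern above replicates each i-atom coordinate (plus its PBC shift) across a full SIMD register, so that one register pairs one i-atom against four j-atoms at once. The same idea with the raw SSE intrinsics spelled out, as a 128-bit single-precision sketch assuming the packed x/y/z stride of 4 that STRIDE_S suggests:

#include <xmmintrin.h>

typedef struct {
    __m128 ix[4],iy[4],iz[4];
} i_cluster_regs_t;

/* Broadcast the 4 i-atoms of one cluster, with coordinates packed
 * as 4 x values, then 4 y values, then 4 z values.
 */
static void broadcast_i_cluster(const float *x,
                                float shx,float shy,float shz,
                                i_cluster_regs_t *r)
{
    int i;

    for(i=0; i<4; i++)
    {
        r->ix[i] = _mm_set1_ps(x[0*4 + i] + shx);
        r->iy[i] = _mm_set1_ps(x[1*4 + i] + shy);
        r->iz[i] = _mm_set1_ps(x[2*4 + i] + shz);
    }
}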
+ */ +#ifdef GMX_MM128_HERE +static void make_cluster_list_x86_simd128 +#else +#ifdef GMX_MM256_HERE +static void make_cluster_list_x86_simd256 +#else +"error: GMX_MM128_HERE or GMX_MM256_HERE not defined" +#endif +#endif + (const nbnxn_grid_t *gridj, + nbnxn_pairlist_t *nbl, + int ci,int cjf,int cjl, + gmx_bool remove_sub_diag, + const real *x_j, + real rl2,float rbb2, + int *ndistc) +{ +#ifdef GMX_MM128_HERE + const nbnxn_x_ci_x86_simd128_t *work; +#else + const nbnxn_x_ci_x86_simd256_t *work; +#endif + + const float *bb_ci; + + gmx_mm_pr jx_SSE,jy_SSE,jz_SSE; + + gmx_mm_pr dx_SSE0,dy_SSE0,dz_SSE0; + gmx_mm_pr dx_SSE1,dy_SSE1,dz_SSE1; + gmx_mm_pr dx_SSE2,dy_SSE2,dz_SSE2; + gmx_mm_pr dx_SSE3,dy_SSE3,dz_SSE3; + + gmx_mm_pr rsq_SSE0; + gmx_mm_pr rsq_SSE1; + gmx_mm_pr rsq_SSE2; + gmx_mm_pr rsq_SSE3; + + gmx_mm_pr wco_SSE0; + gmx_mm_pr wco_SSE1; + gmx_mm_pr wco_SSE2; + gmx_mm_pr wco_SSE3; + gmx_mm_pr wco_any_SSE01,wco_any_SSE23,wco_any_SSE; + + gmx_mm_pr rc2_SSE; + + gmx_bool InRange; + float d2; + int xind_f,xind_l,cj; + +#ifdef GMX_MM128_HERE + cjf = CI_TO_CJ_S128(cjf); + cjl = CI_TO_CJ_S128(cjl+1) - 1; + + work = nbl->work->x_ci_x86_simd128; +#else + cjf = CI_TO_CJ_S256(cjf); + cjl = CI_TO_CJ_S256(cjl+1) - 1; + + work = nbl->work->x_ci_x86_simd256; +#endif + + bb_ci = nbl->work->bb_ci; + + rc2_SSE = gmx_set1_pr(rl2); + + InRange = FALSE; + while (!InRange && cjf <= cjl) + { + d2 = subc_bb_dist2_sse(4,0,bb_ci,cjf,gridj->bbj); + *ndistc += 2; + + /* Check if the distance is within the distance where + * we use only the bounding box distance rbb, + * or within the cut-off and there is at least one atom pair + * within the cut-off. + */ + if (d2 < rbb2) + { + InRange = TRUE; + } + else if (d2 < rl2) + { +#ifdef GMX_MM128_HERE + xind_f = X_IND_CJ_S128(CI_TO_CJ_S128(gridj->cell0) + cjf); +#else + xind_f = X_IND_CJ_S256(CI_TO_CJ_S256(gridj->cell0) + cjf); +#endif + jx_SSE = gmx_load_pr(x_j+xind_f+0*STRIDE_S); + jy_SSE = gmx_load_pr(x_j+xind_f+1*STRIDE_S); + jz_SSE = gmx_load_pr(x_j+xind_f+2*STRIDE_S); + + + /* Calculate distance */ + dx_SSE0 = gmx_sub_pr(work->ix_SSE0,jx_SSE); + dy_SSE0 = gmx_sub_pr(work->iy_SSE0,jy_SSE); + dz_SSE0 = gmx_sub_pr(work->iz_SSE0,jz_SSE); + dx_SSE1 = gmx_sub_pr(work->ix_SSE1,jx_SSE); + dy_SSE1 = gmx_sub_pr(work->iy_SSE1,jy_SSE); + dz_SSE1 = gmx_sub_pr(work->iz_SSE1,jz_SSE); + dx_SSE2 = gmx_sub_pr(work->ix_SSE2,jx_SSE); + dy_SSE2 = gmx_sub_pr(work->iy_SSE2,jy_SSE); + dz_SSE2 = gmx_sub_pr(work->iz_SSE2,jz_SSE); + dx_SSE3 = gmx_sub_pr(work->ix_SSE3,jx_SSE); + dy_SSE3 = gmx_sub_pr(work->iy_SSE3,jy_SSE); + dz_SSE3 = gmx_sub_pr(work->iz_SSE3,jz_SSE); + + /* rsq = dx*dx+dy*dy+dz*dz */ + rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0); + rsq_SSE1 = gmx_calc_rsq_pr(dx_SSE1,dy_SSE1,dz_SSE1); + rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2,dy_SSE2,dz_SSE2); + rsq_SSE3 = gmx_calc_rsq_pr(dx_SSE3,dy_SSE3,dz_SSE3); + + wco_SSE0 = gmx_cmplt_pr(rsq_SSE0,rc2_SSE); + wco_SSE1 = gmx_cmplt_pr(rsq_SSE1,rc2_SSE); + wco_SSE2 = gmx_cmplt_pr(rsq_SSE2,rc2_SSE); + wco_SSE3 = gmx_cmplt_pr(rsq_SSE3,rc2_SSE); + + wco_any_SSE01 = gmx_or_pr(wco_SSE0,wco_SSE1); + wco_any_SSE23 = gmx_or_pr(wco_SSE2,wco_SSE3); + wco_any_SSE = gmx_or_pr(wco_any_SSE01,wco_any_SSE23); + + InRange = gmx_movemask_pr(wco_any_SSE); + + *ndistc += 4*GMX_X86_SIMD_WIDTH_HERE; + } + if (!InRange) + { + cjf++; + } + } + if (!InRange) + { + return; + } + + InRange = FALSE; + while (!InRange && cjl > cjf) + { + d2 = subc_bb_dist2_sse(4,0,bb_ci,cjl,gridj->bbj); + *ndistc += 2; + + /* Check if the distance is within the distance where + * we 
use only the bounding box distance rbb, + * or within the cut-off and there is at least one atom pair + * within the cut-off. + */ + if (d2 < rbb2) + { + InRange = TRUE; + } + else if (d2 < rl2) + { +#ifdef GMX_MM128_HERE + xind_l = X_IND_CJ_S128(CI_TO_CJ_S128(gridj->cell0) + cjl); +#else + xind_l = X_IND_CJ_S256(CI_TO_CJ_S256(gridj->cell0) + cjl); +#endif + jx_SSE = gmx_load_pr(x_j+xind_l+0*STRIDE_S); + jy_SSE = gmx_load_pr(x_j+xind_l+1*STRIDE_S); + jz_SSE = gmx_load_pr(x_j+xind_l+2*STRIDE_S); + + /* Calculate distance */ + dx_SSE0 = gmx_sub_pr(work->ix_SSE0,jx_SSE); + dy_SSE0 = gmx_sub_pr(work->iy_SSE0,jy_SSE); + dz_SSE0 = gmx_sub_pr(work->iz_SSE0,jz_SSE); + dx_SSE1 = gmx_sub_pr(work->ix_SSE1,jx_SSE); + dy_SSE1 = gmx_sub_pr(work->iy_SSE1,jy_SSE); + dz_SSE1 = gmx_sub_pr(work->iz_SSE1,jz_SSE); + dx_SSE2 = gmx_sub_pr(work->ix_SSE2,jx_SSE); + dy_SSE2 = gmx_sub_pr(work->iy_SSE2,jy_SSE); + dz_SSE2 = gmx_sub_pr(work->iz_SSE2,jz_SSE); + dx_SSE3 = gmx_sub_pr(work->ix_SSE3,jx_SSE); + dy_SSE3 = gmx_sub_pr(work->iy_SSE3,jy_SSE); + dz_SSE3 = gmx_sub_pr(work->iz_SSE3,jz_SSE); + + /* rsq = dx*dx+dy*dy+dz*dz */ + rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0); + rsq_SSE1 = gmx_calc_rsq_pr(dx_SSE1,dy_SSE1,dz_SSE1); + rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2,dy_SSE2,dz_SSE2); + rsq_SSE3 = gmx_calc_rsq_pr(dx_SSE3,dy_SSE3,dz_SSE3); + + wco_SSE0 = gmx_cmplt_pr(rsq_SSE0,rc2_SSE); + wco_SSE1 = gmx_cmplt_pr(rsq_SSE1,rc2_SSE); + wco_SSE2 = gmx_cmplt_pr(rsq_SSE2,rc2_SSE); + wco_SSE3 = gmx_cmplt_pr(rsq_SSE3,rc2_SSE); + + wco_any_SSE01 = gmx_or_pr(wco_SSE0,wco_SSE1); + wco_any_SSE23 = gmx_or_pr(wco_SSE2,wco_SSE3); + wco_any_SSE = gmx_or_pr(wco_any_SSE01,wco_any_SSE23); + + InRange = gmx_movemask_pr(wco_any_SSE); + + *ndistc += 4*GMX_X86_SIMD_WIDTH_HERE; + } + if (!InRange) + { + cjl--; + } + } + + if (cjf <= cjl) + { + for(cj=cjf; cj<=cjl; cj++) + { + /* Store cj and the interaction mask */ +#ifdef GMX_MM128_HERE + nbl->cj[nbl->ncj].cj = CI_TO_CJ_S128(gridj->cell0) + cj; + nbl->cj[nbl->ncj].excl = get_imask_x86_simd128(remove_sub_diag,ci,cj); +#else + nbl->cj[nbl->ncj].cj = CI_TO_CJ_S256(gridj->cell0) + cj; + nbl->cj[nbl->ncj].excl = get_imask_x86_simd256(remove_sub_diag,ci,cj); +#endif + nbl->ncj++; + } + /* Increase the closing index in i super-cell list */ + nbl->ci[nbl->nci].cj_ind_end = nbl->ncj; + } +} diff --git a/src/mdlib/nlistheuristics.c b/src/mdlib/nlistheuristics.c index cd12600f9a..cddfca5acf 100644 --- a/src/mdlib/nlistheuristics.c +++ b/src/mdlib/nlistheuristics.c @@ -38,6 +38,7 @@ #endif #include "typedefs.h" +#include "types/nlistheuristics.h" #include "gmx_fatal.h" #include "vec.h" diff --git a/src/mdlib/ns.c b/src/mdlib/ns.c index 2757cbd2dd..571e7047c7 100644 --- a/src/mdlib/ns.c +++ b/src/mdlib/ns.c @@ -2585,7 +2585,7 @@ int search_neighbours(FILE *log,t_forcerec *fr, { dd_zones = NULL; - get_nsgrid_boundaries(grid,NULL,box,NULL,NULL,NULL, + get_nsgrid_boundaries(grid->nboundeddim,box,NULL,NULL,NULL,NULL, cgs->nr,fr->cg_cm,grid_x0,grid_x1,&grid_dens); grid_first(log,grid,NULL,NULL,fr->ePBC,box,grid_x0,grid_x1, diff --git a/src/mdlib/nsgrid.c b/src/mdlib/nsgrid.c index ee9a6dca82..b8b6261d98 100644 --- a/src/mdlib/nsgrid.c +++ b/src/mdlib/nsgrid.c @@ -42,6 +42,7 @@ #include "sysstuff.h" #include "typedefs.h" +#include "types/commrec.h" #include "macros.h" #include "smalloc.h" #include "nsgrid.h" @@ -126,9 +127,9 @@ static void dd_box_bounds_to_ns_bounds(real box0,real box_size, *gr1 = av + NSGRID_STDDEV_FAC*stddev; } -void get_nsgrid_boundaries(t_grid *grid, +void get_nsgrid_boundaries(int 
nboundeddim,matrix box, gmx_domdec_t *dd, - matrix box,gmx_ddbox_t *ddbox,rvec *gr0,rvec *gr1, + gmx_ddbox_t *ddbox,rvec *gr0,rvec *gr1, int ncg,rvec *cgcm, rvec grid_x0,rvec grid_x1, real *grid_density) @@ -137,7 +138,7 @@ void get_nsgrid_boundaries(t_grid *grid, real vol,bdens0,bdens1; int d; - if (grid->nboundeddim < DIM) + if (nboundeddim < DIM) { calc_x_av_stddev(ncg,cgcm,av,stddev); } @@ -145,7 +146,7 @@ void get_nsgrid_boundaries(t_grid *grid, vol = 1; for(d=0; dnboundeddim) + if (d < nboundeddim) { grid_x0[d] = (gr0 != NULL ? (*gr0)[d] : 0); grid_x1[d] = (gr1 != NULL ? (*gr1)[d] : box[d][d]); @@ -725,7 +726,7 @@ void fill_grid(FILE *log, if (cell_index[cg] == -1) { /* This cg has moved to another node */ - cell_index[cg] = 4*grid->ncells; + cell_index[cg] = NSGRID_SIGNAL_MOVED_FAC*grid->ncells; continue; } diff --git a/src/mdlib/partdec.c b/src/mdlib/partdec.c index e5b8ab249e..1837d21f07 100644 --- a/src/mdlib/partdec.c +++ b/src/mdlib/partdec.c @@ -336,6 +336,11 @@ pd_move_x_constraints(t_commrec * cr, pd = cr->pd; pdc = pd->constraints; + if (pdc == NULL) + { + return; + } + thisnode = cr->nodeid; /* First pulse to right */ diff --git a/src/mdlib/perf_est.c b/src/mdlib/perf_est.c index 868ff84ee6..3f30f596fe 100644 --- a/src/mdlib/perf_est.c +++ b/src/mdlib/perf_est.c @@ -1,4 +1,5 @@ -/* +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * * * This source code is part of * @@ -41,6 +42,49 @@ #include "physics.h" #include "vec.h" #include "mtop_util.h" +#include "types/commrec.h" +#include "nbnxn_search.h" +#include "nbnxn_consts.h" + + +/* Computational cost of bonded, non-bonded and PME calculations. + * This will be machine dependent. + * The numbers here are accurate for Intel Core2 and AMD Athlon 64 + * in single precision. In double precision PME mesh is slightly cheaper, + * although not so much that the numbers need to be adjusted. 
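The constants defined just below feed a homogeneous-density pair-count model: each of n_i atoms sees on average n_j*(4/3)*pi*r^3/V partners within the interaction radius r, halved to avoid double counting. As a helper-style sketch (hypothetical function name):

#include <math.h>

/* Expected number of pair interactions between n_i and n_j atoms
 * spread uniformly over volume vol, with interaction radius r.
 */
static double npairs_estimate(double n_i,double n_j,double r,double vol)
{
    return 0.5*n_i*n_j*(4.0/3.0)*M_PI*r*r*r/vol;
}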
+ */ + +/* Cost of a pair interaction in the "group" cut-off scheme" */ +#define C_GR_FQ 1.5 +#define C_GR_QLJ_CUT 1.5 +#define C_GR_QLJ_TAB 2.0 +#define C_GR_LJ_CUT 1.0 +#define C_GR_LJ_TAB 1.75 +/* Cost of 1 water with one Q/LJ atom */ +#define C_GR_QLJW_CUT 2.0 +#define C_GR_QLJW_TAB 2.25 +/* Cost of 1 water with one Q atom or with 1/3 water (LJ negligible) */ +#define C_GR_QW 1.75 + +/* Cost of a pair interaction in the "Verlet" cut-off scheme" */ +#define C_VT_LJ 0.30 +#define C_VT_QLJ_RF 0.40 +#define C_VT_Q_RF 0.30 +#define C_VT_QLJ_TAB 0.55 +#define C_VT_Q_TAB 0.50 + +/* Cost of PME, with all components running with SSE instructions */ +/* Cost of particle reordering and redistribution */ +#define C_PME_REDIST 12.0 +/* Cost of q spreading and force interpolation per charge (mainly memory) */ +#define C_PME_SPREAD 0.30 +/* Cost of fft's, will be multiplied with N log(N) */ +#define C_PME_FFT 0.20 +/* Cost of pme_solve, will be multiplied with N */ +#define C_PME_SOLVE 0.50 + +/* Cost of a bonded interaction divided by the number of (pbc_)dx nrequired */ +#define C_BOND 5.0 int n_bonded_dx(gmx_mtop_t *mtop,gmx_bool bExcl) { @@ -81,132 +125,249 @@ int n_bonded_dx(gmx_mtop_t *mtop,gmx_bool bExcl) return ndx; } +static void pp_group_load(gmx_mtop_t *mtop,t_inputrec *ir,matrix box, + int *nq_tot, + double *cost_pp, + gmx_bool *bChargePerturbed) +{ + t_atom *atom; + int mb,nmol,atnr,cg,a,a0,ncqlj,ncq,nclj; + gmx_bool bBHAM,bLJcut,bWater,bQ,bLJ; + int nw,nqlj,nq,nlj; + float fq,fqlj,flj,fljtab,fqljw,fqw; + t_iparams *iparams; + gmx_moltype_t *molt; + + bBHAM = (mtop->ffparams.functype[0] == F_BHAM); + + bLJcut = ((ir->vdwtype == evdwCUT) && !bBHAM); + + /* Computational cost of bonded, non-bonded and PME calculations. + * This will be machine dependent. + * The numbers here are accurate for Intel Core2 and AMD Athlon 64 + * in single precision. In double precision PME mesh is slightly cheaper, + * although not so much that the numbers need to be adjusted. + */ + fq = C_GR_FQ; + fqlj = (bLJcut ? C_GR_QLJ_CUT : C_GR_QLJ_TAB); + flj = (bLJcut ? C_GR_LJ_CUT : C_GR_LJ_TAB); + /* Cost of 1 water with one Q/LJ atom */ + fqljw = (bLJcut ? 
C_GR_QLJW_CUT : C_GR_QLJW_TAB); + /* Cost of 1 water with one Q atom or with 1/3 water (LJ negligible) */ + fqw = C_GR_QW; + + iparams = mtop->ffparams.iparams; + atnr = mtop->ffparams.atnr; + nw = 0; + nqlj = 0; + nq = 0; + nlj = 0; + *bChargePerturbed = FALSE; + for(mb=0; mbnmolblock; mb++) + { + molt = &mtop->moltype[mtop->molblock[mb].type]; + atom = molt->atoms.atom; + nmol = mtop->molblock[mb].nmol; + a = 0; + for(cg=0; cgcgs.nr; cg++) + { + bWater = !bBHAM; + ncqlj = 0; + ncq = 0; + nclj = 0; + a0 = a; + while (a < molt->cgs.index[cg+1]) + { + bQ = (atom[a].q != 0 || atom[a].qB != 0); + bLJ = (iparams[(atnr+1)*atom[a].type].lj.c6 != 0 || + iparams[(atnr+1)*atom[a].type].lj.c12 != 0); + if (atom[a].q != atom[a].qB) + { + *bChargePerturbed = TRUE; + } + /* This if this atom fits into water optimization */ + if (!((a == a0 && bQ && bLJ) || + (a == a0+1 && bQ && !bLJ) || + (a == a0+2 && bQ && !bLJ && atom[a].q == atom[a-1].q) || + (a == a0+3 && !bQ && bLJ))) + bWater = FALSE; + if (bQ && bLJ) + { + ncqlj++; + } + else + { + if (bQ) + { + ncq++; + } + if (bLJ) + { + nclj++; + } + } + a++; + } + if (bWater) + { + nw += nmol; + } + else + { + nqlj += nmol*ncqlj; + nq += nmol*ncq; + nlj += nmol*nclj; + } + } + } + + *nq_tot = nq + nqlj + nw*3; + + if (debug) + { + fprintf(debug,"nw %d nqlj %d nq %d nlj %d\n",nw,nqlj,nq,nlj); + } + + /* For the PP non-bonded cost it is (unrealistically) assumed + * that all atoms are distributed homogeneously in space. + * Factor 3 is used because a water molecule has 3 atoms + * (and TIP4P effectively has 3 interactions with (water) atoms)). + */ + *cost_pp = 0.5*(fqljw*nw*nqlj + + fqw *nw*(3*nw + nq) + + fqlj *nqlj*nqlj + + fq *nq*(3*nw + nqlj + nq) + + flj *nlj*(nw + nqlj + nlj)) + *4/3*M_PI*ir->rlist*ir->rlist*ir->rlist/det(box); +} + +static void pp_verlet_load(gmx_mtop_t *mtop,t_inputrec *ir,matrix box, + int *nq_tot, + double *cost_pp, + gmx_bool *bChargePerturbed) +{ + t_atom *atom; + int mb,nmol,atnr,cg,a,a0,nqlj,nq,nlj; + gmx_bool bQRF; + t_iparams *iparams; + gmx_moltype_t *molt; + float r_eff; + double nat; + + bQRF = (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT); + + iparams = mtop->ffparams.iparams; + atnr = mtop->ffparams.atnr; + nqlj = 0; + nq = 0; + *bChargePerturbed = FALSE; + for(mb=0; mbnmolblock; mb++) + { + molt = &mtop->moltype[mtop->molblock[mb].type]; + atom = molt->atoms.atom; + nmol = mtop->molblock[mb].nmol; + a = 0; + for(a=0; aatoms.nr; a++) + { + if (atom[a].q != 0 || atom[a].qB != 0) + { + if (iparams[(atnr+1)*atom[a].type].lj.c6 != 0 || + iparams[(atnr+1)*atom[a].type].lj.c12 != 0) + { + nqlj += nmol; + } + else + { + nq += nmol; + } + } + if (atom[a].q != atom[a].qB) + { + *bChargePerturbed = TRUE; + } + } + } + + nlj = mtop->natoms - nqlj - nq; + + *nq_tot = nqlj + nq; + + /* Effective cut-off for cluster pair list of 4x4 atoms */ + r_eff = ir->rlist + nbnxn_get_rlist_effective_inc(NBNXN_CPU_CLUSTER_I_SIZE,mtop->natoms/det(box)); + + if (debug) + { + fprintf(debug,"nqlj %d nq %d nlj %d rlist %.3f r_eff %.3f\n", + nqlj,nq,nlj,ir->rlist,r_eff); + } + + /* For the PP non-bonded cost it is (unrealistically) assumed + * that all atoms are distributed homogeneously in space. + */ + /* Convert mtop->natoms to double to avoid int overflow */ + nat = mtop->natoms; + *cost_pp = 0.5*(nqlj*nat*(bQRF ? C_VT_QLJ_RF : C_VT_QLJ_TAB) + + nq*nat*(bQRF ? 
C_VT_Q_RF : C_VT_Q_TAB) + + nlj*nat*C_VT_LJ) + *4/3*M_PI*r_eff*r_eff*r_eff/det(box); +} + float pme_load_estimate(gmx_mtop_t *mtop,t_inputrec *ir,matrix box) { t_atom *atom; - int mb,nmol,atnr,cg,a,a0,ncqlj,ncq,nclj; + int mb,nmol,atnr,cg,a,a0,nq_tot; gmx_bool bBHAM,bLJcut,bChargePerturbed,bWater,bQ,bLJ; - double nw,nqlj,nq,nlj; - double cost_bond,cost_pp,cost_spread,cost_fft,cost_solve,cost_pme; - float fq,fqlj,flj,fljtab,fqljw,fqw,fqspread,ffft,fsolve,fbond; + double cost_bond,cost_pp,cost_redist,cost_spread,cost_fft,cost_solve,cost_pme; float ratio; t_iparams *iparams; gmx_moltype_t *molt; - bBHAM = (mtop->ffparams.functype[0] == F_BHAM); - - bLJcut = ((ir->vdwtype == evdwCUT) && !bBHAM); - /* Computational cost of bonded, non-bonded and PME calculations. * This will be machine dependent. * The numbers here are accurate for Intel Core2 and AMD Athlon 64 * in single precision. In double precision PME mesh is slightly cheaper, * although not so much that the numbers need to be adjusted. */ - fq = 1.5; - fqlj = (bLJcut ? 1.5 : 2.0 ); - flj = (bLJcut ? 1.0 : 1.75); - /* Cost of 1 water with one Q/LJ atom */ - fqljw = (bLJcut ? 2.0 : 2.25); - /* Cost of 1 water with one Q atom or with 1/3 water (LJ negligible) */ - fqw = 1.75; - /* Cost of q spreading and force interpolation per charge (mainly memory) */ - fqspread = 0.55; - /* Cost of fft's, will be multiplied with N log(N) */ - ffft = 0.20; - /* Cost of pme_solve, will be multiplied with N */ - fsolve = 0.80; - /* Cost of a bonded interaction divided by the number of (pbc_)dx nrequired */ - fbond = 5.0; iparams = mtop->ffparams.iparams; atnr = mtop->ffparams.atnr; - nw = 0; - nqlj = 0; - nq = 0; - nlj = 0; - bChargePerturbed = FALSE; - for(mb=0; mbnmolblock; mb++) { - molt = &mtop->moltype[mtop->molblock[mb].type]; - atom = molt->atoms.atom; - nmol = mtop->molblock[mb].nmol; - a = 0; - for(cg=0; cgcgs.nr; cg++) { - bWater = !bBHAM; - ncqlj = 0; - ncq = 0; - nclj = 0; - a0 = a; - while (a < molt->cgs.index[cg+1]) { - bQ = (atom[a].q != 0 || atom[a].qB != 0); - bLJ = (iparams[(atnr+1)*atom[a].type].lj.c6 != 0 || - iparams[(atnr+1)*atom[a].type].lj.c12 != 0); - if (atom[a].q != atom[a].qB) { - bChargePerturbed = TRUE; - } - /* This if this atom fits into water optimization */ - if (!((a == a0 && bQ && bLJ) || - (a == a0+1 && bQ && !bLJ) || - (a == a0+2 && bQ && !bLJ && atom[a].q == atom[a-1].q) || - (a == a0+3 && !bQ && bLJ))) - bWater = FALSE; - if (bQ && bLJ) { - ncqlj++; - } else { - if (bQ) - ncq++; - if (bLJ) - nclj++; - } - a++; - } - if (bWater) { - nw += nmol; - } else { - nqlj += nmol*ncqlj; - nq += nmol*ncq; - nlj += nmol*nclj; - } - } - } - if (debug) - fprintf(debug,"nw %g nqlj %g nq %g nlj %g\n",nw,nqlj,nq,nlj); - cost_bond = fbond*n_bonded_dx(mtop,TRUE); + cost_bond = C_BOND*n_bonded_dx(mtop,TRUE); - /* For the PP non-bonded cost it is (unrealistically) assumed - * that all atoms are distributed homogeneously in space. 
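The r_eff used in pp_verlet_load() above accounts for the fact that a 4x4 cluster list pulls in atoms somewhat outside rlist: conceptually the cut-off grows by roughly the size of a cluster. An illustrative back-of-the-envelope version, which is not the actual nbnxn_get_rlist_effective_inc() formula:

#include <math.h>

/* Illustrative effective list radius: treat a cluster of
 * cluster_size atoms at the given atom density as a sphere and
 * extend the cut-off by its radius. The real increment formula
 * in the nbnxn code differs in detail.
 */
static double r_list_effective(double rlist,double atom_density,
                               int cluster_size)
{
    double vol_cluster = cluster_size/atom_density;
    double r_cluster   = pow(3.0*vol_cluster/(4.0*M_PI),1.0/3.0);

    return rlist + r_cluster;
}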
- */ - cost_pp = 0.5*(fqljw*nw*nqlj + - fqw *nw*(3*nw + nq) + - fqlj *nqlj*nqlj + - fq *nq*(3*nw + nqlj + nq) + - flj *nlj*(nw + nqlj + nlj)) - *4/3*M_PI*ir->rlist*ir->rlist*ir->rlist/det(box); + if (ir->cutoff_scheme == ecutsGROUP) + { + pp_group_load(mtop,ir,box,&nq_tot,&cost_pp,&bChargePerturbed); + } + else + { + pp_verlet_load(mtop,ir,box,&nq_tot,&cost_pp,&bChargePerturbed); + } - cost_spread = fqspread*(3*nw + nqlj + nq)*pow(ir->pme_order,3); - cost_fft = ffft*ir->nkx*ir->nky*ir->nkz*log(ir->nkx*ir->nky*ir->nkz); - cost_solve = fsolve*ir->nkx*ir->nky*ir->nkz; + cost_redist = C_PME_REDIST*nq_tot; + cost_spread = C_PME_SPREAD*nq_tot*pow(ir->pme_order,3); + cost_fft = C_PME_FFT*ir->nkx*ir->nky*ir->nkz*log(ir->nkx*ir->nky*ir->nkz); + cost_solve = C_PME_SOLVE*ir->nkx*ir->nky*ir->nkz; if (ir->efep != efepNO && bChargePerturbed) { - /* All PME work, except the spline coefficient calculation, doubles */ + /* All PME work, except redist & spline coefficient calculation, doubles */ cost_spread *= 2; cost_fft *= 2; cost_solve *= 2; } - cost_pme = cost_spread + cost_fft + cost_solve; + cost_pme = cost_redist + cost_spread + cost_fft + cost_solve; ratio = cost_pme/(cost_bond + cost_pp + cost_pme); if (debug) { fprintf(debug, - "cost_bond %f\n" - "cost_pp %f\n" - "cost_spread %f\n" - "cost_fft %f\n" - "cost_solve %f\n", - cost_bond,cost_pp,cost_spread,cost_fft,cost_solve); + "cost_bond %f\n" + "cost_pp %f\n" + "cost_redist %f\n" + "cost_spread %f\n" + "cost_fft %f\n" + "cost_solve %f\n", + cost_bond,cost_pp,cost_redist,cost_spread,cost_fft,cost_solve); fprintf(debug,"Estimate for relative PME load: %.3f\n",ratio); } diff --git a/src/mdlib/pme.c b/src/mdlib/pme.c index 8342621cad..a9fa5c1d07 100644 --- a/src/mdlib/pme.c +++ b/src/mdlib/pme.c @@ -92,8 +92,7 @@ /* Single precision, with SSE2 or higher available */ #if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE) -#include "gmx_x86_sse2.h" -#include "gmx_math_x86_sse2_single.h" +#include "gmx_x86_simd_single.h" #define PME_SSE /* Some old AMD processors could have problems with unaligned loads+stores */ @@ -133,6 +132,7 @@ typedef struct { int send_nindex; int recv_index0; int recv_nindex; + int recv_size; /* Receive buffer width, used with OpenMP */ } pme_grid_comm_t; typedef struct { @@ -144,6 +144,7 @@ typedef struct { int *s2g1; int noverlap_nodes; int *send_id,*recv_id; + int send_size; /* Send buffer width, used with OpenMP */ pme_grid_comm_t *comm_data; real *sendbuf; real *recvbuf; @@ -156,10 +157,13 @@ typedef struct { } thread_plist_t; typedef struct { + int *thread_one; int n; int *ind; splinevec theta; + real *ptr_theta_z; splinevec dtheta; + real *ptr_dtheta_z; } splinedata_t; typedef struct { @@ -204,11 +208,12 @@ typedef struct { #define FLBSZ 4 typedef struct { - ivec ci; /* The spatial location of this grid */ - ivec n; /* The size of *grid, including order-1 */ - ivec offset; /* The grid offset from the full node grid */ - int order; /* PME spreading order */ - real *grid; /* The grid local thread, size n */ + ivec ci; /* The spatial location of this grid */ + ivec n; /* The used size of *grid, including order-1 */ + ivec offset; /* The grid offset from the full node grid */ + int order; /* PME spreading order */ + ivec s; /* The allocated size of *grid, s >= n */ + real *grid; /* The grid local thread, size n */ } pmegrid_t; typedef struct { @@ -216,6 +221,7 @@ typedef struct { int nthread; /* The number of threads operating on this grid */ ivec nc; /* The local spatial decomposition over the threads */ pmegrid_t *grid_th; /* Array 
of grids for each thread */ + real *grid_all; /* Allocated array for the grids in *grid_th */ int **g2t; /* The grid to thread index */ ivec nthread_comm; /* The number of threads to communicate with */ } pmegrids_t; @@ -563,6 +569,24 @@ static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[], } } +static void realloc_splinevec(splinevec th,real **ptr_z,int nalloc) +{ + const int padding=4; + int i; + + srenew(th[XX],nalloc); + srenew(th[YY],nalloc); + /* In z we add padding, this is only required for the aligned SSE code */ + srenew(*ptr_z,nalloc+2*padding); + th[ZZ] = *ptr_z + padding; + + for(i=0; iind[i] = i; } - for(d=0;dtheta[d] ,atc->pme_order*atc->nalloc); - srenew(spline->dtheta[d],atc->pme_order*atc->nalloc); - } + realloc_splinevec(spline->theta,&spline->ptr_theta_z, + atc->pme_order*atc->nalloc); + realloc_splinevec(spline->dtheta,&spline->ptr_dtheta_z, + atc->pme_order*atc->nalloc); } static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc) @@ -1425,9 +1448,9 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid, int pnx,pny,pnz,ndatatot; int offx,offy,offz; - pnx = pmegrid->n[XX]; - pny = pmegrid->n[YY]; - pnz = pmegrid->n[ZZ]; + pnx = pmegrid->s[XX]; + pny = pmegrid->s[YY]; + pnz = pmegrid->s[ZZ]; offx = pmegrid->offset[XX]; offy = pmegrid->offset[YY]; @@ -1439,7 +1462,7 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid, { grid[i] = 0; } - + order = pmegrid->order; for(nn=0; nnn; nn++) @@ -1542,14 +1565,15 @@ static void pmegrid_init(pmegrid_t *grid, grid->n[XX] = x1 - x0 + pme_order - 1; grid->n[YY] = y1 - y0 + pme_order - 1; grid->n[ZZ] = z1 - z0 + pme_order - 1; + copy_ivec(grid->n,grid->s); - nz = grid->n[ZZ]; + nz = grid->s[ZZ]; set_grid_alignment(&nz,pme_order); if (set_alignment) { - grid->n[ZZ] = nz; + grid->s[ZZ] = nz; } - else if (nz != grid->n[ZZ]) + else if (nz != grid->s[ZZ]) { gmx_incons("pmegrid_init call with an unaligned z size"); } @@ -1557,7 +1581,7 @@ static void pmegrid_init(pmegrid_t *grid, grid->order = pme_order; if (ptr == NULL) { - gridsize = grid->n[XX]*grid->n[YY]*grid->n[ZZ]; + gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ]; set_gridsize_alignment(&gridsize,pme_order); snew_aligned(grid->grid,gridsize,16); } @@ -1635,7 +1659,7 @@ static void pmegrids_init(pmegrids_t *grids, { ivec n,n_base,g0,g1; int t,x,y,z,d,i,tfac; - int max_comm_lines; + int max_comm_lines=-1; n[XX] = nx - (pme_order - 1); n[YY] = ny - (pme_order - 1); @@ -1655,7 +1679,6 @@ static void pmegrids_init(pmegrids_t *grids, { ivec nst; int gridsize; - real *grid_all; for(d=0; dgrid_all, grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP, 16); @@ -1696,7 +1719,7 @@ static void pmegrids_init(pmegrids_t *grids, (n[ZZ]*(z+1))/grids->nc[ZZ], TRUE, pme_order, - grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP)); + grids->grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP)); t++; } } @@ -1730,7 +1753,8 @@ static void pmegrids_init(pmegrids_t *grids, case ZZ: max_comm_lines = pme_order - 1; break; } grids->nthread_comm[d] = 0; - while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines) + while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines && + grids->nthread_comm[d] < grids->nc[d]) { grids->nthread_comm[d]++; } @@ -2694,6 +2718,8 @@ static void init_atomcomm(gmx_pme_t pme,pme_atomcomm_t *atc, t_commrec *cr, snew(atc->thread_plist[thread].n,atc->nthread+2*GMX_CACHE_SEP); atc->thread_plist[thread].n += GMX_CACHE_SEP; } + snew(atc->spline[thread].thread_one,pme->nthread); + atc->spline[thread].thread_one[thread] = 1; } } @@ 
-2714,15 +2740,16 @@ init_overlap_comm(pme_overlap_t * ol, pme_grid_comm_t *pgc; gmx_bool bCont; int fft_start,fft_end,send_index1,recv_index1; - #ifdef GMX_MPI + MPI_Status stat; + ol->mpi_comm = comm; #endif ol->nnodes = nnodes; ol->nodeid = nodeid; - /* Linear translation of the PME grid wo'nt affect reciprocal space + /* Linear translation of the PME grid won't affect reciprocal space * calculations, so to optimize we only interpolate "upwards", * which also means we only have to consider overlap in one direction. * I.e., particles on this node might also be spread to grid indices @@ -2777,6 +2804,7 @@ init_overlap_comm(pme_overlap_t * ol, } snew(ol->comm_data, ol->noverlap_nodes); + ol->send_size = 0; for(b=0; bnoverlap_nodes; b++) { pgc = &ol->comm_data[b]; @@ -2792,6 +2820,7 @@ init_overlap_comm(pme_overlap_t * ol, send_index1 = min(send_index1,fft_end); pgc->send_index0 = fft_start; pgc->send_nindex = max(0,send_index1 - pgc->send_index0); + ol->send_size += pgc->send_nindex; /* We always start receiving to the first index of our slab */ fft_start = ol->s2g0[ol->nodeid]; @@ -2806,6 +2835,16 @@ init_overlap_comm(pme_overlap_t * ol, pgc->recv_nindex = max(0,recv_index1 - pgc->recv_index0); } +#ifdef GMX_MPI + /* Communicate the buffer sizes to receive */ + for(b=0; bnoverlap_nodes; b++) + { + MPI_Sendrecv(&ol->send_size ,1,MPI_INT,ol->send_id[b],b, + &ol->comm_data[b].recv_size,1,MPI_INT,ol->recv_id[b],b, + ol->mpi_comm,&stat); + } +#endif + /* For non-divisible grid we need pme_order iso pme_order-1 */ snew(ol->sendbuf,norder*commplainsize); snew(ol->recvbuf,norder*commplainsize); @@ -3075,7 +3114,7 @@ int gmx_pme_init(gmx_pme_t * pmedata, pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) || pme->nkz <= pme->pme_order) { - gmx_fatal(FARGS,"The pme grid dimensions need to be larger than pme_order (%d) and in parallel larger than 2*pme_ordern for x and/or y",pme->pme_order); + gmx_fatal(FARGS,"The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order",pme->pme_order); } if (pme->nnodes > 1) { @@ -3121,20 +3160,26 @@ int gmx_pme_init(gmx_pme_t * pmedata, pme->nkx, (div_round_up(pme->nky,pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1)); + /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd. + * We do this with an offset buffer of equal size, so we need to allocate + * extra for the offset. That's what the (+1)*pme->nkz is for. + */ init_overlap_comm(&pme->overlap[1],pme->pme_order, #ifdef GMX_MPI pme->mpi_comm_d[1], #endif pme->nnodes_minor,pme->nodeid_minor, pme->nky, - (div_round_up(pme->nkx,pme->nnodes_major)+pme->pme_order)*pme->nkz); + (div_round_up(pme->nkx,pme->nnodes_major)+pme->pme_order+1)*pme->nkz); - /* Check for a limitation of the (current) sum_fftgrid_dd code */ - if (pme->nthread > 1 && - (pme->overlap[0].noverlap_nodes > 1 || - pme->overlap[1].noverlap_nodes > 1)) + /* Check for a limitation of the (current) sum_fftgrid_dd code. + * We only allow multiple communication pulses in dim 1, not in dim 0. + */ + if (pme->nthread > 1 && (pme->overlap[0].noverlap_nodes > 1 || + pme->nkx < pme->nnodes_major*pme->pme_order)) { - gmx_fatal(FARGS,"With threads the number of grid lines per node along x and or y should be pme_order (%d) or more or exactly pme_order-1",pme->pme_order); + gmx_fatal(FARGS,"The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x and should be >= pme_order (%d). 
To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.", + pme->nkx/(double)pme->nnodes_major,pme->pme_order); } snew(pme->bsp_mod[XX],pme->nkx); @@ -3249,10 +3294,72 @@ int gmx_pme_init(gmx_pme_t * pmedata, } *pmedata = pme; - + return 0; } +static void reuse_pmegrids(const pmegrids_t *old,pmegrids_t *new) +{ + int d,t; + + for(d=0; dgrid.n[d] > old->grid.n[d]) + { + return; + } + } + + sfree_aligned(new->grid.grid); + new->grid.grid = old->grid.grid; + + if (new->nthread > 1 && new->nthread == old->nthread) + { + sfree_aligned(new->grid_all); + for(t=0; tnthread; t++) + { + new->grid_th[t].grid = old->grid_th[t].grid; + } + } +} + +int gmx_pme_reinit(gmx_pme_t * pmedata, + t_commrec * cr, + gmx_pme_t pme_src, + const t_inputrec * ir, + ivec grid_size) +{ + t_inputrec irc; + int homenr; + int ret; + + irc = *ir; + irc.nkx = grid_size[XX]; + irc.nky = grid_size[YY]; + irc.nkz = grid_size[ZZ]; + + if (pme_src->nnodes == 1) + { + homenr = pme_src->atc[0].n; + } + else + { + homenr = -1; + } + + ret = gmx_pme_init(pmedata,cr,pme_src->nnodes_major,pme_src->nnodes_minor, + &irc,homenr,pme_src->bFEP,FALSE,pme_src->nthread); + + if (ret == 0) + { + /* We can easily reuse the allocated pme grids in pme_src */ + reuse_pmegrids(&pme_src->pmegridA,&(*pmedata)->pmegridA); + /* We would like to reuse the fft grids, but that's harder */ + } + + return ret; +} + static void copy_local_grid(gmx_pme_t pme, pmegrids_t *pmegrids,int thread,real *fftgrid) @@ -3275,9 +3382,9 @@ static void copy_local_grid(gmx_pme_t pme, pmegrid = &pmegrids->grid_th[thread]; - nsx = pmegrid->n[XX]; - nsy = pmegrid->n[YY]; - nsz = pmegrid->n[ZZ]; + nsx = pmegrid->s[XX]; + nsy = pmegrid->s[YY]; + nsz = pmegrid->s[ZZ]; for(d=0; dpfft_setupA, - local_fft_ndata, - local_fft_offset, - local_fft_size); - /* Major dimension */ - overlap = &pme->overlap[0]; - - nind = overlap->comm_data[0].send_nindex; - - for(y=0; ygrid; - nsx = pmegrid_f->n[XX]; - nsy = pmegrid_f->n[YY]; - nsz = pmegrid_f->n[ZZ]; + nsx = pmegrid_f->s[XX]; + nsy = pmegrid_f->s[YY]; + nsz = pmegrid_f->s[ZZ]; #ifdef DEBUG_PME_REDUCE printf("n%d t%d add %d %2d %2d %2d %2d %2d %2d %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n", @@ -3575,11 +3647,12 @@ static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid) { ivec local_fft_ndata,local_fft_offset,local_fft_size; pme_overlap_t *overlap; - int send_nindex; - int recv_index0,recv_nindex; + int send_index0,send_nindex; + int recv_nindex; #ifdef GMX_MPI MPI_Status stat; #endif + int send_size_y,recv_size_y; int ipulse,send_id,recv_id,datasize,gridsize,size_yx; real *sendptr,*recvptr; int x,y,z,indg,indb; @@ -3596,9 +3669,6 @@ static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid) local_fft_offset, local_fft_size); - /* Currently supports only a single communication pulse */ - -/* for(ipulse=0;ipulsenoverlap_nodes;ipulse++) */ if (pme->nnodes_minor > 1) { /* Major dimension */ @@ -3612,66 +3682,70 @@ static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid) { size_yx = 0; } - datasize = (local_fft_ndata[XX]+size_yx)*local_fft_ndata[ZZ]; + datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ]; - ipulse = 0; + send_size_y = overlap->send_size; - send_id = overlap->send_id[ipulse]; - recv_id = overlap->recv_id[ipulse]; - send_nindex = overlap->comm_data[ipulse].send_nindex; - /* recv_index0 = overlap->comm_data[ipulse].recv_index0; */ - recv_index0 = 0; - recv_nindex = overlap->comm_data[ipulse].recv_nindex; - - sendptr = overlap->sendbuf; - recvptr = 
overlap->recvbuf; + for(ipulse=0;ipulsenoverlap_nodes;ipulse++) + { + send_id = overlap->send_id[ipulse]; + recv_id = overlap->recv_id[ipulse]; + send_index0 = + overlap->comm_data[ipulse].send_index0 - + overlap->comm_data[0].send_index0; + send_nindex = overlap->comm_data[ipulse].send_nindex; + /* We don't use recv_index0, as we always receive starting at 0 */ + recv_nindex = overlap->comm_data[ipulse].recv_nindex; + recv_size_y = overlap->comm_data[ipulse].recv_size; - /* - printf("node %d comm %2d x %2d x %2d\n",pme->nodeid, - local_fft_ndata[XX]+size_yx,send_nindex,local_fft_ndata[ZZ]); - printf("node %d send %f, %f\n",pme->nodeid, - sendptr[0],sendptr[send_nindex*datasize-1]); - */ + sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ]; + recvptr = overlap->recvbuf; #ifdef GMX_MPI - MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL, - send_id,ipulse, - recvptr,recv_nindex*datasize,GMX_MPI_REAL, - recv_id,ipulse, - overlap->mpi_comm,&stat); + MPI_Sendrecv(sendptr,send_size_y*datasize,GMX_MPI_REAL, + send_id,ipulse, + recvptr,recv_size_y*datasize,GMX_MPI_REAL, + recv_id,ipulse, + overlap->mpi_comm,&stat); #endif - for(x=0; xnnodes_major > 1) - { - sendptr = pme->overlap[0].sendbuf; - for(x=0; xnnodes_major > 1) { - for(y=0; yoverlap[0].sendbuf; + for(x=0; xnoverlap_nodes;ipulse++) */ + /* We only support a single pulse here. + * This is not a severe limitation, as this code is only used + * with OpenMP and with OpenMP the (PME) domains can be larger. + */ if (pme->nnodes_major > 1) { /* Major dimension */ @@ -3685,8 +3759,7 @@ static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid) send_id = overlap->send_id[ipulse]; recv_id = overlap->recv_id[ipulse]; send_nindex = overlap->comm_data[ipulse].send_nindex; - /* recv_index0 = overlap->comm_data[ipulse].recv_index0; */ - recv_index0 = 0; + /* We don't use recv_index0, as we always receive starting at 0 */ recv_nindex = overlap->comm_data[ipulse].recv_nindex; sendptr = overlap->sendbuf; @@ -3830,9 +3903,6 @@ static void spread_on_grid(gmx_pme_t pme, fftgrid, pme->overlap[0].sendbuf, pme->overlap[1].sendbuf); -#ifdef PRINT_PME_SENDBUF - print_sendbuf(pme,pme->overlap[0].sendbuf); -#endif } #ifdef PME_TIME_THREADS c3 = omp_cyc_end(c3); @@ -3953,12 +4023,48 @@ static void reset_pmeonly_counters(t_commrec *cr,gmx_wallcycle_t wcycle, } +static void gmx_pmeonly_switch(int *npmedata, gmx_pme_t **pmedata, + ivec grid_size, + t_commrec *cr, t_inputrec *ir, + gmx_pme_t *pme_ret) +{ + int ind; + gmx_pme_t pme = NULL; + + ind = 0; + while (ind < *npmedata) + { + pme = (*pmedata)[ind]; + if (pme->nkx == grid_size[XX] && + pme->nky == grid_size[YY] && + pme->nkz == grid_size[ZZ]) + { + *pme_ret = pme; + + return; + } + + ind++; + } + + (*npmedata)++; + srenew(*pmedata,*npmedata); + + /* Generate a new PME data structure, copying part of the old pointers */ + gmx_pme_reinit(&((*pmedata)[ind]),cr,pme,ir,grid_size); + + *pme_ret = (*pmedata)[ind]; +} + + int gmx_pmeonly(gmx_pme_t pme, t_commrec *cr, t_nrnb *nrnb, gmx_wallcycle_t wcycle, real ewaldcoeff, gmx_bool bGatherOnly, t_inputrec *ir) { + int npmedata; + gmx_pme_t *pmedata; gmx_pme_pp_t pme_pp; int natoms; matrix box; @@ -3972,7 +4078,12 @@ int gmx_pmeonly(gmx_pme_t pme, int count; gmx_bool bEnerVir; gmx_large_int_t step,step_rel; + ivec grid_switch; + /* This data will only use with PME tuning, i.e. 
switching PME grids */ + npmedata = 1; + snew(pmedata,npmedata); + pmedata[0] = pme; pme_pp = gmx_pme_pp_init(cr); @@ -3981,15 +4092,28 @@ int gmx_pmeonly(gmx_pme_t pme, count = 0; do /****** this is a quasi-loop over time steps! */ { - /* Domain decomposition */ - natoms = gmx_pme_recv_q_x(pme_pp, - &chargeA,&chargeB,box,&x_pp,&f_pp, - &maxshift_x,&maxshift_y, - &pme->bFEP,&lambda, - &bEnerVir, - &step); - - if (natoms == -1) { + /* The reason for having a loop here is PME grid tuning/switching */ + do + { + /* Domain decomposition */ + natoms = gmx_pme_recv_q_x(pme_pp, + &chargeA,&chargeB,box,&x_pp,&f_pp, + &maxshift_x,&maxshift_y, + &pme->bFEP,&lambda, + &bEnerVir, + &step, + grid_switch,&ewaldcoeff); + + if (natoms == -2) + { + /* Switch the PME grid to grid_switch */ + gmx_pmeonly_switch(&npmedata,&pmedata,grid_switch,cr,ir,&pme); + } + } + while (natoms == -2); + + if (natoms == -1) + { /* We should stop: break out of the loop */ break; } @@ -4259,7 +4383,7 @@ int gmx_pme_do(gmx_pme_t pme, if (thread == 0) { wallcycle_stop(wcycle,ewcPME_FFT); - + where(); GMX_MPE_LOG(ev_gmxfft3d_finish); diff --git a/src/mdlib/pme_pp.c b/src/mdlib/pme_pp.c index 03ab7c0786..c08cdeebf3 100644 --- a/src/mdlib/pme_pp.c +++ b/src/mdlib/pme_pp.c @@ -1,4 +1,5 @@ -/* +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * * * This source code is part of * @@ -65,6 +66,7 @@ #define PP_PME_FEP (1<<3) #define PP_PME_ENER_VIR (1<<4) #define PP_PME_FINISH (1<<5) +#define PP_PME_SWITCH (1<<6) #define PME_PP_SIGSTOP (1<<0) #define PME_PP_SIGSTOPNSS (1<<1) @@ -90,13 +92,15 @@ typedef struct gmx_pme_pp { } t_gmx_pme_pp; typedef struct gmx_pme_comm_n_box { - int natoms; - matrix box; - int maxshift_x; - int maxshift_y; - real lambda; - int flags; - gmx_large_int_t step; + int natoms; + matrix box; + int maxshift_x; + int maxshift_y; + real lambda; + int flags; + gmx_large_int_t step; + ivec grid_size; /* For PME grid tuning */ + real ewaldcoeff; /* For PME grid tuning */ } gmx_pme_comm_n_box_t; typedef struct { @@ -256,7 +260,7 @@ void gmx_pme_send_x(t_commrec *cr, matrix box, rvec *x, gmx_pme_send_q_x(cr,flags,NULL,NULL,box,x,lambda,0,0,step); } -void gmx_pme_finish(t_commrec *cr) +void gmx_pme_send_finish(t_commrec *cr) { int flags; @@ -265,13 +269,32 @@ void gmx_pme_finish(t_commrec *cr) gmx_pme_send_q_x(cr,flags,NULL,NULL,NULL,NULL,0,0,0,-1); } +void gmx_pme_send_switch(t_commrec *cr, ivec grid_size, real ewaldcoeff) +{ +#ifdef GMX_MPI + gmx_pme_comm_n_box_t cnb; + + if (cr->dd->pme_receive_vir_ener) + { + cnb.flags = PP_PME_SWITCH; + copy_ivec(grid_size,cnb.grid_size); + cnb.ewaldcoeff = ewaldcoeff; + + /* We send this, uncommon, message blocking to simplify the code */ + MPI_Send(&cnb,sizeof(cnb),MPI_BYTE, + cr->dd->pme_nodeid,0,cr->mpi_comm_mysim); + } +#endif +} + int gmx_pme_recv_q_x(struct gmx_pme_pp *pme_pp, real **chargeA, real **chargeB, matrix box, rvec **x,rvec **f, int *maxshift_x, int *maxshift_y, gmx_bool *bFreeEnergy,real *lambda, - gmx_bool *bEnerVir, - gmx_large_int_t *step) + gmx_bool *bEnerVir, + gmx_large_int_t *step, + ivec grid_size, real *ewaldcoeff) { gmx_pme_comm_n_box_t cnb; int nat=0,q,messages,sender; @@ -289,10 +312,22 @@ int gmx_pme_recv_q_x(struct gmx_pme_pp *pme_pp, pme_pp->mpi_comm_mysim,MPI_STATUS_IGNORE); if (debug) - fprintf(debug,"PME only node receiving:%s%s%s\n", + { + fprintf(debug,"PME only node receiving:%s%s%s%s\n", (cnb.flags & PP_PME_CHARGE) ? " charges" : "", - (cnb.flags & PP_PME_COORD ) ? 
" coordinates" : "", - (cnb.flags & PP_PME_FINISH) ? " finish" : ""); + (cnb.flags & PP_PME_COORD ) ? " coordinates" : "", + (cnb.flags & PP_PME_FINISH) ? " finish" : "", + (cnb.flags & PP_PME_SWITCH) ? " switch" : ""); + } + + if (cnb.flags & PP_PME_SWITCH) + { + /* Special case, receive the new parameters and return */ + copy_ivec(cnb.grid_size,grid_size); + *ewaldcoeff = cnb.ewaldcoeff; + + return -2; + } if (cnb.flags & PP_PME_CHARGE) { /* Receive the send counts from the other PP nodes */ diff --git a/src/mdlib/pme_sse_single.h b/src/mdlib/pme_sse_single.h index 1b0b61760b..7d1623528e 100644 --- a/src/mdlib/pme_sse_single.h +++ b/src/mdlib/pme_sse_single.h @@ -1,12 +1,12 @@ /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- * - * + * * This source code is part of - * + * * G R O M A C S - * + * * GROningen MAchine for Chemical Simulations - * + * * VERSION 4.5 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. * Copyright (c) 1991-2000, University of Groningen, The Netherlands. @@ -24,12 +24,12 @@ * inclusion in the official distribution, but derived work must not * be called official GROMACS. Details are found in the README & COPYING * files - if they are missing, get the official version at www.gromacs.org. - * + * * To help us fund GROMACS development, we humbly ask that you cite * the papers on the package - you can find them in the top README file. - * + * * For more info, check our website at http://www.gromacs.org - * + * * And Hey: * GROwing Monsters And Cloning Shrimps */ @@ -155,86 +155,86 @@ int offset; int index; __m128 ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3,ty_SSE4; - __m128 tz_SSE0; - __m128 tz_SSE1; - __m128 vx_SSE; - __m128 vx_tz_SSE0; - __m128 vx_tz_SSE1; - __m128 sum_SSE00,sum_SSE01,sum_SSE02,sum_SSE03,sum_SSE04; - __m128 sum_SSE10,sum_SSE11,sum_SSE12,sum_SSE13,sum_SSE14; - __m128 gri_SSE00,gri_SSE01,gri_SSE02,gri_SSE03,gri_SSE04; - __m128 gri_SSE10,gri_SSE11,gri_SSE12,gri_SSE13,gri_SSE14; - - offset = k0 & 3; - - ty_SSE0 = _mm_load1_ps(&thy[0]); - ty_SSE1 = _mm_load1_ps(&thy[1]); - ty_SSE2 = _mm_load1_ps(&thy[2]); - ty_SSE3 = _mm_load1_ps(&thy[3]); + __m128 tz_SSE0; + __m128 tz_SSE1; + __m128 vx_SSE; + __m128 vx_tz_SSE0; + __m128 vx_tz_SSE1; + __m128 sum_SSE00,sum_SSE01,sum_SSE02,sum_SSE03,sum_SSE04; + __m128 sum_SSE10,sum_SSE11,sum_SSE12,sum_SSE13,sum_SSE14; + __m128 gri_SSE00,gri_SSE01,gri_SSE02,gri_SSE03,gri_SSE04; + __m128 gri_SSE10,gri_SSE11,gri_SSE12,gri_SSE13,gri_SSE14; + + offset = k0 & 3; + + ty_SSE0 = _mm_load1_ps(&thy[0]); + ty_SSE1 = _mm_load1_ps(&thy[1]); + ty_SSE2 = _mm_load1_ps(&thy[2]); + ty_SSE3 = _mm_load1_ps(&thy[3]); +#if PME_ORDER == 5 + ty_SSE4 = _mm_load1_ps(&thy[4]); +#endif + + tz_SSE0 = _mm_loadu_ps(thz-offset); + tz_SSE1 = _mm_loadu_ps(thz-offset+4); + tz_SSE0 = _mm_and_ps(tz_SSE0,work->mask_SSE0[offset]); + tz_SSE1 = _mm_and_ps(tz_SSE1,work->mask_SSE1[offset]); + + for(ithx=0; (ithxmask_SSE0[offset]); - tz_SSE1 = _mm_and_ps(tz_SSE1,work->mask_SSE1[offset]); - - for(ithx=0; (ithxgroups; + alook = gmx_mtop_atomlookup_init(mtop); + nfrozen = 0; tmass = 0; wmass = 0; wwmass = 0; for(i=0; inat; i++) { ii = pg->ind[i]; - gmx_mtop_atomnr_to_atom(mtop,ii,&atom); + gmx_mtop_atomnr_to_atom(alook,ii,&atom); if (cr && PAR(cr) && !bDomDec && ii >= start && ii < end) pg->ind_loc[pg->nat_loc++] = ii; if (ir->opts.nFreeze) { @@ -1157,6 +1160,8 @@ static void init_pull_group_index(FILE *fplog,t_commrec *cr, wwmass += m*w*w; } + gmx_mtop_atomlookup_destroy(alook); + if (wmass == 0) { 
gmx_fatal(FARGS,"The total%s mass of pull group %d is zero", pg->weight ? " weighted" : "",g); diff --git a/src/mdlib/pull_rotation.c b/src/mdlib/pull_rotation.c index 5e269382a2..1ade3ac65a 100644 --- a/src/mdlib/pull_rotation.c +++ b/src/mdlib/pull_rotation.c @@ -3263,6 +3263,7 @@ static void init_rot_group(FILE *fplog,t_commrec *cr,int g,t_rotgrp *rotg, t_atom *atom; gmx_enfrotgrp_t erg; /* Pointer to enforced rotation group data */ int ref_firstindex, ref_lastindex; + gmx_mtop_atomlookup_t alook=NULL; real mass,totalmass; real start=0.0; @@ -3338,6 +3339,10 @@ static void init_rot_group(FILE *fplog,t_commrec *cr,int g,t_rotgrp *rotg, /* Copy the masses so that the center can be determined. For all types of * enforced rotation, we store the masses in the erg->mc array. */ + if (rotg->bMassW) + { + alook = gmx_mtop_atomlookup_init(mtop); + } snew(erg->mc, rotg->nat); if (bFlex) snew(erg->mc_sorted, rotg->nat); @@ -3348,7 +3353,7 @@ static void init_rot_group(FILE *fplog,t_commrec *cr,int g,t_rotgrp *rotg, { if (rotg->bMassW) { - gmx_mtop_atomnr_to_atom(mtop,rotg->ind[i],&atom); + gmx_mtop_atomnr_to_atom(alook,rotg->ind[i],&atom); mass=atom->m; } else @@ -3360,6 +3365,11 @@ static void init_rot_group(FILE *fplog,t_commrec *cr,int g,t_rotgrp *rotg, } erg->invmass = 1.0/totalmass; + if (rotg->bMassW) + { + gmx_mtop_atomlookup_destroy(alook); + } + /* Set xc_ref_center for any rotation potential */ if ((rotg->eType==erotgISO) || (rotg->eType==erotgPM) || (rotg->eType==erotgRM) || (rotg->eType==erotgRM2)) { diff --git a/src/mdlib/qmmm.c b/src/mdlib/qmmm.c index 154f8999ac..0a0c9030f2 100644 --- a/src/mdlib/qmmm.c +++ b/src/mdlib/qmmm.c @@ -312,6 +312,7 @@ static void init_QMrec(int grpnr, t_QMrec *qm,int nr, int *atomarray, /* fills the t_QMrec struct of QM group grpnr */ int i; + gmx_mtop_atomlookup_t alook; t_atom *atom; @@ -323,12 +324,17 @@ static void init_QMrec(int grpnr, t_QMrec *qm,int nr, int *atomarray, qm->indexQM[i]=atomarray[i]; } + alook = gmx_mtop_atomlookup_init(mtop); + snew(qm->atomicnumberQM,nr); for (i=0;inrQMatoms;i++){ - gmx_mtop_atomnr_to_atom(mtop,qm->indexQM[i],&atom); + gmx_mtop_atomnr_to_atom(alook,qm->indexQM[i],&atom); qm->nelectrons += mtop->atomtypes.atomnumber[atom->type]; qm->atomicnumberQM[i] = mtop->atomtypes.atomnumber[atom->type]; } + + gmx_mtop_atomlookup_destroy(alook); + qm->QMcharge = ir->opts.QMcharge[grpnr]; qm->multiplicity = ir->opts.QMmult[grpnr]; qm->nelectrons -= ir->opts.QMcharge[grpnr]; @@ -456,6 +462,7 @@ void init_QMMMrec(t_commrec *cr, gmx_mtop_ilistloop_all_t iloop; int a_offset; t_ilist *ilist_mol; + gmx_mtop_atomlookup_t alook; c6au = (HARTREE2KJ*AVOGADRO*pow(BOHR2NM,6)); c12au = (HARTREE2KJ*AVOGADRO*pow(BOHR2NM,12)); @@ -610,8 +617,11 @@ void init_QMMMrec(t_commrec *cr, * Also we set the charges to zero in the md->charge arrays to prevent * the innerloops from doubly counting the electostatic QM MM interaction */ + + alook = gmx_mtop_atomlookup_init(mtop); + for (k=0;kq = 0.0; atom->qB = 0.0; } @@ -621,7 +631,7 @@ void init_QMMMrec(t_commrec *cr, init_QMrec(0,qr->qm[0],qm_nr,qm_arr,mtop,ir); if(qr->qm[0]->bOPT || qr->qm[0]->bTS){ for(i=0;iqm[0]->c6[i] = C6(fr->nbfp,mtop->ffparams.atnr, atom->type,atom->type)/c6au; qr->qm[0]->c12[i] = C12(fr->nbfp,mtop->ffparams.atnr, @@ -635,7 +645,7 @@ void init_QMMMrec(t_commrec *cr, /* find frontier atoms and mark them true in the frontieratoms array. 
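 * The search below uses the lookup-handle pattern this commit introduces
 * for repeated global-atom-number queries; as a minimal sketch (with anr
 * standing in for any global atom index):
 *   alook = gmx_mtop_atomlookup_init(mtop);
 *   gmx_mtop_atomnr_to_atom(alook,anr,&atom);
 *   ...
 *   gmx_mtop_atomlookup_destroy(alook);
 * replacing the old per-call gmx_mtop_atomnr_to_atom(mtop,anr,&atom) form.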
*/ for(i=0;iscalefactor = ir->scalefactor; diff --git a/src/mdlib/shakef.c b/src/mdlib/shakef.c index 7f003fe670..1097841124 100644 --- a/src/mdlib/shakef.c +++ b/src/mdlib/shakef.c @@ -359,7 +359,7 @@ static void check_cons(FILE *log,int nc,rvec x[],rvec prime[], rvec v[], gmx_bool bshakef(FILE *log,gmx_shakedata_t shaked, int natoms,real invmass[],int nblocks,int sblock[], - t_idef *idef,t_inputrec *ir,matrix box,rvec x_s[],rvec prime[], + t_idef *idef,t_inputrec *ir,rvec x_s[],rvec prime[], t_nrnb *nrnb,real *lagr,real lambda,real *dvdlambda, real invdt,rvec *v,gmx_bool bCalcVir,tensor rmdr,gmx_bool bDumpOnError,int econq,t_vetavars *vetavar) { diff --git a/src/mdlib/shellfc.c b/src/mdlib/shellfc.c index 5d9e7febe0..86ee3c05aa 100644 --- a/src/mdlib/shellfc.c +++ b/src/mdlib/shellfc.c @@ -117,7 +117,12 @@ static void predict_shells(FILE *fplog,rvec x[],rvec v[],real dt, int i,m,s1,n1,n2,n3; real dt_1,dt_2,dt_3,fudge,tm,m1,m2,m3; rvec *ptr; + gmx_mtop_atomlookup_t alook=NULL; t_atom *atom; + + if (mass == NULL) { + alook = gmx_mtop_atomlookup_init(mtop); + } /* We introduce a fudge factor for performance reasons: with this choice * the initial force on the shells is about a factor of two lower than @@ -171,11 +176,11 @@ static void predict_shells(FILE *fplog,rvec x[],rvec v[],real dt, m3 = mass[n3]; } else { /* Not the correct masses with FE, but it is just a prediction... */ - gmx_mtop_atomnr_to_atom(mtop,n1,&atom); + gmx_mtop_atomnr_to_atom(alook,n1,&atom); m1 = atom->m; - gmx_mtop_atomnr_to_atom(mtop,n2,&atom); + gmx_mtop_atomnr_to_atom(alook,n2,&atom); m2 = atom->m; - gmx_mtop_atomnr_to_atom(mtop,n3,&atom); + gmx_mtop_atomnr_to_atom(alook,n3,&atom); m3 = atom->m; } tm = dt_1/(m1+m2+m3); @@ -186,6 +191,10 @@ static void predict_shells(FILE *fplog,rvec x[],rvec v[],real dt, gmx_fatal(FARGS,"Shell %d has %d nuclei!",i,s[i].nnucl); } } + + if (mass == NULL) { + gmx_mtop_atomlookup_destroy(alook); + } } gmx_shellfc_t init_shell_flexcon(FILE *fplog, @@ -698,7 +707,8 @@ static void init_adir(FILE *log,gmx_shellfc_t shfc, t_commrec *cr,int dd_ac1, gmx_large_int_t step,t_mdatoms *md,int start,int end, rvec *x_old,rvec *x_init,rvec *x, - rvec *f,rvec *acc_dir,matrix box, + rvec *f,rvec *acc_dir, + gmx_bool bMolPBC,matrix box, real *lambda,real *dvdlambda,t_nrnb *nrnb) { rvec *xnold,*xnew; @@ -740,13 +750,14 @@ static void init_adir(FILE *log,gmx_shellfc_t shfc, } } constrain(log,FALSE,FALSE,constr,idef,ir,NULL,cr,step,0,md, - x,xnold-start,NULL,box, - lambda[efptBONDED],&(dvdlambda[efptBONDED]),NULL,NULL,nrnb,econqCoord,FALSE,0,0); + x,xnold-start,NULL,bMolPBC,box, + lambda[efptBONDED],&(dvdlambda[efptBONDED]), + NULL,NULL,nrnb,econqCoord,FALSE,0,0); constrain(log,FALSE,FALSE,constr,idef,ir,NULL,cr,step,0,md, - x,xnew-start,NULL,box, - lambda[efptBONDED],&(dvdlambda[efptBONDED]),NULL,NULL,nrnb,econqCoord,FALSE,0,0); + x,xnew-start,NULL,bMolPBC,box, + lambda[efptBONDED],&(dvdlambda[efptBONDED]), + NULL,NULL,nrnb,econqCoord,FALSE,0,0); - /* Set xnew to minus the acceleration */ for (n=start; nx_old-start,state->x,state->x,force[Min], - shfc->acc_dir-start,state->box,state->lambda,&dum,nrnb); + shfc->acc_dir-start, + fr->bMolPBC,state->box,state->lambda,&dum,nrnb); for(i=start; imassT[i]*norm2(shfc->acc_dir[i-start]); @@ -952,7 +965,7 @@ int relax_shell_flexcon(FILE *fplog,t_commrec *cr,gmx_bool bVerbose, init_adir(fplog,shfc, constr,idef,inputrec,cr,dd_ac1,mdstep,md,start,end, x_old-start,state->x,pos[Min],force[Min],acc_dir-start, - state->box,state->lambda,&dum,nrnb); + 
fr->bMolPBC,state->box,state->lambda,&dum,nrnb); directional_sd(fplog,pos[Min],pos[Try],acc_dir-start,start,end, fr->fc_stepsize); @@ -986,7 +999,7 @@ int relax_shell_flexcon(FILE *fplog,t_commrec *cr,gmx_bool bVerbose, init_adir(fplog,shfc, constr,idef,inputrec,cr,dd_ac1,mdstep,md,start,end, x_old-start,state->x,pos[Try],force[Try],acc_dir-start, - state->box,state->lambda,&dum,nrnb); + fr->bMolPBC,state->box,state->lambda,&dum,nrnb); for(i=start; imassT[i]*norm2(acc_dir[i-start]); diff --git a/src/mdlib/sim_util.c b/src/mdlib/sim_util.c index e06f3d88ce..92486e389b 100644 --- a/src/mdlib/sim_util.c +++ b/src/mdlib/sim_util.c @@ -63,6 +63,7 @@ #include "nrnb.h" #include "mshift.h" #include "mdrun.h" +#include "sim_util.h" #include "update.h" #include "physics.h" #include "main.h" @@ -86,6 +87,11 @@ #include "partdec.h" #include "gmx_wallcycle.h" #include "genborn.h" +#include "nbnxn_search.h" +#include "nbnxn_kernels/nbnxn_kernel_ref.h" +#include "nbnxn_kernels/nbnxn_kernel_x86_simd128.h" +#include "nbnxn_kernels/nbnxn_kernel_x86_simd256.h" +#include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h" #ifdef GMX_LIB_MPI #include @@ -97,6 +103,9 @@ #include "adress.h" #include "qmmm.h" +#include "nbnxn_cuda_data_mgmt.h" +#include "nbnxn_cuda/nbnxn_cuda.h" + #if 0 typedef struct gmx_timeprint { @@ -149,13 +158,10 @@ void print_time(FILE *out,gmx_runtime_t *runtime,gmx_large_int_t step, fprintf(out,"step %s",gmx_step_str(step,buf)); if ((step >= ir->nstlist)) { - if ((ir->nstlist == 0) || ((step % ir->nstlist) == 0)) - { - /* We have done a full cycle let's update time_per_step */ - runtime->last = gmx_gettime(); - dt = difftime(runtime->last,runtime->real); - runtime->time_per_step = dt/(step - ir->init_step + 1); - } + runtime->last = gmx_gettime(); + dt = difftime(runtime->last,runtime->real); + runtime->time_per_step = dt/(step - ir->init_step + 1); + dt = (ir->nsteps + ir->init_step - step)*runtime->time_per_step; if (ir->nsteps >= 0) @@ -343,70 +349,1046 @@ static void calc_f_el(FILE *fp,int start,int homenr, for(i=start; (ishift_vec,fr->fshift,vir_part,ePBC==epbcSCREW,box); + inc_nrnb(nrnb,eNR_VIRIAL,SHIFTS); + + /* Calculate partial virial, for local atoms only, based on short range. + * Total virial is computed in global_stat, called from do_md + */ + f_calc_vir(fplog,start,start+homenr,x,f,vir_part,graph,box); + inc_nrnb(nrnb,eNR_VIRIAL,homenr); + + /* Add position restraint contribution */ + for(i=0; ivir_diag_posres[i]; + } + + /* Add wall contribution */ + for(i=0; ivir_wall_z[i]; + } + + if (debug) + pr_rvecs(debug,0,"vir_part",vir_part,DIM); +} + +static void posres_wrapper(FILE *fplog, + int flags, + gmx_bool bSepDVDL, + t_inputrec *ir, + t_nrnb *nrnb, + gmx_localtop_t *top, + matrix box,rvec x[], + rvec f[], + gmx_enerdata_t *enerd, + real *lambda, + t_forcerec *fr) +{ + t_pbc pbc; + real v,dvdl; + int i; + + /* Position restraints always require full pbc */ + set_pbc(&pbc,ir->ePBC,box); + dvdl = 0; + v = posres(top->idef.il[F_POSRES].nr,top->idef.il[F_POSRES].iatoms, + top->idef.iparams_posres, + (const rvec*)x,fr->f_novirsum,fr->vir_diag_posres, + ir->ePBC==epbcNONE ? NULL : &pbc, + lambda[efptRESTRAINT],&dvdl, + fr->rc_scaling,fr->ePBC,fr->posres_com,fr->posres_comB); + if (bSepDVDL) + { + fprintf(fplog,sepdvdlformat, + interaction_function[F_POSRES].longname,v,dvdl); + } + enerd->term[F_POSRES] += v; + /* If just the force constant changes, the FEP term is linear, + * but if k changes, it is not. 
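+     * (Linear only when just the force constant depends on lambda; if the
+     * restraint reference positions depend on lambda as well it is not,
+     * which is why the contribution is accumulated in dvdl_nonlin.)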
+ */ + enerd->dvdl_nonlin[efptRESTRAINT] += dvdl; + inc_nrnb(nrnb,eNR_POSRES,top->idef.il[F_POSRES].nr/2); + + if ((ir->fepvals->n_lambda > 0) && (flags & GMX_FORCE_DHDL)) + { + for(i=0; in_lambda; i++) + { + real dvdl_dum,lambda_dum; + + lambda_dum = (i==0 ? lambda[efptRESTRAINT] : ir->fepvals->all_lambda[efptRESTRAINT][i-1]); + v = posres(top->idef.il[F_POSRES].nr,top->idef.il[F_POSRES].iatoms, + top->idef.iparams_posres, + (const rvec*)x,NULL,NULL, + ir->ePBC==epbcNONE ? NULL : &pbc,lambda_dum,&dvdl, + fr->rc_scaling,fr->ePBC,fr->posres_com,fr->posres_comB); + enerd->enerpart_lambda[i] += v; + } + } +} + +static void pull_potential_wrapper(FILE *fplog, + gmx_bool bSepDVDL, + t_commrec *cr, + t_inputrec *ir, + matrix box,rvec x[], + rvec f[], + tensor vir_force, + t_mdatoms *mdatoms, + gmx_enerdata_t *enerd, + real *lambda, + double t) +{ + t_pbc pbc; + real dvdl; + + /* Calculate the center of mass forces, this requires communication, + * which is why pull_potential is called close to other communication. + * The virial contribution is calculated directly, + * which is why we call pull_potential after calc_virial. + */ + set_pbc(&pbc,ir->ePBC,box); + dvdl = 0; + enerd->term[F_COM_PULL] += + pull_potential(ir->ePull,ir->pull,mdatoms,&pbc, + cr,t,lambda[efptRESTRAINT],x,f,vir_force,&dvdl); + if (bSepDVDL) + { + fprintf(fplog,sepdvdlformat,"Com pull",enerd->term[F_COM_PULL],dvdl); + } + enerd->dvdl_lin[efptRESTRAINT] += dvdl; +} + +static void pme_receive_force_ener(FILE *fplog, + gmx_bool bSepDVDL, + t_commrec *cr, + gmx_wallcycle_t wcycle, + gmx_enerdata_t *enerd, + t_forcerec *fr) +{ + real e,v,dvdl; + float cycles_ppdpme,cycles_seppme; + + cycles_ppdpme = wallcycle_stop(wcycle,ewcPPDURINGPME); + dd_cycles_add(cr->dd,cycles_ppdpme,ddCyclPPduringPME); + + /* In case of node-splitting, the PP nodes receive the long-range + * forces, virial and energy from the PME nodes here. + */ + wallcycle_start(wcycle,ewcPP_PMEWAITRECVF); + dvdl = 0; + gmx_pme_receive_f(cr,fr->f_novirsum,fr->vir_el_recip,&e,&dvdl, + &cycles_seppme); + if (bSepDVDL) + { + fprintf(fplog,sepdvdlformat,"PME mesh",e,dvdl); + } + enerd->term[F_COUL_RECIP] += e; + enerd->dvdl_lin[efptCOUL] += dvdl; + if (wcycle) + { + dd_cycles_add(cr->dd,cycles_seppme,ddCyclPME); + } + wallcycle_stop(wcycle,ewcPP_PMEWAITRECVF); +} + +static void print_large_forces(FILE *fp,t_mdatoms *md,t_commrec *cr, + gmx_large_int_t step,real pforce,rvec *x,rvec *f) +{ + int i; + real pf2,fn2; + char buf[STEPSTRSIZE]; + + pf2 = sqr(pforce); + for(i=md->start; istart+md->homenr; i++) { + fn2 = norm2(f[i]); + /* We also catch NAN, if the compiler does not optimize this away. */ + if (fn2 >= pf2 || fn2 != fn2) { + fprintf(fp,"step %s atom %6d x %8.3f %8.3f %8.3f force %12.5e\n", + gmx_step_str(step,buf), + ddglatnr(cr->dd,i),x[i][XX],x[i][YY],x[i][ZZ],sqrt(fn2)); + } + } +} + +static void post_process_forces(FILE *fplog, + t_commrec *cr, + gmx_large_int_t step, + t_nrnb *nrnb,gmx_wallcycle_t wcycle, + gmx_localtop_t *top, + matrix box,rvec x[], + rvec f[], + tensor vir_force, + t_mdatoms *mdatoms, + t_graph *graph, + t_forcerec *fr,gmx_vsite_t *vsite, + int flags) +{ + if (fr->bF_NoVirSum) + { + if (vsite) + { + /* Spread the mesh force on virtual sites to the other particles... + * This is parallellized. MPI communication is performed + * if the constructing atoms aren't local. 
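+         * The call below spreads fr->f_novirsum and, when the virial is
+         * requested, accumulates the mesh contribution directly into
+         * fr->vir_el_recip.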
+ */ + wallcycle_start(wcycle,ewcVSITESPREAD); + spread_vsite_f(fplog,vsite,x,fr->f_novirsum,NULL, + (flags & GMX_FORCE_VIRIAL),fr->vir_el_recip, + nrnb, + &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr); + wallcycle_stop(wcycle,ewcVSITESPREAD); + } + if (flags & GMX_FORCE_VIRIAL) + { + /* Now add the forces, this is local */ + if (fr->bDomDec) + { + sum_forces(0,fr->f_novirsum_n,f,fr->f_novirsum); + } + else + { + sum_forces(mdatoms->start,mdatoms->start+mdatoms->homenr, + f,fr->f_novirsum); + } + if (EEL_FULL(fr->eeltype)) + { + /* Add the mesh contribution to the virial */ + m_add(vir_force,fr->vir_el_recip,vir_force); + } + if (debug) + { + pr_rvecs(debug,0,"vir_force",vir_force,DIM); + } + } + } + + if (fr->print_force >= 0) + { + print_large_forces(stderr,mdatoms,cr,step,fr->print_force,x,f); + } +} + +static void do_nb_verlet(t_forcerec *fr, + interaction_const_t *ic, + gmx_enerdata_t *enerd, + int flags, int ilocality, + int clearF, + t_nrnb *nrnb, + gmx_wallcycle_t wcycle) +{ + int nnbl, kernel_type, sh_e; + char *env; + nonbonded_verlet_group_t *nbvg; + + if (!(flags & GMX_FORCE_NONBONDED)) + { + /* skip non-bonded calculation */ + return; + } + + nbvg = &fr->nbv->grp[ilocality]; + + /* CUDA kernel launch overhead is already timed separately */ + if (fr->cutoff_scheme != ecutsVERLET) + { + gmx_incons("Invalid cut-off scheme passed!"); + } + + if (nbvg->kernel_type != nbk8x8x8_CUDA) + { + wallcycle_sub_start(wcycle, ewcsNONBONDED); + } + switch (nbvg->kernel_type) + { + case nbk4x4_PlainC: + nbnxn_kernel_ref(&nbvg->nbl_lists, + nbvg->nbat, ic, + fr->shift_vec, + flags, + clearF, + fr->fshift[0], + enerd->grpp.ener[egCOULSR], + fr->bBHAM ? + enerd->grpp.ener[egBHAMSR] : + enerd->grpp.ener[egLJSR]); + break; + + case nbk4xN_X86_SIMD128: + nbnxn_kernel_x86_simd128(&nbvg->nbl_lists, + nbvg->nbat, ic, + fr->shift_vec, + flags, + clearF, + fr->fshift[0], + enerd->grpp.ener[egCOULSR], + fr->bBHAM ? + enerd->grpp.ener[egBHAMSR] : + enerd->grpp.ener[egLJSR]); + break; + case nbk4xN_X86_SIMD256: + nbnxn_kernel_x86_simd256(&nbvg->nbl_lists, + nbvg->nbat, ic, + fr->shift_vec, + flags, + clearF, + fr->fshift[0], + enerd->grpp.ener[egCOULSR], + fr->bBHAM ? + enerd->grpp.ener[egBHAMSR] : + enerd->grpp.ener[egLJSR]); + break; + + case nbk8x8x8_CUDA: + nbnxn_cuda_launch_kernel(fr->nbv->cu_nbv, nbvg->nbat, flags, ilocality); + break; + + case nbk8x8x8_PlainC: + nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0], + nbvg->nbat, ic, + fr->shift_vec, + flags, + clearF, + nbvg->nbat->out[0].f, + fr->fshift[0], + enerd->grpp.ener[egCOULSR], + fr->bBHAM ? + enerd->grpp.ener[egBHAMSR] : + enerd->grpp.ener[egLJSR]); + break; + + default: + gmx_incons("Invalid nonbonded kernel type passed!"); + + } + if (nbvg->kernel_type != nbk8x8x8_CUDA) + { + wallcycle_sub_stop(wcycle, ewcsNONBONDED); + } + + /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */ + sh_e = ((flags & GMX_FORCE_ENERGY) ? 1 : 0); + inc_nrnb(nrnb, + ((EEL_RF(ic->eeltype) || ic->eeltype == eelCUT) ? + eNR_NBNXN_LJ_RF : eNR_NBNXN_LJ_TAB) + sh_e, + nbvg->nbl_lists.natpair_ljq); + inc_nrnb(nrnb,eNR_NBNXN_LJ+sh_e,nbvg->nbl_lists.natpair_lj); + inc_nrnb(nrnb, + ((EEL_RF(ic->eeltype) || ic->eeltype == eelCUT) ? 
+ eNR_NBNXN_RF : eNR_NBNXN_TAB)+sh_e, + nbvg->nbl_lists.natpair_q); +} + +void do_force_cutsVERLET(FILE *fplog,t_commrec *cr, + t_inputrec *inputrec, + gmx_large_int_t step,t_nrnb *nrnb,gmx_wallcycle_t wcycle, + gmx_localtop_t *top, + gmx_mtop_t *mtop, + gmx_groups_t *groups, + matrix box,rvec x[],history_t *hist, + rvec f[], + tensor vir_force, + t_mdatoms *mdatoms, + gmx_enerdata_t *enerd,t_fcdata *fcd, + real *lambda,t_graph *graph, + t_forcerec *fr, interaction_const_t *ic, + gmx_vsite_t *vsite,rvec mu_tot, + double t,FILE *field,gmx_edsam_t ed, + gmx_bool bBornRadii, + int flags) +{ + int cg0,cg1,i,j; + int start,homenr; + int nb_kernel_type; + double mu[2*DIM]; + gmx_bool bSepDVDL,bStateChanged,bNS,bFillGrid,bCalcCGCM,bBS; + gmx_bool bDoLongRange,bDoForces,bSepLRF,bUseGPU,bUseOrEmulGPU; + gmx_bool bDiffKernels=FALSE; + matrix boxs; + rvec vzero,box_diag; + real e,v,dvdl; + float cycles_pme,cycles_force; + nonbonded_verlet_t *nbv; + + cycles_force = 0; + nbv = fr->nbv; + nb_kernel_type = fr->nbv->grp[0].kernel_type; + + start = mdatoms->start; + homenr = mdatoms->homenr; + + bSepDVDL = (fr->bSepDVDL && do_per_step(step,inputrec->nstlog)); + + clear_mat(vir_force); + + cg0 = 0; + if (DOMAINDECOMP(cr)) + { + cg1 = cr->dd->ncg_tot; + } + else + { + cg1 = top->cgs.nr; + } + if (fr->n_tpi > 0) + { + cg1--; + } + + bStateChanged = (flags & GMX_FORCE_STATECHANGED); + bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll==FALSE); + bFillGrid = (bNS && bStateChanged); + bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr)); + bDoLongRange = (fr->bTwinRange && bNS && (flags & GMX_FORCE_DOLR)); + bDoForces = (flags & GMX_FORCE_FORCES); + bSepLRF = (bDoLongRange && bDoForces && (flags & GMX_FORCE_SEPLRF)); + bUseGPU = fr->nbv->bUseGPU; + bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbk8x8x8_PlainC); + + if (bStateChanged) + { + update_forcerec(fplog,fr,box); + + if (NEED_MUTOT(*inputrec)) + { + /* Calculate total (local) dipole moment in a temporary common array. + * This makes it possible to sum them over nodes faster. + */ + calc_mu(start,homenr, + x,mdatoms->chargeA,mdatoms->chargeB,mdatoms->nChargePerturbed, + mu,mu+DIM); + } + } + + if (fr->ePBC != epbcNONE) { + /* Compute shift vectors every step, + * because of pressure coupling or box deformation! + */ + if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged) + calc_shifts(box,fr->shift_vec); + + if (bCalcCGCM) { + put_atoms_in_box_omp(fr->ePBC,box,homenr,x); + inc_nrnb(nrnb,eNR_SHIFTX,homenr); + } + else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph) { + unshift_self(graph,box,x); + } + } + + nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX, + fr->shift_vec,nbv->grp[0].nbat); + +#ifdef GMX_MPI + if (!(cr->duty & DUTY_PME)) { + /* Send particle coordinates to the pme nodes. + * Since this is only implemented for domain decomposition + * and domain decomposition does not use the graph, + * we do not need to worry about shifting. + */ + + wallcycle_start(wcycle,ewcPP_PMESENDX); + GMX_MPE_LOG(ev_send_coordinates_start); + + bBS = (inputrec->nwall == 2); + if (bBS) { + copy_mat(box,boxs); + svmul(inputrec->wall_ewald_zfac,boxs[ZZ],boxs[ZZ]); + } + + gmx_pme_send_x(cr,bBS ? 
boxs : box,x, + mdatoms->nChargePerturbed,lambda[efptCOUL], + (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),step); + + GMX_MPE_LOG(ev_send_coordinates_finish); + wallcycle_stop(wcycle,ewcPP_PMESENDX); + } +#endif /* GMX_MPI */ + + /* do gridding for pair search */ + if (bNS) + { + if (graph && bStateChanged) + { + /* Calculate intramolecular shift vectors to make molecules whole */ + mk_mshift(fplog,graph,fr->ePBC,box,x); + } + + clear_rvec(vzero); + box_diag[XX] = box[XX][XX]; + box_diag[YY] = box[YY][YY]; + box_diag[ZZ] = box[ZZ][ZZ]; + + wallcycle_start(wcycle,ewcNS); + if (!fr->bDomDec) + { + wallcycle_sub_start(wcycle,ewcsNBS_GRID_LOCAL); + nbnxn_put_on_grid(nbv->nbs,fr->ePBC,box, + 0,vzero,box_diag, + 0,mdatoms->homenr,-1,fr->cginfo,x, + 0,NULL, + nbv->grp[eintLocal].kernel_type, + nbv->grp[eintLocal].nbat); + wallcycle_sub_stop(wcycle,ewcsNBS_GRID_LOCAL); + } + else + { + wallcycle_sub_start(wcycle,ewcsNBS_GRID_NONLOCAL); + nbnxn_put_on_grid_nonlocal(nbv->nbs,domdec_zones(cr->dd), + fr->cginfo,x, + nbv->grp[eintNonlocal].kernel_type, + nbv->grp[eintNonlocal].nbat); + wallcycle_sub_stop(wcycle,ewcsNBS_GRID_NONLOCAL); + } + + if (nbv->ngrp == 1 || + nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat) + { + nbnxn_atomdata_set(nbv->grp[eintLocal].nbat,eatAll, + nbv->nbs,mdatoms,fr->cginfo); + } + else + { + nbnxn_atomdata_set(nbv->grp[eintLocal].nbat,eatLocal, + nbv->nbs,mdatoms,fr->cginfo); + nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat,eatAll, + nbv->nbs,mdatoms,fr->cginfo); + } + wallcycle_stop(wcycle, ewcNS); + } + + /* initialize the GPU atom data and copy shift vector */ + if (bUseGPU) + { + if (bNS) + { + wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB); + nbnxn_cuda_init_atomdata(nbv->cu_nbv, nbv->grp[eintLocal].nbat); + wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB); + } + + wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB); + nbnxn_cuda_upload_shiftvec(nbv->cu_nbv, nbv->grp[eintLocal].nbat); + wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB); + } + + /* do local pair search */ + if (bNS) + { + wallcycle_start_nocount(wcycle,ewcNS); + wallcycle_sub_start(wcycle,ewcsNBS_SEARCH_LOCAL); + nbnxn_make_pairlist(nbv->nbs,nbv->grp[eintLocal].nbat, + &top->excls, + ic->rlist, + nbv->min_ci_balanced, + &nbv->grp[eintLocal].nbl_lists, + eintLocal, + nbv->grp[eintLocal].kernel_type, + nrnb); + wallcycle_sub_stop(wcycle,ewcsNBS_SEARCH_LOCAL); + + if (bUseGPU) + { + /* initialize local pair-list on the GPU */ + nbnxn_cuda_init_pairlist(nbv->cu_nbv, + nbv->grp[eintLocal].nbl_lists.nbl[0], + eintLocal); + } + wallcycle_stop(wcycle, ewcNS); + } + else + { + wallcycle_start(wcycle, ewcNB_XF_BUF_OPS); + wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS); + nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs,eatLocal,FALSE,x, + nbv->grp[eintLocal].nbat); + wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS); + wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS); + } + + if (bUseGPU) + { + wallcycle_start(wcycle,ewcLAUNCH_GPU_NB); + /* launch local nonbonded F on GPU */ + do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo, + nrnb, wcycle); + wallcycle_stop(wcycle,ewcLAUNCH_GPU_NB); + } + + /* Communicate coordinates and sum dipole if necessary + + do non-local pair search */ + if (DOMAINDECOMP(cr)) + { + bDiffKernels = (nbv->grp[eintNonlocal].kernel_type != + nbv->grp[eintLocal].kernel_type); + + if (bDiffKernels) + { + /* With GPU+CPU non-bonded calculations we need to copy + * the local coordinates to the non-local nbat struct + * (in CPU format) as the non-local kernel call also + * calculates the local - non-local 
interactions. + */ + wallcycle_start(wcycle, ewcNB_XF_BUF_OPS); + wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS); + nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs,eatLocal,TRUE,x, + nbv->grp[eintNonlocal].nbat); + wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS); + wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS); + } + + if (bNS) + { + wallcycle_start_nocount(wcycle,ewcNS); + wallcycle_sub_start(wcycle,ewcsNBS_SEARCH_NONLOCAL); + + if (bDiffKernels) + { + nbnxn_grid_add_simple(nbv->nbs,nbv->grp[eintNonlocal].nbat); + } + + nbnxn_make_pairlist(nbv->nbs,nbv->grp[eintNonlocal].nbat, + &top->excls, + ic->rlist, + nbv->min_ci_balanced, + &nbv->grp[eintNonlocal].nbl_lists, + eintNonlocal, + nbv->grp[eintNonlocal].kernel_type, + nrnb); + + wallcycle_sub_stop(wcycle,ewcsNBS_SEARCH_NONLOCAL); + + if (nbv->grp[eintNonlocal].kernel_type == nbk8x8x8_CUDA) + { + /* initialize non-local pair-list on the GPU */ + nbnxn_cuda_init_pairlist(nbv->cu_nbv, + nbv->grp[eintNonlocal].nbl_lists.nbl[0], + eintNonlocal); + } + wallcycle_stop(wcycle,ewcNS); + } + else + { + wallcycle_start(wcycle,ewcMOVEX); + dd_move_x(cr->dd,box,x); + + /* When we don't need the total dipole we sum it in global_stat */ + if (bStateChanged && NEED_MUTOT(*inputrec)) + { + gmx_sumd(2*DIM,mu,cr); + } + wallcycle_stop(wcycle,ewcMOVEX); + + wallcycle_start(wcycle, ewcNB_XF_BUF_OPS); + wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS); + nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs,eatNonlocal,FALSE,x, + nbv->grp[eintNonlocal].nbat); + wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS); + cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS); + } + + if (bUseGPU && !bDiffKernels) + { + wallcycle_start(wcycle,ewcLAUNCH_GPU_NB); + /* launch non-local nonbonded F on GPU */ + do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo, + nrnb, wcycle); + cycles_force += wallcycle_stop(wcycle,ewcLAUNCH_GPU_NB); + } + } + + if (bUseGPU) + { + /* launch D2H copy-back F */ + wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB); + if (DOMAINDECOMP(cr) && !bDiffKernels) + { + nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintNonlocal].nbat, + flags, eatNonlocal); + } + nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintLocal].nbat, + flags, eatLocal); + cycles_force += wallcycle_stop(wcycle,ewcLAUNCH_GPU_NB); + } + + if (bStateChanged && NEED_MUTOT(*inputrec)) + { + if (PAR(cr)) + { + gmx_sumd(2*DIM,mu,cr); + } + + for(i=0; i<2; i++) + { + for(j=0;jmu_tot[i][j] = mu[i*DIM + j]; + } + } + } + if (fr->efep == efepNO) + { + copy_rvec(fr->mu_tot[0],mu_tot); + } + else + { + for(j=0; jmu_tot[0][j] + + lambda[efptCOUL]*fr->mu_tot[1][j]; + } + } + + /* Reset energies */ + reset_enerdata(&(inputrec->opts),fr,bNS,enerd,MASTER(cr)); + clear_rvecs(SHIFTS,fr->fshift); + + if (DOMAINDECOMP(cr)) + { + if (!(cr->duty & DUTY_PME)) + { + wallcycle_start(wcycle,ewcPPDURINGPME); + dd_force_flop_start(cr->dd,nrnb); + } + } + + /* Start the force cycle counter. + * This counter is stopped in do_forcelow_level. + * No parallel communication should occur while this counter is running, + * since that will interfere with the dynamic load balancing. 
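+     * The pairing used in this function is, in sketch form:
+     *   wallcycle_start(wcycle,ewcFORCE);
+     *   ...force kernels only, no MPI...
+     *   cycles_force += wallcycle_stop(wcycle,ewcFORCE);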
+ */ + wallcycle_start(wcycle,ewcFORCE); + if (bDoForces) + { + /* Reset forces for which the virial is calculated separately: + * PME/Ewald forces if necessary */ + if (fr->bF_NoVirSum) + { + if (flags & GMX_FORCE_VIRIAL) + { + fr->f_novirsum = fr->f_novirsum_alloc; + GMX_BARRIER(cr->mpi_comm_mygroup); + if (fr->bDomDec) + { + clear_rvecs(fr->f_novirsum_n,fr->f_novirsum); + } + else + { + clear_rvecs(homenr,fr->f_novirsum+start); + } + GMX_BARRIER(cr->mpi_comm_mygroup); + } + else + { + /* We are not calculating the pressure so we do not need + * a separate array for forces that do not contribute + * to the pressure. + */ + fr->f_novirsum = f; + } + } + + if (bSepLRF) + { + /* Add the long range forces to the short range forces */ + for(i=0; inatoms_force_constr; i++) + { + copy_rvec(fr->f_twin[i],f[i]); + } + } + else if (!(fr->bTwinRange && bNS)) + { + /* Clear the short-range forces */ + clear_rvecs(fr->natoms_force_constr,f); + } + + clear_rvec(fr->vir_diag_posres); + + GMX_BARRIER(cr->mpi_comm_mygroup); + } + if (inputrec->ePull == epullCONSTRAINT) + { + clear_pull_forces(inputrec->pull); + } + + /* update QMMMrec, if necessary */ + if(fr->bQMMM) + { + update_QMMMrec(cr,fr,x,mdatoms,box,top); + } + + if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0) + { + posres_wrapper(fplog,flags,bSepDVDL,inputrec,nrnb,top,box,x, + f,enerd,lambda,fr); + } + + /* Compute the bonded and non-bonded energies and optionally forces */ + /* if we use the GPU turn off the nonbonded */ + do_force_lowlevel(fplog,step,fr,inputrec,&(top->idef), + cr,nrnb,wcycle,mdatoms,&(inputrec->opts), + x,hist,f,enerd,fcd,mtop,top,fr->born, + &(top->atomtypes),bBornRadii,box, + inputrec->fepvals,lambda,graph,&(top->excls),fr->mu_tot, + ((nb_kernel_type == nbk8x8x8_CUDA || nb_kernel_type == nbk8x8x8_PlainC) + ? flags&~GMX_FORCE_NONBONDED : flags), + &cycles_pme); + + if (!bUseOrEmulGPU) + { + /* Maybe we should move this into do_force_lowlevel */ + do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes, + nrnb, wcycle); + } + + + if (!bUseOrEmulGPU || bDiffKernels) + { + int aloc; + + if (DOMAINDECOMP(cr)) + { + do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, + bDiffKernels ? enbvClearFYes : enbvClearFNo, + nrnb, wcycle); + } + + if (!bUseOrEmulGPU) + { + aloc = eintLocal; + } + else + { + aloc = eintNonlocal; + } + + /* Add all the non-bonded force to the normal force array. + * This can be split into a local a non-local part when overlapping + * communication with calculation with domain decomposition. 
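+     * (That would mean an eatLocal reduction plus an eatNonlocal one
+     * overlapped with communication; here both parts are reduced in a
+     * single eatAll pass.)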
+ */ + cycles_force += wallcycle_stop(wcycle,ewcFORCE); + wallcycle_start(wcycle, ewcNB_XF_BUF_OPS); + wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS); + nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs,eatAll,nbv->grp[aloc].nbat,f); + wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS); + cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS); + wallcycle_start_nocount(wcycle,ewcFORCE); + + /* if there are multiple fshift output buffers reduce them */ + if ((flags & GMX_FORCE_VIRIAL) && + nbv->grp[aloc].nbl_lists.nnbl > 1) + { + nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat, + fr->fshift); + } + } + + cycles_force += wallcycle_stop(wcycle,ewcFORCE); + GMX_BARRIER(cr->mpi_comm_mygroup); + + if (ed) + { + do_flood(fplog,cr,x,f,ed,box,step,bNS); + } + + if (bUseOrEmulGPU && !bDiffKernels) + { + /* wait for non-local forces (or calculate in emulation mode) */ + if (DOMAINDECOMP(cr)) + { + if (bUseGPU) + { + wallcycle_start(wcycle,ewcWAIT_GPU_NB_NL); + nbnxn_cuda_wait_gpu(nbv->cu_nbv, + nbv->grp[eintNonlocal].nbat, + flags, eatNonlocal, + enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR], + fr->fshift); + cycles_force += wallcycle_stop(wcycle,ewcWAIT_GPU_NB_NL); + } + else + { + wallcycle_start_nocount(wcycle,ewcFORCE); + do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes, + nrnb, wcycle); + cycles_force += wallcycle_stop(wcycle,ewcFORCE); + } + wallcycle_start(wcycle, ewcNB_XF_BUF_OPS); + wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS); + /* skip the reduction if there was no non-local work to do */ + if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0) + { + nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs,eatNonlocal, + nbv->grp[eintNonlocal].nbat,f); + } + wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS); + cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS); + } + } + + if (bDoForces) + { + /* Communicate the forces */ + if (PAR(cr)) + { + wallcycle_start(wcycle,ewcMOVEF); + if (DOMAINDECOMP(cr)) + { + dd_move_f(cr->dd,f,fr->fshift); + /* Do we need to communicate the separate force array + * for terms that do not contribute to the single sum virial? + * Position restraints and electric fields do not introduce + * inter-cg forces, only full electrostatics methods do. + * When we do not calculate the virial, fr->f_novirsum = f, + * so we have already communicated these forces. + */ + if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl && + (flags & GMX_FORCE_VIRIAL)) + { + dd_move_f(cr->dd,fr->f_novirsum,NULL); + } + if (bSepLRF) + { + /* We should not update the shift forces here, + * since f_twin is already included in f. + */ + dd_move_f(cr->dd,fr->f_twin,NULL); + } + } + wallcycle_stop(wcycle,ewcMOVEF); + } + } + + if (bUseOrEmulGPU) + { + /* wait for local forces (or calculate in emulation mode) */ + if (bUseGPU) + { + wallcycle_start(wcycle,ewcWAIT_GPU_NB_L); + nbnxn_cuda_wait_gpu(nbv->cu_nbv, + nbv->grp[eintLocal].nbat, + flags, eatLocal, + enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR], + fr->fshift); + wallcycle_stop(wcycle,ewcWAIT_GPU_NB_L); + + /* now clear the GPU outputs while we finish the step on the CPU */ + nbnxn_cuda_clear_outputs(nbv->cu_nbv, flags); + } + else + { + wallcycle_start_nocount(wcycle,ewcFORCE); + do_nb_verlet(fr, ic, enerd, flags, eintLocal, + DOMAINDECOMP(cr) ? 
enbvClearFNo : enbvClearFYes, + nrnb, wcycle); + wallcycle_stop(wcycle,ewcFORCE); + } + wallcycle_start(wcycle, ewcNB_XF_BUF_OPS); + wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS); + if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0) + { + /* skip the reduction if there was no non-local work to do */ + nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs,eatLocal, + nbv->grp[eintLocal].nbat,f); + } + wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS); + wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS); + } + + if (DOMAINDECOMP(cr)) + { + dd_force_flop_stop(cr->dd,nrnb); + if (wcycle) + { + dd_cycles_add(cr->dd,cycles_force-cycles_pme,ddCyclF); + } + } + + if (bDoForces) + { + if (IR_ELEC_FIELD(*inputrec)) + { + /* Compute forces due to electric field */ + calc_f_el(MASTER(cr) ? field : NULL, + start,homenr,mdatoms->chargeA,x,fr->f_novirsum, + inputrec->ex,inputrec->et,t); + } + + /* If we have NoVirSum forces, but we do not calculate the virial, + * we sum fr->f_novirum=f later. + */ + if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL))) + { + wallcycle_start(wcycle,ewcVSITESPREAD); + spread_vsite_f(fplog,vsite,x,f,fr->fshift,FALSE,NULL,nrnb, + &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr); + wallcycle_stop(wcycle,ewcVSITESPREAD); + + if (bSepLRF) + { + wallcycle_start(wcycle,ewcVSITESPREAD); + spread_vsite_f(fplog,vsite,x,fr->f_twin,NULL,FALSE,NULL, + nrnb, + &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr); + wallcycle_stop(wcycle,ewcVSITESPREAD); + } + } + + if (flags & GMX_FORCE_VIRIAL) { - Ext[m] = 0; + /* Calculation of the virial must be done after vsites! */ + calc_virial(fplog,mdatoms->start,mdatoms->homenr,x,f, + vir_force,graph,box,nrnb,fr,inputrec->ePBC); } } - if (fp != NULL) + + if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F) { - fprintf(fp,"%10g %10g %10g %10g #FIELD\n",t, - Ext[XX]/FIELDFAC,Ext[YY]/FIELDFAC,Ext[ZZ]/FIELDFAC); + pull_potential_wrapper(fplog,bSepDVDL,cr,inputrec,box,x, + f,vir_force,mdatoms,enerd,lambda,t); } -} - -static void calc_virial(FILE *fplog,int start,int homenr,rvec x[],rvec f[], - tensor vir_part,t_graph *graph,matrix box, - t_nrnb *nrnb,const t_forcerec *fr,int ePBC) -{ - int i,j; - tensor virtest; - - /* The short-range virial from surrounding boxes */ - clear_mat(vir_part); - calc_vir(fplog,SHIFTS,fr->shift_vec,fr->fshift,vir_part,ePBC==epbcSCREW,box); - inc_nrnb(nrnb,eNR_VIRIAL,SHIFTS); - - /* Calculate partial virial, for local atoms only, based on short range. - * Total virial is computed in global_stat, called from do_md - */ - f_calc_vir(fplog,start,start+homenr,x,f,vir_part,graph,box); - inc_nrnb(nrnb,eNR_VIRIAL,homenr); - - /* Add position restraint contribution */ - for(i=0; ivir_diag_posres[i]; - } - - /* Add wall contribution */ - for(i=0; ivir_wall_z[i]; - } - - if (debug) - pr_rvecs(debug,0,"vir_part",vir_part,DIM); -} -static void print_large_forces(FILE *fp,t_mdatoms *md,t_commrec *cr, - gmx_large_int_t step,real pforce,rvec *x,rvec *f) -{ - int i; - real pf2,fn2; - char buf[STEPSTRSIZE]; + if (PAR(cr) && !(cr->duty & DUTY_PME)) + { + /* In case of node-splitting, the PP nodes receive the long-range + * forces, virial and energy from the PME nodes here. + */ + pme_receive_force_ener(fplog,bSepDVDL,cr,wcycle,enerd,fr); + } - pf2 = sqr(pforce); - for(i=md->start; istart+md->homenr; i++) { - fn2 = norm2(f[i]); - /* We also catch NAN, if the compiler does not optimize this away. 
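 * (fn2 != fn2 holds only for NaN, since under IEEE 754 NaN compares
 * unequal to everything, including itself.)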
*/ - if (fn2 >= pf2 || fn2 != fn2) { - fprintf(fp,"step %s atom %6d x %8.3f %8.3f %8.3f force %12.5e\n", - gmx_step_str(step,buf), - ddglatnr(cr->dd,i),x[i][XX],x[i][YY],x[i][ZZ],sqrt(fn2)); + if (bDoForces) + { + post_process_forces(fplog,cr,step,nrnb,wcycle, + top,box,x,f,vir_force,mdatoms,graph,fr,vsite, + flags); } - } + + /* Sum the potential energy terms from group contributions */ + sum_epot(&(inputrec->opts),enerd); } -void do_force(FILE *fplog,t_commrec *cr, +void do_force_cutsGROUP(FILE *fplog,t_commrec *cr, t_inputrec *inputrec, gmx_large_int_t step,t_nrnb *nrnb,gmx_wallcycle_t wcycle, gmx_localtop_t *top, @@ -430,10 +1412,10 @@ void do_force(FILE *fplog,t_commrec *cr, gmx_bool bDoLongRange,bDoForces,bSepLRF; gmx_bool bDoAdressWF; matrix boxs; + rvec vzero,box_diag; real e,v,dvdlambda[efptNR]; - real dvdl_dum,lambda_dum; t_pbc pbc; - float cycles_ppdpme,cycles_pme,cycles_seppme,cycles_force; + float cycles_pme,cycles_force; start = mdatoms->start; homenr = mdatoms->homenr; @@ -477,68 +1459,71 @@ void do_force(FILE *fplog,t_commrec *cr, { update_forcerec(fplog,fr,box); - /* Calculate total (local) dipole moment in a temporary common array. - * This makes it possible to sum them over nodes faster. - */ - calc_mu(start,homenr, - x,mdatoms->chargeA,mdatoms->chargeB,mdatoms->nChargePerturbed, - mu,mu+DIM); + if (NEED_MUTOT(*inputrec)) + { + /* Calculate total (local) dipole moment in a temporary common array. + * This makes it possible to sum them over nodes faster. + */ + calc_mu(start,homenr, + x,mdatoms->chargeA,mdatoms->chargeB,mdatoms->nChargePerturbed, + mu,mu+DIM); + } } - if (fr->ePBC != epbcNONE) { - /* Compute shift vectors every step, - * because of pressure coupling or box deformation! - */ - if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged) - calc_shifts(box,fr->shift_vec); - - if (bCalcCGCM) { - put_charge_groups_in_box(fplog,cg0,cg1,fr->ePBC,box, - &(top->cgs),x,fr->cg_cm); - inc_nrnb(nrnb,eNR_CGCM,homenr); - inc_nrnb(nrnb,eNR_RESETX,cg1-cg0); - } - else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph) { - unshift_self(graph,box,x); + if (fr->ePBC != epbcNONE) { + /* Compute shift vectors every step, + * because of pressure coupling or box deformation! + */ + if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged) + calc_shifts(box,fr->shift_vec); + + if (bCalcCGCM) { + put_charge_groups_in_box(fplog,cg0,cg1,fr->ePBC,box, + &(top->cgs),x,fr->cg_cm); + inc_nrnb(nrnb,eNR_CGCM,homenr); + inc_nrnb(nrnb,eNR_RESETX,cg1-cg0); + } + else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph) { + unshift_self(graph,box,x); + } + } + else if (bCalcCGCM) { + calc_cgcm(fplog,cg0,cg1,&(top->cgs),x,fr->cg_cm); + inc_nrnb(nrnb,eNR_CGCM,homenr); } - } - else if (bCalcCGCM) { - calc_cgcm(fplog,cg0,cg1,&(top->cgs),x,fr->cg_cm); - inc_nrnb(nrnb,eNR_CGCM,homenr); - } - if (bCalcCGCM) { - if (PAR(cr)) { - move_cgcm(fplog,cr,fr->cg_cm); + if (bCalcCGCM) { + if (PAR(cr)) { + move_cgcm(fplog,cr,fr->cg_cm); + } + if (gmx_debug_at) + pr_rvecs(debug,0,"cgcm",fr->cg_cm,top->cgs.nr); } - if (gmx_debug_at) - pr_rvecs(debug,0,"cgcm",fr->cg_cm,top->cgs.nr); - } #ifdef GMX_MPI - if (!(cr->duty & DUTY_PME)) { - /* Send particle coordinates to the pme nodes. - * Since this is only implemented for domain decomposition - * and domain decomposition does not use the graph, - * we do not need to worry about shifting. - */ + if (!(cr->duty & DUTY_PME)) { + /* Send particle coordinates to the pme nodes. 
+ * Since this is only implemented for domain decomposition + * and domain decomposition does not use the graph, + * we do not need to worry about shifting. + */ + + wallcycle_start(wcycle,ewcPP_PMESENDX); + GMX_MPE_LOG(ev_send_coordinates_start); + + bBS = (inputrec->nwall == 2); + if (bBS) { + copy_mat(box,boxs); + svmul(inputrec->wall_ewald_zfac,boxs[ZZ],boxs[ZZ]); + } - wallcycle_start(wcycle,ewcPP_PMESENDX); - GMX_MPE_LOG(ev_send_coordinates_start); + gmx_pme_send_x(cr,bBS ? boxs : box,x, + mdatoms->nChargePerturbed,lambda[efptCOUL], + (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),step); - bBS = (inputrec->nwall == 2); - if (bBS) { - copy_mat(box,boxs); - svmul(inputrec->wall_ewald_zfac,boxs[ZZ],boxs[ZZ]); + GMX_MPE_LOG(ev_send_coordinates_finish); + wallcycle_stop(wcycle,ewcPP_PMESENDX); } - - gmx_pme_send_x(cr,bBS ? boxs : box,x, - mdatoms->nChargePerturbed,lambda[efptCOUL], - ( flags & GMX_FORCE_VIRIAL),step); - - GMX_MPE_LOG(ev_send_coordinates_finish); - wallcycle_stop(wcycle,ewcPP_PMESENDX); - } #endif /* GMX_MPI */ /* Communicate coordinates and sum dipole if necessary */ @@ -553,60 +1538,63 @@ void do_force(FILE *fplog,t_commrec *cr, { move_x(fplog,cr,GMX_LEFT,GMX_RIGHT,x,nrnb); } - /* When we don't need the total dipole we sum it in global_stat */ - if (bStateChanged && NEED_MUTOT(*inputrec)) + wallcycle_stop(wcycle,ewcMOVEX); + } + + /* update adress weight beforehand */ + if(bStateChanged && bDoAdressWF) + { + /* need pbc for adress weight calculation with pbc_dx */ + set_pbc(&pbc,inputrec->ePBC,box); + if(fr->adress_site == eAdressSITEcog) { - gmx_sumd(2*DIM,mu,cr); + update_adress_weights_cog(top->idef.iparams,top->idef.il,x,fr,mdatoms, + inputrec->ePBC==epbcNONE ? NULL : &pbc); + } + else if (fr->adress_site == eAdressSITEcom) + { + update_adress_weights_com(fplog,cg0,cg1,&(top->cgs),x,fr,mdatoms, + inputrec->ePBC==epbcNONE ? NULL : &pbc); + } + else if (fr->adress_site == eAdressSITEatomatom){ + update_adress_weights_atom_per_atom(cg0,cg1,&(top->cgs),x,fr,mdatoms, + inputrec->ePBC==epbcNONE ? NULL : &pbc); + } + else + { + update_adress_weights_atom(cg0,cg1,&(top->cgs),x,fr,mdatoms, + inputrec->ePBC==epbcNONE ? NULL : &pbc); } - wallcycle_stop(wcycle,ewcMOVEX); } - if (bStateChanged) + + if (NEED_MUTOT(*inputrec)) { - /* update adress weight beforehand */ - if(bDoAdressWF) + if (bStateChanged) { - /* need pbc for adress weight calculation with pbc_dx */ - set_pbc(&pbc,inputrec->ePBC,box); - if(fr->adress_site == eAdressSITEcog) - { - update_adress_weights_cog(top->idef.iparams,top->idef.il,x,fr,mdatoms, - inputrec->ePBC==epbcNONE ? NULL : &pbc); - } - else if (fr->adress_site == eAdressSITEcom) + if (PAR(cr)) { - update_adress_weights_com(fplog,cg0,cg1,&(top->cgs),x,fr,mdatoms, - inputrec->ePBC==epbcNONE ? NULL : &pbc); - } - else if (fr->adress_site == eAdressSITEatomatom){ - update_adress_weights_atom_per_atom(cg0,cg1,&(top->cgs),x,fr,mdatoms, - inputrec->ePBC==epbcNONE ? NULL : &pbc); + gmx_sumd(2*DIM,mu,cr); } - else + for(i=0; i<2; i++) { - update_adress_weights_atom(cg0,cg1,&(top->cgs),x,fr,mdatoms, - inputrec->ePBC==epbcNONE ? 
NULL : &pbc); + for(j=0;jmu_tot[i][j] = mu[i*DIM + j]; + } } } - - for(i=0; i<2; i++) + if (fr->efep == efepNO) { - for(j=0;jmu_tot[i][j] = mu[i*DIM + j]; - } + copy_rvec(fr->mu_tot[0],mu_tot); } - } - if (fr->efep == efepNO) - { - copy_rvec(fr->mu_tot[0],mu_tot); - } - else - { - for(j=0; jmu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j]; + for(j=0; jmu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j]; + } } } @@ -680,7 +1668,7 @@ void do_force(FILE *fplog,t_commrec *cr, * since that will interfere with the dynamic load balancing. */ wallcycle_start(wcycle,ewcFORCE); - + if (bDoForces) { /* Reset forces for which the virial is calculated separately: @@ -742,54 +1730,19 @@ void do_force(FILE *fplog,t_commrec *cr, if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0) { - /* Position restraints always require full pbc. Check if we already did it for Adress */ - if(!(bStateChanged && bDoAdressWF)) - { - set_pbc(&pbc,inputrec->ePBC,box); - } - v = posres(top->idef.il[F_POSRES].nr,top->idef.il[F_POSRES].iatoms, - top->idef.iparams_posres, - (const rvec*)x,fr->f_novirsum,fr->vir_diag_posres, - inputrec->ePBC==epbcNONE ? NULL : &pbc,lambda[efptRESTRAINT],&(dvdlambda[efptRESTRAINT]), - fr->rc_scaling,fr->ePBC,fr->posres_com,fr->posres_comB); - if (bSepDVDL) - { - fprintf(fplog,sepdvdlformat, - interaction_function[F_POSRES].longname,v,dvdlambda); - } - enerd->term[F_POSRES] += v; - /* This linear lambda dependence assumption is only correct - * when only k depends on lambda, - * not when the reference position depends on lambda. - * grompp checks for this. (verify this is still the case?) - */ - enerd->dvdl_nonlin[efptRESTRAINT] += dvdlambda[efptRESTRAINT]; /* if just the force constant changes, this is linear, - but we can't be sure w/o additional checking that is - hard to do at this level of code. Otherwise, - the dvdl is not differentiable */ - inc_nrnb(nrnb,eNR_POSRES,top->idef.il[F_POSRES].nr/2); - if ((inputrec->fepvals->n_lambda > 0) && (flags & GMX_FORCE_DHDL)) - { - for(i=0; in_lambda; i++) - { - lambda_dum = (i==0 ? lambda[efptRESTRAINT] : inputrec->fepvals->all_lambda[efptRESTRAINT][i-1]); - v = posres(top->idef.il[F_POSRES].nr,top->idef.il[F_POSRES].iatoms, - top->idef.iparams_posres, - (const rvec*)x,NULL,NULL, - inputrec->ePBC==epbcNONE ? NULL : &pbc,lambda_dum,&dvdl_dum, - fr->rc_scaling,fr->ePBC,fr->posres_com,fr->posres_comB); - enerd->enerpart_lambda[i] += v; - } - } - } + posres_wrapper(fplog,flags,bSepDVDL,inputrec,nrnb,top,box,x, + f,enerd,lambda,fr); + } /* Compute the bonded and non-bonded energies and optionally forces */ do_force_lowlevel(fplog,step,fr,inputrec,&(top->idef), cr,nrnb,wcycle,mdatoms,&(inputrec->opts), x,hist,f,enerd,fcd,mtop,top,fr->born, &(top->atomtypes),bBornRadii,box, - inputrec->fepvals,lambda,graph,&(top->excls),fr->mu_tot, - flags,&cycles_pme); + inputrec->fepvals,lambda, + graph,&(top->excls),fr->mu_tot, + flags, + &cycles_pme); cycles_force = wallcycle_stop(wcycle,ewcFORCE); GMX_BARRIER(cr->mpi_comm_mygroup); @@ -891,24 +1844,10 @@ void do_force(FILE *fplog,t_commrec *cr, } } - enerd->term[F_COM_PULL] = 0; if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F) { - /* Calculate the center of mass forces, this requires communication, - * which is why pull_potential is called close to other communication. - * The virial contribution is calculated directly, - * which is why we call pull_potential after calc_virial. 
- */ - set_pbc(&pbc,inputrec->ePBC,box); - dvdlambda[efptRESTRAINT] = 0; - enerd->term[F_COM_PULL] += - pull_potential(inputrec->ePull,inputrec->pull,mdatoms,&pbc, - cr,t,lambda[efptRESTRAINT],x,f,vir_force,&(dvdlambda[efptRESTRAINT])); - if (bSepDVDL) - { - fprintf(fplog,sepdvdlformat,"Com pull",enerd->term[F_COM_PULL],dvdlambda[efptRESTRAINT]); - } - enerd->dvdl_lin[efptRESTRAINT] += dvdlambda[efptRESTRAINT]; + pull_potential_wrapper(fplog,bSepDVDL,cr,inputrec,box,x, + f,vir_force,mdatoms,enerd,lambda,t); } /* Add the forces from enforced rotation potentials (if any) */ @@ -921,76 +1860,86 @@ void do_force(FILE *fplog,t_commrec *cr, if (PAR(cr) && !(cr->duty & DUTY_PME)) { - cycles_ppdpme = wallcycle_stop(wcycle,ewcPPDURINGPME); - dd_cycles_add(cr->dd,cycles_ppdpme,ddCyclPPduringPME); - - /* In case of node-splitting, the PP nodes receive the long-range + /* In case of node-splitting, the PP nodes receive the long-range * forces, virial and energy from the PME nodes here. */ - wallcycle_start(wcycle,ewcPP_PMEWAITRECVF); - dvdlambda[efptCOUL] = 0; - gmx_pme_receive_f(cr,fr->f_novirsum,fr->vir_el_recip,&e,&dvdlambda[efptCOUL], - &cycles_seppme); - if (bSepDVDL) - { - fprintf(fplog,sepdvdlformat,"PME mesh",e,dvdlambda[efptCOUL]); - } - enerd->term[F_COUL_RECIP] += e; - enerd->dvdl_lin[efptCOUL] += dvdlambda[efptCOUL]; - if (wcycle) - { - dd_cycles_add(cr->dd,cycles_seppme,ddCyclPME); - } - wallcycle_stop(wcycle,ewcPP_PMEWAITRECVF); + pme_receive_force_ener(fplog,bSepDVDL,cr,wcycle,enerd,fr); } - if (bDoForces && fr->bF_NoVirSum) + if (bDoForces) { - if (vsite) - { - /* Spread the mesh force on virtual sites to the other particles... - * This is parallellized. MPI communication is performed - * if the constructing atoms aren't local. - */ - wallcycle_start(wcycle,ewcVSITESPREAD); - spread_vsite_f(fplog,vsite,x,fr->f_novirsum,NULL, - (flags & GMX_FORCE_VIRIAL),fr->vir_el_recip, - nrnb, - &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr); - wallcycle_stop(wcycle,ewcVSITESPREAD); - } - if (flags & GMX_FORCE_VIRIAL) - { - /* Now add the forces, this is local */ - if (fr->bDomDec) - { - sum_forces(0,fr->f_novirsum_n,f,fr->f_novirsum); - } - else - { - sum_forces(start,start+homenr,f,fr->f_novirsum); - } - if (EEL_FULL(fr->eeltype)) - { - /* Add the mesh contribution to the virial */ - m_add(vir_force,fr->vir_el_recip,vir_force); - } - if (debug) - { - pr_rvecs(debug,0,"vir_force",vir_force,DIM); - } - } + post_process_forces(fplog,cr,step,nrnb,wcycle, + top,box,x,f,vir_force,mdatoms,graph,fr,vsite, + flags); } /* Sum the potential energy terms from group contributions */ sum_epot(&(inputrec->opts),enerd); +} + +void do_force(FILE *fplog,t_commrec *cr, + t_inputrec *inputrec, + gmx_large_int_t step,t_nrnb *nrnb,gmx_wallcycle_t wcycle, + gmx_localtop_t *top, + gmx_mtop_t *mtop, + gmx_groups_t *groups, + matrix box,rvec x[],history_t *hist, + rvec f[], + tensor vir_force, + t_mdatoms *mdatoms, + gmx_enerdata_t *enerd,t_fcdata *fcd, + real *lambda,t_graph *graph, + t_forcerec *fr, + gmx_vsite_t *vsite,rvec mu_tot, + double t,FILE *field,gmx_edsam_t ed, + gmx_bool bBornRadii, + int flags) +{ + /* modify force flag if not doing nonbonded */ + if (!fr->bNonbonded) + { + flags &= ~GMX_FORCE_NONBONDED; + } - if (fr->print_force >= 0 && bDoForces) + switch (inputrec->cutoff_scheme) { - print_large_forces(stderr,mdatoms,cr,step,fr->print_force,x,f); + case ecutsVERLET: + do_force_cutsVERLET(fplog, cr, inputrec, + step, nrnb, wcycle, + top, mtop, + groups, + box, x, hist, + f, vir_force, + mdatoms, + enerd, fcd, 
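Note: with this commit do_force() reduces to a thin dispatcher over inputrec->cutoff_scheme, as the switch above shows. The shape of that dispatch in miniature (handler bodies stubbed out; only the enum names are taken from the patch):

    #include <stdio.h>
    #include <stdlib.h>

    enum { ecutsVERLET, ecutsGROUP };

    static void dispatch_do_force(int cutoff_scheme)
    {
        switch (cutoff_scheme)
        {
            case ecutsVERLET:
                puts("Verlet-scheme path");   /* do_force_cutsVERLET(...) */
                break;
            case ecutsGROUP:
                puts("group-scheme path");    /* do_force_cutsGROUP(...) */
                break;
            default:
                /* the real code calls gmx_incons() here */
                fprintf(stderr, "Invalid cut-off scheme passed!\n");
                abort();
        }
    }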
+ lambda, graph, + fr, fr->ic, + vsite, mu_tot, + t, field, ed, + bBornRadii, + flags); + break; + case ecutsGROUP: + do_force_cutsGROUP(fplog, cr, inputrec, + step, nrnb, wcycle, + top, mtop, + groups, + box, x, hist, + f, vir_force, + mdatoms, + enerd, fcd, + lambda, graph, + fr, vsite, mu_tot, + t, field, ed, + bBornRadii, + flags); + break; + default: + gmx_incons("Invalid cut-off scheme passed!"); } } + void do_constrain_first(FILE *fplog,gmx_constr_t constr, t_inputrec *ir,t_mdatoms *md, t_state *state,rvec *f, @@ -1025,8 +1974,10 @@ void do_constrain_first(FILE *fplog,gmx_constr_t constr, constrain(NULL,TRUE,FALSE,constr,&(top->idef), ir,NULL,cr,step,0,md, state->x,state->x,NULL, - state->box,state->lambda[efptBONDED],&dvdl_dum, - NULL,NULL,nrnb,econqCoord,ir->epc==epcMTTK,state->veta,state->veta); + fr->bMolPBC,state->box, + state->lambda[efptBONDED],&dvdl_dum, + NULL,NULL,nrnb,econqCoord, + ir->epc==epcMTTK,state->veta,state->veta); if (EI_VV(ir->eI)) { /* constrain the initial velocity, and save it */ @@ -1035,8 +1986,10 @@ void do_constrain_first(FILE *fplog,gmx_constr_t constr, constrain(NULL,TRUE,FALSE,constr,&(top->idef), ir,NULL,cr,step,0,md, state->x,state->v,state->v, - state->box,state->lambda[efptBONDED],&dvdl_dum, - NULL,NULL,nrnb,econqVeloc,ir->epc==epcMTTK,state->veta,state->veta); + fr->bMolPBC,state->box, + state->lambda[efptBONDED],&dvdl_dum, + NULL,NULL,nrnb,econqVeloc, + ir->epc==epcMTTK,state->veta,state->veta); } /* constrain the initial velocities at t-dt/2 */ if (EI_STATE_VELOCITY(ir->eI) && ir->eI!=eiVV) @@ -1064,9 +2017,11 @@ void do_constrain_first(FILE *fplog,gmx_constr_t constr, constrain(NULL,TRUE,FALSE,constr,&(top->idef), ir,NULL,cr,step,-1,md, state->x,savex,NULL, - state->box,state->lambda[efptBONDED],&dvdl_dum, - state->v,NULL,nrnb,econqCoord,ir->epc==epcMTTK,state->veta,state->veta); - + fr->bMolPBC,state->box, + state->lambda[efptBONDED],&dvdl_dum, + state->v,NULL,nrnb,econqCoord, + ir->epc==epcMTTK,state->veta,state->veta); + for(i=start; i<end; i++) { for(m=0; m<DIM; m++) { /* Re-reverse the velocities */ state->v[i][m] = -state->v[i][m]; } } @@ ... @@ static void calc_enervirdiff(FILE *fplog,int eDispCorr,t_forcerec *fr) rc3 = fr->rvdw*fr->rvdw*fr->rvdw; rc9 = rc3*rc3*rc3; + /* Contribution beyond the cut-off */ eners[0] += -4.0*M_PI/(3.0*rc3); eners[1] += 4.0*M_PI/(9.0*rc9); + if (fr->cutoff_scheme == ecutsVERLET && fr->ic->sh_invrc6 != 0) { + /* Contribution within the cut-off */ + eners[0] += -4.0*M_PI/(3.0*rc3); + eners[1] += 4.0*M_PI/(3.0*rc9); + } + /* Contribution beyond the cut-off */ virs[0] += 8.0*M_PI/rc3; virs[1] += -16.0*M_PI/(3.0*rc9); } else { @@ -1399,37 +2361,54 @@ void finish_run(FILE *fplog,t_commrec *cr,const char *confout, t_inputrec *inputrec, t_nrnb nrnb[],gmx_wallcycle_t wcycle, gmx_runtime_t *runtime, + wallclock_gpu_t *gputimes, + int omp_nth_pp, gmx_bool bWriteStat) { - int i,j; - t_nrnb *nrnb_tot=NULL; - real delta_t; - double nbfs,mflop; - double cycles[ewcNR]; + int i,j; + t_nrnb *nrnb_tot=NULL; + real delta_t; + double nbfs,mflop; - wallcycle_sum(cr,wcycle,cycles); + wallcycle_sum(cr,wcycle); - if (cr->nnodes > 1) { - if (SIMMASTER(cr)) - snew(nrnb_tot,1); + if (cr->nnodes > 1) + { + snew(nrnb_tot,1); #ifdef GMX_MPI - MPI_Reduce(nrnb->n,nrnb_tot->n,eNRNB,MPI_DOUBLE,MPI_SUM, - MASTERRANK(cr),cr->mpi_comm_mysim); + MPI_Allreduce(nrnb->n,nrnb_tot->n,eNRNB,MPI_DOUBLE,MPI_SUM, + cr->mpi_comm_mysim); #endif - } else { - nrnb_tot = nrnb; - } + } + else + { + nrnb_tot = nrnb; + } - if (SIMMASTER(cr)) { - print_flop(fplog,nrnb_tot,&nbfs,&mflop); - if (cr->nnodes > 1) { - sfree(nrnb_tot); +#if defined(GMX_MPI) && !defined(GMX_THREAD_MPI) + if (cr->nnodes > 1) + { + /* reduce nodetime over all MPI processes in the
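Note: the dispersion-correction terms accumulated above (in calc_enervirdiff) are the analytic tail integrals of the r^-6 and r^-12 parts of the LJ potential, written per unit C6/C12 coefficient and unit density. A quick derivation:

    \int_{r_c}^{\infty} 4\pi r^{2}\,(-r^{-6})\,dr = -\frac{4\pi}{3\,r_c^{3}},
    \qquad
    \int_{r_c}^{\infty} 4\pi r^{2}\,r^{-12}\,dr = \frac{4\pi}{9\,r_c^{9}},

which are exactly eners[0] += -4.0*M_PI/(3.0*rc3) and eners[1] += 4.0*M_PI/(9.0*rc9); the extra within-cut-off terms in the Verlet branch compensate for the shifted potential used by that scheme.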
current simulation */ + double sum; + MPI_Allreduce(&runtime->proctime,&sum,1,MPI_DOUBLE,MPI_SUM, + cr->mpi_comm_mysim); + runtime->proctime = sum; } - } +#endif - if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr)) { - print_dd_statistics(cr,inputrec,fplog); - } + if (SIMMASTER(cr)) + { + print_flop(fplog,nrnb_tot,&nbfs,&mflop); + } + if (cr->nnodes > 1) + { + sfree(nrnb_tot); + } + + if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr)) + { + print_dd_statistics(cr,inputrec,fplog); + } #ifdef GMX_MPI if (PARTDECOMP(cr)) @@ -1458,44 +2437,35 @@ void finish_run(FILE *fplog,t_commrec *cr,const char *confout, } #endif - if (SIMMASTER(cr)) { - wallcycle_print(fplog,cr->nnodes,cr->npmenodes,runtime->realtime, - wcycle,cycles); - - if (EI_DYNAMICS(inputrec->eI)) { - delta_t = inputrec->delta_t; - } else { - delta_t = 0; - } + if (SIMMASTER(cr)) + { + wallcycle_print(fplog,cr->nnodes,cr->npmenodes,runtime->realtime, + wcycle,gputimes); - if (fplog) { - print_perf(fplog,runtime->proctime,runtime->realtime, - cr->nnodes-cr->npmenodes, - runtime->nsteps_done,delta_t,nbfs,mflop); - } - if (bWriteStat) { - print_perf(stderr,runtime->proctime,runtime->realtime, - cr->nnodes-cr->npmenodes, - runtime->nsteps_done,delta_t,nbfs,mflop); - } + if (EI_DYNAMICS(inputrec->eI)) + { + delta_t = inputrec->delta_t; + } + else + { + delta_t = 0; + } - /* - runtime=inputrec->nsteps*inputrec->delta_t; - if (bWriteStat) { - if (cr->nnodes == 1) - fprintf(stderr,"\n\n"); - print_perf(stderr,nodetime,realtime,runtime,&ntot, - cr->nnodes-cr->npmenodes,FALSE); + if (fplog) + { + print_perf(fplog,runtime->proctime,runtime->realtime, + cr->nnodes-cr->npmenodes, + runtime->nsteps_done,delta_t,nbfs,mflop, + omp_nth_pp); + } + if (bWriteStat) + { + print_perf(stderr,runtime->proctime,runtime->realtime, + cr->nnodes-cr->npmenodes, + runtime->nsteps_done,delta_t,nbfs,mflop, + omp_nth_pp); + } } - wallcycle_print(fplog,cr->nnodes,cr->npmenodes,realtime,wcycle,cycles); - print_perf(fplog,nodetime,realtime,runtime,&ntot,cr->nnodes-cr->npmenodes, - TRUE); - if (PARTDECOMP(cr)) - pr_load(fplog,cr,nrnb_all); - if (cr->nnodes > 1) - sfree(nrnb_all); - */ - } } extern void initialize_lambdas(FILE *fplog,t_inputrec *ir,int *fep_state,real *lambda,double *lam0) @@ -1641,6 +2611,3 @@ void init_md(FILE *fplog, debug_gmx(); } - - - diff --git a/src/mdlib/stat.c b/src/mdlib/stat.c index b7343eb93c..88dbfef23d 100644 --- a/src/mdlib/stat.c +++ b/src/mdlib/stat.c @@ -65,8 +65,10 @@ #include "partdec.h" #include "constr.h" #include "checkpoint.h" -#include "mdrun.h" #include "xvgr.h" +#include "md_support.h" +#include "mdrun.h" +#include "sim_util.h" typedef struct gmx_global_stat { diff --git a/src/mdlib/tables.c b/src/mdlib/tables.c index f0c73782ec..0c26ea947a 100644 --- a/src/mdlib/tables.c +++ b/src/mdlib/tables.c @@ -1,4 +1,5 @@ -/* +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * * * This source code is part of * @@ -50,6 +51,7 @@ #include "physics.h" #include "force.h" #include "gmxfio.h" +#include "tables.h" /* All the possible (implemented) table functions */ enum { @@ -124,6 +126,192 @@ typedef struct { #define pow4(x) ((x)*(x)*(x)*(x)) #define pow5(x) ((x)*(x)*(x)*(x)*(x)) + +static double v_ewald_lr(double beta,double r) +{ + if (r == 0) + { + return beta*2/sqrt(M_PI); + } + else + { + return gmx_erfd(beta*r)/r; + } +} + +void table_spline3_fill_ewald_lr(real *tabf,real *tabv, + int ntab,int tableformat, + real dx,real beta) +{ + real tab_max; + int stride=0; + int i,i_inrange; + double 
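Note: the r == 0 branch of v_ewald_lr above is the analytic limit of erf(beta*r)/r, which follows from the Taylor series of the error function:

    \operatorname{erf}(x) = \frac{2}{\sqrt{\pi}}\left(x - \frac{x^{3}}{3} + \cdots\right)
    \quad\Longrightarrow\quad
    \lim_{r\to 0}\frac{\operatorname{erf}(\beta r)}{r} = \frac{2\beta}{\sqrt{\pi}},

matching the beta*2/sqrt(M_PI) return value.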
dc,dc_new; + gmx_bool bOutOfRange; + double v_r0,v_r1,v_inrange,vi,a0,a1,a2dx; + double x_r0; + + if (ntab < 2) + { + gmx_fatal(FARGS,"Can not make a spline table with less than 2 points"); + } + + /* We need some margin to be able to divide table values by r + * in the kernel and also to do the integration arithmetic + * without going out of range. Furthermore, we divide by dx below. + */ + tab_max = GMX_REAL_MAX*0.0001; + + /* This function produces a table with: + * maximum energy error: V'''/(6*12*sqrt(3))*dx^3 + * maximum force error: V'''/(6*4)*dx^2 + * The rms force error is the max error times 1/sqrt(5)=0.45. + */ + + switch (tableformat) + { + case tableformatF: stride = 1; break; + case tableformatFDV0: stride = 4; break; + default: gmx_incons("Unknown table format"); + } + + bOutOfRange = FALSE; + i_inrange = ntab; + v_inrange = 0; + dc = 0; + for(i=ntab-1; i>=0; i--) + { + x_r0 = i*dx; + + v_r0 = v_ewald_lr(beta,x_r0); + + if (!bOutOfRange) + { + i_inrange = i; + v_inrange = v_r0; + + vi = v_r0; + } + else + { + /* Linear continuation for the last point in range */ + vi = v_inrange - dc*(i - i_inrange)*dx; + } + + switch (tableformat) + { + case tableformatF: + if (tabv != NULL) + { + tabv[i] = vi; + } + break; + case tableformatFDV0: + tabf[i*stride+2] = vi; + tabf[i*stride+3] = 0; + break; + default: + gmx_incons("Unknown table format"); + } + + if (i == 0) + { + continue; + } + + /* Get the potential at table point i-1 */ + v_r1 = v_ewald_lr(beta,(i-1)*dx); + + if (v_r1 != v_r1 || v_r1 < -tab_max || v_r1 > tab_max) + { + bOutOfRange = TRUE; + } + + if (!bOutOfRange) + { + /* Calculate the average second derivative times dx over interval i-1 to i. + * Using the function values at the end points and in the middle. + */ + a2dx = (v_r0 + v_r1 - 2*v_ewald_lr(beta,x_r0-0.5*dx))/(0.25*dx); + /* Set the derivative of the spline to match the difference in potential + * over the interval plus the average effect of the quadratic term. + * This is the essential step for minimizing the error in the force. + */ + dc = (v_r0 - v_r1)/dx + 0.5*a2dx; + } + + if (i == ntab - 1) + { + /* Fill the table with the force, minus the derivative of the spline */ + tabf[i*stride] = -dc; + } + else + { + /* tab[i] will contain the average of the splines over the two intervals */ + tabf[i*stride] += -0.5*dc; + } + + if (!bOutOfRange) + { + /* Make spline s(x) = a0 + a1*(x - xr) + 0.5*a2*(x - xr)^2 + * matching the potential at the two end points + * and the derivative dc at the end point xr.
+ */ + a0 = v_r0; + a1 = dc; + a2dx = (a1*dx + v_r1 - a0)*2/dx; + + /* Set dc to the derivative at the next point */ + dc_new = a1 - a2dx; + + if (dc_new != dc_new || dc_new < -tab_max || dc_new > tab_max) + { + bOutOfRange = TRUE; + } + else + { + dc = dc_new; + } + } + + tabf[(i-1)*stride] = -0.5*dc; + } + /* Currently the last value only contains half the force: double it */ + tabf[0] *= 2; + + if (tableformat == tableformatFDV0) + { + /* Store the force difference in the second entry */ + for(i=0; i<ntab-1; i++) + { + tabf[i*stride+1] = tabf[(i+1)*stride] - tabf[i*stride]; + } + } +} @@ ... @@ diff --git a/src/mdlib/tgroup.c b/src/mdlib/tgroup.c @@ ... @@ void init_ekindata(FILE *log,gmx_mtop_t *mtop,t_grpopts *opts, gmx_ekindata_t *ekind) fprintf(log,"ngtc: %d, ngacc: %d, ngener: %d\n",opts->ngtc,opts->ngacc, opts->ngener); @@ -119,6 +121,21 @@ ekind->tcstat[i].ekinscalef_nhc = 1.0; } + nthread = gmx_omp_nthreads_get(emntUpdate); + + snew(ekind->ekin_work_alloc,nthread); + snew(ekind->ekin_work,nthread); +#pragma omp parallel for num_threads(nthread) schedule(static) + for(thread=0; thread<nthread; thread++) + { + snew(ekind->ekin_work_alloc[thread],ekind->ngtc+4); + ekind->ekin_work[thread] = ekind->ekin_work_alloc[thread] + 2; + } + ekind->ngacc = opts->ngacc; snew(ekind->grpstat,opts->ngacc); init_grpstat(log,mtop,opts->ngacc,ekind->grpstat); diff --git a/src/mdlib/tpi.c b/src/mdlib/tpi.c index 65e3fde043..a5a2a71893 100644 --- a/src/mdlib/tpi.c +++ b/src/mdlib/tpi.c @@ -567,7 +567,7 @@ double do_tpi(FILE *fplog,t_commrec *cr, f,force_vir,mdatoms,enerd,fcd, state->lambda, NULL,fr,NULL,mu_tot,t,NULL,NULL,FALSE, - GMX_FORCE_NONBONDED | + GMX_FORCE_NONBONDED | GMX_FORCE_ENERGY | (bNS ? GMX_FORCE_DYNAMICBOX | GMX_FORCE_NS | GMX_FORCE_DOLR : 0) | (bStateChanged ? GMX_FORCE_STATECHANGED : 0)); cr->nnodes = nnodes; diff --git a/src/mdlib/update.c b/src/mdlib/update.c index 1ca0b6148b..e4e4763ad0 100644 --- a/src/mdlib/update.c +++ b/src/mdlib/update.c @@ -41,6 +41,7 @@ #include <stdio.h> #include <math.h> +#include "types/commrec.h" #include "sysstuff.h" #include "smalloc.h" #include "typedefs.h" @@ -66,6 +67,7 @@ #include "disre.h" #include "orires.h" #include "gmx_wallcycle.h" +#include "gmx_omp_nthreads.h" /* For debugging, start at v(-dt/2) for velocity Verlet -- uncomment next line */ /*#define STARTFROMDT2*/ @@ -121,8 +123,11 @@ typedef struct gmx_update static void do_update_md(int start,int nrend,double dt, - t_grp_tcstat *tcstat,t_grp_acc *gstat,double nh_vxi[], - rvec accel[],ivec nFreeze[],real invmass[], + t_grp_tcstat *tcstat, + double nh_vxi[], + gmx_bool bNEMD,t_grp_acc *gstat,rvec accel[], + ivec nFreeze[], + real invmass[], unsigned short ptype[],unsigned short cFREEZE[], unsigned short cACC[],unsigned short cTC[], rvec x[],rvec xprime[],rvec v[], @@ -182,11 +187,13 @@ static void do_update_md(int start,int nrend,double dt, } } } - } - else + } + else if (cFREEZE != NULL || + nFreeze[0][XX] || nFreeze[0][YY] || nFreeze[0][ZZ] || + bNEMD) { - /* Classic version of update, used with berendsen coupling */ - for(n=start; n<nrend; n++) @@ ... @@ static void calc_ke_part_normal(rvec v[], t_grpopts *opts,t_mdatoms *md, gmx_ekindata_t *ekind,t_nrnb *nrnb, gmx_bool bEkinAveVel,gmx_bool bSaveEkinOld) { - int start=md->start,homenr=md->homenr; - int g,d,n,m,ga=0,gt=0; - rvec v_corrt; - real hm; + int g; t_grp_tcstat *tcstat=ekind->tcstat; t_grp_acc *grpstat=ekind->grpstat; - real dekindl; + int nthread,thread; /* three main: VV with AveVel, vv with AveEkin, leap with AveEkin. Leap with AveVel is also an option, but not supported now. Additionally, if we are doing iterations.
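Note: the spline construction above rests on two small finite-difference identities for a quadratic V on an interval of width dx ending at x0, with V_r0 = V(x0), V_r1 = V(x0-dx) and midpoint value V_m:

    V(x_0) + V(x_0 - dx) - 2\,V\!\left(x_0 - \tfrac{dx}{2}\right) = \tfrac{1}{4}V''\,dx^{2}
    \;\Longrightarrow\;
    V''\,dx = \frac{V_{r0} + V_{r1} - 2V_{m}}{dx/4},
    \qquad
    V'(x_0) = \frac{V_{r0} - V_{r1}}{dx} + \tfrac{1}{2}\,V''\,dx,

which are the a2dx and dc expressions in the loop; matching the end-point derivative this way is what keeps the force error at the V'''/(6*4)*dx^2 bound quoted in the header comment.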
@@ -857,46 +894,86 @@ static void calc_ke_part_normal(rvec v[], t_grpopts *opts,t_mdatoms *md, } } ekind->dekindl_old = ekind->dekindl; + + nthread = gmx_omp_nthreads_get(emntUpdate); - dekindl = 0; - for(n=start; (n<start+homenr); n++) - { - if (md->cACC) - { - ga = md->cACC[n]; - } - if (md->cTC) - { - gt = md->cTC[n]; - } - hm = 0.5*md->massT[n]; +#pragma omp parallel for num_threads(nthread) schedule(static) + for(thread=0; thread<nthread; thread++) + { + int start_t,end_t,n; + int ga,gt; + rvec v_corrt; + real hm; + int d,m; + tensor *ekin_sum; + real *dekindl_sum; + + start_t = md->start + ((thread+0)*md->homenr)/nthread; + end_t = md->start + ((thread+1)*md->homenr)/nthread; + + ekin_sum = ekind->ekin_work[thread]; + dekindl_sum = &ekind->ekin_work[thread][opts->ngtc][0][0]; + + for(gt=0; gt<opts->ngtc; gt++) + { + clear_mat(ekin_sum[gt]); + } + + ga = 0; + gt = 0; + for(n=start_t; n<end_t; n++) + { + if (md->cACC) + { + ga = md->cACC[n]; + } + if (md->cTC) + { + gt = md->cTC[n]; + } + hm = 0.5*md->massT[n]; + + for(d=0; (d<DIM); d++) + { + v_corrt[d] = v[n][d] - grpstat[ga].u[d]; + } + for(d=0; (d<DIM); d++) + { + for(m=0; (m<DIM); m++) + { + if (bEkinAveVel) + { + ekin_sum[gt][m][d] += hm*v_corrt[m]*v_corrt[d]; + } + else + { + ekin_sum[gt][m][d] += 0.5*hm*v_corrt[m]*v_corrt[d]; + } + } + } + if (md->nMassPerturbed && md->bPerturbed[n]) + { + *dekindl_sum -= + 0.5*(md->massB[n] - md->massA[n])*iprod(v_corrt,v_corrt); + } + } + } + + ekind->dekindl = 0; + for(thread=0; thread<nthread; thread++) + { + for(g=0; g<opts->ngtc; g++) + { + if (bEkinAveVel) + { + m_add(tcstat[g].ekinf,ekind->ekin_work[thread][g], + tcstat[g].ekinf); + } + else + { + m_add(tcstat[g].ekinh,ekind->ekin_work[thread][g], + tcstat[g].ekinh); + } + } + + ekind->dekindl += ekind->ekin_work[thread][opts->ngtc][0][0]; + } + + inc_nrnb(nrnb,eNR_EKIN,md->homenr); } static void calc_ke_part_visc(matrix box,rvec x[],rvec v[], @@ -1131,7 +1208,9 @@ static void deform(gmx_update_t upd, static void combine_forces(int nstlist, gmx_constr_t constr, t_inputrec *ir,t_mdatoms *md,t_idef *idef, - t_commrec *cr,gmx_large_int_t step,t_state *state, + t_commrec *cr, + gmx_large_int_t step, + t_state *state,gmx_bool bMolPBC, int start,int nrend, rvec f[],rvec f_lr[], t_nrnb *nrnb) @@ -1152,7 +1231,7 @@ static void combine_forces(int nstlist, */ /* MRS -- need to make sure this works with trotter integration -- the constraint calls may not be right.*/ constrain(NULL,FALSE,FALSE,constr,idef,ir,NULL,cr,step,0,md, - state->x,f_lr,f_lr,state->box,state->lambda[efptBONDED],NULL, + state->x,f_lr,f_lr,bMolPBC,state->box,state->lambda[efptBONDED],NULL, NULL,NULL,nrnb,econqForce,ir->epc==epcMTTK,state->veta,state->veta); } @@ -1316,6 +1395,7 @@ void update_constraints(FILE *fplog, gmx_ekindata_t *ekind, t_mdatoms *md, t_state *state, + gmx_bool bMolPBC, t_graph *graph, rvec force[], /* forces on home particles */ t_idef *idef, @@ -1379,7 +1459,8 @@ void update_constraints(FILE *fplog, constrain(NULL,bLog,bEner,constr,idef, inputrec,ekind,cr,step,1,md, state->x,state->v,state->v, - state->box,state->lambda[efptBONDED],dvdlambda, + bMolPBC,state->box, + state->lambda[efptBONDED],dvdlambda, NULL,bCalcVir ? &vir_con : NULL,nrnb,econqVeloc, inputrec->epc==epcMTTK,state->veta,vetanew); } @@ -1388,7 +1469,8 @@ void update_constraints(FILE *fplog, constrain(NULL,bLog,bEner,constr,idef, inputrec,ekind,cr,step,1,md, state->x,xprime,NULL, - state->box,state->lambda[efptBONDED],dvdlambda, + bMolPBC,state->box, + state->lambda[efptBONDED],dvdlambda, state->v,bCalcVir ?
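Note: the rewritten kinetic-energy loop uses a standard OpenMP pattern: per-thread accumulation buffers (ekin_work, padded on both sides) followed by a serial reduction with m_add. A minimal, self-contained instance of the same pattern (all names hypothetical, padding size illustrative):

    #include <omp.h>

    #define NTH_MAX 64

    static double threaded_half_vsq_sum(const double *v, int n)
    {
        /* padding keeps each thread's accumulator on its own cache line */
        struct { double sum; char pad[56]; } work[NTH_MAX] = {{0}};
        int nth = omp_get_max_threads();
        if (nth > NTH_MAX)
        {
            nth = NTH_MAX;
        }
    #pragma omp parallel num_threads(nth)
        {
            int t  = omp_get_thread_num();
            int i0 = ( t   *n)/nth;   /* same block split as start_t/end_t */
            int i1 = ((t+1)*n)/nth;
            for (int i = i0; i < i1; i++)
            {
                work[t].sum += 0.5*v[i]*v[i];
            }
        }
        double total = 0;
        for (int t = 0; t < nth; t++)
        {
            total += work[t].sum;     /* serial reduction, like m_add() above */
        }
        return total;
    }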
&vir_con : NULL ,nrnb,econqCoord, inputrec->epc==epcMTTK,state->veta,state->veta); } @@ -1448,7 +1530,8 @@ void update_constraints(FILE *fplog, constrain(NULL,bLog,bEner,constr,idef, inputrec,NULL,cr,step,1,md, state->x,xprime,NULL, - state->box,state->lambda[efptBONDED],dvdlambda, + bMolPBC,state->box, + state->lambda[efptBONDED],dvdlambda, NULL,NULL,nrnb,econqCoord,FALSE,0,0); wallcycle_stop(wcycle,ewcCONSTR); } @@ -1473,7 +1556,11 @@ void update_constraints(FILE *fplog, } else { - copy_rvecn(upd->xp,state->x,start,nrend); +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntUpdate)) schedule(static) + for(i=start; i<nrend; i++) + { + copy_rvec(upd->xp[i],state->x[i]); + } } dump_it_all(fplog,"After unshift", @@ -1595,6 +1682,7 @@ void update_coords(FILE *fplog, t_inputrec *inputrec, /* input record and box stuff */ t_mdatoms *md, t_state *state, + gmx_bool bMolPBC, rvec *f, /* forces on home particles */ gmx_bool bDoLR, rvec *f_lr, @@ -1620,7 +1708,7 @@ void update_coords(FILE *fplog, int *icom = NULL; tensor vir_con; rvec *vcom,*xcom,*vall,*xall,*xin,*vin,*forcein,*fall,*xpall,*xprimein,*xprime; - + int nth,th; /* Running the velocity half does nothing except for velocity verlet */ if ((UpdatePart == etrtVELOCITY1 || UpdatePart == etrtVELOCITY2) && @@ -1659,7 +1747,8 @@ void update_coords(FILE *fplog, * to produce twin time stepping. */ /* is this correct in the new construction? MRS */ - combine_forces(inputrec->nstlist,constr,inputrec,md,idef,cr,step,state, + combine_forces(inputrec->nstlist,constr,inputrec,md,idef,cr, + step,state,bMolPBC, start,nrend,f,f_lr,nrnb); force = f_lr; } @@ -1673,79 +1762,108 @@ void update_coords(FILE *fplog, dump_it_all(fplog,"Before update", state->natoms,state->x,xprime,state->v,force); - switch (inputrec->eI) { - case (eiMD): - if (ekind->cosacc.cos_accel == 0) { - /* use normal version of update */ - do_update_md(start,nrend,dt, - ekind->tcstat,ekind->grpstat,state->nosehoover_vxi, - inputrec->opts.acc,inputrec->opts.nFreeze,md->invmass,md->ptype, - md->cFREEZE,md->cACC,md->cTC, - state->x,xprime,state->v,force,M, - bNH,bPR); - } - else - { - do_update_visc(start,nrend,dt, - ekind->tcstat,md->invmass,state->nosehoover_vxi, - md->ptype,md->cTC,state->x,xprime,state->v,force,M, - state->box,ekind->cosacc.cos_accel,ekind->cosacc.vcos,bNH,bPR); - } - break; - case (eiSD1): - do_update_sd1(upd->sd,start,homenr,dt, - inputrec->opts.acc,inputrec->opts.nFreeze, - md->invmass,md->ptype, - md->cFREEZE,md->cACC,md->cTC, - state->x,xprime,state->v,force,state->sd_X, - inputrec->opts.ngtc,inputrec->opts.tau_t,inputrec->opts.ref_t); - break; - case (eiSD2): - /* The SD update is done in 2 parts, because an extra constraint step - * is needed + if (EI_RANDOM(inputrec->eI)) + { + /* We still need to take care of generating random seeds properly + * when multi-threading.
*/ - do_update_sd2(upd->sd,bInitStep,start,homenr, - inputrec->opts.acc,inputrec->opts.nFreeze, - md->invmass,md->ptype, - md->cFREEZE,md->cACC,md->cTC, - state->x,xprime,state->v,force,state->sd_X, - inputrec->opts.ngtc,inputrec->opts.tau_t,inputrec->opts.ref_t, - TRUE); - break; - case (eiBD): - do_update_bd(start,nrend,dt, - inputrec->opts.nFreeze,md->invmass,md->ptype, - md->cFREEZE,md->cTC, - state->x,xprime,state->v,force, - inputrec->bd_fric, - inputrec->opts.ngtc,inputrec->opts.tau_t,inputrec->opts.ref_t, - upd->sd->bd_rf,upd->sd->gaussrand); + nth = 1; + } + else + { + nth = gmx_omp_nthreads_get(emntUpdate); + } + +# pragma omp parallel for num_threads(nth) schedule(static) + for(th=0; th<nth; th++) + { + int start_th,end_th; + + start_th = start + ((nrend-start)* th )/nth; + end_th = start + ((nrend-start)*(th+1))/nth; + + switch (inputrec->eI) { + case (eiMD): + if (ekind->cosacc.cos_accel == 0) + { + do_update_md(start_th,end_th,dt, + ekind->tcstat,state->nosehoover_vxi, + ekind->bNEMD,ekind->grpstat,inputrec->opts.acc, + inputrec->opts.nFreeze, + md->invmass,md->ptype, + md->cFREEZE,md->cACC,md->cTC, + state->x,xprime,state->v,force,M, + bNH,bPR); + } + else + { + do_update_visc(start_th,end_th,dt, + ekind->tcstat,state->nosehoover_vxi, + md->invmass,md->ptype, + md->cTC,state->x,xprime,state->v,force,M, + state->box, + ekind->cosacc.cos_accel, + ekind->cosacc.vcos, + bNH,bPR); + } + break; + case (eiSD1): + do_update_sd1(upd->sd,start,homenr,dt, + inputrec->opts.acc,inputrec->opts.nFreeze, + md->invmass,md->ptype, + md->cFREEZE,md->cACC,md->cTC, + state->x,xprime,state->v,force,state->sd_X, + inputrec->opts.ngtc,inputrec->opts.tau_t,inputrec->opts.ref_t); + break; + case (eiSD2): + /* The SD update is done in 2 parts, because an extra constraint step + * is needed + */ + do_update_sd2(upd->sd,bInitStep,start,homenr, + inputrec->opts.acc,inputrec->opts.nFreeze, + md->invmass,md->ptype, + md->cFREEZE,md->cACC,md->cTC, + state->x,xprime,state->v,force,state->sd_X, + inputrec->opts.ngtc,inputrec->opts.tau_t,inputrec->opts.ref_t, + TRUE); break; - case (eiVV): - case (eiVVAK): - alpha = 1.0 + DIM/((double)inputrec->opts.nrdf[0]); /* assuming barostat coupled to group 0. */ - switch (UpdatePart) { - case etrtVELOCITY1: - case etrtVELOCITY2: - do_update_vv_vel(start,nrend,dt, - ekind->tcstat,ekind->grpstat, - inputrec->opts.acc,inputrec->opts.nFreeze, - md->invmass,md->ptype,md->cFREEZE,md->cACC, - state->v,force,(bNH || bPR),state->veta,alpha); + case (eiBD): + do_update_bd(start,nrend,dt, + inputrec->opts.nFreeze,md->invmass,md->ptype, + md->cFREEZE,md->cTC, + state->x,xprime,state->v,force, + inputrec->bd_fric, + inputrec->opts.ngtc,inputrec->opts.tau_t,inputrec->opts.ref_t, + upd->sd->bd_rf,upd->sd->gaussrand); + break; + case (eiVV): + case (eiVVAK): + alpha = 1.0 + DIM/((double)inputrec->opts.nrdf[0]); /* assuming barostat coupled to group 0.
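Note: the start_th/end_th arithmetic above hands each thread a contiguous, non-overlapping block whose boundaries meet exactly. A sketch plus a worked example (values assumed):

    /* With start=0, nrend=10, nth=4 this yields [0,2) [2,5) [5,7) [7,10):
     * thread k ends where thread k+1 begins, so every index is updated
     * exactly once. */
    static void thread_range(int start, int nrend, int nth, int th,
                             int *start_th, int *end_th)
    {
        *start_th = start + ((nrend - start)* th   )/nth;
        *end_th   = start + ((nrend - start)*(th+1))/nth;
    }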
*/ + switch (UpdatePart) { + case etrtVELOCITY1: + case etrtVELOCITY2: + do_update_vv_vel(start_th,end_th,dt, + ekind->tcstat,ekind->grpstat, + inputrec->opts.acc,inputrec->opts.nFreeze, + md->invmass,md->ptype, + md->cFREEZE,md->cACC, + state->v,force, + (bNH || bPR),state->veta,alpha); + break; + case etrtPOSITION: + do_update_vv_pos(start_th,end_th,dt, + ekind->tcstat,ekind->grpstat, + inputrec->opts.acc,inputrec->opts.nFreeze, + md->invmass,md->ptype,md->cFREEZE, + state->x,xprime,state->v,force, + (bNH || bPR),state->veta,alpha); + break; + } break; - case etrtPOSITION: - do_update_vv_pos(start,nrend,dt, - ekind->tcstat,ekind->grpstat, - inputrec->opts.acc,inputrec->opts.nFreeze, - md->invmass,md->ptype,md->cFREEZE, - state->x,xprime,state->v,force, - (bNH || bPR) ,state->veta,alpha); + default: + gmx_fatal(FARGS,"Don't know how to update coordinates"); break; } - break; - default: - gmx_fatal(FARGS,"Don't know how to update coordinates"); - break; } } diff --git a/src/tools/addconf.c b/src/tools/addconf.c index aad6a64a7c..a474002228 100644 --- a/src/tools/addconf.c +++ b/src/tools/addconf.c @@ -267,8 +267,8 @@ void do_nsgrid(FILE *fp,gmx_bool bVerbose, /* create free energy data to avoid NULLs */ snew(ir->fepvals,1); printf("Neighborsearching with a cut-off of %g\n",rlong); - init_forcerec(stdout,oenv,fr,NULL,ir,mtop,cr,box,FALSE,NULL,NULL,NULL,NULL, - TRUE,-1); + init_forcerec(stdout,oenv,fr,NULL,ir,mtop,cr,box,FALSE, + NULL,NULL,NULL,NULL,NULL,TRUE,-1); if (debug) pr_forcerec(debug,fr,cr); @@ -528,7 +528,7 @@ void add_conf(t_atoms *atoms, rvec **x, rvec **v, real **r, gmx_bool bSrenew, atoms->resinfo[atoms->nres-1].nr = resnr; /* calculate shift of the solvent molecule using the first atom */ copy_rvec(x_solvt[i],dx); - put_atoms_in_box(box,1,&dx); + put_atoms_in_box(ePBC,box,1,&dx); rvec_dec(dx,x_solvt[i]); } atoms->atom[atoms->nr] = atoms_solvt->atom[i]; diff --git a/src/tools/calcpot.c b/src/tools/calcpot.c index c60122a495..ff0a5bec4a 100644 --- a/src/tools/calcpot.c +++ b/src/tools/calcpot.c @@ -36,6 +36,9 @@ #include <config.h> #endif +#include + +#include "types/commrec.h" #include "vec.h" #include "calcpot.h" #include "nrnb.h" @@ -284,7 +287,7 @@ FILE *init_calcpot(const char *log,const char *tpx,const char *table, /* Initiate forcerecord */ *fr = mk_forcerec(); init_forcerec(fplog,oenv,*fr,NULL,inputrec,mtop,*cr, - state->box,FALSE,table,NULL,table,NULL,TRUE,-1); + state->box,FALSE,table,NULL,table,NULL,NULL,TRUE,-1); /* Remove periodicity */ for(m=0; (m<DIM); m++) @@ ... @@ diff --git a/src/tools/gmx_clustsize.c b/src/tools/gmx_clustsize.c @@ ... @@ ekin += 0.5*atom->m*iprod(v[ai],v[ai]); } temp = (ekin*2.0)/(3.0*tfac*max_clust_size*BOLTZ); @@ -281,6 +284,9 @@ static void clust_size(const char *ndx,const char *trx,const char *xpm, ffclose(gp); ffclose(hp); ffclose(tp); + + gmx_mtop_atomlookup_destroy(alook); + if (max_clust_ind >= 0) { fp = ffopen(mcn,"w"); fprintf(fp,"[ max_clust ]\n"); diff --git a/src/tools/gmx_disre.c b/src/tools/gmx_disre.c index 715d48c8f6..6a8935c667 100644 --- a/src/tools/gmx_disre.c +++ b/src/tools/gmx_disre.c @@ -711,8 +711,8 @@ int gmx_disre(int argc,char *argv[]) update_mdatoms(mdatoms,ir.fepvals->init_lambda); fr = mk_forcerec(); fprintf(fplog,"Made forcerec\n"); - init_forcerec(fplog,oenv,fr,NULL,&ir,&mtop,cr,box,FALSE,NULL,NULL,NULL,NULL, - FALSE,-1); + init_forcerec(fplog,oenv,fr,NULL,&ir,&mtop,cr,box,FALSE, + NULL,NULL,NULL,NULL,NULL,FALSE,-1); init_nrnb(&nrnb); if (ir.ePBC != epbcNONE) gpbc = gmx_rmpbc_init(&top->idef,ir.ePBC,natoms,box); diff --git a/src/tools/gmx_pme_error.c b/src/tools/gmx_pme_error.c index 66fb961f03..1e7c59082f 100644 ---
a/src/tools/gmx_pme_error.c +++ b/src/tools/gmx_pme_error.c @@ -788,8 +798,9 @@ static void create_info(t_inputinfo *info) */ static int prepare_x_q(real *q[], rvec *x[], gmx_mtop_t *mtop, rvec x_orig[], t_commrec *cr) { - int i,anr_global; + int i; int nq; /* number of charged particles */ + gmx_mtop_atomloop_all_t aloop; t_atom *atom; @@ -798,10 +799,11 @@ static int prepare_x_q(real *q[], rvec *x[], gmx_mtop_t *mtop, rvec x_orig[], t_ snew(*q, mtop->natoms); snew(*x, mtop->natoms); nq=0; - for (i=0; i<mtop->natoms; i++) + + aloop = gmx_mtop_atomloop_all_init(mtop); + + while (gmx_mtop_atomloop_all_next(aloop,&i,&atom)) { - anr_global = i; - gmx_mtop_atomnr_to_atom(mtop,anr_global,&atom); if (is_charge(atom->q)) { (*q)[nq] = atom->q; diff --git a/src/tools/gmx_trjconv.c b/src/tools/gmx_trjconv.c index b7c3233581..209a122ff7 100644 --- a/src/tools/gmx_trjconv.c +++ b/src/tools/gmx_trjconv.c @@ -278,7 +278,7 @@ static void put_molecule_com_in_box(int unitcell_enum,int ecenter, copy_rvec(com,new_com); switch (unitcell_enum) { case euRect: - put_atoms_in_box(box,1,&new_com); + put_atoms_in_box(ePBC,box,1,&new_com); break; case euTric: put_atoms_in_triclinic_unitcell(ecenter,box,1,&new_com); @@ -326,7 +326,7 @@ static void put_residue_com_in_box(int unitcell_enum,int ecenter, copy_rvec(com,new_com); switch (unitcell_enum) { case euRect: - put_atoms_in_box(box,1,&new_com); + put_atoms_in_box(ePBC,box,1,&new_com); break; case euTric: put_atoms_in_triclinic_unitcell(ecenter,box,1,&new_com); @@ -1335,7 +1335,7 @@ int gmx_trjconv(int argc,char *argv[]) if (bPBCcomAtom) { switch (unitcell_enum) { case euRect: - put_atoms_in_box(fr.box,natoms,fr.x); + put_atoms_in_box(ePBC,fr.box,natoms,fr.x); break; case euTric: put_atoms_in_triclinic_unitcell(ecenter,fr.box,natoms,fr.x); diff --git a/src/tools/gmx_tune_pme.c b/src/tools/gmx_tune_pme.c index b09f2f39ef..7f96afbf7a 100644 --- a/src/tools/gmx_tune_pme.c +++ b/src/tools/gmx_tune_pme.c @@ -720,6 +720,7 @@ static void make_benchmark_tprs( gmx_large_int_t statesteps, /* Step counter in checkpoint file */ real rmin, /* Minimal Coulomb radius */ real rmax, /* Maximal Coulomb radius */ + gmx_bool bScaleRvdw, /* Scale rvdw along with rcoulomb */ int *ntprs, /* No. of TPRs to write, each with a different rcoulomb and fourierspacing */ t_inputinfo *info, /* Contains information about mdp file options */ @@ -759,7 +760,8 @@ static void make_benchmark_tprs( EELTYPE(eelPME)); /* Check if rcoulomb == rlist, which is necessary for plain PME.
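Note: the gmx_pme_error.c hunk above swaps a per-index global lookup (gmx_mtop_atomnr_to_atom for every i) for the atom-loop iterator, which walks the topology once. The generic shape of such an iterator, reduced to a sketch (struct and names hypothetical):

    typedef struct
    {
        int           i;       /* next atom index        */
        int           natoms;  /* total number of atoms  */
        const double *q;       /* per-atom charges       */
    } atomloop_t;

    /* Returns 1 and fills index/charge while atoms remain, else 0.
     * Usage mirrors the while (gmx_mtop_atomloop_all_next(...)) above. */
    static int atomloop_next(atomloop_t *it, int *index, double *charge)
    {
        if (it->i >= it->natoms)
        {
            return 0;
        }
        *index  = it->i;
        *charge = it->q[it->i];
        it->i++;
        return 1;
    }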
*/ - if ( (eelPME == ir->coulombtype) && !(ir->rcoulomb == ir->rlist) ) + if ( (ir->cutoff_scheme != ecutsVERLET) && + (eelPME == ir->coulombtype) && !(ir->rcoulomb == ir->rlist)) { gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to rlist (%f).", EELTYPE(eelPME), ir->rcoulomb, ir->rlist); @@ -771,6 +773,12 @@ static void make_benchmark_tprs( EELTYPE(ir->coulombtype), ir->rcoulomb, ir->rlist); } + if (bScaleRvdw && ir->rvdw != ir->rcoulomb) + { + fprintf(stdout,"NOTE: input rvdw != rcoulomb, will not scale rvdw\n"); + bScaleRvdw = FALSE; + } + /* Reduce the number of steps for the benchmarks */ info->orig_sim_steps = ir->nsteps; ir->nsteps = benchsteps; @@ -787,13 +795,32 @@ static void make_benchmark_tprs( box_size[d] = sqrt(box_size[d]); } - /* Reconstruct fourierspacing per dimension from the number of grid points and box size */ - info->fsx[0] = box_size[XX]/ir->nkx; - info->fsy[0] = box_size[YY]/ir->nky; - info->fsz[0] = box_size[ZZ]/ir->nkz; + if (ir->fourier_spacing > 0) + { + info->fsx[0] = ir->fourier_spacing; + info->fsy[0] = ir->fourier_spacing; + info->fsz[0] = ir->fourier_spacing; + } + else + { + /* Reconstruct fourierspacing per dimension from the number of grid points and box size */ + info->fsx[0] = box_size[XX]/ir->nkx; + info->fsy[0] = box_size[YY]/ir->nky; + info->fsz[0] = box_size[ZZ]/ir->nkz; + } - /* Reconstruct the fourierspacing from the number of PME grid points found in the tpr */ - fourierspacing = max(box_size[ZZ]/ir->nkz, max(box_size[XX]/ir->nkx, box_size[YY]/ir->nky)); + /* If no value for the fourierspacing was provided on the command line, we + * use the reconstruction from the tpr file */ + if (ir->fourier_spacing > 0) + { + /* Use the spacing from the tpr */ + fourierspacing = ir->fourier_spacing; + } + else + { + /* Use the maximum observed spacing */ + fourierspacing = max(max(info->fsx[0],info->fsy[0]),info->fsz[0]); + } fprintf(stdout, "Calculating PME grid points on the basis of a fourierspacing of %f nm\n", fourierspacing); @@ -868,7 +895,7 @@ static void make_benchmark_tprs( ir->rlist = ir->rcoulomb + nlist_buffer; } - if (evdwCUT == ir->vdwtype) + if (bScaleRvdw && evdwCUT == ir->vdwtype) { /* For vdw cutoff, rvdw >= rlist */ ir->rvdw = max(info->rvdw[0], ir->rlist); @@ -1841,6 +1868,7 @@ int gmx_tune_pme(int argc,char *argv[]) int ntprs=0; real rmin=0.0,rmax=0.0; /* min and max value for rcoulomb if scaling is requested */ real rcoulomb=-1.0; /* Coulomb radius as set in .tpr file */ + gmx_bool bScaleRvdw=TRUE; gmx_large_int_t bench_nsteps=BENCHSTEPS; gmx_large_int_t new_sim_nsteps=-1; /* -1 indicates: not set by the user */ gmx_large_int_t cpt_steps=0; /* Step counter in .cpt input file */ @@ -1973,6 +2001,8 @@ int gmx_tune_pme(int argc,char *argv[]) "If >0, maximal rcoulomb for -ntpr>1 (rcoulomb upscaling results in fourier grid downscaling)" }, { "-rmin", FALSE, etREAL, {&rmin}, "If >0, minimal rcoulomb for -ntpr>1" }, + { "-scalevdw", FALSE, etBOOL, {&bScaleRvdw}, + "Scale rvdw along with rcoulomb"}, { "-ntpr", FALSE, etINT, {&ntprs}, "Number of [TT].tpr[tt] files to benchmark. Create this many files with different rcoulomb scaling factors depending on -rmin and -rmax. " "If < 1, automatically choose the number of [TT].tpr[tt] files to test" }, @@ -2194,7 +2224,7 @@ int gmx_tune_pme(int argc,char *argv[]) /* It can be that ntprs is reduced by make_benchmark_tprs if not enough * different grids could be found. 
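Note: when the .tpr carries no explicit fourier_spacing, the code above reconstructs it per dimension as box length over grid points and takes the maximum as the effective spacing. A sketch with assumed numbers:

    /* e.g. a 3.0 x 3.0 x 6.0 nm box with a 25 x 25 x 48 PME grid gives
     * spacings 0.120, 0.120 and 0.125 nm -> effective spacing 0.125 nm */
    static double effective_spacing(const double box_size[3], const int nk[3])
    {
        double fs_max = 0;
        for (int d = 0; d < 3; d++)
        {
            double fs = box_size[d]/nk[d];
            if (fs > fs_max)
            {
                fs_max = fs;
            }
        }
        return fs_max;
    }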
*/ make_benchmark_tprs(opt2fn("-s",NFILE,fnm), tpr_names, bench_nsteps+presteps, - cpt_steps, rmin, rmax, &ntprs, info, fp); + cpt_steps, rmin, rmax, bScaleRvdw, &ntprs, info, fp); /********************************************************************************/ /* Main loop over all scenarios we need to test: tpr files, PME nodes, repeats */ -- 2.11.4.GIT