From 1d7858f64201aefb6dcfac21a292758118f14eb3 Mon Sep 17 00:00:00 2001
From: Erik Lindahl <lindahl@cbr.su.se>
Date: Wed, 11 Aug 2010 01:07:19 +0200
Subject: [PATCH] Changes to make CMake work under Windows again

---
 CMakeLists.txt                                     |   53 +-
 cmake/FindFFTW3.cmake                              |    4 +-
 cmake/FindFFTW3F.cmake                             |   44 +-
 include/types/simple.h                             |   39 +-
 src/config.h.cmakein                               |    3 -
 src/gmxlib/CMakeLists.txt                          |   58 +-
 .../nonbonded/nb_kernel_ia32_sse/Makefile.am       |   52 +-
 .../nb_kernel400_ia32_sse.intel_syntax.s           | 1728 --------------
 .../nb_kernel_ia32_sse/nb_kernel400_ia32_sse.s     | 1701 --------------
 .../nb_kernel410_ia32_sse.intel_syntax.s           | 2049 -----------------
 .../nb_kernel_ia32_sse/nb_kernel410_ia32_sse.s     | 2022 ----------------
 .../nb_kernel430_ia32_sse.intel_syntax.s           | 2409 --------------------
 .../nb_kernel_ia32_sse/nb_kernel430_ia32_sse.s     | 2382 -------------------
 .../nonbonded/nb_kernel_ia32_sse2/Makefile.am      |   52 +-
 .../nb_kernel400_ia32_sse2.intel_syntax.s          | 1287 -----------
 .../nb_kernel_ia32_sse2/nb_kernel400_ia32_sse2.s   | 1261 ----------
 .../nb_kernel410_ia32_sse2.intel_syntax.s          | 1530 -------------
 .../nb_kernel_ia32_sse2/nb_kernel410_ia32_sse2.s   | 1503 ------------
 .../nb_kernel430_ia32_sse2.intel_syntax.s          | 1714 --------------
 .../nb_kernel_ia32_sse2/nb_kernel430_ia32_sse2.s   | 1688 --------------
 .../nonbonded/nb_kernel_x86_64_sse/Makefile.am     |   52 +-
 .../nb_kernel400_x86_64_sse.intel_syntax.s         | 1662 --------------
 .../nb_kernel_x86_64_sse/nb_kernel400_x86_64_sse.s | 1638 -------------
 .../nb_kernel410_x86_64_sse.intel_syntax.s         | 2009 ----------------
 .../nb_kernel_x86_64_sse/nb_kernel410_x86_64_sse.s | 1985 ----------------
 .../nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.c |  111 +-
 .../nb_kernel430_x86_64_sse.intel_syntax.s         | 2330 -------------------
 .../nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.s | 2306 -------------------
 .../nonbonded/nb_kernel_x86_64_sse2/Makefile.am    |   52 +-
 .../nb_kernel400_x86_64_sse2.intel_syntax.s        | 1236 ----------
 .../nb_kernel400_x86_64_sse2.s                     | 1212 ----------
 .../nb_kernel410_x86_64_sse2.intel_syntax.s        | 1488 ------------
 .../nb_kernel410_x86_64_sse2.s                     | 1464 ------------
 .../nb_kernel430_x86_64_sse2.intel_syntax.s        | 1664 --------------
 .../nb_kernel430_x86_64_sse2.s                     | 1640 -------------
 src/tools/CMakeLists.txt                           |    2 +-
 36 files changed, 273 insertions(+), 42157 deletions(-)
 rewrite cmake/FindFFTW3F.cmake (65%)
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel400_ia32_sse.intel_syntax.s
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel400_ia32_sse.s
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel410_ia32_sse.intel_syntax.s
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel410_ia32_sse.s
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel430_ia32_sse.intel_syntax.s
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel430_ia32_sse.s
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel400_ia32_sse2.intel_syntax.s
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel400_ia32_sse2.s
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel410_ia32_sse2.intel_syntax.s
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel410_ia32_sse2.s
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel430_ia32_sse2.intel_syntax.s
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel430_ia32_sse2.s
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel400_x86_64_sse.intel_syntax.s
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel400_x86_64_sse.s
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel410_x86_64_sse.intel_syntax.s
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel410_x86_64_sse.s
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.intel_syntax.s
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.s
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel400_x86_64_sse2.intel_syntax.s
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel400_x86_64_sse2.s
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel410_x86_64_sse2.intel_syntax.s
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel410_x86_64_sse2.s
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel430_x86_64_sse2.intel_syntax.s
 delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel430_x86_64_sse2.s

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2fe8b1a7e3..c9b2af63b3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,7 +4,7 @@ project(Gromacs)
 include(Dart)
 # PROJECT_VERSION should have the following structure: 
 # VERSION[-dev-SUFFIX] where the VERSION can have any form and the suffix
-set(PROJECT_VERSION "4.0.99-dev-20100315"
+set(PROJECT_VERSION "4.5-beta3"
     CACHE STRING "Gromacs version string")
 
 # Cmake modules/macros are in a subdirectory to keep this file cleaner
@@ -16,7 +16,6 @@ endif(NOT CMAKE_BUILD_TYPE)
 
 enable_language(C)
 
-
 ########################################################################
 # Fix stupid flags on MSVC
 ########################################################################
@@ -37,11 +36,6 @@ set(GMX_EXTRA_LIBRARIES)
 include(CheckCCompilerFlag)
 include(CheckCXXCompilerFlag)
 
-# --- REMOVED as bugzilla 431 turned out to be a bug in GB
-# and there is no confirmed case in which gcc causes a crash 
-# check for buggy GCC 4.1.x 
-# include(gmxCheckGCCVersion)
-
 include(gmxCFlags)
 gmx_c_flags()
 
@@ -72,6 +66,7 @@ mark_as_advanced(GMX_MPI_IN_PLACE)
 
 option(GMX_IA32_ASM "Add SSE assembly files for IA32" OFF)
 option(GMX_X86_64_ASM "Add SSE assembly files for X86_64" OFF)
+
 option(USE_VERSION_H "Generate development version string/information" ON)
 # --- REMOVED as bugzilla 431 turned out to be a bug in GB
 # and there is no confirmed case in which gcc causes a crash 
@@ -199,6 +194,8 @@ check_library_exists(m sqrt "" HAVE_LIBM)
 check_library_exists(m cbrt "" HAVE_CBRT)
 
 include(CheckTypeSize)
+set(CMAKE_REQUIRED_FLAGS ${CMAKE_C_FLAGS_RELEASE})
+
 check_type_size("bool"          SIZEOF_BOOL) # will also set HAVE_BOOL
 check_type_size("int"           SIZEOF_INT) 
 check_type_size("long int"      SIZEOF_LONG_INT) 
@@ -260,14 +257,6 @@ if(LIBXML2_FOUND)
     set(HAVE_LIBXML2 1)
 endif(LIBXML2_FOUND)
 
-find_package(gsl)
-set(PKG_GSL "")
-if(GSL_FOUND)
-    include_directories(${GSL_INCLUDE_DIR})
-    set(PKG_GSL gsl)
-    set(HAVE_LIBGSL 1)
-endif(GSL_FOUND)
-
 find_package(X11)
 # X11 includes/libraries are only set in the ngmx subdirectory!
 if(X11_FOUND)
@@ -370,6 +359,8 @@ if (${GMX_ACCELERATION} STREQUAL "auto")
             set(GMX_IA32_ASM ON CACHE BOOL 
                 "Add SSE assembly files for IA32" FORCE)
         endif (GMX_64_BIT)
+    else(GMX_X86_GCC_INLINE_ASM)
+      set(GMX_ACCELERATION "none" CACHE STRING "Accelerated kernels. Pick one of: none, SSE, BlueGene, Power6, ia64, altivec" FORCE)
     endif (GMX_X86_GCC_INLINE_ASM)
 endif (${GMX_ACCELERATION} STREQUAL "auto")
 
@@ -409,6 +400,11 @@ string(TOUPPER ${GMX_ACCELERATION} ${GMX_ACCELERATION})
 if(${GMX_ACCELERATION} STREQUAL "NONE")
   # nothing to do
 elseif(${GMX_ACCELERATION} STREQUAL "SSE")
+    if(CMAKE_GENERATOR MATCHES "Visual Studio")
+      option(GMX_ASM_USEASM-NASM "Use the Nasm assembler (windows)" ON)
+    else(CMAKE_GENERATOR MATCHES "Visual Studio")
+      option(GMX_ASM_USEASM-NASM "Use the Nasm assembler (windows)" OFF)
+    endif(CMAKE_GENERATOR MATCHES "Visual Studio")
     if (NOT GMX_64_BIT)
         # for 32-bit compiles, we might need to turn on sse 
         CHECK_C_COMPILER_FLAG("-msse2" XFLAGS_SSE)
@@ -422,17 +418,23 @@ elseif(${GMX_ACCELERATION} STREQUAL "SSE")
     endif (NOT GMX_64_BIT)
     if(HAVE_XMMINTRIN_H)
         if(GMX_IA32_ASM)
-          option(GMX_ASM_USEASM-ATT "Use ATT-style assembly" ON)
-          set(GMX_IA32_SSE 1)
-        else(GMX_IA32_ASM) #only use intrinsic if not using the ASM loops
+          if(GMX_DOUBLE)
+            set(GMX_IA32_SSE2 1)
+          else(GMX_DOUBLE)
+            set(GMX_IA32_SSE 1)
+          endif(GMX_DOUBLE)
+        else(GMX_IA32_ASM)
           set(GMX_SSE 1)
         endif(GMX_IA32_ASM)
     endif(HAVE_XMMINTRIN_H)
     if(HAVE_EMMINTRIN_H)
         if(GMX_X86_64_ASM)
-          option(GMX_ASM_USEASM-ATT "Use ATT-style assembly" ON)
-          set(GMX_X86_64_SSE 1)
-        else(GMX_X86_64_ASM)  #only use intrinsic if not using the ASM loops
+      	  if(GMX_DOUBLE)
+      	    set(GMX_X86_64_SSE2 1)
+      	  else(GMX_DOUBLE)
+            set(GMX_X86_64_SSE 1)
+      	  endif(GMX_DOUBLE)
+        else(GMX_X86_64_ASM)
           if(NOT GMX_IA32_ASM)
             set(GMX_SSE2 1)
           endif(NOT GMX_IA32_ASM)
@@ -494,17 +496,20 @@ if(${GMX_FFT_LIBRARY} STREQUAL "FFTW3")
 #    MESSAGE(STATUS "Using external FFT library - fftw3")
     if(GMX_DOUBLE)
         find_package(FFTW3 REQUIRED)
+        # Take include dir and library from the module that was actually searched (FFTW3 vs FFTW3F)
+        include_directories(${FFTW3_INCLUDE_DIR})
+        set(FFT_LIBRARIES ${FFTW3_LIBRARIES})
         set(PKG_FFT "fftw3")
     else(GMX_DOUBLE)
         find_package(FFTW3F REQUIRED)
+        include_directories(${FFTW3F_INCLUDE_DIR})
+        set(FFT_LIBRARIES ${FFTW3F_LIBRARIES})
         set(PKG_FFT "fftw3f")
     endif(GMX_DOUBLE)
 
-    if(NOT FFTW3_FOUND)
+    if(NOT FFTW3_FOUND AND NOT FFTW3F_FOUND)
         MESSAGE(FATAL_ERROR "Cannot find fftw3 (with correct precision). Fix it, choose another FFT library, or use the Gromacs built-in fftpack (slower)!")
-    endif(NOT FFTW3_FOUND)
+    endif(NOT FFTW3_FOUND AND NOT FFTW3F_FOUND)
 
     set(GMX_FFT_FFTW3 1)
-    include_directories(${FFTW3_INCLUDE_DIR})
-    set(FFT_LIBRARIES ${FFTW3_LIBRARIES})
 
diff --git a/cmake/FindFFTW3.cmake b/cmake/FindFFTW3.cmake
index 8bf3a26434..636a96ec50 100644
--- a/cmake/FindFFTW3.cmake
+++ b/cmake/FindFFTW3.cmake
@@ -5,10 +5,10 @@
 #  FFTW3_LIBRARIES   - List of libraries when using FFTW.
 #  FFTW3_FOUND       - True if FFTW found.
 
-if (FFTW3_INCLUDE_DIR)
+if (FFTW3_INCLUDE_DIR AND FFTW3_LIBRARIES)
   # Already in cache, be silent
   set (FFTW3_FIND_QUIETLY TRUE)
-endif (FFTW3_INCLUDE_DIR)
+endif (FFTW3_INCLUDE_DIR AND FFTW3_LIBRARIES)
 
 find_path (FFTW3_INCLUDE_DIR fftw3.h)
 
diff --git a/cmake/FindFFTW3F.cmake b/cmake/FindFFTW3F.cmake
dissimilarity index 65%
index f0227cbd7b..5ca2fc3e30 100644
--- a/cmake/FindFFTW3F.cmake
+++ b/cmake/FindFFTW3F.cmake
@@ -1,22 +1,22 @@
-# - Find FFTW3
-# Find the native FFTW3 includes and library, single precision
-#
-#  FFTW3_INCLUDE_DIR    - where to find fftw3.h
-#  FFTW3_LIBRARIES   - List of libraries when using FFTW.
-#  FFTW3_FOUND       - True if FFTW found.
-
-if (FFTW3_INCLUDE_DIR)
-  # Already in cache, be silent
-  set (FFTW3_FIND_QUIETLY TRUE)
-endif (FFTW3_INCLUDE_DIR)
-
-find_path (FFTW3_INCLUDE_DIR fftw3.h)
-
-find_library (FFTW3_LIBRARIES NAMES fftw3f)
-
-# handle the QUIETLY and REQUIRED arguments and set FFTW_FOUND to TRUE if
-# all listed variables are TRUE
-include (FindPackageHandleStandardArgs)
-find_package_handle_standard_args (FFTW3 DEFAULT_MSG FFTW3_LIBRARIES FFTW3_INCLUDE_DIR)
-
-mark_as_advanced (FFTW3_LIBRARIES FFTW3_INCLUDE_DIR)
+# - Find FFTW3F
+# Find the native FFTW3 includes and library, single precision (fftw3f)
+#
+#  FFTW3F_INCLUDE_DIR    - where to find fftw3.h
+#  FFTW3F_LIBRARIES   - List of libraries when using single-precision FFTW.
+#  FFTW3F_FOUND       - True if single-precision FFTW found.
+
+if (FFTW3F_INCLUDE_DIR AND FFTW3F_LIBRARIES)
+  # Both results already in cache, be silent
+  set (FFTW3F_FIND_QUIETLY TRUE)
+endif (FFTW3F_INCLUDE_DIR AND FFTW3F_LIBRARIES)
+
+find_path (FFTW3F_INCLUDE_DIR fftw3.h)
+
+find_library (FFTW3F_LIBRARIES NAMES fftw3f)
+
+# handle the QUIETLY and REQUIRED arguments and set FFTW3F_FOUND to TRUE if
+# all listed variables are TRUE
+include (FindPackageHandleStandardArgs)
+find_package_handle_standard_args (FFTW3F DEFAULT_MSG FFTW3F_LIBRARIES FFTW3F_INCLUDE_DIR)
+
+mark_as_advanced (FFTW3F_LIBRARIES FFTW3F_INCLUDE_DIR)
diff --git a/include/types/simple.h b/include/types/simple.h
index 789ab1f134..7461b13d51 100644
--- a/include/types/simple.h
+++ b/include/types/simple.h
@@ -134,26 +134,43 @@ typedef int             imatrix[DIM][DIM];
 
 /* For the step count type gmx_large_int_t we aim for 8 bytes (64bit),
  * but we might only be able to get 4 bytes (32bit).
+ *
+ * Avoid using "long int" if we can. This type is really dangerous,
+ * since the width frequently depends on compiler options, and they
+ * might not be set correctly when (buggy) Cmake is detecting things.
+ * Instead, start by looking for "long long", and just go down if we
+ * have to (rarely on new systems). /EL 20100810
  */
-#if (!(defined SIZEOF_LONG_LONG_INT) || SIZEOF_LONG_INT == 8)
-typedef long int gmx_large_int_t;
-#define gmx_large_int_fmt   "ld"
-#define gmx_large_int_pfmt "%ld"
-#define SIZEOF_LARGE_INT SIZEOF_LONG_INT
-#define LARGE_INT_MAX LONG_MAX
-#else
+#if (defined SIZEOF_LONG_LONG_INT && SIZEOF_LONG_LONG_INT==8)
+
 typedef long long int gmx_large_int_t;
 #define gmx_large_int_fmt   "lld"
 #define gmx_large_int_pfmt "%lld"
 #define SIZEOF_LARGE_INT SIZEOF_LONG_LONG_INT
 /* LLONG_MAX is not defined by the C-standard, so check for it */
-#if (!(defined LLONG_MAX) && SIZEOF_LONG_LONG_INT == 8)
-#define LARGE_INT_MAX 9223372036854775807LL
-#else
+#ifdef LLONG_MAX 
 #define LARGE_INT_MAX LLONG_MAX
-#endif
+#else
+#define LARGE_INT_MAX 9223372036854775807LL
 #endif
 
+#elif (defined SIZEOF_LONG_INT && SIZEOF_LONG_INT==8)
+
+typedef long int gmx_large_int_t;
+#define gmx_large_int_fmt   "ld"
+#define gmx_large_int_pfmt "%ld"
+#define SIZEOF_LARGE_INT SIZEOF_LONG_INT
+#define LARGE_INT_MAX LONG_MAX
+
+#else
+
+typedef int gmx_large_int_t;
+#define gmx_large_int_fmt   "d"
+#define gmx_large_int_pfmt "%d"
+#define SIZEOF_LARGE_INT SIZEOF_INT
+#define LARGE_INT_MAX INT_MAX
+
+#endif
 
 #ifdef __cplusplus
 }
diff --git a/src/config.h.cmakein b/src/config.h.cmakein
index 2380d39a57..5e5b761f1a 100644
--- a/src/config.h.cmakein
+++ b/src/config.h.cmakein
@@ -215,9 +215,6 @@
 /* Define to 1 if you have the xml2 library (-lxml2). */
 #cmakedefine HAVE_LIBXML2
 
-/* Define to 1 if you have the gsl library (-lgsl). */
-#cmakedefine HAVE_LIBGSL
-
 /* Define to 1 if you have the dl library (-ldl). */
 #cmakedefine HAVE_LIBDL
 
diff --git a/src/gmxlib/CMakeLists.txt b/src/gmxlib/CMakeLists.txt
index db55cb62a1..a6f8dd6050 100644
--- a/src/gmxlib/CMakeLists.txt
+++ b/src/gmxlib/CMakeLists.txt
@@ -24,6 +24,10 @@ file(GLOB GMXLIB_SOURCES *.c
      selection/*.c trajana/*.c
      statistics/*.c nonbonded/*.c nonbonded/nb_kernel_c/*.c)
 
+# version.c is generated into the build tree. file(GLOB) returns absolute
+# paths, so drop any stale in-source copy by its full path (no-op if absent).
+list(REMOVE_ITEM GMXLIB_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/version.c")
+
 # add version.c to the list of sources and tell cmake that it is generated
 if(USE_VERSION_H)
 LIST(APPEND GMXLIB_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/version.c) # auto-generated
@@ -31,43 +35,53 @@ set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/version.c
                 PROPERTIES GENERATED true)
 endif()
 
-if(GMX_IA32_SSE)
-  if(GMX_ASM_USEASM-ATT)
+if(GMX_IA32_SSE OR GMX_IA32_SSE2)
+  if(GMX_ASM_USEASM-NASM)
+    enable_language(ASM-NASM)
+    if(GMX_DOUBLE)
+      file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_ia32_sse2/*.c nonbonded/nb_kernel_ia32_sse2/*intel_syntax.s)
+    else(GMX_DOUBLE)
+      file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_ia32_sse/*.c nonbonded/nb_kernel_ia32_sse/*intel_syntax.s)
+    endif(GMX_DOUBLE)
+  else(GMX_ASM_USEASM-NASM)
     if(GMX_ASM_USECCOMPILER)
       SET(CMAKE_ASM-ATT_COMPILER ${CMAKE_C_COMPILER})
     endif(GMX_ASM_USECCOMPILER)
     enable_language(ASM-ATT)
-    file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_ia32_sse/*.c nonbonded/nb_kernel_ia32_sse/*.s)
-    file(GLOB SKIP_INTELSYNTAX nonbonded/nb_kernel_ia32_sse/*intel_syntax*)
-    list(REMOVE_ITEM GMX_MORESSE_SOURCES ${SKIP_INTELSYNTAX})
+    if(GMX_DOUBLE)
+      file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_ia32_sse2/*.c nonbonded/nb_kernel_ia32_sse2/*sse2.s nonbonded/nb_kernel_ia32_sse2/*asm.s)
+    else(GMX_DOUBLE)
+      file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_ia32_sse/*.c nonbonded/nb_kernel_ia32_sse/*sse.s nonbonded/nb_kernel_ia32_sse/*asm.s)
+    endif(GMX_DOUBLE)
     if(GMX_ASM_USECCOMPILER)
       set_source_files_properties(${GMX_MORESSE_SOURCES} PROPERTIES COMPILE_FLAGS "-c -m32")
     endif(GMX_ASM_USECCOMPILER)
-  else(GMX_ASM_USEASM-ATT)
-    enable_language(ASM)
-    file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_ia32_sse/*.c nonbonded/nb_kernel_ia32_sse/*.intel_syntax.s)
-  endif(GMX_ASM_USEASM-ATT)
-endif(GMX_IA32_SSE)
+  endif(GMX_ASM_USEASM-NASM)
+endif(GMX_IA32_SSE OR GMX_IA32_SSE2)
 
-if(GMX_X86_64_SSE)
-  if(GMX_ASM_USEASM-ATT)
+if(GMX_X86_64_SSE OR GMX_X86_64_SSE2)
+  if(GMX_ASM_USEASM-NASM)
+    enable_language(ASM-NASM)
+    if(GMX_DOUBLE)
+      file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_x86_64_sse2/*.c nonbonded/nb_kernel_x86_64_sse2/*intel_syntax.s)
+    else(GMX_DOUBLE)
+      file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_x86_64_sse/*.c nonbonded/nb_kernel_x86_64_sse/*intel_syntax.s)
+    endif(GMX_DOUBLE)
+  else(GMX_ASM_USEASM-NASM)
     if(GMX_ASM_USECCOMPILER)
       SET(CMAKE_ASM-ATT_COMPILER ${CMAKE_C_COMPILER})
     endif(GMX_ASM_USECCOMPILER)
     enable_language(ASM-ATT)
-    file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_x86_64_sse/*.c nonbonded/nb_kernel_x86_64_sse/*.s)
-    file(GLOB SKIP_INTELSYNTAX nonbonded/nb_kernel_x86_64_sse/*intel_syntax*)
-    file(GLOB SKIP_400ASM nonbonded/nb_kernel_x86_64_sse/nb_kernel4*.s)
-    list(REMOVE_ITEM GMX_MORESSE_SOURCES ${SKIP_INTELSYNTAX})
-    list(REMOVE_ITEM GMX_MORESSE_SOURCES ${SKIP_400ASM})  #use new C-intrinsics instread
+    if(GMX_DOUBLE)
+      file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_x86_64_sse2/*.c nonbonded/nb_kernel_x86_64_sse2/*sse2.s nonbonded/nb_kernel_x86_64_sse2/*asm.s)
+    else(GMX_DOUBLE)
+      file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_x86_64_sse/*.c nonbonded/nb_kernel_x86_64_sse/*sse.s nonbonded/nb_kernel_x86_64_sse/*asm.s)
+    endif(GMX_DOUBLE)
     if(GMX_ASM_USECCOMPILER)
       set_source_files_properties(${GMX_MORESSE_SOURCES} PROPERTIES COMPILE_FLAGS "-c")
     endif(GMX_ASM_USECCOMPILER)
-  else(GMX_ASM_USEASM-ATT)
-    enable_language(ASM)
-    file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_x86_64_sse/*.c nonbonded/nb_kernel_x86_64_sse/*.intel_syntax.s)
-  endif(GMX_ASM_USEASM-ATT)
-endif(GMX_X86_64_SSE)
+  endif(GMX_ASM_USEASM-NASM)
+endif(GMX_X86_64_SSE OR GMX_X86_64_SSE2)
 
 if(GMX_SSE2)
   if(GMX_DOUBLE)
diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/Makefile.am b/src/gmxlib/nonbonded/nb_kernel_ia32_sse/Makefile.am
index b7be181468..9e40177198 100644
--- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/Makefile.am
+++ b/src/gmxlib/nonbonded/nb_kernel_ia32_sse/Makefile.am
@@ -63,29 +63,29 @@ libnb_kernel_ia32_sse_la_SOURCES = \
 
 
 EXTRA_DIST = \
-	nb_kernel010_ia32_sse.intel_syntax.s	nb_kernel030_ia32_sse.intel_syntax.s	\
-	nb_kernel100_ia32_sse.intel_syntax.s	nb_kernel101_ia32_sse.intel_syntax.s	\
-	nb_kernel102_ia32_sse.intel_syntax.s	nb_kernel103_ia32_sse.intel_syntax.s	\
-	nb_kernel104_ia32_sse.intel_syntax.s	nb_kernel110_ia32_sse.intel_syntax.s	\
-	nb_kernel111_ia32_sse.intel_syntax.s	nb_kernel112_ia32_sse.intel_syntax.s	\
-	nb_kernel113_ia32_sse.intel_syntax.s	nb_kernel114_ia32_sse.intel_syntax.s	\
-	nb_kernel130_ia32_sse.intel_syntax.s	nb_kernel131_ia32_sse.intel_syntax.s	\
-	nb_kernel132_ia32_sse.intel_syntax.s	nb_kernel133_ia32_sse.intel_syntax.s	\
-	nb_kernel134_ia32_sse.intel_syntax.s	nb_kernel200_ia32_sse.intel_syntax.s	\
-	nb_kernel201_ia32_sse.intel_syntax.s	nb_kernel202_ia32_sse.intel_syntax.s	\
-	nb_kernel203_ia32_sse.intel_syntax.s	nb_kernel204_ia32_sse.intel_syntax.s	\
-	nb_kernel210_ia32_sse.intel_syntax.s	nb_kernel211_ia32_sse.intel_syntax.s	\
-	nb_kernel212_ia32_sse.intel_syntax.s	nb_kernel213_ia32_sse.intel_syntax.s	\
-	nb_kernel214_ia32_sse.intel_syntax.s	nb_kernel230_ia32_sse.intel_syntax.s	\
-	nb_kernel231_ia32_sse.intel_syntax.s	nb_kernel232_ia32_sse.intel_syntax.s	\
-	nb_kernel233_ia32_sse.intel_syntax.s	nb_kernel234_ia32_sse.intel_syntax.s	\
-	nb_kernel300_ia32_sse.intel_syntax.s	nb_kernel301_ia32_sse.intel_syntax.s	\
-	nb_kernel302_ia32_sse.intel_syntax.s	nb_kernel303_ia32_sse.intel_syntax.s	\
-	nb_kernel304_ia32_sse.intel_syntax.s	nb_kernel310_ia32_sse.intel_syntax.s	\
-	nb_kernel311_ia32_sse.intel_syntax.s	nb_kernel312_ia32_sse.intel_syntax.s	\
-	nb_kernel313_ia32_sse.intel_syntax.s	nb_kernel314_ia32_sse.intel_syntax.s	\
-	nb_kernel330_ia32_sse.intel_syntax.s	nb_kernel331_ia32_sse.intel_syntax.s	\
-	nb_kernel332_ia32_sse.intel_syntax.s	nb_kernel333_ia32_sse.intel_syntax.s	\
-	nb_kernel334_ia32_sse.intel_syntax.s	nb_kernel400_ia32_sse.intel_syntax.s	\
-	nb_kernel410_ia32_sse.intel_syntax.s	nb_kernel430_ia32_sse.intel_syntax.s	\
-	nb_kernel_ia32_sse_test_asm.intel_syntax.s	
+	nb_kernel010_ia32_sse_intel_syntax.s	nb_kernel030_ia32_sse_intel_syntax.s	\
+	nb_kernel100_ia32_sse_intel_syntax.s	nb_kernel101_ia32_sse_intel_syntax.s	\
+	nb_kernel102_ia32_sse_intel_syntax.s	nb_kernel103_ia32_sse_intel_syntax.s	\
+	nb_kernel104_ia32_sse_intel_syntax.s	nb_kernel110_ia32_sse_intel_syntax.s	\
+	nb_kernel111_ia32_sse_intel_syntax.s	nb_kernel112_ia32_sse_intel_syntax.s	\
+	nb_kernel113_ia32_sse_intel_syntax.s	nb_kernel114_ia32_sse_intel_syntax.s	\
+	nb_kernel130_ia32_sse_intel_syntax.s	nb_kernel131_ia32_sse_intel_syntax.s	\
+	nb_kernel132_ia32_sse_intel_syntax.s	nb_kernel133_ia32_sse_intel_syntax.s	\
+	nb_kernel134_ia32_sse_intel_syntax.s	nb_kernel200_ia32_sse_intel_syntax.s	\
+	nb_kernel201_ia32_sse_intel_syntax.s	nb_kernel202_ia32_sse_intel_syntax.s	\
+	nb_kernel203_ia32_sse_intel_syntax.s	nb_kernel204_ia32_sse_intel_syntax.s	\
+	nb_kernel210_ia32_sse_intel_syntax.s	nb_kernel211_ia32_sse_intel_syntax.s	\
+	nb_kernel212_ia32_sse_intel_syntax.s	nb_kernel213_ia32_sse_intel_syntax.s	\
+	nb_kernel214_ia32_sse_intel_syntax.s	nb_kernel230_ia32_sse_intel_syntax.s	\
+	nb_kernel231_ia32_sse_intel_syntax.s	nb_kernel232_ia32_sse_intel_syntax.s	\
+	nb_kernel233_ia32_sse_intel_syntax.s	nb_kernel234_ia32_sse_intel_syntax.s	\
+	nb_kernel300_ia32_sse_intel_syntax.s	nb_kernel301_ia32_sse_intel_syntax.s	\
+	nb_kernel302_ia32_sse_intel_syntax.s	nb_kernel303_ia32_sse_intel_syntax.s	\
+	nb_kernel304_ia32_sse_intel_syntax.s	nb_kernel310_ia32_sse_intel_syntax.s	\
+	nb_kernel311_ia32_sse_intel_syntax.s	nb_kernel312_ia32_sse_intel_syntax.s	\
+	nb_kernel313_ia32_sse_intel_syntax.s	nb_kernel314_ia32_sse_intel_syntax.s	\
+	nb_kernel330_ia32_sse_intel_syntax.s	nb_kernel331_ia32_sse_intel_syntax.s	\
+	nb_kernel332_ia32_sse_intel_syntax.s	nb_kernel333_ia32_sse_intel_syntax.s	\
+	nb_kernel334_ia32_sse_intel_syntax.s	nb_kernel400_ia32_sse_intel_syntax.s	\
+	nb_kernel410_ia32_sse_intel_syntax.s	nb_kernel430_ia32_sse_intel_syntax.s	\
+	nb_kernel_ia32_sse_test_asm_intel_syntax.s	
diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel400_ia32_sse.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel400_ia32_sse.intel_syntax.s
deleted file mode 100644
index 00149aa925..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel400_ia32_sse.intel_syntax.s
+++ /dev/null
@@ -1,1728 +0,0 @@
-;#
-;#
-;# Gromacs 4.0                         Copyright (c) 1991-2003 
-;# David van der Spoel, Erik Lindahl
-;#
-;# This program is free software; you can redistribute it and/or
-;# modify it under the terms of the GNU General Public License
-;# as published by the Free Software Foundation; either version 2
-;# of the License, or (at your option) any later version.
-;#
-;# To help us fund GROMACS development, we humbly ask that you cite
-;# the research papers on the package. Check out http://www.gromacs.org
-;# 
-;# And Hey:
-;# Gnomes, ROck Monsters And Chili Sauce
-;#
-
-;# These files require GNU binutils 2.10 or later, since we
-;# use intel syntax for portability, or a recent version 
-;# of NASM that understands Extended 3DNow and SSE2 instructions.
-;# (NASM is normally only used with MS Visual C++).
-;# Since NASM and gnu as disagree on some definitions and use 
-;# completely different preprocessing options I have to introduce a
-;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
-;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
-;# reason why all comments need both symbols...
-;# The source is written for GNU as, with intel syntax. When you use
-;# NASM we redefine a couple of things. The false if-statement around 
-;# the following code is seen by GNU as, but NASM doesn't see it, so 
-;# the code inside is read by NASM but not gcc.
-
-; .if 0    # block below only read by NASM
-%define .section	section
-%define .long		dd
-%define .align		align
-%define .globl		global
-;# NASM only wants 'dword', not 'dword ptr'.
-%define ptr
-%macro .equiv                  2
-   %1 equ %2
-%endmacro
-; .endif                   # End of NASM-specific block
-; .intel_syntax noprefix   # Line only read by gnu as
-
-
-	
-
-.globl nb_kernel400_ia32_sse
-.globl _nb_kernel400_ia32_sse
-nb_kernel400_ia32_sse:	
-_nb_kernel400_ia32_sse:	
-.equiv          nb400_p_nri,            8
-.equiv          nb400_iinr,             12
-.equiv          nb400_jindex,           16
-.equiv          nb400_jjnr,             20
-.equiv          nb400_shift,            24
-.equiv          nb400_shiftvec,         28
-.equiv          nb400_fshift,           32
-.equiv          nb400_gid,              36
-.equiv          nb400_pos,              40
-.equiv          nb400_faction,          44
-.equiv          nb400_charge,           48
-.equiv          nb400_p_facel,          52
-.equiv          nb400_argkrf,           56
-.equiv          nb400_argcrf,           60
-.equiv          nb400_Vc,               64
-.equiv          nb400_type,             68
-.equiv          nb400_p_ntype,          72
-.equiv          nb400_vdwparam,         76
-.equiv          nb400_Vvdw,             80
-.equiv          nb400_p_tabscale,       84
-.equiv          nb400_VFtab,            88
-.equiv          nb400_invsqrta,         92
-.equiv          nb400_dvda,             96
-.equiv          nb400_p_gbtabscale,     100
-.equiv          nb400_GBtab,            104
-.equiv          nb400_p_nthreads,       108
-.equiv          nb400_count,            112
-.equiv          nb400_mtx,              116
-.equiv          nb400_outeriter,        120
-.equiv          nb400_inneriter,        124
-.equiv          nb400_work,             128
-	;# stack offsets for local variables  
-	;# bottom of stack is cache-aligned for sse use 
-.equiv          nb400_ix,               0
-.equiv          nb400_iy,               16
-.equiv          nb400_iz,               32
-.equiv          nb400_iq,               48
-.equiv          nb400_dx,               64
-.equiv          nb400_dy,               80
-.equiv          nb400_dz,               96
-.equiv          nb400_two,              112
-.equiv          nb400_gbtsc,            128
-.equiv          nb400_qq,               144
-.equiv          nb400_r,                160
-.equiv          nb400_vctot,            176
-.equiv          nb400_fix,              192
-.equiv          nb400_fiy,              208
-.equiv          nb400_fiz,              224
-.equiv          nb400_half,             240
-.equiv          nb400_three,            256
-.equiv          nb400_isai,             272
-.equiv          nb400_isaprod,          288
-.equiv          nb400_dvdasum,          304
-.equiv          nb400_gbscale,          320
-.equiv          nb400_is3,              336
-.equiv          nb400_ii3,              340
-.equiv          nb400_ii,               344
-.equiv          nb400_innerjjnr,        348
-.equiv          nb400_innerk,           352
-.equiv          nb400_n,                356
-.equiv          nb400_nn1,              360
-.equiv          nb400_jnra,             364
-.equiv          nb400_jnrb,             368
-.equiv          nb400_jnrc,             372
-.equiv          nb400_jnrd,             376
-.equiv          nb400_nri,              380
-.equiv          nb400_facel,            384
-.equiv          nb400_nouter,           388
-.equiv          nb400_ninner,           392
-.equiv          nb400_salign,           396
-	push ebp
-	mov ebp,esp	
-    	push eax
-    	push ebx
-    	push ecx
-    	push edx
-	push esi
-	push edi
-	sub esp, 400		;# local stack space 
-	mov  eax, esp
-	and  eax, 0xf
-	sub esp, eax
-	mov [esp + nb400_salign], eax
-
-	emms
-
-	;# Move args passed by reference to stack
-	mov ecx, [ebp + nb400_p_nri]
-	mov esi, [ebp + nb400_p_facel]
-	mov ecx, [ecx]
-	mov esi, [esi]
-	mov [esp + nb400_nri], ecx
-	mov [esp + nb400_facel], esi
-
-	;# zero iteration counters
-	mov eax, 0
-	mov [esp + nb400_nouter], eax
-	mov [esp + nb400_ninner], eax
-
-
-	mov eax, [ebp + nb400_p_gbtabscale]
-	movss xmm3, [eax]
-	shufps xmm3, xmm3, 0
-	movaps [esp + nb400_gbtsc], xmm3
-
-	;# create constant floating-point factors on stack
-	mov eax, 0x3f000000     ;# constant 0.5 in IEEE (hex)
-	mov [esp + nb400_half], eax
-	movss xmm1, [esp + nb400_half]
-	shufps xmm1, xmm1, 0    ;# splat to all elements
-	movaps xmm2, xmm1       
-	addps  xmm2, xmm2	;# constant 1.0
-	movaps xmm3, xmm2
-	addps  xmm2, xmm2	;# constant 2.0
-	addps  xmm3, xmm2	;# constant 3.0
-	movaps [esp + nb400_half],  xmm1
-	movaps [esp + nb400_two],  xmm2
-	movaps [esp + nb400_three],  xmm3
-
-.nb400_threadloop:
-        mov   esi, [ebp + nb400_count]          ;# pointer to sync counter
-        mov   eax, [esi]
-.nb400_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+10
-        lock
-        cmpxchg [esi], ebx                      ;# write nn1 to *counter,
-                                                ;# if it hasnt changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb400_spinlock
-
-        ;# if(nn1>nri) nn1=nri
-        mov ecx, [esp + nb400_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [esp + nb400_n], eax
-        mov [esp + nb400_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists
-	mov esi, eax				;# copy n to esi
-        jg  .nb400_outerstart
-        jmp .nb400_end
-
-.nb400_outerstart:
-	;# ebx contains number of outer iterations
-	add ebx, [esp + nb400_nouter]
-	mov [esp + nb400_nouter], ebx
-
-.nb400_outer:
-	mov   eax, [ebp + nb400_shift]      ;# eax = pointer into shift[] 
-	mov   ebx, [eax + esi*4]		;# ebx=shift[n] 
-	
-	lea   ebx, [ebx + ebx*2]    ;# ebx=3*is 
-	mov   [esp + nb400_is3],ebx    	;# store is3 
-
-	mov   eax, [ebp + nb400_shiftvec]   ;# eax = base of shiftvec[] 
-
-	movss xmm0, [eax + ebx*4]
-	movss xmm1, [eax + ebx*4 + 4]
-	movss xmm2, [eax + ebx*4 + 8] 
-
-	mov   ecx, [ebp + nb400_iinr]       ;# ecx = pointer into iinr[] 	
-	mov   ebx, [ecx + esi*4]	    ;# ebx =ii 
-	mov   [esp + nb400_ii], ebx
-	
-	mov   edx, [ebp + nb400_charge]
-	movss xmm3, [edx + ebx*4]	
-	mulss xmm3, [esp + nb400_facel]
-	shufps xmm3, xmm3, 0
-
-
-	mov   edx, [ebp + nb400_invsqrta]	;# load invsqrta[ii]
-	movss xmm4, [edx + ebx*4]
-	shufps xmm4, xmm4, 0
-
-	lea   ebx, [ebx + ebx*2]	;# ebx = 3*ii=ii3 
-	mov   eax, [ebp + nb400_pos]    ;# eax = base of pos[]  
-
-	addss xmm0, [eax + ebx*4]
-	addss xmm1, [eax + ebx*4 + 4]
-	addss xmm2, [eax + ebx*4 + 8]
-
-	movaps [esp + nb400_iq], xmm3
-	movaps [esp + nb400_isai], xmm4
-	
-	shufps xmm0, xmm0, 0
-	shufps xmm1, xmm1, 0
-	shufps xmm2, xmm2, 0
-
-	movaps [esp + nb400_ix], xmm0
-	movaps [esp + nb400_iy], xmm1
-	movaps [esp + nb400_iz], xmm2
-
-	mov   [esp + nb400_ii3], ebx
-	
-	;# clear vctot and i forces 
-	xorps xmm4, xmm4
-	movaps [esp + nb400_vctot], xmm4
-	movaps [esp + nb400_dvdasum], xmm4
-	movaps [esp + nb400_fix], xmm4
-	movaps [esp + nb400_fiy], xmm4
-	movaps [esp + nb400_fiz], xmm4
-	
-	mov   eax, [ebp + nb400_jindex]
-	mov   ecx, [eax + esi*4]	     ;# jindex[n] 
-	mov   edx, [eax + esi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   esi, [ebp + nb400_pos]
-	mov   edi, [ebp + nb400_faction]	
-	mov   eax, [ebp + nb400_jjnr]
-	shl   ecx, 2
-	add   eax, ecx
-	mov   [esp + nb400_innerjjnr], eax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  4
-	add   ecx, [esp + nb400_ninner]
-	mov   [esp + nb400_ninner], ecx
-	add   edx, 0
-	mov   [esp + nb400_innerk], edx    ;# number of innerloop atoms 
-	jge   .nb400_unroll_loop
-	jmp   .nb400_finish_inner
-.nb400_unroll_loop:	
-	;# quad-unroll innerloop here 
-	mov   edx, [esp + nb400_innerjjnr]     ;# pointer to jjnr[k] 
-	mov   eax, [edx]	
-	mov   ebx, [edx + 4]              
-	mov   ecx, [edx + 8]            
-	mov   edx, [edx + 12]         ;# eax-edx=jnr1-4 
-	add dword ptr [esp + nb400_innerjjnr],  16 ;# advance pointer (unrolled 4) 
-
-	;# load isaj
-	mov esi, [ebp + nb400_invsqrta]
-	movss xmm3, [esi + eax*4]
-	movss xmm4, [esi + ecx*4]
-	movss xmm6, [esi + ebx*4]
-	movss xmm7, [esi + edx*4]
-	movaps xmm2, [esp + nb400_isai]
-	shufps xmm3, xmm6, 0 
-	shufps xmm4, xmm7, 0 
-	shufps xmm3, xmm4, 136  ;# constant 10001000 ;# all isaj in xmm3 
-	mulps  xmm2, xmm3
-		
-	movaps [esp + nb400_isaprod], xmm2	
-	movaps xmm1, xmm2
-	mulps xmm1, [esp + nb400_gbtsc]
-	movaps [esp + nb400_gbscale], xmm1
-	
-	mov esi, [ebp + nb400_charge]    ;# base of charge[] 
-	
-	movss xmm3, [esi + eax*4]
-	movss xmm4, [esi + ecx*4]
-	movss xmm6, [esi + ebx*4]
-	movss xmm7, [esi + edx*4]
-
-	mulps xmm2, [esp + nb400_iq]
-	shufps xmm3, xmm6, 0 
-	shufps xmm4, xmm7, 0 
-	shufps xmm3, xmm4, 136  ;# constant 10001000 ;# all charges in xmm3  
-	mulps  xmm3, xmm2
-	movaps [esp + nb400_qq], xmm3	
-
-	
-	mov esi, [ebp + nb400_pos]       ;# base of pos[] 
-
-	mov [esp + nb400_jnra], eax
-	mov [esp + nb400_jnrb], ebx
-	mov [esp + nb400_jnrc], ecx
-	mov [esp + nb400_jnrd], edx
-	
-	lea   eax, [eax + eax*2]     ;# replace jnr with j3 
-	lea   ebx, [ebx + ebx*2]	
-	lea   ecx, [ecx + ecx*2]    
-	lea   edx, [edx + edx*2]	
-
-	;# move four coordinates to xmm0-xmm2 	
-
-	movlps xmm4, [esi + eax*4]
-	movlps xmm5, [esi + ecx*4]
-	movss xmm2, [esi + eax*4 + 8]
-	movss xmm6, [esi + ecx*4 + 8]
-
-	movhps xmm4, [esi + ebx*4]
-	movhps xmm5, [esi + edx*4]
-
-	movss xmm0, [esi + ebx*4 + 8]
-	movss xmm1, [esi + edx*4 + 8]
-
-	shufps xmm2, xmm0, 0
-	shufps xmm6, xmm1, 0
-	
-	movaps xmm0, xmm4
-	movaps xmm1, xmm4
-
-	shufps xmm2, xmm6, 136  ;# constant 10001000
-	
-	shufps xmm0, xmm5, 136  ;# constant 10001000
-	shufps xmm1, xmm5, 221  ;# constant 11011101		
-
-	;# move ix-iz to xmm4-xmm6 
-	movaps xmm4, [esp + nb400_ix]
-	movaps xmm5, [esp + nb400_iy]
-	movaps xmm6, [esp + nb400_iz]
-
-	;# calc dr 
-	subps xmm4, xmm0
-	subps xmm5, xmm1
-	subps xmm6, xmm2
-
-	;# store dr 
-	movaps [esp + nb400_dx], xmm4
-	movaps [esp + nb400_dy], xmm5
-	movaps [esp + nb400_dz], xmm6
-	;# square it 
-	mulps xmm4,xmm4
-	mulps xmm5,xmm5
-	mulps xmm6,xmm6
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [esp + nb400_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [esp + nb400_half]
-	subps xmm1, xmm5	;# constant 30-rsq*lu*lu 
-	mulps xmm1, xmm2	
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r
-	movaps [esp + nb400_r], xmm4
-	mulps xmm4, [esp + nb400_gbscale]
-
-	movhlps xmm5, xmm4
-	cvttps2pi mm6, xmm4
-	cvttps2pi mm7, xmm5	;# mm6/mm7 contain lu indices 
-	cvtpi2ps xmm6, mm6
-	cvtpi2ps xmm5, mm7
-	movlhps xmm6, xmm5
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-	pslld mm6, 2
-	pslld mm7, 2
-
-	movd mm0, eax	
-	movd mm1, ebx
-	movd mm2, ecx
-	movd mm3, edx
-
-	mov  esi, [ebp + nb400_GBtab]
-	movd eax, mm6
-	psrlq mm6, 32
-	movd ecx, mm7
-	psrlq mm7, 32
-	movd ebx, mm6
-	movd edx, mm7
-
-	;# load coulomb table
-	movaps xmm4, [esi + eax*4]
-	movaps xmm5, [esi + ebx*4]
-	movaps xmm6, [esi + ecx*4]
-	movaps xmm7, [esi + edx*4]
-	;# transpose, using xmm3 for scratch
-	movaps xmm3, xmm6
-	shufps xmm3, xmm7, 0xEE 
-	shufps xmm6, xmm7, 0x44
-	movaps xmm7, xmm4
-	shufps xmm7, xmm5, 0xEE
-	shufps xmm4, xmm5, 0x44
-	movaps xmm5, xmm4
-	shufps xmm5, xmm6, 0xDD
-	shufps xmm4, xmm6, 0x88
-	movaps xmm6, xmm7
-	shufps xmm6, xmm3, 0x88
-	shufps xmm7, xmm3, 0xDD
-	;# coulomb table ready, in xmm4-xmm7  	
-	
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	mulps  xmm7, [esp + nb400_two]	;# two*Heps2 
-	movaps xmm3, [esp + nb400_qq]
-	addps  xmm7, xmm6
-	addps  xmm7, xmm5 ;# xmm7=FF 
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulps  xmm3, xmm7 ;# fijC=FF*qq 
-	;# at this point mm5 contains vcoul and mm3 fijC
-
-	;# get jnr from stack
-	mov eax, [esp + nb400_jnra]
-	mov ebx, [esp + nb400_jnrb]
-	mov ecx, [esp + nb400_jnrc]
-	mov edx, [esp + nb400_jnrd]
-	
-	mov esi, [ebp + nb400_dvda]
-	
-	;# Calculate dVda
-	xorps  xmm7, xmm7
-	mulps xmm3, [esp + nb400_gbscale]
-	movaps xmm6, xmm3
-	mulps  xmm6, [esp + nb400_r]
-	addps  xmm6, xmm5
-	addps  xmm5, [esp + nb400_vctot]
-	movaps [esp + nb400_vctot], xmm5 
-
-	;# xmm6=(vcoul+fijC*r)
-	subps  xmm7, xmm6
-	movaps xmm6, xmm7
-	
-	;# update dvdasum
-	addps  xmm7, [esp + nb400_dvdasum]
-	movaps [esp + nb400_dvdasum], xmm7 
-
-	;# update j atoms dvdaj
-	movhlps xmm7, xmm6
-	movaps  xmm5, xmm6
-	movaps  xmm4, xmm7
-	shufps  xmm5, xmm5, 0x1
-	shufps  xmm4, xmm4, 0x1
-	;# xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4
-	addss  xmm6, [esi + eax*4]
-	addss  xmm5, [esi + ebx*4]
-	addss  xmm7, [esi + ecx*4]
-	addss  xmm4, [esi + edx*4]
-	movss  [esi + eax*4], xmm6
-	movss  [esi + ebx*4], xmm5
-	movss  [esi + ecx*4], xmm7
-	movss  [esi + edx*4], xmm4
-	
-	xorps  xmm4, xmm4	
-	mulps xmm3, xmm0
-	subps  xmm4, xmm3
-
-	movaps xmm0, [esp + nb400_dx]
-	movaps xmm1, [esp + nb400_dy]
-	movaps xmm2, [esp + nb400_dz]
-
-	movd eax, mm0	
-	movd ebx, mm1
-	movd ecx, mm2
-	movd edx, mm3
-
-	mov    edi, [ebp + nb400_faction]
-	mulps  xmm0, xmm4
-	mulps  xmm1, xmm4
-	mulps  xmm2, xmm4
-	;# xmm0-xmm2 contains tx-tz (partial force) 
-	;# now update f_i 
-	movaps xmm3, [esp + nb400_fix]
-	movaps xmm4, [esp + nb400_fiy]
-	movaps xmm5, [esp + nb400_fiz]
-	addps  xmm3, xmm0
-	addps  xmm4, xmm1
-	addps  xmm5, xmm2
-	movaps [esp + nb400_fix], xmm3
-	movaps [esp + nb400_fiy], xmm4
-	movaps [esp + nb400_fiz], xmm5
-	;# the fj's - start by accumulating x & y forces from memory 
-	movlps xmm4, [edi + eax*4]
-	movlps xmm6, [edi + ecx*4]
-	movhps xmm4, [edi + ebx*4]
-	movhps xmm6, [edi + edx*4]
-
-	movaps xmm3, xmm4
-	shufps xmm3, xmm6, 136  ;# constant 10001000
-	shufps xmm4, xmm6, 221  ;# constant 11011101			      
-
-	;# now xmm3-xmm5 contains fjx, fjy, fjz 
-	subps  xmm3, xmm0
-	subps  xmm4, xmm1
-	
-	;# unpack them back so we can store them - first x & y in xmm3/xmm4 
-
-	movaps xmm6, xmm3
-	unpcklps xmm6, xmm4
-	unpckhps xmm3, xmm4	
-	;# xmm6(l)=x & y for j1, (h) for j2 
-	;# xmm3(l)=x & y for j3, (h) for j4 
-	movlps [edi + eax*4], xmm6
-	movlps [edi + ecx*4], xmm3
-	
-	movhps [edi + ebx*4], xmm6
-	movhps [edi + edx*4], xmm3
-
-	;# and the z forces 
-	movss  xmm4, [edi + eax*4 + 8]
-	movss  xmm5, [edi + ebx*4 + 8]
-	movss  xmm6, [edi + ecx*4 + 8]
-	movss  xmm7, [edi + edx*4 + 8]
-	subss  xmm4, xmm2
-	shufps xmm2, xmm2, 229  ;# constant 11100101
-	subss  xmm5, xmm2
-	shufps xmm2, xmm2, 234  ;# constant 11101010
-	subss  xmm6, xmm2
-	shufps xmm2, xmm2, 255  ;# constant 11111111
-	subss  xmm7, xmm2
-	movss  [edi + eax*4 + 8], xmm4
-	movss  [edi + ebx*4 + 8], xmm5
-	movss  [edi + ecx*4 + 8], xmm6
-	movss  [edi + edx*4 + 8], xmm7
-	
-	;# should we do one more iteration? 
-	sub dword ptr [esp + nb400_innerk],  4
-	jl    .nb400_finish_inner
-	jmp   .nb400_unroll_loop
-.nb400_finish_inner:
-	;# check if at least two particles remain 
-	add dword ptr [esp + nb400_innerk],  4
-	mov   edx, [esp + nb400_innerk]
-	and   edx, 2
-	jnz   .nb400_dopair
-	jmp   .nb400_checksingle
-.nb400_dopair:	
-	mov   ecx, [esp + nb400_innerjjnr]
-	
-	mov   eax, [ecx]	
-	mov   ebx, [ecx + 4]              
-	add dword ptr [esp + nb400_innerjjnr],  8
-
-	xorps xmm2, xmm2
-	movaps xmm6, xmm2
-	
-	;# load isaj
-	mov esi, [ebp + nb400_invsqrta]
-	movss xmm2, [esi + eax*4]
-	movss xmm3, [esi + ebx*4]
-	unpcklps xmm2, xmm3	;# isaj in xmm2(0,1)
-	mulps  xmm2, [esp + nb400_isai]
-	movaps [esp + nb400_isaprod], xmm2	
-	movaps xmm1, xmm2
-	mulps xmm1, [esp + nb400_gbtsc]
-	movaps [esp + nb400_gbscale], xmm1	
-	
-	mov esi, [ebp + nb400_charge]    ;# base of charge[] 	
-	movss xmm3, [esi + eax*4]		
-	movss xmm6, [esi + ebx*4]
-	unpcklps xmm3, xmm6 ;# constant 00001000 ;# xmm3(0,1) has the charges 
-
-	mulps  xmm2, [esp + nb400_iq]
-	mulps  xmm3, xmm2
-	movaps [esp + nb400_qq], xmm3
-
-	mov edi, [ebp + nb400_pos]	
-
-	movd  mm0, eax		;# copy jnr to mm0/mm1
-	movd  mm1, ebx
-		
-	lea   eax, [eax + eax*2]
-	lea   ebx, [ebx + ebx*2]
-	;# move coordinates to xmm0-xmm2 
-	movlps xmm1, [edi + eax*4]
-	movss xmm2, [edi + eax*4 + 8]	
-	movhps xmm1, [edi + ebx*4]
-	movss xmm0, [edi + ebx*4 + 8]	
-
-	movlhps xmm3, xmm7
-	
-	shufps xmm2, xmm0, 0
-	
-	movaps xmm0, xmm1
-
-	shufps xmm2, xmm2, 136  ;# constant 10001000
-	
-	shufps xmm0, xmm0, 136  ;# constant 10001000
-	shufps xmm1, xmm1, 221  ;# constant 11011101
-			
-	mov    edi, [ebp + nb400_faction]
-	;# move ix-iz to xmm4-xmm6 
-	xorps   xmm7, xmm7
-	
-	movaps xmm4, [esp + nb400_ix]
-	movaps xmm5, [esp + nb400_iy]
-	movaps xmm6, [esp + nb400_iz]
-
-	;# calc dr 
-	subps xmm4, xmm0
-	subps xmm5, xmm1
-	subps xmm6, xmm2
-
-	;# store dr 
-	movaps [esp + nb400_dx], xmm4
-	movaps [esp + nb400_dy], xmm5
-	movaps [esp + nb400_dz], xmm6
-	;# square it 
-	mulps xmm4,xmm4
-	mulps xmm5,xmm5
-	mulps xmm6,xmm6
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [esp + nb400_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [esp + nb400_half]
-	subps xmm1, xmm5	;# constant 30-rsq*lu*lu 
-	mulps xmm1, xmm2	
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r 
-	movaps [esp + nb400_r], xmm4
-	mulps xmm4, [esp + nb400_gbscale]
-
-	cvttps2pi mm6, xmm4     ;# mm6 contain lu indices 
-	cvtpi2ps xmm6, mm6
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-
-	pslld mm6, 2
-
-	mov  esi, [ebp + nb400_GBtab]
-	movd ecx, mm6
-	psrlq mm6, 32
-	movd edx, mm6
-
-	;# load coulomb table
-	movaps xmm4, [esi + ecx*4]
-	movaps xmm7, [esi + edx*4]
-	;# transpose, using xmm3 for scratch
-	movaps xmm6, xmm4
-	unpcklps xmm4, xmm7  	;# Y1 Y2 F1 F2 
-	unpckhps xmm6, xmm7     ;# G1 G2 H1 H2
-	movhlps  xmm5, xmm4    	;# F1 F2 
-	movhlps  xmm7, xmm6     ;# H1 H2
-	;# coulomb table ready, in xmm4-xmm7  	
-
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	mulps  xmm7, [esp + nb400_two]	;# two*Heps2 
-	movaps xmm3, [esp + nb400_qq]
-	addps  xmm7, xmm6
-	addps  xmm7, xmm5 ;# xmm7=FF 
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulps  xmm3, xmm7 ;# fijC=FF*qq 
-	;# at this point mm5 contains vcoul and mm3 fijC
-	
-	;# get jnr from mm0/mm1
-	movd ecx, mm0
-	movd edx, mm1
-
-	mov esi, [ebp + nb400_dvda]
-	
-	;# Calculate dVda
-	xorps  xmm7, xmm7
-	mulps xmm3, [esp + nb400_gbscale]
-	movaps xmm6, xmm3
-	mulps  xmm6, [esp + nb400_r]
-	addps  xmm6, xmm5
-	addps  xmm5, [esp + nb400_vctot]
-	movaps [esp + nb400_vctot], xmm5 
-
-	;# xmm6=(vcoul+fijC*r)
-	subps  xmm7, xmm6
-	movaps xmm6, xmm7
-	
-	;# update dvdasum
-	addps  xmm7, [esp + nb400_dvdasum]
-	movaps [esp + nb400_dvdasum], xmm7 
-
-	;# update j atoms dvdaj
-	movaps xmm7, xmm6
-	shufps xmm7, xmm7, 0x1
-	addss  xmm6, [esi + ecx*4]
-	addss  xmm7, [esi + edx*4]
-	movss  [esi + ecx*4], xmm6
-	movss  [esi + edx*4], xmm7
-	
-	xorps  xmm4, xmm4	
-	mulps xmm3, xmm0
-	subps  xmm4, xmm3
-
-	movaps xmm0, [esp + nb400_dx]
-	movaps xmm1, [esp + nb400_dy]
-	movaps xmm2, [esp + nb400_dz]
-
-	mulps  xmm0, xmm4
-	mulps  xmm1, xmm4
-	mulps  xmm2, xmm4
-	;# xmm0-xmm2 contains tx-tz (partial force) 
-	;# now update f_i 
-	movaps xmm3, [esp + nb400_fix]
-	movaps xmm4, [esp + nb400_fiy]
-	movaps xmm5, [esp + nb400_fiz]
-	addps  xmm3, xmm0
-	addps  xmm4, xmm1
-	addps  xmm5, xmm2
-	movaps [esp + nb400_fix], xmm3
-	movaps [esp + nb400_fiy], xmm4
-	movaps [esp + nb400_fiz], xmm5
-	;# update the fj's 
-	movss   xmm3, [edi + eax*4]
-	movss   xmm4, [edi + eax*4 + 4]
-	movss   xmm5, [edi + eax*4 + 8]
-	subss   xmm3, xmm0
-	subss   xmm4, xmm1
-	subss   xmm5, xmm2	
-	movss   [edi + eax*4], xmm3
-	movss   [edi + eax*4 + 4], xmm4
-	movss   [edi + eax*4 + 8], xmm5	
-
-	shufps  xmm0, xmm0, 225  ;# constant 11100001
-	shufps  xmm1, xmm1, 225  ;# constant 11100001
-	shufps  xmm2, xmm2, 225  ;# constant 11100001
-
-	movss   xmm3, [edi + ebx*4]
-	movss   xmm4, [edi + ebx*4 + 4]
-	movss   xmm5, [edi + ebx*4 + 8]
-	subss   xmm3, xmm0
-	subss   xmm4, xmm1
-	subss   xmm5, xmm2	
-	movss   [edi + ebx*4], xmm3
-	movss   [edi + ebx*4 + 4], xmm4
-	movss   [edi + ebx*4 + 8], xmm5	
-
-.nb400_checksingle:				
-	mov   edx, [esp + nb400_innerk]
-	and   edx, 1
-	jnz    .nb400_dosingle
-	jmp    .nb400_updateouterdata
-.nb400_dosingle:
-	mov esi, [ebp + nb400_charge]
-	mov edx, [ebp + nb400_invsqrta]
-	mov edi, [ebp + nb400_pos]
-	mov   ecx, [esp + nb400_innerjjnr]
-	mov   eax, [ecx]	
-	xorps  xmm2, xmm2
-	movaps xmm6, xmm2
-	movss xmm2, [edx + eax*4]	;# isaj
-	mulss xmm2, [esp + nb400_isai]
-	movss [esp + nb400_isaprod], xmm2	
-	movss xmm1, xmm2
-	mulss xmm1, [esp + nb400_gbtsc]
-	movss [esp + nb400_gbscale], xmm1	
-	
-	mulss  xmm2, [esp + nb400_iq]
-	movss xmm6, [esi + eax*4]	;# xmm6(0) has the charge 	
-	mulss  xmm6, xmm2
-	movss [esp + nb400_qq], xmm6
-
-	movd  mm0, eax
-	lea   eax, [eax + eax*2]
-	
-	;# move coordinates to xmm0-xmm2 
-	movss xmm0, [edi + eax*4]	
-	movss xmm1, [edi + eax*4 + 4]	
-	movss xmm2, [edi + eax*4 + 8]	 
-	
-	movss xmm4, [esp + nb400_ix]
-	movss xmm5, [esp + nb400_iy]
-	movss xmm6, [esp + nb400_iz]
-
-	;# calc dr 
-	subss xmm4, xmm0
-	subss xmm5, xmm1
-	subss xmm6, xmm2
-
-	;# store dr 
-	movss [esp + nb400_dx], xmm4
-	movss [esp + nb400_dy], xmm5
-	movss [esp + nb400_dz], xmm6
-	;# square it 
-	mulss xmm4,xmm4
-	mulss xmm5,xmm5
-	mulss xmm6,xmm6
-	addss xmm4, xmm5
-	addss xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtss xmm5, xmm4
-	;# lookup seed in xmm5 
-	movss xmm2, xmm5
-	mulss xmm5, xmm5
-	movss xmm1, [esp + nb400_three]
-	mulss xmm5, xmm4	;# rsq*lu*lu 			
-	movss xmm0, [esp + nb400_half]
-	subss xmm1, xmm5	;# constant 30-rsq*lu*lu 
-	mulss xmm1, xmm2	
-	mulss xmm0, xmm1	;# xmm0=rinv 
-
-	mulss xmm4, xmm0	;# xmm4=r 
-	movss [esp + nb400_r], xmm4
-	mulss xmm4, [esp + nb400_gbscale]
-
-	cvttss2si ebx, xmm4     ;# mm6 contain lu indices 
-	cvtsi2ss xmm6, ebx
-	subss xmm4, xmm6	
-	movss xmm1, xmm4	;# xmm1=eps 
-	movss xmm2, xmm1	
-	mulss  xmm2, xmm2	;# xmm2=eps2 
-
-	shl  ebx, 2
-
-	mov  esi, [ebp + nb400_GBtab]
-
-	movaps xmm4, [esi + ebx*4]	
-	movhlps xmm6, xmm4
-	movaps xmm5, xmm4
-	movaps xmm7, xmm6
-	shufps xmm5, xmm5, 1
-	shufps xmm7, xmm7, 1
-	;# table ready in xmm4-xmm7 
-
-	mulss  xmm6, xmm1	;# xmm6=Geps 
-	mulss  xmm7, xmm2	;# xmm7=Heps2 
-	addss  xmm5, xmm6
-	addss  xmm5, xmm7	;# xmm5=Fp 	
-	mulss  xmm7, [esp + nb400_two]	;# two*Heps2 
-	movss xmm3, [esp + nb400_qq]
-	addss  xmm7, xmm6
-	addss  xmm7, xmm5 ;# xmm7=FF 
-	mulss  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addss  xmm5, xmm4 ;# xmm5=VV 
-	mulss  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulss  xmm3, xmm7 ;# fijC=FF*qq 
-	;# at this point mm5 contains vcoul and mm3 fijC 
-
-	movd ebx, mm0
-	mov esi, [ebp + nb400_dvda]
-	
-	;# Calculate dVda
-	xorps xmm7, xmm7
-	mulss xmm3, [esp + nb400_gbscale]
-	movaps xmm6, xmm3
-	mulss  xmm6, [esp + nb400_r]
-	addss  xmm6, xmm5
-	addss  xmm5, [esp + nb400_vctot]
-	movss [esp + nb400_vctot], xmm5 
-
-	;# xmm6=(vcoul+fijC*r)
-	subps  xmm7, xmm6
-	movaps xmm6, xmm7
-	
-	;# update dvdasum
-	addps  xmm7, [esp + nb400_dvdasum]
-	movaps [esp + nb400_dvdasum], xmm7 
-
-	;# update j atoms dvdaj
-	addss  xmm6, [esi + ebx*4]
-	movss  [esi + ebx*4], xmm6
-	
-	xorps  xmm4, xmm4	
-	mulss xmm3, xmm0
-	subss  xmm4, xmm3
-
-	mov    edi, [ebp + nb400_faction]
-
-	movss xmm0, [esp + nb400_dx]
-	movss xmm1, [esp + nb400_dy]
-	movss xmm2, [esp + nb400_dz]
-
-	mulss  xmm0, xmm4
-	mulss  xmm1, xmm4
-	mulss  xmm2, xmm4
-	;# xmm0-xmm2 contains tx-tz (partial force) 
-	;# now update f_i 
-	movss xmm3, [esp + nb400_fix]
-	movss xmm4, [esp + nb400_fiy]
-	movss xmm5, [esp + nb400_fiz]
-	addss  xmm3, xmm0
-	addss  xmm4, xmm1
-	addss  xmm5, xmm2
-	movss [esp + nb400_fix], xmm3
-	movss [esp + nb400_fiy], xmm4
-	movss [esp + nb400_fiz], xmm5
-	;# update fj 
-	
-	movss   xmm3, [edi + eax*4]
-	movss   xmm4, [edi + eax*4 + 4]
-	movss   xmm5, [edi + eax*4 + 8]
-	subss   xmm3, xmm0
-	subss   xmm4, xmm1
-	subss   xmm5, xmm2	
-	movss   [edi + eax*4], xmm3
-	movss   [edi + eax*4 + 4], xmm4
-	movss   [edi + eax*4 + 8], xmm5	
-.nb400_updateouterdata:
-	mov   ecx, [esp + nb400_ii3]
-	mov   edi, [ebp + nb400_faction]
-	mov   esi, [ebp + nb400_fshift]
-	mov   edx, [esp + nb400_is3]
-
-	;# accumulate i forces in xmm0, xmm1, xmm2 
-	movaps xmm0, [esp + nb400_fix]
-	movaps xmm1, [esp + nb400_fiy]
-	movaps xmm2, [esp + nb400_fiz]
-
-	movhlps xmm3, xmm0
-	movhlps xmm4, xmm1
-	movhlps xmm5, xmm2
-	addps  xmm0, xmm3
-	addps  xmm1, xmm4
-	addps  xmm2, xmm5 ;# sum is in 1/2 in xmm0-xmm2 
-
-	movaps xmm3, xmm0	
-	movaps xmm4, xmm1	
-	movaps xmm5, xmm2	
-
-	shufps xmm3, xmm3, 1
-	shufps xmm4, xmm4, 1
-	shufps xmm5, xmm5, 1
-	addss  xmm0, xmm3
-	addss  xmm1, xmm4
-	addss  xmm2, xmm5	;# xmm0-xmm2 has single force in pos0 
-
-	;# increment i force 
-	movss  xmm3, [edi + ecx*4]
-	movss  xmm4, [edi + ecx*4 + 4]
-	movss  xmm5, [edi + ecx*4 + 8]
-	addss  xmm3, xmm0
-	addss  xmm4, xmm1
-	addss  xmm5, xmm2
-	movss  [edi + ecx*4],     xmm3
-	movss  [edi + ecx*4 + 4], xmm4
-	movss  [edi + ecx*4 + 8], xmm5
-
-	;# increment fshift force  
-	movss  xmm3, [esi + edx*4]
-	movss  xmm4, [esi + edx*4 + 4]
-	movss  xmm5, [esi + edx*4 + 8]
-	addss  xmm3, xmm0
-	addss  xmm4, xmm1
-	addss  xmm5, xmm2
-	movss  [esi + edx*4],     xmm3
-	movss  [esi + edx*4 + 4], xmm4
-	movss  [esi + edx*4 + 8], xmm5
-
-	;# get n from stack
-	mov esi, [esp + nb400_n]
-        ;# get group index for i particle 
-        mov   edx, [ebp + nb400_gid]      	;# base of gid[]
-        mov   edx, [edx + esi*4]		;# ggid=gid[n]
-
-	;# accumulate total potential energy and update it 
-	movaps xmm7, [esp + nb400_vctot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-
-	;# add earlier value from mem 
-	mov   eax, [ebp + nb400_Vc]
-	addss xmm7, [eax + edx*4] 
-	;# move back to mem 
-	movss [eax + edx*4], xmm7 
-	
-	;# accumulate dVda and update it 
-	movaps xmm7, [esp + nb400_dvdasum]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-	
-	mov edx, [esp + nb400_ii]
-	mov eax, [ebp + nb400_dvda]
-	addss xmm7, [eax + edx*4]
-	movss [eax + edx*4], xmm7
-	
-        ;# finish if last 
-        mov ecx, [esp + nb400_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb400_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [esp + nb400_n], esi
-        jmp .nb400_outer
-.nb400_outerend:
-        ;# check if more outer neighborlists remain
-        mov   ecx, [esp + nb400_nri]
-	;# esi already loaded with n above
-        sub   ecx, esi
-        jz .nb400_end
-        ;# non-zero, do one more workunit
-        jmp   .nb400_threadloop
-.nb400_end:
-	emms
-
-	mov eax, [esp + nb400_nouter]
-	mov ebx, [esp + nb400_ninner]
-	mov ecx, [ebp + nb400_outeriter]
-	mov edx, [ebp + nb400_inneriter]
-	mov [ecx], eax
-	mov [edx], ebx
-
-	mov eax, [esp + nb400_salign]
-	add esp, eax
-	add esp, 400
-	pop edi
-	pop esi
-    	pop edx
-    	pop ecx
-    	pop ebx
-    	pop eax
-	leave
-	ret
-
-
-
-		
-
-.globl nb_kernel400nf_ia32_sse
-.globl _nb_kernel400nf_ia32_sse
-nb_kernel400nf_ia32_sse:	
-_nb_kernel400nf_ia32_sse:	
-.equiv          nb400nf_p_nri,          8
-.equiv          nb400nf_iinr,           12
-.equiv          nb400nf_jindex,         16
-.equiv          nb400nf_jjnr,           20
-.equiv          nb400nf_shift,          24
-.equiv          nb400nf_shiftvec,       28
-.equiv          nb400nf_fshift,         32
-.equiv          nb400nf_gid,            36
-.equiv          nb400nf_pos,            40
-.equiv          nb400nf_faction,        44
-.equiv          nb400nf_charge,         48
-.equiv          nb400nf_p_facel,        52
-.equiv          nb400nf_argkrf,         56
-.equiv          nb400nf_argcrf,         60
-.equiv          nb400nf_Vc,             64
-.equiv          nb400nf_type,           68
-.equiv          nb400nf_p_ntype,        72
-.equiv          nb400nf_vdwparam,       76
-.equiv          nb400nf_Vvdw,           80
-.equiv          nb400nf_p_tabscale,     84
-.equiv          nb400nf_VFtab,          88
-.equiv          nb400nf_invsqrta,       92
-.equiv          nb400nf_dvda,           96
-.equiv          nb400nf_p_gbtabscale,   100
-.equiv          nb400nf_GBtab,          104
-.equiv          nb400nf_p_nthreads,     108
-.equiv          nb400nf_count,          112
-.equiv          nb400nf_mtx,            116
-.equiv          nb400nf_outeriter,      120
-.equiv          nb400nf_inneriter,      124
-.equiv          nb400nf_work,           128
-	;# stack offsets for local variables  
-	;# bottom of stack is cache-aligned for sse use 
-.equiv          nb400nf_ix,             0
-.equiv          nb400nf_iy,             16
-.equiv          nb400nf_iz,             32
-.equiv          nb400nf_iq,             48
-.equiv          nb400nf_gbtsc,          64
-.equiv          nb400nf_qq,             80
-.equiv          nb400nf_vctot,          96
-.equiv          nb400nf_half,           112
-.equiv          nb400nf_three,          128
-.equiv          nb400nf_isai,           144
-.equiv          nb400nf_isaprod,        160
-.equiv          nb400nf_gbscale,        176
-.equiv          nb400nf_is3,            192
-.equiv          nb400nf_ii3,            196
-.equiv          nb400nf_innerjjnr,      200
-.equiv          nb400nf_innerk,         204
-.equiv          nb400nf_n,              208
-.equiv          nb400nf_nn1,            212
-.equiv          nb400nf_nri,            216
-.equiv          nb400nf_facel,          220
-.equiv          nb400nf_nouter,         224
-.equiv          nb400nf_ninner,         228
-.equiv          nb400nf_salign,         232
-	push ebp
-	mov ebp,esp	
-    	push eax
-    	push ebx
-    	push ecx
-    	push edx
-	push esi
-	push edi
-	sub esp, 236		;# local stack space 
-	mov  eax, esp
-	and  eax, 0xf
-	sub esp, eax
-	mov [esp + nb400nf_salign], eax
-
-	emms
-
-	;# Move args passed by reference to stack
-	mov ecx, [ebp + nb400nf_p_nri]
-	mov esi, [ebp + nb400nf_p_facel]
-	mov ecx, [ecx]
-	mov esi, [esi]
-	mov [esp + nb400nf_nri], ecx
-	mov [esp + nb400nf_facel], esi
-
-	;# zero iteration counters
-	mov eax, 0
-	mov [esp + nb400nf_nouter], eax
-	mov [esp + nb400nf_ninner], eax
-
-
-	mov eax, [ebp + nb400nf_p_gbtabscale]
-	movss xmm3, [eax]
-	shufps xmm3, xmm3, 0
-	movaps [esp + nb400nf_gbtsc], xmm3
-
-	;# create constant floating-point factors on stack
-	mov eax, 0x3f000000     ;# constant 0.5 in IEEE (hex)
-	mov [esp + nb400nf_half], eax
-	movss xmm1, [esp + nb400nf_half]
-	shufps xmm1, xmm1, 0    ;# splat to all elements
-	movaps xmm2, xmm1       
-	addps  xmm2, xmm2	;# constant 1.0
-	movaps xmm3, xmm2
-	addps  xmm2, xmm2	;# constant 2.0
-	addps  xmm3, xmm2	;# constant 3.0
-	movaps [esp + nb400nf_half],  xmm1
-	movaps [esp + nb400nf_three],  xmm3
-
-.nb400nf_threadloop:
-        mov   esi, [ebp + nb400nf_count]          ;# pointer to sync counter
-        mov   eax, [esi]
-.nb400nf_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+10
-        lock
-        cmpxchg [esi], ebx                      ;# write nn1 to *counter,
-                                                ;# if it hasnt changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb400nf_spinlock
-
-        ;# if(nn1>nri) nn1=nri
-        mov ecx, [esp + nb400nf_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [esp + nb400nf_n], eax
-        mov [esp + nb400nf_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists
-	mov esi, eax				;# copy n to esi
-        jg  .nb400nf_outerstart
-        jmp .nb400nf_end
-
-.nb400nf_outerstart:
-	;# ebx contains number of outer iterations
-	add ebx, [esp + nb400nf_nouter]
-	mov [esp + nb400nf_nouter], ebx
-
-.nb400nf_outer:
-	mov   eax, [ebp + nb400nf_shift]      ;# eax = pointer into shift[] 
-	mov   ebx, [eax + esi*4]		;# ebx=shift[n] 
-	
-	lea   ebx, [ebx + ebx*2]    ;# ebx=3*is 
-	mov   [esp + nb400nf_is3],ebx    	;# store is3 
-
-	mov   eax, [ebp + nb400nf_shiftvec]   ;# eax = base of shiftvec[] 
-
-	movss xmm0, [eax + ebx*4]
-	movss xmm1, [eax + ebx*4 + 4]
-	movss xmm2, [eax + ebx*4 + 8] 
-
-	mov   ecx, [ebp + nb400nf_iinr]       ;# ecx = pointer into iinr[] 	
-	mov   ebx, [ecx + esi*4]	    ;# ebx =ii 
-	
-	mov   edx, [ebp + nb400nf_charge]
-	movss xmm3, [edx + ebx*4]	
-	mulss xmm3, [esp + nb400nf_facel]
-	shufps xmm3, xmm3, 0
-
-	mov   edx, [ebp + nb400nf_invsqrta]	;# load invsqrta[ii]
-	movss xmm4, [edx + ebx*4]
-	shufps xmm4, xmm4, 0
-
-	lea   ebx, [ebx + ebx*2]	;# ebx = 3*ii=ii3 
-	mov   eax, [ebp + nb400nf_pos]    ;# eax = base of pos[]  
-
-	addss xmm0, [eax + ebx*4]
-	addss xmm1, [eax + ebx*4 + 4]
-	addss xmm2, [eax + ebx*4 + 8]
-
-	movaps [esp + nb400nf_iq], xmm3
-	movaps [esp + nb400nf_isai], xmm4
-	
-	shufps xmm0, xmm0, 0
-	shufps xmm1, xmm1, 0
-	shufps xmm2, xmm2, 0
-
-	movaps [esp + nb400nf_ix], xmm0
-	movaps [esp + nb400nf_iy], xmm1
-	movaps [esp + nb400nf_iz], xmm2
-
-	mov   [esp + nb400nf_ii3], ebx
-	
-	;# clear vctot 
-	xorps xmm4, xmm4
-	movaps [esp + nb400nf_vctot], xmm4
-	
-	mov   eax, [ebp + nb400nf_jindex]
-	mov   ecx, [eax + esi*4]	     ;# jindex[n] 
-	mov   edx, [eax + esi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   esi, [ebp + nb400nf_pos]
-	mov   edi, [ebp + nb400nf_faction]	
-	mov   eax, [ebp + nb400nf_jjnr]
-	shl   ecx, 2
-	add   eax, ecx
-	mov   [esp + nb400nf_innerjjnr], eax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  4
-	add   ecx, [esp + nb400nf_ninner]
-	mov   [esp + nb400nf_ninner], ecx
-	add   edx, 0
-	mov   [esp + nb400nf_innerk], edx    ;# number of innerloop atoms 
-	jge   .nb400nf_unroll_loop
-	jmp   .nb400nf_finish_inner
-.nb400nf_unroll_loop:	
-	;# quad-unroll innerloop here 
-	mov   edx, [esp + nb400nf_innerjjnr]     ;# pointer to jjnr[k] 
-	mov   eax, [edx]	
-	mov   ebx, [edx + 4]              
-	mov   ecx, [edx + 8]            
-	mov   edx, [edx + 12]         ;# eax-edx=jnr1-4 
-	add dword ptr [esp + nb400nf_innerjjnr],  16 ;# advance pointer (unrolled 4) 
-
-	;# load isa2
-	mov esi, [ebp + nb400nf_invsqrta]
-	movss xmm3, [esi + eax*4]
-	movss xmm4, [esi + ecx*4]
-	movss xmm6, [esi + ebx*4]
-	movss xmm7, [esi + edx*4]
-	movaps xmm2, [esp + nb400nf_isai]
-	shufps xmm3, xmm6, 0 
-	shufps xmm4, xmm7, 0 
-	shufps xmm3, xmm4, 136  ;# constant 10001000 ;# all charges in xmm3  
-	mulps  xmm2, xmm3
-		
-	movaps [esp + nb400nf_isaprod], xmm2	
-	movaps xmm1, xmm2
-	mulps xmm1, [esp + nb400nf_gbtsc]
-	movaps [esp + nb400nf_gbscale], xmm1
-	
-	mov esi, [ebp + nb400nf_charge]    ;# base of charge[] 
-	
-	movss xmm3, [esi + eax*4]
-	movss xmm4, [esi + ecx*4]
-	movss xmm6, [esi + ebx*4]
-	movss xmm7, [esi + edx*4]
-
-	mulps xmm2, [esp + nb400nf_iq]
-	shufps xmm3, xmm6, 0 
-	shufps xmm4, xmm7, 0 
-	shufps xmm3, xmm4, 136  ;# constant 10001000 ;# all charges in xmm3  
-	mulps  xmm3, xmm2
-	movaps [esp + nb400nf_qq], xmm3	
-
-	
-	mov esi, [ebp + nb400nf_pos]       ;# base of pos[] 
-
-	lea   eax, [eax + eax*2]     ;# replace jnr with j3 
-	lea   ebx, [ebx + ebx*2]	
-
-	lea   ecx, [ecx + ecx*2]     ;# replace jnr with j3 
-	lea   edx, [edx + edx*2]	
-
-	;# move four coordinates to xmm0-xmm2 	
-
-	movlps xmm4, [esi + eax*4]
-	movlps xmm5, [esi + ecx*4]
-	movss xmm2, [esi + eax*4 + 8]
-	movss xmm6, [esi + ecx*4 + 8]
-
-	movhps xmm4, [esi + ebx*4]
-	movhps xmm5, [esi + edx*4]
-
-	movss xmm0, [esi + ebx*4 + 8]
-	movss xmm1, [esi + edx*4 + 8]
-
-	shufps xmm2, xmm0, 0
-	shufps xmm6, xmm1, 0
-	
-	movaps xmm0, xmm4
-	movaps xmm1, xmm4
-
-	shufps xmm2, xmm6, 136  ;# constant 10001000
-	
-	shufps xmm0, xmm5, 136  ;# constant 10001000
-	shufps xmm1, xmm5, 221  ;# constant 11011101		
-
-	;# move ix-iz to xmm4-xmm6 
-	movaps xmm4, [esp + nb400nf_ix]
-	movaps xmm5, [esp + nb400nf_iy]
-	movaps xmm6, [esp + nb400nf_iz]
-
-	;# calc dr 
-	subps xmm4, xmm0
-	subps xmm5, xmm1
-	subps xmm6, xmm2
-
-	;# square it 
-	mulps xmm4,xmm4
-	mulps xmm5,xmm5
-	mulps xmm6,xmm6
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [esp + nb400nf_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [esp + nb400nf_half]
-	subps xmm1, xmm5	;# constant 30-rsq*lu*lu 
-	mulps xmm1, xmm2	
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r
-	mulps xmm4, [esp + nb400nf_gbscale]
-
-	movhlps xmm5, xmm4
-	cvttps2pi mm6, xmm4
-	cvttps2pi mm7, xmm5	;# mm6/mm7 contain lu indices 
-	cvtpi2ps xmm6, mm6
-	cvtpi2ps xmm5, mm7
-	movlhps xmm6, xmm5
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-	pslld mm6, 2
-	pslld mm7, 2
-
-	movd mm0, eax	
-	movd mm1, ebx
-	movd mm2, ecx
-	movd mm3, edx
-
-	mov  esi, [ebp + nb400nf_GBtab]
-	movd eax, mm6
-	psrlq mm6, 32
-	movd ecx, mm7
-	psrlq mm7, 32
-	movd ebx, mm6
-	movd edx, mm7
-
-	;# load coulomb table
-	movaps xmm4, [esi + eax*4]
-	movaps xmm5, [esi + ebx*4]
-	movaps xmm6, [esi + ecx*4]
-	movaps xmm7, [esi + edx*4]
-	;# transpose, using xmm3 for scratch
-	movaps xmm3, xmm6
-	shufps xmm3, xmm7, 0xEE 
-	shufps xmm6, xmm7, 0x44
-	movaps xmm7, xmm4
-	shufps xmm7, xmm5, 0xEE
-	shufps xmm4, xmm5, 0x44
-	movaps xmm5, xmm4
-	shufps xmm5, xmm6, 0xDD
-	shufps xmm4, xmm6, 0x88
-	movaps xmm6, xmm7
-	shufps xmm6, xmm3, 0x88
-	shufps xmm7, xmm3, 0xDD
-	;# coulomb table ready, in xmm4-xmm7  	
-	
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 
-	movaps xmm3, [esp + nb400nf_qq]
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, xmm3 ;# vcoul=qq*VV  
-	addps  xmm5, [esp + nb400nf_vctot]
-	movaps [esp + nb400nf_vctot], xmm5 
-	
-	;# should we do one more iteration? 
-	sub dword ptr [esp + nb400nf_innerk],  4
-	jl    .nb400nf_finish_inner
-	jmp   .nb400nf_unroll_loop
-.nb400nf_finish_inner:
-	;# check if at least two particles remain 
-	add dword ptr [esp + nb400nf_innerk],  4
-	mov   edx, [esp + nb400nf_innerk]
-	and   edx, 2
-	jnz   .nb400nf_dopair
-	jmp   .nb400nf_checksingle
-.nb400nf_dopair:	
-	mov   ecx, [esp + nb400nf_innerjjnr]
-	
-	mov   eax, [ecx]	
-	mov   ebx, [ecx + 4]              
-	add dword ptr [esp + nb400nf_innerjjnr],  8
-
-	xorps xmm2, xmm2
-	movaps xmm6, xmm2
-	
-	;# load isa2
-	mov esi, [ebp + nb400nf_invsqrta]
-	movss xmm2, [esi + eax*4]
-	movss xmm3, [esi + ebx*4]
-	unpcklps xmm2, xmm3	;# isa2 in xmm3(0,1)
-	mulps  xmm2, [esp + nb400nf_isai]
-	movaps [esp + nb400nf_isaprod], xmm2	
-	movaps xmm1, xmm2
-	mulps xmm1, [esp + nb400nf_gbtsc]
-	movaps [esp + nb400nf_gbscale], xmm1	
-	
-	mov esi, [ebp + nb400nf_charge]    ;# base of charge[] 	
-	movss xmm3, [esi + eax*4]		
-	movss xmm6, [esi + ebx*4]
-	unpcklps xmm3, xmm6 ;# constant 00001000 ;# xmm3(0,1) has the charges 
-
-	mulps  xmm2, [esp + nb400nf_iq]
-	mulps  xmm3, xmm2
-	movaps [esp + nb400nf_qq], xmm3
-
-	mov edi, [ebp + nb400nf_pos]	
-	
-	lea   eax, [eax + eax*2]
-	lea   ebx, [ebx + ebx*2]
-	;# move coordinates to xmm0-xmm2 
-	movlps xmm1, [edi + eax*4]
-	movss xmm2, [edi + eax*4 + 8]	
-	movhps xmm1, [edi + ebx*4]
-	movss xmm0, [edi + ebx*4 + 8]	
-
-	movlhps xmm3, xmm7
-	
-	shufps xmm2, xmm0, 0
-	
-	movaps xmm0, xmm1
-
-	shufps xmm2, xmm2, 136  ;# constant 10001000
-	
-	shufps xmm0, xmm0, 136  ;# constant 10001000
-	shufps xmm1, xmm1, 221  ;# constant 11011101
-			
-	mov    edi, [ebp + nb400nf_faction]
-	;# move ix-iz to xmm4-xmm6 
-	xorps   xmm7, xmm7
-	
-	movaps xmm4, [esp + nb400nf_ix]
-	movaps xmm5, [esp + nb400nf_iy]
-	movaps xmm6, [esp + nb400nf_iz]
-
-	;# calc dr 
-	subps xmm4, xmm0
-	subps xmm5, xmm1
-	subps xmm6, xmm2
-
-	;# square it 
-	mulps xmm4,xmm4
-	mulps xmm5,xmm5
-	mulps xmm6,xmm6
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [esp + nb400nf_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [esp + nb400nf_half]
-	subps xmm1, xmm5	;# constant 30-rsq*lu*lu 
-	mulps xmm1, xmm2	
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r 
-	mulps xmm4, [esp + nb400nf_gbscale]
-
-	cvttps2pi mm6, xmm4     ;# mm6 contain lu indices 
-	cvtpi2ps xmm6, mm6
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-
-	pslld mm6, 2
-
-	mov  esi, [ebp + nb400nf_GBtab]
-	movd ecx, mm6
-	psrlq mm6, 32
-	movd edx, mm6
-
-	;# load coulomb table
-	movaps xmm4, [esi + ecx*4]
-	movaps xmm7, [esi + edx*4]
-	;# transpose the two rows; xmm6 starts as a copy of the first row
-	movaps xmm6, xmm4
-	unpcklps xmm4, xmm7  	;# Y1 Y2 F1 F2 
-	unpckhps xmm6, xmm7     ;# G1 G2 H1 H2
-	movhlps  xmm5, xmm4    	;# F1 F2 
-	movhlps  xmm7, xmm6     ;# H1 H2
-	;# coulomb table ready, in xmm4-xmm7  	
-
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	movaps xmm3, [esp + nb400nf_qq]
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, xmm3 ;# vcoul=qq*VV  
-	addps  xmm5, [esp + nb400nf_vctot]
-	movaps [esp + nb400nf_vctot], xmm5 
-
-.nb400nf_checksingle:				
-	mov   edx, [esp + nb400nf_innerk]
-	and   edx, 1
-	jnz    .nb400nf_dosingle
-	jmp    .nb400nf_updateouterdata
-.nb400nf_dosingle:
-	mov esi, [ebp + nb400nf_charge]
-	mov edx, [ebp + nb400nf_invsqrta]
-	mov edi, [ebp + nb400nf_pos]
-	mov   ecx, [esp + nb400nf_innerjjnr]
-	mov   eax, [ecx]	
-	xorps  xmm2, xmm2
-	movaps xmm6, xmm2
-	movss xmm2, [edx + eax*4]	;# isa2
-	mulss xmm2, [esp + nb400nf_isai]
-	movss [esp + nb400nf_isaprod], xmm2	
-	movss xmm1, xmm2
-	mulss xmm1, [esp + nb400nf_gbtsc]
-	movss [esp + nb400nf_gbscale], xmm1	
-	
-	mulss  xmm2, [esp + nb400nf_iq]
-	movss xmm6, [esi + eax*4]	;# xmm6(0) has the charge 	
-	mulss  xmm6, xmm2
-	movss [esp + nb400nf_qq], xmm6
-		
-	lea   eax, [eax + eax*2]
-	
-	;# move coordinates to xmm0-xmm2 
-	movss xmm0, [edi + eax*4]	
-	movss xmm1, [edi + eax*4 + 4]	
-	movss xmm2, [edi + eax*4 + 8]	 
-	
-	movss xmm4, [esp + nb400nf_ix]
-	movss xmm5, [esp + nb400nf_iy]
-	movss xmm6, [esp + nb400nf_iz]
-
-	;# calc dr 
-	subss xmm4, xmm0
-	subss xmm5, xmm1
-	subss xmm6, xmm2
-
-	;# square it 
-	mulss xmm4,xmm4
-	mulss xmm5,xmm5
-	mulss xmm6,xmm6
-	addss xmm4, xmm5
-	addss xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtss xmm5, xmm4
-	;# lookup seed in xmm5 
-	movss xmm2, xmm5
-	mulss xmm5, xmm5
-	movss xmm1, [esp + nb400nf_three]
-	mulss xmm5, xmm4	;# rsq*lu*lu 			
-	movss xmm0, [esp + nb400nf_half]
-	subss xmm1, xmm5	;# constant 30-rsq*lu*lu 
-	mulss xmm1, xmm2	
-	mulss xmm0, xmm1	;# xmm0=rinv 
-
-	mulss xmm4, xmm0	;# xmm4=r 
-	mulss xmm4, [esp + nb400nf_gbscale]
-
-	cvttss2si ebx, xmm4     ;# mm6 contain lu indices 
-	cvtsi2ss xmm6, ebx
-	subss xmm4, xmm6	
-	movss xmm1, xmm4	;# xmm1=eps 
-	movss xmm2, xmm1	
-	mulss  xmm2, xmm2	;# xmm2=eps2 
-
-	shl  ebx, 2
-
-	mov  esi, [ebp + nb400nf_GBtab]
-
-	movaps xmm4, [esi + ebx*4]	
-	movhlps xmm6, xmm4
-	movaps xmm5, xmm4
-	movaps xmm7, xmm6
-	shufps xmm5, xmm5, 1
-	shufps xmm7, xmm7, 1
-	;# table ready in xmm4-xmm7 
-
-	mulss  xmm6, xmm1	;# xmm6=Geps 
-	mulss  xmm7, xmm2	;# xmm7=Heps2 
-	addss  xmm5, xmm6
-	addss  xmm5, xmm7	;# xmm5=Fp 	
-	movss xmm3, [esp + nb400nf_qq]
-	mulss  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addss  xmm5, xmm4 ;# xmm5=VV 
-	mulss  xmm5, xmm3 ;# vcoul=qq*VV  
-	addss  xmm5, [esp + nb400nf_vctot]
-	movss [esp + nb400nf_vctot], xmm5 
-.nb400nf_updateouterdata:
-	;# get n from stack
-	mov esi, [esp + nb400nf_n]
-        ;# get group index for i particle 
-        mov   edx, [ebp + nb400nf_gid]      	;# base of gid[]
-        mov   edx, [edx + esi*4]		;# ggid=gid[n]
-
-	;# accumulate total potential energy and update it 
-	movaps xmm7, [esp + nb400nf_vctot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-
-	;# add earlier value from mem 
-	mov   eax, [ebp + nb400nf_Vc]
-	addss xmm7, [eax + edx*4] 
-	;# move back to mem 
-	movss [eax + edx*4], xmm7 
-	
-        ;# finish if last 
-        mov ecx, [esp + nb400nf_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb400nf_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [esp + nb400nf_n], esi
-        jmp .nb400nf_outer
-.nb400nf_outerend:
-        ;# check if more outer neighborlists remain
-        mov   ecx, [esp + nb400nf_nri]
-	;# esi already loaded with n above
-        sub   ecx, esi
-        jz .nb400nf_end
-        ;# non-zero, do one more workunit
-        jmp   .nb400nf_threadloop
-.nb400nf_end:
-	emms
-
-	mov eax, [esp + nb400nf_nouter]
-	mov ebx, [esp + nb400nf_ninner]
-	mov ecx, [ebp + nb400nf_outeriter]
-	mov edx, [ebp + nb400nf_inneriter]
-	mov [ecx], eax
-	mov [edx], ebx
-
-	mov eax, [esp + nb400nf_salign]
-	add esp, eax
-	add esp, 236
-	pop edi
-	pop esi
-    	pop edx
-    	pop ecx
-    	pop ebx
-    	pop eax
-	leave
-	ret
diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel400_ia32_sse.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel400_ia32_sse.s
deleted file mode 100644
index 5db96cb9c9..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel400_ia32_sse.s
+++ /dev/null
@@ -1,1701 +0,0 @@
-##
-##
-## Gromacs 4.0                         Copyright (c) 1991-2003 
-## David van der Spoel, Erik Lindahl
-##
-## This program is free software; you can redistribute it and/or
-## modify it under the terms of the GNU General Public License
-## as published by the Free Software Foundation; either version 2
-## of the License, or (at your option) any later version.
-##
-## To help us fund GROMACS development, we humbly ask that you cite
-## the research papers on the package. Check out http://www.gromacs.org
-## 
-## And Hey:
-## Gnomes, ROck Monsters And Chili Sauce
-##
-
-
-
-.globl nb_kernel400_ia32_sse
-.globl _nb_kernel400_ia32_sse
-nb_kernel400_ia32_sse:  
-_nb_kernel400_ia32_sse: 
-.set nb400_p_nri, 8
-.set nb400_iinr, 12
-.set nb400_jindex, 16
-.set nb400_jjnr, 20
-.set nb400_shift, 24
-.set nb400_shiftvec, 28
-.set nb400_fshift, 32
-.set nb400_gid, 36
-.set nb400_pos, 40
-.set nb400_faction, 44
-.set nb400_charge, 48
-.set nb400_p_facel, 52
-.set nb400_argkrf, 56
-.set nb400_argcrf, 60
-.set nb400_Vc, 64
-.set nb400_type, 68
-.set nb400_p_ntype, 72
-.set nb400_vdwparam, 76
-.set nb400_Vvdw, 80
-.set nb400_p_tabscale, 84
-.set nb400_VFtab, 88
-.set nb400_invsqrta, 92
-.set nb400_dvda, 96
-.set nb400_p_gbtabscale, 100
-.set nb400_GBtab, 104
-.set nb400_p_nthreads, 108
-.set nb400_count, 112
-.set nb400_mtx, 116
-.set nb400_outeriter, 120
-.set nb400_inneriter, 124
-.set nb400_work, 128
-        ## stack offsets for local variables  
-        ## bottom of stack is cache-aligned for sse use 
-.set nb400_ix, 0
-.set nb400_iy, 16
-.set nb400_iz, 32
-.set nb400_iq, 48
-.set nb400_dx, 64
-.set nb400_dy, 80
-.set nb400_dz, 96
-.set nb400_two, 112
-.set nb400_gbtsc, 128
-.set nb400_qq, 144
-.set nb400_r, 160
-.set nb400_vctot, 176
-.set nb400_fix, 192
-.set nb400_fiy, 208
-.set nb400_fiz, 224
-.set nb400_half, 240
-.set nb400_three, 256
-.set nb400_isai, 272
-.set nb400_isaprod, 288
-.set nb400_dvdasum, 304
-.set nb400_gbscale, 320
-.set nb400_is3, 336
-.set nb400_ii3, 340
-.set nb400_ii, 344
-.set nb400_innerjjnr, 348
-.set nb400_innerk, 352
-.set nb400_n, 356
-.set nb400_nn1, 360
-.set nb400_jnra, 364
-.set nb400_jnrb, 368
-.set nb400_jnrc, 372
-.set nb400_jnrd, 376
-.set nb400_nri, 380
-.set nb400_facel, 384
-.set nb400_nouter, 388
-.set nb400_ninner, 392
-.set nb400_salign, 396
-        pushl %ebp
-        movl %esp,%ebp
-        pushl %eax
-        pushl %ebx
-        pushl %ecx
-        pushl %edx
-        pushl %esi
-        pushl %edi
-        subl $400,%esp          ## local stack space 
-        movl %esp,%eax
-        andl $0xf,%eax
-        subl %eax,%esp
-        movl %eax,nb400_salign(%esp)
-
-        emms
-
-        ## Move args passed by reference to stack
-        movl nb400_p_nri(%ebp),%ecx
-        movl nb400_p_facel(%ebp),%esi
-        movl (%ecx),%ecx
-        movl (%esi),%esi
-        movl %ecx,nb400_nri(%esp)
-        movl %esi,nb400_facel(%esp)
-
-        ## zero iteration counters
-        movl $0,%eax
-        movl %eax,nb400_nouter(%esp)
-        movl %eax,nb400_ninner(%esp)
-
-
-        movl nb400_p_gbtabscale(%ebp),%eax
-        movss (%eax),%xmm3
-        shufps $0,%xmm3,%xmm3
-        movaps %xmm3,nb400_gbtsc(%esp)
-
-        ## create constant floating-point factors on stack
-        movl $0x3f000000,%eax   ## constant 0.5 in IEEE (hex)
-        movl %eax,nb400_half(%esp)
-        movss nb400_half(%esp),%xmm1
-        shufps $0,%xmm1,%xmm1  ## splat to all elements
-        movaps %xmm1,%xmm2
-        addps  %xmm2,%xmm2      ## constant 1.0
-        movaps %xmm2,%xmm3
-        addps  %xmm2,%xmm2      ## constant 2.0
-        addps  %xmm2,%xmm3      ## constant 3.0
-        movaps %xmm1,nb400_half(%esp)
-        movaps %xmm2,nb400_two(%esp)
-        movaps %xmm3,nb400_three(%esp)
-
-_nb_kernel400_ia32_sse.nb400_threadloop: ## grab the next outer-list index from the counter at nb400_count
-        movl  nb400_count(%ebp),%esi            ## pointer to sync counter
-        movl  (%esi),%eax                       ## eax = current value of *count
-_nb_kernel400_ia32_sse.nb400_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+1
-        lock                                    ## prefix: the cmpxchg below is performed atomically
-        cmpxchgl %ebx,(%esi)                    ## write nn1 to *counter,
-                                                ## if it has not changed since we read it,
-                                                ## otherwise reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel400_ia32_sse.nb400_spinlock ## ZF clear: the exchange failed, retry with the new eax
-
-        ## clamp nn1 to nri
-        movl nb400_nri(%esp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx                          ## flags from nri-nn1
-        cmovlel %edx,%ebx                       ## if(nri-nn1<=0) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb400_n(%esp)
-        movl %ebx,nb400_nn1(%esp)
-        subl %eax,%ebx                          ## calc number of outer lists (nn1-nn0)
-        movl %eax,%esi                          ## copy n to esi (movl leaves the flags of the subl intact)
-        jg  _nb_kernel400_ia32_sse.nb400_outerstart ## nn1-nn0 > 0: there is work to do
-        jmp _nb_kernel400_ia32_sse.nb400_end        ## otherwise leave the kernel
-
-_nb_kernel400_ia32_sse.nb400_outerstart: 
-        ## ebx contains number of outer iterations
-        addl nb400_nouter(%esp),%ebx
-        movl %ebx,nb400_nouter(%esp)
-
-_nb_kernel400_ia32_sse.nb400_outer: 
-        movl  nb400_shift(%ebp),%eax        ## eax = pointer into shift[] 
-        movl  (%eax,%esi,4),%ebx                ## ebx=shift[n] 
-
-        leal  (%ebx,%ebx,2),%ebx    ## ebx=3*is 
-        movl  %ebx,nb400_is3(%esp)      ## store is3 
-
-        movl  nb400_shiftvec(%ebp),%eax     ## eax = base of shiftvec[] 
-
-        movss (%eax,%ebx,4),%xmm0
-        movss 4(%eax,%ebx,4),%xmm1
-        movss 8(%eax,%ebx,4),%xmm2
-
-        movl  nb400_iinr(%ebp),%ecx         ## ecx = pointer into iinr[]        
-        movl  (%ecx,%esi,4),%ebx            ## ebx =ii 
-        movl  %ebx,nb400_ii(%esp)
-
-        movl  nb400_charge(%ebp),%edx
-        movss (%edx,%ebx,4),%xmm3
-        mulss nb400_facel(%esp),%xmm3
-        shufps $0,%xmm3,%xmm3
-
-
-        movl  nb400_invsqrta(%ebp),%edx         ## load invsqrta[ii]
-        movss (%edx,%ebx,4),%xmm4
-        shufps $0,%xmm4,%xmm4
-
-        leal  (%ebx,%ebx,2),%ebx        ## ebx = 3*ii=ii3 
-        movl  nb400_pos(%ebp),%eax      ## eax = base of pos[]  
-
-        addss (%eax,%ebx,4),%xmm0
-        addss 4(%eax,%ebx,4),%xmm1
-        addss 8(%eax,%ebx,4),%xmm2
-
-        movaps %xmm3,nb400_iq(%esp)
-        movaps %xmm4,nb400_isai(%esp)
-
-        shufps $0,%xmm0,%xmm0
-        shufps $0,%xmm1,%xmm1
-        shufps $0,%xmm2,%xmm2
-
-        movaps %xmm0,nb400_ix(%esp)
-        movaps %xmm1,nb400_iy(%esp)
-        movaps %xmm2,nb400_iz(%esp)
-
-        movl  %ebx,nb400_ii3(%esp)
-
-        ## clear vctot and i forces 
-        xorps %xmm4,%xmm4
-        movaps %xmm4,nb400_vctot(%esp)
-        movaps %xmm4,nb400_dvdasum(%esp)
-        movaps %xmm4,nb400_fix(%esp)
-        movaps %xmm4,nb400_fiy(%esp)
-        movaps %xmm4,nb400_fiz(%esp)
-
-        movl  nb400_jindex(%ebp),%eax
-        movl  (%eax,%esi,4),%ecx             ## jindex[n] 
-        movl  4(%eax,%esi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movl  nb400_pos(%ebp),%esi
-        movl  nb400_faction(%ebp),%edi
-        movl  nb400_jjnr(%ebp),%eax
-        shll  $2,%ecx
-        addl  %ecx,%eax
-        movl  %eax,nb400_innerjjnr(%esp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $4,%edx
-        addl  nb400_ninner(%esp),%ecx
-        movl  %ecx,nb400_ninner(%esp)
-        addl  $0,%edx
-        movl  %edx,nb400_innerk(%esp)      ## number of innerloop atoms 
-        jge   _nb_kernel400_ia32_sse.nb400_unroll_loop
-        jmp   _nb_kernel400_ia32_sse.nb400_finish_inner
-_nb_kernel400_ia32_sse.nb400_unroll_loop: 
-        ## quad-unroll innerloop here 
-        movl  nb400_innerjjnr(%esp),%edx       ## pointer to jjnr[k] 
-        movl  (%edx),%eax
-        movl  4(%edx),%ebx
-        movl  8(%edx),%ecx
-        movl  12(%edx),%edx           ## eax-edx=jnr1-4 
-        addl $16,nb400_innerjjnr(%esp)             ## advance pointer (unrolled 4) 
-
-        ## load isaj
-        movl nb400_invsqrta(%ebp),%esi
-        movss (%esi,%eax,4),%xmm3
-        movss (%esi,%ecx,4),%xmm4
-        movss (%esi,%ebx,4),%xmm6
-        movss (%esi,%edx,4),%xmm7
-        movaps nb400_isai(%esp),%xmm2
-        shufps $0,%xmm6,%xmm3
-        shufps $0,%xmm7,%xmm4
-        shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all isaj in xmm3 
-        mulps  %xmm3,%xmm2
-
-        movaps %xmm2,nb400_isaprod(%esp)
-        movaps %xmm2,%xmm1
-        mulps nb400_gbtsc(%esp),%xmm1
-        movaps %xmm1,nb400_gbscale(%esp)
-
-        movl nb400_charge(%ebp),%esi     ## base of charge[] 
-
-        movss (%esi,%eax,4),%xmm3
-        movss (%esi,%ecx,4),%xmm4
-        movss (%esi,%ebx,4),%xmm6
-        movss (%esi,%edx,4),%xmm7
-
-        mulps nb400_iq(%esp),%xmm2
-        shufps $0,%xmm6,%xmm3
-        shufps $0,%xmm7,%xmm4
-        shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all charges in xmm3  
-        mulps  %xmm2,%xmm3
-        movaps %xmm3,nb400_qq(%esp)
-
-
-        movl nb400_pos(%ebp),%esi        ## base of pos[] 
-
-        movl %eax,nb400_jnra(%esp)
-        movl %ebx,nb400_jnrb(%esp)
-        movl %ecx,nb400_jnrc(%esp)
-        movl %edx,nb400_jnrd(%esp)
-
-        leal  (%eax,%eax,2),%eax     ## replace jnr with j3 
-        leal  (%ebx,%ebx,2),%ebx
-        leal  (%ecx,%ecx,2),%ecx
-        leal  (%edx,%edx,2),%edx
-
-        ## move four coordinates to xmm0-xmm2   
-
-        movlps (%esi,%eax,4),%xmm4
-        movlps (%esi,%ecx,4),%xmm5
-        movss 8(%esi,%eax,4),%xmm2
-        movss 8(%esi,%ecx,4),%xmm6
-
-        movhps (%esi,%ebx,4),%xmm4
-        movhps (%esi,%edx,4),%xmm5
-
-        movss 8(%esi,%ebx,4),%xmm0
-        movss 8(%esi,%edx,4),%xmm1
-
-        shufps $0,%xmm0,%xmm2
-        shufps $0,%xmm1,%xmm6
-
-        movaps %xmm4,%xmm0
-        movaps %xmm4,%xmm1
-
-        shufps $136,%xmm6,%xmm2 ## constant 10001000
-
-        shufps $136,%xmm5,%xmm0 ## constant 10001000
-        shufps $221,%xmm5,%xmm1 ## constant 11011101            
-
-        ## move ix-iz to xmm4-xmm6 
-        movaps nb400_ix(%esp),%xmm4
-        movaps nb400_iy(%esp),%xmm5
-        movaps nb400_iz(%esp),%xmm6
-
-        ## calc dr 
-        subps %xmm0,%xmm4
-        subps %xmm1,%xmm5
-        subps %xmm2,%xmm6
-
-        ## store dr 
-        movaps %xmm4,nb400_dx(%esp)
-        movaps %xmm5,nb400_dy(%esp)
-        movaps %xmm6,nb400_dz(%esp)
-        ## square it 
-        mulps %xmm4,%xmm4
-        mulps %xmm5,%xmm5
-        mulps %xmm6,%xmm6
-        addps %xmm5,%xmm4
-        addps %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb400_three(%esp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb400_half(%esp),%xmm0
-        subps %xmm5,%xmm1       ## constant 30-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r
-        movaps %xmm4,nb400_r(%esp)
-        mulps nb400_gbscale(%esp),%xmm4
-
-        movhlps %xmm4,%xmm5
-        cvttps2pi %xmm4,%mm6
-        cvttps2pi %xmm5,%mm7    ## mm6/mm7 contain lu indices 
-        cvtpi2ps %mm6,%xmm6
-        cvtpi2ps %mm7,%xmm5
-        movlhps %xmm5,%xmm6
-        subps %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-        pslld $2,%mm6
-        pslld $2,%mm7
-
-        movd %eax,%mm0
-        movd %ebx,%mm1
-        movd %ecx,%mm2
-        movd %edx,%mm3
-
-        movl nb400_GBtab(%ebp),%esi
-        movd %mm6,%eax
-        psrlq $32,%mm6
-        movd %mm7,%ecx
-        psrlq $32,%mm7
-        movd %mm6,%ebx
-        movd %mm7,%edx
-
-        ## load coulomb table
-        movaps (%esi,%eax,4),%xmm4
-        movaps (%esi,%ebx,4),%xmm5
-        movaps (%esi,%ecx,4),%xmm6
-        movaps (%esi,%edx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm6,%xmm3
-        shufps $0xEE,%xmm7,%xmm3
-        shufps $0x44,%xmm7,%xmm6
-        movaps %xmm4,%xmm7
-        shufps $0xEE,%xmm5,%xmm7
-        shufps $0x44,%xmm5,%xmm4
-        movaps %xmm4,%xmm5
-        shufps $0xDD,%xmm6,%xmm5
-        shufps $0x88,%xmm6,%xmm4
-        movaps %xmm7,%xmm6
-        shufps $0x88,%xmm3,%xmm6
-        shufps $0xDD,%xmm3,%xmm7
-        ## coulomb table ready, in xmm4-xmm7    
-
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        mulps  nb400_two(%esp),%xmm7    ## two*Heps2 
-        movaps nb400_qq(%esp),%xmm3
-        addps  %xmm6,%xmm7
-        addps  %xmm5,%xmm7 ## xmm7=FF 
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulps  %xmm7,%xmm3 ## fijC=FF*qq 
-        ## at this point xmm5 contains vcoul and xmm3 fijC
-
-        ## get jnr from stack
-        movl nb400_jnra(%esp),%eax
-        movl nb400_jnrb(%esp),%ebx
-        movl nb400_jnrc(%esp),%ecx
-        movl nb400_jnrd(%esp),%edx
-
-        movl nb400_dvda(%ebp),%esi
-
-        ## Calculate dVda
-        xorps  %xmm7,%xmm7
-        mulps nb400_gbscale(%esp),%xmm3
-        movaps %xmm3,%xmm6
-        mulps  nb400_r(%esp),%xmm6
-        addps  %xmm5,%xmm6
-        addps  nb400_vctot(%esp),%xmm5
-        movaps %xmm5,nb400_vctot(%esp)
-
-        ## xmm6=(vcoul+fijC*r)
-        subps  %xmm6,%xmm7
-        movaps %xmm7,%xmm6
-
-        ## update dvdasum
-        addps  nb400_dvdasum(%esp),%xmm7
-        movaps %xmm7,nb400_dvdasum(%esp)
-
-        ## update j atoms dvdaj
-        movhlps %xmm6,%xmm7
-        movaps  %xmm6,%xmm5
-        movaps  %xmm7,%xmm4
-        shufps $0x1,%xmm5,%xmm5
-        shufps $0x1,%xmm4,%xmm4
-        ## xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4
-        addss  (%esi,%eax,4),%xmm6
-        addss  (%esi,%ebx,4),%xmm5
-        addss  (%esi,%ecx,4),%xmm7
-        addss  (%esi,%edx,4),%xmm4
-        movss  %xmm6,(%esi,%eax,4)
-        movss  %xmm5,(%esi,%ebx,4)
-        movss  %xmm7,(%esi,%ecx,4)
-        movss  %xmm4,(%esi,%edx,4)
-
-        xorps  %xmm4,%xmm4
-        mulps %xmm0,%xmm3
-        subps  %xmm3,%xmm4
-
-        movaps nb400_dx(%esp),%xmm0
-        movaps nb400_dy(%esp),%xmm1
-        movaps nb400_dz(%esp),%xmm2
-
-        movd %mm0,%eax
-        movd %mm1,%ebx
-        movd %mm2,%ecx
-        movd %mm3,%edx
-
-        movl   nb400_faction(%ebp),%edi
-        mulps  %xmm4,%xmm0
-        mulps  %xmm4,%xmm1
-        mulps  %xmm4,%xmm2
-        ## xmm0-xmm2 contains tx-tz (partial force) 
-        ## now update f_i 
-        movaps nb400_fix(%esp),%xmm3
-        movaps nb400_fiy(%esp),%xmm4
-        movaps nb400_fiz(%esp),%xmm5
-        addps  %xmm0,%xmm3
-        addps  %xmm1,%xmm4
-        addps  %xmm2,%xmm5
-        movaps %xmm3,nb400_fix(%esp)
-        movaps %xmm4,nb400_fiy(%esp)
-        movaps %xmm5,nb400_fiz(%esp)
-        ## the fj's - start by accumulating x & y forces from memory 
-        movlps (%edi,%eax,4),%xmm4
-        movlps (%edi,%ecx,4),%xmm6
-        movhps (%edi,%ebx,4),%xmm4
-        movhps (%edi,%edx,4),%xmm6
-
-        movaps %xmm4,%xmm3
-        shufps $136,%xmm6,%xmm3 ## constant 10001000
-        shufps $221,%xmm6,%xmm4 ## constant 11011101                          
-
-        ## now xmm3/xmm4 contain fjx/fjy (fjz is updated separately below) 
-        subps  %xmm0,%xmm3
-        subps  %xmm1,%xmm4
-
-        ## unpack them back so we can store them - first x & y in xmm3/xmm4 
-
-        movaps %xmm3,%xmm6
-        unpcklps %xmm4,%xmm6
-        unpckhps %xmm4,%xmm3
-        ## xmm6(l)=x & y for j1, (h) for j2 
-        ## xmm3(l)=x & y for j3, (h) for j4 
-        movlps %xmm6,(%edi,%eax,4)
-        movlps %xmm3,(%edi,%ecx,4)
-
-        movhps %xmm6,(%edi,%ebx,4)
-        movhps %xmm3,(%edi,%edx,4)
-
-        ## and the z forces 
-        movss  8(%edi,%eax,4),%xmm4
-        movss  8(%edi,%ebx,4),%xmm5
-        movss  8(%edi,%ecx,4),%xmm6
-        movss  8(%edi,%edx,4),%xmm7
-        subss  %xmm2,%xmm4
-        shufps $229,%xmm2,%xmm2 ## constant 11100101
-        subss  %xmm2,%xmm5
-        shufps $234,%xmm2,%xmm2 ## constant 11101010
-        subss  %xmm2,%xmm6
-        shufps $255,%xmm2,%xmm2 ## constant 11111111
-        subss  %xmm2,%xmm7
-        movss  %xmm4,8(%edi,%eax,4)
-        movss  %xmm5,8(%edi,%ebx,4)
-        movss  %xmm6,8(%edi,%ecx,4)
-        movss  %xmm7,8(%edi,%edx,4)
-
-        ## should we do one more iteration? 
-        subl $4,nb400_innerk(%esp)
-        jl    _nb_kernel400_ia32_sse.nb400_finish_inner
-        jmp   _nb_kernel400_ia32_sse.nb400_unroll_loop
-_nb_kernel400_ia32_sse.nb400_finish_inner: ## deal with the j particles left over after the 4-way unrolled loop
-        ## check if at least two particles remain 
-        addl $4,nb400_innerk(%esp)              ## undo the preceding subtraction of 4: innerk = number of j particles left
-        movl  nb400_innerk(%esp),%edx
-        andl  $2,%edx                           ## bit 1 of the remainder is set when 2 or 3 are left
-        jnz   _nb_kernel400_ia32_sse.nb400_dopair
-        jmp   _nb_kernel400_ia32_sse.nb400_checksingle
-_nb_kernel400_ia32_sse.nb400_dopair: ## process two j particles at once (lanes 0 and 1 of the xmm registers)
-        movl  nb400_innerjjnr(%esp),%ecx        ## ecx = current position in jjnr[]
-
-        movl  (%ecx),%eax                       ## eax = jnr1
-        movl  4(%ecx),%ebx                      ## ebx = jnr2
-        addl $8,nb400_innerjjnr(%esp)           ## advance the pointer past these two indices
-
-        xorps %xmm2,%xmm2
-        movaps %xmm2,%xmm6
-
-        ## load isaj
-        movl nb400_invsqrta(%ebp),%esi
-        movss (%esi,%eax,4),%xmm2
-        movss (%esi,%ebx,4),%xmm3
-        unpcklps %xmm3,%xmm2    ## isaj in xmm2(0,1)
-        mulps  nb400_isai(%esp),%xmm2           ## xmm2 = isai*isaj
-        movaps %xmm2,nb400_isaprod(%esp)
-        movaps %xmm2,%xmm1
-        mulps nb400_gbtsc(%esp),%xmm1           ## xmm1 = isaprod*gbtabscale
-        movaps %xmm1,nb400_gbscale(%esp)
-
-        movl nb400_charge(%ebp),%esi     ## base of charge[]    
-        movss (%esi,%eax,4),%xmm3
-        movss (%esi,%ebx,4),%xmm6
-        unpcklps %xmm6,%xmm3 ## xmm3(0,1) has the charges 
-
-        mulps  nb400_iq(%esp),%xmm2             ## xmm2 = iq*isaprod
-        mulps  %xmm2,%xmm3                      ## xmm3 = qq = iq*isaprod*qj
-        movaps %xmm3,nb400_qq(%esp)
-
-        movl nb400_pos(%ebp),%edi
-
-        movd  %eax,%mm0         ## copy jnr to mm0/mm1
-        movd  %ebx,%mm1
-
-        leal  (%eax,%eax,2),%eax                ## replace jnr with j3 = 3*jnr
-        leal  (%ebx,%ebx,2),%ebx
-        ## move coordinates to xmm0-xmm2 
-        movlps (%edi,%eax,4),%xmm1              ## low half:  x1 y1
-        movss 8(%edi,%eax,4),%xmm2              ## z1
-        movhps (%edi,%ebx,4),%xmm1              ## high half: x2 y2
-        movss 8(%edi,%ebx,4),%xmm0              ## z2
-
-        movlhps %xmm7,%xmm3                     ## NOTE(review): xmm3 is reloaded from nb400_qq before its next use, so this looks like a leftover
-
-        shufps $0,%xmm0,%xmm2                   ## xmm2 = z1 z1 z2 z2
-
-        movaps %xmm1,%xmm0
-
-        shufps $136,%xmm2,%xmm2 ## constant 10001000: xmm2 = z1 z2 z1 z2
-
-        shufps $136,%xmm0,%xmm0 ## constant 10001000: xmm0 = x1 x2 x1 x2
-        shufps $221,%xmm1,%xmm1 ## constant 11011101: xmm1 = y1 y2 y1 y2
-
-        movl   nb400_faction(%ebp),%edi         ## edi = faction argument, used as base for the fj updates below
-        ## move ix-iz to xmm4-xmm6 
-        xorps   %xmm7,%xmm7
-
-        movaps nb400_ix(%esp),%xmm4
-        movaps nb400_iy(%esp),%xmm5
-        movaps nb400_iz(%esp),%xmm6
-
-        ## calc dr 
-        subps %xmm0,%xmm4
-        subps %xmm1,%xmm5
-        subps %xmm2,%xmm6
-
-        ## store dr 
-        movaps %xmm4,nb400_dx(%esp)
-        movaps %xmm5,nb400_dy(%esp)
-        movaps %xmm6,nb400_dz(%esp)
-        ## square it 
-        mulps %xmm4,%xmm4
-        mulps %xmm5,%xmm5
-        mulps %xmm6,%xmm6
-        addps %xmm5,%xmm4
-        addps %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5; refined below as rinv = 0.5*lu*(3.0-rsq*lu*lu)
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb400_three(%esp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb400_half(%esp),%xmm0
-        subps %xmm5,%xmm1       ## 3.0-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r 
-        movaps %xmm4,nb400_r(%esp)
-        mulps nb400_gbscale(%esp),%xmm4         ## xmm4 = r*gbscale, the table coordinate
-
-        cvttps2pi %xmm4,%mm6    ## mm6 = truncated integer lu indices for lanes 0,1 
-        cvtpi2ps %mm6,%xmm6                     ## indices back to float
-        subps %xmm6,%xmm4                       ## keep the fractional part
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-
-        pslld $2,%mm6                           ## index*4; combined with the *4 address scale below that is 4 floats per table point
-
-        movl nb400_GBtab(%ebp),%esi
-        movd %mm6,%ecx
-        psrlq $32,%mm6
-        movd %mm6,%edx
-
-        ## load coulomb table
-        movaps (%esi,%ecx,4),%xmm4
-        movaps (%esi,%edx,4),%xmm7
-        ## transpose the two rows; xmm6 starts as a copy of the first row
-        movaps %xmm4,%xmm6
-        unpcklps %xmm7,%xmm4    ## Y1 Y2 F1 F2 
-        unpckhps %xmm7,%xmm6    ## G1 G2 H1 H2
-        movhlps  %xmm4,%xmm5    ## F1 F2 
-        movhlps  %xmm6,%xmm7    ## H1 H2
-        ## coulomb table ready, in xmm4-xmm7    
-
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        mulps  nb400_two(%esp),%xmm7    ## two*Heps2 
-        movaps nb400_qq(%esp),%xmm3
-        addps  %xmm6,%xmm7
-        addps  %xmm5,%xmm7 ## xmm7=FF 
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulps  %xmm7,%xmm3 ## fijC=FF*qq 
-        ## at this point xmm5 contains vcoul and xmm3 fijC
-
-        ## get jnr from mm0/mm1
-        movd %mm0,%ecx
-        movd %mm1,%edx
-
-        movl nb400_dvda(%ebp),%esi
-
-        ## Calculate dVda
-        xorps  %xmm7,%xmm7
-        mulps nb400_gbscale(%esp),%xmm3         ## xmm3 = fijC*gbscale
-        movaps %xmm3,%xmm6
-        mulps  nb400_r(%esp),%xmm6
-        addps  %xmm5,%xmm6
-        addps  nb400_vctot(%esp),%xmm5
-        movaps %xmm5,nb400_vctot(%esp)
-
-        ## xmm6=(vcoul+fijC*gbscale*r)
-        subps  %xmm6,%xmm7                      ## xmm7 = -xmm6
-        movaps %xmm7,%xmm6
-
-        ## update dvdasum
-        addps  nb400_dvdasum(%esp),%xmm7
-        movaps %xmm7,nb400_dvdasum(%esp)
-
-        ## update j atoms dvdaj
-        movaps %xmm6,%xmm7
-        shufps $0x1,%xmm7,%xmm7                 ## lane 1 (second j particle) moved to lane 0
-        addss  (%esi,%ecx,4),%xmm6
-        addss  (%esi,%edx,4),%xmm7
-        movss  %xmm6,(%esi,%ecx,4)
-        movss  %xmm7,(%esi,%edx,4)
-
-        xorps  %xmm4,%xmm4
-        mulps %xmm0,%xmm3                       ## xmm3 = fijC*gbscale*rinv
-        subps  %xmm3,%xmm4                      ## xmm4 = -xmm3, the scalar applied to dx,dy,dz
-
-        movaps nb400_dx(%esp),%xmm0
-        movaps nb400_dy(%esp),%xmm1
-        movaps nb400_dz(%esp),%xmm2
-
-        mulps  %xmm4,%xmm0
-        mulps  %xmm4,%xmm1
-        mulps  %xmm4,%xmm2
-        ## xmm0-xmm2 contains tx-tz (partial force) 
-        ## now update f_i 
-        movaps nb400_fix(%esp),%xmm3
-        movaps nb400_fiy(%esp),%xmm4
-        movaps nb400_fiz(%esp),%xmm5
-        addps  %xmm0,%xmm3
-        addps  %xmm1,%xmm4
-        addps  %xmm2,%xmm5
-        movaps %xmm3,nb400_fix(%esp)
-        movaps %xmm4,nb400_fiy(%esp)
-        movaps %xmm5,nb400_fiz(%esp)
-        ## update the fj's: first j particle (eax = 3*jnr1), lane 0 of tx-tz
-        movss   (%edi,%eax,4),%xmm3
-        movss   4(%edi,%eax,4),%xmm4
-        movss   8(%edi,%eax,4),%xmm5
-        subss   %xmm0,%xmm3
-        subss   %xmm1,%xmm4
-        subss   %xmm2,%xmm5
-        movss   %xmm3,(%edi,%eax,4)
-        movss   %xmm4,4(%edi,%eax,4)
-        movss   %xmm5,8(%edi,%eax,4)
-
-        shufps $225,%xmm0,%xmm0 ## constant 11100001: swap lanes 0 and 1
-        shufps $225,%xmm1,%xmm1 ## constant 11100001
-        shufps $225,%xmm2,%xmm2 ## constant 11100001
-
-        movss   (%edi,%ebx,4),%xmm3             ## second j particle (ebx = 3*jnr2)
-        movss   4(%edi,%ebx,4),%xmm4
-        movss   8(%edi,%ebx,4),%xmm5
-        subss   %xmm0,%xmm3
-        subss   %xmm1,%xmm4
-        subss   %xmm2,%xmm5
-        movss   %xmm3,(%edi,%ebx,4)
-        movss   %xmm4,4(%edi,%ebx,4)
-        movss   %xmm5,8(%edi,%ebx,4)
-
-_nb_kernel400_ia32_sse.nb400_checksingle:       ## is there one last j particle to handle?
-        movl  nb400_innerk(%esp),%edx
-        andl  $1,%edx                           ## bit 0 of the remainder count
-        jnz    _nb_kernel400_ia32_sse.nb400_dosingle
-        jmp    _nb_kernel400_ia32_sse.nb400_updateouterdata
-_nb_kernel400_ia32_sse.nb400_dosingle: 
-        movl nb400_charge(%ebp),%esi
-        movl nb400_invsqrta(%ebp),%edx
-        movl nb400_pos(%ebp),%edi
-        movl  nb400_innerjjnr(%esp),%ecx
-        movl  (%ecx),%eax
-        xorps  %xmm2,%xmm2
-        movaps %xmm2,%xmm6
-        movss (%edx,%eax,4),%xmm2       ## isaj
-        mulss nb400_isai(%esp),%xmm2
-        movss %xmm2,nb400_isaprod(%esp)
-        movss %xmm2,%xmm1
-        mulss nb400_gbtsc(%esp),%xmm1
-        movss %xmm1,nb400_gbscale(%esp)
-
-        mulss  nb400_iq(%esp),%xmm2
-        movss (%esi,%eax,4),%xmm6       ## xmm6(0) has the charge       
-        mulss  %xmm2,%xmm6
-        movss %xmm6,nb400_qq(%esp)
-
-        movd  %eax,%mm0
-        leal  (%eax,%eax,2),%eax
-
-        ## move coordinates to xmm0-xmm2 
-        movss (%edi,%eax,4),%xmm0
-        movss 4(%edi,%eax,4),%xmm1
-        movss 8(%edi,%eax,4),%xmm2
-
-        movss nb400_ix(%esp),%xmm4
-        movss nb400_iy(%esp),%xmm5
-        movss nb400_iz(%esp),%xmm6
-
-        ## calc dr 
-        subss %xmm0,%xmm4
-        subss %xmm1,%xmm5
-        subss %xmm2,%xmm6
-
-        ## store dr 
-        movss %xmm4,nb400_dx(%esp)
-        movss %xmm5,nb400_dy(%esp)
-        movss %xmm6,nb400_dz(%esp)
-        ## square it 
-        mulss %xmm4,%xmm4
-        mulss %xmm5,%xmm5
-        mulss %xmm6,%xmm6
-        addss %xmm5,%xmm4
-        addss %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtss %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movss %xmm5,%xmm2
-        mulss %xmm5,%xmm5
-        movss nb400_three(%esp),%xmm1
-        mulss %xmm4,%xmm5       ## rsq*lu*lu                    
-        movss nb400_half(%esp),%xmm0
-        subss %xmm5,%xmm1       ## constant 30-rsq*lu*lu 
-        mulss %xmm2,%xmm1
-        mulss %xmm1,%xmm0       ## xmm0=rinv 
-
-        mulss %xmm0,%xmm4       ## xmm4=r 
-        movss %xmm4,nb400_r(%esp)
-        mulss nb400_gbscale(%esp),%xmm4
-
-        cvttss2si %xmm4,%ebx    ## mm6 contain lu indices 
-        cvtsi2ss %ebx,%xmm6
-        subss %xmm6,%xmm4
-        movss %xmm4,%xmm1       ## xmm1=eps 
-        movss %xmm1,%xmm2
-        mulss  %xmm2,%xmm2      ## xmm2=eps2 
-
-        shll $2,%ebx
-
-        movl nb400_GBtab(%ebp),%esi
-
-        movaps (%esi,%ebx,4),%xmm4
-        movhlps %xmm4,%xmm6
-        movaps %xmm4,%xmm5
-        movaps %xmm6,%xmm7
-        shufps $1,%xmm5,%xmm5
-        shufps $1,%xmm7,%xmm7
-        ## table ready in xmm4-xmm7 
-
-        mulss  %xmm1,%xmm6      ## xmm6=Geps 
-        mulss  %xmm2,%xmm7      ## xmm7=Heps2 
-        addss  %xmm6,%xmm5
-        addss  %xmm7,%xmm5      ## xmm5=Fp      
-        mulss  nb400_two(%esp),%xmm7    ## two*Heps2 
-        movss nb400_qq(%esp),%xmm3
-        addss  %xmm6,%xmm7
-        addss  %xmm5,%xmm7 ## xmm7=FF 
-        mulss  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addss  %xmm4,%xmm5 ## xmm5=VV 
-        mulss  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulss  %xmm7,%xmm3 ## fijC=FF*qq 
-        ## at this point mm5 contains vcoul and mm3 fijC 
-
-        movd %mm0,%ebx
-        movl nb400_dvda(%ebp),%esi
-
-        ## Calculate dVda
-        xorps %xmm7,%xmm7
-        mulss nb400_gbscale(%esp),%xmm3
-        movaps %xmm3,%xmm6
-        mulss  nb400_r(%esp),%xmm6
-        addss  %xmm5,%xmm6
-        addss  nb400_vctot(%esp),%xmm5
-        movss %xmm5,nb400_vctot(%esp)
-
-        ## xmm6=(vcoul+fijC*r)
-        subps  %xmm6,%xmm7
-        movaps %xmm7,%xmm6
-
-        ## update dvdasum
-        addps  nb400_dvdasum(%esp),%xmm7
-        movaps %xmm7,nb400_dvdasum(%esp)
-
-        ## update j atoms dvdaj
-        addss  (%esi,%ebx,4),%xmm6
-        movss  %xmm6,(%esi,%ebx,4)
-
-        xorps  %xmm4,%xmm4
-        mulss %xmm0,%xmm3
-        subss  %xmm3,%xmm4
-
-        movl   nb400_faction(%ebp),%edi
-
-        movss nb400_dx(%esp),%xmm0
-        movss nb400_dy(%esp),%xmm1
-        movss nb400_dz(%esp),%xmm2
-
-        mulss  %xmm4,%xmm0
-        mulss  %xmm4,%xmm1
-        mulss  %xmm4,%xmm2
-        ## xmm0-xmm2 contains tx-tz (partial force) 
-        ## now update f_i 
-        movss nb400_fix(%esp),%xmm3
-        movss nb400_fiy(%esp),%xmm4
-        movss nb400_fiz(%esp),%xmm5
-        addss  %xmm0,%xmm3
-        addss  %xmm1,%xmm4
-        addss  %xmm2,%xmm5
-        movss %xmm3,nb400_fix(%esp)
-        movss %xmm4,nb400_fiy(%esp)
-        movss %xmm5,nb400_fiz(%esp)
-        ## update fj 
-
-        movss   (%edi,%eax,4),%xmm3
-        movss   4(%edi,%eax,4),%xmm4
-        movss   8(%edi,%eax,4),%xmm5
-        subss   %xmm0,%xmm3
-        subss   %xmm1,%xmm4
-        subss   %xmm2,%xmm5
-        movss   %xmm3,(%edi,%eax,4)
-        movss   %xmm4,4(%edi,%eax,4)
-        movss   %xmm5,8(%edi,%eax,4)
-_nb_kernel400_ia32_sse.nb400_updateouterdata: 
-        movl  nb400_ii3(%esp),%ecx
-        movl  nb400_faction(%ebp),%edi
-        movl  nb400_fshift(%ebp),%esi
-        movl  nb400_is3(%esp),%edx
-
-        ## accumulate i forces in xmm0, xmm1, xmm2 
-        movaps nb400_fix(%esp),%xmm0
-        movaps nb400_fiy(%esp),%xmm1
-        movaps nb400_fiz(%esp),%xmm2
-
-        movhlps %xmm0,%xmm3
-        movhlps %xmm1,%xmm4
-        movhlps %xmm2,%xmm5
-        addps  %xmm3,%xmm0
-        addps  %xmm4,%xmm1
-        addps  %xmm5,%xmm2 ## sum is in 1/2 in xmm0-xmm2 
-
-        movaps %xmm0,%xmm3
-        movaps %xmm1,%xmm4
-        movaps %xmm2,%xmm5
-
-        shufps $1,%xmm3,%xmm3
-        shufps $1,%xmm4,%xmm4
-        shufps $1,%xmm5,%xmm5
-        addss  %xmm3,%xmm0
-        addss  %xmm4,%xmm1
-        addss  %xmm5,%xmm2      ## xmm0-xmm2 has single force in pos0 
-
-        ## increment i force 
-        movss  (%edi,%ecx,4),%xmm3
-        movss  4(%edi,%ecx,4),%xmm4
-        movss  8(%edi,%ecx,4),%xmm5
-        addss  %xmm0,%xmm3
-        addss  %xmm1,%xmm4
-        addss  %xmm2,%xmm5
-        movss  %xmm3,(%edi,%ecx,4)
-        movss  %xmm4,4(%edi,%ecx,4)
-        movss  %xmm5,8(%edi,%ecx,4)
-
-        ## increment fshift force  
-        movss  (%esi,%edx,4),%xmm3
-        movss  4(%esi,%edx,4),%xmm4
-        movss  8(%esi,%edx,4),%xmm5
-        addss  %xmm0,%xmm3
-        addss  %xmm1,%xmm4
-        addss  %xmm2,%xmm5
-        movss  %xmm3,(%esi,%edx,4)
-        movss  %xmm4,4(%esi,%edx,4)
-        movss  %xmm5,8(%esi,%edx,4)
-
-        ## get n from stack
-        movl nb400_n(%esp),%esi
-        ## get group index for i particle 
-        movl  nb400_gid(%ebp),%edx              ## base of gid[]
-        movl  (%edx,%esi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total potential energy and update it 
-        movaps nb400_vctot(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        ## add earlier value from mem 
-        movl  nb400_Vc(%ebp),%eax
-        addss (%eax,%edx,4),%xmm7
-        ## move back to mem 
-        movss %xmm7,(%eax,%edx,4)
-
-        ## accumulate dVda and update it 
-        movaps nb400_dvdasum(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        movl nb400_ii(%esp),%edx
-        movl nb400_dvda(%ebp),%eax
-        addss (%eax,%edx,4),%xmm7
-        movss %xmm7,(%eax,%edx,4)
-
-        ## finish if last 
-        movl nb400_nn1(%esp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx
-        jz _nb_kernel400_ia32_sse.nb400_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb400_n(%esp)
-        jmp _nb_kernel400_ia32_sse.nb400_outer
-_nb_kernel400_ia32_sse.nb400_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb400_nri(%esp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx
-        jz _nb_kernel400_ia32_sse.nb400_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel400_ia32_sse.nb400_threadloop
-_nb_kernel400_ia32_sse.nb400_end: 
-        emms
-
-        movl nb400_nouter(%esp),%eax
-        movl nb400_ninner(%esp),%ebx
-        movl nb400_outeriter(%ebp),%ecx
-        movl nb400_inneriter(%ebp),%edx
-        movl %eax,(%ecx)
-        movl %ebx,(%edx)
-
-        movl nb400_salign(%esp),%eax
-        addl %eax,%esp
-        addl $400,%esp
-        popl %edi
-        popl %esi
-        popl %edx
-        popl %ecx
-        popl %ebx
-        popl %eax
-        leave
-        ret
-
-
-
-
-
-.globl nb_kernel400nf_ia32_sse
-.globl _nb_kernel400nf_ia32_sse
-nb_kernel400nf_ia32_sse:        
-_nb_kernel400nf_ia32_sse:       
-.set nb400nf_p_nri, 8
-.set nb400nf_iinr, 12
-.set nb400nf_jindex, 16
-.set nb400nf_jjnr, 20
-.set nb400nf_shift, 24
-.set nb400nf_shiftvec, 28
-.set nb400nf_fshift, 32
-.set nb400nf_gid, 36
-.set nb400nf_pos, 40
-.set nb400nf_faction, 44
-.set nb400nf_charge, 48
-.set nb400nf_p_facel, 52
-.set nb400nf_argkrf, 56
-.set nb400nf_argcrf, 60
-.set nb400nf_Vc, 64
-.set nb400nf_type, 68
-.set nb400nf_p_ntype, 72
-.set nb400nf_vdwparam, 76
-.set nb400nf_Vvdw, 80
-.set nb400nf_p_tabscale, 84
-.set nb400nf_VFtab, 88
-.set nb400nf_invsqrta, 92
-.set nb400nf_dvda, 96
-.set nb400nf_p_gbtabscale, 100
-.set nb400nf_GBtab, 104
-.set nb400nf_p_nthreads, 108
-.set nb400nf_count, 112
-.set nb400nf_mtx, 116
-.set nb400nf_outeriter, 120
-.set nb400nf_inneriter, 124
-.set nb400nf_work, 128
-        ## stack offsets for local variables  
-        ## bottom of stack is cache-aligned for sse use 
-.set nb400nf_ix, 0
-.set nb400nf_iy, 16
-.set nb400nf_iz, 32
-.set nb400nf_iq, 48
-.set nb400nf_gbtsc, 64
-.set nb400nf_qq, 80
-.set nb400nf_vctot, 96
-.set nb400nf_half, 112
-.set nb400nf_three, 128
-.set nb400nf_isai, 144
-.set nb400nf_isaprod, 160
-.set nb400nf_gbscale, 176
-.set nb400nf_is3, 192
-.set nb400nf_ii3, 196
-.set nb400nf_innerjjnr, 200
-.set nb400nf_innerk, 204
-.set nb400nf_n, 208
-.set nb400nf_nn1, 212
-.set nb400nf_nri, 216
-.set nb400nf_facel, 220
-.set nb400nf_nouter, 224
-.set nb400nf_ninner, 228
-.set nb400nf_salign, 232
-        pushl %ebp
-        movl %esp,%ebp
-        pushl %eax
-        pushl %ebx
-        pushl %ecx
-        pushl %edx
-        pushl %esi
-        pushl %edi
-        subl $236,%esp          ## local stack space 
-        movl %esp,%eax
-        andl $0xf,%eax
-        subl %eax,%esp
-        movl %eax,nb400nf_salign(%esp)
-
-        emms
-
-        ## Move args passed by reference to stack
-        movl nb400nf_p_nri(%ebp),%ecx
-        movl nb400nf_p_facel(%ebp),%esi
-        movl (%ecx),%ecx
-        movl (%esi),%esi
-        movl %ecx,nb400nf_nri(%esp)
-        movl %esi,nb400nf_facel(%esp)
-
-        ## zero iteration counters
-        movl $0,%eax
-        movl %eax,nb400nf_nouter(%esp)
-        movl %eax,nb400nf_ninner(%esp)
-
-
-        movl nb400nf_p_gbtabscale(%ebp),%eax
-        movss (%eax),%xmm3
-        shufps $0,%xmm3,%xmm3
-        movaps %xmm3,nb400nf_gbtsc(%esp)
-
-        ## create constant floating-point factors on stack
-        movl $0x3f000000,%eax   ## constant 0.5 in IEEE (hex)
-        movl %eax,nb400nf_half(%esp)
-        movss nb400nf_half(%esp),%xmm1
-        shufps $0,%xmm1,%xmm1  ## splat to all elements
-        movaps %xmm1,%xmm2
-        addps  %xmm2,%xmm2      ## constant 1.0
-        movaps %xmm2,%xmm3
-        addps  %xmm2,%xmm2      ## constant 2.0
-        addps  %xmm2,%xmm3      ## constant 3.0
-        movaps %xmm1,nb400nf_half(%esp)
-        movaps %xmm3,nb400nf_three(%esp)
-
-_nb_kernel400nf_ia32_sse.nb400nf_threadloop: 
-        movl  nb400nf_count(%ebp),%esi            ## pointer to sync counter
-        movl  (%esi),%eax
-_nb_kernel400nf_ia32_sse.nb400nf_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+10
-        lock 
-        cmpxchgl %ebx,(%esi)                    ## write nn1 to *counter,
-                                                ## if it hasnt changed.
-                                                ## or reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel400nf_ia32_sse.nb400nf_spinlock
-
-        ## if(nn1>nri) nn1=nri
-        movl nb400nf_nri(%esp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx
-        cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb400nf_n(%esp)
-        movl %ebx,nb400nf_nn1(%esp)
-        subl %eax,%ebx                          ## calc number of outer lists
-        movl %eax,%esi                          ## copy n to esi
-        jg  _nb_kernel400nf_ia32_sse.nb400nf_outerstart
-        jmp _nb_kernel400nf_ia32_sse.nb400nf_end
-
-_nb_kernel400nf_ia32_sse.nb400nf_outerstart: 
-        ## ebx contains number of outer iterations
-        addl nb400nf_nouter(%esp),%ebx
-        movl %ebx,nb400nf_nouter(%esp)
-
-_nb_kernel400nf_ia32_sse.nb400nf_outer: 
-        movl  nb400nf_shift(%ebp),%eax        ## eax = pointer into shift[] 
-        movl  (%eax,%esi,4),%ebx                ## ebx=shift[n] 
-
-        leal  (%ebx,%ebx,2),%ebx    ## ebx=3*is 
-        movl  %ebx,nb400nf_is3(%esp)            ## store is3 
-
-        movl  nb400nf_shiftvec(%ebp),%eax     ## eax = base of shiftvec[] 
-
-        movss (%eax,%ebx,4),%xmm0
-        movss 4(%eax,%ebx,4),%xmm1
-        movss 8(%eax,%ebx,4),%xmm2
-
-        movl  nb400nf_iinr(%ebp),%ecx         ## ecx = pointer into iinr[]      
-        movl  (%ecx,%esi,4),%ebx            ## ebx =ii 
-
-        movl  nb400nf_charge(%ebp),%edx
-        movss (%edx,%ebx,4),%xmm3
-        mulss nb400nf_facel(%esp),%xmm3
-        shufps $0,%xmm3,%xmm3
-
-        movl  nb400nf_invsqrta(%ebp),%edx       ## load invsqrta[ii]
-        movss (%edx,%ebx,4),%xmm4
-        shufps $0,%xmm4,%xmm4
-
-        leal  (%ebx,%ebx,2),%ebx        ## ebx = 3*ii=ii3 
-        movl  nb400nf_pos(%ebp),%eax      ## eax = base of pos[]  
-
-        addss (%eax,%ebx,4),%xmm0
-        addss 4(%eax,%ebx,4),%xmm1
-        addss 8(%eax,%ebx,4),%xmm2
-
-        movaps %xmm3,nb400nf_iq(%esp)
-        movaps %xmm4,nb400nf_isai(%esp)
-
-        shufps $0,%xmm0,%xmm0
-        shufps $0,%xmm1,%xmm1
-        shufps $0,%xmm2,%xmm2
-
-        movaps %xmm0,nb400nf_ix(%esp)
-        movaps %xmm1,nb400nf_iy(%esp)
-        movaps %xmm2,nb400nf_iz(%esp)
-
-        movl  %ebx,nb400nf_ii3(%esp)
-
-        ## clear vctot 
-        xorps %xmm4,%xmm4
-        movaps %xmm4,nb400nf_vctot(%esp)
-
-        movl  nb400nf_jindex(%ebp),%eax
-        movl  (%eax,%esi,4),%ecx             ## jindex[n] 
-        movl  4(%eax,%esi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movl  nb400nf_pos(%ebp),%esi
-        movl  nb400nf_faction(%ebp),%edi
-        movl  nb400nf_jjnr(%ebp),%eax
-        shll  $2,%ecx
-        addl  %ecx,%eax
-        movl  %eax,nb400nf_innerjjnr(%esp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $4,%edx
-        addl  nb400nf_ninner(%esp),%ecx
-        movl  %ecx,nb400nf_ninner(%esp)
-        addl  $0,%edx
-        movl  %edx,nb400nf_innerk(%esp)      ## number of innerloop atoms 
-        jge   _nb_kernel400nf_ia32_sse.nb400nf_unroll_loop
-        jmp   _nb_kernel400nf_ia32_sse.nb400nf_finish_inner
-_nb_kernel400nf_ia32_sse.nb400nf_unroll_loop: 
-        ## quad-unroll innerloop here 
-        movl  nb400nf_innerjjnr(%esp),%edx       ## pointer to jjnr[k] 
-        movl  (%edx),%eax
-        movl  4(%edx),%ebx
-        movl  8(%edx),%ecx
-        movl  12(%edx),%edx           ## eax-edx=jnr1-4 
-        addl $16,nb400nf_innerjjnr(%esp)             ## advance pointer (unrolled 4) 
-
-        ## load isa2
-        movl nb400nf_invsqrta(%ebp),%esi
-        movss (%esi,%eax,4),%xmm3
-        movss (%esi,%ecx,4),%xmm4
-        movss (%esi,%ebx,4),%xmm6
-        movss (%esi,%edx,4),%xmm7
-        movaps nb400nf_isai(%esp),%xmm2
-        shufps $0,%xmm6,%xmm3
-        shufps $0,%xmm7,%xmm4
-        shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all charges in xmm3  
-        mulps  %xmm3,%xmm2
-
-        movaps %xmm2,nb400nf_isaprod(%esp)
-        movaps %xmm2,%xmm1
-        mulps nb400nf_gbtsc(%esp),%xmm1
-        movaps %xmm1,nb400nf_gbscale(%esp)
-
-        movl nb400nf_charge(%ebp),%esi     ## base of charge[] 
-
-        movss (%esi,%eax,4),%xmm3
-        movss (%esi,%ecx,4),%xmm4
-        movss (%esi,%ebx,4),%xmm6
-        movss (%esi,%edx,4),%xmm7
-
-        mulps nb400nf_iq(%esp),%xmm2
-        shufps $0,%xmm6,%xmm3
-        shufps $0,%xmm7,%xmm4
-        shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all charges in xmm3  
-        mulps  %xmm2,%xmm3
-        movaps %xmm3,nb400nf_qq(%esp)
-
-
-        movl nb400nf_pos(%ebp),%esi        ## base of pos[] 
-
-        leal  (%eax,%eax,2),%eax     ## replace jnr with j3 
-        leal  (%ebx,%ebx,2),%ebx
-
-        leal  (%ecx,%ecx,2),%ecx     ## replace jnr with j3 
-        leal  (%edx,%edx,2),%edx
-
-        ## move four coordinates to xmm0-xmm2   
-
-        movlps (%esi,%eax,4),%xmm4
-        movlps (%esi,%ecx,4),%xmm5
-        movss 8(%esi,%eax,4),%xmm2
-        movss 8(%esi,%ecx,4),%xmm6
-
-        movhps (%esi,%ebx,4),%xmm4
-        movhps (%esi,%edx,4),%xmm5
-
-        movss 8(%esi,%ebx,4),%xmm0
-        movss 8(%esi,%edx,4),%xmm1
-
-        shufps $0,%xmm0,%xmm2
-        shufps $0,%xmm1,%xmm6
-
-        movaps %xmm4,%xmm0
-        movaps %xmm4,%xmm1
-
-        shufps $136,%xmm6,%xmm2 ## constant 10001000
-
-        shufps $136,%xmm5,%xmm0 ## constant 10001000
-        shufps $221,%xmm5,%xmm1 ## constant 11011101            
-
-        ## move ix-iz to xmm4-xmm6 
-        movaps nb400nf_ix(%esp),%xmm4
-        movaps nb400nf_iy(%esp),%xmm5
-        movaps nb400nf_iz(%esp),%xmm6
-
-        ## calc dr 
-        subps %xmm0,%xmm4
-        subps %xmm1,%xmm5
-        subps %xmm2,%xmm6
-
-        ## square it 
-        mulps %xmm4,%xmm4
-        mulps %xmm5,%xmm5
-        mulps %xmm6,%xmm6
-        addps %xmm5,%xmm4
-        addps %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb400nf_three(%esp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb400nf_half(%esp),%xmm0
-        subps %xmm5,%xmm1       ## constant 30-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r
-        mulps nb400nf_gbscale(%esp),%xmm4
-
-        movhlps %xmm4,%xmm5
-        cvttps2pi %xmm4,%mm6
-        cvttps2pi %xmm5,%mm7    ## mm6/mm7 contain lu indices 
-        cvtpi2ps %mm6,%xmm6
-        cvtpi2ps %mm7,%xmm5
-        movlhps %xmm5,%xmm6
-        subps %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-        pslld $2,%mm6
-        pslld $2,%mm7
-
-        movd %eax,%mm0
-        movd %ebx,%mm1
-        movd %ecx,%mm2
-        movd %edx,%mm3
-
-        movl nb400nf_GBtab(%ebp),%esi
-        movd %mm6,%eax
-        psrlq $32,%mm6
-        movd %mm7,%ecx
-        psrlq $32,%mm7
-        movd %mm6,%ebx
-        movd %mm7,%edx
-
-        ## load coulomb table
-        movaps (%esi,%eax,4),%xmm4
-        movaps (%esi,%ebx,4),%xmm5
-        movaps (%esi,%ecx,4),%xmm6
-        movaps (%esi,%edx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm6,%xmm3
-        shufps $0xEE,%xmm7,%xmm3
-        shufps $0x44,%xmm7,%xmm6
-        movaps %xmm4,%xmm7
-        shufps $0xEE,%xmm5,%xmm7
-        shufps $0x44,%xmm5,%xmm4
-        movaps %xmm4,%xmm5
-        shufps $0xDD,%xmm6,%xmm5
-        shufps $0x88,%xmm6,%xmm4
-        movaps %xmm7,%xmm6
-        shufps $0x88,%xmm3,%xmm6
-        shufps $0xDD,%xmm3,%xmm7
-        ## coulomb table ready, in xmm4-xmm7    
-
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp 
-        movaps nb400nf_qq(%esp),%xmm3
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  %xmm3,%xmm5 ## vcoul=qq*VV  
-        addps  nb400nf_vctot(%esp),%xmm5
-        movaps %xmm5,nb400nf_vctot(%esp)
-
-        ## should we do one more iteration? 
-        subl $4,nb400nf_innerk(%esp)
-        jl    _nb_kernel400nf_ia32_sse.nb400nf_finish_inner
-        jmp   _nb_kernel400nf_ia32_sse.nb400nf_unroll_loop
-_nb_kernel400nf_ia32_sse.nb400nf_finish_inner: 
-        ## check if at least two particles remain 
-        addl $4,nb400nf_innerk(%esp)
-        movl  nb400nf_innerk(%esp),%edx
-        andl  $2,%edx
-        jnz   _nb_kernel400nf_ia32_sse.nb400nf_dopair
-        jmp   _nb_kernel400nf_ia32_sse.nb400nf_checksingle
-_nb_kernel400nf_ia32_sse.nb400nf_dopair: 
-        movl  nb400nf_innerjjnr(%esp),%ecx
-
-        movl  (%ecx),%eax
-        movl  4(%ecx),%ebx
-        addl $8,nb400nf_innerjjnr(%esp)
-
-        xorps %xmm2,%xmm2
-        movaps %xmm2,%xmm6
-
-        ## load isa2
-        movl nb400nf_invsqrta(%ebp),%esi
-        movss (%esi,%eax,4),%xmm2
-        movss (%esi,%ebx,4),%xmm3
-        unpcklps %xmm3,%xmm2    ## isa2 in xmm3(0,1)
-        mulps  nb400nf_isai(%esp),%xmm2
-        movaps %xmm2,nb400nf_isaprod(%esp)
-        movaps %xmm2,%xmm1
-        mulps nb400nf_gbtsc(%esp),%xmm1
-        movaps %xmm1,nb400nf_gbscale(%esp)
-
-        movl nb400nf_charge(%ebp),%esi     ## base of charge[]  
-        movss (%esi,%eax,4),%xmm3
-        movss (%esi,%ebx,4),%xmm6
-        unpcklps %xmm6,%xmm3 ## constant 00001000 ;# xmm3(0,1) has the charges 
-
-        mulps  nb400nf_iq(%esp),%xmm2
-        mulps  %xmm2,%xmm3
-        movaps %xmm3,nb400nf_qq(%esp)
-
-        movl nb400nf_pos(%ebp),%edi
-
-        leal  (%eax,%eax,2),%eax
-        leal  (%ebx,%ebx,2),%ebx
-        ## move coordinates to xmm0-xmm2 
-        movlps (%edi,%eax,4),%xmm1
-        movss 8(%edi,%eax,4),%xmm2
-        movhps (%edi,%ebx,4),%xmm1
-        movss 8(%edi,%ebx,4),%xmm0
-
-        movlhps %xmm7,%xmm3
-
-        shufps $0,%xmm0,%xmm2
-
-        movaps %xmm1,%xmm0
-
-        shufps $136,%xmm2,%xmm2 ## constant 10001000
-
-        shufps $136,%xmm0,%xmm0 ## constant 10001000
-        shufps $221,%xmm1,%xmm1 ## constant 11011101
-
-        movl   nb400nf_faction(%ebp),%edi
-        ## move ix-iz to xmm4-xmm6 
-        xorps   %xmm7,%xmm7
-
-        movaps nb400nf_ix(%esp),%xmm4
-        movaps nb400nf_iy(%esp),%xmm5
-        movaps nb400nf_iz(%esp),%xmm6
-
-        ## calc dr 
-        subps %xmm0,%xmm4
-        subps %xmm1,%xmm5
-        subps %xmm2,%xmm6
-
-        ## square it 
-        mulps %xmm4,%xmm4
-        mulps %xmm5,%xmm5
-        mulps %xmm6,%xmm6
-        addps %xmm5,%xmm4
-        addps %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb400nf_three(%esp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb400nf_half(%esp),%xmm0
-        subps %xmm5,%xmm1       ## constant 30-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r 
-        mulps nb400nf_gbscale(%esp),%xmm4
-
-        cvttps2pi %xmm4,%mm6    ## mm6 contain lu indices 
-        cvtpi2ps %mm6,%xmm6
-        subps %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-
-        pslld $2,%mm6
-
-        movl nb400nf_GBtab(%ebp),%esi
-        movd %mm6,%ecx
-        psrlq $32,%mm6
-        movd %mm6,%edx
-
-        ## load coulomb table
-        movaps (%esi,%ecx,4),%xmm4
-        movaps (%esi,%edx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm4,%xmm6
-        unpcklps %xmm7,%xmm4    ## Y1 Y2 F1 F2 
-        unpckhps %xmm7,%xmm6    ## G1 G2 H1 H2
-        movhlps  %xmm4,%xmm5    ## F1 F2 
-        movhlps  %xmm6,%xmm7    ## H1 H2
-        ## coulomb table ready, in xmm4-xmm7    
-
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        movaps nb400nf_qq(%esp),%xmm3
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  %xmm3,%xmm5 ## vcoul=qq*VV  
-        addps  nb400nf_vctot(%esp),%xmm5
-        movaps %xmm5,nb400nf_vctot(%esp)
-
-_nb_kernel400nf_ia32_sse.nb400nf_checksingle:   
-        movl  nb400nf_innerk(%esp),%edx
-        andl  $1,%edx
-        jnz    _nb_kernel400nf_ia32_sse.nb400nf_dosingle
-        jmp    _nb_kernel400nf_ia32_sse.nb400nf_updateouterdata
-_nb_kernel400nf_ia32_sse.nb400nf_dosingle: 
-        movl nb400nf_charge(%ebp),%esi
-        movl nb400nf_invsqrta(%ebp),%edx
-        movl nb400nf_pos(%ebp),%edi
-        movl  nb400nf_innerjjnr(%esp),%ecx
-        movl  (%ecx),%eax
-        xorps  %xmm2,%xmm2
-        movaps %xmm2,%xmm6
-        movss (%edx,%eax,4),%xmm2       ## isa2
-        mulss nb400nf_isai(%esp),%xmm2
-        movss %xmm2,nb400nf_isaprod(%esp)
-        movss %xmm2,%xmm1
-        mulss nb400nf_gbtsc(%esp),%xmm1
-        movss %xmm1,nb400nf_gbscale(%esp)
-
-        mulss  nb400nf_iq(%esp),%xmm2
-        movss (%esi,%eax,4),%xmm6       ## xmm6(0) has the charge       
-        mulss  %xmm2,%xmm6
-        movss %xmm6,nb400nf_qq(%esp)
-
-        leal  (%eax,%eax,2),%eax
-
-        ## move coordinates to xmm0-xmm2 
-        movss (%edi,%eax,4),%xmm0
-        movss 4(%edi,%eax,4),%xmm1
-        movss 8(%edi,%eax,4),%xmm2
-
-        movss nb400nf_ix(%esp),%xmm4
-        movss nb400nf_iy(%esp),%xmm5
-        movss nb400nf_iz(%esp),%xmm6
-
-        ## calc dr 
-        subss %xmm0,%xmm4
-        subss %xmm1,%xmm5
-        subss %xmm2,%xmm6
-
-        ## square it 
-        mulss %xmm4,%xmm4
-        mulss %xmm5,%xmm5
-        mulss %xmm6,%xmm6
-        addss %xmm5,%xmm4
-        addss %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtss %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movss %xmm5,%xmm2
-        mulss %xmm5,%xmm5
-        movss nb400nf_three(%esp),%xmm1
-        mulss %xmm4,%xmm5       ## rsq*lu*lu                    
-        movss nb400nf_half(%esp),%xmm0
-        subss %xmm5,%xmm1       ## constant 30-rsq*lu*lu 
-        mulss %xmm2,%xmm1
-        mulss %xmm1,%xmm0       ## xmm0=rinv 
-
-        mulss %xmm0,%xmm4       ## xmm4=r 
-        mulss nb400nf_gbscale(%esp),%xmm4
-
-        cvttss2si %xmm4,%ebx    ## mm6 contain lu indices 
-        cvtsi2ss %ebx,%xmm6
-        subss %xmm6,%xmm4
-        movss %xmm4,%xmm1       ## xmm1=eps 
-        movss %xmm1,%xmm2
-        mulss  %xmm2,%xmm2      ## xmm2=eps2 
-
-        shll $2,%ebx
-
-        movl nb400nf_GBtab(%ebp),%esi
-
-        movaps (%esi,%ebx,4),%xmm4
-        movhlps %xmm4,%xmm6
-        movaps %xmm4,%xmm5
-        movaps %xmm6,%xmm7
-        shufps $1,%xmm5,%xmm5
-        shufps $1,%xmm7,%xmm7
-        ## table ready in xmm4-xmm7 
-
-        mulss  %xmm1,%xmm6      ## xmm6=Geps 
-        mulss  %xmm2,%xmm7      ## xmm7=Heps2 
-        addss  %xmm6,%xmm5
-        addss  %xmm7,%xmm5      ## xmm5=Fp      
-        movss nb400nf_qq(%esp),%xmm3
-        mulss  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addss  %xmm4,%xmm5 ## xmm5=VV 
-        mulss  %xmm3,%xmm5 ## vcoul=qq*VV  
-        addss  nb400nf_vctot(%esp),%xmm5
-        movss %xmm5,nb400nf_vctot(%esp)
-_nb_kernel400nf_ia32_sse.nb400nf_updateouterdata: 
-        ## get n from stack
-        movl nb400nf_n(%esp),%esi
-        ## get group index for i particle 
-        movl  nb400nf_gid(%ebp),%edx            ## base of gid[]
-        movl  (%edx,%esi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total potential energy and update it 
-        movaps nb400nf_vctot(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        ## add earlier value from mem 
-        movl  nb400nf_Vc(%ebp),%eax
-        addss (%eax,%edx,4),%xmm7
-        ## move back to mem 
-        movss %xmm7,(%eax,%edx,4)
-
-        ## finish if last 
-        movl nb400nf_nn1(%esp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx
-        jz _nb_kernel400nf_ia32_sse.nb400nf_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb400nf_n(%esp)
-        jmp _nb_kernel400nf_ia32_sse.nb400nf_outer
-_nb_kernel400nf_ia32_sse.nb400nf_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb400nf_nri(%esp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx
-        jz _nb_kernel400nf_ia32_sse.nb400nf_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel400nf_ia32_sse.nb400nf_threadloop
-_nb_kernel400nf_ia32_sse.nb400nf_end: 
-        emms
-
-        movl nb400nf_nouter(%esp),%eax
-        movl nb400nf_ninner(%esp),%ebx
-        movl nb400nf_outeriter(%ebp),%ecx
-        movl nb400nf_inneriter(%ebp),%edx
-        movl %eax,(%ecx)
-        movl %ebx,(%edx)
-
-        movl nb400nf_salign(%esp),%eax
-        addl %eax,%esp
-        addl $236,%esp
-        popl %edi
-        popl %esi
-        popl %edx
-        popl %ecx
-        popl %ebx
-        popl %eax
-        leave
-        ret
-
diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel410_ia32_sse.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel410_ia32_sse.intel_syntax.s
deleted file mode 100644
index 492e8655cd..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel410_ia32_sse.intel_syntax.s
+++ /dev/null
@@ -1,2049 +0,0 @@
-;#
-;#
-;# Gromacs 4.0                         Copyright (c) 1991-2003 
-;# David van der Spoel, Erik Lindahl
-;#
-;# This program is free software; you can redistribute it and/or
-;# modify it under the terms of the GNU General Public License
-;# as published by the Free Software Foundation; either version 2
-;# of the License, or (at your option) any later version.
-;#
-;# To help us fund GROMACS development, we humbly ask that you cite
-;# the research papers on the package. Check out http://www.gromacs.org
-;# 
-;# And Hey:
-;# Gnomes, ROck Monsters And Chili Sauce
-;#
-
-;# These files require GNU binutils 2.10 or later, since we
-;# use intel syntax for portability, or a recent version 
-;# of NASM that understands Extended 3DNow and SSE2 instructions.
-;# (NASM is normally only used with MS Visual C++).
-;# Since NASM and gnu as disagree on some definitions and use 
-;# completely different preprocessing options I have to introduce a
-;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
-;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
-;# reason why all comments need both symbols...
-;# The source is written for GNU as, with intel syntax. When you use
-;# NASM we redefine a couple of things. The false if-statement around 
-;# the following code is seen by GNU as, but NASM doesn't see it, so 
-;# the code inside is read by NASM but not gcc.
-
-; .if 0    # block below only read by NASM
-%define .section	section
-%define .long		dd
-%define .align		align
-%define .globl		global
-;# NASM only wants 'dword', not 'dword ptr'.
-%define ptr
-%macro .equiv                  2
-   %1 equ %2
-%endmacro
-; .endif                   # End of NASM-specific block
-; .intel_syntax noprefix   # Line only read by gnu as
-
-
-
-
-.globl nb_kernel410_ia32_sse
-.globl _nb_kernel410_ia32_sse
-nb_kernel410_ia32_sse:	
-_nb_kernel410_ia32_sse:	
-.equiv          nb410_p_nri,            8
-.equiv          nb410_iinr,             12
-.equiv          nb410_jindex,           16
-.equiv          nb410_jjnr,             20
-.equiv          nb410_shift,            24
-.equiv          nb410_shiftvec,         28
-.equiv          nb410_fshift,           32
-.equiv          nb410_gid,              36
-.equiv          nb410_pos,              40
-.equiv          nb410_faction,          44
-.equiv          nb410_charge,           48
-.equiv          nb410_p_facel,          52
-.equiv          nb410_argkrf,           56
-.equiv          nb410_argcrf,           60
-.equiv          nb410_Vc,               64
-.equiv          nb410_type,             68
-.equiv          nb410_p_ntype,          72
-.equiv          nb410_vdwparam,         76
-.equiv          nb410_Vvdw,             80
-.equiv          nb410_p_tabscale,       84
-.equiv          nb410_VFtab,            88
-.equiv          nb410_invsqrta,         92
-.equiv          nb410_dvda,             96
-.equiv          nb410_p_gbtabscale,     100
-.equiv          nb410_GBtab,            104
-.equiv          nb410_p_nthreads,       108
-.equiv          nb410_count,            112
-.equiv          nb410_mtx,              116
-.equiv          nb410_outeriter,        120
-.equiv          nb410_inneriter,        124
-.equiv          nb410_work,             128
-	;# stack offsets for local variables  
-	;# bottom of stack is cache-aligned for sse use 
-.equiv          nb410_ix,               0
-.equiv          nb410_iy,               16
-.equiv          nb410_iz,               32
-.equiv          nb410_iq,               48
-.equiv          nb410_dx,               64
-.equiv          nb410_dy,               80
-.equiv          nb410_dz,               96
-.equiv          nb410_two,              112
-.equiv          nb410_six,              128
-.equiv          nb410_twelve,           144
-.equiv          nb410_gbtsc,            160
-.equiv          nb410_qq,               176
-.equiv          nb410_c6,               192
-.equiv          nb410_c12,              208
-.equiv          nb410_fscal,            224
-.equiv          nb410_vctot,            240
-.equiv          nb410_Vvdwtot,          256
-.equiv          nb410_fix,              272
-.equiv          nb410_fiy,              288
-.equiv          nb410_fiz,              304
-.equiv          nb410_half,             320
-.equiv          nb410_three,            336
-.equiv          nb410_r,                352
-.equiv          nb410_isai,             368
-.equiv          nb410_isaprod,          384
-.equiv          nb410_dvdasum,          400
-.equiv          nb410_gbscale,          416
-.equiv          nb410_is3,              432
-.equiv          nb410_ii3,              436
-.equiv          nb410_ii,               440
-.equiv          nb410_ntia,             444
-.equiv          nb410_innerjjnr,        448
-.equiv          nb410_innerk,           452
-.equiv          nb410_n,                456
-.equiv          nb410_nn1,              460
-.equiv          nb410_jnra,             464
-.equiv          nb410_jnrb,             468
-.equiv          nb410_jnrc,             472
-.equiv          nb410_jnrd,             476
-.equiv          nb410_nri,              480
-.equiv          nb410_facel,            484
-.equiv          nb410_ntype,            488
-.equiv          nb410_nouter,           492
-.equiv          nb410_ninner,           496
-.equiv          nb410_salign,           500
-	push ebp
-	mov ebp,esp	
-    	push eax
-    	push ebx
-    	push ecx
-    	push edx
-	push esi
-	push edi
-	sub esp, 504		;# local stack space 
-	mov  eax, esp
-	and  eax, 0xf
-	sub esp, eax
-	mov [esp + nb410_salign], eax
-
-	emms
-
-	;# Move args passed by reference to stack
-	mov ecx, [ebp + nb410_p_nri]
-	mov esi, [ebp + nb410_p_facel]
-	mov edi, [ebp + nb410_p_ntype]
-	mov ecx, [ecx]
-	mov esi, [esi]
-	mov edi, [edi]
-	mov [esp + nb410_nri], ecx
-	mov [esp + nb410_facel], esi
-	mov [esp + nb410_ntype], edi
-
-	;# zero iteration counters
-	mov eax, 0
-	mov [esp + nb410_nouter], eax
-	mov [esp + nb410_ninner], eax
-
-
-	mov eax, [ebp + nb410_p_gbtabscale]
-	movss xmm5, [eax]	
-	shufps xmm5, xmm5, 0
-	movaps [esp + nb410_gbtsc], xmm5
-
-	;# create constant floating-point factors on stack
-	mov eax, 0x3f000000     ;# constant 0.5 in IEEE (hex)
-	mov [esp + nb410_half], eax
-	movss xmm1, [esp + nb410_half]
-	shufps xmm1, xmm1, 0    ;# splat to all elements
-	movaps xmm2, xmm1       
-	addps  xmm2, xmm2	;# constant 1.0
-	movaps xmm3, xmm2
-	addps  xmm2, xmm2	;# constant 2.0
-	addps  xmm3, xmm2	;# constant 3.0
-	movaps xmm4, xmm3
-	addps  xmm4, xmm4	;# 6.0
-	movaps xmm5, xmm4
-	addps  xmm5, xmm5	;# constant 12.0
-	movaps [esp + nb410_half],  xmm1
-	movaps [esp + nb410_two],  xmm2
-	movaps [esp + nb410_three],  xmm3
-	movaps [esp + nb410_six],  xmm4
-	movaps [esp + nb410_twelve],  xmm5
-
-.nb410_threadloop:
-        mov   esi, [ebp + nb410_count]          ;# pointer to sync counter
-        mov   eax, [esi]
-.nb410_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+10
-        lock
-        cmpxchg [esi], ebx                      ;# write nn1 to *counter,
-                                                ;# if it hasnt changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb410_spinlock
-
-        ;# if(nn1>nri) nn1=nri
-        mov ecx, [esp + nb410_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [esp + nb410_n], eax
-        mov [esp + nb410_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists
-	mov esi, eax				;# copy n to esi
-        jg  .nb410_outerstart
-        jmp .nb410_end
-
-.nb410_outerstart:
-	;# ebx contains number of outer iterations
-	add ebx, [esp + nb410_nouter]
-	mov [esp + nb410_nouter], ebx
-
-.nb410_outer:
-	mov   eax, [ebp + nb410_shift]      ;# eax = pointer into shift[] 
-	mov   ebx, [eax+esi*4]		;# ebx=shift[n] 
-	
-	lea   ebx, [ebx + ebx*2]    ;# ebx=3*is 
-	mov   [esp + nb410_is3],ebx    	;# store is3 
-
-	mov   eax, [ebp + nb410_shiftvec]   ;# eax = base of shiftvec[] 
-
-	movss xmm0, [eax + ebx*4]
-	movss xmm1, [eax + ebx*4 + 4]
-	movss xmm2, [eax + ebx*4 + 8] 
-
-	mov   ecx, [ebp + nb410_iinr]       ;# ecx = pointer into iinr[] 	
-	mov   ebx, [ecx + esi*4]	    ;# ebx =ii 
-	mov   [esp + nb410_ii], ebx
-
-	mov   edx, [ebp + nb410_charge]
-	movss xmm3, [edx + ebx*4]	
-	mulss xmm3, [esp + nb410_facel]
-	shufps xmm3, xmm3, 0
-
-	mov   edx, [ebp + nb410_invsqrta]	;# load invsqrta[ii]
-	movss xmm4, [edx + ebx*4]
-	shufps xmm4, xmm4, 0
-
-    	mov   edx, [ebp + nb410_type] 
-    	mov   edx, [edx + ebx*4]
-    	imul  edx, [esp + nb410_ntype]
-    	shl   edx, 1
-    	mov   [esp + nb410_ntia], edx
-		
-	lea   ebx, [ebx + ebx*2]	;# ebx = 3*ii=ii3 
-	mov   eax, [ebp + nb410_pos]    ;# eax = base of pos[]  
-
-	addss xmm0, [eax + ebx*4]
-	addss xmm1, [eax + ebx*4 + 4]
-	addss xmm2, [eax + ebx*4 + 8]
-
-	movaps [esp + nb410_iq], xmm3
-	movaps [esp + nb410_isai], xmm4
-	
-	shufps xmm0, xmm0, 0
-	shufps xmm1, xmm1, 0
-	shufps xmm2, xmm2, 0
-
-	movaps [esp + nb410_ix], xmm0
-	movaps [esp + nb410_iy], xmm1
-	movaps [esp + nb410_iz], xmm2
-
-	mov   [esp + nb410_ii3], ebx
-	
-	;# clear vctot and i forces 
-	xorps xmm4, xmm4
-	movaps [esp + nb410_vctot], xmm4
-	movaps [esp + nb410_Vvdwtot], xmm4
-	movaps [esp + nb410_dvdasum], xmm4
-	movaps [esp + nb410_fix], xmm4
-	movaps [esp + nb410_fiy], xmm4
-	movaps [esp + nb410_fiz], xmm4
-	
-	mov   eax, [ebp + nb410_jindex]
-	mov   ecx, [eax + esi*4]	     ;# jindex[n] 
-	mov   edx, [eax + esi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   esi, [ebp + nb410_pos]
-	mov   edi, [ebp + nb410_faction]	
-	mov   eax, [ebp + nb410_jjnr]
-	shl   ecx, 2
-	add   eax, ecx
-	mov   [esp + nb410_innerjjnr], eax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  4
-	add   ecx, [esp + nb410_ninner]
-	mov   [esp + nb410_ninner], ecx
-	add   edx, 0
-	mov   [esp + nb410_innerk], edx    ;# number of innerloop atoms 
-	jge   .nb410_unroll_loop
-	jmp   .nb410_finish_inner
-.nb410_unroll_loop:	
-	;# quad-unroll innerloop here 
-	mov   edx, [esp + nb410_innerjjnr]     ;# pointer to jjnr[k] 
-	mov   eax, [edx]	
-	mov   ebx, [edx + 4]              
-	mov   ecx, [edx + 8]            
-	mov   edx, [edx + 12]         ;# eax-edx=jnr1-4 
-	add dword ptr [esp + nb410_innerjjnr],  16 ;# advance pointer (unrolled 4) 
-
-	;# load isaj
-	mov esi, [ebp + nb410_invsqrta]
-	movss xmm3, [esi + eax*4]
-	movss xmm4, [esi + ecx*4]
-	movss xmm6, [esi + ebx*4]
-	movss xmm7, [esi + edx*4]
-	movaps xmm2, [esp + nb410_isai]
-	shufps xmm3, xmm6, 0 
-	shufps xmm4, xmm7, 0 
-	shufps xmm3, xmm4, 136  ;# constant 10001000 ;# all isaj in xmm3
-	mulps  xmm2, xmm3
-		
-	movaps [esp + nb410_isaprod], xmm2
-	movaps xmm1, xmm2
-	mulps xmm1, [esp + nb410_gbtsc]
-	movaps [esp + nb410_gbscale], xmm1
-	
-	mov esi, [ebp + nb410_charge]    ;# base of charge[] 
-	
-	movss xmm3, [esi + eax*4]
-	movss xmm4, [esi + ecx*4]
-	movss xmm6, [esi + ebx*4]
-	movss xmm7, [esi + edx*4]
-
-	mulps xmm2, [esp + nb410_iq]
-	shufps xmm3, xmm6, 0 
-	shufps xmm4, xmm7, 0 
-	shufps xmm3, xmm4, 136  ;# constant 10001000 ;# all charges in xmm3  
-	mulps  xmm3, xmm2
-	movaps [esp + nb410_qq], xmm3	
-
-	movd mm0, eax
-	movd mm1, ebx
-	movd mm2, ecx
-	movd mm3, edx
-	
-	mov esi, [ebp + nb410_type]
-	mov eax, [esi + eax*4]
-	mov ebx, [esi + ebx*4]
-	mov ecx, [esi + ecx*4]
-	mov edx, [esi + edx*4]
-	mov esi, [ebp + nb410_vdwparam]
-	shl eax, 1	
-	shl ebx, 1	
-	shl ecx, 1	
-	shl edx, 1	
-	mov edi, [esp + nb410_ntia]
-	add eax, edi
-	add ebx, edi
-	add ecx, edi
-	add edx, edi
-
-	movlps xmm6, [esi + eax*4]
-	movlps xmm7, [esi + ecx*4]
-	movhps xmm6, [esi + ebx*4]
-	movhps xmm7, [esi + edx*4]
-
-	movaps xmm4, xmm6
-	shufps xmm4, xmm7, 136  ;# constant 10001000
-	shufps xmm6, xmm7, 221  ;# constant 11011101
-	
-	movd  eax, mm0		
-	movd  ebx, mm1
-	movd  ecx, mm2
-	movd  edx, mm3
-
-	movaps [esp + nb410_c6], xmm4
-	movaps [esp + nb410_c12], xmm6
-	
-	mov esi, [ebp + nb410_pos]       ;# base of pos[] 
-
-	mov [esp + nb410_jnra], eax
-	mov [esp + nb410_jnrb], ebx
-	mov [esp + nb410_jnrc], ecx
-	mov [esp + nb410_jnrd], edx
-
-	lea   eax, [eax + eax*2]     ;# replace jnr with j3 
-	lea   ebx, [ebx + ebx*2]	
-
-	lea   ecx, [ecx + ecx*2]     ;# replace jnr with j3 
-	lea   edx, [edx + edx*2]	
-
-	;# move four coordinates to xmm0-xmm2 	
-
-	movlps xmm4, [esi + eax*4]
-	movlps xmm5, [esi + ecx*4]
-	movss xmm2, [esi + eax*4 + 8]
-	movss xmm6, [esi + ecx*4 + 8]
-
-	movhps xmm4, [esi + ebx*4]
-	movhps xmm5, [esi + edx*4]
-
-	movss xmm0, [esi + ebx*4 + 8]
-	movss xmm1, [esi + edx*4 + 8]
-
-	shufps xmm2, xmm0, 0
-	shufps xmm6, xmm1, 0
-	
-	movaps xmm0, xmm4
-	movaps xmm1, xmm4
-
-	shufps xmm2, xmm6, 136  ;# constant 10001000
-	
-	shufps xmm0, xmm5, 136  ;# constant 10001000
-	shufps xmm1, xmm5, 221  ;# constant 11011101		
-
-	;# move ix-iz to xmm4-xmm6 
-	movaps xmm4, [esp + nb410_ix]
-	movaps xmm5, [esp + nb410_iy]
-	movaps xmm6, [esp + nb410_iz]
-
-	;# calc dr 
-	subps xmm4, xmm0
-	subps xmm5, xmm1
-	subps xmm6, xmm2
-
-	;# store dr 
-	movaps [esp + nb410_dx], xmm4
-	movaps [esp + nb410_dy], xmm5
-	movaps [esp + nb410_dz], xmm6
-	;# square it 
-	mulps xmm4,xmm4
-	mulps xmm5,xmm5
-	mulps xmm6,xmm6
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [esp + nb410_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [esp + nb410_half]
-	subps xmm1, xmm5	;# constant 30-rsq*lu*lu 
-	mulps xmm1, xmm2
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r 
-	movaps [esp + nb410_r], xmm4
-	mulps xmm4, [esp + nb410_gbscale]
-
-	movhlps xmm5, xmm4
-	cvttps2pi mm6, xmm4
-	cvttps2pi mm7, xmm5	;# mm6/mm7 contain lu indices 
-	cvtpi2ps xmm6, mm6
-	cvtpi2ps xmm5, mm7
-	movlhps xmm6, xmm5
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-	pslld mm6, 2
-	pslld mm7, 2
-
-	movd mm0, eax	
-	movd mm1, ebx
-	movd mm2, ecx
-	movd mm3, edx
-
-	mov  esi, [ebp + nb410_GBtab]
-	movd eax, mm6
-	psrlq mm6, 32
-	movd ecx, mm7
-	psrlq mm7, 32
-	movd ebx, mm6
-	movd edx, mm7
-
-	;# load coulomb table
-	movaps xmm4, [esi + eax*4]
-	movaps xmm5, [esi + ebx*4]
-	movaps xmm6, [esi + ecx*4]
-	movaps xmm7, [esi + edx*4]
-	;# transpose, using xmm3 for scratch
-	movaps xmm3, xmm6
-	shufps xmm3, xmm7, 0xEE 
-	shufps xmm6, xmm7, 0x44
-	movaps xmm7, xmm4
-	shufps xmm7, xmm5, 0xEE
-	shufps xmm4, xmm5, 0x44
-	movaps xmm5, xmm4
-	shufps xmm5, xmm6, 0xDD
-	shufps xmm4, xmm6, 0x88
-	movaps xmm6, xmm7
-	shufps xmm6, xmm3, 0x88
-	shufps xmm7, xmm3, 0xDD
-	;# coulomb table ready, in xmm4-xmm7  		
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	mulps  xmm7, [esp + nb410_two]	;# two*Heps2 
-	movaps xmm3, [esp + nb410_qq]
-	addps  xmm7, xmm6
-	addps  xmm7, xmm5 ;# xmm7=FF 
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulps  xmm3, xmm7 ;# fijC=FF*qq
-	;# get jnr from stack
-	mov eax, [esp + nb410_jnra]
-	mov ebx, [esp + nb410_jnrb]
-	mov ecx, [esp + nb410_jnrc]
-	mov edx, [esp + nb410_jnrd]
-	
-	mov esi, [ebp + nb410_dvda]
-	
-	;# Calculate dVda
-	xorps xmm7, xmm7
-	mulps xmm3, [esp + nb410_gbscale]
-	movaps xmm6, xmm3
-	mulps  xmm6, [esp + nb410_r]
-	addps  xmm6, xmm5
-	addps  xmm5, [esp + nb410_vctot]
-	movaps [esp + nb410_vctot], xmm5 
-
-	;# xmm6=(vcoul+fijC*r)
-	subps  xmm7, xmm6
-	movaps xmm6, xmm7
-	
-	;# update dvdasum
-	addps  xmm7, [esp + nb410_dvdasum]
-	movaps [esp + nb410_dvdasum], xmm7 
-
-	;# update j atoms dvdaj
-	movhlps xmm7, xmm6
-	movaps  xmm5, xmm6
-	movaps  xmm4, xmm7
-	shufps  xmm5, xmm5, 0x1
-	shufps  xmm4, xmm4, 0x1
-	;# xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4
-	addss  xmm6, [esi + eax*4]
-	addss  xmm5, [esi + ebx*4]
-	addss  xmm7, [esi + ecx*4]
-	addss  xmm4, [esi + edx*4]
-	movss  [esi + eax*4], xmm6
-	movss  [esi + ebx*4], xmm5
-	movss  [esi + ecx*4], xmm7
-	movss  [esi + edx*4], xmm4
-	
-	;# L-J 
-	movaps xmm4, xmm0
-	mulps  xmm4, xmm0	;# xmm4=rinvsq 
-
-	movaps xmm6, xmm4
-	mulps  xmm6, xmm4
-
-	mulps  xmm6, xmm4	;# xmm6=rinvsix 
-	movaps xmm4, xmm6
-	mulps  xmm4, xmm4	;# xmm4=rinvtwelve 
-	mulps  xmm6, [esp + nb410_c6]
-	mulps  xmm4, [esp + nb410_c12]
-	movaps xmm7, [esp + nb410_Vvdwtot]
-	addps  xmm7, xmm4
-	mulps  xmm4, [esp + nb410_twelve]
-	subps  xmm7, xmm6
-	mulps  xmm6, [esp + nb410_six]
-	movaps [esp + nb410_Vvdwtot], xmm7
-	subps  xmm4, xmm6
-	mulps  xmm4, xmm0
-	subps  xmm4, xmm3
-	mulps  xmm4, xmm0
-
-	movaps xmm0, [esp + nb410_dx]
-	movaps xmm1, [esp + nb410_dy]
-	movaps xmm2, [esp + nb410_dz]
-
-	movd eax, mm0	
-	movd ebx, mm1
-	movd ecx, mm2
-	movd edx, mm3
-
-	mov    edi, [ebp + nb410_faction]
-	mulps  xmm0, xmm4
-	mulps  xmm1, xmm4
-	mulps  xmm2, xmm4
-	;# xmm0-xmm2 contains tx-tz (partial force) 
-	;# now update f_i 
-	movaps xmm3, [esp + nb410_fix]
-	movaps xmm4, [esp + nb410_fiy]
-	movaps xmm5, [esp + nb410_fiz]
-	addps  xmm3, xmm0
-	addps  xmm4, xmm1
-	addps  xmm5, xmm2
-	movaps [esp + nb410_fix], xmm3
-	movaps [esp + nb410_fiy], xmm4
-	movaps [esp + nb410_fiz], xmm5
-	;# the fj's - start by accumulating x & y forces from memory 
-	movlps xmm4, [edi + eax*4]
-	movlps xmm6, [edi + ecx*4]
-	movhps xmm4, [edi + ebx*4]
-	movhps xmm6, [edi + edx*4]
-
-	movaps xmm3, xmm4
-	shufps xmm3, xmm6, 136  ;# constant 10001000
-	shufps xmm4, xmm6, 221  ;# constant 11011101			      
-
-	;# now xmm3-xmm5 contains fjx, fjy, fjz 
-	subps  xmm3, xmm0
-	subps  xmm4, xmm1
-	
-	;# unpack them back so we can store them - first x & y in xmm3/xmm4 
-
-	movaps xmm6, xmm3
-	unpcklps xmm6, xmm4
-	unpckhps xmm3, xmm4	
-	;# xmm6(l)=x & y for j1, (h) for j2 
-	;# xmm3(l)=x & y for j3, (h) for j4 
-	movlps [edi + eax*4], xmm6
-	movlps [edi + ecx*4], xmm3
-	
-	movhps [edi + ebx*4], xmm6
-	movhps [edi + edx*4], xmm3
-
-	;# and the z forces 
-	movss  xmm4, [edi + eax*4 + 8]
-	movss  xmm5, [edi + ebx*4 + 8]
-	movss  xmm6, [edi + ecx*4 + 8]
-	movss  xmm7, [edi + edx*4 + 8]
-	subss  xmm4, xmm2
-	shufps xmm2, xmm2, 229  ;# constant 11100101
-	subss  xmm5, xmm2
-	shufps xmm2, xmm2, 234  ;# constant 11101010
-	subss  xmm6, xmm2
-	shufps xmm2, xmm2, 255  ;# constant 11111111
-	subss  xmm7, xmm2
-	movss  [edi + eax*4 + 8], xmm4
-	movss  [edi + ebx*4 + 8], xmm5
-	movss  [edi + ecx*4 + 8], xmm6
-	movss  [edi + edx*4 + 8], xmm7
-	
-	;# should we do one more iteration? 
-	sub dword ptr [esp + nb410_innerk],  4
-	jl    .nb410_finish_inner
-	jmp   .nb410_unroll_loop
-.nb410_finish_inner:
-	;# check if at least two particles remain 
-	add dword ptr [esp + nb410_innerk],  4
-	mov   edx, [esp + nb410_innerk]
-	and   edx, 2
-	jnz   .nb410_dopair
-	jmp   .nb410_checksingle
-.nb410_dopair:	
-	mov   ecx, [esp + nb410_innerjjnr]
-	mov   eax, [ecx]	
-	mov   ebx, [ecx + 4]              
-	add dword ptr [esp + nb410_innerjjnr],  8
-
-	xorps xmm2, xmm2
-	movaps xmm6, xmm2
-	
-	;# load isaj
-	mov esi, [ebp + nb410_invsqrta]
-	movss xmm2, [esi + eax*4]
-	movss xmm3, [esi + ebx*4]
-	unpcklps xmm2, xmm3	;# isaj in xmm2(0,1)
-	mulps  xmm2, [esp + nb410_isai]
-	movaps [esp + nb410_isaprod], xmm2	
-	movaps xmm1, xmm2
-	mulps xmm1, [esp + nb410_gbtsc]
-	movaps [esp + nb410_gbscale], xmm1	
-	
-	mov esi, [ebp + nb410_charge]    ;# base of charge[] 	
-	movss xmm3, [esi + eax*4]		
-	movss xmm6, [esi + ebx*4]
-	unpcklps xmm3, xmm6 ;# constant 00001000 ;# xmm3(0,1) has the charges 
-
-	mulps  xmm2, [esp + nb410_iq]
-	mulps  xmm3, xmm2
-	movaps [esp + nb410_qq], xmm3
-
-	mov esi, [ebp + nb410_type]
-	mov   ecx, eax
-	mov   edx, ebx
-	mov ecx, [esi + ecx*4]
-	mov edx, [esi + edx*4]	
-	mov esi, [ebp + nb410_vdwparam]
-	shl ecx, 1	
-	shl edx, 1	
-	mov edi, [esp + nb410_ntia]
-	add ecx, edi
-	add edx, edi
-	movlps xmm6, [esi + ecx*4]
-	movhps xmm6, [esi + edx*4]
-	mov edi, [ebp + nb410_pos]	
-	
-	movaps xmm4, xmm6
-	shufps xmm4, xmm4, 8 ;# constant 00001000 	
-	shufps xmm6, xmm6, 13 ;# constant 00001101
-	movlhps xmm4, xmm7
-	movlhps xmm6, xmm7
-	
-	movaps [esp + nb410_c6], xmm4
-	movaps [esp + nb410_c12], xmm6	
-		
-	movd  mm0, eax
-	movd  mm1, ebx
-		
-	lea   eax, [eax + eax*2]
-	lea   ebx, [ebx + ebx*2]
-	;# move coordinates to xmm0-xmm2 
-	movlps xmm1, [edi + eax*4]
-	movss xmm2, [edi + eax*4 + 8]	
-	movhps xmm1, [edi + ebx*4]
-	movss xmm0, [edi + ebx*4 + 8]	
-
-	movlhps xmm3, xmm7
-	
-	shufps xmm2, xmm0, 0
-	
-	movaps xmm0, xmm1
-
-	shufps xmm2, xmm2, 136  ;# constant 10001000
-	
-	shufps xmm0, xmm0, 136  ;# constant 10001000
-	shufps xmm1, xmm1, 221  ;# constant 11011101
-			
-	mov    edi, [ebp + nb410_faction]
-	;# move ix-iz to xmm4-xmm6 
-	xorps   xmm7, xmm7
-	
-	movaps xmm4, [esp + nb410_ix]
-	movaps xmm5, [esp + nb410_iy]
-	movaps xmm6, [esp + nb410_iz]
-
-	;# calc dr 
-	subps xmm4, xmm0
-	subps xmm5, xmm1
-	subps xmm6, xmm2
-
-	;# store dr 
-	movaps [esp + nb410_dx], xmm4
-	movaps [esp + nb410_dy], xmm5
-	movaps [esp + nb410_dz], xmm6
-	;# square it 
-	mulps xmm4,xmm4
-	mulps xmm5,xmm5
-	mulps xmm6,xmm6
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [esp + nb410_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [esp + nb410_half]
-	subps xmm1, xmm5	;# constant 30-rsq*lu*lu 
-	mulps xmm1, xmm2	
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r 
-	movaps [esp + nb410_r], xmm4
-	mulps xmm4, [esp + nb410_gbscale]
-
-	cvttps2pi mm6, xmm4     ;# mm6 contain lu indices 
-	cvtpi2ps xmm6, mm6
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-
-	pslld mm6, 2
-
-	mov  esi, [ebp + nb410_GBtab]
-	movd ecx, mm6
-	psrlq mm6, 32
-	movd edx, mm6
-
-	;# load coulomb table
-	movaps xmm4, [esi + ecx*4]
-	movaps xmm7, [esi + edx*4]
-	;# transpose, using xmm3 for scratch
-	movaps xmm6, xmm4
-	unpcklps xmm4, xmm7  	;# Y1 Y2 F1 F2 
-	unpckhps xmm6, xmm7     ;# G1 G2 H1 H2
-	movhlps  xmm5, xmm4    	;# F1 F2 
-	movhlps  xmm7, xmm6     ;# H1 H2
-	;# coulomb table ready, in xmm4-xmm7  	
-
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	mulps  xmm7, [esp + nb410_two]	;# two*Heps2 
-	movaps xmm3, [esp + nb410_qq]
-	addps  xmm7, xmm6
-	addps  xmm7, xmm5 ;# xmm7=FF 
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulps  xmm3, xmm7 ;# fijC=FF*qq 
-	;# get jnr from regs
-	movd ecx, mm0
-	movd edx, mm1
-	
-	mov esi, [ebp + nb410_dvda]
-	;# Calculate dVda
-	xorps xmm7, xmm7
-	mulps xmm3, [esp + nb410_gbscale]
-	movaps xmm6, xmm3
-	mulps  xmm6, [esp + nb410_r]
-	addps  xmm6, xmm5
-	addps  xmm5, [esp + nb410_vctot]
-	movaps [esp + nb410_vctot], xmm5 
-
-	;# xmm6=(vcoul+fijC*r)
-	subps  xmm7, xmm6
-	movaps xmm6, xmm7
-	
-	;# update dvdasum
-	addps  xmm7, [esp + nb410_dvdasum]
-	movaps [esp + nb410_dvdasum], xmm7 
-	
-	;# update j atoms dvdaj
-	movaps xmm7, xmm6
-	shufps xmm7, xmm7, 0x1
-	addss  xmm6, [esi + ecx*4]
-	addss  xmm7, [esi + edx*4]
-	movss  [esi + ecx*4], xmm6
-	movss  [esi + edx*4], xmm7
-		
-	;# L-J 
-	movaps xmm4, xmm0
-	mulps  xmm4, xmm0	;# xmm4=rinvsq 
-
-	;# at this point mm5 contains vcoul and mm3 fijC 
-	;# increment vcoul - then we can get rid of mm5 
-	;# update vctot 
-
-	movaps xmm6, xmm4
-	mulps  xmm6, xmm4
-
-	mulps  xmm6, xmm4	;# xmm6=rinvsix 
-	movaps xmm4, xmm6
-	mulps  xmm4, xmm4	;# xmm4=rinvtwelve 
-	mulps  xmm6, [esp + nb410_c6]
-	mulps  xmm4, [esp + nb410_c12]
-	movaps xmm7, [esp + nb410_Vvdwtot]
-	addps  xmm7, xmm4
-	mulps  xmm4, [esp + nb410_twelve]
-	subps  xmm7, xmm6
-	mulps  xmm6, [esp + nb410_six]
-	movaps [esp + nb410_Vvdwtot], xmm7
-	subps  xmm4, xmm6
-	mulps  xmm4, xmm0
-	subps  xmm4, xmm3
-	mulps  xmm4, xmm0
-
-	movaps xmm0, [esp + nb410_dx]
-	movaps xmm1, [esp + nb410_dy]
-	movaps xmm2, [esp + nb410_dz]
-
-	mulps  xmm0, xmm4
-	mulps  xmm1, xmm4
-	mulps  xmm2, xmm4
-	;# xmm0-xmm2 contains tx-tz (partial force) 
-	;# now update f_i 
-	movaps xmm3, [esp + nb410_fix]
-	movaps xmm4, [esp + nb410_fiy]
-	movaps xmm5, [esp + nb410_fiz]
-	addps  xmm3, xmm0
-	addps  xmm4, xmm1
-	addps  xmm5, xmm2
-	movaps [esp + nb410_fix], xmm3
-	movaps [esp + nb410_fiy], xmm4
-	movaps [esp + nb410_fiz], xmm5
-	;# update the fj's 
-	movss   xmm3, [edi + eax*4]
-	movss   xmm4, [edi + eax*4 + 4]
-	movss   xmm5, [edi + eax*4 + 8]
-	subss   xmm3, xmm0
-	subss   xmm4, xmm1
-	subss   xmm5, xmm2	
-	movss   [edi + eax*4], xmm3
-	movss   [edi + eax*4 + 4], xmm4
-	movss   [edi + eax*4 + 8], xmm5	
-
-	shufps  xmm0, xmm0, 225  ;# constant 11100001
-	shufps  xmm1, xmm1, 225  ;# constant 11100001
-	shufps  xmm2, xmm2, 225  ;# constant 11100001
-
-	movss   xmm3, [edi + ebx*4]
-	movss   xmm4, [edi + ebx*4 + 4]
-	movss   xmm5, [edi + ebx*4 + 8]
-	subss   xmm3, xmm0
-	subss   xmm4, xmm1
-	subss   xmm5, xmm2	
-	movss   [edi + ebx*4], xmm3
-	movss   [edi + ebx*4 + 4], xmm4
-	movss   [edi + ebx*4 + 8], xmm5	
-
-.nb410_checksingle:				
-	mov   edx, [esp + nb410_innerk]
-	and   edx, 1
-	jnz    .nb410_dosingle
-	jmp    .nb410_updateouterdata
-.nb410_dosingle:
-	mov esi, [ebp + nb410_charge]
-	mov edx, [ebp + nb410_invsqrta]
-	mov edi, [ebp + nb410_pos]
-	mov   ecx, [esp + nb410_innerjjnr]
-	mov   eax, [ecx]	
-	xorps  xmm2, xmm2
-	movaps xmm6, xmm2
-	movss xmm2, [edx + eax*4]	;# isaj
-	mulss xmm2, [esp + nb410_isai]
-	movss [esp + nb410_isaprod], xmm2	
-	movss xmm1, xmm2
-	mulss xmm1, [esp + nb410_gbtsc]
-	movss [esp + nb410_gbscale], xmm1	
-	
-	mulss  xmm2, [esp + nb410_iq]
-	movss xmm6, [esi + eax*4]	;# xmm6(0) has the charge 	
-	mulss  xmm6, xmm2
-	movss [esp + nb410_qq], xmm6
-		
-	mov esi, [ebp + nb410_type]
-	mov ecx, eax
-	mov ecx, [esi + ecx*4]	
-	mov esi, [ebp + nb410_vdwparam]
-	shl ecx, 1
-	add ecx, [esp + nb410_ntia]
-	movlps xmm6, [esi + ecx*4]
-	movaps xmm4, xmm6
-	shufps xmm4, xmm4, 252  ;# constant 11111100	
-	shufps xmm6, xmm6, 253  ;# constant 11111101	
-			
-	movaps [esp + nb410_c6], xmm4
-	movaps [esp + nb410_c12], xmm6	
-
-	movd  mm0, eax
-	lea   eax, [eax + eax*2]
-	
-	;# move coordinates to xmm0-xmm2 
-	movss xmm0, [edi + eax*4]	
-	movss xmm1, [edi + eax*4 + 4]	
-	movss xmm2, [edi + eax*4 + 8]	 
-	
-	movaps xmm4, [esp + nb410_ix]
-	movaps xmm5, [esp + nb410_iy]
-	movaps xmm6, [esp + nb410_iz]
-
-	;# calc dr 
-	subss xmm4, xmm0
-	subss xmm5, xmm1
-	subss xmm6, xmm2
-
-	;# store dr 
-	movss [esp + nb410_dx], xmm4
-	movss [esp + nb410_dy], xmm5
-	movss [esp + nb410_dz], xmm6
-	;# square it 
-	mulss xmm4,xmm4
-	mulss xmm5,xmm5
-	mulss xmm6,xmm6
-	addss xmm4, xmm5
-	addss xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtss xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulss xmm5, xmm5
-	movss xmm1, [esp + nb410_three]
-	mulss xmm5, xmm4	;# rsq*lu*lu 			
-	movss xmm0, [esp + nb410_half]
-	subss xmm1, xmm5	;# constant 30-rsq*lu*lu 
-	mulss xmm1, xmm2	
-	mulss xmm0, xmm1	;# xmm0=rinv 
-
-	mulss xmm4, xmm0	;# xmm4=r 
-	movss [esp + nb410_r], xmm4
-	mulss xmm4, [esp + nb410_gbscale]
-
-	cvttss2si ebx, xmm4     ;# mm6 contain lu indices 
-	cvtsi2ss xmm6, ebx
-	subss xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulss  xmm2, xmm2	;# xmm2=eps2 
-
-	shl ebx, 2
-	mov  esi, [ebp + nb410_GBtab]
-	
-	movaps xmm4, [esi + ebx*4]	
-	movhlps xmm6, xmm4
-	movaps xmm5, xmm4
-	movaps xmm7, xmm6
-	shufps xmm5, xmm5, 1
-	shufps xmm7, xmm7, 1
-	;# table ready in xmm4-xmm7 
-
-	mulss  xmm6, xmm1	;# xmm6=Geps 
-	mulss  xmm7, xmm2	;# xmm7=Heps2 
-	addss  xmm5, xmm6
-	addss  xmm5, xmm7	;# xmm5=Fp 	
-	mulss  xmm7, [esp + nb410_two]	;# two*Heps2 
-	movss xmm3, [esp + nb410_qq]
-	addss  xmm7, xmm6
-	addss  xmm7, xmm5 ;# xmm7=FF 
-	mulss  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addss  xmm5, xmm4 ;# xmm5=VV 
-	mulss  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulss  xmm3, xmm7 ;# fijC=FF*qq 
-
-	movd ebx, mm0
-	mov esi, [ebp + nb410_dvda]
-	
-	;# Calculate dVda
-	xorps xmm7, xmm7
-	mulss xmm3, [esp + nb410_gbscale]
-	movaps xmm6, xmm3
-	mulss  xmm6, [esp + nb410_r]
-	addss  xmm6, xmm5
-	addss  xmm5, [esp + nb410_vctot]
-	movss [esp + nb410_vctot], xmm5 
-
-	;# xmm6=(vcoul+fijC*r)
-	subps  xmm7, xmm6
-	movaps xmm6, xmm7
-	
-	;# update dvdasum
-	addps  xmm7, [esp + nb410_dvdasum]
-	movaps [esp + nb410_dvdasum], xmm7 
-
-	;# update j atoms dvdaj
-	addss  xmm6, [esi + ebx*4]
-	movss  [esi + ebx*4], xmm6
-	
-	;# L-J 
-	movaps xmm4, xmm0
-	mulss  xmm4, xmm0	;# xmm4=rinvsq 
-
-	movaps xmm6, xmm4
-	mulss  xmm6, xmm4
-
-	mulss  xmm6, xmm4	;# xmm6=rinvsix 
-	movaps xmm4, xmm6
-	mulss  xmm4, xmm4	;# xmm4=rinvtwelve 
-	mulss  xmm6, [esp + nb410_c6]
-	mulss  xmm4, [esp + nb410_c12]
-	movss xmm7, [esp + nb410_Vvdwtot]
-	addss  xmm7, xmm4
-	mulss  xmm4, [esp + nb410_twelve]
-	subss  xmm7, xmm6
-	mulss  xmm6, [esp + nb410_six]
-	movss [esp + nb410_Vvdwtot], xmm7
-	subss  xmm4, xmm6
-	mulss  xmm4, xmm0
-	subss  xmm4, xmm3
-	mulss  xmm4, xmm0
-
-	movss xmm0, [esp + nb410_dx]
-	movss xmm1, [esp + nb410_dy]
-	movss xmm2, [esp + nb410_dz]
-
-	mov    edi, [ebp + nb410_faction]
-	mulss  xmm0, xmm4
-	mulss  xmm1, xmm4
-	mulss  xmm2, xmm4
-	;# xmm0-xmm2 contains tx-tz (partial force) 
-	;# now update f_i 
-	movss xmm3, [esp + nb410_fix]
-	movss xmm4, [esp + nb410_fiy]
-	movss xmm5, [esp + nb410_fiz]
-	addss  xmm3, xmm0
-	addss  xmm4, xmm1
-	addss  xmm5, xmm2
-	movss [esp + nb410_fix], xmm3
-	movss [esp + nb410_fiy], xmm4
-	movss [esp + nb410_fiz], xmm5
-	;# update fj 
-	
-	movss   xmm3, [edi + eax*4]
-	movss   xmm4, [edi + eax*4 + 4]
-	movss   xmm5, [edi + eax*4 + 8]
-	subss   xmm3, xmm0
-	subss   xmm4, xmm1
-	subss   xmm5, xmm2	
-	movss   [edi + eax*4], xmm3
-	movss   [edi + eax*4 + 4], xmm4
-	movss   [edi + eax*4 + 8], xmm5	
-.nb410_updateouterdata:
-	mov   ecx, [esp + nb410_ii3]
-	mov   edi, [ebp + nb410_faction]
-	mov   esi, [ebp + nb410_fshift]
-	mov   edx, [esp + nb410_is3]
-
-	;# accumulate i forces in xmm0, xmm1, xmm2 
-	movaps xmm0, [esp + nb410_fix]
-	movaps xmm1, [esp + nb410_fiy]
-	movaps xmm2, [esp + nb410_fiz]
-
-	movhlps xmm3, xmm0
-	movhlps xmm4, xmm1
-	movhlps xmm5, xmm2
-	addps  xmm0, xmm3
-	addps  xmm1, xmm4
-	addps  xmm2, xmm5 ;# sum is in 1/2 in xmm0-xmm2 
-
-	movaps xmm3, xmm0	
-	movaps xmm4, xmm1	
-	movaps xmm5, xmm2	
-
-	shufps xmm3, xmm3, 1
-	shufps xmm4, xmm4, 1
-	shufps xmm5, xmm5, 1
-	addss  xmm0, xmm3
-	addss  xmm1, xmm4
-	addss  xmm2, xmm5	;# xmm0-xmm2 has single force in pos0 
-
-	;# increment i force 
-	movss  xmm3, [edi + ecx*4]
-	movss  xmm4, [edi + ecx*4 + 4]
-	movss  xmm5, [edi + ecx*4 + 8]
-	addss  xmm3, xmm0
-	addss  xmm4, xmm1
-	addss  xmm5, xmm2
-	movss  [edi + ecx*4],     xmm3
-	movss  [edi + ecx*4 + 4], xmm4
-	movss  [edi + ecx*4 + 8], xmm5
-
-	;# increment fshift force  
-	movss  xmm3, [esi + edx*4]
-	movss  xmm4, [esi + edx*4 + 4]
-	movss  xmm5, [esi + edx*4 + 8]
-	addss  xmm3, xmm0
-	addss  xmm4, xmm1
-	addss  xmm5, xmm2
-	movss  [esi + edx*4],     xmm3
-	movss  [esi + edx*4 + 4], xmm4
-	movss  [esi + edx*4 + 8], xmm5
-
-	;# get n from stack
-	mov esi, [esp + nb410_n]
-        ;# get group index for i particle 
-        mov   edx, [ebp + nb410_gid]      	;# base of gid[]
-        mov   edx, [edx + esi*4]		;# ggid=gid[n]
-
-	;# accumulate total potential energy and update it 
-	movaps xmm7, [esp + nb410_vctot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-
-	;# add earlier value from mem 
-	mov   eax, [ebp + nb410_Vc]
-	addss xmm7, [eax + edx*4] 
-	;# move back to mem 
-	movss [eax + edx*4], xmm7 
-	
-	;# accumulate total lj energy and update it 
-	movaps xmm7, [esp + nb410_Vvdwtot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-
-	;# add earlier value from mem 
-	mov   eax, [ebp + nb410_Vvdw]
-	addss xmm7, [eax + edx*4] 
-	;# move back to mem 
-	movss [eax + edx*4], xmm7 
-	
-	;# accumulate dVda and update it 
-	movaps xmm7, [esp + nb410_dvdasum]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-	
-	mov edx, [esp + nb410_ii]
-	mov eax, [ebp + nb410_dvda]
-	addss xmm7, [eax + edx*4]
-	movss [eax + edx*4], xmm7
-	
-        ;# finish if last 
-        mov ecx, [esp + nb410_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb410_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [esp + nb410_n], esi
-        jmp .nb410_outer
-.nb410_outerend:
-        ;# check if more outer neighborlists remain
-        mov   ecx, [esp + nb410_nri]
-	;# esi already loaded with n above
-        sub   ecx, esi
-        jz .nb410_end
-        ;# non-zero, do one more workunit
-        jmp   .nb410_threadloop
-.nb410_end:
-	emms
-
-	mov eax, [esp + nb410_nouter]
-	mov ebx, [esp + nb410_ninner]
-	mov ecx, [ebp + nb410_outeriter]
-	mov edx, [ebp + nb410_inneriter]
-	mov [ecx], eax
-	mov [edx], ebx
-
-	mov eax, [esp + nb410_salign]
-	add esp, eax
-	add esp, 504
-	pop edi
-	pop esi
-    	pop edx
-    	pop ecx
-    	pop ebx
-    	pop eax
-	leave
-	ret
-
-
-
-.globl nb_kernel410nf_ia32_sse
-.globl _nb_kernel410nf_ia32_sse
-nb_kernel410nf_ia32_sse:	
-_nb_kernel410nf_ia32_sse:	
-.equiv          nb410nf_p_nri,          8
-.equiv          nb410nf_iinr,           12
-.equiv          nb410nf_jindex,         16
-.equiv          nb410nf_jjnr,           20
-.equiv          nb410nf_shift,          24
-.equiv          nb410nf_shiftvec,       28
-.equiv          nb410nf_fshift,         32
-.equiv          nb410nf_gid,            36
-.equiv          nb410nf_pos,            40
-.equiv          nb410nf_faction,        44
-.equiv          nb410nf_charge,         48
-.equiv          nb410nf_p_facel,        52
-.equiv          nb410nf_argkrf,         56
-.equiv          nb410nf_argcrf,         60
-.equiv          nb410nf_Vc,             64
-.equiv          nb410nf_type,           68
-.equiv          nb410nf_p_ntype,        72
-.equiv          nb410nf_vdwparam,       76
-.equiv          nb410nf_Vvdw,           80
-.equiv          nb410nf_p_tabscale,     84
-.equiv          nb410nf_VFtab,          88
-.equiv          nb410nf_invsqrta,       92
-.equiv          nb410nf_dvda,           96
-.equiv          nb410nf_p_gbtabscale,   100
-.equiv          nb410nf_GBtab,          104
-.equiv          nb410nf_p_nthreads,     108
-.equiv          nb410nf_count,          112
-.equiv          nb410nf_mtx,            116
-.equiv          nb410nf_outeriter,      120
-.equiv          nb410nf_inneriter,      124
-.equiv          nb410nf_work,           128
-	;# stack offsets for local variables  
-	;# bottom of stack is cache-aligned for sse use 
-.equiv          nb410nf_ix,             0
-.equiv          nb410nf_iy,             16
-.equiv          nb410nf_iz,             32
-.equiv          nb410nf_iq,             48
-.equiv          nb410nf_gbtsc,          64
-.equiv          nb410nf_qq,             80
-.equiv          nb410nf_c6,             96
-.equiv          nb410nf_c12,            112
-.equiv          nb410nf_vctot,          128
-.equiv          nb410nf_Vvdwtot,        144
-.equiv          nb410nf_half,           160
-.equiv          nb410nf_three,          176
-.equiv          nb410nf_isai,           192
-.equiv          nb410nf_isaprod,        208
-.equiv          nb410nf_gbscale,        224
-.equiv          nb410nf_is3,            240
-.equiv          nb410nf_ii3,            244
-.equiv          nb410nf_ntia,           248
-.equiv          nb410nf_innerjjnr,      252
-.equiv          nb410nf_innerk,         256
-.equiv          nb410nf_n,              260
-.equiv          nb410nf_nn1,            264
-.equiv          nb410nf_nri,            268
-.equiv          nb410nf_facel,          272
-.equiv          nb410nf_ntype,          276
-.equiv          nb410nf_nouter,         280
-.equiv          nb410nf_ninner,         284
-.equiv          nb410nf_salign,         288
-	push ebp
-	mov ebp,esp	
-    	push eax
-    	push ebx
-    	push ecx
-    	push edx
-	push esi
-	push edi
-	sub esp, 292		;# local stack space 
-	mov  eax, esp
-	and  eax, 0xf
-	sub esp, eax
-	mov [esp + nb410nf_salign], eax
-
-	emms
-
-	;# Move args passed by reference to stack
-	mov ecx, [ebp + nb410nf_p_nri]
-	mov esi, [ebp + nb410nf_p_facel]
-	mov edi, [ebp + nb410nf_p_ntype]
-	mov ecx, [ecx]
-	mov esi, [esi]
-	mov edi, [edi]
-	mov [esp + nb410nf_nri], ecx
-	mov [esp + nb410nf_facel], esi
-	mov [esp + nb410nf_ntype], edi
-
-	;# zero iteration counters
-	mov eax, 0
-	mov [esp + nb410nf_nouter], eax
-	mov [esp + nb410nf_ninner], eax
-
-
-	mov eax, [ebp + nb410nf_p_gbtabscale]
-	movss xmm5, [eax]	
-	shufps xmm5, xmm5, 0
-	movaps [esp + nb410nf_gbtsc], xmm5
-
-	;# create constant floating-point factors on stack
-	mov eax, 0x3f000000     ;# constant 0.5 in IEEE (hex)
-	mov [esp + nb410nf_half], eax
-	movss xmm1, [esp + nb410nf_half]
-	shufps xmm1, xmm1, 0    ;# splat to all elements
-	movaps xmm2, xmm1       
-	addps  xmm2, xmm2	;# constant 1.0
-	movaps xmm3, xmm2
-	addps  xmm2, xmm2	;# constant 2.0
-	addps  xmm3, xmm2	;# constant 3.0
-	movaps [esp + nb410nf_half],  xmm1
-	movaps [esp + nb410nf_three],  xmm3
-
-.nb410nf_threadloop:
-        mov   esi, [ebp + nb410nf_count]          ;# pointer to sync counter
-        mov   eax, [esi]
-.nb410nf_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+10
-        lock
-        cmpxchg [esi], ebx                      ;# write nn1 to *counter,
-                                                ;# if it hasnt changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb410nf_spinlock
-
-        ;# if(nn1>nri) nn1=nri
-        mov ecx, [esp + nb410nf_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [esp + nb410nf_n], eax
-        mov [esp + nb410nf_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists
-	mov esi, eax				;# copy n to esi
-        jg  .nb410nf_outerstart
-        jmp .nb410nf_end
-
-.nb410nf_outerstart:
-	;# ebx contains number of outer iterations
-	add ebx, [esp + nb410nf_nouter]
-	mov [esp + nb410nf_nouter], ebx
-
-.nb410nf_outer:
-	mov   eax, [ebp + nb410nf_shift]      ;# eax = pointer into shift[] 
-	mov   ebx, [eax+esi*4]		;# ebx=shift[n] 
-	
-	lea   ebx, [ebx + ebx*2]    ;# ebx=3*is 
-	mov   [esp + nb410nf_is3],ebx    	;# store is3 
-
-	mov   eax, [ebp + nb410nf_shiftvec]   ;# eax = base of shiftvec[] 
-
-	movss xmm0, [eax + ebx*4]
-	movss xmm1, [eax + ebx*4 + 4]
-	movss xmm2, [eax + ebx*4 + 8] 
-
-	mov   ecx, [ebp + nb410nf_iinr]       ;# ecx = pointer into iinr[] 	
-	mov   ebx, [ecx + esi*4]	    ;# ebx =ii
-	
-	mov   edx, [ebp + nb410nf_charge]
-	movss xmm3, [edx + ebx*4]	
-	mulss xmm3, [esp + nb410nf_facel]
-	shufps xmm3, xmm3, 0
-
-	mov   edx, [ebp + nb410nf_invsqrta]	;# load invsqrta[ii]
-	movss xmm4, [edx + ebx*4]
-	shufps xmm4, xmm4, 0
-
-    	mov   edx, [ebp + nb410nf_type] 
-    	mov   edx, [edx + ebx*4]
-    	imul  edx, [esp + nb410nf_ntype]
-    	shl   edx, 1
-    	mov   [esp + nb410nf_ntia], edx
-		
-	lea   ebx, [ebx + ebx*2]	;# ebx = 3*ii=ii3 
-	mov   eax, [ebp + nb410nf_pos]    ;# eax = base of pos[]  
-
-	addss xmm0, [eax + ebx*4]
-	addss xmm1, [eax + ebx*4 + 4]
-	addss xmm2, [eax + ebx*4 + 8]
-
-	movaps [esp + nb410nf_iq], xmm3
-	movaps [esp + nb410nf_isai], xmm4
-	
-	shufps xmm0, xmm0, 0
-	shufps xmm1, xmm1, 0
-	shufps xmm2, xmm2, 0
-
-	movaps [esp + nb410nf_ix], xmm0
-	movaps [esp + nb410nf_iy], xmm1
-	movaps [esp + nb410nf_iz], xmm2
-
-	mov   [esp + nb410nf_ii3], ebx
-	
-	;# clear vctot
-	xorps xmm4, xmm4
-	movaps [esp + nb410nf_vctot], xmm4
-	movaps [esp + nb410nf_Vvdwtot], xmm4
-	
-	mov   eax, [ebp + nb410nf_jindex]
-	mov   ecx, [eax + esi*4]	     ;# jindex[n] 
-	mov   edx, [eax + esi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   esi, [ebp + nb410nf_pos]
-	mov   edi, [ebp + nb410nf_faction]	
-	mov   eax, [ebp + nb410nf_jjnr]
-	shl   ecx, 2
-	add   eax, ecx
-	mov   [esp + nb410nf_innerjjnr], eax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  4
-	add   ecx, [esp + nb410nf_ninner]
-	mov   [esp + nb410nf_ninner], ecx
-	add   edx, 0
-	mov   [esp + nb410nf_innerk], edx    ;# number of innerloop atoms 
-	jge   .nb410nf_unroll_loop
-	jmp   .nb410nf_finish_inner
-.nb410nf_unroll_loop:	
-	;# quad-unroll innerloop here 
-	mov   edx, [esp + nb410nf_innerjjnr]     ;# pointer to jjnr[k] 
-	mov   eax, [edx]	
-	mov   ebx, [edx + 4]              
-	mov   ecx, [edx + 8]            
-	mov   edx, [edx + 12]         ;# eax-edx=jnr1-4 
-	add dword ptr [esp + nb410nf_innerjjnr],  16 ;# advance pointer (unrolled 4) 
-
-	;# load isa2
-	mov esi, [ebp + nb410nf_invsqrta]
-	movss xmm3, [esi + eax*4]
-	movss xmm4, [esi + ecx*4]
-	movss xmm6, [esi + ebx*4]
-	movss xmm7, [esi + edx*4]
-	movaps xmm2, [esp + nb410nf_isai]
-	shufps xmm3, xmm6, 0 
-	shufps xmm4, xmm7, 0 
-	shufps xmm3, xmm4, 136  ;# constant 10001000 ;# all charges in xmm3  
-	mulps  xmm2, xmm3
-		
-	movaps [esp + nb410nf_isaprod], xmm2
-	movaps xmm1, xmm2
-	mulps xmm1, [esp + nb410nf_gbtsc]
-	movaps [esp + nb410nf_gbscale], xmm1
-	
-	mov esi, [ebp + nb410nf_charge]    ;# base of charge[] 
-	
-	movss xmm3, [esi + eax*4]
-	movss xmm4, [esi + ecx*4]
-	movss xmm6, [esi + ebx*4]
-	movss xmm7, [esi + edx*4]
-
-	mulps xmm2, [esp + nb410nf_iq]
-	shufps xmm3, xmm6, 0 
-	shufps xmm4, xmm7, 0 
-	shufps xmm3, xmm4, 136  ;# constant 10001000 ;# all charges in xmm3  
-	mulps  xmm3, xmm2
-	movaps [esp + nb410nf_qq], xmm3	
-
-	movd mm0, eax
-	movd mm1, ebx
-	movd mm2, ecx
-	movd mm3, edx
-	
-	mov esi, [ebp + nb410nf_type]
-	mov eax, [esi + eax*4]
-	mov ebx, [esi + ebx*4]
-	mov ecx, [esi + ecx*4]
-	mov edx, [esi + edx*4]
-	mov esi, [ebp + nb410nf_vdwparam]
-	shl eax, 1	
-	shl ebx, 1	
-	shl ecx, 1	
-	shl edx, 1	
-	mov edi, [esp + nb410nf_ntia]
-	add eax, edi
-	add ebx, edi
-	add ecx, edi
-	add edx, edi
-
-	movlps xmm6, [esi + eax*4]
-	movlps xmm7, [esi + ecx*4]
-	movhps xmm6, [esi + ebx*4]
-	movhps xmm7, [esi + edx*4]
-
-	movaps xmm4, xmm6
-	shufps xmm4, xmm7, 136  ;# constant 10001000
-	shufps xmm6, xmm7, 221  ;# constant 11011101
-	
-	movd  eax, mm0		
-	movd  ebx, mm1
-	movd  ecx, mm2
-	movd  edx, mm3
-
-	movaps [esp + nb410nf_c6], xmm4
-	movaps [esp + nb410nf_c12], xmm6
-	
-	mov esi, [ebp + nb410nf_pos]       ;# base of pos[] 
-
-	lea   eax, [eax + eax*2]     ;# replace jnr with j3 
-	lea   ebx, [ebx + ebx*2]	
-
-	lea   ecx, [ecx + ecx*2]     ;# replace jnr with j3 
-	lea   edx, [edx + edx*2]	
-
-	;# move four coordinates to xmm0-xmm2 	
-
-	movlps xmm4, [esi + eax*4]
-	movlps xmm5, [esi + ecx*4]
-	movss xmm2, [esi + eax*4 + 8]
-	movss xmm6, [esi + ecx*4 + 8]
-
-	movhps xmm4, [esi + ebx*4]
-	movhps xmm5, [esi + edx*4]
-
-	movss xmm0, [esi + ebx*4 + 8]
-	movss xmm1, [esi + edx*4 + 8]
-
-	shufps xmm2, xmm0, 0
-	shufps xmm6, xmm1, 0
-	
-	movaps xmm0, xmm4
-	movaps xmm1, xmm4
-
-	shufps xmm2, xmm6, 136  ;# constant 10001000
-	
-	shufps xmm0, xmm5, 136  ;# constant 10001000
-	shufps xmm1, xmm5, 221  ;# constant 11011101		
-
-	;# move ix-iz to xmm4-xmm6 
-	movaps xmm4, [esp + nb410nf_ix]
-	movaps xmm5, [esp + nb410nf_iy]
-	movaps xmm6, [esp + nb410nf_iz]
-
-	;# calc dr 
-	subps xmm4, xmm0
-	subps xmm5, xmm1
-	subps xmm6, xmm2
-
-	;# square it 
-	mulps xmm4,xmm4
-	mulps xmm5,xmm5
-	mulps xmm6,xmm6
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [esp + nb410nf_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [esp + nb410nf_half]
-	subps xmm1, xmm5	;# constant 30-rsq*lu*lu 
-	mulps xmm1, xmm2
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r 
-	mulps xmm4, [esp + nb410nf_gbscale]
-
-	movhlps xmm5, xmm4
-	cvttps2pi mm6, xmm4
-	cvttps2pi mm7, xmm5	;# mm6/mm7 contain lu indices 
-	cvtpi2ps xmm6, mm6
-	cvtpi2ps xmm5, mm7
-	movlhps xmm6, xmm5
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-	pslld mm6, 2
-	pslld mm7, 2
-
-	movd mm0, eax	
-	movd mm1, ebx
-	movd mm2, ecx
-	movd mm3, edx
-
-	mov  esi, [ebp + nb410nf_GBtab]
-	movd eax, mm6
-	psrlq mm6, 32
-	movd ecx, mm7
-	psrlq mm7, 32
-	movd ebx, mm6
-	movd edx, mm7
-
-	;# load coulomb table
-	movaps xmm4, [esi + eax*4]
-	movaps xmm5, [esi + ebx*4]
-	movaps xmm6, [esi + ecx*4]
-	movaps xmm7, [esi + edx*4]
-	;# transpose, using xmm3 for scratch
-	movaps xmm3, xmm6
-	shufps xmm3, xmm7, 0xEE 
-	shufps xmm6, xmm7, 0x44
-	movaps xmm7, xmm4
-	shufps xmm7, xmm5, 0xEE
-	shufps xmm4, xmm5, 0x44
-	movaps xmm5, xmm4
-	shufps xmm5, xmm6, 0xDD
-	shufps xmm4, xmm6, 0x88
-	movaps xmm6, xmm7
-	shufps xmm6, xmm3, 0x88
-	shufps xmm7, xmm3, 0xDD
-	;# coulomb table ready, in xmm4-xmm7  		
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	movaps xmm3, [esp + nb410nf_qq]
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, xmm3 ;# vcoul=qq*VV  
-	;# update vctot
-	addps  xmm5, [esp + nb410nf_vctot]
-	movaps [esp + nb410nf_vctot], xmm5	
-	
-	;# L-J 
-	movaps xmm4, xmm0
-	mulps  xmm4, xmm0	;# xmm4=rinvsq 
-
-	movaps xmm6, xmm4
-	mulps  xmm6, xmm4
-
-	mulps  xmm6, xmm4	;# xmm6=rinvsix 
-	movaps xmm4, xmm6
-	mulps  xmm4, xmm4	;# xmm4=rinvtwelve 
-	mulps  xmm6, [esp + nb410nf_c6]
-	mulps  xmm4, [esp + nb410nf_c12]
-	movaps xmm7, [esp + nb410nf_Vvdwtot]
-	addps  xmm7, xmm4
-	subps  xmm7, xmm6
-	movaps [esp + nb410nf_Vvdwtot], xmm7
-		
-	;# should we do one more iteration? 
-	sub dword ptr [esp + nb410nf_innerk],  4
-	jl    .nb410nf_finish_inner
-	jmp   .nb410nf_unroll_loop
-.nb410nf_finish_inner:
-	;# check if at least two particles remain 
-	add dword ptr [esp + nb410nf_innerk],  4
-	mov   edx, [esp + nb410nf_innerk]
-	and   edx, 2
-	jnz   .nb410nf_dopair
-	jmp   .nb410nf_checksingle
-.nb410nf_dopair:	
-	mov   ecx, [esp + nb410nf_innerjjnr]
-	mov   eax, [ecx]	
-	mov   ebx, [ecx + 4]              
-	add dword ptr [esp + nb410nf_innerjjnr],  8
-
-	xorps xmm2, xmm2
-	movaps xmm6, xmm2
-	
-	;# load isa2
-	mov esi, [ebp + nb410nf_invsqrta]
-	movss xmm2, [esi + eax*4]
-	movss xmm3, [esi + ebx*4]
-	unpcklps xmm2, xmm3	;# isa2 in xmm3(0,1)
-	mulps  xmm2, [esp + nb410nf_isai]
-	movaps [esp + nb410nf_isaprod], xmm2	
-	movaps xmm1, xmm2
-	mulps xmm1, [esp + nb410nf_gbtsc]
-	movaps [esp + nb410nf_gbscale], xmm1	
-	
-	mov esi, [ebp + nb410nf_charge]    ;# base of charge[] 	
-	movss xmm3, [esi + eax*4]		
-	movss xmm6, [esi + ebx*4]
-	unpcklps xmm3, xmm6 ;# constant 00001000 ;# xmm3(0,1) has the charges 
-
-	mulps  xmm2, [esp + nb410nf_iq]
-	mulps  xmm3, xmm2
-	movaps [esp + nb410nf_qq], xmm3
-
-	mov esi, [ebp + nb410nf_type]
-	mov   ecx, eax
-	mov   edx, ebx
-	mov ecx, [esi + ecx*4]
-	mov edx, [esi + edx*4]	
-	mov esi, [ebp + nb410nf_vdwparam]
-	shl ecx, 1	
-	shl edx, 1	
-	mov edi, [esp + nb410nf_ntia]
-	add ecx, edi
-	add edx, edi
-	movlps xmm6, [esi + ecx*4]
-	movhps xmm6, [esi + edx*4]
-	mov edi, [ebp + nb410nf_pos]	
-	
-	movaps xmm4, xmm6
-	shufps xmm4, xmm4, 8 ;# constant 00001000 	
-	shufps xmm6, xmm6, 13 ;# constant 00001101
-	movlhps xmm4, xmm7
-	movlhps xmm6, xmm7
-	
-	movaps [esp + nb410nf_c6], xmm4
-	movaps [esp + nb410nf_c12], xmm6	
-			
-	lea   eax, [eax + eax*2]
-	lea   ebx, [ebx + ebx*2]
-	;# move coordinates to xmm0-xmm2 
-	movlps xmm1, [edi + eax*4]
-	movss xmm2, [edi + eax*4 + 8]	
-	movhps xmm1, [edi + ebx*4]
-	movss xmm0, [edi + ebx*4 + 8]	
-
-	movlhps xmm3, xmm7
-	
-	shufps xmm2, xmm0, 0
-	
-	movaps xmm0, xmm1
-
-	shufps xmm2, xmm2, 136  ;# constant 10001000
-	
-	shufps xmm0, xmm0, 136  ;# constant 10001000
-	shufps xmm1, xmm1, 221  ;# constant 11011101
-			
-	mov    edi, [ebp + nb410nf_faction]
-	;# move ix-iz to xmm4-xmm6 
-	xorps   xmm7, xmm7
-	
-	movaps xmm4, [esp + nb410nf_ix]
-	movaps xmm5, [esp + nb410nf_iy]
-	movaps xmm6, [esp + nb410nf_iz]
-
-	;# calc dr 
-	subps xmm4, xmm0
-	subps xmm5, xmm1
-	subps xmm6, xmm2
-
-	;# square it 
-	mulps xmm4,xmm4
-	mulps xmm5,xmm5
-	mulps xmm6,xmm6
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [esp + nb410nf_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [esp + nb410nf_half]
-	subps xmm1, xmm5	;# constant 30-rsq*lu*lu 
-	mulps xmm1, xmm2	
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r 
-	mulps xmm4, [esp + nb410nf_gbscale]
-
-	cvttps2pi mm6, xmm4     ;# mm6 contain lu indices 
-	cvtpi2ps xmm6, mm6
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-
-	pslld mm6, 2
-
-	mov  esi, [ebp + nb410nf_GBtab]
-	movd ecx, mm6
-	psrlq mm6, 32
-	movd edx, mm6
-
-	;# load coulomb table
-	movaps xmm4, [esi + ecx*4]
-	movaps xmm7, [esi + edx*4]
-	;# transpose, using xmm3 for scratch
-	movaps xmm6, xmm4
-	unpcklps xmm4, xmm7  	;# Y1 Y2 F1 F2 
-	unpckhps xmm6, xmm7     ;# G1 G2 H1 H2
-	movhlps  xmm5, xmm4    	;# F1 F2 
-	movhlps  xmm7, xmm6     ;# H1 H2
-	;# coulomb table ready, in xmm4-xmm7  	
-
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	movaps xmm3, [esp + nb410nf_qq]
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, xmm3 ;# vcoul=qq*VV  
-
-	addps  xmm5, [esp + nb410nf_vctot]
-	movaps [esp + nb410nf_vctot], xmm5
-	
-	;# L-J 
-	movaps xmm4, xmm0
-	mulps  xmm4, xmm0	;# xmm4=rinvsq 
-
-	;# at this point mm5 contains vcoul and mm3 fijC 
-	;# increment vcoul - then we can get rid of mm5 
-	;# update vctot 
-
-	movaps xmm6, xmm4
-	mulps  xmm6, xmm4
-
-	mulps  xmm6, xmm4	;# xmm6=rinvsix 
-	movaps xmm4, xmm6
-	mulps  xmm4, xmm4	;# xmm4=rinvtwelve 
-	mulps  xmm6, [esp + nb410nf_c6]
-	mulps  xmm4, [esp + nb410nf_c12]
-	movaps xmm7, [esp + nb410nf_Vvdwtot]
-	addps  xmm7, xmm4
-	subps  xmm7, xmm6
-	movaps [esp + nb410nf_Vvdwtot], xmm7
-	
-.nb410nf_checksingle:				
-	mov   edx, [esp + nb410nf_innerk]
-	and   edx, 1
-	jnz    .nb410nf_dosingle
-	jmp    .nb410nf_updateouterdata
-.nb410nf_dosingle:
-	mov esi, [ebp + nb410nf_charge]
-	mov edx, [ebp + nb410nf_invsqrta]
-	mov edi, [ebp + nb410nf_pos]
-	mov   ecx, [esp + nb410nf_innerjjnr]
-	mov   eax, [ecx]	
-	xorps  xmm2, xmm2
-	movaps xmm6, xmm2
-	movss xmm2, [edx + eax*4]	;# isa2
-	mulss xmm2, [esp + nb410nf_isai]
-	movss [esp + nb410nf_isaprod], xmm2	
-	movss xmm1, xmm2
-	mulss xmm1, [esp + nb410nf_gbtsc]
-	movss [esp + nb410nf_gbscale], xmm1	
-	
-	mulss  xmm2, [esp + nb410nf_iq]
-	movss xmm6, [esi + eax*4]	;# xmm6(0) has the charge 	
-	mulss  xmm6, xmm2
-	movss [esp + nb410nf_qq], xmm6
-		
-	mov esi, [ebp + nb410nf_type]
-	mov ecx, eax
-	mov ecx, [esi + ecx*4]	
-	mov esi, [ebp + nb410nf_vdwparam]
-	shl ecx, 1
-	add ecx, [esp + nb410nf_ntia]
-	movlps xmm6, [esi + ecx*4]
-	movaps xmm4, xmm6
-	shufps xmm4, xmm4, 252  ;# constant 11111100	
-	shufps xmm6, xmm6, 253  ;# constant 11111101	
-			
-	movaps [esp + nb410nf_c6], xmm4
-	movaps [esp + nb410nf_c12], xmm6	
-		
-	lea   eax, [eax + eax*2]
-	
-	;# move coordinates to xmm0-xmm2 
-	movss xmm0, [edi + eax*4]	
-	movss xmm1, [edi + eax*4 + 4]	
-	movss xmm2, [edi + eax*4 + 8]	 
-	
-	movaps xmm4, [esp + nb410nf_ix]
-	movaps xmm5, [esp + nb410nf_iy]
-	movaps xmm6, [esp + nb410nf_iz]
-
-	;# calc dr 
-	subss xmm4, xmm0
-	subss xmm5, xmm1
-	subss xmm6, xmm2
-
-	;# square it 
-	mulss xmm4,xmm4
-	mulss xmm5,xmm5
-	mulss xmm6,xmm6
-	addss xmm4, xmm5
-	addss xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtss xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulss xmm5, xmm5
-	movss xmm1, [esp + nb410nf_three]
-	mulss xmm5, xmm4	;# rsq*lu*lu 			
-	movss xmm0, [esp + nb410nf_half]
-	subss xmm1, xmm5	;# constant 30-rsq*lu*lu 
-	mulss xmm1, xmm2	
-	mulss xmm0, xmm1	;# xmm0=rinv 
-
-	mulss xmm4, xmm0	;# xmm4=r 
-	mulss xmm4, [esp + nb410nf_gbscale]
-
-	cvttss2si ebx, xmm4     ;# mm6 contain lu indices 
-	cvtsi2ss xmm6, ebx
-	subss xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulss  xmm2, xmm2	;# xmm2=eps2 
-
-	shl ebx, 2
-	mov  esi, [ebp + nb410nf_GBtab]
-	
-	movaps xmm4, [esi + ebx*4]	
-	movhlps xmm6, xmm4
-	movaps xmm5, xmm4
-	movaps xmm7, xmm6
-	shufps xmm5, xmm5, 1
-	shufps xmm7, xmm7, 1
-	;# table ready in xmm4-xmm7 
-
-	mulss  xmm6, xmm1	;# xmm6=Geps 
-	mulss  xmm7, xmm2	;# xmm7=Heps2 
-	addss  xmm5, xmm6
-	addss  xmm5, xmm7	;# xmm5=Fp 	
-	movss xmm3, [esp + nb410nf_qq]
-	mulss  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addss  xmm5, xmm4 ;# xmm5=VV 
-	mulss  xmm5, xmm3 ;# vcoul=qq*VV  
-	addss  xmm5, [esp + nb410nf_vctot]
-	movss [esp + nb410nf_vctot], xmm5 	
-	
-	;# L-J 
-	movaps xmm4, xmm0
-	mulss  xmm4, xmm0	;# xmm4=rinvsq 
-
-	movaps xmm6, xmm4
-	mulss  xmm6, xmm4
-
-	mulss  xmm6, xmm4	;# xmm6=rinvsix 
-	movaps xmm4, xmm6
-	mulss  xmm4, xmm4	;# xmm4=rinvtwelve 
-	mulss  xmm6, [esp + nb410nf_c6]
-	mulss  xmm4, [esp + nb410nf_c12]
-	movss xmm7, [esp + nb410nf_Vvdwtot]
-	addps  xmm7, xmm4
-	subps  xmm7, xmm6
-	movss [esp + nb410nf_Vvdwtot], xmm7
-	
-.nb410nf_updateouterdata:
-	;# get n from stack
-	mov esi, [esp + nb410nf_n]
-        ;# get group index for i particle 
-        mov   edx, [ebp + nb410nf_gid]      	;# base of gid[]
-        mov   edx, [edx + esi*4]		;# ggid=gid[n]
-
-	;# accumulate total potential energy and update it 
-	movaps xmm7, [esp + nb410nf_vctot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-
-	;# add earlier value from mem 
-	mov   eax, [ebp + nb410nf_Vc]
-	addss xmm7, [eax + edx*4] 
-	;# move back to mem 
-	movss [eax + edx*4], xmm7 
-	
-	;# accumulate total lj energy and update it 
-	movaps xmm7, [esp + nb410nf_Vvdwtot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-
-	;# add earlier value from mem 
-	mov   eax, [ebp + nb410nf_Vvdw]
-	addss xmm7, [eax + edx*4] 
-	;# move back to mem 
-	movss [eax + edx*4], xmm7 
-		
-        ;# finish if last 
-        mov ecx, [esp + nb410nf_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb410nf_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [esp + nb410nf_n], esi
-        jmp .nb410nf_outer
-.nb410nf_outerend:
-        ;# check if more outer neighborlists remain
-        mov   ecx, [esp + nb410nf_nri]
-	;# esi already loaded with n above
-        sub   ecx, esi
-        jz .nb410nf_end
-        ;# non-zero, do one more workunit
-        jmp   .nb410nf_threadloop
-.nb410nf_end:
-	emms
-
-	mov eax, [esp + nb410nf_nouter]
-	mov ebx, [esp + nb410nf_ninner]
-	mov ecx, [ebp + nb410nf_outeriter]
-	mov edx, [ebp + nb410nf_inneriter]
-	mov [ecx], eax
-	mov [edx], ebx
-
-	mov eax, [esp + nb410nf_salign]
-	add esp, eax
-	add esp, 292
-	pop edi
-	pop esi
-    	pop edx
-    	pop ecx
-    	pop ebx
-    	pop eax
-	leave
-	ret
diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel410_ia32_sse.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel410_ia32_sse.s
deleted file mode 100644
index 0c05ad5c91..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel410_ia32_sse.s
+++ /dev/null
@@ -1,2022 +0,0 @@
-##
-##
-## Gromacs 4.0                         Copyright (c) 1991-2003 
-## David van der Spoel, Erik Lindahl
-##
-## This program is free software; you can redistribute it and/or
-## modify it under the terms of the GNU General Public License
-## as published by the Free Software Foundation; either version 2
-## of the License, or (at your option) any later version.
-##
-## To help us fund GROMACS development, we humbly ask that you cite
-## the research papers on the package. Check out http://www.gromacs.org
-## 
-## And Hey:
-## Gnomes, ROck Monsters And Chili Sauce
-##
-
-
-
-.globl nb_kernel410_ia32_sse
-.globl _nb_kernel410_ia32_sse
-nb_kernel410_ia32_sse:  
-_nb_kernel410_ia32_sse: 
-.set nb410_p_nri, 8
-.set nb410_iinr, 12
-.set nb410_jindex, 16
-.set nb410_jjnr, 20
-.set nb410_shift, 24
-.set nb410_shiftvec, 28
-.set nb410_fshift, 32
-.set nb410_gid, 36
-.set nb410_pos, 40
-.set nb410_faction, 44
-.set nb410_charge, 48
-.set nb410_p_facel, 52
-.set nb410_argkrf, 56
-.set nb410_argcrf, 60
-.set nb410_Vc, 64
-.set nb410_type, 68
-.set nb410_p_ntype, 72
-.set nb410_vdwparam, 76
-.set nb410_Vvdw, 80
-.set nb410_p_tabscale, 84
-.set nb410_VFtab, 88
-.set nb410_invsqrta, 92
-.set nb410_dvda, 96
-.set nb410_p_gbtabscale, 100
-.set nb410_GBtab, 104
-.set nb410_p_nthreads, 108
-.set nb410_count, 112
-.set nb410_mtx, 116
-.set nb410_outeriter, 120
-.set nb410_inneriter, 124
-.set nb410_work, 128
-        ## stack offsets for local variables  
-        ## bottom of stack is cache-aligned for sse use 
-.set nb410_ix, 0
-.set nb410_iy, 16
-.set nb410_iz, 32
-.set nb410_iq, 48
-.set nb410_dx, 64
-.set nb410_dy, 80
-.set nb410_dz, 96
-.set nb410_two, 112
-.set nb410_six, 128
-.set nb410_twelve, 144
-.set nb410_gbtsc, 160
-.set nb410_qq, 176
-.set nb410_c6, 192
-.set nb410_c12, 208
-.set nb410_fscal, 224
-.set nb410_vctot, 240
-.set nb410_Vvdwtot, 256
-.set nb410_fix, 272
-.set nb410_fiy, 288
-.set nb410_fiz, 304
-.set nb410_half, 320
-.set nb410_three, 336
-.set nb410_r, 352
-.set nb410_isai, 368
-.set nb410_isaprod, 384
-.set nb410_dvdasum, 400
-.set nb410_gbscale, 416
-.set nb410_is3, 432
-.set nb410_ii3, 436
-.set nb410_ii, 440
-.set nb410_ntia, 444
-.set nb410_innerjjnr, 448
-.set nb410_innerk, 452
-.set nb410_n, 456
-.set nb410_nn1, 460
-.set nb410_jnra, 464
-.set nb410_jnrb, 468
-.set nb410_jnrc, 472
-.set nb410_jnrd, 476
-.set nb410_nri, 480
-.set nb410_facel, 484
-.set nb410_ntype, 488
-.set nb410_nouter, 492
-.set nb410_ninner, 496
-.set nb410_salign, 500
-        pushl %ebp
-        movl %esp,%ebp
-        pushl %eax
-        pushl %ebx
-        pushl %ecx
-        pushl %edx
-        pushl %esi
-        pushl %edi
-        subl $504,%esp          ## local stack space 
-        movl %esp,%eax
-        andl $0xf,%eax
-        subl %eax,%esp
-        movl %eax,nb410_salign(%esp)
-
-        emms
-
-        ## Move args passed by reference to stack
-        movl nb410_p_nri(%ebp),%ecx
-        movl nb410_p_facel(%ebp),%esi
-        movl nb410_p_ntype(%ebp),%edi
-        movl (%ecx),%ecx
-        movl (%esi),%esi
-        movl (%edi),%edi
-        movl %ecx,nb410_nri(%esp)
-        movl %esi,nb410_facel(%esp)
-        movl %edi,nb410_ntype(%esp)
-
-        ## zero iteration counters
-        movl $0,%eax
-        movl %eax,nb410_nouter(%esp)
-        movl %eax,nb410_ninner(%esp)
-
-
-        movl nb410_p_gbtabscale(%ebp),%eax
-        movss (%eax),%xmm5
-        shufps $0,%xmm5,%xmm5
-        movaps %xmm5,nb410_gbtsc(%esp)
-
-        ## create constant floating-point factors on stack
-        movl $0x3f000000,%eax   ## constant 0.5 in IEEE (hex)
-        movl %eax,nb410_half(%esp)
-        movss nb410_half(%esp),%xmm1
-        shufps $0,%xmm1,%xmm1  ## splat to all elements
-        movaps %xmm1,%xmm2
-        addps  %xmm2,%xmm2      ## constant 1.0
-        movaps %xmm2,%xmm3
-        addps  %xmm2,%xmm2      ## constant 2.0
-        addps  %xmm2,%xmm3      ## constant 3.0
-        movaps %xmm3,%xmm4
-        addps  %xmm4,%xmm4      ## 6.0
-        movaps %xmm4,%xmm5
-        addps  %xmm5,%xmm5      ## constant 12.0
-        movaps %xmm1,nb410_half(%esp)
-        movaps %xmm2,nb410_two(%esp)
-        movaps %xmm3,nb410_three(%esp)
-        movaps %xmm4,nb410_six(%esp)
-        movaps %xmm5,nb410_twelve(%esp)
-
-_nb_kernel410_ia32_sse.nb410_threadloop: 
-        movl  nb410_count(%ebp),%esi            ## pointer to sync counter
-        movl  (%esi),%eax
-_nb_kernel410_ia32_sse.nb410_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+10
-        lock 
-        cmpxchgl %ebx,(%esi)                    ## write nn1 to *counter,
-                                                ## if it hasnt changed.
-                                                ## or reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel410_ia32_sse.nb410_spinlock
-
-        ## if(nn1>nri) nn1=nri
-        movl nb410_nri(%esp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx
-        cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb410_n(%esp)
-        movl %ebx,nb410_nn1(%esp)
-        subl %eax,%ebx                          ## calc number of outer lists
-        movl %eax,%esi                          ## copy n to esi
-        jg  _nb_kernel410_ia32_sse.nb410_outerstart
-        jmp _nb_kernel410_ia32_sse.nb410_end
-
-_nb_kernel410_ia32_sse.nb410_outerstart: 
-        ## ebx contains number of outer iterations
-        addl nb410_nouter(%esp),%ebx
-        movl %ebx,nb410_nouter(%esp)
-
-_nb_kernel410_ia32_sse.nb410_outer: 
-        movl  nb410_shift(%ebp),%eax        ## eax = pointer into shift[] 
-        movl  (%eax,%esi,4),%ebx        ## ebx=shift[n] 
-
-        leal  (%ebx,%ebx,2),%ebx    ## ebx=3*is 
-        movl  %ebx,nb410_is3(%esp)      ## store is3 
-
-        movl  nb410_shiftvec(%ebp),%eax     ## eax = base of shiftvec[] 
-
-        movss (%eax,%ebx,4),%xmm0
-        movss 4(%eax,%ebx,4),%xmm1
-        movss 8(%eax,%ebx,4),%xmm2
-
-        movl  nb410_iinr(%ebp),%ecx         ## ecx = pointer into iinr[]        
-        movl  (%ecx,%esi,4),%ebx            ## ebx =ii 
-        movl  %ebx,nb410_ii(%esp)
-
-        movl  nb410_charge(%ebp),%edx
-        movss (%edx,%ebx,4),%xmm3
-        mulss nb410_facel(%esp),%xmm3
-        shufps $0,%xmm3,%xmm3
-
-        movl  nb410_invsqrta(%ebp),%edx         ## load invsqrta[ii]
-        movss (%edx,%ebx,4),%xmm4
-        shufps $0,%xmm4,%xmm4
-
-        movl  nb410_type(%ebp),%edx
-        movl  (%edx,%ebx,4),%edx
-        imull nb410_ntype(%esp),%edx
-        shll  %edx
-        movl  %edx,nb410_ntia(%esp)
-
-        leal  (%ebx,%ebx,2),%ebx        ## ebx = 3*ii=ii3 
-        movl  nb410_pos(%ebp),%eax      ## eax = base of pos[]  
-
-        addss (%eax,%ebx,4),%xmm0
-        addss 4(%eax,%ebx,4),%xmm1
-        addss 8(%eax,%ebx,4),%xmm2
-
-        movaps %xmm3,nb410_iq(%esp)
-        movaps %xmm4,nb410_isai(%esp)
-
-        shufps $0,%xmm0,%xmm0
-        shufps $0,%xmm1,%xmm1
-        shufps $0,%xmm2,%xmm2
-
-        movaps %xmm0,nb410_ix(%esp)
-        movaps %xmm1,nb410_iy(%esp)
-        movaps %xmm2,nb410_iz(%esp)
-
-        movl  %ebx,nb410_ii3(%esp)
-
-        ## clear vctot and i forces 
-        xorps %xmm4,%xmm4
-        movaps %xmm4,nb410_vctot(%esp)
-        movaps %xmm4,nb410_Vvdwtot(%esp)
-        movaps %xmm4,nb410_dvdasum(%esp)
-        movaps %xmm4,nb410_fix(%esp)
-        movaps %xmm4,nb410_fiy(%esp)
-        movaps %xmm4,nb410_fiz(%esp)
-
-        movl  nb410_jindex(%ebp),%eax
-        movl  (%eax,%esi,4),%ecx             ## jindex[n] 
-        movl  4(%eax,%esi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movl  nb410_pos(%ebp),%esi
-        movl  nb410_faction(%ebp),%edi
-        movl  nb410_jjnr(%ebp),%eax
-        shll  $2,%ecx
-        addl  %ecx,%eax
-        movl  %eax,nb410_innerjjnr(%esp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $4,%edx
-        addl  nb410_ninner(%esp),%ecx
-        movl  %ecx,nb410_ninner(%esp)
-        addl  $0,%edx
-        movl  %edx,nb410_innerk(%esp)      ## number of innerloop atoms 
-        jge   _nb_kernel410_ia32_sse.nb410_unroll_loop
-        jmp   _nb_kernel410_ia32_sse.nb410_finish_inner
-_nb_kernel410_ia32_sse.nb410_unroll_loop: 
-        ## quad-unroll innerloop here 
-        movl  nb410_innerjjnr(%esp),%edx       ## pointer to jjnr[k] 
-        movl  (%edx),%eax
-        movl  4(%edx),%ebx
-        movl  8(%edx),%ecx
-        movl  12(%edx),%edx           ## eax-edx=jnr1-4 
-        addl $16,nb410_innerjjnr(%esp)             ## advance pointer (unrolled 4) 
-
-        ## load isaj
-        movl nb410_invsqrta(%ebp),%esi
-        movss (%esi,%eax,4),%xmm3
-        movss (%esi,%ecx,4),%xmm4
-        movss (%esi,%ebx,4),%xmm6
-        movss (%esi,%edx,4),%xmm7
-        movaps nb410_isai(%esp),%xmm2
-        shufps $0,%xmm6,%xmm3
-        shufps $0,%xmm7,%xmm4
-        shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all isaj in xmm3
-        mulps  %xmm3,%xmm2
-
-        movaps %xmm2,nb410_isaprod(%esp)
-        movaps %xmm2,%xmm1
-        mulps nb410_gbtsc(%esp),%xmm1
-        movaps %xmm1,nb410_gbscale(%esp)
-
-        movl nb410_charge(%ebp),%esi     ## base of charge[] 
-
-        movss (%esi,%eax,4),%xmm3
-        movss (%esi,%ecx,4),%xmm4
-        movss (%esi,%ebx,4),%xmm6
-        movss (%esi,%edx,4),%xmm7
-
-        mulps nb410_iq(%esp),%xmm2
-        shufps $0,%xmm6,%xmm3
-        shufps $0,%xmm7,%xmm4
-        shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all charges in xmm3  
-        mulps  %xmm2,%xmm3
-        movaps %xmm3,nb410_qq(%esp)
-
-        movd %eax,%mm0
-        movd %ebx,%mm1
-        movd %ecx,%mm2
-        movd %edx,%mm3
-
-        movl nb410_type(%ebp),%esi
-        movl (%esi,%eax,4),%eax
-        movl (%esi,%ebx,4),%ebx
-        movl (%esi,%ecx,4),%ecx
-        movl (%esi,%edx,4),%edx
-        movl nb410_vdwparam(%ebp),%esi
-        shll %eax
-        shll %ebx
-        shll %ecx
-        shll %edx
-        movl nb410_ntia(%esp),%edi
-        addl %edi,%eax
-        addl %edi,%ebx
-        addl %edi,%ecx
-        addl %edi,%edx
-
-        movlps (%esi,%eax,4),%xmm6
-        movlps (%esi,%ecx,4),%xmm7
-        movhps (%esi,%ebx,4),%xmm6
-        movhps (%esi,%edx,4),%xmm7
-
-        movaps %xmm6,%xmm4
-        shufps $136,%xmm7,%xmm4 ## constant 10001000
-        shufps $221,%xmm7,%xmm6 ## constant 11011101
-
-        movd  %mm0,%eax
-        movd  %mm1,%ebx
-        movd  %mm2,%ecx
-        movd  %mm3,%edx
-
-        movaps %xmm4,nb410_c6(%esp)
-        movaps %xmm6,nb410_c12(%esp)
-
-        movl nb410_pos(%ebp),%esi        ## base of pos[] 
-
-        movl %eax,nb410_jnra(%esp)
-        movl %ebx,nb410_jnrb(%esp)
-        movl %ecx,nb410_jnrc(%esp)
-        movl %edx,nb410_jnrd(%esp)
-
-        leal  (%eax,%eax,2),%eax     ## replace jnr with j3 
-        leal  (%ebx,%ebx,2),%ebx
-
-        leal  (%ecx,%ecx,2),%ecx     ## replace jnr with j3 
-        leal  (%edx,%edx,2),%edx
-
-        ## move four coordinates to xmm0-xmm2   
-
-        movlps (%esi,%eax,4),%xmm4
-        movlps (%esi,%ecx,4),%xmm5
-        movss 8(%esi,%eax,4),%xmm2
-        movss 8(%esi,%ecx,4),%xmm6
-
-        movhps (%esi,%ebx,4),%xmm4
-        movhps (%esi,%edx,4),%xmm5
-
-        movss 8(%esi,%ebx,4),%xmm0
-        movss 8(%esi,%edx,4),%xmm1
-
-        shufps $0,%xmm0,%xmm2
-        shufps $0,%xmm1,%xmm6
-
-        movaps %xmm4,%xmm0
-        movaps %xmm4,%xmm1
-
-        shufps $136,%xmm6,%xmm2 ## constant 10001000
-
-        shufps $136,%xmm5,%xmm0 ## constant 10001000
-        shufps $221,%xmm5,%xmm1 ## constant 11011101            
-
-        ## move ix-iz to xmm4-xmm6 
-        movaps nb410_ix(%esp),%xmm4
-        movaps nb410_iy(%esp),%xmm5
-        movaps nb410_iz(%esp),%xmm6
-
-        ## calc dr 
-        subps %xmm0,%xmm4
-        subps %xmm1,%xmm5
-        subps %xmm2,%xmm6
-
-        ## store dr 
-        movaps %xmm4,nb410_dx(%esp)
-        movaps %xmm5,nb410_dy(%esp)
-        movaps %xmm6,nb410_dz(%esp)
-        ## square it 
-        mulps %xmm4,%xmm4
-        mulps %xmm5,%xmm5
-        mulps %xmm6,%xmm6
-        addps %xmm5,%xmm4
-        addps %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb410_three(%esp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb410_half(%esp),%xmm0
-        subps %xmm5,%xmm1       ## constant 30-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r 
-        movaps %xmm4,nb410_r(%esp)
-        mulps nb410_gbscale(%esp),%xmm4
-
-        movhlps %xmm4,%xmm5
-        cvttps2pi %xmm4,%mm6
-        cvttps2pi %xmm5,%mm7    ## mm6/mm7 contain lu indices 
-        cvtpi2ps %mm6,%xmm6
-        cvtpi2ps %mm7,%xmm5
-        movlhps %xmm5,%xmm6
-        subps %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-        pslld $2,%mm6
-        pslld $2,%mm7
-
-        movd %eax,%mm0
-        movd %ebx,%mm1
-        movd %ecx,%mm2
-        movd %edx,%mm3
-
-        movl nb410_GBtab(%ebp),%esi
-        movd %mm6,%eax
-        psrlq $32,%mm6
-        movd %mm7,%ecx
-        psrlq $32,%mm7
-        movd %mm6,%ebx
-        movd %mm7,%edx
-
-        ## load coulomb table
-        movaps (%esi,%eax,4),%xmm4
-        movaps (%esi,%ebx,4),%xmm5
-        movaps (%esi,%ecx,4),%xmm6
-        movaps (%esi,%edx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm6,%xmm3
-        shufps $0xEE,%xmm7,%xmm3
-        shufps $0x44,%xmm7,%xmm6
-        movaps %xmm4,%xmm7
-        shufps $0xEE,%xmm5,%xmm7
-        shufps $0x44,%xmm5,%xmm4
-        movaps %xmm4,%xmm5
-        shufps $0xDD,%xmm6,%xmm5
-        shufps $0x88,%xmm6,%xmm4
-        movaps %xmm7,%xmm6
-        shufps $0x88,%xmm3,%xmm6
-        shufps $0xDD,%xmm3,%xmm7
-        ## coulomb table ready, in xmm4-xmm7            
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        mulps  nb410_two(%esp),%xmm7    ## two*Heps2 
-        movaps nb410_qq(%esp),%xmm3
-        addps  %xmm6,%xmm7
-        addps  %xmm5,%xmm7 ## xmm7=FF 
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulps  %xmm7,%xmm3 ## fijC=FF*qq
-        ## get jnr from stack
-        movl nb410_jnra(%esp),%eax
-        movl nb410_jnrb(%esp),%ebx
-        movl nb410_jnrc(%esp),%ecx
-        movl nb410_jnrd(%esp),%edx
-
-        movl nb410_dvda(%ebp),%esi
-
-        ## Calculate dVda
-        xorps %xmm7,%xmm7
-        mulps nb410_gbscale(%esp),%xmm3
-        movaps %xmm3,%xmm6
-        mulps  nb410_r(%esp),%xmm6
-        addps  %xmm5,%xmm6
-        addps  nb410_vctot(%esp),%xmm5
-        movaps %xmm5,nb410_vctot(%esp)
-
-        ## xmm6=(vcoul+fijC*r)
-        subps  %xmm6,%xmm7
-        movaps %xmm7,%xmm6
-
-        ## update dvdasum
-        addps  nb410_dvdasum(%esp),%xmm7
-        movaps %xmm7,nb410_dvdasum(%esp)
-
-        ## update j atoms dvdaj
-        movhlps %xmm6,%xmm7
-        movaps  %xmm6,%xmm5
-        movaps  %xmm7,%xmm4
-        shufps $0x1,%xmm5,%xmm5
-        shufps $0x1,%xmm4,%xmm4
-        ## xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4
-        addss  (%esi,%eax,4),%xmm6
-        addss  (%esi,%ebx,4),%xmm5
-        addss  (%esi,%ecx,4),%xmm7
-        addss  (%esi,%edx,4),%xmm4
-        movss  %xmm6,(%esi,%eax,4)
-        movss  %xmm5,(%esi,%ebx,4)
-        movss  %xmm7,(%esi,%ecx,4)
-        movss  %xmm4,(%esi,%edx,4)
-
-        ## L-J 
-        movaps %xmm0,%xmm4
-        mulps  %xmm0,%xmm4      ## xmm4=rinvsq 
-
-        movaps %xmm4,%xmm6
-        mulps  %xmm4,%xmm6
-
-        mulps  %xmm4,%xmm6      ## xmm6=rinvsix 
-        movaps %xmm6,%xmm4
-        mulps  %xmm4,%xmm4      ## xmm4=rinvtwelve 
-        mulps  nb410_c6(%esp),%xmm6
-        mulps  nb410_c12(%esp),%xmm4
-        movaps nb410_Vvdwtot(%esp),%xmm7
-        addps  %xmm4,%xmm7
-        mulps  nb410_twelve(%esp),%xmm4
-        subps  %xmm6,%xmm7
-        mulps  nb410_six(%esp),%xmm6
-        movaps %xmm7,nb410_Vvdwtot(%esp)
-        subps  %xmm6,%xmm4
-        mulps  %xmm0,%xmm4
-        subps  %xmm3,%xmm4
-        mulps  %xmm0,%xmm4
-
-        movaps nb410_dx(%esp),%xmm0
-        movaps nb410_dy(%esp),%xmm1
-        movaps nb410_dz(%esp),%xmm2
-
-        movd %mm0,%eax
-        movd %mm1,%ebx
-        movd %mm2,%ecx
-        movd %mm3,%edx
-
-        movl   nb410_faction(%ebp),%edi
-        mulps  %xmm4,%xmm0
-        mulps  %xmm4,%xmm1
-        mulps  %xmm4,%xmm2
-        ## xmm0-xmm2 contains tx-tz (partial force) 
-        ## now update f_i 
-        movaps nb410_fix(%esp),%xmm3
-        movaps nb410_fiy(%esp),%xmm4
-        movaps nb410_fiz(%esp),%xmm5
-        addps  %xmm0,%xmm3
-        addps  %xmm1,%xmm4
-        addps  %xmm2,%xmm5
-        movaps %xmm3,nb410_fix(%esp)
-        movaps %xmm4,nb410_fiy(%esp)
-        movaps %xmm5,nb410_fiz(%esp)
-        ## the fj's - start by accumulating x & y forces from memory 
-        movlps (%edi,%eax,4),%xmm4
-        movlps (%edi,%ecx,4),%xmm6
-        movhps (%edi,%ebx,4),%xmm4
-        movhps (%edi,%edx,4),%xmm6
-
-        movaps %xmm4,%xmm3
-        shufps $136,%xmm6,%xmm3 ## constant 10001000
-        shufps $221,%xmm6,%xmm4 ## constant 11011101                          
-
-        ## now xmm3-xmm5 contains fjx, fjy, fjz 
-        subps  %xmm0,%xmm3
-        subps  %xmm1,%xmm4
-
-        ## unpack them back so we can store them - first x & y in xmm3/xmm4 
-
-        movaps %xmm3,%xmm6
-        unpcklps %xmm4,%xmm6
-        unpckhps %xmm4,%xmm3
-        ## xmm6(l)=x & y for j1, (h) for j2 
-        ## xmm3(l)=x & y for j3, (h) for j4 
-        movlps %xmm6,(%edi,%eax,4)
-        movlps %xmm3,(%edi,%ecx,4)
-
-        movhps %xmm6,(%edi,%ebx,4)
-        movhps %xmm3,(%edi,%edx,4)
-
-        ## and the z forces 
-        movss  8(%edi,%eax,4),%xmm4
-        movss  8(%edi,%ebx,4),%xmm5
-        movss  8(%edi,%ecx,4),%xmm6
-        movss  8(%edi,%edx,4),%xmm7
-        subss  %xmm2,%xmm4
-        shufps $229,%xmm2,%xmm2 ## constant 11100101
-        subss  %xmm2,%xmm5
-        shufps $234,%xmm2,%xmm2 ## constant 11101010
-        subss  %xmm2,%xmm6
-        shufps $255,%xmm2,%xmm2 ## constant 11111111
-        subss  %xmm2,%xmm7
-        movss  %xmm4,8(%edi,%eax,4)
-        movss  %xmm5,8(%edi,%ebx,4)
-        movss  %xmm6,8(%edi,%ecx,4)
-        movss  %xmm7,8(%edi,%edx,4)
-
-        ## should we do one more iteration? 
-        subl $4,nb410_innerk(%esp)
-        jl    _nb_kernel410_ia32_sse.nb410_finish_inner
-        jmp   _nb_kernel410_ia32_sse.nb410_unroll_loop
-_nb_kernel410_ia32_sse.nb410_finish_inner: 
-        ## check if at least two particles remain 
-        addl $4,nb410_innerk(%esp)
-        movl  nb410_innerk(%esp),%edx
-        andl  $2,%edx
-        jnz   _nb_kernel410_ia32_sse.nb410_dopair
-        jmp   _nb_kernel410_ia32_sse.nb410_checksingle
-_nb_kernel410_ia32_sse.nb410_dopair: 
-        movl  nb410_innerjjnr(%esp),%ecx
-        movl  (%ecx),%eax
-        movl  4(%ecx),%ebx
-        addl $8,nb410_innerjjnr(%esp)
-
-        xorps %xmm2,%xmm2
-        movaps %xmm2,%xmm6
-
-        ## load isaj
-        movl nb410_invsqrta(%ebp),%esi
-        movss (%esi,%eax,4),%xmm2
-        movss (%esi,%ebx,4),%xmm3
-        unpcklps %xmm3,%xmm2    ## isaj in xmm2(0,1)
-        mulps  nb410_isai(%esp),%xmm2
-        movaps %xmm2,nb410_isaprod(%esp)
-        movaps %xmm2,%xmm1
-        mulps nb410_gbtsc(%esp),%xmm1
-        movaps %xmm1,nb410_gbscale(%esp)
-
-        movl nb410_charge(%ebp),%esi     ## base of charge[]    
-        movss (%esi,%eax,4),%xmm3
-        movss (%esi,%ebx,4),%xmm6
-        unpcklps %xmm6,%xmm3 ## constant 00001000 ;# xmm3(0,1) has the charges 
-
-        mulps  nb410_iq(%esp),%xmm2
-        mulps  %xmm2,%xmm3
-        movaps %xmm3,nb410_qq(%esp)
-
-        movl nb410_type(%ebp),%esi
-        movl  %eax,%ecx
-        movl  %ebx,%edx
-        movl (%esi,%ecx,4),%ecx
-        movl (%esi,%edx,4),%edx
-        movl nb410_vdwparam(%ebp),%esi
-        shll %ecx
-        shll %edx
-        movl nb410_ntia(%esp),%edi
-        addl %edi,%ecx
-        addl %edi,%edx
-        movlps (%esi,%ecx,4),%xmm6
-        movhps (%esi,%edx,4),%xmm6
-        movl nb410_pos(%ebp),%edi
-
-        movaps %xmm6,%xmm4
-        shufps $8,%xmm4,%xmm4 ## constant 00001000       
-        shufps $13,%xmm6,%xmm6 ## constant 00001101
-        movlhps %xmm7,%xmm4
-        movlhps %xmm7,%xmm6
-
-        movaps %xmm4,nb410_c6(%esp)
-        movaps %xmm6,nb410_c12(%esp)
-
-        movd  %eax,%mm0
-        movd  %ebx,%mm1
-
-        leal  (%eax,%eax,2),%eax
-        leal  (%ebx,%ebx,2),%ebx
-        ## move coordinates to xmm0-xmm2 
-        movlps (%edi,%eax,4),%xmm1
-        movss 8(%edi,%eax,4),%xmm2
-        movhps (%edi,%ebx,4),%xmm1
-        movss 8(%edi,%ebx,4),%xmm0
-
-        movlhps %xmm7,%xmm3
-
-        shufps $0,%xmm0,%xmm2
-
-        movaps %xmm1,%xmm0
-
-        shufps $136,%xmm2,%xmm2 ## constant 10001000
-
-        shufps $136,%xmm0,%xmm0 ## constant 10001000
-        shufps $221,%xmm1,%xmm1 ## constant 11011101
-
-        movl   nb410_faction(%ebp),%edi
-        ## move ix-iz to xmm4-xmm6 
-        xorps   %xmm7,%xmm7
-
-        movaps nb410_ix(%esp),%xmm4
-        movaps nb410_iy(%esp),%xmm5
-        movaps nb410_iz(%esp),%xmm6
-
-        ## calc dr 
-        subps %xmm0,%xmm4
-        subps %xmm1,%xmm5
-        subps %xmm2,%xmm6
-
-        ## store dr 
-        movaps %xmm4,nb410_dx(%esp)
-        movaps %xmm5,nb410_dy(%esp)
-        movaps %xmm6,nb410_dz(%esp)
-        ## square it 
-        mulps %xmm4,%xmm4
-        mulps %xmm5,%xmm5
-        mulps %xmm6,%xmm6
-        addps %xmm5,%xmm4
-        addps %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb410_three(%esp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb410_half(%esp),%xmm0
-        subps %xmm5,%xmm1       ## constant 30-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r 
-        movaps %xmm4,nb410_r(%esp)
-        mulps nb410_gbscale(%esp),%xmm4
-
-        cvttps2pi %xmm4,%mm6    ## mm6 contain lu indices 
-        cvtpi2ps %mm6,%xmm6
-        subps %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-
-        pslld $2,%mm6
-
-        movl nb410_GBtab(%ebp),%esi
-        movd %mm6,%ecx
-        psrlq $32,%mm6
-        movd %mm6,%edx
-
-        ## load coulomb table
-        movaps (%esi,%ecx,4),%xmm4
-        movaps (%esi,%edx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm4,%xmm6
-        unpcklps %xmm7,%xmm4    ## Y1 Y2 F1 F2 
-        unpckhps %xmm7,%xmm6    ## G1 G2 H1 H2
-        movhlps  %xmm4,%xmm5    ## F1 F2 
-        movhlps  %xmm6,%xmm7    ## H1 H2
-        ## coulomb table ready, in xmm4-xmm7    
-
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        mulps  nb410_two(%esp),%xmm7    ## two*Heps2 
-        movaps nb410_qq(%esp),%xmm3
-        addps  %xmm6,%xmm7
-        addps  %xmm5,%xmm7 ## xmm7=FF 
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulps  %xmm7,%xmm3 ## fijC=FF*qq 
-        ## get jnr from regs
-        movd %mm0,%ecx
-        movd %mm1,%edx
-
-        movl nb410_dvda(%ebp),%esi
-        ## Calculate dVda
-        xorps %xmm7,%xmm7
-        mulps nb410_gbscale(%esp),%xmm3
-        movaps %xmm3,%xmm6
-        mulps  nb410_r(%esp),%xmm6
-        addps  %xmm5,%xmm6
-        addps  nb410_vctot(%esp),%xmm5
-        movaps %xmm5,nb410_vctot(%esp)
-
-        ## xmm6=(vcoul+fijC*r)
-        subps  %xmm6,%xmm7
-        movaps %xmm7,%xmm6
-
-        ## update dvdasum
-        addps  nb410_dvdasum(%esp),%xmm7
-        movaps %xmm7,nb410_dvdasum(%esp)
-
-        ## update j atoms dvdaj
-        movaps %xmm6,%xmm7
-        shufps $0x1,%xmm7,%xmm7
-        addss  (%esi,%ecx,4),%xmm6
-        addss  (%esi,%edx,4),%xmm7
-        movss  %xmm6,(%esi,%ecx,4)
-        movss  %xmm7,(%esi,%edx,4)
-
-        ## L-J 
-        movaps %xmm0,%xmm4
-        mulps  %xmm0,%xmm4      ## xmm4=rinvsq 
-
-        ## at this point mm5 contains vcoul and mm3 fijC 
-        ## increment vcoul - then we can get rid of mm5 
-        ## update vctot 
-
-        movaps %xmm4,%xmm6
-        mulps  %xmm4,%xmm6
-
-        mulps  %xmm4,%xmm6      ## xmm6=rinvsix 
-        movaps %xmm6,%xmm4
-        mulps  %xmm4,%xmm4      ## xmm4=rinvtwelve 
-        mulps  nb410_c6(%esp),%xmm6
-        mulps  nb410_c12(%esp),%xmm4
-        movaps nb410_Vvdwtot(%esp),%xmm7
-        addps  %xmm4,%xmm7
-        mulps  nb410_twelve(%esp),%xmm4
-        subps  %xmm6,%xmm7
-        mulps  nb410_six(%esp),%xmm6
-        movaps %xmm7,nb410_Vvdwtot(%esp)
-        subps  %xmm6,%xmm4
-        mulps  %xmm0,%xmm4
-        subps  %xmm3,%xmm4
-        mulps  %xmm0,%xmm4
-
-        movaps nb410_dx(%esp),%xmm0
-        movaps nb410_dy(%esp),%xmm1
-        movaps nb410_dz(%esp),%xmm2
-
-        mulps  %xmm4,%xmm0
-        mulps  %xmm4,%xmm1
-        mulps  %xmm4,%xmm2
-        ## xmm0-xmm2 contains tx-tz (partial force) 
-        ## now update f_i 
-        movaps nb410_fix(%esp),%xmm3
-        movaps nb410_fiy(%esp),%xmm4
-        movaps nb410_fiz(%esp),%xmm5
-        addps  %xmm0,%xmm3
-        addps  %xmm1,%xmm4
-        addps  %xmm2,%xmm5
-        movaps %xmm3,nb410_fix(%esp)
-        movaps %xmm4,nb410_fiy(%esp)
-        movaps %xmm5,nb410_fiz(%esp)
-        ## update the fj's 
-        movss   (%edi,%eax,4),%xmm3
-        movss   4(%edi,%eax,4),%xmm4
-        movss   8(%edi,%eax,4),%xmm5
-        subss   %xmm0,%xmm3
-        subss   %xmm1,%xmm4
-        subss   %xmm2,%xmm5
-        movss   %xmm3,(%edi,%eax,4)
-        movss   %xmm4,4(%edi,%eax,4)
-        movss   %xmm5,8(%edi,%eax,4)
-
-        shufps $225,%xmm0,%xmm0 ## constant 11100001
-        shufps $225,%xmm1,%xmm1 ## constant 11100001
-        shufps $225,%xmm2,%xmm2 ## constant 11100001
-
-        movss   (%edi,%ebx,4),%xmm3
-        movss   4(%edi,%ebx,4),%xmm4
-        movss   8(%edi,%ebx,4),%xmm5
-        subss   %xmm0,%xmm3
-        subss   %xmm1,%xmm4
-        subss   %xmm2,%xmm5
-        movss   %xmm3,(%edi,%ebx,4)
-        movss   %xmm4,4(%edi,%ebx,4)
-        movss   %xmm5,8(%edi,%ebx,4)
-
-_nb_kernel410_ia32_sse.nb410_checksingle:       
-        movl  nb410_innerk(%esp),%edx
-        andl  $1,%edx
-        jnz    _nb_kernel410_ia32_sse.nb410_dosingle
-        jmp    _nb_kernel410_ia32_sse.nb410_updateouterdata
-_nb_kernel410_ia32_sse.nb410_dosingle: 
-        movl nb410_charge(%ebp),%esi
-        movl nb410_invsqrta(%ebp),%edx
-        movl nb410_pos(%ebp),%edi
-        movl  nb410_innerjjnr(%esp),%ecx
-        movl  (%ecx),%eax
-        xorps  %xmm2,%xmm2
-        movaps %xmm2,%xmm6
-        movss (%edx,%eax,4),%xmm2       ## isaj
-        mulss nb410_isai(%esp),%xmm2
-        movss %xmm2,nb410_isaprod(%esp)
-        movss %xmm2,%xmm1
-        mulss nb410_gbtsc(%esp),%xmm1
-        movss %xmm1,nb410_gbscale(%esp)
-
-        mulss  nb410_iq(%esp),%xmm2
-        movss (%esi,%eax,4),%xmm6       ## xmm6(0) has the charge       
-        mulss  %xmm2,%xmm6
-        movss %xmm6,nb410_qq(%esp)
-
-        movl nb410_type(%ebp),%esi
-        movl %eax,%ecx
-        movl (%esi,%ecx,4),%ecx
-        movl nb410_vdwparam(%ebp),%esi
-        shll %ecx
-        addl nb410_ntia(%esp),%ecx
-        movlps (%esi,%ecx,4),%xmm6
-        movaps %xmm6,%xmm4
-        shufps $252,%xmm4,%xmm4 ## constant 11111100    
-        shufps $253,%xmm6,%xmm6 ## constant 11111101    
-
-        movaps %xmm4,nb410_c6(%esp)
-        movaps %xmm6,nb410_c12(%esp)
-
-        movd  %eax,%mm0
-        leal  (%eax,%eax,2),%eax
-
-        ## move coordinates to xmm0-xmm2 
-        movss (%edi,%eax,4),%xmm0
-        movss 4(%edi,%eax,4),%xmm1
-        movss 8(%edi,%eax,4),%xmm2
-
-        movaps nb410_ix(%esp),%xmm4
-        movaps nb410_iy(%esp),%xmm5
-        movaps nb410_iz(%esp),%xmm6
-
-        ## calc dr 
-        subss %xmm0,%xmm4
-        subss %xmm1,%xmm5
-        subss %xmm2,%xmm6
-
-        ## store dr 
-        movss %xmm4,nb410_dx(%esp)
-        movss %xmm5,nb410_dy(%esp)
-        movss %xmm6,nb410_dz(%esp)
-        ## square it 
-        mulss %xmm4,%xmm4
-        mulss %xmm5,%xmm5
-        mulss %xmm6,%xmm6
-        addss %xmm5,%xmm4
-        addss %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtss %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulss %xmm5,%xmm5
-        movss nb410_three(%esp),%xmm1
-        mulss %xmm4,%xmm5       ## rsq*lu*lu                    
-        movss nb410_half(%esp),%xmm0
-        subss %xmm5,%xmm1       ## constant 30-rsq*lu*lu 
-        mulss %xmm2,%xmm1
-        mulss %xmm1,%xmm0       ## xmm0=rinv 
-
-        mulss %xmm0,%xmm4       ## xmm4=r 
-        movss %xmm4,nb410_r(%esp)
-        mulss nb410_gbscale(%esp),%xmm4
-
-        cvttss2si %xmm4,%ebx    ## mm6 contain lu indices 
-        cvtsi2ss %ebx,%xmm6
-        subss %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulss  %xmm2,%xmm2      ## xmm2=eps2 
-
-        shll $2,%ebx
-        movl nb410_GBtab(%ebp),%esi
-
-        movaps (%esi,%ebx,4),%xmm4
-        movhlps %xmm4,%xmm6
-        movaps %xmm4,%xmm5
-        movaps %xmm6,%xmm7
-        shufps $1,%xmm5,%xmm5
-        shufps $1,%xmm7,%xmm7
-        ## table ready in xmm4-xmm7 
-
-        mulss  %xmm1,%xmm6      ## xmm6=Geps 
-        mulss  %xmm2,%xmm7      ## xmm7=Heps2 
-        addss  %xmm6,%xmm5
-        addss  %xmm7,%xmm5      ## xmm5=Fp      
-        mulss  nb410_two(%esp),%xmm7    ## two*Heps2 
-        movss nb410_qq(%esp),%xmm3
-        addss  %xmm6,%xmm7
-        addss  %xmm5,%xmm7 ## xmm7=FF 
-        mulss  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addss  %xmm4,%xmm5 ## xmm5=VV 
-        mulss  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulss  %xmm7,%xmm3 ## fijC=FF*qq 
-
-        movd %mm0,%ebx
-        movl nb410_dvda(%ebp),%esi
-
-        ## Calculate dVda
-        xorps %xmm7,%xmm7
-        mulss nb410_gbscale(%esp),%xmm3
-        movaps %xmm3,%xmm6
-        mulss  nb410_r(%esp),%xmm6
-        addss  %xmm5,%xmm6
-        addss  nb410_vctot(%esp),%xmm5
-        movss %xmm5,nb410_vctot(%esp)
-
-        ## xmm6=(vcoul+fijC*r)
-        subps  %xmm6,%xmm7
-        movaps %xmm7,%xmm6
-
-        ## update dvdasum
-        addps  nb410_dvdasum(%esp),%xmm7
-        movaps %xmm7,nb410_dvdasum(%esp)
-
-        ## update j atoms dvdaj
-        addss  (%esi,%ebx,4),%xmm6
-        movss  %xmm6,(%esi,%ebx,4)
-
-        ## L-J 
-        movaps %xmm0,%xmm4
-        mulss  %xmm0,%xmm4      ## xmm4=rinvsq 
-
-        movaps %xmm4,%xmm6
-        mulss  %xmm4,%xmm6
-
-        mulss  %xmm4,%xmm6      ## xmm6=rinvsix 
-        movaps %xmm6,%xmm4
-        mulss  %xmm4,%xmm4      ## xmm4=rinvtwelve 
-        mulss  nb410_c6(%esp),%xmm6
-        mulss  nb410_c12(%esp),%xmm4
-        movss nb410_Vvdwtot(%esp),%xmm7
-        addss  %xmm4,%xmm7
-        mulss  nb410_twelve(%esp),%xmm4
-        subss  %xmm6,%xmm7
-        mulss  nb410_six(%esp),%xmm6
-        movss %xmm7,nb410_Vvdwtot(%esp)
-        subss  %xmm6,%xmm4
-        mulss  %xmm0,%xmm4
-        subss  %xmm3,%xmm4
-        mulss  %xmm0,%xmm4
-
-        movss nb410_dx(%esp),%xmm0
-        movss nb410_dy(%esp),%xmm1
-        movss nb410_dz(%esp),%xmm2
-
-        movl   nb410_faction(%ebp),%edi
-        mulss  %xmm4,%xmm0
-        mulss  %xmm4,%xmm1
-        mulss  %xmm4,%xmm2
-        ## xmm0-xmm2 contains tx-tz (partial force) 
-        ## now update f_i 
-        movss nb410_fix(%esp),%xmm3
-        movss nb410_fiy(%esp),%xmm4
-        movss nb410_fiz(%esp),%xmm5
-        addss  %xmm0,%xmm3
-        addss  %xmm1,%xmm4
-        addss  %xmm2,%xmm5
-        movss %xmm3,nb410_fix(%esp)
-        movss %xmm4,nb410_fiy(%esp)
-        movss %xmm5,nb410_fiz(%esp)
-        ## update fj 
-
-        movss   (%edi,%eax,4),%xmm3
-        movss   4(%edi,%eax,4),%xmm4
-        movss   8(%edi,%eax,4),%xmm5
-        subss   %xmm0,%xmm3
-        subss   %xmm1,%xmm4
-        subss   %xmm2,%xmm5
-        movss   %xmm3,(%edi,%eax,4)
-        movss   %xmm4,4(%edi,%eax,4)
-        movss   %xmm5,8(%edi,%eax,4)
-_nb_kernel410_ia32_sse.nb410_updateouterdata: 
-        movl  nb410_ii3(%esp),%ecx
-        movl  nb410_faction(%ebp),%edi
-        movl  nb410_fshift(%ebp),%esi
-        movl  nb410_is3(%esp),%edx
-
-        ## accumulate i forces in xmm0, xmm1, xmm2 
-        movaps nb410_fix(%esp),%xmm0
-        movaps nb410_fiy(%esp),%xmm1
-        movaps nb410_fiz(%esp),%xmm2
-
-        movhlps %xmm0,%xmm3
-        movhlps %xmm1,%xmm4
-        movhlps %xmm2,%xmm5
-        addps  %xmm3,%xmm0
-        addps  %xmm4,%xmm1
-        addps  %xmm5,%xmm2 ## sum is in 1/2 in xmm0-xmm2 
-
-        movaps %xmm0,%xmm3
-        movaps %xmm1,%xmm4
-        movaps %xmm2,%xmm5
-
-        shufps $1,%xmm3,%xmm3
-        shufps $1,%xmm4,%xmm4
-        shufps $1,%xmm5,%xmm5
-        addss  %xmm3,%xmm0
-        addss  %xmm4,%xmm1
-        addss  %xmm5,%xmm2      ## xmm0-xmm2 has single force in pos0 
-
-        ## increment i force 
-        movss  (%edi,%ecx,4),%xmm3
-        movss  4(%edi,%ecx,4),%xmm4
-        movss  8(%edi,%ecx,4),%xmm5
-        addss  %xmm0,%xmm3
-        addss  %xmm1,%xmm4
-        addss  %xmm2,%xmm5
-        movss  %xmm3,(%edi,%ecx,4)
-        movss  %xmm4,4(%edi,%ecx,4)
-        movss  %xmm5,8(%edi,%ecx,4)
-
-        ## increment fshift force  
-        movss  (%esi,%edx,4),%xmm3
-        movss  4(%esi,%edx,4),%xmm4
-        movss  8(%esi,%edx,4),%xmm5
-        addss  %xmm0,%xmm3
-        addss  %xmm1,%xmm4
-        addss  %xmm2,%xmm5
-        movss  %xmm3,(%esi,%edx,4)
-        movss  %xmm4,4(%esi,%edx,4)
-        movss  %xmm5,8(%esi,%edx,4)
-
-        ## get n from stack
-        movl nb410_n(%esp),%esi
-        ## get group index for i particle 
-        movl  nb410_gid(%ebp),%edx              ## base of gid[]
-        movl  (%edx,%esi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total potential energy and update it 
-        movaps nb410_vctot(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        ## add earlier value from mem 
-        movl  nb410_Vc(%ebp),%eax
-        addss (%eax,%edx,4),%xmm7
-        ## move back to mem 
-        movss %xmm7,(%eax,%edx,4)
-
-        ## accumulate total lj energy and update it 
-        movaps nb410_Vvdwtot(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        ## add earlier value from mem 
-        movl  nb410_Vvdw(%ebp),%eax
-        addss (%eax,%edx,4),%xmm7
-        ## move back to mem 
-        movss %xmm7,(%eax,%edx,4)
-
-        ## accumulate dVda and update it 
-        movaps nb410_dvdasum(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        movl nb410_ii(%esp),%edx
-        movl nb410_dvda(%ebp),%eax
-        addss (%eax,%edx,4),%xmm7
-        movss %xmm7,(%eax,%edx,4)
-
-        ## finish if last 
-        movl nb410_nn1(%esp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx
-        jz _nb_kernel410_ia32_sse.nb410_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb410_n(%esp)
-        jmp _nb_kernel410_ia32_sse.nb410_outer
-_nb_kernel410_ia32_sse.nb410_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb410_nri(%esp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx
-        jz _nb_kernel410_ia32_sse.nb410_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel410_ia32_sse.nb410_threadloop
-_nb_kernel410_ia32_sse.nb410_end: 
-        emms
-
-        movl nb410_nouter(%esp),%eax
-        movl nb410_ninner(%esp),%ebx
-        movl nb410_outeriter(%ebp),%ecx
-        movl nb410_inneriter(%ebp),%edx
-        movl %eax,(%ecx)
-        movl %ebx,(%edx)
-
-        movl nb410_salign(%esp),%eax
-        addl %eax,%esp
-        addl $504,%esp
-        popl %edi
-        popl %esi
-        popl %edx
-        popl %ecx
-        popl %ebx
-        popl %eax
-        leave
-        ret
-
-
-
-.globl nb_kernel410nf_ia32_sse
-.globl _nb_kernel410nf_ia32_sse
-nb_kernel410nf_ia32_sse:        
-_nb_kernel410nf_ia32_sse:       
-.set nb410nf_p_nri, 8
-.set nb410nf_iinr, 12
-.set nb410nf_jindex, 16
-.set nb410nf_jjnr, 20
-.set nb410nf_shift, 24
-.set nb410nf_shiftvec, 28
-.set nb410nf_fshift, 32
-.set nb410nf_gid, 36
-.set nb410nf_pos, 40
-.set nb410nf_faction, 44
-.set nb410nf_charge, 48
-.set nb410nf_p_facel, 52
-.set nb410nf_argkrf, 56
-.set nb410nf_argcrf, 60
-.set nb410nf_Vc, 64
-.set nb410nf_type, 68
-.set nb410nf_p_ntype, 72
-.set nb410nf_vdwparam, 76
-.set nb410nf_Vvdw, 80
-.set nb410nf_p_tabscale, 84
-.set nb410nf_VFtab, 88
-.set nb410nf_invsqrta, 92
-.set nb410nf_dvda, 96
-.set nb410nf_p_gbtabscale, 100
-.set nb410nf_GBtab, 104
-.set nb410nf_p_nthreads, 108
-.set nb410nf_count, 112
-.set nb410nf_mtx, 116
-.set nb410nf_outeriter, 120
-.set nb410nf_inneriter, 124
-.set nb410nf_work, 128
-        ## stack offsets for local variables  
-        ## bottom of stack is cache-aligned for sse use 
-.set nb410nf_ix, 0
-.set nb410nf_iy, 16
-.set nb410nf_iz, 32
-.set nb410nf_iq, 48
-.set nb410nf_gbtsc, 64
-.set nb410nf_qq, 80
-.set nb410nf_c6, 96
-.set nb410nf_c12, 112
-.set nb410nf_vctot, 128
-.set nb410nf_Vvdwtot, 144
-.set nb410nf_half, 160
-.set nb410nf_three, 176
-.set nb410nf_isai, 192
-.set nb410nf_isaprod, 208
-.set nb410nf_gbscale, 224
-.set nb410nf_is3, 240
-.set nb410nf_ii3, 244
-.set nb410nf_ntia, 248
-.set nb410nf_innerjjnr, 252
-.set nb410nf_innerk, 256
-.set nb410nf_n, 260
-.set nb410nf_nn1, 264
-.set nb410nf_nri, 268
-.set nb410nf_facel, 272
-.set nb410nf_ntype, 276
-.set nb410nf_nouter, 280
-.set nb410nf_ninner, 284
-.set nb410nf_salign, 288
-        pushl %ebp
-        movl %esp,%ebp
-        pushl %eax
-        pushl %ebx
-        pushl %ecx
-        pushl %edx
-        pushl %esi
-        pushl %edi
-        subl $292,%esp          ## local stack space 
-        movl %esp,%eax
-        andl $0xf,%eax
-        subl %eax,%esp
-        movl %eax,nb410nf_salign(%esp)
-
-        emms
-
-        ## Move args passed by reference to stack
-        movl nb410nf_p_nri(%ebp),%ecx
-        movl nb410nf_p_facel(%ebp),%esi
-        movl nb410nf_p_ntype(%ebp),%edi
-        movl (%ecx),%ecx
-        movl (%esi),%esi
-        movl (%edi),%edi
-        movl %ecx,nb410nf_nri(%esp)
-        movl %esi,nb410nf_facel(%esp)
-        movl %edi,nb410nf_ntype(%esp)
-
-        ## zero iteration counters
-        movl $0,%eax
-        movl %eax,nb410nf_nouter(%esp)
-        movl %eax,nb410nf_ninner(%esp)
-
-
-        movl nb410nf_p_gbtabscale(%ebp),%eax
-        movss (%eax),%xmm5
-        shufps $0,%xmm5,%xmm5
-        movaps %xmm5,nb410nf_gbtsc(%esp)
-
-        ## create constant floating-point factors on stack
-        movl $0x3f000000,%eax   ## constant 0.5 in IEEE (hex)
-        movl %eax,nb410nf_half(%esp)
-        movss nb410nf_half(%esp),%xmm1
-        shufps $0,%xmm1,%xmm1  ## splat to all elements
-        movaps %xmm1,%xmm2
-        addps  %xmm2,%xmm2      ## constant 1.0
-        movaps %xmm2,%xmm3
-        addps  %xmm2,%xmm2      ## constant 2.0
-        addps  %xmm2,%xmm3      ## constant 3.0
-        movaps %xmm1,nb410nf_half(%esp)
-        movaps %xmm3,nb410nf_three(%esp)
-
-_nb_kernel410nf_ia32_sse.nb410nf_threadloop: 
-        movl  nb410nf_count(%ebp),%esi            ## pointer to sync counter
-        movl  (%esi),%eax
-_nb_kernel410nf_ia32_sse.nb410nf_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+10
-        lock 
-        cmpxchgl %ebx,(%esi)                    ## write nn1 to *counter,
-                                                ## if it hasnt changed.
-                                                ## or reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel410nf_ia32_sse.nb410nf_spinlock
-
-        ## if(nn1>nri) nn1=nri
-        movl nb410nf_nri(%esp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx
-        cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb410nf_n(%esp)
-        movl %ebx,nb410nf_nn1(%esp)
-        subl %eax,%ebx                          ## calc number of outer lists
-        movl %eax,%esi                          ## copy n to esi
-        jg  _nb_kernel410nf_ia32_sse.nb410nf_outerstart
-        jmp _nb_kernel410nf_ia32_sse.nb410nf_end
-
-_nb_kernel410nf_ia32_sse.nb410nf_outerstart: 
-        ## ebx contains number of outer iterations
-        addl nb410nf_nouter(%esp),%ebx
-        movl %ebx,nb410nf_nouter(%esp)
-
-_nb_kernel410nf_ia32_sse.nb410nf_outer: 
-        movl  nb410nf_shift(%ebp),%eax        ## eax = pointer into shift[] 
-        movl  (%eax,%esi,4),%ebx        ## ebx=shift[n] 
-
-        leal  (%ebx,%ebx,2),%ebx    ## ebx=3*is 
-        movl  %ebx,nb410nf_is3(%esp)            ## store is3 
-
-        movl  nb410nf_shiftvec(%ebp),%eax     ## eax = base of shiftvec[] 
-
-        movss (%eax,%ebx,4),%xmm0
-        movss 4(%eax,%ebx,4),%xmm1
-        movss 8(%eax,%ebx,4),%xmm2
-
-        movl  nb410nf_iinr(%ebp),%ecx         ## ecx = pointer into iinr[]      
-        movl  (%ecx,%esi,4),%ebx            ## ebx =ii
-
-        movl  nb410nf_charge(%ebp),%edx
-        movss (%edx,%ebx,4),%xmm3
-        mulss nb410nf_facel(%esp),%xmm3
-        shufps $0,%xmm3,%xmm3
-
-        movl  nb410nf_invsqrta(%ebp),%edx       ## load invsqrta[ii]
-        movss (%edx,%ebx,4),%xmm4
-        shufps $0,%xmm4,%xmm4
-
-        movl  nb410nf_type(%ebp),%edx
-        movl  (%edx,%ebx,4),%edx
-        imull nb410nf_ntype(%esp),%edx
-        shll  %edx
-        movl  %edx,nb410nf_ntia(%esp)
-
-        leal  (%ebx,%ebx,2),%ebx        ## ebx = 3*ii=ii3 
-        movl  nb410nf_pos(%ebp),%eax      ## eax = base of pos[]  
-
-        addss (%eax,%ebx,4),%xmm0
-        addss 4(%eax,%ebx,4),%xmm1
-        addss 8(%eax,%ebx,4),%xmm2
-
-        movaps %xmm3,nb410nf_iq(%esp)
-        movaps %xmm4,nb410nf_isai(%esp)
-
-        shufps $0,%xmm0,%xmm0
-        shufps $0,%xmm1,%xmm1
-        shufps $0,%xmm2,%xmm2
-
-        movaps %xmm0,nb410nf_ix(%esp)
-        movaps %xmm1,nb410nf_iy(%esp)
-        movaps %xmm2,nb410nf_iz(%esp)
-
-        movl  %ebx,nb410nf_ii3(%esp)
-
-        ## clear vctot
-        xorps %xmm4,%xmm4
-        movaps %xmm4,nb410nf_vctot(%esp)
-        movaps %xmm4,nb410nf_Vvdwtot(%esp)
-
-        movl  nb410nf_jindex(%ebp),%eax
-        movl  (%eax,%esi,4),%ecx             ## jindex[n] 
-        movl  4(%eax,%esi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movl  nb410nf_pos(%ebp),%esi
-        movl  nb410nf_faction(%ebp),%edi
-        movl  nb410nf_jjnr(%ebp),%eax
-        shll  $2,%ecx
-        addl  %ecx,%eax
-        movl  %eax,nb410nf_innerjjnr(%esp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $4,%edx
-        addl  nb410nf_ninner(%esp),%ecx
-        movl  %ecx,nb410nf_ninner(%esp)
-        addl  $0,%edx
-        movl  %edx,nb410nf_innerk(%esp)      ## number of innerloop atoms 
-        jge   _nb_kernel410nf_ia32_sse.nb410nf_unroll_loop
-        jmp   _nb_kernel410nf_ia32_sse.nb410nf_finish_inner
-_nb_kernel410nf_ia32_sse.nb410nf_unroll_loop: 
-        ## quad-unroll innerloop here 
-        movl  nb410nf_innerjjnr(%esp),%edx       ## pointer to jjnr[k] 
-        movl  (%edx),%eax
-        movl  4(%edx),%ebx
-        movl  8(%edx),%ecx
-        movl  12(%edx),%edx           ## eax-edx=jnr1-4 
-        addl $16,nb410nf_innerjjnr(%esp)             ## advance pointer (unrolled 4) 
-
-        ## load isa2
-        movl nb410nf_invsqrta(%ebp),%esi
-        movss (%esi,%eax,4),%xmm3
-        movss (%esi,%ecx,4),%xmm4
-        movss (%esi,%ebx,4),%xmm6
-        movss (%esi,%edx,4),%xmm7
-        movaps nb410nf_isai(%esp),%xmm2
-        shufps $0,%xmm6,%xmm3
-        shufps $0,%xmm7,%xmm4
-        shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all charges in xmm3  
-        mulps  %xmm3,%xmm2
-
-        movaps %xmm2,nb410nf_isaprod(%esp)
-        movaps %xmm2,%xmm1
-        mulps nb410nf_gbtsc(%esp),%xmm1
-        movaps %xmm1,nb410nf_gbscale(%esp)
-
-        movl nb410nf_charge(%ebp),%esi     ## base of charge[] 
-
-        movss (%esi,%eax,4),%xmm3
-        movss (%esi,%ecx,4),%xmm4
-        movss (%esi,%ebx,4),%xmm6
-        movss (%esi,%edx,4),%xmm7
-
-        mulps nb410nf_iq(%esp),%xmm2
-        shufps $0,%xmm6,%xmm3
-        shufps $0,%xmm7,%xmm4
-        shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all charges in xmm3  
-        mulps  %xmm2,%xmm3
-        movaps %xmm3,nb410nf_qq(%esp)
-
-        movd %eax,%mm0
-        movd %ebx,%mm1
-        movd %ecx,%mm2
-        movd %edx,%mm3
-
-        movl nb410nf_type(%ebp),%esi
-        movl (%esi,%eax,4),%eax
-        movl (%esi,%ebx,4),%ebx
-        movl (%esi,%ecx,4),%ecx
-        movl (%esi,%edx,4),%edx
-        movl nb410nf_vdwparam(%ebp),%esi
-        shll %eax
-        shll %ebx
-        shll %ecx
-        shll %edx
-        movl nb410nf_ntia(%esp),%edi
-        addl %edi,%eax
-        addl %edi,%ebx
-        addl %edi,%ecx
-        addl %edi,%edx
-
-        movlps (%esi,%eax,4),%xmm6
-        movlps (%esi,%ecx,4),%xmm7
-        movhps (%esi,%ebx,4),%xmm6
-        movhps (%esi,%edx,4),%xmm7
-
-        movaps %xmm6,%xmm4
-        shufps $136,%xmm7,%xmm4 ## constant 10001000
-        shufps $221,%xmm7,%xmm6 ## constant 11011101
-
-        movd  %mm0,%eax
-        movd  %mm1,%ebx
-        movd  %mm2,%ecx
-        movd  %mm3,%edx
-
-        movaps %xmm4,nb410nf_c6(%esp)
-        movaps %xmm6,nb410nf_c12(%esp)
-
-        movl nb410nf_pos(%ebp),%esi        ## base of pos[] 
-
-        leal  (%eax,%eax,2),%eax     ## replace jnr with j3 
-        leal  (%ebx,%ebx,2),%ebx
-
-        leal  (%ecx,%ecx,2),%ecx     ## replace jnr with j3 
-        leal  (%edx,%edx,2),%edx
-
-        ## move four coordinates to xmm0-xmm2   
-
-        movlps (%esi,%eax,4),%xmm4
-        movlps (%esi,%ecx,4),%xmm5
-        movss 8(%esi,%eax,4),%xmm2
-        movss 8(%esi,%ecx,4),%xmm6
-
-        movhps (%esi,%ebx,4),%xmm4
-        movhps (%esi,%edx,4),%xmm5
-
-        movss 8(%esi,%ebx,4),%xmm0
-        movss 8(%esi,%edx,4),%xmm1
-
-        shufps $0,%xmm0,%xmm2
-        shufps $0,%xmm1,%xmm6
-
-        movaps %xmm4,%xmm0
-        movaps %xmm4,%xmm1
-
-        shufps $136,%xmm6,%xmm2 ## constant 10001000
-
-        shufps $136,%xmm5,%xmm0 ## constant 10001000
-        shufps $221,%xmm5,%xmm1 ## constant 11011101            
-
-        ## move ix-iz to xmm4-xmm6 
-        movaps nb410nf_ix(%esp),%xmm4
-        movaps nb410nf_iy(%esp),%xmm5
-        movaps nb410nf_iz(%esp),%xmm6
-
-        ## calc dr 
-        subps %xmm0,%xmm4
-        subps %xmm1,%xmm5
-        subps %xmm2,%xmm6
-
-        ## square it 
-        mulps %xmm4,%xmm4
-        mulps %xmm5,%xmm5
-        mulps %xmm6,%xmm6
-        addps %xmm5,%xmm4
-        addps %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb410nf_three(%esp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb410nf_half(%esp),%xmm0
-        subps %xmm5,%xmm1       ## constant 30-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r 
-        mulps nb410nf_gbscale(%esp),%xmm4
-
-        movhlps %xmm4,%xmm5
-        cvttps2pi %xmm4,%mm6
-        cvttps2pi %xmm5,%mm7    ## mm6/mm7 contain lu indices 
-        cvtpi2ps %mm6,%xmm6
-        cvtpi2ps %mm7,%xmm5
-        movlhps %xmm5,%xmm6
-        subps %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-        pslld $2,%mm6
-        pslld $2,%mm7
-
-        movd %eax,%mm0
-        movd %ebx,%mm1
-        movd %ecx,%mm2
-        movd %edx,%mm3
-
-        movl nb410nf_GBtab(%ebp),%esi
-        movd %mm6,%eax
-        psrlq $32,%mm6
-        movd %mm7,%ecx
-        psrlq $32,%mm7
-        movd %mm6,%ebx
-        movd %mm7,%edx
-
-        ## load coulomb table
-        movaps (%esi,%eax,4),%xmm4
-        movaps (%esi,%ebx,4),%xmm5
-        movaps (%esi,%ecx,4),%xmm6
-        movaps (%esi,%edx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm6,%xmm3
-        shufps $0xEE,%xmm7,%xmm3
-        shufps $0x44,%xmm7,%xmm6
-        movaps %xmm4,%xmm7
-        shufps $0xEE,%xmm5,%xmm7
-        shufps $0x44,%xmm5,%xmm4
-        movaps %xmm4,%xmm5
-        shufps $0xDD,%xmm6,%xmm5
-        shufps $0x88,%xmm6,%xmm4
-        movaps %xmm7,%xmm6
-        shufps $0x88,%xmm3,%xmm6
-        shufps $0xDD,%xmm3,%xmm7
-        ## coulomb table ready, in xmm4-xmm7            
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        movaps nb410nf_qq(%esp),%xmm3
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  %xmm3,%xmm5 ## vcoul=qq*VV  
-        ## update vctot
-        addps  nb410nf_vctot(%esp),%xmm5
-        movaps %xmm5,nb410nf_vctot(%esp)
-
-        ## L-J 
-        movaps %xmm0,%xmm4
-        mulps  %xmm0,%xmm4      ## xmm4=rinvsq 
-
-        movaps %xmm4,%xmm6
-        mulps  %xmm4,%xmm6
-
-        mulps  %xmm4,%xmm6      ## xmm6=rinvsix 
-        movaps %xmm6,%xmm4
-        mulps  %xmm4,%xmm4      ## xmm4=rinvtwelve 
-        mulps  nb410nf_c6(%esp),%xmm6
-        mulps  nb410nf_c12(%esp),%xmm4
-        movaps nb410nf_Vvdwtot(%esp),%xmm7
-        addps  %xmm4,%xmm7
-        subps  %xmm6,%xmm7
-        movaps %xmm7,nb410nf_Vvdwtot(%esp)
-
-        ## should we do one more iteration? 
-        subl $4,nb410nf_innerk(%esp)
-        jl    _nb_kernel410nf_ia32_sse.nb410nf_finish_inner
-        jmp   _nb_kernel410nf_ia32_sse.nb410nf_unroll_loop
-_nb_kernel410nf_ia32_sse.nb410nf_finish_inner: 
-        ## check if at least two particles remain 
-        addl $4,nb410nf_innerk(%esp)
-        movl  nb410nf_innerk(%esp),%edx
-        andl  $2,%edx
-        jnz   _nb_kernel410nf_ia32_sse.nb410nf_dopair
-        jmp   _nb_kernel410nf_ia32_sse.nb410nf_checksingle
-_nb_kernel410nf_ia32_sse.nb410nf_dopair: 
-        movl  nb410nf_innerjjnr(%esp),%ecx
-        movl  (%ecx),%eax
-        movl  4(%ecx),%ebx
-        addl $8,nb410nf_innerjjnr(%esp)
-
-        xorps %xmm2,%xmm2
-        movaps %xmm2,%xmm6
-
-        ## load isa2
-        movl nb410nf_invsqrta(%ebp),%esi
-        movss (%esi,%eax,4),%xmm2
-        movss (%esi,%ebx,4),%xmm3
-        unpcklps %xmm3,%xmm2    ## isa2 in xmm3(0,1)
-        mulps  nb410nf_isai(%esp),%xmm2
-        movaps %xmm2,nb410nf_isaprod(%esp)
-        movaps %xmm2,%xmm1
-        mulps nb410nf_gbtsc(%esp),%xmm1
-        movaps %xmm1,nb410nf_gbscale(%esp)
-
-        movl nb410nf_charge(%ebp),%esi     ## base of charge[]  
-        movss (%esi,%eax,4),%xmm3
-        movss (%esi,%ebx,4),%xmm6
-        unpcklps %xmm6,%xmm3 ## constant 00001000 ;# xmm3(0,1) has the charges 
-
-        mulps  nb410nf_iq(%esp),%xmm2
-        mulps  %xmm2,%xmm3
-        movaps %xmm3,nb410nf_qq(%esp)
-
-        movl nb410nf_type(%ebp),%esi
-        movl  %eax,%ecx
-        movl  %ebx,%edx
-        movl (%esi,%ecx,4),%ecx
-        movl (%esi,%edx,4),%edx
-        movl nb410nf_vdwparam(%ebp),%esi
-        shll %ecx
-        shll %edx
-        movl nb410nf_ntia(%esp),%edi
-        addl %edi,%ecx
-        addl %edi,%edx
-        movlps (%esi,%ecx,4),%xmm6
-        movhps (%esi,%edx,4),%xmm6
-        movl nb410nf_pos(%ebp),%edi
-
-        movaps %xmm6,%xmm4
-        shufps $8,%xmm4,%xmm4 ## constant 00001000       
-        shufps $13,%xmm6,%xmm6 ## constant 00001101
-        movlhps %xmm7,%xmm4
-        movlhps %xmm7,%xmm6
-
-        movaps %xmm4,nb410nf_c6(%esp)
-        movaps %xmm6,nb410nf_c12(%esp)
-
-        leal  (%eax,%eax,2),%eax
-        leal  (%ebx,%ebx,2),%ebx
-        ## move coordinates to xmm0-xmm2 
-        movlps (%edi,%eax,4),%xmm1
-        movss 8(%edi,%eax,4),%xmm2
-        movhps (%edi,%ebx,4),%xmm1
-        movss 8(%edi,%ebx,4),%xmm0
-
-        movlhps %xmm7,%xmm3
-
-        shufps $0,%xmm0,%xmm2
-
-        movaps %xmm1,%xmm0
-
-        shufps $136,%xmm2,%xmm2 ## constant 10001000
-
-        shufps $136,%xmm0,%xmm0 ## constant 10001000
-        shufps $221,%xmm1,%xmm1 ## constant 11011101
-
-        movl   nb410nf_faction(%ebp),%edi
-        ## move ix-iz to xmm4-xmm6 
-        xorps   %xmm7,%xmm7
-
-        movaps nb410nf_ix(%esp),%xmm4
-        movaps nb410nf_iy(%esp),%xmm5
-        movaps nb410nf_iz(%esp),%xmm6
-
-        ## calc dr 
-        subps %xmm0,%xmm4
-        subps %xmm1,%xmm5
-        subps %xmm2,%xmm6
-
-        ## square it 
-        mulps %xmm4,%xmm4
-        mulps %xmm5,%xmm5
-        mulps %xmm6,%xmm6
-        addps %xmm5,%xmm4
-        addps %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb410nf_three(%esp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb410nf_half(%esp),%xmm0
-        subps %xmm5,%xmm1       ## constant 30-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r 
-        mulps nb410nf_gbscale(%esp),%xmm4
-
-        cvttps2pi %xmm4,%mm6    ## mm6 contain lu indices 
-        cvtpi2ps %mm6,%xmm6
-        subps %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-
-        pslld $2,%mm6
-
-        movl nb410nf_GBtab(%ebp),%esi
-        movd %mm6,%ecx
-        psrlq $32,%mm6
-        movd %mm6,%edx
-
-        ## load coulomb table
-        movaps (%esi,%ecx,4),%xmm4
-        movaps (%esi,%edx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm4,%xmm6
-        unpcklps %xmm7,%xmm4    ## Y1 Y2 F1 F2 
-        unpckhps %xmm7,%xmm6    ## G1 G2 H1 H2
-        movhlps  %xmm4,%xmm5    ## F1 F2 
-        movhlps  %xmm6,%xmm7    ## H1 H2
-        ## coulomb table ready, in xmm4-xmm7    
-
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        movaps nb410nf_qq(%esp),%xmm3
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  %xmm3,%xmm5 ## vcoul=qq*VV  
-
-        addps  nb410nf_vctot(%esp),%xmm5
-        movaps %xmm5,nb410nf_vctot(%esp)
-
-        ## L-J 
-        movaps %xmm0,%xmm4
-        mulps  %xmm0,%xmm4      ## xmm4=rinvsq 
-
-        ## at this point mm5 contains vcoul and mm3 fijC 
-        ## increment vcoul - then we can get rid of mm5 
-        ## update vctot 
-
-        movaps %xmm4,%xmm6
-        mulps  %xmm4,%xmm6
-
-        mulps  %xmm4,%xmm6      ## xmm6=rinvsix 
-        movaps %xmm6,%xmm4
-        mulps  %xmm4,%xmm4      ## xmm4=rinvtwelve 
-        mulps  nb410nf_c6(%esp),%xmm6
-        mulps  nb410nf_c12(%esp),%xmm4
-        movaps nb410nf_Vvdwtot(%esp),%xmm7
-        addps  %xmm4,%xmm7
-        subps  %xmm6,%xmm7
-        movaps %xmm7,nb410nf_Vvdwtot(%esp)
-
-_nb_kernel410nf_ia32_sse.nb410nf_checksingle:   
-        movl  nb410nf_innerk(%esp),%edx
-        andl  $1,%edx
-        jnz    _nb_kernel410nf_ia32_sse.nb410nf_dosingle
-        jmp    _nb_kernel410nf_ia32_sse.nb410nf_updateouterdata
-_nb_kernel410nf_ia32_sse.nb410nf_dosingle: 
-        movl nb410nf_charge(%ebp),%esi
-        movl nb410nf_invsqrta(%ebp),%edx
-        movl nb410nf_pos(%ebp),%edi
-        movl  nb410nf_innerjjnr(%esp),%ecx
-        movl  (%ecx),%eax
-        xorps  %xmm2,%xmm2
-        movaps %xmm2,%xmm6
-        movss (%edx,%eax,4),%xmm2       ## isa2
-        mulss nb410nf_isai(%esp),%xmm2
-        movss %xmm2,nb410nf_isaprod(%esp)
-        movss %xmm2,%xmm1
-        mulss nb410nf_gbtsc(%esp),%xmm1
-        movss %xmm1,nb410nf_gbscale(%esp)
-
-        mulss  nb410nf_iq(%esp),%xmm2
-        movss (%esi,%eax,4),%xmm6       ## xmm6(0) has the charge       
-        mulss  %xmm2,%xmm6
-        movss %xmm6,nb410nf_qq(%esp)
-
-        movl nb410nf_type(%ebp),%esi
-        movl %eax,%ecx
-        movl (%esi,%ecx,4),%ecx
-        movl nb410nf_vdwparam(%ebp),%esi
-        shll %ecx
-        addl nb410nf_ntia(%esp),%ecx
-        movlps (%esi,%ecx,4),%xmm6
-        movaps %xmm6,%xmm4
-        shufps $252,%xmm4,%xmm4 ## constant 11111100    
-        shufps $253,%xmm6,%xmm6 ## constant 11111101    
-
-        movaps %xmm4,nb410nf_c6(%esp)
-        movaps %xmm6,nb410nf_c12(%esp)
-
-        leal  (%eax,%eax,2),%eax
-
-        ## move coordinates to xmm0-xmm2 
-        movss (%edi,%eax,4),%xmm0
-        movss 4(%edi,%eax,4),%xmm1
-        movss 8(%edi,%eax,4),%xmm2
-
-        movaps nb410nf_ix(%esp),%xmm4
-        movaps nb410nf_iy(%esp),%xmm5
-        movaps nb410nf_iz(%esp),%xmm6
-
-        ## calc dr 
-        subss %xmm0,%xmm4
-        subss %xmm1,%xmm5
-        subss %xmm2,%xmm6
-
-        ## square it 
-        mulss %xmm4,%xmm4
-        mulss %xmm5,%xmm5
-        mulss %xmm6,%xmm6
-        addss %xmm5,%xmm4
-        addss %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtss %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulss %xmm5,%xmm5
-        movss nb410nf_three(%esp),%xmm1
-        mulss %xmm4,%xmm5       ## rsq*lu*lu                    
-        movss nb410nf_half(%esp),%xmm0
-        subss %xmm5,%xmm1       ## constant 30-rsq*lu*lu 
-        mulss %xmm2,%xmm1
-        mulss %xmm1,%xmm0       ## xmm0=rinv 
-
-        mulss %xmm0,%xmm4       ## xmm4=r 
-        mulss nb410nf_gbscale(%esp),%xmm4
-
-        cvttss2si %xmm4,%ebx    ## mm6 contain lu indices 
-        cvtsi2ss %ebx,%xmm6
-        subss %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulss  %xmm2,%xmm2      ## xmm2=eps2 
-
-        shll $2,%ebx
-        movl nb410nf_GBtab(%ebp),%esi
-
-        movaps (%esi,%ebx,4),%xmm4
-        movhlps %xmm4,%xmm6
-        movaps %xmm4,%xmm5
-        movaps %xmm6,%xmm7
-        shufps $1,%xmm5,%xmm5
-        shufps $1,%xmm7,%xmm7
-        ## table ready in xmm4-xmm7 
-
-        mulss  %xmm1,%xmm6      ## xmm6=Geps 
-        mulss  %xmm2,%xmm7      ## xmm7=Heps2 
-        addss  %xmm6,%xmm5
-        addss  %xmm7,%xmm5      ## xmm5=Fp      
-        movss nb410nf_qq(%esp),%xmm3
-        mulss  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addss  %xmm4,%xmm5 ## xmm5=VV 
-        mulss  %xmm3,%xmm5 ## vcoul=qq*VV  
-        addss  nb410nf_vctot(%esp),%xmm5
-        movss %xmm5,nb410nf_vctot(%esp)
-
-        ## L-J 
-        movaps %xmm0,%xmm4
-        mulss  %xmm0,%xmm4      ## xmm4=rinvsq 
-
-        movaps %xmm4,%xmm6
-        mulss  %xmm4,%xmm6
-
-        mulss  %xmm4,%xmm6      ## xmm6=rinvsix 
-        movaps %xmm6,%xmm4
-        mulss  %xmm4,%xmm4      ## xmm4=rinvtwelve 
-        mulss  nb410nf_c6(%esp),%xmm6
-        mulss  nb410nf_c12(%esp),%xmm4
-        movss nb410nf_Vvdwtot(%esp),%xmm7
-        addps  %xmm4,%xmm7
-        subps  %xmm6,%xmm7
-        movss %xmm7,nb410nf_Vvdwtot(%esp)
-
-_nb_kernel410nf_ia32_sse.nb410nf_updateouterdata: 
-        ## get n from stack
-        movl nb410nf_n(%esp),%esi
-        ## get group index for i particle 
-        movl  nb410nf_gid(%ebp),%edx            ## base of gid[]
-        movl  (%edx,%esi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total potential energy and update it 
-        movaps nb410nf_vctot(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        ## add earlier value from mem 
-        movl  nb410nf_Vc(%ebp),%eax
-        addss (%eax,%edx,4),%xmm7
-        ## move back to mem 
-        movss %xmm7,(%eax,%edx,4)
-
-        ## accumulate total lj energy and update it 
-        movaps nb410nf_Vvdwtot(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        ## add earlier value from mem 
-        movl  nb410nf_Vvdw(%ebp),%eax
-        addss (%eax,%edx,4),%xmm7
-        ## move back to mem 
-        movss %xmm7,(%eax,%edx,4)
-
-        ## finish if last 
-        movl nb410nf_nn1(%esp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx
-        jz _nb_kernel410nf_ia32_sse.nb410nf_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb410nf_n(%esp)
-        jmp _nb_kernel410nf_ia32_sse.nb410nf_outer
-_nb_kernel410nf_ia32_sse.nb410nf_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb410nf_nri(%esp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx
-        jz _nb_kernel410nf_ia32_sse.nb410nf_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel410nf_ia32_sse.nb410nf_threadloop
-_nb_kernel410nf_ia32_sse.nb410nf_end: 
-        emms
-
-        movl nb410nf_nouter(%esp),%eax
-        movl nb410nf_ninner(%esp),%ebx
-        movl nb410nf_outeriter(%ebp),%ecx
-        movl nb410nf_inneriter(%ebp),%edx
-        movl %eax,(%ecx)
-        movl %ebx,(%edx)
-
-        movl nb410nf_salign(%esp),%eax
-        addl %eax,%esp
-        addl $292,%esp
-        popl %edi
-        popl %esi
-        popl %edx
-        popl %ecx
-        popl %ebx
-        popl %eax
-        leave
-        ret
-
diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel430_ia32_sse.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel430_ia32_sse.intel_syntax.s
deleted file mode 100644
index 9fefa0256f..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel430_ia32_sse.intel_syntax.s
+++ /dev/null
@@ -1,2409 +0,0 @@
-;#
-;#
-;# Gromacs 4.0                         Copyright (c) 1991-2003 
-;# David van der Spoel, Erik Lindahl
-;#
-;# This program is free software; you can redistribute it and/or
-;# modify it under the terms of the GNU General Public License
-;# as published by the Free Software Foundation; either version 2
-;# of the License, or (at your option) any later version.
-;#
-;# To help us fund GROMACS development, we humbly ask that you cite
-;# the research papers on the package. Check out http://www.gromacs.org
-;# 
-;# And Hey:
-;# Gnomes, ROck Monsters And Chili Sauce
-;#
-
-;# These files require GNU binutils 2.10 or later, since we
-;# use intel syntax for portability, or a recent version 
-;# of NASM that understands Extended 3DNow and SSE2 instructions.
-;# (NASM is normally only used with MS Visual C++).
-;# Since NASM and gnu as disagree on some definitions and use 
-;# completely different preprocessing options I have to introduce a
-;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
-;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
-;# reason why all comments need both symbols...
-;# The source is written for GNU as, with intel syntax. When you use
-;# NASM we redefine a couple of things. The false if-statement around 
-;# the following code is seen by GNU as, but NASM doesn't see it, so 
-;# the code inside is read by NASM but not gcc.
-
-; .if 0    # block below only read by NASM
-%define .section	section
-%define .long		dd
-%define .align		align
-%define .globl		global
-;# NASM only wants 'dword', not 'dword ptr'.
-%define ptr
-%macro .equiv                  2
-   %1 equ %2
-%endmacro
-; .endif                   # End of NASM-specific block
-; .intel_syntax noprefix   # Line only read by gnu as
-
-
-
-
-
-.globl nb_kernel430_ia32_sse
-.globl _nb_kernel430_ia32_sse
-nb_kernel430_ia32_sse:	
-_nb_kernel430_ia32_sse:	
-.equiv          nb430_p_nri,            8
-.equiv          nb430_iinr,             12
-.equiv          nb430_jindex,           16
-.equiv          nb430_jjnr,             20
-.equiv          nb430_shift,            24
-.equiv          nb430_shiftvec,         28
-.equiv          nb430_fshift,           32
-.equiv          nb430_gid,              36
-.equiv          nb430_pos,              40
-.equiv          nb430_faction,          44
-.equiv          nb430_charge,           48
-.equiv          nb430_p_facel,          52
-.equiv          nb430_argkrf,           56
-.equiv          nb430_argcrf,           60
-.equiv          nb430_Vc,               64
-.equiv          nb430_type,             68
-.equiv          nb430_p_ntype,          72
-.equiv          nb430_vdwparam,         76
-.equiv          nb430_Vvdw,             80
-.equiv          nb430_p_tabscale,       84
-.equiv          nb430_VFtab,            88
-.equiv          nb430_invsqrta,         92
-.equiv          nb430_dvda,             96
-.equiv          nb430_p_gbtabscale,     100
-.equiv          nb430_GBtab,            104
-.equiv          nb430_p_nthreads,       108
-.equiv          nb430_count,            112
-.equiv          nb430_mtx,              116
-.equiv          nb430_outeriter,        120
-.equiv          nb430_inneriter,        124
-.equiv          nb430_work,             128
-	;# stack offsets for local variables  
-	;# bottom of stack is cache-aligned for sse use 
-.equiv          nb430_ix,               0
-.equiv          nb430_iy,               16
-.equiv          nb430_iz,               32
-.equiv          nb430_iq,               48
-.equiv          nb430_dx,               64
-.equiv          nb430_dy,               80
-.equiv          nb430_dz,               96
-.equiv          nb430_two,              112
-.equiv          nb430_gbtsc,            128
-.equiv          nb430_tsc,              144
-.equiv          nb430_qq,               160
-.equiv          nb430_c6,               176
-.equiv          nb430_c12,              192
-.equiv          nb430_fscal,            208
-.equiv          nb430_vctot,            224
-.equiv          nb430_Vvdwtot,          240
-.equiv          nb430_fix,              256
-.equiv          nb430_fiy,              272
-.equiv          nb430_fiz,              288
-.equiv          nb430_half,             304
-.equiv          nb430_three,            320
-.equiv          nb430_r,                336
-.equiv          nb430_isai,             352
-.equiv          nb430_isaprod,          368
-.equiv          nb430_dvdasum,          384
-.equiv          nb430_gbscale,          400
-.equiv          nb430_ii,               416
-.equiv          nb430_is3,              420
-.equiv          nb430_ii3,              424
-.equiv          nb430_ntia,             428
-.equiv          nb430_innerjjnr,        432
-.equiv          nb430_innerk,           436
-.equiv          nb430_n,                440
-.equiv          nb430_nn1,              444
-.equiv          nb430_jnra,             448
-.equiv          nb430_jnrb,             452
-.equiv          nb430_jnrc,             456
-.equiv          nb430_jnrd,             460
-.equiv          nb430_nri,              464
-.equiv          nb430_facel,            468
-.equiv          nb430_ntype,            472
-.equiv          nb430_nouter,           476
-.equiv          nb430_ninner,           480
-.equiv          nb430_salign,           484
-	push ebp
-	mov ebp,esp	
-    	push eax
-    	push ebx
-    	push ecx
-    	push edx
-	push esi
-	push edi
-	sub esp, 488		;# local stack space 
-	mov  eax, esp
-	and  eax, 0xf
-	sub esp, eax
-	mov [esp + nb430_salign], eax
-
-	emms
-
-	;# Move args passed by reference to stack
-	mov ecx, [ebp + nb430_p_nri]
-	mov esi, [ebp + nb430_p_facel]
-	mov edi, [ebp + nb430_p_ntype]
-	mov ecx, [ecx]
-	mov esi, [esi]
-	mov edi, [edi]
-	mov [esp + nb430_nri], ecx
-	mov [esp + nb430_facel], esi
-	mov [esp + nb430_ntype], edi
-
-	;# zero iteration counters
-	mov eax, 0
-	mov [esp + nb430_nouter], eax
-	mov [esp + nb430_ninner], eax
-
-
-	mov eax, [ebp + nb430_p_gbtabscale]
-	movss xmm3, [eax]
-	mov eax, [ebp + nb430_p_tabscale]
-	movss xmm4, [eax]
-	shufps xmm3, xmm3, 0
-	shufps xmm4, xmm4, 0
-	movaps [esp + nb430_gbtsc], xmm3
-	movaps [esp + nb430_tsc], xmm4
-
-	;# create constant floating-point factors on stack
-	mov eax, 0x3f000000     ;# constant 0.5 in IEEE (hex)
-	mov [esp + nb430_half], eax
-	movss xmm1, [esp + nb430_half]
-	shufps xmm1, xmm1, 0    ;# splat to all elements
-	movaps xmm2, xmm1       
-	addps  xmm2, xmm2	;# constant 1.0
-	movaps xmm3, xmm2
-	addps  xmm2, xmm2	;# constant 2.0
-	addps  xmm3, xmm2	;# constant 3.0
-	movaps [esp + nb430_half],  xmm1
-	movaps [esp + nb430_two],  xmm2
-	movaps [esp + nb430_three],  xmm3
-
-.nb430_threadloop:
-        mov   esi, [ebp + nb430_count]          ;# pointer to sync counter
-        mov   eax, [esi]
-.nb430_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+10
-        lock
-        cmpxchg [esi], ebx                      ;# write nn1 to *counter,
-                                                ;# if it hasnt changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb430_spinlock
-
-        ;# if(nn1>nri) nn1=nri
-        mov ecx, [esp + nb430_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [esp + nb430_n], eax
-        mov [esp + nb430_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists
-	mov esi, eax				;# copy n to esi
-        jg  .nb430_outerstart
-        jmp .nb430_end
-
-.nb430_outerstart:
-	;# ebx contains number of outer iterations
-	add ebx, [esp + nb430_nouter]
-	mov [esp + nb430_nouter], ebx
-
-.nb430_outer:
-	mov   eax, [ebp + nb430_shift]      ;# eax = pointer into shift[] 
-	mov   ebx, [eax + esi*4]		;# ebx=shift[n] 
-	
-	lea   ebx, [ebx + ebx*2]    ;# ebx=3*is 
-	mov   [esp + nb430_is3],ebx    	;# store is3 
-
-	mov   eax, [ebp + nb430_shiftvec]   ;# eax = base of shiftvec[] 
-
-	movss xmm0, [eax + ebx*4]
-	movss xmm1, [eax + ebx*4 + 4]
-	movss xmm2, [eax + ebx*4 + 8] 
-
-	mov   ecx, [ebp + nb430_iinr]       ;# ecx = pointer into iinr[]
-	mov   ebx, [ecx + esi*4]	    ;# ebx =ii 
-	mov   [esp + nb430_ii], ebx
-
-	mov   edx, [ebp + nb430_charge]
-	movss xmm3, [edx + ebx*4]	
-	mulss xmm3, [esp + nb430_facel]
-	shufps xmm3, xmm3, 0
-
-	mov   edx, [ebp + nb430_invsqrta]	;# load invsqrta[ii]
-	movss xmm4, [edx + ebx*4]
-	shufps xmm4, xmm4, 0
-
-    	mov   edx, [ebp + nb430_type] 
-    	mov   edx, [edx + ebx*4]
-    	imul  edx, [esp + nb430_ntype]
-    	shl   edx, 1
-    	mov   [esp + nb430_ntia], edx
-		
-	lea   ebx, [ebx + ebx*2]	;# ebx = 3*ii=ii3 
-	mov   eax, [ebp + nb430_pos]    ;# eax = base of pos[]  
-
-	addss xmm0, [eax + ebx*4]
-	addss xmm1, [eax + ebx*4 + 4]
-	addss xmm2, [eax + ebx*4 + 8]
-
-	movaps [esp + nb430_iq], xmm3
-	movaps [esp + nb430_isai], xmm4
-	
-	shufps xmm0, xmm0, 0
-	shufps xmm1, xmm1, 0
-	shufps xmm2, xmm2, 0
-
-	movaps [esp + nb430_ix], xmm0
-	movaps [esp + nb430_iy], xmm1
-	movaps [esp + nb430_iz], xmm2
-
-	mov   [esp + nb430_ii3], ebx
-	
-	;# clear vctot and i forces 
-	xorps xmm4, xmm4
-	movaps [esp + nb430_vctot], xmm4
-	movaps [esp + nb430_Vvdwtot], xmm4
-	movaps [esp + nb430_dvdasum], xmm4
-	movaps [esp + nb430_fix], xmm4
-	movaps [esp + nb430_fiy], xmm4
-	movaps [esp + nb430_fiz], xmm4
-	
-	mov   eax, [ebp + nb430_jindex]
-	mov   ecx, [eax + esi*4]	     ;# jindex[n] 
-	mov   edx, [eax + esi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   esi, [ebp + nb430_pos]
-	mov   edi, [ebp + nb430_faction]	
-	mov   eax, [ebp + nb430_jjnr]
-	shl   ecx, 2
-	add   eax, ecx
-	mov   [esp + nb430_innerjjnr], eax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  4
-	add   ecx, [esp + nb430_ninner]
-	mov   [esp + nb430_ninner], ecx
-	add   edx, 0
-	mov   [esp + nb430_innerk], edx    ;# number of innerloop atoms
-	
-	jge   .nb430_unroll_loop
-	jmp   .nb430_finish_inner
-.nb430_unroll_loop:	
-	;# quad-unroll innerloop here 
-	mov   edx, [esp + nb430_innerjjnr]     ;# pointer to jjnr[k] 
-	mov   eax, [edx]	
-	mov   ebx, [edx + 4]              
-	mov   ecx, [edx + 8]            
-	mov   edx, [edx + 12]         ;# eax-edx=jnr1-4 
-	add dword ptr [esp + nb430_innerjjnr],  16 ;# advance pointer (unrolled 4) 
-
-	;# load isaj
-	mov esi, [ebp + nb430_invsqrta]
-	movss xmm3, [esi + eax*4]
-	movss xmm4, [esi + ecx*4]
-	movss xmm6, [esi + ebx*4]
-	movss xmm7, [esi + edx*4]
-	movaps xmm2, [esp + nb430_isai]
-	shufps xmm3, xmm6, 0 
-	shufps xmm4, xmm7, 0 
-	shufps xmm3, xmm4, 136  ;# constant 10001000 ;# all isaj in xmm3  
-	mulps  xmm2, xmm3
-		
-	movaps [esp + nb430_isaprod], xmm2
-	movaps xmm1, xmm2
-	mulps xmm1, [esp + nb430_gbtsc]
-	movaps [esp + nb430_gbscale], xmm1
-	
-	mov esi, [ebp + nb430_charge]    ;# base of charge[] 
-	
-	movss xmm3, [esi + eax*4]
-	movss xmm4, [esi + ecx*4]
-	movss xmm6, [esi + ebx*4]
-	movss xmm7, [esi + edx*4]
-
-	mulps xmm2, [esp + nb430_iq]
-	shufps xmm3, xmm6, 0 
-	shufps xmm4, xmm7, 0 
-	shufps xmm3, xmm4, 136  ;# constant 10001000 ;# all charges in xmm3  
-	mulps  xmm3, xmm2
-	movaps [esp + nb430_qq], xmm3	
-
-	movd  mm0, eax		;# use mmx registers as temp storage 
-	movd  mm1, ebx
-	movd  mm2, ecx
-	movd  mm3, edx
-	
-	mov esi, [ebp + nb430_type]
-	mov eax, [esi + eax*4]
-	mov ebx, [esi + ebx*4]
-	mov ecx, [esi + ecx*4]
-	mov edx, [esi + edx*4]
-	mov esi, [ebp + nb430_vdwparam]
-	shl eax, 1	
-	shl ebx, 1	
-	shl ecx, 1	
-	shl edx, 1	
-	mov edi, [esp + nb430_ntia]
-	add eax, edi
-	add ebx, edi
-	add ecx, edi
-	add edx, edi
-
-	movlps xmm6, [esi + eax*4]
-	movlps xmm7, [esi + ecx*4]
-	movhps xmm6, [esi + ebx*4]
-	movhps xmm7, [esi + edx*4]
-
-	movaps xmm4, xmm6
-	shufps xmm4, xmm7, 136  ;# constant 10001000
-	shufps xmm6, xmm7, 221  ;# constant 11011101
-	
-	movd  eax, mm0		
-	movd  ebx, mm1
-	movd  ecx, mm2
-	movd  edx, mm3
-
-	movaps [esp + nb430_c6], xmm4
-	movaps [esp + nb430_c12], xmm6
-	
-	mov esi, [ebp + nb430_pos]       ;# base of pos[] 
-
-	mov [esp + nb430_jnra], eax
-	mov [esp + nb430_jnrb], ebx
-	mov [esp + nb430_jnrc], ecx
-	mov [esp + nb430_jnrd], edx
-	
-	lea   eax, [eax + eax*2]     ;# replace jnr with j3 
-	lea   ebx, [ebx + ebx*2]	
-	lea   ecx, [ecx + ecx*2]     
-	lea   edx, [edx + edx*2]	
-
-	;# move four coordinates to xmm0-xmm2 	
-
-	movlps xmm4, [esi + eax*4]
-	movlps xmm5, [esi + ecx*4]
-	movss xmm2, [esi + eax*4 + 8]
-	movss xmm6, [esi + ecx*4 + 8]
-
-	movhps xmm4, [esi + ebx*4]
-	movhps xmm5, [esi + edx*4]
-
-	movss xmm0, [esi + ebx*4 + 8]
-	movss xmm1, [esi + edx*4 + 8]
-
-	shufps xmm2, xmm0, 0
-	shufps xmm6, xmm1, 0
-	
-	movaps xmm0, xmm4
-	movaps xmm1, xmm4
-
-	shufps xmm2, xmm6, 136  ;# constant 10001000
-	
-	shufps xmm0, xmm5, 136  ;# constant 10001000
-	shufps xmm1, xmm5, 221  ;# constant 11011101		
-
-	;# move ix-iz to xmm4-xmm6 
-	movaps xmm4, [esp + nb430_ix]
-	movaps xmm5, [esp + nb430_iy]
-	movaps xmm6, [esp + nb430_iz]
-
-	;# calc dr 
-	subps xmm4, xmm0
-	subps xmm5, xmm1
-	subps xmm6, xmm2
-
-	;# store dr 
-	movaps [esp + nb430_dx], xmm4
-	movaps [esp + nb430_dy], xmm5
-	movaps [esp + nb430_dz], xmm6
-	;# square it 
-	mulps xmm4,xmm4
-	mulps xmm5,xmm5
-	mulps xmm6,xmm6
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [esp + nb430_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [esp + nb430_half]
-	subps xmm1, xmm5	;# constant 30-rsq*lu*lu 
-	mulps xmm1, xmm2	
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r 
-	movaps [esp + nb430_r], xmm4
-	mulps xmm4, [esp + nb430_gbscale]
-
-	movhlps xmm5, xmm4
-	cvttps2pi mm6, xmm4
-	cvttps2pi mm7, xmm5	;# mm6/mm7 contain lu indices 
-	cvtpi2ps xmm6, mm6
-	cvtpi2ps xmm5, mm7
-	movlhps xmm6, xmm5
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-	pslld mm6, 2
-	pslld mm7, 2
-
-	movd mm0, eax	
-	movd mm1, ebx
-	movd mm2, ecx
-	movd mm3, edx
-
-	mov  esi, [ebp + nb430_GBtab]
-	movd eax, mm6
-	psrlq mm6, 32
-	movd ecx, mm7
-	psrlq mm7, 32
-	movd ebx, mm6
-	movd edx, mm7
-		
-	;# load coulomb table
-	movaps xmm4, [esi + eax*4]
-	movaps xmm5, [esi + ebx*4]
-	movaps xmm6, [esi + ecx*4]
-	movaps xmm7, [esi + edx*4]
-	;# transpose, using xmm3 for scratch
-	movaps xmm3, xmm6
-	shufps xmm3, xmm7, 0xEE 
-	shufps xmm6, xmm7, 0x44
-	movaps xmm7, xmm4
-	shufps xmm7, xmm5, 0xEE
-	shufps xmm4, xmm5, 0x44
-	movaps xmm5, xmm4
-	shufps xmm5, xmm6, 0xDD
-	shufps xmm4, xmm6, 0x88
-	movaps xmm6, xmm7
-	shufps xmm6, xmm3, 0x88
-	shufps xmm7, xmm3, 0xDD
-	;# coulomb table ready, in xmm4-xmm7  		
-	
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	mulps  xmm7, [esp + nb430_two]	;# two*Heps2 
-	movaps xmm3, [esp + nb430_qq]
-	addps  xmm7, xmm6
-	addps  xmm7, xmm5 ;# xmm7=FF 
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulps  xmm3, xmm7 ;# fijC=FF*qq 
-
-	;# get jnr from stack
-	mov eax, [esp + nb430_jnra]
-	mov ebx, [esp + nb430_jnrb]
-	mov ecx, [esp + nb430_jnrc]
-	mov edx, [esp + nb430_jnrd]
-	
-	mov esi, [ebp + nb430_dvda]
-	
-	;# Calculate dVda
-	xorps xmm7, xmm7
-	mulps xmm3, [esp + nb430_gbscale]
-	movaps xmm6, xmm3
-	mulps  xmm6, [esp + nb430_r]
-	addps  xmm6, xmm5
-	addps  xmm5, [esp + nb430_vctot]
-	movaps [esp + nb430_vctot], xmm5 
-
-	;# xmm6=(vcoul+fijC*r)
-	subps  xmm7, xmm6
-	movaps xmm6, xmm7
-	
-	;# update dvdasum
-	addps  xmm7, [esp + nb430_dvdasum]
-	movaps [esp + nb430_dvdasum], xmm7 
-
-	;# update j atoms dvdaj
-	movhlps xmm7, xmm6
-	movaps  xmm5, xmm6
-	movaps  xmm4, xmm7
-	shufps  xmm5, xmm5, 0x1
-	shufps  xmm4, xmm4, 0x1
-	;# xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4
-	addss  xmm6, [esi + eax*4]
-	addss  xmm5, [esi + ebx*4]
-	addss  xmm7, [esi + ecx*4]
-	addss  xmm4, [esi + edx*4]
-	movss  [esi + eax*4], xmm6
-	movss  [esi + ebx*4], xmm5
-	movss  [esi + ecx*4], xmm7
-	movss  [esi + edx*4], xmm4
-	
-	;# put scalar force on stack temporarily 
-	movaps [esp + nb430_fscal], xmm3
-
-	movaps xmm4, [esp + nb430_r]
-	mulps xmm4, [esp + nb430_tsc]
-	
-	movhlps xmm5, xmm4
-	cvttps2pi mm6, xmm4
-	cvttps2pi mm7, xmm5	;# mm6/mm7 contain lu indices 
-	cvtpi2ps xmm6, mm6
-	cvtpi2ps xmm5, mm7
-	movlhps xmm6, xmm5
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-	pslld mm6, 3
-	pslld mm7, 3
-	
-	mov  esi, [ebp + nb430_VFtab]
-	movd eax, mm6
-	psrlq mm6, 32
-	movd ecx, mm7
-	psrlq mm7, 32
-	movd ebx, mm6
-	movd edx, mm7
-		
-	;# dispersion 
-	movaps xmm4, [esi + eax*4]
-	movaps xmm5, [esi + ebx*4]
-	movaps xmm6, [esi + ecx*4]
-	movaps xmm7, [esi + edx*4]
-	;# transpose, using xmm3 for scratch
-	movaps xmm3, xmm6
-	shufps xmm3, xmm7, 0xEE 
-	shufps xmm6, xmm7, 0x44
-	movaps xmm7, xmm4
-	shufps xmm7, xmm5, 0xEE
-	shufps xmm4, xmm5, 0x44
-	movaps xmm5, xmm4
-	shufps xmm5, xmm6, 0xDD
-	shufps xmm4, xmm6, 0x88
-	movaps xmm6, xmm7
-	shufps xmm6, xmm3, 0x88
-	shufps xmm7, xmm3, 0xDD
-	;# dispersion table ready, in xmm4-xmm7 	
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	mulps  xmm7, [esp + nb430_two]	;# two*Heps2 
-	addps  xmm7, xmm6
-	addps  xmm7, xmm5 ;# xmm7=FF 
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-
-	movaps xmm4, [esp + nb430_c6]
-	mulps  xmm7, xmm4	 ;# fijD 
-	mulps  xmm5, xmm4	 ;# Vvdw6
-	mulps  xmm7, [esp + nb430_tsc]
-	addps  xmm7, [esp + nb430_fscal] ;# add to fscal 
-
-	;# put scalar force on stack Update Vvdwtot directly 
-	addps  xmm5, [esp + nb430_Vvdwtot]
-	movaps [esp + nb430_fscal], xmm7
-	movaps [esp + nb430_Vvdwtot], xmm5
-
-	;# repulsion 
-	movaps xmm4, [esi + eax*4 + 16]
-	movaps xmm5, [esi + ebx*4 + 16]
-	movaps xmm6, [esi + ecx*4 + 16]
-	movaps xmm7, [esi + edx*4 + 16]
-	;# transpose, using xmm3 for scratch
-	movaps xmm3, xmm6
-	shufps xmm3, xmm7, 0xEE 
-	shufps xmm6, xmm7, 0x44
-	movaps xmm7, xmm4
-	shufps xmm7, xmm5, 0xEE
-	shufps xmm4, xmm5, 0x44
-	movaps xmm5, xmm4
-	shufps xmm5, xmm6, 0xDD
-	shufps xmm4, xmm6, 0x88
-	movaps xmm6, xmm7
-	shufps xmm6, xmm3, 0x88
-	shufps xmm7, xmm3, 0xDD
-	;# table ready, in xmm4-xmm7 	
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	mulps  xmm7, [esp + nb430_two]	;# two*Heps2 
-	addps  xmm7, xmm6
-	addps  xmm7, xmm5 ;# xmm7=FF 
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
- 	
-	movaps xmm4, [esp + nb430_c12]
-	mulps  xmm7, xmm4 ;# fijR 
-	mulps  xmm5, xmm4 ;# Vvdw12
-	mulps xmm7, [esp + nb430_tsc]
-	addps  xmm7, [esp + nb430_fscal] 
-	
-	addps  xmm5, [esp + nb430_Vvdwtot]
-	movaps [esp + nb430_Vvdwtot], xmm5
-	xorps  xmm4, xmm4
-
-	mulps xmm7, xmm0
-	subps  xmm4, xmm7
-
-	movaps xmm0, [esp + nb430_dx]
-	movaps xmm1, [esp + nb430_dy]
-	movaps xmm2, [esp + nb430_dz]
-
-	movd eax, mm0	
-	movd ebx, mm1
-	movd ecx, mm2
-	movd edx, mm3
-
-	mov    edi, [ebp + nb430_faction]
-	mulps  xmm0, xmm4
-	mulps  xmm1, xmm4
-	mulps  xmm2, xmm4
-	;# xmm0-xmm2 contains tx-tz (partial force) 
-	;# now update f_i 
-	movaps xmm3, [esp + nb430_fix]
-	movaps xmm4, [esp + nb430_fiy]
-	movaps xmm5, [esp + nb430_fiz]
-	addps  xmm3, xmm0
-	addps  xmm4, xmm1
-	addps  xmm5, xmm2
-	movaps [esp + nb430_fix], xmm3
-	movaps [esp + nb430_fiy], xmm4
-	movaps [esp + nb430_fiz], xmm5
-	;# the fj's - start by accumulating x & y forces from memory 
-	movlps xmm4, [edi + eax*4]
-	movlps xmm6, [edi + ecx*4]
-	movhps xmm4, [edi + ebx*4]
-	movhps xmm6, [edi + edx*4]
-
-	movaps xmm3, xmm4
-	shufps xmm3, xmm6, 136  ;# constant 10001000
-	shufps xmm4, xmm6, 221  ;# constant 11011101			      
-
-	;# now xmm3-xmm5 contains fjx, fjy, fjz 
-	subps  xmm3, xmm0
-	subps  xmm4, xmm1
-	
-	;# unpack them back so we can store them - first x & y in xmm3/xmm4 
-
-	movaps xmm6, xmm3
-	unpcklps xmm6, xmm4
-	unpckhps xmm3, xmm4	
-	;# xmm6(l)=x & y for j1, (h) for j2 
-	;# xmm3(l)=x & y for j3, (h) for j4 
-	movlps [edi + eax*4], xmm6
-	movlps [edi + ecx*4], xmm3
-	
-	movhps [edi + ebx*4], xmm6
-	movhps [edi + edx*4], xmm3
-
-	;# and the z forces 
-	movss  xmm4, [edi + eax*4 + 8]
-	movss  xmm5, [edi + ebx*4 + 8]
-	movss  xmm6, [edi + ecx*4 + 8]
-	movss  xmm7, [edi + edx*4 + 8]
-	subss  xmm4, xmm2
-	shufps xmm2, xmm2, 229  ;# constant 11100101
-	subss  xmm5, xmm2
-	shufps xmm2, xmm2, 234  ;# constant 11101010
-	subss  xmm6, xmm2
-	shufps xmm2, xmm2, 255  ;# constant 11111111
-	subss  xmm7, xmm2
-	movss  [edi + eax*4 + 8], xmm4
-	movss  [edi + ebx*4 + 8], xmm5
-	movss  [edi + ecx*4 + 8], xmm6
-	movss  [edi + edx*4 + 8], xmm7
-	
-	;# should we do one more iteration? 
-	sub dword ptr [esp + nb430_innerk],  4
-	jl    .nb430_finish_inner
-	jmp   .nb430_unroll_loop
-.nb430_finish_inner:
-	;# check if at least two particles remain 
-	add dword ptr [esp + nb430_innerk],  4
-	mov   edx, [esp + nb430_innerk]
-	and   edx, 2
-	jnz   .nb430_dopair
-	jmp   .nb430_checksingle
-.nb430_dopair:	
-
-	mov   ecx, [esp + nb430_innerjjnr]
-	
-	mov   eax, [ecx]	
-	mov   ebx, [ecx + 4]              
-	add dword ptr [esp + nb430_innerjjnr],  8	
-
-	xorps xmm2, xmm2
-	movaps xmm6, xmm2
-	
-	;# load isaj
-	mov esi, [ebp + nb430_invsqrta]
-	movss xmm2, [esi + eax*4]
-	movss xmm3, [esi + ebx*4]
-	unpcklps xmm2, xmm3	;# isaj in xmm3(0,1)
-	mulps  xmm2, [esp + nb430_isai]
-	movaps [esp + nb430_isaprod], xmm2	
-	movaps xmm1, xmm2
-	mulps xmm1, [esp + nb430_gbtsc]
-	movaps [esp + nb430_gbscale], xmm1	
-	
-	mov esi, [ebp + nb430_charge]    ;# base of charge[] 	
-	movss xmm3, [esi + eax*4]		
-	movss xmm6, [esi + ebx*4]
-	unpcklps xmm3, xmm6 ;# constant 00001000 ;# xmm3(0,1) has the charges 
-
-	mulps  xmm2, [esp + nb430_iq]
-	mulps  xmm3, xmm2
-	movaps [esp + nb430_qq], xmm3
-
-	mov esi, [ebp + nb430_type]
-	mov   ecx, eax
-	mov   edx, ebx
-	mov ecx, [esi + ecx*4]
-	mov edx, [esi + edx*4]	
-	mov esi, [ebp + nb430_vdwparam]
-	shl ecx, 1	
-	shl edx, 1	
-	mov edi, [esp + nb430_ntia]
-	add ecx, edi
-	add edx, edi
-	movlps xmm6, [esi + ecx*4]
-	movhps xmm6, [esi + edx*4]
-	mov edi, [ebp + nb430_pos]	
-	
-	movaps xmm4, xmm6
-	shufps xmm4, xmm4, 8 ;# constant 00001000 	
-	shufps xmm6, xmm6, 13 ;# constant 00001101
-	movlhps xmm4, xmm7
-	movlhps xmm6, xmm7
-	
-	movaps [esp + nb430_c6], xmm4
-	movaps [esp + nb430_c12], xmm6	
-			
-	movd  mm0, eax		;# copy jnr to mm0/mm1
-	movd  mm1, ebx
-		
-	lea   eax, [eax + eax*2]
-	lea   ebx, [ebx + ebx*2]
-	;# move coordinates to xmm0-xmm2 
-	movlps xmm1, [edi + eax*4]
-	movss xmm2, [edi + eax*4 + 8]	
-	movhps xmm1, [edi + ebx*4]
-	movss xmm0, [edi + ebx*4 + 8]	
-
-	movlhps xmm3, xmm7
-	
-	shufps xmm2, xmm0, 0
-	
-	movaps xmm0, xmm1
-
-	shufps xmm2, xmm2, 136  ;# constant 10001000
-	
-	shufps xmm0, xmm0, 136  ;# constant 10001000
-	shufps xmm1, xmm1, 221  ;# constant 11011101
-			
-	mov    edi, [ebp + nb430_faction]
-	;# move ix-iz to xmm4-xmm6 
-	xorps   xmm7, xmm7
-	
-	movaps xmm4, [esp + nb430_ix]
-	movaps xmm5, [esp + nb430_iy]
-	movaps xmm6, [esp + nb430_iz]
-
-	;# calc dr 
-	subps xmm4, xmm0
-	subps xmm5, xmm1
-	subps xmm6, xmm2
-
-	;# store dr 
-	movaps [esp + nb430_dx], xmm4
-	movaps [esp + nb430_dy], xmm5
-	movaps [esp + nb430_dz], xmm6
-	;# square it 
-	mulps xmm4,xmm4
-	mulps xmm5,xmm5
-	mulps xmm6,xmm6
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [esp + nb430_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [esp + nb430_half]
-	subps xmm1, xmm5	;# constant 30-rsq*lu*lu 
-	mulps xmm1, xmm2	
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r 
-	movaps [esp + nb430_r], xmm4
-	mulps xmm4, [esp + nb430_gbscale]
-
-	cvttps2pi mm6, xmm4     ;# mm6 contain lu indices 
-	cvtpi2ps xmm6, mm6
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-
-	pslld mm6, 2
-
-	mov  esi, [ebp + nb430_GBtab]
-	movd ecx, mm6
-	psrlq mm6, 32
-	movd edx, mm6
-
-	;# load coulomb table
-	movaps xmm4, [esi + ecx*4]
-	movaps xmm7, [esi + edx*4]
-	;# transpose, using xmm3 for scratch
-	movaps xmm6, xmm4
-	unpcklps xmm4, xmm7  	;# Y1 Y2 F1 F2 
-	unpckhps xmm6, xmm7     ;# G1 G2 H1 H2
-	movhlps  xmm5, xmm4    	;# F1 F2 
-	movhlps  xmm7, xmm6     ;# H1 H2
-	;# coulomb table ready, in xmm4-xmm7  	
-
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	mulps  xmm7, [esp + nb430_two]	;# two*Heps2 
-	movaps xmm3, [esp + nb430_qq]
-	addps  xmm7, xmm6
-	addps  xmm7, xmm5 ;# xmm7=FF 
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulps  xmm3, xmm7 ;# fijC=FF*qq 
-
-	;# get jnr from mm0/mm1
-	movd ecx, mm0
-	movd edx, mm1
-
-	mov esi, [ebp + nb430_dvda]
-	
-	;# Calculate dVda
-	xorps xmm7, xmm7
-	mulps xmm3, [esp + nb430_gbscale]
-	movaps xmm6, xmm3
-	mulps  xmm6, [esp + nb430_r]
-	addps  xmm6, xmm5
-	addps  xmm5, [esp + nb430_vctot]
-	movaps [esp + nb430_vctot], xmm5 
-
-	;# xmm6=(vcoul+fijC*r)
-	subps  xmm7, xmm6
-	movaps xmm6, xmm7
-	
-	;# update dvdasum
-	addps  xmm7, [esp + nb430_dvdasum]
-	movaps [esp + nb430_dvdasum], xmm7 
-
-	;# update j atoms dvdaj
-	movaps xmm7, xmm6
-	shufps xmm7, xmm7, 0x1
-	addss  xmm6, [esi + ecx*4]
-	addss  xmm7, [esi + edx*4]
-	movss  [esi + ecx*4], xmm6
-	movss  [esi + edx*4], xmm7
-	
-	;# put scalar force on stack temporarily 
-	movaps [esp + nb430_fscal], xmm3
-
-	movaps xmm4, [esp + nb430_r]
-	mulps xmm4, [esp + nb430_tsc]
-	
-	cvttps2pi mm6, xmm4
-	cvtpi2ps xmm6, mm6
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-	pslld mm6, 3
-	
-	mov  esi, [ebp + nb430_VFtab]
-	movd ecx, mm6
-	psrlq mm6, 32
-	movd edx, mm6
-			
-	;# dispersion 
-	movaps xmm4, [esi + ecx*4]
-	movaps xmm7, [esi + edx*4]
-	;# transpose, using xmm3 for scratch
-	movaps xmm6, xmm4
-	unpcklps xmm4, xmm7  	;# Y1 Y2 F1 F2 
-	unpckhps xmm6, xmm7     ;# G1 G2 H1 H2
-	movhlps  xmm5, xmm4    	;# F1 F2 
-	movhlps  xmm7, xmm6     ;# H1 H2
-	;# dispersion table ready, in xmm4-xmm7 	
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	mulps  xmm7, [esp + nb430_two]	;# two*Heps2 
-	addps  xmm7, xmm6
-	addps  xmm7, xmm5 ;# xmm7=FF 
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-
-	movaps xmm4, [esp + nb430_c6]
-	mulps  xmm7, xmm4	 ;# fijD 
-	mulps  xmm5, xmm4	 ;# Vvdw6 
-	mulps  xmm7, [esp + nb430_tsc]
-	addps  xmm7, [esp + nb430_fscal] ;# add to fscal 
-
-	;# put scalar force on stack Update Vvdwtot directly 
-	addps  xmm5, [esp + nb430_Vvdwtot]
-	movaps [esp + nb430_fscal], xmm7
-	movaps [esp + nb430_Vvdwtot], xmm5
-
-	;# repulsion 
-	movaps xmm4, [esi + ecx*4 + 16]
-	movaps xmm7, [esi + edx*4 + 16]
-	;# transpose, using xmm3 for scratch
-	movaps xmm6, xmm4
-	unpcklps xmm4, xmm7  	;# Y1 Y2 F1 F2 
-	unpckhps xmm6, xmm7     ;# G1 G2 H1 H2
-	movhlps  xmm5, xmm4    	;# F1 F2 
-	movhlps  xmm7, xmm6     ;# H1 H2
-	;# table ready, in xmm4-xmm7 	
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	mulps  xmm7, [esp + nb430_two]	;# two*Heps2 
-	addps  xmm7, xmm6
-	addps  xmm7, xmm5 ;# xmm7=FF 
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
- 	
-	movaps xmm4, [esp + nb430_c12]
-	mulps  xmm7, xmm4 ;# fijR 
-	mulps  xmm5, xmm4 ;# Vvdw12 
-	mulps  xmm7, [esp + nb430_tsc]
-	addps  xmm7, [esp + nb430_fscal] 
-	
-	addps  xmm5, [esp + nb430_Vvdwtot]
-	movaps [esp + nb430_Vvdwtot], xmm5
-	xorps  xmm4, xmm4
-
-	mulps xmm7, xmm0
-	subps  xmm4, xmm7
-
-	movaps xmm0, [esp + nb430_dx]
-	movaps xmm1, [esp + nb430_dy]
-	movaps xmm2, [esp + nb430_dz]
-
-	mulps  xmm0, xmm4
-	mulps  xmm1, xmm4
-	mulps  xmm2, xmm4
-	;# xmm0-xmm2 contains tx-tz (partial force) 
-	;# now update f_i 
-	movaps xmm3, [esp + nb430_fix]
-	movaps xmm4, [esp + nb430_fiy]
-	movaps xmm5, [esp + nb430_fiz]
-	addps  xmm3, xmm0
-	addps  xmm4, xmm1
-	addps  xmm5, xmm2
-	movaps [esp + nb430_fix], xmm3
-	movaps [esp + nb430_fiy], xmm4
-	movaps [esp + nb430_fiz], xmm5
-	;# update the fj's 
-	movss   xmm3, [edi + eax*4]
-	movss   xmm4, [edi + eax*4 + 4]
-	movss   xmm5, [edi + eax*4 + 8]
-	subss   xmm3, xmm0
-	subss   xmm4, xmm1
-	subss   xmm5, xmm2	
-	movss   [edi + eax*4], xmm3
-	movss   [edi + eax*4 + 4], xmm4
-	movss   [edi + eax*4 + 8], xmm5	
-
-	shufps  xmm0, xmm0, 225  ;# constant 11100001
-	shufps  xmm1, xmm1, 225  ;# constant 11100001
-	shufps  xmm2, xmm2, 225  ;# constant 11100001
-
-	movss   xmm3, [edi + ebx*4]
-	movss   xmm4, [edi + ebx*4 + 4]
-	movss   xmm5, [edi + ebx*4 + 8]
-	subss   xmm3, xmm0
-	subss   xmm4, xmm1
-	subss   xmm5, xmm2	
-	movss   [edi + ebx*4], xmm3
-	movss   [edi + ebx*4 + 4], xmm4
-	movss   [edi + ebx*4 + 8], xmm5	
-
-.nb430_checksingle:				
-	mov   edx, [esp + nb430_innerk]
-	and   edx, 1
-	jnz    .nb430_dosingle
-	jmp    .nb430_updateouterdata
-.nb430_dosingle:
-	mov esi, [ebp + nb430_charge]
-	mov edx, [ebp + nb430_invsqrta]
-	mov edi, [ebp + nb430_pos]
-	mov   ecx, [esp + nb430_innerjjnr]
-	mov   eax, [ecx]	
-	xorps  xmm2, xmm2
-	movaps xmm6, xmm2
-	movss xmm2, [edx + eax*4]	;# isaj
-	mulss xmm2, [esp + nb430_isai]
-	movss [esp + nb430_isaprod], xmm2	
-	movss xmm1, xmm2
-	mulss xmm1, [esp + nb430_gbtsc]
-	movss [esp + nb430_gbscale], xmm1	
-	
-	mulss  xmm2, [esp + nb430_iq]
-	movss xmm6, [esi + eax*4]	;# xmm6(0) has the charge 	
-	mulss  xmm6, xmm2
-	movss [esp + nb430_qq], xmm6
-		
-	mov esi, [ebp + nb430_type]
-	mov ecx, eax
-	mov ecx, [esi + ecx*4]	
-	mov esi, [ebp + nb430_vdwparam]
-	shl ecx, 1
-	add ecx, [esp + nb430_ntia]
-	movlps xmm6, [esi + ecx*4]
-	movaps xmm4, xmm6
-	shufps xmm4, xmm4, 252  ;# constant 11111100	
-	shufps xmm6, xmm6, 253  ;# constant 11111101	
-			
-	movss [esp + nb430_c6], xmm4
-	movss [esp + nb430_c12], xmm6	
-
-	movd  mm0, eax
-	lea   eax, [eax + eax*2]
-	
-	;# move coordinates to xmm0-xmm2 
-	movss xmm0, [edi + eax*4]	
-	movss xmm1, [edi + eax*4 + 4]	
-	movss xmm2, [edi + eax*4 + 8]	 
-	
-	movss xmm4, [esp + nb430_ix]
-	movss xmm5, [esp + nb430_iy]
-	movss xmm6, [esp + nb430_iz]
-
-	;# calc dr 
-	subss xmm4, xmm0
-	subss xmm5, xmm1
-	subss xmm6, xmm2
-
-	;# store dr 
-	movaps [esp + nb430_dx], xmm4
-	movaps [esp + nb430_dy], xmm5
-	movaps [esp + nb430_dz], xmm6
-	;# square it 
-	mulss xmm4,xmm4
-	mulss xmm5,xmm5
-	mulss xmm6,xmm6
-	addss xmm4, xmm5
-	addss xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtss xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulss xmm5, xmm5
-	movss xmm1, [esp + nb430_three]
-	mulss xmm5, xmm4	;# rsq*lu*lu 			
-	movss xmm0, [esp + nb430_half]
-	subss xmm1, xmm5	;# constant 30-rsq*lu*lu 
-	mulss xmm1, xmm2	
-	mulss xmm0, xmm1	;# xmm0=rinv 
-
-	mulss xmm4, xmm0	;# xmm4=r 
-	movss [esp + nb430_r], xmm4
-	mulss xmm4, [esp + nb430_gbscale]
-
-	cvttss2si ebx, xmm4     ;# mm6 contain lu indices 
-	cvtsi2ss xmm6, ebx
-	subss xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulss  xmm2, xmm2	;# xmm2=eps2 
-
-	shl ebx, 2
-
-	mov  esi, [ebp + nb430_GBtab]
-						
-	movaps xmm4, [esi + ebx*4]	
-	movhlps xmm6, xmm4
-	movaps xmm5, xmm4
-	movaps xmm7, xmm6
-	shufps xmm5, xmm5, 1
-	shufps xmm7, xmm7, 1
-	;# table ready in xmm4-xmm7 
-
-	mulss  xmm6, xmm1	;# xmm6=Geps 
-	mulss  xmm7, xmm2	;# xmm7=Heps2 
-	addss  xmm5, xmm6
-	addss  xmm5, xmm7	;# xmm5=Fp 	
-	mulss  xmm7, [esp + nb430_two]	;# two*Heps2 
-	movss xmm3, [esp + nb430_qq]
-	addss  xmm7, xmm6
-	addss  xmm7, xmm5 ;# xmm7=FF 
-	mulss  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addss  xmm5, xmm4 ;# xmm5=VV 
-	mulss  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulss  xmm3, xmm7 ;# fijC=FF*qq 
-
-	movd ebx, mm0
-	mov esi, [ebp + nb430_dvda]
-	
-	;# Calculate dVda
-	xorps xmm7, xmm7
-	mulss xmm3, [esp + nb430_gbscale]
-	movaps xmm6, xmm3
-	mulss  xmm6, [esp + nb430_r]
-	addss  xmm6, xmm5
-	addss  xmm5, [esp + nb430_vctot]
-	movss [esp + nb430_vctot], xmm5 
-	
-
-	;# xmm6=(vcoul+fijC*r)
-	subss  xmm7, xmm6
-	movaps xmm6, xmm7
-	
-	;# update dvdasum
-	addss  xmm7, [esp + nb430_dvdasum]
-	movaps [esp + nb430_dvdasum], xmm7 
-
-	;# update j atoms dvdaj
-	addss  xmm6, [esi + ebx*4]
-	movss  [esi + ebx*4], xmm6
-	
-	;# put scalar force on stack temporarily 
-	movss [esp + nb430_fscal], xmm3
-
-	movss xmm4, [esp + nb430_r]
-	mulps xmm4, [esp + nb430_tsc]
-	
-	cvttss2si ebx, xmm4
-	cvtsi2ss xmm6, ebx
-	subss xmm4, xmm6	
-	movss xmm1, xmm4	;# xmm1=eps 
-	movss xmm2, xmm1	
-	mulss  xmm2, xmm2	;# xmm2=eps2 
-
-	shl ebx, 3
-	mov  esi, [ebp + nb430_VFtab]
-			
-	;# dispersion 
-	movaps xmm4, [esi + ebx*4]	
-	movhlps xmm6, xmm4
-	movaps xmm5, xmm4
-	movaps xmm7, xmm6
-	shufps xmm5, xmm5, 1
-	shufps xmm7, xmm7, 1
-	;# table ready in xmm4-xmm7 
-
-	mulss  xmm6, xmm1	;# xmm6=Geps 
-	mulss  xmm7, xmm2	;# xmm7=Heps2 
-	addss  xmm5, xmm6
-	addss  xmm5, xmm7	;# xmm5=Fp 	
-	mulss  xmm7, [esp + nb430_two]	;# two*Heps2 
-	addss  xmm7, xmm6
-	addss  xmm7, xmm5 ;# xmm7=FF 
-	mulss  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addss  xmm5, xmm4 ;# xmm5=VV 
-
-	movss xmm4, [esp + nb430_c6]
-	mulss  xmm7, xmm4	 ;# fijD 
-	mulss  xmm5, xmm4	 ;# Vvdw6
-	mulps  xmm7, [esp + nb430_tsc]
-	addss  xmm7, [esp + nb430_fscal] ;# add to fscal 
-
-	;# put scalar force on stack Update Vvdwtot directly 
-	addss  xmm5, [esp + nb430_Vvdwtot]
-	movss [esp + nb430_fscal], xmm7
-	movss [esp + nb430_Vvdwtot], xmm5
-
-	;# repulsion 
-	movaps xmm4, [esi + ebx*4 + 16]	
-	movhlps xmm6, xmm4
-	movaps xmm5, xmm4
-	movaps xmm7, xmm6
-	shufps xmm5, xmm5, 1
-	shufps xmm7, xmm7, 1
-	;# table ready in xmm4-xmm7 
-	
-	mulss  xmm6, xmm1	;# xmm6=Geps 
-	mulss  xmm7, xmm2	;# xmm7=Heps2 
-	addss  xmm5, xmm6
-	addss  xmm5, xmm7	;# xmm5=Fp 	
-	mulss  xmm7, [esp + nb430_two]	;# two*Heps2 
-	addss  xmm7, xmm6
-	addss  xmm7, xmm5 ;# xmm7=FF 
-	mulss  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addss  xmm5, xmm4 ;# xmm5=VV 
- 	
-	movss xmm4, [esp + nb430_c12]
-	mulss  xmm7, xmm4 ;# fijR 
-	mulss  xmm5, xmm4 ;# Vvdw12 
-	mulps  xmm7, [esp + nb430_tsc]
-	addss  xmm7, [esp + nb430_fscal] 
-	
-	addss  xmm5, [esp + nb430_Vvdwtot]
-	movss [esp + nb430_Vvdwtot], xmm5
-	xorps  xmm4, xmm4
-
-	mulss xmm7, xmm0
-	subss  xmm4, xmm7
-	mov    edi, [ebp + nb430_faction]
-
-	movss xmm0, [esp + nb430_dx]
-	movss xmm1, [esp + nb430_dy]
-	movss xmm2, [esp + nb430_dz]
-
-	mulss  xmm0, xmm4
-	mulss  xmm1, xmm4
-	mulss  xmm2, xmm4
-	;# xmm0-xmm2 contains tx-tz (partial force) 
-	;# now update f_i 
-	movss xmm3, [esp + nb430_fix]
-	movss xmm4, [esp + nb430_fiy]
-	movss xmm5, [esp + nb430_fiz]
-	addss  xmm3, xmm0
-	addss  xmm4, xmm1
-	addss  xmm5, xmm2
-	movss [esp + nb430_fix], xmm3
-	movss [esp + nb430_fiy], xmm4
-	movss [esp + nb430_fiz], xmm5
-	;# update fj 
-	
-	movss   xmm3, [edi + eax*4]
-	movss   xmm4, [edi + eax*4 + 4]
-	movss   xmm5, [edi + eax*4 + 8]
-	subss   xmm3, xmm0
-	subss   xmm4, xmm1
-	subss   xmm5, xmm2	
-	movss   [edi + eax*4], xmm3
-	movss   [edi + eax*4 + 4], xmm4
-	movss   [edi + eax*4 + 8], xmm5	
-.nb430_updateouterdata:
-	mov   ecx, [esp + nb430_ii3]
-	mov   edi, [ebp + nb430_faction]
-	mov   esi, [ebp + nb430_fshift]
-	mov   edx, [esp + nb430_is3]
-
-	;# accumulate i forces in xmm0, xmm1, xmm2 
-	movaps xmm0, [esp + nb430_fix]
-	movaps xmm1, [esp + nb430_fiy]
-	movaps xmm2, [esp + nb430_fiz]
-
-	movhlps xmm3, xmm0
-	movhlps xmm4, xmm1
-	movhlps xmm5, xmm2
-	addps  xmm0, xmm3
-	addps  xmm1, xmm4
-	addps  xmm2, xmm5 ;# sum is in 1/2 in xmm0-xmm2 
-
-	movaps xmm3, xmm0	
-	movaps xmm4, xmm1	
-	movaps xmm5, xmm2	
-
-	shufps xmm3, xmm3, 1
-	shufps xmm4, xmm4, 1
-	shufps xmm5, xmm5, 1
-	addss  xmm0, xmm3
-	addss  xmm1, xmm4
-	addss  xmm2, xmm5	;# xmm0-xmm2 has single force in pos0 
-
-	;# increment i force 
-	movss  xmm3, [edi + ecx*4]
-	movss  xmm4, [edi + ecx*4 + 4]
-	movss  xmm5, [edi + ecx*4 + 8]
-	addss  xmm3, xmm0
-	addss  xmm4, xmm1
-	addss  xmm5, xmm2
-	movss  [edi + ecx*4],     xmm3
-	movss  [edi + ecx*4 + 4], xmm4
-	movss  [edi + ecx*4 + 8], xmm5
-
-	;# increment fshift force  
-	movss  xmm3, [esi + edx*4]
-	movss  xmm4, [esi + edx*4 + 4]
-	movss  xmm5, [esi + edx*4 + 8]
-	addss  xmm3, xmm0
-	addss  xmm4, xmm1
-	addss  xmm5, xmm2
-	movss  [esi + edx*4],     xmm3
-	movss  [esi + edx*4 + 4], xmm4
-	movss  [esi + edx*4 + 8], xmm5
-
-	;# get n from stack
-	mov esi, [esp + nb430_n]
-        ;# get group index for i particle 
-        mov   edx, [ebp + nb430_gid]      	;# base of gid[]
-        mov   edx, [edx + esi*4]		;# ggid=gid[n]
-
-	;# accumulate total potential energy and update it 
-	movaps xmm7, [esp + nb430_vctot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-
-	;# add earlier value from mem 
-	mov   eax, [ebp + nb430_Vc]
-	addss xmm7, [eax + edx*4] 
-	;# move back to mem 
-	movss [eax + edx*4], xmm7 
-	
-	;# accumulate total lj energy and update it 
-	movaps xmm7, [esp + nb430_Vvdwtot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-
-	;# add earlier value from mem 
-	mov   eax, [ebp + nb430_Vvdw]
-	addss xmm7, [eax + edx*4] 
-	;# move back to mem 
-	movss [eax + edx*4], xmm7 
-	
-	;# accumulate dVda and update it 
-	movaps xmm7, [esp + nb430_dvdasum]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-	
-	mov edx, [esp + nb430_ii]
-	mov eax, [ebp + nb430_dvda]
-	addss xmm7, [eax + edx*4]
-	movss [eax + edx*4], xmm7
-	
-        ;# finish if last 
-        mov ecx, [esp + nb430_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb430_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [esp + nb430_n], esi
-        jmp .nb430_outer
-.nb430_outerend:
-        ;# check if more outer neighborlists remain
-        mov   ecx, [esp + nb430_nri]
-	;# esi already loaded with n above
-        sub   ecx, esi
-        jz .nb430_end
-        ;# non-zero, do one more workunit
-        jmp   .nb430_threadloop
-.nb430_end:
-	emms
-
-	mov eax, [esp + nb430_nouter]
-	mov ebx, [esp + nb430_ninner]
-	mov ecx, [ebp + nb430_outeriter]
-	mov edx, [ebp + nb430_inneriter]
-	mov [ecx], eax
-	mov [edx], ebx
-
-	mov eax, [esp + nb430_salign]
-	add esp, eax
-	add esp, 488
-	pop edi
-	pop esi
-    	pop edx
-    	pop ecx
-    	pop ebx
-    	pop eax
-	leave
-	ret
-
-
-	
-
-
-
-
-.globl nb_kernel430nf_ia32_sse
-.globl _nb_kernel430nf_ia32_sse
-nb_kernel430nf_ia32_sse:	
-_nb_kernel430nf_ia32_sse:	
-.equiv          nb430nf_p_nri,          8
-.equiv          nb430nf_iinr,           12
-.equiv          nb430nf_jindex,         16
-.equiv          nb430nf_jjnr,           20
-.equiv          nb430nf_shift,          24
-.equiv          nb430nf_shiftvec,       28
-.equiv          nb430nf_fshift,         32
-.equiv          nb430nf_gid,            36
-.equiv          nb430nf_pos,            40
-.equiv          nb430nf_faction,        44
-.equiv          nb430nf_charge,         48
-.equiv          nb430nf_p_facel,        52
-.equiv          nb430nf_argkrf,         56
-.equiv          nb430nf_argcrf,         60
-.equiv          nb430nf_Vc,             64
-.equiv          nb430nf_type,           68
-.equiv          nb430nf_p_ntype,        72
-.equiv          nb430nf_vdwparam,       76
-.equiv          nb430nf_Vvdw,           80
-.equiv          nb430nf_p_tabscale,     84
-.equiv          nb430nf_VFtab,          88
-.equiv          nb430nf_invsqrta,       92
-.equiv          nb430nf_dvda,           96
-.equiv          nb430nf_p_gbtabscale,   100
-.equiv          nb430nf_GBtab,          104
-.equiv          nb430nf_p_nthreads,     108
-.equiv          nb430nf_count,          112
-.equiv          nb430nf_mtx,            116
-.equiv          nb430nf_outeriter,      120
-.equiv          nb430nf_inneriter,      124
-.equiv          nb430nf_work,           128
-	;# stack offsets for local variables  
-	;# bottom of stack is cache-aligned for sse use 
-.equiv          nb430nf_ix,             0
-.equiv          nb430nf_iy,             16
-.equiv          nb430nf_iz,             32
-.equiv          nb430nf_iq,             48
-.equiv          nb430nf_gbtsc,          64
-.equiv          nb430nf_tsc,            80
-.equiv          nb430nf_qq,             96
-.equiv          nb430nf_c6,             112
-.equiv          nb430nf_c12,            128
-.equiv          nb430nf_vctot,          144
-.equiv          nb430nf_Vvdwtot,        160
-.equiv          nb430nf_half,           176
-.equiv          nb430nf_three,          192
-.equiv          nb430nf_isai,           208
-.equiv          nb430nf_isaprod,        224
-.equiv          nb430nf_gbscale,        240
-.equiv          nb430nf_r,              256
-.equiv          nb430nf_is3,            272
-.equiv          nb430nf_ii3,            276
-.equiv          nb430nf_ntia,           280
-.equiv          nb430nf_innerjjnr,      284
-.equiv          nb430nf_innerk,         288
-.equiv          nb430nf_n,              292
-.equiv          nb430nf_nn1,            296
-.equiv          nb430nf_nri,            300
-.equiv          nb430nf_facel,          304
-.equiv          nb430nf_ntype,          308
-.equiv          nb430nf_nouter,         312
-.equiv          nb430nf_ninner,         316
-.equiv          nb430nf_salign,         320
-	push ebp
-	mov ebp,esp	
-    	push eax
-    	push ebx
-    	push ecx
-    	push edx
-	push esi
-	push edi
-	sub esp, 324		;# local stack space 
-	mov  eax, esp
-	and  eax, 0xf
-	sub esp, eax
-	mov [esp + nb430nf_salign], eax
-
-	emms
-
-	;# Move args passed by reference to stack
-	mov ecx, [ebp + nb430nf_p_nri]
-	mov esi, [ebp + nb430nf_p_facel]
-	mov edi, [ebp + nb430nf_p_ntype]
-	mov ecx, [ecx]
-	mov esi, [esi]
-	mov edi, [edi]
-	mov [esp + nb430nf_nri], ecx
-	mov [esp + nb430nf_facel], esi
-	mov [esp + nb430nf_ntype], edi
-
-	;# zero iteration counters
-	mov eax, 0
-	mov [esp + nb430nf_nouter], eax
-	mov [esp + nb430nf_ninner], eax
-
-
-	mov eax, [ebp + nb430nf_p_gbtabscale]
-	movss xmm3, [eax]
-	mov eax, [ebp + nb430nf_p_tabscale]
-	movss xmm4, [eax]
-	shufps xmm3, xmm3, 0
-	shufps xmm4, xmm4, 0
-	movaps [esp + nb430nf_gbtsc], xmm3
-	movaps [esp + nb430nf_tsc], xmm4
-
-	;# create constant floating-point factors on stack
-	mov eax, 0x3f000000     ;# constant 0.5 in IEEE (hex)
-	mov [esp + nb430nf_half], eax
-	movss xmm1, [esp + nb430nf_half]
-	shufps xmm1, xmm1, 0    ;# splat to all elements
-	movaps xmm2, xmm1       
-	addps  xmm2, xmm2	;# constant 1.0
-	movaps xmm3, xmm2
-	addps  xmm2, xmm2	;# constant 2.0
-	addps  xmm3, xmm2	;# constant 3.0
-	movaps [esp + nb430nf_half],  xmm1
-	movaps [esp + nb430nf_three],  xmm3
-
-.nb430nf_threadloop:
-        mov   esi, [ebp + nb430nf_count]          ;# pointer to sync counter
-        mov   eax, [esi]
-.nb430nf_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+10
-        lock
-        cmpxchg [esi], ebx                      ;# write nn1 to *counter,
-                                                ;# if it hasnt changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb430nf_spinlock
-
-        ;# if(nn1>nri) nn1=nri
-        mov ecx, [esp + nb430nf_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [esp + nb430nf_n], eax
-        mov [esp + nb430nf_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists
-	mov esi, eax				;# copy n to esi
-        jg  .nb430nf_outerstart
-        jmp .nb430nf_end
-
-.nb430nf_outerstart:
-	;# ebx contains number of outer iterations
-	add ebx, [esp + nb430nf_nouter]
-	mov [esp + nb430nf_nouter], ebx
-
-.nb430nf_outer:
-	mov   eax, [ebp + nb430nf_shift]      ;# eax = pointer into shift[] 
-	mov   ebx, [eax + esi*4]		;# ebx=shift[n] 
-	
-	lea   ebx, [ebx + ebx*2]    ;# ebx=3*is 
-	mov   [esp + nb430nf_is3],ebx    	;# store is3 
-
-	mov   eax, [ebp + nb430nf_shiftvec]   ;# eax = base of shiftvec[] 
-
-	movss xmm0, [eax + ebx*4]
-	movss xmm1, [eax + ebx*4 + 4]
-	movss xmm2, [eax + ebx*4 + 8] 
-
-	mov   ecx, [ebp + nb430nf_iinr]       ;# ecx = pointer into iinr[] 	
-	mov   ebx, [ecx + esi*4]	    ;# ebx =ii 
-
-	mov   edx, [ebp + nb430nf_charge]
-	movss xmm3, [edx + ebx*4]	
-	mulss xmm3, [esp + nb430nf_facel]
-	shufps xmm3, xmm3, 0
-
-	mov   edx, [ebp + nb430nf_invsqrta]	;# load invsqrta[ii]
-	movss xmm4, [edx + ebx*4]
-	shufps xmm4, xmm4, 0
-
-    	mov   edx, [ebp + nb430nf_type] 
-    	mov   edx, [edx + ebx*4]
-    	imul  edx, [esp + nb430nf_ntype]
-    	shl   edx, 1
-    	mov   [esp + nb430nf_ntia], edx
-		
-	lea   ebx, [ebx + ebx*2]	;# ebx = 3*ii=ii3 
-	mov   eax, [ebp + nb430nf_pos]    ;# eax = base of pos[]  
-
-	addss xmm0, [eax + ebx*4]
-	addss xmm1, [eax + ebx*4 + 4]
-	addss xmm2, [eax + ebx*4 + 8]
-
-	movaps [esp + nb430nf_iq], xmm3
-	movaps [esp + nb430nf_isai], xmm4
-	
-	shufps xmm0, xmm0, 0
-	shufps xmm1, xmm1, 0
-	shufps xmm2, xmm2, 0
-
-	movaps [esp + nb430nf_ix], xmm0
-	movaps [esp + nb430nf_iy], xmm1
-	movaps [esp + nb430nf_iz], xmm2
-
-	mov   [esp + nb430nf_ii3], ebx
-	
-	;# clear vctot 
-	xorps xmm4, xmm4
-	movaps [esp + nb430nf_vctot], xmm4
-	movaps [esp + nb430nf_Vvdwtot], xmm4
-	
-	mov   eax, [ebp + nb430nf_jindex]
-	mov   ecx, [eax + esi*4]	     ;# jindex[n] 
-	mov   edx, [eax + esi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   esi, [ebp + nb430nf_pos]
-	mov   edi, [ebp + nb430nf_faction]	
-	mov   eax, [ebp + nb430nf_jjnr]
-	shl   ecx, 2
-	add   eax, ecx
-	mov   [esp + nb430nf_innerjjnr], eax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  4
-	add   ecx, [esp + nb430nf_ninner]
-	mov   [esp + nb430nf_ninner], ecx
-	add   edx, 0
-	mov   [esp + nb430nf_innerk], edx    ;# number of innerloop atoms 
-	jge   .nb430nf_unroll_loop
-	jmp   .nb430nf_finish_inner
-.nb430nf_unroll_loop:	
-	;# quad-unroll innerloop here 
-	mov   edx, [esp + nb430nf_innerjjnr]     ;# pointer to jjnr[k] 
-	mov   eax, [edx]	
-	mov   ebx, [edx + 4]              
-	mov   ecx, [edx + 8]            
-	mov   edx, [edx + 12]         ;# eax-edx=jnr1-4 
-	add dword ptr [esp + nb430nf_innerjjnr],  16 ;# advance pointer (unrolled 4) 
-
-	;# load isa2
-	mov esi, [ebp + nb430nf_invsqrta]
-	movss xmm3, [esi + eax*4]
-	movss xmm4, [esi + ecx*4]
-	movss xmm6, [esi + ebx*4]
-	movss xmm7, [esi + edx*4]
-	movaps xmm2, [esp + nb430nf_isai]
-	shufps xmm3, xmm6, 0 
-	shufps xmm4, xmm7, 0 
-	shufps xmm3, xmm4, 136  ;# constant 10001000 ;# all charges in xmm3  
-	mulps  xmm2, xmm3
-		
-	movaps [esp + nb430nf_isaprod], xmm2
-	movaps xmm1, xmm2
-	mulps xmm1, [esp + nb430nf_gbtsc]
-	movaps [esp + nb430nf_gbscale], xmm1
-	
-	mov esi, [ebp + nb430nf_charge]    ;# base of charge[] 
-	
-	movss xmm3, [esi + eax*4]
-	movss xmm4, [esi + ecx*4]
-	movss xmm6, [esi + ebx*4]
-	movss xmm7, [esi + edx*4]
-
-	mulps xmm2, [esp + nb430nf_iq]
-	shufps xmm3, xmm6, 0 
-	shufps xmm4, xmm7, 0 
-	shufps xmm3, xmm4, 136  ;# constant 10001000 ;# all charges in xmm3  
-	mulps  xmm3, xmm2
-	movaps [esp + nb430nf_qq], xmm3	
-
-	movd  mm0, eax		;# use mmx registers as temp storage 
-	movd  mm1, ebx
-	movd  mm2, ecx
-	movd  mm3, edx
-	
-	mov esi, [ebp + nb430nf_type]
-	mov eax, [esi + eax*4]
-	mov ebx, [esi + ebx*4]
-	mov ecx, [esi + ecx*4]
-	mov edx, [esi + edx*4]
-	mov esi, [ebp + nb430nf_vdwparam]
-	shl eax, 1	
-	shl ebx, 1	
-	shl ecx, 1	
-	shl edx, 1	
-	mov edi, [esp + nb430nf_ntia]
-	add eax, edi
-	add ebx, edi
-	add ecx, edi
-	add edx, edi
-
-	movlps xmm6, [esi + eax*4]
-	movlps xmm7, [esi + ecx*4]
-	movhps xmm6, [esi + ebx*4]
-	movhps xmm7, [esi + edx*4]
-
-	movaps xmm4, xmm6
-	shufps xmm4, xmm7, 136  ;# constant 10001000
-	shufps xmm6, xmm7, 221  ;# constant 11011101
-	
-	movd  eax, mm0		
-	movd  ebx, mm1
-	movd  ecx, mm2
-	movd  edx, mm3
-
-	movaps [esp + nb430nf_c6], xmm4
-	movaps [esp + nb430nf_c12], xmm6
-	
-	mov esi, [ebp + nb430nf_pos]       ;# base of pos[] 
-
-	lea   eax, [eax + eax*2]     ;# replace jnr with j3 
-	lea   ebx, [ebx + ebx*2]	
-
-	lea   ecx, [ecx + ecx*2]     ;# replace jnr with j3 
-	lea   edx, [edx + edx*2]	
-
-	;# move four coordinates to xmm0-xmm2 	
-
-	movlps xmm4, [esi + eax*4]
-	movlps xmm5, [esi + ecx*4]
-	movss xmm2, [esi + eax*4 + 8]
-	movss xmm6, [esi + ecx*4 + 8]
-
-	movhps xmm4, [esi + ebx*4]
-	movhps xmm5, [esi + edx*4]
-
-	movss xmm0, [esi + ebx*4 + 8]
-	movss xmm1, [esi + edx*4 + 8]
-
-	shufps xmm2, xmm0, 0
-	shufps xmm6, xmm1, 0
-	
-	movaps xmm0, xmm4
-	movaps xmm1, xmm4
-
-	shufps xmm2, xmm6, 136  ;# constant 10001000
-	
-	shufps xmm0, xmm5, 136  ;# constant 10001000
-	shufps xmm1, xmm5, 221  ;# constant 11011101		
-
-	;# move ix-iz to xmm4-xmm6 
-	movaps xmm4, [esp + nb430nf_ix]
-	movaps xmm5, [esp + nb430nf_iy]
-	movaps xmm6, [esp + nb430nf_iz]
-
-	;# calc dr 
-	subps xmm4, xmm0
-	subps xmm5, xmm1
-	subps xmm6, xmm2
-
-	;# square it 
-	mulps xmm4,xmm4
-	mulps xmm5,xmm5
-	mulps xmm6,xmm6
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [esp + nb430nf_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [esp + nb430nf_half]
-	subps xmm1, xmm5	;# constant 30-rsq*lu*lu 
-	mulps xmm1, xmm2	
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r
-	movaps [esp + nb430nf_r], xmm4
-	mulps xmm4, [esp + nb430nf_gbscale]
-
-	movhlps xmm5, xmm4
-	cvttps2pi mm6, xmm4
-	cvttps2pi mm7, xmm5	;# mm6/mm7 contain lu indices 
-	cvtpi2ps xmm6, mm6
-	cvtpi2ps xmm5, mm7
-	movlhps xmm6, xmm5
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-	pslld mm6, 2
-	pslld mm7, 2
-
-	movd mm0, eax	
-	movd mm1, ebx
-	movd mm2, ecx
-	movd mm3, edx
-
-	mov  esi, [ebp + nb430nf_GBtab]
-	movd eax, mm6
-	psrlq mm6, 32
-	movd ecx, mm7
-	psrlq mm7, 32
-	movd ebx, mm6
-	movd edx, mm7
-		
-	;# load coulomb table
-	movaps xmm4, [esi + eax*4]
-	movaps xmm5, [esi + ebx*4]
-	movaps xmm6, [esi + ecx*4]
-	movaps xmm7, [esi + edx*4]
-	;# transpose, using xmm3 for scratch
-	movaps xmm3, xmm6
-	shufps xmm3, xmm7, 0xEE 
-	shufps xmm6, xmm7, 0x44
-	movaps xmm7, xmm4
-	shufps xmm7, xmm5, 0xEE
-	shufps xmm4, xmm5, 0x44
-	movaps xmm5, xmm4
-	shufps xmm5, xmm6, 0xDD
-	shufps xmm4, xmm6, 0x88
-	movaps xmm6, xmm7
-	shufps xmm6, xmm3, 0x88
-	shufps xmm7, xmm3, 0xDD
-	;# coulomb table ready, in xmm4-xmm7  		
-	
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	movaps xmm3, [esp + nb430nf_qq]
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, xmm3 ;# vcoul=qq*VV  
-	addps  xmm5, [esp + nb430nf_vctot]
-	movaps [esp + nb430nf_vctot], xmm5
-
-	
-	movaps xmm4, [esp + nb430nf_r]
-	mulps xmm4, [esp + nb430nf_tsc]
-	
-	movhlps xmm5, xmm4
-	cvttps2pi mm6, xmm4
-	cvttps2pi mm7, xmm5	;# mm6/mm7 contain lu indices 
-	cvtpi2ps xmm6, mm6
-	cvtpi2ps xmm5, mm7
-	movlhps xmm6, xmm5
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-	pslld mm6, 3
-	pslld mm7, 3
-	
-	mov  esi, [ebp + nb430nf_VFtab]
-	movd eax, mm6
-	psrlq mm6, 32
-	movd ecx, mm7
-	psrlq mm7, 32
-	movd ebx, mm6
-	movd edx, mm7
-		
-	;# dispersion 
-	movaps xmm4, [esi + eax*4]
-	movaps xmm5, [esi + ebx*4]
-	movaps xmm6, [esi + ecx*4]
-	movaps xmm7, [esi + edx*4]
-	;# transpose, using xmm3 for scratch
-	movaps xmm3, xmm6
-	shufps xmm3, xmm7, 0xEE 
-	shufps xmm6, xmm7, 0x44
-	movaps xmm7, xmm4
-	shufps xmm7, xmm5, 0xEE
-	shufps xmm4, xmm5, 0x44
-	movaps xmm5, xmm4
-	shufps xmm5, xmm6, 0xDD
-	shufps xmm4, xmm6, 0x88
-	movaps xmm6, xmm7
-	shufps xmm6, xmm3, 0x88
-	shufps xmm7, xmm3, 0xDD
-	;# dispersion table ready, in xmm4-xmm7 	
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, [esp + nb430nf_c6]	 ;# Vvdw6
-	addps  xmm5, [esp + nb430nf_Vvdwtot]
-	movaps [esp + nb430nf_Vvdwtot], xmm5
-
-	;# repulsion 
-	movaps xmm4, [esi + eax*4 + 16]
-	movaps xmm5, [esi + ebx*4 + 16]
-	movaps xmm6, [esi + ecx*4 + 16]
-	movaps xmm7, [esi + edx*4 + 16]
-	;# transpose, using xmm3 for scratch
-	movaps xmm3, xmm6
-	shufps xmm3, xmm7, 0xEE 
-	shufps xmm6, xmm7, 0x44
-	movaps xmm7, xmm4
-	shufps xmm7, xmm5, 0xEE
-	shufps xmm4, xmm5, 0x44
-	movaps xmm5, xmm4
-	shufps xmm5, xmm6, 0xDD
-	shufps xmm4, xmm6, 0x88
-	movaps xmm6, xmm7
-	shufps xmm6, xmm3, 0x88
-	shufps xmm7, xmm3, 0xDD
-	;# table ready, in xmm4-xmm7 	
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
- 	
-	mulps  xmm5, [esp + nb430nf_c12] ;# Vvdw12
-	addps  xmm5, [esp + nb430nf_Vvdwtot]
-	movaps [esp + nb430nf_Vvdwtot], xmm5
-		
-	;# should we do one more iteration? 
-	sub dword ptr [esp + nb430nf_innerk],  4
-	jl    .nb430nf_finish_inner
-	jmp   .nb430nf_unroll_loop
-.nb430nf_finish_inner:
-	;# check if at least two particles remain 
-	add dword ptr [esp + nb430nf_innerk],  4
-	mov   edx, [esp + nb430nf_innerk]
-	and   edx, 2
-	jnz   .nb430nf_dopair
-	jmp   .nb430nf_checksingle
-.nb430nf_dopair:	
-
-	mov   ecx, [esp + nb430nf_innerjjnr]
-	
-	mov   eax, [ecx]	
-	mov   ebx, [ecx + 4]              
-	add dword ptr [esp + nb430nf_innerjjnr],  8	
-
-	xorps xmm2, xmm2
-	movaps xmm6, xmm2
-	
-	;# load isa2
-	mov esi, [ebp + nb430nf_invsqrta]
-	movss xmm2, [esi + eax*4]
-	movss xmm3, [esi + ebx*4]
-	unpcklps xmm2, xmm3	;# isa2 in xmm3(0,1)
-	mulps  xmm2, [esp + nb430nf_isai]
-	movaps [esp + nb430nf_isaprod], xmm2	
-	movaps xmm1, xmm2
-	mulps xmm1, [esp + nb430nf_gbtsc]
-	movaps [esp + nb430nf_gbscale], xmm1	
-	
-	mov esi, [ebp + nb430nf_charge]    ;# base of charge[] 	
-	movss xmm3, [esi + eax*4]		
-	movss xmm6, [esi + ebx*4]
-	unpcklps xmm3, xmm6 ;# constant 00001000 ;# xmm3(0,1) has the charges 
-
-	mulps  xmm2, [esp + nb430nf_iq]
-	mulps  xmm3, xmm2
-	movaps [esp + nb430nf_qq], xmm3
-
-	mov esi, [ebp + nb430nf_type]
-	mov   ecx, eax
-	mov   edx, ebx
-	mov ecx, [esi + ecx*4]
-	mov edx, [esi + edx*4]	
-	mov esi, [ebp + nb430nf_vdwparam]
-	shl ecx, 1	
-	shl edx, 1	
-	mov edi, [esp + nb430nf_ntia]
-	add ecx, edi
-	add edx, edi
-	movlps xmm6, [esi + ecx*4]
-	movhps xmm6, [esi + edx*4]
-	mov edi, [ebp + nb430nf_pos]	
-	
-	movaps xmm4, xmm6
-	shufps xmm4, xmm4, 8 ;# constant 00001000 	
-	shufps xmm6, xmm6, 13 ;# constant 00001101
-	movlhps xmm4, xmm7
-	movlhps xmm6, xmm7
-	
-	movaps [esp + nb430nf_c6], xmm4
-	movaps [esp + nb430nf_c12], xmm6	
-			
-	lea   eax, [eax + eax*2]
-	lea   ebx, [ebx + ebx*2]
-	;# move coordinates to xmm0-xmm2 
-	movlps xmm1, [edi + eax*4]
-	movss xmm2, [edi + eax*4 + 8]	
-	movhps xmm1, [edi + ebx*4]
-	movss xmm0, [edi + ebx*4 + 8]	
-
-	movlhps xmm3, xmm7
-	
-	shufps xmm2, xmm0, 0
-	
-	movaps xmm0, xmm1
-
-	shufps xmm2, xmm2, 136  ;# constant 10001000
-	
-	shufps xmm0, xmm0, 136  ;# constant 10001000
-	shufps xmm1, xmm1, 221  ;# constant 11011101
-			
-	mov    edi, [ebp + nb430nf_faction]
-	;# move ix-iz to xmm4-xmm6 
-	xorps   xmm7, xmm7
-	
-	movaps xmm4, [esp + nb430nf_ix]
-	movaps xmm5, [esp + nb430nf_iy]
-	movaps xmm6, [esp + nb430nf_iz]
-
-	;# calc dr 
-	subps xmm4, xmm0
-	subps xmm5, xmm1
-	subps xmm6, xmm2
-
-	;# square it 
-	mulps xmm4,xmm4
-	mulps xmm5,xmm5
-	mulps xmm6,xmm6
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [esp + nb430nf_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [esp + nb430nf_half]
-	subps xmm1, xmm5	;# constant 30-rsq*lu*lu 
-	mulps xmm1, xmm2	
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r 
-	movaps [esp + nb430nf_r], xmm4
-	mulps xmm4, [esp + nb430nf_gbscale]
-
-	cvttps2pi mm6, xmm4     ;# mm6 contain lu indices 
-	cvtpi2ps xmm6, mm6
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-
-	pslld mm6, 2
-
-	mov  esi, [ebp + nb430nf_GBtab]
-	movd ecx, mm6
-	psrlq mm6, 32
-	movd edx, mm6
-
-	;# load coulomb table
-	movaps xmm4, [esi + ecx*4]
-	movaps xmm7, [esi + edx*4]
-	;# transpose, using xmm3 for scratch
-	movaps xmm6, xmm4
-	unpcklps xmm4, xmm7  	;# Y1 Y2 F1 F2 
-	unpckhps xmm6, xmm7     ;# G1 G2 H1 H2
-	movhlps  xmm5, xmm4    	;# F1 F2 
-	movhlps  xmm7, xmm6     ;# H1 H2
-	;# coulomb table ready, in xmm4-xmm7  	
-
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	movaps xmm3, [esp + nb430nf_qq]
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, xmm3 ;# vcoul=qq*VV  
-	addps  xmm5, [esp + nb430nf_vctot]
-	movaps [esp + nb430nf_vctot], xmm5 
-
-	movaps xmm4, [esp + nb430nf_r]
-	mulps xmm4, [esp + nb430nf_tsc]
-	
-	cvttps2pi mm6, xmm4
-	cvtpi2ps xmm6, mm6
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-	pslld mm6, 3
-	
-	mov  esi, [ebp + nb430nf_VFtab]
-	movd ecx, mm6
-	psrlq mm6, 32
-	movd edx, mm6
-			
-	;# dispersion 
-	movaps xmm4, [esi + ecx*4]
-	movaps xmm7, [esi + edx*4]
-	;# transpose, using xmm3 for scratch
-	movaps xmm6, xmm4
-	unpcklps xmm4, xmm7  	;# Y1 Y2 F1 F2 
-	unpckhps xmm6, xmm7     ;# G1 G2 H1 H2
-	movhlps  xmm5, xmm4    	;# F1 F2 
-	movhlps  xmm7, xmm6     ;# H1 H2
-	;# dispersion table ready, in xmm4-xmm7 	
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-
-	mulps  xmm5, [esp + nb430nf_c6]	 ;# Vvdw6 
-	addps  xmm5, [esp + nb430nf_Vvdwtot]
-	movaps [esp + nb430nf_Vvdwtot], xmm5
-
-	;# repulsion 
-	movaps xmm4, [esi + ecx*4 + 16]
-	movaps xmm7, [esi + edx*4 + 16]
-	;# transpose, using xmm3 for scratch
-	movaps xmm6, xmm4
-	unpcklps xmm4, xmm7  	;# Y1 Y2 F1 F2 
-	unpckhps xmm6, xmm7     ;# G1 G2 H1 H2
-	movhlps  xmm5, xmm4    	;# F1 F2 
-	movhlps  xmm7, xmm6     ;# H1 H2
-	;# table ready, in xmm4-xmm7 	
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
- 	
-	mulps  xmm5, [esp + nb430nf_c12] ;# Vvdw12 
-	
-	addps  xmm5, [esp + nb430nf_Vvdwtot]
-	movaps [esp + nb430nf_Vvdwtot], xmm5
-.nb430nf_checksingle:				
-	mov   edx, [esp + nb430nf_innerk]
-	and   edx, 1
-	jnz    .nb430nf_dosingle
-	jmp    .nb430nf_updateouterdata
-.nb430nf_dosingle:
-	mov esi, [ebp + nb430nf_charge]
-	mov edx, [ebp + nb430nf_invsqrta]
-	mov edi, [ebp + nb430nf_pos]
-	mov   ecx, [esp + nb430nf_innerjjnr]
-	mov   eax, [ecx]	
-	xorps  xmm2, xmm2
-	movaps xmm6, xmm2
-	movss xmm2, [edx + eax*4]	;# isa2
-	mulss xmm2, [esp + nb430nf_isai]
-	movss [esp + nb430nf_isaprod], xmm2	
-	movss xmm1, xmm2
-	mulss xmm1, [esp + nb430nf_gbtsc]
-	movss [esp + nb430nf_gbscale], xmm1	
-	
-	mulss  xmm2, [esp + nb430nf_iq]
-	movss xmm6, [esi + eax*4]	;# xmm6(0) has the charge 	
-	mulss  xmm6, xmm2
-	movss [esp + nb430nf_qq], xmm6
-		
-	mov esi, [ebp + nb430nf_type]
-	mov ecx, eax
-	mov ecx, [esi + ecx*4]	
-	mov esi, [ebp + nb430nf_vdwparam]
-	shl ecx, 1
-	add ecx, [esp + nb430nf_ntia]
-	movlps xmm6, [esi + ecx*4]
-	movaps xmm4, xmm6
-	shufps xmm4, xmm4, 252  ;# constant 11111100	
-	shufps xmm6, xmm6, 253  ;# constant 11111101	
-			
-	movss [esp + nb430nf_c6], xmm4
-	movss [esp + nb430nf_c12], xmm6	
-		
-	lea   eax, [eax + eax*2]
-	
-	;# move coordinates to xmm0-xmm2 
-	movss xmm0, [edi + eax*4]	
-	movss xmm1, [edi + eax*4 + 4]	
-	movss xmm2, [edi + eax*4 + 8]	 
-	
-	movss xmm4, [esp + nb430nf_ix]
-	movss xmm5, [esp + nb430nf_iy]
-	movss xmm6, [esp + nb430nf_iz]
-
-	;# calc dr 
-	subss xmm4, xmm0
-	subss xmm5, xmm1
-	subss xmm6, xmm2
-
-	;# square it 
-	mulss xmm4,xmm4
-	mulss xmm5,xmm5
-	mulss xmm6,xmm6
-	addss xmm4, xmm5
-	addss xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtss xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulss xmm5, xmm5
-	movss xmm1, [esp + nb430nf_three]
-	mulss xmm5, xmm4	;# rsq*lu*lu 			
-	movss xmm0, [esp + nb430nf_half]
-	subss xmm1, xmm5	;# constant 30-rsq*lu*lu 
-	mulss xmm1, xmm2	
-	mulss xmm0, xmm1	;# xmm0=rinv 
-
-	mulss xmm4, xmm0	;# xmm4=r 
-	movaps [esp + nb430nf_r], xmm4
-	mulss xmm4, [esp + nb430nf_gbscale]
-
-	cvttss2si ebx, xmm4     ;# mm6 contain lu indices 
-	cvtsi2ss xmm6, ebx
-	subss xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulss  xmm2, xmm2	;# xmm2=eps2 
-
-	shl ebx, 2
-
-	mov  esi, [ebp + nb430nf_GBtab]
-						
-	movaps xmm4, [esi + ebx*4]	
-	movhlps xmm6, xmm4
-	movaps xmm5, xmm4
-	movaps xmm7, xmm6
-	shufps xmm5, xmm5, 1
-	shufps xmm7, xmm7, 1
-	;# table ready in xmm4-xmm7 
-
-	mulss  xmm6, xmm1	;# xmm6=Geps 
-	mulss  xmm7, xmm2	;# xmm7=Heps2 
-	addss  xmm5, xmm6
-	addss  xmm5, xmm7	;# xmm5=Fp 	
-	movss xmm3, [esp + nb430nf_qq]
-	mulss  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addss  xmm5, xmm4 ;# xmm5=VV 
-	mulss  xmm5, xmm3 ;# vcoul=qq*VV  
-	addss  xmm5, [esp + nb430nf_vctot]
-	movss [esp + nb430nf_vctot], xmm5
-	
-	movss xmm4, [esp + nb430nf_r]
-	mulps xmm4, [esp + nb430nf_tsc]
-	
-	cvttss2si ebx, xmm4
-	cvtsi2ss xmm6, ebx
-	subss xmm4, xmm6	
-	movss xmm1, xmm4	;# xmm1=eps 
-	movss xmm2, xmm1	
-	mulss  xmm2, xmm2	;# xmm2=eps2 
-
-	shl ebx, 3
-	mov  esi, [ebp + nb430nf_VFtab]
-			
-	;# dispersion 
-	movaps xmm4, [esi + ebx*4]	
-	movhlps xmm6, xmm4
-	movaps xmm5, xmm4
-	movaps xmm7, xmm6
-	shufps xmm5, xmm5, 1
-	shufps xmm7, xmm7, 1
-	;# table ready in xmm4-xmm7 
-	
-	mulss  xmm6, xmm1	;# xmm6=Geps 
-	mulss  xmm7, xmm2	;# xmm7=Heps2 
-	addss  xmm5, xmm6
-	addss  xmm5, xmm7	;# xmm5=Fp 	
-	mulss  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addss  xmm5, xmm4 ;# xmm5=VV 
-	mulss  xmm5, [esp + nb430nf_c6]	 ;# Vvdw6
-	addss  xmm5, [esp + nb430nf_Vvdwtot]
-	movss [esp + nb430nf_Vvdwtot], xmm5
-
-	;# repulsion 
-	movaps xmm4, [esi + ebx*4 + 16]	
-	movhlps xmm6, xmm4
-	movaps xmm5, xmm4
-	movaps xmm7, xmm6
-	shufps xmm5, xmm5, 1
-	shufps xmm7, xmm7, 1
-	;# table ready in xmm4-xmm7 
-	
-	mulss  xmm6, xmm1	;# xmm6=Geps 
-	mulss  xmm7, xmm2	;# xmm7=Heps2 
-	addss  xmm5, xmm6
-	addss  xmm5, xmm7	;# xmm5=Fp 	
-	mulss  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addss  xmm5, xmm4 ;# xmm5=VV 
- 	
-	mulss  xmm5, [esp + nb430nf_c12] ;# Vvdw12 
-	
-	addss  xmm5, [esp + nb430nf_Vvdwtot]
-	movss [esp + nb430nf_Vvdwtot], xmm5
-
-.nb430nf_updateouterdata:
-	;# get n from stack
-	mov esi, [esp + nb430nf_n]
-        ;# get group index for i particle 
-        mov   edx, [ebp + nb430nf_gid]      	;# base of gid[]
-        mov   edx, [edx + esi*4]		;# ggid=gid[n]
-
-	;# accumulate total potential energy and update it 
-	movaps xmm7, [esp + nb430nf_vctot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-
-	;# add earlier value from mem 
-	mov   eax, [ebp + nb430nf_Vc]
-	addss xmm7, [eax + edx*4] 
-	;# move back to mem 
-	movss [eax + edx*4], xmm7 
-	
-	;# accumulate total lj energy and update it 
-	movaps xmm7, [esp + nb430nf_Vvdwtot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-
-	;# add earlier value from mem 
-	mov   eax, [ebp + nb430nf_Vvdw]
-	addss xmm7, [eax + edx*4] 
-	;# move back to mem 
-	movss [eax + edx*4], xmm7 
-	
-        ;# finish if last 
-        mov ecx, [esp + nb430nf_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb430nf_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [esp + nb430nf_n], esi
-        jmp .nb430nf_outer
-.nb430nf_outerend:
-        ;# check if more outer neighborlists remain
-        mov   ecx, [esp + nb430nf_nri]
-	;# esi already loaded with n above
-        sub   ecx, esi
-        jz .nb430nf_end
-        ;# non-zero, do one more workunit
-        jmp   .nb430nf_threadloop
-.nb430nf_end:
-	emms
-
-	mov eax, [esp + nb430nf_nouter]
-	mov ebx, [esp + nb430nf_ninner]
-	mov ecx, [ebp + nb430nf_outeriter]
-	mov edx, [ebp + nb430nf_inneriter]
-	mov [ecx], eax
-	mov [edx], ebx
-
-	mov eax, [esp + nb430nf_salign]
-	add esp, eax
-	add esp, 324
-	pop edi
-	pop esi
-    	pop edx
-    	pop ecx
-    	pop ebx
-    	pop eax
-	leave
-	ret
-
-
-	
-
diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel430_ia32_sse.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel430_ia32_sse.s
deleted file mode 100644
index 477f512d8c..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel430_ia32_sse.s
+++ /dev/null
@@ -1,2382 +0,0 @@
-##
-##
-## Gromacs 4.0                         Copyright (c) 1991-2003 
-## David van der Spoel, Erik Lindahl
-##
-## This program is free software; you can redistribute it and/or
-## modify it under the terms of the GNU General Public License
-## as published by the Free Software Foundation; either version 2
-## of the License, or (at your option) any later version.
-##
-## To help us fund GROMACS development, we humbly ask that you cite
-## the research papers on the package. Check out http://www.gromacs.org
-## 
-## And Hey:
-## Gnomes, ROck Monsters And Chili Sauce
-##
-
-
-
-
-.globl nb_kernel430_ia32_sse
-.globl _nb_kernel430_ia32_sse
-nb_kernel430_ia32_sse:  
-_nb_kernel430_ia32_sse: 
-.set nb430_p_nri, 8
-.set nb430_iinr, 12
-.set nb430_jindex, 16
-.set nb430_jjnr, 20
-.set nb430_shift, 24
-.set nb430_shiftvec, 28
-.set nb430_fshift, 32
-.set nb430_gid, 36
-.set nb430_pos, 40
-.set nb430_faction, 44
-.set nb430_charge, 48
-.set nb430_p_facel, 52
-.set nb430_argkrf, 56
-.set nb430_argcrf, 60
-.set nb430_Vc, 64
-.set nb430_type, 68
-.set nb430_p_ntype, 72
-.set nb430_vdwparam, 76
-.set nb430_Vvdw, 80
-.set nb430_p_tabscale, 84
-.set nb430_VFtab, 88
-.set nb430_invsqrta, 92
-.set nb430_dvda, 96
-.set nb430_p_gbtabscale, 100
-.set nb430_GBtab, 104
-.set nb430_p_nthreads, 108
-.set nb430_count, 112
-.set nb430_mtx, 116
-.set nb430_outeriter, 120
-.set nb430_inneriter, 124
-.set nb430_work, 128
-        ## stack offsets for local variables  
-        ## bottom of stack is cache-aligned for sse use 
-.set nb430_ix, 0
-.set nb430_iy, 16
-.set nb430_iz, 32
-.set nb430_iq, 48
-.set nb430_dx, 64
-.set nb430_dy, 80
-.set nb430_dz, 96
-.set nb430_two, 112
-.set nb430_gbtsc, 128
-.set nb430_tsc, 144
-.set nb430_qq, 160
-.set nb430_c6, 176
-.set nb430_c12, 192
-.set nb430_fscal, 208
-.set nb430_vctot, 224
-.set nb430_Vvdwtot, 240
-.set nb430_fix, 256
-.set nb430_fiy, 272
-.set nb430_fiz, 288
-.set nb430_half, 304
-.set nb430_three, 320
-.set nb430_r, 336
-.set nb430_isai, 352
-.set nb430_isaprod, 368
-.set nb430_dvdasum, 384
-.set nb430_gbscale, 400
-.set nb430_ii, 416
-.set nb430_is3, 420
-.set nb430_ii3, 424
-.set nb430_ntia, 428
-.set nb430_innerjjnr, 432
-.set nb430_innerk, 436
-.set nb430_n, 440
-.set nb430_nn1, 444
-.set nb430_jnra, 448
-.set nb430_jnrb, 452
-.set nb430_jnrc, 456
-.set nb430_jnrd, 460
-.set nb430_nri, 464
-.set nb430_facel, 468
-.set nb430_ntype, 472
-.set nb430_nouter, 476
-.set nb430_ninner, 480
-.set nb430_salign, 484
-        pushl %ebp
-        movl %esp,%ebp
-        pushl %eax
-        pushl %ebx
-        pushl %ecx
-        pushl %edx
-        pushl %esi
-        pushl %edi
-        subl $488,%esp          ## local stack space 
-        movl %esp,%eax
-        andl $0xf,%eax
-        subl %eax,%esp
-        movl %eax,nb430_salign(%esp)
-
-        emms
-
-        ## Move args passed by reference to stack
-        movl nb430_p_nri(%ebp),%ecx
-        movl nb430_p_facel(%ebp),%esi
-        movl nb430_p_ntype(%ebp),%edi
-        movl (%ecx),%ecx
-        movl (%esi),%esi
-        movl (%edi),%edi
-        movl %ecx,nb430_nri(%esp)
-        movl %esi,nb430_facel(%esp)
-        movl %edi,nb430_ntype(%esp)
-
-        ## zero iteration counters
-        movl $0,%eax
-        movl %eax,nb430_nouter(%esp)
-        movl %eax,nb430_ninner(%esp)
-
-
-        movl nb430_p_gbtabscale(%ebp),%eax
-        movss (%eax),%xmm3
-        movl nb430_p_tabscale(%ebp),%eax
-        movss (%eax),%xmm4
-        shufps $0,%xmm3,%xmm3
-        shufps $0,%xmm4,%xmm4
-        movaps %xmm3,nb430_gbtsc(%esp)
-        movaps %xmm4,nb430_tsc(%esp)
-
-        ## create constant floating-point factors on stack
-        movl $0x3f000000,%eax   ## constant 0.5 in IEEE (hex)
-        movl %eax,nb430_half(%esp)
-        movss nb430_half(%esp),%xmm1
-        shufps $0,%xmm1,%xmm1  ## splat to all elements
-        movaps %xmm1,%xmm2
-        addps  %xmm2,%xmm2      ## constant 1.0
-        movaps %xmm2,%xmm3
-        addps  %xmm2,%xmm2      ## constant 2.0
-        addps  %xmm2,%xmm3      ## constant 3.0
-        movaps %xmm1,nb430_half(%esp)
-        movaps %xmm2,nb430_two(%esp)
-        movaps %xmm3,nb430_three(%esp)
-
-_nb_kernel430_ia32_sse.nb430_threadloop: 
-        movl  nb430_count(%ebp),%esi            ## pointer to sync counter
-        movl  (%esi),%eax
-_nb_kernel430_ia32_sse.nb430_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+10
-        lock 
-        cmpxchgl %ebx,(%esi)                    ## write nn1 to *counter,
-                                                ## if it hasnt changed.
-                                                ## or reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel430_ia32_sse.nb430_spinlock
-
-        ## if(nn1>nri) nn1=nri
-        movl nb430_nri(%esp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx
-        cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb430_n(%esp)
-        movl %ebx,nb430_nn1(%esp)
-        subl %eax,%ebx                          ## calc number of outer lists
-        movl %eax,%esi                          ## copy n to esi
-        jg  _nb_kernel430_ia32_sse.nb430_outerstart
-        jmp _nb_kernel430_ia32_sse.nb430_end
-
-_nb_kernel430_ia32_sse.nb430_outerstart: 
-        ## ebx contains number of outer iterations
-        addl nb430_nouter(%esp),%ebx
-        movl %ebx,nb430_nouter(%esp)
-
-_nb_kernel430_ia32_sse.nb430_outer: 
-        movl  nb430_shift(%ebp),%eax        ## eax = pointer into shift[] 
-        movl  (%eax,%esi,4),%ebx                ## ebx=shift[n] 
-
-        leal  (%ebx,%ebx,2),%ebx    ## ebx=3*is 
-        movl  %ebx,nb430_is3(%esp)      ## store is3 
-
-        movl  nb430_shiftvec(%ebp),%eax     ## eax = base of shiftvec[] 
-
-        movss (%eax,%ebx,4),%xmm0
-        movss 4(%eax,%ebx,4),%xmm1
-        movss 8(%eax,%ebx,4),%xmm2
-
-        movl  nb430_iinr(%ebp),%ecx         ## ecx = pointer into iinr[]
-        movl  (%ecx,%esi,4),%ebx            ## ebx =ii 
-        movl  %ebx,nb430_ii(%esp)
-
-        movl  nb430_charge(%ebp),%edx
-        movss (%edx,%ebx,4),%xmm3
-        mulss nb430_facel(%esp),%xmm3
-        shufps $0,%xmm3,%xmm3
-
-        movl  nb430_invsqrta(%ebp),%edx         ## load invsqrta[ii]
-        movss (%edx,%ebx,4),%xmm4
-        shufps $0,%xmm4,%xmm4
-
-        movl  nb430_type(%ebp),%edx
-        movl  (%edx,%ebx,4),%edx
-        imull nb430_ntype(%esp),%edx
-        shll  %edx
-        movl  %edx,nb430_ntia(%esp)
-
-        leal  (%ebx,%ebx,2),%ebx        ## ebx = 3*ii=ii3 
-        movl  nb430_pos(%ebp),%eax      ## eax = base of pos[]  
-
-        addss (%eax,%ebx,4),%xmm0
-        addss 4(%eax,%ebx,4),%xmm1
-        addss 8(%eax,%ebx,4),%xmm2
-
-        movaps %xmm3,nb430_iq(%esp)
-        movaps %xmm4,nb430_isai(%esp)
-
-        shufps $0,%xmm0,%xmm0
-        shufps $0,%xmm1,%xmm1
-        shufps $0,%xmm2,%xmm2
-
-        movaps %xmm0,nb430_ix(%esp)
-        movaps %xmm1,nb430_iy(%esp)
-        movaps %xmm2,nb430_iz(%esp)
-
-        movl  %ebx,nb430_ii3(%esp)
-
-        ## clear vctot and i forces 
-        xorps %xmm4,%xmm4
-        movaps %xmm4,nb430_vctot(%esp)
-        movaps %xmm4,nb430_Vvdwtot(%esp)
-        movaps %xmm4,nb430_dvdasum(%esp)
-        movaps %xmm4,nb430_fix(%esp)
-        movaps %xmm4,nb430_fiy(%esp)
-        movaps %xmm4,nb430_fiz(%esp)
-
-        movl  nb430_jindex(%ebp),%eax
-        movl  (%eax,%esi,4),%ecx             ## jindex[n] 
-        movl  4(%eax,%esi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movl  nb430_pos(%ebp),%esi
-        movl  nb430_faction(%ebp),%edi
-        movl  nb430_jjnr(%ebp),%eax
-        shll  $2,%ecx
-        addl  %ecx,%eax
-        movl  %eax,nb430_innerjjnr(%esp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $4,%edx
-        addl  nb430_ninner(%esp),%ecx
-        movl  %ecx,nb430_ninner(%esp)
-        addl  $0,%edx
-        movl  %edx,nb430_innerk(%esp)      ## number of innerloop atoms
-
-        jge   _nb_kernel430_ia32_sse.nb430_unroll_loop
-        jmp   _nb_kernel430_ia32_sse.nb430_finish_inner
-_nb_kernel430_ia32_sse.nb430_unroll_loop: 
-        ## quad-unroll innerloop here 
-        movl  nb430_innerjjnr(%esp),%edx       ## pointer to jjnr[k] 
-        movl  (%edx),%eax
-        movl  4(%edx),%ebx
-        movl  8(%edx),%ecx
-        movl  12(%edx),%edx           ## eax-edx=jnr1-4 
-        addl $16,nb430_innerjjnr(%esp)             ## advance pointer (unrolled 4) 
-
-        ## load isaj
-        movl nb430_invsqrta(%ebp),%esi
-        movss (%esi,%eax,4),%xmm3
-        movss (%esi,%ecx,4),%xmm4
-        movss (%esi,%ebx,4),%xmm6
-        movss (%esi,%edx,4),%xmm7
-        movaps nb430_isai(%esp),%xmm2
-        shufps $0,%xmm6,%xmm3
-        shufps $0,%xmm7,%xmm4
-        shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all isaj in xmm3  
-        mulps  %xmm3,%xmm2
-
-        movaps %xmm2,nb430_isaprod(%esp)
-        movaps %xmm2,%xmm1
-        mulps nb430_gbtsc(%esp),%xmm1
-        movaps %xmm1,nb430_gbscale(%esp)
-
-        movl nb430_charge(%ebp),%esi     ## base of charge[] 
-
-        movss (%esi,%eax,4),%xmm3
-        movss (%esi,%ecx,4),%xmm4
-        movss (%esi,%ebx,4),%xmm6
-        movss (%esi,%edx,4),%xmm7
-
-        mulps nb430_iq(%esp),%xmm2
-        shufps $0,%xmm6,%xmm3
-        shufps $0,%xmm7,%xmm4
-        shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all charges in xmm3  
-        mulps  %xmm2,%xmm3
-        movaps %xmm3,nb430_qq(%esp)
-
-        movd  %eax,%mm0         ## use mmx registers as temp storage 
-        movd  %ebx,%mm1
-        movd  %ecx,%mm2
-        movd  %edx,%mm3
-
-        movl nb430_type(%ebp),%esi
-        movl (%esi,%eax,4),%eax
-        movl (%esi,%ebx,4),%ebx
-        movl (%esi,%ecx,4),%ecx
-        movl (%esi,%edx,4),%edx
-        movl nb430_vdwparam(%ebp),%esi
-        shll %eax
-        shll %ebx
-        shll %ecx
-        shll %edx
-        movl nb430_ntia(%esp),%edi
-        addl %edi,%eax
-        addl %edi,%ebx
-        addl %edi,%ecx
-        addl %edi,%edx
-
-        movlps (%esi,%eax,4),%xmm6
-        movlps (%esi,%ecx,4),%xmm7
-        movhps (%esi,%ebx,4),%xmm6
-        movhps (%esi,%edx,4),%xmm7
-
-        movaps %xmm6,%xmm4
-        shufps $136,%xmm7,%xmm4 ## constant 10001000
-        shufps $221,%xmm7,%xmm6 ## constant 11011101
-
-        movd  %mm0,%eax
-        movd  %mm1,%ebx
-        movd  %mm2,%ecx
-        movd  %mm3,%edx
-
-        movaps %xmm4,nb430_c6(%esp)
-        movaps %xmm6,nb430_c12(%esp)
-
-        movl nb430_pos(%ebp),%esi        ## base of pos[] 
-
-        movl %eax,nb430_jnra(%esp)
-        movl %ebx,nb430_jnrb(%esp)
-        movl %ecx,nb430_jnrc(%esp)
-        movl %edx,nb430_jnrd(%esp)
-
-        leal  (%eax,%eax,2),%eax     ## replace jnr with j3 
-        leal  (%ebx,%ebx,2),%ebx
-        leal  (%ecx,%ecx,2),%ecx
-        leal  (%edx,%edx,2),%edx
-
-        ## move four coordinates to xmm0-xmm2   
-
-        movlps (%esi,%eax,4),%xmm4
-        movlps (%esi,%ecx,4),%xmm5
-        movss 8(%esi,%eax,4),%xmm2
-        movss 8(%esi,%ecx,4),%xmm6
-
-        movhps (%esi,%ebx,4),%xmm4
-        movhps (%esi,%edx,4),%xmm5
-
-        movss 8(%esi,%ebx,4),%xmm0
-        movss 8(%esi,%edx,4),%xmm1
-
-        shufps $0,%xmm0,%xmm2
-        shufps $0,%xmm1,%xmm6
-
-        movaps %xmm4,%xmm0
-        movaps %xmm4,%xmm1
-
-        shufps $136,%xmm6,%xmm2 ## constant 10001000
-
-        shufps $136,%xmm5,%xmm0 ## constant 10001000
-        shufps $221,%xmm5,%xmm1 ## constant 11011101            
-
-        ## move ix-iz to xmm4-xmm6 
-        movaps nb430_ix(%esp),%xmm4
-        movaps nb430_iy(%esp),%xmm5
-        movaps nb430_iz(%esp),%xmm6
-
-        ## calc dr 
-        subps %xmm0,%xmm4
-        subps %xmm1,%xmm5
-        subps %xmm2,%xmm6
-
-        ## store dr 
-        movaps %xmm4,nb430_dx(%esp)
-        movaps %xmm5,nb430_dy(%esp)
-        movaps %xmm6,nb430_dz(%esp)
-        ## square it 
-        mulps %xmm4,%xmm4
-        mulps %xmm5,%xmm5
-        mulps %xmm6,%xmm6
-        addps %xmm5,%xmm4
-        addps %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb430_three(%esp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb430_half(%esp),%xmm0
-        subps %xmm5,%xmm1       ## constant 30-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r 
-        movaps %xmm4,nb430_r(%esp)
-        mulps nb430_gbscale(%esp),%xmm4
-
-        movhlps %xmm4,%xmm5
-        cvttps2pi %xmm4,%mm6
-        cvttps2pi %xmm5,%mm7    ## mm6/mm7 contain lu indices 
-        cvtpi2ps %mm6,%xmm6
-        cvtpi2ps %mm7,%xmm5
-        movlhps %xmm5,%xmm6
-        subps %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-        pslld $2,%mm6
-        pslld $2,%mm7
-
-        movd %eax,%mm0
-        movd %ebx,%mm1
-        movd %ecx,%mm2
-        movd %edx,%mm3
-
-        movl nb430_GBtab(%ebp),%esi
-        movd %mm6,%eax
-        psrlq $32,%mm6
-        movd %mm7,%ecx
-        psrlq $32,%mm7
-        movd %mm6,%ebx
-        movd %mm7,%edx
-
-        ## load coulomb table
-        movaps (%esi,%eax,4),%xmm4
-        movaps (%esi,%ebx,4),%xmm5
-        movaps (%esi,%ecx,4),%xmm6
-        movaps (%esi,%edx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm6,%xmm3
-        shufps $0xEE,%xmm7,%xmm3
-        shufps $0x44,%xmm7,%xmm6
-        movaps %xmm4,%xmm7
-        shufps $0xEE,%xmm5,%xmm7
-        shufps $0x44,%xmm5,%xmm4
-        movaps %xmm4,%xmm5
-        shufps $0xDD,%xmm6,%xmm5
-        shufps $0x88,%xmm6,%xmm4
-        movaps %xmm7,%xmm6
-        shufps $0x88,%xmm3,%xmm6
-        shufps $0xDD,%xmm3,%xmm7
-        ## coulomb table ready, in xmm4-xmm7            
-
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        mulps  nb430_two(%esp),%xmm7    ## two*Heps2 
-        movaps nb430_qq(%esp),%xmm3
-        addps  %xmm6,%xmm7
-        addps  %xmm5,%xmm7 ## xmm7=FF 
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulps  %xmm7,%xmm3 ## fijC=FF*qq 
-
-        ## get jnr from stack
-        movl nb430_jnra(%esp),%eax
-        movl nb430_jnrb(%esp),%ebx
-        movl nb430_jnrc(%esp),%ecx
-        movl nb430_jnrd(%esp),%edx
-
-        movl nb430_dvda(%ebp),%esi
-
-        ## Calculate dVda
-        xorps %xmm7,%xmm7
-        mulps nb430_gbscale(%esp),%xmm3
-        movaps %xmm3,%xmm6
-        mulps  nb430_r(%esp),%xmm6
-        addps  %xmm5,%xmm6
-        addps  nb430_vctot(%esp),%xmm5
-        movaps %xmm5,nb430_vctot(%esp)
-
-        ## xmm6=(vcoul+fijC*r)
-        subps  %xmm6,%xmm7
-        movaps %xmm7,%xmm6
-
-        ## update dvdasum
-        addps  nb430_dvdasum(%esp),%xmm7
-        movaps %xmm7,nb430_dvdasum(%esp)
-
-        ## update j atoms dvdaj
-        movhlps %xmm6,%xmm7
-        movaps  %xmm6,%xmm5
-        movaps  %xmm7,%xmm4
-        shufps $0x1,%xmm5,%xmm5
-        shufps $0x1,%xmm4,%xmm4
-        ## xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4
-        addss  (%esi,%eax,4),%xmm6
-        addss  (%esi,%ebx,4),%xmm5
-        addss  (%esi,%ecx,4),%xmm7
-        addss  (%esi,%edx,4),%xmm4
-        movss  %xmm6,(%esi,%eax,4)
-        movss  %xmm5,(%esi,%ebx,4)
-        movss  %xmm7,(%esi,%ecx,4)
-        movss  %xmm4,(%esi,%edx,4)
-
-        ## put scalar force on stack temporarily 
-        movaps %xmm3,nb430_fscal(%esp)
-
-        movaps nb430_r(%esp),%xmm4
-        mulps nb430_tsc(%esp),%xmm4
-
-        movhlps %xmm4,%xmm5
-        cvttps2pi %xmm4,%mm6
-        cvttps2pi %xmm5,%mm7    ## mm6/mm7 contain lu indices 
-        cvtpi2ps %mm6,%xmm6
-        cvtpi2ps %mm7,%xmm5
-        movlhps %xmm5,%xmm6
-        subps %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-        pslld $3,%mm6
-        pslld $3,%mm7
-
-        movl nb430_VFtab(%ebp),%esi
-        movd %mm6,%eax
-        psrlq $32,%mm6
-        movd %mm7,%ecx
-        psrlq $32,%mm7
-        movd %mm6,%ebx
-        movd %mm7,%edx
-
-        ## dispersion 
-        movaps (%esi,%eax,4),%xmm4
-        movaps (%esi,%ebx,4),%xmm5
-        movaps (%esi,%ecx,4),%xmm6
-        movaps (%esi,%edx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm6,%xmm3
-        shufps $0xEE,%xmm7,%xmm3
-        shufps $0x44,%xmm7,%xmm6
-        movaps %xmm4,%xmm7
-        shufps $0xEE,%xmm5,%xmm7
-        shufps $0x44,%xmm5,%xmm4
-        movaps %xmm4,%xmm5
-        shufps $0xDD,%xmm6,%xmm5
-        shufps $0x88,%xmm6,%xmm4
-        movaps %xmm7,%xmm6
-        shufps $0x88,%xmm3,%xmm6
-        shufps $0xDD,%xmm3,%xmm7
-        ## dispersion table ready, in xmm4-xmm7         
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        mulps  nb430_two(%esp),%xmm7    ## two*Heps2 
-        addps  %xmm6,%xmm7
-        addps  %xmm5,%xmm7 ## xmm7=FF 
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-
-        movaps nb430_c6(%esp),%xmm4
-        mulps  %xmm4,%xmm7       ## fijD 
-        mulps  %xmm4,%xmm5       ## Vvdw6
-        mulps  nb430_tsc(%esp),%xmm7
-        addps  nb430_fscal(%esp),%xmm7   ## add to fscal 
-
-        ## put scalar force on stack Update Vvdwtot directly 
-        addps  nb430_Vvdwtot(%esp),%xmm5
-        movaps %xmm7,nb430_fscal(%esp)
-        movaps %xmm5,nb430_Vvdwtot(%esp)
-
-        ## repulsion 
-        movaps 16(%esi,%eax,4),%xmm4
-        movaps 16(%esi,%ebx,4),%xmm5
-        movaps 16(%esi,%ecx,4),%xmm6
-        movaps 16(%esi,%edx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm6,%xmm3
-        shufps $0xEE,%xmm7,%xmm3
-        shufps $0x44,%xmm7,%xmm6
-        movaps %xmm4,%xmm7
-        shufps $0xEE,%xmm5,%xmm7
-        shufps $0x44,%xmm5,%xmm4
-        movaps %xmm4,%xmm5
-        shufps $0xDD,%xmm6,%xmm5
-        shufps $0x88,%xmm6,%xmm4
-        movaps %xmm7,%xmm6
-        shufps $0x88,%xmm3,%xmm6
-        shufps $0xDD,%xmm3,%xmm7
-        ## table ready, in xmm4-xmm7    
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        mulps  nb430_two(%esp),%xmm7    ## two*Heps2 
-        addps  %xmm6,%xmm7
-        addps  %xmm5,%xmm7 ## xmm7=FF 
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-
-        movaps nb430_c12(%esp),%xmm4
-        mulps  %xmm4,%xmm7 ## fijR 
-        mulps  %xmm4,%xmm5 ## Vvdw12
-        mulps nb430_tsc(%esp),%xmm7
-        addps  nb430_fscal(%esp),%xmm7
-
-        addps  nb430_Vvdwtot(%esp),%xmm5
-        movaps %xmm5,nb430_Vvdwtot(%esp)
-        xorps  %xmm4,%xmm4
-
-        mulps %xmm0,%xmm7
-        subps  %xmm7,%xmm4
-
-        movaps nb430_dx(%esp),%xmm0
-        movaps nb430_dy(%esp),%xmm1
-        movaps nb430_dz(%esp),%xmm2
-
-        movd %mm0,%eax
-        movd %mm1,%ebx
-        movd %mm2,%ecx
-        movd %mm3,%edx
-
-        movl   nb430_faction(%ebp),%edi
-        mulps  %xmm4,%xmm0
-        mulps  %xmm4,%xmm1
-        mulps  %xmm4,%xmm2
-        ## xmm0-xmm2 contains tx-tz (partial force) 
-        ## now update f_i 
-        movaps nb430_fix(%esp),%xmm3
-        movaps nb430_fiy(%esp),%xmm4
-        movaps nb430_fiz(%esp),%xmm5
-        addps  %xmm0,%xmm3
-        addps  %xmm1,%xmm4
-        addps  %xmm2,%xmm5
-        movaps %xmm3,nb430_fix(%esp)
-        movaps %xmm4,nb430_fiy(%esp)
-        movaps %xmm5,nb430_fiz(%esp)
-        ## the fj's - start by accumulating x & y forces from memory 
-        movlps (%edi,%eax,4),%xmm4
-        movlps (%edi,%ecx,4),%xmm6
-        movhps (%edi,%ebx,4),%xmm4
-        movhps (%edi,%edx,4),%xmm6
-
-        movaps %xmm4,%xmm3
-        shufps $136,%xmm6,%xmm3 ## constant 10001000
-        shufps $221,%xmm6,%xmm4 ## constant 11011101                          
-
-        ## now xmm3-xmm5 contains fjx, fjy, fjz 
-        subps  %xmm0,%xmm3
-        subps  %xmm1,%xmm4
-
-        ## unpack them back so we can store them - first x & y in xmm3/xmm4 
-
-        movaps %xmm3,%xmm6
-        unpcklps %xmm4,%xmm6
-        unpckhps %xmm4,%xmm3
-        ## xmm6(l)=x & y for j1, (h) for j2 
-        ## xmm3(l)=x & y for j3, (h) for j4 
-        movlps %xmm6,(%edi,%eax,4)
-        movlps %xmm3,(%edi,%ecx,4)
-
-        movhps %xmm6,(%edi,%ebx,4)
-        movhps %xmm3,(%edi,%edx,4)
-
-        ## and the z forces 
-        movss  8(%edi,%eax,4),%xmm4
-        movss  8(%edi,%ebx,4),%xmm5
-        movss  8(%edi,%ecx,4),%xmm6
-        movss  8(%edi,%edx,4),%xmm7
-        subss  %xmm2,%xmm4
-        shufps $229,%xmm2,%xmm2 ## constant 11100101
-        subss  %xmm2,%xmm5
-        shufps $234,%xmm2,%xmm2 ## constant 11101010
-        subss  %xmm2,%xmm6
-        shufps $255,%xmm2,%xmm2 ## constant 11111111
-        subss  %xmm2,%xmm7
-        movss  %xmm4,8(%edi,%eax,4)
-        movss  %xmm5,8(%edi,%ebx,4)
-        movss  %xmm6,8(%edi,%ecx,4)
-        movss  %xmm7,8(%edi,%edx,4)
-
-        ## should we do one more iteration? 
-        subl $4,nb430_innerk(%esp)
-        jl    _nb_kernel430_ia32_sse.nb430_finish_inner
-        jmp   _nb_kernel430_ia32_sse.nb430_unroll_loop
-_nb_kernel430_ia32_sse.nb430_finish_inner: 
-        ## check if at least two particles remain 
-        addl $4,nb430_innerk(%esp)
-        movl  nb430_innerk(%esp),%edx
-        andl  $2,%edx
-        jnz   _nb_kernel430_ia32_sse.nb430_dopair
-        jmp   _nb_kernel430_ia32_sse.nb430_checksingle
-_nb_kernel430_ia32_sse.nb430_dopair: 
-
-        movl  nb430_innerjjnr(%esp),%ecx
-
-        movl  (%ecx),%eax
-        movl  4(%ecx),%ebx
-        addl $8,nb430_innerjjnr(%esp)
-
-        xorps %xmm2,%xmm2
-        movaps %xmm2,%xmm6
-
-        ## load isaj
-        movl nb430_invsqrta(%ebp),%esi
-        movss (%esi,%eax,4),%xmm2
-        movss (%esi,%ebx,4),%xmm3
-        unpcklps %xmm3,%xmm2    ## isaj in xmm3(0,1)
-        mulps  nb430_isai(%esp),%xmm2
-        movaps %xmm2,nb430_isaprod(%esp)
-        movaps %xmm2,%xmm1
-        mulps nb430_gbtsc(%esp),%xmm1
-        movaps %xmm1,nb430_gbscale(%esp)
-
-        movl nb430_charge(%ebp),%esi     ## base of charge[]    
-        movss (%esi,%eax,4),%xmm3
-        movss (%esi,%ebx,4),%xmm6
-        unpcklps %xmm6,%xmm3 ## constant 00001000 ;# xmm3(0,1) has the charges 
-
-        mulps  nb430_iq(%esp),%xmm2
-        mulps  %xmm2,%xmm3
-        movaps %xmm3,nb430_qq(%esp)
-
-        movl nb430_type(%ebp),%esi
-        movl  %eax,%ecx
-        movl  %ebx,%edx
-        movl (%esi,%ecx,4),%ecx
-        movl (%esi,%edx,4),%edx
-        movl nb430_vdwparam(%ebp),%esi
-        shll %ecx
-        shll %edx
-        movl nb430_ntia(%esp),%edi
-        addl %edi,%ecx
-        addl %edi,%edx
-        movlps (%esi,%ecx,4),%xmm6
-        movhps (%esi,%edx,4),%xmm6
-        movl nb430_pos(%ebp),%edi
-
-        movaps %xmm6,%xmm4
-        shufps $8,%xmm4,%xmm4 ## constant 00001000       
-        shufps $13,%xmm6,%xmm6 ## constant 00001101
-        movlhps %xmm7,%xmm4
-        movlhps %xmm7,%xmm6
-
-        movaps %xmm4,nb430_c6(%esp)
-        movaps %xmm6,nb430_c12(%esp)
-
-        movd  %eax,%mm0         ## copy jnr to mm0/mm1
-        movd  %ebx,%mm1
-
-        leal  (%eax,%eax,2),%eax
-        leal  (%ebx,%ebx,2),%ebx
-        ## move coordinates to xmm0-xmm2 
-        movlps (%edi,%eax,4),%xmm1
-        movss 8(%edi,%eax,4),%xmm2
-        movhps (%edi,%ebx,4),%xmm1
-        movss 8(%edi,%ebx,4),%xmm0
-
-        movlhps %xmm7,%xmm3
-
-        shufps $0,%xmm0,%xmm2
-
-        movaps %xmm1,%xmm0
-
-        shufps $136,%xmm2,%xmm2 ## constant 10001000
-
-        shufps $136,%xmm0,%xmm0 ## constant 10001000
-        shufps $221,%xmm1,%xmm1 ## constant 11011101
-
-        movl   nb430_faction(%ebp),%edi
-        ## move ix-iz to xmm4-xmm6 
-        xorps   %xmm7,%xmm7
-
-        movaps nb430_ix(%esp),%xmm4
-        movaps nb430_iy(%esp),%xmm5
-        movaps nb430_iz(%esp),%xmm6
-
-        ## calc dr 
-        subps %xmm0,%xmm4
-        subps %xmm1,%xmm5
-        subps %xmm2,%xmm6
-
-        ## store dr 
-        movaps %xmm4,nb430_dx(%esp)
-        movaps %xmm5,nb430_dy(%esp)
-        movaps %xmm6,nb430_dz(%esp)
-        ## square it 
-        mulps %xmm4,%xmm4
-        mulps %xmm5,%xmm5
-        mulps %xmm6,%xmm6
-        addps %xmm5,%xmm4
-        addps %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb430_three(%esp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb430_half(%esp),%xmm0
-        subps %xmm5,%xmm1       ## constant 30-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r 
-        movaps %xmm4,nb430_r(%esp)
-        mulps nb430_gbscale(%esp),%xmm4
-
-        cvttps2pi %xmm4,%mm6    ## mm6 contain lu indices 
-        cvtpi2ps %mm6,%xmm6
-        subps %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-
-        pslld $2,%mm6
-
-        movl nb430_GBtab(%ebp),%esi
-        movd %mm6,%ecx
-        psrlq $32,%mm6
-        movd %mm6,%edx
-
-        ## load coulomb table
-        movaps (%esi,%ecx,4),%xmm4
-        movaps (%esi,%edx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm4,%xmm6
-        unpcklps %xmm7,%xmm4    ## Y1 Y2 F1 F2 
-        unpckhps %xmm7,%xmm6    ## G1 G2 H1 H2
-        movhlps  %xmm4,%xmm5    ## F1 F2 
-        movhlps  %xmm6,%xmm7    ## H1 H2
-        ## coulomb table ready, in xmm4-xmm7    
-
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        mulps  nb430_two(%esp),%xmm7    ## two*Heps2 
-        movaps nb430_qq(%esp),%xmm3
-        addps  %xmm6,%xmm7
-        addps  %xmm5,%xmm7 ## xmm7=FF 
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulps  %xmm7,%xmm3 ## fijC=FF*qq 
-
-        ## get jnr from mm0/mm1
-        movd %mm0,%ecx
-        movd %mm1,%edx
-
-        movl nb430_dvda(%ebp),%esi
-
-        ## Calculate dVda
-        xorps %xmm7,%xmm7
-        mulps nb430_gbscale(%esp),%xmm3
-        movaps %xmm3,%xmm6
-        mulps  nb430_r(%esp),%xmm6
-        addps  %xmm5,%xmm6
-        addps  nb430_vctot(%esp),%xmm5
-        movaps %xmm5,nb430_vctot(%esp)
-
-        ## xmm6=(vcoul+fijC*r)
-        subps  %xmm6,%xmm7
-        movaps %xmm7,%xmm6
-
-        ## update dvdasum
-        addps  nb430_dvdasum(%esp),%xmm7
-        movaps %xmm7,nb430_dvdasum(%esp)
-
-        ## update j atoms dvdaj
-        movaps %xmm6,%xmm7
-        shufps $0x1,%xmm7,%xmm7
-        addss  (%esi,%ecx,4),%xmm6
-        addss  (%esi,%edx,4),%xmm7
-        movss  %xmm6,(%esi,%ecx,4)
-        movss  %xmm7,(%esi,%edx,4)
-
-        ## put scalar force on stack temporarily 
-        movaps %xmm3,nb430_fscal(%esp)
-
-        movaps nb430_r(%esp),%xmm4
-        mulps nb430_tsc(%esp),%xmm4
-
-        cvttps2pi %xmm4,%mm6
-        cvtpi2ps %mm6,%xmm6
-        subps %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-        pslld $3,%mm6
-
-        movl nb430_VFtab(%ebp),%esi
-        movd %mm6,%ecx
-        psrlq $32,%mm6
-        movd %mm6,%edx
-
-        ## dispersion 
-        movaps (%esi,%ecx,4),%xmm4
-        movaps (%esi,%edx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm4,%xmm6
-        unpcklps %xmm7,%xmm4    ## Y1 Y2 F1 F2 
-        unpckhps %xmm7,%xmm6    ## G1 G2 H1 H2
-        movhlps  %xmm4,%xmm5    ## F1 F2 
-        movhlps  %xmm6,%xmm7    ## H1 H2
-        ## dispersion table ready, in xmm4-xmm7         
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        mulps  nb430_two(%esp),%xmm7    ## two*Heps2 
-        addps  %xmm6,%xmm7
-        addps  %xmm5,%xmm7 ## xmm7=FF 
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-
-        movaps nb430_c6(%esp),%xmm4
-        mulps  %xmm4,%xmm7       ## fijD 
-        mulps  %xmm4,%xmm5       ## Vvdw6 
-        mulps  nb430_tsc(%esp),%xmm7
-        addps  nb430_fscal(%esp),%xmm7   ## add to fscal 
-
-        ## put scalar force on stack Update Vvdwtot directly 
-        addps  nb430_Vvdwtot(%esp),%xmm5
-        movaps %xmm7,nb430_fscal(%esp)
-        movaps %xmm5,nb430_Vvdwtot(%esp)
-
-        ## repulsion 
-        movaps 16(%esi,%ecx,4),%xmm4
-        movaps 16(%esi,%edx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm4,%xmm6
-        unpcklps %xmm7,%xmm4    ## Y1 Y2 F1 F2 
-        unpckhps %xmm7,%xmm6    ## G1 G2 H1 H2
-        movhlps  %xmm4,%xmm5    ## F1 F2 
-        movhlps  %xmm6,%xmm7    ## H1 H2
-        ## table ready, in xmm4-xmm7    
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        mulps  nb430_two(%esp),%xmm7    ## two*Heps2 
-        addps  %xmm6,%xmm7
-        addps  %xmm5,%xmm7 ## xmm7=FF 
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-
-        movaps nb430_c12(%esp),%xmm4
-        mulps  %xmm4,%xmm7 ## fijR 
-        mulps  %xmm4,%xmm5 ## Vvdw12 
-        mulps  nb430_tsc(%esp),%xmm7
-        addps  nb430_fscal(%esp),%xmm7
-
-        addps  nb430_Vvdwtot(%esp),%xmm5
-        movaps %xmm5,nb430_Vvdwtot(%esp)
-        xorps  %xmm4,%xmm4
-
-        mulps %xmm0,%xmm7
-        subps  %xmm7,%xmm4
-
-        movaps nb430_dx(%esp),%xmm0
-        movaps nb430_dy(%esp),%xmm1
-        movaps nb430_dz(%esp),%xmm2
-
-        mulps  %xmm4,%xmm0
-        mulps  %xmm4,%xmm1
-        mulps  %xmm4,%xmm2
-        ## xmm0-xmm2 contains tx-tz (partial force) 
-        ## now update f_i 
-        movaps nb430_fix(%esp),%xmm3
-        movaps nb430_fiy(%esp),%xmm4
-        movaps nb430_fiz(%esp),%xmm5
-        addps  %xmm0,%xmm3
-        addps  %xmm1,%xmm4
-        addps  %xmm2,%xmm5
-        movaps %xmm3,nb430_fix(%esp)
-        movaps %xmm4,nb430_fiy(%esp)
-        movaps %xmm5,nb430_fiz(%esp)
-        ## update the fj's 
-        movss   (%edi,%eax,4),%xmm3
-        movss   4(%edi,%eax,4),%xmm4
-        movss   8(%edi,%eax,4),%xmm5
-        subss   %xmm0,%xmm3
-        subss   %xmm1,%xmm4
-        subss   %xmm2,%xmm5
-        movss   %xmm3,(%edi,%eax,4)
-        movss   %xmm4,4(%edi,%eax,4)
-        movss   %xmm5,8(%edi,%eax,4)
-
-        shufps $225,%xmm0,%xmm0 ## constant 11100001
-        shufps $225,%xmm1,%xmm1 ## constant 11100001
-        shufps $225,%xmm2,%xmm2 ## constant 11100001
-
-        movss   (%edi,%ebx,4),%xmm3
-        movss   4(%edi,%ebx,4),%xmm4
-        movss   8(%edi,%ebx,4),%xmm5
-        subss   %xmm0,%xmm3
-        subss   %xmm1,%xmm4
-        subss   %xmm2,%xmm5
-        movss   %xmm3,(%edi,%ebx,4)
-        movss   %xmm4,4(%edi,%ebx,4)
-        movss   %xmm5,8(%edi,%ebx,4)
-
-_nb_kernel430_ia32_sse.nb430_checksingle:       
-        movl  nb430_innerk(%esp),%edx
-        andl  $1,%edx
-        jnz    _nb_kernel430_ia32_sse.nb430_dosingle
-        jmp    _nb_kernel430_ia32_sse.nb430_updateouterdata
-_nb_kernel430_ia32_sse.nb430_dosingle: 
-        movl nb430_charge(%ebp),%esi
-        movl nb430_invsqrta(%ebp),%edx
-        movl nb430_pos(%ebp),%edi
-        movl  nb430_innerjjnr(%esp),%ecx
-        movl  (%ecx),%eax
-        xorps  %xmm2,%xmm2
-        movaps %xmm2,%xmm6
-        movss (%edx,%eax,4),%xmm2       ## isaj
-        mulss nb430_isai(%esp),%xmm2
-        movss %xmm2,nb430_isaprod(%esp)
-        movss %xmm2,%xmm1
-        mulss nb430_gbtsc(%esp),%xmm1
-        movss %xmm1,nb430_gbscale(%esp)
-
-        mulss  nb430_iq(%esp),%xmm2
-        movss (%esi,%eax,4),%xmm6       ## xmm6(0) has the charge       
-        mulss  %xmm2,%xmm6
-        movss %xmm6,nb430_qq(%esp)
-
-        movl nb430_type(%ebp),%esi
-        movl %eax,%ecx
-        movl (%esi,%ecx,4),%ecx
-        movl nb430_vdwparam(%ebp),%esi
-        shll %ecx
-        addl nb430_ntia(%esp),%ecx
-        movlps (%esi,%ecx,4),%xmm6
-        movaps %xmm6,%xmm4
-        shufps $252,%xmm4,%xmm4 ## constant 11111100    
-        shufps $253,%xmm6,%xmm6 ## constant 11111101    
-
-        movss %xmm4,nb430_c6(%esp)
-        movss %xmm6,nb430_c12(%esp)
-
-        movd  %eax,%mm0
-        leal  (%eax,%eax,2),%eax
-
-        ## move coordinates to xmm0-xmm2 
-        movss (%edi,%eax,4),%xmm0
-        movss 4(%edi,%eax,4),%xmm1
-        movss 8(%edi,%eax,4),%xmm2
-
-        movss nb430_ix(%esp),%xmm4
-        movss nb430_iy(%esp),%xmm5
-        movss nb430_iz(%esp),%xmm6
-
-        ## calc dr 
-        subss %xmm0,%xmm4
-        subss %xmm1,%xmm5
-        subss %xmm2,%xmm6
-
-        ## store dr 
-        movaps %xmm4,nb430_dx(%esp)
-        movaps %xmm5,nb430_dy(%esp)
-        movaps %xmm6,nb430_dz(%esp)
-        ## square it 
-        mulss %xmm4,%xmm4
-        mulss %xmm5,%xmm5
-        mulss %xmm6,%xmm6
-        addss %xmm5,%xmm4
-        addss %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtss %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulss %xmm5,%xmm5
-        movss nb430_three(%esp),%xmm1
-        mulss %xmm4,%xmm5       ## rsq*lu*lu                    
-        movss nb430_half(%esp),%xmm0
-        subss %xmm5,%xmm1       ## constant 30-rsq*lu*lu 
-        mulss %xmm2,%xmm1
-        mulss %xmm1,%xmm0       ## xmm0=rinv 
-
-        mulss %xmm0,%xmm4       ## xmm4=r 
-        movss %xmm4,nb430_r(%esp)
-        mulss nb430_gbscale(%esp),%xmm4
-
-        cvttss2si %xmm4,%ebx    ## mm6 contain lu indices 
-        cvtsi2ss %ebx,%xmm6
-        subss %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulss  %xmm2,%xmm2      ## xmm2=eps2 
-
-        shll $2,%ebx
-
-        movl nb430_GBtab(%ebp),%esi
-
-        movaps (%esi,%ebx,4),%xmm4
-        movhlps %xmm4,%xmm6
-        movaps %xmm4,%xmm5
-        movaps %xmm6,%xmm7
-        shufps $1,%xmm5,%xmm5
-        shufps $1,%xmm7,%xmm7
-        ## table ready in xmm4-xmm7 
-
-        mulss  %xmm1,%xmm6      ## xmm6=Geps 
-        mulss  %xmm2,%xmm7      ## xmm7=Heps2 
-        addss  %xmm6,%xmm5
-        addss  %xmm7,%xmm5      ## xmm5=Fp      
-        mulss  nb430_two(%esp),%xmm7    ## two*Heps2 
-        movss nb430_qq(%esp),%xmm3
-        addss  %xmm6,%xmm7
-        addss  %xmm5,%xmm7 ## xmm7=FF 
-        mulss  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addss  %xmm4,%xmm5 ## xmm5=VV 
-        mulss  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulss  %xmm7,%xmm3 ## fijC=FF*qq 
-
-        movd %mm0,%ebx
-        movl nb430_dvda(%ebp),%esi
-
-        ## Calculate dVda
-        xorps %xmm7,%xmm7
-        mulss nb430_gbscale(%esp),%xmm3
-        movaps %xmm3,%xmm6
-        mulss  nb430_r(%esp),%xmm6
-        addss  %xmm5,%xmm6
-        addss  nb430_vctot(%esp),%xmm5
-        movss %xmm5,nb430_vctot(%esp)
-
-
-        ## xmm6=(vcoul+fijC*r)
-        subss  %xmm6,%xmm7
-        movaps %xmm7,%xmm6
-
-        ## update dvdasum
-        addss  nb430_dvdasum(%esp),%xmm7
-        movaps %xmm7,nb430_dvdasum(%esp)
-
-        ## update j atoms dvdaj
-        addss  (%esi,%ebx,4),%xmm6
-        movss  %xmm6,(%esi,%ebx,4)
-
-        ## put scalar force on stack temporarily 
-        movss %xmm3,nb430_fscal(%esp)
-
-        movss nb430_r(%esp),%xmm4
-        mulps nb430_tsc(%esp),%xmm4
-
-        cvttss2si %xmm4,%ebx
-        cvtsi2ss %ebx,%xmm6
-        subss %xmm6,%xmm4
-        movss %xmm4,%xmm1       ## xmm1=eps 
-        movss %xmm1,%xmm2
-        mulss  %xmm2,%xmm2      ## xmm2=eps2 
-
-        shll $3,%ebx
-        movl nb430_VFtab(%ebp),%esi
-
-        ## dispersion 
-        movaps (%esi,%ebx,4),%xmm4
-        movhlps %xmm4,%xmm6
-        movaps %xmm4,%xmm5
-        movaps %xmm6,%xmm7
-        shufps $1,%xmm5,%xmm5
-        shufps $1,%xmm7,%xmm7
-        ## table ready in xmm4-xmm7 
-
-        mulss  %xmm1,%xmm6      ## xmm6=Geps 
-        mulss  %xmm2,%xmm7      ## xmm7=Heps2 
-        addss  %xmm6,%xmm5
-        addss  %xmm7,%xmm5      ## xmm5=Fp      
-        mulss  nb430_two(%esp),%xmm7    ## two*Heps2 
-        addss  %xmm6,%xmm7
-        addss  %xmm5,%xmm7 ## xmm7=FF 
-        mulss  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addss  %xmm4,%xmm5 ## xmm5=VV 
-
-        movss nb430_c6(%esp),%xmm4
-        mulss  %xmm4,%xmm7       ## fijD 
-        mulss  %xmm4,%xmm5       ## Vvdw6
-        mulps  nb430_tsc(%esp),%xmm7
-        addss  nb430_fscal(%esp),%xmm7   ## add to fscal 
-
-        ## put scalar force on stack Update Vvdwtot directly 
-        addss  nb430_Vvdwtot(%esp),%xmm5
-        movss %xmm7,nb430_fscal(%esp)
-        movss %xmm5,nb430_Vvdwtot(%esp)
-
-        ## repulsion 
-        movaps 16(%esi,%ebx,4),%xmm4
-        movhlps %xmm4,%xmm6
-        movaps %xmm4,%xmm5
-        movaps %xmm6,%xmm7
-        shufps $1,%xmm5,%xmm5
-        shufps $1,%xmm7,%xmm7
-        ## table ready in xmm4-xmm7 
-
-        mulss  %xmm1,%xmm6      ## xmm6=Geps 
-        mulss  %xmm2,%xmm7      ## xmm7=Heps2 
-        addss  %xmm6,%xmm5
-        addss  %xmm7,%xmm5      ## xmm5=Fp      
-        mulss  nb430_two(%esp),%xmm7    ## two*Heps2 
-        addss  %xmm6,%xmm7
-        addss  %xmm5,%xmm7 ## xmm7=FF 
-        mulss  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addss  %xmm4,%xmm5 ## xmm5=VV 
-
-        movss nb430_c12(%esp),%xmm4
-        mulss  %xmm4,%xmm7 ## fijR 
-        mulss  %xmm4,%xmm5 ## Vvdw12 
-        mulps  nb430_tsc(%esp),%xmm7
-        addss  nb430_fscal(%esp),%xmm7
-
-        addss  nb430_Vvdwtot(%esp),%xmm5
-        movss %xmm5,nb430_Vvdwtot(%esp)
-        xorps  %xmm4,%xmm4
-
-        mulss %xmm0,%xmm7
-        subss  %xmm7,%xmm4
-        movl   nb430_faction(%ebp),%edi
-
-        movss nb430_dx(%esp),%xmm0
-        movss nb430_dy(%esp),%xmm1
-        movss nb430_dz(%esp),%xmm2
-
-        mulss  %xmm4,%xmm0
-        mulss  %xmm4,%xmm1
-        mulss  %xmm4,%xmm2
-        ## xmm0-xmm2 contains tx-tz (partial force) 
-        ## now update f_i 
-        movss nb430_fix(%esp),%xmm3
-        movss nb430_fiy(%esp),%xmm4
-        movss nb430_fiz(%esp),%xmm5
-        addss  %xmm0,%xmm3
-        addss  %xmm1,%xmm4
-        addss  %xmm2,%xmm5
-        movss %xmm3,nb430_fix(%esp)
-        movss %xmm4,nb430_fiy(%esp)
-        movss %xmm5,nb430_fiz(%esp)
-        ## update fj 
-
-        movss   (%edi,%eax,4),%xmm3
-        movss   4(%edi,%eax,4),%xmm4
-        movss   8(%edi,%eax,4),%xmm5
-        subss   %xmm0,%xmm3
-        subss   %xmm1,%xmm4
-        subss   %xmm2,%xmm5
-        movss   %xmm3,(%edi,%eax,4)
-        movss   %xmm4,4(%edi,%eax,4)
-        movss   %xmm5,8(%edi,%eax,4)
-_nb_kernel430_ia32_sse.nb430_updateouterdata: 
-        movl  nb430_ii3(%esp),%ecx
-        movl  nb430_faction(%ebp),%edi
-        movl  nb430_fshift(%ebp),%esi
-        movl  nb430_is3(%esp),%edx
-
-        ## accumulate i forces in xmm0, xmm1, xmm2 
-        movaps nb430_fix(%esp),%xmm0
-        movaps nb430_fiy(%esp),%xmm1
-        movaps nb430_fiz(%esp),%xmm2
-
-        movhlps %xmm0,%xmm3
-        movhlps %xmm1,%xmm4
-        movhlps %xmm2,%xmm5
-        addps  %xmm3,%xmm0
-        addps  %xmm4,%xmm1
-        addps  %xmm5,%xmm2 ## sum is in 1/2 in xmm0-xmm2 
-
-        movaps %xmm0,%xmm3
-        movaps %xmm1,%xmm4
-        movaps %xmm2,%xmm5
-
-        shufps $1,%xmm3,%xmm3
-        shufps $1,%xmm4,%xmm4
-        shufps $1,%xmm5,%xmm5
-        addss  %xmm3,%xmm0
-        addss  %xmm4,%xmm1
-        addss  %xmm5,%xmm2      ## xmm0-xmm2 has single force in pos0 
-
-        ## increment i force 
-        movss  (%edi,%ecx,4),%xmm3
-        movss  4(%edi,%ecx,4),%xmm4
-        movss  8(%edi,%ecx,4),%xmm5
-        addss  %xmm0,%xmm3
-        addss  %xmm1,%xmm4
-        addss  %xmm2,%xmm5
-        movss  %xmm3,(%edi,%ecx,4)
-        movss  %xmm4,4(%edi,%ecx,4)
-        movss  %xmm5,8(%edi,%ecx,4)
-
-        ## increment fshift force  
-        movss  (%esi,%edx,4),%xmm3
-        movss  4(%esi,%edx,4),%xmm4
-        movss  8(%esi,%edx,4),%xmm5
-        addss  %xmm0,%xmm3
-        addss  %xmm1,%xmm4
-        addss  %xmm2,%xmm5
-        movss  %xmm3,(%esi,%edx,4)
-        movss  %xmm4,4(%esi,%edx,4)
-        movss  %xmm5,8(%esi,%edx,4)
-
-        ## get n from stack
-        movl nb430_n(%esp),%esi
-        ## get group index for i particle 
-        movl  nb430_gid(%ebp),%edx              ## base of gid[]
-        movl  (%edx,%esi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total potential energy and update it 
-        movaps nb430_vctot(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        ## add earlier value from mem 
-        movl  nb430_Vc(%ebp),%eax
-        addss (%eax,%edx,4),%xmm7
-        ## move back to mem 
-        movss %xmm7,(%eax,%edx,4)
-
-        ## accumulate total lj energy and update it 
-        movaps nb430_Vvdwtot(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        ## add earlier value from mem 
-        movl  nb430_Vvdw(%ebp),%eax
-        addss (%eax,%edx,4),%xmm7
-        ## move back to mem 
-        movss %xmm7,(%eax,%edx,4)
-
-        ## accumulate dVda and update it 
-        movaps nb430_dvdasum(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        movl nb430_ii(%esp),%edx
-        movl nb430_dvda(%ebp),%eax
-        addss (%eax,%edx,4),%xmm7
-        movss %xmm7,(%eax,%edx,4)
-
-        ## finish if last 
-        movl nb430_nn1(%esp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx
-        jz _nb_kernel430_ia32_sse.nb430_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb430_n(%esp)
-        jmp _nb_kernel430_ia32_sse.nb430_outer
-_nb_kernel430_ia32_sse.nb430_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb430_nri(%esp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx
-        jz _nb_kernel430_ia32_sse.nb430_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel430_ia32_sse.nb430_threadloop
-_nb_kernel430_ia32_sse.nb430_end: 
-        emms
-
-        movl nb430_nouter(%esp),%eax
-        movl nb430_ninner(%esp),%ebx
-        movl nb430_outeriter(%ebp),%ecx
-        movl nb430_inneriter(%ebp),%edx
-        movl %eax,(%ecx)
-        movl %ebx,(%edx)
-
-        movl nb430_salign(%esp),%eax
-        addl %eax,%esp
-        addl $488,%esp
-        popl %edi
-        popl %esi
-        popl %edx
-        popl %ecx
-        popl %ebx
-        popl %eax
-        leave
-        ret
-
-
-
-
-
-
-
-.globl nb_kernel430nf_ia32_sse
-.globl _nb_kernel430nf_ia32_sse
-nb_kernel430nf_ia32_sse:        
-_nb_kernel430nf_ia32_sse:       
-.set nb430nf_p_nri, 8
-.set nb430nf_iinr, 12
-.set nb430nf_jindex, 16
-.set nb430nf_jjnr, 20
-.set nb430nf_shift, 24
-.set nb430nf_shiftvec, 28
-.set nb430nf_fshift, 32
-.set nb430nf_gid, 36
-.set nb430nf_pos, 40
-.set nb430nf_faction, 44
-.set nb430nf_charge, 48
-.set nb430nf_p_facel, 52
-.set nb430nf_argkrf, 56
-.set nb430nf_argcrf, 60
-.set nb430nf_Vc, 64
-.set nb430nf_type, 68
-.set nb430nf_p_ntype, 72
-.set nb430nf_vdwparam, 76
-.set nb430nf_Vvdw, 80
-.set nb430nf_p_tabscale, 84
-.set nb430nf_VFtab, 88
-.set nb430nf_invsqrta, 92
-.set nb430nf_dvda, 96
-.set nb430nf_p_gbtabscale, 100
-.set nb430nf_GBtab, 104
-.set nb430nf_p_nthreads, 108
-.set nb430nf_count, 112
-.set nb430nf_mtx, 116
-.set nb430nf_outeriter, 120
-.set nb430nf_inneriter, 124
-.set nb430nf_work, 128
-        ## stack offsets for local variables  
-        ## bottom of stack is cache-aligned for sse use 
-.set nb430nf_ix, 0
-.set nb430nf_iy, 16
-.set nb430nf_iz, 32
-.set nb430nf_iq, 48
-.set nb430nf_gbtsc, 64
-.set nb430nf_tsc, 80
-.set nb430nf_qq, 96
-.set nb430nf_c6, 112
-.set nb430nf_c12, 128
-.set nb430nf_vctot, 144
-.set nb430nf_Vvdwtot, 160
-.set nb430nf_half, 176
-.set nb430nf_three, 192
-.set nb430nf_isai, 208
-.set nb430nf_isaprod, 224
-.set nb430nf_gbscale, 240
-.set nb430nf_r, 256
-.set nb430nf_is3, 272
-.set nb430nf_ii3, 276
-.set nb430nf_ntia, 280
-.set nb430nf_innerjjnr, 284
-.set nb430nf_innerk, 288
-.set nb430nf_n, 292
-.set nb430nf_nn1, 296
-.set nb430nf_nri, 300
-.set nb430nf_facel, 304
-.set nb430nf_ntype, 308
-.set nb430nf_nouter, 312
-.set nb430nf_ninner, 316
-.set nb430nf_salign, 320
-        pushl %ebp
-        movl %esp,%ebp
-        pushl %eax
-        pushl %ebx
-        pushl %ecx
-        pushl %edx
-        pushl %esi
-        pushl %edi
-        subl $324,%esp          ## local stack space 
-        movl %esp,%eax
-        andl $0xf,%eax
-        subl %eax,%esp
-        movl %eax,nb430nf_salign(%esp)
-
-        emms
-
-        ## Move args passed by reference to stack
-        movl nb430nf_p_nri(%ebp),%ecx
-        movl nb430nf_p_facel(%ebp),%esi
-        movl nb430nf_p_ntype(%ebp),%edi
-        movl (%ecx),%ecx
-        movl (%esi),%esi
-        movl (%edi),%edi
-        movl %ecx,nb430nf_nri(%esp)
-        movl %esi,nb430nf_facel(%esp)
-        movl %edi,nb430nf_ntype(%esp)
-
-        ## zero iteration counters
-        movl $0,%eax
-        movl %eax,nb430nf_nouter(%esp)
-        movl %eax,nb430nf_ninner(%esp)
-
-
-        movl nb430nf_p_gbtabscale(%ebp),%eax
-        movss (%eax),%xmm3
-        movl nb430nf_p_tabscale(%ebp),%eax
-        movss (%eax),%xmm4
-        shufps $0,%xmm3,%xmm3
-        shufps $0,%xmm4,%xmm4
-        movaps %xmm3,nb430nf_gbtsc(%esp)
-        movaps %xmm4,nb430nf_tsc(%esp)
-
-        ## create constant floating-point factors on stack
-        movl $0x3f000000,%eax   ## constant 0.5 in IEEE (hex)
-        movl %eax,nb430nf_half(%esp)
-        movss nb430nf_half(%esp),%xmm1
-        shufps $0,%xmm1,%xmm1  ## splat to all elements
-        movaps %xmm1,%xmm2
-        addps  %xmm2,%xmm2      ## constant 1.0
-        movaps %xmm2,%xmm3
-        addps  %xmm2,%xmm2      ## constant 2.0
-        addps  %xmm2,%xmm3      ## constant 3.0
-        movaps %xmm1,nb430nf_half(%esp)
-        movaps %xmm3,nb430nf_three(%esp)
-
-_nb_kernel430nf_ia32_sse.nb430nf_threadloop: 
-        movl  nb430nf_count(%ebp),%esi            ## pointer to sync counter
-        movl  (%esi),%eax
-_nb_kernel430nf_ia32_sse.nb430nf_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+10
-        lock 
-        cmpxchgl %ebx,(%esi)                    ## write nn1 to *counter,
-                                                ## if it hasnt changed.
-                                                ## or reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel430nf_ia32_sse.nb430nf_spinlock
-
-        ## if(nn1>nri) nn1=nri
-        movl nb430nf_nri(%esp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx
-        cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb430nf_n(%esp)
-        movl %ebx,nb430nf_nn1(%esp)
-        subl %eax,%ebx                          ## calc number of outer lists
-        movl %eax,%esi                          ## copy n to esi
-        jg  _nb_kernel430nf_ia32_sse.nb430nf_outerstart
-        jmp _nb_kernel430nf_ia32_sse.nb430nf_end
-
-_nb_kernel430nf_ia32_sse.nb430nf_outerstart: 
-        ## ebx contains number of outer iterations
-        addl nb430nf_nouter(%esp),%ebx
-        movl %ebx,nb430nf_nouter(%esp)
-
-_nb_kernel430nf_ia32_sse.nb430nf_outer: 
-        movl  nb430nf_shift(%ebp),%eax        ## eax = pointer into shift[] 
-        movl  (%eax,%esi,4),%ebx                ## ebx=shift[n] 
-
-        leal  (%ebx,%ebx,2),%ebx    ## ebx=3*is 
-        movl  %ebx,nb430nf_is3(%esp)            ## store is3 
-
-        movl  nb430nf_shiftvec(%ebp),%eax     ## eax = base of shiftvec[] 
-
-        movss (%eax,%ebx,4),%xmm0
-        movss 4(%eax,%ebx,4),%xmm1
-        movss 8(%eax,%ebx,4),%xmm2
-
-        movl  nb430nf_iinr(%ebp),%ecx         ## ecx = pointer into iinr[]      
-        movl  (%ecx,%esi,4),%ebx            ## ebx =ii 
-
-        movl  nb430nf_charge(%ebp),%edx
-        movss (%edx,%ebx,4),%xmm3
-        mulss nb430nf_facel(%esp),%xmm3
-        shufps $0,%xmm3,%xmm3
-
-        movl  nb430nf_invsqrta(%ebp),%edx       ## load invsqrta[ii]
-        movss (%edx,%ebx,4),%xmm4
-        shufps $0,%xmm4,%xmm4
-
-        movl  nb430nf_type(%ebp),%edx
-        movl  (%edx,%ebx,4),%edx
-        imull nb430nf_ntype(%esp),%edx
-        shll  %edx
-        movl  %edx,nb430nf_ntia(%esp)
-
-        leal  (%ebx,%ebx,2),%ebx        ## ebx = 3*ii=ii3 
-        movl  nb430nf_pos(%ebp),%eax      ## eax = base of pos[]  
-
-        addss (%eax,%ebx,4),%xmm0
-        addss 4(%eax,%ebx,4),%xmm1
-        addss 8(%eax,%ebx,4),%xmm2
-
-        movaps %xmm3,nb430nf_iq(%esp)
-        movaps %xmm4,nb430nf_isai(%esp)
-
-        shufps $0,%xmm0,%xmm0
-        shufps $0,%xmm1,%xmm1
-        shufps $0,%xmm2,%xmm2
-
-        movaps %xmm0,nb430nf_ix(%esp)
-        movaps %xmm1,nb430nf_iy(%esp)
-        movaps %xmm2,nb430nf_iz(%esp)
-
-        movl  %ebx,nb430nf_ii3(%esp)
-
-        ## clear vctot 
-        xorps %xmm4,%xmm4
-        movaps %xmm4,nb430nf_vctot(%esp)
-        movaps %xmm4,nb430nf_Vvdwtot(%esp)
-
-        movl  nb430nf_jindex(%ebp),%eax
-        movl  (%eax,%esi,4),%ecx             ## jindex[n] 
-        movl  4(%eax,%esi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movl  nb430nf_pos(%ebp),%esi
-        movl  nb430nf_faction(%ebp),%edi
-        movl  nb430nf_jjnr(%ebp),%eax
-        shll  $2,%ecx
-        addl  %ecx,%eax
-        movl  %eax,nb430nf_innerjjnr(%esp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $4,%edx
-        addl  nb430nf_ninner(%esp),%ecx
-        movl  %ecx,nb430nf_ninner(%esp)
-        addl  $0,%edx
-        movl  %edx,nb430nf_innerk(%esp)      ## number of innerloop atoms 
-        jge   _nb_kernel430nf_ia32_sse.nb430nf_unroll_loop
-        jmp   _nb_kernel430nf_ia32_sse.nb430nf_finish_inner
-_nb_kernel430nf_ia32_sse.nb430nf_unroll_loop: 
-        ## quad-unroll innerloop here 
-        movl  nb430nf_innerjjnr(%esp),%edx       ## pointer to jjnr[k] 
-        movl  (%edx),%eax
-        movl  4(%edx),%ebx
-        movl  8(%edx),%ecx
-        movl  12(%edx),%edx           ## eax-edx=jnr1-4 
-        addl $16,nb430nf_innerjjnr(%esp)             ## advance pointer (unrolled 4) 
-
-        ## load isa2
-        movl nb430nf_invsqrta(%ebp),%esi
-        movss (%esi,%eax,4),%xmm3
-        movss (%esi,%ecx,4),%xmm4
-        movss (%esi,%ebx,4),%xmm6
-        movss (%esi,%edx,4),%xmm7
-        movaps nb430nf_isai(%esp),%xmm2
-        shufps $0,%xmm6,%xmm3
-        shufps $0,%xmm7,%xmm4
-        shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all charges in xmm3  
-        mulps  %xmm3,%xmm2
-
-        movaps %xmm2,nb430nf_isaprod(%esp)
-        movaps %xmm2,%xmm1
-        mulps nb430nf_gbtsc(%esp),%xmm1
-        movaps %xmm1,nb430nf_gbscale(%esp)
-
-        movl nb430nf_charge(%ebp),%esi     ## base of charge[] 
-
-        movss (%esi,%eax,4),%xmm3
-        movss (%esi,%ecx,4),%xmm4
-        movss (%esi,%ebx,4),%xmm6
-        movss (%esi,%edx,4),%xmm7
-
-        mulps nb430nf_iq(%esp),%xmm2
-        shufps $0,%xmm6,%xmm3
-        shufps $0,%xmm7,%xmm4
-        shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all charges in xmm3  
-        mulps  %xmm2,%xmm3
-        movaps %xmm3,nb430nf_qq(%esp)
-
-        movd  %eax,%mm0         ## use mmx registers as temp storage 
-        movd  %ebx,%mm1
-        movd  %ecx,%mm2
-        movd  %edx,%mm3
-
-        movl nb430nf_type(%ebp),%esi
-        movl (%esi,%eax,4),%eax
-        movl (%esi,%ebx,4),%ebx
-        movl (%esi,%ecx,4),%ecx
-        movl (%esi,%edx,4),%edx
-        movl nb430nf_vdwparam(%ebp),%esi
-        shll %eax
-        shll %ebx
-        shll %ecx
-        shll %edx
-        movl nb430nf_ntia(%esp),%edi
-        addl %edi,%eax
-        addl %edi,%ebx
-        addl %edi,%ecx
-        addl %edi,%edx
-
-        movlps (%esi,%eax,4),%xmm6
-        movlps (%esi,%ecx,4),%xmm7
-        movhps (%esi,%ebx,4),%xmm6
-        movhps (%esi,%edx,4),%xmm7
-
-        movaps %xmm6,%xmm4
-        shufps $136,%xmm7,%xmm4 ## constant 10001000
-        shufps $221,%xmm7,%xmm6 ## constant 11011101
-
-        movd  %mm0,%eax
-        movd  %mm1,%ebx
-        movd  %mm2,%ecx
-        movd  %mm3,%edx
-
-        movaps %xmm4,nb430nf_c6(%esp)
-        movaps %xmm6,nb430nf_c12(%esp)
-
-        movl nb430nf_pos(%ebp),%esi        ## base of pos[] 
-
-        leal  (%eax,%eax,2),%eax     ## replace jnr with j3 
-        leal  (%ebx,%ebx,2),%ebx
-
-        leal  (%ecx,%ecx,2),%ecx     ## replace jnr with j3 
-        leal  (%edx,%edx,2),%edx
-
-        ## move four coordinates to xmm0-xmm2   
-
-        movlps (%esi,%eax,4),%xmm4
-        movlps (%esi,%ecx,4),%xmm5
-        movss 8(%esi,%eax,4),%xmm2
-        movss 8(%esi,%ecx,4),%xmm6
-
-        movhps (%esi,%ebx,4),%xmm4
-        movhps (%esi,%edx,4),%xmm5
-
-        movss 8(%esi,%ebx,4),%xmm0
-        movss 8(%esi,%edx,4),%xmm1
-
-        shufps $0,%xmm0,%xmm2
-        shufps $0,%xmm1,%xmm6
-
-        movaps %xmm4,%xmm0
-        movaps %xmm4,%xmm1
-
-        shufps $136,%xmm6,%xmm2 ## constant 10001000
-
-        shufps $136,%xmm5,%xmm0 ## constant 10001000
-        shufps $221,%xmm5,%xmm1 ## constant 11011101            
-
-        ## move ix-iz to xmm4-xmm6 
-        movaps nb430nf_ix(%esp),%xmm4
-        movaps nb430nf_iy(%esp),%xmm5
-        movaps nb430nf_iz(%esp),%xmm6
-
-        ## calc dr 
-        subps %xmm0,%xmm4
-        subps %xmm1,%xmm5
-        subps %xmm2,%xmm6
-
-        ## square it 
-        mulps %xmm4,%xmm4
-        mulps %xmm5,%xmm5
-        mulps %xmm6,%xmm6
-        addps %xmm5,%xmm4
-        addps %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb430nf_three(%esp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb430nf_half(%esp),%xmm0
-        subps %xmm5,%xmm1       ## constant 30-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r
-        movaps %xmm4,nb430nf_r(%esp)
-        mulps nb430nf_gbscale(%esp),%xmm4
-
-        movhlps %xmm4,%xmm5
-        cvttps2pi %xmm4,%mm6
-        cvttps2pi %xmm5,%mm7    ## mm6/mm7 contain lu indices 
-        cvtpi2ps %mm6,%xmm6
-        cvtpi2ps %mm7,%xmm5
-        movlhps %xmm5,%xmm6
-        subps %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-        pslld $2,%mm6
-        pslld $2,%mm7
-
-        movd %eax,%mm0
-        movd %ebx,%mm1
-        movd %ecx,%mm2
-        movd %edx,%mm3
-
-        movl nb430nf_GBtab(%ebp),%esi
-        movd %mm6,%eax
-        psrlq $32,%mm6
-        movd %mm7,%ecx
-        psrlq $32,%mm7
-        movd %mm6,%ebx
-        movd %mm7,%edx
-
-        ## load coulomb table
-        movaps (%esi,%eax,4),%xmm4
-        movaps (%esi,%ebx,4),%xmm5
-        movaps (%esi,%ecx,4),%xmm6
-        movaps (%esi,%edx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm6,%xmm3
-        shufps $0xEE,%xmm7,%xmm3
-        shufps $0x44,%xmm7,%xmm6
-        movaps %xmm4,%xmm7
-        shufps $0xEE,%xmm5,%xmm7
-        shufps $0x44,%xmm5,%xmm4
-        movaps %xmm4,%xmm5
-        shufps $0xDD,%xmm6,%xmm5
-        shufps $0x88,%xmm6,%xmm4
-        movaps %xmm7,%xmm6
-        shufps $0x88,%xmm3,%xmm6
-        shufps $0xDD,%xmm3,%xmm7
-        ## coulomb table ready, in xmm4-xmm7            
-
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        movaps nb430nf_qq(%esp),%xmm3
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  %xmm3,%xmm5 ## vcoul=qq*VV  
-        addps  nb430nf_vctot(%esp),%xmm5
-        movaps %xmm5,nb430nf_vctot(%esp)
-
-
-        movaps nb430nf_r(%esp),%xmm4
-        mulps nb430nf_tsc(%esp),%xmm4
-
-        movhlps %xmm4,%xmm5
-        cvttps2pi %xmm4,%mm6
-        cvttps2pi %xmm5,%mm7    ## mm6/mm7 contain lu indices 
-        cvtpi2ps %mm6,%xmm6
-        cvtpi2ps %mm7,%xmm5
-        movlhps %xmm5,%xmm6
-        subps %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-        pslld $3,%mm6
-        pslld $3,%mm7
-
-        movl nb430nf_VFtab(%ebp),%esi
-        movd %mm6,%eax
-        psrlq $32,%mm6
-        movd %mm7,%ecx
-        psrlq $32,%mm7
-        movd %mm6,%ebx
-        movd %mm7,%edx
-
-        ## dispersion 
-        movaps (%esi,%eax,4),%xmm4
-        movaps (%esi,%ebx,4),%xmm5
-        movaps (%esi,%ecx,4),%xmm6
-        movaps (%esi,%edx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm6,%xmm3
-        shufps $0xEE,%xmm7,%xmm3
-        shufps $0x44,%xmm7,%xmm6
-        movaps %xmm4,%xmm7
-        shufps $0xEE,%xmm5,%xmm7
-        shufps $0x44,%xmm5,%xmm4
-        movaps %xmm4,%xmm5
-        shufps $0xDD,%xmm6,%xmm5
-        shufps $0x88,%xmm6,%xmm4
-        movaps %xmm7,%xmm6
-        shufps $0x88,%xmm3,%xmm6
-        shufps $0xDD,%xmm3,%xmm7
-        ## dispersion table ready, in xmm4-xmm7         
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  nb430nf_c6(%esp),%xmm5    ## Vvdw6
-        addps  nb430nf_Vvdwtot(%esp),%xmm5
-        movaps %xmm5,nb430nf_Vvdwtot(%esp)
-
-        ## repulsion 
-        movaps 16(%esi,%eax,4),%xmm4
-        movaps 16(%esi,%ebx,4),%xmm5
-        movaps 16(%esi,%ecx,4),%xmm6
-        movaps 16(%esi,%edx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm6,%xmm3
-        shufps $0xEE,%xmm7,%xmm3
-        shufps $0x44,%xmm7,%xmm6
-        movaps %xmm4,%xmm7
-        shufps $0xEE,%xmm5,%xmm7
-        shufps $0x44,%xmm5,%xmm4
-        movaps %xmm4,%xmm5
-        shufps $0xDD,%xmm6,%xmm5
-        shufps $0x88,%xmm6,%xmm4
-        movaps %xmm7,%xmm6
-        shufps $0x88,%xmm3,%xmm6
-        shufps $0xDD,%xmm3,%xmm7
-        ## table ready, in xmm4-xmm7    
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-
-        mulps  nb430nf_c12(%esp),%xmm5   ## Vvdw12
-        addps  nb430nf_Vvdwtot(%esp),%xmm5
-        movaps %xmm5,nb430nf_Vvdwtot(%esp)
-
-        ## should we do one more iteration? 
-        subl $4,nb430nf_innerk(%esp)
-        jl    _nb_kernel430nf_ia32_sse.nb430nf_finish_inner
-        jmp   _nb_kernel430nf_ia32_sse.nb430nf_unroll_loop
-_nb_kernel430nf_ia32_sse.nb430nf_finish_inner: 
-        ## check if at least two particles remain 
-        addl $4,nb430nf_innerk(%esp)
-        movl  nb430nf_innerk(%esp),%edx
-        andl  $2,%edx
-        jnz   _nb_kernel430nf_ia32_sse.nb430nf_dopair
-        jmp   _nb_kernel430nf_ia32_sse.nb430nf_checksingle
-_nb_kernel430nf_ia32_sse.nb430nf_dopair: 
-
-        movl  nb430nf_innerjjnr(%esp),%ecx
-
-        movl  (%ecx),%eax
-        movl  4(%ecx),%ebx
-        addl $8,nb430nf_innerjjnr(%esp)
-
-        xorps %xmm2,%xmm2
-        movaps %xmm2,%xmm6
-
-        ## load isa2
-        movl nb430nf_invsqrta(%ebp),%esi
-        movss (%esi,%eax,4),%xmm2
-        movss (%esi,%ebx,4),%xmm3
-        unpcklps %xmm3,%xmm2    ## isa2 in xmm3(0,1)
-        mulps  nb430nf_isai(%esp),%xmm2
-        movaps %xmm2,nb430nf_isaprod(%esp)
-        movaps %xmm2,%xmm1
-        mulps nb430nf_gbtsc(%esp),%xmm1
-        movaps %xmm1,nb430nf_gbscale(%esp)
-
-        movl nb430nf_charge(%ebp),%esi     ## base of charge[]  
-        movss (%esi,%eax,4),%xmm3
-        movss (%esi,%ebx,4),%xmm6
-        unpcklps %xmm6,%xmm3 ## constant 00001000 ;# xmm3(0,1) has the charges 
-
-        mulps  nb430nf_iq(%esp),%xmm2
-        mulps  %xmm2,%xmm3
-        movaps %xmm3,nb430nf_qq(%esp)
-
-        movl nb430nf_type(%ebp),%esi
-        movl  %eax,%ecx
-        movl  %ebx,%edx
-        movl (%esi,%ecx,4),%ecx
-        movl (%esi,%edx,4),%edx
-        movl nb430nf_vdwparam(%ebp),%esi
-        shll %ecx
-        shll %edx
-        movl nb430nf_ntia(%esp),%edi
-        addl %edi,%ecx
-        addl %edi,%edx
-        movlps (%esi,%ecx,4),%xmm6
-        movhps (%esi,%edx,4),%xmm6
-        movl nb430nf_pos(%ebp),%edi
-
-        movaps %xmm6,%xmm4
-        shufps $8,%xmm4,%xmm4 ## constant 00001000       
-        shufps $13,%xmm6,%xmm6 ## constant 00001101
-        movlhps %xmm7,%xmm4
-        movlhps %xmm7,%xmm6
-
-        movaps %xmm4,nb430nf_c6(%esp)
-        movaps %xmm6,nb430nf_c12(%esp)
-
-        leal  (%eax,%eax,2),%eax
-        leal  (%ebx,%ebx,2),%ebx
-        ## move coordinates to xmm0-xmm2 
-        movlps (%edi,%eax,4),%xmm1
-        movss 8(%edi,%eax,4),%xmm2
-        movhps (%edi,%ebx,4),%xmm1
-        movss 8(%edi,%ebx,4),%xmm0
-
-        movlhps %xmm7,%xmm3
-
-        shufps $0,%xmm0,%xmm2
-
-        movaps %xmm1,%xmm0
-
-        shufps $136,%xmm2,%xmm2 ## constant 10001000
-
-        shufps $136,%xmm0,%xmm0 ## constant 10001000
-        shufps $221,%xmm1,%xmm1 ## constant 11011101
-
-        movl   nb430nf_faction(%ebp),%edi
-        ## move ix-iz to xmm4-xmm6 
-        xorps   %xmm7,%xmm7
-
-        movaps nb430nf_ix(%esp),%xmm4
-        movaps nb430nf_iy(%esp),%xmm5
-        movaps nb430nf_iz(%esp),%xmm6
-
-        ## calc dr 
-        subps %xmm0,%xmm4
-        subps %xmm1,%xmm5
-        subps %xmm2,%xmm6
-
-        ## square it 
-        mulps %xmm4,%xmm4
-        mulps %xmm5,%xmm5
-        mulps %xmm6,%xmm6
-        addps %xmm5,%xmm4
-        addps %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb430nf_three(%esp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb430nf_half(%esp),%xmm0
-        subps %xmm5,%xmm1       ## constant 30-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r 
-        movaps %xmm4,nb430nf_r(%esp)
-        mulps nb430nf_gbscale(%esp),%xmm4
-
-        cvttps2pi %xmm4,%mm6    ## mm6 contain lu indices 
-        cvtpi2ps %mm6,%xmm6
-        subps %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-
-        pslld $2,%mm6
-
-        movl nb430nf_GBtab(%ebp),%esi
-        movd %mm6,%ecx
-        psrlq $32,%mm6
-        movd %mm6,%edx
-
-        ## load coulomb table
-        movaps (%esi,%ecx,4),%xmm4
-        movaps (%esi,%edx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm4,%xmm6
-        unpcklps %xmm7,%xmm4    ## Y1 Y2 F1 F2 
-        unpckhps %xmm7,%xmm6    ## G1 G2 H1 H2
-        movhlps  %xmm4,%xmm5    ## F1 F2 
-        movhlps  %xmm6,%xmm7    ## H1 H2
-        ## coulomb table ready, in xmm4-xmm7    
-
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        movaps nb430nf_qq(%esp),%xmm3
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  %xmm3,%xmm5 ## vcoul=qq*VV  
-        addps  nb430nf_vctot(%esp),%xmm5
-        movaps %xmm5,nb430nf_vctot(%esp)
-
-        movaps nb430nf_r(%esp),%xmm4
-        mulps nb430nf_tsc(%esp),%xmm4
-
-        cvttps2pi %xmm4,%mm6
-        cvtpi2ps %mm6,%xmm6
-        subps %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-        pslld $3,%mm6
-
-        movl nb430nf_VFtab(%ebp),%esi
-        movd %mm6,%ecx
-        psrlq $32,%mm6
-        movd %mm6,%edx
-
-        ## dispersion 
-        movaps (%esi,%ecx,4),%xmm4
-        movaps (%esi,%edx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm4,%xmm6
-        unpcklps %xmm7,%xmm4    ## Y1 Y2 F1 F2 
-        unpckhps %xmm7,%xmm6    ## G1 G2 H1 H2
-        movhlps  %xmm4,%xmm5    ## F1 F2 
-        movhlps  %xmm6,%xmm7    ## H1 H2
-        ## dispersion table ready, in xmm4-xmm7         
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-
-        mulps  nb430nf_c6(%esp),%xmm5    ## Vvdw6 
-        addps  nb430nf_Vvdwtot(%esp),%xmm5
-        movaps %xmm5,nb430nf_Vvdwtot(%esp)
-
-        ## repulsion 
-        movaps 16(%esi,%ecx,4),%xmm4
-        movaps 16(%esi,%edx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm4,%xmm6
-        unpcklps %xmm7,%xmm4    ## Y1 Y2 F1 F2 
-        unpckhps %xmm7,%xmm6    ## G1 G2 H1 H2
-        movhlps  %xmm4,%xmm5    ## F1 F2 
-        movhlps  %xmm6,%xmm7    ## H1 H2
-        ## table ready, in xmm4-xmm7    
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-
-        mulps  nb430nf_c12(%esp),%xmm5   ## Vvdw12 
-
-        addps  nb430nf_Vvdwtot(%esp),%xmm5
-        movaps %xmm5,nb430nf_Vvdwtot(%esp)
-_nb_kernel430nf_ia32_sse.nb430nf_checksingle:   
-        movl  nb430nf_innerk(%esp),%edx
-        andl  $1,%edx
-        jnz    _nb_kernel430nf_ia32_sse.nb430nf_dosingle
-        jmp    _nb_kernel430nf_ia32_sse.nb430nf_updateouterdata
-_nb_kernel430nf_ia32_sse.nb430nf_dosingle: 
-        movl nb430nf_charge(%ebp),%esi
-        movl nb430nf_invsqrta(%ebp),%edx
-        movl nb430nf_pos(%ebp),%edi
-        movl  nb430nf_innerjjnr(%esp),%ecx
-        movl  (%ecx),%eax
-        xorps  %xmm2,%xmm2
-        movaps %xmm2,%xmm6
-        movss (%edx,%eax,4),%xmm2       ## isa2
-        mulss nb430nf_isai(%esp),%xmm2
-        movss %xmm2,nb430nf_isaprod(%esp)
-        movss %xmm2,%xmm1
-        mulss nb430nf_gbtsc(%esp),%xmm1
-        movss %xmm1,nb430nf_gbscale(%esp)
-
-        mulss  nb430nf_iq(%esp),%xmm2
-        movss (%esi,%eax,4),%xmm6       ## xmm6(0) has the charge       
-        mulss  %xmm2,%xmm6
-        movss %xmm6,nb430nf_qq(%esp)
-
-        movl nb430nf_type(%ebp),%esi
-        movl %eax,%ecx
-        movl (%esi,%ecx,4),%ecx
-        movl nb430nf_vdwparam(%ebp),%esi
-        shll %ecx
-        addl nb430nf_ntia(%esp),%ecx
-        movlps (%esi,%ecx,4),%xmm6
-        movaps %xmm6,%xmm4
-        shufps $252,%xmm4,%xmm4 ## constant 11111100    
-        shufps $253,%xmm6,%xmm6 ## constant 11111101    
-
-        movss %xmm4,nb430nf_c6(%esp)
-        movss %xmm6,nb430nf_c12(%esp)
-
-        leal  (%eax,%eax,2),%eax
-
-        ## move coordinates to xmm0-xmm2 
-        movss (%edi,%eax,4),%xmm0
-        movss 4(%edi,%eax,4),%xmm1
-        movss 8(%edi,%eax,4),%xmm2
-
-        movss nb430nf_ix(%esp),%xmm4
-        movss nb430nf_iy(%esp),%xmm5
-        movss nb430nf_iz(%esp),%xmm6
-
-        ## calc dr 
-        subss %xmm0,%xmm4
-        subss %xmm1,%xmm5
-        subss %xmm2,%xmm6
-
-        ## square it 
-        mulss %xmm4,%xmm4
-        mulss %xmm5,%xmm5
-        mulss %xmm6,%xmm6
-        addss %xmm5,%xmm4
-        addss %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtss %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulss %xmm5,%xmm5
-        movss nb430nf_three(%esp),%xmm1
-        mulss %xmm4,%xmm5       ## rsq*lu*lu                    
-        movss nb430nf_half(%esp),%xmm0
-        subss %xmm5,%xmm1       ## constant 30-rsq*lu*lu 
-        mulss %xmm2,%xmm1
-        mulss %xmm1,%xmm0       ## xmm0=rinv 
-
-        mulss %xmm0,%xmm4       ## xmm4=r 
-        movaps %xmm4,nb430nf_r(%esp)
-        mulss nb430nf_gbscale(%esp),%xmm4
-
-        cvttss2si %xmm4,%ebx    ## mm6 contain lu indices 
-        cvtsi2ss %ebx,%xmm6
-        subss %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulss  %xmm2,%xmm2      ## xmm2=eps2 
-
-        shll $2,%ebx
-
-        movl nb430nf_GBtab(%ebp),%esi
-
-        movaps (%esi,%ebx,4),%xmm4
-        movhlps %xmm4,%xmm6
-        movaps %xmm4,%xmm5
-        movaps %xmm6,%xmm7
-        shufps $1,%xmm5,%xmm5
-        shufps $1,%xmm7,%xmm7
-        ## table ready in xmm4-xmm7 
-
-        mulss  %xmm1,%xmm6      ## xmm6=Geps 
-        mulss  %xmm2,%xmm7      ## xmm7=Heps2 
-        addss  %xmm6,%xmm5
-        addss  %xmm7,%xmm5      ## xmm5=Fp      
-        movss nb430nf_qq(%esp),%xmm3
-        mulss  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addss  %xmm4,%xmm5 ## xmm5=VV 
-        mulss  %xmm3,%xmm5 ## vcoul=qq*VV  
-        addss  nb430nf_vctot(%esp),%xmm5
-        movss %xmm5,nb430nf_vctot(%esp)
-
-        movss nb430nf_r(%esp),%xmm4
-        mulps nb430nf_tsc(%esp),%xmm4
-
-        cvttss2si %xmm4,%ebx
-        cvtsi2ss %ebx,%xmm6
-        subss %xmm6,%xmm4
-        movss %xmm4,%xmm1       ## xmm1=eps 
-        movss %xmm1,%xmm2
-        mulss  %xmm2,%xmm2      ## xmm2=eps2 
-
-        shll $3,%ebx
-        movl nb430nf_VFtab(%ebp),%esi
-
-        ## dispersion 
-        movaps (%esi,%ebx,4),%xmm4
-        movhlps %xmm4,%xmm6
-        movaps %xmm4,%xmm5
-        movaps %xmm6,%xmm7
-        shufps $1,%xmm5,%xmm5
-        shufps $1,%xmm7,%xmm7
-        ## table ready in xmm4-xmm7 
-
-        mulss  %xmm1,%xmm6      ## xmm6=Geps 
-        mulss  %xmm2,%xmm7      ## xmm7=Heps2 
-        addss  %xmm6,%xmm5
-        addss  %xmm7,%xmm5      ## xmm5=Fp      
-        mulss  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addss  %xmm4,%xmm5 ## xmm5=VV 
-        mulss  nb430nf_c6(%esp),%xmm5    ## Vvdw6
-        addss  nb430nf_Vvdwtot(%esp),%xmm5
-        movss %xmm5,nb430nf_Vvdwtot(%esp)
-
-        ## repulsion 
-        movaps 16(%esi,%ebx,4),%xmm4
-        movhlps %xmm4,%xmm6
-        movaps %xmm4,%xmm5
-        movaps %xmm6,%xmm7
-        shufps $1,%xmm5,%xmm5
-        shufps $1,%xmm7,%xmm7
-        ## table ready in xmm4-xmm7 
-
-        mulss  %xmm1,%xmm6      ## xmm6=Geps 
-        mulss  %xmm2,%xmm7      ## xmm7=Heps2 
-        addss  %xmm6,%xmm5
-        addss  %xmm7,%xmm5      ## xmm5=Fp      
-        mulss  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addss  %xmm4,%xmm5 ## xmm5=VV 
-
-        mulss  nb430nf_c12(%esp),%xmm5   ## Vvdw12 
-
-        addss  nb430nf_Vvdwtot(%esp),%xmm5
-        movss %xmm5,nb430nf_Vvdwtot(%esp)
-
-_nb_kernel430nf_ia32_sse.nb430nf_updateouterdata: 
-        ## get n from stack
-        movl nb430nf_n(%esp),%esi
-        ## get group index for i particle 
-        movl  nb430nf_gid(%ebp),%edx            ## base of gid[]
-        movl  (%edx,%esi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total potential energy and update it 
-        movaps nb430nf_vctot(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        ## add earlier value from mem 
-        movl  nb430nf_Vc(%ebp),%eax
-        addss (%eax,%edx,4),%xmm7
-        ## move back to mem 
-        movss %xmm7,(%eax,%edx,4)
-
-        ## accumulate total lj energy and update it 
-        movaps nb430nf_Vvdwtot(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        ## add earlier value from mem 
-        movl  nb430nf_Vvdw(%ebp),%eax
-        addss (%eax,%edx,4),%xmm7
-        ## move back to mem 
-        movss %xmm7,(%eax,%edx,4)
-
-        ## finish if last 
-        movl nb430nf_nn1(%esp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx
-        jz _nb_kernel430nf_ia32_sse.nb430nf_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb430nf_n(%esp)
-        jmp _nb_kernel430nf_ia32_sse.nb430nf_outer
-_nb_kernel430nf_ia32_sse.nb430nf_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb430nf_nri(%esp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx
-        jz _nb_kernel430nf_ia32_sse.nb430nf_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel430nf_ia32_sse.nb430nf_threadloop
-_nb_kernel430nf_ia32_sse.nb430nf_end: 
-        emms
-
-        movl nb430nf_nouter(%esp),%eax
-        movl nb430nf_ninner(%esp),%ebx
-        movl nb430nf_outeriter(%ebp),%ecx
-        movl nb430nf_inneriter(%ebp),%edx
-        movl %eax,(%ecx)
-        movl %ebx,(%edx)
-
-        movl nb430nf_salign(%esp),%eax
-        addl %eax,%esp
-        addl $324,%esp
-        popl %edi
-        popl %esi
-        popl %edx
-        popl %ecx
-        popl %ebx
-        popl %eax
-        leave
-        ret
-
-
-
-
-
diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/Makefile.am b/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/Makefile.am
index d28786df9b..fbf7bbfd5b 100644
--- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/Makefile.am
+++ b/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/Makefile.am
@@ -59,32 +59,32 @@ libnb_kernel_ia32_sse2_la_SOURCES = \
 
 
 EXTRA_DIST = \
-	nb_kernel010_ia32_sse2.intel_syntax.s	nb_kernel030_ia32_sse2.intel_syntax.s	\
-	nb_kernel100_ia32_sse2.intel_syntax.s	nb_kernel101_ia32_sse2.intel_syntax.s	\
-	nb_kernel102_ia32_sse2.intel_syntax.s	nb_kernel103_ia32_sse2.intel_syntax.s	\
-	nb_kernel104_ia32_sse2.intel_syntax.s	nb_kernel110_ia32_sse2.intel_syntax.s	\
-	nb_kernel111_ia32_sse2.intel_syntax.s	nb_kernel112_ia32_sse2.intel_syntax.s	\
-	nb_kernel113_ia32_sse2.intel_syntax.s	nb_kernel114_ia32_sse2.intel_syntax.s	\
-	nb_kernel130_ia32_sse2.intel_syntax.s	nb_kernel131_ia32_sse2.intel_syntax.s	\
-	nb_kernel132_ia32_sse2.intel_syntax.s	nb_kernel133_ia32_sse2.intel_syntax.s	\
-	nb_kernel134_ia32_sse2.intel_syntax.s	nb_kernel200_ia32_sse2.intel_syntax.s	\
-	nb_kernel201_ia32_sse2.intel_syntax.s	nb_kernel202_ia32_sse2.intel_syntax.s	\
-	nb_kernel203_ia32_sse2.intel_syntax.s	nb_kernel204_ia32_sse2.intel_syntax.s	\
-	nb_kernel210_ia32_sse2.intel_syntax.s	nb_kernel211_ia32_sse2.intel_syntax.s	\
-	nb_kernel212_ia32_sse2.intel_syntax.s	nb_kernel213_ia32_sse2.intel_syntax.s	\
-	nb_kernel214_ia32_sse2.intel_syntax.s	nb_kernel230_ia32_sse2.intel_syntax.s	\
-	nb_kernel231_ia32_sse2.intel_syntax.s	nb_kernel232_ia32_sse2.intel_syntax.s	\
-	nb_kernel233_ia32_sse2.intel_syntax.s	nb_kernel234_ia32_sse2.intel_syntax.s	\
-	nb_kernel300_ia32_sse2.intel_syntax.s	nb_kernel301_ia32_sse2.intel_syntax.s	\
-	nb_kernel302_ia32_sse2.intel_syntax.s	nb_kernel303_ia32_sse2.intel_syntax.s	\
-	nb_kernel304_ia32_sse2.intel_syntax.s	nb_kernel310_ia32_sse2.intel_syntax.s	\
-	nb_kernel311_ia32_sse2.intel_syntax.s	nb_kernel312_ia32_sse2.intel_syntax.s	\
-	nb_kernel313_ia32_sse2.intel_syntax.s	nb_kernel314_ia32_sse2.intel_syntax.s	\
-	nb_kernel330_ia32_sse2.intel_syntax.s	nb_kernel331_ia32_sse2.intel_syntax.s	\
-	nb_kernel332_ia32_sse2.intel_syntax.s	nb_kernel333_ia32_sse2.intel_syntax.s	\
-	nb_kernel334_ia32_sse2.intel_syntax.s	nb_kernel400_ia32_sse2.intel_syntax.s	\
-	nb_kernel410_ia32_sse2.intel_syntax.s	nb_kernel430_ia32_sse2.intel_syntax.s	\
-	nb_kernel_ia32_sse2_test_asm.intel_syntax.s
+	nb_kernel010_ia32_sse2_intel_syntax.s	nb_kernel030_ia32_sse2_intel_syntax.s	\
+	nb_kernel100_ia32_sse2_intel_syntax.s	nb_kernel101_ia32_sse2_intel_syntax.s	\
+	nb_kernel102_ia32_sse2_intel_syntax.s	nb_kernel103_ia32_sse2_intel_syntax.s	\
+	nb_kernel104_ia32_sse2_intel_syntax.s	nb_kernel110_ia32_sse2_intel_syntax.s	\
+	nb_kernel111_ia32_sse2_intel_syntax.s	nb_kernel112_ia32_sse2_intel_syntax.s	\
+	nb_kernel113_ia32_sse2_intel_syntax.s	nb_kernel114_ia32_sse2_intel_syntax.s	\
+	nb_kernel130_ia32_sse2_intel_syntax.s	nb_kernel131_ia32_sse2_intel_syntax.s	\
+	nb_kernel132_ia32_sse2_intel_syntax.s	nb_kernel133_ia32_sse2_intel_syntax.s	\
+	nb_kernel134_ia32_sse2_intel_syntax.s	nb_kernel200_ia32_sse2_intel_syntax.s	\
+	nb_kernel201_ia32_sse2_intel_syntax.s	nb_kernel202_ia32_sse2_intel_syntax.s	\
+	nb_kernel203_ia32_sse2_intel_syntax.s	nb_kernel204_ia32_sse2_intel_syntax.s	\
+	nb_kernel210_ia32_sse2_intel_syntax.s	nb_kernel211_ia32_sse2_intel_syntax.s	\
+	nb_kernel212_ia32_sse2_intel_syntax.s	nb_kernel213_ia32_sse2_intel_syntax.s	\
+	nb_kernel214_ia32_sse2_intel_syntax.s	nb_kernel230_ia32_sse2_intel_syntax.s	\
+	nb_kernel231_ia32_sse2_intel_syntax.s	nb_kernel232_ia32_sse2_intel_syntax.s	\
+	nb_kernel233_ia32_sse2_intel_syntax.s	nb_kernel234_ia32_sse2_intel_syntax.s	\
+	nb_kernel300_ia32_sse2_intel_syntax.s	nb_kernel301_ia32_sse2_intel_syntax.s	\
+	nb_kernel302_ia32_sse2_intel_syntax.s	nb_kernel303_ia32_sse2_intel_syntax.s	\
+	nb_kernel304_ia32_sse2_intel_syntax.s	nb_kernel310_ia32_sse2_intel_syntax.s	\
+	nb_kernel311_ia32_sse2_intel_syntax.s	nb_kernel312_ia32_sse2_intel_syntax.s	\
+	nb_kernel313_ia32_sse2_intel_syntax.s	nb_kernel314_ia32_sse2_intel_syntax.s	\
+	nb_kernel330_ia32_sse2_intel_syntax.s	nb_kernel331_ia32_sse2_intel_syntax.s	\
+	nb_kernel332_ia32_sse2_intel_syntax.s	nb_kernel333_ia32_sse2_intel_syntax.s	\
+	nb_kernel334_ia32_sse2_intel_syntax.s	nb_kernel400_ia32_sse2_intel_syntax.s	\
+	nb_kernel410_ia32_sse2_intel_syntax.s	nb_kernel430_ia32_sse2_intel_syntax.s	\
+	nb_kernel_ia32_sse2_test_asm_intel_syntax.s
 
 
 
diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel400_ia32_sse2.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel400_ia32_sse2.intel_syntax.s
deleted file mode 100644
index c9c1dfb868..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel400_ia32_sse2.intel_syntax.s
+++ /dev/null
@@ -1,1287 +0,0 @@
-;#
-;#
-;# Gromacs 4.0                         Copyright (c) 1991-2003 
-;# David van der Spoel, Erik Lindahl
-;#
-;# This program is free software; you can redistribute it and/or
-;# modify it under the terms of the GNU General Public License
-;# as published by the Free Software Foundation; either version 2
-;# of the License, or (at your option) any later version.
-;#
-;# To help us fund GROMACS development, we humbly ask that you cite
-;# the research papers on the package. Check out http://www.gromacs.org
-;# 
-;# And Hey:
-;# Gnomes, ROck Monsters And Chili Sauce
-;#
-
-;# These files require GNU binutils 2.10 or later, since we
-;# use intel syntax for portability, or a recent version 
-;# of NASM that understands Extended 3DNow and SSE2 instructions.
-;# (NASM is normally only used with MS Visual C++).
-;# Since NASM and gnu as disagree on some definitions and use 
-;# completely different preprocessing options I have to introduce a
-;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
-;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
-;# reason why all comments need both symbols...
-;# The source is written for GNU as, with intel syntax. When you use
-;# NASM we redefine a couple of things. The false if-statement around 
-;# the following code is seen by GNU as, but NASM doesn't see it, so 
-;# the code inside is read by NASM but not gcc.
-; .if 0    # block below only read by NASM
-%define .section	section
-%define .long		dd
-%define .align		align
-%define .globl		global
-;# NASM only wants 'dword', not 'dword ptr'.
-%define ptr
-%macro .equiv                  2
-   %1 equ %2
-%endmacro
-; .endif                   # End of NASM-specific block
-; .intel_syntax noprefix   # Line only read by gnu as
-
-
-
-
-.globl nb_kernel400_ia32_sse2
-.globl _nb_kernel400_ia32_sse2
-nb_kernel400_ia32_sse2:	
-_nb_kernel400_ia32_sse2:	
-.equiv          nb400_p_nri,            8
-.equiv          nb400_iinr,             12
-.equiv          nb400_jindex,           16
-.equiv          nb400_jjnr,             20
-.equiv          nb400_shift,            24
-.equiv          nb400_shiftvec,         28
-.equiv          nb400_fshift,           32
-.equiv          nb400_gid,              36
-.equiv          nb400_pos,              40
-.equiv          nb400_faction,          44
-.equiv          nb400_charge,           48
-.equiv          nb400_p_facel,          52
-.equiv          nb400_argkrf,           56
-.equiv          nb400_argcrf,           60
-.equiv          nb400_Vc,               64
-.equiv          nb400_type,             68
-.equiv          nb400_p_ntype,          72
-.equiv          nb400_vdwparam,         76
-.equiv          nb400_Vvdw,             80
-.equiv          nb400_p_tabscale,       84
-.equiv          nb400_VFtab,            88
-.equiv          nb400_invsqrta,         92
-.equiv          nb400_dvda,             96
-.equiv          nb400_p_gbtabscale,     100
-.equiv          nb400_GBtab,            104
-.equiv          nb400_p_nthreads,       108
-.equiv          nb400_count,            112
-.equiv          nb400_mtx,              116
-.equiv          nb400_outeriter,        120
-.equiv          nb400_inneriter,        124
-.equiv          nb400_work,             128
-	;# stack offsets for local variables  
-	;# bottom of stack is cache-aligned for sse2 use 
-.equiv          nb400_ix,               0
-.equiv          nb400_iy,               16
-.equiv          nb400_iz,               32
-.equiv          nb400_iq,               48
-.equiv          nb400_dx,               64
-.equiv          nb400_dy,               80
-.equiv          nb400_dz,               96
-.equiv          nb400_two,              112
-.equiv          nb400_gbtsc,            128
-.equiv          nb400_qq,               144
-.equiv          nb400_r,                160
-.equiv          nb400_vctot,            176
-.equiv          nb400_fix,              192
-.equiv          nb400_fiy,              208
-.equiv          nb400_fiz,              224
-.equiv          nb400_half,             240
-.equiv          nb400_three,            256
-.equiv          nb400_isai,             272
-.equiv          nb400_isaprod,          288
-.equiv          nb400_dvdasum,          304
-.equiv          nb400_gbscale,          320
-.equiv          nb400_is3,              336
-.equiv          nb400_ii3,              340
-.equiv          nb400_ii,               344
-.equiv          nb400_innerjjnr,        348
-.equiv          nb400_innerk,           352
-.equiv          nb400_n,                356
-.equiv          nb400_nn1,              360
-.equiv          nb400_nri,              364
-.equiv          nb400_facel,            368   ;# uses 8 bytes
-.equiv          nb400_nouter,           376
-.equiv          nb400_ninner,           380
-.equiv          nb400_salign,           384
-	push ebp
-	mov ebp,esp	
-	push eax
-	push ebx
-	push ecx
-	push edx
-	push esi
-	push edi
-	sub esp, 388		;# local stack space 
-	mov  eax, esp
-	and  eax, 0xf
-	sub esp, eax
-	mov [esp + nb400_salign], eax
-
-	emms
-
-	;# Move args passed by reference to stack
-	mov ecx, [ebp + nb400_p_nri]
-	mov esi, [ebp + nb400_p_facel]
-	mov ecx, [ecx]
-	movsd xmm7, [esi]
-	mov [esp + nb400_nri], ecx
-	movsd [esp + nb400_facel], xmm7
-
-	;# zero iteration counters
-	mov eax, 0
-	mov [esp + nb400_nouter], eax
-	mov [esp + nb400_ninner], eax
-
-
-	mov eax, [ebp + nb400_p_gbtabscale]
-	movsd xmm3, [eax]
-	shufpd xmm3, xmm3, 0
-	movapd [esp + nb400_gbtsc], xmm3
-
-	;# create constant floating-point factors on stack
-	mov eax, 0x00000000     ;# lower half of double 0.5 IEEE (hex)
-	mov ebx, 0x3fe00000
-	mov [esp + nb400_half], eax
-	mov [esp + nb400_half+4], ebx
-	movsd xmm1, [esp + nb400_half]
-	shufpd xmm1, xmm1, 0    ;# splat to all elements
-	movapd xmm3, xmm1
-	addpd  xmm3, xmm3       ;# 1.0
-	movapd xmm2, xmm3
-	addpd  xmm2, xmm2       ;# 2.0
-	addpd  xmm3, xmm2	;# 3.0
-	movapd [esp + nb400_half], xmm1
-	movapd [esp + nb400_two], xmm2
-	movapd [esp + nb400_three], xmm3
-
-.nb400_threadloop:
-        mov   esi, [ebp + nb400_count]          ;# pointer to sync counter
-        mov   eax, [esi]
-.nb400_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+10
-        lock
-        cmpxchg [esi], ebx                      ;# write nn1 to *counter,
-                                                ;# if it hasnt changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb400_spinlock
-
-        ;# if(nn1>nri) nn1=nri
-        mov ecx, [esp + nb400_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [esp + nb400_n], eax
-        mov [esp + nb400_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists
-	mov esi, eax				;# copy n to esi
-        jg  .nb400_outerstart
-        jmp .nb400_end
-
-.nb400_outerstart:
-	;# ebx contains number of outer iterations
-	add ebx, [esp + nb400_nouter]
-	mov [esp + nb400_nouter], ebx
-
-.nb400_outer:
-	mov   eax, [ebp + nb400_shift]      ;# eax = pointer into shift[] 
-	mov   ebx, [eax+esi*4]		;# ebx=shift[n] 
-	
-	lea   ebx, [ebx + ebx*2]    ;# ebx=3*is 
-	mov   [esp + nb400_is3],ebx    	;# store is3 
-
-	mov   eax, [ebp + nb400_shiftvec]   ;# eax = base of shiftvec[] 
-
-	movsd xmm0, [eax + ebx*8]
-	movsd xmm1, [eax + ebx*8 + 8]
-	movsd xmm2, [eax + ebx*8 + 16] 
-
-	mov   ecx, [ebp + nb400_iinr]       ;# ecx = pointer into iinr[] 	
-	mov   ebx, [ecx+esi*4]	    ;# ebx =ii 
-	mov   [esp + nb400_ii], ebx
-	
-	mov   edx, [ebp + nb400_charge]
-	movsd xmm3, [edx + ebx*8]	
-	mulsd xmm3, [esp + nb400_facel]
-	shufpd xmm3, xmm3, 0
-
-	mov   edx, [ebp + nb400_invsqrta]	;# load invsqrta[ii]
-	movsd xmm4, [edx + ebx*8]
-	shufpd xmm4, xmm4, 0
-
-	lea   ebx, [ebx + ebx*2]	;# ebx = 3*ii=ii3 
-	mov   eax, [ebp + nb400_pos]    ;# eax = base of pos[]  
-
-	addsd xmm0, [eax + ebx*8]
-	addsd xmm1, [eax + ebx*8 + 8]
-	addsd xmm2, [eax + ebx*8 + 16]
-
-	movapd [esp + nb400_iq], xmm3
-	movapd [esp + nb400_isai], xmm4
-	
-	shufpd xmm0, xmm0, 0
-	shufpd xmm1, xmm1, 0
-	shufpd xmm2, xmm2, 0
-
-	movapd [esp + nb400_ix], xmm0
-	movapd [esp + nb400_iy], xmm1
-	movapd [esp + nb400_iz], xmm2
-
-	mov   [esp + nb400_ii3], ebx
-	
-	;# clear vctot and i forces 
-	xorpd xmm4, xmm4
-	movapd [esp + nb400_vctot], xmm4
-	movapd [esp + nb400_dvdasum], xmm4
-	movapd [esp + nb400_fix], xmm4
-	movapd [esp + nb400_fiy], xmm4
-	movapd [esp + nb400_fiz], xmm4
-	
-	mov   eax, [ebp + nb400_jindex]
-	mov   ecx, [eax + esi*4]	     ;# jindex[n] 
-	mov   edx, [eax + esi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   esi, [ebp + nb400_pos]
-	mov   edi, [ebp + nb400_faction]	
-	mov   eax, [ebp + nb400_jjnr]
-	shl   ecx, 2
-	add   eax, ecx
-	mov   [esp + nb400_innerjjnr], eax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  2
-	add   ecx, [esp + nb400_ninner]
-	mov   [esp + nb400_ninner], ecx
-	add   edx, 0
-	mov   [esp + nb400_innerk], edx    ;# number of innerloop atoms 
-	jge   .nb400_unroll_loop
-	jmp   .nb400_checksingle
-.nb400_unroll_loop:
-	;# twice unrolled innerloop here 
-	mov   edx, [esp + nb400_innerjjnr]   ;# pointer to jjnr[k] 
-	mov   eax, [edx]
-	mov   ebx, [edx + 4]
-	add dword ptr [esp + nb400_innerjjnr], 8	;# advance pointer (unrolled 2) 
-
-	;# load isaj
-	mov esi, [ebp + nb400_invsqrta]
-	movlpd xmm2, [esi + eax*8]
-	movhpd xmm2, [esi + ebx*8]
-	mulpd  xmm2, [esp + nb400_isai]
-	movapd [esp + nb400_isaprod], xmm2	
-	movapd xmm1, xmm2
-	mulpd xmm1, [esp + nb400_gbtsc]
-	movapd [esp + nb400_gbscale], xmm1
-	
-	mov esi, [ebp + nb400_charge]    ;# base of charge[] 
-	movlpd xmm3, [esi + eax*8]
-	movhpd xmm3, [esi + ebx*8]
-
-	mulpd xmm2, [esp + nb400_iq]
-	mulpd  xmm3, xmm2
-	movapd [esp + nb400_qq], xmm3	
-	
-	mov esi, [ebp + nb400_pos]		;# base of pos[] 
-
-	movd  mm2, eax
-	movd  mm3, ebx
-	lea   eax, [eax + eax*2]     ;# replace jnr with j3 
-	lea   ebx, [ebx + ebx*2]	
-
-	;# move two coordinates to xmm0-xmm2 
-	movlpd xmm0, [esi + eax*8]
-	movlpd xmm1, [esi + eax*8 + 8]
-	movlpd xmm2, [esi + eax*8 + 16]
-	movhpd xmm0, [esi + ebx*8]
-	movhpd xmm1, [esi + ebx*8 + 8]
-	movhpd xmm2, [esi + ebx*8 + 16]		
-
-	mov    edi, [ebp + nb400_faction]
-	
-	;# move nb400_ix-iz to xmm4-xmm6 
-	movapd xmm4, [esp + nb400_ix]
-	movapd xmm5, [esp + nb400_iy]
-	movapd xmm6, [esp + nb400_iz]
-
-	;# calc dr 
-	subpd xmm4, xmm0
-	subpd xmm5, xmm1
-	subpd xmm6, xmm2
-
-	;# store dr 
-	movapd [esp + nb400_dx], xmm4
-	movapd [esp + nb400_dy], xmm5
-	movapd [esp + nb400_dz], xmm6
-	;# square it 
-	mulpd xmm4,xmm4
-	mulpd xmm5,xmm5
-	mulpd xmm6,xmm6
-	addpd xmm4, xmm5
-	addpd xmm4, xmm6
-	;# rsq in xmm4 
-
-	cvtpd2ps xmm5, xmm4	
-	rsqrtps xmm5, xmm5
-	cvtps2pd xmm2, xmm5	;# lu in low xmm2 
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulpd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [esp + nb400_three]
-	mulpd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb400_half]
-	subpd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulpd xmm1, xmm5	
-	mulpd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulpd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [esp + nb400_three]
-	mulpd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb400_half]
-	subpd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulpd xmm2, xmm5	
-	mulpd xmm0, xmm2	;# xmm0=iter2 of rinv (new lu) 
-	mulpd xmm4, xmm0	;# xmm4=r 
-	movapd [esp + nb400_r], xmm4
-	mulpd xmm4, [esp + nb400_gbscale]
-
-	cvttpd2pi mm6, xmm4	;# mm6 = lu idx 
-	cvtpi2pd xmm5, mm6
-	subpd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	movapd xmm2, xmm1	
-	mulpd  xmm2, xmm2	;# xmm2=eps2 
-	
-	pslld mm6, 2		;# idx *= 4 
-	
-	movd mm0, eax	
-	movd mm1, ebx
-
-	mov  esi, [ebp + nb400_GBtab]
-	movd eax, mm6
-	psrlq mm6, 32
-	movd ebx, mm6		;# indices in eax/ebx 
-
-	movapd xmm4, [esi + eax*8]	;# Y1 F1 	
-	movapd xmm3, [esi + ebx*8]	;# Y2 F2 
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 Y2 
-	unpckhpd xmm5, xmm3	;# F1 F2 
-
-	movapd xmm6, [esi + eax*8 + 16]	;# G1 H1 	
-	movapd xmm3, [esi + ebx*8 + 16]	;# G2 H2 
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 G2 
-	unpckhpd xmm7, xmm3	;# H1 H2 
-	;# coulomb table ready, in xmm4-xmm7  		
-	mulpd  xmm6, xmm1	;# xmm6=Geps 
-	mulpd  xmm7, xmm2	;# xmm7=Heps2 
-	addpd  xmm5, xmm6
-	addpd  xmm5, xmm7	;# xmm5=Fp 	
-	mulpd  xmm7, [esp + nb400_two]	;# two*Heps2 
-	movapd xmm3, [esp + nb400_qq]
-	addpd  xmm7, xmm6
-	addpd  xmm7, xmm5 ;# xmm7=FF 
-	mulpd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addpd  xmm5, xmm4 ;# xmm5=VV 
-	mulpd  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulpd  xmm3, xmm7 ;# fijC=FF*qq 
-	;# get jnr from regs
-	movd ecx, mm2
-	movd edx, mm3
-	mov esi, [ebp + nb400_dvda]
-	
-	;# Calculate dVda
-	xorpd xmm7, xmm7
-	mulpd xmm3, [esp + nb400_gbscale]
-	movapd xmm6, xmm3
-	mulpd  xmm6, [esp + nb400_r]
-	addpd  xmm6, xmm5
-	addpd  xmm5, [esp + nb400_vctot]
-	movapd [esp + nb400_vctot], xmm5 
-
-	;# xmm6=(vcoul+fijC*r)
-	subpd  xmm7, xmm6
-	movapd xmm6, xmm7
-	
-	;# update dvdasum
-	addpd  xmm7, [esp + nb400_dvdasum]
-	movapd [esp + nb400_dvdasum], xmm7 
-
-	;# update j atoms dvdaj
-	movhlps xmm7, xmm6
-	addsd  xmm6, [esi + ecx*8]
-	addsd  xmm7, [esi + edx*8]
-	movsd  [esi + ecx*8], xmm6
-	movsd  [esi + edx*8], xmm7
-	
-	xorpd  xmm4, xmm4
-
-	mulpd xmm3, xmm0
-	subpd  xmm4, xmm3
-
-	movapd xmm0, [esp + nb400_dx]
-	movapd xmm1, [esp + nb400_dy]
-	movapd xmm2, [esp + nb400_dz]
-
-	movd eax, mm0	
-	movd ebx, mm1
-
-	mov    edi, [ebp + nb400_faction]
-	mulpd  xmm0, xmm4
-	mulpd  xmm1, xmm4
-	mulpd  xmm2, xmm4
-	;# xmm0-xmm2 contains tx-tz (partial force) 
-	;# now update f_i 
-	movapd xmm3, [esp + nb400_fix]
-	movapd xmm4, [esp + nb400_fiy]
-	movapd xmm5, [esp + nb400_fiz]
-	addpd  xmm3, xmm0
-	addpd  xmm4, xmm1
-	addpd  xmm5, xmm2
-	movapd [esp + nb400_fix], xmm3
-	movapd [esp + nb400_fiy], xmm4
-	movapd [esp + nb400_fiz], xmm5
-	;# the fj's - start by accumulating forces from memory 
-	movlpd xmm3, [edi + eax*8]
-	movlpd xmm4, [edi + eax*8 + 8]
-	movlpd xmm5, [edi + eax*8 + 16]
-	movhpd xmm3, [edi + ebx*8]
-	movhpd xmm4, [edi + ebx*8 + 8]
-	movhpd xmm5, [edi + ebx*8 + 16]
-	subpd xmm3, xmm0
-	subpd xmm4, xmm1
-	subpd xmm5, xmm2
-	movlpd [edi + eax*8], xmm3
-	movlpd [edi + eax*8 + 8], xmm4
-	movlpd [edi + eax*8 + 16], xmm5
-	movhpd [edi + ebx*8], xmm3
-	movhpd [edi + ebx*8 + 8], xmm4
-	movhpd [edi + ebx*8 + 16], xmm5
-		
-	;# should we do one more iteration? 
-	sub dword ptr [esp + nb400_innerk],  2
-	jl    .nb400_checksingle
-	jmp   .nb400_unroll_loop
-.nb400_checksingle:
-	mov   edx, [esp + nb400_innerk]
-	and   edx, 1
-	jnz    .nb400_dosingle
-	jmp    .nb400_updateouterdata
-.nb400_dosingle:
-	mov esi, [ebp + nb400_charge]
-	mov edx, [ebp + nb400_invsqrta]
-	mov edi, [ebp + nb400_pos]
-	mov   ecx, [esp + nb400_innerjjnr]
-	mov   eax, [ecx]	
-	xorpd  xmm6, xmm6
-	movapd xmm7, xmm6
-	movsd  xmm7, [edx + eax*8]
-	movlpd xmm6, [esi + eax*8]	;# xmm6(0) has the charge
-	mulsd  xmm7, [esp + nb400_isai]
-	movapd [esp + nb400_isaprod], xmm7
-	movapd xmm1, xmm7
-	mulpd xmm1, [esp + nb400_gbtsc]
-	movapd [esp + nb400_gbscale], xmm1
-	
-	mulsd  xmm7, [esp + nb400_iq]
-	mulsd  xmm6, xmm7
-	movapd [esp + nb400_qq], xmm6
-
-	movd  mm2, eax
-	lea   eax, [eax + eax*2]
-	
-	;# move coordinates to xmm0-xmm2 
-	movlpd xmm0, [edi + eax*8]
-	movlpd xmm1, [edi + eax*8 + 8]
-	movlpd xmm2, [edi + eax*8 + 16]
-
-	;# move nb400_ix-iz to xmm4-xmm6 
-	movapd xmm4, [esp + nb400_ix]
-	movapd xmm5, [esp + nb400_iy]
-	movapd xmm6, [esp + nb400_iz]
-
-	;# calc dr 
-	subsd xmm4, xmm0
-	subsd xmm5, xmm1
-	subsd xmm6, xmm2
-
-	;# store dr 
-	movapd [esp + nb400_dx], xmm4
-	movapd [esp + nb400_dy], xmm5
-	movapd [esp + nb400_dz], xmm6
-	;# square it 
-	mulsd xmm4,xmm4
-	mulsd xmm5,xmm5
-	mulsd xmm6,xmm6
-	addsd xmm4, xmm5
-	addsd xmm4, xmm6
-	;# rsq in xmm4 
-
-	cvtsd2ss xmm5, xmm4	
-	rsqrtss xmm5, xmm5
-	cvtss2sd xmm2, xmm5	;# lu in low xmm2 
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulsd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [esp + nb400_three]
-	mulsd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb400_half]
-	subsd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulsd xmm1, xmm5	
-	mulsd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulsd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [esp + nb400_three]
-	mulsd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb400_half]
-	subsd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulsd xmm2, xmm5	
-	mulsd xmm0, xmm2	;# xmm0=iter2 of rinv (new lu) 
-	
-	mulsd xmm4, xmm0	;# xmm4=r 
-	movapd [esp + nb400_r], xmm4
-	mulsd xmm4, [esp + nb400_gbscale]
-	
-	movd mm0, eax	
-
-	cvttsd2si eax, xmm4	;# mm6 = lu idx 
-	cvtsi2sd xmm5, eax
-	subsd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	movapd xmm2, xmm1	
-	mulsd  xmm2, xmm2	;# xmm2=eps2 
-	
-	shl eax, 2		;# idx *= 4 
-	
-	mov  esi, [ebp + nb400_GBtab]
-
-	;# Coulomb 
-	movapd xmm4, [esi + eax*8]	;# Y1 F1 
-	xorpd xmm3, xmm3
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1  
-	unpckhpd xmm5, xmm3	;# F1  
-
-	movapd xmm6, [esi + eax*8 + 16]	;# G1 H1 
-	xorpd xmm3, xmm3
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1  
-	unpckhpd xmm7, xmm3	;# H1  	
-	;# table ready in xmm4-xmm7 
-
-	mulsd  xmm6, xmm1	;# xmm6=Geps 
-	mulsd  xmm7, xmm2	;# xmm7=Heps2 
-	addsd  xmm5, xmm6
-	addsd  xmm5, xmm7	;# xmm5=Fp 	
-	mulsd  xmm7, [esp + nb400_two]	;# two*Heps2 
-	movapd xmm3, [esp + nb400_qq]
-	addsd  xmm7, xmm6
-	addsd  xmm7, xmm5 ;# xmm7=FF 
-	mulsd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addsd  xmm5, xmm4 ;# xmm5=VV 
-	mulsd  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulsd  xmm3, xmm7 ;# fijC=FF*qq
-	;# get jnr from regs
-	movd ebx, mm2
-	mov esi, [ebp + nb400_dvda]
-	
-	;# Calculate dVda
-	mulsd xmm3, [esp + nb400_gbscale]
-	movsd xmm6, xmm3
-	mulsd  xmm6, [esp + nb400_r]
-	addsd  xmm6, xmm5
-	addsd  xmm5, [esp + nb400_vctot]
-	movsd [esp + nb400_vctot], xmm5 
-
-	;# xmm6=(vcoul+fijC*r)
-	subpd  xmm7, xmm6
-	movsd xmm6, xmm7
-	
-	;# update dvdasum
-	addsd  xmm7, [esp + nb400_dvdasum]
-	movsd [esp + nb400_dvdasum], xmm7 
-
-	;# update j atoms dvdaj
-	addsd  xmm6, [esi + ebx*8]
-	movsd  [esi + ebx*8], xmm6
-	
-	xorpd xmm4, xmm4
-	movd eax, mm0
-
-	mulsd xmm3, xmm0
-	subsd  xmm4, xmm3
-	mov    edi, [ebp + nb400_faction]
-
-	movsd xmm0, [esp + nb400_dx]
-	movsd xmm1, [esp + nb400_dy]
-	movsd xmm2, [esp + nb400_dz]
-
-	mulsd  xmm0, xmm4
-	mulsd  xmm1, xmm4
-	mulsd  xmm2, xmm4
-	;# xmm0-xmm2 contains tx-tz (partial force) 
-	;# now update f_i 
-	movsd xmm3, [esp + nb400_fix]
-	movsd xmm4, [esp + nb400_fiy]
-	movsd xmm5, [esp + nb400_fiz]
-	addsd  xmm3, xmm0
-	addsd  xmm4, xmm1
-	addsd  xmm5, xmm2
-	movlpd [esp + nb400_fix], xmm3
-	movlpd [esp + nb400_fiy], xmm4
-	movlpd [esp + nb400_fiz], xmm5
-	;# update fj 
-	movlpd xmm3, [edi + eax*8]
-	movlpd xmm4, [edi + eax*8 + 8]
-	movlpd xmm5, [edi + eax*8 + 16]
-	subsd xmm3, xmm0
-	subsd xmm4, xmm1
-	subsd xmm5, xmm2
-	movlpd [edi + eax*8], xmm3
-	movlpd [edi + eax*8 + 8], xmm4
-	movlpd [edi + eax*8 + 16], xmm5
-
-.nb400_updateouterdata:
-	mov   ecx, [esp + nb400_ii3]
-	mov   edi, [ebp + nb400_faction]
-	mov   esi, [ebp + nb400_fshift]
-	mov   edx, [esp + nb400_is3]
-
-	;# accumulate i forces in xmm0, xmm1, xmm2 
-	movapd xmm0, [esp + nb400_fix]
-	movapd xmm1, [esp + nb400_fiy]
-	movapd xmm2, [esp + nb400_fiz]
-
-	movhlps xmm3, xmm0
-	movhlps xmm4, xmm1
-	movhlps xmm5, xmm2
-	addsd  xmm0, xmm3
-	addsd  xmm1, xmm4
-	addsd  xmm2, xmm5 ;# sum is in low xmm0-xmm2 
-
-	;# increment i force 
-	movsd  xmm3, [edi + ecx*8]
-	movsd  xmm4, [edi + ecx*8 + 8]
-	movsd  xmm5, [edi + ecx*8 + 16]
-	addsd  xmm3, xmm0
-	addsd  xmm4, xmm1
-	addsd  xmm5, xmm2
-	movsd  [edi + ecx*8],     xmm3
-	movsd  [edi + ecx*8 + 8], xmm4
-	movsd  [edi + ecx*8 + 16], xmm5
-
-	;# increment fshift force  
-	movsd  xmm3, [esi + edx*8]
-	movsd  xmm4, [esi + edx*8 + 8]
-	movsd  xmm5, [esi + edx*8 + 16]
-	addsd  xmm3, xmm0
-	addsd  xmm4, xmm1
-	addsd  xmm5, xmm2
-	movsd  [esi + edx*8],     xmm3
-	movsd  [esi + edx*8 + 8], xmm4
-	movsd  [esi + edx*8 + 16], xmm5
-
-	;# get n from stack
-	mov esi, [esp + nb400_n]
-        ;# get group index for i particle 
-        mov   edx, [ebp + nb400_gid]      	;# base of gid[]
-        mov   edx, [edx + esi*4]		;# ggid=gid[n]
-
-	;# accumulate total potential energy and update it 
-	movapd xmm7, [esp + nb400_vctot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addsd  xmm7, xmm6	;# low xmm7 has the sum now 
-
-	;# add earlier value from mem 
-	mov   eax, [ebp + nb400_Vc]
-	addsd xmm7, [eax + edx*8] 
-	;# move back to mem 
-	movsd [eax + edx*8], xmm7 
-	
-	;# accumulate dVda and update it 
-	movapd xmm7, [esp + nb400_dvdasum]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addsd  xmm7, xmm6	;# low xmm7 has the sum now 
-	
-	mov edx, [esp + nb400_ii]
-	mov eax, [ebp + nb400_dvda]
-	addsd xmm7, [eax + edx*8]
-	movsd [eax + edx*8], xmm7
-	
-        ;# finish if last 
-        mov ecx, [esp + nb400_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb400_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [esp + nb400_n], esi
-        jmp .nb400_outer
-.nb400_outerend:
-        ;# check if more outer neighborlists remain
-        mov   ecx, [esp + nb400_nri]
-	;# esi already loaded with n above
-        sub   ecx, esi
-        jz .nb400_end
-        ;# non-zero, do one more workunit
-        jmp   .nb400_threadloop
-.nb400_end:
-	emms
-
-	mov eax, [esp + nb400_nouter]
-	mov ebx, [esp + nb400_ninner]
-	mov ecx, [ebp + nb400_outeriter]
-	mov edx, [ebp + nb400_inneriter]
-	mov [ecx], eax
-	mov [edx], ebx
-
-	mov eax, [esp + nb400_salign]
-	add esp, eax
-	add esp, 388
-	pop edi
-	pop esi
-    	pop edx
-    	pop ecx
-    	pop ebx
-    	pop eax
-	leave
-	ret
-
-
-
-
-
-
-.globl nb_kernel400nf_ia32_sse2
-.globl _nb_kernel400nf_ia32_sse2
-nb_kernel400nf_ia32_sse2:	
-_nb_kernel400nf_ia32_sse2:	
-.equiv          nb400nf_p_nri,          8
-.equiv          nb400nf_iinr,           12
-.equiv          nb400nf_jindex,         16
-.equiv          nb400nf_jjnr,           20
-.equiv          nb400nf_shift,          24
-.equiv          nb400nf_shiftvec,       28
-.equiv          nb400nf_fshift,         32
-.equiv          nb400nf_gid,            36
-.equiv          nb400nf_pos,            40
-.equiv          nb400nf_faction,        44
-.equiv          nb400nf_charge,         48
-.equiv          nb400nf_p_facel,        52
-.equiv          nb400nf_argkrf,         56
-.equiv          nb400nf_argcrf,         60
-.equiv          nb400nf_Vc,             64
-.equiv          nb400nf_type,           68
-.equiv          nb400nf_p_ntype,        72
-.equiv          nb400nf_vdwparam,       76
-.equiv          nb400nf_Vvdw,           80
-.equiv          nb400nf_p_tabscale,     84
-.equiv          nb400nf_VFtab,          88
-.equiv          nb400nf_invsqrta,       92
-.equiv          nb400nf_dvda,           96
-.equiv          nb400nf_p_gbtabscale,   100
-.equiv          nb400nf_GBtab,          104
-.equiv          nb400nf_p_nthreads,     108
-.equiv          nb400nf_count,          112
-.equiv          nb400nf_mtx,            116
-.equiv          nb400nf_outeriter,      120
-.equiv          nb400nf_inneriter,      124
-.equiv          nb400nf_work,           128
-	;# stack offsets for local variables  
-	;# bottom of stack is cache-aligned for sse2 use 
-.equiv          nb400nf_ix,             0
-.equiv          nb400nf_iy,             16
-.equiv          nb400nf_iz,             32
-.equiv          nb400nf_iq,             48
-.equiv          nb400nf_gbtsc,          64
-.equiv          nb400nf_qq,             80
-.equiv          nb400nf_vctot,          96
-.equiv          nb400nf_half,           112
-.equiv          nb400nf_three,          128
-.equiv          nb400nf_isai,           144
-.equiv          nb400nf_isaprod,        160
-.equiv          nb400nf_gbscale,        176
-.equiv          nb400nf_is3,            192
-.equiv          nb400nf_ii3,            196
-.equiv          nb400nf_innerjjnr,      200
-.equiv          nb400nf_innerk,         204
-.equiv          nb400nf_n,              208
-.equiv          nb400nf_nn1,            212
-.equiv          nb400nf_nri,            216
-.equiv          nb400nf_facel,          224   ;# uses 8 bytes
-.equiv          nb400nf_nouter,         232
-.equiv          nb400nf_ninner,         236
-.equiv          nb400nf_salign,         240
-	push ebp
-	mov ebp,esp	
-	push eax
-	push ebx
-	push ecx
-	push edx
-	push esi
-	push edi
-	sub esp, 244		;# local stack space 
-	mov  eax, esp
-	and  eax, 0xf
-	sub esp, eax
-	mov [esp + nb400nf_salign], eax
-
-	emms
-
-	;# Move args passed by reference to stack
-	mov ecx, [ebp + nb400nf_p_nri]
-	mov esi, [ebp + nb400nf_p_facel]
-	mov ecx, [ecx]
-	movsd xmm7, [esi]
-	mov [esp + nb400nf_nri], ecx
-	movsd [esp + nb400nf_facel], xmm7
-
-	;# zero iteration counters
-	mov eax, 0
-	mov [esp + nb400nf_nouter], eax
-	mov [esp + nb400nf_ninner], eax
-
-
-	mov eax, [ebp + nb400nf_p_gbtabscale]
-	movsd xmm3, [eax]
-	shufpd xmm3, xmm3, 0
-	movapd [esp + nb400nf_gbtsc], xmm3
-
-	;# create constant floating-point factors on stack
-	mov eax, 0x00000000     ;# lower half of double 0.5 IEEE (hex)
-	mov ebx, 0x3fe00000
-	mov [esp + nb400nf_half], eax
-	mov [esp + nb400nf_half+4], ebx
-	movsd xmm1, [esp + nb400nf_half]
-	shufpd xmm1, xmm1, 0    ;# splat to all elements
-	movapd xmm3, xmm1
-	addpd  xmm3, xmm3       ;# 1.0
-	movapd xmm2, xmm3
-	addpd  xmm2, xmm2       ;# 2.0
-	addpd  xmm3, xmm2	;# 3.0
-	movapd [esp + nb400nf_half], xmm1
-	movapd [esp + nb400nf_three], xmm3
-
-.nb400nf_threadloop:
-        mov   esi, [ebp + nb400nf_count]          ;# pointer to sync counter
-        mov   eax, [esi]
-.nb400nf_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+10
-        lock
-        cmpxchg [esi], ebx                      ;# write nn1 to *counter,
-                                                ;# if it hasnt changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb400nf_spinlock
-
-        ;# if(nn1>nri) nn1=nri
-        mov ecx, [esp + nb400nf_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [esp + nb400nf_n], eax
-        mov [esp + nb400nf_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists
-	mov esi, eax				;# copy n to esi
-        jg  .nb400nf_outerstart
-        jmp .nb400nf_end
-
-.nb400nf_outerstart:
-	;# ebx contains number of outer iterations
-	add ebx, [esp + nb400nf_nouter]
-	mov [esp + nb400nf_nouter], ebx
-
-.nb400nf_outer:
-	mov   eax, [ebp + nb400nf_shift]      ;# eax = pointer into shift[] 
-	mov   ebx, [eax+esi*4]		;# ebx=shift[n] 
-	
-	lea   ebx, [ebx + ebx*2]    ;# ebx=3*is 
-	mov   [esp + nb400nf_is3],ebx    	;# store is3 
-
-	mov   eax, [ebp + nb400nf_shiftvec]   ;# eax = base of shiftvec[] 
-
-	movsd xmm0, [eax + ebx*8]
-	movsd xmm1, [eax + ebx*8 + 8]
-	movsd xmm2, [eax + ebx*8 + 16] 
-
-	mov   ecx, [ebp + nb400nf_iinr]       ;# ecx = pointer into iinr[] 	
-	mov   ebx, [ecx+esi*4]	    ;# ebx =ii 
-
-	mov   edx, [ebp + nb400nf_charge]
-	movsd xmm3, [edx + ebx*8]	
-	mulsd xmm3, [esp + nb400nf_facel]
-	shufpd xmm3, xmm3, 0
-
-	mov   edx, [ebp + nb400nf_invsqrta]	;# load invsqrta[ii]
-	movsd xmm4, [edx + ebx*8]
-	shufpd xmm4, xmm4, 0
-
-	lea   ebx, [ebx + ebx*2]	;# ebx = 3*ii=ii3 
-	mov   eax, [ebp + nb400nf_pos]    ;# eax = base of pos[]  
-
-	addsd xmm0, [eax + ebx*8]
-	addsd xmm1, [eax + ebx*8 + 8]
-	addsd xmm2, [eax + ebx*8 + 16]
-
-	movapd [esp + nb400nf_iq], xmm3
-	movapd [esp + nb400nf_isai], xmm4
-	
-	shufpd xmm0, xmm0, 0
-	shufpd xmm1, xmm1, 0
-	shufpd xmm2, xmm2, 0
-
-	movapd [esp + nb400nf_ix], xmm0
-	movapd [esp + nb400nf_iy], xmm1
-	movapd [esp + nb400nf_iz], xmm2
-
-	mov   [esp + nb400nf_ii3], ebx
-	
-	;# clear vctot
-	xorpd xmm4, xmm4
-	movapd [esp + nb400nf_vctot], xmm4
-	
-	mov   eax, [ebp + nb400nf_jindex]
-	mov   ecx, [eax + esi*4]	     ;# jindex[n] 
-	mov   edx, [eax + esi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   esi, [ebp + nb400nf_pos]
-	mov   edi, [ebp + nb400nf_faction]	
-	mov   eax, [ebp + nb400nf_jjnr]
-	shl   ecx, 2
-	add   eax, ecx
-	mov   [esp + nb400nf_innerjjnr], eax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  2
-	add   ecx, [esp + nb400nf_ninner]
-	mov   [esp + nb400nf_ninner], ecx
-	add   edx, 0
-	mov   [esp + nb400nf_innerk], edx    ;# number of innerloop atoms 
-	jge   .nb400nf_unroll_loop
-	jmp   .nb400nf_checksingle
-.nb400nf_unroll_loop:
-	;# twice unrolled innerloop here 
-	mov   edx, [esp + nb400nf_innerjjnr]   ;# pointer to jjnr[k] 
-	mov   eax, [edx]
-	mov   ebx, [edx + 4]
-	add dword ptr [esp + nb400nf_innerjjnr], 8	;# advance pointer (unrolled 2) 
-
-	;# load isa2
-	mov esi, [ebp + nb400nf_invsqrta]
-	movlpd xmm2, [esi + eax*8]
-	movhpd xmm2, [esi + ebx*8]
-	mulpd  xmm2, [esp + nb400nf_isai]
-	movapd [esp + nb400nf_isaprod], xmm2	
-	movapd xmm1, xmm2
-	mulpd xmm1, [esp + nb400nf_gbtsc]
-	movapd [esp + nb400nf_gbscale], xmm1
-	
-	mov esi, [ebp + nb400nf_charge]    ;# base of charge[] 
-	movlpd xmm3, [esi + eax*8]
-	movhpd xmm3, [esi + ebx*8]
-
-	mulpd xmm2, [esp + nb400nf_iq]
-	mulpd  xmm3, xmm2
-	movapd [esp + nb400nf_qq], xmm3	
-	
-	mov esi, [ebp + nb400nf_pos]		;# base of pos[] 
-
-	lea   eax, [eax + eax*2]     ;# replace jnr with j3 
-	lea   ebx, [ebx + ebx*2]	
-
-	;# move two coordinates to xmm0-xmm2 
-	movlpd xmm0, [esi + eax*8]
-	movlpd xmm1, [esi + eax*8 + 8]
-	movlpd xmm2, [esi + eax*8 + 16]
-	movhpd xmm0, [esi + ebx*8]
-	movhpd xmm1, [esi + ebx*8 + 8]
-	movhpd xmm2, [esi + ebx*8 + 16]		
-
-	mov    edi, [ebp + nb400nf_faction]
-	
-	;# move nb400nf_ix-iz to xmm4-xmm6 
-	movapd xmm4, [esp + nb400nf_ix]
-	movapd xmm5, [esp + nb400nf_iy]
-	movapd xmm6, [esp + nb400nf_iz]
-
-	;# calc dr 
-	subpd xmm4, xmm0
-	subpd xmm5, xmm1
-	subpd xmm6, xmm2
-
-	;# square it 
-	mulpd xmm4,xmm4
-	mulpd xmm5,xmm5
-	mulpd xmm6,xmm6
-	addpd xmm4, xmm5
-	addpd xmm4, xmm6
-	;# rsq in xmm4 
-
-	cvtpd2ps xmm5, xmm4	
-	rsqrtps xmm5, xmm5
-	cvtps2pd xmm2, xmm5	;# lu in low xmm2 
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulpd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [esp + nb400nf_three]
-	mulpd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb400nf_half]
-	subpd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulpd xmm1, xmm5	
-	mulpd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulpd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [esp + nb400nf_three]
-	mulpd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb400nf_half]
-	subpd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulpd xmm2, xmm5	
-	mulpd xmm0, xmm2	;# xmm0=iter2 of rinv (new lu) 
-	mulpd xmm4, xmm0	;# xmm4=r 
-	mulpd xmm4, [esp + nb400nf_gbscale]
-
-	cvttpd2pi mm6, xmm4	;# mm6 = lu idx 
-	cvtpi2pd xmm5, mm6
-	subpd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	movapd xmm2, xmm1	
-	mulpd  xmm2, xmm2	;# xmm2=eps2 
-	
-	pslld mm6, 2		;# idx *= 4 
-	
-	movd mm0, eax	
-	movd mm1, ebx
-
-	mov  esi, [ebp + nb400nf_GBtab]
-	movd eax, mm6
-	psrlq mm6, 32
-	movd ebx, mm6		;# indices in eax/ebx 
-
-	movapd xmm4, [esi + eax*8]	;# Y1 F1 	
-	movapd xmm3, [esi + ebx*8]	;# Y2 F2 
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 Y2 
-	unpckhpd xmm5, xmm3	;# F1 F2 
-
-	movapd xmm6, [esi + eax*8 + 16]	;# G1 H1 	
-	movapd xmm3, [esi + ebx*8 + 16]	;# G2 H2 
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 G2 
-	unpckhpd xmm7, xmm3	;# H1 H2 
-	;# coulomb table ready, in xmm4-xmm7  		
-	mulpd  xmm6, xmm1	;# xmm6=Geps 
-	mulpd  xmm7, xmm2	;# xmm7=Heps2 
-	addpd  xmm5, xmm6
-	addpd  xmm5, xmm7	;# xmm5=Fp 	
-	movapd xmm3, [esp + nb400nf_qq]
-	mulpd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addpd  xmm5, xmm4 ;# xmm5=VV 
-	mulpd  xmm5, xmm3 ;# vcoul=qq*VV  
-	addpd  xmm5, [esp + nb400nf_vctot]
-	movapd [esp + nb400nf_vctot], xmm5  
-			
-	;# should we do one more iteration? 
-	sub dword ptr [esp + nb400nf_innerk],  2
-	jl    .nb400nf_checksingle
-	jmp   .nb400nf_unroll_loop
-.nb400nf_checksingle:
-	mov   edx, [esp + nb400nf_innerk]
-	and   edx, 1
-	jnz    .nb400nf_dosingle
-	jmp    .nb400nf_updateouterdata
-.nb400nf_dosingle:
-	mov esi, [ebp + nb400nf_charge]
-	mov edx, [ebp + nb400nf_invsqrta]
-	mov edi, [ebp + nb400nf_pos]
-	mov   ecx, [esp + nb400nf_innerjjnr]
-	mov   eax, [ecx]	
-	xorpd  xmm6, xmm6
-	movapd xmm7, xmm6
-	movsd  xmm7, [edx + eax*8]
-	movlpd xmm6, [esi + eax*8]	;# xmm6(0) has the charge
-	mulsd  xmm7, [esp + nb400nf_isai]
-	movapd [esp + nb400nf_isaprod], xmm7
-	movapd xmm1, xmm7
-	mulpd xmm1, [esp + nb400nf_gbtsc]
-	movapd [esp + nb400nf_gbscale], xmm1
-	
-	mulsd  xmm7, [esp + nb400nf_iq]
-	mulsd  xmm6, xmm7
-	movapd [esp + nb400nf_qq], xmm6
-		
-	lea   eax, [eax + eax*2]
-	
-	;# move coordinates to xmm0-xmm2 
-	movlpd xmm0, [edi + eax*8]
-	movlpd xmm1, [edi + eax*8 + 8]
-	movlpd xmm2, [edi + eax*8 + 16]
-
-	;# move nb400nf_ix-iz to xmm4-xmm6 
-	movapd xmm4, [esp + nb400nf_ix]
-	movapd xmm5, [esp + nb400nf_iy]
-	movapd xmm6, [esp + nb400nf_iz]
-
-	;# calc dr 
-	subsd xmm4, xmm0
-	subsd xmm5, xmm1
-	subsd xmm6, xmm2
-
-	;# square it 
-	mulsd xmm4,xmm4
-	mulsd xmm5,xmm5
-	mulsd xmm6,xmm6
-	addsd xmm4, xmm5
-	addsd xmm4, xmm6
-	;# rsq in xmm4 
-
-	cvtsd2ss xmm5, xmm4	
-	rsqrtss xmm5, xmm5
-	cvtss2sd xmm2, xmm5	;# lu in low xmm2 
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulsd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [esp + nb400nf_three]
-	mulsd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb400nf_half]
-	subsd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulsd xmm1, xmm5	
-	mulsd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulsd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [esp + nb400nf_three]
-	mulsd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb400nf_half]
-	subsd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulsd xmm2, xmm5	
-	mulsd xmm0, xmm2	;# xmm0=iter2 of rinv (new lu) 
-	
-	mulsd xmm4, xmm0	;# xmm4=r 
-	mulsd xmm4, [esp + nb400nf_gbscale]
-	
-	movd mm0, eax	
-
-	cvttsd2si eax, xmm4	;# mm6 = lu idx 
-	cvtsi2sd xmm5, eax
-	subsd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	movapd xmm2, xmm1	
-	mulsd  xmm2, xmm2	;# xmm2=eps2 
-	
-	shl eax, 2		;# idx *= 4 
-	
-	mov  esi, [ebp + nb400nf_GBtab]
-
-	;# Coulomb 
-	movapd xmm4, [esi + eax*8]	;# Y1 F1 
-	xorpd xmm3, xmm3
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1  
-	unpckhpd xmm5, xmm3	;# F1  
-
-	movapd xmm6, [esi + eax*8 + 16]	;# G1 H1 
-	xorpd xmm3, xmm3
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1  
-	unpckhpd xmm7, xmm3	;# H1  	
-	;# table ready in xmm4-xmm7 
-
-	mulsd  xmm6, xmm1	;# xmm6=Geps 
-	mulsd  xmm7, xmm2	;# xmm7=Heps2 
-	addsd  xmm5, xmm6
-	addsd  xmm5, xmm7	;# xmm5=Fp 	
-	movapd xmm3, [esp + nb400nf_qq]
-	mulsd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addsd  xmm5, xmm4 ;# xmm5=VV 
-	mulsd  xmm5, xmm3 ;# vcoul=qq*VV  
-	addsd  xmm5, [esp + nb400nf_vctot]
-	movsd [esp + nb400nf_vctot], xmm5
-	
-.nb400nf_updateouterdata:
-	;# get n from stack
-	mov esi, [esp + nb400nf_n]
-        ;# get group index for i particle 
-        mov   edx, [ebp + nb400nf_gid]      	;# base of gid[]
-        mov   edx, [edx + esi*4]		;# ggid=gid[n]
-
-	;# accumulate total potential energy and update it 
-	movapd xmm7, [esp + nb400nf_vctot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addsd  xmm7, xmm6	;# low xmm7 has the sum now 
-
-	;# add earlier value from mem 
-	mov   eax, [ebp + nb400nf_Vc]
-	addsd xmm7, [eax + edx*8] 
-	;# move back to mem 
-	movsd [eax + edx*8], xmm7 
-	
-        ;# finish if last 
-        mov ecx, [esp + nb400nf_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb400nf_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [esp + nb400nf_n], esi
-        jmp .nb400nf_outer
-.nb400nf_outerend:
-        ;# check if more outer neighborlists remain
-        mov   ecx, [esp + nb400nf_nri]
-	;# esi already loaded with n above
-        sub   ecx, esi
-        jz .nb400nf_end
-        ;# non-zero, do one more workunit
-        jmp   .nb400nf_threadloop
-.nb400nf_end:
-	emms
-
-	mov eax, [esp + nb400nf_nouter]
-	mov ebx, [esp + nb400nf_ninner]
-	mov ecx, [ebp + nb400nf_outeriter]
-	mov edx, [ebp + nb400nf_inneriter]
-	mov [ecx], eax
-	mov [edx], ebx
-
-	mov eax, [esp + nb400nf_salign]
-	add esp, eax
-	add esp, 244
-	pop edi
-	pop esi
-    	pop edx
-    	pop ecx
-    	pop ebx
-    	pop eax
-	leave
-	ret
-
-
-
diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel400_ia32_sse2.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel400_ia32_sse2.s
deleted file mode 100644
index 81ad136251..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel400_ia32_sse2.s
+++ /dev/null
@@ -1,1261 +0,0 @@
-##
-##
-## Gromacs 4.0                         Copyright (c) 1991-2003 
-## David van der Spoel, Erik Lindahl
-##
-## This program is free software; you can redistribute it and/or
-## modify it under the terms of the GNU General Public License
-## as published by the Free Software Foundation; either version 2
-## of the License, or (at your option) any later version.
-##
-## To help us fund GROMACS development, we humbly ask that you cite
-## the research papers on the package. Check out http://www.gromacs.org
-## 
-## And Hey:
-## Gnomes, ROck Monsters And Chili Sauce
-##
-
-
-
-.globl nb_kernel400_ia32_sse2
-.globl _nb_kernel400_ia32_sse2
-nb_kernel400_ia32_sse2: 
-_nb_kernel400_ia32_sse2:        
-.set nb400_p_nri, 8
-.set nb400_iinr, 12
-.set nb400_jindex, 16
-.set nb400_jjnr, 20
-.set nb400_shift, 24
-.set nb400_shiftvec, 28
-.set nb400_fshift, 32
-.set nb400_gid, 36
-.set nb400_pos, 40
-.set nb400_faction, 44
-.set nb400_charge, 48
-.set nb400_p_facel, 52
-.set nb400_argkrf, 56
-.set nb400_argcrf, 60
-.set nb400_Vc, 64
-.set nb400_type, 68
-.set nb400_p_ntype, 72
-.set nb400_vdwparam, 76
-.set nb400_Vvdw, 80
-.set nb400_p_tabscale, 84
-.set nb400_VFtab, 88
-.set nb400_invsqrta, 92
-.set nb400_dvda, 96
-.set nb400_p_gbtabscale, 100
-.set nb400_GBtab, 104
-.set nb400_p_nthreads, 108
-.set nb400_count, 112
-.set nb400_mtx, 116
-.set nb400_outeriter, 120
-.set nb400_inneriter, 124
-.set nb400_work, 128
-        ## stack offsets for local variables  
-        ## bottom of stack is cache-aligned for sse2 use 
-.set nb400_ix, 0
-.set nb400_iy, 16
-.set nb400_iz, 32
-.set nb400_iq, 48
-.set nb400_dx, 64
-.set nb400_dy, 80
-.set nb400_dz, 96
-.set nb400_two, 112
-.set nb400_gbtsc, 128
-.set nb400_qq, 144
-.set nb400_r, 160
-.set nb400_vctot, 176
-.set nb400_fix, 192
-.set nb400_fiy, 208
-.set nb400_fiz, 224
-.set nb400_half, 240
-.set nb400_three, 256
-.set nb400_isai, 272
-.set nb400_isaprod, 288
-.set nb400_dvdasum, 304
-.set nb400_gbscale, 320
-.set nb400_is3, 336
-.set nb400_ii3, 340
-.set nb400_ii, 344
-.set nb400_innerjjnr, 348
-.set nb400_innerk, 352
-.set nb400_n, 356
-.set nb400_nn1, 360
-.set nb400_nri, 364
-.set nb400_facel, 368                         ## uses 8 bytes
-.set nb400_nouter, 376
-.set nb400_ninner, 380
-.set nb400_salign, 384
-        pushl %ebp
-        movl %esp,%ebp
-        pushl %eax
-        pushl %ebx
-        pushl %ecx
-        pushl %edx
-        pushl %esi
-        pushl %edi
-        subl $388,%esp          ## local stack space 
-        movl %esp,%eax
-        andl $0xf,%eax
-        subl %eax,%esp
-        movl %eax,nb400_salign(%esp)
-
-        emms
-
-        ## Move args passed by reference to stack
-        movl nb400_p_nri(%ebp),%ecx
-        movl nb400_p_facel(%ebp),%esi
-        movl (%ecx),%ecx
-        movsd (%esi),%xmm7
-        movl %ecx,nb400_nri(%esp)
-        movsd %xmm7,nb400_facel(%esp)
-
-        ## zero iteration counters
-        movl $0,%eax
-        movl %eax,nb400_nouter(%esp)
-        movl %eax,nb400_ninner(%esp)
-
-
-        movl nb400_p_gbtabscale(%ebp),%eax
-        movsd (%eax),%xmm3
-        shufpd $0,%xmm3,%xmm3
-        movapd %xmm3,nb400_gbtsc(%esp)
-
-        ## create constant floating-point factors on stack
-        movl $0x00000000,%eax   ## lower half of double 0.5 IEEE (hex)
-        movl $0x3fe00000,%ebx
-        movl %eax,nb400_half(%esp)
-        movl %ebx,nb400_half+4(%esp)
-        movsd nb400_half(%esp),%xmm1
-        shufpd $0,%xmm1,%xmm1  ## splat to all elements
-        movapd %xmm1,%xmm3
-        addpd  %xmm3,%xmm3      ## 1.0
-        movapd %xmm3,%xmm2
-        addpd  %xmm2,%xmm2      ## 2.0
-        addpd  %xmm2,%xmm3      ## 3.0
-        movapd %xmm1,nb400_half(%esp)
-        movapd %xmm2,nb400_two(%esp)
-        movapd %xmm3,nb400_three(%esp)
-
-_nb_kernel400_ia32_sse2.nb400_threadloop: 
-        movl  nb400_count(%ebp),%esi            ## pointer to sync counter
-        movl  (%esi),%eax
-_nb_kernel400_ia32_sse2.nb400_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+10
-        lock 
-        cmpxchgl %ebx,(%esi)                    ## write nn1 to *counter,
-                                                ## if it hasnt changed.
-                                                ## or reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel400_ia32_sse2.nb400_spinlock
-
-        ## if(nn1>nri) nn1=nri
-        movl nb400_nri(%esp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx
-        cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb400_n(%esp)
-        movl %ebx,nb400_nn1(%esp)
-        subl %eax,%ebx                          ## calc number of outer lists
-        movl %eax,%esi                          ## copy n to esi
-        jg  _nb_kernel400_ia32_sse2.nb400_outerstart
-        jmp _nb_kernel400_ia32_sse2.nb400_end
-
-_nb_kernel400_ia32_sse2.nb400_outerstart: 
-        ## ebx contains number of outer iterations
-        addl nb400_nouter(%esp),%ebx
-        movl %ebx,nb400_nouter(%esp)
-
-_nb_kernel400_ia32_sse2.nb400_outer: 
-        movl  nb400_shift(%ebp),%eax        ## eax = pointer into shift[] 
-        movl  (%eax,%esi,4),%ebx        ## ebx=shift[n] 
-
-        leal  (%ebx,%ebx,2),%ebx    ## ebx=3*is 
-        movl  %ebx,nb400_is3(%esp)      ## store is3 
-
-        movl  nb400_shiftvec(%ebp),%eax     ## eax = base of shiftvec[] 
-
-        movsd (%eax,%ebx,8),%xmm0
-        movsd 8(%eax,%ebx,8),%xmm1
-        movsd 16(%eax,%ebx,8),%xmm2
-
-        movl  nb400_iinr(%ebp),%ecx         ## ecx = pointer into iinr[]        
-        movl  (%ecx,%esi,4),%ebx    ## ebx =ii 
-        movl  %ebx,nb400_ii(%esp)
-
-        movl  nb400_charge(%ebp),%edx
-        movsd (%edx,%ebx,8),%xmm3
-        mulsd nb400_facel(%esp),%xmm3
-        shufpd $0,%xmm3,%xmm3
-
-        movl  nb400_invsqrta(%ebp),%edx         ## load invsqrta[ii]
-        movsd (%edx,%ebx,8),%xmm4
-        shufpd $0,%xmm4,%xmm4
-
-        leal  (%ebx,%ebx,2),%ebx        ## ebx = 3*ii=ii3 
-        movl  nb400_pos(%ebp),%eax      ## eax = base of pos[]  
-
-        addsd (%eax,%ebx,8),%xmm0
-        addsd 8(%eax,%ebx,8),%xmm1
-        addsd 16(%eax,%ebx,8),%xmm2
-
-        movapd %xmm3,nb400_iq(%esp)
-        movapd %xmm4,nb400_isai(%esp)
-
-        shufpd $0,%xmm0,%xmm0
-        shufpd $0,%xmm1,%xmm1
-        shufpd $0,%xmm2,%xmm2
-
-        movapd %xmm0,nb400_ix(%esp)
-        movapd %xmm1,nb400_iy(%esp)
-        movapd %xmm2,nb400_iz(%esp)
-
-        movl  %ebx,nb400_ii3(%esp)
-
-        ## clear vctot and i forces 
-        xorpd %xmm4,%xmm4
-        movapd %xmm4,nb400_vctot(%esp)
-        movapd %xmm4,nb400_dvdasum(%esp)
-        movapd %xmm4,nb400_fix(%esp)
-        movapd %xmm4,nb400_fiy(%esp)
-        movapd %xmm4,nb400_fiz(%esp)
-
-        movl  nb400_jindex(%ebp),%eax
-        movl  (%eax,%esi,4),%ecx             ## jindex[n] 
-        movl  4(%eax,%esi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movl  nb400_pos(%ebp),%esi
-        movl  nb400_faction(%ebp),%edi
-        movl  nb400_jjnr(%ebp),%eax
-        shll  $2,%ecx
-        addl  %ecx,%eax
-        movl  %eax,nb400_innerjjnr(%esp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $2,%edx
-        addl  nb400_ninner(%esp),%ecx
-        movl  %ecx,nb400_ninner(%esp)
-        addl  $0,%edx
-        movl  %edx,nb400_innerk(%esp)      ## number of innerloop atoms 
-        jge   _nb_kernel400_ia32_sse2.nb400_unroll_loop
-        jmp   _nb_kernel400_ia32_sse2.nb400_checksingle
-_nb_kernel400_ia32_sse2.nb400_unroll_loop: 
-        ## twice unrolled innerloop here 
-        movl  nb400_innerjjnr(%esp),%edx     ## pointer to jjnr[k] 
-        movl  (%edx),%eax
-        movl  4(%edx),%ebx
-        addl $8,nb400_innerjjnr(%esp)                   ## advance pointer (unrolled 2) 
-
-        ## load isaj
-        movl nb400_invsqrta(%ebp),%esi
-        movlpd (%esi,%eax,8),%xmm2
-        movhpd (%esi,%ebx,8),%xmm2
-        mulpd  nb400_isai(%esp),%xmm2
-        movapd %xmm2,nb400_isaprod(%esp)
-        movapd %xmm2,%xmm1
-        mulpd nb400_gbtsc(%esp),%xmm1
-        movapd %xmm1,nb400_gbscale(%esp)
-
-        movl nb400_charge(%ebp),%esi     ## base of charge[] 
-        movlpd (%esi,%eax,8),%xmm3
-        movhpd (%esi,%ebx,8),%xmm3
-
-        mulpd nb400_iq(%esp),%xmm2
-        mulpd  %xmm2,%xmm3
-        movapd %xmm3,nb400_qq(%esp)
-
-        movl nb400_pos(%ebp),%esi               ## base of pos[] 
-
-        movd  %eax,%mm2
-        movd  %ebx,%mm3
-        leal  (%eax,%eax,2),%eax     ## replace jnr with j3 
-        leal  (%ebx,%ebx,2),%ebx
-
-        ## move two coordinates to xmm0-xmm2 
-        movlpd (%esi,%eax,8),%xmm0
-        movlpd 8(%esi,%eax,8),%xmm1
-        movlpd 16(%esi,%eax,8),%xmm2
-        movhpd (%esi,%ebx,8),%xmm0
-        movhpd 8(%esi,%ebx,8),%xmm1
-        movhpd 16(%esi,%ebx,8),%xmm2
-
-        movl   nb400_faction(%ebp),%edi
-
-        ## move nb400_ix-iz to xmm4-xmm6 
-        movapd nb400_ix(%esp),%xmm4
-        movapd nb400_iy(%esp),%xmm5
-        movapd nb400_iz(%esp),%xmm6
-
-        ## calc dr 
-        subpd %xmm0,%xmm4
-        subpd %xmm1,%xmm5
-        subpd %xmm2,%xmm6
-
-        ## store dr 
-        movapd %xmm4,nb400_dx(%esp)
-        movapd %xmm5,nb400_dy(%esp)
-        movapd %xmm6,nb400_dz(%esp)
-        ## square it 
-        mulpd %xmm4,%xmm4
-        mulpd %xmm5,%xmm5
-        mulpd %xmm6,%xmm6
-        addpd %xmm5,%xmm4
-        addpd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        cvtpd2ps %xmm4,%xmm5
-        rsqrtps %xmm5,%xmm5
-        cvtps2pd %xmm5,%xmm2    ## lu in low xmm2 
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulpd %xmm2,%xmm2       ## lu*lu 
-        movapd nb400_three(%esp),%xmm1
-        mulpd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb400_half(%esp),%xmm0
-        subpd %xmm2,%xmm1       ## 30-rsq*lu*lu 
-        mulpd %xmm5,%xmm1
-        mulpd %xmm0,%xmm1       ## xmm0=iter1 of rinv (new lu) 
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulpd %xmm1,%xmm1       ## lu*lu 
-        movapd nb400_three(%esp),%xmm2
-        mulpd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb400_half(%esp),%xmm0
-        subpd %xmm1,%xmm2       ## 30-rsq*lu*lu 
-        mulpd %xmm5,%xmm2
-        mulpd %xmm2,%xmm0       ## xmm0=iter2 of rinv (new lu) 
-        mulpd %xmm0,%xmm4       ## xmm4=r 
-        movapd %xmm4,nb400_r(%esp)
-        mulpd nb400_gbscale(%esp),%xmm4
-
-        cvttpd2pi %xmm4,%mm6    ## mm6 = lu idx 
-        cvtpi2pd %mm6,%xmm5
-        subpd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-        movapd %xmm1,%xmm2
-        mulpd  %xmm2,%xmm2      ## xmm2=eps2 
-
-        pslld $2,%mm6           ## idx *= 4 
-
-        movd %eax,%mm0
-        movd %ebx,%mm1
-
-        movl nb400_GBtab(%ebp),%esi
-        movd %mm6,%eax
-        psrlq $32,%mm6
-        movd %mm6,%ebx          ## indices in eax/ebx 
-
-        movapd (%esi,%eax,8),%xmm4      ## Y1 F1        
-        movapd (%esi,%ebx,8),%xmm3      ## Y2 F2 
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 Y2 
-        unpckhpd %xmm3,%xmm5    ## F1 F2 
-
-        movapd 16(%esi,%eax,8),%xmm6    ## G1 H1        
-        movapd 16(%esi,%ebx,8),%xmm3    ## G2 H2 
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 G2 
-        unpckhpd %xmm3,%xmm7    ## H1 H2 
-        ## coulomb table ready, in xmm4-xmm7            
-        mulpd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulpd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addpd  %xmm6,%xmm5
-        addpd  %xmm7,%xmm5      ## xmm5=Fp      
-        mulpd  nb400_two(%esp),%xmm7    ## two*Heps2 
-        movapd nb400_qq(%esp),%xmm3
-        addpd  %xmm6,%xmm7
-        addpd  %xmm5,%xmm7 ## xmm7=FF 
-        mulpd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addpd  %xmm4,%xmm5 ## xmm5=VV 
-        mulpd  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulpd  %xmm7,%xmm3 ## fijC=FF*qq 
-        ## get jnr from regs
-        movd %mm2,%ecx
-        movd %mm3,%edx
-        movl nb400_dvda(%ebp),%esi
-
-        ## Calculate dVda
-        xorpd %xmm7,%xmm7
-        mulpd nb400_gbscale(%esp),%xmm3
-        movapd %xmm3,%xmm6
-        mulpd  nb400_r(%esp),%xmm6
-        addpd  %xmm5,%xmm6
-        addpd  nb400_vctot(%esp),%xmm5
-        movapd %xmm5,nb400_vctot(%esp)
-
-        ## xmm6=(vcoul+fijC*r)
-        subpd  %xmm6,%xmm7
-        movapd %xmm7,%xmm6
-
-        ## update dvdasum
-        addpd  nb400_dvdasum(%esp),%xmm7
-        movapd %xmm7,nb400_dvdasum(%esp)
-
-        ## update j atoms dvdaj
-        movhlps %xmm6,%xmm7
-        addsd  (%esi,%ecx,8),%xmm6
-        addsd  (%esi,%edx,8),%xmm7
-        movsd  %xmm6,(%esi,%ecx,8)
-        movsd  %xmm7,(%esi,%edx,8)
-
-        xorpd  %xmm4,%xmm4
-
-        mulpd %xmm0,%xmm3
-        subpd  %xmm3,%xmm4
-
-        movapd nb400_dx(%esp),%xmm0
-        movapd nb400_dy(%esp),%xmm1
-        movapd nb400_dz(%esp),%xmm2
-
-        movd %mm0,%eax
-        movd %mm1,%ebx
-
-        movl   nb400_faction(%ebp),%edi
-        mulpd  %xmm4,%xmm0
-        mulpd  %xmm4,%xmm1
-        mulpd  %xmm4,%xmm2
-        ## xmm0-xmm2 contains tx-tz (partial force) 
-        ## now update f_i 
-        movapd nb400_fix(%esp),%xmm3
-        movapd nb400_fiy(%esp),%xmm4
-        movapd nb400_fiz(%esp),%xmm5
-        addpd  %xmm0,%xmm3
-        addpd  %xmm1,%xmm4
-        addpd  %xmm2,%xmm5
-        movapd %xmm3,nb400_fix(%esp)
-        movapd %xmm4,nb400_fiy(%esp)
-        movapd %xmm5,nb400_fiz(%esp)
-        ## the fj's - start by accumulating forces from memory 
-        movlpd (%edi,%eax,8),%xmm3
-        movlpd 8(%edi,%eax,8),%xmm4
-        movlpd 16(%edi,%eax,8),%xmm5
-        movhpd (%edi,%ebx,8),%xmm3
-        movhpd 8(%edi,%ebx,8),%xmm4
-        movhpd 16(%edi,%ebx,8),%xmm5
-        subpd %xmm0,%xmm3
-        subpd %xmm1,%xmm4
-        subpd %xmm2,%xmm5
-        movlpd %xmm3,(%edi,%eax,8)
-        movlpd %xmm4,8(%edi,%eax,8)
-        movlpd %xmm5,16(%edi,%eax,8)
-        movhpd %xmm3,(%edi,%ebx,8)
-        movhpd %xmm4,8(%edi,%ebx,8)
-        movhpd %xmm5,16(%edi,%ebx,8)
-
-        ## should we do one more iteration? 
-        subl $2,nb400_innerk(%esp)
-        jl    _nb_kernel400_ia32_sse2.nb400_checksingle
-        jmp   _nb_kernel400_ia32_sse2.nb400_unroll_loop
-_nb_kernel400_ia32_sse2.nb400_checksingle: 
-        movl  nb400_innerk(%esp),%edx
-        andl  $1,%edx
-        jnz    _nb_kernel400_ia32_sse2.nb400_dosingle
-        jmp    _nb_kernel400_ia32_sse2.nb400_updateouterdata
-_nb_kernel400_ia32_sse2.nb400_dosingle: 
-        movl nb400_charge(%ebp),%esi
-        movl nb400_invsqrta(%ebp),%edx
-        movl nb400_pos(%ebp),%edi
-        movl  nb400_innerjjnr(%esp),%ecx
-        movl  (%ecx),%eax
-        xorpd  %xmm6,%xmm6
-        movapd %xmm6,%xmm7
-        movsd  (%edx,%eax,8),%xmm7
-        movlpd (%esi,%eax,8),%xmm6      ## xmm6(0) has the charge
-        mulsd  nb400_isai(%esp),%xmm7
-        movapd %xmm7,nb400_isaprod(%esp)
-        movapd %xmm7,%xmm1
-        mulpd nb400_gbtsc(%esp),%xmm1
-        movapd %xmm1,nb400_gbscale(%esp)
-
-        mulsd  nb400_iq(%esp),%xmm7
-        mulsd  %xmm7,%xmm6
-        movapd %xmm6,nb400_qq(%esp)
-
-        movd  %eax,%mm2
-        leal  (%eax,%eax,2),%eax
-
-        ## move coordinates to xmm0-xmm2 
-        movlpd (%edi,%eax,8),%xmm0
-        movlpd 8(%edi,%eax,8),%xmm1
-        movlpd 16(%edi,%eax,8),%xmm2
-
-        ## move nb400_ix-iz to xmm4-xmm6 
-        movapd nb400_ix(%esp),%xmm4
-        movapd nb400_iy(%esp),%xmm5
-        movapd nb400_iz(%esp),%xmm6
-
-        ## calc dr 
-        subsd %xmm0,%xmm4
-        subsd %xmm1,%xmm5
-        subsd %xmm2,%xmm6
-
-        ## store dr 
-        movapd %xmm4,nb400_dx(%esp)
-        movapd %xmm5,nb400_dy(%esp)
-        movapd %xmm6,nb400_dz(%esp)
-        ## square it 
-        mulsd %xmm4,%xmm4
-        mulsd %xmm5,%xmm5
-        mulsd %xmm6,%xmm6
-        addsd %xmm5,%xmm4
-        addsd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        cvtsd2ss %xmm4,%xmm5
-        rsqrtss %xmm5,%xmm5
-        cvtss2sd %xmm5,%xmm2    ## lu in low xmm2 
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulsd %xmm2,%xmm2       ## lu*lu 
-        movapd nb400_three(%esp),%xmm1
-        mulsd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb400_half(%esp),%xmm0
-        subsd %xmm2,%xmm1       ## 30-rsq*lu*lu 
-        mulsd %xmm5,%xmm1
-        mulsd %xmm0,%xmm1       ## xmm0=iter1 of rinv (new lu) 
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulsd %xmm1,%xmm1       ## lu*lu 
-        movapd nb400_three(%esp),%xmm2
-        mulsd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb400_half(%esp),%xmm0
-        subsd %xmm1,%xmm2       ## 30-rsq*lu*lu 
-        mulsd %xmm5,%xmm2
-        mulsd %xmm2,%xmm0       ## xmm0=iter2 of rinv (new lu) 
-
-        mulsd %xmm0,%xmm4       ## xmm4=r 
-        movapd %xmm4,nb400_r(%esp)
-        mulsd nb400_gbscale(%esp),%xmm4
-
-        movd %eax,%mm0
-
-        cvttsd2si %xmm4,%eax    ## mm6 = lu idx 
-        cvtsi2sd %eax,%xmm5
-        subsd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-        movapd %xmm1,%xmm2
-        mulsd  %xmm2,%xmm2      ## xmm2=eps2 
-
-        shll $2,%eax            ## idx *= 4 
-
-        movl nb400_GBtab(%ebp),%esi
-
-        ## Coulomb 
-        movapd (%esi,%eax,8),%xmm4      ## Y1 F1 
-        xorpd %xmm3,%xmm3
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1  
-        unpckhpd %xmm3,%xmm5    ## F1  
-
-        movapd 16(%esi,%eax,8),%xmm6    ## G1 H1 
-        xorpd %xmm3,%xmm3
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1  
-        unpckhpd %xmm3,%xmm7    ## H1   
-        ## table ready in xmm4-xmm7 
-
-        mulsd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulsd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addsd  %xmm6,%xmm5
-        addsd  %xmm7,%xmm5      ## xmm5=Fp      
-        mulsd  nb400_two(%esp),%xmm7    ## two*Heps2 
-        movapd nb400_qq(%esp),%xmm3
-        addsd  %xmm6,%xmm7
-        addsd  %xmm5,%xmm7 ## xmm7=FF 
-        mulsd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addsd  %xmm4,%xmm5 ## xmm5=VV 
-        mulsd  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulsd  %xmm7,%xmm3 ## fijC=FF*qq
-        ## get jnr from regs
-        movd %mm2,%ebx
-        movl nb400_dvda(%ebp),%esi
-
-        ## Calculate dVda
-        mulsd nb400_gbscale(%esp),%xmm3
-        movsd %xmm3,%xmm6
-        mulsd  nb400_r(%esp),%xmm6
-        addsd  %xmm5,%xmm6
-        addsd  nb400_vctot(%esp),%xmm5
-        movsd %xmm5,nb400_vctot(%esp)
-
-        ## xmm6=(vcoul+fijC*r)
-        subpd  %xmm6,%xmm7
-        movsd %xmm7,%xmm6
-
-        ## update dvdasum
-        addsd  nb400_dvdasum(%esp),%xmm7
-        movsd %xmm7,nb400_dvdasum(%esp)
-
-        ## update j atoms dvdaj
-        addsd  (%esi,%ebx,8),%xmm6
-        movsd  %xmm6,(%esi,%ebx,8)
-
-        xorpd %xmm4,%xmm4
-        movd %mm0,%eax
-
-        mulsd %xmm0,%xmm3
-        subsd  %xmm3,%xmm4
-        movl   nb400_faction(%ebp),%edi
-
-        movsd nb400_dx(%esp),%xmm0
-        movsd nb400_dy(%esp),%xmm1
-        movsd nb400_dz(%esp),%xmm2
-
-        mulsd  %xmm4,%xmm0
-        mulsd  %xmm4,%xmm1
-        mulsd  %xmm4,%xmm2
-        ## xmm0-xmm2 contains tx-tz (partial force) 
-        ## now update f_i 
-        movsd nb400_fix(%esp),%xmm3
-        movsd nb400_fiy(%esp),%xmm4
-        movsd nb400_fiz(%esp),%xmm5
-        addsd  %xmm0,%xmm3
-        addsd  %xmm1,%xmm4
-        addsd  %xmm2,%xmm5
-        movlpd %xmm3,nb400_fix(%esp)
-        movlpd %xmm4,nb400_fiy(%esp)
-        movlpd %xmm5,nb400_fiz(%esp)
-        ## update fj 
-        movlpd (%edi,%eax,8),%xmm3
-        movlpd 8(%edi,%eax,8),%xmm4
-        movlpd 16(%edi,%eax,8),%xmm5
-        subsd %xmm0,%xmm3
-        subsd %xmm1,%xmm4
-        subsd %xmm2,%xmm5
-        movlpd %xmm3,(%edi,%eax,8)
-        movlpd %xmm4,8(%edi,%eax,8)
-        movlpd %xmm5,16(%edi,%eax,8)
-
-_nb_kernel400_ia32_sse2.nb400_updateouterdata: 
-        movl  nb400_ii3(%esp),%ecx
-        movl  nb400_faction(%ebp),%edi
-        movl  nb400_fshift(%ebp),%esi
-        movl  nb400_is3(%esp),%edx
-
-        ## accumulate i forces in xmm0, xmm1, xmm2 
-        movapd nb400_fix(%esp),%xmm0
-        movapd nb400_fiy(%esp),%xmm1
-        movapd nb400_fiz(%esp),%xmm2
-
-        movhlps %xmm0,%xmm3
-        movhlps %xmm1,%xmm4
-        movhlps %xmm2,%xmm5
-        addsd  %xmm3,%xmm0
-        addsd  %xmm4,%xmm1
-        addsd  %xmm5,%xmm2 ## sum is in low xmm0-xmm2 
-
-        ## increment i force 
-        movsd  (%edi,%ecx,8),%xmm3
-        movsd  8(%edi,%ecx,8),%xmm4
-        movsd  16(%edi,%ecx,8),%xmm5
-        addsd  %xmm0,%xmm3
-        addsd  %xmm1,%xmm4
-        addsd  %xmm2,%xmm5
-        movsd  %xmm3,(%edi,%ecx,8)
-        movsd  %xmm4,8(%edi,%ecx,8)
-        movsd  %xmm5,16(%edi,%ecx,8)
-
-        ## increment fshift force  
-        movsd  (%esi,%edx,8),%xmm3
-        movsd  8(%esi,%edx,8),%xmm4
-        movsd  16(%esi,%edx,8),%xmm5
-        addsd  %xmm0,%xmm3
-        addsd  %xmm1,%xmm4
-        addsd  %xmm2,%xmm5
-        movsd  %xmm3,(%esi,%edx,8)
-        movsd  %xmm4,8(%esi,%edx,8)
-        movsd  %xmm5,16(%esi,%edx,8)
-
-        ## get n from stack
-        movl nb400_n(%esp),%esi
-        ## get group index for i particle 
-        movl  nb400_gid(%ebp),%edx              ## base of gid[]
-        movl  (%edx,%esi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total potential energy and update it 
-        movapd nb400_vctot(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now 
-
-        ## add earlier value from mem 
-        movl  nb400_Vc(%ebp),%eax
-        addsd (%eax,%edx,8),%xmm7
-        ## move back to mem 
-        movsd %xmm7,(%eax,%edx,8)
-
-        ## accumulate dVda and update it 
-        movapd nb400_dvdasum(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now 
-
-        movl nb400_ii(%esp),%edx
-        movl nb400_dvda(%ebp),%eax
-        addsd (%eax,%edx,8),%xmm7
-        movsd %xmm7,(%eax,%edx,8)
-
-        ## finish if last 
-        movl nb400_nn1(%esp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx
-        jz _nb_kernel400_ia32_sse2.nb400_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb400_n(%esp)
-        jmp _nb_kernel400_ia32_sse2.nb400_outer
-_nb_kernel400_ia32_sse2.nb400_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb400_nri(%esp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx
-        jz _nb_kernel400_ia32_sse2.nb400_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel400_ia32_sse2.nb400_threadloop
-_nb_kernel400_ia32_sse2.nb400_end: 
-        emms
-
-        movl nb400_nouter(%esp),%eax
-        movl nb400_ninner(%esp),%ebx
-        movl nb400_outeriter(%ebp),%ecx
-        movl nb400_inneriter(%ebp),%edx
-        movl %eax,(%ecx)
-        movl %ebx,(%edx)
-
-        movl nb400_salign(%esp),%eax
-        addl %eax,%esp
-        addl $388,%esp
-        popl %edi
-        popl %esi
-        popl %edx
-        popl %ecx
-        popl %ebx
-        popl %eax
-        leave
-        ret
-
-
-
-
-
-
-.globl nb_kernel400nf_ia32_sse2
-.globl _nb_kernel400nf_ia32_sse2
-nb_kernel400nf_ia32_sse2:       
-_nb_kernel400nf_ia32_sse2:      
-.set nb400nf_p_nri, 8
-.set nb400nf_iinr, 12
-.set nb400nf_jindex, 16
-.set nb400nf_jjnr, 20
-.set nb400nf_shift, 24
-.set nb400nf_shiftvec, 28
-.set nb400nf_fshift, 32
-.set nb400nf_gid, 36
-.set nb400nf_pos, 40
-.set nb400nf_faction, 44
-.set nb400nf_charge, 48
-.set nb400nf_p_facel, 52
-.set nb400nf_argkrf, 56
-.set nb400nf_argcrf, 60
-.set nb400nf_Vc, 64
-.set nb400nf_type, 68
-.set nb400nf_p_ntype, 72
-.set nb400nf_vdwparam, 76
-.set nb400nf_Vvdw, 80
-.set nb400nf_p_tabscale, 84
-.set nb400nf_VFtab, 88
-.set nb400nf_invsqrta, 92
-.set nb400nf_dvda, 96
-.set nb400nf_p_gbtabscale, 100
-.set nb400nf_GBtab, 104
-.set nb400nf_p_nthreads, 108
-.set nb400nf_count, 112
-.set nb400nf_mtx, 116
-.set nb400nf_outeriter, 120
-.set nb400nf_inneriter, 124
-.set nb400nf_work, 128
-        ## stack offsets for local variables  
-        ## bottom of stack is cache-aligned for sse2 use 
-.set nb400nf_ix, 0
-.set nb400nf_iy, 16
-.set nb400nf_iz, 32
-.set nb400nf_iq, 48
-.set nb400nf_gbtsc, 64
-.set nb400nf_qq, 80
-.set nb400nf_vctot, 96
-.set nb400nf_half, 112
-.set nb400nf_three, 128
-.set nb400nf_isai, 144
-.set nb400nf_isaprod, 160
-.set nb400nf_gbscale, 176
-.set nb400nf_is3, 192
-.set nb400nf_ii3, 196
-.set nb400nf_innerjjnr, 200
-.set nb400nf_innerk, 204
-.set nb400nf_n, 208
-.set nb400nf_nn1, 212
-.set nb400nf_nri, 216
-.set nb400nf_facel, 224                       ## uses 8 bytes
-.set nb400nf_nouter, 232
-.set nb400nf_ninner, 236
-.set nb400nf_salign, 240
-        pushl %ebp
-        movl %esp,%ebp
-        pushl %eax
-        pushl %ebx
-        pushl %ecx
-        pushl %edx
-        pushl %esi
-        pushl %edi
-        subl $244,%esp          ## local stack space 
-        movl %esp,%eax
-        andl $0xf,%eax
-        subl %eax,%esp
-        movl %eax,nb400nf_salign(%esp)
-
-        emms
-
-        ## Move args passed by reference to stack
-        movl nb400nf_p_nri(%ebp),%ecx
-        movl nb400nf_p_facel(%ebp),%esi
-        movl (%ecx),%ecx
-        movsd (%esi),%xmm7
-        movl %ecx,nb400nf_nri(%esp)
-        movsd %xmm7,nb400nf_facel(%esp)
-
-        ## zero iteration counters
-        movl $0,%eax
-        movl %eax,nb400nf_nouter(%esp)
-        movl %eax,nb400nf_ninner(%esp)
-
-
-        movl nb400nf_p_gbtabscale(%ebp),%eax
-        movsd (%eax),%xmm3
-        shufpd $0,%xmm3,%xmm3
-        movapd %xmm3,nb400nf_gbtsc(%esp)
-
-        ## create constant floating-point factors on stack
-        movl $0x00000000,%eax   ## lower half of double 0.5 IEEE (hex)
-        movl $0x3fe00000,%ebx
-        movl %eax,nb400nf_half(%esp)
-        movl %ebx,nb400nf_half+4(%esp)
-        movsd nb400nf_half(%esp),%xmm1
-        shufpd $0,%xmm1,%xmm1  ## splat to all elements
-        movapd %xmm1,%xmm3
-        addpd  %xmm3,%xmm3      ## 1.0
-        movapd %xmm3,%xmm2
-        addpd  %xmm2,%xmm2      ## 2.0
-        addpd  %xmm2,%xmm3      ## 3.0
-        movapd %xmm1,nb400nf_half(%esp)
-        movapd %xmm3,nb400nf_three(%esp)
-
-_nb_kernel400nf_ia32_sse2.nb400nf_threadloop: 
-        movl  nb400nf_count(%ebp),%esi            ## pointer to sync counter
-        movl  (%esi),%eax
-_nb_kernel400nf_ia32_sse2.nb400nf_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+10
-        lock 
-        cmpxchgl %ebx,(%esi)                    ## write nn1 to *counter,
-                                                ## if it hasnt changed.
-                                                ## or reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel400nf_ia32_sse2.nb400nf_spinlock
-
-        ## if(nn1>nri) nn1=nri
-        movl nb400nf_nri(%esp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx
-        cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb400nf_n(%esp)
-        movl %ebx,nb400nf_nn1(%esp)
-        subl %eax,%ebx                          ## calc number of outer lists
-        movl %eax,%esi                          ## copy n to esi
-        jg  _nb_kernel400nf_ia32_sse2.nb400nf_outerstart
-        jmp _nb_kernel400nf_ia32_sse2.nb400nf_end
-
-_nb_kernel400nf_ia32_sse2.nb400nf_outerstart: 
-        ## ebx contains number of outer iterations
-        addl nb400nf_nouter(%esp),%ebx
-        movl %ebx,nb400nf_nouter(%esp)
-
-_nb_kernel400nf_ia32_sse2.nb400nf_outer: 
-        movl  nb400nf_shift(%ebp),%eax        ## eax = pointer into shift[] 
-        movl  (%eax,%esi,4),%ebx        ## ebx=shift[n] 
-
-        leal  (%ebx,%ebx,2),%ebx    ## ebx=3*is 
-        movl  %ebx,nb400nf_is3(%esp)            ## store is3 
-
-        movl  nb400nf_shiftvec(%ebp),%eax     ## eax = base of shiftvec[] 
-
-        movsd (%eax,%ebx,8),%xmm0
-        movsd 8(%eax,%ebx,8),%xmm1
-        movsd 16(%eax,%ebx,8),%xmm2
-
-        movl  nb400nf_iinr(%ebp),%ecx         ## ecx = pointer into iinr[]      
-        movl  (%ecx,%esi,4),%ebx    ## ebx =ii 
-
-        movl  nb400nf_charge(%ebp),%edx
-        movsd (%edx,%ebx,8),%xmm3
-        mulsd nb400nf_facel(%esp),%xmm3
-        shufpd $0,%xmm3,%xmm3
-
-        movl  nb400nf_invsqrta(%ebp),%edx       ## load invsqrta[ii]
-        movsd (%edx,%ebx,8),%xmm4
-        shufpd $0,%xmm4,%xmm4
-
-        leal  (%ebx,%ebx,2),%ebx        ## ebx = 3*ii=ii3 
-        movl  nb400nf_pos(%ebp),%eax      ## eax = base of pos[]  
-
-        addsd (%eax,%ebx,8),%xmm0
-        addsd 8(%eax,%ebx,8),%xmm1
-        addsd 16(%eax,%ebx,8),%xmm2
-
-        movapd %xmm3,nb400nf_iq(%esp)
-        movapd %xmm4,nb400nf_isai(%esp)
-
-        shufpd $0,%xmm0,%xmm0
-        shufpd $0,%xmm1,%xmm1
-        shufpd $0,%xmm2,%xmm2
-
-        movapd %xmm0,nb400nf_ix(%esp)
-        movapd %xmm1,nb400nf_iy(%esp)
-        movapd %xmm2,nb400nf_iz(%esp)
-
-        movl  %ebx,nb400nf_ii3(%esp)
-
-        ## clear vctot
-        xorpd %xmm4,%xmm4
-        movapd %xmm4,nb400nf_vctot(%esp)
-
-        movl  nb400nf_jindex(%ebp),%eax
-        movl  (%eax,%esi,4),%ecx             ## jindex[n] 
-        movl  4(%eax,%esi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movl  nb400nf_pos(%ebp),%esi
-        movl  nb400nf_faction(%ebp),%edi
-        movl  nb400nf_jjnr(%ebp),%eax
-        shll  $2,%ecx
-        addl  %ecx,%eax
-        movl  %eax,nb400nf_innerjjnr(%esp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $2,%edx
-        addl  nb400nf_ninner(%esp),%ecx
-        movl  %ecx,nb400nf_ninner(%esp)
-        addl  $0,%edx
-        movl  %edx,nb400nf_innerk(%esp)      ## number of innerloop atoms 
-        jge   _nb_kernel400nf_ia32_sse2.nb400nf_unroll_loop
-        jmp   _nb_kernel400nf_ia32_sse2.nb400nf_checksingle
-_nb_kernel400nf_ia32_sse2.nb400nf_unroll_loop: 
-        ## twice unrolled innerloop here 
-        movl  nb400nf_innerjjnr(%esp),%edx     ## pointer to jjnr[k] 
-        movl  (%edx),%eax
-        movl  4(%edx),%ebx
-        addl $8,nb400nf_innerjjnr(%esp)                 ## advance pointer (unrolled 2) 
-
-        ## load isa2
-        movl nb400nf_invsqrta(%ebp),%esi
-        movlpd (%esi,%eax,8),%xmm2
-        movhpd (%esi,%ebx,8),%xmm2
-        mulpd  nb400nf_isai(%esp),%xmm2
-        movapd %xmm2,nb400nf_isaprod(%esp)
-        movapd %xmm2,%xmm1
-        mulpd nb400nf_gbtsc(%esp),%xmm1
-        movapd %xmm1,nb400nf_gbscale(%esp)
-
-        movl nb400nf_charge(%ebp),%esi     ## base of charge[] 
-        movlpd (%esi,%eax,8),%xmm3
-        movhpd (%esi,%ebx,8),%xmm3
-
-        mulpd nb400nf_iq(%esp),%xmm2
-        mulpd  %xmm2,%xmm3
-        movapd %xmm3,nb400nf_qq(%esp)
-
-        movl nb400nf_pos(%ebp),%esi             ## base of pos[] 
-
-        leal  (%eax,%eax,2),%eax     ## replace jnr with j3 
-        leal  (%ebx,%ebx,2),%ebx
-
-        ## move two coordinates to xmm0-xmm2 
-        movlpd (%esi,%eax,8),%xmm0
-        movlpd 8(%esi,%eax,8),%xmm1
-        movlpd 16(%esi,%eax,8),%xmm2
-        movhpd (%esi,%ebx,8),%xmm0
-        movhpd 8(%esi,%ebx,8),%xmm1
-        movhpd 16(%esi,%ebx,8),%xmm2
-
-        movl   nb400nf_faction(%ebp),%edi
-
-        ## move nb400nf_ix-iz to xmm4-xmm6 
-        movapd nb400nf_ix(%esp),%xmm4
-        movapd nb400nf_iy(%esp),%xmm5
-        movapd nb400nf_iz(%esp),%xmm6
-
-        ## calc dr 
-        subpd %xmm0,%xmm4
-        subpd %xmm1,%xmm5
-        subpd %xmm2,%xmm6
-
-        ## square it 
-        mulpd %xmm4,%xmm4
-        mulpd %xmm5,%xmm5
-        mulpd %xmm6,%xmm6
-        addpd %xmm5,%xmm4
-        addpd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        cvtpd2ps %xmm4,%xmm5
-        rsqrtps %xmm5,%xmm5
-        cvtps2pd %xmm5,%xmm2    ## lu in low xmm2 
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulpd %xmm2,%xmm2       ## lu*lu 
-        movapd nb400nf_three(%esp),%xmm1
-        mulpd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb400nf_half(%esp),%xmm0
-        subpd %xmm2,%xmm1       ## 30-rsq*lu*lu 
-        mulpd %xmm5,%xmm1
-        mulpd %xmm0,%xmm1       ## xmm0=iter1 of rinv (new lu) 
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulpd %xmm1,%xmm1       ## lu*lu 
-        movapd nb400nf_three(%esp),%xmm2
-        mulpd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb400nf_half(%esp),%xmm0
-        subpd %xmm1,%xmm2       ## 30-rsq*lu*lu 
-        mulpd %xmm5,%xmm2
-        mulpd %xmm2,%xmm0       ## xmm0=iter2 of rinv (new lu) 
-        mulpd %xmm0,%xmm4       ## xmm4=r 
-        mulpd nb400nf_gbscale(%esp),%xmm4
-
-        cvttpd2pi %xmm4,%mm6    ## mm6 = lu idx 
-        cvtpi2pd %mm6,%xmm5
-        subpd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-        movapd %xmm1,%xmm2
-        mulpd  %xmm2,%xmm2      ## xmm2=eps2 
-
-        pslld $2,%mm6           ## idx *= 4 
-
-        movd %eax,%mm0
-        movd %ebx,%mm1
-
-        movl nb400nf_GBtab(%ebp),%esi
-        movd %mm6,%eax
-        psrlq $32,%mm6
-        movd %mm6,%ebx          ## indices in eax/ebx 
-
-        movapd (%esi,%eax,8),%xmm4      ## Y1 F1        
-        movapd (%esi,%ebx,8),%xmm3      ## Y2 F2 
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 Y2 
-        unpckhpd %xmm3,%xmm5    ## F1 F2 
-
-        movapd 16(%esi,%eax,8),%xmm6    ## G1 H1        
-        movapd 16(%esi,%ebx,8),%xmm3    ## G2 H2 
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 G2 
-        unpckhpd %xmm3,%xmm7    ## H1 H2 
-        ## coulomb table ready, in xmm4-xmm7            
-        mulpd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulpd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addpd  %xmm6,%xmm5
-        addpd  %xmm7,%xmm5      ## xmm5=Fp      
-        movapd nb400nf_qq(%esp),%xmm3
-        mulpd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addpd  %xmm4,%xmm5 ## xmm5=VV 
-        mulpd  %xmm3,%xmm5 ## vcoul=qq*VV  
-        addpd  nb400nf_vctot(%esp),%xmm5
-        movapd %xmm5,nb400nf_vctot(%esp)
-
-        ## should we do one more iteration? 
-        subl $2,nb400nf_innerk(%esp)
-        jl    _nb_kernel400nf_ia32_sse2.nb400nf_checksingle
-        jmp   _nb_kernel400nf_ia32_sse2.nb400nf_unroll_loop
-_nb_kernel400nf_ia32_sse2.nb400nf_checksingle: 
-        movl  nb400nf_innerk(%esp),%edx
-        andl  $1,%edx
-        jnz    _nb_kernel400nf_ia32_sse2.nb400nf_dosingle
-        jmp    _nb_kernel400nf_ia32_sse2.nb400nf_updateouterdata
-_nb_kernel400nf_ia32_sse2.nb400nf_dosingle: 
-        movl nb400nf_charge(%ebp),%esi
-        movl nb400nf_invsqrta(%ebp),%edx
-        movl nb400nf_pos(%ebp),%edi
-        movl  nb400nf_innerjjnr(%esp),%ecx
-        movl  (%ecx),%eax
-        xorpd  %xmm6,%xmm6
-        movapd %xmm6,%xmm7
-        movsd  (%edx,%eax,8),%xmm7
-        movlpd (%esi,%eax,8),%xmm6      ## xmm6(0) has the charge
-        mulsd  nb400nf_isai(%esp),%xmm7
-        movapd %xmm7,nb400nf_isaprod(%esp)
-        movapd %xmm7,%xmm1
-        mulpd nb400nf_gbtsc(%esp),%xmm1
-        movapd %xmm1,nb400nf_gbscale(%esp)
-
-        mulsd  nb400nf_iq(%esp),%xmm7
-        mulsd  %xmm7,%xmm6
-        movapd %xmm6,nb400nf_qq(%esp)
-
-        leal  (%eax,%eax,2),%eax
-
-        ## move coordinates to xmm0-xmm2 
-        movlpd (%edi,%eax,8),%xmm0
-        movlpd 8(%edi,%eax,8),%xmm1
-        movlpd 16(%edi,%eax,8),%xmm2
-
-        ## move nb400nf_ix-iz to xmm4-xmm6 
-        movapd nb400nf_ix(%esp),%xmm4
-        movapd nb400nf_iy(%esp),%xmm5
-        movapd nb400nf_iz(%esp),%xmm6
-
-        ## calc dr 
-        subsd %xmm0,%xmm4
-        subsd %xmm1,%xmm5
-        subsd %xmm2,%xmm6
-
-        ## square it 
-        mulsd %xmm4,%xmm4
-        mulsd %xmm5,%xmm5
-        mulsd %xmm6,%xmm6
-        addsd %xmm5,%xmm4
-        addsd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        cvtsd2ss %xmm4,%xmm5
-        rsqrtss %xmm5,%xmm5
-        cvtss2sd %xmm5,%xmm2    ## lu in low xmm2 
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulsd %xmm2,%xmm2       ## lu*lu 
-        movapd nb400nf_three(%esp),%xmm1
-        mulsd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb400nf_half(%esp),%xmm0
-        subsd %xmm2,%xmm1       ## 30-rsq*lu*lu 
-        mulsd %xmm5,%xmm1
-        mulsd %xmm0,%xmm1       ## xmm0=iter1 of rinv (new lu) 
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulsd %xmm1,%xmm1       ## lu*lu 
-        movapd nb400nf_three(%esp),%xmm2
-        mulsd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb400nf_half(%esp),%xmm0
-        subsd %xmm1,%xmm2       ## 30-rsq*lu*lu 
-        mulsd %xmm5,%xmm2
-        mulsd %xmm2,%xmm0       ## xmm0=iter2 of rinv (new lu) 
-
-        mulsd %xmm0,%xmm4       ## xmm4=r 
-        mulsd nb400nf_gbscale(%esp),%xmm4
-
-        movd %eax,%mm0
-
-        cvttsd2si %xmm4,%eax    ## mm6 = lu idx 
-        cvtsi2sd %eax,%xmm5
-        subsd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-        movapd %xmm1,%xmm2
-        mulsd  %xmm2,%xmm2      ## xmm2=eps2 
-
-        shll $2,%eax            ## idx *= 4 
-
-        movl nb400nf_GBtab(%ebp),%esi
-
-        ## Coulomb 
-        movapd (%esi,%eax,8),%xmm4      ## Y1 F1 
-        xorpd %xmm3,%xmm3
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1  
-        unpckhpd %xmm3,%xmm5    ## F1  
-
-        movapd 16(%esi,%eax,8),%xmm6    ## G1 H1 
-        xorpd %xmm3,%xmm3
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1  
-        unpckhpd %xmm3,%xmm7    ## H1   
-        ## table ready in xmm4-xmm7 
-
-        mulsd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulsd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addsd  %xmm6,%xmm5
-        addsd  %xmm7,%xmm5      ## xmm5=Fp      
-        movapd nb400nf_qq(%esp),%xmm3
-        mulsd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addsd  %xmm4,%xmm5 ## xmm5=VV 
-        mulsd  %xmm3,%xmm5 ## vcoul=qq*VV  
-        addsd  nb400nf_vctot(%esp),%xmm5
-        movsd %xmm5,nb400nf_vctot(%esp)
-
-_nb_kernel400nf_ia32_sse2.nb400nf_updateouterdata: 
-        ## get n from stack
-        movl nb400nf_n(%esp),%esi
-        ## get group index for i particle 
-        movl  nb400nf_gid(%ebp),%edx            ## base of gid[]
-        movl  (%edx,%esi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total potential energy and update it 
-        movapd nb400nf_vctot(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now 
-
-        ## add earlier value from mem 
-        movl  nb400nf_Vc(%ebp),%eax
-        addsd (%eax,%edx,8),%xmm7
-        ## move back to mem 
-        movsd %xmm7,(%eax,%edx,8)
-
-        ## finish if last 
-        movl nb400nf_nn1(%esp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx
-        jz _nb_kernel400nf_ia32_sse2.nb400nf_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb400nf_n(%esp)
-        jmp _nb_kernel400nf_ia32_sse2.nb400nf_outer
-_nb_kernel400nf_ia32_sse2.nb400nf_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb400nf_nri(%esp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx
-        jz _nb_kernel400nf_ia32_sse2.nb400nf_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel400nf_ia32_sse2.nb400nf_threadloop
-_nb_kernel400nf_ia32_sse2.nb400nf_end: 
-        emms
-
-        movl nb400nf_nouter(%esp),%eax
-        movl nb400nf_ninner(%esp),%ebx
-        movl nb400nf_outeriter(%ebp),%ecx
-        movl nb400nf_inneriter(%ebp),%edx
-        movl %eax,(%ecx)
-        movl %ebx,(%edx)
-
-        movl nb400nf_salign(%esp),%eax
-        addl %eax,%esp
-        addl $244,%esp
-        popl %edi
-        popl %esi
-        popl %edx
-        popl %ecx
-        popl %ebx
-        popl %eax
-        leave
-        ret
-
-
-
-
diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel410_ia32_sse2.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel410_ia32_sse2.intel_syntax.s
deleted file mode 100644
index c5010b4e62..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel410_ia32_sse2.intel_syntax.s
+++ /dev/null
@@ -1,1530 +0,0 @@
-;#
-;#
-;# Gromacs 4.0                         Copyright (c) 1991-2003 
-;# David van der Spoel, Erik Lindahl
-;#
-;# This program is free software; you can redistribute it and/or
-;# modify it under the terms of the GNU General Public License
-;# as published by the Free Software Foundation; either version 2
-;# of the License, or (at your option) any later version.
-;#
-;# To help us fund GROMACS development, we humbly ask that you cite
-;# the research papers on the package. Check out http://www.gromacs.org
-;# 
-;# And Hey:
-;# Gnomes, ROck Monsters And Chili Sauce
-;#
-
-;# These files require GNU binutils 2.10 or later, since we
-;# use intel syntax for portability, or a recent version 
-;# of NASM that understands Extended 3DNow and SSE2 instructions.
-;# (NASM is normally only used with MS Visual C++).
-;# Since NASM and gnu as disagree on some definitions and use 
-;# completely different preprocessing options I have to introduce a
-;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
-;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
-;# reason why all comments need both symbols...
-;# The source is written for GNU as, with intel syntax. When you use
-;# NASM we redefine a couple of things. The false if-statement around 
-;# the following code is seen by GNU as, but NASM doesn't see it, so 
-;# the code inside is read by NASM but not gcc.
-
-; .if 0    # block below only read by NASM
-%define .section	section
-%define .long		dd
-%define .align		align
-%define .globl		global
-;# NASM only wants 'dword', not 'dword ptr'.
-%define ptr
-%macro .equiv                  2
-   %1 equ %2
-%endmacro
-; .endif                   # End of NASM-specific block
-; .intel_syntax noprefix   # Line only read by gnu as
-
-
-
-
-
-.globl nb_kernel410_ia32_sse2
-.globl _nb_kernel410_ia32_sse2
-nb_kernel410_ia32_sse2:	
-_nb_kernel410_ia32_sse2:	
-.equiv          nb410_p_nri,            8
-.equiv          nb410_iinr,             12
-.equiv          nb410_jindex,           16
-.equiv          nb410_jjnr,             20
-.equiv          nb410_shift,            24
-.equiv          nb410_shiftvec,         28
-.equiv          nb410_fshift,           32
-.equiv          nb410_gid,              36
-.equiv          nb410_pos,              40
-.equiv          nb410_faction,          44
-.equiv          nb410_charge,           48
-.equiv          nb410_p_facel,          52
-.equiv          nb410_argkrf,           56
-.equiv          nb410_argcrf,           60
-.equiv          nb410_Vc,               64
-.equiv          nb410_type,             68
-.equiv          nb410_p_ntype,          72
-.equiv          nb410_vdwparam,         76
-.equiv          nb410_Vvdw,             80
-.equiv          nb410_p_tabscale,       84
-.equiv          nb410_VFtab,            88
-.equiv          nb410_invsqrta,         92
-.equiv          nb410_dvda,             96
-.equiv          nb410_p_gbtabscale,     100
-.equiv          nb410_GBtab,            104
-.equiv          nb410_p_nthreads,       108
-.equiv          nb410_count,            112
-.equiv          nb410_mtx,              116
-.equiv          nb410_outeriter,        120
-.equiv          nb410_inneriter,        124
-.equiv          nb410_work,             128
-	;# stack offsets for local variables  
-	;# bottom of stack is cache-aligned for sse2 use 
-.equiv          nb410_ix,               0
-.equiv          nb410_iy,               16
-.equiv          nb410_iz,               32
-.equiv          nb410_iq,               48
-.equiv          nb410_dx,               64
-.equiv          nb410_dy,               80
-.equiv          nb410_dz,               96
-.equiv          nb410_two,              112
-.equiv          nb410_six,              128
-.equiv          nb410_twelve,           144
-.equiv          nb410_gbtsc,            160
-.equiv          nb410_qq,               176
-.equiv          nb410_c6,               192
-.equiv          nb410_c12,              208
-.equiv          nb410_fscal,            224
-.equiv          nb410_vctot,            240
-.equiv          nb410_Vvdwtot,          256
-.equiv          nb410_fix,              272
-.equiv          nb410_fiy,              288
-.equiv          nb410_fiz,              304
-.equiv          nb410_half,             320
-.equiv          nb410_three,            336
-.equiv          nb410_r,                352
-.equiv          nb410_isai,             368
-.equiv          nb410_isaprod,          384
-.equiv          nb410_dvdasum,          400
-.equiv          nb410_gbscale,          416
-.equiv          nb410_ii,               432
-.equiv          nb410_is3,              436
-.equiv          nb410_ii3,              440
-.equiv          nb410_ntia,             444
-.equiv          nb410_innerjjnr,        448
-.equiv          nb410_innerk,           452
-.equiv          nb410_n,                456
-.equiv          nb410_nn1,              460
-.equiv          nb410_nri,              464
-.equiv          nb410_facel,            472   ;# uses 8 bytes
-.equiv          nb410_ntype,            480
-.equiv          nb410_nouter,           484
-.equiv          nb410_ninner,           488
-.equiv          nb410_salign,           492
-	push ebp
-	mov ebp,esp	
-    	push eax
-    	push ebx
-    	push ecx
-    	push edx
-	push esi
-	push edi
-	sub esp, 496		;# local stack space 
-	mov  eax, esp
-	and  eax, 0xf
-	sub esp, eax
-	mov [esp + nb410_salign], eax
-
-	emms
-
-	;# Move args passed by reference to stack
-	mov ecx, [ebp + nb410_p_nri]
-	mov esi, [ebp + nb410_p_facel]
-	mov edi, [ebp + nb410_p_ntype]
-	mov ecx, [ecx]
-	movsd xmm7, [esi]
-	mov edi, [edi]
-	mov [esp + nb410_nri], ecx
-	movsd [esp + nb410_facel], xmm7
-	mov [esp + nb410_ntype], edi
-
-	;# zero iteration counters
-	mov eax, 0
-	mov [esp + nb410_nouter], eax
-	mov [esp + nb410_ninner], eax
-
-
-	mov eax, [ebp + nb410_p_gbtabscale]
-	movsd xmm5, [eax]
-	shufpd xmm5, xmm5, 0
-	movapd [esp + nb410_gbtsc], xmm5
-	;# create constant floating-point factors on stack
-	mov eax, 0x00000000     ;# lower half of double 0.5 IEEE (hex)
-	mov ebx, 0x3fe00000
-	mov [esp + nb410_half], eax
-	mov [esp + nb410_half+4], ebx
-	movsd xmm1, [esp + nb410_half]
-	shufpd xmm1, xmm1, 0    ;# splat to all elements
-	movapd xmm3, xmm1
-	addpd  xmm3, xmm3       ;# 1.0
-	movapd xmm2, xmm3
-	addpd  xmm2, xmm2       ;# 2.0
-	addpd  xmm3, xmm2	;# 3.0
-	movapd xmm4, xmm3
-	addpd  xmm4, xmm4       ;# 6.0
-	movapd xmm5, xmm4
-	addpd  xmm5, xmm5       ;# 12.0
-	movapd [esp + nb410_half], xmm1
-	movapd [esp + nb410_two], xmm2
-	movapd [esp + nb410_three], xmm3
-	movapd [esp + nb410_six], xmm4
-	movapd [esp + nb410_twelve], xmm5
-
-.nb410_threadloop:
-        mov   esi, [ebp + nb410_count]          ;# pointer to sync counter
-        mov   eax, [esi]
-.nb410_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+10
-        lock
-        cmpxchg [esi], ebx                      ;# write nn1 to *counter,
-                                                ;# if it hasnt changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb410_spinlock
-
-        ;# if(nn1>nri) nn1=nri
-        mov ecx, [esp + nb410_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [esp + nb410_n], eax
-        mov [esp + nb410_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists
-	mov esi, eax				;# copy n to esi
-        jg  .nb410_outerstart
-        jmp .nb410_end
-
-.nb410_outerstart:
-	;# ebx contains number of outer iterations
-	add ebx, [esp + nb410_nouter]
-	mov [esp + nb410_nouter], ebx
-
-.nb410_outer:
-	mov   eax, [ebp + nb410_shift]      ;# eax = pointer into shift[] 
-	mov   ebx, [eax+esi*4]		;# ebx=shift[n] 
-	
-	lea   ebx, [ebx + ebx*2]    ;# ebx=3*is 
-	mov   [esp + nb410_is3],ebx    	;# store is3 
-
-	mov   eax, [ebp + nb410_shiftvec]   ;# eax = base of shiftvec[] 
-
-	movsd xmm0, [eax + ebx*8]
-	movsd xmm1, [eax + ebx*8 + 8]
-	movsd xmm2, [eax + ebx*8 + 16] 
-
-	mov   ecx, [ebp + nb410_iinr]       ;# ecx = pointer into iinr[] 	
-	mov   ebx, [ecx+esi*4]	    ;# ebx =ii 
-	mov   [esp + nb410_ii], ebx
-
-	mov   edx, [ebp + nb410_charge]
-	movsd xmm3, [edx + ebx*8]	
-	mulsd xmm3, [esp + nb410_facel]
-	shufpd xmm3, xmm3, 0
-
-	mov   edx, [ebp + nb410_invsqrta]	;# load invsqrta[ii]
-	movsd xmm4, [edx + ebx*8]
-	shufpd xmm4, xmm4, 0
-
-    	mov   edx, [ebp + nb410_type] 
-    	mov   edx, [edx + ebx*4]
-    	imul  edx, [esp + nb410_ntype]
-    	shl   edx, 1
-    	mov   [esp + nb410_ntia], edx
-		
-	lea   ebx, [ebx + ebx*2]	;# ebx = 3*ii=ii3 
-	mov   eax, [ebp + nb410_pos]    ;# eax = base of pos[]  
-
-	addsd xmm0, [eax + ebx*8]
-	addsd xmm1, [eax + ebx*8 + 8]
-	addsd xmm2, [eax + ebx*8 + 16]
-
-	movapd [esp + nb410_iq], xmm3
-	movapd [esp + nb410_isai], xmm4
-
-	shufpd xmm0, xmm0, 0
-	shufpd xmm1, xmm1, 0
-	shufpd xmm2, xmm2, 0
-
-	movapd [esp + nb410_ix], xmm0
-	movapd [esp + nb410_iy], xmm1
-	movapd [esp + nb410_iz], xmm2
-
-	mov   [esp + nb410_ii3], ebx
-	
-	;# clear vctot and i forces 
-	xorpd xmm4, xmm4
-	movapd [esp + nb410_vctot], xmm4
-	movapd [esp + nb410_Vvdwtot], xmm4
-	movapd [esp + nb410_dvdasum], xmm4
-	movapd [esp + nb410_fix], xmm4
-	movapd [esp + nb410_fiy], xmm4
-	movapd [esp + nb410_fiz], xmm4
-	
-	mov   eax, [ebp + nb410_jindex]
-	mov   ecx, [eax + esi*4]	     ;# jindex[n] 
-	mov   edx, [eax + esi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   esi, [ebp + nb410_pos]
-	mov   edi, [ebp + nb410_faction]	
-	mov   eax, [ebp + nb410_jjnr]
-	shl   ecx, 2
-	add   eax, ecx
-	mov   [esp + nb410_innerjjnr], eax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  2
-	add   ecx, [esp + nb410_ninner]
-	mov   [esp + nb410_ninner], ecx
-	add   edx, 0
-	mov   [esp + nb410_innerk], edx    ;# number of innerloop atoms 
-	jge   .nb410_unroll_loop
-	jmp   .nb410_checksingle
-.nb410_unroll_loop:	
-	;# twice unrolled innerloop here 
-	mov   edx, [esp + nb410_innerjjnr]     ;# pointer to jjnr[k] 
-	mov   eax, [edx]	
-	mov   ebx, [edx + 4]              
-	add dword ptr [esp + nb410_innerjjnr],  8 ;# advance pointer (unrolled 2) 
-
-	;# load isaj
-	mov esi, [ebp + nb410_invsqrta]
-	movlpd xmm2, [esi + eax*8]
-	movhpd xmm2, [esi + ebx*8]
-	mulpd  xmm2, [esp + nb410_isai]
-	movapd [esp + nb410_isaprod], xmm2	
-	movapd xmm1, xmm2
-	mulpd xmm1, [esp + nb410_gbtsc]
-	movapd [esp + nb410_gbscale], xmm1
-	
-	mov esi, [ebp + nb410_charge]    ;# base of charge[] 
-	movlpd xmm3, [esi + eax*8]
-	movhpd xmm3, [esi + ebx*8]
-
-	mulpd xmm2, [esp + nb410_iq]
-	mulpd  xmm3, xmm2
-	movapd [esp + nb410_qq], xmm3	
-	
-	movd  mm0, eax		;# use mmx registers as temp storage 
-	movd  mm1, ebx
-	
-	mov esi, [ebp + nb410_type]
-	mov eax, [esi + eax*4]
-	mov ebx, [esi + ebx*4]
-	mov esi, [ebp + nb410_vdwparam]
-	shl eax, 1
-	shl ebx, 1
-	mov edi, [esp + nb410_ntia]
-	add eax, edi
-	add ebx, edi
-
-	movlpd xmm6, [esi + eax*8]	;# c6a
-	movlpd xmm7, [esi + ebx*8]	;# c6b
-	movhpd xmm6, [esi + eax*8 + 8]	;# c6a c12a 
-	movhpd xmm7, [esi + ebx*8 + 8]	;# c6b c12b 
-
-	movapd xmm4, xmm6
-	unpcklpd xmm4, xmm7
-	unpckhpd xmm6, xmm7
-	
-	movd  eax, mm0
-	movd  ebx, mm1
-	movapd [esp + nb410_c6], xmm4
-	movapd [esp + nb410_c12], xmm6
-	
-	mov esi, [ebp + nb410_pos]       ;# base of pos[] 
-
-	movd  mm2, eax
-	movd  mm3, ebx
-	lea   eax, [eax + eax*2]     ;# replace jnr with j3 
-	lea   ebx, [ebx + ebx*2]	
-
-	;# move two coordinates to xmm0-xmm2 	
-	movlpd xmm0, [esi + eax*8]
-	movlpd xmm1, [esi + eax*8 + 8]
-	movlpd xmm2, [esi + eax*8 + 16]
-	movhpd xmm0, [esi + ebx*8]
-	movhpd xmm1, [esi + ebx*8 + 8]
-	movhpd xmm2, [esi + ebx*8 + 16]		
-	
-	;# move ix-iz to xmm4-xmm6 
-	movapd xmm4, [esp + nb410_ix]
-	movapd xmm5, [esp + nb410_iy]
-	movapd xmm6, [esp + nb410_iz]
-
-	;# calc dr 
-	subpd xmm4, xmm0
-	subpd xmm5, xmm1
-	subpd xmm6, xmm2
-
-	;# store dr 
-	movapd [esp + nb410_dx], xmm4
-	movapd [esp + nb410_dy], xmm5
-	movapd [esp + nb410_dz], xmm6
-	;# square it 
-	mulpd xmm4,xmm4
-	mulpd xmm5,xmm5
-	mulpd xmm6,xmm6
-	addpd xmm4, xmm5
-	addpd xmm4, xmm6
-	;# rsq in xmm4 
-
-	cvtpd2ps xmm5, xmm4	
-	rsqrtps xmm5, xmm5
-	cvtps2pd xmm2, xmm5	;# lu in low xmm2 
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulpd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [esp + nb410_three]
-	mulpd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb410_half]
-	subpd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulpd xmm1, xmm5	
-	mulpd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulpd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [esp + nb410_three]
-	mulpd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb410_half]
-	subpd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulpd xmm2, xmm5	
-	mulpd xmm0, xmm2	;# xmm0=rinv 
-	
-	mulpd xmm4, xmm0	;# xmm4=r 
-	movapd [esp + nb410_r], xmm4
-	mulpd xmm4, [esp + nb410_gbscale]
-
-	cvttpd2pi mm6, xmm4	;# mm6 = lu idx 
-	cvtpi2pd xmm5, mm6
-	subpd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	movapd xmm2, xmm1	
-	mulpd  xmm2, xmm2	;# xmm2=eps2 
-	
-	pslld mm6, 2		;# idx *= 4 
-	
-	movd mm0, eax	
-	movd mm1, ebx
-
-	mov  esi, [ebp + nb410_GBtab]
-	movd eax, mm6
-	psrlq mm6, 32
-	movd ebx, mm6		;# indices in eax/ebx 
-
-	movapd xmm4, [esi + eax*8]	;# Y1 F1 	
-	movapd xmm3, [esi + ebx*8]	;# Y2 F2 
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 Y2 
-	unpckhpd xmm5, xmm3	;# F1 F2 
-
-	movapd xmm6, [esi + eax*8 + 16]	;# G1 H1 	
-	movapd xmm3, [esi + ebx*8 + 16]	;# G2 H2 
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 G2 
-	unpckhpd xmm7, xmm3	;# H1 H2 
-	;# coulomb table ready, in xmm4-xmm7  		
-	mulpd  xmm6, xmm1	;# xmm6=Geps 
-	mulpd  xmm7, xmm2	;# xmm7=Heps2 
-	addpd  xmm5, xmm6
-	addpd  xmm5, xmm7	;# xmm5=Fp 	
-	mulpd  xmm7, [esp + nb410_two]	;# two*Heps2 
-	movapd xmm3, [esp + nb410_qq]
-	addpd  xmm7, xmm6
-	addpd  xmm7, xmm5 ;# xmm7=FF 
-	mulpd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addpd  xmm5, xmm4 ;# xmm5=VV 
-	mulpd  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulpd  xmm3, xmm7 ;# fijC=FF*qq 
-	;# get jnr from regs
-	movd ecx, mm2
-	movd edx, mm3
-	mov esi, [ebp + nb410_dvda]
-	
-	;# Calculate dVda
-	xorpd xmm7, xmm7
-	mulpd xmm3, [esp + nb410_gbscale]
-	movapd xmm6, xmm3
-	mulpd  xmm6, [esp + nb410_r]
-	addpd  xmm6, xmm5
-	addpd  xmm5, [esp + nb410_vctot]
-	movapd [esp + nb410_vctot], xmm5 
-
-	;# xmm6=(vcoul+fijC*r)
-	subpd  xmm7, xmm6
-	movapd xmm6, xmm7
-	
-	;# update dvdasum
-	addpd  xmm7, [esp + nb410_dvdasum]
-	movapd [esp + nb410_dvdasum], xmm7 
-
-	;# update j atoms dvdaj
-	movhlps xmm7, xmm6
-	addsd  xmm6, [esi + ecx*8]
-	addsd  xmm7, [esi + edx*8]
-	movsd  [esi + ecx*8], xmm6
-	movsd  [esi + edx*8], xmm7
-	
-	;# L-J 
-	movapd xmm4, xmm0
-	mulpd  xmm4, xmm0	;# xmm4=rinvsq 
-
-	movapd xmm6, xmm4
-	mulpd  xmm6, xmm4
-	
-	mulpd  xmm6, xmm4	;# xmm6=rinvsix 
-	movapd xmm4, xmm6
-	mulpd  xmm4, xmm4	;# xmm4=rinvtwelve 
-	mulpd  xmm6, [esp + nb410_c6]
-	mulpd  xmm4, [esp + nb410_c12]
-	movapd xmm7, [esp + nb410_Vvdwtot]
-	addpd  xmm7, xmm4
-	mulpd  xmm4, [esp + nb410_twelve]
-	subpd  xmm7, xmm6
-	mulpd  xmm6, [esp + nb410_six]
-	movapd [esp + nb410_Vvdwtot], xmm7
-	subpd  xmm4, xmm6
-	mulpd  xmm4, xmm0
-	subpd  xmm4, xmm3
-	mulpd  xmm4, xmm0
-
-	movapd xmm0, [esp + nb410_dx]
-	movapd xmm1, [esp + nb410_dy]
-	movapd xmm2, [esp + nb410_dz]
-
-	movd eax, mm0	
-	movd ebx, mm1
-
-	mov    edi, [ebp + nb410_faction]
-	mulpd  xmm0, xmm4
-	mulpd  xmm1, xmm4
-	mulpd  xmm2, xmm4
-	;# xmm0-xmm2 contains tx-tz (partial force) 
-	;# now update f_i 
-	movapd xmm3, [esp + nb410_fix]
-	movapd xmm4, [esp + nb410_fiy]
-	movapd xmm5, [esp + nb410_fiz]
-	addpd  xmm3, xmm0
-	addpd  xmm4, xmm1
-	addpd  xmm5, xmm2
-	movapd [esp + nb410_fix], xmm3
-	movapd [esp + nb410_fiy], xmm4
-	movapd [esp + nb410_fiz], xmm5
-	;# the fj's - start by accumulating forces from memory 
-	movlpd xmm3, [edi + eax*8]
-	movlpd xmm4, [edi + eax*8 + 8]
-	movlpd xmm5, [edi + eax*8 + 16]
-	movhpd xmm3, [edi + ebx*8]
-	movhpd xmm4, [edi + ebx*8 + 8]
-	movhpd xmm5, [edi + ebx*8 + 16]
-	subpd xmm3, xmm0
-	subpd xmm4, xmm1
-	subpd xmm5, xmm2
-	movlpd [edi + eax*8], xmm3
-	movlpd [edi + eax*8 + 8], xmm4
-	movlpd [edi + eax*8 + 16], xmm5
-	movhpd [edi + ebx*8], xmm3
-	movhpd [edi + ebx*8 + 8], xmm4
-	movhpd [edi + ebx*8 + 16], xmm5
-		
-	;# should we do one more iteration? 
-	sub dword ptr [esp + nb410_innerk],  2
-	jl    .nb410_checksingle
-	jmp   .nb410_unroll_loop
-.nb410_checksingle:
-	mov   edx, [esp + nb410_innerk]
-	and   edx, 1
-	jnz    .nb410_dosingle
-	jmp    .nb410_updateouterdata
-.nb410_dosingle:
-	mov esi, [ebp + nb410_charge]
-	mov edx, [ebp + nb410_invsqrta]
-	mov edi, [ebp + nb410_pos]
-	mov   ecx, [esp + nb410_innerjjnr]
-	mov   eax, [ecx]
-	
-	xorpd  xmm6, xmm6
-	movapd xmm7, xmm6
-	movsd  xmm7, [edx + eax*8]
-	movlpd xmm6, [esi + eax*8]	;# xmm6(0) has the charge
-	mulsd  xmm7, [esp + nb410_isai]
-	movapd [esp + nb410_isaprod], xmm7
-	movapd xmm1, xmm7
-	mulpd xmm1, [esp + nb410_gbtsc]
-	movapd [esp + nb410_gbscale], xmm1
-	
-	mulsd  xmm7, [esp + nb410_iq]
-	mulsd  xmm6, xmm7
-	movapd [esp + nb410_qq], xmm6
-		
-	movd  mm0, eax		;# use mmx registers as temp storage 
-	mov esi, [ebp + nb410_type]
-	mov eax, [esi + eax*4]
-	mov esi, [ebp + nb410_vdwparam]
-	shl eax, 1
-	mov edi, [esp + nb410_ntia]
-	add eax, edi
-
-	movlpd xmm6, [esi + eax*8]	;# c6a
-	movhpd xmm6, [esi + eax*8 + 8]	;# c6a c12a 
-	xorpd xmm7, xmm7
-	movapd xmm4, xmm6
-	unpcklpd xmm4, xmm7
-	unpckhpd xmm6, xmm7
-	
-	movd  eax, mm0
-	movapd [esp + nb410_c6], xmm4
-	movapd [esp + nb410_c12], xmm6
-	
-	mov esi, [ebp + nb410_pos]       ;# base of pos[]
-	
-	movd  mm2, eax
-	lea   eax, [eax + eax*2]     ;# replace jnr with j3 
-
-	;# move coordinates to xmm0-xmm2 	
-	movlpd xmm0, [esi + eax*8]
-	movlpd xmm1, [esi + eax*8 + 8]
-	movlpd xmm2, [esi + eax*8 + 16]
-	
-	;# move ix-iz to xmm4-xmm6 
-	movapd xmm4, [esp + nb410_ix]
-	movapd xmm5, [esp + nb410_iy]
-	movapd xmm6, [esp + nb410_iz]
-
-	;# calc dr 
-	subsd xmm4, xmm0
-	subsd xmm5, xmm1
-	subsd xmm6, xmm2
-
-	;# store dr 
-	movapd [esp + nb410_dx], xmm4
-	movapd [esp + nb410_dy], xmm5
-	movapd [esp + nb410_dz], xmm6
-	;# square it 
-	mulsd xmm4,xmm4
-	mulsd xmm5,xmm5
-	mulsd xmm6,xmm6
-	addsd xmm4, xmm5
-	addsd xmm4, xmm6
-	;# rsq in xmm4 
-
-	cvtsd2ss xmm5, xmm4	
-	rsqrtss xmm5, xmm5
-	cvtss2sd xmm2, xmm5	;# lu in low xmm2 
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulsd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [esp + nb410_three]
-	mulsd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb410_half]
-	subsd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulsd xmm1, xmm5	
-	mulsd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulsd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [esp + nb410_three]
-	mulsd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb410_half]
-	subsd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulsd xmm2, xmm5	
-	mulsd xmm0, xmm2	;# xmm0=rinv 
-	
-	mulsd xmm4, xmm0	;# xmm4=r 
-	movapd [esp + nb410_r], xmm4
-	mulsd xmm4, [esp + nb410_gbscale]
-
-	movd mm0, eax	
-	cvttsd2si eax, xmm4	;# mm6 = lu idx 
-	cvtsi2sd xmm5, eax
-	subsd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	movapd xmm2, xmm1	
-	mulsd  xmm2, xmm2	;# xmm2=eps2 
-	
-	shl eax, 2		;# idx *= 4 
-	
-	mov  esi, [ebp + nb410_GBtab]
-
-	movapd xmm4, [esi + eax*8]	;# Y1 F1 	
-	xorpd xmm3, xmm3
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 
-	unpckhpd xmm5, xmm3	;# F1 
-
-	movapd xmm6, [esi + eax*8 + 16]	;# G1 H1 	
-	xorpd xmm3, xmm3
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 
-	unpckhpd xmm7, xmm3	;# H1 
-	;# coulomb table ready, in xmm4-xmm7  		
-	mulsd  xmm6, xmm1	;# xmm6=Geps 
-	mulsd  xmm7, xmm2	;# xmm7=Heps2 
-	addsd  xmm5, xmm6
-	addsd  xmm5, xmm7	;# xmm5=Fp 	
-	mulsd  xmm7, [esp + nb410_two]	;# two*Heps2 
-	movapd xmm3, [esp + nb410_qq]
-	addsd  xmm7, xmm6
-	addsd  xmm7, xmm5 ;# xmm7=FF 
-	mulsd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addsd  xmm5, xmm4 ;# xmm5=VV 
-	mulsd  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulsd  xmm3, xmm7 ;# fijC=FF*qq 
-	;# get jnr from regs
-	movd ebx, mm2
-	mov esi, [ebp + nb410_dvda]
-	
-	;# Calculate dVda
-	xorpd xmm7, xmm7
-	mulsd xmm3, [esp + nb410_gbscale]
-	movsd xmm6, xmm3
-	mulsd  xmm6, [esp + nb410_r]
-	addsd  xmm6, xmm5
-	addsd  xmm5, [esp + nb410_vctot]
-	movsd [esp + nb410_vctot], xmm5 
-
-	;# xmm6=(vcoul+fijC*r)
-	subpd xmm7, xmm7
-	movsd xmm6, xmm7
-	
-	;# update dvdasum
-	addsd  xmm7, [esp + nb410_dvdasum]
-	movsd [esp + nb410_dvdasum], xmm7 
-
-	;# update j atoms dvdaj
-	addsd  xmm6, [esi + ebx*8]
-	movsd  [esi + ebx*8], xmm6
-	
-	;# L-J 
-	movapd xmm4, xmm0
-	mulsd  xmm4, xmm0	;# xmm4=rinvsq 
-
-
-	movapd xmm6, xmm4
-	mulsd  xmm6, xmm4
-
-	mulsd  xmm6, xmm4	;# xmm6=rinvsix 
-	movapd xmm4, xmm6
-	mulsd  xmm4, xmm4	;# xmm4=rinvtwelve 
-	mulsd  xmm6, [esp + nb410_c6]
-	mulsd  xmm4, [esp + nb410_c12]
-	movapd xmm7, [esp + nb410_Vvdwtot]
-	addsd  xmm7, xmm4
-	mulsd  xmm4, [esp + nb410_twelve]
-	subsd  xmm7, xmm6
-	mulsd  xmm6, [esp + nb410_six]
-	movlpd [esp + nb410_Vvdwtot], xmm7
-	subsd  xmm4, xmm6
-	mulsd  xmm4, xmm0
-	subsd  xmm4, xmm3
-	mulsd  xmm4, xmm0
-
-	movapd xmm0, [esp + nb410_dx]
-	movapd xmm1, [esp + nb410_dy]
-	movapd xmm2, [esp + nb410_dz]
-
-	movd eax, mm0	
-
-	mov    edi, [ebp + nb410_faction]
-	mulsd  xmm0, xmm4
-	mulsd  xmm1, xmm4
-	mulsd  xmm2, xmm4
-	;# xmm0-xmm2 contains tx-tz (partial force) 
-	;# now update f_i 
-	movapd xmm3, [esp + nb410_fix]
-	movapd xmm4, [esp + nb410_fiy]
-	movapd xmm5, [esp + nb410_fiz]
-	addsd  xmm3, xmm0
-	addsd  xmm4, xmm1
-	addsd  xmm5, xmm2
-	movlpd [esp + nb410_fix], xmm3
-	movlpd [esp + nb410_fiy], xmm4
-	movlpd [esp + nb410_fiz], xmm5
-	;# the fj's - start by accumulating forces from memory 
-	movlpd xmm3, [edi + eax*8]
-	movlpd xmm4, [edi + eax*8 + 8]
-	movlpd xmm5, [edi + eax*8 + 16]
-	subsd xmm3, xmm0
-	subsd xmm4, xmm1
-	subsd xmm5, xmm2
-	movlpd [edi + eax*8], xmm3
-	movlpd [edi + eax*8 + 8], xmm4
-	movlpd [edi + eax*8 + 16], xmm5
-		
-.nb410_updateouterdata:
-	mov   ecx, [esp + nb410_ii3]
-	mov   edi, [ebp + nb410_faction]
-	mov   esi, [ebp + nb410_fshift]
-	mov   edx, [esp + nb410_is3]
-
-	;# accumulate i forces in xmm0, xmm1, xmm2 
-	movapd xmm0, [esp + nb410_fix]
-	movapd xmm1, [esp + nb410_fiy]
-	movapd xmm2, [esp + nb410_fiz]
-
-	movhlps xmm3, xmm0
-	movhlps xmm4, xmm1
-	movhlps xmm5, xmm2
-	addsd  xmm0, xmm3
-	addsd  xmm1, xmm4
-	addsd  xmm2, xmm5 ;# sum is in low xmm0-xmm2 
-
-	;# increment i force 
-	movsd  xmm3, [edi + ecx*8]
-	movsd  xmm4, [edi + ecx*8 + 8]
-	movsd  xmm5, [edi + ecx*8 + 16]
-	addsd  xmm3, xmm0
-	addsd  xmm4, xmm1
-	addsd  xmm5, xmm2
-	movsd  [edi + ecx*8],     xmm3
-	movsd  [edi + ecx*8 + 8], xmm4
-	movsd  [edi + ecx*8 + 16], xmm5
-
-	;# increment fshift force  
-	movsd  xmm3, [esi + edx*8]
-	movsd  xmm4, [esi + edx*8 + 8]
-	movsd  xmm5, [esi + edx*8 + 16]
-	addsd  xmm3, xmm0
-	addsd  xmm4, xmm1
-	addsd  xmm5, xmm2
-	movsd  [esi + edx*8],     xmm3
-	movsd  [esi + edx*8 + 8], xmm4
-	movsd  [esi + edx*8 + 16], xmm5
-
-	;# get n from stack
-	mov esi, [esp + nb410_n]
-        ;# get group index for i particle 
-        mov   edx, [ebp + nb410_gid]      	;# base of gid[]
-        mov   edx, [edx + esi*4]		;# ggid=gid[n]
-
-	;# accumulate total potential energy and update it 
-	movapd xmm7, [esp + nb410_vctot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addsd  xmm7, xmm6	;# low xmm7 has the sum now 
-
-	;# add earlier value from mem 
-	mov   eax, [ebp + nb410_Vc]
-	addsd xmm7, [eax + edx*8] 
-	;# move back to mem 
-	movsd [eax + edx*8], xmm7 
-	
-	;# accumulate total lj energy and update it 
-	movapd xmm7, [esp + nb410_Vvdwtot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addsd  xmm7, xmm6	;# low xmm7 has the sum now 
-	
-	;# add earlier value from mem 
-	mov   eax, [ebp + nb410_Vvdw]
-	addsd xmm7, [eax + edx*8] 
-	;# move back to mem 
-	movsd [eax + edx*8], xmm7 
-	
-	;# accumulate dVda and update it 
-	movapd xmm7, [esp + nb410_dvdasum]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addsd  xmm7, xmm6	;# low xmm7 has the sum now 
-	
-	mov edx, [esp + nb410_ii]
-	mov eax, [ebp + nb410_dvda]
-	addsd xmm7, [eax + edx*8]
-	movsd [eax + edx*8], xmm7
-	
-        ;# finish if last 
-        mov ecx, [esp + nb410_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb410_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [esp + nb410_n], esi
-        jmp .nb410_outer
-.nb410_outerend:
-        ;# check if more outer neighborlists remain
-        mov   ecx, [esp + nb410_nri]
-	;# esi already loaded with n above
-        sub   ecx, esi
-        jz .nb410_end
-        ;# non-zero, do one more workunit
-        jmp   .nb410_threadloop
-.nb410_end:
-	emms
-
-	mov eax, [esp + nb410_nouter]
-	mov ebx, [esp + nb410_ninner]
-	mov ecx, [ebp + nb410_outeriter]
-	mov edx, [ebp + nb410_inneriter]
-	mov [ecx], eax
-	mov [edx], ebx
-
-	mov eax, [esp + nb410_salign]
-	add esp, eax
-	add esp, 496
-	pop edi
-	pop esi
-    	pop edx
-    	pop ecx
-    	pop ebx
-    	pop eax
-	leave
-	ret
-
-
-
-
-
-
-
-.globl nb_kernel410nf_ia32_sse2
-.globl _nb_kernel410nf_ia32_sse2
-nb_kernel410nf_ia32_sse2:	
-_nb_kernel410nf_ia32_sse2:	
-.equiv          nb410nf_p_nri,          8
-.equiv          nb410nf_iinr,           12
-.equiv          nb410nf_jindex,         16
-.equiv          nb410nf_jjnr,           20
-.equiv          nb410nf_shift,          24
-.equiv          nb410nf_shiftvec,       28
-.equiv          nb410nf_fshift,         32
-.equiv          nb410nf_gid,            36
-.equiv          nb410nf_pos,            40
-.equiv          nb410nf_faction,        44
-.equiv          nb410nf_charge,         48
-.equiv          nb410nf_p_facel,        52
-.equiv          nb410nf_argkrf,         56
-.equiv          nb410nf_argcrf,         60
-.equiv          nb410nf_Vc,             64
-.equiv          nb410nf_type,           68
-.equiv          nb410nf_p_ntype,        72
-.equiv          nb410nf_vdwparam,       76
-.equiv          nb410nf_Vvdw,           80
-.equiv          nb410nf_p_tabscale,     84
-.equiv          nb410nf_VFtab,          88
-.equiv          nb410nf_invsqrta,       92
-.equiv          nb410nf_dvda,           96
-.equiv          nb410nf_p_gbtabscale,   100
-.equiv          nb410nf_GBtab,          104
-.equiv          nb410nf_p_nthreads,     108
-.equiv          nb410nf_count,          112
-.equiv          nb410nf_mtx,            116
-.equiv          nb410nf_outeriter,      120
-.equiv          nb410nf_inneriter,      124
-.equiv          nb410nf_work,           128
-	;# stack offsets for local variables  
-	;# bottom of stack is cache-aligned for sse2 use 
-.equiv          nb410nf_ix,             0
-.equiv          nb410nf_iy,             16
-.equiv          nb410nf_iz,             32
-.equiv          nb410nf_iq,             48
-.equiv          nb410nf_two,            64
-.equiv          nb410nf_gbtsc,          80
-.equiv          nb410nf_qq,             96
-.equiv          nb410nf_c6,             112
-.equiv          nb410nf_c12,            128
-.equiv          nb410nf_vctot,          144
-.equiv          nb410nf_Vvdwtot,        160
-.equiv          nb410nf_half,           176
-.equiv          nb410nf_three,          192
-.equiv          nb410nf_r,              208
-.equiv          nb410nf_isai,           224
-.equiv          nb410nf_isaprod,        240
-.equiv          nb410nf_gbscale,        256
-.equiv          nb410nf_ii,             272
-.equiv          nb410nf_is3,            276
-.equiv          nb410nf_ii3,            280
-.equiv          nb410nf_ntia,           284
-.equiv          nb410nf_innerjjnr,      288
-.equiv          nb410nf_innerk,         292
-.equiv          nb410nf_n,              296
-.equiv          nb410nf_nn1,            300
-.equiv          nb410nf_nri,            304
-.equiv          nb410nf_facel,          312   ;# uses 8 bytes
-.equiv          nb410nf_ntype,          320
-.equiv          nb410nf_nouter,         324
-.equiv          nb410nf_ninner,         328
-.equiv          nb410nf_salign,         332
-	push ebp
-	mov ebp,esp	
-    	push eax
-    	push ebx
-    	push ecx
-    	push edx
-	push esi
-	push edi
-	sub esp, 336		;# local stack space 
-	mov  eax, esp
-	and  eax, 0xf
-	sub esp, eax
-	mov [esp + nb410nf_salign], eax
-
-	emms
-
-	;# Move args passed by reference to stack
-	mov ecx, [ebp + nb410nf_p_nri]
-	mov esi, [ebp + nb410nf_p_facel]
-	mov edi, [ebp + nb410nf_p_ntype]
-	mov ecx, [ecx]
-	movsd xmm7, [esi]
-	mov edi, [edi]
-	mov [esp + nb410nf_nri], ecx
-	movsd [esp + nb410nf_facel], xmm7
-	mov [esp + nb410nf_ntype], edi
-
-	;# zero iteration counters
-	mov eax, 0
-	mov [esp + nb410nf_nouter], eax
-	mov [esp + nb410nf_ninner], eax
-
-
-	mov eax, [ebp + nb410nf_p_gbtabscale]
-	movsd xmm5, [eax]
-	shufpd xmm5, xmm5, 0
-	movapd [esp + nb410nf_gbtsc], xmm5
-	;# create constant floating-point factors on stack
-	mov eax, 0x00000000     ;# lower half of double 0.5 IEEE (hex)
-	mov ebx, 0x3fe00000
-	mov [esp + nb410nf_half], eax
-	mov [esp + nb410nf_half+4], ebx
-	movsd xmm1, [esp + nb410nf_half]
-	shufpd xmm1, xmm1, 0    ;# splat to all elements
-	movapd xmm3, xmm1
-	addpd  xmm3, xmm3       ;# 1.0
-	movapd xmm2, xmm3
-	addpd  xmm2, xmm2       ;# 2.0
-	addpd  xmm3, xmm2	;# 3.0
-	movapd [esp + nb410nf_half], xmm1
-	movapd [esp + nb410nf_two], xmm2
-	movapd [esp + nb410nf_three], xmm3
-
-.nb410nf_threadloop:
-        mov   esi, [ebp + nb410nf_count]          ;# pointer to sync counter
-        mov   eax, [esi]
-.nb410nf_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+10
-        lock
-        cmpxchg [esi], ebx                      ;# write nn1 to *counter,
-                                                ;# if it hasnt changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb410nf_spinlock
-
-        ;# if(nn1>nri) nn1=nri
-        mov ecx, [esp + nb410nf_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [esp + nb410nf_n], eax
-        mov [esp + nb410nf_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists
-		mov esi, eax				;# copy n to esi
-        jg  .nb410nf_outerstart
-        jmp .nb410nf_end
-
-.nb410nf_outerstart:
-	;# ebx contains number of outer iterations
-	add ebx, [esp + nb410nf_nouter]
-	mov [esp + nb410nf_nouter], ebx
-
-.nb410nf_outer:
-	mov   eax, [ebp + nb410nf_shift]      ;# eax = pointer into shift[] 
-	mov   ebx, [eax+esi*4]		;# ebx=shift[n] 
-	
-	lea   ebx, [ebx + ebx*2]    ;# ebx=3*is 
-	mov   [esp + nb410nf_is3],ebx    	;# store is3 
-
-	mov   eax, [ebp + nb410nf_shiftvec]   ;# eax = base of shiftvec[] 
-
-	movsd xmm0, [eax + ebx*8]
-	movsd xmm1, [eax + ebx*8 + 8]
-	movsd xmm2, [eax + ebx*8 + 16] 
-
-	mov   ecx, [ebp + nb410nf_iinr]       ;# ecx = pointer into iinr[] 	
-	mov   ebx, [ecx+esi*4]	    ;# ebx =ii 
-	mov   [esp + nb410nf_ii], ebx
-
-	mov   edx, [ebp + nb410nf_charge]
-	movsd xmm3, [edx + ebx*8]	
-	mulsd xmm3, [esp + nb410nf_facel]
-	shufpd xmm3, xmm3, 0
-
-	mov   edx, [ebp + nb410nf_invsqrta]	;# load invsqrta[ii]
-	movsd xmm4, [edx + ebx*8]
-	shufpd xmm4, xmm4, 0
-
-   	mov   edx, [ebp + nb410nf_type] 
-   	mov   edx, [edx + ebx*4]
-   	imul  edx, [esp + nb410nf_ntype]
-   	shl   edx, 1
-    mov   [esp + nb410nf_ntia], edx
-		
-	lea   ebx, [ebx + ebx*2]	;# ebx = 3*ii=ii3 
-	mov   eax, [ebp + nb410nf_pos]    ;# eax = base of pos[]  
-
-	addsd xmm0, [eax + ebx*8]
-	addsd xmm1, [eax + ebx*8 + 8]
-	addsd xmm2, [eax + ebx*8 + 16]
-
-	movapd [esp + nb410nf_iq], xmm3
-	movapd [esp + nb410nf_isai], xmm4
-
-	shufpd xmm0, xmm0, 0
-	shufpd xmm1, xmm1, 0
-	shufpd xmm2, xmm2, 0
-
-	movapd [esp + nb410nf_ix], xmm0
-	movapd [esp + nb410nf_iy], xmm1
-	movapd [esp + nb410nf_iz], xmm2
-
-	mov   [esp + nb410nf_ii3], ebx
-	
-	;# clear vctot and Vvdwtot
-	xorpd xmm4, xmm4
-	movapd [esp + nb410nf_vctot], xmm4
-	movapd [esp + nb410nf_Vvdwtot], xmm4
-	
-	mov   eax, [ebp + nb410nf_jindex]
-	mov   ecx, [eax + esi*4]	     ;# jindex[n] 
-	mov   edx, [eax + esi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   esi, [ebp + nb410nf_pos]
-	mov   edi, [ebp + nb410nf_faction]	
-	mov   eax, [ebp + nb410nf_jjnr]
-	shl   ecx, 2
-	add   eax, ecx
-	mov   [esp + nb410nf_innerjjnr], eax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  2
-	add   ecx, [esp + nb410nf_ninner]
-	mov   [esp + nb410nf_ninner], ecx
-	add   edx, 0
-	mov   [esp + nb410nf_innerk], edx    ;# number of innerloop atoms 
-	jge   .nb410nf_unroll_loop
-	jmp   .nb410nf_checksingle
-.nb410nf_unroll_loop:	
-	;# twice unrolled innerloop here 
-	mov   edx, [esp + nb410nf_innerjjnr]     ;# pointer to jjnr[k] 
-	mov   eax, [edx]	
-	mov   ebx, [edx + 4]              
-	add dword ptr [esp + nb410nf_innerjjnr],  8 ;# advance pointer (unrolled 2) 
-
-	;# load isaj
-	mov esi, [ebp + nb410nf_invsqrta]
-	movlpd xmm2, [esi + eax*8]
-	movhpd xmm2, [esi + ebx*8]
-	mulpd  xmm2, [esp + nb410nf_isai]
-	movapd [esp + nb410nf_isaprod], xmm2	
-	movapd xmm1, xmm2
-	mulpd xmm1, [esp + nb410nf_gbtsc]
-	movapd [esp + nb410nf_gbscale], xmm1
-	
-	mov esi, [ebp + nb410nf_charge]    ;# base of charge[] 
-	movlpd xmm3, [esi + eax*8]
-	movhpd xmm3, [esi + ebx*8]
-
-	mulpd xmm2, [esp + nb410nf_iq]
-	mulpd  xmm3, xmm2
-	movapd [esp + nb410nf_qq], xmm3	
-	
-	movd  mm0, eax		;# use mmx registers as temp storage 
-	movd  mm1, ebx
-	
-	mov esi, [ebp + nb410nf_type]
-	mov eax, [esi + eax*4]
-	mov ebx, [esi + ebx*4]
-	mov esi, [ebp + nb410nf_vdwparam]
-	shl eax, 1
-	shl ebx, 1
-	mov edi, [esp + nb410nf_ntia]
-	add eax, edi
-	add ebx, edi
-
-	movlpd xmm6, [esi + eax*8]	;# c6a
-	movlpd xmm7, [esi + ebx*8]	;# c6b
-	movhpd xmm6, [esi + eax*8 + 8]	;# c6a c12a 
-	movhpd xmm7, [esi + ebx*8 + 8]	;# c6b c12b 
-
-	movapd xmm4, xmm6
-	unpcklpd xmm4, xmm7
-	unpckhpd xmm6, xmm7
-	
-	movd  eax, mm0
-	movd  ebx, mm1
-	movapd [esp + nb410nf_c6], xmm4
-	movapd [esp + nb410nf_c12], xmm6
-	
-	mov esi, [ebp + nb410nf_pos]       ;# base of pos[] 
-
-	movd  mm2, eax
-	movd  mm3, ebx
-	lea   eax, [eax + eax*2]     ;# replace jnr with j3 
-	lea   ebx, [ebx + ebx*2]	
-
-	;# move two coordinates to xmm0-xmm2 	
-	movlpd xmm0, [esi + eax*8]
-	movlpd xmm1, [esi + eax*8 + 8]
-	movlpd xmm2, [esi + eax*8 + 16]
-	movhpd xmm0, [esi + ebx*8]
-	movhpd xmm1, [esi + ebx*8 + 8]
-	movhpd xmm2, [esi + ebx*8 + 16]		
-	
-	;# move ix-iz to xmm4-xmm6 
-	movapd xmm4, [esp + nb410nf_ix]
-	movapd xmm5, [esp + nb410nf_iy]
-	movapd xmm6, [esp + nb410nf_iz]
-
-	;# calc dr 
-	subpd xmm4, xmm0
-	subpd xmm5, xmm1
-	subpd xmm6, xmm2
-
-	;# square dr 
-	mulpd xmm4,xmm4
-	mulpd xmm5,xmm5
-	mulpd xmm6,xmm6
-	addpd xmm4, xmm5
-	addpd xmm4, xmm6
-	;# rsq in xmm4 
-
-	cvtpd2ps xmm5, xmm4	
-	rsqrtps xmm5, xmm5
-	cvtps2pd xmm2, xmm5	;# lu in low xmm2 
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulpd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [esp + nb410nf_three]
-	mulpd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb410nf_half]
-	subpd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulpd xmm1, xmm5	
-	mulpd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulpd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [esp + nb410nf_three]
-	mulpd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb410nf_half]
-	subpd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulpd xmm2, xmm5	
-	mulpd xmm0, xmm2	;# xmm0=rinv 
-	
-	mulpd xmm4, xmm0	;# xmm4=r 
-	movapd [esp + nb410nf_r], xmm4
-	mulpd xmm4, [esp + nb410nf_gbscale]
-
-	cvttpd2pi mm6, xmm4	;# mm6 = lu idx 
-	cvtpi2pd xmm5, mm6
-	subpd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	movapd xmm2, xmm1	
-	mulpd  xmm2, xmm2	;# xmm2=eps2 
-	
-	pslld mm6, 2		;# idx *= 4 
-	
-	movd mm0, eax	
-	movd mm1, ebx
-
-	mov  esi, [ebp + nb410nf_GBtab]
-	movd eax, mm6
-	psrlq mm6, 32
-	movd ebx, mm6		;# indices in eax/ebx 
-
-	movapd xmm4, [esi + eax*8]	;# Y1 F1 	
-	movapd xmm3, [esi + ebx*8]	;# Y2 F2 
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 Y2 
-	unpckhpd xmm5, xmm3	;# F1 F2 
-
-	movapd xmm6, [esi + eax*8 + 16]	;# G1 H1 	
-	movapd xmm3, [esi + ebx*8 + 16]	;# G2 H2 
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 G2 
-	unpckhpd xmm7, xmm3	;# H1 H2 
-	;# coulomb table ready, in xmm4-xmm7  		
-	mulpd  xmm6, xmm1	;# xmm6=Geps 
-	mulpd  xmm7, xmm2	;# xmm7=Heps2 
-	addpd  xmm5, xmm6
-	addpd  xmm5, xmm7	;# xmm5=Fp 	
-	movapd xmm3, [esp + nb410nf_qq]
-	mulpd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addpd  xmm5, xmm4 ;# xmm5=VV 
-	mulpd  xmm5, xmm3 ;# vcoul=qq*VV  
-
-	addpd  xmm5, [esp + nb410nf_vctot]
-	movapd [esp + nb410nf_vctot], xmm5 
-
-	;# L-J 
-	movapd xmm4, xmm0
-	mulpd  xmm4, xmm0	;# xmm4=rinvsq 
-
-	movapd xmm6, xmm4
-	mulpd  xmm6, xmm4
-	
-	mulpd  xmm6, xmm4	;# xmm6=rinvsix 
-	movapd xmm4, xmm6
-	mulpd  xmm4, xmm4	;# xmm4=rinvtwelve 
-	mulpd  xmm6, [esp + nb410nf_c6]
-	mulpd  xmm4, [esp + nb410nf_c12]
-	movapd xmm7, [esp + nb410nf_Vvdwtot]
-	addpd  xmm7, xmm4
-	subpd  xmm7, xmm6
-	movapd [esp + nb410nf_Vvdwtot], xmm7
-
-	;# should we do one more iteration? 
-	sub dword ptr [esp + nb410nf_innerk],  2
-	jl    .nb410nf_checksingle
-	jmp   .nb410nf_unroll_loop
-.nb410nf_checksingle:
-	mov   edx, [esp + nb410nf_innerk]
-	and   edx, 1
-	jnz    .nb410nf_dosingle
-	jmp    .nb410nf_updateouterdata
-.nb410nf_dosingle:
-	mov esi, [ebp + nb410nf_charge]
-	mov edx, [ebp + nb410nf_invsqrta]
-	mov edi, [ebp + nb410nf_pos]
-	mov   ecx, [esp + nb410nf_innerjjnr]
-	mov   eax, [ecx]
-	
-	xorpd  xmm6, xmm6
-	movapd xmm7, xmm6
-	movsd  xmm7, [edx + eax*8]
-	movlpd xmm6, [esi + eax*8]	;# xmm6(0) has the charge
-	mulsd  xmm7, [esp + nb410nf_isai]
-	movapd [esp + nb410nf_isaprod], xmm7
-	movapd xmm1, xmm7
-	mulpd xmm1, [esp + nb410nf_gbtsc]
-	movapd [esp + nb410nf_gbscale], xmm1
-	
-	mulsd  xmm7, [esp + nb410nf_iq]
-	mulsd  xmm6, xmm7
-	movapd [esp + nb410nf_qq], xmm6
-		
-	movd  mm0, eax		;# use mmx registers as temp storage 
-	mov esi, [ebp + nb410nf_type]
-	mov eax, [esi + eax*4]
-	mov esi, [ebp + nb410nf_vdwparam]
-	shl eax, 1
-	mov edi, [esp + nb410nf_ntia]
-	add eax, edi
-
-	movlpd xmm6, [esi + eax*8]	;# c6a
-	movhpd xmm6, [esi + eax*8 + 8]	;# c6a c12a 
-
-	xorpd xmm7, xmm7
-	movapd xmm4, xmm6
-	unpcklpd xmm4, xmm7
-	unpckhpd xmm6, xmm7
-	
-	movd  eax, mm0
-	movapd [esp + nb410nf_c6], xmm4
-	movapd [esp + nb410nf_c12], xmm6
-	
-	mov esi, [ebp + nb410nf_pos]       ;# base of pos[]
-	
-	movd  mm2, eax
-	lea   eax, [eax + eax*2]     ;# replace jnr with j3 
-
-	;# move coordinates to xmm0-xmm2 	
-	movlpd xmm0, [esi + eax*8]
-	movlpd xmm1, [esi + eax*8 + 8]
-	movlpd xmm2, [esi + eax*8 + 16]
-	
-	;# move ix-iz to xmm4-xmm6 
-	movapd xmm4, [esp + nb410nf_ix]
-	movapd xmm5, [esp + nb410nf_iy]
-	movapd xmm6, [esp + nb410nf_iz]
-
-	;# calc dr 
-	subsd xmm4, xmm0
-	subsd xmm5, xmm1
-	subsd xmm6, xmm2
-
-	;# square it 
-	mulsd xmm4,xmm4
-	mulsd xmm5,xmm5
-	mulsd xmm6,xmm6
-	addsd xmm4, xmm5
-	addsd xmm4, xmm6
-	;# rsq in xmm4 
-
-	cvtsd2ss xmm5, xmm4	
-	rsqrtss xmm5, xmm5
-	cvtss2sd xmm2, xmm5	;# lu in low xmm2 
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulsd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [esp + nb410nf_three]
-	mulsd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb410nf_half]
-	subsd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulsd xmm1, xmm5	
-	mulsd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulsd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [esp + nb410nf_three]
-	mulsd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb410nf_half]
-	subsd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulsd xmm2, xmm5	
-	mulsd xmm0, xmm2	;# xmm0=rinv 
-	
-	mulsd xmm4, xmm0	;# xmm4=r 
-	movapd [esp + nb410nf_r], xmm4
-	mulsd xmm4, [esp + nb410nf_gbscale]
-
-	movd mm0, eax	
-	cvttsd2si eax, xmm4	;# mm6 = lu idx 
-	cvtsi2sd xmm5, eax
-	subsd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	movapd xmm2, xmm1	
-	mulsd  xmm2, xmm2	;# xmm2=eps2 
-	
-	shl eax, 2		;# idx *= 4 
-	
-	mov  esi, [ebp + nb410nf_GBtab]
-
-	movapd xmm4, [esi + eax*8]	;# Y1 F1 	
-	xorpd xmm3, xmm3
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 
-	unpckhpd xmm5, xmm3	;# F1 
-
-	movapd xmm6, [esi + eax*8 + 16]	;# G1 H1 	
-	xorpd xmm3, xmm3
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 
-	unpckhpd xmm7, xmm3	;# H1 
-	;# coulomb table ready, in xmm4-xmm7  		
-	mulsd  xmm6, xmm1	;# xmm6=Geps 
-	mulsd  xmm7, xmm2	;# xmm7=Heps2 
-	addsd  xmm5, xmm6
-	addsd  xmm5, xmm7	;# xmm5=Fp 	
-	movapd xmm3, [esp + nb410nf_qq]
-	mulsd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addsd  xmm5, xmm4 ;# xmm5=VV 
-	mulsd  xmm5, xmm3 ;# vcoul=qq*VV  
-
-	addsd  xmm5, [esp + nb410nf_vctot]
-	movsd [esp + nb410nf_vctot], xmm5 
-
-	;# L-J 
-	movapd xmm4, xmm0
-	mulsd  xmm4, xmm0	;# xmm4=rinvsq 
-
-
-	movapd xmm6, xmm4
-	mulsd  xmm6, xmm4
-
-	mulsd  xmm6, xmm4	;# xmm6=rinvsix 
-	movapd xmm4, xmm6
-	mulsd  xmm4, xmm4	;# xmm4=rinvtwelve 
-	mulsd  xmm6, [esp + nb410nf_c6]
-	mulsd  xmm4, [esp + nb410nf_c12]
-	movapd xmm7, [esp + nb410nf_Vvdwtot]
-	addsd  xmm7, xmm4
-	subsd  xmm7, xmm6
-	movlpd [esp + nb410nf_Vvdwtot], xmm7
-
-.nb410nf_updateouterdata:
-	mov   ecx, [esp + nb410nf_ii3]
-	mov   edx, [esp + nb410nf_is3]
-
-	;# get n from stack
-	mov esi, [esp + nb410nf_n]
-        ;# get group index for i particle 
-        mov   edx, [ebp + nb410nf_gid]      	;# base of gid[]
-        mov   edx, [edx + esi*4]		;# ggid=gid[n]
-
-	;# accumulate total potential energy and update it 
-	movapd xmm7, [esp + nb410nf_vctot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addsd  xmm7, xmm6	;# low xmm7 has the sum now 
-
-	;# add earlier value from mem 
-	mov   eax, [ebp + nb410nf_Vc]
-	addsd xmm7, [eax + edx*8] 
-	;# move back to mem 
-	movsd [eax + edx*8], xmm7 
-	
-	;# accumulate total lj energy and update it 
-	movapd xmm7, [esp + nb410nf_Vvdwtot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addsd  xmm7, xmm6	;# low xmm7 has the sum now 
-	
-	;# add earlier value from mem 
-	mov   eax, [ebp + nb410nf_Vvdw]
-	addsd xmm7, [eax + edx*8] 
-	;# move back to mem 
-	movsd [eax + edx*8], xmm7 
-	
-        ;# finish if last 
-        mov ecx, [esp + nb410nf_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb410nf_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [esp + nb410nf_n], esi
-        jmp .nb410nf_outer
-.nb410nf_outerend:
-        ;# check if more outer neighborlists remain
-        mov   ecx, [esp + nb410nf_nri]
-	;# esi already loaded with n above
-        sub   ecx, esi
-        jz .nb410nf_end
-        ;# non-zero, do one more workunit
-        jmp   .nb410nf_threadloop
-.nb410nf_end:
-	emms
-
-	mov eax, [esp + nb410nf_nouter]
-	mov ebx, [esp + nb410nf_ninner]
-	mov ecx, [ebp + nb410nf_outeriter]
-	mov edx, [ebp + nb410nf_inneriter]
-	mov [ecx], eax
-	mov [edx], ebx
-
-	mov eax, [esp + nb410nf_salign]
-	add esp, eax
-	add esp, 336
-	pop edi
-	pop esi
-    pop edx
-    pop ecx
-    pop ebx
-    pop eax
-	leave
-	ret
-
-
diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel410_ia32_sse2.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel410_ia32_sse2.s
deleted file mode 100644
index c8c4a4eea4..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel410_ia32_sse2.s
+++ /dev/null
@@ -1,1503 +0,0 @@
-##
-##
-## Gromacs 4.0                         Copyright (c) 1991-2003 
-## David van der Spoel, Erik Lindahl
-##
-## This program is free software; you can redistribute it and/or
-## modify it under the terms of the GNU General Public License
-## as published by the Free Software Foundation; either version 2
-## of the License, or (at your option) any later version.
-##
-## To help us fund GROMACS development, we humbly ask that you cite
-## the research papers on the package. Check out http://www.gromacs.org
-## 
-## And Hey:
-## Gnomes, ROck Monsters And Chili Sauce
-##
-
-
-
-
-.globl nb_kernel410_ia32_sse2
-.globl _nb_kernel410_ia32_sse2
-nb_kernel410_ia32_sse2: 
-_nb_kernel410_ia32_sse2:        
-.set nb410_p_nri, 8
-.set nb410_iinr, 12
-.set nb410_jindex, 16
-.set nb410_jjnr, 20
-.set nb410_shift, 24
-.set nb410_shiftvec, 28
-.set nb410_fshift, 32
-.set nb410_gid, 36
-.set nb410_pos, 40
-.set nb410_faction, 44
-.set nb410_charge, 48
-.set nb410_p_facel, 52
-.set nb410_argkrf, 56
-.set nb410_argcrf, 60
-.set nb410_Vc, 64
-.set nb410_type, 68
-.set nb410_p_ntype, 72
-.set nb410_vdwparam, 76
-.set nb410_Vvdw, 80
-.set nb410_p_tabscale, 84
-.set nb410_VFtab, 88
-.set nb410_invsqrta, 92
-.set nb410_dvda, 96
-.set nb410_p_gbtabscale, 100
-.set nb410_GBtab, 104
-.set nb410_p_nthreads, 108
-.set nb410_count, 112
-.set nb410_mtx, 116
-.set nb410_outeriter, 120
-.set nb410_inneriter, 124
-.set nb410_work, 128
-        ## stack offsets for local variables  
-        ## bottom of stack is cache-aligned for sse2 use 
-.set nb410_ix, 0
-.set nb410_iy, 16
-.set nb410_iz, 32
-.set nb410_iq, 48
-.set nb410_dx, 64
-.set nb410_dy, 80
-.set nb410_dz, 96
-.set nb410_two, 112
-.set nb410_six, 128
-.set nb410_twelve, 144
-.set nb410_gbtsc, 160
-.set nb410_qq, 176
-.set nb410_c6, 192
-.set nb410_c12, 208
-.set nb410_fscal, 224
-.set nb410_vctot, 240
-.set nb410_Vvdwtot, 256
-.set nb410_fix, 272
-.set nb410_fiy, 288
-.set nb410_fiz, 304
-.set nb410_half, 320
-.set nb410_three, 336
-.set nb410_r, 352
-.set nb410_isai, 368
-.set nb410_isaprod, 384
-.set nb410_dvdasum, 400
-.set nb410_gbscale, 416
-.set nb410_ii, 432
-.set nb410_is3, 436
-.set nb410_ii3, 440
-.set nb410_ntia, 444
-.set nb410_innerjjnr, 448
-.set nb410_innerk, 452
-.set nb410_n, 456
-.set nb410_nn1, 460
-.set nb410_nri, 464
-.set nb410_facel, 472                         ## uses 8 bytes
-.set nb410_ntype, 480
-.set nb410_nouter, 484
-.set nb410_ninner, 488
-.set nb410_salign, 492
-        pushl %ebp
-        movl %esp,%ebp
-        pushl %eax
-        pushl %ebx
-        pushl %ecx
-        pushl %edx
-        pushl %esi
-        pushl %edi
-        subl $496,%esp          ## local stack space 
-        movl %esp,%eax
-        andl $0xf,%eax
-        subl %eax,%esp
-        movl %eax,nb410_salign(%esp)
-
-        emms
-
-        ## Move args passed by reference to stack
-        movl nb410_p_nri(%ebp),%ecx
-        movl nb410_p_facel(%ebp),%esi
-        movl nb410_p_ntype(%ebp),%edi
-        movl (%ecx),%ecx
-        movsd (%esi),%xmm7
-        movl (%edi),%edi
-        movl %ecx,nb410_nri(%esp)
-        movsd %xmm7,nb410_facel(%esp)
-        movl %edi,nb410_ntype(%esp)
-
-        ## zero iteration counters
-        movl $0,%eax
-        movl %eax,nb410_nouter(%esp)
-        movl %eax,nb410_ninner(%esp)
-
-
-        movl nb410_p_gbtabscale(%ebp),%eax
-        movsd (%eax),%xmm5
-        shufpd $0,%xmm5,%xmm5
-        movapd %xmm5,nb410_gbtsc(%esp)
-        ## create constant floating-point factors on stack
-        movl $0x00000000,%eax   ## lower half of double 0.5 IEEE (hex)
-        movl $0x3fe00000,%ebx
-        movl %eax,nb410_half(%esp)
-        movl %ebx,nb410_half+4(%esp)
-        movsd nb410_half(%esp),%xmm1
-        shufpd $0,%xmm1,%xmm1  ## splat to all elements
-        movapd %xmm1,%xmm3
-        addpd  %xmm3,%xmm3      ## 1.0
-        movapd %xmm3,%xmm2
-        addpd  %xmm2,%xmm2      ## 2.0
-        addpd  %xmm2,%xmm3      ## 3.0
-        movapd %xmm3,%xmm4
-        addpd  %xmm4,%xmm4      ## 6.0
-        movapd %xmm4,%xmm5
-        addpd  %xmm5,%xmm5      ## 12.0
-        movapd %xmm1,nb410_half(%esp)
-        movapd %xmm2,nb410_two(%esp)
-        movapd %xmm3,nb410_three(%esp)
-        movapd %xmm4,nb410_six(%esp)
-        movapd %xmm5,nb410_twelve(%esp)
-
-_nb_kernel410_ia32_sse2.nb410_threadloop: 
-        movl  nb410_count(%ebp),%esi            ## pointer to sync counter
-        movl  (%esi),%eax
-_nb_kernel410_ia32_sse2.nb410_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+10
-        lock 
-        cmpxchgl %ebx,(%esi)                    ## write nn1 to *counter,
-                                                ## if it hasnt changed.
-                                                ## or reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel410_ia32_sse2.nb410_spinlock
-
-        ## if(nn1>nri) nn1=nri
-        movl nb410_nri(%esp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx
-        cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb410_n(%esp)
-        movl %ebx,nb410_nn1(%esp)
-        subl %eax,%ebx                          ## calc number of outer lists
-        movl %eax,%esi                          ## copy n to esi
-        jg  _nb_kernel410_ia32_sse2.nb410_outerstart
-        jmp _nb_kernel410_ia32_sse2.nb410_end
-
-_nb_kernel410_ia32_sse2.nb410_outerstart: 
-        ## ebx contains number of outer iterations
-        addl nb410_nouter(%esp),%ebx
-        movl %ebx,nb410_nouter(%esp)
-
-_nb_kernel410_ia32_sse2.nb410_outer: 
-        movl  nb410_shift(%ebp),%eax        ## eax = pointer into shift[] 
-        movl  (%eax,%esi,4),%ebx        ## ebx=shift[n] 
-
-        leal  (%ebx,%ebx,2),%ebx    ## ebx=3*is 
-        movl  %ebx,nb410_is3(%esp)      ## store is3 
-
-        movl  nb410_shiftvec(%ebp),%eax     ## eax = base of shiftvec[] 
-
-        movsd (%eax,%ebx,8),%xmm0
-        movsd 8(%eax,%ebx,8),%xmm1
-        movsd 16(%eax,%ebx,8),%xmm2
-
-        movl  nb410_iinr(%ebp),%ecx         ## ecx = pointer into iinr[]        
-        movl  (%ecx,%esi,4),%ebx    ## ebx =ii 
-        movl  %ebx,nb410_ii(%esp)
-
-        movl  nb410_charge(%ebp),%edx
-        movsd (%edx,%ebx,8),%xmm3
-        mulsd nb410_facel(%esp),%xmm3
-        shufpd $0,%xmm3,%xmm3
-
-        movl  nb410_invsqrta(%ebp),%edx         ## load invsqrta[ii]
-        movsd (%edx,%ebx,8),%xmm4
-        shufpd $0,%xmm4,%xmm4
-
-        movl  nb410_type(%ebp),%edx
-        movl  (%edx,%ebx,4),%edx
-        imull nb410_ntype(%esp),%edx
-        shll  %edx
-        movl  %edx,nb410_ntia(%esp)
-
-        leal  (%ebx,%ebx,2),%ebx        ## ebx = 3*ii=ii3 
-        movl  nb410_pos(%ebp),%eax      ## eax = base of pos[]  
-
-        addsd (%eax,%ebx,8),%xmm0
-        addsd 8(%eax,%ebx,8),%xmm1
-        addsd 16(%eax,%ebx,8),%xmm2
-
-        movapd %xmm3,nb410_iq(%esp)
-        movapd %xmm4,nb410_isai(%esp)
-
-        shufpd $0,%xmm0,%xmm0
-        shufpd $0,%xmm1,%xmm1
-        shufpd $0,%xmm2,%xmm2
-
-        movapd %xmm0,nb410_ix(%esp)
-        movapd %xmm1,nb410_iy(%esp)
-        movapd %xmm2,nb410_iz(%esp)
-
-        movl  %ebx,nb410_ii3(%esp)
-
-        ## clear vctot and i forces 
-        xorpd %xmm4,%xmm4
-        movapd %xmm4,nb410_vctot(%esp)
-        movapd %xmm4,nb410_Vvdwtot(%esp)
-        movapd %xmm4,nb410_dvdasum(%esp)
-        movapd %xmm4,nb410_fix(%esp)
-        movapd %xmm4,nb410_fiy(%esp)
-        movapd %xmm4,nb410_fiz(%esp)
-
-        movl  nb410_jindex(%ebp),%eax
-        movl  (%eax,%esi,4),%ecx             ## jindex[n] 
-        movl  4(%eax,%esi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movl  nb410_pos(%ebp),%esi
-        movl  nb410_faction(%ebp),%edi
-        movl  nb410_jjnr(%ebp),%eax
-        shll  $2,%ecx
-        addl  %ecx,%eax
-        movl  %eax,nb410_innerjjnr(%esp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $2,%edx
-        addl  nb410_ninner(%esp),%ecx
-        movl  %ecx,nb410_ninner(%esp)
-        addl  $0,%edx
-        movl  %edx,nb410_innerk(%esp)      ## number of innerloop atoms 
-        jge   _nb_kernel410_ia32_sse2.nb410_unroll_loop
-        jmp   _nb_kernel410_ia32_sse2.nb410_checksingle
-_nb_kernel410_ia32_sse2.nb410_unroll_loop: 
-        ## twice unrolled innerloop here 
-        movl  nb410_innerjjnr(%esp),%edx       ## pointer to jjnr[k] 
-        movl  (%edx),%eax
-        movl  4(%edx),%ebx
-        addl $8,nb410_innerjjnr(%esp)             ## advance pointer (unrolled 2) 
-
-        ## load isaj
-        movl nb410_invsqrta(%ebp),%esi
-        movlpd (%esi,%eax,8),%xmm2
-        movhpd (%esi,%ebx,8),%xmm2
-        mulpd  nb410_isai(%esp),%xmm2
-        movapd %xmm2,nb410_isaprod(%esp)
-        movapd %xmm2,%xmm1
-        mulpd nb410_gbtsc(%esp),%xmm1
-        movapd %xmm1,nb410_gbscale(%esp)
-
-        movl nb410_charge(%ebp),%esi     ## base of charge[] 
-        movlpd (%esi,%eax,8),%xmm3
-        movhpd (%esi,%ebx,8),%xmm3
-
-        mulpd nb410_iq(%esp),%xmm2
-        mulpd  %xmm2,%xmm3
-        movapd %xmm3,nb410_qq(%esp)
-
-        movd  %eax,%mm0         ## use mmx registers as temp storage 
-        movd  %ebx,%mm1
-
-        movl nb410_type(%ebp),%esi
-        movl (%esi,%eax,4),%eax
-        movl (%esi,%ebx,4),%ebx
-        movl nb410_vdwparam(%ebp),%esi
-        shll %eax
-        shll %ebx
-        movl nb410_ntia(%esp),%edi
-        addl %edi,%eax
-        addl %edi,%ebx
-
-        movlpd (%esi,%eax,8),%xmm6      ## c6a
-        movlpd (%esi,%ebx,8),%xmm7      ## c6b
-        movhpd 8(%esi,%eax,8),%xmm6     ## c6a c12a 
-        movhpd 8(%esi,%ebx,8),%xmm7     ## c6b c12b 
-
-        movapd %xmm6,%xmm4
-        unpcklpd %xmm7,%xmm4
-        unpckhpd %xmm7,%xmm6
-
-        movd  %mm0,%eax
-        movd  %mm1,%ebx
-        movapd %xmm4,nb410_c6(%esp)
-        movapd %xmm6,nb410_c12(%esp)
-
-        movl nb410_pos(%ebp),%esi        ## base of pos[] 
-
-        movd  %eax,%mm2
-        movd  %ebx,%mm3
-        leal  (%eax,%eax,2),%eax     ## replace jnr with j3 
-        leal  (%ebx,%ebx,2),%ebx
-
-        ## move two coordinates to xmm0-xmm2    
-        movlpd (%esi,%eax,8),%xmm0
-        movlpd 8(%esi,%eax,8),%xmm1
-        movlpd 16(%esi,%eax,8),%xmm2
-        movhpd (%esi,%ebx,8),%xmm0
-        movhpd 8(%esi,%ebx,8),%xmm1
-        movhpd 16(%esi,%ebx,8),%xmm2
-
-        ## move ix-iz to xmm4-xmm6 
-        movapd nb410_ix(%esp),%xmm4
-        movapd nb410_iy(%esp),%xmm5
-        movapd nb410_iz(%esp),%xmm6
-
-        ## calc dr 
-        subpd %xmm0,%xmm4
-        subpd %xmm1,%xmm5
-        subpd %xmm2,%xmm6
-
-        ## store dr 
-        movapd %xmm4,nb410_dx(%esp)
-        movapd %xmm5,nb410_dy(%esp)
-        movapd %xmm6,nb410_dz(%esp)
-        ## square it 
-        mulpd %xmm4,%xmm4
-        mulpd %xmm5,%xmm5
-        mulpd %xmm6,%xmm6
-        addpd %xmm5,%xmm4
-        addpd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        cvtpd2ps %xmm4,%xmm5
-        rsqrtps %xmm5,%xmm5
-        cvtps2pd %xmm5,%xmm2    ## lu in low xmm2 
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulpd %xmm2,%xmm2       ## lu*lu 
-        movapd nb410_three(%esp),%xmm1
-        mulpd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb410_half(%esp),%xmm0
-        subpd %xmm2,%xmm1       ## 30-rsq*lu*lu 
-        mulpd %xmm5,%xmm1
-        mulpd %xmm0,%xmm1       ## xmm0=iter1 of rinv (new lu) 
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulpd %xmm1,%xmm1       ## lu*lu 
-        movapd nb410_three(%esp),%xmm2
-        mulpd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb410_half(%esp),%xmm0
-        subpd %xmm1,%xmm2       ## 30-rsq*lu*lu 
-        mulpd %xmm5,%xmm2
-        mulpd %xmm2,%xmm0       ## xmm0=rinv 
-
-        mulpd %xmm0,%xmm4       ## xmm4=r 
-        movapd %xmm4,nb410_r(%esp)
-        mulpd nb410_gbscale(%esp),%xmm4
-
-        cvttpd2pi %xmm4,%mm6    ## mm6 = lu idx 
-        cvtpi2pd %mm6,%xmm5
-        subpd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-        movapd %xmm1,%xmm2
-        mulpd  %xmm2,%xmm2      ## xmm2=eps2 
-
-        pslld $2,%mm6           ## idx *= 4 
-
-        movd %eax,%mm0
-        movd %ebx,%mm1
-
-        movl nb410_GBtab(%ebp),%esi
-        movd %mm6,%eax
-        psrlq $32,%mm6
-        movd %mm6,%ebx          ## indices in eax/ebx 
-
-        movapd (%esi,%eax,8),%xmm4      ## Y1 F1        
-        movapd (%esi,%ebx,8),%xmm3      ## Y2 F2 
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 Y2 
-        unpckhpd %xmm3,%xmm5    ## F1 F2 
-
-        movapd 16(%esi,%eax,8),%xmm6    ## G1 H1        
-        movapd 16(%esi,%ebx,8),%xmm3    ## G2 H2 
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 G2 
-        unpckhpd %xmm3,%xmm7    ## H1 H2 
-        ## coulomb table ready, in xmm4-xmm7            
-        mulpd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulpd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addpd  %xmm6,%xmm5
-        addpd  %xmm7,%xmm5      ## xmm5=Fp      
-        mulpd  nb410_two(%esp),%xmm7    ## two*Heps2 
-        movapd nb410_qq(%esp),%xmm3
-        addpd  %xmm6,%xmm7
-        addpd  %xmm5,%xmm7 ## xmm7=FF 
-        mulpd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addpd  %xmm4,%xmm5 ## xmm5=VV 
-        mulpd  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulpd  %xmm7,%xmm3 ## fijC=FF*qq 
-        ## get jnr from regs
-        movd %mm2,%ecx
-        movd %mm3,%edx
-        movl nb410_dvda(%ebp),%esi
-
-        ## Calculate dVda
-        xorpd %xmm7,%xmm7
-        mulpd nb410_gbscale(%esp),%xmm3
-        movapd %xmm3,%xmm6
-        mulpd  nb410_r(%esp),%xmm6
-        addpd  %xmm5,%xmm6
-        addpd  nb410_vctot(%esp),%xmm5
-        movapd %xmm5,nb410_vctot(%esp)
-
-        ## xmm6=(vcoul+fijC*r)
-        subpd  %xmm6,%xmm7
-        movapd %xmm7,%xmm6
-
-        ## update dvdasum
-        addpd  nb410_dvdasum(%esp),%xmm7
-        movapd %xmm7,nb410_dvdasum(%esp)
-
-        ## update j atoms dvdaj
-        movhlps %xmm6,%xmm7
-        addsd  (%esi,%ecx,8),%xmm6
-        addsd  (%esi,%edx,8),%xmm7
-        movsd  %xmm6,(%esi,%ecx,8)
-        movsd  %xmm7,(%esi,%edx,8)
-
-        ## L-J 
-        movapd %xmm0,%xmm4
-        mulpd  %xmm0,%xmm4      ## xmm4=rinvsq 
-
-        movapd %xmm4,%xmm6
-        mulpd  %xmm4,%xmm6
-
-        mulpd  %xmm4,%xmm6      ## xmm6=rinvsix 
-        movapd %xmm6,%xmm4
-        mulpd  %xmm4,%xmm4      ## xmm4=rinvtwelve 
-        mulpd  nb410_c6(%esp),%xmm6
-        mulpd  nb410_c12(%esp),%xmm4
-        movapd nb410_Vvdwtot(%esp),%xmm7
-        addpd  %xmm4,%xmm7
-        mulpd  nb410_twelve(%esp),%xmm4
-        subpd  %xmm6,%xmm7
-        mulpd  nb410_six(%esp),%xmm6
-        movapd %xmm7,nb410_Vvdwtot(%esp)
-        subpd  %xmm6,%xmm4
-        mulpd  %xmm0,%xmm4
-        subpd  %xmm3,%xmm4
-        mulpd  %xmm0,%xmm4
-
-        movapd nb410_dx(%esp),%xmm0
-        movapd nb410_dy(%esp),%xmm1
-        movapd nb410_dz(%esp),%xmm2
-
-        movd %mm0,%eax
-        movd %mm1,%ebx
-
-        movl   nb410_faction(%ebp),%edi
-        mulpd  %xmm4,%xmm0
-        mulpd  %xmm4,%xmm1
-        mulpd  %xmm4,%xmm2
-        ## xmm0-xmm2 contains tx-tz (partial force) 
-        ## now update f_i 
-        movapd nb410_fix(%esp),%xmm3
-        movapd nb410_fiy(%esp),%xmm4
-        movapd nb410_fiz(%esp),%xmm5
-        addpd  %xmm0,%xmm3
-        addpd  %xmm1,%xmm4
-        addpd  %xmm2,%xmm5
-        movapd %xmm3,nb410_fix(%esp)
-        movapd %xmm4,nb410_fiy(%esp)
-        movapd %xmm5,nb410_fiz(%esp)
-        ## the fj's - start by accumulating forces from memory 
-        movlpd (%edi,%eax,8),%xmm3
-        movlpd 8(%edi,%eax,8),%xmm4
-        movlpd 16(%edi,%eax,8),%xmm5
-        movhpd (%edi,%ebx,8),%xmm3
-        movhpd 8(%edi,%ebx,8),%xmm4
-        movhpd 16(%edi,%ebx,8),%xmm5
-        subpd %xmm0,%xmm3
-        subpd %xmm1,%xmm4
-        subpd %xmm2,%xmm5
-        movlpd %xmm3,(%edi,%eax,8)
-        movlpd %xmm4,8(%edi,%eax,8)
-        movlpd %xmm5,16(%edi,%eax,8)
-        movhpd %xmm3,(%edi,%ebx,8)
-        movhpd %xmm4,8(%edi,%ebx,8)
-        movhpd %xmm5,16(%edi,%ebx,8)
-
-        ## should we do one more iteration? 
-        subl $2,nb410_innerk(%esp)
-        jl    _nb_kernel410_ia32_sse2.nb410_checksingle
-        jmp   _nb_kernel410_ia32_sse2.nb410_unroll_loop
-_nb_kernel410_ia32_sse2.nb410_checksingle: 
-        movl  nb410_innerk(%esp),%edx
-        andl  $1,%edx
-        jnz    _nb_kernel410_ia32_sse2.nb410_dosingle
-        jmp    _nb_kernel410_ia32_sse2.nb410_updateouterdata
-_nb_kernel410_ia32_sse2.nb410_dosingle: 
-        movl nb410_charge(%ebp),%esi
-        movl nb410_invsqrta(%ebp),%edx
-        movl nb410_pos(%ebp),%edi
-        movl  nb410_innerjjnr(%esp),%ecx
-        movl  (%ecx),%eax
-
-        xorpd  %xmm6,%xmm6
-        movapd %xmm6,%xmm7
-        movsd  (%edx,%eax,8),%xmm7
-        movlpd (%esi,%eax,8),%xmm6      ## xmm6(0) has the charge
-        mulsd  nb410_isai(%esp),%xmm7
-        movapd %xmm7,nb410_isaprod(%esp)
-        movapd %xmm7,%xmm1
-        mulpd nb410_gbtsc(%esp),%xmm1
-        movapd %xmm1,nb410_gbscale(%esp)
-
-        mulsd  nb410_iq(%esp),%xmm7
-        mulsd  %xmm7,%xmm6
-        movapd %xmm6,nb410_qq(%esp)
-
-        movd  %eax,%mm0         ## use mmx registers as temp storage 
-        movl nb410_type(%ebp),%esi
-        movl (%esi,%eax,4),%eax
-        movl nb410_vdwparam(%ebp),%esi
-        shll %eax
-        movl nb410_ntia(%esp),%edi
-        addl %edi,%eax
-
-        movlpd (%esi,%eax,8),%xmm6      ## c6a
-        movhpd 8(%esi,%eax,8),%xmm6     ## c6a c12a 
-        xorpd %xmm7,%xmm7
-        movapd %xmm6,%xmm4
-        unpcklpd %xmm7,%xmm4
-        unpckhpd %xmm7,%xmm6
-
-        movd  %mm0,%eax
-        movapd %xmm4,nb410_c6(%esp)
-        movapd %xmm6,nb410_c12(%esp)
-
-        movl nb410_pos(%ebp),%esi        ## base of pos[]
-
-        movd  %eax,%mm2
-        leal  (%eax,%eax,2),%eax     ## replace jnr with j3 
-
-        ## move coordinates to xmm0-xmm2        
-        movlpd (%esi,%eax,8),%xmm0
-        movlpd 8(%esi,%eax,8),%xmm1
-        movlpd 16(%esi,%eax,8),%xmm2
-
-        ## move ix-iz to xmm4-xmm6 
-        movapd nb410_ix(%esp),%xmm4
-        movapd nb410_iy(%esp),%xmm5
-        movapd nb410_iz(%esp),%xmm6
-
-        ## calc dr 
-        subsd %xmm0,%xmm4
-        subsd %xmm1,%xmm5
-        subsd %xmm2,%xmm6
-
-        ## store dr 
-        movapd %xmm4,nb410_dx(%esp)
-        movapd %xmm5,nb410_dy(%esp)
-        movapd %xmm6,nb410_dz(%esp)
-        ## square it 
-        mulsd %xmm4,%xmm4
-        mulsd %xmm5,%xmm5
-        mulsd %xmm6,%xmm6
-        addsd %xmm5,%xmm4
-        addsd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        cvtsd2ss %xmm4,%xmm5
-        rsqrtss %xmm5,%xmm5
-        cvtss2sd %xmm5,%xmm2    ## lu in low xmm2 
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulsd %xmm2,%xmm2       ## lu*lu 
-        movapd nb410_three(%esp),%xmm1
-        mulsd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb410_half(%esp),%xmm0
-        subsd %xmm2,%xmm1       ## 30-rsq*lu*lu 
-        mulsd %xmm5,%xmm1
-        mulsd %xmm0,%xmm1       ## xmm0=iter1 of rinv (new lu) 
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulsd %xmm1,%xmm1       ## lu*lu 
-        movapd nb410_three(%esp),%xmm2
-        mulsd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb410_half(%esp),%xmm0
-        subsd %xmm1,%xmm2       ## 30-rsq*lu*lu 
-        mulsd %xmm5,%xmm2
-        mulsd %xmm2,%xmm0       ## xmm0=rinv 
-
-        mulsd %xmm0,%xmm4       ## xmm4=r 
-        movapd %xmm4,nb410_r(%esp)
-        mulsd nb410_gbscale(%esp),%xmm4
-
-        movd %eax,%mm0
-        cvttsd2si %xmm4,%eax    ## mm6 = lu idx 
-        cvtsi2sd %eax,%xmm5
-        subsd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-        movapd %xmm1,%xmm2
-        mulsd  %xmm2,%xmm2      ## xmm2=eps2 
-
-        shll $2,%eax            ## idx *= 4 
-
-        movl nb410_GBtab(%ebp),%esi
-
-        movapd (%esi,%eax,8),%xmm4      ## Y1 F1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 
-        unpckhpd %xmm3,%xmm5    ## F1 
-
-        movapd 16(%esi,%eax,8),%xmm6    ## G1 H1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 
-        unpckhpd %xmm3,%xmm7    ## H1 
-        ## coulomb table ready, in xmm4-xmm7            
-        mulsd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulsd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addsd  %xmm6,%xmm5
-        addsd  %xmm7,%xmm5      ## xmm5=Fp      
-        mulsd  nb410_two(%esp),%xmm7    ## two*Heps2 
-        movapd nb410_qq(%esp),%xmm3
-        addsd  %xmm6,%xmm7
-        addsd  %xmm5,%xmm7 ## xmm7=FF 
-        mulsd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addsd  %xmm4,%xmm5 ## xmm5=VV 
-        mulsd  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulsd  %xmm7,%xmm3 ## fijC=FF*qq 
-        ## get jnr from regs
-        movd %mm2,%ebx
-        movl nb410_dvda(%ebp),%esi
-
-        ## Calculate dVda
-        xorpd %xmm7,%xmm7
-        mulsd nb410_gbscale(%esp),%xmm3
-        movsd %xmm3,%xmm6
-        mulsd  nb410_r(%esp),%xmm6
-        addsd  %xmm5,%xmm6
-        addsd  nb410_vctot(%esp),%xmm5
-        movsd %xmm5,nb410_vctot(%esp)
-
-        ## xmm6=(vcoul+fijC*r)
-        subpd %xmm7,%xmm7
-        movsd %xmm7,%xmm6
-
-        ## update dvdasum
-        addsd  nb410_dvdasum(%esp),%xmm7
-        movsd %xmm7,nb410_dvdasum(%esp)
-
-        ## update j atoms dvdaj
-        addsd  (%esi,%ebx,8),%xmm6
-        movsd  %xmm6,(%esi,%ebx,8)
-
-        ## L-J 
-        movapd %xmm0,%xmm4
-        mulsd  %xmm0,%xmm4      ## xmm4=rinvsq 
-
-
-        movapd %xmm4,%xmm6
-        mulsd  %xmm4,%xmm6
-
-        mulsd  %xmm4,%xmm6      ## xmm6=rinvsix 
-        movapd %xmm6,%xmm4
-        mulsd  %xmm4,%xmm4      ## xmm4=rinvtwelve 
-        mulsd  nb410_c6(%esp),%xmm6
-        mulsd  nb410_c12(%esp),%xmm4
-        movapd nb410_Vvdwtot(%esp),%xmm7
-        addsd  %xmm4,%xmm7
-        mulsd  nb410_twelve(%esp),%xmm4
-        subsd  %xmm6,%xmm7
-        mulsd  nb410_six(%esp),%xmm6
-        movlpd %xmm7,nb410_Vvdwtot(%esp)
-        subsd  %xmm6,%xmm4
-        mulsd  %xmm0,%xmm4
-        subsd  %xmm3,%xmm4
-        mulsd  %xmm0,%xmm4
-
-        movapd nb410_dx(%esp),%xmm0
-        movapd nb410_dy(%esp),%xmm1
-        movapd nb410_dz(%esp),%xmm2
-
-        movd %mm0,%eax
-
-        movl   nb410_faction(%ebp),%edi
-        mulsd  %xmm4,%xmm0
-        mulsd  %xmm4,%xmm1
-        mulsd  %xmm4,%xmm2
-        ## xmm0-xmm2 contains tx-tz (partial force) 
-        ## now update f_i 
-        movapd nb410_fix(%esp),%xmm3
-        movapd nb410_fiy(%esp),%xmm4
-        movapd nb410_fiz(%esp),%xmm5
-        addsd  %xmm0,%xmm3
-        addsd  %xmm1,%xmm4
-        addsd  %xmm2,%xmm5
-        movlpd %xmm3,nb410_fix(%esp)
-        movlpd %xmm4,nb410_fiy(%esp)
-        movlpd %xmm5,nb410_fiz(%esp)
-        ## the fj's - start by accumulating forces from memory 
-        movlpd (%edi,%eax,8),%xmm3
-        movlpd 8(%edi,%eax,8),%xmm4
-        movlpd 16(%edi,%eax,8),%xmm5
-        subsd %xmm0,%xmm3
-        subsd %xmm1,%xmm4
-        subsd %xmm2,%xmm5
-        movlpd %xmm3,(%edi,%eax,8)
-        movlpd %xmm4,8(%edi,%eax,8)
-        movlpd %xmm5,16(%edi,%eax,8)
-
-_nb_kernel410_ia32_sse2.nb410_updateouterdata: 
-        movl  nb410_ii3(%esp),%ecx
-        movl  nb410_faction(%ebp),%edi
-        movl  nb410_fshift(%ebp),%esi
-        movl  nb410_is3(%esp),%edx
-
-        ## accumulate i forces in xmm0, xmm1, xmm2 
-        movapd nb410_fix(%esp),%xmm0
-        movapd nb410_fiy(%esp),%xmm1
-        movapd nb410_fiz(%esp),%xmm2
-
-        movhlps %xmm0,%xmm3
-        movhlps %xmm1,%xmm4
-        movhlps %xmm2,%xmm5
-        addsd  %xmm3,%xmm0
-        addsd  %xmm4,%xmm1
-        addsd  %xmm5,%xmm2 ## sum is in low xmm0-xmm2 
-
-        ## increment i force 
-        movsd  (%edi,%ecx,8),%xmm3
-        movsd  8(%edi,%ecx,8),%xmm4
-        movsd  16(%edi,%ecx,8),%xmm5
-        addsd  %xmm0,%xmm3
-        addsd  %xmm1,%xmm4
-        addsd  %xmm2,%xmm5
-        movsd  %xmm3,(%edi,%ecx,8)
-        movsd  %xmm4,8(%edi,%ecx,8)
-        movsd  %xmm5,16(%edi,%ecx,8)
-
-        ## increment fshift force  
-        movsd  (%esi,%edx,8),%xmm3
-        movsd  8(%esi,%edx,8),%xmm4
-        movsd  16(%esi,%edx,8),%xmm5
-        addsd  %xmm0,%xmm3
-        addsd  %xmm1,%xmm4
-        addsd  %xmm2,%xmm5
-        movsd  %xmm3,(%esi,%edx,8)
-        movsd  %xmm4,8(%esi,%edx,8)
-        movsd  %xmm5,16(%esi,%edx,8)
-
-        ## get n from stack
-        movl nb410_n(%esp),%esi
-        ## get group index for i particle 
-        movl  nb410_gid(%ebp),%edx              ## base of gid[]
-        movl  (%edx,%esi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total potential energy and update it 
-        movapd nb410_vctot(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now 
-
-        ## add earlier value from mem 
-        movl  nb410_Vc(%ebp),%eax
-        addsd (%eax,%edx,8),%xmm7
-        ## move back to mem 
-        movsd %xmm7,(%eax,%edx,8)
-
-        ## accumulate total lj energy and update it 
-        movapd nb410_Vvdwtot(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now 
-
-        ## add earlier value from mem 
-        movl  nb410_Vvdw(%ebp),%eax
-        addsd (%eax,%edx,8),%xmm7
-        ## move back to mem 
-        movsd %xmm7,(%eax,%edx,8)
-
-        ## accumulate dVda and update it 
-        movapd nb410_dvdasum(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now 
-
-        movl nb410_ii(%esp),%edx
-        movl nb410_dvda(%ebp),%eax
-        addsd (%eax,%edx,8),%xmm7
-        movsd %xmm7,(%eax,%edx,8)
-
-        ## finish if last 
-        movl nb410_nn1(%esp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx
-        jz _nb_kernel410_ia32_sse2.nb410_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb410_n(%esp)
-        jmp _nb_kernel410_ia32_sse2.nb410_outer
-_nb_kernel410_ia32_sse2.nb410_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb410_nri(%esp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx
-        jz _nb_kernel410_ia32_sse2.nb410_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel410_ia32_sse2.nb410_threadloop
-_nb_kernel410_ia32_sse2.nb410_end: 
-        emms
-
-        movl nb410_nouter(%esp),%eax
-        movl nb410_ninner(%esp),%ebx
-        movl nb410_outeriter(%ebp),%ecx
-        movl nb410_inneriter(%ebp),%edx
-        movl %eax,(%ecx)
-        movl %ebx,(%edx)
-
-        movl nb410_salign(%esp),%eax
-        addl %eax,%esp
-        addl $496,%esp
-        popl %edi
-        popl %esi
-        popl %edx
-        popl %ecx
-        popl %ebx
-        popl %eax
-        leave
-        ret
-
-
-
-
-
-
-
-.globl nb_kernel410nf_ia32_sse2
-.globl _nb_kernel410nf_ia32_sse2
-nb_kernel410nf_ia32_sse2:       
-_nb_kernel410nf_ia32_sse2:      
-.set nb410nf_p_nri, 8
-.set nb410nf_iinr, 12
-.set nb410nf_jindex, 16
-.set nb410nf_jjnr, 20
-.set nb410nf_shift, 24
-.set nb410nf_shiftvec, 28
-.set nb410nf_fshift, 32
-.set nb410nf_gid, 36
-.set nb410nf_pos, 40
-.set nb410nf_faction, 44
-.set nb410nf_charge, 48
-.set nb410nf_p_facel, 52
-.set nb410nf_argkrf, 56
-.set nb410nf_argcrf, 60
-.set nb410nf_Vc, 64
-.set nb410nf_type, 68
-.set nb410nf_p_ntype, 72
-.set nb410nf_vdwparam, 76
-.set nb410nf_Vvdw, 80
-.set nb410nf_p_tabscale, 84
-.set nb410nf_VFtab, 88
-.set nb410nf_invsqrta, 92
-.set nb410nf_dvda, 96
-.set nb410nf_p_gbtabscale, 100
-.set nb410nf_GBtab, 104
-.set nb410nf_p_nthreads, 108
-.set nb410nf_count, 112
-.set nb410nf_mtx, 116
-.set nb410nf_outeriter, 120
-.set nb410nf_inneriter, 124
-.set nb410nf_work, 128
-        ## stack offsets for local variables  
-        ## bottom of stack is cache-aligned for sse2 use 
-.set nb410nf_ix, 0
-.set nb410nf_iy, 16
-.set nb410nf_iz, 32
-.set nb410nf_iq, 48
-.set nb410nf_two, 64
-.set nb410nf_gbtsc, 80
-.set nb410nf_qq, 96
-.set nb410nf_c6, 112
-.set nb410nf_c12, 128
-.set nb410nf_vctot, 144
-.set nb410nf_Vvdwtot, 160
-.set nb410nf_half, 176
-.set nb410nf_three, 192
-.set nb410nf_r, 208
-.set nb410nf_isai, 224
-.set nb410nf_isaprod, 240
-.set nb410nf_gbscale, 256
-.set nb410nf_ii, 272
-.set nb410nf_is3, 276
-.set nb410nf_ii3, 280
-.set nb410nf_ntia, 284
-.set nb410nf_innerjjnr, 288
-.set nb410nf_innerk, 292
-.set nb410nf_n, 296
-.set nb410nf_nn1, 300
-.set nb410nf_nri, 304
-.set nb410nf_facel, 312                       ## uses 8 bytes
-.set nb410nf_ntype, 320
-.set nb410nf_nouter, 324
-.set nb410nf_ninner, 328
-.set nb410nf_salign, 332
-        pushl %ebp
-        movl %esp,%ebp
-        pushl %eax
-        pushl %ebx
-        pushl %ecx
-        pushl %edx
-        pushl %esi
-        pushl %edi
-        subl $336,%esp          ## local stack space 
-        movl %esp,%eax
-        andl $0xf,%eax
-        subl %eax,%esp
-        movl %eax,nb410nf_salign(%esp)
-
-        emms
-
-        ## Move args passed by reference to stack
-        movl nb410nf_p_nri(%ebp),%ecx
-        movl nb410nf_p_facel(%ebp),%esi
-        movl nb410nf_p_ntype(%ebp),%edi
-        movl (%ecx),%ecx
-        movsd (%esi),%xmm7
-        movl (%edi),%edi
-        movl %ecx,nb410nf_nri(%esp)
-        movsd %xmm7,nb410nf_facel(%esp)
-        movl %edi,nb410nf_ntype(%esp)
-
-        ## zero iteration counters
-        movl $0,%eax
-        movl %eax,nb410nf_nouter(%esp)
-        movl %eax,nb410nf_ninner(%esp)
-
-
-        movl nb410nf_p_gbtabscale(%ebp),%eax
-        movsd (%eax),%xmm5
-        shufpd $0,%xmm5,%xmm5
-        movapd %xmm5,nb410nf_gbtsc(%esp)
-        ## create constant floating-point factors on stack
-        movl $0x00000000,%eax   ## lower half of double 0.5 IEEE (hex)
-        movl $0x3fe00000,%ebx
-        movl %eax,nb410nf_half(%esp)
-        movl %ebx,nb410nf_half+4(%esp)
-        movsd nb410nf_half(%esp),%xmm1
-        shufpd $0,%xmm1,%xmm1  ## splat to all elements
-        movapd %xmm1,%xmm3
-        addpd  %xmm3,%xmm3      ## 1.0
-        movapd %xmm3,%xmm2
-        addpd  %xmm2,%xmm2      ## 2.0
-        addpd  %xmm2,%xmm3      ## 3.0
-        movapd %xmm1,nb410nf_half(%esp)
-        movapd %xmm2,nb410nf_two(%esp)
-        movapd %xmm3,nb410nf_three(%esp)
-
-_nb_kernel410nf_ia32_sse2.nb410nf_threadloop: 
-        movl  nb410nf_count(%ebp),%esi            ## pointer to sync counter
-        movl  (%esi),%eax
-_nb_kernel410nf_ia32_sse2.nb410nf_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+10
-        lock 
-        cmpxchgl %ebx,(%esi)                    ## write nn1 to *counter,
-                                                ## if it hasnt changed.
-                                                ## or reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel410nf_ia32_sse2.nb410nf_spinlock
-
-        ## if(nn1>nri) nn1=nri
-        movl nb410nf_nri(%esp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx
-        cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb410nf_n(%esp)
-        movl %ebx,nb410nf_nn1(%esp)
-        subl %eax,%ebx                          ## calc number of outer lists
-                movl %eax,%esi                          ## copy n to esi
-        jg  _nb_kernel410nf_ia32_sse2.nb410nf_outerstart
-        jmp _nb_kernel410nf_ia32_sse2.nb410nf_end
-
-_nb_kernel410nf_ia32_sse2.nb410nf_outerstart: 
-        ## ebx contains number of outer iterations
-        addl nb410nf_nouter(%esp),%ebx
-        movl %ebx,nb410nf_nouter(%esp)
-
-_nb_kernel410nf_ia32_sse2.nb410nf_outer: 
-        movl  nb410nf_shift(%ebp),%eax        ## eax = pointer into shift[] 
-        movl  (%eax,%esi,4),%ebx        ## ebx=shift[n] 
-
-        leal  (%ebx,%ebx,2),%ebx    ## ebx=3*is 
-        movl  %ebx,nb410nf_is3(%esp)            ## store is3 
-
-        movl  nb410nf_shiftvec(%ebp),%eax     ## eax = base of shiftvec[] 
-
-        movsd (%eax,%ebx,8),%xmm0
-        movsd 8(%eax,%ebx,8),%xmm1
-        movsd 16(%eax,%ebx,8),%xmm2
-
-        movl  nb410nf_iinr(%ebp),%ecx         ## ecx = pointer into iinr[]      
-        movl  (%ecx,%esi,4),%ebx    ## ebx =ii 
-        movl  %ebx,nb410nf_ii(%esp)
-
-        movl  nb410nf_charge(%ebp),%edx
-        movsd (%edx,%ebx,8),%xmm3
-        mulsd nb410nf_facel(%esp),%xmm3
-        shufpd $0,%xmm3,%xmm3
-
-        movl  nb410nf_invsqrta(%ebp),%edx       ## load invsqrta[ii]
-        movsd (%edx,%ebx,8),%xmm4
-        shufpd $0,%xmm4,%xmm4
-
-        movl  nb410nf_type(%ebp),%edx
-        movl  (%edx,%ebx,4),%edx
-        imull nb410nf_ntype(%esp),%edx
-        shll  %edx
-    movl  %edx,nb410nf_ntia(%esp)
-
-        leal  (%ebx,%ebx,2),%ebx        ## ebx = 3*ii=ii3 
-        movl  nb410nf_pos(%ebp),%eax      ## eax = base of pos[]  
-
-        addsd (%eax,%ebx,8),%xmm0
-        addsd 8(%eax,%ebx,8),%xmm1
-        addsd 16(%eax,%ebx,8),%xmm2
-
-        movapd %xmm3,nb410nf_iq(%esp)
-        movapd %xmm4,nb410nf_isai(%esp)
-
-        shufpd $0,%xmm0,%xmm0
-        shufpd $0,%xmm1,%xmm1
-        shufpd $0,%xmm2,%xmm2
-
-        movapd %xmm0,nb410nf_ix(%esp)
-        movapd %xmm1,nb410nf_iy(%esp)
-        movapd %xmm2,nb410nf_iz(%esp)
-
-        movl  %ebx,nb410nf_ii3(%esp)
-
-        ## clear vctot and Vvdwtot
-        xorpd %xmm4,%xmm4
-        movapd %xmm4,nb410nf_vctot(%esp)
-        movapd %xmm4,nb410nf_Vvdwtot(%esp)
-
-        movl  nb410nf_jindex(%ebp),%eax
-        movl  (%eax,%esi,4),%ecx             ## jindex[n] 
-        movl  4(%eax,%esi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movl  nb410nf_pos(%ebp),%esi
-        movl  nb410nf_faction(%ebp),%edi
-        movl  nb410nf_jjnr(%ebp),%eax
-        shll  $2,%ecx
-        addl  %ecx,%eax
-        movl  %eax,nb410nf_innerjjnr(%esp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $2,%edx
-        addl  nb410nf_ninner(%esp),%ecx
-        movl  %ecx,nb410nf_ninner(%esp)
-        addl  $0,%edx
-        movl  %edx,nb410nf_innerk(%esp)      ## number of innerloop atoms 
-        jge   _nb_kernel410nf_ia32_sse2.nb410nf_unroll_loop
-        jmp   _nb_kernel410nf_ia32_sse2.nb410nf_checksingle
-_nb_kernel410nf_ia32_sse2.nb410nf_unroll_loop: 
-        ## twice unrolled innerloop here 
-        movl  nb410nf_innerjjnr(%esp),%edx       ## pointer to jjnr[k] 
-        movl  (%edx),%eax
-        movl  4(%edx),%ebx
-        addl $8,nb410nf_innerjjnr(%esp)             ## advance pointer (unrolled 2) 
-
-        ## load isaj
-        movl nb410nf_invsqrta(%ebp),%esi
-        movlpd (%esi,%eax,8),%xmm2
-        movhpd (%esi,%ebx,8),%xmm2
-        mulpd  nb410nf_isai(%esp),%xmm2
-        movapd %xmm2,nb410nf_isaprod(%esp)
-        movapd %xmm2,%xmm1
-        mulpd nb410nf_gbtsc(%esp),%xmm1
-        movapd %xmm1,nb410nf_gbscale(%esp)
-
-        movl nb410nf_charge(%ebp),%esi     ## base of charge[] 
-        movlpd (%esi,%eax,8),%xmm3
-        movhpd (%esi,%ebx,8),%xmm3
-
-        mulpd nb410nf_iq(%esp),%xmm2
-        mulpd  %xmm2,%xmm3
-        movapd %xmm3,nb410nf_qq(%esp)
-
-        movd  %eax,%mm0         ## use mmx registers as temp storage 
-        movd  %ebx,%mm1
-
-        movl nb410nf_type(%ebp),%esi
-        movl (%esi,%eax,4),%eax
-        movl (%esi,%ebx,4),%ebx
-        movl nb410nf_vdwparam(%ebp),%esi
-        shll %eax
-        shll %ebx
-        movl nb410nf_ntia(%esp),%edi
-        addl %edi,%eax
-        addl %edi,%ebx
-
-        movlpd (%esi,%eax,8),%xmm6      ## c6a
-        movlpd (%esi,%ebx,8),%xmm7      ## c6b
-        movhpd 8(%esi,%eax,8),%xmm6     ## c6a c12a 
-        movhpd 8(%esi,%ebx,8),%xmm7     ## c6b c12b 
-
-        movapd %xmm6,%xmm4
-        unpcklpd %xmm7,%xmm4
-        unpckhpd %xmm7,%xmm6
-
-        movd  %mm0,%eax
-        movd  %mm1,%ebx
-        movapd %xmm4,nb410nf_c6(%esp)
-        movapd %xmm6,nb410nf_c12(%esp)
-
-        movl nb410nf_pos(%ebp),%esi        ## base of pos[] 
-
-        movd  %eax,%mm2
-        movd  %ebx,%mm3
-        leal  (%eax,%eax,2),%eax     ## replace jnr with j3 
-        leal  (%ebx,%ebx,2),%ebx
-
-        ## move two coordinates to xmm0-xmm2    
-        movlpd (%esi,%eax,8),%xmm0
-        movlpd 8(%esi,%eax,8),%xmm1
-        movlpd 16(%esi,%eax,8),%xmm2
-        movhpd (%esi,%ebx,8),%xmm0
-        movhpd 8(%esi,%ebx,8),%xmm1
-        movhpd 16(%esi,%ebx,8),%xmm2
-
-        ## move ix-iz to xmm4-xmm6 
-        movapd nb410nf_ix(%esp),%xmm4
-        movapd nb410nf_iy(%esp),%xmm5
-        movapd nb410nf_iz(%esp),%xmm6
-
-        ## calc dr 
-        subpd %xmm0,%xmm4
-        subpd %xmm1,%xmm5
-        subpd %xmm2,%xmm6
-
-        ## square dr 
-        mulpd %xmm4,%xmm4
-        mulpd %xmm5,%xmm5
-        mulpd %xmm6,%xmm6
-        addpd %xmm5,%xmm4
-        addpd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        cvtpd2ps %xmm4,%xmm5
-        rsqrtps %xmm5,%xmm5
-        cvtps2pd %xmm5,%xmm2    ## lu in low xmm2 
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulpd %xmm2,%xmm2       ## lu*lu 
-        movapd nb410nf_three(%esp),%xmm1
-        mulpd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb410nf_half(%esp),%xmm0
-        subpd %xmm2,%xmm1       ## 30-rsq*lu*lu 
-        mulpd %xmm5,%xmm1
-        mulpd %xmm0,%xmm1       ## xmm0=iter1 of rinv (new lu) 
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulpd %xmm1,%xmm1       ## lu*lu 
-        movapd nb410nf_three(%esp),%xmm2
-        mulpd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb410nf_half(%esp),%xmm0
-        subpd %xmm1,%xmm2       ## 30-rsq*lu*lu 
-        mulpd %xmm5,%xmm2
-        mulpd %xmm2,%xmm0       ## xmm0=rinv 
-
-        mulpd %xmm0,%xmm4       ## xmm4=r 
-        movapd %xmm4,nb410nf_r(%esp)
-        mulpd nb410nf_gbscale(%esp),%xmm4
-
-        cvttpd2pi %xmm4,%mm6    ## mm6 = lu idx 
-        cvtpi2pd %mm6,%xmm5
-        subpd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-        movapd %xmm1,%xmm2
-        mulpd  %xmm2,%xmm2      ## xmm2=eps2 
-
-        pslld $2,%mm6           ## idx *= 4 
-
-        movd %eax,%mm0
-        movd %ebx,%mm1
-
-        movl nb410nf_GBtab(%ebp),%esi
-        movd %mm6,%eax
-        psrlq $32,%mm6
-        movd %mm6,%ebx          ## indices in eax/ebx 
-
-        movapd (%esi,%eax,8),%xmm4      ## Y1 F1        
-        movapd (%esi,%ebx,8),%xmm3      ## Y2 F2 
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 Y2 
-        unpckhpd %xmm3,%xmm5    ## F1 F2 
-
-        movapd 16(%esi,%eax,8),%xmm6    ## G1 H1        
-        movapd 16(%esi,%ebx,8),%xmm3    ## G2 H2 
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 G2 
-        unpckhpd %xmm3,%xmm7    ## H1 H2 
-        ## coulomb table ready, in xmm4-xmm7            
-        mulpd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulpd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addpd  %xmm6,%xmm5
-        addpd  %xmm7,%xmm5      ## xmm5=Fp      
-        movapd nb410nf_qq(%esp),%xmm3
-        mulpd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addpd  %xmm4,%xmm5 ## xmm5=VV 
-        mulpd  %xmm3,%xmm5 ## vcoul=qq*VV  
-
-        addpd  nb410nf_vctot(%esp),%xmm5
-        movapd %xmm5,nb410nf_vctot(%esp)
-
-        ## L-J 
-        movapd %xmm0,%xmm4
-        mulpd  %xmm0,%xmm4      ## xmm4=rinvsq 
-
-        movapd %xmm4,%xmm6
-        mulpd  %xmm4,%xmm6
-
-        mulpd  %xmm4,%xmm6      ## xmm6=rinvsix 
-        movapd %xmm6,%xmm4
-        mulpd  %xmm4,%xmm4      ## xmm4=rinvtwelve 
-        mulpd  nb410nf_c6(%esp),%xmm6
-        mulpd  nb410nf_c12(%esp),%xmm4
-        movapd nb410nf_Vvdwtot(%esp),%xmm7
-        addpd  %xmm4,%xmm7
-        subpd  %xmm6,%xmm7
-        movapd %xmm7,nb410nf_Vvdwtot(%esp)
-
-        ## should we do one more iteration? 
-        subl $2,nb410nf_innerk(%esp)
-        jl    _nb_kernel410nf_ia32_sse2.nb410nf_checksingle
-        jmp   _nb_kernel410nf_ia32_sse2.nb410nf_unroll_loop
-_nb_kernel410nf_ia32_sse2.nb410nf_checksingle: 
-        movl  nb410nf_innerk(%esp),%edx
-        andl  $1,%edx
-        jnz    _nb_kernel410nf_ia32_sse2.nb410nf_dosingle
-        jmp    _nb_kernel410nf_ia32_sse2.nb410nf_updateouterdata
-_nb_kernel410nf_ia32_sse2.nb410nf_dosingle: 
-        movl nb410nf_charge(%ebp),%esi
-        movl nb410nf_invsqrta(%ebp),%edx
-        movl nb410nf_pos(%ebp),%edi
-        movl  nb410nf_innerjjnr(%esp),%ecx
-        movl  (%ecx),%eax
-
-        xorpd  %xmm6,%xmm6
-        movapd %xmm6,%xmm7
-        movsd  (%edx,%eax,8),%xmm7
-        movlpd (%esi,%eax,8),%xmm6      ## xmm6(0) has the charge
-        mulsd  nb410nf_isai(%esp),%xmm7
-        movapd %xmm7,nb410nf_isaprod(%esp)
-        movapd %xmm7,%xmm1
-        mulpd nb410nf_gbtsc(%esp),%xmm1
-        movapd %xmm1,nb410nf_gbscale(%esp)
-
-        mulsd  nb410nf_iq(%esp),%xmm7
-        mulsd  %xmm7,%xmm6
-        movapd %xmm6,nb410nf_qq(%esp)
-
-        movd  %eax,%mm0         ## use mmx registers as temp storage 
-        movl nb410nf_type(%ebp),%esi
-        movl (%esi,%eax,4),%eax
-        movl nb410nf_vdwparam(%ebp),%esi
-        shll %eax
-        movl nb410nf_ntia(%esp),%edi
-        addl %edi,%eax
-
-        movlpd (%esi,%eax,8),%xmm6      ## c6a
-        movhpd 8(%esi,%eax,8),%xmm6     ## c6a c12a 
-
-        xorpd %xmm7,%xmm7
-        movapd %xmm6,%xmm4
-        unpcklpd %xmm7,%xmm4
-        unpckhpd %xmm7,%xmm6
-
-        movd  %mm0,%eax
-        movapd %xmm4,nb410nf_c6(%esp)
-        movapd %xmm6,nb410nf_c12(%esp)
-
-        movl nb410nf_pos(%ebp),%esi        ## base of pos[]
-
-        movd  %eax,%mm2
-        leal  (%eax,%eax,2),%eax     ## replace jnr with j3 
-
-        ## move coordinates to xmm0-xmm2        
-        movlpd (%esi,%eax,8),%xmm0
-        movlpd 8(%esi,%eax,8),%xmm1
-        movlpd 16(%esi,%eax,8),%xmm2
-
-        ## move ix-iz to xmm4-xmm6 
-        movapd nb410nf_ix(%esp),%xmm4
-        movapd nb410nf_iy(%esp),%xmm5
-        movapd nb410nf_iz(%esp),%xmm6
-
-        ## calc dr 
-        subsd %xmm0,%xmm4
-        subsd %xmm1,%xmm5
-        subsd %xmm2,%xmm6
-
-        ## square it 
-        mulsd %xmm4,%xmm4
-        mulsd %xmm5,%xmm5
-        mulsd %xmm6,%xmm6
-        addsd %xmm5,%xmm4
-        addsd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        cvtsd2ss %xmm4,%xmm5
-        rsqrtss %xmm5,%xmm5
-        cvtss2sd %xmm5,%xmm2    ## lu in low xmm2 
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulsd %xmm2,%xmm2       ## lu*lu 
-        movapd nb410nf_three(%esp),%xmm1
-        mulsd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb410nf_half(%esp),%xmm0
-        subsd %xmm2,%xmm1       ## 30-rsq*lu*lu 
-        mulsd %xmm5,%xmm1
-        mulsd %xmm0,%xmm1       ## xmm0=iter1 of rinv (new lu) 
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulsd %xmm1,%xmm1       ## lu*lu 
-        movapd nb410nf_three(%esp),%xmm2
-        mulsd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb410nf_half(%esp),%xmm0
-        subsd %xmm1,%xmm2       ## 30-rsq*lu*lu 
-        mulsd %xmm5,%xmm2
-        mulsd %xmm2,%xmm0       ## xmm0=rinv 
-
-        mulsd %xmm0,%xmm4       ## xmm4=r 
-        movapd %xmm4,nb410nf_r(%esp)
-        mulsd nb410nf_gbscale(%esp),%xmm4
-
-        movd %eax,%mm0
-        cvttsd2si %xmm4,%eax    ## mm6 = lu idx 
-        cvtsi2sd %eax,%xmm5
-        subsd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-        movapd %xmm1,%xmm2
-        mulsd  %xmm2,%xmm2      ## xmm2=eps2 
-
-        shll $2,%eax            ## idx *= 4 
-
-        movl nb410nf_GBtab(%ebp),%esi
-
-        movapd (%esi,%eax,8),%xmm4      ## Y1 F1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 
-        unpckhpd %xmm3,%xmm5    ## F1 
-
-        movapd 16(%esi,%eax,8),%xmm6    ## G1 H1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 
-        unpckhpd %xmm3,%xmm7    ## H1 
-        ## coulomb table ready, in xmm4-xmm7            
-        mulsd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulsd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addsd  %xmm6,%xmm5
-        addsd  %xmm7,%xmm5      ## xmm5=Fp      
-        movapd nb410nf_qq(%esp),%xmm3
-        mulsd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addsd  %xmm4,%xmm5 ## xmm5=VV 
-        mulsd  %xmm3,%xmm5 ## vcoul=qq*VV  
-
-        addsd  nb410nf_vctot(%esp),%xmm5
-        movsd %xmm5,nb410nf_vctot(%esp)
-
-        ## L-J 
-        movapd %xmm0,%xmm4
-        mulsd  %xmm0,%xmm4      ## xmm4=rinvsq 
-
-
-        movapd %xmm4,%xmm6
-        mulsd  %xmm4,%xmm6
-
-        mulsd  %xmm4,%xmm6      ## xmm6=rinvsix 
-        movapd %xmm6,%xmm4
-        mulsd  %xmm4,%xmm4      ## xmm4=rinvtwelve 
-        mulsd  nb410nf_c6(%esp),%xmm6
-        mulsd  nb410nf_c12(%esp),%xmm4
-        movapd nb410nf_Vvdwtot(%esp),%xmm7
-        addsd  %xmm4,%xmm7
-        subsd  %xmm6,%xmm7
-        movlpd %xmm7,nb410nf_Vvdwtot(%esp)
-
-_nb_kernel410nf_ia32_sse2.nb410nf_updateouterdata: 
-        movl  nb410nf_ii3(%esp),%ecx
-        movl  nb410nf_is3(%esp),%edx
-
-        ## get n from stack
-        movl nb410nf_n(%esp),%esi
-        ## get group index for i particle 
-        movl  nb410nf_gid(%ebp),%edx            ## base of gid[]
-        movl  (%edx,%esi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total potential energy and update it 
-        movapd nb410nf_vctot(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now 
-
-        ## add earlier value from mem 
-        movl  nb410nf_Vc(%ebp),%eax
-        addsd (%eax,%edx,8),%xmm7
-        ## move back to mem 
-        movsd %xmm7,(%eax,%edx,8)
-
-        ## accumulate total lj energy and update it 
-        movapd nb410nf_Vvdwtot(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now 
-
-        ## add earlier value from mem 
-        movl  nb410nf_Vvdw(%ebp),%eax
-        addsd (%eax,%edx,8),%xmm7
-        ## move back to mem 
-        movsd %xmm7,(%eax,%edx,8)
-
-        ## finish if last 
-        movl nb410nf_nn1(%esp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx
-        jz _nb_kernel410nf_ia32_sse2.nb410nf_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb410nf_n(%esp)
-        jmp _nb_kernel410nf_ia32_sse2.nb410nf_outer
-_nb_kernel410nf_ia32_sse2.nb410nf_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb410nf_nri(%esp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx
-        jz _nb_kernel410nf_ia32_sse2.nb410nf_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel410nf_ia32_sse2.nb410nf_threadloop
-_nb_kernel410nf_ia32_sse2.nb410nf_end: 
-        emms
-
-        movl nb410nf_nouter(%esp),%eax
-        movl nb410nf_ninner(%esp),%ebx
-        movl nb410nf_outeriter(%ebp),%ecx
-        movl nb410nf_inneriter(%ebp),%edx
-        movl %eax,(%ecx)
-        movl %ebx,(%edx)
-
-        movl nb410nf_salign(%esp),%eax
-        addl %eax,%esp
-        addl $336,%esp
-        popl %edi
-        popl %esi
-    popl %edx
-    popl %ecx
-    popl %ebx
-    popl %eax
-        leave
-        ret
-
-
-
diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel430_ia32_sse2.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel430_ia32_sse2.intel_syntax.s
deleted file mode 100644
index 30eb5c3cbb..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel430_ia32_sse2.intel_syntax.s
+++ /dev/null
@@ -1,1714 +0,0 @@
-;#
-;#
-;# Gromacs 4.0                         Copyright (c) 1991-2003 
-;# David van der Spoel, Erik Lindahl
-;#
-;# This program is free software; you can redistribute it and/or
-;# modify it under the terms of the GNU General Public License
-;# as published by the Free Software Foundation; either version 2
-;# of the License, or (at your option) any later version.
-;#
-;# To help us fund GROMACS development, we humbly ask that you cite
-;# the research papers on the package. Check out http://www.gromacs.org
-;# 
-;# And Hey:
-;# Gnomes, ROck Monsters And Chili Sauce
-;#
-
-;# These files require GNU binutils 2.10 or later, since we
-;# use intel syntax for portability, or a recent version 
-;# of NASM that understands Extended 3DNow and SSE2 instructions.
-;# (NASM is normally only used with MS Visual C++).
-;# Since NASM and gnu as disagree on some definitions and use 
-;# completely different preprocessing options I have to introduce a
-;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
-;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
-;# reason why all comments need both symbols...
-;# The source is written for GNU as, with intel syntax. When you use
-;# NASM we redefine a couple of things. The false if-statement around 
-;# the following code is seen by GNU as, but NASM doesn't see it, so 
-;# the code inside is read by NASM but not gcc.
-
-; .if 0    # block below only read by NASM
-%define .section	section
-%define .long		dd
-%define .align		align
-%define .globl		global
-;# NASM only wants 'dword', not 'dword ptr'.
-%define ptr
-%macro .equiv                  2
-   %1 equ %2
-%endmacro
-; .endif                   # End of NASM-specific block
-; .intel_syntax noprefix   # Line only read by gnu as
-
-
-.globl nb_kernel430_ia32_sse2
-.globl _nb_kernel430_ia32_sse2
-nb_kernel430_ia32_sse2:	
-_nb_kernel430_ia32_sse2:	
-.equiv          nb430_p_nri,            8
-.equiv          nb430_iinr,             12
-.equiv          nb430_jindex,           16
-.equiv          nb430_jjnr,             20
-.equiv          nb430_shift,            24
-.equiv          nb430_shiftvec,         28
-.equiv          nb430_fshift,           32
-.equiv          nb430_gid,              36
-.equiv          nb430_pos,              40
-.equiv          nb430_faction,          44
-.equiv          nb430_charge,           48
-.equiv          nb430_p_facel,          52
-.equiv          nb430_argkrf,           56
-.equiv          nb430_argcrf,           60
-.equiv          nb430_Vc,               64
-.equiv          nb430_type,             68
-.equiv          nb430_p_ntype,          72
-.equiv          nb430_vdwparam,         76
-.equiv          nb430_Vvdw,             80
-.equiv          nb430_p_tabscale,       84
-.equiv          nb430_VFtab,            88
-.equiv          nb430_invsqrta,         92
-.equiv          nb430_dvda,             96
-.equiv          nb430_p_gbtabscale,     100
-.equiv          nb430_GBtab,            104
-.equiv          nb430_p_nthreads,       108
-.equiv          nb430_count,            112
-.equiv          nb430_mtx,              116
-.equiv          nb430_outeriter,        120
-.equiv          nb430_inneriter,        124
-.equiv          nb430_work,             128
-	;# stack offsets for local variables  
-	;# bottom of stack is cache-aligned for sse2 use 
-.equiv          nb430_ix,               0
-.equiv          nb430_iy,               16
-.equiv          nb430_iz,               32
-.equiv          nb430_iq,               48
-.equiv          nb430_dx,               64
-.equiv          nb430_dy,               80
-.equiv          nb430_dz,               96
-.equiv          nb430_two,              112
-.equiv          nb430_gbtsc,            128
-.equiv          nb430_tsc,              144
-.equiv          nb430_qq,               160
-.equiv          nb430_c6,               176
-.equiv          nb430_c12,              192
-.equiv          nb430_fscal,            208
-.equiv          nb430_vctot,            224
-.equiv          nb430_Vvdwtot,          240
-.equiv          nb430_fix,              256
-.equiv          nb430_fiy,              272
-.equiv          nb430_fiz,              288
-.equiv          nb430_half,             304
-.equiv          nb430_three,            320
-.equiv          nb430_r,                336
-.equiv          nb430_isai,             352
-.equiv          nb430_isaprod,          368
-.equiv          nb430_dvdasum,          384
-.equiv          nb430_gbscale,          400
-.equiv          nb430_ii,               416
-.equiv          nb430_is3,              420
-.equiv          nb430_ii3,              424
-.equiv          nb430_ntia,             428
-.equiv          nb430_innerjjnr,        432
-.equiv          nb430_innerk,           436
-.equiv          nb430_n,                440
-.equiv          nb430_nn1,              444
-.equiv          nb430_nri,              448
-.equiv          nb430_facel,            456   ;# uses 8 bytes
-.equiv          nb430_ntype,            464
-.equiv          nb430_nouter,           468
-.equiv          nb430_ninner,           472
-.equiv          nb430_salign,           476
-	push ebp
-	mov ebp,esp	
-    	push eax
-    	push ebx
-    	push ecx
-    	push edx
-	push esi
-	push edi
-	sub esp, 484		;# local stack space 
-	mov  eax, esp
-	and  eax, 0xf
-	sub esp, eax
-	mov [esp + nb430_salign], eax
-
-	emms
-
-	;# Move args passed by reference to stack
-	mov ecx, [ebp + nb430_p_nri]
-	mov esi, [ebp + nb430_p_facel]
-	mov edi, [ebp + nb430_p_ntype]
-	mov ecx, [ecx]
-	movsd xmm7, [esi]
-	mov edi, [edi]
-	mov [esp + nb430_nri], ecx
-	movsd [esp + nb430_facel], xmm7
-	mov [esp + nb430_ntype], edi
-
-	;# zero iteration counters
-	mov eax, 0
-	mov [esp + nb430_nouter], eax
-	mov [esp + nb430_ninner], eax
-
-
-	;# create constant floating-point factors on stack
-	mov eax, 0x00000000     ;# lower half of double 0.5 IEEE (hex)
-	mov ebx, 0x3fe00000
-	mov [esp + nb430_half], eax
-	mov [esp + nb430_half+4], ebx
-	movsd xmm1, [esp + nb430_half]
-	shufpd xmm1, xmm1, 0    ;# splat to all elements
-	movapd xmm3, xmm1
-	addpd  xmm3, xmm3       ;# 1.0
-	movapd xmm2, xmm3
-	addpd  xmm2, xmm2       ;# 2.0
-	addpd  xmm3, xmm2	;# 3.0
-	movapd [esp + nb430_half], xmm1
-	movapd [esp + nb430_two], xmm2
-	movapd [esp + nb430_three], xmm3
-	mov eax, [ebp + nb430_p_tabscale]
-	movsd xmm3, [eax]
-	mov eax, [ebp + nb430_p_gbtabscale]
-	movsd xmm4, [eax]
-	shufpd xmm3, xmm3, 0
-	shufpd xmm4, xmm4, 0
-	movapd [esp + nb430_tsc], xmm3
-	movapd [esp + nb430_gbtsc], xmm4
-
-.nb430_threadloop:
-        mov   esi, [ebp + nb430_count]          ;# pointer to sync counter
-        mov   eax, [esi]
-.nb430_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+10
-        lock
-        cmpxchg [esi], ebx                      ;# write nn1 to *counter,
-                                                ;# if it hasnt changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb430_spinlock
-
-        ;# if(nn1>nri) nn1=nri
-        mov ecx, [esp + nb430_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [esp + nb430_n], eax
-        mov [esp + nb430_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists
-	mov esi, eax				;# copy n to esi
-        jg  .nb430_outerstart
-        jmp .nb430_end
-
-.nb430_outerstart:
-	;# ebx contains number of outer iterations
-	add ebx, [esp + nb430_nouter]
-	mov [esp + nb430_nouter], ebx
-
-.nb430_outer:
-	mov   eax, [ebp + nb430_shift]      ;# eax = pointer into shift[] 
-	mov   ebx, [eax+esi*4]		;# ebx=shift[n] 
-	
-	lea   ebx, [ebx + ebx*2]    ;# ebx=3*is 
-	mov   [esp + nb430_is3],ebx    	;# store is3 
-
-	mov   eax, [ebp + nb430_shiftvec]   ;# eax = base of shiftvec[] 
-
-	movsd xmm0, [eax + ebx*8]
-	movsd xmm1, [eax + ebx*8 + 8]
-	movsd xmm2, [eax + ebx*8 + 16] 
-
-	mov   ecx, [ebp + nb430_iinr]       ;# ecx = pointer into iinr[]
-	mov   ebx, [ecx+esi*4]	    ;# ebx =ii 
-	mov   [esp + nb430_ii], ebx
-
-	mov   edx, [ebp + nb430_charge]
-	movsd xmm3, [edx + ebx*8]	
-	mulsd xmm3, [esp + nb430_facel]
-	shufpd xmm3, xmm3, 0
-
-	mov   edx, [ebp + nb430_invsqrta]	;# load invsqrta[ii]
-	movsd xmm4, [edx + ebx*8]
-	shufpd xmm4, xmm4, 0
-
-    	mov   edx, [ebp + nb430_type] 
-    	mov   edx, [edx + ebx*4]
-    	imul  edx, [esp + nb430_ntype]
-    	shl   edx, 1
-    	mov   [esp + nb430_ntia], edx
-		
-	lea   ebx, [ebx + ebx*2]	;# ebx = 3*ii=ii3 
-	mov   eax, [ebp + nb430_pos]    ;# eax = base of pos[]  
-
-	addsd xmm0, [eax + ebx*8]
-	addsd xmm1, [eax + ebx*8 + 8]
-	addsd xmm2, [eax + ebx*8 + 16]
-
-	movapd [esp + nb430_iq], xmm3
-	movapd [esp + nb430_isai], xmm4
-	
-	shufpd xmm0, xmm0, 0
-	shufpd xmm1, xmm1, 0
-	shufpd xmm2, xmm2, 0
-
-	movapd [esp + nb430_ix], xmm0
-	movapd [esp + nb430_iy], xmm1
-	movapd [esp + nb430_iz], xmm2
-
-	mov   [esp + nb430_ii3], ebx
-	
-	;# clear vctot and i forces 
-	xorpd xmm4, xmm4
-	movapd [esp + nb430_vctot], xmm4
-	movapd [esp + nb430_Vvdwtot], xmm4
-	movapd [esp + nb430_dvdasum], xmm4
-	movapd [esp + nb430_fix], xmm4
-	movapd [esp + nb430_fiy], xmm4
-	movapd [esp + nb430_fiz], xmm4
-	
-	mov   eax, [ebp + nb430_jindex]
-	mov   ecx, [eax + esi*4]	     ;# jindex[n] 
-	mov   edx, [eax + esi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   esi, [ebp + nb430_pos]
-	mov   edi, [ebp + nb430_faction]	
-	mov   eax, [ebp + nb430_jjnr]
-	shl   ecx, 2
-	add   eax, ecx
-	mov   [esp + nb430_innerjjnr], eax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  2
-	add   ecx, [esp + nb430_ninner]
-	mov   [esp + nb430_ninner], ecx
-	add   edx, 0
-	mov   [esp + nb430_innerk], edx    ;# number of innerloop atoms 
-	jge   .nb430_unroll_loop
-	jmp   .nb430_checksingle
-.nb430_unroll_loop:	
-	;# twice unrolled innerloop here 
-	mov   edx, [esp + nb430_innerjjnr]   ;# pointer to jjnr[k] 
-	mov   eax, [edx]
-	mov   ebx, [edx + 4]
-	add dword ptr [esp + nb430_innerjjnr], 8	;# advance pointer (unrolled 2) 
-
-	;# load isaj
-	mov esi, [ebp + nb430_invsqrta]
-	movlpd xmm2, [esi + eax*8]
-	movhpd xmm2, [esi + ebx*8]
-	mulpd  xmm2, [esp + nb430_isai]
-	movapd [esp + nb430_isaprod], xmm2	
-	movapd xmm1, xmm2
-	mulpd xmm1, [esp + nb430_gbtsc]
-	movapd [esp + nb430_gbscale], xmm1
-	
-	mov esi, [ebp + nb430_charge]    ;# base of charge[] 
-	movlpd xmm3, [esi + eax*8]
-	movhpd xmm3, [esi + ebx*8]
-
-	mulpd xmm2, [esp + nb430_iq]
-	mulpd  xmm3, xmm2
-	movapd [esp + nb430_qq], xmm3	
-	
-	mov esi, [ebp + nb430_type]
-	mov ecx, [esi + eax*4]
-	mov edx, [esi + ebx*4]
-	mov esi, [ebp + nb430_vdwparam]
-	shl ecx, 1
-	shl edx, 1
-	mov edi, [esp + nb430_ntia]
-	add ecx, edi
-	add edx, edi
-
-	movlpd xmm6, [esi + ecx*8]	;# c6a
-	movlpd xmm7, [esi + edx*8]	;# c6b
-	movhpd xmm6, [esi + ecx*8 + 8]	;# c6a c12a 
-	movhpd xmm7, [esi + edx*8 + 8]	;# c6b c12b 
-
-	movapd xmm4, xmm6
-	unpcklpd xmm4, xmm7
-	unpckhpd xmm6, xmm7
-	
-	movapd [esp + nb430_c6], xmm4
-	movapd [esp + nb430_c12], xmm6
-	
-	mov esi, [ebp + nb430_pos]		;# base of pos[] 
-
-	movd  mm2, eax
-	movd  mm3, ebx
-	lea   eax, [eax + eax*2]     ;# replace jnr with j3 
-	lea   ebx, [ebx + ebx*2]	
-
-	;# move two coordinates to xmm0-xmm2 
-	movlpd xmm0, [esi + eax*8]
-	movlpd xmm1, [esi + eax*8 + 8]
-	movlpd xmm2, [esi + eax*8 + 16]
-	movhpd xmm0, [esi + ebx*8]
-	movhpd xmm1, [esi + ebx*8 + 8]
-	movhpd xmm2, [esi + ebx*8 + 16]		
-
-	mov    edi, [ebp + nb430_faction]
-	
-	;# move nb430_ix-iz to xmm4-xmm6 
-	movapd xmm4, [esp + nb430_ix]
-	movapd xmm5, [esp + nb430_iy]
-	movapd xmm6, [esp + nb430_iz]
-
-	;# calc dr 
-	subpd xmm4, xmm0
-	subpd xmm5, xmm1
-	subpd xmm6, xmm2
-
-	;# store dr 
-	movapd [esp + nb430_dx], xmm4
-	movapd [esp + nb430_dy], xmm5
-	movapd [esp + nb430_dz], xmm6
-	;# square it 
-	mulpd xmm4,xmm4
-	mulpd xmm5,xmm5
-	mulpd xmm6,xmm6
-	addpd xmm4, xmm5
-	addpd xmm4, xmm6
-	;# rsq in xmm4 
-
-	cvtpd2ps xmm5, xmm4	
-	rsqrtps xmm5, xmm5
-	cvtps2pd xmm2, xmm5	;# lu in low xmm2 
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulpd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [esp + nb430_three]
-	mulpd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb430_half]
-	subpd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulpd xmm1, xmm5	
-	mulpd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulpd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [esp + nb430_three]
-	mulpd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb430_half]
-	subpd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulpd xmm2, xmm5	
-	mulpd xmm0, xmm2	;# xmm0=iter2 of rinv 
-	mulpd xmm4, xmm0	;# xmm4=r 
-	movapd [esp + nb430_r], xmm4
-	mulpd xmm4, [esp + nb430_gbscale]
-
-	cvttpd2pi mm6, xmm4	;# mm6 = lu idx 
-	cvtpi2pd xmm5, mm6
-	subpd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	movapd xmm2, xmm1	
-	mulpd  xmm2, xmm2	;# xmm2=eps2 
-	
-	pslld mm6, 2		;# idx *= 4 
-
-	mov  esi, [ebp + nb430_GBtab]
-	movd ecx, mm6
-	psrlq mm6, 32
-	movd edx, mm6		;# indices in eax/ebx 
-
-	;# Coulomb 
-	movapd xmm4, [esi + ecx*8]	;# Y1 F1 	
-	movapd xmm3, [esi + edx*8]	;# Y2 F2 
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 Y2 
-	unpckhpd xmm5, xmm3	;# F1 F2 
-
-	movapd xmm6, [esi + ecx*8 + 16]	;# G1 H1 	
-	movapd xmm3, [esi + edx*8 + 16]	;# G2 H2 
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 G2 
-	unpckhpd xmm7, xmm3	;# H1 H2 
-	;# coulomb table ready, in xmm4-xmm7  		
-	mulpd  xmm6, xmm1	;# xmm6=Geps 
-	mulpd  xmm7, xmm2	;# xmm7=Heps2 
-	addpd  xmm5, xmm6
-	addpd  xmm5, xmm7	;# xmm5=Fp 	
-	mulpd  xmm7, [esp + nb430_two]	;# two*Heps2 
-	movapd xmm3, [esp + nb430_qq]
-	addpd  xmm7, xmm6
-	addpd  xmm7, xmm5 ;# xmm7=FF 
-	mulpd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addpd  xmm5, xmm4 ;# xmm5=VV 
-	mulpd  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulpd  xmm3, xmm7 ;# fijC=FF*qq 
-	;# get jnr from regs
-	movd ecx, mm2
-	movd edx, mm3
-	mov esi, [ebp + nb430_dvda]
-	
-	;# Calculate dVda
-	xorpd xmm7, xmm7
-	mulpd xmm3, [esp + nb430_gbscale]
-	movapd xmm6, xmm3
-	mulpd  xmm6, [esp + nb430_r]
-	addpd  xmm6, xmm5
-	addpd  xmm5, [esp + nb430_vctot]
-	movapd [esp + nb430_vctot], xmm5 
-
-	;# xmm6=(vcoul+fijC*r)
-	subpd  xmm7, xmm6
-	movapd xmm6, xmm7
-	
-	;# update dvdasum
-	addpd  xmm7, [esp + nb430_dvdasum]
-	movapd [esp + nb430_dvdasum], xmm7 
-
-	;# update j atoms dvdaj
-	movhlps xmm7, xmm6
-	addsd  xmm6, [esi + ecx*8]
-	addsd  xmm7, [esi + edx*8]
-	movsd  [esi + ecx*8], xmm6
-	movsd  [esi + edx*8], xmm7
-	
-	;# put scalar force on stack temporarily 
-	movapd [esp + nb430_fscal], xmm3
-
-	movapd xmm4, [esp + nb430_r]
-	mulpd  xmm4, [esp + nb430_tsc]
-	cvttpd2pi mm6, xmm4	;# mm6 = lu idx 
-	cvtpi2pd xmm5, mm6
-	subpd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	movapd xmm2, xmm1	
-	mulpd  xmm2, xmm2	;# xmm2=eps2 
-	
-	pslld mm6, 3		;# idx *= 8
-
-	mov  esi, [ebp + nb430_VFtab]
-
-	movd ecx, mm6
-	psrlq mm6, 32
-	movd edx, mm6		;# indices in eax/ebx 
-
-	;# Dispersion 
-	movapd xmm4, [esi + ecx*8]	;# Y1 F1 	
-	movapd xmm3, [esi + edx*8]	;# Y2 F2 
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 Y2 
-	unpckhpd xmm5, xmm3	;# F1 F2 
-
-	movapd xmm6, [esi + ecx*8 + 16]	;# G1 H1 	
-	movapd xmm3, [esi + edx*8 + 16]	;# G2 H2 
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 G2 
-	unpckhpd xmm7, xmm3	;# H1 H2 
-	;# Dispersion table ready, in xmm4-xmm7  		
-	mulpd  xmm6, xmm1	;# xmm6=Geps 
-	mulpd  xmm7, xmm2	;# xmm7=Heps2 
-	addpd  xmm5, xmm6
-	addpd  xmm5, xmm7	;# xmm5=Fp 	
-	mulpd  xmm7, [esp + nb430_two]	;# two*Heps2 
-	addpd  xmm7, xmm6
-	addpd  xmm7, xmm5 ;# xmm7=FF 
-	mulpd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addpd  xmm5, xmm4 ;# xmm5=VV 
-
-	movapd xmm4, [esp + nb430_c6]
-	mulpd  xmm7, xmm4	 ;# fijD 
-	mulpd  xmm5, xmm4	 ;# Vvdw6
-	mulpd  xmm7, [esp + nb430_tsc]
-	addpd  xmm7, [esp + nb430_fscal] ;# add to fscal 
-
-	;# put scalar force back on stack Update Vvdwtot directly 
-	addpd  xmm5, [esp + nb430_Vvdwtot]
-	movapd [esp + nb430_fscal], xmm7
-	movapd [esp + nb430_Vvdwtot], xmm5
-
-	;# Repulsion 
-	movapd xmm4, [esi + ecx*8 + 32]	;# Y1 F1 	
-	movapd xmm3, [esi + edx*8 + 32]	;# Y2 F2 
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 Y2 
-	unpckhpd xmm5, xmm3	;# F1 F2 
-
-	movapd xmm6, [esi + ecx*8 + 48]	;# G1 H1 	
-	movapd xmm3, [esi + edx*8 + 48]	;# G2 H2 
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 G2 
-	unpckhpd xmm7, xmm3	;# H1 H2 
-	;# Dispersion table ready, in xmm4-xmm7  		
-	mulpd  xmm6, xmm1	;# xmm6=Geps 
-	mulpd  xmm7, xmm2	;# xmm7=Heps2 
-	addpd  xmm5, xmm6
-	addpd  xmm5, xmm7	;# xmm5=Fp 	
-	mulpd  xmm7, [esp + nb430_two]	;# two*Heps2 
-	addpd  xmm7, xmm6
-	addpd  xmm7, xmm5 ;# xmm7=FF 
-	mulpd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addpd  xmm5, xmm4 ;# xmm5=VV 
-
-	movapd xmm4, [esp + nb430_c12]
-	mulpd  xmm7, xmm4 ;# fijR 
-	mulpd  xmm5, xmm4 ;# Vvdw12 
-	mulpd  xmm7, [esp + nb430_tsc]
-	addpd  xmm7, [esp + nb430_fscal] 
-	
-	addpd  xmm5, [esp + nb430_Vvdwtot]
-	movapd [esp + nb430_Vvdwtot], xmm5
-	xorpd  xmm4, xmm4
-
-	mulpd xmm7, xmm0
-	subpd xmm4, xmm7
-
-	movapd xmm0, [esp + nb430_dx]
-	movapd xmm1, [esp + nb430_dy]
-	movapd xmm2, [esp + nb430_dz]
-
-	mov    edi, [ebp + nb430_faction]
-	mulpd  xmm0, xmm4
-	mulpd  xmm1, xmm4
-	mulpd  xmm2, xmm4
-	;# xmm0-xmm2 contains tx-tz (partial force) 
-	;# now update f_i 
-	movapd xmm3, [esp + nb430_fix]
-	movapd xmm4, [esp + nb430_fiy]
-	movapd xmm5, [esp + nb430_fiz]
-	addpd  xmm3, xmm0
-	addpd  xmm4, xmm1
-	addpd  xmm5, xmm2
-	movapd [esp + nb430_fix], xmm3
-	movapd [esp + nb430_fiy], xmm4
-	movapd [esp + nb430_fiz], xmm5
-	;# the fj's - start by accumulating forces from memory 
-	movlpd xmm3, [edi + eax*8]
-	movlpd xmm4, [edi + eax*8 + 8]
-	movlpd xmm5, [edi + eax*8 + 16]
-	movhpd xmm3, [edi + ebx*8]
-	movhpd xmm4, [edi + ebx*8 + 8]
-	movhpd xmm5, [edi + ebx*8 + 16]
-	subpd xmm3, xmm0
-	subpd xmm4, xmm1
-	subpd xmm5, xmm2
-	movlpd [edi + eax*8], xmm3
-	movlpd [edi + eax*8 + 8], xmm4
-	movlpd [edi + eax*8 + 16], xmm5
-	movhpd [edi + ebx*8], xmm3
-	movhpd [edi + ebx*8 + 8], xmm4
-	movhpd [edi + ebx*8 + 16], xmm5
-	
-	;# should we do one more iteration? 
-	sub dword ptr [esp + nb430_innerk],  2
-	jl    .nb430_checksingle
-	jmp   .nb430_unroll_loop
-.nb430_checksingle:
-	mov   edx, [esp + nb430_innerk]
-	and   edx, 1
-	jnz    .nb430_dosingle
-	jmp    .nb430_updateouterdata
-.nb430_dosingle:
-	mov esi, [ebp + nb430_charge]
-	mov edx, [ebp + nb430_invsqrta]
-	mov edi, [ebp + nb430_pos]
-	mov   ecx, [esp + nb430_innerjjnr]
-	mov   eax, [ecx]	
-
-	xorpd  xmm6, xmm6
-	movapd xmm7, xmm6
-	movsd  xmm7, [edx + eax*8]
-	movlpd xmm6, [esi + eax*8]	;# xmm6(0) has the charge
-	mulsd  xmm7, [esp + nb430_isai]
-	movapd [esp + nb430_isaprod], xmm7
-	movapd xmm1, xmm7
-	mulpd xmm1, [esp + nb430_gbtsc]
-	movapd [esp + nb430_gbscale], xmm1
-	
-	mulsd  xmm7, [esp + nb430_iq]
-	mulsd  xmm6, xmm7
-	movapd [esp + nb430_qq], xmm6
-	
-	mov esi, [ebp + nb430_type]
-	mov edx, [esi + eax*4]
-	mov esi, [ebp + nb430_vdwparam]
-	shl edx, 1
-	mov edi, [esp + nb430_ntia]
-	add edx, edi
-
-	movlpd xmm6, [esi + edx*8]	;# c6a
-	movhpd xmm6, [esi + edx*8 + 8]	;# c6a c12a 
-
-	xorpd xmm7, xmm7
-	movapd xmm4, xmm6
-	unpcklpd xmm4, xmm7
-	unpckhpd xmm6, xmm7
-	
-	movapd [esp + nb430_c6], xmm4
-	movapd [esp + nb430_c12], xmm6
-	
-	mov esi, [ebp + nb430_pos]		;# base of pos[]
-	
-	movd  mm2, eax
-	lea   eax, [eax + eax*2]     ;# replace jnr with j3 
-
-	;# move two coordinates to xmm0-xmm2 
-	movlpd xmm0, [esi + eax*8]
-	movlpd xmm1, [esi + eax*8 + 8]
-	movlpd xmm2, [esi + eax*8 + 16]
-
-	mov    edi, [ebp + nb430_faction]
-
-	;# move nb430_ix-iz to xmm4-xmm6 
-	movapd xmm4, [esp + nb430_ix]
-	movapd xmm5, [esp + nb430_iy]
-	movapd xmm6, [esp + nb430_iz]
-
-	;# calc dr 
-	subsd xmm4, xmm0
-	subsd xmm5, xmm1
-	subsd xmm6, xmm2
-
-	;# store dr 
-	movapd [esp + nb430_dx], xmm4
-	movapd [esp + nb430_dy], xmm5
-	movapd [esp + nb430_dz], xmm6
-	;# square it 
-	mulsd xmm4,xmm4
-	mulsd xmm5,xmm5
-	mulsd xmm6,xmm6
-	addsd xmm4, xmm5
-	addsd xmm4, xmm6
-	;# rsq in xmm4 
-
-	cvtsd2ss xmm5, xmm4	
-	rsqrtss xmm5, xmm5
-	cvtss2sd xmm2, xmm5	;# lu in low xmm2 
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulsd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [esp + nb430_three]
-	mulsd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb430_half]
-	subsd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulsd xmm1, xmm5	
-	mulsd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulsd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [esp + nb430_three]
-	mulsd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb430_half]
-	subsd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulsd xmm2, xmm5	
-	mulsd xmm0, xmm2	;# xmm0=iter2 of rinv (new lu) 
-	mulsd xmm4, xmm0	;# xmm4=r 
-	movsd [esp + nb430_r], xmm4
-	mulsd xmm4, [esp + nb430_gbscale]
-	
-	cvttsd2si edx, xmm4	;# mm6 = lu idx 
-	cvtsi2sd xmm5, edx
-	subsd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	movapd xmm2, xmm1	
-	mulsd  xmm2, xmm2	;# xmm2=eps2 
-	
-	shl edx, 2		;# idx *= 4 
-	mov  esi, [ebp + nb430_GBtab]
-
-	;# Coulomb 
-	movapd xmm4, [esi + edx*8]	;# Y1 F1 	
-	xorpd xmm3, xmm3
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 
-	unpckhpd xmm5, xmm3	;# F1 
-
-	movapd xmm6, [esi + edx*8 + 16]	;# G1 H1 	
-	xorpd xmm3, xmm3
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 
-	unpckhpd xmm7, xmm3	;# H1 
-	;# coulomb table ready, in xmm4-xmm7  		
-	mulsd  xmm6, xmm1	;# xmm6=Geps 
-	mulsd  xmm7, xmm2	;# xmm7=Heps2 
-	addsd  xmm5, xmm6
-	addsd  xmm5, xmm7	;# xmm5=Fp 	
-	mulsd  xmm7, [esp + nb430_two]	;# two*Heps2 
-	movapd xmm3, [esp + nb430_qq]
-	addsd  xmm7, xmm6
-	addsd  xmm7, xmm5 ;# xmm7=FF 
-	mulsd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addsd  xmm5, xmm4 ;# xmm5=VV 
-	mulsd  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulsd  xmm3, xmm7 ;# fijC=FF*qq 
-	;# get jnr from regs
-	movd ebx, mm2
-	mov esi, [ebp + nb430_dvda]
-	
-	;# Calculate dVda
-	xorpd xmm7, xmm7
-	mulsd xmm3, [esp + nb430_gbscale]
-	movsd xmm6, xmm3
-	mulsd  xmm6, [esp + nb430_r]
-	addsd  xmm6, xmm5
-	addsd  xmm5, [esp + nb430_vctot]
-	movsd [esp + nb430_vctot], xmm5 
-
-	;# xmm6=(vcoul+fijC*r)
-	subpd xmm7, xmm6
-	movsd xmm6, xmm7
-	
-	;# update dvdasum
-	addsd  xmm7, [esp + nb430_dvdasum]
-	movsd [esp + nb430_dvdasum], xmm7 
-
-	;# update j atoms dvdaj
-	addsd  xmm6, [esi + ebx*8]
-	movsd  [esi + ebx*8], xmm6
-	
-	;# put scalar force on stack temporarily 
-	movsd [esp + nb430_fscal], xmm3
-
-	movsd xmm4, [esp + nb430_r]
-	mulsd  xmm4, [esp + nb430_tsc]
-	cvttsd2si edx, xmm4	;# mm6 = lu idx 
-	cvtsi2sd xmm5, edx
-	subsd xmm4, xmm5
-	movsd xmm1, xmm4	;# xmm1=eps 
-	movsd xmm2, xmm1	
-	mulsd  xmm2, xmm2	;# xmm2=eps2 
-
-	shl edx, 3
-
-	mov  esi, [ebp + nb430_VFtab]
-
-	;# Dispersion 
-	movapd xmm4, [esi + edx*8]	;# Y1 F1 	
-	xorpd xmm3, xmm3
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 
-	unpckhpd xmm5, xmm3	;# F1 
-
-	movapd xmm6, [esi + edx*8 + 16]	;# G1 H1 	
-	xorpd xmm3, xmm3
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 
-	unpckhpd xmm7, xmm3	;# H1 
-	;# Dispersion table ready, in xmm4-xmm7  		
-	mulsd  xmm6, xmm1	;# xmm6=Geps 
-	mulsd  xmm7, xmm2	;# xmm7=Heps2 
-	addsd  xmm5, xmm6
-	addsd  xmm5, xmm7	;# xmm5=Fp 	
-	mulsd  xmm7, [esp + nb430_two]	;# two*Heps2 
-	movapd xmm3, [esp + nb430_qq]
-	addsd  xmm7, xmm6
-	addsd  xmm7, xmm5 ;# xmm7=FF 
-	mulsd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addsd  xmm5, xmm4 ;# xmm5=VV 
-
-	movapd xmm4, [esp + nb430_c6]
-	mulsd  xmm7, xmm4	 ;# fijD 
-	mulsd  xmm5, xmm4	 ;# Vvdw6
-	mulpd  xmm7, [esp + nb430_tsc]
-	addsd  xmm7, [esp + nb430_fscal] ;# add to fscal 
-
-	;# put scalar force back on stack Update Vvdwtot directly 
-	addsd  xmm5, [esp + nb430_Vvdwtot]
-	movlpd [esp + nb430_fscal], xmm7
-	movlpd [esp + nb430_Vvdwtot], xmm5
-
-	;# Repulsion 
-	movapd xmm4, [esi + edx*8 + 32]	;# Y1 F1 	
-	xorpd xmm3, xmm3
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 
-	unpckhpd xmm5, xmm3	;# F1 
-
-	movapd xmm6, [esi + edx*8 + 48]	;# G1 H1 	
-	xorpd xmm3, xmm3
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 
-	unpckhpd xmm7, xmm3	;# H1 
-	;# Dispersion table ready, in xmm4-xmm7  		
-	mulsd  xmm6, xmm1	;# xmm6=Geps 
-	mulsd  xmm7, xmm2	;# xmm7=Heps2 
-	addsd  xmm5, xmm6
-	addsd  xmm5, xmm7	;# xmm5=Fp 	
-	mulsd  xmm7, [esp + nb430_two]	;# two*Heps2 
-	movapd xmm3, [esp + nb430_qq]
-	addsd  xmm7, xmm6
-	addsd  xmm7, xmm5 ;# xmm7=FF 
-	mulsd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addsd  xmm5, xmm4 ;# xmm5=VV 
-
-	movapd xmm4, [esp + nb430_c12]
-	mulsd  xmm7, xmm4 ;# fijR 
-	mulsd  xmm5, xmm4 ;# Vvdw12 
-	mulpd  xmm7, [esp + nb430_tsc]
-	addsd  xmm7, [esp + nb430_fscal] 
-	
-	addsd  xmm5, [esp + nb430_Vvdwtot]
-	movlpd [esp + nb430_Vvdwtot], xmm5
-	xorpd  xmm4, xmm4
-
-	mulsd xmm7, xmm0
-	subsd xmm4, xmm7
-
-	movapd xmm0, [esp + nb430_dx]
-	movapd xmm1, [esp + nb430_dy]
-	movapd xmm2, [esp + nb430_dz]
-
-	mov    edi, [ebp + nb430_faction]
-	mulsd  xmm0, xmm4
-	mulsd  xmm1, xmm4
-	mulsd  xmm2, xmm4
-	;# xmm0-xmm2 contains tx-tz (partial force) 
-	;# now update f_i 
-	movapd xmm3, [esp + nb430_fix]
-	movapd xmm4, [esp + nb430_fiy]
-	movapd xmm5, [esp + nb430_fiz]
-	addsd  xmm3, xmm0
-	addsd  xmm4, xmm1
-	addsd  xmm5, xmm2
-	movlpd [esp + nb430_fix], xmm3
-	movlpd [esp + nb430_fiy], xmm4
-	movlpd [esp + nb430_fiz], xmm5
-	;# the fj's - start by accumulating forces from memory 
-	movlpd xmm3, [edi + eax*8]
-	movlpd xmm4, [edi + eax*8 + 8]
-	movlpd xmm5, [edi + eax*8 + 16]
-	subsd xmm3, xmm0
-	subsd xmm4, xmm1
-	subsd xmm5, xmm2
-	movlpd [edi + eax*8], xmm3
-	movlpd [edi + eax*8 + 8], xmm4
-	movlpd [edi + eax*8 + 16], xmm5
-.nb430_updateouterdata:
-	mov   ecx, [esp + nb430_ii3]
-	mov   edi, [ebp + nb430_faction]
-	mov   esi, [ebp + nb430_fshift]
-	mov   edx, [esp + nb430_is3]
-
-	;# accumulate i forces in xmm0, xmm1, xmm2 
-	movapd xmm0, [esp + nb430_fix]
-	movapd xmm1, [esp + nb430_fiy]
-	movapd xmm2, [esp + nb430_fiz]
-
-	movhlps xmm3, xmm0
-	movhlps xmm4, xmm1
-	movhlps xmm5, xmm2
-	addsd  xmm0, xmm3
-	addsd  xmm1, xmm4
-	addsd  xmm2, xmm5 ;# sum is in low xmm0-xmm2 
-
-	;# increment i force 
-	movsd  xmm3, [edi + ecx*8]
-	movsd  xmm4, [edi + ecx*8 + 8]
-	movsd  xmm5, [edi + ecx*8 + 16]
-	addsd  xmm3, xmm0
-	addsd  xmm4, xmm1
-	addsd  xmm5, xmm2
-	movsd  [edi + ecx*8],     xmm3
-	movsd  [edi + ecx*8 + 8], xmm4
-	movsd  [edi + ecx*8 + 16], xmm5
-
-	;# increment fshift force  
-	movsd  xmm3, [esi + edx*8]
-	movsd  xmm4, [esi + edx*8 + 8]
-	movsd  xmm5, [esi + edx*8 + 16]
-	addsd  xmm3, xmm0
-	addsd  xmm4, xmm1
-	addsd  xmm5, xmm2
-	movsd  [esi + edx*8],     xmm3
-	movsd  [esi + edx*8 + 8], xmm4
-	movsd  [esi + edx*8 + 16], xmm5
-
-	;# get n from stack
-	mov esi, [esp + nb430_n]
-        ;# get group index for i particle 
-        mov   edx, [ebp + nb430_gid]      	;# base of gid[]
-        mov   edx, [edx + esi*4]		;# ggid=gid[n]
-
-	;# accumulate total potential energy and update it 
-	movapd xmm7, [esp + nb430_vctot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addsd  xmm7, xmm6	;# low xmm7 has the sum now 
-
-	;# add earlier value from mem 
-	mov   eax, [ebp + nb430_Vc]
-	addsd xmm7, [eax + edx*8] 
-	;# move back to mem 
-	movsd [eax + edx*8], xmm7 
-	
-	;# accumulate total lj energy and update it 
-	movapd xmm7, [esp + nb430_Vvdwtot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addsd  xmm7, xmm6	;# low xmm7 has the sum now 
-	
-	;# add earlier value from mem 
-	mov   eax, [ebp + nb430_Vvdw]
-	addsd xmm7, [eax + edx*8] 
-	;# move back to mem 
-	movsd [eax + edx*8], xmm7 
-	
-	;# accumulate dVda and update it 
-	movapd xmm7, [esp + nb430_dvdasum]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addsd  xmm7, xmm6	;# low xmm7 has the sum now 
-	
-	mov edx, [esp + nb430_ii]
-	mov eax, [ebp + nb430_dvda]
-	addsd xmm7, [eax + edx*8]
-	movsd [eax + edx*8], xmm7
-	
-        ;# finish if last 
-        mov ecx, [esp + nb430_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb430_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [esp + nb430_n], esi
-        jmp .nb430_outer
-.nb430_outerend:
-        ;# check if more outer neighborlists remain
-        mov   ecx, [esp + nb430_nri]
-	;# esi already loaded with n above
-        sub   ecx, esi
-        jz .nb430_end
-        ;# non-zero, do one more workunit
-        jmp   .nb430_threadloop
-.nb430_end:
-	emms
-
-	mov eax, [esp + nb430_nouter]
-	mov ebx, [esp + nb430_ninner]
-	mov ecx, [ebp + nb430_outeriter]
-	mov edx, [ebp + nb430_inneriter]
-	mov [ecx], eax
-	mov [edx], ebx
-
-	mov eax, [esp + nb430_salign]
-	add esp, eax
-	add esp, 484
-	pop edi
-	pop esi
-    	pop edx
-    	pop ecx
-    	pop ebx
-    	pop eax
-	leave
-	ret
-
-
-
-
-	
-.globl nb_kernel430nf_ia32_sse2
-.globl _nb_kernel430nf_ia32_sse2
-nb_kernel430nf_ia32_sse2:	
-_nb_kernel430nf_ia32_sse2:	
-.equiv          nb430nf_p_nri,          8
-.equiv          nb430nf_iinr,           12
-.equiv          nb430nf_jindex,         16
-.equiv          nb430nf_jjnr,           20
-.equiv          nb430nf_shift,          24
-.equiv          nb430nf_shiftvec,       28
-.equiv          nb430nf_fshift,         32
-.equiv          nb430nf_gid,            36
-.equiv          nb430nf_pos,            40
-.equiv          nb430nf_faction,        44
-.equiv          nb430nf_charge,         48
-.equiv          nb430nf_p_facel,        52
-.equiv          nb430nf_argkrf,         56
-.equiv          nb430nf_argcrf,         60
-.equiv          nb430nf_Vc,             64
-.equiv          nb430nf_type,           68
-.equiv          nb430nf_p_ntype,        72
-.equiv          nb430nf_vdwparam,       76
-.equiv          nb430nf_Vvdw,           80
-.equiv          nb430nf_p_tabscale,     84
-.equiv          nb430nf_VFtab,          88
-.equiv          nb430nf_invsqrta,       92
-.equiv          nb430nf_dvda,           96
-.equiv          nb430nf_p_gbtabscale,   100
-.equiv          nb430nf_GBtab,          104
-.equiv          nb430nf_p_nthreads,     108
-.equiv          nb430nf_count,          112
-.equiv          nb430nf_mtx,            116
-.equiv          nb430nf_outeriter,      120
-.equiv          nb430nf_inneriter,      124
-.equiv          nb430nf_work,           128
-	;# stack offsets for local variables  
-	;# bottom of stack is cache-aligned for sse2 use 
-.equiv          nb430nf_ix,             0
-.equiv          nb430nf_iy,             16
-.equiv          nb430nf_iz,             32
-.equiv          nb430nf_iq,             48
-.equiv          nb430nf_gbtsc,          64
-.equiv          nb430nf_tsc,            80
-.equiv          nb430nf_qq,             96
-.equiv          nb430nf_c6,             112
-.equiv          nb430nf_c12,            128
-.equiv          nb430nf_vctot,          144
-.equiv          nb430nf_Vvdwtot,        160
-.equiv          nb430nf_half,           176
-.equiv          nb430nf_three,          192
-.equiv          nb430nf_r,              208
-.equiv          nb430nf_isai,           224
-.equiv          nb430nf_isaprod,        240
-.equiv          nb430nf_gbscale,        256
-.equiv          nb430nf_is3,            272
-.equiv          nb430nf_ii3,            276
-.equiv          nb430nf_ntia,           280
-.equiv          nb430nf_innerjjnr,      284
-.equiv          nb430nf_innerk,         288
-.equiv          nb430nf_n,              292
-.equiv          nb430nf_nn1,            296
-.equiv          nb430nf_nri,            300
-.equiv          nb430nf_facel,          304   ;# uses 8 bytes
-.equiv          nb430nf_ntype,          312
-.equiv          nb430nf_nouter,         316
-.equiv          nb430nf_ninner,         320
-.equiv          nb430nf_salign,         324
-	push ebp
-	mov ebp,esp	
-    	push eax
-    	push ebx
-    	push ecx
-    	push edx
-	push esi
-	push edi
-	sub esp, 328		;# local stack space 
-	mov  eax, esp
-	and  eax, 0xf
-	sub esp, eax
-	mov [esp + nb430nf_salign], eax
-
-	emms
-
-	;# Move args passed by reference to stack
-	mov ecx, [ebp + nb430nf_p_nri]
-	mov esi, [ebp + nb430nf_p_facel]
-	mov edi, [ebp + nb430nf_p_ntype]
-	mov ecx, [ecx]
-	movsd xmm7, [esi]
-	mov edi, [edi]
-	mov [esp + nb430nf_nri], ecx
-	movsd [esp + nb430nf_facel], xmm7
-	mov [esp + nb430nf_ntype], edi
-
-	;# zero iteration counters
-	mov eax, 0
-	mov [esp + nb430nf_nouter], eax
-	mov [esp + nb430nf_ninner], eax
-
-
-	;# create constant floating-point factors on stack
-	mov eax, 0x00000000     ;# lower half of double 0.5 IEEE (hex)
-	mov ebx, 0x3fe00000
-	mov [esp + nb430nf_half], eax
-	mov [esp + nb430nf_half+4], ebx
-	movsd xmm1, [esp + nb430nf_half]
-	shufpd xmm1, xmm1, 0    ;# splat to all elements
-	movapd xmm3, xmm1
-	addpd  xmm3, xmm3       ;# 1.0
-	movapd xmm2, xmm3
-	addpd  xmm2, xmm2       ;# 2.0
-	addpd  xmm3, xmm2	;# 3.0
-	movapd [esp + nb430nf_half], xmm1
-	movapd [esp + nb430nf_three], xmm3
-	mov eax, [ebp + nb430nf_p_tabscale]
-	movsd xmm3, [eax]
-	mov eax, [ebp + nb430nf_p_gbtabscale]
-	movsd xmm4, [eax]
-	shufpd xmm3, xmm3, 0
-	shufpd xmm4, xmm4, 0
-	movapd [esp + nb430nf_tsc], xmm3
-	movapd [esp + nb430nf_gbtsc], xmm4
-
-.nb430nf_threadloop:
-        mov   esi, [ebp + nb430nf_count]          ;# pointer to sync counter
-        mov   eax, [esi]
-.nb430nf_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+10
-        lock
-        cmpxchg [esi], ebx                      ;# write nn1 to *counter,
-                                                ;# if it hasnt changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb430nf_spinlock
-
-        ;# if(nn1>nri) nn1=nri
-        mov ecx, [esp + nb430nf_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [esp + nb430nf_n], eax
-        mov [esp + nb430nf_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists
-	mov esi, eax				;# copy n to esi
-        jg  .nb430nf_outerstart
-        jmp .nb430nf_end
-
-.nb430nf_outerstart:
-	;# ebx contains number of outer iterations
-	add ebx, [esp + nb430nf_nouter]
-	mov [esp + nb430nf_nouter], ebx
-
-.nb430nf_outer:
-	mov   eax, [ebp + nb430nf_shift]      ;# eax = pointer into shift[] 
-	mov   ebx, [eax+esi*4]		;# ebx=shift[n] 
-	
-	lea   ebx, [ebx + ebx*2]    ;# ebx=3*is 
-	mov   [esp + nb430nf_is3],ebx    	;# store is3 
-
-	mov   eax, [ebp + nb430nf_shiftvec]   ;# eax = base of shiftvec[] 
-
-	movsd xmm0, [eax + ebx*8]
-	movsd xmm1, [eax + ebx*8 + 8]
-	movsd xmm2, [eax + ebx*8 + 16] 
-
-	mov   ecx, [ebp + nb430nf_iinr]       ;# ecx = pointer into iinr[]
-	mov   ebx, [ecx+esi*4]	    ;# ebx =ii 
-
-	mov   edx, [ebp + nb430nf_charge]
-	movsd xmm3, [edx + ebx*8]	
-	mulsd xmm3, [esp + nb430nf_facel]
-	shufpd xmm3, xmm3, 0
-
-	mov   edx, [ebp + nb430nf_invsqrta]	;# load invsqrta[ii]
-	movsd xmm4, [edx + ebx*8]
-	shufpd xmm4, xmm4, 0
-
-    	mov   edx, [ebp + nb430nf_type] 
-    	mov   edx, [edx + ebx*4]
-    	imul  edx, [esp + nb430nf_ntype]
-    	shl   edx, 1
-    	mov   [esp + nb430nf_ntia], edx
-		
-	lea   ebx, [ebx + ebx*2]	;# ebx = 3*ii=ii3 
-	mov   eax, [ebp + nb430nf_pos]    ;# eax = base of pos[]  
-
-	addsd xmm0, [eax + ebx*8]
-	addsd xmm1, [eax + ebx*8 + 8]
-	addsd xmm2, [eax + ebx*8 + 16]
-
-	movapd [esp + nb430nf_iq], xmm3
-	movapd [esp + nb430nf_isai], xmm4	
-	
-	shufpd xmm0, xmm0, 0
-	shufpd xmm1, xmm1, 0
-	shufpd xmm2, xmm2, 0
-
-	movapd [esp + nb430nf_ix], xmm0
-	movapd [esp + nb430nf_iy], xmm1
-	movapd [esp + nb430nf_iz], xmm2
-
-	mov   [esp + nb430nf_ii3], ebx
-	
-	;# clear vctot
-	xorpd xmm4, xmm4
-	movapd [esp + nb430nf_vctot], xmm4
-	movapd [esp + nb430nf_Vvdwtot], xmm4
-
-	mov   eax, [ebp + nb430nf_jindex]
-	mov   ecx, [eax + esi*4]	     ;# jindex[n] 
-	mov   edx, [eax + esi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   esi, [ebp + nb430nf_pos]
-	mov   edi, [ebp + nb430nf_faction]	
-	mov   eax, [ebp + nb430nf_jjnr]
-	shl   ecx, 2
-	add   eax, ecx
-	mov   [esp + nb430nf_innerjjnr], eax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  2
-	add   ecx, [esp + nb430nf_ninner]
-	mov   [esp + nb430nf_ninner], ecx
-	add   edx, 0
-	mov   [esp + nb430nf_innerk], edx    ;# number of innerloop atoms 
-	jge   .nb430nf_unroll_loop
-	jmp   .nb430nf_checksingle
-.nb430nf_unroll_loop:	
-	;# twice unrolled innerloop here 
-	mov   edx, [esp + nb430nf_innerjjnr]   ;# pointer to jjnr[k] 
-	mov   eax, [edx]
-	mov   ebx, [edx + 4]
-	add dword ptr [esp + nb430nf_innerjjnr], 8	;# advance pointer (unrolled 2) 
-
-	;# load isaj
-	mov esi, [ebp + nb430nf_invsqrta]
-	movlpd xmm2, [esi + eax*8]
-	movhpd xmm2, [esi + ebx*8]
-	mulpd  xmm2, [esp + nb430nf_isai]
-	movapd [esp + nb430nf_isaprod], xmm2	
-	movapd xmm1, xmm2
-	mulpd xmm1, [esp + nb430nf_gbtsc]
-	movapd [esp + nb430nf_gbscale], xmm1
-	
-	mov esi, [ebp + nb430nf_charge]    ;# base of charge[] 
-	movlpd xmm3, [esi + eax*8]
-	movhpd xmm3, [esi + ebx*8]
-
-	mulpd xmm2, [esp + nb430nf_iq]
-	mulpd  xmm3, xmm2
-	movapd [esp + nb430nf_qq], xmm3	
-	
-	mov esi, [ebp + nb430nf_type]
-	mov ecx, [esi + eax*4]
-	mov edx, [esi + ebx*4]
-	mov esi, [ebp + nb430nf_vdwparam]
-	shl ecx, 1
-	shl edx, 1
-	mov edi, [esp + nb430nf_ntia]
-	add ecx, edi
-	add edx, edi
-
-	movlpd xmm6, [esi + ecx*8]	;# c6a
-	movlpd xmm7, [esi + edx*8]	;# c6b
-	movhpd xmm6, [esi + ecx*8 + 8]	;# c6a c12a 
-	movhpd xmm7, [esi + edx*8 + 8]	;# c6b c12b 
-
-	movapd xmm4, xmm6
-	unpcklpd xmm4, xmm7
-	unpckhpd xmm6, xmm7
-	
-	movapd [esp + nb430nf_c6], xmm4
-	movapd [esp + nb430nf_c12], xmm6
-	
-	mov esi, [ebp + nb430nf_pos]		;# base of pos[] 
-
-	lea   eax, [eax + eax*2]     ;# replace jnr with j3 
-	lea   ebx, [ebx + ebx*2]	
-
-	;# move two coordinates to xmm0-xmm2 
-	movlpd xmm0, [esi + eax*8]
-	movlpd xmm1, [esi + eax*8 + 8]
-	movlpd xmm2, [esi + eax*8 + 16]
-	movhpd xmm0, [esi + ebx*8]
-	movhpd xmm1, [esi + ebx*8 + 8]
-	movhpd xmm2, [esi + ebx*8 + 16]		
-
-	mov    edi, [ebp + nb430nf_faction]
-	
-	;# move nb430nf_ix-iz to xmm4-xmm6 
-	movapd xmm4, [esp + nb430nf_ix]
-	movapd xmm5, [esp + nb430nf_iy]
-	movapd xmm6, [esp + nb430nf_iz]
-
-	;# calc dr 
-	subpd xmm4, xmm0
-	subpd xmm5, xmm1
-	subpd xmm6, xmm2
-
-	;# square it 
-	mulpd xmm4,xmm4
-	mulpd xmm5,xmm5
-	mulpd xmm6,xmm6
-	addpd xmm4, xmm5
-	addpd xmm4, xmm6
-	;# rsq in xmm4 
-
-	cvtpd2ps xmm5, xmm4	
-	rsqrtps xmm5, xmm5
-	cvtps2pd xmm2, xmm5	;# lu in low xmm2 
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulpd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [esp + nb430nf_three]
-	mulpd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb430nf_half]
-	subpd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulpd xmm1, xmm5	
-	mulpd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulpd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [esp + nb430nf_three]
-	mulpd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb430nf_half]
-	subpd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulpd xmm2, xmm5	
-	mulpd xmm0, xmm2	;# xmm0=iter2 of rinv 
-	mulpd xmm4, xmm0	;# xmm4=r 
-	movapd [esp + nb430nf_r], xmm4
-	mulpd xmm4, [esp + nb430nf_gbscale]
-
-	cvttpd2pi mm6, xmm4	;# mm6 = lu idx 
-	cvtpi2pd xmm5, mm6
-	subpd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	movapd xmm2, xmm1	
-	mulpd  xmm2, xmm2	;# xmm2=eps2 
-	
-	pslld mm6, 2		;# idx *= 4 
-
-	mov  esi, [ebp + nb430nf_GBtab]
-	movd ecx, mm6
-	psrlq mm6, 32
-	movd edx, mm6		;# indices in eax/ebx 
-
-	;# Coulomb 
-	movapd xmm4, [esi + ecx*8]	;# Y1 F1 	
-	movapd xmm3, [esi + edx*8]	;# Y2 F2 
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 Y2 
-	unpckhpd xmm5, xmm3	;# F1 F2 
-
-	movapd xmm6, [esi + ecx*8 + 16]	;# G1 H1 	
-	movapd xmm3, [esi + edx*8 + 16]	;# G2 H2 
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 G2 
-	unpckhpd xmm7, xmm3	;# H1 H2 
-	;# coulomb table ready, in xmm4-xmm7  		
-	mulpd  xmm6, xmm1	;# xmm6=Geps 
-	mulpd  xmm7, xmm2	;# xmm7=Heps2 
-	addpd  xmm5, xmm6
-	addpd  xmm5, xmm7	;# xmm5=Fp 	
-	movapd xmm3, [esp + nb430nf_qq]
-	mulpd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addpd  xmm5, xmm4 ;# xmm5=VV 
-	mulpd  xmm5, xmm3 ;# vcoul=qq*VV  
-	addpd  xmm5, [esp + nb430nf_vctot]
-	movapd [esp + nb430nf_vctot], xmm5
-	
-	movapd xmm4, [esp + nb430nf_r]
-	mulpd  xmm4, [esp + nb430nf_tsc]
-	cvttpd2pi mm6, xmm4	;# mm6 = lu idx 
-	cvtpi2pd xmm5, mm6
-	subpd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	movapd xmm2, xmm1	
-	mulpd  xmm2, xmm2	;# xmm2=eps2 
-	
-	pslld mm6, 3		;# idx *= 8
-
-	mov  esi, [ebp + nb430nf_VFtab]
-
-	movd ecx, mm6
-	psrlq mm6, 32
-	movd edx, mm6		;# indices in eax/ebx 
-
-	;# Dispersion 
-	movapd xmm4, [esi + ecx*8]	;# Y1 F1 	
-	movapd xmm3, [esi + edx*8]	;# Y2 F2 
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 Y2 
-	unpckhpd xmm5, xmm3	;# F1 F2 
-
-	movapd xmm6, [esi + ecx*8 + 16]	;# G1 H1 	
-	movapd xmm3, [esi + edx*8 + 16]	;# G2 H2 
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 G2 
-	unpckhpd xmm7, xmm3	;# H1 H2 
-	;# Dispersion table ready, in xmm4-xmm7  		
-	mulpd  xmm6, xmm1	;# xmm6=Geps 
-	mulpd  xmm7, xmm2	;# xmm7=Heps2 
-	addpd  xmm5, xmm6
-	addpd  xmm5, xmm7	;# xmm5=Fp 	
-	mulpd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addpd  xmm5, xmm4 ;# xmm5=VV 
-
-	mulpd  xmm5, [esp + nb430nf_c6]	 ;# Vvdw6
-	addpd  xmm5, [esp + nb430nf_Vvdwtot]
-	movapd [esp + nb430nf_Vvdwtot], xmm5
-
-	;# Repulsion 
-	movapd xmm4, [esi + ecx*8 + 32]	;# Y1 F1 	
-	movapd xmm3, [esi + edx*8 + 32]	;# Y2 F2 
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 Y2 
-	unpckhpd xmm5, xmm3	;# F1 F2 
-
-	movapd xmm6, [esi + ecx*8 + 48]	;# G1 H1 	
-	movapd xmm3, [esi + edx*8 + 48]	;# G2 H2 
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 G2 
-	unpckhpd xmm7, xmm3	;# H1 H2 
-	;# Dispersion table ready, in xmm4-xmm7  		
-	mulpd  xmm6, xmm1	;# xmm6=Geps 
-	mulpd  xmm7, xmm2	;# xmm7=Heps2 
-	addpd  xmm5, xmm6
-	addpd  xmm5, xmm7	;# xmm5=Fp 	
-	mulpd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addpd  xmm5, xmm4 ;# xmm5=VV 
-
-	mulpd  xmm5, [esp + nb430nf_c12] ;# Vvdw12 
-	addpd  xmm5, [esp + nb430nf_Vvdwtot]
-	movapd [esp + nb430nf_Vvdwtot], xmm5
-	xorpd  xmm4, xmm4
-	
-	;# should we do one more iteration? 
-	sub dword ptr [esp + nb430nf_innerk],  2
-	jl    .nb430nf_checksingle
-	jmp   .nb430nf_unroll_loop
-.nb430nf_checksingle:
-	mov   edx, [esp + nb430nf_innerk]
-	and   edx, 1
-	jnz    .nb430nf_dosingle
-	jmp    .nb430nf_updateouterdata
-.nb430nf_dosingle:
-	mov esi, [ebp + nb430nf_charge]
-	mov edx, [ebp + nb430nf_invsqrta]
-	mov edi, [ebp + nb430nf_pos]
-	mov   ecx, [esp + nb430nf_innerjjnr]
-	mov   eax, [ecx]	
-
-	xorpd  xmm6, xmm6
-	movapd xmm7, xmm6
-	movsd  xmm7, [edx + eax*8]
-	movlpd xmm6, [esi + eax*8]	;# xmm6(0) has the charge
-	mulsd  xmm7, [esp + nb430nf_isai]
-	movapd [esp + nb430nf_isaprod], xmm7
-	movapd xmm1, xmm7
-	mulpd xmm1, [esp + nb430nf_gbtsc]
-	movapd [esp + nb430nf_gbscale], xmm1
-	
-	mulsd  xmm7, [esp + nb430nf_iq]
-	mulsd  xmm6, xmm7
-	movapd [esp + nb430nf_qq], xmm6
-	
-	mov esi, [ebp + nb430nf_type]
-	mov edx, [esi + eax*4]
-	mov esi, [ebp + nb430nf_vdwparam]
-	shl edx, 1
-	mov edi, [esp + nb430nf_ntia]
-	add edx, edi
-
-	movlpd xmm6, [esi + edx*8]	;# c6a
-	movhpd xmm6, [esi + edx*8 + 8]	;# c6a c12a 
-
-	xorpd xmm7, xmm7
-	movapd xmm4, xmm6
-	unpcklpd xmm4, xmm7
-	unpckhpd xmm6, xmm7
-	
-	movapd [esp + nb430nf_c6], xmm4
-	movapd [esp + nb430nf_c12], xmm6
-	
-	mov esi, [ebp + nb430nf_pos]		;# base of pos[] 
-
-	lea   eax, [eax + eax*2]     ;# replace jnr with j3 
-
-	;# move two coordinates to xmm0-xmm2 
-	movlpd xmm0, [esi + eax*8]
-	movlpd xmm1, [esi + eax*8 + 8]
-	movlpd xmm2, [esi + eax*8 + 16]
-
-	mov    edi, [ebp + nb430nf_faction]
-
-	;# move nb430nf_ix-iz to xmm4-xmm6 
-	movapd xmm4, [esp + nb430nf_ix]
-	movapd xmm5, [esp + nb430nf_iy]
-	movapd xmm6, [esp + nb430nf_iz]
-
-	;# calc dr 
-	subsd xmm4, xmm0
-	subsd xmm5, xmm1
-	subsd xmm6, xmm2
-
-	;# square it 
-	mulsd xmm4,xmm4
-	mulsd xmm5,xmm5
-	mulsd xmm6,xmm6
-	addsd xmm4, xmm5
-	addsd xmm4, xmm6
-	;# rsq in xmm4 
-
-	cvtsd2ss xmm5, xmm4	
-	rsqrtss xmm5, xmm5
-	cvtss2sd xmm2, xmm5	;# lu in low xmm2 
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulsd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [esp + nb430nf_three]
-	mulsd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb430nf_half]
-	subsd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulsd xmm1, xmm5	
-	mulsd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulsd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [esp + nb430nf_three]
-	mulsd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [esp + nb430nf_half]
-	subsd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulsd xmm2, xmm5	
-	mulsd xmm0, xmm2	;# xmm0=iter2 of rinv (new lu) 
-	mulsd xmm4, xmm0	;# xmm4=r 
-	movsd [esp + nb430nf_r], xmm4
-	mulsd xmm4, [esp + nb430nf_gbscale]
-	
-	cvttsd2si edx, xmm4	;# mm6 = lu idx 
-	cvtsi2sd xmm5, edx
-	subsd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	movapd xmm2, xmm1	
-	mulsd  xmm2, xmm2	;# xmm2=eps2 
-	
-	shl edx, 2		;# idx *= 4 
-	mov  esi, [ebp + nb430nf_GBtab]
-
-	;# Coulomb 
-	movapd xmm4, [esi + edx*8]	;# Y1 F1 	
-	xorpd xmm3, xmm3
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 
-	unpckhpd xmm5, xmm3	;# F1 
-
-	movapd xmm6, [esi + edx*8 + 16]	;# G1 H1 	
-	xorpd xmm3, xmm3
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 
-	unpckhpd xmm7, xmm3	;# H1 
-	;# coulomb table ready, in xmm4-xmm7  		
-	mulsd  xmm6, xmm1	;# xmm6=Geps 
-	mulsd  xmm7, xmm2	;# xmm7=Heps2 
-	addsd  xmm5, xmm6
-	addsd  xmm5, xmm7	;# xmm5=Fp 	
-	movapd xmm3, [esp + nb430nf_qq]
-	mulsd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addsd  xmm5, xmm4 ;# xmm5=VV 
-	mulsd  xmm5, xmm3 ;# vcoul=qq*VV  
-	addsd  xmm5, [esp + nb430nf_vctot]
-	movsd [esp + nb430nf_vctot], xmm5 
-
-	movsd xmm4, [esp + nb430nf_r]
-	mulsd  xmm4, [esp + nb430nf_tsc]
-	cvttsd2si edx, xmm4	;# mm6 = lu idx 
-	cvtsi2sd xmm5, edx
-	subsd xmm4, xmm5
-	movsd xmm1, xmm4	;# xmm1=eps 
-	movsd xmm2, xmm1	
-	mulsd  xmm2, xmm2	;# xmm2=eps2
-
-	shl edx, 3
-
-	mov  esi, [ebp + nb430nf_VFtab]
-
-	;# Dispersion 
-	movapd xmm4, [esi + edx*8]	;# Y1 F1 	
-	xorpd xmm3, xmm3
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 
-	unpckhpd xmm5, xmm3	;# F1 
-
-	movapd xmm6, [esi + edx*8 + 16]	;# G1 H1 	
-	xorpd xmm3, xmm3
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 
-	unpckhpd xmm7, xmm3	;# H1 
-	;# Dispersion table ready, in xmm4-xmm7  		
-	mulsd  xmm6, xmm1	;# xmm6=Geps 
-	mulsd  xmm7, xmm2	;# xmm7=Heps2 
-	addsd  xmm5, xmm6
-	addsd  xmm5, xmm7	;# xmm5=Fp
-	mulsd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addsd  xmm5, xmm4 ;# xmm5=VV 
-
-	mulsd  xmm5, [esp + nb430nf_c6]	 ;# Vvdw6
-	addsd  xmm5, [esp + nb430nf_Vvdwtot]
-	movlpd [esp + nb430nf_Vvdwtot], xmm5
-
-	;# Repulsion 
-	movapd xmm4, [esi + edx*8 + 32]	;# Y1 F1 	
-	xorpd xmm3, xmm3
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 
-	unpckhpd xmm5, xmm3	;# F1 
-
-	movapd xmm6, [esi + edx*8 + 48]	;# G1 H1 	
-	xorpd xmm3, xmm3
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 
-	unpckhpd xmm7, xmm3	;# H1 
-	;# Dispersion table ready, in xmm4-xmm7  		
-	mulsd  xmm6, xmm1	;# xmm6=Geps 
-	mulsd  xmm7, xmm2	;# xmm7=Heps2 
-	addsd  xmm5, xmm6
-	addsd  xmm5, xmm7	;# xmm5=Fp 	
-	mulsd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addsd  xmm5, xmm4 ;# xmm5=VV 
-	mulsd  xmm5, [esp + nb430nf_c12] ;# Vvdw12 
-	addsd  xmm5, [esp + nb430nf_Vvdwtot]
-	movlpd [esp + nb430nf_Vvdwtot], xmm5
-.nb430nf_updateouterdata:
-	;# get n from stack
-	mov esi, [esp + nb430nf_n]
-        ;# get group index for i particle 
-        mov   edx, [ebp + nb430nf_gid]      	;# base of gid[]
-        mov   edx, [edx + esi*4]		;# ggid=gid[n]
-
-	;# accumulate total potential energy and update it 
-	movapd xmm7, [esp + nb430nf_vctot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addsd  xmm7, xmm6	;# low xmm7 has the sum now 
-
-	;# add earlier value from mem 
-	mov   eax, [ebp + nb430nf_Vc]
-	addsd xmm7, [eax + edx*8] 
-	;# move back to mem 
-	movsd [eax + edx*8], xmm7 
-	
-	;# accumulate total lj energy and update it 
-	movapd xmm7, [esp + nb430nf_Vvdwtot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addsd  xmm7, xmm6	;# low xmm7 has the sum now 
-	
-	;# add earlier value from mem 
-	mov   eax, [ebp + nb430nf_Vvdw]
-	addsd xmm7, [eax + edx*8] 
-	;# move back to mem 
-	movsd [eax + edx*8], xmm7 
-		
-        ;# finish if last 
-        mov ecx, [esp + nb430nf_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb430nf_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [esp + nb430nf_n], esi
-        jmp .nb430nf_outer
-.nb430nf_outerend:
-        ;# check if more outer neighborlists remain
-        mov   ecx, [esp + nb430nf_nri]
-	;# esi already loaded with n above
-        sub   ecx, esi
-        jz .nb430nf_end
-        ;# non-zero, do one more workunit
-        jmp   .nb430nf_threadloop
-.nb430nf_end:
-	emms
-
-	mov eax, [esp + nb430nf_nouter]
-	mov ebx, [esp + nb430nf_ninner]
-	mov ecx, [ebp + nb430nf_outeriter]
-	mov edx, [ebp + nb430nf_inneriter]
-	mov [ecx], eax
-	mov [edx], ebx
-
-	mov eax, [esp + nb430nf_salign]
-	add esp, eax
-	add esp, 328
-	pop edi
-	pop esi
-    	pop edx
-    	pop ecx
-    	pop ebx
-    	pop eax
-	leave
-	ret
-
diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel430_ia32_sse2.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel430_ia32_sse2.s
deleted file mode 100644
index fb9c0e2bcf..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel430_ia32_sse2.s
+++ /dev/null
@@ -1,1688 +0,0 @@
-##
-##
-## Gromacs 4.0                         Copyright (c) 1991-2003 
-## David van der Spoel, Erik Lindahl
-##
-## This program is free software; you can redistribute it and/or
-## modify it under the terms of the GNU General Public License
-## as published by the Free Software Foundation; either version 2
-## of the License, or (at your option) any later version.
-##
-## To help us fund GROMACS development, we humbly ask that you cite
-## the research papers on the package. Check out http://www.gromacs.org
-## 
-## And Hey:
-## Gnomes, ROck Monsters And Chili Sauce
-##
-
-
-.globl nb_kernel430_ia32_sse2
-.globl _nb_kernel430_ia32_sse2
-nb_kernel430_ia32_sse2: 
-_nb_kernel430_ia32_sse2:        
-.set nb430_p_nri, 8
-.set nb430_iinr, 12
-.set nb430_jindex, 16
-.set nb430_jjnr, 20
-.set nb430_shift, 24
-.set nb430_shiftvec, 28
-.set nb430_fshift, 32
-.set nb430_gid, 36
-.set nb430_pos, 40
-.set nb430_faction, 44
-.set nb430_charge, 48
-.set nb430_p_facel, 52
-.set nb430_argkrf, 56
-.set nb430_argcrf, 60
-.set nb430_Vc, 64
-.set nb430_type, 68
-.set nb430_p_ntype, 72
-.set nb430_vdwparam, 76
-.set nb430_Vvdw, 80
-.set nb430_p_tabscale, 84
-.set nb430_VFtab, 88
-.set nb430_invsqrta, 92
-.set nb430_dvda, 96
-.set nb430_p_gbtabscale, 100
-.set nb430_GBtab, 104
-.set nb430_p_nthreads, 108
-.set nb430_count, 112
-.set nb430_mtx, 116
-.set nb430_outeriter, 120
-.set nb430_inneriter, 124
-.set nb430_work, 128
-        ## stack offsets for local variables  
-        ## bottom of stack is cache-aligned for sse2 use 
-.set nb430_ix, 0
-.set nb430_iy, 16
-.set nb430_iz, 32
-.set nb430_iq, 48
-.set nb430_dx, 64
-.set nb430_dy, 80
-.set nb430_dz, 96
-.set nb430_two, 112
-.set nb430_gbtsc, 128
-.set nb430_tsc, 144
-.set nb430_qq, 160
-.set nb430_c6, 176
-.set nb430_c12, 192
-.set nb430_fscal, 208
-.set nb430_vctot, 224
-.set nb430_Vvdwtot, 240
-.set nb430_fix, 256
-.set nb430_fiy, 272
-.set nb430_fiz, 288
-.set nb430_half, 304
-.set nb430_three, 320
-.set nb430_r, 336
-.set nb430_isai, 352
-.set nb430_isaprod, 368
-.set nb430_dvdasum, 384
-.set nb430_gbscale, 400
-.set nb430_ii, 416
-.set nb430_is3, 420
-.set nb430_ii3, 424
-.set nb430_ntia, 428
-.set nb430_innerjjnr, 432
-.set nb430_innerk, 436
-.set nb430_n, 440
-.set nb430_nn1, 444
-.set nb430_nri, 448
-.set nb430_facel, 456                         ## uses 8 bytes
-.set nb430_ntype, 464
-.set nb430_nouter, 468
-.set nb430_ninner, 472
-.set nb430_salign, 476
-        pushl %ebp
-        movl %esp,%ebp
-        pushl %eax
-        pushl %ebx
-        pushl %ecx
-        pushl %edx
-        pushl %esi
-        pushl %edi
-        subl $484,%esp          ## local stack space 
-        movl %esp,%eax
-        andl $0xf,%eax
-        subl %eax,%esp
-        movl %eax,nb430_salign(%esp)
-
-        emms
-
-        ## Move args passed by reference to stack
-        movl nb430_p_nri(%ebp),%ecx
-        movl nb430_p_facel(%ebp),%esi
-        movl nb430_p_ntype(%ebp),%edi
-        movl (%ecx),%ecx
-        movsd (%esi),%xmm7
-        movl (%edi),%edi
-        movl %ecx,nb430_nri(%esp)
-        movsd %xmm7,nb430_facel(%esp)
-        movl %edi,nb430_ntype(%esp)
-
-        ## zero iteration counters
-        movl $0,%eax
-        movl %eax,nb430_nouter(%esp)
-        movl %eax,nb430_ninner(%esp)
-
-
-        ## create constant floating-point factors on stack
-        movl $0x00000000,%eax   ## lower half of double 0.5 IEEE (hex)
-        movl $0x3fe00000,%ebx
-        movl %eax,nb430_half(%esp)
-        movl %ebx,nb430_half+4(%esp)
-        movsd nb430_half(%esp),%xmm1
-        shufpd $0,%xmm1,%xmm1  ## splat to all elements
-        movapd %xmm1,%xmm3
-        addpd  %xmm3,%xmm3      ## 1.0
-        movapd %xmm3,%xmm2
-        addpd  %xmm2,%xmm2      ## 2.0
-        addpd  %xmm2,%xmm3      ## 3.0
-        movapd %xmm1,nb430_half(%esp)
-        movapd %xmm2,nb430_two(%esp)
-        movapd %xmm3,nb430_three(%esp)
-        movl nb430_p_tabscale(%ebp),%eax
-        movsd (%eax),%xmm3
-        movl nb430_p_gbtabscale(%ebp),%eax
-        movsd (%eax),%xmm4
-        shufpd $0,%xmm3,%xmm3
-        shufpd $0,%xmm4,%xmm4
-        movapd %xmm3,nb430_tsc(%esp)
-        movapd %xmm4,nb430_gbtsc(%esp)
-
-_nb_kernel430_ia32_sse2.nb430_threadloop: 
-        movl  nb430_count(%ebp),%esi            ## pointer to sync counter
-        movl  (%esi),%eax
-_nb_kernel430_ia32_sse2.nb430_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+10
-        lock 
-        cmpxchgl %ebx,(%esi)                    ## write nn1 to *counter,
-                                                ## if it hasnt changed.
-                                                ## or reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel430_ia32_sse2.nb430_spinlock
-
-        ## if(nn1>nri) nn1=nri
-        movl nb430_nri(%esp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx
-        cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb430_n(%esp)
-        movl %ebx,nb430_nn1(%esp)
-        subl %eax,%ebx                          ## calc number of outer lists
-        movl %eax,%esi                          ## copy n to esi
-        jg  _nb_kernel430_ia32_sse2.nb430_outerstart
-        jmp _nb_kernel430_ia32_sse2.nb430_end
-
-_nb_kernel430_ia32_sse2.nb430_outerstart: 
-        ## ebx contains number of outer iterations
-        addl nb430_nouter(%esp),%ebx
-        movl %ebx,nb430_nouter(%esp)
-
-_nb_kernel430_ia32_sse2.nb430_outer: 
-        movl  nb430_shift(%ebp),%eax        ## eax = pointer into shift[] 
-        movl  (%eax,%esi,4),%ebx        ## ebx=shift[n] 
-
-        leal  (%ebx,%ebx,2),%ebx    ## ebx=3*is 
-        movl  %ebx,nb430_is3(%esp)      ## store is3 
-
-        movl  nb430_shiftvec(%ebp),%eax     ## eax = base of shiftvec[] 
-
-        movsd (%eax,%ebx,8),%xmm0
-        movsd 8(%eax,%ebx,8),%xmm1
-        movsd 16(%eax,%ebx,8),%xmm2
-
-        movl  nb430_iinr(%ebp),%ecx         ## ecx = pointer into iinr[]
-        movl  (%ecx,%esi,4),%ebx    ## ebx =ii 
-        movl  %ebx,nb430_ii(%esp)
-
-        movl  nb430_charge(%ebp),%edx
-        movsd (%edx,%ebx,8),%xmm3
-        mulsd nb430_facel(%esp),%xmm3
-        shufpd $0,%xmm3,%xmm3
-
-        movl  nb430_invsqrta(%ebp),%edx         ## load invsqrta[ii]
-        movsd (%edx,%ebx,8),%xmm4
-        shufpd $0,%xmm4,%xmm4
-
-        movl  nb430_type(%ebp),%edx
-        movl  (%edx,%ebx,4),%edx
-        imull nb430_ntype(%esp),%edx
-        shll  %edx
-        movl  %edx,nb430_ntia(%esp)
-
-        leal  (%ebx,%ebx,2),%ebx        ## ebx = 3*ii=ii3 
-        movl  nb430_pos(%ebp),%eax      ## eax = base of pos[]  
-
-        addsd (%eax,%ebx,8),%xmm0
-        addsd 8(%eax,%ebx,8),%xmm1
-        addsd 16(%eax,%ebx,8),%xmm2
-
-        movapd %xmm3,nb430_iq(%esp)
-        movapd %xmm4,nb430_isai(%esp)
-
-        shufpd $0,%xmm0,%xmm0
-        shufpd $0,%xmm1,%xmm1
-        shufpd $0,%xmm2,%xmm2
-
-        movapd %xmm0,nb430_ix(%esp)
-        movapd %xmm1,nb430_iy(%esp)
-        movapd %xmm2,nb430_iz(%esp)
-
-        movl  %ebx,nb430_ii3(%esp)
-
-        ## clear vctot and i forces 
-        xorpd %xmm4,%xmm4
-        movapd %xmm4,nb430_vctot(%esp)
-        movapd %xmm4,nb430_Vvdwtot(%esp)
-        movapd %xmm4,nb430_dvdasum(%esp)
-        movapd %xmm4,nb430_fix(%esp)
-        movapd %xmm4,nb430_fiy(%esp)
-        movapd %xmm4,nb430_fiz(%esp)
-
-        movl  nb430_jindex(%ebp),%eax
-        movl  (%eax,%esi,4),%ecx             ## jindex[n] 
-        movl  4(%eax,%esi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movl  nb430_pos(%ebp),%esi
-        movl  nb430_faction(%ebp),%edi
-        movl  nb430_jjnr(%ebp),%eax
-        shll  $2,%ecx
-        addl  %ecx,%eax
-        movl  %eax,nb430_innerjjnr(%esp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $2,%edx
-        addl  nb430_ninner(%esp),%ecx
-        movl  %ecx,nb430_ninner(%esp)
-        addl  $0,%edx
-        movl  %edx,nb430_innerk(%esp)      ## number of innerloop atoms 
-        jge   _nb_kernel430_ia32_sse2.nb430_unroll_loop
-        jmp   _nb_kernel430_ia32_sse2.nb430_checksingle
-_nb_kernel430_ia32_sse2.nb430_unroll_loop: 
-        ## twice unrolled innerloop here 
-        movl  nb430_innerjjnr(%esp),%edx     ## pointer to jjnr[k] 
-        movl  (%edx),%eax
-        movl  4(%edx),%ebx
-        addl $8,nb430_innerjjnr(%esp)                   ## advance pointer (unrolled 2) 
-
-        ## load isaj
-        movl nb430_invsqrta(%ebp),%esi
-        movlpd (%esi,%eax,8),%xmm2
-        movhpd (%esi,%ebx,8),%xmm2
-        mulpd  nb430_isai(%esp),%xmm2
-        movapd %xmm2,nb430_isaprod(%esp)
-        movapd %xmm2,%xmm1
-        mulpd nb430_gbtsc(%esp),%xmm1
-        movapd %xmm1,nb430_gbscale(%esp)
-
-        movl nb430_charge(%ebp),%esi     ## base of charge[] 
-        movlpd (%esi,%eax,8),%xmm3
-        movhpd (%esi,%ebx,8),%xmm3
-
-        mulpd nb430_iq(%esp),%xmm2
-        mulpd  %xmm2,%xmm3
-        movapd %xmm3,nb430_qq(%esp)
-
-        movl nb430_type(%ebp),%esi
-        movl (%esi,%eax,4),%ecx
-        movl (%esi,%ebx,4),%edx
-        movl nb430_vdwparam(%ebp),%esi
-        shll %ecx
-        shll %edx
-        movl nb430_ntia(%esp),%edi
-        addl %edi,%ecx
-        addl %edi,%edx
-
-        movlpd (%esi,%ecx,8),%xmm6      ## c6a
-        movlpd (%esi,%edx,8),%xmm7      ## c6b
-        movhpd 8(%esi,%ecx,8),%xmm6     ## c6a c12a 
-        movhpd 8(%esi,%edx,8),%xmm7     ## c6b c12b 
-
-        movapd %xmm6,%xmm4
-        unpcklpd %xmm7,%xmm4
-        unpckhpd %xmm7,%xmm6
-
-        movapd %xmm4,nb430_c6(%esp)
-        movapd %xmm6,nb430_c12(%esp)
-
-        movl nb430_pos(%ebp),%esi               ## base of pos[] 
-
-        movd  %eax,%mm2
-        movd  %ebx,%mm3
-        leal  (%eax,%eax,2),%eax     ## replace jnr with j3 
-        leal  (%ebx,%ebx,2),%ebx
-
-        ## move two coordinates to xmm0-xmm2 
-        movlpd (%esi,%eax,8),%xmm0
-        movlpd 8(%esi,%eax,8),%xmm1
-        movlpd 16(%esi,%eax,8),%xmm2
-        movhpd (%esi,%ebx,8),%xmm0
-        movhpd 8(%esi,%ebx,8),%xmm1
-        movhpd 16(%esi,%ebx,8),%xmm2
-
-        movl   nb430_faction(%ebp),%edi
-
-        ## move nb430_ix-iz to xmm4-xmm6 
-        movapd nb430_ix(%esp),%xmm4
-        movapd nb430_iy(%esp),%xmm5
-        movapd nb430_iz(%esp),%xmm6
-
-        ## calc dr 
-        subpd %xmm0,%xmm4
-        subpd %xmm1,%xmm5
-        subpd %xmm2,%xmm6
-
-        ## store dr 
-        movapd %xmm4,nb430_dx(%esp)
-        movapd %xmm5,nb430_dy(%esp)
-        movapd %xmm6,nb430_dz(%esp)
-        ## square it 
-        mulpd %xmm4,%xmm4
-        mulpd %xmm5,%xmm5
-        mulpd %xmm6,%xmm6
-        addpd %xmm5,%xmm4
-        addpd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        cvtpd2ps %xmm4,%xmm5
-        rsqrtps %xmm5,%xmm5
-        cvtps2pd %xmm5,%xmm2    ## lu in low xmm2 
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulpd %xmm2,%xmm2       ## lu*lu 
-        movapd nb430_three(%esp),%xmm1
-        mulpd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb430_half(%esp),%xmm0
-        subpd %xmm2,%xmm1       ## 30-rsq*lu*lu 
-        mulpd %xmm5,%xmm1
-        mulpd %xmm0,%xmm1       ## xmm0=iter1 of rinv (new lu) 
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulpd %xmm1,%xmm1       ## lu*lu 
-        movapd nb430_three(%esp),%xmm2
-        mulpd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb430_half(%esp),%xmm0
-        subpd %xmm1,%xmm2       ## 30-rsq*lu*lu 
-        mulpd %xmm5,%xmm2
-        mulpd %xmm2,%xmm0       ## xmm0=iter2 of rinv 
-        mulpd %xmm0,%xmm4       ## xmm4=r 
-        movapd %xmm4,nb430_r(%esp)
-        mulpd nb430_gbscale(%esp),%xmm4
-
-        cvttpd2pi %xmm4,%mm6    ## mm6 = lu idx 
-        cvtpi2pd %mm6,%xmm5
-        subpd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-        movapd %xmm1,%xmm2
-        mulpd  %xmm2,%xmm2      ## xmm2=eps2 
-
-        pslld $2,%mm6           ## idx *= 4 
-
-        movl nb430_GBtab(%ebp),%esi
-        movd %mm6,%ecx
-        psrlq $32,%mm6
-        movd %mm6,%edx          ## indices in eax/ebx 
-
-        ## Coulomb 
-        movapd (%esi,%ecx,8),%xmm4      ## Y1 F1        
-        movapd (%esi,%edx,8),%xmm3      ## Y2 F2 
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 Y2 
-        unpckhpd %xmm3,%xmm5    ## F1 F2 
-
-        movapd 16(%esi,%ecx,8),%xmm6    ## G1 H1        
-        movapd 16(%esi,%edx,8),%xmm3    ## G2 H2 
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 G2 
-        unpckhpd %xmm3,%xmm7    ## H1 H2 
-        ## coulomb table ready, in xmm4-xmm7            
-        mulpd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulpd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addpd  %xmm6,%xmm5
-        addpd  %xmm7,%xmm5      ## xmm5=Fp      
-        mulpd  nb430_two(%esp),%xmm7    ## two*Heps2 
-        movapd nb430_qq(%esp),%xmm3
-        addpd  %xmm6,%xmm7
-        addpd  %xmm5,%xmm7 ## xmm7=FF 
-        mulpd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addpd  %xmm4,%xmm5 ## xmm5=VV 
-        mulpd  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulpd  %xmm7,%xmm3 ## fijC=FF*qq 
-        ## get jnr from regs
-        movd %mm2,%ecx
-        movd %mm3,%edx
-        movl nb430_dvda(%ebp),%esi
-
-        ## Calculate dVda
-        xorpd %xmm7,%xmm7
-        mulpd nb430_gbscale(%esp),%xmm3
-        movapd %xmm3,%xmm6
-        mulpd  nb430_r(%esp),%xmm6
-        addpd  %xmm5,%xmm6
-        addpd  nb430_vctot(%esp),%xmm5
-        movapd %xmm5,nb430_vctot(%esp)
-
-        ## xmm6=(vcoul+fijC*r)
-        subpd  %xmm6,%xmm7
-        movapd %xmm7,%xmm6
-
-        ## update dvdasum
-        addpd  nb430_dvdasum(%esp),%xmm7
-        movapd %xmm7,nb430_dvdasum(%esp)
-
-        ## update j atoms dvdaj
-        movhlps %xmm6,%xmm7
-        addsd  (%esi,%ecx,8),%xmm6
-        addsd  (%esi,%edx,8),%xmm7
-        movsd  %xmm6,(%esi,%ecx,8)
-        movsd  %xmm7,(%esi,%edx,8)
-
-        ## put scalar force on stack temporarily 
-        movapd %xmm3,nb430_fscal(%esp)
-
-        movapd nb430_r(%esp),%xmm4
-        mulpd  nb430_tsc(%esp),%xmm4
-        cvttpd2pi %xmm4,%mm6    ## mm6 = lu idx 
-        cvtpi2pd %mm6,%xmm5
-        subpd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-        movapd %xmm1,%xmm2
-        mulpd  %xmm2,%xmm2      ## xmm2=eps2 
-
-        pslld $3,%mm6           ## idx *= 8
-
-        movl nb430_VFtab(%ebp),%esi
-
-        movd %mm6,%ecx
-        psrlq $32,%mm6
-        movd %mm6,%edx          ## indices in eax/ebx 
-
-        ## Dispersion 
-        movapd (%esi,%ecx,8),%xmm4      ## Y1 F1        
-        movapd (%esi,%edx,8),%xmm3      ## Y2 F2 
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 Y2 
-        unpckhpd %xmm3,%xmm5    ## F1 F2 
-
-        movapd 16(%esi,%ecx,8),%xmm6    ## G1 H1        
-        movapd 16(%esi,%edx,8),%xmm3    ## G2 H2 
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 G2 
-        unpckhpd %xmm3,%xmm7    ## H1 H2 
-        ## Dispersion table ready, in xmm4-xmm7                 
-        mulpd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulpd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addpd  %xmm6,%xmm5
-        addpd  %xmm7,%xmm5      ## xmm5=Fp      
-        mulpd  nb430_two(%esp),%xmm7    ## two*Heps2 
-        addpd  %xmm6,%xmm7
-        addpd  %xmm5,%xmm7 ## xmm7=FF 
-        mulpd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addpd  %xmm4,%xmm5 ## xmm5=VV 
-
-        movapd nb430_c6(%esp),%xmm4
-        mulpd  %xmm4,%xmm7       ## fijD 
-        mulpd  %xmm4,%xmm5       ## Vvdw6
-        mulpd  nb430_tsc(%esp),%xmm7
-        addpd  nb430_fscal(%esp),%xmm7   ## add to fscal 
-
-        ## put scalar force back on stack Update Vvdwtot directly 
-        addpd  nb430_Vvdwtot(%esp),%xmm5
-        movapd %xmm7,nb430_fscal(%esp)
-        movapd %xmm5,nb430_Vvdwtot(%esp)
-
-        ## Repulsion 
-        movapd 32(%esi,%ecx,8),%xmm4    ## Y1 F1        
-        movapd 32(%esi,%edx,8),%xmm3    ## Y2 F2 
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 Y2 
-        unpckhpd %xmm3,%xmm5    ## F1 F2 
-
-        movapd 48(%esi,%ecx,8),%xmm6    ## G1 H1        
-        movapd 48(%esi,%edx,8),%xmm3    ## G2 H2 
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 G2 
-        unpckhpd %xmm3,%xmm7    ## H1 H2 
-        ## Dispersion table ready, in xmm4-xmm7                 
-        mulpd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulpd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addpd  %xmm6,%xmm5
-        addpd  %xmm7,%xmm5      ## xmm5=Fp      
-        mulpd  nb430_two(%esp),%xmm7    ## two*Heps2 
-        addpd  %xmm6,%xmm7
-        addpd  %xmm5,%xmm7 ## xmm7=FF 
-        mulpd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addpd  %xmm4,%xmm5 ## xmm5=VV 
-
-        movapd nb430_c12(%esp),%xmm4
-        mulpd  %xmm4,%xmm7 ## fijR 
-        mulpd  %xmm4,%xmm5 ## Vvdw12 
-        mulpd  nb430_tsc(%esp),%xmm7
-        addpd  nb430_fscal(%esp),%xmm7
-
-        addpd  nb430_Vvdwtot(%esp),%xmm5
-        movapd %xmm5,nb430_Vvdwtot(%esp)
-        xorpd  %xmm4,%xmm4
-
-        mulpd %xmm0,%xmm7
-        subpd %xmm7,%xmm4
-
-        movapd nb430_dx(%esp),%xmm0
-        movapd nb430_dy(%esp),%xmm1
-        movapd nb430_dz(%esp),%xmm2
-
-        movl   nb430_faction(%ebp),%edi
-        mulpd  %xmm4,%xmm0
-        mulpd  %xmm4,%xmm1
-        mulpd  %xmm4,%xmm2
-        ## xmm0-xmm2 contains tx-tz (partial force) 
-        ## now update f_i 
-        movapd nb430_fix(%esp),%xmm3
-        movapd nb430_fiy(%esp),%xmm4
-        movapd nb430_fiz(%esp),%xmm5
-        addpd  %xmm0,%xmm3
-        addpd  %xmm1,%xmm4
-        addpd  %xmm2,%xmm5
-        movapd %xmm3,nb430_fix(%esp)
-        movapd %xmm4,nb430_fiy(%esp)
-        movapd %xmm5,nb430_fiz(%esp)
-        ## the fj's - start by accumulating forces from memory 
-        movlpd (%edi,%eax,8),%xmm3
-        movlpd 8(%edi,%eax,8),%xmm4
-        movlpd 16(%edi,%eax,8),%xmm5
-        movhpd (%edi,%ebx,8),%xmm3
-        movhpd 8(%edi,%ebx,8),%xmm4
-        movhpd 16(%edi,%ebx,8),%xmm5
-        subpd %xmm0,%xmm3
-        subpd %xmm1,%xmm4
-        subpd %xmm2,%xmm5
-        movlpd %xmm3,(%edi,%eax,8)
-        movlpd %xmm4,8(%edi,%eax,8)
-        movlpd %xmm5,16(%edi,%eax,8)
-        movhpd %xmm3,(%edi,%ebx,8)
-        movhpd %xmm4,8(%edi,%ebx,8)
-        movhpd %xmm5,16(%edi,%ebx,8)
-
-        ## should we do one more iteration? 
-        subl $2,nb430_innerk(%esp)
-        jl    _nb_kernel430_ia32_sse2.nb430_checksingle
-        jmp   _nb_kernel430_ia32_sse2.nb430_unroll_loop
-_nb_kernel430_ia32_sse2.nb430_checksingle: 
-        movl  nb430_innerk(%esp),%edx
-        andl  $1,%edx
-        jnz    _nb_kernel430_ia32_sse2.nb430_dosingle
-        jmp    _nb_kernel430_ia32_sse2.nb430_updateouterdata
-_nb_kernel430_ia32_sse2.nb430_dosingle: 
-        movl nb430_charge(%ebp),%esi
-        movl nb430_invsqrta(%ebp),%edx
-        movl nb430_pos(%ebp),%edi
-        movl  nb430_innerjjnr(%esp),%ecx
-        movl  (%ecx),%eax
-
-        xorpd  %xmm6,%xmm6
-        movapd %xmm6,%xmm7
-        movsd  (%edx,%eax,8),%xmm7
-        movlpd (%esi,%eax,8),%xmm6      ## xmm6(0) has the charge
-        mulsd  nb430_isai(%esp),%xmm7
-        movapd %xmm7,nb430_isaprod(%esp)
-        movapd %xmm7,%xmm1
-        mulpd nb430_gbtsc(%esp),%xmm1
-        movapd %xmm1,nb430_gbscale(%esp)
-
-        mulsd  nb430_iq(%esp),%xmm7
-        mulsd  %xmm7,%xmm6
-        movapd %xmm6,nb430_qq(%esp)
-
-        movl nb430_type(%ebp),%esi
-        movl (%esi,%eax,4),%edx
-        movl nb430_vdwparam(%ebp),%esi
-        shll %edx
-        movl nb430_ntia(%esp),%edi
-        addl %edi,%edx
-
-        movlpd (%esi,%edx,8),%xmm6      ## c6a
-        movhpd 8(%esi,%edx,8),%xmm6     ## c6a c12a 
-
-        xorpd %xmm7,%xmm7
-        movapd %xmm6,%xmm4
-        unpcklpd %xmm7,%xmm4
-        unpckhpd %xmm7,%xmm6
-
-        movapd %xmm4,nb430_c6(%esp)
-        movapd %xmm6,nb430_c12(%esp)
-
-        movl nb430_pos(%ebp),%esi               ## base of pos[]
-
-        movd  %eax,%mm2
-        leal  (%eax,%eax,2),%eax     ## replace jnr with j3 
-
-        ## move two coordinates to xmm0-xmm2 
-        movlpd (%esi,%eax,8),%xmm0
-        movlpd 8(%esi,%eax,8),%xmm1
-        movlpd 16(%esi,%eax,8),%xmm2
-
-        movl   nb430_faction(%ebp),%edi
-
-        ## move nb430_ix-iz to xmm4-xmm6 
-        movapd nb430_ix(%esp),%xmm4
-        movapd nb430_iy(%esp),%xmm5
-        movapd nb430_iz(%esp),%xmm6
-
-        ## calc dr 
-        subsd %xmm0,%xmm4
-        subsd %xmm1,%xmm5
-        subsd %xmm2,%xmm6
-
-        ## store dr 
-        movapd %xmm4,nb430_dx(%esp)
-        movapd %xmm5,nb430_dy(%esp)
-        movapd %xmm6,nb430_dz(%esp)
-        ## square it 
-        mulsd %xmm4,%xmm4
-        mulsd %xmm5,%xmm5
-        mulsd %xmm6,%xmm6
-        addsd %xmm5,%xmm4
-        addsd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        cvtsd2ss %xmm4,%xmm5
-        rsqrtss %xmm5,%xmm5
-        cvtss2sd %xmm5,%xmm2    ## lu in low xmm2 
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulsd %xmm2,%xmm2       ## lu*lu 
-        movapd nb430_three(%esp),%xmm1
-        mulsd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb430_half(%esp),%xmm0
-        subsd %xmm2,%xmm1       ## 30-rsq*lu*lu 
-        mulsd %xmm5,%xmm1
-        mulsd %xmm0,%xmm1       ## xmm0=iter1 of rinv (new lu) 
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulsd %xmm1,%xmm1       ## lu*lu 
-        movapd nb430_three(%esp),%xmm2
-        mulsd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb430_half(%esp),%xmm0
-        subsd %xmm1,%xmm2       ## 30-rsq*lu*lu 
-        mulsd %xmm5,%xmm2
-        mulsd %xmm2,%xmm0       ## xmm0=iter2 of rinv (new lu) 
-        mulsd %xmm0,%xmm4       ## xmm4=r 
-        movsd %xmm4,nb430_r(%esp)
-        mulsd nb430_gbscale(%esp),%xmm4
-
-        cvttsd2si %xmm4,%edx    ## mm6 = lu idx 
-        cvtsi2sd %edx,%xmm5
-        subsd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-        movapd %xmm1,%xmm2
-        mulsd  %xmm2,%xmm2      ## xmm2=eps2 
-
-        shll $2,%edx            ## idx *= 4 
-        movl nb430_GBtab(%ebp),%esi
-
-        ## Coulomb 
-        movapd (%esi,%edx,8),%xmm4      ## Y1 F1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 
-        unpckhpd %xmm3,%xmm5    ## F1 
-
-        movapd 16(%esi,%edx,8),%xmm6    ## G1 H1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 
-        unpckhpd %xmm3,%xmm7    ## H1 
-        ## coulomb table ready, in xmm4-xmm7            
-        mulsd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulsd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addsd  %xmm6,%xmm5
-        addsd  %xmm7,%xmm5      ## xmm5=Fp      
-        mulsd  nb430_two(%esp),%xmm7    ## two*Heps2 
-        movapd nb430_qq(%esp),%xmm3
-        addsd  %xmm6,%xmm7
-        addsd  %xmm5,%xmm7 ## xmm7=FF 
-        mulsd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addsd  %xmm4,%xmm5 ## xmm5=VV 
-        mulsd  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulsd  %xmm7,%xmm3 ## fijC=FF*qq 
-        ## get jnr from regs
-        movd %mm2,%ebx
-        movl nb430_dvda(%ebp),%esi
-
-        ## Calculate dVda
-        xorpd %xmm7,%xmm7
-        mulsd nb430_gbscale(%esp),%xmm3
-        movsd %xmm3,%xmm6
-        mulsd  nb430_r(%esp),%xmm6
-        addsd  %xmm5,%xmm6
-        addsd  nb430_vctot(%esp),%xmm5
-        movsd %xmm5,nb430_vctot(%esp)
-
-        ## xmm6=(vcoul+fijC*r)
-        subpd %xmm6,%xmm7
-        movsd %xmm7,%xmm6
-
-        ## update dvdasum
-        addsd  nb430_dvdasum(%esp),%xmm7
-        movsd %xmm7,nb430_dvdasum(%esp)
-
-        ## update j atoms dvdaj
-        addsd  (%esi,%ebx,8),%xmm6
-        movsd  %xmm6,(%esi,%ebx,8)
-
-        ## put scalar force on stack temporarily 
-        movsd %xmm3,nb430_fscal(%esp)
-
-        movsd nb430_r(%esp),%xmm4
-        mulsd  nb430_tsc(%esp),%xmm4
-        cvttsd2si %xmm4,%edx    ## mm6 = lu idx 
-        cvtsi2sd %edx,%xmm5
-        subsd %xmm5,%xmm4
-        movsd %xmm4,%xmm1       ## xmm1=eps 
-        movsd %xmm1,%xmm2
-        mulsd  %xmm2,%xmm2      ## xmm2=eps2 
-
-        shll $3,%edx
-
-        movl nb430_VFtab(%ebp),%esi
-
-        ## Dispersion 
-        movapd (%esi,%edx,8),%xmm4      ## Y1 F1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 
-        unpckhpd %xmm3,%xmm5    ## F1 
-
-        movapd 16(%esi,%edx,8),%xmm6    ## G1 H1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 
-        unpckhpd %xmm3,%xmm7    ## H1 
-        ## Dispersion table ready, in xmm4-xmm7                 
-        mulsd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulsd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addsd  %xmm6,%xmm5
-        addsd  %xmm7,%xmm5      ## xmm5=Fp      
-        mulsd  nb430_two(%esp),%xmm7    ## two*Heps2 
-        movapd nb430_qq(%esp),%xmm3
-        addsd  %xmm6,%xmm7
-        addsd  %xmm5,%xmm7 ## xmm7=FF 
-        mulsd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addsd  %xmm4,%xmm5 ## xmm5=VV 
-
-        movapd nb430_c6(%esp),%xmm4
-        mulsd  %xmm4,%xmm7       ## fijD 
-        mulsd  %xmm4,%xmm5       ## Vvdw6
-        mulpd  nb430_tsc(%esp),%xmm7
-        addsd  nb430_fscal(%esp),%xmm7   ## add to fscal 
-
-        ## put scalar force back on stack Update Vvdwtot directly 
-        addsd  nb430_Vvdwtot(%esp),%xmm5
-        movlpd %xmm7,nb430_fscal(%esp)
-        movlpd %xmm5,nb430_Vvdwtot(%esp)
-
-        ## Repulsion 
-        movapd 32(%esi,%edx,8),%xmm4    ## Y1 F1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 
-        unpckhpd %xmm3,%xmm5    ## F1 
-
-        movapd 48(%esi,%edx,8),%xmm6    ## G1 H1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 
-        unpckhpd %xmm3,%xmm7    ## H1 
-        ## Dispersion table ready, in xmm4-xmm7                 
-        mulsd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulsd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addsd  %xmm6,%xmm5
-        addsd  %xmm7,%xmm5      ## xmm5=Fp      
-        mulsd  nb430_two(%esp),%xmm7    ## two*Heps2 
-        movapd nb430_qq(%esp),%xmm3
-        addsd  %xmm6,%xmm7
-        addsd  %xmm5,%xmm7 ## xmm7=FF 
-        mulsd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addsd  %xmm4,%xmm5 ## xmm5=VV 
-
-        movapd nb430_c12(%esp),%xmm4
-        mulsd  %xmm4,%xmm7 ## fijR 
-        mulsd  %xmm4,%xmm5 ## Vvdw12 
-        mulpd  nb430_tsc(%esp),%xmm7
-        addsd  nb430_fscal(%esp),%xmm7
-
-        addsd  nb430_Vvdwtot(%esp),%xmm5
-        movlpd %xmm5,nb430_Vvdwtot(%esp)
-        xorpd  %xmm4,%xmm4
-
-        mulsd %xmm0,%xmm7
-        subsd %xmm7,%xmm4
-
-        movapd nb430_dx(%esp),%xmm0
-        movapd nb430_dy(%esp),%xmm1
-        movapd nb430_dz(%esp),%xmm2
-
-        movl   nb430_faction(%ebp),%edi
-        mulsd  %xmm4,%xmm0
-        mulsd  %xmm4,%xmm1
-        mulsd  %xmm4,%xmm2
-        ## xmm0-xmm2 contains tx-tz (partial force) 
-        ## now update f_i 
-        movapd nb430_fix(%esp),%xmm3
-        movapd nb430_fiy(%esp),%xmm4
-        movapd nb430_fiz(%esp),%xmm5
-        addsd  %xmm0,%xmm3
-        addsd  %xmm1,%xmm4
-        addsd  %xmm2,%xmm5
-        movlpd %xmm3,nb430_fix(%esp)
-        movlpd %xmm4,nb430_fiy(%esp)
-        movlpd %xmm5,nb430_fiz(%esp)
-        ## the fj's - start by accumulating forces from memory 
-        movlpd (%edi,%eax,8),%xmm3
-        movlpd 8(%edi,%eax,8),%xmm4
-        movlpd 16(%edi,%eax,8),%xmm5
-        subsd %xmm0,%xmm3
-        subsd %xmm1,%xmm4
-        subsd %xmm2,%xmm5
-        movlpd %xmm3,(%edi,%eax,8)
-        movlpd %xmm4,8(%edi,%eax,8)
-        movlpd %xmm5,16(%edi,%eax,8)
-_nb_kernel430_ia32_sse2.nb430_updateouterdata: 
-        movl  nb430_ii3(%esp),%ecx
-        movl  nb430_faction(%ebp),%edi
-        movl  nb430_fshift(%ebp),%esi
-        movl  nb430_is3(%esp),%edx
-
-        ## accumulate i forces in xmm0, xmm1, xmm2 
-        movapd nb430_fix(%esp),%xmm0
-        movapd nb430_fiy(%esp),%xmm1
-        movapd nb430_fiz(%esp),%xmm2
-
-        movhlps %xmm0,%xmm3
-        movhlps %xmm1,%xmm4
-        movhlps %xmm2,%xmm5
-        addsd  %xmm3,%xmm0
-        addsd  %xmm4,%xmm1
-        addsd  %xmm5,%xmm2 ## sum is in low xmm0-xmm2 
-
-        ## increment i force 
-        movsd  (%edi,%ecx,8),%xmm3
-        movsd  8(%edi,%ecx,8),%xmm4
-        movsd  16(%edi,%ecx,8),%xmm5
-        addsd  %xmm0,%xmm3
-        addsd  %xmm1,%xmm4
-        addsd  %xmm2,%xmm5
-        movsd  %xmm3,(%edi,%ecx,8)
-        movsd  %xmm4,8(%edi,%ecx,8)
-        movsd  %xmm5,16(%edi,%ecx,8)
-
-        ## increment fshift force  
-        movsd  (%esi,%edx,8),%xmm3
-        movsd  8(%esi,%edx,8),%xmm4
-        movsd  16(%esi,%edx,8),%xmm5
-        addsd  %xmm0,%xmm3
-        addsd  %xmm1,%xmm4
-        addsd  %xmm2,%xmm5
-        movsd  %xmm3,(%esi,%edx,8)
-        movsd  %xmm4,8(%esi,%edx,8)
-        movsd  %xmm5,16(%esi,%edx,8)
-
-        ## get n from stack
-        movl nb430_n(%esp),%esi
-        ## get group index for i particle 
-        movl  nb430_gid(%ebp),%edx              ## base of gid[]
-        movl  (%edx,%esi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total potential energy and update it 
-        movapd nb430_vctot(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now 
-
-        ## add earlier value from mem 
-        movl  nb430_Vc(%ebp),%eax
-        addsd (%eax,%edx,8),%xmm7
-        ## move back to mem 
-        movsd %xmm7,(%eax,%edx,8)
-
-        ## accumulate total lj energy and update it 
-        movapd nb430_Vvdwtot(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now 
-
-        ## add earlier value from mem 
-        movl  nb430_Vvdw(%ebp),%eax
-        addsd (%eax,%edx,8),%xmm7
-        ## move back to mem 
-        movsd %xmm7,(%eax,%edx,8)
-
-        ## accumulate dVda and update it 
-        movapd nb430_dvdasum(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now 
-
-        movl nb430_ii(%esp),%edx
-        movl nb430_dvda(%ebp),%eax
-        addsd (%eax,%edx,8),%xmm7
-        movsd %xmm7,(%eax,%edx,8)
-
-        ## finish if last 
-        movl nb430_nn1(%esp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx
-        jz _nb_kernel430_ia32_sse2.nb430_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb430_n(%esp)
-        jmp _nb_kernel430_ia32_sse2.nb430_outer
-_nb_kernel430_ia32_sse2.nb430_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb430_nri(%esp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx
-        jz _nb_kernel430_ia32_sse2.nb430_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel430_ia32_sse2.nb430_threadloop
-_nb_kernel430_ia32_sse2.nb430_end: 
-        emms
-
-        movl nb430_nouter(%esp),%eax
-        movl nb430_ninner(%esp),%ebx
-        movl nb430_outeriter(%ebp),%ecx
-        movl nb430_inneriter(%ebp),%edx
-        movl %eax,(%ecx)
-        movl %ebx,(%edx)
-
-        movl nb430_salign(%esp),%eax
-        addl %eax,%esp
-        addl $484,%esp
-        popl %edi
-        popl %esi
-        popl %edx
-        popl %ecx
-        popl %ebx
-        popl %eax
-        leave
-        ret
-
-
-
-
-
-.globl nb_kernel430nf_ia32_sse2
-.globl _nb_kernel430nf_ia32_sse2
-nb_kernel430nf_ia32_sse2:       
-_nb_kernel430nf_ia32_sse2:      
-.set nb430nf_p_nri, 8
-.set nb430nf_iinr, 12
-.set nb430nf_jindex, 16
-.set nb430nf_jjnr, 20
-.set nb430nf_shift, 24
-.set nb430nf_shiftvec, 28
-.set nb430nf_fshift, 32
-.set nb430nf_gid, 36
-.set nb430nf_pos, 40
-.set nb430nf_faction, 44
-.set nb430nf_charge, 48
-.set nb430nf_p_facel, 52
-.set nb430nf_argkrf, 56
-.set nb430nf_argcrf, 60
-.set nb430nf_Vc, 64
-.set nb430nf_type, 68
-.set nb430nf_p_ntype, 72
-.set nb430nf_vdwparam, 76
-.set nb430nf_Vvdw, 80
-.set nb430nf_p_tabscale, 84
-.set nb430nf_VFtab, 88
-.set nb430nf_invsqrta, 92
-.set nb430nf_dvda, 96
-.set nb430nf_p_gbtabscale, 100
-.set nb430nf_GBtab, 104
-.set nb430nf_p_nthreads, 108
-.set nb430nf_count, 112
-.set nb430nf_mtx, 116
-.set nb430nf_outeriter, 120
-.set nb430nf_inneriter, 124
-.set nb430nf_work, 128
-        ## stack offsets for local variables  
-        ## bottom of stack is cache-aligned for sse2 use 
-.set nb430nf_ix, 0
-.set nb430nf_iy, 16
-.set nb430nf_iz, 32
-.set nb430nf_iq, 48
-.set nb430nf_gbtsc, 64
-.set nb430nf_tsc, 80
-.set nb430nf_qq, 96
-.set nb430nf_c6, 112
-.set nb430nf_c12, 128
-.set nb430nf_vctot, 144
-.set nb430nf_Vvdwtot, 160
-.set nb430nf_half, 176
-.set nb430nf_three, 192
-.set nb430nf_r, 208
-.set nb430nf_isai, 224
-.set nb430nf_isaprod, 240
-.set nb430nf_gbscale, 256
-.set nb430nf_is3, 272
-.set nb430nf_ii3, 276
-.set nb430nf_ntia, 280
-.set nb430nf_innerjjnr, 284
-.set nb430nf_innerk, 288
-.set nb430nf_n, 292
-.set nb430nf_nn1, 296
-.set nb430nf_nri, 300
-.set nb430nf_facel, 304                       ## uses 8 bytes
-.set nb430nf_ntype, 312
-.set nb430nf_nouter, 316
-.set nb430nf_ninner, 320
-.set nb430nf_salign, 324
-        pushl %ebp
-        movl %esp,%ebp
-        pushl %eax
-        pushl %ebx
-        pushl %ecx
-        pushl %edx
-        pushl %esi
-        pushl %edi
-        subl $328,%esp          ## local stack space 
-        movl %esp,%eax
-        andl $0xf,%eax
-        subl %eax,%esp
-        movl %eax,nb430nf_salign(%esp)
-
-        emms
-
-        ## Move args passed by reference to stack
-        movl nb430nf_p_nri(%ebp),%ecx
-        movl nb430nf_p_facel(%ebp),%esi
-        movl nb430nf_p_ntype(%ebp),%edi
-        movl (%ecx),%ecx
-        movsd (%esi),%xmm7
-        movl (%edi),%edi
-        movl %ecx,nb430nf_nri(%esp)
-        movsd %xmm7,nb430nf_facel(%esp)
-        movl %edi,nb430nf_ntype(%esp)
-
-        ## zero iteration counters
-        movl $0,%eax
-        movl %eax,nb430nf_nouter(%esp)
-        movl %eax,nb430nf_ninner(%esp)
-
-
-        ## create constant floating-point factors on stack
-        movl $0x00000000,%eax   ## lower half of double 0.5 IEEE (hex)
-        movl $0x3fe00000,%ebx
-        movl %eax,nb430nf_half(%esp)
-        movl %ebx,nb430nf_half+4(%esp)
-        movsd nb430nf_half(%esp),%xmm1
-        shufpd $0,%xmm1,%xmm1  ## splat to all elements
-        movapd %xmm1,%xmm3
-        addpd  %xmm3,%xmm3      ## 1.0
-        movapd %xmm3,%xmm2
-        addpd  %xmm2,%xmm2      ## 2.0
-        addpd  %xmm2,%xmm3      ## 3.0
-        movapd %xmm1,nb430nf_half(%esp)
-        movapd %xmm3,nb430nf_three(%esp)
-        movl nb430nf_p_tabscale(%ebp),%eax
-        movsd (%eax),%xmm3
-        movl nb430nf_p_gbtabscale(%ebp),%eax
-        movsd (%eax),%xmm4
-        shufpd $0,%xmm3,%xmm3
-        shufpd $0,%xmm4,%xmm4
-        movapd %xmm3,nb430nf_tsc(%esp)
-        movapd %xmm4,nb430nf_gbtsc(%esp)
-
-_nb_kernel430nf_ia32_sse2.nb430nf_threadloop: 
-        movl  nb430nf_count(%ebp),%esi            ## pointer to sync counter
-        movl  (%esi),%eax
-_nb_kernel430nf_ia32_sse2.nb430nf_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+10
-        lock 
-        cmpxchgl %ebx,(%esi)                    ## write nn1 to *counter,
-                                                ## if it hasnt changed.
-                                                ## or reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel430nf_ia32_sse2.nb430nf_spinlock
-
-        ## if(nn1>nri) nn1=nri
-        movl nb430nf_nri(%esp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx
-        cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb430nf_n(%esp)
-        movl %ebx,nb430nf_nn1(%esp)
-        subl %eax,%ebx                          ## calc number of outer lists
-        movl %eax,%esi                          ## copy n to esi
-        jg  _nb_kernel430nf_ia32_sse2.nb430nf_outerstart
-        jmp _nb_kernel430nf_ia32_sse2.nb430nf_end
-
-_nb_kernel430nf_ia32_sse2.nb430nf_outerstart: 
-        ## ebx contains number of outer iterations
-        addl nb430nf_nouter(%esp),%ebx
-        movl %ebx,nb430nf_nouter(%esp)
-
-_nb_kernel430nf_ia32_sse2.nb430nf_outer: 
-        movl  nb430nf_shift(%ebp),%eax        ## eax = pointer into shift[] 
-        movl  (%eax,%esi,4),%ebx        ## ebx=shift[n] 
-
-        leal  (%ebx,%ebx,2),%ebx    ## ebx=3*is 
-        movl  %ebx,nb430nf_is3(%esp)            ## store is3 
-
-        movl  nb430nf_shiftvec(%ebp),%eax     ## eax = base of shiftvec[] 
-
-        movsd (%eax,%ebx,8),%xmm0
-        movsd 8(%eax,%ebx,8),%xmm1
-        movsd 16(%eax,%ebx,8),%xmm2
-
-        movl  nb430nf_iinr(%ebp),%ecx         ## ecx = pointer into iinr[]
-        movl  (%ecx,%esi,4),%ebx    ## ebx =ii 
-
-        movl  nb430nf_charge(%ebp),%edx
-        movsd (%edx,%ebx,8),%xmm3
-        mulsd nb430nf_facel(%esp),%xmm3
-        shufpd $0,%xmm3,%xmm3
-
-        movl  nb430nf_invsqrta(%ebp),%edx       ## load invsqrta[ii]
-        movsd (%edx,%ebx,8),%xmm4
-        shufpd $0,%xmm4,%xmm4
-
-        movl  nb430nf_type(%ebp),%edx
-        movl  (%edx,%ebx,4),%edx
-        imull nb430nf_ntype(%esp),%edx
-        shll  %edx
-        movl  %edx,nb430nf_ntia(%esp)
-
-        leal  (%ebx,%ebx,2),%ebx        ## ebx = 3*ii=ii3 
-        movl  nb430nf_pos(%ebp),%eax      ## eax = base of pos[]  
-
-        addsd (%eax,%ebx,8),%xmm0
-        addsd 8(%eax,%ebx,8),%xmm1
-        addsd 16(%eax,%ebx,8),%xmm2
-
-        movapd %xmm3,nb430nf_iq(%esp)
-        movapd %xmm4,nb430nf_isai(%esp)
-
-        shufpd $0,%xmm0,%xmm0
-        shufpd $0,%xmm1,%xmm1
-        shufpd $0,%xmm2,%xmm2
-
-        movapd %xmm0,nb430nf_ix(%esp)
-        movapd %xmm1,nb430nf_iy(%esp)
-        movapd %xmm2,nb430nf_iz(%esp)
-
-        movl  %ebx,nb430nf_ii3(%esp)
-
-        ## clear vctot
-        xorpd %xmm4,%xmm4
-        movapd %xmm4,nb430nf_vctot(%esp)
-        movapd %xmm4,nb430nf_Vvdwtot(%esp)
-
-        movl  nb430nf_jindex(%ebp),%eax
-        movl  (%eax,%esi,4),%ecx             ## jindex[n] 
-        movl  4(%eax,%esi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movl  nb430nf_pos(%ebp),%esi
-        movl  nb430nf_faction(%ebp),%edi
-        movl  nb430nf_jjnr(%ebp),%eax
-        shll  $2,%ecx
-        addl  %ecx,%eax
-        movl  %eax,nb430nf_innerjjnr(%esp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $2,%edx
-        addl  nb430nf_ninner(%esp),%ecx
-        movl  %ecx,nb430nf_ninner(%esp)
-        addl  $0,%edx
-        movl  %edx,nb430nf_innerk(%esp)      ## number of innerloop atoms 
-        jge   _nb_kernel430nf_ia32_sse2.nb430nf_unroll_loop
-        jmp   _nb_kernel430nf_ia32_sse2.nb430nf_checksingle
-_nb_kernel430nf_ia32_sse2.nb430nf_unroll_loop: 
-        ## twice unrolled innerloop here 
-        movl  nb430nf_innerjjnr(%esp),%edx     ## pointer to jjnr[k] 
-        movl  (%edx),%eax
-        movl  4(%edx),%ebx
-        addl $8,nb430nf_innerjjnr(%esp)                 ## advance pointer (unrolled 2) 
-
-        ## load isaj
-        movl nb430nf_invsqrta(%ebp),%esi
-        movlpd (%esi,%eax,8),%xmm2
-        movhpd (%esi,%ebx,8),%xmm2
-        mulpd  nb430nf_isai(%esp),%xmm2
-        movapd %xmm2,nb430nf_isaprod(%esp)
-        movapd %xmm2,%xmm1
-        mulpd nb430nf_gbtsc(%esp),%xmm1
-        movapd %xmm1,nb430nf_gbscale(%esp)
-
-        movl nb430nf_charge(%ebp),%esi     ## base of charge[] 
-        movlpd (%esi,%eax,8),%xmm3
-        movhpd (%esi,%ebx,8),%xmm3
-
-        mulpd nb430nf_iq(%esp),%xmm2
-        mulpd  %xmm2,%xmm3
-        movapd %xmm3,nb430nf_qq(%esp)
-
-        movl nb430nf_type(%ebp),%esi
-        movl (%esi,%eax,4),%ecx
-        movl (%esi,%ebx,4),%edx
-        movl nb430nf_vdwparam(%ebp),%esi
-        shll %ecx
-        shll %edx
-        movl nb430nf_ntia(%esp),%edi
-        addl %edi,%ecx
-        addl %edi,%edx
-
-        movlpd (%esi,%ecx,8),%xmm6      ## c6a
-        movlpd (%esi,%edx,8),%xmm7      ## c6b
-        movhpd 8(%esi,%ecx,8),%xmm6     ## c6a c12a 
-        movhpd 8(%esi,%edx,8),%xmm7     ## c6b c12b 
-
-        movapd %xmm6,%xmm4
-        unpcklpd %xmm7,%xmm4
-        unpckhpd %xmm7,%xmm6
-
-        movapd %xmm4,nb430nf_c6(%esp)
-        movapd %xmm6,nb430nf_c12(%esp)
-
-        movl nb430nf_pos(%ebp),%esi             ## base of pos[] 
-
-        leal  (%eax,%eax,2),%eax     ## replace jnr with j3 
-        leal  (%ebx,%ebx,2),%ebx
-
-        ## move two coordinates to xmm0-xmm2 
-        movlpd (%esi,%eax,8),%xmm0
-        movlpd 8(%esi,%eax,8),%xmm1
-        movlpd 16(%esi,%eax,8),%xmm2
-        movhpd (%esi,%ebx,8),%xmm0
-        movhpd 8(%esi,%ebx,8),%xmm1
-        movhpd 16(%esi,%ebx,8),%xmm2
-
-        movl   nb430nf_faction(%ebp),%edi
-
-        ## move nb430nf_ix-iz to xmm4-xmm6 
-        movapd nb430nf_ix(%esp),%xmm4
-        movapd nb430nf_iy(%esp),%xmm5
-        movapd nb430nf_iz(%esp),%xmm6
-
-        ## calc dr 
-        subpd %xmm0,%xmm4
-        subpd %xmm1,%xmm5
-        subpd %xmm2,%xmm6
-
-        ## square it 
-        mulpd %xmm4,%xmm4
-        mulpd %xmm5,%xmm5
-        mulpd %xmm6,%xmm6
-        addpd %xmm5,%xmm4
-        addpd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        cvtpd2ps %xmm4,%xmm5
-        rsqrtps %xmm5,%xmm5
-        cvtps2pd %xmm5,%xmm2    ## lu in low xmm2 
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulpd %xmm2,%xmm2       ## lu*lu 
-        movapd nb430nf_three(%esp),%xmm1
-        mulpd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb430nf_half(%esp),%xmm0
-        subpd %xmm2,%xmm1       ## 30-rsq*lu*lu 
-        mulpd %xmm5,%xmm1
-        mulpd %xmm0,%xmm1       ## xmm0=iter1 of rinv (new lu) 
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulpd %xmm1,%xmm1       ## lu*lu 
-        movapd nb430nf_three(%esp),%xmm2
-        mulpd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb430nf_half(%esp),%xmm0
-        subpd %xmm1,%xmm2       ## 30-rsq*lu*lu 
-        mulpd %xmm5,%xmm2
-        mulpd %xmm2,%xmm0       ## xmm0=iter2 of rinv 
-        mulpd %xmm0,%xmm4       ## xmm4=r 
-        movapd %xmm4,nb430nf_r(%esp)
-        mulpd nb430nf_gbscale(%esp),%xmm4
-
-        cvttpd2pi %xmm4,%mm6    ## mm6 = lu idx 
-        cvtpi2pd %mm6,%xmm5
-        subpd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-        movapd %xmm1,%xmm2
-        mulpd  %xmm2,%xmm2      ## xmm2=eps2 
-
-        pslld $2,%mm6           ## idx *= 4 
-
-        movl nb430nf_GBtab(%ebp),%esi
-        movd %mm6,%ecx
-        psrlq $32,%mm6
-        movd %mm6,%edx          ## indices in eax/ebx 
-
-        ## Coulomb 
-        movapd (%esi,%ecx,8),%xmm4      ## Y1 F1        
-        movapd (%esi,%edx,8),%xmm3      ## Y2 F2 
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 Y2 
-        unpckhpd %xmm3,%xmm5    ## F1 F2 
-
-        movapd 16(%esi,%ecx,8),%xmm6    ## G1 H1        
-        movapd 16(%esi,%edx,8),%xmm3    ## G2 H2 
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 G2 
-        unpckhpd %xmm3,%xmm7    ## H1 H2 
-        ## coulomb table ready, in xmm4-xmm7            
-        mulpd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulpd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addpd  %xmm6,%xmm5
-        addpd  %xmm7,%xmm5      ## xmm5=Fp      
-        movapd nb430nf_qq(%esp),%xmm3
-        mulpd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addpd  %xmm4,%xmm5 ## xmm5=VV 
-        mulpd  %xmm3,%xmm5 ## vcoul=qq*VV  
-        addpd  nb430nf_vctot(%esp),%xmm5
-        movapd %xmm5,nb430nf_vctot(%esp)
-
-        movapd nb430nf_r(%esp),%xmm4
-        mulpd  nb430nf_tsc(%esp),%xmm4
-        cvttpd2pi %xmm4,%mm6    ## mm6 = lu idx 
-        cvtpi2pd %mm6,%xmm5
-        subpd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-        movapd %xmm1,%xmm2
-        mulpd  %xmm2,%xmm2      ## xmm2=eps2 
-
-        pslld $3,%mm6           ## idx *= 8
-
-        movl nb430nf_VFtab(%ebp),%esi
-
-        movd %mm6,%ecx
-        psrlq $32,%mm6
-        movd %mm6,%edx          ## indices in eax/ebx 
-
-        ## Dispersion 
-        movapd (%esi,%ecx,8),%xmm4      ## Y1 F1        
-        movapd (%esi,%edx,8),%xmm3      ## Y2 F2 
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 Y2 
-        unpckhpd %xmm3,%xmm5    ## F1 F2 
-
-        movapd 16(%esi,%ecx,8),%xmm6    ## G1 H1        
-        movapd 16(%esi,%edx,8),%xmm3    ## G2 H2 
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 G2 
-        unpckhpd %xmm3,%xmm7    ## H1 H2 
-        ## Dispersion table ready, in xmm4-xmm7                 
-        mulpd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulpd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addpd  %xmm6,%xmm5
-        addpd  %xmm7,%xmm5      ## xmm5=Fp      
-        mulpd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addpd  %xmm4,%xmm5 ## xmm5=VV 
-
-        mulpd  nb430nf_c6(%esp),%xmm5    ## Vvdw6
-        addpd  nb430nf_Vvdwtot(%esp),%xmm5
-        movapd %xmm5,nb430nf_Vvdwtot(%esp)
-
-        ## Repulsion 
-        movapd 32(%esi,%ecx,8),%xmm4    ## Y1 F1        
-        movapd 32(%esi,%edx,8),%xmm3    ## Y2 F2 
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 Y2 
-        unpckhpd %xmm3,%xmm5    ## F1 F2 
-
-        movapd 48(%esi,%ecx,8),%xmm6    ## G1 H1        
-        movapd 48(%esi,%edx,8),%xmm3    ## G2 H2 
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 G2 
-        unpckhpd %xmm3,%xmm7    ## H1 H2 
-        ## Dispersion table ready, in xmm4-xmm7                 
-        mulpd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulpd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addpd  %xmm6,%xmm5
-        addpd  %xmm7,%xmm5      ## xmm5=Fp      
-        mulpd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addpd  %xmm4,%xmm5 ## xmm5=VV 
-
-        mulpd  nb430nf_c12(%esp),%xmm5   ## Vvdw12 
-        addpd  nb430nf_Vvdwtot(%esp),%xmm5
-        movapd %xmm5,nb430nf_Vvdwtot(%esp)
-        xorpd  %xmm4,%xmm4
-
-        ## should we do one more iteration? 
-        subl $2,nb430nf_innerk(%esp)
-        jl    _nb_kernel430nf_ia32_sse2.nb430nf_checksingle
-        jmp   _nb_kernel430nf_ia32_sse2.nb430nf_unroll_loop
-_nb_kernel430nf_ia32_sse2.nb430nf_checksingle: 
-        movl  nb430nf_innerk(%esp),%edx
-        andl  $1,%edx
-        jnz    _nb_kernel430nf_ia32_sse2.nb430nf_dosingle
-        jmp    _nb_kernel430nf_ia32_sse2.nb430nf_updateouterdata
-_nb_kernel430nf_ia32_sse2.nb430nf_dosingle: 
-        movl nb430nf_charge(%ebp),%esi
-        movl nb430nf_invsqrta(%ebp),%edx
-        movl nb430nf_pos(%ebp),%edi
-        movl  nb430nf_innerjjnr(%esp),%ecx
-        movl  (%ecx),%eax
-
-        xorpd  %xmm6,%xmm6
-        movapd %xmm6,%xmm7
-        movsd  (%edx,%eax,8),%xmm7
-        movlpd (%esi,%eax,8),%xmm6      ## xmm6(0) has the charge
-        mulsd  nb430nf_isai(%esp),%xmm7
-        movapd %xmm7,nb430nf_isaprod(%esp)
-        movapd %xmm7,%xmm1
-        mulpd nb430nf_gbtsc(%esp),%xmm1
-        movapd %xmm1,nb430nf_gbscale(%esp)
-
-        mulsd  nb430nf_iq(%esp),%xmm7
-        mulsd  %xmm7,%xmm6
-        movapd %xmm6,nb430nf_qq(%esp)
-
-        movl nb430nf_type(%ebp),%esi
-        movl (%esi,%eax,4),%edx
-        movl nb430nf_vdwparam(%ebp),%esi
-        shll %edx
-        movl nb430nf_ntia(%esp),%edi
-        addl %edi,%edx
-
-        movlpd (%esi,%edx,8),%xmm6      ## c6a
-        movhpd 8(%esi,%edx,8),%xmm6     ## c6a c12a 
-
-        xorpd %xmm7,%xmm7
-        movapd %xmm6,%xmm4
-        unpcklpd %xmm7,%xmm4
-        unpckhpd %xmm7,%xmm6
-
-        movapd %xmm4,nb430nf_c6(%esp)
-        movapd %xmm6,nb430nf_c12(%esp)
-
-        movl nb430nf_pos(%ebp),%esi             ## base of pos[] 
-
-        leal  (%eax,%eax,2),%eax     ## replace jnr with j3 
-
-        ## move two coordinates to xmm0-xmm2 
-        movlpd (%esi,%eax,8),%xmm0
-        movlpd 8(%esi,%eax,8),%xmm1
-        movlpd 16(%esi,%eax,8),%xmm2
-
-        movl   nb430nf_faction(%ebp),%edi
-
-        ## move nb430nf_ix-iz to xmm4-xmm6 
-        movapd nb430nf_ix(%esp),%xmm4
-        movapd nb430nf_iy(%esp),%xmm5
-        movapd nb430nf_iz(%esp),%xmm6
-
-        ## calc dr 
-        subsd %xmm0,%xmm4
-        subsd %xmm1,%xmm5
-        subsd %xmm2,%xmm6
-
-        ## square it 
-        mulsd %xmm4,%xmm4
-        mulsd %xmm5,%xmm5
-        mulsd %xmm6,%xmm6
-        addsd %xmm5,%xmm4
-        addsd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        cvtsd2ss %xmm4,%xmm5
-        rsqrtss %xmm5,%xmm5
-        cvtss2sd %xmm5,%xmm2    ## lu in low xmm2 
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulsd %xmm2,%xmm2       ## lu*lu 
-        movapd nb430nf_three(%esp),%xmm1
-        mulsd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb430nf_half(%esp),%xmm0
-        subsd %xmm2,%xmm1       ## 30-rsq*lu*lu 
-        mulsd %xmm5,%xmm1
-        mulsd %xmm0,%xmm1       ## xmm0=iter1 of rinv (new lu) 
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulsd %xmm1,%xmm1       ## lu*lu 
-        movapd nb430nf_three(%esp),%xmm2
-        mulsd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb430nf_half(%esp),%xmm0
-        subsd %xmm1,%xmm2       ## 30-rsq*lu*lu 
-        mulsd %xmm5,%xmm2
-        mulsd %xmm2,%xmm0       ## xmm0=iter2 of rinv (new lu) 
-        mulsd %xmm0,%xmm4       ## xmm4=r 
-        movsd %xmm4,nb430nf_r(%esp)
-        mulsd nb430nf_gbscale(%esp),%xmm4
-
-        cvttsd2si %xmm4,%edx    ## mm6 = lu idx 
-        cvtsi2sd %edx,%xmm5
-        subsd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-        movapd %xmm1,%xmm2
-        mulsd  %xmm2,%xmm2      ## xmm2=eps2 
-
-        shll $2,%edx            ## idx *= 4 
-        movl nb430nf_GBtab(%ebp),%esi
-
-        ## Coulomb 
-        movapd (%esi,%edx,8),%xmm4      ## Y1 F1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 
-        unpckhpd %xmm3,%xmm5    ## F1 
-
-        movapd 16(%esi,%edx,8),%xmm6    ## G1 H1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 
-        unpckhpd %xmm3,%xmm7    ## H1 
-        ## coulomb table ready, in xmm4-xmm7            
-        mulsd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulsd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addsd  %xmm6,%xmm5
-        addsd  %xmm7,%xmm5      ## xmm5=Fp      
-        movapd nb430nf_qq(%esp),%xmm3
-        mulsd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addsd  %xmm4,%xmm5 ## xmm5=VV 
-        mulsd  %xmm3,%xmm5 ## vcoul=qq*VV  
-        addsd  nb430nf_vctot(%esp),%xmm5
-        movsd %xmm5,nb430nf_vctot(%esp)
-
-        movsd nb430nf_r(%esp),%xmm4
-        mulsd  nb430nf_tsc(%esp),%xmm4
-        cvttsd2si %xmm4,%edx    ## mm6 = lu idx 
-        cvtsi2sd %edx,%xmm5
-        subsd %xmm5,%xmm4
-        movsd %xmm4,%xmm1       ## xmm1=eps 
-        movsd %xmm1,%xmm2
-        mulsd  %xmm2,%xmm2      ## xmm2=eps2
-
-        shll $3,%edx
-
-        movl nb430nf_VFtab(%ebp),%esi
-
-        ## Dispersion 
-        movapd (%esi,%edx,8),%xmm4      ## Y1 F1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 
-        unpckhpd %xmm3,%xmm5    ## F1 
-
-        movapd 16(%esi,%edx,8),%xmm6    ## G1 H1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 
-        unpckhpd %xmm3,%xmm7    ## H1 
-        ## Dispersion table ready, in xmm4-xmm7                 
-        mulsd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulsd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addsd  %xmm6,%xmm5
-        addsd  %xmm7,%xmm5      ## xmm5=Fp
-        mulsd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addsd  %xmm4,%xmm5 ## xmm5=VV 
-
-        mulsd  nb430nf_c6(%esp),%xmm5    ## Vvdw6
-        addsd  nb430nf_Vvdwtot(%esp),%xmm5
-        movlpd %xmm5,nb430nf_Vvdwtot(%esp)
-
-        ## Repulsion 
-        movapd 32(%esi,%edx,8),%xmm4    ## Y1 F1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 
-        unpckhpd %xmm3,%xmm5    ## F1 
-
-        movapd 48(%esi,%edx,8),%xmm6    ## G1 H1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 
-        unpckhpd %xmm3,%xmm7    ## H1 
-        ## Dispersion table ready, in xmm4-xmm7                 
-        mulsd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulsd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addsd  %xmm6,%xmm5
-        addsd  %xmm7,%xmm5      ## xmm5=Fp      
-        mulsd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addsd  %xmm4,%xmm5 ## xmm5=VV 
-        mulsd  nb430nf_c12(%esp),%xmm5   ## Vvdw12 
-        addsd  nb430nf_Vvdwtot(%esp),%xmm5
-        movlpd %xmm5,nb430nf_Vvdwtot(%esp)
-_nb_kernel430nf_ia32_sse2.nb430nf_updateouterdata: 
-        ## get n from stack
-        movl nb430nf_n(%esp),%esi
-        ## get group index for i particle 
-        movl  nb430nf_gid(%ebp),%edx            ## base of gid[]
-        movl  (%edx,%esi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total potential energy and update it 
-        movapd nb430nf_vctot(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now 
-
-        ## add earlier value from mem 
-        movl  nb430nf_Vc(%ebp),%eax
-        addsd (%eax,%edx,8),%xmm7
-        ## move back to mem 
-        movsd %xmm7,(%eax,%edx,8)
-
-        ## accumulate total lj energy and update it 
-        movapd nb430nf_Vvdwtot(%esp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now 
-
-        ## add earlier value from mem 
-        movl  nb430nf_Vvdw(%ebp),%eax
-        addsd (%eax,%edx,8),%xmm7
-        ## move back to mem 
-        movsd %xmm7,(%eax,%edx,8)
-
-        ## finish if last 
-        movl nb430nf_nn1(%esp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx
-        jz _nb_kernel430nf_ia32_sse2.nb430nf_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb430nf_n(%esp)
-        jmp _nb_kernel430nf_ia32_sse2.nb430nf_outer
-_nb_kernel430nf_ia32_sse2.nb430nf_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb430nf_nri(%esp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx
-        jz _nb_kernel430nf_ia32_sse2.nb430nf_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel430nf_ia32_sse2.nb430nf_threadloop
-_nb_kernel430nf_ia32_sse2.nb430nf_end: 
-        emms
-
-        movl nb430nf_nouter(%esp),%eax
-        movl nb430nf_ninner(%esp),%ebx
-        movl nb430nf_outeriter(%ebp),%ecx
-        movl nb430nf_inneriter(%ebp),%edx
-        movl %eax,(%ecx)
-        movl %ebx,(%edx)
-
-        movl nb430nf_salign(%esp),%eax
-        addl %eax,%esp
-        addl $328,%esp
-        popl %edi
-        popl %esi
-        popl %edx
-        popl %ecx
-        popl %ebx
-        popl %eax
-        leave
-        ret
-
-
diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/Makefile.am b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/Makefile.am
index 04b13b177a..c5704df8e3 100644
--- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/Makefile.am
+++ b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/Makefile.am
@@ -62,29 +62,29 @@ libnb_kernel_x86_64_sse_la_SOURCES = \
 
 
 EXTRA_DIST = \
-	nb_kernel010_x86_64_sse.intel_syntax.s	nb_kernel030_x86_64_sse.intel_syntax.s	\
-	nb_kernel100_x86_64_sse.intel_syntax.s	nb_kernel101_x86_64_sse.intel_syntax.s	\
-	nb_kernel102_x86_64_sse.intel_syntax.s	nb_kernel103_x86_64_sse.intel_syntax.s	\
-	nb_kernel104_x86_64_sse.intel_syntax.s	nb_kernel110_x86_64_sse.intel_syntax.s	\
-	nb_kernel111_x86_64_sse.intel_syntax.s	nb_kernel112_x86_64_sse.intel_syntax.s	\
-	nb_kernel113_x86_64_sse.intel_syntax.s	nb_kernel114_x86_64_sse.intel_syntax.s	\
-	nb_kernel130_x86_64_sse.intel_syntax.s	nb_kernel131_x86_64_sse.intel_syntax.s	\
-	nb_kernel132_x86_64_sse.intel_syntax.s	nb_kernel133_x86_64_sse.intel_syntax.s	\
-	nb_kernel134_x86_64_sse.intel_syntax.s	nb_kernel200_x86_64_sse.intel_syntax.s	\
-	nb_kernel201_x86_64_sse.intel_syntax.s	nb_kernel202_x86_64_sse.intel_syntax.s	\
-	nb_kernel203_x86_64_sse.intel_syntax.s	nb_kernel204_x86_64_sse.intel_syntax.s	\
-	nb_kernel210_x86_64_sse.intel_syntax.s	nb_kernel211_x86_64_sse.intel_syntax.s	\
-	nb_kernel212_x86_64_sse.intel_syntax.s	nb_kernel213_x86_64_sse.intel_syntax.s	\
-	nb_kernel214_x86_64_sse.intel_syntax.s	nb_kernel230_x86_64_sse.intel_syntax.s	\
-	nb_kernel231_x86_64_sse.intel_syntax.s	nb_kernel232_x86_64_sse.intel_syntax.s	\
-	nb_kernel233_x86_64_sse.intel_syntax.s	nb_kernel234_x86_64_sse.intel_syntax.s	\
-	nb_kernel300_x86_64_sse.intel_syntax.s	nb_kernel301_x86_64_sse.intel_syntax.s	\
-	nb_kernel302_x86_64_sse.intel_syntax.s	nb_kernel303_x86_64_sse.intel_syntax.s	\
-	nb_kernel304_x86_64_sse.intel_syntax.s	nb_kernel310_x86_64_sse.intel_syntax.s	\
-	nb_kernel311_x86_64_sse.intel_syntax.s	nb_kernel312_x86_64_sse.intel_syntax.s	\
-	nb_kernel313_x86_64_sse.intel_syntax.s	nb_kernel314_x86_64_sse.intel_syntax.s	\
-	nb_kernel330_x86_64_sse.intel_syntax.s	nb_kernel331_x86_64_sse.intel_syntax.s	\
-	nb_kernel332_x86_64_sse.intel_syntax.s	nb_kernel333_x86_64_sse.intel_syntax.s	\
-	nb_kernel334_x86_64_sse.intel_syntax.s	nb_kernel400_x86_64_sse.intel_syntax.s  \
-	nb_kernel410_x86_64_sse.intel_syntax.s  nb_kernel430_x86_64_sse.intel_syntax.s  \
-	nb_kernel_x86_64_sse_test_asm.intel_syntax.s
+	nb_kernel010_x86_64_sse_intel_syntax.s	nb_kernel030_x86_64_sse_intel_syntax.s	\
+	nb_kernel100_x86_64_sse_intel_syntax.s	nb_kernel101_x86_64_sse_intel_syntax.s	\
+	nb_kernel102_x86_64_sse_intel_syntax.s	nb_kernel103_x86_64_sse_intel_syntax.s	\
+	nb_kernel104_x86_64_sse_intel_syntax.s	nb_kernel110_x86_64_sse_intel_syntax.s	\
+	nb_kernel111_x86_64_sse_intel_syntax.s	nb_kernel112_x86_64_sse_intel_syntax.s	\
+	nb_kernel113_x86_64_sse_intel_syntax.s	nb_kernel114_x86_64_sse_intel_syntax.s	\
+	nb_kernel130_x86_64_sse_intel_syntax.s	nb_kernel131_x86_64_sse_intel_syntax.s	\
+	nb_kernel132_x86_64_sse_intel_syntax.s	nb_kernel133_x86_64_sse_intel_syntax.s	\
+	nb_kernel134_x86_64_sse_intel_syntax.s	nb_kernel200_x86_64_sse_intel_syntax.s	\
+	nb_kernel201_x86_64_sse_intel_syntax.s	nb_kernel202_x86_64_sse_intel_syntax.s	\
+	nb_kernel203_x86_64_sse_intel_syntax.s	nb_kernel204_x86_64_sse_intel_syntax.s	\
+	nb_kernel210_x86_64_sse_intel_syntax.s	nb_kernel211_x86_64_sse_intel_syntax.s	\
+	nb_kernel212_x86_64_sse_intel_syntax.s	nb_kernel213_x86_64_sse_intel_syntax.s	\
+	nb_kernel214_x86_64_sse_intel_syntax.s	nb_kernel230_x86_64_sse_intel_syntax.s	\
+	nb_kernel231_x86_64_sse_intel_syntax.s	nb_kernel232_x86_64_sse_intel_syntax.s	\
+	nb_kernel233_x86_64_sse_intel_syntax.s	nb_kernel234_x86_64_sse_intel_syntax.s	\
+	nb_kernel300_x86_64_sse_intel_syntax.s	nb_kernel301_x86_64_sse_intel_syntax.s	\
+	nb_kernel302_x86_64_sse_intel_syntax.s	nb_kernel303_x86_64_sse_intel_syntax.s	\
+	nb_kernel304_x86_64_sse_intel_syntax.s	nb_kernel310_x86_64_sse_intel_syntax.s	\
+	nb_kernel311_x86_64_sse_intel_syntax.s	nb_kernel312_x86_64_sse_intel_syntax.s	\
+	nb_kernel313_x86_64_sse_intel_syntax.s	nb_kernel314_x86_64_sse_intel_syntax.s	\
+	nb_kernel330_x86_64_sse_intel_syntax.s	nb_kernel331_x86_64_sse_intel_syntax.s	\
+	nb_kernel332_x86_64_sse_intel_syntax.s	nb_kernel333_x86_64_sse_intel_syntax.s	\
+	nb_kernel334_x86_64_sse_intel_syntax.s	nb_kernel400_x86_64_sse_intel_syntax.s  \
+	nb_kernel410_x86_64_sse_intel_syntax.s  nb_kernel430_x86_64_sse_intel_syntax.s  \
+	nb_kernel_x86_64_sse_test_asm_intel_syntax.s
diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel400_x86_64_sse.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel400_x86_64_sse.intel_syntax.s
deleted file mode 100644
index 99680cf17e..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel400_x86_64_sse.intel_syntax.s
+++ /dev/null
@@ -1,1662 +0,0 @@
-;#
-;#
-;# Gromacs 4.0                         Copyright (c) 1991-2003 
-;# David van der Spoel, Erik Lindahl
-;#
-;# This program is free software; you can redistribute it and/or
-;# modify it under the terms of the GNU General Public License
-;# as published by the Free Software Foundation; either version 2
-;# of the License, or (at your option) any later version.
-;#
-;# To help us fund GROMACS development, we humbly ask that you cite
-;# the research papers on the package. Check out http://www.gromacs.org
-;# 
-;# And Hey:
-;# Gnomes, ROck Monsters And Chili Sauce
-;#
-
-;# These files require GNU binutils 2.10 or later, since we
-;# use intel syntax for portability, or a recent version 
-;# of NASM that understands Extended 3DNow and SSE2 instructions.
-;# (NASM is normally only used with MS Visual C++).
-;# Since NASM and gnu as disagree on some definitions and use 
-;# completely different preprocessing options I have to introduce a
-;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
-;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
-;# reason why all comments need both symbols...
-;# The source is written for GNU as, with intel syntax. When you use
-;# NASM we redefine a couple of things. The false if-statement around 
-;# the following code is seen by GNU as, but NASM doesn't see it, so 
-;# the code inside is read by NASM but not gcc.
-
-; .if 0    # block below only read by NASM
-%define .section	section
-%define .long		dd
-%define .align		align
-%define .globl		global
-;# NASM only wants 'dword', not 'dword ptr'.
-%define ptr
-%macro .equiv                  2
-   %1 equ %2
-%endmacro
-; .endif                   # End of NASM-specific block
-; .intel_syntax noprefix   # Line only read by gnu as
-
-
-	
-
-.globl nb_kernel400_x86_64_sse
-.globl _nb_kernel400_x86_64_sse
-nb_kernel400_x86_64_sse:	
-_nb_kernel400_x86_64_sse:	
-;#	rbp offsets of the stack-passed arguments (saved rbp and return address take the first 16 bytes)
-.equiv          nb400_fshift,           16
-.equiv          nb400_gid,              24
-.equiv          nb400_pos,              32
-.equiv          nb400_faction,          40
-.equiv          nb400_charge,           48
-.equiv          nb400_p_facel,          56
-.equiv          nb400_argkrf,           64
-.equiv          nb400_argcrf,           72
-.equiv          nb400_Vc,               80
-.equiv          nb400_type,             88
-.equiv          nb400_p_ntype,          96
-.equiv          nb400_vdwparam,         104
-.equiv          nb400_Vvdw,             112
-.equiv          nb400_p_tabscale,       120
-.equiv          nb400_VFtab,            128
-.equiv          nb400_invsqrta,         136
-.equiv          nb400_dvda,             144
-.equiv          nb400_p_gbtabscale,     152
-.equiv          nb400_GBtab,            160
-.equiv          nb400_p_nthreads,       168
-.equiv          nb400_count,            176
-.equiv          nb400_mtx,              184
-.equiv          nb400_outeriter,        192
-.equiv          nb400_inneriter,        200
-.equiv          nb400_work,             208
-	;# rsp offsets of the local variables  
-	;# the 16-byte slots come first and are accessed with movaps, which needs rsp 16-byte aligned 
-.equiv          nb400_ix,               0
-.equiv          nb400_iy,               16
-.equiv          nb400_iz,               32
-.equiv          nb400_iq,               48
-.equiv          nb400_dx,               64
-.equiv          nb400_dy,               80
-.equiv          nb400_dz,               96
-.equiv          nb400_two,              112
-.equiv          nb400_gbtsc,            128
-.equiv          nb400_qq,               144
-.equiv          nb400_r,                160
-.equiv          nb400_vctot,            176
-.equiv          nb400_fix,              192
-.equiv          nb400_fiy,              208
-.equiv          nb400_fiz,              224
-.equiv          nb400_half,             240
-.equiv          nb400_three,            256
-.equiv          nb400_isai,             272
-.equiv          nb400_isaprod,          288
-.equiv          nb400_dvdasum,          304
-.equiv          nb400_gbscale,          320
-.equiv          nb400_nri,              336
-.equiv          nb400_iinr,             344
-.equiv          nb400_jindex,           352
-.equiv          nb400_jjnr,             360
-.equiv          nb400_shift,            368
-.equiv          nb400_shiftvec,         376
-.equiv          nb400_facel,            384
-.equiv          nb400_innerjjnr,        392
-.equiv          nb400_is3,              400
-.equiv          nb400_ii3,              404
-.equiv          nb400_ii,               408
-.equiv          nb400_innerk,           412
-.equiv          nb400_n,                416
-.equiv          nb400_nn1,              420
-.equiv          nb400_nouter,           424
-.equiv          nb400_ninner,           428
-.equiv          nb400_jnra,             432
-.equiv          nb400_jnrb,             436
-.equiv          nb400_jnrc,             440
-.equiv          nb400_jnrd,             444
-
-	push rbp
-	mov  rbp, rsp
-	push rbx
-
-	
-	emms
-
-        push r12
-        push r13
-        push r14
-        push r15
-
-	sub rsp, 456		;# local variable stack space (n*16+8), here 28*16+8
-
-	;# zero 32-bit iteration counters
-	mov eax, 0
-	mov [rsp + nb400_nouter], eax
-	mov [rsp + nb400_ninner], eax
-
-	mov edi, [rdi]		;# rdi points to nri
-	mov [rsp + nb400_nri], edi
-	mov [rsp + nb400_iinr], rsi
-	mov [rsp + nb400_jindex], rdx
-	mov [rsp + nb400_jjnr], rcx
-	mov [rsp + nb400_shift], r8
-	mov [rsp + nb400_shiftvec], r9
-	mov rsi, [rbp + nb400_p_facel]
-	movss xmm0, [rsi]
-	movss [rsp + nb400_facel], xmm0
-
-	mov rbx, [rbp + nb400_p_gbtabscale]
-	movss xmm4, [rbx]
-	shufps xmm4, xmm4, 0	;# splat gbtabscale to all elements
-	movaps [rsp + nb400_gbtsc], xmm4
-
-	;# create constant floating-point factors on stack
-	mov eax, 0x3f000000     ;# half in IEEE (hex)
-	mov [rsp + nb400_half], eax
-	movss xmm1, [rsp + nb400_half]
-	shufps xmm1, xmm1, 0    ;# splat to all elements
-	movaps xmm2, xmm1       
-	addps  xmm2, xmm2	;# one
-	movaps xmm3, xmm2
-	addps  xmm2, xmm2	;# two
-	addps  xmm3, xmm2	;# three
-	movaps [rsp + nb400_half],  xmm1
-	movaps [rsp + nb400_two],  xmm2
-	movaps [rsp + nb400_three],  xmm3
-
-.nb400_threadloop:
-        mov   rsi, [rbp + nb400_count]          ;# 64-bit pointer to sync counter
-        mov   eax, [rsi]
-.nb400_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+1
-        lock
-        cmpxchg [rsi], ebx                      ;# write nn1 to *counter (full rsi address),
-                                                ;# if it has not changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb400_spinlock
-
-        ;# clamp the claimed upper bound to the list length
-        mov ecx, [rsp + nb400_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [rsp + nb400_n], eax
-        mov [rsp + nb400_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists, flags feed the jg below
-	mov esi, eax				;# copy n to esi (mov leaves the flags alone)
-        jg  .nb400_outerstart
-        jmp .nb400_end
-
-.nb400_outerstart:
-	;# ebx holds nn1-nn0, the outer iterations of this work unit: add them to the nouter total
-	add ebx, [rsp + nb400_nouter]
-	mov [rsp + nb400_nouter], ebx
-
-.nb400_outer:
-	mov   rax, [rsp + nb400_shift]      ;# rax = pointer into shift[] 
-	mov   ebx, [rax + rsi*4]		;# ebx=shift[n] 
-	
-	lea   rbx, [rbx + rbx*2]    ;# rbx=3*is 
-	mov   [rsp + nb400_is3],ebx    	;# store is3 
-
-	mov   rax, [rsp + nb400_shiftvec]   ;# rax = base of shiftvec[] 
-
-	movss xmm0, [rax + rbx*4]
-	movss xmm1, [rax + rbx*4 + 4]
-	movss xmm2, [rax + rbx*4 + 8] 
-
-	mov   rcx, [rsp + nb400_iinr]       ;# rcx = pointer into iinr[] 	
-	mov   ebx, [rcx + rsi*4]	    ;# ebx =ii 
-	mov   [rsp + nb400_ii], ebx
-	
-	mov   rdx, [rbp + nb400_charge]
-	movss xmm3, [rdx + rbx*4]	
-	mulss xmm3, [rsp + nb400_facel]
-	shufps xmm3, xmm3, 0
-
-
-	mov   rdx, [rbp + nb400_invsqrta]	;# load invsqrta[ii]
-	movss xmm4, [rdx + rbx*4]
-	shufps xmm4, xmm4, 0
-
-	lea   rbx, [rbx + rbx*2]	;# rbx = 3*ii=ii3 
-	mov   rax, [rbp + nb400_pos]    ;# rax = base of pos[]  
-
-	addss xmm0, [rax + rbx*4]
-	addss xmm1, [rax + rbx*4 + 4]
-	addss xmm2, [rax + rbx*4 + 8]
-
-	movaps [rsp + nb400_iq], xmm3
-	movaps [rsp + nb400_isai], xmm4
-	
-	shufps xmm0, xmm0, 0
-	shufps xmm1, xmm1, 0
-	shufps xmm2, xmm2, 0
-
-	movaps [rsp + nb400_ix], xmm0
-	movaps [rsp + nb400_iy], xmm1
-	movaps [rsp + nb400_iz], xmm2
-
-	mov   [rsp + nb400_ii3], ebx
-	
-	;# clear dvdasum on the stack and the accumulators xmm12-xmm15 
-	xorps xmm4, xmm4
-	movaps [rsp + nb400_dvdasum], xmm4
-	movaps xmm12, xmm4
-	movaps xmm13, xmm4
-	movaps xmm14, xmm4
-	movaps xmm15, xmm4
-	
-	mov   rax, [rsp + nb400_jindex]
-	mov   ecx, [rax + rsi*4]	     ;# jindex[n] 
-	mov   edx, [rax + rsi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   rsi, [rbp + nb400_pos]
-	mov   rdi, [rbp + nb400_faction]	
-	mov   rax, [rsp + nb400_jjnr]
-	shl   ecx, 2
-	add   rax, rcx
-	mov   [rsp + nb400_innerjjnr], rax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  4
-	add   ecx, [rsp + nb400_ninner]
-	mov   [rsp + nb400_ninner], ecx
-	add   edx, 0		;# set the flags from edx again for the jge below
-	mov   [rsp + nb400_innerk], edx    ;# innerk = number of innerloop atoms minus 4 
-	jge   .nb400_unroll_loop
-	jmp   .nb400_finish_inner
-.nb400_unroll_loop:	
-	;# inner loop unrolled by four: one pass handles j atoms jnr1-jnr4 
-	mov   rdx, [rsp + nb400_innerjjnr]     ;# pointer to jjnr[k] 
-	mov   eax, [rdx]	
-	mov   ebx, [rdx + 4]              
-	mov   ecx, [rdx + 8]            
-	mov   edx, [rdx + 12]         ;# eax-edx=jnr1-4 
-
-	add qword ptr [rsp + nb400_innerjjnr],  16 ;# advance pointer (unrolled 4) 
-	
-	mov rsi, [rbp + nb400_pos]       ;# base of pos[] 
-	
-	lea   r8, [rax + rax*2]     ;# j3
-	lea   r9, [rbx + rbx*2]	
-	lea   r10, [rcx + rcx*2]    
-	lea   r11, [rdx + rdx*2]	
-
-	;# move four coordinates to xmm0-xmm2 (x in xmm0, y in xmm1, z in xmm2) 	
-	movlps xmm4, [rsi + r8*4]
-	movlps xmm5, [rsi + r10*4]
-	movss xmm2, [rsi + r8*4 + 8]
-	movss xmm6, [rsi + r10*4 + 8]
-
-	movhps xmm4, [rsi + r9*4]
-	movhps xmm5, [rsi + r11*4]
-
-	movss xmm0, [rsi + r9*4 + 8]
-	movss xmm1, [rsi + r11*4 + 8]
-
-	shufps xmm2, xmm0, 0
-	shufps xmm6, xmm1, 0
-	
-	movaps xmm0, xmm4
-	movaps xmm1, xmm4
-
-	shufps xmm2, xmm6, 136  ;# 10001000
-	
-	shufps xmm0, xmm5, 136  ;# 10001000
-	shufps xmm1, xmm5, 221  ;# 11011101		
-
-	;# calc dr 
-	subps xmm0, [rsp + nb400_ix]
-	subps xmm1, [rsp + nb400_iy]
-	subps xmm2, [rsp + nb400_iz]
-
-	;# keep dr in xmm9-xmm11 
-	movaps xmm9, xmm0
-	movaps xmm10, xmm1
-	movaps xmm11, xmm2
-
-	;# square it 
-	mulps xmm0,xmm0
-	mulps xmm1,xmm1
-	mulps xmm2,xmm2
-	addps xmm0, xmm1
-	addps xmm0, xmm2
-    movaps xmm4, xmm0
-	;# rsq in xmm4 
-    
-	;# load isaj
-	mov rsi, [rbp + nb400_invsqrta]
-	movss xmm0, [rsi + rax*4]
-	movss xmm1, [rsi + rcx*4]
-	movss xmm2, [rsi + rbx*4]
-	movss xmm3, [rsi + rdx*4]
-	movaps xmm7, [rsp + nb400_isai]
-	shufps xmm0, xmm2, 0
-    shufps xmm1, xmm3, 0
-	shufps xmm0, xmm1, 136  ;# 10001000 ;# all isaj in xmm0 
-	mulps  xmm7, xmm0
-	
-	movaps [rsp + nb400_isaprod], xmm7	
-	movaps xmm1, xmm7
-	mulps xmm1, [rsp + nb400_gbtsc]
-	movaps [rsp + nb400_gbscale], xmm1
-	
-	mov rsi, [rbp + nb400_charge]    ;# base of charge[] 
-	
-	movss xmm0, [rsi + rax*4]
-	movss xmm1, [rsi + rcx*4]
-	movss xmm2, [rsi + rbx*4]
-	movss xmm3, [rsi + rdx*4]
-
-    mulps xmm7, [rsp + nb400_iq]
-	shufps xmm0, xmm2, 0
-	shufps xmm1, xmm3, 0
-    shufps xmm0, xmm1, 136  ;# 10001000 ;# all charges in xmm0  
-
-	mulps  xmm0, xmm7
-	movaps [rsp + nb400_qq], xmm0
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed lu in xmm5, refined below as half*lu*(three-rsq*lu*lu) 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [rsp + nb400_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [rsp + nb400_half]
-	subps xmm1, xmm5	;# 3.0-rsq*lu*lu 
-	mulps xmm1, xmm2	
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r
-	movaps [rsp + nb400_r], xmm4
-	mulps xmm4, [rsp + nb400_gbscale]
-
-    ;# truncate and convert to integers
-    cvttps2dq xmm5, xmm4
-    
-    ;# convert back to float
-    cvtdq2ps  xmm6, xmm5
-    
-    ;# multiply by 4 (four floats are loaded per table index below)
-    pslld   xmm5, 2
-
-    ;# move to integer registers
-    movhlps xmm7, xmm5
-    movd    r12d, xmm5
-    movd    r14d, xmm7
-    pshufd  xmm5, xmm5, 1
-    pshufd  xmm7, xmm7, 1
-    movd    r13d, xmm5
-    movd    r15d, xmm7
-    
-    ;# calculate eps
-    subps     xmm4, xmm6
-    movaps    xmm1, xmm4 ;#eps
-    
-	mov  rsi, [rbp + nb400_GBtab]
-
-    ;# load table data
-   	movlps xmm5, [rsi + r12*4]
-	movlps xmm7, [rsi + r14*4]
-	movhps xmm5, [rsi + r13*4]
-	movhps xmm7, [rsi + r15*4]
-
-    movaps xmm4, xmm5
-	shufps xmm4, xmm7, 136  ;# 10001000
-	shufps xmm5, xmm7, 221  ;# 11011101
-    
-	movlps xmm7, [rsi + r12*4 + 8]   
-	movlps xmm8, [rsi + r14*4 + 8]
-	movhps xmm7, [rsi + r13*4 + 8]
-	movhps xmm8, [rsi + r15*4 + 8]
-
-    movaps xmm6, xmm7
-    
-	shufps xmm6, xmm8, 136  ;# 10001000
-	shufps xmm7, xmm8, 221  ;# 11011101
-    ;# table data ready in xmm4-xmm7
-
-    mulps  xmm7, xmm1   ;# Heps
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm1	;# Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	addps  xmm7, xmm7	;# two*Heps2 
-	movaps xmm3, [rsp + nb400_qq]
-	addps  xmm7, xmm6
-	addps  xmm7, xmm5 ;# xmm7=FF 
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulps  xmm3, xmm7 ;# fijC=FF*qq 
-	;# at this point xmm5 contains vcoul and xmm3 fijC
-
-	mov rsi, [rbp + nb400_dvda]
-	
-	;# Calculate dVda
-	xorps  xmm7, xmm7
-	mulps xmm3, [rsp + nb400_gbscale]
-	movaps xmm6, xmm3
-	mulps  xmm6, [rsp + nb400_r]
-	addps  xmm6, xmm5
-    
-    ;# increment vctot (sum in xmm12)
-	addps  xmm12, xmm5
-
-	;# xmm6=(vcoul+fijC*gbscale*r), negated into xmm7 and copied back to xmm6
-	subps  xmm7, xmm6
-	movaps xmm6, xmm7
-	
-    ;# update dvdasum
-    addps  xmm7, [rsp + nb400_dvdasum]
-    movaps [rsp + nb400_dvdasum], xmm7 
-
-	;# update j atoms dvdaj
-	movhlps xmm7, xmm6
-	movaps  xmm5, xmm6
-	movaps  xmm4, xmm7
-	shufps  xmm5, xmm5, 0x1
-	shufps  xmm4, xmm4, 0x1
-
-	;# xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4
-	addss  xmm6, [rsi + rax*4]
-	addss  xmm5, [rsi + rbx*4]
-	addss  xmm7, [rsi + rcx*4]
-	addss  xmm4, [rsi + rdx*4]
-	movss  [rsi + rax*4], xmm6
-	movss  [rsi + rbx*4], xmm5
-	movss  [rsi + rcx*4], xmm7
-	movss  [rsi + rdx*4], xmm4
-
-	xorps  xmm4, xmm4	
-	mulps xmm3, xmm0	;# times rinv, still in xmm0
-	subps  xmm4, xmm3	;# xmm4 = minus that product, the factor applied to dr below
-
-	mov rsi, [rbp + nb400_faction]
-	;# the fj's - start by accumulating x & y forces from memory 
-	movlps xmm0, [rsi + r8*4] ;# x1 y1 - -
-	movlps xmm1, [rsi + r10*4] ;# x3 y3 - -
-	movhps xmm0, [rsi + r9*4] ;# x1 y1 x2 y2
-	movhps xmm1, [rsi + r11*4] ;# x3 y3 x4 y4
-
-    mulps  xmm9, xmm4
-    mulps  xmm10, xmm4
-    mulps  xmm11, xmm4
-    
-	;# accumulate i forces
-    addps xmm13, xmm9
-    addps xmm14, xmm10
-    addps xmm15, xmm11
-
-    movaps xmm8, xmm9
-    unpcklps xmm9, xmm10 ;# x1 y1 x2 y2
-    unpckhps xmm8, xmm10 ;# x3 y3 x4 y4
-    
-    ;# update fjx and fjy
-	addps  xmm0, xmm9
-	addps  xmm1, xmm8
-	
-	movlps [rsi + r8*4], xmm0
-	movlps [rsi + r10*4], xmm1
-	movhps [rsi + r9*4], xmm0
-	movhps [rsi + r11*4], xmm1
-    
-    ;# xmm11: fjz1 fjz2 fjz3 fjz4
-    pshufd  xmm10, xmm11, 1  ;# fjz2 - - -
-    movhlps xmm9,  xmm11     ;# fjz3 - - -
-    pshufd  xmm8,  xmm11, 3  ;# fjz4 - - -
-    
-	addss  xmm11, [rsi + r8*4 + 8]
-	addss  xmm10, [rsi + r9*4 + 8]
-	addss  xmm9,  [rsi + r10*4 + 8]
-	addss  xmm8,  [rsi + r11*4 + 8]    
-	movss  [rsi + r8*4 + 8], xmm11
-	movss  [rsi + r9*4 + 8], xmm10
-	movss  [rsi + r10*4 + 8], xmm9
-	movss  [rsi + r11*4 + 8], xmm8
-	
-	;# should we do one more iteration? 
-	sub dword ptr [rsp + nb400_innerk],  4
-	jl    .nb400_finish_inner
-	jmp   .nb400_unroll_loop
-.nb400_finish_inner:
-	;# add back the 4 so innerk is the leftover atom count, then test bit 1 for a remaining pair 
-	add dword ptr [rsp + nb400_innerk],  4
-	mov   edx, [rsp + nb400_innerk]
-	and   edx, 2
-	jnz   .nb400_dopair
-	jmp   .nb400_checksingle
-.nb400_dopair:	
-	mov   rcx, [rsp + nb400_innerjjnr]
-	
-	mov   eax, [rcx]	
-	mov   ebx, [rcx + 4]              
-	add qword ptr [rsp + nb400_innerjjnr],  8
-
-	;# load isaj
-	mov rsi, [rbp + nb400_invsqrta]
-	movss xmm3, [rsi + rax*4]
-	movss xmm6, [rsi + rbx*4]
-    unpcklps xmm3, xmm6
-
-	movaps xmm2, [rsp + nb400_isai]
-	mulps  xmm2, xmm3
-	
-	movaps [rsp + nb400_isaprod], xmm2	
-	movaps xmm1, xmm2
-	mulps xmm1, [rsp + nb400_gbtsc]
-	movaps [rsp + nb400_gbscale], xmm1
-	
-	mov rsi, [rbp + nb400_charge]    ;# base of charge[] 
-    
-    mulps xmm2, [rsp + nb400_iq]
-	movss xmm3, [rsi + rax*4]
-	movss xmm6, [rsi + rbx*4]
-    unpcklps xmm3, xmm6
-    
-	mulps xmm3, xmm2
-	movaps [rsp + nb400_qq], xmm3	
-	
-	mov rsi, [rbp + nb400_pos]       ;# base of pos[] 
-	
-	lea   r8, [rax + rax*2]     ;# j3 
-	lea   r9, [rbx + rbx*2]	
-
-	;# move two coordinates to xmm4-xmm6 (x in xmm4, y in xmm5, z in xmm6) 	
-	movlps xmm4, [rsi + r8*4]	;# x1 y1 - - 
-	movlps xmm5, [rsi + r9*4]	;# x2 y2 - - 
-
-	movss xmm6, [rsi + r8*4 + 8]	;# z1 - - - 
-	movss xmm7, [rsi + r9*4 + 8]	;# z2 - - - 
-
-    unpcklps xmm4, xmm5 ;# x1 x2 y1 y2
-    movhlps  xmm5, xmm4 ;# y1 y2 -  -
-    unpcklps xmm6, xmm7 ;# z1 z2 -  -
-    
-	;# calc dr 
-	subps xmm4, [rsp + nb400_ix]
-	subps xmm5, [rsp + nb400_iy]
-	subps xmm6, [rsp + nb400_iz]
-
-	;# keep dr in xmm9-xmm11 
-	movaps xmm9, xmm4
-	movaps xmm10, xmm5
-	movaps xmm11, xmm6
-
-	;# square it 
-	mulps xmm4,xmm4
-	mulps xmm5,xmm5
-	mulps xmm6,xmm6
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed lu in xmm5, refined below as half*lu*(three-rsq*lu*lu) 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [rsp + nb400_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [rsp + nb400_half]
-	subps xmm1, xmm5	;# 3.0-rsq*lu*lu 
-	mulps xmm1, xmm2	
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r
-	movaps [rsp + nb400_r], xmm4
-	mulps xmm4, [rsp + nb400_gbscale]
-
-    ;# truncate and convert to integers
-    cvttps2dq xmm5, xmm4
-    
-    ;# convert back to float
-    cvtdq2ps  xmm6, xmm5
-    
-    ;# multiply by 4 (four floats are loaded per table index below)
-    pslld   xmm5, 2
-
-    ;# move to integer registers
-    movd    r12d, xmm5
-    pshufd  xmm5, xmm5, 1
-    movd    r13d, xmm5
-    
-    ;# calculate eps
-    subps     xmm4, xmm6
-    movaps    xmm1, xmm4 ;#eps
-    
-	mov  rsi, [rbp + nb400_GBtab]
-
-    ;# load table data
-   	movlps xmm4, [rsi + r12*4]
-	movlps xmm5, [rsi + r13*4]
-    unpcklps xmm4, xmm5
-    movhlps  xmm5, xmm4
-    
-   	movlps xmm6, [rsi + r12*4 + 8]
-	movlps xmm7, [rsi + r13*4 + 8] 
-    unpcklps xmm6, xmm7
-    movhlps  xmm7, xmm6
-    ;# table data ready in xmm4-xmm7
-
-    mulps  xmm7, xmm1   ;# Heps
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm1	;# Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	addps  xmm7, xmm7	;# two*Heps2 
-	movaps xmm3, [rsp + nb400_qq]
-	addps  xmm7, xmm6
-	addps  xmm7, xmm5 ;# xmm7=FF 
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulps  xmm3, xmm7 ;# fijC=FF*qq 
-	;# at this point xmm5 contains vcoul and xmm3 fijC
-
-    ;# zero upper part of vcoul 
-    xorps xmm2, xmm2
-    movlhps xmm5, xmm2
-    
-	mov rsi, [rbp + nb400_dvda]
-	
-	;# Calculate dVda
-	xorps  xmm7, xmm7
-	mulps xmm3, [rsp + nb400_gbscale]
-	movaps xmm6, xmm3
-	mulps  xmm6, [rsp + nb400_r]
-	addps  xmm6, xmm5
-    
-    ;# increment vctot (sum in xmm12)
-	addps  xmm12, xmm5
-
-	;# xmm6=(vcoul+fijC*gbscale*r), negated into xmm7 and copied back to xmm6
-	subps  xmm7, xmm6
-	movaps xmm6, xmm7
-
-    ;# zero upper half of dvda
-    movlhps xmm7, xmm2
-    
-    ;# update dvdasum
-    addps  xmm7, [rsp + nb400_dvdasum]
-    movaps [rsp + nb400_dvdasum], xmm7 
-
-	;# update j atoms dvdaj
-	movaps  xmm5, xmm6
-	shufps  xmm5, xmm5, 0x1
-
-	;# xmm6=dvdaj1 xmm5=dvdaj2 (only two j atoms in this branch)
-	addss  xmm6, [rsi + rax*4]
-	addss  xmm5, [rsi + rbx*4]
-	movss  [rsi + rax*4], xmm6
-	movss  [rsi + rbx*4], xmm5
-
-	xorps  xmm4, xmm4	
-	mulps xmm3, xmm0	;# times rinv, still in xmm0
-	subps  xmm4, xmm3	;# xmm4 = minus that product, the factor applied to dr below
-
-    mulps  xmm9, xmm4
-    mulps  xmm10, xmm4
-    mulps  xmm11, xmm4
-
-    movlhps xmm9, xmm2
-    movlhps xmm10, xmm2
-    movlhps xmm11, xmm2
-    
-	;# accumulate i forces
-    addps xmm13, xmm9
-    addps xmm14, xmm10
-    addps xmm15, xmm11
-
-	mov rsi, [rbp + nb400_faction]
-	;# the fj's - start by accumulating x & y forces from memory 
-	movlps xmm0, [rsi + r8*4] ;# x1 y1 - -
-	movhps xmm0, [rsi + r9*4] ;# x1 y1 x2 y2
-
-    unpcklps xmm9, xmm10  ;# x1 y1 x2 y2
-    addps    xmm0, xmm9
-
-	movlps [rsi + r8*4], xmm0
-	movhps [rsi + r9*4], xmm0
-    
-    ;# z forces
-    pshufd xmm8, xmm11, 1
-    addss  xmm11, [rsi + r8*4 + 8] 
-    addss  xmm8,  [rsi + r9*4 + 8]
-    movss  [rsi + r8*4 + 8], xmm11
-    movss  [rsi + r9*4 + 8], xmm8
-
-.nb400_checksingle:				
-	mov   edx, [rsp + nb400_innerk]
-	and   edx, 1	;# bit 0 set: one last j atom is left
-	jnz    .nb400_dosingle
-	jmp    .nb400_updateouterdata
-.nb400_dosingle:
-	mov   rcx, [rsp + nb400_innerjjnr]
-	mov   eax, [rcx]	
-
-	;# load isaj
-	mov rsi, [rbp + nb400_invsqrta]
-	movss xmm2, [rsi + rax*4]
-	mulss xmm2, [rsp + nb400_isai]	
-	movss [rsp + nb400_isaprod], xmm2	
-	movaps xmm1, xmm2
-	mulss xmm1, [rsp + nb400_gbtsc]
-	movss [rsp + nb400_gbscale], xmm1
-	
-	mov rsi, [rbp + nb400_charge]    ;# base of charge[] 
-
-    mulss xmm2, [rsp + nb400_iq]
-	movss xmm3, [rsi + rax*4]
-	mulss xmm3, xmm2
-	movss [rsp + nb400_qq], xmm3	
-	
-	mov rsi, [rbp + nb400_pos]       ;# base of pos[] 
-	
-	lea   r8, [rax + rax*2]     ;# j3=3*jnr
-
-	;# move the single coordinate to xmm4-xmm6 (x in xmm4, y in xmm5, z in xmm6) 	
-	movss xmm4, [rsi + r8*4]	
-	movss xmm5, [rsi + r8*4 + 4]	
-	movss xmm6, [rsi + r8*4 + 8]
-    
-	;# calc dr 
-	subss xmm4, [rsp + nb400_ix]
-	subss xmm5, [rsp + nb400_iy]
-	subss xmm6, [rsp + nb400_iz]
-
-	;# keep dr in xmm9-xmm11 
-	movaps xmm9, xmm4
-	movaps xmm10, xmm5
-	movaps xmm11, xmm6
-
-	;# square it 
-	mulss xmm4,xmm4
-	mulss xmm5,xmm5
-	mulss xmm6,xmm6
-	addss xmm4, xmm5
-	addss xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtss xmm5, xmm4
-	;# lookup seed lu in xmm5, refined below as half*lu*(three-rsq*lu*lu) 
-	movaps xmm2, xmm5
-	mulss xmm5, xmm5
-	movaps xmm1, [rsp + nb400_three]
-	mulss xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [rsp + nb400_half]
-	subss xmm1, xmm5	;# 3.0-rsq*lu*lu 
-	mulss xmm1, xmm2	
-	mulss xmm0, xmm1	;# xmm0=rinv 
-	mulss xmm4, xmm0	;# xmm4=r
-	movaps [rsp + nb400_r], xmm4
-	mulss xmm4, [rsp + nb400_gbscale]
-
-    ;# truncate and convert to integers
-    cvttss2si r12d, xmm4
-    
-    ;# convert back to float
-    cvtsi2ss  xmm6, r12d
-    
-    ;# multiply by 4 (four floats are loaded per table index below)
-    shl r12d, 2
-
-    ;# calculate eps
-    subss     xmm4, xmm6
-    movaps    xmm1, xmm4 ;#eps
-    
-	mov  rsi, [rbp + nb400_GBtab]
-
-    ;# load table data
-   	movss xmm4, [rsi + r12*4]
-	movss xmm5, [rsi + r12*4 + 4]
-   	movss xmm6, [rsi + r12*4 + 8]
-	movss xmm7, [rsi + r12*4 + 12]
-    ;# table data ready in xmm4-xmm7
-
-    mulss  xmm7, xmm1   ;# Heps
-	mulss  xmm6, xmm1	;# xmm6=Geps 
-	mulss  xmm7, xmm1	;# Heps2 
-	addss  xmm5, xmm6
-	addss  xmm5, xmm7	;# xmm5=Fp 	
-	addss  xmm7, xmm7	;# two*Heps2 
-	movss  xmm3, [rsp + nb400_qq]
-	addss  xmm7, xmm6
-	addss  xmm7, xmm5 ;# xmm7=FF 
-	mulss  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addss  xmm5, xmm4 ;# xmm5=VV 
-	mulss  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulss  xmm3, xmm7 ;# fijC=FF*qq 
-	;# at this point xmm5 contains vcoul and xmm3 fijC
-
-	mov rsi, [rbp + nb400_dvda]
-	
-	;# Calculate dVda
-	xorps  xmm7, xmm7
-	mulss xmm3, [rsp + nb400_gbscale]
-	movaps xmm6, xmm3
-	mulss  xmm6, [rsp + nb400_r]
-	addss  xmm6, xmm5
-    
-    ;# increment vctot (sum in xmm12)
-	addss  xmm12, xmm5
-
-	;# xmm6=(vcoul+fijC*gbscale*r), negated into xmm7 and copied back to xmm6
-	subss  xmm7, xmm6
-	movaps xmm6, xmm7
-
-    ;# update dvdasum
-    addss  xmm7, [rsp + nb400_dvdasum]
-    movss [rsp + nb400_dvdasum], xmm7 
-
-	;# update j atoms dvdaj
-	addss  xmm6, [rsi + rax*4]
-	movss  [rsi + rax*4], xmm6
-
-	xorps  xmm4, xmm4	
-	mulss xmm3, xmm0	;# times rinv, still in xmm0
-	subss  xmm4, xmm3	;# xmm4 = minus that product, the factor applied to dr below
-
-    mulss  xmm9, xmm4
-    mulss  xmm10, xmm4
-    mulss  xmm11, xmm4
-
-	;# accumulate i forces
-    addss xmm13, xmm9
-    addss xmm14, xmm10
-    addss xmm15, xmm11
-
-	mov rsi, [rbp + nb400_faction]
-    ;# add to j forces
-    addss  xmm9,  [rsi + r8*4]
-    addss  xmm10, [rsi + r8*4 + 4]
-    addss  xmm11, [rsi + r8*4 + 8]
-    movss  [rsi + r8*4],     xmm9
-    movss  [rsi + r8*4 + 4], xmm10
-    movss  [rsi + r8*4 + 8], xmm11
-    
-.nb400_updateouterdata:
-	mov   ecx, [rsp + nb400_ii3]
-	mov   rdi, [rbp + nb400_faction]
-	mov   rsi, [rbp + nb400_fshift]
-	mov   edx, [rsp + nb400_is3]
-
-	;# reduce the four lanes of the i force sums in xmm13, xmm14, xmm15
-	movhlps xmm0, xmm13
-	movhlps xmm1, xmm14
-	movhlps xmm2, xmm15
-	addps  xmm0, xmm13
-	addps  xmm1, xmm14
-	addps  xmm2, xmm15 
-    movaps xmm3, xmm0	
-	movaps xmm4, xmm1	
-	movaps xmm5, xmm2	
-	shufps xmm3, xmm3, 1
-	shufps xmm4, xmm4, 1
-	shufps xmm5, xmm5, 1
-	addss  xmm0, xmm3
-	addss  xmm1, xmm4
-	addss  xmm2, xmm5	;# xmm0-xmm2 has single force in pos0 
-
-	;# update i force: the reduced sums are subtracted from faction[ii3] 
-	movss  xmm3, [rdi + rcx*4]
-	movss  xmm4, [rdi + rcx*4 + 4]
-	movss  xmm5, [rdi + rcx*4 + 8]
-	subss  xmm3, xmm0
-	subss  xmm4, xmm1
-	subss  xmm5, xmm2
-	movss  [rdi + rcx*4],     xmm3
-	movss  [rdi + rcx*4 + 4], xmm4
-	movss  [rdi + rcx*4 + 8], xmm5
-
-	;# update fshift force: the same sums are subtracted from fshift[is3]  
-	movss  xmm3, [rsi + rdx*4]
-	movss  xmm4, [rsi + rdx*4 + 4]
-	movss  xmm5, [rsi + rdx*4 + 8]
-	subss  xmm3, xmm0
-	subss  xmm4, xmm1
-	subss  xmm5, xmm2
-	movss  [rsi + rdx*4],     xmm3
-	movss  [rsi + rdx*4 + 4], xmm4
-	movss  [rsi + rdx*4 + 8], xmm5
-
-	;# get n from stack
-	mov esi, [rsp + nb400_n]
-        ;# get group index for i particle 
-        mov   rdx, [rbp + nb400_gid]      	;# base of gid[]
-        mov   edx, [rdx + rsi*4]		;# ggid=gid[n]
-
-	;# accumulate total potential energy and update it 
-	;# reduce the four lanes of vctot in xmm12 
-	movhlps xmm6, xmm12
-	addps  xmm12, xmm6	;# pos 0-1 in xmm12 have the sum now 
-	movaps xmm6, xmm12
-	shufps xmm6, xmm6, 1
-	addss  xmm12, xmm6
-
-	;# add earlier value from mem 
-	mov   rax, [rbp + nb400_Vc]
-	addss xmm12, [rax + rdx*4] 
-	;# move back to mem 
-	movss [rax + rdx*4], xmm12
-	
-	;# accumulate dVda and update it 
-	movaps xmm7, [rsp + nb400_dvdasum]
-	;# reduce the four lanes 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-	
-	mov edx, [rsp + nb400_ii]
-	mov rax, [rbp + nb400_dvda]
-	addss xmm7, [rax + rdx*4]
-	movss [rax + rdx*4], xmm7
-	
-        ;# finish if last 
-        mov ecx, [rsp + nb400_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb400_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [rsp + nb400_n], esi
-        jmp .nb400_outer
-.nb400_outerend:
-        ;# this work unit is done: check if more outer neighborlists remain
-        mov   ecx, [rsp + nb400_nri]
-	;# esi still holds n (equal to nn1 when we get here)
-        sub   ecx, esi
-        jz .nb400_end
-        ;# non-zero, go back and claim one more workunit
-        jmp   .nb400_threadloop
-.nb400_end:
-
-	mov eax, [rsp + nb400_nouter]
-	mov ebx, [rsp + nb400_ninner]
-	mov rcx, [rbp + nb400_outeriter]
-	mov rdx, [rbp + nb400_inneriter]
-	mov [rcx], eax		;# report the nouter total through the outeriter pointer
-	mov [rdx], ebx		;# report the ninner total through the inneriter pointer
-
-	add rsp, 456		;# release the local variable space
-	emms
-
-
-        pop r15
-        pop r14
-        pop r13
-        pop r12
-
-	pop rbx
-	pop	rbp
-	ret
-
-
-	
-
-.globl nb_kernel400nf_x86_64_sse
-.globl _nb_kernel400nf_x86_64_sse
-nb_kernel400nf_x86_64_sse:	
-_nb_kernel400nf_x86_64_sse:	;# the .equiv list below gives rbp offsets of the stack-passed arguments
-.equiv          nb400nf_fshift,         16
-.equiv          nb400nf_gid,            24
-.equiv          nb400nf_pos,            32
-.equiv          nb400nf_faction,        40
-.equiv          nb400nf_charge,         48
-.equiv          nb400nf_p_facel,        56
-.equiv          nb400nf_argkrf,         64
-.equiv          nb400nf_argcrf,         72
-.equiv          nb400nf_Vc,             80
-.equiv          nb400nf_type,           88
-.equiv          nb400nf_p_ntype,        96
-.equiv          nb400nf_vdwparam,       104
-.equiv          nb400nf_Vvdw,           112
-.equiv          nb400nf_p_tabscale,     120
-.equiv          nb400nf_VFtab,          128
-.equiv          nb400nf_invsqrta,       136
-.equiv          nb400nf_dvda,           144
-.equiv          nb400nf_p_gbtabscale,   152
-.equiv          nb400nf_GBtab,          160
-.equiv          nb400nf_p_nthreads,     168
-.equiv          nb400nf_count,          176
-.equiv          nb400nf_mtx,            184
-.equiv          nb400nf_outeriter,      192
-.equiv          nb400nf_inneriter,      200
-.equiv          nb400nf_work,           208
-	;# rsp offsets of the local variables  
-	;# the 16-byte slots come first and are accessed with movaps, which needs rsp 16-byte aligned 
-.equiv          nb400nf_ix,             0
-.equiv          nb400nf_iy,             16
-.equiv          nb400nf_iz,             32
-.equiv          nb400nf_iq,             48
-.equiv          nb400nf_gbtsc,          64
-.equiv          nb400nf_qq,             80
-.equiv          nb400nf_vctot,          96
-.equiv          nb400nf_half,           112
-.equiv          nb400nf_three,          128
-.equiv          nb400nf_isai,           144
-.equiv          nb400nf_isaprod,        160
-.equiv          nb400nf_gbscale,        176
-.equiv          nb400nf_nri,            192
-.equiv          nb400nf_iinr,           200
-.equiv          nb400nf_jindex,         208
-.equiv          nb400nf_jjnr,           216
-.equiv          nb400nf_shift,          224
-.equiv          nb400nf_shiftvec,       232
-.equiv          nb400nf_facel,          240
-.equiv          nb400nf_innerjjnr,      248
-.equiv          nb400nf_is3,            256
-.equiv          nb400nf_ii3,            260
-.equiv          nb400nf_innerk,         264
-.equiv          nb400nf_n,              268
-.equiv          nb400nf_nn1,            272
-.equiv          nb400nf_nouter,         276
-.equiv          nb400nf_ninner,         280
-
-
-	push rbp
-	mov  rbp, rsp
-	push rbx
-
-	
-	emms
-
-        push r12
-        push r13
-        push r14
-        push r15
-
-	sub rsp, 296		;# local variable stack space (n*16+8), here 18*16+8
-
-	;# zero 32-bit iteration counters
-	mov eax, 0
-	mov [rsp + nb400nf_nouter], eax
-	mov [rsp + nb400nf_ninner], eax
-
-	mov edi, [rdi]		;# rdi points to nri
-	mov [rsp + nb400nf_nri], edi
-	mov [rsp + nb400nf_iinr], rsi
-	mov [rsp + nb400nf_jindex], rdx
-	mov [rsp + nb400nf_jjnr], rcx
-	mov [rsp + nb400nf_shift], r8
-	mov [rsp + nb400nf_shiftvec], r9
-	mov rsi, [rbp + nb400nf_p_facel]
-	movss xmm0, [rsi]
-	movss [rsp + nb400nf_facel], xmm0
-
-	mov rbx, [rbp + nb400nf_p_gbtabscale]
-	movss xmm4, [rbx]
-	shufps xmm4, xmm4, 0	;# splat gbtabscale to all elements
-	movaps [rsp + nb400nf_gbtsc],  xmm4
-
-
-
-	;# create constant floating-point factors on stack
-	mov eax, 0x3f000000     ;# half in IEEE (hex)
-	mov [rsp + nb400nf_half], eax
-	movss xmm1, [rsp + nb400nf_half]
-	shufps xmm1, xmm1, 0    ;# splat to all elements
-	movaps xmm2, xmm1       
-	addps  xmm2, xmm2	;# one
-	movaps xmm3, xmm2
-	addps  xmm2, xmm2	;# two
-	addps  xmm3, xmm2	;# three
-	movaps [rsp + nb400nf_half],  xmm1
-	movaps [rsp + nb400nf_three],  xmm3
-
-.nb400nf_threadloop:
-        mov   rsi, [rbp + nb400nf_count]          ;# 64-bit pointer to sync counter
-        mov   eax, [rsi]
-.nb400nf_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+1
-        lock
-        cmpxchg [rsi], ebx                      ;# write nn1 to *counter (full rsi address),
-                                                ;# if it has not changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb400nf_spinlock
-
-        ;# clamp the claimed upper bound to the list length
-        mov ecx, [rsp + nb400nf_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [rsp + nb400nf_n], eax
-        mov [rsp + nb400nf_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists, flags feed the jg below
-	mov esi, eax				;# copy n to esi (mov leaves the flags alone)
-        jg  .nb400nf_outerstart
-        jmp .nb400nf_end
-
-.nb400nf_outerstart:
-	;# ebx contains number of outer iterations
-	add ebx, [rsp + nb400nf_nouter]
-	mov [rsp + nb400nf_nouter], ebx
-
-.nb400nf_outer:
-	mov   rax, [rsp + nb400nf_shift]      ;# rax = pointer into shift[] 
-	mov   ebx, [rax + rsi*4]		;# ebx=shift[n] 
-	
-	lea   rbx, [rbx + rbx*2]    ;# rbx=3*is 
-	mov   [rsp + nb400nf_is3],ebx    	;# store is3 
-
-	mov   rax, [rsp + nb400nf_shiftvec]   ;# rax = base of shiftvec[] 
-
-	movss xmm0, [rax + rbx*4]
-	movss xmm1, [rax + rbx*4 + 4]
-	movss xmm2, [rax + rbx*4 + 8] 
-
-	mov   rcx, [rsp + nb400nf_iinr]       ;# rcx = pointer into iinr[] 	
-	mov   ebx, [rcx + rsi*4]	    ;# ebx =ii 
-	
-	mov   rdx, [rbp + nb400nf_charge]
-	movss xmm3, [rdx + rbx*4]	
-	mulss xmm3, [rsp + nb400nf_facel]
-	shufps xmm3, xmm3, 0
-
-	mov   rdx, [rbp + nb400nf_invsqrta]	;# load invsqrta[ii]
-	movss xmm4, [rdx + rbx*4]
-	shufps xmm4, xmm4, 0
-
-	lea   rbx, [rbx + rbx*2]	;# rbx = 3*ii=ii3 
-	mov   rax, [rbp + nb400nf_pos]    ;# rax = base of pos[]  
-
-	addss xmm0, [rax + rbx*4]
-	addss xmm1, [rax + rbx*4 + 4]
-	addss xmm2, [rax + rbx*4 + 8]
-
-	movaps [rsp + nb400nf_iq], xmm3
-	movaps [rsp + nb400nf_isai], xmm4
-	
-	shufps xmm0, xmm0, 0
-	shufps xmm1, xmm1, 0
-	shufps xmm2, xmm2, 0
-
-	movaps [rsp + nb400nf_ix], xmm0
-	movaps [rsp + nb400nf_iy], xmm1
-	movaps [rsp + nb400nf_iz], xmm2
-
-	mov   [rsp + nb400nf_ii3], ebx
-	
-	;# clear vctot 
-	xorps xmm4, xmm4
-	movaps [rsp + nb400nf_vctot], xmm4
-	
-	mov   rax, [rsp + nb400nf_jindex]
-	mov   ecx, [rax + rsi*4]	     ;# jindex[n] 
-	mov   edx, [rax + rsi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   rsi, [rbp + nb400nf_pos]
-	mov   rdi, [rbp + nb400nf_faction]	
-	mov   rax, [rsp + nb400nf_jjnr]
-	shl   ecx, 2
-	add   rax, rcx
-	mov   [rsp + nb400nf_innerjjnr], rax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  4
-	add   ecx, [rsp + nb400nf_ninner]
-	mov   [rsp + nb400nf_ninner], ecx
-	add   edx, 0
-	mov   [rsp + nb400nf_innerk], edx    ;# number of innerloop atoms 
-	jge   .nb400nf_unroll_loop
-	jmp   .nb400nf_finish_inner
-.nb400nf_unroll_loop:	
-	;# quad-unroll innerloop here 
-	mov   rdx, [rsp + nb400nf_innerjjnr]     ;# pointer to jjnr[k] 
-	mov   eax, [rdx]	
-	mov   ebx, [rdx + 4]              
-	mov   ecx, [rdx + 8]            
-	mov   edx, [rdx + 12]         ;# eax-edx=jnr1-4 
-	add qword ptr [rsp + nb400nf_innerjjnr],  16 ;# advance pointer (unrolled 4) 
-
-	;# load isa2
-	mov rsi, [rbp + nb400nf_invsqrta]
-	movss xmm3, [rsi + rax*4]
-	movss xmm4, [rsi + rcx*4]
-	movss xmm6, [rsi + rbx*4]
-	movss xmm7, [rsi + rdx*4]
-	movaps xmm2, [rsp + nb400nf_isai]
-	shufps xmm3, xmm6, 0 
-	shufps xmm4, xmm7, 0 
-	shufps xmm3, xmm4, 136  ;# 10001000 ;# all charges in xmm3  
-	mulps  xmm2, xmm3
-	
-	movaps [rsp + nb400nf_isaprod], xmm2	
-	movaps xmm1, xmm2
-	mulps xmm1, [rsp + nb400nf_gbtsc]
-	movaps [rsp + nb400nf_gbscale], xmm1
-	
-	mov rsi, [rbp + nb400nf_charge]    ;# base of charge[] 
-	
-	movss xmm3, [rsi + rax*4]
-	movss xmm4, [rsi + rcx*4]
-	movss xmm6, [rsi + rbx*4]
-	movss xmm7, [rsi + rdx*4]
-
-	mulps xmm2, [rsp + nb400nf_iq]
-	shufps xmm3, xmm6, 0 
-	shufps xmm4, xmm7, 0 
-	shufps xmm3, xmm4, 136  ;# 10001000 ;# all charges in xmm3  
-	mulps  xmm3, xmm2
-	movaps [rsp + nb400nf_qq], xmm3	
-
-	
-	mov rsi, [rbp + nb400nf_pos]       ;# base of pos[] 
-
-	lea   rax, [rax + rax*2]     ;# replace jnr with j3 
-	lea   rbx, [rbx + rbx*2]	
-
-	lea   rcx, [rcx + rcx*2]     ;# replace jnr with j3 
-	lea   rdx, [rdx + rdx*2]	
-
-	;# move four coordinates to xmm0-xmm2 	
-
-	movlps xmm4, [rsi + rax*4]
-	movlps xmm5, [rsi + rcx*4]
-	movss xmm2, [rsi + rax*4 + 8]
-	movss xmm6, [rsi + rcx*4 + 8]
-
-	movhps xmm4, [rsi + rbx*4]
-	movhps xmm5, [rsi + rdx*4]
-
-	movss xmm0, [rsi + rbx*4 + 8]
-	movss xmm1, [rsi + rdx*4 + 8]
-
-	shufps xmm2, xmm0, 0
-	shufps xmm6, xmm1, 0
-	
-	movaps xmm0, xmm4
-	movaps xmm1, xmm4
-
-	shufps xmm2, xmm6, 136  ;# 10001000
-	
-	shufps xmm0, xmm5, 136  ;# 10001000
-	shufps xmm1, xmm5, 221  ;# 11011101		
-
-	;# move ix-iz to xmm4-xmm6 
-	movaps xmm4, [rsp + nb400nf_ix]
-	movaps xmm5, [rsp + nb400nf_iy]
-	movaps xmm6, [rsp + nb400nf_iz]
-
-	;# calc dr 
-	subps xmm4, xmm0
-	subps xmm5, xmm1
-	subps xmm6, xmm2
-
-	;# square it 
-	mulps xmm4,xmm4
-	mulps xmm5,xmm5
-	mulps xmm6,xmm6
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [rsp + nb400nf_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [rsp + nb400nf_half]
-	subps xmm1, xmm5	;# 30-rsq*lu*lu 
-	mulps xmm1, xmm2	
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r
-	mulps xmm4, [rsp + nb400nf_gbscale]
-
-	movhlps xmm5, xmm4
-	cvttps2pi mm6, xmm4
-	cvttps2pi mm7, xmm5	;# mm6/mm7 contain lu indices 
-	cvtpi2ps xmm6, mm6
-	cvtpi2ps xmm5, mm7
-	movlhps xmm6, xmm5
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-	pslld mm6, 2
-	pslld mm7, 2
-
-	movd mm0, eax	
-	movd mm1, ebx
-	movd mm2, ecx
-	movd mm3, edx
-
-	mov  rsi, [rbp + nb400nf_GBtab]
-	movd eax, mm6
-	psrlq mm6, 32
-	movd ecx, mm7
-	psrlq mm7, 32
-	movd ebx, mm6
-	movd edx, mm7
-
-	;# load coulomb table
-	movaps xmm4, [rsi + rax*4]
-	movaps xmm5, [rsi + rbx*4]
-	movaps xmm6, [rsi + rcx*4]
-	movaps xmm7, [rsi + rdx*4]
-	;# transpose, using xmm3 for scratch
-	movaps xmm3, xmm6
-	shufps xmm3, xmm7, 0xEE 
-	shufps xmm6, xmm7, 0x44
-	movaps xmm7, xmm4
-	shufps xmm7, xmm5, 0xEE
-	shufps xmm4, xmm5, 0x44
-	movaps xmm5, xmm4
-	shufps xmm5, xmm6, 0xDD
-	shufps xmm4, xmm6, 0x88
-	movaps xmm6, xmm7
-	shufps xmm6, xmm3, 0x88
-	shufps xmm7, xmm3, 0xDD
-	;# coulomb table ready, in xmm4-xmm7  	
-	
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 
-	movaps xmm3, [rsp + nb400nf_qq]
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, xmm3 ;# vcoul=qq*VV  
-	addps  xmm5, [rsp + nb400nf_vctot]
-	movaps [rsp + nb400nf_vctot], xmm5 
-	
-	;# should we do one more iteration? 
-	sub dword ptr [rsp + nb400nf_innerk],  4
-	jl    .nb400nf_finish_inner
-	jmp   .nb400nf_unroll_loop
-.nb400nf_finish_inner:
-	;# check if at least two particles remain 
-	add dword ptr [rsp + nb400nf_innerk],  4
-	mov   edx, [rsp + nb400nf_innerk]
-	and   edx, 2
-	jnz   .nb400nf_dopair
-	jmp   .nb400nf_checksingle
-.nb400nf_dopair:	
-	mov   rcx, [rsp + nb400nf_innerjjnr]
-	
-	mov   eax, [rcx]	
-	mov   ebx, [rcx + 4]              
-	add qword ptr [rsp + nb400nf_innerjjnr],  8
-
-	xorps xmm2, xmm2
-	movaps xmm6, xmm2
-	
-	;# load isa2
-	mov rsi, [rbp + nb400nf_invsqrta]
-	movss xmm2, [rsi + rax*4]
-	movss xmm3, [rsi + rbx*4]
-	unpcklps xmm2, xmm3	;# isa2 in xmm3(0,1)
-	mulps  xmm2, [rsp + nb400nf_isai]
-	movaps [rsp + nb400nf_isaprod], xmm2	
-	movaps xmm1, xmm2
-	mulps xmm1, [rsp + nb400nf_gbtsc]
-	movaps [rsp + nb400nf_gbscale], xmm1	
-	
-	mov rsi, [rbp + nb400nf_charge]    ;# base of charge[] 	
-	movss xmm3, [rsi + rax*4]		
-	movss xmm6, [rsi + rbx*4]
-	unpcklps xmm3, xmm6 ;# 00001000 ;# xmm3(0,1) has the charges 
-
-	mulps  xmm2, [rsp + nb400nf_iq]
-	mulps  xmm3, xmm2
-	movaps [rsp + nb400nf_qq], xmm3
-
-	mov rdi, [rbp + nb400nf_pos]	
-	
-	lea   rax, [rax + rax*2]
-	lea   rbx, [rbx + rbx*2]
-	;# move coordinates to xmm0-xmm2 
-	movlps xmm1, [rdi + rax*4]
-	movss xmm2, [rdi + rax*4 + 8]	
-	movhps xmm1, [rdi + rbx*4]
-	movss xmm0, [rdi + rbx*4 + 8]	
-
-	movlhps xmm3, xmm7
-	
-	shufps xmm2, xmm0, 0
-	
-	movaps xmm0, xmm1
-
-	shufps xmm2, xmm2, 136  ;# 10001000
-	
-	shufps xmm0, xmm0, 136  ;# 10001000
-	shufps xmm1, xmm1, 221  ;# 11011101
-	
-	mov    rdi, [rbp + nb400nf_faction]
-	;# move ix-iz to xmm4-xmm6 
-	xorps   xmm7, xmm7
-	
-	movaps xmm4, [rsp + nb400nf_ix]
-	movaps xmm5, [rsp + nb400nf_iy]
-	movaps xmm6, [rsp + nb400nf_iz]
-
-	;# calc dr 
-	subps xmm4, xmm0
-	subps xmm5, xmm1
-	subps xmm6, xmm2
-
-	;# square it 
-	mulps xmm4,xmm4
-	mulps xmm5,xmm5
-	mulps xmm6,xmm6
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [rsp + nb400nf_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [rsp + nb400nf_half]
-	subps xmm1, xmm5	;# 30-rsq*lu*lu 
-	mulps xmm1, xmm2	
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r 
-	mulps xmm4, [rsp + nb400nf_gbscale]
-
-	cvttps2pi mm6, xmm4     ;# mm6 contain lu indices 
-	cvtpi2ps xmm6, mm6
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-
-	pslld mm6, 2
-
-	mov  rsi, [rbp + nb400nf_GBtab]
-	movd ecx, mm6
-	psrlq mm6, 32
-	movd edx, mm6
-
-	;# load coulomb table
-	movaps xmm4, [rsi + rcx*4]
-	movaps xmm7, [rsi + rdx*4]
-	;# transpose, using xmm3 for scratch
-	movaps xmm6, xmm4
-	unpcklps xmm4, xmm7  	;# Y1 Y2 F1 F2 
-	unpckhps xmm6, xmm7     ;# G1 G2 H1 H2
-	movhlps  xmm5, xmm4    	;# F1 F2 
-	movhlps  xmm7, xmm6     ;# H1 H2
-	;# coulomb table ready, in xmm4-xmm7  	
-
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	movaps xmm3, [rsp + nb400nf_qq]
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, xmm3 ;# vcoul=qq*VV  
-	addps  xmm5, [rsp + nb400nf_vctot]
-	movaps [rsp + nb400nf_vctot], xmm5 
-
-.nb400nf_checksingle:				
-	mov   edx, [rsp + nb400nf_innerk]
-	and   edx, 1
-	jnz    .nb400nf_dosingle
-	jmp    .nb400nf_updateouterdata
-.nb400nf_dosingle:
-	mov rsi, [rbp + nb400nf_charge]
-	mov rdx, [rbp + nb400nf_invsqrta]
-	mov rdi, [rbp + nb400nf_pos]
-	mov   rcx, [rsp + nb400nf_innerjjnr]
-	mov   eax, [rcx]	
-	xorps  xmm2, xmm2
-	movaps xmm6, xmm2
-	movss xmm2, [rdx + rax*4]	;# isa2
-	mulss xmm2, [rsp + nb400nf_isai]
-	movss [rsp + nb400nf_isaprod], xmm2	
-	movss xmm1, xmm2
-	mulss xmm1, [rsp + nb400nf_gbtsc]
-	movss [rsp + nb400nf_gbscale], xmm1	
-	
-	mulss  xmm2, [rsp + nb400nf_iq]
-	movss xmm6, [rsi + rax*4]	;# xmm6(0) has the charge 	
-	mulss  xmm6, xmm2
-	movss [rsp + nb400nf_qq], xmm6
-	
-	lea   rax, [rax + rax*2]
-	
-	;# move coordinates to xmm0-xmm2 
-	movss xmm0, [rdi + rax*4]	
-	movss xmm1, [rdi + rax*4 + 4]	
-	movss xmm2, [rdi + rax*4 + 8]	 
-	
-	movss xmm4, [rsp + nb400nf_ix]
-	movss xmm5, [rsp + nb400nf_iy]
-	movss xmm6, [rsp + nb400nf_iz]
-
-	;# calc dr 
-	subss xmm4, xmm0
-	subss xmm5, xmm1
-	subss xmm6, xmm2
-
-	;# square it 
-	mulss xmm4,xmm4
-	mulss xmm5,xmm5
-	mulss xmm6,xmm6
-	addss xmm4, xmm5
-	addss xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtss xmm5, xmm4
-	;# lookup seed in xmm5 
-	movss xmm2, xmm5
-	mulss xmm5, xmm5
-	movss xmm1, [rsp + nb400nf_three]
-	mulss xmm5, xmm4	;# rsq*lu*lu 			
-	movss xmm0, [rsp + nb400nf_half]
-	subss xmm1, xmm5	;# 30-rsq*lu*lu 
-	mulss xmm1, xmm2	
-	mulss xmm0, xmm1	;# xmm0=rinv 
-
-	mulss xmm4, xmm0	;# xmm4=r 
-	mulss xmm4, [rsp + nb400nf_gbscale]
-
-	cvttss2si ebx, xmm4     ;# mm6 contain lu indices 
-	cvtsi2ss xmm6, ebx
-	subss xmm4, xmm6	
-	movss xmm1, xmm4	;# xmm1=eps 
-	movss xmm2, xmm1	
-	mulss  xmm2, xmm2	;# xmm2=eps2 
-
-	shl  ebx, 2
-
-	mov  rsi, [rbp + nb400nf_GBtab]
-
-	movaps xmm4, [rsi + rbx*4]	
-	movhlps xmm6, xmm4
-	movaps xmm5, xmm4
-	movaps xmm7, xmm6
-	shufps xmm5, xmm5, 1
-	shufps xmm7, xmm7, 1
-	;# table ready in xmm4-xmm7 
-
-	mulss  xmm6, xmm1	;# xmm6=Geps 
-	mulss  xmm7, xmm2	;# xmm7=Heps2 
-	addss  xmm5, xmm6
-	addss  xmm5, xmm7	;# xmm5=Fp 	
-	movss xmm3, [rsp + nb400nf_qq]
-	mulss  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addss  xmm5, xmm4 ;# xmm5=VV 
-	mulss  xmm5, xmm3 ;# vcoul=qq*VV  
-	addss  xmm5, [rsp + nb400nf_vctot]
-	movss [rsp + nb400nf_vctot], xmm5 
-.nb400nf_updateouterdata:
-	;# get n from stack
-	mov esi, [rsp + nb400nf_n]
-        ;# get group index for i particle 
-        mov   rdx, [rbp + nb400nf_gid]      	;# base of gid[]
-        mov   edx, [rdx + rsi*4]		;# ggid=gid[n]
-
-	;# accumulate total potential energy and update it 
-	movaps xmm7, [rsp + nb400nf_vctot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-
-	;# add earlier value from mem 
-	mov   rax, [rbp + nb400nf_Vc]
-	addss xmm7, [rax + rdx*4] 
-	;# move back to mem 
-	movss [rax + rdx*4], xmm7 
-	
-        ;# finish if last 
-        mov ecx, [rsp + nb400nf_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb400nf_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [rsp + nb400nf_n], esi
-        jmp .nb400nf_outer
-.nb400nf_outerend:
-        ;# check if more outer neighborlists remain
-        mov   ecx, [rsp + nb400nf_nri]
-	;# esi already loaded with n above
-        sub   ecx, esi
-        jz .nb400nf_end
-        ;# non-zero, do one more workunit
-        jmp   .nb400nf_threadloop
-.nb400nf_end:
-
-	mov eax, [rsp + nb400nf_nouter]
-	mov ebx, [rsp + nb400nf_ninner]
-	mov rcx, [rbp + nb400nf_outeriter]
-	mov rdx, [rbp + nb400nf_inneriter]
-	mov [rcx], eax
-	mov [rdx], ebx
-
-	add rsp, 296
-	emms
-
-
-        pop r15
-        pop r14
-        pop r13
-        pop r12
-
-	pop rbx
-	pop	rbp
-	ret
-
diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel400_x86_64_sse.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel400_x86_64_sse.s
deleted file mode 100644
index 6b8062d5d1..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel400_x86_64_sse.s
+++ /dev/null
@@ -1,1638 +0,0 @@
-##
-##
-## Gromacs 4.0                         Copyright (c) 1991-2003 
-## David van der Spoel, Erik Lindahl
-##
-## This program is free software; you can redistribute it and/or
-## modify it under the terms of the GNU General Public License
-## as published by the Free Software Foundation; either version 2
-## of the License, or (at your option) any later version.
-##
-## To help us fund GROMACS development, we humbly ask that you cite
-## the research papers on the package. Check out http://www.gromacs.org
-## 
-## And Hey:
-## Gnomes, ROck Monsters And Chili Sauce
-##
-
-
-
-
-
-
-.globl nb_kernel400_x86_64_sse
-.globl _nb_kernel400_x86_64_sse
-nb_kernel400_x86_64_sse:        
-_nb_kernel400_x86_64_sse:       
-##      Room for return address and rbp (16 bytes)
-.set nb400_fshift, 16
-.set nb400_gid, 24
-.set nb400_pos, 32
-.set nb400_faction, 40
-.set nb400_charge, 48
-.set nb400_p_facel, 56
-.set nb400_argkrf, 64
-.set nb400_argcrf, 72
-.set nb400_Vc, 80
-.set nb400_type, 88
-.set nb400_p_ntype, 96
-.set nb400_vdwparam, 104
-.set nb400_Vvdw, 112
-.set nb400_p_tabscale, 120
-.set nb400_VFtab, 128
-.set nb400_invsqrta, 136
-.set nb400_dvda, 144
-.set nb400_p_gbtabscale, 152
-.set nb400_GBtab, 160
-.set nb400_p_nthreads, 168
-.set nb400_count, 176
-.set nb400_mtx, 184
-.set nb400_outeriter, 192
-.set nb400_inneriter, 200
-.set nb400_work, 208
-        ## stack offsets for local variables  
-        ## bottom of stack is cache-aligned for sse use 
-.set nb400_ix, 0
-.set nb400_iy, 16
-.set nb400_iz, 32
-.set nb400_iq, 48
-.set nb400_dx, 64
-.set nb400_dy, 80
-.set nb400_dz, 96
-.set nb400_two, 112
-.set nb400_gbtsc, 128
-.set nb400_qq, 144
-.set nb400_r, 160
-.set nb400_vctot, 176
-.set nb400_fix, 192
-.set nb400_fiy, 208
-.set nb400_fiz, 224
-.set nb400_half, 240
-.set nb400_three, 256
-.set nb400_isai, 272
-.set nb400_isaprod, 288
-.set nb400_dvdasum, 304
-.set nb400_gbscale, 320
-.set nb400_nri, 336
-.set nb400_iinr, 344
-.set nb400_jindex, 352
-.set nb400_jjnr, 360
-.set nb400_shift, 368
-.set nb400_shiftvec, 376
-.set nb400_facel, 384
-.set nb400_innerjjnr, 392
-.set nb400_is3, 400
-.set nb400_ii3, 404
-.set nb400_ii, 408
-.set nb400_innerk, 412
-.set nb400_n, 416
-.set nb400_nn1, 420
-.set nb400_nouter, 424
-.set nb400_ninner, 428
-.set nb400_jnra, 432
-.set nb400_jnrb, 436
-.set nb400_jnrc, 440
-.set nb400_jnrd, 444
-
-        push %rbp
-        movq %rsp,%rbp
-        push %rbx
-
-
-        emms
-
-        push %r12
-        push %r13
-        push %r14
-        push %r15
-
-        subq $456,%rsp          ## local variable stack space (n*16+8)
-
-        ## zero 32-bit iteration counters
-        movl $0,%eax
-        movl %eax,nb400_nouter(%rsp)
-        movl %eax,nb400_ninner(%rsp)
-
-        movl (%rdi),%edi
-        movl %edi,nb400_nri(%rsp)
-        movq %rsi,nb400_iinr(%rsp)
-        movq %rdx,nb400_jindex(%rsp)
-        movq %rcx,nb400_jjnr(%rsp)
-        movq %r8,nb400_shift(%rsp)
-        movq %r9,nb400_shiftvec(%rsp)
-        movq nb400_p_facel(%rbp),%rsi
-        movss (%rsi),%xmm0
-        movss %xmm0,nb400_facel(%rsp)
-
-        movq nb400_p_gbtabscale(%rbp),%rbx
-        movss (%rbx),%xmm4
-        shufps $0,%xmm4,%xmm4
-        movaps %xmm4,nb400_gbtsc(%rsp)
-
-        ## create constant floating-point factors on stack
-        movl $0x3f000000,%eax   ## half in IEEE (hex)
-        movl %eax,nb400_half(%rsp)
-        movss nb400_half(%rsp),%xmm1
-        shufps $0,%xmm1,%xmm1  ## splat to all elements
-        movaps %xmm1,%xmm2
-        addps  %xmm2,%xmm2      ## one
-        movaps %xmm2,%xmm3
-        addps  %xmm2,%xmm2      ## two
-        addps  %xmm2,%xmm3      ## three
-        movaps %xmm1,nb400_half(%rsp)
-        movaps %xmm2,nb400_two(%rsp)
-        movaps %xmm3,nb400_three(%rsp)
-
-_nb_kernel400_x86_64_sse.nb400_threadloop: 
-        movq  nb400_count(%rbp),%rsi            ## pointer to sync counter
-        movl  (%rsi),%eax
-_nb_kernel400_x86_64_sse.nb400_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+10
-        lock 
-        cmpxchgl %ebx,(%rsi)                    ## write nn1 to *counter,
-                                                ## if it hasnt changed.
-                                                ## or reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel400_x86_64_sse.nb400_spinlock
-
-        ## if(nn1>nri) nn1=nri
-        movl nb400_nri(%rsp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx
-        cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb400_n(%rsp)
-        movl %ebx,nb400_nn1(%rsp)
-        subl %eax,%ebx                          ## calc number of outer lists
-        movl %eax,%esi                          ## copy n to esi
-        jg  _nb_kernel400_x86_64_sse.nb400_outerstart
-        jmp _nb_kernel400_x86_64_sse.nb400_end
-
-_nb_kernel400_x86_64_sse.nb400_outerstart: 
-        ## ebx contains number of outer iterations
-        addl nb400_nouter(%rsp),%ebx
-        movl %ebx,nb400_nouter(%rsp)
-
-_nb_kernel400_x86_64_sse.nb400_outer: 
-        movq  nb400_shift(%rsp),%rax        ## rax = pointer into shift[] 
-        movl  (%rax,%rsi,4),%ebx                ## ebx=shift[n] 
-
-        lea  (%rbx,%rbx,2),%rbx    ## rbx=3*is 
-        movl  %ebx,nb400_is3(%rsp)      ## store is3 
-
-        movq  nb400_shiftvec(%rsp),%rax     ## rax = base of shiftvec[] 
-
-        movss (%rax,%rbx,4),%xmm0
-        movss 4(%rax,%rbx,4),%xmm1
-        movss 8(%rax,%rbx,4),%xmm2
-
-        movq  nb400_iinr(%rsp),%rcx         ## rcx = pointer into iinr[]        
-        movl  (%rcx,%rsi,4),%ebx            ## ebx =ii 
-        movl  %ebx,nb400_ii(%rsp)
-
-        movq  nb400_charge(%rbp),%rdx
-        movss (%rdx,%rbx,4),%xmm3
-        mulss nb400_facel(%rsp),%xmm3
-        shufps $0,%xmm3,%xmm3
-
-
-        movq  nb400_invsqrta(%rbp),%rdx         ## load invsqrta[ii]
-        movss (%rdx,%rbx,4),%xmm4
-        shufps $0,%xmm4,%xmm4
-
-        lea  (%rbx,%rbx,2),%rbx        ## rbx = 3*ii=ii3 
-        movq  nb400_pos(%rbp),%rax      ## rax = base of pos[]  
-
-        addss (%rax,%rbx,4),%xmm0
-        addss 4(%rax,%rbx,4),%xmm1
-        addss 8(%rax,%rbx,4),%xmm2
-
-        movaps %xmm3,nb400_iq(%rsp)
-        movaps %xmm4,nb400_isai(%rsp)
-
-        shufps $0,%xmm0,%xmm0
-        shufps $0,%xmm1,%xmm1
-        shufps $0,%xmm2,%xmm2
-
-        movaps %xmm0,nb400_ix(%rsp)
-        movaps %xmm1,nb400_iy(%rsp)
-        movaps %xmm2,nb400_iz(%rsp)
-
-        movl  %ebx,nb400_ii3(%rsp)
-
-        ## clear vctot and i forces 
-        xorps %xmm4,%xmm4
-        movaps %xmm4,nb400_dvdasum(%rsp)
-        movaps %xmm4,%xmm12
-        movaps %xmm4,%xmm13
-        movaps %xmm4,%xmm14
-        movaps %xmm4,%xmm15
-
-        movq  nb400_jindex(%rsp),%rax
-        movl  (%rax,%rsi,4),%ecx             ## jindex[n] 
-        movl  4(%rax,%rsi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movq  nb400_pos(%rbp),%rsi
-        movq  nb400_faction(%rbp),%rdi
-        movq  nb400_jjnr(%rsp),%rax
-        shll  $2,%ecx
-        addq  %rcx,%rax
-        movq  %rax,nb400_innerjjnr(%rsp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $4,%edx
-        addl  nb400_ninner(%rsp),%ecx
-        movl  %ecx,nb400_ninner(%rsp)
-        addl  $0,%edx
-        movl  %edx,nb400_innerk(%rsp)      ## number of innerloop atoms 
-        jge   _nb_kernel400_x86_64_sse.nb400_unroll_loop
-        jmp   _nb_kernel400_x86_64_sse.nb400_finish_inner
-_nb_kernel400_x86_64_sse.nb400_unroll_loop: 
-        ## quad-unroll innerloop here 
-        movq  nb400_innerjjnr(%rsp),%rdx       ## pointer to jjnr[k] 
-        movl  (%rdx),%eax
-        movl  4(%rdx),%ebx
-        movl  8(%rdx),%ecx
-        movl  12(%rdx),%edx           ## eax-edx=jnr1-4 
-
-        addq $16,nb400_innerjjnr(%rsp)             ## advance pointer (unrolled 4) 
-
-        movq nb400_pos(%rbp),%rsi        ## base of pos[] 
-
-        lea  (%rax,%rax,2),%r8     ## j3
-        lea  (%rbx,%rbx,2),%r9
-        lea  (%rcx,%rcx,2),%r10
-        lea  (%rdx,%rdx,2),%r11
-
-        ## move four coordinates to xmm0-xmm2   
-        movlps (%rsi,%r8,4),%xmm4
-        movlps (%rsi,%r10,4),%xmm5
-        movss 8(%rsi,%r8,4),%xmm2
-        movss 8(%rsi,%r10,4),%xmm6
-
-        movhps (%rsi,%r9,4),%xmm4
-        movhps (%rsi,%r11,4),%xmm5
-
-        movss 8(%rsi,%r9,4),%xmm0
-        movss 8(%rsi,%r11,4),%xmm1
-
-        shufps $0,%xmm0,%xmm2
-        shufps $0,%xmm1,%xmm6
-
-        movaps %xmm4,%xmm0
-        movaps %xmm4,%xmm1
-
-        shufps $136,%xmm6,%xmm2 ## 10001000
-
-        shufps $136,%xmm5,%xmm0 ## 10001000
-        shufps $221,%xmm5,%xmm1 ## 11011101             
-
-        ## calc dr 
-        subps nb400_ix(%rsp),%xmm0
-        subps nb400_iy(%rsp),%xmm1
-        subps nb400_iz(%rsp),%xmm2
-
-        ## store dr 
-        movaps %xmm0,%xmm9
-        movaps %xmm1,%xmm10
-        movaps %xmm2,%xmm11
-
-        ## square it 
-        mulps %xmm0,%xmm0
-        mulps %xmm1,%xmm1
-        mulps %xmm2,%xmm2
-        addps %xmm1,%xmm0
-        addps %xmm2,%xmm0
-    movaps %xmm0,%xmm4
-        ## rsq in xmm4 
-
-        ## load isaj
-        movq nb400_invsqrta(%rbp),%rsi
-        movss (%rsi,%rax,4),%xmm0
-        movss (%rsi,%rcx,4),%xmm1
-        movss (%rsi,%rbx,4),%xmm2
-        movss (%rsi,%rdx,4),%xmm3
-        movaps nb400_isai(%rsp),%xmm7
-        shufps $0,%xmm2,%xmm0
-    shufps $0,%xmm3,%xmm1
-        shufps $136,%xmm1,%xmm0 ## 10001000 ;# all isaj in xmm3 
-        mulps  %xmm0,%xmm7
-
-        movaps %xmm7,nb400_isaprod(%rsp)
-        movaps %xmm7,%xmm1
-        mulps nb400_gbtsc(%rsp),%xmm1
-        movaps %xmm1,nb400_gbscale(%rsp)
-
-        movq nb400_charge(%rbp),%rsi     ## base of charge[] 
-
-        movss (%rsi,%rax,4),%xmm0
-        movss (%rsi,%rcx,4),%xmm1
-        movss (%rsi,%rbx,4),%xmm2
-        movss (%rsi,%rdx,4),%xmm3
-
-    mulps nb400_iq(%rsp),%xmm7
-        shufps $0,%xmm2,%xmm0
-        shufps $0,%xmm3,%xmm1
-    shufps $136,%xmm1,%xmm0 ## 10001000 ;# all charges in xmm3  
-
-        mulps  %xmm7,%xmm0
-        movaps %xmm0,nb400_qq(%rsp)
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb400_three(%rsp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb400_half(%rsp),%xmm0
-        subps %xmm5,%xmm1       ## 30-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r
-        movaps %xmm4,nb400_r(%rsp)
-        mulps nb400_gbscale(%rsp),%xmm4
-
-    ## truncate and convert to integers
-    cvttps2dq %xmm4,%xmm5
-
-    ## convert back to float
-    cvtdq2ps  %xmm5,%xmm6
-
-    ## multiply by 4
-    pslld   $2,%xmm5
-
-    ## move to integer registers
-    movhlps %xmm5,%xmm7
-    movd    %xmm5,%r12d
-    movd    %xmm7,%r14d
-    pshufd $1,%xmm5,%xmm5
-    pshufd $1,%xmm7,%xmm7
-    movd    %xmm5,%r13d
-    movd    %xmm7,%r15d
-
-    ## calculate eps
-    subps     %xmm6,%xmm4
-    movaps    %xmm4,%xmm1 ##eps
-
-        movq nb400_GBtab(%rbp),%rsi
-
-    ## load table data
-        movlps (%rsi,%r12,4),%xmm5
-        movlps (%rsi,%r14,4),%xmm7
-        movhps (%rsi,%r13,4),%xmm5
-        movhps (%rsi,%r15,4),%xmm7
-
-    movaps %xmm5,%xmm4
-        shufps $136,%xmm7,%xmm4 ## 10001000
-        shufps $221,%xmm7,%xmm5 ## 11011101
-
-        movlps 8(%rsi,%r12,4),%xmm7
-        movlps 8(%rsi,%r14,4),%xmm8
-        movhps 8(%rsi,%r13,4),%xmm7
-        movhps 8(%rsi,%r15,4),%xmm8
-
-    movaps %xmm7,%xmm6
-
-        shufps $136,%xmm8,%xmm6 ## 10001000
-        shufps $221,%xmm8,%xmm7 ## 11011101
-    ## table data ready in xmm4-xmm7
-
-    mulps  %xmm1,%xmm7  ## Heps
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm1,%xmm7      ## Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        addps  %xmm7,%xmm7      ## two*Heps2 
-        movaps nb400_qq(%rsp),%xmm3
-        addps  %xmm6,%xmm7
-        addps  %xmm5,%xmm7 ## xmm7=FF 
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulps  %xmm7,%xmm3 ## fijC=FF*qq 
-        ## at this point xmm5 contains vcoul and xmm3 fijC
-
-        movq nb400_dvda(%rbp),%rsi
-
-        ## Calculate dVda
-        xorps  %xmm7,%xmm7
-        mulps nb400_gbscale(%rsp),%xmm3
-        movaps %xmm3,%xmm6
-        mulps  nb400_r(%rsp),%xmm6
-        addps  %xmm5,%xmm6
-
-    ## increment vctot (sum in xmm12)
-        addps  %xmm5,%xmm12
-
-        ## xmm6=(vcoul+fijC*r)
-        subps  %xmm6,%xmm7
-        movaps %xmm7,%xmm6
-
-    ## update dvdasum
-    addps  nb400_dvdasum(%rsp),%xmm7
-    movaps %xmm7,nb400_dvdasum(%rsp)
-
-        ## update j atoms dvdaj
-        movhlps %xmm6,%xmm7
-        movaps  %xmm6,%xmm5
-        movaps  %xmm7,%xmm4
-        shufps $0x1,%xmm5,%xmm5
-        shufps $0x1,%xmm4,%xmm4
-
-        ## xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4
-        addss  (%rsi,%rax,4),%xmm6
-        addss  (%rsi,%rbx,4),%xmm5
-        addss  (%rsi,%rcx,4),%xmm7
-        addss  (%rsi,%rdx,4),%xmm4
-        movss  %xmm6,(%rsi,%rax,4)
-        movss  %xmm5,(%rsi,%rbx,4)
-        movss  %xmm7,(%rsi,%rcx,4)
-        movss  %xmm4,(%rsi,%rdx,4)
-
-        xorps  %xmm4,%xmm4
-        mulps %xmm0,%xmm3
-        subps  %xmm3,%xmm4
-
-        movq nb400_faction(%rbp),%rsi
-        ## the fj's - start by accumulating x & y forces from memory 
-        movlps (%rsi,%r8,4),%xmm0 ## x1 y1 - -
-        movlps (%rsi,%r10,4),%xmm1 ## x3 y3 - -
-        movhps (%rsi,%r9,4),%xmm0 ## x1 y1 x2 y2
-        movhps (%rsi,%r11,4),%xmm1 ## x3 y3 x4 y4
-
-    mulps  %xmm4,%xmm9
-    mulps  %xmm4,%xmm10
-    mulps  %xmm4,%xmm11
-
-        ## accumulate i forces
-    addps %xmm9,%xmm13
-    addps %xmm10,%xmm14
-    addps %xmm11,%xmm15
-
-    movaps %xmm9,%xmm8
-    unpcklps %xmm10,%xmm9 ## x1 y1 x2 y2
-    unpckhps %xmm10,%xmm8 ## x3 y3 x4 y4
-
-    ## update fjx and fjy
-        addps  %xmm9,%xmm0
-        addps  %xmm8,%xmm1
-
-        movlps %xmm0,(%rsi,%r8,4)
-        movlps %xmm1,(%rsi,%r10,4)
-        movhps %xmm0,(%rsi,%r9,4)
-        movhps %xmm1,(%rsi,%r11,4)
-
-    ## xmm11: fjz1 fjz2 fjz3 fjz4
-    pshufd $1,%xmm11,%xmm10 ## fjz2 - - -
-    movhlps %xmm11,%xmm9     ## fjz3 - - -
-    pshufd $3,%xmm11,%xmm8  ## fjz4 - - -
-
-        addss  8(%rsi,%r8,4),%xmm11
-        addss  8(%rsi,%r9,4),%xmm10
-        addss  8(%rsi,%r10,4),%xmm9
-        addss  8(%rsi,%r11,4),%xmm8
-        movss  %xmm11,8(%rsi,%r8,4)
-        movss  %xmm10,8(%rsi,%r9,4)
-        movss  %xmm9,8(%rsi,%r10,4)
-        movss  %xmm8,8(%rsi,%r11,4)
-
-        ## should we do one more iteration? 
-        subl $4,nb400_innerk(%rsp)
-        jl    _nb_kernel400_x86_64_sse.nb400_finish_inner
-        jmp   _nb_kernel400_x86_64_sse.nb400_unroll_loop
-_nb_kernel400_x86_64_sse.nb400_finish_inner: 
-        ## check if at least two particles remain 
-        addl $4,nb400_innerk(%rsp)
-        movl  nb400_innerk(%rsp),%edx
-        andl  $2,%edx
-        jnz   _nb_kernel400_x86_64_sse.nb400_dopair
-        jmp   _nb_kernel400_x86_64_sse.nb400_checksingle
-_nb_kernel400_x86_64_sse.nb400_dopair: 
-        movq  nb400_innerjjnr(%rsp),%rcx
-
-        movl  (%rcx),%eax
-        movl  4(%rcx),%ebx
-        addq $8,nb400_innerjjnr(%rsp)
-
-        ## load isaj
-        movq nb400_invsqrta(%rbp),%rsi
-        movss (%rsi,%rax,4),%xmm3
-        movss (%rsi,%rbx,4),%xmm6
-    unpcklps %xmm6,%xmm3
-
-        movaps nb400_isai(%rsp),%xmm2
-        mulps  %xmm3,%xmm2
-
-        movaps %xmm2,nb400_isaprod(%rsp)
-        movaps %xmm2,%xmm1
-        mulps nb400_gbtsc(%rsp),%xmm1
-        movaps %xmm1,nb400_gbscale(%rsp)
-
-        movq nb400_charge(%rbp),%rsi     ## base of charge[] 
-
-    mulps nb400_iq(%rsp),%xmm2
-        movss (%rsi,%rax,4),%xmm3
-        movss (%rsi,%rbx,4),%xmm6
-    unpcklps %xmm6,%xmm3
-
-        mulps %xmm2,%xmm3
-        movaps %xmm3,nb400_qq(%rsp)
-
-        movq nb400_pos(%rbp),%rsi        ## base of pos[] 
-
-        lea  (%rax,%rax,2),%r8     ## j3 
-        lea  (%rbx,%rbx,2),%r9
-
-        ## move four coordinates to xmm0-xmm2   
-        movlps (%rsi,%r8,4),%xmm4       ## x1 y1 - - 
-        movlps (%rsi,%r9,4),%xmm5       ## x2 y2 - - 
-
-        movss 8(%rsi,%r8,4),%xmm6       ## z1 - - - 
-        movss 8(%rsi,%r9,4),%xmm7       ## z2 - - - 
-
-    unpcklps %xmm5,%xmm4 ## x1 x2 y1 y2
-    movhlps  %xmm4,%xmm5 ## y1 y2 -  -
-    unpcklps %xmm7,%xmm6 ## z1 z2 -  -
-
-        ## calc dr 
-        subps nb400_ix(%rsp),%xmm4
-        subps nb400_iy(%rsp),%xmm5
-        subps nb400_iz(%rsp),%xmm6
-
-        ## store dr 
-        movaps %xmm4,%xmm9
-        movaps %xmm5,%xmm10
-        movaps %xmm6,%xmm11
-
-        ## square it 
-        mulps %xmm4,%xmm4
-        mulps %xmm5,%xmm5
-        mulps %xmm6,%xmm6
-        addps %xmm5,%xmm4
-        addps %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb400_three(%rsp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb400_half(%rsp),%xmm0
-        subps %xmm5,%xmm1       ## 30-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r
-        movaps %xmm4,nb400_r(%rsp)
-        mulps nb400_gbscale(%rsp),%xmm4
-
-    ## truncate and convert to integers
-    cvttps2dq %xmm4,%xmm5
-
-    ## convert back to float
-    cvtdq2ps  %xmm5,%xmm6
-
-    ## multiply by 4
-    pslld   $2,%xmm5
-
-    ## move to integer registers
-    movd    %xmm5,%r12d
-    pshufd $1,%xmm5,%xmm5
-    movd    %xmm5,%r13d
-
-    ## calculate eps
-    subps     %xmm6,%xmm4
-    movaps    %xmm4,%xmm1 ##eps
-
-        movq nb400_GBtab(%rbp),%rsi
-
-    ## load table data
-        movlps (%rsi,%r12,4),%xmm4
-        movlps (%rsi,%r13,4),%xmm5
-    unpcklps %xmm5,%xmm4
-    movhlps  %xmm4,%xmm5
-
-        movlps 8(%rsi,%r12,4),%xmm6
-        movlps 8(%rsi,%r13,4),%xmm7
-    unpcklps %xmm7,%xmm6
-    movhlps  %xmm6,%xmm7
-    ## table data ready in xmm4-xmm7
-
-    mulps  %xmm1,%xmm7  ## Heps
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm1,%xmm7      ## Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        addps  %xmm7,%xmm7      ## two*Heps2 
-        movaps nb400_qq(%rsp),%xmm3
-        addps  %xmm6,%xmm7
-        addps  %xmm5,%xmm7 ## xmm7=FF 
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulps  %xmm7,%xmm3 ## fijC=FF*qq 
-        ## at this point xmm5 contains vcoul and xmm3 fijC
-
-    ## zero upper part of vcoul 
-    xorps %xmm2,%xmm2
-    movlhps %xmm2,%xmm5
-
-        movq nb400_dvda(%rbp),%rsi
-
-        ## Calculate dVda
-        xorps  %xmm7,%xmm7
-        mulps nb400_gbscale(%rsp),%xmm3
-        movaps %xmm3,%xmm6
-        mulps  nb400_r(%rsp),%xmm6
-        addps  %xmm5,%xmm6
-
-    ## increment vctot (sum in xmm12)
-        addps  %xmm5,%xmm12
-
-        ## xmm6=(vcoul+fijC*r)
-        subps  %xmm6,%xmm7
-        movaps %xmm7,%xmm6
-
-    ## zero upper half of dvda
-    movlhps %xmm2,%xmm7
-
-    ## update dvdasum
-    addps  nb400_dvdasum(%rsp),%xmm7
-    movaps %xmm7,nb400_dvdasum(%rsp)
-
-        ## update j atoms dvdaj
-        movaps  %xmm6,%xmm5
-        shufps $0x1,%xmm5,%xmm5
-
-        ## xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4
-        addss  (%rsi,%rax,4),%xmm6
-        addss  (%rsi,%rbx,4),%xmm5
-        movss  %xmm6,(%rsi,%rax,4)
-        movss  %xmm5,(%rsi,%rbx,4)
-
-        xorps  %xmm4,%xmm4
-        mulps %xmm0,%xmm3
-        subps  %xmm3,%xmm4
-
-    mulps  %xmm4,%xmm9
-    mulps  %xmm4,%xmm10
-    mulps  %xmm4,%xmm11
-
-    movlhps %xmm2,%xmm9
-    movlhps %xmm2,%xmm10
-    movlhps %xmm2,%xmm11
-
-        ## accumulate i forces
-    addps %xmm9,%xmm13
-    addps %xmm10,%xmm14
-    addps %xmm11,%xmm15
-
-        movq nb400_faction(%rbp),%rsi
-        ## the fj's - start by accumulating x & y forces from memory 
-        movlps (%rsi,%r8,4),%xmm0 ## x1 y1 - -
-        movhps (%rsi,%r9,4),%xmm0 ## x1 y1 x2 y2
-
-    unpcklps %xmm10,%xmm9 ## x1 y1 x2 y2
-    addps    %xmm9,%xmm0
-
-        movlps %xmm0,(%rsi,%r8,4)
-        movhps %xmm0,(%rsi,%r9,4)
-
-    ## z forces
-    pshufd $1,%xmm11,%xmm8
-    addss  8(%rsi,%r8,4),%xmm11
-    addss  8(%rsi,%r9,4),%xmm8
-    movss  %xmm11,8(%rsi,%r8,4)
-    movss  %xmm8,8(%rsi,%r9,4)
-
-_nb_kernel400_x86_64_sse.nb400_checksingle:     
-        movl  nb400_innerk(%rsp),%edx
-        andl  $1,%edx
-        jnz    _nb_kernel400_x86_64_sse.nb400_dosingle
-        jmp    _nb_kernel400_x86_64_sse.nb400_updateouterdata
-_nb_kernel400_x86_64_sse.nb400_dosingle: 
-        movq  nb400_innerjjnr(%rsp),%rcx
-        movl  (%rcx),%eax
-
-        ## load isaj
-        movq nb400_invsqrta(%rbp),%rsi
-        movss (%rsi,%rax,4),%xmm2
-        mulss nb400_isai(%rsp),%xmm2
-        movss %xmm2,nb400_isaprod(%rsp)
-        movaps %xmm2,%xmm1
-        mulss nb400_gbtsc(%rsp),%xmm1
-        movss %xmm1,nb400_gbscale(%rsp)
-
-        movq nb400_charge(%rbp),%rsi     ## base of charge[] 
-
-    mulss nb400_iq(%rsp),%xmm2
-        movss (%rsi,%rax,4),%xmm3
-        mulss %xmm2,%xmm3
-        movss %xmm3,nb400_qq(%rsp)
-
-        movq nb400_pos(%rbp),%rsi        ## base of pos[] 
-
-        lea  (%rax,%rax,2),%r8     ## j3=3*jnr
-
-        ## move four coordinates to xmm0-xmm2   
-        movss (%rsi,%r8,4),%xmm4
-        movss 4(%rsi,%r8,4),%xmm5
-        movss 8(%rsi,%r8,4),%xmm6
-
-        ## calc dr 
-        subss nb400_ix(%rsp),%xmm4
-        subss nb400_iy(%rsp),%xmm5
-        subss nb400_iz(%rsp),%xmm6
-
-        ## store dr 
-        movaps %xmm4,%xmm9
-        movaps %xmm5,%xmm10
-        movaps %xmm6,%xmm11
-
-        ## square it 
-        mulss %xmm4,%xmm4
-        mulss %xmm5,%xmm5
-        mulss %xmm6,%xmm6
-        addss %xmm5,%xmm4
-        addss %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtss %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulss %xmm5,%xmm5
-        movaps nb400_three(%rsp),%xmm1
-        mulss %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb400_half(%rsp),%xmm0
-        subss %xmm5,%xmm1       ## 30-rsq*lu*lu 
-        mulss %xmm2,%xmm1
-        mulss %xmm1,%xmm0       ## xmm0=rinv 
-        mulss %xmm0,%xmm4       ## xmm4=r
-        movaps %xmm4,nb400_r(%rsp)
-        mulss nb400_gbscale(%rsp),%xmm4
-
-    ## truncate and convert to integers
-    cvttss2si %xmm4,%r12d
-
-    ## convert back to float
-    cvtsi2ss  %r12d,%xmm6
-
-    ## multiply by 4
-    shll $2,%r12d
-
-    ## calculate eps
-    subss     %xmm6,%xmm4
-    movaps    %xmm4,%xmm1 ##eps
-
-        movq nb400_GBtab(%rbp),%rsi
-
-    ## load table data
-        movss (%rsi,%r12,4),%xmm4
-        movss 4(%rsi,%r12,4),%xmm5
-        movss 8(%rsi,%r12,4),%xmm6
-        movss 12(%rsi,%r12,4),%xmm7
-    ## table data ready in xmm4-xmm7
-
-    mulss  %xmm1,%xmm7  ## Heps
-        mulss  %xmm1,%xmm6      ## xmm6=Geps 
-        mulss  %xmm1,%xmm7      ## Heps2 
-        addss  %xmm6,%xmm5
-        addss  %xmm7,%xmm5      ## xmm5=Fp      
-        addss  %xmm7,%xmm7      ## two*Heps2 
-        movss  nb400_qq(%rsp),%xmm3
-        addss  %xmm6,%xmm7
-        addss  %xmm5,%xmm7 ## xmm7=FF 
-        mulss  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addss  %xmm4,%xmm5 ## xmm5=VV 
-        mulss  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulss  %xmm7,%xmm3 ## fijC=FF*qq 
-        ## at this point xmm5 contains vcoul and xmm3 fijC
-
-        movq nb400_dvda(%rbp),%rsi
-
-        ## Calculate dVda
-        xorps  %xmm7,%xmm7
-        mulss nb400_gbscale(%rsp),%xmm3
-        movaps %xmm3,%xmm6
-        mulss  nb400_r(%rsp),%xmm6
-        addss  %xmm5,%xmm6
-
-    ## increment vctot (sum in xmm12)
-        addss  %xmm5,%xmm12
-
-        ## xmm6=(vcoul+fijC*r)
-        subss  %xmm6,%xmm7
-        movaps %xmm7,%xmm6
-
-    ## update dvdasum
-    addss  nb400_dvdasum(%rsp),%xmm7
-    movss %xmm7,nb400_dvdasum(%rsp)
-
-        ## update j atoms dvdaj
-        addss  (%rsi,%rax,4),%xmm6
-        movss  %xmm6,(%rsi,%rax,4)
-
-        xorps  %xmm4,%xmm4
-        mulss %xmm0,%xmm3
-        subss  %xmm3,%xmm4
-
-    mulss  %xmm4,%xmm9
-    mulss  %xmm4,%xmm10
-    mulss  %xmm4,%xmm11
-
-        ## accumulate i forces
-    addss %xmm9,%xmm13
-    addss %xmm10,%xmm14
-    addss %xmm11,%xmm15
-
-        movq nb400_faction(%rbp),%rsi
-    ## add to j forces
-    addss  (%rsi,%r8,4),%xmm9
-    addss  4(%rsi,%r8,4),%xmm10
-    addss  8(%rsi,%r8,4),%xmm11
-    movss  %xmm9,(%rsi,%r8,4)
-    movss  %xmm10,4(%rsi,%r8,4)
-    movss  %xmm11,8(%rsi,%r8,4)
-
-_nb_kernel400_x86_64_sse.nb400_updateouterdata: 
-        movl  nb400_ii3(%rsp),%ecx
-        movq  nb400_faction(%rbp),%rdi
-        movq  nb400_fshift(%rbp),%rsi
-        movl  nb400_is3(%rsp),%edx
-
-        ## accumulate i forces in xmm13, xmm14, xmm15
-        movhlps %xmm13,%xmm0
-        movhlps %xmm14,%xmm1
-        movhlps %xmm15,%xmm2
-        addps  %xmm13,%xmm0
-        addps  %xmm14,%xmm1
-        addps  %xmm15,%xmm2
-    movaps %xmm0,%xmm3
-        movaps %xmm1,%xmm4
-        movaps %xmm2,%xmm5
-        shufps $1,%xmm3,%xmm3
-        shufps $1,%xmm4,%xmm4
-        shufps $1,%xmm5,%xmm5
-        addss  %xmm3,%xmm0
-        addss  %xmm4,%xmm1
-        addss  %xmm5,%xmm2      ## xmm0-xmm2 has single force in pos0 
-
-        ## increment i force 
-        movss  (%rdi,%rcx,4),%xmm3
-        movss  4(%rdi,%rcx,4),%xmm4
-        movss  8(%rdi,%rcx,4),%xmm5
-        subss  %xmm0,%xmm3
-        subss  %xmm1,%xmm4
-        subss  %xmm2,%xmm5
-        movss  %xmm3,(%rdi,%rcx,4)
-        movss  %xmm4,4(%rdi,%rcx,4)
-        movss  %xmm5,8(%rdi,%rcx,4)
-
-        ## increment fshift force  
-        movss  (%rsi,%rdx,4),%xmm3
-        movss  4(%rsi,%rdx,4),%xmm4
-        movss  8(%rsi,%rdx,4),%xmm5
-        subss  %xmm0,%xmm3
-        subss  %xmm1,%xmm4
-        subss  %xmm2,%xmm5
-        movss  %xmm3,(%rsi,%rdx,4)
-        movss  %xmm4,4(%rsi,%rdx,4)
-        movss  %xmm5,8(%rsi,%rdx,4)
-
-        ## get n from stack
-        movl nb400_n(%rsp),%esi
-        ## get group index for i particle 
-        movq  nb400_gid(%rbp),%rdx              ## base of gid[]
-        movl  (%rdx,%rsi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total potential energy and update it 
-        ## accumulate 
-        movhlps %xmm12,%xmm6
-        addps  %xmm6,%xmm12     ## pos 0-1 in xmm12 have the sum now 
-        movaps %xmm12,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm12
-
-        ## add earlier value from mem 
-        movq  nb400_Vc(%rbp),%rax
-        addss (%rax,%rdx,4),%xmm12
-        ## move back to mem 
-        movss %xmm12,(%rax,%rdx,4)
-
-        ## accumulate dVda and update it 
-        movaps nb400_dvdasum(%rsp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        movl nb400_ii(%rsp),%edx
-        movq nb400_dvda(%rbp),%rax
-        addss (%rax,%rdx,4),%xmm7
-        movss %xmm7,(%rax,%rdx,4)
-
-        ## finish if last 
-        movl nb400_nn1(%rsp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx
-        jz _nb_kernel400_x86_64_sse.nb400_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb400_n(%rsp)
-        jmp _nb_kernel400_x86_64_sse.nb400_outer
-_nb_kernel400_x86_64_sse.nb400_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb400_nri(%rsp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx
-        jz _nb_kernel400_x86_64_sse.nb400_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel400_x86_64_sse.nb400_threadloop
-_nb_kernel400_x86_64_sse.nb400_end: 
-
-        movl nb400_nouter(%rsp),%eax
-        movl nb400_ninner(%rsp),%ebx
-        movq nb400_outeriter(%rbp),%rcx
-        movq nb400_inneriter(%rbp),%rdx
-        movl %eax,(%rcx)
-        movl %ebx,(%rdx)
-
-        addq $456,%rsp
-        emms
-
-
-        pop %r15
-        pop %r14
-        pop %r13
-        pop %r12
-
-        pop %rbx
-        pop    %rbp
-        ret
-
-
-
-
-.globl nb_kernel400nf_x86_64_sse
-.globl _nb_kernel400nf_x86_64_sse
-nb_kernel400nf_x86_64_sse:      
-_nb_kernel400nf_x86_64_sse:     
-.set nb400nf_fshift, 16
-.set nb400nf_gid, 24
-.set nb400nf_pos, 32
-.set nb400nf_faction, 40
-.set nb400nf_charge, 48
-.set nb400nf_p_facel, 56
-.set nb400nf_argkrf, 64
-.set nb400nf_argcrf, 72
-.set nb400nf_Vc, 80
-.set nb400nf_type, 88
-.set nb400nf_p_ntype, 96
-.set nb400nf_vdwparam, 104
-.set nb400nf_Vvdw, 112
-.set nb400nf_p_tabscale, 120
-.set nb400nf_VFtab, 128
-.set nb400nf_invsqrta, 136
-.set nb400nf_dvda, 144
-.set nb400nf_p_gbtabscale, 152
-.set nb400nf_GBtab, 160
-.set nb400nf_p_nthreads, 168
-.set nb400nf_count, 176
-.set nb400nf_mtx, 184
-.set nb400nf_outeriter, 192
-.set nb400nf_inneriter, 200
-.set nb400nf_work, 208
-        ## stack offsets for local variables  
-        ## bottom of stack is cache-aligned for sse use 
-.set nb400nf_ix, 0
-.set nb400nf_iy, 16
-.set nb400nf_iz, 32
-.set nb400nf_iq, 48
-.set nb400nf_gbtsc, 64
-.set nb400nf_qq, 80
-.set nb400nf_vctot, 96
-.set nb400nf_half, 112
-.set nb400nf_three, 128
-.set nb400nf_isai, 144
-.set nb400nf_isaprod, 160
-.set nb400nf_gbscale, 176
-.set nb400nf_nri, 192
-.set nb400nf_iinr, 200
-.set nb400nf_jindex, 208
-.set nb400nf_jjnr, 216
-.set nb400nf_shift, 224
-.set nb400nf_shiftvec, 232
-.set nb400nf_facel, 240
-.set nb400nf_innerjjnr, 248
-.set nb400nf_is3, 256
-.set nb400nf_ii3, 260
-.set nb400nf_innerk, 264
-.set nb400nf_n, 268
-.set nb400nf_nn1, 272
-.set nb400nf_nouter, 276
-.set nb400nf_ninner, 280
-
-
-        push %rbp
-        movq %rsp,%rbp
-        push %rbx
-
-
-        emms
-
-        push %r12
-        push %r13
-        push %r14
-        push %r15
-
-        subq $296,%rsp          ## local variable stack space (n*16+8)
-
-        ## zero 32-bit iteration counters
-        movl $0,%eax
-        movl %eax,nb400nf_nouter(%rsp)
-        movl %eax,nb400nf_ninner(%rsp)
-
-        movl (%rdi),%edi
-        movl %edi,nb400nf_nri(%rsp)
-        movq %rsi,nb400nf_iinr(%rsp)
-        movq %rdx,nb400nf_jindex(%rsp)
-        movq %rcx,nb400nf_jjnr(%rsp)
-        movq %r8,nb400nf_shift(%rsp)
-        movq %r9,nb400nf_shiftvec(%rsp)
-        movq nb400nf_p_facel(%rbp),%rsi
-        movss (%rsi),%xmm0
-        movss %xmm0,nb400nf_facel(%rsp)
-
-        movq nb400nf_p_gbtabscale(%rbp),%rbx
-        movss (%rbx),%xmm4
-        shufps $0,%xmm4,%xmm4
-        movaps %xmm4,nb400nf_gbtsc(%rsp)
-
-
-
-        ## create constant floating-point factors on stack
-        movl $0x3f000000,%eax   ## half in IEEE (hex)
-        movl %eax,nb400nf_half(%rsp)
-        movss nb400nf_half(%rsp),%xmm1
-        shufps $0,%xmm1,%xmm1  ## splat to all elements
-        movaps %xmm1,%xmm2
-        addps  %xmm2,%xmm2      ## one
-        movaps %xmm2,%xmm3
-        addps  %xmm2,%xmm2      ## two
-        addps  %xmm2,%xmm3      ## three
-        movaps %xmm1,nb400nf_half(%rsp)
-        movaps %xmm3,nb400nf_three(%rsp)
-
-_nb_kernel400nf_x86_64_sse.nb400nf_threadloop: 
-        movq  nb400nf_count(%rbp),%rsi            ## pointer to sync counter
-        movl  (%rsi),%eax
-_nb_kernel400nf_x86_64_sse.nb400nf_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+10
-        lock 
-        cmpxchgl %ebx,(%rsi)                    ## write nn1 to *counter,
-                                                ## if it hasnt changed.
-                                                ## or reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel400nf_x86_64_sse.nb400nf_spinlock
-
-        ## if(nn1>nri) nn1=nri
-        movl nb400nf_nri(%rsp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx
-        cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb400nf_n(%rsp)
-        movl %ebx,nb400nf_nn1(%rsp)
-        subl %eax,%ebx                          ## calc number of outer lists
-        movl %eax,%esi                          ## copy n to esi
-        jg  _nb_kernel400nf_x86_64_sse.nb400nf_outerstart
-        jmp _nb_kernel400nf_x86_64_sse.nb400nf_end
-
-_nb_kernel400nf_x86_64_sse.nb400nf_outerstart: 
-        ## ebx contains number of outer iterations
-        addl nb400nf_nouter(%rsp),%ebx
-        movl %ebx,nb400nf_nouter(%rsp)
-
-_nb_kernel400nf_x86_64_sse.nb400nf_outer: 
-        movq  nb400nf_shift(%rsp),%rax        ## rax = pointer into shift[] 
-        movl  (%rax,%rsi,4),%ebx                ## ebx=shift[n] 
-
-        lea  (%rbx,%rbx,2),%rbx    ## rbx=3*is 
-        movl  %ebx,nb400nf_is3(%rsp)            ## store is3 
-
-        movq  nb400nf_shiftvec(%rsp),%rax     ## rax = base of shiftvec[] 
-
-        movss (%rax,%rbx,4),%xmm0
-        movss 4(%rax,%rbx,4),%xmm1
-        movss 8(%rax,%rbx,4),%xmm2
-
-        movq  nb400nf_iinr(%rsp),%rcx         ## rcx = pointer into iinr[]      
-        movl  (%rcx,%rsi,4),%ebx            ## ebx =ii 
-
-        movq  nb400nf_charge(%rbp),%rdx
-        movss (%rdx,%rbx,4),%xmm3
-        mulss nb400nf_facel(%rsp),%xmm3
-        shufps $0,%xmm3,%xmm3
-
-        movq  nb400nf_invsqrta(%rbp),%rdx       ## load invsqrta[ii]
-        movss (%rdx,%rbx,4),%xmm4
-        shufps $0,%xmm4,%xmm4
-
-        lea  (%rbx,%rbx,2),%rbx        ## rbx = 3*ii=ii3 
-        movq  nb400nf_pos(%rbp),%rax      ## rax = base of pos[]  
-
-        addss (%rax,%rbx,4),%xmm0
-        addss 4(%rax,%rbx,4),%xmm1
-        addss 8(%rax,%rbx,4),%xmm2
-
-        movaps %xmm3,nb400nf_iq(%rsp)
-        movaps %xmm4,nb400nf_isai(%rsp)
-
-        shufps $0,%xmm0,%xmm0
-        shufps $0,%xmm1,%xmm1
-        shufps $0,%xmm2,%xmm2
-
-        movaps %xmm0,nb400nf_ix(%rsp)
-        movaps %xmm1,nb400nf_iy(%rsp)
-        movaps %xmm2,nb400nf_iz(%rsp)
-
-        movl  %ebx,nb400nf_ii3(%rsp)
-
-        ## clear vctot 
-        xorps %xmm4,%xmm4
-        movaps %xmm4,nb400nf_vctot(%rsp)
-
-        movq  nb400nf_jindex(%rsp),%rax
-        movl  (%rax,%rsi,4),%ecx             ## jindex[n] 
-        movl  4(%rax,%rsi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movq  nb400nf_pos(%rbp),%rsi
-        movq  nb400nf_faction(%rbp),%rdi
-        movq  nb400nf_jjnr(%rsp),%rax
-        shll  $2,%ecx
-        addq  %rcx,%rax
-        movq  %rax,nb400nf_innerjjnr(%rsp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $4,%edx
-        addl  nb400nf_ninner(%rsp),%ecx
-        movl  %ecx,nb400nf_ninner(%rsp)
-        addl  $0,%edx
-        movl  %edx,nb400nf_innerk(%rsp)      ## number of innerloop atoms 
-        jge   _nb_kernel400nf_x86_64_sse.nb400nf_unroll_loop
-        jmp   _nb_kernel400nf_x86_64_sse.nb400nf_finish_inner
-_nb_kernel400nf_x86_64_sse.nb400nf_unroll_loop: 
-        ## quad-unroll innerloop here 
-        movq  nb400nf_innerjjnr(%rsp),%rdx       ## pointer to jjnr[k] 
-        movl  (%rdx),%eax
-        movl  4(%rdx),%ebx
-        movl  8(%rdx),%ecx
-        movl  12(%rdx),%edx           ## eax-edx=jnr1-4 
-        addq $16,nb400nf_innerjjnr(%rsp)             ## advance pointer (unrolled 4) 
-
-        ## load isa2
-        movq nb400nf_invsqrta(%rbp),%rsi
-        movss (%rsi,%rax,4),%xmm3
-        movss (%rsi,%rcx,4),%xmm4
-        movss (%rsi,%rbx,4),%xmm6
-        movss (%rsi,%rdx,4),%xmm7
-        movaps nb400nf_isai(%rsp),%xmm2
-        shufps $0,%xmm6,%xmm3
-        shufps $0,%xmm7,%xmm4
-        shufps $136,%xmm4,%xmm3 ## 10001000 ;# all charges in xmm3  
-        mulps  %xmm3,%xmm2
-
-        movaps %xmm2,nb400nf_isaprod(%rsp)
-        movaps %xmm2,%xmm1
-        mulps nb400nf_gbtsc(%rsp),%xmm1
-        movaps %xmm1,nb400nf_gbscale(%rsp)
-
-        movq nb400nf_charge(%rbp),%rsi     ## base of charge[] 
-
-        movss (%rsi,%rax,4),%xmm3
-        movss (%rsi,%rcx,4),%xmm4
-        movss (%rsi,%rbx,4),%xmm6
-        movss (%rsi,%rdx,4),%xmm7
-
-        mulps nb400nf_iq(%rsp),%xmm2
-        shufps $0,%xmm6,%xmm3
-        shufps $0,%xmm7,%xmm4
-        shufps $136,%xmm4,%xmm3 ## 10001000 ;# all charges in xmm3  
-        mulps  %xmm2,%xmm3
-        movaps %xmm3,nb400nf_qq(%rsp)
-
-
-        movq nb400nf_pos(%rbp),%rsi        ## base of pos[] 
-
-        lea  (%rax,%rax,2),%rax     ## replace jnr with j3 
-        lea  (%rbx,%rbx,2),%rbx
-
-        lea  (%rcx,%rcx,2),%rcx     ## replace jnr with j3 
-        lea  (%rdx,%rdx,2),%rdx
-
-        ## move four coordinates to xmm0-xmm2   
-
-        movlps (%rsi,%rax,4),%xmm4
-        movlps (%rsi,%rcx,4),%xmm5
-        movss 8(%rsi,%rax,4),%xmm2
-        movss 8(%rsi,%rcx,4),%xmm6
-
-        movhps (%rsi,%rbx,4),%xmm4
-        movhps (%rsi,%rdx,4),%xmm5
-
-        movss 8(%rsi,%rbx,4),%xmm0
-        movss 8(%rsi,%rdx,4),%xmm1
-
-        shufps $0,%xmm0,%xmm2
-        shufps $0,%xmm1,%xmm6
-
-        movaps %xmm4,%xmm0
-        movaps %xmm4,%xmm1
-
-        shufps $136,%xmm6,%xmm2 ## 10001000
-
-        shufps $136,%xmm5,%xmm0 ## 10001000
-        shufps $221,%xmm5,%xmm1 ## 11011101             
-
-        ## move ix-iz to xmm4-xmm6 
-        movaps nb400nf_ix(%rsp),%xmm4
-        movaps nb400nf_iy(%rsp),%xmm5
-        movaps nb400nf_iz(%rsp),%xmm6
-
-        ## calc dr 
-        subps %xmm0,%xmm4
-        subps %xmm1,%xmm5
-        subps %xmm2,%xmm6
-
-        ## square it 
-        mulps %xmm4,%xmm4
-        mulps %xmm5,%xmm5
-        mulps %xmm6,%xmm6
-        addps %xmm5,%xmm4
-        addps %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb400nf_three(%rsp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb400nf_half(%rsp),%xmm0
-        subps %xmm5,%xmm1       ## 30-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r
-        mulps nb400nf_gbscale(%rsp),%xmm4
-
-        movhlps %xmm4,%xmm5
-        cvttps2pi %xmm4,%mm6
-        cvttps2pi %xmm5,%mm7    ## mm6/mm7 contain lu indices 
-        cvtpi2ps %mm6,%xmm6
-        cvtpi2ps %mm7,%xmm5
-        movlhps %xmm5,%xmm6
-        subps %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-        pslld $2,%mm6
-        pslld $2,%mm7
-
-        movd %eax,%mm0
-        movd %ebx,%mm1
-        movd %ecx,%mm2
-        movd %edx,%mm3
-
-        movq nb400nf_GBtab(%rbp),%rsi
-        movd %mm6,%eax
-        psrlq $32,%mm6
-        movd %mm7,%ecx
-        psrlq $32,%mm7
-        movd %mm6,%ebx
-        movd %mm7,%edx
-
-        ## load coulomb table
-        movaps (%rsi,%rax,4),%xmm4
-        movaps (%rsi,%rbx,4),%xmm5
-        movaps (%rsi,%rcx,4),%xmm6
-        movaps (%rsi,%rdx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm6,%xmm3
-        shufps $0xEE,%xmm7,%xmm3
-        shufps $0x44,%xmm7,%xmm6
-        movaps %xmm4,%xmm7
-        shufps $0xEE,%xmm5,%xmm7
-        shufps $0x44,%xmm5,%xmm4
-        movaps %xmm4,%xmm5
-        shufps $0xDD,%xmm6,%xmm5
-        shufps $0x88,%xmm6,%xmm4
-        movaps %xmm7,%xmm6
-        shufps $0x88,%xmm3,%xmm6
-        shufps $0xDD,%xmm3,%xmm7
-        ## coulomb table ready, in xmm4-xmm7    
-
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp 
-        movaps nb400nf_qq(%rsp),%xmm3
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  %xmm3,%xmm5 ## vcoul=qq*VV  
-        addps  nb400nf_vctot(%rsp),%xmm5
-        movaps %xmm5,nb400nf_vctot(%rsp)
-
-        ## should we do one more iteration? 
-        subl $4,nb400nf_innerk(%rsp)
-        jl    _nb_kernel400nf_x86_64_sse.nb400nf_finish_inner
-        jmp   _nb_kernel400nf_x86_64_sse.nb400nf_unroll_loop
-_nb_kernel400nf_x86_64_sse.nb400nf_finish_inner: 
-        ## check if at least two particles remain 
-        addl $4,nb400nf_innerk(%rsp)
-        movl  nb400nf_innerk(%rsp),%edx
-        andl  $2,%edx
-        jnz   _nb_kernel400nf_x86_64_sse.nb400nf_dopair
-        jmp   _nb_kernel400nf_x86_64_sse.nb400nf_checksingle
-_nb_kernel400nf_x86_64_sse.nb400nf_dopair: 
-        movq  nb400nf_innerjjnr(%rsp),%rcx
-
-        movl  (%rcx),%eax
-        movl  4(%rcx),%ebx
-        addq $8,nb400nf_innerjjnr(%rsp)
-
-        xorps %xmm2,%xmm2
-        movaps %xmm2,%xmm6
-
-        ## load isa2
-        movq nb400nf_invsqrta(%rbp),%rsi
-        movss (%rsi,%rax,4),%xmm2
-        movss (%rsi,%rbx,4),%xmm3
-        unpcklps %xmm3,%xmm2    ## isa2 in xmm3(0,1)
-        mulps  nb400nf_isai(%rsp),%xmm2
-        movaps %xmm2,nb400nf_isaprod(%rsp)
-        movaps %xmm2,%xmm1
-        mulps nb400nf_gbtsc(%rsp),%xmm1
-        movaps %xmm1,nb400nf_gbscale(%rsp)
-
-        movq nb400nf_charge(%rbp),%rsi     ## base of charge[]  
-        movss (%rsi,%rax,4),%xmm3
-        movss (%rsi,%rbx,4),%xmm6
-        unpcklps %xmm6,%xmm3 ## 00001000 ;# xmm3(0,1) has the charges 
-
-        mulps  nb400nf_iq(%rsp),%xmm2
-        mulps  %xmm2,%xmm3
-        movaps %xmm3,nb400nf_qq(%rsp)
-
-        movq nb400nf_pos(%rbp),%rdi
-
-        lea  (%rax,%rax,2),%rax
-        lea  (%rbx,%rbx,2),%rbx
-        ## move coordinates to xmm0-xmm2 
-        movlps (%rdi,%rax,4),%xmm1
-        movss 8(%rdi,%rax,4),%xmm2
-        movhps (%rdi,%rbx,4),%xmm1
-        movss 8(%rdi,%rbx,4),%xmm0
-
-        movlhps %xmm7,%xmm3
-
-        shufps $0,%xmm0,%xmm2
-
-        movaps %xmm1,%xmm0
-
-        shufps $136,%xmm2,%xmm2 ## 10001000
-
-        shufps $136,%xmm0,%xmm0 ## 10001000
-        shufps $221,%xmm1,%xmm1 ## 11011101
-
-        movq   nb400nf_faction(%rbp),%rdi
-        ## move ix-iz to xmm4-xmm6 
-        xorps   %xmm7,%xmm7
-
-        movaps nb400nf_ix(%rsp),%xmm4
-        movaps nb400nf_iy(%rsp),%xmm5
-        movaps nb400nf_iz(%rsp),%xmm6
-
-        ## calc dr 
-        subps %xmm0,%xmm4
-        subps %xmm1,%xmm5
-        subps %xmm2,%xmm6
-
-        ## square it 
-        mulps %xmm4,%xmm4
-        mulps %xmm5,%xmm5
-        mulps %xmm6,%xmm6
-        addps %xmm5,%xmm4
-        addps %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb400nf_three(%rsp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb400nf_half(%rsp),%xmm0
-        subps %xmm5,%xmm1       ## 30-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r 
-        mulps nb400nf_gbscale(%rsp),%xmm4
-
-        cvttps2pi %xmm4,%mm6    ## mm6 contain lu indices 
-        cvtpi2ps %mm6,%xmm6
-        subps %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-
-        pslld $2,%mm6
-
-        movq nb400nf_GBtab(%rbp),%rsi
-        movd %mm6,%ecx
-        psrlq $32,%mm6
-        movd %mm6,%edx
-
-        ## load coulomb table
-        movaps (%rsi,%rcx,4),%xmm4
-        movaps (%rsi,%rdx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm4,%xmm6
-        unpcklps %xmm7,%xmm4    ## Y1 Y2 F1 F2 
-        unpckhps %xmm7,%xmm6    ## G1 G2 H1 H2
-        movhlps  %xmm4,%xmm5    ## F1 F2 
-        movhlps  %xmm6,%xmm7    ## H1 H2
-        ## coulomb table ready, in xmm4-xmm7    
-
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        movaps nb400nf_qq(%rsp),%xmm3
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  %xmm3,%xmm5 ## vcoul=qq*VV  
-        addps  nb400nf_vctot(%rsp),%xmm5
-        movaps %xmm5,nb400nf_vctot(%rsp)
-
-_nb_kernel400nf_x86_64_sse.nb400nf_checksingle: 
-        movl  nb400nf_innerk(%rsp),%edx
-        andl  $1,%edx
-        jnz    _nb_kernel400nf_x86_64_sse.nb400nf_dosingle
-        jmp    _nb_kernel400nf_x86_64_sse.nb400nf_updateouterdata
-_nb_kernel400nf_x86_64_sse.nb400nf_dosingle: 
-        movq nb400nf_charge(%rbp),%rsi
-        movq nb400nf_invsqrta(%rbp),%rdx
-        movq nb400nf_pos(%rbp),%rdi
-        movq  nb400nf_innerjjnr(%rsp),%rcx
-        movl  (%rcx),%eax
-        xorps  %xmm2,%xmm2
-        movaps %xmm2,%xmm6
-        movss (%rdx,%rax,4),%xmm2       ## isa2
-        mulss nb400nf_isai(%rsp),%xmm2
-        movss %xmm2,nb400nf_isaprod(%rsp)
-        movss %xmm2,%xmm1
-        mulss nb400nf_gbtsc(%rsp),%xmm1
-        movss %xmm1,nb400nf_gbscale(%rsp)
-
-        mulss  nb400nf_iq(%rsp),%xmm2
-        movss (%rsi,%rax,4),%xmm6       ## xmm6(0) has the charge       
-        mulss  %xmm2,%xmm6
-        movss %xmm6,nb400nf_qq(%rsp)
-
-        lea  (%rax,%rax,2),%rax
-
-        ## move coordinates to xmm0-xmm2 
-        movss (%rdi,%rax,4),%xmm0
-        movss 4(%rdi,%rax,4),%xmm1
-        movss 8(%rdi,%rax,4),%xmm2
-
-        movss nb400nf_ix(%rsp),%xmm4
-        movss nb400nf_iy(%rsp),%xmm5
-        movss nb400nf_iz(%rsp),%xmm6
-
-        ## calc dr 
-        subss %xmm0,%xmm4
-        subss %xmm1,%xmm5
-        subss %xmm2,%xmm6
-
-        ## square it 
-        mulss %xmm4,%xmm4
-        mulss %xmm5,%xmm5
-        mulss %xmm6,%xmm6
-        addss %xmm5,%xmm4
-        addss %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtss %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movss %xmm5,%xmm2
-        mulss %xmm5,%xmm5
-        movss nb400nf_three(%rsp),%xmm1
-        mulss %xmm4,%xmm5       ## rsq*lu*lu                    
-        movss nb400nf_half(%rsp),%xmm0
-        subss %xmm5,%xmm1       ## 30-rsq*lu*lu 
-        mulss %xmm2,%xmm1
-        mulss %xmm1,%xmm0       ## xmm0=rinv 
-
-        mulss %xmm0,%xmm4       ## xmm4=r 
-        mulss nb400nf_gbscale(%rsp),%xmm4
-
-        cvttss2si %xmm4,%ebx    ## mm6 contain lu indices 
-        cvtsi2ss %ebx,%xmm6
-        subss %xmm6,%xmm4
-        movss %xmm4,%xmm1       ## xmm1=eps 
-        movss %xmm1,%xmm2
-        mulss  %xmm2,%xmm2      ## xmm2=eps2 
-
-        shll $2,%ebx
-
-        movq nb400nf_GBtab(%rbp),%rsi
-
-        movaps (%rsi,%rbx,4),%xmm4
-        movhlps %xmm4,%xmm6
-        movaps %xmm4,%xmm5
-        movaps %xmm6,%xmm7
-        shufps $1,%xmm5,%xmm5
-        shufps $1,%xmm7,%xmm7
-        ## table ready in xmm4-xmm7 
-
-        mulss  %xmm1,%xmm6      ## xmm6=Geps 
-        mulss  %xmm2,%xmm7      ## xmm7=Heps2 
-        addss  %xmm6,%xmm5
-        addss  %xmm7,%xmm5      ## xmm5=Fp      
-        movss nb400nf_qq(%rsp),%xmm3
-        mulss  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addss  %xmm4,%xmm5 ## xmm5=VV 
-        mulss  %xmm3,%xmm5 ## vcoul=qq*VV  
-        addss  nb400nf_vctot(%rsp),%xmm5
-        movss %xmm5,nb400nf_vctot(%rsp)
-_nb_kernel400nf_x86_64_sse.nb400nf_updateouterdata: 
-        ## get n from stack
-        movl nb400nf_n(%rsp),%esi
-        ## get group index for i particle 
-        movq  nb400nf_gid(%rbp),%rdx            ## base of gid[]
-        movl  (%rdx,%rsi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total potential energy and update it 
-        movaps nb400nf_vctot(%rsp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        ## add earlier value from mem 
-        movq  nb400nf_Vc(%rbp),%rax
-        addss (%rax,%rdx,4),%xmm7
-        ## move back to mem 
-        movss %xmm7,(%rax,%rdx,4)
-
-        ## finish if last 
-        movl nb400nf_nn1(%rsp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx
-        jz _nb_kernel400nf_x86_64_sse.nb400nf_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb400nf_n(%rsp)
-        jmp _nb_kernel400nf_x86_64_sse.nb400nf_outer
-_nb_kernel400nf_x86_64_sse.nb400nf_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb400nf_nri(%rsp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx
-        jz _nb_kernel400nf_x86_64_sse.nb400nf_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel400nf_x86_64_sse.nb400nf_threadloop
-_nb_kernel400nf_x86_64_sse.nb400nf_end: 
-
-        movl nb400nf_nouter(%rsp),%eax
-        movl nb400nf_ninner(%rsp),%ebx
-        movq nb400nf_outeriter(%rbp),%rcx
-        movq nb400nf_inneriter(%rbp),%rdx
-        movl %eax,(%rcx)
-        movl %ebx,(%rdx)
-
-        addq $296,%rsp
-        emms
-
-
-        pop %r15
-        pop %r14
-        pop %r13
-        pop %r12
-
-        pop %rbx
-        pop    %rbp
-        ret
-
-
diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel410_x86_64_sse.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel410_x86_64_sse.intel_syntax.s
deleted file mode 100644
index a7f86f162d..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel410_x86_64_sse.intel_syntax.s
+++ /dev/null
@@ -1,2009 +0,0 @@
-;#
-;#
-;# Gromacs 4.0                         Copyright (c) 1991-2003 
-;# David van der Spoel, Erik Lindahl
-;#
-;# This program is free software; you can redistribute it and/or
-;# modify it under the terms of the GNU General Public License
-;# as published by the Free Software Foundation; either version 2
-;# of the License, or (at your option) any later version.
-;#
-;# To help us fund GROMACS development, we humbly ask that you cite
-;# the research papers on the package. Check out http://www.gromacs.org
-;# 
-;# And Hey:
-;# Gnomes, ROck Monsters And Chili Sauce
-;#
-
-;# These files require GNU binutils 2.10 or later, since we
-;# use intel syntax for portability, or a recent version 
-;# of NASM that understands Extended 3DNow and SSE2 instructions.
-;# (NASM is normally only used with MS Visual C++).
-;# Since NASM and gnu as disagree on some definitions and use 
-;# completely different preprocessing options I have to introduce a
-;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
-;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
-;# reason why all comments need both symbols...
-;# The source is written for GNU as, with intel syntax. When you use
-;# NASM we redefine a couple of things. The false if-statement around 
-;# the following code is seen by GNU as, but NASM doesn't see it, so 
-;# the code inside is read by NASM but not gcc.
-
-; .if 0    # block below only read by NASM
-%define .section	section
-%define .long		dd
-%define .align		align
-%define .globl		global
-;# NASM only wants 'dword', not 'dword ptr'.
-%define ptr
-%macro .equiv                  2
-   %1 equ %2
-%endmacro
-; .endif                   # End of NASM-specific block
-; .intel_syntax noprefix   # Line only read by gnu as
-
-
-
-
-.globl nb_kernel410_x86_64_sse
-.globl _nb_kernel410_x86_64_sse
-nb_kernel410_x86_64_sse:	
-_nb_kernel410_x86_64_sse:	
-;#	Room for return address and rbp (16 bytes)
-.equiv          nb410_fshift,           16
-.equiv          nb410_gid,              24
-.equiv          nb410_pos,              32
-.equiv          nb410_faction,          40
-.equiv          nb410_charge,           48
-.equiv          nb410_p_facel,          56
-.equiv          nb410_argkrf,           64
-.equiv          nb410_argcrf,           72
-.equiv          nb410_Vc,               80
-.equiv          nb410_type,             88
-.equiv          nb410_p_ntype,          96
-.equiv          nb410_vdwparam,         104
-.equiv          nb410_Vvdw,             112
-.equiv          nb410_p_tabscale,       120
-.equiv          nb410_VFtab,            128
-.equiv          nb410_invsqrta,         136
-.equiv          nb410_dvda,             144
-.equiv          nb410_p_gbtabscale,     152
-.equiv          nb410_GBtab,            160
-.equiv          nb410_p_nthreads,       168
-.equiv          nb410_count,            176
-.equiv          nb410_mtx,              184
-.equiv          nb410_outeriter,        192
-.equiv          nb410_inneriter,        200
-.equiv          nb410_work,             208
-	;# stack offsets for local variables  
-	;# bottom of stack is cache-aligned for sse use 
-.equiv          nb410_ix,               0
-.equiv          nb410_iy,               16
-.equiv          nb410_iz,               32
-.equiv          nb410_iq,               48
-.equiv          nb410_dx,               64
-.equiv          nb410_dy,               80
-.equiv          nb410_dz,               96
-.equiv          nb410_two,              112
-.equiv          nb410_six,              128
-.equiv          nb410_twelve,           144
-.equiv          nb410_gbtsc,            160
-.equiv          nb410_qq,               176
-.equiv          nb410_c6,               192
-.equiv          nb410_c12,              208
-.equiv          nb410_fscal,            224
-.equiv          nb410_vctot,            240
-.equiv          nb410_Vvdwtot,          256
-.equiv          nb410_fix,              272
-.equiv          nb410_fiy,              288
-.equiv          nb410_fiz,              304
-.equiv          nb410_half,             320
-.equiv          nb410_three,            336
-.equiv          nb410_r,                352
-.equiv          nb410_isai,             368
-.equiv          nb410_isaprod,          384
-.equiv          nb410_dvdasum,          400
-.equiv          nb410_gbscale,          416
-.equiv          nb410_nri,              432
-.equiv          nb410_iinr,             440
-.equiv          nb410_jindex,           448
-.equiv          nb410_jjnr,             456
-.equiv          nb410_shift,            464
-.equiv          nb410_shiftvec,         472
-.equiv          nb410_facel,            480
-.equiv          nb410_innerjjnr,        488
-.equiv          nb410_is3,              496
-.equiv          nb410_ii3,              500
-.equiv          nb410_ii,               504
-.equiv          nb410_ntia,             508
-.equiv          nb410_innerk,           512
-.equiv          nb410_n,                516
-.equiv          nb410_nn1,              520
-.equiv          nb410_ntype,            524
-.equiv          nb410_nouter,           528
-.equiv          nb410_ninner,           532
-.equiv          nb410_jnra,             536
-.equiv          nb410_jnrb,             540
-.equiv          nb410_jnrc,             544
-.equiv          nb410_jnrd,             548
-
-	push rbp
-	mov  rbp, rsp
-	push rbx
-
-	
-	emms
-
-        push r12
-        push r13
-        push r14
-        push r15
-
-	sub rsp, 568		;# local variable stack space (n*16+8)
-
-	;# zero 32-bit iteration counters
-	mov eax, 0
-	mov [rsp + nb410_nouter], eax
-	mov [rsp + nb410_ninner], eax
-
-	mov edi, [rdi]
-	mov [rsp + nb410_nri], edi
-	mov [rsp + nb410_iinr], rsi
-	mov [rsp + nb410_jindex], rdx
-	mov [rsp + nb410_jjnr], rcx
-	mov [rsp + nb410_shift], r8
-	mov [rsp + nb410_shiftvec], r9
-	mov rdi, [rbp + nb410_p_ntype]
-	mov edi, [rdi]
-	mov [rsp + nb410_ntype], edi
-	mov rsi, [rbp + nb410_p_facel]
-	movss xmm0, [rsi]
-	movss [rsp + nb410_facel], xmm0
-
-	mov rbx, [rbp + nb410_p_gbtabscale]
-	movss xmm4, [rbx]
-	shufps xmm4, xmm4, 0
-	movaps [rsp + nb410_gbtsc], xmm4
-
-
-	;# create constant floating-point factors on stack
-	mov eax, 0x3f000000     ;# half in IEEE (hex)
-	mov [rsp + nb410_half], eax
-	movss xmm1, [rsp + nb410_half]
-	shufps xmm1, xmm1, 0    ;# splat to all elements
-	movaps xmm2, xmm1       
-	addps  xmm2, xmm2	;# one
-	movaps xmm3, xmm2
-	addps  xmm2, xmm2	;# two
-	addps  xmm3, xmm2	;# three
-	movaps xmm4, xmm3
-	addps  xmm4, xmm4	;# six
-	movaps xmm5, xmm4
-	addps  xmm5, xmm5	;# twelve
-	movaps [rsp + nb410_half],  xmm1
-	movaps [rsp + nb410_two],  xmm2
-	movaps [rsp + nb410_three],  xmm3
-	movaps [rsp + nb410_six],  xmm4
-	movaps [rsp + nb410_twelve],  xmm5
-
-.nb410_threadloop:
-        mov   rsi, [rbp + nb410_count]          ;# pointer to sync counter
-        mov   eax, [rsi]
-.nb410_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+10
-        lock
-        cmpxchg [esi], ebx                      ;# write nn1 to *counter,
-                                                ;# if it hasnt changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb410_spinlock
-
-        ;# if(nn1>nri) nn1=nri
-        mov ecx, [rsp + nb410_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [rsp + nb410_n], eax
-        mov [rsp + nb410_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists
-	mov esi, eax				;# copy n to esi
-        jg  .nb410_outerstart
-        jmp .nb410_end
-
-.nb410_outerstart:
-	;# ebx contains number of outer iterations
-	add ebx, [rsp + nb410_nouter]
-	mov [rsp + nb410_nouter], ebx
-
-.nb410_outer:
-	mov   rax, [rsp + nb410_shift]      ;# rax = pointer into shift[] 
-	mov   ebx, [rax+rsi*4]		;# ebx=shift[n] 
-	
-	lea   rbx, [rbx + rbx*2]    ;# rbx=3*is 
-	mov   [rsp + nb410_is3],ebx    	;# store is3 
-
-	mov   rax, [rsp + nb410_shiftvec]   ;# rax = base of shiftvec[] 
-
-	movss xmm0, [rax + rbx*4]
-	movss xmm1, [rax + rbx*4 + 4]
-	movss xmm2, [rax + rbx*4 + 8] 
-
-	mov   rcx, [rsp + nb410_iinr]       ;# rcx = pointer into iinr[] 	
-	mov   ebx, [rcx + rsi*4]	    ;# ebx =ii 
-	mov   [rsp + nb410_ii], ebx
-
-	mov   rdx, [rbp + nb410_charge]
-	movss xmm3, [rdx + rbx*4]	
-	mulss xmm3, [rsp + nb410_facel]
-	shufps xmm3, xmm3, 0
-
-	mov   rdx, [rbp + nb410_invsqrta]	;# load invsqrta[ii]
-	movss xmm4, [rdx + rbx*4]
-	shufps xmm4, xmm4, 0
-
-    	mov   rdx, [rbp + nb410_type] 
-    	mov   edx, [rdx + rbx*4]
-    	imul  edx, [rsp + nb410_ntype]
-    	shl   edx, 1
-    	mov   [rsp + nb410_ntia], edx
-	
-	lea   rbx, [rbx + rbx*2]	;# rbx = 3*ii=ii3 
-	mov   rax, [rbp + nb410_pos]    ;# rax = base of pos[]  
-
-	addss xmm0, [rax + rbx*4]
-	addss xmm1, [rax + rbx*4 + 4]
-	addss xmm2, [rax + rbx*4 + 8]
-
-	movaps [rsp + nb410_iq], xmm3
-	movaps [rsp + nb410_isai], xmm4
-	
-	shufps xmm0, xmm0, 0
-	shufps xmm1, xmm1, 0
-	shufps xmm2, xmm2, 0
-
-	movaps [rsp + nb410_ix], xmm0
-	movaps [rsp + nb410_iy], xmm1
-	movaps [rsp + nb410_iz], xmm2
-
-	mov   [rsp + nb410_ii3], ebx
-	
-	;# clear vctot and i forces 
-	xorps xmm13, xmm13
-	movaps xmm12, xmm13
-	movaps [rsp + nb410_Vvdwtot], xmm13
-	movaps [rsp + nb410_dvdasum], xmm13
-	movaps xmm14, xmm13
-	movaps xmm15, xmm13
-	
-	mov   rax, [rsp + nb410_jindex]
-	mov   ecx, [rax + rsi*4]	     ;# jindex[n] 
-	mov   edx, [rax + rsi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   rsi, [rbp + nb410_pos]
-	mov   rdi, [rbp + nb410_faction]	
-	mov   rax, [rsp + nb410_jjnr]
-	shl   ecx, 2
-	add   rax, rcx
-	mov   [rsp + nb410_innerjjnr], rax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  4
-	add   ecx, [rsp + nb410_ninner]
-	mov   [rsp + nb410_ninner], ecx
-	add   edx, 0
-	mov   [rsp + nb410_innerk], edx    ;# number of innerloop atoms 
-	jge   .nb410_unroll_loop
-	jmp   .nb410_finish_inner
-.nb410_unroll_loop:	
-	;# quad-unroll innerloop here 
-	mov   rdx, [rsp + nb410_innerjjnr]     ;# pointer to jjnr[k] 
-	mov   eax, [rdx]	
-	mov   ebx, [rdx + 4]              
-	mov   ecx, [rdx + 8]            
-	mov   edx, [rdx + 12]         ;# eax-edx=jnr1-4 
-
-	add qword ptr [rsp + nb410_innerjjnr],  16 ;# advance pointer (unrolled 4) 
-
-	;# load isaj
-	mov rsi, [rbp + nb410_invsqrta]
-	movss xmm3, [rsi + rax*4]
-	movss xmm4, [rsi + rcx*4]
-	movss xmm6, [rsi + rbx*4]
-	movss xmm7, [rsi + rdx*4]
-	movaps xmm2, [rsp + nb410_isai]
-	shufps xmm3, xmm6, 0 
-	shufps xmm4, xmm7, 0 
-	shufps xmm3, xmm4, 136  ;# 10001000 ;# all isaj in xmm3 
-	mulps  xmm2, xmm3
-	
-	movaps [rsp + nb410_isaprod], xmm2	
-	movaps xmm1, xmm2
-	mulps xmm1, [rsp + nb410_gbtsc]
-	movaps [rsp + nb410_gbscale], xmm1
-	
-	mov rsi, [rbp + nb410_charge]    ;# base of charge[] 
-	
-	movss xmm3, [rsi + rax*4]
-	movss xmm4, [rsi + rcx*4]
-	movss xmm6, [rsi + rbx*4]
-	movss xmm7, [rsi + rdx*4]
-
-	mulps xmm2, [rsp + nb410_iq]
-	shufps xmm3, xmm6, 0 
-	shufps xmm4, xmm7, 0 
-	shufps xmm3, xmm4, 136  ;# 10001000 ;# all charges in xmm3  
-	mulps  xmm3, xmm2
-	movaps [rsp + nb410_qq], xmm3	
-	
-    ;# vdw parameters
-	mov rsi, [rbp + nb410_type]
-	mov r12d, [rsi + rax*4]
-	mov r13d, [rsi + rbx*4]
-	mov r14d, [rsi + rcx*4]
-	mov r15d, [rsi + rdx*4]
-	shl r12d, 1	
-	shl r13d, 1	
-	shl r14d, 1	
-	shl r15d, 1	
-    mov edi, [rsp + nb410_ntia]
-	add r12d, edi
-	add r13d, edi
-	add r14d, edi
-	add r15d, edi
-
-	mov rsi, [rbp + nb410_vdwparam]
-	movlps xmm3, [rsi + r12*4]
-	movlps xmm7, [rsi + r14*4]
-	movhps xmm3, [rsi + r13*4]
-	movhps xmm7, [rsi + r15*4]
-
-	movaps xmm0, xmm3
-	shufps xmm0, xmm7, 136  ;# 10001000
-	shufps xmm3, xmm7, 221  ;# 11011101
-
-    movaps [rsp + nb410_c6], xmm0
-    movaps [rsp + nb410_c12], xmm3
-    
-	mov rsi, [rbp + nb410_pos]       ;# base of pos[] 
-	
-	lea   r8, [rax + rax*2]     ;# jnr 
-	lea   r9, [rbx + rbx*2]	
-	lea   r10, [rcx + rcx*2]    
-	lea   r11, [rdx + rdx*2]	
-
-	;# move four coordinates to xmm0-xmm2 	
-	movlps xmm4, [rsi + r8*4]
-	movlps xmm5, [rsi + r10*4]
-	movss xmm2, [rsi + r8*4 + 8]
-	movss xmm6, [rsi + r10*4 + 8]
-
-	movhps xmm4, [rsi + r9*4]
-	movhps xmm5, [rsi + r11*4]
-
-	movss xmm0, [rsi + r9*4 + 8]
-	movss xmm1, [rsi + r11*4 + 8]
-
-	shufps xmm2, xmm0, 0
-	shufps xmm6, xmm1, 0
-	
-	movaps xmm0, xmm4
-	movaps xmm1, xmm4
-
-	shufps xmm2, xmm6, 136  ;# 10001000
-	
-	shufps xmm0, xmm5, 136  ;# 10001000
-	shufps xmm1, xmm5, 221  ;# 11011101		
-
-	;# calc dr 
-	subps xmm0, [rsp + nb410_ix]
-	subps xmm1, [rsp + nb410_iy]
-	subps xmm2, [rsp + nb410_iz]
-
-	;# store dr 
-	movaps [rsp + nb410_dx], xmm0
-	movaps [rsp + nb410_dy], xmm1
-	movaps [rsp + nb410_dz], xmm2
-
-	;# square it 
-	mulps xmm0,xmm0
-	mulps xmm1,xmm1
-	mulps xmm2,xmm2
-	addps xmm0, xmm1
-	addps xmm0, xmm2
-    movaps xmm4, xmm0
-	;# rsq in xmm4 
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [rsp + nb410_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [rsp + nb410_half]
-	subps xmm1, xmm5	;# 30-rsq*lu*lu 
-	mulps xmm1, xmm2	
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r
-	movaps [rsp + nb410_r], xmm4
-	mulps xmm4, [rsp + nb410_gbscale]
-
-    ;# truncate and convert to integers
-    cvttps2dq xmm5, xmm4
-    
-    ;# convert back to float
-    cvtdq2ps  xmm6, xmm5
-    
-    ;# multiply by 4
-    pslld   xmm5, 2
-
-    ;# move to integer registers
-    movhlps xmm7, xmm5
-    movd    r12d, xmm5
-    movd    r14d, xmm7
-    pshufd  xmm5, xmm5, 1
-    pshufd  xmm7, xmm7, 1
-    movd    r13d, xmm5
-    movd    r15d, xmm7
-    
-    ;# calculate eps
-    subps     xmm4, xmm6
-    movaps    xmm1, xmm4 ;#eps
-    
-	mov  rsi, [rbp + nb410_GBtab]
-
-    movaps xmm9, xmm0 ;# rinv
-    mulps  xmm9, xmm9 ;# rinvsq
-    movaps xmm10, xmm9 ;# rinvsq
-    mulps  xmm10, xmm10 ;# rinv4
-    mulps  xmm10, xmm9 ;# rinv6
-    movaps xmm11, xmm10 
-    mulps  xmm11, xmm11 ;# rinv12
-
-    ;# load table data
-   	movlps xmm5, [rsi + r12*4]
-	movlps xmm7, [rsi + r14*4]
-	movhps xmm5, [rsi + r13*4]
-	movhps xmm7, [rsi + r15*4]
-
-    movaps xmm4, xmm5
-	shufps xmm4, xmm7, 136  ;# 10001000
-	shufps xmm5, xmm7, 221  ;# 11011101
-
-    mulps  xmm10, [rsp + nb410_c6]    ;# vvdw6=c6*rinv6
-	mulps  xmm11, [rsp + nb410_c12]   ;# vvdw12=c12*rinv12     
-
-	movaps xmm9, xmm11
-	subps  xmm11, xmm10	;# Vvdw=Vvdw12-Vvdw6
-
-    ;# add potential to vvdwtot 
-	addps  xmm11, [rsp + nb410_Vvdwtot]
-    movaps [rsp + nb410_Vvdwtot], xmm11
-    
-	movlps xmm7, [rsi + r12*4 + 8]   
-	movlps xmm8, [rsi + r14*4 + 8]
-	movhps xmm7, [rsi + r13*4 + 8]
-	movhps xmm8, [rsi + r15*4 + 8]
-
-    movaps xmm6, xmm7
-    
-	shufps xmm6, xmm8, 136  ;# 10001000
-	shufps xmm7, xmm8, 221  ;# 11011101
-    ;# table data ready in xmm4-xmm7
-
-    mulps  xmm7, xmm1   ;# Heps
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm1	;# Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	addps  xmm7, xmm7	;# two*Heps2 
-	movaps xmm3, [rsp + nb410_qq]
-	addps  xmm7, xmm6
-	addps  xmm7, xmm5 ;# xmm7=FF 
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulps  xmm3, xmm7 ;# fijC=FF*qq 
-	;# at this point xmm5 contains vcoul and xmm3 fijC
-
-    ;# LJ forces
-    mulps  xmm10, [rsp + nb410_six]
-    mulps  xmm9, [rsp + nb410_twelve]
-    subps  xmm9, xmm10
-    mulps  xmm9, xmm0 ;# (12*vnb12-6*vnb6)*rinv
-
-	mov rsi, [rbp + nb410_dvda]
-	
-	;# Calculate dVda
-	xorps  xmm7, xmm7
-	mulps xmm3, [rsp + nb410_gbscale]
-	movaps xmm6, xmm3
-	mulps  xmm6, [rsp + nb410_r]
-	addps  xmm6, xmm5
-    
-    ;# increment vctot (sum in xmm12)
-	addps  xmm12, xmm5
-
-	;# xmm6=(vcoul+fijC*r)
-	subps  xmm7, xmm6
-	movaps xmm6, xmm7
-	
-    ;# update dvdasum
-    addps  xmm7, [rsp + nb410_dvdasum]
-    movaps [rsp + nb410_dvdasum], xmm7 
-
-	;# update j atoms dvdaj
-	movhlps xmm7, xmm6
-	movaps  xmm5, xmm6
-	movaps  xmm4, xmm7
-	shufps  xmm5, xmm5, 0x1
-	shufps  xmm4, xmm4, 0x1
-
-	;# xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4
-	addss  xmm6, [rsi + rax*4]
-	addss  xmm5, [rsi + rbx*4]
-	addss  xmm7, [rsi + rcx*4]
-	addss  xmm4, [rsi + rdx*4]
-	movss  [rsi + rax*4], xmm6
-	movss  [rsi + rbx*4], xmm5
-	movss  [rsi + rcx*4], xmm7
-	movss  [rsi + rdx*4], xmm4
-
-    subps  xmm9, xmm3
-    mulps  xmm9, xmm0 ;# fscal
-
-    movaps  xmm10, xmm9
-    movaps  xmm11, xmm9
-
-    mulps   xmm9, [rsp + nb410_dx]
-    mulps   xmm10, [rsp + nb410_dy]
-    mulps   xmm11, [rsp + nb410_dz]
-    
-	;# accumulate i forces
-    addps xmm13, xmm9
-    addps xmm14, xmm10
-    addps xmm15, xmm11
-
-	mov rsi, [rbp + nb410_faction]
-	;# the fj's - start by accumulating x & y forces from memory 
-	movlps xmm0, [rsi + r8*4] ;# x1 y1 - -
-	movlps xmm1, [rsi + r10*4] ;# x3 y3 - -
-	movhps xmm0, [rsi + r9*4] ;# x1 y1 x2 y2
-	movhps xmm1, [rsi + r11*4] ;# x3 y3 x4 y4
-
-    movaps xmm8, xmm9
-    unpcklps xmm9, xmm10 ;# x1 y1 x2 y2
-    unpckhps xmm8, xmm10 ;# x3 y3 x4 y4
-    
-    ;# update fjx and fjy
-	addps  xmm0, xmm9
-	addps  xmm1, xmm8
-	
-	movlps [rsi + r8*4], xmm0
-	movlps [rsi + r10*4], xmm1
-	movhps [rsi + r9*4], xmm0
-	movhps [rsi + r11*4], xmm1
-    
-    ;# xmm11: fjz1 fjz2 fjz3 fjz4
-    pshufd  xmm10, xmm11, 1  ;# fjz2 - - -
-    movhlps xmm9,  xmm11     ;# fjz3 - - -
-    pshufd  xmm8,  xmm11, 3  ;# fjz4 - - -
-    
-	addss  xmm11, [rsi + r8*4 + 8]
-	addss  xmm10, [rsi + r9*4 + 8]
-	addss  xmm9,  [rsi + r10*4 + 8]
-	addss  xmm8,  [rsi + r11*4 + 8]    
-	movss  [rsi + r8*4 + 8], xmm11
-	movss  [rsi + r9*4 + 8], xmm10
-	movss  [rsi + r10*4 + 8], xmm9
-	movss  [rsi + r11*4 + 8], xmm8
-	
-	;# should we do one more iteration? 
-	sub dword ptr [rsp + nb410_innerk],  4
-	jl    .nb410_finish_inner
-	jmp   .nb410_unroll_loop
-.nb410_finish_inner:
-	;# check if at least two particles remain 
-	add dword ptr [rsp + nb410_innerk],  4
-	mov   edx, [rsp + nb410_innerk]
-	and   edx, 2
-	jnz   .nb410_dopair
-	jmp   .nb410_checksingle
-.nb410_dopair:	
-	mov   rcx, [rsp + nb410_innerjjnr]
-	
-	mov   eax, [rcx]	
-	mov   ebx, [rcx + 4]              
-	add qword ptr [rsp + nb410_innerjjnr],  8
-
-	;# load isaj
-	mov rsi, [rbp + nb410_invsqrta]
-	movss xmm2, [rsi + rax*4]
-	movss xmm6, [rsi + rbx*4]
-    unpcklps xmm2, xmm6
-
-	mulps  xmm2, [rsp + nb410_isai]
-	
-	movaps [rsp + nb410_isaprod], xmm2	
-	movaps xmm1, xmm2
-	mulps xmm1, [rsp + nb410_gbtsc]
-	movaps [rsp + nb410_gbscale], xmm1
-	
-    mulps xmm2, [rsp + nb410_iq]
-	mov rsi, [rbp + nb410_charge]    ;# base of charge[] 
-	movss xmm3, [rsi + rax*4]
-	movss xmm6, [rsi + rbx*4]
-    unpcklps xmm3, xmm6
-    
-
-	mulps xmm3, xmm2
-	movaps [rsp + nb410_qq], xmm3	
-	
-     ;# vdw parameters
-	mov rsi, [rbp + nb410_type]
-	mov r12d, [rsi + rax*4]
-	mov r13d, [rsi + rbx*4]
-	shl r12d, 1	
-	shl r13d, 1	
-    mov edi, [rsp + nb410_ntia]
-	add r12d, edi
-	add r13d, edi
-
-	mov rsi, [rbp + nb410_vdwparam]
-	movlps xmm3, [rsi + r12*4]
-	movhps xmm3, [rsi + r13*4]
-
-    xorps xmm7, xmm7
-	movaps xmm0, xmm3
-	shufps xmm0, xmm7, 136  ;# 10001000
-	shufps xmm3, xmm7, 221  ;# 11011101
-
-    movaps [rsp + nb410_c6], xmm0
-    movaps [rsp + nb410_c12], xmm3
-
-	mov rsi, [rbp + nb410_pos]       ;# base of pos[] 
-	
-	lea   r8, [rax + rax*2]     ;# j3
-	lea   r9, [rbx + rbx*2]	
-
-	;# move four coordinates to xmm0-xmm2 	
-	movlps xmm4, [rsi + r8*4]	;# x1 y1 - - 
-	movlps xmm5, [rsi + r9*4]	;# x2 y2 - - 
-
-	movss xmm6, [rsi + r8*4 + 8]	;# z1 - - - 
-	movss xmm7, [rsi + r9*4 + 8]	;# z2 - - - 
-
-    unpcklps xmm4, xmm5 ;# x1 x2 y1 y2
-    movhlps  xmm5, xmm4 ;# y1 y2 -  -
-    unpcklps xmm6, xmm7 ;# z1 z2 -  -
-    
-	;# calc dr 
-	subps xmm4, [rsp + nb410_ix]
-	subps xmm5, [rsp + nb410_iy]
-	subps xmm6, [rsp + nb410_iz]
-
-	;# store dr 
-	movaps [rsp + nb410_dx], xmm4
-	movaps [rsp + nb410_dy], xmm5
-	movaps [rsp + nb410_dz], xmm6
-
-	;# square it 
-	mulps xmm4,xmm4
-	mulps xmm5,xmm5
-	mulps xmm6,xmm6
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [rsp + nb410_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [rsp + nb410_half]
-	subps xmm1, xmm5	;# 30-rsq*lu*lu 
-	mulps xmm1, xmm2	
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r
-	movaps [rsp + nb410_r], xmm4
-	mulps xmm4, [rsp + nb410_gbscale]
-
-    ;# truncate and convert to integers
-    cvttps2dq xmm5, xmm4
-    
-    ;# convert back to float
-    cvtdq2ps  xmm6, xmm5
-    
-    ;# multiply by 4
-    pslld   xmm5, 2
-
-    ;# move to integer registers
-    movd    r12d, xmm5
-    pshufd  xmm5, xmm5, 1
-    movd    r13d, xmm5
-    
-    ;# calculate eps
-    subps     xmm4, xmm6
-    movaps    xmm1, xmm4 ;#eps
-    
-	mov  rsi, [rbp + nb410_GBtab]
-
-    movaps xmm9, xmm0 ;# rinv
-    mulps  xmm9, xmm9 ;# rinvsq
-    movaps xmm10, xmm9 ;# rinvsq
-    mulps  xmm10, xmm10 ;# rinv4
-    mulps  xmm10, xmm9 ;# rinv6
-    movaps xmm11, xmm10 
-    mulps  xmm11, xmm11 ;# rinv12
-
-    ;# load table data
-   	movlps xmm4, [rsi + r12*4]  ;# Y1 F1
-	movlps xmm5, [rsi + r13*4]  ;# Y2 F2
-    unpcklps xmm4, xmm5         ;# Y1 Y2 F1 F2
-    movhlps  xmm5, xmm4         ;# F1 F2
-    
-    mulps  xmm10, [rsp + nb410_c6]    ;# vvdw6=c6*rinv6
-	mulps  xmm11, [rsp + nb410_c12]   ;# vvdw12=c12*rinv12     
-
-	movaps xmm9, xmm11
-	subps  xmm11, xmm10	;# Vvdw=Vvdw12-Vvdw6
-
-    ;# add potential to vvdwtot 
-	addps  xmm11, [rsp + nb410_Vvdwtot]
-    movlps [rsp + nb410_Vvdwtot], xmm11
-    
-   	movlps xmm6, [rsi + r12*4 + 8]   ;# G1 H1
-	movlps xmm7, [rsi + r13*4 + 8]   ;# G2 H2
-    unpcklps xmm6, xmm7              ;# G1 G2
-    movhlps  xmm7, xmm6              ;# H1 H2
-    ;# table data ready in xmm4-xmm7
-
-    mulps  xmm7, xmm1   ;# Heps
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm1	;# Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	addps  xmm7, xmm7	;# two*Heps2 
-	movaps xmm3, [rsp + nb410_qq]
-
-	addps  xmm7, xmm6
-	addps  xmm7, xmm5 ;# xmm7=FF 
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulps  xmm3, xmm7 ;# fijC=FF*qq 
-	;# at this point xmm5 contains vcoul and xmm3 fijC
-
-    ;# LJ forces
-    mulps  xmm10, [rsp + nb410_six]
-    mulps  xmm9,  [rsp + nb410_twelve]
-    subps  xmm9, xmm10
-    mulps  xmm9, xmm0 ;# (12*vnb12-6*vnb6)*rinv
-
-    ;# zero upper part of vcoul 
-    xorps xmm2, xmm2
-    movlhps xmm5, xmm2
-    
-	mov rsi, [rbp + nb410_dvda]
-	
-	;# Calculate dVda
-	xorps  xmm7, xmm7
-	mulps xmm3, [rsp + nb410_gbscale]
-	movaps xmm6, xmm3
-	mulps  xmm6, [rsp + nb410_r]
-	addps  xmm6, xmm5
-    
-    xorps  xmm4, xmm4
-    ;# increment vctot (sum in xmm12)
-	addps  xmm12, xmm5
-
-	;# xmm6=(vcoul+fijC*r)
-	subps  xmm7, xmm6
-	movaps xmm6, xmm7
-
-    ;# zero upper half of dvda
-    movlhps xmm7, xmm4
-    
-    ;# update dvdasum
-    addps  xmm7, [rsp + nb410_dvdasum]
-    movaps [rsp + nb410_dvdasum], xmm7 
-
-	;# update j atoms dvdaj
-	movaps  xmm5, xmm6
-	shufps  xmm5, xmm5, 0x1
-
-	;# xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4
-	addss  xmm6, [rsi + rax*4]
-	addss  xmm5, [rsi + rbx*4]
-	movss  [rsi + rax*4], xmm6
-	movss  [rsi + rbx*4], xmm5
-
-    xorps xmm7, xmm7
-    
-    subps  xmm9, xmm3
-    mulps  xmm9, xmm0 ;# fscal
-
-    movaps  xmm10, xmm9
-    movaps  xmm11, xmm9
-
-    mulps   xmm9, [rsp + nb410_dx]
-    mulps   xmm10, [rsp + nb410_dy]
-    mulps   xmm11, [rsp + nb410_dz]
-
-    movlhps  xmm9, xmm7
-    movlhps  xmm10, xmm7
-    movlhps  xmm11, xmm7
-    
-	;# accumulate i forces
-    addps xmm13, xmm9
-    addps xmm14, xmm10
-    addps xmm15, xmm11
-
-	mov rsi, [rbp + nb410_faction]
-	;# the fj's - start by accumulating x & y forces from memory 
-	movlps xmm0, [rsi + r8*4] ;# x1 y1 - -
-	movhps xmm0, [rsi + r9*4] ;# x1 y1 x2 y2
-
-    unpcklps xmm9, xmm10  ;# x1 y1 x2 y2
-    addps    xmm0, xmm9
-
-	movlps [rsi + r8*4], xmm0
-	movhps [rsi + r9*4], xmm0
-    
-    ;# z forces
-    pshufd xmm8, xmm11, 1
-    addss  xmm11, [rsi + r8*4 + 8] 
-    addss  xmm8,  [rsi + r9*4 + 8]
-    movss  [rsi + r8*4 + 8], xmm11
-    movss  [rsi + r9*4 + 8], xmm8
-
-.nb410_checksingle:				
-	mov   edx, [rsp + nb410_innerk]
-	and   edx, 1
-	jnz    .nb410_dosingle
-	jmp    .nb410_updateouterdata
-.nb410_dosingle:
-	mov rsi, [rbp + nb410_charge]
-	mov rdx, [rbp + nb410_invsqrta]
-	mov rdi, [rbp + nb410_pos]
-	mov   rcx, [rsp + nb410_innerjjnr]
-	mov   eax, [rcx]	
-
-	;# load isaj
-	mov rsi, [rbp + nb410_invsqrta]
-	movss xmm3, [rsi + rax*4]
-	movaps xmm2, [rsp + nb410_isai]
-	mulss  xmm2, xmm3
-	
-	movss [rsp + nb410_isaprod], xmm2	
-	movaps xmm1, xmm2
-	mulss xmm1, [rsp + nb410_gbtsc]
-	movss [rsp + nb410_gbscale], xmm1
-	
-    mulss xmm2, [rsp + nb410_iq]
-	mov rsi, [rbp + nb410_charge]    ;# base of charge[] 
-
-	movss xmm3, [rsi + rax*4]
-	mulss xmm3, xmm2
-	movss [rsp + nb410_qq], xmm3	
-	
-    ;# vdw parameters
-	mov rsi, [rbp + nb410_type]
-	mov r12d, [rsi + rax*4]
-	shl r12d, 1	
-    mov edi, [rsp + nb410_ntia]
-	add r12d, edi
-
-	mov rsi, [rbp + nb410_vdwparam]
-	movss xmm0, [rsi + r12*4]
-	movss xmm3, [rsi + r12*4 + 4]
-    movaps [rsp + nb410_c6], xmm0
-    movaps [rsp + nb410_c12], xmm3
-
-	mov rsi, [rbp + nb410_pos]       ;# base of pos[] 
-	
-	lea   r8, [rax + rax*2]     ;# jnr 
-
-	;# move four coordinates to xmm0-xmm2 	
-	movss xmm4, [rsi + r8*4]	
-	movss xmm5, [rsi + r8*4 + 4]	
-	movss xmm6, [rsi + r8*4 + 8]
-    
-	;# calc dr 
-	subss xmm4, [rsp + nb410_ix]
-	subss xmm5, [rsp + nb410_iy]
-	subss xmm6, [rsp + nb410_iz]
-
-	;# store dr 
-	movaps [rsp + nb410_dx], xmm4
-	movaps [rsp + nb410_dy], xmm5
-	movaps [rsp + nb410_dz], xmm6
-
-	;# square it 
-	mulss xmm4,xmm4
-	mulss xmm5,xmm5
-	mulss xmm6,xmm6
-	addss xmm4, xmm5
-	addss xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtss xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulss xmm5, xmm5
-	movaps xmm1, [rsp + nb410_three]
-	mulss xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [rsp + nb410_half]
-	subss xmm1, xmm5	;# 30-rsq*lu*lu 
-	mulss xmm1, xmm2	
-	mulss xmm0, xmm1	;# xmm0=rinv 
-	mulss xmm4, xmm0	;# xmm4=r
-	movaps [rsp + nb410_r], xmm4
-	mulss xmm4, [rsp + nb410_gbscale]
-
-    ;# truncate and convert to integers
-    cvttss2si r12d, xmm4
-    
-    ;# convert back to float
-    cvtsi2ss  xmm6, r12d
-    
-    ;# multiply by 4
-    shl   r12d, 2
-
-    ;# calculate eps
-    subss     xmm4, xmm6
-    movaps    xmm1, xmm4 ;#eps
-    
-	mov  rsi, [rbp + nb410_GBtab]
-
-    movaps xmm9, xmm0 ;# rinv
-    mulss  xmm9, xmm9 ;# rinvsq
-    movaps xmm10, xmm9 ;# rinvsq
-    mulss  xmm10, xmm10 ;# rinv4
-    mulss  xmm10, xmm9 ;# rinv6
-    movaps xmm11, xmm10 
-    mulss  xmm11, xmm11 ;# rinv12
-
-    ;# load table data
-   	movss xmm4, [rsi + r12*4]
-	movss xmm5, [rsi + r12*4 + 4]
-   	movss xmm6, [rsi + r12*4 + 8]
-	movss xmm7, [rsi + r12*4 + 12]
-    ;# table data ready in xmm4-xmm7
-
-    mulss  xmm10, [rsp + nb410_c6]    ;# vvdw6=c6*rinv6
-	mulss  xmm11, [rsp + nb410_c12]   ;# vvdw12=c12*rinv12     
-
-	movaps xmm9, xmm11
-	subss  xmm11, xmm10	;# Vvdw=Vvdw12-Vvdw6
-
-    ;# add potential to vvdwtot 
-	addss  xmm11, [rsp + nb410_Vvdwtot]
-    movss [rsp + nb410_Vvdwtot], xmm11    
-
-    mulss  xmm7, xmm1   ;# Heps
-	mulss  xmm6, xmm1	;# xmm6=Geps 
-	mulss  xmm7, xmm1	;# Heps2 
-	addss  xmm5, xmm6
-	addss  xmm5, xmm7	;# xmm5=Fp 	
-	addss  xmm7, xmm7	;# two*Heps2 
-	movss  xmm3, [rsp + nb410_qq]
-	addss  xmm7, xmm6
-	addss  xmm7, xmm5 ;# xmm7=FF 
-	mulss  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addss  xmm5, xmm4 ;# xmm5=VV 
-	mulss  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulss  xmm3, xmm7 ;# fijC=FF*qq 
-	;# at this point xmm5 contains vcoul and xmm3 fijC
-
-    ;# LJ forces
-    mulss  xmm10, [rsp + nb410_six]
-    mulss  xmm9,  [rsp + nb410_twelve]
-    subss  xmm9, xmm10
-    mulss  xmm9, xmm0 ;# (12*vnb12-6*vnb6)*rinv
-
-	mov rsi, [rbp + nb410_dvda]
-	
-	;# Calculate dVda
-	xorps  xmm7, xmm7
-	mulss xmm3, [rsp + nb410_gbscale]
-	movaps xmm6, xmm3
-	mulss  xmm6, [rsp + nb410_r]
-	addss  xmm6, xmm5
-    
-    ;# increment vctot (sum in xmm12)
-	addss  xmm12, xmm5
-
-	;# xmm6=(vcoul+fijC*r)
-	subss  xmm7, xmm6
-	movaps xmm6, xmm7
-
-    ;# update dvdasum
-    addss  xmm7, [rsp + nb410_dvdasum]
-    movss [rsp + nb410_dvdasum], xmm7 
-
-	;# update j atoms dvdaj
-	addss  xmm6, [rsi + rax*4]
-	movss  [rsi + rax*4], xmm6
-
-    subss  xmm9, xmm3
-    mulss  xmm9, xmm0 ;# fscal
-
-    movaps  xmm10, xmm9
-    movaps  xmm11, xmm9
-
-    mulss   xmm9, [rsp + nb410_dx]
-    mulss   xmm10, [rsp + nb410_dy]
-    mulss   xmm11, [rsp + nb410_dz]
-    
-	;# accumulate i forces
-    addss xmm13, xmm9
-    addss xmm14, xmm10
-    addss xmm15, xmm11
-
-	mov rsi, [rbp + nb410_faction]
-    ;# add to j forces
-    addss  xmm9,  [rsi + r8*4]
-    addss  xmm10, [rsi + r8*4 + 4]
-    addss  xmm11, [rsi + r8*4 + 8]
-    movss  [rsi + r8*4],     xmm9
-    movss  [rsi + r8*4 + 4], xmm10
-    movss  [rsi + r8*4 + 8], xmm11
-    
-.nb410_updateouterdata:
-	mov   ecx, [rsp + nb410_ii3]
-	mov   rdi, [rbp + nb410_faction]
-	mov   rsi, [rbp + nb410_fshift]
-	mov   edx, [rsp + nb410_is3]
-
-	;# accumulate i forces in xmm13, xmm14, xmm15
-	movhlps xmm0, xmm13
-	movhlps xmm1, xmm14
-	movhlps xmm2, xmm15
-	addps  xmm0, xmm13
-	addps  xmm1, xmm14
-	addps  xmm2, xmm15 
-    movaps xmm3, xmm0	
-	movaps xmm4, xmm1	
-	movaps xmm5, xmm2	
-	shufps xmm3, xmm3, 1
-	shufps xmm4, xmm4, 1
-	shufps xmm5, xmm5, 1
-	addss  xmm0, xmm3
-	addss  xmm1, xmm4
-	addss  xmm2, xmm5	;# xmm0-xmm2 has single force in pos0 
-
-
-	;# increment i force 
-	movss  xmm3, [rdi + rcx*4]
-	movss  xmm4, [rdi + rcx*4 + 4]
-	movss  xmm5, [rdi + rcx*4 + 8]
-	subss  xmm3, xmm0
-	subss  xmm4, xmm1
-	subss  xmm5, xmm2
-	movss  [rdi + rcx*4],     xmm3
-	movss  [rdi + rcx*4 + 4], xmm4
-	movss  [rdi + rcx*4 + 8], xmm5
-
-	;# increment fshift force  
-	movss  xmm3, [rsi + rdx*4]
-	movss  xmm4, [rsi + rdx*4 + 4]
-	movss  xmm5, [rsi + rdx*4 + 8]
-	subss  xmm3, xmm0
-	subss  xmm4, xmm1
-	subss  xmm5, xmm2
-	movss  [rsi + rdx*4],     xmm3
-	movss  [rsi + rdx*4 + 4], xmm4
-	movss  [rsi + rdx*4 + 8], xmm5
-
-	;# get n from stack
-	mov esi, [rsp + nb410_n]
-        ;# get group index for i particle 
-        mov   rdx, [rbp + nb410_gid]      	;# base of gid[]
-        mov   edx, [rdx + rsi*4]		;# ggid=gid[n]
-
-	;# accumulate total potential energy and update it 
-	;# accumulate 
-	movhlps xmm6, xmm12
-	addps  xmm12, xmm6	;# pos 0-1 in xmm12 have the sum now 
-	movaps xmm6, xmm12
-	shufps xmm6, xmm6, 1
-	addss  xmm12, xmm6
-
-	;# add earlier value from mem 
-	mov   rax, [rbp + nb410_Vc]
-	addss xmm12, [rax + rdx*4] 
-	;# move back to mem 
-	movss [rax + rdx*4], xmm12
-	
-	;# accumulate total lj energy and update it 
-	movaps xmm7, [rsp + nb410_Vvdwtot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-
-	;# add earlier value from mem 
-	mov   rax, [rbp + nb410_Vvdw]
-	addss xmm7, [rax + rdx*4] 
-	;# move back to mem 
-	movss [rax + rdx*4], xmm7 
-	
-	;# accumulate dVda and update it 
-	movaps xmm7, [rsp + nb410_dvdasum]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-	
-	mov edx, [rsp + nb410_ii]
-	mov rax, [rbp + nb410_dvda]
-	addss xmm7, [rax + rdx*4]
-	movss [rax + rdx*4], xmm7
-	
-        ;# finish if last 
-        mov ecx, [rsp + nb410_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb410_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [rsp + nb410_n], esi
-        jmp .nb410_outer
-.nb410_outerend:
-        ;# check if more outer neighborlists remain
-        mov   ecx, [rsp + nb410_nri]
-	;# esi already loaded with n above
-        sub   ecx, esi
-        jz .nb410_end
-        ;# non-zero, do one more workunit
-        jmp   .nb410_threadloop
-.nb410_end:
-
-	mov eax, [rsp + nb410_nouter]
-	mov ebx, [rsp + nb410_ninner]
-	mov rcx, [rbp + nb410_outeriter]
-	mov rdx, [rbp + nb410_inneriter]
-	mov [rcx], eax
-	mov [rdx], ebx
-
-	add rsp, 568
-	emms
-
-
-        pop r15
-        pop r14
-        pop r13
-        pop r12
-
-	pop rbx
-	pop	rbp
-	ret
-
-
-
-.globl nb_kernel410nf_x86_64_sse
-.globl _nb_kernel410nf_x86_64_sse
-nb_kernel410nf_x86_64_sse:	
-_nb_kernel410nf_x86_64_sse:	
-;#	Room for return address and rbp (16 bytes)
-.equiv          nb410nf_fshift,         16
-.equiv          nb410nf_gid,            24
-.equiv          nb410nf_pos,            32
-.equiv          nb410nf_faction,        40
-.equiv          nb410nf_charge,         48
-.equiv          nb410nf_p_facel,        56
-.equiv          nb410nf_argkrf,         64
-.equiv          nb410nf_argcrf,         72
-.equiv          nb410nf_Vc,             80
-.equiv          nb410nf_type,           88
-.equiv          nb410nf_p_ntype,        96
-.equiv          nb410nf_vdwparam,       104
-.equiv          nb410nf_Vvdw,           112
-.equiv          nb410nf_p_tabscale,     120
-.equiv          nb410nf_VFtab,          128
-.equiv          nb410nf_invsqrta,       136
-.equiv          nb410nf_dvda,           144
-.equiv          nb410nf_p_gbtabscale,   152
-.equiv          nb410nf_GBtab,          160
-.equiv          nb410nf_p_nthreads,     168
-.equiv          nb410nf_count,          176
-.equiv          nb410nf_mtx,            184
-.equiv          nb410nf_outeriter,      192
-.equiv          nb410nf_inneriter,      200
-.equiv          nb410nf_work,           208
-	;# stack offsets for local variables  
-	;# bottom of stack is cache-aligned for sse use 
-.equiv          nb410nf_ix,             0
-.equiv          nb410nf_iy,             16
-.equiv          nb410nf_iz,             32
-.equiv          nb410nf_iq,             48
-.equiv          nb410nf_gbtsc,          64
-.equiv          nb410nf_qq,             80
-.equiv          nb410nf_c6,             96
-.equiv          nb410nf_c12,            112
-.equiv          nb410nf_vctot,          128
-.equiv          nb410nf_Vvdwtot,        144
-.equiv          nb410nf_half,           160
-.equiv          nb410nf_three,          176
-.equiv          nb410nf_isai,           192
-.equiv          nb410nf_isaprod,        208
-.equiv          nb410nf_gbscale,        224
-.equiv          nb410nf_nri,            240
-.equiv          nb410nf_iinr,           248
-.equiv          nb410nf_jindex,         256
-.equiv          nb410nf_jjnr,           264
-.equiv          nb410nf_shift,          272
-.equiv          nb410nf_shiftvec,       280
-.equiv          nb410nf_facel,          288
-.equiv          nb410nf_innerjjnr,      296
-.equiv          nb410nf_is3,            304
-.equiv          nb410nf_ii3,            308
-.equiv          nb410nf_ntia,           312
-.equiv          nb410nf_innerk,         316
-.equiv          nb410nf_n,              320
-.equiv          nb410nf_nn1,            324
-.equiv          nb410nf_ntype,          328
-.equiv          nb410nf_nouter,         332
-.equiv          nb410nf_ninner,         336
-
-	push rbp
-	mov  rbp, rsp
-	push rbx
-
-	
-	emms
-
-        push r12
-        push r13
-        push r14
-        push r15
-
-	sub rsp, 360		;# local variable stack space (n*16+8)
-
-	;# zero 32-bit iteration counters
-	mov eax, 0
-	mov [rsp + nb410nf_nouter], eax
-	mov [rsp + nb410nf_ninner], eax
-
-	mov edi, [rdi]
-	mov [rsp + nb410nf_nri], edi
-	mov [rsp + nb410nf_iinr], rsi
-	mov [rsp + nb410nf_jindex], rdx
-	mov [rsp + nb410nf_jjnr], rcx
-	mov [rsp + nb410nf_shift], r8
-	mov [rsp + nb410nf_shiftvec], r9
-	mov rdi, [rbp + nb410nf_p_ntype]
-	mov edi, [rdi]
-	mov [rsp + nb410nf_ntype], edi
-	mov rsi, [rbp + nb410nf_p_facel]
-	movss xmm0, [rsi]
-	movss [rsp + nb410nf_facel], xmm0
-
-	mov rbx, [rbp + nb410nf_p_gbtabscale]
-	movss xmm4, [rbx]
-	shufps xmm4, xmm4, 0
-	movaps [rsp + nb410nf_gbtsc],  xmm4
-
-
-	;# create constant floating-point factors on stack
-	mov eax, 0x3f000000     ;# half in IEEE (hex)
-	mov [rsp + nb410nf_half], eax
-	movss xmm1, [rsp + nb410nf_half]
-	shufps xmm1, xmm1, 0    ;# splat to all elements
-	movaps xmm2, xmm1       
-	addps  xmm2, xmm2	;# one
-	movaps xmm3, xmm2
-	addps  xmm2, xmm2	;# two
-	addps  xmm3, xmm2	;# three
-	movaps [rsp + nb410nf_half],  xmm1
-	movaps [rsp + nb410nf_three],  xmm3
-
-.nb410nf_threadloop:
-        mov   rsi, [rbp + nb410nf_count]          ;# pointer to sync counter
-        mov   eax, [rsi]
-.nb410nf_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+10
-        lock
-        cmpxchg [esi], ebx                      ;# write nn1 to *counter,
-                                                ;# if it hasnt changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb410nf_spinlock
-
-        ;# if(nn1>nri) nn1=nri
-        mov ecx, [rsp + nb410nf_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [rsp + nb410nf_n], eax
-        mov [rsp + nb410nf_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists
-	mov esi, eax				;# copy n to esi
-        jg  .nb410nf_outerstart
-        jmp .nb410nf_end
-
-.nb410nf_outerstart:
-	;# ebx contains number of outer iterations
-	add ebx, [rsp + nb410nf_nouter]
-	mov [rsp + nb410nf_nouter], ebx
-
-.nb410nf_outer:
-	mov   rax, [rsp + nb410nf_shift]      ;# rax = pointer into shift[] 
-	mov   ebx, [rax+rsi*4]		;# ebx=shift[n] 
-	
-	lea   rbx, [rbx + rbx*2]    ;# rbx=3*is 
-	mov   [rsp + nb410nf_is3],ebx    	;# store is3 
-
-	mov   rax, [rsp + nb410nf_shiftvec]   ;# rax = base of shiftvec[] 
-
-	movss xmm0, [rax + rbx*4]
-	movss xmm1, [rax + rbx*4 + 4]
-	movss xmm2, [rax + rbx*4 + 8] 
-
-	mov   rcx, [rsp + nb410nf_iinr]       ;# rcx = pointer into iinr[] 	
-	mov   ebx, [rcx + rsi*4]	    ;# ebx =ii
-	
-	mov   rdx, [rbp + nb410nf_charge]
-	movss xmm3, [rdx + rbx*4]	
-	mulss xmm3, [rsp + nb410nf_facel]
-	shufps xmm3, xmm3, 0
-
-	mov   rdx, [rbp + nb410nf_invsqrta]	;# load invsqrta[ii]
-	movss xmm4, [rdx + rbx*4]
-	shufps xmm4, xmm4, 0
-
-    	mov   rdx, [rbp + nb410nf_type] 
-    	mov   edx, [rdx + rbx*4]
-    	imul  edx, [rsp + nb410nf_ntype]
-    	shl   edx, 1
-    	mov   [rsp + nb410nf_ntia], edx
-	
-	lea   rbx, [rbx + rbx*2]	;# rbx = 3*ii=ii3 
-	mov   rax, [rbp + nb410nf_pos]    ;# rax = base of pos[]  
-
-	addss xmm0, [rax + rbx*4]
-	addss xmm1, [rax + rbx*4 + 4]
-	addss xmm2, [rax + rbx*4 + 8]
-
-	movaps [rsp + nb410nf_iq], xmm3
-	movaps [rsp + nb410nf_isai], xmm4
-	
-	shufps xmm0, xmm0, 0
-	shufps xmm1, xmm1, 0
-	shufps xmm2, xmm2, 0
-
-	movaps [rsp + nb410nf_ix], xmm0
-	movaps [rsp + nb410nf_iy], xmm1
-	movaps [rsp + nb410nf_iz], xmm2
-
-	mov   [rsp + nb410nf_ii3], ebx
-	
-	;# clear vctot
-	xorps xmm4, xmm4
-	movaps [rsp + nb410nf_vctot], xmm4
-	movaps [rsp + nb410nf_Vvdwtot], xmm4
-	
-	mov   rax, [rsp + nb410nf_jindex]
-	mov   ecx, [rax + rsi*4]	     ;# jindex[n] 
-	mov   edx, [rax + rsi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   rsi, [rbp + nb410nf_pos]
-	mov   rdi, [rbp + nb410nf_faction]	
-	mov   rax, [rsp + nb410nf_jjnr]
-	shl   ecx, 2
-	add   rax, rcx
-	mov   [rsp + nb410nf_innerjjnr], rax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  4
-	add   ecx, [rsp + nb410nf_ninner]
-	mov   [rsp + nb410nf_ninner], ecx
-	add   edx, 0
-	mov   [rsp + nb410nf_innerk], edx    ;# number of innerloop atoms 
-	jge   .nb410nf_unroll_loop
-	jmp   .nb410nf_finish_inner
-.nb410nf_unroll_loop:	
-	;# quad-unroll innerloop here 
-	mov   rdx, [rsp + nb410nf_innerjjnr]     ;# pointer to jjnr[k] 
-	mov   eax, [rdx]	
-	mov   ebx, [rdx + 4]              
-	mov   ecx, [rdx + 8]            
-	mov   edx, [rdx + 12]         ;# eax-edx=jnr1-4 
-	add qword ptr [rsp + nb410nf_innerjjnr],  16 ;# advance pointer (unrolled 4) 
-
-	;# load isa2
-	mov rsi, [rbp + nb410nf_invsqrta]
-	movss xmm3, [rsi + rax*4]
-	movss xmm4, [rsi + rcx*4]
-	movss xmm6, [rsi + rbx*4]
-	movss xmm7, [rsi + rdx*4]
-	movaps xmm2, [rsp + nb410nf_isai]
-	shufps xmm3, xmm6, 0 
-	shufps xmm4, xmm7, 0 
-	shufps xmm3, xmm4, 136  ;# 10001000 ;# all charges in xmm3  
-	mulps  xmm2, xmm3
-	
-	movaps [rsp + nb410nf_isaprod], xmm2
-	movaps xmm1, xmm2
-	mulps xmm1, [rsp + nb410nf_gbtsc]
-	movaps [rsp + nb410nf_gbscale], xmm1
-	
-	mov rsi, [rbp + nb410nf_charge]    ;# base of charge[] 
-	
-	movss xmm3, [rsi + rax*4]
-	movss xmm4, [rsi + rcx*4]
-	movss xmm6, [rsi + rbx*4]
-	movss xmm7, [rsi + rdx*4]
-
-	mulps xmm2, [rsp + nb410nf_iq]
-	shufps xmm3, xmm6, 0 
-	shufps xmm4, xmm7, 0 
-	shufps xmm3, xmm4, 136  ;# 10001000 ;# all charges in xmm3  
-	mulps  xmm3, xmm2
-	movaps [rsp + nb410nf_qq], xmm3	
-
-	movd mm0, eax
-	movd mm1, ebx
-	movd mm2, ecx
-	movd mm3, edx
-	
-	mov rsi, [rbp + nb410nf_type]
-	mov eax, [rsi + rax*4]
-	mov ebx, [rsi + rbx*4]
-	mov ecx, [rsi + rcx*4]
-	mov edx, [rsi + rdx*4]
-	mov rsi, [rbp + nb410nf_vdwparam]
-	shl eax, 1	
-	shl ebx, 1	
-	shl ecx, 1	
-	shl edx, 1	
-	mov edi, [rsp + nb410nf_ntia]
-	add eax, edi
-	add ebx, edi
-	add ecx, edi
-	add edx, edi
-
-	movlps xmm6, [rsi + rax*4]
-	movlps xmm7, [rsi + rcx*4]
-	movhps xmm6, [rsi + rbx*4]
-	movhps xmm7, [rsi + rdx*4]
-
-	movaps xmm4, xmm6
-	shufps xmm4, xmm7, 136  ;# 10001000
-	shufps xmm6, xmm7, 221  ;# 11011101
-	
-	movd  eax, mm0		
-	movd  ebx, mm1
-	movd  ecx, mm2
-	movd  edx, mm3
-
-	movaps [rsp + nb410nf_c6], xmm4
-	movaps [rsp + nb410nf_c12], xmm6
-	
-	mov rsi, [rbp + nb410nf_pos]       ;# base of pos[] 
-
-	lea   rax, [rax + rax*2]     ;# replace jnr with j3 
-	lea   rbx, [rbx + rbx*2]	
-
-	lea   rcx, [rcx + rcx*2]     ;# replace jnr with j3 
-	lea   rdx, [rdx + rdx*2]	
-
-	;# move four coordinates to xmm0-xmm2 	
-
-	movlps xmm4, [rsi + rax*4]
-	movlps xmm5, [rsi + rcx*4]
-	movss xmm2, [rsi + rax*4 + 8]
-	movss xmm6, [rsi + rcx*4 + 8]
-
-	movhps xmm4, [rsi + rbx*4]
-	movhps xmm5, [rsi + rdx*4]
-
-	movss xmm0, [rsi + rbx*4 + 8]
-	movss xmm1, [rsi + rdx*4 + 8]
-
-	shufps xmm2, xmm0, 0
-	shufps xmm6, xmm1, 0
-	
-	movaps xmm0, xmm4
-	movaps xmm1, xmm4
-
-	shufps xmm2, xmm6, 136  ;# 10001000
-	
-	shufps xmm0, xmm5, 136  ;# 10001000
-	shufps xmm1, xmm5, 221  ;# 11011101		
-
-	;# move ix-iz to xmm4-xmm6 
-	movaps xmm4, [rsp + nb410nf_ix]
-	movaps xmm5, [rsp + nb410nf_iy]
-	movaps xmm6, [rsp + nb410nf_iz]
-
-	;# calc dr 
-	subps xmm4, xmm0
-	subps xmm5, xmm1
-	subps xmm6, xmm2
-
-	;# square it 
-	mulps xmm4,xmm4
-	mulps xmm5,xmm5
-	mulps xmm6,xmm6
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [rsp + nb410nf_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [rsp + nb410nf_half]
-	subps xmm1, xmm5	;# 30-rsq*lu*lu 
-	mulps xmm1, xmm2	
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r 
-	mulps xmm4, [rsp + nb410nf_gbscale]
-
-	movhlps xmm5, xmm4
-	cvttps2pi mm6, xmm4
-	cvttps2pi mm7, xmm5	;# mm6/mm7 contain lu indices 
-	cvtpi2ps xmm6, mm6
-	cvtpi2ps xmm5, mm7
-	movlhps xmm6, xmm5
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-	pslld mm6, 2
-	pslld mm7, 2
-
-	movd mm0, eax	
-	movd mm1, ebx
-	movd mm2, ecx
-	movd mm3, edx
-
-	mov  rsi, [rbp + nb410nf_GBtab]
-	movd eax, mm6
-	psrlq mm6, 32
-	movd ecx, mm7
-	psrlq mm7, 32
-	movd ebx, mm6
-	movd edx, mm7
-
-	;# load coulomb table
-	movaps xmm4, [rsi + rax*4]
-	movaps xmm5, [rsi + rbx*4]
-	movaps xmm6, [rsi + rcx*4]
-	movaps xmm7, [rsi + rdx*4]
-	;# transpose, using xmm3 for scratch
-	movaps xmm3, xmm6
-	shufps xmm3, xmm7, 0xEE 
-	shufps xmm6, xmm7, 0x44
-	movaps xmm7, xmm4
-	shufps xmm7, xmm5, 0xEE
-	shufps xmm4, xmm5, 0x44
-	movaps xmm5, xmm4
-	shufps xmm5, xmm6, 0xDD
-	shufps xmm4, xmm6, 0x88
-	movaps xmm6, xmm7
-	shufps xmm6, xmm3, 0x88
-	shufps xmm7, xmm3, 0xDD
-	;# coulomb table ready, in xmm4-xmm7  		
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	movaps xmm3, [rsp + nb410nf_qq]
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, xmm3 ;# vcoul=qq*VV  
-	;# update vctot
-	addps  xmm5, [rsp + nb410nf_vctot]
-	movaps [rsp + nb410nf_vctot], xmm5	
-	
-	;# L-J 
-	movaps xmm4, xmm0
-	mulps  xmm4, xmm0	;# xmm4=rinvsq 
-
-	movaps xmm6, xmm4
-	mulps  xmm6, xmm4
-
-	mulps  xmm6, xmm4	;# xmm6=rinvsix 
-	movaps xmm4, xmm6
-	mulps  xmm4, xmm4	;# xmm4=rinvtwelve 
-	mulps  xmm6, [rsp + nb410nf_c6]
-	mulps  xmm4, [rsp + nb410nf_c12]
-	movaps xmm7, [rsp + nb410nf_Vvdwtot]
-	addps  xmm7, xmm4
-	subps  xmm7, xmm6
-	movaps [rsp + nb410nf_Vvdwtot], xmm7
-	
-	;# should we do one more iteration? 
-	sub dword ptr [rsp + nb410nf_innerk],  4
-	jl    .nb410nf_finish_inner
-	jmp   .nb410nf_unroll_loop
-.nb410nf_finish_inner:
-	;# check if at least two particles remain 
-	add dword ptr [rsp + nb410nf_innerk],  4
-	mov   edx, [rsp + nb410nf_innerk]
-	and   edx, 2
-	jnz   .nb410nf_dopair
-	jmp   .nb410nf_checksingle
-.nb410nf_dopair:	
-	mov   rcx, [rsp + nb410nf_innerjjnr]
-	mov   eax, [rcx]	
-	mov   ebx, [rcx + 4]              
-	add qword ptr [rsp + nb410nf_innerjjnr],  8
-
-	xorps xmm2, xmm2
-	movaps xmm6, xmm2
-	
-	;# load isa2
-	mov rsi, [rbp + nb410nf_invsqrta]
-	movss xmm2, [rsi + rax*4]
-	movss xmm3, [rsi + rbx*4]
-	unpcklps xmm2, xmm3	;# isa2 in xmm3(0,1)
-	mulps  xmm2, [rsp + nb410nf_isai]
-	movaps [rsp + nb410nf_isaprod], xmm2	
-	movaps xmm1, xmm2
-	mulps xmm1, [rsp + nb410nf_gbtsc]
-	movaps [rsp + nb410nf_gbscale], xmm1	
-	
-	mov rsi, [rbp + nb410nf_charge]    ;# base of charge[] 	
-	movss xmm3, [rsi + rax*4]		
-	movss xmm6, [rsi + rbx*4]
-	unpcklps xmm3, xmm6 ;# 00001000 ;# xmm3(0,1) has the charges 
-
-	mulps  xmm2, [rsp + nb410nf_iq]
-	mulps  xmm3, xmm2
-	movaps [rsp + nb410nf_qq], xmm3
-
-	mov rsi, [rbp + nb410nf_type]
-	mov   ecx, eax
-	mov   edx, ebx
-	mov ecx, [rsi + rcx*4]
-	mov edx, [rsi + rdx*4]	
-	mov rsi, [rbp + nb410nf_vdwparam]
-	shl ecx, 1	
-	shl edx, 1	
-	mov edi, [rsp + nb410nf_ntia]
-	add ecx, edi
-	add edx, edi
-	movlps xmm6, [rsi + rcx*4]
-	movhps xmm6, [rsi + rdx*4]
-	mov rdi, [rbp + nb410nf_pos]	
-	
-	movaps xmm4, xmm6
-	shufps xmm4, xmm4, 8 ;# 00001000 	
-	shufps xmm6, xmm6, 13 ;# 00001101
-	movlhps xmm4, xmm7
-	movlhps xmm6, xmm7
-	
-	movaps [rsp + nb410nf_c6], xmm4
-	movaps [rsp + nb410nf_c12], xmm6	
-	
-	lea   rax, [rax + rax*2]
-	lea   rbx, [rbx + rbx*2]
-	;# move coordinates to xmm0-xmm2 
-	movlps xmm1, [rdi + rax*4]
-	movss xmm2, [rdi + rax*4 + 8]	
-	movhps xmm1, [rdi + rbx*4]
-	movss xmm0, [rdi + rbx*4 + 8]	
-
-	movlhps xmm3, xmm7
-	
-	shufps xmm2, xmm0, 0
-	
-	movaps xmm0, xmm1
-
-	shufps xmm2, xmm2, 136  ;# 10001000
-	
-	shufps xmm0, xmm0, 136  ;# 10001000
-	shufps xmm1, xmm1, 221  ;# 11011101
-	
-	mov    rdi, [rbp + nb410nf_faction]
-	;# move ix-iz to xmm4-xmm6 
-	xorps   xmm7, xmm7
-	
-	movaps xmm4, [rsp + nb410nf_ix]
-	movaps xmm5, [rsp + nb410nf_iy]
-	movaps xmm6, [rsp + nb410nf_iz]
-
-	;# calc dr 
-	subps xmm4, xmm0
-	subps xmm5, xmm1
-	subps xmm6, xmm2
-
-	;# square it 
-	mulps xmm4,xmm4
-	mulps xmm5,xmm5
-	mulps xmm6,xmm6
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [rsp + nb410nf_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [rsp + nb410nf_half]
-	subps xmm1, xmm5	;# 30-rsq*lu*lu 
-	mulps xmm1, xmm2	
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r 
-	mulps xmm4, [rsp + nb410nf_gbscale]
-
-	cvttps2pi mm6, xmm4     ;# mm6 contain lu indices 
-	cvtpi2ps xmm6, mm6
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-
-	pslld mm6, 2
-
-	mov  rsi, [rbp + nb410nf_GBtab]
-	movd ecx, mm6
-	psrlq mm6, 32
-	movd edx, mm6
-
-	;# load coulomb table
-	movaps xmm4, [rsi + rcx*4]
-	movaps xmm7, [rsi + rdx*4]
-	;# transpose, using xmm3 for scratch
-	movaps xmm6, xmm4
-	unpcklps xmm4, xmm7  	;# Y1 Y2 F1 F2 
-	unpckhps xmm6, xmm7     ;# G1 G2 H1 H2
-	movhlps  xmm5, xmm4    	;# F1 F2 
-	movhlps  xmm7, xmm6     ;# H1 H2
-	;# coulomb table ready, in xmm4-xmm7  	
-
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	movaps xmm3, [rsp + nb410nf_qq]
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, xmm3 ;# vcoul=qq*VV  
-
-	addps  xmm5, [rsp + nb410nf_vctot]
-	movaps [rsp + nb410nf_vctot], xmm5
-	
-	;# L-J 
-	movaps xmm4, xmm0
-	mulps  xmm4, xmm0	;# xmm4=rinvsq 
-
-	;# at this point mm5 contains vcoul and mm3 fijC 
-	;# increment vcoul - then we can get rid of mm5 
-	;# update vctot 
-
-	movaps xmm6, xmm4
-	mulps  xmm6, xmm4
-
-	mulps  xmm6, xmm4	;# xmm6=rinvsix 
-	movaps xmm4, xmm6
-	mulps  xmm4, xmm4	;# xmm4=rinvtwelve 
-	mulps  xmm6, [rsp + nb410nf_c6]
-	mulps  xmm4, [rsp + nb410nf_c12]
-	movaps xmm7, [rsp + nb410nf_Vvdwtot]
-	addps  xmm7, xmm4
-	subps  xmm7, xmm6
-	movaps [rsp + nb410nf_Vvdwtot], xmm7
-	
-.nb410nf_checksingle:				
-	mov   edx, [rsp + nb410nf_innerk]
-	and   edx, 1
-	jnz    .nb410nf_dosingle
-	jmp    .nb410nf_updateouterdata
-.nb410nf_dosingle:
-	mov rsi, [rbp + nb410nf_charge]
-	mov rdx, [rbp + nb410nf_invsqrta]
-	mov rdi, [rbp + nb410nf_pos]
-	mov   rcx, [rsp + nb410nf_innerjjnr]
-	mov   eax, [rcx]	
-	xorps  xmm2, xmm2
-	movaps xmm6, xmm2
-	movss xmm2, [rdx + rax*4]	;# isa2
-	mulss xmm2, [rsp + nb410nf_isai]
-	movss [rsp + nb410nf_isaprod], xmm2	
-	movss xmm1, xmm2
-	mulss xmm1, [rsp + nb410nf_gbtsc]
-	movss [rsp + nb410nf_gbscale], xmm1	
-	
-	mulss  xmm2, [rsp + nb410nf_iq]
-	movss xmm6, [rsi + rax*4]	;# xmm6(0) has the charge 	
-	mulss  xmm6, xmm2
-	movss [rsp + nb410nf_qq], xmm6
-	
-	mov rsi, [rbp + nb410nf_type]
-	mov ecx, eax
-	mov ecx, [rsi + rcx*4]	
-	mov rsi, [rbp + nb410nf_vdwparam]
-	shl ecx, 1
-	add ecx, [rsp + nb410nf_ntia]
-	movlps xmm6, [rsi + rcx*4]
-	movaps xmm4, xmm6
-	shufps xmm4, xmm4, 252  ;# 11111100	
-	shufps xmm6, xmm6, 253  ;# 11111101	
-	
-	movaps [rsp + nb410nf_c6], xmm4
-	movaps [rsp + nb410nf_c12], xmm6	
-	
-	lea   rax, [rax + rax*2]
-	
-	;# move coordinates to xmm0-xmm2 
-	movss xmm0, [rdi + rax*4]	
-	movss xmm1, [rdi + rax*4 + 4]	
-	movss xmm2, [rdi + rax*4 + 8]	 
-	
-	movaps xmm4, [rsp + nb410nf_ix]
-	movaps xmm5, [rsp + nb410nf_iy]
-	movaps xmm6, [rsp + nb410nf_iz]
-
-	;# calc dr 
-	subss xmm4, xmm0
-	subss xmm5, xmm1
-	subss xmm6, xmm2
-
-	;# square it 
-	mulss xmm4,xmm4
-	mulss xmm5,xmm5
-	mulss xmm6,xmm6
-	addss xmm4, xmm5
-	addss xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtss xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulss xmm5, xmm5
-	movss xmm1, [rsp + nb410nf_three]
-	mulss xmm5, xmm4	;# rsq*lu*lu 			
-	movss xmm0, [rsp + nb410nf_half]
-	subss xmm1, xmm5	;# 30-rsq*lu*lu 
-	mulss xmm1, xmm2	
-	mulss xmm0, xmm1	;# xmm0=rinv 
-
-	mulss xmm4, xmm0	;# xmm4=r 
-	mulss xmm4, [rsp + nb410nf_gbscale]
-
-	cvttss2si ebx, xmm4     ;# mm6 contain lu indices 
-	cvtsi2ss xmm6, ebx
-	subss xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulss  xmm2, xmm2	;# xmm2=eps2 
-
-	shl ebx, 2
-	mov  rsi, [rbp + nb410nf_GBtab]
-	
-	movaps xmm4, [rsi + rbx*4]	
-	movhlps xmm6, xmm4
-	movaps xmm5, xmm4
-	movaps xmm7, xmm6
-	shufps xmm5, xmm5, 1
-	shufps xmm7, xmm7, 1
-	;# table ready in xmm4-xmm7 
-
-	mulss  xmm6, xmm1	;# xmm6=Geps 
-	mulss  xmm7, xmm2	;# xmm7=Heps2 
-	addss  xmm5, xmm6
-	addss  xmm5, xmm7	;# xmm5=Fp 	
-	movss xmm3, [rsp + nb410nf_qq]
-	mulss  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addss  xmm5, xmm4 ;# xmm5=VV 
-	mulss  xmm5, xmm3 ;# vcoul=qq*VV  
-	addss  xmm5, [rsp + nb410nf_vctot]
-	movss [rsp + nb410nf_vctot], xmm5 	
-	
-	;# L-J 
-	movaps xmm4, xmm0
-	mulss  xmm4, xmm0	;# xmm4=rinvsq 
-
-	movaps xmm6, xmm4
-	mulss  xmm6, xmm4
-
-	mulss  xmm6, xmm4	;# xmm6=rinvsix 
-	movaps xmm4, xmm6
-	mulss  xmm4, xmm4	;# xmm4=rinvtwelve 
-	mulss  xmm6, [rsp + nb410nf_c6]
-	mulss  xmm4, [rsp + nb410nf_c12]
-	movss xmm7, [rsp + nb410nf_Vvdwtot]
-	addps  xmm7, xmm4
-	subps  xmm7, xmm6
-	movss [rsp + nb410nf_Vvdwtot], xmm7
-	
-.nb410nf_updateouterdata:
-	;# get n from stack
-	mov esi, [rsp + nb410nf_n]
-        ;# get group index for i particle 
-        mov   rdx, [rbp + nb410nf_gid]      	;# base of gid[]
-        mov   edx, [rdx + rsi*4]		;# ggid=gid[n]
-
-	;# accumulate total potential energy and update it 
-	movaps xmm7, [rsp + nb410nf_vctot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-
-	;# add earlier value from mem 
-	mov   rax, [rbp + nb410nf_Vc]
-	addss xmm7, [rax + rdx*4] 
-	;# move back to mem 
-	movss [rax + rdx*4], xmm7 
-	
-	;# accumulate total lj energy and update it 
-	movaps xmm7, [rsp + nb410nf_Vvdwtot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-
-	;# add earlier value from mem 
-	mov   rax, [rbp + nb410nf_Vvdw]
-	addss xmm7, [rax + rdx*4] 
-	;# move back to mem 
-	movss [rax + rdx*4], xmm7 
-	
-        ;# finish if last 
-        mov ecx, [rsp + nb410nf_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb410nf_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [rsp + nb410nf_n], esi
-        jmp .nb410nf_outer
-.nb410nf_outerend:
-        ;# check if more outer neighborlists remain
-        mov   ecx, [rsp + nb410nf_nri]
-	;# esi already loaded with n above
-        sub   ecx, esi
-        jz .nb410nf_end
-        ;# non-zero, do one more workunit
-        jmp   .nb410nf_threadloop
-.nb410nf_end:
-	
-	mov eax, [rsp + nb410nf_nouter]
-	mov ebx, [rsp + nb410nf_ninner]
-	mov rcx, [rbp + nb410nf_outeriter]
-	mov rdx, [rbp + nb410nf_inneriter]
-	mov [rcx], eax
-	mov [rdx], ebx
-
-	add rsp, 360
-	emms
-
-
-        pop r15
-        pop r14
-        pop r13
-        pop r12
-
-	pop rbx
-	pop	rbp
-	ret
-
-
-
diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel410_x86_64_sse.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel410_x86_64_sse.s
deleted file mode 100644
index f1953c7bf6..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel410_x86_64_sse.s
+++ /dev/null
@@ -1,1985 +0,0 @@
-##
-##
-## Gromacs 4.0                         Copyright (c) 1991-2003 
-## David van der Spoel, Erik Lindahl
-##
-## This program is free software; you can redistribute it and/or
-## modify it under the terms of the GNU General Public License
-## as published by the Free Software Foundation; either version 2
-## of the License, or (at your option) any later version.
-##
-## To help us fund GROMACS development, we humbly ask that you cite
-## the research papers on the package. Check out http://www.gromacs.org
-## 
-## And Hey:
-## Gnomes, ROck Monsters And Chili Sauce
-##
-
-
-
-
-
-
-.globl nb_kernel410_x86_64_sse
-.globl _nb_kernel410_x86_64_sse
-nb_kernel410_x86_64_sse:        
-_nb_kernel410_x86_64_sse:       
-##      Room for return address and rbp (16 bytes)
-.set nb410_fshift, 16
-.set nb410_gid, 24
-.set nb410_pos, 32
-.set nb410_faction, 40
-.set nb410_charge, 48
-.set nb410_p_facel, 56
-.set nb410_argkrf, 64
-.set nb410_argcrf, 72
-.set nb410_Vc, 80
-.set nb410_type, 88
-.set nb410_p_ntype, 96
-.set nb410_vdwparam, 104
-.set nb410_Vvdw, 112
-.set nb410_p_tabscale, 120
-.set nb410_VFtab, 128
-.set nb410_invsqrta, 136
-.set nb410_dvda, 144
-.set nb410_p_gbtabscale, 152
-.set nb410_GBtab, 160
-.set nb410_p_nthreads, 168
-.set nb410_count, 176
-.set nb410_mtx, 184
-.set nb410_outeriter, 192
-.set nb410_inneriter, 200
-.set nb410_work, 208
-        ## stack offsets for local variables  
-        ## bottom of stack is cache-aligned for sse use 
-.set nb410_ix, 0
-.set nb410_iy, 16
-.set nb410_iz, 32
-.set nb410_iq, 48
-.set nb410_dx, 64
-.set nb410_dy, 80
-.set nb410_dz, 96
-.set nb410_two, 112
-.set nb410_six, 128
-.set nb410_twelve, 144
-.set nb410_gbtsc, 160
-.set nb410_qq, 176
-.set nb410_c6, 192
-.set nb410_c12, 208
-.set nb410_fscal, 224
-.set nb410_vctot, 240
-.set nb410_Vvdwtot, 256
-.set nb410_fix, 272
-.set nb410_fiy, 288
-.set nb410_fiz, 304
-.set nb410_half, 320
-.set nb410_three, 336
-.set nb410_r, 352
-.set nb410_isai, 368
-.set nb410_isaprod, 384
-.set nb410_dvdasum, 400
-.set nb410_gbscale, 416
-.set nb410_nri, 432
-.set nb410_iinr, 440
-.set nb410_jindex, 448
-.set nb410_jjnr, 456
-.set nb410_shift, 464
-.set nb410_shiftvec, 472
-.set nb410_facel, 480
-.set nb410_innerjjnr, 488
-.set nb410_is3, 496
-.set nb410_ii3, 500
-.set nb410_ii, 504
-.set nb410_ntia, 508
-.set nb410_innerk, 512
-.set nb410_n, 516
-.set nb410_nn1, 520
-.set nb410_ntype, 524
-.set nb410_nouter, 528
-.set nb410_ninner, 532
-.set nb410_jnra, 536
-.set nb410_jnrb, 540
-.set nb410_jnrc, 544
-.set nb410_jnrd, 548
-
-        push %rbp
-        movq %rsp,%rbp
-        push %rbx
-
-
-        emms
-
-        push %r12
-        push %r13
-        push %r14
-        push %r15
-
-        subq $568,%rsp          ## local variable stack space (n*16+8)
-
-        ## zero 32-bit iteration counters
-        movl $0,%eax
-        movl %eax,nb410_nouter(%rsp)
-        movl %eax,nb410_ninner(%rsp)
-
-        movl (%rdi),%edi
-        movl %edi,nb410_nri(%rsp)
-        movq %rsi,nb410_iinr(%rsp)
-        movq %rdx,nb410_jindex(%rsp)
-        movq %rcx,nb410_jjnr(%rsp)
-        movq %r8,nb410_shift(%rsp)
-        movq %r9,nb410_shiftvec(%rsp)
-        movq nb410_p_ntype(%rbp),%rdi
-        movl (%rdi),%edi
-        movl %edi,nb410_ntype(%rsp)
-        movq nb410_p_facel(%rbp),%rsi
-        movss (%rsi),%xmm0
-        movss %xmm0,nb410_facel(%rsp)
-
-        movq nb410_p_gbtabscale(%rbp),%rbx
-        movss (%rbx),%xmm4
-        shufps $0,%xmm4,%xmm4
-        movaps %xmm4,nb410_gbtsc(%rsp)
-
-
-        ## create constant floating-point factors on stack
-        movl $0x3f000000,%eax   ## half in IEEE (hex)
-        movl %eax,nb410_half(%rsp)
-        movss nb410_half(%rsp),%xmm1
-        shufps $0,%xmm1,%xmm1  ## splat to all elements
-        movaps %xmm1,%xmm2
-        addps  %xmm2,%xmm2      ## one
-        movaps %xmm2,%xmm3
-        addps  %xmm2,%xmm2      ## two
-        addps  %xmm2,%xmm3      ## three
-        movaps %xmm3,%xmm4
-        addps  %xmm4,%xmm4      ## six
-        movaps %xmm4,%xmm5
-        addps  %xmm5,%xmm5      ## twelve
-        movaps %xmm1,nb410_half(%rsp)
-        movaps %xmm2,nb410_two(%rsp)
-        movaps %xmm3,nb410_three(%rsp)
-        movaps %xmm4,nb410_six(%rsp)
-        movaps %xmm5,nb410_twelve(%rsp)
-
-_nb_kernel410_x86_64_sse.nb410_threadloop: 
-        movq  nb410_count(%rbp),%rsi            ## pointer to sync counter
-        movl  (%rsi),%eax
-_nb_kernel410_x86_64_sse.nb410_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+10
-        lock 
-        cmpxchgl %ebx,(%rsi)                    ## write nn1 to *counter,
-                                                ## if it hasnt changed.
-                                                ## or reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel410_x86_64_sse.nb410_spinlock
-
-        ## if(nn1>nri) nn1=nri
-        movl nb410_nri(%rsp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx
-        cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb410_n(%rsp)
-        movl %ebx,nb410_nn1(%rsp)
-        subl %eax,%ebx                          ## calc number of outer lists
-        movl %eax,%esi                          ## copy n to esi
-        jg  _nb_kernel410_x86_64_sse.nb410_outerstart
-        jmp _nb_kernel410_x86_64_sse.nb410_end
-
-_nb_kernel410_x86_64_sse.nb410_outerstart: 
-        ## ebx contains number of outer iterations
-        addl nb410_nouter(%rsp),%ebx
-        movl %ebx,nb410_nouter(%rsp)
-
-_nb_kernel410_x86_64_sse.nb410_outer: 
-        movq  nb410_shift(%rsp),%rax        ## rax = pointer into shift[] 
-        movl  (%rax,%rsi,4),%ebx        ## ebx=shift[n] 
-
-        lea  (%rbx,%rbx,2),%rbx    ## rbx=3*is 
-        movl  %ebx,nb410_is3(%rsp)      ## store is3 
-
-        movq  nb410_shiftvec(%rsp),%rax     ## rax = base of shiftvec[] 
-
-        movss (%rax,%rbx,4),%xmm0
-        movss 4(%rax,%rbx,4),%xmm1
-        movss 8(%rax,%rbx,4),%xmm2
-
-        movq  nb410_iinr(%rsp),%rcx         ## rcx = pointer into iinr[]        
-        movl  (%rcx,%rsi,4),%ebx            ## ebx =ii 
-        movl  %ebx,nb410_ii(%rsp)
-
-        movq  nb410_charge(%rbp),%rdx
-        movss (%rdx,%rbx,4),%xmm3
-        mulss nb410_facel(%rsp),%xmm3
-        shufps $0,%xmm3,%xmm3
-
-        movq  nb410_invsqrta(%rbp),%rdx         ## load invsqrta[ii]
-        movss (%rdx,%rbx,4),%xmm4
-        shufps $0,%xmm4,%xmm4
-
-        movq  nb410_type(%rbp),%rdx
-        movl  (%rdx,%rbx,4),%edx
-        imull nb410_ntype(%rsp),%edx
-        shll  %edx
-        movl  %edx,nb410_ntia(%rsp)
-
-        lea  (%rbx,%rbx,2),%rbx        ## rbx = 3*ii=ii3 
-        movq  nb410_pos(%rbp),%rax      ## rax = base of pos[]  
-
-        addss (%rax,%rbx,4),%xmm0
-        addss 4(%rax,%rbx,4),%xmm1
-        addss 8(%rax,%rbx,4),%xmm2
-
-        movaps %xmm3,nb410_iq(%rsp)
-        movaps %xmm4,nb410_isai(%rsp)
-
-        shufps $0,%xmm0,%xmm0
-        shufps $0,%xmm1,%xmm1
-        shufps $0,%xmm2,%xmm2
-
-        movaps %xmm0,nb410_ix(%rsp)
-        movaps %xmm1,nb410_iy(%rsp)
-        movaps %xmm2,nb410_iz(%rsp)
-
-        movl  %ebx,nb410_ii3(%rsp)
-
-        ## clear vctot and i forces 
-        xorps %xmm13,%xmm13
-        movaps %xmm13,%xmm12
-        movaps %xmm13,nb410_Vvdwtot(%rsp)
-        movaps %xmm13,nb410_dvdasum(%rsp)
-        movaps %xmm13,%xmm14
-        movaps %xmm13,%xmm15
-
-        movq  nb410_jindex(%rsp),%rax
-        movl  (%rax,%rsi,4),%ecx             ## jindex[n] 
-        movl  4(%rax,%rsi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movq  nb410_pos(%rbp),%rsi
-        movq  nb410_faction(%rbp),%rdi
-        movq  nb410_jjnr(%rsp),%rax
-        shll  $2,%ecx
-        addq  %rcx,%rax
-        movq  %rax,nb410_innerjjnr(%rsp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $4,%edx
-        addl  nb410_ninner(%rsp),%ecx
-        movl  %ecx,nb410_ninner(%rsp)
-        addl  $0,%edx
-        movl  %edx,nb410_innerk(%rsp)      ## number of innerloop atoms 
-        jge   _nb_kernel410_x86_64_sse.nb410_unroll_loop
-        jmp   _nb_kernel410_x86_64_sse.nb410_finish_inner
-_nb_kernel410_x86_64_sse.nb410_unroll_loop: 
-        ## quad-unroll innerloop here 
-        movq  nb410_innerjjnr(%rsp),%rdx       ## pointer to jjnr[k] 
-        movl  (%rdx),%eax
-        movl  4(%rdx),%ebx
-        movl  8(%rdx),%ecx
-        movl  12(%rdx),%edx           ## eax-edx=jnr1-4 
-
-        addq $16,nb410_innerjjnr(%rsp)             ## advance pointer (unrolled 4) 
-
-        ## load isaj
-        movq nb410_invsqrta(%rbp),%rsi
-        movss (%rsi,%rax,4),%xmm3
-        movss (%rsi,%rcx,4),%xmm4
-        movss (%rsi,%rbx,4),%xmm6
-        movss (%rsi,%rdx,4),%xmm7
-        movaps nb410_isai(%rsp),%xmm2
-        shufps $0,%xmm6,%xmm3
-        shufps $0,%xmm7,%xmm4
-        shufps $136,%xmm4,%xmm3 ## 10001000 ;# all isaj in xmm3 
-        mulps  %xmm3,%xmm2
-
-        movaps %xmm2,nb410_isaprod(%rsp)
-        movaps %xmm2,%xmm1
-        mulps nb410_gbtsc(%rsp),%xmm1
-        movaps %xmm1,nb410_gbscale(%rsp)
-
-        movq nb410_charge(%rbp),%rsi     ## base of charge[] 
-
-        movss (%rsi,%rax,4),%xmm3
-        movss (%rsi,%rcx,4),%xmm4
-        movss (%rsi,%rbx,4),%xmm6
-        movss (%rsi,%rdx,4),%xmm7
-
-        mulps nb410_iq(%rsp),%xmm2
-        shufps $0,%xmm6,%xmm3
-        shufps $0,%xmm7,%xmm4
-        shufps $136,%xmm4,%xmm3 ## 10001000 ;# all charges in xmm3  
-        mulps  %xmm2,%xmm3
-        movaps %xmm3,nb410_qq(%rsp)
-
-    ## vdw parameters
-        movq nb410_type(%rbp),%rsi
-        movl (%rsi,%rax,4),%r12d
-        movl (%rsi,%rbx,4),%r13d
-        movl (%rsi,%rcx,4),%r14d
-        movl (%rsi,%rdx,4),%r15d
-        shll %r12d
-        shll %r13d
-        shll %r14d
-        shll %r15d
-    movl nb410_ntia(%rsp),%edi
-        addl %edi,%r12d
-        addl %edi,%r13d
-        addl %edi,%r14d
-        addl %edi,%r15d
-
-        movq nb410_vdwparam(%rbp),%rsi
-        movlps (%rsi,%r12,4),%xmm3
-        movlps (%rsi,%r14,4),%xmm7
-        movhps (%rsi,%r13,4),%xmm3
-        movhps (%rsi,%r15,4),%xmm7
-
-        movaps %xmm3,%xmm0
-        shufps $136,%xmm7,%xmm0 ## 10001000
-        shufps $221,%xmm7,%xmm3 ## 11011101
-
-    movaps %xmm0,nb410_c6(%rsp)
-    movaps %xmm3,nb410_c12(%rsp)
-
-        movq nb410_pos(%rbp),%rsi        ## base of pos[] 
-
-        lea  (%rax,%rax,2),%r8     ## jnr 
-        lea  (%rbx,%rbx,2),%r9
-        lea  (%rcx,%rcx,2),%r10
-        lea  (%rdx,%rdx,2),%r11
-
-        ## move four coordinates to xmm0-xmm2   
-        movlps (%rsi,%r8,4),%xmm4
-        movlps (%rsi,%r10,4),%xmm5
-        movss 8(%rsi,%r8,4),%xmm2
-        movss 8(%rsi,%r10,4),%xmm6
-
-        movhps (%rsi,%r9,4),%xmm4
-        movhps (%rsi,%r11,4),%xmm5
-
-        movss 8(%rsi,%r9,4),%xmm0
-        movss 8(%rsi,%r11,4),%xmm1
-
-        shufps $0,%xmm0,%xmm2
-        shufps $0,%xmm1,%xmm6
-
-        movaps %xmm4,%xmm0
-        movaps %xmm4,%xmm1
-
-        shufps $136,%xmm6,%xmm2 ## 10001000
-
-        shufps $136,%xmm5,%xmm0 ## 10001000
-        shufps $221,%xmm5,%xmm1 ## 11011101             
-
-        ## calc dr 
-        subps nb410_ix(%rsp),%xmm0
-        subps nb410_iy(%rsp),%xmm1
-        subps nb410_iz(%rsp),%xmm2
-
-        ## store dr 
-        movaps %xmm0,nb410_dx(%rsp)
-        movaps %xmm1,nb410_dy(%rsp)
-        movaps %xmm2,nb410_dz(%rsp)
-
-        ## square it 
-        mulps %xmm0,%xmm0
-        mulps %xmm1,%xmm1
-        mulps %xmm2,%xmm2
-        addps %xmm1,%xmm0
-        addps %xmm2,%xmm0
-    movaps %xmm0,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb410_three(%rsp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb410_half(%rsp),%xmm0
-        subps %xmm5,%xmm1       ## 30-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r
-        movaps %xmm4,nb410_r(%rsp)
-        mulps nb410_gbscale(%rsp),%xmm4
-
-    ## truncate and convert to integers
-    cvttps2dq %xmm4,%xmm5
-
-    ## convert back to float
-    cvtdq2ps  %xmm5,%xmm6
-
-    ## multiply by 4
-    pslld   $2,%xmm5
-
-    ## move to integer registers
-    movhlps %xmm5,%xmm7
-    movd    %xmm5,%r12d
-    movd    %xmm7,%r14d
-    pshufd $1,%xmm5,%xmm5
-    pshufd $1,%xmm7,%xmm7
-    movd    %xmm5,%r13d
-    movd    %xmm7,%r15d
-
-    ## calculate eps
-    subps     %xmm6,%xmm4
-    movaps    %xmm4,%xmm1 ##eps
-
-        movq nb410_GBtab(%rbp),%rsi
-
-    movaps %xmm0,%xmm9 ## rinv
-    mulps  %xmm9,%xmm9 ## rinvsq
-    movaps %xmm9,%xmm10 ## rinvsq
-    mulps  %xmm10,%xmm10 ## rinv4
-    mulps  %xmm9,%xmm10 ## rinv6
-    movaps %xmm10,%xmm11
-    mulps  %xmm11,%xmm11 ## rinv12
-
-    ## load table data
-        movlps (%rsi,%r12,4),%xmm5
-        movlps (%rsi,%r14,4),%xmm7
-        movhps (%rsi,%r13,4),%xmm5
-        movhps (%rsi,%r15,4),%xmm7
-
-    movaps %xmm5,%xmm4
-        shufps $136,%xmm7,%xmm4 ## 10001000
-        shufps $221,%xmm7,%xmm5 ## 11011101
-
-    mulps  nb410_c6(%rsp),%xmm10      ## vvdw6=c6*rinv6
-        mulps  nb410_c12(%rsp),%xmm11     ## vvdw12=c12*rinv12     
-
-        movaps %xmm11,%xmm9
-        subps  %xmm10,%xmm11    ## Vvdw=Vvdw12-Vvdw6
-
-    ## add potential to vvdwtot 
-        addps  nb410_Vvdwtot(%rsp),%xmm11
-    movaps %xmm11,nb410_Vvdwtot(%rsp)
-
-        movlps 8(%rsi,%r12,4),%xmm7
-        movlps 8(%rsi,%r14,4),%xmm8
-        movhps 8(%rsi,%r13,4),%xmm7
-        movhps 8(%rsi,%r15,4),%xmm8
-
-    movaps %xmm7,%xmm6
-
-        shufps $136,%xmm8,%xmm6 ## 10001000
-        shufps $221,%xmm8,%xmm7 ## 11011101
-    ## table data ready in xmm4-xmm7
-
-    mulps  %xmm1,%xmm7  ## Heps
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm1,%xmm7      ## Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        addps  %xmm7,%xmm7      ## two*Heps2 
-        movaps nb410_qq(%rsp),%xmm3
-        addps  %xmm6,%xmm7
-        addps  %xmm5,%xmm7 ## xmm7=FF 
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulps  %xmm7,%xmm3 ## fijC=FF*qq 
-        ## at this point xmm5 contains vcoul and xmm3 fijC
-
-    ## LJ forces
-    mulps  nb410_six(%rsp),%xmm10
-    mulps  nb410_twelve(%rsp),%xmm9
-    subps  %xmm10,%xmm9
-    mulps  %xmm0,%xmm9 ## (12*vnb12-6*vnb6)*rinv
-
-        movq nb410_dvda(%rbp),%rsi
-
-        ## Calculate dVda
-        xorps  %xmm7,%xmm7
-        mulps nb410_gbscale(%rsp),%xmm3
-        movaps %xmm3,%xmm6
-        mulps  nb410_r(%rsp),%xmm6
-        addps  %xmm5,%xmm6
-
-    ## increment vctot (sum in xmm12)
-        addps  %xmm5,%xmm12
-
-        ## xmm6=(vcoul+fijC*r)
-        subps  %xmm6,%xmm7
-        movaps %xmm7,%xmm6
-
-    ## update dvdasum
-    addps  nb410_dvdasum(%rsp),%xmm7
-    movaps %xmm7,nb410_dvdasum(%rsp)
-
-        ## update j atoms dvdaj
-        movhlps %xmm6,%xmm7
-        movaps  %xmm6,%xmm5
-        movaps  %xmm7,%xmm4
-        shufps $0x1,%xmm5,%xmm5
-        shufps $0x1,%xmm4,%xmm4
-
-        ## xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4
-        addss  (%rsi,%rax,4),%xmm6
-        addss  (%rsi,%rbx,4),%xmm5
-        addss  (%rsi,%rcx,4),%xmm7
-        addss  (%rsi,%rdx,4),%xmm4
-        movss  %xmm6,(%rsi,%rax,4)
-        movss  %xmm5,(%rsi,%rbx,4)
-        movss  %xmm7,(%rsi,%rcx,4)
-        movss  %xmm4,(%rsi,%rdx,4)
-
-    subps  %xmm3,%xmm9
-    mulps  %xmm0,%xmm9 ## fscal
-
-    movaps  %xmm9,%xmm10
-    movaps  %xmm9,%xmm11
-
-    mulps   nb410_dx(%rsp),%xmm9
-    mulps   nb410_dy(%rsp),%xmm10
-    mulps   nb410_dz(%rsp),%xmm11
-
-        ## accumulate i forces
-    addps %xmm9,%xmm13
-    addps %xmm10,%xmm14
-    addps %xmm11,%xmm15
-
-        movq nb410_faction(%rbp),%rsi
-        ## the fj's - start by accumulating x & y forces from memory 
-        movlps (%rsi,%r8,4),%xmm0 ## x1 y1 - -
-        movlps (%rsi,%r10,4),%xmm1 ## x3 y3 - -
-        movhps (%rsi,%r9,4),%xmm0 ## x1 y1 x2 y2
-        movhps (%rsi,%r11,4),%xmm1 ## x3 y3 x4 y4
-
-    movaps %xmm9,%xmm8
-    unpcklps %xmm10,%xmm9 ## x1 y1 x2 y2
-    unpckhps %xmm10,%xmm8 ## x3 y3 x4 y4
-
-    ## update fjx and fjy
-        addps  %xmm9,%xmm0
-        addps  %xmm8,%xmm1
-
-        movlps %xmm0,(%rsi,%r8,4)
-        movlps %xmm1,(%rsi,%r10,4)
-        movhps %xmm0,(%rsi,%r9,4)
-        movhps %xmm1,(%rsi,%r11,4)
-
-    ## xmm11: fjz1 fjz2 fjz3 fjz4
-    pshufd $1,%xmm11,%xmm10 ## fjz2 - - -
-    movhlps %xmm11,%xmm9     ## fjz3 - - -
-    pshufd $3,%xmm11,%xmm8  ## fjz4 - - -
-
-        addss  8(%rsi,%r8,4),%xmm11
-        addss  8(%rsi,%r9,4),%xmm10
-        addss  8(%rsi,%r10,4),%xmm9
-        addss  8(%rsi,%r11,4),%xmm8
-        movss  %xmm11,8(%rsi,%r8,4)
-        movss  %xmm10,8(%rsi,%r9,4)
-        movss  %xmm9,8(%rsi,%r10,4)
-        movss  %xmm8,8(%rsi,%r11,4)
-
-        ## should we do one more iteration? 
-        subl $4,nb410_innerk(%rsp)
-        jl    _nb_kernel410_x86_64_sse.nb410_finish_inner
-        jmp   _nb_kernel410_x86_64_sse.nb410_unroll_loop
-_nb_kernel410_x86_64_sse.nb410_finish_inner: 
-        ## check if at least two particles remain 
-        addl $4,nb410_innerk(%rsp)
-        movl  nb410_innerk(%rsp),%edx
-        andl  $2,%edx
-        jnz   _nb_kernel410_x86_64_sse.nb410_dopair
-        jmp   _nb_kernel410_x86_64_sse.nb410_checksingle
-_nb_kernel410_x86_64_sse.nb410_dopair: 
-        movq  nb410_innerjjnr(%rsp),%rcx
-
-        movl  (%rcx),%eax
-        movl  4(%rcx),%ebx
-        addq $8,nb410_innerjjnr(%rsp)
-
-        ## load isaj
-        movq nb410_invsqrta(%rbp),%rsi
-        movss (%rsi,%rax,4),%xmm2
-        movss (%rsi,%rbx,4),%xmm6
-    unpcklps %xmm6,%xmm2
-
-        mulps  nb410_isai(%rsp),%xmm2
-
-        movaps %xmm2,nb410_isaprod(%rsp)
-        movaps %xmm2,%xmm1
-        mulps nb410_gbtsc(%rsp),%xmm1
-        movaps %xmm1,nb410_gbscale(%rsp)
-
-    mulps nb410_iq(%rsp),%xmm2
-        movq nb410_charge(%rbp),%rsi     ## base of charge[] 
-        movss (%rsi,%rax,4),%xmm3
-        movss (%rsi,%rbx,4),%xmm6
-    unpcklps %xmm6,%xmm3
-
-
-        mulps %xmm2,%xmm3
-        movaps %xmm3,nb410_qq(%rsp)
-
-     ## vdw parameters
-        movq nb410_type(%rbp),%rsi
-        movl (%rsi,%rax,4),%r12d
-        movl (%rsi,%rbx,4),%r13d
-        shll %r12d
-        shll %r13d
-    movl nb410_ntia(%rsp),%edi
-        addl %edi,%r12d
-        addl %edi,%r13d
-
-        movq nb410_vdwparam(%rbp),%rsi
-        movlps (%rsi,%r12,4),%xmm3
-        movhps (%rsi,%r13,4),%xmm3
-
-    xorps %xmm7,%xmm7
-        movaps %xmm3,%xmm0
-        shufps $136,%xmm7,%xmm0 ## 10001000
-        shufps $221,%xmm7,%xmm3 ## 11011101
-
-    movaps %xmm0,nb410_c6(%rsp)
-    movaps %xmm3,nb410_c12(%rsp)
-
-        movq nb410_pos(%rbp),%rsi        ## base of pos[] 
-
-        lea  (%rax,%rax,2),%r8     ## j3
-        lea  (%rbx,%rbx,2),%r9
-
-        ## move four coordinates to xmm0-xmm2   
-        movlps (%rsi,%r8,4),%xmm4       ## x1 y1 - - 
-        movlps (%rsi,%r9,4),%xmm5       ## x2 y2 - - 
-
-        movss 8(%rsi,%r8,4),%xmm6       ## z1 - - - 
-        movss 8(%rsi,%r9,4),%xmm7       ## z2 - - - 
-
-    unpcklps %xmm5,%xmm4 ## x1 x2 y1 y2
-    movhlps  %xmm4,%xmm5 ## y1 y2 -  -
-    unpcklps %xmm7,%xmm6 ## z1 z2 -  -
-
-        ## calc dr 
-        subps nb410_ix(%rsp),%xmm4
-        subps nb410_iy(%rsp),%xmm5
-        subps nb410_iz(%rsp),%xmm6
-
-        ## store dr 
-        movaps %xmm4,nb410_dx(%rsp)
-        movaps %xmm5,nb410_dy(%rsp)
-        movaps %xmm6,nb410_dz(%rsp)
-
-        ## square it 
-        mulps %xmm4,%xmm4
-        mulps %xmm5,%xmm5
-        mulps %xmm6,%xmm6
-        addps %xmm5,%xmm4
-        addps %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb410_three(%rsp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb410_half(%rsp),%xmm0
-        subps %xmm5,%xmm1       ## 30-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r
-        movaps %xmm4,nb410_r(%rsp)
-        mulps nb410_gbscale(%rsp),%xmm4
-
-    ## truncate and convert to integers
-    cvttps2dq %xmm4,%xmm5
-
-    ## convert back to float
-    cvtdq2ps  %xmm5,%xmm6
-
-    ## multiply by 4
-    pslld   $2,%xmm5
-
-    ## move to integer registers
-    movd    %xmm5,%r12d
-    pshufd $1,%xmm5,%xmm5
-    movd    %xmm5,%r13d
-
-    ## calculate eps
-    subps     %xmm6,%xmm4
-    movaps    %xmm4,%xmm1 ##eps
-
-        movq nb410_GBtab(%rbp),%rsi
-
-    movaps %xmm0,%xmm9 ## rinv
-    mulps  %xmm9,%xmm9 ## rinvsq
-    movaps %xmm9,%xmm10 ## rinvsq
-    mulps  %xmm10,%xmm10 ## rinv4
-    mulps  %xmm9,%xmm10 ## rinv6
-    movaps %xmm10,%xmm11
-    mulps  %xmm11,%xmm11 ## rinv12
-
-    ## load table data
-        movlps (%rsi,%r12,4),%xmm4  ## Y1 F1
-        movlps (%rsi,%r13,4),%xmm5  ## Y2 F2
-    unpcklps %xmm5,%xmm4        ## Y1 Y2 F1 F2
-    movhlps  %xmm4,%xmm5        ## F1 F2
-
-    mulps  nb410_c6(%rsp),%xmm10      ## vvdw6=c6*rinv6
-        mulps  nb410_c12(%rsp),%xmm11     ## vvdw12=c12*rinv12     
-
-        movaps %xmm11,%xmm9
-        subps  %xmm10,%xmm11    ## Vvdw=Vvdw12-Vvdw6
-
-    ## add potential to vvdwtot 
-        addps  nb410_Vvdwtot(%rsp),%xmm11
-    movlps %xmm11,nb410_Vvdwtot(%rsp)
-
-        movlps 8(%rsi,%r12,4),%xmm6      ## G1 H1
-        movlps 8(%rsi,%r13,4),%xmm7      ## G2 H2
-    unpcklps %xmm7,%xmm6             ## G1 G2
-    movhlps  %xmm6,%xmm7             ## H1 H2
-    ## table data ready in xmm4-xmm7
-
-    mulps  %xmm1,%xmm7  ## Heps
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm1,%xmm7      ## Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        addps  %xmm7,%xmm7      ## two*Heps2 
-        movaps nb410_qq(%rsp),%xmm3
-
-        addps  %xmm6,%xmm7
-        addps  %xmm5,%xmm7 ## xmm7=FF 
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulps  %xmm7,%xmm3 ## fijC=FF*qq 
-        ## at this point xmm5 contains vcoul and xmm3 fijC
-
-    ## LJ forces
-    mulps  nb410_six(%rsp),%xmm10
-    mulps  nb410_twelve(%rsp),%xmm9
-    subps  %xmm10,%xmm9
-    mulps  %xmm0,%xmm9 ## (12*vnb12-6*vnb6)*rinv
-
-    ## zero upper part of vcoul 
-    xorps %xmm2,%xmm2
-    movlhps %xmm2,%xmm5
-
-        movq nb410_dvda(%rbp),%rsi
-
-        ## Calculate dVda
-        xorps  %xmm7,%xmm7
-        mulps nb410_gbscale(%rsp),%xmm3
-        movaps %xmm3,%xmm6
-        mulps  nb410_r(%rsp),%xmm6
-        addps  %xmm5,%xmm6
-
-    xorps  %xmm4,%xmm4
-    ## increment vctot (sum in xmm12)
-        addps  %xmm5,%xmm12
-
-        ## xmm6=(vcoul+fijC*r)
-        subps  %xmm6,%xmm7
-        movaps %xmm7,%xmm6
-
-    ## zero upper half of dvda
-    movlhps %xmm4,%xmm7
-
-    ## update dvdasum
-    addps  nb410_dvdasum(%rsp),%xmm7
-    movaps %xmm7,nb410_dvdasum(%rsp)
-
-        ## update j atoms dvdaj
-        movaps  %xmm6,%xmm5
-        shufps $0x1,%xmm5,%xmm5
-
-        ## xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4
-        addss  (%rsi,%rax,4),%xmm6
-        addss  (%rsi,%rbx,4),%xmm5
-        movss  %xmm6,(%rsi,%rax,4)
-        movss  %xmm5,(%rsi,%rbx,4)
-
-    xorps %xmm7,%xmm7
-
-    subps  %xmm3,%xmm9
-    mulps  %xmm0,%xmm9 ## fscal
-
-    movaps  %xmm9,%xmm10
-    movaps  %xmm9,%xmm11
-
-    mulps   nb410_dx(%rsp),%xmm9
-    mulps   nb410_dy(%rsp),%xmm10
-    mulps   nb410_dz(%rsp),%xmm11
-
-    movlhps  %xmm7,%xmm9
-    movlhps  %xmm7,%xmm10
-    movlhps  %xmm7,%xmm11
-
-        ## accumulate i forces
-    addps %xmm9,%xmm13
-    addps %xmm10,%xmm14
-    addps %xmm11,%xmm15
-
-        movq nb410_faction(%rbp),%rsi
-        ## the fj's - start by accumulating x & y forces from memory 
-        movlps (%rsi,%r8,4),%xmm0 ## x1 y1 - -
-        movhps (%rsi,%r9,4),%xmm0 ## x1 y1 x2 y2
-
-    unpcklps %xmm10,%xmm9 ## x1 y1 x2 y2
-    addps    %xmm9,%xmm0
-
-        movlps %xmm0,(%rsi,%r8,4)
-        movhps %xmm0,(%rsi,%r9,4)
-
-    ## z forces
-    pshufd $1,%xmm11,%xmm8
-    addss  8(%rsi,%r8,4),%xmm11
-    addss  8(%rsi,%r9,4),%xmm8
-    movss  %xmm11,8(%rsi,%r8,4)
-    movss  %xmm8,8(%rsi,%r9,4)
-
-_nb_kernel410_x86_64_sse.nb410_checksingle:     
-        movl  nb410_innerk(%rsp),%edx
-        andl  $1,%edx
-        jnz    _nb_kernel410_x86_64_sse.nb410_dosingle
-        jmp    _nb_kernel410_x86_64_sse.nb410_updateouterdata
-_nb_kernel410_x86_64_sse.nb410_dosingle: 
-        movq nb410_charge(%rbp),%rsi
-        movq nb410_invsqrta(%rbp),%rdx
-        movq nb410_pos(%rbp),%rdi
-        movq  nb410_innerjjnr(%rsp),%rcx
-        movl  (%rcx),%eax
-
-        ## load isaj
-        movq nb410_invsqrta(%rbp),%rsi
-        movss (%rsi,%rax,4),%xmm3
-        movaps nb410_isai(%rsp),%xmm2
-        mulss  %xmm3,%xmm2
-
-        movss %xmm2,nb410_isaprod(%rsp)
-        movaps %xmm2,%xmm1
-        mulss nb410_gbtsc(%rsp),%xmm1
-        movss %xmm1,nb410_gbscale(%rsp)
-
-    mulss nb410_iq(%rsp),%xmm2
-        movq nb410_charge(%rbp),%rsi     ## base of charge[] 
-
-        movss (%rsi,%rax,4),%xmm3
-        mulss %xmm2,%xmm3
-        movss %xmm3,nb410_qq(%rsp)
-
-    ## vdw parameters
-        movq nb410_type(%rbp),%rsi
-        movl (%rsi,%rax,4),%r12d
-        shll %r12d
-    movl nb410_ntia(%rsp),%edi
-        addl %edi,%r12d
-
-        movq nb410_vdwparam(%rbp),%rsi
-        movss (%rsi,%r12,4),%xmm0
-        movss 4(%rsi,%r12,4),%xmm3
-    movaps %xmm0,nb410_c6(%rsp)
-    movaps %xmm3,nb410_c12(%rsp)
-
-        movq nb410_pos(%rbp),%rsi        ## base of pos[] 
-
-        lea  (%rax,%rax,2),%r8     ## jnr 
-
-        ## move four coordinates to xmm0-xmm2   
-        movss (%rsi,%r8,4),%xmm4
-        movss 4(%rsi,%r8,4),%xmm5
-        movss 8(%rsi,%r8,4),%xmm6
-
-        ## calc dr 
-        subss nb410_ix(%rsp),%xmm4
-        subss nb410_iy(%rsp),%xmm5
-        subss nb410_iz(%rsp),%xmm6
-
-        ## store dr 
-        movaps %xmm4,nb410_dx(%rsp)
-        movaps %xmm5,nb410_dy(%rsp)
-        movaps %xmm6,nb410_dz(%rsp)
-
-        ## square it 
-        mulss %xmm4,%xmm4
-        mulss %xmm5,%xmm5
-        mulss %xmm6,%xmm6
-        addss %xmm5,%xmm4
-        addss %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtss %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulss %xmm5,%xmm5
-        movaps nb410_three(%rsp),%xmm1
-        mulss %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb410_half(%rsp),%xmm0
-        subss %xmm5,%xmm1       ## 30-rsq*lu*lu 
-        mulss %xmm2,%xmm1
-        mulss %xmm1,%xmm0       ## xmm0=rinv 
-        mulss %xmm0,%xmm4       ## xmm4=r
-        movaps %xmm4,nb410_r(%rsp)
-        mulss nb410_gbscale(%rsp),%xmm4
-
-    ## truncate and convert to integers
-    cvttss2si %xmm4,%r12d
-
-    ## convert back to float
-    cvtsi2ss  %r12d,%xmm6
-
-    ## multiply by 4
-    shll  $2,%r12d
-
-    ## calculate eps
-    subss     %xmm6,%xmm4
-    movaps    %xmm4,%xmm1 ##eps
-
-        movq nb410_GBtab(%rbp),%rsi
-
-    movaps %xmm0,%xmm9 ## rinv
-    mulss  %xmm9,%xmm9 ## rinvsq
-    movaps %xmm9,%xmm10 ## rinvsq
-    mulss  %xmm10,%xmm10 ## rinv4
-    mulss  %xmm9,%xmm10 ## rinv6
-    movaps %xmm10,%xmm11
-    mulss  %xmm11,%xmm11 ## rinv12
-
-    ## load table data
-        movss (%rsi,%r12,4),%xmm4
-        movss 4(%rsi,%r12,4),%xmm5
-        movss 8(%rsi,%r12,4),%xmm6
-        movss 12(%rsi,%r12,4),%xmm7
-    ## table data ready in xmm4-xmm7
-
-    mulss  nb410_c6(%rsp),%xmm10      ## vvdw6=c6*rinv6
-        mulss  nb410_c12(%rsp),%xmm11     ## vvdw12=c12*rinv12     
-
-        movaps %xmm11,%xmm9
-        subss  %xmm10,%xmm11    ## Vvdw=Vvdw12-Vvdw6
-
-    ## add potential to vvdwtot 
-        addss  nb410_Vvdwtot(%rsp),%xmm11
-    movss %xmm11,nb410_Vvdwtot(%rsp)
-
-    mulss  %xmm1,%xmm7  ## Heps
-        mulss  %xmm1,%xmm6      ## xmm6=Geps 
-        mulss  %xmm1,%xmm7      ## Heps2 
-        addss  %xmm6,%xmm5
-        addss  %xmm7,%xmm5      ## xmm5=Fp      
-        addss  %xmm7,%xmm7      ## two*Heps2 
-        movss  nb410_qq(%rsp),%xmm3
-        addss  %xmm6,%xmm7
-        addss  %xmm5,%xmm7 ## xmm7=FF 
-        mulss  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addss  %xmm4,%xmm5 ## xmm5=VV 
-        mulss  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulss  %xmm7,%xmm3 ## fijC=FF*qq 
-        ## at this point xmm5 contains vcoul and xmm3 fijC
-
-    ## LJ forces
-    mulss  nb410_six(%rsp),%xmm10
-    mulss  nb410_twelve(%rsp),%xmm9
-    subss  %xmm10,%xmm9
-    mulss  %xmm0,%xmm9 ## (12*vnb12-6*vnb6)*rinv
-
-        movq nb410_dvda(%rbp),%rsi
-
-        ## Calculate dVda
-        xorps  %xmm7,%xmm7
-        mulss nb410_gbscale(%rsp),%xmm3
-        movaps %xmm3,%xmm6
-        mulss  nb410_r(%rsp),%xmm6
-        addss  %xmm5,%xmm6
-
-    ## increment vctot (sum in xmm12)
-        addss  %xmm5,%xmm12
-
-        ## xmm6=(vcoul+fijC*r)
-        subss  %xmm6,%xmm7
-        movaps %xmm7,%xmm6
-
-    ## update dvdasum
-    addss  nb410_dvdasum(%rsp),%xmm7
-    movss %xmm7,nb410_dvdasum(%rsp)
-
-        ## update j atoms dvdaj
-        addss  (%rsi,%rax,4),%xmm6
-        movss  %xmm6,(%rsi,%rax,4)
-
-    subss  %xmm3,%xmm9
-    mulss  %xmm0,%xmm9 ## fscal
-
-    movaps  %xmm9,%xmm10
-    movaps  %xmm9,%xmm11
-
-    mulss   nb410_dx(%rsp),%xmm9
-    mulss   nb410_dy(%rsp),%xmm10
-    mulss   nb410_dz(%rsp),%xmm11
-
-        ## accumulate i forces
-    addss %xmm9,%xmm13
-    addss %xmm10,%xmm14
-    addss %xmm11,%xmm15
-
-        movq nb410_faction(%rbp),%rsi
-    ## add to j forces
-    addss  (%rsi,%r8,4),%xmm9
-    addss  4(%rsi,%r8,4),%xmm10
-    addss  8(%rsi,%r8,4),%xmm11
-    movss  %xmm9,(%rsi,%r8,4)
-    movss  %xmm10,4(%rsi,%r8,4)
-    movss  %xmm11,8(%rsi,%r8,4)
-
-_nb_kernel410_x86_64_sse.nb410_updateouterdata: 
-        movl  nb410_ii3(%rsp),%ecx
-        movq  nb410_faction(%rbp),%rdi
-        movq  nb410_fshift(%rbp),%rsi
-        movl  nb410_is3(%rsp),%edx
-
-        ## accumulate i forces in xmm13, xmm14, xmm15
-        movhlps %xmm13,%xmm0
-        movhlps %xmm14,%xmm1
-        movhlps %xmm15,%xmm2
-        addps  %xmm13,%xmm0
-        addps  %xmm14,%xmm1
-        addps  %xmm15,%xmm2
-    movaps %xmm0,%xmm3
-        movaps %xmm1,%xmm4
-        movaps %xmm2,%xmm5
-        shufps $1,%xmm3,%xmm3
-        shufps $1,%xmm4,%xmm4
-        shufps $1,%xmm5,%xmm5
-        addss  %xmm3,%xmm0
-        addss  %xmm4,%xmm1
-        addss  %xmm5,%xmm2      ## xmm0-xmm2 has single force in pos0 
-
-
-        ## increment i force 
-        movss  (%rdi,%rcx,4),%xmm3
-        movss  4(%rdi,%rcx,4),%xmm4
-        movss  8(%rdi,%rcx,4),%xmm5
-        subss  %xmm0,%xmm3
-        subss  %xmm1,%xmm4
-        subss  %xmm2,%xmm5
-        movss  %xmm3,(%rdi,%rcx,4)
-        movss  %xmm4,4(%rdi,%rcx,4)
-        movss  %xmm5,8(%rdi,%rcx,4)
-
-        ## increment fshift force  
-        movss  (%rsi,%rdx,4),%xmm3
-        movss  4(%rsi,%rdx,4),%xmm4
-        movss  8(%rsi,%rdx,4),%xmm5
-        subss  %xmm0,%xmm3
-        subss  %xmm1,%xmm4
-        subss  %xmm2,%xmm5
-        movss  %xmm3,(%rsi,%rdx,4)
-        movss  %xmm4,4(%rsi,%rdx,4)
-        movss  %xmm5,8(%rsi,%rdx,4)
-
-        ## get n from stack
-        movl nb410_n(%rsp),%esi
-        ## get group index for i particle 
-        movq  nb410_gid(%rbp),%rdx              ## base of gid[]
-        movl  (%rdx,%rsi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total potential energy and update it 
-        ## accumulate 
-        movhlps %xmm12,%xmm6
-        addps  %xmm6,%xmm12     ## pos 0-1 in xmm12 have the sum now 
-        movaps %xmm12,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm12
-
-        ## add earlier value from mem 
-        movq  nb410_Vc(%rbp),%rax
-        addss (%rax,%rdx,4),%xmm12
-        ## move back to mem 
-        movss %xmm12,(%rax,%rdx,4)
-
-        ## accumulate total lj energy and update it 
-        movaps nb410_Vvdwtot(%rsp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        ## add earlier value from mem 
-        movq  nb410_Vvdw(%rbp),%rax
-        addss (%rax,%rdx,4),%xmm7
-        ## move back to mem 
-        movss %xmm7,(%rax,%rdx,4)
-
-        ## accumulate dVda and update it 
-        movaps nb410_dvdasum(%rsp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        movl nb410_ii(%rsp),%edx
-        movq nb410_dvda(%rbp),%rax
-        addss (%rax,%rdx,4),%xmm7
-        movss %xmm7,(%rax,%rdx,4)
-
-        ## finish if last 
-        movl nb410_nn1(%rsp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx
-        jz _nb_kernel410_x86_64_sse.nb410_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb410_n(%rsp)
-        jmp _nb_kernel410_x86_64_sse.nb410_outer
-_nb_kernel410_x86_64_sse.nb410_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb410_nri(%rsp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx
-        jz _nb_kernel410_x86_64_sse.nb410_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel410_x86_64_sse.nb410_threadloop
-_nb_kernel410_x86_64_sse.nb410_end: 
-
-        movl nb410_nouter(%rsp),%eax
-        movl nb410_ninner(%rsp),%ebx
-        movq nb410_outeriter(%rbp),%rcx
-        movq nb410_inneriter(%rbp),%rdx
-        movl %eax,(%rcx)
-        movl %ebx,(%rdx)
-
-        addq $568,%rsp
-        emms
-
-
-        pop %r15
-        pop %r14
-        pop %r13
-        pop %r12
-
-        pop %rbx
-        pop    %rbp
-        ret
-
-
-
-.globl nb_kernel410nf_x86_64_sse
-.globl _nb_kernel410nf_x86_64_sse
-nb_kernel410nf_x86_64_sse:      
-_nb_kernel410nf_x86_64_sse:     
-##      Room for return address and rbp (16 bytes)
-.set nb410nf_fshift, 16
-.set nb410nf_gid, 24
-.set nb410nf_pos, 32
-.set nb410nf_faction, 40
-.set nb410nf_charge, 48
-.set nb410nf_p_facel, 56
-.set nb410nf_argkrf, 64
-.set nb410nf_argcrf, 72
-.set nb410nf_Vc, 80
-.set nb410nf_type, 88
-.set nb410nf_p_ntype, 96
-.set nb410nf_vdwparam, 104
-.set nb410nf_Vvdw, 112
-.set nb410nf_p_tabscale, 120
-.set nb410nf_VFtab, 128
-.set nb410nf_invsqrta, 136
-.set nb410nf_dvda, 144
-.set nb410nf_p_gbtabscale, 152
-.set nb410nf_GBtab, 160
-.set nb410nf_p_nthreads, 168
-.set nb410nf_count, 176
-.set nb410nf_mtx, 184
-.set nb410nf_outeriter, 192
-.set nb410nf_inneriter, 200
-.set nb410nf_work, 208
-        ## stack offsets for local variables  
-        ## bottom of stack is cache-aligned for sse use 
-.set nb410nf_ix, 0
-.set nb410nf_iy, 16
-.set nb410nf_iz, 32
-.set nb410nf_iq, 48
-.set nb410nf_gbtsc, 64
-.set nb410nf_qq, 80
-.set nb410nf_c6, 96
-.set nb410nf_c12, 112
-.set nb410nf_vctot, 128
-.set nb410nf_Vvdwtot, 144
-.set nb410nf_half, 160
-.set nb410nf_three, 176
-.set nb410nf_isai, 192
-.set nb410nf_isaprod, 208
-.set nb410nf_gbscale, 224
-.set nb410nf_nri, 240
-.set nb410nf_iinr, 248
-.set nb410nf_jindex, 256
-.set nb410nf_jjnr, 264
-.set nb410nf_shift, 272
-.set nb410nf_shiftvec, 280
-.set nb410nf_facel, 288
-.set nb410nf_innerjjnr, 296
-.set nb410nf_is3, 304
-.set nb410nf_ii3, 308
-.set nb410nf_ntia, 312
-.set nb410nf_innerk, 316
-.set nb410nf_n, 320
-.set nb410nf_nn1, 324
-.set nb410nf_ntype, 328
-.set nb410nf_nouter, 332
-.set nb410nf_ninner, 336
-
-        push %rbp
-        movq %rsp,%rbp
-        push %rbx
-
-
-        emms
-
-        push %r12
-        push %r13
-        push %r14
-        push %r15
-
-        subq $360,%rsp          ## local variable stack space (n*16+8)
-
-        ## zero 32-bit iteration counters
-        movl $0,%eax
-        movl %eax,nb410nf_nouter(%rsp)
-        movl %eax,nb410nf_ninner(%rsp)
-
-        movl (%rdi),%edi
-        movl %edi,nb410nf_nri(%rsp)
-        movq %rsi,nb410nf_iinr(%rsp)
-        movq %rdx,nb410nf_jindex(%rsp)
-        movq %rcx,nb410nf_jjnr(%rsp)
-        movq %r8,nb410nf_shift(%rsp)
-        movq %r9,nb410nf_shiftvec(%rsp)
-        movq nb410nf_p_ntype(%rbp),%rdi
-        movl (%rdi),%edi
-        movl %edi,nb410nf_ntype(%rsp)
-        movq nb410nf_p_facel(%rbp),%rsi
-        movss (%rsi),%xmm0
-        movss %xmm0,nb410nf_facel(%rsp)
-
-        movq nb410nf_p_gbtabscale(%rbp),%rbx
-        movss (%rbx),%xmm4
-        shufps $0,%xmm4,%xmm4
-        movaps %xmm4,nb410nf_gbtsc(%rsp)
-
-
-        ## create constant floating-point factors on stack
-        movl $0x3f000000,%eax   ## half in IEEE (hex)
-        movl %eax,nb410nf_half(%rsp)
-        movss nb410nf_half(%rsp),%xmm1
-        shufps $0,%xmm1,%xmm1  ## splat to all elements
-        movaps %xmm1,%xmm2
-        addps  %xmm2,%xmm2      ## one
-        movaps %xmm2,%xmm3
-        addps  %xmm2,%xmm2      ## two
-        addps  %xmm2,%xmm3      ## three
-        movaps %xmm1,nb410nf_half(%rsp)
-        movaps %xmm3,nb410nf_three(%rsp)
-
-_nb_kernel410nf_x86_64_sse.nb410nf_threadloop: 
-        movq  nb410nf_count(%rbp),%rsi            ## pointer to sync counter
-        movl  (%rsi),%eax
-_nb_kernel410nf_x86_64_sse.nb410nf_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+10
-        lock 
-        cmpxchgl %ebx,(%rsi)                    ## write nn1 to *counter,
-                                                ## if it hasnt changed.
-                                                ## or reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel410nf_x86_64_sse.nb410nf_spinlock
-
-        ## if(nn1>nri) nn1=nri
-        movl nb410nf_nri(%rsp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx
-        cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb410nf_n(%rsp)
-        movl %ebx,nb410nf_nn1(%rsp)
-        subl %eax,%ebx                          ## calc number of outer lists
-        movl %eax,%esi                          ## copy n to esi
-        jg  _nb_kernel410nf_x86_64_sse.nb410nf_outerstart
-        jmp _nb_kernel410nf_x86_64_sse.nb410nf_end
-
-_nb_kernel410nf_x86_64_sse.nb410nf_outerstart: 
-        ## ebx contains number of outer iterations
-        addl nb410nf_nouter(%rsp),%ebx
-        movl %ebx,nb410nf_nouter(%rsp)
-
-_nb_kernel410nf_x86_64_sse.nb410nf_outer: 
-        movq  nb410nf_shift(%rsp),%rax        ## rax = pointer into shift[] 
-        movl  (%rax,%rsi,4),%ebx        ## ebx=shift[n] 
-
-        lea  (%rbx,%rbx,2),%rbx    ## rbx=3*is 
-        movl  %ebx,nb410nf_is3(%rsp)            ## store is3 
-
-        movq  nb410nf_shiftvec(%rsp),%rax     ## rax = base of shiftvec[] 
-
-        movss (%rax,%rbx,4),%xmm0
-        movss 4(%rax,%rbx,4),%xmm1
-        movss 8(%rax,%rbx,4),%xmm2
-
-        movq  nb410nf_iinr(%rsp),%rcx         ## rcx = pointer into iinr[]      
-        movl  (%rcx,%rsi,4),%ebx            ## ebx =ii
-
-        movq  nb410nf_charge(%rbp),%rdx
-        movss (%rdx,%rbx,4),%xmm3
-        mulss nb410nf_facel(%rsp),%xmm3
-        shufps $0,%xmm3,%xmm3
-
-        movq  nb410nf_invsqrta(%rbp),%rdx       ## load invsqrta[ii]
-        movss (%rdx,%rbx,4),%xmm4
-        shufps $0,%xmm4,%xmm4
-
-        movq  nb410nf_type(%rbp),%rdx
-        movl  (%rdx,%rbx,4),%edx
-        imull nb410nf_ntype(%rsp),%edx
-        shll  %edx
-        movl  %edx,nb410nf_ntia(%rsp)
-
-        lea  (%rbx,%rbx,2),%rbx        ## rbx = 3*ii=ii3 
-        movq  nb410nf_pos(%rbp),%rax      ## rax = base of pos[]  
-
-        addss (%rax,%rbx,4),%xmm0
-        addss 4(%rax,%rbx,4),%xmm1
-        addss 8(%rax,%rbx,4),%xmm2
-
-        movaps %xmm3,nb410nf_iq(%rsp)
-        movaps %xmm4,nb410nf_isai(%rsp)
-
-        shufps $0,%xmm0,%xmm0
-        shufps $0,%xmm1,%xmm1
-        shufps $0,%xmm2,%xmm2
-
-        movaps %xmm0,nb410nf_ix(%rsp)
-        movaps %xmm1,nb410nf_iy(%rsp)
-        movaps %xmm2,nb410nf_iz(%rsp)
-
-        movl  %ebx,nb410nf_ii3(%rsp)
-
-        ## clear vctot
-        xorps %xmm4,%xmm4
-        movaps %xmm4,nb410nf_vctot(%rsp)
-        movaps %xmm4,nb410nf_Vvdwtot(%rsp)
-
-        movq  nb410nf_jindex(%rsp),%rax
-        movl  (%rax,%rsi,4),%ecx             ## jindex[n] 
-        movl  4(%rax,%rsi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movq  nb410nf_pos(%rbp),%rsi
-        movq  nb410nf_faction(%rbp),%rdi
-        movq  nb410nf_jjnr(%rsp),%rax
-        shll  $2,%ecx
-        addq  %rcx,%rax
-        movq  %rax,nb410nf_innerjjnr(%rsp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $4,%edx
-        addl  nb410nf_ninner(%rsp),%ecx
-        movl  %ecx,nb410nf_ninner(%rsp)
-        addl  $0,%edx
-        movl  %edx,nb410nf_innerk(%rsp)      ## number of innerloop atoms 
-        jge   _nb_kernel410nf_x86_64_sse.nb410nf_unroll_loop
-        jmp   _nb_kernel410nf_x86_64_sse.nb410nf_finish_inner
-_nb_kernel410nf_x86_64_sse.nb410nf_unroll_loop: 
-        ## quad-unroll innerloop here 
-        movq  nb410nf_innerjjnr(%rsp),%rdx       ## pointer to jjnr[k] 
-        movl  (%rdx),%eax
-        movl  4(%rdx),%ebx
-        movl  8(%rdx),%ecx
-        movl  12(%rdx),%edx           ## eax-edx=jnr1-4 
-        addq $16,nb410nf_innerjjnr(%rsp)             ## advance pointer (unrolled 4) 
-
-        ## load isa2
-        movq nb410nf_invsqrta(%rbp),%rsi
-        movss (%rsi,%rax,4),%xmm3
-        movss (%rsi,%rcx,4),%xmm4
-        movss (%rsi,%rbx,4),%xmm6
-        movss (%rsi,%rdx,4),%xmm7
-        movaps nb410nf_isai(%rsp),%xmm2
-        shufps $0,%xmm6,%xmm3
-        shufps $0,%xmm7,%xmm4
-        shufps $136,%xmm4,%xmm3 ## 10001000 ;# all charges in xmm3  
-        mulps  %xmm3,%xmm2
-
-        movaps %xmm2,nb410nf_isaprod(%rsp)
-        movaps %xmm2,%xmm1
-        mulps nb410nf_gbtsc(%rsp),%xmm1
-        movaps %xmm1,nb410nf_gbscale(%rsp)
-
-        movq nb410nf_charge(%rbp),%rsi     ## base of charge[] 
-
-        movss (%rsi,%rax,4),%xmm3
-        movss (%rsi,%rcx,4),%xmm4
-        movss (%rsi,%rbx,4),%xmm6
-        movss (%rsi,%rdx,4),%xmm7
-
-        mulps nb410nf_iq(%rsp),%xmm2
-        shufps $0,%xmm6,%xmm3
-        shufps $0,%xmm7,%xmm4
-        shufps $136,%xmm4,%xmm3 ## 10001000 ;# all charges in xmm3  
-        mulps  %xmm2,%xmm3
-        movaps %xmm3,nb410nf_qq(%rsp)
-
-        movd %eax,%mm0
-        movd %ebx,%mm1
-        movd %ecx,%mm2
-        movd %edx,%mm3
-
-        movq nb410nf_type(%rbp),%rsi
-        movl (%rsi,%rax,4),%eax
-        movl (%rsi,%rbx,4),%ebx
-        movl (%rsi,%rcx,4),%ecx
-        movl (%rsi,%rdx,4),%edx
-        movq nb410nf_vdwparam(%rbp),%rsi
-        shll %eax
-        shll %ebx
-        shll %ecx
-        shll %edx
-        movl nb410nf_ntia(%rsp),%edi
-        addl %edi,%eax
-        addl %edi,%ebx
-        addl %edi,%ecx
-        addl %edi,%edx
-
-        movlps (%rsi,%rax,4),%xmm6
-        movlps (%rsi,%rcx,4),%xmm7
-        movhps (%rsi,%rbx,4),%xmm6
-        movhps (%rsi,%rdx,4),%xmm7
-
-        movaps %xmm6,%xmm4
-        shufps $136,%xmm7,%xmm4 ## 10001000
-        shufps $221,%xmm7,%xmm6 ## 11011101
-
-        movd  %mm0,%eax
-        movd  %mm1,%ebx
-        movd  %mm2,%ecx
-        movd  %mm3,%edx
-
-        movaps %xmm4,nb410nf_c6(%rsp)
-        movaps %xmm6,nb410nf_c12(%rsp)
-
-        movq nb410nf_pos(%rbp),%rsi        ## base of pos[] 
-
-        lea  (%rax,%rax,2),%rax     ## replace jnr with j3 
-        lea  (%rbx,%rbx,2),%rbx
-
-        lea  (%rcx,%rcx,2),%rcx     ## replace jnr with j3 
-        lea  (%rdx,%rdx,2),%rdx
-
-        ## move four coordinates to xmm0-xmm2   
-
-        movlps (%rsi,%rax,4),%xmm4
-        movlps (%rsi,%rcx,4),%xmm5
-        movss 8(%rsi,%rax,4),%xmm2
-        movss 8(%rsi,%rcx,4),%xmm6
-
-        movhps (%rsi,%rbx,4),%xmm4
-        movhps (%rsi,%rdx,4),%xmm5
-
-        movss 8(%rsi,%rbx,4),%xmm0
-        movss 8(%rsi,%rdx,4),%xmm1
-
-        shufps $0,%xmm0,%xmm2
-        shufps $0,%xmm1,%xmm6
-
-        movaps %xmm4,%xmm0
-        movaps %xmm4,%xmm1
-
-        shufps $136,%xmm6,%xmm2 ## 10001000
-
-        shufps $136,%xmm5,%xmm0 ## 10001000
-        shufps $221,%xmm5,%xmm1 ## 11011101             
-
-        ## move ix-iz to xmm4-xmm6 
-        movaps nb410nf_ix(%rsp),%xmm4
-        movaps nb410nf_iy(%rsp),%xmm5
-        movaps nb410nf_iz(%rsp),%xmm6
-
-        ## calc dr 
-        subps %xmm0,%xmm4
-        subps %xmm1,%xmm5
-        subps %xmm2,%xmm6
-
-        ## square it 
-        mulps %xmm4,%xmm4
-        mulps %xmm5,%xmm5
-        mulps %xmm6,%xmm6
-        addps %xmm5,%xmm4
-        addps %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb410nf_three(%rsp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb410nf_half(%rsp),%xmm0
-        subps %xmm5,%xmm1       ## 30-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r 
-        mulps nb410nf_gbscale(%rsp),%xmm4
-
-        movhlps %xmm4,%xmm5
-        cvttps2pi %xmm4,%mm6
-        cvttps2pi %xmm5,%mm7    ## mm6/mm7 contain lu indices 
-        cvtpi2ps %mm6,%xmm6
-        cvtpi2ps %mm7,%xmm5
-        movlhps %xmm5,%xmm6
-        subps %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-        pslld $2,%mm6
-        pslld $2,%mm7
-
-        movd %eax,%mm0
-        movd %ebx,%mm1
-        movd %ecx,%mm2
-        movd %edx,%mm3
-
-        movq nb410nf_GBtab(%rbp),%rsi
-        movd %mm6,%eax
-        psrlq $32,%mm6
-        movd %mm7,%ecx
-        psrlq $32,%mm7
-        movd %mm6,%ebx
-        movd %mm7,%edx
-
-        ## load coulomb table
-        movaps (%rsi,%rax,4),%xmm4
-        movaps (%rsi,%rbx,4),%xmm5
-        movaps (%rsi,%rcx,4),%xmm6
-        movaps (%rsi,%rdx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm6,%xmm3
-        shufps $0xEE,%xmm7,%xmm3
-        shufps $0x44,%xmm7,%xmm6
-        movaps %xmm4,%xmm7
-        shufps $0xEE,%xmm5,%xmm7
-        shufps $0x44,%xmm5,%xmm4
-        movaps %xmm4,%xmm5
-        shufps $0xDD,%xmm6,%xmm5
-        shufps $0x88,%xmm6,%xmm4
-        movaps %xmm7,%xmm6
-        shufps $0x88,%xmm3,%xmm6
-        shufps $0xDD,%xmm3,%xmm7
-        ## coulomb table ready, in xmm4-xmm7            
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        movaps nb410nf_qq(%rsp),%xmm3
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  %xmm3,%xmm5 ## vcoul=qq*VV  
-        ## update vctot
-        addps  nb410nf_vctot(%rsp),%xmm5
-        movaps %xmm5,nb410nf_vctot(%rsp)
-
-        ## L-J 
-        movaps %xmm0,%xmm4
-        mulps  %xmm0,%xmm4      ## xmm4=rinvsq 
-
-        movaps %xmm4,%xmm6
-        mulps  %xmm4,%xmm6
-
-        mulps  %xmm4,%xmm6      ## xmm6=rinvsix 
-        movaps %xmm6,%xmm4
-        mulps  %xmm4,%xmm4      ## xmm4=rinvtwelve 
-        mulps  nb410nf_c6(%rsp),%xmm6
-        mulps  nb410nf_c12(%rsp),%xmm4
-        movaps nb410nf_Vvdwtot(%rsp),%xmm7
-        addps  %xmm4,%xmm7
-        subps  %xmm6,%xmm7
-        movaps %xmm7,nb410nf_Vvdwtot(%rsp)
-
-        ## should we do one more iteration? 
-        subl $4,nb410nf_innerk(%rsp)
-        jl    _nb_kernel410nf_x86_64_sse.nb410nf_finish_inner
-        jmp   _nb_kernel410nf_x86_64_sse.nb410nf_unroll_loop
-_nb_kernel410nf_x86_64_sse.nb410nf_finish_inner: 
-        ## check if at least two particles remain 
-        addl $4,nb410nf_innerk(%rsp)
-        movl  nb410nf_innerk(%rsp),%edx
-        andl  $2,%edx
-        jnz   _nb_kernel410nf_x86_64_sse.nb410nf_dopair
-        jmp   _nb_kernel410nf_x86_64_sse.nb410nf_checksingle
-_nb_kernel410nf_x86_64_sse.nb410nf_dopair: 
-        movq  nb410nf_innerjjnr(%rsp),%rcx
-        movl  (%rcx),%eax
-        movl  4(%rcx),%ebx
-        addq $8,nb410nf_innerjjnr(%rsp)
-
-        xorps %xmm2,%xmm2
-        movaps %xmm2,%xmm6
-
-        ## load isa2
-        movq nb410nf_invsqrta(%rbp),%rsi
-        movss (%rsi,%rax,4),%xmm2
-        movss (%rsi,%rbx,4),%xmm3
-        unpcklps %xmm3,%xmm2    ## isa2 in xmm3(0,1)
-        mulps  nb410nf_isai(%rsp),%xmm2
-        movaps %xmm2,nb410nf_isaprod(%rsp)
-        movaps %xmm2,%xmm1
-        mulps nb410nf_gbtsc(%rsp),%xmm1
-        movaps %xmm1,nb410nf_gbscale(%rsp)
-
-        movq nb410nf_charge(%rbp),%rsi     ## base of charge[]  
-        movss (%rsi,%rax,4),%xmm3
-        movss (%rsi,%rbx,4),%xmm6
-        unpcklps %xmm6,%xmm3 ## 00001000 ;# xmm3(0,1) has the charges 
-
-        mulps  nb410nf_iq(%rsp),%xmm2
-        mulps  %xmm2,%xmm3
-        movaps %xmm3,nb410nf_qq(%rsp)
-
-        movq nb410nf_type(%rbp),%rsi
-        movl  %eax,%ecx
-        movl  %ebx,%edx
-        movl (%rsi,%rcx,4),%ecx
-        movl (%rsi,%rdx,4),%edx
-        movq nb410nf_vdwparam(%rbp),%rsi
-        shll %ecx
-        shll %edx
-        movl nb410nf_ntia(%rsp),%edi
-        addl %edi,%ecx
-        addl %edi,%edx
-        movlps (%rsi,%rcx,4),%xmm6
-        movhps (%rsi,%rdx,4),%xmm6
-        movq nb410nf_pos(%rbp),%rdi
-
-        movaps %xmm6,%xmm4
-        shufps $8,%xmm4,%xmm4 ## 00001000        
-        shufps $13,%xmm6,%xmm6 ## 00001101
-        movlhps %xmm7,%xmm4
-        movlhps %xmm7,%xmm6
-
-        movaps %xmm4,nb410nf_c6(%rsp)
-        movaps %xmm6,nb410nf_c12(%rsp)
-
-        lea  (%rax,%rax,2),%rax
-        lea  (%rbx,%rbx,2),%rbx
-        ## move coordinates to xmm0-xmm2 
-        movlps (%rdi,%rax,4),%xmm1
-        movss 8(%rdi,%rax,4),%xmm2
-        movhps (%rdi,%rbx,4),%xmm1
-        movss 8(%rdi,%rbx,4),%xmm0
-
-        movlhps %xmm7,%xmm3
-
-        shufps $0,%xmm0,%xmm2
-
-        movaps %xmm1,%xmm0
-
-        shufps $136,%xmm2,%xmm2 ## 10001000
-
-        shufps $136,%xmm0,%xmm0 ## 10001000
-        shufps $221,%xmm1,%xmm1 ## 11011101
-
-        movq   nb410nf_faction(%rbp),%rdi
-        ## move ix-iz to xmm4-xmm6 
-        xorps   %xmm7,%xmm7
-
-        movaps nb410nf_ix(%rsp),%xmm4
-        movaps nb410nf_iy(%rsp),%xmm5
-        movaps nb410nf_iz(%rsp),%xmm6
-
-        ## calc dr 
-        subps %xmm0,%xmm4
-        subps %xmm1,%xmm5
-        subps %xmm2,%xmm6
-
-        ## square it 
-        mulps %xmm4,%xmm4
-        mulps %xmm5,%xmm5
-        mulps %xmm6,%xmm6
-        addps %xmm5,%xmm4
-        addps %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb410nf_three(%rsp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb410nf_half(%rsp),%xmm0
-        subps %xmm5,%xmm1       ## 30-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r 
-        mulps nb410nf_gbscale(%rsp),%xmm4
-
-        cvttps2pi %xmm4,%mm6    ## mm6 contain lu indices 
-        cvtpi2ps %mm6,%xmm6
-        subps %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-
-        pslld $2,%mm6
-
-        movq nb410nf_GBtab(%rbp),%rsi
-        movd %mm6,%ecx
-        psrlq $32,%mm6
-        movd %mm6,%edx
-
-        ## load coulomb table
-        movaps (%rsi,%rcx,4),%xmm4
-        movaps (%rsi,%rdx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm4,%xmm6
-        unpcklps %xmm7,%xmm4    ## Y1 Y2 F1 F2 
-        unpckhps %xmm7,%xmm6    ## G1 G2 H1 H2
-        movhlps  %xmm4,%xmm5    ## F1 F2 
-        movhlps  %xmm6,%xmm7    ## H1 H2
-        ## coulomb table ready, in xmm4-xmm7    
-
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        movaps nb410nf_qq(%rsp),%xmm3
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  %xmm3,%xmm5 ## vcoul=qq*VV  
-
-        addps  nb410nf_vctot(%rsp),%xmm5
-        movaps %xmm5,nb410nf_vctot(%rsp)
-
-        ## L-J 
-        movaps %xmm0,%xmm4
-        mulps  %xmm0,%xmm4      ## xmm4=rinvsq 
-
-        ## at this point mm5 contains vcoul and mm3 fijC 
-        ## increment vcoul - then we can get rid of mm5 
-        ## update vctot 
-
-        movaps %xmm4,%xmm6
-        mulps  %xmm4,%xmm6
-
-        mulps  %xmm4,%xmm6      ## xmm6=rinvsix 
-        movaps %xmm6,%xmm4
-        mulps  %xmm4,%xmm4      ## xmm4=rinvtwelve 
-        mulps  nb410nf_c6(%rsp),%xmm6
-        mulps  nb410nf_c12(%rsp),%xmm4
-        movaps nb410nf_Vvdwtot(%rsp),%xmm7
-        addps  %xmm4,%xmm7
-        subps  %xmm6,%xmm7
-        movaps %xmm7,nb410nf_Vvdwtot(%rsp)
-
-_nb_kernel410nf_x86_64_sse.nb410nf_checksingle: 
-        movl  nb410nf_innerk(%rsp),%edx
-        andl  $1,%edx
-        jnz    _nb_kernel410nf_x86_64_sse.nb410nf_dosingle
-        jmp    _nb_kernel410nf_x86_64_sse.nb410nf_updateouterdata
-_nb_kernel410nf_x86_64_sse.nb410nf_dosingle: 
-        movq nb410nf_charge(%rbp),%rsi
-        movq nb410nf_invsqrta(%rbp),%rdx
-        movq nb410nf_pos(%rbp),%rdi
-        movq  nb410nf_innerjjnr(%rsp),%rcx
-        movl  (%rcx),%eax
-        xorps  %xmm2,%xmm2
-        movaps %xmm2,%xmm6
-        movss (%rdx,%rax,4),%xmm2       ## isa2
-        mulss nb410nf_isai(%rsp),%xmm2
-        movss %xmm2,nb410nf_isaprod(%rsp)
-        movss %xmm2,%xmm1
-        mulss nb410nf_gbtsc(%rsp),%xmm1
-        movss %xmm1,nb410nf_gbscale(%rsp)
-
-        mulss  nb410nf_iq(%rsp),%xmm2
-        movss (%rsi,%rax,4),%xmm6       ## xmm6(0) has the charge       
-        mulss  %xmm2,%xmm6
-        movss %xmm6,nb410nf_qq(%rsp)
-
-        movq nb410nf_type(%rbp),%rsi
-        movl %eax,%ecx
-        movl (%rsi,%rcx,4),%ecx
-        movq nb410nf_vdwparam(%rbp),%rsi
-        shll %ecx
-        addl nb410nf_ntia(%rsp),%ecx
-        movlps (%rsi,%rcx,4),%xmm6
-        movaps %xmm6,%xmm4
-        shufps $252,%xmm4,%xmm4 ## 11111100     
-        shufps $253,%xmm6,%xmm6 ## 11111101     
-
-        movaps %xmm4,nb410nf_c6(%rsp)
-        movaps %xmm6,nb410nf_c12(%rsp)
-
-        lea  (%rax,%rax,2),%rax
-
-        ## move coordinates to xmm0-xmm2 
-        movss (%rdi,%rax,4),%xmm0
-        movss 4(%rdi,%rax,4),%xmm1
-        movss 8(%rdi,%rax,4),%xmm2
-
-        movaps nb410nf_ix(%rsp),%xmm4
-        movaps nb410nf_iy(%rsp),%xmm5
-        movaps nb410nf_iz(%rsp),%xmm6
-
-        ## calc dr 
-        subss %xmm0,%xmm4
-        subss %xmm1,%xmm5
-        subss %xmm2,%xmm6
-
-        ## square it 
-        mulss %xmm4,%xmm4
-        mulss %xmm5,%xmm5
-        mulss %xmm6,%xmm6
-        addss %xmm5,%xmm4
-        addss %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtss %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulss %xmm5,%xmm5
-        movss nb410nf_three(%rsp),%xmm1
-        mulss %xmm4,%xmm5       ## rsq*lu*lu                    
-        movss nb410nf_half(%rsp),%xmm0
-        subss %xmm5,%xmm1       ## 30-rsq*lu*lu 
-        mulss %xmm2,%xmm1
-        mulss %xmm1,%xmm0       ## xmm0=rinv 
-
-        mulss %xmm0,%xmm4       ## xmm4=r 
-        mulss nb410nf_gbscale(%rsp),%xmm4
-
-        cvttss2si %xmm4,%ebx    ## mm6 contain lu indices 
-        cvtsi2ss %ebx,%xmm6
-        subss %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulss  %xmm2,%xmm2      ## xmm2=eps2 
-
-        shll $2,%ebx
-        movq nb410nf_GBtab(%rbp),%rsi
-
-        movaps (%rsi,%rbx,4),%xmm4
-        movhlps %xmm4,%xmm6
-        movaps %xmm4,%xmm5
-        movaps %xmm6,%xmm7
-        shufps $1,%xmm5,%xmm5
-        shufps $1,%xmm7,%xmm7
-        ## table ready in xmm4-xmm7 
-
-        mulss  %xmm1,%xmm6      ## xmm6=Geps 
-        mulss  %xmm2,%xmm7      ## xmm7=Heps2 
-        addss  %xmm6,%xmm5
-        addss  %xmm7,%xmm5      ## xmm5=Fp      
-        movss nb410nf_qq(%rsp),%xmm3
-        mulss  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addss  %xmm4,%xmm5 ## xmm5=VV 
-        mulss  %xmm3,%xmm5 ## vcoul=qq*VV  
-        addss  nb410nf_vctot(%rsp),%xmm5
-        movss %xmm5,nb410nf_vctot(%rsp)
-
-        ## L-J 
-        movaps %xmm0,%xmm4
-        mulss  %xmm0,%xmm4      ## xmm4=rinvsq 
-
-        movaps %xmm4,%xmm6
-        mulss  %xmm4,%xmm6
-
-        mulss  %xmm4,%xmm6      ## xmm6=rinvsix 
-        movaps %xmm6,%xmm4
-        mulss  %xmm4,%xmm4      ## xmm4=rinvtwelve 
-        mulss  nb410nf_c6(%rsp),%xmm6
-        mulss  nb410nf_c12(%rsp),%xmm4
-        movss nb410nf_Vvdwtot(%rsp),%xmm7
-        addps  %xmm4,%xmm7
-        subps  %xmm6,%xmm7
-        movss %xmm7,nb410nf_Vvdwtot(%rsp)
-
-_nb_kernel410nf_x86_64_sse.nb410nf_updateouterdata: 
-        ## get n from stack
-        movl nb410nf_n(%rsp),%esi
-        ## get group index for i particle 
-        movq  nb410nf_gid(%rbp),%rdx            ## base of gid[]
-        movl  (%rdx,%rsi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total potential energy and update it 
-        movaps nb410nf_vctot(%rsp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        ## add earlier value from mem 
-        movq  nb410nf_Vc(%rbp),%rax
-        addss (%rax,%rdx,4),%xmm7
-        ## move back to mem 
-        movss %xmm7,(%rax,%rdx,4)
-
-        ## accumulate total lj energy and update it 
-        movaps nb410nf_Vvdwtot(%rsp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        ## add earlier value from mem 
-        movq  nb410nf_Vvdw(%rbp),%rax
-        addss (%rax,%rdx,4),%xmm7
-        ## move back to mem 
-        movss %xmm7,(%rax,%rdx,4)
-
-        ## finish if last 
-        movl nb410nf_nn1(%rsp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx
-        jz _nb_kernel410nf_x86_64_sse.nb410nf_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb410nf_n(%rsp)
-        jmp _nb_kernel410nf_x86_64_sse.nb410nf_outer
-_nb_kernel410nf_x86_64_sse.nb410nf_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb410nf_nri(%rsp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx
-        jz _nb_kernel410nf_x86_64_sse.nb410nf_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel410nf_x86_64_sse.nb410nf_threadloop
-_nb_kernel410nf_x86_64_sse.nb410nf_end: 
-
-        movl nb410nf_nouter(%rsp),%eax
-        movl nb410nf_ninner(%rsp),%ebx
-        movq nb410nf_outeriter(%rbp),%rcx
-        movq nb410nf_inneriter(%rbp),%rdx
-        movl %eax,(%rcx)
-        movl %ebx,(%rdx)
-
-        addq $360,%rsp
-        emms
-
-
-        pop %r15
-        pop %r14
-        pop %r13
-        pop %r12
-
-        pop %rbx
-        pop    %rbp
-        ret
-
-
-
-
diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.c b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.c
index 8b82656348..a6b97565f0 100644
--- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.c
+++ b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.c
@@ -22,25 +22,14 @@
 #include <xmmintrin.h>
 #include <emmintrin.h>
 
+#include <gmx_sse2_single.h>
+
 /* get gmx_gbdata_t */
 #include "../nb_kerneltype.h"
 
 #include "nb_kernel430_x86_64_sse.h"
 
-/* to extract single integers from a __m128i datatype */
-#define _mm_extract_epi32(x, imm) \
-_mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
 
-static inline __m128
-my_invrsq_ps(__m128 x)
-{
-	const __m128 three = (const __m128) {3.0f, 3.0f, 3.0f, 3.0f};
-	const __m128 half  = (const __m128) {0.5f, 0.5f, 0.5f, 0.5f};
-	
-	__m128 t1 = _mm_rsqrt_ps(x);
-	
-	return (__m128) _mm_mul_ps(half,_mm_mul_ps(t1,_mm_sub_ps(three,_mm_mul_ps(x,_mm_mul_ps(t1,t1)))));
-}
 
 void nb_kernel430_x86_64_sse(int *           p_nri,
 						   int *           iinr,
@@ -101,15 +90,15 @@ void nb_kernel430_x86_64_sse(int *           p_nri,
 	__m128   fac_sse,tabscale_sse,gbtabscale_sse;
 	
 	__m128i  n0, nnn;
-	const __m128 neg    = {-1.0f,-1.0f,-1.0f,-1.0f};
-	const __m128 zero   = {0.0f,0.0f,0.0f,0.0f};
-	const __m128 half   = {0.5f,0.5f,0.5f,0.5f};
-	const __m128 two    = {2.0f,2.0f,2.0f,2.0f};
-	const __m128 three  = {3.0f,3.0f,3.0f,3.0f};
-	const __m128 six    = {6.0f,6.0f,6.0f,6.0f};
-	const __m128 twelwe = {12.0f,12.0f,12.0f,12.0f};
+	const __m128 neg    = _mm_set1_ps(-1.0f);
+	const __m128 zero   = _mm_set1_ps(0.0f);
+    const __m128 half   = _mm_set1_ps(0.5f);
+	const __m128 two    = _mm_set1_ps(2.0f);
+	const __m128 three  = _mm_set1_ps(3.0f);
+	const __m128 six    = _mm_set1_ps(6.0f);
+    const __m128 twelwe = _mm_set1_ps(12.0f);
 	
-	__m128i four        = _mm_set_epi32(4,4,4,4); 
+	__m128i four        = _mm_set1_epi32(4);
 	__m128i maski       = _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff);     
 	__m128i mask        = _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff);   
 	
@@ -224,7 +213,7 @@ void nb_kernel430_x86_64_sse(int *           p_nri,
 			
 			rsq     = _mm_add_ps(t1,t2);
 			rsq     = _mm_add_ps(rsq,t3);
-			rinv    = my_invrsq_ps(rsq);
+			rinv    = gmx_mm_invsqrt_ps(rsq);
 			
 			xmm1    = _mm_load_ss(invsqrta+jnr); 
 			xmm2    = _mm_load_ss(invsqrta+jnr2);
@@ -292,10 +281,10 @@ void nb_kernel430_x86_64_sse(int *           p_nri,
 			nnn     = _mm_slli_epi32(n0,2);
 		
 			/* the tables are 16-byte aligned, so we can use _mm_load_ps */			
-			xmm1    = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
-			xmm2    = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
-			xmm3    = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
-			xmm4    = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
+			xmm1    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
+			xmm2    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
+			xmm3    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
+			xmm4    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
 			
 			/* transpose 4*4 */
 			xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
@@ -355,10 +344,10 @@ void nb_kernel430_x86_64_sse(int *           p_nri,
 			nnn     = _mm_slli_epi32(n0,3);
 
 			/* Tabulated VdW interaction - disperion */			
-			xmm1    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
-			xmm2    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
-			xmm3    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
-			xmm4    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
+			xmm1    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
+			xmm2    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
+			xmm3    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
+			xmm4    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
 			
 			/* transpose 4*4 */
 			xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
@@ -387,10 +376,10 @@ void nb_kernel430_x86_64_sse(int *           p_nri,
 			/* Tabulated VdW interaction - repulsion */
 			nnn     = _mm_add_epi32(nnn,four);
 			
-			xmm1    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
-			xmm2    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
-			xmm3    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
-			xmm4    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
+			xmm1    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
+			xmm2    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
+			xmm3    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
+			xmm4    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
 			
 			/* transpose 4*4 */
 			xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
@@ -620,15 +609,15 @@ void nb_kernel430_x86_64_sse(int *           p_nri,
 				mask  = _mm_set_epi32(0,0xffffffff,0xffffffff,0xffffffff);
 			}
 			
-			jx      = _mm_and_ps( (__m128) mask, xmm6);
-			jy      = _mm_and_ps( (__m128) mask, xmm4);
-			jz      = _mm_and_ps( (__m128) mask, xmm5);
+			jx      = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm6);
+			jy      = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm4);
+			jz      = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm5);
 			
-			c6      = _mm_and_ps( (__m128) mask, c6);
-			c12     = _mm_and_ps( (__m128) mask, c12);
-			dvdaj   = _mm_and_ps( (__m128) mask, dvdaj);
-			isaj    = _mm_and_ps( (__m128) mask, isaj);			
-			q       = _mm_and_ps( (__m128) mask, q);
+			c6      = _mm_and_ps( gmx_mm_castsi128_ps(mask), c6);
+			c12     = _mm_and_ps( gmx_mm_castsi128_ps(mask), c12);
+			dvdaj   = _mm_and_ps( gmx_mm_castsi128_ps(mask), dvdaj);
+			isaj    = _mm_and_ps( gmx_mm_castsi128_ps(mask), isaj);			
+			q       = _mm_and_ps( gmx_mm_castsi128_ps(mask), q);
 			
 			dx1     = _mm_sub_ps(ix,jx);
 			dy1     = _mm_sub_ps(iy,jy);
@@ -641,7 +630,7 @@ void nb_kernel430_x86_64_sse(int *           p_nri,
 			rsq     = _mm_add_ps(t1,t2);
 			rsq     = _mm_add_ps(rsq,t3);
 			
-			rinv    = my_invrsq_ps(rsq);
+			rinv    = gmx_mm_invsqrt_ps(rsq);
 			
 			isaprod = _mm_mul_ps(isai,isaj);
 			qq      = _mm_mul_ps(iq,q);
@@ -664,10 +653,10 @@ void nb_kernel430_x86_64_sse(int *           p_nri,
 			nnn     = _mm_slli_epi32(n0,2);
 			
 			/* the tables are 16-byte aligned, so we can use _mm_load_ps */			
-			xmm1    = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
-			xmm2    = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
-			xmm3    = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
-			xmm4    = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
+			xmm1    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
+			xmm2    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
+			xmm3    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
+			xmm4    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
 			
 			/* transpose 4*4 */
 			xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
@@ -705,8 +694,8 @@ void nb_kernel430_x86_64_sse(int *           p_nri,
 			xmm1    = _mm_mul_ps(xmm1,isaj);
 			dvdaj   = _mm_add_ps(dvdaj,xmm1);
 			
-			vcoul   = _mm_and_ps( (__m128) mask, vcoul);
-			vgb     = _mm_and_ps( (__m128) mask, vgb);
+			vcoul   = _mm_and_ps( gmx_mm_castsi128_ps(mask), vcoul);
+			vgb     = _mm_and_ps( gmx_mm_castsi128_ps(mask), vgb);
 			
 			vctot   = _mm_add_ps(vctot,vcoul);
 			vgbtot  = _mm_add_ps(vgbtot,vgb);
@@ -720,10 +709,10 @@ void nb_kernel430_x86_64_sse(int *           p_nri,
 			nnn     = _mm_slli_epi32(n0,3);
 			
 			/* Tabulated VdW interaction - disperion */	
-			xmm1    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
-			xmm2    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
-			xmm3    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
-			xmm4    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
+			xmm1    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
+			xmm2    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
+			xmm3    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
+			xmm4    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
 		
 			/* transpose 4*4 */
 			xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
@@ -752,10 +741,10 @@ void nb_kernel430_x86_64_sse(int *           p_nri,
 			/* Tabulated VdW interaction - repulsion */
 			nnn     = _mm_add_epi32(nnn,four);
 					
-			xmm1    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
-			xmm2    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
-			xmm3    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
-			xmm4    = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
+			xmm1    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
+			xmm2    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
+			xmm3    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
+			xmm4    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
 			
 			/* transpose 4*4 */
 			xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
@@ -892,9 +881,9 @@ void nb_kernel430_x86_64_sse(int *           p_nri,
 				_mm_store_ss(faction+j33+2,xmm7); 
 			}
 			
-			t1 = _mm_and_ps( (__m128) mask, t1);
-			t2 = _mm_and_ps( (__m128) mask, t2);
-			t3 = _mm_and_ps( (__m128) mask, t3);
+			t1 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t1);
+			t2 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t2);
+			t3 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t3);
 			
 			fix = _mm_add_ps(fix,t1);
 			fiy = _mm_add_ps(fiy,t2);
@@ -919,7 +908,7 @@ void nb_kernel430_x86_64_sse(int *           p_nri,
 		
 		xmm2    = _mm_unpacklo_ps(fix,fiy); /* fx, fy, - - */
 		xmm2    = _mm_movelh_ps(xmm2,fiz); 
-		xmm2    = _mm_and_ps( (__m128) maski, xmm2);
+		xmm2    = _mm_and_ps( gmx_mm_castsi128_ps(maski), xmm2);
 		
 		/* load i force from memory */
 		xmm4    = _mm_loadl_pi(xmm4, (__m64 *) (faction+ii3));
diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.intel_syntax.s
deleted file mode 100644
index e3ee63bb60..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.intel_syntax.s
+++ /dev/null
@@ -1,2330 +0,0 @@
-;#
-;#
-;# Gromacs 4.0                         Copyright (c) 1991-2003 
-;# David van der Spoel, Erik Lindahl
-;#
-;# This program is free software; you can redistribute it and/or
-;# modify it under the terms of the GNU General Public License
-;# as published by the Free Software Foundation; either version 2
-;# of the License, or (at your option) any later version.
-;#
-;# To help us fund GROMACS development, we humbly ask that you cite
-;# the research papers on the package. Check out http://www.gromacs.org
-;# 
-;# And Hey:
-;# Gnomes, ROck Monsters And Chili Sauce
-;#
-
-;# These files require GNU binutils 2.10 or later, since we
-;# use intel syntax for portability, or a recent version 
-;# of NASM that understands Extended 3DNow and SSE2 instructions.
-;# (NASM is normally only used with MS Visual C++).
-;# Since NASM and gnu as disagree on some definitions and use 
-;# completely different preprocessing options I have to introduce a
-;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
-;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
-;# reason why all comments need both symbols...
-;# The source is written for GNU as, with intel syntax. When you use
-;# NASM we redefine a couple of things. The false if-statement around 
-;# the following code is seen by GNU as, but NASM doesn't see it, so 
-;# the code inside is read by NASM but not gcc.
-
-; .if 0    # block below only read by NASM
-%define .section	section
-%define .long		dd
-%define .align		align
-%define .globl		global
-;# NASM only wants 'dword', not 'dword ptr'.
-%define ptr
-%macro .equiv                  2
-   %1 equ %2
-%endmacro
-; .endif                   # End of NASM-specific block
-; .intel_syntax noprefix   # Line only read by gnu as
-
-
-
-
-
-.globl nb_kernel430_x86_64_sse
-.globl _nb_kernel430_x86_64_sse
-nb_kernel430_x86_64_sse:	
-_nb_kernel430_x86_64_sse:	
-;#	Room for return address and rbp (16 bytes)
-.equiv          nb430_fshift,           16
-.equiv          nb430_gid,              24
-.equiv          nb430_pos,              32
-.equiv          nb430_faction,          40
-.equiv          nb430_charge,           48
-.equiv          nb430_p_facel,          56
-.equiv          nb430_argkrf,           64
-.equiv          nb430_argcrf,           72
-.equiv          nb430_Vc,               80
-.equiv          nb430_type,             88
-.equiv          nb430_p_ntype,          96
-.equiv          nb430_vdwparam,         104
-.equiv          nb430_Vvdw,             112
-.equiv          nb430_p_tabscale,       120
-.equiv          nb430_VFtab,            128
-.equiv          nb430_invsqrta,         136
-.equiv          nb430_dvda,             144
-.equiv          nb430_p_gbtabscale,     152
-.equiv          nb430_GBtab,            160
-.equiv          nb430_p_nthreads,       168
-.equiv          nb430_count,            176
-.equiv          nb430_mtx,              184
-.equiv          nb430_outeriter,        192
-.equiv          nb430_inneriter,        200
-.equiv          nb430_work,             208
-	;# stack offsets for local variables  
-	;# bottom of stack is cache-aligned for sse use 
-.equiv          nb430_ix,               0
-.equiv          nb430_iy,               16
-.equiv          nb430_iz,               32
-.equiv          nb430_iq,               48
-.equiv          nb430_dx,               64
-.equiv          nb430_dy,               80
-.equiv          nb430_dz,               96
-.equiv          nb430_eps,              112
-.equiv          nb430_gbtsc,            128
-.equiv          nb430_tsc,              144
-.equiv          nb430_qq,               160
-.equiv          nb430_c6,               176
-.equiv          nb430_c12,              192
-.equiv          nb430_epsgb,            208
-.equiv          nb430_vctot,            224
-.equiv          nb430_Vvdwtot,          240
-.equiv          nb430_fix,              256
-.equiv          nb430_fiy,              272
-.equiv          nb430_fiz,              288
-.equiv          nb430_half,             304
-.equiv          nb430_three,            320
-.equiv          nb430_r,                336
-.equiv          nb430_isai,             352
-.equiv          nb430_isaprod,          368
-.equiv          nb430_dvdasum,          384
-.equiv          nb430_gbscale,          400
-.equiv          nb430_rinv,             416
-.equiv          nb430_nri,              432
-.equiv          nb430_iinr,             440
-.equiv          nb430_jindex,           448
-.equiv          nb430_jjnr,             456
-.equiv          nb430_shift,            464
-.equiv          nb430_shiftvec,         472
-.equiv          nb430_facel,            480
-.equiv          nb430_innerjjnr,        488
-.equiv          nb430_ii,               496
-.equiv          nb430_is3,              500
-.equiv          nb430_ii3,              504
-.equiv          nb430_ntia,             508
-.equiv          nb430_innerk,           512
-.equiv          nb430_n,                516
-.equiv          nb430_nn1,              520
-.equiv          nb430_ntype,            524
-.equiv          nb430_nouter,           528
-.equiv          nb430_ninner,           532
-
-	push rbp
-	mov  rbp, rsp
-	push rbx
-
-	
-	emms
-
-        push r12
-        push r13
-        push r14
-        push r15
-
-	sub rsp, 552		;# local variable stack space (n*16+8)
-
-	;# zero 32-bit iteration counters
-	mov eax, 0
-	mov [rsp + nb430_nouter], eax
-	mov [rsp + nb430_ninner], eax
-
-
-
-	mov edi, [rdi]
-	mov [rsp + nb430_nri], edi
-	mov [rsp + nb430_iinr], rsi
-	mov [rsp + nb430_jindex], rdx
-	mov [rsp + nb430_jjnr], rcx
-	mov [rsp + nb430_shift], r8
-	mov [rsp + nb430_shiftvec], r9
-	mov rdi, [rbp + nb430_p_ntype]
-	mov edi, [rdi]
-	mov [rsp + nb430_ntype], edi
-	mov rsi, [rbp + nb430_p_facel]
-	movss xmm0, [rsi]
-	movss [rsp + nb430_facel], xmm0
-
-	mov rax, [rbp + nb430_p_tabscale]
-	movss xmm3, [rax]
-	shufps xmm3, xmm3, 0
-	movaps [rsp + nb430_tsc], xmm3
-
-	mov rbx, [rbp + nb430_p_gbtabscale]
-	movss xmm4, [rbx]
-	shufps xmm4, xmm4, 0
-	movaps [rsp + nb430_gbtsc], xmm4
-
-
-	;# create constant floating-point factors on stack
-	mov eax, 0x3f000000     ;# half in IEEE (hex)
-	mov [rsp + nb430_half], eax
-	movss xmm1, [rsp + nb430_half]
-	shufps xmm1, xmm1, 0    ;# splat to all elements
-	movaps xmm2, xmm1       
-	addps  xmm2, xmm2	;# one
-	movaps xmm3, xmm2
-	addps  xmm2, xmm2	;# two
-	addps  xmm3, xmm2	;# three
-	movaps [rsp + nb430_half],  xmm1
-	movaps [rsp + nb430_three],  xmm3
-
-.nb430_threadloop:
-        mov   rsi, [rbp + nb430_count]          ;# pointer to sync counter
-        mov   eax, [rsi]
-.nb430_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+10
-        lock
-        cmpxchg [esi], ebx                      ;# write nn1 to *counter,
-                                                ;# if it hasnt changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb430_spinlock
-
-        ;# if(nn1>nri) nn1=nri
-        mov ecx, [rsp + nb430_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [rsp + nb430_n], eax
-        mov [rsp + nb430_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists
-	mov esi, eax				;# copy n to esi
-        jg  .nb430_outerstart
-        jmp .nb430_end
-
-.nb430_outerstart:
-	;# ebx contains number of outer iterations
-	add ebx, [rsp + nb430_nouter]
-	mov [rsp + nb430_nouter], ebx
-
-.nb430_outer:
-	mov   rax, [rsp + nb430_shift]      ;# rax = pointer into shift[] 
-	mov   ebx, [rax + rsi*4]		;# ebx=shift[n] 
-	
-	lea   rbx, [rbx + rbx*2]    ;# rbx=3*is 
-	mov   [rsp + nb430_is3],ebx    	;# store is3 
-
-	mov   rax, [rsp + nb430_shiftvec]   ;# rax = base of shiftvec[] 
-
-	movss xmm0, [rax + rbx*4]
-	movss xmm1, [rax + rbx*4 + 4]
-	movss xmm2, [rax + rbx*4 + 8] 
-
-	mov   rcx, [rsp + nb430_iinr]       ;# rcx = pointer into iinr[]
-	mov   ebx, [rcx + rsi*4]	    ;# ebx =ii 
-	mov   [rsp + nb430_ii], ebx
-
-	mov   rdx, [rbp + nb430_charge]
-	movss xmm3, [rdx + rbx*4]	
-	mulss xmm3, [rsp + nb430_facel]
-	shufps xmm3, xmm3, 0
-
-	mov   rdx, [rbp + nb430_invsqrta]	;# load invsqrta[ii]
-	movss xmm4, [rdx + rbx*4]
-	shufps xmm4, xmm4, 0
-
-    	mov   rdx, [rbp + nb430_type] 
-    	mov   edx, [rdx + rbx*4]
-    	imul  edx, [rsp + nb430_ntype]
-    	shl   edx, 1
-    	mov   [rsp + nb430_ntia], edx
-	
-	lea   rbx, [rbx + rbx*2]	;# rbx = 3*ii=ii3 
-	mov   rax, [rbp + nb430_pos]    ;# rax = base of pos[]  
-
-	addss xmm0, [rax + rbx*4]
-	addss xmm1, [rax + rbx*4 + 4]
-	addss xmm2, [rax + rbx*4 + 8]
-
-	movaps [rsp + nb430_iq], xmm3
-	movaps [rsp + nb430_isai], xmm4
-	
-	shufps xmm0, xmm0, 0
-	shufps xmm1, xmm1, 0
-	shufps xmm2, xmm2, 0
-
-	movaps [rsp + nb430_ix], xmm0
-	movaps [rsp + nb430_iy], xmm1
-	movaps [rsp + nb430_iz], xmm2
-
-	mov   [rsp + nb430_ii3], ebx
-	
-	;# clear vctot and i forces 
-	xorps xmm4, xmm4
-	movaps [rsp + nb430_vctot], xmm4
-	movaps [rsp + nb430_Vvdwtot], xmm4
-	movaps [rsp + nb430_dvdasum], xmm4
-	movaps [rsp + nb430_fix], xmm4
-	movaps [rsp + nb430_fiy], xmm4
-	movaps [rsp + nb430_fiz], xmm4
-	
-	mov   rax, [rsp + nb430_jindex]
-	mov   ecx, [rax + rsi*4]	     ;# jindex[n] 
-	mov   edx, [rax + rsi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   rsi, [rbp + nb430_pos]
-	mov   rdi, [rbp + nb430_faction]	
-	mov   rax, [rsp + nb430_jjnr]
-	shl   ecx, 2
-	add   rax, rcx
-	mov   [rsp + nb430_innerjjnr], rax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  4
-	add   ecx, [rsp + nb430_ninner]
-	mov   [rsp + nb430_ninner], ecx
-	add   edx, 0
-	mov   [rsp + nb430_innerk], edx    ;# number of innerloop atoms
-	
-	jge   .nb430_unroll_loop
-	jmp   .nb430_finish_inner
-.nb430_unroll_loop:	
-	;# quad-unroll innerloop here 
-	mov   rdx, [rsp + nb430_innerjjnr]     ;# pointer to jjnr[k] 
-	mov   eax, [rdx]	
-	mov   ebx, [rdx + 4]              
-	mov   ecx, [rdx + 8]            
-	mov   edx, [rdx + 12]         ;# eax-edx=jnr1-4 
-
-	add qword ptr [rsp + nb430_innerjjnr],  16 ;# advance pointer (unrolled 4) 
-
-	;# load isaj
-	mov rsi, [rbp + nb430_invsqrta]
-	movss xmm3, [rsi + rax*4]
-	movss xmm4, [rsi + rcx*4]
-	movss xmm6, [rsi + rbx*4]
-	movss xmm7, [rsi + rdx*4]
-	movaps xmm2, [rsp + nb430_isai]
-	shufps xmm3, xmm6, 0 
-	shufps xmm4, xmm7, 0 
-	shufps xmm3, xmm4, 136  ;# 10001000 ;# all isaj in xmm3 
-	mulps  xmm2, xmm3
-	
-	movaps [rsp + nb430_isaprod], xmm2	
-	movaps xmm1, xmm2
-	mulps xmm1, [rsp + nb430_gbtsc]
-	movaps [rsp + nb430_gbscale], xmm1
-	
-	mov rsi, [rbp + nb430_charge]    ;# base of charge[] 
-	
-	movss xmm3, [rsi + rax*4]
-	movss xmm4, [rsi + rcx*4]
-	movss xmm6, [rsi + rbx*4]
-	movss xmm7, [rsi + rdx*4]
-
-	mulps xmm2, [rsp + nb430_iq]
-	shufps xmm3, xmm6, 0 
-	shufps xmm4, xmm7, 0 
-	shufps xmm3, xmm4, 136  ;# 10001000 ;# all charges in xmm3  
-	mulps  xmm3, xmm2
-	movaps [rsp + nb430_qq], xmm3	
-	
-    ;# vdw parameters
-	mov rsi, [rbp + nb430_type]
-	mov r12d, [rsi + rax*4]
-	mov r13d, [rsi + rbx*4]
-	mov r14d, [rsi + rcx*4]
-	mov r15d, [rsi + rdx*4]
-	shl r12d, 1	
-	shl r13d, 1	
-	shl r14d, 1	
-	shl r15d, 1	
-    mov edi, [rsp + nb430_ntia]
-	add r12d, edi
-	add r13d, edi
-	add r14d, edi
-	add r15d, edi
-
-	mov rsi, [rbp + nb430_vdwparam]
-	movlps xmm3, [rsi + r12*4]
-	movlps xmm7, [rsi + r14*4]
-	movhps xmm3, [rsi + r13*4]
-	movhps xmm7, [rsi + r15*4]
-
-	movaps xmm0, xmm3
-	shufps xmm0, xmm7, 136  ;# 10001000
-	shufps xmm3, xmm7, 221  ;# 11011101
-
-    movaps [rsp + nb430_c6], xmm0
-    movaps [rsp + nb430_c12], xmm3
-    
-	mov rsi, [rbp + nb430_pos]       ;# base of pos[] 
-		
-	lea   r8, [rax + rax*2]     ;# jnr
-	lea   r9, [rbx + rbx*2]	
-	lea   r10, [rcx + rcx*2]    
-	lea   r11, [rdx + rdx*2]	
-
-	;# move four coordinates to xmm0-xmm2 	
-	movlps xmm4, [rsi + r8*4]
-	movlps xmm5, [rsi + r10*4]
-	movss xmm2, [rsi + r8*4 + 8]
-	movss xmm6, [rsi + r10*4 + 8]
-
-	movhps xmm4, [rsi + r9*4]
-	movhps xmm5, [rsi + r11*4]
-
-	movss xmm0, [rsi + r9*4 + 8]
-	movss xmm1, [rsi + r11*4 + 8]
-
-	shufps xmm2, xmm0, 0
-	shufps xmm6, xmm1, 0
-	
-	movaps xmm0, xmm4
-	movaps xmm1, xmm4
-
-	shufps xmm2, xmm6, 136  ;# 10001000
-	
-	shufps xmm0, xmm5, 136  ;# 10001000
-	shufps xmm1, xmm5, 221  ;# 11011101		
-
-	;# calc dr 
-	subps xmm0, [rsp + nb430_ix]
-	subps xmm1, [rsp + nb430_iy]
-	subps xmm2, [rsp + nb430_iz]
-
-	;# store dr 
-	movaps [rsp + nb430_dx], xmm0
-	movaps [rsp + nb430_dy], xmm1
-	movaps [rsp + nb430_dz], xmm2
-
-    movd mm0, r8  ;# store j3
-    movd mm1, r9
-    movd mm2, r10
-    movd mm3, r11
-
-	;# square it 
-	mulps xmm0,xmm0
-	mulps xmm1,xmm1
-	mulps xmm2,xmm2
-	addps xmm0, xmm1
-	addps xmm0, xmm2
-    movaps xmm4, xmm0
-	;# rsq in xmm4 
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [rsp + nb430_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [rsp + nb430_half]
-	subps xmm1, xmm5	;# 30-rsq*lu*lu 
-	mulps xmm1, xmm2	
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r
-	movaps [rsp + nb430_r], xmm4
-    movaps [rsp + nb430_rinv], xmm0
-    
-    movaps xmm8, xmm4    ;# r
-	mulps xmm4, [rsp + nb430_gbscale] ;# rgbtab
-    mulps xmm8, [rsp + nb430_tsc]    ;# rtab
-    
-    ;# truncate and convert to integers
-    cvttps2dq xmm5, xmm4  ;# gb
-    cvttps2dq xmm9, xmm8  ;# lj
-    
-    ;# convert back to float
-    cvtdq2ps  xmm6, xmm5   ;# gb
-    cvtdq2ps  xmm10, xmm9  ;# lj
-    
-    ;# multiply by 4 and 8, respectively
-    pslld   xmm5, 2   ;# gb
-    pslld   xmm9, 3   ;# lj
-
-    ;# move to integer registers
-    movhlps xmm7, xmm5     ;# gb
-    movhlps xmm11, xmm9    ;# lj
-    movd    r8d, xmm5       ;# gb
-    movd    r12d, xmm9      ;# lj
-    movd    r10d, xmm7      ;# gb
-    movd    r14d, xmm11     ;# lj
-    pshufd  xmm5, xmm5, 1  ;# gb
-    pshufd  xmm9, xmm9, 1  ;# lj
-    pshufd  xmm7, xmm7, 1  ;# gb
-    pshufd  xmm11, xmm11, 1 ;# lj
-    movd    r9d, xmm5       ;# gb
-    movd    r13d, xmm9      ;# lj
-    movd    r11d, xmm7      ;# gb
-    movd    r15d, xmm11     ;# lj
-    ;# GB indices: r8-r11   LJ indices: r12-r15
-    
-    ;# calculate eps
-    subps     xmm4, xmm6   ;# gb
-    subps     xmm8, xmm10  ;# lj
-    movaps    [rsp + nb430_epsgb], xmm4 ;# gb eps
-    movaps    [rsp + nb430_eps], xmm8 ;# lj eps
-    
-	mov  rsi, [rbp + nb430_GBtab]
-	mov  rdi, [rbp + nb430_VFtab]
-
-    ;# load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11
-   	movlps xmm1, [rsi + r8*4]         ;# Y1c F1c 
-   	movlps xmm5, [rdi + r12*4]        ;# Y1d F1d 
-   	movlps xmm9, [rdi + r12*4 + 16]   ;# Y1r F1r 
-
-	movlps xmm3, [rsi + r10*4]        ;# Y3c F3c 
-	movlps xmm7, [rdi + r14*4]        ;# Y3d F3d 
-	movlps xmm11, [rdi + r14*4 + 16]  ;# Y3r F3r 
-
-	movhps xmm1, [rsi + r9*4]         ;# Y1c F1c Y2c F2c
-	movhps xmm5, [rdi + r13*4]        ;# Y1d F1d Y2d F2d
-	movhps xmm9, [rdi + r13*4 + 16]   ;# Y1r F1r Y2r F2r
-
-	movhps xmm3, [rsi + r11*4]        ;# Y3c F3c Y4c F4c
-	movhps xmm7, [rdi + r15*4]        ;# Y3d F3d Y4d F4d
-	movhps xmm11, [rdi + r15*4 + 16]  ;# Y3r F3r Y4r F4r
-
-    movaps xmm0, xmm1
-    movaps xmm4, xmm5
-    movaps xmm8, xmm9
-	shufps xmm0, xmm3, 136  ;# 10001000   => Y1c Y2c Y3c Y4c
-	shufps xmm4, xmm7, 136  ;# 10001000   => Y1d Y2d Y3d Y4d
-	shufps xmm8, xmm11, 136  ;# 10001000  => Y1r Y2r Y3r Y4r
-	shufps xmm1, xmm3, 221  ;# 11011101   => F1c F2c F3c F4c
-	shufps xmm5, xmm7, 221  ;# 11011101   => F1d F2d F3d F4d
-	shufps xmm9, xmm11, 221  ;# 11011101  => F1r F2r F3r F4r
-    
-   	movlps xmm3, [rsi + r8*4 + 8]      ;# G1c H1c 
-   	movlps xmm7, [rdi + r12*4 + 8]     ;# G1d H1d 
-   	movlps xmm11, [rdi + r12*4 + 24]   ;# G1r H1r 
-
-	movlps xmm12, [rsi + r10*4 + 8]    ;# G3c H3c 
-	movlps xmm13, [rdi + r14*4 + 8]    ;# G3d H3d 
-	movlps xmm14, [rdi + r14*4 + 24]   ;# G3r H3r 
-
-	movhps xmm3, [rsi + r9*4 + 8]      ;# G1c H1c G2c H2c
-	movhps xmm7, [rdi + r13*4 + 8]     ;# G1d H1d G2d H2d
-	movhps xmm11, [rdi + r13*4 + 24]   ;# G1r H1r G2r H2r
-
-	movhps xmm12, [rsi + r11*4 + 8]    ;# G3c H3c G4c H4c
-	movhps xmm13, [rdi + r15*4 + 8]    ;# G3d H3d G4d H4d
-	movhps xmm14, [rdi + r15*4 + 24]   ;# G3r H3r G4r H4r
-    movaps xmm2, xmm3
-    movaps xmm6, xmm7
-    movaps xmm10, xmm11
-    
-	shufps xmm2, xmm12, 136  ;# 10001000  => G1c G2c G3c G4c
-	shufps xmm6, xmm13, 136  ;# 10001000  => G1d G2d G3d G4d
-	shufps xmm10, xmm14, 136  ;# 10001000 => G1r G2r G3r G4r
-	shufps xmm3, xmm12, 221  ;# 11011101  => H1c H2c H3c H4c
-	shufps xmm7, xmm13, 221  ;# 11011101  => H1d H2d H3d H4d
-	shufps xmm11, xmm14, 221  ;# 11011101 => H1r H2r H3r H4r
-    ;# table data ready. Coul in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11
-
-    movaps xmm12, [rsp + nb430_epsgb]
-    movaps xmm13, [rsp + nb430_eps]
-    
-    mulps  xmm3, xmm12   ;# Heps
-    mulps  xmm7, xmm13
-    mulps  xmm11, xmm13
-    mulps  xmm2, xmm12     ;# Geps
-    mulps  xmm6, xmm13
-    mulps  xmm10, xmm13
-    mulps  xmm3, xmm12   ;# Heps2
-    mulps  xmm7, xmm13
-    mulps  xmm11, xmm13
-
-    addps  xmm1, xmm2   ;# F+Geps
-    addps  xmm5, xmm6
-    addps  xmm9, xmm10 
-    addps  xmm1, xmm3   ;# F+Geps+Heps2 = Fp
-    addps  xmm5, xmm7
-    addps  xmm9, xmm11 
-    addps  xmm3, xmm3    ;# 2*Heps2
-    addps  xmm7, xmm7
-    addps  xmm11, xmm11
-    addps  xmm3, xmm2    ;# 2*Heps2+Geps
-    addps  xmm7, xmm6  
-    addps  xmm11, xmm10
-    addps  xmm3, xmm1   ;# FF = Fp + 2*Heps2 + Geps
-    addps  xmm7, xmm5
-    addps  xmm11, xmm9
-    mulps  xmm1, xmm12   ;# eps*Fp
-    mulps  xmm5, xmm13
-    mulps  xmm9, xmm13
-    addps  xmm1, xmm0     ;# VV
-    addps  xmm5, xmm4
-    addps  xmm9, xmm8
-    mulps  xmm1, [rsp + nb430_qq]   ;# VV*qq = vcoul
-    mulps  xmm5, [rsp + nb430_c6]   ;# vnb6
-    mulps  xmm9, [rsp + nb430_c12]   ;# vnb12
-    mulps  xmm3, [rsp + nb430_qq]    ;# FF*qq = fij
-    mulps  xmm7, [rsp + nb430_c6]   ;# fijD
-    mulps  xmm11, [rsp + nb430_c12]   ;#fijR
-
-    addps  xmm11, xmm7 ;# fijD+fijR
-    mulps  xmm11, [rsp + nb430_tsc] ;# (fijD+fijR)*tabscale
-    
-    ;# accumulate Vvdwtot
-    addps  xmm5, [rsp + nb430_Vvdwtot]
-    addps  xmm5, xmm9
-    movaps [rsp + nb430_Vvdwtot], xmm5
-
-	mov rsi, [rbp + nb430_dvda]
-	
-	;# Calculate dVda
-	mulps xmm3, [rsp + nb430_gbscale]   ;# fijC=qq*FF*gbscale
-	movaps xmm6, xmm3 
-	mulps  xmm6, [rsp + nb430_r]
-	addps  xmm6, xmm1   ;# vcoul+fijC*r
-
-    addps  xmm3, xmm11  ;# fijC+fijD+fijR
-    
-    ;# increment vctot
-	addps  xmm1, [rsp + nb430_vctot]
-    movaps [rsp + nb430_vctot], xmm1
-
-	;# xmm6=(vcoul+fijC*r)
-	xorps  xmm7, xmm7
-	subps  xmm7, xmm6
-	movaps xmm6, xmm7
-	
-	;# update dvdasum 
-	addps  xmm7, [rsp + nb430_dvdasum]
-    movaps [rsp + nb430_dvdasum], xmm7
-
-	;# update j atoms dvdaj
-	movhlps xmm7, xmm6
-	movaps  xmm5, xmm6
-	movaps  xmm4, xmm7
-	shufps  xmm5, xmm5, 0x1
-	shufps  xmm4, xmm4, 0x1
-
-	;# xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4
-	addss  xmm6, [rsi + rax*4]
-	addss  xmm5, [rsi + rbx*4]
-	addss  xmm7, [rsi + rcx*4]
-	addss  xmm4, [rsi + rdx*4]
-	movss  [rsi + rax*4], xmm6
-	movss  [rsi + rbx*4], xmm5
-	movss  [rsi + rcx*4], xmm7
-	movss  [rsi + rdx*4], xmm4
-
-	xorps  xmm4, xmm4	
-	mulps xmm3, [rsp + nb430_rinv]
-	subps  xmm4, xmm3
-
-    movd r8, mm0   ;# fetch j3
-    movd r9, mm1
-    movd r10, mm2
-    movd r11, mm3
-
-    movaps  xmm9, xmm4
-    movaps  xmm10, xmm4
-    movaps  xmm11, xmm4
-    
-    mulps  xmm9, [rsp + nb430_dx]
-    mulps  xmm10, [rsp + nb430_dy]
-    mulps  xmm11, [rsp + nb430_dz]
-    
-	;# accumulate i forces
-    movaps xmm12, [rsp + nb430_fix]
-    movaps xmm13, [rsp + nb430_fiy]
-    movaps xmm14, [rsp + nb430_fiz]
-    addps xmm12, xmm9
-    addps xmm13, xmm10
-    addps xmm14, xmm11
-    movaps [rsp + nb430_fix], xmm12
-    movaps [rsp + nb430_fiy], xmm13
-    movaps [rsp + nb430_fiz], xmm14
-
-	mov rsi, [rbp + nb430_faction]
-	;# the fj's - start by accumulating x & y forces from memory 
-	movlps xmm0, [rsi + r8*4] ;# x1 y1 - -
-	movlps xmm1, [rsi + r10*4] ;# x3 y3 - -
-	movhps xmm0, [rsi + r9*4] ;# x1 y1 x2 y2
-	movhps xmm1, [rsi + r11*4] ;# x3 y3 x4 y4
-
-    movaps xmm8, xmm9
-    unpcklps xmm9, xmm10 ;# x1 y1 x2 y2
-    unpckhps xmm8, xmm10 ;# x3 y3 x4 y4
-    
-    ;# update fjx and fjy
-	addps  xmm0, xmm9
-	addps  xmm1, xmm8
-	
-	movlps [rsi + r8*4], xmm0
-	movlps [rsi + r10*4], xmm1
-	movhps [rsi + r9*4], xmm0
-	movhps [rsi + r11*4], xmm1
-    
-    ;# xmm11: fjz1 fjz2 fjz3 fjz4
-    pshufd  xmm10, xmm11, 1  ;# fjz2 - - -
-    movhlps xmm9,  xmm11     ;# fjz3 - - -
-    pshufd  xmm8,  xmm11, 3  ;# fjz4 - - -
-    
-	addss  xmm11, [rsi + r8*4 + 8]
-	addss  xmm10, [rsi + r9*4 + 8]
-	addss  xmm9,  [rsi + r10*4 + 8]
-	addss  xmm8,  [rsi + r11*4 + 8]    
-	movss  [rsi + r8*4 + 8], xmm11
-	movss  [rsi + r9*4 + 8], xmm10
-	movss  [rsi + r10*4 + 8], xmm9
-	movss  [rsi + r11*4 + 8], xmm8
-	
-	;# should we do one more iteration? 
-	sub dword ptr [rsp + nb430_innerk],  4
-	jl    .nb430_finish_inner
-	jmp   .nb430_unroll_loop
-.nb430_finish_inner:
-	;# check if at least two particles remain 
-	add dword ptr [rsp + nb430_innerk],  4
-	mov   edx, [rsp + nb430_innerk]
-	and   edx, 2
-	jnz   .nb430_dopair
-	jmp   .nb430_checksingle
-.nb430_dopair:	
-	mov   rcx, [rsp + nb430_innerjjnr]
-	
-	mov   eax, [rcx]	
-	mov   ebx, [rcx + 4]              
-	add qword ptr [rsp + nb430_innerjjnr],  8
-
-	;# load isaj
-	mov rsi, [rbp + nb430_invsqrta]
-	movss xmm3, [rsi + rax*4]
-	movss xmm6, [rsi + rbx*4]
-	movaps xmm2, [rsp + nb430_isai]
-    unpcklps xmm3, xmm6
-	mulps  xmm2, xmm3
-    movaps [rsp + nb430_isaprod], xmm2	
-    
-	movaps xmm1, xmm2
-	mulps xmm1, [rsp + nb430_gbtsc]
-	movaps [rsp + nb430_gbscale], xmm1
-	
-	mov rsi, [rbp + nb430_charge]    ;# base of charge[] 
-	
-	movss xmm3, [rsi + rax*4]
-	movss xmm6, [rsi + rbx*4]
-    unpcklps xmm3, xmm6
-	mulps xmm2, [rsp + nb430_iq]
-	mulps  xmm3, xmm2
-	movaps [rsp + nb430_qq], xmm3	
-	
-    ;# vdw parameters
-	mov rsi, [rbp + nb430_type]
-	mov r12d, [rsi + rax*4]
-	mov r13d, [rsi + rbx*4]
-	shl r12d, 1	
-	shl r13d, 1	
-    mov edi, [rsp + nb430_ntia]
-	add r12d, edi
-	add r13d, edi
-
-	mov rsi, [rbp + nb430_vdwparam]
-	movlps xmm3, [rsi + r12*4]
-	movhps xmm3, [rsi + r13*4]
-
-    xorps xmm7, xmm7
-	movaps xmm0, xmm3
-	shufps xmm0, xmm7, 136  ;# 10001000
-	shufps xmm3, xmm7, 221  ;# 11011101
-
-    movaps [rsp + nb430_c6], xmm0
-    movaps [rsp + nb430_c12], xmm3
-    
-	mov rsi, [rbp + nb430_pos]       ;# base of pos[] 
-		
-	lea   r8, [rax + rax*2]     ;# j3
-	lea   r9, [rbx + rbx*2]	
-
-	;# move four coordinates to xmm0-xmm2 	
-	movlps xmm0, [rsi + r8*4]	;# x1 y1 - - 
-	movlps xmm1, [rsi + r9*4]	;# x2 y2 - - 
-
-	movss xmm2, [rsi + r8*4 + 8]	;# z1 - - - 
-	movss xmm7, [rsi + r9*4 + 8]	;# z2 - - - 
-
-    unpcklps xmm0, xmm1 ;# x1 x2 y1 y2
-    movhlps  xmm1, xmm0 ;# y1 y2 -  -
-    unpcklps xmm2, xmm7 ;# z1 z2 -  -
-    
-	;# calc dr 
-	subps xmm0, [rsp + nb430_ix]
-	subps xmm1, [rsp + nb430_iy]
-	subps xmm2, [rsp + nb430_iz]
-
-	;# store dr 
-	movaps [rsp + nb430_dx], xmm0
-	movaps [rsp + nb430_dy], xmm1
-	movaps [rsp + nb430_dz], xmm2
-
-	;# square it 
-	mulps xmm0,xmm0
-	mulps xmm1,xmm1
-	mulps xmm2,xmm2
-	addps xmm0, xmm1
-	addps xmm0, xmm2
-    movaps xmm4, xmm0
-	;# rsq in xmm4 
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [rsp + nb430_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [rsp + nb430_half]
-	subps xmm1, xmm5	;# 30-rsq*lu*lu 
-	mulps xmm1, xmm2	
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r
-	movaps [rsp + nb430_r], xmm4
-    movaps [rsp + nb430_rinv], xmm0
-    
-    movaps xmm8, xmm4    ;# r
-	mulps xmm4, [rsp + nb430_gbscale] ;# rgbtab
-    mulps xmm8, [rsp + nb430_tsc]    ;# rtab
-    
-    ;# truncate and convert to integers
-    cvttps2dq xmm5, xmm4  ;# gb
-    cvttps2dq xmm9, xmm8  ;# lj
-    
-    ;# convert back to float
-    cvtdq2ps  xmm6, xmm5   ;# gb
-    cvtdq2ps  xmm10, xmm9  ;# lj
-    
-    ;# multiply by 4 and 8, respectively
-    pslld   xmm5, 2   ;# gb
-    pslld   xmm9, 3   ;# lj
-
-    ;# move to integer registers
-    movd    r12d, xmm5       ;# gb
-    movd    r14d, xmm9      ;# lj
-    pshufd  xmm5, xmm5, 1   ;# gb
-    pshufd  xmm9, xmm9, 1   ;# lj
-    movd    r13d, xmm5       ;# gb
-    movd    r15d, xmm9      ;# lj
-    ;# GB indices: r12-r13   LJ indices: r14-r15
-    
-    ;# calculate eps
-    subps     xmm4, xmm6   ;# gb
-    subps     xmm8, xmm10  ;# lj
-    movaps    [rsp + nb430_epsgb], xmm4 ;# gb eps
-    movaps    [rsp + nb430_eps], xmm8 ;# lj eps
-    
-	mov  rsi, [rbp + nb430_GBtab]
-	mov  rdi, [rbp + nb430_VFtab]
-
-    ;# load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11
-   	movlps xmm0, [rsi + r12*4]       ;# Y1c F1c
-	movlps xmm1, [rsi + r13*4]       ;# Y2c F2c
-   	movlps xmm4, [rdi + r14*4]       ;# Y1d F1d  
-	movlps xmm5, [rdi + r15*4]       ;# Y2d F2d
-   	movlps xmm8, [rdi + r14*4 + 16]  ;# Y1r F1r
-	movlps xmm9, [rdi + r15*4 + 16]  ;# Y2r F2r
-
-    unpcklps xmm0, xmm1
-    movhlps  xmm1, xmm0
-    unpcklps xmm4, xmm5
-    movhlps  xmm5, xmm4
-    unpcklps xmm8, xmm9
-    movhlps  xmm9, xmm8
-   	movlps xmm2, [rsi + r12*4 + 8]    ;# G1c H1c
-	movlps xmm3, [rsi + r13*4 + 8]    ;# G2c H2c
-   	movlps xmm6, [rdi + r14*4 + 8]    ;# G1d H1d  
-	movlps xmm7, [rdi + r15*4 + 8]    ;# G2d H2d
-   	movlps xmm10, [rdi + r14*4 + 24]  ;# G1r H1r
-	movlps xmm11, [rdi + r15*4 + 24]  ;# G2r H2r
-    unpcklps xmm2, xmm3
-    movhlps  xmm3, xmm2
-    unpcklps xmm6, xmm7
-    movhlps  xmm7, xmm6
-    unpcklps xmm10, xmm11
-    movhlps  xmm11, xmm10
-    ;# table data ready. Coul in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11
-
-    movaps xmm12, [rsp + nb430_epsgb]
-    movaps xmm13, [rsp + nb430_eps]
-        
-    mulps  xmm3, xmm12   ;# Heps
-    mulps  xmm7, xmm13
-    mulps  xmm11, xmm13
-    mulps  xmm2, xmm12     ;# Geps
-    mulps  xmm6, xmm13
-    mulps  xmm10, xmm13
-    mulps  xmm3, xmm12   ;# Heps2
-    mulps  xmm7, xmm13
-    mulps  xmm11, xmm13
-
-    addps  xmm1, xmm2   ;# F+Geps
-    addps  xmm5, xmm6
-    addps  xmm9, xmm10 
-    addps  xmm1, xmm3   ;# F+Geps+Heps2 = Fp
-    addps  xmm5, xmm7
-    addps  xmm9, xmm11 
-    addps  xmm3, xmm3    ;# 2*Heps2
-    addps  xmm7, xmm7
-    addps  xmm11, xmm11
-    addps  xmm3, xmm2    ;# 2*Heps2+Geps
-    addps  xmm7, xmm6  
-    addps  xmm11, xmm10
-    addps  xmm3, xmm1   ;# FF = Fp + 2*Heps2 + Geps
-    addps  xmm7, xmm5
-    addps  xmm11, xmm9
-    mulps  xmm1, xmm12   ;# eps*Fp
-    mulps  xmm5, xmm13
-    mulps  xmm9, xmm13
-    addps  xmm1, xmm0     ;# VV
-    addps  xmm5, xmm4
-    addps  xmm9, xmm8
-    mulps  xmm1, [rsp + nb430_qq]   ;# VV*qq = vcoul
-    mulps  xmm5, [rsp + nb430_c6]   ;# vnb6
-    mulps  xmm9, [rsp + nb430_c12]   ;# vnb12
-    mulps  xmm3, [rsp + nb430_qq]    ;# FF*qq = fij
-    mulps  xmm7, [rsp + nb430_c6]   ;# fijD
-    mulps  xmm11, [rsp + nb430_c12]   ;#fijR
-
-    addps  xmm11, xmm7 ;# fijD+fijR
-    mulps  xmm11, [rsp + nb430_tsc] ;# (fijD+fijR)*tabscale
-    
-    ;# accumulate Vvdwtot
-    addps  xmm5, [rsp + nb430_Vvdwtot]
-    addps  xmm5, xmm9
-    movlps [rsp + nb430_Vvdwtot], xmm5
-
-	mov rsi, [rbp + nb430_dvda]
-	
-	;# Calculate dVda
-	mulps xmm3, [rsp + nb430_gbscale]   ;# fijC=qq*FF*gbscale
-	movaps xmm6, xmm3 
-	mulps  xmm6, [rsp + nb430_r]
-	addps  xmm6, xmm1   ;# vcoul+fijC*r
-
-    addps  xmm3, xmm11  ;# fijC+fijD+fijR
-    
-    ;# increment vctot
-	addps  xmm1, [rsp + nb430_vctot]
-    movlps [rsp + nb430_vctot], xmm1
-
-	;# xmm6=(vcoul+fijC*r)
-	xorps  xmm7, xmm7
-	subps  xmm7, xmm6
-	movaps xmm6, xmm7
-	
-	;# update dvdasum 
-	addps  xmm7, [rsp + nb430_dvdasum]
-    movlps [rsp + nb430_dvdasum], xmm7
-
-	;# update j atoms dvdaj
-	movaps  xmm5, xmm6
-	shufps  xmm5, xmm5, 0x1
-
-	;# xmm6=dvdaj1 xmm5=dvdaj2 
-	addss  xmm6, [rsi + rax*4]
-	addss  xmm5, [rsi + rbx*4]
-	movss  [rsi + rax*4], xmm6
-	movss  [rsi + rbx*4], xmm5
-
-	xorps  xmm4, xmm4	
-	mulps xmm3, [rsp + nb430_rinv]
-	subps  xmm4, xmm3
-
-    movaps  xmm9, xmm4
-    movaps  xmm10, xmm4
-    movaps  xmm11, xmm4
-    
-    mulps  xmm9, [rsp + nb430_dx]
-    mulps  xmm10, [rsp + nb430_dy]
-    mulps  xmm11, [rsp + nb430_dz]
-    
-    
-	;# accumulate i forces
-    movaps xmm12, [rsp + nb430_fix]
-    movaps xmm13, [rsp + nb430_fiy]
-    movaps xmm14, [rsp + nb430_fiz]
-    addps xmm12, xmm9
-    addps xmm13, xmm10
-    addps xmm14, xmm11
-    movlps [rsp + nb430_fix], xmm12
-    movlps [rsp + nb430_fiy], xmm13
-    movlps [rsp + nb430_fiz], xmm14
-
-	mov rsi, [rbp + nb430_faction]
-	;# the fj's - start by accumulating x & y forces from memory 
-	movlps xmm0, [rsi + r8*4] ;# x1 y1 - -
-	movhps xmm0, [rsi + r9*4] ;# x1 y1 x2 y2
-
-    unpcklps xmm9, xmm10  ;# x1 y1 x2 y2
-    addps    xmm0, xmm9
-
-	movlps [rsi + r8*4], xmm0
-	movhps [rsi + r9*4], xmm0
-    
-    ;# z forces
-    pshufd xmm8, xmm11, 1
-    addss  xmm11, [rsi + r8*4 + 8] 
-    addss  xmm8,  [rsi + r9*4 + 8]
-    movss  [rsi + r8*4 + 8], xmm11
-    movss  [rsi + r9*4 + 8], xmm8
-
-.nb430_checksingle:				
-	mov   edx, [rsp + nb430_innerk]
-	and   edx, 1
-	jnz    .nb430_dosingle
-	jmp    .nb430_updateouterdata
-.nb430_dosingle:
-	mov rsi, [rbp + nb430_charge]
-	mov rdx, [rbp + nb430_invsqrta]
-	mov rdi, [rbp + nb430_pos]
-	mov   rcx, [rsp + nb430_innerjjnr]
-	mov   eax, [rcx]	
-
-	;# load isaj
-	mov rsi, [rbp + nb430_invsqrta]
-	movss xmm3, [rsi + rax*4]
-	movaps xmm2, [rsp + nb430_isai]
-	mulss  xmm2, xmm3
-    movaps [rsp + nb430_isaprod], xmm2	
-    
-	movaps xmm1, xmm2
-	mulss xmm1, [rsp + nb430_gbtsc]
-	movaps [rsp + nb430_gbscale], xmm1
-	
-	mov rsi, [rbp + nb430_charge]    ;# base of charge[] 
-	
-	movss xmm3, [rsi + rax*4]
-	mulss xmm2, [rsp + nb430_iq]
-	mulss  xmm3, xmm2
-	movaps [rsp + nb430_qq], xmm3	
-	
-    ;# vdw parameters
-	mov rsi, [rbp + nb430_type]
-	mov r12d, [rsi + rax*4]
-	shl r12d, 1	
-    mov edi, [rsp + nb430_ntia]
-	add r12d, edi
-
-	mov rsi, [rbp + nb430_vdwparam]
-	movss xmm0, [rsi + r12*4]
-	movss xmm3, [rsi + r12*4 + 4]
-    movaps [rsp + nb430_c6], xmm0
-    movaps [rsp + nb430_c12], xmm3
-    
-	mov rsi, [rbp + nb430_pos]       ;# base of pos[] 
-		
-	lea   r8, [rax + rax*2]     ;# j3
-
-	;# move four coordinates to xmm0-xmm2 	
-    movss  xmm0, [rsi + r8*4]
-    movss  xmm1, [rsi + r8*4 + 4]
-    movss  xmm2, [rsi + r8*4 + 8]
-    
-	;# calc dr 
-	subss xmm0, [rsp + nb430_ix]
-	subss xmm1, [rsp + nb430_iy]
-	subss xmm2, [rsp + nb430_iz]
-
-	;# store dr 
-	movaps [rsp + nb430_dx], xmm0
-	movaps [rsp + nb430_dy], xmm1
-	movaps [rsp + nb430_dz], xmm2
-
-	;# square it 
-	mulss xmm0,xmm0
-	mulss xmm1,xmm1
-	mulss xmm2,xmm2
-	addss xmm0, xmm1
-	addss xmm0, xmm2
-    movaps xmm4, xmm0
-	;# rsq in xmm4 
-
-	rsqrtss xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulss xmm5, xmm5
-	movaps xmm1, [rsp + nb430_three]
-	mulss xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [rsp + nb430_half]
-	subss xmm1, xmm5	;# 30-rsq*lu*lu 
-	mulss xmm1, xmm2	
-	mulss xmm0, xmm1	;# xmm0=rinv 
-	mulss xmm4, xmm0	;# xmm4=r
-	movaps [rsp + nb430_r], xmm4
-    movaps [rsp + nb430_rinv], xmm0
-    
-    movaps xmm8, xmm4    ;# r
-	mulss xmm4, [rsp + nb430_gbscale] ;# rgbtab
-    mulss xmm8, [rsp + nb430_tsc]    ;# rtab
-    
-    ;# truncate and convert to integers
-    cvttss2si r12d, xmm4  ;# gb
-    cvttss2si r14d, xmm8  ;# lj
-    
-    ;# convert back to float
-    cvtsi2ss  xmm6, r12d   ;# gb
-    cvtsi2ss  xmm10, r14d  ;# lj
-    
-    ;# multiply by 4 and 8, respectively
-    shl   r12d, 2   ;# gb
-    shl   r14d, 3   ;# lj
-
-    ;# GB index: r12   LJ indices: r14
-    
-    ;# calculate eps
-    subss     xmm4, xmm6   ;# gb
-    subss     xmm8, xmm10  ;# lj
-    movaps    [rsp + nb430_epsgb], xmm4 ;# gb eps
-    movaps    [rsp + nb430_eps], xmm8 ;# lj eps
-    
-	mov  rsi, [rbp + nb430_GBtab]
-	mov  rdi, [rbp + nb430_VFtab]
-
-    ;# load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11
-    movss  xmm0,  [rsi + r12*4]
-    movss  xmm1,  [rsi + r12*4 + 4]
-    movss  xmm2,  [rsi + r12*4 + 8]
-    movss  xmm3,  [rsi + r12*4 + 12]
-    movss  xmm4,  [rdi + r14*4]
-    movss  xmm5,  [rdi + r14*4 + 4]
-    movss  xmm6,  [rdi + r14*4 + 8]
-    movss  xmm7,  [rdi + r14*4 + 12]
-    movss  xmm8,  [rdi + r14*4 + 16]
-    movss  xmm9,  [rdi + r14*4 + 20]
-    movss  xmm10, [rdi + r14*4 + 24]
-    movss  xmm11, [rdi + r14*4 + 28]
-    ;# table data ready. Coul in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11
-
-    movaps xmm12, [rsp + nb430_epsgb]
-    movaps xmm13, [rsp + nb430_eps]
-    
-    mulss  xmm3, xmm12   ;# Heps
-    mulss  xmm7, xmm13
-    mulss  xmm11, xmm13
-    mulss  xmm2, xmm12     ;# Geps
-    mulss  xmm6, xmm13
-    mulss  xmm10, xmm13
-    mulss  xmm3, xmm12   ;# Heps2
-    mulss  xmm7, xmm13
-    mulss  xmm11, xmm13
-
-    addss  xmm1, xmm2   ;# F+Geps
-    addss  xmm5, xmm6
-    addss  xmm9, xmm10 
-    addss  xmm1, xmm3   ;# F+Geps+Heps2 = Fp
-    addss  xmm5, xmm7
-    addss  xmm9, xmm11 
-    addss  xmm3, xmm3    ;# 2*Heps2
-    addss  xmm7, xmm7
-    addss  xmm11, xmm11
-    addss  xmm3, xmm2    ;# 2*Heps2+Geps
-    addss  xmm7, xmm6  
-    addss  xmm11, xmm10
-    addss  xmm3, xmm1   ;# FF = Fp + 2*Heps2 + Geps
-    addss  xmm7, xmm5
-    addss  xmm11, xmm9
-    mulss  xmm1, xmm12   ;# eps*Fp
-    mulss  xmm5, xmm13
-    mulss  xmm9, xmm13
-    addss  xmm1, xmm0     ;# VV
-    addss  xmm5, xmm4
-    addss  xmm9, xmm8
-    mulss  xmm1, [rsp + nb430_qq]   ;# VV*qq = vcoul
-    mulss  xmm5, [rsp + nb430_c6]   ;# vnb6
-    mulss  xmm9, [rsp + nb430_c12]   ;# vnb12
-    mulss  xmm3, [rsp + nb430_qq]    ;# FF*qq = fij
-    mulss  xmm7, [rsp + nb430_c6]   ;# fijD
-    mulss  xmm11, [rsp + nb430_c12]   ;#fijR
-
-    addss  xmm11, xmm7 ;# fijD+fijR
-    mulss  xmm11, [rsp + nb430_tsc] ;# (fijD+fijR)*tabscale
-    
-    ;# accumulate Vvdwtot
-    addss  xmm5, [rsp + nb430_Vvdwtot]
-    addss  xmm5, xmm9
-    movss [rsp + nb430_Vvdwtot], xmm5
-
-	mov rsi, [rbp + nb430_dvda]
-	
-	;# Calculate dVda
-	mulss xmm3, [rsp + nb430_gbscale]   ;# fijC=qq*FF*gbscale
-	movaps xmm6, xmm3 
-	mulss  xmm6, [rsp + nb430_r]
-	addss  xmm6, xmm1   ;# vcoul+fijC*r
-
-    addss  xmm3, xmm11  ;# fijC+fijD+fijR
-    
-    ;# increment vctot
-	addss  xmm1, [rsp + nb430_vctot]
-    movss [rsp + nb430_vctot], xmm1
-
-	;# xmm6=(vcoul+fijC*r)
-	xorps  xmm7, xmm7
-	subss  xmm7, xmm6
-	movaps xmm6, xmm7
-	
-	;# update dvdasum 
-	addss  xmm7, [rsp + nb430_dvdasum]
-    movss [rsp + nb430_dvdasum], xmm7
-
-	;# update j atoms dvdaj
-
-	;# xmm6=dvdaj1
-	addss  xmm6, [rsi + rax*4]
-	movss  [rsi + rax*4], xmm6
-
-	xorps  xmm4, xmm4	
-	mulss xmm3, [rsp + nb430_rinv]
-	subss  xmm4, xmm3
-
-    movss  xmm9, xmm4
-    movss  xmm10, xmm4
-    movss  xmm11, xmm4
-    
-    mulss  xmm9, [rsp + nb430_dx]
-    mulss  xmm10, [rsp + nb430_dy]
-    mulss  xmm11, [rsp + nb430_dz]
-    
-	;# accumulate i forces
-    movaps xmm12, [rsp + nb430_fix]
-    movaps xmm13, [rsp + nb430_fiy]
-    movaps xmm14, [rsp + nb430_fiz]
-    addss xmm12, xmm9
-    addss xmm13, xmm10
-    addss xmm14, xmm11
-    movss [rsp + nb430_fix], xmm12
-    movss [rsp + nb430_fiy], xmm13
-    movss [rsp + nb430_fiz], xmm14
-
-	mov rsi, [rbp + nb430_faction]
-    ;# add to j forces
-    addss  xmm9,  [rsi + r8*4]
-    addss  xmm10, [rsi + r8*4 + 4]
-    addss  xmm11, [rsi + r8*4 + 8]
-    movss  [rsi + r8*4],     xmm9
-    movss  [rsi + r8*4 + 4], xmm10
-    movss  [rsi + r8*4 + 8], xmm11
-    
-.nb430_updateouterdata:
-	mov   ecx, [rsp + nb430_ii3]
-	mov   rdi, [rbp + nb430_faction]
-	mov   rsi, [rbp + nb430_fshift]
-	mov   edx, [rsp + nb430_is3]
-
-	;# accumulate i forces in xmm0, xmm1, xmm2 
-	movaps xmm0, [rsp + nb430_fix]
-	movaps xmm1, [rsp + nb430_fiy]
-	movaps xmm2, [rsp + nb430_fiz]
-
-	movhlps xmm3, xmm0
-	movhlps xmm4, xmm1
-	movhlps xmm5, xmm2
-	addps  xmm0, xmm3
-	addps  xmm1, xmm4
-	addps  xmm2, xmm5 ;# sum is in 1/2 in xmm0-xmm2 
-
-	movaps xmm3, xmm0	
-	movaps xmm4, xmm1	
-	movaps xmm5, xmm2	
-
-	shufps xmm3, xmm3, 1
-	shufps xmm4, xmm4, 1
-	shufps xmm5, xmm5, 1
-	addss  xmm0, xmm3
-	addss  xmm1, xmm4
-	addss  xmm2, xmm5	;# xmm0-xmm2 has single force in pos0 
-
-	;# increment i force 
-	movss  xmm3, [rdi + rcx*4]
-	movss  xmm4, [rdi + rcx*4 + 4]
-	movss  xmm5, [rdi + rcx*4 + 8]
-	subss  xmm3, xmm0
-	subss  xmm4, xmm1
-	subss  xmm5, xmm2
-	movss  [rdi + rcx*4],     xmm3
-	movss  [rdi + rcx*4 + 4], xmm4
-	movss  [rdi + rcx*4 + 8], xmm5
-
-	;# increment fshift force  
-	movss  xmm3, [rsi + rdx*4]
-	movss  xmm4, [rsi + rdx*4 + 4]
-	movss  xmm5, [rsi + rdx*4 + 8]
-	subss  xmm3, xmm0
-	subss  xmm4, xmm1
-	subss  xmm5, xmm2
-	movss  [rsi + rdx*4],     xmm3
-	movss  [rsi + rdx*4 + 4], xmm4
-	movss  [rsi + rdx*4 + 8], xmm5
-
-	;# get n from stack
-	mov esi, [rsp + nb430_n]
-        ;# get group index for i particle 
-        mov   rdx, [rbp + nb430_gid]      	;# base of gid[]
-        mov   edx, [rdx + rsi*4]		;# ggid=gid[n]
-
-	;# accumulate total potential energy and update it 
-	movaps xmm7, [rsp + nb430_vctot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-
-	;# add earlier value from mem 
-	mov   rax, [rbp + nb430_Vc]
-	addss xmm7, [rax + rdx*4] 
-	;# move back to mem 
-	movss [rax + rdx*4], xmm7 
-	
-	;# accumulate total lj energy and update it 
-	movaps xmm7, [rsp + nb430_Vvdwtot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-
-	;# add earlier value from mem 
-	mov   rax, [rbp + nb430_Vvdw]
-	addss xmm7, [rax + rdx*4] 
-	;# move back to mem 
-	movss [rax + rdx*4], xmm7 
-	
-	;# accumulate dVda and update it 
-	movaps xmm7, [rsp + nb430_dvdasum]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-	
-	mov edx, [rsp + nb430_ii]
-	mov rax, [rbp + nb430_dvda]
-	addss xmm7, [rax + rdx*4]
-	movss [rax + rdx*4], xmm7
-	
-        ;# finish if last 
-        mov ecx, [rsp + nb430_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb430_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [rsp + nb430_n], esi
-        jmp .nb430_outer
-.nb430_outerend:
-        ;# check if more outer neighborlists remain
-        mov   ecx, [rsp + nb430_nri]
-	;# esi already loaded with n above
-        sub   ecx, esi
-        jz .nb430_end
-        ;# non-zero, do one more workunit
-        jmp   .nb430_threadloop
-.nb430_end:
-	mov eax, [rsp + nb430_nouter]
-	mov ebx, [rsp + nb430_ninner]
-	mov rcx, [rbp + nb430_outeriter]
-	mov rdx, [rbp + nb430_inneriter]
-	mov [rcx], eax
-	mov [rdx], ebx
-
-	add rsp, 552
-	emms
-
-
-        pop r15
-        pop r14
-        pop r13
-        pop r12
-
-	pop rbx
-	pop	rbp
-	ret
-
-
-
-
-
-.globl nb_kernel430nf_x86_64_sse
-.globl _nb_kernel430nf_x86_64_sse
-nb_kernel430nf_x86_64_sse:	
-_nb_kernel430nf_x86_64_sse:	
-;#	Room for return address and rbp (16 bytes)
-.equiv          nb430nf_fshift,         16
-.equiv          nb430nf_gid,            24
-.equiv          nb430nf_pos,            32
-.equiv          nb430nf_faction,        40
-.equiv          nb430nf_charge,         48
-.equiv          nb430nf_p_facel,        56
-.equiv          nb430nf_argkrf,         64
-.equiv          nb430nf_argcrf,         72
-.equiv          nb430nf_Vc,             80
-.equiv          nb430nf_type,           88
-.equiv          nb430nf_p_ntype,        96
-.equiv          nb430nf_vdwparam,       104
-.equiv          nb430nf_Vvdw,           112
-.equiv          nb430nf_p_tabscale,     120
-.equiv          nb430nf_VFtab,          128
-.equiv          nb430nf_invsqrta,       136
-.equiv          nb430nf_dvda,           144
-.equiv          nb430nf_p_gbtabscale,   152
-.equiv          nb430nf_GBtab,          160
-.equiv          nb430nf_p_nthreads,     168
-.equiv          nb430nf_count,          176
-.equiv          nb430nf_mtx,            184
-.equiv          nb430nf_outeriter,      192
-.equiv          nb430nf_inneriter,      200
-.equiv          nb430nf_work,           208
-	;# stack offsets for local variables  
-	;# bottom of stack is cache-aligned for sse use 
-.equiv          nb430nf_ix,             0
-.equiv          nb430nf_iy,             16
-.equiv          nb430nf_iz,             32
-.equiv          nb430nf_iq,             48
-.equiv          nb430nf_gbtsc,          64
-.equiv          nb430nf_tsc,            80
-.equiv          nb430nf_qq,             96
-.equiv          nb430nf_c6,             112
-.equiv          nb430nf_c12,            128
-.equiv          nb430nf_vctot,          144
-.equiv          nb430nf_Vvdwtot,        160
-.equiv          nb430nf_half,           176
-.equiv          nb430nf_three,          192
-.equiv          nb430nf_isai,           208
-.equiv          nb430nf_isaprod,        224
-.equiv          nb430nf_gbscale,        240
-.equiv          nb430nf_r,              256
-.equiv          nb430nf_nri,            272
-.equiv          nb430nf_iinr,           280
-.equiv          nb430nf_jindex,         288
-.equiv          nb430nf_jjnr,           296
-.equiv          nb430nf_shift,          304
-.equiv          nb430nf_shiftvec,       312
-.equiv          nb430nf_facel,          320
-.equiv          nb430nf_innerjjnr,      328
-.equiv          nb430nf_is3,            336
-.equiv          nb430nf_ii3,            340
-.equiv          nb430nf_ntia,           344
-.equiv          nb430nf_innerk,         348
-.equiv          nb430nf_n,              352
-.equiv          nb430nf_nn1,            356
-.equiv          nb430nf_ntype,          360
-.equiv          nb430nf_nouter,         364
-.equiv          nb430nf_ninner,         368
-
-	push rbp
-	mov  rbp, rsp
-	push rbx
-
-	
-	emms
-
-        push r12
-        push r13
-        push r14
-        push r15
-
-	sub rsp, 392		;# local variable stack space (n*16+8)
-
-	;# zero 32-bit iteration counters
-	mov eax, 0
-	mov [rsp + nb430nf_nouter], eax
-	mov [rsp + nb430nf_ninner], eax
-	
-	mov edi, [rdi]
-	mov [rsp + nb430nf_nri], edi
-	mov [rsp + nb430nf_iinr], rsi
-	mov [rsp + nb430nf_jindex], rdx
-	mov [rsp + nb430nf_jjnr], rcx
-	mov [rsp + nb430nf_shift], r8
-	mov [rsp + nb430nf_shiftvec], r9
-	mov rdi, [rbp + nb430nf_p_ntype]
-	mov edi, [rdi]
-	mov [rsp + nb430nf_ntype], edi
-	mov rsi, [rbp + nb430nf_p_facel]
-	movss xmm0, [rsi]
-	movss [rsp + nb430nf_facel], xmm0
-
-	mov rax, [rbp + nb430nf_p_tabscale]
-	movss xmm3, [rax]
-	shufps xmm3, xmm3, 0
-	movaps [rsp + nb430nf_tsc], xmm3
-
-	mov rbx, [rbp + nb430nf_p_gbtabscale]
-	movss xmm4, [rbx]
-	shufps xmm4, xmm4, 0
-	movaps [rsp + nb430nf_gbtsc], xmm4
-
-	;# create constant floating-point factors on stack
-	mov eax, 0x3f000000     ;# half in IEEE (hex)
-	mov [rsp + nb430nf_half], eax
-	movss xmm1, [rsp + nb430nf_half]
-	shufps xmm1, xmm1, 0    ;# splat to all elements
-	movaps xmm2, xmm1       
-	addps  xmm2, xmm2	;# one
-	movaps xmm3, xmm2
-	addps  xmm2, xmm2	;# two
-	addps  xmm3, xmm2	;# three
-	movaps [rsp + nb430nf_half],  xmm1
-	movaps [rsp + nb430nf_three],  xmm3
-
-.nb430nf_threadloop:
-        mov   rsi, [rbp + nb430nf_count]          ;# pointer to sync counter
-        mov   eax, [rsi]
-.nb430nf_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+10
-        lock
-        cmpxchg [esi], ebx                      ;# write nn1 to *counter,
-                                                ;# if it hasnt changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb430nf_spinlock
-
-        ;# if(nn1>nri) nn1=nri
-        mov ecx, [rsp + nb430nf_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [rsp + nb430nf_n], eax
-        mov [rsp + nb430nf_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists
-	mov esi, eax				;# copy n to esi
-        jg  .nb430nf_outerstart
-        jmp .nb430nf_end
-
-.nb430nf_outerstart:
-	;# ebx contains number of outer iterations
-	add ebx, [rsp + nb430nf_nouter]
-	mov [rsp + nb430nf_nouter], ebx
-
-.nb430nf_outer:
-	mov   rax, [rsp + nb430nf_shift]      ;# rax = pointer into shift[] 
-	mov   ebx, [rax + rsi*4]		;# ebx=shift[n] 
-	
-	lea   rbx, [rbx + rbx*2]    ;# rbx=3*is 
-	mov   [rsp + nb430nf_is3],ebx    	;# store is3 
-
-	mov   rax, [rsp + nb430nf_shiftvec]   ;# rax = base of shiftvec[] 
-
-	movss xmm0, [rax + rbx*4]
-	movss xmm1, [rax + rbx*4 + 4]
-	movss xmm2, [rax + rbx*4 + 8] 
-
-	mov   rcx, [rsp + nb430nf_iinr]       ;# rcx = pointer into iinr[] 	
-	mov   ebx, [rcx + rsi*4]	    ;# ebx =ii 
-
-	mov   rdx, [rbp + nb430nf_charge]
-	movss xmm3, [rdx + rbx*4]	
-	mulss xmm3, [rsp + nb430nf_facel]
-	shufps xmm3, xmm3, 0
-
-	mov   rdx, [rbp + nb430nf_invsqrta]	;# load invsqrta[ii]
-	movss xmm4, [rdx + rbx*4]
-	shufps xmm4, xmm4, 0
-
-    	mov   rdx, [rbp + nb430nf_type] 
-    	mov   edx, [rdx + rbx*4]
-    	imul  edx, [rsp + nb430nf_ntype]
-    	shl   edx, 1
-    	mov   [rsp + nb430nf_ntia], edx
-	
-	lea   rbx, [rbx + rbx*2]	;# rbx = 3*ii=ii3 
-	mov   rax, [rbp + nb430nf_pos]    ;# rax = base of pos[]  
-
-	addss xmm0, [rax + rbx*4]
-	addss xmm1, [rax + rbx*4 + 4]
-	addss xmm2, [rax + rbx*4 + 8]
-
-	movaps [rsp + nb430nf_iq], xmm3
-	movaps [rsp + nb430nf_isai], xmm4
-	
-	shufps xmm0, xmm0, 0
-	shufps xmm1, xmm1, 0
-	shufps xmm2, xmm2, 0
-
-	movaps [rsp + nb430nf_ix], xmm0
-	movaps [rsp + nb430nf_iy], xmm1
-	movaps [rsp + nb430nf_iz], xmm2
-
-	mov   [rsp + nb430nf_ii3], ebx
-	
-	;# clear vctot 
-	xorps xmm4, xmm4
-	movaps [rsp + nb430nf_vctot], xmm4
-	movaps [rsp + nb430nf_Vvdwtot], xmm4
-	
-	mov   rax, [rsp + nb430nf_jindex]
-	mov   ecx, [rax + rsi*4]	     ;# jindex[n] 
-	mov   edx, [rax + rsi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   rsi, [rbp + nb430nf_pos]
-	mov   rdi, [rbp + nb430nf_faction]	
-	mov   rax, [rsp + nb430nf_jjnr]
-	shl   ecx, 2
-	add   rax, rcx
-	mov   [rsp + nb430nf_innerjjnr], rax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  4
-	add   ecx, [rsp + nb430nf_ninner]
-	mov   [rsp + nb430nf_ninner], ecx
-	add   edx, 0
-	mov   [rsp + nb430nf_innerk], edx    ;# number of innerloop atoms 
-	jge   .nb430nf_unroll_loop
-	jmp   .nb430nf_finish_inner
-.nb430nf_unroll_loop:	
-	;# quad-unroll innerloop here 
-	mov   rdx, [rsp + nb430nf_innerjjnr]     ;# pointer to jjnr[k] 
-	mov   eax, [rdx]	
-	mov   ebx, [rdx + 4]              
-	mov   ecx, [rdx + 8]            
-	mov   edx, [rdx + 12]         ;# eax-edx=jnr1-4 
-	add qword ptr [rsp + nb430nf_innerjjnr],  16 ;# advance pointer (unrolled 4) 
-
-	;# load isa2
-	mov rsi, [rbp + nb430nf_invsqrta]
-	movss xmm3, [rsi + rax*4]
-	movss xmm4, [rsi + rcx*4]
-	movss xmm6, [rsi + rbx*4]
-	movss xmm7, [rsi + rdx*4]
-	movaps xmm2, [rsp + nb430nf_isai]
-	shufps xmm3, xmm6, 0 
-	shufps xmm4, xmm7, 0 
-	shufps xmm3, xmm4, 136  ;# 10001000 ;# all charges in xmm3  
-	mulps  xmm2, xmm3
-	
-	movaps [rsp + nb430nf_isaprod], xmm2
-	movaps xmm1, xmm2
-	mulps xmm1, [rsp + nb430nf_gbtsc]
-	movaps [rsp + nb430nf_gbscale], xmm1
-	
-	mov rsi, [rbp + nb430nf_charge]    ;# base of charge[] 
-	
-	movss xmm3, [rsi + rax*4]
-	movss xmm4, [rsi + rcx*4]
-	movss xmm6, [rsi + rbx*4]
-	movss xmm7, [rsi + rdx*4]
-
-	mulps xmm2, [rsp + nb430nf_iq]
-	shufps xmm3, xmm6, 0 
-	shufps xmm4, xmm7, 0 
-	shufps xmm3, xmm4, 136  ;# 10001000 ;# all charges in xmm3  
-	mulps  xmm3, xmm2
-	movaps [rsp + nb430nf_qq], xmm3	
-
-	movd  mm0, eax		;# use mmx registers as temp storage 
-	movd  mm1, ebx
-	movd  mm2, ecx
-	movd  mm3, edx
-	
-	mov rsi, [rbp + nb430nf_type]
-	mov eax, [rsi + rax*4]
-	mov ebx, [rsi + rbx*4]
-	mov ecx, [rsi + rcx*4]
-	mov edx, [rsi + rdx*4]
-	mov rsi, [rbp + nb430nf_vdwparam]
-	shl eax, 1	
-	shl ebx, 1	
-	shl ecx, 1	
-	shl edx, 1	
-	mov edi, [rsp + nb430nf_ntia]
-	add eax, edi
-	add ebx, edi
-	add ecx, edi
-	add edx, edi
-
-	movlps xmm6, [rsi + rax*4]
-	movlps xmm7, [rsi + rcx*4]
-	movhps xmm6, [rsi + rbx*4]
-	movhps xmm7, [rsi + rdx*4]
-
-	movaps xmm4, xmm6
-	shufps xmm4, xmm7, 136  ;# 10001000
-	shufps xmm6, xmm7, 221  ;# 11011101
-	
-	movd  eax, mm0		
-	movd  ebx, mm1
-	movd  ecx, mm2
-	movd  edx, mm3
-
-	movaps [rsp + nb430nf_c6], xmm4
-	movaps [rsp + nb430nf_c12], xmm6
-	
-	mov rsi, [rbp + nb430nf_pos]       ;# base of pos[] 
-
-	lea   rax, [rax + rax*2]     ;# replace jnr with j3 
-	lea   rbx, [rbx + rbx*2]	
-
-	lea   rcx, [rcx + rcx*2]     ;# replace jnr with j3 
-	lea   rdx, [rdx + rdx*2]	
-
-	;# move four coordinates to xmm0-xmm2 	
-
-	movlps xmm4, [rsi + rax*4]
-	movlps xmm5, [rsi + rcx*4]
-	movss xmm2, [rsi + rax*4 + 8]
-	movss xmm6, [rsi + rcx*4 + 8]
-
-	movhps xmm4, [rsi + rbx*4]
-	movhps xmm5, [rsi + rdx*4]
-
-	movss xmm0, [rsi + rbx*4 + 8]
-	movss xmm1, [rsi + rdx*4 + 8]
-
-	shufps xmm2, xmm0, 0
-	shufps xmm6, xmm1, 0
-	
-	movaps xmm0, xmm4
-	movaps xmm1, xmm4
-
-	shufps xmm2, xmm6, 136  ;# 10001000
-	
-	shufps xmm0, xmm5, 136  ;# 10001000
-	shufps xmm1, xmm5, 221  ;# 11011101		
-
-	;# move ix-iz to xmm4-xmm6 
-	movaps xmm4, [rsp + nb430nf_ix]
-	movaps xmm5, [rsp + nb430nf_iy]
-	movaps xmm6, [rsp + nb430nf_iz]
-
-	;# calc dr 
-	subps xmm4, xmm0
-	subps xmm5, xmm1
-	subps xmm6, xmm2
-
-	;# square it 
-	mulps xmm4,xmm4
-	mulps xmm5,xmm5
-	mulps xmm6,xmm6
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [rsp + nb430nf_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [rsp + nb430nf_half]
-	subps xmm1, xmm5	;# 30-rsq*lu*lu 
-	mulps xmm1, xmm2	
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r
-	movaps [rsp + nb430nf_r], xmm4
-	mulps xmm4, [rsp + nb430nf_gbscale]
-
-	movhlps xmm5, xmm4
-	cvttps2pi mm6, xmm4
-	cvttps2pi mm7, xmm5	;# mm6/mm7 contain lu indices 
-	cvtpi2ps xmm6, mm6
-	cvtpi2ps xmm5, mm7
-	movlhps xmm6, xmm5
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-	pslld mm6, 2
-	pslld mm7, 2
-
-	movd mm0, eax	
-	movd mm1, ebx
-	movd mm2, ecx
-	movd mm3, edx
-
-	mov  rsi, [rbp + nb430nf_GBtab]
-	movd eax, mm6
-	psrlq mm6, 32
-	movd ecx, mm7
-	psrlq mm7, 32
-	movd ebx, mm6
-	movd edx, mm7
-	
-	;# load coulomb table
-	movaps xmm4, [rsi + rax*4]
-	movaps xmm5, [rsi + rbx*4]
-	movaps xmm6, [rsi + rcx*4]
-	movaps xmm7, [rsi + rdx*4]
-	;# transpose, using xmm3 for scratch
-	movaps xmm3, xmm6
-	shufps xmm3, xmm7, 0xEE 
-	shufps xmm6, xmm7, 0x44
-	movaps xmm7, xmm4
-	shufps xmm7, xmm5, 0xEE
-	shufps xmm4, xmm5, 0x44
-	movaps xmm5, xmm4
-	shufps xmm5, xmm6, 0xDD
-	shufps xmm4, xmm6, 0x88
-	movaps xmm6, xmm7
-	shufps xmm6, xmm3, 0x88
-	shufps xmm7, xmm3, 0xDD
-	;# coulomb table ready, in xmm4-xmm7  		
-	
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	movaps xmm3, [rsp + nb430nf_qq]
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, xmm3 ;# vcoul=qq*VV  
-	addps  xmm5, [rsp + nb430nf_vctot]
-	movaps [rsp + nb430nf_vctot], xmm5
-
-	
-	movaps xmm4, [rsp + nb430nf_r]
-	mulps xmm4, [rsp + nb430nf_tsc]
-	
-	movhlps xmm5, xmm4
-	cvttps2pi mm6, xmm4
-	cvttps2pi mm7, xmm5	;# mm6/mm7 contain lu indices 
-	cvtpi2ps xmm6, mm6
-	cvtpi2ps xmm5, mm7
-	movlhps xmm6, xmm5
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-	pslld mm6, 3
-	pslld mm7, 3
-	
-	mov  rsi, [rbp + nb430nf_VFtab]
-	movd eax, mm6
-	psrlq mm6, 32
-	movd ecx, mm7
-	psrlq mm7, 32
-	movd ebx, mm6
-	movd edx, mm7
-	
-	;# dispersion 
-	movaps xmm4, [rsi + rax*4]
-	movaps xmm5, [rsi + rbx*4]
-	movaps xmm6, [rsi + rcx*4]
-	movaps xmm7, [rsi + rdx*4]
-	;# transpose, using xmm3 for scratch
-	movaps xmm3, xmm6
-	shufps xmm3, xmm7, 0xEE 
-	shufps xmm6, xmm7, 0x44
-	movaps xmm7, xmm4
-	shufps xmm7, xmm5, 0xEE
-	shufps xmm4, xmm5, 0x44
-	movaps xmm5, xmm4
-	shufps xmm5, xmm6, 0xDD
-	shufps xmm4, xmm6, 0x88
-	movaps xmm6, xmm7
-	shufps xmm6, xmm3, 0x88
-	shufps xmm7, xmm3, 0xDD
-	;# dispersion table ready, in xmm4-xmm7 	
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, [rsp + nb430nf_c6]	 ;# Vvdw6
-	addps  xmm5, [rsp + nb430nf_Vvdwtot]
-	movaps [rsp + nb430nf_Vvdwtot], xmm5
-
-	;# repulsion 
-	movaps xmm4, [rsi + rax*4 + 16]
-	movaps xmm5, [rsi + rbx*4 + 16]
-	movaps xmm6, [rsi + rcx*4 + 16]
-	movaps xmm7, [rsi + rdx*4 + 16]
-	;# transpose, using xmm3 for scratch
-	movaps xmm3, xmm6
-	shufps xmm3, xmm7, 0xEE 
-	shufps xmm6, xmm7, 0x44
-	movaps xmm7, xmm4
-	shufps xmm7, xmm5, 0xEE
-	shufps xmm4, xmm5, 0x44
-	movaps xmm5, xmm4
-	shufps xmm5, xmm6, 0xDD
-	shufps xmm4, xmm6, 0x88
-	movaps xmm6, xmm7
-	shufps xmm6, xmm3, 0x88
-	shufps xmm7, xmm3, 0xDD
-	;# table ready, in xmm4-xmm7 	
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
- 	
-	mulps  xmm5, [rsp + nb430nf_c12] ;# Vvdw12
-	addps  xmm5, [rsp + nb430nf_Vvdwtot]
-	movaps [rsp + nb430nf_Vvdwtot], xmm5
-	
-	;# should we do one more iteration? 
-	sub dword ptr [rsp + nb430nf_innerk],  4
-	jl    .nb430nf_finish_inner
-	jmp   .nb430nf_unroll_loop
-.nb430nf_finish_inner:
-	;# check if at least two particles remain 
-	add dword ptr [rsp + nb430nf_innerk],  4
-	mov   edx, [rsp + nb430nf_innerk]
-	and   edx, 2
-	jnz   .nb430nf_dopair
-	jmp   .nb430nf_checksingle
-.nb430nf_dopair:	
-
-	mov   rcx, [rsp + nb430nf_innerjjnr]
-	
-	mov   eax, [rcx]	
-	mov   ebx, [rcx + 4]              
-	add qword ptr [rsp + nb430nf_innerjjnr],  8	
-
-	xorps xmm2, xmm2
-	movaps xmm6, xmm2
-	
-	;# load isa2
-	mov rsi, [rbp + nb430nf_invsqrta]
-	movss xmm2, [rsi + rax*4]
-	movss xmm3, [rsi + rbx*4]
-	unpcklps xmm2, xmm3	;# isa2 in xmm3(0,1)
-	mulps  xmm2, [rsp + nb430nf_isai]
-	movaps [rsp + nb430nf_isaprod], xmm2	
-	movaps xmm1, xmm2
-	mulps xmm1, [rsp + nb430nf_gbtsc]
-	movaps [rsp + nb430nf_gbscale], xmm1	
-	
-	mov rsi, [rbp + nb430nf_charge]    ;# base of charge[] 	
-	movss xmm3, [rsi + rax*4]		
-	movss xmm6, [rsi + rbx*4]
-	unpcklps xmm3, xmm6 ;# 00001000 ;# xmm3(0,1) has the charges 
-
-	mulps  xmm2, [rsp + nb430nf_iq]
-	mulps  xmm3, xmm2
-	movaps [rsp + nb430nf_qq], xmm3
-
-	mov rsi, [rbp + nb430nf_type]
-	mov   ecx, eax
-	mov   edx, ebx
-	mov ecx, [rsi + rcx*4]
-	mov edx, [rsi + rdx*4]	
-	mov rsi, [rbp + nb430nf_vdwparam]
-	shl ecx, 1	
-	shl edx, 1	
-	mov edi, [rsp + nb430nf_ntia]
-	add ecx, edi
-	add edx, edi
-	movlps xmm6, [rsi + rcx*4]
-	movhps xmm6, [rsi + rdx*4]
-	mov rdi, [rbp + nb430nf_pos]	
-	
-	movaps xmm4, xmm6
-	shufps xmm4, xmm4, 8 ;# 00001000 	
-	shufps xmm6, xmm6, 13 ;# 00001101
-	movlhps xmm4, xmm7
-	movlhps xmm6, xmm7
-	
-	movaps [rsp + nb430nf_c6], xmm4
-	movaps [rsp + nb430nf_c12], xmm6	
-	
-	lea   rax, [rax + rax*2]
-	lea   rbx, [rbx + rbx*2]
-	;# move coordinates to xmm0-xmm2 
-	movlps xmm1, [rdi + rax*4]
-	movss xmm2, [rdi + rax*4 + 8]	
-	movhps xmm1, [rdi + rbx*4]
-	movss xmm0, [rdi + rbx*4 + 8]	
-
-	movlhps xmm3, xmm7
-	
-	shufps xmm2, xmm0, 0
-	
-	movaps xmm0, xmm1
-
-	shufps xmm2, xmm2, 136  ;# 10001000
-	
-	shufps xmm0, xmm0, 136  ;# 10001000
-	shufps xmm1, xmm1, 221  ;# 11011101
-	
-	mov    rdi, [rbp + nb430nf_faction]
-	;# move ix-iz to xmm4-xmm6 
-	xorps   xmm7, xmm7
-	
-	movaps xmm4, [rsp + nb430nf_ix]
-	movaps xmm5, [rsp + nb430nf_iy]
-	movaps xmm6, [rsp + nb430nf_iz]
-
-	;# calc dr 
-	subps xmm4, xmm0
-	subps xmm5, xmm1
-	subps xmm6, xmm2
-
-	;# square it 
-	mulps xmm4,xmm4
-	mulps xmm5,xmm5
-	mulps xmm6,xmm6
-	addps xmm4, xmm5
-	addps xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtps xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulps xmm5, xmm5
-	movaps xmm1, [rsp + nb430nf_three]
-	mulps xmm5, xmm4	;# rsq*lu*lu 			
-	movaps xmm0, [rsp + nb430nf_half]
-	subps xmm1, xmm5	;# 30-rsq*lu*lu 
-	mulps xmm1, xmm2	
-	mulps xmm0, xmm1	;# xmm0=rinv 
-	mulps xmm4, xmm0	;# xmm4=r 
-	movaps [rsp + nb430nf_r], xmm4
-	mulps xmm4, [rsp + nb430nf_gbscale]
-
-	cvttps2pi mm6, xmm4     ;# mm6 contain lu indices 
-	cvtpi2ps xmm6, mm6
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-
-	pslld mm6, 2
-
-	mov  rsi, [rbp + nb430nf_GBtab]
-	movd ecx, mm6
-	psrlq mm6, 32
-	movd edx, mm6
-
-	;# load coulomb table
-	movaps xmm4, [rsi + rcx*4]
-	movaps xmm7, [rsi + rdx*4]
-	;# transpose, using xmm3 for scratch
-	movaps xmm6, xmm4
-	unpcklps xmm4, xmm7  	;# Y1 Y2 F1 F2 
-	unpckhps xmm6, xmm7     ;# G1 G2 H1 H2
-	movhlps  xmm5, xmm4    	;# F1 F2 
-	movhlps  xmm7, xmm6     ;# H1 H2
-	;# coulomb table ready, in xmm4-xmm7  	
-
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	movaps xmm3, [rsp + nb430nf_qq]
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-	mulps  xmm5, xmm3 ;# vcoul=qq*VV  
-	addps  xmm5, [rsp + nb430nf_vctot]
-	movaps [rsp + nb430nf_vctot], xmm5 
-
-	movaps xmm4, [rsp + nb430nf_r]
-	mulps xmm4, [rsp + nb430nf_tsc]
-	
-	cvttps2pi mm6, xmm4
-	cvtpi2ps xmm6, mm6
-	subps xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulps  xmm2, xmm2	;# xmm2=eps2 
-	pslld mm6, 3
-	
-	mov  rsi, [rbp + nb430nf_VFtab]
-	movd ecx, mm6
-	psrlq mm6, 32
-	movd edx, mm6
-	
-	;# dispersion 
-	movaps xmm4, [rsi + rcx*4]
-	movaps xmm7, [rsi + rdx*4]
-	;# transpose, using xmm3 for scratch
-	movaps xmm6, xmm4
-	unpcklps xmm4, xmm7  	;# Y1 Y2 F1 F2 
-	unpckhps xmm6, xmm7     ;# G1 G2 H1 H2
-	movhlps  xmm5, xmm4    	;# F1 F2 
-	movhlps  xmm7, xmm6     ;# H1 H2
-	;# dispersion table ready, in xmm4-xmm7 	
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
-
-	mulps  xmm5, [rsp + nb430nf_c6]	 ;# Vvdw6 
-	addps  xmm5, [rsp + nb430nf_Vvdwtot]
-	movaps [rsp + nb430nf_Vvdwtot], xmm5
-
-	;# repulsion 
-	movaps xmm4, [rsi + rcx*4 + 16]
-	movaps xmm7, [rsi + rdx*4 + 16]
-	;# transpose, using xmm3 for scratch
-	movaps xmm6, xmm4
-	unpcklps xmm4, xmm7  	;# Y1 Y2 F1 F2 
-	unpckhps xmm6, xmm7     ;# G1 G2 H1 H2
-	movhlps  xmm5, xmm4    	;# F1 F2 
-	movhlps  xmm7, xmm6     ;# H1 H2
-	;# table ready, in xmm4-xmm7 	
-	mulps  xmm6, xmm1	;# xmm6=Geps 
-	mulps  xmm7, xmm2	;# xmm7=Heps2 
-	addps  xmm5, xmm6
-	addps  xmm5, xmm7	;# xmm5=Fp 	
-	mulps  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addps  xmm5, xmm4 ;# xmm5=VV 
- 	
-	mulps  xmm5, [rsp + nb430nf_c12] ;# Vvdw12 
-	
-	addps  xmm5, [rsp + nb430nf_Vvdwtot]
-	movaps [rsp + nb430nf_Vvdwtot], xmm5
-.nb430nf_checksingle:				
-	mov   edx, [rsp + nb430nf_innerk]
-	and   edx, 1
-	jnz    .nb430nf_dosingle
-	jmp    .nb430nf_updateouterdata
-.nb430nf_dosingle:
-	mov rsi, [rbp + nb430nf_charge]
-	mov rdx, [rbp + nb430nf_invsqrta]
-	mov rdi, [rbp + nb430nf_pos]
-	mov   rcx, [rsp + nb430nf_innerjjnr]
-	mov   eax, [rcx]	
-	xorps  xmm2, xmm2
-	movaps xmm6, xmm2
-	movss xmm2, [rdx + rax*4]	;# isa2
-	mulss xmm2, [rsp + nb430nf_isai]
-	movss [rsp + nb430nf_isaprod], xmm2	
-	movss xmm1, xmm2
-	mulss xmm1, [rsp + nb430nf_gbtsc]
-	movss [rsp + nb430nf_gbscale], xmm1	
-	
-	mulss  xmm2, [rsp + nb430nf_iq]
-	movss xmm6, [rsi + rax*4]	;# xmm6(0) has the charge 	
-	mulss  xmm6, xmm2
-	movss [rsp + nb430nf_qq], xmm6
-	
-	mov rsi, [rbp + nb430nf_type]
-	mov ecx, eax
-	mov ecx, [rsi + rcx*4]	
-	mov rsi, [rbp + nb430nf_vdwparam]
-	shl ecx, 1
-	add ecx, [rsp + nb430nf_ntia]
-	movlps xmm6, [rsi + rcx*4]
-	movaps xmm4, xmm6
-	shufps xmm4, xmm4, 252  ;# 11111100	
-	shufps xmm6, xmm6, 253  ;# 11111101	
-	
-	movss [rsp + nb430nf_c6], xmm4
-	movss [rsp + nb430nf_c12], xmm6	
-	
-	lea   rax, [rax + rax*2]
-	
-	;# move coordinates to xmm0-xmm2 
-	movss xmm0, [rdi + rax*4]	
-	movss xmm1, [rdi + rax*4 + 4]	
-	movss xmm2, [rdi + rax*4 + 8]	 
-	
-	movss xmm4, [rsp + nb430nf_ix]
-	movss xmm5, [rsp + nb430nf_iy]
-	movss xmm6, [rsp + nb430nf_iz]
-
-	;# calc dr 
-	subss xmm4, xmm0
-	subss xmm5, xmm1
-	subss xmm6, xmm2
-
-	;# square it 
-	mulss xmm4,xmm4
-	mulss xmm5,xmm5
-	mulss xmm6,xmm6
-	addss xmm4, xmm5
-	addss xmm4, xmm6
-	;# rsq in xmm4 
-
-	rsqrtss xmm5, xmm4
-	;# lookup seed in xmm5 
-	movaps xmm2, xmm5
-	mulss xmm5, xmm5
-	movss xmm1, [rsp + nb430nf_three]
-	mulss xmm5, xmm4	;# rsq*lu*lu 			
-	movss xmm0, [rsp + nb430nf_half]
-	subss xmm1, xmm5	;# 30-rsq*lu*lu 
-	mulss xmm1, xmm2	
-	mulss xmm0, xmm1	;# xmm0=rinv 
-
-	mulss xmm4, xmm0	;# xmm4=r 
-	movaps [rsp + nb430nf_r], xmm4
-	mulss xmm4, [rsp + nb430nf_gbscale]
-
-	cvttss2si ebx, xmm4     ;# mm6 contain lu indices 
-	cvtsi2ss xmm6, ebx
-	subss xmm4, xmm6	
-	movaps xmm1, xmm4	;# xmm1=eps 
-	movaps xmm2, xmm1	
-	mulss  xmm2, xmm2	;# xmm2=eps2 
-
-	shl ebx, 2
-
-	mov  rsi, [rbp + nb430nf_GBtab]
-	
-	movaps xmm4, [rsi + rbx*4]	
-	movhlps xmm6, xmm4
-	movaps xmm5, xmm4
-	movaps xmm7, xmm6
-	shufps xmm5, xmm5, 1
-	shufps xmm7, xmm7, 1
-	;# table ready in xmm4-xmm7 
-
-	mulss  xmm6, xmm1	;# xmm6=Geps 
-	mulss  xmm7, xmm2	;# xmm7=Heps2 
-	addss  xmm5, xmm6
-	addss  xmm5, xmm7	;# xmm5=Fp 	
-	movss xmm3, [rsp + nb430nf_qq]
-	mulss  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addss  xmm5, xmm4 ;# xmm5=VV 
-	mulss  xmm5, xmm3 ;# vcoul=qq*VV  
-	addss  xmm5, [rsp + nb430nf_vctot]
-	movss [rsp + nb430nf_vctot], xmm5
-	
-	movss xmm4, [rsp + nb430nf_r]
-	mulps xmm4, [rsp + nb430nf_tsc]
-	
-	cvttss2si ebx, xmm4
-	cvtsi2ss xmm6, ebx
-	subss xmm4, xmm6	
-	movss xmm1, xmm4	;# xmm1=eps 
-	movss xmm2, xmm1	
-	mulss  xmm2, xmm2	;# xmm2=eps2 
-
-	shl ebx, 3
-	mov  rsi, [rbp + nb430nf_VFtab]
-	
-	;# dispersion 
-	movaps xmm4, [rsi + rbx*4]	
-	movhlps xmm6, xmm4
-	movaps xmm5, xmm4
-	movaps xmm7, xmm6
-	shufps xmm5, xmm5, 1
-	shufps xmm7, xmm7, 1
-	;# table ready in xmm4-xmm7 
-	
-	mulss  xmm6, xmm1	;# xmm6=Geps 
-	mulss  xmm7, xmm2	;# xmm7=Heps2 
-	addss  xmm5, xmm6
-	addss  xmm5, xmm7	;# xmm5=Fp 	
-	mulss  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addss  xmm5, xmm4 ;# xmm5=VV 
-	mulss  xmm5, [rsp + nb430nf_c6]	 ;# Vvdw6
-	addss  xmm5, [rsp + nb430nf_Vvdwtot]
-	movss [rsp + nb430nf_Vvdwtot], xmm5
-
-	;# repulsion 
-	movaps xmm4, [rsi + rbx*4 + 16]	
-	movhlps xmm6, xmm4
-	movaps xmm5, xmm4
-	movaps xmm7, xmm6
-	shufps xmm5, xmm5, 1
-	shufps xmm7, xmm7, 1
-	;# table ready in xmm4-xmm7 
-	
-	mulss  xmm6, xmm1	;# xmm6=Geps 
-	mulss  xmm7, xmm2	;# xmm7=Heps2 
-	addss  xmm5, xmm6
-	addss  xmm5, xmm7	;# xmm5=Fp 	
-	mulss  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addss  xmm5, xmm4 ;# xmm5=VV 
- 	
-	mulss  xmm5, [rsp + nb430nf_c12] ;# Vvdw12 
-	
-	addss  xmm5, [rsp + nb430nf_Vvdwtot]
-	movss [rsp + nb430nf_Vvdwtot], xmm5
-
-.nb430nf_updateouterdata:
-	;# get n from stack
-	mov esi, [rsp + nb430nf_n]
-        ;# get group index for i particle 
-        mov   rdx, [rbp + nb430nf_gid]      	;# base of gid[]
-        mov   edx, [rdx + rsi*4]		;# ggid=gid[n]
-
-	;# accumulate total potential energy and update it 
-	movaps xmm7, [rsp + nb430nf_vctot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-
-	;# add earlier value from mem 
-	mov   rax, [rbp + nb430nf_Vc]
-	addss xmm7, [rax + rdx*4] 
-	;# move back to mem 
-	movss [rax + rdx*4], xmm7 
-	
-	;# accumulate total lj energy and update it 
-	movaps xmm7, [rsp + nb430nf_Vvdwtot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addps  xmm7, xmm6	;# pos 0-1 in xmm7 have the sum now 
-	movaps xmm6, xmm7
-	shufps xmm6, xmm6, 1
-	addss  xmm7, xmm6		
-
-	;# add earlier value from mem 
-	mov   rax, [rbp + nb430nf_Vvdw]
-	addss xmm7, [rax + rdx*4] 
-	;# move back to mem 
-	movss [rax + rdx*4], xmm7 
-	
-        ;# finish if last 
-        mov ecx, [rsp + nb430nf_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb430nf_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [rsp + nb430nf_n], esi
-        jmp .nb430nf_outer
-.nb430nf_outerend:
-        ;# check if more outer neighborlists remain
-        mov   ecx, [rsp + nb430nf_nri]
-	;# esi already loaded with n above
-        sub   ecx, esi
-        jz .nb430nf_end
-        ;# non-zero, do one more workunit
-        jmp   .nb430nf_threadloop
-.nb430nf_end:
-
-	mov eax, [rsp + nb430nf_nouter]
-	mov ebx, [rsp + nb430nf_ninner]
-	mov rcx, [rbp + nb430nf_outeriter]
-	mov rdx, [rbp + nb430nf_inneriter]
-	mov [rcx], eax
-	mov [rdx], ebx
-
-	add rsp, 392
-	emms
-
-
-        pop r15
-        pop r14
-        pop r13
-        pop r12
-
-	pop rbx
-	pop	rbp
-	ret
-
-
-
-	
-
diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.s
deleted file mode 100644
index b25797c2a4..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.s
+++ /dev/null
@@ -1,2306 +0,0 @@
-##
-##
-## Gromacs 4.0                         Copyright (c) 1991-2003 
-## David van der Spoel, Erik Lindahl
-##
-## This program is free software; you can redistribute it and/or
-## modify it under the terms of the GNU General Public License
-## as published by the Free Software Foundation; either version 2
-## of the License, or (at your option) any later version.
-##
-## To help us fund GROMACS development, we humbly ask that you cite
-## the research papers on the package. Check out http://www.gromacs.org
-## 
-## And Hey:
-## Gnomes, ROck Monsters And Chili Sauce
-##
-
-
-
-
-
-
-
-.globl nb_kernel430_x86_64_sse
-.globl _nb_kernel430_x86_64_sse
-nb_kernel430_x86_64_sse:        
-_nb_kernel430_x86_64_sse:       
-##      Room for return address and rbp (16 bytes)
-.set nb430_fshift, 16
-.set nb430_gid, 24
-.set nb430_pos, 32
-.set nb430_faction, 40
-.set nb430_charge, 48
-.set nb430_p_facel, 56
-.set nb430_argkrf, 64
-.set nb430_argcrf, 72
-.set nb430_Vc, 80
-.set nb430_type, 88
-.set nb430_p_ntype, 96
-.set nb430_vdwparam, 104
-.set nb430_Vvdw, 112
-.set nb430_p_tabscale, 120
-.set nb430_VFtab, 128
-.set nb430_invsqrta, 136
-.set nb430_dvda, 144
-.set nb430_p_gbtabscale, 152
-.set nb430_GBtab, 160
-.set nb430_p_nthreads, 168
-.set nb430_count, 176
-.set nb430_mtx, 184
-.set nb430_outeriter, 192
-.set nb430_inneriter, 200
-.set nb430_work, 208
-        ## stack offsets for local variables  
-        ## bottom of stack is cache-aligned for sse use 
-.set nb430_ix, 0
-.set nb430_iy, 16
-.set nb430_iz, 32
-.set nb430_iq, 48
-.set nb430_dx, 64
-.set nb430_dy, 80
-.set nb430_dz, 96
-.set nb430_eps, 112
-.set nb430_gbtsc, 128
-.set nb430_tsc, 144
-.set nb430_qq, 160
-.set nb430_c6, 176
-.set nb430_c12, 192
-.set nb430_epsgb, 208
-.set nb430_vctot, 224
-.set nb430_Vvdwtot, 240
-.set nb430_fix, 256
-.set nb430_fiy, 272
-.set nb430_fiz, 288
-.set nb430_half, 304
-.set nb430_three, 320
-.set nb430_r, 336
-.set nb430_isai, 352
-.set nb430_isaprod, 368
-.set nb430_dvdasum, 384
-.set nb430_gbscale, 400
-.set nb430_rinv, 416
-.set nb430_nri, 432
-.set nb430_iinr, 440
-.set nb430_jindex, 448
-.set nb430_jjnr, 456
-.set nb430_shift, 464
-.set nb430_shiftvec, 472
-.set nb430_facel, 480
-.set nb430_innerjjnr, 488
-.set nb430_ii, 496
-.set nb430_is3, 500
-.set nb430_ii3, 504
-.set nb430_ntia, 508
-.set nb430_innerk, 512
-.set nb430_n, 516
-.set nb430_nn1, 520
-.set nb430_ntype, 524
-.set nb430_nouter, 528
-.set nb430_ninner, 532
-
-        push %rbp
-        movq %rsp,%rbp
-        push %rbx
-
-
-        emms
-
-        push %r12
-        push %r13
-        push %r14
-        push %r15
-
-        subq $552,%rsp          ## local variable stack space (n*16+8)
-
-        ## zero 32-bit iteration counters
-        movl $0,%eax
-        movl %eax,nb430_nouter(%rsp)
-        movl %eax,nb430_ninner(%rsp)
-
-
-
-        movl (%rdi),%edi
-        movl %edi,nb430_nri(%rsp)
-        movq %rsi,nb430_iinr(%rsp)
-        movq %rdx,nb430_jindex(%rsp)
-        movq %rcx,nb430_jjnr(%rsp)
-        movq %r8,nb430_shift(%rsp)
-        movq %r9,nb430_shiftvec(%rsp)
-        movq nb430_p_ntype(%rbp),%rdi
-        movl (%rdi),%edi
-        movl %edi,nb430_ntype(%rsp)
-        movq nb430_p_facel(%rbp),%rsi
-        movss (%rsi),%xmm0
-        movss %xmm0,nb430_facel(%rsp)
-
-        movq nb430_p_tabscale(%rbp),%rax
-        movss (%rax),%xmm3
-        shufps $0,%xmm3,%xmm3
-        movaps %xmm3,nb430_tsc(%rsp)
-
-        movq nb430_p_gbtabscale(%rbp),%rbx
-        movss (%rbx),%xmm4
-        shufps $0,%xmm4,%xmm4
-        movaps %xmm4,nb430_gbtsc(%rsp)
-
-
-        ## create constant floating-point factors on stack
-        movl $0x3f000000,%eax   ## half in IEEE (hex)
-        movl %eax,nb430_half(%rsp)
-        movss nb430_half(%rsp),%xmm1
-        shufps $0,%xmm1,%xmm1  ## splat to all elements
-        movaps %xmm1,%xmm2
-        addps  %xmm2,%xmm2      ## one
-        movaps %xmm2,%xmm3
-        addps  %xmm2,%xmm2      ## two
-        addps  %xmm2,%xmm3      ## three
-        movaps %xmm1,nb430_half(%rsp)
-        movaps %xmm3,nb430_three(%rsp)
-
-_nb_kernel430_x86_64_sse.nb430_threadloop: 
-        movq  nb430_count(%rbp),%rsi            ## pointer to sync counter
-        movl  (%rsi),%eax
-_nb_kernel430_x86_64_sse.nb430_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+10
-        lock 
-        cmpxchgl %ebx,(%rsi)                    ## write nn1 to *counter,
-                                                ## if it hasnt changed.
-                                                ## or reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel430_x86_64_sse.nb430_spinlock
-
-        ## if(nn1>nri) nn1=nri
-        movl nb430_nri(%rsp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx
-        cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb430_n(%rsp)
-        movl %ebx,nb430_nn1(%rsp)
-        subl %eax,%ebx                          ## calc number of outer lists
-        movl %eax,%esi                          ## copy n to esi
-        jg  _nb_kernel430_x86_64_sse.nb430_outerstart
-        jmp _nb_kernel430_x86_64_sse.nb430_end
-
-_nb_kernel430_x86_64_sse.nb430_outerstart: 
-        ## ebx contains number of outer iterations
-        addl nb430_nouter(%rsp),%ebx
-        movl %ebx,nb430_nouter(%rsp)
-
-_nb_kernel430_x86_64_sse.nb430_outer: 
-        movq  nb430_shift(%rsp),%rax        ## rax = pointer into shift[] 
-        movl  (%rax,%rsi,4),%ebx                ## ebx=shift[n] 
-
-        lea  (%rbx,%rbx,2),%rbx    ## rbx=3*is 
-        movl  %ebx,nb430_is3(%rsp)      ## store is3 
-
-        movq  nb430_shiftvec(%rsp),%rax     ## rax = base of shiftvec[] 
-
-        movss (%rax,%rbx,4),%xmm0
-        movss 4(%rax,%rbx,4),%xmm1
-        movss 8(%rax,%rbx,4),%xmm2
-
-        movq  nb430_iinr(%rsp),%rcx         ## rcx = pointer into iinr[]
-        movl  (%rcx,%rsi,4),%ebx            ## ebx =ii 
-        movl  %ebx,nb430_ii(%rsp)
-
-        movq  nb430_charge(%rbp),%rdx
-        movss (%rdx,%rbx,4),%xmm3
-        mulss nb430_facel(%rsp),%xmm3
-        shufps $0,%xmm3,%xmm3
-
-        movq  nb430_invsqrta(%rbp),%rdx         ## load invsqrta[ii]
-        movss (%rdx,%rbx,4),%xmm4
-        shufps $0,%xmm4,%xmm4
-
-        movq  nb430_type(%rbp),%rdx
-        movl  (%rdx,%rbx,4),%edx
-        imull nb430_ntype(%rsp),%edx
-        shll  %edx
-        movl  %edx,nb430_ntia(%rsp)
-
-        lea  (%rbx,%rbx,2),%rbx        ## rbx = 3*ii=ii3 
-        movq  nb430_pos(%rbp),%rax      ## rax = base of pos[]  
-
-        addss (%rax,%rbx,4),%xmm0
-        addss 4(%rax,%rbx,4),%xmm1
-        addss 8(%rax,%rbx,4),%xmm2
-
-        movaps %xmm3,nb430_iq(%rsp)
-        movaps %xmm4,nb430_isai(%rsp)
-
-        shufps $0,%xmm0,%xmm0
-        shufps $0,%xmm1,%xmm1
-        shufps $0,%xmm2,%xmm2
-
-        movaps %xmm0,nb430_ix(%rsp)
-        movaps %xmm1,nb430_iy(%rsp)
-        movaps %xmm2,nb430_iz(%rsp)
-
-        movl  %ebx,nb430_ii3(%rsp)
-
-        ## clear vctot and i forces 
-        xorps %xmm4,%xmm4
-        movaps %xmm4,nb430_vctot(%rsp)
-        movaps %xmm4,nb430_Vvdwtot(%rsp)
-        movaps %xmm4,nb430_dvdasum(%rsp)
-        movaps %xmm4,nb430_fix(%rsp)
-        movaps %xmm4,nb430_fiy(%rsp)
-        movaps %xmm4,nb430_fiz(%rsp)
-
-        movq  nb430_jindex(%rsp),%rax
-        movl  (%rax,%rsi,4),%ecx             ## jindex[n] 
-        movl  4(%rax,%rsi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movq  nb430_pos(%rbp),%rsi
-        movq  nb430_faction(%rbp),%rdi
-        movq  nb430_jjnr(%rsp),%rax
-        shll  $2,%ecx
-        addq  %rcx,%rax
-        movq  %rax,nb430_innerjjnr(%rsp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $4,%edx
-        addl  nb430_ninner(%rsp),%ecx
-        movl  %ecx,nb430_ninner(%rsp)
-        addl  $0,%edx
-        movl  %edx,nb430_innerk(%rsp)      ## number of innerloop atoms
-
-        jge   _nb_kernel430_x86_64_sse.nb430_unroll_loop
-        jmp   _nb_kernel430_x86_64_sse.nb430_finish_inner
-_nb_kernel430_x86_64_sse.nb430_unroll_loop: 
-        ## quad-unroll innerloop here 
-        movq  nb430_innerjjnr(%rsp),%rdx       ## pointer to jjnr[k] 
-        movl  (%rdx),%eax
-        movl  4(%rdx),%ebx
-        movl  8(%rdx),%ecx
-        movl  12(%rdx),%edx           ## eax-edx=jnr1-4 
-
-        addq $16,nb430_innerjjnr(%rsp)             ## advance pointer (unrolled 4) 
-
-        ## load isaj
-        movq nb430_invsqrta(%rbp),%rsi
-        movss (%rsi,%rax,4),%xmm3
-        movss (%rsi,%rcx,4),%xmm4
-        movss (%rsi,%rbx,4),%xmm6
-        movss (%rsi,%rdx,4),%xmm7
-        movaps nb430_isai(%rsp),%xmm2
-        shufps $0,%xmm6,%xmm3
-        shufps $0,%xmm7,%xmm4
-        shufps $136,%xmm4,%xmm3 ## 10001000 ;# all isaj in xmm3 
-        mulps  %xmm3,%xmm2
-
-        movaps %xmm2,nb430_isaprod(%rsp)
-        movaps %xmm2,%xmm1
-        mulps nb430_gbtsc(%rsp),%xmm1
-        movaps %xmm1,nb430_gbscale(%rsp)
-
-        movq nb430_charge(%rbp),%rsi     ## base of charge[] 
-
-        movss (%rsi,%rax,4),%xmm3
-        movss (%rsi,%rcx,4),%xmm4
-        movss (%rsi,%rbx,4),%xmm6
-        movss (%rsi,%rdx,4),%xmm7
-
-        mulps nb430_iq(%rsp),%xmm2
-        shufps $0,%xmm6,%xmm3
-        shufps $0,%xmm7,%xmm4
-        shufps $136,%xmm4,%xmm3 ## 10001000 ;# all charges in xmm3  
-        mulps  %xmm2,%xmm3
-        movaps %xmm3,nb430_qq(%rsp)
-
-    ## vdw parameters
-        movq nb430_type(%rbp),%rsi
-        movl (%rsi,%rax,4),%r12d
-        movl (%rsi,%rbx,4),%r13d
-        movl (%rsi,%rcx,4),%r14d
-        movl (%rsi,%rdx,4),%r15d
-        shll %r12d
-        shll %r13d
-        shll %r14d
-        shll %r15d
-    movl nb430_ntia(%rsp),%edi
-        addl %edi,%r12d
-        addl %edi,%r13d
-        addl %edi,%r14d
-        addl %edi,%r15d
-
-        movq nb430_vdwparam(%rbp),%rsi
-        movlps (%rsi,%r12,4),%xmm3
-        movlps (%rsi,%r14,4),%xmm7
-        movhps (%rsi,%r13,4),%xmm3
-        movhps (%rsi,%r15,4),%xmm7
-
-        movaps %xmm3,%xmm0
-        shufps $136,%xmm7,%xmm0 ## 10001000
-        shufps $221,%xmm7,%xmm3 ## 11011101
-
-    movaps %xmm0,nb430_c6(%rsp)
-    movaps %xmm3,nb430_c12(%rsp)
-
-        movq nb430_pos(%rbp),%rsi        ## base of pos[] 
-
-        lea  (%rax,%rax,2),%r8     ## jnr
-        lea  (%rbx,%rbx,2),%r9
-        lea  (%rcx,%rcx,2),%r10
-        lea  (%rdx,%rdx,2),%r11
-
-        ## move four coordinates to xmm0-xmm2   
-        movlps (%rsi,%r8,4),%xmm4
-        movlps (%rsi,%r10,4),%xmm5
-        movss 8(%rsi,%r8,4),%xmm2
-        movss 8(%rsi,%r10,4),%xmm6
-
-        movhps (%rsi,%r9,4),%xmm4
-        movhps (%rsi,%r11,4),%xmm5
-
-        movss 8(%rsi,%r9,4),%xmm0
-        movss 8(%rsi,%r11,4),%xmm1
-
-        shufps $0,%xmm0,%xmm2
-        shufps $0,%xmm1,%xmm6
-
-        movaps %xmm4,%xmm0
-        movaps %xmm4,%xmm1
-
-        shufps $136,%xmm6,%xmm2 ## 10001000
-
-        shufps $136,%xmm5,%xmm0 ## 10001000
-        shufps $221,%xmm5,%xmm1 ## 11011101             
-
-        ## calc dr 
-        subps nb430_ix(%rsp),%xmm0
-        subps nb430_iy(%rsp),%xmm1
-        subps nb430_iz(%rsp),%xmm2
-
-        ## store dr 
-        movaps %xmm0,nb430_dx(%rsp)
-        movaps %xmm1,nb430_dy(%rsp)
-        movaps %xmm2,nb430_dz(%rsp)
-
-    movd %r8,%mm0 ## store j3
-    movd %r9,%mm1
-    movd %r10,%mm2
-    movd %r11,%mm3
-
-        ## square it 
-        mulps %xmm0,%xmm0
-        mulps %xmm1,%xmm1
-        mulps %xmm2,%xmm2
-        addps %xmm1,%xmm0
-        addps %xmm2,%xmm0
-    movaps %xmm0,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb430_three(%rsp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb430_half(%rsp),%xmm0
-        subps %xmm5,%xmm1       ## 30-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r
-        movaps %xmm4,nb430_r(%rsp)
-    movaps %xmm0,nb430_rinv(%rsp)
-
-    movaps %xmm4,%xmm8   ## r
-        mulps nb430_gbscale(%rsp),%xmm4   ## rgbtab
-    mulps nb430_tsc(%rsp),%xmm8      ## rtab
-
-    ## truncate and convert to integers
-    cvttps2dq %xmm4,%xmm5 ## gb
-    cvttps2dq %xmm8,%xmm9 ## lj
-
-    ## convert back to float
-    cvtdq2ps  %xmm5,%xmm6  ## gb
-    cvtdq2ps  %xmm9,%xmm10 ## lj
-
-    ## multiply by 4 and 8, respectively
-    pslld   $2,%xmm5  ## gb
-    pslld   $3,%xmm9  ## lj
-
-    ## move to integer registers
-    movhlps %xmm5,%xmm7    ## gb
-    movhlps %xmm9,%xmm11   ## lj
-    movd    %xmm5,%r8d      ## gb
-    movd    %xmm9,%r12d     ## lj
-    movd    %xmm7,%r10d     ## gb
-    movd    %xmm11,%r14d    ## lj
-    pshufd $1,%xmm5,%xmm5 ## gb
-    pshufd $1,%xmm9,%xmm9 ## lj
-    pshufd $1,%xmm7,%xmm7 ## gb
-    pshufd $1,%xmm11,%xmm11 ## lj
-    movd    %xmm5,%r9d      ## gb
-    movd    %xmm9,%r13d     ## lj
-    movd    %xmm7,%r11d     ## gb
-    movd    %xmm11,%r15d    ## lj
-    ## GB indices: r8-r11   LJ indices: r12-r15
-
-    ## calculate eps
-    subps     %xmm6,%xmm4  ## gb
-    subps     %xmm10,%xmm8 ## lj
-    movaps    %xmm4,nb430_epsgb(%rsp)   ## gb eps
-    movaps    %xmm8,nb430_eps(%rsp)   ## lj eps
-
-        movq nb430_GBtab(%rbp),%rsi
-        movq nb430_VFtab(%rbp),%rdi
-
-    ## load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11
-        movlps (%rsi,%r8,4),%xmm1         ## Y1c F1c 
-        movlps (%rdi,%r12,4),%xmm5        ## Y1d F1d 
-        movlps 16(%rdi,%r12,4),%xmm9      ## Y1r F1r 
-
-        movlps (%rsi,%r10,4),%xmm3        ## Y3c F3c 
-        movlps (%rdi,%r14,4),%xmm7        ## Y3d F3d 
-        movlps 16(%rdi,%r14,4),%xmm11     ## Y3r F3r 
-
-        movhps (%rsi,%r9,4),%xmm1         ## Y1c F1c Y2c F2c
-        movhps (%rdi,%r13,4),%xmm5        ## Y1d F1d Y2d F2d
-        movhps 16(%rdi,%r13,4),%xmm9      ## Y1r F1r Y2r F2r
-
-        movhps (%rsi,%r11,4),%xmm3        ## Y3c F3c Y4c F4c
-        movhps (%rdi,%r15,4),%xmm7        ## Y3d F3d Y4d F4d
-        movhps 16(%rdi,%r15,4),%xmm11     ## Y3r F3r Y4r F4r
-
-    movaps %xmm1,%xmm0
-    movaps %xmm5,%xmm4
-    movaps %xmm9,%xmm8
-        shufps $136,%xmm3,%xmm0 ## 10001000   => Y1c Y2c Y3c Y4c
-        shufps $136,%xmm7,%xmm4 ## 10001000   => Y1d Y2d Y3d Y4d
-        shufps $136,%xmm11,%xmm8 ## 10001000  => Y1r Y2r Y3r Y4r
-        shufps $221,%xmm3,%xmm1 ## 11011101   => F1c F2c F3c F4c
-        shufps $221,%xmm7,%xmm5 ## 11011101   => F1d F2d F3d F4d
-        shufps $221,%xmm11,%xmm9 ## 11011101  => F1r F2r F3r F4r
-
-        movlps 8(%rsi,%r8,4),%xmm3         ## G1c H1c 
-        movlps 8(%rdi,%r12,4),%xmm7        ## G1d H1d 
-        movlps 24(%rdi,%r12,4),%xmm11      ## G1r H1r 
-
-        movlps 8(%rsi,%r10,4),%xmm12       ## G3c H3c 
-        movlps 8(%rdi,%r14,4),%xmm13       ## G3d H3d 
-        movlps 24(%rdi,%r14,4),%xmm14      ## G3r H3r 
-
-        movhps 8(%rsi,%r9,4),%xmm3         ## G1c H1c G2c H2c
-        movhps 8(%rdi,%r13,4),%xmm7        ## G1d H1d G2d H2d
-        movhps 24(%rdi,%r13,4),%xmm11      ## G1r H1r G2r H2r
-
-        movhps 8(%rsi,%r11,4),%xmm12       ## G3c H3c G4c H4c
-        movhps 8(%rdi,%r15,4),%xmm13       ## G3d H3d G4d H4d
-        movhps 24(%rdi,%r15,4),%xmm14      ## G3r H3r G4r H4r
-    movaps %xmm3,%xmm2
-    movaps %xmm7,%xmm6
-    movaps %xmm11,%xmm10
-
-        shufps $136,%xmm12,%xmm2 ## 10001000  => G1c G2c G3c G4c
-        shufps $136,%xmm13,%xmm6 ## 10001000  => G1d G2d G3d G4d
-        shufps $136,%xmm14,%xmm10 ## 10001000 => G1r G2r G3r G4r
-        shufps $221,%xmm12,%xmm3 ## 11011101  => H1c H2c H3c H4c
-        shufps $221,%xmm13,%xmm7 ## 11011101  => H1d H2d H3d H4d
-        shufps $221,%xmm14,%xmm11 ## 11011101 => H1r H2r H3r H4r
-    ## table data ready. Coul in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11
-
-    movaps nb430_epsgb(%rsp),%xmm12
-    movaps nb430_eps(%rsp),%xmm13
-
-    mulps  %xmm12,%xmm3  ## Heps
-    mulps  %xmm13,%xmm7
-    mulps  %xmm13,%xmm11
-    mulps  %xmm12,%xmm2    ## Geps
-    mulps  %xmm13,%xmm6
-    mulps  %xmm13,%xmm10
-    mulps  %xmm12,%xmm3  ## Heps2
-    mulps  %xmm13,%xmm7
-    mulps  %xmm13,%xmm11
-
-    addps  %xmm2,%xmm1  ## F+Geps
-    addps  %xmm6,%xmm5
-    addps  %xmm10,%xmm9
-    addps  %xmm3,%xmm1  ## F+Geps+Heps2 = Fp
-    addps  %xmm7,%xmm5
-    addps  %xmm11,%xmm9
-    addps  %xmm3,%xmm3   ## 2*Heps2
-    addps  %xmm7,%xmm7
-    addps  %xmm11,%xmm11
-    addps  %xmm2,%xmm3   ## 2*Heps2+Geps
-    addps  %xmm6,%xmm7
-    addps  %xmm10,%xmm11
-    addps  %xmm1,%xmm3  ## FF = Fp + 2*Heps2 + Geps
-    addps  %xmm5,%xmm7
-    addps  %xmm9,%xmm11
-    mulps  %xmm12,%xmm1  ## eps*Fp
-    mulps  %xmm13,%xmm5
-    mulps  %xmm13,%xmm9
-    addps  %xmm0,%xmm1    ## VV
-    addps  %xmm4,%xmm5
-    addps  %xmm8,%xmm9
-    mulps  nb430_qq(%rsp),%xmm1     ## VV*qq = vcoul
-    mulps  nb430_c6(%rsp),%xmm5     ## vnb6
-    mulps  nb430_c12(%rsp),%xmm9     ## vnb12
-    mulps  nb430_qq(%rsp),%xmm3      ## FF*qq = fij
-    mulps  nb430_c6(%rsp),%xmm7     ## fijD
-    mulps  nb430_c12(%rsp),%xmm11     ##fijR
-
-    addps  %xmm7,%xmm11 ## fijD+fijR
-    mulps  nb430_tsc(%rsp),%xmm11   ## (fijD+fijR)*tabscale
-
-    ## accumulate Vvdwtot
-    addps  nb430_Vvdwtot(%rsp),%xmm5
-    addps  %xmm9,%xmm5
-    movaps %xmm5,nb430_Vvdwtot(%rsp)
-
-        movq nb430_dvda(%rbp),%rsi
-
-        ## Calculate dVda
-        mulps nb430_gbscale(%rsp),%xmm3     ## fijC=qq*FF*gbscale
-        movaps %xmm3,%xmm6
-        mulps  nb430_r(%rsp),%xmm6
-        addps  %xmm1,%xmm6  ## vcoul+fijC*r
-
-    addps  %xmm11,%xmm3 ## fijC+fijD+fijR
-
-    ## increment vctot
-        addps  nb430_vctot(%rsp),%xmm1
-    movaps %xmm1,nb430_vctot(%rsp)
-
-        ## xmm6=(vcoul+fijC*r)
-        xorps  %xmm7,%xmm7
-        subps  %xmm6,%xmm7
-        movaps %xmm7,%xmm6
-
-        ## update dvdasum 
-        addps  nb430_dvdasum(%rsp),%xmm7
-    movaps %xmm7,nb430_dvdasum(%rsp)
-
-        ## update j atoms dvdaj
-        movhlps %xmm6,%xmm7
-        movaps  %xmm6,%xmm5
-        movaps  %xmm7,%xmm4
-        shufps $0x1,%xmm5,%xmm5
-        shufps $0x1,%xmm4,%xmm4
-
-        ## xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4
-        addss  (%rsi,%rax,4),%xmm6
-        addss  (%rsi,%rbx,4),%xmm5
-        addss  (%rsi,%rcx,4),%xmm7
-        addss  (%rsi,%rdx,4),%xmm4
-        movss  %xmm6,(%rsi,%rax,4)
-        movss  %xmm5,(%rsi,%rbx,4)
-        movss  %xmm7,(%rsi,%rcx,4)
-        movss  %xmm4,(%rsi,%rdx,4)
-
-        xorps  %xmm4,%xmm4
-        mulps nb430_rinv(%rsp),%xmm3
-        subps  %xmm3,%xmm4
-
-    movd %mm0,%r8  ## fetch j3
-    movd %mm1,%r9
-    movd %mm2,%r10
-    movd %mm3,%r11
-
-    movaps  %xmm4,%xmm9
-    movaps  %xmm4,%xmm10
-    movaps  %xmm4,%xmm11
-
-    mulps  nb430_dx(%rsp),%xmm9
-    mulps  nb430_dy(%rsp),%xmm10
-    mulps  nb430_dz(%rsp),%xmm11
-
-        ## accumulate i forces
-    movaps nb430_fix(%rsp),%xmm12
-    movaps nb430_fiy(%rsp),%xmm13
-    movaps nb430_fiz(%rsp),%xmm14
-    addps %xmm9,%xmm12
-    addps %xmm10,%xmm13
-    addps %xmm11,%xmm14
-    movaps %xmm12,nb430_fix(%rsp)
-    movaps %xmm13,nb430_fiy(%rsp)
-    movaps %xmm14,nb430_fiz(%rsp)
-
-        movq nb430_faction(%rbp),%rsi
-        ## the fj's - start by accumulating x & y forces from memory 
-        movlps (%rsi,%r8,4),%xmm0 ## x1 y1 - -
-        movlps (%rsi,%r10,4),%xmm1 ## x3 y3 - -
-        movhps (%rsi,%r9,4),%xmm0 ## x1 y1 x2 y2
-        movhps (%rsi,%r11,4),%xmm1 ## x3 y3 x4 y4
-
-    movaps %xmm9,%xmm8
-    unpcklps %xmm10,%xmm9 ## x1 y1 x2 y2
-    unpckhps %xmm10,%xmm8 ## x3 y3 x4 y4
-
-    ## update fjx and fjy
-        addps  %xmm9,%xmm0
-        addps  %xmm8,%xmm1
-
-        movlps %xmm0,(%rsi,%r8,4)
-        movlps %xmm1,(%rsi,%r10,4)
-        movhps %xmm0,(%rsi,%r9,4)
-        movhps %xmm1,(%rsi,%r11,4)
-
-    ## xmm11: fjz1 fjz2 fjz3 fjz4
-    pshufd $1,%xmm11,%xmm10 ## fjz2 - - -
-    movhlps %xmm11,%xmm9     ## fjz3 - - -
-    pshufd $3,%xmm11,%xmm8  ## fjz4 - - -
-
-        addss  8(%rsi,%r8,4),%xmm11
-        addss  8(%rsi,%r9,4),%xmm10
-        addss  8(%rsi,%r10,4),%xmm9
-        addss  8(%rsi,%r11,4),%xmm8
-        movss  %xmm11,8(%rsi,%r8,4)
-        movss  %xmm10,8(%rsi,%r9,4)
-        movss  %xmm9,8(%rsi,%r10,4)
-        movss  %xmm8,8(%rsi,%r11,4)
-
-        ## should we do one more iteration? 
-        subl $4,nb430_innerk(%rsp)
-        jl    _nb_kernel430_x86_64_sse.nb430_finish_inner
-        jmp   _nb_kernel430_x86_64_sse.nb430_unroll_loop
-_nb_kernel430_x86_64_sse.nb430_finish_inner: 
-        ## check if at least two particles remain 
-        addl $4,nb430_innerk(%rsp)
-        movl  nb430_innerk(%rsp),%edx
-        andl  $2,%edx
-        jnz   _nb_kernel430_x86_64_sse.nb430_dopair
-        jmp   _nb_kernel430_x86_64_sse.nb430_checksingle
-_nb_kernel430_x86_64_sse.nb430_dopair: 
-        movq  nb430_innerjjnr(%rsp),%rcx
-
-        movl  (%rcx),%eax
-        movl  4(%rcx),%ebx
-        addq $8,nb430_innerjjnr(%rsp)
-
-        ## load isaj
-        movq nb430_invsqrta(%rbp),%rsi
-        movss (%rsi,%rax,4),%xmm3
-        movss (%rsi,%rbx,4),%xmm6
-        movaps nb430_isai(%rsp),%xmm2
-    unpcklps %xmm6,%xmm3
-        mulps  %xmm3,%xmm2
-    movaps %xmm2,nb430_isaprod(%rsp)
-
-        movaps %xmm2,%xmm1
-        mulps nb430_gbtsc(%rsp),%xmm1
-        movaps %xmm1,nb430_gbscale(%rsp)
-
-        movq nb430_charge(%rbp),%rsi     ## base of charge[] 
-
-        movss (%rsi,%rax,4),%xmm3
-        movss (%rsi,%rbx,4),%xmm6
-    unpcklps %xmm6,%xmm3
-        mulps nb430_iq(%rsp),%xmm2
-        mulps  %xmm2,%xmm3
-        movaps %xmm3,nb430_qq(%rsp)
-
-    ## vdw parameters
-        movq nb430_type(%rbp),%rsi
-        movl (%rsi,%rax,4),%r12d
-        movl (%rsi,%rbx,4),%r13d
-        shll %r12d
-        shll %r13d
-    movl nb430_ntia(%rsp),%edi
-        addl %edi,%r12d
-        addl %edi,%r13d
-
-        movq nb430_vdwparam(%rbp),%rsi
-        movlps (%rsi,%r12,4),%xmm3
-        movhps (%rsi,%r13,4),%xmm3
-
-    xorps %xmm7,%xmm7
-        movaps %xmm3,%xmm0
-        shufps $136,%xmm7,%xmm0 ## 10001000
-        shufps $221,%xmm7,%xmm3 ## 11011101
-
-    movaps %xmm0,nb430_c6(%rsp)
-    movaps %xmm3,nb430_c12(%rsp)
-
-        movq nb430_pos(%rbp),%rsi        ## base of pos[] 
-
-        lea  (%rax,%rax,2),%r8     ## j3
-        lea  (%rbx,%rbx,2),%r9
-
-        ## move four coordinates to xmm0-xmm2   
-        movlps (%rsi,%r8,4),%xmm0       ## x1 y1 - - 
-        movlps (%rsi,%r9,4),%xmm1       ## x2 y2 - - 
-
-        movss 8(%rsi,%r8,4),%xmm2       ## z1 - - - 
-        movss 8(%rsi,%r9,4),%xmm7       ## z2 - - - 
-
-    unpcklps %xmm1,%xmm0 ## x1 x2 y1 y2
-    movhlps  %xmm0,%xmm1 ## y1 y2 -  -
-    unpcklps %xmm7,%xmm2 ## z1 z2 -  -
-
-        ## calc dr 
-        subps nb430_ix(%rsp),%xmm0
-        subps nb430_iy(%rsp),%xmm1
-        subps nb430_iz(%rsp),%xmm2
-
-        ## store dr 
-        movaps %xmm0,nb430_dx(%rsp)
-        movaps %xmm1,nb430_dy(%rsp)
-        movaps %xmm2,nb430_dz(%rsp)
-
-        ## square it 
-        mulps %xmm0,%xmm0
-        mulps %xmm1,%xmm1
-        mulps %xmm2,%xmm2
-        addps %xmm1,%xmm0
-        addps %xmm2,%xmm0
-    movaps %xmm0,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb430_three(%rsp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb430_half(%rsp),%xmm0
-        subps %xmm5,%xmm1       ## 30-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r
-        movaps %xmm4,nb430_r(%rsp)
-    movaps %xmm0,nb430_rinv(%rsp)
-
-    movaps %xmm4,%xmm8   ## r
-        mulps nb430_gbscale(%rsp),%xmm4   ## rgbtab
-    mulps nb430_tsc(%rsp),%xmm8      ## rtab
-
-    ## truncate and convert to integers
-    cvttps2dq %xmm4,%xmm5 ## gb
-    cvttps2dq %xmm8,%xmm9 ## lj
-
-    ## convert back to float
-    cvtdq2ps  %xmm5,%xmm6  ## gb
-    cvtdq2ps  %xmm9,%xmm10 ## lj
-
-    ## multiply by 4 and 8, respectively
-    pslld   $2,%xmm5  ## gb
-    pslld   $3,%xmm9  ## lj
-
-    ## move to integer registers
-    movd    %xmm5,%r12d      ## gb
-    movd    %xmm9,%r14d     ## lj
-    pshufd $1,%xmm5,%xmm5  ## gb
-    pshufd $1,%xmm9,%xmm9  ## lj
-    movd    %xmm5,%r13d      ## gb
-    movd    %xmm9,%r15d     ## lj
-    ## GB indices: r12-r13   LJ indices: r14-r15
-
-    ## calculate eps
-    subps     %xmm6,%xmm4  ## gb
-    subps     %xmm10,%xmm8 ## lj
-    movaps    %xmm4,nb430_epsgb(%rsp)   ## gb eps
-    movaps    %xmm8,nb430_eps(%rsp)   ## lj eps
-
-        movq nb430_GBtab(%rbp),%rsi
-        movq nb430_VFtab(%rbp),%rdi
-
-    ## load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11
-        movlps (%rsi,%r12,4),%xmm0       ## Y1c F1c
-        movlps (%rsi,%r13,4),%xmm1       ## Y2c F2c
-        movlps (%rdi,%r14,4),%xmm4       ## Y1d F1d  
-        movlps (%rdi,%r15,4),%xmm5       ## Y2d F2d
-        movlps 16(%rdi,%r14,4),%xmm8     ## Y1r F1r
-        movlps 16(%rdi,%r15,4),%xmm9     ## Y2r F2r
-
-    unpcklps %xmm1,%xmm0
-    movhlps  %xmm0,%xmm1
-    unpcklps %xmm5,%xmm4
-    movhlps  %xmm4,%xmm5
-    unpcklps %xmm9,%xmm8
-    movhlps  %xmm8,%xmm9
-        movlps 8(%rsi,%r12,4),%xmm2       ## G1c H1c
-        movlps 8(%rsi,%r13,4),%xmm3       ## G2c H2c
-        movlps 8(%rdi,%r14,4),%xmm6       ## G1d H1d  
-        movlps 8(%rdi,%r15,4),%xmm7       ## G2d H2d
-        movlps 24(%rdi,%r14,4),%xmm10     ## G1r H1r
-        movlps 24(%rdi,%r15,4),%xmm11     ## G2r H2r
-    unpcklps %xmm3,%xmm2
-    movhlps  %xmm2,%xmm3
-    unpcklps %xmm7,%xmm6
-    movhlps  %xmm6,%xmm7
-    unpcklps %xmm11,%xmm10
-    movhlps  %xmm10,%xmm11
-    ## table data ready. Coul in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11
-
-    movaps nb430_epsgb(%rsp),%xmm12
-    movaps nb430_eps(%rsp),%xmm13
-
-    mulps  %xmm12,%xmm3  ## Heps
-    mulps  %xmm13,%xmm7
-    mulps  %xmm13,%xmm11
-    mulps  %xmm12,%xmm2    ## Geps
-    mulps  %xmm13,%xmm6
-    mulps  %xmm13,%xmm10
-    mulps  %xmm12,%xmm3  ## Heps2
-    mulps  %xmm13,%xmm7
-    mulps  %xmm13,%xmm11
-
-    addps  %xmm2,%xmm1  ## F+Geps
-    addps  %xmm6,%xmm5
-    addps  %xmm10,%xmm9
-    addps  %xmm3,%xmm1  ## F+Geps+Heps2 = Fp
-    addps  %xmm7,%xmm5
-    addps  %xmm11,%xmm9
-    addps  %xmm3,%xmm3   ## 2*Heps2
-    addps  %xmm7,%xmm7
-    addps  %xmm11,%xmm11
-    addps  %xmm2,%xmm3   ## 2*Heps2+Geps
-    addps  %xmm6,%xmm7
-    addps  %xmm10,%xmm11
-    addps  %xmm1,%xmm3  ## FF = Fp + 2*Heps2 + Geps
-    addps  %xmm5,%xmm7
-    addps  %xmm9,%xmm11
-    mulps  %xmm12,%xmm1  ## eps*Fp
-    mulps  %xmm13,%xmm5
-    mulps  %xmm13,%xmm9
-    addps  %xmm0,%xmm1    ## VV
-    addps  %xmm4,%xmm5
-    addps  %xmm8,%xmm9
-    mulps  nb430_qq(%rsp),%xmm1     ## VV*qq = vcoul
-    mulps  nb430_c6(%rsp),%xmm5     ## vnb6
-    mulps  nb430_c12(%rsp),%xmm9     ## vnb12
-    mulps  nb430_qq(%rsp),%xmm3      ## FF*qq = fij
-    mulps  nb430_c6(%rsp),%xmm7     ## fijD
-    mulps  nb430_c12(%rsp),%xmm11     ##fijR
-
-    addps  %xmm7,%xmm11 ## fijD+fijR
-    mulps  nb430_tsc(%rsp),%xmm11   ## (fijD+fijR)*tabscale
-
-    ## accumulate Vvdwtot
-    addps  nb430_Vvdwtot(%rsp),%xmm5
-    addps  %xmm9,%xmm5
-    movlps %xmm5,nb430_Vvdwtot(%rsp)
-
-        movq nb430_dvda(%rbp),%rsi
-
-        ## Calculate dVda
-        mulps nb430_gbscale(%rsp),%xmm3     ## fijC=qq*FF*gbscale
-        movaps %xmm3,%xmm6
-        mulps  nb430_r(%rsp),%xmm6
-        addps  %xmm1,%xmm6  ## vcoul+fijC*r
-
-    addps  %xmm11,%xmm3 ## fijC+fijD+fijR
-
-    ## increment vctot
-        addps  nb430_vctot(%rsp),%xmm1
-    movlps %xmm1,nb430_vctot(%rsp)
-
-        ## xmm6=(vcoul+fijC*r)
-        xorps  %xmm7,%xmm7
-        subps  %xmm6,%xmm7
-        movaps %xmm7,%xmm6
-
-        ## update dvdasum 
-        addps  nb430_dvdasum(%rsp),%xmm7
-    movlps %xmm7,nb430_dvdasum(%rsp)
-
-        ## update j atoms dvdaj
-        movaps  %xmm6,%xmm5
-        shufps $0x1,%xmm5,%xmm5
-
-        ## xmm6=dvdaj1 xmm5=dvdaj2 
-        addss  (%rsi,%rax,4),%xmm6
-        addss  (%rsi,%rbx,4),%xmm5
-        movss  %xmm6,(%rsi,%rax,4)
-        movss  %xmm5,(%rsi,%rbx,4)
-
-        xorps  %xmm4,%xmm4
-        mulps nb430_rinv(%rsp),%xmm3
-        subps  %xmm3,%xmm4
-
-    movaps  %xmm4,%xmm9
-    movaps  %xmm4,%xmm10
-    movaps  %xmm4,%xmm11
-
-    mulps  nb430_dx(%rsp),%xmm9
-    mulps  nb430_dy(%rsp),%xmm10
-    mulps  nb430_dz(%rsp),%xmm11
-
-
-        ## accumulate i forces
-    movaps nb430_fix(%rsp),%xmm12
-    movaps nb430_fiy(%rsp),%xmm13
-    movaps nb430_fiz(%rsp),%xmm14
-    addps %xmm9,%xmm12
-    addps %xmm10,%xmm13
-    addps %xmm11,%xmm14
-    movlps %xmm12,nb430_fix(%rsp)
-    movlps %xmm13,nb430_fiy(%rsp)
-    movlps %xmm14,nb430_fiz(%rsp)
-
-        movq nb430_faction(%rbp),%rsi
-        ## the fj's - start by accumulating x & y forces from memory 
-        movlps (%rsi,%r8,4),%xmm0 ## x1 y1 - -
-        movhps (%rsi,%r9,4),%xmm0 ## x1 y1 x2 y2
-
-    unpcklps %xmm10,%xmm9 ## x1 y1 x2 y2
-    addps    %xmm9,%xmm0
-
-        movlps %xmm0,(%rsi,%r8,4)
-        movhps %xmm0,(%rsi,%r9,4)
-
-    ## z forces
-    pshufd $1,%xmm11,%xmm8
-    addss  8(%rsi,%r8,4),%xmm11
-    addss  8(%rsi,%r9,4),%xmm8
-    movss  %xmm11,8(%rsi,%r8,4)
-    movss  %xmm8,8(%rsi,%r9,4)
-
-_nb_kernel430_x86_64_sse.nb430_checksingle:     
-        movl  nb430_innerk(%rsp),%edx
-        andl  $1,%edx
-        jnz    _nb_kernel430_x86_64_sse.nb430_dosingle
-        jmp    _nb_kernel430_x86_64_sse.nb430_updateouterdata
-_nb_kernel430_x86_64_sse.nb430_dosingle: 
-        movq nb430_charge(%rbp),%rsi
-        movq nb430_invsqrta(%rbp),%rdx
-        movq nb430_pos(%rbp),%rdi
-        movq  nb430_innerjjnr(%rsp),%rcx
-        movl  (%rcx),%eax
-
-        ## load isaj
-        movq nb430_invsqrta(%rbp),%rsi
-        movss (%rsi,%rax,4),%xmm3
-        movaps nb430_isai(%rsp),%xmm2
-        mulss  %xmm3,%xmm2
-    movaps %xmm2,nb430_isaprod(%rsp)
-
-        movaps %xmm2,%xmm1
-        mulss nb430_gbtsc(%rsp),%xmm1
-        movaps %xmm1,nb430_gbscale(%rsp)
-
-        movq nb430_charge(%rbp),%rsi     ## base of charge[] 
-
-        movss (%rsi,%rax,4),%xmm3
-        mulss nb430_iq(%rsp),%xmm2
-        mulss  %xmm2,%xmm3
-        movaps %xmm3,nb430_qq(%rsp)
-
-    ## vdw parameters
-        movq nb430_type(%rbp),%rsi
-        movl (%rsi,%rax,4),%r12d
-        shll %r12d
-    movl nb430_ntia(%rsp),%edi
-        addl %edi,%r12d
-
-        movq nb430_vdwparam(%rbp),%rsi
-        movss (%rsi,%r12,4),%xmm0
-        movss 4(%rsi,%r12,4),%xmm3
-    movaps %xmm0,nb430_c6(%rsp)
-    movaps %xmm3,nb430_c12(%rsp)
-
-        movq nb430_pos(%rbp),%rsi        ## base of pos[] 
-
-        lea  (%rax,%rax,2),%r8     ## j3
-
-        ## move four coordinates to xmm0-xmm2   
-    movss  (%rsi,%r8,4),%xmm0
-    movss  4(%rsi,%r8,4),%xmm1
-    movss  8(%rsi,%r8,4),%xmm2
-
-        ## calc dr 
-        subss nb430_ix(%rsp),%xmm0
-        subss nb430_iy(%rsp),%xmm1
-        subss nb430_iz(%rsp),%xmm2
-
-        ## store dr 
-        movaps %xmm0,nb430_dx(%rsp)
-        movaps %xmm1,nb430_dy(%rsp)
-        movaps %xmm2,nb430_dz(%rsp)
-
-        ## square it 
-        mulss %xmm0,%xmm0
-        mulss %xmm1,%xmm1
-        mulss %xmm2,%xmm2
-        addss %xmm1,%xmm0
-        addss %xmm2,%xmm0
-    movaps %xmm0,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtss %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulss %xmm5,%xmm5
-        movaps nb430_three(%rsp),%xmm1
-        mulss %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb430_half(%rsp),%xmm0
-        subss %xmm5,%xmm1       ## 30-rsq*lu*lu 
-        mulss %xmm2,%xmm1
-        mulss %xmm1,%xmm0       ## xmm0=rinv 
-        mulss %xmm0,%xmm4       ## xmm4=r
-        movaps %xmm4,nb430_r(%rsp)
-    movaps %xmm0,nb430_rinv(%rsp)
-
-    movaps %xmm4,%xmm8   ## r
-        mulss nb430_gbscale(%rsp),%xmm4   ## rgbtab
-    mulss nb430_tsc(%rsp),%xmm8      ## rtab
-
-    ## truncate and convert to integers
-    cvttss2si %xmm4,%r12d ## gb
-    cvttss2si %xmm8,%r14d ## lj
-
-    ## convert back to float
-    cvtsi2ss  %r12d,%xmm6  ## gb
-    cvtsi2ss  %r14d,%xmm10 ## lj
-
-    ## multiply by 4 and 8, respectively
-    shll  $2,%r12d  ## gb
-    shll  $3,%r14d  ## lj
-
-    ## GB index: r12   LJ indices: r14
-
-    ## calculate eps
-    subss     %xmm6,%xmm4  ## gb
-    subss     %xmm10,%xmm8 ## lj
-    movaps    %xmm4,nb430_epsgb(%rsp)   ## gb eps
-    movaps    %xmm8,nb430_eps(%rsp)   ## lj eps
-
-        movq nb430_GBtab(%rbp),%rsi
-        movq nb430_VFtab(%rbp),%rdi
-
-    ## load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11
-    movss  (%rsi,%r12,4),%xmm0
-    movss  4(%rsi,%r12,4),%xmm1
-    movss  8(%rsi,%r12,4),%xmm2
-    movss  12(%rsi,%r12,4),%xmm3
-    movss  (%rdi,%r14,4),%xmm4
-    movss  4(%rdi,%r14,4),%xmm5
-    movss  8(%rdi,%r14,4),%xmm6
-    movss  12(%rdi,%r14,4),%xmm7
-    movss  16(%rdi,%r14,4),%xmm8
-    movss  20(%rdi,%r14,4),%xmm9
-    movss  24(%rdi,%r14,4),%xmm10
-    movss  28(%rdi,%r14,4),%xmm11
-    ## table data ready. Coul in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11
-
-    movaps nb430_epsgb(%rsp),%xmm12
-    movaps nb430_eps(%rsp),%xmm13
-
-    mulss  %xmm12,%xmm3  ## Heps
-    mulss  %xmm13,%xmm7
-    mulss  %xmm13,%xmm11
-    mulss  %xmm12,%xmm2    ## Geps
-    mulss  %xmm13,%xmm6
-    mulss  %xmm13,%xmm10
-    mulss  %xmm12,%xmm3  ## Heps2
-    mulss  %xmm13,%xmm7
-    mulss  %xmm13,%xmm11
-
-    addss  %xmm2,%xmm1  ## F+Geps
-    addss  %xmm6,%xmm5
-    addss  %xmm10,%xmm9
-    addss  %xmm3,%xmm1  ## F+Geps+Heps2 = Fp
-    addss  %xmm7,%xmm5
-    addss  %xmm11,%xmm9
-    addss  %xmm3,%xmm3   ## 2*Heps2
-    addss  %xmm7,%xmm7
-    addss  %xmm11,%xmm11
-    addss  %xmm2,%xmm3   ## 2*Heps2+Geps
-    addss  %xmm6,%xmm7
-    addss  %xmm10,%xmm11
-    addss  %xmm1,%xmm3  ## FF = Fp + 2*Heps2 + Geps
-    addss  %xmm5,%xmm7
-    addss  %xmm9,%xmm11
-    mulss  %xmm12,%xmm1  ## eps*Fp
-    mulss  %xmm13,%xmm5
-    mulss  %xmm13,%xmm9
-    addss  %xmm0,%xmm1    ## VV
-    addss  %xmm4,%xmm5
-    addss  %xmm8,%xmm9
-    mulss  nb430_qq(%rsp),%xmm1     ## VV*qq = vcoul
-    mulss  nb430_c6(%rsp),%xmm5     ## vnb6
-    mulss  nb430_c12(%rsp),%xmm9     ## vnb12
-    mulss  nb430_qq(%rsp),%xmm3      ## FF*qq = fij
-    mulss  nb430_c6(%rsp),%xmm7     ## fijD
-    mulss  nb430_c12(%rsp),%xmm11     ##fijR
-
-    addss  %xmm7,%xmm11 ## fijD+fijR
-    mulss  nb430_tsc(%rsp),%xmm11   ## (fijD+fijR)*tabscale
-
-    ## accumulate Vvdwtot
-    addss  nb430_Vvdwtot(%rsp),%xmm5
-    addss  %xmm9,%xmm5
-    movss %xmm5,nb430_Vvdwtot(%rsp)
-
-        movq nb430_dvda(%rbp),%rsi
-
-        ## Calculate dVda
-        mulss nb430_gbscale(%rsp),%xmm3     ## fijC=qq*FF*gbscale
-        movaps %xmm3,%xmm6
-        mulss  nb430_r(%rsp),%xmm6
-        addss  %xmm1,%xmm6  ## vcoul+fijC*r
-
-    addss  %xmm11,%xmm3 ## fijC+fijD+fijR
-
-    ## increment vctot
-        addss  nb430_vctot(%rsp),%xmm1
-    movss %xmm1,nb430_vctot(%rsp)
-
-        ## xmm6=(vcoul+fijC*r)
-        xorps  %xmm7,%xmm7
-        subss  %xmm6,%xmm7
-        movaps %xmm7,%xmm6
-
-        ## update dvdasum 
-        addss  nb430_dvdasum(%rsp),%xmm7
-    movss %xmm7,nb430_dvdasum(%rsp)
-
-        ## update j atoms dvdaj
-
-        ## xmm6=dvdaj1
-        addss  (%rsi,%rax,4),%xmm6
-        movss  %xmm6,(%rsi,%rax,4)
-
-        xorps  %xmm4,%xmm4
-        mulss nb430_rinv(%rsp),%xmm3
-        subss  %xmm3,%xmm4
-
-    movss  %xmm4,%xmm9
-    movss  %xmm4,%xmm10
-    movss  %xmm4,%xmm11
-
-    mulss  nb430_dx(%rsp),%xmm9
-    mulss  nb430_dy(%rsp),%xmm10
-    mulss  nb430_dz(%rsp),%xmm11
-
-        ## accumulate i forces
-    movaps nb430_fix(%rsp),%xmm12
-    movaps nb430_fiy(%rsp),%xmm13
-    movaps nb430_fiz(%rsp),%xmm14
-    addss %xmm9,%xmm12
-    addss %xmm10,%xmm13
-    addss %xmm11,%xmm14
-    movss %xmm12,nb430_fix(%rsp)
-    movss %xmm13,nb430_fiy(%rsp)
-    movss %xmm14,nb430_fiz(%rsp)
-
-        movq nb430_faction(%rbp),%rsi
-    ## add to j forces
-    addss  (%rsi,%r8,4),%xmm9
-    addss  4(%rsi,%r8,4),%xmm10
-    addss  8(%rsi,%r8,4),%xmm11
-    movss  %xmm9,(%rsi,%r8,4)
-    movss  %xmm10,4(%rsi,%r8,4)
-    movss  %xmm11,8(%rsi,%r8,4)
-
-_nb_kernel430_x86_64_sse.nb430_updateouterdata: 
-        movl  nb430_ii3(%rsp),%ecx
-        movq  nb430_faction(%rbp),%rdi
-        movq  nb430_fshift(%rbp),%rsi
-        movl  nb430_is3(%rsp),%edx
-
-        ## accumulate i forces in xmm0, xmm1, xmm2 
-        movaps nb430_fix(%rsp),%xmm0
-        movaps nb430_fiy(%rsp),%xmm1
-        movaps nb430_fiz(%rsp),%xmm2
-
-        movhlps %xmm0,%xmm3
-        movhlps %xmm1,%xmm4
-        movhlps %xmm2,%xmm5
-        addps  %xmm3,%xmm0
-        addps  %xmm4,%xmm1
-        addps  %xmm5,%xmm2 ## sum is in 1/2 in xmm0-xmm2 
-
-        movaps %xmm0,%xmm3
-        movaps %xmm1,%xmm4
-        movaps %xmm2,%xmm5
-
-        shufps $1,%xmm3,%xmm3
-        shufps $1,%xmm4,%xmm4
-        shufps $1,%xmm5,%xmm5
-        addss  %xmm3,%xmm0
-        addss  %xmm4,%xmm1
-        addss  %xmm5,%xmm2      ## xmm0-xmm2 has single force in pos0 
-
-        ## increment i force 
-        movss  (%rdi,%rcx,4),%xmm3
-        movss  4(%rdi,%rcx,4),%xmm4
-        movss  8(%rdi,%rcx,4),%xmm5
-        subss  %xmm0,%xmm3
-        subss  %xmm1,%xmm4
-        subss  %xmm2,%xmm5
-        movss  %xmm3,(%rdi,%rcx,4)
-        movss  %xmm4,4(%rdi,%rcx,4)
-        movss  %xmm5,8(%rdi,%rcx,4)
-
-        ## increment fshift force  
-        movss  (%rsi,%rdx,4),%xmm3
-        movss  4(%rsi,%rdx,4),%xmm4
-        movss  8(%rsi,%rdx,4),%xmm5
-        subss  %xmm0,%xmm3
-        subss  %xmm1,%xmm4
-        subss  %xmm2,%xmm5
-        movss  %xmm3,(%rsi,%rdx,4)
-        movss  %xmm4,4(%rsi,%rdx,4)
-        movss  %xmm5,8(%rsi,%rdx,4)
-
-        ## get n from stack
-        movl nb430_n(%rsp),%esi
-        ## get group index for i particle 
-        movq  nb430_gid(%rbp),%rdx              ## base of gid[]
-        movl  (%rdx,%rsi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total potential energy and update it 
-        movaps nb430_vctot(%rsp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        ## add earlier value from mem 
-        movq  nb430_Vc(%rbp),%rax
-        addss (%rax,%rdx,4),%xmm7
-        ## move back to mem 
-        movss %xmm7,(%rax,%rdx,4)
-
-        ## accumulate total lj energy and update it 
-        movaps nb430_Vvdwtot(%rsp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        ## add earlier value from mem 
-        movq  nb430_Vvdw(%rbp),%rax
-        addss (%rax,%rdx,4),%xmm7
-        ## move back to mem 
-        movss %xmm7,(%rax,%rdx,4)
-
-        ## accumulate dVda and update it 
-        movaps nb430_dvdasum(%rsp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        movl nb430_ii(%rsp),%edx
-        movq nb430_dvda(%rbp),%rax
-        addss (%rax,%rdx,4),%xmm7
-        movss %xmm7,(%rax,%rdx,4)
-
-        ## finish if last 
-        movl nb430_nn1(%rsp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx
-        jz _nb_kernel430_x86_64_sse.nb430_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb430_n(%rsp)
-        jmp _nb_kernel430_x86_64_sse.nb430_outer
-_nb_kernel430_x86_64_sse.nb430_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb430_nri(%rsp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx
-        jz _nb_kernel430_x86_64_sse.nb430_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel430_x86_64_sse.nb430_threadloop
-_nb_kernel430_x86_64_sse.nb430_end: 
-        movl nb430_nouter(%rsp),%eax
-        movl nb430_ninner(%rsp),%ebx
-        movq nb430_outeriter(%rbp),%rcx
-        movq nb430_inneriter(%rbp),%rdx
-        movl %eax,(%rcx)
-        movl %ebx,(%rdx)
-
-        addq $552,%rsp
-        emms
-
-
-        pop %r15
-        pop %r14
-        pop %r13
-        pop %r12
-
-        pop %rbx
-        pop    %rbp
-        ret
-
-
-
-
-
-.globl nb_kernel430nf_x86_64_sse
-.globl _nb_kernel430nf_x86_64_sse
-nb_kernel430nf_x86_64_sse:      
-_nb_kernel430nf_x86_64_sse:     
-##      Room for return address and rbp (16 bytes)
-.set nb430nf_fshift, 16
-.set nb430nf_gid, 24
-.set nb430nf_pos, 32
-.set nb430nf_faction, 40
-.set nb430nf_charge, 48
-.set nb430nf_p_facel, 56
-.set nb430nf_argkrf, 64
-.set nb430nf_argcrf, 72
-.set nb430nf_Vc, 80
-.set nb430nf_type, 88
-.set nb430nf_p_ntype, 96
-.set nb430nf_vdwparam, 104
-.set nb430nf_Vvdw, 112
-.set nb430nf_p_tabscale, 120
-.set nb430nf_VFtab, 128
-.set nb430nf_invsqrta, 136
-.set nb430nf_dvda, 144
-.set nb430nf_p_gbtabscale, 152
-.set nb430nf_GBtab, 160
-.set nb430nf_p_nthreads, 168
-.set nb430nf_count, 176
-.set nb430nf_mtx, 184
-.set nb430nf_outeriter, 192
-.set nb430nf_inneriter, 200
-.set nb430nf_work, 208
-        ## stack offsets for local variables  
-        ## bottom of stack is cache-aligned for sse use 
-.set nb430nf_ix, 0
-.set nb430nf_iy, 16
-.set nb430nf_iz, 32
-.set nb430nf_iq, 48
-.set nb430nf_gbtsc, 64
-.set nb430nf_tsc, 80
-.set nb430nf_qq, 96
-.set nb430nf_c6, 112
-.set nb430nf_c12, 128
-.set nb430nf_vctot, 144
-.set nb430nf_Vvdwtot, 160
-.set nb430nf_half, 176
-.set nb430nf_three, 192
-.set nb430nf_isai, 208
-.set nb430nf_isaprod, 224
-.set nb430nf_gbscale, 240
-.set nb430nf_r, 256
-.set nb430nf_nri, 272
-.set nb430nf_iinr, 280
-.set nb430nf_jindex, 288
-.set nb430nf_jjnr, 296
-.set nb430nf_shift, 304
-.set nb430nf_shiftvec, 312
-.set nb430nf_facel, 320
-.set nb430nf_innerjjnr, 328
-.set nb430nf_is3, 336
-.set nb430nf_ii3, 340
-.set nb430nf_ntia, 344
-.set nb430nf_innerk, 348
-.set nb430nf_n, 352
-.set nb430nf_nn1, 356
-.set nb430nf_ntype, 360
-.set nb430nf_nouter, 364
-.set nb430nf_ninner, 368
-
-        push %rbp
-        movq %rsp,%rbp
-        push %rbx
-
-
-        emms
-
-        push %r12
-        push %r13
-        push %r14
-        push %r15
-
-        subq $392,%rsp          ## local variable stack space (n*16+8)
-
-        ## zero 32-bit iteration counters
-        movl $0,%eax
-        movl %eax,nb430nf_nouter(%rsp)
-        movl %eax,nb430nf_ninner(%rsp)
-
-        movl (%rdi),%edi
-        movl %edi,nb430nf_nri(%rsp)
-        movq %rsi,nb430nf_iinr(%rsp)
-        movq %rdx,nb430nf_jindex(%rsp)
-        movq %rcx,nb430nf_jjnr(%rsp)
-        movq %r8,nb430nf_shift(%rsp)
-        movq %r9,nb430nf_shiftvec(%rsp)
-        movq nb430nf_p_ntype(%rbp),%rdi
-        movl (%rdi),%edi
-        movl %edi,nb430nf_ntype(%rsp)
-        movq nb430nf_p_facel(%rbp),%rsi
-        movss (%rsi),%xmm0
-        movss %xmm0,nb430nf_facel(%rsp)
-
-        movq nb430nf_p_tabscale(%rbp),%rax
-        movss (%rax),%xmm3
-        shufps $0,%xmm3,%xmm3
-        movaps %xmm3,nb430nf_tsc(%rsp)
-
-        movq nb430nf_p_gbtabscale(%rbp),%rbx
-        movss (%rbx),%xmm4
-        shufps $0,%xmm4,%xmm4
-        movaps %xmm4,nb430nf_gbtsc(%rsp)
-
-        ## create constant floating-point factors on stack
-        movl $0x3f000000,%eax   ## half in IEEE (hex)
-        movl %eax,nb430nf_half(%rsp)
-        movss nb430nf_half(%rsp),%xmm1
-        shufps $0,%xmm1,%xmm1  ## splat to all elements
-        movaps %xmm1,%xmm2
-        addps  %xmm2,%xmm2      ## one
-        movaps %xmm2,%xmm3
-        addps  %xmm2,%xmm2      ## two
-        addps  %xmm2,%xmm3      ## three
-        movaps %xmm1,nb430nf_half(%rsp)
-        movaps %xmm3,nb430nf_three(%rsp)
-
-_nb_kernel430nf_x86_64_sse.nb430nf_threadloop: 
-        movq  nb430nf_count(%rbp),%rsi            ## pointer to sync counter
-        movl  (%rsi),%eax
-_nb_kernel430nf_x86_64_sse.nb430nf_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+10
-        lock 
-        cmpxchgl %ebx,(%rsi)                    ## write nn1 to *counter,
-                                                ## if it hasnt changed.
-                                                ## or reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel430nf_x86_64_sse.nb430nf_spinlock
-
-        ## if(nn1>nri) nn1=nri
-        movl nb430nf_nri(%rsp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx
-        cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb430nf_n(%rsp)
-        movl %ebx,nb430nf_nn1(%rsp)
-        subl %eax,%ebx                          ## calc number of outer lists
-        movl %eax,%esi                          ## copy n to esi
-        jg  _nb_kernel430nf_x86_64_sse.nb430nf_outerstart
-        jmp _nb_kernel430nf_x86_64_sse.nb430nf_end
-
-_nb_kernel430nf_x86_64_sse.nb430nf_outerstart: 
-        ## ebx contains number of outer iterations
-        addl nb430nf_nouter(%rsp),%ebx
-        movl %ebx,nb430nf_nouter(%rsp)
-
-_nb_kernel430nf_x86_64_sse.nb430nf_outer: 
-        movq  nb430nf_shift(%rsp),%rax        ## rax = pointer into shift[] 
-        movl  (%rax,%rsi,4),%ebx                ## ebx=shift[n] 
-
-        lea  (%rbx,%rbx,2),%rbx    ## rbx=3*is 
-        movl  %ebx,nb430nf_is3(%rsp)            ## store is3 
-
-        movq  nb430nf_shiftvec(%rsp),%rax     ## rax = base of shiftvec[] 
-
-        movss (%rax,%rbx,4),%xmm0
-        movss 4(%rax,%rbx,4),%xmm1
-        movss 8(%rax,%rbx,4),%xmm2
-
-        movq  nb430nf_iinr(%rsp),%rcx         ## rcx = pointer into iinr[]      
-        movl  (%rcx,%rsi,4),%ebx            ## ebx =ii 
-
-        movq  nb430nf_charge(%rbp),%rdx
-        movss (%rdx,%rbx,4),%xmm3
-        mulss nb430nf_facel(%rsp),%xmm3
-        shufps $0,%xmm3,%xmm3
-
-        movq  nb430nf_invsqrta(%rbp),%rdx       ## load invsqrta[ii]
-        movss (%rdx,%rbx,4),%xmm4
-        shufps $0,%xmm4,%xmm4
-
-        movq  nb430nf_type(%rbp),%rdx
-        movl  (%rdx,%rbx,4),%edx
-        imull nb430nf_ntype(%rsp),%edx
-        shll  %edx
-        movl  %edx,nb430nf_ntia(%rsp)
-
-        lea  (%rbx,%rbx,2),%rbx        ## rbx = 3*ii=ii3 
-        movq  nb430nf_pos(%rbp),%rax      ## rax = base of pos[]  
-
-        addss (%rax,%rbx,4),%xmm0
-        addss 4(%rax,%rbx,4),%xmm1
-        addss 8(%rax,%rbx,4),%xmm2
-
-        movaps %xmm3,nb430nf_iq(%rsp)
-        movaps %xmm4,nb430nf_isai(%rsp)
-
-        shufps $0,%xmm0,%xmm0
-        shufps $0,%xmm1,%xmm1
-        shufps $0,%xmm2,%xmm2
-
-        movaps %xmm0,nb430nf_ix(%rsp)
-        movaps %xmm1,nb430nf_iy(%rsp)
-        movaps %xmm2,nb430nf_iz(%rsp)
-
-        movl  %ebx,nb430nf_ii3(%rsp)
-
-        ## clear vctot 
-        xorps %xmm4,%xmm4
-        movaps %xmm4,nb430nf_vctot(%rsp)
-        movaps %xmm4,nb430nf_Vvdwtot(%rsp)
-
-        movq  nb430nf_jindex(%rsp),%rax
-        movl  (%rax,%rsi,4),%ecx             ## jindex[n] 
-        movl  4(%rax,%rsi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movq  nb430nf_pos(%rbp),%rsi
-        movq  nb430nf_faction(%rbp),%rdi
-        movq  nb430nf_jjnr(%rsp),%rax
-        shll  $2,%ecx
-        addq  %rcx,%rax
-        movq  %rax,nb430nf_innerjjnr(%rsp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $4,%edx
-        addl  nb430nf_ninner(%rsp),%ecx
-        movl  %ecx,nb430nf_ninner(%rsp)
-        addl  $0,%edx
-        movl  %edx,nb430nf_innerk(%rsp)      ## number of innerloop atoms 
-        jge   _nb_kernel430nf_x86_64_sse.nb430nf_unroll_loop
-        jmp   _nb_kernel430nf_x86_64_sse.nb430nf_finish_inner
-_nb_kernel430nf_x86_64_sse.nb430nf_unroll_loop: 
-        ## quad-unroll innerloop here 
-        movq  nb430nf_innerjjnr(%rsp),%rdx       ## pointer to jjnr[k] 
-        movl  (%rdx),%eax
-        movl  4(%rdx),%ebx
-        movl  8(%rdx),%ecx
-        movl  12(%rdx),%edx           ## eax-edx=jnr1-4 
-        addq $16,nb430nf_innerjjnr(%rsp)             ## advance pointer (unrolled 4) 
-
-        ## load isa2
-        movq nb430nf_invsqrta(%rbp),%rsi
-        movss (%rsi,%rax,4),%xmm3
-        movss (%rsi,%rcx,4),%xmm4
-        movss (%rsi,%rbx,4),%xmm6
-        movss (%rsi,%rdx,4),%xmm7
-        movaps nb430nf_isai(%rsp),%xmm2
-        shufps $0,%xmm6,%xmm3
-        shufps $0,%xmm7,%xmm4
-        shufps $136,%xmm4,%xmm3 ## 10001000 ;# all charges in xmm3  
-        mulps  %xmm3,%xmm2
-
-        movaps %xmm2,nb430nf_isaprod(%rsp)
-        movaps %xmm2,%xmm1
-        mulps nb430nf_gbtsc(%rsp),%xmm1
-        movaps %xmm1,nb430nf_gbscale(%rsp)
-
-        movq nb430nf_charge(%rbp),%rsi     ## base of charge[] 
-
-        movss (%rsi,%rax,4),%xmm3
-        movss (%rsi,%rcx,4),%xmm4
-        movss (%rsi,%rbx,4),%xmm6
-        movss (%rsi,%rdx,4),%xmm7
-
-        mulps nb430nf_iq(%rsp),%xmm2
-        shufps $0,%xmm6,%xmm3
-        shufps $0,%xmm7,%xmm4
-        shufps $136,%xmm4,%xmm3 ## 10001000 ;# all charges in xmm3  
-        mulps  %xmm2,%xmm3
-        movaps %xmm3,nb430nf_qq(%rsp)
-
-        movd  %eax,%mm0         ## use mmx registers as temp storage 
-        movd  %ebx,%mm1
-        movd  %ecx,%mm2
-        movd  %edx,%mm3
-
-        movq nb430nf_type(%rbp),%rsi
-        movl (%rsi,%rax,4),%eax
-        movl (%rsi,%rbx,4),%ebx
-        movl (%rsi,%rcx,4),%ecx
-        movl (%rsi,%rdx,4),%edx
-        movq nb430nf_vdwparam(%rbp),%rsi
-        shll %eax
-        shll %ebx
-        shll %ecx
-        shll %edx
-        movl nb430nf_ntia(%rsp),%edi
-        addl %edi,%eax
-        addl %edi,%ebx
-        addl %edi,%ecx
-        addl %edi,%edx
-
-        movlps (%rsi,%rax,4),%xmm6
-        movlps (%rsi,%rcx,4),%xmm7
-        movhps (%rsi,%rbx,4),%xmm6
-        movhps (%rsi,%rdx,4),%xmm7
-
-        movaps %xmm6,%xmm4
-        shufps $136,%xmm7,%xmm4 ## 10001000
-        shufps $221,%xmm7,%xmm6 ## 11011101
-
-        movd  %mm0,%eax
-        movd  %mm1,%ebx
-        movd  %mm2,%ecx
-        movd  %mm3,%edx
-
-        movaps %xmm4,nb430nf_c6(%rsp)
-        movaps %xmm6,nb430nf_c12(%rsp)
-
-        movq nb430nf_pos(%rbp),%rsi        ## base of pos[] 
-
-        lea  (%rax,%rax,2),%rax     ## replace jnr with j3 
-        lea  (%rbx,%rbx,2),%rbx
-
-        lea  (%rcx,%rcx,2),%rcx     ## replace jnr with j3 
-        lea  (%rdx,%rdx,2),%rdx
-
-        ## move four coordinates to xmm0-xmm2   
-
-        movlps (%rsi,%rax,4),%xmm4
-        movlps (%rsi,%rcx,4),%xmm5
-        movss 8(%rsi,%rax,4),%xmm2
-        movss 8(%rsi,%rcx,4),%xmm6
-
-        movhps (%rsi,%rbx,4),%xmm4
-        movhps (%rsi,%rdx,4),%xmm5
-
-        movss 8(%rsi,%rbx,4),%xmm0
-        movss 8(%rsi,%rdx,4),%xmm1
-
-        shufps $0,%xmm0,%xmm2
-        shufps $0,%xmm1,%xmm6
-
-        movaps %xmm4,%xmm0
-        movaps %xmm4,%xmm1
-
-        shufps $136,%xmm6,%xmm2 ## 10001000
-
-        shufps $136,%xmm5,%xmm0 ## 10001000
-        shufps $221,%xmm5,%xmm1 ## 11011101             
-
-        ## move ix-iz to xmm4-xmm6 
-        movaps nb430nf_ix(%rsp),%xmm4
-        movaps nb430nf_iy(%rsp),%xmm5
-        movaps nb430nf_iz(%rsp),%xmm6
-
-        ## calc dr 
-        subps %xmm0,%xmm4
-        subps %xmm1,%xmm5
-        subps %xmm2,%xmm6
-
-        ## square it 
-        mulps %xmm4,%xmm4
-        mulps %xmm5,%xmm5
-        mulps %xmm6,%xmm6
-        addps %xmm5,%xmm4
-        addps %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb430nf_three(%rsp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb430nf_half(%rsp),%xmm0
-        subps %xmm5,%xmm1       ## 30-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r
-        movaps %xmm4,nb430nf_r(%rsp)
-        mulps nb430nf_gbscale(%rsp),%xmm4
-
-        movhlps %xmm4,%xmm5
-        cvttps2pi %xmm4,%mm6
-        cvttps2pi %xmm5,%mm7    ## mm6/mm7 contain lu indices 
-        cvtpi2ps %mm6,%xmm6
-        cvtpi2ps %mm7,%xmm5
-        movlhps %xmm5,%xmm6
-        subps %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-        pslld $2,%mm6
-        pslld $2,%mm7
-
-        movd %eax,%mm0
-        movd %ebx,%mm1
-        movd %ecx,%mm2
-        movd %edx,%mm3
-
-        movq nb430nf_GBtab(%rbp),%rsi
-        movd %mm6,%eax
-        psrlq $32,%mm6
-        movd %mm7,%ecx
-        psrlq $32,%mm7
-        movd %mm6,%ebx
-        movd %mm7,%edx
-
-        ## load coulomb table
-        movaps (%rsi,%rax,4),%xmm4
-        movaps (%rsi,%rbx,4),%xmm5
-        movaps (%rsi,%rcx,4),%xmm6
-        movaps (%rsi,%rdx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm6,%xmm3
-        shufps $0xEE,%xmm7,%xmm3
-        shufps $0x44,%xmm7,%xmm6
-        movaps %xmm4,%xmm7
-        shufps $0xEE,%xmm5,%xmm7
-        shufps $0x44,%xmm5,%xmm4
-        movaps %xmm4,%xmm5
-        shufps $0xDD,%xmm6,%xmm5
-        shufps $0x88,%xmm6,%xmm4
-        movaps %xmm7,%xmm6
-        shufps $0x88,%xmm3,%xmm6
-        shufps $0xDD,%xmm3,%xmm7
-        ## coulomb table ready, in xmm4-xmm7            
-
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        movaps nb430nf_qq(%rsp),%xmm3
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  %xmm3,%xmm5 ## vcoul=qq*VV  
-        addps  nb430nf_vctot(%rsp),%xmm5
-        movaps %xmm5,nb430nf_vctot(%rsp)
-
-
-        movaps nb430nf_r(%rsp),%xmm4
-        mulps nb430nf_tsc(%rsp),%xmm4
-
-        movhlps %xmm4,%xmm5
-        cvttps2pi %xmm4,%mm6
-        cvttps2pi %xmm5,%mm7    ## mm6/mm7 contain lu indices 
-        cvtpi2ps %mm6,%xmm6
-        cvtpi2ps %mm7,%xmm5
-        movlhps %xmm5,%xmm6
-        subps %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-        pslld $3,%mm6
-        pslld $3,%mm7
-
-        movq nb430nf_VFtab(%rbp),%rsi
-        movd %mm6,%eax
-        psrlq $32,%mm6
-        movd %mm7,%ecx
-        psrlq $32,%mm7
-        movd %mm6,%ebx
-        movd %mm7,%edx
-
-        ## dispersion 
-        movaps (%rsi,%rax,4),%xmm4
-        movaps (%rsi,%rbx,4),%xmm5
-        movaps (%rsi,%rcx,4),%xmm6
-        movaps (%rsi,%rdx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm6,%xmm3
-        shufps $0xEE,%xmm7,%xmm3
-        shufps $0x44,%xmm7,%xmm6
-        movaps %xmm4,%xmm7
-        shufps $0xEE,%xmm5,%xmm7
-        shufps $0x44,%xmm5,%xmm4
-        movaps %xmm4,%xmm5
-        shufps $0xDD,%xmm6,%xmm5
-        shufps $0x88,%xmm6,%xmm4
-        movaps %xmm7,%xmm6
-        shufps $0x88,%xmm3,%xmm6
-        shufps $0xDD,%xmm3,%xmm7
-        ## dispersion table ready, in xmm4-xmm7         
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  nb430nf_c6(%rsp),%xmm5    ## Vvdw6
-        addps  nb430nf_Vvdwtot(%rsp),%xmm5
-        movaps %xmm5,nb430nf_Vvdwtot(%rsp)
-
-        ## repulsion 
-        movaps 16(%rsi,%rax,4),%xmm4
-        movaps 16(%rsi,%rbx,4),%xmm5
-        movaps 16(%rsi,%rcx,4),%xmm6
-        movaps 16(%rsi,%rdx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm6,%xmm3
-        shufps $0xEE,%xmm7,%xmm3
-        shufps $0x44,%xmm7,%xmm6
-        movaps %xmm4,%xmm7
-        shufps $0xEE,%xmm5,%xmm7
-        shufps $0x44,%xmm5,%xmm4
-        movaps %xmm4,%xmm5
-        shufps $0xDD,%xmm6,%xmm5
-        shufps $0x88,%xmm6,%xmm4
-        movaps %xmm7,%xmm6
-        shufps $0x88,%xmm3,%xmm6
-        shufps $0xDD,%xmm3,%xmm7
-        ## table ready, in xmm4-xmm7    
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-
-        mulps  nb430nf_c12(%rsp),%xmm5   ## Vvdw12
-        addps  nb430nf_Vvdwtot(%rsp),%xmm5
-        movaps %xmm5,nb430nf_Vvdwtot(%rsp)
-
-        ## should we do one more iteration? 
-        subl $4,nb430nf_innerk(%rsp)
-        jl    _nb_kernel430nf_x86_64_sse.nb430nf_finish_inner
-        jmp   _nb_kernel430nf_x86_64_sse.nb430nf_unroll_loop
-_nb_kernel430nf_x86_64_sse.nb430nf_finish_inner: 
-        ## check if at least two particles remain 
-        addl $4,nb430nf_innerk(%rsp)
-        movl  nb430nf_innerk(%rsp),%edx
-        andl  $2,%edx
-        jnz   _nb_kernel430nf_x86_64_sse.nb430nf_dopair
-        jmp   _nb_kernel430nf_x86_64_sse.nb430nf_checksingle
-_nb_kernel430nf_x86_64_sse.nb430nf_dopair: 
-
-        movq  nb430nf_innerjjnr(%rsp),%rcx
-
-        movl  (%rcx),%eax
-        movl  4(%rcx),%ebx
-        addq $8,nb430nf_innerjjnr(%rsp)
-
-        xorps %xmm2,%xmm2
-        movaps %xmm2,%xmm6
-
-        ## load isa2
-        movq nb430nf_invsqrta(%rbp),%rsi
-        movss (%rsi,%rax,4),%xmm2
-        movss (%rsi,%rbx,4),%xmm3
-        unpcklps %xmm3,%xmm2    ## isa2 in xmm3(0,1)
-        mulps  nb430nf_isai(%rsp),%xmm2
-        movaps %xmm2,nb430nf_isaprod(%rsp)
-        movaps %xmm2,%xmm1
-        mulps nb430nf_gbtsc(%rsp),%xmm1
-        movaps %xmm1,nb430nf_gbscale(%rsp)
-
-        movq nb430nf_charge(%rbp),%rsi     ## base of charge[]  
-        movss (%rsi,%rax,4),%xmm3
-        movss (%rsi,%rbx,4),%xmm6
-        unpcklps %xmm6,%xmm3 ## 00001000 ;# xmm3(0,1) has the charges 
-
-        mulps  nb430nf_iq(%rsp),%xmm2
-        mulps  %xmm2,%xmm3
-        movaps %xmm3,nb430nf_qq(%rsp)
-
-        movq nb430nf_type(%rbp),%rsi
-        movl  %eax,%ecx
-        movl  %ebx,%edx
-        movl (%rsi,%rcx,4),%ecx
-        movl (%rsi,%rdx,4),%edx
-        movq nb430nf_vdwparam(%rbp),%rsi
-        shll %ecx
-        shll %edx
-        movl nb430nf_ntia(%rsp),%edi
-        addl %edi,%ecx
-        addl %edi,%edx
-        movlps (%rsi,%rcx,4),%xmm6
-        movhps (%rsi,%rdx,4),%xmm6
-        movq nb430nf_pos(%rbp),%rdi
-
-        movaps %xmm6,%xmm4
-        shufps $8,%xmm4,%xmm4 ## 00001000        
-        shufps $13,%xmm6,%xmm6 ## 00001101
-        movlhps %xmm7,%xmm4
-        movlhps %xmm7,%xmm6
-
-        movaps %xmm4,nb430nf_c6(%rsp)
-        movaps %xmm6,nb430nf_c12(%rsp)
-
-        lea  (%rax,%rax,2),%rax
-        lea  (%rbx,%rbx,2),%rbx
-        ## move coordinates to xmm0-xmm2 
-        movlps (%rdi,%rax,4),%xmm1
-        movss 8(%rdi,%rax,4),%xmm2
-        movhps (%rdi,%rbx,4),%xmm1
-        movss 8(%rdi,%rbx,4),%xmm0
-
-        movlhps %xmm7,%xmm3
-
-        shufps $0,%xmm0,%xmm2
-
-        movaps %xmm1,%xmm0
-
-        shufps $136,%xmm2,%xmm2 ## 10001000
-
-        shufps $136,%xmm0,%xmm0 ## 10001000
-        shufps $221,%xmm1,%xmm1 ## 11011101
-
-        movq   nb430nf_faction(%rbp),%rdi
-        ## move ix-iz to xmm4-xmm6 
-        xorps   %xmm7,%xmm7
-
-        movaps nb430nf_ix(%rsp),%xmm4
-        movaps nb430nf_iy(%rsp),%xmm5
-        movaps nb430nf_iz(%rsp),%xmm6
-
-        ## calc dr 
-        subps %xmm0,%xmm4
-        subps %xmm1,%xmm5
-        subps %xmm2,%xmm6
-
-        ## square it 
-        mulps %xmm4,%xmm4
-        mulps %xmm5,%xmm5
-        mulps %xmm6,%xmm6
-        addps %xmm5,%xmm4
-        addps %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtps %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulps %xmm5,%xmm5
-        movaps nb430nf_three(%rsp),%xmm1
-        mulps %xmm4,%xmm5       ## rsq*lu*lu                    
-        movaps nb430nf_half(%rsp),%xmm0
-        subps %xmm5,%xmm1       ## 30-rsq*lu*lu 
-        mulps %xmm2,%xmm1
-        mulps %xmm1,%xmm0       ## xmm0=rinv 
-        mulps %xmm0,%xmm4       ## xmm4=r 
-        movaps %xmm4,nb430nf_r(%rsp)
-        mulps nb430nf_gbscale(%rsp),%xmm4
-
-        cvttps2pi %xmm4,%mm6    ## mm6 contain lu indices 
-        cvtpi2ps %mm6,%xmm6
-        subps %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-
-        pslld $2,%mm6
-
-        movq nb430nf_GBtab(%rbp),%rsi
-        movd %mm6,%ecx
-        psrlq $32,%mm6
-        movd %mm6,%edx
-
-        ## load coulomb table
-        movaps (%rsi,%rcx,4),%xmm4
-        movaps (%rsi,%rdx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm4,%xmm6
-        unpcklps %xmm7,%xmm4    ## Y1 Y2 F1 F2 
-        unpckhps %xmm7,%xmm6    ## G1 G2 H1 H2
-        movhlps  %xmm4,%xmm5    ## F1 F2 
-        movhlps  %xmm6,%xmm7    ## H1 H2
-        ## coulomb table ready, in xmm4-xmm7    
-
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        movaps nb430nf_qq(%rsp),%xmm3
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-        mulps  %xmm3,%xmm5 ## vcoul=qq*VV  
-        addps  nb430nf_vctot(%rsp),%xmm5
-        movaps %xmm5,nb430nf_vctot(%rsp)
-
-        movaps nb430nf_r(%rsp),%xmm4
-        mulps nb430nf_tsc(%rsp),%xmm4
-
-        cvttps2pi %xmm4,%mm6
-        cvtpi2ps %mm6,%xmm6
-        subps %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulps  %xmm2,%xmm2      ## xmm2=eps2 
-        pslld $3,%mm6
-
-        movq nb430nf_VFtab(%rbp),%rsi
-        movd %mm6,%ecx
-        psrlq $32,%mm6
-        movd %mm6,%edx
-
-        ## dispersion 
-        movaps (%rsi,%rcx,4),%xmm4
-        movaps (%rsi,%rdx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm4,%xmm6
-        unpcklps %xmm7,%xmm4    ## Y1 Y2 F1 F2 
-        unpckhps %xmm7,%xmm6    ## G1 G2 H1 H2
-        movhlps  %xmm4,%xmm5    ## F1 F2 
-        movhlps  %xmm6,%xmm7    ## H1 H2
-        ## dispersion table ready, in xmm4-xmm7         
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-
-        mulps  nb430nf_c6(%rsp),%xmm5    ## Vvdw6 
-        addps  nb430nf_Vvdwtot(%rsp),%xmm5
-        movaps %xmm5,nb430nf_Vvdwtot(%rsp)
-
-        ## repulsion 
-        movaps 16(%rsi,%rcx,4),%xmm4
-        movaps 16(%rsi,%rdx,4),%xmm7
-        ## transpose, using xmm3 for scratch
-        movaps %xmm4,%xmm6
-        unpcklps %xmm7,%xmm4    ## Y1 Y2 F1 F2 
-        unpckhps %xmm7,%xmm6    ## G1 G2 H1 H2
-        movhlps  %xmm4,%xmm5    ## F1 F2 
-        movhlps  %xmm6,%xmm7    ## H1 H2
-        ## table ready, in xmm4-xmm7    
-        mulps  %xmm1,%xmm6      ## xmm6=Geps 
-        mulps  %xmm2,%xmm7      ## xmm7=Heps2 
-        addps  %xmm6,%xmm5
-        addps  %xmm7,%xmm5      ## xmm5=Fp      
-        mulps  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addps  %xmm4,%xmm5 ## xmm5=VV 
-
-        mulps  nb430nf_c12(%rsp),%xmm5   ## Vvdw12 
-
-        addps  nb430nf_Vvdwtot(%rsp),%xmm5
-        movaps %xmm5,nb430nf_Vvdwtot(%rsp)
-_nb_kernel430nf_x86_64_sse.nb430nf_checksingle: 
-        movl  nb430nf_innerk(%rsp),%edx
-        andl  $1,%edx
-        jnz    _nb_kernel430nf_x86_64_sse.nb430nf_dosingle
-        jmp    _nb_kernel430nf_x86_64_sse.nb430nf_updateouterdata
-_nb_kernel430nf_x86_64_sse.nb430nf_dosingle: 
-        movq nb430nf_charge(%rbp),%rsi
-        movq nb430nf_invsqrta(%rbp),%rdx
-        movq nb430nf_pos(%rbp),%rdi
-        movq  nb430nf_innerjjnr(%rsp),%rcx
-        movl  (%rcx),%eax
-        xorps  %xmm2,%xmm2
-        movaps %xmm2,%xmm6
-        movss (%rdx,%rax,4),%xmm2       ## isa2
-        mulss nb430nf_isai(%rsp),%xmm2
-        movss %xmm2,nb430nf_isaprod(%rsp)
-        movss %xmm2,%xmm1
-        mulss nb430nf_gbtsc(%rsp),%xmm1
-        movss %xmm1,nb430nf_gbscale(%rsp)
-
-        mulss  nb430nf_iq(%rsp),%xmm2
-        movss (%rsi,%rax,4),%xmm6       ## xmm6(0) has the charge       
-        mulss  %xmm2,%xmm6
-        movss %xmm6,nb430nf_qq(%rsp)
-
-        movq nb430nf_type(%rbp),%rsi
-        movl %eax,%ecx
-        movl (%rsi,%rcx,4),%ecx
-        movq nb430nf_vdwparam(%rbp),%rsi
-        shll %ecx
-        addl nb430nf_ntia(%rsp),%ecx
-        movlps (%rsi,%rcx,4),%xmm6
-        movaps %xmm6,%xmm4
-        shufps $252,%xmm4,%xmm4 ## 11111100     
-        shufps $253,%xmm6,%xmm6 ## 11111101     
-
-        movss %xmm4,nb430nf_c6(%rsp)
-        movss %xmm6,nb430nf_c12(%rsp)
-
-        lea  (%rax,%rax,2),%rax
-
-        ## move coordinates to xmm0-xmm2 
-        movss (%rdi,%rax,4),%xmm0
-        movss 4(%rdi,%rax,4),%xmm1
-        movss 8(%rdi,%rax,4),%xmm2
-
-        movss nb430nf_ix(%rsp),%xmm4
-        movss nb430nf_iy(%rsp),%xmm5
-        movss nb430nf_iz(%rsp),%xmm6
-
-        ## calc dr 
-        subss %xmm0,%xmm4
-        subss %xmm1,%xmm5
-        subss %xmm2,%xmm6
-
-        ## square it 
-        mulss %xmm4,%xmm4
-        mulss %xmm5,%xmm5
-        mulss %xmm6,%xmm6
-        addss %xmm5,%xmm4
-        addss %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        rsqrtss %xmm4,%xmm5
-        ## lookup seed in xmm5 
-        movaps %xmm5,%xmm2
-        mulss %xmm5,%xmm5
-        movss nb430nf_three(%rsp),%xmm1
-        mulss %xmm4,%xmm5       ## rsq*lu*lu                    
-        movss nb430nf_half(%rsp),%xmm0
-        subss %xmm5,%xmm1       ## 30-rsq*lu*lu 
-        mulss %xmm2,%xmm1
-        mulss %xmm1,%xmm0       ## xmm0=rinv 
-
-        mulss %xmm0,%xmm4       ## xmm4=r 
-        movaps %xmm4,nb430nf_r(%rsp)
-        mulss nb430nf_gbscale(%rsp),%xmm4
-
-        cvttss2si %xmm4,%ebx    ## mm6 contain lu indices 
-        cvtsi2ss %ebx,%xmm6
-        subss %xmm6,%xmm4
-        movaps %xmm4,%xmm1      ## xmm1=eps 
-        movaps %xmm1,%xmm2
-        mulss  %xmm2,%xmm2      ## xmm2=eps2 
-
-        shll $2,%ebx
-
-        movq nb430nf_GBtab(%rbp),%rsi
-
-        movaps (%rsi,%rbx,4),%xmm4
-        movhlps %xmm4,%xmm6
-        movaps %xmm4,%xmm5
-        movaps %xmm6,%xmm7
-        shufps $1,%xmm5,%xmm5
-        shufps $1,%xmm7,%xmm7
-        ## table ready in xmm4-xmm7 
-
-        mulss  %xmm1,%xmm6      ## xmm6=Geps 
-        mulss  %xmm2,%xmm7      ## xmm7=Heps2 
-        addss  %xmm6,%xmm5
-        addss  %xmm7,%xmm5      ## xmm5=Fp      
-        movss nb430nf_qq(%rsp),%xmm3
-        mulss  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addss  %xmm4,%xmm5 ## xmm5=VV 
-        mulss  %xmm3,%xmm5 ## vcoul=qq*VV  
-        addss  nb430nf_vctot(%rsp),%xmm5
-        movss %xmm5,nb430nf_vctot(%rsp)
-
-        movss nb430nf_r(%rsp),%xmm4
-        mulps nb430nf_tsc(%rsp),%xmm4
-
-        cvttss2si %xmm4,%ebx
-        cvtsi2ss %ebx,%xmm6
-        subss %xmm6,%xmm4
-        movss %xmm4,%xmm1       ## xmm1=eps 
-        movss %xmm1,%xmm2
-        mulss  %xmm2,%xmm2      ## xmm2=eps2 
-
-        shll $3,%ebx
-        movq nb430nf_VFtab(%rbp),%rsi
-
-        ## dispersion 
-        movaps (%rsi,%rbx,4),%xmm4
-        movhlps %xmm4,%xmm6
-        movaps %xmm4,%xmm5
-        movaps %xmm6,%xmm7
-        shufps $1,%xmm5,%xmm5
-        shufps $1,%xmm7,%xmm7
-        ## table ready in xmm4-xmm7 
-
-        mulss  %xmm1,%xmm6      ## xmm6=Geps 
-        mulss  %xmm2,%xmm7      ## xmm7=Heps2 
-        addss  %xmm6,%xmm5
-        addss  %xmm7,%xmm5      ## xmm5=Fp      
-        mulss  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addss  %xmm4,%xmm5 ## xmm5=VV 
-        mulss  nb430nf_c6(%rsp),%xmm5    ## Vvdw6
-        addss  nb430nf_Vvdwtot(%rsp),%xmm5
-        movss %xmm5,nb430nf_Vvdwtot(%rsp)
-
-        ## repulsion 
-        movaps 16(%rsi,%rbx,4),%xmm4
-        movhlps %xmm4,%xmm6
-        movaps %xmm4,%xmm5
-        movaps %xmm6,%xmm7
-        shufps $1,%xmm5,%xmm5
-        shufps $1,%xmm7,%xmm7
-        ## table ready in xmm4-xmm7 
-
-        mulss  %xmm1,%xmm6      ## xmm6=Geps 
-        mulss  %xmm2,%xmm7      ## xmm7=Heps2 
-        addss  %xmm6,%xmm5
-        addss  %xmm7,%xmm5      ## xmm5=Fp      
-        mulss  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addss  %xmm4,%xmm5 ## xmm5=VV 
-
-        mulss  nb430nf_c12(%rsp),%xmm5   ## Vvdw12 
-
-        addss  nb430nf_Vvdwtot(%rsp),%xmm5
-        movss %xmm5,nb430nf_Vvdwtot(%rsp)
-
-_nb_kernel430nf_x86_64_sse.nb430nf_updateouterdata: 
-        ## get n from stack
-        movl nb430nf_n(%rsp),%esi
-        ## get group index for i particle 
-        movq  nb430nf_gid(%rbp),%rdx            ## base of gid[]
-        movl  (%rdx,%rsi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total potential energy and update it 
-        movaps nb430nf_vctot(%rsp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        ## add earlier value from mem 
-        movq  nb430nf_Vc(%rbp),%rax
-        addss (%rax,%rdx,4),%xmm7
-        ## move back to mem 
-        movss %xmm7,(%rax,%rdx,4)
-
-        ## accumulate total lj energy and update it 
-        movaps nb430nf_Vvdwtot(%rsp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now 
-        movaps %xmm7,%xmm6
-        shufps $1,%xmm6,%xmm6
-        addss  %xmm6,%xmm7
-
-        ## add earlier value from mem 
-        movq  nb430nf_Vvdw(%rbp),%rax
-        addss (%rax,%rdx,4),%xmm7
-        ## move back to mem 
-        movss %xmm7,(%rax,%rdx,4)
-
-        ## finish if last 
-        movl nb430nf_nn1(%rsp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx
-        jz _nb_kernel430nf_x86_64_sse.nb430nf_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb430nf_n(%rsp)
-        jmp _nb_kernel430nf_x86_64_sse.nb430nf_outer
-_nb_kernel430nf_x86_64_sse.nb430nf_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb430nf_nri(%rsp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx
-        jz _nb_kernel430nf_x86_64_sse.nb430nf_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel430nf_x86_64_sse.nb430nf_threadloop
-_nb_kernel430nf_x86_64_sse.nb430nf_end: 
-
-        movl nb430nf_nouter(%rsp),%eax
-        movl nb430nf_ninner(%rsp),%ebx
-        movq nb430nf_outeriter(%rbp),%rcx
-        movq nb430nf_inneriter(%rbp),%rdx
-        movl %eax,(%rcx)
-        movl %ebx,(%rdx)
-
-        addq $392,%rsp
-        emms
-
-
-        pop %r15
-        pop %r14
-        pop %r13
-        pop %r12
-
-        pop %rbx
-        pop    %rbp
-        ret
-
-
-
-
-
-
diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/Makefile.am b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/Makefile.am
index 5e515024c4..260af98c5c 100644
--- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/Makefile.am
+++ b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/Makefile.am
@@ -64,30 +64,30 @@ libnb_kernel_x86_64_sse2_la_SOURCES = \
 
 
 EXTRA_DIST = \
-	nb_kernel010_x86_64_sse2.intel_syntax.s	nb_kernel030_x86_64_sse2.intel_syntax.s	\
-	nb_kernel100_x86_64_sse2.intel_syntax.s	nb_kernel101_x86_64_sse2.intel_syntax.s	\
-	nb_kernel102_x86_64_sse2.intel_syntax.s	nb_kernel103_x86_64_sse2.intel_syntax.s	\
-	nb_kernel104_x86_64_sse2.intel_syntax.s	nb_kernel110_x86_64_sse2.intel_syntax.s	\
-	nb_kernel111_x86_64_sse2.intel_syntax.s	nb_kernel112_x86_64_sse2.intel_syntax.s	\
-	nb_kernel113_x86_64_sse2.intel_syntax.s	nb_kernel114_x86_64_sse2.intel_syntax.s	\
-	nb_kernel130_x86_64_sse2.intel_syntax.s	nb_kernel131_x86_64_sse2.intel_syntax.s	\
-	nb_kernel132_x86_64_sse2.intel_syntax.s	nb_kernel133_x86_64_sse2.intel_syntax.s	\
-	nb_kernel134_x86_64_sse2.intel_syntax.s	nb_kernel200_x86_64_sse2.intel_syntax.s	\
-	nb_kernel201_x86_64_sse2.intel_syntax.s	nb_kernel202_x86_64_sse2.intel_syntax.s	\
-	nb_kernel203_x86_64_sse2.intel_syntax.s	nb_kernel204_x86_64_sse2.intel_syntax.s	\
-	nb_kernel210_x86_64_sse2.intel_syntax.s	nb_kernel211_x86_64_sse2.intel_syntax.s	\
-	nb_kernel212_x86_64_sse2.intel_syntax.s	nb_kernel213_x86_64_sse2.intel_syntax.s	\
-	nb_kernel214_x86_64_sse2.intel_syntax.s	nb_kernel230_x86_64_sse2.intel_syntax.s	\
-	nb_kernel231_x86_64_sse2.intel_syntax.s	nb_kernel232_x86_64_sse2.intel_syntax.s	\
-	nb_kernel233_x86_64_sse2.intel_syntax.s	nb_kernel234_x86_64_sse2.intel_syntax.s	\
-	nb_kernel300_x86_64_sse2.intel_syntax.s	nb_kernel301_x86_64_sse2.intel_syntax.s	\
-	nb_kernel302_x86_64_sse2.intel_syntax.s	nb_kernel303_x86_64_sse2.intel_syntax.s	\
-	nb_kernel304_x86_64_sse2.intel_syntax.s	nb_kernel310_x86_64_sse2.intel_syntax.s	\
-	nb_kernel311_x86_64_sse2.intel_syntax.s	nb_kernel312_x86_64_sse2.intel_syntax.s	\
-	nb_kernel313_x86_64_sse2.intel_syntax.s	nb_kernel314_x86_64_sse2.intel_syntax.s	\
-	nb_kernel330_x86_64_sse2.intel_syntax.s	nb_kernel331_x86_64_sse2.intel_syntax.s	\
-	nb_kernel332_x86_64_sse2.intel_syntax.s	nb_kernel333_x86_64_sse2.intel_syntax.s	\
-	nb_kernel334_x86_64_sse2.intel_syntax.s	nb_kernel400_x86_64_sse2.intel_syntax.s	\
-	nb_kernel410_x86_64_sse2.intel_syntax.s	nb_kernel430_x86_64_sse2.intel_syntax.s	\
-	nb_kernel_x86_64_sse2_test.intel_syntax.s
+	nb_kernel010_x86_64_sse2_intel_syntax.s	nb_kernel030_x86_64_sse2_intel_syntax.s	\
+	nb_kernel100_x86_64_sse2_intel_syntax.s	nb_kernel101_x86_64_sse2_intel_syntax.s	\
+	nb_kernel102_x86_64_sse2_intel_syntax.s	nb_kernel103_x86_64_sse2_intel_syntax.s	\
+	nb_kernel104_x86_64_sse2_intel_syntax.s	nb_kernel110_x86_64_sse2_intel_syntax.s	\
+	nb_kernel111_x86_64_sse2_intel_syntax.s	nb_kernel112_x86_64_sse2_intel_syntax.s	\
+	nb_kernel113_x86_64_sse2_intel_syntax.s	nb_kernel114_x86_64_sse2_intel_syntax.s	\
+	nb_kernel130_x86_64_sse2_intel_syntax.s	nb_kernel131_x86_64_sse2_intel_syntax.s	\
+	nb_kernel132_x86_64_sse2_intel_syntax.s	nb_kernel133_x86_64_sse2_intel_syntax.s	\
+	nb_kernel134_x86_64_sse2_intel_syntax.s	nb_kernel200_x86_64_sse2_intel_syntax.s	\
+	nb_kernel201_x86_64_sse2_intel_syntax.s	nb_kernel202_x86_64_sse2_intel_syntax.s	\
+	nb_kernel203_x86_64_sse2_intel_syntax.s	nb_kernel204_x86_64_sse2_intel_syntax.s	\
+	nb_kernel210_x86_64_sse2_intel_syntax.s	nb_kernel211_x86_64_sse2_intel_syntax.s	\
+	nb_kernel212_x86_64_sse2_intel_syntax.s	nb_kernel213_x86_64_sse2_intel_syntax.s	\
+	nb_kernel214_x86_64_sse2_intel_syntax.s	nb_kernel230_x86_64_sse2_intel_syntax.s	\
+	nb_kernel231_x86_64_sse2_intel_syntax.s	nb_kernel232_x86_64_sse2_intel_syntax.s	\
+	nb_kernel233_x86_64_sse2_intel_syntax.s	nb_kernel234_x86_64_sse2_intel_syntax.s	\
+	nb_kernel300_x86_64_sse2_intel_syntax.s	nb_kernel301_x86_64_sse2_intel_syntax.s	\
+	nb_kernel302_x86_64_sse2_intel_syntax.s	nb_kernel303_x86_64_sse2_intel_syntax.s	\
+	nb_kernel304_x86_64_sse2_intel_syntax.s	nb_kernel310_x86_64_sse2_intel_syntax.s	\
+	nb_kernel311_x86_64_sse2_intel_syntax.s	nb_kernel312_x86_64_sse2_intel_syntax.s	\
+	nb_kernel313_x86_64_sse2_intel_syntax.s	nb_kernel314_x86_64_sse2_intel_syntax.s	\
+	nb_kernel330_x86_64_sse2_intel_syntax.s	nb_kernel331_x86_64_sse2_intel_syntax.s	\
+	nb_kernel332_x86_64_sse2_intel_syntax.s	nb_kernel333_x86_64_sse2_intel_syntax.s	\
+	nb_kernel334_x86_64_sse2_intel_syntax.s	nb_kernel400_x86_64_sse2_intel_syntax.s	\
+	nb_kernel410_x86_64_sse2_intel_syntax.s	nb_kernel430_x86_64_sse2_intel_syntax.s	\
+	nb_kernel_x86_64_sse2_test_intel_syntax.s
 
diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel400_x86_64_sse2.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel400_x86_64_sse2.intel_syntax.s
deleted file mode 100644
index cdc2d9f689..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel400_x86_64_sse2.intel_syntax.s
+++ /dev/null
@@ -1,1236 +0,0 @@
-;#
-;#
-;# Gromacs 4.0                         Copyright (c) 1991-2003 
-;# David van der Spoel, Erik Lindahl
-;#
-;# This program is free software; you can redistribute it and/or
-;# modify it under the terms of the GNU General Public License
-;# as published by the Free Software Foundation; either version 2
-;# of the License, or (at your option) any later version.
-;#
-;# To help us fund GROMACS development, we humbly ask that you cite
-;# the research papers on the package. Check out http://www.gromacs.org
-;# 
-;# And Hey:
-;# Gnomes, ROck Monsters And Chili Sauce
-;#
-
-;# These files require GNU binutils 2.10 or later, since we
-;# use intel syntax for portability, or a recent version 
-;# of NASM that understands Extended 3DNow and SSE2 instructions.
-;# (NASM is normally only used with MS Visual C++).
-;# Since NASM and gnu as disagree on some definitions and use 
-;# completely different preprocessing options I have to introduce a
-;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
-;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
-;# reason why all comments need both symbols...
-;# The source is written for GNU as, with intel syntax. When you use
-;# NASM we redefine a couple of things. The false if-statement around 
-;# the following code is seen by GNU as, but NASM doesn't see it, so 
-;# the code inside is read by NASM but not gcc.
-; .if 0    # block below only read by NASM
-%define .section	section
-%define .long		dd
-%define .align		align
-%define .globl		global
-;# NASM only wants 'dword', not 'dword ptr'.
-%define ptr
-%macro .equiv                  2
-   %1 equ %2
-%endmacro
-; .endif                   # End of NASM-specific block
-; .intel_syntax noprefix   # Line only read by gnu as
-
-
-
-
-.globl nb_kernel400_x86_64_sse2
-.globl _nb_kernel400_x86_64_sse2
-nb_kernel400_x86_64_sse2:	
-_nb_kernel400_x86_64_sse2:	
-;#	Room for return address and rbp (16 bytes)
-.equiv          nb400_fshift,           16
-.equiv          nb400_gid,              24
-.equiv          nb400_pos,              32
-.equiv          nb400_faction,          40
-.equiv          nb400_charge,           48
-.equiv          nb400_p_facel,          56
-.equiv          nb400_argkrf,           64
-.equiv          nb400_argcrf,           72
-.equiv          nb400_Vc,               80
-.equiv          nb400_type,             88
-.equiv          nb400_p_ntype,          96
-.equiv          nb400_vdwparam,         104
-.equiv          nb400_Vvdw,             112
-.equiv          nb400_p_tabscale,       120
-.equiv          nb400_VFtab,            128
-.equiv          nb400_invsqrta,         136
-.equiv          nb400_dvda,             144
-.equiv          nb400_p_gbtabscale,     152
-.equiv          nb400_GBtab,            160
-.equiv          nb400_p_nthreads,       168
-.equiv          nb400_count,            176
-.equiv          nb400_mtx,              184
-.equiv          nb400_outeriter,        192
-.equiv          nb400_inneriter,        200
-.equiv          nb400_work,             208
-	;# stack offsets for local variables  
-	;# bottom of stack is cache-aligned for sse2 use 
-.equiv          nb400_ix,               0
-.equiv          nb400_iy,               16
-.equiv          nb400_iz,               32
-.equiv          nb400_iq,               48
-.equiv          nb400_dx,               64
-.equiv          nb400_dy,               80
-.equiv          nb400_dz,               96
-.equiv          nb400_two,              112
-.equiv          nb400_gbtsc,            128
-.equiv          nb400_qq,               144
-.equiv          nb400_r,                160
-.equiv          nb400_vctot,            176
-.equiv          nb400_fix,              192
-.equiv          nb400_fiy,              208
-.equiv          nb400_fiz,              224
-.equiv          nb400_half,             240
-.equiv          nb400_three,            256
-.equiv          nb400_isai,             272
-.equiv          nb400_isaprod,          288
-.equiv          nb400_dvdasum,          304
-.equiv          nb400_gbscale,          320
-.equiv          nb400_nri,              336
-.equiv          nb400_iinr,             344
-.equiv          nb400_jindex,           352
-.equiv          nb400_jjnr,             360
-.equiv          nb400_shift,            368
-.equiv          nb400_shiftvec,         376
-.equiv          nb400_facel,            384
-.equiv          nb400_innerjjnr,        392
-.equiv          nb400_is3,              400
-.equiv          nb400_ii3,              404
-.equiv          nb400_ii,               408
-.equiv          nb400_innerk,           412
-.equiv          nb400_n,                416
-.equiv          nb400_nn1,              420
-.equiv          nb400_nouter,           424
-.equiv          nb400_ninner,           428
-	push rbp
-	mov  rbp, rsp
-	push rbx
-
-	
-	emms
-
-        push r12
-        push r13
-        push r14
-        push r15
-
-	sub rsp, 440		;# local variable stack space (n*16+8)
-
-	;# zero 32-bit iteration counters
-	mov eax, 0
-	mov [rsp + nb400_nouter], eax
-	mov [rsp + nb400_ninner], eax
-
-	mov edi, [rdi]
-	mov [rsp + nb400_nri], edi
-	mov [rsp + nb400_iinr], rsi
-	mov [rsp + nb400_jindex], rdx
-	mov [rsp + nb400_jjnr], rcx
-	mov [rsp + nb400_shift], r8
-	mov [rsp + nb400_shiftvec], r9
-	mov rsi, [rbp + nb400_p_facel]
-	movsd xmm0, [rsi]
-	movsd [rsp + nb400_facel], xmm0
-
-	mov rbx, [rbp + nb400_p_gbtabscale]
-	movsd xmm4, [rbx]
-	shufpd xmm4, xmm4, 0
-	movapd [rsp + nb400_gbtsc],  xmm4
-
-	;# create constant floating-point factors on stack
-	mov eax, 0x00000000     ;# lower half of double half IEEE (hex)
-	mov ebx, 0x3fe00000
-	mov [rsp + nb400_half], eax
-	mov [rsp + nb400_half+4], ebx
-	movsd xmm1, [rsp + nb400_half]
-	shufpd xmm1, xmm1, 0    ;# splat to all elements
-	movapd xmm3, xmm1
-	addpd  xmm3, xmm3       ;# one
-	movapd xmm2, xmm3
-	addpd  xmm2, xmm2       ;# two
-	addpd  xmm3, xmm2	;# three
-	movapd [rsp + nb400_half], xmm1
-	movapd [rsp + nb400_two], xmm2
-	movapd [rsp + nb400_three], xmm3
-
-.nb400_threadloop:
-        mov   rsi, [rbp + nb400_count]          ;# pointer to sync counter
-        mov   eax, [rsi]
-.nb400_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+10
-        lock
-        cmpxchg [esi], ebx                      ;# write nn1 to *counter,
-                                                ;# if it hasnt changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb400_spinlock
-
-        ;# if(nn1>nri) nn1=nri
-        mov ecx, [rsp + nb400_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [rsp + nb400_n], eax
-        mov [rsp + nb400_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists
-	mov esi, eax				;# copy n to esi
-        jg  .nb400_outerstart
-        jmp .nb400_end
-
-.nb400_outerstart:
-	;# ebx contains number of outer iterations
-	add ebx, [rsp + nb400_nouter]
-	mov [rsp + nb400_nouter], ebx
-
-.nb400_outer:
-	mov   rax, [rsp + nb400_shift]      ;# rax = pointer into shift[] 
-	mov   ebx, [rax+rsi*4]		;# rbx=shift[n] 
-	
-	lea   rbx, [rbx + rbx*2]    ;# rbx=3*is 
-	mov   [rsp + nb400_is3],ebx    	;# store is3 
-
-	mov   rax, [rsp + nb400_shiftvec]   ;# rax = base of shiftvec[] 
-
-	movsd xmm0, [rax + rbx*8]
-	movsd xmm1, [rax + rbx*8 + 8]
-	movsd xmm2, [rax + rbx*8 + 16] 
-
-	mov   rcx, [rsp + nb400_iinr]       ;# rcx = pointer into iinr[] 	
-	mov   ebx, [rcx+rsi*4]	    ;# ebx =ii 
-	mov   [rsp + nb400_ii], ebx
-	
-	mov   rdx, [rbp + nb400_charge]
-	movsd xmm3, [rdx + rbx*8]	
-	mulsd xmm3, [rsp + nb400_facel]
-	shufpd xmm3, xmm3, 0
-
-	mov   rdx, [rbp + nb400_invsqrta]	;# load invsqrta[ii]
-	movsd xmm4, [rdx + rbx*8]
-	shufpd xmm4, xmm4, 0
-
-	lea   rbx, [rbx + rbx*2]	;# rbx = 3*ii=ii3 
-	mov   rax, [rbp + nb400_pos]    ;# rax = base of pos[]  
-
-	addsd xmm0, [rax + rbx*8]
-	addsd xmm1, [rax + rbx*8 + 8]
-	addsd xmm2, [rax + rbx*8 + 16]
-
-	movapd [rsp + nb400_iq], xmm3
-	movapd [rsp + nb400_isai], xmm4
-	
-	shufpd xmm0, xmm0, 0
-	shufpd xmm1, xmm1, 0
-	shufpd xmm2, xmm2, 0
-
-	movapd [rsp + nb400_ix], xmm0
-	movapd [rsp + nb400_iy], xmm1
-	movapd [rsp + nb400_iz], xmm2
-
-	mov   [rsp + nb400_ii3], ebx
-	
-	;# clear vctot and i forces 
-	xorpd xmm4, xmm4
-	movapd xmm8, xmm4
-	movapd xmm12, xmm4
-	movapd xmm13, xmm4
-	movapd xmm14, xmm4
-	movapd xmm15, xmm4
-	
-	mov   rax, [rsp + nb400_jindex]
-	mov   ecx, [rax + rsi*4]	     ;# jindex[n] 
-	mov   edx, [rax + rsi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   rsi, [rbp + nb400_pos]
-	mov   rdi, [rbp + nb400_faction]	
-	mov   rax, [rsp + nb400_jjnr]
-	shl   ecx, 2
-	add   rax, rcx
-	mov   [rsp + nb400_innerjjnr], rax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  2
-	add   ecx, [rsp + nb400_ninner]
-	mov   [rsp + nb400_ninner], ecx
-	add   edx, 0
-	mov   [rsp + nb400_innerk], edx    ;# number of innerloop atoms 
-	jge   .nb400_unroll_loop
-	jmp   .nb400_checksingle
-.nb400_unroll_loop:
-	;# twice unrolled innerloop here 
-	mov   rdx, [rsp + nb400_innerjjnr]   ;# pointer to jjnr[k] 
-	mov   r12d, [rdx]
-	mov   r13d, [rdx + 4]
-	add qword ptr [rsp + nb400_innerjjnr], 8	;# advance pointer (unrolled 2) 
-	
-	mov rsi, [rbp + nb400_pos]		;# base of pos[] 
-
-	lea   r8, [r12 + r12*2]     ;# j3 
-	lea   r9, [r13 + r13*2]	
-
-	;# move two coordinates to xmm4-xmm6
-	movlpd xmm4, [rsi + r8*8]
-	movlpd xmm5, [rsi + r8*8 + 8]
-	movlpd xmm6, [rsi + r8*8 + 16]
-	movhpd xmm4, [rsi + r9*8]
-	movhpd xmm5, [rsi + r9*8 + 8]
-	movhpd xmm6, [rsi + r9*8 + 16]		
-	
-	;# calc dr 
-	subpd xmm4, [rsp + nb400_ix]
-	subpd xmm5, [rsp + nb400_iy]
-	subpd xmm6, [rsp + nb400_iz]
-
-
-	;# store dr 
-	movapd xmm9, xmm4
-	movapd xmm10, xmm5
-	movapd xmm11, xmm6
-
-	;# square it 
-	mulpd xmm4,xmm4
-	mulpd xmm5,xmm5
-	mulpd xmm6,xmm6
-	addpd xmm4, xmm5
-	addpd xmm4, xmm6
-	;# rsq in xmm4 
-
-	mov rsi, [rbp + nb400_invsqrta]
-	movlpd xmm3, [rsi + r12*8]
-
-	cvtpd2ps xmm5, xmm4	
-	rsqrtps xmm5, xmm5
-	cvtps2pd xmm2, xmm5	;# lu in low xmm2 
-
-	movhpd xmm3, [rsi + r13*8]
-	mulpd  xmm3, [rsp + nb400_isai]
-	movapd [rsp + nb400_isaprod], xmm3	
-    movapd xmm6, xmm3
-	mulpd xmm3, [rsp + nb400_gbtsc]
-	movapd [rsp + nb400_gbscale], xmm3
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulpd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [rsp + nb400_three]
-	mulpd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb400_half]
-	subpd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulpd xmm1, xmm5	
-	mulpd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-	mov rsi, [rbp + nb400_charge]    ;# base of charge[] 
-	movlpd xmm3, [rsi + r12*8]
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulpd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [rsp + nb400_three]
-	mulpd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb400_half]
-	subpd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulpd xmm2, xmm5	
-	mulpd xmm0, xmm2	;# xmm0=iter2 of rinv (new lu) 
-	mulpd xmm4, xmm0	;# xmm4=r 
-    
-    mulpd  xmm6, [rsp + nb400_iq]
-	movhpd xmm3, [rsi + r13*8]
-	mulpd  xmm3, xmm6
-	movapd [rsp + nb400_qq], xmm3	
-
-
-	movapd [rsp + nb400_r], xmm4
-	mulpd xmm4, [rsp + nb400_gbscale]
-
-	cvttpd2pi mm6, xmm4	;# mm6 = lu idx 
-	cvtpi2pd xmm5, mm6
-	subpd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	
-	pslld mm6, 2		;# idx *= 4 
-
-	mov  rsi, [rbp + nb400_GBtab]
-	movd r10d, mm6
-	psrlq mm6, 32
-	movd r11d, mm6		;# indices in r10/r11
-
-	movapd xmm4, [rsi + r10*8]	;# Y1 F1 	
-	movapd xmm3, [rsi + r11*8]	;# Y2 F2 
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 Y2 
-	unpckhpd xmm5, xmm3	;# F1 F2 
-
-	movapd xmm6, [rsi + r10*8 + 16]	;# G1 H1 	
-	movapd xmm3, [rsi + r11*8 + 16]	;# G2 H2 
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 G2 
-	unpckhpd xmm7, xmm3	;# H1 H2 
-	;# coulomb table ready, in xmm4-xmm7  		
-
-	mulpd  xmm7, xmm1	;# xmm7=Heps
-	mulpd  xmm6, xmm1	;# xmm6=Geps 
-	mulpd  xmm7, xmm1	;# xmm7=Heps2 
-	addpd  xmm5, xmm6
-	addpd  xmm5, xmm7	;# xmm5=Fp 	
-	addpd  xmm7, xmm7	;# two*Heps2 
-	movapd xmm3, [rsp + nb400_qq]
-	addpd  xmm7, xmm6
-	addpd  xmm7, xmm5 ;# xmm7=FF 
-	mulpd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addpd  xmm5, xmm4 ;# xmm5=VV 
-	mulpd  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulpd  xmm3, xmm7 ;# fijC=FF*qq 
-
-	mov rsi, [rbp + nb400_dvda]
-	
-	;# Calculate dVda
-	xorpd xmm7, xmm7
-	mulpd xmm3, [rsp + nb400_gbscale]
-	movapd xmm6, xmm3
-	mulpd  xmm6, [rsp + nb400_r]
-	addpd  xmm6, xmm5
-
-    ;# update vctot
-	addpd  xmm12, xmm5
-
-	;# xmm6=(vcoul+fijC*r)
-	subpd  xmm7, xmm6
-	movapd xmm6, xmm7
-	
-	;# update dvdasum
-	addpd  xmm8, xmm7
-
-	;# update j atoms dvdaj
-	movhlps xmm7, xmm6
-	addsd  xmm6, [rsi + r12*8]
-	addsd  xmm7, [rsi + r13*8]
-	movsd  [rsi + r12*8], xmm6
-	movsd  [rsi + r13*8], xmm7
-	
-	;# the fj's - start by accumulating forces from memory 
-    mov rdi, [rbp + nb400_faction]
-	movlpd xmm5, [rdi + r8*8]
-	movlpd xmm6, [rdi + r8*8 + 8]
-	movlpd xmm7, [rdi + r8*8 + 16]
-	movhpd xmm5, [rdi + r9*8]
-	movhpd xmm6, [rdi + r9*8 + 8]
-	movhpd xmm7, [rdi + r9*8 + 16]
-
-	xorpd  xmm4, xmm4
-
-	mulpd xmm3, xmm0
-	subpd  xmm4, xmm3
-
-	mov    rdi, [rbp + nb400_faction]
-	mulpd  xmm9, xmm4
-	mulpd  xmm10, xmm4
-	mulpd  xmm11, xmm4
-    
-	addpd xmm5, xmm9
-	addpd xmm6, xmm10
-	addpd xmm7, xmm11
-
-	;# now update f_i 
-	addpd  xmm13, xmm9
-	addpd  xmm14, xmm10
-	addpd  xmm15, xmm11
-
-	movlpd [rdi + r8*8], xmm5
-	movlpd [rdi + r8*8 + 8], xmm6
-	movlpd [rdi + r8*8 + 16], xmm7
-	movhpd [rdi + r9*8], xmm5
-	movhpd [rdi + r9*8 + 8], xmm6
-	movhpd [rdi + r9*8 + 16], xmm7
-	
-	;# should we do one more iteration? 
-	sub dword ptr [rsp + nb400_innerk],  2
-	jl    .nb400_checksingle
-	jmp   .nb400_unroll_loop
-.nb400_checksingle:
-	mov   edx, [rsp + nb400_innerk]
-	and   edx, 1
-	jnz    .nb400_dosingle
-	jmp    .nb400_updateouterdata
-.nb400_dosingle:
-	mov rsi, [rbp + nb400_charge]
-	mov rdx, [rbp + nb400_invsqrta]
-	mov rdi, [rbp + nb400_pos]
-	mov   rcx, [rsp + nb400_innerjjnr]
-	mov   eax, [rcx]	
-
-	;# load isaj
-	mov rsi, [rbp + nb400_invsqrta]
-	movsd xmm2, [rsi + rax*8]
-	mulsd  xmm2, [rsp + nb400_isai]
-	movapd [rsp + nb400_isaprod], xmm2	
-	movapd xmm1, xmm2
-	mulsd xmm1, [rsp + nb400_gbtsc]
-	movapd [rsp + nb400_gbscale], xmm1
-
-    mulsd xmm2, [rsp + nb400_iq]
-	mov rsi, [rbp + nb400_charge]    ;# base of charge[] 
-	movsd xmm3, [rsi + rax*8]
-	mulsd  xmm3, xmm2
-	movapd [rsp + nb400_qq], xmm3	
-
-	mov rsi, [rbp + nb400_pos]		;# base of pos[] 
-
-	lea   r8, [rax + rax*2]     ;# j3 
-
-	;# move coordinate to xmm4-xmm6
-	movsd xmm4, [rsi + r8*8]
-	movsd xmm5, [rsi + r8*8 + 8]
-	movsd xmm6, [rsi + r8*8 + 16]
-
-	mov    rdi, [rbp + nb400_faction]
-	
-	;# calc dr 
-	subsd xmm4, [rsp + nb400_ix]
-	subsd xmm5, [rsp + nb400_iy]
-	subsd xmm6, [rsp + nb400_iz]
-
-	;# store dr 
-	movapd xmm9, xmm4
-	movapd xmm10, xmm5
-	movapd xmm11, xmm6
-
-	;# square it 
-	mulsd xmm4,xmm4
-	mulsd xmm5,xmm5
-	mulsd xmm6,xmm6
-	addsd xmm4, xmm5
-	addsd xmm4, xmm6
-	;# rsq in xmm4 
-
-	cvtsd2ss xmm5, xmm4	
-	rsqrtss xmm5, xmm5
-	cvtss2sd xmm2, xmm5	;# lu in low xmm2 
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulsd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [rsp + nb400_three]
-	mulsd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb400_half]
-	subsd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulsd xmm1, xmm5	
-	mulsd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulsd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [rsp + nb400_three]
-	mulsd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb400_half]
-	subsd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulsd xmm2, xmm5	
-	mulsd xmm0, xmm2	;# xmm0=iter2 of rinv (new lu) 
-	mulsd xmm4, xmm0	;# xmm4=r 
-    
-	movapd [rsp + nb400_r], xmm4
-	mulsd xmm4, [rsp + nb400_gbscale]
-
-	cvttsd2si r10d, xmm4	;# mm6 = lu idx 
-	cvtsi2sd xmm5, r10d
-	subsd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	
-	shl r10d, 2		;# idx *= 4 
-
-	mov  rsi, [rbp + nb400_GBtab]
-
-	movapd xmm4, [rsi + r10*8]	;# Y1 F1 	
-	movhlps xmm5, xmm4
-	movapd xmm6, [rsi + r10*8 + 16]	;# G1 H1 	
-    movhlps xmm7, xmm6
-	;# coulomb table ready, in xmm4-xmm7  		
-
-	mulsd  xmm7, xmm1	;# xmm7=Heps
-	mulsd  xmm6, xmm1	;# xmm6=Geps 
-	mulsd  xmm7, xmm1	;# xmm7=Heps2 
-	addsd  xmm5, xmm6
-	addsd  xmm5, xmm7	;# xmm5=Fp 	
-	addsd  xmm7, xmm7	;# two*Heps2 
-	movapd xmm3, [rsp + nb400_qq]
-	addsd  xmm7, xmm6
-	addsd  xmm7, xmm5 ;# xmm7=FF 
-	mulsd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addsd  xmm5, xmm4 ;# xmm5=VV 
-	mulsd  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulsd  xmm3, xmm7 ;# fijC=FF*qq 
-
-	mov rsi, [rbp + nb400_dvda]
-	
-	;# Calculate dVda
-	xorpd xmm7, xmm7
-	mulsd xmm3, [rsp + nb400_gbscale]
-	movapd xmm6, xmm3
-	mulsd  xmm6, [rsp + nb400_r]
-	addsd  xmm6, xmm5
-
-    ;# update vctot
-	addsd  xmm12, xmm5
-
-	;# xmm6=(vcoul+fijC*r)
-	subsd  xmm7, xmm6
-	movapd xmm6, xmm7
-	
-	;# update dvdasum
-	addsd  xmm8, xmm7
-
-	;# update j atoms dvdaj
-	addsd  xmm6, [rsi + rax*8]
-	movsd  [rsi + rax*8], xmm6
-	
-	xorpd  xmm4, xmm4
-
-	mulsd xmm3, xmm0
-	subsd  xmm4, xmm3
-
-	mov    rdi, [rbp + nb400_faction]
-	mulsd  xmm9, xmm4
-	mulsd  xmm10, xmm4
-	mulsd  xmm11, xmm4
-    
-	;# now update f_i 
-	addsd  xmm13, xmm9
-	addsd  xmm14, xmm10
-	addsd  xmm15, xmm11
-
-	;# the fj's - start by accumulating forces from memory 
-    mov rdi, [rbp + nb400_faction]
-	addsd xmm9,  [rdi + r8*8]
-	addsd xmm10, [rdi + r8*8 + 8]
-	addsd xmm11, [rdi + r8*8 + 16]
-	movsd [rdi + r8*8], xmm9
-	movsd [rdi + r8*8 + 8], xmm10
-	movsd [rdi + r8*8 + 16], xmm11
-	
-.nb400_updateouterdata:
-	mov   ecx, [rsp + nb400_ii3]
-	mov   rdi, [rbp + nb400_faction]
-	mov   rsi, [rbp + nb400_fshift]
-	mov   edx, [rsp + nb400_is3]
-
-	;# accumulate i forces in xmm13, xmm14, xmm15
-	movhlps xmm3, xmm13
-	movhlps xmm4, xmm14
-	movhlps xmm5, xmm15
-	addsd  xmm13, xmm3
-	addsd  xmm14, xmm4
-	addsd  xmm15, xmm5 ;# sum is in low xmm13-xmm15
-
-	;# increment i force 
-	movsd  xmm3, [rdi + rcx*8]
-	movsd  xmm4, [rdi + rcx*8 + 8]
-	movsd  xmm5, [rdi + rcx*8 + 16]
-	subsd  xmm3, xmm13
-	subsd  xmm4, xmm14
-	subsd  xmm5, xmm15
-	movsd  [rdi + rcx*8],     xmm3
-	movsd  [rdi + rcx*8 + 8], xmm4
-	movsd  [rdi + rcx*8 + 16], xmm5
-
-	;# increment fshift force  
-	movsd  xmm3, [rsi + rdx*8]
-	movsd  xmm4, [rsi + rdx*8 + 8]
-	movsd  xmm5, [rsi + rdx*8 + 16]
-	subsd  xmm3, xmm13
-	subsd  xmm4, xmm14
-	subsd  xmm5, xmm15
-	movsd  [rsi + rdx*8],     xmm3
-	movsd  [rsi + rdx*8 + 8], xmm4
-	movsd  [rsi + rdx*8 + 16], xmm5
-
-	;# get n from stack
-	mov esi, [rsp + nb400_n]
-        ;# get group index for i particle 
-        mov   rdx, [rbp + nb400_gid]      	;# base of gid[]
-        mov   edx, [rdx + rsi*4]		;# ggid=gid[n]
-
-	;# accumulate total coulomb energy and update it 
-	movhlps xmm6, xmm12
-	addsd  xmm12, xmm6	;# low xmm12 have the sum now 
-
-	;# add earlier value from mem 
-	mov   rax, [rbp + nb400_Vc]
-	addsd xmm12, [rax + rdx*8] 
-	;# move back to mem 
-	movsd [rax + rdx*8], xmm12
-	
-	;# accumulate dVda and update it 
-	movhlps xmm6, xmm8
-	addsd  xmm8, xmm6	;# low xmm8 has the sum now 
-	
-	mov edx, [rsp + nb400_ii]
-	mov rax, [rbp + nb400_dvda]
-	addsd xmm8, [rax + rdx*8]
-	movsd [rax + rdx*8], xmm8
-	
-        ;# finish if last 
-        mov ecx, [rsp + nb400_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb400_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [rsp + nb400_n], esi
-        jmp .nb400_outer
-.nb400_outerend:
-        ;# check if more outer neighborlists remain
-        mov   ecx, [rsp + nb400_nri]
-	;# esi already loaded with n above
-        sub   ecx, esi
-        jz .nb400_end
-        ;# non-zero, do one more workunit
-        jmp   .nb400_threadloop
-.nb400_end:
-	mov eax, [rsp + nb400_nouter]
-	mov ebx, [rsp + nb400_ninner]
-	mov rcx, [rbp + nb400_outeriter]
-	mov rdx, [rbp + nb400_inneriter]
-	mov [rcx], eax
-	mov [rdx], ebx
-
-	add rsp, 440
-	emms
-
-
-        pop r15
-        pop r14
-        pop r13
-        pop r12
-
-	pop rbx
-	pop	rbp
-	ret
-
-
-
-
-
-
-
-.globl nb_kernel400nf_x86_64_sse2
-.globl _nb_kernel400nf_x86_64_sse2
-nb_kernel400nf_x86_64_sse2:	
-_nb_kernel400nf_x86_64_sse2:	
-.equiv          nb400nf_fshift,         16
-.equiv          nb400nf_gid,            24
-.equiv          nb400nf_pos,            32
-.equiv          nb400nf_faction,        40
-.equiv          nb400nf_charge,         48
-.equiv          nb400nf_p_facel,        56
-.equiv          nb400nf_argkrf,         64
-.equiv          nb400nf_argcrf,         72
-.equiv          nb400nf_Vc,             80
-.equiv          nb400nf_type,           88
-.equiv          nb400nf_p_ntype,        96
-.equiv          nb400nf_vdwparam,       104
-.equiv          nb400nf_Vvdw,           112
-.equiv          nb400nf_p_tabscale,     120
-.equiv          nb400nf_VFtab,          128
-.equiv          nb400nf_invsqrta,       136
-.equiv          nb400nf_dvda,           144
-.equiv          nb400nf_p_gbtabscale,   152
-.equiv          nb400nf_GBtab,          160
-.equiv          nb400nf_p_nthreads,     168
-.equiv          nb400nf_count,          176
-.equiv          nb400nf_mtx,            184
-.equiv          nb400nf_outeriter,      192
-.equiv          nb400nf_inneriter,      200
-.equiv          nb400nf_work,           208
-	;# stack offsets for local variables  
-	;# bottom of stack is cache-aligned for sse2 use 
-.equiv          nb400nf_ix,             0
-.equiv          nb400nf_iy,             16
-.equiv          nb400nf_iz,             32
-.equiv          nb400nf_iq,             48
-.equiv          nb400nf_gbtsc,          64
-.equiv          nb400nf_qq,             80
-.equiv          nb400nf_vctot,          96
-.equiv          nb400nf_half,           112
-.equiv          nb400nf_three,          128
-.equiv          nb400nf_isai,           144
-.equiv          nb400nf_isaprod,        160
-.equiv          nb400nf_gbscale,        176
-.equiv          nb400nf_nri,            192
-.equiv          nb400nf_iinr,           200
-.equiv          nb400nf_jindex,         208
-.equiv          nb400nf_jjnr,           216
-.equiv          nb400nf_shift,          224
-.equiv          nb400nf_shiftvec,       232
-.equiv          nb400nf_facel,          240
-.equiv          nb400nf_innerjjnr,      248
-.equiv          nb400nf_is3,            256
-.equiv          nb400nf_ii3,            260
-.equiv          nb400nf_innerk,         264
-.equiv          nb400nf_n,              268
-.equiv          nb400nf_nn1,            272
-.equiv          nb400nf_nouter,         276
-.equiv          nb400nf_ninner,         280
-	push rbp
-	mov  rbp, rsp
-	push rbx
-
-	
-	emms
-
-        push r12
-        push r13
-        push r14
-        push r15
-
-	sub rsp, 296		;# local variable stack space (n*16+8)
-
-	;# zero 32-bit iteration counters
-	mov eax, 0
-	mov [rsp + nb400nf_nouter], eax
-	mov [rsp + nb400nf_ninner], eax
-
-	mov edi, [rdi]
-	mov [rsp + nb400nf_nri], edi
-	mov [rsp + nb400nf_iinr], rsi
-	mov [rsp + nb400nf_jindex], rdx
-	mov [rsp + nb400nf_jjnr], rcx
-	mov [rsp + nb400nf_shift], r8
-	mov [rsp + nb400nf_shiftvec], r9
-	mov rsi, [rbp + nb400nf_p_facel]
-	movsd xmm0, [rsi]
-	movsd [rsp + nb400nf_facel], xmm0
-
-	mov rbx, [rbp + nb400nf_p_gbtabscale]
-	movsd xmm4, [rbx]
-	shufpd xmm4, xmm4, 0
-	movapd [rsp + nb400nf_gbtsc],  xmm4
-
-	;# create constant floating-point factors on stack
-	mov eax, 0x00000000     ;# lower half of double half IEEE (hex)
-	mov ebx, 0x3fe00000
-	mov [rsp + nb400nf_half], eax
-	mov [rsp + nb400nf_half+4], ebx
-	movsd xmm1, [rsp + nb400nf_half]
-	shufpd xmm1, xmm1, 0    ;# splat to all elements
-	movapd xmm3, xmm1
-	addpd  xmm3, xmm3       ;# one
-	movapd xmm2, xmm3
-	addpd  xmm2, xmm2       ;# two
-	addpd  xmm3, xmm2	;# three
-	movapd [rsp + nb400nf_half], xmm1
-	movapd [rsp + nb400nf_three], xmm3
-
-.nb400nf_threadloop:
-        mov   rsi, [rbp + nb400nf_count]          ;# pointer to sync counter
-        mov   eax, [rsi]
-.nb400nf_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+10
-        lock
-        cmpxchg [esi], ebx                      ;# write nn1 to *counter,
-                                                ;# if it hasnt changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb400nf_spinlock
-
-        ;# if(nn1>nri) nn1=nri
-        mov ecx, [rsp + nb400nf_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [rsp + nb400nf_n], eax
-        mov [rsp + nb400nf_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists
-	mov esi, eax				;# copy n to esi
-        jg  .nb400nf_outerstart
-        jmp .nb400nf_end
-
-.nb400nf_outerstart:
-	;# ebx contains number of outer iterations
-	add ebx, [rsp + nb400nf_nouter]
-	mov [rsp + nb400nf_nouter], ebx
-
-.nb400nf_outer:
-	mov   rax, [rsp + nb400nf_shift]      ;# rax = pointer into shift[] 
-	mov   ebx, [rax+rsi*4]		;# rbx=shift[n] 
-	
-	lea   rbx, [rbx + rbx*2]    ;# rbx=3*is 
-	mov   [rsp + nb400nf_is3],ebx    	;# store is3 
-
-	mov   rax, [rsp + nb400nf_shiftvec]   ;# rax = base of shiftvec[] 
-
-	movsd xmm0, [rax + rbx*8]
-	movsd xmm1, [rax + rbx*8 + 8]
-	movsd xmm2, [rax + rbx*8 + 16] 
-
-	mov   rcx, [rsp + nb400nf_iinr]       ;# rcx = pointer into iinr[] 	
-	mov   ebx, [rcx+rsi*4]	    ;# ebx =ii 
-
-	mov   rdx, [rbp + nb400nf_charge]
-	movsd xmm3, [rdx + rbx*8]	
-	mulsd xmm3, [rsp + nb400nf_facel]
-	shufpd xmm3, xmm3, 0
-
-	mov   rdx, [rbp + nb400nf_invsqrta]	;# load invsqrta[ii]
-	movsd xmm4, [rdx + rbx*8]
-	shufpd xmm4, xmm4, 0
-
-	lea   rbx, [rbx + rbx*2]	;# rbx = 3*ii=ii3 
-	mov   rax, [rbp + nb400nf_pos]    ;# rax = base of pos[]  
-
-	addsd xmm0, [rax + rbx*8]
-	addsd xmm1, [rax + rbx*8 + 8]
-	addsd xmm2, [rax + rbx*8 + 16]
-
-	movapd [rsp + nb400nf_iq], xmm3
-	movapd [rsp + nb400nf_isai], xmm4
-	
-	shufpd xmm0, xmm0, 0
-	shufpd xmm1, xmm1, 0
-	shufpd xmm2, xmm2, 0
-
-	movapd [rsp + nb400nf_ix], xmm0
-	movapd [rsp + nb400nf_iy], xmm1
-	movapd [rsp + nb400nf_iz], xmm2
-
-	mov   [rsp + nb400nf_ii3], ebx
-	
-	;# clear vctot
-	xorpd xmm4, xmm4
-	movapd [rsp + nb400nf_vctot], xmm4
-	
-	mov   rax, [rsp + nb400nf_jindex]
-	mov   ecx, [rax + rsi*4]	     ;# jindex[n] 
-	mov   edx, [rax + rsi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   rsi, [rbp + nb400nf_pos]
-	mov   rdi, [rbp + nb400nf_faction]	
-	mov   rax, [rsp + nb400nf_jjnr]
-	shl   ecx, 2
-	add   rax, rcx
-	mov   [rsp + nb400nf_innerjjnr], rax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  2
-	add   ecx, [rsp + nb400nf_ninner]
-	mov   [rsp + nb400nf_ninner], ecx
-	add   edx, 0
-	mov   [rsp + nb400nf_innerk], edx    ;# number of innerloop atoms 
-	jge   .nb400nf_unroll_loop
-	jmp   .nb400nf_checksingle
-.nb400nf_unroll_loop:
-	;# twice unrolled innerloop here 
-	mov   rdx, [rsp + nb400nf_innerjjnr]   ;# pointer to jjnr[k] 
-	mov   eax, [rdx]
-	mov   ebx, [rdx + 4]
-	add qword ptr [rsp + nb400nf_innerjjnr], 8	;# advance pointer (unrolled 2) 
-
-	;# load isa2
-	mov rsi, [rbp + nb400nf_invsqrta]
-	movlpd xmm2, [rsi + rax*8]
-	movhpd xmm2, [rsi + rbx*8]
-	mulpd  xmm2, [rsp + nb400nf_isai]
-	movapd [rsp + nb400nf_isaprod], xmm2	
-	movapd xmm1, xmm2
-	mulpd xmm1, [rsp + nb400nf_gbtsc]
-	movapd [rsp + nb400nf_gbscale], xmm1
-	
-	mov rsi, [rbp + nb400nf_charge]    ;# base of charge[] 
-	movlpd xmm3, [rsi + rax*8]
-	movhpd xmm3, [rsi + rbx*8]
-
-	mulpd xmm2, [rsp + nb400nf_iq]
-    mulpd xmm3, xmm2
-	movapd [rsp + nb400nf_qq], xmm3	
-	
-	mov rsi, [rbp + nb400nf_pos]		;# base of pos[] 
-
-	lea   rax, [rax + rax*2]     ;# replace jnr with j3 
-	lea   rbx, [rbx + rbx*2]	
-
-	;# move two coordinates to xmm0-xmm2 
-	movlpd xmm0, [rsi + rax*8]
-	movlpd xmm1, [rsi + rax*8 + 8]
-	movlpd xmm2, [rsi + rax*8 + 16]
-	movhpd xmm0, [rsi + rbx*8]
-	movhpd xmm1, [rsi + rbx*8 + 8]
-	movhpd xmm2, [rsi + rbx*8 + 16]		
-
-	mov    rdi, [rbp + nb400nf_faction]
-	
-	;# move nb400nf_ix-iz to xmm4-xmm6 
-	movapd xmm4, [rsp + nb400nf_ix]
-	movapd xmm5, [rsp + nb400nf_iy]
-	movapd xmm6, [rsp + nb400nf_iz]
-
-	;# calc dr 
-	subpd xmm4, xmm0
-	subpd xmm5, xmm1
-	subpd xmm6, xmm2
-
-	;# square it 
-	mulpd xmm4,xmm4
-	mulpd xmm5,xmm5
-	mulpd xmm6,xmm6
-	addpd xmm4, xmm5
-	addpd xmm4, xmm6
-	;# rsq in xmm4 
-
-	cvtpd2ps xmm5, xmm4	
-	rsqrtps xmm5, xmm5
-	cvtps2pd xmm2, xmm5	;# lu in low xmm2 
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulpd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [rsp + nb400nf_three]
-	mulpd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb400nf_half]
-	subpd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulpd xmm1, xmm5	
-	mulpd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulpd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [rsp + nb400nf_three]
-	mulpd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb400nf_half]
-	subpd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulpd xmm2, xmm5	
-	mulpd xmm0, xmm2	;# xmm0=iter2 of rinv (new lu) 
-	mulpd xmm4, xmm0	;# xmm4=r 
-	mulpd xmm4, [rsp + nb400nf_gbscale]
-
-	cvttpd2pi mm6, xmm4	;# mm6 = lu idx 
-	cvtpi2pd xmm5, mm6
-	subpd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	movapd xmm2, xmm1	
-	mulpd  xmm2, xmm2	;# xmm2=eps2 
-	
-	pslld mm6, 2		;# idx *= 4 
-	
-	movd mm0, eax	
-	movd mm1, ebx
-
-	mov  rsi, [rbp + nb400nf_GBtab]
-	movd eax, mm6
-	psrlq mm6, 32
-	movd ebx, mm6		;# indices in eax/ebx 
-
-	movapd xmm4, [rsi + rax*8]	;# Y1 F1 	
-	movapd xmm3, [rsi + rbx*8]	;# Y2 F2 
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 Y2 
-	unpckhpd xmm5, xmm3	;# F1 F2 
-
-	movapd xmm6, [rsi + rax*8 + 16]	;# G1 H1 	
-	movapd xmm3, [rsi + rbx*8 + 16]	;# G2 H2 
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 G2 
-	unpckhpd xmm7, xmm3	;# H1 H2 
-	;# coulomb table ready, in xmm4-xmm7  		
-	mulpd  xmm6, xmm1	;# xmm6=Geps 
-	mulpd  xmm7, xmm2	;# xmm7=Heps2 
-	addpd  xmm5, xmm6
-	addpd  xmm5, xmm7	;# xmm5=Fp 	
-	movapd xmm3, [rsp + nb400nf_qq]
-	mulpd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addpd  xmm5, xmm4 ;# xmm5=VV 
-	mulpd  xmm5, xmm3 ;# vcoul=qq*VV  
-	addpd  xmm5, [rsp + nb400nf_vctot]
-	movapd [rsp + nb400nf_vctot], xmm5  
-	
-	;# should we do one more iteration? 
-	sub dword ptr [rsp + nb400nf_innerk],  2
-	jl    .nb400nf_checksingle
-	jmp   .nb400nf_unroll_loop
-.nb400nf_checksingle:
-	mov   edx, [rsp + nb400nf_innerk]
-	and   edx, 1
-	jnz    .nb400nf_dosingle
-	jmp    .nb400nf_updateouterdata
-.nb400nf_dosingle:
-	mov rsi, [rbp + nb400nf_charge]
-	mov rdx, [rbp + nb400nf_invsqrta]
-	mov rdi, [rbp + nb400nf_pos]
-	mov   rcx, [rsp + nb400nf_innerjjnr]
-	mov   eax, [rcx]	
-	xorpd  xmm6, xmm6
-	movapd xmm7, xmm6
-	movsd  xmm7, [rdx + rax*8]
-	movlpd xmm6, [rsi + rax*8]	;# xmm6(0) has the charge
-	mulsd  xmm7, [rsp + nb400nf_isai]
-	movapd [rsp + nb400nf_isaprod], xmm7
-	movapd xmm1, xmm7
-	mulpd xmm1, [rsp + nb400nf_gbtsc]
-	movapd [rsp + nb400nf_gbscale], xmm1
-	
-	mulsd  xmm7, [rsp + nb400nf_iq]
-	mulsd  xmm6, xmm7
-	movapd [rsp + nb400nf_qq], xmm6
-	
-	lea   rax, [rax + rax*2]
-	
-	;# move coordinates to xmm0-xmm2 
-	movlpd xmm0, [rdi + rax*8]
-	movlpd xmm1, [rdi + rax*8 + 8]
-	movlpd xmm2, [rdi + rax*8 + 16]
-
-	;# move nb400nf_ix-iz to xmm4-xmm6 
-	movapd xmm4, [rsp + nb400nf_ix]
-	movapd xmm5, [rsp + nb400nf_iy]
-	movapd xmm6, [rsp + nb400nf_iz]
-
-	;# calc dr 
-	subsd xmm4, xmm0
-	subsd xmm5, xmm1
-	subsd xmm6, xmm2
-
-	;# square it 
-	mulsd xmm4,xmm4
-	mulsd xmm5,xmm5
-	mulsd xmm6,xmm6
-	addsd xmm4, xmm5
-	addsd xmm4, xmm6
-	;# rsq in xmm4 
-
-	cvtsd2ss xmm5, xmm4	
-	rsqrtss xmm5, xmm5
-	cvtss2sd xmm2, xmm5	;# lu in low xmm2 
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulsd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [rsp + nb400nf_three]
-	mulsd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb400nf_half]
-	subsd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulsd xmm1, xmm5	
-	mulsd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulsd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [rsp + nb400nf_three]
-	mulsd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb400nf_half]
-	subsd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulsd xmm2, xmm5	
-	mulsd xmm0, xmm2	;# xmm0=iter2 of rinv (new lu) 
-	
-	mulsd xmm4, xmm0	;# xmm4=r 
-	mulsd xmm4, [rsp + nb400nf_gbscale]
-	
-	movd mm0, eax	
-
-	cvttsd2si eax, xmm4	;# mm6 = lu idx 
-	cvtsi2sd xmm5, eax
-	subsd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	movapd xmm2, xmm1	
-	mulsd  xmm2, xmm2	;# xmm2=eps2 
-	
-	shl eax, 2		;# idx *= 4 
-	
-	mov  rsi, [rbp + nb400nf_GBtab]
-
-	;# Coulomb 
-	movapd xmm4, [rsi + rax*8]	;# Y1 F1 
-	xorpd xmm3, xmm3
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1  
-	unpckhpd xmm5, xmm3	;# F1  
-
-	movapd xmm6, [rsi + rax*8 + 16]	;# G1 H1 
-	xorpd xmm3, xmm3
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1  
-	unpckhpd xmm7, xmm3	;# H1  	
-	;# table ready in xmm4-xmm7 
-
-	mulsd  xmm6, xmm1	;# xmm6=Geps 
-	mulsd  xmm7, xmm2	;# xmm7=Heps2 
-	addsd  xmm5, xmm6
-	addsd  xmm5, xmm7	;# xmm5=Fp 	
-	movapd xmm3, [rsp + nb400nf_qq]
-	mulsd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addsd  xmm5, xmm4 ;# xmm5=VV 
-	mulsd  xmm5, xmm3 ;# vcoul=qq*VV  
-	addsd  xmm5, [rsp + nb400nf_vctot]
-	movsd [rsp + nb400nf_vctot], xmm5
-	
-.nb400nf_updateouterdata:
-	;# get n from stack
-	mov esi, [rsp + nb400nf_n]
-        ;# get group index for i particle 
-        mov   rdx, [rbp + nb400nf_gid]      	;# base of gid[]
-        mov   edx, [rdx + rsi*4]		;# ggid=gid[n]
-
-	;# accumulate total potential energy and update it 
-	movapd xmm7, [rsp + nb400nf_vctot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addsd  xmm7, xmm6	;# low xmm7 has the sum now 
-
-	;# add earlier value from mem 
-	mov   rax, [rbp + nb400nf_Vc]
-	addsd xmm7, [rax + rdx*8] 
-	;# move back to mem 
-	movsd [rax + rdx*8], xmm7 
-	
-        ;# finish if last 
-        mov ecx, [rsp + nb400nf_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb400nf_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [rsp + nb400nf_n], esi
-        jmp .nb400nf_outer
-.nb400nf_outerend:
-        ;# check if more outer neighborlists remain
-        mov   ecx, [rsp + nb400nf_nri]
-	;# esi already loaded with n above
-        sub   ecx, esi
-        jz .nb400nf_end
-        ;# non-zero, do one more workunit
-        jmp   .nb400nf_threadloop
-.nb400nf_end:
-
-	mov eax, [rsp + nb400nf_nouter]
-	mov ebx, [rsp + nb400nf_ninner]
-	mov rcx, [rbp + nb400nf_outeriter]
-	mov rdx, [rbp + nb400nf_inneriter]
-	mov [rcx], eax
-	mov [rdx], ebx
-
-	add rsp, 296
-	emms
-
-
-        pop r15
-        pop r14
-        pop r13
-        pop r12
-
-	pop rbx
-	pop	rbp
-	ret
-
-
-
-
diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel400_x86_64_sse2.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel400_x86_64_sse2.s
deleted file mode 100644
index b75ce2037c..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel400_x86_64_sse2.s
+++ /dev/null
@@ -1,1212 +0,0 @@
-##
-##
-## Gromacs 4.0                         Copyright (c) 1991-2003 
-## David van der Spoel, Erik Lindahl
-##
-## This program is free software; you can redistribute it and/or
-## modify it under the terms of the GNU General Public License
-## as published by the Free Software Foundation; either version 2
-## of the License, or (at your option) any later version.
-##
-## To help us fund GROMACS development, we humbly ask that you cite
-## the research papers on the package. Check out http://www.gromacs.org
-## 
-## And Hey:
-## Gnomes, ROck Monsters And Chili Sauce
-##
-
-
-
-
-
-.globl nb_kernel400_x86_64_sse2
-.globl _nb_kernel400_x86_64_sse2
-nb_kernel400_x86_64_sse2:       
-_nb_kernel400_x86_64_sse2:      
-##      Room for return address and rbp (16 bytes)
-.set nb400_fshift, 16
-.set nb400_gid, 24
-.set nb400_pos, 32
-.set nb400_faction, 40
-.set nb400_charge, 48
-.set nb400_p_facel, 56
-.set nb400_argkrf, 64
-.set nb400_argcrf, 72
-.set nb400_Vc, 80
-.set nb400_type, 88
-.set nb400_p_ntype, 96
-.set nb400_vdwparam, 104
-.set nb400_Vvdw, 112
-.set nb400_p_tabscale, 120
-.set nb400_VFtab, 128
-.set nb400_invsqrta, 136
-.set nb400_dvda, 144
-.set nb400_p_gbtabscale, 152
-.set nb400_GBtab, 160
-.set nb400_p_nthreads, 168
-.set nb400_count, 176
-.set nb400_mtx, 184
-.set nb400_outeriter, 192
-.set nb400_inneriter, 200
-.set nb400_work, 208
-        ## stack offsets for local variables  
-        ## bottom of stack is cache-aligned for sse2 use 
-.set nb400_ix, 0
-.set nb400_iy, 16
-.set nb400_iz, 32
-.set nb400_iq, 48
-.set nb400_dx, 64
-.set nb400_dy, 80
-.set nb400_dz, 96
-.set nb400_two, 112
-.set nb400_gbtsc, 128
-.set nb400_qq, 144
-.set nb400_r, 160
-.set nb400_vctot, 176
-.set nb400_fix, 192
-.set nb400_fiy, 208
-.set nb400_fiz, 224
-.set nb400_half, 240
-.set nb400_three, 256
-.set nb400_isai, 272
-.set nb400_isaprod, 288
-.set nb400_dvdasum, 304
-.set nb400_gbscale, 320
-.set nb400_nri, 336
-.set nb400_iinr, 344
-.set nb400_jindex, 352
-.set nb400_jjnr, 360
-.set nb400_shift, 368
-.set nb400_shiftvec, 376
-.set nb400_facel, 384
-.set nb400_innerjjnr, 392
-.set nb400_is3, 400
-.set nb400_ii3, 404
-.set nb400_ii, 408
-.set nb400_innerk, 412
-.set nb400_n, 416
-.set nb400_nn1, 420
-.set nb400_nouter, 424
-.set nb400_ninner, 428
-        push %rbp
-        movq %rsp,%rbp
-        push %rbx
-
-
-        emms
-
-        push %r12
-        push %r13
-        push %r14
-        push %r15
-
-        subq $440,%rsp          ## local variable stack space (n*16+8)
-
-        ## zero 32-bit iteration counters
-        movl $0,%eax
-        movl %eax,nb400_nouter(%rsp)
-        movl %eax,nb400_ninner(%rsp)
-
-        movl (%rdi),%edi
-        movl %edi,nb400_nri(%rsp)
-        movq %rsi,nb400_iinr(%rsp)
-        movq %rdx,nb400_jindex(%rsp)
-        movq %rcx,nb400_jjnr(%rsp)
-        movq %r8,nb400_shift(%rsp)
-        movq %r9,nb400_shiftvec(%rsp)
-        movq nb400_p_facel(%rbp),%rsi
-        movsd (%rsi),%xmm0
-        movsd %xmm0,nb400_facel(%rsp)
-
-        movq nb400_p_gbtabscale(%rbp),%rbx
-        movsd (%rbx),%xmm4
-        shufpd $0,%xmm4,%xmm4
-        movapd %xmm4,nb400_gbtsc(%rsp)
-
-        ## create constant floating-point factors on stack
-        movl $0x00000000,%eax   ## lower half of double half IEEE (hex)
-        movl $0x3fe00000,%ebx
-        movl %eax,nb400_half(%rsp)
-        movl %ebx,nb400_half+4(%rsp)
-        movsd nb400_half(%rsp),%xmm1
-        shufpd $0,%xmm1,%xmm1  ## splat to all elements
-        movapd %xmm1,%xmm3
-        addpd  %xmm3,%xmm3      ## one
-        movapd %xmm3,%xmm2
-        addpd  %xmm2,%xmm2      ## two
-        addpd  %xmm2,%xmm3      ## three
-        movapd %xmm1,nb400_half(%rsp)
-        movapd %xmm2,nb400_two(%rsp)
-        movapd %xmm3,nb400_three(%rsp)
-
-_nb_kernel400_x86_64_sse2.nb400_threadloop: 
-        movq  nb400_count(%rbp),%rsi            ## pointer to sync counter
-        movl  (%rsi),%eax
-_nb_kernel400_x86_64_sse2.nb400_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+10
-        lock 
-        cmpxchgl %ebx,(%rsi)                    ## write nn1 to *counter,
-                                                ## if it hasnt changed.
-                                                ## or reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel400_x86_64_sse2.nb400_spinlock
-
-        ## if(nn1>nri) nn1=nri
-        movl nb400_nri(%rsp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx
-        cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb400_n(%rsp)
-        movl %ebx,nb400_nn1(%rsp)
-        subl %eax,%ebx                          ## calc number of outer lists
-        movl %eax,%esi                          ## copy n to esi
-        jg  _nb_kernel400_x86_64_sse2.nb400_outerstart
-        jmp _nb_kernel400_x86_64_sse2.nb400_end
-
-_nb_kernel400_x86_64_sse2.nb400_outerstart: 
-        ## ebx contains number of outer iterations
-        addl nb400_nouter(%rsp),%ebx
-        movl %ebx,nb400_nouter(%rsp)
-
-_nb_kernel400_x86_64_sse2.nb400_outer: 
-        movq  nb400_shift(%rsp),%rax        ## rax = pointer into shift[] 
-        movl  (%rax,%rsi,4),%ebx        ## rbx=shift[n] 
-
-        lea  (%rbx,%rbx,2),%rbx    ## rbx=3*is 
-        movl  %ebx,nb400_is3(%rsp)      ## store is3 
-
-        movq  nb400_shiftvec(%rsp),%rax     ## rax = base of shiftvec[] 
-
-        movsd (%rax,%rbx,8),%xmm0
-        movsd 8(%rax,%rbx,8),%xmm1
-        movsd 16(%rax,%rbx,8),%xmm2
-
-        movq  nb400_iinr(%rsp),%rcx         ## rcx = pointer into iinr[]        
-        movl  (%rcx,%rsi,4),%ebx    ## ebx =ii 
-        movl  %ebx,nb400_ii(%rsp)
-
-        movq  nb400_charge(%rbp),%rdx
-        movsd (%rdx,%rbx,8),%xmm3
-        mulsd nb400_facel(%rsp),%xmm3
-        shufpd $0,%xmm3,%xmm3
-
-        movq  nb400_invsqrta(%rbp),%rdx         ## load invsqrta[ii]
-        movsd (%rdx,%rbx,8),%xmm4
-        shufpd $0,%xmm4,%xmm4
-
-        lea  (%rbx,%rbx,2),%rbx        ## rbx = 3*ii=ii3 
-        movq  nb400_pos(%rbp),%rax      ## rax = base of pos[]  
-
-        addsd (%rax,%rbx,8),%xmm0
-        addsd 8(%rax,%rbx,8),%xmm1
-        addsd 16(%rax,%rbx,8),%xmm2
-
-        movapd %xmm3,nb400_iq(%rsp)
-        movapd %xmm4,nb400_isai(%rsp)
-
-        shufpd $0,%xmm0,%xmm0
-        shufpd $0,%xmm1,%xmm1
-        shufpd $0,%xmm2,%xmm2
-
-        movapd %xmm0,nb400_ix(%rsp)
-        movapd %xmm1,nb400_iy(%rsp)
-        movapd %xmm2,nb400_iz(%rsp)
-
-        movl  %ebx,nb400_ii3(%rsp)
-
-        ## clear vctot and i forces 
-        xorpd %xmm4,%xmm4
-        movapd %xmm4,%xmm8
-        movapd %xmm4,%xmm12
-        movapd %xmm4,%xmm13
-        movapd %xmm4,%xmm14
-        movapd %xmm4,%xmm15
-
-        movq  nb400_jindex(%rsp),%rax
-        movl  (%rax,%rsi,4),%ecx             ## jindex[n] 
-        movl  4(%rax,%rsi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movq  nb400_pos(%rbp),%rsi
-        movq  nb400_faction(%rbp),%rdi
-        movq  nb400_jjnr(%rsp),%rax
-        shll  $2,%ecx
-        addq  %rcx,%rax
-        movq  %rax,nb400_innerjjnr(%rsp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $2,%edx
-        addl  nb400_ninner(%rsp),%ecx
-        movl  %ecx,nb400_ninner(%rsp)
-        addl  $0,%edx
-        movl  %edx,nb400_innerk(%rsp)      ## number of innerloop atoms 
-        jge   _nb_kernel400_x86_64_sse2.nb400_unroll_loop
-        jmp   _nb_kernel400_x86_64_sse2.nb400_checksingle
-_nb_kernel400_x86_64_sse2.nb400_unroll_loop: 
-        ## twice unrolled innerloop here 
-        movq  nb400_innerjjnr(%rsp),%rdx     ## pointer to jjnr[k] 
-        movl  (%rdx),%r12d
-        movl  4(%rdx),%r13d
-        addq $8,nb400_innerjjnr(%rsp)                   ## advance pointer (unrolled 2) 
-
-        movq nb400_pos(%rbp),%rsi               ## base of pos[] 
-
-        lea  (%r12,%r12,2),%r8     ## j3 
-        lea  (%r13,%r13,2),%r9
-
-        ## move two coordinates to xmm4-xmm6
-        movlpd (%rsi,%r8,8),%xmm4
-        movlpd 8(%rsi,%r8,8),%xmm5
-        movlpd 16(%rsi,%r8,8),%xmm6
-        movhpd (%rsi,%r9,8),%xmm4
-        movhpd 8(%rsi,%r9,8),%xmm5
-        movhpd 16(%rsi,%r9,8),%xmm6
-
-        ## calc dr 
-        subpd nb400_ix(%rsp),%xmm4
-        subpd nb400_iy(%rsp),%xmm5
-        subpd nb400_iz(%rsp),%xmm6
-
-
-        ## store dr 
-        movapd %xmm4,%xmm9
-        movapd %xmm5,%xmm10
-        movapd %xmm6,%xmm11
-
-        ## square it 
-        mulpd %xmm4,%xmm4
-        mulpd %xmm5,%xmm5
-        mulpd %xmm6,%xmm6
-        addpd %xmm5,%xmm4
-        addpd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        movq nb400_invsqrta(%rbp),%rsi
-        movlpd (%rsi,%r12,8),%xmm3
-
-        cvtpd2ps %xmm4,%xmm5
-        rsqrtps %xmm5,%xmm5
-        cvtps2pd %xmm5,%xmm2    ## lu in low xmm2 
-
-        movhpd (%rsi,%r13,8),%xmm3
-        mulpd  nb400_isai(%rsp),%xmm3
-        movapd %xmm3,nb400_isaprod(%rsp)
-    movapd %xmm3,%xmm6
-        mulpd nb400_gbtsc(%rsp),%xmm3
-        movapd %xmm3,nb400_gbscale(%rsp)
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulpd %xmm2,%xmm2       ## lu*lu 
-        movapd nb400_three(%rsp),%xmm1
-        mulpd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb400_half(%rsp),%xmm0
-        subpd %xmm2,%xmm1       ## 30-rsq*lu*lu 
-        mulpd %xmm5,%xmm1
-        mulpd %xmm0,%xmm1       ## xmm0=iter1 of rinv (new lu) 
-
-        movq nb400_charge(%rbp),%rsi     ## base of charge[] 
-        movlpd (%rsi,%r12,8),%xmm3
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulpd %xmm1,%xmm1       ## lu*lu 
-        movapd nb400_three(%rsp),%xmm2
-        mulpd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb400_half(%rsp),%xmm0
-        subpd %xmm1,%xmm2       ## 30-rsq*lu*lu 
-        mulpd %xmm5,%xmm2
-        mulpd %xmm2,%xmm0       ## xmm0=iter2 of rinv (new lu) 
-        mulpd %xmm0,%xmm4       ## xmm4=r 
-
-    mulpd  nb400_iq(%rsp),%xmm6
-        movhpd (%rsi,%r13,8),%xmm3
-        mulpd  %xmm6,%xmm3
-        movapd %xmm3,nb400_qq(%rsp)
-
-
-        movapd %xmm4,nb400_r(%rsp)
-        mulpd nb400_gbscale(%rsp),%xmm4
-
-        cvttpd2pi %xmm4,%mm6    ## mm6 = lu idx 
-        cvtpi2pd %mm6,%xmm5
-        subpd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-
-        pslld $2,%mm6           ## idx *= 4 
-
-        movq nb400_GBtab(%rbp),%rsi
-        movd %mm6,%r10d
-        psrlq $32,%mm6
-        movd %mm6,%r11d         ## indices in r10/r11
-
-        movapd (%rsi,%r10,8),%xmm4      ## Y1 F1        
-        movapd (%rsi,%r11,8),%xmm3      ## Y2 F2 
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 Y2 
-        unpckhpd %xmm3,%xmm5    ## F1 F2 
-
-        movapd 16(%rsi,%r10,8),%xmm6    ## G1 H1        
-        movapd 16(%rsi,%r11,8),%xmm3    ## G2 H2 
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 G2 
-        unpckhpd %xmm3,%xmm7    ## H1 H2 
-        ## coulomb table ready, in xmm4-xmm7            
-
-        mulpd  %xmm1,%xmm7      ## xmm7=Heps
-        mulpd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulpd  %xmm1,%xmm7      ## xmm7=Heps2 
-        addpd  %xmm6,%xmm5
-        addpd  %xmm7,%xmm5      ## xmm5=Fp      
-        addpd  %xmm7,%xmm7      ## two*Heps2 
-        movapd nb400_qq(%rsp),%xmm3
-        addpd  %xmm6,%xmm7
-        addpd  %xmm5,%xmm7 ## xmm7=FF 
-        mulpd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addpd  %xmm4,%xmm5 ## xmm5=VV 
-        mulpd  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulpd  %xmm7,%xmm3 ## fijC=FF*qq 
-
-        movq nb400_dvda(%rbp),%rsi
-
-        ## Calculate dVda
-        xorpd %xmm7,%xmm7
-        mulpd nb400_gbscale(%rsp),%xmm3
-        movapd %xmm3,%xmm6
-        mulpd  nb400_r(%rsp),%xmm6
-        addpd  %xmm5,%xmm6
-
-    ## update vctot
-        addpd  %xmm5,%xmm12
-
-        ## xmm6=(vcoul+fijC*r)
-        subpd  %xmm6,%xmm7
-        movapd %xmm7,%xmm6
-
-        ## update dvdasum
-        addpd  %xmm7,%xmm8
-
-        ## update j atoms dvdaj
-        movhlps %xmm6,%xmm7
-        addsd  (%rsi,%r12,8),%xmm6
-        addsd  (%rsi,%r13,8),%xmm7
-        movsd  %xmm6,(%rsi,%r12,8)
-        movsd  %xmm7,(%rsi,%r13,8)
-
-        ## the fj's - start by accumulating forces from memory 
-    movq nb400_faction(%rbp),%rdi
-        movlpd (%rdi,%r8,8),%xmm5
-        movlpd 8(%rdi,%r8,8),%xmm6
-        movlpd 16(%rdi,%r8,8),%xmm7
-        movhpd (%rdi,%r9,8),%xmm5
-        movhpd 8(%rdi,%r9,8),%xmm6
-        movhpd 16(%rdi,%r9,8),%xmm7
-
-        xorpd  %xmm4,%xmm4
-
-        mulpd %xmm0,%xmm3
-        subpd  %xmm3,%xmm4
-
-        movq   nb400_faction(%rbp),%rdi
-        mulpd  %xmm4,%xmm9
-        mulpd  %xmm4,%xmm10
-        mulpd  %xmm4,%xmm11
-
-        addpd %xmm9,%xmm5
-        addpd %xmm10,%xmm6
-        addpd %xmm11,%xmm7
-
-        ## now update f_i 
-        addpd  %xmm9,%xmm13
-        addpd  %xmm10,%xmm14
-        addpd  %xmm11,%xmm15
-
-        movlpd %xmm5,(%rdi,%r8,8)
-        movlpd %xmm6,8(%rdi,%r8,8)
-        movlpd %xmm7,16(%rdi,%r8,8)
-        movhpd %xmm5,(%rdi,%r9,8)
-        movhpd %xmm6,8(%rdi,%r9,8)
-        movhpd %xmm7,16(%rdi,%r9,8)
-
-        ## should we do one more iteration? 
-        subl $2,nb400_innerk(%rsp)
-        jl    _nb_kernel400_x86_64_sse2.nb400_checksingle
-        jmp   _nb_kernel400_x86_64_sse2.nb400_unroll_loop
-_nb_kernel400_x86_64_sse2.nb400_checksingle: 
-        movl  nb400_innerk(%rsp),%edx
-        andl  $1,%edx
-        jnz    _nb_kernel400_x86_64_sse2.nb400_dosingle
-        jmp    _nb_kernel400_x86_64_sse2.nb400_updateouterdata
-_nb_kernel400_x86_64_sse2.nb400_dosingle: 
-        movq nb400_charge(%rbp),%rsi
-        movq nb400_invsqrta(%rbp),%rdx
-        movq nb400_pos(%rbp),%rdi
-        movq  nb400_innerjjnr(%rsp),%rcx
-        movl  (%rcx),%eax
-
-        ## load isaj
-        movq nb400_invsqrta(%rbp),%rsi
-        movsd (%rsi,%rax,8),%xmm2
-        mulsd  nb400_isai(%rsp),%xmm2
-        movapd %xmm2,nb400_isaprod(%rsp)
-        movapd %xmm2,%xmm1
-        mulsd nb400_gbtsc(%rsp),%xmm1
-        movapd %xmm1,nb400_gbscale(%rsp)
-
-    mulsd nb400_iq(%rsp),%xmm2
-        movq nb400_charge(%rbp),%rsi     ## base of charge[] 
-        movsd (%rsi,%rax,8),%xmm3
-        mulsd  %xmm2,%xmm3
-        movapd %xmm3,nb400_qq(%rsp)
-
-        movq nb400_pos(%rbp),%rsi               ## base of pos[] 
-
-        lea  (%rax,%rax,2),%r8     ## j3 
-
-        ## move coordinate to xmm4-xmm6
-        movsd (%rsi,%r8,8),%xmm4
-        movsd 8(%rsi,%r8,8),%xmm5
-        movsd 16(%rsi,%r8,8),%xmm6
-
-        movq   nb400_faction(%rbp),%rdi
-
-        ## calc dr 
-        subsd nb400_ix(%rsp),%xmm4
-        subsd nb400_iy(%rsp),%xmm5
-        subsd nb400_iz(%rsp),%xmm6
-
-        ## store dr 
-        movapd %xmm4,%xmm9
-        movapd %xmm5,%xmm10
-        movapd %xmm6,%xmm11
-
-        ## square it 
-        mulsd %xmm4,%xmm4
-        mulsd %xmm5,%xmm5
-        mulsd %xmm6,%xmm6
-        addsd %xmm5,%xmm4
-        addsd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        cvtsd2ss %xmm4,%xmm5
-        rsqrtss %xmm5,%xmm5
-        cvtss2sd %xmm5,%xmm2    ## lu in low xmm2 
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulsd %xmm2,%xmm2       ## lu*lu 
-        movapd nb400_three(%rsp),%xmm1
-        mulsd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb400_half(%rsp),%xmm0
-        subsd %xmm2,%xmm1       ## 30-rsq*lu*lu 
-        mulsd %xmm5,%xmm1
-        mulsd %xmm0,%xmm1       ## xmm0=iter1 of rinv (new lu) 
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulsd %xmm1,%xmm1       ## lu*lu 
-        movapd nb400_three(%rsp),%xmm2
-        mulsd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb400_half(%rsp),%xmm0
-        subsd %xmm1,%xmm2       ## 30-rsq*lu*lu 
-        mulsd %xmm5,%xmm2
-        mulsd %xmm2,%xmm0       ## xmm0=iter2 of rinv (new lu) 
-        mulsd %xmm0,%xmm4       ## xmm4=r 
-
-        movapd %xmm4,nb400_r(%rsp)
-        mulsd nb400_gbscale(%rsp),%xmm4
-
-        cvttsd2si %xmm4,%r10d   ## mm6 = lu idx 
-        cvtsi2sd %r10d,%xmm5
-        subsd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-
-        shll $2,%r10d           ## idx *= 4 
-
-        movq nb400_GBtab(%rbp),%rsi
-
-        movapd (%rsi,%r10,8),%xmm4      ## Y1 F1        
-        movhlps %xmm4,%xmm5
-        movapd 16(%rsi,%r10,8),%xmm6    ## G1 H1        
-    movhlps %xmm6,%xmm7
-        ## coulomb table ready, in xmm4-xmm7            
-
-        mulsd  %xmm1,%xmm7      ## xmm7=Heps
-        mulsd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulsd  %xmm1,%xmm7      ## xmm7=Heps2 
-        addsd  %xmm6,%xmm5
-        addsd  %xmm7,%xmm5      ## xmm5=Fp      
-        addsd  %xmm7,%xmm7      ## two*Heps2 
-        movapd nb400_qq(%rsp),%xmm3
-        addsd  %xmm6,%xmm7
-        addsd  %xmm5,%xmm7 ## xmm7=FF 
-        mulsd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addsd  %xmm4,%xmm5 ## xmm5=VV 
-        mulsd  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulsd  %xmm7,%xmm3 ## fijC=FF*qq 
-
-        movq nb400_dvda(%rbp),%rsi
-
-        ## Calculate dVda
-        xorpd %xmm7,%xmm7
-        mulsd nb400_gbscale(%rsp),%xmm3
-        movapd %xmm3,%xmm6
-        mulsd  nb400_r(%rsp),%xmm6
-        addsd  %xmm5,%xmm6
-
-    ## update vctot
-        addsd  %xmm5,%xmm12
-
-        ## xmm6=(vcoul+fijC*r)
-        subsd  %xmm6,%xmm7
-        movapd %xmm7,%xmm6
-
-        ## update dvdasum
-        addsd  %xmm7,%xmm8
-
-        ## update j atoms dvdaj
-        addsd  (%rsi,%rax,8),%xmm6
-        movsd  %xmm6,(%rsi,%rax,8)
-
-        xorpd  %xmm4,%xmm4
-
-        mulsd %xmm0,%xmm3
-        subsd  %xmm3,%xmm4
-
-        movq   nb400_faction(%rbp),%rdi
-        mulsd  %xmm4,%xmm9
-        mulsd  %xmm4,%xmm10
-        mulsd  %xmm4,%xmm11
-
-        ## now update f_i 
-        addsd  %xmm9,%xmm13
-        addsd  %xmm10,%xmm14
-        addsd  %xmm11,%xmm15
-
-        ## the fj's - start by accumulating forces from memory 
-    movq nb400_faction(%rbp),%rdi
-        addsd (%rdi,%r8,8),%xmm9
-        addsd 8(%rdi,%r8,8),%xmm10
-        addsd 16(%rdi,%r8,8),%xmm11
-        movsd %xmm9,(%rdi,%r8,8)
-        movsd %xmm10,8(%rdi,%r8,8)
-        movsd %xmm11,16(%rdi,%r8,8)
-
-_nb_kernel400_x86_64_sse2.nb400_updateouterdata: 
-        movl  nb400_ii3(%rsp),%ecx
-        movq  nb400_faction(%rbp),%rdi
-        movq  nb400_fshift(%rbp),%rsi
-        movl  nb400_is3(%rsp),%edx
-
-        ## accumulate i forces in xmm13, xmm14, xmm15
-        movhlps %xmm13,%xmm3
-        movhlps %xmm14,%xmm4
-        movhlps %xmm15,%xmm5
-        addsd  %xmm3,%xmm13
-        addsd  %xmm4,%xmm14
-        addsd  %xmm5,%xmm15 ## sum is in low xmm13-xmm15
-
-        ## increment i force 
-        movsd  (%rdi,%rcx,8),%xmm3
-        movsd  8(%rdi,%rcx,8),%xmm4
-        movsd  16(%rdi,%rcx,8),%xmm5
-        subsd  %xmm13,%xmm3
-        subsd  %xmm14,%xmm4
-        subsd  %xmm15,%xmm5
-        movsd  %xmm3,(%rdi,%rcx,8)
-        movsd  %xmm4,8(%rdi,%rcx,8)
-        movsd  %xmm5,16(%rdi,%rcx,8)
-
-        ## increment fshift force  
-        movsd  (%rsi,%rdx,8),%xmm3
-        movsd  8(%rsi,%rdx,8),%xmm4
-        movsd  16(%rsi,%rdx,8),%xmm5
-        subsd  %xmm13,%xmm3
-        subsd  %xmm14,%xmm4
-        subsd  %xmm15,%xmm5
-        movsd  %xmm3,(%rsi,%rdx,8)
-        movsd  %xmm4,8(%rsi,%rdx,8)
-        movsd  %xmm5,16(%rsi,%rdx,8)
-
-        ## get n from stack
-        movl nb400_n(%rsp),%esi
-        ## get group index for i particle 
-        movq  nb400_gid(%rbp),%rdx              ## base of gid[]
-        movl  (%rdx,%rsi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total coulomb energy and update it 
-        movhlps %xmm12,%xmm6
-        addsd  %xmm6,%xmm12     ## low xmm12 have the sum now 
-
-        ## add earlier value from mem 
-        movq  nb400_Vc(%rbp),%rax
-        addsd (%rax,%rdx,8),%xmm12
-        ## move back to mem 
-        movsd %xmm12,(%rax,%rdx,8)
-
-        ## accumulate dVda and update it 
-        movhlps %xmm8,%xmm6
-        addsd  %xmm6,%xmm8      ## low xmm8 has the sum now 
-
-        movl nb400_ii(%rsp),%edx
-        movq nb400_dvda(%rbp),%rax
-        addsd (%rax,%rdx,8),%xmm8
-        movsd %xmm8,(%rax,%rdx,8)
-
-        ## finish if last 
-        movl nb400_nn1(%rsp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx
-        jz _nb_kernel400_x86_64_sse2.nb400_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb400_n(%rsp)
-        jmp _nb_kernel400_x86_64_sse2.nb400_outer
-_nb_kernel400_x86_64_sse2.nb400_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb400_nri(%rsp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx
-        jz _nb_kernel400_x86_64_sse2.nb400_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel400_x86_64_sse2.nb400_threadloop
-_nb_kernel400_x86_64_sse2.nb400_end: 
-        movl nb400_nouter(%rsp),%eax
-        movl nb400_ninner(%rsp),%ebx
-        movq nb400_outeriter(%rbp),%rcx
-        movq nb400_inneriter(%rbp),%rdx
-        movl %eax,(%rcx)
-        movl %ebx,(%rdx)
-
-        addq $440,%rsp
-        emms
-
-
-        pop %r15
-        pop %r14
-        pop %r13
-        pop %r12
-
-        pop %rbx
-        pop    %rbp
-        ret
-
-
-
-
-
-
-
-.globl nb_kernel400nf_x86_64_sse2
-.globl _nb_kernel400nf_x86_64_sse2
-nb_kernel400nf_x86_64_sse2:     
-_nb_kernel400nf_x86_64_sse2:    
-.set nb400nf_fshift, 16
-.set nb400nf_gid, 24
-.set nb400nf_pos, 32
-.set nb400nf_faction, 40
-.set nb400nf_charge, 48
-.set nb400nf_p_facel, 56
-.set nb400nf_argkrf, 64
-.set nb400nf_argcrf, 72
-.set nb400nf_Vc, 80
-.set nb400nf_type, 88
-.set nb400nf_p_ntype, 96
-.set nb400nf_vdwparam, 104
-.set nb400nf_Vvdw, 112
-.set nb400nf_p_tabscale, 120
-.set nb400nf_VFtab, 128
-.set nb400nf_invsqrta, 136
-.set nb400nf_dvda, 144
-.set nb400nf_p_gbtabscale, 152
-.set nb400nf_GBtab, 160
-.set nb400nf_p_nthreads, 168
-.set nb400nf_count, 176
-.set nb400nf_mtx, 184
-.set nb400nf_outeriter, 192
-.set nb400nf_inneriter, 200
-.set nb400nf_work, 208
-        ## stack offsets for local variables  
-        ## bottom of stack is cache-aligned for sse2 use 
-.set nb400nf_ix, 0
-.set nb400nf_iy, 16
-.set nb400nf_iz, 32
-.set nb400nf_iq, 48
-.set nb400nf_gbtsc, 64
-.set nb400nf_qq, 80
-.set nb400nf_vctot, 96
-.set nb400nf_half, 112
-.set nb400nf_three, 128
-.set nb400nf_isai, 144
-.set nb400nf_isaprod, 160
-.set nb400nf_gbscale, 176
-.set nb400nf_nri, 192
-.set nb400nf_iinr, 200
-.set nb400nf_jindex, 208
-.set nb400nf_jjnr, 216
-.set nb400nf_shift, 224
-.set nb400nf_shiftvec, 232
-.set nb400nf_facel, 240
-.set nb400nf_innerjjnr, 248
-.set nb400nf_is3, 256
-.set nb400nf_ii3, 260
-.set nb400nf_innerk, 264
-.set nb400nf_n, 268
-.set nb400nf_nn1, 272
-.set nb400nf_nouter, 276
-.set nb400nf_ninner, 280
-        push %rbp
-        movq %rsp,%rbp
-        push %rbx
-
-
-        emms
-
-        push %r12
-        push %r13
-        push %r14
-        push %r15
-
-        subq $296,%rsp          ## local variable stack space (n*16+8)
-
-        ## zero 32-bit iteration counters
-        movl $0,%eax
-        movl %eax,nb400nf_nouter(%rsp)
-        movl %eax,nb400nf_ninner(%rsp)
-
-        movl (%rdi),%edi
-        movl %edi,nb400nf_nri(%rsp)
-        movq %rsi,nb400nf_iinr(%rsp)
-        movq %rdx,nb400nf_jindex(%rsp)
-        movq %rcx,nb400nf_jjnr(%rsp)
-        movq %r8,nb400nf_shift(%rsp)
-        movq %r9,nb400nf_shiftvec(%rsp)
-        movq nb400nf_p_facel(%rbp),%rsi
-        movsd (%rsi),%xmm0
-        movsd %xmm0,nb400nf_facel(%rsp)
-
-        movq nb400nf_p_gbtabscale(%rbp),%rbx
-        movsd (%rbx),%xmm4
-        shufpd $0,%xmm4,%xmm4
-        movapd %xmm4,nb400nf_gbtsc(%rsp)
-
-        ## create constant floating-point factors on stack
-        movl $0x00000000,%eax   ## lower half of double half IEEE (hex)
-        movl $0x3fe00000,%ebx
-        movl %eax,nb400nf_half(%rsp)
-        movl %ebx,nb400nf_half+4(%rsp)
-        movsd nb400nf_half(%rsp),%xmm1
-        shufpd $0,%xmm1,%xmm1  ## splat to all elements
-        movapd %xmm1,%xmm3
-        addpd  %xmm3,%xmm3      ## one
-        movapd %xmm3,%xmm2
-        addpd  %xmm2,%xmm2      ## two
-        addpd  %xmm2,%xmm3      ## three
-        movapd %xmm1,nb400nf_half(%rsp)
-        movapd %xmm3,nb400nf_three(%rsp)
-
-_nb_kernel400nf_x86_64_sse2.nb400nf_threadloop: 
-        movq  nb400nf_count(%rbp),%rsi            ## pointer to sync counter
-        movl  (%rsi),%eax
-_nb_kernel400nf_x86_64_sse2.nb400nf_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+10
-        lock 
-        cmpxchgl %ebx,(%rsi)                    ## write nn1 to *counter,
-                                                ## if it hasnt changed.
-                                                ## or reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel400nf_x86_64_sse2.nb400nf_spinlock
-
-        ## if(nn1>nri) nn1=nri
-        movl nb400nf_nri(%rsp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx
-        cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb400nf_n(%rsp)
-        movl %ebx,nb400nf_nn1(%rsp)
-        subl %eax,%ebx                          ## calc number of outer lists
-        movl %eax,%esi                          ## copy n to esi
-        jg  _nb_kernel400nf_x86_64_sse2.nb400nf_outerstart
-        jmp _nb_kernel400nf_x86_64_sse2.nb400nf_end
-
-_nb_kernel400nf_x86_64_sse2.nb400nf_outerstart: 
-        ## ebx contains number of outer iterations
-        addl nb400nf_nouter(%rsp),%ebx
-        movl %ebx,nb400nf_nouter(%rsp)
-
-_nb_kernel400nf_x86_64_sse2.nb400nf_outer: 
-        movq  nb400nf_shift(%rsp),%rax        ## rax = pointer into shift[] 
-        movl  (%rax,%rsi,4),%ebx        ## rbx=shift[n] 
-
-        lea  (%rbx,%rbx,2),%rbx    ## rbx=3*is 
-        movl  %ebx,nb400nf_is3(%rsp)            ## store is3 
-
-        movq  nb400nf_shiftvec(%rsp),%rax     ## rax = base of shiftvec[] 
-
-        movsd (%rax,%rbx,8),%xmm0
-        movsd 8(%rax,%rbx,8),%xmm1
-        movsd 16(%rax,%rbx,8),%xmm2
-
-        movq  nb400nf_iinr(%rsp),%rcx         ## rcx = pointer into iinr[]      
-        movl  (%rcx,%rsi,4),%ebx    ## ebx =ii 
-
-        movq  nb400nf_charge(%rbp),%rdx
-        movsd (%rdx,%rbx,8),%xmm3
-        mulsd nb400nf_facel(%rsp),%xmm3
-        shufpd $0,%xmm3,%xmm3
-
-        movq  nb400nf_invsqrta(%rbp),%rdx       ## load invsqrta[ii]
-        movsd (%rdx,%rbx,8),%xmm4
-        shufpd $0,%xmm4,%xmm4
-
-        lea  (%rbx,%rbx,2),%rbx        ## rbx = 3*ii=ii3 
-        movq  nb400nf_pos(%rbp),%rax      ## rax = base of pos[]  
-
-        addsd (%rax,%rbx,8),%xmm0
-        addsd 8(%rax,%rbx,8),%xmm1
-        addsd 16(%rax,%rbx,8),%xmm2
-
-        movapd %xmm3,nb400nf_iq(%rsp)
-        movapd %xmm4,nb400nf_isai(%rsp)
-
-        shufpd $0,%xmm0,%xmm0
-        shufpd $0,%xmm1,%xmm1
-        shufpd $0,%xmm2,%xmm2
-
-        movapd %xmm0,nb400nf_ix(%rsp)
-        movapd %xmm1,nb400nf_iy(%rsp)
-        movapd %xmm2,nb400nf_iz(%rsp)
-
-        movl  %ebx,nb400nf_ii3(%rsp)
-
-        ## clear vctot
-        xorpd %xmm4,%xmm4
-        movapd %xmm4,nb400nf_vctot(%rsp)
-
-        movq  nb400nf_jindex(%rsp),%rax
-        movl  (%rax,%rsi,4),%ecx             ## jindex[n] 
-        movl  4(%rax,%rsi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movq  nb400nf_pos(%rbp),%rsi
-        movq  nb400nf_faction(%rbp),%rdi
-        movq  nb400nf_jjnr(%rsp),%rax
-        shll  $2,%ecx
-        addq  %rcx,%rax
-        movq  %rax,nb400nf_innerjjnr(%rsp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $2,%edx
-        addl  nb400nf_ninner(%rsp),%ecx
-        movl  %ecx,nb400nf_ninner(%rsp)
-        addl  $0,%edx
-        movl  %edx,nb400nf_innerk(%rsp)      ## number of innerloop atoms 
-        jge   _nb_kernel400nf_x86_64_sse2.nb400nf_unroll_loop
-        jmp   _nb_kernel400nf_x86_64_sse2.nb400nf_checksingle
-_nb_kernel400nf_x86_64_sse2.nb400nf_unroll_loop: 
-        ## twice unrolled innerloop here 
-        movq  nb400nf_innerjjnr(%rsp),%rdx     ## pointer to jjnr[k] 
-        movl  (%rdx),%eax
-        movl  4(%rdx),%ebx
-        addq $8,nb400nf_innerjjnr(%rsp)                 ## advance pointer (unrolled 2) 
-
-        ## load isa2
-        movq nb400nf_invsqrta(%rbp),%rsi
-        movlpd (%rsi,%rax,8),%xmm2
-        movhpd (%rsi,%rbx,8),%xmm2
-        mulpd  nb400nf_isai(%rsp),%xmm2
-        movapd %xmm2,nb400nf_isaprod(%rsp)
-        movapd %xmm2,%xmm1
-        mulpd nb400nf_gbtsc(%rsp),%xmm1
-        movapd %xmm1,nb400nf_gbscale(%rsp)
-
-        movq nb400nf_charge(%rbp),%rsi     ## base of charge[] 
-        movlpd (%rsi,%rax,8),%xmm3
-        movhpd (%rsi,%rbx,8),%xmm3
-
-        mulpd nb400nf_iq(%rsp),%xmm2
-    mulpd %xmm2,%xmm3
-        movapd %xmm3,nb400nf_qq(%rsp)
-
-        movq nb400nf_pos(%rbp),%rsi             ## base of pos[] 
-
-        lea  (%rax,%rax,2),%rax     ## replace jnr with j3 
-        lea  (%rbx,%rbx,2),%rbx
-
-        ## move two coordinates to xmm0-xmm2 
-        movlpd (%rsi,%rax,8),%xmm0
-        movlpd 8(%rsi,%rax,8),%xmm1
-        movlpd 16(%rsi,%rax,8),%xmm2
-        movhpd (%rsi,%rbx,8),%xmm0
-        movhpd 8(%rsi,%rbx,8),%xmm1
-        movhpd 16(%rsi,%rbx,8),%xmm2
-
-        movq   nb400nf_faction(%rbp),%rdi
-
-        ## move nb400nf_ix-iz to xmm4-xmm6 
-        movapd nb400nf_ix(%rsp),%xmm4
-        movapd nb400nf_iy(%rsp),%xmm5
-        movapd nb400nf_iz(%rsp),%xmm6
-
-        ## calc dr 
-        subpd %xmm0,%xmm4
-        subpd %xmm1,%xmm5
-        subpd %xmm2,%xmm6
-
-        ## square it 
-        mulpd %xmm4,%xmm4
-        mulpd %xmm5,%xmm5
-        mulpd %xmm6,%xmm6
-        addpd %xmm5,%xmm4
-        addpd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        cvtpd2ps %xmm4,%xmm5
-        rsqrtps %xmm5,%xmm5
-        cvtps2pd %xmm5,%xmm2    ## lu in low xmm2 
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulpd %xmm2,%xmm2       ## lu*lu 
-        movapd nb400nf_three(%rsp),%xmm1
-        mulpd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb400nf_half(%rsp),%xmm0
-        subpd %xmm2,%xmm1       ## 30-rsq*lu*lu 
-        mulpd %xmm5,%xmm1
-        mulpd %xmm0,%xmm1       ## xmm0=iter1 of rinv (new lu) 
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulpd %xmm1,%xmm1       ## lu*lu 
-        movapd nb400nf_three(%rsp),%xmm2
-        mulpd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb400nf_half(%rsp),%xmm0
-        subpd %xmm1,%xmm2       ## 30-rsq*lu*lu 
-        mulpd %xmm5,%xmm2
-        mulpd %xmm2,%xmm0       ## xmm0=iter2 of rinv (new lu) 
-        mulpd %xmm0,%xmm4       ## xmm4=r 
-        mulpd nb400nf_gbscale(%rsp),%xmm4
-
-        cvttpd2pi %xmm4,%mm6    ## mm6 = lu idx 
-        cvtpi2pd %mm6,%xmm5
-        subpd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-        movapd %xmm1,%xmm2
-        mulpd  %xmm2,%xmm2      ## xmm2=eps2 
-
-        pslld $2,%mm6           ## idx *= 4 
-
-        movd %eax,%mm0
-        movd %ebx,%mm1
-
-        movq nb400nf_GBtab(%rbp),%rsi
-        movd %mm6,%eax
-        psrlq $32,%mm6
-        movd %mm6,%ebx          ## indices in eax/ebx 
-
-        movapd (%rsi,%rax,8),%xmm4      ## Y1 F1        
-        movapd (%rsi,%rbx,8),%xmm3      ## Y2 F2 
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 Y2 
-        unpckhpd %xmm3,%xmm5    ## F1 F2 
-
-        movapd 16(%rsi,%rax,8),%xmm6    ## G1 H1        
-        movapd 16(%rsi,%rbx,8),%xmm3    ## G2 H2 
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 G2 
-        unpckhpd %xmm3,%xmm7    ## H1 H2 
-        ## coulomb table ready, in xmm4-xmm7            
-        mulpd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulpd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addpd  %xmm6,%xmm5
-        addpd  %xmm7,%xmm5      ## xmm5=Fp      
-        movapd nb400nf_qq(%rsp),%xmm3
-        mulpd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addpd  %xmm4,%xmm5 ## xmm5=VV 
-        mulpd  %xmm3,%xmm5 ## vcoul=qq*VV  
-        addpd  nb400nf_vctot(%rsp),%xmm5
-        movapd %xmm5,nb400nf_vctot(%rsp)
-
-        ## should we do one more iteration? 
-        subl $2,nb400nf_innerk(%rsp)
-        jl    _nb_kernel400nf_x86_64_sse2.nb400nf_checksingle
-        jmp   _nb_kernel400nf_x86_64_sse2.nb400nf_unroll_loop
-_nb_kernel400nf_x86_64_sse2.nb400nf_checksingle: 
-        movl  nb400nf_innerk(%rsp),%edx
-        andl  $1,%edx
-        jnz    _nb_kernel400nf_x86_64_sse2.nb400nf_dosingle
-        jmp    _nb_kernel400nf_x86_64_sse2.nb400nf_updateouterdata
-_nb_kernel400nf_x86_64_sse2.nb400nf_dosingle: 
-        movq nb400nf_charge(%rbp),%rsi
-        movq nb400nf_invsqrta(%rbp),%rdx
-        movq nb400nf_pos(%rbp),%rdi
-        movq  nb400nf_innerjjnr(%rsp),%rcx
-        movl  (%rcx),%eax
-        xorpd  %xmm6,%xmm6
-        movapd %xmm6,%xmm7
-        movsd  (%rdx,%rax,8),%xmm7
-        movlpd (%rsi,%rax,8),%xmm6      ## xmm6(0) has the charge
-        mulsd  nb400nf_isai(%rsp),%xmm7
-        movapd %xmm7,nb400nf_isaprod(%rsp)
-        movapd %xmm7,%xmm1
-        mulpd nb400nf_gbtsc(%rsp),%xmm1
-        movapd %xmm1,nb400nf_gbscale(%rsp)
-
-        mulsd  nb400nf_iq(%rsp),%xmm7
-        mulsd  %xmm7,%xmm6
-        movapd %xmm6,nb400nf_qq(%rsp)
-
-        lea  (%rax,%rax,2),%rax
-
-        ## move coordinates to xmm0-xmm2 
-        movlpd (%rdi,%rax,8),%xmm0
-        movlpd 8(%rdi,%rax,8),%xmm1
-        movlpd 16(%rdi,%rax,8),%xmm2
-
-        ## move nb400nf_ix-iz to xmm4-xmm6 
-        movapd nb400nf_ix(%rsp),%xmm4
-        movapd nb400nf_iy(%rsp),%xmm5
-        movapd nb400nf_iz(%rsp),%xmm6
-
-        ## calc dr 
-        subsd %xmm0,%xmm4
-        subsd %xmm1,%xmm5
-        subsd %xmm2,%xmm6
-
-        ## square it 
-        mulsd %xmm4,%xmm4
-        mulsd %xmm5,%xmm5
-        mulsd %xmm6,%xmm6
-        addsd %xmm5,%xmm4
-        addsd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        cvtsd2ss %xmm4,%xmm5
-        rsqrtss %xmm5,%xmm5
-        cvtss2sd %xmm5,%xmm2    ## lu in low xmm2 
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulsd %xmm2,%xmm2       ## lu*lu 
-        movapd nb400nf_three(%rsp),%xmm1
-        mulsd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb400nf_half(%rsp),%xmm0
-        subsd %xmm2,%xmm1       ## 30-rsq*lu*lu 
-        mulsd %xmm5,%xmm1
-        mulsd %xmm0,%xmm1       ## xmm0=iter1 of rinv (new lu) 
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulsd %xmm1,%xmm1       ## lu*lu 
-        movapd nb400nf_three(%rsp),%xmm2
-        mulsd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb400nf_half(%rsp),%xmm0
-        subsd %xmm1,%xmm2       ## 30-rsq*lu*lu 
-        mulsd %xmm5,%xmm2
-        mulsd %xmm2,%xmm0       ## xmm0=iter2 of rinv (new lu) 
-
-        mulsd %xmm0,%xmm4       ## xmm4=r 
-        mulsd nb400nf_gbscale(%rsp),%xmm4
-
-        movd %eax,%mm0
-
-        cvttsd2si %xmm4,%eax    ## mm6 = lu idx 
-        cvtsi2sd %eax,%xmm5
-        subsd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-        movapd %xmm1,%xmm2
-        mulsd  %xmm2,%xmm2      ## xmm2=eps2 
-
-        shll $2,%eax            ## idx *= 4 
-
-        movq nb400nf_GBtab(%rbp),%rsi
-
-        ## Coulomb 
-        movapd (%rsi,%rax,8),%xmm4      ## Y1 F1 
-        xorpd %xmm3,%xmm3
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1  
-        unpckhpd %xmm3,%xmm5    ## F1  
-
-        movapd 16(%rsi,%rax,8),%xmm6    ## G1 H1 
-        xorpd %xmm3,%xmm3
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1  
-        unpckhpd %xmm3,%xmm7    ## H1   
-        ## table ready in xmm4-xmm7 
-
-        mulsd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulsd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addsd  %xmm6,%xmm5
-        addsd  %xmm7,%xmm5      ## xmm5=Fp      
-        movapd nb400nf_qq(%rsp),%xmm3
-        mulsd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addsd  %xmm4,%xmm5 ## xmm5=VV 
-        mulsd  %xmm3,%xmm5 ## vcoul=qq*VV  
-        addsd  nb400nf_vctot(%rsp),%xmm5
-        movsd %xmm5,nb400nf_vctot(%rsp)
-
-_nb_kernel400nf_x86_64_sse2.nb400nf_updateouterdata: 
-        ## get n from stack
-        movl nb400nf_n(%rsp),%esi
-        ## get group index for i particle 
-        movq  nb400nf_gid(%rbp),%rdx            ## base of gid[]
-        movl  (%rdx,%rsi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total potential energy and update it 
-        movapd nb400nf_vctot(%rsp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now 
-
-        ## add earlier value from mem 
-        movq  nb400nf_Vc(%rbp),%rax
-        addsd (%rax,%rdx,8),%xmm7
-        ## move back to mem 
-        movsd %xmm7,(%rax,%rdx,8)
-
-        ## finish if last 
-        movl nb400nf_nn1(%rsp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx
-        jz _nb_kernel400nf_x86_64_sse2.nb400nf_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb400nf_n(%rsp)
-        jmp _nb_kernel400nf_x86_64_sse2.nb400nf_outer
-_nb_kernel400nf_x86_64_sse2.nb400nf_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb400nf_nri(%rsp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx
-        jz _nb_kernel400nf_x86_64_sse2.nb400nf_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel400nf_x86_64_sse2.nb400nf_threadloop
-_nb_kernel400nf_x86_64_sse2.nb400nf_end: 
-
-        movl nb400nf_nouter(%rsp),%eax
-        movl nb400nf_ninner(%rsp),%ebx
-        movq nb400nf_outeriter(%rbp),%rcx
-        movq nb400nf_inneriter(%rbp),%rdx
-        movl %eax,(%rcx)
-        movl %ebx,(%rdx)
-
-        addq $296,%rsp
-        emms
-
-
-        pop %r15
-        pop %r14
-        pop %r13
-        pop %r12
-
-        pop %rbx
-        pop    %rbp
-        ret
-
-
-
-
-
diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel410_x86_64_sse2.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel410_x86_64_sse2.intel_syntax.s
deleted file mode 100644
index 72340a98f9..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel410_x86_64_sse2.intel_syntax.s
+++ /dev/null
@@ -1,1488 +0,0 @@
-;#
-;#
-;# Gromacs 4.0                         Copyright (c) 1991-2003 
-;# David van der Spoel, Erik Lindahl
-;#
-;# This program is free software; you can redistribute it and/or
-;# modify it under the terms of the GNU General Public License
-;# as published by the Free Software Foundation; either version 2
-;# of the License, or (at your option) any later version.
-;#
-;# To help us fund GROMACS development, we humbly ask that you cite
-;# the research papers on the package. Check out http://www.gromacs.org
-;# 
-;# And Hey:
-;# Gnomes, ROck Monsters And Chili Sauce
-;#
-
-;# These files require GNU binutils 2.10 or later, since we
-;# use intel syntax for portability, or a recent version 
-;# of NASM that understands Extended 3DNow and SSE2 instructions.
-;# (NASM is normally only used with MS Visual C++).
-;# Since NASM and gnu as disagree on some definitions and use 
-;# completely different preprocessing options I have to introduce a
-;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
-;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
-;# reason why all comments need both symbols...
-;# The source is written for GNU as, with intel syntax. When you use
-;# NASM we redefine a couple of things. The false if-statement around 
-;# the following code is seen by GNU as, but NASM doesn't see it, so 
-;# the code inside is read by NASM but not gcc.
-
-; .if 0    # block below only read by NASM
-%define .section	section
-%define .long		dd
-%define .align		align
-%define .globl		global
-;# NASM only wants 'dword', not 'dword ptr'.
-%define ptr
-%macro .equiv                  2
-   %1 equ %2
-%endmacro
-; .endif                   # End of NASM-specific block
-; .intel_syntax noprefix   # Line only read by gnu as
-
-
-
-
-
-.globl nb_kernel410_x86_64_sse2
-.globl _nb_kernel410_x86_64_sse2
-nb_kernel410_x86_64_sse2:	
-_nb_kernel410_x86_64_sse2:	
-;#	Room for return address and rbp (16 bytes)
-.equiv          nb410_fshift,           16
-.equiv          nb410_gid,              24
-.equiv          nb410_pos,              32
-.equiv          nb410_faction,          40
-.equiv          nb410_charge,           48
-.equiv          nb410_p_facel,          56
-.equiv          nb410_argkrf,           64
-.equiv          nb410_argcrf,           72
-.equiv          nb410_Vc,               80
-.equiv          nb410_type,             88
-.equiv          nb410_p_ntype,          96
-.equiv          nb410_vdwparam,         104
-.equiv          nb410_Vvdw,             112
-.equiv          nb410_p_tabscale,       120
-.equiv          nb410_VFtab,            128
-.equiv          nb410_invsqrta,         136
-.equiv          nb410_dvda,             144
-.equiv          nb410_p_gbtabscale,     152
-.equiv          nb410_GBtab,            160
-.equiv          nb410_p_nthreads,       168
-.equiv          nb410_count,            176
-.equiv          nb410_mtx,              184
-.equiv          nb410_outeriter,        192
-.equiv          nb410_inneriter,        200
-.equiv          nb410_work,             208
-	;# stack offsets for local variables  
-	;# bottom of stack is cache-aligned for sse2 use 
-.equiv          nb410_ix,               0
-.equiv          nb410_iy,               16
-.equiv          nb410_iz,               32
-.equiv          nb410_iq,               48
-.equiv          nb410_dx,               64
-.equiv          nb410_dy,               80
-.equiv          nb410_dz,               96
-.equiv          nb410_two,              112
-.equiv          nb410_six,              128
-.equiv          nb410_twelve,           144
-.equiv          nb410_gbtsc,            160
-.equiv          nb410_qq,               176
-.equiv          nb410_c6,               192
-.equiv          nb410_c12,              208
-.equiv          nb410_fscal,            224
-.equiv          nb410_vctot,            240
-.equiv          nb410_Vvdwtot,          256
-.equiv          nb410_fix,              272
-.equiv          nb410_fiy,              288
-.equiv          nb410_fiz,              304
-.equiv          nb410_half,             320
-.equiv          nb410_three,            336
-.equiv          nb410_r,                352
-.equiv          nb410_isai,             368
-.equiv          nb410_isaprod,          384
-.equiv          nb410_dvdasum,          400
-.equiv          nb410_gbscale,          416
-.equiv          nb410_nri,              432
-.equiv          nb410_iinr,             440
-.equiv          nb410_jindex,           448
-.equiv          nb410_jjnr,             456
-.equiv          nb410_shift,            464
-.equiv          nb410_shiftvec,         472
-.equiv          nb410_facel,            480
-.equiv          nb410_innerjjnr,        488
-.equiv          nb410_ii,               496
-.equiv          nb410_is3,              500
-.equiv          nb410_ii3,              504
-.equiv          nb410_ntia,             508
-.equiv          nb410_innerk,           512
-.equiv          nb410_n,                516
-.equiv          nb410_nn1,              520
-.equiv          nb410_ntype,            524
-.equiv          nb410_nouter,           528
-.equiv          nb410_ninner,           532
-	push rbp
-	mov  rbp, rsp
-	push rbx
-
-	
-	emms
-
-        push r12
-        push r13
-        push r14
-        push r15
-
-	sub rsp, 552		;# local variable stack space (n*16+8)
-
-	;# zero 32-bit iteration counters
-	mov eax, 0
-	mov [rsp + nb410_nouter], eax
-	mov [rsp + nb410_ninner], eax
-
-	mov edi, [rdi]
-	mov [rsp + nb410_nri], edi
-	mov [rsp + nb410_iinr], rsi
-	mov [rsp + nb410_jindex], rdx
-	mov [rsp + nb410_jjnr], rcx
-	mov [rsp + nb410_shift], r8
-	mov [rsp + nb410_shiftvec], r9
-	mov rdi, [rbp + nb410_p_ntype]
-	mov edi, [rdi]
-	mov [rsp + nb410_ntype], edi
-	mov rsi, [rbp + nb410_p_facel]
-	movsd xmm0, [rsi]
-	movsd [rsp + nb410_facel], xmm0
-
-	mov rbx, [rbp + nb410_p_gbtabscale]
-	movsd xmm4, [rbx]
-	shufpd xmm4, xmm4, 0
-	movapd [rsp + nb410_gbtsc],  xmm4
-
-	;# create constant floating-point factors on stack
-	mov eax, 0x00000000     ;# lower half of double half IEEE (hex)
-	mov ebx, 0x3fe00000
-	mov [rsp + nb410_half], eax
-	mov [rsp + nb410_half+4], ebx
-	movsd xmm1, [rsp + nb410_half]
-	shufpd xmm1, xmm1, 0    ;# splat to all elements
-	movapd xmm3, xmm1
-	addpd  xmm3, xmm3       ;# one
-	movapd xmm2, xmm3
-	addpd  xmm2, xmm2       ;# two
-	addpd  xmm3, xmm2	;# three
-	movapd xmm4, xmm3
-	addpd  xmm4, xmm4       ;# six
-	movapd xmm5, xmm4
-	addpd  xmm5, xmm5       ;# twelve
-	movapd [rsp + nb410_half], xmm1
-	movapd [rsp + nb410_two], xmm2
-	movapd [rsp + nb410_three], xmm3
-	movapd [rsp + nb410_six], xmm4
-	movapd [rsp + nb410_twelve], xmm5
-
-.nb410_threadloop:
-        mov   rsi, [rbp + nb410_count]          ;# pointer to sync counter
-        mov   eax, [rsi]
-.nb410_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+10
-        lock
-        cmpxchg [esi], ebx                      ;# write nn1 to *counter,
-                                                ;# if it hasnt changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb410_spinlock
-
-        ;# if(nn1>nri) nn1=nri
-        mov ecx, [rsp + nb410_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [rsp + nb410_n], eax
-        mov [rsp + nb410_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists
-	mov esi, eax				;# copy n to esi
-        jg  .nb410_outerstart
-        jmp .nb410_end
-
-.nb410_outerstart:
-	;# ebx contains number of outer iterations
-	add ebx, [rsp + nb410_nouter]
-	mov [rsp + nb410_nouter], ebx
-
-.nb410_outer:
-	mov   rax, [rsp + nb410_shift]      ;# rax = pointer into shift[] 
-	mov   ebx, [rax+rsi*4]		;# rbx=shift[n] 
-	
-	lea   rbx, [rbx + rbx*2]    ;# rbx=3*is 
-	mov   [rsp + nb410_is3],ebx    	;# store is3 
-
-	mov   rax, [rsp + nb410_shiftvec]   ;# rax = base of shiftvec[] 
-
-	movsd xmm0, [rax + rbx*8]
-	movsd xmm1, [rax + rbx*8 + 8]
-	movsd xmm2, [rax + rbx*8 + 16] 
-
-	mov   rcx, [rsp + nb410_iinr]       ;# rcx = pointer into iinr[] 	
-	mov   ebx, [rcx+rsi*4]	    ;# ebx =ii 
-	mov   [rsp + nb410_ii], ebx
-
-	mov   rdx, [rbp + nb410_charge]
-	movsd xmm3, [rdx + rbx*8]	
-	mulsd xmm3, [rsp + nb410_facel]
-	shufpd xmm3, xmm3, 0
-
-	mov   rdx, [rbp + nb410_invsqrta]	;# load invsqrta[ii]
-	movsd xmm4, [rdx + rbx*8]
-	shufpd xmm4, xmm4, 0
-
-    	mov   rdx, [rbp + nb410_type] 
-    	mov   edx, [rdx + rbx*4]
-    	imul  edx, [rsp + nb410_ntype]
-    	shl   edx, 1
-    	mov   [rsp + nb410_ntia], edx
-	
-	lea   rbx, [rbx + rbx*2]	;# rbx = 3*ii=ii3 
-	mov   rax, [rbp + nb410_pos]    ;# rax = base of pos[]  
-
-	addsd xmm0, [rax + rbx*8]
-	addsd xmm1, [rax + rbx*8 + 8]
-	addsd xmm2, [rax + rbx*8 + 16]
-
-	movapd [rsp + nb410_iq], xmm3
-	movapd [rsp + nb410_isai], xmm4
-
-	shufpd xmm0, xmm0, 0
-	shufpd xmm1, xmm1, 0
-	shufpd xmm2, xmm2, 0
-
-	movapd [rsp + nb410_ix], xmm0
-	movapd [rsp + nb410_iy], xmm1
-	movapd [rsp + nb410_iz], xmm2
-
-	mov   [rsp + nb410_ii3], ebx
-	
-	;# clear vctot and i forces 
-	xorpd xmm13, xmm13
-	movapd xmm12, xmm13
-	movapd [rsp + nb410_Vvdwtot], xmm13
-	movapd [rsp + nb410_dvdasum], xmm13
-	movapd xmm14, xmm13
-	movapd xmm15, xmm13
-	
-	mov   rax, [rsp + nb410_jindex]
-	mov   ecx, [rax + rsi*4]	     ;# jindex[n] 
-	mov   edx, [rax + rsi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   rsi, [rbp + nb410_pos]
-	mov   rdi, [rbp + nb410_faction]	
-	mov   rax, [rsp + nb410_jjnr]
-	shl   ecx, 2
-	add   rax, rcx
-	mov   [rsp + nb410_innerjjnr], rax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  2
-	add   ecx, [rsp + nb410_ninner]
-	mov   [rsp + nb410_ninner], ecx
-	add   edx, 0
-	mov   [rsp + nb410_innerk], edx    ;# number of innerloop atoms 
-	jge   .nb410_unroll_loop
-	jmp   .nb410_checksingle
-.nb410_unroll_loop:	
-	;# twice unrolled innerloop here 
-	mov   rdx, [rsp + nb410_innerjjnr]     ;# pointer to jjnr[k] 
-	mov   r14d, [rdx]	
-	mov   r15d, [rdx + 4]              
-	add qword ptr [rsp + nb410_innerjjnr],  8 ;# advance pointer (unrolled 2) 
-	
-	mov rsi, [rbp + nb410_pos]       ;# base of pos[] 
-
-	lea   r10, [r14 + r14*2]     ;# replace jnr with j3 
-	lea   r11, [r15 + r15*2]	
-
-	;# move two coordinates to xmm4-xmm6	
-	movlpd xmm4, [rsi + r10*8]
-	movlpd xmm5, [rsi + r10*8 + 8]
-	movlpd xmm6, [rsi + r10*8 + 16]
-	movhpd xmm4, [rsi + r11*8]
-	movhpd xmm5, [rsi + r11*8 + 8]
-	movhpd xmm6, [rsi + r11*8 + 16]		
-	
-	;# calc dr 
-	subpd xmm4, [rsp + nb410_ix]
-	subpd xmm5, [rsp + nb410_iy]
-	subpd xmm6, [rsp + nb410_iz]
-
-	;# store dr 
-	movapd [rsp + nb410_dx], xmm4
-	movapd [rsp + nb410_dy], xmm5
-	movapd [rsp + nb410_dz], xmm6
-    
-	;# load isaj
-	mov rsi, [rbp + nb410_invsqrta]
-
-	;# square it 
-	mulpd xmm4,xmm4
-	mulpd xmm5,xmm5
-	mulpd xmm6,xmm6
-	addpd xmm4, xmm5
-	addpd xmm4, xmm6
-	;# rsq in xmm4 
-
-	movlpd xmm3, [rsi + r14*8]
-	movhpd xmm3, [rsi + r15*8]
-
-	mov rdi, [rbp + nb410_type]
-	mov r8d, [rdi + r14*4]
-	mov r9d, [rdi + r15*4]
-
-	cvtpd2ps xmm5, xmm4	
-	rsqrtps xmm5, xmm5
-	cvtps2pd xmm2, xmm5	;# lu in low xmm2 
-
-	mulpd  xmm3, [rsp + nb410_isai]
-	movapd [rsp + nb410_isaprod], xmm3
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulpd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [rsp + nb410_three]
-	mulpd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb410_half]
-	subpd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulpd xmm1, xmm5	
-	mulpd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-	movapd xmm6, xmm3
-	mulpd xmm6, [rsp + nb410_gbtsc]
-	movapd [rsp + nb410_gbscale], xmm6
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulpd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [rsp + nb410_three]
-	mulpd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb410_half]
-	subpd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulpd xmm2, xmm5	
-	mulpd xmm0, xmm2	;# xmm0=rinv 
-	
-    mulpd  xmm3, [rsp + nb410_iq]
-	mov rsi, [rbp + nb410_charge]    ;# base of charge[] 
-	movlpd xmm6, [rsi + r14*8]
-	movhpd xmm6, [rsi + r15*8]
-	mulpd  xmm6, xmm3
-	movapd [rsp + nb410_qq], xmm6	
-	
-	mulpd xmm4, xmm0	;# xmm4=r 
-	movapd [rsp + nb410_r], xmm4
-	mulpd xmm4, [rsp + nb410_gbscale]
-	mov edi, [rsp + nb410_ntia]
-
-	cvttpd2pi mm6, xmm4	;# mm6 = lu idx 
-	shl r8d, 1
-	shl r9d, 1
-	add r8d, edi
-	add r9d, edi
-
-	cvtpi2pd xmm5, mm6
-	subpd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	movapd xmm2, xmm1	
-	mulpd  xmm2, xmm2	;# xmm2=eps2 
-	mov rdi, [rbp + nb410_vdwparam]
-	
-	pslld mm6, 2		;# idx *= 4 
-	
-	mov  rsi, [rbp + nb410_GBtab]
-	movd r12d, mm6
-	psrlq mm6, 32
-	movd r13d, mm6		;# indices in r12/r13
-
-	movlpd xmm6, [rdi + r8*8]	
-	movlpd xmm7, [rdi + r8*8 + 8]
-
-    movapd xmm9, xmm0 ;# rinv
-    mulpd  xmm9, xmm9 ;# rinvsq
-    movapd xmm10, xmm9 ;# rinvsq
-    mulpd  xmm10, xmm10 ;# rinv4
-    mulpd  xmm10, xmm9 ;# rinv6
-    movapd xmm11, xmm10 
-    mulpd  xmm11, xmm11 ;# rinv12
-
-
-	movhpd xmm6, [rdi + r9*8]	
-	movhpd xmm7, [rdi + r9*8 + 8]
-
-    ;# load table data
-	movapd xmm4, [rsi + r12*8]	;# Y1 F1 	
-	movapd xmm3, [rsi + r13*8]	;# Y2 F2 
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 Y2 
-	unpckhpd xmm5, xmm3	;# F1 F2 
-
-    mulpd  xmm10, xmm6   ;# vvdw6=c6*rinv6
-	mulpd  xmm11, xmm7   ;# vvdw12=c12*rinv12     
-
-	movapd xmm9, xmm11
-	subpd  xmm11, xmm10	;# Vvdw=Vvdw12-Vvdw6
-
-    ;# add potential to vvdwtot 
-	addpd  xmm11, [rsp + nb410_Vvdwtot]
-    movapd [rsp + nb410_Vvdwtot], xmm11
-    
-	movapd xmm6, [rsi + r12*8 + 16]	;# G1 H1 	
-	movapd xmm3, [rsi + r13*8 + 16]	;# G2 H2 
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 G2 
-	unpckhpd xmm7, xmm3	;# H1 H2 
-	;# coulomb table ready, in xmm4-xmm7  		
-
-	mulpd  xmm7, xmm1	;# xmm7=Heps
-	mulpd  xmm6, xmm1	;# xmm6=Geps 
-	mulpd  xmm7, xmm1	;# xmm7=Heps2 
-	addpd  xmm5, xmm6
-	addpd  xmm5, xmm7	;# xmm5=Fp 	
-	mulpd  xmm7, [rsp + nb410_two]	;# two*Heps2 
-	movapd xmm3, [rsp + nb410_qq]
-	addpd  xmm7, xmm6
-	addpd  xmm7, xmm5 ;# xmm7=FF 
-	mulpd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addpd  xmm5, xmm4 ;# xmm5=VV 
-	mulpd  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulpd  xmm3, xmm7 ;# fijC=FF*qq 
-    
-   ;# LJ forces
-    mulpd  xmm10, [rsp + nb410_six]
-    mulpd  xmm9, [rsp + nb410_twelve]
-    subpd  xmm9, xmm10
-    mulpd  xmm9, xmm0 ;# (12*vnb12-6*vnb6)*rinv
-
-	mov rsi, [rbp + nb410_dvda]
-	
-	;# Calculate dVda
-	xorpd xmm7, xmm7
-	mulpd xmm3, [rsp + nb410_gbscale]
-	movapd xmm6, xmm3
-	mulpd  xmm6, [rsp + nb410_r]
-	addpd  xmm6, xmm5
-
-    ;# update vctot 
-	addpd  xmm12, xmm5
-
-	;# xmm6=(vcoul+fijC*r)
-	subpd  xmm7, xmm6
-	movapd xmm6, xmm7
-	
-	mov rdi, [rbp + nb410_faction]
-	;# the fj's - start by accumulating forces from memory 
-	movlpd xmm2, [rdi + r10*8]
-	movlpd xmm4, [rdi + r10*8 + 8]
-	movlpd xmm5, [rdi + r10*8 + 16]
-
-	;# update dvdasum
-	addpd  xmm7, [rsp + nb410_dvdasum]
-	movapd [rsp + nb410_dvdasum], xmm7 
-
-	;# update j atoms dvdaj
-	movhlps xmm7, xmm6
-	addsd  xmm6, [rsi + r14*8]
-	addsd  xmm7, [rsi + r15*8]
-	movsd  [rsi + r14*8], xmm6
-	movsd  [rsi + r15*8], xmm7
-
-	movhpd xmm2, [rdi + r11*8]
-	movhpd xmm4, [rdi + r11*8 + 8]
-	movhpd xmm5, [rdi + r11*8 + 16]
-
-    subpd  xmm9, xmm3
-    mulpd  xmm9, xmm0 ;# fscal
-
-    movapd  xmm10, xmm9
-    movapd  xmm11, xmm9
-
-    mulpd   xmm9, [rsp + nb410_dx]
-    mulpd   xmm10, [rsp + nb410_dy]
-    mulpd   xmm11, [rsp + nb410_dz]
-    
-	addpd xmm2, xmm9
-	addpd xmm4, xmm10
-	addpd xmm5, xmm11
-
-	movlpd [rdi + r10*8], xmm2
-	movlpd [rdi + r10*8 + 8], xmm4
-	movlpd [rdi + r10*8 + 16], xmm5
-
-	;# accumulate i forces
-    addpd xmm13, xmm9
-    addpd xmm14, xmm10
-    addpd xmm15, xmm11
-
-	movhpd [rdi + r11*8], xmm2
-	movhpd [rdi + r11*8 + 8], xmm4
-	movhpd [rdi + r11*8 + 16], xmm5
-	
-	;# should we do one more iteration? 
-	sub dword ptr [rsp + nb410_innerk],  2
-	jl    .nb410_checksingle
-	jmp   .nb410_unroll_loop
-.nb410_checksingle:
-	mov   edx, [rsp + nb410_innerk]
-	and   edx, 1
-	jnz    .nb410_dosingle
-	jmp    .nb410_updateouterdata
-.nb410_dosingle:
-	mov rsi, [rbp + nb410_charge]
-	mov rdx, [rbp + nb410_invsqrta]
-	mov rdi, [rbp + nb410_pos]
-	mov   rcx, [rsp + nb410_innerjjnr]
-	mov   eax, [rcx]
-	
-	;# load isaj
-	mov rsi, [rbp + nb410_invsqrta]
-	movsd xmm2, [rsi + rax*8]
-	mulsd  xmm2, [rsp + nb410_isai]
-	movapd [rsp + nb410_isaprod], xmm2	
-	movapd xmm1, xmm2
-	mulsd xmm1, [rsp + nb410_gbtsc]
-	movapd [rsp + nb410_gbscale], xmm1
-
-    mulsd xmm2, [rsp + nb410_iq]
-	mov rsi, [rbp + nb410_charge]    ;# base of charge[] 
-	movsd xmm3, [rsi + rax*8]
-	mulsd  xmm3, xmm2
-	movapd [rsp + nb410_qq], xmm3	
-	
-	mov rsi, [rbp + nb410_type]
-	mov r8d, [rsi + rax*4]
-	mov rsi, [rbp + nb410_vdwparam]
-	shl r8d, 1
-	mov edi, [rsp + nb410_ntia]
-	add r8d, edi
-
-	movsd xmm4, [rsi + r8*8]	
-	movsd xmm6, [rsi + r8*8 + 8]
-	movapd [rsp + nb410_c6], xmm4
-	movapd [rsp + nb410_c12], xmm6
-	
-	mov rsi, [rbp + nb410_pos]       ;# base of pos[] 
-
-	lea   r10, [rax + rax*2]     ;# replace jnr with j3 
-
-	;# move two coordinates to xmm4-xmm6	
-	movsd xmm4, [rsi + r10*8]
-	movsd xmm5, [rsi + r10*8 + 8]
-	movsd xmm6, [rsi + r10*8 + 16]
-	
-	;# calc dr 
-	subsd xmm4, [rsp + nb410_ix]
-	subsd xmm5, [rsp + nb410_iy]
-	subsd xmm6, [rsp + nb410_iz]
-
-	;# store dr 
-	movapd [rsp + nb410_dx], xmm4
-	movapd [rsp + nb410_dy], xmm5
-	movapd [rsp + nb410_dz], xmm6
-    
-	;# square it 
-	mulsd xmm4,xmm4
-	mulsd xmm5,xmm5
-	mulsd xmm6,xmm6
-	addsd xmm4, xmm5
-	addsd xmm4, xmm6
-	;# rsq in xmm4 
-
-	cvtsd2ss xmm5, xmm4	
-	rsqrtss xmm5, xmm5
-	cvtss2sd xmm2, xmm5	;# lu in low xmm2 
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulsd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [rsp + nb410_three]
-	mulsd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb410_half]
-	subsd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulsd xmm1, xmm5	
-	mulsd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulsd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [rsp + nb410_three]
-	mulsd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb410_half]
-	subsd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulsd xmm2, xmm5	
-	mulsd xmm0, xmm2	;# xmm0=rinv 
-	
-	mulsd xmm4, xmm0	;# xmm4=r 
-	movapd [rsp + nb410_r], xmm4
-	mulsd xmm4, [rsp + nb410_gbscale]
-
-	cvttsd2si r12d, xmm4	;# mm6 = lu idx 
-	cvtsi2sd xmm5, r12d
-	subsd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	movapd xmm2, xmm1	
-	mulsd  xmm2, xmm2	;# xmm2=eps2 
-	
-	shl r12d, 2		;# idx *= 4 
-	
-	mov  rsi, [rbp + nb410_GBtab]
-
-    movapd xmm9, xmm0 ;# rinv
-    mulsd  xmm9, xmm9 ;# rinvsq
-    movapd xmm10, xmm9 ;# rinvsq
-    mulsd  xmm10, xmm10 ;# rinv4
-    mulsd  xmm10, xmm9 ;# rinv6
-    movapd xmm11, xmm10 
-    mulsd  xmm11, xmm11 ;# rinv12
-
-    ;# load table data
-	movapd xmm4, [rsi + r12*8]	;# Y1 F1 	
-    movhlps xmm5, xmm4
-
-    mulsd  xmm10, [rsp + nb410_c6]    ;# vvdw6=c6*rinv6
-	mulsd  xmm11, [rsp + nb410_c12]   ;# vvdw12=c12*rinv12     
-
-	movapd xmm9, xmm11
-	subsd  xmm11, xmm10	;# Vvdw=Vvdw12-Vvdw6
-
-    ;# add potential to vvdwtot 
-	addsd  xmm11, [rsp + nb410_Vvdwtot]
-    movsd [rsp + nb410_Vvdwtot], xmm11
-    
-	movapd xmm6, [rsi + r12*8 + 16]	;# G1 H1 	
-    movhlps xmm7, xmm6
-	;# coulomb table ready, in xmm4-xmm7  		
-
-	mulsd  xmm7, xmm1	;# xmm7=Heps
-	mulsd  xmm6, xmm1	;# xmm6=Geps 
-	mulsd  xmm7, xmm1	;# xmm7=Heps2 
-	addsd  xmm5, xmm6
-	addsd  xmm5, xmm7	;# xmm5=Fp 	
-	mulsd  xmm7, [rsp + nb410_two]	;# two*Heps2 
-	movapd xmm3, [rsp + nb410_qq]
-	addsd  xmm7, xmm6
-	addsd  xmm7, xmm5 ;# xmm7=FF 
-	mulsd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addsd  xmm5, xmm4 ;# xmm5=VV 
-	mulsd  xmm5, xmm3 ;# vcoul=qq*VV  
-	mulsd  xmm3, xmm7 ;# fijC=FF*qq 
-    
-   ;# LJ forces
-    mulsd  xmm10, [rsp + nb410_six]
-    mulsd  xmm9, [rsp + nb410_twelve]
-    subsd  xmm9, xmm10
-    mulsd  xmm9, xmm0 ;# (12*vnb12-6*vnb6)*rinv
-
-	mov rsi, [rbp + nb410_dvda]
-	
-	;# Calculate dVda
-	xorpd xmm7, xmm7
-	mulsd xmm3, [rsp + nb410_gbscale]
-	movapd xmm6, xmm3
-	mulsd  xmm6, [rsp + nb410_r]
-	addsd  xmm6, xmm5
-
-    ;# update vctot 
-	addsd  xmm12, xmm5
-
-	;# xmm6=(vcoul+fijC*r)
-	subsd  xmm7, xmm6
-	movapd xmm6, xmm7
-	
-	;# update dvdasum
-	addsd  xmm7, [rsp + nb410_dvdasum]
-	movsd [rsp + nb410_dvdasum], xmm7 
-
-	;# update j atoms dvdaj
-	movhlps xmm7, xmm6
-	addsd  xmm6, [rsi + rax*8]
-	addsd  xmm7, [rsi + rbx*8]
-	movsd  [rsi + rax*8], xmm6
-	movsd  [rsi + rbx*8], xmm7
-
-    subsd  xmm9, xmm3
-    mulsd  xmm9, xmm0 ;# fscal
-
-    movapd  xmm10, xmm9
-    movapd  xmm11, xmm9
-
-    mulsd   xmm9, [rsp + nb410_dx]
-    mulsd   xmm10, [rsp + nb410_dy]
-    mulsd   xmm11, [rsp + nb410_dz]
-    
-	;# accumulate i forces
-    addsd xmm13, xmm9
-    addsd xmm14, xmm10
-    addsd xmm15, xmm11
-
-	mov rdi, [rbp + nb410_faction]
-	;# the fj's - start by accumulating forces from memory 
-	addsd xmm9,  [rdi + r10*8]
-	addsd xmm10, [rdi + r10*8 + 8]
-	addsd xmm11, [rdi + r10*8 + 16]
-	movsd [rdi + r10*8], xmm9
-	movsd [rdi + r10*8 + 8], xmm10
-	movsd [rdi + r10*8 + 16], xmm11
-	
-.nb410_updateouterdata:
-	mov   ecx, [rsp + nb410_ii3]
-	mov   rdi, [rbp + nb410_faction]
-	mov   rsi, [rbp + nb410_fshift]
-	mov   edx, [rsp + nb410_is3]
-
-	;# accumulate i forces in xmm13, xmm14, xmm15
-	movhlps xmm3, xmm13
-	movhlps xmm4, xmm14
-	movhlps xmm5, xmm15
-	addsd  xmm13, xmm3
-	addsd  xmm14, xmm4
-	addsd  xmm15, xmm5 ;# sum is in low xmm13-xmm15
-
-	;# increment i force 
-	movsd  xmm3, [rdi + rcx*8]
-	movsd  xmm4, [rdi + rcx*8 + 8]
-	movsd  xmm5, [rdi + rcx*8 + 16]
-	subsd  xmm3, xmm13
-	subsd  xmm4, xmm14
-	subsd  xmm5, xmm15
-	movsd  [rdi + rcx*8],     xmm3
-	movsd  [rdi + rcx*8 + 8], xmm4
-	movsd  [rdi + rcx*8 + 16], xmm5
-
-	;# increment fshift force  
-	movsd  xmm3, [rsi + rdx*8]
-	movsd  xmm4, [rsi + rdx*8 + 8]
-	movsd  xmm5, [rsi + rdx*8 + 16]
-	subsd  xmm3, xmm13
-	subsd  xmm4, xmm14
-	subsd  xmm5, xmm15
-	movsd  [rsi + rdx*8],     xmm3
-	movsd  [rsi + rdx*8 + 8], xmm4
-	movsd  [rsi + rdx*8 + 16], xmm5
-
-	;# get n from stack
-	mov esi, [rsp + nb410_n]
-        ;# get group index for i particle 
-        mov   rdx, [rbp + nb410_gid]      	;# base of gid[]
-        mov   edx, [rdx + rsi*4]		;# ggid=gid[n]
-
-	;# accumulate total potential energy and update it 
-	movhlps xmm6, xmm12
-	addsd  xmm12, xmm6	;# low xmm12 has the sum now 
-
-	;# add earlier value from mem 
-	mov   rax, [rbp + nb410_Vc]
-	addsd xmm12, [rax + rdx*8] 
-	;# move back to mem 
-	movsd [rax + rdx*8], xmm12
-	
-	;# accumulate total lj energy and update it 
-	movapd xmm7, [rsp + nb410_Vvdwtot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addsd  xmm7, xmm6	;# low xmm7 has the sum now 
-	
-	;# add earlier value from mem 
-	mov   rax, [rbp + nb410_Vvdw]
-	addsd xmm7, [rax + rdx*8] 
-	;# move back to mem 
-	movsd [rax + rdx*8], xmm7 
-	
-	;# accumulate dVda and update it 
-	movapd xmm7, [rsp + nb410_dvdasum]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addsd  xmm7, xmm6	;# low xmm7 has the sum now 
-	
-	mov edx, [rsp + nb410_ii]
-	mov rax, [rbp + nb410_dvda]
-	addsd xmm7, [rax + rdx*8]
-	movsd [rax + rdx*8], xmm7
-	
-        ;# finish if last 
-        mov ecx, [rsp + nb410_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb410_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [rsp + nb410_n], esi
-        jmp .nb410_outer
-.nb410_outerend:
-        ;# check if more outer neighborlists remain
-        mov   ecx, [rsp + nb410_nri]
-	;# esi already loaded with n above
-        sub   ecx, esi
-        jz .nb410_end
-        ;# non-zero, do one more workunit
-        jmp   .nb410_threadloop
-.nb410_end:
-	mov eax, [rsp + nb410_nouter]
-	mov ebx, [rsp + nb410_ninner]
-	mov rcx, [rbp + nb410_outeriter]
-	mov rdx, [rbp + nb410_inneriter]
-	mov [rcx], eax
-	mov [rdx], ebx
-
-	add rsp, 552
-	emms
-
-
-        pop r15
-        pop r14
-        pop r13
-        pop r12
-
-	pop rbx
-	pop	rbp
-	ret
-
-
-
-
-
-
-
-
-
-.globl nb_kernel410nf_x86_64_sse2
-.globl _nb_kernel410nf_x86_64_sse2
-nb_kernel410nf_x86_64_sse2:	
-_nb_kernel410nf_x86_64_sse2:	
-;#	Room for return address and rbp (16 bytes)
-.equiv          nb410nf_fshift,         16
-.equiv          nb410nf_gid,            24
-.equiv          nb410nf_pos,            32
-.equiv          nb410nf_faction,        40
-.equiv          nb410nf_charge,         48
-.equiv          nb410nf_p_facel,        56
-.equiv          nb410nf_argkrf,         64
-.equiv          nb410nf_argcrf,         72
-.equiv          nb410nf_Vc,             80
-.equiv          nb410nf_type,           88
-.equiv          nb410nf_p_ntype,        96
-.equiv          nb410nf_vdwparam,       104
-.equiv          nb410nf_Vvdw,           112
-.equiv          nb410nf_p_tabscale,     120
-.equiv          nb410nf_VFtab,          128
-.equiv          nb410nf_invsqrta,       136
-.equiv          nb410nf_dvda,           144
-.equiv          nb410nf_p_gbtabscale,   152
-.equiv          nb410nf_GBtab,          160
-.equiv          nb410nf_p_nthreads,     168
-.equiv          nb410nf_count,          176
-.equiv          nb410nf_mtx,            184
-.equiv          nb410nf_outeriter,      192
-.equiv          nb410nf_inneriter,      200
-.equiv          nb410nf_work,           208
-	;# stack offsets for local variables  
-	;# bottom of stack is cache-aligned for sse2 use 
-.equiv          nb410nf_ix,             0
-.equiv          nb410nf_iy,             16
-.equiv          nb410nf_iz,             32
-.equiv          nb410nf_iq,             48
-.equiv          nb410nf_two,            64
-.equiv          nb410nf_gbtsc,          80
-.equiv          nb410nf_qq,             96
-.equiv          nb410nf_c6,             112
-.equiv          nb410nf_c12,            128
-.equiv          nb410nf_vctot,          144
-.equiv          nb410nf_Vvdwtot,        160
-.equiv          nb410nf_half,           176
-.equiv          nb410nf_three,          192
-.equiv          nb410nf_r,              208
-.equiv          nb410nf_isai,           224
-.equiv          nb410nf_isaprod,        240
-.equiv          nb410nf_gbscale,        256
-.equiv          nb410nf_nri,            272
-.equiv          nb410nf_iinr,           280
-.equiv          nb410nf_jindex,         288
-.equiv          nb410nf_jjnr,           296
-.equiv          nb410nf_shift,          304
-.equiv          nb410nf_shiftvec,       312
-.equiv          nb410nf_facel,          320
-.equiv          nb410nf_innerjjnr,      328
-.equiv          nb410nf_ii,             336
-.equiv          nb410nf_is3,            340
-.equiv          nb410nf_ii3,            344
-.equiv          nb410nf_ntia,           348
-.equiv          nb410nf_innerk,         352
-.equiv          nb410nf_n,              356
-.equiv          nb410nf_nn1,            360
-.equiv          nb410nf_ntype,          364
-.equiv          nb410nf_nouter,         368
-.equiv          nb410nf_ninner,         372
-	push rbp
-	mov  rbp, rsp
-	push rbx
-
-	
-	emms
-
-        push r12
-        push r13
-        push r14
-        push r15
-
-	sub rsp, 392		;# local variable stack space (n*16+8)
-
-	;# zero 32-bit iteration counters
-	mov eax, 0
-	mov [rsp + nb410nf_nouter], eax
-	mov [rsp + nb410nf_ninner], eax
-
-	mov edi, [rdi]
-	mov [rsp + nb410nf_nri], edi
-	mov [rsp + nb410nf_iinr], rsi
-	mov [rsp + nb410nf_jindex], rdx
-	mov [rsp + nb410nf_jjnr], rcx
-	mov [rsp + nb410nf_shift], r8
-	mov [rsp + nb410nf_shiftvec], r9
-	mov rdi, [rbp + nb410nf_p_ntype]
-	mov edi, [rdi]
-	mov [rsp + nb410nf_ntype], edi
-	mov rsi, [rbp + nb410nf_p_facel]
-	movsd xmm0, [rsi]
-	movsd [rsp + nb410nf_facel], xmm0
-
-	mov rbx, [rbp + nb410nf_p_gbtabscale]
-	movsd xmm4, [rbx]
-	shufpd xmm4, xmm4, 0
-	movapd [rsp + nb410nf_gbtsc],  xmm4
-
-	;# create constant floating-point factors on stack
-	mov eax, 0x00000000     ;# lower half of double half IEEE (hex)
-	mov ebx, 0x3fe00000
-	mov [rsp + nb410nf_half], eax
-	mov [rsp + nb410nf_half+4], ebx
-	movsd xmm1, [rsp + nb410nf_half]
-	shufpd xmm1, xmm1, 0    ;# splat to all elements
-	movapd xmm3, xmm1
-	addpd  xmm3, xmm3       ;# one
-	movapd xmm2, xmm3
-	addpd  xmm2, xmm2       ;# two
-	addpd  xmm3, xmm2	;# three
-	movapd [rsp + nb410nf_half], xmm1
-	movapd [rsp + nb410nf_two], xmm2
-	movapd [rsp + nb410nf_three], xmm3
-
-.nb410nf_threadloop:
-        mov   rsi, [rbp + nb410nf_count]          ;# pointer to sync counter
-        mov   eax, [rsi]
-.nb410nf_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+10
-        lock
-        cmpxchg [esi], ebx                      ;# write nn1 to *counter,
-                                                ;# if it hasnt changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb410nf_spinlock
-
-        ;# if(nn1>nri) nn1=nri
-        mov ecx, [rsp + nb410nf_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [rsp + nb410nf_n], eax
-        mov [rsp + nb410nf_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists
-	mov esi, eax				;# copy n to esi
-        jg  .nb410nf_outerstart
-        jmp .nb410nf_end
-
-.nb410nf_outerstart:
-	;# ebx contains number of outer iterations
-	add ebx, [rsp + nb410nf_nouter]
-	mov [rsp + nb410nf_nouter], ebx
-
-.nb410nf_outer:
-	mov   rax, [rsp + nb410nf_shift]      ;# rax = pointer into shift[] 
-	mov   ebx, [rax+rsi*4]		;# rbx=shift[n] 
-	
-	lea   rbx, [rbx + rbx*2]    ;# rbx=3*is 
-	mov   [rsp + nb410nf_is3],ebx    	;# store is3 
-
-	mov   rax, [rsp + nb410nf_shiftvec]   ;# rax = base of shiftvec[] 
-
-	movsd xmm0, [rax + rbx*8]
-	movsd xmm1, [rax + rbx*8 + 8]
-	movsd xmm2, [rax + rbx*8 + 16] 
-
-	mov   rcx, [rsp + nb410nf_iinr]       ;# rcx = pointer into iinr[] 	
-	mov   ebx, [rcx+rsi*4]	    ;# ebx =ii 
-	mov   [rsp + nb410nf_ii], ebx
-
-	mov   rdx, [rbp + nb410nf_charge]
-	movsd xmm3, [rdx + rbx*8]	
-	mulsd xmm3, [rsp + nb410nf_facel]
-	shufpd xmm3, xmm3, 0
-
-	mov   rdx, [rbp + nb410nf_invsqrta]	;# load invsqrta[ii]
-	movsd xmm4, [rdx + rbx*8]
-	shufpd xmm4, xmm4, 0
-
-   	mov   rdx, [rbp + nb410nf_type] 
-   	mov   edx, [rdx + rbx*4]
-   	imul  edx, [rsp + nb410nf_ntype]
-   	shl   edx, 1
-    mov   [rsp + nb410nf_ntia], edx
-	
-	lea   rbx, [rbx + rbx*2]	;# rbx = 3*ii=ii3 
-	mov   rax, [rbp + nb410nf_pos]    ;# rax = base of pos[]  
-
-	addsd xmm0, [rax + rbx*8]
-	addsd xmm1, [rax + rbx*8 + 8]
-	addsd xmm2, [rax + rbx*8 + 16]
-
-	movapd [rsp + nb410nf_iq], xmm3
-	movapd [rsp + nb410nf_isai], xmm4
-
-	shufpd xmm0, xmm0, 0
-	shufpd xmm1, xmm1, 0
-	shufpd xmm2, xmm2, 0
-
-	movapd [rsp + nb410nf_ix], xmm0
-	movapd [rsp + nb410nf_iy], xmm1
-	movapd [rsp + nb410nf_iz], xmm2
-
-	mov   [rsp + nb410nf_ii3], ebx
-	
-	;# clear vctot and Vvdwtot
-	xorpd xmm4, xmm4
-	movapd [rsp + nb410nf_vctot], xmm4
-	movapd [rsp + nb410nf_Vvdwtot], xmm4
-	
-	mov   rax, [rsp + nb410nf_jindex]
-	mov   ecx, [rax + rsi*4]	     ;# jindex[n] 
-	mov   edx, [rax + rsi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   rsi, [rbp + nb410nf_pos]
-	mov   rdi, [rbp + nb410nf_faction]	
-	mov   rax, [rsp + nb410nf_jjnr]
-	shl   ecx, 2
-	add   rax, rcx
-	mov   [rsp + nb410nf_innerjjnr], rax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  2
-	add   ecx, [rsp + nb410nf_ninner]
-	mov   [rsp + nb410nf_ninner], ecx
-	add   edx, 0
-	mov   [rsp + nb410nf_innerk], edx    ;# number of innerloop atoms 
-	jge   .nb410nf_unroll_loop
-	jmp   .nb410nf_checksingle
-.nb410nf_unroll_loop:	
-	;# twice unrolled innerloop here 
-	mov   rdx, [rsp + nb410nf_innerjjnr]     ;# pointer to jjnr[k] 
-	mov   eax, [rdx]	
-	mov   ebx, [rdx + 4]              
-	add qword ptr [rsp + nb410nf_innerjjnr],  8 ;# advance pointer (unrolled 2) 
-
-	;# load isaj
-	mov rsi, [rbp + nb410nf_invsqrta]
-	movlpd xmm2, [rsi + rax*8]
-	movhpd xmm2, [rsi + rbx*8]
-	mulpd  xmm2, [rsp + nb410nf_isai]
-	movapd [rsp + nb410nf_isaprod], xmm2	
-	movapd xmm1, xmm2
-	mulpd xmm1, [rsp + nb410nf_gbtsc]
-	movapd [rsp + nb410nf_gbscale], xmm1
-	
-	mov rsi, [rbp + nb410nf_charge]    ;# base of charge[] 
-	movlpd xmm3, [rsi + rax*8]
-	movhpd xmm3, [rsi + rbx*8]
-
-	mulpd xmm2, [rsp + nb410nf_iq]
-	mulpd  xmm3, xmm2
-	movapd [rsp + nb410nf_qq], xmm3	
-	
-	movd  mm0, eax		;# use mmx registers as temp storage 
-	movd  mm1, ebx
-	
-	mov rsi, [rbp + nb410nf_type]
-	mov eax, [rsi + rax*4]
-	mov ebx, [rsi + rbx*4]
-	mov rsi, [rbp + nb410nf_vdwparam]
-	shl eax, 1
-	shl ebx, 1
-	mov edi, [rsp + nb410nf_ntia]
-	add eax, edi
-	add ebx, edi
-
-	movlpd xmm6, [rsi + rax*8]	;# c6a
-	movlpd xmm7, [rsi + rbx*8]	;# c6b
-	movhpd xmm6, [rsi + rax*8 + 8]	;# c6a c12a 
-	movhpd xmm7, [rsi + rbx*8 + 8]	;# c6b c12b 
-
-	movapd xmm4, xmm6
-	unpcklpd xmm4, xmm7
-	unpckhpd xmm6, xmm7
-	
-	movd  eax, mm0
-	movd  ebx, mm1
-	movapd [rsp + nb410nf_c6], xmm4
-	movapd [rsp + nb410nf_c12], xmm6
-	
-	mov rsi, [rbp + nb410nf_pos]       ;# base of pos[] 
-
-	movd  mm2, eax
-	movd  mm3, ebx
-	lea   rax, [rax + rax*2]     ;# replace jnr with j3 
-	lea   rbx, [rbx + rbx*2]	
-
-	;# move two coordinates to xmm0-xmm2 	
-	movlpd xmm0, [rsi + rax*8]
-	movlpd xmm1, [rsi + rax*8 + 8]
-	movlpd xmm2, [rsi + rax*8 + 16]
-	movhpd xmm0, [rsi + rbx*8]
-	movhpd xmm1, [rsi + rbx*8 + 8]
-	movhpd xmm2, [rsi + rbx*8 + 16]		
-	
-	;# move ix-iz to xmm4-xmm6 
-	movapd xmm4, [rsp + nb410nf_ix]
-	movapd xmm5, [rsp + nb410nf_iy]
-	movapd xmm6, [rsp + nb410nf_iz]
-
-	;# calc dr 
-	subpd xmm4, xmm0
-	subpd xmm5, xmm1
-	subpd xmm6, xmm2
-
-	;# square dr 
-	mulpd xmm4,xmm4
-	mulpd xmm5,xmm5
-	mulpd xmm6,xmm6
-	addpd xmm4, xmm5
-	addpd xmm4, xmm6
-	;# rsq in xmm4 
-
-	cvtpd2ps xmm5, xmm4	
-	rsqrtps xmm5, xmm5
-	cvtps2pd xmm2, xmm5	;# lu in low xmm2 
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulpd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [rsp + nb410nf_three]
-	mulpd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb410nf_half]
-	subpd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulpd xmm1, xmm5	
-	mulpd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulpd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [rsp + nb410nf_three]
-	mulpd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb410nf_half]
-	subpd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulpd xmm2, xmm5	
-	mulpd xmm0, xmm2	;# xmm0=rinv 
-	
-	mulpd xmm4, xmm0	;# xmm4=r 
-	movapd [rsp + nb410nf_r], xmm4
-	mulpd xmm4, [rsp + nb410nf_gbscale]
-
-	cvttpd2pi mm6, xmm4	;# mm6 = lu idx 
-	cvtpi2pd xmm5, mm6
-	subpd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	movapd xmm2, xmm1	
-	mulpd  xmm2, xmm2	;# xmm2=eps2 
-	
-	pslld mm6, 2		;# idx *= 4 
-	
-	movd mm0, eax	
-	movd mm1, ebx
-
-	mov  rsi, [rbp + nb410nf_GBtab]
-	movd eax, mm6
-	psrlq mm6, 32
-	movd ebx, mm6		;# indices in eax/ebx 
-
-	movapd xmm4, [rsi + rax*8]	;# Y1 F1 	
-	movapd xmm3, [rsi + rbx*8]	;# Y2 F2 
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 Y2 
-	unpckhpd xmm5, xmm3	;# F1 F2 
-
-	movapd xmm6, [rsi + rax*8 + 16]	;# G1 H1 	
-	movapd xmm3, [rsi + rbx*8 + 16]	;# G2 H2 
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 G2 
-	unpckhpd xmm7, xmm3	;# H1 H2 
-	;# coulomb table ready, in xmm4-xmm7  		
-	mulpd  xmm6, xmm1	;# xmm6=Geps 
-	mulpd  xmm7, xmm2	;# xmm7=Heps2 
-	addpd  xmm5, xmm6
-	addpd  xmm5, xmm7	;# xmm5=Fp 	
-	movapd xmm3, [rsp + nb410nf_qq]
-	mulpd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addpd  xmm5, xmm4 ;# xmm5=VV 
-	mulpd  xmm5, xmm3 ;# vcoul=qq*VV  
-
-	addpd  xmm5, [rsp + nb410nf_vctot]
-	movapd [rsp + nb410nf_vctot], xmm5 
-
-	;# L-J 
-	movapd xmm4, xmm0
-	mulpd  xmm4, xmm0	;# xmm4=rinvsq 
-
-	movapd xmm6, xmm4
-	mulpd  xmm6, xmm4
-	
-	mulpd  xmm6, xmm4	;# xmm6=rinvsix 
-	movapd xmm4, xmm6
-	mulpd  xmm4, xmm4	;# xmm4=rinvtwelve 
-	mulpd  xmm6, [rsp + nb410nf_c6]
-	mulpd  xmm4, [rsp + nb410nf_c12]
-	movapd xmm7, [rsp + nb410nf_Vvdwtot]
-	addpd  xmm7, xmm4
-	subpd  xmm7, xmm6
-	movapd [rsp + nb410nf_Vvdwtot], xmm7
-
-	;# should we do one more iteration? 
-	sub dword ptr [rsp + nb410nf_innerk],  2
-	jl    .nb410nf_checksingle
-	jmp   .nb410nf_unroll_loop
-.nb410nf_checksingle:
-	mov   edx, [rsp + nb410nf_innerk]
-	and   edx, 1
-	jnz    .nb410nf_dosingle
-	jmp    .nb410nf_updateouterdata
-.nb410nf_dosingle:
-	mov rsi, [rbp + nb410nf_charge]
-	mov rdx, [rbp + nb410nf_invsqrta]
-	mov rdi, [rbp + nb410nf_pos]
-	mov   rcx, [rsp + nb410nf_innerjjnr]
-	mov   eax, [rcx]
-	
-	xorpd  xmm6, xmm6
-	movapd xmm7, xmm6
-	movsd  xmm7, [rdx + rax*8]
-	movlpd xmm6, [rsi + rax*8]	;# xmm6(0) has the charge
-	mulsd  xmm7, [rsp + nb410nf_isai]
-	movapd [rsp + nb410nf_isaprod], xmm7
-	movapd xmm1, xmm7
-	mulpd xmm1, [rsp + nb410nf_gbtsc]
-	movapd [rsp + nb410nf_gbscale], xmm1
-	
-	mulsd  xmm7, [rsp + nb410nf_iq]
-	mulsd  xmm6, xmm7
-	movapd [rsp + nb410nf_qq], xmm6
-	
-	movd  mm0, eax		;# use mmx registers as temp storage 
-	mov rsi, [rbp + nb410nf_type]
-	mov eax, [rsi + rax*4]
-	mov rsi, [rbp + nb410nf_vdwparam]
-	shl eax, 1
-	mov edi, [rsp + nb410nf_ntia]
-	add eax, edi
-
-	movlpd xmm6, [rsi + rax*8]	;# c6a
-	movhpd xmm6, [rsi + rax*8 + 8]	;# c6a c12a 
-
-	xorpd xmm7, xmm7
-	movapd xmm4, xmm6
-	unpcklpd xmm4, xmm7
-	unpckhpd xmm6, xmm7
-	
-	movd  eax, mm0
-	movapd [rsp + nb410nf_c6], xmm4
-	movapd [rsp + nb410nf_c12], xmm6
-	
-	mov rsi, [rbp + nb410nf_pos]       ;# base of pos[]
-	
-	movd  mm2, eax
-	lea   rax, [rax + rax*2]     ;# replace jnr with j3 
-
-	;# move coordinates to xmm0-xmm2 	
-	movlpd xmm0, [rsi + rax*8]
-	movlpd xmm1, [rsi + rax*8 + 8]
-	movlpd xmm2, [rsi + rax*8 + 16]
-	
-	;# move ix-iz to xmm4-xmm6 
-	movapd xmm4, [rsp + nb410nf_ix]
-	movapd xmm5, [rsp + nb410nf_iy]
-	movapd xmm6, [rsp + nb410nf_iz]
-
-	;# calc dr 
-	subsd xmm4, xmm0
-	subsd xmm5, xmm1
-	subsd xmm6, xmm2
-
-	;# square it 
-	mulsd xmm4,xmm4
-	mulsd xmm5,xmm5
-	mulsd xmm6,xmm6
-	addsd xmm4, xmm5
-	addsd xmm4, xmm6
-	;# rsq in xmm4 
-
-	cvtsd2ss xmm5, xmm4	
-	rsqrtss xmm5, xmm5
-	cvtss2sd xmm2, xmm5	;# lu in low xmm2 
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulsd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [rsp + nb410nf_three]
-	mulsd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb410nf_half]
-	subsd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulsd xmm1, xmm5	
-	mulsd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulsd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [rsp + nb410nf_three]
-	mulsd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb410nf_half]
-	subsd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulsd xmm2, xmm5	
-	mulsd xmm0, xmm2	;# xmm0=rinv 
-	
-	mulsd xmm4, xmm0	;# xmm4=r 
-	movapd [rsp + nb410nf_r], xmm4
-	mulsd xmm4, [rsp + nb410nf_gbscale]
-
-	movd mm0, eax	
-	cvttsd2si eax, xmm4	;# mm6 = lu idx 
-	cvtsi2sd xmm5, eax
-	subsd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	movapd xmm2, xmm1	
-	mulsd  xmm2, xmm2	;# xmm2=eps2 
-	
-	shl eax, 2		;# idx *= 4 
-	
-	mov  rsi, [rbp + nb410nf_GBtab]
-
-	movapd xmm4, [rsi + rax*8]	;# Y1 F1 	
-	xorpd xmm3, xmm3
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 
-	unpckhpd xmm5, xmm3	;# F1 
-
-	movapd xmm6, [rsi + rax*8 + 16]	;# G1 H1 	
-	xorpd xmm3, xmm3
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 
-	unpckhpd xmm7, xmm3	;# H1 
-	;# coulomb table ready, in xmm4-xmm7  		
-	mulsd  xmm6, xmm1	;# xmm6=Geps 
-	mulsd  xmm7, xmm2	;# xmm7=Heps2 
-	addsd  xmm5, xmm6
-	addsd  xmm5, xmm7	;# xmm5=Fp 	
-	movapd xmm3, [rsp + nb410nf_qq]
-	mulsd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addsd  xmm5, xmm4 ;# xmm5=VV 
-	mulsd  xmm5, xmm3 ;# vcoul=qq*VV  
-
-	addsd  xmm5, [rsp + nb410nf_vctot]
-	movsd [rsp + nb410nf_vctot], xmm5 
-
-	;# L-J 
-	movapd xmm4, xmm0
-	mulsd  xmm4, xmm0	;# xmm4=rinvsq 
-
-
-	movapd xmm6, xmm4
-	mulsd  xmm6, xmm4
-
-	mulsd  xmm6, xmm4	;# xmm6=rinvsix 
-	movapd xmm4, xmm6
-	mulsd  xmm4, xmm4	;# xmm4=rinvtwelve 
-	mulsd  xmm6, [rsp + nb410nf_c6]
-	mulsd  xmm4, [rsp + nb410nf_c12]
-	movapd xmm7, [rsp + nb410nf_Vvdwtot]
-	addsd  xmm7, xmm4
-	subsd  xmm7, xmm6
-	movlpd [rsp + nb410nf_Vvdwtot], xmm7
-
-.nb410nf_updateouterdata:
-	mov   ecx, [rsp + nb410nf_ii3]
-	mov   edx, [rsp + nb410nf_is3]
-
-	;# get n from stack
-	mov esi, [rsp + nb410nf_n]
-        ;# get group index for i particle 
-        mov   rdx, [rbp + nb410nf_gid]      	;# base of gid[]
-        mov   edx, [rdx + rsi*4]		;# ggid=gid[n]
-
-	;# accumulate total potential energy and update it 
-	movapd xmm7, [rsp + nb410nf_vctot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addsd  xmm7, xmm6	;# low xmm7 has the sum now 
-
-	;# add earlier value from mem 
-	mov   rax, [rbp + nb410nf_Vc]
-	addsd xmm7, [rax + rdx*8] 
-	;# move back to mem 
-	movsd [rax + rdx*8], xmm7 
-	
-	;# accumulate total lj energy and update it 
-	movapd xmm7, [rsp + nb410nf_Vvdwtot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addsd  xmm7, xmm6	;# low xmm7 has the sum now 
-	
-	;# add earlier value from mem 
-	mov   rax, [rbp + nb410nf_Vvdw]
-	addsd xmm7, [rax + rdx*8] 
-	;# move back to mem 
-	movsd [rax + rdx*8], xmm7 
-	
-        ;# finish if last 
-        mov ecx, [rsp + nb410nf_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb410nf_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [rsp + nb410nf_n], esi
-        jmp .nb410nf_outer
-.nb410nf_outerend:
-        ;# check if more outer neighborlists remain
-        mov   ecx, [rsp + nb410nf_nri]
-	;# esi already loaded with n above
-        sub   ecx, esi
-        jz .nb410nf_end
-        ;# non-zero, do one more workunit
-        jmp   .nb410nf_threadloop
-.nb410nf_end:
-	mov eax, [rsp + nb410nf_nouter]
-	mov ebx, [rsp + nb410nf_ninner]
-	mov rcx, [rbp + nb410nf_outeriter]
-	mov rdx, [rbp + nb410nf_inneriter]
-	mov [rcx], eax
-	mov [rdx], ebx
-
-	add rsp, 392
-	emms
-
-
-        pop r15
-        pop r14
-        pop r13
-        pop r12
-
-	pop rbx
-	pop	rbp
-	ret
-
-
diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel410_x86_64_sse2.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel410_x86_64_sse2.s
deleted file mode 100644
index 21eb0180b4..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel410_x86_64_sse2.s
+++ /dev/null
@@ -1,1464 +0,0 @@
-##
-##
-## Gromacs 4.0                         Copyright (c) 1991-2003 
-## David van der Spoel, Erik Lindahl
-##
-## This program is free software; you can redistribute it and/or
-## modify it under the terms of the GNU General Public License
-## as published by the Free Software Foundation; either version 2
-## of the License, or (at your option) any later version.
-##
-## To help us fund GROMACS development, we humbly ask that you cite
-## the research papers on the package. Check out http://www.gromacs.org
-## 
-## And Hey:
-## Gnomes, ROck Monsters And Chili Sauce
-##
-
-
-
-
-
-
-
-.globl nb_kernel410_x86_64_sse2
-.globl _nb_kernel410_x86_64_sse2
-nb_kernel410_x86_64_sse2:       
-_nb_kernel410_x86_64_sse2:      
-##      Room for return address and rbp (16 bytes)
-.set nb410_fshift, 16
-.set nb410_gid, 24
-.set nb410_pos, 32
-.set nb410_faction, 40
-.set nb410_charge, 48
-.set nb410_p_facel, 56
-.set nb410_argkrf, 64
-.set nb410_argcrf, 72
-.set nb410_Vc, 80
-.set nb410_type, 88
-.set nb410_p_ntype, 96
-.set nb410_vdwparam, 104
-.set nb410_Vvdw, 112
-.set nb410_p_tabscale, 120
-.set nb410_VFtab, 128
-.set nb410_invsqrta, 136
-.set nb410_dvda, 144
-.set nb410_p_gbtabscale, 152
-.set nb410_GBtab, 160
-.set nb410_p_nthreads, 168
-.set nb410_count, 176
-.set nb410_mtx, 184
-.set nb410_outeriter, 192
-.set nb410_inneriter, 200
-.set nb410_work, 208
-        ## stack offsets for local variables  
-        ## bottom of stack is cache-aligned for sse2 use 
-.set nb410_ix, 0
-.set nb410_iy, 16
-.set nb410_iz, 32
-.set nb410_iq, 48
-.set nb410_dx, 64
-.set nb410_dy, 80
-.set nb410_dz, 96
-.set nb410_two, 112
-.set nb410_six, 128
-.set nb410_twelve, 144
-.set nb410_gbtsc, 160
-.set nb410_qq, 176
-.set nb410_c6, 192
-.set nb410_c12, 208
-.set nb410_fscal, 224
-.set nb410_vctot, 240
-.set nb410_Vvdwtot, 256
-.set nb410_fix, 272
-.set nb410_fiy, 288
-.set nb410_fiz, 304
-.set nb410_half, 320
-.set nb410_three, 336
-.set nb410_r, 352
-.set nb410_isai, 368
-.set nb410_isaprod, 384
-.set nb410_dvdasum, 400
-.set nb410_gbscale, 416
-.set nb410_nri, 432
-.set nb410_iinr, 440
-.set nb410_jindex, 448
-.set nb410_jjnr, 456
-.set nb410_shift, 464
-.set nb410_shiftvec, 472
-.set nb410_facel, 480
-.set nb410_innerjjnr, 488
-.set nb410_ii, 496
-.set nb410_is3, 500
-.set nb410_ii3, 504
-.set nb410_ntia, 508
-.set nb410_innerk, 512
-.set nb410_n, 516
-.set nb410_nn1, 520
-.set nb410_ntype, 524
-.set nb410_nouter, 528
-.set nb410_ninner, 532
-        push %rbp
-        movq %rsp,%rbp
-        push %rbx
-
-
-        emms
-
-        push %r12
-        push %r13
-        push %r14
-        push %r15
-
-        subq $552,%rsp          ## local variable stack space (n*16+8)
-
-        ## zero 32-bit iteration counters
-        movl $0,%eax
-        movl %eax,nb410_nouter(%rsp)
-        movl %eax,nb410_ninner(%rsp)
-
-        movl (%rdi),%edi
-        movl %edi,nb410_nri(%rsp)
-        movq %rsi,nb410_iinr(%rsp)
-        movq %rdx,nb410_jindex(%rsp)
-        movq %rcx,nb410_jjnr(%rsp)
-        movq %r8,nb410_shift(%rsp)
-        movq %r9,nb410_shiftvec(%rsp)
-        movq nb410_p_ntype(%rbp),%rdi
-        movl (%rdi),%edi
-        movl %edi,nb410_ntype(%rsp)
-        movq nb410_p_facel(%rbp),%rsi
-        movsd (%rsi),%xmm0
-        movsd %xmm0,nb410_facel(%rsp)
-
-        movq nb410_p_gbtabscale(%rbp),%rbx
-        movsd (%rbx),%xmm4
-        shufpd $0,%xmm4,%xmm4
-        movapd %xmm4,nb410_gbtsc(%rsp)
-
-        ## create constant floating-point factors on stack
-        movl $0x00000000,%eax   ## lower half of double half IEEE (hex)
-        movl $0x3fe00000,%ebx
-        movl %eax,nb410_half(%rsp)
-        movl %ebx,nb410_half+4(%rsp)
-        movsd nb410_half(%rsp),%xmm1
-        shufpd $0,%xmm1,%xmm1  ## splat to all elements
-        movapd %xmm1,%xmm3
-        addpd  %xmm3,%xmm3      ## one
-        movapd %xmm3,%xmm2
-        addpd  %xmm2,%xmm2      ## two
-        addpd  %xmm2,%xmm3      ## three
-        movapd %xmm3,%xmm4
-        addpd  %xmm4,%xmm4      ## six
-        movapd %xmm4,%xmm5
-        addpd  %xmm5,%xmm5      ## twelve
-        movapd %xmm1,nb410_half(%rsp)
-        movapd %xmm2,nb410_two(%rsp)
-        movapd %xmm3,nb410_three(%rsp)
-        movapd %xmm4,nb410_six(%rsp)
-        movapd %xmm5,nb410_twelve(%rsp)
-
-_nb_kernel410_x86_64_sse2.nb410_threadloop: 
-        movq  nb410_count(%rbp),%rsi            ## pointer to sync counter
-        movl  (%rsi),%eax
-_nb_kernel410_x86_64_sse2.nb410_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+10
-        lock 
-        cmpxchgl %ebx,(%rsi)                    ## write nn1 to *counter,
-                                                ## if it hasnt changed.
-                                                ## or reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel410_x86_64_sse2.nb410_spinlock
-
-        ## if(nn1>nri) nn1=nri
-        movl nb410_nri(%rsp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx
-        cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb410_n(%rsp)
-        movl %ebx,nb410_nn1(%rsp)
-        subl %eax,%ebx                          ## calc number of outer lists
-        movl %eax,%esi                          ## copy n to esi
-        jg  _nb_kernel410_x86_64_sse2.nb410_outerstart
-        jmp _nb_kernel410_x86_64_sse2.nb410_end
-
-_nb_kernel410_x86_64_sse2.nb410_outerstart: 
-        ## ebx contains number of outer iterations
-        addl nb410_nouter(%rsp),%ebx
-        movl %ebx,nb410_nouter(%rsp)
-
-_nb_kernel410_x86_64_sse2.nb410_outer: 
-        movq  nb410_shift(%rsp),%rax        ## rax = pointer into shift[] 
-        movl  (%rax,%rsi,4),%ebx        ## rbx=shift[n] 
-
-        lea  (%rbx,%rbx,2),%rbx    ## rbx=3*is 
-        movl  %ebx,nb410_is3(%rsp)      ## store is3 
-
-        movq  nb410_shiftvec(%rsp),%rax     ## rax = base of shiftvec[] 
-
-        movsd (%rax,%rbx,8),%xmm0
-        movsd 8(%rax,%rbx,8),%xmm1
-        movsd 16(%rax,%rbx,8),%xmm2
-
-        movq  nb410_iinr(%rsp),%rcx         ## rcx = pointer into iinr[]        
-        movl  (%rcx,%rsi,4),%ebx    ## ebx =ii 
-        movl  %ebx,nb410_ii(%rsp)
-
-        movq  nb410_charge(%rbp),%rdx
-        movsd (%rdx,%rbx,8),%xmm3
-        mulsd nb410_facel(%rsp),%xmm3
-        shufpd $0,%xmm3,%xmm3
-
-        movq  nb410_invsqrta(%rbp),%rdx         ## load invsqrta[ii]
-        movsd (%rdx,%rbx,8),%xmm4
-        shufpd $0,%xmm4,%xmm4
-
-        movq  nb410_type(%rbp),%rdx
-        movl  (%rdx,%rbx,4),%edx
-        imull nb410_ntype(%rsp),%edx
-        shll  %edx
-        movl  %edx,nb410_ntia(%rsp)
-
-        lea  (%rbx,%rbx,2),%rbx        ## rbx = 3*ii=ii3 
-        movq  nb410_pos(%rbp),%rax      ## rax = base of pos[]  
-
-        addsd (%rax,%rbx,8),%xmm0
-        addsd 8(%rax,%rbx,8),%xmm1
-        addsd 16(%rax,%rbx,8),%xmm2
-
-        movapd %xmm3,nb410_iq(%rsp)
-        movapd %xmm4,nb410_isai(%rsp)
-
-        shufpd $0,%xmm0,%xmm0
-        shufpd $0,%xmm1,%xmm1
-        shufpd $0,%xmm2,%xmm2
-
-        movapd %xmm0,nb410_ix(%rsp)
-        movapd %xmm1,nb410_iy(%rsp)
-        movapd %xmm2,nb410_iz(%rsp)
-
-        movl  %ebx,nb410_ii3(%rsp)
-
-        ## clear vctot and i forces 
-        xorpd %xmm13,%xmm13
-        movapd %xmm13,%xmm12
-        movapd %xmm13,nb410_Vvdwtot(%rsp)
-        movapd %xmm13,nb410_dvdasum(%rsp)
-        movapd %xmm13,%xmm14
-        movapd %xmm13,%xmm15
-
-        movq  nb410_jindex(%rsp),%rax
-        movl  (%rax,%rsi,4),%ecx             ## jindex[n] 
-        movl  4(%rax,%rsi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movq  nb410_pos(%rbp),%rsi
-        movq  nb410_faction(%rbp),%rdi
-        movq  nb410_jjnr(%rsp),%rax
-        shll  $2,%ecx
-        addq  %rcx,%rax
-        movq  %rax,nb410_innerjjnr(%rsp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $2,%edx
-        addl  nb410_ninner(%rsp),%ecx
-        movl  %ecx,nb410_ninner(%rsp)
-        addl  $0,%edx
-        movl  %edx,nb410_innerk(%rsp)      ## number of innerloop atoms 
-        jge   _nb_kernel410_x86_64_sse2.nb410_unroll_loop
-        jmp   _nb_kernel410_x86_64_sse2.nb410_checksingle
-_nb_kernel410_x86_64_sse2.nb410_unroll_loop: 
-        ## twice unrolled innerloop here 
-        movq  nb410_innerjjnr(%rsp),%rdx       ## pointer to jjnr[k] 
-        movl  (%rdx),%r14d
-        movl  4(%rdx),%r15d
-        addq $8,nb410_innerjjnr(%rsp)             ## advance pointer (unrolled 2) 
-
-        movq nb410_pos(%rbp),%rsi        ## base of pos[] 
-
-        lea  (%r14,%r14,2),%r10     ## replace jnr with j3 
-        lea  (%r15,%r15,2),%r11
-
-        ## move two coordinates to xmm4-xmm6    
-        movlpd (%rsi,%r10,8),%xmm4
-        movlpd 8(%rsi,%r10,8),%xmm5
-        movlpd 16(%rsi,%r10,8),%xmm6
-        movhpd (%rsi,%r11,8),%xmm4
-        movhpd 8(%rsi,%r11,8),%xmm5
-        movhpd 16(%rsi,%r11,8),%xmm6
-
-        ## calc dr 
-        subpd nb410_ix(%rsp),%xmm4
-        subpd nb410_iy(%rsp),%xmm5
-        subpd nb410_iz(%rsp),%xmm6
-
-        ## store dr 
-        movapd %xmm4,nb410_dx(%rsp)
-        movapd %xmm5,nb410_dy(%rsp)
-        movapd %xmm6,nb410_dz(%rsp)
-
-        ## load isaj
-        movq nb410_invsqrta(%rbp),%rsi
-
-        ## square it 
-        mulpd %xmm4,%xmm4
-        mulpd %xmm5,%xmm5
-        mulpd %xmm6,%xmm6
-        addpd %xmm5,%xmm4
-        addpd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        movlpd (%rsi,%r14,8),%xmm3
-        movhpd (%rsi,%r15,8),%xmm3
-
-        movq nb410_type(%rbp),%rdi
-        movl (%rdi,%r14,4),%r8d
-        movl (%rdi,%r15,4),%r9d
-
-        cvtpd2ps %xmm4,%xmm5
-        rsqrtps %xmm5,%xmm5
-        cvtps2pd %xmm5,%xmm2    ## lu in low xmm2 
-
-        mulpd  nb410_isai(%rsp),%xmm3
-        movapd %xmm3,nb410_isaprod(%rsp)
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulpd %xmm2,%xmm2       ## lu*lu 
-        movapd nb410_three(%rsp),%xmm1
-        mulpd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb410_half(%rsp),%xmm0
-        subpd %xmm2,%xmm1       ## 30-rsq*lu*lu 
-        mulpd %xmm5,%xmm1
-        mulpd %xmm0,%xmm1       ## xmm0=iter1 of rinv (new lu) 
-
-        movapd %xmm3,%xmm6
-        mulpd nb410_gbtsc(%rsp),%xmm6
-        movapd %xmm6,nb410_gbscale(%rsp)
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulpd %xmm1,%xmm1       ## lu*lu 
-        movapd nb410_three(%rsp),%xmm2
-        mulpd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb410_half(%rsp),%xmm0
-        subpd %xmm1,%xmm2       ## 30-rsq*lu*lu 
-        mulpd %xmm5,%xmm2
-        mulpd %xmm2,%xmm0       ## xmm0=rinv 
-
-    mulpd  nb410_iq(%rsp),%xmm3
-        movq nb410_charge(%rbp),%rsi     ## base of charge[] 
-        movlpd (%rsi,%r14,8),%xmm6
-        movhpd (%rsi,%r15,8),%xmm6
-        mulpd  %xmm3,%xmm6
-        movapd %xmm6,nb410_qq(%rsp)
-
-        mulpd %xmm0,%xmm4       ## xmm4=r 
-        movapd %xmm4,nb410_r(%rsp)
-        mulpd nb410_gbscale(%rsp),%xmm4
-        movl nb410_ntia(%rsp),%edi
-
-        cvttpd2pi %xmm4,%mm6    ## mm6 = lu idx 
-        shll %r8d
-        shll %r9d
-        addl %edi,%r8d
-        addl %edi,%r9d
-
-        cvtpi2pd %mm6,%xmm5
-        subpd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-        movapd %xmm1,%xmm2
-        mulpd  %xmm2,%xmm2      ## xmm2=eps2 
-        movq nb410_vdwparam(%rbp),%rdi
-
-        pslld $2,%mm6           ## idx *= 4 
-
-        movq nb410_GBtab(%rbp),%rsi
-        movd %mm6,%r12d
-        psrlq $32,%mm6
-        movd %mm6,%r13d         ## indices in r12/r13
-
-        movlpd (%rdi,%r8,8),%xmm6
-        movlpd 8(%rdi,%r8,8),%xmm7
-
-    movapd %xmm0,%xmm9 ## rinv
-    mulpd  %xmm9,%xmm9 ## rinvsq
-    movapd %xmm9,%xmm10 ## rinvsq
-    mulpd  %xmm10,%xmm10 ## rinv4
-    mulpd  %xmm9,%xmm10 ## rinv6
-    movapd %xmm10,%xmm11
-    mulpd  %xmm11,%xmm11 ## rinv12
-
-
-        movhpd (%rdi,%r9,8),%xmm6
-        movhpd 8(%rdi,%r9,8),%xmm7
-
-    ## load table data
-        movapd (%rsi,%r12,8),%xmm4      ## Y1 F1        
-        movapd (%rsi,%r13,8),%xmm3      ## Y2 F2 
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 Y2 
-        unpckhpd %xmm3,%xmm5    ## F1 F2 
-
-    mulpd  %xmm6,%xmm10  ## vvdw6=c6*rinv6
-        mulpd  %xmm7,%xmm11  ## vvdw12=c12*rinv12     
-
-        movapd %xmm11,%xmm9
-        subpd  %xmm10,%xmm11    ## Vvdw=Vvdw12-Vvdw6
-
-    ## add potential to vvdwtot 
-        addpd  nb410_Vvdwtot(%rsp),%xmm11
-    movapd %xmm11,nb410_Vvdwtot(%rsp)
-
-        movapd 16(%rsi,%r12,8),%xmm6    ## G1 H1        
-        movapd 16(%rsi,%r13,8),%xmm3    ## G2 H2 
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 G2 
-        unpckhpd %xmm3,%xmm7    ## H1 H2 
-        ## coulomb table ready, in xmm4-xmm7            
-
-        mulpd  %xmm1,%xmm7      ## xmm7=Heps
-        mulpd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulpd  %xmm1,%xmm7      ## xmm7=Heps2 
-        addpd  %xmm6,%xmm5
-        addpd  %xmm7,%xmm5      ## xmm5=Fp      
-        mulpd  nb410_two(%rsp),%xmm7    ## two*Heps2 
-        movapd nb410_qq(%rsp),%xmm3
-        addpd  %xmm6,%xmm7
-        addpd  %xmm5,%xmm7 ## xmm7=FF 
-        mulpd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addpd  %xmm4,%xmm5 ## xmm5=VV 
-        mulpd  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulpd  %xmm7,%xmm3 ## fijC=FF*qq 
-
-   ## LJ forces
-    mulpd  nb410_six(%rsp),%xmm10
-    mulpd  nb410_twelve(%rsp),%xmm9
-    subpd  %xmm10,%xmm9
-    mulpd  %xmm0,%xmm9 ## (12*vnb12-6*vnb6)*rinv
-
-        movq nb410_dvda(%rbp),%rsi
-
-        ## Calculate dVda
-        xorpd %xmm7,%xmm7
-        mulpd nb410_gbscale(%rsp),%xmm3
-        movapd %xmm3,%xmm6
-        mulpd  nb410_r(%rsp),%xmm6
-        addpd  %xmm5,%xmm6
-
-    ## update vctot 
-        addpd  %xmm5,%xmm12
-
-        ## xmm6=(vcoul+fijC*r)
-        subpd  %xmm6,%xmm7
-        movapd %xmm7,%xmm6
-
-        movq nb410_faction(%rbp),%rdi
-        ## the fj's - start by accumulating forces from memory 
-        movlpd (%rdi,%r10,8),%xmm2
-        movlpd 8(%rdi,%r10,8),%xmm4
-        movlpd 16(%rdi,%r10,8),%xmm5
-
-        ## update dvdasum
-        addpd  nb410_dvdasum(%rsp),%xmm7
-        movapd %xmm7,nb410_dvdasum(%rsp)
-
-        ## update j atoms dvdaj
-        movhlps %xmm6,%xmm7
-        addsd  (%rsi,%r14,8),%xmm6
-        addsd  (%rsi,%r15,8),%xmm7
-        movsd  %xmm6,(%rsi,%r14,8)
-        movsd  %xmm7,(%rsi,%r15,8)
-
-        movhpd (%rdi,%r11,8),%xmm2
-        movhpd 8(%rdi,%r11,8),%xmm4
-        movhpd 16(%rdi,%r11,8),%xmm5
-
-    subpd  %xmm3,%xmm9
-    mulpd  %xmm0,%xmm9 ## fscal
-
-    movapd  %xmm9,%xmm10
-    movapd  %xmm9,%xmm11
-
-    mulpd   nb410_dx(%rsp),%xmm9
-    mulpd   nb410_dy(%rsp),%xmm10
-    mulpd   nb410_dz(%rsp),%xmm11
-
-        addpd %xmm9,%xmm2
-        addpd %xmm10,%xmm4
-        addpd %xmm11,%xmm5
-
-        movlpd %xmm2,(%rdi,%r10,8)
-        movlpd %xmm4,8(%rdi,%r10,8)
-        movlpd %xmm5,16(%rdi,%r10,8)
-
-        ## accumulate i forces
-    addpd %xmm9,%xmm13
-    addpd %xmm10,%xmm14
-    addpd %xmm11,%xmm15
-
-        movhpd %xmm2,(%rdi,%r11,8)
-        movhpd %xmm4,8(%rdi,%r11,8)
-        movhpd %xmm5,16(%rdi,%r11,8)
-
-        ## should we do one more iteration? 
-        subl $2,nb410_innerk(%rsp)
-        jl    _nb_kernel410_x86_64_sse2.nb410_checksingle
-        jmp   _nb_kernel410_x86_64_sse2.nb410_unroll_loop
-_nb_kernel410_x86_64_sse2.nb410_checksingle: 
-        movl  nb410_innerk(%rsp),%edx
-        andl  $1,%edx
-        jnz    _nb_kernel410_x86_64_sse2.nb410_dosingle
-        jmp    _nb_kernel410_x86_64_sse2.nb410_updateouterdata
-_nb_kernel410_x86_64_sse2.nb410_dosingle: 
-        movq nb410_charge(%rbp),%rsi
-        movq nb410_invsqrta(%rbp),%rdx
-        movq nb410_pos(%rbp),%rdi
-        movq  nb410_innerjjnr(%rsp),%rcx
-        movl  (%rcx),%eax
-
-        ## load isaj
-        movq nb410_invsqrta(%rbp),%rsi
-        movsd (%rsi,%rax,8),%xmm2
-        mulsd  nb410_isai(%rsp),%xmm2
-        movapd %xmm2,nb410_isaprod(%rsp)
-        movapd %xmm2,%xmm1
-        mulsd nb410_gbtsc(%rsp),%xmm1
-        movapd %xmm1,nb410_gbscale(%rsp)
-
-    mulsd nb410_iq(%rsp),%xmm2
-        movq nb410_charge(%rbp),%rsi     ## base of charge[] 
-        movsd (%rsi,%rax,8),%xmm3
-        mulsd  %xmm2,%xmm3
-        movapd %xmm3,nb410_qq(%rsp)
-
-        movq nb410_type(%rbp),%rsi
-        movl (%rsi,%rax,4),%r8d
-        movq nb410_vdwparam(%rbp),%rsi
-        shll %r8d
-        movl nb410_ntia(%rsp),%edi
-        addl %edi,%r8d
-
-        movsd (%rsi,%r8,8),%xmm4
-        movsd 8(%rsi,%r8,8),%xmm6
-        movapd %xmm4,nb410_c6(%rsp)
-        movapd %xmm6,nb410_c12(%rsp)
-
-        movq nb410_pos(%rbp),%rsi        ## base of pos[] 
-
-        lea  (%rax,%rax,2),%r10     ## replace jnr with j3 
-
-        ## move two coordinates to xmm4-xmm6    
-        movsd (%rsi,%r10,8),%xmm4
-        movsd 8(%rsi,%r10,8),%xmm5
-        movsd 16(%rsi,%r10,8),%xmm6
-
-        ## calc dr 
-        subsd nb410_ix(%rsp),%xmm4
-        subsd nb410_iy(%rsp),%xmm5
-        subsd nb410_iz(%rsp),%xmm6
-
-        ## store dr 
-        movapd %xmm4,nb410_dx(%rsp)
-        movapd %xmm5,nb410_dy(%rsp)
-        movapd %xmm6,nb410_dz(%rsp)
-
-        ## square it 
-        mulsd %xmm4,%xmm4
-        mulsd %xmm5,%xmm5
-        mulsd %xmm6,%xmm6
-        addsd %xmm5,%xmm4
-        addsd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        cvtsd2ss %xmm4,%xmm5
-        rsqrtss %xmm5,%xmm5
-        cvtss2sd %xmm5,%xmm2    ## lu in low xmm2 
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulsd %xmm2,%xmm2       ## lu*lu 
-        movapd nb410_three(%rsp),%xmm1
-        mulsd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb410_half(%rsp),%xmm0
-        subsd %xmm2,%xmm1       ## 30-rsq*lu*lu 
-        mulsd %xmm5,%xmm1
-        mulsd %xmm0,%xmm1       ## xmm0=iter1 of rinv (new lu) 
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulsd %xmm1,%xmm1       ## lu*lu 
-        movapd nb410_three(%rsp),%xmm2
-        mulsd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb410_half(%rsp),%xmm0
-        subsd %xmm1,%xmm2       ## 30-rsq*lu*lu 
-        mulsd %xmm5,%xmm2
-        mulsd %xmm2,%xmm0       ## xmm0=rinv 
-
-        mulsd %xmm0,%xmm4       ## xmm4=r 
-        movapd %xmm4,nb410_r(%rsp)
-        mulsd nb410_gbscale(%rsp),%xmm4
-
-        cvttsd2si %xmm4,%r12d   ## mm6 = lu idx 
-        cvtsi2sd %r12d,%xmm5
-        subsd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-        movapd %xmm1,%xmm2
-        mulsd  %xmm2,%xmm2      ## xmm2=eps2 
-
-        shll $2,%r12d           ## idx *= 4 
-
-        movq nb410_GBtab(%rbp),%rsi
-
-    movapd %xmm0,%xmm9 ## rinv
-    mulsd  %xmm9,%xmm9 ## rinvsq
-    movapd %xmm9,%xmm10 ## rinvsq
-    mulsd  %xmm10,%xmm10 ## rinv4
-    mulsd  %xmm9,%xmm10 ## rinv6
-    movapd %xmm10,%xmm11
-    mulsd  %xmm11,%xmm11 ## rinv12
-
-    ## load table data
-        movapd (%rsi,%r12,8),%xmm4      ## Y1 F1        
-    movhlps %xmm4,%xmm5
-
-    mulsd  nb410_c6(%rsp),%xmm10      ## vvdw6=c6*rinv6
-        mulsd  nb410_c12(%rsp),%xmm11     ## vvdw12=c12*rinv12     
-
-        movapd %xmm11,%xmm9
-        subsd  %xmm10,%xmm11    ## Vvdw=Vvdw12-Vvdw6
-
-    ## add potential to vvdwtot 
-        addsd  nb410_Vvdwtot(%rsp),%xmm11
-    movsd %xmm11,nb410_Vvdwtot(%rsp)
-
-        movapd 16(%rsi,%r12,8),%xmm6    ## G1 H1        
-    movhlps %xmm6,%xmm7
-        ## coulomb table ready, in xmm4-xmm7            
-
-        mulsd  %xmm1,%xmm7      ## xmm7=Heps
-        mulsd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulsd  %xmm1,%xmm7      ## xmm7=Heps2 
-        addsd  %xmm6,%xmm5
-        addsd  %xmm7,%xmm5      ## xmm5=Fp      
-        mulsd  nb410_two(%rsp),%xmm7    ## two*Heps2 
-        movapd nb410_qq(%rsp),%xmm3
-        addsd  %xmm6,%xmm7
-        addsd  %xmm5,%xmm7 ## xmm7=FF 
-        mulsd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addsd  %xmm4,%xmm5 ## xmm5=VV 
-        mulsd  %xmm3,%xmm5 ## vcoul=qq*VV  
-        mulsd  %xmm7,%xmm3 ## fijC=FF*qq 
-
-   ## LJ forces
-    mulsd  nb410_six(%rsp),%xmm10
-    mulsd  nb410_twelve(%rsp),%xmm9
-    subsd  %xmm10,%xmm9
-    mulsd  %xmm0,%xmm9 ## (12*vnb12-6*vnb6)*rinv
-
-        movq nb410_dvda(%rbp),%rsi
-
-        ## Calculate dVda
-        xorpd %xmm7,%xmm7
-        mulsd nb410_gbscale(%rsp),%xmm3
-        movapd %xmm3,%xmm6
-        mulsd  nb410_r(%rsp),%xmm6
-        addsd  %xmm5,%xmm6
-
-    ## update vctot 
-        addsd  %xmm5,%xmm12
-
-        ## xmm6=(vcoul+fijC*r)
-        subsd  %xmm6,%xmm7
-        movapd %xmm7,%xmm6
-
-        ## update dvdasum
-        addsd  nb410_dvdasum(%rsp),%xmm7
-        movsd %xmm7,nb410_dvdasum(%rsp)
-
-        ## update j atoms dvdaj
-        movhlps %xmm6,%xmm7
-        addsd  (%rsi,%rax,8),%xmm6
-        addsd  (%rsi,%rbx,8),%xmm7
-        movsd  %xmm6,(%rsi,%rax,8)
-        movsd  %xmm7,(%rsi,%rbx,8)
-
-    subsd  %xmm3,%xmm9
-    mulsd  %xmm0,%xmm9 ## fscal
-
-    movapd  %xmm9,%xmm10
-    movapd  %xmm9,%xmm11
-
-    mulsd   nb410_dx(%rsp),%xmm9
-    mulsd   nb410_dy(%rsp),%xmm10
-    mulsd   nb410_dz(%rsp),%xmm11
-
-        ## accumulate i forces
-    addsd %xmm9,%xmm13
-    addsd %xmm10,%xmm14
-    addsd %xmm11,%xmm15
-
-        movq nb410_faction(%rbp),%rdi
-        ## the fj's - start by accumulating forces from memory 
-        addsd (%rdi,%r10,8),%xmm9
-        addsd 8(%rdi,%r10,8),%xmm10
-        addsd 16(%rdi,%r10,8),%xmm11
-        movsd %xmm9,(%rdi,%r10,8)
-        movsd %xmm10,8(%rdi,%r10,8)
-        movsd %xmm11,16(%rdi,%r10,8)
-
-_nb_kernel410_x86_64_sse2.nb410_updateouterdata: 
-        movl  nb410_ii3(%rsp),%ecx
-        movq  nb410_faction(%rbp),%rdi
-        movq  nb410_fshift(%rbp),%rsi
-        movl  nb410_is3(%rsp),%edx
-
-        ## accumulate i forces in xmm13, xmm14, xmm15
-        movhlps %xmm13,%xmm3
-        movhlps %xmm14,%xmm4
-        movhlps %xmm15,%xmm5
-        addsd  %xmm3,%xmm13
-        addsd  %xmm4,%xmm14
-        addsd  %xmm5,%xmm15 ## sum is in low xmm13-xmm15
-
-        ## increment i force 
-        movsd  (%rdi,%rcx,8),%xmm3
-        movsd  8(%rdi,%rcx,8),%xmm4
-        movsd  16(%rdi,%rcx,8),%xmm5
-        subsd  %xmm13,%xmm3
-        subsd  %xmm14,%xmm4
-        subsd  %xmm15,%xmm5
-        movsd  %xmm3,(%rdi,%rcx,8)
-        movsd  %xmm4,8(%rdi,%rcx,8)
-        movsd  %xmm5,16(%rdi,%rcx,8)
-
-        ## increment fshift force  
-        movsd  (%rsi,%rdx,8),%xmm3
-        movsd  8(%rsi,%rdx,8),%xmm4
-        movsd  16(%rsi,%rdx,8),%xmm5
-        subsd  %xmm13,%xmm3
-        subsd  %xmm14,%xmm4
-        subsd  %xmm15,%xmm5
-        movsd  %xmm3,(%rsi,%rdx,8)
-        movsd  %xmm4,8(%rsi,%rdx,8)
-        movsd  %xmm5,16(%rsi,%rdx,8)
-
-        ## get n from stack
-        movl nb410_n(%rsp),%esi
-        ## get group index for i particle 
-        movq  nb410_gid(%rbp),%rdx              ## base of gid[]
-        movl  (%rdx,%rsi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total potential energy and update it 
-        movhlps %xmm12,%xmm6
-        addsd  %xmm6,%xmm12     ## low xmm12 has the sum now 
-
-        ## add earlier value from mem 
-        movq  nb410_Vc(%rbp),%rax
-        addsd (%rax,%rdx,8),%xmm12
-        ## move back to mem 
-        movsd %xmm12,(%rax,%rdx,8)
-
-        ## accumulate total lj energy and update it 
-        movapd nb410_Vvdwtot(%rsp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now 
-
-        ## add earlier value from mem 
-        movq  nb410_Vvdw(%rbp),%rax
-        addsd (%rax,%rdx,8),%xmm7
-        ## move back to mem 
-        movsd %xmm7,(%rax,%rdx,8)
-
-        ## accumulate dVda and update it 
-        movapd nb410_dvdasum(%rsp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now 
-
-        movl nb410_ii(%rsp),%edx
-        movq nb410_dvda(%rbp),%rax
-        addsd (%rax,%rdx,8),%xmm7
-        movsd %xmm7,(%rax,%rdx,8)
-
-        ## finish if last 
-        movl nb410_nn1(%rsp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx
-        jz _nb_kernel410_x86_64_sse2.nb410_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb410_n(%rsp)
-        jmp _nb_kernel410_x86_64_sse2.nb410_outer
-_nb_kernel410_x86_64_sse2.nb410_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb410_nri(%rsp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx
-        jz _nb_kernel410_x86_64_sse2.nb410_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel410_x86_64_sse2.nb410_threadloop
-_nb_kernel410_x86_64_sse2.nb410_end: 
-        movl nb410_nouter(%rsp),%eax
-        movl nb410_ninner(%rsp),%ebx
-        movq nb410_outeriter(%rbp),%rcx
-        movq nb410_inneriter(%rbp),%rdx
-        movl %eax,(%rcx)
-        movl %ebx,(%rdx)
-
-        addq $552,%rsp
-        emms
-
-
-        pop %r15
-        pop %r14
-        pop %r13
-        pop %r12
-
-        pop %rbx
-        pop    %rbp
-        ret
-
-
-
-
-
-
-
-
-
-.globl nb_kernel410nf_x86_64_sse2
-.globl _nb_kernel410nf_x86_64_sse2
-nb_kernel410nf_x86_64_sse2:     
-_nb_kernel410nf_x86_64_sse2:    
-##      Room for return address and rbp (16 bytes)
-.set nb410nf_fshift, 16
-.set nb410nf_gid, 24
-.set nb410nf_pos, 32
-.set nb410nf_faction, 40
-.set nb410nf_charge, 48
-.set nb410nf_p_facel, 56
-.set nb410nf_argkrf, 64
-.set nb410nf_argcrf, 72
-.set nb410nf_Vc, 80
-.set nb410nf_type, 88
-.set nb410nf_p_ntype, 96
-.set nb410nf_vdwparam, 104
-.set nb410nf_Vvdw, 112
-.set nb410nf_p_tabscale, 120
-.set nb410nf_VFtab, 128
-.set nb410nf_invsqrta, 136
-.set nb410nf_dvda, 144
-.set nb410nf_p_gbtabscale, 152
-.set nb410nf_GBtab, 160
-.set nb410nf_p_nthreads, 168
-.set nb410nf_count, 176
-.set nb410nf_mtx, 184
-.set nb410nf_outeriter, 192
-.set nb410nf_inneriter, 200
-.set nb410nf_work, 208
-        ## stack offsets for local variables  
-        ## bottom of stack is cache-aligned for sse2 use 
-.set nb410nf_ix, 0
-.set nb410nf_iy, 16
-.set nb410nf_iz, 32
-.set nb410nf_iq, 48
-.set nb410nf_two, 64
-.set nb410nf_gbtsc, 80
-.set nb410nf_qq, 96
-.set nb410nf_c6, 112
-.set nb410nf_c12, 128
-.set nb410nf_vctot, 144
-.set nb410nf_Vvdwtot, 160
-.set nb410nf_half, 176
-.set nb410nf_three, 192
-.set nb410nf_r, 208
-.set nb410nf_isai, 224
-.set nb410nf_isaprod, 240
-.set nb410nf_gbscale, 256
-.set nb410nf_nri, 272
-.set nb410nf_iinr, 280
-.set nb410nf_jindex, 288
-.set nb410nf_jjnr, 296
-.set nb410nf_shift, 304
-.set nb410nf_shiftvec, 312
-.set nb410nf_facel, 320
-.set nb410nf_innerjjnr, 328
-.set nb410nf_ii, 336
-.set nb410nf_is3, 340
-.set nb410nf_ii3, 344
-.set nb410nf_ntia, 348
-.set nb410nf_innerk, 352
-.set nb410nf_n, 356
-.set nb410nf_nn1, 360
-.set nb410nf_ntype, 364
-.set nb410nf_nouter, 368
-.set nb410nf_ninner, 372
-        push %rbp
-        movq %rsp,%rbp
-        push %rbx
-
-
-        emms
-
-        push %r12
-        push %r13
-        push %r14
-        push %r15
-
-        subq $392,%rsp          ## local variable stack space (n*16+8)
-
-        ## zero 32-bit iteration counters
-        movl $0,%eax
-        movl %eax,nb410nf_nouter(%rsp)
-        movl %eax,nb410nf_ninner(%rsp)
-
-        movl (%rdi),%edi
-        movl %edi,nb410nf_nri(%rsp)
-        movq %rsi,nb410nf_iinr(%rsp)
-        movq %rdx,nb410nf_jindex(%rsp)
-        movq %rcx,nb410nf_jjnr(%rsp)
-        movq %r8,nb410nf_shift(%rsp)
-        movq %r9,nb410nf_shiftvec(%rsp)
-        movq nb410nf_p_ntype(%rbp),%rdi
-        movl (%rdi),%edi
-        movl %edi,nb410nf_ntype(%rsp)
-        movq nb410nf_p_facel(%rbp),%rsi
-        movsd (%rsi),%xmm0
-        movsd %xmm0,nb410nf_facel(%rsp)
-
-        movq nb410nf_p_gbtabscale(%rbp),%rbx
-        movsd (%rbx),%xmm4
-        shufpd $0,%xmm4,%xmm4
-        movapd %xmm4,nb410nf_gbtsc(%rsp)
-
-        ## create constant floating-point factors on stack
-        movl $0x00000000,%eax   ## lower half of double half IEEE (hex)
-        movl $0x3fe00000,%ebx
-        movl %eax,nb410nf_half(%rsp)
-        movl %ebx,nb410nf_half+4(%rsp)
-        movsd nb410nf_half(%rsp),%xmm1
-        shufpd $0,%xmm1,%xmm1  ## splat to all elements
-        movapd %xmm1,%xmm3
-        addpd  %xmm3,%xmm3      ## one
-        movapd %xmm3,%xmm2
-        addpd  %xmm2,%xmm2      ## two
-        addpd  %xmm2,%xmm3      ## three
-        movapd %xmm1,nb410nf_half(%rsp)
-        movapd %xmm2,nb410nf_two(%rsp)
-        movapd %xmm3,nb410nf_three(%rsp)
-
-_nb_kernel410nf_x86_64_sse2.nb410nf_threadloop: 
-        movq  nb410nf_count(%rbp),%rsi            ## pointer to sync counter
-        movl  (%rsi),%eax
-_nb_kernel410nf_x86_64_sse2.nb410nf_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+10
-        lock 
-        cmpxchgl %ebx,(%rsi)                    ## write nn1 to *counter,
-                                                ## if it hasnt changed.
-                                                ## or reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel410nf_x86_64_sse2.nb410nf_spinlock
-
-        ## if(nn1>nri) nn1=nri
-        movl nb410nf_nri(%rsp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx
-        cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb410nf_n(%rsp)
-        movl %ebx,nb410nf_nn1(%rsp)
-        subl %eax,%ebx                          ## calc number of outer lists
-        movl %eax,%esi                          ## copy n to esi
-        jg  _nb_kernel410nf_x86_64_sse2.nb410nf_outerstart
-        jmp _nb_kernel410nf_x86_64_sse2.nb410nf_end
-
-_nb_kernel410nf_x86_64_sse2.nb410nf_outerstart: 
-        ## ebx contains number of outer iterations
-        addl nb410nf_nouter(%rsp),%ebx
-        movl %ebx,nb410nf_nouter(%rsp)
-
-_nb_kernel410nf_x86_64_sse2.nb410nf_outer: 
-        movq  nb410nf_shift(%rsp),%rax        ## rax = pointer into shift[] 
-        movl  (%rax,%rsi,4),%ebx        ## rbx=shift[n] 
-
-        lea  (%rbx,%rbx,2),%rbx    ## rbx=3*is 
-        movl  %ebx,nb410nf_is3(%rsp)            ## store is3 
-
-        movq  nb410nf_shiftvec(%rsp),%rax     ## rax = base of shiftvec[] 
-
-        movsd (%rax,%rbx,8),%xmm0
-        movsd 8(%rax,%rbx,8),%xmm1
-        movsd 16(%rax,%rbx,8),%xmm2
-
-        movq  nb410nf_iinr(%rsp),%rcx         ## rcx = pointer into iinr[]      
-        movl  (%rcx,%rsi,4),%ebx    ## ebx =ii 
-        movl  %ebx,nb410nf_ii(%rsp)
-
-        movq  nb410nf_charge(%rbp),%rdx
-        movsd (%rdx,%rbx,8),%xmm3
-        mulsd nb410nf_facel(%rsp),%xmm3
-        shufpd $0,%xmm3,%xmm3
-
-        movq  nb410nf_invsqrta(%rbp),%rdx       ## load invsqrta[ii]
-        movsd (%rdx,%rbx,8),%xmm4
-        shufpd $0,%xmm4,%xmm4
-
-        movq  nb410nf_type(%rbp),%rdx
-        movl  (%rdx,%rbx,4),%edx
-        imull nb410nf_ntype(%rsp),%edx
-        shll  %edx
-    movl  %edx,nb410nf_ntia(%rsp)
-
-        lea  (%rbx,%rbx,2),%rbx        ## rbx = 3*ii=ii3 
-        movq  nb410nf_pos(%rbp),%rax      ## rax = base of pos[]  
-
-        addsd (%rax,%rbx,8),%xmm0
-        addsd 8(%rax,%rbx,8),%xmm1
-        addsd 16(%rax,%rbx,8),%xmm2
-
-        movapd %xmm3,nb410nf_iq(%rsp)
-        movapd %xmm4,nb410nf_isai(%rsp)
-
-        shufpd $0,%xmm0,%xmm0
-        shufpd $0,%xmm1,%xmm1
-        shufpd $0,%xmm2,%xmm2
-
-        movapd %xmm0,nb410nf_ix(%rsp)
-        movapd %xmm1,nb410nf_iy(%rsp)
-        movapd %xmm2,nb410nf_iz(%rsp)
-
-        movl  %ebx,nb410nf_ii3(%rsp)
-
-        ## clear vctot and Vvdwtot
-        xorpd %xmm4,%xmm4
-        movapd %xmm4,nb410nf_vctot(%rsp)
-        movapd %xmm4,nb410nf_Vvdwtot(%rsp)
-
-        movq  nb410nf_jindex(%rsp),%rax
-        movl  (%rax,%rsi,4),%ecx             ## jindex[n] 
-        movl  4(%rax,%rsi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movq  nb410nf_pos(%rbp),%rsi
-        movq  nb410nf_faction(%rbp),%rdi
-        movq  nb410nf_jjnr(%rsp),%rax
-        shll  $2,%ecx
-        addq  %rcx,%rax
-        movq  %rax,nb410nf_innerjjnr(%rsp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $2,%edx
-        addl  nb410nf_ninner(%rsp),%ecx
-        movl  %ecx,nb410nf_ninner(%rsp)
-        addl  $0,%edx
-        movl  %edx,nb410nf_innerk(%rsp)      ## number of innerloop atoms 
-        jge   _nb_kernel410nf_x86_64_sse2.nb410nf_unroll_loop
-        jmp   _nb_kernel410nf_x86_64_sse2.nb410nf_checksingle
-_nb_kernel410nf_x86_64_sse2.nb410nf_unroll_loop: 
-        ## twice unrolled innerloop here 
-        movq  nb410nf_innerjjnr(%rsp),%rdx       ## pointer to jjnr[k] 
-        movl  (%rdx),%eax
-        movl  4(%rdx),%ebx
-        addq $8,nb410nf_innerjjnr(%rsp)             ## advance pointer (unrolled 2) 
-
-        ## load isaj
-        movq nb410nf_invsqrta(%rbp),%rsi
-        movlpd (%rsi,%rax,8),%xmm2
-        movhpd (%rsi,%rbx,8),%xmm2
-        mulpd  nb410nf_isai(%rsp),%xmm2
-        movapd %xmm2,nb410nf_isaprod(%rsp)
-        movapd %xmm2,%xmm1
-        mulpd nb410nf_gbtsc(%rsp),%xmm1
-        movapd %xmm1,nb410nf_gbscale(%rsp)
-
-        movq nb410nf_charge(%rbp),%rsi     ## base of charge[] 
-        movlpd (%rsi,%rax,8),%xmm3
-        movhpd (%rsi,%rbx,8),%xmm3
-
-        mulpd nb410nf_iq(%rsp),%xmm2
-        mulpd  %xmm2,%xmm3
-        movapd %xmm3,nb410nf_qq(%rsp)
-
-        movd  %eax,%mm0         ## use mmx registers as temp storage 
-        movd  %ebx,%mm1
-
-        movq nb410nf_type(%rbp),%rsi
-        movl (%rsi,%rax,4),%eax
-        movl (%rsi,%rbx,4),%ebx
-        movq nb410nf_vdwparam(%rbp),%rsi
-        shll %eax
-        shll %ebx
-        movl nb410nf_ntia(%rsp),%edi
-        addl %edi,%eax
-        addl %edi,%ebx
-
-        movlpd (%rsi,%rax,8),%xmm6      ## c6a
-        movlpd (%rsi,%rbx,8),%xmm7      ## c6b
-        movhpd 8(%rsi,%rax,8),%xmm6     ## c6a c12a 
-        movhpd 8(%rsi,%rbx,8),%xmm7     ## c6b c12b 
-
-        movapd %xmm6,%xmm4
-        unpcklpd %xmm7,%xmm4
-        unpckhpd %xmm7,%xmm6
-
-        movd  %mm0,%eax
-        movd  %mm1,%ebx
-        movapd %xmm4,nb410nf_c6(%rsp)
-        movapd %xmm6,nb410nf_c12(%rsp)
-
-        movq nb410nf_pos(%rbp),%rsi        ## base of pos[] 
-
-        movd  %eax,%mm2
-        movd  %ebx,%mm3
-        lea  (%rax,%rax,2),%rax     ## replace jnr with j3 
-        lea  (%rbx,%rbx,2),%rbx
-
-        ## move two coordinates to xmm0-xmm2    
-        movlpd (%rsi,%rax,8),%xmm0
-        movlpd 8(%rsi,%rax,8),%xmm1
-        movlpd 16(%rsi,%rax,8),%xmm2
-        movhpd (%rsi,%rbx,8),%xmm0
-        movhpd 8(%rsi,%rbx,8),%xmm1
-        movhpd 16(%rsi,%rbx,8),%xmm2
-
-        ## move ix-iz to xmm4-xmm6 
-        movapd nb410nf_ix(%rsp),%xmm4
-        movapd nb410nf_iy(%rsp),%xmm5
-        movapd nb410nf_iz(%rsp),%xmm6
-
-        ## calc dr 
-        subpd %xmm0,%xmm4
-        subpd %xmm1,%xmm5
-        subpd %xmm2,%xmm6
-
-        ## square dr 
-        mulpd %xmm4,%xmm4
-        mulpd %xmm5,%xmm5
-        mulpd %xmm6,%xmm6
-        addpd %xmm5,%xmm4
-        addpd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        cvtpd2ps %xmm4,%xmm5
-        rsqrtps %xmm5,%xmm5
-        cvtps2pd %xmm5,%xmm2    ## lu in low xmm2 
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulpd %xmm2,%xmm2       ## lu*lu 
-        movapd nb410nf_three(%rsp),%xmm1
-        mulpd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb410nf_half(%rsp),%xmm0
-        subpd %xmm2,%xmm1       ## 30-rsq*lu*lu 
-        mulpd %xmm5,%xmm1
-        mulpd %xmm0,%xmm1       ## xmm0=iter1 of rinv (new lu) 
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulpd %xmm1,%xmm1       ## lu*lu 
-        movapd nb410nf_three(%rsp),%xmm2
-        mulpd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb410nf_half(%rsp),%xmm0
-        subpd %xmm1,%xmm2       ## 30-rsq*lu*lu 
-        mulpd %xmm5,%xmm2
-        mulpd %xmm2,%xmm0       ## xmm0=rinv 
-
-        mulpd %xmm0,%xmm4       ## xmm4=r 
-        movapd %xmm4,nb410nf_r(%rsp)
-        mulpd nb410nf_gbscale(%rsp),%xmm4
-
-        cvttpd2pi %xmm4,%mm6    ## mm6 = lu idx 
-        cvtpi2pd %mm6,%xmm5
-        subpd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-        movapd %xmm1,%xmm2
-        mulpd  %xmm2,%xmm2      ## xmm2=eps2 
-
-        pslld $2,%mm6           ## idx *= 4 
-
-        movd %eax,%mm0
-        movd %ebx,%mm1
-
-        movq nb410nf_GBtab(%rbp),%rsi
-        movd %mm6,%eax
-        psrlq $32,%mm6
-        movd %mm6,%ebx          ## indices in eax/ebx 
-
-        movapd (%rsi,%rax,8),%xmm4      ## Y1 F1        
-        movapd (%rsi,%rbx,8),%xmm3      ## Y2 F2 
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 Y2 
-        unpckhpd %xmm3,%xmm5    ## F1 F2 
-
-        movapd 16(%rsi,%rax,8),%xmm6    ## G1 H1        
-        movapd 16(%rsi,%rbx,8),%xmm3    ## G2 H2 
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 G2 
-        unpckhpd %xmm3,%xmm7    ## H1 H2 
-        ## coulomb table ready, in xmm4-xmm7            
-        mulpd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulpd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addpd  %xmm6,%xmm5
-        addpd  %xmm7,%xmm5      ## xmm5=Fp      
-        movapd nb410nf_qq(%rsp),%xmm3
-        mulpd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addpd  %xmm4,%xmm5 ## xmm5=VV 
-        mulpd  %xmm3,%xmm5 ## vcoul=qq*VV  
-
-        addpd  nb410nf_vctot(%rsp),%xmm5
-        movapd %xmm5,nb410nf_vctot(%rsp)
-
-        ## L-J 
-        movapd %xmm0,%xmm4
-        mulpd  %xmm0,%xmm4      ## xmm4=rinvsq 
-
-        movapd %xmm4,%xmm6
-        mulpd  %xmm4,%xmm6
-
-        mulpd  %xmm4,%xmm6      ## xmm6=rinvsix 
-        movapd %xmm6,%xmm4
-        mulpd  %xmm4,%xmm4      ## xmm4=rinvtwelve 
-        mulpd  nb410nf_c6(%rsp),%xmm6
-        mulpd  nb410nf_c12(%rsp),%xmm4
-        movapd nb410nf_Vvdwtot(%rsp),%xmm7
-        addpd  %xmm4,%xmm7
-        subpd  %xmm6,%xmm7
-        movapd %xmm7,nb410nf_Vvdwtot(%rsp)
-
-        ## should we do one more iteration? 
-        subl $2,nb410nf_innerk(%rsp)
-        jl    _nb_kernel410nf_x86_64_sse2.nb410nf_checksingle
-        jmp   _nb_kernel410nf_x86_64_sse2.nb410nf_unroll_loop
-_nb_kernel410nf_x86_64_sse2.nb410nf_checksingle: 
-        movl  nb410nf_innerk(%rsp),%edx
-        andl  $1,%edx
-        jnz    _nb_kernel410nf_x86_64_sse2.nb410nf_dosingle
-        jmp    _nb_kernel410nf_x86_64_sse2.nb410nf_updateouterdata
-_nb_kernel410nf_x86_64_sse2.nb410nf_dosingle: 
-        movq nb410nf_charge(%rbp),%rsi
-        movq nb410nf_invsqrta(%rbp),%rdx
-        movq nb410nf_pos(%rbp),%rdi
-        movq  nb410nf_innerjjnr(%rsp),%rcx
-        movl  (%rcx),%eax
-
-        xorpd  %xmm6,%xmm6
-        movapd %xmm6,%xmm7
-        movsd  (%rdx,%rax,8),%xmm7
-        movlpd (%rsi,%rax,8),%xmm6      ## xmm6(0) has the charge
-        mulsd  nb410nf_isai(%rsp),%xmm7
-        movapd %xmm7,nb410nf_isaprod(%rsp)
-        movapd %xmm7,%xmm1
-        mulpd nb410nf_gbtsc(%rsp),%xmm1
-        movapd %xmm1,nb410nf_gbscale(%rsp)
-
-        mulsd  nb410nf_iq(%rsp),%xmm7
-        mulsd  %xmm7,%xmm6
-        movapd %xmm6,nb410nf_qq(%rsp)
-
-        movd  %eax,%mm0         ## use mmx registers as temp storage 
-        movq nb410nf_type(%rbp),%rsi
-        movl (%rsi,%rax,4),%eax
-        movq nb410nf_vdwparam(%rbp),%rsi
-        shll %eax
-        movl nb410nf_ntia(%rsp),%edi
-        addl %edi,%eax
-
-        movlpd (%rsi,%rax,8),%xmm6      ## c6a
-        movhpd 8(%rsi,%rax,8),%xmm6     ## c6a c12a 
-
-        xorpd %xmm7,%xmm7
-        movapd %xmm6,%xmm4
-        unpcklpd %xmm7,%xmm4
-        unpckhpd %xmm7,%xmm6
-
-        movd  %mm0,%eax
-        movapd %xmm4,nb410nf_c6(%rsp)
-        movapd %xmm6,nb410nf_c12(%rsp)
-
-        movq nb410nf_pos(%rbp),%rsi        ## base of pos[]
-
-        movd  %eax,%mm2
-        lea  (%rax,%rax,2),%rax     ## replace jnr with j3 
-
-        ## move coordinates to xmm0-xmm2        
-        movlpd (%rsi,%rax,8),%xmm0
-        movlpd 8(%rsi,%rax,8),%xmm1
-        movlpd 16(%rsi,%rax,8),%xmm2
-
-        ## move ix-iz to xmm4-xmm6 
-        movapd nb410nf_ix(%rsp),%xmm4
-        movapd nb410nf_iy(%rsp),%xmm5
-        movapd nb410nf_iz(%rsp),%xmm6
-
-        ## calc dr 
-        subsd %xmm0,%xmm4
-        subsd %xmm1,%xmm5
-        subsd %xmm2,%xmm6
-
-        ## square it 
-        mulsd %xmm4,%xmm4
-        mulsd %xmm5,%xmm5
-        mulsd %xmm6,%xmm6
-        addsd %xmm5,%xmm4
-        addsd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        cvtsd2ss %xmm4,%xmm5
-        rsqrtss %xmm5,%xmm5
-        cvtss2sd %xmm5,%xmm2    ## lu in low xmm2 
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulsd %xmm2,%xmm2       ## lu*lu 
-        movapd nb410nf_three(%rsp),%xmm1
-        mulsd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb410nf_half(%rsp),%xmm0
-        subsd %xmm2,%xmm1       ## 30-rsq*lu*lu 
-        mulsd %xmm5,%xmm1
-        mulsd %xmm0,%xmm1       ## xmm0=iter1 of rinv (new lu) 
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulsd %xmm1,%xmm1       ## lu*lu 
-        movapd nb410nf_three(%rsp),%xmm2
-        mulsd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb410nf_half(%rsp),%xmm0
-        subsd %xmm1,%xmm2       ## 30-rsq*lu*lu 
-        mulsd %xmm5,%xmm2
-        mulsd %xmm2,%xmm0       ## xmm0=rinv 
-
-        mulsd %xmm0,%xmm4       ## xmm4=r 
-        movapd %xmm4,nb410nf_r(%rsp)
-        mulsd nb410nf_gbscale(%rsp),%xmm4
-
-        movd %eax,%mm0
-        cvttsd2si %xmm4,%eax    ## mm6 = lu idx 
-        cvtsi2sd %eax,%xmm5
-        subsd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-        movapd %xmm1,%xmm2
-        mulsd  %xmm2,%xmm2      ## xmm2=eps2 
-
-        shll $2,%eax            ## idx *= 4 
-
-        movq nb410nf_GBtab(%rbp),%rsi
-
-        movapd (%rsi,%rax,8),%xmm4      ## Y1 F1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 
-        unpckhpd %xmm3,%xmm5    ## F1 
-
-        movapd 16(%rsi,%rax,8),%xmm6    ## G1 H1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 
-        unpckhpd %xmm3,%xmm7    ## H1 
-        ## coulomb table ready, in xmm4-xmm7            
-        mulsd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulsd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addsd  %xmm6,%xmm5
-        addsd  %xmm7,%xmm5      ## xmm5=Fp      
-        movapd nb410nf_qq(%rsp),%xmm3
-        mulsd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addsd  %xmm4,%xmm5 ## xmm5=VV 
-        mulsd  %xmm3,%xmm5 ## vcoul=qq*VV  
-
-        addsd  nb410nf_vctot(%rsp),%xmm5
-        movsd %xmm5,nb410nf_vctot(%rsp)
-
-        ## L-J 
-        movapd %xmm0,%xmm4
-        mulsd  %xmm0,%xmm4      ## xmm4=rinvsq 
-
-
-        movapd %xmm4,%xmm6
-        mulsd  %xmm4,%xmm6
-
-        mulsd  %xmm4,%xmm6      ## xmm6=rinvsix 
-        movapd %xmm6,%xmm4
-        mulsd  %xmm4,%xmm4      ## xmm4=rinvtwelve 
-        mulsd  nb410nf_c6(%rsp),%xmm6
-        mulsd  nb410nf_c12(%rsp),%xmm4
-        movapd nb410nf_Vvdwtot(%rsp),%xmm7
-        addsd  %xmm4,%xmm7
-        subsd  %xmm6,%xmm7
-        movlpd %xmm7,nb410nf_Vvdwtot(%rsp)
-
-_nb_kernel410nf_x86_64_sse2.nb410nf_updateouterdata: 
-        movl  nb410nf_ii3(%rsp),%ecx
-        movl  nb410nf_is3(%rsp),%edx
-
-        ## get n from stack
-        movl nb410nf_n(%rsp),%esi
-        ## get group index for i particle 
-        movq  nb410nf_gid(%rbp),%rdx            ## base of gid[]
-        movl  (%rdx,%rsi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total potential energy and update it 
-        movapd nb410nf_vctot(%rsp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now 
-
-        ## add earlier value from mem 
-        movq  nb410nf_Vc(%rbp),%rax
-        addsd (%rax,%rdx,8),%xmm7
-        ## move back to mem 
-        movsd %xmm7,(%rax,%rdx,8)
-
-        ## accumulate total lj energy and update it 
-        movapd nb410nf_Vvdwtot(%rsp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now 
-
-        ## add earlier value from mem 
-        movq  nb410nf_Vvdw(%rbp),%rax
-        addsd (%rax,%rdx,8),%xmm7
-        ## move back to mem 
-        movsd %xmm7,(%rax,%rdx,8)
-
-        ## finish if last 
-        movl nb410nf_nn1(%rsp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx
-        jz _nb_kernel410nf_x86_64_sse2.nb410nf_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb410nf_n(%rsp)
-        jmp _nb_kernel410nf_x86_64_sse2.nb410nf_outer
-_nb_kernel410nf_x86_64_sse2.nb410nf_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb410nf_nri(%rsp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx
-        jz _nb_kernel410nf_x86_64_sse2.nb410nf_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel410nf_x86_64_sse2.nb410nf_threadloop
-_nb_kernel410nf_x86_64_sse2.nb410nf_end: 
-        movl nb410nf_nouter(%rsp),%eax
-        movl nb410nf_ninner(%rsp),%ebx
-        movq nb410nf_outeriter(%rbp),%rcx
-        movq nb410nf_inneriter(%rbp),%rdx
-        movl %eax,(%rcx)
-        movl %ebx,(%rdx)
-
-        addq $392,%rsp
-        emms
-
-
-        pop %r15
-        pop %r14
-        pop %r13
-        pop %r12
-
-        pop %rbx
-        pop    %rbp
-        ret
-
-
-
diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel430_x86_64_sse2.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel430_x86_64_sse2.intel_syntax.s
deleted file mode 100644
index 42ca37e0c3..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel430_x86_64_sse2.intel_syntax.s
+++ /dev/null
@@ -1,1664 +0,0 @@
-;#
-;#
-;# Gromacs 4.0                         Copyright (c) 1991-2003 
-;# David van der Spoel, Erik Lindahl
-;#
-;# This program is free software; you can redistribute it and/or
-;# modify it under the terms of the GNU General Public License
-;# as published by the Free Software Foundation; either version 2
-;# of the License, or (at your option) any later version.
-;#
-;# To help us fund GROMACS development, we humbly ask that you cite
-;# the research papers on the package. Check out http://www.gromacs.org
-;# 
-;# And Hey:
-;# Gnomes, ROck Monsters And Chili Sauce
-;#
-
-;# These files require GNU binutils 2.10 or later, since we
-;# use intel syntax for portability, or a recent version 
-;# of NASM that understands Extended 3DNow and SSE2 instructions.
-;# (NASM is normally only used with MS Visual C++).
-;# Since NASM and gnu as disagree on some definitions and use 
-;# completely different preprocessing options I have to introduce a
-;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
-;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
-;# reason why all comments need both symbols...
-;# The source is written for GNU as, with intel syntax. When you use
-;# NASM we redefine a couple of things. The false if-statement around 
-;# the following code is seen by GNU as, but NASM doesn't see it, so 
-;# the code inside is read by NASM but not gcc.
-
-; .if 0    # block below only read by NASM
-%define .section	section
-%define .long		dd
-%define .align		align
-%define .globl		global
-;# NASM only wants 'dword', not 'dword ptr'.
-%define ptr
-%macro .equiv                  2
-   %1 equ %2
-%endmacro
-; .endif                   # End of NASM-specific block
-; .intel_syntax noprefix   # Line only read by gnu as
-
-
-.globl nb_kernel430_x86_64_sse2
-.globl _nb_kernel430_x86_64_sse2
-nb_kernel430_x86_64_sse2:	
-_nb_kernel430_x86_64_sse2:	
-;#	Room for return address and rbp (16 bytes)
-.equiv          nb430_fshift,           16
-.equiv          nb430_gid,              24
-.equiv          nb430_pos,              32
-.equiv          nb430_faction,          40
-.equiv          nb430_charge,           48
-.equiv          nb430_p_facel,          56
-.equiv          nb430_argkrf,           64
-.equiv          nb430_argcrf,           72
-.equiv          nb430_Vc,               80
-.equiv          nb430_type,             88
-.equiv          nb430_p_ntype,          96
-.equiv          nb430_vdwparam,         104
-.equiv          nb430_Vvdw,             112
-.equiv          nb430_p_tabscale,       120
-.equiv          nb430_VFtab,            128
-.equiv          nb430_invsqrta,         136
-.equiv          nb430_dvda,             144
-.equiv          nb430_p_gbtabscale,     152
-.equiv          nb430_GBtab,            160
-.equiv          nb430_p_nthreads,       168
-.equiv          nb430_count,            176
-.equiv          nb430_mtx,              184
-.equiv          nb430_outeriter,        192
-.equiv          nb430_inneriter,        200
-.equiv          nb430_work,             208
-	;# stack offsets for local variables  
-	;# bottom of stack is cache-aligned for sse2 use 
-.equiv          nb430_ix,               0
-.equiv          nb430_iy,               16
-.equiv          nb430_iz,               32
-.equiv          nb430_iq,               48
-.equiv          nb430_dx,               64
-.equiv          nb430_dy,               80
-.equiv          nb430_dz,               96
-.equiv          nb430_eps,              112
-.equiv          nb430_gbtsc,            128
-.equiv          nb430_tsc,              144
-.equiv          nb430_qq,               160
-.equiv          nb430_c6,               176
-.equiv          nb430_c12,              192
-.equiv          nb430_epsgb,            208
-.equiv          nb430_vctot,            224
-.equiv          nb430_Vvdwtot,          240
-.equiv          nb430_fix,              256
-.equiv          nb430_fiy,              272
-.equiv          nb430_fiz,              288
-.equiv          nb430_half,             304
-.equiv          nb430_three,            320
-.equiv          nb430_r,                336
-.equiv          nb430_isai,             352
-.equiv          nb430_isaprod,          368
-.equiv          nb430_dvdasum,          384
-.equiv          nb430_gbscale,          400
-.equiv          nb430_rinv,             416
-.equiv          nb430_nri,              432
-.equiv          nb430_iinr,             440
-.equiv          nb430_jindex,           448
-.equiv          nb430_jjnr,             456
-.equiv          nb430_shift,            464
-.equiv          nb430_shiftvec,         472
-.equiv          nb430_facel,            480
-.equiv          nb430_innerjjnr,        488
-.equiv          nb430_ii,               496
-.equiv          nb430_is3,              500
-.equiv          nb430_ii3,              504
-.equiv          nb430_ntia,             508
-.equiv          nb430_innerk,           512
-.equiv          nb430_n,                516
-.equiv          nb430_nn1,              520
-.equiv          nb430_ntype,            524
-.equiv          nb430_nouter,           528
-.equiv          nb430_ninner,           532
-
-	push rbp
-	mov  rbp, rsp
-	push rbx
-
-	
-	emms
-
-        push r12
-        push r13
-        push r14
-        push r15
-
-	sub rsp, 536		;# local variable stack space (n*16+8)
-
-	;# zero 32-bit iteration counters
-	mov eax, 0
-	mov [rsp + nb430_nouter], eax
-	mov [rsp + nb430_ninner], eax
-
-	mov edi, [rdi]
-	mov [rsp + nb430_nri], edi
-	mov [rsp + nb430_iinr], rsi
-	mov [rsp + nb430_jindex], rdx
-	mov [rsp + nb430_jjnr], rcx
-	mov [rsp + nb430_shift], r8
-	mov [rsp + nb430_shiftvec], r9
-	mov rdi, [rbp + nb430_p_ntype]
-	mov edi, [rdi]
-	mov [rsp + nb430_ntype], edi
-	mov rsi, [rbp + nb430_p_facel]
-	movsd xmm0, [rsi]
-	movsd [rsp + nb430_facel], xmm0
-
-	mov rax, [rbp + nb430_p_tabscale]
-	movsd xmm3, [rax]
-	shufpd xmm3, xmm3, 0
-	movapd [rsp + nb430_tsc], xmm3
-
-	mov rbx, [rbp + nb430_p_gbtabscale]
-	movsd xmm4, [rbx]
-	shufpd xmm4, xmm4, 0
-	movapd [rsp + nb430_gbtsc], xmm4
-
-	;# create constant floating-point factors on stack
-	mov eax, 0x00000000     ;# lower half of double half IEEE (hex)
-	mov ebx, 0x3fe00000
-	mov [rsp + nb430_half], eax
-	mov [rsp + nb430_half+4], ebx
-	movsd xmm1, [rsp + nb430_half]
-	shufpd xmm1, xmm1, 0    ;# splat to all elements
-	movapd xmm3, xmm1
-	addpd  xmm3, xmm3       ;# one
-	movapd xmm2, xmm3
-	addpd  xmm2, xmm2       ;# two
-	addpd  xmm3, xmm2	;# three
-	movapd [rsp + nb430_half], xmm1
-	movapd [rsp + nb430_three], xmm3
-
-.nb430_threadloop:
-        mov   rsi, [rbp + nb430_count]          ;# pointer to sync counter
-        mov   eax, [rsi]
-.nb430_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+10
-        lock
-        cmpxchg [esi], ebx                      ;# write nn1 to *counter,
-                                                ;# if it hasnt changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb430_spinlock
-
-        ;# if(nn1>nri) nn1=nri
-        mov ecx, [rsp + nb430_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [rsp + nb430_n], eax
-        mov [rsp + nb430_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists
-	mov esi, eax				;# copy n to esi
-        jg  .nb430_outerstart
-        jmp .nb430_end
-
-.nb430_outerstart:
-	;# ebx contains number of outer iterations
-	add ebx, [rsp + nb430_nouter]
-	mov [rsp + nb430_nouter], ebx
-
-.nb430_outer:
-	mov   rax, [rsp + nb430_shift]      ;# rax = pointer into shift[] 
-	mov   ebx, [rax+rsi*4]		;# rbx=shift[n] 
-	
-	lea   rbx, [rbx + rbx*2]    ;# rbx=3*is 
-	mov   [rsp + nb430_is3],ebx    	;# store is3 
-
-	mov   rax, [rsp + nb430_shiftvec]   ;# rax = base of shiftvec[] 
-
-	movsd xmm0, [rax + rbx*8]
-	movsd xmm1, [rax + rbx*8 + 8]
-	movsd xmm2, [rax + rbx*8 + 16] 
-
-	mov   rcx, [rsp + nb430_iinr]       ;# rcx = pointer into iinr[]
-	mov   ebx, [rcx+rsi*4]	    ;# ebx =ii 
-	mov   [rsp + nb430_ii], ebx
-
-	mov   rdx, [rbp + nb430_charge]
-	movsd xmm3, [rdx + rbx*8]	
-	mulsd xmm3, [rsp + nb430_facel]
-	shufpd xmm3, xmm3, 0
-
-	mov   rdx, [rbp + nb430_invsqrta]	;# load invsqrta[ii]
-	movsd xmm4, [rdx + rbx*8]
-	shufpd xmm4, xmm4, 0
-
-    	mov   rdx, [rbp + nb430_type] 
-    	mov   edx, [rdx + rbx*4]
-    	imul  edx, [rsp + nb430_ntype]
-    	shl   edx, 1
-    	mov   [rsp + nb430_ntia], edx
-	
-	lea   rbx, [rbx + rbx*2]	;# rbx = 3*ii=ii3 
-	mov   rax, [rbp + nb430_pos]    ;# rax = base of pos[]  
-
-	addsd xmm0, [rax + rbx*8]
-	addsd xmm1, [rax + rbx*8 + 8]
-	addsd xmm2, [rax + rbx*8 + 16]
-
-	movapd [rsp + nb430_iq], xmm3
-	movapd [rsp + nb430_isai], xmm4
-	
-	shufpd xmm0, xmm0, 0
-	shufpd xmm1, xmm1, 0
-	shufpd xmm2, xmm2, 0
-
-	movapd [rsp + nb430_ix], xmm0
-	movapd [rsp + nb430_iy], xmm1
-	movapd [rsp + nb430_iz], xmm2
-
-	mov   [rsp + nb430_ii3], ebx
-	
-	;# clear vctot and i forces 
-	xorpd xmm4, xmm4
-	movapd [rsp + nb430_vctot], xmm4
-	movapd [rsp + nb430_Vvdwtot], xmm4
-	movapd [rsp + nb430_dvdasum], xmm4
-	movapd [rsp + nb430_fix], xmm4
-	movapd [rsp + nb430_fiy], xmm4
-	movapd [rsp + nb430_fiz], xmm4
-	
-	mov   rax, [rsp + nb430_jindex]
-	mov   ecx, [rax + rsi*4]	     ;# jindex[n] 
-	mov   edx, [rax + rsi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   rsi, [rbp + nb430_pos]
-	mov   rdi, [rbp + nb430_faction]	
-	mov   rax, [rsp + nb430_jjnr]
-	shl   ecx, 2
-	add   rax, rcx
-	mov   [rsp + nb430_innerjjnr], rax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  2
-	add   ecx, [rsp + nb430_ninner]
-	mov   [rsp + nb430_ninner], ecx
-	add   edx, 0
-	mov   [rsp + nb430_innerk], edx    ;# number of innerloop atoms 
-	jge   .nb430_unroll_loop
-	jmp   .nb430_checksingle
-.nb430_unroll_loop:	
-	;# twice unrolled innerloop here 
-	mov   rdx, [rsp + nb430_innerjjnr]   ;# pointer to jjnr[k] 
-	mov   eax, [rdx]
-	mov   ebx, [rdx + 4]
-	add qword ptr [rsp + nb430_innerjjnr], 8	;# advance pointer (unrolled 2) 
-
-		
-	mov rsi, [rbp + nb430_pos]		;# base of pos[] 
-
-	lea   r10, [rax + rax*2]     ;# j3 
-	lea   r11, [rbx + rbx*2]	
-
-	;# move two coordinates to xmm4-xmm6 
-	movlpd xmm4, [rsi + r10*8]
-	movlpd xmm5, [rsi + r10*8 + 8]
-	movlpd xmm6, [rsi + r10*8 + 16]
-	movhpd xmm4, [rsi + r11*8]
-	movhpd xmm5, [rsi + r11*8 + 8]
-	movhpd xmm6, [rsi + r11*8 + 16]		
-	
-	;# calc dr 
-	subpd xmm4, [rsp + nb430_ix]
-	subpd xmm5, [rsp + nb430_iy]
-	subpd xmm6, [rsp + nb430_iz]
-
-	;# store dr 
-	movapd [rsp + nb430_dx], xmm4
-	movapd [rsp + nb430_dy], xmm5
-	movapd [rsp + nb430_dz], xmm6
-    
-	;# square it 
-	mulpd xmm4,xmm4
-	mulpd xmm5,xmm5
-	mulpd xmm6,xmm6
-	addpd xmm4, xmm5
-	addpd xmm4, xmm6
-	;# rsq in xmm4 
-
-	;# load isaj
-	mov rsi, [rbp + nb430_invsqrta]
-	movlpd xmm3, [rsi + rax*8]
-	movhpd xmm3, [rsi + rbx*8]
-	mulpd  xmm3, [rsp + nb430_isai]
-	movapd [rsp + nb430_isaprod], xmm3
-	movapd xmm6, xmm3
-	mulpd xmm3, [rsp + nb430_gbtsc]
-	movapd [rsp + nb430_gbscale], xmm3
-	
-    ;#invsqrt
-	cvtpd2ps xmm5, xmm4	
-	rsqrtps xmm5, xmm5
-	cvtps2pd xmm2, xmm5	;# lu in low xmm2 
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulpd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [rsp + nb430_three]
-	mulpd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb430_half]
-	subpd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulpd xmm1, xmm5	
-	mulpd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-    mulpd  xmm6, [rsp + nb430_iq]
-	mov rsi, [rbp + nb430_charge]    ;# base of charge[] 
-	movlpd xmm3, [rsi + rax*8]
-	movhpd xmm3, [rsi + rbx*8]
-	mulpd  xmm3, xmm6
-	movapd [rsp + nb430_qq], xmm3	
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulpd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [rsp + nb430_three]
-	mulpd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb430_half]
-	subpd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulpd xmm2, xmm5	
-	mulpd xmm0, xmm2	;# xmm0=iter2 of rinv 
-	mulpd xmm4, xmm0	;# xmm4=r 
-	movapd [rsp + nb430_r], xmm4
-	movapd [rsp + nb430_rinv], xmm0
-
-	mov rsi, [rbp + nb430_type]
-	mov r8d, [rsi + rax*4]
-	mov r9d, [rsi + rbx*4]
-	shl r8d, 1
-	shl r9d, 1
-	mov edi, [rsp + nb430_ntia]
-	add r8d, edi
-	add r9d, edi
-
-    movapd xmm8, xmm4 ;# r
-	mulpd xmm4, [rsp + nb430_gbscale]
-	mulpd xmm8, [rsp + nb430_tsc]
-    
-    ;# truncate and convert to integers
-    cvttpd2pi mm0, xmm4  ;# gb
-    cvttpd2pi mm1, xmm8  ;# lj
-    
-    ;# convert back to float
-    cvtpi2pd  xmm6, mm0   ;# gb
-    cvtpi2pd  xmm10, mm1  ;# lj
-    
-    ;# multiply by 4 and 8, respectively
-    pslld   mm0, 2   ;# gb
-    pslld   mm1, 3   ;# lj
-
-    ;# move to integer registers
-    movd    r12d, mm0       ;# gb
-    movd    r14d, mm1      ;# lj
-	psrlq mm0, 32
-	psrlq mm1, 32
-    movd    r13d, mm0      ;# gb
-    movd    r15d, mm1     ;# lj
-    ;# GB indices: r10-11   LJ indices: r12-r13
-
-    ;# calculate eps
-    subpd     xmm4, xmm6   ;# gb
-    subpd     xmm8, xmm10  ;# lj
-    movapd    [rsp + nb430_epsgb], xmm4 ;# gb eps
-    movapd    [rsp + nb430_eps], xmm8 ;# lj eps
-    
-	mov  rsi, [rbp + nb430_GBtab]
-	mov  rdi, [rbp + nb430_VFtab]
-
-    ;# load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11
-    movapd xmm0,  [rsi + r12*8]        ;# Y1c F1c
-    movapd xmm12, [rsi + r13*8]        ;# Y2c F2c
-    movapd xmm4,  [rdi + r14*8]        ;# Y1d F1d
-    movapd xmm13, [rdi + r15*8]        ;# Y2d F2d
-    movapd xmm8,  [rdi + r14*8 + 32]   ;# Y1r F1r
-    movapd xmm14, [rdi + r15*8 + 32]   ;# Y2r F2r
-	movapd xmm1, xmm0
-	movapd xmm5, xmm4
-	movapd xmm9, xmm8
-	unpcklpd xmm0, xmm12	;# Y1c Y2c 
-	unpckhpd xmm1, xmm12	;# F1c F2c 
-	unpcklpd xmm4, xmm13	;# Y1d Y2d 
-	unpckhpd xmm5, xmm13	;# F1d F2d 
-	unpcklpd xmm8, xmm14	;# Y1r Y2r 
-	unpckhpd xmm9, xmm14	;# F1r F2r 
-    
-    movapd xmm2,  [rsi + r12*8 + 16]   ;# G1c H1c
-    movapd xmm12, [rsi + r13*8 + 16]   ;# G2c H2c
-    movapd xmm6,  [rdi + r14*8 + 16]   ;# G1d H1d
-    movapd xmm13, [rdi + r15*8 + 16]   ;# G2d H2d
-    movapd xmm10, [rdi + r14*8 + 48]   ;# G1r H1r
-    movapd xmm14, [rdi + r15*8 + 48]   ;# G2r H2r
-	movapd xmm3, xmm2
-	movapd xmm7, xmm6
-	movapd xmm11, xmm10
-	unpcklpd xmm2, xmm12	;# G1c G2c 
-	unpckhpd xmm3, xmm12	;# H1c H2c 
-	unpcklpd xmm6, xmm13	;# G1d G2d 
-	unpckhpd xmm7, xmm13	;# H1d H2d 
-	unpcklpd xmm10, xmm14	;# G1r G2r 
-	unpckhpd xmm11, xmm14	;# H1r H2r 
-    ;# table data ready. Coul GB in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11
-    mov rdi, [rbp + nb430_vdwparam]
-    
-    movapd xmm12, [rsp + nb430_epsgb]
-    movapd xmm13, [rsp + nb430_eps]
-    
-    mulpd  xmm3, xmm12   ;# Heps
-    mulpd  xmm7, xmm13
-    mulpd  xmm11, xmm13
-    mulpd  xmm2, xmm12     ;# Geps
-    mulpd  xmm6, xmm13
-    mulpd  xmm10, xmm13
-    mulpd  xmm3, xmm12   ;# Heps2
-    mulpd  xmm7, xmm13
-    mulpd  xmm11, xmm13
-
-    movlpd xmm14, [rdi + r8*8]
-    movlpd xmm15, [rdi + r8*8 + 8]
-    
-    addpd  xmm1, xmm2   ;# F+Geps
-    addpd  xmm5, xmm6
-    addpd  xmm9, xmm10 
-    addpd  xmm1, xmm3   ;# F+Geps+Heps2 = Fp
-    addpd  xmm5, xmm7
-    addpd  xmm9, xmm11 
-    addpd  xmm3, xmm3    ;# 2*Heps2
-    addpd  xmm7, xmm7
-    addpd  xmm11, xmm11
-    movhpd xmm14, [rdi + r9*8]
-    movhpd xmm15, [rdi + r9*8 + 8]
-    
-    addpd  xmm3, xmm2    ;# 2*Heps2+Geps
-    addpd  xmm7, xmm6  
-    addpd  xmm11, xmm10
-    addpd  xmm3, xmm1   ;# FF = Fp + 2*Heps2 + Geps
-    addpd  xmm7, xmm5
-    addpd  xmm11, xmm9
-    mulpd  xmm1, xmm12   ;# eps*Fp
-    mulpd  xmm5, xmm13
-    mulpd  xmm9, xmm13
-    addpd  xmm1, xmm0     ;# VV
-    addpd  xmm5, xmm4
-    addpd  xmm9, xmm8
-    mulpd  xmm1, [rsp + nb430_qq]   ;# VV*qq = vcoul
-    mulpd  xmm5, xmm14   ;# vnb6
-    mulpd  xmm9, xmm15   ;# vnb12
-    mulpd  xmm3, [rsp + nb430_qq]    ;# FF*qq = fij
-    mulpd  xmm7, xmm14   ;# fijD
-    mulpd  xmm11, xmm15   ;#fijR
-
-    addpd  xmm11, xmm7 ;# fijD+fijR
-    mulpd  xmm11, [rsp + nb430_tsc] ;# (fijD+fijR)*tabscale
-    
-    ;# accumulate Vvdwtot
-    addpd  xmm5, [rsp + nb430_Vvdwtot]
-    addpd  xmm5, xmm9
-    movapd [rsp + nb430_Vvdwtot], xmm5
-
-	mov rsi, [rbp + nb430_dvda]
-	
-	;# Calculate dVda
-	mulpd xmm3, [rsp + nb430_gbscale]   ;# fijC=qq*FF*gbscale
-	movapd xmm6, xmm3 
-	mulpd  xmm6, [rsp + nb430_r]
-	addpd  xmm6, xmm1   ;# vcoul+fijC*r
-
-    addpd  xmm3, xmm11  ;# fijC+fijD+fijR
-    
-    ;# increment vctot
-	addpd  xmm1, [rsp + nb430_vctot]
-    movapd [rsp + nb430_vctot], xmm1
-
-	;# xmm6=(vcoul+fijC*r)
-	xorpd  xmm7, xmm7
-	subpd  xmm7, xmm6
-	movapd xmm6, xmm7
-	
-    ;# the fj's - start by combiningg forces from memory 
-    mov rdi, [rbp + nb430_faction]
-	movlpd xmm0, [rdi + r10*8]
-	movlpd xmm1, [rdi + r10*8 + 8]
-	movlpd xmm2, [rdi + r10*8 + 16]
-	movhpd xmm0, [rdi + r11*8]
-	movhpd xmm1, [rdi + r11*8 + 8]
-	movhpd xmm2, [rdi + r11*8 + 16]
-
-	;# update dvdasum 
-	addpd  xmm7, [rsp + nb430_dvdasum]
-    movapd [rsp + nb430_dvdasum], xmm7
-
-	;# update j atoms dvdaj
-	movhlps xmm7, xmm6
-	addsd  xmm6, [rsi + rax*8]
-	addsd  xmm7, [rsi + rbx*8]
-	movsd  [rsi + rax*8], xmm6
-	movsd  [rsi + rbx*8], xmm7
-
-	xorpd  xmm4, xmm4	
-	mulpd xmm3, [rsp + nb430_rinv]
-	subpd  xmm4, xmm3
-
-    movapd  xmm9, xmm4
-    movapd  xmm10, xmm4
-    movapd  xmm11, xmm4
-    
-    mulpd  xmm9, [rsp + nb430_dx]
-    mulpd  xmm10, [rsp + nb430_dy]
-    mulpd  xmm11, [rsp + nb430_dz]    
-
-	addpd xmm0, xmm9
-	addpd xmm1, xmm10
-	addpd xmm2, xmm11
-
-	;# accumulate i forces
-    addpd xmm9, [rsp + nb430_fix]
-    addpd xmm10, [rsp + nb430_fiy]
-    addpd xmm11, [rsp + nb430_fiz]
-
-	movlpd [rdi + r10*8], xmm0
-	movlpd [rdi + r10*8 + 8], xmm1
-	movlpd [rdi + r10*8 + 16], xmm2
-
-    movapd [rsp + nb430_fix], xmm9
-    movapd [rsp + nb430_fiy], xmm10
-    movapd [rsp + nb430_fiz], xmm11
-
-	movhpd [rdi + r11*8], xmm0
-	movhpd [rdi + r11*8 + 8], xmm1
-	movhpd [rdi + r11*8 + 16], xmm2
-	
-    ;# should we do one more iteration? 
-	sub dword ptr [rsp + nb430_innerk],  2
-	jl    .nb430_checksingle
-	jmp   .nb430_unroll_loop
-.nb430_checksingle:
-	mov   edx, [rsp + nb430_innerk]
-	and   edx, 1
-	jnz    .nb430_dosingle
-	jmp    .nb430_updateouterdata
-.nb430_dosingle:
-	mov rsi, [rbp + nb430_charge]
-	mov rdx, [rbp + nb430_invsqrta]
-	mov rdi, [rbp + nb430_pos]
-	mov   rcx, [rsp + nb430_innerjjnr]
-	mov   eax, [rcx]	
-
-	;# load isaj
-	mov rsi, [rbp + nb430_invsqrta]
-	movsd xmm2, [rsi + rax*8]
-	mulsd  xmm2, [rsp + nb430_isai]
-	movapd [rsp + nb430_isaprod], xmm2	
-	movapd xmm1, xmm2
-	mulsd xmm1, [rsp + nb430_gbtsc]
-	movapd [rsp + nb430_gbscale], xmm1
-
-    mulsd xmm2, [rsp + nb430_iq]
-	mov rsi, [rbp + nb430_charge]    ;# base of charge[] 
-	movsd xmm3, [rsi + rax*8]
-	mulsd  xmm3, xmm2
-	movapd [rsp + nb430_qq], xmm3	
-	
-	mov rsi, [rbp + nb430_type]
-	mov r8d, [rsi + rax*4]
-	mov rsi, [rbp + nb430_vdwparam]
-	shl r8d, 1
-	mov edi, [rsp + nb430_ntia]
-	add r8d, edi
-
-	movsd xmm4, [rsi + r8*8]	
-	movsd xmm6, [rsi + r8*8 + 8]
-	movapd [rsp + nb430_c6], xmm4
-	movapd [rsp + nb430_c12], xmm6
-		
-	mov rsi, [rbp + nb430_pos]		;# base of pos[] 
-
-	lea   r10, [rax + rax*2]     ;# j3 
-
-	;# move coordinate to xmm4-xmm6 
-	movsd xmm4, [rsi + r10*8]
-	movsd xmm5, [rsi + r10*8 + 8]
-	movsd xmm6, [rsi + r10*8 + 16]
-
-	mov    rdi, [rbp + nb430_faction]
-	
-	;# calc dr 
-	subsd xmm4, [rsp + nb430_ix]
-	subsd xmm5, [rsp + nb430_iy]
-	subsd xmm6, [rsp + nb430_iz]
-
-	;# store dr 
-	movapd [rsp + nb430_dx], xmm4
-	movapd [rsp + nb430_dy], xmm5
-	movapd [rsp + nb430_dz], xmm6
-    
-	;# square it 
-	mulsd xmm4,xmm4
-	mulsd xmm5,xmm5
-	mulsd xmm6,xmm6
-	addsd xmm4, xmm5
-	addsd xmm4, xmm6
-	;# rsq in xmm4 
-
-	cvtsd2ss xmm5, xmm4	
-	rsqrtss xmm5, xmm5
-	cvtss2sd xmm2, xmm5	;# lu in low xmm2 
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulsd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [rsp + nb430_three]
-	mulsd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb430_half]
-	subsd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulsd xmm1, xmm5	
-	mulsd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulsd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [rsp + nb430_three]
-	mulsd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb430_half]
-	subsd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulsd xmm2, xmm5	
-	mulsd xmm0, xmm2	;# xmm0=iter2 of rinv 
-	mulsd xmm4, xmm0	;# xmm4=r 
-	movapd [rsp + nb430_r], xmm4
-	movapd [rsp + nb430_rinv], xmm0
-
-    movapd xmm8, xmm4 ;# r
-	mulsd xmm4, [rsp + nb430_gbscale]
-	mulsd xmm8, [rsp + nb430_tsc]
-    
-    ;# truncate and convert to integers
-    cvttsd2si r12d, xmm4  ;# gb
-    cvttsd2si r14d, xmm8  ;# lj
-    
-    ;# convert back to float
-    cvtsi2sd  xmm6, r12d   ;# gb
-    cvtsi2sd  xmm10, r14d  ;# lj
-    
-    ;# multiply by 4 and 8, respectively
-    shl    r12d, 2   ;# gb
-    shl    r14d, 3   ;# lj
-
-    ;# GB indices: r10   LJ indices: r12
-
-    ;# calculate eps
-    subsd     xmm4, xmm6   ;# gb
-    subsd     xmm8, xmm10  ;# lj
-    movapd    [rsp + nb430_epsgb], xmm4 ;# gb eps
-    movapd    [rsp + nb430_eps], xmm8 ;# lj eps
-    
-	mov  rsi, [rbp + nb430_GBtab]
-	mov  rdi, [rbp + nb430_VFtab]
-
-    ;# load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11
-    movapd xmm0,  [rsi + r12*8]        ;# Y1c F1c
-    movapd xmm4,  [rdi + r14*8]        ;# Y1d F1d
-    movapd xmm8,  [rdi + r14*8 + 32]   ;# Y1r F1r
-	movhlps xmm1, xmm0
-	movhlps xmm5, xmm4
-	movhlps xmm9, xmm8
-    
-    movapd xmm2,  [rsi + r12*8 + 16]   ;# G1c H1c
-    movapd xmm6,  [rdi + r14*8 + 16]   ;# G1d H1d
-    movapd xmm10, [rdi + r14*8 + 48]   ;# G1r H1r
-	movhlps xmm3, xmm2
-	movhlps xmm7, xmm6
-	movhlps xmm11, xmm10
-    ;# table data ready. Coul GB in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11
-    
-    movapd xmm12, [rsp + nb430_epsgb]
-    movapd xmm13, [rsp + nb430_eps]
-    
-    mulsd  xmm3, xmm12   ;# Heps
-    mulsd  xmm7, xmm13
-    mulsd  xmm11, xmm13
-    mulsd  xmm2, xmm12     ;# Geps
-    mulsd  xmm6, xmm13
-    mulsd  xmm10, xmm13
-    mulsd  xmm3, xmm12   ;# Heps2
-    mulsd  xmm7, xmm13
-    mulsd  xmm11, xmm13
-
-    addsd  xmm1, xmm2   ;# F+Geps
-    addsd  xmm5, xmm6
-    addsd  xmm9, xmm10 
-    addsd  xmm1, xmm3   ;# F+Geps+Heps2 = Fp
-    addsd  xmm5, xmm7
-    addsd  xmm9, xmm11 
-    addsd  xmm3, xmm3    ;# 2*Heps2
-    addsd  xmm7, xmm7
-    addsd  xmm11, xmm11
-    addsd  xmm3, xmm2    ;# 2*Heps2+Geps
-    addsd  xmm7, xmm6  
-    addsd  xmm11, xmm10
-    addsd  xmm3, xmm1   ;# FF = Fp + 2*Heps2 + Geps
-    addsd  xmm7, xmm5
-    addsd  xmm11, xmm9
-    mulsd  xmm1, xmm12   ;# eps*Fp
-    mulsd  xmm5, xmm13
-    mulsd  xmm9, xmm13
-    addsd  xmm1, xmm0     ;# VV
-    addsd  xmm5, xmm4
-    addsd  xmm9, xmm8
-    mulsd  xmm1, [rsp + nb430_qq]   ;# VV*qq = vcoul
-    mulsd  xmm5, [rsp + nb430_c6]   ;# vnb6
-    mulsd  xmm9, [rsp + nb430_c12]   ;# vnb12
-    mulsd  xmm3, [rsp + nb430_qq]    ;# FF*qq = fij
-    mulsd  xmm7, [rsp + nb430_c6]   ;# fijD
-    mulsd  xmm11, [rsp + nb430_c12]   ;#fijR
-
-    addsd  xmm11, xmm7 ;# fijD+fijR
-    mulsd  xmm11, [rsp + nb430_tsc] ;# (fijD+fijR)*tabscale
-    
-    ;# accumulate Vvdwtot
-    addsd  xmm5, [rsp + nb430_Vvdwtot]
-    addsd  xmm5, xmm9
-    movsd [rsp + nb430_Vvdwtot], xmm5
-
-	mov rsi, [rbp + nb430_dvda]
-	
-	;# Calculate dVda
-	mulsd xmm3, [rsp + nb430_gbscale]   ;# fijC=qq*FF*gbscale
-	movapd xmm6, xmm3 
-	mulsd  xmm6, [rsp + nb430_r]
-	addsd  xmm6, xmm1   ;# vcoul+fijC*r
-
-    addsd  xmm3, xmm11  ;# fijC+fijD+fijR
-    
-    ;# increment vctot
-	addsd  xmm1, [rsp + nb430_vctot]
-    movsd [rsp + nb430_vctot], xmm1
-
-	;# xmm6=(vcoul+fijC*r)
-	xorpd  xmm7, xmm7
-	subsd  xmm7, xmm6
-	movapd xmm6, xmm7
-	
-	;# update dvdasum 
-	addsd  xmm7, [rsp + nb430_dvdasum]
-    movsd [rsp + nb430_dvdasum], xmm7
-
-	;# update j atoms dvdaj
-	addsd  xmm6, [rsi + rax*8]
-	movsd  [rsi + rax*8], xmm6
-
-	xorpd  xmm4, xmm4	
-	mulsd xmm3, [rsp + nb430_rinv]
-	subsd  xmm4, xmm3
-
-    movapd  xmm9, xmm4
-    movapd  xmm10, xmm4
-    movapd  xmm11, xmm4
-    
-    mulsd  xmm9, [rsp + nb430_dx]
-    mulsd  xmm10, [rsp + nb430_dy]
-    mulsd  xmm11, [rsp + nb430_dz]
-    
-    movapd xmm3, xmm9
-    movapd xmm4, xmm10
-    movapd xmm5, xmm11
-    
-	;# accumulate i forces
-    addsd xmm9, [rsp + nb430_fix]
-    addsd xmm10, [rsp + nb430_fiy]
-    addsd xmm11, [rsp + nb430_fiz]
-    movsd [rsp + nb430_fix], xmm9
-    movsd [rsp + nb430_fiy], xmm10
-    movsd [rsp + nb430_fiz], xmm11
-
-    mov rdi, [rbp + nb430_faction]
-	;# the fj's - start by accumulating forces from memory 
-	addsd xmm3,   [rdi + r10*8]
-	addsd xmm4,  [rdi + r10*8 + 8]
-	addsd xmm5,  [rdi + r10*8 + 16]
-	movsd [rdi + r10*8], xmm3
-	movsd [rdi + r10*8 + 8], xmm4
-	movsd [rdi + r10*8 + 16], xmm5
-	
-.nb430_updateouterdata:
-	mov   ecx, [rsp + nb430_ii3]
-	mov   rdi, [rbp + nb430_faction]
-	mov   rsi, [rbp + nb430_fshift]
-	mov   edx, [rsp + nb430_is3]
-
-	;# accumulate i forces in xmm0, xmm1, xmm2 
-	movapd xmm0, [rsp + nb430_fix]
-	movapd xmm1, [rsp + nb430_fiy]
-	movapd xmm2, [rsp + nb430_fiz]
-
-	movhlps xmm3, xmm0
-	movhlps xmm4, xmm1
-	movhlps xmm5, xmm2
-	addsd  xmm0, xmm3
-	addsd  xmm1, xmm4
-	addsd  xmm2, xmm5 ;# sum is in low xmm0-xmm2 
-
-	;# increment i force 
-	movsd  xmm3, [rdi + rcx*8]
-	movsd  xmm4, [rdi + rcx*8 + 8]
-	movsd  xmm5, [rdi + rcx*8 + 16]
-	subsd  xmm3, xmm0
-	subsd  xmm4, xmm1
-	subsd  xmm5, xmm2
-	movsd  [rdi + rcx*8],     xmm3
-	movsd  [rdi + rcx*8 + 8], xmm4
-	movsd  [rdi + rcx*8 + 16], xmm5
-
-	;# increment fshift force  
-	movsd  xmm3, [rsi + rdx*8]
-	movsd  xmm4, [rsi + rdx*8 + 8]
-	movsd  xmm5, [rsi + rdx*8 + 16]
-	subsd  xmm3, xmm0
-	subsd  xmm4, xmm1
-	subsd  xmm5, xmm2
-	movsd  [rsi + rdx*8],     xmm3
-	movsd  [rsi + rdx*8 + 8], xmm4
-	movsd  [rsi + rdx*8 + 16], xmm5
-
-	;# get n from stack
-	mov esi, [rsp + nb430_n]
-        ;# get group index for i particle 
-        mov   rdx, [rbp + nb430_gid]      	;# base of gid[]
-        mov   edx, [rdx + rsi*4]		;# ggid=gid[n]
-
-	;# accumulate total potential energy and update it 
-	movapd xmm7, [rsp + nb430_vctot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addsd  xmm7, xmm6	;# low xmm7 has the sum now 
-
-	;# add earlier value from mem 
-	mov   rax, [rbp + nb430_Vc]
-	addsd xmm7, [rax + rdx*8] 
-	;# move back to mem 
-	movsd [rax + rdx*8], xmm7 
-	
-	;# accumulate total lj energy and update it 
-	movapd xmm7, [rsp + nb430_Vvdwtot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addsd  xmm7, xmm6	;# low xmm7 has the sum now 
-	
-	;# add earlier value from mem 
-	mov   rax, [rbp + nb430_Vvdw]
-	addsd xmm7, [rax + rdx*8] 
-	;# move back to mem 
-	movsd [rax + rdx*8], xmm7 
-	
-	;# accumulate dVda and update it 
-	movapd xmm7, [rsp + nb430_dvdasum]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addsd  xmm7, xmm6	;# low xmm7 has the sum now 
-	
-	mov edx, [rsp + nb430_ii]
-	mov rax, [rbp + nb430_dvda]
-	addsd xmm7, [rax + rdx*8]
-	movsd [rax + rdx*8], xmm7
-	
-        ;# finish if last 
-        mov ecx, [rsp + nb430_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb430_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [rsp + nb430_n], esi
-        jmp .nb430_outer
-.nb430_outerend:
-        ;# check if more outer neighborlists remain
-        mov   ecx, [rsp + nb430_nri]
-	;# esi already loaded with n above
-        sub   ecx, esi
-        jz .nb430_end
-        ;# non-zero, do one more workunit
-        jmp   .nb430_threadloop
-.nb430_end:
-	mov eax, [rsp + nb430_nouter]
-	mov ebx, [rsp + nb430_ninner]
-	mov rcx, [rbp + nb430_outeriter]
-	mov rdx, [rbp + nb430_inneriter]
-	mov [rcx], eax
-	mov [rdx], ebx
-
-	add rsp, 536
-	emms
-
-
-        pop r15
-        pop r14
-        pop r13
-        pop r12
-
-	pop rbx
-	pop	rbp
-	ret
-
-
-
-
-
-	
-.globl nb_kernel430nf_x86_64_sse2
-.globl _nb_kernel430nf_x86_64_sse2
-nb_kernel430nf_x86_64_sse2:	
-_nb_kernel430nf_x86_64_sse2:	
-;#	Room for return address and rbp (16 bytes)
-.equiv          nb430nf_fshift,         16
-.equiv          nb430nf_gid,            24
-.equiv          nb430nf_pos,            32
-.equiv          nb430nf_faction,        40
-.equiv          nb430nf_charge,         48
-.equiv          nb430nf_p_facel,        56
-.equiv          nb430nf_argkrf,         64
-.equiv          nb430nf_argcrf,         72
-.equiv          nb430nf_Vc,             80
-.equiv          nb430nf_type,           88
-.equiv          nb430nf_p_ntype,        96
-.equiv          nb430nf_vdwparam,       104
-.equiv          nb430nf_Vvdw,           112
-.equiv          nb430nf_p_tabscale,     120
-.equiv          nb430nf_VFtab,          128
-.equiv          nb430nf_invsqrta,       136
-.equiv          nb430nf_dvda,           144
-.equiv          nb430nf_p_gbtabscale,   152
-.equiv          nb430nf_GBtab,          160
-.equiv          nb430nf_p_nthreads,     168
-.equiv          nb430nf_count,          176
-.equiv          nb430nf_mtx,            184
-.equiv          nb430nf_outeriter,      192
-.equiv          nb430nf_inneriter,      200
-.equiv          nb430nf_work,           208
-	;# stack offsets for local variables  
-	;# bottom of stack is cache-aligned for sse2 use 
-.equiv          nb430nf_ix,             0
-.equiv          nb430nf_iy,             16
-.equiv          nb430nf_iz,             32
-.equiv          nb430nf_iq,             48
-.equiv          nb430nf_gbtsc,          64
-.equiv          nb430nf_tsc,            80
-.equiv          nb430nf_qq,             96
-.equiv          nb430nf_c6,             112
-.equiv          nb430nf_c12,            128
-.equiv          nb430nf_vctot,          144
-.equiv          nb430nf_Vvdwtot,        160
-.equiv          nb430nf_half,           176
-.equiv          nb430nf_three,          192
-.equiv          nb430nf_r,              208
-.equiv          nb430nf_isai,           224
-.equiv          nb430nf_isaprod,        240
-.equiv          nb430nf_gbscale,        256
-.equiv          nb430nf_nri,            272
-.equiv          nb430nf_iinr,           280
-.equiv          nb430nf_jindex,         288
-.equiv          nb430nf_jjnr,           296
-.equiv          nb430nf_shift,          304
-.equiv          nb430nf_shiftvec,       312
-.equiv          nb430nf_facel,          320
-.equiv          nb430nf_innerjjnr,      328
-.equiv          nb430nf_is3,            336
-.equiv          nb430nf_ii3,            340
-.equiv          nb430nf_ntia,           344
-.equiv          nb430nf_innerk,         348
-.equiv          nb430nf_n,              352
-.equiv          nb430nf_nn1,            356
-.equiv          nb430nf_ntype,          360
-.equiv          nb430nf_nouter,         364
-.equiv          nb430nf_ninner,         368
-	push rbp
-	mov  rbp, rsp
-	push rbx
-	
-	emms
-
-        push r12
-        push r13
-        push r14
-        push r15
-
-	sub rsp, 392		;# local variable stack space (n*16+8)
-
-	;# zero 32-bit iteration counters
-	mov eax, 0
-	mov [rsp + nb430nf_nouter], eax
-	mov [rsp + nb430nf_ninner], eax
-
-	mov edi, [rdi]
-	mov [rsp + nb430nf_nri], edi
-	mov [rsp + nb430nf_iinr], rsi
-	mov [rsp + nb430nf_jindex], rdx
-	mov [rsp + nb430nf_jjnr], rcx
-	mov [rsp + nb430nf_shift], r8
-	mov [rsp + nb430nf_shiftvec], r9
-	mov rdi, [rbp + nb430nf_p_ntype]
-	mov edi, [rdi]
-	mov [rsp + nb430nf_ntype], edi
-	mov rsi, [rbp + nb430nf_p_facel]
-	movsd xmm0, [rsi]
-	movsd [rsp + nb430nf_facel], xmm0
-
-	mov rax, [rbp + nb430nf_p_tabscale]
-	movsd xmm3, [rax]
-	shufpd xmm3, xmm3, 0
-	movapd [rsp + nb430nf_tsc], xmm3
-
-	mov rbx, [rbp + nb430nf_p_gbtabscale]
-	movsd xmm4, [rbx]
-	shufpd xmm4, xmm4, 0
-	movapd [rsp + nb430nf_gbtsc], xmm4
-
-	;# create constant floating-point factors on stack
-	mov eax, 0x00000000     ;# lower half of double half IEEE (hex)
-	mov ebx, 0x3fe00000
-	mov [rsp + nb430nf_half], eax
-	mov [rsp + nb430nf_half+4], ebx
-	movsd xmm1, [rsp + nb430nf_half]
-	shufpd xmm1, xmm1, 0    ;# splat to all elements
-	movapd xmm3, xmm1
-	addpd  xmm3, xmm3       ;# one
-	movapd xmm2, xmm3
-	addpd  xmm2, xmm2       ;# two
-	addpd  xmm3, xmm2	;# three
-	movapd [rsp + nb430nf_half], xmm1
-	movapd [rsp + nb430nf_three], xmm3
-
-.nb430nf_threadloop:
-        mov   rsi, [rbp + nb430nf_count]          ;# pointer to sync counter
-        mov   eax, [rsi]
-.nb430nf_spinlock:
-        mov   ebx, eax                          ;# ebx=*count=nn0
-        add   ebx, 1                           ;# ebx=nn1=nn0+10
-        lock
-        cmpxchg [esi], ebx                      ;# write nn1 to *counter,
-                                                ;# if it hasnt changed.
-                                                ;# or reread *counter to eax.
-        pause                                   ;# -> better p4 performance
-        jnz .nb430nf_spinlock
-
-        ;# if(nn1>nri) nn1=nri
-        mov ecx, [rsp + nb430nf_nri]
-        mov edx, ecx
-        sub ecx, ebx
-        cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
-        ;# Cleared the spinlock if we got here.
-        ;# eax contains nn0, ebx contains nn1.
-        mov [rsp + nb430nf_n], eax
-        mov [rsp + nb430nf_nn1], ebx
-        sub ebx, eax                            ;# calc number of outer lists
-	mov esi, eax				;# copy n to esi
-        jg  .nb430nf_outerstart
-        jmp .nb430nf_end
-
-.nb430nf_outerstart:
-	;# ebx contains number of outer iterations
-	add ebx, [rsp + nb430nf_nouter]
-	mov [rsp + nb430nf_nouter], ebx
-
-.nb430nf_outer:
-	mov   rax, [rsp + nb430nf_shift]      ;# rax = pointer into shift[] 
-	mov   ebx, [rax+rsi*4]		;# rbx=shift[n] 
-	
-	lea   rbx, [rbx + rbx*2]    ;# rbx=3*is 
-	mov   [rsp + nb430nf_is3],ebx    	;# store is3 
-
-	mov   rax, [rsp + nb430nf_shiftvec]   ;# rax = base of shiftvec[] 
-
-	movsd xmm0, [rax + rbx*8]
-	movsd xmm1, [rax + rbx*8 + 8]
-	movsd xmm2, [rax + rbx*8 + 16] 
-
-	mov   rcx, [rsp + nb430nf_iinr]       ;# rcx = pointer into iinr[]
-	mov   ebx, [rcx+rsi*4]	    ;# ebx =ii 
-
-	mov   rdx, [rbp + nb430nf_charge]
-	movsd xmm3, [rdx + rbx*8]	
-	mulsd xmm3, [rsp + nb430nf_facel]
-	shufpd xmm3, xmm3, 0
-
-	mov   rdx, [rbp + nb430nf_invsqrta]	;# load invsqrta[ii]
-	movsd xmm4, [rdx + rbx*8]
-	shufpd xmm4, xmm4, 0
-
-    	mov   rdx, [rbp + nb430nf_type] 
-    	mov   edx, [rdx + rbx*4]
-    	imul  edx, [rsp + nb430nf_ntype]
-    	shl   edx, 1
-    	mov   [rsp + nb430nf_ntia], edx
-	
-	lea   rbx, [rbx + rbx*2]	;# rbx = 3*ii=ii3 
-	mov   rax, [rbp + nb430nf_pos]    ;# rax = base of pos[]  
-
-	addsd xmm0, [rax + rbx*8]
-	addsd xmm1, [rax + rbx*8 + 8]
-	addsd xmm2, [rax + rbx*8 + 16]
-
-	movapd [rsp + nb430nf_iq], xmm3
-	movapd [rsp + nb430nf_isai], xmm4	
-	
-	shufpd xmm0, xmm0, 0
-	shufpd xmm1, xmm1, 0
-	shufpd xmm2, xmm2, 0
-
-	movapd [rsp + nb430nf_ix], xmm0
-	movapd [rsp + nb430nf_iy], xmm1
-	movapd [rsp + nb430nf_iz], xmm2
-
-	mov   [rsp + nb430nf_ii3], ebx
-	
-	;# clear vctot
-	xorpd xmm4, xmm4
-	movapd [rsp + nb430nf_vctot], xmm4
-	movapd [rsp + nb430nf_Vvdwtot], xmm4
-
-	mov   rax, [rsp + nb430nf_jindex]
-	mov   ecx, [rax + rsi*4]	     ;# jindex[n] 
-	mov   edx, [rax + rsi*4 + 4]	     ;# jindex[n+1] 
-	sub   edx, ecx               ;# number of innerloop atoms 
-
-	mov   rsi, [rbp + nb430nf_pos]
-	mov   rdi, [rbp + nb430nf_faction]	
-	mov   rax, [rsp + nb430nf_jjnr]
-	shl   ecx, 2
-	add   rax, rcx
-	mov   [rsp + nb430nf_innerjjnr], rax     ;# pointer to jjnr[nj0] 
-	mov   ecx, edx
-	sub   edx,  2
-	add   ecx, [rsp + nb430nf_ninner]
-	mov   [rsp + nb430nf_ninner], ecx
-	add   edx, 0
-	mov   [rsp + nb430nf_innerk], edx    ;# number of innerloop atoms 
-	jge   .nb430nf_unroll_loop
-	jmp   .nb430nf_checksingle
-.nb430nf_unroll_loop:	
-	;# twice unrolled innerloop here 
-	mov   rdx, [rsp + nb430nf_innerjjnr]   ;# pointer to jjnr[k] 
-	mov   eax, [rdx]
-	mov   ebx, [rdx + 4]
-	add qword ptr [rsp + nb430nf_innerjjnr], 8	;# advance pointer (unrolled 2) 
-
-	;# load isaj
-	mov rsi, [rbp + nb430nf_invsqrta]
-	movlpd xmm2, [rsi + rax*8]
-	movhpd xmm2, [rsi + rbx*8]
-	mulpd  xmm2, [rsp + nb430nf_isai]
-	movapd [rsp + nb430nf_isaprod], xmm2	
-	movapd xmm1, xmm2
-	mulpd xmm1, [rsp + nb430nf_gbtsc]
-	movapd [rsp + nb430nf_gbscale], xmm1
-	
-	mov rsi, [rbp + nb430nf_charge]    ;# base of charge[] 
-	movlpd xmm3, [rsi + rax*8]
-	movhpd xmm3, [rsi + rbx*8]
-
-	mulpd xmm2, [rsp + nb430nf_iq]
-	mulpd  xmm3, xmm2
-	movapd [rsp + nb430nf_qq], xmm3	
-	
-	mov rsi, [rbp + nb430nf_type]
-	mov ecx, [rsi + rax*4]
-	mov edx, [rsi + rbx*4]
-	mov rsi, [rbp + nb430nf_vdwparam]
-	shl ecx, 1
-	shl edx, 1
-	mov edi, [rsp + nb430nf_ntia]
-	add ecx, edi
-	add edx, edi
-
-	movlpd xmm6, [rsi + rcx*8]	;# c6a
-	movlpd xmm7, [rsi + rdx*8]	;# c6b
-	movhpd xmm6, [rsi + rcx*8 + 8]	;# c6a c12a 
-	movhpd xmm7, [rsi + rdx*8 + 8]	;# c6b c12b 
-
-	movapd xmm4, xmm6
-	unpcklpd xmm4, xmm7
-	unpckhpd xmm6, xmm7
-	
-	movapd [rsp + nb430nf_c6], xmm4
-	movapd [rsp + nb430nf_c12], xmm6
-	
-	mov rsi, [rbp + nb430nf_pos]		;# base of pos[] 
-
-	lea   rax, [rax + rax*2]     ;# replace jnr with j3 
-	lea   rbx, [rbx + rbx*2]	
-
-	;# move two coordinates to xmm0-xmm2 
-	movlpd xmm0, [rsi + rax*8]
-	movlpd xmm1, [rsi + rax*8 + 8]
-	movlpd xmm2, [rsi + rax*8 + 16]
-	movhpd xmm0, [rsi + rbx*8]
-	movhpd xmm1, [rsi + rbx*8 + 8]
-	movhpd xmm2, [rsi + rbx*8 + 16]		
-
-	mov    rdi, [rbp + nb430nf_faction]
-	
-	;# move nb430nf_ix-iz to xmm4-xmm6 
-	movapd xmm4, [rsp + nb430nf_ix]
-	movapd xmm5, [rsp + nb430nf_iy]
-	movapd xmm6, [rsp + nb430nf_iz]
-
-	;# calc dr 
-	subpd xmm4, xmm0
-	subpd xmm5, xmm1
-	subpd xmm6, xmm2
-
-	;# square it 
-	mulpd xmm4,xmm4
-	mulpd xmm5,xmm5
-	mulpd xmm6,xmm6
-	addpd xmm4, xmm5
-	addpd xmm4, xmm6
-	;# rsq in xmm4 
-
-	cvtpd2ps xmm5, xmm4	
-	rsqrtps xmm5, xmm5
-	cvtps2pd xmm2, xmm5	;# lu in low xmm2 
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulpd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [rsp + nb430nf_three]
-	mulpd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb430nf_half]
-	subpd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulpd xmm1, xmm5	
-	mulpd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulpd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [rsp + nb430nf_three]
-	mulpd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb430nf_half]
-	subpd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulpd xmm2, xmm5	
-	mulpd xmm0, xmm2	;# xmm0=iter2 of rinv 
-	mulpd xmm4, xmm0	;# xmm4=r 
-	movapd [rsp + nb430nf_r], xmm4
-	mulpd xmm4, [rsp + nb430nf_gbscale]
-
-	cvttpd2pi mm6, xmm4	;# mm6 = lu idx 
-	cvtpi2pd xmm5, mm6
-	subpd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	movapd xmm2, xmm1	
-	mulpd  xmm2, xmm2	;# xmm2=eps2 
-	
-	pslld mm6, 2		;# idx *= 4 
-
-	mov  rsi, [rbp + nb430nf_GBtab]
-	movd ecx, mm6
-	psrlq mm6, 32
-	movd edx, mm6		;# indices in eax/ebx 
-
-	;# Coulomb 
-	movapd xmm4, [rsi + rcx*8]	;# Y1 F1 	
-	movapd xmm3, [rsi + rdx*8]	;# Y2 F2 
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 Y2 
-	unpckhpd xmm5, xmm3	;# F1 F2 
-
-	movapd xmm6, [rsi + rcx*8 + 16]	;# G1 H1 	
-	movapd xmm3, [rsi + rdx*8 + 16]	;# G2 H2 
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 G2 
-	unpckhpd xmm7, xmm3	;# H1 H2 
-	;# coulomb table ready, in xmm4-xmm7  		
-	mulpd  xmm6, xmm1	;# xmm6=Geps 
-	mulpd  xmm7, xmm2	;# xmm7=Heps2 
-	addpd  xmm5, xmm6
-	addpd  xmm5, xmm7	;# xmm5=Fp 	
-	movapd xmm3, [rsp + nb430nf_qq]
-	mulpd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addpd  xmm5, xmm4 ;# xmm5=VV 
-	mulpd  xmm5, xmm3 ;# vcoul=qq*VV  
-	addpd  xmm5, [rsp + nb430nf_vctot]
-	movapd [rsp + nb430nf_vctot], xmm5
-	
-	movapd xmm4, [rsp + nb430nf_r]
-	mulpd  xmm4, [rsp + nb430nf_tsc]
-	cvttpd2pi mm6, xmm4	;# mm6 = lu idx 
-	cvtpi2pd xmm5, mm6
-	subpd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	movapd xmm2, xmm1	
-	mulpd  xmm2, xmm2	;# xmm2=eps2 
-	
-	pslld mm6, 3		;# idx *= 8
-
-	mov  rsi, [rbp + nb430nf_VFtab]
-
-	movd ecx, mm6
-	psrlq mm6, 32
-	movd edx, mm6		;# indices in eax/ebx 
-
-	;# Dispersion 
-	movapd xmm4, [rsi + rcx*8]	;# Y1 F1 	
-	movapd xmm3, [rsi + rdx*8]	;# Y2 F2 
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 Y2 
-	unpckhpd xmm5, xmm3	;# F1 F2 
-
-	movapd xmm6, [rsi + rcx*8 + 16]	;# G1 H1 	
-	movapd xmm3, [rsi + rdx*8 + 16]	;# G2 H2 
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 G2 
-	unpckhpd xmm7, xmm3	;# H1 H2 
-	;# Dispersion table ready, in xmm4-xmm7  		
-	mulpd  xmm6, xmm1	;# xmm6=Geps 
-	mulpd  xmm7, xmm2	;# xmm7=Heps2 
-	addpd  xmm5, xmm6
-	addpd  xmm5, xmm7	;# xmm5=Fp 	
-	mulpd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addpd  xmm5, xmm4 ;# xmm5=VV 
-
-	mulpd  xmm5, [rsp + nb430nf_c6]	 ;# Vvdw6
-	addpd  xmm5, [rsp + nb430nf_Vvdwtot]
-	movapd [rsp + nb430nf_Vvdwtot], xmm5
-
-	;# Repulsion 
-	movapd xmm4, [rsi + rcx*8 + 32]	;# Y1 F1 	
-	movapd xmm3, [rsi + rdx*8 + 32]	;# Y2 F2 
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 Y2 
-	unpckhpd xmm5, xmm3	;# F1 F2 
-
-	movapd xmm6, [rsi + rcx*8 + 48]	;# G1 H1 	
-	movapd xmm3, [rsi + rdx*8 + 48]	;# G2 H2 
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 G2 
-	unpckhpd xmm7, xmm3	;# H1 H2 
-	;# Dispersion table ready, in xmm4-xmm7  		
-	mulpd  xmm6, xmm1	;# xmm6=Geps 
-	mulpd  xmm7, xmm2	;# xmm7=Heps2 
-	addpd  xmm5, xmm6
-	addpd  xmm5, xmm7	;# xmm5=Fp 	
-	mulpd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addpd  xmm5, xmm4 ;# xmm5=VV 
-
-	mulpd  xmm5, [rsp + nb430nf_c12] ;# Vvdw12 
-	addpd  xmm5, [rsp + nb430nf_Vvdwtot]
-	movapd [rsp + nb430nf_Vvdwtot], xmm5
-	xorpd  xmm4, xmm4
-	
-	;# should we do one more iteration? 
-	sub dword ptr [rsp + nb430nf_innerk],  2
-	jl    .nb430nf_checksingle
-	jmp   .nb430nf_unroll_loop
-.nb430nf_checksingle:
-	mov   edx, [rsp + nb430nf_innerk]
-	and   edx, 1
-	jnz    .nb430nf_dosingle
-	jmp    .nb430nf_updateouterdata
-.nb430nf_dosingle:
-	mov rsi, [rbp + nb430nf_charge]
-	mov rdx, [rbp + nb430nf_invsqrta]
-	mov rdi, [rbp + nb430nf_pos]
-	mov   rcx, [rsp + nb430nf_innerjjnr]
-	mov   eax, [rcx]	
-
-	xorpd  xmm6, xmm6
-	movapd xmm7, xmm6
-	movsd  xmm7, [rdx + rax*8]
-	movlpd xmm6, [rsi + rax*8]	;# xmm6(0) has the charge
-	mulsd  xmm7, [rsp + nb430nf_isai]
-	movapd [rsp + nb430nf_isaprod], xmm7
-	movapd xmm1, xmm7
-	mulpd xmm1, [rsp + nb430nf_gbtsc]
-	movapd [rsp + nb430nf_gbscale], xmm1
-	
-	mulsd  xmm7, [rsp + nb430nf_iq]
-	mulsd  xmm6, xmm7
-	movapd [rsp + nb430nf_qq], xmm6
-	
-	mov rsi, [rbp + nb430nf_type]
-	mov edx, [rsi + rax*4]
-	mov rsi, [rbp + nb430nf_vdwparam]
-	shl edx, 1
-	mov edi, [rsp + nb430nf_ntia]
-	add edx, edi
-
-	movlpd xmm6, [rsi + rdx*8]	;# c6a
-	movhpd xmm6, [rsi + rdx*8 + 8]	;# c6a c12a 
-
-	xorpd xmm7, xmm7
-	movapd xmm4, xmm6
-	unpcklpd xmm4, xmm7
-	unpckhpd xmm6, xmm7
-	
-	movapd [rsp + nb430nf_c6], xmm4
-	movapd [rsp + nb430nf_c12], xmm6
-	
-	mov rsi, [rbp + nb430nf_pos]		;# base of pos[] 
-
-	lea   rax, [rax + rax*2]     ;# replace jnr with j3 
-
-	;# move two coordinates to xmm0-xmm2 
-	movlpd xmm0, [rsi + rax*8]
-	movlpd xmm1, [rsi + rax*8 + 8]
-	movlpd xmm2, [rsi + rax*8 + 16]
-
-	mov    rdi, [rbp + nb430nf_faction]
-
-	;# move nb430nf_ix-iz to xmm4-xmm6 
-	movapd xmm4, [rsp + nb430nf_ix]
-	movapd xmm5, [rsp + nb430nf_iy]
-	movapd xmm6, [rsp + nb430nf_iz]
-
-	;# calc dr 
-	subsd xmm4, xmm0
-	subsd xmm5, xmm1
-	subsd xmm6, xmm2
-
-	;# square it 
-	mulsd xmm4,xmm4
-	mulsd xmm5,xmm5
-	mulsd xmm6,xmm6
-	addsd xmm4, xmm5
-	addsd xmm4, xmm6
-	;# rsq in xmm4 
-
-	cvtsd2ss xmm5, xmm4	
-	rsqrtss xmm5, xmm5
-	cvtss2sd xmm2, xmm5	;# lu in low xmm2 
-
-	;# lookup seed in xmm2 
-	movapd xmm5, xmm2	;# copy of lu 
-	mulsd xmm2, xmm2	;# lu*lu 
-	movapd xmm1, [rsp + nb430nf_three]
-	mulsd xmm2, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb430nf_half]
-	subsd xmm1, xmm2	;# 30-rsq*lu*lu 
-	mulsd xmm1, xmm5	
-	mulsd xmm1, xmm0	;# xmm0=iter1 of rinv (new lu) 
-
-	movapd xmm5, xmm1	;# copy of lu 
-	mulsd xmm1, xmm1	;# lu*lu 
-	movapd xmm2, [rsp + nb430nf_three]
-	mulsd xmm1, xmm4	;# rsq*lu*lu 			
-	movapd xmm0, [rsp + nb430nf_half]
-	subsd xmm2, xmm1	;# 30-rsq*lu*lu 
-	mulsd xmm2, xmm5	
-	mulsd xmm0, xmm2	;# xmm0=iter2 of rinv (new lu) 
-	mulsd xmm4, xmm0	;# xmm4=r 
-	movsd [rsp + nb430nf_r], xmm4
-	mulsd xmm4, [rsp + nb430nf_gbscale]
-	
-	cvttsd2si edx, xmm4	;# mm6 = lu idx 
-	cvtsi2sd xmm5, edx
-	subsd xmm4, xmm5
-	movapd xmm1, xmm4	;# xmm1=eps 
-	movapd xmm2, xmm1	
-	mulsd  xmm2, xmm2	;# xmm2=eps2 
-	
-	shl edx, 2		;# idx *= 4 
-	mov  rsi, [rbp + nb430nf_GBtab]
-
-	;# Coulomb 
-	movapd xmm4, [rsi + rdx*8]	;# Y1 F1 	
-	xorpd xmm3, xmm3
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 
-	unpckhpd xmm5, xmm3	;# F1 
-
-	movapd xmm6, [rsi + rdx*8 + 16]	;# G1 H1 	
-	xorpd xmm3, xmm3
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 
-	unpckhpd xmm7, xmm3	;# H1 
-	;# coulomb table ready, in xmm4-xmm7  		
-	mulsd  xmm6, xmm1	;# xmm6=Geps 
-	mulsd  xmm7, xmm2	;# xmm7=Heps2 
-	addsd  xmm5, xmm6
-	addsd  xmm5, xmm7	;# xmm5=Fp 	
-	movapd xmm3, [rsp + nb430nf_qq]
-	mulsd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addsd  xmm5, xmm4 ;# xmm5=VV 
-	mulsd  xmm5, xmm3 ;# vcoul=qq*VV  
-	addsd  xmm5, [rsp + nb430nf_vctot]
-	movsd [rsp + nb430nf_vctot], xmm5 
-
-	movsd xmm4, [rsp + nb430nf_r]
-	mulsd  xmm4, [rsp + nb430nf_tsc]
-	cvttsd2si edx, xmm4	;# mm6 = lu idx 
-	cvtsi2sd xmm5, edx
-	subsd xmm4, xmm5
-	movsd xmm1, xmm4	;# xmm1=eps 
-	movsd xmm2, xmm1	
-	mulsd  xmm2, xmm2	;# xmm2=eps2
-
-	shl edx, 3
-
-	mov  rsi, [rbp + nb430nf_VFtab]
-
-	;# Dispersion 
-	movapd xmm4, [rsi + rdx*8]	;# Y1 F1 	
-	xorpd xmm3, xmm3
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 
-	unpckhpd xmm5, xmm3	;# F1 
-
-	movapd xmm6, [rsi + rdx*8 + 16]	;# G1 H1 	
-	xorpd xmm3, xmm3
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 
-	unpckhpd xmm7, xmm3	;# H1 
-	;# Dispersion table ready, in xmm4-xmm7  		
-	mulsd  xmm6, xmm1	;# xmm6=Geps 
-	mulsd  xmm7, xmm2	;# xmm7=Heps2 
-	addsd  xmm5, xmm6
-	addsd  xmm5, xmm7	;# xmm5=Fp
-	mulsd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addsd  xmm5, xmm4 ;# xmm5=VV 
-
-	mulsd  xmm5, [rsp + nb430nf_c6]	 ;# Vvdw6
-	addsd  xmm5, [rsp + nb430nf_Vvdwtot]
-	movlpd [rsp + nb430nf_Vvdwtot], xmm5
-
-	;# Repulsion 
-	movapd xmm4, [rsi + rdx*8 + 32]	;# Y1 F1 	
-	xorpd xmm3, xmm3
-	movapd xmm5, xmm4
-	unpcklpd xmm4, xmm3	;# Y1 
-	unpckhpd xmm5, xmm3	;# F1 
-
-	movapd xmm6, [rsi + rdx*8 + 48]	;# G1 H1 	
-	xorpd xmm3, xmm3
-	movapd xmm7, xmm6
-	unpcklpd xmm6, xmm3	;# G1 
-	unpckhpd xmm7, xmm3	;# H1 
-	;# Dispersion table ready, in xmm4-xmm7  		
-	mulsd  xmm6, xmm1	;# xmm6=Geps 
-	mulsd  xmm7, xmm2	;# xmm7=Heps2 
-	addsd  xmm5, xmm6
-	addsd  xmm5, xmm7	;# xmm5=Fp 	
-	mulsd  xmm5, xmm1 ;# xmm5=eps*Fp 
-	addsd  xmm5, xmm4 ;# xmm5=VV 
-	mulsd  xmm5, [rsp + nb430nf_c12] ;# Vvdw12 
-	addsd  xmm5, [rsp + nb430nf_Vvdwtot]
-	movlpd [rsp + nb430nf_Vvdwtot], xmm5
-.nb430nf_updateouterdata:
-	;# get n from stack
-	mov esi, [rsp + nb430nf_n]
-        ;# get group index for i particle 
-        mov   rdx, [rbp + nb430nf_gid]      	;# base of gid[]
-        mov   edx, [rdx + rsi*4]		;# ggid=gid[n]
-
-	;# accumulate total potential energy and update it 
-	movapd xmm7, [rsp + nb430nf_vctot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addsd  xmm7, xmm6	;# low xmm7 has the sum now 
-
-	;# add earlier value from mem 
-	mov   rax, [rbp + nb430nf_Vc]
-	addsd xmm7, [rax + rdx*8] 
-	;# move back to mem 
-	movsd [rax + rdx*8], xmm7 
-	
-	;# accumulate total lj energy and update it 
-	movapd xmm7, [rsp + nb430nf_Vvdwtot]
-	;# accumulate 
-	movhlps xmm6, xmm7
-	addsd  xmm7, xmm6	;# low xmm7 has the sum now 
-	
-	;# add earlier value from mem 
-	mov   rax, [rbp + nb430nf_Vvdw]
-	addsd xmm7, [rax + rdx*8] 
-	;# move back to mem 
-	movsd [rax + rdx*8], xmm7 
-	
-        ;# finish if last 
-        mov ecx, [rsp + nb430nf_nn1]
-	;# esi already loaded with n
-	inc esi
-        sub ecx, esi
-        jz .nb430nf_outerend
-
-        ;# not last, iterate outer loop once more!  
-        mov [rsp + nb430nf_n], esi
-        jmp .nb430nf_outer
-.nb430nf_outerend:
-        ;# check if more outer neighborlists remain
-        mov   ecx, [rsp + nb430nf_nri]
-	;# esi already loaded with n above
-        sub   ecx, esi
-        jz .nb430nf_end
-        ;# non-zero, do one more workunit
-        jmp   .nb430nf_threadloop
-.nb430nf_end:
-	mov eax, [rsp + nb430nf_nouter]
-	mov ebx, [rsp + nb430nf_ninner]
-	mov rcx, [rbp + nb430nf_outeriter]
-	mov rdx, [rbp + nb430nf_inneriter]
-	mov [rcx], eax
-	mov [rdx], ebx
-
-	add rsp, 392
-	emms
-
-
-        pop r15
-        pop r14
-        pop r13
-        pop r12
-
-	pop rbx
-	pop	rbp
-	ret
-
-
-
diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel430_x86_64_sse2.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel430_x86_64_sse2.s
deleted file mode 100644
index 56360fe61e..0000000000
--- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel430_x86_64_sse2.s
+++ /dev/null
@@ -1,1640 +0,0 @@
-##
-##
-## Gromacs 4.0                         Copyright (c) 1991-2003 
-## David van der Spoel, Erik Lindahl
-##
-## This program is free software; you can redistribute it and/or
-## modify it under the terms of the GNU General Public License
-## as published by the Free Software Foundation; either version 2
-## of the License, or (at your option) any later version.
-##
-## To help us fund GROMACS development, we humbly ask that you cite
-## the research papers on the package. Check out http://www.gromacs.org
-## 
-## And Hey:
-## Gnomes, ROck Monsters And Chili Sauce
-##
-
-
-
-
-.globl nb_kernel430_x86_64_sse2         ## kernel 430, x86-64 SSE2, double precision: table-based GB coulomb + LJ, with forces
-.globl _nb_kernel430_x86_64_sse2        ## same entry point, exported with a leading underscore as well
-nb_kernel430_x86_64_sse2:       
-_nb_kernel430_x86_64_sse2:      
-## Offsets from rbp to the stack-passed arguments; bytes 0-15 hold the saved rbp and the return address
-.set nb430_fshift, 16
-.set nb430_gid, 24
-.set nb430_pos, 32
-.set nb430_faction, 40
-.set nb430_charge, 48
-.set nb430_p_facel, 56
-.set nb430_argkrf, 64
-.set nb430_argcrf, 72
-.set nb430_Vc, 80
-.set nb430_type, 88
-.set nb430_p_ntype, 96
-.set nb430_vdwparam, 104
-.set nb430_Vvdw, 112
-.set nb430_p_tabscale, 120
-.set nb430_VFtab, 128
-.set nb430_invsqrta, 136
-.set nb430_dvda, 144
-.set nb430_p_gbtabscale, 152
-.set nb430_GBtab, 160
-.set nb430_p_nthreads, 168
-.set nb430_count, 176
-.set nb430_mtx, 184
-.set nb430_outeriter, 192
-.set nb430_inneriter, 200
-.set nb430_work, 208
-        ## stack offsets (from rsp) for local variables
-        ## the 16-byte slots come first; they are accessed with movapd, which needs 16-byte alignment
-.set nb430_ix, 0
-.set nb430_iy, 16
-.set nb430_iz, 32
-.set nb430_iq, 48
-.set nb430_dx, 64
-.set nb430_dy, 80
-.set nb430_dz, 96
-.set nb430_eps, 112
-.set nb430_gbtsc, 128
-.set nb430_tsc, 144
-.set nb430_qq, 160
-.set nb430_c6, 176
-.set nb430_c12, 192
-.set nb430_epsgb, 208
-.set nb430_vctot, 224
-.set nb430_Vvdwtot, 240
-.set nb430_fix, 256
-.set nb430_fiy, 272
-.set nb430_fiz, 288
-.set nb430_half, 304
-.set nb430_three, 320
-.set nb430_r, 336
-.set nb430_isai, 352
-.set nb430_isaprod, 368
-.set nb430_dvdasum, 384
-.set nb430_gbscale, 400
-.set nb430_rinv, 416
-.set nb430_nri, 432
-.set nb430_iinr, 440
-.set nb430_jindex, 448
-.set nb430_jjnr, 456
-.set nb430_shift, 464
-.set nb430_shiftvec, 472
-.set nb430_facel, 480
-.set nb430_innerjjnr, 488
-.set nb430_ii, 496
-.set nb430_is3, 500
-.set nb430_ii3, 504
-.set nb430_ntia, 508
-.set nb430_innerk, 512
-.set nb430_n, 516
-.set nb430_nn1, 520
-.set nb430_ntype, 524
-.set nb430_nouter, 528
-.set nb430_ninner, 532
-
-        push %rbp
-        movq %rsp,%rbp          ## rbp is the frame pointer used to reach the stack-passed arguments
-        push %rbx
-
-
-        emms                    ## reset MMX state; mm0/mm1 are used for table-index conversion in the inner loop
-
-        push %r12
-        push %r13
-        push %r14
-        push %r15
-
-        subq $536,%rsp          ## local variable stack space (33*16+8 bytes)
-
-        ## zero 32-bit iteration counters
-        movl $0,%eax
-        movl %eax,nb430_nouter(%rsp)
-        movl %eax,nb430_ninner(%rsp)
-
-        movl (%rdi),%edi        ## first register argument points to nri
-        movl %edi,nb430_nri(%rsp)
-        movq %rsi,nb430_iinr(%rsp)
-        movq %rdx,nb430_jindex(%rsp)
-        movq %rcx,nb430_jjnr(%rsp)
-        movq %r8,nb430_shift(%rsp)
-        movq %r9,nb430_shiftvec(%rsp)
-        movq nb430_p_ntype(%rbp),%rdi
-        movl (%rdi),%edi
-        movl %edi,nb430_ntype(%rsp)
-        movq nb430_p_facel(%rbp),%rsi
-        movsd (%rsi),%xmm0
-        movsd %xmm0,nb430_facel(%rsp)
-
-        movq nb430_p_tabscale(%rbp),%rax
-        movsd (%rax),%xmm3
-        shufpd $0,%xmm3,%xmm3   ## copy tabscale to both lanes
-        movapd %xmm3,nb430_tsc(%rsp)
-
-        movq nb430_p_gbtabscale(%rbp),%rbx
-        movsd (%rbx),%xmm4
-        shufpd $0,%xmm4,%xmm4   ## copy gbtabscale to both lanes
-        movapd %xmm4,nb430_gbtsc(%rsp)
-
-        ## create constant floating-point factors on stack
-        movl $0x00000000,%eax   ## low 32 bits of 0.5 as an IEEE double
-        movl $0x3fe00000,%ebx   ## high 32 bits of 0.5 as an IEEE double
-        movl %eax,nb430_half(%rsp)
-        movl %ebx,nb430_half+4(%rsp)
-        movsd nb430_half(%rsp),%xmm1
-        shufpd $0,%xmm1,%xmm1  ## copy 0.5 to both lanes
-        movapd %xmm1,%xmm3
-        addpd  %xmm3,%xmm3      ## 0.5+0.5 = one
-        movapd %xmm3,%xmm2
-        addpd  %xmm2,%xmm2      ## one+one = two
-        addpd  %xmm2,%xmm3      ## one+two = three
-        movapd %xmm1,nb430_half(%rsp)
-        movapd %xmm3,nb430_three(%rsp)
-
-_nb_kernel430_x86_64_sse2.nb430_threadloop: ## claim the next work unit by atomically advancing the shared counter
-        movq  nb430_count(%rbp),%rsi            ## pointer to sync counter
-        movl  (%rsi),%eax
-_nb_kernel430_x86_64_sse2.nb430_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+1
-        lock 
-        cmpxchgl %ebx,(%rsi)                    ## write nn1 to *counter,
-                                                ## if it hasn't changed.
-                                                ## or reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel430_x86_64_sse2.nb430_spinlock ## ZF clear: the exchange failed, retry with the value now in eax
-
-        ## if(nn1>nri) nn1=nri
-        movl nb430_nri(%rsp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx                          ## ecx=nri-nn1, only the flags are used
-        cmovlel %edx,%ebx                       ## if(nn1>=nri) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb430_n(%rsp)
-        movl %ebx,nb430_nn1(%rsp)
-        subl %eax,%ebx                          ## calc number of outer lists
-        movl %eax,%esi                          ## copy n to esi (mov leaves the flags from subl intact)
-        jg  _nb_kernel430_x86_64_sse2.nb430_outerstart ## nn1-nn0 > 0: there is work to do
-        jmp _nb_kernel430_x86_64_sse2.nb430_end
-
-_nb_kernel430_x86_64_sse2.nb430_outerstart: 
-        ## ebx contains the number of outer iterations in this work unit; add it to the nouter total
-        addl nb430_nouter(%rsp),%ebx
-        movl %ebx,nb430_nouter(%rsp)
-
-_nb_kernel430_x86_64_sse2.nb430_outer: 
-        movq  nb430_shift(%rsp),%rax        ## rax = pointer into shift[] 
-        movl  (%rax,%rsi,4),%ebx        ## ebx=shift[n] 
-
-        lea  (%rbx,%rbx,2),%rbx    ## rbx=3*is 
-        movl  %ebx,nb430_is3(%rsp)      ## store is3 
-
-        movq  nb430_shiftvec(%rsp),%rax     ## rax = base of shiftvec[] 
-
-        movsd (%rax,%rbx,8),%xmm0           ## shift vector x
-        movsd 8(%rax,%rbx,8),%xmm1          ## shift vector y
-        movsd 16(%rax,%rbx,8),%xmm2         ## shift vector z
-
-        movq  nb430_iinr(%rsp),%rcx         ## rcx = pointer into iinr[]
-        movl  (%rcx,%rsi,4),%ebx    ## ebx =ii 
-        movl  %ebx,nb430_ii(%rsp)
-
-        movq  nb430_charge(%rbp),%rdx
-        movsd (%rdx,%rbx,8),%xmm3
-        mulsd nb430_facel(%rsp),%xmm3       ## iq = facel*charge[ii]
-        shufpd $0,%xmm3,%xmm3
-
-        movq  nb430_invsqrta(%rbp),%rdx         ## load invsqrta[ii]
-        movsd (%rdx,%rbx,8),%xmm4
-        shufpd $0,%xmm4,%xmm4
-
-        movq  nb430_type(%rbp),%rdx
-        movl  (%rdx,%rbx,4),%edx
-        imull nb430_ntype(%rsp),%edx
-        shll  %edx                      ## ntia = 2*ntype*type[ii]
-        movl  %edx,nb430_ntia(%rsp)
-
-        lea  (%rbx,%rbx,2),%rbx        ## rbx = 3*ii=ii3 
-        movq  nb430_pos(%rbp),%rax      ## rax = base of pos[]  
-
-        addsd (%rax,%rbx,8),%xmm0       ## ix = shift x + pos[ii3]
-        addsd 8(%rax,%rbx,8),%xmm1      ## iy = shift y + pos[ii3+1]
-        addsd 16(%rax,%rbx,8),%xmm2     ## iz = shift z + pos[ii3+2]
-
-        movapd %xmm3,nb430_iq(%rsp)
-        movapd %xmm4,nb430_isai(%rsp)
-
-        shufpd $0,%xmm0,%xmm0           ## copy the i coordinates to both lanes
-        shufpd $0,%xmm1,%xmm1
-        shufpd $0,%xmm2,%xmm2
-
-        movapd %xmm0,nb430_ix(%rsp)
-        movapd %xmm1,nb430_iy(%rsp)
-        movapd %xmm2,nb430_iz(%rsp)
-
-        movl  %ebx,nb430_ii3(%rsp)
-
-        ## clear vctot, Vvdwtot, dvdasum and the i forces 
-        xorpd %xmm4,%xmm4
-        movapd %xmm4,nb430_vctot(%rsp)
-        movapd %xmm4,nb430_Vvdwtot(%rsp)
-        movapd %xmm4,nb430_dvdasum(%rsp)
-        movapd %xmm4,nb430_fix(%rsp)
-        movapd %xmm4,nb430_fiy(%rsp)
-        movapd %xmm4,nb430_fiz(%rsp)
-
-        movq  nb430_jindex(%rsp),%rax
-        movl  (%rax,%rsi,4),%ecx             ## jindex[n] 
-        movl  4(%rax,%rsi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movq  nb430_pos(%rbp),%rsi
-        movq  nb430_faction(%rbp),%rdi
-        movq  nb430_jjnr(%rsp),%rax
-        shll  $2,%ecx                ## jindex[n]*4 = byte offset into jjnr[]
-        addq  %rcx,%rax
-        movq  %rax,nb430_innerjjnr(%rsp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $2,%edx                ## bias the counter by the unroll factor
-        addl  nb430_ninner(%rsp),%ecx
-        movl  %ecx,nb430_ninner(%rsp)
-        addl  $0,%edx                ## set the flags from edx again for the jge below
-        movl  %edx,nb430_innerk(%rsp)      ## innerk = number of innerloop atoms - 2 
-        jge   _nb_kernel430_x86_64_sse2.nb430_unroll_loop   ## at least two j atoms left
-        jmp   _nb_kernel430_x86_64_sse2.nb430_checksingle
-_nb_kernel430_x86_64_sse2.nb430_unroll_loop: 
-        ## twice unrolled innerloop here: two j atoms per pass, one in each xmm lane 
-        movq  nb430_innerjjnr(%rsp),%rdx     ## pointer to jjnr[k] 
-        movl  (%rdx),%eax                    ## eax = first j index
-        movl  4(%rdx),%ebx                   ## ebx = second j index
-        addq $8,nb430_innerjjnr(%rsp)                   ## advance pointer (unrolled 2) 
-
-
-        movq nb430_pos(%rbp),%rsi               ## base of pos[] 
-
-        lea  (%rax,%rax,2),%r10     ## j3 
-        lea  (%rbx,%rbx,2),%r11     ## j3 for the second j atom
-
-        ## move two coordinates to xmm4-xmm6 
-        movlpd (%rsi,%r10,8),%xmm4
-        movlpd 8(%rsi,%r10,8),%xmm5
-        movlpd 16(%rsi,%r10,8),%xmm6
-        movhpd (%rsi,%r11,8),%xmm4
-        movhpd 8(%rsi,%r11,8),%xmm5
-        movhpd 16(%rsi,%r11,8),%xmm6
-
-        ## calc dr 
-        subpd nb430_ix(%rsp),%xmm4
-        subpd nb430_iy(%rsp),%xmm5
-        subpd nb430_iz(%rsp),%xmm6
-
-        ## store dr 
-        movapd %xmm4,nb430_dx(%rsp)
-        movapd %xmm5,nb430_dy(%rsp)
-        movapd %xmm6,nb430_dz(%rsp)
-
-        ## square it 
-        mulpd %xmm4,%xmm4
-        mulpd %xmm5,%xmm5
-        mulpd %xmm6,%xmm6
-        addpd %xmm5,%xmm4
-        addpd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        ## load isaj
-        movq nb430_invsqrta(%rbp),%rsi
-        movlpd (%rsi,%rax,8),%xmm3
-        movhpd (%rsi,%rbx,8),%xmm3
-        mulpd  nb430_isai(%rsp),%xmm3        ## isaprod = isai*isaj
-        movapd %xmm3,nb430_isaprod(%rsp)
-        movapd %xmm3,%xmm6
-        mulpd nb430_gbtsc(%rsp),%xmm3        ## gbscale = isaprod*gbtabscale
-        movapd %xmm3,nb430_gbscale(%rsp)
-
-    ## invsqrt: single-precision estimate, refined below in double precision
-        cvtpd2ps %xmm4,%xmm5
-        rsqrtps %xmm5,%xmm5
-        cvtps2pd %xmm5,%xmm2    ## lu in xmm2 (both lanes) 
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulpd %xmm2,%xmm2       ## lu*lu 
-        movapd nb430_three(%rsp),%xmm1
-        mulpd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb430_half(%rsp),%xmm0
-        subpd %xmm2,%xmm1       ## 3.0-rsq*lu*lu 
-        mulpd %xmm5,%xmm1
-        mulpd %xmm0,%xmm1       ## xmm1=iter1 of rinv (new lu) 
-
-    mulpd  nb430_iq(%rsp),%xmm6          ## xmm6 = isaprod*iq
-        movq nb430_charge(%rbp),%rsi     ## base of charge[] 
-        movlpd (%rsi,%rax,8),%xmm3
-        movhpd (%rsi,%rbx,8),%xmm3
-        mulpd  %xmm6,%xmm3               ## qq = isaprod*iq*charge[j]
-        movapd %xmm3,nb430_qq(%rsp)
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulpd %xmm1,%xmm1       ## lu*lu 
-        movapd nb430_three(%rsp),%xmm2
-        mulpd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb430_half(%rsp),%xmm0
-        subpd %xmm1,%xmm2       ## 3.0-rsq*lu*lu 
-        mulpd %xmm5,%xmm2
-        mulpd %xmm2,%xmm0       ## xmm0=iter2 of rinv 
-        mulpd %xmm0,%xmm4       ## xmm4=r (rsq*rinv) 
-        movapd %xmm4,nb430_r(%rsp)
-        movapd %xmm0,nb430_rinv(%rsp)
-
-        movq nb430_type(%rbp),%rsi
-        movl (%rsi,%rax,4),%r8d
-        movl (%rsi,%rbx,4),%r9d
-        shll %r8d
-        shll %r9d
-        movl nb430_ntia(%rsp),%edi
-        addl %edi,%r8d               ## r8 = ntia + 2*type[j], first j atom
-        addl %edi,%r9d               ## r9 = ntia + 2*type[j], second j atom
-
-    movapd %xmm4,%xmm8 ## r
-        mulpd nb430_gbscale(%rsp),%xmm4      ## r*gbscale
-        mulpd nb430_tsc(%rsp),%xmm8          ## r*tabscale
-
-    ## truncate and convert to integers
-    cvttpd2pi %xmm4,%mm0 ## gb
-    cvttpd2pi %xmm8,%mm1 ## lj
-
-    ## convert back to double
-    cvtpi2pd  %mm0,%xmm6  ## gb
-    cvtpi2pd  %mm1,%xmm10 ## lj
-
-    ## multiply by 4 and 8, respectively
-    pslld   $2,%mm0  ## gb
-    pslld   $3,%mm1  ## lj
-
-    ## move to integer registers
-    movd    %mm0,%r12d      ## gb
-    movd    %mm1,%r14d     ## lj
-        psrlq $32,%mm0
-        psrlq $32,%mm1
-    movd    %mm0,%r13d     ## gb
-    movd    %mm1,%r15d    ## lj
-    ## GB indices: r12-r13   LJ indices: r14-r15
-
-    ## calculate eps
-    subpd     %xmm6,%xmm4  ## gb
-    subpd     %xmm10,%xmm8 ## lj
-    movapd    %xmm4,nb430_epsgb(%rsp)   ## gb eps
-    movapd    %xmm8,nb430_eps(%rsp)   ## lj eps
-
-        movq nb430_GBtab(%rbp),%rsi
-        movq nb430_VFtab(%rbp),%rdi
-
-    ## load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11
-    movapd (%rsi,%r12,8),%xmm0         ## Y1c F1c
-    movapd (%rsi,%r13,8),%xmm12        ## Y2c F2c
-    movapd (%rdi,%r14,8),%xmm4         ## Y1d F1d
-    movapd (%rdi,%r15,8),%xmm13        ## Y2d F2d
-    movapd 32(%rdi,%r14,8),%xmm8       ## Y1r F1r
-    movapd 32(%rdi,%r15,8),%xmm14      ## Y2r F2r
-        movapd %xmm0,%xmm1
-        movapd %xmm4,%xmm5
-        movapd %xmm8,%xmm9
-        unpcklpd %xmm12,%xmm0   ## Y1c Y2c 
-        unpckhpd %xmm12,%xmm1   ## F1c F2c 
-        unpcklpd %xmm13,%xmm4   ## Y1d Y2d 
-        unpckhpd %xmm13,%xmm5   ## F1d F2d 
-        unpcklpd %xmm14,%xmm8   ## Y1r Y2r 
-        unpckhpd %xmm14,%xmm9   ## F1r F2r 
-
-    movapd 16(%rsi,%r12,8),%xmm2       ## G1c H1c
-    movapd 16(%rsi,%r13,8),%xmm12      ## G2c H2c
-    movapd 16(%rdi,%r14,8),%xmm6       ## G1d H1d
-    movapd 16(%rdi,%r15,8),%xmm13      ## G2d H2d
-    movapd 48(%rdi,%r14,8),%xmm10      ## G1r H1r
-    movapd 48(%rdi,%r15,8),%xmm14      ## G2r H2r
-        movapd %xmm2,%xmm3
-        movapd %xmm6,%xmm7
-        movapd %xmm10,%xmm11
-        unpcklpd %xmm12,%xmm2   ## G1c G2c 
-        unpckhpd %xmm12,%xmm3   ## H1c H2c 
-        unpcklpd %xmm13,%xmm6   ## G1d G2d 
-        unpckhpd %xmm13,%xmm7   ## H1d H2d 
-        unpcklpd %xmm14,%xmm10  ## G1r G2r 
-        unpckhpd %xmm14,%xmm11  ## H1r H2r 
-    ## table data ready. Coul GB in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11
-    movq nb430_vdwparam(%rbp),%rdi
-
-    movapd nb430_epsgb(%rsp),%xmm12
-    movapd nb430_eps(%rsp),%xmm13
-
-    mulpd  %xmm12,%xmm3  ## Heps
-    mulpd  %xmm13,%xmm7
-    mulpd  %xmm13,%xmm11
-    mulpd  %xmm12,%xmm2    ## Geps
-    mulpd  %xmm13,%xmm6
-    mulpd  %xmm13,%xmm10
-    mulpd  %xmm12,%xmm3  ## Heps2
-    mulpd  %xmm13,%xmm7
-    mulpd  %xmm13,%xmm11
-
-    movlpd (%rdi,%r8,8),%xmm14         ## c6, first j atom
-    movlpd 8(%rdi,%r8,8),%xmm15        ## c12, first j atom
-
-    addpd  %xmm2,%xmm1  ## F+Geps
-    addpd  %xmm6,%xmm5
-    addpd  %xmm10,%xmm9
-    addpd  %xmm3,%xmm1  ## F+Geps+Heps2 = Fp
-    addpd  %xmm7,%xmm5
-    addpd  %xmm11,%xmm9
-    addpd  %xmm3,%xmm3   ## 2*Heps2
-    addpd  %xmm7,%xmm7
-    addpd  %xmm11,%xmm11
-    movhpd (%rdi,%r9,8),%xmm14         ## c6, second j atom
-    movhpd 8(%rdi,%r9,8),%xmm15        ## c12, second j atom
-
-    addpd  %xmm2,%xmm3   ## 2*Heps2+Geps
-    addpd  %xmm6,%xmm7
-    addpd  %xmm10,%xmm11
-    addpd  %xmm1,%xmm3  ## FF = Fp + 2*Heps2 + Geps
-    addpd  %xmm5,%xmm7
-    addpd  %xmm9,%xmm11
-    mulpd  %xmm12,%xmm1  ## eps*Fp
-    mulpd  %xmm13,%xmm5
-    mulpd  %xmm13,%xmm9
-    addpd  %xmm0,%xmm1    ## VV
-    addpd  %xmm4,%xmm5
-    addpd  %xmm8,%xmm9
-    mulpd  nb430_qq(%rsp),%xmm1     ## VV*qq = vcoul
-    mulpd  %xmm14,%xmm5  ## vnb6
-    mulpd  %xmm15,%xmm9  ## vnb12
-    mulpd  nb430_qq(%rsp),%xmm3      ## FF*qq = fij
-    mulpd  %xmm14,%xmm7  ## fijD
-    mulpd  %xmm15,%xmm11  ##fijR
-
-    addpd  %xmm7,%xmm11 ## fijD+fijR
-    mulpd  nb430_tsc(%rsp),%xmm11   ## (fijD+fijR)*tabscale
-
-    ## accumulate Vvdwtot
-    addpd  nb430_Vvdwtot(%rsp),%xmm5
-    addpd  %xmm9,%xmm5
-    movapd %xmm5,nb430_Vvdwtot(%rsp)
-
-        movq nb430_dvda(%rbp),%rsi
-
-        ## Calculate dVda
-        mulpd nb430_gbscale(%rsp),%xmm3     ## fijC=qq*FF*gbscale
-        movapd %xmm3,%xmm6
-        mulpd  nb430_r(%rsp),%xmm6
-        addpd  %xmm1,%xmm6  ## vcoul+fijC*r
-
-    addpd  %xmm11,%xmm3 ## fijC+fijD+fijR
-
-    ## increment vctot
-        addpd  nb430_vctot(%rsp),%xmm1
-    movapd %xmm1,nb430_vctot(%rsp)
-
-        ## negate: xmm6 = xmm7 = -(vcoul+fijC*r)
-        xorpd  %xmm7,%xmm7
-        subpd  %xmm6,%xmm7
-        movapd %xmm7,%xmm6
-
-    ## the fj's - start by combining forces from memory 
-    movq nb430_faction(%rbp),%rdi
-        movlpd (%rdi,%r10,8),%xmm0
-        movlpd 8(%rdi,%r10,8),%xmm1
-        movlpd 16(%rdi,%r10,8),%xmm2
-        movhpd (%rdi,%r11,8),%xmm0
-        movhpd 8(%rdi,%r11,8),%xmm1
-        movhpd 16(%rdi,%r11,8),%xmm2
-
-        ## update dvdasum 
-        addpd  nb430_dvdasum(%rsp),%xmm7
-    movapd %xmm7,nb430_dvdasum(%rsp)
-
-        ## update j atoms dvdaj
-        movhlps %xmm6,%xmm7              ## second j atom's value into low xmm7
-        addsd  (%rsi,%rax,8),%xmm6
-        addsd  (%rsi,%rbx,8),%xmm7
-        movsd  %xmm6,(%rsi,%rax,8)
-        movsd  %xmm7,(%rsi,%rbx,8)
-
-        xorpd  %xmm4,%xmm4
-        mulpd nb430_rinv(%rsp),%xmm3
-        subpd  %xmm3,%xmm4               ## xmm4 = -(fijC+fijD+fijR)*rinv
-
-    movapd  %xmm4,%xmm9
-    movapd  %xmm4,%xmm10
-    movapd  %xmm4,%xmm11
-
-    mulpd  nb430_dx(%rsp),%xmm9          ## scale dx,dy,dz by xmm4
-    mulpd  nb430_dy(%rsp),%xmm10
-    mulpd  nb430_dz(%rsp),%xmm11
-
-        addpd %xmm9,%xmm0                ## add to the j forces loaded above
-        addpd %xmm10,%xmm1
-        addpd %xmm11,%xmm2
-
-        ## accumulate i forces
-    addpd nb430_fix(%rsp),%xmm9
-    addpd nb430_fiy(%rsp),%xmm10
-    addpd nb430_fiz(%rsp),%xmm11
-
-        movlpd %xmm0,(%rdi,%r10,8)
-        movlpd %xmm1,8(%rdi,%r10,8)
-        movlpd %xmm2,16(%rdi,%r10,8)
-
-    movapd %xmm9,nb430_fix(%rsp)
-    movapd %xmm10,nb430_fiy(%rsp)
-    movapd %xmm11,nb430_fiz(%rsp)
-
-        movhpd %xmm0,(%rdi,%r11,8)
-        movhpd %xmm1,8(%rdi,%r11,8)
-        movhpd %xmm2,16(%rdi,%r11,8)
-
-    ## should we do one more iteration? 
-        subl $2,nb430_innerk(%rsp)
-        jl    _nb_kernel430_x86_64_sse2.nb430_checksingle   ## fewer than two j atoms left
-        jmp   _nb_kernel430_x86_64_sse2.nb430_unroll_loop
-_nb_kernel430_x86_64_sse2.nb430_checksingle: ## an odd innerk means one j atom is left over
-        movl  nb430_innerk(%rsp),%edx
-        andl  $1,%edx
-        jnz    _nb_kernel430_x86_64_sse2.nb430_dosingle
-        jmp    _nb_kernel430_x86_64_sse2.nb430_updateouterdata
-_nb_kernel430_x86_64_sse2.nb430_dosingle: 
-        movq nb430_charge(%rbp),%rsi     ## NOTE(review): rsi, rdx and rdi set here are overwritten before they are read in this path
-        movq nb430_invsqrta(%rbp),%rdx
-        movq nb430_pos(%rbp),%rdi
-        movq  nb430_innerjjnr(%rsp),%rcx
-        movl  (%rcx),%eax                ## eax = index of the remaining j atom
-
-        ## load isaj
-        movq nb430_invsqrta(%rbp),%rsi
-        movsd (%rsi,%rax,8),%xmm2
-        mulsd  nb430_isai(%rsp),%xmm2    ## isaprod = isai*isaj
-        movapd %xmm2,nb430_isaprod(%rsp)
-        movapd %xmm2,%xmm1
-        mulsd nb430_gbtsc(%rsp),%xmm1    ## gbscale = isaprod*gbtabscale
-        movapd %xmm1,nb430_gbscale(%rsp)
-
-    mulsd nb430_iq(%rsp),%xmm2           ## xmm2 = isaprod*iq
-        movq nb430_charge(%rbp),%rsi     ## base of charge[] 
-        movsd (%rsi,%rax,8),%xmm3
-        mulsd  %xmm2,%xmm3               ## qq = isaprod*iq*charge[j]
-        movapd %xmm3,nb430_qq(%rsp)
-
-        movq nb430_type(%rbp),%rsi
-        movl (%rsi,%rax,4),%r8d
-        movq nb430_vdwparam(%rbp),%rsi
-        shll %r8d
-        movl nb430_ntia(%rsp),%edi
-        addl %edi,%r8d                   ## r8 = ntia + 2*type[j]
-
-        movsd (%rsi,%r8,8),%xmm4
-        movsd 8(%rsi,%r8,8),%xmm6
-        movapd %xmm4,nb430_c6(%rsp)
-        movapd %xmm6,nb430_c12(%rsp)
-
-        movq nb430_pos(%rbp),%rsi               ## base of pos[] 
-
-        lea  (%rax,%rax,2),%r10     ## j3 
-
-        ## move coordinate to xmm4-xmm6 
-        movsd (%rsi,%r10,8),%xmm4
-        movsd 8(%rsi,%r10,8),%xmm5
-        movsd 16(%rsi,%r10,8),%xmm6
-
-        movq   nb430_faction(%rbp),%rdi
-
-        ## calc dr 
-        subsd nb430_ix(%rsp),%xmm4
-        subsd nb430_iy(%rsp),%xmm5
-        subsd nb430_iz(%rsp),%xmm6
-
-        ## store dr 
-        movapd %xmm4,nb430_dx(%rsp)
-        movapd %xmm5,nb430_dy(%rsp)
-        movapd %xmm6,nb430_dz(%rsp)
-
-        ## square it 
-        mulsd %xmm4,%xmm4
-        mulsd %xmm5,%xmm5
-        mulsd %xmm6,%xmm6
-        addsd %xmm5,%xmm4
-        addsd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        cvtsd2ss %xmm4,%xmm5
-        rsqrtss %xmm5,%xmm5
-        cvtss2sd %xmm5,%xmm2    ## lu in low xmm2 
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulsd %xmm2,%xmm2       ## lu*lu 
-        movapd nb430_three(%rsp),%xmm1
-        mulsd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb430_half(%rsp),%xmm0
-        subsd %xmm2,%xmm1       ## 3.0-rsq*lu*lu 
-        mulsd %xmm5,%xmm1
-        mulsd %xmm0,%xmm1       ## xmm1=iter1 of rinv (new lu) 
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulsd %xmm1,%xmm1       ## lu*lu 
-        movapd nb430_three(%rsp),%xmm2
-        mulsd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb430_half(%rsp),%xmm0
-        subsd %xmm1,%xmm2       ## 3.0-rsq*lu*lu 
-        mulsd %xmm5,%xmm2
-        mulsd %xmm2,%xmm0       ## xmm0=iter2 of rinv 
-        mulsd %xmm0,%xmm4       ## xmm4=r (rsq*rinv) 
-        movapd %xmm4,nb430_r(%rsp)
-        movapd %xmm0,nb430_rinv(%rsp)
-
-    movapd %xmm4,%xmm8 ## r
-        mulsd nb430_gbscale(%rsp),%xmm4  ## r*gbscale
-        mulsd nb430_tsc(%rsp),%xmm8      ## r*tabscale
-
-    ## truncate and convert to integers
-    cvttsd2si %xmm4,%r12d ## gb
-    cvttsd2si %xmm8,%r14d ## lj
-
-    ## convert back to double
-    cvtsi2sd  %r12d,%xmm6  ## gb
-    cvtsi2sd  %r14d,%xmm10 ## lj
-
-    ## multiply by 4 and 8, respectively
-    shll   $2,%r12d  ## gb
-    shll   $3,%r14d  ## lj
-
-    ## GB index: r12   LJ index: r14
-
-    ## calculate eps
-    subsd     %xmm6,%xmm4  ## gb
-    subsd     %xmm10,%xmm8 ## lj
-    movapd    %xmm4,nb430_epsgb(%rsp)   ## gb eps
-    movapd    %xmm8,nb430_eps(%rsp)   ## lj eps
-
-        movq nb430_GBtab(%rbp),%rsi
-        movq nb430_VFtab(%rbp),%rdi
-
-    ## load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11
-    movapd (%rsi,%r12,8),%xmm0         ## Y1c F1c
-    movapd (%rdi,%r14,8),%xmm4         ## Y1d F1d
-    movapd 32(%rdi,%r14,8),%xmm8       ## Y1r F1r
-        movhlps %xmm0,%xmm1            ## F into the low lane of xmm1/xmm5/xmm9
-        movhlps %xmm4,%xmm5
-        movhlps %xmm8,%xmm9
-
-    movapd 16(%rsi,%r12,8),%xmm2       ## G1c H1c
-    movapd 16(%rdi,%r14,8),%xmm6       ## G1d H1d
-    movapd 48(%rdi,%r14,8),%xmm10      ## G1r H1r
-        movhlps %xmm2,%xmm3            ## H into the low lane of xmm3/xmm7/xmm11
-        movhlps %xmm6,%xmm7
-        movhlps %xmm10,%xmm11
-    ## table data ready. Coul GB in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11
-
-    movapd nb430_epsgb(%rsp),%xmm12
-    movapd nb430_eps(%rsp),%xmm13
-
-    mulsd  %xmm12,%xmm3  ## Heps
-    mulsd  %xmm13,%xmm7
-    mulsd  %xmm13,%xmm11
-    mulsd  %xmm12,%xmm2    ## Geps
-    mulsd  %xmm13,%xmm6
-    mulsd  %xmm13,%xmm10
-    mulsd  %xmm12,%xmm3  ## Heps2
-    mulsd  %xmm13,%xmm7
-    mulsd  %xmm13,%xmm11
-
-    addsd  %xmm2,%xmm1  ## F+Geps
-    addsd  %xmm6,%xmm5
-    addsd  %xmm10,%xmm9
-    addsd  %xmm3,%xmm1  ## F+Geps+Heps2 = Fp
-    addsd  %xmm7,%xmm5
-    addsd  %xmm11,%xmm9
-    addsd  %xmm3,%xmm3   ## 2*Heps2
-    addsd  %xmm7,%xmm7
-    addsd  %xmm11,%xmm11
-    addsd  %xmm2,%xmm3   ## 2*Heps2+Geps
-    addsd  %xmm6,%xmm7
-    addsd  %xmm10,%xmm11
-    addsd  %xmm1,%xmm3  ## FF = Fp + 2*Heps2 + Geps
-    addsd  %xmm5,%xmm7
-    addsd  %xmm9,%xmm11
-    mulsd  %xmm12,%xmm1  ## eps*Fp
-    mulsd  %xmm13,%xmm5
-    mulsd  %xmm13,%xmm9
-    addsd  %xmm0,%xmm1    ## VV
-    addsd  %xmm4,%xmm5
-    addsd  %xmm8,%xmm9
-    mulsd  nb430_qq(%rsp),%xmm1     ## VV*qq = vcoul
-    mulsd  nb430_c6(%rsp),%xmm5     ## vnb6
-    mulsd  nb430_c12(%rsp),%xmm9     ## vnb12
-    mulsd  nb430_qq(%rsp),%xmm3      ## FF*qq = fij
-    mulsd  nb430_c6(%rsp),%xmm7     ## fijD
-    mulsd  nb430_c12(%rsp),%xmm11     ##fijR
-
-    addsd  %xmm7,%xmm11 ## fijD+fijR
-    mulsd  nb430_tsc(%rsp),%xmm11   ## (fijD+fijR)*tabscale
-
-    ## accumulate Vvdwtot
-    addsd  nb430_Vvdwtot(%rsp),%xmm5
-    addsd  %xmm9,%xmm5
-    movsd %xmm5,nb430_Vvdwtot(%rsp)
-
-        movq nb430_dvda(%rbp),%rsi
-
-        ## Calculate dVda
-        mulsd nb430_gbscale(%rsp),%xmm3     ## fijC=qq*FF*gbscale
-        movapd %xmm3,%xmm6
-        mulsd  nb430_r(%rsp),%xmm6
-        addsd  %xmm1,%xmm6  ## vcoul+fijC*r
-
-    addsd  %xmm11,%xmm3 ## fijC+fijD+fijR
-
-    ## increment vctot
-        addsd  nb430_vctot(%rsp),%xmm1
-    movsd %xmm1,nb430_vctot(%rsp)
-
-        ## negate: xmm6 = xmm7 = -(vcoul+fijC*r)
-        xorpd  %xmm7,%xmm7
-        subsd  %xmm6,%xmm7
-        movapd %xmm7,%xmm6
-
-        ## update dvdasum 
-        addsd  nb430_dvdasum(%rsp),%xmm7
-    movsd %xmm7,nb430_dvdasum(%rsp)
-
-        ## update j atoms dvdaj
-        addsd  (%rsi,%rax,8),%xmm6
-        movsd  %xmm6,(%rsi,%rax,8)
-
-        xorpd  %xmm4,%xmm4
-        mulsd nb430_rinv(%rsp),%xmm3
-        subsd  %xmm3,%xmm4               ## xmm4 = -(fijC+fijD+fijR)*rinv
-
-    movapd  %xmm4,%xmm9
-    movapd  %xmm4,%xmm10
-    movapd  %xmm4,%xmm11
-
-    mulsd  nb430_dx(%rsp),%xmm9          ## scale dx,dy,dz by xmm4
-    mulsd  nb430_dy(%rsp),%xmm10
-    mulsd  nb430_dz(%rsp),%xmm11
-
-    movapd %xmm9,%xmm3                   ## keep copies for the j-force update below
-    movapd %xmm10,%xmm4
-    movapd %xmm11,%xmm5
-
-        ## accumulate i forces
-    addsd nb430_fix(%rsp),%xmm9
-    addsd nb430_fiy(%rsp),%xmm10
-    addsd nb430_fiz(%rsp),%xmm11
-    movsd %xmm9,nb430_fix(%rsp)
-    movsd %xmm10,nb430_fiy(%rsp)
-    movsd %xmm11,nb430_fiz(%rsp)
-
-    movq nb430_faction(%rbp),%rdi
-        ## the fj's - start by accumulating forces from memory 
-        addsd (%rdi,%r10,8),%xmm3
-        addsd 8(%rdi,%r10,8),%xmm4
-        addsd 16(%rdi,%r10,8),%xmm5
-        movsd %xmm3,(%rdi,%r10,8)
-        movsd %xmm4,8(%rdi,%r10,8)
-        movsd %xmm5,16(%rdi,%r10,8)
-
-_nb_kernel430_x86_64_sse2.nb430_updateouterdata: ## write the per-i-atom accumulators back to memory
-        movl  nb430_ii3(%rsp),%ecx
-        movq  nb430_faction(%rbp),%rdi
-        movq  nb430_fshift(%rbp),%rsi
-        movl  nb430_is3(%rsp),%edx
-
-        ## accumulate i forces in xmm0, xmm1, xmm2 
-        movapd nb430_fix(%rsp),%xmm0
-        movapd nb430_fiy(%rsp),%xmm1
-        movapd nb430_fiz(%rsp),%xmm2
-
-        movhlps %xmm0,%xmm3
-        movhlps %xmm1,%xmm4
-        movhlps %xmm2,%xmm5
-        addsd  %xmm3,%xmm0
-        addsd  %xmm4,%xmm1
-        addsd  %xmm5,%xmm2 ## sum is in low xmm0-xmm2 
-
-        ## update i force: subtract the accumulated sum from faction[ii3..ii3+2] 
-        movsd  (%rdi,%rcx,8),%xmm3
-        movsd  8(%rdi,%rcx,8),%xmm4
-        movsd  16(%rdi,%rcx,8),%xmm5
-        subsd  %xmm0,%xmm3
-        subsd  %xmm1,%xmm4
-        subsd  %xmm2,%xmm5
-        movsd  %xmm3,(%rdi,%rcx,8)
-        movsd  %xmm4,8(%rdi,%rcx,8)
-        movsd  %xmm5,16(%rdi,%rcx,8)
-
-        ## update fshift force: subtract the same sum from fshift[is3..is3+2]  
-        movsd  (%rsi,%rdx,8),%xmm3
-        movsd  8(%rsi,%rdx,8),%xmm4
-        movsd  16(%rsi,%rdx,8),%xmm5
-        subsd  %xmm0,%xmm3
-        subsd  %xmm1,%xmm4
-        subsd  %xmm2,%xmm5
-        movsd  %xmm3,(%rsi,%rdx,8)
-        movsd  %xmm4,8(%rsi,%rdx,8)
-        movsd  %xmm5,16(%rsi,%rdx,8)
-
-        ## get n from stack
-        movl nb430_n(%rsp),%esi
-        ## get group index for i particle 
-        movq  nb430_gid(%rbp),%rdx              ## base of gid[]
-        movl  (%rdx,%rsi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total potential energy and update it 
-        movapd nb430_vctot(%rsp),%xmm7
-        ## add the high lane to the low lane 
-        movhlps %xmm7,%xmm6
-        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now 
-
-        ## add earlier value from mem 
-        movq  nb430_Vc(%rbp),%rax
-        addsd (%rax,%rdx,8),%xmm7
-        ## move back to mem 
-        movsd %xmm7,(%rax,%rdx,8)
-
-        ## accumulate total lj energy and update it 
-        movapd nb430_Vvdwtot(%rsp),%xmm7
-        ## add the high lane to the low lane 
-        movhlps %xmm7,%xmm6
-        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now 
-
-        ## add earlier value from mem 
-        movq  nb430_Vvdw(%rbp),%rax
-        addsd (%rax,%rdx,8),%xmm7
-        ## move back to mem 
-        movsd %xmm7,(%rax,%rdx,8)
-
-        ## accumulate dVda and update it 
-        movapd nb430_dvdasum(%rsp),%xmm7
-        ## add the high lane to the low lane 
-        movhlps %xmm7,%xmm6
-        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now 
-
-        movl nb430_ii(%rsp),%edx
-        movq nb430_dvda(%rbp),%rax
-        addsd (%rax,%rdx,8),%xmm7               ## dvda[ii] += dvdasum
-        movsd %xmm7,(%rax,%rdx,8)
-
-        ## finish if last 
-        movl nb430_nn1(%rsp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx                          ## nn1-(n+1), zero when this work unit is done
-        jz _nb_kernel430_x86_64_sse2.nb430_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb430_n(%rsp)
-        jmp _nb_kernel430_x86_64_sse2.nb430_outer
-_nb_kernel430_x86_64_sse2.nb430_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb430_nri(%rsp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx                         ## nri-n, zero when every list has been processed
-        jz _nb_kernel430_x86_64_sse2.nb430_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel430_x86_64_sse2.nb430_threadloop
-_nb_kernel430_x86_64_sse2.nb430_end: 
-        movl nb430_nouter(%rsp),%eax
-        movl nb430_ninner(%rsp),%ebx
-        movq nb430_outeriter(%rbp),%rcx
-        movq nb430_inneriter(%rbp),%rdx
-        movl %eax,(%rcx)                        ## *outeriter = nouter
-        movl %ebx,(%rdx)                        ## *inneriter = ninner
-
-        addq $536,%rsp                          ## release the local variable space
-        emms                                    ## clear MMX state again before returning
-
-
-        pop %r15                                ## restore the saved registers in reverse push order
-        pop %r14
-        pop %r13
-        pop %r12
-
-        pop %rbx
-        pop    %rbp
-        ret
-
-
-
-
-
-
-.globl nb_kernel430nf_x86_64_sse2
-.globl _nb_kernel430nf_x86_64_sse2
-nb_kernel430nf_x86_64_sse2:     
-_nb_kernel430nf_x86_64_sse2:    
-##      Room for return address and rbp (16 bytes)
-.set nb430nf_fshift, 16
-.set nb430nf_gid, 24
-.set nb430nf_pos, 32
-.set nb430nf_faction, 40
-.set nb430nf_charge, 48
-.set nb430nf_p_facel, 56
-.set nb430nf_argkrf, 64
-.set nb430nf_argcrf, 72
-.set nb430nf_Vc, 80
-.set nb430nf_type, 88
-.set nb430nf_p_ntype, 96
-.set nb430nf_vdwparam, 104
-.set nb430nf_Vvdw, 112
-.set nb430nf_p_tabscale, 120
-.set nb430nf_VFtab, 128
-.set nb430nf_invsqrta, 136
-.set nb430nf_dvda, 144
-.set nb430nf_p_gbtabscale, 152
-.set nb430nf_GBtab, 160
-.set nb430nf_p_nthreads, 168
-.set nb430nf_count, 176
-.set nb430nf_mtx, 184
-.set nb430nf_outeriter, 192
-.set nb430nf_inneriter, 200
-.set nb430nf_work, 208
-        ## stack offsets for local variables  
-        ## bottom of stack is cache-aligned for sse2 use 
-.set nb430nf_ix, 0
-.set nb430nf_iy, 16
-.set nb430nf_iz, 32
-.set nb430nf_iq, 48
-.set nb430nf_gbtsc, 64
-.set nb430nf_tsc, 80
-.set nb430nf_qq, 96
-.set nb430nf_c6, 112
-.set nb430nf_c12, 128
-.set nb430nf_vctot, 144
-.set nb430nf_Vvdwtot, 160
-.set nb430nf_half, 176
-.set nb430nf_three, 192
-.set nb430nf_r, 208
-.set nb430nf_isai, 224
-.set nb430nf_isaprod, 240
-.set nb430nf_gbscale, 256
-.set nb430nf_nri, 272
-.set nb430nf_iinr, 280
-.set nb430nf_jindex, 288
-.set nb430nf_jjnr, 296
-.set nb430nf_shift, 304
-.set nb430nf_shiftvec, 312
-.set nb430nf_facel, 320
-.set nb430nf_innerjjnr, 328
-.set nb430nf_is3, 336
-.set nb430nf_ii3, 340
-.set nb430nf_ntia, 344
-.set nb430nf_innerk, 348
-.set nb430nf_n, 352
-.set nb430nf_nn1, 356
-.set nb430nf_ntype, 360
-.set nb430nf_nouter, 364
-.set nb430nf_ninner, 368
-        push %rbp
-        movq %rsp,%rbp
-        push %rbx
-
-        emms
-
-        push %r12
-        push %r13
-        push %r14
-        push %r15
-
-        subq $392,%rsp          ## local variable stack space (n*16+8)
-
-        ## zero 32-bit iteration counters
-        movl $0,%eax
-        movl %eax,nb430nf_nouter(%rsp)
-        movl %eax,nb430nf_ninner(%rsp)
-
-        movl (%rdi),%edi
-        movl %edi,nb430nf_nri(%rsp)
-        movq %rsi,nb430nf_iinr(%rsp)
-        movq %rdx,nb430nf_jindex(%rsp)
-        movq %rcx,nb430nf_jjnr(%rsp)
-        movq %r8,nb430nf_shift(%rsp)
-        movq %r9,nb430nf_shiftvec(%rsp)
-        movq nb430nf_p_ntype(%rbp),%rdi
-        movl (%rdi),%edi
-        movl %edi,nb430nf_ntype(%rsp)
-        movq nb430nf_p_facel(%rbp),%rsi
-        movsd (%rsi),%xmm0
-        movsd %xmm0,nb430nf_facel(%rsp)
-
-        movq nb430nf_p_tabscale(%rbp),%rax
-        movsd (%rax),%xmm3
-        shufpd $0,%xmm3,%xmm3
-        movapd %xmm3,nb430nf_tsc(%rsp)
-
-        movq nb430nf_p_gbtabscale(%rbp),%rbx
-        movsd (%rbx),%xmm4
-        shufpd $0,%xmm4,%xmm4
-        movapd %xmm4,nb430nf_gbtsc(%rsp)
-
-        ## create constant floating-point factors on stack
-        movl $0x00000000,%eax   ## lower half of double half IEEE (hex)
-        movl $0x3fe00000,%ebx
-        movl %eax,nb430nf_half(%rsp)
-        movl %ebx,nb430nf_half+4(%rsp)
-        movsd nb430nf_half(%rsp),%xmm1
-        shufpd $0,%xmm1,%xmm1  ## splat to all elements
-        movapd %xmm1,%xmm3
-        addpd  %xmm3,%xmm3      ## one
-        movapd %xmm3,%xmm2
-        addpd  %xmm2,%xmm2      ## two
-        addpd  %xmm2,%xmm3      ## three
-        movapd %xmm1,nb430nf_half(%rsp)
-        movapd %xmm3,nb430nf_three(%rsp)
-
-_nb_kernel430nf_x86_64_sse2.nb430nf_threadloop: 
-        movq  nb430nf_count(%rbp),%rsi            ## pointer to sync counter
-        movl  (%rsi),%eax
-_nb_kernel430nf_x86_64_sse2.nb430nf_spinlock: 
-        movl  %eax,%ebx                         ## ebx=*count=nn0
-        addl  $1,%ebx                          ## ebx=nn1=nn0+10
-        lock 
-        cmpxchgl %ebx,(%rsi)                    ## write nn1 to *counter,
-                                                ## if it hasnt changed.
-                                                ## or reread *counter to eax.
-        pause                                   ## -> better p4 performance
-        jnz _nb_kernel430nf_x86_64_sse2.nb430nf_spinlock
-
-        ## if(nn1>nri) nn1=nri
-        movl nb430nf_nri(%rsp),%ecx
-        movl %ecx,%edx
-        subl %ebx,%ecx
-        cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri
-        ## Cleared the spinlock if we got here.
-        ## eax contains nn0, ebx contains nn1.
-        movl %eax,nb430nf_n(%rsp)
-        movl %ebx,nb430nf_nn1(%rsp)
-        subl %eax,%ebx                          ## calc number of outer lists
-        movl %eax,%esi                          ## copy n to esi
-        jg  _nb_kernel430nf_x86_64_sse2.nb430nf_outerstart
-        jmp _nb_kernel430nf_x86_64_sse2.nb430nf_end
-
-_nb_kernel430nf_x86_64_sse2.nb430nf_outerstart: 
-        ## ebx contains number of outer iterations
-        addl nb430nf_nouter(%rsp),%ebx
-        movl %ebx,nb430nf_nouter(%rsp)
-
-_nb_kernel430nf_x86_64_sse2.nb430nf_outer: 
-        movq  nb430nf_shift(%rsp),%rax        ## rax = pointer into shift[] 
-        movl  (%rax,%rsi,4),%ebx        ## rbx=shift[n] 
-
-        lea  (%rbx,%rbx,2),%rbx    ## rbx=3*is 
-        movl  %ebx,nb430nf_is3(%rsp)            ## store is3 
-
-        movq  nb430nf_shiftvec(%rsp),%rax     ## rax = base of shiftvec[] 
-
-        movsd (%rax,%rbx,8),%xmm0
-        movsd 8(%rax,%rbx,8),%xmm1
-        movsd 16(%rax,%rbx,8),%xmm2
-
-        movq  nb430nf_iinr(%rsp),%rcx         ## rcx = pointer into iinr[]
-        movl  (%rcx,%rsi,4),%ebx    ## ebx =ii 
-
-        movq  nb430nf_charge(%rbp),%rdx
-        movsd (%rdx,%rbx,8),%xmm3
-        mulsd nb430nf_facel(%rsp),%xmm3
-        shufpd $0,%xmm3,%xmm3
-
-        movq  nb430nf_invsqrta(%rbp),%rdx       ## load invsqrta[ii]
-        movsd (%rdx,%rbx,8),%xmm4
-        shufpd $0,%xmm4,%xmm4
-
-        movq  nb430nf_type(%rbp),%rdx
-        movl  (%rdx,%rbx,4),%edx
-        imull nb430nf_ntype(%rsp),%edx
-        shll  %edx
-        movl  %edx,nb430nf_ntia(%rsp)
-
-        lea  (%rbx,%rbx,2),%rbx        ## rbx = 3*ii=ii3 
-        movq  nb430nf_pos(%rbp),%rax      ## rax = base of pos[]  
-
-        addsd (%rax,%rbx,8),%xmm0
-        addsd 8(%rax,%rbx,8),%xmm1
-        addsd 16(%rax,%rbx,8),%xmm2
-
-        movapd %xmm3,nb430nf_iq(%rsp)
-        movapd %xmm4,nb430nf_isai(%rsp)
-
-        shufpd $0,%xmm0,%xmm0
-        shufpd $0,%xmm1,%xmm1
-        shufpd $0,%xmm2,%xmm2
-
-        movapd %xmm0,nb430nf_ix(%rsp)
-        movapd %xmm1,nb430nf_iy(%rsp)
-        movapd %xmm2,nb430nf_iz(%rsp)
-
-        movl  %ebx,nb430nf_ii3(%rsp)
-
-        ## clear vctot
-        xorpd %xmm4,%xmm4
-        movapd %xmm4,nb430nf_vctot(%rsp)
-        movapd %xmm4,nb430nf_Vvdwtot(%rsp)
-
-        movq  nb430nf_jindex(%rsp),%rax
-        movl  (%rax,%rsi,4),%ecx             ## jindex[n] 
-        movl  4(%rax,%rsi,4),%edx            ## jindex[n+1] 
-        subl  %ecx,%edx              ## number of innerloop atoms 
-
-        movq  nb430nf_pos(%rbp),%rsi
-        movq  nb430nf_faction(%rbp),%rdi
-        movq  nb430nf_jjnr(%rsp),%rax
-        shll  $2,%ecx
-        addq  %rcx,%rax
-        movq  %rax,nb430nf_innerjjnr(%rsp)       ## pointer to jjnr[nj0] 
-        movl  %edx,%ecx
-        subl  $2,%edx
-        addl  nb430nf_ninner(%rsp),%ecx
-        movl  %ecx,nb430nf_ninner(%rsp)
-        addl  $0,%edx
-        movl  %edx,nb430nf_innerk(%rsp)      ## number of innerloop atoms 
-        jge   _nb_kernel430nf_x86_64_sse2.nb430nf_unroll_loop
-        jmp   _nb_kernel430nf_x86_64_sse2.nb430nf_checksingle
-_nb_kernel430nf_x86_64_sse2.nb430nf_unroll_loop: 
-        ## twice unrolled innerloop here 
-        movq  nb430nf_innerjjnr(%rsp),%rdx     ## pointer to jjnr[k] 
-        movl  (%rdx),%eax
-        movl  4(%rdx),%ebx
-        addq $8,nb430nf_innerjjnr(%rsp)                 ## advance pointer (unrolled 2) 
-
-        ## load isaj
-        movq nb430nf_invsqrta(%rbp),%rsi
-        movlpd (%rsi,%rax,8),%xmm2
-        movhpd (%rsi,%rbx,8),%xmm2
-        mulpd  nb430nf_isai(%rsp),%xmm2
-        movapd %xmm2,nb430nf_isaprod(%rsp)
-        movapd %xmm2,%xmm1
-        mulpd nb430nf_gbtsc(%rsp),%xmm1
-        movapd %xmm1,nb430nf_gbscale(%rsp)
-
-        movq nb430nf_charge(%rbp),%rsi     ## base of charge[] 
-        movlpd (%rsi,%rax,8),%xmm3
-        movhpd (%rsi,%rbx,8),%xmm3
-
-        mulpd nb430nf_iq(%rsp),%xmm2
-        mulpd  %xmm2,%xmm3
-        movapd %xmm3,nb430nf_qq(%rsp)
-
-        movq nb430nf_type(%rbp),%rsi
-        movl (%rsi,%rax,4),%ecx
-        movl (%rsi,%rbx,4),%edx
-        movq nb430nf_vdwparam(%rbp),%rsi
-        shll %ecx
-        shll %edx
-        movl nb430nf_ntia(%rsp),%edi
-        addl %edi,%ecx
-        addl %edi,%edx
-
-        movlpd (%rsi,%rcx,8),%xmm6      ## c6a
-        movlpd (%rsi,%rdx,8),%xmm7      ## c6b
-        movhpd 8(%rsi,%rcx,8),%xmm6     ## c6a c12a 
-        movhpd 8(%rsi,%rdx,8),%xmm7     ## c6b c12b 
-
-        movapd %xmm6,%xmm4
-        unpcklpd %xmm7,%xmm4
-        unpckhpd %xmm7,%xmm6
-
-        movapd %xmm4,nb430nf_c6(%rsp)
-        movapd %xmm6,nb430nf_c12(%rsp)
-
-        movq nb430nf_pos(%rbp),%rsi             ## base of pos[] 
-
-        lea  (%rax,%rax,2),%rax     ## replace jnr with j3 
-        lea  (%rbx,%rbx,2),%rbx
-
-        ## move two coordinates to xmm0-xmm2 
-        movlpd (%rsi,%rax,8),%xmm0
-        movlpd 8(%rsi,%rax,8),%xmm1
-        movlpd 16(%rsi,%rax,8),%xmm2
-        movhpd (%rsi,%rbx,8),%xmm0
-        movhpd 8(%rsi,%rbx,8),%xmm1
-        movhpd 16(%rsi,%rbx,8),%xmm2
-
-        movq   nb430nf_faction(%rbp),%rdi
-
-        ## move nb430nf_ix-iz to xmm4-xmm6 
-        movapd nb430nf_ix(%rsp),%xmm4
-        movapd nb430nf_iy(%rsp),%xmm5
-        movapd nb430nf_iz(%rsp),%xmm6
-
-        ## calc dr 
-        subpd %xmm0,%xmm4
-        subpd %xmm1,%xmm5
-        subpd %xmm2,%xmm6
-
-        ## square it 
-        mulpd %xmm4,%xmm4
-        mulpd %xmm5,%xmm5
-        mulpd %xmm6,%xmm6
-        addpd %xmm5,%xmm4
-        addpd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        cvtpd2ps %xmm4,%xmm5
-        rsqrtps %xmm5,%xmm5
-        cvtps2pd %xmm5,%xmm2    ## lu in low xmm2 
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulpd %xmm2,%xmm2       ## lu*lu 
-        movapd nb430nf_three(%rsp),%xmm1
-        mulpd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb430nf_half(%rsp),%xmm0
-        subpd %xmm2,%xmm1       ## 30-rsq*lu*lu 
-        mulpd %xmm5,%xmm1
-        mulpd %xmm0,%xmm1       ## xmm0=iter1 of rinv (new lu) 
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulpd %xmm1,%xmm1       ## lu*lu 
-        movapd nb430nf_three(%rsp),%xmm2
-        mulpd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb430nf_half(%rsp),%xmm0
-        subpd %xmm1,%xmm2       ## 30-rsq*lu*lu 
-        mulpd %xmm5,%xmm2
-        mulpd %xmm2,%xmm0       ## xmm0=iter2 of rinv 
-        mulpd %xmm0,%xmm4       ## xmm4=r 
-        movapd %xmm4,nb430nf_r(%rsp)
-        mulpd nb430nf_gbscale(%rsp),%xmm4
-
-        cvttpd2pi %xmm4,%mm6    ## mm6 = lu idx 
-        cvtpi2pd %mm6,%xmm5
-        subpd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-        movapd %xmm1,%xmm2
-        mulpd  %xmm2,%xmm2      ## xmm2=eps2 
-
-        pslld $2,%mm6           ## idx *= 4 
-
-        movq nb430nf_GBtab(%rbp),%rsi
-        movd %mm6,%ecx
-        psrlq $32,%mm6
-        movd %mm6,%edx          ## indices in eax/ebx 
-
-        ## Coulomb 
-        movapd (%rsi,%rcx,8),%xmm4      ## Y1 F1        
-        movapd (%rsi,%rdx,8),%xmm3      ## Y2 F2 
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 Y2 
-        unpckhpd %xmm3,%xmm5    ## F1 F2 
-
-        movapd 16(%rsi,%rcx,8),%xmm6    ## G1 H1        
-        movapd 16(%rsi,%rdx,8),%xmm3    ## G2 H2 
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 G2 
-        unpckhpd %xmm3,%xmm7    ## H1 H2 
-        ## coulomb table ready, in xmm4-xmm7            
-        mulpd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulpd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addpd  %xmm6,%xmm5
-        addpd  %xmm7,%xmm5      ## xmm5=Fp      
-        movapd nb430nf_qq(%rsp),%xmm3
-        mulpd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addpd  %xmm4,%xmm5 ## xmm5=VV 
-        mulpd  %xmm3,%xmm5 ## vcoul=qq*VV  
-        addpd  nb430nf_vctot(%rsp),%xmm5
-        movapd %xmm5,nb430nf_vctot(%rsp)
-
-        movapd nb430nf_r(%rsp),%xmm4
-        mulpd  nb430nf_tsc(%rsp),%xmm4
-        cvttpd2pi %xmm4,%mm6    ## mm6 = lu idx 
-        cvtpi2pd %mm6,%xmm5
-        subpd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-        movapd %xmm1,%xmm2
-        mulpd  %xmm2,%xmm2      ## xmm2=eps2 
-
-        pslld $3,%mm6           ## idx *= 8
-
-        movq nb430nf_VFtab(%rbp),%rsi
-
-        movd %mm6,%ecx
-        psrlq $32,%mm6
-        movd %mm6,%edx          ## indices in eax/ebx 
-
-        ## Dispersion 
-        movapd (%rsi,%rcx,8),%xmm4      ## Y1 F1        
-        movapd (%rsi,%rdx,8),%xmm3      ## Y2 F2 
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 Y2 
-        unpckhpd %xmm3,%xmm5    ## F1 F2 
-
-        movapd 16(%rsi,%rcx,8),%xmm6    ## G1 H1        
-        movapd 16(%rsi,%rdx,8),%xmm3    ## G2 H2 
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 G2 
-        unpckhpd %xmm3,%xmm7    ## H1 H2 
-        ## Dispersion table ready, in xmm4-xmm7                 
-        mulpd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulpd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addpd  %xmm6,%xmm5
-        addpd  %xmm7,%xmm5      ## xmm5=Fp      
-        mulpd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addpd  %xmm4,%xmm5 ## xmm5=VV 
-
-        mulpd  nb430nf_c6(%rsp),%xmm5    ## Vvdw6
-        addpd  nb430nf_Vvdwtot(%rsp),%xmm5
-        movapd %xmm5,nb430nf_Vvdwtot(%rsp)
-
-        ## Repulsion 
-        movapd 32(%rsi,%rcx,8),%xmm4    ## Y1 F1        
-        movapd 32(%rsi,%rdx,8),%xmm3    ## Y2 F2 
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 Y2 
-        unpckhpd %xmm3,%xmm5    ## F1 F2 
-
-        movapd 48(%rsi,%rcx,8),%xmm6    ## G1 H1        
-        movapd 48(%rsi,%rdx,8),%xmm3    ## G2 H2 
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 G2 
-        unpckhpd %xmm3,%xmm7    ## H1 H2 
-        ## Dispersion table ready, in xmm4-xmm7                 
-        mulpd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulpd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addpd  %xmm6,%xmm5
-        addpd  %xmm7,%xmm5      ## xmm5=Fp      
-        mulpd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addpd  %xmm4,%xmm5 ## xmm5=VV 
-
-        mulpd  nb430nf_c12(%rsp),%xmm5   ## Vvdw12 
-        addpd  nb430nf_Vvdwtot(%rsp),%xmm5
-        movapd %xmm5,nb430nf_Vvdwtot(%rsp)
-        xorpd  %xmm4,%xmm4
-
-        ## should we do one more iteration? 
-        subl $2,nb430nf_innerk(%rsp)
-        jl    _nb_kernel430nf_x86_64_sse2.nb430nf_checksingle
-        jmp   _nb_kernel430nf_x86_64_sse2.nb430nf_unroll_loop
-_nb_kernel430nf_x86_64_sse2.nb430nf_checksingle: 
-        movl  nb430nf_innerk(%rsp),%edx
-        andl  $1,%edx
-        jnz    _nb_kernel430nf_x86_64_sse2.nb430nf_dosingle
-        jmp    _nb_kernel430nf_x86_64_sse2.nb430nf_updateouterdata
-_nb_kernel430nf_x86_64_sse2.nb430nf_dosingle: 
-        movq nb430nf_charge(%rbp),%rsi
-        movq nb430nf_invsqrta(%rbp),%rdx
-        movq nb430nf_pos(%rbp),%rdi
-        movq  nb430nf_innerjjnr(%rsp),%rcx
-        movl  (%rcx),%eax
-
-        xorpd  %xmm6,%xmm6
-        movapd %xmm6,%xmm7
-        movsd  (%rdx,%rax,8),%xmm7
-        movlpd (%rsi,%rax,8),%xmm6      ## xmm6(0) has the charge
-        mulsd  nb430nf_isai(%rsp),%xmm7
-        movapd %xmm7,nb430nf_isaprod(%rsp)
-        movapd %xmm7,%xmm1
-        mulpd nb430nf_gbtsc(%rsp),%xmm1
-        movapd %xmm1,nb430nf_gbscale(%rsp)
-
-        mulsd  nb430nf_iq(%rsp),%xmm7
-        mulsd  %xmm7,%xmm6
-        movapd %xmm6,nb430nf_qq(%rsp)
-
-        movq nb430nf_type(%rbp),%rsi
-        movl (%rsi,%rax,4),%edx
-        movq nb430nf_vdwparam(%rbp),%rsi
-        shll %edx
-        movl nb430nf_ntia(%rsp),%edi
-        addl %edi,%edx
-
-        movlpd (%rsi,%rdx,8),%xmm6      ## c6a
-        movhpd 8(%rsi,%rdx,8),%xmm6     ## c6a c12a 
-
-        xorpd %xmm7,%xmm7
-        movapd %xmm6,%xmm4
-        unpcklpd %xmm7,%xmm4
-        unpckhpd %xmm7,%xmm6
-
-        movapd %xmm4,nb430nf_c6(%rsp)
-        movapd %xmm6,nb430nf_c12(%rsp)
-
-        movq nb430nf_pos(%rbp),%rsi             ## base of pos[] 
-
-        lea  (%rax,%rax,2),%rax     ## replace jnr with j3 
-
-        ## move two coordinates to xmm0-xmm2 
-        movlpd (%rsi,%rax,8),%xmm0
-        movlpd 8(%rsi,%rax,8),%xmm1
-        movlpd 16(%rsi,%rax,8),%xmm2
-
-        movq   nb430nf_faction(%rbp),%rdi
-
-        ## move nb430nf_ix-iz to xmm4-xmm6 
-        movapd nb430nf_ix(%rsp),%xmm4
-        movapd nb430nf_iy(%rsp),%xmm5
-        movapd nb430nf_iz(%rsp),%xmm6
-
-        ## calc dr 
-        subsd %xmm0,%xmm4
-        subsd %xmm1,%xmm5
-        subsd %xmm2,%xmm6
-
-        ## square it 
-        mulsd %xmm4,%xmm4
-        mulsd %xmm5,%xmm5
-        mulsd %xmm6,%xmm6
-        addsd %xmm5,%xmm4
-        addsd %xmm6,%xmm4
-        ## rsq in xmm4 
-
-        cvtsd2ss %xmm4,%xmm5
-        rsqrtss %xmm5,%xmm5
-        cvtss2sd %xmm5,%xmm2    ## lu in low xmm2 
-
-        ## lookup seed in xmm2 
-        movapd %xmm2,%xmm5      ## copy of lu 
-        mulsd %xmm2,%xmm2       ## lu*lu 
-        movapd nb430nf_three(%rsp),%xmm1
-        mulsd %xmm4,%xmm2       ## rsq*lu*lu                    
-        movapd nb430nf_half(%rsp),%xmm0
-        subsd %xmm2,%xmm1       ## 30-rsq*lu*lu 
-        mulsd %xmm5,%xmm1
-        mulsd %xmm0,%xmm1       ## xmm0=iter1 of rinv (new lu) 
-
-        movapd %xmm1,%xmm5      ## copy of lu 
-        mulsd %xmm1,%xmm1       ## lu*lu 
-        movapd nb430nf_three(%rsp),%xmm2
-        mulsd %xmm4,%xmm1       ## rsq*lu*lu                    
-        movapd nb430nf_half(%rsp),%xmm0
-        subsd %xmm1,%xmm2       ## 30-rsq*lu*lu 
-        mulsd %xmm5,%xmm2
-        mulsd %xmm2,%xmm0       ## xmm0=iter2 of rinv (new lu) 
-        mulsd %xmm0,%xmm4       ## xmm4=r 
-        movsd %xmm4,nb430nf_r(%rsp)
-        mulsd nb430nf_gbscale(%rsp),%xmm4
-
-        cvttsd2si %xmm4,%edx    ## mm6 = lu idx 
-        cvtsi2sd %edx,%xmm5
-        subsd %xmm5,%xmm4
-        movapd %xmm4,%xmm1      ## xmm1=eps 
-        movapd %xmm1,%xmm2
-        mulsd  %xmm2,%xmm2      ## xmm2=eps2 
-
-        shll $2,%edx            ## idx *= 4 
-        movq nb430nf_GBtab(%rbp),%rsi
-
-        ## Coulomb 
-        movapd (%rsi,%rdx,8),%xmm4      ## Y1 F1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 
-        unpckhpd %xmm3,%xmm5    ## F1 
-
-        movapd 16(%rsi,%rdx,8),%xmm6    ## G1 H1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 
-        unpckhpd %xmm3,%xmm7    ## H1 
-        ## coulomb table ready, in xmm4-xmm7            
-        mulsd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulsd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addsd  %xmm6,%xmm5
-        addsd  %xmm7,%xmm5      ## xmm5=Fp      
-        movapd nb430nf_qq(%rsp),%xmm3
-        mulsd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addsd  %xmm4,%xmm5 ## xmm5=VV 
-        mulsd  %xmm3,%xmm5 ## vcoul=qq*VV  
-        addsd  nb430nf_vctot(%rsp),%xmm5
-        movsd %xmm5,nb430nf_vctot(%rsp)
-
-        movsd nb430nf_r(%rsp),%xmm4
-        mulsd  nb430nf_tsc(%rsp),%xmm4
-        cvttsd2si %xmm4,%edx    ## mm6 = lu idx 
-        cvtsi2sd %edx,%xmm5
-        subsd %xmm5,%xmm4
-        movsd %xmm4,%xmm1       ## xmm1=eps 
-        movsd %xmm1,%xmm2
-        mulsd  %xmm2,%xmm2      ## xmm2=eps2
-
-        shll $3,%edx
-
-        movq nb430nf_VFtab(%rbp),%rsi
-
-        ## Dispersion 
-        movapd (%rsi,%rdx,8),%xmm4      ## Y1 F1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 
-        unpckhpd %xmm3,%xmm5    ## F1 
-
-        movapd 16(%rsi,%rdx,8),%xmm6    ## G1 H1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 
-        unpckhpd %xmm3,%xmm7    ## H1 
-        ## Dispersion table ready, in xmm4-xmm7                 
-        mulsd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulsd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addsd  %xmm6,%xmm5
-        addsd  %xmm7,%xmm5      ## xmm5=Fp
-        mulsd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addsd  %xmm4,%xmm5 ## xmm5=VV 
-
-        mulsd  nb430nf_c6(%rsp),%xmm5    ## Vvdw6
-        addsd  nb430nf_Vvdwtot(%rsp),%xmm5
-        movlpd %xmm5,nb430nf_Vvdwtot(%rsp)
-
-        ## Repulsion 
-        movapd 32(%rsi,%rdx,8),%xmm4    ## Y1 F1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm4,%xmm5
-        unpcklpd %xmm3,%xmm4    ## Y1 
-        unpckhpd %xmm3,%xmm5    ## F1 
-
-        movapd 48(%rsi,%rdx,8),%xmm6    ## G1 H1        
-        xorpd %xmm3,%xmm3
-        movapd %xmm6,%xmm7
-        unpcklpd %xmm3,%xmm6    ## G1 
-        unpckhpd %xmm3,%xmm7    ## H1 
-        ## Dispersion table ready, in xmm4-xmm7                 
-        mulsd  %xmm1,%xmm6      ## xmm6=Geps 
-        mulsd  %xmm2,%xmm7      ## xmm7=Heps2 
-        addsd  %xmm6,%xmm5
-        addsd  %xmm7,%xmm5      ## xmm5=Fp      
-        mulsd  %xmm1,%xmm5 ## xmm5=eps*Fp 
-        addsd  %xmm4,%xmm5 ## xmm5=VV 
-        mulsd  nb430nf_c12(%rsp),%xmm5   ## Vvdw12 
-        addsd  nb430nf_Vvdwtot(%rsp),%xmm5
-        movlpd %xmm5,nb430nf_Vvdwtot(%rsp)
-_nb_kernel430nf_x86_64_sse2.nb430nf_updateouterdata: 
-        ## get n from stack
-        movl nb430nf_n(%rsp),%esi
-        ## get group index for i particle 
-        movq  nb430nf_gid(%rbp),%rdx            ## base of gid[]
-        movl  (%rdx,%rsi,4),%edx                ## ggid=gid[n]
-
-        ## accumulate total potential energy and update it 
-        movapd nb430nf_vctot(%rsp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now 
-
-        ## add earlier value from mem 
-        movq  nb430nf_Vc(%rbp),%rax
-        addsd (%rax,%rdx,8),%xmm7
-        ## move back to mem 
-        movsd %xmm7,(%rax,%rdx,8)
-
-        ## accumulate total lj energy and update it 
-        movapd nb430nf_Vvdwtot(%rsp),%xmm7
-        ## accumulate 
-        movhlps %xmm7,%xmm6
-        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now 
-
-        ## add earlier value from mem 
-        movq  nb430nf_Vvdw(%rbp),%rax
-        addsd (%rax,%rdx,8),%xmm7
-        ## move back to mem 
-        movsd %xmm7,(%rax,%rdx,8)
-
-        ## finish if last 
-        movl nb430nf_nn1(%rsp),%ecx
-        ## esi already loaded with n
-        incl %esi
-        subl %esi,%ecx
-        jz _nb_kernel430nf_x86_64_sse2.nb430nf_outerend
-
-        ## not last, iterate outer loop once more!  
-        movl %esi,nb430nf_n(%rsp)
-        jmp _nb_kernel430nf_x86_64_sse2.nb430nf_outer
-_nb_kernel430nf_x86_64_sse2.nb430nf_outerend: 
-        ## check if more outer neighborlists remain
-        movl  nb430nf_nri(%rsp),%ecx
-        ## esi already loaded with n above
-        subl  %esi,%ecx
-        jz _nb_kernel430nf_x86_64_sse2.nb430nf_end
-        ## non-zero, do one more workunit
-        jmp   _nb_kernel430nf_x86_64_sse2.nb430nf_threadloop
-_nb_kernel430nf_x86_64_sse2.nb430nf_end: 
-        movl nb430nf_nouter(%rsp),%eax
-        movl nb430nf_ninner(%rsp),%ebx
-        movq nb430nf_outeriter(%rbp),%rcx
-        movq nb430nf_inneriter(%rbp),%rdx
-        movl %eax,(%rcx)
-        movl %ebx,(%rdx)
-
-        addq $392,%rsp
-        emms
-
-
-        pop %r15
-        pop %r14
-        pop %r13
-        pop %r12
-
-        pop %rbx
-        pop    %rbp
-        ret
-
-
-
-
diff --git a/src/tools/CMakeLists.txt b/src/tools/CMakeLists.txt
index 92c4360e7d..2c7b1f0069 100644
--- a/src/tools/CMakeLists.txt
+++ b/src/tools/CMakeLists.txt
@@ -32,7 +32,7 @@ add_library(gmxana
             gmx_membed.c    )
 
 
-target_link_libraries(gmxana gmx)
+target_link_libraries(gmxana md gmx)
 set_target_properties(gmxana PROPERTIES OUTPUT_NAME "gmxana${GMX_BINARY_SUFFIX}")
 
 # List of programs with single corresponding .c source file,
-- 
2.11.4.GIT