From 1d7858f64201aefb6dcfac21a292758118f14eb3 Mon Sep 17 00:00:00 2001 From: Erik Lindahl Date: Wed, 11 Aug 2010 01:07:19 +0200 Subject: [PATCH] Changes to make Cmake work under windows again --- CMakeLists.txt | 53 +- cmake/FindFFTW3.cmake | 4 +- cmake/FindFFTW3F.cmake | 44 +- include/types/simple.h | 39 +- src/config.h.cmakein | 3 - src/gmxlib/CMakeLists.txt | 58 +- .../nonbonded/nb_kernel_ia32_sse/Makefile.am | 52 +- .../nb_kernel400_ia32_sse.intel_syntax.s | 1728 -------------- .../nb_kernel_ia32_sse/nb_kernel400_ia32_sse.s | 1701 -------------- .../nb_kernel410_ia32_sse.intel_syntax.s | 2049 ----------------- .../nb_kernel_ia32_sse/nb_kernel410_ia32_sse.s | 2022 ---------------- .../nb_kernel430_ia32_sse.intel_syntax.s | 2409 -------------------- .../nb_kernel_ia32_sse/nb_kernel430_ia32_sse.s | 2382 ------------------- .../nonbonded/nb_kernel_ia32_sse2/Makefile.am | 52 +- .../nb_kernel400_ia32_sse2.intel_syntax.s | 1287 ----------- .../nb_kernel_ia32_sse2/nb_kernel400_ia32_sse2.s | 1261 ---------- .../nb_kernel410_ia32_sse2.intel_syntax.s | 1530 ------------- .../nb_kernel_ia32_sse2/nb_kernel410_ia32_sse2.s | 1503 ------------ .../nb_kernel430_ia32_sse2.intel_syntax.s | 1714 -------------- .../nb_kernel_ia32_sse2/nb_kernel430_ia32_sse2.s | 1688 -------------- .../nonbonded/nb_kernel_x86_64_sse/Makefile.am | 52 +- .../nb_kernel400_x86_64_sse.intel_syntax.s | 1662 -------------- .../nb_kernel_x86_64_sse/nb_kernel400_x86_64_sse.s | 1638 ------------- .../nb_kernel410_x86_64_sse.intel_syntax.s | 2009 ---------------- .../nb_kernel_x86_64_sse/nb_kernel410_x86_64_sse.s | 1985 ---------------- .../nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.c | 111 +- .../nb_kernel430_x86_64_sse.intel_syntax.s | 2330 ------------------- .../nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.s | 2306 ------------------- .../nonbonded/nb_kernel_x86_64_sse2/Makefile.am | 52 +- .../nb_kernel400_x86_64_sse2.intel_syntax.s | 1236 ---------- .../nb_kernel400_x86_64_sse2.s | 1212 ---------- .../nb_kernel410_x86_64_sse2.intel_syntax.s | 1488 ------------ .../nb_kernel410_x86_64_sse2.s | 1464 ------------ .../nb_kernel430_x86_64_sse2.intel_syntax.s | 1664 -------------- .../nb_kernel430_x86_64_sse2.s | 1640 ------------- src/tools/CMakeLists.txt | 2 +- 36 files changed, 273 insertions(+), 42157 deletions(-) rewrite cmake/FindFFTW3F.cmake (65%) delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel400_ia32_sse.intel_syntax.s delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel400_ia32_sse.s delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel410_ia32_sse.intel_syntax.s delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel410_ia32_sse.s delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel430_ia32_sse.intel_syntax.s delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel430_ia32_sse.s delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel400_ia32_sse2.intel_syntax.s delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel400_ia32_sse2.s delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel410_ia32_sse2.intel_syntax.s delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel410_ia32_sse2.s delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel430_ia32_sse2.intel_syntax.s delete mode 100644 src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel430_ia32_sse2.s delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel400_x86_64_sse.intel_syntax.s delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel400_x86_64_sse.s delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel410_x86_64_sse.intel_syntax.s delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel410_x86_64_sse.s delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.intel_syntax.s delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.s delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel400_x86_64_sse2.intel_syntax.s delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel400_x86_64_sse2.s delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel410_x86_64_sse2.intel_syntax.s delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel410_x86_64_sse2.s delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel430_x86_64_sse2.intel_syntax.s delete mode 100644 src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel430_x86_64_sse2.s diff --git a/CMakeLists.txt b/CMakeLists.txt index 2fe8b1a7e3..c9b2af63b3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ project(Gromacs) include(Dart) # PROJECT_VERSION should have the following structure: # VERSION[-dev-SUFFIX] where the VERSION can have any form and the suffix -set(PROJECT_VERSION "4.0.99-dev-20100315" +set(PROJECT_VERSION "4.5-beta3" CACHE STRING "Gromacs version string") # Cmake modules/macros are in a subdirectory to keep this file cleaner @@ -16,7 +16,6 @@ endif(NOT CMAKE_BUILD_TYPE) enable_language(C) - ######################################################################## # Fix stupid flags on MSVC ######################################################################## @@ -37,11 +36,6 @@ set(GMX_EXTRA_LIBRARIES) include(CheckCCompilerFlag) include(CheckCXXCompilerFlag) -# --- REMOVED as bugzilla 431 turned out to be a bug in GB -# and there is no confirmed case in which gcc causes a crash -# check for buggy GCC 4.1.x -# include(gmxCheckGCCVersion) - include(gmxCFlags) gmx_c_flags() @@ -72,6 +66,7 @@ mark_as_advanced(GMX_MPI_IN_PLACE) option(GMX_IA32_ASM "Add SSE assembly files for IA32" OFF) option(GMX_X86_64_ASM "Add SSE assembly files for X86_64" OFF) + option(USE_VERSION_H "Generate development version string/information" ON) # --- REMOVED as bugzilla 431 turned out to be a bug in GB # and there is no confirmed case in which gcc causes a crash @@ -199,6 +194,8 @@ check_library_exists(m sqrt "" HAVE_LIBM) check_library_exists(m cbrt "" HAVE_CBRT) include(CheckTypeSize) +set(CMAKE_REQUIRED_FLAGS ${CMAKE_C_FLAGS_RELEASE}) + check_type_size("bool" SIZEOF_BOOL) # will also set HAVE_BOOL check_type_size("int" SIZEOF_INT) check_type_size("long int" SIZEOF_LONG_INT) @@ -260,14 +257,6 @@ if(LIBXML2_FOUND) set(HAVE_LIBXML2 1) endif(LIBXML2_FOUND) -find_package(gsl) -set(PKG_GSL "") -if(GSL_FOUND) - include_directories(${GSL_INCLUDE_DIR}) - set(PKG_GSL gsl) - set(HAVE_LIBGSL 1) -endif(GSL_FOUND) - find_package(X11) # X11 includes/libraries are only set in the ngmx subdirectory! if(X11_FOUND) @@ -370,6 +359,8 @@ if (${GMX_ACCELERATION} STREQUAL "auto") set(GMX_IA32_ASM ON CACHE BOOL "Add SSE assembly files for IA32" FORCE) endif (GMX_64_BIT) + else(GMX_X86_GCC_INLINE_ASM) + set(GMX_ACCELERATION "none" CACHE STRING "Accelerated kernels. Pick one of: none, SSE, BlueGene, Power6, ia64, altivec" FORCE) endif (GMX_X86_GCC_INLINE_ASM) endif (${GMX_ACCELERATION} STREQUAL "auto") @@ -409,6 +400,11 @@ string(TOUPPER ${GMX_ACCELERATION} ${GMX_ACCELERATION}) if(${GMX_ACCELERATION} STREQUAL "NONE") # nothing to do elseif(${GMX_ACCELERATION} STREQUAL "SSE") + if(CMAKE_GENERATOR MATCHES "Visual Studio") + option(GMX_ASM_USEASM-NASM "Use the Nasm assembler (windows)" ON) + else(CMAKE_GENERATOR MATCHES "Visual Studio") + option(GMX_ASM_USEASM-NASM "Use the Nasm assembler (windows)" OFF) + endif(CMAKE_GENERATOR MATCHES "Visual Studio") if (NOT GMX_64_BIT) # for 32-bit compiles, we might need to turn on sse CHECK_C_COMPILER_FLAG("-msse2" XFLAGS_SSE) @@ -422,17 +418,23 @@ elseif(${GMX_ACCELERATION} STREQUAL "SSE") endif (NOT GMX_64_BIT) if(HAVE_XMMINTRIN_H) if(GMX_IA32_ASM) - option(GMX_ASM_USEASM-ATT "Use ATT-style assembly" ON) - set(GMX_IA32_SSE 1) - else(GMX_IA32_ASM) #only use intrinsic if not using the ASM loops + if(GMX_DOUBLE) + set(GMX_IA32_SSE2 1) + else(GMX_DOUBLE) + set(GMX_IA32_SSE 1) + endif(GMX_DOUBLE) + else(GMX_IA32_ASM) set(GMX_SSE 1) endif(GMX_IA32_ASM) endif(HAVE_XMMINTRIN_H) if(HAVE_EMMINTRIN_H) if(GMX_X86_64_ASM) - option(GMX_ASM_USEASM-ATT "Use ATT-style assembly" ON) - set(GMX_X86_64_SSE 1) - else(GMX_X86_64_ASM) #only use intrinsic if not using the ASM loops + if(GMX_DOUBLE) + set(GMX_X86_64_SSE2 1) + else(GMX_DOUBLE) + set(GMX_X86_64_SSE 1) + endif(GMX_DOUBLE) + else(GMX_X86_64_ASM) if(NOT GMX_IA32_ASM) set(GMX_SSE2 1) endif(NOT GMX_IA32_ASM) @@ -494,17 +496,22 @@ if(${GMX_FFT_LIBRARY} STREQUAL "FFTW3") # MESSAGE(STATUS "Using external FFT library - fftw3") if(GMX_DOUBLE) find_package(FFTW3 REQUIRED) + include_directories(${FFTW3_INCLUDE_DIR}) + set(FFT_LIBRARIES ${FFTW3_LIBRARIES}) set(PKG_FFT "fftw3") else(GMX_DOUBLE) find_package(FFTW3F REQUIRED) + include_directories(${FFTW3F_INCLUDE_DIR}) + set(FFT_LIBRARIES ${FFTW3F_LIBRARIES}) set(PKG_FFT "fftw3f") endif(GMX_DOUBLE) - if(NOT FFTW3_FOUND) + if(NOT FFTW3_FOUND AND NOT FFTW3F_FOUND) MESSAGE(FATAL_ERROR "Cannot find fftw3 (with correct precision). Fix it, choose another FFT library, or use the Gromacs built-in fftpack (slower)!") - endif(NOT FFTW3_FOUND) + endif(NOT FFTW3_FOUND AND NOT FFTW3F_FOUND) set(GMX_FFT_FFTW3 1) + include_directories(${FFTW3_INCLUDE_DIR}) set(FFT_LIBRARIES ${FFTW3_LIBRARIES}) diff --git a/cmake/FindFFTW3.cmake b/cmake/FindFFTW3.cmake index 8bf3a26434..636a96ec50 100644 --- a/cmake/FindFFTW3.cmake +++ b/cmake/FindFFTW3.cmake @@ -5,10 +5,10 @@ # FFTW3_LIBRARIES - List of libraries when using FFTW. # FFTW3_FOUND - True if FFTW found. -if (FFTW3_INCLUDE_DIR) +if (FFTW3_INCLUDE_DIR AND FFTW3_LIBRARIES) # Already in cache, be silent set (FFTW3_FIND_QUIETLY TRUE) -endif (FFTW3_INCLUDE_DIR) +endif (FFTW3_INCLUDE_DIR AND FFTW3_LIBRARIES) find_path (FFTW3_INCLUDE_DIR fftw3.h) diff --git a/cmake/FindFFTW3F.cmake b/cmake/FindFFTW3F.cmake dissimilarity index 65% index f0227cbd7b..5ca2fc3e30 100644 --- a/cmake/FindFFTW3F.cmake +++ b/cmake/FindFFTW3F.cmake @@ -1,22 +1,22 @@ -# - Find FFTW3 -# Find the native FFTW3 includes and library, single precision -# -# FFTW3_INCLUDE_DIR - where to find fftw3.h -# FFTW3_LIBRARIES - List of libraries when using FFTW. -# FFTW3_FOUND - True if FFTW found. - -if (FFTW3_INCLUDE_DIR) - # Already in cache, be silent - set (FFTW3_FIND_QUIETLY TRUE) -endif (FFTW3_INCLUDE_DIR) - -find_path (FFTW3_INCLUDE_DIR fftw3.h) - -find_library (FFTW3_LIBRARIES NAMES fftw3f) - -# handle the QUIETLY and REQUIRED arguments and set FFTW_FOUND to TRUE if -# all listed variables are TRUE -include (FindPackageHandleStandardArgs) -find_package_handle_standard_args (FFTW3 DEFAULT_MSG FFTW3_LIBRARIES FFTW3_INCLUDE_DIR) - -mark_as_advanced (FFTW3_LIBRARIES FFTW3_INCLUDE_DIR) +# - Find FFTW3F +# Find the native FFTW3 includes and library, single precision +# +# FFTW3F_INCLUDE_DIR - where to find fftw3.h +# FFTW3F_LIBRARIES - List of libraries when using FFTW. +# FFTW3F_FOUND - True if FFTW found. + +if (FFTW3F_INCLUDE_DIR) + # Already in cache, be silent + set (FFTW3F_FIND_QUIETLY TRUE) +endif (FFTW3F_INCLUDE_DIR) + +find_path (FFTW3F_INCLUDE_DIR fftw3.h) + +find_library (FFTW3F_LIBRARIES NAMES fftw3f) + +# handle the QUIETLY and REQUIRED arguments and set FFTW_FOUND to TRUE if +# all listed variables are TRUE +include (FindPackageHandleStandardArgs) +find_package_handle_standard_args (FFTW3F DEFAULT_MSG FFTW3F_LIBRARIES FFTW3F_INCLUDE_DIR) + +mark_as_advanced (FFTW3F_LIBRARIES FFTW3F_INCLUDE_DIR) diff --git a/include/types/simple.h b/include/types/simple.h index 789ab1f134..7461b13d51 100644 --- a/include/types/simple.h +++ b/include/types/simple.h @@ -134,26 +134,43 @@ typedef int imatrix[DIM][DIM]; /* For the step count type gmx_large_int_t we aim for 8 bytes (64bit), * but we might only be able to get 4 bytes (32bit). + * + * Avoid using "long int" if we can. This type is really dangerous, + * since the width frequently depends on compiler options, and they + * might not be set correctly when (buggy) Cmake is detecting things. + * Instead, start by looking for "long long", and just go down if we + * have to (rarely on new systems). /EL 20100810 */ -#if (!(defined SIZEOF_LONG_LONG_INT) || SIZEOF_LONG_INT == 8) -typedef long int gmx_large_int_t; -#define gmx_large_int_fmt "ld" -#define gmx_large_int_pfmt "%ld" -#define SIZEOF_LARGE_INT SIZEOF_LONG_INT -#define LARGE_INT_MAX LONG_MAX -#else +#if (defined SIZEOF_LONG_LONG_INT && SIZEOF_LONG_LONG_INT==8) + typedef long long int gmx_large_int_t; #define gmx_large_int_fmt "lld" #define gmx_large_int_pfmt "%lld" #define SIZEOF_LARGE_INT SIZEOF_LONG_LONG_INT /* LLONG_MAX is not defined by the C-standard, so check for it */ -#if (!(defined LLONG_MAX) && SIZEOF_LONG_LONG_INT == 8) -#define LARGE_INT_MAX 9223372036854775807LL -#else +#ifdef LLONG_MAX #define LARGE_INT_MAX LLONG_MAX -#endif +#else +#define LARGE_INT_MAX 9223372036854775807LL #endif +#elif (defined SIZEOF_LONG_INT && SIZEOF_LONG_INT==8) + +typedef long int gmx_large_int_t; +#define gmx_large_int_fmt "ld" +#define gmx_large_int_pfmt "%ld" +#define SIZEOF_LARGE_INT SIZEOF_LONG_INT +#define LARGE_INT_MAX LONG_MAX + +#else + +typedef int gmx_large_int_t; +#define gmx_large_int_fmt "d" +#define gmx_large_int_pfmt "%d" +#define SIZEOF_LARGE_INT SIZEOF_INT +#define LARGE_INT_MAX INT_MAX + +#endif #ifdef __cplusplus } diff --git a/src/config.h.cmakein b/src/config.h.cmakein index 2380d39a57..5e5b761f1a 100644 --- a/src/config.h.cmakein +++ b/src/config.h.cmakein @@ -215,9 +215,6 @@ /* Define to 1 if you have the xml2 library (-lxml2). */ #cmakedefine HAVE_LIBXML2 -/* Define to 1 if you have the gsl library (-lgsl). */ -#cmakedefine HAVE_LIBGSL - /* Define to 1 if you have the dl library (-ldl). */ #cmakedefine HAVE_LIBDL diff --git a/src/gmxlib/CMakeLists.txt b/src/gmxlib/CMakeLists.txt index db55cb62a1..a6f8dd6050 100644 --- a/src/gmxlib/CMakeLists.txt +++ b/src/gmxlib/CMakeLists.txt @@ -24,6 +24,10 @@ file(GLOB GMXLIB_SOURCES *.c selection/*.c trajana/*.c statistics/*.c nonbonded/*.c nonbonded/nb_kernel_c/*.c) +# This source file is generated +file(GLOB VERSION_SOURCE version.c) +list(REMOVE_ITEM GMXLIB_SOURCES ${VERSION_SOURCE}) + # add version.c to the list of sources and tell cmake that it is generated if(USE_VERSION_H) LIST(APPEND GMXLIB_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/version.c) # auto-generated @@ -31,43 +35,53 @@ set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/version.c PROPERTIES GENERATED true) endif() -if(GMX_IA32_SSE) - if(GMX_ASM_USEASM-ATT) +if(GMX_IA32_SSE OR GMX_IA32_SSE2) + if(GMX_ASM_USEASM-NASM) + enable_language(ASM-NASM) + if(GMX_DOUBLE) + file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_ia32_sse2/*.c nonbonded/nb_kernel_ia32_sse2/*intel_syntax.s) + else(GMX_DOUBLE) + file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_ia32_sse/*.c nonbonded/nb_kernel_ia32_sse/*intel_syntax.s) + endif(GMX_DOUBLE) + else(GMX_ASM_USEASM-NASM) if(GMX_ASM_USECCOMPILER) SET(CMAKE_ASM-ATT_COMPILER ${CMAKE_C_COMPILER}) endif(GMX_ASM_USECCOMPILER) enable_language(ASM-ATT) - file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_ia32_sse/*.c nonbonded/nb_kernel_ia32_sse/*.s) - file(GLOB SKIP_INTELSYNTAX nonbonded/nb_kernel_ia32_sse/*intel_syntax*) - list(REMOVE_ITEM GMX_MORESSE_SOURCES ${SKIP_INTELSYNTAX}) + if(GMX_DOUBLE) + file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_ia32_sse2/*.c nonbonded/nb_kernel_ia32_sse2/*sse2.s nonbonded/nb_kernel_ia32_sse2/*asm.s) + else(GMX_DOUBLE) + file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_ia32_sse/*.c nonbonded/nb_kernel_ia32_sse/*sse.s nonbonded/nb_kernel_ia32_sse/*asm.s) + endif(GMX_DOUBLE) if(GMX_ASM_USECCOMPILER) set_source_files_properties(${GMX_MORESSE_SOURCES} PROPERTIES COMPILE_FLAGS "-c -m32") endif(GMX_ASM_USECCOMPILER) - else(GMX_ASM_USEASM-ATT) - enable_language(ASM) - file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_ia32_sse/*.c nonbonded/nb_kernel_ia32_sse/*.intel_syntax.s) - endif(GMX_ASM_USEASM-ATT) -endif(GMX_IA32_SSE) + endif(GMX_ASM_USEASM-NASM) +endif(GMX_IA32_SSE OR GMX_IA32_SSE2) -if(GMX_X86_64_SSE) - if(GMX_ASM_USEASM-ATT) +if(GMX_X86_64_SSE OR GMX_X86_64_SSE2) + if(GMX_ASM_USEASM-NASM) + enable_language(ASM-NASM) + if(GMX_DOUBLE) + file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_x86_64_sse2/*.c nonbonded/nb_kernel_x86_64_sse2/*intel_syntax.s) + else(GMX_DOUBLE) + file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_x86_64_sse/*.c nonbonded/nb_kernel_x86_64_sse/*intel_syntax.s) + endif(GMX_DOUBLE) + else(GMX_ASM_USEASM-NASM) if(GMX_ASM_USECCOMPILER) SET(CMAKE_ASM-ATT_COMPILER ${CMAKE_C_COMPILER}) endif(GMX_ASM_USECCOMPILER) enable_language(ASM-ATT) - file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_x86_64_sse/*.c nonbonded/nb_kernel_x86_64_sse/*.s) - file(GLOB SKIP_INTELSYNTAX nonbonded/nb_kernel_x86_64_sse/*intel_syntax*) - file(GLOB SKIP_400ASM nonbonded/nb_kernel_x86_64_sse/nb_kernel4*.s) - list(REMOVE_ITEM GMX_MORESSE_SOURCES ${SKIP_INTELSYNTAX}) - list(REMOVE_ITEM GMX_MORESSE_SOURCES ${SKIP_400ASM}) #use new C-intrinsics instread + if(GMX_DOUBLE) + file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_x86_64_sse2/*.c nonbonded/nb_kernel_x86_64_sse2/*sse2.s nonbonded/nb_kernel_x86_64_sse2/*asm.s) + else(GMX_DOUBLE) + file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_x86_64_sse/*.c nonbonded/nb_kernel_x86_64_sse/*sse.s nonbonded/nb_kernel_x86_64_sse/*asm.s) + endif(GMX_DOUBLE) if(GMX_ASM_USECCOMPILER) set_source_files_properties(${GMX_MORESSE_SOURCES} PROPERTIES COMPILE_FLAGS "-c") endif(GMX_ASM_USECCOMPILER) - else(GMX_ASM_USEASM-ATT) - enable_language(ASM) - file(GLOB GMX_MORESSE_SOURCES nonbonded/nb_kernel_x86_64_sse/*.c nonbonded/nb_kernel_x86_64_sse/*.intel_syntax.s) - endif(GMX_ASM_USEASM-ATT) -endif(GMX_X86_64_SSE) + endif(GMX_ASM_USEASM-NASM) +endif(GMX_X86_64_SSE OR GMX_X86_64_SSE2) if(GMX_SSE2) if(GMX_DOUBLE) diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/Makefile.am b/src/gmxlib/nonbonded/nb_kernel_ia32_sse/Makefile.am index b7be181468..9e40177198 100644 --- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/Makefile.am +++ b/src/gmxlib/nonbonded/nb_kernel_ia32_sse/Makefile.am @@ -63,29 +63,29 @@ libnb_kernel_ia32_sse_la_SOURCES = \ EXTRA_DIST = \ - nb_kernel010_ia32_sse.intel_syntax.s nb_kernel030_ia32_sse.intel_syntax.s \ - nb_kernel100_ia32_sse.intel_syntax.s nb_kernel101_ia32_sse.intel_syntax.s \ - nb_kernel102_ia32_sse.intel_syntax.s nb_kernel103_ia32_sse.intel_syntax.s \ - nb_kernel104_ia32_sse.intel_syntax.s nb_kernel110_ia32_sse.intel_syntax.s \ - nb_kernel111_ia32_sse.intel_syntax.s nb_kernel112_ia32_sse.intel_syntax.s \ - nb_kernel113_ia32_sse.intel_syntax.s nb_kernel114_ia32_sse.intel_syntax.s \ - nb_kernel130_ia32_sse.intel_syntax.s nb_kernel131_ia32_sse.intel_syntax.s \ - nb_kernel132_ia32_sse.intel_syntax.s nb_kernel133_ia32_sse.intel_syntax.s \ - nb_kernel134_ia32_sse.intel_syntax.s nb_kernel200_ia32_sse.intel_syntax.s \ - nb_kernel201_ia32_sse.intel_syntax.s nb_kernel202_ia32_sse.intel_syntax.s \ - nb_kernel203_ia32_sse.intel_syntax.s nb_kernel204_ia32_sse.intel_syntax.s \ - nb_kernel210_ia32_sse.intel_syntax.s nb_kernel211_ia32_sse.intel_syntax.s \ - nb_kernel212_ia32_sse.intel_syntax.s nb_kernel213_ia32_sse.intel_syntax.s \ - nb_kernel214_ia32_sse.intel_syntax.s nb_kernel230_ia32_sse.intel_syntax.s \ - nb_kernel231_ia32_sse.intel_syntax.s nb_kernel232_ia32_sse.intel_syntax.s \ - nb_kernel233_ia32_sse.intel_syntax.s nb_kernel234_ia32_sse.intel_syntax.s \ - nb_kernel300_ia32_sse.intel_syntax.s nb_kernel301_ia32_sse.intel_syntax.s \ - nb_kernel302_ia32_sse.intel_syntax.s nb_kernel303_ia32_sse.intel_syntax.s \ - nb_kernel304_ia32_sse.intel_syntax.s nb_kernel310_ia32_sse.intel_syntax.s \ - nb_kernel311_ia32_sse.intel_syntax.s nb_kernel312_ia32_sse.intel_syntax.s \ - nb_kernel313_ia32_sse.intel_syntax.s nb_kernel314_ia32_sse.intel_syntax.s \ - nb_kernel330_ia32_sse.intel_syntax.s nb_kernel331_ia32_sse.intel_syntax.s \ - nb_kernel332_ia32_sse.intel_syntax.s nb_kernel333_ia32_sse.intel_syntax.s \ - nb_kernel334_ia32_sse.intel_syntax.s nb_kernel400_ia32_sse.intel_syntax.s \ - nb_kernel410_ia32_sse.intel_syntax.s nb_kernel430_ia32_sse.intel_syntax.s \ - nb_kernel_ia32_sse_test_asm.intel_syntax.s + nb_kernel010_ia32_sse_intel_syntax.s nb_kernel030_ia32_sse_intel_syntax.s \ + nb_kernel100_ia32_sse_intel_syntax.s nb_kernel101_ia32_sse_intel_syntax.s \ + nb_kernel102_ia32_sse_intel_syntax.s nb_kernel103_ia32_sse_intel_syntax.s \ + nb_kernel104_ia32_sse_intel_syntax.s nb_kernel110_ia32_sse_intel_syntax.s \ + nb_kernel111_ia32_sse_intel_syntax.s nb_kernel112_ia32_sse_intel_syntax.s \ + nb_kernel113_ia32_sse_intel_syntax.s nb_kernel114_ia32_sse_intel_syntax.s \ + nb_kernel130_ia32_sse_intel_syntax.s nb_kernel131_ia32_sse_intel_syntax.s \ + nb_kernel132_ia32_sse_intel_syntax.s nb_kernel133_ia32_sse_intel_syntax.s \ + nb_kernel134_ia32_sse_intel_syntax.s nb_kernel200_ia32_sse_intel_syntax.s \ + nb_kernel201_ia32_sse_intel_syntax.s nb_kernel202_ia32_sse_intel_syntax.s \ + nb_kernel203_ia32_sse_intel_syntax.s nb_kernel204_ia32_sse_intel_syntax.s \ + nb_kernel210_ia32_sse_intel_syntax.s nb_kernel211_ia32_sse_intel_syntax.s \ + nb_kernel212_ia32_sse_intel_syntax.s nb_kernel213_ia32_sse_intel_syntax.s \ + nb_kernel214_ia32_sse_intel_syntax.s nb_kernel230_ia32_sse_intel_syntax.s \ + nb_kernel231_ia32_sse_intel_syntax.s nb_kernel232_ia32_sse_intel_syntax.s \ + nb_kernel233_ia32_sse_intel_syntax.s nb_kernel234_ia32_sse_intel_syntax.s \ + nb_kernel300_ia32_sse_intel_syntax.s nb_kernel301_ia32_sse_intel_syntax.s \ + nb_kernel302_ia32_sse_intel_syntax.s nb_kernel303_ia32_sse_intel_syntax.s \ + nb_kernel304_ia32_sse_intel_syntax.s nb_kernel310_ia32_sse_intel_syntax.s \ + nb_kernel311_ia32_sse_intel_syntax.s nb_kernel312_ia32_sse_intel_syntax.s \ + nb_kernel313_ia32_sse_intel_syntax.s nb_kernel314_ia32_sse_intel_syntax.s \ + nb_kernel330_ia32_sse_intel_syntax.s nb_kernel331_ia32_sse_intel_syntax.s \ + nb_kernel332_ia32_sse_intel_syntax.s nb_kernel333_ia32_sse_intel_syntax.s \ + nb_kernel334_ia32_sse_intel_syntax.s nb_kernel400_ia32_sse_intel_syntax.s \ + nb_kernel410_ia32_sse_intel_syntax.s nb_kernel430_ia32_sse_intel_syntax.s \ + nb_kernel_ia32_sse_test_asm_intel_syntax.s diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel400_ia32_sse.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel400_ia32_sse.intel_syntax.s deleted file mode 100644 index 00149aa925..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel400_ia32_sse.intel_syntax.s +++ /dev/null @@ -1,1728 +0,0 @@ -;# -;# -;# Gromacs 4.0 Copyright (c) 1991-2003 -;# David van der Spoel, Erik Lindahl -;# -;# This program is free software; you can redistribute it and/or -;# modify it under the terms of the GNU General Public License -;# as published by the Free Software Foundation; either version 2 -;# of the License, or (at your option) any later version. -;# -;# To help us fund GROMACS development, we humbly ask that you cite -;# the research papers on the package. Check out http://www.gromacs.org -;# -;# And Hey: -;# Gnomes, ROck Monsters And Chili Sauce -;# - -;# These files require GNU binutils 2.10 or later, since we -;# use intel syntax for portability, or a recent version -;# of NASM that understands Extended 3DNow and SSE2 instructions. -;# (NASM is normally only used with MS Visual C++). -;# Since NASM and gnu as disagree on some definitions and use -;# completely different preprocessing options I have to introduce a -;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86. -;# Gnu as treats ';' as a line break, i.e. ignores it. This is the -;# reason why all comments need both symbols... -;# The source is written for GNU as, with intel syntax. When you use -;# NASM we redefine a couple of things. The false if-statement around -;# the following code is seen by GNU as, but NASM doesn't see it, so -;# the code inside is read by NASM but not gcc. - -; .if 0 # block below only read by NASM -%define .section section -%define .long dd -%define .align align -%define .globl global -;# NASM only wants 'dword', not 'dword ptr'. -%define ptr -%macro .equiv 2 - %1 equ %2 -%endmacro -; .endif # End of NASM-specific block -; .intel_syntax noprefix # Line only read by gnu as - - - - -.globl nb_kernel400_ia32_sse -.globl _nb_kernel400_ia32_sse -nb_kernel400_ia32_sse: -_nb_kernel400_ia32_sse: -.equiv nb400_p_nri, 8 -.equiv nb400_iinr, 12 -.equiv nb400_jindex, 16 -.equiv nb400_jjnr, 20 -.equiv nb400_shift, 24 -.equiv nb400_shiftvec, 28 -.equiv nb400_fshift, 32 -.equiv nb400_gid, 36 -.equiv nb400_pos, 40 -.equiv nb400_faction, 44 -.equiv nb400_charge, 48 -.equiv nb400_p_facel, 52 -.equiv nb400_argkrf, 56 -.equiv nb400_argcrf, 60 -.equiv nb400_Vc, 64 -.equiv nb400_type, 68 -.equiv nb400_p_ntype, 72 -.equiv nb400_vdwparam, 76 -.equiv nb400_Vvdw, 80 -.equiv nb400_p_tabscale, 84 -.equiv nb400_VFtab, 88 -.equiv nb400_invsqrta, 92 -.equiv nb400_dvda, 96 -.equiv nb400_p_gbtabscale, 100 -.equiv nb400_GBtab, 104 -.equiv nb400_p_nthreads, 108 -.equiv nb400_count, 112 -.equiv nb400_mtx, 116 -.equiv nb400_outeriter, 120 -.equiv nb400_inneriter, 124 -.equiv nb400_work, 128 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse use -.equiv nb400_ix, 0 -.equiv nb400_iy, 16 -.equiv nb400_iz, 32 -.equiv nb400_iq, 48 -.equiv nb400_dx, 64 -.equiv nb400_dy, 80 -.equiv nb400_dz, 96 -.equiv nb400_two, 112 -.equiv nb400_gbtsc, 128 -.equiv nb400_qq, 144 -.equiv nb400_r, 160 -.equiv nb400_vctot, 176 -.equiv nb400_fix, 192 -.equiv nb400_fiy, 208 -.equiv nb400_fiz, 224 -.equiv nb400_half, 240 -.equiv nb400_three, 256 -.equiv nb400_isai, 272 -.equiv nb400_isaprod, 288 -.equiv nb400_dvdasum, 304 -.equiv nb400_gbscale, 320 -.equiv nb400_is3, 336 -.equiv nb400_ii3, 340 -.equiv nb400_ii, 344 -.equiv nb400_innerjjnr, 348 -.equiv nb400_innerk, 352 -.equiv nb400_n, 356 -.equiv nb400_nn1, 360 -.equiv nb400_jnra, 364 -.equiv nb400_jnrb, 368 -.equiv nb400_jnrc, 372 -.equiv nb400_jnrd, 376 -.equiv nb400_nri, 380 -.equiv nb400_facel, 384 -.equiv nb400_nouter, 388 -.equiv nb400_ninner, 392 -.equiv nb400_salign, 396 - push ebp - mov ebp,esp - push eax - push ebx - push ecx - push edx - push esi - push edi - sub esp, 400 ;# local stack space - mov eax, esp - and eax, 0xf - sub esp, eax - mov [esp + nb400_salign], eax - - emms - - ;# Move args passed by reference to stack - mov ecx, [ebp + nb400_p_nri] - mov esi, [ebp + nb400_p_facel] - mov ecx, [ecx] - mov esi, [esi] - mov [esp + nb400_nri], ecx - mov [esp + nb400_facel], esi - - ;# zero iteration counters - mov eax, 0 - mov [esp + nb400_nouter], eax - mov [esp + nb400_ninner], eax - - - mov eax, [ebp + nb400_p_gbtabscale] - movss xmm3, [eax] - shufps xmm3, xmm3, 0 - movaps [esp + nb400_gbtsc], xmm3 - - ;# create constant floating-point factors on stack - mov eax, 0x3f000000 ;# constant 0.5 in IEEE (hex) - mov [esp + nb400_half], eax - movss xmm1, [esp + nb400_half] - shufps xmm1, xmm1, 0 ;# splat to all elements - movaps xmm2, xmm1 - addps xmm2, xmm2 ;# constant 1.0 - movaps xmm3, xmm2 - addps xmm2, xmm2 ;# constant 2.0 - addps xmm3, xmm2 ;# constant 3.0 - movaps [esp + nb400_half], xmm1 - movaps [esp + nb400_two], xmm2 - movaps [esp + nb400_three], xmm3 - -.nb400_threadloop: - mov esi, [ebp + nb400_count] ;# pointer to sync counter - mov eax, [esi] -.nb400_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb400_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [esp + nb400_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [esp + nb400_n], eax - mov [esp + nb400_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb400_outerstart - jmp .nb400_end - -.nb400_outerstart: - ;# ebx contains number of outer iterations - add ebx, [esp + nb400_nouter] - mov [esp + nb400_nouter], ebx - -.nb400_outer: - mov eax, [ebp + nb400_shift] ;# eax = pointer into shift[] - mov ebx, [eax + esi*4] ;# ebx=shift[n] - - lea ebx, [ebx + ebx*2] ;# ebx=3*is - mov [esp + nb400_is3],ebx ;# store is3 - - mov eax, [ebp + nb400_shiftvec] ;# eax = base of shiftvec[] - - movss xmm0, [eax + ebx*4] - movss xmm1, [eax + ebx*4 + 4] - movss xmm2, [eax + ebx*4 + 8] - - mov ecx, [ebp + nb400_iinr] ;# ecx = pointer into iinr[] - mov ebx, [ecx + esi*4] ;# ebx =ii - mov [esp + nb400_ii], ebx - - mov edx, [ebp + nb400_charge] - movss xmm3, [edx + ebx*4] - mulss xmm3, [esp + nb400_facel] - shufps xmm3, xmm3, 0 - - - mov edx, [ebp + nb400_invsqrta] ;# load invsqrta[ii] - movss xmm4, [edx + ebx*4] - shufps xmm4, xmm4, 0 - - lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3 - mov eax, [ebp + nb400_pos] ;# eax = base of pos[] - - addss xmm0, [eax + ebx*4] - addss xmm1, [eax + ebx*4 + 4] - addss xmm2, [eax + ebx*4 + 8] - - movaps [esp + nb400_iq], xmm3 - movaps [esp + nb400_isai], xmm4 - - shufps xmm0, xmm0, 0 - shufps xmm1, xmm1, 0 - shufps xmm2, xmm2, 0 - - movaps [esp + nb400_ix], xmm0 - movaps [esp + nb400_iy], xmm1 - movaps [esp + nb400_iz], xmm2 - - mov [esp + nb400_ii3], ebx - - ;# clear vctot and i forces - xorps xmm4, xmm4 - movaps [esp + nb400_vctot], xmm4 - movaps [esp + nb400_dvdasum], xmm4 - movaps [esp + nb400_fix], xmm4 - movaps [esp + nb400_fiy], xmm4 - movaps [esp + nb400_fiz], xmm4 - - mov eax, [ebp + nb400_jindex] - mov ecx, [eax + esi*4] ;# jindex[n] - mov edx, [eax + esi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov esi, [ebp + nb400_pos] - mov edi, [ebp + nb400_faction] - mov eax, [ebp + nb400_jjnr] - shl ecx, 2 - add eax, ecx - mov [esp + nb400_innerjjnr], eax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 4 - add ecx, [esp + nb400_ninner] - mov [esp + nb400_ninner], ecx - add edx, 0 - mov [esp + nb400_innerk], edx ;# number of innerloop atoms - jge .nb400_unroll_loop - jmp .nb400_finish_inner -.nb400_unroll_loop: - ;# quad-unroll innerloop here - mov edx, [esp + nb400_innerjjnr] ;# pointer to jjnr[k] - mov eax, [edx] - mov ebx, [edx + 4] - mov ecx, [edx + 8] - mov edx, [edx + 12] ;# eax-edx=jnr1-4 - add dword ptr [esp + nb400_innerjjnr], 16 ;# advance pointer (unrolled 4) - - ;# load isaj - mov esi, [ebp + nb400_invsqrta] - movss xmm3, [esi + eax*4] - movss xmm4, [esi + ecx*4] - movss xmm6, [esi + ebx*4] - movss xmm7, [esi + edx*4] - movaps xmm2, [esp + nb400_isai] - shufps xmm3, xmm6, 0 - shufps xmm4, xmm7, 0 - shufps xmm3, xmm4, 136 ;# constant 10001000 ;# all isaj in xmm3 - mulps xmm2, xmm3 - - movaps [esp + nb400_isaprod], xmm2 - movaps xmm1, xmm2 - mulps xmm1, [esp + nb400_gbtsc] - movaps [esp + nb400_gbscale], xmm1 - - mov esi, [ebp + nb400_charge] ;# base of charge[] - - movss xmm3, [esi + eax*4] - movss xmm4, [esi + ecx*4] - movss xmm6, [esi + ebx*4] - movss xmm7, [esi + edx*4] - - mulps xmm2, [esp + nb400_iq] - shufps xmm3, xmm6, 0 - shufps xmm4, xmm7, 0 - shufps xmm3, xmm4, 136 ;# constant 10001000 ;# all charges in xmm3 - mulps xmm3, xmm2 - movaps [esp + nb400_qq], xmm3 - - - mov esi, [ebp + nb400_pos] ;# base of pos[] - - mov [esp + nb400_jnra], eax - mov [esp + nb400_jnrb], ebx - mov [esp + nb400_jnrc], ecx - mov [esp + nb400_jnrd], edx - - lea eax, [eax + eax*2] ;# replace jnr with j3 - lea ebx, [ebx + ebx*2] - lea ecx, [ecx + ecx*2] - lea edx, [edx + edx*2] - - ;# move four coordinates to xmm0-xmm2 - - movlps xmm4, [esi + eax*4] - movlps xmm5, [esi + ecx*4] - movss xmm2, [esi + eax*4 + 8] - movss xmm6, [esi + ecx*4 + 8] - - movhps xmm4, [esi + ebx*4] - movhps xmm5, [esi + edx*4] - - movss xmm0, [esi + ebx*4 + 8] - movss xmm1, [esi + edx*4 + 8] - - shufps xmm2, xmm0, 0 - shufps xmm6, xmm1, 0 - - movaps xmm0, xmm4 - movaps xmm1, xmm4 - - shufps xmm2, xmm6, 136 ;# constant 10001000 - - shufps xmm0, xmm5, 136 ;# constant 10001000 - shufps xmm1, xmm5, 221 ;# constant 11011101 - - ;# move ix-iz to xmm4-xmm6 - movaps xmm4, [esp + nb400_ix] - movaps xmm5, [esp + nb400_iy] - movaps xmm6, [esp + nb400_iz] - - ;# calc dr - subps xmm4, xmm0 - subps xmm5, xmm1 - subps xmm6, xmm2 - - ;# store dr - movaps [esp + nb400_dx], xmm4 - movaps [esp + nb400_dy], xmm5 - movaps [esp + nb400_dz], xmm6 - ;# square it - mulps xmm4,xmm4 - mulps xmm5,xmm5 - mulps xmm6,xmm6 - addps xmm4, xmm5 - addps xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [esp + nb400_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [esp + nb400_half] - subps xmm1, xmm5 ;# constant 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - movaps [esp + nb400_r], xmm4 - mulps xmm4, [esp + nb400_gbscale] - - movhlps xmm5, xmm4 - cvttps2pi mm6, xmm4 - cvttps2pi mm7, xmm5 ;# mm6/mm7 contain lu indices - cvtpi2ps xmm6, mm6 - cvtpi2ps xmm5, mm7 - movlhps xmm6, xmm5 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - pslld mm6, 2 - pslld mm7, 2 - - movd mm0, eax - movd mm1, ebx - movd mm2, ecx - movd mm3, edx - - mov esi, [ebp + nb400_GBtab] - movd eax, mm6 - psrlq mm6, 32 - movd ecx, mm7 - psrlq mm7, 32 - movd ebx, mm6 - movd edx, mm7 - - ;# load coulomb table - movaps xmm4, [esi + eax*4] - movaps xmm5, [esi + ebx*4] - movaps xmm6, [esi + ecx*4] - movaps xmm7, [esi + edx*4] - ;# transpose, using xmm3 for scratch - movaps xmm3, xmm6 - shufps xmm3, xmm7, 0xEE - shufps xmm6, xmm7, 0x44 - movaps xmm7, xmm4 - shufps xmm7, xmm5, 0xEE - shufps xmm4, xmm5, 0x44 - movaps xmm5, xmm4 - shufps xmm5, xmm6, 0xDD - shufps xmm4, xmm6, 0x88 - movaps xmm6, xmm7 - shufps xmm6, xmm3, 0x88 - shufps xmm7, xmm3, 0xDD - ;# coulomb table ready, in xmm4-xmm7 - - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - mulps xmm7, [esp + nb400_two] ;# two*Heps2 - movaps xmm3, [esp + nb400_qq] - addps xmm7, xmm6 - addps xmm7, xmm5 ;# xmm7=FF - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, xmm3 ;# vcoul=qq*VV - mulps xmm3, xmm7 ;# fijC=FF*qq - ;# at this point mm5 contains vcoul and mm3 fijC - - ;# get jnr from stack - mov eax, [esp + nb400_jnra] - mov ebx, [esp + nb400_jnrb] - mov ecx, [esp + nb400_jnrc] - mov edx, [esp + nb400_jnrd] - - mov esi, [ebp + nb400_dvda] - - ;# Calculate dVda - xorps xmm7, xmm7 - mulps xmm3, [esp + nb400_gbscale] - movaps xmm6, xmm3 - mulps xmm6, [esp + nb400_r] - addps xmm6, xmm5 - addps xmm5, [esp + nb400_vctot] - movaps [esp + nb400_vctot], xmm5 - - ;# xmm6=(vcoul+fijC*r) - subps xmm7, xmm6 - movaps xmm6, xmm7 - - ;# update dvdasum - addps xmm7, [esp + nb400_dvdasum] - movaps [esp + nb400_dvdasum], xmm7 - - ;# update j atoms dvdaj - movhlps xmm7, xmm6 - movaps xmm5, xmm6 - movaps xmm4, xmm7 - shufps xmm5, xmm5, 0x1 - shufps xmm4, xmm4, 0x1 - ;# xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4 - addss xmm6, [esi + eax*4] - addss xmm5, [esi + ebx*4] - addss xmm7, [esi + ecx*4] - addss xmm4, [esi + edx*4] - movss [esi + eax*4], xmm6 - movss [esi + ebx*4], xmm5 - movss [esi + ecx*4], xmm7 - movss [esi + edx*4], xmm4 - - xorps xmm4, xmm4 - mulps xmm3, xmm0 - subps xmm4, xmm3 - - movaps xmm0, [esp + nb400_dx] - movaps xmm1, [esp + nb400_dy] - movaps xmm2, [esp + nb400_dz] - - movd eax, mm0 - movd ebx, mm1 - movd ecx, mm2 - movd edx, mm3 - - mov edi, [ebp + nb400_faction] - mulps xmm0, xmm4 - mulps xmm1, xmm4 - mulps xmm2, xmm4 - ;# xmm0-xmm2 contains tx-tz (partial force) - ;# now update f_i - movaps xmm3, [esp + nb400_fix] - movaps xmm4, [esp + nb400_fiy] - movaps xmm5, [esp + nb400_fiz] - addps xmm3, xmm0 - addps xmm4, xmm1 - addps xmm5, xmm2 - movaps [esp + nb400_fix], xmm3 - movaps [esp + nb400_fiy], xmm4 - movaps [esp + nb400_fiz], xmm5 - ;# the fj's - start by accumulating x & y forces from memory - movlps xmm4, [edi + eax*4] - movlps xmm6, [edi + ecx*4] - movhps xmm4, [edi + ebx*4] - movhps xmm6, [edi + edx*4] - - movaps xmm3, xmm4 - shufps xmm3, xmm6, 136 ;# constant 10001000 - shufps xmm4, xmm6, 221 ;# constant 11011101 - - ;# now xmm3-xmm5 contains fjx, fjy, fjz - subps xmm3, xmm0 - subps xmm4, xmm1 - - ;# unpack them back so we can store them - first x & y in xmm3/xmm4 - - movaps xmm6, xmm3 - unpcklps xmm6, xmm4 - unpckhps xmm3, xmm4 - ;# xmm6(l)=x & y for j1, (h) for j2 - ;# xmm3(l)=x & y for j3, (h) for j4 - movlps [edi + eax*4], xmm6 - movlps [edi + ecx*4], xmm3 - - movhps [edi + ebx*4], xmm6 - movhps [edi + edx*4], xmm3 - - ;# and the z forces - movss xmm4, [edi + eax*4 + 8] - movss xmm5, [edi + ebx*4 + 8] - movss xmm6, [edi + ecx*4 + 8] - movss xmm7, [edi + edx*4 + 8] - subss xmm4, xmm2 - shufps xmm2, xmm2, 229 ;# constant 11100101 - subss xmm5, xmm2 - shufps xmm2, xmm2, 234 ;# constant 11101010 - subss xmm6, xmm2 - shufps xmm2, xmm2, 255 ;# constant 11111111 - subss xmm7, xmm2 - movss [edi + eax*4 + 8], xmm4 - movss [edi + ebx*4 + 8], xmm5 - movss [edi + ecx*4 + 8], xmm6 - movss [edi + edx*4 + 8], xmm7 - - ;# should we do one more iteration? - sub dword ptr [esp + nb400_innerk], 4 - jl .nb400_finish_inner - jmp .nb400_unroll_loop -.nb400_finish_inner: - ;# check if at least two particles remain - add dword ptr [esp + nb400_innerk], 4 - mov edx, [esp + nb400_innerk] - and edx, 2 - jnz .nb400_dopair - jmp .nb400_checksingle -.nb400_dopair: - mov ecx, [esp + nb400_innerjjnr] - - mov eax, [ecx] - mov ebx, [ecx + 4] - add dword ptr [esp + nb400_innerjjnr], 8 - - xorps xmm2, xmm2 - movaps xmm6, xmm2 - - ;# load isaj - mov esi, [ebp + nb400_invsqrta] - movss xmm2, [esi + eax*4] - movss xmm3, [esi + ebx*4] - unpcklps xmm2, xmm3 ;# isaj in xmm2(0,1) - mulps xmm2, [esp + nb400_isai] - movaps [esp + nb400_isaprod], xmm2 - movaps xmm1, xmm2 - mulps xmm1, [esp + nb400_gbtsc] - movaps [esp + nb400_gbscale], xmm1 - - mov esi, [ebp + nb400_charge] ;# base of charge[] - movss xmm3, [esi + eax*4] - movss xmm6, [esi + ebx*4] - unpcklps xmm3, xmm6 ;# constant 00001000 ;# xmm3(0,1) has the charges - - mulps xmm2, [esp + nb400_iq] - mulps xmm3, xmm2 - movaps [esp + nb400_qq], xmm3 - - mov edi, [ebp + nb400_pos] - - movd mm0, eax ;# copy jnr to mm0/mm1 - movd mm1, ebx - - lea eax, [eax + eax*2] - lea ebx, [ebx + ebx*2] - ;# move coordinates to xmm0-xmm2 - movlps xmm1, [edi + eax*4] - movss xmm2, [edi + eax*4 + 8] - movhps xmm1, [edi + ebx*4] - movss xmm0, [edi + ebx*4 + 8] - - movlhps xmm3, xmm7 - - shufps xmm2, xmm0, 0 - - movaps xmm0, xmm1 - - shufps xmm2, xmm2, 136 ;# constant 10001000 - - shufps xmm0, xmm0, 136 ;# constant 10001000 - shufps xmm1, xmm1, 221 ;# constant 11011101 - - mov edi, [ebp + nb400_faction] - ;# move ix-iz to xmm4-xmm6 - xorps xmm7, xmm7 - - movaps xmm4, [esp + nb400_ix] - movaps xmm5, [esp + nb400_iy] - movaps xmm6, [esp + nb400_iz] - - ;# calc dr - subps xmm4, xmm0 - subps xmm5, xmm1 - subps xmm6, xmm2 - - ;# store dr - movaps [esp + nb400_dx], xmm4 - movaps [esp + nb400_dy], xmm5 - movaps [esp + nb400_dz], xmm6 - ;# square it - mulps xmm4,xmm4 - mulps xmm5,xmm5 - mulps xmm6,xmm6 - addps xmm4, xmm5 - addps xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [esp + nb400_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [esp + nb400_half] - subps xmm1, xmm5 ;# constant 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - movaps [esp + nb400_r], xmm4 - mulps xmm4, [esp + nb400_gbscale] - - cvttps2pi mm6, xmm4 ;# mm6 contain lu indices - cvtpi2ps xmm6, mm6 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - - pslld mm6, 2 - - mov esi, [ebp + nb400_GBtab] - movd ecx, mm6 - psrlq mm6, 32 - movd edx, mm6 - - ;# load coulomb table - movaps xmm4, [esi + ecx*4] - movaps xmm7, [esi + edx*4] - ;# transpose, using xmm3 for scratch - movaps xmm6, xmm4 - unpcklps xmm4, xmm7 ;# Y1 Y2 F1 F2 - unpckhps xmm6, xmm7 ;# G1 G2 H1 H2 - movhlps xmm5, xmm4 ;# F1 F2 - movhlps xmm7, xmm6 ;# H1 H2 - ;# coulomb table ready, in xmm4-xmm7 - - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - mulps xmm7, [esp + nb400_two] ;# two*Heps2 - movaps xmm3, [esp + nb400_qq] - addps xmm7, xmm6 - addps xmm7, xmm5 ;# xmm7=FF - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, xmm3 ;# vcoul=qq*VV - mulps xmm3, xmm7 ;# fijC=FF*qq - ;# at this point mm5 contains vcoul and mm3 fijC - - ;# get jnr from mm0/mm1 - movd ecx, mm0 - movd edx, mm1 - - mov esi, [ebp + nb400_dvda] - - ;# Calculate dVda - xorps xmm7, xmm7 - mulps xmm3, [esp + nb400_gbscale] - movaps xmm6, xmm3 - mulps xmm6, [esp + nb400_r] - addps xmm6, xmm5 - addps xmm5, [esp + nb400_vctot] - movaps [esp + nb400_vctot], xmm5 - - ;# xmm6=(vcoul+fijC*r) - subps xmm7, xmm6 - movaps xmm6, xmm7 - - ;# update dvdasum - addps xmm7, [esp + nb400_dvdasum] - movaps [esp + nb400_dvdasum], xmm7 - - ;# update j atoms dvdaj - movaps xmm7, xmm6 - shufps xmm7, xmm7, 0x1 - addss xmm6, [esi + ecx*4] - addss xmm7, [esi + edx*4] - movss [esi + ecx*4], xmm6 - movss [esi + edx*4], xmm7 - - xorps xmm4, xmm4 - mulps xmm3, xmm0 - subps xmm4, xmm3 - - movaps xmm0, [esp + nb400_dx] - movaps xmm1, [esp + nb400_dy] - movaps xmm2, [esp + nb400_dz] - - mulps xmm0, xmm4 - mulps xmm1, xmm4 - mulps xmm2, xmm4 - ;# xmm0-xmm2 contains tx-tz (partial force) - ;# now update f_i - movaps xmm3, [esp + nb400_fix] - movaps xmm4, [esp + nb400_fiy] - movaps xmm5, [esp + nb400_fiz] - addps xmm3, xmm0 - addps xmm4, xmm1 - addps xmm5, xmm2 - movaps [esp + nb400_fix], xmm3 - movaps [esp + nb400_fiy], xmm4 - movaps [esp + nb400_fiz], xmm5 - ;# update the fj's - movss xmm3, [edi + eax*4] - movss xmm4, [edi + eax*4 + 4] - movss xmm5, [edi + eax*4 + 8] - subss xmm3, xmm0 - subss xmm4, xmm1 - subss xmm5, xmm2 - movss [edi + eax*4], xmm3 - movss [edi + eax*4 + 4], xmm4 - movss [edi + eax*4 + 8], xmm5 - - shufps xmm0, xmm0, 225 ;# constant 11100001 - shufps xmm1, xmm1, 225 ;# constant 11100001 - shufps xmm2, xmm2, 225 ;# constant 11100001 - - movss xmm3, [edi + ebx*4] - movss xmm4, [edi + ebx*4 + 4] - movss xmm5, [edi + ebx*4 + 8] - subss xmm3, xmm0 - subss xmm4, xmm1 - subss xmm5, xmm2 - movss [edi + ebx*4], xmm3 - movss [edi + ebx*4 + 4], xmm4 - movss [edi + ebx*4 + 8], xmm5 - -.nb400_checksingle: - mov edx, [esp + nb400_innerk] - and edx, 1 - jnz .nb400_dosingle - jmp .nb400_updateouterdata -.nb400_dosingle: - mov esi, [ebp + nb400_charge] - mov edx, [ebp + nb400_invsqrta] - mov edi, [ebp + nb400_pos] - mov ecx, [esp + nb400_innerjjnr] - mov eax, [ecx] - xorps xmm2, xmm2 - movaps xmm6, xmm2 - movss xmm2, [edx + eax*4] ;# isaj - mulss xmm2, [esp + nb400_isai] - movss [esp + nb400_isaprod], xmm2 - movss xmm1, xmm2 - mulss xmm1, [esp + nb400_gbtsc] - movss [esp + nb400_gbscale], xmm1 - - mulss xmm2, [esp + nb400_iq] - movss xmm6, [esi + eax*4] ;# xmm6(0) has the charge - mulss xmm6, xmm2 - movss [esp + nb400_qq], xmm6 - - movd mm0, eax - lea eax, [eax + eax*2] - - ;# move coordinates to xmm0-xmm2 - movss xmm0, [edi + eax*4] - movss xmm1, [edi + eax*4 + 4] - movss xmm2, [edi + eax*4 + 8] - - movss xmm4, [esp + nb400_ix] - movss xmm5, [esp + nb400_iy] - movss xmm6, [esp + nb400_iz] - - ;# calc dr - subss xmm4, xmm0 - subss xmm5, xmm1 - subss xmm6, xmm2 - - ;# store dr - movss [esp + nb400_dx], xmm4 - movss [esp + nb400_dy], xmm5 - movss [esp + nb400_dz], xmm6 - ;# square it - mulss xmm4,xmm4 - mulss xmm5,xmm5 - mulss xmm6,xmm6 - addss xmm4, xmm5 - addss xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtss xmm5, xmm4 - ;# lookup seed in xmm5 - movss xmm2, xmm5 - mulss xmm5, xmm5 - movss xmm1, [esp + nb400_three] - mulss xmm5, xmm4 ;# rsq*lu*lu - movss xmm0, [esp + nb400_half] - subss xmm1, xmm5 ;# constant 30-rsq*lu*lu - mulss xmm1, xmm2 - mulss xmm0, xmm1 ;# xmm0=rinv - - mulss xmm4, xmm0 ;# xmm4=r - movss [esp + nb400_r], xmm4 - mulss xmm4, [esp + nb400_gbscale] - - cvttss2si ebx, xmm4 ;# mm6 contain lu indices - cvtsi2ss xmm6, ebx - subss xmm4, xmm6 - movss xmm1, xmm4 ;# xmm1=eps - movss xmm2, xmm1 - mulss xmm2, xmm2 ;# xmm2=eps2 - - shl ebx, 2 - - mov esi, [ebp + nb400_GBtab] - - movaps xmm4, [esi + ebx*4] - movhlps xmm6, xmm4 - movaps xmm5, xmm4 - movaps xmm7, xmm6 - shufps xmm5, xmm5, 1 - shufps xmm7, xmm7, 1 - ;# table ready in xmm4-xmm7 - - mulss xmm6, xmm1 ;# xmm6=Geps - mulss xmm7, xmm2 ;# xmm7=Heps2 - addss xmm5, xmm6 - addss xmm5, xmm7 ;# xmm5=Fp - mulss xmm7, [esp + nb400_two] ;# two*Heps2 - movss xmm3, [esp + nb400_qq] - addss xmm7, xmm6 - addss xmm7, xmm5 ;# xmm7=FF - mulss xmm5, xmm1 ;# xmm5=eps*Fp - addss xmm5, xmm4 ;# xmm5=VV - mulss xmm5, xmm3 ;# vcoul=qq*VV - mulss xmm3, xmm7 ;# fijC=FF*qq - ;# at this point mm5 contains vcoul and mm3 fijC - - movd ebx, mm0 - mov esi, [ebp + nb400_dvda] - - ;# Calculate dVda - xorps xmm7, xmm7 - mulss xmm3, [esp + nb400_gbscale] - movaps xmm6, xmm3 - mulss xmm6, [esp + nb400_r] - addss xmm6, xmm5 - addss xmm5, [esp + nb400_vctot] - movss [esp + nb400_vctot], xmm5 - - ;# xmm6=(vcoul+fijC*r) - subps xmm7, xmm6 - movaps xmm6, xmm7 - - ;# update dvdasum - addps xmm7, [esp + nb400_dvdasum] - movaps [esp + nb400_dvdasum], xmm7 - - ;# update j atoms dvdaj - addss xmm6, [esi + ebx*4] - movss [esi + ebx*4], xmm6 - - xorps xmm4, xmm4 - mulss xmm3, xmm0 - subss xmm4, xmm3 - - mov edi, [ebp + nb400_faction] - - movss xmm0, [esp + nb400_dx] - movss xmm1, [esp + nb400_dy] - movss xmm2, [esp + nb400_dz] - - mulss xmm0, xmm4 - mulss xmm1, xmm4 - mulss xmm2, xmm4 - ;# xmm0-xmm2 contains tx-tz (partial force) - ;# now update f_i - movss xmm3, [esp + nb400_fix] - movss xmm4, [esp + nb400_fiy] - movss xmm5, [esp + nb400_fiz] - addss xmm3, xmm0 - addss xmm4, xmm1 - addss xmm5, xmm2 - movss [esp + nb400_fix], xmm3 - movss [esp + nb400_fiy], xmm4 - movss [esp + nb400_fiz], xmm5 - ;# update fj - - movss xmm3, [edi + eax*4] - movss xmm4, [edi + eax*4 + 4] - movss xmm5, [edi + eax*4 + 8] - subss xmm3, xmm0 - subss xmm4, xmm1 - subss xmm5, xmm2 - movss [edi + eax*4], xmm3 - movss [edi + eax*4 + 4], xmm4 - movss [edi + eax*4 + 8], xmm5 -.nb400_updateouterdata: - mov ecx, [esp + nb400_ii3] - mov edi, [ebp + nb400_faction] - mov esi, [ebp + nb400_fshift] - mov edx, [esp + nb400_is3] - - ;# accumulate i forces in xmm0, xmm1, xmm2 - movaps xmm0, [esp + nb400_fix] - movaps xmm1, [esp + nb400_fiy] - movaps xmm2, [esp + nb400_fiz] - - movhlps xmm3, xmm0 - movhlps xmm4, xmm1 - movhlps xmm5, xmm2 - addps xmm0, xmm3 - addps xmm1, xmm4 - addps xmm2, xmm5 ;# sum is in 1/2 in xmm0-xmm2 - - movaps xmm3, xmm0 - movaps xmm4, xmm1 - movaps xmm5, xmm2 - - shufps xmm3, xmm3, 1 - shufps xmm4, xmm4, 1 - shufps xmm5, xmm5, 1 - addss xmm0, xmm3 - addss xmm1, xmm4 - addss xmm2, xmm5 ;# xmm0-xmm2 has single force in pos0 - - ;# increment i force - movss xmm3, [edi + ecx*4] - movss xmm4, [edi + ecx*4 + 4] - movss xmm5, [edi + ecx*4 + 8] - addss xmm3, xmm0 - addss xmm4, xmm1 - addss xmm5, xmm2 - movss [edi + ecx*4], xmm3 - movss [edi + ecx*4 + 4], xmm4 - movss [edi + ecx*4 + 8], xmm5 - - ;# increment fshift force - movss xmm3, [esi + edx*4] - movss xmm4, [esi + edx*4 + 4] - movss xmm5, [esi + edx*4 + 8] - addss xmm3, xmm0 - addss xmm4, xmm1 - addss xmm5, xmm2 - movss [esi + edx*4], xmm3 - movss [esi + edx*4 + 4], xmm4 - movss [esi + edx*4 + 8], xmm5 - - ;# get n from stack - mov esi, [esp + nb400_n] - ;# get group index for i particle - mov edx, [ebp + nb400_gid] ;# base of gid[] - mov edx, [edx + esi*4] ;# ggid=gid[n] - - ;# accumulate total potential energy and update it - movaps xmm7, [esp + nb400_vctot] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - ;# add earlier value from mem - mov eax, [ebp + nb400_Vc] - addss xmm7, [eax + edx*4] - ;# move back to mem - movss [eax + edx*4], xmm7 - - ;# accumulate dVda and update it - movaps xmm7, [esp + nb400_dvdasum] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - mov edx, [esp + nb400_ii] - mov eax, [ebp + nb400_dvda] - addss xmm7, [eax + edx*4] - movss [eax + edx*4], xmm7 - - ;# finish if last - mov ecx, [esp + nb400_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb400_outerend - - ;# not last, iterate outer loop once more! - mov [esp + nb400_n], esi - jmp .nb400_outer -.nb400_outerend: - ;# check if more outer neighborlists remain - mov ecx, [esp + nb400_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb400_end - ;# non-zero, do one more workunit - jmp .nb400_threadloop -.nb400_end: - emms - - mov eax, [esp + nb400_nouter] - mov ebx, [esp + nb400_ninner] - mov ecx, [ebp + nb400_outeriter] - mov edx, [ebp + nb400_inneriter] - mov [ecx], eax - mov [edx], ebx - - mov eax, [esp + nb400_salign] - add esp, eax - add esp, 400 - pop edi - pop esi - pop edx - pop ecx - pop ebx - pop eax - leave - ret - - - - - -.globl nb_kernel400nf_ia32_sse -.globl _nb_kernel400nf_ia32_sse -nb_kernel400nf_ia32_sse: -_nb_kernel400nf_ia32_sse: -.equiv nb400nf_p_nri, 8 -.equiv nb400nf_iinr, 12 -.equiv nb400nf_jindex, 16 -.equiv nb400nf_jjnr, 20 -.equiv nb400nf_shift, 24 -.equiv nb400nf_shiftvec, 28 -.equiv nb400nf_fshift, 32 -.equiv nb400nf_gid, 36 -.equiv nb400nf_pos, 40 -.equiv nb400nf_faction, 44 -.equiv nb400nf_charge, 48 -.equiv nb400nf_p_facel, 52 -.equiv nb400nf_argkrf, 56 -.equiv nb400nf_argcrf, 60 -.equiv nb400nf_Vc, 64 -.equiv nb400nf_type, 68 -.equiv nb400nf_p_ntype, 72 -.equiv nb400nf_vdwparam, 76 -.equiv nb400nf_Vvdw, 80 -.equiv nb400nf_p_tabscale, 84 -.equiv nb400nf_VFtab, 88 -.equiv nb400nf_invsqrta, 92 -.equiv nb400nf_dvda, 96 -.equiv nb400nf_p_gbtabscale, 100 -.equiv nb400nf_GBtab, 104 -.equiv nb400nf_p_nthreads, 108 -.equiv nb400nf_count, 112 -.equiv nb400nf_mtx, 116 -.equiv nb400nf_outeriter, 120 -.equiv nb400nf_inneriter, 124 -.equiv nb400nf_work, 128 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse use -.equiv nb400nf_ix, 0 -.equiv nb400nf_iy, 16 -.equiv nb400nf_iz, 32 -.equiv nb400nf_iq, 48 -.equiv nb400nf_gbtsc, 64 -.equiv nb400nf_qq, 80 -.equiv nb400nf_vctot, 96 -.equiv nb400nf_half, 112 -.equiv nb400nf_three, 128 -.equiv nb400nf_isai, 144 -.equiv nb400nf_isaprod, 160 -.equiv nb400nf_gbscale, 176 -.equiv nb400nf_is3, 192 -.equiv nb400nf_ii3, 196 -.equiv nb400nf_innerjjnr, 200 -.equiv nb400nf_innerk, 204 -.equiv nb400nf_n, 208 -.equiv nb400nf_nn1, 212 -.equiv nb400nf_nri, 216 -.equiv nb400nf_facel, 220 -.equiv nb400nf_nouter, 224 -.equiv nb400nf_ninner, 228 -.equiv nb400nf_salign, 232 - push ebp - mov ebp,esp - push eax - push ebx - push ecx - push edx - push esi - push edi - sub esp, 236 ;# local stack space - mov eax, esp - and eax, 0xf - sub esp, eax - mov [esp + nb400nf_salign], eax - - emms - - ;# Move args passed by reference to stack - mov ecx, [ebp + nb400nf_p_nri] - mov esi, [ebp + nb400nf_p_facel] - mov ecx, [ecx] - mov esi, [esi] - mov [esp + nb400nf_nri], ecx - mov [esp + nb400nf_facel], esi - - ;# zero iteration counters - mov eax, 0 - mov [esp + nb400nf_nouter], eax - mov [esp + nb400nf_ninner], eax - - - mov eax, [ebp + nb400nf_p_gbtabscale] - movss xmm3, [eax] - shufps xmm3, xmm3, 0 - movaps [esp + nb400nf_gbtsc], xmm3 - - ;# create constant floating-point factors on stack - mov eax, 0x3f000000 ;# constant 0.5 in IEEE (hex) - mov [esp + nb400nf_half], eax - movss xmm1, [esp + nb400nf_half] - shufps xmm1, xmm1, 0 ;# splat to all elements - movaps xmm2, xmm1 - addps xmm2, xmm2 ;# constant 1.0 - movaps xmm3, xmm2 - addps xmm2, xmm2 ;# constant 2.0 - addps xmm3, xmm2 ;# constant 3.0 - movaps [esp + nb400nf_half], xmm1 - movaps [esp + nb400nf_three], xmm3 - -.nb400nf_threadloop: - mov esi, [ebp + nb400nf_count] ;# pointer to sync counter - mov eax, [esi] -.nb400nf_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb400nf_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [esp + nb400nf_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [esp + nb400nf_n], eax - mov [esp + nb400nf_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb400nf_outerstart - jmp .nb400nf_end - -.nb400nf_outerstart: - ;# ebx contains number of outer iterations - add ebx, [esp + nb400nf_nouter] - mov [esp + nb400nf_nouter], ebx - -.nb400nf_outer: - mov eax, [ebp + nb400nf_shift] ;# eax = pointer into shift[] - mov ebx, [eax + esi*4] ;# ebx=shift[n] - - lea ebx, [ebx + ebx*2] ;# ebx=3*is - mov [esp + nb400nf_is3],ebx ;# store is3 - - mov eax, [ebp + nb400nf_shiftvec] ;# eax = base of shiftvec[] - - movss xmm0, [eax + ebx*4] - movss xmm1, [eax + ebx*4 + 4] - movss xmm2, [eax + ebx*4 + 8] - - mov ecx, [ebp + nb400nf_iinr] ;# ecx = pointer into iinr[] - mov ebx, [ecx + esi*4] ;# ebx =ii - - mov edx, [ebp + nb400nf_charge] - movss xmm3, [edx + ebx*4] - mulss xmm3, [esp + nb400nf_facel] - shufps xmm3, xmm3, 0 - - mov edx, [ebp + nb400nf_invsqrta] ;# load invsqrta[ii] - movss xmm4, [edx + ebx*4] - shufps xmm4, xmm4, 0 - - lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3 - mov eax, [ebp + nb400nf_pos] ;# eax = base of pos[] - - addss xmm0, [eax + ebx*4] - addss xmm1, [eax + ebx*4 + 4] - addss xmm2, [eax + ebx*4 + 8] - - movaps [esp + nb400nf_iq], xmm3 - movaps [esp + nb400nf_isai], xmm4 - - shufps xmm0, xmm0, 0 - shufps xmm1, xmm1, 0 - shufps xmm2, xmm2, 0 - - movaps [esp + nb400nf_ix], xmm0 - movaps [esp + nb400nf_iy], xmm1 - movaps [esp + nb400nf_iz], xmm2 - - mov [esp + nb400nf_ii3], ebx - - ;# clear vctot - xorps xmm4, xmm4 - movaps [esp + nb400nf_vctot], xmm4 - - mov eax, [ebp + nb400nf_jindex] - mov ecx, [eax + esi*4] ;# jindex[n] - mov edx, [eax + esi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov esi, [ebp + nb400nf_pos] - mov edi, [ebp + nb400nf_faction] - mov eax, [ebp + nb400nf_jjnr] - shl ecx, 2 - add eax, ecx - mov [esp + nb400nf_innerjjnr], eax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 4 - add ecx, [esp + nb400nf_ninner] - mov [esp + nb400nf_ninner], ecx - add edx, 0 - mov [esp + nb400nf_innerk], edx ;# number of innerloop atoms - jge .nb400nf_unroll_loop - jmp .nb400nf_finish_inner -.nb400nf_unroll_loop: - ;# quad-unroll innerloop here - mov edx, [esp + nb400nf_innerjjnr] ;# pointer to jjnr[k] - mov eax, [edx] - mov ebx, [edx + 4] - mov ecx, [edx + 8] - mov edx, [edx + 12] ;# eax-edx=jnr1-4 - add dword ptr [esp + nb400nf_innerjjnr], 16 ;# advance pointer (unrolled 4) - - ;# load isa2 - mov esi, [ebp + nb400nf_invsqrta] - movss xmm3, [esi + eax*4] - movss xmm4, [esi + ecx*4] - movss xmm6, [esi + ebx*4] - movss xmm7, [esi + edx*4] - movaps xmm2, [esp + nb400nf_isai] - shufps xmm3, xmm6, 0 - shufps xmm4, xmm7, 0 - shufps xmm3, xmm4, 136 ;# constant 10001000 ;# all charges in xmm3 - mulps xmm2, xmm3 - - movaps [esp + nb400nf_isaprod], xmm2 - movaps xmm1, xmm2 - mulps xmm1, [esp + nb400nf_gbtsc] - movaps [esp + nb400nf_gbscale], xmm1 - - mov esi, [ebp + nb400nf_charge] ;# base of charge[] - - movss xmm3, [esi + eax*4] - movss xmm4, [esi + ecx*4] - movss xmm6, [esi + ebx*4] - movss xmm7, [esi + edx*4] - - mulps xmm2, [esp + nb400nf_iq] - shufps xmm3, xmm6, 0 - shufps xmm4, xmm7, 0 - shufps xmm3, xmm4, 136 ;# constant 10001000 ;# all charges in xmm3 - mulps xmm3, xmm2 - movaps [esp + nb400nf_qq], xmm3 - - - mov esi, [ebp + nb400nf_pos] ;# base of pos[] - - lea eax, [eax + eax*2] ;# replace jnr with j3 - lea ebx, [ebx + ebx*2] - - lea ecx, [ecx + ecx*2] ;# replace jnr with j3 - lea edx, [edx + edx*2] - - ;# move four coordinates to xmm0-xmm2 - - movlps xmm4, [esi + eax*4] - movlps xmm5, [esi + ecx*4] - movss xmm2, [esi + eax*4 + 8] - movss xmm6, [esi + ecx*4 + 8] - - movhps xmm4, [esi + ebx*4] - movhps xmm5, [esi + edx*4] - - movss xmm0, [esi + ebx*4 + 8] - movss xmm1, [esi + edx*4 + 8] - - shufps xmm2, xmm0, 0 - shufps xmm6, xmm1, 0 - - movaps xmm0, xmm4 - movaps xmm1, xmm4 - - shufps xmm2, xmm6, 136 ;# constant 10001000 - - shufps xmm0, xmm5, 136 ;# constant 10001000 - shufps xmm1, xmm5, 221 ;# constant 11011101 - - ;# move ix-iz to xmm4-xmm6 - movaps xmm4, [esp + nb400nf_ix] - movaps xmm5, [esp + nb400nf_iy] - movaps xmm6, [esp + nb400nf_iz] - - ;# calc dr - subps xmm4, xmm0 - subps xmm5, xmm1 - subps xmm6, xmm2 - - ;# square it - mulps xmm4,xmm4 - mulps xmm5,xmm5 - mulps xmm6,xmm6 - addps xmm4, xmm5 - addps xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [esp + nb400nf_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [esp + nb400nf_half] - subps xmm1, xmm5 ;# constant 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - mulps xmm4, [esp + nb400nf_gbscale] - - movhlps xmm5, xmm4 - cvttps2pi mm6, xmm4 - cvttps2pi mm7, xmm5 ;# mm6/mm7 contain lu indices - cvtpi2ps xmm6, mm6 - cvtpi2ps xmm5, mm7 - movlhps xmm6, xmm5 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - pslld mm6, 2 - pslld mm7, 2 - - movd mm0, eax - movd mm1, ebx - movd mm2, ecx - movd mm3, edx - - mov esi, [ebp + nb400nf_GBtab] - movd eax, mm6 - psrlq mm6, 32 - movd ecx, mm7 - psrlq mm7, 32 - movd ebx, mm6 - movd edx, mm7 - - ;# load coulomb table - movaps xmm4, [esi + eax*4] - movaps xmm5, [esi + ebx*4] - movaps xmm6, [esi + ecx*4] - movaps xmm7, [esi + edx*4] - ;# transpose, using xmm3 for scratch - movaps xmm3, xmm6 - shufps xmm3, xmm7, 0xEE - shufps xmm6, xmm7, 0x44 - movaps xmm7, xmm4 - shufps xmm7, xmm5, 0xEE - shufps xmm4, xmm5, 0x44 - movaps xmm5, xmm4 - shufps xmm5, xmm6, 0xDD - shufps xmm4, xmm6, 0x88 - movaps xmm6, xmm7 - shufps xmm6, xmm3, 0x88 - shufps xmm7, xmm3, 0xDD - ;# coulomb table ready, in xmm4-xmm7 - - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - movaps xmm3, [esp + nb400nf_qq] - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, xmm3 ;# vcoul=qq*VV - addps xmm5, [esp + nb400nf_vctot] - movaps [esp + nb400nf_vctot], xmm5 - - ;# should we do one more iteration? - sub dword ptr [esp + nb400nf_innerk], 4 - jl .nb400nf_finish_inner - jmp .nb400nf_unroll_loop -.nb400nf_finish_inner: - ;# check if at least two particles remain - add dword ptr [esp + nb400nf_innerk], 4 - mov edx, [esp + nb400nf_innerk] - and edx, 2 - jnz .nb400nf_dopair - jmp .nb400nf_checksingle -.nb400nf_dopair: - mov ecx, [esp + nb400nf_innerjjnr] - - mov eax, [ecx] - mov ebx, [ecx + 4] - add dword ptr [esp + nb400nf_innerjjnr], 8 - - xorps xmm2, xmm2 - movaps xmm6, xmm2 - - ;# load isa2 - mov esi, [ebp + nb400nf_invsqrta] - movss xmm2, [esi + eax*4] - movss xmm3, [esi + ebx*4] - unpcklps xmm2, xmm3 ;# isa2 in xmm3(0,1) - mulps xmm2, [esp + nb400nf_isai] - movaps [esp + nb400nf_isaprod], xmm2 - movaps xmm1, xmm2 - mulps xmm1, [esp + nb400nf_gbtsc] - movaps [esp + nb400nf_gbscale], xmm1 - - mov esi, [ebp + nb400nf_charge] ;# base of charge[] - movss xmm3, [esi + eax*4] - movss xmm6, [esi + ebx*4] - unpcklps xmm3, xmm6 ;# constant 00001000 ;# xmm3(0,1) has the charges - - mulps xmm2, [esp + nb400nf_iq] - mulps xmm3, xmm2 - movaps [esp + nb400nf_qq], xmm3 - - mov edi, [ebp + nb400nf_pos] - - lea eax, [eax + eax*2] - lea ebx, [ebx + ebx*2] - ;# move coordinates to xmm0-xmm2 - movlps xmm1, [edi + eax*4] - movss xmm2, [edi + eax*4 + 8] - movhps xmm1, [edi + ebx*4] - movss xmm0, [edi + ebx*4 + 8] - - movlhps xmm3, xmm7 - - shufps xmm2, xmm0, 0 - - movaps xmm0, xmm1 - - shufps xmm2, xmm2, 136 ;# constant 10001000 - - shufps xmm0, xmm0, 136 ;# constant 10001000 - shufps xmm1, xmm1, 221 ;# constant 11011101 - - mov edi, [ebp + nb400nf_faction] - ;# move ix-iz to xmm4-xmm6 - xorps xmm7, xmm7 - - movaps xmm4, [esp + nb400nf_ix] - movaps xmm5, [esp + nb400nf_iy] - movaps xmm6, [esp + nb400nf_iz] - - ;# calc dr - subps xmm4, xmm0 - subps xmm5, xmm1 - subps xmm6, xmm2 - - ;# square it - mulps xmm4,xmm4 - mulps xmm5,xmm5 - mulps xmm6,xmm6 - addps xmm4, xmm5 - addps xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [esp + nb400nf_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [esp + nb400nf_half] - subps xmm1, xmm5 ;# constant 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - mulps xmm4, [esp + nb400nf_gbscale] - - cvttps2pi mm6, xmm4 ;# mm6 contain lu indices - cvtpi2ps xmm6, mm6 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - - pslld mm6, 2 - - mov esi, [ebp + nb400nf_GBtab] - movd ecx, mm6 - psrlq mm6, 32 - movd edx, mm6 - - ;# load coulomb table - movaps xmm4, [esi + ecx*4] - movaps xmm7, [esi + edx*4] - ;# transpose, using xmm3 for scratch - movaps xmm6, xmm4 - unpcklps xmm4, xmm7 ;# Y1 Y2 F1 F2 - unpckhps xmm6, xmm7 ;# G1 G2 H1 H2 - movhlps xmm5, xmm4 ;# F1 F2 - movhlps xmm7, xmm6 ;# H1 H2 - ;# coulomb table ready, in xmm4-xmm7 - - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - movaps xmm3, [esp + nb400nf_qq] - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, xmm3 ;# vcoul=qq*VV - addps xmm5, [esp + nb400nf_vctot] - movaps [esp + nb400nf_vctot], xmm5 - -.nb400nf_checksingle: - mov edx, [esp + nb400nf_innerk] - and edx, 1 - jnz .nb400nf_dosingle - jmp .nb400nf_updateouterdata -.nb400nf_dosingle: - mov esi, [ebp + nb400nf_charge] - mov edx, [ebp + nb400nf_invsqrta] - mov edi, [ebp + nb400nf_pos] - mov ecx, [esp + nb400nf_innerjjnr] - mov eax, [ecx] - xorps xmm2, xmm2 - movaps xmm6, xmm2 - movss xmm2, [edx + eax*4] ;# isa2 - mulss xmm2, [esp + nb400nf_isai] - movss [esp + nb400nf_isaprod], xmm2 - movss xmm1, xmm2 - mulss xmm1, [esp + nb400nf_gbtsc] - movss [esp + nb400nf_gbscale], xmm1 - - mulss xmm2, [esp + nb400nf_iq] - movss xmm6, [esi + eax*4] ;# xmm6(0) has the charge - mulss xmm6, xmm2 - movss [esp + nb400nf_qq], xmm6 - - lea eax, [eax + eax*2] - - ;# move coordinates to xmm0-xmm2 - movss xmm0, [edi + eax*4] - movss xmm1, [edi + eax*4 + 4] - movss xmm2, [edi + eax*4 + 8] - - movss xmm4, [esp + nb400nf_ix] - movss xmm5, [esp + nb400nf_iy] - movss xmm6, [esp + nb400nf_iz] - - ;# calc dr - subss xmm4, xmm0 - subss xmm5, xmm1 - subss xmm6, xmm2 - - ;# square it - mulss xmm4,xmm4 - mulss xmm5,xmm5 - mulss xmm6,xmm6 - addss xmm4, xmm5 - addss xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtss xmm5, xmm4 - ;# lookup seed in xmm5 - movss xmm2, xmm5 - mulss xmm5, xmm5 - movss xmm1, [esp + nb400nf_three] - mulss xmm5, xmm4 ;# rsq*lu*lu - movss xmm0, [esp + nb400nf_half] - subss xmm1, xmm5 ;# constant 30-rsq*lu*lu - mulss xmm1, xmm2 - mulss xmm0, xmm1 ;# xmm0=rinv - - mulss xmm4, xmm0 ;# xmm4=r - mulss xmm4, [esp + nb400nf_gbscale] - - cvttss2si ebx, xmm4 ;# mm6 contain lu indices - cvtsi2ss xmm6, ebx - subss xmm4, xmm6 - movss xmm1, xmm4 ;# xmm1=eps - movss xmm2, xmm1 - mulss xmm2, xmm2 ;# xmm2=eps2 - - shl ebx, 2 - - mov esi, [ebp + nb400nf_GBtab] - - movaps xmm4, [esi + ebx*4] - movhlps xmm6, xmm4 - movaps xmm5, xmm4 - movaps xmm7, xmm6 - shufps xmm5, xmm5, 1 - shufps xmm7, xmm7, 1 - ;# table ready in xmm4-xmm7 - - mulss xmm6, xmm1 ;# xmm6=Geps - mulss xmm7, xmm2 ;# xmm7=Heps2 - addss xmm5, xmm6 - addss xmm5, xmm7 ;# xmm5=Fp - movss xmm3, [esp + nb400nf_qq] - mulss xmm5, xmm1 ;# xmm5=eps*Fp - addss xmm5, xmm4 ;# xmm5=VV - mulss xmm5, xmm3 ;# vcoul=qq*VV - addss xmm5, [esp + nb400nf_vctot] - movss [esp + nb400nf_vctot], xmm5 -.nb400nf_updateouterdata: - ;# get n from stack - mov esi, [esp + nb400nf_n] - ;# get group index for i particle - mov edx, [ebp + nb400nf_gid] ;# base of gid[] - mov edx, [edx + esi*4] ;# ggid=gid[n] - - ;# accumulate total potential energy and update it - movaps xmm7, [esp + nb400nf_vctot] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - ;# add earlier value from mem - mov eax, [ebp + nb400nf_Vc] - addss xmm7, [eax + edx*4] - ;# move back to mem - movss [eax + edx*4], xmm7 - - ;# finish if last - mov ecx, [esp + nb400nf_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb400nf_outerend - - ;# not last, iterate outer loop once more! - mov [esp + nb400nf_n], esi - jmp .nb400nf_outer -.nb400nf_outerend: - ;# check if more outer neighborlists remain - mov ecx, [esp + nb400nf_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb400nf_end - ;# non-zero, do one more workunit - jmp .nb400nf_threadloop -.nb400nf_end: - emms - - mov eax, [esp + nb400nf_nouter] - mov ebx, [esp + nb400nf_ninner] - mov ecx, [ebp + nb400nf_outeriter] - mov edx, [ebp + nb400nf_inneriter] - mov [ecx], eax - mov [edx], ebx - - mov eax, [esp + nb400nf_salign] - add esp, eax - add esp, 236 - pop edi - pop esi - pop edx - pop ecx - pop ebx - pop eax - leave - ret diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel400_ia32_sse.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel400_ia32_sse.s deleted file mode 100644 index 5db96cb9c9..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel400_ia32_sse.s +++ /dev/null @@ -1,1701 +0,0 @@ -## -## -## Gromacs 4.0 Copyright (c) 1991-2003 -## David van der Spoel, Erik Lindahl -## -## This program is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License -## as published by the Free Software Foundation; either version 2 -## of the License, or (at your option) any later version. -## -## To help us fund GROMACS development, we humbly ask that you cite -## the research papers on the package. Check out http://www.gromacs.org -## -## And Hey: -## Gnomes, ROck Monsters And Chili Sauce -## - - - -.globl nb_kernel400_ia32_sse -.globl _nb_kernel400_ia32_sse -nb_kernel400_ia32_sse: -_nb_kernel400_ia32_sse: -.set nb400_p_nri, 8 -.set nb400_iinr, 12 -.set nb400_jindex, 16 -.set nb400_jjnr, 20 -.set nb400_shift, 24 -.set nb400_shiftvec, 28 -.set nb400_fshift, 32 -.set nb400_gid, 36 -.set nb400_pos, 40 -.set nb400_faction, 44 -.set nb400_charge, 48 -.set nb400_p_facel, 52 -.set nb400_argkrf, 56 -.set nb400_argcrf, 60 -.set nb400_Vc, 64 -.set nb400_type, 68 -.set nb400_p_ntype, 72 -.set nb400_vdwparam, 76 -.set nb400_Vvdw, 80 -.set nb400_p_tabscale, 84 -.set nb400_VFtab, 88 -.set nb400_invsqrta, 92 -.set nb400_dvda, 96 -.set nb400_p_gbtabscale, 100 -.set nb400_GBtab, 104 -.set nb400_p_nthreads, 108 -.set nb400_count, 112 -.set nb400_mtx, 116 -.set nb400_outeriter, 120 -.set nb400_inneriter, 124 -.set nb400_work, 128 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse use -.set nb400_ix, 0 -.set nb400_iy, 16 -.set nb400_iz, 32 -.set nb400_iq, 48 -.set nb400_dx, 64 -.set nb400_dy, 80 -.set nb400_dz, 96 -.set nb400_two, 112 -.set nb400_gbtsc, 128 -.set nb400_qq, 144 -.set nb400_r, 160 -.set nb400_vctot, 176 -.set nb400_fix, 192 -.set nb400_fiy, 208 -.set nb400_fiz, 224 -.set nb400_half, 240 -.set nb400_three, 256 -.set nb400_isai, 272 -.set nb400_isaprod, 288 -.set nb400_dvdasum, 304 -.set nb400_gbscale, 320 -.set nb400_is3, 336 -.set nb400_ii3, 340 -.set nb400_ii, 344 -.set nb400_innerjjnr, 348 -.set nb400_innerk, 352 -.set nb400_n, 356 -.set nb400_nn1, 360 -.set nb400_jnra, 364 -.set nb400_jnrb, 368 -.set nb400_jnrc, 372 -.set nb400_jnrd, 376 -.set nb400_nri, 380 -.set nb400_facel, 384 -.set nb400_nouter, 388 -.set nb400_ninner, 392 -.set nb400_salign, 396 - pushl %ebp - movl %esp,%ebp - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - pushl %esi - pushl %edi - subl $400,%esp ## local stack space - movl %esp,%eax - andl $0xf,%eax - subl %eax,%esp - movl %eax,nb400_salign(%esp) - - emms - - ## Move args passed by reference to stack - movl nb400_p_nri(%ebp),%ecx - movl nb400_p_facel(%ebp),%esi - movl (%ecx),%ecx - movl (%esi),%esi - movl %ecx,nb400_nri(%esp) - movl %esi,nb400_facel(%esp) - - ## zero iteration counters - movl $0,%eax - movl %eax,nb400_nouter(%esp) - movl %eax,nb400_ninner(%esp) - - - movl nb400_p_gbtabscale(%ebp),%eax - movss (%eax),%xmm3 - shufps $0,%xmm3,%xmm3 - movaps %xmm3,nb400_gbtsc(%esp) - - ## create constant floating-point factors on stack - movl $0x3f000000,%eax ## constant 0.5 in IEEE (hex) - movl %eax,nb400_half(%esp) - movss nb400_half(%esp),%xmm1 - shufps $0,%xmm1,%xmm1 ## splat to all elements - movaps %xmm1,%xmm2 - addps %xmm2,%xmm2 ## constant 1.0 - movaps %xmm2,%xmm3 - addps %xmm2,%xmm2 ## constant 2.0 - addps %xmm2,%xmm3 ## constant 3.0 - movaps %xmm1,nb400_half(%esp) - movaps %xmm2,nb400_two(%esp) - movaps %xmm3,nb400_three(%esp) - -_nb_kernel400_ia32_sse.nb400_threadloop: - movl nb400_count(%ebp),%esi ## pointer to sync counter - movl (%esi),%eax -_nb_kernel400_ia32_sse.nb400_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%esi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel400_ia32_sse.nb400_spinlock - - ## if(nn1>nri) nn1=nri - movl nb400_nri(%esp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb400_n(%esp) - movl %ebx,nb400_nn1(%esp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel400_ia32_sse.nb400_outerstart - jmp _nb_kernel400_ia32_sse.nb400_end - -_nb_kernel400_ia32_sse.nb400_outerstart: - ## ebx contains number of outer iterations - addl nb400_nouter(%esp),%ebx - movl %ebx,nb400_nouter(%esp) - -_nb_kernel400_ia32_sse.nb400_outer: - movl nb400_shift(%ebp),%eax ## eax = pointer into shift[] - movl (%eax,%esi,4),%ebx ## ebx=shift[n] - - leal (%ebx,%ebx,2),%ebx ## ebx=3*is - movl %ebx,nb400_is3(%esp) ## store is3 - - movl nb400_shiftvec(%ebp),%eax ## eax = base of shiftvec[] - - movss (%eax,%ebx,4),%xmm0 - movss 4(%eax,%ebx,4),%xmm1 - movss 8(%eax,%ebx,4),%xmm2 - - movl nb400_iinr(%ebp),%ecx ## ecx = pointer into iinr[] - movl (%ecx,%esi,4),%ebx ## ebx =ii - movl %ebx,nb400_ii(%esp) - - movl nb400_charge(%ebp),%edx - movss (%edx,%ebx,4),%xmm3 - mulss nb400_facel(%esp),%xmm3 - shufps $0,%xmm3,%xmm3 - - - movl nb400_invsqrta(%ebp),%edx ## load invsqrta[ii] - movss (%edx,%ebx,4),%xmm4 - shufps $0,%xmm4,%xmm4 - - leal (%ebx,%ebx,2),%ebx ## ebx = 3*ii=ii3 - movl nb400_pos(%ebp),%eax ## eax = base of pos[] - - addss (%eax,%ebx,4),%xmm0 - addss 4(%eax,%ebx,4),%xmm1 - addss 8(%eax,%ebx,4),%xmm2 - - movaps %xmm3,nb400_iq(%esp) - movaps %xmm4,nb400_isai(%esp) - - shufps $0,%xmm0,%xmm0 - shufps $0,%xmm1,%xmm1 - shufps $0,%xmm2,%xmm2 - - movaps %xmm0,nb400_ix(%esp) - movaps %xmm1,nb400_iy(%esp) - movaps %xmm2,nb400_iz(%esp) - - movl %ebx,nb400_ii3(%esp) - - ## clear vctot and i forces - xorps %xmm4,%xmm4 - movaps %xmm4,nb400_vctot(%esp) - movaps %xmm4,nb400_dvdasum(%esp) - movaps %xmm4,nb400_fix(%esp) - movaps %xmm4,nb400_fiy(%esp) - movaps %xmm4,nb400_fiz(%esp) - - movl nb400_jindex(%ebp),%eax - movl (%eax,%esi,4),%ecx ## jindex[n] - movl 4(%eax,%esi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movl nb400_pos(%ebp),%esi - movl nb400_faction(%ebp),%edi - movl nb400_jjnr(%ebp),%eax - shll $2,%ecx - addl %ecx,%eax - movl %eax,nb400_innerjjnr(%esp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $4,%edx - addl nb400_ninner(%esp),%ecx - movl %ecx,nb400_ninner(%esp) - addl $0,%edx - movl %edx,nb400_innerk(%esp) ## number of innerloop atoms - jge _nb_kernel400_ia32_sse.nb400_unroll_loop - jmp _nb_kernel400_ia32_sse.nb400_finish_inner -_nb_kernel400_ia32_sse.nb400_unroll_loop: - ## quad-unroll innerloop here - movl nb400_innerjjnr(%esp),%edx ## pointer to jjnr[k] - movl (%edx),%eax - movl 4(%edx),%ebx - movl 8(%edx),%ecx - movl 12(%edx),%edx ## eax-edx=jnr1-4 - addl $16,nb400_innerjjnr(%esp) ## advance pointer (unrolled 4) - - ## load isaj - movl nb400_invsqrta(%ebp),%esi - movss (%esi,%eax,4),%xmm3 - movss (%esi,%ecx,4),%xmm4 - movss (%esi,%ebx,4),%xmm6 - movss (%esi,%edx,4),%xmm7 - movaps nb400_isai(%esp),%xmm2 - shufps $0,%xmm6,%xmm3 - shufps $0,%xmm7,%xmm4 - shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all isaj in xmm3 - mulps %xmm3,%xmm2 - - movaps %xmm2,nb400_isaprod(%esp) - movaps %xmm2,%xmm1 - mulps nb400_gbtsc(%esp),%xmm1 - movaps %xmm1,nb400_gbscale(%esp) - - movl nb400_charge(%ebp),%esi ## base of charge[] - - movss (%esi,%eax,4),%xmm3 - movss (%esi,%ecx,4),%xmm4 - movss (%esi,%ebx,4),%xmm6 - movss (%esi,%edx,4),%xmm7 - - mulps nb400_iq(%esp),%xmm2 - shufps $0,%xmm6,%xmm3 - shufps $0,%xmm7,%xmm4 - shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all charges in xmm3 - mulps %xmm2,%xmm3 - movaps %xmm3,nb400_qq(%esp) - - - movl nb400_pos(%ebp),%esi ## base of pos[] - - movl %eax,nb400_jnra(%esp) - movl %ebx,nb400_jnrb(%esp) - movl %ecx,nb400_jnrc(%esp) - movl %edx,nb400_jnrd(%esp) - - leal (%eax,%eax,2),%eax ## replace jnr with j3 - leal (%ebx,%ebx,2),%ebx - leal (%ecx,%ecx,2),%ecx - leal (%edx,%edx,2),%edx - - ## move four coordinates to xmm0-xmm2 - - movlps (%esi,%eax,4),%xmm4 - movlps (%esi,%ecx,4),%xmm5 - movss 8(%esi,%eax,4),%xmm2 - movss 8(%esi,%ecx,4),%xmm6 - - movhps (%esi,%ebx,4),%xmm4 - movhps (%esi,%edx,4),%xmm5 - - movss 8(%esi,%ebx,4),%xmm0 - movss 8(%esi,%edx,4),%xmm1 - - shufps $0,%xmm0,%xmm2 - shufps $0,%xmm1,%xmm6 - - movaps %xmm4,%xmm0 - movaps %xmm4,%xmm1 - - shufps $136,%xmm6,%xmm2 ## constant 10001000 - - shufps $136,%xmm5,%xmm0 ## constant 10001000 - shufps $221,%xmm5,%xmm1 ## constant 11011101 - - ## move ix-iz to xmm4-xmm6 - movaps nb400_ix(%esp),%xmm4 - movaps nb400_iy(%esp),%xmm5 - movaps nb400_iz(%esp),%xmm6 - - ## calc dr - subps %xmm0,%xmm4 - subps %xmm1,%xmm5 - subps %xmm2,%xmm6 - - ## store dr - movaps %xmm4,nb400_dx(%esp) - movaps %xmm5,nb400_dy(%esp) - movaps %xmm6,nb400_dz(%esp) - ## square it - mulps %xmm4,%xmm4 - mulps %xmm5,%xmm5 - mulps %xmm6,%xmm6 - addps %xmm5,%xmm4 - addps %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb400_three(%esp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb400_half(%esp),%xmm0 - subps %xmm5,%xmm1 ## constant 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - movaps %xmm4,nb400_r(%esp) - mulps nb400_gbscale(%esp),%xmm4 - - movhlps %xmm4,%xmm5 - cvttps2pi %xmm4,%mm6 - cvttps2pi %xmm5,%mm7 ## mm6/mm7 contain lu indices - cvtpi2ps %mm6,%xmm6 - cvtpi2ps %mm7,%xmm5 - movlhps %xmm5,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - pslld $2,%mm6 - pslld $2,%mm7 - - movd %eax,%mm0 - movd %ebx,%mm1 - movd %ecx,%mm2 - movd %edx,%mm3 - - movl nb400_GBtab(%ebp),%esi - movd %mm6,%eax - psrlq $32,%mm6 - movd %mm7,%ecx - psrlq $32,%mm7 - movd %mm6,%ebx - movd %mm7,%edx - - ## load coulomb table - movaps (%esi,%eax,4),%xmm4 - movaps (%esi,%ebx,4),%xmm5 - movaps (%esi,%ecx,4),%xmm6 - movaps (%esi,%edx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm6,%xmm3 - shufps $0xEE,%xmm7,%xmm3 - shufps $0x44,%xmm7,%xmm6 - movaps %xmm4,%xmm7 - shufps $0xEE,%xmm5,%xmm7 - shufps $0x44,%xmm5,%xmm4 - movaps %xmm4,%xmm5 - shufps $0xDD,%xmm6,%xmm5 - shufps $0x88,%xmm6,%xmm4 - movaps %xmm7,%xmm6 - shufps $0x88,%xmm3,%xmm6 - shufps $0xDD,%xmm3,%xmm7 - ## coulomb table ready, in xmm4-xmm7 - - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - mulps nb400_two(%esp),%xmm7 ## two*Heps2 - movaps nb400_qq(%esp),%xmm3 - addps %xmm6,%xmm7 - addps %xmm5,%xmm7 ## xmm7=FF - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps %xmm3,%xmm5 ## vcoul=qq*VV - mulps %xmm7,%xmm3 ## fijC=FF*qq - ## at this point mm5 contains vcoul and mm3 fijC - - ## get jnr from stack - movl nb400_jnra(%esp),%eax - movl nb400_jnrb(%esp),%ebx - movl nb400_jnrc(%esp),%ecx - movl nb400_jnrd(%esp),%edx - - movl nb400_dvda(%ebp),%esi - - ## Calculate dVda - xorps %xmm7,%xmm7 - mulps nb400_gbscale(%esp),%xmm3 - movaps %xmm3,%xmm6 - mulps nb400_r(%esp),%xmm6 - addps %xmm5,%xmm6 - addps nb400_vctot(%esp),%xmm5 - movaps %xmm5,nb400_vctot(%esp) - - ## xmm6=(vcoul+fijC*r) - subps %xmm6,%xmm7 - movaps %xmm7,%xmm6 - - ## update dvdasum - addps nb400_dvdasum(%esp),%xmm7 - movaps %xmm7,nb400_dvdasum(%esp) - - ## update j atoms dvdaj - movhlps %xmm6,%xmm7 - movaps %xmm6,%xmm5 - movaps %xmm7,%xmm4 - shufps $0x1,%xmm5,%xmm5 - shufps $0x1,%xmm4,%xmm4 - ## xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4 - addss (%esi,%eax,4),%xmm6 - addss (%esi,%ebx,4),%xmm5 - addss (%esi,%ecx,4),%xmm7 - addss (%esi,%edx,4),%xmm4 - movss %xmm6,(%esi,%eax,4) - movss %xmm5,(%esi,%ebx,4) - movss %xmm7,(%esi,%ecx,4) - movss %xmm4,(%esi,%edx,4) - - xorps %xmm4,%xmm4 - mulps %xmm0,%xmm3 - subps %xmm3,%xmm4 - - movaps nb400_dx(%esp),%xmm0 - movaps nb400_dy(%esp),%xmm1 - movaps nb400_dz(%esp),%xmm2 - - movd %mm0,%eax - movd %mm1,%ebx - movd %mm2,%ecx - movd %mm3,%edx - - movl nb400_faction(%ebp),%edi - mulps %xmm4,%xmm0 - mulps %xmm4,%xmm1 - mulps %xmm4,%xmm2 - ## xmm0-xmm2 contains tx-tz (partial force) - ## now update f_i - movaps nb400_fix(%esp),%xmm3 - movaps nb400_fiy(%esp),%xmm4 - movaps nb400_fiz(%esp),%xmm5 - addps %xmm0,%xmm3 - addps %xmm1,%xmm4 - addps %xmm2,%xmm5 - movaps %xmm3,nb400_fix(%esp) - movaps %xmm4,nb400_fiy(%esp) - movaps %xmm5,nb400_fiz(%esp) - ## the fj's - start by accumulating x & y forces from memory - movlps (%edi,%eax,4),%xmm4 - movlps (%edi,%ecx,4),%xmm6 - movhps (%edi,%ebx,4),%xmm4 - movhps (%edi,%edx,4),%xmm6 - - movaps %xmm4,%xmm3 - shufps $136,%xmm6,%xmm3 ## constant 10001000 - shufps $221,%xmm6,%xmm4 ## constant 11011101 - - ## now xmm3-xmm5 contains fjx, fjy, fjz - subps %xmm0,%xmm3 - subps %xmm1,%xmm4 - - ## unpack them back so we can store them - first x & y in xmm3/xmm4 - - movaps %xmm3,%xmm6 - unpcklps %xmm4,%xmm6 - unpckhps %xmm4,%xmm3 - ## xmm6(l)=x & y for j1, (h) for j2 - ## xmm3(l)=x & y for j3, (h) for j4 - movlps %xmm6,(%edi,%eax,4) - movlps %xmm3,(%edi,%ecx,4) - - movhps %xmm6,(%edi,%ebx,4) - movhps %xmm3,(%edi,%edx,4) - - ## and the z forces - movss 8(%edi,%eax,4),%xmm4 - movss 8(%edi,%ebx,4),%xmm5 - movss 8(%edi,%ecx,4),%xmm6 - movss 8(%edi,%edx,4),%xmm7 - subss %xmm2,%xmm4 - shufps $229,%xmm2,%xmm2 ## constant 11100101 - subss %xmm2,%xmm5 - shufps $234,%xmm2,%xmm2 ## constant 11101010 - subss %xmm2,%xmm6 - shufps $255,%xmm2,%xmm2 ## constant 11111111 - subss %xmm2,%xmm7 - movss %xmm4,8(%edi,%eax,4) - movss %xmm5,8(%edi,%ebx,4) - movss %xmm6,8(%edi,%ecx,4) - movss %xmm7,8(%edi,%edx,4) - - ## should we do one more iteration? - subl $4,nb400_innerk(%esp) - jl _nb_kernel400_ia32_sse.nb400_finish_inner - jmp _nb_kernel400_ia32_sse.nb400_unroll_loop -_nb_kernel400_ia32_sse.nb400_finish_inner: - ## check if at least two particles remain - addl $4,nb400_innerk(%esp) - movl nb400_innerk(%esp),%edx - andl $2,%edx - jnz _nb_kernel400_ia32_sse.nb400_dopair - jmp _nb_kernel400_ia32_sse.nb400_checksingle -_nb_kernel400_ia32_sse.nb400_dopair: - movl nb400_innerjjnr(%esp),%ecx - - movl (%ecx),%eax - movl 4(%ecx),%ebx - addl $8,nb400_innerjjnr(%esp) - - xorps %xmm2,%xmm2 - movaps %xmm2,%xmm6 - - ## load isaj - movl nb400_invsqrta(%ebp),%esi - movss (%esi,%eax,4),%xmm2 - movss (%esi,%ebx,4),%xmm3 - unpcklps %xmm3,%xmm2 ## isaj in xmm2(0,1) - mulps nb400_isai(%esp),%xmm2 - movaps %xmm2,nb400_isaprod(%esp) - movaps %xmm2,%xmm1 - mulps nb400_gbtsc(%esp),%xmm1 - movaps %xmm1,nb400_gbscale(%esp) - - movl nb400_charge(%ebp),%esi ## base of charge[] - movss (%esi,%eax,4),%xmm3 - movss (%esi,%ebx,4),%xmm6 - unpcklps %xmm6,%xmm3 ## constant 00001000 ;# xmm3(0,1) has the charges - - mulps nb400_iq(%esp),%xmm2 - mulps %xmm2,%xmm3 - movaps %xmm3,nb400_qq(%esp) - - movl nb400_pos(%ebp),%edi - - movd %eax,%mm0 ## copy jnr to mm0/mm1 - movd %ebx,%mm1 - - leal (%eax,%eax,2),%eax - leal (%ebx,%ebx,2),%ebx - ## move coordinates to xmm0-xmm2 - movlps (%edi,%eax,4),%xmm1 - movss 8(%edi,%eax,4),%xmm2 - movhps (%edi,%ebx,4),%xmm1 - movss 8(%edi,%ebx,4),%xmm0 - - movlhps %xmm7,%xmm3 - - shufps $0,%xmm0,%xmm2 - - movaps %xmm1,%xmm0 - - shufps $136,%xmm2,%xmm2 ## constant 10001000 - - shufps $136,%xmm0,%xmm0 ## constant 10001000 - shufps $221,%xmm1,%xmm1 ## constant 11011101 - - movl nb400_faction(%ebp),%edi - ## move ix-iz to xmm4-xmm6 - xorps %xmm7,%xmm7 - - movaps nb400_ix(%esp),%xmm4 - movaps nb400_iy(%esp),%xmm5 - movaps nb400_iz(%esp),%xmm6 - - ## calc dr - subps %xmm0,%xmm4 - subps %xmm1,%xmm5 - subps %xmm2,%xmm6 - - ## store dr - movaps %xmm4,nb400_dx(%esp) - movaps %xmm5,nb400_dy(%esp) - movaps %xmm6,nb400_dz(%esp) - ## square it - mulps %xmm4,%xmm4 - mulps %xmm5,%xmm5 - mulps %xmm6,%xmm6 - addps %xmm5,%xmm4 - addps %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb400_three(%esp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb400_half(%esp),%xmm0 - subps %xmm5,%xmm1 ## constant 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - movaps %xmm4,nb400_r(%esp) - mulps nb400_gbscale(%esp),%xmm4 - - cvttps2pi %xmm4,%mm6 ## mm6 contain lu indices - cvtpi2ps %mm6,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - - pslld $2,%mm6 - - movl nb400_GBtab(%ebp),%esi - movd %mm6,%ecx - psrlq $32,%mm6 - movd %mm6,%edx - - ## load coulomb table - movaps (%esi,%ecx,4),%xmm4 - movaps (%esi,%edx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm4,%xmm6 - unpcklps %xmm7,%xmm4 ## Y1 Y2 F1 F2 - unpckhps %xmm7,%xmm6 ## G1 G2 H1 H2 - movhlps %xmm4,%xmm5 ## F1 F2 - movhlps %xmm6,%xmm7 ## H1 H2 - ## coulomb table ready, in xmm4-xmm7 - - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - mulps nb400_two(%esp),%xmm7 ## two*Heps2 - movaps nb400_qq(%esp),%xmm3 - addps %xmm6,%xmm7 - addps %xmm5,%xmm7 ## xmm7=FF - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps %xmm3,%xmm5 ## vcoul=qq*VV - mulps %xmm7,%xmm3 ## fijC=FF*qq - ## at this point mm5 contains vcoul and mm3 fijC - - ## get jnr from mm0/mm1 - movd %mm0,%ecx - movd %mm1,%edx - - movl nb400_dvda(%ebp),%esi - - ## Calculate dVda - xorps %xmm7,%xmm7 - mulps nb400_gbscale(%esp),%xmm3 - movaps %xmm3,%xmm6 - mulps nb400_r(%esp),%xmm6 - addps %xmm5,%xmm6 - addps nb400_vctot(%esp),%xmm5 - movaps %xmm5,nb400_vctot(%esp) - - ## xmm6=(vcoul+fijC*r) - subps %xmm6,%xmm7 - movaps %xmm7,%xmm6 - - ## update dvdasum - addps nb400_dvdasum(%esp),%xmm7 - movaps %xmm7,nb400_dvdasum(%esp) - - ## update j atoms dvdaj - movaps %xmm6,%xmm7 - shufps $0x1,%xmm7,%xmm7 - addss (%esi,%ecx,4),%xmm6 - addss (%esi,%edx,4),%xmm7 - movss %xmm6,(%esi,%ecx,4) - movss %xmm7,(%esi,%edx,4) - - xorps %xmm4,%xmm4 - mulps %xmm0,%xmm3 - subps %xmm3,%xmm4 - - movaps nb400_dx(%esp),%xmm0 - movaps nb400_dy(%esp),%xmm1 - movaps nb400_dz(%esp),%xmm2 - - mulps %xmm4,%xmm0 - mulps %xmm4,%xmm1 - mulps %xmm4,%xmm2 - ## xmm0-xmm2 contains tx-tz (partial force) - ## now update f_i - movaps nb400_fix(%esp),%xmm3 - movaps nb400_fiy(%esp),%xmm4 - movaps nb400_fiz(%esp),%xmm5 - addps %xmm0,%xmm3 - addps %xmm1,%xmm4 - addps %xmm2,%xmm5 - movaps %xmm3,nb400_fix(%esp) - movaps %xmm4,nb400_fiy(%esp) - movaps %xmm5,nb400_fiz(%esp) - ## update the fj's - movss (%edi,%eax,4),%xmm3 - movss 4(%edi,%eax,4),%xmm4 - movss 8(%edi,%eax,4),%xmm5 - subss %xmm0,%xmm3 - subss %xmm1,%xmm4 - subss %xmm2,%xmm5 - movss %xmm3,(%edi,%eax,4) - movss %xmm4,4(%edi,%eax,4) - movss %xmm5,8(%edi,%eax,4) - - shufps $225,%xmm0,%xmm0 ## constant 11100001 - shufps $225,%xmm1,%xmm1 ## constant 11100001 - shufps $225,%xmm2,%xmm2 ## constant 11100001 - - movss (%edi,%ebx,4),%xmm3 - movss 4(%edi,%ebx,4),%xmm4 - movss 8(%edi,%ebx,4),%xmm5 - subss %xmm0,%xmm3 - subss %xmm1,%xmm4 - subss %xmm2,%xmm5 - movss %xmm3,(%edi,%ebx,4) - movss %xmm4,4(%edi,%ebx,4) - movss %xmm5,8(%edi,%ebx,4) - -_nb_kernel400_ia32_sse.nb400_checksingle: - movl nb400_innerk(%esp),%edx - andl $1,%edx - jnz _nb_kernel400_ia32_sse.nb400_dosingle - jmp _nb_kernel400_ia32_sse.nb400_updateouterdata -_nb_kernel400_ia32_sse.nb400_dosingle: - movl nb400_charge(%ebp),%esi - movl nb400_invsqrta(%ebp),%edx - movl nb400_pos(%ebp),%edi - movl nb400_innerjjnr(%esp),%ecx - movl (%ecx),%eax - xorps %xmm2,%xmm2 - movaps %xmm2,%xmm6 - movss (%edx,%eax,4),%xmm2 ## isaj - mulss nb400_isai(%esp),%xmm2 - movss %xmm2,nb400_isaprod(%esp) - movss %xmm2,%xmm1 - mulss nb400_gbtsc(%esp),%xmm1 - movss %xmm1,nb400_gbscale(%esp) - - mulss nb400_iq(%esp),%xmm2 - movss (%esi,%eax,4),%xmm6 ## xmm6(0) has the charge - mulss %xmm2,%xmm6 - movss %xmm6,nb400_qq(%esp) - - movd %eax,%mm0 - leal (%eax,%eax,2),%eax - - ## move coordinates to xmm0-xmm2 - movss (%edi,%eax,4),%xmm0 - movss 4(%edi,%eax,4),%xmm1 - movss 8(%edi,%eax,4),%xmm2 - - movss nb400_ix(%esp),%xmm4 - movss nb400_iy(%esp),%xmm5 - movss nb400_iz(%esp),%xmm6 - - ## calc dr - subss %xmm0,%xmm4 - subss %xmm1,%xmm5 - subss %xmm2,%xmm6 - - ## store dr - movss %xmm4,nb400_dx(%esp) - movss %xmm5,nb400_dy(%esp) - movss %xmm6,nb400_dz(%esp) - ## square it - mulss %xmm4,%xmm4 - mulss %xmm5,%xmm5 - mulss %xmm6,%xmm6 - addss %xmm5,%xmm4 - addss %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtss %xmm4,%xmm5 - ## lookup seed in xmm5 - movss %xmm5,%xmm2 - mulss %xmm5,%xmm5 - movss nb400_three(%esp),%xmm1 - mulss %xmm4,%xmm5 ## rsq*lu*lu - movss nb400_half(%esp),%xmm0 - subss %xmm5,%xmm1 ## constant 30-rsq*lu*lu - mulss %xmm2,%xmm1 - mulss %xmm1,%xmm0 ## xmm0=rinv - - mulss %xmm0,%xmm4 ## xmm4=r - movss %xmm4,nb400_r(%esp) - mulss nb400_gbscale(%esp),%xmm4 - - cvttss2si %xmm4,%ebx ## mm6 contain lu indices - cvtsi2ss %ebx,%xmm6 - subss %xmm6,%xmm4 - movss %xmm4,%xmm1 ## xmm1=eps - movss %xmm1,%xmm2 - mulss %xmm2,%xmm2 ## xmm2=eps2 - - shll $2,%ebx - - movl nb400_GBtab(%ebp),%esi - - movaps (%esi,%ebx,4),%xmm4 - movhlps %xmm4,%xmm6 - movaps %xmm4,%xmm5 - movaps %xmm6,%xmm7 - shufps $1,%xmm5,%xmm5 - shufps $1,%xmm7,%xmm7 - ## table ready in xmm4-xmm7 - - mulss %xmm1,%xmm6 ## xmm6=Geps - mulss %xmm2,%xmm7 ## xmm7=Heps2 - addss %xmm6,%xmm5 - addss %xmm7,%xmm5 ## xmm5=Fp - mulss nb400_two(%esp),%xmm7 ## two*Heps2 - movss nb400_qq(%esp),%xmm3 - addss %xmm6,%xmm7 - addss %xmm5,%xmm7 ## xmm7=FF - mulss %xmm1,%xmm5 ## xmm5=eps*Fp - addss %xmm4,%xmm5 ## xmm5=VV - mulss %xmm3,%xmm5 ## vcoul=qq*VV - mulss %xmm7,%xmm3 ## fijC=FF*qq - ## at this point mm5 contains vcoul and mm3 fijC - - movd %mm0,%ebx - movl nb400_dvda(%ebp),%esi - - ## Calculate dVda - xorps %xmm7,%xmm7 - mulss nb400_gbscale(%esp),%xmm3 - movaps %xmm3,%xmm6 - mulss nb400_r(%esp),%xmm6 - addss %xmm5,%xmm6 - addss nb400_vctot(%esp),%xmm5 - movss %xmm5,nb400_vctot(%esp) - - ## xmm6=(vcoul+fijC*r) - subps %xmm6,%xmm7 - movaps %xmm7,%xmm6 - - ## update dvdasum - addps nb400_dvdasum(%esp),%xmm7 - movaps %xmm7,nb400_dvdasum(%esp) - - ## update j atoms dvdaj - addss (%esi,%ebx,4),%xmm6 - movss %xmm6,(%esi,%ebx,4) - - xorps %xmm4,%xmm4 - mulss %xmm0,%xmm3 - subss %xmm3,%xmm4 - - movl nb400_faction(%ebp),%edi - - movss nb400_dx(%esp),%xmm0 - movss nb400_dy(%esp),%xmm1 - movss nb400_dz(%esp),%xmm2 - - mulss %xmm4,%xmm0 - mulss %xmm4,%xmm1 - mulss %xmm4,%xmm2 - ## xmm0-xmm2 contains tx-tz (partial force) - ## now update f_i - movss nb400_fix(%esp),%xmm3 - movss nb400_fiy(%esp),%xmm4 - movss nb400_fiz(%esp),%xmm5 - addss %xmm0,%xmm3 - addss %xmm1,%xmm4 - addss %xmm2,%xmm5 - movss %xmm3,nb400_fix(%esp) - movss %xmm4,nb400_fiy(%esp) - movss %xmm5,nb400_fiz(%esp) - ## update fj - - movss (%edi,%eax,4),%xmm3 - movss 4(%edi,%eax,4),%xmm4 - movss 8(%edi,%eax,4),%xmm5 - subss %xmm0,%xmm3 - subss %xmm1,%xmm4 - subss %xmm2,%xmm5 - movss %xmm3,(%edi,%eax,4) - movss %xmm4,4(%edi,%eax,4) - movss %xmm5,8(%edi,%eax,4) -_nb_kernel400_ia32_sse.nb400_updateouterdata: - movl nb400_ii3(%esp),%ecx - movl nb400_faction(%ebp),%edi - movl nb400_fshift(%ebp),%esi - movl nb400_is3(%esp),%edx - - ## accumulate i forces in xmm0, xmm1, xmm2 - movaps nb400_fix(%esp),%xmm0 - movaps nb400_fiy(%esp),%xmm1 - movaps nb400_fiz(%esp),%xmm2 - - movhlps %xmm0,%xmm3 - movhlps %xmm1,%xmm4 - movhlps %xmm2,%xmm5 - addps %xmm3,%xmm0 - addps %xmm4,%xmm1 - addps %xmm5,%xmm2 ## sum is in 1/2 in xmm0-xmm2 - - movaps %xmm0,%xmm3 - movaps %xmm1,%xmm4 - movaps %xmm2,%xmm5 - - shufps $1,%xmm3,%xmm3 - shufps $1,%xmm4,%xmm4 - shufps $1,%xmm5,%xmm5 - addss %xmm3,%xmm0 - addss %xmm4,%xmm1 - addss %xmm5,%xmm2 ## xmm0-xmm2 has single force in pos0 - - ## increment i force - movss (%edi,%ecx,4),%xmm3 - movss 4(%edi,%ecx,4),%xmm4 - movss 8(%edi,%ecx,4),%xmm5 - addss %xmm0,%xmm3 - addss %xmm1,%xmm4 - addss %xmm2,%xmm5 - movss %xmm3,(%edi,%ecx,4) - movss %xmm4,4(%edi,%ecx,4) - movss %xmm5,8(%edi,%ecx,4) - - ## increment fshift force - movss (%esi,%edx,4),%xmm3 - movss 4(%esi,%edx,4),%xmm4 - movss 8(%esi,%edx,4),%xmm5 - addss %xmm0,%xmm3 - addss %xmm1,%xmm4 - addss %xmm2,%xmm5 - movss %xmm3,(%esi,%edx,4) - movss %xmm4,4(%esi,%edx,4) - movss %xmm5,8(%esi,%edx,4) - - ## get n from stack - movl nb400_n(%esp),%esi - ## get group index for i particle - movl nb400_gid(%ebp),%edx ## base of gid[] - movl (%edx,%esi,4),%edx ## ggid=gid[n] - - ## accumulate total potential energy and update it - movaps nb400_vctot(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - ## add earlier value from mem - movl nb400_Vc(%ebp),%eax - addss (%eax,%edx,4),%xmm7 - ## move back to mem - movss %xmm7,(%eax,%edx,4) - - ## accumulate dVda and update it - movaps nb400_dvdasum(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - movl nb400_ii(%esp),%edx - movl nb400_dvda(%ebp),%eax - addss (%eax,%edx,4),%xmm7 - movss %xmm7,(%eax,%edx,4) - - ## finish if last - movl nb400_nn1(%esp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel400_ia32_sse.nb400_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb400_n(%esp) - jmp _nb_kernel400_ia32_sse.nb400_outer -_nb_kernel400_ia32_sse.nb400_outerend: - ## check if more outer neighborlists remain - movl nb400_nri(%esp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel400_ia32_sse.nb400_end - ## non-zero, do one more workunit - jmp _nb_kernel400_ia32_sse.nb400_threadloop -_nb_kernel400_ia32_sse.nb400_end: - emms - - movl nb400_nouter(%esp),%eax - movl nb400_ninner(%esp),%ebx - movl nb400_outeriter(%ebp),%ecx - movl nb400_inneriter(%ebp),%edx - movl %eax,(%ecx) - movl %ebx,(%edx) - - movl nb400_salign(%esp),%eax - addl %eax,%esp - addl $400,%esp - popl %edi - popl %esi - popl %edx - popl %ecx - popl %ebx - popl %eax - leave - ret - - - - - -.globl nb_kernel400nf_ia32_sse -.globl _nb_kernel400nf_ia32_sse -nb_kernel400nf_ia32_sse: -_nb_kernel400nf_ia32_sse: -.set nb400nf_p_nri, 8 -.set nb400nf_iinr, 12 -.set nb400nf_jindex, 16 -.set nb400nf_jjnr, 20 -.set nb400nf_shift, 24 -.set nb400nf_shiftvec, 28 -.set nb400nf_fshift, 32 -.set nb400nf_gid, 36 -.set nb400nf_pos, 40 -.set nb400nf_faction, 44 -.set nb400nf_charge, 48 -.set nb400nf_p_facel, 52 -.set nb400nf_argkrf, 56 -.set nb400nf_argcrf, 60 -.set nb400nf_Vc, 64 -.set nb400nf_type, 68 -.set nb400nf_p_ntype, 72 -.set nb400nf_vdwparam, 76 -.set nb400nf_Vvdw, 80 -.set nb400nf_p_tabscale, 84 -.set nb400nf_VFtab, 88 -.set nb400nf_invsqrta, 92 -.set nb400nf_dvda, 96 -.set nb400nf_p_gbtabscale, 100 -.set nb400nf_GBtab, 104 -.set nb400nf_p_nthreads, 108 -.set nb400nf_count, 112 -.set nb400nf_mtx, 116 -.set nb400nf_outeriter, 120 -.set nb400nf_inneriter, 124 -.set nb400nf_work, 128 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse use -.set nb400nf_ix, 0 -.set nb400nf_iy, 16 -.set nb400nf_iz, 32 -.set nb400nf_iq, 48 -.set nb400nf_gbtsc, 64 -.set nb400nf_qq, 80 -.set nb400nf_vctot, 96 -.set nb400nf_half, 112 -.set nb400nf_three, 128 -.set nb400nf_isai, 144 -.set nb400nf_isaprod, 160 -.set nb400nf_gbscale, 176 -.set nb400nf_is3, 192 -.set nb400nf_ii3, 196 -.set nb400nf_innerjjnr, 200 -.set nb400nf_innerk, 204 -.set nb400nf_n, 208 -.set nb400nf_nn1, 212 -.set nb400nf_nri, 216 -.set nb400nf_facel, 220 -.set nb400nf_nouter, 224 -.set nb400nf_ninner, 228 -.set nb400nf_salign, 232 - pushl %ebp - movl %esp,%ebp - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - pushl %esi - pushl %edi - subl $236,%esp ## local stack space - movl %esp,%eax - andl $0xf,%eax - subl %eax,%esp - movl %eax,nb400nf_salign(%esp) - - emms - - ## Move args passed by reference to stack - movl nb400nf_p_nri(%ebp),%ecx - movl nb400nf_p_facel(%ebp),%esi - movl (%ecx),%ecx - movl (%esi),%esi - movl %ecx,nb400nf_nri(%esp) - movl %esi,nb400nf_facel(%esp) - - ## zero iteration counters - movl $0,%eax - movl %eax,nb400nf_nouter(%esp) - movl %eax,nb400nf_ninner(%esp) - - - movl nb400nf_p_gbtabscale(%ebp),%eax - movss (%eax),%xmm3 - shufps $0,%xmm3,%xmm3 - movaps %xmm3,nb400nf_gbtsc(%esp) - - ## create constant floating-point factors on stack - movl $0x3f000000,%eax ## constant 0.5 in IEEE (hex) - movl %eax,nb400nf_half(%esp) - movss nb400nf_half(%esp),%xmm1 - shufps $0,%xmm1,%xmm1 ## splat to all elements - movaps %xmm1,%xmm2 - addps %xmm2,%xmm2 ## constant 1.0 - movaps %xmm2,%xmm3 - addps %xmm2,%xmm2 ## constant 2.0 - addps %xmm2,%xmm3 ## constant 3.0 - movaps %xmm1,nb400nf_half(%esp) - movaps %xmm3,nb400nf_three(%esp) - -_nb_kernel400nf_ia32_sse.nb400nf_threadloop: - movl nb400nf_count(%ebp),%esi ## pointer to sync counter - movl (%esi),%eax -_nb_kernel400nf_ia32_sse.nb400nf_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%esi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel400nf_ia32_sse.nb400nf_spinlock - - ## if(nn1>nri) nn1=nri - movl nb400nf_nri(%esp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb400nf_n(%esp) - movl %ebx,nb400nf_nn1(%esp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel400nf_ia32_sse.nb400nf_outerstart - jmp _nb_kernel400nf_ia32_sse.nb400nf_end - -_nb_kernel400nf_ia32_sse.nb400nf_outerstart: - ## ebx contains number of outer iterations - addl nb400nf_nouter(%esp),%ebx - movl %ebx,nb400nf_nouter(%esp) - -_nb_kernel400nf_ia32_sse.nb400nf_outer: - movl nb400nf_shift(%ebp),%eax ## eax = pointer into shift[] - movl (%eax,%esi,4),%ebx ## ebx=shift[n] - - leal (%ebx,%ebx,2),%ebx ## ebx=3*is - movl %ebx,nb400nf_is3(%esp) ## store is3 - - movl nb400nf_shiftvec(%ebp),%eax ## eax = base of shiftvec[] - - movss (%eax,%ebx,4),%xmm0 - movss 4(%eax,%ebx,4),%xmm1 - movss 8(%eax,%ebx,4),%xmm2 - - movl nb400nf_iinr(%ebp),%ecx ## ecx = pointer into iinr[] - movl (%ecx,%esi,4),%ebx ## ebx =ii - - movl nb400nf_charge(%ebp),%edx - movss (%edx,%ebx,4),%xmm3 - mulss nb400nf_facel(%esp),%xmm3 - shufps $0,%xmm3,%xmm3 - - movl nb400nf_invsqrta(%ebp),%edx ## load invsqrta[ii] - movss (%edx,%ebx,4),%xmm4 - shufps $0,%xmm4,%xmm4 - - leal (%ebx,%ebx,2),%ebx ## ebx = 3*ii=ii3 - movl nb400nf_pos(%ebp),%eax ## eax = base of pos[] - - addss (%eax,%ebx,4),%xmm0 - addss 4(%eax,%ebx,4),%xmm1 - addss 8(%eax,%ebx,4),%xmm2 - - movaps %xmm3,nb400nf_iq(%esp) - movaps %xmm4,nb400nf_isai(%esp) - - shufps $0,%xmm0,%xmm0 - shufps $0,%xmm1,%xmm1 - shufps $0,%xmm2,%xmm2 - - movaps %xmm0,nb400nf_ix(%esp) - movaps %xmm1,nb400nf_iy(%esp) - movaps %xmm2,nb400nf_iz(%esp) - - movl %ebx,nb400nf_ii3(%esp) - - ## clear vctot - xorps %xmm4,%xmm4 - movaps %xmm4,nb400nf_vctot(%esp) - - movl nb400nf_jindex(%ebp),%eax - movl (%eax,%esi,4),%ecx ## jindex[n] - movl 4(%eax,%esi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movl nb400nf_pos(%ebp),%esi - movl nb400nf_faction(%ebp),%edi - movl nb400nf_jjnr(%ebp),%eax - shll $2,%ecx - addl %ecx,%eax - movl %eax,nb400nf_innerjjnr(%esp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $4,%edx - addl nb400nf_ninner(%esp),%ecx - movl %ecx,nb400nf_ninner(%esp) - addl $0,%edx - movl %edx,nb400nf_innerk(%esp) ## number of innerloop atoms - jge _nb_kernel400nf_ia32_sse.nb400nf_unroll_loop - jmp _nb_kernel400nf_ia32_sse.nb400nf_finish_inner -_nb_kernel400nf_ia32_sse.nb400nf_unroll_loop: - ## quad-unroll innerloop here - movl nb400nf_innerjjnr(%esp),%edx ## pointer to jjnr[k] - movl (%edx),%eax - movl 4(%edx),%ebx - movl 8(%edx),%ecx - movl 12(%edx),%edx ## eax-edx=jnr1-4 - addl $16,nb400nf_innerjjnr(%esp) ## advance pointer (unrolled 4) - - ## load isa2 - movl nb400nf_invsqrta(%ebp),%esi - movss (%esi,%eax,4),%xmm3 - movss (%esi,%ecx,4),%xmm4 - movss (%esi,%ebx,4),%xmm6 - movss (%esi,%edx,4),%xmm7 - movaps nb400nf_isai(%esp),%xmm2 - shufps $0,%xmm6,%xmm3 - shufps $0,%xmm7,%xmm4 - shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all charges in xmm3 - mulps %xmm3,%xmm2 - - movaps %xmm2,nb400nf_isaprod(%esp) - movaps %xmm2,%xmm1 - mulps nb400nf_gbtsc(%esp),%xmm1 - movaps %xmm1,nb400nf_gbscale(%esp) - - movl nb400nf_charge(%ebp),%esi ## base of charge[] - - movss (%esi,%eax,4),%xmm3 - movss (%esi,%ecx,4),%xmm4 - movss (%esi,%ebx,4),%xmm6 - movss (%esi,%edx,4),%xmm7 - - mulps nb400nf_iq(%esp),%xmm2 - shufps $0,%xmm6,%xmm3 - shufps $0,%xmm7,%xmm4 - shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all charges in xmm3 - mulps %xmm2,%xmm3 - movaps %xmm3,nb400nf_qq(%esp) - - - movl nb400nf_pos(%ebp),%esi ## base of pos[] - - leal (%eax,%eax,2),%eax ## replace jnr with j3 - leal (%ebx,%ebx,2),%ebx - - leal (%ecx,%ecx,2),%ecx ## replace jnr with j3 - leal (%edx,%edx,2),%edx - - ## move four coordinates to xmm0-xmm2 - - movlps (%esi,%eax,4),%xmm4 - movlps (%esi,%ecx,4),%xmm5 - movss 8(%esi,%eax,4),%xmm2 - movss 8(%esi,%ecx,4),%xmm6 - - movhps (%esi,%ebx,4),%xmm4 - movhps (%esi,%edx,4),%xmm5 - - movss 8(%esi,%ebx,4),%xmm0 - movss 8(%esi,%edx,4),%xmm1 - - shufps $0,%xmm0,%xmm2 - shufps $0,%xmm1,%xmm6 - - movaps %xmm4,%xmm0 - movaps %xmm4,%xmm1 - - shufps $136,%xmm6,%xmm2 ## constant 10001000 - - shufps $136,%xmm5,%xmm0 ## constant 10001000 - shufps $221,%xmm5,%xmm1 ## constant 11011101 - - ## move ix-iz to xmm4-xmm6 - movaps nb400nf_ix(%esp),%xmm4 - movaps nb400nf_iy(%esp),%xmm5 - movaps nb400nf_iz(%esp),%xmm6 - - ## calc dr - subps %xmm0,%xmm4 - subps %xmm1,%xmm5 - subps %xmm2,%xmm6 - - ## square it - mulps %xmm4,%xmm4 - mulps %xmm5,%xmm5 - mulps %xmm6,%xmm6 - addps %xmm5,%xmm4 - addps %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb400nf_three(%esp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb400nf_half(%esp),%xmm0 - subps %xmm5,%xmm1 ## constant 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - mulps nb400nf_gbscale(%esp),%xmm4 - - movhlps %xmm4,%xmm5 - cvttps2pi %xmm4,%mm6 - cvttps2pi %xmm5,%mm7 ## mm6/mm7 contain lu indices - cvtpi2ps %mm6,%xmm6 - cvtpi2ps %mm7,%xmm5 - movlhps %xmm5,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - pslld $2,%mm6 - pslld $2,%mm7 - - movd %eax,%mm0 - movd %ebx,%mm1 - movd %ecx,%mm2 - movd %edx,%mm3 - - movl nb400nf_GBtab(%ebp),%esi - movd %mm6,%eax - psrlq $32,%mm6 - movd %mm7,%ecx - psrlq $32,%mm7 - movd %mm6,%ebx - movd %mm7,%edx - - ## load coulomb table - movaps (%esi,%eax,4),%xmm4 - movaps (%esi,%ebx,4),%xmm5 - movaps (%esi,%ecx,4),%xmm6 - movaps (%esi,%edx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm6,%xmm3 - shufps $0xEE,%xmm7,%xmm3 - shufps $0x44,%xmm7,%xmm6 - movaps %xmm4,%xmm7 - shufps $0xEE,%xmm5,%xmm7 - shufps $0x44,%xmm5,%xmm4 - movaps %xmm4,%xmm5 - shufps $0xDD,%xmm6,%xmm5 - shufps $0x88,%xmm6,%xmm4 - movaps %xmm7,%xmm6 - shufps $0x88,%xmm3,%xmm6 - shufps $0xDD,%xmm3,%xmm7 - ## coulomb table ready, in xmm4-xmm7 - - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - movaps nb400nf_qq(%esp),%xmm3 - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps %xmm3,%xmm5 ## vcoul=qq*VV - addps nb400nf_vctot(%esp),%xmm5 - movaps %xmm5,nb400nf_vctot(%esp) - - ## should we do one more iteration? - subl $4,nb400nf_innerk(%esp) - jl _nb_kernel400nf_ia32_sse.nb400nf_finish_inner - jmp _nb_kernel400nf_ia32_sse.nb400nf_unroll_loop -_nb_kernel400nf_ia32_sse.nb400nf_finish_inner: - ## check if at least two particles remain - addl $4,nb400nf_innerk(%esp) - movl nb400nf_innerk(%esp),%edx - andl $2,%edx - jnz _nb_kernel400nf_ia32_sse.nb400nf_dopair - jmp _nb_kernel400nf_ia32_sse.nb400nf_checksingle -_nb_kernel400nf_ia32_sse.nb400nf_dopair: - movl nb400nf_innerjjnr(%esp),%ecx - - movl (%ecx),%eax - movl 4(%ecx),%ebx - addl $8,nb400nf_innerjjnr(%esp) - - xorps %xmm2,%xmm2 - movaps %xmm2,%xmm6 - - ## load isa2 - movl nb400nf_invsqrta(%ebp),%esi - movss (%esi,%eax,4),%xmm2 - movss (%esi,%ebx,4),%xmm3 - unpcklps %xmm3,%xmm2 ## isa2 in xmm3(0,1) - mulps nb400nf_isai(%esp),%xmm2 - movaps %xmm2,nb400nf_isaprod(%esp) - movaps %xmm2,%xmm1 - mulps nb400nf_gbtsc(%esp),%xmm1 - movaps %xmm1,nb400nf_gbscale(%esp) - - movl nb400nf_charge(%ebp),%esi ## base of charge[] - movss (%esi,%eax,4),%xmm3 - movss (%esi,%ebx,4),%xmm6 - unpcklps %xmm6,%xmm3 ## constant 00001000 ;# xmm3(0,1) has the charges - - mulps nb400nf_iq(%esp),%xmm2 - mulps %xmm2,%xmm3 - movaps %xmm3,nb400nf_qq(%esp) - - movl nb400nf_pos(%ebp),%edi - - leal (%eax,%eax,2),%eax - leal (%ebx,%ebx,2),%ebx - ## move coordinates to xmm0-xmm2 - movlps (%edi,%eax,4),%xmm1 - movss 8(%edi,%eax,4),%xmm2 - movhps (%edi,%ebx,4),%xmm1 - movss 8(%edi,%ebx,4),%xmm0 - - movlhps %xmm7,%xmm3 - - shufps $0,%xmm0,%xmm2 - - movaps %xmm1,%xmm0 - - shufps $136,%xmm2,%xmm2 ## constant 10001000 - - shufps $136,%xmm0,%xmm0 ## constant 10001000 - shufps $221,%xmm1,%xmm1 ## constant 11011101 - - movl nb400nf_faction(%ebp),%edi - ## move ix-iz to xmm4-xmm6 - xorps %xmm7,%xmm7 - - movaps nb400nf_ix(%esp),%xmm4 - movaps nb400nf_iy(%esp),%xmm5 - movaps nb400nf_iz(%esp),%xmm6 - - ## calc dr - subps %xmm0,%xmm4 - subps %xmm1,%xmm5 - subps %xmm2,%xmm6 - - ## square it - mulps %xmm4,%xmm4 - mulps %xmm5,%xmm5 - mulps %xmm6,%xmm6 - addps %xmm5,%xmm4 - addps %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb400nf_three(%esp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb400nf_half(%esp),%xmm0 - subps %xmm5,%xmm1 ## constant 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - mulps nb400nf_gbscale(%esp),%xmm4 - - cvttps2pi %xmm4,%mm6 ## mm6 contain lu indices - cvtpi2ps %mm6,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - - pslld $2,%mm6 - - movl nb400nf_GBtab(%ebp),%esi - movd %mm6,%ecx - psrlq $32,%mm6 - movd %mm6,%edx - - ## load coulomb table - movaps (%esi,%ecx,4),%xmm4 - movaps (%esi,%edx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm4,%xmm6 - unpcklps %xmm7,%xmm4 ## Y1 Y2 F1 F2 - unpckhps %xmm7,%xmm6 ## G1 G2 H1 H2 - movhlps %xmm4,%xmm5 ## F1 F2 - movhlps %xmm6,%xmm7 ## H1 H2 - ## coulomb table ready, in xmm4-xmm7 - - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - movaps nb400nf_qq(%esp),%xmm3 - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps %xmm3,%xmm5 ## vcoul=qq*VV - addps nb400nf_vctot(%esp),%xmm5 - movaps %xmm5,nb400nf_vctot(%esp) - -_nb_kernel400nf_ia32_sse.nb400nf_checksingle: - movl nb400nf_innerk(%esp),%edx - andl $1,%edx - jnz _nb_kernel400nf_ia32_sse.nb400nf_dosingle - jmp _nb_kernel400nf_ia32_sse.nb400nf_updateouterdata -_nb_kernel400nf_ia32_sse.nb400nf_dosingle: - movl nb400nf_charge(%ebp),%esi - movl nb400nf_invsqrta(%ebp),%edx - movl nb400nf_pos(%ebp),%edi - movl nb400nf_innerjjnr(%esp),%ecx - movl (%ecx),%eax - xorps %xmm2,%xmm2 - movaps %xmm2,%xmm6 - movss (%edx,%eax,4),%xmm2 ## isa2 - mulss nb400nf_isai(%esp),%xmm2 - movss %xmm2,nb400nf_isaprod(%esp) - movss %xmm2,%xmm1 - mulss nb400nf_gbtsc(%esp),%xmm1 - movss %xmm1,nb400nf_gbscale(%esp) - - mulss nb400nf_iq(%esp),%xmm2 - movss (%esi,%eax,4),%xmm6 ## xmm6(0) has the charge - mulss %xmm2,%xmm6 - movss %xmm6,nb400nf_qq(%esp) - - leal (%eax,%eax,2),%eax - - ## move coordinates to xmm0-xmm2 - movss (%edi,%eax,4),%xmm0 - movss 4(%edi,%eax,4),%xmm1 - movss 8(%edi,%eax,4),%xmm2 - - movss nb400nf_ix(%esp),%xmm4 - movss nb400nf_iy(%esp),%xmm5 - movss nb400nf_iz(%esp),%xmm6 - - ## calc dr - subss %xmm0,%xmm4 - subss %xmm1,%xmm5 - subss %xmm2,%xmm6 - - ## square it - mulss %xmm4,%xmm4 - mulss %xmm5,%xmm5 - mulss %xmm6,%xmm6 - addss %xmm5,%xmm4 - addss %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtss %xmm4,%xmm5 - ## lookup seed in xmm5 - movss %xmm5,%xmm2 - mulss %xmm5,%xmm5 - movss nb400nf_three(%esp),%xmm1 - mulss %xmm4,%xmm5 ## rsq*lu*lu - movss nb400nf_half(%esp),%xmm0 - subss %xmm5,%xmm1 ## constant 30-rsq*lu*lu - mulss %xmm2,%xmm1 - mulss %xmm1,%xmm0 ## xmm0=rinv - - mulss %xmm0,%xmm4 ## xmm4=r - mulss nb400nf_gbscale(%esp),%xmm4 - - cvttss2si %xmm4,%ebx ## mm6 contain lu indices - cvtsi2ss %ebx,%xmm6 - subss %xmm6,%xmm4 - movss %xmm4,%xmm1 ## xmm1=eps - movss %xmm1,%xmm2 - mulss %xmm2,%xmm2 ## xmm2=eps2 - - shll $2,%ebx - - movl nb400nf_GBtab(%ebp),%esi - - movaps (%esi,%ebx,4),%xmm4 - movhlps %xmm4,%xmm6 - movaps %xmm4,%xmm5 - movaps %xmm6,%xmm7 - shufps $1,%xmm5,%xmm5 - shufps $1,%xmm7,%xmm7 - ## table ready in xmm4-xmm7 - - mulss %xmm1,%xmm6 ## xmm6=Geps - mulss %xmm2,%xmm7 ## xmm7=Heps2 - addss %xmm6,%xmm5 - addss %xmm7,%xmm5 ## xmm5=Fp - movss nb400nf_qq(%esp),%xmm3 - mulss %xmm1,%xmm5 ## xmm5=eps*Fp - addss %xmm4,%xmm5 ## xmm5=VV - mulss %xmm3,%xmm5 ## vcoul=qq*VV - addss nb400nf_vctot(%esp),%xmm5 - movss %xmm5,nb400nf_vctot(%esp) -_nb_kernel400nf_ia32_sse.nb400nf_updateouterdata: - ## get n from stack - movl nb400nf_n(%esp),%esi - ## get group index for i particle - movl nb400nf_gid(%ebp),%edx ## base of gid[] - movl (%edx,%esi,4),%edx ## ggid=gid[n] - - ## accumulate total potential energy and update it - movaps nb400nf_vctot(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - ## add earlier value from mem - movl nb400nf_Vc(%ebp),%eax - addss (%eax,%edx,4),%xmm7 - ## move back to mem - movss %xmm7,(%eax,%edx,4) - - ## finish if last - movl nb400nf_nn1(%esp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel400nf_ia32_sse.nb400nf_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb400nf_n(%esp) - jmp _nb_kernel400nf_ia32_sse.nb400nf_outer -_nb_kernel400nf_ia32_sse.nb400nf_outerend: - ## check if more outer neighborlists remain - movl nb400nf_nri(%esp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel400nf_ia32_sse.nb400nf_end - ## non-zero, do one more workunit - jmp _nb_kernel400nf_ia32_sse.nb400nf_threadloop -_nb_kernel400nf_ia32_sse.nb400nf_end: - emms - - movl nb400nf_nouter(%esp),%eax - movl nb400nf_ninner(%esp),%ebx - movl nb400nf_outeriter(%ebp),%ecx - movl nb400nf_inneriter(%ebp),%edx - movl %eax,(%ecx) - movl %ebx,(%edx) - - movl nb400nf_salign(%esp),%eax - addl %eax,%esp - addl $236,%esp - popl %edi - popl %esi - popl %edx - popl %ecx - popl %ebx - popl %eax - leave - ret - diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel410_ia32_sse.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel410_ia32_sse.intel_syntax.s deleted file mode 100644 index 492e8655cd..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel410_ia32_sse.intel_syntax.s +++ /dev/null @@ -1,2049 +0,0 @@ -;# -;# -;# Gromacs 4.0 Copyright (c) 1991-2003 -;# David van der Spoel, Erik Lindahl -;# -;# This program is free software; you can redistribute it and/or -;# modify it under the terms of the GNU General Public License -;# as published by the Free Software Foundation; either version 2 -;# of the License, or (at your option) any later version. -;# -;# To help us fund GROMACS development, we humbly ask that you cite -;# the research papers on the package. Check out http://www.gromacs.org -;# -;# And Hey: -;# Gnomes, ROck Monsters And Chili Sauce -;# - -;# These files require GNU binutils 2.10 or later, since we -;# use intel syntax for portability, or a recent version -;# of NASM that understands Extended 3DNow and SSE2 instructions. -;# (NASM is normally only used with MS Visual C++). -;# Since NASM and gnu as disagree on some definitions and use -;# completely different preprocessing options I have to introduce a -;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86. -;# Gnu as treats ';' as a line break, i.e. ignores it. This is the -;# reason why all comments need both symbols... -;# The source is written for GNU as, with intel syntax. When you use -;# NASM we redefine a couple of things. The false if-statement around -;# the following code is seen by GNU as, but NASM doesn't see it, so -;# the code inside is read by NASM but not gcc. - -; .if 0 # block below only read by NASM -%define .section section -%define .long dd -%define .align align -%define .globl global -;# NASM only wants 'dword', not 'dword ptr'. -%define ptr -%macro .equiv 2 - %1 equ %2 -%endmacro -; .endif # End of NASM-specific block -; .intel_syntax noprefix # Line only read by gnu as - - - - -.globl nb_kernel410_ia32_sse -.globl _nb_kernel410_ia32_sse -nb_kernel410_ia32_sse: -_nb_kernel410_ia32_sse: -.equiv nb410_p_nri, 8 -.equiv nb410_iinr, 12 -.equiv nb410_jindex, 16 -.equiv nb410_jjnr, 20 -.equiv nb410_shift, 24 -.equiv nb410_shiftvec, 28 -.equiv nb410_fshift, 32 -.equiv nb410_gid, 36 -.equiv nb410_pos, 40 -.equiv nb410_faction, 44 -.equiv nb410_charge, 48 -.equiv nb410_p_facel, 52 -.equiv nb410_argkrf, 56 -.equiv nb410_argcrf, 60 -.equiv nb410_Vc, 64 -.equiv nb410_type, 68 -.equiv nb410_p_ntype, 72 -.equiv nb410_vdwparam, 76 -.equiv nb410_Vvdw, 80 -.equiv nb410_p_tabscale, 84 -.equiv nb410_VFtab, 88 -.equiv nb410_invsqrta, 92 -.equiv nb410_dvda, 96 -.equiv nb410_p_gbtabscale, 100 -.equiv nb410_GBtab, 104 -.equiv nb410_p_nthreads, 108 -.equiv nb410_count, 112 -.equiv nb410_mtx, 116 -.equiv nb410_outeriter, 120 -.equiv nb410_inneriter, 124 -.equiv nb410_work, 128 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse use -.equiv nb410_ix, 0 -.equiv nb410_iy, 16 -.equiv nb410_iz, 32 -.equiv nb410_iq, 48 -.equiv nb410_dx, 64 -.equiv nb410_dy, 80 -.equiv nb410_dz, 96 -.equiv nb410_two, 112 -.equiv nb410_six, 128 -.equiv nb410_twelve, 144 -.equiv nb410_gbtsc, 160 -.equiv nb410_qq, 176 -.equiv nb410_c6, 192 -.equiv nb410_c12, 208 -.equiv nb410_fscal, 224 -.equiv nb410_vctot, 240 -.equiv nb410_Vvdwtot, 256 -.equiv nb410_fix, 272 -.equiv nb410_fiy, 288 -.equiv nb410_fiz, 304 -.equiv nb410_half, 320 -.equiv nb410_three, 336 -.equiv nb410_r, 352 -.equiv nb410_isai, 368 -.equiv nb410_isaprod, 384 -.equiv nb410_dvdasum, 400 -.equiv nb410_gbscale, 416 -.equiv nb410_is3, 432 -.equiv nb410_ii3, 436 -.equiv nb410_ii, 440 -.equiv nb410_ntia, 444 -.equiv nb410_innerjjnr, 448 -.equiv nb410_innerk, 452 -.equiv nb410_n, 456 -.equiv nb410_nn1, 460 -.equiv nb410_jnra, 464 -.equiv nb410_jnrb, 468 -.equiv nb410_jnrc, 472 -.equiv nb410_jnrd, 476 -.equiv nb410_nri, 480 -.equiv nb410_facel, 484 -.equiv nb410_ntype, 488 -.equiv nb410_nouter, 492 -.equiv nb410_ninner, 496 -.equiv nb410_salign, 500 - push ebp - mov ebp,esp - push eax - push ebx - push ecx - push edx - push esi - push edi - sub esp, 504 ;# local stack space - mov eax, esp - and eax, 0xf - sub esp, eax - mov [esp + nb410_salign], eax - - emms - - ;# Move args passed by reference to stack - mov ecx, [ebp + nb410_p_nri] - mov esi, [ebp + nb410_p_facel] - mov edi, [ebp + nb410_p_ntype] - mov ecx, [ecx] - mov esi, [esi] - mov edi, [edi] - mov [esp + nb410_nri], ecx - mov [esp + nb410_facel], esi - mov [esp + nb410_ntype], edi - - ;# zero iteration counters - mov eax, 0 - mov [esp + nb410_nouter], eax - mov [esp + nb410_ninner], eax - - - mov eax, [ebp + nb410_p_gbtabscale] - movss xmm5, [eax] - shufps xmm5, xmm5, 0 - movaps [esp + nb410_gbtsc], xmm5 - - ;# create constant floating-point factors on stack - mov eax, 0x3f000000 ;# constant 0.5 in IEEE (hex) - mov [esp + nb410_half], eax - movss xmm1, [esp + nb410_half] - shufps xmm1, xmm1, 0 ;# splat to all elements - movaps xmm2, xmm1 - addps xmm2, xmm2 ;# constant 1.0 - movaps xmm3, xmm2 - addps xmm2, xmm2 ;# constant 2.0 - addps xmm3, xmm2 ;# constant 3.0 - movaps xmm4, xmm3 - addps xmm4, xmm4 ;# 6.0 - movaps xmm5, xmm4 - addps xmm5, xmm5 ;# constant 12.0 - movaps [esp + nb410_half], xmm1 - movaps [esp + nb410_two], xmm2 - movaps [esp + nb410_three], xmm3 - movaps [esp + nb410_six], xmm4 - movaps [esp + nb410_twelve], xmm5 - -.nb410_threadloop: - mov esi, [ebp + nb410_count] ;# pointer to sync counter - mov eax, [esi] -.nb410_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb410_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [esp + nb410_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [esp + nb410_n], eax - mov [esp + nb410_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb410_outerstart - jmp .nb410_end - -.nb410_outerstart: - ;# ebx contains number of outer iterations - add ebx, [esp + nb410_nouter] - mov [esp + nb410_nouter], ebx - -.nb410_outer: - mov eax, [ebp + nb410_shift] ;# eax = pointer into shift[] - mov ebx, [eax+esi*4] ;# ebx=shift[n] - - lea ebx, [ebx + ebx*2] ;# ebx=3*is - mov [esp + nb410_is3],ebx ;# store is3 - - mov eax, [ebp + nb410_shiftvec] ;# eax = base of shiftvec[] - - movss xmm0, [eax + ebx*4] - movss xmm1, [eax + ebx*4 + 4] - movss xmm2, [eax + ebx*4 + 8] - - mov ecx, [ebp + nb410_iinr] ;# ecx = pointer into iinr[] - mov ebx, [ecx + esi*4] ;# ebx =ii - mov [esp + nb410_ii], ebx - - mov edx, [ebp + nb410_charge] - movss xmm3, [edx + ebx*4] - mulss xmm3, [esp + nb410_facel] - shufps xmm3, xmm3, 0 - - mov edx, [ebp + nb410_invsqrta] ;# load invsqrta[ii] - movss xmm4, [edx + ebx*4] - shufps xmm4, xmm4, 0 - - mov edx, [ebp + nb410_type] - mov edx, [edx + ebx*4] - imul edx, [esp + nb410_ntype] - shl edx, 1 - mov [esp + nb410_ntia], edx - - lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3 - mov eax, [ebp + nb410_pos] ;# eax = base of pos[] - - addss xmm0, [eax + ebx*4] - addss xmm1, [eax + ebx*4 + 4] - addss xmm2, [eax + ebx*4 + 8] - - movaps [esp + nb410_iq], xmm3 - movaps [esp + nb410_isai], xmm4 - - shufps xmm0, xmm0, 0 - shufps xmm1, xmm1, 0 - shufps xmm2, xmm2, 0 - - movaps [esp + nb410_ix], xmm0 - movaps [esp + nb410_iy], xmm1 - movaps [esp + nb410_iz], xmm2 - - mov [esp + nb410_ii3], ebx - - ;# clear vctot and i forces - xorps xmm4, xmm4 - movaps [esp + nb410_vctot], xmm4 - movaps [esp + nb410_Vvdwtot], xmm4 - movaps [esp + nb410_dvdasum], xmm4 - movaps [esp + nb410_fix], xmm4 - movaps [esp + nb410_fiy], xmm4 - movaps [esp + nb410_fiz], xmm4 - - mov eax, [ebp + nb410_jindex] - mov ecx, [eax + esi*4] ;# jindex[n] - mov edx, [eax + esi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov esi, [ebp + nb410_pos] - mov edi, [ebp + nb410_faction] - mov eax, [ebp + nb410_jjnr] - shl ecx, 2 - add eax, ecx - mov [esp + nb410_innerjjnr], eax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 4 - add ecx, [esp + nb410_ninner] - mov [esp + nb410_ninner], ecx - add edx, 0 - mov [esp + nb410_innerk], edx ;# number of innerloop atoms - jge .nb410_unroll_loop - jmp .nb410_finish_inner -.nb410_unroll_loop: - ;# quad-unroll innerloop here - mov edx, [esp + nb410_innerjjnr] ;# pointer to jjnr[k] - mov eax, [edx] - mov ebx, [edx + 4] - mov ecx, [edx + 8] - mov edx, [edx + 12] ;# eax-edx=jnr1-4 - add dword ptr [esp + nb410_innerjjnr], 16 ;# advance pointer (unrolled 4) - - ;# load isaj - mov esi, [ebp + nb410_invsqrta] - movss xmm3, [esi + eax*4] - movss xmm4, [esi + ecx*4] - movss xmm6, [esi + ebx*4] - movss xmm7, [esi + edx*4] - movaps xmm2, [esp + nb410_isai] - shufps xmm3, xmm6, 0 - shufps xmm4, xmm7, 0 - shufps xmm3, xmm4, 136 ;# constant 10001000 ;# all isaj in xmm3 - mulps xmm2, xmm3 - - movaps [esp + nb410_isaprod], xmm2 - movaps xmm1, xmm2 - mulps xmm1, [esp + nb410_gbtsc] - movaps [esp + nb410_gbscale], xmm1 - - mov esi, [ebp + nb410_charge] ;# base of charge[] - - movss xmm3, [esi + eax*4] - movss xmm4, [esi + ecx*4] - movss xmm6, [esi + ebx*4] - movss xmm7, [esi + edx*4] - - mulps xmm2, [esp + nb410_iq] - shufps xmm3, xmm6, 0 - shufps xmm4, xmm7, 0 - shufps xmm3, xmm4, 136 ;# constant 10001000 ;# all charges in xmm3 - mulps xmm3, xmm2 - movaps [esp + nb410_qq], xmm3 - - movd mm0, eax - movd mm1, ebx - movd mm2, ecx - movd mm3, edx - - mov esi, [ebp + nb410_type] - mov eax, [esi + eax*4] - mov ebx, [esi + ebx*4] - mov ecx, [esi + ecx*4] - mov edx, [esi + edx*4] - mov esi, [ebp + nb410_vdwparam] - shl eax, 1 - shl ebx, 1 - shl ecx, 1 - shl edx, 1 - mov edi, [esp + nb410_ntia] - add eax, edi - add ebx, edi - add ecx, edi - add edx, edi - - movlps xmm6, [esi + eax*4] - movlps xmm7, [esi + ecx*4] - movhps xmm6, [esi + ebx*4] - movhps xmm7, [esi + edx*4] - - movaps xmm4, xmm6 - shufps xmm4, xmm7, 136 ;# constant 10001000 - shufps xmm6, xmm7, 221 ;# constant 11011101 - - movd eax, mm0 - movd ebx, mm1 - movd ecx, mm2 - movd edx, mm3 - - movaps [esp + nb410_c6], xmm4 - movaps [esp + nb410_c12], xmm6 - - mov esi, [ebp + nb410_pos] ;# base of pos[] - - mov [esp + nb410_jnra], eax - mov [esp + nb410_jnrb], ebx - mov [esp + nb410_jnrc], ecx - mov [esp + nb410_jnrd], edx - - lea eax, [eax + eax*2] ;# replace jnr with j3 - lea ebx, [ebx + ebx*2] - - lea ecx, [ecx + ecx*2] ;# replace jnr with j3 - lea edx, [edx + edx*2] - - ;# move four coordinates to xmm0-xmm2 - - movlps xmm4, [esi + eax*4] - movlps xmm5, [esi + ecx*4] - movss xmm2, [esi + eax*4 + 8] - movss xmm6, [esi + ecx*4 + 8] - - movhps xmm4, [esi + ebx*4] - movhps xmm5, [esi + edx*4] - - movss xmm0, [esi + ebx*4 + 8] - movss xmm1, [esi + edx*4 + 8] - - shufps xmm2, xmm0, 0 - shufps xmm6, xmm1, 0 - - movaps xmm0, xmm4 - movaps xmm1, xmm4 - - shufps xmm2, xmm6, 136 ;# constant 10001000 - - shufps xmm0, xmm5, 136 ;# constant 10001000 - shufps xmm1, xmm5, 221 ;# constant 11011101 - - ;# move ix-iz to xmm4-xmm6 - movaps xmm4, [esp + nb410_ix] - movaps xmm5, [esp + nb410_iy] - movaps xmm6, [esp + nb410_iz] - - ;# calc dr - subps xmm4, xmm0 - subps xmm5, xmm1 - subps xmm6, xmm2 - - ;# store dr - movaps [esp + nb410_dx], xmm4 - movaps [esp + nb410_dy], xmm5 - movaps [esp + nb410_dz], xmm6 - ;# square it - mulps xmm4,xmm4 - mulps xmm5,xmm5 - mulps xmm6,xmm6 - addps xmm4, xmm5 - addps xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [esp + nb410_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [esp + nb410_half] - subps xmm1, xmm5 ;# constant 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - movaps [esp + nb410_r], xmm4 - mulps xmm4, [esp + nb410_gbscale] - - movhlps xmm5, xmm4 - cvttps2pi mm6, xmm4 - cvttps2pi mm7, xmm5 ;# mm6/mm7 contain lu indices - cvtpi2ps xmm6, mm6 - cvtpi2ps xmm5, mm7 - movlhps xmm6, xmm5 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - pslld mm6, 2 - pslld mm7, 2 - - movd mm0, eax - movd mm1, ebx - movd mm2, ecx - movd mm3, edx - - mov esi, [ebp + nb410_GBtab] - movd eax, mm6 - psrlq mm6, 32 - movd ecx, mm7 - psrlq mm7, 32 - movd ebx, mm6 - movd edx, mm7 - - ;# load coulomb table - movaps xmm4, [esi + eax*4] - movaps xmm5, [esi + ebx*4] - movaps xmm6, [esi + ecx*4] - movaps xmm7, [esi + edx*4] - ;# transpose, using xmm3 for scratch - movaps xmm3, xmm6 - shufps xmm3, xmm7, 0xEE - shufps xmm6, xmm7, 0x44 - movaps xmm7, xmm4 - shufps xmm7, xmm5, 0xEE - shufps xmm4, xmm5, 0x44 - movaps xmm5, xmm4 - shufps xmm5, xmm6, 0xDD - shufps xmm4, xmm6, 0x88 - movaps xmm6, xmm7 - shufps xmm6, xmm3, 0x88 - shufps xmm7, xmm3, 0xDD - ;# coulomb table ready, in xmm4-xmm7 - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - mulps xmm7, [esp + nb410_two] ;# two*Heps2 - movaps xmm3, [esp + nb410_qq] - addps xmm7, xmm6 - addps xmm7, xmm5 ;# xmm7=FF - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, xmm3 ;# vcoul=qq*VV - mulps xmm3, xmm7 ;# fijC=FF*qq - ;# get jnr from stack - mov eax, [esp + nb410_jnra] - mov ebx, [esp + nb410_jnrb] - mov ecx, [esp + nb410_jnrc] - mov edx, [esp + nb410_jnrd] - - mov esi, [ebp + nb410_dvda] - - ;# Calculate dVda - xorps xmm7, xmm7 - mulps xmm3, [esp + nb410_gbscale] - movaps xmm6, xmm3 - mulps xmm6, [esp + nb410_r] - addps xmm6, xmm5 - addps xmm5, [esp + nb410_vctot] - movaps [esp + nb410_vctot], xmm5 - - ;# xmm6=(vcoul+fijC*r) - subps xmm7, xmm6 - movaps xmm6, xmm7 - - ;# update dvdasum - addps xmm7, [esp + nb410_dvdasum] - movaps [esp + nb410_dvdasum], xmm7 - - ;# update j atoms dvdaj - movhlps xmm7, xmm6 - movaps xmm5, xmm6 - movaps xmm4, xmm7 - shufps xmm5, xmm5, 0x1 - shufps xmm4, xmm4, 0x1 - ;# xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4 - addss xmm6, [esi + eax*4] - addss xmm5, [esi + ebx*4] - addss xmm7, [esi + ecx*4] - addss xmm4, [esi + edx*4] - movss [esi + eax*4], xmm6 - movss [esi + ebx*4], xmm5 - movss [esi + ecx*4], xmm7 - movss [esi + edx*4], xmm4 - - ;# L-J - movaps xmm4, xmm0 - mulps xmm4, xmm0 ;# xmm4=rinvsq - - movaps xmm6, xmm4 - mulps xmm6, xmm4 - - mulps xmm6, xmm4 ;# xmm6=rinvsix - movaps xmm4, xmm6 - mulps xmm4, xmm4 ;# xmm4=rinvtwelve - mulps xmm6, [esp + nb410_c6] - mulps xmm4, [esp + nb410_c12] - movaps xmm7, [esp + nb410_Vvdwtot] - addps xmm7, xmm4 - mulps xmm4, [esp + nb410_twelve] - subps xmm7, xmm6 - mulps xmm6, [esp + nb410_six] - movaps [esp + nb410_Vvdwtot], xmm7 - subps xmm4, xmm6 - mulps xmm4, xmm0 - subps xmm4, xmm3 - mulps xmm4, xmm0 - - movaps xmm0, [esp + nb410_dx] - movaps xmm1, [esp + nb410_dy] - movaps xmm2, [esp + nb410_dz] - - movd eax, mm0 - movd ebx, mm1 - movd ecx, mm2 - movd edx, mm3 - - mov edi, [ebp + nb410_faction] - mulps xmm0, xmm4 - mulps xmm1, xmm4 - mulps xmm2, xmm4 - ;# xmm0-xmm2 contains tx-tz (partial force) - ;# now update f_i - movaps xmm3, [esp + nb410_fix] - movaps xmm4, [esp + nb410_fiy] - movaps xmm5, [esp + nb410_fiz] - addps xmm3, xmm0 - addps xmm4, xmm1 - addps xmm5, xmm2 - movaps [esp + nb410_fix], xmm3 - movaps [esp + nb410_fiy], xmm4 - movaps [esp + nb410_fiz], xmm5 - ;# the fj's - start by accumulating x & y forces from memory - movlps xmm4, [edi + eax*4] - movlps xmm6, [edi + ecx*4] - movhps xmm4, [edi + ebx*4] - movhps xmm6, [edi + edx*4] - - movaps xmm3, xmm4 - shufps xmm3, xmm6, 136 ;# constant 10001000 - shufps xmm4, xmm6, 221 ;# constant 11011101 - - ;# now xmm3-xmm5 contains fjx, fjy, fjz - subps xmm3, xmm0 - subps xmm4, xmm1 - - ;# unpack them back so we can store them - first x & y in xmm3/xmm4 - - movaps xmm6, xmm3 - unpcklps xmm6, xmm4 - unpckhps xmm3, xmm4 - ;# xmm6(l)=x & y for j1, (h) for j2 - ;# xmm3(l)=x & y for j3, (h) for j4 - movlps [edi + eax*4], xmm6 - movlps [edi + ecx*4], xmm3 - - movhps [edi + ebx*4], xmm6 - movhps [edi + edx*4], xmm3 - - ;# and the z forces - movss xmm4, [edi + eax*4 + 8] - movss xmm5, [edi + ebx*4 + 8] - movss xmm6, [edi + ecx*4 + 8] - movss xmm7, [edi + edx*4 + 8] - subss xmm4, xmm2 - shufps xmm2, xmm2, 229 ;# constant 11100101 - subss xmm5, xmm2 - shufps xmm2, xmm2, 234 ;# constant 11101010 - subss xmm6, xmm2 - shufps xmm2, xmm2, 255 ;# constant 11111111 - subss xmm7, xmm2 - movss [edi + eax*4 + 8], xmm4 - movss [edi + ebx*4 + 8], xmm5 - movss [edi + ecx*4 + 8], xmm6 - movss [edi + edx*4 + 8], xmm7 - - ;# should we do one more iteration? - sub dword ptr [esp + nb410_innerk], 4 - jl .nb410_finish_inner - jmp .nb410_unroll_loop -.nb410_finish_inner: - ;# check if at least two particles remain - add dword ptr [esp + nb410_innerk], 4 - mov edx, [esp + nb410_innerk] - and edx, 2 - jnz .nb410_dopair - jmp .nb410_checksingle -.nb410_dopair: - mov ecx, [esp + nb410_innerjjnr] - mov eax, [ecx] - mov ebx, [ecx + 4] - add dword ptr [esp + nb410_innerjjnr], 8 - - xorps xmm2, xmm2 - movaps xmm6, xmm2 - - ;# load isaj - mov esi, [ebp + nb410_invsqrta] - movss xmm2, [esi + eax*4] - movss xmm3, [esi + ebx*4] - unpcklps xmm2, xmm3 ;# isaj in xmm2(0,1) - mulps xmm2, [esp + nb410_isai] - movaps [esp + nb410_isaprod], xmm2 - movaps xmm1, xmm2 - mulps xmm1, [esp + nb410_gbtsc] - movaps [esp + nb410_gbscale], xmm1 - - mov esi, [ebp + nb410_charge] ;# base of charge[] - movss xmm3, [esi + eax*4] - movss xmm6, [esi + ebx*4] - unpcklps xmm3, xmm6 ;# constant 00001000 ;# xmm3(0,1) has the charges - - mulps xmm2, [esp + nb410_iq] - mulps xmm3, xmm2 - movaps [esp + nb410_qq], xmm3 - - mov esi, [ebp + nb410_type] - mov ecx, eax - mov edx, ebx - mov ecx, [esi + ecx*4] - mov edx, [esi + edx*4] - mov esi, [ebp + nb410_vdwparam] - shl ecx, 1 - shl edx, 1 - mov edi, [esp + nb410_ntia] - add ecx, edi - add edx, edi - movlps xmm6, [esi + ecx*4] - movhps xmm6, [esi + edx*4] - mov edi, [ebp + nb410_pos] - - movaps xmm4, xmm6 - shufps xmm4, xmm4, 8 ;# constant 00001000 - shufps xmm6, xmm6, 13 ;# constant 00001101 - movlhps xmm4, xmm7 - movlhps xmm6, xmm7 - - movaps [esp + nb410_c6], xmm4 - movaps [esp + nb410_c12], xmm6 - - movd mm0, eax - movd mm1, ebx - - lea eax, [eax + eax*2] - lea ebx, [ebx + ebx*2] - ;# move coordinates to xmm0-xmm2 - movlps xmm1, [edi + eax*4] - movss xmm2, [edi + eax*4 + 8] - movhps xmm1, [edi + ebx*4] - movss xmm0, [edi + ebx*4 + 8] - - movlhps xmm3, xmm7 - - shufps xmm2, xmm0, 0 - - movaps xmm0, xmm1 - - shufps xmm2, xmm2, 136 ;# constant 10001000 - - shufps xmm0, xmm0, 136 ;# constant 10001000 - shufps xmm1, xmm1, 221 ;# constant 11011101 - - mov edi, [ebp + nb410_faction] - ;# move ix-iz to xmm4-xmm6 - xorps xmm7, xmm7 - - movaps xmm4, [esp + nb410_ix] - movaps xmm5, [esp + nb410_iy] - movaps xmm6, [esp + nb410_iz] - - ;# calc dr - subps xmm4, xmm0 - subps xmm5, xmm1 - subps xmm6, xmm2 - - ;# store dr - movaps [esp + nb410_dx], xmm4 - movaps [esp + nb410_dy], xmm5 - movaps [esp + nb410_dz], xmm6 - ;# square it - mulps xmm4,xmm4 - mulps xmm5,xmm5 - mulps xmm6,xmm6 - addps xmm4, xmm5 - addps xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [esp + nb410_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [esp + nb410_half] - subps xmm1, xmm5 ;# constant 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - movaps [esp + nb410_r], xmm4 - mulps xmm4, [esp + nb410_gbscale] - - cvttps2pi mm6, xmm4 ;# mm6 contain lu indices - cvtpi2ps xmm6, mm6 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - - pslld mm6, 2 - - mov esi, [ebp + nb410_GBtab] - movd ecx, mm6 - psrlq mm6, 32 - movd edx, mm6 - - ;# load coulomb table - movaps xmm4, [esi + ecx*4] - movaps xmm7, [esi + edx*4] - ;# transpose, using xmm3 for scratch - movaps xmm6, xmm4 - unpcklps xmm4, xmm7 ;# Y1 Y2 F1 F2 - unpckhps xmm6, xmm7 ;# G1 G2 H1 H2 - movhlps xmm5, xmm4 ;# F1 F2 - movhlps xmm7, xmm6 ;# H1 H2 - ;# coulomb table ready, in xmm4-xmm7 - - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - mulps xmm7, [esp + nb410_two] ;# two*Heps2 - movaps xmm3, [esp + nb410_qq] - addps xmm7, xmm6 - addps xmm7, xmm5 ;# xmm7=FF - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, xmm3 ;# vcoul=qq*VV - mulps xmm3, xmm7 ;# fijC=FF*qq - ;# get jnr from regs - movd ecx, mm0 - movd edx, mm1 - - mov esi, [ebp + nb410_dvda] - ;# Calculate dVda - xorps xmm7, xmm7 - mulps xmm3, [esp + nb410_gbscale] - movaps xmm6, xmm3 - mulps xmm6, [esp + nb410_r] - addps xmm6, xmm5 - addps xmm5, [esp + nb410_vctot] - movaps [esp + nb410_vctot], xmm5 - - ;# xmm6=(vcoul+fijC*r) - subps xmm7, xmm6 - movaps xmm6, xmm7 - - ;# update dvdasum - addps xmm7, [esp + nb410_dvdasum] - movaps [esp + nb410_dvdasum], xmm7 - - ;# update j atoms dvdaj - movaps xmm7, xmm6 - shufps xmm7, xmm7, 0x1 - addss xmm6, [esi + ecx*4] - addss xmm7, [esi + edx*4] - movss [esi + ecx*4], xmm6 - movss [esi + edx*4], xmm7 - - ;# L-J - movaps xmm4, xmm0 - mulps xmm4, xmm0 ;# xmm4=rinvsq - - ;# at this point mm5 contains vcoul and mm3 fijC - ;# increment vcoul - then we can get rid of mm5 - ;# update vctot - - movaps xmm6, xmm4 - mulps xmm6, xmm4 - - mulps xmm6, xmm4 ;# xmm6=rinvsix - movaps xmm4, xmm6 - mulps xmm4, xmm4 ;# xmm4=rinvtwelve - mulps xmm6, [esp + nb410_c6] - mulps xmm4, [esp + nb410_c12] - movaps xmm7, [esp + nb410_Vvdwtot] - addps xmm7, xmm4 - mulps xmm4, [esp + nb410_twelve] - subps xmm7, xmm6 - mulps xmm6, [esp + nb410_six] - movaps [esp + nb410_Vvdwtot], xmm7 - subps xmm4, xmm6 - mulps xmm4, xmm0 - subps xmm4, xmm3 - mulps xmm4, xmm0 - - movaps xmm0, [esp + nb410_dx] - movaps xmm1, [esp + nb410_dy] - movaps xmm2, [esp + nb410_dz] - - mulps xmm0, xmm4 - mulps xmm1, xmm4 - mulps xmm2, xmm4 - ;# xmm0-xmm2 contains tx-tz (partial force) - ;# now update f_i - movaps xmm3, [esp + nb410_fix] - movaps xmm4, [esp + nb410_fiy] - movaps xmm5, [esp + nb410_fiz] - addps xmm3, xmm0 - addps xmm4, xmm1 - addps xmm5, xmm2 - movaps [esp + nb410_fix], xmm3 - movaps [esp + nb410_fiy], xmm4 - movaps [esp + nb410_fiz], xmm5 - ;# update the fj's - movss xmm3, [edi + eax*4] - movss xmm4, [edi + eax*4 + 4] - movss xmm5, [edi + eax*4 + 8] - subss xmm3, xmm0 - subss xmm4, xmm1 - subss xmm5, xmm2 - movss [edi + eax*4], xmm3 - movss [edi + eax*4 + 4], xmm4 - movss [edi + eax*4 + 8], xmm5 - - shufps xmm0, xmm0, 225 ;# constant 11100001 - shufps xmm1, xmm1, 225 ;# constant 11100001 - shufps xmm2, xmm2, 225 ;# constant 11100001 - - movss xmm3, [edi + ebx*4] - movss xmm4, [edi + ebx*4 + 4] - movss xmm5, [edi + ebx*4 + 8] - subss xmm3, xmm0 - subss xmm4, xmm1 - subss xmm5, xmm2 - movss [edi + ebx*4], xmm3 - movss [edi + ebx*4 + 4], xmm4 - movss [edi + ebx*4 + 8], xmm5 - -.nb410_checksingle: - mov edx, [esp + nb410_innerk] - and edx, 1 - jnz .nb410_dosingle - jmp .nb410_updateouterdata -.nb410_dosingle: - mov esi, [ebp + nb410_charge] - mov edx, [ebp + nb410_invsqrta] - mov edi, [ebp + nb410_pos] - mov ecx, [esp + nb410_innerjjnr] - mov eax, [ecx] - xorps xmm2, xmm2 - movaps xmm6, xmm2 - movss xmm2, [edx + eax*4] ;# isaj - mulss xmm2, [esp + nb410_isai] - movss [esp + nb410_isaprod], xmm2 - movss xmm1, xmm2 - mulss xmm1, [esp + nb410_gbtsc] - movss [esp + nb410_gbscale], xmm1 - - mulss xmm2, [esp + nb410_iq] - movss xmm6, [esi + eax*4] ;# xmm6(0) has the charge - mulss xmm6, xmm2 - movss [esp + nb410_qq], xmm6 - - mov esi, [ebp + nb410_type] - mov ecx, eax - mov ecx, [esi + ecx*4] - mov esi, [ebp + nb410_vdwparam] - shl ecx, 1 - add ecx, [esp + nb410_ntia] - movlps xmm6, [esi + ecx*4] - movaps xmm4, xmm6 - shufps xmm4, xmm4, 252 ;# constant 11111100 - shufps xmm6, xmm6, 253 ;# constant 11111101 - - movaps [esp + nb410_c6], xmm4 - movaps [esp + nb410_c12], xmm6 - - movd mm0, eax - lea eax, [eax + eax*2] - - ;# move coordinates to xmm0-xmm2 - movss xmm0, [edi + eax*4] - movss xmm1, [edi + eax*4 + 4] - movss xmm2, [edi + eax*4 + 8] - - movaps xmm4, [esp + nb410_ix] - movaps xmm5, [esp + nb410_iy] - movaps xmm6, [esp + nb410_iz] - - ;# calc dr - subss xmm4, xmm0 - subss xmm5, xmm1 - subss xmm6, xmm2 - - ;# store dr - movss [esp + nb410_dx], xmm4 - movss [esp + nb410_dy], xmm5 - movss [esp + nb410_dz], xmm6 - ;# square it - mulss xmm4,xmm4 - mulss xmm5,xmm5 - mulss xmm6,xmm6 - addss xmm4, xmm5 - addss xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtss xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulss xmm5, xmm5 - movss xmm1, [esp + nb410_three] - mulss xmm5, xmm4 ;# rsq*lu*lu - movss xmm0, [esp + nb410_half] - subss xmm1, xmm5 ;# constant 30-rsq*lu*lu - mulss xmm1, xmm2 - mulss xmm0, xmm1 ;# xmm0=rinv - - mulss xmm4, xmm0 ;# xmm4=r - movss [esp + nb410_r], xmm4 - mulss xmm4, [esp + nb410_gbscale] - - cvttss2si ebx, xmm4 ;# mm6 contain lu indices - cvtsi2ss xmm6, ebx - subss xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulss xmm2, xmm2 ;# xmm2=eps2 - - shl ebx, 2 - mov esi, [ebp + nb410_GBtab] - - movaps xmm4, [esi + ebx*4] - movhlps xmm6, xmm4 - movaps xmm5, xmm4 - movaps xmm7, xmm6 - shufps xmm5, xmm5, 1 - shufps xmm7, xmm7, 1 - ;# table ready in xmm4-xmm7 - - mulss xmm6, xmm1 ;# xmm6=Geps - mulss xmm7, xmm2 ;# xmm7=Heps2 - addss xmm5, xmm6 - addss xmm5, xmm7 ;# xmm5=Fp - mulss xmm7, [esp + nb410_two] ;# two*Heps2 - movss xmm3, [esp + nb410_qq] - addss xmm7, xmm6 - addss xmm7, xmm5 ;# xmm7=FF - mulss xmm5, xmm1 ;# xmm5=eps*Fp - addss xmm5, xmm4 ;# xmm5=VV - mulss xmm5, xmm3 ;# vcoul=qq*VV - mulss xmm3, xmm7 ;# fijC=FF*qq - - movd ebx, mm0 - mov esi, [ebp + nb410_dvda] - - ;# Calculate dVda - xorps xmm7, xmm7 - mulss xmm3, [esp + nb410_gbscale] - movaps xmm6, xmm3 - mulss xmm6, [esp + nb410_r] - addss xmm6, xmm5 - addss xmm5, [esp + nb410_vctot] - movss [esp + nb410_vctot], xmm5 - - ;# xmm6=(vcoul+fijC*r) - subps xmm7, xmm6 - movaps xmm6, xmm7 - - ;# update dvdasum - addps xmm7, [esp + nb410_dvdasum] - movaps [esp + nb410_dvdasum], xmm7 - - ;# update j atoms dvdaj - addss xmm6, [esi + ebx*4] - movss [esi + ebx*4], xmm6 - - ;# L-J - movaps xmm4, xmm0 - mulss xmm4, xmm0 ;# xmm4=rinvsq - - movaps xmm6, xmm4 - mulss xmm6, xmm4 - - mulss xmm6, xmm4 ;# xmm6=rinvsix - movaps xmm4, xmm6 - mulss xmm4, xmm4 ;# xmm4=rinvtwelve - mulss xmm6, [esp + nb410_c6] - mulss xmm4, [esp + nb410_c12] - movss xmm7, [esp + nb410_Vvdwtot] - addss xmm7, xmm4 - mulss xmm4, [esp + nb410_twelve] - subss xmm7, xmm6 - mulss xmm6, [esp + nb410_six] - movss [esp + nb410_Vvdwtot], xmm7 - subss xmm4, xmm6 - mulss xmm4, xmm0 - subss xmm4, xmm3 - mulss xmm4, xmm0 - - movss xmm0, [esp + nb410_dx] - movss xmm1, [esp + nb410_dy] - movss xmm2, [esp + nb410_dz] - - mov edi, [ebp + nb410_faction] - mulss xmm0, xmm4 - mulss xmm1, xmm4 - mulss xmm2, xmm4 - ;# xmm0-xmm2 contains tx-tz (partial force) - ;# now update f_i - movss xmm3, [esp + nb410_fix] - movss xmm4, [esp + nb410_fiy] - movss xmm5, [esp + nb410_fiz] - addss xmm3, xmm0 - addss xmm4, xmm1 - addss xmm5, xmm2 - movss [esp + nb410_fix], xmm3 - movss [esp + nb410_fiy], xmm4 - movss [esp + nb410_fiz], xmm5 - ;# update fj - - movss xmm3, [edi + eax*4] - movss xmm4, [edi + eax*4 + 4] - movss xmm5, [edi + eax*4 + 8] - subss xmm3, xmm0 - subss xmm4, xmm1 - subss xmm5, xmm2 - movss [edi + eax*4], xmm3 - movss [edi + eax*4 + 4], xmm4 - movss [edi + eax*4 + 8], xmm5 -.nb410_updateouterdata: - mov ecx, [esp + nb410_ii3] - mov edi, [ebp + nb410_faction] - mov esi, [ebp + nb410_fshift] - mov edx, [esp + nb410_is3] - - ;# accumulate i forces in xmm0, xmm1, xmm2 - movaps xmm0, [esp + nb410_fix] - movaps xmm1, [esp + nb410_fiy] - movaps xmm2, [esp + nb410_fiz] - - movhlps xmm3, xmm0 - movhlps xmm4, xmm1 - movhlps xmm5, xmm2 - addps xmm0, xmm3 - addps xmm1, xmm4 - addps xmm2, xmm5 ;# sum is in 1/2 in xmm0-xmm2 - - movaps xmm3, xmm0 - movaps xmm4, xmm1 - movaps xmm5, xmm2 - - shufps xmm3, xmm3, 1 - shufps xmm4, xmm4, 1 - shufps xmm5, xmm5, 1 - addss xmm0, xmm3 - addss xmm1, xmm4 - addss xmm2, xmm5 ;# xmm0-xmm2 has single force in pos0 - - ;# increment i force - movss xmm3, [edi + ecx*4] - movss xmm4, [edi + ecx*4 + 4] - movss xmm5, [edi + ecx*4 + 8] - addss xmm3, xmm0 - addss xmm4, xmm1 - addss xmm5, xmm2 - movss [edi + ecx*4], xmm3 - movss [edi + ecx*4 + 4], xmm4 - movss [edi + ecx*4 + 8], xmm5 - - ;# increment fshift force - movss xmm3, [esi + edx*4] - movss xmm4, [esi + edx*4 + 4] - movss xmm5, [esi + edx*4 + 8] - addss xmm3, xmm0 - addss xmm4, xmm1 - addss xmm5, xmm2 - movss [esi + edx*4], xmm3 - movss [esi + edx*4 + 4], xmm4 - movss [esi + edx*4 + 8], xmm5 - - ;# get n from stack - mov esi, [esp + nb410_n] - ;# get group index for i particle - mov edx, [ebp + nb410_gid] ;# base of gid[] - mov edx, [edx + esi*4] ;# ggid=gid[n] - - ;# accumulate total potential energy and update it - movaps xmm7, [esp + nb410_vctot] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - ;# add earlier value from mem - mov eax, [ebp + nb410_Vc] - addss xmm7, [eax + edx*4] - ;# move back to mem - movss [eax + edx*4], xmm7 - - ;# accumulate total lj energy and update it - movaps xmm7, [esp + nb410_Vvdwtot] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - ;# add earlier value from mem - mov eax, [ebp + nb410_Vvdw] - addss xmm7, [eax + edx*4] - ;# move back to mem - movss [eax + edx*4], xmm7 - - ;# accumulate dVda and update it - movaps xmm7, [esp + nb410_dvdasum] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - mov edx, [esp + nb410_ii] - mov eax, [ebp + nb410_dvda] - addss xmm7, [eax + edx*4] - movss [eax + edx*4], xmm7 - - ;# finish if last - mov ecx, [esp + nb410_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb410_outerend - - ;# not last, iterate outer loop once more! - mov [esp + nb410_n], esi - jmp .nb410_outer -.nb410_outerend: - ;# check if more outer neighborlists remain - mov ecx, [esp + nb410_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb410_end - ;# non-zero, do one more workunit - jmp .nb410_threadloop -.nb410_end: - emms - - mov eax, [esp + nb410_nouter] - mov ebx, [esp + nb410_ninner] - mov ecx, [ebp + nb410_outeriter] - mov edx, [ebp + nb410_inneriter] - mov [ecx], eax - mov [edx], ebx - - mov eax, [esp + nb410_salign] - add esp, eax - add esp, 504 - pop edi - pop esi - pop edx - pop ecx - pop ebx - pop eax - leave - ret - - - -.globl nb_kernel410nf_ia32_sse -.globl _nb_kernel410nf_ia32_sse -nb_kernel410nf_ia32_sse: -_nb_kernel410nf_ia32_sse: -.equiv nb410nf_p_nri, 8 -.equiv nb410nf_iinr, 12 -.equiv nb410nf_jindex, 16 -.equiv nb410nf_jjnr, 20 -.equiv nb410nf_shift, 24 -.equiv nb410nf_shiftvec, 28 -.equiv nb410nf_fshift, 32 -.equiv nb410nf_gid, 36 -.equiv nb410nf_pos, 40 -.equiv nb410nf_faction, 44 -.equiv nb410nf_charge, 48 -.equiv nb410nf_p_facel, 52 -.equiv nb410nf_argkrf, 56 -.equiv nb410nf_argcrf, 60 -.equiv nb410nf_Vc, 64 -.equiv nb410nf_type, 68 -.equiv nb410nf_p_ntype, 72 -.equiv nb410nf_vdwparam, 76 -.equiv nb410nf_Vvdw, 80 -.equiv nb410nf_p_tabscale, 84 -.equiv nb410nf_VFtab, 88 -.equiv nb410nf_invsqrta, 92 -.equiv nb410nf_dvda, 96 -.equiv nb410nf_p_gbtabscale, 100 -.equiv nb410nf_GBtab, 104 -.equiv nb410nf_p_nthreads, 108 -.equiv nb410nf_count, 112 -.equiv nb410nf_mtx, 116 -.equiv nb410nf_outeriter, 120 -.equiv nb410nf_inneriter, 124 -.equiv nb410nf_work, 128 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse use -.equiv nb410nf_ix, 0 -.equiv nb410nf_iy, 16 -.equiv nb410nf_iz, 32 -.equiv nb410nf_iq, 48 -.equiv nb410nf_gbtsc, 64 -.equiv nb410nf_qq, 80 -.equiv nb410nf_c6, 96 -.equiv nb410nf_c12, 112 -.equiv nb410nf_vctot, 128 -.equiv nb410nf_Vvdwtot, 144 -.equiv nb410nf_half, 160 -.equiv nb410nf_three, 176 -.equiv nb410nf_isai, 192 -.equiv nb410nf_isaprod, 208 -.equiv nb410nf_gbscale, 224 -.equiv nb410nf_is3, 240 -.equiv nb410nf_ii3, 244 -.equiv nb410nf_ntia, 248 -.equiv nb410nf_innerjjnr, 252 -.equiv nb410nf_innerk, 256 -.equiv nb410nf_n, 260 -.equiv nb410nf_nn1, 264 -.equiv nb410nf_nri, 268 -.equiv nb410nf_facel, 272 -.equiv nb410nf_ntype, 276 -.equiv nb410nf_nouter, 280 -.equiv nb410nf_ninner, 284 -.equiv nb410nf_salign, 288 - push ebp - mov ebp,esp - push eax - push ebx - push ecx - push edx - push esi - push edi - sub esp, 292 ;# local stack space - mov eax, esp - and eax, 0xf - sub esp, eax - mov [esp + nb410nf_salign], eax - - emms - - ;# Move args passed by reference to stack - mov ecx, [ebp + nb410nf_p_nri] - mov esi, [ebp + nb410nf_p_facel] - mov edi, [ebp + nb410nf_p_ntype] - mov ecx, [ecx] - mov esi, [esi] - mov edi, [edi] - mov [esp + nb410nf_nri], ecx - mov [esp + nb410nf_facel], esi - mov [esp + nb410nf_ntype], edi - - ;# zero iteration counters - mov eax, 0 - mov [esp + nb410nf_nouter], eax - mov [esp + nb410nf_ninner], eax - - - mov eax, [ebp + nb410nf_p_gbtabscale] - movss xmm5, [eax] - shufps xmm5, xmm5, 0 - movaps [esp + nb410nf_gbtsc], xmm5 - - ;# create constant floating-point factors on stack - mov eax, 0x3f000000 ;# constant 0.5 in IEEE (hex) - mov [esp + nb410nf_half], eax - movss xmm1, [esp + nb410nf_half] - shufps xmm1, xmm1, 0 ;# splat to all elements - movaps xmm2, xmm1 - addps xmm2, xmm2 ;# constant 1.0 - movaps xmm3, xmm2 - addps xmm2, xmm2 ;# constant 2.0 - addps xmm3, xmm2 ;# constant 3.0 - movaps [esp + nb410nf_half], xmm1 - movaps [esp + nb410nf_three], xmm3 - -.nb410nf_threadloop: - mov esi, [ebp + nb410nf_count] ;# pointer to sync counter - mov eax, [esi] -.nb410nf_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb410nf_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [esp + nb410nf_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [esp + nb410nf_n], eax - mov [esp + nb410nf_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb410nf_outerstart - jmp .nb410nf_end - -.nb410nf_outerstart: - ;# ebx contains number of outer iterations - add ebx, [esp + nb410nf_nouter] - mov [esp + nb410nf_nouter], ebx - -.nb410nf_outer: - mov eax, [ebp + nb410nf_shift] ;# eax = pointer into shift[] - mov ebx, [eax+esi*4] ;# ebx=shift[n] - - lea ebx, [ebx + ebx*2] ;# ebx=3*is - mov [esp + nb410nf_is3],ebx ;# store is3 - - mov eax, [ebp + nb410nf_shiftvec] ;# eax = base of shiftvec[] - - movss xmm0, [eax + ebx*4] - movss xmm1, [eax + ebx*4 + 4] - movss xmm2, [eax + ebx*4 + 8] - - mov ecx, [ebp + nb410nf_iinr] ;# ecx = pointer into iinr[] - mov ebx, [ecx + esi*4] ;# ebx =ii - - mov edx, [ebp + nb410nf_charge] - movss xmm3, [edx + ebx*4] - mulss xmm3, [esp + nb410nf_facel] - shufps xmm3, xmm3, 0 - - mov edx, [ebp + nb410nf_invsqrta] ;# load invsqrta[ii] - movss xmm4, [edx + ebx*4] - shufps xmm4, xmm4, 0 - - mov edx, [ebp + nb410nf_type] - mov edx, [edx + ebx*4] - imul edx, [esp + nb410nf_ntype] - shl edx, 1 - mov [esp + nb410nf_ntia], edx - - lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3 - mov eax, [ebp + nb410nf_pos] ;# eax = base of pos[] - - addss xmm0, [eax + ebx*4] - addss xmm1, [eax + ebx*4 + 4] - addss xmm2, [eax + ebx*4 + 8] - - movaps [esp + nb410nf_iq], xmm3 - movaps [esp + nb410nf_isai], xmm4 - - shufps xmm0, xmm0, 0 - shufps xmm1, xmm1, 0 - shufps xmm2, xmm2, 0 - - movaps [esp + nb410nf_ix], xmm0 - movaps [esp + nb410nf_iy], xmm1 - movaps [esp + nb410nf_iz], xmm2 - - mov [esp + nb410nf_ii3], ebx - - ;# clear vctot - xorps xmm4, xmm4 - movaps [esp + nb410nf_vctot], xmm4 - movaps [esp + nb410nf_Vvdwtot], xmm4 - - mov eax, [ebp + nb410nf_jindex] - mov ecx, [eax + esi*4] ;# jindex[n] - mov edx, [eax + esi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov esi, [ebp + nb410nf_pos] - mov edi, [ebp + nb410nf_faction] - mov eax, [ebp + nb410nf_jjnr] - shl ecx, 2 - add eax, ecx - mov [esp + nb410nf_innerjjnr], eax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 4 - add ecx, [esp + nb410nf_ninner] - mov [esp + nb410nf_ninner], ecx - add edx, 0 - mov [esp + nb410nf_innerk], edx ;# number of innerloop atoms - jge .nb410nf_unroll_loop - jmp .nb410nf_finish_inner -.nb410nf_unroll_loop: - ;# quad-unroll innerloop here - mov edx, [esp + nb410nf_innerjjnr] ;# pointer to jjnr[k] - mov eax, [edx] - mov ebx, [edx + 4] - mov ecx, [edx + 8] - mov edx, [edx + 12] ;# eax-edx=jnr1-4 - add dword ptr [esp + nb410nf_innerjjnr], 16 ;# advance pointer (unrolled 4) - - ;# load isa2 - mov esi, [ebp + nb410nf_invsqrta] - movss xmm3, [esi + eax*4] - movss xmm4, [esi + ecx*4] - movss xmm6, [esi + ebx*4] - movss xmm7, [esi + edx*4] - movaps xmm2, [esp + nb410nf_isai] - shufps xmm3, xmm6, 0 - shufps xmm4, xmm7, 0 - shufps xmm3, xmm4, 136 ;# constant 10001000 ;# all charges in xmm3 - mulps xmm2, xmm3 - - movaps [esp + nb410nf_isaprod], xmm2 - movaps xmm1, xmm2 - mulps xmm1, [esp + nb410nf_gbtsc] - movaps [esp + nb410nf_gbscale], xmm1 - - mov esi, [ebp + nb410nf_charge] ;# base of charge[] - - movss xmm3, [esi + eax*4] - movss xmm4, [esi + ecx*4] - movss xmm6, [esi + ebx*4] - movss xmm7, [esi + edx*4] - - mulps xmm2, [esp + nb410nf_iq] - shufps xmm3, xmm6, 0 - shufps xmm4, xmm7, 0 - shufps xmm3, xmm4, 136 ;# constant 10001000 ;# all charges in xmm3 - mulps xmm3, xmm2 - movaps [esp + nb410nf_qq], xmm3 - - movd mm0, eax - movd mm1, ebx - movd mm2, ecx - movd mm3, edx - - mov esi, [ebp + nb410nf_type] - mov eax, [esi + eax*4] - mov ebx, [esi + ebx*4] - mov ecx, [esi + ecx*4] - mov edx, [esi + edx*4] - mov esi, [ebp + nb410nf_vdwparam] - shl eax, 1 - shl ebx, 1 - shl ecx, 1 - shl edx, 1 - mov edi, [esp + nb410nf_ntia] - add eax, edi - add ebx, edi - add ecx, edi - add edx, edi - - movlps xmm6, [esi + eax*4] - movlps xmm7, [esi + ecx*4] - movhps xmm6, [esi + ebx*4] - movhps xmm7, [esi + edx*4] - - movaps xmm4, xmm6 - shufps xmm4, xmm7, 136 ;# constant 10001000 - shufps xmm6, xmm7, 221 ;# constant 11011101 - - movd eax, mm0 - movd ebx, mm1 - movd ecx, mm2 - movd edx, mm3 - - movaps [esp + nb410nf_c6], xmm4 - movaps [esp + nb410nf_c12], xmm6 - - mov esi, [ebp + nb410nf_pos] ;# base of pos[] - - lea eax, [eax + eax*2] ;# replace jnr with j3 - lea ebx, [ebx + ebx*2] - - lea ecx, [ecx + ecx*2] ;# replace jnr with j3 - lea edx, [edx + edx*2] - - ;# move four coordinates to xmm0-xmm2 - - movlps xmm4, [esi + eax*4] - movlps xmm5, [esi + ecx*4] - movss xmm2, [esi + eax*4 + 8] - movss xmm6, [esi + ecx*4 + 8] - - movhps xmm4, [esi + ebx*4] - movhps xmm5, [esi + edx*4] - - movss xmm0, [esi + ebx*4 + 8] - movss xmm1, [esi + edx*4 + 8] - - shufps xmm2, xmm0, 0 - shufps xmm6, xmm1, 0 - - movaps xmm0, xmm4 - movaps xmm1, xmm4 - - shufps xmm2, xmm6, 136 ;# constant 10001000 - - shufps xmm0, xmm5, 136 ;# constant 10001000 - shufps xmm1, xmm5, 221 ;# constant 11011101 - - ;# move ix-iz to xmm4-xmm6 - movaps xmm4, [esp + nb410nf_ix] - movaps xmm5, [esp + nb410nf_iy] - movaps xmm6, [esp + nb410nf_iz] - - ;# calc dr - subps xmm4, xmm0 - subps xmm5, xmm1 - subps xmm6, xmm2 - - ;# square it - mulps xmm4,xmm4 - mulps xmm5,xmm5 - mulps xmm6,xmm6 - addps xmm4, xmm5 - addps xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [esp + nb410nf_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [esp + nb410nf_half] - subps xmm1, xmm5 ;# constant 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - mulps xmm4, [esp + nb410nf_gbscale] - - movhlps xmm5, xmm4 - cvttps2pi mm6, xmm4 - cvttps2pi mm7, xmm5 ;# mm6/mm7 contain lu indices - cvtpi2ps xmm6, mm6 - cvtpi2ps xmm5, mm7 - movlhps xmm6, xmm5 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - pslld mm6, 2 - pslld mm7, 2 - - movd mm0, eax - movd mm1, ebx - movd mm2, ecx - movd mm3, edx - - mov esi, [ebp + nb410nf_GBtab] - movd eax, mm6 - psrlq mm6, 32 - movd ecx, mm7 - psrlq mm7, 32 - movd ebx, mm6 - movd edx, mm7 - - ;# load coulomb table - movaps xmm4, [esi + eax*4] - movaps xmm5, [esi + ebx*4] - movaps xmm6, [esi + ecx*4] - movaps xmm7, [esi + edx*4] - ;# transpose, using xmm3 for scratch - movaps xmm3, xmm6 - shufps xmm3, xmm7, 0xEE - shufps xmm6, xmm7, 0x44 - movaps xmm7, xmm4 - shufps xmm7, xmm5, 0xEE - shufps xmm4, xmm5, 0x44 - movaps xmm5, xmm4 - shufps xmm5, xmm6, 0xDD - shufps xmm4, xmm6, 0x88 - movaps xmm6, xmm7 - shufps xmm6, xmm3, 0x88 - shufps xmm7, xmm3, 0xDD - ;# coulomb table ready, in xmm4-xmm7 - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - movaps xmm3, [esp + nb410nf_qq] - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, xmm3 ;# vcoul=qq*VV - ;# update vctot - addps xmm5, [esp + nb410nf_vctot] - movaps [esp + nb410nf_vctot], xmm5 - - ;# L-J - movaps xmm4, xmm0 - mulps xmm4, xmm0 ;# xmm4=rinvsq - - movaps xmm6, xmm4 - mulps xmm6, xmm4 - - mulps xmm6, xmm4 ;# xmm6=rinvsix - movaps xmm4, xmm6 - mulps xmm4, xmm4 ;# xmm4=rinvtwelve - mulps xmm6, [esp + nb410nf_c6] - mulps xmm4, [esp + nb410nf_c12] - movaps xmm7, [esp + nb410nf_Vvdwtot] - addps xmm7, xmm4 - subps xmm7, xmm6 - movaps [esp + nb410nf_Vvdwtot], xmm7 - - ;# should we do one more iteration? - sub dword ptr [esp + nb410nf_innerk], 4 - jl .nb410nf_finish_inner - jmp .nb410nf_unroll_loop -.nb410nf_finish_inner: - ;# check if at least two particles remain - add dword ptr [esp + nb410nf_innerk], 4 - mov edx, [esp + nb410nf_innerk] - and edx, 2 - jnz .nb410nf_dopair - jmp .nb410nf_checksingle -.nb410nf_dopair: - mov ecx, [esp + nb410nf_innerjjnr] - mov eax, [ecx] - mov ebx, [ecx + 4] - add dword ptr [esp + nb410nf_innerjjnr], 8 - - xorps xmm2, xmm2 - movaps xmm6, xmm2 - - ;# load isa2 - mov esi, [ebp + nb410nf_invsqrta] - movss xmm2, [esi + eax*4] - movss xmm3, [esi + ebx*4] - unpcklps xmm2, xmm3 ;# isa2 in xmm3(0,1) - mulps xmm2, [esp + nb410nf_isai] - movaps [esp + nb410nf_isaprod], xmm2 - movaps xmm1, xmm2 - mulps xmm1, [esp + nb410nf_gbtsc] - movaps [esp + nb410nf_gbscale], xmm1 - - mov esi, [ebp + nb410nf_charge] ;# base of charge[] - movss xmm3, [esi + eax*4] - movss xmm6, [esi + ebx*4] - unpcklps xmm3, xmm6 ;# constant 00001000 ;# xmm3(0,1) has the charges - - mulps xmm2, [esp + nb410nf_iq] - mulps xmm3, xmm2 - movaps [esp + nb410nf_qq], xmm3 - - mov esi, [ebp + nb410nf_type] - mov ecx, eax - mov edx, ebx - mov ecx, [esi + ecx*4] - mov edx, [esi + edx*4] - mov esi, [ebp + nb410nf_vdwparam] - shl ecx, 1 - shl edx, 1 - mov edi, [esp + nb410nf_ntia] - add ecx, edi - add edx, edi - movlps xmm6, [esi + ecx*4] - movhps xmm6, [esi + edx*4] - mov edi, [ebp + nb410nf_pos] - - movaps xmm4, xmm6 - shufps xmm4, xmm4, 8 ;# constant 00001000 - shufps xmm6, xmm6, 13 ;# constant 00001101 - movlhps xmm4, xmm7 - movlhps xmm6, xmm7 - - movaps [esp + nb410nf_c6], xmm4 - movaps [esp + nb410nf_c12], xmm6 - - lea eax, [eax + eax*2] - lea ebx, [ebx + ebx*2] - ;# move coordinates to xmm0-xmm2 - movlps xmm1, [edi + eax*4] - movss xmm2, [edi + eax*4 + 8] - movhps xmm1, [edi + ebx*4] - movss xmm0, [edi + ebx*4 + 8] - - movlhps xmm3, xmm7 - - shufps xmm2, xmm0, 0 - - movaps xmm0, xmm1 - - shufps xmm2, xmm2, 136 ;# constant 10001000 - - shufps xmm0, xmm0, 136 ;# constant 10001000 - shufps xmm1, xmm1, 221 ;# constant 11011101 - - mov edi, [ebp + nb410nf_faction] - ;# move ix-iz to xmm4-xmm6 - xorps xmm7, xmm7 - - movaps xmm4, [esp + nb410nf_ix] - movaps xmm5, [esp + nb410nf_iy] - movaps xmm6, [esp + nb410nf_iz] - - ;# calc dr - subps xmm4, xmm0 - subps xmm5, xmm1 - subps xmm6, xmm2 - - ;# square it - mulps xmm4,xmm4 - mulps xmm5,xmm5 - mulps xmm6,xmm6 - addps xmm4, xmm5 - addps xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [esp + nb410nf_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [esp + nb410nf_half] - subps xmm1, xmm5 ;# constant 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - mulps xmm4, [esp + nb410nf_gbscale] - - cvttps2pi mm6, xmm4 ;# mm6 contain lu indices - cvtpi2ps xmm6, mm6 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - - pslld mm6, 2 - - mov esi, [ebp + nb410nf_GBtab] - movd ecx, mm6 - psrlq mm6, 32 - movd edx, mm6 - - ;# load coulomb table - movaps xmm4, [esi + ecx*4] - movaps xmm7, [esi + edx*4] - ;# transpose, using xmm3 for scratch - movaps xmm6, xmm4 - unpcklps xmm4, xmm7 ;# Y1 Y2 F1 F2 - unpckhps xmm6, xmm7 ;# G1 G2 H1 H2 - movhlps xmm5, xmm4 ;# F1 F2 - movhlps xmm7, xmm6 ;# H1 H2 - ;# coulomb table ready, in xmm4-xmm7 - - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - movaps xmm3, [esp + nb410nf_qq] - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, xmm3 ;# vcoul=qq*VV - - addps xmm5, [esp + nb410nf_vctot] - movaps [esp + nb410nf_vctot], xmm5 - - ;# L-J - movaps xmm4, xmm0 - mulps xmm4, xmm0 ;# xmm4=rinvsq - - ;# at this point mm5 contains vcoul and mm3 fijC - ;# increment vcoul - then we can get rid of mm5 - ;# update vctot - - movaps xmm6, xmm4 - mulps xmm6, xmm4 - - mulps xmm6, xmm4 ;# xmm6=rinvsix - movaps xmm4, xmm6 - mulps xmm4, xmm4 ;# xmm4=rinvtwelve - mulps xmm6, [esp + nb410nf_c6] - mulps xmm4, [esp + nb410nf_c12] - movaps xmm7, [esp + nb410nf_Vvdwtot] - addps xmm7, xmm4 - subps xmm7, xmm6 - movaps [esp + nb410nf_Vvdwtot], xmm7 - -.nb410nf_checksingle: - mov edx, [esp + nb410nf_innerk] - and edx, 1 - jnz .nb410nf_dosingle - jmp .nb410nf_updateouterdata -.nb410nf_dosingle: - mov esi, [ebp + nb410nf_charge] - mov edx, [ebp + nb410nf_invsqrta] - mov edi, [ebp + nb410nf_pos] - mov ecx, [esp + nb410nf_innerjjnr] - mov eax, [ecx] - xorps xmm2, xmm2 - movaps xmm6, xmm2 - movss xmm2, [edx + eax*4] ;# isa2 - mulss xmm2, [esp + nb410nf_isai] - movss [esp + nb410nf_isaprod], xmm2 - movss xmm1, xmm2 - mulss xmm1, [esp + nb410nf_gbtsc] - movss [esp + nb410nf_gbscale], xmm1 - - mulss xmm2, [esp + nb410nf_iq] - movss xmm6, [esi + eax*4] ;# xmm6(0) has the charge - mulss xmm6, xmm2 - movss [esp + nb410nf_qq], xmm6 - - mov esi, [ebp + nb410nf_type] - mov ecx, eax - mov ecx, [esi + ecx*4] - mov esi, [ebp + nb410nf_vdwparam] - shl ecx, 1 - add ecx, [esp + nb410nf_ntia] - movlps xmm6, [esi + ecx*4] - movaps xmm4, xmm6 - shufps xmm4, xmm4, 252 ;# constant 11111100 - shufps xmm6, xmm6, 253 ;# constant 11111101 - - movaps [esp + nb410nf_c6], xmm4 - movaps [esp + nb410nf_c12], xmm6 - - lea eax, [eax + eax*2] - - ;# move coordinates to xmm0-xmm2 - movss xmm0, [edi + eax*4] - movss xmm1, [edi + eax*4 + 4] - movss xmm2, [edi + eax*4 + 8] - - movaps xmm4, [esp + nb410nf_ix] - movaps xmm5, [esp + nb410nf_iy] - movaps xmm6, [esp + nb410nf_iz] - - ;# calc dr - subss xmm4, xmm0 - subss xmm5, xmm1 - subss xmm6, xmm2 - - ;# square it - mulss xmm4,xmm4 - mulss xmm5,xmm5 - mulss xmm6,xmm6 - addss xmm4, xmm5 - addss xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtss xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulss xmm5, xmm5 - movss xmm1, [esp + nb410nf_three] - mulss xmm5, xmm4 ;# rsq*lu*lu - movss xmm0, [esp + nb410nf_half] - subss xmm1, xmm5 ;# constant 30-rsq*lu*lu - mulss xmm1, xmm2 - mulss xmm0, xmm1 ;# xmm0=rinv - - mulss xmm4, xmm0 ;# xmm4=r - mulss xmm4, [esp + nb410nf_gbscale] - - cvttss2si ebx, xmm4 ;# mm6 contain lu indices - cvtsi2ss xmm6, ebx - subss xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulss xmm2, xmm2 ;# xmm2=eps2 - - shl ebx, 2 - mov esi, [ebp + nb410nf_GBtab] - - movaps xmm4, [esi + ebx*4] - movhlps xmm6, xmm4 - movaps xmm5, xmm4 - movaps xmm7, xmm6 - shufps xmm5, xmm5, 1 - shufps xmm7, xmm7, 1 - ;# table ready in xmm4-xmm7 - - mulss xmm6, xmm1 ;# xmm6=Geps - mulss xmm7, xmm2 ;# xmm7=Heps2 - addss xmm5, xmm6 - addss xmm5, xmm7 ;# xmm5=Fp - movss xmm3, [esp + nb410nf_qq] - mulss xmm5, xmm1 ;# xmm5=eps*Fp - addss xmm5, xmm4 ;# xmm5=VV - mulss xmm5, xmm3 ;# vcoul=qq*VV - addss xmm5, [esp + nb410nf_vctot] - movss [esp + nb410nf_vctot], xmm5 - - ;# L-J - movaps xmm4, xmm0 - mulss xmm4, xmm0 ;# xmm4=rinvsq - - movaps xmm6, xmm4 - mulss xmm6, xmm4 - - mulss xmm6, xmm4 ;# xmm6=rinvsix - movaps xmm4, xmm6 - mulss xmm4, xmm4 ;# xmm4=rinvtwelve - mulss xmm6, [esp + nb410nf_c6] - mulss xmm4, [esp + nb410nf_c12] - movss xmm7, [esp + nb410nf_Vvdwtot] - addps xmm7, xmm4 - subps xmm7, xmm6 - movss [esp + nb410nf_Vvdwtot], xmm7 - -.nb410nf_updateouterdata: - ;# get n from stack - mov esi, [esp + nb410nf_n] - ;# get group index for i particle - mov edx, [ebp + nb410nf_gid] ;# base of gid[] - mov edx, [edx + esi*4] ;# ggid=gid[n] - - ;# accumulate total potential energy and update it - movaps xmm7, [esp + nb410nf_vctot] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - ;# add earlier value from mem - mov eax, [ebp + nb410nf_Vc] - addss xmm7, [eax + edx*4] - ;# move back to mem - movss [eax + edx*4], xmm7 - - ;# accumulate total lj energy and update it - movaps xmm7, [esp + nb410nf_Vvdwtot] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - ;# add earlier value from mem - mov eax, [ebp + nb410nf_Vvdw] - addss xmm7, [eax + edx*4] - ;# move back to mem - movss [eax + edx*4], xmm7 - - ;# finish if last - mov ecx, [esp + nb410nf_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb410nf_outerend - - ;# not last, iterate outer loop once more! - mov [esp + nb410nf_n], esi - jmp .nb410nf_outer -.nb410nf_outerend: - ;# check if more outer neighborlists remain - mov ecx, [esp + nb410nf_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb410nf_end - ;# non-zero, do one more workunit - jmp .nb410nf_threadloop -.nb410nf_end: - emms - - mov eax, [esp + nb410nf_nouter] - mov ebx, [esp + nb410nf_ninner] - mov ecx, [ebp + nb410nf_outeriter] - mov edx, [ebp + nb410nf_inneriter] - mov [ecx], eax - mov [edx], ebx - - mov eax, [esp + nb410nf_salign] - add esp, eax - add esp, 292 - pop edi - pop esi - pop edx - pop ecx - pop ebx - pop eax - leave - ret diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel410_ia32_sse.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel410_ia32_sse.s deleted file mode 100644 index 0c05ad5c91..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel410_ia32_sse.s +++ /dev/null @@ -1,2022 +0,0 @@ -## -## -## Gromacs 4.0 Copyright (c) 1991-2003 -## David van der Spoel, Erik Lindahl -## -## This program is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License -## as published by the Free Software Foundation; either version 2 -## of the License, or (at your option) any later version. -## -## To help us fund GROMACS development, we humbly ask that you cite -## the research papers on the package. Check out http://www.gromacs.org -## -## And Hey: -## Gnomes, ROck Monsters And Chili Sauce -## - - - -.globl nb_kernel410_ia32_sse -.globl _nb_kernel410_ia32_sse -nb_kernel410_ia32_sse: -_nb_kernel410_ia32_sse: -.set nb410_p_nri, 8 -.set nb410_iinr, 12 -.set nb410_jindex, 16 -.set nb410_jjnr, 20 -.set nb410_shift, 24 -.set nb410_shiftvec, 28 -.set nb410_fshift, 32 -.set nb410_gid, 36 -.set nb410_pos, 40 -.set nb410_faction, 44 -.set nb410_charge, 48 -.set nb410_p_facel, 52 -.set nb410_argkrf, 56 -.set nb410_argcrf, 60 -.set nb410_Vc, 64 -.set nb410_type, 68 -.set nb410_p_ntype, 72 -.set nb410_vdwparam, 76 -.set nb410_Vvdw, 80 -.set nb410_p_tabscale, 84 -.set nb410_VFtab, 88 -.set nb410_invsqrta, 92 -.set nb410_dvda, 96 -.set nb410_p_gbtabscale, 100 -.set nb410_GBtab, 104 -.set nb410_p_nthreads, 108 -.set nb410_count, 112 -.set nb410_mtx, 116 -.set nb410_outeriter, 120 -.set nb410_inneriter, 124 -.set nb410_work, 128 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse use -.set nb410_ix, 0 -.set nb410_iy, 16 -.set nb410_iz, 32 -.set nb410_iq, 48 -.set nb410_dx, 64 -.set nb410_dy, 80 -.set nb410_dz, 96 -.set nb410_two, 112 -.set nb410_six, 128 -.set nb410_twelve, 144 -.set nb410_gbtsc, 160 -.set nb410_qq, 176 -.set nb410_c6, 192 -.set nb410_c12, 208 -.set nb410_fscal, 224 -.set nb410_vctot, 240 -.set nb410_Vvdwtot, 256 -.set nb410_fix, 272 -.set nb410_fiy, 288 -.set nb410_fiz, 304 -.set nb410_half, 320 -.set nb410_three, 336 -.set nb410_r, 352 -.set nb410_isai, 368 -.set nb410_isaprod, 384 -.set nb410_dvdasum, 400 -.set nb410_gbscale, 416 -.set nb410_is3, 432 -.set nb410_ii3, 436 -.set nb410_ii, 440 -.set nb410_ntia, 444 -.set nb410_innerjjnr, 448 -.set nb410_innerk, 452 -.set nb410_n, 456 -.set nb410_nn1, 460 -.set nb410_jnra, 464 -.set nb410_jnrb, 468 -.set nb410_jnrc, 472 -.set nb410_jnrd, 476 -.set nb410_nri, 480 -.set nb410_facel, 484 -.set nb410_ntype, 488 -.set nb410_nouter, 492 -.set nb410_ninner, 496 -.set nb410_salign, 500 - pushl %ebp - movl %esp,%ebp - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - pushl %esi - pushl %edi - subl $504,%esp ## local stack space - movl %esp,%eax - andl $0xf,%eax - subl %eax,%esp - movl %eax,nb410_salign(%esp) - - emms - - ## Move args passed by reference to stack - movl nb410_p_nri(%ebp),%ecx - movl nb410_p_facel(%ebp),%esi - movl nb410_p_ntype(%ebp),%edi - movl (%ecx),%ecx - movl (%esi),%esi - movl (%edi),%edi - movl %ecx,nb410_nri(%esp) - movl %esi,nb410_facel(%esp) - movl %edi,nb410_ntype(%esp) - - ## zero iteration counters - movl $0,%eax - movl %eax,nb410_nouter(%esp) - movl %eax,nb410_ninner(%esp) - - - movl nb410_p_gbtabscale(%ebp),%eax - movss (%eax),%xmm5 - shufps $0,%xmm5,%xmm5 - movaps %xmm5,nb410_gbtsc(%esp) - - ## create constant floating-point factors on stack - movl $0x3f000000,%eax ## constant 0.5 in IEEE (hex) - movl %eax,nb410_half(%esp) - movss nb410_half(%esp),%xmm1 - shufps $0,%xmm1,%xmm1 ## splat to all elements - movaps %xmm1,%xmm2 - addps %xmm2,%xmm2 ## constant 1.0 - movaps %xmm2,%xmm3 - addps %xmm2,%xmm2 ## constant 2.0 - addps %xmm2,%xmm3 ## constant 3.0 - movaps %xmm3,%xmm4 - addps %xmm4,%xmm4 ## 6.0 - movaps %xmm4,%xmm5 - addps %xmm5,%xmm5 ## constant 12.0 - movaps %xmm1,nb410_half(%esp) - movaps %xmm2,nb410_two(%esp) - movaps %xmm3,nb410_three(%esp) - movaps %xmm4,nb410_six(%esp) - movaps %xmm5,nb410_twelve(%esp) - -_nb_kernel410_ia32_sse.nb410_threadloop: - movl nb410_count(%ebp),%esi ## pointer to sync counter - movl (%esi),%eax -_nb_kernel410_ia32_sse.nb410_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%esi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel410_ia32_sse.nb410_spinlock - - ## if(nn1>nri) nn1=nri - movl nb410_nri(%esp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb410_n(%esp) - movl %ebx,nb410_nn1(%esp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel410_ia32_sse.nb410_outerstart - jmp _nb_kernel410_ia32_sse.nb410_end - -_nb_kernel410_ia32_sse.nb410_outerstart: - ## ebx contains number of outer iterations - addl nb410_nouter(%esp),%ebx - movl %ebx,nb410_nouter(%esp) - -_nb_kernel410_ia32_sse.nb410_outer: - movl nb410_shift(%ebp),%eax ## eax = pointer into shift[] - movl (%eax,%esi,4),%ebx ## ebx=shift[n] - - leal (%ebx,%ebx,2),%ebx ## ebx=3*is - movl %ebx,nb410_is3(%esp) ## store is3 - - movl nb410_shiftvec(%ebp),%eax ## eax = base of shiftvec[] - - movss (%eax,%ebx,4),%xmm0 - movss 4(%eax,%ebx,4),%xmm1 - movss 8(%eax,%ebx,4),%xmm2 - - movl nb410_iinr(%ebp),%ecx ## ecx = pointer into iinr[] - movl (%ecx,%esi,4),%ebx ## ebx =ii - movl %ebx,nb410_ii(%esp) - - movl nb410_charge(%ebp),%edx - movss (%edx,%ebx,4),%xmm3 - mulss nb410_facel(%esp),%xmm3 - shufps $0,%xmm3,%xmm3 - - movl nb410_invsqrta(%ebp),%edx ## load invsqrta[ii] - movss (%edx,%ebx,4),%xmm4 - shufps $0,%xmm4,%xmm4 - - movl nb410_type(%ebp),%edx - movl (%edx,%ebx,4),%edx - imull nb410_ntype(%esp),%edx - shll %edx - movl %edx,nb410_ntia(%esp) - - leal (%ebx,%ebx,2),%ebx ## ebx = 3*ii=ii3 - movl nb410_pos(%ebp),%eax ## eax = base of pos[] - - addss (%eax,%ebx,4),%xmm0 - addss 4(%eax,%ebx,4),%xmm1 - addss 8(%eax,%ebx,4),%xmm2 - - movaps %xmm3,nb410_iq(%esp) - movaps %xmm4,nb410_isai(%esp) - - shufps $0,%xmm0,%xmm0 - shufps $0,%xmm1,%xmm1 - shufps $0,%xmm2,%xmm2 - - movaps %xmm0,nb410_ix(%esp) - movaps %xmm1,nb410_iy(%esp) - movaps %xmm2,nb410_iz(%esp) - - movl %ebx,nb410_ii3(%esp) - - ## clear vctot and i forces - xorps %xmm4,%xmm4 - movaps %xmm4,nb410_vctot(%esp) - movaps %xmm4,nb410_Vvdwtot(%esp) - movaps %xmm4,nb410_dvdasum(%esp) - movaps %xmm4,nb410_fix(%esp) - movaps %xmm4,nb410_fiy(%esp) - movaps %xmm4,nb410_fiz(%esp) - - movl nb410_jindex(%ebp),%eax - movl (%eax,%esi,4),%ecx ## jindex[n] - movl 4(%eax,%esi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movl nb410_pos(%ebp),%esi - movl nb410_faction(%ebp),%edi - movl nb410_jjnr(%ebp),%eax - shll $2,%ecx - addl %ecx,%eax - movl %eax,nb410_innerjjnr(%esp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $4,%edx - addl nb410_ninner(%esp),%ecx - movl %ecx,nb410_ninner(%esp) - addl $0,%edx - movl %edx,nb410_innerk(%esp) ## number of innerloop atoms - jge _nb_kernel410_ia32_sse.nb410_unroll_loop - jmp _nb_kernel410_ia32_sse.nb410_finish_inner -_nb_kernel410_ia32_sse.nb410_unroll_loop: - ## quad-unroll innerloop here - movl nb410_innerjjnr(%esp),%edx ## pointer to jjnr[k] - movl (%edx),%eax - movl 4(%edx),%ebx - movl 8(%edx),%ecx - movl 12(%edx),%edx ## eax-edx=jnr1-4 - addl $16,nb410_innerjjnr(%esp) ## advance pointer (unrolled 4) - - ## load isaj - movl nb410_invsqrta(%ebp),%esi - movss (%esi,%eax,4),%xmm3 - movss (%esi,%ecx,4),%xmm4 - movss (%esi,%ebx,4),%xmm6 - movss (%esi,%edx,4),%xmm7 - movaps nb410_isai(%esp),%xmm2 - shufps $0,%xmm6,%xmm3 - shufps $0,%xmm7,%xmm4 - shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all isaj in xmm3 - mulps %xmm3,%xmm2 - - movaps %xmm2,nb410_isaprod(%esp) - movaps %xmm2,%xmm1 - mulps nb410_gbtsc(%esp),%xmm1 - movaps %xmm1,nb410_gbscale(%esp) - - movl nb410_charge(%ebp),%esi ## base of charge[] - - movss (%esi,%eax,4),%xmm3 - movss (%esi,%ecx,4),%xmm4 - movss (%esi,%ebx,4),%xmm6 - movss (%esi,%edx,4),%xmm7 - - mulps nb410_iq(%esp),%xmm2 - shufps $0,%xmm6,%xmm3 - shufps $0,%xmm7,%xmm4 - shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all charges in xmm3 - mulps %xmm2,%xmm3 - movaps %xmm3,nb410_qq(%esp) - - movd %eax,%mm0 - movd %ebx,%mm1 - movd %ecx,%mm2 - movd %edx,%mm3 - - movl nb410_type(%ebp),%esi - movl (%esi,%eax,4),%eax - movl (%esi,%ebx,4),%ebx - movl (%esi,%ecx,4),%ecx - movl (%esi,%edx,4),%edx - movl nb410_vdwparam(%ebp),%esi - shll %eax - shll %ebx - shll %ecx - shll %edx - movl nb410_ntia(%esp),%edi - addl %edi,%eax - addl %edi,%ebx - addl %edi,%ecx - addl %edi,%edx - - movlps (%esi,%eax,4),%xmm6 - movlps (%esi,%ecx,4),%xmm7 - movhps (%esi,%ebx,4),%xmm6 - movhps (%esi,%edx,4),%xmm7 - - movaps %xmm6,%xmm4 - shufps $136,%xmm7,%xmm4 ## constant 10001000 - shufps $221,%xmm7,%xmm6 ## constant 11011101 - - movd %mm0,%eax - movd %mm1,%ebx - movd %mm2,%ecx - movd %mm3,%edx - - movaps %xmm4,nb410_c6(%esp) - movaps %xmm6,nb410_c12(%esp) - - movl nb410_pos(%ebp),%esi ## base of pos[] - - movl %eax,nb410_jnra(%esp) - movl %ebx,nb410_jnrb(%esp) - movl %ecx,nb410_jnrc(%esp) - movl %edx,nb410_jnrd(%esp) - - leal (%eax,%eax,2),%eax ## replace jnr with j3 - leal (%ebx,%ebx,2),%ebx - - leal (%ecx,%ecx,2),%ecx ## replace jnr with j3 - leal (%edx,%edx,2),%edx - - ## move four coordinates to xmm0-xmm2 - - movlps (%esi,%eax,4),%xmm4 - movlps (%esi,%ecx,4),%xmm5 - movss 8(%esi,%eax,4),%xmm2 - movss 8(%esi,%ecx,4),%xmm6 - - movhps (%esi,%ebx,4),%xmm4 - movhps (%esi,%edx,4),%xmm5 - - movss 8(%esi,%ebx,4),%xmm0 - movss 8(%esi,%edx,4),%xmm1 - - shufps $0,%xmm0,%xmm2 - shufps $0,%xmm1,%xmm6 - - movaps %xmm4,%xmm0 - movaps %xmm4,%xmm1 - - shufps $136,%xmm6,%xmm2 ## constant 10001000 - - shufps $136,%xmm5,%xmm0 ## constant 10001000 - shufps $221,%xmm5,%xmm1 ## constant 11011101 - - ## move ix-iz to xmm4-xmm6 - movaps nb410_ix(%esp),%xmm4 - movaps nb410_iy(%esp),%xmm5 - movaps nb410_iz(%esp),%xmm6 - - ## calc dr - subps %xmm0,%xmm4 - subps %xmm1,%xmm5 - subps %xmm2,%xmm6 - - ## store dr - movaps %xmm4,nb410_dx(%esp) - movaps %xmm5,nb410_dy(%esp) - movaps %xmm6,nb410_dz(%esp) - ## square it - mulps %xmm4,%xmm4 - mulps %xmm5,%xmm5 - mulps %xmm6,%xmm6 - addps %xmm5,%xmm4 - addps %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb410_three(%esp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb410_half(%esp),%xmm0 - subps %xmm5,%xmm1 ## constant 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - movaps %xmm4,nb410_r(%esp) - mulps nb410_gbscale(%esp),%xmm4 - - movhlps %xmm4,%xmm5 - cvttps2pi %xmm4,%mm6 - cvttps2pi %xmm5,%mm7 ## mm6/mm7 contain lu indices - cvtpi2ps %mm6,%xmm6 - cvtpi2ps %mm7,%xmm5 - movlhps %xmm5,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - pslld $2,%mm6 - pslld $2,%mm7 - - movd %eax,%mm0 - movd %ebx,%mm1 - movd %ecx,%mm2 - movd %edx,%mm3 - - movl nb410_GBtab(%ebp),%esi - movd %mm6,%eax - psrlq $32,%mm6 - movd %mm7,%ecx - psrlq $32,%mm7 - movd %mm6,%ebx - movd %mm7,%edx - - ## load coulomb table - movaps (%esi,%eax,4),%xmm4 - movaps (%esi,%ebx,4),%xmm5 - movaps (%esi,%ecx,4),%xmm6 - movaps (%esi,%edx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm6,%xmm3 - shufps $0xEE,%xmm7,%xmm3 - shufps $0x44,%xmm7,%xmm6 - movaps %xmm4,%xmm7 - shufps $0xEE,%xmm5,%xmm7 - shufps $0x44,%xmm5,%xmm4 - movaps %xmm4,%xmm5 - shufps $0xDD,%xmm6,%xmm5 - shufps $0x88,%xmm6,%xmm4 - movaps %xmm7,%xmm6 - shufps $0x88,%xmm3,%xmm6 - shufps $0xDD,%xmm3,%xmm7 - ## coulomb table ready, in xmm4-xmm7 - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - mulps nb410_two(%esp),%xmm7 ## two*Heps2 - movaps nb410_qq(%esp),%xmm3 - addps %xmm6,%xmm7 - addps %xmm5,%xmm7 ## xmm7=FF - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps %xmm3,%xmm5 ## vcoul=qq*VV - mulps %xmm7,%xmm3 ## fijC=FF*qq - ## get jnr from stack - movl nb410_jnra(%esp),%eax - movl nb410_jnrb(%esp),%ebx - movl nb410_jnrc(%esp),%ecx - movl nb410_jnrd(%esp),%edx - - movl nb410_dvda(%ebp),%esi - - ## Calculate dVda - xorps %xmm7,%xmm7 - mulps nb410_gbscale(%esp),%xmm3 - movaps %xmm3,%xmm6 - mulps nb410_r(%esp),%xmm6 - addps %xmm5,%xmm6 - addps nb410_vctot(%esp),%xmm5 - movaps %xmm5,nb410_vctot(%esp) - - ## xmm6=(vcoul+fijC*r) - subps %xmm6,%xmm7 - movaps %xmm7,%xmm6 - - ## update dvdasum - addps nb410_dvdasum(%esp),%xmm7 - movaps %xmm7,nb410_dvdasum(%esp) - - ## update j atoms dvdaj - movhlps %xmm6,%xmm7 - movaps %xmm6,%xmm5 - movaps %xmm7,%xmm4 - shufps $0x1,%xmm5,%xmm5 - shufps $0x1,%xmm4,%xmm4 - ## xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4 - addss (%esi,%eax,4),%xmm6 - addss (%esi,%ebx,4),%xmm5 - addss (%esi,%ecx,4),%xmm7 - addss (%esi,%edx,4),%xmm4 - movss %xmm6,(%esi,%eax,4) - movss %xmm5,(%esi,%ebx,4) - movss %xmm7,(%esi,%ecx,4) - movss %xmm4,(%esi,%edx,4) - - ## L-J - movaps %xmm0,%xmm4 - mulps %xmm0,%xmm4 ## xmm4=rinvsq - - movaps %xmm4,%xmm6 - mulps %xmm4,%xmm6 - - mulps %xmm4,%xmm6 ## xmm6=rinvsix - movaps %xmm6,%xmm4 - mulps %xmm4,%xmm4 ## xmm4=rinvtwelve - mulps nb410_c6(%esp),%xmm6 - mulps nb410_c12(%esp),%xmm4 - movaps nb410_Vvdwtot(%esp),%xmm7 - addps %xmm4,%xmm7 - mulps nb410_twelve(%esp),%xmm4 - subps %xmm6,%xmm7 - mulps nb410_six(%esp),%xmm6 - movaps %xmm7,nb410_Vvdwtot(%esp) - subps %xmm6,%xmm4 - mulps %xmm0,%xmm4 - subps %xmm3,%xmm4 - mulps %xmm0,%xmm4 - - movaps nb410_dx(%esp),%xmm0 - movaps nb410_dy(%esp),%xmm1 - movaps nb410_dz(%esp),%xmm2 - - movd %mm0,%eax - movd %mm1,%ebx - movd %mm2,%ecx - movd %mm3,%edx - - movl nb410_faction(%ebp),%edi - mulps %xmm4,%xmm0 - mulps %xmm4,%xmm1 - mulps %xmm4,%xmm2 - ## xmm0-xmm2 contains tx-tz (partial force) - ## now update f_i - movaps nb410_fix(%esp),%xmm3 - movaps nb410_fiy(%esp),%xmm4 - movaps nb410_fiz(%esp),%xmm5 - addps %xmm0,%xmm3 - addps %xmm1,%xmm4 - addps %xmm2,%xmm5 - movaps %xmm3,nb410_fix(%esp) - movaps %xmm4,nb410_fiy(%esp) - movaps %xmm5,nb410_fiz(%esp) - ## the fj's - start by accumulating x & y forces from memory - movlps (%edi,%eax,4),%xmm4 - movlps (%edi,%ecx,4),%xmm6 - movhps (%edi,%ebx,4),%xmm4 - movhps (%edi,%edx,4),%xmm6 - - movaps %xmm4,%xmm3 - shufps $136,%xmm6,%xmm3 ## constant 10001000 - shufps $221,%xmm6,%xmm4 ## constant 11011101 - - ## now xmm3-xmm5 contains fjx, fjy, fjz - subps %xmm0,%xmm3 - subps %xmm1,%xmm4 - - ## unpack them back so we can store them - first x & y in xmm3/xmm4 - - movaps %xmm3,%xmm6 - unpcklps %xmm4,%xmm6 - unpckhps %xmm4,%xmm3 - ## xmm6(l)=x & y for j1, (h) for j2 - ## xmm3(l)=x & y for j3, (h) for j4 - movlps %xmm6,(%edi,%eax,4) - movlps %xmm3,(%edi,%ecx,4) - - movhps %xmm6,(%edi,%ebx,4) - movhps %xmm3,(%edi,%edx,4) - - ## and the z forces - movss 8(%edi,%eax,4),%xmm4 - movss 8(%edi,%ebx,4),%xmm5 - movss 8(%edi,%ecx,4),%xmm6 - movss 8(%edi,%edx,4),%xmm7 - subss %xmm2,%xmm4 - shufps $229,%xmm2,%xmm2 ## constant 11100101 - subss %xmm2,%xmm5 - shufps $234,%xmm2,%xmm2 ## constant 11101010 - subss %xmm2,%xmm6 - shufps $255,%xmm2,%xmm2 ## constant 11111111 - subss %xmm2,%xmm7 - movss %xmm4,8(%edi,%eax,4) - movss %xmm5,8(%edi,%ebx,4) - movss %xmm6,8(%edi,%ecx,4) - movss %xmm7,8(%edi,%edx,4) - - ## should we do one more iteration? - subl $4,nb410_innerk(%esp) - jl _nb_kernel410_ia32_sse.nb410_finish_inner - jmp _nb_kernel410_ia32_sse.nb410_unroll_loop -_nb_kernel410_ia32_sse.nb410_finish_inner: - ## check if at least two particles remain - addl $4,nb410_innerk(%esp) - movl nb410_innerk(%esp),%edx - andl $2,%edx - jnz _nb_kernel410_ia32_sse.nb410_dopair - jmp _nb_kernel410_ia32_sse.nb410_checksingle -_nb_kernel410_ia32_sse.nb410_dopair: - movl nb410_innerjjnr(%esp),%ecx - movl (%ecx),%eax - movl 4(%ecx),%ebx - addl $8,nb410_innerjjnr(%esp) - - xorps %xmm2,%xmm2 - movaps %xmm2,%xmm6 - - ## load isaj - movl nb410_invsqrta(%ebp),%esi - movss (%esi,%eax,4),%xmm2 - movss (%esi,%ebx,4),%xmm3 - unpcklps %xmm3,%xmm2 ## isaj in xmm2(0,1) - mulps nb410_isai(%esp),%xmm2 - movaps %xmm2,nb410_isaprod(%esp) - movaps %xmm2,%xmm1 - mulps nb410_gbtsc(%esp),%xmm1 - movaps %xmm1,nb410_gbscale(%esp) - - movl nb410_charge(%ebp),%esi ## base of charge[] - movss (%esi,%eax,4),%xmm3 - movss (%esi,%ebx,4),%xmm6 - unpcklps %xmm6,%xmm3 ## constant 00001000 ;# xmm3(0,1) has the charges - - mulps nb410_iq(%esp),%xmm2 - mulps %xmm2,%xmm3 - movaps %xmm3,nb410_qq(%esp) - - movl nb410_type(%ebp),%esi - movl %eax,%ecx - movl %ebx,%edx - movl (%esi,%ecx,4),%ecx - movl (%esi,%edx,4),%edx - movl nb410_vdwparam(%ebp),%esi - shll %ecx - shll %edx - movl nb410_ntia(%esp),%edi - addl %edi,%ecx - addl %edi,%edx - movlps (%esi,%ecx,4),%xmm6 - movhps (%esi,%edx,4),%xmm6 - movl nb410_pos(%ebp),%edi - - movaps %xmm6,%xmm4 - shufps $8,%xmm4,%xmm4 ## constant 00001000 - shufps $13,%xmm6,%xmm6 ## constant 00001101 - movlhps %xmm7,%xmm4 - movlhps %xmm7,%xmm6 - - movaps %xmm4,nb410_c6(%esp) - movaps %xmm6,nb410_c12(%esp) - - movd %eax,%mm0 - movd %ebx,%mm1 - - leal (%eax,%eax,2),%eax - leal (%ebx,%ebx,2),%ebx - ## move coordinates to xmm0-xmm2 - movlps (%edi,%eax,4),%xmm1 - movss 8(%edi,%eax,4),%xmm2 - movhps (%edi,%ebx,4),%xmm1 - movss 8(%edi,%ebx,4),%xmm0 - - movlhps %xmm7,%xmm3 - - shufps $0,%xmm0,%xmm2 - - movaps %xmm1,%xmm0 - - shufps $136,%xmm2,%xmm2 ## constant 10001000 - - shufps $136,%xmm0,%xmm0 ## constant 10001000 - shufps $221,%xmm1,%xmm1 ## constant 11011101 - - movl nb410_faction(%ebp),%edi - ## move ix-iz to xmm4-xmm6 - xorps %xmm7,%xmm7 - - movaps nb410_ix(%esp),%xmm4 - movaps nb410_iy(%esp),%xmm5 - movaps nb410_iz(%esp),%xmm6 - - ## calc dr - subps %xmm0,%xmm4 - subps %xmm1,%xmm5 - subps %xmm2,%xmm6 - - ## store dr - movaps %xmm4,nb410_dx(%esp) - movaps %xmm5,nb410_dy(%esp) - movaps %xmm6,nb410_dz(%esp) - ## square it - mulps %xmm4,%xmm4 - mulps %xmm5,%xmm5 - mulps %xmm6,%xmm6 - addps %xmm5,%xmm4 - addps %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb410_three(%esp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb410_half(%esp),%xmm0 - subps %xmm5,%xmm1 ## constant 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - movaps %xmm4,nb410_r(%esp) - mulps nb410_gbscale(%esp),%xmm4 - - cvttps2pi %xmm4,%mm6 ## mm6 contain lu indices - cvtpi2ps %mm6,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - - pslld $2,%mm6 - - movl nb410_GBtab(%ebp),%esi - movd %mm6,%ecx - psrlq $32,%mm6 - movd %mm6,%edx - - ## load coulomb table - movaps (%esi,%ecx,4),%xmm4 - movaps (%esi,%edx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm4,%xmm6 - unpcklps %xmm7,%xmm4 ## Y1 Y2 F1 F2 - unpckhps %xmm7,%xmm6 ## G1 G2 H1 H2 - movhlps %xmm4,%xmm5 ## F1 F2 - movhlps %xmm6,%xmm7 ## H1 H2 - ## coulomb table ready, in xmm4-xmm7 - - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - mulps nb410_two(%esp),%xmm7 ## two*Heps2 - movaps nb410_qq(%esp),%xmm3 - addps %xmm6,%xmm7 - addps %xmm5,%xmm7 ## xmm7=FF - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps %xmm3,%xmm5 ## vcoul=qq*VV - mulps %xmm7,%xmm3 ## fijC=FF*qq - ## get jnr from regs - movd %mm0,%ecx - movd %mm1,%edx - - movl nb410_dvda(%ebp),%esi - ## Calculate dVda - xorps %xmm7,%xmm7 - mulps nb410_gbscale(%esp),%xmm3 - movaps %xmm3,%xmm6 - mulps nb410_r(%esp),%xmm6 - addps %xmm5,%xmm6 - addps nb410_vctot(%esp),%xmm5 - movaps %xmm5,nb410_vctot(%esp) - - ## xmm6=(vcoul+fijC*r) - subps %xmm6,%xmm7 - movaps %xmm7,%xmm6 - - ## update dvdasum - addps nb410_dvdasum(%esp),%xmm7 - movaps %xmm7,nb410_dvdasum(%esp) - - ## update j atoms dvdaj - movaps %xmm6,%xmm7 - shufps $0x1,%xmm7,%xmm7 - addss (%esi,%ecx,4),%xmm6 - addss (%esi,%edx,4),%xmm7 - movss %xmm6,(%esi,%ecx,4) - movss %xmm7,(%esi,%edx,4) - - ## L-J - movaps %xmm0,%xmm4 - mulps %xmm0,%xmm4 ## xmm4=rinvsq - - ## at this point mm5 contains vcoul and mm3 fijC - ## increment vcoul - then we can get rid of mm5 - ## update vctot - - movaps %xmm4,%xmm6 - mulps %xmm4,%xmm6 - - mulps %xmm4,%xmm6 ## xmm6=rinvsix - movaps %xmm6,%xmm4 - mulps %xmm4,%xmm4 ## xmm4=rinvtwelve - mulps nb410_c6(%esp),%xmm6 - mulps nb410_c12(%esp),%xmm4 - movaps nb410_Vvdwtot(%esp),%xmm7 - addps %xmm4,%xmm7 - mulps nb410_twelve(%esp),%xmm4 - subps %xmm6,%xmm7 - mulps nb410_six(%esp),%xmm6 - movaps %xmm7,nb410_Vvdwtot(%esp) - subps %xmm6,%xmm4 - mulps %xmm0,%xmm4 - subps %xmm3,%xmm4 - mulps %xmm0,%xmm4 - - movaps nb410_dx(%esp),%xmm0 - movaps nb410_dy(%esp),%xmm1 - movaps nb410_dz(%esp),%xmm2 - - mulps %xmm4,%xmm0 - mulps %xmm4,%xmm1 - mulps %xmm4,%xmm2 - ## xmm0-xmm2 contains tx-tz (partial force) - ## now update f_i - movaps nb410_fix(%esp),%xmm3 - movaps nb410_fiy(%esp),%xmm4 - movaps nb410_fiz(%esp),%xmm5 - addps %xmm0,%xmm3 - addps %xmm1,%xmm4 - addps %xmm2,%xmm5 - movaps %xmm3,nb410_fix(%esp) - movaps %xmm4,nb410_fiy(%esp) - movaps %xmm5,nb410_fiz(%esp) - ## update the fj's - movss (%edi,%eax,4),%xmm3 - movss 4(%edi,%eax,4),%xmm4 - movss 8(%edi,%eax,4),%xmm5 - subss %xmm0,%xmm3 - subss %xmm1,%xmm4 - subss %xmm2,%xmm5 - movss %xmm3,(%edi,%eax,4) - movss %xmm4,4(%edi,%eax,4) - movss %xmm5,8(%edi,%eax,4) - - shufps $225,%xmm0,%xmm0 ## constant 11100001 - shufps $225,%xmm1,%xmm1 ## constant 11100001 - shufps $225,%xmm2,%xmm2 ## constant 11100001 - - movss (%edi,%ebx,4),%xmm3 - movss 4(%edi,%ebx,4),%xmm4 - movss 8(%edi,%ebx,4),%xmm5 - subss %xmm0,%xmm3 - subss %xmm1,%xmm4 - subss %xmm2,%xmm5 - movss %xmm3,(%edi,%ebx,4) - movss %xmm4,4(%edi,%ebx,4) - movss %xmm5,8(%edi,%ebx,4) - -_nb_kernel410_ia32_sse.nb410_checksingle: - movl nb410_innerk(%esp),%edx - andl $1,%edx - jnz _nb_kernel410_ia32_sse.nb410_dosingle - jmp _nb_kernel410_ia32_sse.nb410_updateouterdata -_nb_kernel410_ia32_sse.nb410_dosingle: - movl nb410_charge(%ebp),%esi - movl nb410_invsqrta(%ebp),%edx - movl nb410_pos(%ebp),%edi - movl nb410_innerjjnr(%esp),%ecx - movl (%ecx),%eax - xorps %xmm2,%xmm2 - movaps %xmm2,%xmm6 - movss (%edx,%eax,4),%xmm2 ## isaj - mulss nb410_isai(%esp),%xmm2 - movss %xmm2,nb410_isaprod(%esp) - movss %xmm2,%xmm1 - mulss nb410_gbtsc(%esp),%xmm1 - movss %xmm1,nb410_gbscale(%esp) - - mulss nb410_iq(%esp),%xmm2 - movss (%esi,%eax,4),%xmm6 ## xmm6(0) has the charge - mulss %xmm2,%xmm6 - movss %xmm6,nb410_qq(%esp) - - movl nb410_type(%ebp),%esi - movl %eax,%ecx - movl (%esi,%ecx,4),%ecx - movl nb410_vdwparam(%ebp),%esi - shll %ecx - addl nb410_ntia(%esp),%ecx - movlps (%esi,%ecx,4),%xmm6 - movaps %xmm6,%xmm4 - shufps $252,%xmm4,%xmm4 ## constant 11111100 - shufps $253,%xmm6,%xmm6 ## constant 11111101 - - movaps %xmm4,nb410_c6(%esp) - movaps %xmm6,nb410_c12(%esp) - - movd %eax,%mm0 - leal (%eax,%eax,2),%eax - - ## move coordinates to xmm0-xmm2 - movss (%edi,%eax,4),%xmm0 - movss 4(%edi,%eax,4),%xmm1 - movss 8(%edi,%eax,4),%xmm2 - - movaps nb410_ix(%esp),%xmm4 - movaps nb410_iy(%esp),%xmm5 - movaps nb410_iz(%esp),%xmm6 - - ## calc dr - subss %xmm0,%xmm4 - subss %xmm1,%xmm5 - subss %xmm2,%xmm6 - - ## store dr - movss %xmm4,nb410_dx(%esp) - movss %xmm5,nb410_dy(%esp) - movss %xmm6,nb410_dz(%esp) - ## square it - mulss %xmm4,%xmm4 - mulss %xmm5,%xmm5 - mulss %xmm6,%xmm6 - addss %xmm5,%xmm4 - addss %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtss %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulss %xmm5,%xmm5 - movss nb410_three(%esp),%xmm1 - mulss %xmm4,%xmm5 ## rsq*lu*lu - movss nb410_half(%esp),%xmm0 - subss %xmm5,%xmm1 ## constant 30-rsq*lu*lu - mulss %xmm2,%xmm1 - mulss %xmm1,%xmm0 ## xmm0=rinv - - mulss %xmm0,%xmm4 ## xmm4=r - movss %xmm4,nb410_r(%esp) - mulss nb410_gbscale(%esp),%xmm4 - - cvttss2si %xmm4,%ebx ## mm6 contain lu indices - cvtsi2ss %ebx,%xmm6 - subss %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulss %xmm2,%xmm2 ## xmm2=eps2 - - shll $2,%ebx - movl nb410_GBtab(%ebp),%esi - - movaps (%esi,%ebx,4),%xmm4 - movhlps %xmm4,%xmm6 - movaps %xmm4,%xmm5 - movaps %xmm6,%xmm7 - shufps $1,%xmm5,%xmm5 - shufps $1,%xmm7,%xmm7 - ## table ready in xmm4-xmm7 - - mulss %xmm1,%xmm6 ## xmm6=Geps - mulss %xmm2,%xmm7 ## xmm7=Heps2 - addss %xmm6,%xmm5 - addss %xmm7,%xmm5 ## xmm5=Fp - mulss nb410_two(%esp),%xmm7 ## two*Heps2 - movss nb410_qq(%esp),%xmm3 - addss %xmm6,%xmm7 - addss %xmm5,%xmm7 ## xmm7=FF - mulss %xmm1,%xmm5 ## xmm5=eps*Fp - addss %xmm4,%xmm5 ## xmm5=VV - mulss %xmm3,%xmm5 ## vcoul=qq*VV - mulss %xmm7,%xmm3 ## fijC=FF*qq - - movd %mm0,%ebx - movl nb410_dvda(%ebp),%esi - - ## Calculate dVda - xorps %xmm7,%xmm7 - mulss nb410_gbscale(%esp),%xmm3 - movaps %xmm3,%xmm6 - mulss nb410_r(%esp),%xmm6 - addss %xmm5,%xmm6 - addss nb410_vctot(%esp),%xmm5 - movss %xmm5,nb410_vctot(%esp) - - ## xmm6=(vcoul+fijC*r) - subps %xmm6,%xmm7 - movaps %xmm7,%xmm6 - - ## update dvdasum - addps nb410_dvdasum(%esp),%xmm7 - movaps %xmm7,nb410_dvdasum(%esp) - - ## update j atoms dvdaj - addss (%esi,%ebx,4),%xmm6 - movss %xmm6,(%esi,%ebx,4) - - ## L-J - movaps %xmm0,%xmm4 - mulss %xmm0,%xmm4 ## xmm4=rinvsq - - movaps %xmm4,%xmm6 - mulss %xmm4,%xmm6 - - mulss %xmm4,%xmm6 ## xmm6=rinvsix - movaps %xmm6,%xmm4 - mulss %xmm4,%xmm4 ## xmm4=rinvtwelve - mulss nb410_c6(%esp),%xmm6 - mulss nb410_c12(%esp),%xmm4 - movss nb410_Vvdwtot(%esp),%xmm7 - addss %xmm4,%xmm7 - mulss nb410_twelve(%esp),%xmm4 - subss %xmm6,%xmm7 - mulss nb410_six(%esp),%xmm6 - movss %xmm7,nb410_Vvdwtot(%esp) - subss %xmm6,%xmm4 - mulss %xmm0,%xmm4 - subss %xmm3,%xmm4 - mulss %xmm0,%xmm4 - - movss nb410_dx(%esp),%xmm0 - movss nb410_dy(%esp),%xmm1 - movss nb410_dz(%esp),%xmm2 - - movl nb410_faction(%ebp),%edi - mulss %xmm4,%xmm0 - mulss %xmm4,%xmm1 - mulss %xmm4,%xmm2 - ## xmm0-xmm2 contains tx-tz (partial force) - ## now update f_i - movss nb410_fix(%esp),%xmm3 - movss nb410_fiy(%esp),%xmm4 - movss nb410_fiz(%esp),%xmm5 - addss %xmm0,%xmm3 - addss %xmm1,%xmm4 - addss %xmm2,%xmm5 - movss %xmm3,nb410_fix(%esp) - movss %xmm4,nb410_fiy(%esp) - movss %xmm5,nb410_fiz(%esp) - ## update fj - - movss (%edi,%eax,4),%xmm3 - movss 4(%edi,%eax,4),%xmm4 - movss 8(%edi,%eax,4),%xmm5 - subss %xmm0,%xmm3 - subss %xmm1,%xmm4 - subss %xmm2,%xmm5 - movss %xmm3,(%edi,%eax,4) - movss %xmm4,4(%edi,%eax,4) - movss %xmm5,8(%edi,%eax,4) -_nb_kernel410_ia32_sse.nb410_updateouterdata: - movl nb410_ii3(%esp),%ecx - movl nb410_faction(%ebp),%edi - movl nb410_fshift(%ebp),%esi - movl nb410_is3(%esp),%edx - - ## accumulate i forces in xmm0, xmm1, xmm2 - movaps nb410_fix(%esp),%xmm0 - movaps nb410_fiy(%esp),%xmm1 - movaps nb410_fiz(%esp),%xmm2 - - movhlps %xmm0,%xmm3 - movhlps %xmm1,%xmm4 - movhlps %xmm2,%xmm5 - addps %xmm3,%xmm0 - addps %xmm4,%xmm1 - addps %xmm5,%xmm2 ## sum is in 1/2 in xmm0-xmm2 - - movaps %xmm0,%xmm3 - movaps %xmm1,%xmm4 - movaps %xmm2,%xmm5 - - shufps $1,%xmm3,%xmm3 - shufps $1,%xmm4,%xmm4 - shufps $1,%xmm5,%xmm5 - addss %xmm3,%xmm0 - addss %xmm4,%xmm1 - addss %xmm5,%xmm2 ## xmm0-xmm2 has single force in pos0 - - ## increment i force - movss (%edi,%ecx,4),%xmm3 - movss 4(%edi,%ecx,4),%xmm4 - movss 8(%edi,%ecx,4),%xmm5 - addss %xmm0,%xmm3 - addss %xmm1,%xmm4 - addss %xmm2,%xmm5 - movss %xmm3,(%edi,%ecx,4) - movss %xmm4,4(%edi,%ecx,4) - movss %xmm5,8(%edi,%ecx,4) - - ## increment fshift force - movss (%esi,%edx,4),%xmm3 - movss 4(%esi,%edx,4),%xmm4 - movss 8(%esi,%edx,4),%xmm5 - addss %xmm0,%xmm3 - addss %xmm1,%xmm4 - addss %xmm2,%xmm5 - movss %xmm3,(%esi,%edx,4) - movss %xmm4,4(%esi,%edx,4) - movss %xmm5,8(%esi,%edx,4) - - ## get n from stack - movl nb410_n(%esp),%esi - ## get group index for i particle - movl nb410_gid(%ebp),%edx ## base of gid[] - movl (%edx,%esi,4),%edx ## ggid=gid[n] - - ## accumulate total potential energy and update it - movaps nb410_vctot(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - ## add earlier value from mem - movl nb410_Vc(%ebp),%eax - addss (%eax,%edx,4),%xmm7 - ## move back to mem - movss %xmm7,(%eax,%edx,4) - - ## accumulate total lj energy and update it - movaps nb410_Vvdwtot(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - ## add earlier value from mem - movl nb410_Vvdw(%ebp),%eax - addss (%eax,%edx,4),%xmm7 - ## move back to mem - movss %xmm7,(%eax,%edx,4) - - ## accumulate dVda and update it - movaps nb410_dvdasum(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - movl nb410_ii(%esp),%edx - movl nb410_dvda(%ebp),%eax - addss (%eax,%edx,4),%xmm7 - movss %xmm7,(%eax,%edx,4) - - ## finish if last - movl nb410_nn1(%esp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel410_ia32_sse.nb410_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb410_n(%esp) - jmp _nb_kernel410_ia32_sse.nb410_outer -_nb_kernel410_ia32_sse.nb410_outerend: - ## check if more outer neighborlists remain - movl nb410_nri(%esp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel410_ia32_sse.nb410_end - ## non-zero, do one more workunit - jmp _nb_kernel410_ia32_sse.nb410_threadloop -_nb_kernel410_ia32_sse.nb410_end: - emms - - movl nb410_nouter(%esp),%eax - movl nb410_ninner(%esp),%ebx - movl nb410_outeriter(%ebp),%ecx - movl nb410_inneriter(%ebp),%edx - movl %eax,(%ecx) - movl %ebx,(%edx) - - movl nb410_salign(%esp),%eax - addl %eax,%esp - addl $504,%esp - popl %edi - popl %esi - popl %edx - popl %ecx - popl %ebx - popl %eax - leave - ret - - - -.globl nb_kernel410nf_ia32_sse -.globl _nb_kernel410nf_ia32_sse -nb_kernel410nf_ia32_sse: -_nb_kernel410nf_ia32_sse: -.set nb410nf_p_nri, 8 -.set nb410nf_iinr, 12 -.set nb410nf_jindex, 16 -.set nb410nf_jjnr, 20 -.set nb410nf_shift, 24 -.set nb410nf_shiftvec, 28 -.set nb410nf_fshift, 32 -.set nb410nf_gid, 36 -.set nb410nf_pos, 40 -.set nb410nf_faction, 44 -.set nb410nf_charge, 48 -.set nb410nf_p_facel, 52 -.set nb410nf_argkrf, 56 -.set nb410nf_argcrf, 60 -.set nb410nf_Vc, 64 -.set nb410nf_type, 68 -.set nb410nf_p_ntype, 72 -.set nb410nf_vdwparam, 76 -.set nb410nf_Vvdw, 80 -.set nb410nf_p_tabscale, 84 -.set nb410nf_VFtab, 88 -.set nb410nf_invsqrta, 92 -.set nb410nf_dvda, 96 -.set nb410nf_p_gbtabscale, 100 -.set nb410nf_GBtab, 104 -.set nb410nf_p_nthreads, 108 -.set nb410nf_count, 112 -.set nb410nf_mtx, 116 -.set nb410nf_outeriter, 120 -.set nb410nf_inneriter, 124 -.set nb410nf_work, 128 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse use -.set nb410nf_ix, 0 -.set nb410nf_iy, 16 -.set nb410nf_iz, 32 -.set nb410nf_iq, 48 -.set nb410nf_gbtsc, 64 -.set nb410nf_qq, 80 -.set nb410nf_c6, 96 -.set nb410nf_c12, 112 -.set nb410nf_vctot, 128 -.set nb410nf_Vvdwtot, 144 -.set nb410nf_half, 160 -.set nb410nf_three, 176 -.set nb410nf_isai, 192 -.set nb410nf_isaprod, 208 -.set nb410nf_gbscale, 224 -.set nb410nf_is3, 240 -.set nb410nf_ii3, 244 -.set nb410nf_ntia, 248 -.set nb410nf_innerjjnr, 252 -.set nb410nf_innerk, 256 -.set nb410nf_n, 260 -.set nb410nf_nn1, 264 -.set nb410nf_nri, 268 -.set nb410nf_facel, 272 -.set nb410nf_ntype, 276 -.set nb410nf_nouter, 280 -.set nb410nf_ninner, 284 -.set nb410nf_salign, 288 - pushl %ebp - movl %esp,%ebp - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - pushl %esi - pushl %edi - subl $292,%esp ## local stack space - movl %esp,%eax - andl $0xf,%eax - subl %eax,%esp - movl %eax,nb410nf_salign(%esp) - - emms - - ## Move args passed by reference to stack - movl nb410nf_p_nri(%ebp),%ecx - movl nb410nf_p_facel(%ebp),%esi - movl nb410nf_p_ntype(%ebp),%edi - movl (%ecx),%ecx - movl (%esi),%esi - movl (%edi),%edi - movl %ecx,nb410nf_nri(%esp) - movl %esi,nb410nf_facel(%esp) - movl %edi,nb410nf_ntype(%esp) - - ## zero iteration counters - movl $0,%eax - movl %eax,nb410nf_nouter(%esp) - movl %eax,nb410nf_ninner(%esp) - - - movl nb410nf_p_gbtabscale(%ebp),%eax - movss (%eax),%xmm5 - shufps $0,%xmm5,%xmm5 - movaps %xmm5,nb410nf_gbtsc(%esp) - - ## create constant floating-point factors on stack - movl $0x3f000000,%eax ## constant 0.5 in IEEE (hex) - movl %eax,nb410nf_half(%esp) - movss nb410nf_half(%esp),%xmm1 - shufps $0,%xmm1,%xmm1 ## splat to all elements - movaps %xmm1,%xmm2 - addps %xmm2,%xmm2 ## constant 1.0 - movaps %xmm2,%xmm3 - addps %xmm2,%xmm2 ## constant 2.0 - addps %xmm2,%xmm3 ## constant 3.0 - movaps %xmm1,nb410nf_half(%esp) - movaps %xmm3,nb410nf_three(%esp) - -_nb_kernel410nf_ia32_sse.nb410nf_threadloop: - movl nb410nf_count(%ebp),%esi ## pointer to sync counter - movl (%esi),%eax -_nb_kernel410nf_ia32_sse.nb410nf_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%esi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel410nf_ia32_sse.nb410nf_spinlock - - ## if(nn1>nri) nn1=nri - movl nb410nf_nri(%esp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb410nf_n(%esp) - movl %ebx,nb410nf_nn1(%esp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel410nf_ia32_sse.nb410nf_outerstart - jmp _nb_kernel410nf_ia32_sse.nb410nf_end - -_nb_kernel410nf_ia32_sse.nb410nf_outerstart: - ## ebx contains number of outer iterations - addl nb410nf_nouter(%esp),%ebx - movl %ebx,nb410nf_nouter(%esp) - -_nb_kernel410nf_ia32_sse.nb410nf_outer: - movl nb410nf_shift(%ebp),%eax ## eax = pointer into shift[] - movl (%eax,%esi,4),%ebx ## ebx=shift[n] - - leal (%ebx,%ebx,2),%ebx ## ebx=3*is - movl %ebx,nb410nf_is3(%esp) ## store is3 - - movl nb410nf_shiftvec(%ebp),%eax ## eax = base of shiftvec[] - - movss (%eax,%ebx,4),%xmm0 - movss 4(%eax,%ebx,4),%xmm1 - movss 8(%eax,%ebx,4),%xmm2 - - movl nb410nf_iinr(%ebp),%ecx ## ecx = pointer into iinr[] - movl (%ecx,%esi,4),%ebx ## ebx =ii - - movl nb410nf_charge(%ebp),%edx - movss (%edx,%ebx,4),%xmm3 - mulss nb410nf_facel(%esp),%xmm3 - shufps $0,%xmm3,%xmm3 - - movl nb410nf_invsqrta(%ebp),%edx ## load invsqrta[ii] - movss (%edx,%ebx,4),%xmm4 - shufps $0,%xmm4,%xmm4 - - movl nb410nf_type(%ebp),%edx - movl (%edx,%ebx,4),%edx - imull nb410nf_ntype(%esp),%edx - shll %edx - movl %edx,nb410nf_ntia(%esp) - - leal (%ebx,%ebx,2),%ebx ## ebx = 3*ii=ii3 - movl nb410nf_pos(%ebp),%eax ## eax = base of pos[] - - addss (%eax,%ebx,4),%xmm0 - addss 4(%eax,%ebx,4),%xmm1 - addss 8(%eax,%ebx,4),%xmm2 - - movaps %xmm3,nb410nf_iq(%esp) - movaps %xmm4,nb410nf_isai(%esp) - - shufps $0,%xmm0,%xmm0 - shufps $0,%xmm1,%xmm1 - shufps $0,%xmm2,%xmm2 - - movaps %xmm0,nb410nf_ix(%esp) - movaps %xmm1,nb410nf_iy(%esp) - movaps %xmm2,nb410nf_iz(%esp) - - movl %ebx,nb410nf_ii3(%esp) - - ## clear vctot - xorps %xmm4,%xmm4 - movaps %xmm4,nb410nf_vctot(%esp) - movaps %xmm4,nb410nf_Vvdwtot(%esp) - - movl nb410nf_jindex(%ebp),%eax - movl (%eax,%esi,4),%ecx ## jindex[n] - movl 4(%eax,%esi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movl nb410nf_pos(%ebp),%esi - movl nb410nf_faction(%ebp),%edi - movl nb410nf_jjnr(%ebp),%eax - shll $2,%ecx - addl %ecx,%eax - movl %eax,nb410nf_innerjjnr(%esp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $4,%edx - addl nb410nf_ninner(%esp),%ecx - movl %ecx,nb410nf_ninner(%esp) - addl $0,%edx - movl %edx,nb410nf_innerk(%esp) ## number of innerloop atoms - jge _nb_kernel410nf_ia32_sse.nb410nf_unroll_loop - jmp _nb_kernel410nf_ia32_sse.nb410nf_finish_inner -_nb_kernel410nf_ia32_sse.nb410nf_unroll_loop: - ## quad-unroll innerloop here - movl nb410nf_innerjjnr(%esp),%edx ## pointer to jjnr[k] - movl (%edx),%eax - movl 4(%edx),%ebx - movl 8(%edx),%ecx - movl 12(%edx),%edx ## eax-edx=jnr1-4 - addl $16,nb410nf_innerjjnr(%esp) ## advance pointer (unrolled 4) - - ## load isa2 - movl nb410nf_invsqrta(%ebp),%esi - movss (%esi,%eax,4),%xmm3 - movss (%esi,%ecx,4),%xmm4 - movss (%esi,%ebx,4),%xmm6 - movss (%esi,%edx,4),%xmm7 - movaps nb410nf_isai(%esp),%xmm2 - shufps $0,%xmm6,%xmm3 - shufps $0,%xmm7,%xmm4 - shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all charges in xmm3 - mulps %xmm3,%xmm2 - - movaps %xmm2,nb410nf_isaprod(%esp) - movaps %xmm2,%xmm1 - mulps nb410nf_gbtsc(%esp),%xmm1 - movaps %xmm1,nb410nf_gbscale(%esp) - - movl nb410nf_charge(%ebp),%esi ## base of charge[] - - movss (%esi,%eax,4),%xmm3 - movss (%esi,%ecx,4),%xmm4 - movss (%esi,%ebx,4),%xmm6 - movss (%esi,%edx,4),%xmm7 - - mulps nb410nf_iq(%esp),%xmm2 - shufps $0,%xmm6,%xmm3 - shufps $0,%xmm7,%xmm4 - shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all charges in xmm3 - mulps %xmm2,%xmm3 - movaps %xmm3,nb410nf_qq(%esp) - - movd %eax,%mm0 - movd %ebx,%mm1 - movd %ecx,%mm2 - movd %edx,%mm3 - - movl nb410nf_type(%ebp),%esi - movl (%esi,%eax,4),%eax - movl (%esi,%ebx,4),%ebx - movl (%esi,%ecx,4),%ecx - movl (%esi,%edx,4),%edx - movl nb410nf_vdwparam(%ebp),%esi - shll %eax - shll %ebx - shll %ecx - shll %edx - movl nb410nf_ntia(%esp),%edi - addl %edi,%eax - addl %edi,%ebx - addl %edi,%ecx - addl %edi,%edx - - movlps (%esi,%eax,4),%xmm6 - movlps (%esi,%ecx,4),%xmm7 - movhps (%esi,%ebx,4),%xmm6 - movhps (%esi,%edx,4),%xmm7 - - movaps %xmm6,%xmm4 - shufps $136,%xmm7,%xmm4 ## constant 10001000 - shufps $221,%xmm7,%xmm6 ## constant 11011101 - - movd %mm0,%eax - movd %mm1,%ebx - movd %mm2,%ecx - movd %mm3,%edx - - movaps %xmm4,nb410nf_c6(%esp) - movaps %xmm6,nb410nf_c12(%esp) - - movl nb410nf_pos(%ebp),%esi ## base of pos[] - - leal (%eax,%eax,2),%eax ## replace jnr with j3 - leal (%ebx,%ebx,2),%ebx - - leal (%ecx,%ecx,2),%ecx ## replace jnr with j3 - leal (%edx,%edx,2),%edx - - ## move four coordinates to xmm0-xmm2 - - movlps (%esi,%eax,4),%xmm4 - movlps (%esi,%ecx,4),%xmm5 - movss 8(%esi,%eax,4),%xmm2 - movss 8(%esi,%ecx,4),%xmm6 - - movhps (%esi,%ebx,4),%xmm4 - movhps (%esi,%edx,4),%xmm5 - - movss 8(%esi,%ebx,4),%xmm0 - movss 8(%esi,%edx,4),%xmm1 - - shufps $0,%xmm0,%xmm2 - shufps $0,%xmm1,%xmm6 - - movaps %xmm4,%xmm0 - movaps %xmm4,%xmm1 - - shufps $136,%xmm6,%xmm2 ## constant 10001000 - - shufps $136,%xmm5,%xmm0 ## constant 10001000 - shufps $221,%xmm5,%xmm1 ## constant 11011101 - - ## move ix-iz to xmm4-xmm6 - movaps nb410nf_ix(%esp),%xmm4 - movaps nb410nf_iy(%esp),%xmm5 - movaps nb410nf_iz(%esp),%xmm6 - - ## calc dr - subps %xmm0,%xmm4 - subps %xmm1,%xmm5 - subps %xmm2,%xmm6 - - ## square it - mulps %xmm4,%xmm4 - mulps %xmm5,%xmm5 - mulps %xmm6,%xmm6 - addps %xmm5,%xmm4 - addps %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb410nf_three(%esp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb410nf_half(%esp),%xmm0 - subps %xmm5,%xmm1 ## constant 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - mulps nb410nf_gbscale(%esp),%xmm4 - - movhlps %xmm4,%xmm5 - cvttps2pi %xmm4,%mm6 - cvttps2pi %xmm5,%mm7 ## mm6/mm7 contain lu indices - cvtpi2ps %mm6,%xmm6 - cvtpi2ps %mm7,%xmm5 - movlhps %xmm5,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - pslld $2,%mm6 - pslld $2,%mm7 - - movd %eax,%mm0 - movd %ebx,%mm1 - movd %ecx,%mm2 - movd %edx,%mm3 - - movl nb410nf_GBtab(%ebp),%esi - movd %mm6,%eax - psrlq $32,%mm6 - movd %mm7,%ecx - psrlq $32,%mm7 - movd %mm6,%ebx - movd %mm7,%edx - - ## load coulomb table - movaps (%esi,%eax,4),%xmm4 - movaps (%esi,%ebx,4),%xmm5 - movaps (%esi,%ecx,4),%xmm6 - movaps (%esi,%edx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm6,%xmm3 - shufps $0xEE,%xmm7,%xmm3 - shufps $0x44,%xmm7,%xmm6 - movaps %xmm4,%xmm7 - shufps $0xEE,%xmm5,%xmm7 - shufps $0x44,%xmm5,%xmm4 - movaps %xmm4,%xmm5 - shufps $0xDD,%xmm6,%xmm5 - shufps $0x88,%xmm6,%xmm4 - movaps %xmm7,%xmm6 - shufps $0x88,%xmm3,%xmm6 - shufps $0xDD,%xmm3,%xmm7 - ## coulomb table ready, in xmm4-xmm7 - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - movaps nb410nf_qq(%esp),%xmm3 - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps %xmm3,%xmm5 ## vcoul=qq*VV - ## update vctot - addps nb410nf_vctot(%esp),%xmm5 - movaps %xmm5,nb410nf_vctot(%esp) - - ## L-J - movaps %xmm0,%xmm4 - mulps %xmm0,%xmm4 ## xmm4=rinvsq - - movaps %xmm4,%xmm6 - mulps %xmm4,%xmm6 - - mulps %xmm4,%xmm6 ## xmm6=rinvsix - movaps %xmm6,%xmm4 - mulps %xmm4,%xmm4 ## xmm4=rinvtwelve - mulps nb410nf_c6(%esp),%xmm6 - mulps nb410nf_c12(%esp),%xmm4 - movaps nb410nf_Vvdwtot(%esp),%xmm7 - addps %xmm4,%xmm7 - subps %xmm6,%xmm7 - movaps %xmm7,nb410nf_Vvdwtot(%esp) - - ## should we do one more iteration? - subl $4,nb410nf_innerk(%esp) - jl _nb_kernel410nf_ia32_sse.nb410nf_finish_inner - jmp _nb_kernel410nf_ia32_sse.nb410nf_unroll_loop -_nb_kernel410nf_ia32_sse.nb410nf_finish_inner: - ## check if at least two particles remain - addl $4,nb410nf_innerk(%esp) - movl nb410nf_innerk(%esp),%edx - andl $2,%edx - jnz _nb_kernel410nf_ia32_sse.nb410nf_dopair - jmp _nb_kernel410nf_ia32_sse.nb410nf_checksingle -_nb_kernel410nf_ia32_sse.nb410nf_dopair: - movl nb410nf_innerjjnr(%esp),%ecx - movl (%ecx),%eax - movl 4(%ecx),%ebx - addl $8,nb410nf_innerjjnr(%esp) - - xorps %xmm2,%xmm2 - movaps %xmm2,%xmm6 - - ## load isa2 - movl nb410nf_invsqrta(%ebp),%esi - movss (%esi,%eax,4),%xmm2 - movss (%esi,%ebx,4),%xmm3 - unpcklps %xmm3,%xmm2 ## isa2 in xmm3(0,1) - mulps nb410nf_isai(%esp),%xmm2 - movaps %xmm2,nb410nf_isaprod(%esp) - movaps %xmm2,%xmm1 - mulps nb410nf_gbtsc(%esp),%xmm1 - movaps %xmm1,nb410nf_gbscale(%esp) - - movl nb410nf_charge(%ebp),%esi ## base of charge[] - movss (%esi,%eax,4),%xmm3 - movss (%esi,%ebx,4),%xmm6 - unpcklps %xmm6,%xmm3 ## constant 00001000 ;# xmm3(0,1) has the charges - - mulps nb410nf_iq(%esp),%xmm2 - mulps %xmm2,%xmm3 - movaps %xmm3,nb410nf_qq(%esp) - - movl nb410nf_type(%ebp),%esi - movl %eax,%ecx - movl %ebx,%edx - movl (%esi,%ecx,4),%ecx - movl (%esi,%edx,4),%edx - movl nb410nf_vdwparam(%ebp),%esi - shll %ecx - shll %edx - movl nb410nf_ntia(%esp),%edi - addl %edi,%ecx - addl %edi,%edx - movlps (%esi,%ecx,4),%xmm6 - movhps (%esi,%edx,4),%xmm6 - movl nb410nf_pos(%ebp),%edi - - movaps %xmm6,%xmm4 - shufps $8,%xmm4,%xmm4 ## constant 00001000 - shufps $13,%xmm6,%xmm6 ## constant 00001101 - movlhps %xmm7,%xmm4 - movlhps %xmm7,%xmm6 - - movaps %xmm4,nb410nf_c6(%esp) - movaps %xmm6,nb410nf_c12(%esp) - - leal (%eax,%eax,2),%eax - leal (%ebx,%ebx,2),%ebx - ## move coordinates to xmm0-xmm2 - movlps (%edi,%eax,4),%xmm1 - movss 8(%edi,%eax,4),%xmm2 - movhps (%edi,%ebx,4),%xmm1 - movss 8(%edi,%ebx,4),%xmm0 - - movlhps %xmm7,%xmm3 - - shufps $0,%xmm0,%xmm2 - - movaps %xmm1,%xmm0 - - shufps $136,%xmm2,%xmm2 ## constant 10001000 - - shufps $136,%xmm0,%xmm0 ## constant 10001000 - shufps $221,%xmm1,%xmm1 ## constant 11011101 - - movl nb410nf_faction(%ebp),%edi - ## move ix-iz to xmm4-xmm6 - xorps %xmm7,%xmm7 - - movaps nb410nf_ix(%esp),%xmm4 - movaps nb410nf_iy(%esp),%xmm5 - movaps nb410nf_iz(%esp),%xmm6 - - ## calc dr - subps %xmm0,%xmm4 - subps %xmm1,%xmm5 - subps %xmm2,%xmm6 - - ## square it - mulps %xmm4,%xmm4 - mulps %xmm5,%xmm5 - mulps %xmm6,%xmm6 - addps %xmm5,%xmm4 - addps %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb410nf_three(%esp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb410nf_half(%esp),%xmm0 - subps %xmm5,%xmm1 ## constant 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - mulps nb410nf_gbscale(%esp),%xmm4 - - cvttps2pi %xmm4,%mm6 ## mm6 contain lu indices - cvtpi2ps %mm6,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - - pslld $2,%mm6 - - movl nb410nf_GBtab(%ebp),%esi - movd %mm6,%ecx - psrlq $32,%mm6 - movd %mm6,%edx - - ## load coulomb table - movaps (%esi,%ecx,4),%xmm4 - movaps (%esi,%edx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm4,%xmm6 - unpcklps %xmm7,%xmm4 ## Y1 Y2 F1 F2 - unpckhps %xmm7,%xmm6 ## G1 G2 H1 H2 - movhlps %xmm4,%xmm5 ## F1 F2 - movhlps %xmm6,%xmm7 ## H1 H2 - ## coulomb table ready, in xmm4-xmm7 - - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - movaps nb410nf_qq(%esp),%xmm3 - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps %xmm3,%xmm5 ## vcoul=qq*VV - - addps nb410nf_vctot(%esp),%xmm5 - movaps %xmm5,nb410nf_vctot(%esp) - - ## L-J - movaps %xmm0,%xmm4 - mulps %xmm0,%xmm4 ## xmm4=rinvsq - - ## at this point mm5 contains vcoul and mm3 fijC - ## increment vcoul - then we can get rid of mm5 - ## update vctot - - movaps %xmm4,%xmm6 - mulps %xmm4,%xmm6 - - mulps %xmm4,%xmm6 ## xmm6=rinvsix - movaps %xmm6,%xmm4 - mulps %xmm4,%xmm4 ## xmm4=rinvtwelve - mulps nb410nf_c6(%esp),%xmm6 - mulps nb410nf_c12(%esp),%xmm4 - movaps nb410nf_Vvdwtot(%esp),%xmm7 - addps %xmm4,%xmm7 - subps %xmm6,%xmm7 - movaps %xmm7,nb410nf_Vvdwtot(%esp) - -_nb_kernel410nf_ia32_sse.nb410nf_checksingle: - movl nb410nf_innerk(%esp),%edx - andl $1,%edx - jnz _nb_kernel410nf_ia32_sse.nb410nf_dosingle - jmp _nb_kernel410nf_ia32_sse.nb410nf_updateouterdata -_nb_kernel410nf_ia32_sse.nb410nf_dosingle: - movl nb410nf_charge(%ebp),%esi - movl nb410nf_invsqrta(%ebp),%edx - movl nb410nf_pos(%ebp),%edi - movl nb410nf_innerjjnr(%esp),%ecx - movl (%ecx),%eax - xorps %xmm2,%xmm2 - movaps %xmm2,%xmm6 - movss (%edx,%eax,4),%xmm2 ## isa2 - mulss nb410nf_isai(%esp),%xmm2 - movss %xmm2,nb410nf_isaprod(%esp) - movss %xmm2,%xmm1 - mulss nb410nf_gbtsc(%esp),%xmm1 - movss %xmm1,nb410nf_gbscale(%esp) - - mulss nb410nf_iq(%esp),%xmm2 - movss (%esi,%eax,4),%xmm6 ## xmm6(0) has the charge - mulss %xmm2,%xmm6 - movss %xmm6,nb410nf_qq(%esp) - - movl nb410nf_type(%ebp),%esi - movl %eax,%ecx - movl (%esi,%ecx,4),%ecx - movl nb410nf_vdwparam(%ebp),%esi - shll %ecx - addl nb410nf_ntia(%esp),%ecx - movlps (%esi,%ecx,4),%xmm6 - movaps %xmm6,%xmm4 - shufps $252,%xmm4,%xmm4 ## constant 11111100 - shufps $253,%xmm6,%xmm6 ## constant 11111101 - - movaps %xmm4,nb410nf_c6(%esp) - movaps %xmm6,nb410nf_c12(%esp) - - leal (%eax,%eax,2),%eax - - ## move coordinates to xmm0-xmm2 - movss (%edi,%eax,4),%xmm0 - movss 4(%edi,%eax,4),%xmm1 - movss 8(%edi,%eax,4),%xmm2 - - movaps nb410nf_ix(%esp),%xmm4 - movaps nb410nf_iy(%esp),%xmm5 - movaps nb410nf_iz(%esp),%xmm6 - - ## calc dr - subss %xmm0,%xmm4 - subss %xmm1,%xmm5 - subss %xmm2,%xmm6 - - ## square it - mulss %xmm4,%xmm4 - mulss %xmm5,%xmm5 - mulss %xmm6,%xmm6 - addss %xmm5,%xmm4 - addss %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtss %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulss %xmm5,%xmm5 - movss nb410nf_three(%esp),%xmm1 - mulss %xmm4,%xmm5 ## rsq*lu*lu - movss nb410nf_half(%esp),%xmm0 - subss %xmm5,%xmm1 ## constant 30-rsq*lu*lu - mulss %xmm2,%xmm1 - mulss %xmm1,%xmm0 ## xmm0=rinv - - mulss %xmm0,%xmm4 ## xmm4=r - mulss nb410nf_gbscale(%esp),%xmm4 - - cvttss2si %xmm4,%ebx ## mm6 contain lu indices - cvtsi2ss %ebx,%xmm6 - subss %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulss %xmm2,%xmm2 ## xmm2=eps2 - - shll $2,%ebx - movl nb410nf_GBtab(%ebp),%esi - - movaps (%esi,%ebx,4),%xmm4 - movhlps %xmm4,%xmm6 - movaps %xmm4,%xmm5 - movaps %xmm6,%xmm7 - shufps $1,%xmm5,%xmm5 - shufps $1,%xmm7,%xmm7 - ## table ready in xmm4-xmm7 - - mulss %xmm1,%xmm6 ## xmm6=Geps - mulss %xmm2,%xmm7 ## xmm7=Heps2 - addss %xmm6,%xmm5 - addss %xmm7,%xmm5 ## xmm5=Fp - movss nb410nf_qq(%esp),%xmm3 - mulss %xmm1,%xmm5 ## xmm5=eps*Fp - addss %xmm4,%xmm5 ## xmm5=VV - mulss %xmm3,%xmm5 ## vcoul=qq*VV - addss nb410nf_vctot(%esp),%xmm5 - movss %xmm5,nb410nf_vctot(%esp) - - ## L-J - movaps %xmm0,%xmm4 - mulss %xmm0,%xmm4 ## xmm4=rinvsq - - movaps %xmm4,%xmm6 - mulss %xmm4,%xmm6 - - mulss %xmm4,%xmm6 ## xmm6=rinvsix - movaps %xmm6,%xmm4 - mulss %xmm4,%xmm4 ## xmm4=rinvtwelve - mulss nb410nf_c6(%esp),%xmm6 - mulss nb410nf_c12(%esp),%xmm4 - movss nb410nf_Vvdwtot(%esp),%xmm7 - addps %xmm4,%xmm7 - subps %xmm6,%xmm7 - movss %xmm7,nb410nf_Vvdwtot(%esp) - -_nb_kernel410nf_ia32_sse.nb410nf_updateouterdata: - ## get n from stack - movl nb410nf_n(%esp),%esi - ## get group index for i particle - movl nb410nf_gid(%ebp),%edx ## base of gid[] - movl (%edx,%esi,4),%edx ## ggid=gid[n] - - ## accumulate total potential energy and update it - movaps nb410nf_vctot(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - ## add earlier value from mem - movl nb410nf_Vc(%ebp),%eax - addss (%eax,%edx,4),%xmm7 - ## move back to mem - movss %xmm7,(%eax,%edx,4) - - ## accumulate total lj energy and update it - movaps nb410nf_Vvdwtot(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - ## add earlier value from mem - movl nb410nf_Vvdw(%ebp),%eax - addss (%eax,%edx,4),%xmm7 - ## move back to mem - movss %xmm7,(%eax,%edx,4) - - ## finish if last - movl nb410nf_nn1(%esp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel410nf_ia32_sse.nb410nf_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb410nf_n(%esp) - jmp _nb_kernel410nf_ia32_sse.nb410nf_outer -_nb_kernel410nf_ia32_sse.nb410nf_outerend: - ## check if more outer neighborlists remain - movl nb410nf_nri(%esp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel410nf_ia32_sse.nb410nf_end - ## non-zero, do one more workunit - jmp _nb_kernel410nf_ia32_sse.nb410nf_threadloop -_nb_kernel410nf_ia32_sse.nb410nf_end: - emms - - movl nb410nf_nouter(%esp),%eax - movl nb410nf_ninner(%esp),%ebx - movl nb410nf_outeriter(%ebp),%ecx - movl nb410nf_inneriter(%ebp),%edx - movl %eax,(%ecx) - movl %ebx,(%edx) - - movl nb410nf_salign(%esp),%eax - addl %eax,%esp - addl $292,%esp - popl %edi - popl %esi - popl %edx - popl %ecx - popl %ebx - popl %eax - leave - ret - diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel430_ia32_sse.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel430_ia32_sse.intel_syntax.s deleted file mode 100644 index 9fefa0256f..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel430_ia32_sse.intel_syntax.s +++ /dev/null @@ -1,2409 +0,0 @@ -;# -;# -;# Gromacs 4.0 Copyright (c) 1991-2003 -;# David van der Spoel, Erik Lindahl -;# -;# This program is free software; you can redistribute it and/or -;# modify it under the terms of the GNU General Public License -;# as published by the Free Software Foundation; either version 2 -;# of the License, or (at your option) any later version. -;# -;# To help us fund GROMACS development, we humbly ask that you cite -;# the research papers on the package. Check out http://www.gromacs.org -;# -;# And Hey: -;# Gnomes, ROck Monsters And Chili Sauce -;# - -;# These files require GNU binutils 2.10 or later, since we -;# use intel syntax for portability, or a recent version -;# of NASM that understands Extended 3DNow and SSE2 instructions. -;# (NASM is normally only used with MS Visual C++). -;# Since NASM and gnu as disagree on some definitions and use -;# completely different preprocessing options I have to introduce a -;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86. -;# Gnu as treats ';' as a line break, i.e. ignores it. This is the -;# reason why all comments need both symbols... -;# The source is written for GNU as, with intel syntax. When you use -;# NASM we redefine a couple of things. The false if-statement around -;# the following code is seen by GNU as, but NASM doesn't see it, so -;# the code inside is read by NASM but not gcc. - -; .if 0 # block below only read by NASM -%define .section section -%define .long dd -%define .align align -%define .globl global -;# NASM only wants 'dword', not 'dword ptr'. -%define ptr -%macro .equiv 2 - %1 equ %2 -%endmacro -; .endif # End of NASM-specific block -; .intel_syntax noprefix # Line only read by gnu as - - - - - -.globl nb_kernel430_ia32_sse -.globl _nb_kernel430_ia32_sse -nb_kernel430_ia32_sse: -_nb_kernel430_ia32_sse: -.equiv nb430_p_nri, 8 -.equiv nb430_iinr, 12 -.equiv nb430_jindex, 16 -.equiv nb430_jjnr, 20 -.equiv nb430_shift, 24 -.equiv nb430_shiftvec, 28 -.equiv nb430_fshift, 32 -.equiv nb430_gid, 36 -.equiv nb430_pos, 40 -.equiv nb430_faction, 44 -.equiv nb430_charge, 48 -.equiv nb430_p_facel, 52 -.equiv nb430_argkrf, 56 -.equiv nb430_argcrf, 60 -.equiv nb430_Vc, 64 -.equiv nb430_type, 68 -.equiv nb430_p_ntype, 72 -.equiv nb430_vdwparam, 76 -.equiv nb430_Vvdw, 80 -.equiv nb430_p_tabscale, 84 -.equiv nb430_VFtab, 88 -.equiv nb430_invsqrta, 92 -.equiv nb430_dvda, 96 -.equiv nb430_p_gbtabscale, 100 -.equiv nb430_GBtab, 104 -.equiv nb430_p_nthreads, 108 -.equiv nb430_count, 112 -.equiv nb430_mtx, 116 -.equiv nb430_outeriter, 120 -.equiv nb430_inneriter, 124 -.equiv nb430_work, 128 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse use -.equiv nb430_ix, 0 -.equiv nb430_iy, 16 -.equiv nb430_iz, 32 -.equiv nb430_iq, 48 -.equiv nb430_dx, 64 -.equiv nb430_dy, 80 -.equiv nb430_dz, 96 -.equiv nb430_two, 112 -.equiv nb430_gbtsc, 128 -.equiv nb430_tsc, 144 -.equiv nb430_qq, 160 -.equiv nb430_c6, 176 -.equiv nb430_c12, 192 -.equiv nb430_fscal, 208 -.equiv nb430_vctot, 224 -.equiv nb430_Vvdwtot, 240 -.equiv nb430_fix, 256 -.equiv nb430_fiy, 272 -.equiv nb430_fiz, 288 -.equiv nb430_half, 304 -.equiv nb430_three, 320 -.equiv nb430_r, 336 -.equiv nb430_isai, 352 -.equiv nb430_isaprod, 368 -.equiv nb430_dvdasum, 384 -.equiv nb430_gbscale, 400 -.equiv nb430_ii, 416 -.equiv nb430_is3, 420 -.equiv nb430_ii3, 424 -.equiv nb430_ntia, 428 -.equiv nb430_innerjjnr, 432 -.equiv nb430_innerk, 436 -.equiv nb430_n, 440 -.equiv nb430_nn1, 444 -.equiv nb430_jnra, 448 -.equiv nb430_jnrb, 452 -.equiv nb430_jnrc, 456 -.equiv nb430_jnrd, 460 -.equiv nb430_nri, 464 -.equiv nb430_facel, 468 -.equiv nb430_ntype, 472 -.equiv nb430_nouter, 476 -.equiv nb430_ninner, 480 -.equiv nb430_salign, 484 - push ebp - mov ebp,esp - push eax - push ebx - push ecx - push edx - push esi - push edi - sub esp, 488 ;# local stack space - mov eax, esp - and eax, 0xf - sub esp, eax - mov [esp + nb430_salign], eax - - emms - - ;# Move args passed by reference to stack - mov ecx, [ebp + nb430_p_nri] - mov esi, [ebp + nb430_p_facel] - mov edi, [ebp + nb430_p_ntype] - mov ecx, [ecx] - mov esi, [esi] - mov edi, [edi] - mov [esp + nb430_nri], ecx - mov [esp + nb430_facel], esi - mov [esp + nb430_ntype], edi - - ;# zero iteration counters - mov eax, 0 - mov [esp + nb430_nouter], eax - mov [esp + nb430_ninner], eax - - - mov eax, [ebp + nb430_p_gbtabscale] - movss xmm3, [eax] - mov eax, [ebp + nb430_p_tabscale] - movss xmm4, [eax] - shufps xmm3, xmm3, 0 - shufps xmm4, xmm4, 0 - movaps [esp + nb430_gbtsc], xmm3 - movaps [esp + nb430_tsc], xmm4 - - ;# create constant floating-point factors on stack - mov eax, 0x3f000000 ;# constant 0.5 in IEEE (hex) - mov [esp + nb430_half], eax - movss xmm1, [esp + nb430_half] - shufps xmm1, xmm1, 0 ;# splat to all elements - movaps xmm2, xmm1 - addps xmm2, xmm2 ;# constant 1.0 - movaps xmm3, xmm2 - addps xmm2, xmm2 ;# constant 2.0 - addps xmm3, xmm2 ;# constant 3.0 - movaps [esp + nb430_half], xmm1 - movaps [esp + nb430_two], xmm2 - movaps [esp + nb430_three], xmm3 - -.nb430_threadloop: - mov esi, [ebp + nb430_count] ;# pointer to sync counter - mov eax, [esi] -.nb430_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb430_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [esp + nb430_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [esp + nb430_n], eax - mov [esp + nb430_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb430_outerstart - jmp .nb430_end - -.nb430_outerstart: - ;# ebx contains number of outer iterations - add ebx, [esp + nb430_nouter] - mov [esp + nb430_nouter], ebx - -.nb430_outer: - mov eax, [ebp + nb430_shift] ;# eax = pointer into shift[] - mov ebx, [eax + esi*4] ;# ebx=shift[n] - - lea ebx, [ebx + ebx*2] ;# ebx=3*is - mov [esp + nb430_is3],ebx ;# store is3 - - mov eax, [ebp + nb430_shiftvec] ;# eax = base of shiftvec[] - - movss xmm0, [eax + ebx*4] - movss xmm1, [eax + ebx*4 + 4] - movss xmm2, [eax + ebx*4 + 8] - - mov ecx, [ebp + nb430_iinr] ;# ecx = pointer into iinr[] - mov ebx, [ecx + esi*4] ;# ebx =ii - mov [esp + nb430_ii], ebx - - mov edx, [ebp + nb430_charge] - movss xmm3, [edx + ebx*4] - mulss xmm3, [esp + nb430_facel] - shufps xmm3, xmm3, 0 - - mov edx, [ebp + nb430_invsqrta] ;# load invsqrta[ii] - movss xmm4, [edx + ebx*4] - shufps xmm4, xmm4, 0 - - mov edx, [ebp + nb430_type] - mov edx, [edx + ebx*4] - imul edx, [esp + nb430_ntype] - shl edx, 1 - mov [esp + nb430_ntia], edx - - lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3 - mov eax, [ebp + nb430_pos] ;# eax = base of pos[] - - addss xmm0, [eax + ebx*4] - addss xmm1, [eax + ebx*4 + 4] - addss xmm2, [eax + ebx*4 + 8] - - movaps [esp + nb430_iq], xmm3 - movaps [esp + nb430_isai], xmm4 - - shufps xmm0, xmm0, 0 - shufps xmm1, xmm1, 0 - shufps xmm2, xmm2, 0 - - movaps [esp + nb430_ix], xmm0 - movaps [esp + nb430_iy], xmm1 - movaps [esp + nb430_iz], xmm2 - - mov [esp + nb430_ii3], ebx - - ;# clear vctot and i forces - xorps xmm4, xmm4 - movaps [esp + nb430_vctot], xmm4 - movaps [esp + nb430_Vvdwtot], xmm4 - movaps [esp + nb430_dvdasum], xmm4 - movaps [esp + nb430_fix], xmm4 - movaps [esp + nb430_fiy], xmm4 - movaps [esp + nb430_fiz], xmm4 - - mov eax, [ebp + nb430_jindex] - mov ecx, [eax + esi*4] ;# jindex[n] - mov edx, [eax + esi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov esi, [ebp + nb430_pos] - mov edi, [ebp + nb430_faction] - mov eax, [ebp + nb430_jjnr] - shl ecx, 2 - add eax, ecx - mov [esp + nb430_innerjjnr], eax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 4 - add ecx, [esp + nb430_ninner] - mov [esp + nb430_ninner], ecx - add edx, 0 - mov [esp + nb430_innerk], edx ;# number of innerloop atoms - - jge .nb430_unroll_loop - jmp .nb430_finish_inner -.nb430_unroll_loop: - ;# quad-unroll innerloop here - mov edx, [esp + nb430_innerjjnr] ;# pointer to jjnr[k] - mov eax, [edx] - mov ebx, [edx + 4] - mov ecx, [edx + 8] - mov edx, [edx + 12] ;# eax-edx=jnr1-4 - add dword ptr [esp + nb430_innerjjnr], 16 ;# advance pointer (unrolled 4) - - ;# load isaj - mov esi, [ebp + nb430_invsqrta] - movss xmm3, [esi + eax*4] - movss xmm4, [esi + ecx*4] - movss xmm6, [esi + ebx*4] - movss xmm7, [esi + edx*4] - movaps xmm2, [esp + nb430_isai] - shufps xmm3, xmm6, 0 - shufps xmm4, xmm7, 0 - shufps xmm3, xmm4, 136 ;# constant 10001000 ;# all isaj in xmm3 - mulps xmm2, xmm3 - - movaps [esp + nb430_isaprod], xmm2 - movaps xmm1, xmm2 - mulps xmm1, [esp + nb430_gbtsc] - movaps [esp + nb430_gbscale], xmm1 - - mov esi, [ebp + nb430_charge] ;# base of charge[] - - movss xmm3, [esi + eax*4] - movss xmm4, [esi + ecx*4] - movss xmm6, [esi + ebx*4] - movss xmm7, [esi + edx*4] - - mulps xmm2, [esp + nb430_iq] - shufps xmm3, xmm6, 0 - shufps xmm4, xmm7, 0 - shufps xmm3, xmm4, 136 ;# constant 10001000 ;# all charges in xmm3 - mulps xmm3, xmm2 - movaps [esp + nb430_qq], xmm3 - - movd mm0, eax ;# use mmx registers as temp storage - movd mm1, ebx - movd mm2, ecx - movd mm3, edx - - mov esi, [ebp + nb430_type] - mov eax, [esi + eax*4] - mov ebx, [esi + ebx*4] - mov ecx, [esi + ecx*4] - mov edx, [esi + edx*4] - mov esi, [ebp + nb430_vdwparam] - shl eax, 1 - shl ebx, 1 - shl ecx, 1 - shl edx, 1 - mov edi, [esp + nb430_ntia] - add eax, edi - add ebx, edi - add ecx, edi - add edx, edi - - movlps xmm6, [esi + eax*4] - movlps xmm7, [esi + ecx*4] - movhps xmm6, [esi + ebx*4] - movhps xmm7, [esi + edx*4] - - movaps xmm4, xmm6 - shufps xmm4, xmm7, 136 ;# constant 10001000 - shufps xmm6, xmm7, 221 ;# constant 11011101 - - movd eax, mm0 - movd ebx, mm1 - movd ecx, mm2 - movd edx, mm3 - - movaps [esp + nb430_c6], xmm4 - movaps [esp + nb430_c12], xmm6 - - mov esi, [ebp + nb430_pos] ;# base of pos[] - - mov [esp + nb430_jnra], eax - mov [esp + nb430_jnrb], ebx - mov [esp + nb430_jnrc], ecx - mov [esp + nb430_jnrd], edx - - lea eax, [eax + eax*2] ;# replace jnr with j3 - lea ebx, [ebx + ebx*2] - lea ecx, [ecx + ecx*2] - lea edx, [edx + edx*2] - - ;# move four coordinates to xmm0-xmm2 - - movlps xmm4, [esi + eax*4] - movlps xmm5, [esi + ecx*4] - movss xmm2, [esi + eax*4 + 8] - movss xmm6, [esi + ecx*4 + 8] - - movhps xmm4, [esi + ebx*4] - movhps xmm5, [esi + edx*4] - - movss xmm0, [esi + ebx*4 + 8] - movss xmm1, [esi + edx*4 + 8] - - shufps xmm2, xmm0, 0 - shufps xmm6, xmm1, 0 - - movaps xmm0, xmm4 - movaps xmm1, xmm4 - - shufps xmm2, xmm6, 136 ;# constant 10001000 - - shufps xmm0, xmm5, 136 ;# constant 10001000 - shufps xmm1, xmm5, 221 ;# constant 11011101 - - ;# move ix-iz to xmm4-xmm6 - movaps xmm4, [esp + nb430_ix] - movaps xmm5, [esp + nb430_iy] - movaps xmm6, [esp + nb430_iz] - - ;# calc dr - subps xmm4, xmm0 - subps xmm5, xmm1 - subps xmm6, xmm2 - - ;# store dr - movaps [esp + nb430_dx], xmm4 - movaps [esp + nb430_dy], xmm5 - movaps [esp + nb430_dz], xmm6 - ;# square it - mulps xmm4,xmm4 - mulps xmm5,xmm5 - mulps xmm6,xmm6 - addps xmm4, xmm5 - addps xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [esp + nb430_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [esp + nb430_half] - subps xmm1, xmm5 ;# constant 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - movaps [esp + nb430_r], xmm4 - mulps xmm4, [esp + nb430_gbscale] - - movhlps xmm5, xmm4 - cvttps2pi mm6, xmm4 - cvttps2pi mm7, xmm5 ;# mm6/mm7 contain lu indices - cvtpi2ps xmm6, mm6 - cvtpi2ps xmm5, mm7 - movlhps xmm6, xmm5 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - pslld mm6, 2 - pslld mm7, 2 - - movd mm0, eax - movd mm1, ebx - movd mm2, ecx - movd mm3, edx - - mov esi, [ebp + nb430_GBtab] - movd eax, mm6 - psrlq mm6, 32 - movd ecx, mm7 - psrlq mm7, 32 - movd ebx, mm6 - movd edx, mm7 - - ;# load coulomb table - movaps xmm4, [esi + eax*4] - movaps xmm5, [esi + ebx*4] - movaps xmm6, [esi + ecx*4] - movaps xmm7, [esi + edx*4] - ;# transpose, using xmm3 for scratch - movaps xmm3, xmm6 - shufps xmm3, xmm7, 0xEE - shufps xmm6, xmm7, 0x44 - movaps xmm7, xmm4 - shufps xmm7, xmm5, 0xEE - shufps xmm4, xmm5, 0x44 - movaps xmm5, xmm4 - shufps xmm5, xmm6, 0xDD - shufps xmm4, xmm6, 0x88 - movaps xmm6, xmm7 - shufps xmm6, xmm3, 0x88 - shufps xmm7, xmm3, 0xDD - ;# coulomb table ready, in xmm4-xmm7 - - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - mulps xmm7, [esp + nb430_two] ;# two*Heps2 - movaps xmm3, [esp + nb430_qq] - addps xmm7, xmm6 - addps xmm7, xmm5 ;# xmm7=FF - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, xmm3 ;# vcoul=qq*VV - mulps xmm3, xmm7 ;# fijC=FF*qq - - ;# get jnr from stack - mov eax, [esp + nb430_jnra] - mov ebx, [esp + nb430_jnrb] - mov ecx, [esp + nb430_jnrc] - mov edx, [esp + nb430_jnrd] - - mov esi, [ebp + nb430_dvda] - - ;# Calculate dVda - xorps xmm7, xmm7 - mulps xmm3, [esp + nb430_gbscale] - movaps xmm6, xmm3 - mulps xmm6, [esp + nb430_r] - addps xmm6, xmm5 - addps xmm5, [esp + nb430_vctot] - movaps [esp + nb430_vctot], xmm5 - - ;# xmm6=(vcoul+fijC*r) - subps xmm7, xmm6 - movaps xmm6, xmm7 - - ;# update dvdasum - addps xmm7, [esp + nb430_dvdasum] - movaps [esp + nb430_dvdasum], xmm7 - - ;# update j atoms dvdaj - movhlps xmm7, xmm6 - movaps xmm5, xmm6 - movaps xmm4, xmm7 - shufps xmm5, xmm5, 0x1 - shufps xmm4, xmm4, 0x1 - ;# xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4 - addss xmm6, [esi + eax*4] - addss xmm5, [esi + ebx*4] - addss xmm7, [esi + ecx*4] - addss xmm4, [esi + edx*4] - movss [esi + eax*4], xmm6 - movss [esi + ebx*4], xmm5 - movss [esi + ecx*4], xmm7 - movss [esi + edx*4], xmm4 - - ;# put scalar force on stack temporarily - movaps [esp + nb430_fscal], xmm3 - - movaps xmm4, [esp + nb430_r] - mulps xmm4, [esp + nb430_tsc] - - movhlps xmm5, xmm4 - cvttps2pi mm6, xmm4 - cvttps2pi mm7, xmm5 ;# mm6/mm7 contain lu indices - cvtpi2ps xmm6, mm6 - cvtpi2ps xmm5, mm7 - movlhps xmm6, xmm5 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - pslld mm6, 3 - pslld mm7, 3 - - mov esi, [ebp + nb430_VFtab] - movd eax, mm6 - psrlq mm6, 32 - movd ecx, mm7 - psrlq mm7, 32 - movd ebx, mm6 - movd edx, mm7 - - ;# dispersion - movaps xmm4, [esi + eax*4] - movaps xmm5, [esi + ebx*4] - movaps xmm6, [esi + ecx*4] - movaps xmm7, [esi + edx*4] - ;# transpose, using xmm3 for scratch - movaps xmm3, xmm6 - shufps xmm3, xmm7, 0xEE - shufps xmm6, xmm7, 0x44 - movaps xmm7, xmm4 - shufps xmm7, xmm5, 0xEE - shufps xmm4, xmm5, 0x44 - movaps xmm5, xmm4 - shufps xmm5, xmm6, 0xDD - shufps xmm4, xmm6, 0x88 - movaps xmm6, xmm7 - shufps xmm6, xmm3, 0x88 - shufps xmm7, xmm3, 0xDD - ;# dispersion table ready, in xmm4-xmm7 - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - mulps xmm7, [esp + nb430_two] ;# two*Heps2 - addps xmm7, xmm6 - addps xmm7, xmm5 ;# xmm7=FF - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - - movaps xmm4, [esp + nb430_c6] - mulps xmm7, xmm4 ;# fijD - mulps xmm5, xmm4 ;# Vvdw6 - mulps xmm7, [esp + nb430_tsc] - addps xmm7, [esp + nb430_fscal] ;# add to fscal - - ;# put scalar force on stack Update Vvdwtot directly - addps xmm5, [esp + nb430_Vvdwtot] - movaps [esp + nb430_fscal], xmm7 - movaps [esp + nb430_Vvdwtot], xmm5 - - ;# repulsion - movaps xmm4, [esi + eax*4 + 16] - movaps xmm5, [esi + ebx*4 + 16] - movaps xmm6, [esi + ecx*4 + 16] - movaps xmm7, [esi + edx*4 + 16] - ;# transpose, using xmm3 for scratch - movaps xmm3, xmm6 - shufps xmm3, xmm7, 0xEE - shufps xmm6, xmm7, 0x44 - movaps xmm7, xmm4 - shufps xmm7, xmm5, 0xEE - shufps xmm4, xmm5, 0x44 - movaps xmm5, xmm4 - shufps xmm5, xmm6, 0xDD - shufps xmm4, xmm6, 0x88 - movaps xmm6, xmm7 - shufps xmm6, xmm3, 0x88 - shufps xmm7, xmm3, 0xDD - ;# table ready, in xmm4-xmm7 - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - mulps xmm7, [esp + nb430_two] ;# two*Heps2 - addps xmm7, xmm6 - addps xmm7, xmm5 ;# xmm7=FF - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - - movaps xmm4, [esp + nb430_c12] - mulps xmm7, xmm4 ;# fijR - mulps xmm5, xmm4 ;# Vvdw12 - mulps xmm7, [esp + nb430_tsc] - addps xmm7, [esp + nb430_fscal] - - addps xmm5, [esp + nb430_Vvdwtot] - movaps [esp + nb430_Vvdwtot], xmm5 - xorps xmm4, xmm4 - - mulps xmm7, xmm0 - subps xmm4, xmm7 - - movaps xmm0, [esp + nb430_dx] - movaps xmm1, [esp + nb430_dy] - movaps xmm2, [esp + nb430_dz] - - movd eax, mm0 - movd ebx, mm1 - movd ecx, mm2 - movd edx, mm3 - - mov edi, [ebp + nb430_faction] - mulps xmm0, xmm4 - mulps xmm1, xmm4 - mulps xmm2, xmm4 - ;# xmm0-xmm2 contains tx-tz (partial force) - ;# now update f_i - movaps xmm3, [esp + nb430_fix] - movaps xmm4, [esp + nb430_fiy] - movaps xmm5, [esp + nb430_fiz] - addps xmm3, xmm0 - addps xmm4, xmm1 - addps xmm5, xmm2 - movaps [esp + nb430_fix], xmm3 - movaps [esp + nb430_fiy], xmm4 - movaps [esp + nb430_fiz], xmm5 - ;# the fj's - start by accumulating x & y forces from memory - movlps xmm4, [edi + eax*4] - movlps xmm6, [edi + ecx*4] - movhps xmm4, [edi + ebx*4] - movhps xmm6, [edi + edx*4] - - movaps xmm3, xmm4 - shufps xmm3, xmm6, 136 ;# constant 10001000 - shufps xmm4, xmm6, 221 ;# constant 11011101 - - ;# now xmm3-xmm5 contains fjx, fjy, fjz - subps xmm3, xmm0 - subps xmm4, xmm1 - - ;# unpack them back so we can store them - first x & y in xmm3/xmm4 - - movaps xmm6, xmm3 - unpcklps xmm6, xmm4 - unpckhps xmm3, xmm4 - ;# xmm6(l)=x & y for j1, (h) for j2 - ;# xmm3(l)=x & y for j3, (h) for j4 - movlps [edi + eax*4], xmm6 - movlps [edi + ecx*4], xmm3 - - movhps [edi + ebx*4], xmm6 - movhps [edi + edx*4], xmm3 - - ;# and the z forces - movss xmm4, [edi + eax*4 + 8] - movss xmm5, [edi + ebx*4 + 8] - movss xmm6, [edi + ecx*4 + 8] - movss xmm7, [edi + edx*4 + 8] - subss xmm4, xmm2 - shufps xmm2, xmm2, 229 ;# constant 11100101 - subss xmm5, xmm2 - shufps xmm2, xmm2, 234 ;# constant 11101010 - subss xmm6, xmm2 - shufps xmm2, xmm2, 255 ;# constant 11111111 - subss xmm7, xmm2 - movss [edi + eax*4 + 8], xmm4 - movss [edi + ebx*4 + 8], xmm5 - movss [edi + ecx*4 + 8], xmm6 - movss [edi + edx*4 + 8], xmm7 - - ;# should we do one more iteration? - sub dword ptr [esp + nb430_innerk], 4 - jl .nb430_finish_inner - jmp .nb430_unroll_loop -.nb430_finish_inner: - ;# check if at least two particles remain - add dword ptr [esp + nb430_innerk], 4 - mov edx, [esp + nb430_innerk] - and edx, 2 - jnz .nb430_dopair - jmp .nb430_checksingle -.nb430_dopair: - - mov ecx, [esp + nb430_innerjjnr] - - mov eax, [ecx] - mov ebx, [ecx + 4] - add dword ptr [esp + nb430_innerjjnr], 8 - - xorps xmm2, xmm2 - movaps xmm6, xmm2 - - ;# load isaj - mov esi, [ebp + nb430_invsqrta] - movss xmm2, [esi + eax*4] - movss xmm3, [esi + ebx*4] - unpcklps xmm2, xmm3 ;# isaj in xmm3(0,1) - mulps xmm2, [esp + nb430_isai] - movaps [esp + nb430_isaprod], xmm2 - movaps xmm1, xmm2 - mulps xmm1, [esp + nb430_gbtsc] - movaps [esp + nb430_gbscale], xmm1 - - mov esi, [ebp + nb430_charge] ;# base of charge[] - movss xmm3, [esi + eax*4] - movss xmm6, [esi + ebx*4] - unpcklps xmm3, xmm6 ;# constant 00001000 ;# xmm3(0,1) has the charges - - mulps xmm2, [esp + nb430_iq] - mulps xmm3, xmm2 - movaps [esp + nb430_qq], xmm3 - - mov esi, [ebp + nb430_type] - mov ecx, eax - mov edx, ebx - mov ecx, [esi + ecx*4] - mov edx, [esi + edx*4] - mov esi, [ebp + nb430_vdwparam] - shl ecx, 1 - shl edx, 1 - mov edi, [esp + nb430_ntia] - add ecx, edi - add edx, edi - movlps xmm6, [esi + ecx*4] - movhps xmm6, [esi + edx*4] - mov edi, [ebp + nb430_pos] - - movaps xmm4, xmm6 - shufps xmm4, xmm4, 8 ;# constant 00001000 - shufps xmm6, xmm6, 13 ;# constant 00001101 - movlhps xmm4, xmm7 - movlhps xmm6, xmm7 - - movaps [esp + nb430_c6], xmm4 - movaps [esp + nb430_c12], xmm6 - - movd mm0, eax ;# copy jnr to mm0/mm1 - movd mm1, ebx - - lea eax, [eax + eax*2] - lea ebx, [ebx + ebx*2] - ;# move coordinates to xmm0-xmm2 - movlps xmm1, [edi + eax*4] - movss xmm2, [edi + eax*4 + 8] - movhps xmm1, [edi + ebx*4] - movss xmm0, [edi + ebx*4 + 8] - - movlhps xmm3, xmm7 - - shufps xmm2, xmm0, 0 - - movaps xmm0, xmm1 - - shufps xmm2, xmm2, 136 ;# constant 10001000 - - shufps xmm0, xmm0, 136 ;# constant 10001000 - shufps xmm1, xmm1, 221 ;# constant 11011101 - - mov edi, [ebp + nb430_faction] - ;# move ix-iz to xmm4-xmm6 - xorps xmm7, xmm7 - - movaps xmm4, [esp + nb430_ix] - movaps xmm5, [esp + nb430_iy] - movaps xmm6, [esp + nb430_iz] - - ;# calc dr - subps xmm4, xmm0 - subps xmm5, xmm1 - subps xmm6, xmm2 - - ;# store dr - movaps [esp + nb430_dx], xmm4 - movaps [esp + nb430_dy], xmm5 - movaps [esp + nb430_dz], xmm6 - ;# square it - mulps xmm4,xmm4 - mulps xmm5,xmm5 - mulps xmm6,xmm6 - addps xmm4, xmm5 - addps xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [esp + nb430_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [esp + nb430_half] - subps xmm1, xmm5 ;# constant 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - movaps [esp + nb430_r], xmm4 - mulps xmm4, [esp + nb430_gbscale] - - cvttps2pi mm6, xmm4 ;# mm6 contain lu indices - cvtpi2ps xmm6, mm6 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - - pslld mm6, 2 - - mov esi, [ebp + nb430_GBtab] - movd ecx, mm6 - psrlq mm6, 32 - movd edx, mm6 - - ;# load coulomb table - movaps xmm4, [esi + ecx*4] - movaps xmm7, [esi + edx*4] - ;# transpose, using xmm3 for scratch - movaps xmm6, xmm4 - unpcklps xmm4, xmm7 ;# Y1 Y2 F1 F2 - unpckhps xmm6, xmm7 ;# G1 G2 H1 H2 - movhlps xmm5, xmm4 ;# F1 F2 - movhlps xmm7, xmm6 ;# H1 H2 - ;# coulomb table ready, in xmm4-xmm7 - - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - mulps xmm7, [esp + nb430_two] ;# two*Heps2 - movaps xmm3, [esp + nb430_qq] - addps xmm7, xmm6 - addps xmm7, xmm5 ;# xmm7=FF - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, xmm3 ;# vcoul=qq*VV - mulps xmm3, xmm7 ;# fijC=FF*qq - - ;# get jnr from mm0/mm1 - movd ecx, mm0 - movd edx, mm1 - - mov esi, [ebp + nb430_dvda] - - ;# Calculate dVda - xorps xmm7, xmm7 - mulps xmm3, [esp + nb430_gbscale] - movaps xmm6, xmm3 - mulps xmm6, [esp + nb430_r] - addps xmm6, xmm5 - addps xmm5, [esp + nb430_vctot] - movaps [esp + nb430_vctot], xmm5 - - ;# xmm6=(vcoul+fijC*r) - subps xmm7, xmm6 - movaps xmm6, xmm7 - - ;# update dvdasum - addps xmm7, [esp + nb430_dvdasum] - movaps [esp + nb430_dvdasum], xmm7 - - ;# update j atoms dvdaj - movaps xmm7, xmm6 - shufps xmm7, xmm7, 0x1 - addss xmm6, [esi + ecx*4] - addss xmm7, [esi + edx*4] - movss [esi + ecx*4], xmm6 - movss [esi + edx*4], xmm7 - - ;# put scalar force on stack temporarily - movaps [esp + nb430_fscal], xmm3 - - movaps xmm4, [esp + nb430_r] - mulps xmm4, [esp + nb430_tsc] - - cvttps2pi mm6, xmm4 - cvtpi2ps xmm6, mm6 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - pslld mm6, 3 - - mov esi, [ebp + nb430_VFtab] - movd ecx, mm6 - psrlq mm6, 32 - movd edx, mm6 - - ;# dispersion - movaps xmm4, [esi + ecx*4] - movaps xmm7, [esi + edx*4] - ;# transpose, using xmm3 for scratch - movaps xmm6, xmm4 - unpcklps xmm4, xmm7 ;# Y1 Y2 F1 F2 - unpckhps xmm6, xmm7 ;# G1 G2 H1 H2 - movhlps xmm5, xmm4 ;# F1 F2 - movhlps xmm7, xmm6 ;# H1 H2 - ;# dispersion table ready, in xmm4-xmm7 - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - mulps xmm7, [esp + nb430_two] ;# two*Heps2 - addps xmm7, xmm6 - addps xmm7, xmm5 ;# xmm7=FF - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - - movaps xmm4, [esp + nb430_c6] - mulps xmm7, xmm4 ;# fijD - mulps xmm5, xmm4 ;# Vvdw6 - mulps xmm7, [esp + nb430_tsc] - addps xmm7, [esp + nb430_fscal] ;# add to fscal - - ;# put scalar force on stack Update Vvdwtot directly - addps xmm5, [esp + nb430_Vvdwtot] - movaps [esp + nb430_fscal], xmm7 - movaps [esp + nb430_Vvdwtot], xmm5 - - ;# repulsion - movaps xmm4, [esi + ecx*4 + 16] - movaps xmm7, [esi + edx*4 + 16] - ;# transpose, using xmm3 for scratch - movaps xmm6, xmm4 - unpcklps xmm4, xmm7 ;# Y1 Y2 F1 F2 - unpckhps xmm6, xmm7 ;# G1 G2 H1 H2 - movhlps xmm5, xmm4 ;# F1 F2 - movhlps xmm7, xmm6 ;# H1 H2 - ;# table ready, in xmm4-xmm7 - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - mulps xmm7, [esp + nb430_two] ;# two*Heps2 - addps xmm7, xmm6 - addps xmm7, xmm5 ;# xmm7=FF - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - - movaps xmm4, [esp + nb430_c12] - mulps xmm7, xmm4 ;# fijR - mulps xmm5, xmm4 ;# Vvdw12 - mulps xmm7, [esp + nb430_tsc] - addps xmm7, [esp + nb430_fscal] - - addps xmm5, [esp + nb430_Vvdwtot] - movaps [esp + nb430_Vvdwtot], xmm5 - xorps xmm4, xmm4 - - mulps xmm7, xmm0 - subps xmm4, xmm7 - - movaps xmm0, [esp + nb430_dx] - movaps xmm1, [esp + nb430_dy] - movaps xmm2, [esp + nb430_dz] - - mulps xmm0, xmm4 - mulps xmm1, xmm4 - mulps xmm2, xmm4 - ;# xmm0-xmm2 contains tx-tz (partial force) - ;# now update f_i - movaps xmm3, [esp + nb430_fix] - movaps xmm4, [esp + nb430_fiy] - movaps xmm5, [esp + nb430_fiz] - addps xmm3, xmm0 - addps xmm4, xmm1 - addps xmm5, xmm2 - movaps [esp + nb430_fix], xmm3 - movaps [esp + nb430_fiy], xmm4 - movaps [esp + nb430_fiz], xmm5 - ;# update the fj's - movss xmm3, [edi + eax*4] - movss xmm4, [edi + eax*4 + 4] - movss xmm5, [edi + eax*4 + 8] - subss xmm3, xmm0 - subss xmm4, xmm1 - subss xmm5, xmm2 - movss [edi + eax*4], xmm3 - movss [edi + eax*4 + 4], xmm4 - movss [edi + eax*4 + 8], xmm5 - - shufps xmm0, xmm0, 225 ;# constant 11100001 - shufps xmm1, xmm1, 225 ;# constant 11100001 - shufps xmm2, xmm2, 225 ;# constant 11100001 - - movss xmm3, [edi + ebx*4] - movss xmm4, [edi + ebx*4 + 4] - movss xmm5, [edi + ebx*4 + 8] - subss xmm3, xmm0 - subss xmm4, xmm1 - subss xmm5, xmm2 - movss [edi + ebx*4], xmm3 - movss [edi + ebx*4 + 4], xmm4 - movss [edi + ebx*4 + 8], xmm5 - -.nb430_checksingle: - mov edx, [esp + nb430_innerk] - and edx, 1 - jnz .nb430_dosingle - jmp .nb430_updateouterdata -.nb430_dosingle: - mov esi, [ebp + nb430_charge] - mov edx, [ebp + nb430_invsqrta] - mov edi, [ebp + nb430_pos] - mov ecx, [esp + nb430_innerjjnr] - mov eax, [ecx] - xorps xmm2, xmm2 - movaps xmm6, xmm2 - movss xmm2, [edx + eax*4] ;# isaj - mulss xmm2, [esp + nb430_isai] - movss [esp + nb430_isaprod], xmm2 - movss xmm1, xmm2 - mulss xmm1, [esp + nb430_gbtsc] - movss [esp + nb430_gbscale], xmm1 - - mulss xmm2, [esp + nb430_iq] - movss xmm6, [esi + eax*4] ;# xmm6(0) has the charge - mulss xmm6, xmm2 - movss [esp + nb430_qq], xmm6 - - mov esi, [ebp + nb430_type] - mov ecx, eax - mov ecx, [esi + ecx*4] - mov esi, [ebp + nb430_vdwparam] - shl ecx, 1 - add ecx, [esp + nb430_ntia] - movlps xmm6, [esi + ecx*4] - movaps xmm4, xmm6 - shufps xmm4, xmm4, 252 ;# constant 11111100 - shufps xmm6, xmm6, 253 ;# constant 11111101 - - movss [esp + nb430_c6], xmm4 - movss [esp + nb430_c12], xmm6 - - movd mm0, eax - lea eax, [eax + eax*2] - - ;# move coordinates to xmm0-xmm2 - movss xmm0, [edi + eax*4] - movss xmm1, [edi + eax*4 + 4] - movss xmm2, [edi + eax*4 + 8] - - movss xmm4, [esp + nb430_ix] - movss xmm5, [esp + nb430_iy] - movss xmm6, [esp + nb430_iz] - - ;# calc dr - subss xmm4, xmm0 - subss xmm5, xmm1 - subss xmm6, xmm2 - - ;# store dr - movaps [esp + nb430_dx], xmm4 - movaps [esp + nb430_dy], xmm5 - movaps [esp + nb430_dz], xmm6 - ;# square it - mulss xmm4,xmm4 - mulss xmm5,xmm5 - mulss xmm6,xmm6 - addss xmm4, xmm5 - addss xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtss xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulss xmm5, xmm5 - movss xmm1, [esp + nb430_three] - mulss xmm5, xmm4 ;# rsq*lu*lu - movss xmm0, [esp + nb430_half] - subss xmm1, xmm5 ;# constant 30-rsq*lu*lu - mulss xmm1, xmm2 - mulss xmm0, xmm1 ;# xmm0=rinv - - mulss xmm4, xmm0 ;# xmm4=r - movss [esp + nb430_r], xmm4 - mulss xmm4, [esp + nb430_gbscale] - - cvttss2si ebx, xmm4 ;# mm6 contain lu indices - cvtsi2ss xmm6, ebx - subss xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulss xmm2, xmm2 ;# xmm2=eps2 - - shl ebx, 2 - - mov esi, [ebp + nb430_GBtab] - - movaps xmm4, [esi + ebx*4] - movhlps xmm6, xmm4 - movaps xmm5, xmm4 - movaps xmm7, xmm6 - shufps xmm5, xmm5, 1 - shufps xmm7, xmm7, 1 - ;# table ready in xmm4-xmm7 - - mulss xmm6, xmm1 ;# xmm6=Geps - mulss xmm7, xmm2 ;# xmm7=Heps2 - addss xmm5, xmm6 - addss xmm5, xmm7 ;# xmm5=Fp - mulss xmm7, [esp + nb430_two] ;# two*Heps2 - movss xmm3, [esp + nb430_qq] - addss xmm7, xmm6 - addss xmm7, xmm5 ;# xmm7=FF - mulss xmm5, xmm1 ;# xmm5=eps*Fp - addss xmm5, xmm4 ;# xmm5=VV - mulss xmm5, xmm3 ;# vcoul=qq*VV - mulss xmm3, xmm7 ;# fijC=FF*qq - - movd ebx, mm0 - mov esi, [ebp + nb430_dvda] - - ;# Calculate dVda - xorps xmm7, xmm7 - mulss xmm3, [esp + nb430_gbscale] - movaps xmm6, xmm3 - mulss xmm6, [esp + nb430_r] - addss xmm6, xmm5 - addss xmm5, [esp + nb430_vctot] - movss [esp + nb430_vctot], xmm5 - - - ;# xmm6=(vcoul+fijC*r) - subss xmm7, xmm6 - movaps xmm6, xmm7 - - ;# update dvdasum - addss xmm7, [esp + nb430_dvdasum] - movaps [esp + nb430_dvdasum], xmm7 - - ;# update j atoms dvdaj - addss xmm6, [esi + ebx*4] - movss [esi + ebx*4], xmm6 - - ;# put scalar force on stack temporarily - movss [esp + nb430_fscal], xmm3 - - movss xmm4, [esp + nb430_r] - mulps xmm4, [esp + nb430_tsc] - - cvttss2si ebx, xmm4 - cvtsi2ss xmm6, ebx - subss xmm4, xmm6 - movss xmm1, xmm4 ;# xmm1=eps - movss xmm2, xmm1 - mulss xmm2, xmm2 ;# xmm2=eps2 - - shl ebx, 3 - mov esi, [ebp + nb430_VFtab] - - ;# dispersion - movaps xmm4, [esi + ebx*4] - movhlps xmm6, xmm4 - movaps xmm5, xmm4 - movaps xmm7, xmm6 - shufps xmm5, xmm5, 1 - shufps xmm7, xmm7, 1 - ;# table ready in xmm4-xmm7 - - mulss xmm6, xmm1 ;# xmm6=Geps - mulss xmm7, xmm2 ;# xmm7=Heps2 - addss xmm5, xmm6 - addss xmm5, xmm7 ;# xmm5=Fp - mulss xmm7, [esp + nb430_two] ;# two*Heps2 - addss xmm7, xmm6 - addss xmm7, xmm5 ;# xmm7=FF - mulss xmm5, xmm1 ;# xmm5=eps*Fp - addss xmm5, xmm4 ;# xmm5=VV - - movss xmm4, [esp + nb430_c6] - mulss xmm7, xmm4 ;# fijD - mulss xmm5, xmm4 ;# Vvdw6 - mulps xmm7, [esp + nb430_tsc] - addss xmm7, [esp + nb430_fscal] ;# add to fscal - - ;# put scalar force on stack Update Vvdwtot directly - addss xmm5, [esp + nb430_Vvdwtot] - movss [esp + nb430_fscal], xmm7 - movss [esp + nb430_Vvdwtot], xmm5 - - ;# repulsion - movaps xmm4, [esi + ebx*4 + 16] - movhlps xmm6, xmm4 - movaps xmm5, xmm4 - movaps xmm7, xmm6 - shufps xmm5, xmm5, 1 - shufps xmm7, xmm7, 1 - ;# table ready in xmm4-xmm7 - - mulss xmm6, xmm1 ;# xmm6=Geps - mulss xmm7, xmm2 ;# xmm7=Heps2 - addss xmm5, xmm6 - addss xmm5, xmm7 ;# xmm5=Fp - mulss xmm7, [esp + nb430_two] ;# two*Heps2 - addss xmm7, xmm6 - addss xmm7, xmm5 ;# xmm7=FF - mulss xmm5, xmm1 ;# xmm5=eps*Fp - addss xmm5, xmm4 ;# xmm5=VV - - movss xmm4, [esp + nb430_c12] - mulss xmm7, xmm4 ;# fijR - mulss xmm5, xmm4 ;# Vvdw12 - mulps xmm7, [esp + nb430_tsc] - addss xmm7, [esp + nb430_fscal] - - addss xmm5, [esp + nb430_Vvdwtot] - movss [esp + nb430_Vvdwtot], xmm5 - xorps xmm4, xmm4 - - mulss xmm7, xmm0 - subss xmm4, xmm7 - mov edi, [ebp + nb430_faction] - - movss xmm0, [esp + nb430_dx] - movss xmm1, [esp + nb430_dy] - movss xmm2, [esp + nb430_dz] - - mulss xmm0, xmm4 - mulss xmm1, xmm4 - mulss xmm2, xmm4 - ;# xmm0-xmm2 contains tx-tz (partial force) - ;# now update f_i - movss xmm3, [esp + nb430_fix] - movss xmm4, [esp + nb430_fiy] - movss xmm5, [esp + nb430_fiz] - addss xmm3, xmm0 - addss xmm4, xmm1 - addss xmm5, xmm2 - movss [esp + nb430_fix], xmm3 - movss [esp + nb430_fiy], xmm4 - movss [esp + nb430_fiz], xmm5 - ;# update fj - - movss xmm3, [edi + eax*4] - movss xmm4, [edi + eax*4 + 4] - movss xmm5, [edi + eax*4 + 8] - subss xmm3, xmm0 - subss xmm4, xmm1 - subss xmm5, xmm2 - movss [edi + eax*4], xmm3 - movss [edi + eax*4 + 4], xmm4 - movss [edi + eax*4 + 8], xmm5 -.nb430_updateouterdata: - mov ecx, [esp + nb430_ii3] - mov edi, [ebp + nb430_faction] - mov esi, [ebp + nb430_fshift] - mov edx, [esp + nb430_is3] - - ;# accumulate i forces in xmm0, xmm1, xmm2 - movaps xmm0, [esp + nb430_fix] - movaps xmm1, [esp + nb430_fiy] - movaps xmm2, [esp + nb430_fiz] - - movhlps xmm3, xmm0 - movhlps xmm4, xmm1 - movhlps xmm5, xmm2 - addps xmm0, xmm3 - addps xmm1, xmm4 - addps xmm2, xmm5 ;# sum is in 1/2 in xmm0-xmm2 - - movaps xmm3, xmm0 - movaps xmm4, xmm1 - movaps xmm5, xmm2 - - shufps xmm3, xmm3, 1 - shufps xmm4, xmm4, 1 - shufps xmm5, xmm5, 1 - addss xmm0, xmm3 - addss xmm1, xmm4 - addss xmm2, xmm5 ;# xmm0-xmm2 has single force in pos0 - - ;# increment i force - movss xmm3, [edi + ecx*4] - movss xmm4, [edi + ecx*4 + 4] - movss xmm5, [edi + ecx*4 + 8] - addss xmm3, xmm0 - addss xmm4, xmm1 - addss xmm5, xmm2 - movss [edi + ecx*4], xmm3 - movss [edi + ecx*4 + 4], xmm4 - movss [edi + ecx*4 + 8], xmm5 - - ;# increment fshift force - movss xmm3, [esi + edx*4] - movss xmm4, [esi + edx*4 + 4] - movss xmm5, [esi + edx*4 + 8] - addss xmm3, xmm0 - addss xmm4, xmm1 - addss xmm5, xmm2 - movss [esi + edx*4], xmm3 - movss [esi + edx*4 + 4], xmm4 - movss [esi + edx*4 + 8], xmm5 - - ;# get n from stack - mov esi, [esp + nb430_n] - ;# get group index for i particle - mov edx, [ebp + nb430_gid] ;# base of gid[] - mov edx, [edx + esi*4] ;# ggid=gid[n] - - ;# accumulate total potential energy and update it - movaps xmm7, [esp + nb430_vctot] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - ;# add earlier value from mem - mov eax, [ebp + nb430_Vc] - addss xmm7, [eax + edx*4] - ;# move back to mem - movss [eax + edx*4], xmm7 - - ;# accumulate total lj energy and update it - movaps xmm7, [esp + nb430_Vvdwtot] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - ;# add earlier value from mem - mov eax, [ebp + nb430_Vvdw] - addss xmm7, [eax + edx*4] - ;# move back to mem - movss [eax + edx*4], xmm7 - - ;# accumulate dVda and update it - movaps xmm7, [esp + nb430_dvdasum] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - mov edx, [esp + nb430_ii] - mov eax, [ebp + nb430_dvda] - addss xmm7, [eax + edx*4] - movss [eax + edx*4], xmm7 - - ;# finish if last - mov ecx, [esp + nb430_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb430_outerend - - ;# not last, iterate outer loop once more! - mov [esp + nb430_n], esi - jmp .nb430_outer -.nb430_outerend: - ;# check if more outer neighborlists remain - mov ecx, [esp + nb430_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb430_end - ;# non-zero, do one more workunit - jmp .nb430_threadloop -.nb430_end: - emms - - mov eax, [esp + nb430_nouter] - mov ebx, [esp + nb430_ninner] - mov ecx, [ebp + nb430_outeriter] - mov edx, [ebp + nb430_inneriter] - mov [ecx], eax - mov [edx], ebx - - mov eax, [esp + nb430_salign] - add esp, eax - add esp, 488 - pop edi - pop esi - pop edx - pop ecx - pop ebx - pop eax - leave - ret - - - - - - - -.globl nb_kernel430nf_ia32_sse -.globl _nb_kernel430nf_ia32_sse -nb_kernel430nf_ia32_sse: -_nb_kernel430nf_ia32_sse: -.equiv nb430nf_p_nri, 8 -.equiv nb430nf_iinr, 12 -.equiv nb430nf_jindex, 16 -.equiv nb430nf_jjnr, 20 -.equiv nb430nf_shift, 24 -.equiv nb430nf_shiftvec, 28 -.equiv nb430nf_fshift, 32 -.equiv nb430nf_gid, 36 -.equiv nb430nf_pos, 40 -.equiv nb430nf_faction, 44 -.equiv nb430nf_charge, 48 -.equiv nb430nf_p_facel, 52 -.equiv nb430nf_argkrf, 56 -.equiv nb430nf_argcrf, 60 -.equiv nb430nf_Vc, 64 -.equiv nb430nf_type, 68 -.equiv nb430nf_p_ntype, 72 -.equiv nb430nf_vdwparam, 76 -.equiv nb430nf_Vvdw, 80 -.equiv nb430nf_p_tabscale, 84 -.equiv nb430nf_VFtab, 88 -.equiv nb430nf_invsqrta, 92 -.equiv nb430nf_dvda, 96 -.equiv nb430nf_p_gbtabscale, 100 -.equiv nb430nf_GBtab, 104 -.equiv nb430nf_p_nthreads, 108 -.equiv nb430nf_count, 112 -.equiv nb430nf_mtx, 116 -.equiv nb430nf_outeriter, 120 -.equiv nb430nf_inneriter, 124 -.equiv nb430nf_work, 128 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse use -.equiv nb430nf_ix, 0 -.equiv nb430nf_iy, 16 -.equiv nb430nf_iz, 32 -.equiv nb430nf_iq, 48 -.equiv nb430nf_gbtsc, 64 -.equiv nb430nf_tsc, 80 -.equiv nb430nf_qq, 96 -.equiv nb430nf_c6, 112 -.equiv nb430nf_c12, 128 -.equiv nb430nf_vctot, 144 -.equiv nb430nf_Vvdwtot, 160 -.equiv nb430nf_half, 176 -.equiv nb430nf_three, 192 -.equiv nb430nf_isai, 208 -.equiv nb430nf_isaprod, 224 -.equiv nb430nf_gbscale, 240 -.equiv nb430nf_r, 256 -.equiv nb430nf_is3, 272 -.equiv nb430nf_ii3, 276 -.equiv nb430nf_ntia, 280 -.equiv nb430nf_innerjjnr, 284 -.equiv nb430nf_innerk, 288 -.equiv nb430nf_n, 292 -.equiv nb430nf_nn1, 296 -.equiv nb430nf_nri, 300 -.equiv nb430nf_facel, 304 -.equiv nb430nf_ntype, 308 -.equiv nb430nf_nouter, 312 -.equiv nb430nf_ninner, 316 -.equiv nb430nf_salign, 320 - push ebp - mov ebp,esp - push eax - push ebx - push ecx - push edx - push esi - push edi - sub esp, 324 ;# local stack space - mov eax, esp - and eax, 0xf - sub esp, eax - mov [esp + nb430nf_salign], eax - - emms - - ;# Move args passed by reference to stack - mov ecx, [ebp + nb430nf_p_nri] - mov esi, [ebp + nb430nf_p_facel] - mov edi, [ebp + nb430nf_p_ntype] - mov ecx, [ecx] - mov esi, [esi] - mov edi, [edi] - mov [esp + nb430nf_nri], ecx - mov [esp + nb430nf_facel], esi - mov [esp + nb430nf_ntype], edi - - ;# zero iteration counters - mov eax, 0 - mov [esp + nb430nf_nouter], eax - mov [esp + nb430nf_ninner], eax - - - mov eax, [ebp + nb430nf_p_gbtabscale] - movss xmm3, [eax] - mov eax, [ebp + nb430nf_p_tabscale] - movss xmm4, [eax] - shufps xmm3, xmm3, 0 - shufps xmm4, xmm4, 0 - movaps [esp + nb430nf_gbtsc], xmm3 - movaps [esp + nb430nf_tsc], xmm4 - - ;# create constant floating-point factors on stack - mov eax, 0x3f000000 ;# constant 0.5 in IEEE (hex) - mov [esp + nb430nf_half], eax - movss xmm1, [esp + nb430nf_half] - shufps xmm1, xmm1, 0 ;# splat to all elements - movaps xmm2, xmm1 - addps xmm2, xmm2 ;# constant 1.0 - movaps xmm3, xmm2 - addps xmm2, xmm2 ;# constant 2.0 - addps xmm3, xmm2 ;# constant 3.0 - movaps [esp + nb430nf_half], xmm1 - movaps [esp + nb430nf_three], xmm3 - -.nb430nf_threadloop: - mov esi, [ebp + nb430nf_count] ;# pointer to sync counter - mov eax, [esi] -.nb430nf_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb430nf_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [esp + nb430nf_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [esp + nb430nf_n], eax - mov [esp + nb430nf_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb430nf_outerstart - jmp .nb430nf_end - -.nb430nf_outerstart: - ;# ebx contains number of outer iterations - add ebx, [esp + nb430nf_nouter] - mov [esp + nb430nf_nouter], ebx - -.nb430nf_outer: - mov eax, [ebp + nb430nf_shift] ;# eax = pointer into shift[] - mov ebx, [eax + esi*4] ;# ebx=shift[n] - - lea ebx, [ebx + ebx*2] ;# ebx=3*is - mov [esp + nb430nf_is3],ebx ;# store is3 - - mov eax, [ebp + nb430nf_shiftvec] ;# eax = base of shiftvec[] - - movss xmm0, [eax + ebx*4] - movss xmm1, [eax + ebx*4 + 4] - movss xmm2, [eax + ebx*4 + 8] - - mov ecx, [ebp + nb430nf_iinr] ;# ecx = pointer into iinr[] - mov ebx, [ecx + esi*4] ;# ebx =ii - - mov edx, [ebp + nb430nf_charge] - movss xmm3, [edx + ebx*4] - mulss xmm3, [esp + nb430nf_facel] - shufps xmm3, xmm3, 0 - - mov edx, [ebp + nb430nf_invsqrta] ;# load invsqrta[ii] - movss xmm4, [edx + ebx*4] - shufps xmm4, xmm4, 0 - - mov edx, [ebp + nb430nf_type] - mov edx, [edx + ebx*4] - imul edx, [esp + nb430nf_ntype] - shl edx, 1 - mov [esp + nb430nf_ntia], edx - - lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3 - mov eax, [ebp + nb430nf_pos] ;# eax = base of pos[] - - addss xmm0, [eax + ebx*4] - addss xmm1, [eax + ebx*4 + 4] - addss xmm2, [eax + ebx*4 + 8] - - movaps [esp + nb430nf_iq], xmm3 - movaps [esp + nb430nf_isai], xmm4 - - shufps xmm0, xmm0, 0 - shufps xmm1, xmm1, 0 - shufps xmm2, xmm2, 0 - - movaps [esp + nb430nf_ix], xmm0 - movaps [esp + nb430nf_iy], xmm1 - movaps [esp + nb430nf_iz], xmm2 - - mov [esp + nb430nf_ii3], ebx - - ;# clear vctot - xorps xmm4, xmm4 - movaps [esp + nb430nf_vctot], xmm4 - movaps [esp + nb430nf_Vvdwtot], xmm4 - - mov eax, [ebp + nb430nf_jindex] - mov ecx, [eax + esi*4] ;# jindex[n] - mov edx, [eax + esi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov esi, [ebp + nb430nf_pos] - mov edi, [ebp + nb430nf_faction] - mov eax, [ebp + nb430nf_jjnr] - shl ecx, 2 - add eax, ecx - mov [esp + nb430nf_innerjjnr], eax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 4 - add ecx, [esp + nb430nf_ninner] - mov [esp + nb430nf_ninner], ecx - add edx, 0 - mov [esp + nb430nf_innerk], edx ;# number of innerloop atoms - jge .nb430nf_unroll_loop - jmp .nb430nf_finish_inner -.nb430nf_unroll_loop: - ;# quad-unroll innerloop here - mov edx, [esp + nb430nf_innerjjnr] ;# pointer to jjnr[k] - mov eax, [edx] - mov ebx, [edx + 4] - mov ecx, [edx + 8] - mov edx, [edx + 12] ;# eax-edx=jnr1-4 - add dword ptr [esp + nb430nf_innerjjnr], 16 ;# advance pointer (unrolled 4) - - ;# load isa2 - mov esi, [ebp + nb430nf_invsqrta] - movss xmm3, [esi + eax*4] - movss xmm4, [esi + ecx*4] - movss xmm6, [esi + ebx*4] - movss xmm7, [esi + edx*4] - movaps xmm2, [esp + nb430nf_isai] - shufps xmm3, xmm6, 0 - shufps xmm4, xmm7, 0 - shufps xmm3, xmm4, 136 ;# constant 10001000 ;# all charges in xmm3 - mulps xmm2, xmm3 - - movaps [esp + nb430nf_isaprod], xmm2 - movaps xmm1, xmm2 - mulps xmm1, [esp + nb430nf_gbtsc] - movaps [esp + nb430nf_gbscale], xmm1 - - mov esi, [ebp + nb430nf_charge] ;# base of charge[] - - movss xmm3, [esi + eax*4] - movss xmm4, [esi + ecx*4] - movss xmm6, [esi + ebx*4] - movss xmm7, [esi + edx*4] - - mulps xmm2, [esp + nb430nf_iq] - shufps xmm3, xmm6, 0 - shufps xmm4, xmm7, 0 - shufps xmm3, xmm4, 136 ;# constant 10001000 ;# all charges in xmm3 - mulps xmm3, xmm2 - movaps [esp + nb430nf_qq], xmm3 - - movd mm0, eax ;# use mmx registers as temp storage - movd mm1, ebx - movd mm2, ecx - movd mm3, edx - - mov esi, [ebp + nb430nf_type] - mov eax, [esi + eax*4] - mov ebx, [esi + ebx*4] - mov ecx, [esi + ecx*4] - mov edx, [esi + edx*4] - mov esi, [ebp + nb430nf_vdwparam] - shl eax, 1 - shl ebx, 1 - shl ecx, 1 - shl edx, 1 - mov edi, [esp + nb430nf_ntia] - add eax, edi - add ebx, edi - add ecx, edi - add edx, edi - - movlps xmm6, [esi + eax*4] - movlps xmm7, [esi + ecx*4] - movhps xmm6, [esi + ebx*4] - movhps xmm7, [esi + edx*4] - - movaps xmm4, xmm6 - shufps xmm4, xmm7, 136 ;# constant 10001000 - shufps xmm6, xmm7, 221 ;# constant 11011101 - - movd eax, mm0 - movd ebx, mm1 - movd ecx, mm2 - movd edx, mm3 - - movaps [esp + nb430nf_c6], xmm4 - movaps [esp + nb430nf_c12], xmm6 - - mov esi, [ebp + nb430nf_pos] ;# base of pos[] - - lea eax, [eax + eax*2] ;# replace jnr with j3 - lea ebx, [ebx + ebx*2] - - lea ecx, [ecx + ecx*2] ;# replace jnr with j3 - lea edx, [edx + edx*2] - - ;# move four coordinates to xmm0-xmm2 - - movlps xmm4, [esi + eax*4] - movlps xmm5, [esi + ecx*4] - movss xmm2, [esi + eax*4 + 8] - movss xmm6, [esi + ecx*4 + 8] - - movhps xmm4, [esi + ebx*4] - movhps xmm5, [esi + edx*4] - - movss xmm0, [esi + ebx*4 + 8] - movss xmm1, [esi + edx*4 + 8] - - shufps xmm2, xmm0, 0 - shufps xmm6, xmm1, 0 - - movaps xmm0, xmm4 - movaps xmm1, xmm4 - - shufps xmm2, xmm6, 136 ;# constant 10001000 - - shufps xmm0, xmm5, 136 ;# constant 10001000 - shufps xmm1, xmm5, 221 ;# constant 11011101 - - ;# move ix-iz to xmm4-xmm6 - movaps xmm4, [esp + nb430nf_ix] - movaps xmm5, [esp + nb430nf_iy] - movaps xmm6, [esp + nb430nf_iz] - - ;# calc dr - subps xmm4, xmm0 - subps xmm5, xmm1 - subps xmm6, xmm2 - - ;# square it - mulps xmm4,xmm4 - mulps xmm5,xmm5 - mulps xmm6,xmm6 - addps xmm4, xmm5 - addps xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [esp + nb430nf_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [esp + nb430nf_half] - subps xmm1, xmm5 ;# constant 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - movaps [esp + nb430nf_r], xmm4 - mulps xmm4, [esp + nb430nf_gbscale] - - movhlps xmm5, xmm4 - cvttps2pi mm6, xmm4 - cvttps2pi mm7, xmm5 ;# mm6/mm7 contain lu indices - cvtpi2ps xmm6, mm6 - cvtpi2ps xmm5, mm7 - movlhps xmm6, xmm5 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - pslld mm6, 2 - pslld mm7, 2 - - movd mm0, eax - movd mm1, ebx - movd mm2, ecx - movd mm3, edx - - mov esi, [ebp + nb430nf_GBtab] - movd eax, mm6 - psrlq mm6, 32 - movd ecx, mm7 - psrlq mm7, 32 - movd ebx, mm6 - movd edx, mm7 - - ;# load coulomb table - movaps xmm4, [esi + eax*4] - movaps xmm5, [esi + ebx*4] - movaps xmm6, [esi + ecx*4] - movaps xmm7, [esi + edx*4] - ;# transpose, using xmm3 for scratch - movaps xmm3, xmm6 - shufps xmm3, xmm7, 0xEE - shufps xmm6, xmm7, 0x44 - movaps xmm7, xmm4 - shufps xmm7, xmm5, 0xEE - shufps xmm4, xmm5, 0x44 - movaps xmm5, xmm4 - shufps xmm5, xmm6, 0xDD - shufps xmm4, xmm6, 0x88 - movaps xmm6, xmm7 - shufps xmm6, xmm3, 0x88 - shufps xmm7, xmm3, 0xDD - ;# coulomb table ready, in xmm4-xmm7 - - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - movaps xmm3, [esp + nb430nf_qq] - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, xmm3 ;# vcoul=qq*VV - addps xmm5, [esp + nb430nf_vctot] - movaps [esp + nb430nf_vctot], xmm5 - - - movaps xmm4, [esp + nb430nf_r] - mulps xmm4, [esp + nb430nf_tsc] - - movhlps xmm5, xmm4 - cvttps2pi mm6, xmm4 - cvttps2pi mm7, xmm5 ;# mm6/mm7 contain lu indices - cvtpi2ps xmm6, mm6 - cvtpi2ps xmm5, mm7 - movlhps xmm6, xmm5 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - pslld mm6, 3 - pslld mm7, 3 - - mov esi, [ebp + nb430nf_VFtab] - movd eax, mm6 - psrlq mm6, 32 - movd ecx, mm7 - psrlq mm7, 32 - movd ebx, mm6 - movd edx, mm7 - - ;# dispersion - movaps xmm4, [esi + eax*4] - movaps xmm5, [esi + ebx*4] - movaps xmm6, [esi + ecx*4] - movaps xmm7, [esi + edx*4] - ;# transpose, using xmm3 for scratch - movaps xmm3, xmm6 - shufps xmm3, xmm7, 0xEE - shufps xmm6, xmm7, 0x44 - movaps xmm7, xmm4 - shufps xmm7, xmm5, 0xEE - shufps xmm4, xmm5, 0x44 - movaps xmm5, xmm4 - shufps xmm5, xmm6, 0xDD - shufps xmm4, xmm6, 0x88 - movaps xmm6, xmm7 - shufps xmm6, xmm3, 0x88 - shufps xmm7, xmm3, 0xDD - ;# dispersion table ready, in xmm4-xmm7 - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, [esp + nb430nf_c6] ;# Vvdw6 - addps xmm5, [esp + nb430nf_Vvdwtot] - movaps [esp + nb430nf_Vvdwtot], xmm5 - - ;# repulsion - movaps xmm4, [esi + eax*4 + 16] - movaps xmm5, [esi + ebx*4 + 16] - movaps xmm6, [esi + ecx*4 + 16] - movaps xmm7, [esi + edx*4 + 16] - ;# transpose, using xmm3 for scratch - movaps xmm3, xmm6 - shufps xmm3, xmm7, 0xEE - shufps xmm6, xmm7, 0x44 - movaps xmm7, xmm4 - shufps xmm7, xmm5, 0xEE - shufps xmm4, xmm5, 0x44 - movaps xmm5, xmm4 - shufps xmm5, xmm6, 0xDD - shufps xmm4, xmm6, 0x88 - movaps xmm6, xmm7 - shufps xmm6, xmm3, 0x88 - shufps xmm7, xmm3, 0xDD - ;# table ready, in xmm4-xmm7 - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - - mulps xmm5, [esp + nb430nf_c12] ;# Vvdw12 - addps xmm5, [esp + nb430nf_Vvdwtot] - movaps [esp + nb430nf_Vvdwtot], xmm5 - - ;# should we do one more iteration? - sub dword ptr [esp + nb430nf_innerk], 4 - jl .nb430nf_finish_inner - jmp .nb430nf_unroll_loop -.nb430nf_finish_inner: - ;# check if at least two particles remain - add dword ptr [esp + nb430nf_innerk], 4 - mov edx, [esp + nb430nf_innerk] - and edx, 2 - jnz .nb430nf_dopair - jmp .nb430nf_checksingle -.nb430nf_dopair: - - mov ecx, [esp + nb430nf_innerjjnr] - - mov eax, [ecx] - mov ebx, [ecx + 4] - add dword ptr [esp + nb430nf_innerjjnr], 8 - - xorps xmm2, xmm2 - movaps xmm6, xmm2 - - ;# load isa2 - mov esi, [ebp + nb430nf_invsqrta] - movss xmm2, [esi + eax*4] - movss xmm3, [esi + ebx*4] - unpcklps xmm2, xmm3 ;# isa2 in xmm3(0,1) - mulps xmm2, [esp + nb430nf_isai] - movaps [esp + nb430nf_isaprod], xmm2 - movaps xmm1, xmm2 - mulps xmm1, [esp + nb430nf_gbtsc] - movaps [esp + nb430nf_gbscale], xmm1 - - mov esi, [ebp + nb430nf_charge] ;# base of charge[] - movss xmm3, [esi + eax*4] - movss xmm6, [esi + ebx*4] - unpcklps xmm3, xmm6 ;# constant 00001000 ;# xmm3(0,1) has the charges - - mulps xmm2, [esp + nb430nf_iq] - mulps xmm3, xmm2 - movaps [esp + nb430nf_qq], xmm3 - - mov esi, [ebp + nb430nf_type] - mov ecx, eax - mov edx, ebx - mov ecx, [esi + ecx*4] - mov edx, [esi + edx*4] - mov esi, [ebp + nb430nf_vdwparam] - shl ecx, 1 - shl edx, 1 - mov edi, [esp + nb430nf_ntia] - add ecx, edi - add edx, edi - movlps xmm6, [esi + ecx*4] - movhps xmm6, [esi + edx*4] - mov edi, [ebp + nb430nf_pos] - - movaps xmm4, xmm6 - shufps xmm4, xmm4, 8 ;# constant 00001000 - shufps xmm6, xmm6, 13 ;# constant 00001101 - movlhps xmm4, xmm7 - movlhps xmm6, xmm7 - - movaps [esp + nb430nf_c6], xmm4 - movaps [esp + nb430nf_c12], xmm6 - - lea eax, [eax + eax*2] - lea ebx, [ebx + ebx*2] - ;# move coordinates to xmm0-xmm2 - movlps xmm1, [edi + eax*4] - movss xmm2, [edi + eax*4 + 8] - movhps xmm1, [edi + ebx*4] - movss xmm0, [edi + ebx*4 + 8] - - movlhps xmm3, xmm7 - - shufps xmm2, xmm0, 0 - - movaps xmm0, xmm1 - - shufps xmm2, xmm2, 136 ;# constant 10001000 - - shufps xmm0, xmm0, 136 ;# constant 10001000 - shufps xmm1, xmm1, 221 ;# constant 11011101 - - mov edi, [ebp + nb430nf_faction] - ;# move ix-iz to xmm4-xmm6 - xorps xmm7, xmm7 - - movaps xmm4, [esp + nb430nf_ix] - movaps xmm5, [esp + nb430nf_iy] - movaps xmm6, [esp + nb430nf_iz] - - ;# calc dr - subps xmm4, xmm0 - subps xmm5, xmm1 - subps xmm6, xmm2 - - ;# square it - mulps xmm4,xmm4 - mulps xmm5,xmm5 - mulps xmm6,xmm6 - addps xmm4, xmm5 - addps xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [esp + nb430nf_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [esp + nb430nf_half] - subps xmm1, xmm5 ;# constant 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - movaps [esp + nb430nf_r], xmm4 - mulps xmm4, [esp + nb430nf_gbscale] - - cvttps2pi mm6, xmm4 ;# mm6 contain lu indices - cvtpi2ps xmm6, mm6 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - - pslld mm6, 2 - - mov esi, [ebp + nb430nf_GBtab] - movd ecx, mm6 - psrlq mm6, 32 - movd edx, mm6 - - ;# load coulomb table - movaps xmm4, [esi + ecx*4] - movaps xmm7, [esi + edx*4] - ;# transpose, using xmm3 for scratch - movaps xmm6, xmm4 - unpcklps xmm4, xmm7 ;# Y1 Y2 F1 F2 - unpckhps xmm6, xmm7 ;# G1 G2 H1 H2 - movhlps xmm5, xmm4 ;# F1 F2 - movhlps xmm7, xmm6 ;# H1 H2 - ;# coulomb table ready, in xmm4-xmm7 - - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - movaps xmm3, [esp + nb430nf_qq] - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, xmm3 ;# vcoul=qq*VV - addps xmm5, [esp + nb430nf_vctot] - movaps [esp + nb430nf_vctot], xmm5 - - movaps xmm4, [esp + nb430nf_r] - mulps xmm4, [esp + nb430nf_tsc] - - cvttps2pi mm6, xmm4 - cvtpi2ps xmm6, mm6 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - pslld mm6, 3 - - mov esi, [ebp + nb430nf_VFtab] - movd ecx, mm6 - psrlq mm6, 32 - movd edx, mm6 - - ;# dispersion - movaps xmm4, [esi + ecx*4] - movaps xmm7, [esi + edx*4] - ;# transpose, using xmm3 for scratch - movaps xmm6, xmm4 - unpcklps xmm4, xmm7 ;# Y1 Y2 F1 F2 - unpckhps xmm6, xmm7 ;# G1 G2 H1 H2 - movhlps xmm5, xmm4 ;# F1 F2 - movhlps xmm7, xmm6 ;# H1 H2 - ;# dispersion table ready, in xmm4-xmm7 - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - - mulps xmm5, [esp + nb430nf_c6] ;# Vvdw6 - addps xmm5, [esp + nb430nf_Vvdwtot] - movaps [esp + nb430nf_Vvdwtot], xmm5 - - ;# repulsion - movaps xmm4, [esi + ecx*4 + 16] - movaps xmm7, [esi + edx*4 + 16] - ;# transpose, using xmm3 for scratch - movaps xmm6, xmm4 - unpcklps xmm4, xmm7 ;# Y1 Y2 F1 F2 - unpckhps xmm6, xmm7 ;# G1 G2 H1 H2 - movhlps xmm5, xmm4 ;# F1 F2 - movhlps xmm7, xmm6 ;# H1 H2 - ;# table ready, in xmm4-xmm7 - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - - mulps xmm5, [esp + nb430nf_c12] ;# Vvdw12 - - addps xmm5, [esp + nb430nf_Vvdwtot] - movaps [esp + nb430nf_Vvdwtot], xmm5 -.nb430nf_checksingle: - mov edx, [esp + nb430nf_innerk] - and edx, 1 - jnz .nb430nf_dosingle - jmp .nb430nf_updateouterdata -.nb430nf_dosingle: - mov esi, [ebp + nb430nf_charge] - mov edx, [ebp + nb430nf_invsqrta] - mov edi, [ebp + nb430nf_pos] - mov ecx, [esp + nb430nf_innerjjnr] - mov eax, [ecx] - xorps xmm2, xmm2 - movaps xmm6, xmm2 - movss xmm2, [edx + eax*4] ;# isa2 - mulss xmm2, [esp + nb430nf_isai] - movss [esp + nb430nf_isaprod], xmm2 - movss xmm1, xmm2 - mulss xmm1, [esp + nb430nf_gbtsc] - movss [esp + nb430nf_gbscale], xmm1 - - mulss xmm2, [esp + nb430nf_iq] - movss xmm6, [esi + eax*4] ;# xmm6(0) has the charge - mulss xmm6, xmm2 - movss [esp + nb430nf_qq], xmm6 - - mov esi, [ebp + nb430nf_type] - mov ecx, eax - mov ecx, [esi + ecx*4] - mov esi, [ebp + nb430nf_vdwparam] - shl ecx, 1 - add ecx, [esp + nb430nf_ntia] - movlps xmm6, [esi + ecx*4] - movaps xmm4, xmm6 - shufps xmm4, xmm4, 252 ;# constant 11111100 - shufps xmm6, xmm6, 253 ;# constant 11111101 - - movss [esp + nb430nf_c6], xmm4 - movss [esp + nb430nf_c12], xmm6 - - lea eax, [eax + eax*2] - - ;# move coordinates to xmm0-xmm2 - movss xmm0, [edi + eax*4] - movss xmm1, [edi + eax*4 + 4] - movss xmm2, [edi + eax*4 + 8] - - movss xmm4, [esp + nb430nf_ix] - movss xmm5, [esp + nb430nf_iy] - movss xmm6, [esp + nb430nf_iz] - - ;# calc dr - subss xmm4, xmm0 - subss xmm5, xmm1 - subss xmm6, xmm2 - - ;# square it - mulss xmm4,xmm4 - mulss xmm5,xmm5 - mulss xmm6,xmm6 - addss xmm4, xmm5 - addss xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtss xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulss xmm5, xmm5 - movss xmm1, [esp + nb430nf_three] - mulss xmm5, xmm4 ;# rsq*lu*lu - movss xmm0, [esp + nb430nf_half] - subss xmm1, xmm5 ;# constant 30-rsq*lu*lu - mulss xmm1, xmm2 - mulss xmm0, xmm1 ;# xmm0=rinv - - mulss xmm4, xmm0 ;# xmm4=r - movaps [esp + nb430nf_r], xmm4 - mulss xmm4, [esp + nb430nf_gbscale] - - cvttss2si ebx, xmm4 ;# mm6 contain lu indices - cvtsi2ss xmm6, ebx - subss xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulss xmm2, xmm2 ;# xmm2=eps2 - - shl ebx, 2 - - mov esi, [ebp + nb430nf_GBtab] - - movaps xmm4, [esi + ebx*4] - movhlps xmm6, xmm4 - movaps xmm5, xmm4 - movaps xmm7, xmm6 - shufps xmm5, xmm5, 1 - shufps xmm7, xmm7, 1 - ;# table ready in xmm4-xmm7 - - mulss xmm6, xmm1 ;# xmm6=Geps - mulss xmm7, xmm2 ;# xmm7=Heps2 - addss xmm5, xmm6 - addss xmm5, xmm7 ;# xmm5=Fp - movss xmm3, [esp + nb430nf_qq] - mulss xmm5, xmm1 ;# xmm5=eps*Fp - addss xmm5, xmm4 ;# xmm5=VV - mulss xmm5, xmm3 ;# vcoul=qq*VV - addss xmm5, [esp + nb430nf_vctot] - movss [esp + nb430nf_vctot], xmm5 - - movss xmm4, [esp + nb430nf_r] - mulps xmm4, [esp + nb430nf_tsc] - - cvttss2si ebx, xmm4 - cvtsi2ss xmm6, ebx - subss xmm4, xmm6 - movss xmm1, xmm4 ;# xmm1=eps - movss xmm2, xmm1 - mulss xmm2, xmm2 ;# xmm2=eps2 - - shl ebx, 3 - mov esi, [ebp + nb430nf_VFtab] - - ;# dispersion - movaps xmm4, [esi + ebx*4] - movhlps xmm6, xmm4 - movaps xmm5, xmm4 - movaps xmm7, xmm6 - shufps xmm5, xmm5, 1 - shufps xmm7, xmm7, 1 - ;# table ready in xmm4-xmm7 - - mulss xmm6, xmm1 ;# xmm6=Geps - mulss xmm7, xmm2 ;# xmm7=Heps2 - addss xmm5, xmm6 - addss xmm5, xmm7 ;# xmm5=Fp - mulss xmm5, xmm1 ;# xmm5=eps*Fp - addss xmm5, xmm4 ;# xmm5=VV - mulss xmm5, [esp + nb430nf_c6] ;# Vvdw6 - addss xmm5, [esp + nb430nf_Vvdwtot] - movss [esp + nb430nf_Vvdwtot], xmm5 - - ;# repulsion - movaps xmm4, [esi + ebx*4 + 16] - movhlps xmm6, xmm4 - movaps xmm5, xmm4 - movaps xmm7, xmm6 - shufps xmm5, xmm5, 1 - shufps xmm7, xmm7, 1 - ;# table ready in xmm4-xmm7 - - mulss xmm6, xmm1 ;# xmm6=Geps - mulss xmm7, xmm2 ;# xmm7=Heps2 - addss xmm5, xmm6 - addss xmm5, xmm7 ;# xmm5=Fp - mulss xmm5, xmm1 ;# xmm5=eps*Fp - addss xmm5, xmm4 ;# xmm5=VV - - mulss xmm5, [esp + nb430nf_c12] ;# Vvdw12 - - addss xmm5, [esp + nb430nf_Vvdwtot] - movss [esp + nb430nf_Vvdwtot], xmm5 - -.nb430nf_updateouterdata: - ;# get n from stack - mov esi, [esp + nb430nf_n] - ;# get group index for i particle - mov edx, [ebp + nb430nf_gid] ;# base of gid[] - mov edx, [edx + esi*4] ;# ggid=gid[n] - - ;# accumulate total potential energy and update it - movaps xmm7, [esp + nb430nf_vctot] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - ;# add earlier value from mem - mov eax, [ebp + nb430nf_Vc] - addss xmm7, [eax + edx*4] - ;# move back to mem - movss [eax + edx*4], xmm7 - - ;# accumulate total lj energy and update it - movaps xmm7, [esp + nb430nf_Vvdwtot] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - ;# add earlier value from mem - mov eax, [ebp + nb430nf_Vvdw] - addss xmm7, [eax + edx*4] - ;# move back to mem - movss [eax + edx*4], xmm7 - - ;# finish if last - mov ecx, [esp + nb430nf_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb430nf_outerend - - ;# not last, iterate outer loop once more! - mov [esp + nb430nf_n], esi - jmp .nb430nf_outer -.nb430nf_outerend: - ;# check if more outer neighborlists remain - mov ecx, [esp + nb430nf_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb430nf_end - ;# non-zero, do one more workunit - jmp .nb430nf_threadloop -.nb430nf_end: - emms - - mov eax, [esp + nb430nf_nouter] - mov ebx, [esp + nb430nf_ninner] - mov ecx, [ebp + nb430nf_outeriter] - mov edx, [ebp + nb430nf_inneriter] - mov [ecx], eax - mov [edx], ebx - - mov eax, [esp + nb430nf_salign] - add esp, eax - add esp, 324 - pop edi - pop esi - pop edx - pop ecx - pop ebx - pop eax - leave - ret - - - - diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel430_ia32_sse.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel430_ia32_sse.s deleted file mode 100644 index 477f512d8c..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel430_ia32_sse.s +++ /dev/null @@ -1,2382 +0,0 @@ -## -## -## Gromacs 4.0 Copyright (c) 1991-2003 -## David van der Spoel, Erik Lindahl -## -## This program is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License -## as published by the Free Software Foundation; either version 2 -## of the License, or (at your option) any later version. -## -## To help us fund GROMACS development, we humbly ask that you cite -## the research papers on the package. Check out http://www.gromacs.org -## -## And Hey: -## Gnomes, ROck Monsters And Chili Sauce -## - - - - -.globl nb_kernel430_ia32_sse -.globl _nb_kernel430_ia32_sse -nb_kernel430_ia32_sse: -_nb_kernel430_ia32_sse: -.set nb430_p_nri, 8 -.set nb430_iinr, 12 -.set nb430_jindex, 16 -.set nb430_jjnr, 20 -.set nb430_shift, 24 -.set nb430_shiftvec, 28 -.set nb430_fshift, 32 -.set nb430_gid, 36 -.set nb430_pos, 40 -.set nb430_faction, 44 -.set nb430_charge, 48 -.set nb430_p_facel, 52 -.set nb430_argkrf, 56 -.set nb430_argcrf, 60 -.set nb430_Vc, 64 -.set nb430_type, 68 -.set nb430_p_ntype, 72 -.set nb430_vdwparam, 76 -.set nb430_Vvdw, 80 -.set nb430_p_tabscale, 84 -.set nb430_VFtab, 88 -.set nb430_invsqrta, 92 -.set nb430_dvda, 96 -.set nb430_p_gbtabscale, 100 -.set nb430_GBtab, 104 -.set nb430_p_nthreads, 108 -.set nb430_count, 112 -.set nb430_mtx, 116 -.set nb430_outeriter, 120 -.set nb430_inneriter, 124 -.set nb430_work, 128 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse use -.set nb430_ix, 0 -.set nb430_iy, 16 -.set nb430_iz, 32 -.set nb430_iq, 48 -.set nb430_dx, 64 -.set nb430_dy, 80 -.set nb430_dz, 96 -.set nb430_two, 112 -.set nb430_gbtsc, 128 -.set nb430_tsc, 144 -.set nb430_qq, 160 -.set nb430_c6, 176 -.set nb430_c12, 192 -.set nb430_fscal, 208 -.set nb430_vctot, 224 -.set nb430_Vvdwtot, 240 -.set nb430_fix, 256 -.set nb430_fiy, 272 -.set nb430_fiz, 288 -.set nb430_half, 304 -.set nb430_three, 320 -.set nb430_r, 336 -.set nb430_isai, 352 -.set nb430_isaprod, 368 -.set nb430_dvdasum, 384 -.set nb430_gbscale, 400 -.set nb430_ii, 416 -.set nb430_is3, 420 -.set nb430_ii3, 424 -.set nb430_ntia, 428 -.set nb430_innerjjnr, 432 -.set nb430_innerk, 436 -.set nb430_n, 440 -.set nb430_nn1, 444 -.set nb430_jnra, 448 -.set nb430_jnrb, 452 -.set nb430_jnrc, 456 -.set nb430_jnrd, 460 -.set nb430_nri, 464 -.set nb430_facel, 468 -.set nb430_ntype, 472 -.set nb430_nouter, 476 -.set nb430_ninner, 480 -.set nb430_salign, 484 - pushl %ebp - movl %esp,%ebp - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - pushl %esi - pushl %edi - subl $488,%esp ## local stack space - movl %esp,%eax - andl $0xf,%eax - subl %eax,%esp - movl %eax,nb430_salign(%esp) - - emms - - ## Move args passed by reference to stack - movl nb430_p_nri(%ebp),%ecx - movl nb430_p_facel(%ebp),%esi - movl nb430_p_ntype(%ebp),%edi - movl (%ecx),%ecx - movl (%esi),%esi - movl (%edi),%edi - movl %ecx,nb430_nri(%esp) - movl %esi,nb430_facel(%esp) - movl %edi,nb430_ntype(%esp) - - ## zero iteration counters - movl $0,%eax - movl %eax,nb430_nouter(%esp) - movl %eax,nb430_ninner(%esp) - - - movl nb430_p_gbtabscale(%ebp),%eax - movss (%eax),%xmm3 - movl nb430_p_tabscale(%ebp),%eax - movss (%eax),%xmm4 - shufps $0,%xmm3,%xmm3 - shufps $0,%xmm4,%xmm4 - movaps %xmm3,nb430_gbtsc(%esp) - movaps %xmm4,nb430_tsc(%esp) - - ## create constant floating-point factors on stack - movl $0x3f000000,%eax ## constant 0.5 in IEEE (hex) - movl %eax,nb430_half(%esp) - movss nb430_half(%esp),%xmm1 - shufps $0,%xmm1,%xmm1 ## splat to all elements - movaps %xmm1,%xmm2 - addps %xmm2,%xmm2 ## constant 1.0 - movaps %xmm2,%xmm3 - addps %xmm2,%xmm2 ## constant 2.0 - addps %xmm2,%xmm3 ## constant 3.0 - movaps %xmm1,nb430_half(%esp) - movaps %xmm2,nb430_two(%esp) - movaps %xmm3,nb430_three(%esp) - -_nb_kernel430_ia32_sse.nb430_threadloop: - movl nb430_count(%ebp),%esi ## pointer to sync counter - movl (%esi),%eax -_nb_kernel430_ia32_sse.nb430_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%esi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel430_ia32_sse.nb430_spinlock - - ## if(nn1>nri) nn1=nri - movl nb430_nri(%esp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb430_n(%esp) - movl %ebx,nb430_nn1(%esp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel430_ia32_sse.nb430_outerstart - jmp _nb_kernel430_ia32_sse.nb430_end - -_nb_kernel430_ia32_sse.nb430_outerstart: - ## ebx contains number of outer iterations - addl nb430_nouter(%esp),%ebx - movl %ebx,nb430_nouter(%esp) - -_nb_kernel430_ia32_sse.nb430_outer: - movl nb430_shift(%ebp),%eax ## eax = pointer into shift[] - movl (%eax,%esi,4),%ebx ## ebx=shift[n] - - leal (%ebx,%ebx,2),%ebx ## ebx=3*is - movl %ebx,nb430_is3(%esp) ## store is3 - - movl nb430_shiftvec(%ebp),%eax ## eax = base of shiftvec[] - - movss (%eax,%ebx,4),%xmm0 - movss 4(%eax,%ebx,4),%xmm1 - movss 8(%eax,%ebx,4),%xmm2 - - movl nb430_iinr(%ebp),%ecx ## ecx = pointer into iinr[] - movl (%ecx,%esi,4),%ebx ## ebx =ii - movl %ebx,nb430_ii(%esp) - - movl nb430_charge(%ebp),%edx - movss (%edx,%ebx,4),%xmm3 - mulss nb430_facel(%esp),%xmm3 - shufps $0,%xmm3,%xmm3 - - movl nb430_invsqrta(%ebp),%edx ## load invsqrta[ii] - movss (%edx,%ebx,4),%xmm4 - shufps $0,%xmm4,%xmm4 - - movl nb430_type(%ebp),%edx - movl (%edx,%ebx,4),%edx - imull nb430_ntype(%esp),%edx - shll %edx - movl %edx,nb430_ntia(%esp) - - leal (%ebx,%ebx,2),%ebx ## ebx = 3*ii=ii3 - movl nb430_pos(%ebp),%eax ## eax = base of pos[] - - addss (%eax,%ebx,4),%xmm0 - addss 4(%eax,%ebx,4),%xmm1 - addss 8(%eax,%ebx,4),%xmm2 - - movaps %xmm3,nb430_iq(%esp) - movaps %xmm4,nb430_isai(%esp) - - shufps $0,%xmm0,%xmm0 - shufps $0,%xmm1,%xmm1 - shufps $0,%xmm2,%xmm2 - - movaps %xmm0,nb430_ix(%esp) - movaps %xmm1,nb430_iy(%esp) - movaps %xmm2,nb430_iz(%esp) - - movl %ebx,nb430_ii3(%esp) - - ## clear vctot and i forces - xorps %xmm4,%xmm4 - movaps %xmm4,nb430_vctot(%esp) - movaps %xmm4,nb430_Vvdwtot(%esp) - movaps %xmm4,nb430_dvdasum(%esp) - movaps %xmm4,nb430_fix(%esp) - movaps %xmm4,nb430_fiy(%esp) - movaps %xmm4,nb430_fiz(%esp) - - movl nb430_jindex(%ebp),%eax - movl (%eax,%esi,4),%ecx ## jindex[n] - movl 4(%eax,%esi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movl nb430_pos(%ebp),%esi - movl nb430_faction(%ebp),%edi - movl nb430_jjnr(%ebp),%eax - shll $2,%ecx - addl %ecx,%eax - movl %eax,nb430_innerjjnr(%esp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $4,%edx - addl nb430_ninner(%esp),%ecx - movl %ecx,nb430_ninner(%esp) - addl $0,%edx - movl %edx,nb430_innerk(%esp) ## number of innerloop atoms - - jge _nb_kernel430_ia32_sse.nb430_unroll_loop - jmp _nb_kernel430_ia32_sse.nb430_finish_inner -_nb_kernel430_ia32_sse.nb430_unroll_loop: - ## quad-unroll innerloop here - movl nb430_innerjjnr(%esp),%edx ## pointer to jjnr[k] - movl (%edx),%eax - movl 4(%edx),%ebx - movl 8(%edx),%ecx - movl 12(%edx),%edx ## eax-edx=jnr1-4 - addl $16,nb430_innerjjnr(%esp) ## advance pointer (unrolled 4) - - ## load isaj - movl nb430_invsqrta(%ebp),%esi - movss (%esi,%eax,4),%xmm3 - movss (%esi,%ecx,4),%xmm4 - movss (%esi,%ebx,4),%xmm6 - movss (%esi,%edx,4),%xmm7 - movaps nb430_isai(%esp),%xmm2 - shufps $0,%xmm6,%xmm3 - shufps $0,%xmm7,%xmm4 - shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all isaj in xmm3 - mulps %xmm3,%xmm2 - - movaps %xmm2,nb430_isaprod(%esp) - movaps %xmm2,%xmm1 - mulps nb430_gbtsc(%esp),%xmm1 - movaps %xmm1,nb430_gbscale(%esp) - - movl nb430_charge(%ebp),%esi ## base of charge[] - - movss (%esi,%eax,4),%xmm3 - movss (%esi,%ecx,4),%xmm4 - movss (%esi,%ebx,4),%xmm6 - movss (%esi,%edx,4),%xmm7 - - mulps nb430_iq(%esp),%xmm2 - shufps $0,%xmm6,%xmm3 - shufps $0,%xmm7,%xmm4 - shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all charges in xmm3 - mulps %xmm2,%xmm3 - movaps %xmm3,nb430_qq(%esp) - - movd %eax,%mm0 ## use mmx registers as temp storage - movd %ebx,%mm1 - movd %ecx,%mm2 - movd %edx,%mm3 - - movl nb430_type(%ebp),%esi - movl (%esi,%eax,4),%eax - movl (%esi,%ebx,4),%ebx - movl (%esi,%ecx,4),%ecx - movl (%esi,%edx,4),%edx - movl nb430_vdwparam(%ebp),%esi - shll %eax - shll %ebx - shll %ecx - shll %edx - movl nb430_ntia(%esp),%edi - addl %edi,%eax - addl %edi,%ebx - addl %edi,%ecx - addl %edi,%edx - - movlps (%esi,%eax,4),%xmm6 - movlps (%esi,%ecx,4),%xmm7 - movhps (%esi,%ebx,4),%xmm6 - movhps (%esi,%edx,4),%xmm7 - - movaps %xmm6,%xmm4 - shufps $136,%xmm7,%xmm4 ## constant 10001000 - shufps $221,%xmm7,%xmm6 ## constant 11011101 - - movd %mm0,%eax - movd %mm1,%ebx - movd %mm2,%ecx - movd %mm3,%edx - - movaps %xmm4,nb430_c6(%esp) - movaps %xmm6,nb430_c12(%esp) - - movl nb430_pos(%ebp),%esi ## base of pos[] - - movl %eax,nb430_jnra(%esp) - movl %ebx,nb430_jnrb(%esp) - movl %ecx,nb430_jnrc(%esp) - movl %edx,nb430_jnrd(%esp) - - leal (%eax,%eax,2),%eax ## replace jnr with j3 - leal (%ebx,%ebx,2),%ebx - leal (%ecx,%ecx,2),%ecx - leal (%edx,%edx,2),%edx - - ## move four coordinates to xmm0-xmm2 - - movlps (%esi,%eax,4),%xmm4 - movlps (%esi,%ecx,4),%xmm5 - movss 8(%esi,%eax,4),%xmm2 - movss 8(%esi,%ecx,4),%xmm6 - - movhps (%esi,%ebx,4),%xmm4 - movhps (%esi,%edx,4),%xmm5 - - movss 8(%esi,%ebx,4),%xmm0 - movss 8(%esi,%edx,4),%xmm1 - - shufps $0,%xmm0,%xmm2 - shufps $0,%xmm1,%xmm6 - - movaps %xmm4,%xmm0 - movaps %xmm4,%xmm1 - - shufps $136,%xmm6,%xmm2 ## constant 10001000 - - shufps $136,%xmm5,%xmm0 ## constant 10001000 - shufps $221,%xmm5,%xmm1 ## constant 11011101 - - ## move ix-iz to xmm4-xmm6 - movaps nb430_ix(%esp),%xmm4 - movaps nb430_iy(%esp),%xmm5 - movaps nb430_iz(%esp),%xmm6 - - ## calc dr - subps %xmm0,%xmm4 - subps %xmm1,%xmm5 - subps %xmm2,%xmm6 - - ## store dr - movaps %xmm4,nb430_dx(%esp) - movaps %xmm5,nb430_dy(%esp) - movaps %xmm6,nb430_dz(%esp) - ## square it - mulps %xmm4,%xmm4 - mulps %xmm5,%xmm5 - mulps %xmm6,%xmm6 - addps %xmm5,%xmm4 - addps %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb430_three(%esp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb430_half(%esp),%xmm0 - subps %xmm5,%xmm1 ## constant 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - movaps %xmm4,nb430_r(%esp) - mulps nb430_gbscale(%esp),%xmm4 - - movhlps %xmm4,%xmm5 - cvttps2pi %xmm4,%mm6 - cvttps2pi %xmm5,%mm7 ## mm6/mm7 contain lu indices - cvtpi2ps %mm6,%xmm6 - cvtpi2ps %mm7,%xmm5 - movlhps %xmm5,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - pslld $2,%mm6 - pslld $2,%mm7 - - movd %eax,%mm0 - movd %ebx,%mm1 - movd %ecx,%mm2 - movd %edx,%mm3 - - movl nb430_GBtab(%ebp),%esi - movd %mm6,%eax - psrlq $32,%mm6 - movd %mm7,%ecx - psrlq $32,%mm7 - movd %mm6,%ebx - movd %mm7,%edx - - ## load coulomb table - movaps (%esi,%eax,4),%xmm4 - movaps (%esi,%ebx,4),%xmm5 - movaps (%esi,%ecx,4),%xmm6 - movaps (%esi,%edx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm6,%xmm3 - shufps $0xEE,%xmm7,%xmm3 - shufps $0x44,%xmm7,%xmm6 - movaps %xmm4,%xmm7 - shufps $0xEE,%xmm5,%xmm7 - shufps $0x44,%xmm5,%xmm4 - movaps %xmm4,%xmm5 - shufps $0xDD,%xmm6,%xmm5 - shufps $0x88,%xmm6,%xmm4 - movaps %xmm7,%xmm6 - shufps $0x88,%xmm3,%xmm6 - shufps $0xDD,%xmm3,%xmm7 - ## coulomb table ready, in xmm4-xmm7 - - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - mulps nb430_two(%esp),%xmm7 ## two*Heps2 - movaps nb430_qq(%esp),%xmm3 - addps %xmm6,%xmm7 - addps %xmm5,%xmm7 ## xmm7=FF - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps %xmm3,%xmm5 ## vcoul=qq*VV - mulps %xmm7,%xmm3 ## fijC=FF*qq - - ## get jnr from stack - movl nb430_jnra(%esp),%eax - movl nb430_jnrb(%esp),%ebx - movl nb430_jnrc(%esp),%ecx - movl nb430_jnrd(%esp),%edx - - movl nb430_dvda(%ebp),%esi - - ## Calculate dVda - xorps %xmm7,%xmm7 - mulps nb430_gbscale(%esp),%xmm3 - movaps %xmm3,%xmm6 - mulps nb430_r(%esp),%xmm6 - addps %xmm5,%xmm6 - addps nb430_vctot(%esp),%xmm5 - movaps %xmm5,nb430_vctot(%esp) - - ## xmm6=(vcoul+fijC*r) - subps %xmm6,%xmm7 - movaps %xmm7,%xmm6 - - ## update dvdasum - addps nb430_dvdasum(%esp),%xmm7 - movaps %xmm7,nb430_dvdasum(%esp) - - ## update j atoms dvdaj - movhlps %xmm6,%xmm7 - movaps %xmm6,%xmm5 - movaps %xmm7,%xmm4 - shufps $0x1,%xmm5,%xmm5 - shufps $0x1,%xmm4,%xmm4 - ## xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4 - addss (%esi,%eax,4),%xmm6 - addss (%esi,%ebx,4),%xmm5 - addss (%esi,%ecx,4),%xmm7 - addss (%esi,%edx,4),%xmm4 - movss %xmm6,(%esi,%eax,4) - movss %xmm5,(%esi,%ebx,4) - movss %xmm7,(%esi,%ecx,4) - movss %xmm4,(%esi,%edx,4) - - ## put scalar force on stack temporarily - movaps %xmm3,nb430_fscal(%esp) - - movaps nb430_r(%esp),%xmm4 - mulps nb430_tsc(%esp),%xmm4 - - movhlps %xmm4,%xmm5 - cvttps2pi %xmm4,%mm6 - cvttps2pi %xmm5,%mm7 ## mm6/mm7 contain lu indices - cvtpi2ps %mm6,%xmm6 - cvtpi2ps %mm7,%xmm5 - movlhps %xmm5,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - pslld $3,%mm6 - pslld $3,%mm7 - - movl nb430_VFtab(%ebp),%esi - movd %mm6,%eax - psrlq $32,%mm6 - movd %mm7,%ecx - psrlq $32,%mm7 - movd %mm6,%ebx - movd %mm7,%edx - - ## dispersion - movaps (%esi,%eax,4),%xmm4 - movaps (%esi,%ebx,4),%xmm5 - movaps (%esi,%ecx,4),%xmm6 - movaps (%esi,%edx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm6,%xmm3 - shufps $0xEE,%xmm7,%xmm3 - shufps $0x44,%xmm7,%xmm6 - movaps %xmm4,%xmm7 - shufps $0xEE,%xmm5,%xmm7 - shufps $0x44,%xmm5,%xmm4 - movaps %xmm4,%xmm5 - shufps $0xDD,%xmm6,%xmm5 - shufps $0x88,%xmm6,%xmm4 - movaps %xmm7,%xmm6 - shufps $0x88,%xmm3,%xmm6 - shufps $0xDD,%xmm3,%xmm7 - ## dispersion table ready, in xmm4-xmm7 - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - mulps nb430_two(%esp),%xmm7 ## two*Heps2 - addps %xmm6,%xmm7 - addps %xmm5,%xmm7 ## xmm7=FF - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - - movaps nb430_c6(%esp),%xmm4 - mulps %xmm4,%xmm7 ## fijD - mulps %xmm4,%xmm5 ## Vvdw6 - mulps nb430_tsc(%esp),%xmm7 - addps nb430_fscal(%esp),%xmm7 ## add to fscal - - ## put scalar force on stack Update Vvdwtot directly - addps nb430_Vvdwtot(%esp),%xmm5 - movaps %xmm7,nb430_fscal(%esp) - movaps %xmm5,nb430_Vvdwtot(%esp) - - ## repulsion - movaps 16(%esi,%eax,4),%xmm4 - movaps 16(%esi,%ebx,4),%xmm5 - movaps 16(%esi,%ecx,4),%xmm6 - movaps 16(%esi,%edx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm6,%xmm3 - shufps $0xEE,%xmm7,%xmm3 - shufps $0x44,%xmm7,%xmm6 - movaps %xmm4,%xmm7 - shufps $0xEE,%xmm5,%xmm7 - shufps $0x44,%xmm5,%xmm4 - movaps %xmm4,%xmm5 - shufps $0xDD,%xmm6,%xmm5 - shufps $0x88,%xmm6,%xmm4 - movaps %xmm7,%xmm6 - shufps $0x88,%xmm3,%xmm6 - shufps $0xDD,%xmm3,%xmm7 - ## table ready, in xmm4-xmm7 - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - mulps nb430_two(%esp),%xmm7 ## two*Heps2 - addps %xmm6,%xmm7 - addps %xmm5,%xmm7 ## xmm7=FF - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - - movaps nb430_c12(%esp),%xmm4 - mulps %xmm4,%xmm7 ## fijR - mulps %xmm4,%xmm5 ## Vvdw12 - mulps nb430_tsc(%esp),%xmm7 - addps nb430_fscal(%esp),%xmm7 - - addps nb430_Vvdwtot(%esp),%xmm5 - movaps %xmm5,nb430_Vvdwtot(%esp) - xorps %xmm4,%xmm4 - - mulps %xmm0,%xmm7 - subps %xmm7,%xmm4 - - movaps nb430_dx(%esp),%xmm0 - movaps nb430_dy(%esp),%xmm1 - movaps nb430_dz(%esp),%xmm2 - - movd %mm0,%eax - movd %mm1,%ebx - movd %mm2,%ecx - movd %mm3,%edx - - movl nb430_faction(%ebp),%edi - mulps %xmm4,%xmm0 - mulps %xmm4,%xmm1 - mulps %xmm4,%xmm2 - ## xmm0-xmm2 contains tx-tz (partial force) - ## now update f_i - movaps nb430_fix(%esp),%xmm3 - movaps nb430_fiy(%esp),%xmm4 - movaps nb430_fiz(%esp),%xmm5 - addps %xmm0,%xmm3 - addps %xmm1,%xmm4 - addps %xmm2,%xmm5 - movaps %xmm3,nb430_fix(%esp) - movaps %xmm4,nb430_fiy(%esp) - movaps %xmm5,nb430_fiz(%esp) - ## the fj's - start by accumulating x & y forces from memory - movlps (%edi,%eax,4),%xmm4 - movlps (%edi,%ecx,4),%xmm6 - movhps (%edi,%ebx,4),%xmm4 - movhps (%edi,%edx,4),%xmm6 - - movaps %xmm4,%xmm3 - shufps $136,%xmm6,%xmm3 ## constant 10001000 - shufps $221,%xmm6,%xmm4 ## constant 11011101 - - ## now xmm3-xmm5 contains fjx, fjy, fjz - subps %xmm0,%xmm3 - subps %xmm1,%xmm4 - - ## unpack them back so we can store them - first x & y in xmm3/xmm4 - - movaps %xmm3,%xmm6 - unpcklps %xmm4,%xmm6 - unpckhps %xmm4,%xmm3 - ## xmm6(l)=x & y for j1, (h) for j2 - ## xmm3(l)=x & y for j3, (h) for j4 - movlps %xmm6,(%edi,%eax,4) - movlps %xmm3,(%edi,%ecx,4) - - movhps %xmm6,(%edi,%ebx,4) - movhps %xmm3,(%edi,%edx,4) - - ## and the z forces - movss 8(%edi,%eax,4),%xmm4 - movss 8(%edi,%ebx,4),%xmm5 - movss 8(%edi,%ecx,4),%xmm6 - movss 8(%edi,%edx,4),%xmm7 - subss %xmm2,%xmm4 - shufps $229,%xmm2,%xmm2 ## constant 11100101 - subss %xmm2,%xmm5 - shufps $234,%xmm2,%xmm2 ## constant 11101010 - subss %xmm2,%xmm6 - shufps $255,%xmm2,%xmm2 ## constant 11111111 - subss %xmm2,%xmm7 - movss %xmm4,8(%edi,%eax,4) - movss %xmm5,8(%edi,%ebx,4) - movss %xmm6,8(%edi,%ecx,4) - movss %xmm7,8(%edi,%edx,4) - - ## should we do one more iteration? - subl $4,nb430_innerk(%esp) - jl _nb_kernel430_ia32_sse.nb430_finish_inner - jmp _nb_kernel430_ia32_sse.nb430_unroll_loop -_nb_kernel430_ia32_sse.nb430_finish_inner: - ## check if at least two particles remain - addl $4,nb430_innerk(%esp) - movl nb430_innerk(%esp),%edx - andl $2,%edx - jnz _nb_kernel430_ia32_sse.nb430_dopair - jmp _nb_kernel430_ia32_sse.nb430_checksingle -_nb_kernel430_ia32_sse.nb430_dopair: - - movl nb430_innerjjnr(%esp),%ecx - - movl (%ecx),%eax - movl 4(%ecx),%ebx - addl $8,nb430_innerjjnr(%esp) - - xorps %xmm2,%xmm2 - movaps %xmm2,%xmm6 - - ## load isaj - movl nb430_invsqrta(%ebp),%esi - movss (%esi,%eax,4),%xmm2 - movss (%esi,%ebx,4),%xmm3 - unpcklps %xmm3,%xmm2 ## isaj in xmm3(0,1) - mulps nb430_isai(%esp),%xmm2 - movaps %xmm2,nb430_isaprod(%esp) - movaps %xmm2,%xmm1 - mulps nb430_gbtsc(%esp),%xmm1 - movaps %xmm1,nb430_gbscale(%esp) - - movl nb430_charge(%ebp),%esi ## base of charge[] - movss (%esi,%eax,4),%xmm3 - movss (%esi,%ebx,4),%xmm6 - unpcklps %xmm6,%xmm3 ## constant 00001000 ;# xmm3(0,1) has the charges - - mulps nb430_iq(%esp),%xmm2 - mulps %xmm2,%xmm3 - movaps %xmm3,nb430_qq(%esp) - - movl nb430_type(%ebp),%esi - movl %eax,%ecx - movl %ebx,%edx - movl (%esi,%ecx,4),%ecx - movl (%esi,%edx,4),%edx - movl nb430_vdwparam(%ebp),%esi - shll %ecx - shll %edx - movl nb430_ntia(%esp),%edi - addl %edi,%ecx - addl %edi,%edx - movlps (%esi,%ecx,4),%xmm6 - movhps (%esi,%edx,4),%xmm6 - movl nb430_pos(%ebp),%edi - - movaps %xmm6,%xmm4 - shufps $8,%xmm4,%xmm4 ## constant 00001000 - shufps $13,%xmm6,%xmm6 ## constant 00001101 - movlhps %xmm7,%xmm4 - movlhps %xmm7,%xmm6 - - movaps %xmm4,nb430_c6(%esp) - movaps %xmm6,nb430_c12(%esp) - - movd %eax,%mm0 ## copy jnr to mm0/mm1 - movd %ebx,%mm1 - - leal (%eax,%eax,2),%eax - leal (%ebx,%ebx,2),%ebx - ## move coordinates to xmm0-xmm2 - movlps (%edi,%eax,4),%xmm1 - movss 8(%edi,%eax,4),%xmm2 - movhps (%edi,%ebx,4),%xmm1 - movss 8(%edi,%ebx,4),%xmm0 - - movlhps %xmm7,%xmm3 - - shufps $0,%xmm0,%xmm2 - - movaps %xmm1,%xmm0 - - shufps $136,%xmm2,%xmm2 ## constant 10001000 - - shufps $136,%xmm0,%xmm0 ## constant 10001000 - shufps $221,%xmm1,%xmm1 ## constant 11011101 - - movl nb430_faction(%ebp),%edi - ## move ix-iz to xmm4-xmm6 - xorps %xmm7,%xmm7 - - movaps nb430_ix(%esp),%xmm4 - movaps nb430_iy(%esp),%xmm5 - movaps nb430_iz(%esp),%xmm6 - - ## calc dr - subps %xmm0,%xmm4 - subps %xmm1,%xmm5 - subps %xmm2,%xmm6 - - ## store dr - movaps %xmm4,nb430_dx(%esp) - movaps %xmm5,nb430_dy(%esp) - movaps %xmm6,nb430_dz(%esp) - ## square it - mulps %xmm4,%xmm4 - mulps %xmm5,%xmm5 - mulps %xmm6,%xmm6 - addps %xmm5,%xmm4 - addps %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb430_three(%esp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb430_half(%esp),%xmm0 - subps %xmm5,%xmm1 ## constant 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - movaps %xmm4,nb430_r(%esp) - mulps nb430_gbscale(%esp),%xmm4 - - cvttps2pi %xmm4,%mm6 ## mm6 contain lu indices - cvtpi2ps %mm6,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - - pslld $2,%mm6 - - movl nb430_GBtab(%ebp),%esi - movd %mm6,%ecx - psrlq $32,%mm6 - movd %mm6,%edx - - ## load coulomb table - movaps (%esi,%ecx,4),%xmm4 - movaps (%esi,%edx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm4,%xmm6 - unpcklps %xmm7,%xmm4 ## Y1 Y2 F1 F2 - unpckhps %xmm7,%xmm6 ## G1 G2 H1 H2 - movhlps %xmm4,%xmm5 ## F1 F2 - movhlps %xmm6,%xmm7 ## H1 H2 - ## coulomb table ready, in xmm4-xmm7 - - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - mulps nb430_two(%esp),%xmm7 ## two*Heps2 - movaps nb430_qq(%esp),%xmm3 - addps %xmm6,%xmm7 - addps %xmm5,%xmm7 ## xmm7=FF - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps %xmm3,%xmm5 ## vcoul=qq*VV - mulps %xmm7,%xmm3 ## fijC=FF*qq - - ## get jnr from mm0/mm1 - movd %mm0,%ecx - movd %mm1,%edx - - movl nb430_dvda(%ebp),%esi - - ## Calculate dVda - xorps %xmm7,%xmm7 - mulps nb430_gbscale(%esp),%xmm3 - movaps %xmm3,%xmm6 - mulps nb430_r(%esp),%xmm6 - addps %xmm5,%xmm6 - addps nb430_vctot(%esp),%xmm5 - movaps %xmm5,nb430_vctot(%esp) - - ## xmm6=(vcoul+fijC*r) - subps %xmm6,%xmm7 - movaps %xmm7,%xmm6 - - ## update dvdasum - addps nb430_dvdasum(%esp),%xmm7 - movaps %xmm7,nb430_dvdasum(%esp) - - ## update j atoms dvdaj - movaps %xmm6,%xmm7 - shufps $0x1,%xmm7,%xmm7 - addss (%esi,%ecx,4),%xmm6 - addss (%esi,%edx,4),%xmm7 - movss %xmm6,(%esi,%ecx,4) - movss %xmm7,(%esi,%edx,4) - - ## put scalar force on stack temporarily - movaps %xmm3,nb430_fscal(%esp) - - movaps nb430_r(%esp),%xmm4 - mulps nb430_tsc(%esp),%xmm4 - - cvttps2pi %xmm4,%mm6 - cvtpi2ps %mm6,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - pslld $3,%mm6 - - movl nb430_VFtab(%ebp),%esi - movd %mm6,%ecx - psrlq $32,%mm6 - movd %mm6,%edx - - ## dispersion - movaps (%esi,%ecx,4),%xmm4 - movaps (%esi,%edx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm4,%xmm6 - unpcklps %xmm7,%xmm4 ## Y1 Y2 F1 F2 - unpckhps %xmm7,%xmm6 ## G1 G2 H1 H2 - movhlps %xmm4,%xmm5 ## F1 F2 - movhlps %xmm6,%xmm7 ## H1 H2 - ## dispersion table ready, in xmm4-xmm7 - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - mulps nb430_two(%esp),%xmm7 ## two*Heps2 - addps %xmm6,%xmm7 - addps %xmm5,%xmm7 ## xmm7=FF - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - - movaps nb430_c6(%esp),%xmm4 - mulps %xmm4,%xmm7 ## fijD - mulps %xmm4,%xmm5 ## Vvdw6 - mulps nb430_tsc(%esp),%xmm7 - addps nb430_fscal(%esp),%xmm7 ## add to fscal - - ## put scalar force on stack Update Vvdwtot directly - addps nb430_Vvdwtot(%esp),%xmm5 - movaps %xmm7,nb430_fscal(%esp) - movaps %xmm5,nb430_Vvdwtot(%esp) - - ## repulsion - movaps 16(%esi,%ecx,4),%xmm4 - movaps 16(%esi,%edx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm4,%xmm6 - unpcklps %xmm7,%xmm4 ## Y1 Y2 F1 F2 - unpckhps %xmm7,%xmm6 ## G1 G2 H1 H2 - movhlps %xmm4,%xmm5 ## F1 F2 - movhlps %xmm6,%xmm7 ## H1 H2 - ## table ready, in xmm4-xmm7 - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - mulps nb430_two(%esp),%xmm7 ## two*Heps2 - addps %xmm6,%xmm7 - addps %xmm5,%xmm7 ## xmm7=FF - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - - movaps nb430_c12(%esp),%xmm4 - mulps %xmm4,%xmm7 ## fijR - mulps %xmm4,%xmm5 ## Vvdw12 - mulps nb430_tsc(%esp),%xmm7 - addps nb430_fscal(%esp),%xmm7 - - addps nb430_Vvdwtot(%esp),%xmm5 - movaps %xmm5,nb430_Vvdwtot(%esp) - xorps %xmm4,%xmm4 - - mulps %xmm0,%xmm7 - subps %xmm7,%xmm4 - - movaps nb430_dx(%esp),%xmm0 - movaps nb430_dy(%esp),%xmm1 - movaps nb430_dz(%esp),%xmm2 - - mulps %xmm4,%xmm0 - mulps %xmm4,%xmm1 - mulps %xmm4,%xmm2 - ## xmm0-xmm2 contains tx-tz (partial force) - ## now update f_i - movaps nb430_fix(%esp),%xmm3 - movaps nb430_fiy(%esp),%xmm4 - movaps nb430_fiz(%esp),%xmm5 - addps %xmm0,%xmm3 - addps %xmm1,%xmm4 - addps %xmm2,%xmm5 - movaps %xmm3,nb430_fix(%esp) - movaps %xmm4,nb430_fiy(%esp) - movaps %xmm5,nb430_fiz(%esp) - ## update the fj's - movss (%edi,%eax,4),%xmm3 - movss 4(%edi,%eax,4),%xmm4 - movss 8(%edi,%eax,4),%xmm5 - subss %xmm0,%xmm3 - subss %xmm1,%xmm4 - subss %xmm2,%xmm5 - movss %xmm3,(%edi,%eax,4) - movss %xmm4,4(%edi,%eax,4) - movss %xmm5,8(%edi,%eax,4) - - shufps $225,%xmm0,%xmm0 ## constant 11100001 - shufps $225,%xmm1,%xmm1 ## constant 11100001 - shufps $225,%xmm2,%xmm2 ## constant 11100001 - - movss (%edi,%ebx,4),%xmm3 - movss 4(%edi,%ebx,4),%xmm4 - movss 8(%edi,%ebx,4),%xmm5 - subss %xmm0,%xmm3 - subss %xmm1,%xmm4 - subss %xmm2,%xmm5 - movss %xmm3,(%edi,%ebx,4) - movss %xmm4,4(%edi,%ebx,4) - movss %xmm5,8(%edi,%ebx,4) - -_nb_kernel430_ia32_sse.nb430_checksingle: - movl nb430_innerk(%esp),%edx - andl $1,%edx - jnz _nb_kernel430_ia32_sse.nb430_dosingle - jmp _nb_kernel430_ia32_sse.nb430_updateouterdata -_nb_kernel430_ia32_sse.nb430_dosingle: - movl nb430_charge(%ebp),%esi - movl nb430_invsqrta(%ebp),%edx - movl nb430_pos(%ebp),%edi - movl nb430_innerjjnr(%esp),%ecx - movl (%ecx),%eax - xorps %xmm2,%xmm2 - movaps %xmm2,%xmm6 - movss (%edx,%eax,4),%xmm2 ## isaj - mulss nb430_isai(%esp),%xmm2 - movss %xmm2,nb430_isaprod(%esp) - movss %xmm2,%xmm1 - mulss nb430_gbtsc(%esp),%xmm1 - movss %xmm1,nb430_gbscale(%esp) - - mulss nb430_iq(%esp),%xmm2 - movss (%esi,%eax,4),%xmm6 ## xmm6(0) has the charge - mulss %xmm2,%xmm6 - movss %xmm6,nb430_qq(%esp) - - movl nb430_type(%ebp),%esi - movl %eax,%ecx - movl (%esi,%ecx,4),%ecx - movl nb430_vdwparam(%ebp),%esi - shll %ecx - addl nb430_ntia(%esp),%ecx - movlps (%esi,%ecx,4),%xmm6 - movaps %xmm6,%xmm4 - shufps $252,%xmm4,%xmm4 ## constant 11111100 - shufps $253,%xmm6,%xmm6 ## constant 11111101 - - movss %xmm4,nb430_c6(%esp) - movss %xmm6,nb430_c12(%esp) - - movd %eax,%mm0 - leal (%eax,%eax,2),%eax - - ## move coordinates to xmm0-xmm2 - movss (%edi,%eax,4),%xmm0 - movss 4(%edi,%eax,4),%xmm1 - movss 8(%edi,%eax,4),%xmm2 - - movss nb430_ix(%esp),%xmm4 - movss nb430_iy(%esp),%xmm5 - movss nb430_iz(%esp),%xmm6 - - ## calc dr - subss %xmm0,%xmm4 - subss %xmm1,%xmm5 - subss %xmm2,%xmm6 - - ## store dr - movaps %xmm4,nb430_dx(%esp) - movaps %xmm5,nb430_dy(%esp) - movaps %xmm6,nb430_dz(%esp) - ## square it - mulss %xmm4,%xmm4 - mulss %xmm5,%xmm5 - mulss %xmm6,%xmm6 - addss %xmm5,%xmm4 - addss %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtss %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulss %xmm5,%xmm5 - movss nb430_three(%esp),%xmm1 - mulss %xmm4,%xmm5 ## rsq*lu*lu - movss nb430_half(%esp),%xmm0 - subss %xmm5,%xmm1 ## constant 30-rsq*lu*lu - mulss %xmm2,%xmm1 - mulss %xmm1,%xmm0 ## xmm0=rinv - - mulss %xmm0,%xmm4 ## xmm4=r - movss %xmm4,nb430_r(%esp) - mulss nb430_gbscale(%esp),%xmm4 - - cvttss2si %xmm4,%ebx ## mm6 contain lu indices - cvtsi2ss %ebx,%xmm6 - subss %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulss %xmm2,%xmm2 ## xmm2=eps2 - - shll $2,%ebx - - movl nb430_GBtab(%ebp),%esi - - movaps (%esi,%ebx,4),%xmm4 - movhlps %xmm4,%xmm6 - movaps %xmm4,%xmm5 - movaps %xmm6,%xmm7 - shufps $1,%xmm5,%xmm5 - shufps $1,%xmm7,%xmm7 - ## table ready in xmm4-xmm7 - - mulss %xmm1,%xmm6 ## xmm6=Geps - mulss %xmm2,%xmm7 ## xmm7=Heps2 - addss %xmm6,%xmm5 - addss %xmm7,%xmm5 ## xmm5=Fp - mulss nb430_two(%esp),%xmm7 ## two*Heps2 - movss nb430_qq(%esp),%xmm3 - addss %xmm6,%xmm7 - addss %xmm5,%xmm7 ## xmm7=FF - mulss %xmm1,%xmm5 ## xmm5=eps*Fp - addss %xmm4,%xmm5 ## xmm5=VV - mulss %xmm3,%xmm5 ## vcoul=qq*VV - mulss %xmm7,%xmm3 ## fijC=FF*qq - - movd %mm0,%ebx - movl nb430_dvda(%ebp),%esi - - ## Calculate dVda - xorps %xmm7,%xmm7 - mulss nb430_gbscale(%esp),%xmm3 - movaps %xmm3,%xmm6 - mulss nb430_r(%esp),%xmm6 - addss %xmm5,%xmm6 - addss nb430_vctot(%esp),%xmm5 - movss %xmm5,nb430_vctot(%esp) - - - ## xmm6=(vcoul+fijC*r) - subss %xmm6,%xmm7 - movaps %xmm7,%xmm6 - - ## update dvdasum - addss nb430_dvdasum(%esp),%xmm7 - movaps %xmm7,nb430_dvdasum(%esp) - - ## update j atoms dvdaj - addss (%esi,%ebx,4),%xmm6 - movss %xmm6,(%esi,%ebx,4) - - ## put scalar force on stack temporarily - movss %xmm3,nb430_fscal(%esp) - - movss nb430_r(%esp),%xmm4 - mulps nb430_tsc(%esp),%xmm4 - - cvttss2si %xmm4,%ebx - cvtsi2ss %ebx,%xmm6 - subss %xmm6,%xmm4 - movss %xmm4,%xmm1 ## xmm1=eps - movss %xmm1,%xmm2 - mulss %xmm2,%xmm2 ## xmm2=eps2 - - shll $3,%ebx - movl nb430_VFtab(%ebp),%esi - - ## dispersion - movaps (%esi,%ebx,4),%xmm4 - movhlps %xmm4,%xmm6 - movaps %xmm4,%xmm5 - movaps %xmm6,%xmm7 - shufps $1,%xmm5,%xmm5 - shufps $1,%xmm7,%xmm7 - ## table ready in xmm4-xmm7 - - mulss %xmm1,%xmm6 ## xmm6=Geps - mulss %xmm2,%xmm7 ## xmm7=Heps2 - addss %xmm6,%xmm5 - addss %xmm7,%xmm5 ## xmm5=Fp - mulss nb430_two(%esp),%xmm7 ## two*Heps2 - addss %xmm6,%xmm7 - addss %xmm5,%xmm7 ## xmm7=FF - mulss %xmm1,%xmm5 ## xmm5=eps*Fp - addss %xmm4,%xmm5 ## xmm5=VV - - movss nb430_c6(%esp),%xmm4 - mulss %xmm4,%xmm7 ## fijD - mulss %xmm4,%xmm5 ## Vvdw6 - mulps nb430_tsc(%esp),%xmm7 - addss nb430_fscal(%esp),%xmm7 ## add to fscal - - ## put scalar force on stack Update Vvdwtot directly - addss nb430_Vvdwtot(%esp),%xmm5 - movss %xmm7,nb430_fscal(%esp) - movss %xmm5,nb430_Vvdwtot(%esp) - - ## repulsion - movaps 16(%esi,%ebx,4),%xmm4 - movhlps %xmm4,%xmm6 - movaps %xmm4,%xmm5 - movaps %xmm6,%xmm7 - shufps $1,%xmm5,%xmm5 - shufps $1,%xmm7,%xmm7 - ## table ready in xmm4-xmm7 - - mulss %xmm1,%xmm6 ## xmm6=Geps - mulss %xmm2,%xmm7 ## xmm7=Heps2 - addss %xmm6,%xmm5 - addss %xmm7,%xmm5 ## xmm5=Fp - mulss nb430_two(%esp),%xmm7 ## two*Heps2 - addss %xmm6,%xmm7 - addss %xmm5,%xmm7 ## xmm7=FF - mulss %xmm1,%xmm5 ## xmm5=eps*Fp - addss %xmm4,%xmm5 ## xmm5=VV - - movss nb430_c12(%esp),%xmm4 - mulss %xmm4,%xmm7 ## fijR - mulss %xmm4,%xmm5 ## Vvdw12 - mulps nb430_tsc(%esp),%xmm7 - addss nb430_fscal(%esp),%xmm7 - - addss nb430_Vvdwtot(%esp),%xmm5 - movss %xmm5,nb430_Vvdwtot(%esp) - xorps %xmm4,%xmm4 - - mulss %xmm0,%xmm7 - subss %xmm7,%xmm4 - movl nb430_faction(%ebp),%edi - - movss nb430_dx(%esp),%xmm0 - movss nb430_dy(%esp),%xmm1 - movss nb430_dz(%esp),%xmm2 - - mulss %xmm4,%xmm0 - mulss %xmm4,%xmm1 - mulss %xmm4,%xmm2 - ## xmm0-xmm2 contains tx-tz (partial force) - ## now update f_i - movss nb430_fix(%esp),%xmm3 - movss nb430_fiy(%esp),%xmm4 - movss nb430_fiz(%esp),%xmm5 - addss %xmm0,%xmm3 - addss %xmm1,%xmm4 - addss %xmm2,%xmm5 - movss %xmm3,nb430_fix(%esp) - movss %xmm4,nb430_fiy(%esp) - movss %xmm5,nb430_fiz(%esp) - ## update fj - - movss (%edi,%eax,4),%xmm3 - movss 4(%edi,%eax,4),%xmm4 - movss 8(%edi,%eax,4),%xmm5 - subss %xmm0,%xmm3 - subss %xmm1,%xmm4 - subss %xmm2,%xmm5 - movss %xmm3,(%edi,%eax,4) - movss %xmm4,4(%edi,%eax,4) - movss %xmm5,8(%edi,%eax,4) -_nb_kernel430_ia32_sse.nb430_updateouterdata: - movl nb430_ii3(%esp),%ecx - movl nb430_faction(%ebp),%edi - movl nb430_fshift(%ebp),%esi - movl nb430_is3(%esp),%edx - - ## accumulate i forces in xmm0, xmm1, xmm2 - movaps nb430_fix(%esp),%xmm0 - movaps nb430_fiy(%esp),%xmm1 - movaps nb430_fiz(%esp),%xmm2 - - movhlps %xmm0,%xmm3 - movhlps %xmm1,%xmm4 - movhlps %xmm2,%xmm5 - addps %xmm3,%xmm0 - addps %xmm4,%xmm1 - addps %xmm5,%xmm2 ## sum is in 1/2 in xmm0-xmm2 - - movaps %xmm0,%xmm3 - movaps %xmm1,%xmm4 - movaps %xmm2,%xmm5 - - shufps $1,%xmm3,%xmm3 - shufps $1,%xmm4,%xmm4 - shufps $1,%xmm5,%xmm5 - addss %xmm3,%xmm0 - addss %xmm4,%xmm1 - addss %xmm5,%xmm2 ## xmm0-xmm2 has single force in pos0 - - ## increment i force - movss (%edi,%ecx,4),%xmm3 - movss 4(%edi,%ecx,4),%xmm4 - movss 8(%edi,%ecx,4),%xmm5 - addss %xmm0,%xmm3 - addss %xmm1,%xmm4 - addss %xmm2,%xmm5 - movss %xmm3,(%edi,%ecx,4) - movss %xmm4,4(%edi,%ecx,4) - movss %xmm5,8(%edi,%ecx,4) - - ## increment fshift force - movss (%esi,%edx,4),%xmm3 - movss 4(%esi,%edx,4),%xmm4 - movss 8(%esi,%edx,4),%xmm5 - addss %xmm0,%xmm3 - addss %xmm1,%xmm4 - addss %xmm2,%xmm5 - movss %xmm3,(%esi,%edx,4) - movss %xmm4,4(%esi,%edx,4) - movss %xmm5,8(%esi,%edx,4) - - ## get n from stack - movl nb430_n(%esp),%esi - ## get group index for i particle - movl nb430_gid(%ebp),%edx ## base of gid[] - movl (%edx,%esi,4),%edx ## ggid=gid[n] - - ## accumulate total potential energy and update it - movaps nb430_vctot(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - ## add earlier value from mem - movl nb430_Vc(%ebp),%eax - addss (%eax,%edx,4),%xmm7 - ## move back to mem - movss %xmm7,(%eax,%edx,4) - - ## accumulate total lj energy and update it - movaps nb430_Vvdwtot(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - ## add earlier value from mem - movl nb430_Vvdw(%ebp),%eax - addss (%eax,%edx,4),%xmm7 - ## move back to mem - movss %xmm7,(%eax,%edx,4) - - ## accumulate dVda and update it - movaps nb430_dvdasum(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - movl nb430_ii(%esp),%edx - movl nb430_dvda(%ebp),%eax - addss (%eax,%edx,4),%xmm7 - movss %xmm7,(%eax,%edx,4) - - ## finish if last - movl nb430_nn1(%esp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel430_ia32_sse.nb430_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb430_n(%esp) - jmp _nb_kernel430_ia32_sse.nb430_outer -_nb_kernel430_ia32_sse.nb430_outerend: - ## check if more outer neighborlists remain - movl nb430_nri(%esp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel430_ia32_sse.nb430_end - ## non-zero, do one more workunit - jmp _nb_kernel430_ia32_sse.nb430_threadloop -_nb_kernel430_ia32_sse.nb430_end: - emms - - movl nb430_nouter(%esp),%eax - movl nb430_ninner(%esp),%ebx - movl nb430_outeriter(%ebp),%ecx - movl nb430_inneriter(%ebp),%edx - movl %eax,(%ecx) - movl %ebx,(%edx) - - movl nb430_salign(%esp),%eax - addl %eax,%esp - addl $488,%esp - popl %edi - popl %esi - popl %edx - popl %ecx - popl %ebx - popl %eax - leave - ret - - - - - - - -.globl nb_kernel430nf_ia32_sse -.globl _nb_kernel430nf_ia32_sse -nb_kernel430nf_ia32_sse: -_nb_kernel430nf_ia32_sse: -.set nb430nf_p_nri, 8 -.set nb430nf_iinr, 12 -.set nb430nf_jindex, 16 -.set nb430nf_jjnr, 20 -.set nb430nf_shift, 24 -.set nb430nf_shiftvec, 28 -.set nb430nf_fshift, 32 -.set nb430nf_gid, 36 -.set nb430nf_pos, 40 -.set nb430nf_faction, 44 -.set nb430nf_charge, 48 -.set nb430nf_p_facel, 52 -.set nb430nf_argkrf, 56 -.set nb430nf_argcrf, 60 -.set nb430nf_Vc, 64 -.set nb430nf_type, 68 -.set nb430nf_p_ntype, 72 -.set nb430nf_vdwparam, 76 -.set nb430nf_Vvdw, 80 -.set nb430nf_p_tabscale, 84 -.set nb430nf_VFtab, 88 -.set nb430nf_invsqrta, 92 -.set nb430nf_dvda, 96 -.set nb430nf_p_gbtabscale, 100 -.set nb430nf_GBtab, 104 -.set nb430nf_p_nthreads, 108 -.set nb430nf_count, 112 -.set nb430nf_mtx, 116 -.set nb430nf_outeriter, 120 -.set nb430nf_inneriter, 124 -.set nb430nf_work, 128 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse use -.set nb430nf_ix, 0 -.set nb430nf_iy, 16 -.set nb430nf_iz, 32 -.set nb430nf_iq, 48 -.set nb430nf_gbtsc, 64 -.set nb430nf_tsc, 80 -.set nb430nf_qq, 96 -.set nb430nf_c6, 112 -.set nb430nf_c12, 128 -.set nb430nf_vctot, 144 -.set nb430nf_Vvdwtot, 160 -.set nb430nf_half, 176 -.set nb430nf_three, 192 -.set nb430nf_isai, 208 -.set nb430nf_isaprod, 224 -.set nb430nf_gbscale, 240 -.set nb430nf_r, 256 -.set nb430nf_is3, 272 -.set nb430nf_ii3, 276 -.set nb430nf_ntia, 280 -.set nb430nf_innerjjnr, 284 -.set nb430nf_innerk, 288 -.set nb430nf_n, 292 -.set nb430nf_nn1, 296 -.set nb430nf_nri, 300 -.set nb430nf_facel, 304 -.set nb430nf_ntype, 308 -.set nb430nf_nouter, 312 -.set nb430nf_ninner, 316 -.set nb430nf_salign, 320 - pushl %ebp - movl %esp,%ebp - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - pushl %esi - pushl %edi - subl $324,%esp ## local stack space - movl %esp,%eax - andl $0xf,%eax - subl %eax,%esp - movl %eax,nb430nf_salign(%esp) - - emms - - ## Move args passed by reference to stack - movl nb430nf_p_nri(%ebp),%ecx - movl nb430nf_p_facel(%ebp),%esi - movl nb430nf_p_ntype(%ebp),%edi - movl (%ecx),%ecx - movl (%esi),%esi - movl (%edi),%edi - movl %ecx,nb430nf_nri(%esp) - movl %esi,nb430nf_facel(%esp) - movl %edi,nb430nf_ntype(%esp) - - ## zero iteration counters - movl $0,%eax - movl %eax,nb430nf_nouter(%esp) - movl %eax,nb430nf_ninner(%esp) - - - movl nb430nf_p_gbtabscale(%ebp),%eax - movss (%eax),%xmm3 - movl nb430nf_p_tabscale(%ebp),%eax - movss (%eax),%xmm4 - shufps $0,%xmm3,%xmm3 - shufps $0,%xmm4,%xmm4 - movaps %xmm3,nb430nf_gbtsc(%esp) - movaps %xmm4,nb430nf_tsc(%esp) - - ## create constant floating-point factors on stack - movl $0x3f000000,%eax ## constant 0.5 in IEEE (hex) - movl %eax,nb430nf_half(%esp) - movss nb430nf_half(%esp),%xmm1 - shufps $0,%xmm1,%xmm1 ## splat to all elements - movaps %xmm1,%xmm2 - addps %xmm2,%xmm2 ## constant 1.0 - movaps %xmm2,%xmm3 - addps %xmm2,%xmm2 ## constant 2.0 - addps %xmm2,%xmm3 ## constant 3.0 - movaps %xmm1,nb430nf_half(%esp) - movaps %xmm3,nb430nf_three(%esp) - -_nb_kernel430nf_ia32_sse.nb430nf_threadloop: - movl nb430nf_count(%ebp),%esi ## pointer to sync counter - movl (%esi),%eax -_nb_kernel430nf_ia32_sse.nb430nf_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%esi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel430nf_ia32_sse.nb430nf_spinlock - - ## if(nn1>nri) nn1=nri - movl nb430nf_nri(%esp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb430nf_n(%esp) - movl %ebx,nb430nf_nn1(%esp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel430nf_ia32_sse.nb430nf_outerstart - jmp _nb_kernel430nf_ia32_sse.nb430nf_end - -_nb_kernel430nf_ia32_sse.nb430nf_outerstart: - ## ebx contains number of outer iterations - addl nb430nf_nouter(%esp),%ebx - movl %ebx,nb430nf_nouter(%esp) - -_nb_kernel430nf_ia32_sse.nb430nf_outer: - movl nb430nf_shift(%ebp),%eax ## eax = pointer into shift[] - movl (%eax,%esi,4),%ebx ## ebx=shift[n] - - leal (%ebx,%ebx,2),%ebx ## ebx=3*is - movl %ebx,nb430nf_is3(%esp) ## store is3 - - movl nb430nf_shiftvec(%ebp),%eax ## eax = base of shiftvec[] - - movss (%eax,%ebx,4),%xmm0 - movss 4(%eax,%ebx,4),%xmm1 - movss 8(%eax,%ebx,4),%xmm2 - - movl nb430nf_iinr(%ebp),%ecx ## ecx = pointer into iinr[] - movl (%ecx,%esi,4),%ebx ## ebx =ii - - movl nb430nf_charge(%ebp),%edx - movss (%edx,%ebx,4),%xmm3 - mulss nb430nf_facel(%esp),%xmm3 - shufps $0,%xmm3,%xmm3 - - movl nb430nf_invsqrta(%ebp),%edx ## load invsqrta[ii] - movss (%edx,%ebx,4),%xmm4 - shufps $0,%xmm4,%xmm4 - - movl nb430nf_type(%ebp),%edx - movl (%edx,%ebx,4),%edx - imull nb430nf_ntype(%esp),%edx - shll %edx - movl %edx,nb430nf_ntia(%esp) - - leal (%ebx,%ebx,2),%ebx ## ebx = 3*ii=ii3 - movl nb430nf_pos(%ebp),%eax ## eax = base of pos[] - - addss (%eax,%ebx,4),%xmm0 - addss 4(%eax,%ebx,4),%xmm1 - addss 8(%eax,%ebx,4),%xmm2 - - movaps %xmm3,nb430nf_iq(%esp) - movaps %xmm4,nb430nf_isai(%esp) - - shufps $0,%xmm0,%xmm0 - shufps $0,%xmm1,%xmm1 - shufps $0,%xmm2,%xmm2 - - movaps %xmm0,nb430nf_ix(%esp) - movaps %xmm1,nb430nf_iy(%esp) - movaps %xmm2,nb430nf_iz(%esp) - - movl %ebx,nb430nf_ii3(%esp) - - ## clear vctot - xorps %xmm4,%xmm4 - movaps %xmm4,nb430nf_vctot(%esp) - movaps %xmm4,nb430nf_Vvdwtot(%esp) - - movl nb430nf_jindex(%ebp),%eax - movl (%eax,%esi,4),%ecx ## jindex[n] - movl 4(%eax,%esi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movl nb430nf_pos(%ebp),%esi - movl nb430nf_faction(%ebp),%edi - movl nb430nf_jjnr(%ebp),%eax - shll $2,%ecx - addl %ecx,%eax - movl %eax,nb430nf_innerjjnr(%esp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $4,%edx - addl nb430nf_ninner(%esp),%ecx - movl %ecx,nb430nf_ninner(%esp) - addl $0,%edx - movl %edx,nb430nf_innerk(%esp) ## number of innerloop atoms - jge _nb_kernel430nf_ia32_sse.nb430nf_unroll_loop - jmp _nb_kernel430nf_ia32_sse.nb430nf_finish_inner -_nb_kernel430nf_ia32_sse.nb430nf_unroll_loop: - ## quad-unroll innerloop here - movl nb430nf_innerjjnr(%esp),%edx ## pointer to jjnr[k] - movl (%edx),%eax - movl 4(%edx),%ebx - movl 8(%edx),%ecx - movl 12(%edx),%edx ## eax-edx=jnr1-4 - addl $16,nb430nf_innerjjnr(%esp) ## advance pointer (unrolled 4) - - ## load isa2 - movl nb430nf_invsqrta(%ebp),%esi - movss (%esi,%eax,4),%xmm3 - movss (%esi,%ecx,4),%xmm4 - movss (%esi,%ebx,4),%xmm6 - movss (%esi,%edx,4),%xmm7 - movaps nb430nf_isai(%esp),%xmm2 - shufps $0,%xmm6,%xmm3 - shufps $0,%xmm7,%xmm4 - shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all charges in xmm3 - mulps %xmm3,%xmm2 - - movaps %xmm2,nb430nf_isaprod(%esp) - movaps %xmm2,%xmm1 - mulps nb430nf_gbtsc(%esp),%xmm1 - movaps %xmm1,nb430nf_gbscale(%esp) - - movl nb430nf_charge(%ebp),%esi ## base of charge[] - - movss (%esi,%eax,4),%xmm3 - movss (%esi,%ecx,4),%xmm4 - movss (%esi,%ebx,4),%xmm6 - movss (%esi,%edx,4),%xmm7 - - mulps nb430nf_iq(%esp),%xmm2 - shufps $0,%xmm6,%xmm3 - shufps $0,%xmm7,%xmm4 - shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all charges in xmm3 - mulps %xmm2,%xmm3 - movaps %xmm3,nb430nf_qq(%esp) - - movd %eax,%mm0 ## use mmx registers as temp storage - movd %ebx,%mm1 - movd %ecx,%mm2 - movd %edx,%mm3 - - movl nb430nf_type(%ebp),%esi - movl (%esi,%eax,4),%eax - movl (%esi,%ebx,4),%ebx - movl (%esi,%ecx,4),%ecx - movl (%esi,%edx,4),%edx - movl nb430nf_vdwparam(%ebp),%esi - shll %eax - shll %ebx - shll %ecx - shll %edx - movl nb430nf_ntia(%esp),%edi - addl %edi,%eax - addl %edi,%ebx - addl %edi,%ecx - addl %edi,%edx - - movlps (%esi,%eax,4),%xmm6 - movlps (%esi,%ecx,4),%xmm7 - movhps (%esi,%ebx,4),%xmm6 - movhps (%esi,%edx,4),%xmm7 - - movaps %xmm6,%xmm4 - shufps $136,%xmm7,%xmm4 ## constant 10001000 - shufps $221,%xmm7,%xmm6 ## constant 11011101 - - movd %mm0,%eax - movd %mm1,%ebx - movd %mm2,%ecx - movd %mm3,%edx - - movaps %xmm4,nb430nf_c6(%esp) - movaps %xmm6,nb430nf_c12(%esp) - - movl nb430nf_pos(%ebp),%esi ## base of pos[] - - leal (%eax,%eax,2),%eax ## replace jnr with j3 - leal (%ebx,%ebx,2),%ebx - - leal (%ecx,%ecx,2),%ecx ## replace jnr with j3 - leal (%edx,%edx,2),%edx - - ## move four coordinates to xmm0-xmm2 - - movlps (%esi,%eax,4),%xmm4 - movlps (%esi,%ecx,4),%xmm5 - movss 8(%esi,%eax,4),%xmm2 - movss 8(%esi,%ecx,4),%xmm6 - - movhps (%esi,%ebx,4),%xmm4 - movhps (%esi,%edx,4),%xmm5 - - movss 8(%esi,%ebx,4),%xmm0 - movss 8(%esi,%edx,4),%xmm1 - - shufps $0,%xmm0,%xmm2 - shufps $0,%xmm1,%xmm6 - - movaps %xmm4,%xmm0 - movaps %xmm4,%xmm1 - - shufps $136,%xmm6,%xmm2 ## constant 10001000 - - shufps $136,%xmm5,%xmm0 ## constant 10001000 - shufps $221,%xmm5,%xmm1 ## constant 11011101 - - ## move ix-iz to xmm4-xmm6 - movaps nb430nf_ix(%esp),%xmm4 - movaps nb430nf_iy(%esp),%xmm5 - movaps nb430nf_iz(%esp),%xmm6 - - ## calc dr - subps %xmm0,%xmm4 - subps %xmm1,%xmm5 - subps %xmm2,%xmm6 - - ## square it - mulps %xmm4,%xmm4 - mulps %xmm5,%xmm5 - mulps %xmm6,%xmm6 - addps %xmm5,%xmm4 - addps %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb430nf_three(%esp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb430nf_half(%esp),%xmm0 - subps %xmm5,%xmm1 ## constant 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - movaps %xmm4,nb430nf_r(%esp) - mulps nb430nf_gbscale(%esp),%xmm4 - - movhlps %xmm4,%xmm5 - cvttps2pi %xmm4,%mm6 - cvttps2pi %xmm5,%mm7 ## mm6/mm7 contain lu indices - cvtpi2ps %mm6,%xmm6 - cvtpi2ps %mm7,%xmm5 - movlhps %xmm5,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - pslld $2,%mm6 - pslld $2,%mm7 - - movd %eax,%mm0 - movd %ebx,%mm1 - movd %ecx,%mm2 - movd %edx,%mm3 - - movl nb430nf_GBtab(%ebp),%esi - movd %mm6,%eax - psrlq $32,%mm6 - movd %mm7,%ecx - psrlq $32,%mm7 - movd %mm6,%ebx - movd %mm7,%edx - - ## load coulomb table - movaps (%esi,%eax,4),%xmm4 - movaps (%esi,%ebx,4),%xmm5 - movaps (%esi,%ecx,4),%xmm6 - movaps (%esi,%edx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm6,%xmm3 - shufps $0xEE,%xmm7,%xmm3 - shufps $0x44,%xmm7,%xmm6 - movaps %xmm4,%xmm7 - shufps $0xEE,%xmm5,%xmm7 - shufps $0x44,%xmm5,%xmm4 - movaps %xmm4,%xmm5 - shufps $0xDD,%xmm6,%xmm5 - shufps $0x88,%xmm6,%xmm4 - movaps %xmm7,%xmm6 - shufps $0x88,%xmm3,%xmm6 - shufps $0xDD,%xmm3,%xmm7 - ## coulomb table ready, in xmm4-xmm7 - - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - movaps nb430nf_qq(%esp),%xmm3 - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps %xmm3,%xmm5 ## vcoul=qq*VV - addps nb430nf_vctot(%esp),%xmm5 - movaps %xmm5,nb430nf_vctot(%esp) - - - movaps nb430nf_r(%esp),%xmm4 - mulps nb430nf_tsc(%esp),%xmm4 - - movhlps %xmm4,%xmm5 - cvttps2pi %xmm4,%mm6 - cvttps2pi %xmm5,%mm7 ## mm6/mm7 contain lu indices - cvtpi2ps %mm6,%xmm6 - cvtpi2ps %mm7,%xmm5 - movlhps %xmm5,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - pslld $3,%mm6 - pslld $3,%mm7 - - movl nb430nf_VFtab(%ebp),%esi - movd %mm6,%eax - psrlq $32,%mm6 - movd %mm7,%ecx - psrlq $32,%mm7 - movd %mm6,%ebx - movd %mm7,%edx - - ## dispersion - movaps (%esi,%eax,4),%xmm4 - movaps (%esi,%ebx,4),%xmm5 - movaps (%esi,%ecx,4),%xmm6 - movaps (%esi,%edx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm6,%xmm3 - shufps $0xEE,%xmm7,%xmm3 - shufps $0x44,%xmm7,%xmm6 - movaps %xmm4,%xmm7 - shufps $0xEE,%xmm5,%xmm7 - shufps $0x44,%xmm5,%xmm4 - movaps %xmm4,%xmm5 - shufps $0xDD,%xmm6,%xmm5 - shufps $0x88,%xmm6,%xmm4 - movaps %xmm7,%xmm6 - shufps $0x88,%xmm3,%xmm6 - shufps $0xDD,%xmm3,%xmm7 - ## dispersion table ready, in xmm4-xmm7 - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps nb430nf_c6(%esp),%xmm5 ## Vvdw6 - addps nb430nf_Vvdwtot(%esp),%xmm5 - movaps %xmm5,nb430nf_Vvdwtot(%esp) - - ## repulsion - movaps 16(%esi,%eax,4),%xmm4 - movaps 16(%esi,%ebx,4),%xmm5 - movaps 16(%esi,%ecx,4),%xmm6 - movaps 16(%esi,%edx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm6,%xmm3 - shufps $0xEE,%xmm7,%xmm3 - shufps $0x44,%xmm7,%xmm6 - movaps %xmm4,%xmm7 - shufps $0xEE,%xmm5,%xmm7 - shufps $0x44,%xmm5,%xmm4 - movaps %xmm4,%xmm5 - shufps $0xDD,%xmm6,%xmm5 - shufps $0x88,%xmm6,%xmm4 - movaps %xmm7,%xmm6 - shufps $0x88,%xmm3,%xmm6 - shufps $0xDD,%xmm3,%xmm7 - ## table ready, in xmm4-xmm7 - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - - mulps nb430nf_c12(%esp),%xmm5 ## Vvdw12 - addps nb430nf_Vvdwtot(%esp),%xmm5 - movaps %xmm5,nb430nf_Vvdwtot(%esp) - - ## should we do one more iteration? - subl $4,nb430nf_innerk(%esp) - jl _nb_kernel430nf_ia32_sse.nb430nf_finish_inner - jmp _nb_kernel430nf_ia32_sse.nb430nf_unroll_loop -_nb_kernel430nf_ia32_sse.nb430nf_finish_inner: - ## check if at least two particles remain - addl $4,nb430nf_innerk(%esp) - movl nb430nf_innerk(%esp),%edx - andl $2,%edx - jnz _nb_kernel430nf_ia32_sse.nb430nf_dopair - jmp _nb_kernel430nf_ia32_sse.nb430nf_checksingle -_nb_kernel430nf_ia32_sse.nb430nf_dopair: - - movl nb430nf_innerjjnr(%esp),%ecx - - movl (%ecx),%eax - movl 4(%ecx),%ebx - addl $8,nb430nf_innerjjnr(%esp) - - xorps %xmm2,%xmm2 - movaps %xmm2,%xmm6 - - ## load isa2 - movl nb430nf_invsqrta(%ebp),%esi - movss (%esi,%eax,4),%xmm2 - movss (%esi,%ebx,4),%xmm3 - unpcklps %xmm3,%xmm2 ## isa2 in xmm3(0,1) - mulps nb430nf_isai(%esp),%xmm2 - movaps %xmm2,nb430nf_isaprod(%esp) - movaps %xmm2,%xmm1 - mulps nb430nf_gbtsc(%esp),%xmm1 - movaps %xmm1,nb430nf_gbscale(%esp) - - movl nb430nf_charge(%ebp),%esi ## base of charge[] - movss (%esi,%eax,4),%xmm3 - movss (%esi,%ebx,4),%xmm6 - unpcklps %xmm6,%xmm3 ## constant 00001000 ;# xmm3(0,1) has the charges - - mulps nb430nf_iq(%esp),%xmm2 - mulps %xmm2,%xmm3 - movaps %xmm3,nb430nf_qq(%esp) - - movl nb430nf_type(%ebp),%esi - movl %eax,%ecx - movl %ebx,%edx - movl (%esi,%ecx,4),%ecx - movl (%esi,%edx,4),%edx - movl nb430nf_vdwparam(%ebp),%esi - shll %ecx - shll %edx - movl nb430nf_ntia(%esp),%edi - addl %edi,%ecx - addl %edi,%edx - movlps (%esi,%ecx,4),%xmm6 - movhps (%esi,%edx,4),%xmm6 - movl nb430nf_pos(%ebp),%edi - - movaps %xmm6,%xmm4 - shufps $8,%xmm4,%xmm4 ## constant 00001000 - shufps $13,%xmm6,%xmm6 ## constant 00001101 - movlhps %xmm7,%xmm4 - movlhps %xmm7,%xmm6 - - movaps %xmm4,nb430nf_c6(%esp) - movaps %xmm6,nb430nf_c12(%esp) - - leal (%eax,%eax,2),%eax - leal (%ebx,%ebx,2),%ebx - ## move coordinates to xmm0-xmm2 - movlps (%edi,%eax,4),%xmm1 - movss 8(%edi,%eax,4),%xmm2 - movhps (%edi,%ebx,4),%xmm1 - movss 8(%edi,%ebx,4),%xmm0 - - movlhps %xmm7,%xmm3 - - shufps $0,%xmm0,%xmm2 - - movaps %xmm1,%xmm0 - - shufps $136,%xmm2,%xmm2 ## constant 10001000 - - shufps $136,%xmm0,%xmm0 ## constant 10001000 - shufps $221,%xmm1,%xmm1 ## constant 11011101 - - movl nb430nf_faction(%ebp),%edi - ## move ix-iz to xmm4-xmm6 - xorps %xmm7,%xmm7 - - movaps nb430nf_ix(%esp),%xmm4 - movaps nb430nf_iy(%esp),%xmm5 - movaps nb430nf_iz(%esp),%xmm6 - - ## calc dr - subps %xmm0,%xmm4 - subps %xmm1,%xmm5 - subps %xmm2,%xmm6 - - ## square it - mulps %xmm4,%xmm4 - mulps %xmm5,%xmm5 - mulps %xmm6,%xmm6 - addps %xmm5,%xmm4 - addps %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb430nf_three(%esp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb430nf_half(%esp),%xmm0 - subps %xmm5,%xmm1 ## constant 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - movaps %xmm4,nb430nf_r(%esp) - mulps nb430nf_gbscale(%esp),%xmm4 - - cvttps2pi %xmm4,%mm6 ## mm6 contain lu indices - cvtpi2ps %mm6,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - - pslld $2,%mm6 - - movl nb430nf_GBtab(%ebp),%esi - movd %mm6,%ecx - psrlq $32,%mm6 - movd %mm6,%edx - - ## load coulomb table - movaps (%esi,%ecx,4),%xmm4 - movaps (%esi,%edx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm4,%xmm6 - unpcklps %xmm7,%xmm4 ## Y1 Y2 F1 F2 - unpckhps %xmm7,%xmm6 ## G1 G2 H1 H2 - movhlps %xmm4,%xmm5 ## F1 F2 - movhlps %xmm6,%xmm7 ## H1 H2 - ## coulomb table ready, in xmm4-xmm7 - - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - movaps nb430nf_qq(%esp),%xmm3 - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps %xmm3,%xmm5 ## vcoul=qq*VV - addps nb430nf_vctot(%esp),%xmm5 - movaps %xmm5,nb430nf_vctot(%esp) - - movaps nb430nf_r(%esp),%xmm4 - mulps nb430nf_tsc(%esp),%xmm4 - - cvttps2pi %xmm4,%mm6 - cvtpi2ps %mm6,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - pslld $3,%mm6 - - movl nb430nf_VFtab(%ebp),%esi - movd %mm6,%ecx - psrlq $32,%mm6 - movd %mm6,%edx - - ## dispersion - movaps (%esi,%ecx,4),%xmm4 - movaps (%esi,%edx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm4,%xmm6 - unpcklps %xmm7,%xmm4 ## Y1 Y2 F1 F2 - unpckhps %xmm7,%xmm6 ## G1 G2 H1 H2 - movhlps %xmm4,%xmm5 ## F1 F2 - movhlps %xmm6,%xmm7 ## H1 H2 - ## dispersion table ready, in xmm4-xmm7 - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - - mulps nb430nf_c6(%esp),%xmm5 ## Vvdw6 - addps nb430nf_Vvdwtot(%esp),%xmm5 - movaps %xmm5,nb430nf_Vvdwtot(%esp) - - ## repulsion - movaps 16(%esi,%ecx,4),%xmm4 - movaps 16(%esi,%edx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm4,%xmm6 - unpcklps %xmm7,%xmm4 ## Y1 Y2 F1 F2 - unpckhps %xmm7,%xmm6 ## G1 G2 H1 H2 - movhlps %xmm4,%xmm5 ## F1 F2 - movhlps %xmm6,%xmm7 ## H1 H2 - ## table ready, in xmm4-xmm7 - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - - mulps nb430nf_c12(%esp),%xmm5 ## Vvdw12 - - addps nb430nf_Vvdwtot(%esp),%xmm5 - movaps %xmm5,nb430nf_Vvdwtot(%esp) -_nb_kernel430nf_ia32_sse.nb430nf_checksingle: - movl nb430nf_innerk(%esp),%edx - andl $1,%edx - jnz _nb_kernel430nf_ia32_sse.nb430nf_dosingle - jmp _nb_kernel430nf_ia32_sse.nb430nf_updateouterdata -_nb_kernel430nf_ia32_sse.nb430nf_dosingle: - movl nb430nf_charge(%ebp),%esi - movl nb430nf_invsqrta(%ebp),%edx - movl nb430nf_pos(%ebp),%edi - movl nb430nf_innerjjnr(%esp),%ecx - movl (%ecx),%eax - xorps %xmm2,%xmm2 - movaps %xmm2,%xmm6 - movss (%edx,%eax,4),%xmm2 ## isa2 - mulss nb430nf_isai(%esp),%xmm2 - movss %xmm2,nb430nf_isaprod(%esp) - movss %xmm2,%xmm1 - mulss nb430nf_gbtsc(%esp),%xmm1 - movss %xmm1,nb430nf_gbscale(%esp) - - mulss nb430nf_iq(%esp),%xmm2 - movss (%esi,%eax,4),%xmm6 ## xmm6(0) has the charge - mulss %xmm2,%xmm6 - movss %xmm6,nb430nf_qq(%esp) - - movl nb430nf_type(%ebp),%esi - movl %eax,%ecx - movl (%esi,%ecx,4),%ecx - movl nb430nf_vdwparam(%ebp),%esi - shll %ecx - addl nb430nf_ntia(%esp),%ecx - movlps (%esi,%ecx,4),%xmm6 - movaps %xmm6,%xmm4 - shufps $252,%xmm4,%xmm4 ## constant 11111100 - shufps $253,%xmm6,%xmm6 ## constant 11111101 - - movss %xmm4,nb430nf_c6(%esp) - movss %xmm6,nb430nf_c12(%esp) - - leal (%eax,%eax,2),%eax - - ## move coordinates to xmm0-xmm2 - movss (%edi,%eax,4),%xmm0 - movss 4(%edi,%eax,4),%xmm1 - movss 8(%edi,%eax,4),%xmm2 - - movss nb430nf_ix(%esp),%xmm4 - movss nb430nf_iy(%esp),%xmm5 - movss nb430nf_iz(%esp),%xmm6 - - ## calc dr - subss %xmm0,%xmm4 - subss %xmm1,%xmm5 - subss %xmm2,%xmm6 - - ## square it - mulss %xmm4,%xmm4 - mulss %xmm5,%xmm5 - mulss %xmm6,%xmm6 - addss %xmm5,%xmm4 - addss %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtss %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulss %xmm5,%xmm5 - movss nb430nf_three(%esp),%xmm1 - mulss %xmm4,%xmm5 ## rsq*lu*lu - movss nb430nf_half(%esp),%xmm0 - subss %xmm5,%xmm1 ## constant 30-rsq*lu*lu - mulss %xmm2,%xmm1 - mulss %xmm1,%xmm0 ## xmm0=rinv - - mulss %xmm0,%xmm4 ## xmm4=r - movaps %xmm4,nb430nf_r(%esp) - mulss nb430nf_gbscale(%esp),%xmm4 - - cvttss2si %xmm4,%ebx ## mm6 contain lu indices - cvtsi2ss %ebx,%xmm6 - subss %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulss %xmm2,%xmm2 ## xmm2=eps2 - - shll $2,%ebx - - movl nb430nf_GBtab(%ebp),%esi - - movaps (%esi,%ebx,4),%xmm4 - movhlps %xmm4,%xmm6 - movaps %xmm4,%xmm5 - movaps %xmm6,%xmm7 - shufps $1,%xmm5,%xmm5 - shufps $1,%xmm7,%xmm7 - ## table ready in xmm4-xmm7 - - mulss %xmm1,%xmm6 ## xmm6=Geps - mulss %xmm2,%xmm7 ## xmm7=Heps2 - addss %xmm6,%xmm5 - addss %xmm7,%xmm5 ## xmm5=Fp - movss nb430nf_qq(%esp),%xmm3 - mulss %xmm1,%xmm5 ## xmm5=eps*Fp - addss %xmm4,%xmm5 ## xmm5=VV - mulss %xmm3,%xmm5 ## vcoul=qq*VV - addss nb430nf_vctot(%esp),%xmm5 - movss %xmm5,nb430nf_vctot(%esp) - - movss nb430nf_r(%esp),%xmm4 - mulps nb430nf_tsc(%esp),%xmm4 - - cvttss2si %xmm4,%ebx - cvtsi2ss %ebx,%xmm6 - subss %xmm6,%xmm4 - movss %xmm4,%xmm1 ## xmm1=eps - movss %xmm1,%xmm2 - mulss %xmm2,%xmm2 ## xmm2=eps2 - - shll $3,%ebx - movl nb430nf_VFtab(%ebp),%esi - - ## dispersion - movaps (%esi,%ebx,4),%xmm4 - movhlps %xmm4,%xmm6 - movaps %xmm4,%xmm5 - movaps %xmm6,%xmm7 - shufps $1,%xmm5,%xmm5 - shufps $1,%xmm7,%xmm7 - ## table ready in xmm4-xmm7 - - mulss %xmm1,%xmm6 ## xmm6=Geps - mulss %xmm2,%xmm7 ## xmm7=Heps2 - addss %xmm6,%xmm5 - addss %xmm7,%xmm5 ## xmm5=Fp - mulss %xmm1,%xmm5 ## xmm5=eps*Fp - addss %xmm4,%xmm5 ## xmm5=VV - mulss nb430nf_c6(%esp),%xmm5 ## Vvdw6 - addss nb430nf_Vvdwtot(%esp),%xmm5 - movss %xmm5,nb430nf_Vvdwtot(%esp) - - ## repulsion - movaps 16(%esi,%ebx,4),%xmm4 - movhlps %xmm4,%xmm6 - movaps %xmm4,%xmm5 - movaps %xmm6,%xmm7 - shufps $1,%xmm5,%xmm5 - shufps $1,%xmm7,%xmm7 - ## table ready in xmm4-xmm7 - - mulss %xmm1,%xmm6 ## xmm6=Geps - mulss %xmm2,%xmm7 ## xmm7=Heps2 - addss %xmm6,%xmm5 - addss %xmm7,%xmm5 ## xmm5=Fp - mulss %xmm1,%xmm5 ## xmm5=eps*Fp - addss %xmm4,%xmm5 ## xmm5=VV - - mulss nb430nf_c12(%esp),%xmm5 ## Vvdw12 - - addss nb430nf_Vvdwtot(%esp),%xmm5 - movss %xmm5,nb430nf_Vvdwtot(%esp) - -_nb_kernel430nf_ia32_sse.nb430nf_updateouterdata: - ## get n from stack - movl nb430nf_n(%esp),%esi - ## get group index for i particle - movl nb430nf_gid(%ebp),%edx ## base of gid[] - movl (%edx,%esi,4),%edx ## ggid=gid[n] - - ## accumulate total potential energy and update it - movaps nb430nf_vctot(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - ## add earlier value from mem - movl nb430nf_Vc(%ebp),%eax - addss (%eax,%edx,4),%xmm7 - ## move back to mem - movss %xmm7,(%eax,%edx,4) - - ## accumulate total lj energy and update it - movaps nb430nf_Vvdwtot(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - ## add earlier value from mem - movl nb430nf_Vvdw(%ebp),%eax - addss (%eax,%edx,4),%xmm7 - ## move back to mem - movss %xmm7,(%eax,%edx,4) - - ## finish if last - movl nb430nf_nn1(%esp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel430nf_ia32_sse.nb430nf_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb430nf_n(%esp) - jmp _nb_kernel430nf_ia32_sse.nb430nf_outer -_nb_kernel430nf_ia32_sse.nb430nf_outerend: - ## check if more outer neighborlists remain - movl nb430nf_nri(%esp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel430nf_ia32_sse.nb430nf_end - ## non-zero, do one more workunit - jmp _nb_kernel430nf_ia32_sse.nb430nf_threadloop -_nb_kernel430nf_ia32_sse.nb430nf_end: - emms - - movl nb430nf_nouter(%esp),%eax - movl nb430nf_ninner(%esp),%ebx - movl nb430nf_outeriter(%ebp),%ecx - movl nb430nf_inneriter(%ebp),%edx - movl %eax,(%ecx) - movl %ebx,(%edx) - - movl nb430nf_salign(%esp),%eax - addl %eax,%esp - addl $324,%esp - popl %edi - popl %esi - popl %edx - popl %ecx - popl %ebx - popl %eax - leave - ret - - - - - diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/Makefile.am b/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/Makefile.am index d28786df9b..fbf7bbfd5b 100644 --- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/Makefile.am +++ b/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/Makefile.am @@ -59,32 +59,32 @@ libnb_kernel_ia32_sse2_la_SOURCES = \ EXTRA_DIST = \ - nb_kernel010_ia32_sse2.intel_syntax.s nb_kernel030_ia32_sse2.intel_syntax.s \ - nb_kernel100_ia32_sse2.intel_syntax.s nb_kernel101_ia32_sse2.intel_syntax.s \ - nb_kernel102_ia32_sse2.intel_syntax.s nb_kernel103_ia32_sse2.intel_syntax.s \ - nb_kernel104_ia32_sse2.intel_syntax.s nb_kernel110_ia32_sse2.intel_syntax.s \ - nb_kernel111_ia32_sse2.intel_syntax.s nb_kernel112_ia32_sse2.intel_syntax.s \ - nb_kernel113_ia32_sse2.intel_syntax.s nb_kernel114_ia32_sse2.intel_syntax.s \ - nb_kernel130_ia32_sse2.intel_syntax.s nb_kernel131_ia32_sse2.intel_syntax.s \ - nb_kernel132_ia32_sse2.intel_syntax.s nb_kernel133_ia32_sse2.intel_syntax.s \ - nb_kernel134_ia32_sse2.intel_syntax.s nb_kernel200_ia32_sse2.intel_syntax.s \ - nb_kernel201_ia32_sse2.intel_syntax.s nb_kernel202_ia32_sse2.intel_syntax.s \ - nb_kernel203_ia32_sse2.intel_syntax.s nb_kernel204_ia32_sse2.intel_syntax.s \ - nb_kernel210_ia32_sse2.intel_syntax.s nb_kernel211_ia32_sse2.intel_syntax.s \ - nb_kernel212_ia32_sse2.intel_syntax.s nb_kernel213_ia32_sse2.intel_syntax.s \ - nb_kernel214_ia32_sse2.intel_syntax.s nb_kernel230_ia32_sse2.intel_syntax.s \ - nb_kernel231_ia32_sse2.intel_syntax.s nb_kernel232_ia32_sse2.intel_syntax.s \ - nb_kernel233_ia32_sse2.intel_syntax.s nb_kernel234_ia32_sse2.intel_syntax.s \ - nb_kernel300_ia32_sse2.intel_syntax.s nb_kernel301_ia32_sse2.intel_syntax.s \ - nb_kernel302_ia32_sse2.intel_syntax.s nb_kernel303_ia32_sse2.intel_syntax.s \ - nb_kernel304_ia32_sse2.intel_syntax.s nb_kernel310_ia32_sse2.intel_syntax.s \ - nb_kernel311_ia32_sse2.intel_syntax.s nb_kernel312_ia32_sse2.intel_syntax.s \ - nb_kernel313_ia32_sse2.intel_syntax.s nb_kernel314_ia32_sse2.intel_syntax.s \ - nb_kernel330_ia32_sse2.intel_syntax.s nb_kernel331_ia32_sse2.intel_syntax.s \ - nb_kernel332_ia32_sse2.intel_syntax.s nb_kernel333_ia32_sse2.intel_syntax.s \ - nb_kernel334_ia32_sse2.intel_syntax.s nb_kernel400_ia32_sse2.intel_syntax.s \ - nb_kernel410_ia32_sse2.intel_syntax.s nb_kernel430_ia32_sse2.intel_syntax.s \ - nb_kernel_ia32_sse2_test_asm.intel_syntax.s + nb_kernel010_ia32_sse2_intel_syntax.s nb_kernel030_ia32_sse2_intel_syntax.s \ + nb_kernel100_ia32_sse2_intel_syntax.s nb_kernel101_ia32_sse2_intel_syntax.s \ + nb_kernel102_ia32_sse2_intel_syntax.s nb_kernel103_ia32_sse2_intel_syntax.s \ + nb_kernel104_ia32_sse2_intel_syntax.s nb_kernel110_ia32_sse2_intel_syntax.s \ + nb_kernel111_ia32_sse2_intel_syntax.s nb_kernel112_ia32_sse2_intel_syntax.s \ + nb_kernel113_ia32_sse2_intel_syntax.s nb_kernel114_ia32_sse2_intel_syntax.s \ + nb_kernel130_ia32_sse2_intel_syntax.s nb_kernel131_ia32_sse2_intel_syntax.s \ + nb_kernel132_ia32_sse2_intel_syntax.s nb_kernel133_ia32_sse2_intel_syntax.s \ + nb_kernel134_ia32_sse2_intel_syntax.s nb_kernel200_ia32_sse2_intel_syntax.s \ + nb_kernel201_ia32_sse2_intel_syntax.s nb_kernel202_ia32_sse2_intel_syntax.s \ + nb_kernel203_ia32_sse2_intel_syntax.s nb_kernel204_ia32_sse2_intel_syntax.s \ + nb_kernel210_ia32_sse2_intel_syntax.s nb_kernel211_ia32_sse2_intel_syntax.s \ + nb_kernel212_ia32_sse2_intel_syntax.s nb_kernel213_ia32_sse2_intel_syntax.s \ + nb_kernel214_ia32_sse2_intel_syntax.s nb_kernel230_ia32_sse2_intel_syntax.s \ + nb_kernel231_ia32_sse2_intel_syntax.s nb_kernel232_ia32_sse2_intel_syntax.s \ + nb_kernel233_ia32_sse2_intel_syntax.s nb_kernel234_ia32_sse2_intel_syntax.s \ + nb_kernel300_ia32_sse2_intel_syntax.s nb_kernel301_ia32_sse2_intel_syntax.s \ + nb_kernel302_ia32_sse2_intel_syntax.s nb_kernel303_ia32_sse2_intel_syntax.s \ + nb_kernel304_ia32_sse2_intel_syntax.s nb_kernel310_ia32_sse2_intel_syntax.s \ + nb_kernel311_ia32_sse2_intel_syntax.s nb_kernel312_ia32_sse2_intel_syntax.s \ + nb_kernel313_ia32_sse2_intel_syntax.s nb_kernel314_ia32_sse2_intel_syntax.s \ + nb_kernel330_ia32_sse2_intel_syntax.s nb_kernel331_ia32_sse2_intel_syntax.s \ + nb_kernel332_ia32_sse2_intel_syntax.s nb_kernel333_ia32_sse2_intel_syntax.s \ + nb_kernel334_ia32_sse2_intel_syntax.s nb_kernel400_ia32_sse2_intel_syntax.s \ + nb_kernel410_ia32_sse2_intel_syntax.s nb_kernel430_ia32_sse2_intel_syntax.s \ + nb_kernel_ia32_sse2_test_asm_intel_syntax.s diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel400_ia32_sse2.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel400_ia32_sse2.intel_syntax.s deleted file mode 100644 index c9c1dfb868..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel400_ia32_sse2.intel_syntax.s +++ /dev/null @@ -1,1287 +0,0 @@ -;# -;# -;# Gromacs 4.0 Copyright (c) 1991-2003 -;# David van der Spoel, Erik Lindahl -;# -;# This program is free software; you can redistribute it and/or -;# modify it under the terms of the GNU General Public License -;# as published by the Free Software Foundation; either version 2 -;# of the License, or (at your option) any later version. -;# -;# To help us fund GROMACS development, we humbly ask that you cite -;# the research papers on the package. Check out http://www.gromacs.org -;# -;# And Hey: -;# Gnomes, ROck Monsters And Chili Sauce -;# - -;# These files require GNU binutils 2.10 or later, since we -;# use intel syntax for portability, or a recent version -;# of NASM that understands Extended 3DNow and SSE2 instructions. -;# (NASM is normally only used with MS Visual C++). -;# Since NASM and gnu as disagree on some definitions and use -;# completely different preprocessing options I have to introduce a -;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86. -;# Gnu as treats ';' as a line break, i.e. ignores it. This is the -;# reason why all comments need both symbols... -;# The source is written for GNU as, with intel syntax. When you use -;# NASM we redefine a couple of things. The false if-statement around -;# the following code is seen by GNU as, but NASM doesn't see it, so -;# the code inside is read by NASM but not gcc. -; .if 0 # block below only read by NASM -%define .section section -%define .long dd -%define .align align -%define .globl global -;# NASM only wants 'dword', not 'dword ptr'. -%define ptr -%macro .equiv 2 - %1 equ %2 -%endmacro -; .endif # End of NASM-specific block -; .intel_syntax noprefix # Line only read by gnu as - - - - -.globl nb_kernel400_ia32_sse2 -.globl _nb_kernel400_ia32_sse2 -nb_kernel400_ia32_sse2: -_nb_kernel400_ia32_sse2: -.equiv nb400_p_nri, 8 -.equiv nb400_iinr, 12 -.equiv nb400_jindex, 16 -.equiv nb400_jjnr, 20 -.equiv nb400_shift, 24 -.equiv nb400_shiftvec, 28 -.equiv nb400_fshift, 32 -.equiv nb400_gid, 36 -.equiv nb400_pos, 40 -.equiv nb400_faction, 44 -.equiv nb400_charge, 48 -.equiv nb400_p_facel, 52 -.equiv nb400_argkrf, 56 -.equiv nb400_argcrf, 60 -.equiv nb400_Vc, 64 -.equiv nb400_type, 68 -.equiv nb400_p_ntype, 72 -.equiv nb400_vdwparam, 76 -.equiv nb400_Vvdw, 80 -.equiv nb400_p_tabscale, 84 -.equiv nb400_VFtab, 88 -.equiv nb400_invsqrta, 92 -.equiv nb400_dvda, 96 -.equiv nb400_p_gbtabscale, 100 -.equiv nb400_GBtab, 104 -.equiv nb400_p_nthreads, 108 -.equiv nb400_count, 112 -.equiv nb400_mtx, 116 -.equiv nb400_outeriter, 120 -.equiv nb400_inneriter, 124 -.equiv nb400_work, 128 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse2 use -.equiv nb400_ix, 0 -.equiv nb400_iy, 16 -.equiv nb400_iz, 32 -.equiv nb400_iq, 48 -.equiv nb400_dx, 64 -.equiv nb400_dy, 80 -.equiv nb400_dz, 96 -.equiv nb400_two, 112 -.equiv nb400_gbtsc, 128 -.equiv nb400_qq, 144 -.equiv nb400_r, 160 -.equiv nb400_vctot, 176 -.equiv nb400_fix, 192 -.equiv nb400_fiy, 208 -.equiv nb400_fiz, 224 -.equiv nb400_half, 240 -.equiv nb400_three, 256 -.equiv nb400_isai, 272 -.equiv nb400_isaprod, 288 -.equiv nb400_dvdasum, 304 -.equiv nb400_gbscale, 320 -.equiv nb400_is3, 336 -.equiv nb400_ii3, 340 -.equiv nb400_ii, 344 -.equiv nb400_innerjjnr, 348 -.equiv nb400_innerk, 352 -.equiv nb400_n, 356 -.equiv nb400_nn1, 360 -.equiv nb400_nri, 364 -.equiv nb400_facel, 368 ;# uses 8 bytes -.equiv nb400_nouter, 376 -.equiv nb400_ninner, 380 -.equiv nb400_salign, 384 - push ebp - mov ebp,esp - push eax - push ebx - push ecx - push edx - push esi - push edi - sub esp, 388 ;# local stack space - mov eax, esp - and eax, 0xf - sub esp, eax - mov [esp + nb400_salign], eax - - emms - - ;# Move args passed by reference to stack - mov ecx, [ebp + nb400_p_nri] - mov esi, [ebp + nb400_p_facel] - mov ecx, [ecx] - movsd xmm7, [esi] - mov [esp + nb400_nri], ecx - movsd [esp + nb400_facel], xmm7 - - ;# zero iteration counters - mov eax, 0 - mov [esp + nb400_nouter], eax - mov [esp + nb400_ninner], eax - - - mov eax, [ebp + nb400_p_gbtabscale] - movsd xmm3, [eax] - shufpd xmm3, xmm3, 0 - movapd [esp + nb400_gbtsc], xmm3 - - ;# create constant floating-point factors on stack - mov eax, 0x00000000 ;# lower half of double 0.5 IEEE (hex) - mov ebx, 0x3fe00000 - mov [esp + nb400_half], eax - mov [esp + nb400_half+4], ebx - movsd xmm1, [esp + nb400_half] - shufpd xmm1, xmm1, 0 ;# splat to all elements - movapd xmm3, xmm1 - addpd xmm3, xmm3 ;# 1.0 - movapd xmm2, xmm3 - addpd xmm2, xmm2 ;# 2.0 - addpd xmm3, xmm2 ;# 3.0 - movapd [esp + nb400_half], xmm1 - movapd [esp + nb400_two], xmm2 - movapd [esp + nb400_three], xmm3 - -.nb400_threadloop: - mov esi, [ebp + nb400_count] ;# pointer to sync counter - mov eax, [esi] -.nb400_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb400_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [esp + nb400_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [esp + nb400_n], eax - mov [esp + nb400_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb400_outerstart - jmp .nb400_end - -.nb400_outerstart: - ;# ebx contains number of outer iterations - add ebx, [esp + nb400_nouter] - mov [esp + nb400_nouter], ebx - -.nb400_outer: - mov eax, [ebp + nb400_shift] ;# eax = pointer into shift[] - mov ebx, [eax+esi*4] ;# ebx=shift[n] - - lea ebx, [ebx + ebx*2] ;# ebx=3*is - mov [esp + nb400_is3],ebx ;# store is3 - - mov eax, [ebp + nb400_shiftvec] ;# eax = base of shiftvec[] - - movsd xmm0, [eax + ebx*8] - movsd xmm1, [eax + ebx*8 + 8] - movsd xmm2, [eax + ebx*8 + 16] - - mov ecx, [ebp + nb400_iinr] ;# ecx = pointer into iinr[] - mov ebx, [ecx+esi*4] ;# ebx =ii - mov [esp + nb400_ii], ebx - - mov edx, [ebp + nb400_charge] - movsd xmm3, [edx + ebx*8] - mulsd xmm3, [esp + nb400_facel] - shufpd xmm3, xmm3, 0 - - mov edx, [ebp + nb400_invsqrta] ;# load invsqrta[ii] - movsd xmm4, [edx + ebx*8] - shufpd xmm4, xmm4, 0 - - lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3 - mov eax, [ebp + nb400_pos] ;# eax = base of pos[] - - addsd xmm0, [eax + ebx*8] - addsd xmm1, [eax + ebx*8 + 8] - addsd xmm2, [eax + ebx*8 + 16] - - movapd [esp + nb400_iq], xmm3 - movapd [esp + nb400_isai], xmm4 - - shufpd xmm0, xmm0, 0 - shufpd xmm1, xmm1, 0 - shufpd xmm2, xmm2, 0 - - movapd [esp + nb400_ix], xmm0 - movapd [esp + nb400_iy], xmm1 - movapd [esp + nb400_iz], xmm2 - - mov [esp + nb400_ii3], ebx - - ;# clear vctot and i forces - xorpd xmm4, xmm4 - movapd [esp + nb400_vctot], xmm4 - movapd [esp + nb400_dvdasum], xmm4 - movapd [esp + nb400_fix], xmm4 - movapd [esp + nb400_fiy], xmm4 - movapd [esp + nb400_fiz], xmm4 - - mov eax, [ebp + nb400_jindex] - mov ecx, [eax + esi*4] ;# jindex[n] - mov edx, [eax + esi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov esi, [ebp + nb400_pos] - mov edi, [ebp + nb400_faction] - mov eax, [ebp + nb400_jjnr] - shl ecx, 2 - add eax, ecx - mov [esp + nb400_innerjjnr], eax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 2 - add ecx, [esp + nb400_ninner] - mov [esp + nb400_ninner], ecx - add edx, 0 - mov [esp + nb400_innerk], edx ;# number of innerloop atoms - jge .nb400_unroll_loop - jmp .nb400_checksingle -.nb400_unroll_loop: - ;# twice unrolled innerloop here - mov edx, [esp + nb400_innerjjnr] ;# pointer to jjnr[k] - mov eax, [edx] - mov ebx, [edx + 4] - add dword ptr [esp + nb400_innerjjnr], 8 ;# advance pointer (unrolled 2) - - ;# load isaj - mov esi, [ebp + nb400_invsqrta] - movlpd xmm2, [esi + eax*8] - movhpd xmm2, [esi + ebx*8] - mulpd xmm2, [esp + nb400_isai] - movapd [esp + nb400_isaprod], xmm2 - movapd xmm1, xmm2 - mulpd xmm1, [esp + nb400_gbtsc] - movapd [esp + nb400_gbscale], xmm1 - - mov esi, [ebp + nb400_charge] ;# base of charge[] - movlpd xmm3, [esi + eax*8] - movhpd xmm3, [esi + ebx*8] - - mulpd xmm2, [esp + nb400_iq] - mulpd xmm3, xmm2 - movapd [esp + nb400_qq], xmm3 - - mov esi, [ebp + nb400_pos] ;# base of pos[] - - movd mm2, eax - movd mm3, ebx - lea eax, [eax + eax*2] ;# replace jnr with j3 - lea ebx, [ebx + ebx*2] - - ;# move two coordinates to xmm0-xmm2 - movlpd xmm0, [esi + eax*8] - movlpd xmm1, [esi + eax*8 + 8] - movlpd xmm2, [esi + eax*8 + 16] - movhpd xmm0, [esi + ebx*8] - movhpd xmm1, [esi + ebx*8 + 8] - movhpd xmm2, [esi + ebx*8 + 16] - - mov edi, [ebp + nb400_faction] - - ;# move nb400_ix-iz to xmm4-xmm6 - movapd xmm4, [esp + nb400_ix] - movapd xmm5, [esp + nb400_iy] - movapd xmm6, [esp + nb400_iz] - - ;# calc dr - subpd xmm4, xmm0 - subpd xmm5, xmm1 - subpd xmm6, xmm2 - - ;# store dr - movapd [esp + nb400_dx], xmm4 - movapd [esp + nb400_dy], xmm5 - movapd [esp + nb400_dz], xmm6 - ;# square it - mulpd xmm4,xmm4 - mulpd xmm5,xmm5 - mulpd xmm6,xmm6 - addpd xmm4, xmm5 - addpd xmm4, xmm6 - ;# rsq in xmm4 - - cvtpd2ps xmm5, xmm4 - rsqrtps xmm5, xmm5 - cvtps2pd xmm2, xmm5 ;# lu in low xmm2 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulpd xmm2, xmm2 ;# lu*lu - movapd xmm1, [esp + nb400_three] - mulpd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb400_half] - subpd xmm1, xmm2 ;# 30-rsq*lu*lu - mulpd xmm1, xmm5 - mulpd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - movapd xmm5, xmm1 ;# copy of lu - mulpd xmm1, xmm1 ;# lu*lu - movapd xmm2, [esp + nb400_three] - mulpd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb400_half] - subpd xmm2, xmm1 ;# 30-rsq*lu*lu - mulpd xmm2, xmm5 - mulpd xmm0, xmm2 ;# xmm0=iter2 of rinv (new lu) - mulpd xmm4, xmm0 ;# xmm4=r - movapd [esp + nb400_r], xmm4 - mulpd xmm4, [esp + nb400_gbscale] - - cvttpd2pi mm6, xmm4 ;# mm6 = lu idx - cvtpi2pd xmm5, mm6 - subpd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - movapd xmm2, xmm1 - mulpd xmm2, xmm2 ;# xmm2=eps2 - - pslld mm6, 2 ;# idx *= 4 - - movd mm0, eax - movd mm1, ebx - - mov esi, [ebp + nb400_GBtab] - movd eax, mm6 - psrlq mm6, 32 - movd ebx, mm6 ;# indices in eax/ebx - - movapd xmm4, [esi + eax*8] ;# Y1 F1 - movapd xmm3, [esi + ebx*8] ;# Y2 F2 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 Y2 - unpckhpd xmm5, xmm3 ;# F1 F2 - - movapd xmm6, [esi + eax*8 + 16] ;# G1 H1 - movapd xmm3, [esi + ebx*8 + 16] ;# G2 H2 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 G2 - unpckhpd xmm7, xmm3 ;# H1 H2 - ;# coulomb table ready, in xmm4-xmm7 - mulpd xmm6, xmm1 ;# xmm6=Geps - mulpd xmm7, xmm2 ;# xmm7=Heps2 - addpd xmm5, xmm6 - addpd xmm5, xmm7 ;# xmm5=Fp - mulpd xmm7, [esp + nb400_two] ;# two*Heps2 - movapd xmm3, [esp + nb400_qq] - addpd xmm7, xmm6 - addpd xmm7, xmm5 ;# xmm7=FF - mulpd xmm5, xmm1 ;# xmm5=eps*Fp - addpd xmm5, xmm4 ;# xmm5=VV - mulpd xmm5, xmm3 ;# vcoul=qq*VV - mulpd xmm3, xmm7 ;# fijC=FF*qq - ;# get jnr from regs - movd ecx, mm2 - movd edx, mm3 - mov esi, [ebp + nb400_dvda] - - ;# Calculate dVda - xorpd xmm7, xmm7 - mulpd xmm3, [esp + nb400_gbscale] - movapd xmm6, xmm3 - mulpd xmm6, [esp + nb400_r] - addpd xmm6, xmm5 - addpd xmm5, [esp + nb400_vctot] - movapd [esp + nb400_vctot], xmm5 - - ;# xmm6=(vcoul+fijC*r) - subpd xmm7, xmm6 - movapd xmm6, xmm7 - - ;# update dvdasum - addpd xmm7, [esp + nb400_dvdasum] - movapd [esp + nb400_dvdasum], xmm7 - - ;# update j atoms dvdaj - movhlps xmm7, xmm6 - addsd xmm6, [esi + ecx*8] - addsd xmm7, [esi + edx*8] - movsd [esi + ecx*8], xmm6 - movsd [esi + edx*8], xmm7 - - xorpd xmm4, xmm4 - - mulpd xmm3, xmm0 - subpd xmm4, xmm3 - - movapd xmm0, [esp + nb400_dx] - movapd xmm1, [esp + nb400_dy] - movapd xmm2, [esp + nb400_dz] - - movd eax, mm0 - movd ebx, mm1 - - mov edi, [ebp + nb400_faction] - mulpd xmm0, xmm4 - mulpd xmm1, xmm4 - mulpd xmm2, xmm4 - ;# xmm0-xmm2 contains tx-tz (partial force) - ;# now update f_i - movapd xmm3, [esp + nb400_fix] - movapd xmm4, [esp + nb400_fiy] - movapd xmm5, [esp + nb400_fiz] - addpd xmm3, xmm0 - addpd xmm4, xmm1 - addpd xmm5, xmm2 - movapd [esp + nb400_fix], xmm3 - movapd [esp + nb400_fiy], xmm4 - movapd [esp + nb400_fiz], xmm5 - ;# the fj's - start by accumulating forces from memory - movlpd xmm3, [edi + eax*8] - movlpd xmm4, [edi + eax*8 + 8] - movlpd xmm5, [edi + eax*8 + 16] - movhpd xmm3, [edi + ebx*8] - movhpd xmm4, [edi + ebx*8 + 8] - movhpd xmm5, [edi + ebx*8 + 16] - subpd xmm3, xmm0 - subpd xmm4, xmm1 - subpd xmm5, xmm2 - movlpd [edi + eax*8], xmm3 - movlpd [edi + eax*8 + 8], xmm4 - movlpd [edi + eax*8 + 16], xmm5 - movhpd [edi + ebx*8], xmm3 - movhpd [edi + ebx*8 + 8], xmm4 - movhpd [edi + ebx*8 + 16], xmm5 - - ;# should we do one more iteration? - sub dword ptr [esp + nb400_innerk], 2 - jl .nb400_checksingle - jmp .nb400_unroll_loop -.nb400_checksingle: - mov edx, [esp + nb400_innerk] - and edx, 1 - jnz .nb400_dosingle - jmp .nb400_updateouterdata -.nb400_dosingle: - mov esi, [ebp + nb400_charge] - mov edx, [ebp + nb400_invsqrta] - mov edi, [ebp + nb400_pos] - mov ecx, [esp + nb400_innerjjnr] - mov eax, [ecx] - xorpd xmm6, xmm6 - movapd xmm7, xmm6 - movsd xmm7, [edx + eax*8] - movlpd xmm6, [esi + eax*8] ;# xmm6(0) has the charge - mulsd xmm7, [esp + nb400_isai] - movapd [esp + nb400_isaprod], xmm7 - movapd xmm1, xmm7 - mulpd xmm1, [esp + nb400_gbtsc] - movapd [esp + nb400_gbscale], xmm1 - - mulsd xmm7, [esp + nb400_iq] - mulsd xmm6, xmm7 - movapd [esp + nb400_qq], xmm6 - - movd mm2, eax - lea eax, [eax + eax*2] - - ;# move coordinates to xmm0-xmm2 - movlpd xmm0, [edi + eax*8] - movlpd xmm1, [edi + eax*8 + 8] - movlpd xmm2, [edi + eax*8 + 16] - - ;# move nb400_ix-iz to xmm4-xmm6 - movapd xmm4, [esp + nb400_ix] - movapd xmm5, [esp + nb400_iy] - movapd xmm6, [esp + nb400_iz] - - ;# calc dr - subsd xmm4, xmm0 - subsd xmm5, xmm1 - subsd xmm6, xmm2 - - ;# store dr - movapd [esp + nb400_dx], xmm4 - movapd [esp + nb400_dy], xmm5 - movapd [esp + nb400_dz], xmm6 - ;# square it - mulsd xmm4,xmm4 - mulsd xmm5,xmm5 - mulsd xmm6,xmm6 - addsd xmm4, xmm5 - addsd xmm4, xmm6 - ;# rsq in xmm4 - - cvtsd2ss xmm5, xmm4 - rsqrtss xmm5, xmm5 - cvtss2sd xmm2, xmm5 ;# lu in low xmm2 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulsd xmm2, xmm2 ;# lu*lu - movapd xmm1, [esp + nb400_three] - mulsd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb400_half] - subsd xmm1, xmm2 ;# 30-rsq*lu*lu - mulsd xmm1, xmm5 - mulsd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - movapd xmm5, xmm1 ;# copy of lu - mulsd xmm1, xmm1 ;# lu*lu - movapd xmm2, [esp + nb400_three] - mulsd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb400_half] - subsd xmm2, xmm1 ;# 30-rsq*lu*lu - mulsd xmm2, xmm5 - mulsd xmm0, xmm2 ;# xmm0=iter2 of rinv (new lu) - - mulsd xmm4, xmm0 ;# xmm4=r - movapd [esp + nb400_r], xmm4 - mulsd xmm4, [esp + nb400_gbscale] - - movd mm0, eax - - cvttsd2si eax, xmm4 ;# mm6 = lu idx - cvtsi2sd xmm5, eax - subsd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - movapd xmm2, xmm1 - mulsd xmm2, xmm2 ;# xmm2=eps2 - - shl eax, 2 ;# idx *= 4 - - mov esi, [ebp + nb400_GBtab] - - ;# Coulomb - movapd xmm4, [esi + eax*8] ;# Y1 F1 - xorpd xmm3, xmm3 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 - unpckhpd xmm5, xmm3 ;# F1 - - movapd xmm6, [esi + eax*8 + 16] ;# G1 H1 - xorpd xmm3, xmm3 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 - unpckhpd xmm7, xmm3 ;# H1 - ;# table ready in xmm4-xmm7 - - mulsd xmm6, xmm1 ;# xmm6=Geps - mulsd xmm7, xmm2 ;# xmm7=Heps2 - addsd xmm5, xmm6 - addsd xmm5, xmm7 ;# xmm5=Fp - mulsd xmm7, [esp + nb400_two] ;# two*Heps2 - movapd xmm3, [esp + nb400_qq] - addsd xmm7, xmm6 - addsd xmm7, xmm5 ;# xmm7=FF - mulsd xmm5, xmm1 ;# xmm5=eps*Fp - addsd xmm5, xmm4 ;# xmm5=VV - mulsd xmm5, xmm3 ;# vcoul=qq*VV - mulsd xmm3, xmm7 ;# fijC=FF*qq - ;# get jnr from regs - movd ebx, mm2 - mov esi, [ebp + nb400_dvda] - - ;# Calculate dVda - mulsd xmm3, [esp + nb400_gbscale] - movsd xmm6, xmm3 - mulsd xmm6, [esp + nb400_r] - addsd xmm6, xmm5 - addsd xmm5, [esp + nb400_vctot] - movsd [esp + nb400_vctot], xmm5 - - ;# xmm6=(vcoul+fijC*r) - subpd xmm7, xmm6 - movsd xmm6, xmm7 - - ;# update dvdasum - addsd xmm7, [esp + nb400_dvdasum] - movsd [esp + nb400_dvdasum], xmm7 - - ;# update j atoms dvdaj - addsd xmm6, [esi + ebx*8] - movsd [esi + ebx*8], xmm6 - - xorpd xmm4, xmm4 - movd eax, mm0 - - mulsd xmm3, xmm0 - subsd xmm4, xmm3 - mov edi, [ebp + nb400_faction] - - movsd xmm0, [esp + nb400_dx] - movsd xmm1, [esp + nb400_dy] - movsd xmm2, [esp + nb400_dz] - - mulsd xmm0, xmm4 - mulsd xmm1, xmm4 - mulsd xmm2, xmm4 - ;# xmm0-xmm2 contains tx-tz (partial force) - ;# now update f_i - movsd xmm3, [esp + nb400_fix] - movsd xmm4, [esp + nb400_fiy] - movsd xmm5, [esp + nb400_fiz] - addsd xmm3, xmm0 - addsd xmm4, xmm1 - addsd xmm5, xmm2 - movlpd [esp + nb400_fix], xmm3 - movlpd [esp + nb400_fiy], xmm4 - movlpd [esp + nb400_fiz], xmm5 - ;# update fj - movlpd xmm3, [edi + eax*8] - movlpd xmm4, [edi + eax*8 + 8] - movlpd xmm5, [edi + eax*8 + 16] - subsd xmm3, xmm0 - subsd xmm4, xmm1 - subsd xmm5, xmm2 - movlpd [edi + eax*8], xmm3 - movlpd [edi + eax*8 + 8], xmm4 - movlpd [edi + eax*8 + 16], xmm5 - -.nb400_updateouterdata: - mov ecx, [esp + nb400_ii3] - mov edi, [ebp + nb400_faction] - mov esi, [ebp + nb400_fshift] - mov edx, [esp + nb400_is3] - - ;# accumulate i forces in xmm0, xmm1, xmm2 - movapd xmm0, [esp + nb400_fix] - movapd xmm1, [esp + nb400_fiy] - movapd xmm2, [esp + nb400_fiz] - - movhlps xmm3, xmm0 - movhlps xmm4, xmm1 - movhlps xmm5, xmm2 - addsd xmm0, xmm3 - addsd xmm1, xmm4 - addsd xmm2, xmm5 ;# sum is in low xmm0-xmm2 - - ;# increment i force - movsd xmm3, [edi + ecx*8] - movsd xmm4, [edi + ecx*8 + 8] - movsd xmm5, [edi + ecx*8 + 16] - addsd xmm3, xmm0 - addsd xmm4, xmm1 - addsd xmm5, xmm2 - movsd [edi + ecx*8], xmm3 - movsd [edi + ecx*8 + 8], xmm4 - movsd [edi + ecx*8 + 16], xmm5 - - ;# increment fshift force - movsd xmm3, [esi + edx*8] - movsd xmm4, [esi + edx*8 + 8] - movsd xmm5, [esi + edx*8 + 16] - addsd xmm3, xmm0 - addsd xmm4, xmm1 - addsd xmm5, xmm2 - movsd [esi + edx*8], xmm3 - movsd [esi + edx*8 + 8], xmm4 - movsd [esi + edx*8 + 16], xmm5 - - ;# get n from stack - mov esi, [esp + nb400_n] - ;# get group index for i particle - mov edx, [ebp + nb400_gid] ;# base of gid[] - mov edx, [edx + esi*4] ;# ggid=gid[n] - - ;# accumulate total potential energy and update it - movapd xmm7, [esp + nb400_vctot] - ;# accumulate - movhlps xmm6, xmm7 - addsd xmm7, xmm6 ;# low xmm7 has the sum now - - ;# add earlier value from mem - mov eax, [ebp + nb400_Vc] - addsd xmm7, [eax + edx*8] - ;# move back to mem - movsd [eax + edx*8], xmm7 - - ;# accumulate dVda and update it - movapd xmm7, [esp + nb400_dvdasum] - ;# accumulate - movhlps xmm6, xmm7 - addsd xmm7, xmm6 ;# low xmm7 has the sum now - - mov edx, [esp + nb400_ii] - mov eax, [ebp + nb400_dvda] - addsd xmm7, [eax + edx*8] - movsd [eax + edx*8], xmm7 - - ;# finish if last - mov ecx, [esp + nb400_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb400_outerend - - ;# not last, iterate outer loop once more! - mov [esp + nb400_n], esi - jmp .nb400_outer -.nb400_outerend: - ;# check if more outer neighborlists remain - mov ecx, [esp + nb400_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb400_end - ;# non-zero, do one more workunit - jmp .nb400_threadloop -.nb400_end: - emms - - mov eax, [esp + nb400_nouter] - mov ebx, [esp + nb400_ninner] - mov ecx, [ebp + nb400_outeriter] - mov edx, [ebp + nb400_inneriter] - mov [ecx], eax - mov [edx], ebx - - mov eax, [esp + nb400_salign] - add esp, eax - add esp, 388 - pop edi - pop esi - pop edx - pop ecx - pop ebx - pop eax - leave - ret - - - - - - -.globl nb_kernel400nf_ia32_sse2 -.globl _nb_kernel400nf_ia32_sse2 -nb_kernel400nf_ia32_sse2: -_nb_kernel400nf_ia32_sse2: -.equiv nb400nf_p_nri, 8 -.equiv nb400nf_iinr, 12 -.equiv nb400nf_jindex, 16 -.equiv nb400nf_jjnr, 20 -.equiv nb400nf_shift, 24 -.equiv nb400nf_shiftvec, 28 -.equiv nb400nf_fshift, 32 -.equiv nb400nf_gid, 36 -.equiv nb400nf_pos, 40 -.equiv nb400nf_faction, 44 -.equiv nb400nf_charge, 48 -.equiv nb400nf_p_facel, 52 -.equiv nb400nf_argkrf, 56 -.equiv nb400nf_argcrf, 60 -.equiv nb400nf_Vc, 64 -.equiv nb400nf_type, 68 -.equiv nb400nf_p_ntype, 72 -.equiv nb400nf_vdwparam, 76 -.equiv nb400nf_Vvdw, 80 -.equiv nb400nf_p_tabscale, 84 -.equiv nb400nf_VFtab, 88 -.equiv nb400nf_invsqrta, 92 -.equiv nb400nf_dvda, 96 -.equiv nb400nf_p_gbtabscale, 100 -.equiv nb400nf_GBtab, 104 -.equiv nb400nf_p_nthreads, 108 -.equiv nb400nf_count, 112 -.equiv nb400nf_mtx, 116 -.equiv nb400nf_outeriter, 120 -.equiv nb400nf_inneriter, 124 -.equiv nb400nf_work, 128 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse2 use -.equiv nb400nf_ix, 0 -.equiv nb400nf_iy, 16 -.equiv nb400nf_iz, 32 -.equiv nb400nf_iq, 48 -.equiv nb400nf_gbtsc, 64 -.equiv nb400nf_qq, 80 -.equiv nb400nf_vctot, 96 -.equiv nb400nf_half, 112 -.equiv nb400nf_three, 128 -.equiv nb400nf_isai, 144 -.equiv nb400nf_isaprod, 160 -.equiv nb400nf_gbscale, 176 -.equiv nb400nf_is3, 192 -.equiv nb400nf_ii3, 196 -.equiv nb400nf_innerjjnr, 200 -.equiv nb400nf_innerk, 204 -.equiv nb400nf_n, 208 -.equiv nb400nf_nn1, 212 -.equiv nb400nf_nri, 216 -.equiv nb400nf_facel, 224 ;# uses 8 bytes -.equiv nb400nf_nouter, 232 -.equiv nb400nf_ninner, 236 -.equiv nb400nf_salign, 240 - push ebp - mov ebp,esp - push eax - push ebx - push ecx - push edx - push esi - push edi - sub esp, 244 ;# local stack space - mov eax, esp - and eax, 0xf - sub esp, eax - mov [esp + nb400nf_salign], eax - - emms - - ;# Move args passed by reference to stack - mov ecx, [ebp + nb400nf_p_nri] - mov esi, [ebp + nb400nf_p_facel] - mov ecx, [ecx] - movsd xmm7, [esi] - mov [esp + nb400nf_nri], ecx - movsd [esp + nb400nf_facel], xmm7 - - ;# zero iteration counters - mov eax, 0 - mov [esp + nb400nf_nouter], eax - mov [esp + nb400nf_ninner], eax - - - mov eax, [ebp + nb400nf_p_gbtabscale] - movsd xmm3, [eax] - shufpd xmm3, xmm3, 0 - movapd [esp + nb400nf_gbtsc], xmm3 - - ;# create constant floating-point factors on stack - mov eax, 0x00000000 ;# lower half of double 0.5 IEEE (hex) - mov ebx, 0x3fe00000 - mov [esp + nb400nf_half], eax - mov [esp + nb400nf_half+4], ebx - movsd xmm1, [esp + nb400nf_half] - shufpd xmm1, xmm1, 0 ;# splat to all elements - movapd xmm3, xmm1 - addpd xmm3, xmm3 ;# 1.0 - movapd xmm2, xmm3 - addpd xmm2, xmm2 ;# 2.0 - addpd xmm3, xmm2 ;# 3.0 - movapd [esp + nb400nf_half], xmm1 - movapd [esp + nb400nf_three], xmm3 - -.nb400nf_threadloop: - mov esi, [ebp + nb400nf_count] ;# pointer to sync counter - mov eax, [esi] -.nb400nf_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb400nf_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [esp + nb400nf_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [esp + nb400nf_n], eax - mov [esp + nb400nf_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb400nf_outerstart - jmp .nb400nf_end - -.nb400nf_outerstart: - ;# ebx contains number of outer iterations - add ebx, [esp + nb400nf_nouter] - mov [esp + nb400nf_nouter], ebx - -.nb400nf_outer: - mov eax, [ebp + nb400nf_shift] ;# eax = pointer into shift[] - mov ebx, [eax+esi*4] ;# ebx=shift[n] - - lea ebx, [ebx + ebx*2] ;# ebx=3*is - mov [esp + nb400nf_is3],ebx ;# store is3 - - mov eax, [ebp + nb400nf_shiftvec] ;# eax = base of shiftvec[] - - movsd xmm0, [eax + ebx*8] - movsd xmm1, [eax + ebx*8 + 8] - movsd xmm2, [eax + ebx*8 + 16] - - mov ecx, [ebp + nb400nf_iinr] ;# ecx = pointer into iinr[] - mov ebx, [ecx+esi*4] ;# ebx =ii - - mov edx, [ebp + nb400nf_charge] - movsd xmm3, [edx + ebx*8] - mulsd xmm3, [esp + nb400nf_facel] - shufpd xmm3, xmm3, 0 - - mov edx, [ebp + nb400nf_invsqrta] ;# load invsqrta[ii] - movsd xmm4, [edx + ebx*8] - shufpd xmm4, xmm4, 0 - - lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3 - mov eax, [ebp + nb400nf_pos] ;# eax = base of pos[] - - addsd xmm0, [eax + ebx*8] - addsd xmm1, [eax + ebx*8 + 8] - addsd xmm2, [eax + ebx*8 + 16] - - movapd [esp + nb400nf_iq], xmm3 - movapd [esp + nb400nf_isai], xmm4 - - shufpd xmm0, xmm0, 0 - shufpd xmm1, xmm1, 0 - shufpd xmm2, xmm2, 0 - - movapd [esp + nb400nf_ix], xmm0 - movapd [esp + nb400nf_iy], xmm1 - movapd [esp + nb400nf_iz], xmm2 - - mov [esp + nb400nf_ii3], ebx - - ;# clear vctot - xorpd xmm4, xmm4 - movapd [esp + nb400nf_vctot], xmm4 - - mov eax, [ebp + nb400nf_jindex] - mov ecx, [eax + esi*4] ;# jindex[n] - mov edx, [eax + esi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov esi, [ebp + nb400nf_pos] - mov edi, [ebp + nb400nf_faction] - mov eax, [ebp + nb400nf_jjnr] - shl ecx, 2 - add eax, ecx - mov [esp + nb400nf_innerjjnr], eax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 2 - add ecx, [esp + nb400nf_ninner] - mov [esp + nb400nf_ninner], ecx - add edx, 0 - mov [esp + nb400nf_innerk], edx ;# number of innerloop atoms - jge .nb400nf_unroll_loop - jmp .nb400nf_checksingle -.nb400nf_unroll_loop: - ;# twice unrolled innerloop here - mov edx, [esp + nb400nf_innerjjnr] ;# pointer to jjnr[k] - mov eax, [edx] - mov ebx, [edx + 4] - add dword ptr [esp + nb400nf_innerjjnr], 8 ;# advance pointer (unrolled 2) - - ;# load isa2 - mov esi, [ebp + nb400nf_invsqrta] - movlpd xmm2, [esi + eax*8] - movhpd xmm2, [esi + ebx*8] - mulpd xmm2, [esp + nb400nf_isai] - movapd [esp + nb400nf_isaprod], xmm2 - movapd xmm1, xmm2 - mulpd xmm1, [esp + nb400nf_gbtsc] - movapd [esp + nb400nf_gbscale], xmm1 - - mov esi, [ebp + nb400nf_charge] ;# base of charge[] - movlpd xmm3, [esi + eax*8] - movhpd xmm3, [esi + ebx*8] - - mulpd xmm2, [esp + nb400nf_iq] - mulpd xmm3, xmm2 - movapd [esp + nb400nf_qq], xmm3 - - mov esi, [ebp + nb400nf_pos] ;# base of pos[] - - lea eax, [eax + eax*2] ;# replace jnr with j3 - lea ebx, [ebx + ebx*2] - - ;# move two coordinates to xmm0-xmm2 - movlpd xmm0, [esi + eax*8] - movlpd xmm1, [esi + eax*8 + 8] - movlpd xmm2, [esi + eax*8 + 16] - movhpd xmm0, [esi + ebx*8] - movhpd xmm1, [esi + ebx*8 + 8] - movhpd xmm2, [esi + ebx*8 + 16] - - mov edi, [ebp + nb400nf_faction] - - ;# move nb400nf_ix-iz to xmm4-xmm6 - movapd xmm4, [esp + nb400nf_ix] - movapd xmm5, [esp + nb400nf_iy] - movapd xmm6, [esp + nb400nf_iz] - - ;# calc dr - subpd xmm4, xmm0 - subpd xmm5, xmm1 - subpd xmm6, xmm2 - - ;# square it - mulpd xmm4,xmm4 - mulpd xmm5,xmm5 - mulpd xmm6,xmm6 - addpd xmm4, xmm5 - addpd xmm4, xmm6 - ;# rsq in xmm4 - - cvtpd2ps xmm5, xmm4 - rsqrtps xmm5, xmm5 - cvtps2pd xmm2, xmm5 ;# lu in low xmm2 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulpd xmm2, xmm2 ;# lu*lu - movapd xmm1, [esp + nb400nf_three] - mulpd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb400nf_half] - subpd xmm1, xmm2 ;# 30-rsq*lu*lu - mulpd xmm1, xmm5 - mulpd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - movapd xmm5, xmm1 ;# copy of lu - mulpd xmm1, xmm1 ;# lu*lu - movapd xmm2, [esp + nb400nf_three] - mulpd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb400nf_half] - subpd xmm2, xmm1 ;# 30-rsq*lu*lu - mulpd xmm2, xmm5 - mulpd xmm0, xmm2 ;# xmm0=iter2 of rinv (new lu) - mulpd xmm4, xmm0 ;# xmm4=r - mulpd xmm4, [esp + nb400nf_gbscale] - - cvttpd2pi mm6, xmm4 ;# mm6 = lu idx - cvtpi2pd xmm5, mm6 - subpd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - movapd xmm2, xmm1 - mulpd xmm2, xmm2 ;# xmm2=eps2 - - pslld mm6, 2 ;# idx *= 4 - - movd mm0, eax - movd mm1, ebx - - mov esi, [ebp + nb400nf_GBtab] - movd eax, mm6 - psrlq mm6, 32 - movd ebx, mm6 ;# indices in eax/ebx - - movapd xmm4, [esi + eax*8] ;# Y1 F1 - movapd xmm3, [esi + ebx*8] ;# Y2 F2 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 Y2 - unpckhpd xmm5, xmm3 ;# F1 F2 - - movapd xmm6, [esi + eax*8 + 16] ;# G1 H1 - movapd xmm3, [esi + ebx*8 + 16] ;# G2 H2 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 G2 - unpckhpd xmm7, xmm3 ;# H1 H2 - ;# coulomb table ready, in xmm4-xmm7 - mulpd xmm6, xmm1 ;# xmm6=Geps - mulpd xmm7, xmm2 ;# xmm7=Heps2 - addpd xmm5, xmm6 - addpd xmm5, xmm7 ;# xmm5=Fp - movapd xmm3, [esp + nb400nf_qq] - mulpd xmm5, xmm1 ;# xmm5=eps*Fp - addpd xmm5, xmm4 ;# xmm5=VV - mulpd xmm5, xmm3 ;# vcoul=qq*VV - addpd xmm5, [esp + nb400nf_vctot] - movapd [esp + nb400nf_vctot], xmm5 - - ;# should we do one more iteration? - sub dword ptr [esp + nb400nf_innerk], 2 - jl .nb400nf_checksingle - jmp .nb400nf_unroll_loop -.nb400nf_checksingle: - mov edx, [esp + nb400nf_innerk] - and edx, 1 - jnz .nb400nf_dosingle - jmp .nb400nf_updateouterdata -.nb400nf_dosingle: - mov esi, [ebp + nb400nf_charge] - mov edx, [ebp + nb400nf_invsqrta] - mov edi, [ebp + nb400nf_pos] - mov ecx, [esp + nb400nf_innerjjnr] - mov eax, [ecx] - xorpd xmm6, xmm6 - movapd xmm7, xmm6 - movsd xmm7, [edx + eax*8] - movlpd xmm6, [esi + eax*8] ;# xmm6(0) has the charge - mulsd xmm7, [esp + nb400nf_isai] - movapd [esp + nb400nf_isaprod], xmm7 - movapd xmm1, xmm7 - mulpd xmm1, [esp + nb400nf_gbtsc] - movapd [esp + nb400nf_gbscale], xmm1 - - mulsd xmm7, [esp + nb400nf_iq] - mulsd xmm6, xmm7 - movapd [esp + nb400nf_qq], xmm6 - - lea eax, [eax + eax*2] - - ;# move coordinates to xmm0-xmm2 - movlpd xmm0, [edi + eax*8] - movlpd xmm1, [edi + eax*8 + 8] - movlpd xmm2, [edi + eax*8 + 16] - - ;# move nb400nf_ix-iz to xmm4-xmm6 - movapd xmm4, [esp + nb400nf_ix] - movapd xmm5, [esp + nb400nf_iy] - movapd xmm6, [esp + nb400nf_iz] - - ;# calc dr - subsd xmm4, xmm0 - subsd xmm5, xmm1 - subsd xmm6, xmm2 - - ;# square it - mulsd xmm4,xmm4 - mulsd xmm5,xmm5 - mulsd xmm6,xmm6 - addsd xmm4, xmm5 - addsd xmm4, xmm6 - ;# rsq in xmm4 - - cvtsd2ss xmm5, xmm4 - rsqrtss xmm5, xmm5 - cvtss2sd xmm2, xmm5 ;# lu in low xmm2 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulsd xmm2, xmm2 ;# lu*lu - movapd xmm1, [esp + nb400nf_three] - mulsd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb400nf_half] - subsd xmm1, xmm2 ;# 30-rsq*lu*lu - mulsd xmm1, xmm5 - mulsd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - movapd xmm5, xmm1 ;# copy of lu - mulsd xmm1, xmm1 ;# lu*lu - movapd xmm2, [esp + nb400nf_three] - mulsd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb400nf_half] - subsd xmm2, xmm1 ;# 30-rsq*lu*lu - mulsd xmm2, xmm5 - mulsd xmm0, xmm2 ;# xmm0=iter2 of rinv (new lu) - - mulsd xmm4, xmm0 ;# xmm4=r - mulsd xmm4, [esp + nb400nf_gbscale] - - movd mm0, eax - - cvttsd2si eax, xmm4 ;# mm6 = lu idx - cvtsi2sd xmm5, eax - subsd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - movapd xmm2, xmm1 - mulsd xmm2, xmm2 ;# xmm2=eps2 - - shl eax, 2 ;# idx *= 4 - - mov esi, [ebp + nb400nf_GBtab] - - ;# Coulomb - movapd xmm4, [esi + eax*8] ;# Y1 F1 - xorpd xmm3, xmm3 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 - unpckhpd xmm5, xmm3 ;# F1 - - movapd xmm6, [esi + eax*8 + 16] ;# G1 H1 - xorpd xmm3, xmm3 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 - unpckhpd xmm7, xmm3 ;# H1 - ;# table ready in xmm4-xmm7 - - mulsd xmm6, xmm1 ;# xmm6=Geps - mulsd xmm7, xmm2 ;# xmm7=Heps2 - addsd xmm5, xmm6 - addsd xmm5, xmm7 ;# xmm5=Fp - movapd xmm3, [esp + nb400nf_qq] - mulsd xmm5, xmm1 ;# xmm5=eps*Fp - addsd xmm5, xmm4 ;# xmm5=VV - mulsd xmm5, xmm3 ;# vcoul=qq*VV - addsd xmm5, [esp + nb400nf_vctot] - movsd [esp + nb400nf_vctot], xmm5 - -.nb400nf_updateouterdata: - ;# get n from stack - mov esi, [esp + nb400nf_n] - ;# get group index for i particle - mov edx, [ebp + nb400nf_gid] ;# base of gid[] - mov edx, [edx + esi*4] ;# ggid=gid[n] - - ;# accumulate total potential energy and update it - movapd xmm7, [esp + nb400nf_vctot] - ;# accumulate - movhlps xmm6, xmm7 - addsd xmm7, xmm6 ;# low xmm7 has the sum now - - ;# add earlier value from mem - mov eax, [ebp + nb400nf_Vc] - addsd xmm7, [eax + edx*8] - ;# move back to mem - movsd [eax + edx*8], xmm7 - - ;# finish if last - mov ecx, [esp + nb400nf_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb400nf_outerend - - ;# not last, iterate outer loop once more! - mov [esp + nb400nf_n], esi - jmp .nb400nf_outer -.nb400nf_outerend: - ;# check if more outer neighborlists remain - mov ecx, [esp + nb400nf_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb400nf_end - ;# non-zero, do one more workunit - jmp .nb400nf_threadloop -.nb400nf_end: - emms - - mov eax, [esp + nb400nf_nouter] - mov ebx, [esp + nb400nf_ninner] - mov ecx, [ebp + nb400nf_outeriter] - mov edx, [ebp + nb400nf_inneriter] - mov [ecx], eax - mov [edx], ebx - - mov eax, [esp + nb400nf_salign] - add esp, eax - add esp, 244 - pop edi - pop esi - pop edx - pop ecx - pop ebx - pop eax - leave - ret - - - diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel400_ia32_sse2.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel400_ia32_sse2.s deleted file mode 100644 index 81ad136251..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel400_ia32_sse2.s +++ /dev/null @@ -1,1261 +0,0 @@ -## -## -## Gromacs 4.0 Copyright (c) 1991-2003 -## David van der Spoel, Erik Lindahl -## -## This program is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License -## as published by the Free Software Foundation; either version 2 -## of the License, or (at your option) any later version. -## -## To help us fund GROMACS development, we humbly ask that you cite -## the research papers on the package. Check out http://www.gromacs.org -## -## And Hey: -## Gnomes, ROck Monsters And Chili Sauce -## - - - -.globl nb_kernel400_ia32_sse2 -.globl _nb_kernel400_ia32_sse2 -nb_kernel400_ia32_sse2: -_nb_kernel400_ia32_sse2: -.set nb400_p_nri, 8 -.set nb400_iinr, 12 -.set nb400_jindex, 16 -.set nb400_jjnr, 20 -.set nb400_shift, 24 -.set nb400_shiftvec, 28 -.set nb400_fshift, 32 -.set nb400_gid, 36 -.set nb400_pos, 40 -.set nb400_faction, 44 -.set nb400_charge, 48 -.set nb400_p_facel, 52 -.set nb400_argkrf, 56 -.set nb400_argcrf, 60 -.set nb400_Vc, 64 -.set nb400_type, 68 -.set nb400_p_ntype, 72 -.set nb400_vdwparam, 76 -.set nb400_Vvdw, 80 -.set nb400_p_tabscale, 84 -.set nb400_VFtab, 88 -.set nb400_invsqrta, 92 -.set nb400_dvda, 96 -.set nb400_p_gbtabscale, 100 -.set nb400_GBtab, 104 -.set nb400_p_nthreads, 108 -.set nb400_count, 112 -.set nb400_mtx, 116 -.set nb400_outeriter, 120 -.set nb400_inneriter, 124 -.set nb400_work, 128 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse2 use -.set nb400_ix, 0 -.set nb400_iy, 16 -.set nb400_iz, 32 -.set nb400_iq, 48 -.set nb400_dx, 64 -.set nb400_dy, 80 -.set nb400_dz, 96 -.set nb400_two, 112 -.set nb400_gbtsc, 128 -.set nb400_qq, 144 -.set nb400_r, 160 -.set nb400_vctot, 176 -.set nb400_fix, 192 -.set nb400_fiy, 208 -.set nb400_fiz, 224 -.set nb400_half, 240 -.set nb400_three, 256 -.set nb400_isai, 272 -.set nb400_isaprod, 288 -.set nb400_dvdasum, 304 -.set nb400_gbscale, 320 -.set nb400_is3, 336 -.set nb400_ii3, 340 -.set nb400_ii, 344 -.set nb400_innerjjnr, 348 -.set nb400_innerk, 352 -.set nb400_n, 356 -.set nb400_nn1, 360 -.set nb400_nri, 364 -.set nb400_facel, 368 ## uses 8 bytes -.set nb400_nouter, 376 -.set nb400_ninner, 380 -.set nb400_salign, 384 - pushl %ebp - movl %esp,%ebp - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - pushl %esi - pushl %edi - subl $388,%esp ## local stack space - movl %esp,%eax - andl $0xf,%eax - subl %eax,%esp - movl %eax,nb400_salign(%esp) - - emms - - ## Move args passed by reference to stack - movl nb400_p_nri(%ebp),%ecx - movl nb400_p_facel(%ebp),%esi - movl (%ecx),%ecx - movsd (%esi),%xmm7 - movl %ecx,nb400_nri(%esp) - movsd %xmm7,nb400_facel(%esp) - - ## zero iteration counters - movl $0,%eax - movl %eax,nb400_nouter(%esp) - movl %eax,nb400_ninner(%esp) - - - movl nb400_p_gbtabscale(%ebp),%eax - movsd (%eax),%xmm3 - shufpd $0,%xmm3,%xmm3 - movapd %xmm3,nb400_gbtsc(%esp) - - ## create constant floating-point factors on stack - movl $0x00000000,%eax ## lower half of double 0.5 IEEE (hex) - movl $0x3fe00000,%ebx - movl %eax,nb400_half(%esp) - movl %ebx,nb400_half+4(%esp) - movsd nb400_half(%esp),%xmm1 - shufpd $0,%xmm1,%xmm1 ## splat to all elements - movapd %xmm1,%xmm3 - addpd %xmm3,%xmm3 ## 1.0 - movapd %xmm3,%xmm2 - addpd %xmm2,%xmm2 ## 2.0 - addpd %xmm2,%xmm3 ## 3.0 - movapd %xmm1,nb400_half(%esp) - movapd %xmm2,nb400_two(%esp) - movapd %xmm3,nb400_three(%esp) - -_nb_kernel400_ia32_sse2.nb400_threadloop: - movl nb400_count(%ebp),%esi ## pointer to sync counter - movl (%esi),%eax -_nb_kernel400_ia32_sse2.nb400_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%esi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel400_ia32_sse2.nb400_spinlock - - ## if(nn1>nri) nn1=nri - movl nb400_nri(%esp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb400_n(%esp) - movl %ebx,nb400_nn1(%esp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel400_ia32_sse2.nb400_outerstart - jmp _nb_kernel400_ia32_sse2.nb400_end - -_nb_kernel400_ia32_sse2.nb400_outerstart: - ## ebx contains number of outer iterations - addl nb400_nouter(%esp),%ebx - movl %ebx,nb400_nouter(%esp) - -_nb_kernel400_ia32_sse2.nb400_outer: - movl nb400_shift(%ebp),%eax ## eax = pointer into shift[] - movl (%eax,%esi,4),%ebx ## ebx=shift[n] - - leal (%ebx,%ebx,2),%ebx ## ebx=3*is - movl %ebx,nb400_is3(%esp) ## store is3 - - movl nb400_shiftvec(%ebp),%eax ## eax = base of shiftvec[] - - movsd (%eax,%ebx,8),%xmm0 - movsd 8(%eax,%ebx,8),%xmm1 - movsd 16(%eax,%ebx,8),%xmm2 - - movl nb400_iinr(%ebp),%ecx ## ecx = pointer into iinr[] - movl (%ecx,%esi,4),%ebx ## ebx =ii - movl %ebx,nb400_ii(%esp) - - movl nb400_charge(%ebp),%edx - movsd (%edx,%ebx,8),%xmm3 - mulsd nb400_facel(%esp),%xmm3 - shufpd $0,%xmm3,%xmm3 - - movl nb400_invsqrta(%ebp),%edx ## load invsqrta[ii] - movsd (%edx,%ebx,8),%xmm4 - shufpd $0,%xmm4,%xmm4 - - leal (%ebx,%ebx,2),%ebx ## ebx = 3*ii=ii3 - movl nb400_pos(%ebp),%eax ## eax = base of pos[] - - addsd (%eax,%ebx,8),%xmm0 - addsd 8(%eax,%ebx,8),%xmm1 - addsd 16(%eax,%ebx,8),%xmm2 - - movapd %xmm3,nb400_iq(%esp) - movapd %xmm4,nb400_isai(%esp) - - shufpd $0,%xmm0,%xmm0 - shufpd $0,%xmm1,%xmm1 - shufpd $0,%xmm2,%xmm2 - - movapd %xmm0,nb400_ix(%esp) - movapd %xmm1,nb400_iy(%esp) - movapd %xmm2,nb400_iz(%esp) - - movl %ebx,nb400_ii3(%esp) - - ## clear vctot and i forces - xorpd %xmm4,%xmm4 - movapd %xmm4,nb400_vctot(%esp) - movapd %xmm4,nb400_dvdasum(%esp) - movapd %xmm4,nb400_fix(%esp) - movapd %xmm4,nb400_fiy(%esp) - movapd %xmm4,nb400_fiz(%esp) - - movl nb400_jindex(%ebp),%eax - movl (%eax,%esi,4),%ecx ## jindex[n] - movl 4(%eax,%esi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movl nb400_pos(%ebp),%esi - movl nb400_faction(%ebp),%edi - movl nb400_jjnr(%ebp),%eax - shll $2,%ecx - addl %ecx,%eax - movl %eax,nb400_innerjjnr(%esp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $2,%edx - addl nb400_ninner(%esp),%ecx - movl %ecx,nb400_ninner(%esp) - addl $0,%edx - movl %edx,nb400_innerk(%esp) ## number of innerloop atoms - jge _nb_kernel400_ia32_sse2.nb400_unroll_loop - jmp _nb_kernel400_ia32_sse2.nb400_checksingle -_nb_kernel400_ia32_sse2.nb400_unroll_loop: - ## twice unrolled innerloop here - movl nb400_innerjjnr(%esp),%edx ## pointer to jjnr[k] - movl (%edx),%eax - movl 4(%edx),%ebx - addl $8,nb400_innerjjnr(%esp) ## advance pointer (unrolled 2) - - ## load isaj - movl nb400_invsqrta(%ebp),%esi - movlpd (%esi,%eax,8),%xmm2 - movhpd (%esi,%ebx,8),%xmm2 - mulpd nb400_isai(%esp),%xmm2 - movapd %xmm2,nb400_isaprod(%esp) - movapd %xmm2,%xmm1 - mulpd nb400_gbtsc(%esp),%xmm1 - movapd %xmm1,nb400_gbscale(%esp) - - movl nb400_charge(%ebp),%esi ## base of charge[] - movlpd (%esi,%eax,8),%xmm3 - movhpd (%esi,%ebx,8),%xmm3 - - mulpd nb400_iq(%esp),%xmm2 - mulpd %xmm2,%xmm3 - movapd %xmm3,nb400_qq(%esp) - - movl nb400_pos(%ebp),%esi ## base of pos[] - - movd %eax,%mm2 - movd %ebx,%mm3 - leal (%eax,%eax,2),%eax ## replace jnr with j3 - leal (%ebx,%ebx,2),%ebx - - ## move two coordinates to xmm0-xmm2 - movlpd (%esi,%eax,8),%xmm0 - movlpd 8(%esi,%eax,8),%xmm1 - movlpd 16(%esi,%eax,8),%xmm2 - movhpd (%esi,%ebx,8),%xmm0 - movhpd 8(%esi,%ebx,8),%xmm1 - movhpd 16(%esi,%ebx,8),%xmm2 - - movl nb400_faction(%ebp),%edi - - ## move nb400_ix-iz to xmm4-xmm6 - movapd nb400_ix(%esp),%xmm4 - movapd nb400_iy(%esp),%xmm5 - movapd nb400_iz(%esp),%xmm6 - - ## calc dr - subpd %xmm0,%xmm4 - subpd %xmm1,%xmm5 - subpd %xmm2,%xmm6 - - ## store dr - movapd %xmm4,nb400_dx(%esp) - movapd %xmm5,nb400_dy(%esp) - movapd %xmm6,nb400_dz(%esp) - ## square it - mulpd %xmm4,%xmm4 - mulpd %xmm5,%xmm5 - mulpd %xmm6,%xmm6 - addpd %xmm5,%xmm4 - addpd %xmm6,%xmm4 - ## rsq in xmm4 - - cvtpd2ps %xmm4,%xmm5 - rsqrtps %xmm5,%xmm5 - cvtps2pd %xmm5,%xmm2 ## lu in low xmm2 - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulpd %xmm2,%xmm2 ## lu*lu - movapd nb400_three(%esp),%xmm1 - mulpd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb400_half(%esp),%xmm0 - subpd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm1 - mulpd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - movapd %xmm1,%xmm5 ## copy of lu - mulpd %xmm1,%xmm1 ## lu*lu - movapd nb400_three(%esp),%xmm2 - mulpd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb400_half(%esp),%xmm0 - subpd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm2 - mulpd %xmm2,%xmm0 ## xmm0=iter2 of rinv (new lu) - mulpd %xmm0,%xmm4 ## xmm4=r - movapd %xmm4,nb400_r(%esp) - mulpd nb400_gbscale(%esp),%xmm4 - - cvttpd2pi %xmm4,%mm6 ## mm6 = lu idx - cvtpi2pd %mm6,%xmm5 - subpd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - movapd %xmm1,%xmm2 - mulpd %xmm2,%xmm2 ## xmm2=eps2 - - pslld $2,%mm6 ## idx *= 4 - - movd %eax,%mm0 - movd %ebx,%mm1 - - movl nb400_GBtab(%ebp),%esi - movd %mm6,%eax - psrlq $32,%mm6 - movd %mm6,%ebx ## indices in eax/ebx - - movapd (%esi,%eax,8),%xmm4 ## Y1 F1 - movapd (%esi,%ebx,8),%xmm3 ## Y2 F2 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 Y2 - unpckhpd %xmm3,%xmm5 ## F1 F2 - - movapd 16(%esi,%eax,8),%xmm6 ## G1 H1 - movapd 16(%esi,%ebx,8),%xmm3 ## G2 H2 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 G2 - unpckhpd %xmm3,%xmm7 ## H1 H2 - ## coulomb table ready, in xmm4-xmm7 - mulpd %xmm1,%xmm6 ## xmm6=Geps - mulpd %xmm2,%xmm7 ## xmm7=Heps2 - addpd %xmm6,%xmm5 - addpd %xmm7,%xmm5 ## xmm5=Fp - mulpd nb400_two(%esp),%xmm7 ## two*Heps2 - movapd nb400_qq(%esp),%xmm3 - addpd %xmm6,%xmm7 - addpd %xmm5,%xmm7 ## xmm7=FF - mulpd %xmm1,%xmm5 ## xmm5=eps*Fp - addpd %xmm4,%xmm5 ## xmm5=VV - mulpd %xmm3,%xmm5 ## vcoul=qq*VV - mulpd %xmm7,%xmm3 ## fijC=FF*qq - ## get jnr from regs - movd %mm2,%ecx - movd %mm3,%edx - movl nb400_dvda(%ebp),%esi - - ## Calculate dVda - xorpd %xmm7,%xmm7 - mulpd nb400_gbscale(%esp),%xmm3 - movapd %xmm3,%xmm6 - mulpd nb400_r(%esp),%xmm6 - addpd %xmm5,%xmm6 - addpd nb400_vctot(%esp),%xmm5 - movapd %xmm5,nb400_vctot(%esp) - - ## xmm6=(vcoul+fijC*r) - subpd %xmm6,%xmm7 - movapd %xmm7,%xmm6 - - ## update dvdasum - addpd nb400_dvdasum(%esp),%xmm7 - movapd %xmm7,nb400_dvdasum(%esp) - - ## update j atoms dvdaj - movhlps %xmm6,%xmm7 - addsd (%esi,%ecx,8),%xmm6 - addsd (%esi,%edx,8),%xmm7 - movsd %xmm6,(%esi,%ecx,8) - movsd %xmm7,(%esi,%edx,8) - - xorpd %xmm4,%xmm4 - - mulpd %xmm0,%xmm3 - subpd %xmm3,%xmm4 - - movapd nb400_dx(%esp),%xmm0 - movapd nb400_dy(%esp),%xmm1 - movapd nb400_dz(%esp),%xmm2 - - movd %mm0,%eax - movd %mm1,%ebx - - movl nb400_faction(%ebp),%edi - mulpd %xmm4,%xmm0 - mulpd %xmm4,%xmm1 - mulpd %xmm4,%xmm2 - ## xmm0-xmm2 contains tx-tz (partial force) - ## now update f_i - movapd nb400_fix(%esp),%xmm3 - movapd nb400_fiy(%esp),%xmm4 - movapd nb400_fiz(%esp),%xmm5 - addpd %xmm0,%xmm3 - addpd %xmm1,%xmm4 - addpd %xmm2,%xmm5 - movapd %xmm3,nb400_fix(%esp) - movapd %xmm4,nb400_fiy(%esp) - movapd %xmm5,nb400_fiz(%esp) - ## the fj's - start by accumulating forces from memory - movlpd (%edi,%eax,8),%xmm3 - movlpd 8(%edi,%eax,8),%xmm4 - movlpd 16(%edi,%eax,8),%xmm5 - movhpd (%edi,%ebx,8),%xmm3 - movhpd 8(%edi,%ebx,8),%xmm4 - movhpd 16(%edi,%ebx,8),%xmm5 - subpd %xmm0,%xmm3 - subpd %xmm1,%xmm4 - subpd %xmm2,%xmm5 - movlpd %xmm3,(%edi,%eax,8) - movlpd %xmm4,8(%edi,%eax,8) - movlpd %xmm5,16(%edi,%eax,8) - movhpd %xmm3,(%edi,%ebx,8) - movhpd %xmm4,8(%edi,%ebx,8) - movhpd %xmm5,16(%edi,%ebx,8) - - ## should we do one more iteration? - subl $2,nb400_innerk(%esp) - jl _nb_kernel400_ia32_sse2.nb400_checksingle - jmp _nb_kernel400_ia32_sse2.nb400_unroll_loop -_nb_kernel400_ia32_sse2.nb400_checksingle: - movl nb400_innerk(%esp),%edx - andl $1,%edx - jnz _nb_kernel400_ia32_sse2.nb400_dosingle - jmp _nb_kernel400_ia32_sse2.nb400_updateouterdata -_nb_kernel400_ia32_sse2.nb400_dosingle: - movl nb400_charge(%ebp),%esi - movl nb400_invsqrta(%ebp),%edx - movl nb400_pos(%ebp),%edi - movl nb400_innerjjnr(%esp),%ecx - movl (%ecx),%eax - xorpd %xmm6,%xmm6 - movapd %xmm6,%xmm7 - movsd (%edx,%eax,8),%xmm7 - movlpd (%esi,%eax,8),%xmm6 ## xmm6(0) has the charge - mulsd nb400_isai(%esp),%xmm7 - movapd %xmm7,nb400_isaprod(%esp) - movapd %xmm7,%xmm1 - mulpd nb400_gbtsc(%esp),%xmm1 - movapd %xmm1,nb400_gbscale(%esp) - - mulsd nb400_iq(%esp),%xmm7 - mulsd %xmm7,%xmm6 - movapd %xmm6,nb400_qq(%esp) - - movd %eax,%mm2 - leal (%eax,%eax,2),%eax - - ## move coordinates to xmm0-xmm2 - movlpd (%edi,%eax,8),%xmm0 - movlpd 8(%edi,%eax,8),%xmm1 - movlpd 16(%edi,%eax,8),%xmm2 - - ## move nb400_ix-iz to xmm4-xmm6 - movapd nb400_ix(%esp),%xmm4 - movapd nb400_iy(%esp),%xmm5 - movapd nb400_iz(%esp),%xmm6 - - ## calc dr - subsd %xmm0,%xmm4 - subsd %xmm1,%xmm5 - subsd %xmm2,%xmm6 - - ## store dr - movapd %xmm4,nb400_dx(%esp) - movapd %xmm5,nb400_dy(%esp) - movapd %xmm6,nb400_dz(%esp) - ## square it - mulsd %xmm4,%xmm4 - mulsd %xmm5,%xmm5 - mulsd %xmm6,%xmm6 - addsd %xmm5,%xmm4 - addsd %xmm6,%xmm4 - ## rsq in xmm4 - - cvtsd2ss %xmm4,%xmm5 - rsqrtss %xmm5,%xmm5 - cvtss2sd %xmm5,%xmm2 ## lu in low xmm2 - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulsd %xmm2,%xmm2 ## lu*lu - movapd nb400_three(%esp),%xmm1 - mulsd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb400_half(%esp),%xmm0 - subsd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm1 - mulsd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - movapd %xmm1,%xmm5 ## copy of lu - mulsd %xmm1,%xmm1 ## lu*lu - movapd nb400_three(%esp),%xmm2 - mulsd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb400_half(%esp),%xmm0 - subsd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm2 - mulsd %xmm2,%xmm0 ## xmm0=iter2 of rinv (new lu) - - mulsd %xmm0,%xmm4 ## xmm4=r - movapd %xmm4,nb400_r(%esp) - mulsd nb400_gbscale(%esp),%xmm4 - - movd %eax,%mm0 - - cvttsd2si %xmm4,%eax ## mm6 = lu idx - cvtsi2sd %eax,%xmm5 - subsd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - movapd %xmm1,%xmm2 - mulsd %xmm2,%xmm2 ## xmm2=eps2 - - shll $2,%eax ## idx *= 4 - - movl nb400_GBtab(%ebp),%esi - - ## Coulomb - movapd (%esi,%eax,8),%xmm4 ## Y1 F1 - xorpd %xmm3,%xmm3 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 - unpckhpd %xmm3,%xmm5 ## F1 - - movapd 16(%esi,%eax,8),%xmm6 ## G1 H1 - xorpd %xmm3,%xmm3 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 - unpckhpd %xmm3,%xmm7 ## H1 - ## table ready in xmm4-xmm7 - - mulsd %xmm1,%xmm6 ## xmm6=Geps - mulsd %xmm2,%xmm7 ## xmm7=Heps2 - addsd %xmm6,%xmm5 - addsd %xmm7,%xmm5 ## xmm5=Fp - mulsd nb400_two(%esp),%xmm7 ## two*Heps2 - movapd nb400_qq(%esp),%xmm3 - addsd %xmm6,%xmm7 - addsd %xmm5,%xmm7 ## xmm7=FF - mulsd %xmm1,%xmm5 ## xmm5=eps*Fp - addsd %xmm4,%xmm5 ## xmm5=VV - mulsd %xmm3,%xmm5 ## vcoul=qq*VV - mulsd %xmm7,%xmm3 ## fijC=FF*qq - ## get jnr from regs - movd %mm2,%ebx - movl nb400_dvda(%ebp),%esi - - ## Calculate dVda - mulsd nb400_gbscale(%esp),%xmm3 - movsd %xmm3,%xmm6 - mulsd nb400_r(%esp),%xmm6 - addsd %xmm5,%xmm6 - addsd nb400_vctot(%esp),%xmm5 - movsd %xmm5,nb400_vctot(%esp) - - ## xmm6=(vcoul+fijC*r) - subpd %xmm6,%xmm7 - movsd %xmm7,%xmm6 - - ## update dvdasum - addsd nb400_dvdasum(%esp),%xmm7 - movsd %xmm7,nb400_dvdasum(%esp) - - ## update j atoms dvdaj - addsd (%esi,%ebx,8),%xmm6 - movsd %xmm6,(%esi,%ebx,8) - - xorpd %xmm4,%xmm4 - movd %mm0,%eax - - mulsd %xmm0,%xmm3 - subsd %xmm3,%xmm4 - movl nb400_faction(%ebp),%edi - - movsd nb400_dx(%esp),%xmm0 - movsd nb400_dy(%esp),%xmm1 - movsd nb400_dz(%esp),%xmm2 - - mulsd %xmm4,%xmm0 - mulsd %xmm4,%xmm1 - mulsd %xmm4,%xmm2 - ## xmm0-xmm2 contains tx-tz (partial force) - ## now update f_i - movsd nb400_fix(%esp),%xmm3 - movsd nb400_fiy(%esp),%xmm4 - movsd nb400_fiz(%esp),%xmm5 - addsd %xmm0,%xmm3 - addsd %xmm1,%xmm4 - addsd %xmm2,%xmm5 - movlpd %xmm3,nb400_fix(%esp) - movlpd %xmm4,nb400_fiy(%esp) - movlpd %xmm5,nb400_fiz(%esp) - ## update fj - movlpd (%edi,%eax,8),%xmm3 - movlpd 8(%edi,%eax,8),%xmm4 - movlpd 16(%edi,%eax,8),%xmm5 - subsd %xmm0,%xmm3 - subsd %xmm1,%xmm4 - subsd %xmm2,%xmm5 - movlpd %xmm3,(%edi,%eax,8) - movlpd %xmm4,8(%edi,%eax,8) - movlpd %xmm5,16(%edi,%eax,8) - -_nb_kernel400_ia32_sse2.nb400_updateouterdata: - movl nb400_ii3(%esp),%ecx - movl nb400_faction(%ebp),%edi - movl nb400_fshift(%ebp),%esi - movl nb400_is3(%esp),%edx - - ## accumulate i forces in xmm0, xmm1, xmm2 - movapd nb400_fix(%esp),%xmm0 - movapd nb400_fiy(%esp),%xmm1 - movapd nb400_fiz(%esp),%xmm2 - - movhlps %xmm0,%xmm3 - movhlps %xmm1,%xmm4 - movhlps %xmm2,%xmm5 - addsd %xmm3,%xmm0 - addsd %xmm4,%xmm1 - addsd %xmm5,%xmm2 ## sum is in low xmm0-xmm2 - - ## increment i force - movsd (%edi,%ecx,8),%xmm3 - movsd 8(%edi,%ecx,8),%xmm4 - movsd 16(%edi,%ecx,8),%xmm5 - addsd %xmm0,%xmm3 - addsd %xmm1,%xmm4 - addsd %xmm2,%xmm5 - movsd %xmm3,(%edi,%ecx,8) - movsd %xmm4,8(%edi,%ecx,8) - movsd %xmm5,16(%edi,%ecx,8) - - ## increment fshift force - movsd (%esi,%edx,8),%xmm3 - movsd 8(%esi,%edx,8),%xmm4 - movsd 16(%esi,%edx,8),%xmm5 - addsd %xmm0,%xmm3 - addsd %xmm1,%xmm4 - addsd %xmm2,%xmm5 - movsd %xmm3,(%esi,%edx,8) - movsd %xmm4,8(%esi,%edx,8) - movsd %xmm5,16(%esi,%edx,8) - - ## get n from stack - movl nb400_n(%esp),%esi - ## get group index for i particle - movl nb400_gid(%ebp),%edx ## base of gid[] - movl (%edx,%esi,4),%edx ## ggid=gid[n] - - ## accumulate total potential energy and update it - movapd nb400_vctot(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addsd %xmm6,%xmm7 ## low xmm7 has the sum now - - ## add earlier value from mem - movl nb400_Vc(%ebp),%eax - addsd (%eax,%edx,8),%xmm7 - ## move back to mem - movsd %xmm7,(%eax,%edx,8) - - ## accumulate dVda and update it - movapd nb400_dvdasum(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addsd %xmm6,%xmm7 ## low xmm7 has the sum now - - movl nb400_ii(%esp),%edx - movl nb400_dvda(%ebp),%eax - addsd (%eax,%edx,8),%xmm7 - movsd %xmm7,(%eax,%edx,8) - - ## finish if last - movl nb400_nn1(%esp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel400_ia32_sse2.nb400_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb400_n(%esp) - jmp _nb_kernel400_ia32_sse2.nb400_outer -_nb_kernel400_ia32_sse2.nb400_outerend: - ## check if more outer neighborlists remain - movl nb400_nri(%esp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel400_ia32_sse2.nb400_end - ## non-zero, do one more workunit - jmp _nb_kernel400_ia32_sse2.nb400_threadloop -_nb_kernel400_ia32_sse2.nb400_end: - emms - - movl nb400_nouter(%esp),%eax - movl nb400_ninner(%esp),%ebx - movl nb400_outeriter(%ebp),%ecx - movl nb400_inneriter(%ebp),%edx - movl %eax,(%ecx) - movl %ebx,(%edx) - - movl nb400_salign(%esp),%eax - addl %eax,%esp - addl $388,%esp - popl %edi - popl %esi - popl %edx - popl %ecx - popl %ebx - popl %eax - leave - ret - - - - - - -.globl nb_kernel400nf_ia32_sse2 -.globl _nb_kernel400nf_ia32_sse2 -nb_kernel400nf_ia32_sse2: -_nb_kernel400nf_ia32_sse2: -.set nb400nf_p_nri, 8 -.set nb400nf_iinr, 12 -.set nb400nf_jindex, 16 -.set nb400nf_jjnr, 20 -.set nb400nf_shift, 24 -.set nb400nf_shiftvec, 28 -.set nb400nf_fshift, 32 -.set nb400nf_gid, 36 -.set nb400nf_pos, 40 -.set nb400nf_faction, 44 -.set nb400nf_charge, 48 -.set nb400nf_p_facel, 52 -.set nb400nf_argkrf, 56 -.set nb400nf_argcrf, 60 -.set nb400nf_Vc, 64 -.set nb400nf_type, 68 -.set nb400nf_p_ntype, 72 -.set nb400nf_vdwparam, 76 -.set nb400nf_Vvdw, 80 -.set nb400nf_p_tabscale, 84 -.set nb400nf_VFtab, 88 -.set nb400nf_invsqrta, 92 -.set nb400nf_dvda, 96 -.set nb400nf_p_gbtabscale, 100 -.set nb400nf_GBtab, 104 -.set nb400nf_p_nthreads, 108 -.set nb400nf_count, 112 -.set nb400nf_mtx, 116 -.set nb400nf_outeriter, 120 -.set nb400nf_inneriter, 124 -.set nb400nf_work, 128 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse2 use -.set nb400nf_ix, 0 -.set nb400nf_iy, 16 -.set nb400nf_iz, 32 -.set nb400nf_iq, 48 -.set nb400nf_gbtsc, 64 -.set nb400nf_qq, 80 -.set nb400nf_vctot, 96 -.set nb400nf_half, 112 -.set nb400nf_three, 128 -.set nb400nf_isai, 144 -.set nb400nf_isaprod, 160 -.set nb400nf_gbscale, 176 -.set nb400nf_is3, 192 -.set nb400nf_ii3, 196 -.set nb400nf_innerjjnr, 200 -.set nb400nf_innerk, 204 -.set nb400nf_n, 208 -.set nb400nf_nn1, 212 -.set nb400nf_nri, 216 -.set nb400nf_facel, 224 ## uses 8 bytes -.set nb400nf_nouter, 232 -.set nb400nf_ninner, 236 -.set nb400nf_salign, 240 - pushl %ebp - movl %esp,%ebp - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - pushl %esi - pushl %edi - subl $244,%esp ## local stack space - movl %esp,%eax - andl $0xf,%eax - subl %eax,%esp - movl %eax,nb400nf_salign(%esp) - - emms - - ## Move args passed by reference to stack - movl nb400nf_p_nri(%ebp),%ecx - movl nb400nf_p_facel(%ebp),%esi - movl (%ecx),%ecx - movsd (%esi),%xmm7 - movl %ecx,nb400nf_nri(%esp) - movsd %xmm7,nb400nf_facel(%esp) - - ## zero iteration counters - movl $0,%eax - movl %eax,nb400nf_nouter(%esp) - movl %eax,nb400nf_ninner(%esp) - - - movl nb400nf_p_gbtabscale(%ebp),%eax - movsd (%eax),%xmm3 - shufpd $0,%xmm3,%xmm3 - movapd %xmm3,nb400nf_gbtsc(%esp) - - ## create constant floating-point factors on stack - movl $0x00000000,%eax ## lower half of double 0.5 IEEE (hex) - movl $0x3fe00000,%ebx - movl %eax,nb400nf_half(%esp) - movl %ebx,nb400nf_half+4(%esp) - movsd nb400nf_half(%esp),%xmm1 - shufpd $0,%xmm1,%xmm1 ## splat to all elements - movapd %xmm1,%xmm3 - addpd %xmm3,%xmm3 ## 1.0 - movapd %xmm3,%xmm2 - addpd %xmm2,%xmm2 ## 2.0 - addpd %xmm2,%xmm3 ## 3.0 - movapd %xmm1,nb400nf_half(%esp) - movapd %xmm3,nb400nf_three(%esp) - -_nb_kernel400nf_ia32_sse2.nb400nf_threadloop: - movl nb400nf_count(%ebp),%esi ## pointer to sync counter - movl (%esi),%eax -_nb_kernel400nf_ia32_sse2.nb400nf_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%esi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel400nf_ia32_sse2.nb400nf_spinlock - - ## if(nn1>nri) nn1=nri - movl nb400nf_nri(%esp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb400nf_n(%esp) - movl %ebx,nb400nf_nn1(%esp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel400nf_ia32_sse2.nb400nf_outerstart - jmp _nb_kernel400nf_ia32_sse2.nb400nf_end - -_nb_kernel400nf_ia32_sse2.nb400nf_outerstart: - ## ebx contains number of outer iterations - addl nb400nf_nouter(%esp),%ebx - movl %ebx,nb400nf_nouter(%esp) - -_nb_kernel400nf_ia32_sse2.nb400nf_outer: - movl nb400nf_shift(%ebp),%eax ## eax = pointer into shift[] - movl (%eax,%esi,4),%ebx ## ebx=shift[n] - - leal (%ebx,%ebx,2),%ebx ## ebx=3*is - movl %ebx,nb400nf_is3(%esp) ## store is3 - - movl nb400nf_shiftvec(%ebp),%eax ## eax = base of shiftvec[] - - movsd (%eax,%ebx,8),%xmm0 - movsd 8(%eax,%ebx,8),%xmm1 - movsd 16(%eax,%ebx,8),%xmm2 - - movl nb400nf_iinr(%ebp),%ecx ## ecx = pointer into iinr[] - movl (%ecx,%esi,4),%ebx ## ebx =ii - - movl nb400nf_charge(%ebp),%edx - movsd (%edx,%ebx,8),%xmm3 - mulsd nb400nf_facel(%esp),%xmm3 - shufpd $0,%xmm3,%xmm3 - - movl nb400nf_invsqrta(%ebp),%edx ## load invsqrta[ii] - movsd (%edx,%ebx,8),%xmm4 - shufpd $0,%xmm4,%xmm4 - - leal (%ebx,%ebx,2),%ebx ## ebx = 3*ii=ii3 - movl nb400nf_pos(%ebp),%eax ## eax = base of pos[] - - addsd (%eax,%ebx,8),%xmm0 - addsd 8(%eax,%ebx,8),%xmm1 - addsd 16(%eax,%ebx,8),%xmm2 - - movapd %xmm3,nb400nf_iq(%esp) - movapd %xmm4,nb400nf_isai(%esp) - - shufpd $0,%xmm0,%xmm0 - shufpd $0,%xmm1,%xmm1 - shufpd $0,%xmm2,%xmm2 - - movapd %xmm0,nb400nf_ix(%esp) - movapd %xmm1,nb400nf_iy(%esp) - movapd %xmm2,nb400nf_iz(%esp) - - movl %ebx,nb400nf_ii3(%esp) - - ## clear vctot - xorpd %xmm4,%xmm4 - movapd %xmm4,nb400nf_vctot(%esp) - - movl nb400nf_jindex(%ebp),%eax - movl (%eax,%esi,4),%ecx ## jindex[n] - movl 4(%eax,%esi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movl nb400nf_pos(%ebp),%esi - movl nb400nf_faction(%ebp),%edi - movl nb400nf_jjnr(%ebp),%eax - shll $2,%ecx - addl %ecx,%eax - movl %eax,nb400nf_innerjjnr(%esp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $2,%edx - addl nb400nf_ninner(%esp),%ecx - movl %ecx,nb400nf_ninner(%esp) - addl $0,%edx - movl %edx,nb400nf_innerk(%esp) ## number of innerloop atoms - jge _nb_kernel400nf_ia32_sse2.nb400nf_unroll_loop - jmp _nb_kernel400nf_ia32_sse2.nb400nf_checksingle -_nb_kernel400nf_ia32_sse2.nb400nf_unroll_loop: - ## twice unrolled innerloop here - movl nb400nf_innerjjnr(%esp),%edx ## pointer to jjnr[k] - movl (%edx),%eax - movl 4(%edx),%ebx - addl $8,nb400nf_innerjjnr(%esp) ## advance pointer (unrolled 2) - - ## load isa2 - movl nb400nf_invsqrta(%ebp),%esi - movlpd (%esi,%eax,8),%xmm2 - movhpd (%esi,%ebx,8),%xmm2 - mulpd nb400nf_isai(%esp),%xmm2 - movapd %xmm2,nb400nf_isaprod(%esp) - movapd %xmm2,%xmm1 - mulpd nb400nf_gbtsc(%esp),%xmm1 - movapd %xmm1,nb400nf_gbscale(%esp) - - movl nb400nf_charge(%ebp),%esi ## base of charge[] - movlpd (%esi,%eax,8),%xmm3 - movhpd (%esi,%ebx,8),%xmm3 - - mulpd nb400nf_iq(%esp),%xmm2 - mulpd %xmm2,%xmm3 - movapd %xmm3,nb400nf_qq(%esp) - - movl nb400nf_pos(%ebp),%esi ## base of pos[] - - leal (%eax,%eax,2),%eax ## replace jnr with j3 - leal (%ebx,%ebx,2),%ebx - - ## move two coordinates to xmm0-xmm2 - movlpd (%esi,%eax,8),%xmm0 - movlpd 8(%esi,%eax,8),%xmm1 - movlpd 16(%esi,%eax,8),%xmm2 - movhpd (%esi,%ebx,8),%xmm0 - movhpd 8(%esi,%ebx,8),%xmm1 - movhpd 16(%esi,%ebx,8),%xmm2 - - movl nb400nf_faction(%ebp),%edi - - ## move nb400nf_ix-iz to xmm4-xmm6 - movapd nb400nf_ix(%esp),%xmm4 - movapd nb400nf_iy(%esp),%xmm5 - movapd nb400nf_iz(%esp),%xmm6 - - ## calc dr - subpd %xmm0,%xmm4 - subpd %xmm1,%xmm5 - subpd %xmm2,%xmm6 - - ## square it - mulpd %xmm4,%xmm4 - mulpd %xmm5,%xmm5 - mulpd %xmm6,%xmm6 - addpd %xmm5,%xmm4 - addpd %xmm6,%xmm4 - ## rsq in xmm4 - - cvtpd2ps %xmm4,%xmm5 - rsqrtps %xmm5,%xmm5 - cvtps2pd %xmm5,%xmm2 ## lu in low xmm2 - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulpd %xmm2,%xmm2 ## lu*lu - movapd nb400nf_three(%esp),%xmm1 - mulpd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb400nf_half(%esp),%xmm0 - subpd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm1 - mulpd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - movapd %xmm1,%xmm5 ## copy of lu - mulpd %xmm1,%xmm1 ## lu*lu - movapd nb400nf_three(%esp),%xmm2 - mulpd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb400nf_half(%esp),%xmm0 - subpd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm2 - mulpd %xmm2,%xmm0 ## xmm0=iter2 of rinv (new lu) - mulpd %xmm0,%xmm4 ## xmm4=r - mulpd nb400nf_gbscale(%esp),%xmm4 - - cvttpd2pi %xmm4,%mm6 ## mm6 = lu idx - cvtpi2pd %mm6,%xmm5 - subpd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - movapd %xmm1,%xmm2 - mulpd %xmm2,%xmm2 ## xmm2=eps2 - - pslld $2,%mm6 ## idx *= 4 - - movd %eax,%mm0 - movd %ebx,%mm1 - - movl nb400nf_GBtab(%ebp),%esi - movd %mm6,%eax - psrlq $32,%mm6 - movd %mm6,%ebx ## indices in eax/ebx - - movapd (%esi,%eax,8),%xmm4 ## Y1 F1 - movapd (%esi,%ebx,8),%xmm3 ## Y2 F2 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 Y2 - unpckhpd %xmm3,%xmm5 ## F1 F2 - - movapd 16(%esi,%eax,8),%xmm6 ## G1 H1 - movapd 16(%esi,%ebx,8),%xmm3 ## G2 H2 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 G2 - unpckhpd %xmm3,%xmm7 ## H1 H2 - ## coulomb table ready, in xmm4-xmm7 - mulpd %xmm1,%xmm6 ## xmm6=Geps - mulpd %xmm2,%xmm7 ## xmm7=Heps2 - addpd %xmm6,%xmm5 - addpd %xmm7,%xmm5 ## xmm5=Fp - movapd nb400nf_qq(%esp),%xmm3 - mulpd %xmm1,%xmm5 ## xmm5=eps*Fp - addpd %xmm4,%xmm5 ## xmm5=VV - mulpd %xmm3,%xmm5 ## vcoul=qq*VV - addpd nb400nf_vctot(%esp),%xmm5 - movapd %xmm5,nb400nf_vctot(%esp) - - ## should we do one more iteration? - subl $2,nb400nf_innerk(%esp) - jl _nb_kernel400nf_ia32_sse2.nb400nf_checksingle - jmp _nb_kernel400nf_ia32_sse2.nb400nf_unroll_loop -_nb_kernel400nf_ia32_sse2.nb400nf_checksingle: - movl nb400nf_innerk(%esp),%edx - andl $1,%edx - jnz _nb_kernel400nf_ia32_sse2.nb400nf_dosingle - jmp _nb_kernel400nf_ia32_sse2.nb400nf_updateouterdata -_nb_kernel400nf_ia32_sse2.nb400nf_dosingle: - movl nb400nf_charge(%ebp),%esi - movl nb400nf_invsqrta(%ebp),%edx - movl nb400nf_pos(%ebp),%edi - movl nb400nf_innerjjnr(%esp),%ecx - movl (%ecx),%eax - xorpd %xmm6,%xmm6 - movapd %xmm6,%xmm7 - movsd (%edx,%eax,8),%xmm7 - movlpd (%esi,%eax,8),%xmm6 ## xmm6(0) has the charge - mulsd nb400nf_isai(%esp),%xmm7 - movapd %xmm7,nb400nf_isaprod(%esp) - movapd %xmm7,%xmm1 - mulpd nb400nf_gbtsc(%esp),%xmm1 - movapd %xmm1,nb400nf_gbscale(%esp) - - mulsd nb400nf_iq(%esp),%xmm7 - mulsd %xmm7,%xmm6 - movapd %xmm6,nb400nf_qq(%esp) - - leal (%eax,%eax,2),%eax - - ## move coordinates to xmm0-xmm2 - movlpd (%edi,%eax,8),%xmm0 - movlpd 8(%edi,%eax,8),%xmm1 - movlpd 16(%edi,%eax,8),%xmm2 - - ## move nb400nf_ix-iz to xmm4-xmm6 - movapd nb400nf_ix(%esp),%xmm4 - movapd nb400nf_iy(%esp),%xmm5 - movapd nb400nf_iz(%esp),%xmm6 - - ## calc dr - subsd %xmm0,%xmm4 - subsd %xmm1,%xmm5 - subsd %xmm2,%xmm6 - - ## square it - mulsd %xmm4,%xmm4 - mulsd %xmm5,%xmm5 - mulsd %xmm6,%xmm6 - addsd %xmm5,%xmm4 - addsd %xmm6,%xmm4 - ## rsq in xmm4 - - cvtsd2ss %xmm4,%xmm5 - rsqrtss %xmm5,%xmm5 - cvtss2sd %xmm5,%xmm2 ## lu in low xmm2 - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulsd %xmm2,%xmm2 ## lu*lu - movapd nb400nf_three(%esp),%xmm1 - mulsd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb400nf_half(%esp),%xmm0 - subsd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm1 - mulsd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - movapd %xmm1,%xmm5 ## copy of lu - mulsd %xmm1,%xmm1 ## lu*lu - movapd nb400nf_three(%esp),%xmm2 - mulsd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb400nf_half(%esp),%xmm0 - subsd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm2 - mulsd %xmm2,%xmm0 ## xmm0=iter2 of rinv (new lu) - - mulsd %xmm0,%xmm4 ## xmm4=r - mulsd nb400nf_gbscale(%esp),%xmm4 - - movd %eax,%mm0 - - cvttsd2si %xmm4,%eax ## mm6 = lu idx - cvtsi2sd %eax,%xmm5 - subsd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - movapd %xmm1,%xmm2 - mulsd %xmm2,%xmm2 ## xmm2=eps2 - - shll $2,%eax ## idx *= 4 - - movl nb400nf_GBtab(%ebp),%esi - - ## Coulomb - movapd (%esi,%eax,8),%xmm4 ## Y1 F1 - xorpd %xmm3,%xmm3 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 - unpckhpd %xmm3,%xmm5 ## F1 - - movapd 16(%esi,%eax,8),%xmm6 ## G1 H1 - xorpd %xmm3,%xmm3 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 - unpckhpd %xmm3,%xmm7 ## H1 - ## table ready in xmm4-xmm7 - - mulsd %xmm1,%xmm6 ## xmm6=Geps - mulsd %xmm2,%xmm7 ## xmm7=Heps2 - addsd %xmm6,%xmm5 - addsd %xmm7,%xmm5 ## xmm5=Fp - movapd nb400nf_qq(%esp),%xmm3 - mulsd %xmm1,%xmm5 ## xmm5=eps*Fp - addsd %xmm4,%xmm5 ## xmm5=VV - mulsd %xmm3,%xmm5 ## vcoul=qq*VV - addsd nb400nf_vctot(%esp),%xmm5 - movsd %xmm5,nb400nf_vctot(%esp) - -_nb_kernel400nf_ia32_sse2.nb400nf_updateouterdata: - ## get n from stack - movl nb400nf_n(%esp),%esi - ## get group index for i particle - movl nb400nf_gid(%ebp),%edx ## base of gid[] - movl (%edx,%esi,4),%edx ## ggid=gid[n] - - ## accumulate total potential energy and update it - movapd nb400nf_vctot(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addsd %xmm6,%xmm7 ## low xmm7 has the sum now - - ## add earlier value from mem - movl nb400nf_Vc(%ebp),%eax - addsd (%eax,%edx,8),%xmm7 - ## move back to mem - movsd %xmm7,(%eax,%edx,8) - - ## finish if last - movl nb400nf_nn1(%esp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel400nf_ia32_sse2.nb400nf_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb400nf_n(%esp) - jmp _nb_kernel400nf_ia32_sse2.nb400nf_outer -_nb_kernel400nf_ia32_sse2.nb400nf_outerend: - ## check if more outer neighborlists remain - movl nb400nf_nri(%esp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel400nf_ia32_sse2.nb400nf_end - ## non-zero, do one more workunit - jmp _nb_kernel400nf_ia32_sse2.nb400nf_threadloop -_nb_kernel400nf_ia32_sse2.nb400nf_end: - emms - - movl nb400nf_nouter(%esp),%eax - movl nb400nf_ninner(%esp),%ebx - movl nb400nf_outeriter(%ebp),%ecx - movl nb400nf_inneriter(%ebp),%edx - movl %eax,(%ecx) - movl %ebx,(%edx) - - movl nb400nf_salign(%esp),%eax - addl %eax,%esp - addl $244,%esp - popl %edi - popl %esi - popl %edx - popl %ecx - popl %ebx - popl %eax - leave - ret - - - - diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel410_ia32_sse2.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel410_ia32_sse2.intel_syntax.s deleted file mode 100644 index c5010b4e62..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel410_ia32_sse2.intel_syntax.s +++ /dev/null @@ -1,1530 +0,0 @@ -;# -;# -;# Gromacs 4.0 Copyright (c) 1991-2003 -;# David van der Spoel, Erik Lindahl -;# -;# This program is free software; you can redistribute it and/or -;# modify it under the terms of the GNU General Public License -;# as published by the Free Software Foundation; either version 2 -;# of the License, or (at your option) any later version. -;# -;# To help us fund GROMACS development, we humbly ask that you cite -;# the research papers on the package. Check out http://www.gromacs.org -;# -;# And Hey: -;# Gnomes, ROck Monsters And Chili Sauce -;# - -;# These files require GNU binutils 2.10 or later, since we -;# use intel syntax for portability, or a recent version -;# of NASM that understands Extended 3DNow and SSE2 instructions. -;# (NASM is normally only used with MS Visual C++). -;# Since NASM and gnu as disagree on some definitions and use -;# completely different preprocessing options I have to introduce a -;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86. -;# Gnu as treats ';' as a line break, i.e. ignores it. This is the -;# reason why all comments need both symbols... -;# The source is written for GNU as, with intel syntax. When you use -;# NASM we redefine a couple of things. The false if-statement around -;# the following code is seen by GNU as, but NASM doesn't see it, so -;# the code inside is read by NASM but not gcc. - -; .if 0 # block below only read by NASM -%define .section section -%define .long dd -%define .align align -%define .globl global -;# NASM only wants 'dword', not 'dword ptr'. -%define ptr -%macro .equiv 2 - %1 equ %2 -%endmacro -; .endif # End of NASM-specific block -; .intel_syntax noprefix # Line only read by gnu as - - - - - -.globl nb_kernel410_ia32_sse2 -.globl _nb_kernel410_ia32_sse2 -nb_kernel410_ia32_sse2: -_nb_kernel410_ia32_sse2: -.equiv nb410_p_nri, 8 -.equiv nb410_iinr, 12 -.equiv nb410_jindex, 16 -.equiv nb410_jjnr, 20 -.equiv nb410_shift, 24 -.equiv nb410_shiftvec, 28 -.equiv nb410_fshift, 32 -.equiv nb410_gid, 36 -.equiv nb410_pos, 40 -.equiv nb410_faction, 44 -.equiv nb410_charge, 48 -.equiv nb410_p_facel, 52 -.equiv nb410_argkrf, 56 -.equiv nb410_argcrf, 60 -.equiv nb410_Vc, 64 -.equiv nb410_type, 68 -.equiv nb410_p_ntype, 72 -.equiv nb410_vdwparam, 76 -.equiv nb410_Vvdw, 80 -.equiv nb410_p_tabscale, 84 -.equiv nb410_VFtab, 88 -.equiv nb410_invsqrta, 92 -.equiv nb410_dvda, 96 -.equiv nb410_p_gbtabscale, 100 -.equiv nb410_GBtab, 104 -.equiv nb410_p_nthreads, 108 -.equiv nb410_count, 112 -.equiv nb410_mtx, 116 -.equiv nb410_outeriter, 120 -.equiv nb410_inneriter, 124 -.equiv nb410_work, 128 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse2 use -.equiv nb410_ix, 0 -.equiv nb410_iy, 16 -.equiv nb410_iz, 32 -.equiv nb410_iq, 48 -.equiv nb410_dx, 64 -.equiv nb410_dy, 80 -.equiv nb410_dz, 96 -.equiv nb410_two, 112 -.equiv nb410_six, 128 -.equiv nb410_twelve, 144 -.equiv nb410_gbtsc, 160 -.equiv nb410_qq, 176 -.equiv nb410_c6, 192 -.equiv nb410_c12, 208 -.equiv nb410_fscal, 224 -.equiv nb410_vctot, 240 -.equiv nb410_Vvdwtot, 256 -.equiv nb410_fix, 272 -.equiv nb410_fiy, 288 -.equiv nb410_fiz, 304 -.equiv nb410_half, 320 -.equiv nb410_three, 336 -.equiv nb410_r, 352 -.equiv nb410_isai, 368 -.equiv nb410_isaprod, 384 -.equiv nb410_dvdasum, 400 -.equiv nb410_gbscale, 416 -.equiv nb410_ii, 432 -.equiv nb410_is3, 436 -.equiv nb410_ii3, 440 -.equiv nb410_ntia, 444 -.equiv nb410_innerjjnr, 448 -.equiv nb410_innerk, 452 -.equiv nb410_n, 456 -.equiv nb410_nn1, 460 -.equiv nb410_nri, 464 -.equiv nb410_facel, 472 ;# uses 8 bytes -.equiv nb410_ntype, 480 -.equiv nb410_nouter, 484 -.equiv nb410_ninner, 488 -.equiv nb410_salign, 492 - push ebp - mov ebp,esp - push eax - push ebx - push ecx - push edx - push esi - push edi - sub esp, 496 ;# local stack space - mov eax, esp - and eax, 0xf - sub esp, eax - mov [esp + nb410_salign], eax - - emms - - ;# Move args passed by reference to stack - mov ecx, [ebp + nb410_p_nri] - mov esi, [ebp + nb410_p_facel] - mov edi, [ebp + nb410_p_ntype] - mov ecx, [ecx] - movsd xmm7, [esi] - mov edi, [edi] - mov [esp + nb410_nri], ecx - movsd [esp + nb410_facel], xmm7 - mov [esp + nb410_ntype], edi - - ;# zero iteration counters - mov eax, 0 - mov [esp + nb410_nouter], eax - mov [esp + nb410_ninner], eax - - - mov eax, [ebp + nb410_p_gbtabscale] - movsd xmm5, [eax] - shufpd xmm5, xmm5, 0 - movapd [esp + nb410_gbtsc], xmm5 - ;# create constant floating-point factors on stack - mov eax, 0x00000000 ;# lower half of double 0.5 IEEE (hex) - mov ebx, 0x3fe00000 - mov [esp + nb410_half], eax - mov [esp + nb410_half+4], ebx - movsd xmm1, [esp + nb410_half] - shufpd xmm1, xmm1, 0 ;# splat to all elements - movapd xmm3, xmm1 - addpd xmm3, xmm3 ;# 1.0 - movapd xmm2, xmm3 - addpd xmm2, xmm2 ;# 2.0 - addpd xmm3, xmm2 ;# 3.0 - movapd xmm4, xmm3 - addpd xmm4, xmm4 ;# 6.0 - movapd xmm5, xmm4 - addpd xmm5, xmm5 ;# 12.0 - movapd [esp + nb410_half], xmm1 - movapd [esp + nb410_two], xmm2 - movapd [esp + nb410_three], xmm3 - movapd [esp + nb410_six], xmm4 - movapd [esp + nb410_twelve], xmm5 - -.nb410_threadloop: - mov esi, [ebp + nb410_count] ;# pointer to sync counter - mov eax, [esi] -.nb410_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb410_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [esp + nb410_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [esp + nb410_n], eax - mov [esp + nb410_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb410_outerstart - jmp .nb410_end - -.nb410_outerstart: - ;# ebx contains number of outer iterations - add ebx, [esp + nb410_nouter] - mov [esp + nb410_nouter], ebx - -.nb410_outer: - mov eax, [ebp + nb410_shift] ;# eax = pointer into shift[] - mov ebx, [eax+esi*4] ;# ebx=shift[n] - - lea ebx, [ebx + ebx*2] ;# ebx=3*is - mov [esp + nb410_is3],ebx ;# store is3 - - mov eax, [ebp + nb410_shiftvec] ;# eax = base of shiftvec[] - - movsd xmm0, [eax + ebx*8] - movsd xmm1, [eax + ebx*8 + 8] - movsd xmm2, [eax + ebx*8 + 16] - - mov ecx, [ebp + nb410_iinr] ;# ecx = pointer into iinr[] - mov ebx, [ecx+esi*4] ;# ebx =ii - mov [esp + nb410_ii], ebx - - mov edx, [ebp + nb410_charge] - movsd xmm3, [edx + ebx*8] - mulsd xmm3, [esp + nb410_facel] - shufpd xmm3, xmm3, 0 - - mov edx, [ebp + nb410_invsqrta] ;# load invsqrta[ii] - movsd xmm4, [edx + ebx*8] - shufpd xmm4, xmm4, 0 - - mov edx, [ebp + nb410_type] - mov edx, [edx + ebx*4] - imul edx, [esp + nb410_ntype] - shl edx, 1 - mov [esp + nb410_ntia], edx - - lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3 - mov eax, [ebp + nb410_pos] ;# eax = base of pos[] - - addsd xmm0, [eax + ebx*8] - addsd xmm1, [eax + ebx*8 + 8] - addsd xmm2, [eax + ebx*8 + 16] - - movapd [esp + nb410_iq], xmm3 - movapd [esp + nb410_isai], xmm4 - - shufpd xmm0, xmm0, 0 - shufpd xmm1, xmm1, 0 - shufpd xmm2, xmm2, 0 - - movapd [esp + nb410_ix], xmm0 - movapd [esp + nb410_iy], xmm1 - movapd [esp + nb410_iz], xmm2 - - mov [esp + nb410_ii3], ebx - - ;# clear vctot and i forces - xorpd xmm4, xmm4 - movapd [esp + nb410_vctot], xmm4 - movapd [esp + nb410_Vvdwtot], xmm4 - movapd [esp + nb410_dvdasum], xmm4 - movapd [esp + nb410_fix], xmm4 - movapd [esp + nb410_fiy], xmm4 - movapd [esp + nb410_fiz], xmm4 - - mov eax, [ebp + nb410_jindex] - mov ecx, [eax + esi*4] ;# jindex[n] - mov edx, [eax + esi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov esi, [ebp + nb410_pos] - mov edi, [ebp + nb410_faction] - mov eax, [ebp + nb410_jjnr] - shl ecx, 2 - add eax, ecx - mov [esp + nb410_innerjjnr], eax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 2 - add ecx, [esp + nb410_ninner] - mov [esp + nb410_ninner], ecx - add edx, 0 - mov [esp + nb410_innerk], edx ;# number of innerloop atoms - jge .nb410_unroll_loop - jmp .nb410_checksingle -.nb410_unroll_loop: - ;# twice unrolled innerloop here - mov edx, [esp + nb410_innerjjnr] ;# pointer to jjnr[k] - mov eax, [edx] - mov ebx, [edx + 4] - add dword ptr [esp + nb410_innerjjnr], 8 ;# advance pointer (unrolled 2) - - ;# load isaj - mov esi, [ebp + nb410_invsqrta] - movlpd xmm2, [esi + eax*8] - movhpd xmm2, [esi + ebx*8] - mulpd xmm2, [esp + nb410_isai] - movapd [esp + nb410_isaprod], xmm2 - movapd xmm1, xmm2 - mulpd xmm1, [esp + nb410_gbtsc] - movapd [esp + nb410_gbscale], xmm1 - - mov esi, [ebp + nb410_charge] ;# base of charge[] - movlpd xmm3, [esi + eax*8] - movhpd xmm3, [esi + ebx*8] - - mulpd xmm2, [esp + nb410_iq] - mulpd xmm3, xmm2 - movapd [esp + nb410_qq], xmm3 - - movd mm0, eax ;# use mmx registers as temp storage - movd mm1, ebx - - mov esi, [ebp + nb410_type] - mov eax, [esi + eax*4] - mov ebx, [esi + ebx*4] - mov esi, [ebp + nb410_vdwparam] - shl eax, 1 - shl ebx, 1 - mov edi, [esp + nb410_ntia] - add eax, edi - add ebx, edi - - movlpd xmm6, [esi + eax*8] ;# c6a - movlpd xmm7, [esi + ebx*8] ;# c6b - movhpd xmm6, [esi + eax*8 + 8] ;# c6a c12a - movhpd xmm7, [esi + ebx*8 + 8] ;# c6b c12b - - movapd xmm4, xmm6 - unpcklpd xmm4, xmm7 - unpckhpd xmm6, xmm7 - - movd eax, mm0 - movd ebx, mm1 - movapd [esp + nb410_c6], xmm4 - movapd [esp + nb410_c12], xmm6 - - mov esi, [ebp + nb410_pos] ;# base of pos[] - - movd mm2, eax - movd mm3, ebx - lea eax, [eax + eax*2] ;# replace jnr with j3 - lea ebx, [ebx + ebx*2] - - ;# move two coordinates to xmm0-xmm2 - movlpd xmm0, [esi + eax*8] - movlpd xmm1, [esi + eax*8 + 8] - movlpd xmm2, [esi + eax*8 + 16] - movhpd xmm0, [esi + ebx*8] - movhpd xmm1, [esi + ebx*8 + 8] - movhpd xmm2, [esi + ebx*8 + 16] - - ;# move ix-iz to xmm4-xmm6 - movapd xmm4, [esp + nb410_ix] - movapd xmm5, [esp + nb410_iy] - movapd xmm6, [esp + nb410_iz] - - ;# calc dr - subpd xmm4, xmm0 - subpd xmm5, xmm1 - subpd xmm6, xmm2 - - ;# store dr - movapd [esp + nb410_dx], xmm4 - movapd [esp + nb410_dy], xmm5 - movapd [esp + nb410_dz], xmm6 - ;# square it - mulpd xmm4,xmm4 - mulpd xmm5,xmm5 - mulpd xmm6,xmm6 - addpd xmm4, xmm5 - addpd xmm4, xmm6 - ;# rsq in xmm4 - - cvtpd2ps xmm5, xmm4 - rsqrtps xmm5, xmm5 - cvtps2pd xmm2, xmm5 ;# lu in low xmm2 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulpd xmm2, xmm2 ;# lu*lu - movapd xmm1, [esp + nb410_three] - mulpd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb410_half] - subpd xmm1, xmm2 ;# 30-rsq*lu*lu - mulpd xmm1, xmm5 - mulpd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - movapd xmm5, xmm1 ;# copy of lu - mulpd xmm1, xmm1 ;# lu*lu - movapd xmm2, [esp + nb410_three] - mulpd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb410_half] - subpd xmm2, xmm1 ;# 30-rsq*lu*lu - mulpd xmm2, xmm5 - mulpd xmm0, xmm2 ;# xmm0=rinv - - mulpd xmm4, xmm0 ;# xmm4=r - movapd [esp + nb410_r], xmm4 - mulpd xmm4, [esp + nb410_gbscale] - - cvttpd2pi mm6, xmm4 ;# mm6 = lu idx - cvtpi2pd xmm5, mm6 - subpd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - movapd xmm2, xmm1 - mulpd xmm2, xmm2 ;# xmm2=eps2 - - pslld mm6, 2 ;# idx *= 4 - - movd mm0, eax - movd mm1, ebx - - mov esi, [ebp + nb410_GBtab] - movd eax, mm6 - psrlq mm6, 32 - movd ebx, mm6 ;# indices in eax/ebx - - movapd xmm4, [esi + eax*8] ;# Y1 F1 - movapd xmm3, [esi + ebx*8] ;# Y2 F2 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 Y2 - unpckhpd xmm5, xmm3 ;# F1 F2 - - movapd xmm6, [esi + eax*8 + 16] ;# G1 H1 - movapd xmm3, [esi + ebx*8 + 16] ;# G2 H2 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 G2 - unpckhpd xmm7, xmm3 ;# H1 H2 - ;# coulomb table ready, in xmm4-xmm7 - mulpd xmm6, xmm1 ;# xmm6=Geps - mulpd xmm7, xmm2 ;# xmm7=Heps2 - addpd xmm5, xmm6 - addpd xmm5, xmm7 ;# xmm5=Fp - mulpd xmm7, [esp + nb410_two] ;# two*Heps2 - movapd xmm3, [esp + nb410_qq] - addpd xmm7, xmm6 - addpd xmm7, xmm5 ;# xmm7=FF - mulpd xmm5, xmm1 ;# xmm5=eps*Fp - addpd xmm5, xmm4 ;# xmm5=VV - mulpd xmm5, xmm3 ;# vcoul=qq*VV - mulpd xmm3, xmm7 ;# fijC=FF*qq - ;# get jnr from regs - movd ecx, mm2 - movd edx, mm3 - mov esi, [ebp + nb410_dvda] - - ;# Calculate dVda - xorpd xmm7, xmm7 - mulpd xmm3, [esp + nb410_gbscale] - movapd xmm6, xmm3 - mulpd xmm6, [esp + nb410_r] - addpd xmm6, xmm5 - addpd xmm5, [esp + nb410_vctot] - movapd [esp + nb410_vctot], xmm5 - - ;# xmm6=(vcoul+fijC*r) - subpd xmm7, xmm6 - movapd xmm6, xmm7 - - ;# update dvdasum - addpd xmm7, [esp + nb410_dvdasum] - movapd [esp + nb410_dvdasum], xmm7 - - ;# update j atoms dvdaj - movhlps xmm7, xmm6 - addsd xmm6, [esi + ecx*8] - addsd xmm7, [esi + edx*8] - movsd [esi + ecx*8], xmm6 - movsd [esi + edx*8], xmm7 - - ;# L-J - movapd xmm4, xmm0 - mulpd xmm4, xmm0 ;# xmm4=rinvsq - - movapd xmm6, xmm4 - mulpd xmm6, xmm4 - - mulpd xmm6, xmm4 ;# xmm6=rinvsix - movapd xmm4, xmm6 - mulpd xmm4, xmm4 ;# xmm4=rinvtwelve - mulpd xmm6, [esp + nb410_c6] - mulpd xmm4, [esp + nb410_c12] - movapd xmm7, [esp + nb410_Vvdwtot] - addpd xmm7, xmm4 - mulpd xmm4, [esp + nb410_twelve] - subpd xmm7, xmm6 - mulpd xmm6, [esp + nb410_six] - movapd [esp + nb410_Vvdwtot], xmm7 - subpd xmm4, xmm6 - mulpd xmm4, xmm0 - subpd xmm4, xmm3 - mulpd xmm4, xmm0 - - movapd xmm0, [esp + nb410_dx] - movapd xmm1, [esp + nb410_dy] - movapd xmm2, [esp + nb410_dz] - - movd eax, mm0 - movd ebx, mm1 - - mov edi, [ebp + nb410_faction] - mulpd xmm0, xmm4 - mulpd xmm1, xmm4 - mulpd xmm2, xmm4 - ;# xmm0-xmm2 contains tx-tz (partial force) - ;# now update f_i - movapd xmm3, [esp + nb410_fix] - movapd xmm4, [esp + nb410_fiy] - movapd xmm5, [esp + nb410_fiz] - addpd xmm3, xmm0 - addpd xmm4, xmm1 - addpd xmm5, xmm2 - movapd [esp + nb410_fix], xmm3 - movapd [esp + nb410_fiy], xmm4 - movapd [esp + nb410_fiz], xmm5 - ;# the fj's - start by accumulating forces from memory - movlpd xmm3, [edi + eax*8] - movlpd xmm4, [edi + eax*8 + 8] - movlpd xmm5, [edi + eax*8 + 16] - movhpd xmm3, [edi + ebx*8] - movhpd xmm4, [edi + ebx*8 + 8] - movhpd xmm5, [edi + ebx*8 + 16] - subpd xmm3, xmm0 - subpd xmm4, xmm1 - subpd xmm5, xmm2 - movlpd [edi + eax*8], xmm3 - movlpd [edi + eax*8 + 8], xmm4 - movlpd [edi + eax*8 + 16], xmm5 - movhpd [edi + ebx*8], xmm3 - movhpd [edi + ebx*8 + 8], xmm4 - movhpd [edi + ebx*8 + 16], xmm5 - - ;# should we do one more iteration? - sub dword ptr [esp + nb410_innerk], 2 - jl .nb410_checksingle - jmp .nb410_unroll_loop -.nb410_checksingle: - mov edx, [esp + nb410_innerk] - and edx, 1 - jnz .nb410_dosingle - jmp .nb410_updateouterdata -.nb410_dosingle: - mov esi, [ebp + nb410_charge] - mov edx, [ebp + nb410_invsqrta] - mov edi, [ebp + nb410_pos] - mov ecx, [esp + nb410_innerjjnr] - mov eax, [ecx] - - xorpd xmm6, xmm6 - movapd xmm7, xmm6 - movsd xmm7, [edx + eax*8] - movlpd xmm6, [esi + eax*8] ;# xmm6(0) has the charge - mulsd xmm7, [esp + nb410_isai] - movapd [esp + nb410_isaprod], xmm7 - movapd xmm1, xmm7 - mulpd xmm1, [esp + nb410_gbtsc] - movapd [esp + nb410_gbscale], xmm1 - - mulsd xmm7, [esp + nb410_iq] - mulsd xmm6, xmm7 - movapd [esp + nb410_qq], xmm6 - - movd mm0, eax ;# use mmx registers as temp storage - mov esi, [ebp + nb410_type] - mov eax, [esi + eax*4] - mov esi, [ebp + nb410_vdwparam] - shl eax, 1 - mov edi, [esp + nb410_ntia] - add eax, edi - - movlpd xmm6, [esi + eax*8] ;# c6a - movhpd xmm6, [esi + eax*8 + 8] ;# c6a c12a - xorpd xmm7, xmm7 - movapd xmm4, xmm6 - unpcklpd xmm4, xmm7 - unpckhpd xmm6, xmm7 - - movd eax, mm0 - movapd [esp + nb410_c6], xmm4 - movapd [esp + nb410_c12], xmm6 - - mov esi, [ebp + nb410_pos] ;# base of pos[] - - movd mm2, eax - lea eax, [eax + eax*2] ;# replace jnr with j3 - - ;# move coordinates to xmm0-xmm2 - movlpd xmm0, [esi + eax*8] - movlpd xmm1, [esi + eax*8 + 8] - movlpd xmm2, [esi + eax*8 + 16] - - ;# move ix-iz to xmm4-xmm6 - movapd xmm4, [esp + nb410_ix] - movapd xmm5, [esp + nb410_iy] - movapd xmm6, [esp + nb410_iz] - - ;# calc dr - subsd xmm4, xmm0 - subsd xmm5, xmm1 - subsd xmm6, xmm2 - - ;# store dr - movapd [esp + nb410_dx], xmm4 - movapd [esp + nb410_dy], xmm5 - movapd [esp + nb410_dz], xmm6 - ;# square it - mulsd xmm4,xmm4 - mulsd xmm5,xmm5 - mulsd xmm6,xmm6 - addsd xmm4, xmm5 - addsd xmm4, xmm6 - ;# rsq in xmm4 - - cvtsd2ss xmm5, xmm4 - rsqrtss xmm5, xmm5 - cvtss2sd xmm2, xmm5 ;# lu in low xmm2 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulsd xmm2, xmm2 ;# lu*lu - movapd xmm1, [esp + nb410_three] - mulsd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb410_half] - subsd xmm1, xmm2 ;# 30-rsq*lu*lu - mulsd xmm1, xmm5 - mulsd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - movapd xmm5, xmm1 ;# copy of lu - mulsd xmm1, xmm1 ;# lu*lu - movapd xmm2, [esp + nb410_three] - mulsd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb410_half] - subsd xmm2, xmm1 ;# 30-rsq*lu*lu - mulsd xmm2, xmm5 - mulsd xmm0, xmm2 ;# xmm0=rinv - - mulsd xmm4, xmm0 ;# xmm4=r - movapd [esp + nb410_r], xmm4 - mulsd xmm4, [esp + nb410_gbscale] - - movd mm0, eax - cvttsd2si eax, xmm4 ;# mm6 = lu idx - cvtsi2sd xmm5, eax - subsd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - movapd xmm2, xmm1 - mulsd xmm2, xmm2 ;# xmm2=eps2 - - shl eax, 2 ;# idx *= 4 - - mov esi, [ebp + nb410_GBtab] - - movapd xmm4, [esi + eax*8] ;# Y1 F1 - xorpd xmm3, xmm3 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 - unpckhpd xmm5, xmm3 ;# F1 - - movapd xmm6, [esi + eax*8 + 16] ;# G1 H1 - xorpd xmm3, xmm3 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 - unpckhpd xmm7, xmm3 ;# H1 - ;# coulomb table ready, in xmm4-xmm7 - mulsd xmm6, xmm1 ;# xmm6=Geps - mulsd xmm7, xmm2 ;# xmm7=Heps2 - addsd xmm5, xmm6 - addsd xmm5, xmm7 ;# xmm5=Fp - mulsd xmm7, [esp + nb410_two] ;# two*Heps2 - movapd xmm3, [esp + nb410_qq] - addsd xmm7, xmm6 - addsd xmm7, xmm5 ;# xmm7=FF - mulsd xmm5, xmm1 ;# xmm5=eps*Fp - addsd xmm5, xmm4 ;# xmm5=VV - mulsd xmm5, xmm3 ;# vcoul=qq*VV - mulsd xmm3, xmm7 ;# fijC=FF*qq - ;# get jnr from regs - movd ebx, mm2 - mov esi, [ebp + nb410_dvda] - - ;# Calculate dVda - xorpd xmm7, xmm7 - mulsd xmm3, [esp + nb410_gbscale] - movsd xmm6, xmm3 - mulsd xmm6, [esp + nb410_r] - addsd xmm6, xmm5 - addsd xmm5, [esp + nb410_vctot] - movsd [esp + nb410_vctot], xmm5 - - ;# xmm6=(vcoul+fijC*r) - subpd xmm7, xmm7 - movsd xmm6, xmm7 - - ;# update dvdasum - addsd xmm7, [esp + nb410_dvdasum] - movsd [esp + nb410_dvdasum], xmm7 - - ;# update j atoms dvdaj - addsd xmm6, [esi + ebx*8] - movsd [esi + ebx*8], xmm6 - - ;# L-J - movapd xmm4, xmm0 - mulsd xmm4, xmm0 ;# xmm4=rinvsq - - - movapd xmm6, xmm4 - mulsd xmm6, xmm4 - - mulsd xmm6, xmm4 ;# xmm6=rinvsix - movapd xmm4, xmm6 - mulsd xmm4, xmm4 ;# xmm4=rinvtwelve - mulsd xmm6, [esp + nb410_c6] - mulsd xmm4, [esp + nb410_c12] - movapd xmm7, [esp + nb410_Vvdwtot] - addsd xmm7, xmm4 - mulsd xmm4, [esp + nb410_twelve] - subsd xmm7, xmm6 - mulsd xmm6, [esp + nb410_six] - movlpd [esp + nb410_Vvdwtot], xmm7 - subsd xmm4, xmm6 - mulsd xmm4, xmm0 - subsd xmm4, xmm3 - mulsd xmm4, xmm0 - - movapd xmm0, [esp + nb410_dx] - movapd xmm1, [esp + nb410_dy] - movapd xmm2, [esp + nb410_dz] - - movd eax, mm0 - - mov edi, [ebp + nb410_faction] - mulsd xmm0, xmm4 - mulsd xmm1, xmm4 - mulsd xmm2, xmm4 - ;# xmm0-xmm2 contains tx-tz (partial force) - ;# now update f_i - movapd xmm3, [esp + nb410_fix] - movapd xmm4, [esp + nb410_fiy] - movapd xmm5, [esp + nb410_fiz] - addsd xmm3, xmm0 - addsd xmm4, xmm1 - addsd xmm5, xmm2 - movlpd [esp + nb410_fix], xmm3 - movlpd [esp + nb410_fiy], xmm4 - movlpd [esp + nb410_fiz], xmm5 - ;# the fj's - start by accumulating forces from memory - movlpd xmm3, [edi + eax*8] - movlpd xmm4, [edi + eax*8 + 8] - movlpd xmm5, [edi + eax*8 + 16] - subsd xmm3, xmm0 - subsd xmm4, xmm1 - subsd xmm5, xmm2 - movlpd [edi + eax*8], xmm3 - movlpd [edi + eax*8 + 8], xmm4 - movlpd [edi + eax*8 + 16], xmm5 - -.nb410_updateouterdata: - mov ecx, [esp + nb410_ii3] - mov edi, [ebp + nb410_faction] - mov esi, [ebp + nb410_fshift] - mov edx, [esp + nb410_is3] - - ;# accumulate i forces in xmm0, xmm1, xmm2 - movapd xmm0, [esp + nb410_fix] - movapd xmm1, [esp + nb410_fiy] - movapd xmm2, [esp + nb410_fiz] - - movhlps xmm3, xmm0 - movhlps xmm4, xmm1 - movhlps xmm5, xmm2 - addsd xmm0, xmm3 - addsd xmm1, xmm4 - addsd xmm2, xmm5 ;# sum is in low xmm0-xmm2 - - ;# increment i force - movsd xmm3, [edi + ecx*8] - movsd xmm4, [edi + ecx*8 + 8] - movsd xmm5, [edi + ecx*8 + 16] - addsd xmm3, xmm0 - addsd xmm4, xmm1 - addsd xmm5, xmm2 - movsd [edi + ecx*8], xmm3 - movsd [edi + ecx*8 + 8], xmm4 - movsd [edi + ecx*8 + 16], xmm5 - - ;# increment fshift force - movsd xmm3, [esi + edx*8] - movsd xmm4, [esi + edx*8 + 8] - movsd xmm5, [esi + edx*8 + 16] - addsd xmm3, xmm0 - addsd xmm4, xmm1 - addsd xmm5, xmm2 - movsd [esi + edx*8], xmm3 - movsd [esi + edx*8 + 8], xmm4 - movsd [esi + edx*8 + 16], xmm5 - - ;# get n from stack - mov esi, [esp + nb410_n] - ;# get group index for i particle - mov edx, [ebp + nb410_gid] ;# base of gid[] - mov edx, [edx + esi*4] ;# ggid=gid[n] - - ;# accumulate total potential energy and update it - movapd xmm7, [esp + nb410_vctot] - ;# accumulate - movhlps xmm6, xmm7 - addsd xmm7, xmm6 ;# low xmm7 has the sum now - - ;# add earlier value from mem - mov eax, [ebp + nb410_Vc] - addsd xmm7, [eax + edx*8] - ;# move back to mem - movsd [eax + edx*8], xmm7 - - ;# accumulate total lj energy and update it - movapd xmm7, [esp + nb410_Vvdwtot] - ;# accumulate - movhlps xmm6, xmm7 - addsd xmm7, xmm6 ;# low xmm7 has the sum now - - ;# add earlier value from mem - mov eax, [ebp + nb410_Vvdw] - addsd xmm7, [eax + edx*8] - ;# move back to mem - movsd [eax + edx*8], xmm7 - - ;# accumulate dVda and update it - movapd xmm7, [esp + nb410_dvdasum] - ;# accumulate - movhlps xmm6, xmm7 - addsd xmm7, xmm6 ;# low xmm7 has the sum now - - mov edx, [esp + nb410_ii] - mov eax, [ebp + nb410_dvda] - addsd xmm7, [eax + edx*8] - movsd [eax + edx*8], xmm7 - - ;# finish if last - mov ecx, [esp + nb410_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb410_outerend - - ;# not last, iterate outer loop once more! - mov [esp + nb410_n], esi - jmp .nb410_outer -.nb410_outerend: - ;# check if more outer neighborlists remain - mov ecx, [esp + nb410_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb410_end - ;# non-zero, do one more workunit - jmp .nb410_threadloop -.nb410_end: - emms - - mov eax, [esp + nb410_nouter] - mov ebx, [esp + nb410_ninner] - mov ecx, [ebp + nb410_outeriter] - mov edx, [ebp + nb410_inneriter] - mov [ecx], eax - mov [edx], ebx - - mov eax, [esp + nb410_salign] - add esp, eax - add esp, 496 - pop edi - pop esi - pop edx - pop ecx - pop ebx - pop eax - leave - ret - - - - - - - -.globl nb_kernel410nf_ia32_sse2 -.globl _nb_kernel410nf_ia32_sse2 -nb_kernel410nf_ia32_sse2: -_nb_kernel410nf_ia32_sse2: -.equiv nb410nf_p_nri, 8 -.equiv nb410nf_iinr, 12 -.equiv nb410nf_jindex, 16 -.equiv nb410nf_jjnr, 20 -.equiv nb410nf_shift, 24 -.equiv nb410nf_shiftvec, 28 -.equiv nb410nf_fshift, 32 -.equiv nb410nf_gid, 36 -.equiv nb410nf_pos, 40 -.equiv nb410nf_faction, 44 -.equiv nb410nf_charge, 48 -.equiv nb410nf_p_facel, 52 -.equiv nb410nf_argkrf, 56 -.equiv nb410nf_argcrf, 60 -.equiv nb410nf_Vc, 64 -.equiv nb410nf_type, 68 -.equiv nb410nf_p_ntype, 72 -.equiv nb410nf_vdwparam, 76 -.equiv nb410nf_Vvdw, 80 -.equiv nb410nf_p_tabscale, 84 -.equiv nb410nf_VFtab, 88 -.equiv nb410nf_invsqrta, 92 -.equiv nb410nf_dvda, 96 -.equiv nb410nf_p_gbtabscale, 100 -.equiv nb410nf_GBtab, 104 -.equiv nb410nf_p_nthreads, 108 -.equiv nb410nf_count, 112 -.equiv nb410nf_mtx, 116 -.equiv nb410nf_outeriter, 120 -.equiv nb410nf_inneriter, 124 -.equiv nb410nf_work, 128 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse2 use -.equiv nb410nf_ix, 0 -.equiv nb410nf_iy, 16 -.equiv nb410nf_iz, 32 -.equiv nb410nf_iq, 48 -.equiv nb410nf_two, 64 -.equiv nb410nf_gbtsc, 80 -.equiv nb410nf_qq, 96 -.equiv nb410nf_c6, 112 -.equiv nb410nf_c12, 128 -.equiv nb410nf_vctot, 144 -.equiv nb410nf_Vvdwtot, 160 -.equiv nb410nf_half, 176 -.equiv nb410nf_three, 192 -.equiv nb410nf_r, 208 -.equiv nb410nf_isai, 224 -.equiv nb410nf_isaprod, 240 -.equiv nb410nf_gbscale, 256 -.equiv nb410nf_ii, 272 -.equiv nb410nf_is3, 276 -.equiv nb410nf_ii3, 280 -.equiv nb410nf_ntia, 284 -.equiv nb410nf_innerjjnr, 288 -.equiv nb410nf_innerk, 292 -.equiv nb410nf_n, 296 -.equiv nb410nf_nn1, 300 -.equiv nb410nf_nri, 304 -.equiv nb410nf_facel, 312 ;# uses 8 bytes -.equiv nb410nf_ntype, 320 -.equiv nb410nf_nouter, 324 -.equiv nb410nf_ninner, 328 -.equiv nb410nf_salign, 332 - push ebp - mov ebp,esp - push eax - push ebx - push ecx - push edx - push esi - push edi - sub esp, 336 ;# local stack space - mov eax, esp - and eax, 0xf - sub esp, eax - mov [esp + nb410nf_salign], eax - - emms - - ;# Move args passed by reference to stack - mov ecx, [ebp + nb410nf_p_nri] - mov esi, [ebp + nb410nf_p_facel] - mov edi, [ebp + nb410nf_p_ntype] - mov ecx, [ecx] - movsd xmm7, [esi] - mov edi, [edi] - mov [esp + nb410nf_nri], ecx - movsd [esp + nb410nf_facel], xmm7 - mov [esp + nb410nf_ntype], edi - - ;# zero iteration counters - mov eax, 0 - mov [esp + nb410nf_nouter], eax - mov [esp + nb410nf_ninner], eax - - - mov eax, [ebp + nb410nf_p_gbtabscale] - movsd xmm5, [eax] - shufpd xmm5, xmm5, 0 - movapd [esp + nb410nf_gbtsc], xmm5 - ;# create constant floating-point factors on stack - mov eax, 0x00000000 ;# lower half of double 0.5 IEEE (hex) - mov ebx, 0x3fe00000 - mov [esp + nb410nf_half], eax - mov [esp + nb410nf_half+4], ebx - movsd xmm1, [esp + nb410nf_half] - shufpd xmm1, xmm1, 0 ;# splat to all elements - movapd xmm3, xmm1 - addpd xmm3, xmm3 ;# 1.0 - movapd xmm2, xmm3 - addpd xmm2, xmm2 ;# 2.0 - addpd xmm3, xmm2 ;# 3.0 - movapd [esp + nb410nf_half], xmm1 - movapd [esp + nb410nf_two], xmm2 - movapd [esp + nb410nf_three], xmm3 - -.nb410nf_threadloop: - mov esi, [ebp + nb410nf_count] ;# pointer to sync counter - mov eax, [esi] -.nb410nf_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb410nf_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [esp + nb410nf_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [esp + nb410nf_n], eax - mov [esp + nb410nf_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb410nf_outerstart - jmp .nb410nf_end - -.nb410nf_outerstart: - ;# ebx contains number of outer iterations - add ebx, [esp + nb410nf_nouter] - mov [esp + nb410nf_nouter], ebx - -.nb410nf_outer: - mov eax, [ebp + nb410nf_shift] ;# eax = pointer into shift[] - mov ebx, [eax+esi*4] ;# ebx=shift[n] - - lea ebx, [ebx + ebx*2] ;# ebx=3*is - mov [esp + nb410nf_is3],ebx ;# store is3 - - mov eax, [ebp + nb410nf_shiftvec] ;# eax = base of shiftvec[] - - movsd xmm0, [eax + ebx*8] - movsd xmm1, [eax + ebx*8 + 8] - movsd xmm2, [eax + ebx*8 + 16] - - mov ecx, [ebp + nb410nf_iinr] ;# ecx = pointer into iinr[] - mov ebx, [ecx+esi*4] ;# ebx =ii - mov [esp + nb410nf_ii], ebx - - mov edx, [ebp + nb410nf_charge] - movsd xmm3, [edx + ebx*8] - mulsd xmm3, [esp + nb410nf_facel] - shufpd xmm3, xmm3, 0 - - mov edx, [ebp + nb410nf_invsqrta] ;# load invsqrta[ii] - movsd xmm4, [edx + ebx*8] - shufpd xmm4, xmm4, 0 - - mov edx, [ebp + nb410nf_type] - mov edx, [edx + ebx*4] - imul edx, [esp + nb410nf_ntype] - shl edx, 1 - mov [esp + nb410nf_ntia], edx - - lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3 - mov eax, [ebp + nb410nf_pos] ;# eax = base of pos[] - - addsd xmm0, [eax + ebx*8] - addsd xmm1, [eax + ebx*8 + 8] - addsd xmm2, [eax + ebx*8 + 16] - - movapd [esp + nb410nf_iq], xmm3 - movapd [esp + nb410nf_isai], xmm4 - - shufpd xmm0, xmm0, 0 - shufpd xmm1, xmm1, 0 - shufpd xmm2, xmm2, 0 - - movapd [esp + nb410nf_ix], xmm0 - movapd [esp + nb410nf_iy], xmm1 - movapd [esp + nb410nf_iz], xmm2 - - mov [esp + nb410nf_ii3], ebx - - ;# clear vctot and Vvdwtot - xorpd xmm4, xmm4 - movapd [esp + nb410nf_vctot], xmm4 - movapd [esp + nb410nf_Vvdwtot], xmm4 - - mov eax, [ebp + nb410nf_jindex] - mov ecx, [eax + esi*4] ;# jindex[n] - mov edx, [eax + esi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov esi, [ebp + nb410nf_pos] - mov edi, [ebp + nb410nf_faction] - mov eax, [ebp + nb410nf_jjnr] - shl ecx, 2 - add eax, ecx - mov [esp + nb410nf_innerjjnr], eax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 2 - add ecx, [esp + nb410nf_ninner] - mov [esp + nb410nf_ninner], ecx - add edx, 0 - mov [esp + nb410nf_innerk], edx ;# number of innerloop atoms - jge .nb410nf_unroll_loop - jmp .nb410nf_checksingle -.nb410nf_unroll_loop: - ;# twice unrolled innerloop here - mov edx, [esp + nb410nf_innerjjnr] ;# pointer to jjnr[k] - mov eax, [edx] - mov ebx, [edx + 4] - add dword ptr [esp + nb410nf_innerjjnr], 8 ;# advance pointer (unrolled 2) - - ;# load isaj - mov esi, [ebp + nb410nf_invsqrta] - movlpd xmm2, [esi + eax*8] - movhpd xmm2, [esi + ebx*8] - mulpd xmm2, [esp + nb410nf_isai] - movapd [esp + nb410nf_isaprod], xmm2 - movapd xmm1, xmm2 - mulpd xmm1, [esp + nb410nf_gbtsc] - movapd [esp + nb410nf_gbscale], xmm1 - - mov esi, [ebp + nb410nf_charge] ;# base of charge[] - movlpd xmm3, [esi + eax*8] - movhpd xmm3, [esi + ebx*8] - - mulpd xmm2, [esp + nb410nf_iq] - mulpd xmm3, xmm2 - movapd [esp + nb410nf_qq], xmm3 - - movd mm0, eax ;# use mmx registers as temp storage - movd mm1, ebx - - mov esi, [ebp + nb410nf_type] - mov eax, [esi + eax*4] - mov ebx, [esi + ebx*4] - mov esi, [ebp + nb410nf_vdwparam] - shl eax, 1 - shl ebx, 1 - mov edi, [esp + nb410nf_ntia] - add eax, edi - add ebx, edi - - movlpd xmm6, [esi + eax*8] ;# c6a - movlpd xmm7, [esi + ebx*8] ;# c6b - movhpd xmm6, [esi + eax*8 + 8] ;# c6a c12a - movhpd xmm7, [esi + ebx*8 + 8] ;# c6b c12b - - movapd xmm4, xmm6 - unpcklpd xmm4, xmm7 - unpckhpd xmm6, xmm7 - - movd eax, mm0 - movd ebx, mm1 - movapd [esp + nb410nf_c6], xmm4 - movapd [esp + nb410nf_c12], xmm6 - - mov esi, [ebp + nb410nf_pos] ;# base of pos[] - - movd mm2, eax - movd mm3, ebx - lea eax, [eax + eax*2] ;# replace jnr with j3 - lea ebx, [ebx + ebx*2] - - ;# move two coordinates to xmm0-xmm2 - movlpd xmm0, [esi + eax*8] - movlpd xmm1, [esi + eax*8 + 8] - movlpd xmm2, [esi + eax*8 + 16] - movhpd xmm0, [esi + ebx*8] - movhpd xmm1, [esi + ebx*8 + 8] - movhpd xmm2, [esi + ebx*8 + 16] - - ;# move ix-iz to xmm4-xmm6 - movapd xmm4, [esp + nb410nf_ix] - movapd xmm5, [esp + nb410nf_iy] - movapd xmm6, [esp + nb410nf_iz] - - ;# calc dr - subpd xmm4, xmm0 - subpd xmm5, xmm1 - subpd xmm6, xmm2 - - ;# square dr - mulpd xmm4,xmm4 - mulpd xmm5,xmm5 - mulpd xmm6,xmm6 - addpd xmm4, xmm5 - addpd xmm4, xmm6 - ;# rsq in xmm4 - - cvtpd2ps xmm5, xmm4 - rsqrtps xmm5, xmm5 - cvtps2pd xmm2, xmm5 ;# lu in low xmm2 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulpd xmm2, xmm2 ;# lu*lu - movapd xmm1, [esp + nb410nf_three] - mulpd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb410nf_half] - subpd xmm1, xmm2 ;# 30-rsq*lu*lu - mulpd xmm1, xmm5 - mulpd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - movapd xmm5, xmm1 ;# copy of lu - mulpd xmm1, xmm1 ;# lu*lu - movapd xmm2, [esp + nb410nf_three] - mulpd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb410nf_half] - subpd xmm2, xmm1 ;# 30-rsq*lu*lu - mulpd xmm2, xmm5 - mulpd xmm0, xmm2 ;# xmm0=rinv - - mulpd xmm4, xmm0 ;# xmm4=r - movapd [esp + nb410nf_r], xmm4 - mulpd xmm4, [esp + nb410nf_gbscale] - - cvttpd2pi mm6, xmm4 ;# mm6 = lu idx - cvtpi2pd xmm5, mm6 - subpd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - movapd xmm2, xmm1 - mulpd xmm2, xmm2 ;# xmm2=eps2 - - pslld mm6, 2 ;# idx *= 4 - - movd mm0, eax - movd mm1, ebx - - mov esi, [ebp + nb410nf_GBtab] - movd eax, mm6 - psrlq mm6, 32 - movd ebx, mm6 ;# indices in eax/ebx - - movapd xmm4, [esi + eax*8] ;# Y1 F1 - movapd xmm3, [esi + ebx*8] ;# Y2 F2 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 Y2 - unpckhpd xmm5, xmm3 ;# F1 F2 - - movapd xmm6, [esi + eax*8 + 16] ;# G1 H1 - movapd xmm3, [esi + ebx*8 + 16] ;# G2 H2 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 G2 - unpckhpd xmm7, xmm3 ;# H1 H2 - ;# coulomb table ready, in xmm4-xmm7 - mulpd xmm6, xmm1 ;# xmm6=Geps - mulpd xmm7, xmm2 ;# xmm7=Heps2 - addpd xmm5, xmm6 - addpd xmm5, xmm7 ;# xmm5=Fp - movapd xmm3, [esp + nb410nf_qq] - mulpd xmm5, xmm1 ;# xmm5=eps*Fp - addpd xmm5, xmm4 ;# xmm5=VV - mulpd xmm5, xmm3 ;# vcoul=qq*VV - - addpd xmm5, [esp + nb410nf_vctot] - movapd [esp + nb410nf_vctot], xmm5 - - ;# L-J - movapd xmm4, xmm0 - mulpd xmm4, xmm0 ;# xmm4=rinvsq - - movapd xmm6, xmm4 - mulpd xmm6, xmm4 - - mulpd xmm6, xmm4 ;# xmm6=rinvsix - movapd xmm4, xmm6 - mulpd xmm4, xmm4 ;# xmm4=rinvtwelve - mulpd xmm6, [esp + nb410nf_c6] - mulpd xmm4, [esp + nb410nf_c12] - movapd xmm7, [esp + nb410nf_Vvdwtot] - addpd xmm7, xmm4 - subpd xmm7, xmm6 - movapd [esp + nb410nf_Vvdwtot], xmm7 - - ;# should we do one more iteration? - sub dword ptr [esp + nb410nf_innerk], 2 - jl .nb410nf_checksingle - jmp .nb410nf_unroll_loop -.nb410nf_checksingle: - mov edx, [esp + nb410nf_innerk] - and edx, 1 - jnz .nb410nf_dosingle - jmp .nb410nf_updateouterdata -.nb410nf_dosingle: - mov esi, [ebp + nb410nf_charge] - mov edx, [ebp + nb410nf_invsqrta] - mov edi, [ebp + nb410nf_pos] - mov ecx, [esp + nb410nf_innerjjnr] - mov eax, [ecx] - - xorpd xmm6, xmm6 - movapd xmm7, xmm6 - movsd xmm7, [edx + eax*8] - movlpd xmm6, [esi + eax*8] ;# xmm6(0) has the charge - mulsd xmm7, [esp + nb410nf_isai] - movapd [esp + nb410nf_isaprod], xmm7 - movapd xmm1, xmm7 - mulpd xmm1, [esp + nb410nf_gbtsc] - movapd [esp + nb410nf_gbscale], xmm1 - - mulsd xmm7, [esp + nb410nf_iq] - mulsd xmm6, xmm7 - movapd [esp + nb410nf_qq], xmm6 - - movd mm0, eax ;# use mmx registers as temp storage - mov esi, [ebp + nb410nf_type] - mov eax, [esi + eax*4] - mov esi, [ebp + nb410nf_vdwparam] - shl eax, 1 - mov edi, [esp + nb410nf_ntia] - add eax, edi - - movlpd xmm6, [esi + eax*8] ;# c6a - movhpd xmm6, [esi + eax*8 + 8] ;# c6a c12a - - xorpd xmm7, xmm7 - movapd xmm4, xmm6 - unpcklpd xmm4, xmm7 - unpckhpd xmm6, xmm7 - - movd eax, mm0 - movapd [esp + nb410nf_c6], xmm4 - movapd [esp + nb410nf_c12], xmm6 - - mov esi, [ebp + nb410nf_pos] ;# base of pos[] - - movd mm2, eax - lea eax, [eax + eax*2] ;# replace jnr with j3 - - ;# move coordinates to xmm0-xmm2 - movlpd xmm0, [esi + eax*8] - movlpd xmm1, [esi + eax*8 + 8] - movlpd xmm2, [esi + eax*8 + 16] - - ;# move ix-iz to xmm4-xmm6 - movapd xmm4, [esp + nb410nf_ix] - movapd xmm5, [esp + nb410nf_iy] - movapd xmm6, [esp + nb410nf_iz] - - ;# calc dr - subsd xmm4, xmm0 - subsd xmm5, xmm1 - subsd xmm6, xmm2 - - ;# square it - mulsd xmm4,xmm4 - mulsd xmm5,xmm5 - mulsd xmm6,xmm6 - addsd xmm4, xmm5 - addsd xmm4, xmm6 - ;# rsq in xmm4 - - cvtsd2ss xmm5, xmm4 - rsqrtss xmm5, xmm5 - cvtss2sd xmm2, xmm5 ;# lu in low xmm2 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulsd xmm2, xmm2 ;# lu*lu - movapd xmm1, [esp + nb410nf_three] - mulsd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb410nf_half] - subsd xmm1, xmm2 ;# 30-rsq*lu*lu - mulsd xmm1, xmm5 - mulsd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - movapd xmm5, xmm1 ;# copy of lu - mulsd xmm1, xmm1 ;# lu*lu - movapd xmm2, [esp + nb410nf_three] - mulsd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb410nf_half] - subsd xmm2, xmm1 ;# 30-rsq*lu*lu - mulsd xmm2, xmm5 - mulsd xmm0, xmm2 ;# xmm0=rinv - - mulsd xmm4, xmm0 ;# xmm4=r - movapd [esp + nb410nf_r], xmm4 - mulsd xmm4, [esp + nb410nf_gbscale] - - movd mm0, eax - cvttsd2si eax, xmm4 ;# mm6 = lu idx - cvtsi2sd xmm5, eax - subsd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - movapd xmm2, xmm1 - mulsd xmm2, xmm2 ;# xmm2=eps2 - - shl eax, 2 ;# idx *= 4 - - mov esi, [ebp + nb410nf_GBtab] - - movapd xmm4, [esi + eax*8] ;# Y1 F1 - xorpd xmm3, xmm3 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 - unpckhpd xmm5, xmm3 ;# F1 - - movapd xmm6, [esi + eax*8 + 16] ;# G1 H1 - xorpd xmm3, xmm3 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 - unpckhpd xmm7, xmm3 ;# H1 - ;# coulomb table ready, in xmm4-xmm7 - mulsd xmm6, xmm1 ;# xmm6=Geps - mulsd xmm7, xmm2 ;# xmm7=Heps2 - addsd xmm5, xmm6 - addsd xmm5, xmm7 ;# xmm5=Fp - movapd xmm3, [esp + nb410nf_qq] - mulsd xmm5, xmm1 ;# xmm5=eps*Fp - addsd xmm5, xmm4 ;# xmm5=VV - mulsd xmm5, xmm3 ;# vcoul=qq*VV - - addsd xmm5, [esp + nb410nf_vctot] - movsd [esp + nb410nf_vctot], xmm5 - - ;# L-J - movapd xmm4, xmm0 - mulsd xmm4, xmm0 ;# xmm4=rinvsq - - - movapd xmm6, xmm4 - mulsd xmm6, xmm4 - - mulsd xmm6, xmm4 ;# xmm6=rinvsix - movapd xmm4, xmm6 - mulsd xmm4, xmm4 ;# xmm4=rinvtwelve - mulsd xmm6, [esp + nb410nf_c6] - mulsd xmm4, [esp + nb410nf_c12] - movapd xmm7, [esp + nb410nf_Vvdwtot] - addsd xmm7, xmm4 - subsd xmm7, xmm6 - movlpd [esp + nb410nf_Vvdwtot], xmm7 - -.nb410nf_updateouterdata: - mov ecx, [esp + nb410nf_ii3] - mov edx, [esp + nb410nf_is3] - - ;# get n from stack - mov esi, [esp + nb410nf_n] - ;# get group index for i particle - mov edx, [ebp + nb410nf_gid] ;# base of gid[] - mov edx, [edx + esi*4] ;# ggid=gid[n] - - ;# accumulate total potential energy and update it - movapd xmm7, [esp + nb410nf_vctot] - ;# accumulate - movhlps xmm6, xmm7 - addsd xmm7, xmm6 ;# low xmm7 has the sum now - - ;# add earlier value from mem - mov eax, [ebp + nb410nf_Vc] - addsd xmm7, [eax + edx*8] - ;# move back to mem - movsd [eax + edx*8], xmm7 - - ;# accumulate total lj energy and update it - movapd xmm7, [esp + nb410nf_Vvdwtot] - ;# accumulate - movhlps xmm6, xmm7 - addsd xmm7, xmm6 ;# low xmm7 has the sum now - - ;# add earlier value from mem - mov eax, [ebp + nb410nf_Vvdw] - addsd xmm7, [eax + edx*8] - ;# move back to mem - movsd [eax + edx*8], xmm7 - - ;# finish if last - mov ecx, [esp + nb410nf_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb410nf_outerend - - ;# not last, iterate outer loop once more! - mov [esp + nb410nf_n], esi - jmp .nb410nf_outer -.nb410nf_outerend: - ;# check if more outer neighborlists remain - mov ecx, [esp + nb410nf_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb410nf_end - ;# non-zero, do one more workunit - jmp .nb410nf_threadloop -.nb410nf_end: - emms - - mov eax, [esp + nb410nf_nouter] - mov ebx, [esp + nb410nf_ninner] - mov ecx, [ebp + nb410nf_outeriter] - mov edx, [ebp + nb410nf_inneriter] - mov [ecx], eax - mov [edx], ebx - - mov eax, [esp + nb410nf_salign] - add esp, eax - add esp, 336 - pop edi - pop esi - pop edx - pop ecx - pop ebx - pop eax - leave - ret - - diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel410_ia32_sse2.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel410_ia32_sse2.s deleted file mode 100644 index c8c4a4eea4..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel410_ia32_sse2.s +++ /dev/null @@ -1,1503 +0,0 @@ -## -## -## Gromacs 4.0 Copyright (c) 1991-2003 -## David van der Spoel, Erik Lindahl -## -## This program is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License -## as published by the Free Software Foundation; either version 2 -## of the License, or (at your option) any later version. -## -## To help us fund GROMACS development, we humbly ask that you cite -## the research papers on the package. Check out http://www.gromacs.org -## -## And Hey: -## Gnomes, ROck Monsters And Chili Sauce -## - - - - -.globl nb_kernel410_ia32_sse2 -.globl _nb_kernel410_ia32_sse2 -nb_kernel410_ia32_sse2: -_nb_kernel410_ia32_sse2: -.set nb410_p_nri, 8 -.set nb410_iinr, 12 -.set nb410_jindex, 16 -.set nb410_jjnr, 20 -.set nb410_shift, 24 -.set nb410_shiftvec, 28 -.set nb410_fshift, 32 -.set nb410_gid, 36 -.set nb410_pos, 40 -.set nb410_faction, 44 -.set nb410_charge, 48 -.set nb410_p_facel, 52 -.set nb410_argkrf, 56 -.set nb410_argcrf, 60 -.set nb410_Vc, 64 -.set nb410_type, 68 -.set nb410_p_ntype, 72 -.set nb410_vdwparam, 76 -.set nb410_Vvdw, 80 -.set nb410_p_tabscale, 84 -.set nb410_VFtab, 88 -.set nb410_invsqrta, 92 -.set nb410_dvda, 96 -.set nb410_p_gbtabscale, 100 -.set nb410_GBtab, 104 -.set nb410_p_nthreads, 108 -.set nb410_count, 112 -.set nb410_mtx, 116 -.set nb410_outeriter, 120 -.set nb410_inneriter, 124 -.set nb410_work, 128 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse2 use -.set nb410_ix, 0 -.set nb410_iy, 16 -.set nb410_iz, 32 -.set nb410_iq, 48 -.set nb410_dx, 64 -.set nb410_dy, 80 -.set nb410_dz, 96 -.set nb410_two, 112 -.set nb410_six, 128 -.set nb410_twelve, 144 -.set nb410_gbtsc, 160 -.set nb410_qq, 176 -.set nb410_c6, 192 -.set nb410_c12, 208 -.set nb410_fscal, 224 -.set nb410_vctot, 240 -.set nb410_Vvdwtot, 256 -.set nb410_fix, 272 -.set nb410_fiy, 288 -.set nb410_fiz, 304 -.set nb410_half, 320 -.set nb410_three, 336 -.set nb410_r, 352 -.set nb410_isai, 368 -.set nb410_isaprod, 384 -.set nb410_dvdasum, 400 -.set nb410_gbscale, 416 -.set nb410_ii, 432 -.set nb410_is3, 436 -.set nb410_ii3, 440 -.set nb410_ntia, 444 -.set nb410_innerjjnr, 448 -.set nb410_innerk, 452 -.set nb410_n, 456 -.set nb410_nn1, 460 -.set nb410_nri, 464 -.set nb410_facel, 472 ## uses 8 bytes -.set nb410_ntype, 480 -.set nb410_nouter, 484 -.set nb410_ninner, 488 -.set nb410_salign, 492 - pushl %ebp - movl %esp,%ebp - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - pushl %esi - pushl %edi - subl $496,%esp ## local stack space - movl %esp,%eax - andl $0xf,%eax - subl %eax,%esp - movl %eax,nb410_salign(%esp) - - emms - - ## Move args passed by reference to stack - movl nb410_p_nri(%ebp),%ecx - movl nb410_p_facel(%ebp),%esi - movl nb410_p_ntype(%ebp),%edi - movl (%ecx),%ecx - movsd (%esi),%xmm7 - movl (%edi),%edi - movl %ecx,nb410_nri(%esp) - movsd %xmm7,nb410_facel(%esp) - movl %edi,nb410_ntype(%esp) - - ## zero iteration counters - movl $0,%eax - movl %eax,nb410_nouter(%esp) - movl %eax,nb410_ninner(%esp) - - - movl nb410_p_gbtabscale(%ebp),%eax - movsd (%eax),%xmm5 - shufpd $0,%xmm5,%xmm5 - movapd %xmm5,nb410_gbtsc(%esp) - ## create constant floating-point factors on stack - movl $0x00000000,%eax ## lower half of double 0.5 IEEE (hex) - movl $0x3fe00000,%ebx - movl %eax,nb410_half(%esp) - movl %ebx,nb410_half+4(%esp) - movsd nb410_half(%esp),%xmm1 - shufpd $0,%xmm1,%xmm1 ## splat to all elements - movapd %xmm1,%xmm3 - addpd %xmm3,%xmm3 ## 1.0 - movapd %xmm3,%xmm2 - addpd %xmm2,%xmm2 ## 2.0 - addpd %xmm2,%xmm3 ## 3.0 - movapd %xmm3,%xmm4 - addpd %xmm4,%xmm4 ## 6.0 - movapd %xmm4,%xmm5 - addpd %xmm5,%xmm5 ## 12.0 - movapd %xmm1,nb410_half(%esp) - movapd %xmm2,nb410_two(%esp) - movapd %xmm3,nb410_three(%esp) - movapd %xmm4,nb410_six(%esp) - movapd %xmm5,nb410_twelve(%esp) - -_nb_kernel410_ia32_sse2.nb410_threadloop: - movl nb410_count(%ebp),%esi ## pointer to sync counter - movl (%esi),%eax -_nb_kernel410_ia32_sse2.nb410_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%esi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel410_ia32_sse2.nb410_spinlock - - ## if(nn1>nri) nn1=nri - movl nb410_nri(%esp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb410_n(%esp) - movl %ebx,nb410_nn1(%esp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel410_ia32_sse2.nb410_outerstart - jmp _nb_kernel410_ia32_sse2.nb410_end - -_nb_kernel410_ia32_sse2.nb410_outerstart: - ## ebx contains number of outer iterations - addl nb410_nouter(%esp),%ebx - movl %ebx,nb410_nouter(%esp) - -_nb_kernel410_ia32_sse2.nb410_outer: - movl nb410_shift(%ebp),%eax ## eax = pointer into shift[] - movl (%eax,%esi,4),%ebx ## ebx=shift[n] - - leal (%ebx,%ebx,2),%ebx ## ebx=3*is - movl %ebx,nb410_is3(%esp) ## store is3 - - movl nb410_shiftvec(%ebp),%eax ## eax = base of shiftvec[] - - movsd (%eax,%ebx,8),%xmm0 - movsd 8(%eax,%ebx,8),%xmm1 - movsd 16(%eax,%ebx,8),%xmm2 - - movl nb410_iinr(%ebp),%ecx ## ecx = pointer into iinr[] - movl (%ecx,%esi,4),%ebx ## ebx =ii - movl %ebx,nb410_ii(%esp) - - movl nb410_charge(%ebp),%edx - movsd (%edx,%ebx,8),%xmm3 - mulsd nb410_facel(%esp),%xmm3 - shufpd $0,%xmm3,%xmm3 - - movl nb410_invsqrta(%ebp),%edx ## load invsqrta[ii] - movsd (%edx,%ebx,8),%xmm4 - shufpd $0,%xmm4,%xmm4 - - movl nb410_type(%ebp),%edx - movl (%edx,%ebx,4),%edx - imull nb410_ntype(%esp),%edx - shll %edx - movl %edx,nb410_ntia(%esp) - - leal (%ebx,%ebx,2),%ebx ## ebx = 3*ii=ii3 - movl nb410_pos(%ebp),%eax ## eax = base of pos[] - - addsd (%eax,%ebx,8),%xmm0 - addsd 8(%eax,%ebx,8),%xmm1 - addsd 16(%eax,%ebx,8),%xmm2 - - movapd %xmm3,nb410_iq(%esp) - movapd %xmm4,nb410_isai(%esp) - - shufpd $0,%xmm0,%xmm0 - shufpd $0,%xmm1,%xmm1 - shufpd $0,%xmm2,%xmm2 - - movapd %xmm0,nb410_ix(%esp) - movapd %xmm1,nb410_iy(%esp) - movapd %xmm2,nb410_iz(%esp) - - movl %ebx,nb410_ii3(%esp) - - ## clear vctot and i forces - xorpd %xmm4,%xmm4 - movapd %xmm4,nb410_vctot(%esp) - movapd %xmm4,nb410_Vvdwtot(%esp) - movapd %xmm4,nb410_dvdasum(%esp) - movapd %xmm4,nb410_fix(%esp) - movapd %xmm4,nb410_fiy(%esp) - movapd %xmm4,nb410_fiz(%esp) - - movl nb410_jindex(%ebp),%eax - movl (%eax,%esi,4),%ecx ## jindex[n] - movl 4(%eax,%esi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movl nb410_pos(%ebp),%esi - movl nb410_faction(%ebp),%edi - movl nb410_jjnr(%ebp),%eax - shll $2,%ecx - addl %ecx,%eax - movl %eax,nb410_innerjjnr(%esp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $2,%edx - addl nb410_ninner(%esp),%ecx - movl %ecx,nb410_ninner(%esp) - addl $0,%edx - movl %edx,nb410_innerk(%esp) ## number of innerloop atoms - jge _nb_kernel410_ia32_sse2.nb410_unroll_loop - jmp _nb_kernel410_ia32_sse2.nb410_checksingle -_nb_kernel410_ia32_sse2.nb410_unroll_loop: - ## twice unrolled innerloop here - movl nb410_innerjjnr(%esp),%edx ## pointer to jjnr[k] - movl (%edx),%eax - movl 4(%edx),%ebx - addl $8,nb410_innerjjnr(%esp) ## advance pointer (unrolled 2) - - ## load isaj - movl nb410_invsqrta(%ebp),%esi - movlpd (%esi,%eax,8),%xmm2 - movhpd (%esi,%ebx,8),%xmm2 - mulpd nb410_isai(%esp),%xmm2 - movapd %xmm2,nb410_isaprod(%esp) - movapd %xmm2,%xmm1 - mulpd nb410_gbtsc(%esp),%xmm1 - movapd %xmm1,nb410_gbscale(%esp) - - movl nb410_charge(%ebp),%esi ## base of charge[] - movlpd (%esi,%eax,8),%xmm3 - movhpd (%esi,%ebx,8),%xmm3 - - mulpd nb410_iq(%esp),%xmm2 - mulpd %xmm2,%xmm3 - movapd %xmm3,nb410_qq(%esp) - - movd %eax,%mm0 ## use mmx registers as temp storage - movd %ebx,%mm1 - - movl nb410_type(%ebp),%esi - movl (%esi,%eax,4),%eax - movl (%esi,%ebx,4),%ebx - movl nb410_vdwparam(%ebp),%esi - shll %eax - shll %ebx - movl nb410_ntia(%esp),%edi - addl %edi,%eax - addl %edi,%ebx - - movlpd (%esi,%eax,8),%xmm6 ## c6a - movlpd (%esi,%ebx,8),%xmm7 ## c6b - movhpd 8(%esi,%eax,8),%xmm6 ## c6a c12a - movhpd 8(%esi,%ebx,8),%xmm7 ## c6b c12b - - movapd %xmm6,%xmm4 - unpcklpd %xmm7,%xmm4 - unpckhpd %xmm7,%xmm6 - - movd %mm0,%eax - movd %mm1,%ebx - movapd %xmm4,nb410_c6(%esp) - movapd %xmm6,nb410_c12(%esp) - - movl nb410_pos(%ebp),%esi ## base of pos[] - - movd %eax,%mm2 - movd %ebx,%mm3 - leal (%eax,%eax,2),%eax ## replace jnr with j3 - leal (%ebx,%ebx,2),%ebx - - ## move two coordinates to xmm0-xmm2 - movlpd (%esi,%eax,8),%xmm0 - movlpd 8(%esi,%eax,8),%xmm1 - movlpd 16(%esi,%eax,8),%xmm2 - movhpd (%esi,%ebx,8),%xmm0 - movhpd 8(%esi,%ebx,8),%xmm1 - movhpd 16(%esi,%ebx,8),%xmm2 - - ## move ix-iz to xmm4-xmm6 - movapd nb410_ix(%esp),%xmm4 - movapd nb410_iy(%esp),%xmm5 - movapd nb410_iz(%esp),%xmm6 - - ## calc dr - subpd %xmm0,%xmm4 - subpd %xmm1,%xmm5 - subpd %xmm2,%xmm6 - - ## store dr - movapd %xmm4,nb410_dx(%esp) - movapd %xmm5,nb410_dy(%esp) - movapd %xmm6,nb410_dz(%esp) - ## square it - mulpd %xmm4,%xmm4 - mulpd %xmm5,%xmm5 - mulpd %xmm6,%xmm6 - addpd %xmm5,%xmm4 - addpd %xmm6,%xmm4 - ## rsq in xmm4 - - cvtpd2ps %xmm4,%xmm5 - rsqrtps %xmm5,%xmm5 - cvtps2pd %xmm5,%xmm2 ## lu in low xmm2 - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulpd %xmm2,%xmm2 ## lu*lu - movapd nb410_three(%esp),%xmm1 - mulpd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb410_half(%esp),%xmm0 - subpd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm1 - mulpd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - movapd %xmm1,%xmm5 ## copy of lu - mulpd %xmm1,%xmm1 ## lu*lu - movapd nb410_three(%esp),%xmm2 - mulpd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb410_half(%esp),%xmm0 - subpd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm2 - mulpd %xmm2,%xmm0 ## xmm0=rinv - - mulpd %xmm0,%xmm4 ## xmm4=r - movapd %xmm4,nb410_r(%esp) - mulpd nb410_gbscale(%esp),%xmm4 - - cvttpd2pi %xmm4,%mm6 ## mm6 = lu idx - cvtpi2pd %mm6,%xmm5 - subpd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - movapd %xmm1,%xmm2 - mulpd %xmm2,%xmm2 ## xmm2=eps2 - - pslld $2,%mm6 ## idx *= 4 - - movd %eax,%mm0 - movd %ebx,%mm1 - - movl nb410_GBtab(%ebp),%esi - movd %mm6,%eax - psrlq $32,%mm6 - movd %mm6,%ebx ## indices in eax/ebx - - movapd (%esi,%eax,8),%xmm4 ## Y1 F1 - movapd (%esi,%ebx,8),%xmm3 ## Y2 F2 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 Y2 - unpckhpd %xmm3,%xmm5 ## F1 F2 - - movapd 16(%esi,%eax,8),%xmm6 ## G1 H1 - movapd 16(%esi,%ebx,8),%xmm3 ## G2 H2 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 G2 - unpckhpd %xmm3,%xmm7 ## H1 H2 - ## coulomb table ready, in xmm4-xmm7 - mulpd %xmm1,%xmm6 ## xmm6=Geps - mulpd %xmm2,%xmm7 ## xmm7=Heps2 - addpd %xmm6,%xmm5 - addpd %xmm7,%xmm5 ## xmm5=Fp - mulpd nb410_two(%esp),%xmm7 ## two*Heps2 - movapd nb410_qq(%esp),%xmm3 - addpd %xmm6,%xmm7 - addpd %xmm5,%xmm7 ## xmm7=FF - mulpd %xmm1,%xmm5 ## xmm5=eps*Fp - addpd %xmm4,%xmm5 ## xmm5=VV - mulpd %xmm3,%xmm5 ## vcoul=qq*VV - mulpd %xmm7,%xmm3 ## fijC=FF*qq - ## get jnr from regs - movd %mm2,%ecx - movd %mm3,%edx - movl nb410_dvda(%ebp),%esi - - ## Calculate dVda - xorpd %xmm7,%xmm7 - mulpd nb410_gbscale(%esp),%xmm3 - movapd %xmm3,%xmm6 - mulpd nb410_r(%esp),%xmm6 - addpd %xmm5,%xmm6 - addpd nb410_vctot(%esp),%xmm5 - movapd %xmm5,nb410_vctot(%esp) - - ## xmm6=(vcoul+fijC*r) - subpd %xmm6,%xmm7 - movapd %xmm7,%xmm6 - - ## update dvdasum - addpd nb410_dvdasum(%esp),%xmm7 - movapd %xmm7,nb410_dvdasum(%esp) - - ## update j atoms dvdaj - movhlps %xmm6,%xmm7 - addsd (%esi,%ecx,8),%xmm6 - addsd (%esi,%edx,8),%xmm7 - movsd %xmm6,(%esi,%ecx,8) - movsd %xmm7,(%esi,%edx,8) - - ## L-J - movapd %xmm0,%xmm4 - mulpd %xmm0,%xmm4 ## xmm4=rinvsq - - movapd %xmm4,%xmm6 - mulpd %xmm4,%xmm6 - - mulpd %xmm4,%xmm6 ## xmm6=rinvsix - movapd %xmm6,%xmm4 - mulpd %xmm4,%xmm4 ## xmm4=rinvtwelve - mulpd nb410_c6(%esp),%xmm6 - mulpd nb410_c12(%esp),%xmm4 - movapd nb410_Vvdwtot(%esp),%xmm7 - addpd %xmm4,%xmm7 - mulpd nb410_twelve(%esp),%xmm4 - subpd %xmm6,%xmm7 - mulpd nb410_six(%esp),%xmm6 - movapd %xmm7,nb410_Vvdwtot(%esp) - subpd %xmm6,%xmm4 - mulpd %xmm0,%xmm4 - subpd %xmm3,%xmm4 - mulpd %xmm0,%xmm4 - - movapd nb410_dx(%esp),%xmm0 - movapd nb410_dy(%esp),%xmm1 - movapd nb410_dz(%esp),%xmm2 - - movd %mm0,%eax - movd %mm1,%ebx - - movl nb410_faction(%ebp),%edi - mulpd %xmm4,%xmm0 - mulpd %xmm4,%xmm1 - mulpd %xmm4,%xmm2 - ## xmm0-xmm2 contains tx-tz (partial force) - ## now update f_i - movapd nb410_fix(%esp),%xmm3 - movapd nb410_fiy(%esp),%xmm4 - movapd nb410_fiz(%esp),%xmm5 - addpd %xmm0,%xmm3 - addpd %xmm1,%xmm4 - addpd %xmm2,%xmm5 - movapd %xmm3,nb410_fix(%esp) - movapd %xmm4,nb410_fiy(%esp) - movapd %xmm5,nb410_fiz(%esp) - ## the fj's - start by accumulating forces from memory - movlpd (%edi,%eax,8),%xmm3 - movlpd 8(%edi,%eax,8),%xmm4 - movlpd 16(%edi,%eax,8),%xmm5 - movhpd (%edi,%ebx,8),%xmm3 - movhpd 8(%edi,%ebx,8),%xmm4 - movhpd 16(%edi,%ebx,8),%xmm5 - subpd %xmm0,%xmm3 - subpd %xmm1,%xmm4 - subpd %xmm2,%xmm5 - movlpd %xmm3,(%edi,%eax,8) - movlpd %xmm4,8(%edi,%eax,8) - movlpd %xmm5,16(%edi,%eax,8) - movhpd %xmm3,(%edi,%ebx,8) - movhpd %xmm4,8(%edi,%ebx,8) - movhpd %xmm5,16(%edi,%ebx,8) - - ## should we do one more iteration? - subl $2,nb410_innerk(%esp) - jl _nb_kernel410_ia32_sse2.nb410_checksingle - jmp _nb_kernel410_ia32_sse2.nb410_unroll_loop -_nb_kernel410_ia32_sse2.nb410_checksingle: - movl nb410_innerk(%esp),%edx - andl $1,%edx - jnz _nb_kernel410_ia32_sse2.nb410_dosingle - jmp _nb_kernel410_ia32_sse2.nb410_updateouterdata -_nb_kernel410_ia32_sse2.nb410_dosingle: - movl nb410_charge(%ebp),%esi - movl nb410_invsqrta(%ebp),%edx - movl nb410_pos(%ebp),%edi - movl nb410_innerjjnr(%esp),%ecx - movl (%ecx),%eax - - xorpd %xmm6,%xmm6 - movapd %xmm6,%xmm7 - movsd (%edx,%eax,8),%xmm7 - movlpd (%esi,%eax,8),%xmm6 ## xmm6(0) has the charge - mulsd nb410_isai(%esp),%xmm7 - movapd %xmm7,nb410_isaprod(%esp) - movapd %xmm7,%xmm1 - mulpd nb410_gbtsc(%esp),%xmm1 - movapd %xmm1,nb410_gbscale(%esp) - - mulsd nb410_iq(%esp),%xmm7 - mulsd %xmm7,%xmm6 - movapd %xmm6,nb410_qq(%esp) - - movd %eax,%mm0 ## use mmx registers as temp storage - movl nb410_type(%ebp),%esi - movl (%esi,%eax,4),%eax - movl nb410_vdwparam(%ebp),%esi - shll %eax - movl nb410_ntia(%esp),%edi - addl %edi,%eax - - movlpd (%esi,%eax,8),%xmm6 ## c6a - movhpd 8(%esi,%eax,8),%xmm6 ## c6a c12a - xorpd %xmm7,%xmm7 - movapd %xmm6,%xmm4 - unpcklpd %xmm7,%xmm4 - unpckhpd %xmm7,%xmm6 - - movd %mm0,%eax - movapd %xmm4,nb410_c6(%esp) - movapd %xmm6,nb410_c12(%esp) - - movl nb410_pos(%ebp),%esi ## base of pos[] - - movd %eax,%mm2 - leal (%eax,%eax,2),%eax ## replace jnr with j3 - - ## move coordinates to xmm0-xmm2 - movlpd (%esi,%eax,8),%xmm0 - movlpd 8(%esi,%eax,8),%xmm1 - movlpd 16(%esi,%eax,8),%xmm2 - - ## move ix-iz to xmm4-xmm6 - movapd nb410_ix(%esp),%xmm4 - movapd nb410_iy(%esp),%xmm5 - movapd nb410_iz(%esp),%xmm6 - - ## calc dr - subsd %xmm0,%xmm4 - subsd %xmm1,%xmm5 - subsd %xmm2,%xmm6 - - ## store dr - movapd %xmm4,nb410_dx(%esp) - movapd %xmm5,nb410_dy(%esp) - movapd %xmm6,nb410_dz(%esp) - ## square it - mulsd %xmm4,%xmm4 - mulsd %xmm5,%xmm5 - mulsd %xmm6,%xmm6 - addsd %xmm5,%xmm4 - addsd %xmm6,%xmm4 - ## rsq in xmm4 - - cvtsd2ss %xmm4,%xmm5 - rsqrtss %xmm5,%xmm5 - cvtss2sd %xmm5,%xmm2 ## lu in low xmm2 - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulsd %xmm2,%xmm2 ## lu*lu - movapd nb410_three(%esp),%xmm1 - mulsd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb410_half(%esp),%xmm0 - subsd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm1 - mulsd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - movapd %xmm1,%xmm5 ## copy of lu - mulsd %xmm1,%xmm1 ## lu*lu - movapd nb410_three(%esp),%xmm2 - mulsd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb410_half(%esp),%xmm0 - subsd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm2 - mulsd %xmm2,%xmm0 ## xmm0=rinv - - mulsd %xmm0,%xmm4 ## xmm4=r - movapd %xmm4,nb410_r(%esp) - mulsd nb410_gbscale(%esp),%xmm4 - - movd %eax,%mm0 - cvttsd2si %xmm4,%eax ## mm6 = lu idx - cvtsi2sd %eax,%xmm5 - subsd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - movapd %xmm1,%xmm2 - mulsd %xmm2,%xmm2 ## xmm2=eps2 - - shll $2,%eax ## idx *= 4 - - movl nb410_GBtab(%ebp),%esi - - movapd (%esi,%eax,8),%xmm4 ## Y1 F1 - xorpd %xmm3,%xmm3 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 - unpckhpd %xmm3,%xmm5 ## F1 - - movapd 16(%esi,%eax,8),%xmm6 ## G1 H1 - xorpd %xmm3,%xmm3 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 - unpckhpd %xmm3,%xmm7 ## H1 - ## coulomb table ready, in xmm4-xmm7 - mulsd %xmm1,%xmm6 ## xmm6=Geps - mulsd %xmm2,%xmm7 ## xmm7=Heps2 - addsd %xmm6,%xmm5 - addsd %xmm7,%xmm5 ## xmm5=Fp - mulsd nb410_two(%esp),%xmm7 ## two*Heps2 - movapd nb410_qq(%esp),%xmm3 - addsd %xmm6,%xmm7 - addsd %xmm5,%xmm7 ## xmm7=FF - mulsd %xmm1,%xmm5 ## xmm5=eps*Fp - addsd %xmm4,%xmm5 ## xmm5=VV - mulsd %xmm3,%xmm5 ## vcoul=qq*VV - mulsd %xmm7,%xmm3 ## fijC=FF*qq - ## get jnr from regs - movd %mm2,%ebx - movl nb410_dvda(%ebp),%esi - - ## Calculate dVda - xorpd %xmm7,%xmm7 - mulsd nb410_gbscale(%esp),%xmm3 - movsd %xmm3,%xmm6 - mulsd nb410_r(%esp),%xmm6 - addsd %xmm5,%xmm6 - addsd nb410_vctot(%esp),%xmm5 - movsd %xmm5,nb410_vctot(%esp) - - ## xmm6=(vcoul+fijC*r) - subpd %xmm7,%xmm7 - movsd %xmm7,%xmm6 - - ## update dvdasum - addsd nb410_dvdasum(%esp),%xmm7 - movsd %xmm7,nb410_dvdasum(%esp) - - ## update j atoms dvdaj - addsd (%esi,%ebx,8),%xmm6 - movsd %xmm6,(%esi,%ebx,8) - - ## L-J - movapd %xmm0,%xmm4 - mulsd %xmm0,%xmm4 ## xmm4=rinvsq - - - movapd %xmm4,%xmm6 - mulsd %xmm4,%xmm6 - - mulsd %xmm4,%xmm6 ## xmm6=rinvsix - movapd %xmm6,%xmm4 - mulsd %xmm4,%xmm4 ## xmm4=rinvtwelve - mulsd nb410_c6(%esp),%xmm6 - mulsd nb410_c12(%esp),%xmm4 - movapd nb410_Vvdwtot(%esp),%xmm7 - addsd %xmm4,%xmm7 - mulsd nb410_twelve(%esp),%xmm4 - subsd %xmm6,%xmm7 - mulsd nb410_six(%esp),%xmm6 - movlpd %xmm7,nb410_Vvdwtot(%esp) - subsd %xmm6,%xmm4 - mulsd %xmm0,%xmm4 - subsd %xmm3,%xmm4 - mulsd %xmm0,%xmm4 - - movapd nb410_dx(%esp),%xmm0 - movapd nb410_dy(%esp),%xmm1 - movapd nb410_dz(%esp),%xmm2 - - movd %mm0,%eax - - movl nb410_faction(%ebp),%edi - mulsd %xmm4,%xmm0 - mulsd %xmm4,%xmm1 - mulsd %xmm4,%xmm2 - ## xmm0-xmm2 contains tx-tz (partial force) - ## now update f_i - movapd nb410_fix(%esp),%xmm3 - movapd nb410_fiy(%esp),%xmm4 - movapd nb410_fiz(%esp),%xmm5 - addsd %xmm0,%xmm3 - addsd %xmm1,%xmm4 - addsd %xmm2,%xmm5 - movlpd %xmm3,nb410_fix(%esp) - movlpd %xmm4,nb410_fiy(%esp) - movlpd %xmm5,nb410_fiz(%esp) - ## the fj's - start by accumulating forces from memory - movlpd (%edi,%eax,8),%xmm3 - movlpd 8(%edi,%eax,8),%xmm4 - movlpd 16(%edi,%eax,8),%xmm5 - subsd %xmm0,%xmm3 - subsd %xmm1,%xmm4 - subsd %xmm2,%xmm5 - movlpd %xmm3,(%edi,%eax,8) - movlpd %xmm4,8(%edi,%eax,8) - movlpd %xmm5,16(%edi,%eax,8) - -_nb_kernel410_ia32_sse2.nb410_updateouterdata: - movl nb410_ii3(%esp),%ecx - movl nb410_faction(%ebp),%edi - movl nb410_fshift(%ebp),%esi - movl nb410_is3(%esp),%edx - - ## accumulate i forces in xmm0, xmm1, xmm2 - movapd nb410_fix(%esp),%xmm0 - movapd nb410_fiy(%esp),%xmm1 - movapd nb410_fiz(%esp),%xmm2 - - movhlps %xmm0,%xmm3 - movhlps %xmm1,%xmm4 - movhlps %xmm2,%xmm5 - addsd %xmm3,%xmm0 - addsd %xmm4,%xmm1 - addsd %xmm5,%xmm2 ## sum is in low xmm0-xmm2 - - ## increment i force - movsd (%edi,%ecx,8),%xmm3 - movsd 8(%edi,%ecx,8),%xmm4 - movsd 16(%edi,%ecx,8),%xmm5 - addsd %xmm0,%xmm3 - addsd %xmm1,%xmm4 - addsd %xmm2,%xmm5 - movsd %xmm3,(%edi,%ecx,8) - movsd %xmm4,8(%edi,%ecx,8) - movsd %xmm5,16(%edi,%ecx,8) - - ## increment fshift force - movsd (%esi,%edx,8),%xmm3 - movsd 8(%esi,%edx,8),%xmm4 - movsd 16(%esi,%edx,8),%xmm5 - addsd %xmm0,%xmm3 - addsd %xmm1,%xmm4 - addsd %xmm2,%xmm5 - movsd %xmm3,(%esi,%edx,8) - movsd %xmm4,8(%esi,%edx,8) - movsd %xmm5,16(%esi,%edx,8) - - ## get n from stack - movl nb410_n(%esp),%esi - ## get group index for i particle - movl nb410_gid(%ebp),%edx ## base of gid[] - movl (%edx,%esi,4),%edx ## ggid=gid[n] - - ## accumulate total potential energy and update it - movapd nb410_vctot(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addsd %xmm6,%xmm7 ## low xmm7 has the sum now - - ## add earlier value from mem - movl nb410_Vc(%ebp),%eax - addsd (%eax,%edx,8),%xmm7 - ## move back to mem - movsd %xmm7,(%eax,%edx,8) - - ## accumulate total lj energy and update it - movapd nb410_Vvdwtot(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addsd %xmm6,%xmm7 ## low xmm7 has the sum now - - ## add earlier value from mem - movl nb410_Vvdw(%ebp),%eax - addsd (%eax,%edx,8),%xmm7 - ## move back to mem - movsd %xmm7,(%eax,%edx,8) - - ## accumulate dVda and update it - movapd nb410_dvdasum(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addsd %xmm6,%xmm7 ## low xmm7 has the sum now - - movl nb410_ii(%esp),%edx - movl nb410_dvda(%ebp),%eax - addsd (%eax,%edx,8),%xmm7 - movsd %xmm7,(%eax,%edx,8) - - ## finish if last - movl nb410_nn1(%esp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel410_ia32_sse2.nb410_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb410_n(%esp) - jmp _nb_kernel410_ia32_sse2.nb410_outer -_nb_kernel410_ia32_sse2.nb410_outerend: - ## check if more outer neighborlists remain - movl nb410_nri(%esp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel410_ia32_sse2.nb410_end - ## non-zero, do one more workunit - jmp _nb_kernel410_ia32_sse2.nb410_threadloop -_nb_kernel410_ia32_sse2.nb410_end: - emms - - movl nb410_nouter(%esp),%eax - movl nb410_ninner(%esp),%ebx - movl nb410_outeriter(%ebp),%ecx - movl nb410_inneriter(%ebp),%edx - movl %eax,(%ecx) - movl %ebx,(%edx) - - movl nb410_salign(%esp),%eax - addl %eax,%esp - addl $496,%esp - popl %edi - popl %esi - popl %edx - popl %ecx - popl %ebx - popl %eax - leave - ret - - - - - - - -.globl nb_kernel410nf_ia32_sse2 -.globl _nb_kernel410nf_ia32_sse2 -nb_kernel410nf_ia32_sse2: -_nb_kernel410nf_ia32_sse2: -.set nb410nf_p_nri, 8 -.set nb410nf_iinr, 12 -.set nb410nf_jindex, 16 -.set nb410nf_jjnr, 20 -.set nb410nf_shift, 24 -.set nb410nf_shiftvec, 28 -.set nb410nf_fshift, 32 -.set nb410nf_gid, 36 -.set nb410nf_pos, 40 -.set nb410nf_faction, 44 -.set nb410nf_charge, 48 -.set nb410nf_p_facel, 52 -.set nb410nf_argkrf, 56 -.set nb410nf_argcrf, 60 -.set nb410nf_Vc, 64 -.set nb410nf_type, 68 -.set nb410nf_p_ntype, 72 -.set nb410nf_vdwparam, 76 -.set nb410nf_Vvdw, 80 -.set nb410nf_p_tabscale, 84 -.set nb410nf_VFtab, 88 -.set nb410nf_invsqrta, 92 -.set nb410nf_dvda, 96 -.set nb410nf_p_gbtabscale, 100 -.set nb410nf_GBtab, 104 -.set nb410nf_p_nthreads, 108 -.set nb410nf_count, 112 -.set nb410nf_mtx, 116 -.set nb410nf_outeriter, 120 -.set nb410nf_inneriter, 124 -.set nb410nf_work, 128 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse2 use -.set nb410nf_ix, 0 -.set nb410nf_iy, 16 -.set nb410nf_iz, 32 -.set nb410nf_iq, 48 -.set nb410nf_two, 64 -.set nb410nf_gbtsc, 80 -.set nb410nf_qq, 96 -.set nb410nf_c6, 112 -.set nb410nf_c12, 128 -.set nb410nf_vctot, 144 -.set nb410nf_Vvdwtot, 160 -.set nb410nf_half, 176 -.set nb410nf_three, 192 -.set nb410nf_r, 208 -.set nb410nf_isai, 224 -.set nb410nf_isaprod, 240 -.set nb410nf_gbscale, 256 -.set nb410nf_ii, 272 -.set nb410nf_is3, 276 -.set nb410nf_ii3, 280 -.set nb410nf_ntia, 284 -.set nb410nf_innerjjnr, 288 -.set nb410nf_innerk, 292 -.set nb410nf_n, 296 -.set nb410nf_nn1, 300 -.set nb410nf_nri, 304 -.set nb410nf_facel, 312 ## uses 8 bytes -.set nb410nf_ntype, 320 -.set nb410nf_nouter, 324 -.set nb410nf_ninner, 328 -.set nb410nf_salign, 332 - pushl %ebp - movl %esp,%ebp - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - pushl %esi - pushl %edi - subl $336,%esp ## local stack space - movl %esp,%eax - andl $0xf,%eax - subl %eax,%esp - movl %eax,nb410nf_salign(%esp) - - emms - - ## Move args passed by reference to stack - movl nb410nf_p_nri(%ebp),%ecx - movl nb410nf_p_facel(%ebp),%esi - movl nb410nf_p_ntype(%ebp),%edi - movl (%ecx),%ecx - movsd (%esi),%xmm7 - movl (%edi),%edi - movl %ecx,nb410nf_nri(%esp) - movsd %xmm7,nb410nf_facel(%esp) - movl %edi,nb410nf_ntype(%esp) - - ## zero iteration counters - movl $0,%eax - movl %eax,nb410nf_nouter(%esp) - movl %eax,nb410nf_ninner(%esp) - - - movl nb410nf_p_gbtabscale(%ebp),%eax - movsd (%eax),%xmm5 - shufpd $0,%xmm5,%xmm5 - movapd %xmm5,nb410nf_gbtsc(%esp) - ## create constant floating-point factors on stack - movl $0x00000000,%eax ## lower half of double 0.5 IEEE (hex) - movl $0x3fe00000,%ebx - movl %eax,nb410nf_half(%esp) - movl %ebx,nb410nf_half+4(%esp) - movsd nb410nf_half(%esp),%xmm1 - shufpd $0,%xmm1,%xmm1 ## splat to all elements - movapd %xmm1,%xmm3 - addpd %xmm3,%xmm3 ## 1.0 - movapd %xmm3,%xmm2 - addpd %xmm2,%xmm2 ## 2.0 - addpd %xmm2,%xmm3 ## 3.0 - movapd %xmm1,nb410nf_half(%esp) - movapd %xmm2,nb410nf_two(%esp) - movapd %xmm3,nb410nf_three(%esp) - -_nb_kernel410nf_ia32_sse2.nb410nf_threadloop: - movl nb410nf_count(%ebp),%esi ## pointer to sync counter - movl (%esi),%eax -_nb_kernel410nf_ia32_sse2.nb410nf_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%esi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel410nf_ia32_sse2.nb410nf_spinlock - - ## if(nn1>nri) nn1=nri - movl nb410nf_nri(%esp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb410nf_n(%esp) - movl %ebx,nb410nf_nn1(%esp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel410nf_ia32_sse2.nb410nf_outerstart - jmp _nb_kernel410nf_ia32_sse2.nb410nf_end - -_nb_kernel410nf_ia32_sse2.nb410nf_outerstart: - ## ebx contains number of outer iterations - addl nb410nf_nouter(%esp),%ebx - movl %ebx,nb410nf_nouter(%esp) - -_nb_kernel410nf_ia32_sse2.nb410nf_outer: - movl nb410nf_shift(%ebp),%eax ## eax = pointer into shift[] - movl (%eax,%esi,4),%ebx ## ebx=shift[n] - - leal (%ebx,%ebx,2),%ebx ## ebx=3*is - movl %ebx,nb410nf_is3(%esp) ## store is3 - - movl nb410nf_shiftvec(%ebp),%eax ## eax = base of shiftvec[] - - movsd (%eax,%ebx,8),%xmm0 - movsd 8(%eax,%ebx,8),%xmm1 - movsd 16(%eax,%ebx,8),%xmm2 - - movl nb410nf_iinr(%ebp),%ecx ## ecx = pointer into iinr[] - movl (%ecx,%esi,4),%ebx ## ebx =ii - movl %ebx,nb410nf_ii(%esp) - - movl nb410nf_charge(%ebp),%edx - movsd (%edx,%ebx,8),%xmm3 - mulsd nb410nf_facel(%esp),%xmm3 - shufpd $0,%xmm3,%xmm3 - - movl nb410nf_invsqrta(%ebp),%edx ## load invsqrta[ii] - movsd (%edx,%ebx,8),%xmm4 - shufpd $0,%xmm4,%xmm4 - - movl nb410nf_type(%ebp),%edx - movl (%edx,%ebx,4),%edx - imull nb410nf_ntype(%esp),%edx - shll %edx - movl %edx,nb410nf_ntia(%esp) - - leal (%ebx,%ebx,2),%ebx ## ebx = 3*ii=ii3 - movl nb410nf_pos(%ebp),%eax ## eax = base of pos[] - - addsd (%eax,%ebx,8),%xmm0 - addsd 8(%eax,%ebx,8),%xmm1 - addsd 16(%eax,%ebx,8),%xmm2 - - movapd %xmm3,nb410nf_iq(%esp) - movapd %xmm4,nb410nf_isai(%esp) - - shufpd $0,%xmm0,%xmm0 - shufpd $0,%xmm1,%xmm1 - shufpd $0,%xmm2,%xmm2 - - movapd %xmm0,nb410nf_ix(%esp) - movapd %xmm1,nb410nf_iy(%esp) - movapd %xmm2,nb410nf_iz(%esp) - - movl %ebx,nb410nf_ii3(%esp) - - ## clear vctot and Vvdwtot - xorpd %xmm4,%xmm4 - movapd %xmm4,nb410nf_vctot(%esp) - movapd %xmm4,nb410nf_Vvdwtot(%esp) - - movl nb410nf_jindex(%ebp),%eax - movl (%eax,%esi,4),%ecx ## jindex[n] - movl 4(%eax,%esi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movl nb410nf_pos(%ebp),%esi - movl nb410nf_faction(%ebp),%edi - movl nb410nf_jjnr(%ebp),%eax - shll $2,%ecx - addl %ecx,%eax - movl %eax,nb410nf_innerjjnr(%esp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $2,%edx - addl nb410nf_ninner(%esp),%ecx - movl %ecx,nb410nf_ninner(%esp) - addl $0,%edx - movl %edx,nb410nf_innerk(%esp) ## number of innerloop atoms - jge _nb_kernel410nf_ia32_sse2.nb410nf_unroll_loop - jmp _nb_kernel410nf_ia32_sse2.nb410nf_checksingle -_nb_kernel410nf_ia32_sse2.nb410nf_unroll_loop: - ## twice unrolled innerloop here - movl nb410nf_innerjjnr(%esp),%edx ## pointer to jjnr[k] - movl (%edx),%eax - movl 4(%edx),%ebx - addl $8,nb410nf_innerjjnr(%esp) ## advance pointer (unrolled 2) - - ## load isaj - movl nb410nf_invsqrta(%ebp),%esi - movlpd (%esi,%eax,8),%xmm2 - movhpd (%esi,%ebx,8),%xmm2 - mulpd nb410nf_isai(%esp),%xmm2 - movapd %xmm2,nb410nf_isaprod(%esp) - movapd %xmm2,%xmm1 - mulpd nb410nf_gbtsc(%esp),%xmm1 - movapd %xmm1,nb410nf_gbscale(%esp) - - movl nb410nf_charge(%ebp),%esi ## base of charge[] - movlpd (%esi,%eax,8),%xmm3 - movhpd (%esi,%ebx,8),%xmm3 - - mulpd nb410nf_iq(%esp),%xmm2 - mulpd %xmm2,%xmm3 - movapd %xmm3,nb410nf_qq(%esp) - - movd %eax,%mm0 ## use mmx registers as temp storage - movd %ebx,%mm1 - - movl nb410nf_type(%ebp),%esi - movl (%esi,%eax,4),%eax - movl (%esi,%ebx,4),%ebx - movl nb410nf_vdwparam(%ebp),%esi - shll %eax - shll %ebx - movl nb410nf_ntia(%esp),%edi - addl %edi,%eax - addl %edi,%ebx - - movlpd (%esi,%eax,8),%xmm6 ## c6a - movlpd (%esi,%ebx,8),%xmm7 ## c6b - movhpd 8(%esi,%eax,8),%xmm6 ## c6a c12a - movhpd 8(%esi,%ebx,8),%xmm7 ## c6b c12b - - movapd %xmm6,%xmm4 - unpcklpd %xmm7,%xmm4 - unpckhpd %xmm7,%xmm6 - - movd %mm0,%eax - movd %mm1,%ebx - movapd %xmm4,nb410nf_c6(%esp) - movapd %xmm6,nb410nf_c12(%esp) - - movl nb410nf_pos(%ebp),%esi ## base of pos[] - - movd %eax,%mm2 - movd %ebx,%mm3 - leal (%eax,%eax,2),%eax ## replace jnr with j3 - leal (%ebx,%ebx,2),%ebx - - ## move two coordinates to xmm0-xmm2 - movlpd (%esi,%eax,8),%xmm0 - movlpd 8(%esi,%eax,8),%xmm1 - movlpd 16(%esi,%eax,8),%xmm2 - movhpd (%esi,%ebx,8),%xmm0 - movhpd 8(%esi,%ebx,8),%xmm1 - movhpd 16(%esi,%ebx,8),%xmm2 - - ## move ix-iz to xmm4-xmm6 - movapd nb410nf_ix(%esp),%xmm4 - movapd nb410nf_iy(%esp),%xmm5 - movapd nb410nf_iz(%esp),%xmm6 - - ## calc dr - subpd %xmm0,%xmm4 - subpd %xmm1,%xmm5 - subpd %xmm2,%xmm6 - - ## square dr - mulpd %xmm4,%xmm4 - mulpd %xmm5,%xmm5 - mulpd %xmm6,%xmm6 - addpd %xmm5,%xmm4 - addpd %xmm6,%xmm4 - ## rsq in xmm4 - - cvtpd2ps %xmm4,%xmm5 - rsqrtps %xmm5,%xmm5 - cvtps2pd %xmm5,%xmm2 ## lu in low xmm2 - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulpd %xmm2,%xmm2 ## lu*lu - movapd nb410nf_three(%esp),%xmm1 - mulpd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb410nf_half(%esp),%xmm0 - subpd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm1 - mulpd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - movapd %xmm1,%xmm5 ## copy of lu - mulpd %xmm1,%xmm1 ## lu*lu - movapd nb410nf_three(%esp),%xmm2 - mulpd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb410nf_half(%esp),%xmm0 - subpd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm2 - mulpd %xmm2,%xmm0 ## xmm0=rinv - - mulpd %xmm0,%xmm4 ## xmm4=r - movapd %xmm4,nb410nf_r(%esp) - mulpd nb410nf_gbscale(%esp),%xmm4 - - cvttpd2pi %xmm4,%mm6 ## mm6 = lu idx - cvtpi2pd %mm6,%xmm5 - subpd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - movapd %xmm1,%xmm2 - mulpd %xmm2,%xmm2 ## xmm2=eps2 - - pslld $2,%mm6 ## idx *= 4 - - movd %eax,%mm0 - movd %ebx,%mm1 - - movl nb410nf_GBtab(%ebp),%esi - movd %mm6,%eax - psrlq $32,%mm6 - movd %mm6,%ebx ## indices in eax/ebx - - movapd (%esi,%eax,8),%xmm4 ## Y1 F1 - movapd (%esi,%ebx,8),%xmm3 ## Y2 F2 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 Y2 - unpckhpd %xmm3,%xmm5 ## F1 F2 - - movapd 16(%esi,%eax,8),%xmm6 ## G1 H1 - movapd 16(%esi,%ebx,8),%xmm3 ## G2 H2 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 G2 - unpckhpd %xmm3,%xmm7 ## H1 H2 - ## coulomb table ready, in xmm4-xmm7 - mulpd %xmm1,%xmm6 ## xmm6=Geps - mulpd %xmm2,%xmm7 ## xmm7=Heps2 - addpd %xmm6,%xmm5 - addpd %xmm7,%xmm5 ## xmm5=Fp - movapd nb410nf_qq(%esp),%xmm3 - mulpd %xmm1,%xmm5 ## xmm5=eps*Fp - addpd %xmm4,%xmm5 ## xmm5=VV - mulpd %xmm3,%xmm5 ## vcoul=qq*VV - - addpd nb410nf_vctot(%esp),%xmm5 - movapd %xmm5,nb410nf_vctot(%esp) - - ## L-J - movapd %xmm0,%xmm4 - mulpd %xmm0,%xmm4 ## xmm4=rinvsq - - movapd %xmm4,%xmm6 - mulpd %xmm4,%xmm6 - - mulpd %xmm4,%xmm6 ## xmm6=rinvsix - movapd %xmm6,%xmm4 - mulpd %xmm4,%xmm4 ## xmm4=rinvtwelve - mulpd nb410nf_c6(%esp),%xmm6 - mulpd nb410nf_c12(%esp),%xmm4 - movapd nb410nf_Vvdwtot(%esp),%xmm7 - addpd %xmm4,%xmm7 - subpd %xmm6,%xmm7 - movapd %xmm7,nb410nf_Vvdwtot(%esp) - - ## should we do one more iteration? - subl $2,nb410nf_innerk(%esp) - jl _nb_kernel410nf_ia32_sse2.nb410nf_checksingle - jmp _nb_kernel410nf_ia32_sse2.nb410nf_unroll_loop -_nb_kernel410nf_ia32_sse2.nb410nf_checksingle: - movl nb410nf_innerk(%esp),%edx - andl $1,%edx - jnz _nb_kernel410nf_ia32_sse2.nb410nf_dosingle - jmp _nb_kernel410nf_ia32_sse2.nb410nf_updateouterdata -_nb_kernel410nf_ia32_sse2.nb410nf_dosingle: - movl nb410nf_charge(%ebp),%esi - movl nb410nf_invsqrta(%ebp),%edx - movl nb410nf_pos(%ebp),%edi - movl nb410nf_innerjjnr(%esp),%ecx - movl (%ecx),%eax - - xorpd %xmm6,%xmm6 - movapd %xmm6,%xmm7 - movsd (%edx,%eax,8),%xmm7 - movlpd (%esi,%eax,8),%xmm6 ## xmm6(0) has the charge - mulsd nb410nf_isai(%esp),%xmm7 - movapd %xmm7,nb410nf_isaprod(%esp) - movapd %xmm7,%xmm1 - mulpd nb410nf_gbtsc(%esp),%xmm1 - movapd %xmm1,nb410nf_gbscale(%esp) - - mulsd nb410nf_iq(%esp),%xmm7 - mulsd %xmm7,%xmm6 - movapd %xmm6,nb410nf_qq(%esp) - - movd %eax,%mm0 ## use mmx registers as temp storage - movl nb410nf_type(%ebp),%esi - movl (%esi,%eax,4),%eax - movl nb410nf_vdwparam(%ebp),%esi - shll %eax - movl nb410nf_ntia(%esp),%edi - addl %edi,%eax - - movlpd (%esi,%eax,8),%xmm6 ## c6a - movhpd 8(%esi,%eax,8),%xmm6 ## c6a c12a - - xorpd %xmm7,%xmm7 - movapd %xmm6,%xmm4 - unpcklpd %xmm7,%xmm4 - unpckhpd %xmm7,%xmm6 - - movd %mm0,%eax - movapd %xmm4,nb410nf_c6(%esp) - movapd %xmm6,nb410nf_c12(%esp) - - movl nb410nf_pos(%ebp),%esi ## base of pos[] - - movd %eax,%mm2 - leal (%eax,%eax,2),%eax ## replace jnr with j3 - - ## move coordinates to xmm0-xmm2 - movlpd (%esi,%eax,8),%xmm0 - movlpd 8(%esi,%eax,8),%xmm1 - movlpd 16(%esi,%eax,8),%xmm2 - - ## move ix-iz to xmm4-xmm6 - movapd nb410nf_ix(%esp),%xmm4 - movapd nb410nf_iy(%esp),%xmm5 - movapd nb410nf_iz(%esp),%xmm6 - - ## calc dr - subsd %xmm0,%xmm4 - subsd %xmm1,%xmm5 - subsd %xmm2,%xmm6 - - ## square it - mulsd %xmm4,%xmm4 - mulsd %xmm5,%xmm5 - mulsd %xmm6,%xmm6 - addsd %xmm5,%xmm4 - addsd %xmm6,%xmm4 - ## rsq in xmm4 - - cvtsd2ss %xmm4,%xmm5 - rsqrtss %xmm5,%xmm5 - cvtss2sd %xmm5,%xmm2 ## lu in low xmm2 - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulsd %xmm2,%xmm2 ## lu*lu - movapd nb410nf_three(%esp),%xmm1 - mulsd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb410nf_half(%esp),%xmm0 - subsd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm1 - mulsd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - movapd %xmm1,%xmm5 ## copy of lu - mulsd %xmm1,%xmm1 ## lu*lu - movapd nb410nf_three(%esp),%xmm2 - mulsd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb410nf_half(%esp),%xmm0 - subsd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm2 - mulsd %xmm2,%xmm0 ## xmm0=rinv - - mulsd %xmm0,%xmm4 ## xmm4=r - movapd %xmm4,nb410nf_r(%esp) - mulsd nb410nf_gbscale(%esp),%xmm4 - - movd %eax,%mm0 - cvttsd2si %xmm4,%eax ## mm6 = lu idx - cvtsi2sd %eax,%xmm5 - subsd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - movapd %xmm1,%xmm2 - mulsd %xmm2,%xmm2 ## xmm2=eps2 - - shll $2,%eax ## idx *= 4 - - movl nb410nf_GBtab(%ebp),%esi - - movapd (%esi,%eax,8),%xmm4 ## Y1 F1 - xorpd %xmm3,%xmm3 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 - unpckhpd %xmm3,%xmm5 ## F1 - - movapd 16(%esi,%eax,8),%xmm6 ## G1 H1 - xorpd %xmm3,%xmm3 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 - unpckhpd %xmm3,%xmm7 ## H1 - ## coulomb table ready, in xmm4-xmm7 - mulsd %xmm1,%xmm6 ## xmm6=Geps - mulsd %xmm2,%xmm7 ## xmm7=Heps2 - addsd %xmm6,%xmm5 - addsd %xmm7,%xmm5 ## xmm5=Fp - movapd nb410nf_qq(%esp),%xmm3 - mulsd %xmm1,%xmm5 ## xmm5=eps*Fp - addsd %xmm4,%xmm5 ## xmm5=VV - mulsd %xmm3,%xmm5 ## vcoul=qq*VV - - addsd nb410nf_vctot(%esp),%xmm5 - movsd %xmm5,nb410nf_vctot(%esp) - - ## L-J - movapd %xmm0,%xmm4 - mulsd %xmm0,%xmm4 ## xmm4=rinvsq - - - movapd %xmm4,%xmm6 - mulsd %xmm4,%xmm6 - - mulsd %xmm4,%xmm6 ## xmm6=rinvsix - movapd %xmm6,%xmm4 - mulsd %xmm4,%xmm4 ## xmm4=rinvtwelve - mulsd nb410nf_c6(%esp),%xmm6 - mulsd nb410nf_c12(%esp),%xmm4 - movapd nb410nf_Vvdwtot(%esp),%xmm7 - addsd %xmm4,%xmm7 - subsd %xmm6,%xmm7 - movlpd %xmm7,nb410nf_Vvdwtot(%esp) - -_nb_kernel410nf_ia32_sse2.nb410nf_updateouterdata: - movl nb410nf_ii3(%esp),%ecx - movl nb410nf_is3(%esp),%edx - - ## get n from stack - movl nb410nf_n(%esp),%esi - ## get group index for i particle - movl nb410nf_gid(%ebp),%edx ## base of gid[] - movl (%edx,%esi,4),%edx ## ggid=gid[n] - - ## accumulate total potential energy and update it - movapd nb410nf_vctot(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addsd %xmm6,%xmm7 ## low xmm7 has the sum now - - ## add earlier value from mem - movl nb410nf_Vc(%ebp),%eax - addsd (%eax,%edx,8),%xmm7 - ## move back to mem - movsd %xmm7,(%eax,%edx,8) - - ## accumulate total lj energy and update it - movapd nb410nf_Vvdwtot(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addsd %xmm6,%xmm7 ## low xmm7 has the sum now - - ## add earlier value from mem - movl nb410nf_Vvdw(%ebp),%eax - addsd (%eax,%edx,8),%xmm7 - ## move back to mem - movsd %xmm7,(%eax,%edx,8) - - ## finish if last - movl nb410nf_nn1(%esp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel410nf_ia32_sse2.nb410nf_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb410nf_n(%esp) - jmp _nb_kernel410nf_ia32_sse2.nb410nf_outer -_nb_kernel410nf_ia32_sse2.nb410nf_outerend: - ## check if more outer neighborlists remain - movl nb410nf_nri(%esp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel410nf_ia32_sse2.nb410nf_end - ## non-zero, do one more workunit - jmp _nb_kernel410nf_ia32_sse2.nb410nf_threadloop -_nb_kernel410nf_ia32_sse2.nb410nf_end: - emms - - movl nb410nf_nouter(%esp),%eax - movl nb410nf_ninner(%esp),%ebx - movl nb410nf_outeriter(%ebp),%ecx - movl nb410nf_inneriter(%ebp),%edx - movl %eax,(%ecx) - movl %ebx,(%edx) - - movl nb410nf_salign(%esp),%eax - addl %eax,%esp - addl $336,%esp - popl %edi - popl %esi - popl %edx - popl %ecx - popl %ebx - popl %eax - leave - ret - - - diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel430_ia32_sse2.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel430_ia32_sse2.intel_syntax.s deleted file mode 100644 index 30eb5c3cbb..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel430_ia32_sse2.intel_syntax.s +++ /dev/null @@ -1,1714 +0,0 @@ -;# -;# -;# Gromacs 4.0 Copyright (c) 1991-2003 -;# David van der Spoel, Erik Lindahl -;# -;# This program is free software; you can redistribute it and/or -;# modify it under the terms of the GNU General Public License -;# as published by the Free Software Foundation; either version 2 -;# of the License, or (at your option) any later version. -;# -;# To help us fund GROMACS development, we humbly ask that you cite -;# the research papers on the package. Check out http://www.gromacs.org -;# -;# And Hey: -;# Gnomes, ROck Monsters And Chili Sauce -;# - -;# These files require GNU binutils 2.10 or later, since we -;# use intel syntax for portability, or a recent version -;# of NASM that understands Extended 3DNow and SSE2 instructions. -;# (NASM is normally only used with MS Visual C++). -;# Since NASM and gnu as disagree on some definitions and use -;# completely different preprocessing options I have to introduce a -;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86. -;# Gnu as treats ';' as a line break, i.e. ignores it. This is the -;# reason why all comments need both symbols... -;# The source is written for GNU as, with intel syntax. When you use -;# NASM we redefine a couple of things. The false if-statement around -;# the following code is seen by GNU as, but NASM doesn't see it, so -;# the code inside is read by NASM but not gcc. - -; .if 0 # block below only read by NASM -%define .section section -%define .long dd -%define .align align -%define .globl global -;# NASM only wants 'dword', not 'dword ptr'. -%define ptr -%macro .equiv 2 - %1 equ %2 -%endmacro -; .endif # End of NASM-specific block -; .intel_syntax noprefix # Line only read by gnu as - - -.globl nb_kernel430_ia32_sse2 -.globl _nb_kernel430_ia32_sse2 -nb_kernel430_ia32_sse2: -_nb_kernel430_ia32_sse2: -.equiv nb430_p_nri, 8 -.equiv nb430_iinr, 12 -.equiv nb430_jindex, 16 -.equiv nb430_jjnr, 20 -.equiv nb430_shift, 24 -.equiv nb430_shiftvec, 28 -.equiv nb430_fshift, 32 -.equiv nb430_gid, 36 -.equiv nb430_pos, 40 -.equiv nb430_faction, 44 -.equiv nb430_charge, 48 -.equiv nb430_p_facel, 52 -.equiv nb430_argkrf, 56 -.equiv nb430_argcrf, 60 -.equiv nb430_Vc, 64 -.equiv nb430_type, 68 -.equiv nb430_p_ntype, 72 -.equiv nb430_vdwparam, 76 -.equiv nb430_Vvdw, 80 -.equiv nb430_p_tabscale, 84 -.equiv nb430_VFtab, 88 -.equiv nb430_invsqrta, 92 -.equiv nb430_dvda, 96 -.equiv nb430_p_gbtabscale, 100 -.equiv nb430_GBtab, 104 -.equiv nb430_p_nthreads, 108 -.equiv nb430_count, 112 -.equiv nb430_mtx, 116 -.equiv nb430_outeriter, 120 -.equiv nb430_inneriter, 124 -.equiv nb430_work, 128 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse2 use -.equiv nb430_ix, 0 -.equiv nb430_iy, 16 -.equiv nb430_iz, 32 -.equiv nb430_iq, 48 -.equiv nb430_dx, 64 -.equiv nb430_dy, 80 -.equiv nb430_dz, 96 -.equiv nb430_two, 112 -.equiv nb430_gbtsc, 128 -.equiv nb430_tsc, 144 -.equiv nb430_qq, 160 -.equiv nb430_c6, 176 -.equiv nb430_c12, 192 -.equiv nb430_fscal, 208 -.equiv nb430_vctot, 224 -.equiv nb430_Vvdwtot, 240 -.equiv nb430_fix, 256 -.equiv nb430_fiy, 272 -.equiv nb430_fiz, 288 -.equiv nb430_half, 304 -.equiv nb430_three, 320 -.equiv nb430_r, 336 -.equiv nb430_isai, 352 -.equiv nb430_isaprod, 368 -.equiv nb430_dvdasum, 384 -.equiv nb430_gbscale, 400 -.equiv nb430_ii, 416 -.equiv nb430_is3, 420 -.equiv nb430_ii3, 424 -.equiv nb430_ntia, 428 -.equiv nb430_innerjjnr, 432 -.equiv nb430_innerk, 436 -.equiv nb430_n, 440 -.equiv nb430_nn1, 444 -.equiv nb430_nri, 448 -.equiv nb430_facel, 456 ;# uses 8 bytes -.equiv nb430_ntype, 464 -.equiv nb430_nouter, 468 -.equiv nb430_ninner, 472 -.equiv nb430_salign, 476 - push ebp - mov ebp,esp - push eax - push ebx - push ecx - push edx - push esi - push edi - sub esp, 484 ;# local stack space - mov eax, esp - and eax, 0xf - sub esp, eax - mov [esp + nb430_salign], eax - - emms - - ;# Move args passed by reference to stack - mov ecx, [ebp + nb430_p_nri] - mov esi, [ebp + nb430_p_facel] - mov edi, [ebp + nb430_p_ntype] - mov ecx, [ecx] - movsd xmm7, [esi] - mov edi, [edi] - mov [esp + nb430_nri], ecx - movsd [esp + nb430_facel], xmm7 - mov [esp + nb430_ntype], edi - - ;# zero iteration counters - mov eax, 0 - mov [esp + nb430_nouter], eax - mov [esp + nb430_ninner], eax - - - ;# create constant floating-point factors on stack - mov eax, 0x00000000 ;# lower half of double 0.5 IEEE (hex) - mov ebx, 0x3fe00000 - mov [esp + nb430_half], eax - mov [esp + nb430_half+4], ebx - movsd xmm1, [esp + nb430_half] - shufpd xmm1, xmm1, 0 ;# splat to all elements - movapd xmm3, xmm1 - addpd xmm3, xmm3 ;# 1.0 - movapd xmm2, xmm3 - addpd xmm2, xmm2 ;# 2.0 - addpd xmm3, xmm2 ;# 3.0 - movapd [esp + nb430_half], xmm1 - movapd [esp + nb430_two], xmm2 - movapd [esp + nb430_three], xmm3 - mov eax, [ebp + nb430_p_tabscale] - movsd xmm3, [eax] - mov eax, [ebp + nb430_p_gbtabscale] - movsd xmm4, [eax] - shufpd xmm3, xmm3, 0 - shufpd xmm4, xmm4, 0 - movapd [esp + nb430_tsc], xmm3 - movapd [esp + nb430_gbtsc], xmm4 - -.nb430_threadloop: - mov esi, [ebp + nb430_count] ;# pointer to sync counter - mov eax, [esi] -.nb430_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb430_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [esp + nb430_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [esp + nb430_n], eax - mov [esp + nb430_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb430_outerstart - jmp .nb430_end - -.nb430_outerstart: - ;# ebx contains number of outer iterations - add ebx, [esp + nb430_nouter] - mov [esp + nb430_nouter], ebx - -.nb430_outer: - mov eax, [ebp + nb430_shift] ;# eax = pointer into shift[] - mov ebx, [eax+esi*4] ;# ebx=shift[n] - - lea ebx, [ebx + ebx*2] ;# ebx=3*is - mov [esp + nb430_is3],ebx ;# store is3 - - mov eax, [ebp + nb430_shiftvec] ;# eax = base of shiftvec[] - - movsd xmm0, [eax + ebx*8] - movsd xmm1, [eax + ebx*8 + 8] - movsd xmm2, [eax + ebx*8 + 16] - - mov ecx, [ebp + nb430_iinr] ;# ecx = pointer into iinr[] - mov ebx, [ecx+esi*4] ;# ebx =ii - mov [esp + nb430_ii], ebx - - mov edx, [ebp + nb430_charge] - movsd xmm3, [edx + ebx*8] - mulsd xmm3, [esp + nb430_facel] - shufpd xmm3, xmm3, 0 - - mov edx, [ebp + nb430_invsqrta] ;# load invsqrta[ii] - movsd xmm4, [edx + ebx*8] - shufpd xmm4, xmm4, 0 - - mov edx, [ebp + nb430_type] - mov edx, [edx + ebx*4] - imul edx, [esp + nb430_ntype] - shl edx, 1 - mov [esp + nb430_ntia], edx - - lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3 - mov eax, [ebp + nb430_pos] ;# eax = base of pos[] - - addsd xmm0, [eax + ebx*8] - addsd xmm1, [eax + ebx*8 + 8] - addsd xmm2, [eax + ebx*8 + 16] - - movapd [esp + nb430_iq], xmm3 - movapd [esp + nb430_isai], xmm4 - - shufpd xmm0, xmm0, 0 - shufpd xmm1, xmm1, 0 - shufpd xmm2, xmm2, 0 - - movapd [esp + nb430_ix], xmm0 - movapd [esp + nb430_iy], xmm1 - movapd [esp + nb430_iz], xmm2 - - mov [esp + nb430_ii3], ebx - - ;# clear vctot and i forces - xorpd xmm4, xmm4 - movapd [esp + nb430_vctot], xmm4 - movapd [esp + nb430_Vvdwtot], xmm4 - movapd [esp + nb430_dvdasum], xmm4 - movapd [esp + nb430_fix], xmm4 - movapd [esp + nb430_fiy], xmm4 - movapd [esp + nb430_fiz], xmm4 - - mov eax, [ebp + nb430_jindex] - mov ecx, [eax + esi*4] ;# jindex[n] - mov edx, [eax + esi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov esi, [ebp + nb430_pos] - mov edi, [ebp + nb430_faction] - mov eax, [ebp + nb430_jjnr] - shl ecx, 2 - add eax, ecx - mov [esp + nb430_innerjjnr], eax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 2 - add ecx, [esp + nb430_ninner] - mov [esp + nb430_ninner], ecx - add edx, 0 - mov [esp + nb430_innerk], edx ;# number of innerloop atoms - jge .nb430_unroll_loop - jmp .nb430_checksingle -.nb430_unroll_loop: - ;# twice unrolled innerloop here - mov edx, [esp + nb430_innerjjnr] ;# pointer to jjnr[k] - mov eax, [edx] - mov ebx, [edx + 4] - add dword ptr [esp + nb430_innerjjnr], 8 ;# advance pointer (unrolled 2) - - ;# load isaj - mov esi, [ebp + nb430_invsqrta] - movlpd xmm2, [esi + eax*8] - movhpd xmm2, [esi + ebx*8] - mulpd xmm2, [esp + nb430_isai] - movapd [esp + nb430_isaprod], xmm2 - movapd xmm1, xmm2 - mulpd xmm1, [esp + nb430_gbtsc] - movapd [esp + nb430_gbscale], xmm1 - - mov esi, [ebp + nb430_charge] ;# base of charge[] - movlpd xmm3, [esi + eax*8] - movhpd xmm3, [esi + ebx*8] - - mulpd xmm2, [esp + nb430_iq] - mulpd xmm3, xmm2 - movapd [esp + nb430_qq], xmm3 - - mov esi, [ebp + nb430_type] - mov ecx, [esi + eax*4] - mov edx, [esi + ebx*4] - mov esi, [ebp + nb430_vdwparam] - shl ecx, 1 - shl edx, 1 - mov edi, [esp + nb430_ntia] - add ecx, edi - add edx, edi - - movlpd xmm6, [esi + ecx*8] ;# c6a - movlpd xmm7, [esi + edx*8] ;# c6b - movhpd xmm6, [esi + ecx*8 + 8] ;# c6a c12a - movhpd xmm7, [esi + edx*8 + 8] ;# c6b c12b - - movapd xmm4, xmm6 - unpcklpd xmm4, xmm7 - unpckhpd xmm6, xmm7 - - movapd [esp + nb430_c6], xmm4 - movapd [esp + nb430_c12], xmm6 - - mov esi, [ebp + nb430_pos] ;# base of pos[] - - movd mm2, eax - movd mm3, ebx - lea eax, [eax + eax*2] ;# replace jnr with j3 - lea ebx, [ebx + ebx*2] - - ;# move two coordinates to xmm0-xmm2 - movlpd xmm0, [esi + eax*8] - movlpd xmm1, [esi + eax*8 + 8] - movlpd xmm2, [esi + eax*8 + 16] - movhpd xmm0, [esi + ebx*8] - movhpd xmm1, [esi + ebx*8 + 8] - movhpd xmm2, [esi + ebx*8 + 16] - - mov edi, [ebp + nb430_faction] - - ;# move nb430_ix-iz to xmm4-xmm6 - movapd xmm4, [esp + nb430_ix] - movapd xmm5, [esp + nb430_iy] - movapd xmm6, [esp + nb430_iz] - - ;# calc dr - subpd xmm4, xmm0 - subpd xmm5, xmm1 - subpd xmm6, xmm2 - - ;# store dr - movapd [esp + nb430_dx], xmm4 - movapd [esp + nb430_dy], xmm5 - movapd [esp + nb430_dz], xmm6 - ;# square it - mulpd xmm4,xmm4 - mulpd xmm5,xmm5 - mulpd xmm6,xmm6 - addpd xmm4, xmm5 - addpd xmm4, xmm6 - ;# rsq in xmm4 - - cvtpd2ps xmm5, xmm4 - rsqrtps xmm5, xmm5 - cvtps2pd xmm2, xmm5 ;# lu in low xmm2 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulpd xmm2, xmm2 ;# lu*lu - movapd xmm1, [esp + nb430_three] - mulpd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb430_half] - subpd xmm1, xmm2 ;# 30-rsq*lu*lu - mulpd xmm1, xmm5 - mulpd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - movapd xmm5, xmm1 ;# copy of lu - mulpd xmm1, xmm1 ;# lu*lu - movapd xmm2, [esp + nb430_three] - mulpd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb430_half] - subpd xmm2, xmm1 ;# 30-rsq*lu*lu - mulpd xmm2, xmm5 - mulpd xmm0, xmm2 ;# xmm0=iter2 of rinv - mulpd xmm4, xmm0 ;# xmm4=r - movapd [esp + nb430_r], xmm4 - mulpd xmm4, [esp + nb430_gbscale] - - cvttpd2pi mm6, xmm4 ;# mm6 = lu idx - cvtpi2pd xmm5, mm6 - subpd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - movapd xmm2, xmm1 - mulpd xmm2, xmm2 ;# xmm2=eps2 - - pslld mm6, 2 ;# idx *= 4 - - mov esi, [ebp + nb430_GBtab] - movd ecx, mm6 - psrlq mm6, 32 - movd edx, mm6 ;# indices in eax/ebx - - ;# Coulomb - movapd xmm4, [esi + ecx*8] ;# Y1 F1 - movapd xmm3, [esi + edx*8] ;# Y2 F2 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 Y2 - unpckhpd xmm5, xmm3 ;# F1 F2 - - movapd xmm6, [esi + ecx*8 + 16] ;# G1 H1 - movapd xmm3, [esi + edx*8 + 16] ;# G2 H2 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 G2 - unpckhpd xmm7, xmm3 ;# H1 H2 - ;# coulomb table ready, in xmm4-xmm7 - mulpd xmm6, xmm1 ;# xmm6=Geps - mulpd xmm7, xmm2 ;# xmm7=Heps2 - addpd xmm5, xmm6 - addpd xmm5, xmm7 ;# xmm5=Fp - mulpd xmm7, [esp + nb430_two] ;# two*Heps2 - movapd xmm3, [esp + nb430_qq] - addpd xmm7, xmm6 - addpd xmm7, xmm5 ;# xmm7=FF - mulpd xmm5, xmm1 ;# xmm5=eps*Fp - addpd xmm5, xmm4 ;# xmm5=VV - mulpd xmm5, xmm3 ;# vcoul=qq*VV - mulpd xmm3, xmm7 ;# fijC=FF*qq - ;# get jnr from regs - movd ecx, mm2 - movd edx, mm3 - mov esi, [ebp + nb430_dvda] - - ;# Calculate dVda - xorpd xmm7, xmm7 - mulpd xmm3, [esp + nb430_gbscale] - movapd xmm6, xmm3 - mulpd xmm6, [esp + nb430_r] - addpd xmm6, xmm5 - addpd xmm5, [esp + nb430_vctot] - movapd [esp + nb430_vctot], xmm5 - - ;# xmm6=(vcoul+fijC*r) - subpd xmm7, xmm6 - movapd xmm6, xmm7 - - ;# update dvdasum - addpd xmm7, [esp + nb430_dvdasum] - movapd [esp + nb430_dvdasum], xmm7 - - ;# update j atoms dvdaj - movhlps xmm7, xmm6 - addsd xmm6, [esi + ecx*8] - addsd xmm7, [esi + edx*8] - movsd [esi + ecx*8], xmm6 - movsd [esi + edx*8], xmm7 - - ;# put scalar force on stack temporarily - movapd [esp + nb430_fscal], xmm3 - - movapd xmm4, [esp + nb430_r] - mulpd xmm4, [esp + nb430_tsc] - cvttpd2pi mm6, xmm4 ;# mm6 = lu idx - cvtpi2pd xmm5, mm6 - subpd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - movapd xmm2, xmm1 - mulpd xmm2, xmm2 ;# xmm2=eps2 - - pslld mm6, 3 ;# idx *= 8 - - mov esi, [ebp + nb430_VFtab] - - movd ecx, mm6 - psrlq mm6, 32 - movd edx, mm6 ;# indices in eax/ebx - - ;# Dispersion - movapd xmm4, [esi + ecx*8] ;# Y1 F1 - movapd xmm3, [esi + edx*8] ;# Y2 F2 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 Y2 - unpckhpd xmm5, xmm3 ;# F1 F2 - - movapd xmm6, [esi + ecx*8 + 16] ;# G1 H1 - movapd xmm3, [esi + edx*8 + 16] ;# G2 H2 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 G2 - unpckhpd xmm7, xmm3 ;# H1 H2 - ;# Dispersion table ready, in xmm4-xmm7 - mulpd xmm6, xmm1 ;# xmm6=Geps - mulpd xmm7, xmm2 ;# xmm7=Heps2 - addpd xmm5, xmm6 - addpd xmm5, xmm7 ;# xmm5=Fp - mulpd xmm7, [esp + nb430_two] ;# two*Heps2 - addpd xmm7, xmm6 - addpd xmm7, xmm5 ;# xmm7=FF - mulpd xmm5, xmm1 ;# xmm5=eps*Fp - addpd xmm5, xmm4 ;# xmm5=VV - - movapd xmm4, [esp + nb430_c6] - mulpd xmm7, xmm4 ;# fijD - mulpd xmm5, xmm4 ;# Vvdw6 - mulpd xmm7, [esp + nb430_tsc] - addpd xmm7, [esp + nb430_fscal] ;# add to fscal - - ;# put scalar force back on stack Update Vvdwtot directly - addpd xmm5, [esp + nb430_Vvdwtot] - movapd [esp + nb430_fscal], xmm7 - movapd [esp + nb430_Vvdwtot], xmm5 - - ;# Repulsion - movapd xmm4, [esi + ecx*8 + 32] ;# Y1 F1 - movapd xmm3, [esi + edx*8 + 32] ;# Y2 F2 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 Y2 - unpckhpd xmm5, xmm3 ;# F1 F2 - - movapd xmm6, [esi + ecx*8 + 48] ;# G1 H1 - movapd xmm3, [esi + edx*8 + 48] ;# G2 H2 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 G2 - unpckhpd xmm7, xmm3 ;# H1 H2 - ;# Dispersion table ready, in xmm4-xmm7 - mulpd xmm6, xmm1 ;# xmm6=Geps - mulpd xmm7, xmm2 ;# xmm7=Heps2 - addpd xmm5, xmm6 - addpd xmm5, xmm7 ;# xmm5=Fp - mulpd xmm7, [esp + nb430_two] ;# two*Heps2 - addpd xmm7, xmm6 - addpd xmm7, xmm5 ;# xmm7=FF - mulpd xmm5, xmm1 ;# xmm5=eps*Fp - addpd xmm5, xmm4 ;# xmm5=VV - - movapd xmm4, [esp + nb430_c12] - mulpd xmm7, xmm4 ;# fijR - mulpd xmm5, xmm4 ;# Vvdw12 - mulpd xmm7, [esp + nb430_tsc] - addpd xmm7, [esp + nb430_fscal] - - addpd xmm5, [esp + nb430_Vvdwtot] - movapd [esp + nb430_Vvdwtot], xmm5 - xorpd xmm4, xmm4 - - mulpd xmm7, xmm0 - subpd xmm4, xmm7 - - movapd xmm0, [esp + nb430_dx] - movapd xmm1, [esp + nb430_dy] - movapd xmm2, [esp + nb430_dz] - - mov edi, [ebp + nb430_faction] - mulpd xmm0, xmm4 - mulpd xmm1, xmm4 - mulpd xmm2, xmm4 - ;# xmm0-xmm2 contains tx-tz (partial force) - ;# now update f_i - movapd xmm3, [esp + nb430_fix] - movapd xmm4, [esp + nb430_fiy] - movapd xmm5, [esp + nb430_fiz] - addpd xmm3, xmm0 - addpd xmm4, xmm1 - addpd xmm5, xmm2 - movapd [esp + nb430_fix], xmm3 - movapd [esp + nb430_fiy], xmm4 - movapd [esp + nb430_fiz], xmm5 - ;# the fj's - start by accumulating forces from memory - movlpd xmm3, [edi + eax*8] - movlpd xmm4, [edi + eax*8 + 8] - movlpd xmm5, [edi + eax*8 + 16] - movhpd xmm3, [edi + ebx*8] - movhpd xmm4, [edi + ebx*8 + 8] - movhpd xmm5, [edi + ebx*8 + 16] - subpd xmm3, xmm0 - subpd xmm4, xmm1 - subpd xmm5, xmm2 - movlpd [edi + eax*8], xmm3 - movlpd [edi + eax*8 + 8], xmm4 - movlpd [edi + eax*8 + 16], xmm5 - movhpd [edi + ebx*8], xmm3 - movhpd [edi + ebx*8 + 8], xmm4 - movhpd [edi + ebx*8 + 16], xmm5 - - ;# should we do one more iteration? - sub dword ptr [esp + nb430_innerk], 2 - jl .nb430_checksingle - jmp .nb430_unroll_loop -.nb430_checksingle: - mov edx, [esp + nb430_innerk] - and edx, 1 - jnz .nb430_dosingle - jmp .nb430_updateouterdata -.nb430_dosingle: - mov esi, [ebp + nb430_charge] - mov edx, [ebp + nb430_invsqrta] - mov edi, [ebp + nb430_pos] - mov ecx, [esp + nb430_innerjjnr] - mov eax, [ecx] - - xorpd xmm6, xmm6 - movapd xmm7, xmm6 - movsd xmm7, [edx + eax*8] - movlpd xmm6, [esi + eax*8] ;# xmm6(0) has the charge - mulsd xmm7, [esp + nb430_isai] - movapd [esp + nb430_isaprod], xmm7 - movapd xmm1, xmm7 - mulpd xmm1, [esp + nb430_gbtsc] - movapd [esp + nb430_gbscale], xmm1 - - mulsd xmm7, [esp + nb430_iq] - mulsd xmm6, xmm7 - movapd [esp + nb430_qq], xmm6 - - mov esi, [ebp + nb430_type] - mov edx, [esi + eax*4] - mov esi, [ebp + nb430_vdwparam] - shl edx, 1 - mov edi, [esp + nb430_ntia] - add edx, edi - - movlpd xmm6, [esi + edx*8] ;# c6a - movhpd xmm6, [esi + edx*8 + 8] ;# c6a c12a - - xorpd xmm7, xmm7 - movapd xmm4, xmm6 - unpcklpd xmm4, xmm7 - unpckhpd xmm6, xmm7 - - movapd [esp + nb430_c6], xmm4 - movapd [esp + nb430_c12], xmm6 - - mov esi, [ebp + nb430_pos] ;# base of pos[] - - movd mm2, eax - lea eax, [eax + eax*2] ;# replace jnr with j3 - - ;# move two coordinates to xmm0-xmm2 - movlpd xmm0, [esi + eax*8] - movlpd xmm1, [esi + eax*8 + 8] - movlpd xmm2, [esi + eax*8 + 16] - - mov edi, [ebp + nb430_faction] - - ;# move nb430_ix-iz to xmm4-xmm6 - movapd xmm4, [esp + nb430_ix] - movapd xmm5, [esp + nb430_iy] - movapd xmm6, [esp + nb430_iz] - - ;# calc dr - subsd xmm4, xmm0 - subsd xmm5, xmm1 - subsd xmm6, xmm2 - - ;# store dr - movapd [esp + nb430_dx], xmm4 - movapd [esp + nb430_dy], xmm5 - movapd [esp + nb430_dz], xmm6 - ;# square it - mulsd xmm4,xmm4 - mulsd xmm5,xmm5 - mulsd xmm6,xmm6 - addsd xmm4, xmm5 - addsd xmm4, xmm6 - ;# rsq in xmm4 - - cvtsd2ss xmm5, xmm4 - rsqrtss xmm5, xmm5 - cvtss2sd xmm2, xmm5 ;# lu in low xmm2 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulsd xmm2, xmm2 ;# lu*lu - movapd xmm1, [esp + nb430_three] - mulsd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb430_half] - subsd xmm1, xmm2 ;# 30-rsq*lu*lu - mulsd xmm1, xmm5 - mulsd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - movapd xmm5, xmm1 ;# copy of lu - mulsd xmm1, xmm1 ;# lu*lu - movapd xmm2, [esp + nb430_three] - mulsd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb430_half] - subsd xmm2, xmm1 ;# 30-rsq*lu*lu - mulsd xmm2, xmm5 - mulsd xmm0, xmm2 ;# xmm0=iter2 of rinv (new lu) - mulsd xmm4, xmm0 ;# xmm4=r - movsd [esp + nb430_r], xmm4 - mulsd xmm4, [esp + nb430_gbscale] - - cvttsd2si edx, xmm4 ;# mm6 = lu idx - cvtsi2sd xmm5, edx - subsd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - movapd xmm2, xmm1 - mulsd xmm2, xmm2 ;# xmm2=eps2 - - shl edx, 2 ;# idx *= 4 - mov esi, [ebp + nb430_GBtab] - - ;# Coulomb - movapd xmm4, [esi + edx*8] ;# Y1 F1 - xorpd xmm3, xmm3 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 - unpckhpd xmm5, xmm3 ;# F1 - - movapd xmm6, [esi + edx*8 + 16] ;# G1 H1 - xorpd xmm3, xmm3 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 - unpckhpd xmm7, xmm3 ;# H1 - ;# coulomb table ready, in xmm4-xmm7 - mulsd xmm6, xmm1 ;# xmm6=Geps - mulsd xmm7, xmm2 ;# xmm7=Heps2 - addsd xmm5, xmm6 - addsd xmm5, xmm7 ;# xmm5=Fp - mulsd xmm7, [esp + nb430_two] ;# two*Heps2 - movapd xmm3, [esp + nb430_qq] - addsd xmm7, xmm6 - addsd xmm7, xmm5 ;# xmm7=FF - mulsd xmm5, xmm1 ;# xmm5=eps*Fp - addsd xmm5, xmm4 ;# xmm5=VV - mulsd xmm5, xmm3 ;# vcoul=qq*VV - mulsd xmm3, xmm7 ;# fijC=FF*qq - ;# get jnr from regs - movd ebx, mm2 - mov esi, [ebp + nb430_dvda] - - ;# Calculate dVda - xorpd xmm7, xmm7 - mulsd xmm3, [esp + nb430_gbscale] - movsd xmm6, xmm3 - mulsd xmm6, [esp + nb430_r] - addsd xmm6, xmm5 - addsd xmm5, [esp + nb430_vctot] - movsd [esp + nb430_vctot], xmm5 - - ;# xmm6=(vcoul+fijC*r) - subpd xmm7, xmm6 - movsd xmm6, xmm7 - - ;# update dvdasum - addsd xmm7, [esp + nb430_dvdasum] - movsd [esp + nb430_dvdasum], xmm7 - - ;# update j atoms dvdaj - addsd xmm6, [esi + ebx*8] - movsd [esi + ebx*8], xmm6 - - ;# put scalar force on stack temporarily - movsd [esp + nb430_fscal], xmm3 - - movsd xmm4, [esp + nb430_r] - mulsd xmm4, [esp + nb430_tsc] - cvttsd2si edx, xmm4 ;# mm6 = lu idx - cvtsi2sd xmm5, edx - subsd xmm4, xmm5 - movsd xmm1, xmm4 ;# xmm1=eps - movsd xmm2, xmm1 - mulsd xmm2, xmm2 ;# xmm2=eps2 - - shl edx, 3 - - mov esi, [ebp + nb430_VFtab] - - ;# Dispersion - movapd xmm4, [esi + edx*8] ;# Y1 F1 - xorpd xmm3, xmm3 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 - unpckhpd xmm5, xmm3 ;# F1 - - movapd xmm6, [esi + edx*8 + 16] ;# G1 H1 - xorpd xmm3, xmm3 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 - unpckhpd xmm7, xmm3 ;# H1 - ;# Dispersion table ready, in xmm4-xmm7 - mulsd xmm6, xmm1 ;# xmm6=Geps - mulsd xmm7, xmm2 ;# xmm7=Heps2 - addsd xmm5, xmm6 - addsd xmm5, xmm7 ;# xmm5=Fp - mulsd xmm7, [esp + nb430_two] ;# two*Heps2 - movapd xmm3, [esp + nb430_qq] - addsd xmm7, xmm6 - addsd xmm7, xmm5 ;# xmm7=FF - mulsd xmm5, xmm1 ;# xmm5=eps*Fp - addsd xmm5, xmm4 ;# xmm5=VV - - movapd xmm4, [esp + nb430_c6] - mulsd xmm7, xmm4 ;# fijD - mulsd xmm5, xmm4 ;# Vvdw6 - mulpd xmm7, [esp + nb430_tsc] - addsd xmm7, [esp + nb430_fscal] ;# add to fscal - - ;# put scalar force back on stack Update Vvdwtot directly - addsd xmm5, [esp + nb430_Vvdwtot] - movlpd [esp + nb430_fscal], xmm7 - movlpd [esp + nb430_Vvdwtot], xmm5 - - ;# Repulsion - movapd xmm4, [esi + edx*8 + 32] ;# Y1 F1 - xorpd xmm3, xmm3 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 - unpckhpd xmm5, xmm3 ;# F1 - - movapd xmm6, [esi + edx*8 + 48] ;# G1 H1 - xorpd xmm3, xmm3 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 - unpckhpd xmm7, xmm3 ;# H1 - ;# Dispersion table ready, in xmm4-xmm7 - mulsd xmm6, xmm1 ;# xmm6=Geps - mulsd xmm7, xmm2 ;# xmm7=Heps2 - addsd xmm5, xmm6 - addsd xmm5, xmm7 ;# xmm5=Fp - mulsd xmm7, [esp + nb430_two] ;# two*Heps2 - movapd xmm3, [esp + nb430_qq] - addsd xmm7, xmm6 - addsd xmm7, xmm5 ;# xmm7=FF - mulsd xmm5, xmm1 ;# xmm5=eps*Fp - addsd xmm5, xmm4 ;# xmm5=VV - - movapd xmm4, [esp + nb430_c12] - mulsd xmm7, xmm4 ;# fijR - mulsd xmm5, xmm4 ;# Vvdw12 - mulpd xmm7, [esp + nb430_tsc] - addsd xmm7, [esp + nb430_fscal] - - addsd xmm5, [esp + nb430_Vvdwtot] - movlpd [esp + nb430_Vvdwtot], xmm5 - xorpd xmm4, xmm4 - - mulsd xmm7, xmm0 - subsd xmm4, xmm7 - - movapd xmm0, [esp + nb430_dx] - movapd xmm1, [esp + nb430_dy] - movapd xmm2, [esp + nb430_dz] - - mov edi, [ebp + nb430_faction] - mulsd xmm0, xmm4 - mulsd xmm1, xmm4 - mulsd xmm2, xmm4 - ;# xmm0-xmm2 contains tx-tz (partial force) - ;# now update f_i - movapd xmm3, [esp + nb430_fix] - movapd xmm4, [esp + nb430_fiy] - movapd xmm5, [esp + nb430_fiz] - addsd xmm3, xmm0 - addsd xmm4, xmm1 - addsd xmm5, xmm2 - movlpd [esp + nb430_fix], xmm3 - movlpd [esp + nb430_fiy], xmm4 - movlpd [esp + nb430_fiz], xmm5 - ;# the fj's - start by accumulating forces from memory - movlpd xmm3, [edi + eax*8] - movlpd xmm4, [edi + eax*8 + 8] - movlpd xmm5, [edi + eax*8 + 16] - subsd xmm3, xmm0 - subsd xmm4, xmm1 - subsd xmm5, xmm2 - movlpd [edi + eax*8], xmm3 - movlpd [edi + eax*8 + 8], xmm4 - movlpd [edi + eax*8 + 16], xmm5 -.nb430_updateouterdata: - mov ecx, [esp + nb430_ii3] - mov edi, [ebp + nb430_faction] - mov esi, [ebp + nb430_fshift] - mov edx, [esp + nb430_is3] - - ;# accumulate i forces in xmm0, xmm1, xmm2 - movapd xmm0, [esp + nb430_fix] - movapd xmm1, [esp + nb430_fiy] - movapd xmm2, [esp + nb430_fiz] - - movhlps xmm3, xmm0 - movhlps xmm4, xmm1 - movhlps xmm5, xmm2 - addsd xmm0, xmm3 - addsd xmm1, xmm4 - addsd xmm2, xmm5 ;# sum is in low xmm0-xmm2 - - ;# increment i force - movsd xmm3, [edi + ecx*8] - movsd xmm4, [edi + ecx*8 + 8] - movsd xmm5, [edi + ecx*8 + 16] - addsd xmm3, xmm0 - addsd xmm4, xmm1 - addsd xmm5, xmm2 - movsd [edi + ecx*8], xmm3 - movsd [edi + ecx*8 + 8], xmm4 - movsd [edi + ecx*8 + 16], xmm5 - - ;# increment fshift force - movsd xmm3, [esi + edx*8] - movsd xmm4, [esi + edx*8 + 8] - movsd xmm5, [esi + edx*8 + 16] - addsd xmm3, xmm0 - addsd xmm4, xmm1 - addsd xmm5, xmm2 - movsd [esi + edx*8], xmm3 - movsd [esi + edx*8 + 8], xmm4 - movsd [esi + edx*8 + 16], xmm5 - - ;# get n from stack - mov esi, [esp + nb430_n] - ;# get group index for i particle - mov edx, [ebp + nb430_gid] ;# base of gid[] - mov edx, [edx + esi*4] ;# ggid=gid[n] - - ;# accumulate total potential energy and update it - movapd xmm7, [esp + nb430_vctot] - ;# accumulate - movhlps xmm6, xmm7 - addsd xmm7, xmm6 ;# low xmm7 has the sum now - - ;# add earlier value from mem - mov eax, [ebp + nb430_Vc] - addsd xmm7, [eax + edx*8] - ;# move back to mem - movsd [eax + edx*8], xmm7 - - ;# accumulate total lj energy and update it - movapd xmm7, [esp + nb430_Vvdwtot] - ;# accumulate - movhlps xmm6, xmm7 - addsd xmm7, xmm6 ;# low xmm7 has the sum now - - ;# add earlier value from mem - mov eax, [ebp + nb430_Vvdw] - addsd xmm7, [eax + edx*8] - ;# move back to mem - movsd [eax + edx*8], xmm7 - - ;# accumulate dVda and update it - movapd xmm7, [esp + nb430_dvdasum] - ;# accumulate - movhlps xmm6, xmm7 - addsd xmm7, xmm6 ;# low xmm7 has the sum now - - mov edx, [esp + nb430_ii] - mov eax, [ebp + nb430_dvda] - addsd xmm7, [eax + edx*8] - movsd [eax + edx*8], xmm7 - - ;# finish if last - mov ecx, [esp + nb430_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb430_outerend - - ;# not last, iterate outer loop once more! - mov [esp + nb430_n], esi - jmp .nb430_outer -.nb430_outerend: - ;# check if more outer neighborlists remain - mov ecx, [esp + nb430_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb430_end - ;# non-zero, do one more workunit - jmp .nb430_threadloop -.nb430_end: - emms - - mov eax, [esp + nb430_nouter] - mov ebx, [esp + nb430_ninner] - mov ecx, [ebp + nb430_outeriter] - mov edx, [ebp + nb430_inneriter] - mov [ecx], eax - mov [edx], ebx - - mov eax, [esp + nb430_salign] - add esp, eax - add esp, 484 - pop edi - pop esi - pop edx - pop ecx - pop ebx - pop eax - leave - ret - - - - - -.globl nb_kernel430nf_ia32_sse2 -.globl _nb_kernel430nf_ia32_sse2 -nb_kernel430nf_ia32_sse2: -_nb_kernel430nf_ia32_sse2: -.equiv nb430nf_p_nri, 8 -.equiv nb430nf_iinr, 12 -.equiv nb430nf_jindex, 16 -.equiv nb430nf_jjnr, 20 -.equiv nb430nf_shift, 24 -.equiv nb430nf_shiftvec, 28 -.equiv nb430nf_fshift, 32 -.equiv nb430nf_gid, 36 -.equiv nb430nf_pos, 40 -.equiv nb430nf_faction, 44 -.equiv nb430nf_charge, 48 -.equiv nb430nf_p_facel, 52 -.equiv nb430nf_argkrf, 56 -.equiv nb430nf_argcrf, 60 -.equiv nb430nf_Vc, 64 -.equiv nb430nf_type, 68 -.equiv nb430nf_p_ntype, 72 -.equiv nb430nf_vdwparam, 76 -.equiv nb430nf_Vvdw, 80 -.equiv nb430nf_p_tabscale, 84 -.equiv nb430nf_VFtab, 88 -.equiv nb430nf_invsqrta, 92 -.equiv nb430nf_dvda, 96 -.equiv nb430nf_p_gbtabscale, 100 -.equiv nb430nf_GBtab, 104 -.equiv nb430nf_p_nthreads, 108 -.equiv nb430nf_count, 112 -.equiv nb430nf_mtx, 116 -.equiv nb430nf_outeriter, 120 -.equiv nb430nf_inneriter, 124 -.equiv nb430nf_work, 128 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse2 use -.equiv nb430nf_ix, 0 -.equiv nb430nf_iy, 16 -.equiv nb430nf_iz, 32 -.equiv nb430nf_iq, 48 -.equiv nb430nf_gbtsc, 64 -.equiv nb430nf_tsc, 80 -.equiv nb430nf_qq, 96 -.equiv nb430nf_c6, 112 -.equiv nb430nf_c12, 128 -.equiv nb430nf_vctot, 144 -.equiv nb430nf_Vvdwtot, 160 -.equiv nb430nf_half, 176 -.equiv nb430nf_three, 192 -.equiv nb430nf_r, 208 -.equiv nb430nf_isai, 224 -.equiv nb430nf_isaprod, 240 -.equiv nb430nf_gbscale, 256 -.equiv nb430nf_is3, 272 -.equiv nb430nf_ii3, 276 -.equiv nb430nf_ntia, 280 -.equiv nb430nf_innerjjnr, 284 -.equiv nb430nf_innerk, 288 -.equiv nb430nf_n, 292 -.equiv nb430nf_nn1, 296 -.equiv nb430nf_nri, 300 -.equiv nb430nf_facel, 304 ;# uses 8 bytes -.equiv nb430nf_ntype, 312 -.equiv nb430nf_nouter, 316 -.equiv nb430nf_ninner, 320 -.equiv nb430nf_salign, 324 - push ebp - mov ebp,esp - push eax - push ebx - push ecx - push edx - push esi - push edi - sub esp, 328 ;# local stack space - mov eax, esp - and eax, 0xf - sub esp, eax - mov [esp + nb430nf_salign], eax - - emms - - ;# Move args passed by reference to stack - mov ecx, [ebp + nb430nf_p_nri] - mov esi, [ebp + nb430nf_p_facel] - mov edi, [ebp + nb430nf_p_ntype] - mov ecx, [ecx] - movsd xmm7, [esi] - mov edi, [edi] - mov [esp + nb430nf_nri], ecx - movsd [esp + nb430nf_facel], xmm7 - mov [esp + nb430nf_ntype], edi - - ;# zero iteration counters - mov eax, 0 - mov [esp + nb430nf_nouter], eax - mov [esp + nb430nf_ninner], eax - - - ;# create constant floating-point factors on stack - mov eax, 0x00000000 ;# lower half of double 0.5 IEEE (hex) - mov ebx, 0x3fe00000 - mov [esp + nb430nf_half], eax - mov [esp + nb430nf_half+4], ebx - movsd xmm1, [esp + nb430nf_half] - shufpd xmm1, xmm1, 0 ;# splat to all elements - movapd xmm3, xmm1 - addpd xmm3, xmm3 ;# 1.0 - movapd xmm2, xmm3 - addpd xmm2, xmm2 ;# 2.0 - addpd xmm3, xmm2 ;# 3.0 - movapd [esp + nb430nf_half], xmm1 - movapd [esp + nb430nf_three], xmm3 - mov eax, [ebp + nb430nf_p_tabscale] - movsd xmm3, [eax] - mov eax, [ebp + nb430nf_p_gbtabscale] - movsd xmm4, [eax] - shufpd xmm3, xmm3, 0 - shufpd xmm4, xmm4, 0 - movapd [esp + nb430nf_tsc], xmm3 - movapd [esp + nb430nf_gbtsc], xmm4 - -.nb430nf_threadloop: - mov esi, [ebp + nb430nf_count] ;# pointer to sync counter - mov eax, [esi] -.nb430nf_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb430nf_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [esp + nb430nf_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [esp + nb430nf_n], eax - mov [esp + nb430nf_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb430nf_outerstart - jmp .nb430nf_end - -.nb430nf_outerstart: - ;# ebx contains number of outer iterations - add ebx, [esp + nb430nf_nouter] - mov [esp + nb430nf_nouter], ebx - -.nb430nf_outer: - mov eax, [ebp + nb430nf_shift] ;# eax = pointer into shift[] - mov ebx, [eax+esi*4] ;# ebx=shift[n] - - lea ebx, [ebx + ebx*2] ;# ebx=3*is - mov [esp + nb430nf_is3],ebx ;# store is3 - - mov eax, [ebp + nb430nf_shiftvec] ;# eax = base of shiftvec[] - - movsd xmm0, [eax + ebx*8] - movsd xmm1, [eax + ebx*8 + 8] - movsd xmm2, [eax + ebx*8 + 16] - - mov ecx, [ebp + nb430nf_iinr] ;# ecx = pointer into iinr[] - mov ebx, [ecx+esi*4] ;# ebx =ii - - mov edx, [ebp + nb430nf_charge] - movsd xmm3, [edx + ebx*8] - mulsd xmm3, [esp + nb430nf_facel] - shufpd xmm3, xmm3, 0 - - mov edx, [ebp + nb430nf_invsqrta] ;# load invsqrta[ii] - movsd xmm4, [edx + ebx*8] - shufpd xmm4, xmm4, 0 - - mov edx, [ebp + nb430nf_type] - mov edx, [edx + ebx*4] - imul edx, [esp + nb430nf_ntype] - shl edx, 1 - mov [esp + nb430nf_ntia], edx - - lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3 - mov eax, [ebp + nb430nf_pos] ;# eax = base of pos[] - - addsd xmm0, [eax + ebx*8] - addsd xmm1, [eax + ebx*8 + 8] - addsd xmm2, [eax + ebx*8 + 16] - - movapd [esp + nb430nf_iq], xmm3 - movapd [esp + nb430nf_isai], xmm4 - - shufpd xmm0, xmm0, 0 - shufpd xmm1, xmm1, 0 - shufpd xmm2, xmm2, 0 - - movapd [esp + nb430nf_ix], xmm0 - movapd [esp + nb430nf_iy], xmm1 - movapd [esp + nb430nf_iz], xmm2 - - mov [esp + nb430nf_ii3], ebx - - ;# clear vctot - xorpd xmm4, xmm4 - movapd [esp + nb430nf_vctot], xmm4 - movapd [esp + nb430nf_Vvdwtot], xmm4 - - mov eax, [ebp + nb430nf_jindex] - mov ecx, [eax + esi*4] ;# jindex[n] - mov edx, [eax + esi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov esi, [ebp + nb430nf_pos] - mov edi, [ebp + nb430nf_faction] - mov eax, [ebp + nb430nf_jjnr] - shl ecx, 2 - add eax, ecx - mov [esp + nb430nf_innerjjnr], eax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 2 - add ecx, [esp + nb430nf_ninner] - mov [esp + nb430nf_ninner], ecx - add edx, 0 - mov [esp + nb430nf_innerk], edx ;# number of innerloop atoms - jge .nb430nf_unroll_loop - jmp .nb430nf_checksingle -.nb430nf_unroll_loop: - ;# twice unrolled innerloop here - mov edx, [esp + nb430nf_innerjjnr] ;# pointer to jjnr[k] - mov eax, [edx] - mov ebx, [edx + 4] - add dword ptr [esp + nb430nf_innerjjnr], 8 ;# advance pointer (unrolled 2) - - ;# load isaj - mov esi, [ebp + nb430nf_invsqrta] - movlpd xmm2, [esi + eax*8] - movhpd xmm2, [esi + ebx*8] - mulpd xmm2, [esp + nb430nf_isai] - movapd [esp + nb430nf_isaprod], xmm2 - movapd xmm1, xmm2 - mulpd xmm1, [esp + nb430nf_gbtsc] - movapd [esp + nb430nf_gbscale], xmm1 - - mov esi, [ebp + nb430nf_charge] ;# base of charge[] - movlpd xmm3, [esi + eax*8] - movhpd xmm3, [esi + ebx*8] - - mulpd xmm2, [esp + nb430nf_iq] - mulpd xmm3, xmm2 - movapd [esp + nb430nf_qq], xmm3 - - mov esi, [ebp + nb430nf_type] - mov ecx, [esi + eax*4] - mov edx, [esi + ebx*4] - mov esi, [ebp + nb430nf_vdwparam] - shl ecx, 1 - shl edx, 1 - mov edi, [esp + nb430nf_ntia] - add ecx, edi - add edx, edi - - movlpd xmm6, [esi + ecx*8] ;# c6a - movlpd xmm7, [esi + edx*8] ;# c6b - movhpd xmm6, [esi + ecx*8 + 8] ;# c6a c12a - movhpd xmm7, [esi + edx*8 + 8] ;# c6b c12b - - movapd xmm4, xmm6 - unpcklpd xmm4, xmm7 - unpckhpd xmm6, xmm7 - - movapd [esp + nb430nf_c6], xmm4 - movapd [esp + nb430nf_c12], xmm6 - - mov esi, [ebp + nb430nf_pos] ;# base of pos[] - - lea eax, [eax + eax*2] ;# replace jnr with j3 - lea ebx, [ebx + ebx*2] - - ;# move two coordinates to xmm0-xmm2 - movlpd xmm0, [esi + eax*8] - movlpd xmm1, [esi + eax*8 + 8] - movlpd xmm2, [esi + eax*8 + 16] - movhpd xmm0, [esi + ebx*8] - movhpd xmm1, [esi + ebx*8 + 8] - movhpd xmm2, [esi + ebx*8 + 16] - - mov edi, [ebp + nb430nf_faction] - - ;# move nb430nf_ix-iz to xmm4-xmm6 - movapd xmm4, [esp + nb430nf_ix] - movapd xmm5, [esp + nb430nf_iy] - movapd xmm6, [esp + nb430nf_iz] - - ;# calc dr - subpd xmm4, xmm0 - subpd xmm5, xmm1 - subpd xmm6, xmm2 - - ;# square it - mulpd xmm4,xmm4 - mulpd xmm5,xmm5 - mulpd xmm6,xmm6 - addpd xmm4, xmm5 - addpd xmm4, xmm6 - ;# rsq in xmm4 - - cvtpd2ps xmm5, xmm4 - rsqrtps xmm5, xmm5 - cvtps2pd xmm2, xmm5 ;# lu in low xmm2 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulpd xmm2, xmm2 ;# lu*lu - movapd xmm1, [esp + nb430nf_three] - mulpd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb430nf_half] - subpd xmm1, xmm2 ;# 30-rsq*lu*lu - mulpd xmm1, xmm5 - mulpd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - movapd xmm5, xmm1 ;# copy of lu - mulpd xmm1, xmm1 ;# lu*lu - movapd xmm2, [esp + nb430nf_three] - mulpd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb430nf_half] - subpd xmm2, xmm1 ;# 30-rsq*lu*lu - mulpd xmm2, xmm5 - mulpd xmm0, xmm2 ;# xmm0=iter2 of rinv - mulpd xmm4, xmm0 ;# xmm4=r - movapd [esp + nb430nf_r], xmm4 - mulpd xmm4, [esp + nb430nf_gbscale] - - cvttpd2pi mm6, xmm4 ;# mm6 = lu idx - cvtpi2pd xmm5, mm6 - subpd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - movapd xmm2, xmm1 - mulpd xmm2, xmm2 ;# xmm2=eps2 - - pslld mm6, 2 ;# idx *= 4 - - mov esi, [ebp + nb430nf_GBtab] - movd ecx, mm6 - psrlq mm6, 32 - movd edx, mm6 ;# indices in eax/ebx - - ;# Coulomb - movapd xmm4, [esi + ecx*8] ;# Y1 F1 - movapd xmm3, [esi + edx*8] ;# Y2 F2 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 Y2 - unpckhpd xmm5, xmm3 ;# F1 F2 - - movapd xmm6, [esi + ecx*8 + 16] ;# G1 H1 - movapd xmm3, [esi + edx*8 + 16] ;# G2 H2 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 G2 - unpckhpd xmm7, xmm3 ;# H1 H2 - ;# coulomb table ready, in xmm4-xmm7 - mulpd xmm6, xmm1 ;# xmm6=Geps - mulpd xmm7, xmm2 ;# xmm7=Heps2 - addpd xmm5, xmm6 - addpd xmm5, xmm7 ;# xmm5=Fp - movapd xmm3, [esp + nb430nf_qq] - mulpd xmm5, xmm1 ;# xmm5=eps*Fp - addpd xmm5, xmm4 ;# xmm5=VV - mulpd xmm5, xmm3 ;# vcoul=qq*VV - addpd xmm5, [esp + nb430nf_vctot] - movapd [esp + nb430nf_vctot], xmm5 - - movapd xmm4, [esp + nb430nf_r] - mulpd xmm4, [esp + nb430nf_tsc] - cvttpd2pi mm6, xmm4 ;# mm6 = lu idx - cvtpi2pd xmm5, mm6 - subpd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - movapd xmm2, xmm1 - mulpd xmm2, xmm2 ;# xmm2=eps2 - - pslld mm6, 3 ;# idx *= 8 - - mov esi, [ebp + nb430nf_VFtab] - - movd ecx, mm6 - psrlq mm6, 32 - movd edx, mm6 ;# indices in eax/ebx - - ;# Dispersion - movapd xmm4, [esi + ecx*8] ;# Y1 F1 - movapd xmm3, [esi + edx*8] ;# Y2 F2 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 Y2 - unpckhpd xmm5, xmm3 ;# F1 F2 - - movapd xmm6, [esi + ecx*8 + 16] ;# G1 H1 - movapd xmm3, [esi + edx*8 + 16] ;# G2 H2 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 G2 - unpckhpd xmm7, xmm3 ;# H1 H2 - ;# Dispersion table ready, in xmm4-xmm7 - mulpd xmm6, xmm1 ;# xmm6=Geps - mulpd xmm7, xmm2 ;# xmm7=Heps2 - addpd xmm5, xmm6 - addpd xmm5, xmm7 ;# xmm5=Fp - mulpd xmm5, xmm1 ;# xmm5=eps*Fp - addpd xmm5, xmm4 ;# xmm5=VV - - mulpd xmm5, [esp + nb430nf_c6] ;# Vvdw6 - addpd xmm5, [esp + nb430nf_Vvdwtot] - movapd [esp + nb430nf_Vvdwtot], xmm5 - - ;# Repulsion - movapd xmm4, [esi + ecx*8 + 32] ;# Y1 F1 - movapd xmm3, [esi + edx*8 + 32] ;# Y2 F2 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 Y2 - unpckhpd xmm5, xmm3 ;# F1 F2 - - movapd xmm6, [esi + ecx*8 + 48] ;# G1 H1 - movapd xmm3, [esi + edx*8 + 48] ;# G2 H2 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 G2 - unpckhpd xmm7, xmm3 ;# H1 H2 - ;# Dispersion table ready, in xmm4-xmm7 - mulpd xmm6, xmm1 ;# xmm6=Geps - mulpd xmm7, xmm2 ;# xmm7=Heps2 - addpd xmm5, xmm6 - addpd xmm5, xmm7 ;# xmm5=Fp - mulpd xmm5, xmm1 ;# xmm5=eps*Fp - addpd xmm5, xmm4 ;# xmm5=VV - - mulpd xmm5, [esp + nb430nf_c12] ;# Vvdw12 - addpd xmm5, [esp + nb430nf_Vvdwtot] - movapd [esp + nb430nf_Vvdwtot], xmm5 - xorpd xmm4, xmm4 - - ;# should we do one more iteration? - sub dword ptr [esp + nb430nf_innerk], 2 - jl .nb430nf_checksingle - jmp .nb430nf_unroll_loop -.nb430nf_checksingle: - mov edx, [esp + nb430nf_innerk] - and edx, 1 - jnz .nb430nf_dosingle - jmp .nb430nf_updateouterdata -.nb430nf_dosingle: - mov esi, [ebp + nb430nf_charge] - mov edx, [ebp + nb430nf_invsqrta] - mov edi, [ebp + nb430nf_pos] - mov ecx, [esp + nb430nf_innerjjnr] - mov eax, [ecx] - - xorpd xmm6, xmm6 - movapd xmm7, xmm6 - movsd xmm7, [edx + eax*8] - movlpd xmm6, [esi + eax*8] ;# xmm6(0) has the charge - mulsd xmm7, [esp + nb430nf_isai] - movapd [esp + nb430nf_isaprod], xmm7 - movapd xmm1, xmm7 - mulpd xmm1, [esp + nb430nf_gbtsc] - movapd [esp + nb430nf_gbscale], xmm1 - - mulsd xmm7, [esp + nb430nf_iq] - mulsd xmm6, xmm7 - movapd [esp + nb430nf_qq], xmm6 - - mov esi, [ebp + nb430nf_type] - mov edx, [esi + eax*4] - mov esi, [ebp + nb430nf_vdwparam] - shl edx, 1 - mov edi, [esp + nb430nf_ntia] - add edx, edi - - movlpd xmm6, [esi + edx*8] ;# c6a - movhpd xmm6, [esi + edx*8 + 8] ;# c6a c12a - - xorpd xmm7, xmm7 - movapd xmm4, xmm6 - unpcklpd xmm4, xmm7 - unpckhpd xmm6, xmm7 - - movapd [esp + nb430nf_c6], xmm4 - movapd [esp + nb430nf_c12], xmm6 - - mov esi, [ebp + nb430nf_pos] ;# base of pos[] - - lea eax, [eax + eax*2] ;# replace jnr with j3 - - ;# move two coordinates to xmm0-xmm2 - movlpd xmm0, [esi + eax*8] - movlpd xmm1, [esi + eax*8 + 8] - movlpd xmm2, [esi + eax*8 + 16] - - mov edi, [ebp + nb430nf_faction] - - ;# move nb430nf_ix-iz to xmm4-xmm6 - movapd xmm4, [esp + nb430nf_ix] - movapd xmm5, [esp + nb430nf_iy] - movapd xmm6, [esp + nb430nf_iz] - - ;# calc dr - subsd xmm4, xmm0 - subsd xmm5, xmm1 - subsd xmm6, xmm2 - - ;# square it - mulsd xmm4,xmm4 - mulsd xmm5,xmm5 - mulsd xmm6,xmm6 - addsd xmm4, xmm5 - addsd xmm4, xmm6 - ;# rsq in xmm4 - - cvtsd2ss xmm5, xmm4 - rsqrtss xmm5, xmm5 - cvtss2sd xmm2, xmm5 ;# lu in low xmm2 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulsd xmm2, xmm2 ;# lu*lu - movapd xmm1, [esp + nb430nf_three] - mulsd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb430nf_half] - subsd xmm1, xmm2 ;# 30-rsq*lu*lu - mulsd xmm1, xmm5 - mulsd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - movapd xmm5, xmm1 ;# copy of lu - mulsd xmm1, xmm1 ;# lu*lu - movapd xmm2, [esp + nb430nf_three] - mulsd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [esp + nb430nf_half] - subsd xmm2, xmm1 ;# 30-rsq*lu*lu - mulsd xmm2, xmm5 - mulsd xmm0, xmm2 ;# xmm0=iter2 of rinv (new lu) - mulsd xmm4, xmm0 ;# xmm4=r - movsd [esp + nb430nf_r], xmm4 - mulsd xmm4, [esp + nb430nf_gbscale] - - cvttsd2si edx, xmm4 ;# mm6 = lu idx - cvtsi2sd xmm5, edx - subsd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - movapd xmm2, xmm1 - mulsd xmm2, xmm2 ;# xmm2=eps2 - - shl edx, 2 ;# idx *= 4 - mov esi, [ebp + nb430nf_GBtab] - - ;# Coulomb - movapd xmm4, [esi + edx*8] ;# Y1 F1 - xorpd xmm3, xmm3 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 - unpckhpd xmm5, xmm3 ;# F1 - - movapd xmm6, [esi + edx*8 + 16] ;# G1 H1 - xorpd xmm3, xmm3 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 - unpckhpd xmm7, xmm3 ;# H1 - ;# coulomb table ready, in xmm4-xmm7 - mulsd xmm6, xmm1 ;# xmm6=Geps - mulsd xmm7, xmm2 ;# xmm7=Heps2 - addsd xmm5, xmm6 - addsd xmm5, xmm7 ;# xmm5=Fp - movapd xmm3, [esp + nb430nf_qq] - mulsd xmm5, xmm1 ;# xmm5=eps*Fp - addsd xmm5, xmm4 ;# xmm5=VV - mulsd xmm5, xmm3 ;# vcoul=qq*VV - addsd xmm5, [esp + nb430nf_vctot] - movsd [esp + nb430nf_vctot], xmm5 - - movsd xmm4, [esp + nb430nf_r] - mulsd xmm4, [esp + nb430nf_tsc] - cvttsd2si edx, xmm4 ;# mm6 = lu idx - cvtsi2sd xmm5, edx - subsd xmm4, xmm5 - movsd xmm1, xmm4 ;# xmm1=eps - movsd xmm2, xmm1 - mulsd xmm2, xmm2 ;# xmm2=eps2 - - shl edx, 3 - - mov esi, [ebp + nb430nf_VFtab] - - ;# Dispersion - movapd xmm4, [esi + edx*8] ;# Y1 F1 - xorpd xmm3, xmm3 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 - unpckhpd xmm5, xmm3 ;# F1 - - movapd xmm6, [esi + edx*8 + 16] ;# G1 H1 - xorpd xmm3, xmm3 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 - unpckhpd xmm7, xmm3 ;# H1 - ;# Dispersion table ready, in xmm4-xmm7 - mulsd xmm6, xmm1 ;# xmm6=Geps - mulsd xmm7, xmm2 ;# xmm7=Heps2 - addsd xmm5, xmm6 - addsd xmm5, xmm7 ;# xmm5=Fp - mulsd xmm5, xmm1 ;# xmm5=eps*Fp - addsd xmm5, xmm4 ;# xmm5=VV - - mulsd xmm5, [esp + nb430nf_c6] ;# Vvdw6 - addsd xmm5, [esp + nb430nf_Vvdwtot] - movlpd [esp + nb430nf_Vvdwtot], xmm5 - - ;# Repulsion - movapd xmm4, [esi + edx*8 + 32] ;# Y1 F1 - xorpd xmm3, xmm3 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 - unpckhpd xmm5, xmm3 ;# F1 - - movapd xmm6, [esi + edx*8 + 48] ;# G1 H1 - xorpd xmm3, xmm3 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 - unpckhpd xmm7, xmm3 ;# H1 - ;# Dispersion table ready, in xmm4-xmm7 - mulsd xmm6, xmm1 ;# xmm6=Geps - mulsd xmm7, xmm2 ;# xmm7=Heps2 - addsd xmm5, xmm6 - addsd xmm5, xmm7 ;# xmm5=Fp - mulsd xmm5, xmm1 ;# xmm5=eps*Fp - addsd xmm5, xmm4 ;# xmm5=VV - mulsd xmm5, [esp + nb430nf_c12] ;# Vvdw12 - addsd xmm5, [esp + nb430nf_Vvdwtot] - movlpd [esp + nb430nf_Vvdwtot], xmm5 -.nb430nf_updateouterdata: - ;# get n from stack - mov esi, [esp + nb430nf_n] - ;# get group index for i particle - mov edx, [ebp + nb430nf_gid] ;# base of gid[] - mov edx, [edx + esi*4] ;# ggid=gid[n] - - ;# accumulate total potential energy and update it - movapd xmm7, [esp + nb430nf_vctot] - ;# accumulate - movhlps xmm6, xmm7 - addsd xmm7, xmm6 ;# low xmm7 has the sum now - - ;# add earlier value from mem - mov eax, [ebp + nb430nf_Vc] - addsd xmm7, [eax + edx*8] - ;# move back to mem - movsd [eax + edx*8], xmm7 - - ;# accumulate total lj energy and update it - movapd xmm7, [esp + nb430nf_Vvdwtot] - ;# accumulate - movhlps xmm6, xmm7 - addsd xmm7, xmm6 ;# low xmm7 has the sum now - - ;# add earlier value from mem - mov eax, [ebp + nb430nf_Vvdw] - addsd xmm7, [eax + edx*8] - ;# move back to mem - movsd [eax + edx*8], xmm7 - - ;# finish if last - mov ecx, [esp + nb430nf_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb430nf_outerend - - ;# not last, iterate outer loop once more! - mov [esp + nb430nf_n], esi - jmp .nb430nf_outer -.nb430nf_outerend: - ;# check if more outer neighborlists remain - mov ecx, [esp + nb430nf_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb430nf_end - ;# non-zero, do one more workunit - jmp .nb430nf_threadloop -.nb430nf_end: - emms - - mov eax, [esp + nb430nf_nouter] - mov ebx, [esp + nb430nf_ninner] - mov ecx, [ebp + nb430nf_outeriter] - mov edx, [ebp + nb430nf_inneriter] - mov [ecx], eax - mov [edx], ebx - - mov eax, [esp + nb430nf_salign] - add esp, eax - add esp, 328 - pop edi - pop esi - pop edx - pop ecx - pop ebx - pop eax - leave - ret - diff --git a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel430_ia32_sse2.s b/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel430_ia32_sse2.s deleted file mode 100644 index fb9c0e2bcf..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel430_ia32_sse2.s +++ /dev/null @@ -1,1688 +0,0 @@ -## -## -## Gromacs 4.0 Copyright (c) 1991-2003 -## David van der Spoel, Erik Lindahl -## -## This program is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License -## as published by the Free Software Foundation; either version 2 -## of the License, or (at your option) any later version. -## -## To help us fund GROMACS development, we humbly ask that you cite -## the research papers on the package. Check out http://www.gromacs.org -## -## And Hey: -## Gnomes, ROck Monsters And Chili Sauce -## - - -.globl nb_kernel430_ia32_sse2 -.globl _nb_kernel430_ia32_sse2 -nb_kernel430_ia32_sse2: -_nb_kernel430_ia32_sse2: -.set nb430_p_nri, 8 -.set nb430_iinr, 12 -.set nb430_jindex, 16 -.set nb430_jjnr, 20 -.set nb430_shift, 24 -.set nb430_shiftvec, 28 -.set nb430_fshift, 32 -.set nb430_gid, 36 -.set nb430_pos, 40 -.set nb430_faction, 44 -.set nb430_charge, 48 -.set nb430_p_facel, 52 -.set nb430_argkrf, 56 -.set nb430_argcrf, 60 -.set nb430_Vc, 64 -.set nb430_type, 68 -.set nb430_p_ntype, 72 -.set nb430_vdwparam, 76 -.set nb430_Vvdw, 80 -.set nb430_p_tabscale, 84 -.set nb430_VFtab, 88 -.set nb430_invsqrta, 92 -.set nb430_dvda, 96 -.set nb430_p_gbtabscale, 100 -.set nb430_GBtab, 104 -.set nb430_p_nthreads, 108 -.set nb430_count, 112 -.set nb430_mtx, 116 -.set nb430_outeriter, 120 -.set nb430_inneriter, 124 -.set nb430_work, 128 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse2 use -.set nb430_ix, 0 -.set nb430_iy, 16 -.set nb430_iz, 32 -.set nb430_iq, 48 -.set nb430_dx, 64 -.set nb430_dy, 80 -.set nb430_dz, 96 -.set nb430_two, 112 -.set nb430_gbtsc, 128 -.set nb430_tsc, 144 -.set nb430_qq, 160 -.set nb430_c6, 176 -.set nb430_c12, 192 -.set nb430_fscal, 208 -.set nb430_vctot, 224 -.set nb430_Vvdwtot, 240 -.set nb430_fix, 256 -.set nb430_fiy, 272 -.set nb430_fiz, 288 -.set nb430_half, 304 -.set nb430_three, 320 -.set nb430_r, 336 -.set nb430_isai, 352 -.set nb430_isaprod, 368 -.set nb430_dvdasum, 384 -.set nb430_gbscale, 400 -.set nb430_ii, 416 -.set nb430_is3, 420 -.set nb430_ii3, 424 -.set nb430_ntia, 428 -.set nb430_innerjjnr, 432 -.set nb430_innerk, 436 -.set nb430_n, 440 -.set nb430_nn1, 444 -.set nb430_nri, 448 -.set nb430_facel, 456 ## uses 8 bytes -.set nb430_ntype, 464 -.set nb430_nouter, 468 -.set nb430_ninner, 472 -.set nb430_salign, 476 - pushl %ebp - movl %esp,%ebp - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - pushl %esi - pushl %edi - subl $484,%esp ## local stack space - movl %esp,%eax - andl $0xf,%eax - subl %eax,%esp - movl %eax,nb430_salign(%esp) - - emms - - ## Move args passed by reference to stack - movl nb430_p_nri(%ebp),%ecx - movl nb430_p_facel(%ebp),%esi - movl nb430_p_ntype(%ebp),%edi - movl (%ecx),%ecx - movsd (%esi),%xmm7 - movl (%edi),%edi - movl %ecx,nb430_nri(%esp) - movsd %xmm7,nb430_facel(%esp) - movl %edi,nb430_ntype(%esp) - - ## zero iteration counters - movl $0,%eax - movl %eax,nb430_nouter(%esp) - movl %eax,nb430_ninner(%esp) - - - ## create constant floating-point factors on stack - movl $0x00000000,%eax ## lower half of double 0.5 IEEE (hex) - movl $0x3fe00000,%ebx - movl %eax,nb430_half(%esp) - movl %ebx,nb430_half+4(%esp) - movsd nb430_half(%esp),%xmm1 - shufpd $0,%xmm1,%xmm1 ## splat to all elements - movapd %xmm1,%xmm3 - addpd %xmm3,%xmm3 ## 1.0 - movapd %xmm3,%xmm2 - addpd %xmm2,%xmm2 ## 2.0 - addpd %xmm2,%xmm3 ## 3.0 - movapd %xmm1,nb430_half(%esp) - movapd %xmm2,nb430_two(%esp) - movapd %xmm3,nb430_three(%esp) - movl nb430_p_tabscale(%ebp),%eax - movsd (%eax),%xmm3 - movl nb430_p_gbtabscale(%ebp),%eax - movsd (%eax),%xmm4 - shufpd $0,%xmm3,%xmm3 - shufpd $0,%xmm4,%xmm4 - movapd %xmm3,nb430_tsc(%esp) - movapd %xmm4,nb430_gbtsc(%esp) - -_nb_kernel430_ia32_sse2.nb430_threadloop: - movl nb430_count(%ebp),%esi ## pointer to sync counter - movl (%esi),%eax -_nb_kernel430_ia32_sse2.nb430_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%esi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel430_ia32_sse2.nb430_spinlock - - ## if(nn1>nri) nn1=nri - movl nb430_nri(%esp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb430_n(%esp) - movl %ebx,nb430_nn1(%esp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel430_ia32_sse2.nb430_outerstart - jmp _nb_kernel430_ia32_sse2.nb430_end - -_nb_kernel430_ia32_sse2.nb430_outerstart: - ## ebx contains number of outer iterations - addl nb430_nouter(%esp),%ebx - movl %ebx,nb430_nouter(%esp) - -_nb_kernel430_ia32_sse2.nb430_outer: - movl nb430_shift(%ebp),%eax ## eax = pointer into shift[] - movl (%eax,%esi,4),%ebx ## ebx=shift[n] - - leal (%ebx,%ebx,2),%ebx ## ebx=3*is - movl %ebx,nb430_is3(%esp) ## store is3 - - movl nb430_shiftvec(%ebp),%eax ## eax = base of shiftvec[] - - movsd (%eax,%ebx,8),%xmm0 - movsd 8(%eax,%ebx,8),%xmm1 - movsd 16(%eax,%ebx,8),%xmm2 - - movl nb430_iinr(%ebp),%ecx ## ecx = pointer into iinr[] - movl (%ecx,%esi,4),%ebx ## ebx =ii - movl %ebx,nb430_ii(%esp) - - movl nb430_charge(%ebp),%edx - movsd (%edx,%ebx,8),%xmm3 - mulsd nb430_facel(%esp),%xmm3 - shufpd $0,%xmm3,%xmm3 - - movl nb430_invsqrta(%ebp),%edx ## load invsqrta[ii] - movsd (%edx,%ebx,8),%xmm4 - shufpd $0,%xmm4,%xmm4 - - movl nb430_type(%ebp),%edx - movl (%edx,%ebx,4),%edx - imull nb430_ntype(%esp),%edx - shll %edx - movl %edx,nb430_ntia(%esp) - - leal (%ebx,%ebx,2),%ebx ## ebx = 3*ii=ii3 - movl nb430_pos(%ebp),%eax ## eax = base of pos[] - - addsd (%eax,%ebx,8),%xmm0 - addsd 8(%eax,%ebx,8),%xmm1 - addsd 16(%eax,%ebx,8),%xmm2 - - movapd %xmm3,nb430_iq(%esp) - movapd %xmm4,nb430_isai(%esp) - - shufpd $0,%xmm0,%xmm0 - shufpd $0,%xmm1,%xmm1 - shufpd $0,%xmm2,%xmm2 - - movapd %xmm0,nb430_ix(%esp) - movapd %xmm1,nb430_iy(%esp) - movapd %xmm2,nb430_iz(%esp) - - movl %ebx,nb430_ii3(%esp) - - ## clear vctot and i forces - xorpd %xmm4,%xmm4 - movapd %xmm4,nb430_vctot(%esp) - movapd %xmm4,nb430_Vvdwtot(%esp) - movapd %xmm4,nb430_dvdasum(%esp) - movapd %xmm4,nb430_fix(%esp) - movapd %xmm4,nb430_fiy(%esp) - movapd %xmm4,nb430_fiz(%esp) - - movl nb430_jindex(%ebp),%eax - movl (%eax,%esi,4),%ecx ## jindex[n] - movl 4(%eax,%esi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movl nb430_pos(%ebp),%esi - movl nb430_faction(%ebp),%edi - movl nb430_jjnr(%ebp),%eax - shll $2,%ecx - addl %ecx,%eax - movl %eax,nb430_innerjjnr(%esp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $2,%edx - addl nb430_ninner(%esp),%ecx - movl %ecx,nb430_ninner(%esp) - addl $0,%edx - movl %edx,nb430_innerk(%esp) ## number of innerloop atoms - jge _nb_kernel430_ia32_sse2.nb430_unroll_loop - jmp _nb_kernel430_ia32_sse2.nb430_checksingle -_nb_kernel430_ia32_sse2.nb430_unroll_loop: - ## twice unrolled innerloop here - movl nb430_innerjjnr(%esp),%edx ## pointer to jjnr[k] - movl (%edx),%eax - movl 4(%edx),%ebx - addl $8,nb430_innerjjnr(%esp) ## advance pointer (unrolled 2) - - ## load isaj - movl nb430_invsqrta(%ebp),%esi - movlpd (%esi,%eax,8),%xmm2 - movhpd (%esi,%ebx,8),%xmm2 - mulpd nb430_isai(%esp),%xmm2 - movapd %xmm2,nb430_isaprod(%esp) - movapd %xmm2,%xmm1 - mulpd nb430_gbtsc(%esp),%xmm1 - movapd %xmm1,nb430_gbscale(%esp) - - movl nb430_charge(%ebp),%esi ## base of charge[] - movlpd (%esi,%eax,8),%xmm3 - movhpd (%esi,%ebx,8),%xmm3 - - mulpd nb430_iq(%esp),%xmm2 - mulpd %xmm2,%xmm3 - movapd %xmm3,nb430_qq(%esp) - - movl nb430_type(%ebp),%esi - movl (%esi,%eax,4),%ecx - movl (%esi,%ebx,4),%edx - movl nb430_vdwparam(%ebp),%esi - shll %ecx - shll %edx - movl nb430_ntia(%esp),%edi - addl %edi,%ecx - addl %edi,%edx - - movlpd (%esi,%ecx,8),%xmm6 ## c6a - movlpd (%esi,%edx,8),%xmm7 ## c6b - movhpd 8(%esi,%ecx,8),%xmm6 ## c6a c12a - movhpd 8(%esi,%edx,8),%xmm7 ## c6b c12b - - movapd %xmm6,%xmm4 - unpcklpd %xmm7,%xmm4 - unpckhpd %xmm7,%xmm6 - - movapd %xmm4,nb430_c6(%esp) - movapd %xmm6,nb430_c12(%esp) - - movl nb430_pos(%ebp),%esi ## base of pos[] - - movd %eax,%mm2 - movd %ebx,%mm3 - leal (%eax,%eax,2),%eax ## replace jnr with j3 - leal (%ebx,%ebx,2),%ebx - - ## move two coordinates to xmm0-xmm2 - movlpd (%esi,%eax,8),%xmm0 - movlpd 8(%esi,%eax,8),%xmm1 - movlpd 16(%esi,%eax,8),%xmm2 - movhpd (%esi,%ebx,8),%xmm0 - movhpd 8(%esi,%ebx,8),%xmm1 - movhpd 16(%esi,%ebx,8),%xmm2 - - movl nb430_faction(%ebp),%edi - - ## move nb430_ix-iz to xmm4-xmm6 - movapd nb430_ix(%esp),%xmm4 - movapd nb430_iy(%esp),%xmm5 - movapd nb430_iz(%esp),%xmm6 - - ## calc dr - subpd %xmm0,%xmm4 - subpd %xmm1,%xmm5 - subpd %xmm2,%xmm6 - - ## store dr - movapd %xmm4,nb430_dx(%esp) - movapd %xmm5,nb430_dy(%esp) - movapd %xmm6,nb430_dz(%esp) - ## square it - mulpd %xmm4,%xmm4 - mulpd %xmm5,%xmm5 - mulpd %xmm6,%xmm6 - addpd %xmm5,%xmm4 - addpd %xmm6,%xmm4 - ## rsq in xmm4 - - cvtpd2ps %xmm4,%xmm5 - rsqrtps %xmm5,%xmm5 - cvtps2pd %xmm5,%xmm2 ## lu in low xmm2 - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulpd %xmm2,%xmm2 ## lu*lu - movapd nb430_three(%esp),%xmm1 - mulpd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb430_half(%esp),%xmm0 - subpd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm1 - mulpd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - movapd %xmm1,%xmm5 ## copy of lu - mulpd %xmm1,%xmm1 ## lu*lu - movapd nb430_three(%esp),%xmm2 - mulpd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb430_half(%esp),%xmm0 - subpd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm2 - mulpd %xmm2,%xmm0 ## xmm0=iter2 of rinv - mulpd %xmm0,%xmm4 ## xmm4=r - movapd %xmm4,nb430_r(%esp) - mulpd nb430_gbscale(%esp),%xmm4 - - cvttpd2pi %xmm4,%mm6 ## mm6 = lu idx - cvtpi2pd %mm6,%xmm5 - subpd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - movapd %xmm1,%xmm2 - mulpd %xmm2,%xmm2 ## xmm2=eps2 - - pslld $2,%mm6 ## idx *= 4 - - movl nb430_GBtab(%ebp),%esi - movd %mm6,%ecx - psrlq $32,%mm6 - movd %mm6,%edx ## indices in eax/ebx - - ## Coulomb - movapd (%esi,%ecx,8),%xmm4 ## Y1 F1 - movapd (%esi,%edx,8),%xmm3 ## Y2 F2 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 Y2 - unpckhpd %xmm3,%xmm5 ## F1 F2 - - movapd 16(%esi,%ecx,8),%xmm6 ## G1 H1 - movapd 16(%esi,%edx,8),%xmm3 ## G2 H2 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 G2 - unpckhpd %xmm3,%xmm7 ## H1 H2 - ## coulomb table ready, in xmm4-xmm7 - mulpd %xmm1,%xmm6 ## xmm6=Geps - mulpd %xmm2,%xmm7 ## xmm7=Heps2 - addpd %xmm6,%xmm5 - addpd %xmm7,%xmm5 ## xmm5=Fp - mulpd nb430_two(%esp),%xmm7 ## two*Heps2 - movapd nb430_qq(%esp),%xmm3 - addpd %xmm6,%xmm7 - addpd %xmm5,%xmm7 ## xmm7=FF - mulpd %xmm1,%xmm5 ## xmm5=eps*Fp - addpd %xmm4,%xmm5 ## xmm5=VV - mulpd %xmm3,%xmm5 ## vcoul=qq*VV - mulpd %xmm7,%xmm3 ## fijC=FF*qq - ## get jnr from regs - movd %mm2,%ecx - movd %mm3,%edx - movl nb430_dvda(%ebp),%esi - - ## Calculate dVda - xorpd %xmm7,%xmm7 - mulpd nb430_gbscale(%esp),%xmm3 - movapd %xmm3,%xmm6 - mulpd nb430_r(%esp),%xmm6 - addpd %xmm5,%xmm6 - addpd nb430_vctot(%esp),%xmm5 - movapd %xmm5,nb430_vctot(%esp) - - ## xmm6=(vcoul+fijC*r) - subpd %xmm6,%xmm7 - movapd %xmm7,%xmm6 - - ## update dvdasum - addpd nb430_dvdasum(%esp),%xmm7 - movapd %xmm7,nb430_dvdasum(%esp) - - ## update j atoms dvdaj - movhlps %xmm6,%xmm7 - addsd (%esi,%ecx,8),%xmm6 - addsd (%esi,%edx,8),%xmm7 - movsd %xmm6,(%esi,%ecx,8) - movsd %xmm7,(%esi,%edx,8) - - ## put scalar force on stack temporarily - movapd %xmm3,nb430_fscal(%esp) - - movapd nb430_r(%esp),%xmm4 - mulpd nb430_tsc(%esp),%xmm4 - cvttpd2pi %xmm4,%mm6 ## mm6 = lu idx - cvtpi2pd %mm6,%xmm5 - subpd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - movapd %xmm1,%xmm2 - mulpd %xmm2,%xmm2 ## xmm2=eps2 - - pslld $3,%mm6 ## idx *= 8 - - movl nb430_VFtab(%ebp),%esi - - movd %mm6,%ecx - psrlq $32,%mm6 - movd %mm6,%edx ## indices in eax/ebx - - ## Dispersion - movapd (%esi,%ecx,8),%xmm4 ## Y1 F1 - movapd (%esi,%edx,8),%xmm3 ## Y2 F2 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 Y2 - unpckhpd %xmm3,%xmm5 ## F1 F2 - - movapd 16(%esi,%ecx,8),%xmm6 ## G1 H1 - movapd 16(%esi,%edx,8),%xmm3 ## G2 H2 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 G2 - unpckhpd %xmm3,%xmm7 ## H1 H2 - ## Dispersion table ready, in xmm4-xmm7 - mulpd %xmm1,%xmm6 ## xmm6=Geps - mulpd %xmm2,%xmm7 ## xmm7=Heps2 - addpd %xmm6,%xmm5 - addpd %xmm7,%xmm5 ## xmm5=Fp - mulpd nb430_two(%esp),%xmm7 ## two*Heps2 - addpd %xmm6,%xmm7 - addpd %xmm5,%xmm7 ## xmm7=FF - mulpd %xmm1,%xmm5 ## xmm5=eps*Fp - addpd %xmm4,%xmm5 ## xmm5=VV - - movapd nb430_c6(%esp),%xmm4 - mulpd %xmm4,%xmm7 ## fijD - mulpd %xmm4,%xmm5 ## Vvdw6 - mulpd nb430_tsc(%esp),%xmm7 - addpd nb430_fscal(%esp),%xmm7 ## add to fscal - - ## put scalar force back on stack Update Vvdwtot directly - addpd nb430_Vvdwtot(%esp),%xmm5 - movapd %xmm7,nb430_fscal(%esp) - movapd %xmm5,nb430_Vvdwtot(%esp) - - ## Repulsion - movapd 32(%esi,%ecx,8),%xmm4 ## Y1 F1 - movapd 32(%esi,%edx,8),%xmm3 ## Y2 F2 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 Y2 - unpckhpd %xmm3,%xmm5 ## F1 F2 - - movapd 48(%esi,%ecx,8),%xmm6 ## G1 H1 - movapd 48(%esi,%edx,8),%xmm3 ## G2 H2 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 G2 - unpckhpd %xmm3,%xmm7 ## H1 H2 - ## Dispersion table ready, in xmm4-xmm7 - mulpd %xmm1,%xmm6 ## xmm6=Geps - mulpd %xmm2,%xmm7 ## xmm7=Heps2 - addpd %xmm6,%xmm5 - addpd %xmm7,%xmm5 ## xmm5=Fp - mulpd nb430_two(%esp),%xmm7 ## two*Heps2 - addpd %xmm6,%xmm7 - addpd %xmm5,%xmm7 ## xmm7=FF - mulpd %xmm1,%xmm5 ## xmm5=eps*Fp - addpd %xmm4,%xmm5 ## xmm5=VV - - movapd nb430_c12(%esp),%xmm4 - mulpd %xmm4,%xmm7 ## fijR - mulpd %xmm4,%xmm5 ## Vvdw12 - mulpd nb430_tsc(%esp),%xmm7 - addpd nb430_fscal(%esp),%xmm7 - - addpd nb430_Vvdwtot(%esp),%xmm5 - movapd %xmm5,nb430_Vvdwtot(%esp) - xorpd %xmm4,%xmm4 - - mulpd %xmm0,%xmm7 - subpd %xmm7,%xmm4 - - movapd nb430_dx(%esp),%xmm0 - movapd nb430_dy(%esp),%xmm1 - movapd nb430_dz(%esp),%xmm2 - - movl nb430_faction(%ebp),%edi - mulpd %xmm4,%xmm0 - mulpd %xmm4,%xmm1 - mulpd %xmm4,%xmm2 - ## xmm0-xmm2 contains tx-tz (partial force) - ## now update f_i - movapd nb430_fix(%esp),%xmm3 - movapd nb430_fiy(%esp),%xmm4 - movapd nb430_fiz(%esp),%xmm5 - addpd %xmm0,%xmm3 - addpd %xmm1,%xmm4 - addpd %xmm2,%xmm5 - movapd %xmm3,nb430_fix(%esp) - movapd %xmm4,nb430_fiy(%esp) - movapd %xmm5,nb430_fiz(%esp) - ## the fj's - start by accumulating forces from memory - movlpd (%edi,%eax,8),%xmm3 - movlpd 8(%edi,%eax,8),%xmm4 - movlpd 16(%edi,%eax,8),%xmm5 - movhpd (%edi,%ebx,8),%xmm3 - movhpd 8(%edi,%ebx,8),%xmm4 - movhpd 16(%edi,%ebx,8),%xmm5 - subpd %xmm0,%xmm3 - subpd %xmm1,%xmm4 - subpd %xmm2,%xmm5 - movlpd %xmm3,(%edi,%eax,8) - movlpd %xmm4,8(%edi,%eax,8) - movlpd %xmm5,16(%edi,%eax,8) - movhpd %xmm3,(%edi,%ebx,8) - movhpd %xmm4,8(%edi,%ebx,8) - movhpd %xmm5,16(%edi,%ebx,8) - - ## should we do one more iteration? - subl $2,nb430_innerk(%esp) - jl _nb_kernel430_ia32_sse2.nb430_checksingle - jmp _nb_kernel430_ia32_sse2.nb430_unroll_loop -_nb_kernel430_ia32_sse2.nb430_checksingle: - movl nb430_innerk(%esp),%edx - andl $1,%edx - jnz _nb_kernel430_ia32_sse2.nb430_dosingle - jmp _nb_kernel430_ia32_sse2.nb430_updateouterdata -_nb_kernel430_ia32_sse2.nb430_dosingle: - movl nb430_charge(%ebp),%esi - movl nb430_invsqrta(%ebp),%edx - movl nb430_pos(%ebp),%edi - movl nb430_innerjjnr(%esp),%ecx - movl (%ecx),%eax - - xorpd %xmm6,%xmm6 - movapd %xmm6,%xmm7 - movsd (%edx,%eax,8),%xmm7 - movlpd (%esi,%eax,8),%xmm6 ## xmm6(0) has the charge - mulsd nb430_isai(%esp),%xmm7 - movapd %xmm7,nb430_isaprod(%esp) - movapd %xmm7,%xmm1 - mulpd nb430_gbtsc(%esp),%xmm1 - movapd %xmm1,nb430_gbscale(%esp) - - mulsd nb430_iq(%esp),%xmm7 - mulsd %xmm7,%xmm6 - movapd %xmm6,nb430_qq(%esp) - - movl nb430_type(%ebp),%esi - movl (%esi,%eax,4),%edx - movl nb430_vdwparam(%ebp),%esi - shll %edx - movl nb430_ntia(%esp),%edi - addl %edi,%edx - - movlpd (%esi,%edx,8),%xmm6 ## c6a - movhpd 8(%esi,%edx,8),%xmm6 ## c6a c12a - - xorpd %xmm7,%xmm7 - movapd %xmm6,%xmm4 - unpcklpd %xmm7,%xmm4 - unpckhpd %xmm7,%xmm6 - - movapd %xmm4,nb430_c6(%esp) - movapd %xmm6,nb430_c12(%esp) - - movl nb430_pos(%ebp),%esi ## base of pos[] - - movd %eax,%mm2 - leal (%eax,%eax,2),%eax ## replace jnr with j3 - - ## move two coordinates to xmm0-xmm2 - movlpd (%esi,%eax,8),%xmm0 - movlpd 8(%esi,%eax,8),%xmm1 - movlpd 16(%esi,%eax,8),%xmm2 - - movl nb430_faction(%ebp),%edi - - ## move nb430_ix-iz to xmm4-xmm6 - movapd nb430_ix(%esp),%xmm4 - movapd nb430_iy(%esp),%xmm5 - movapd nb430_iz(%esp),%xmm6 - - ## calc dr - subsd %xmm0,%xmm4 - subsd %xmm1,%xmm5 - subsd %xmm2,%xmm6 - - ## store dr - movapd %xmm4,nb430_dx(%esp) - movapd %xmm5,nb430_dy(%esp) - movapd %xmm6,nb430_dz(%esp) - ## square it - mulsd %xmm4,%xmm4 - mulsd %xmm5,%xmm5 - mulsd %xmm6,%xmm6 - addsd %xmm5,%xmm4 - addsd %xmm6,%xmm4 - ## rsq in xmm4 - - cvtsd2ss %xmm4,%xmm5 - rsqrtss %xmm5,%xmm5 - cvtss2sd %xmm5,%xmm2 ## lu in low xmm2 - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulsd %xmm2,%xmm2 ## lu*lu - movapd nb430_three(%esp),%xmm1 - mulsd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb430_half(%esp),%xmm0 - subsd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm1 - mulsd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - movapd %xmm1,%xmm5 ## copy of lu - mulsd %xmm1,%xmm1 ## lu*lu - movapd nb430_three(%esp),%xmm2 - mulsd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb430_half(%esp),%xmm0 - subsd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm2 - mulsd %xmm2,%xmm0 ## xmm0=iter2 of rinv (new lu) - mulsd %xmm0,%xmm4 ## xmm4=r - movsd %xmm4,nb430_r(%esp) - mulsd nb430_gbscale(%esp),%xmm4 - - cvttsd2si %xmm4,%edx ## mm6 = lu idx - cvtsi2sd %edx,%xmm5 - subsd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - movapd %xmm1,%xmm2 - mulsd %xmm2,%xmm2 ## xmm2=eps2 - - shll $2,%edx ## idx *= 4 - movl nb430_GBtab(%ebp),%esi - - ## Coulomb - movapd (%esi,%edx,8),%xmm4 ## Y1 F1 - xorpd %xmm3,%xmm3 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 - unpckhpd %xmm3,%xmm5 ## F1 - - movapd 16(%esi,%edx,8),%xmm6 ## G1 H1 - xorpd %xmm3,%xmm3 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 - unpckhpd %xmm3,%xmm7 ## H1 - ## coulomb table ready, in xmm4-xmm7 - mulsd %xmm1,%xmm6 ## xmm6=Geps - mulsd %xmm2,%xmm7 ## xmm7=Heps2 - addsd %xmm6,%xmm5 - addsd %xmm7,%xmm5 ## xmm5=Fp - mulsd nb430_two(%esp),%xmm7 ## two*Heps2 - movapd nb430_qq(%esp),%xmm3 - addsd %xmm6,%xmm7 - addsd %xmm5,%xmm7 ## xmm7=FF - mulsd %xmm1,%xmm5 ## xmm5=eps*Fp - addsd %xmm4,%xmm5 ## xmm5=VV - mulsd %xmm3,%xmm5 ## vcoul=qq*VV - mulsd %xmm7,%xmm3 ## fijC=FF*qq - ## get jnr from regs - movd %mm2,%ebx - movl nb430_dvda(%ebp),%esi - - ## Calculate dVda - xorpd %xmm7,%xmm7 - mulsd nb430_gbscale(%esp),%xmm3 - movsd %xmm3,%xmm6 - mulsd nb430_r(%esp),%xmm6 - addsd %xmm5,%xmm6 - addsd nb430_vctot(%esp),%xmm5 - movsd %xmm5,nb430_vctot(%esp) - - ## xmm6=(vcoul+fijC*r) - subpd %xmm6,%xmm7 - movsd %xmm7,%xmm6 - - ## update dvdasum - addsd nb430_dvdasum(%esp),%xmm7 - movsd %xmm7,nb430_dvdasum(%esp) - - ## update j atoms dvdaj - addsd (%esi,%ebx,8),%xmm6 - movsd %xmm6,(%esi,%ebx,8) - - ## put scalar force on stack temporarily - movsd %xmm3,nb430_fscal(%esp) - - movsd nb430_r(%esp),%xmm4 - mulsd nb430_tsc(%esp),%xmm4 - cvttsd2si %xmm4,%edx ## mm6 = lu idx - cvtsi2sd %edx,%xmm5 - subsd %xmm5,%xmm4 - movsd %xmm4,%xmm1 ## xmm1=eps - movsd %xmm1,%xmm2 - mulsd %xmm2,%xmm2 ## xmm2=eps2 - - shll $3,%edx - - movl nb430_VFtab(%ebp),%esi - - ## Dispersion - movapd (%esi,%edx,8),%xmm4 ## Y1 F1 - xorpd %xmm3,%xmm3 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 - unpckhpd %xmm3,%xmm5 ## F1 - - movapd 16(%esi,%edx,8),%xmm6 ## G1 H1 - xorpd %xmm3,%xmm3 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 - unpckhpd %xmm3,%xmm7 ## H1 - ## Dispersion table ready, in xmm4-xmm7 - mulsd %xmm1,%xmm6 ## xmm6=Geps - mulsd %xmm2,%xmm7 ## xmm7=Heps2 - addsd %xmm6,%xmm5 - addsd %xmm7,%xmm5 ## xmm5=Fp - mulsd nb430_two(%esp),%xmm7 ## two*Heps2 - movapd nb430_qq(%esp),%xmm3 - addsd %xmm6,%xmm7 - addsd %xmm5,%xmm7 ## xmm7=FF - mulsd %xmm1,%xmm5 ## xmm5=eps*Fp - addsd %xmm4,%xmm5 ## xmm5=VV - - movapd nb430_c6(%esp),%xmm4 - mulsd %xmm4,%xmm7 ## fijD - mulsd %xmm4,%xmm5 ## Vvdw6 - mulpd nb430_tsc(%esp),%xmm7 - addsd nb430_fscal(%esp),%xmm7 ## add to fscal - - ## put scalar force back on stack Update Vvdwtot directly - addsd nb430_Vvdwtot(%esp),%xmm5 - movlpd %xmm7,nb430_fscal(%esp) - movlpd %xmm5,nb430_Vvdwtot(%esp) - - ## Repulsion - movapd 32(%esi,%edx,8),%xmm4 ## Y1 F1 - xorpd %xmm3,%xmm3 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 - unpckhpd %xmm3,%xmm5 ## F1 - - movapd 48(%esi,%edx,8),%xmm6 ## G1 H1 - xorpd %xmm3,%xmm3 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 - unpckhpd %xmm3,%xmm7 ## H1 - ## Dispersion table ready, in xmm4-xmm7 - mulsd %xmm1,%xmm6 ## xmm6=Geps - mulsd %xmm2,%xmm7 ## xmm7=Heps2 - addsd %xmm6,%xmm5 - addsd %xmm7,%xmm5 ## xmm5=Fp - mulsd nb430_two(%esp),%xmm7 ## two*Heps2 - movapd nb430_qq(%esp),%xmm3 - addsd %xmm6,%xmm7 - addsd %xmm5,%xmm7 ## xmm7=FF - mulsd %xmm1,%xmm5 ## xmm5=eps*Fp - addsd %xmm4,%xmm5 ## xmm5=VV - - movapd nb430_c12(%esp),%xmm4 - mulsd %xmm4,%xmm7 ## fijR - mulsd %xmm4,%xmm5 ## Vvdw12 - mulpd nb430_tsc(%esp),%xmm7 - addsd nb430_fscal(%esp),%xmm7 - - addsd nb430_Vvdwtot(%esp),%xmm5 - movlpd %xmm5,nb430_Vvdwtot(%esp) - xorpd %xmm4,%xmm4 - - mulsd %xmm0,%xmm7 - subsd %xmm7,%xmm4 - - movapd nb430_dx(%esp),%xmm0 - movapd nb430_dy(%esp),%xmm1 - movapd nb430_dz(%esp),%xmm2 - - movl nb430_faction(%ebp),%edi - mulsd %xmm4,%xmm0 - mulsd %xmm4,%xmm1 - mulsd %xmm4,%xmm2 - ## xmm0-xmm2 contains tx-tz (partial force) - ## now update f_i - movapd nb430_fix(%esp),%xmm3 - movapd nb430_fiy(%esp),%xmm4 - movapd nb430_fiz(%esp),%xmm5 - addsd %xmm0,%xmm3 - addsd %xmm1,%xmm4 - addsd %xmm2,%xmm5 - movlpd %xmm3,nb430_fix(%esp) - movlpd %xmm4,nb430_fiy(%esp) - movlpd %xmm5,nb430_fiz(%esp) - ## the fj's - start by accumulating forces from memory - movlpd (%edi,%eax,8),%xmm3 - movlpd 8(%edi,%eax,8),%xmm4 - movlpd 16(%edi,%eax,8),%xmm5 - subsd %xmm0,%xmm3 - subsd %xmm1,%xmm4 - subsd %xmm2,%xmm5 - movlpd %xmm3,(%edi,%eax,8) - movlpd %xmm4,8(%edi,%eax,8) - movlpd %xmm5,16(%edi,%eax,8) -_nb_kernel430_ia32_sse2.nb430_updateouterdata: - movl nb430_ii3(%esp),%ecx - movl nb430_faction(%ebp),%edi - movl nb430_fshift(%ebp),%esi - movl nb430_is3(%esp),%edx - - ## accumulate i forces in xmm0, xmm1, xmm2 - movapd nb430_fix(%esp),%xmm0 - movapd nb430_fiy(%esp),%xmm1 - movapd nb430_fiz(%esp),%xmm2 - - movhlps %xmm0,%xmm3 - movhlps %xmm1,%xmm4 - movhlps %xmm2,%xmm5 - addsd %xmm3,%xmm0 - addsd %xmm4,%xmm1 - addsd %xmm5,%xmm2 ## sum is in low xmm0-xmm2 - - ## increment i force - movsd (%edi,%ecx,8),%xmm3 - movsd 8(%edi,%ecx,8),%xmm4 - movsd 16(%edi,%ecx,8),%xmm5 - addsd %xmm0,%xmm3 - addsd %xmm1,%xmm4 - addsd %xmm2,%xmm5 - movsd %xmm3,(%edi,%ecx,8) - movsd %xmm4,8(%edi,%ecx,8) - movsd %xmm5,16(%edi,%ecx,8) - - ## increment fshift force - movsd (%esi,%edx,8),%xmm3 - movsd 8(%esi,%edx,8),%xmm4 - movsd 16(%esi,%edx,8),%xmm5 - addsd %xmm0,%xmm3 - addsd %xmm1,%xmm4 - addsd %xmm2,%xmm5 - movsd %xmm3,(%esi,%edx,8) - movsd %xmm4,8(%esi,%edx,8) - movsd %xmm5,16(%esi,%edx,8) - - ## get n from stack - movl nb430_n(%esp),%esi - ## get group index for i particle - movl nb430_gid(%ebp),%edx ## base of gid[] - movl (%edx,%esi,4),%edx ## ggid=gid[n] - - ## accumulate total potential energy and update it - movapd nb430_vctot(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addsd %xmm6,%xmm7 ## low xmm7 has the sum now - - ## add earlier value from mem - movl nb430_Vc(%ebp),%eax - addsd (%eax,%edx,8),%xmm7 - ## move back to mem - movsd %xmm7,(%eax,%edx,8) - - ## accumulate total lj energy and update it - movapd nb430_Vvdwtot(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addsd %xmm6,%xmm7 ## low xmm7 has the sum now - - ## add earlier value from mem - movl nb430_Vvdw(%ebp),%eax - addsd (%eax,%edx,8),%xmm7 - ## move back to mem - movsd %xmm7,(%eax,%edx,8) - - ## accumulate dVda and update it - movapd nb430_dvdasum(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addsd %xmm6,%xmm7 ## low xmm7 has the sum now - - movl nb430_ii(%esp),%edx - movl nb430_dvda(%ebp),%eax - addsd (%eax,%edx,8),%xmm7 - movsd %xmm7,(%eax,%edx,8) - - ## finish if last - movl nb430_nn1(%esp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel430_ia32_sse2.nb430_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb430_n(%esp) - jmp _nb_kernel430_ia32_sse2.nb430_outer -_nb_kernel430_ia32_sse2.nb430_outerend: - ## check if more outer neighborlists remain - movl nb430_nri(%esp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel430_ia32_sse2.nb430_end - ## non-zero, do one more workunit - jmp _nb_kernel430_ia32_sse2.nb430_threadloop -_nb_kernel430_ia32_sse2.nb430_end: - emms - - movl nb430_nouter(%esp),%eax - movl nb430_ninner(%esp),%ebx - movl nb430_outeriter(%ebp),%ecx - movl nb430_inneriter(%ebp),%edx - movl %eax,(%ecx) - movl %ebx,(%edx) - - movl nb430_salign(%esp),%eax - addl %eax,%esp - addl $484,%esp - popl %edi - popl %esi - popl %edx - popl %ecx - popl %ebx - popl %eax - leave - ret - - - - - -.globl nb_kernel430nf_ia32_sse2 -.globl _nb_kernel430nf_ia32_sse2 -nb_kernel430nf_ia32_sse2: -_nb_kernel430nf_ia32_sse2: -.set nb430nf_p_nri, 8 -.set nb430nf_iinr, 12 -.set nb430nf_jindex, 16 -.set nb430nf_jjnr, 20 -.set nb430nf_shift, 24 -.set nb430nf_shiftvec, 28 -.set nb430nf_fshift, 32 -.set nb430nf_gid, 36 -.set nb430nf_pos, 40 -.set nb430nf_faction, 44 -.set nb430nf_charge, 48 -.set nb430nf_p_facel, 52 -.set nb430nf_argkrf, 56 -.set nb430nf_argcrf, 60 -.set nb430nf_Vc, 64 -.set nb430nf_type, 68 -.set nb430nf_p_ntype, 72 -.set nb430nf_vdwparam, 76 -.set nb430nf_Vvdw, 80 -.set nb430nf_p_tabscale, 84 -.set nb430nf_VFtab, 88 -.set nb430nf_invsqrta, 92 -.set nb430nf_dvda, 96 -.set nb430nf_p_gbtabscale, 100 -.set nb430nf_GBtab, 104 -.set nb430nf_p_nthreads, 108 -.set nb430nf_count, 112 -.set nb430nf_mtx, 116 -.set nb430nf_outeriter, 120 -.set nb430nf_inneriter, 124 -.set nb430nf_work, 128 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse2 use -.set nb430nf_ix, 0 -.set nb430nf_iy, 16 -.set nb430nf_iz, 32 -.set nb430nf_iq, 48 -.set nb430nf_gbtsc, 64 -.set nb430nf_tsc, 80 -.set nb430nf_qq, 96 -.set nb430nf_c6, 112 -.set nb430nf_c12, 128 -.set nb430nf_vctot, 144 -.set nb430nf_Vvdwtot, 160 -.set nb430nf_half, 176 -.set nb430nf_three, 192 -.set nb430nf_r, 208 -.set nb430nf_isai, 224 -.set nb430nf_isaprod, 240 -.set nb430nf_gbscale, 256 -.set nb430nf_is3, 272 -.set nb430nf_ii3, 276 -.set nb430nf_ntia, 280 -.set nb430nf_innerjjnr, 284 -.set nb430nf_innerk, 288 -.set nb430nf_n, 292 -.set nb430nf_nn1, 296 -.set nb430nf_nri, 300 -.set nb430nf_facel, 304 ## uses 8 bytes -.set nb430nf_ntype, 312 -.set nb430nf_nouter, 316 -.set nb430nf_ninner, 320 -.set nb430nf_salign, 324 - pushl %ebp - movl %esp,%ebp - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - pushl %esi - pushl %edi - subl $328,%esp ## local stack space - movl %esp,%eax - andl $0xf,%eax - subl %eax,%esp - movl %eax,nb430nf_salign(%esp) - - emms - - ## Move args passed by reference to stack - movl nb430nf_p_nri(%ebp),%ecx - movl nb430nf_p_facel(%ebp),%esi - movl nb430nf_p_ntype(%ebp),%edi - movl (%ecx),%ecx - movsd (%esi),%xmm7 - movl (%edi),%edi - movl %ecx,nb430nf_nri(%esp) - movsd %xmm7,nb430nf_facel(%esp) - movl %edi,nb430nf_ntype(%esp) - - ## zero iteration counters - movl $0,%eax - movl %eax,nb430nf_nouter(%esp) - movl %eax,nb430nf_ninner(%esp) - - - ## create constant floating-point factors on stack - movl $0x00000000,%eax ## lower half of double 0.5 IEEE (hex) - movl $0x3fe00000,%ebx - movl %eax,nb430nf_half(%esp) - movl %ebx,nb430nf_half+4(%esp) - movsd nb430nf_half(%esp),%xmm1 - shufpd $0,%xmm1,%xmm1 ## splat to all elements - movapd %xmm1,%xmm3 - addpd %xmm3,%xmm3 ## 1.0 - movapd %xmm3,%xmm2 - addpd %xmm2,%xmm2 ## 2.0 - addpd %xmm2,%xmm3 ## 3.0 - movapd %xmm1,nb430nf_half(%esp) - movapd %xmm3,nb430nf_three(%esp) - movl nb430nf_p_tabscale(%ebp),%eax - movsd (%eax),%xmm3 - movl nb430nf_p_gbtabscale(%ebp),%eax - movsd (%eax),%xmm4 - shufpd $0,%xmm3,%xmm3 - shufpd $0,%xmm4,%xmm4 - movapd %xmm3,nb430nf_tsc(%esp) - movapd %xmm4,nb430nf_gbtsc(%esp) - -_nb_kernel430nf_ia32_sse2.nb430nf_threadloop: - movl nb430nf_count(%ebp),%esi ## pointer to sync counter - movl (%esi),%eax -_nb_kernel430nf_ia32_sse2.nb430nf_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%esi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel430nf_ia32_sse2.nb430nf_spinlock - - ## if(nn1>nri) nn1=nri - movl nb430nf_nri(%esp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb430nf_n(%esp) - movl %ebx,nb430nf_nn1(%esp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel430nf_ia32_sse2.nb430nf_outerstart - jmp _nb_kernel430nf_ia32_sse2.nb430nf_end - -_nb_kernel430nf_ia32_sse2.nb430nf_outerstart: - ## ebx contains number of outer iterations - addl nb430nf_nouter(%esp),%ebx - movl %ebx,nb430nf_nouter(%esp) - -_nb_kernel430nf_ia32_sse2.nb430nf_outer: - movl nb430nf_shift(%ebp),%eax ## eax = pointer into shift[] - movl (%eax,%esi,4),%ebx ## ebx=shift[n] - - leal (%ebx,%ebx,2),%ebx ## ebx=3*is - movl %ebx,nb430nf_is3(%esp) ## store is3 - - movl nb430nf_shiftvec(%ebp),%eax ## eax = base of shiftvec[] - - movsd (%eax,%ebx,8),%xmm0 - movsd 8(%eax,%ebx,8),%xmm1 - movsd 16(%eax,%ebx,8),%xmm2 - - movl nb430nf_iinr(%ebp),%ecx ## ecx = pointer into iinr[] - movl (%ecx,%esi,4),%ebx ## ebx =ii - - movl nb430nf_charge(%ebp),%edx - movsd (%edx,%ebx,8),%xmm3 - mulsd nb430nf_facel(%esp),%xmm3 - shufpd $0,%xmm3,%xmm3 - - movl nb430nf_invsqrta(%ebp),%edx ## load invsqrta[ii] - movsd (%edx,%ebx,8),%xmm4 - shufpd $0,%xmm4,%xmm4 - - movl nb430nf_type(%ebp),%edx - movl (%edx,%ebx,4),%edx - imull nb430nf_ntype(%esp),%edx - shll %edx - movl %edx,nb430nf_ntia(%esp) - - leal (%ebx,%ebx,2),%ebx ## ebx = 3*ii=ii3 - movl nb430nf_pos(%ebp),%eax ## eax = base of pos[] - - addsd (%eax,%ebx,8),%xmm0 - addsd 8(%eax,%ebx,8),%xmm1 - addsd 16(%eax,%ebx,8),%xmm2 - - movapd %xmm3,nb430nf_iq(%esp) - movapd %xmm4,nb430nf_isai(%esp) - - shufpd $0,%xmm0,%xmm0 - shufpd $0,%xmm1,%xmm1 - shufpd $0,%xmm2,%xmm2 - - movapd %xmm0,nb430nf_ix(%esp) - movapd %xmm1,nb430nf_iy(%esp) - movapd %xmm2,nb430nf_iz(%esp) - - movl %ebx,nb430nf_ii3(%esp) - - ## clear vctot - xorpd %xmm4,%xmm4 - movapd %xmm4,nb430nf_vctot(%esp) - movapd %xmm4,nb430nf_Vvdwtot(%esp) - - movl nb430nf_jindex(%ebp),%eax - movl (%eax,%esi,4),%ecx ## jindex[n] - movl 4(%eax,%esi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movl nb430nf_pos(%ebp),%esi - movl nb430nf_faction(%ebp),%edi - movl nb430nf_jjnr(%ebp),%eax - shll $2,%ecx - addl %ecx,%eax - movl %eax,nb430nf_innerjjnr(%esp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $2,%edx - addl nb430nf_ninner(%esp),%ecx - movl %ecx,nb430nf_ninner(%esp) - addl $0,%edx - movl %edx,nb430nf_innerk(%esp) ## number of innerloop atoms - jge _nb_kernel430nf_ia32_sse2.nb430nf_unroll_loop - jmp _nb_kernel430nf_ia32_sse2.nb430nf_checksingle -_nb_kernel430nf_ia32_sse2.nb430nf_unroll_loop: - ## twice unrolled innerloop here - movl nb430nf_innerjjnr(%esp),%edx ## pointer to jjnr[k] - movl (%edx),%eax - movl 4(%edx),%ebx - addl $8,nb430nf_innerjjnr(%esp) ## advance pointer (unrolled 2) - - ## load isaj - movl nb430nf_invsqrta(%ebp),%esi - movlpd (%esi,%eax,8),%xmm2 - movhpd (%esi,%ebx,8),%xmm2 - mulpd nb430nf_isai(%esp),%xmm2 - movapd %xmm2,nb430nf_isaprod(%esp) - movapd %xmm2,%xmm1 - mulpd nb430nf_gbtsc(%esp),%xmm1 - movapd %xmm1,nb430nf_gbscale(%esp) - - movl nb430nf_charge(%ebp),%esi ## base of charge[] - movlpd (%esi,%eax,8),%xmm3 - movhpd (%esi,%ebx,8),%xmm3 - - mulpd nb430nf_iq(%esp),%xmm2 - mulpd %xmm2,%xmm3 - movapd %xmm3,nb430nf_qq(%esp) - - movl nb430nf_type(%ebp),%esi - movl (%esi,%eax,4),%ecx - movl (%esi,%ebx,4),%edx - movl nb430nf_vdwparam(%ebp),%esi - shll %ecx - shll %edx - movl nb430nf_ntia(%esp),%edi - addl %edi,%ecx - addl %edi,%edx - - movlpd (%esi,%ecx,8),%xmm6 ## c6a - movlpd (%esi,%edx,8),%xmm7 ## c6b - movhpd 8(%esi,%ecx,8),%xmm6 ## c6a c12a - movhpd 8(%esi,%edx,8),%xmm7 ## c6b c12b - - movapd %xmm6,%xmm4 - unpcklpd %xmm7,%xmm4 - unpckhpd %xmm7,%xmm6 - - movapd %xmm4,nb430nf_c6(%esp) - movapd %xmm6,nb430nf_c12(%esp) - - movl nb430nf_pos(%ebp),%esi ## base of pos[] - - leal (%eax,%eax,2),%eax ## replace jnr with j3 - leal (%ebx,%ebx,2),%ebx - - ## move two coordinates to xmm0-xmm2 - movlpd (%esi,%eax,8),%xmm0 - movlpd 8(%esi,%eax,8),%xmm1 - movlpd 16(%esi,%eax,8),%xmm2 - movhpd (%esi,%ebx,8),%xmm0 - movhpd 8(%esi,%ebx,8),%xmm1 - movhpd 16(%esi,%ebx,8),%xmm2 - - movl nb430nf_faction(%ebp),%edi - - ## move nb430nf_ix-iz to xmm4-xmm6 - movapd nb430nf_ix(%esp),%xmm4 - movapd nb430nf_iy(%esp),%xmm5 - movapd nb430nf_iz(%esp),%xmm6 - - ## calc dr - subpd %xmm0,%xmm4 - subpd %xmm1,%xmm5 - subpd %xmm2,%xmm6 - - ## square it - mulpd %xmm4,%xmm4 - mulpd %xmm5,%xmm5 - mulpd %xmm6,%xmm6 - addpd %xmm5,%xmm4 - addpd %xmm6,%xmm4 - ## rsq in xmm4 - - cvtpd2ps %xmm4,%xmm5 - rsqrtps %xmm5,%xmm5 - cvtps2pd %xmm5,%xmm2 ## lu in low xmm2 - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulpd %xmm2,%xmm2 ## lu*lu - movapd nb430nf_three(%esp),%xmm1 - mulpd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb430nf_half(%esp),%xmm0 - subpd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm1 - mulpd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - movapd %xmm1,%xmm5 ## copy of lu - mulpd %xmm1,%xmm1 ## lu*lu - movapd nb430nf_three(%esp),%xmm2 - mulpd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb430nf_half(%esp),%xmm0 - subpd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm2 - mulpd %xmm2,%xmm0 ## xmm0=iter2 of rinv - mulpd %xmm0,%xmm4 ## xmm4=r - movapd %xmm4,nb430nf_r(%esp) - mulpd nb430nf_gbscale(%esp),%xmm4 - - cvttpd2pi %xmm4,%mm6 ## mm6 = lu idx - cvtpi2pd %mm6,%xmm5 - subpd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - movapd %xmm1,%xmm2 - mulpd %xmm2,%xmm2 ## xmm2=eps2 - - pslld $2,%mm6 ## idx *= 4 - - movl nb430nf_GBtab(%ebp),%esi - movd %mm6,%ecx - psrlq $32,%mm6 - movd %mm6,%edx ## indices in eax/ebx - - ## Coulomb - movapd (%esi,%ecx,8),%xmm4 ## Y1 F1 - movapd (%esi,%edx,8),%xmm3 ## Y2 F2 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 Y2 - unpckhpd %xmm3,%xmm5 ## F1 F2 - - movapd 16(%esi,%ecx,8),%xmm6 ## G1 H1 - movapd 16(%esi,%edx,8),%xmm3 ## G2 H2 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 G2 - unpckhpd %xmm3,%xmm7 ## H1 H2 - ## coulomb table ready, in xmm4-xmm7 - mulpd %xmm1,%xmm6 ## xmm6=Geps - mulpd %xmm2,%xmm7 ## xmm7=Heps2 - addpd %xmm6,%xmm5 - addpd %xmm7,%xmm5 ## xmm5=Fp - movapd nb430nf_qq(%esp),%xmm3 - mulpd %xmm1,%xmm5 ## xmm5=eps*Fp - addpd %xmm4,%xmm5 ## xmm5=VV - mulpd %xmm3,%xmm5 ## vcoul=qq*VV - addpd nb430nf_vctot(%esp),%xmm5 - movapd %xmm5,nb430nf_vctot(%esp) - - movapd nb430nf_r(%esp),%xmm4 - mulpd nb430nf_tsc(%esp),%xmm4 - cvttpd2pi %xmm4,%mm6 ## mm6 = lu idx - cvtpi2pd %mm6,%xmm5 - subpd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - movapd %xmm1,%xmm2 - mulpd %xmm2,%xmm2 ## xmm2=eps2 - - pslld $3,%mm6 ## idx *= 8 - - movl nb430nf_VFtab(%ebp),%esi - - movd %mm6,%ecx - psrlq $32,%mm6 - movd %mm6,%edx ## indices in eax/ebx - - ## Dispersion - movapd (%esi,%ecx,8),%xmm4 ## Y1 F1 - movapd (%esi,%edx,8),%xmm3 ## Y2 F2 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 Y2 - unpckhpd %xmm3,%xmm5 ## F1 F2 - - movapd 16(%esi,%ecx,8),%xmm6 ## G1 H1 - movapd 16(%esi,%edx,8),%xmm3 ## G2 H2 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 G2 - unpckhpd %xmm3,%xmm7 ## H1 H2 - ## Dispersion table ready, in xmm4-xmm7 - mulpd %xmm1,%xmm6 ## xmm6=Geps - mulpd %xmm2,%xmm7 ## xmm7=Heps2 - addpd %xmm6,%xmm5 - addpd %xmm7,%xmm5 ## xmm5=Fp - mulpd %xmm1,%xmm5 ## xmm5=eps*Fp - addpd %xmm4,%xmm5 ## xmm5=VV - - mulpd nb430nf_c6(%esp),%xmm5 ## Vvdw6 - addpd nb430nf_Vvdwtot(%esp),%xmm5 - movapd %xmm5,nb430nf_Vvdwtot(%esp) - - ## Repulsion - movapd 32(%esi,%ecx,8),%xmm4 ## Y1 F1 - movapd 32(%esi,%edx,8),%xmm3 ## Y2 F2 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 Y2 - unpckhpd %xmm3,%xmm5 ## F1 F2 - - movapd 48(%esi,%ecx,8),%xmm6 ## G1 H1 - movapd 48(%esi,%edx,8),%xmm3 ## G2 H2 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 G2 - unpckhpd %xmm3,%xmm7 ## H1 H2 - ## Dispersion table ready, in xmm4-xmm7 - mulpd %xmm1,%xmm6 ## xmm6=Geps - mulpd %xmm2,%xmm7 ## xmm7=Heps2 - addpd %xmm6,%xmm5 - addpd %xmm7,%xmm5 ## xmm5=Fp - mulpd %xmm1,%xmm5 ## xmm5=eps*Fp - addpd %xmm4,%xmm5 ## xmm5=VV - - mulpd nb430nf_c12(%esp),%xmm5 ## Vvdw12 - addpd nb430nf_Vvdwtot(%esp),%xmm5 - movapd %xmm5,nb430nf_Vvdwtot(%esp) - xorpd %xmm4,%xmm4 - - ## should we do one more iteration? - subl $2,nb430nf_innerk(%esp) - jl _nb_kernel430nf_ia32_sse2.nb430nf_checksingle - jmp _nb_kernel430nf_ia32_sse2.nb430nf_unroll_loop -_nb_kernel430nf_ia32_sse2.nb430nf_checksingle: - movl nb430nf_innerk(%esp),%edx - andl $1,%edx - jnz _nb_kernel430nf_ia32_sse2.nb430nf_dosingle - jmp _nb_kernel430nf_ia32_sse2.nb430nf_updateouterdata -_nb_kernel430nf_ia32_sse2.nb430nf_dosingle: - movl nb430nf_charge(%ebp),%esi - movl nb430nf_invsqrta(%ebp),%edx - movl nb430nf_pos(%ebp),%edi - movl nb430nf_innerjjnr(%esp),%ecx - movl (%ecx),%eax - - xorpd %xmm6,%xmm6 - movapd %xmm6,%xmm7 - movsd (%edx,%eax,8),%xmm7 - movlpd (%esi,%eax,8),%xmm6 ## xmm6(0) has the charge - mulsd nb430nf_isai(%esp),%xmm7 - movapd %xmm7,nb430nf_isaprod(%esp) - movapd %xmm7,%xmm1 - mulpd nb430nf_gbtsc(%esp),%xmm1 - movapd %xmm1,nb430nf_gbscale(%esp) - - mulsd nb430nf_iq(%esp),%xmm7 - mulsd %xmm7,%xmm6 - movapd %xmm6,nb430nf_qq(%esp) - - movl nb430nf_type(%ebp),%esi - movl (%esi,%eax,4),%edx - movl nb430nf_vdwparam(%ebp),%esi - shll %edx - movl nb430nf_ntia(%esp),%edi - addl %edi,%edx - - movlpd (%esi,%edx,8),%xmm6 ## c6a - movhpd 8(%esi,%edx,8),%xmm6 ## c6a c12a - - xorpd %xmm7,%xmm7 - movapd %xmm6,%xmm4 - unpcklpd %xmm7,%xmm4 - unpckhpd %xmm7,%xmm6 - - movapd %xmm4,nb430nf_c6(%esp) - movapd %xmm6,nb430nf_c12(%esp) - - movl nb430nf_pos(%ebp),%esi ## base of pos[] - - leal (%eax,%eax,2),%eax ## replace jnr with j3 - - ## move two coordinates to xmm0-xmm2 - movlpd (%esi,%eax,8),%xmm0 - movlpd 8(%esi,%eax,8),%xmm1 - movlpd 16(%esi,%eax,8),%xmm2 - - movl nb430nf_faction(%ebp),%edi - - ## move nb430nf_ix-iz to xmm4-xmm6 - movapd nb430nf_ix(%esp),%xmm4 - movapd nb430nf_iy(%esp),%xmm5 - movapd nb430nf_iz(%esp),%xmm6 - - ## calc dr - subsd %xmm0,%xmm4 - subsd %xmm1,%xmm5 - subsd %xmm2,%xmm6 - - ## square it - mulsd %xmm4,%xmm4 - mulsd %xmm5,%xmm5 - mulsd %xmm6,%xmm6 - addsd %xmm5,%xmm4 - addsd %xmm6,%xmm4 - ## rsq in xmm4 - - cvtsd2ss %xmm4,%xmm5 - rsqrtss %xmm5,%xmm5 - cvtss2sd %xmm5,%xmm2 ## lu in low xmm2 - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulsd %xmm2,%xmm2 ## lu*lu - movapd nb430nf_three(%esp),%xmm1 - mulsd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb430nf_half(%esp),%xmm0 - subsd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm1 - mulsd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - movapd %xmm1,%xmm5 ## copy of lu - mulsd %xmm1,%xmm1 ## lu*lu - movapd nb430nf_three(%esp),%xmm2 - mulsd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb430nf_half(%esp),%xmm0 - subsd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm2 - mulsd %xmm2,%xmm0 ## xmm0=iter2 of rinv (new lu) - mulsd %xmm0,%xmm4 ## xmm4=r - movsd %xmm4,nb430nf_r(%esp) - mulsd nb430nf_gbscale(%esp),%xmm4 - - cvttsd2si %xmm4,%edx ## mm6 = lu idx - cvtsi2sd %edx,%xmm5 - subsd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - movapd %xmm1,%xmm2 - mulsd %xmm2,%xmm2 ## xmm2=eps2 - - shll $2,%edx ## idx *= 4 - movl nb430nf_GBtab(%ebp),%esi - - ## Coulomb - movapd (%esi,%edx,8),%xmm4 ## Y1 F1 - xorpd %xmm3,%xmm3 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 - unpckhpd %xmm3,%xmm5 ## F1 - - movapd 16(%esi,%edx,8),%xmm6 ## G1 H1 - xorpd %xmm3,%xmm3 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 - unpckhpd %xmm3,%xmm7 ## H1 - ## coulomb table ready, in xmm4-xmm7 - mulsd %xmm1,%xmm6 ## xmm6=Geps - mulsd %xmm2,%xmm7 ## xmm7=Heps2 - addsd %xmm6,%xmm5 - addsd %xmm7,%xmm5 ## xmm5=Fp - movapd nb430nf_qq(%esp),%xmm3 - mulsd %xmm1,%xmm5 ## xmm5=eps*Fp - addsd %xmm4,%xmm5 ## xmm5=VV - mulsd %xmm3,%xmm5 ## vcoul=qq*VV - addsd nb430nf_vctot(%esp),%xmm5 - movsd %xmm5,nb430nf_vctot(%esp) - - movsd nb430nf_r(%esp),%xmm4 - mulsd nb430nf_tsc(%esp),%xmm4 - cvttsd2si %xmm4,%edx ## mm6 = lu idx - cvtsi2sd %edx,%xmm5 - subsd %xmm5,%xmm4 - movsd %xmm4,%xmm1 ## xmm1=eps - movsd %xmm1,%xmm2 - mulsd %xmm2,%xmm2 ## xmm2=eps2 - - shll $3,%edx - - movl nb430nf_VFtab(%ebp),%esi - - ## Dispersion - movapd (%esi,%edx,8),%xmm4 ## Y1 F1 - xorpd %xmm3,%xmm3 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 - unpckhpd %xmm3,%xmm5 ## F1 - - movapd 16(%esi,%edx,8),%xmm6 ## G1 H1 - xorpd %xmm3,%xmm3 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 - unpckhpd %xmm3,%xmm7 ## H1 - ## Dispersion table ready, in xmm4-xmm7 - mulsd %xmm1,%xmm6 ## xmm6=Geps - mulsd %xmm2,%xmm7 ## xmm7=Heps2 - addsd %xmm6,%xmm5 - addsd %xmm7,%xmm5 ## xmm5=Fp - mulsd %xmm1,%xmm5 ## xmm5=eps*Fp - addsd %xmm4,%xmm5 ## xmm5=VV - - mulsd nb430nf_c6(%esp),%xmm5 ## Vvdw6 - addsd nb430nf_Vvdwtot(%esp),%xmm5 - movlpd %xmm5,nb430nf_Vvdwtot(%esp) - - ## Repulsion - movapd 32(%esi,%edx,8),%xmm4 ## Y1 F1 - xorpd %xmm3,%xmm3 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 - unpckhpd %xmm3,%xmm5 ## F1 - - movapd 48(%esi,%edx,8),%xmm6 ## G1 H1 - xorpd %xmm3,%xmm3 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 - unpckhpd %xmm3,%xmm7 ## H1 - ## Dispersion table ready, in xmm4-xmm7 - mulsd %xmm1,%xmm6 ## xmm6=Geps - mulsd %xmm2,%xmm7 ## xmm7=Heps2 - addsd %xmm6,%xmm5 - addsd %xmm7,%xmm5 ## xmm5=Fp - mulsd %xmm1,%xmm5 ## xmm5=eps*Fp - addsd %xmm4,%xmm5 ## xmm5=VV - mulsd nb430nf_c12(%esp),%xmm5 ## Vvdw12 - addsd nb430nf_Vvdwtot(%esp),%xmm5 - movlpd %xmm5,nb430nf_Vvdwtot(%esp) -_nb_kernel430nf_ia32_sse2.nb430nf_updateouterdata: - ## get n from stack - movl nb430nf_n(%esp),%esi - ## get group index for i particle - movl nb430nf_gid(%ebp),%edx ## base of gid[] - movl (%edx,%esi,4),%edx ## ggid=gid[n] - - ## accumulate total potential energy and update it - movapd nb430nf_vctot(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addsd %xmm6,%xmm7 ## low xmm7 has the sum now - - ## add earlier value from mem - movl nb430nf_Vc(%ebp),%eax - addsd (%eax,%edx,8),%xmm7 - ## move back to mem - movsd %xmm7,(%eax,%edx,8) - - ## accumulate total lj energy and update it - movapd nb430nf_Vvdwtot(%esp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addsd %xmm6,%xmm7 ## low xmm7 has the sum now - - ## add earlier value from mem - movl nb430nf_Vvdw(%ebp),%eax - addsd (%eax,%edx,8),%xmm7 - ## move back to mem - movsd %xmm7,(%eax,%edx,8) - - ## finish if last - movl nb430nf_nn1(%esp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel430nf_ia32_sse2.nb430nf_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb430nf_n(%esp) - jmp _nb_kernel430nf_ia32_sse2.nb430nf_outer -_nb_kernel430nf_ia32_sse2.nb430nf_outerend: - ## check if more outer neighborlists remain - movl nb430nf_nri(%esp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel430nf_ia32_sse2.nb430nf_end - ## non-zero, do one more workunit - jmp _nb_kernel430nf_ia32_sse2.nb430nf_threadloop -_nb_kernel430nf_ia32_sse2.nb430nf_end: - emms - - movl nb430nf_nouter(%esp),%eax - movl nb430nf_ninner(%esp),%ebx - movl nb430nf_outeriter(%ebp),%ecx - movl nb430nf_inneriter(%ebp),%edx - movl %eax,(%ecx) - movl %ebx,(%edx) - - movl nb430nf_salign(%esp),%eax - addl %eax,%esp - addl $328,%esp - popl %edi - popl %esi - popl %edx - popl %ecx - popl %ebx - popl %eax - leave - ret - - diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/Makefile.am b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/Makefile.am index 04b13b177a..c5704df8e3 100644 --- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/Makefile.am +++ b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/Makefile.am @@ -62,29 +62,29 @@ libnb_kernel_x86_64_sse_la_SOURCES = \ EXTRA_DIST = \ - nb_kernel010_x86_64_sse.intel_syntax.s nb_kernel030_x86_64_sse.intel_syntax.s \ - nb_kernel100_x86_64_sse.intel_syntax.s nb_kernel101_x86_64_sse.intel_syntax.s \ - nb_kernel102_x86_64_sse.intel_syntax.s nb_kernel103_x86_64_sse.intel_syntax.s \ - nb_kernel104_x86_64_sse.intel_syntax.s nb_kernel110_x86_64_sse.intel_syntax.s \ - nb_kernel111_x86_64_sse.intel_syntax.s nb_kernel112_x86_64_sse.intel_syntax.s \ - nb_kernel113_x86_64_sse.intel_syntax.s nb_kernel114_x86_64_sse.intel_syntax.s \ - nb_kernel130_x86_64_sse.intel_syntax.s nb_kernel131_x86_64_sse.intel_syntax.s \ - nb_kernel132_x86_64_sse.intel_syntax.s nb_kernel133_x86_64_sse.intel_syntax.s \ - nb_kernel134_x86_64_sse.intel_syntax.s nb_kernel200_x86_64_sse.intel_syntax.s \ - nb_kernel201_x86_64_sse.intel_syntax.s nb_kernel202_x86_64_sse.intel_syntax.s \ - nb_kernel203_x86_64_sse.intel_syntax.s nb_kernel204_x86_64_sse.intel_syntax.s \ - nb_kernel210_x86_64_sse.intel_syntax.s nb_kernel211_x86_64_sse.intel_syntax.s \ - nb_kernel212_x86_64_sse.intel_syntax.s nb_kernel213_x86_64_sse.intel_syntax.s \ - nb_kernel214_x86_64_sse.intel_syntax.s nb_kernel230_x86_64_sse.intel_syntax.s \ - nb_kernel231_x86_64_sse.intel_syntax.s nb_kernel232_x86_64_sse.intel_syntax.s \ - nb_kernel233_x86_64_sse.intel_syntax.s nb_kernel234_x86_64_sse.intel_syntax.s \ - nb_kernel300_x86_64_sse.intel_syntax.s nb_kernel301_x86_64_sse.intel_syntax.s \ - nb_kernel302_x86_64_sse.intel_syntax.s nb_kernel303_x86_64_sse.intel_syntax.s \ - nb_kernel304_x86_64_sse.intel_syntax.s nb_kernel310_x86_64_sse.intel_syntax.s \ - nb_kernel311_x86_64_sse.intel_syntax.s nb_kernel312_x86_64_sse.intel_syntax.s \ - nb_kernel313_x86_64_sse.intel_syntax.s nb_kernel314_x86_64_sse.intel_syntax.s \ - nb_kernel330_x86_64_sse.intel_syntax.s nb_kernel331_x86_64_sse.intel_syntax.s \ - nb_kernel332_x86_64_sse.intel_syntax.s nb_kernel333_x86_64_sse.intel_syntax.s \ - nb_kernel334_x86_64_sse.intel_syntax.s nb_kernel400_x86_64_sse.intel_syntax.s \ - nb_kernel410_x86_64_sse.intel_syntax.s nb_kernel430_x86_64_sse.intel_syntax.s \ - nb_kernel_x86_64_sse_test_asm.intel_syntax.s + nb_kernel010_x86_64_sse_intel_syntax.s nb_kernel030_x86_64_sse_intel_syntax.s \ + nb_kernel100_x86_64_sse_intel_syntax.s nb_kernel101_x86_64_sse_intel_syntax.s \ + nb_kernel102_x86_64_sse_intel_syntax.s nb_kernel103_x86_64_sse_intel_syntax.s \ + nb_kernel104_x86_64_sse_intel_syntax.s nb_kernel110_x86_64_sse_intel_syntax.s \ + nb_kernel111_x86_64_sse_intel_syntax.s nb_kernel112_x86_64_sse_intel_syntax.s \ + nb_kernel113_x86_64_sse_intel_syntax.s nb_kernel114_x86_64_sse_intel_syntax.s \ + nb_kernel130_x86_64_sse_intel_syntax.s nb_kernel131_x86_64_sse_intel_syntax.s \ + nb_kernel132_x86_64_sse_intel_syntax.s nb_kernel133_x86_64_sse_intel_syntax.s \ + nb_kernel134_x86_64_sse_intel_syntax.s nb_kernel200_x86_64_sse_intel_syntax.s \ + nb_kernel201_x86_64_sse_intel_syntax.s nb_kernel202_x86_64_sse_intel_syntax.s \ + nb_kernel203_x86_64_sse_intel_syntax.s nb_kernel204_x86_64_sse_intel_syntax.s \ + nb_kernel210_x86_64_sse_intel_syntax.s nb_kernel211_x86_64_sse_intel_syntax.s \ + nb_kernel212_x86_64_sse_intel_syntax.s nb_kernel213_x86_64_sse_intel_syntax.s \ + nb_kernel214_x86_64_sse_intel_syntax.s nb_kernel230_x86_64_sse_intel_syntax.s \ + nb_kernel231_x86_64_sse_intel_syntax.s nb_kernel232_x86_64_sse_intel_syntax.s \ + nb_kernel233_x86_64_sse_intel_syntax.s nb_kernel234_x86_64_sse_intel_syntax.s \ + nb_kernel300_x86_64_sse_intel_syntax.s nb_kernel301_x86_64_sse_intel_syntax.s \ + nb_kernel302_x86_64_sse_intel_syntax.s nb_kernel303_x86_64_sse_intel_syntax.s \ + nb_kernel304_x86_64_sse_intel_syntax.s nb_kernel310_x86_64_sse_intel_syntax.s \ + nb_kernel311_x86_64_sse_intel_syntax.s nb_kernel312_x86_64_sse_intel_syntax.s \ + nb_kernel313_x86_64_sse_intel_syntax.s nb_kernel314_x86_64_sse_intel_syntax.s \ + nb_kernel330_x86_64_sse_intel_syntax.s nb_kernel331_x86_64_sse_intel_syntax.s \ + nb_kernel332_x86_64_sse_intel_syntax.s nb_kernel333_x86_64_sse_intel_syntax.s \ + nb_kernel334_x86_64_sse_intel_syntax.s nb_kernel400_x86_64_sse_intel_syntax.s \ + nb_kernel410_x86_64_sse_intel_syntax.s nb_kernel430_x86_64_sse_intel_syntax.s \ + nb_kernel_x86_64_sse_test_asm_intel_syntax.s diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel400_x86_64_sse.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel400_x86_64_sse.intel_syntax.s deleted file mode 100644 index 99680cf17e..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel400_x86_64_sse.intel_syntax.s +++ /dev/null @@ -1,1662 +0,0 @@ -;# -;# -;# Gromacs 4.0 Copyright (c) 1991-2003 -;# David van der Spoel, Erik Lindahl -;# -;# This program is free software; you can redistribute it and/or -;# modify it under the terms of the GNU General Public License -;# as published by the Free Software Foundation; either version 2 -;# of the License, or (at your option) any later version. -;# -;# To help us fund GROMACS development, we humbly ask that you cite -;# the research papers on the package. Check out http://www.gromacs.org -;# -;# And Hey: -;# Gnomes, ROck Monsters And Chili Sauce -;# - -;# These files require GNU binutils 2.10 or later, since we -;# use intel syntax for portability, or a recent version -;# of NASM that understands Extended 3DNow and SSE2 instructions. -;# (NASM is normally only used with MS Visual C++). -;# Since NASM and gnu as disagree on some definitions and use -;# completely different preprocessing options I have to introduce a -;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86. -;# Gnu as treats ';' as a line break, i.e. ignores it. This is the -;# reason why all comments need both symbols... -;# The source is written for GNU as, with intel syntax. When you use -;# NASM we redefine a couple of things. The false if-statement around -;# the following code is seen by GNU as, but NASM doesn't see it, so -;# the code inside is read by NASM but not gcc. - -; .if 0 # block below only read by NASM -%define .section section -%define .long dd -%define .align align -%define .globl global -;# NASM only wants 'dword', not 'dword ptr'. -%define ptr -%macro .equiv 2 - %1 equ %2 -%endmacro -; .endif # End of NASM-specific block -; .intel_syntax noprefix # Line only read by gnu as - - - - -.globl nb_kernel400_x86_64_sse -.globl _nb_kernel400_x86_64_sse -nb_kernel400_x86_64_sse: -_nb_kernel400_x86_64_sse: -;# Room for return address and rbp (16 bytes) -.equiv nb400_fshift, 16 -.equiv nb400_gid, 24 -.equiv nb400_pos, 32 -.equiv nb400_faction, 40 -.equiv nb400_charge, 48 -.equiv nb400_p_facel, 56 -.equiv nb400_argkrf, 64 -.equiv nb400_argcrf, 72 -.equiv nb400_Vc, 80 -.equiv nb400_type, 88 -.equiv nb400_p_ntype, 96 -.equiv nb400_vdwparam, 104 -.equiv nb400_Vvdw, 112 -.equiv nb400_p_tabscale, 120 -.equiv nb400_VFtab, 128 -.equiv nb400_invsqrta, 136 -.equiv nb400_dvda, 144 -.equiv nb400_p_gbtabscale, 152 -.equiv nb400_GBtab, 160 -.equiv nb400_p_nthreads, 168 -.equiv nb400_count, 176 -.equiv nb400_mtx, 184 -.equiv nb400_outeriter, 192 -.equiv nb400_inneriter, 200 -.equiv nb400_work, 208 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse use -.equiv nb400_ix, 0 -.equiv nb400_iy, 16 -.equiv nb400_iz, 32 -.equiv nb400_iq, 48 -.equiv nb400_dx, 64 -.equiv nb400_dy, 80 -.equiv nb400_dz, 96 -.equiv nb400_two, 112 -.equiv nb400_gbtsc, 128 -.equiv nb400_qq, 144 -.equiv nb400_r, 160 -.equiv nb400_vctot, 176 -.equiv nb400_fix, 192 -.equiv nb400_fiy, 208 -.equiv nb400_fiz, 224 -.equiv nb400_half, 240 -.equiv nb400_three, 256 -.equiv nb400_isai, 272 -.equiv nb400_isaprod, 288 -.equiv nb400_dvdasum, 304 -.equiv nb400_gbscale, 320 -.equiv nb400_nri, 336 -.equiv nb400_iinr, 344 -.equiv nb400_jindex, 352 -.equiv nb400_jjnr, 360 -.equiv nb400_shift, 368 -.equiv nb400_shiftvec, 376 -.equiv nb400_facel, 384 -.equiv nb400_innerjjnr, 392 -.equiv nb400_is3, 400 -.equiv nb400_ii3, 404 -.equiv nb400_ii, 408 -.equiv nb400_innerk, 412 -.equiv nb400_n, 416 -.equiv nb400_nn1, 420 -.equiv nb400_nouter, 424 -.equiv nb400_ninner, 428 -.equiv nb400_jnra, 432 -.equiv nb400_jnrb, 436 -.equiv nb400_jnrc, 440 -.equiv nb400_jnrd, 444 - - push rbp - mov rbp, rsp - push rbx - - - emms - - push r12 - push r13 - push r14 - push r15 - - sub rsp, 456 ;# local variable stack space (n*16+8) - - ;# zero 32-bit iteration counters - mov eax, 0 - mov [rsp + nb400_nouter], eax - mov [rsp + nb400_ninner], eax - - mov edi, [rdi] - mov [rsp + nb400_nri], edi - mov [rsp + nb400_iinr], rsi - mov [rsp + nb400_jindex], rdx - mov [rsp + nb400_jjnr], rcx - mov [rsp + nb400_shift], r8 - mov [rsp + nb400_shiftvec], r9 - mov rsi, [rbp + nb400_p_facel] - movss xmm0, [rsi] - movss [rsp + nb400_facel], xmm0 - - mov rbx, [rbp + nb400_p_gbtabscale] - movss xmm4, [rbx] - shufps xmm4, xmm4, 0 - movaps [rsp + nb400_gbtsc], xmm4 - - ;# create constant floating-point factors on stack - mov eax, 0x3f000000 ;# half in IEEE (hex) - mov [rsp + nb400_half], eax - movss xmm1, [rsp + nb400_half] - shufps xmm1, xmm1, 0 ;# splat to all elements - movaps xmm2, xmm1 - addps xmm2, xmm2 ;# one - movaps xmm3, xmm2 - addps xmm2, xmm2 ;# two - addps xmm3, xmm2 ;# three - movaps [rsp + nb400_half], xmm1 - movaps [rsp + nb400_two], xmm2 - movaps [rsp + nb400_three], xmm3 - -.nb400_threadloop: - mov rsi, [rbp + nb400_count] ;# pointer to sync counter - mov eax, [rsi] -.nb400_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb400_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [rsp + nb400_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [rsp + nb400_n], eax - mov [rsp + nb400_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb400_outerstart - jmp .nb400_end - -.nb400_outerstart: - ;# ebx contains number of outer iterations - add ebx, [rsp + nb400_nouter] - mov [rsp + nb400_nouter], ebx - -.nb400_outer: - mov rax, [rsp + nb400_shift] ;# rax = pointer into shift[] - mov ebx, [rax + rsi*4] ;# ebx=shift[n] - - lea rbx, [rbx + rbx*2] ;# rbx=3*is - mov [rsp + nb400_is3],ebx ;# store is3 - - mov rax, [rsp + nb400_shiftvec] ;# rax = base of shiftvec[] - - movss xmm0, [rax + rbx*4] - movss xmm1, [rax + rbx*4 + 4] - movss xmm2, [rax + rbx*4 + 8] - - mov rcx, [rsp + nb400_iinr] ;# rcx = pointer into iinr[] - mov ebx, [rcx + rsi*4] ;# ebx =ii - mov [rsp + nb400_ii], ebx - - mov rdx, [rbp + nb400_charge] - movss xmm3, [rdx + rbx*4] - mulss xmm3, [rsp + nb400_facel] - shufps xmm3, xmm3, 0 - - - mov rdx, [rbp + nb400_invsqrta] ;# load invsqrta[ii] - movss xmm4, [rdx + rbx*4] - shufps xmm4, xmm4, 0 - - lea rbx, [rbx + rbx*2] ;# rbx = 3*ii=ii3 - mov rax, [rbp + nb400_pos] ;# rax = base of pos[] - - addss xmm0, [rax + rbx*4] - addss xmm1, [rax + rbx*4 + 4] - addss xmm2, [rax + rbx*4 + 8] - - movaps [rsp + nb400_iq], xmm3 - movaps [rsp + nb400_isai], xmm4 - - shufps xmm0, xmm0, 0 - shufps xmm1, xmm1, 0 - shufps xmm2, xmm2, 0 - - movaps [rsp + nb400_ix], xmm0 - movaps [rsp + nb400_iy], xmm1 - movaps [rsp + nb400_iz], xmm2 - - mov [rsp + nb400_ii3], ebx - - ;# clear vctot and i forces - xorps xmm4, xmm4 - movaps [rsp + nb400_dvdasum], xmm4 - movaps xmm12, xmm4 - movaps xmm13, xmm4 - movaps xmm14, xmm4 - movaps xmm15, xmm4 - - mov rax, [rsp + nb400_jindex] - mov ecx, [rax + rsi*4] ;# jindex[n] - mov edx, [rax + rsi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov rsi, [rbp + nb400_pos] - mov rdi, [rbp + nb400_faction] - mov rax, [rsp + nb400_jjnr] - shl ecx, 2 - add rax, rcx - mov [rsp + nb400_innerjjnr], rax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 4 - add ecx, [rsp + nb400_ninner] - mov [rsp + nb400_ninner], ecx - add edx, 0 - mov [rsp + nb400_innerk], edx ;# number of innerloop atoms - jge .nb400_unroll_loop - jmp .nb400_finish_inner -.nb400_unroll_loop: - ;# quad-unroll innerloop here - mov rdx, [rsp + nb400_innerjjnr] ;# pointer to jjnr[k] - mov eax, [rdx] - mov ebx, [rdx + 4] - mov ecx, [rdx + 8] - mov edx, [rdx + 12] ;# eax-edx=jnr1-4 - - add qword ptr [rsp + nb400_innerjjnr], 16 ;# advance pointer (unrolled 4) - - mov rsi, [rbp + nb400_pos] ;# base of pos[] - - lea r8, [rax + rax*2] ;# j3 - lea r9, [rbx + rbx*2] - lea r10, [rcx + rcx*2] - lea r11, [rdx + rdx*2] - - ;# move four coordinates to xmm0-xmm2 - movlps xmm4, [rsi + r8*4] - movlps xmm5, [rsi + r10*4] - movss xmm2, [rsi + r8*4 + 8] - movss xmm6, [rsi + r10*4 + 8] - - movhps xmm4, [rsi + r9*4] - movhps xmm5, [rsi + r11*4] - - movss xmm0, [rsi + r9*4 + 8] - movss xmm1, [rsi + r11*4 + 8] - - shufps xmm2, xmm0, 0 - shufps xmm6, xmm1, 0 - - movaps xmm0, xmm4 - movaps xmm1, xmm4 - - shufps xmm2, xmm6, 136 ;# 10001000 - - shufps xmm0, xmm5, 136 ;# 10001000 - shufps xmm1, xmm5, 221 ;# 11011101 - - ;# calc dr - subps xmm0, [rsp + nb400_ix] - subps xmm1, [rsp + nb400_iy] - subps xmm2, [rsp + nb400_iz] - - ;# store dr - movaps xmm9, xmm0 - movaps xmm10, xmm1 - movaps xmm11, xmm2 - - ;# square it - mulps xmm0,xmm0 - mulps xmm1,xmm1 - mulps xmm2,xmm2 - addps xmm0, xmm1 - addps xmm0, xmm2 - movaps xmm4, xmm0 - ;# rsq in xmm4 - - ;# load isaj - mov rsi, [rbp + nb400_invsqrta] - movss xmm0, [rsi + rax*4] - movss xmm1, [rsi + rcx*4] - movss xmm2, [rsi + rbx*4] - movss xmm3, [rsi + rdx*4] - movaps xmm7, [rsp + nb400_isai] - shufps xmm0, xmm2, 0 - shufps xmm1, xmm3, 0 - shufps xmm0, xmm1, 136 ;# 10001000 ;# all isaj in xmm3 - mulps xmm7, xmm0 - - movaps [rsp + nb400_isaprod], xmm7 - movaps xmm1, xmm7 - mulps xmm1, [rsp + nb400_gbtsc] - movaps [rsp + nb400_gbscale], xmm1 - - mov rsi, [rbp + nb400_charge] ;# base of charge[] - - movss xmm0, [rsi + rax*4] - movss xmm1, [rsi + rcx*4] - movss xmm2, [rsi + rbx*4] - movss xmm3, [rsi + rdx*4] - - mulps xmm7, [rsp + nb400_iq] - shufps xmm0, xmm2, 0 - shufps xmm1, xmm3, 0 - shufps xmm0, xmm1, 136 ;# 10001000 ;# all charges in xmm3 - - mulps xmm0, xmm7 - movaps [rsp + nb400_qq], xmm0 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [rsp + nb400_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [rsp + nb400_half] - subps xmm1, xmm5 ;# 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - movaps [rsp + nb400_r], xmm4 - mulps xmm4, [rsp + nb400_gbscale] - - ;# truncate and convert to integers - cvttps2dq xmm5, xmm4 - - ;# convert back to float - cvtdq2ps xmm6, xmm5 - - ;# multiply by 4 - pslld xmm5, 2 - - ;# move to integer registers - movhlps xmm7, xmm5 - movd r12d, xmm5 - movd r14d, xmm7 - pshufd xmm5, xmm5, 1 - pshufd xmm7, xmm7, 1 - movd r13d, xmm5 - movd r15d, xmm7 - - ;# calculate eps - subps xmm4, xmm6 - movaps xmm1, xmm4 ;#eps - - mov rsi, [rbp + nb400_GBtab] - - ;# load table data - movlps xmm5, [rsi + r12*4] - movlps xmm7, [rsi + r14*4] - movhps xmm5, [rsi + r13*4] - movhps xmm7, [rsi + r15*4] - - movaps xmm4, xmm5 - shufps xmm4, xmm7, 136 ;# 10001000 - shufps xmm5, xmm7, 221 ;# 11011101 - - movlps xmm7, [rsi + r12*4 + 8] - movlps xmm8, [rsi + r14*4 + 8] - movhps xmm7, [rsi + r13*4 + 8] - movhps xmm8, [rsi + r15*4 + 8] - - movaps xmm6, xmm7 - - shufps xmm6, xmm8, 136 ;# 10001000 - shufps xmm7, xmm8, 221 ;# 11011101 - ;# table data ready in xmm4-xmm7 - - mulps xmm7, xmm1 ;# Heps - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm1 ;# Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - addps xmm7, xmm7 ;# two*Heps2 - movaps xmm3, [rsp + nb400_qq] - addps xmm7, xmm6 - addps xmm7, xmm5 ;# xmm7=FF - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, xmm3 ;# vcoul=qq*VV - mulps xmm3, xmm7 ;# fijC=FF*qq - ;# at this point xmm5 contains vcoul and xmm3 fijC - - mov rsi, [rbp + nb400_dvda] - - ;# Calculate dVda - xorps xmm7, xmm7 - mulps xmm3, [rsp + nb400_gbscale] - movaps xmm6, xmm3 - mulps xmm6, [rsp + nb400_r] - addps xmm6, xmm5 - - ;# increment vctot (sum in xmm12) - addps xmm12, xmm5 - - ;# xmm6=(vcoul+fijC*r) - subps xmm7, xmm6 - movaps xmm6, xmm7 - - ;# update dvdasum - addps xmm7, [rsp + nb400_dvdasum] - movaps [rsp + nb400_dvdasum], xmm7 - - ;# update j atoms dvdaj - movhlps xmm7, xmm6 - movaps xmm5, xmm6 - movaps xmm4, xmm7 - shufps xmm5, xmm5, 0x1 - shufps xmm4, xmm4, 0x1 - - ;# xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4 - addss xmm6, [rsi + rax*4] - addss xmm5, [rsi + rbx*4] - addss xmm7, [rsi + rcx*4] - addss xmm4, [rsi + rdx*4] - movss [rsi + rax*4], xmm6 - movss [rsi + rbx*4], xmm5 - movss [rsi + rcx*4], xmm7 - movss [rsi + rdx*4], xmm4 - - xorps xmm4, xmm4 - mulps xmm3, xmm0 - subps xmm4, xmm3 - - mov rsi, [rbp + nb400_faction] - ;# the fj's - start by accumulating x & y forces from memory - movlps xmm0, [rsi + r8*4] ;# x1 y1 - - - movlps xmm1, [rsi + r10*4] ;# x3 y3 - - - movhps xmm0, [rsi + r9*4] ;# x1 y1 x2 y2 - movhps xmm1, [rsi + r11*4] ;# x3 y3 x4 y4 - - mulps xmm9, xmm4 - mulps xmm10, xmm4 - mulps xmm11, xmm4 - - ;# accumulate i forces - addps xmm13, xmm9 - addps xmm14, xmm10 - addps xmm15, xmm11 - - movaps xmm8, xmm9 - unpcklps xmm9, xmm10 ;# x1 y1 x2 y2 - unpckhps xmm8, xmm10 ;# x3 y3 x4 y4 - - ;# update fjx and fjy - addps xmm0, xmm9 - addps xmm1, xmm8 - - movlps [rsi + r8*4], xmm0 - movlps [rsi + r10*4], xmm1 - movhps [rsi + r9*4], xmm0 - movhps [rsi + r11*4], xmm1 - - ;# xmm11: fjz1 fjz2 fjz3 fjz4 - pshufd xmm10, xmm11, 1 ;# fjz2 - - - - movhlps xmm9, xmm11 ;# fjz3 - - - - pshufd xmm8, xmm11, 3 ;# fjz4 - - - - - addss xmm11, [rsi + r8*4 + 8] - addss xmm10, [rsi + r9*4 + 8] - addss xmm9, [rsi + r10*4 + 8] - addss xmm8, [rsi + r11*4 + 8] - movss [rsi + r8*4 + 8], xmm11 - movss [rsi + r9*4 + 8], xmm10 - movss [rsi + r10*4 + 8], xmm9 - movss [rsi + r11*4 + 8], xmm8 - - ;# should we do one more iteration? - sub dword ptr [rsp + nb400_innerk], 4 - jl .nb400_finish_inner - jmp .nb400_unroll_loop -.nb400_finish_inner: - ;# check if at least two particles remain - add dword ptr [rsp + nb400_innerk], 4 - mov edx, [rsp + nb400_innerk] - and edx, 2 - jnz .nb400_dopair - jmp .nb400_checksingle -.nb400_dopair: - mov rcx, [rsp + nb400_innerjjnr] - - mov eax, [rcx] - mov ebx, [rcx + 4] - add qword ptr [rsp + nb400_innerjjnr], 8 - - ;# load isaj - mov rsi, [rbp + nb400_invsqrta] - movss xmm3, [rsi + rax*4] - movss xmm6, [rsi + rbx*4] - unpcklps xmm3, xmm6 - - movaps xmm2, [rsp + nb400_isai] - mulps xmm2, xmm3 - - movaps [rsp + nb400_isaprod], xmm2 - movaps xmm1, xmm2 - mulps xmm1, [rsp + nb400_gbtsc] - movaps [rsp + nb400_gbscale], xmm1 - - mov rsi, [rbp + nb400_charge] ;# base of charge[] - - mulps xmm2, [rsp + nb400_iq] - movss xmm3, [rsi + rax*4] - movss xmm6, [rsi + rbx*4] - unpcklps xmm3, xmm6 - - mulps xmm3, xmm2 - movaps [rsp + nb400_qq], xmm3 - - mov rsi, [rbp + nb400_pos] ;# base of pos[] - - lea r8, [rax + rax*2] ;# j3 - lea r9, [rbx + rbx*2] - - ;# move four coordinates to xmm0-xmm2 - movlps xmm4, [rsi + r8*4] ;# x1 y1 - - - movlps xmm5, [rsi + r9*4] ;# x2 y2 - - - - movss xmm6, [rsi + r8*4 + 8] ;# z1 - - - - movss xmm7, [rsi + r9*4 + 8] ;# z2 - - - - - unpcklps xmm4, xmm5 ;# x1 x2 y1 y2 - movhlps xmm5, xmm4 ;# y1 y2 - - - unpcklps xmm6, xmm7 ;# z1 z2 - - - - ;# calc dr - subps xmm4, [rsp + nb400_ix] - subps xmm5, [rsp + nb400_iy] - subps xmm6, [rsp + nb400_iz] - - ;# store dr - movaps xmm9, xmm4 - movaps xmm10, xmm5 - movaps xmm11, xmm6 - - ;# square it - mulps xmm4,xmm4 - mulps xmm5,xmm5 - mulps xmm6,xmm6 - addps xmm4, xmm5 - addps xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [rsp + nb400_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [rsp + nb400_half] - subps xmm1, xmm5 ;# 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - movaps [rsp + nb400_r], xmm4 - mulps xmm4, [rsp + nb400_gbscale] - - ;# truncate and convert to integers - cvttps2dq xmm5, xmm4 - - ;# convert back to float - cvtdq2ps xmm6, xmm5 - - ;# multiply by 4 - pslld xmm5, 2 - - ;# move to integer registers - movd r12d, xmm5 - pshufd xmm5, xmm5, 1 - movd r13d, xmm5 - - ;# calculate eps - subps xmm4, xmm6 - movaps xmm1, xmm4 ;#eps - - mov rsi, [rbp + nb400_GBtab] - - ;# load table data - movlps xmm4, [rsi + r12*4] - movlps xmm5, [rsi + r13*4] - unpcklps xmm4, xmm5 - movhlps xmm5, xmm4 - - movlps xmm6, [rsi + r12*4 + 8] - movlps xmm7, [rsi + r13*4 + 8] - unpcklps xmm6, xmm7 - movhlps xmm7, xmm6 - ;# table data ready in xmm4-xmm7 - - mulps xmm7, xmm1 ;# Heps - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm1 ;# Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - addps xmm7, xmm7 ;# two*Heps2 - movaps xmm3, [rsp + nb400_qq] - addps xmm7, xmm6 - addps xmm7, xmm5 ;# xmm7=FF - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, xmm3 ;# vcoul=qq*VV - mulps xmm3, xmm7 ;# fijC=FF*qq - ;# at this point xmm5 contains vcoul and xmm3 fijC - - ;# zero upper part of vcoul - xorps xmm2, xmm2 - movlhps xmm5, xmm2 - - mov rsi, [rbp + nb400_dvda] - - ;# Calculate dVda - xorps xmm7, xmm7 - mulps xmm3, [rsp + nb400_gbscale] - movaps xmm6, xmm3 - mulps xmm6, [rsp + nb400_r] - addps xmm6, xmm5 - - ;# increment vctot (sum in xmm12) - addps xmm12, xmm5 - - ;# xmm6=(vcoul+fijC*r) - subps xmm7, xmm6 - movaps xmm6, xmm7 - - ;# zero upper half of dvda - movlhps xmm7, xmm2 - - ;# update dvdasum - addps xmm7, [rsp + nb400_dvdasum] - movaps [rsp + nb400_dvdasum], xmm7 - - ;# update j atoms dvdaj - movaps xmm5, xmm6 - shufps xmm5, xmm5, 0x1 - - ;# xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4 - addss xmm6, [rsi + rax*4] - addss xmm5, [rsi + rbx*4] - movss [rsi + rax*4], xmm6 - movss [rsi + rbx*4], xmm5 - - xorps xmm4, xmm4 - mulps xmm3, xmm0 - subps xmm4, xmm3 - - mulps xmm9, xmm4 - mulps xmm10, xmm4 - mulps xmm11, xmm4 - - movlhps xmm9, xmm2 - movlhps xmm10, xmm2 - movlhps xmm11, xmm2 - - ;# accumulate i forces - addps xmm13, xmm9 - addps xmm14, xmm10 - addps xmm15, xmm11 - - mov rsi, [rbp + nb400_faction] - ;# the fj's - start by accumulating x & y forces from memory - movlps xmm0, [rsi + r8*4] ;# x1 y1 - - - movhps xmm0, [rsi + r9*4] ;# x1 y1 x2 y2 - - unpcklps xmm9, xmm10 ;# x1 y1 x2 y2 - addps xmm0, xmm9 - - movlps [rsi + r8*4], xmm0 - movhps [rsi + r9*4], xmm0 - - ;# z forces - pshufd xmm8, xmm11, 1 - addss xmm11, [rsi + r8*4 + 8] - addss xmm8, [rsi + r9*4 + 8] - movss [rsi + r8*4 + 8], xmm11 - movss [rsi + r9*4 + 8], xmm8 - -.nb400_checksingle: - mov edx, [rsp + nb400_innerk] - and edx, 1 - jnz .nb400_dosingle - jmp .nb400_updateouterdata -.nb400_dosingle: - mov rcx, [rsp + nb400_innerjjnr] - mov eax, [rcx] - - ;# load isaj - mov rsi, [rbp + nb400_invsqrta] - movss xmm2, [rsi + rax*4] - mulss xmm2, [rsp + nb400_isai] - movss [rsp + nb400_isaprod], xmm2 - movaps xmm1, xmm2 - mulss xmm1, [rsp + nb400_gbtsc] - movss [rsp + nb400_gbscale], xmm1 - - mov rsi, [rbp + nb400_charge] ;# base of charge[] - - mulss xmm2, [rsp + nb400_iq] - movss xmm3, [rsi + rax*4] - mulss xmm3, xmm2 - movss [rsp + nb400_qq], xmm3 - - mov rsi, [rbp + nb400_pos] ;# base of pos[] - - lea r8, [rax + rax*2] ;# j3=3*jnr - - ;# move four coordinates to xmm0-xmm2 - movss xmm4, [rsi + r8*4] - movss xmm5, [rsi + r8*4 + 4] - movss xmm6, [rsi + r8*4 + 8] - - ;# calc dr - subss xmm4, [rsp + nb400_ix] - subss xmm5, [rsp + nb400_iy] - subss xmm6, [rsp + nb400_iz] - - ;# store dr - movaps xmm9, xmm4 - movaps xmm10, xmm5 - movaps xmm11, xmm6 - - ;# square it - mulss xmm4,xmm4 - mulss xmm5,xmm5 - mulss xmm6,xmm6 - addss xmm4, xmm5 - addss xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtss xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulss xmm5, xmm5 - movaps xmm1, [rsp + nb400_three] - mulss xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [rsp + nb400_half] - subss xmm1, xmm5 ;# 30-rsq*lu*lu - mulss xmm1, xmm2 - mulss xmm0, xmm1 ;# xmm0=rinv - mulss xmm4, xmm0 ;# xmm4=r - movaps [rsp + nb400_r], xmm4 - mulss xmm4, [rsp + nb400_gbscale] - - ;# truncate and convert to integers - cvttss2si r12d, xmm4 - - ;# convert back to float - cvtsi2ss xmm6, r12d - - ;# multiply by 4 - shl r12d, 2 - - ;# calculate eps - subss xmm4, xmm6 - movaps xmm1, xmm4 ;#eps - - mov rsi, [rbp + nb400_GBtab] - - ;# load table data - movss xmm4, [rsi + r12*4] - movss xmm5, [rsi + r12*4 + 4] - movss xmm6, [rsi + r12*4 + 8] - movss xmm7, [rsi + r12*4 + 12] - ;# table data ready in xmm4-xmm7 - - mulss xmm7, xmm1 ;# Heps - mulss xmm6, xmm1 ;# xmm6=Geps - mulss xmm7, xmm1 ;# Heps2 - addss xmm5, xmm6 - addss xmm5, xmm7 ;# xmm5=Fp - addss xmm7, xmm7 ;# two*Heps2 - movss xmm3, [rsp + nb400_qq] - addss xmm7, xmm6 - addss xmm7, xmm5 ;# xmm7=FF - mulss xmm5, xmm1 ;# xmm5=eps*Fp - addss xmm5, xmm4 ;# xmm5=VV - mulss xmm5, xmm3 ;# vcoul=qq*VV - mulss xmm3, xmm7 ;# fijC=FF*qq - ;# at this point xmm5 contains vcoul and xmm3 fijC - - mov rsi, [rbp + nb400_dvda] - - ;# Calculate dVda - xorps xmm7, xmm7 - mulss xmm3, [rsp + nb400_gbscale] - movaps xmm6, xmm3 - mulss xmm6, [rsp + nb400_r] - addss xmm6, xmm5 - - ;# increment vctot (sum in xmm12) - addss xmm12, xmm5 - - ;# xmm6=(vcoul+fijC*r) - subss xmm7, xmm6 - movaps xmm6, xmm7 - - ;# update dvdasum - addss xmm7, [rsp + nb400_dvdasum] - movss [rsp + nb400_dvdasum], xmm7 - - ;# update j atoms dvdaj - addss xmm6, [rsi + rax*4] - movss [rsi + rax*4], xmm6 - - xorps xmm4, xmm4 - mulss xmm3, xmm0 - subss xmm4, xmm3 - - mulss xmm9, xmm4 - mulss xmm10, xmm4 - mulss xmm11, xmm4 - - ;# accumulate i forces - addss xmm13, xmm9 - addss xmm14, xmm10 - addss xmm15, xmm11 - - mov rsi, [rbp + nb400_faction] - ;# add to j forces - addss xmm9, [rsi + r8*4] - addss xmm10, [rsi + r8*4 + 4] - addss xmm11, [rsi + r8*4 + 8] - movss [rsi + r8*4], xmm9 - movss [rsi + r8*4 + 4], xmm10 - movss [rsi + r8*4 + 8], xmm11 - -.nb400_updateouterdata: - mov ecx, [rsp + nb400_ii3] - mov rdi, [rbp + nb400_faction] - mov rsi, [rbp + nb400_fshift] - mov edx, [rsp + nb400_is3] - - ;# accumulate i forces in xmm13, xmm14, xmm15 - movhlps xmm0, xmm13 - movhlps xmm1, xmm14 - movhlps xmm2, xmm15 - addps xmm0, xmm13 - addps xmm1, xmm14 - addps xmm2, xmm15 - movaps xmm3, xmm0 - movaps xmm4, xmm1 - movaps xmm5, xmm2 - shufps xmm3, xmm3, 1 - shufps xmm4, xmm4, 1 - shufps xmm5, xmm5, 1 - addss xmm0, xmm3 - addss xmm1, xmm4 - addss xmm2, xmm5 ;# xmm0-xmm2 has single force in pos0 - - ;# increment i force - movss xmm3, [rdi + rcx*4] - movss xmm4, [rdi + rcx*4 + 4] - movss xmm5, [rdi + rcx*4 + 8] - subss xmm3, xmm0 - subss xmm4, xmm1 - subss xmm5, xmm2 - movss [rdi + rcx*4], xmm3 - movss [rdi + rcx*4 + 4], xmm4 - movss [rdi + rcx*4 + 8], xmm5 - - ;# increment fshift force - movss xmm3, [rsi + rdx*4] - movss xmm4, [rsi + rdx*4 + 4] - movss xmm5, [rsi + rdx*4 + 8] - subss xmm3, xmm0 - subss xmm4, xmm1 - subss xmm5, xmm2 - movss [rsi + rdx*4], xmm3 - movss [rsi + rdx*4 + 4], xmm4 - movss [rsi + rdx*4 + 8], xmm5 - - ;# get n from stack - mov esi, [rsp + nb400_n] - ;# get group index for i particle - mov rdx, [rbp + nb400_gid] ;# base of gid[] - mov edx, [rdx + rsi*4] ;# ggid=gid[n] - - ;# accumulate total potential energy and update it - ;# accumulate - movhlps xmm6, xmm12 - addps xmm12, xmm6 ;# pos 0-1 in xmm12 have the sum now - movaps xmm6, xmm12 - shufps xmm6, xmm6, 1 - addss xmm12, xmm6 - - ;# add earlier value from mem - mov rax, [rbp + nb400_Vc] - addss xmm12, [rax + rdx*4] - ;# move back to mem - movss [rax + rdx*4], xmm12 - - ;# accumulate dVda and update it - movaps xmm7, [rsp + nb400_dvdasum] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - mov edx, [rsp + nb400_ii] - mov rax, [rbp + nb400_dvda] - addss xmm7, [rax + rdx*4] - movss [rax + rdx*4], xmm7 - - ;# finish if last - mov ecx, [rsp + nb400_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb400_outerend - - ;# not last, iterate outer loop once more! - mov [rsp + nb400_n], esi - jmp .nb400_outer -.nb400_outerend: - ;# check if more outer neighborlists remain - mov ecx, [rsp + nb400_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb400_end - ;# non-zero, do one more workunit - jmp .nb400_threadloop -.nb400_end: - - mov eax, [rsp + nb400_nouter] - mov ebx, [rsp + nb400_ninner] - mov rcx, [rbp + nb400_outeriter] - mov rdx, [rbp + nb400_inneriter] - mov [rcx], eax - mov [rdx], ebx - - add rsp, 456 - emms - - - pop r15 - pop r14 - pop r13 - pop r12 - - pop rbx - pop rbp - ret - - - - -.globl nb_kernel400nf_x86_64_sse -.globl _nb_kernel400nf_x86_64_sse -nb_kernel400nf_x86_64_sse: -_nb_kernel400nf_x86_64_sse: -.equiv nb400nf_fshift, 16 -.equiv nb400nf_gid, 24 -.equiv nb400nf_pos, 32 -.equiv nb400nf_faction, 40 -.equiv nb400nf_charge, 48 -.equiv nb400nf_p_facel, 56 -.equiv nb400nf_argkrf, 64 -.equiv nb400nf_argcrf, 72 -.equiv nb400nf_Vc, 80 -.equiv nb400nf_type, 88 -.equiv nb400nf_p_ntype, 96 -.equiv nb400nf_vdwparam, 104 -.equiv nb400nf_Vvdw, 112 -.equiv nb400nf_p_tabscale, 120 -.equiv nb400nf_VFtab, 128 -.equiv nb400nf_invsqrta, 136 -.equiv nb400nf_dvda, 144 -.equiv nb400nf_p_gbtabscale, 152 -.equiv nb400nf_GBtab, 160 -.equiv nb400nf_p_nthreads, 168 -.equiv nb400nf_count, 176 -.equiv nb400nf_mtx, 184 -.equiv nb400nf_outeriter, 192 -.equiv nb400nf_inneriter, 200 -.equiv nb400nf_work, 208 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse use -.equiv nb400nf_ix, 0 -.equiv nb400nf_iy, 16 -.equiv nb400nf_iz, 32 -.equiv nb400nf_iq, 48 -.equiv nb400nf_gbtsc, 64 -.equiv nb400nf_qq, 80 -.equiv nb400nf_vctot, 96 -.equiv nb400nf_half, 112 -.equiv nb400nf_three, 128 -.equiv nb400nf_isai, 144 -.equiv nb400nf_isaprod, 160 -.equiv nb400nf_gbscale, 176 -.equiv nb400nf_nri, 192 -.equiv nb400nf_iinr, 200 -.equiv nb400nf_jindex, 208 -.equiv nb400nf_jjnr, 216 -.equiv nb400nf_shift, 224 -.equiv nb400nf_shiftvec, 232 -.equiv nb400nf_facel, 240 -.equiv nb400nf_innerjjnr, 248 -.equiv nb400nf_is3, 256 -.equiv nb400nf_ii3, 260 -.equiv nb400nf_innerk, 264 -.equiv nb400nf_n, 268 -.equiv nb400nf_nn1, 272 -.equiv nb400nf_nouter, 276 -.equiv nb400nf_ninner, 280 - - - push rbp - mov rbp, rsp - push rbx - - - emms - - push r12 - push r13 - push r14 - push r15 - - sub rsp, 296 ;# local variable stack space (n*16+8) - - ;# zero 32-bit iteration counters - mov eax, 0 - mov [rsp + nb400nf_nouter], eax - mov [rsp + nb400nf_ninner], eax - - mov edi, [rdi] - mov [rsp + nb400nf_nri], edi - mov [rsp + nb400nf_iinr], rsi - mov [rsp + nb400nf_jindex], rdx - mov [rsp + nb400nf_jjnr], rcx - mov [rsp + nb400nf_shift], r8 - mov [rsp + nb400nf_shiftvec], r9 - mov rsi, [rbp + nb400nf_p_facel] - movss xmm0, [rsi] - movss [rsp + nb400nf_facel], xmm0 - - mov rbx, [rbp + nb400nf_p_gbtabscale] - movss xmm4, [rbx] - shufps xmm4, xmm4, 0 - movaps [rsp + nb400nf_gbtsc], xmm4 - - - - ;# create constant floating-point factors on stack - mov eax, 0x3f000000 ;# half in IEEE (hex) - mov [rsp + nb400nf_half], eax - movss xmm1, [rsp + nb400nf_half] - shufps xmm1, xmm1, 0 ;# splat to all elements - movaps xmm2, xmm1 - addps xmm2, xmm2 ;# one - movaps xmm3, xmm2 - addps xmm2, xmm2 ;# two - addps xmm3, xmm2 ;# three - movaps [rsp + nb400nf_half], xmm1 - movaps [rsp + nb400nf_three], xmm3 - -.nb400nf_threadloop: - mov rsi, [rbp + nb400nf_count] ;# pointer to sync counter - mov eax, [rsi] -.nb400nf_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb400nf_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [rsp + nb400nf_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [rsp + nb400nf_n], eax - mov [rsp + nb400nf_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb400nf_outerstart - jmp .nb400nf_end - -.nb400nf_outerstart: - ;# ebx contains number of outer iterations - add ebx, [rsp + nb400nf_nouter] - mov [rsp + nb400nf_nouter], ebx - -.nb400nf_outer: - mov rax, [rsp + nb400nf_shift] ;# rax = pointer into shift[] - mov ebx, [rax + rsi*4] ;# ebx=shift[n] - - lea rbx, [rbx + rbx*2] ;# rbx=3*is - mov [rsp + nb400nf_is3],ebx ;# store is3 - - mov rax, [rsp + nb400nf_shiftvec] ;# rax = base of shiftvec[] - - movss xmm0, [rax + rbx*4] - movss xmm1, [rax + rbx*4 + 4] - movss xmm2, [rax + rbx*4 + 8] - - mov rcx, [rsp + nb400nf_iinr] ;# rcx = pointer into iinr[] - mov ebx, [rcx + rsi*4] ;# ebx =ii - - mov rdx, [rbp + nb400nf_charge] - movss xmm3, [rdx + rbx*4] - mulss xmm3, [rsp + nb400nf_facel] - shufps xmm3, xmm3, 0 - - mov rdx, [rbp + nb400nf_invsqrta] ;# load invsqrta[ii] - movss xmm4, [rdx + rbx*4] - shufps xmm4, xmm4, 0 - - lea rbx, [rbx + rbx*2] ;# rbx = 3*ii=ii3 - mov rax, [rbp + nb400nf_pos] ;# rax = base of pos[] - - addss xmm0, [rax + rbx*4] - addss xmm1, [rax + rbx*4 + 4] - addss xmm2, [rax + rbx*4 + 8] - - movaps [rsp + nb400nf_iq], xmm3 - movaps [rsp + nb400nf_isai], xmm4 - - shufps xmm0, xmm0, 0 - shufps xmm1, xmm1, 0 - shufps xmm2, xmm2, 0 - - movaps [rsp + nb400nf_ix], xmm0 - movaps [rsp + nb400nf_iy], xmm1 - movaps [rsp + nb400nf_iz], xmm2 - - mov [rsp + nb400nf_ii3], ebx - - ;# clear vctot - xorps xmm4, xmm4 - movaps [rsp + nb400nf_vctot], xmm4 - - mov rax, [rsp + nb400nf_jindex] - mov ecx, [rax + rsi*4] ;# jindex[n] - mov edx, [rax + rsi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov rsi, [rbp + nb400nf_pos] - mov rdi, [rbp + nb400nf_faction] - mov rax, [rsp + nb400nf_jjnr] - shl ecx, 2 - add rax, rcx - mov [rsp + nb400nf_innerjjnr], rax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 4 - add ecx, [rsp + nb400nf_ninner] - mov [rsp + nb400nf_ninner], ecx - add edx, 0 - mov [rsp + nb400nf_innerk], edx ;# number of innerloop atoms - jge .nb400nf_unroll_loop - jmp .nb400nf_finish_inner -.nb400nf_unroll_loop: - ;# quad-unroll innerloop here - mov rdx, [rsp + nb400nf_innerjjnr] ;# pointer to jjnr[k] - mov eax, [rdx] - mov ebx, [rdx + 4] - mov ecx, [rdx + 8] - mov edx, [rdx + 12] ;# eax-edx=jnr1-4 - add qword ptr [rsp + nb400nf_innerjjnr], 16 ;# advance pointer (unrolled 4) - - ;# load isa2 - mov rsi, [rbp + nb400nf_invsqrta] - movss xmm3, [rsi + rax*4] - movss xmm4, [rsi + rcx*4] - movss xmm6, [rsi + rbx*4] - movss xmm7, [rsi + rdx*4] - movaps xmm2, [rsp + nb400nf_isai] - shufps xmm3, xmm6, 0 - shufps xmm4, xmm7, 0 - shufps xmm3, xmm4, 136 ;# 10001000 ;# all charges in xmm3 - mulps xmm2, xmm3 - - movaps [rsp + nb400nf_isaprod], xmm2 - movaps xmm1, xmm2 - mulps xmm1, [rsp + nb400nf_gbtsc] - movaps [rsp + nb400nf_gbscale], xmm1 - - mov rsi, [rbp + nb400nf_charge] ;# base of charge[] - - movss xmm3, [rsi + rax*4] - movss xmm4, [rsi + rcx*4] - movss xmm6, [rsi + rbx*4] - movss xmm7, [rsi + rdx*4] - - mulps xmm2, [rsp + nb400nf_iq] - shufps xmm3, xmm6, 0 - shufps xmm4, xmm7, 0 - shufps xmm3, xmm4, 136 ;# 10001000 ;# all charges in xmm3 - mulps xmm3, xmm2 - movaps [rsp + nb400nf_qq], xmm3 - - - mov rsi, [rbp + nb400nf_pos] ;# base of pos[] - - lea rax, [rax + rax*2] ;# replace jnr with j3 - lea rbx, [rbx + rbx*2] - - lea rcx, [rcx + rcx*2] ;# replace jnr with j3 - lea rdx, [rdx + rdx*2] - - ;# move four coordinates to xmm0-xmm2 - - movlps xmm4, [rsi + rax*4] - movlps xmm5, [rsi + rcx*4] - movss xmm2, [rsi + rax*4 + 8] - movss xmm6, [rsi + rcx*4 + 8] - - movhps xmm4, [rsi + rbx*4] - movhps xmm5, [rsi + rdx*4] - - movss xmm0, [rsi + rbx*4 + 8] - movss xmm1, [rsi + rdx*4 + 8] - - shufps xmm2, xmm0, 0 - shufps xmm6, xmm1, 0 - - movaps xmm0, xmm4 - movaps xmm1, xmm4 - - shufps xmm2, xmm6, 136 ;# 10001000 - - shufps xmm0, xmm5, 136 ;# 10001000 - shufps xmm1, xmm5, 221 ;# 11011101 - - ;# move ix-iz to xmm4-xmm6 - movaps xmm4, [rsp + nb400nf_ix] - movaps xmm5, [rsp + nb400nf_iy] - movaps xmm6, [rsp + nb400nf_iz] - - ;# calc dr - subps xmm4, xmm0 - subps xmm5, xmm1 - subps xmm6, xmm2 - - ;# square it - mulps xmm4,xmm4 - mulps xmm5,xmm5 - mulps xmm6,xmm6 - addps xmm4, xmm5 - addps xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [rsp + nb400nf_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [rsp + nb400nf_half] - subps xmm1, xmm5 ;# 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - mulps xmm4, [rsp + nb400nf_gbscale] - - movhlps xmm5, xmm4 - cvttps2pi mm6, xmm4 - cvttps2pi mm7, xmm5 ;# mm6/mm7 contain lu indices - cvtpi2ps xmm6, mm6 - cvtpi2ps xmm5, mm7 - movlhps xmm6, xmm5 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - pslld mm6, 2 - pslld mm7, 2 - - movd mm0, eax - movd mm1, ebx - movd mm2, ecx - movd mm3, edx - - mov rsi, [rbp + nb400nf_GBtab] - movd eax, mm6 - psrlq mm6, 32 - movd ecx, mm7 - psrlq mm7, 32 - movd ebx, mm6 - movd edx, mm7 - - ;# load coulomb table - movaps xmm4, [rsi + rax*4] - movaps xmm5, [rsi + rbx*4] - movaps xmm6, [rsi + rcx*4] - movaps xmm7, [rsi + rdx*4] - ;# transpose, using xmm3 for scratch - movaps xmm3, xmm6 - shufps xmm3, xmm7, 0xEE - shufps xmm6, xmm7, 0x44 - movaps xmm7, xmm4 - shufps xmm7, xmm5, 0xEE - shufps xmm4, xmm5, 0x44 - movaps xmm5, xmm4 - shufps xmm5, xmm6, 0xDD - shufps xmm4, xmm6, 0x88 - movaps xmm6, xmm7 - shufps xmm6, xmm3, 0x88 - shufps xmm7, xmm3, 0xDD - ;# coulomb table ready, in xmm4-xmm7 - - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - movaps xmm3, [rsp + nb400nf_qq] - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, xmm3 ;# vcoul=qq*VV - addps xmm5, [rsp + nb400nf_vctot] - movaps [rsp + nb400nf_vctot], xmm5 - - ;# should we do one more iteration? - sub dword ptr [rsp + nb400nf_innerk], 4 - jl .nb400nf_finish_inner - jmp .nb400nf_unroll_loop -.nb400nf_finish_inner: - ;# check if at least two particles remain - add dword ptr [rsp + nb400nf_innerk], 4 - mov edx, [rsp + nb400nf_innerk] - and edx, 2 - jnz .nb400nf_dopair - jmp .nb400nf_checksingle -.nb400nf_dopair: - mov rcx, [rsp + nb400nf_innerjjnr] - - mov eax, [rcx] - mov ebx, [rcx + 4] - add qword ptr [rsp + nb400nf_innerjjnr], 8 - - xorps xmm2, xmm2 - movaps xmm6, xmm2 - - ;# load isa2 - mov rsi, [rbp + nb400nf_invsqrta] - movss xmm2, [rsi + rax*4] - movss xmm3, [rsi + rbx*4] - unpcklps xmm2, xmm3 ;# isa2 in xmm3(0,1) - mulps xmm2, [rsp + nb400nf_isai] - movaps [rsp + nb400nf_isaprod], xmm2 - movaps xmm1, xmm2 - mulps xmm1, [rsp + nb400nf_gbtsc] - movaps [rsp + nb400nf_gbscale], xmm1 - - mov rsi, [rbp + nb400nf_charge] ;# base of charge[] - movss xmm3, [rsi + rax*4] - movss xmm6, [rsi + rbx*4] - unpcklps xmm3, xmm6 ;# 00001000 ;# xmm3(0,1) has the charges - - mulps xmm2, [rsp + nb400nf_iq] - mulps xmm3, xmm2 - movaps [rsp + nb400nf_qq], xmm3 - - mov rdi, [rbp + nb400nf_pos] - - lea rax, [rax + rax*2] - lea rbx, [rbx + rbx*2] - ;# move coordinates to xmm0-xmm2 - movlps xmm1, [rdi + rax*4] - movss xmm2, [rdi + rax*4 + 8] - movhps xmm1, [rdi + rbx*4] - movss xmm0, [rdi + rbx*4 + 8] - - movlhps xmm3, xmm7 - - shufps xmm2, xmm0, 0 - - movaps xmm0, xmm1 - - shufps xmm2, xmm2, 136 ;# 10001000 - - shufps xmm0, xmm0, 136 ;# 10001000 - shufps xmm1, xmm1, 221 ;# 11011101 - - mov rdi, [rbp + nb400nf_faction] - ;# move ix-iz to xmm4-xmm6 - xorps xmm7, xmm7 - - movaps xmm4, [rsp + nb400nf_ix] - movaps xmm5, [rsp + nb400nf_iy] - movaps xmm6, [rsp + nb400nf_iz] - - ;# calc dr - subps xmm4, xmm0 - subps xmm5, xmm1 - subps xmm6, xmm2 - - ;# square it - mulps xmm4,xmm4 - mulps xmm5,xmm5 - mulps xmm6,xmm6 - addps xmm4, xmm5 - addps xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [rsp + nb400nf_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [rsp + nb400nf_half] - subps xmm1, xmm5 ;# 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - mulps xmm4, [rsp + nb400nf_gbscale] - - cvttps2pi mm6, xmm4 ;# mm6 contain lu indices - cvtpi2ps xmm6, mm6 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - - pslld mm6, 2 - - mov rsi, [rbp + nb400nf_GBtab] - movd ecx, mm6 - psrlq mm6, 32 - movd edx, mm6 - - ;# load coulomb table - movaps xmm4, [rsi + rcx*4] - movaps xmm7, [rsi + rdx*4] - ;# transpose, using xmm3 for scratch - movaps xmm6, xmm4 - unpcklps xmm4, xmm7 ;# Y1 Y2 F1 F2 - unpckhps xmm6, xmm7 ;# G1 G2 H1 H2 - movhlps xmm5, xmm4 ;# F1 F2 - movhlps xmm7, xmm6 ;# H1 H2 - ;# coulomb table ready, in xmm4-xmm7 - - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - movaps xmm3, [rsp + nb400nf_qq] - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, xmm3 ;# vcoul=qq*VV - addps xmm5, [rsp + nb400nf_vctot] - movaps [rsp + nb400nf_vctot], xmm5 - -.nb400nf_checksingle: - mov edx, [rsp + nb400nf_innerk] - and edx, 1 - jnz .nb400nf_dosingle - jmp .nb400nf_updateouterdata -.nb400nf_dosingle: - mov rsi, [rbp + nb400nf_charge] - mov rdx, [rbp + nb400nf_invsqrta] - mov rdi, [rbp + nb400nf_pos] - mov rcx, [rsp + nb400nf_innerjjnr] - mov eax, [rcx] - xorps xmm2, xmm2 - movaps xmm6, xmm2 - movss xmm2, [rdx + rax*4] ;# isa2 - mulss xmm2, [rsp + nb400nf_isai] - movss [rsp + nb400nf_isaprod], xmm2 - movss xmm1, xmm2 - mulss xmm1, [rsp + nb400nf_gbtsc] - movss [rsp + nb400nf_gbscale], xmm1 - - mulss xmm2, [rsp + nb400nf_iq] - movss xmm6, [rsi + rax*4] ;# xmm6(0) has the charge - mulss xmm6, xmm2 - movss [rsp + nb400nf_qq], xmm6 - - lea rax, [rax + rax*2] - - ;# move coordinates to xmm0-xmm2 - movss xmm0, [rdi + rax*4] - movss xmm1, [rdi + rax*4 + 4] - movss xmm2, [rdi + rax*4 + 8] - - movss xmm4, [rsp + nb400nf_ix] - movss xmm5, [rsp + nb400nf_iy] - movss xmm6, [rsp + nb400nf_iz] - - ;# calc dr - subss xmm4, xmm0 - subss xmm5, xmm1 - subss xmm6, xmm2 - - ;# square it - mulss xmm4,xmm4 - mulss xmm5,xmm5 - mulss xmm6,xmm6 - addss xmm4, xmm5 - addss xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtss xmm5, xmm4 - ;# lookup seed in xmm5 - movss xmm2, xmm5 - mulss xmm5, xmm5 - movss xmm1, [rsp + nb400nf_three] - mulss xmm5, xmm4 ;# rsq*lu*lu - movss xmm0, [rsp + nb400nf_half] - subss xmm1, xmm5 ;# 30-rsq*lu*lu - mulss xmm1, xmm2 - mulss xmm0, xmm1 ;# xmm0=rinv - - mulss xmm4, xmm0 ;# xmm4=r - mulss xmm4, [rsp + nb400nf_gbscale] - - cvttss2si ebx, xmm4 ;# mm6 contain lu indices - cvtsi2ss xmm6, ebx - subss xmm4, xmm6 - movss xmm1, xmm4 ;# xmm1=eps - movss xmm2, xmm1 - mulss xmm2, xmm2 ;# xmm2=eps2 - - shl ebx, 2 - - mov rsi, [rbp + nb400nf_GBtab] - - movaps xmm4, [rsi + rbx*4] - movhlps xmm6, xmm4 - movaps xmm5, xmm4 - movaps xmm7, xmm6 - shufps xmm5, xmm5, 1 - shufps xmm7, xmm7, 1 - ;# table ready in xmm4-xmm7 - - mulss xmm6, xmm1 ;# xmm6=Geps - mulss xmm7, xmm2 ;# xmm7=Heps2 - addss xmm5, xmm6 - addss xmm5, xmm7 ;# xmm5=Fp - movss xmm3, [rsp + nb400nf_qq] - mulss xmm5, xmm1 ;# xmm5=eps*Fp - addss xmm5, xmm4 ;# xmm5=VV - mulss xmm5, xmm3 ;# vcoul=qq*VV - addss xmm5, [rsp + nb400nf_vctot] - movss [rsp + nb400nf_vctot], xmm5 -.nb400nf_updateouterdata: - ;# get n from stack - mov esi, [rsp + nb400nf_n] - ;# get group index for i particle - mov rdx, [rbp + nb400nf_gid] ;# base of gid[] - mov edx, [rdx + rsi*4] ;# ggid=gid[n] - - ;# accumulate total potential energy and update it - movaps xmm7, [rsp + nb400nf_vctot] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - ;# add earlier value from mem - mov rax, [rbp + nb400nf_Vc] - addss xmm7, [rax + rdx*4] - ;# move back to mem - movss [rax + rdx*4], xmm7 - - ;# finish if last - mov ecx, [rsp + nb400nf_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb400nf_outerend - - ;# not last, iterate outer loop once more! - mov [rsp + nb400nf_n], esi - jmp .nb400nf_outer -.nb400nf_outerend: - ;# check if more outer neighborlists remain - mov ecx, [rsp + nb400nf_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb400nf_end - ;# non-zero, do one more workunit - jmp .nb400nf_threadloop -.nb400nf_end: - - mov eax, [rsp + nb400nf_nouter] - mov ebx, [rsp + nb400nf_ninner] - mov rcx, [rbp + nb400nf_outeriter] - mov rdx, [rbp + nb400nf_inneriter] - mov [rcx], eax - mov [rdx], ebx - - add rsp, 296 - emms - - - pop r15 - pop r14 - pop r13 - pop r12 - - pop rbx - pop rbp - ret - diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel400_x86_64_sse.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel400_x86_64_sse.s deleted file mode 100644 index 6b8062d5d1..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel400_x86_64_sse.s +++ /dev/null @@ -1,1638 +0,0 @@ -## -## -## Gromacs 4.0 Copyright (c) 1991-2003 -## David van der Spoel, Erik Lindahl -## -## This program is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License -## as published by the Free Software Foundation; either version 2 -## of the License, or (at your option) any later version. -## -## To help us fund GROMACS development, we humbly ask that you cite -## the research papers on the package. Check out http://www.gromacs.org -## -## And Hey: -## Gnomes, ROck Monsters And Chili Sauce -## - - - - - - -.globl nb_kernel400_x86_64_sse -.globl _nb_kernel400_x86_64_sse -nb_kernel400_x86_64_sse: -_nb_kernel400_x86_64_sse: -## Room for return address and rbp (16 bytes) -.set nb400_fshift, 16 -.set nb400_gid, 24 -.set nb400_pos, 32 -.set nb400_faction, 40 -.set nb400_charge, 48 -.set nb400_p_facel, 56 -.set nb400_argkrf, 64 -.set nb400_argcrf, 72 -.set nb400_Vc, 80 -.set nb400_type, 88 -.set nb400_p_ntype, 96 -.set nb400_vdwparam, 104 -.set nb400_Vvdw, 112 -.set nb400_p_tabscale, 120 -.set nb400_VFtab, 128 -.set nb400_invsqrta, 136 -.set nb400_dvda, 144 -.set nb400_p_gbtabscale, 152 -.set nb400_GBtab, 160 -.set nb400_p_nthreads, 168 -.set nb400_count, 176 -.set nb400_mtx, 184 -.set nb400_outeriter, 192 -.set nb400_inneriter, 200 -.set nb400_work, 208 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse use -.set nb400_ix, 0 -.set nb400_iy, 16 -.set nb400_iz, 32 -.set nb400_iq, 48 -.set nb400_dx, 64 -.set nb400_dy, 80 -.set nb400_dz, 96 -.set nb400_two, 112 -.set nb400_gbtsc, 128 -.set nb400_qq, 144 -.set nb400_r, 160 -.set nb400_vctot, 176 -.set nb400_fix, 192 -.set nb400_fiy, 208 -.set nb400_fiz, 224 -.set nb400_half, 240 -.set nb400_three, 256 -.set nb400_isai, 272 -.set nb400_isaprod, 288 -.set nb400_dvdasum, 304 -.set nb400_gbscale, 320 -.set nb400_nri, 336 -.set nb400_iinr, 344 -.set nb400_jindex, 352 -.set nb400_jjnr, 360 -.set nb400_shift, 368 -.set nb400_shiftvec, 376 -.set nb400_facel, 384 -.set nb400_innerjjnr, 392 -.set nb400_is3, 400 -.set nb400_ii3, 404 -.set nb400_ii, 408 -.set nb400_innerk, 412 -.set nb400_n, 416 -.set nb400_nn1, 420 -.set nb400_nouter, 424 -.set nb400_ninner, 428 -.set nb400_jnra, 432 -.set nb400_jnrb, 436 -.set nb400_jnrc, 440 -.set nb400_jnrd, 444 - - push %rbp - movq %rsp,%rbp - push %rbx - - - emms - - push %r12 - push %r13 - push %r14 - push %r15 - - subq $456,%rsp ## local variable stack space (n*16+8) - - ## zero 32-bit iteration counters - movl $0,%eax - movl %eax,nb400_nouter(%rsp) - movl %eax,nb400_ninner(%rsp) - - movl (%rdi),%edi - movl %edi,nb400_nri(%rsp) - movq %rsi,nb400_iinr(%rsp) - movq %rdx,nb400_jindex(%rsp) - movq %rcx,nb400_jjnr(%rsp) - movq %r8,nb400_shift(%rsp) - movq %r9,nb400_shiftvec(%rsp) - movq nb400_p_facel(%rbp),%rsi - movss (%rsi),%xmm0 - movss %xmm0,nb400_facel(%rsp) - - movq nb400_p_gbtabscale(%rbp),%rbx - movss (%rbx),%xmm4 - shufps $0,%xmm4,%xmm4 - movaps %xmm4,nb400_gbtsc(%rsp) - - ## create constant floating-point factors on stack - movl $0x3f000000,%eax ## half in IEEE (hex) - movl %eax,nb400_half(%rsp) - movss nb400_half(%rsp),%xmm1 - shufps $0,%xmm1,%xmm1 ## splat to all elements - movaps %xmm1,%xmm2 - addps %xmm2,%xmm2 ## one - movaps %xmm2,%xmm3 - addps %xmm2,%xmm2 ## two - addps %xmm2,%xmm3 ## three - movaps %xmm1,nb400_half(%rsp) - movaps %xmm2,nb400_two(%rsp) - movaps %xmm3,nb400_three(%rsp) - -_nb_kernel400_x86_64_sse.nb400_threadloop: - movq nb400_count(%rbp),%rsi ## pointer to sync counter - movl (%rsi),%eax -_nb_kernel400_x86_64_sse.nb400_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%rsi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel400_x86_64_sse.nb400_spinlock - - ## if(nn1>nri) nn1=nri - movl nb400_nri(%rsp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb400_n(%rsp) - movl %ebx,nb400_nn1(%rsp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel400_x86_64_sse.nb400_outerstart - jmp _nb_kernel400_x86_64_sse.nb400_end - -_nb_kernel400_x86_64_sse.nb400_outerstart: - ## ebx contains number of outer iterations - addl nb400_nouter(%rsp),%ebx - movl %ebx,nb400_nouter(%rsp) - -_nb_kernel400_x86_64_sse.nb400_outer: - movq nb400_shift(%rsp),%rax ## rax = pointer into shift[] - movl (%rax,%rsi,4),%ebx ## ebx=shift[n] - - lea (%rbx,%rbx,2),%rbx ## rbx=3*is - movl %ebx,nb400_is3(%rsp) ## store is3 - - movq nb400_shiftvec(%rsp),%rax ## rax = base of shiftvec[] - - movss (%rax,%rbx,4),%xmm0 - movss 4(%rax,%rbx,4),%xmm1 - movss 8(%rax,%rbx,4),%xmm2 - - movq nb400_iinr(%rsp),%rcx ## rcx = pointer into iinr[] - movl (%rcx,%rsi,4),%ebx ## ebx =ii - movl %ebx,nb400_ii(%rsp) - - movq nb400_charge(%rbp),%rdx - movss (%rdx,%rbx,4),%xmm3 - mulss nb400_facel(%rsp),%xmm3 - shufps $0,%xmm3,%xmm3 - - - movq nb400_invsqrta(%rbp),%rdx ## load invsqrta[ii] - movss (%rdx,%rbx,4),%xmm4 - shufps $0,%xmm4,%xmm4 - - lea (%rbx,%rbx,2),%rbx ## rbx = 3*ii=ii3 - movq nb400_pos(%rbp),%rax ## rax = base of pos[] - - addss (%rax,%rbx,4),%xmm0 - addss 4(%rax,%rbx,4),%xmm1 - addss 8(%rax,%rbx,4),%xmm2 - - movaps %xmm3,nb400_iq(%rsp) - movaps %xmm4,nb400_isai(%rsp) - - shufps $0,%xmm0,%xmm0 - shufps $0,%xmm1,%xmm1 - shufps $0,%xmm2,%xmm2 - - movaps %xmm0,nb400_ix(%rsp) - movaps %xmm1,nb400_iy(%rsp) - movaps %xmm2,nb400_iz(%rsp) - - movl %ebx,nb400_ii3(%rsp) - - ## clear vctot and i forces - xorps %xmm4,%xmm4 - movaps %xmm4,nb400_dvdasum(%rsp) - movaps %xmm4,%xmm12 - movaps %xmm4,%xmm13 - movaps %xmm4,%xmm14 - movaps %xmm4,%xmm15 - - movq nb400_jindex(%rsp),%rax - movl (%rax,%rsi,4),%ecx ## jindex[n] - movl 4(%rax,%rsi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movq nb400_pos(%rbp),%rsi - movq nb400_faction(%rbp),%rdi - movq nb400_jjnr(%rsp),%rax - shll $2,%ecx - addq %rcx,%rax - movq %rax,nb400_innerjjnr(%rsp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $4,%edx - addl nb400_ninner(%rsp),%ecx - movl %ecx,nb400_ninner(%rsp) - addl $0,%edx - movl %edx,nb400_innerk(%rsp) ## number of innerloop atoms - jge _nb_kernel400_x86_64_sse.nb400_unroll_loop - jmp _nb_kernel400_x86_64_sse.nb400_finish_inner -_nb_kernel400_x86_64_sse.nb400_unroll_loop: - ## quad-unroll innerloop here - movq nb400_innerjjnr(%rsp),%rdx ## pointer to jjnr[k] - movl (%rdx),%eax - movl 4(%rdx),%ebx - movl 8(%rdx),%ecx - movl 12(%rdx),%edx ## eax-edx=jnr1-4 - - addq $16,nb400_innerjjnr(%rsp) ## advance pointer (unrolled 4) - - movq nb400_pos(%rbp),%rsi ## base of pos[] - - lea (%rax,%rax,2),%r8 ## j3 - lea (%rbx,%rbx,2),%r9 - lea (%rcx,%rcx,2),%r10 - lea (%rdx,%rdx,2),%r11 - - ## move four coordinates to xmm0-xmm2 - movlps (%rsi,%r8,4),%xmm4 - movlps (%rsi,%r10,4),%xmm5 - movss 8(%rsi,%r8,4),%xmm2 - movss 8(%rsi,%r10,4),%xmm6 - - movhps (%rsi,%r9,4),%xmm4 - movhps (%rsi,%r11,4),%xmm5 - - movss 8(%rsi,%r9,4),%xmm0 - movss 8(%rsi,%r11,4),%xmm1 - - shufps $0,%xmm0,%xmm2 - shufps $0,%xmm1,%xmm6 - - movaps %xmm4,%xmm0 - movaps %xmm4,%xmm1 - - shufps $136,%xmm6,%xmm2 ## 10001000 - - shufps $136,%xmm5,%xmm0 ## 10001000 - shufps $221,%xmm5,%xmm1 ## 11011101 - - ## calc dr - subps nb400_ix(%rsp),%xmm0 - subps nb400_iy(%rsp),%xmm1 - subps nb400_iz(%rsp),%xmm2 - - ## store dr - movaps %xmm0,%xmm9 - movaps %xmm1,%xmm10 - movaps %xmm2,%xmm11 - - ## square it - mulps %xmm0,%xmm0 - mulps %xmm1,%xmm1 - mulps %xmm2,%xmm2 - addps %xmm1,%xmm0 - addps %xmm2,%xmm0 - movaps %xmm0,%xmm4 - ## rsq in xmm4 - - ## load isaj - movq nb400_invsqrta(%rbp),%rsi - movss (%rsi,%rax,4),%xmm0 - movss (%rsi,%rcx,4),%xmm1 - movss (%rsi,%rbx,4),%xmm2 - movss (%rsi,%rdx,4),%xmm3 - movaps nb400_isai(%rsp),%xmm7 - shufps $0,%xmm2,%xmm0 - shufps $0,%xmm3,%xmm1 - shufps $136,%xmm1,%xmm0 ## 10001000 ;# all isaj in xmm3 - mulps %xmm0,%xmm7 - - movaps %xmm7,nb400_isaprod(%rsp) - movaps %xmm7,%xmm1 - mulps nb400_gbtsc(%rsp),%xmm1 - movaps %xmm1,nb400_gbscale(%rsp) - - movq nb400_charge(%rbp),%rsi ## base of charge[] - - movss (%rsi,%rax,4),%xmm0 - movss (%rsi,%rcx,4),%xmm1 - movss (%rsi,%rbx,4),%xmm2 - movss (%rsi,%rdx,4),%xmm3 - - mulps nb400_iq(%rsp),%xmm7 - shufps $0,%xmm2,%xmm0 - shufps $0,%xmm3,%xmm1 - shufps $136,%xmm1,%xmm0 ## 10001000 ;# all charges in xmm3 - - mulps %xmm7,%xmm0 - movaps %xmm0,nb400_qq(%rsp) - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb400_three(%rsp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb400_half(%rsp),%xmm0 - subps %xmm5,%xmm1 ## 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - movaps %xmm4,nb400_r(%rsp) - mulps nb400_gbscale(%rsp),%xmm4 - - ## truncate and convert to integers - cvttps2dq %xmm4,%xmm5 - - ## convert back to float - cvtdq2ps %xmm5,%xmm6 - - ## multiply by 4 - pslld $2,%xmm5 - - ## move to integer registers - movhlps %xmm5,%xmm7 - movd %xmm5,%r12d - movd %xmm7,%r14d - pshufd $1,%xmm5,%xmm5 - pshufd $1,%xmm7,%xmm7 - movd %xmm5,%r13d - movd %xmm7,%r15d - - ## calculate eps - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ##eps - - movq nb400_GBtab(%rbp),%rsi - - ## load table data - movlps (%rsi,%r12,4),%xmm5 - movlps (%rsi,%r14,4),%xmm7 - movhps (%rsi,%r13,4),%xmm5 - movhps (%rsi,%r15,4),%xmm7 - - movaps %xmm5,%xmm4 - shufps $136,%xmm7,%xmm4 ## 10001000 - shufps $221,%xmm7,%xmm5 ## 11011101 - - movlps 8(%rsi,%r12,4),%xmm7 - movlps 8(%rsi,%r14,4),%xmm8 - movhps 8(%rsi,%r13,4),%xmm7 - movhps 8(%rsi,%r15,4),%xmm8 - - movaps %xmm7,%xmm6 - - shufps $136,%xmm8,%xmm6 ## 10001000 - shufps $221,%xmm8,%xmm7 ## 11011101 - ## table data ready in xmm4-xmm7 - - mulps %xmm1,%xmm7 ## Heps - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm1,%xmm7 ## Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - addps %xmm7,%xmm7 ## two*Heps2 - movaps nb400_qq(%rsp),%xmm3 - addps %xmm6,%xmm7 - addps %xmm5,%xmm7 ## xmm7=FF - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps %xmm3,%xmm5 ## vcoul=qq*VV - mulps %xmm7,%xmm3 ## fijC=FF*qq - ## at this point xmm5 contains vcoul and xmm3 fijC - - movq nb400_dvda(%rbp),%rsi - - ## Calculate dVda - xorps %xmm7,%xmm7 - mulps nb400_gbscale(%rsp),%xmm3 - movaps %xmm3,%xmm6 - mulps nb400_r(%rsp),%xmm6 - addps %xmm5,%xmm6 - - ## increment vctot (sum in xmm12) - addps %xmm5,%xmm12 - - ## xmm6=(vcoul+fijC*r) - subps %xmm6,%xmm7 - movaps %xmm7,%xmm6 - - ## update dvdasum - addps nb400_dvdasum(%rsp),%xmm7 - movaps %xmm7,nb400_dvdasum(%rsp) - - ## update j atoms dvdaj - movhlps %xmm6,%xmm7 - movaps %xmm6,%xmm5 - movaps %xmm7,%xmm4 - shufps $0x1,%xmm5,%xmm5 - shufps $0x1,%xmm4,%xmm4 - - ## xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4 - addss (%rsi,%rax,4),%xmm6 - addss (%rsi,%rbx,4),%xmm5 - addss (%rsi,%rcx,4),%xmm7 - addss (%rsi,%rdx,4),%xmm4 - movss %xmm6,(%rsi,%rax,4) - movss %xmm5,(%rsi,%rbx,4) - movss %xmm7,(%rsi,%rcx,4) - movss %xmm4,(%rsi,%rdx,4) - - xorps %xmm4,%xmm4 - mulps %xmm0,%xmm3 - subps %xmm3,%xmm4 - - movq nb400_faction(%rbp),%rsi - ## the fj's - start by accumulating x & y forces from memory - movlps (%rsi,%r8,4),%xmm0 ## x1 y1 - - - movlps (%rsi,%r10,4),%xmm1 ## x3 y3 - - - movhps (%rsi,%r9,4),%xmm0 ## x1 y1 x2 y2 - movhps (%rsi,%r11,4),%xmm1 ## x3 y3 x4 y4 - - mulps %xmm4,%xmm9 - mulps %xmm4,%xmm10 - mulps %xmm4,%xmm11 - - ## accumulate i forces - addps %xmm9,%xmm13 - addps %xmm10,%xmm14 - addps %xmm11,%xmm15 - - movaps %xmm9,%xmm8 - unpcklps %xmm10,%xmm9 ## x1 y1 x2 y2 - unpckhps %xmm10,%xmm8 ## x3 y3 x4 y4 - - ## update fjx and fjy - addps %xmm9,%xmm0 - addps %xmm8,%xmm1 - - movlps %xmm0,(%rsi,%r8,4) - movlps %xmm1,(%rsi,%r10,4) - movhps %xmm0,(%rsi,%r9,4) - movhps %xmm1,(%rsi,%r11,4) - - ## xmm11: fjz1 fjz2 fjz3 fjz4 - pshufd $1,%xmm11,%xmm10 ## fjz2 - - - - movhlps %xmm11,%xmm9 ## fjz3 - - - - pshufd $3,%xmm11,%xmm8 ## fjz4 - - - - - addss 8(%rsi,%r8,4),%xmm11 - addss 8(%rsi,%r9,4),%xmm10 - addss 8(%rsi,%r10,4),%xmm9 - addss 8(%rsi,%r11,4),%xmm8 - movss %xmm11,8(%rsi,%r8,4) - movss %xmm10,8(%rsi,%r9,4) - movss %xmm9,8(%rsi,%r10,4) - movss %xmm8,8(%rsi,%r11,4) - - ## should we do one more iteration? - subl $4,nb400_innerk(%rsp) - jl _nb_kernel400_x86_64_sse.nb400_finish_inner - jmp _nb_kernel400_x86_64_sse.nb400_unroll_loop -_nb_kernel400_x86_64_sse.nb400_finish_inner: - ## check if at least two particles remain - addl $4,nb400_innerk(%rsp) - movl nb400_innerk(%rsp),%edx - andl $2,%edx - jnz _nb_kernel400_x86_64_sse.nb400_dopair - jmp _nb_kernel400_x86_64_sse.nb400_checksingle -_nb_kernel400_x86_64_sse.nb400_dopair: - movq nb400_innerjjnr(%rsp),%rcx - - movl (%rcx),%eax - movl 4(%rcx),%ebx - addq $8,nb400_innerjjnr(%rsp) - - ## load isaj - movq nb400_invsqrta(%rbp),%rsi - movss (%rsi,%rax,4),%xmm3 - movss (%rsi,%rbx,4),%xmm6 - unpcklps %xmm6,%xmm3 - - movaps nb400_isai(%rsp),%xmm2 - mulps %xmm3,%xmm2 - - movaps %xmm2,nb400_isaprod(%rsp) - movaps %xmm2,%xmm1 - mulps nb400_gbtsc(%rsp),%xmm1 - movaps %xmm1,nb400_gbscale(%rsp) - - movq nb400_charge(%rbp),%rsi ## base of charge[] - - mulps nb400_iq(%rsp),%xmm2 - movss (%rsi,%rax,4),%xmm3 - movss (%rsi,%rbx,4),%xmm6 - unpcklps %xmm6,%xmm3 - - mulps %xmm2,%xmm3 - movaps %xmm3,nb400_qq(%rsp) - - movq nb400_pos(%rbp),%rsi ## base of pos[] - - lea (%rax,%rax,2),%r8 ## j3 - lea (%rbx,%rbx,2),%r9 - - ## move four coordinates to xmm0-xmm2 - movlps (%rsi,%r8,4),%xmm4 ## x1 y1 - - - movlps (%rsi,%r9,4),%xmm5 ## x2 y2 - - - - movss 8(%rsi,%r8,4),%xmm6 ## z1 - - - - movss 8(%rsi,%r9,4),%xmm7 ## z2 - - - - - unpcklps %xmm5,%xmm4 ## x1 x2 y1 y2 - movhlps %xmm4,%xmm5 ## y1 y2 - - - unpcklps %xmm7,%xmm6 ## z1 z2 - - - - ## calc dr - subps nb400_ix(%rsp),%xmm4 - subps nb400_iy(%rsp),%xmm5 - subps nb400_iz(%rsp),%xmm6 - - ## store dr - movaps %xmm4,%xmm9 - movaps %xmm5,%xmm10 - movaps %xmm6,%xmm11 - - ## square it - mulps %xmm4,%xmm4 - mulps %xmm5,%xmm5 - mulps %xmm6,%xmm6 - addps %xmm5,%xmm4 - addps %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb400_three(%rsp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb400_half(%rsp),%xmm0 - subps %xmm5,%xmm1 ## 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - movaps %xmm4,nb400_r(%rsp) - mulps nb400_gbscale(%rsp),%xmm4 - - ## truncate and convert to integers - cvttps2dq %xmm4,%xmm5 - - ## convert back to float - cvtdq2ps %xmm5,%xmm6 - - ## multiply by 4 - pslld $2,%xmm5 - - ## move to integer registers - movd %xmm5,%r12d - pshufd $1,%xmm5,%xmm5 - movd %xmm5,%r13d - - ## calculate eps - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ##eps - - movq nb400_GBtab(%rbp),%rsi - - ## load table data - movlps (%rsi,%r12,4),%xmm4 - movlps (%rsi,%r13,4),%xmm5 - unpcklps %xmm5,%xmm4 - movhlps %xmm4,%xmm5 - - movlps 8(%rsi,%r12,4),%xmm6 - movlps 8(%rsi,%r13,4),%xmm7 - unpcklps %xmm7,%xmm6 - movhlps %xmm6,%xmm7 - ## table data ready in xmm4-xmm7 - - mulps %xmm1,%xmm7 ## Heps - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm1,%xmm7 ## Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - addps %xmm7,%xmm7 ## two*Heps2 - movaps nb400_qq(%rsp),%xmm3 - addps %xmm6,%xmm7 - addps %xmm5,%xmm7 ## xmm7=FF - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps %xmm3,%xmm5 ## vcoul=qq*VV - mulps %xmm7,%xmm3 ## fijC=FF*qq - ## at this point xmm5 contains vcoul and xmm3 fijC - - ## zero upper part of vcoul - xorps %xmm2,%xmm2 - movlhps %xmm2,%xmm5 - - movq nb400_dvda(%rbp),%rsi - - ## Calculate dVda - xorps %xmm7,%xmm7 - mulps nb400_gbscale(%rsp),%xmm3 - movaps %xmm3,%xmm6 - mulps nb400_r(%rsp),%xmm6 - addps %xmm5,%xmm6 - - ## increment vctot (sum in xmm12) - addps %xmm5,%xmm12 - - ## xmm6=(vcoul+fijC*r) - subps %xmm6,%xmm7 - movaps %xmm7,%xmm6 - - ## zero upper half of dvda - movlhps %xmm2,%xmm7 - - ## update dvdasum - addps nb400_dvdasum(%rsp),%xmm7 - movaps %xmm7,nb400_dvdasum(%rsp) - - ## update j atoms dvdaj - movaps %xmm6,%xmm5 - shufps $0x1,%xmm5,%xmm5 - - ## xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4 - addss (%rsi,%rax,4),%xmm6 - addss (%rsi,%rbx,4),%xmm5 - movss %xmm6,(%rsi,%rax,4) - movss %xmm5,(%rsi,%rbx,4) - - xorps %xmm4,%xmm4 - mulps %xmm0,%xmm3 - subps %xmm3,%xmm4 - - mulps %xmm4,%xmm9 - mulps %xmm4,%xmm10 - mulps %xmm4,%xmm11 - - movlhps %xmm2,%xmm9 - movlhps %xmm2,%xmm10 - movlhps %xmm2,%xmm11 - - ## accumulate i forces - addps %xmm9,%xmm13 - addps %xmm10,%xmm14 - addps %xmm11,%xmm15 - - movq nb400_faction(%rbp),%rsi - ## the fj's - start by accumulating x & y forces from memory - movlps (%rsi,%r8,4),%xmm0 ## x1 y1 - - - movhps (%rsi,%r9,4),%xmm0 ## x1 y1 x2 y2 - - unpcklps %xmm10,%xmm9 ## x1 y1 x2 y2 - addps %xmm9,%xmm0 - - movlps %xmm0,(%rsi,%r8,4) - movhps %xmm0,(%rsi,%r9,4) - - ## z forces - pshufd $1,%xmm11,%xmm8 - addss 8(%rsi,%r8,4),%xmm11 - addss 8(%rsi,%r9,4),%xmm8 - movss %xmm11,8(%rsi,%r8,4) - movss %xmm8,8(%rsi,%r9,4) - -_nb_kernel400_x86_64_sse.nb400_checksingle: - movl nb400_innerk(%rsp),%edx - andl $1,%edx - jnz _nb_kernel400_x86_64_sse.nb400_dosingle - jmp _nb_kernel400_x86_64_sse.nb400_updateouterdata -_nb_kernel400_x86_64_sse.nb400_dosingle: - movq nb400_innerjjnr(%rsp),%rcx - movl (%rcx),%eax - - ## load isaj - movq nb400_invsqrta(%rbp),%rsi - movss (%rsi,%rax,4),%xmm2 - mulss nb400_isai(%rsp),%xmm2 - movss %xmm2,nb400_isaprod(%rsp) - movaps %xmm2,%xmm1 - mulss nb400_gbtsc(%rsp),%xmm1 - movss %xmm1,nb400_gbscale(%rsp) - - movq nb400_charge(%rbp),%rsi ## base of charge[] - - mulss nb400_iq(%rsp),%xmm2 - movss (%rsi,%rax,4),%xmm3 - mulss %xmm2,%xmm3 - movss %xmm3,nb400_qq(%rsp) - - movq nb400_pos(%rbp),%rsi ## base of pos[] - - lea (%rax,%rax,2),%r8 ## j3=3*jnr - - ## move four coordinates to xmm0-xmm2 - movss (%rsi,%r8,4),%xmm4 - movss 4(%rsi,%r8,4),%xmm5 - movss 8(%rsi,%r8,4),%xmm6 - - ## calc dr - subss nb400_ix(%rsp),%xmm4 - subss nb400_iy(%rsp),%xmm5 - subss nb400_iz(%rsp),%xmm6 - - ## store dr - movaps %xmm4,%xmm9 - movaps %xmm5,%xmm10 - movaps %xmm6,%xmm11 - - ## square it - mulss %xmm4,%xmm4 - mulss %xmm5,%xmm5 - mulss %xmm6,%xmm6 - addss %xmm5,%xmm4 - addss %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtss %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulss %xmm5,%xmm5 - movaps nb400_three(%rsp),%xmm1 - mulss %xmm4,%xmm5 ## rsq*lu*lu - movaps nb400_half(%rsp),%xmm0 - subss %xmm5,%xmm1 ## 30-rsq*lu*lu - mulss %xmm2,%xmm1 - mulss %xmm1,%xmm0 ## xmm0=rinv - mulss %xmm0,%xmm4 ## xmm4=r - movaps %xmm4,nb400_r(%rsp) - mulss nb400_gbscale(%rsp),%xmm4 - - ## truncate and convert to integers - cvttss2si %xmm4,%r12d - - ## convert back to float - cvtsi2ss %r12d,%xmm6 - - ## multiply by 4 - shll $2,%r12d - - ## calculate eps - subss %xmm6,%xmm4 - movaps %xmm4,%xmm1 ##eps - - movq nb400_GBtab(%rbp),%rsi - - ## load table data - movss (%rsi,%r12,4),%xmm4 - movss 4(%rsi,%r12,4),%xmm5 - movss 8(%rsi,%r12,4),%xmm6 - movss 12(%rsi,%r12,4),%xmm7 - ## table data ready in xmm4-xmm7 - - mulss %xmm1,%xmm7 ## Heps - mulss %xmm1,%xmm6 ## xmm6=Geps - mulss %xmm1,%xmm7 ## Heps2 - addss %xmm6,%xmm5 - addss %xmm7,%xmm5 ## xmm5=Fp - addss %xmm7,%xmm7 ## two*Heps2 - movss nb400_qq(%rsp),%xmm3 - addss %xmm6,%xmm7 - addss %xmm5,%xmm7 ## xmm7=FF - mulss %xmm1,%xmm5 ## xmm5=eps*Fp - addss %xmm4,%xmm5 ## xmm5=VV - mulss %xmm3,%xmm5 ## vcoul=qq*VV - mulss %xmm7,%xmm3 ## fijC=FF*qq - ## at this point xmm5 contains vcoul and xmm3 fijC - - movq nb400_dvda(%rbp),%rsi - - ## Calculate dVda - xorps %xmm7,%xmm7 - mulss nb400_gbscale(%rsp),%xmm3 - movaps %xmm3,%xmm6 - mulss nb400_r(%rsp),%xmm6 - addss %xmm5,%xmm6 - - ## increment vctot (sum in xmm12) - addss %xmm5,%xmm12 - - ## xmm6=(vcoul+fijC*r) - subss %xmm6,%xmm7 - movaps %xmm7,%xmm6 - - ## update dvdasum - addss nb400_dvdasum(%rsp),%xmm7 - movss %xmm7,nb400_dvdasum(%rsp) - - ## update j atoms dvdaj - addss (%rsi,%rax,4),%xmm6 - movss %xmm6,(%rsi,%rax,4) - - xorps %xmm4,%xmm4 - mulss %xmm0,%xmm3 - subss %xmm3,%xmm4 - - mulss %xmm4,%xmm9 - mulss %xmm4,%xmm10 - mulss %xmm4,%xmm11 - - ## accumulate i forces - addss %xmm9,%xmm13 - addss %xmm10,%xmm14 - addss %xmm11,%xmm15 - - movq nb400_faction(%rbp),%rsi - ## add to j forces - addss (%rsi,%r8,4),%xmm9 - addss 4(%rsi,%r8,4),%xmm10 - addss 8(%rsi,%r8,4),%xmm11 - movss %xmm9,(%rsi,%r8,4) - movss %xmm10,4(%rsi,%r8,4) - movss %xmm11,8(%rsi,%r8,4) - -_nb_kernel400_x86_64_sse.nb400_updateouterdata: - movl nb400_ii3(%rsp),%ecx - movq nb400_faction(%rbp),%rdi - movq nb400_fshift(%rbp),%rsi - movl nb400_is3(%rsp),%edx - - ## accumulate i forces in xmm13, xmm14, xmm15 - movhlps %xmm13,%xmm0 - movhlps %xmm14,%xmm1 - movhlps %xmm15,%xmm2 - addps %xmm13,%xmm0 - addps %xmm14,%xmm1 - addps %xmm15,%xmm2 - movaps %xmm0,%xmm3 - movaps %xmm1,%xmm4 - movaps %xmm2,%xmm5 - shufps $1,%xmm3,%xmm3 - shufps $1,%xmm4,%xmm4 - shufps $1,%xmm5,%xmm5 - addss %xmm3,%xmm0 - addss %xmm4,%xmm1 - addss %xmm5,%xmm2 ## xmm0-xmm2 has single force in pos0 - - ## increment i force - movss (%rdi,%rcx,4),%xmm3 - movss 4(%rdi,%rcx,4),%xmm4 - movss 8(%rdi,%rcx,4),%xmm5 - subss %xmm0,%xmm3 - subss %xmm1,%xmm4 - subss %xmm2,%xmm5 - movss %xmm3,(%rdi,%rcx,4) - movss %xmm4,4(%rdi,%rcx,4) - movss %xmm5,8(%rdi,%rcx,4) - - ## increment fshift force - movss (%rsi,%rdx,4),%xmm3 - movss 4(%rsi,%rdx,4),%xmm4 - movss 8(%rsi,%rdx,4),%xmm5 - subss %xmm0,%xmm3 - subss %xmm1,%xmm4 - subss %xmm2,%xmm5 - movss %xmm3,(%rsi,%rdx,4) - movss %xmm4,4(%rsi,%rdx,4) - movss %xmm5,8(%rsi,%rdx,4) - - ## get n from stack - movl nb400_n(%rsp),%esi - ## get group index for i particle - movq nb400_gid(%rbp),%rdx ## base of gid[] - movl (%rdx,%rsi,4),%edx ## ggid=gid[n] - - ## accumulate total potential energy and update it - ## accumulate - movhlps %xmm12,%xmm6 - addps %xmm6,%xmm12 ## pos 0-1 in xmm12 have the sum now - movaps %xmm12,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm12 - - ## add earlier value from mem - movq nb400_Vc(%rbp),%rax - addss (%rax,%rdx,4),%xmm12 - ## move back to mem - movss %xmm12,(%rax,%rdx,4) - - ## accumulate dVda and update it - movaps nb400_dvdasum(%rsp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - movl nb400_ii(%rsp),%edx - movq nb400_dvda(%rbp),%rax - addss (%rax,%rdx,4),%xmm7 - movss %xmm7,(%rax,%rdx,4) - - ## finish if last - movl nb400_nn1(%rsp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel400_x86_64_sse.nb400_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb400_n(%rsp) - jmp _nb_kernel400_x86_64_sse.nb400_outer -_nb_kernel400_x86_64_sse.nb400_outerend: - ## check if more outer neighborlists remain - movl nb400_nri(%rsp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel400_x86_64_sse.nb400_end - ## non-zero, do one more workunit - jmp _nb_kernel400_x86_64_sse.nb400_threadloop -_nb_kernel400_x86_64_sse.nb400_end: - - movl nb400_nouter(%rsp),%eax - movl nb400_ninner(%rsp),%ebx - movq nb400_outeriter(%rbp),%rcx - movq nb400_inneriter(%rbp),%rdx - movl %eax,(%rcx) - movl %ebx,(%rdx) - - addq $456,%rsp - emms - - - pop %r15 - pop %r14 - pop %r13 - pop %r12 - - pop %rbx - pop %rbp - ret - - - - -.globl nb_kernel400nf_x86_64_sse -.globl _nb_kernel400nf_x86_64_sse -nb_kernel400nf_x86_64_sse: -_nb_kernel400nf_x86_64_sse: -.set nb400nf_fshift, 16 -.set nb400nf_gid, 24 -.set nb400nf_pos, 32 -.set nb400nf_faction, 40 -.set nb400nf_charge, 48 -.set nb400nf_p_facel, 56 -.set nb400nf_argkrf, 64 -.set nb400nf_argcrf, 72 -.set nb400nf_Vc, 80 -.set nb400nf_type, 88 -.set nb400nf_p_ntype, 96 -.set nb400nf_vdwparam, 104 -.set nb400nf_Vvdw, 112 -.set nb400nf_p_tabscale, 120 -.set nb400nf_VFtab, 128 -.set nb400nf_invsqrta, 136 -.set nb400nf_dvda, 144 -.set nb400nf_p_gbtabscale, 152 -.set nb400nf_GBtab, 160 -.set nb400nf_p_nthreads, 168 -.set nb400nf_count, 176 -.set nb400nf_mtx, 184 -.set nb400nf_outeriter, 192 -.set nb400nf_inneriter, 200 -.set nb400nf_work, 208 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse use -.set nb400nf_ix, 0 -.set nb400nf_iy, 16 -.set nb400nf_iz, 32 -.set nb400nf_iq, 48 -.set nb400nf_gbtsc, 64 -.set nb400nf_qq, 80 -.set nb400nf_vctot, 96 -.set nb400nf_half, 112 -.set nb400nf_three, 128 -.set nb400nf_isai, 144 -.set nb400nf_isaprod, 160 -.set nb400nf_gbscale, 176 -.set nb400nf_nri, 192 -.set nb400nf_iinr, 200 -.set nb400nf_jindex, 208 -.set nb400nf_jjnr, 216 -.set nb400nf_shift, 224 -.set nb400nf_shiftvec, 232 -.set nb400nf_facel, 240 -.set nb400nf_innerjjnr, 248 -.set nb400nf_is3, 256 -.set nb400nf_ii3, 260 -.set nb400nf_innerk, 264 -.set nb400nf_n, 268 -.set nb400nf_nn1, 272 -.set nb400nf_nouter, 276 -.set nb400nf_ninner, 280 - - - push %rbp - movq %rsp,%rbp - push %rbx - - - emms - - push %r12 - push %r13 - push %r14 - push %r15 - - subq $296,%rsp ## local variable stack space (n*16+8) - - ## zero 32-bit iteration counters - movl $0,%eax - movl %eax,nb400nf_nouter(%rsp) - movl %eax,nb400nf_ninner(%rsp) - - movl (%rdi),%edi - movl %edi,nb400nf_nri(%rsp) - movq %rsi,nb400nf_iinr(%rsp) - movq %rdx,nb400nf_jindex(%rsp) - movq %rcx,nb400nf_jjnr(%rsp) - movq %r8,nb400nf_shift(%rsp) - movq %r9,nb400nf_shiftvec(%rsp) - movq nb400nf_p_facel(%rbp),%rsi - movss (%rsi),%xmm0 - movss %xmm0,nb400nf_facel(%rsp) - - movq nb400nf_p_gbtabscale(%rbp),%rbx - movss (%rbx),%xmm4 - shufps $0,%xmm4,%xmm4 - movaps %xmm4,nb400nf_gbtsc(%rsp) - - - - ## create constant floating-point factors on stack - movl $0x3f000000,%eax ## half in IEEE (hex) - movl %eax,nb400nf_half(%rsp) - movss nb400nf_half(%rsp),%xmm1 - shufps $0,%xmm1,%xmm1 ## splat to all elements - movaps %xmm1,%xmm2 - addps %xmm2,%xmm2 ## one - movaps %xmm2,%xmm3 - addps %xmm2,%xmm2 ## two - addps %xmm2,%xmm3 ## three - movaps %xmm1,nb400nf_half(%rsp) - movaps %xmm3,nb400nf_three(%rsp) - -_nb_kernel400nf_x86_64_sse.nb400nf_threadloop: - movq nb400nf_count(%rbp),%rsi ## pointer to sync counter - movl (%rsi),%eax -_nb_kernel400nf_x86_64_sse.nb400nf_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%rsi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel400nf_x86_64_sse.nb400nf_spinlock - - ## if(nn1>nri) nn1=nri - movl nb400nf_nri(%rsp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb400nf_n(%rsp) - movl %ebx,nb400nf_nn1(%rsp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel400nf_x86_64_sse.nb400nf_outerstart - jmp _nb_kernel400nf_x86_64_sse.nb400nf_end - -_nb_kernel400nf_x86_64_sse.nb400nf_outerstart: - ## ebx contains number of outer iterations - addl nb400nf_nouter(%rsp),%ebx - movl %ebx,nb400nf_nouter(%rsp) - -_nb_kernel400nf_x86_64_sse.nb400nf_outer: - movq nb400nf_shift(%rsp),%rax ## rax = pointer into shift[] - movl (%rax,%rsi,4),%ebx ## ebx=shift[n] - - lea (%rbx,%rbx,2),%rbx ## rbx=3*is - movl %ebx,nb400nf_is3(%rsp) ## store is3 - - movq nb400nf_shiftvec(%rsp),%rax ## rax = base of shiftvec[] - - movss (%rax,%rbx,4),%xmm0 - movss 4(%rax,%rbx,4),%xmm1 - movss 8(%rax,%rbx,4),%xmm2 - - movq nb400nf_iinr(%rsp),%rcx ## rcx = pointer into iinr[] - movl (%rcx,%rsi,4),%ebx ## ebx =ii - - movq nb400nf_charge(%rbp),%rdx - movss (%rdx,%rbx,4),%xmm3 - mulss nb400nf_facel(%rsp),%xmm3 - shufps $0,%xmm3,%xmm3 - - movq nb400nf_invsqrta(%rbp),%rdx ## load invsqrta[ii] - movss (%rdx,%rbx,4),%xmm4 - shufps $0,%xmm4,%xmm4 - - lea (%rbx,%rbx,2),%rbx ## rbx = 3*ii=ii3 - movq nb400nf_pos(%rbp),%rax ## rax = base of pos[] - - addss (%rax,%rbx,4),%xmm0 - addss 4(%rax,%rbx,4),%xmm1 - addss 8(%rax,%rbx,4),%xmm2 - - movaps %xmm3,nb400nf_iq(%rsp) - movaps %xmm4,nb400nf_isai(%rsp) - - shufps $0,%xmm0,%xmm0 - shufps $0,%xmm1,%xmm1 - shufps $0,%xmm2,%xmm2 - - movaps %xmm0,nb400nf_ix(%rsp) - movaps %xmm1,nb400nf_iy(%rsp) - movaps %xmm2,nb400nf_iz(%rsp) - - movl %ebx,nb400nf_ii3(%rsp) - - ## clear vctot - xorps %xmm4,%xmm4 - movaps %xmm4,nb400nf_vctot(%rsp) - - movq nb400nf_jindex(%rsp),%rax - movl (%rax,%rsi,4),%ecx ## jindex[n] - movl 4(%rax,%rsi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movq nb400nf_pos(%rbp),%rsi - movq nb400nf_faction(%rbp),%rdi - movq nb400nf_jjnr(%rsp),%rax - shll $2,%ecx - addq %rcx,%rax - movq %rax,nb400nf_innerjjnr(%rsp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $4,%edx - addl nb400nf_ninner(%rsp),%ecx - movl %ecx,nb400nf_ninner(%rsp) - addl $0,%edx - movl %edx,nb400nf_innerk(%rsp) ## number of innerloop atoms - jge _nb_kernel400nf_x86_64_sse.nb400nf_unroll_loop - jmp _nb_kernel400nf_x86_64_sse.nb400nf_finish_inner -_nb_kernel400nf_x86_64_sse.nb400nf_unroll_loop: - ## quad-unroll innerloop here - movq nb400nf_innerjjnr(%rsp),%rdx ## pointer to jjnr[k] - movl (%rdx),%eax - movl 4(%rdx),%ebx - movl 8(%rdx),%ecx - movl 12(%rdx),%edx ## eax-edx=jnr1-4 - addq $16,nb400nf_innerjjnr(%rsp) ## advance pointer (unrolled 4) - - ## load isa2 - movq nb400nf_invsqrta(%rbp),%rsi - movss (%rsi,%rax,4),%xmm3 - movss (%rsi,%rcx,4),%xmm4 - movss (%rsi,%rbx,4),%xmm6 - movss (%rsi,%rdx,4),%xmm7 - movaps nb400nf_isai(%rsp),%xmm2 - shufps $0,%xmm6,%xmm3 - shufps $0,%xmm7,%xmm4 - shufps $136,%xmm4,%xmm3 ## 10001000 ;# all charges in xmm3 - mulps %xmm3,%xmm2 - - movaps %xmm2,nb400nf_isaprod(%rsp) - movaps %xmm2,%xmm1 - mulps nb400nf_gbtsc(%rsp),%xmm1 - movaps %xmm1,nb400nf_gbscale(%rsp) - - movq nb400nf_charge(%rbp),%rsi ## base of charge[] - - movss (%rsi,%rax,4),%xmm3 - movss (%rsi,%rcx,4),%xmm4 - movss (%rsi,%rbx,4),%xmm6 - movss (%rsi,%rdx,4),%xmm7 - - mulps nb400nf_iq(%rsp),%xmm2 - shufps $0,%xmm6,%xmm3 - shufps $0,%xmm7,%xmm4 - shufps $136,%xmm4,%xmm3 ## 10001000 ;# all charges in xmm3 - mulps %xmm2,%xmm3 - movaps %xmm3,nb400nf_qq(%rsp) - - - movq nb400nf_pos(%rbp),%rsi ## base of pos[] - - lea (%rax,%rax,2),%rax ## replace jnr with j3 - lea (%rbx,%rbx,2),%rbx - - lea (%rcx,%rcx,2),%rcx ## replace jnr with j3 - lea (%rdx,%rdx,2),%rdx - - ## move four coordinates to xmm0-xmm2 - - movlps (%rsi,%rax,4),%xmm4 - movlps (%rsi,%rcx,4),%xmm5 - movss 8(%rsi,%rax,4),%xmm2 - movss 8(%rsi,%rcx,4),%xmm6 - - movhps (%rsi,%rbx,4),%xmm4 - movhps (%rsi,%rdx,4),%xmm5 - - movss 8(%rsi,%rbx,4),%xmm0 - movss 8(%rsi,%rdx,4),%xmm1 - - shufps $0,%xmm0,%xmm2 - shufps $0,%xmm1,%xmm6 - - movaps %xmm4,%xmm0 - movaps %xmm4,%xmm1 - - shufps $136,%xmm6,%xmm2 ## 10001000 - - shufps $136,%xmm5,%xmm0 ## 10001000 - shufps $221,%xmm5,%xmm1 ## 11011101 - - ## move ix-iz to xmm4-xmm6 - movaps nb400nf_ix(%rsp),%xmm4 - movaps nb400nf_iy(%rsp),%xmm5 - movaps nb400nf_iz(%rsp),%xmm6 - - ## calc dr - subps %xmm0,%xmm4 - subps %xmm1,%xmm5 - subps %xmm2,%xmm6 - - ## square it - mulps %xmm4,%xmm4 - mulps %xmm5,%xmm5 - mulps %xmm6,%xmm6 - addps %xmm5,%xmm4 - addps %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb400nf_three(%rsp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb400nf_half(%rsp),%xmm0 - subps %xmm5,%xmm1 ## 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - mulps nb400nf_gbscale(%rsp),%xmm4 - - movhlps %xmm4,%xmm5 - cvttps2pi %xmm4,%mm6 - cvttps2pi %xmm5,%mm7 ## mm6/mm7 contain lu indices - cvtpi2ps %mm6,%xmm6 - cvtpi2ps %mm7,%xmm5 - movlhps %xmm5,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - pslld $2,%mm6 - pslld $2,%mm7 - - movd %eax,%mm0 - movd %ebx,%mm1 - movd %ecx,%mm2 - movd %edx,%mm3 - - movq nb400nf_GBtab(%rbp),%rsi - movd %mm6,%eax - psrlq $32,%mm6 - movd %mm7,%ecx - psrlq $32,%mm7 - movd %mm6,%ebx - movd %mm7,%edx - - ## load coulomb table - movaps (%rsi,%rax,4),%xmm4 - movaps (%rsi,%rbx,4),%xmm5 - movaps (%rsi,%rcx,4),%xmm6 - movaps (%rsi,%rdx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm6,%xmm3 - shufps $0xEE,%xmm7,%xmm3 - shufps $0x44,%xmm7,%xmm6 - movaps %xmm4,%xmm7 - shufps $0xEE,%xmm5,%xmm7 - shufps $0x44,%xmm5,%xmm4 - movaps %xmm4,%xmm5 - shufps $0xDD,%xmm6,%xmm5 - shufps $0x88,%xmm6,%xmm4 - movaps %xmm7,%xmm6 - shufps $0x88,%xmm3,%xmm6 - shufps $0xDD,%xmm3,%xmm7 - ## coulomb table ready, in xmm4-xmm7 - - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - movaps nb400nf_qq(%rsp),%xmm3 - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps %xmm3,%xmm5 ## vcoul=qq*VV - addps nb400nf_vctot(%rsp),%xmm5 - movaps %xmm5,nb400nf_vctot(%rsp) - - ## should we do one more iteration? - subl $4,nb400nf_innerk(%rsp) - jl _nb_kernel400nf_x86_64_sse.nb400nf_finish_inner - jmp _nb_kernel400nf_x86_64_sse.nb400nf_unroll_loop -_nb_kernel400nf_x86_64_sse.nb400nf_finish_inner: - ## check if at least two particles remain - addl $4,nb400nf_innerk(%rsp) - movl nb400nf_innerk(%rsp),%edx - andl $2,%edx - jnz _nb_kernel400nf_x86_64_sse.nb400nf_dopair - jmp _nb_kernel400nf_x86_64_sse.nb400nf_checksingle -_nb_kernel400nf_x86_64_sse.nb400nf_dopair: - movq nb400nf_innerjjnr(%rsp),%rcx - - movl (%rcx),%eax - movl 4(%rcx),%ebx - addq $8,nb400nf_innerjjnr(%rsp) - - xorps %xmm2,%xmm2 - movaps %xmm2,%xmm6 - - ## load isa2 - movq nb400nf_invsqrta(%rbp),%rsi - movss (%rsi,%rax,4),%xmm2 - movss (%rsi,%rbx,4),%xmm3 - unpcklps %xmm3,%xmm2 ## isa2 in xmm3(0,1) - mulps nb400nf_isai(%rsp),%xmm2 - movaps %xmm2,nb400nf_isaprod(%rsp) - movaps %xmm2,%xmm1 - mulps nb400nf_gbtsc(%rsp),%xmm1 - movaps %xmm1,nb400nf_gbscale(%rsp) - - movq nb400nf_charge(%rbp),%rsi ## base of charge[] - movss (%rsi,%rax,4),%xmm3 - movss (%rsi,%rbx,4),%xmm6 - unpcklps %xmm6,%xmm3 ## 00001000 ;# xmm3(0,1) has the charges - - mulps nb400nf_iq(%rsp),%xmm2 - mulps %xmm2,%xmm3 - movaps %xmm3,nb400nf_qq(%rsp) - - movq nb400nf_pos(%rbp),%rdi - - lea (%rax,%rax,2),%rax - lea (%rbx,%rbx,2),%rbx - ## move coordinates to xmm0-xmm2 - movlps (%rdi,%rax,4),%xmm1 - movss 8(%rdi,%rax,4),%xmm2 - movhps (%rdi,%rbx,4),%xmm1 - movss 8(%rdi,%rbx,4),%xmm0 - - movlhps %xmm7,%xmm3 - - shufps $0,%xmm0,%xmm2 - - movaps %xmm1,%xmm0 - - shufps $136,%xmm2,%xmm2 ## 10001000 - - shufps $136,%xmm0,%xmm0 ## 10001000 - shufps $221,%xmm1,%xmm1 ## 11011101 - - movq nb400nf_faction(%rbp),%rdi - ## move ix-iz to xmm4-xmm6 - xorps %xmm7,%xmm7 - - movaps nb400nf_ix(%rsp),%xmm4 - movaps nb400nf_iy(%rsp),%xmm5 - movaps nb400nf_iz(%rsp),%xmm6 - - ## calc dr - subps %xmm0,%xmm4 - subps %xmm1,%xmm5 - subps %xmm2,%xmm6 - - ## square it - mulps %xmm4,%xmm4 - mulps %xmm5,%xmm5 - mulps %xmm6,%xmm6 - addps %xmm5,%xmm4 - addps %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb400nf_three(%rsp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb400nf_half(%rsp),%xmm0 - subps %xmm5,%xmm1 ## 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - mulps nb400nf_gbscale(%rsp),%xmm4 - - cvttps2pi %xmm4,%mm6 ## mm6 contain lu indices - cvtpi2ps %mm6,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - - pslld $2,%mm6 - - movq nb400nf_GBtab(%rbp),%rsi - movd %mm6,%ecx - psrlq $32,%mm6 - movd %mm6,%edx - - ## load coulomb table - movaps (%rsi,%rcx,4),%xmm4 - movaps (%rsi,%rdx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm4,%xmm6 - unpcklps %xmm7,%xmm4 ## Y1 Y2 F1 F2 - unpckhps %xmm7,%xmm6 ## G1 G2 H1 H2 - movhlps %xmm4,%xmm5 ## F1 F2 - movhlps %xmm6,%xmm7 ## H1 H2 - ## coulomb table ready, in xmm4-xmm7 - - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - movaps nb400nf_qq(%rsp),%xmm3 - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps %xmm3,%xmm5 ## vcoul=qq*VV - addps nb400nf_vctot(%rsp),%xmm5 - movaps %xmm5,nb400nf_vctot(%rsp) - -_nb_kernel400nf_x86_64_sse.nb400nf_checksingle: - movl nb400nf_innerk(%rsp),%edx - andl $1,%edx - jnz _nb_kernel400nf_x86_64_sse.nb400nf_dosingle - jmp _nb_kernel400nf_x86_64_sse.nb400nf_updateouterdata -_nb_kernel400nf_x86_64_sse.nb400nf_dosingle: - movq nb400nf_charge(%rbp),%rsi - movq nb400nf_invsqrta(%rbp),%rdx - movq nb400nf_pos(%rbp),%rdi - movq nb400nf_innerjjnr(%rsp),%rcx - movl (%rcx),%eax - xorps %xmm2,%xmm2 - movaps %xmm2,%xmm6 - movss (%rdx,%rax,4),%xmm2 ## isa2 - mulss nb400nf_isai(%rsp),%xmm2 - movss %xmm2,nb400nf_isaprod(%rsp) - movss %xmm2,%xmm1 - mulss nb400nf_gbtsc(%rsp),%xmm1 - movss %xmm1,nb400nf_gbscale(%rsp) - - mulss nb400nf_iq(%rsp),%xmm2 - movss (%rsi,%rax,4),%xmm6 ## xmm6(0) has the charge - mulss %xmm2,%xmm6 - movss %xmm6,nb400nf_qq(%rsp) - - lea (%rax,%rax,2),%rax - - ## move coordinates to xmm0-xmm2 - movss (%rdi,%rax,4),%xmm0 - movss 4(%rdi,%rax,4),%xmm1 - movss 8(%rdi,%rax,4),%xmm2 - - movss nb400nf_ix(%rsp),%xmm4 - movss nb400nf_iy(%rsp),%xmm5 - movss nb400nf_iz(%rsp),%xmm6 - - ## calc dr - subss %xmm0,%xmm4 - subss %xmm1,%xmm5 - subss %xmm2,%xmm6 - - ## square it - mulss %xmm4,%xmm4 - mulss %xmm5,%xmm5 - mulss %xmm6,%xmm6 - addss %xmm5,%xmm4 - addss %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtss %xmm4,%xmm5 - ## lookup seed in xmm5 - movss %xmm5,%xmm2 - mulss %xmm5,%xmm5 - movss nb400nf_three(%rsp),%xmm1 - mulss %xmm4,%xmm5 ## rsq*lu*lu - movss nb400nf_half(%rsp),%xmm0 - subss %xmm5,%xmm1 ## 30-rsq*lu*lu - mulss %xmm2,%xmm1 - mulss %xmm1,%xmm0 ## xmm0=rinv - - mulss %xmm0,%xmm4 ## xmm4=r - mulss nb400nf_gbscale(%rsp),%xmm4 - - cvttss2si %xmm4,%ebx ## mm6 contain lu indices - cvtsi2ss %ebx,%xmm6 - subss %xmm6,%xmm4 - movss %xmm4,%xmm1 ## xmm1=eps - movss %xmm1,%xmm2 - mulss %xmm2,%xmm2 ## xmm2=eps2 - - shll $2,%ebx - - movq nb400nf_GBtab(%rbp),%rsi - - movaps (%rsi,%rbx,4),%xmm4 - movhlps %xmm4,%xmm6 - movaps %xmm4,%xmm5 - movaps %xmm6,%xmm7 - shufps $1,%xmm5,%xmm5 - shufps $1,%xmm7,%xmm7 - ## table ready in xmm4-xmm7 - - mulss %xmm1,%xmm6 ## xmm6=Geps - mulss %xmm2,%xmm7 ## xmm7=Heps2 - addss %xmm6,%xmm5 - addss %xmm7,%xmm5 ## xmm5=Fp - movss nb400nf_qq(%rsp),%xmm3 - mulss %xmm1,%xmm5 ## xmm5=eps*Fp - addss %xmm4,%xmm5 ## xmm5=VV - mulss %xmm3,%xmm5 ## vcoul=qq*VV - addss nb400nf_vctot(%rsp),%xmm5 - movss %xmm5,nb400nf_vctot(%rsp) -_nb_kernel400nf_x86_64_sse.nb400nf_updateouterdata: - ## get n from stack - movl nb400nf_n(%rsp),%esi - ## get group index for i particle - movq nb400nf_gid(%rbp),%rdx ## base of gid[] - movl (%rdx,%rsi,4),%edx ## ggid=gid[n] - - ## accumulate total potential energy and update it - movaps nb400nf_vctot(%rsp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - ## add earlier value from mem - movq nb400nf_Vc(%rbp),%rax - addss (%rax,%rdx,4),%xmm7 - ## move back to mem - movss %xmm7,(%rax,%rdx,4) - - ## finish if last - movl nb400nf_nn1(%rsp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel400nf_x86_64_sse.nb400nf_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb400nf_n(%rsp) - jmp _nb_kernel400nf_x86_64_sse.nb400nf_outer -_nb_kernel400nf_x86_64_sse.nb400nf_outerend: - ## check if more outer neighborlists remain - movl nb400nf_nri(%rsp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel400nf_x86_64_sse.nb400nf_end - ## non-zero, do one more workunit - jmp _nb_kernel400nf_x86_64_sse.nb400nf_threadloop -_nb_kernel400nf_x86_64_sse.nb400nf_end: - - movl nb400nf_nouter(%rsp),%eax - movl nb400nf_ninner(%rsp),%ebx - movq nb400nf_outeriter(%rbp),%rcx - movq nb400nf_inneriter(%rbp),%rdx - movl %eax,(%rcx) - movl %ebx,(%rdx) - - addq $296,%rsp - emms - - - pop %r15 - pop %r14 - pop %r13 - pop %r12 - - pop %rbx - pop %rbp - ret - - diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel410_x86_64_sse.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel410_x86_64_sse.intel_syntax.s deleted file mode 100644 index a7f86f162d..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel410_x86_64_sse.intel_syntax.s +++ /dev/null @@ -1,2009 +0,0 @@ -;# -;# -;# Gromacs 4.0 Copyright (c) 1991-2003 -;# David van der Spoel, Erik Lindahl -;# -;# This program is free software; you can redistribute it and/or -;# modify it under the terms of the GNU General Public License -;# as published by the Free Software Foundation; either version 2 -;# of the License, or (at your option) any later version. -;# -;# To help us fund GROMACS development, we humbly ask that you cite -;# the research papers on the package. Check out http://www.gromacs.org -;# -;# And Hey: -;# Gnomes, ROck Monsters And Chili Sauce -;# - -;# These files require GNU binutils 2.10 or later, since we -;# use intel syntax for portability, or a recent version -;# of NASM that understands Extended 3DNow and SSE2 instructions. -;# (NASM is normally only used with MS Visual C++). -;# Since NASM and gnu as disagree on some definitions and use -;# completely different preprocessing options I have to introduce a -;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86. -;# Gnu as treats ';' as a line break, i.e. ignores it. This is the -;# reason why all comments need both symbols... -;# The source is written for GNU as, with intel syntax. When you use -;# NASM we redefine a couple of things. The false if-statement around -;# the following code is seen by GNU as, but NASM doesn't see it, so -;# the code inside is read by NASM but not gcc. - -; .if 0 # block below only read by NASM -%define .section section -%define .long dd -%define .align align -%define .globl global -;# NASM only wants 'dword', not 'dword ptr'. -%define ptr -%macro .equiv 2 - %1 equ %2 -%endmacro -; .endif # End of NASM-specific block -; .intel_syntax noprefix # Line only read by gnu as - - - - -.globl nb_kernel410_x86_64_sse -.globl _nb_kernel410_x86_64_sse -nb_kernel410_x86_64_sse: -_nb_kernel410_x86_64_sse: -;# Room for return address and rbp (16 bytes) -.equiv nb410_fshift, 16 -.equiv nb410_gid, 24 -.equiv nb410_pos, 32 -.equiv nb410_faction, 40 -.equiv nb410_charge, 48 -.equiv nb410_p_facel, 56 -.equiv nb410_argkrf, 64 -.equiv nb410_argcrf, 72 -.equiv nb410_Vc, 80 -.equiv nb410_type, 88 -.equiv nb410_p_ntype, 96 -.equiv nb410_vdwparam, 104 -.equiv nb410_Vvdw, 112 -.equiv nb410_p_tabscale, 120 -.equiv nb410_VFtab, 128 -.equiv nb410_invsqrta, 136 -.equiv nb410_dvda, 144 -.equiv nb410_p_gbtabscale, 152 -.equiv nb410_GBtab, 160 -.equiv nb410_p_nthreads, 168 -.equiv nb410_count, 176 -.equiv nb410_mtx, 184 -.equiv nb410_outeriter, 192 -.equiv nb410_inneriter, 200 -.equiv nb410_work, 208 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse use -.equiv nb410_ix, 0 -.equiv nb410_iy, 16 -.equiv nb410_iz, 32 -.equiv nb410_iq, 48 -.equiv nb410_dx, 64 -.equiv nb410_dy, 80 -.equiv nb410_dz, 96 -.equiv nb410_two, 112 -.equiv nb410_six, 128 -.equiv nb410_twelve, 144 -.equiv nb410_gbtsc, 160 -.equiv nb410_qq, 176 -.equiv nb410_c6, 192 -.equiv nb410_c12, 208 -.equiv nb410_fscal, 224 -.equiv nb410_vctot, 240 -.equiv nb410_Vvdwtot, 256 -.equiv nb410_fix, 272 -.equiv nb410_fiy, 288 -.equiv nb410_fiz, 304 -.equiv nb410_half, 320 -.equiv nb410_three, 336 -.equiv nb410_r, 352 -.equiv nb410_isai, 368 -.equiv nb410_isaprod, 384 -.equiv nb410_dvdasum, 400 -.equiv nb410_gbscale, 416 -.equiv nb410_nri, 432 -.equiv nb410_iinr, 440 -.equiv nb410_jindex, 448 -.equiv nb410_jjnr, 456 -.equiv nb410_shift, 464 -.equiv nb410_shiftvec, 472 -.equiv nb410_facel, 480 -.equiv nb410_innerjjnr, 488 -.equiv nb410_is3, 496 -.equiv nb410_ii3, 500 -.equiv nb410_ii, 504 -.equiv nb410_ntia, 508 -.equiv nb410_innerk, 512 -.equiv nb410_n, 516 -.equiv nb410_nn1, 520 -.equiv nb410_ntype, 524 -.equiv nb410_nouter, 528 -.equiv nb410_ninner, 532 -.equiv nb410_jnra, 536 -.equiv nb410_jnrb, 540 -.equiv nb410_jnrc, 544 -.equiv nb410_jnrd, 548 - - push rbp - mov rbp, rsp - push rbx - - - emms - - push r12 - push r13 - push r14 - push r15 - - sub rsp, 568 ;# local variable stack space (n*16+8) - - ;# zero 32-bit iteration counters - mov eax, 0 - mov [rsp + nb410_nouter], eax - mov [rsp + nb410_ninner], eax - - mov edi, [rdi] - mov [rsp + nb410_nri], edi - mov [rsp + nb410_iinr], rsi - mov [rsp + nb410_jindex], rdx - mov [rsp + nb410_jjnr], rcx - mov [rsp + nb410_shift], r8 - mov [rsp + nb410_shiftvec], r9 - mov rdi, [rbp + nb410_p_ntype] - mov edi, [rdi] - mov [rsp + nb410_ntype], edi - mov rsi, [rbp + nb410_p_facel] - movss xmm0, [rsi] - movss [rsp + nb410_facel], xmm0 - - mov rbx, [rbp + nb410_p_gbtabscale] - movss xmm4, [rbx] - shufps xmm4, xmm4, 0 - movaps [rsp + nb410_gbtsc], xmm4 - - - ;# create constant floating-point factors on stack - mov eax, 0x3f000000 ;# half in IEEE (hex) - mov [rsp + nb410_half], eax - movss xmm1, [rsp + nb410_half] - shufps xmm1, xmm1, 0 ;# splat to all elements - movaps xmm2, xmm1 - addps xmm2, xmm2 ;# one - movaps xmm3, xmm2 - addps xmm2, xmm2 ;# two - addps xmm3, xmm2 ;# three - movaps xmm4, xmm3 - addps xmm4, xmm4 ;# six - movaps xmm5, xmm4 - addps xmm5, xmm5 ;# twelve - movaps [rsp + nb410_half], xmm1 - movaps [rsp + nb410_two], xmm2 - movaps [rsp + nb410_three], xmm3 - movaps [rsp + nb410_six], xmm4 - movaps [rsp + nb410_twelve], xmm5 - -.nb410_threadloop: - mov rsi, [rbp + nb410_count] ;# pointer to sync counter - mov eax, [rsi] -.nb410_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb410_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [rsp + nb410_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [rsp + nb410_n], eax - mov [rsp + nb410_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb410_outerstart - jmp .nb410_end - -.nb410_outerstart: - ;# ebx contains number of outer iterations - add ebx, [rsp + nb410_nouter] - mov [rsp + nb410_nouter], ebx - -.nb410_outer: - mov rax, [rsp + nb410_shift] ;# rax = pointer into shift[] - mov ebx, [rax+rsi*4] ;# ebx=shift[n] - - lea rbx, [rbx + rbx*2] ;# rbx=3*is - mov [rsp + nb410_is3],ebx ;# store is3 - - mov rax, [rsp + nb410_shiftvec] ;# rax = base of shiftvec[] - - movss xmm0, [rax + rbx*4] - movss xmm1, [rax + rbx*4 + 4] - movss xmm2, [rax + rbx*4 + 8] - - mov rcx, [rsp + nb410_iinr] ;# rcx = pointer into iinr[] - mov ebx, [rcx + rsi*4] ;# ebx =ii - mov [rsp + nb410_ii], ebx - - mov rdx, [rbp + nb410_charge] - movss xmm3, [rdx + rbx*4] - mulss xmm3, [rsp + nb410_facel] - shufps xmm3, xmm3, 0 - - mov rdx, [rbp + nb410_invsqrta] ;# load invsqrta[ii] - movss xmm4, [rdx + rbx*4] - shufps xmm4, xmm4, 0 - - mov rdx, [rbp + nb410_type] - mov edx, [rdx + rbx*4] - imul edx, [rsp + nb410_ntype] - shl edx, 1 - mov [rsp + nb410_ntia], edx - - lea rbx, [rbx + rbx*2] ;# rbx = 3*ii=ii3 - mov rax, [rbp + nb410_pos] ;# rax = base of pos[] - - addss xmm0, [rax + rbx*4] - addss xmm1, [rax + rbx*4 + 4] - addss xmm2, [rax + rbx*4 + 8] - - movaps [rsp + nb410_iq], xmm3 - movaps [rsp + nb410_isai], xmm4 - - shufps xmm0, xmm0, 0 - shufps xmm1, xmm1, 0 - shufps xmm2, xmm2, 0 - - movaps [rsp + nb410_ix], xmm0 - movaps [rsp + nb410_iy], xmm1 - movaps [rsp + nb410_iz], xmm2 - - mov [rsp + nb410_ii3], ebx - - ;# clear vctot and i forces - xorps xmm13, xmm13 - movaps xmm12, xmm13 - movaps [rsp + nb410_Vvdwtot], xmm13 - movaps [rsp + nb410_dvdasum], xmm13 - movaps xmm14, xmm13 - movaps xmm15, xmm13 - - mov rax, [rsp + nb410_jindex] - mov ecx, [rax + rsi*4] ;# jindex[n] - mov edx, [rax + rsi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov rsi, [rbp + nb410_pos] - mov rdi, [rbp + nb410_faction] - mov rax, [rsp + nb410_jjnr] - shl ecx, 2 - add rax, rcx - mov [rsp + nb410_innerjjnr], rax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 4 - add ecx, [rsp + nb410_ninner] - mov [rsp + nb410_ninner], ecx - add edx, 0 - mov [rsp + nb410_innerk], edx ;# number of innerloop atoms - jge .nb410_unroll_loop - jmp .nb410_finish_inner -.nb410_unroll_loop: - ;# quad-unroll innerloop here - mov rdx, [rsp + nb410_innerjjnr] ;# pointer to jjnr[k] - mov eax, [rdx] - mov ebx, [rdx + 4] - mov ecx, [rdx + 8] - mov edx, [rdx + 12] ;# eax-edx=jnr1-4 - - add qword ptr [rsp + nb410_innerjjnr], 16 ;# advance pointer (unrolled 4) - - ;# load isaj - mov rsi, [rbp + nb410_invsqrta] - movss xmm3, [rsi + rax*4] - movss xmm4, [rsi + rcx*4] - movss xmm6, [rsi + rbx*4] - movss xmm7, [rsi + rdx*4] - movaps xmm2, [rsp + nb410_isai] - shufps xmm3, xmm6, 0 - shufps xmm4, xmm7, 0 - shufps xmm3, xmm4, 136 ;# 10001000 ;# all isaj in xmm3 - mulps xmm2, xmm3 - - movaps [rsp + nb410_isaprod], xmm2 - movaps xmm1, xmm2 - mulps xmm1, [rsp + nb410_gbtsc] - movaps [rsp + nb410_gbscale], xmm1 - - mov rsi, [rbp + nb410_charge] ;# base of charge[] - - movss xmm3, [rsi + rax*4] - movss xmm4, [rsi + rcx*4] - movss xmm6, [rsi + rbx*4] - movss xmm7, [rsi + rdx*4] - - mulps xmm2, [rsp + nb410_iq] - shufps xmm3, xmm6, 0 - shufps xmm4, xmm7, 0 - shufps xmm3, xmm4, 136 ;# 10001000 ;# all charges in xmm3 - mulps xmm3, xmm2 - movaps [rsp + nb410_qq], xmm3 - - ;# vdw parameters - mov rsi, [rbp + nb410_type] - mov r12d, [rsi + rax*4] - mov r13d, [rsi + rbx*4] - mov r14d, [rsi + rcx*4] - mov r15d, [rsi + rdx*4] - shl r12d, 1 - shl r13d, 1 - shl r14d, 1 - shl r15d, 1 - mov edi, [rsp + nb410_ntia] - add r12d, edi - add r13d, edi - add r14d, edi - add r15d, edi - - mov rsi, [rbp + nb410_vdwparam] - movlps xmm3, [rsi + r12*4] - movlps xmm7, [rsi + r14*4] - movhps xmm3, [rsi + r13*4] - movhps xmm7, [rsi + r15*4] - - movaps xmm0, xmm3 - shufps xmm0, xmm7, 136 ;# 10001000 - shufps xmm3, xmm7, 221 ;# 11011101 - - movaps [rsp + nb410_c6], xmm0 - movaps [rsp + nb410_c12], xmm3 - - mov rsi, [rbp + nb410_pos] ;# base of pos[] - - lea r8, [rax + rax*2] ;# jnr - lea r9, [rbx + rbx*2] - lea r10, [rcx + rcx*2] - lea r11, [rdx + rdx*2] - - ;# move four coordinates to xmm0-xmm2 - movlps xmm4, [rsi + r8*4] - movlps xmm5, [rsi + r10*4] - movss xmm2, [rsi + r8*4 + 8] - movss xmm6, [rsi + r10*4 + 8] - - movhps xmm4, [rsi + r9*4] - movhps xmm5, [rsi + r11*4] - - movss xmm0, [rsi + r9*4 + 8] - movss xmm1, [rsi + r11*4 + 8] - - shufps xmm2, xmm0, 0 - shufps xmm6, xmm1, 0 - - movaps xmm0, xmm4 - movaps xmm1, xmm4 - - shufps xmm2, xmm6, 136 ;# 10001000 - - shufps xmm0, xmm5, 136 ;# 10001000 - shufps xmm1, xmm5, 221 ;# 11011101 - - ;# calc dr - subps xmm0, [rsp + nb410_ix] - subps xmm1, [rsp + nb410_iy] - subps xmm2, [rsp + nb410_iz] - - ;# store dr - movaps [rsp + nb410_dx], xmm0 - movaps [rsp + nb410_dy], xmm1 - movaps [rsp + nb410_dz], xmm2 - - ;# square it - mulps xmm0,xmm0 - mulps xmm1,xmm1 - mulps xmm2,xmm2 - addps xmm0, xmm1 - addps xmm0, xmm2 - movaps xmm4, xmm0 - ;# rsq in xmm4 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [rsp + nb410_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [rsp + nb410_half] - subps xmm1, xmm5 ;# 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - movaps [rsp + nb410_r], xmm4 - mulps xmm4, [rsp + nb410_gbscale] - - ;# truncate and convert to integers - cvttps2dq xmm5, xmm4 - - ;# convert back to float - cvtdq2ps xmm6, xmm5 - - ;# multiply by 4 - pslld xmm5, 2 - - ;# move to integer registers - movhlps xmm7, xmm5 - movd r12d, xmm5 - movd r14d, xmm7 - pshufd xmm5, xmm5, 1 - pshufd xmm7, xmm7, 1 - movd r13d, xmm5 - movd r15d, xmm7 - - ;# calculate eps - subps xmm4, xmm6 - movaps xmm1, xmm4 ;#eps - - mov rsi, [rbp + nb410_GBtab] - - movaps xmm9, xmm0 ;# rinv - mulps xmm9, xmm9 ;# rinvsq - movaps xmm10, xmm9 ;# rinvsq - mulps xmm10, xmm10 ;# rinv4 - mulps xmm10, xmm9 ;# rinv6 - movaps xmm11, xmm10 - mulps xmm11, xmm11 ;# rinv12 - - ;# load table data - movlps xmm5, [rsi + r12*4] - movlps xmm7, [rsi + r14*4] - movhps xmm5, [rsi + r13*4] - movhps xmm7, [rsi + r15*4] - - movaps xmm4, xmm5 - shufps xmm4, xmm7, 136 ;# 10001000 - shufps xmm5, xmm7, 221 ;# 11011101 - - mulps xmm10, [rsp + nb410_c6] ;# vvdw6=c6*rinv6 - mulps xmm11, [rsp + nb410_c12] ;# vvdw12=c12*rinv12 - - movaps xmm9, xmm11 - subps xmm11, xmm10 ;# Vvdw=Vvdw12-Vvdw6 - - ;# add potential to vvdwtot - addps xmm11, [rsp + nb410_Vvdwtot] - movaps [rsp + nb410_Vvdwtot], xmm11 - - movlps xmm7, [rsi + r12*4 + 8] - movlps xmm8, [rsi + r14*4 + 8] - movhps xmm7, [rsi + r13*4 + 8] - movhps xmm8, [rsi + r15*4 + 8] - - movaps xmm6, xmm7 - - shufps xmm6, xmm8, 136 ;# 10001000 - shufps xmm7, xmm8, 221 ;# 11011101 - ;# table data ready in xmm4-xmm7 - - mulps xmm7, xmm1 ;# Heps - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm1 ;# Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - addps xmm7, xmm7 ;# two*Heps2 - movaps xmm3, [rsp + nb410_qq] - addps xmm7, xmm6 - addps xmm7, xmm5 ;# xmm7=FF - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, xmm3 ;# vcoul=qq*VV - mulps xmm3, xmm7 ;# fijC=FF*qq - ;# at this point xmm5 contains vcoul and xmm3 fijC - - ;# LJ forces - mulps xmm10, [rsp + nb410_six] - mulps xmm9, [rsp + nb410_twelve] - subps xmm9, xmm10 - mulps xmm9, xmm0 ;# (12*vnb12-6*vnb6)*rinv - - mov rsi, [rbp + nb410_dvda] - - ;# Calculate dVda - xorps xmm7, xmm7 - mulps xmm3, [rsp + nb410_gbscale] - movaps xmm6, xmm3 - mulps xmm6, [rsp + nb410_r] - addps xmm6, xmm5 - - ;# increment vctot (sum in xmm12) - addps xmm12, xmm5 - - ;# xmm6=(vcoul+fijC*r) - subps xmm7, xmm6 - movaps xmm6, xmm7 - - ;# update dvdasum - addps xmm7, [rsp + nb410_dvdasum] - movaps [rsp + nb410_dvdasum], xmm7 - - ;# update j atoms dvdaj - movhlps xmm7, xmm6 - movaps xmm5, xmm6 - movaps xmm4, xmm7 - shufps xmm5, xmm5, 0x1 - shufps xmm4, xmm4, 0x1 - - ;# xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4 - addss xmm6, [rsi + rax*4] - addss xmm5, [rsi + rbx*4] - addss xmm7, [rsi + rcx*4] - addss xmm4, [rsi + rdx*4] - movss [rsi + rax*4], xmm6 - movss [rsi + rbx*4], xmm5 - movss [rsi + rcx*4], xmm7 - movss [rsi + rdx*4], xmm4 - - subps xmm9, xmm3 - mulps xmm9, xmm0 ;# fscal - - movaps xmm10, xmm9 - movaps xmm11, xmm9 - - mulps xmm9, [rsp + nb410_dx] - mulps xmm10, [rsp + nb410_dy] - mulps xmm11, [rsp + nb410_dz] - - ;# accumulate i forces - addps xmm13, xmm9 - addps xmm14, xmm10 - addps xmm15, xmm11 - - mov rsi, [rbp + nb410_faction] - ;# the fj's - start by accumulating x & y forces from memory - movlps xmm0, [rsi + r8*4] ;# x1 y1 - - - movlps xmm1, [rsi + r10*4] ;# x3 y3 - - - movhps xmm0, [rsi + r9*4] ;# x1 y1 x2 y2 - movhps xmm1, [rsi + r11*4] ;# x3 y3 x4 y4 - - movaps xmm8, xmm9 - unpcklps xmm9, xmm10 ;# x1 y1 x2 y2 - unpckhps xmm8, xmm10 ;# x3 y3 x4 y4 - - ;# update fjx and fjy - addps xmm0, xmm9 - addps xmm1, xmm8 - - movlps [rsi + r8*4], xmm0 - movlps [rsi + r10*4], xmm1 - movhps [rsi + r9*4], xmm0 - movhps [rsi + r11*4], xmm1 - - ;# xmm11: fjz1 fjz2 fjz3 fjz4 - pshufd xmm10, xmm11, 1 ;# fjz2 - - - - movhlps xmm9, xmm11 ;# fjz3 - - - - pshufd xmm8, xmm11, 3 ;# fjz4 - - - - - addss xmm11, [rsi + r8*4 + 8] - addss xmm10, [rsi + r9*4 + 8] - addss xmm9, [rsi + r10*4 + 8] - addss xmm8, [rsi + r11*4 + 8] - movss [rsi + r8*4 + 8], xmm11 - movss [rsi + r9*4 + 8], xmm10 - movss [rsi + r10*4 + 8], xmm9 - movss [rsi + r11*4 + 8], xmm8 - - ;# should we do one more iteration? - sub dword ptr [rsp + nb410_innerk], 4 - jl .nb410_finish_inner - jmp .nb410_unroll_loop -.nb410_finish_inner: - ;# check if at least two particles remain - add dword ptr [rsp + nb410_innerk], 4 - mov edx, [rsp + nb410_innerk] - and edx, 2 - jnz .nb410_dopair - jmp .nb410_checksingle -.nb410_dopair: - mov rcx, [rsp + nb410_innerjjnr] - - mov eax, [rcx] - mov ebx, [rcx + 4] - add qword ptr [rsp + nb410_innerjjnr], 8 - - ;# load isaj - mov rsi, [rbp + nb410_invsqrta] - movss xmm2, [rsi + rax*4] - movss xmm6, [rsi + rbx*4] - unpcklps xmm2, xmm6 - - mulps xmm2, [rsp + nb410_isai] - - movaps [rsp + nb410_isaprod], xmm2 - movaps xmm1, xmm2 - mulps xmm1, [rsp + nb410_gbtsc] - movaps [rsp + nb410_gbscale], xmm1 - - mulps xmm2, [rsp + nb410_iq] - mov rsi, [rbp + nb410_charge] ;# base of charge[] - movss xmm3, [rsi + rax*4] - movss xmm6, [rsi + rbx*4] - unpcklps xmm3, xmm6 - - - mulps xmm3, xmm2 - movaps [rsp + nb410_qq], xmm3 - - ;# vdw parameters - mov rsi, [rbp + nb410_type] - mov r12d, [rsi + rax*4] - mov r13d, [rsi + rbx*4] - shl r12d, 1 - shl r13d, 1 - mov edi, [rsp + nb410_ntia] - add r12d, edi - add r13d, edi - - mov rsi, [rbp + nb410_vdwparam] - movlps xmm3, [rsi + r12*4] - movhps xmm3, [rsi + r13*4] - - xorps xmm7, xmm7 - movaps xmm0, xmm3 - shufps xmm0, xmm7, 136 ;# 10001000 - shufps xmm3, xmm7, 221 ;# 11011101 - - movaps [rsp + nb410_c6], xmm0 - movaps [rsp + nb410_c12], xmm3 - - mov rsi, [rbp + nb410_pos] ;# base of pos[] - - lea r8, [rax + rax*2] ;# j3 - lea r9, [rbx + rbx*2] - - ;# move four coordinates to xmm0-xmm2 - movlps xmm4, [rsi + r8*4] ;# x1 y1 - - - movlps xmm5, [rsi + r9*4] ;# x2 y2 - - - - movss xmm6, [rsi + r8*4 + 8] ;# z1 - - - - movss xmm7, [rsi + r9*4 + 8] ;# z2 - - - - - unpcklps xmm4, xmm5 ;# x1 x2 y1 y2 - movhlps xmm5, xmm4 ;# y1 y2 - - - unpcklps xmm6, xmm7 ;# z1 z2 - - - - ;# calc dr - subps xmm4, [rsp + nb410_ix] - subps xmm5, [rsp + nb410_iy] - subps xmm6, [rsp + nb410_iz] - - ;# store dr - movaps [rsp + nb410_dx], xmm4 - movaps [rsp + nb410_dy], xmm5 - movaps [rsp + nb410_dz], xmm6 - - ;# square it - mulps xmm4,xmm4 - mulps xmm5,xmm5 - mulps xmm6,xmm6 - addps xmm4, xmm5 - addps xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [rsp + nb410_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [rsp + nb410_half] - subps xmm1, xmm5 ;# 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - movaps [rsp + nb410_r], xmm4 - mulps xmm4, [rsp + nb410_gbscale] - - ;# truncate and convert to integers - cvttps2dq xmm5, xmm4 - - ;# convert back to float - cvtdq2ps xmm6, xmm5 - - ;# multiply by 4 - pslld xmm5, 2 - - ;# move to integer registers - movd r12d, xmm5 - pshufd xmm5, xmm5, 1 - movd r13d, xmm5 - - ;# calculate eps - subps xmm4, xmm6 - movaps xmm1, xmm4 ;#eps - - mov rsi, [rbp + nb410_GBtab] - - movaps xmm9, xmm0 ;# rinv - mulps xmm9, xmm9 ;# rinvsq - movaps xmm10, xmm9 ;# rinvsq - mulps xmm10, xmm10 ;# rinv4 - mulps xmm10, xmm9 ;# rinv6 - movaps xmm11, xmm10 - mulps xmm11, xmm11 ;# rinv12 - - ;# load table data - movlps xmm4, [rsi + r12*4] ;# Y1 F1 - movlps xmm5, [rsi + r13*4] ;# Y2 F2 - unpcklps xmm4, xmm5 ;# Y1 Y2 F1 F2 - movhlps xmm5, xmm4 ;# F1 F2 - - mulps xmm10, [rsp + nb410_c6] ;# vvdw6=c6*rinv6 - mulps xmm11, [rsp + nb410_c12] ;# vvdw12=c12*rinv12 - - movaps xmm9, xmm11 - subps xmm11, xmm10 ;# Vvdw=Vvdw12-Vvdw6 - - ;# add potential to vvdwtot - addps xmm11, [rsp + nb410_Vvdwtot] - movlps [rsp + nb410_Vvdwtot], xmm11 - - movlps xmm6, [rsi + r12*4 + 8] ;# G1 H1 - movlps xmm7, [rsi + r13*4 + 8] ;# G2 H2 - unpcklps xmm6, xmm7 ;# G1 G2 - movhlps xmm7, xmm6 ;# H1 H2 - ;# table data ready in xmm4-xmm7 - - mulps xmm7, xmm1 ;# Heps - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm1 ;# Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - addps xmm7, xmm7 ;# two*Heps2 - movaps xmm3, [rsp + nb410_qq] - - addps xmm7, xmm6 - addps xmm7, xmm5 ;# xmm7=FF - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, xmm3 ;# vcoul=qq*VV - mulps xmm3, xmm7 ;# fijC=FF*qq - ;# at this point xmm5 contains vcoul and xmm3 fijC - - ;# LJ forces - mulps xmm10, [rsp + nb410_six] - mulps xmm9, [rsp + nb410_twelve] - subps xmm9, xmm10 - mulps xmm9, xmm0 ;# (12*vnb12-6*vnb6)*rinv - - ;# zero upper part of vcoul - xorps xmm2, xmm2 - movlhps xmm5, xmm2 - - mov rsi, [rbp + nb410_dvda] - - ;# Calculate dVda - xorps xmm7, xmm7 - mulps xmm3, [rsp + nb410_gbscale] - movaps xmm6, xmm3 - mulps xmm6, [rsp + nb410_r] - addps xmm6, xmm5 - - xorps xmm4, xmm4 - ;# increment vctot (sum in xmm12) - addps xmm12, xmm5 - - ;# xmm6=(vcoul+fijC*r) - subps xmm7, xmm6 - movaps xmm6, xmm7 - - ;# zero upper half of dvda - movlhps xmm7, xmm4 - - ;# update dvdasum - addps xmm7, [rsp + nb410_dvdasum] - movaps [rsp + nb410_dvdasum], xmm7 - - ;# update j atoms dvdaj - movaps xmm5, xmm6 - shufps xmm5, xmm5, 0x1 - - ;# xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4 - addss xmm6, [rsi + rax*4] - addss xmm5, [rsi + rbx*4] - movss [rsi + rax*4], xmm6 - movss [rsi + rbx*4], xmm5 - - xorps xmm7, xmm7 - - subps xmm9, xmm3 - mulps xmm9, xmm0 ;# fscal - - movaps xmm10, xmm9 - movaps xmm11, xmm9 - - mulps xmm9, [rsp + nb410_dx] - mulps xmm10, [rsp + nb410_dy] - mulps xmm11, [rsp + nb410_dz] - - movlhps xmm9, xmm7 - movlhps xmm10, xmm7 - movlhps xmm11, xmm7 - - ;# accumulate i forces - addps xmm13, xmm9 - addps xmm14, xmm10 - addps xmm15, xmm11 - - mov rsi, [rbp + nb410_faction] - ;# the fj's - start by accumulating x & y forces from memory - movlps xmm0, [rsi + r8*4] ;# x1 y1 - - - movhps xmm0, [rsi + r9*4] ;# x1 y1 x2 y2 - - unpcklps xmm9, xmm10 ;# x1 y1 x2 y2 - addps xmm0, xmm9 - - movlps [rsi + r8*4], xmm0 - movhps [rsi + r9*4], xmm0 - - ;# z forces - pshufd xmm8, xmm11, 1 - addss xmm11, [rsi + r8*4 + 8] - addss xmm8, [rsi + r9*4 + 8] - movss [rsi + r8*4 + 8], xmm11 - movss [rsi + r9*4 + 8], xmm8 - -.nb410_checksingle: - mov edx, [rsp + nb410_innerk] - and edx, 1 - jnz .nb410_dosingle - jmp .nb410_updateouterdata -.nb410_dosingle: - mov rsi, [rbp + nb410_charge] - mov rdx, [rbp + nb410_invsqrta] - mov rdi, [rbp + nb410_pos] - mov rcx, [rsp + nb410_innerjjnr] - mov eax, [rcx] - - ;# load isaj - mov rsi, [rbp + nb410_invsqrta] - movss xmm3, [rsi + rax*4] - movaps xmm2, [rsp + nb410_isai] - mulss xmm2, xmm3 - - movss [rsp + nb410_isaprod], xmm2 - movaps xmm1, xmm2 - mulss xmm1, [rsp + nb410_gbtsc] - movss [rsp + nb410_gbscale], xmm1 - - mulss xmm2, [rsp + nb410_iq] - mov rsi, [rbp + nb410_charge] ;# base of charge[] - - movss xmm3, [rsi + rax*4] - mulss xmm3, xmm2 - movss [rsp + nb410_qq], xmm3 - - ;# vdw parameters - mov rsi, [rbp + nb410_type] - mov r12d, [rsi + rax*4] - shl r12d, 1 - mov edi, [rsp + nb410_ntia] - add r12d, edi - - mov rsi, [rbp + nb410_vdwparam] - movss xmm0, [rsi + r12*4] - movss xmm3, [rsi + r12*4 + 4] - movaps [rsp + nb410_c6], xmm0 - movaps [rsp + nb410_c12], xmm3 - - mov rsi, [rbp + nb410_pos] ;# base of pos[] - - lea r8, [rax + rax*2] ;# jnr - - ;# move four coordinates to xmm0-xmm2 - movss xmm4, [rsi + r8*4] - movss xmm5, [rsi + r8*4 + 4] - movss xmm6, [rsi + r8*4 + 8] - - ;# calc dr - subss xmm4, [rsp + nb410_ix] - subss xmm5, [rsp + nb410_iy] - subss xmm6, [rsp + nb410_iz] - - ;# store dr - movaps [rsp + nb410_dx], xmm4 - movaps [rsp + nb410_dy], xmm5 - movaps [rsp + nb410_dz], xmm6 - - ;# square it - mulss xmm4,xmm4 - mulss xmm5,xmm5 - mulss xmm6,xmm6 - addss xmm4, xmm5 - addss xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtss xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulss xmm5, xmm5 - movaps xmm1, [rsp + nb410_three] - mulss xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [rsp + nb410_half] - subss xmm1, xmm5 ;# 30-rsq*lu*lu - mulss xmm1, xmm2 - mulss xmm0, xmm1 ;# xmm0=rinv - mulss xmm4, xmm0 ;# xmm4=r - movaps [rsp + nb410_r], xmm4 - mulss xmm4, [rsp + nb410_gbscale] - - ;# truncate and convert to integers - cvttss2si r12d, xmm4 - - ;# convert back to float - cvtsi2ss xmm6, r12d - - ;# multiply by 4 - shl r12d, 2 - - ;# calculate eps - subss xmm4, xmm6 - movaps xmm1, xmm4 ;#eps - - mov rsi, [rbp + nb410_GBtab] - - movaps xmm9, xmm0 ;# rinv - mulss xmm9, xmm9 ;# rinvsq - movaps xmm10, xmm9 ;# rinvsq - mulss xmm10, xmm10 ;# rinv4 - mulss xmm10, xmm9 ;# rinv6 - movaps xmm11, xmm10 - mulss xmm11, xmm11 ;# rinv12 - - ;# load table data - movss xmm4, [rsi + r12*4] - movss xmm5, [rsi + r12*4 + 4] - movss xmm6, [rsi + r12*4 + 8] - movss xmm7, [rsi + r12*4 + 12] - ;# table data ready in xmm4-xmm7 - - mulss xmm10, [rsp + nb410_c6] ;# vvdw6=c6*rinv6 - mulss xmm11, [rsp + nb410_c12] ;# vvdw12=c12*rinv12 - - movaps xmm9, xmm11 - subss xmm11, xmm10 ;# Vvdw=Vvdw12-Vvdw6 - - ;# add potential to vvdwtot - addss xmm11, [rsp + nb410_Vvdwtot] - movss [rsp + nb410_Vvdwtot], xmm11 - - mulss xmm7, xmm1 ;# Heps - mulss xmm6, xmm1 ;# xmm6=Geps - mulss xmm7, xmm1 ;# Heps2 - addss xmm5, xmm6 - addss xmm5, xmm7 ;# xmm5=Fp - addss xmm7, xmm7 ;# two*Heps2 - movss xmm3, [rsp + nb410_qq] - addss xmm7, xmm6 - addss xmm7, xmm5 ;# xmm7=FF - mulss xmm5, xmm1 ;# xmm5=eps*Fp - addss xmm5, xmm4 ;# xmm5=VV - mulss xmm5, xmm3 ;# vcoul=qq*VV - mulss xmm3, xmm7 ;# fijC=FF*qq - ;# at this point xmm5 contains vcoul and xmm3 fijC - - ;# LJ forces - mulss xmm10, [rsp + nb410_six] - mulss xmm9, [rsp + nb410_twelve] - subss xmm9, xmm10 - mulss xmm9, xmm0 ;# (12*vnb12-6*vnb6)*rinv - - mov rsi, [rbp + nb410_dvda] - - ;# Calculate dVda - xorps xmm7, xmm7 - mulss xmm3, [rsp + nb410_gbscale] - movaps xmm6, xmm3 - mulss xmm6, [rsp + nb410_r] - addss xmm6, xmm5 - - ;# increment vctot (sum in xmm12) - addss xmm12, xmm5 - - ;# xmm6=(vcoul+fijC*r) - subss xmm7, xmm6 - movaps xmm6, xmm7 - - ;# update dvdasum - addss xmm7, [rsp + nb410_dvdasum] - movss [rsp + nb410_dvdasum], xmm7 - - ;# update j atoms dvdaj - addss xmm6, [rsi + rax*4] - movss [rsi + rax*4], xmm6 - - subss xmm9, xmm3 - mulss xmm9, xmm0 ;# fscal - - movaps xmm10, xmm9 - movaps xmm11, xmm9 - - mulss xmm9, [rsp + nb410_dx] - mulss xmm10, [rsp + nb410_dy] - mulss xmm11, [rsp + nb410_dz] - - ;# accumulate i forces - addss xmm13, xmm9 - addss xmm14, xmm10 - addss xmm15, xmm11 - - mov rsi, [rbp + nb410_faction] - ;# add to j forces - addss xmm9, [rsi + r8*4] - addss xmm10, [rsi + r8*4 + 4] - addss xmm11, [rsi + r8*4 + 8] - movss [rsi + r8*4], xmm9 - movss [rsi + r8*4 + 4], xmm10 - movss [rsi + r8*4 + 8], xmm11 - -.nb410_updateouterdata: - mov ecx, [rsp + nb410_ii3] - mov rdi, [rbp + nb410_faction] - mov rsi, [rbp + nb410_fshift] - mov edx, [rsp + nb410_is3] - - ;# accumulate i forces in xmm13, xmm14, xmm15 - movhlps xmm0, xmm13 - movhlps xmm1, xmm14 - movhlps xmm2, xmm15 - addps xmm0, xmm13 - addps xmm1, xmm14 - addps xmm2, xmm15 - movaps xmm3, xmm0 - movaps xmm4, xmm1 - movaps xmm5, xmm2 - shufps xmm3, xmm3, 1 - shufps xmm4, xmm4, 1 - shufps xmm5, xmm5, 1 - addss xmm0, xmm3 - addss xmm1, xmm4 - addss xmm2, xmm5 ;# xmm0-xmm2 has single force in pos0 - - - ;# increment i force - movss xmm3, [rdi + rcx*4] - movss xmm4, [rdi + rcx*4 + 4] - movss xmm5, [rdi + rcx*4 + 8] - subss xmm3, xmm0 - subss xmm4, xmm1 - subss xmm5, xmm2 - movss [rdi + rcx*4], xmm3 - movss [rdi + rcx*4 + 4], xmm4 - movss [rdi + rcx*4 + 8], xmm5 - - ;# increment fshift force - movss xmm3, [rsi + rdx*4] - movss xmm4, [rsi + rdx*4 + 4] - movss xmm5, [rsi + rdx*4 + 8] - subss xmm3, xmm0 - subss xmm4, xmm1 - subss xmm5, xmm2 - movss [rsi + rdx*4], xmm3 - movss [rsi + rdx*4 + 4], xmm4 - movss [rsi + rdx*4 + 8], xmm5 - - ;# get n from stack - mov esi, [rsp + nb410_n] - ;# get group index for i particle - mov rdx, [rbp + nb410_gid] ;# base of gid[] - mov edx, [rdx + rsi*4] ;# ggid=gid[n] - - ;# accumulate total potential energy and update it - ;# accumulate - movhlps xmm6, xmm12 - addps xmm12, xmm6 ;# pos 0-1 in xmm12 have the sum now - movaps xmm6, xmm12 - shufps xmm6, xmm6, 1 - addss xmm12, xmm6 - - ;# add earlier value from mem - mov rax, [rbp + nb410_Vc] - addss xmm12, [rax + rdx*4] - ;# move back to mem - movss [rax + rdx*4], xmm12 - - ;# accumulate total lj energy and update it - movaps xmm7, [rsp + nb410_Vvdwtot] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - ;# add earlier value from mem - mov rax, [rbp + nb410_Vvdw] - addss xmm7, [rax + rdx*4] - ;# move back to mem - movss [rax + rdx*4], xmm7 - - ;# accumulate dVda and update it - movaps xmm7, [rsp + nb410_dvdasum] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - mov edx, [rsp + nb410_ii] - mov rax, [rbp + nb410_dvda] - addss xmm7, [rax + rdx*4] - movss [rax + rdx*4], xmm7 - - ;# finish if last - mov ecx, [rsp + nb410_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb410_outerend - - ;# not last, iterate outer loop once more! - mov [rsp + nb410_n], esi - jmp .nb410_outer -.nb410_outerend: - ;# check if more outer neighborlists remain - mov ecx, [rsp + nb410_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb410_end - ;# non-zero, do one more workunit - jmp .nb410_threadloop -.nb410_end: - - mov eax, [rsp + nb410_nouter] - mov ebx, [rsp + nb410_ninner] - mov rcx, [rbp + nb410_outeriter] - mov rdx, [rbp + nb410_inneriter] - mov [rcx], eax - mov [rdx], ebx - - add rsp, 568 - emms - - - pop r15 - pop r14 - pop r13 - pop r12 - - pop rbx - pop rbp - ret - - - -.globl nb_kernel410nf_x86_64_sse -.globl _nb_kernel410nf_x86_64_sse -nb_kernel410nf_x86_64_sse: -_nb_kernel410nf_x86_64_sse: -;# Room for return address and rbp (16 bytes) -.equiv nb410nf_fshift, 16 -.equiv nb410nf_gid, 24 -.equiv nb410nf_pos, 32 -.equiv nb410nf_faction, 40 -.equiv nb410nf_charge, 48 -.equiv nb410nf_p_facel, 56 -.equiv nb410nf_argkrf, 64 -.equiv nb410nf_argcrf, 72 -.equiv nb410nf_Vc, 80 -.equiv nb410nf_type, 88 -.equiv nb410nf_p_ntype, 96 -.equiv nb410nf_vdwparam, 104 -.equiv nb410nf_Vvdw, 112 -.equiv nb410nf_p_tabscale, 120 -.equiv nb410nf_VFtab, 128 -.equiv nb410nf_invsqrta, 136 -.equiv nb410nf_dvda, 144 -.equiv nb410nf_p_gbtabscale, 152 -.equiv nb410nf_GBtab, 160 -.equiv nb410nf_p_nthreads, 168 -.equiv nb410nf_count, 176 -.equiv nb410nf_mtx, 184 -.equiv nb410nf_outeriter, 192 -.equiv nb410nf_inneriter, 200 -.equiv nb410nf_work, 208 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse use -.equiv nb410nf_ix, 0 -.equiv nb410nf_iy, 16 -.equiv nb410nf_iz, 32 -.equiv nb410nf_iq, 48 -.equiv nb410nf_gbtsc, 64 -.equiv nb410nf_qq, 80 -.equiv nb410nf_c6, 96 -.equiv nb410nf_c12, 112 -.equiv nb410nf_vctot, 128 -.equiv nb410nf_Vvdwtot, 144 -.equiv nb410nf_half, 160 -.equiv nb410nf_three, 176 -.equiv nb410nf_isai, 192 -.equiv nb410nf_isaprod, 208 -.equiv nb410nf_gbscale, 224 -.equiv nb410nf_nri, 240 -.equiv nb410nf_iinr, 248 -.equiv nb410nf_jindex, 256 -.equiv nb410nf_jjnr, 264 -.equiv nb410nf_shift, 272 -.equiv nb410nf_shiftvec, 280 -.equiv nb410nf_facel, 288 -.equiv nb410nf_innerjjnr, 296 -.equiv nb410nf_is3, 304 -.equiv nb410nf_ii3, 308 -.equiv nb410nf_ntia, 312 -.equiv nb410nf_innerk, 316 -.equiv nb410nf_n, 320 -.equiv nb410nf_nn1, 324 -.equiv nb410nf_ntype, 328 -.equiv nb410nf_nouter, 332 -.equiv nb410nf_ninner, 336 - - push rbp - mov rbp, rsp - push rbx - - - emms - - push r12 - push r13 - push r14 - push r15 - - sub rsp, 360 ;# local variable stack space (n*16+8) - - ;# zero 32-bit iteration counters - mov eax, 0 - mov [rsp + nb410nf_nouter], eax - mov [rsp + nb410nf_ninner], eax - - mov edi, [rdi] - mov [rsp + nb410nf_nri], edi - mov [rsp + nb410nf_iinr], rsi - mov [rsp + nb410nf_jindex], rdx - mov [rsp + nb410nf_jjnr], rcx - mov [rsp + nb410nf_shift], r8 - mov [rsp + nb410nf_shiftvec], r9 - mov rdi, [rbp + nb410nf_p_ntype] - mov edi, [rdi] - mov [rsp + nb410nf_ntype], edi - mov rsi, [rbp + nb410nf_p_facel] - movss xmm0, [rsi] - movss [rsp + nb410nf_facel], xmm0 - - mov rbx, [rbp + nb410nf_p_gbtabscale] - movss xmm4, [rbx] - shufps xmm4, xmm4, 0 - movaps [rsp + nb410nf_gbtsc], xmm4 - - - ;# create constant floating-point factors on stack - mov eax, 0x3f000000 ;# half in IEEE (hex) - mov [rsp + nb410nf_half], eax - movss xmm1, [rsp + nb410nf_half] - shufps xmm1, xmm1, 0 ;# splat to all elements - movaps xmm2, xmm1 - addps xmm2, xmm2 ;# one - movaps xmm3, xmm2 - addps xmm2, xmm2 ;# two - addps xmm3, xmm2 ;# three - movaps [rsp + nb410nf_half], xmm1 - movaps [rsp + nb410nf_three], xmm3 - -.nb410nf_threadloop: - mov rsi, [rbp + nb410nf_count] ;# pointer to sync counter - mov eax, [rsi] -.nb410nf_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb410nf_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [rsp + nb410nf_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [rsp + nb410nf_n], eax - mov [rsp + nb410nf_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb410nf_outerstart - jmp .nb410nf_end - -.nb410nf_outerstart: - ;# ebx contains number of outer iterations - add ebx, [rsp + nb410nf_nouter] - mov [rsp + nb410nf_nouter], ebx - -.nb410nf_outer: - mov rax, [rsp + nb410nf_shift] ;# rax = pointer into shift[] - mov ebx, [rax+rsi*4] ;# ebx=shift[n] - - lea rbx, [rbx + rbx*2] ;# rbx=3*is - mov [rsp + nb410nf_is3],ebx ;# store is3 - - mov rax, [rsp + nb410nf_shiftvec] ;# rax = base of shiftvec[] - - movss xmm0, [rax + rbx*4] - movss xmm1, [rax + rbx*4 + 4] - movss xmm2, [rax + rbx*4 + 8] - - mov rcx, [rsp + nb410nf_iinr] ;# rcx = pointer into iinr[] - mov ebx, [rcx + rsi*4] ;# ebx =ii - - mov rdx, [rbp + nb410nf_charge] - movss xmm3, [rdx + rbx*4] - mulss xmm3, [rsp + nb410nf_facel] - shufps xmm3, xmm3, 0 - - mov rdx, [rbp + nb410nf_invsqrta] ;# load invsqrta[ii] - movss xmm4, [rdx + rbx*4] - shufps xmm4, xmm4, 0 - - mov rdx, [rbp + nb410nf_type] - mov edx, [rdx + rbx*4] - imul edx, [rsp + nb410nf_ntype] - shl edx, 1 - mov [rsp + nb410nf_ntia], edx - - lea rbx, [rbx + rbx*2] ;# rbx = 3*ii=ii3 - mov rax, [rbp + nb410nf_pos] ;# rax = base of pos[] - - addss xmm0, [rax + rbx*4] - addss xmm1, [rax + rbx*4 + 4] - addss xmm2, [rax + rbx*4 + 8] - - movaps [rsp + nb410nf_iq], xmm3 - movaps [rsp + nb410nf_isai], xmm4 - - shufps xmm0, xmm0, 0 - shufps xmm1, xmm1, 0 - shufps xmm2, xmm2, 0 - - movaps [rsp + nb410nf_ix], xmm0 - movaps [rsp + nb410nf_iy], xmm1 - movaps [rsp + nb410nf_iz], xmm2 - - mov [rsp + nb410nf_ii3], ebx - - ;# clear vctot - xorps xmm4, xmm4 - movaps [rsp + nb410nf_vctot], xmm4 - movaps [rsp + nb410nf_Vvdwtot], xmm4 - - mov rax, [rsp + nb410nf_jindex] - mov ecx, [rax + rsi*4] ;# jindex[n] - mov edx, [rax + rsi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov rsi, [rbp + nb410nf_pos] - mov rdi, [rbp + nb410nf_faction] - mov rax, [rsp + nb410nf_jjnr] - shl ecx, 2 - add rax, rcx - mov [rsp + nb410nf_innerjjnr], rax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 4 - add ecx, [rsp + nb410nf_ninner] - mov [rsp + nb410nf_ninner], ecx - add edx, 0 - mov [rsp + nb410nf_innerk], edx ;# number of innerloop atoms - jge .nb410nf_unroll_loop - jmp .nb410nf_finish_inner -.nb410nf_unroll_loop: - ;# quad-unroll innerloop here - mov rdx, [rsp + nb410nf_innerjjnr] ;# pointer to jjnr[k] - mov eax, [rdx] - mov ebx, [rdx + 4] - mov ecx, [rdx + 8] - mov edx, [rdx + 12] ;# eax-edx=jnr1-4 - add qword ptr [rsp + nb410nf_innerjjnr], 16 ;# advance pointer (unrolled 4) - - ;# load isa2 - mov rsi, [rbp + nb410nf_invsqrta] - movss xmm3, [rsi + rax*4] - movss xmm4, [rsi + rcx*4] - movss xmm6, [rsi + rbx*4] - movss xmm7, [rsi + rdx*4] - movaps xmm2, [rsp + nb410nf_isai] - shufps xmm3, xmm6, 0 - shufps xmm4, xmm7, 0 - shufps xmm3, xmm4, 136 ;# 10001000 ;# all charges in xmm3 - mulps xmm2, xmm3 - - movaps [rsp + nb410nf_isaprod], xmm2 - movaps xmm1, xmm2 - mulps xmm1, [rsp + nb410nf_gbtsc] - movaps [rsp + nb410nf_gbscale], xmm1 - - mov rsi, [rbp + nb410nf_charge] ;# base of charge[] - - movss xmm3, [rsi + rax*4] - movss xmm4, [rsi + rcx*4] - movss xmm6, [rsi + rbx*4] - movss xmm7, [rsi + rdx*4] - - mulps xmm2, [rsp + nb410nf_iq] - shufps xmm3, xmm6, 0 - shufps xmm4, xmm7, 0 - shufps xmm3, xmm4, 136 ;# 10001000 ;# all charges in xmm3 - mulps xmm3, xmm2 - movaps [rsp + nb410nf_qq], xmm3 - - movd mm0, eax - movd mm1, ebx - movd mm2, ecx - movd mm3, edx - - mov rsi, [rbp + nb410nf_type] - mov eax, [rsi + rax*4] - mov ebx, [rsi + rbx*4] - mov ecx, [rsi + rcx*4] - mov edx, [rsi + rdx*4] - mov rsi, [rbp + nb410nf_vdwparam] - shl eax, 1 - shl ebx, 1 - shl ecx, 1 - shl edx, 1 - mov edi, [rsp + nb410nf_ntia] - add eax, edi - add ebx, edi - add ecx, edi - add edx, edi - - movlps xmm6, [rsi + rax*4] - movlps xmm7, [rsi + rcx*4] - movhps xmm6, [rsi + rbx*4] - movhps xmm7, [rsi + rdx*4] - - movaps xmm4, xmm6 - shufps xmm4, xmm7, 136 ;# 10001000 - shufps xmm6, xmm7, 221 ;# 11011101 - - movd eax, mm0 - movd ebx, mm1 - movd ecx, mm2 - movd edx, mm3 - - movaps [rsp + nb410nf_c6], xmm4 - movaps [rsp + nb410nf_c12], xmm6 - - mov rsi, [rbp + nb410nf_pos] ;# base of pos[] - - lea rax, [rax + rax*2] ;# replace jnr with j3 - lea rbx, [rbx + rbx*2] - - lea rcx, [rcx + rcx*2] ;# replace jnr with j3 - lea rdx, [rdx + rdx*2] - - ;# move four coordinates to xmm0-xmm2 - - movlps xmm4, [rsi + rax*4] - movlps xmm5, [rsi + rcx*4] - movss xmm2, [rsi + rax*4 + 8] - movss xmm6, [rsi + rcx*4 + 8] - - movhps xmm4, [rsi + rbx*4] - movhps xmm5, [rsi + rdx*4] - - movss xmm0, [rsi + rbx*4 + 8] - movss xmm1, [rsi + rdx*4 + 8] - - shufps xmm2, xmm0, 0 - shufps xmm6, xmm1, 0 - - movaps xmm0, xmm4 - movaps xmm1, xmm4 - - shufps xmm2, xmm6, 136 ;# 10001000 - - shufps xmm0, xmm5, 136 ;# 10001000 - shufps xmm1, xmm5, 221 ;# 11011101 - - ;# move ix-iz to xmm4-xmm6 - movaps xmm4, [rsp + nb410nf_ix] - movaps xmm5, [rsp + nb410nf_iy] - movaps xmm6, [rsp + nb410nf_iz] - - ;# calc dr - subps xmm4, xmm0 - subps xmm5, xmm1 - subps xmm6, xmm2 - - ;# square it - mulps xmm4,xmm4 - mulps xmm5,xmm5 - mulps xmm6,xmm6 - addps xmm4, xmm5 - addps xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [rsp + nb410nf_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [rsp + nb410nf_half] - subps xmm1, xmm5 ;# 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - mulps xmm4, [rsp + nb410nf_gbscale] - - movhlps xmm5, xmm4 - cvttps2pi mm6, xmm4 - cvttps2pi mm7, xmm5 ;# mm6/mm7 contain lu indices - cvtpi2ps xmm6, mm6 - cvtpi2ps xmm5, mm7 - movlhps xmm6, xmm5 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - pslld mm6, 2 - pslld mm7, 2 - - movd mm0, eax - movd mm1, ebx - movd mm2, ecx - movd mm3, edx - - mov rsi, [rbp + nb410nf_GBtab] - movd eax, mm6 - psrlq mm6, 32 - movd ecx, mm7 - psrlq mm7, 32 - movd ebx, mm6 - movd edx, mm7 - - ;# load coulomb table - movaps xmm4, [rsi + rax*4] - movaps xmm5, [rsi + rbx*4] - movaps xmm6, [rsi + rcx*4] - movaps xmm7, [rsi + rdx*4] - ;# transpose, using xmm3 for scratch - movaps xmm3, xmm6 - shufps xmm3, xmm7, 0xEE - shufps xmm6, xmm7, 0x44 - movaps xmm7, xmm4 - shufps xmm7, xmm5, 0xEE - shufps xmm4, xmm5, 0x44 - movaps xmm5, xmm4 - shufps xmm5, xmm6, 0xDD - shufps xmm4, xmm6, 0x88 - movaps xmm6, xmm7 - shufps xmm6, xmm3, 0x88 - shufps xmm7, xmm3, 0xDD - ;# coulomb table ready, in xmm4-xmm7 - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - movaps xmm3, [rsp + nb410nf_qq] - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, xmm3 ;# vcoul=qq*VV - ;# update vctot - addps xmm5, [rsp + nb410nf_vctot] - movaps [rsp + nb410nf_vctot], xmm5 - - ;# L-J - movaps xmm4, xmm0 - mulps xmm4, xmm0 ;# xmm4=rinvsq - - movaps xmm6, xmm4 - mulps xmm6, xmm4 - - mulps xmm6, xmm4 ;# xmm6=rinvsix - movaps xmm4, xmm6 - mulps xmm4, xmm4 ;# xmm4=rinvtwelve - mulps xmm6, [rsp + nb410nf_c6] - mulps xmm4, [rsp + nb410nf_c12] - movaps xmm7, [rsp + nb410nf_Vvdwtot] - addps xmm7, xmm4 - subps xmm7, xmm6 - movaps [rsp + nb410nf_Vvdwtot], xmm7 - - ;# should we do one more iteration? - sub dword ptr [rsp + nb410nf_innerk], 4 - jl .nb410nf_finish_inner - jmp .nb410nf_unroll_loop -.nb410nf_finish_inner: - ;# check if at least two particles remain - add dword ptr [rsp + nb410nf_innerk], 4 - mov edx, [rsp + nb410nf_innerk] - and edx, 2 - jnz .nb410nf_dopair - jmp .nb410nf_checksingle -.nb410nf_dopair: - mov rcx, [rsp + nb410nf_innerjjnr] - mov eax, [rcx] - mov ebx, [rcx + 4] - add qword ptr [rsp + nb410nf_innerjjnr], 8 - - xorps xmm2, xmm2 - movaps xmm6, xmm2 - - ;# load isa2 - mov rsi, [rbp + nb410nf_invsqrta] - movss xmm2, [rsi + rax*4] - movss xmm3, [rsi + rbx*4] - unpcklps xmm2, xmm3 ;# isa2 in xmm3(0,1) - mulps xmm2, [rsp + nb410nf_isai] - movaps [rsp + nb410nf_isaprod], xmm2 - movaps xmm1, xmm2 - mulps xmm1, [rsp + nb410nf_gbtsc] - movaps [rsp + nb410nf_gbscale], xmm1 - - mov rsi, [rbp + nb410nf_charge] ;# base of charge[] - movss xmm3, [rsi + rax*4] - movss xmm6, [rsi + rbx*4] - unpcklps xmm3, xmm6 ;# 00001000 ;# xmm3(0,1) has the charges - - mulps xmm2, [rsp + nb410nf_iq] - mulps xmm3, xmm2 - movaps [rsp + nb410nf_qq], xmm3 - - mov rsi, [rbp + nb410nf_type] - mov ecx, eax - mov edx, ebx - mov ecx, [rsi + rcx*4] - mov edx, [rsi + rdx*4] - mov rsi, [rbp + nb410nf_vdwparam] - shl ecx, 1 - shl edx, 1 - mov edi, [rsp + nb410nf_ntia] - add ecx, edi - add edx, edi - movlps xmm6, [rsi + rcx*4] - movhps xmm6, [rsi + rdx*4] - mov rdi, [rbp + nb410nf_pos] - - movaps xmm4, xmm6 - shufps xmm4, xmm4, 8 ;# 00001000 - shufps xmm6, xmm6, 13 ;# 00001101 - movlhps xmm4, xmm7 - movlhps xmm6, xmm7 - - movaps [rsp + nb410nf_c6], xmm4 - movaps [rsp + nb410nf_c12], xmm6 - - lea rax, [rax + rax*2] - lea rbx, [rbx + rbx*2] - ;# move coordinates to xmm0-xmm2 - movlps xmm1, [rdi + rax*4] - movss xmm2, [rdi + rax*4 + 8] - movhps xmm1, [rdi + rbx*4] - movss xmm0, [rdi + rbx*4 + 8] - - movlhps xmm3, xmm7 - - shufps xmm2, xmm0, 0 - - movaps xmm0, xmm1 - - shufps xmm2, xmm2, 136 ;# 10001000 - - shufps xmm0, xmm0, 136 ;# 10001000 - shufps xmm1, xmm1, 221 ;# 11011101 - - mov rdi, [rbp + nb410nf_faction] - ;# move ix-iz to xmm4-xmm6 - xorps xmm7, xmm7 - - movaps xmm4, [rsp + nb410nf_ix] - movaps xmm5, [rsp + nb410nf_iy] - movaps xmm6, [rsp + nb410nf_iz] - - ;# calc dr - subps xmm4, xmm0 - subps xmm5, xmm1 - subps xmm6, xmm2 - - ;# square it - mulps xmm4,xmm4 - mulps xmm5,xmm5 - mulps xmm6,xmm6 - addps xmm4, xmm5 - addps xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [rsp + nb410nf_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [rsp + nb410nf_half] - subps xmm1, xmm5 ;# 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - mulps xmm4, [rsp + nb410nf_gbscale] - - cvttps2pi mm6, xmm4 ;# mm6 contain lu indices - cvtpi2ps xmm6, mm6 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - - pslld mm6, 2 - - mov rsi, [rbp + nb410nf_GBtab] - movd ecx, mm6 - psrlq mm6, 32 - movd edx, mm6 - - ;# load coulomb table - movaps xmm4, [rsi + rcx*4] - movaps xmm7, [rsi + rdx*4] - ;# transpose, using xmm3 for scratch - movaps xmm6, xmm4 - unpcklps xmm4, xmm7 ;# Y1 Y2 F1 F2 - unpckhps xmm6, xmm7 ;# G1 G2 H1 H2 - movhlps xmm5, xmm4 ;# F1 F2 - movhlps xmm7, xmm6 ;# H1 H2 - ;# coulomb table ready, in xmm4-xmm7 - - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - movaps xmm3, [rsp + nb410nf_qq] - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, xmm3 ;# vcoul=qq*VV - - addps xmm5, [rsp + nb410nf_vctot] - movaps [rsp + nb410nf_vctot], xmm5 - - ;# L-J - movaps xmm4, xmm0 - mulps xmm4, xmm0 ;# xmm4=rinvsq - - ;# at this point mm5 contains vcoul and mm3 fijC - ;# increment vcoul - then we can get rid of mm5 - ;# update vctot - - movaps xmm6, xmm4 - mulps xmm6, xmm4 - - mulps xmm6, xmm4 ;# xmm6=rinvsix - movaps xmm4, xmm6 - mulps xmm4, xmm4 ;# xmm4=rinvtwelve - mulps xmm6, [rsp + nb410nf_c6] - mulps xmm4, [rsp + nb410nf_c12] - movaps xmm7, [rsp + nb410nf_Vvdwtot] - addps xmm7, xmm4 - subps xmm7, xmm6 - movaps [rsp + nb410nf_Vvdwtot], xmm7 - -.nb410nf_checksingle: - mov edx, [rsp + nb410nf_innerk] - and edx, 1 - jnz .nb410nf_dosingle - jmp .nb410nf_updateouterdata -.nb410nf_dosingle: - mov rsi, [rbp + nb410nf_charge] - mov rdx, [rbp + nb410nf_invsqrta] - mov rdi, [rbp + nb410nf_pos] - mov rcx, [rsp + nb410nf_innerjjnr] - mov eax, [rcx] - xorps xmm2, xmm2 - movaps xmm6, xmm2 - movss xmm2, [rdx + rax*4] ;# isa2 - mulss xmm2, [rsp + nb410nf_isai] - movss [rsp + nb410nf_isaprod], xmm2 - movss xmm1, xmm2 - mulss xmm1, [rsp + nb410nf_gbtsc] - movss [rsp + nb410nf_gbscale], xmm1 - - mulss xmm2, [rsp + nb410nf_iq] - movss xmm6, [rsi + rax*4] ;# xmm6(0) has the charge - mulss xmm6, xmm2 - movss [rsp + nb410nf_qq], xmm6 - - mov rsi, [rbp + nb410nf_type] - mov ecx, eax - mov ecx, [rsi + rcx*4] - mov rsi, [rbp + nb410nf_vdwparam] - shl ecx, 1 - add ecx, [rsp + nb410nf_ntia] - movlps xmm6, [rsi + rcx*4] - movaps xmm4, xmm6 - shufps xmm4, xmm4, 252 ;# 11111100 - shufps xmm6, xmm6, 253 ;# 11111101 - - movaps [rsp + nb410nf_c6], xmm4 - movaps [rsp + nb410nf_c12], xmm6 - - lea rax, [rax + rax*2] - - ;# move coordinates to xmm0-xmm2 - movss xmm0, [rdi + rax*4] - movss xmm1, [rdi + rax*4 + 4] - movss xmm2, [rdi + rax*4 + 8] - - movaps xmm4, [rsp + nb410nf_ix] - movaps xmm5, [rsp + nb410nf_iy] - movaps xmm6, [rsp + nb410nf_iz] - - ;# calc dr - subss xmm4, xmm0 - subss xmm5, xmm1 - subss xmm6, xmm2 - - ;# square it - mulss xmm4,xmm4 - mulss xmm5,xmm5 - mulss xmm6,xmm6 - addss xmm4, xmm5 - addss xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtss xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulss xmm5, xmm5 - movss xmm1, [rsp + nb410nf_three] - mulss xmm5, xmm4 ;# rsq*lu*lu - movss xmm0, [rsp + nb410nf_half] - subss xmm1, xmm5 ;# 30-rsq*lu*lu - mulss xmm1, xmm2 - mulss xmm0, xmm1 ;# xmm0=rinv - - mulss xmm4, xmm0 ;# xmm4=r - mulss xmm4, [rsp + nb410nf_gbscale] - - cvttss2si ebx, xmm4 ;# mm6 contain lu indices - cvtsi2ss xmm6, ebx - subss xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulss xmm2, xmm2 ;# xmm2=eps2 - - shl ebx, 2 - mov rsi, [rbp + nb410nf_GBtab] - - movaps xmm4, [rsi + rbx*4] - movhlps xmm6, xmm4 - movaps xmm5, xmm4 - movaps xmm7, xmm6 - shufps xmm5, xmm5, 1 - shufps xmm7, xmm7, 1 - ;# table ready in xmm4-xmm7 - - mulss xmm6, xmm1 ;# xmm6=Geps - mulss xmm7, xmm2 ;# xmm7=Heps2 - addss xmm5, xmm6 - addss xmm5, xmm7 ;# xmm5=Fp - movss xmm3, [rsp + nb410nf_qq] - mulss xmm5, xmm1 ;# xmm5=eps*Fp - addss xmm5, xmm4 ;# xmm5=VV - mulss xmm5, xmm3 ;# vcoul=qq*VV - addss xmm5, [rsp + nb410nf_vctot] - movss [rsp + nb410nf_vctot], xmm5 - - ;# L-J - movaps xmm4, xmm0 - mulss xmm4, xmm0 ;# xmm4=rinvsq - - movaps xmm6, xmm4 - mulss xmm6, xmm4 - - mulss xmm6, xmm4 ;# xmm6=rinvsix - movaps xmm4, xmm6 - mulss xmm4, xmm4 ;# xmm4=rinvtwelve - mulss xmm6, [rsp + nb410nf_c6] - mulss xmm4, [rsp + nb410nf_c12] - movss xmm7, [rsp + nb410nf_Vvdwtot] - addps xmm7, xmm4 - subps xmm7, xmm6 - movss [rsp + nb410nf_Vvdwtot], xmm7 - -.nb410nf_updateouterdata: - ;# get n from stack - mov esi, [rsp + nb410nf_n] - ;# get group index for i particle - mov rdx, [rbp + nb410nf_gid] ;# base of gid[] - mov edx, [rdx + rsi*4] ;# ggid=gid[n] - - ;# accumulate total potential energy and update it - movaps xmm7, [rsp + nb410nf_vctot] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - ;# add earlier value from mem - mov rax, [rbp + nb410nf_Vc] - addss xmm7, [rax + rdx*4] - ;# move back to mem - movss [rax + rdx*4], xmm7 - - ;# accumulate total lj energy and update it - movaps xmm7, [rsp + nb410nf_Vvdwtot] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - ;# add earlier value from mem - mov rax, [rbp + nb410nf_Vvdw] - addss xmm7, [rax + rdx*4] - ;# move back to mem - movss [rax + rdx*4], xmm7 - - ;# finish if last - mov ecx, [rsp + nb410nf_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb410nf_outerend - - ;# not last, iterate outer loop once more! - mov [rsp + nb410nf_n], esi - jmp .nb410nf_outer -.nb410nf_outerend: - ;# check if more outer neighborlists remain - mov ecx, [rsp + nb410nf_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb410nf_end - ;# non-zero, do one more workunit - jmp .nb410nf_threadloop -.nb410nf_end: - - mov eax, [rsp + nb410nf_nouter] - mov ebx, [rsp + nb410nf_ninner] - mov rcx, [rbp + nb410nf_outeriter] - mov rdx, [rbp + nb410nf_inneriter] - mov [rcx], eax - mov [rdx], ebx - - add rsp, 360 - emms - - - pop r15 - pop r14 - pop r13 - pop r12 - - pop rbx - pop rbp - ret - - - diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel410_x86_64_sse.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel410_x86_64_sse.s deleted file mode 100644 index f1953c7bf6..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel410_x86_64_sse.s +++ /dev/null @@ -1,1985 +0,0 @@ -## -## -## Gromacs 4.0 Copyright (c) 1991-2003 -## David van der Spoel, Erik Lindahl -## -## This program is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License -## as published by the Free Software Foundation; either version 2 -## of the License, or (at your option) any later version. -## -## To help us fund GROMACS development, we humbly ask that you cite -## the research papers on the package. Check out http://www.gromacs.org -## -## And Hey: -## Gnomes, ROck Monsters And Chili Sauce -## - - - - - - -.globl nb_kernel410_x86_64_sse -.globl _nb_kernel410_x86_64_sse -nb_kernel410_x86_64_sse: -_nb_kernel410_x86_64_sse: -## Room for return address and rbp (16 bytes) -.set nb410_fshift, 16 -.set nb410_gid, 24 -.set nb410_pos, 32 -.set nb410_faction, 40 -.set nb410_charge, 48 -.set nb410_p_facel, 56 -.set nb410_argkrf, 64 -.set nb410_argcrf, 72 -.set nb410_Vc, 80 -.set nb410_type, 88 -.set nb410_p_ntype, 96 -.set nb410_vdwparam, 104 -.set nb410_Vvdw, 112 -.set nb410_p_tabscale, 120 -.set nb410_VFtab, 128 -.set nb410_invsqrta, 136 -.set nb410_dvda, 144 -.set nb410_p_gbtabscale, 152 -.set nb410_GBtab, 160 -.set nb410_p_nthreads, 168 -.set nb410_count, 176 -.set nb410_mtx, 184 -.set nb410_outeriter, 192 -.set nb410_inneriter, 200 -.set nb410_work, 208 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse use -.set nb410_ix, 0 -.set nb410_iy, 16 -.set nb410_iz, 32 -.set nb410_iq, 48 -.set nb410_dx, 64 -.set nb410_dy, 80 -.set nb410_dz, 96 -.set nb410_two, 112 -.set nb410_six, 128 -.set nb410_twelve, 144 -.set nb410_gbtsc, 160 -.set nb410_qq, 176 -.set nb410_c6, 192 -.set nb410_c12, 208 -.set nb410_fscal, 224 -.set nb410_vctot, 240 -.set nb410_Vvdwtot, 256 -.set nb410_fix, 272 -.set nb410_fiy, 288 -.set nb410_fiz, 304 -.set nb410_half, 320 -.set nb410_three, 336 -.set nb410_r, 352 -.set nb410_isai, 368 -.set nb410_isaprod, 384 -.set nb410_dvdasum, 400 -.set nb410_gbscale, 416 -.set nb410_nri, 432 -.set nb410_iinr, 440 -.set nb410_jindex, 448 -.set nb410_jjnr, 456 -.set nb410_shift, 464 -.set nb410_shiftvec, 472 -.set nb410_facel, 480 -.set nb410_innerjjnr, 488 -.set nb410_is3, 496 -.set nb410_ii3, 500 -.set nb410_ii, 504 -.set nb410_ntia, 508 -.set nb410_innerk, 512 -.set nb410_n, 516 -.set nb410_nn1, 520 -.set nb410_ntype, 524 -.set nb410_nouter, 528 -.set nb410_ninner, 532 -.set nb410_jnra, 536 -.set nb410_jnrb, 540 -.set nb410_jnrc, 544 -.set nb410_jnrd, 548 - - push %rbp - movq %rsp,%rbp - push %rbx - - - emms - - push %r12 - push %r13 - push %r14 - push %r15 - - subq $568,%rsp ## local variable stack space (n*16+8) - - ## zero 32-bit iteration counters - movl $0,%eax - movl %eax,nb410_nouter(%rsp) - movl %eax,nb410_ninner(%rsp) - - movl (%rdi),%edi - movl %edi,nb410_nri(%rsp) - movq %rsi,nb410_iinr(%rsp) - movq %rdx,nb410_jindex(%rsp) - movq %rcx,nb410_jjnr(%rsp) - movq %r8,nb410_shift(%rsp) - movq %r9,nb410_shiftvec(%rsp) - movq nb410_p_ntype(%rbp),%rdi - movl (%rdi),%edi - movl %edi,nb410_ntype(%rsp) - movq nb410_p_facel(%rbp),%rsi - movss (%rsi),%xmm0 - movss %xmm0,nb410_facel(%rsp) - - movq nb410_p_gbtabscale(%rbp),%rbx - movss (%rbx),%xmm4 - shufps $0,%xmm4,%xmm4 - movaps %xmm4,nb410_gbtsc(%rsp) - - - ## create constant floating-point factors on stack - movl $0x3f000000,%eax ## half in IEEE (hex) - movl %eax,nb410_half(%rsp) - movss nb410_half(%rsp),%xmm1 - shufps $0,%xmm1,%xmm1 ## splat to all elements - movaps %xmm1,%xmm2 - addps %xmm2,%xmm2 ## one - movaps %xmm2,%xmm3 - addps %xmm2,%xmm2 ## two - addps %xmm2,%xmm3 ## three - movaps %xmm3,%xmm4 - addps %xmm4,%xmm4 ## six - movaps %xmm4,%xmm5 - addps %xmm5,%xmm5 ## twelve - movaps %xmm1,nb410_half(%rsp) - movaps %xmm2,nb410_two(%rsp) - movaps %xmm3,nb410_three(%rsp) - movaps %xmm4,nb410_six(%rsp) - movaps %xmm5,nb410_twelve(%rsp) - -_nb_kernel410_x86_64_sse.nb410_threadloop: - movq nb410_count(%rbp),%rsi ## pointer to sync counter - movl (%rsi),%eax -_nb_kernel410_x86_64_sse.nb410_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%rsi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel410_x86_64_sse.nb410_spinlock - - ## if(nn1>nri) nn1=nri - movl nb410_nri(%rsp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb410_n(%rsp) - movl %ebx,nb410_nn1(%rsp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel410_x86_64_sse.nb410_outerstart - jmp _nb_kernel410_x86_64_sse.nb410_end - -_nb_kernel410_x86_64_sse.nb410_outerstart: - ## ebx contains number of outer iterations - addl nb410_nouter(%rsp),%ebx - movl %ebx,nb410_nouter(%rsp) - -_nb_kernel410_x86_64_sse.nb410_outer: - movq nb410_shift(%rsp),%rax ## rax = pointer into shift[] - movl (%rax,%rsi,4),%ebx ## ebx=shift[n] - - lea (%rbx,%rbx,2),%rbx ## rbx=3*is - movl %ebx,nb410_is3(%rsp) ## store is3 - - movq nb410_shiftvec(%rsp),%rax ## rax = base of shiftvec[] - - movss (%rax,%rbx,4),%xmm0 - movss 4(%rax,%rbx,4),%xmm1 - movss 8(%rax,%rbx,4),%xmm2 - - movq nb410_iinr(%rsp),%rcx ## rcx = pointer into iinr[] - movl (%rcx,%rsi,4),%ebx ## ebx =ii - movl %ebx,nb410_ii(%rsp) - - movq nb410_charge(%rbp),%rdx - movss (%rdx,%rbx,4),%xmm3 - mulss nb410_facel(%rsp),%xmm3 - shufps $0,%xmm3,%xmm3 - - movq nb410_invsqrta(%rbp),%rdx ## load invsqrta[ii] - movss (%rdx,%rbx,4),%xmm4 - shufps $0,%xmm4,%xmm4 - - movq nb410_type(%rbp),%rdx - movl (%rdx,%rbx,4),%edx - imull nb410_ntype(%rsp),%edx - shll %edx - movl %edx,nb410_ntia(%rsp) - - lea (%rbx,%rbx,2),%rbx ## rbx = 3*ii=ii3 - movq nb410_pos(%rbp),%rax ## rax = base of pos[] - - addss (%rax,%rbx,4),%xmm0 - addss 4(%rax,%rbx,4),%xmm1 - addss 8(%rax,%rbx,4),%xmm2 - - movaps %xmm3,nb410_iq(%rsp) - movaps %xmm4,nb410_isai(%rsp) - - shufps $0,%xmm0,%xmm0 - shufps $0,%xmm1,%xmm1 - shufps $0,%xmm2,%xmm2 - - movaps %xmm0,nb410_ix(%rsp) - movaps %xmm1,nb410_iy(%rsp) - movaps %xmm2,nb410_iz(%rsp) - - movl %ebx,nb410_ii3(%rsp) - - ## clear vctot and i forces - xorps %xmm13,%xmm13 - movaps %xmm13,%xmm12 - movaps %xmm13,nb410_Vvdwtot(%rsp) - movaps %xmm13,nb410_dvdasum(%rsp) - movaps %xmm13,%xmm14 - movaps %xmm13,%xmm15 - - movq nb410_jindex(%rsp),%rax - movl (%rax,%rsi,4),%ecx ## jindex[n] - movl 4(%rax,%rsi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movq nb410_pos(%rbp),%rsi - movq nb410_faction(%rbp),%rdi - movq nb410_jjnr(%rsp),%rax - shll $2,%ecx - addq %rcx,%rax - movq %rax,nb410_innerjjnr(%rsp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $4,%edx - addl nb410_ninner(%rsp),%ecx - movl %ecx,nb410_ninner(%rsp) - addl $0,%edx - movl %edx,nb410_innerk(%rsp) ## number of innerloop atoms - jge _nb_kernel410_x86_64_sse.nb410_unroll_loop - jmp _nb_kernel410_x86_64_sse.nb410_finish_inner -_nb_kernel410_x86_64_sse.nb410_unroll_loop: - ## quad-unroll innerloop here - movq nb410_innerjjnr(%rsp),%rdx ## pointer to jjnr[k] - movl (%rdx),%eax - movl 4(%rdx),%ebx - movl 8(%rdx),%ecx - movl 12(%rdx),%edx ## eax-edx=jnr1-4 - - addq $16,nb410_innerjjnr(%rsp) ## advance pointer (unrolled 4) - - ## load isaj - movq nb410_invsqrta(%rbp),%rsi - movss (%rsi,%rax,4),%xmm3 - movss (%rsi,%rcx,4),%xmm4 - movss (%rsi,%rbx,4),%xmm6 - movss (%rsi,%rdx,4),%xmm7 - movaps nb410_isai(%rsp),%xmm2 - shufps $0,%xmm6,%xmm3 - shufps $0,%xmm7,%xmm4 - shufps $136,%xmm4,%xmm3 ## 10001000 ;# all isaj in xmm3 - mulps %xmm3,%xmm2 - - movaps %xmm2,nb410_isaprod(%rsp) - movaps %xmm2,%xmm1 - mulps nb410_gbtsc(%rsp),%xmm1 - movaps %xmm1,nb410_gbscale(%rsp) - - movq nb410_charge(%rbp),%rsi ## base of charge[] - - movss (%rsi,%rax,4),%xmm3 - movss (%rsi,%rcx,4),%xmm4 - movss (%rsi,%rbx,4),%xmm6 - movss (%rsi,%rdx,4),%xmm7 - - mulps nb410_iq(%rsp),%xmm2 - shufps $0,%xmm6,%xmm3 - shufps $0,%xmm7,%xmm4 - shufps $136,%xmm4,%xmm3 ## 10001000 ;# all charges in xmm3 - mulps %xmm2,%xmm3 - movaps %xmm3,nb410_qq(%rsp) - - ## vdw parameters - movq nb410_type(%rbp),%rsi - movl (%rsi,%rax,4),%r12d - movl (%rsi,%rbx,4),%r13d - movl (%rsi,%rcx,4),%r14d - movl (%rsi,%rdx,4),%r15d - shll %r12d - shll %r13d - shll %r14d - shll %r15d - movl nb410_ntia(%rsp),%edi - addl %edi,%r12d - addl %edi,%r13d - addl %edi,%r14d - addl %edi,%r15d - - movq nb410_vdwparam(%rbp),%rsi - movlps (%rsi,%r12,4),%xmm3 - movlps (%rsi,%r14,4),%xmm7 - movhps (%rsi,%r13,4),%xmm3 - movhps (%rsi,%r15,4),%xmm7 - - movaps %xmm3,%xmm0 - shufps $136,%xmm7,%xmm0 ## 10001000 - shufps $221,%xmm7,%xmm3 ## 11011101 - - movaps %xmm0,nb410_c6(%rsp) - movaps %xmm3,nb410_c12(%rsp) - - movq nb410_pos(%rbp),%rsi ## base of pos[] - - lea (%rax,%rax,2),%r8 ## jnr - lea (%rbx,%rbx,2),%r9 - lea (%rcx,%rcx,2),%r10 - lea (%rdx,%rdx,2),%r11 - - ## move four coordinates to xmm0-xmm2 - movlps (%rsi,%r8,4),%xmm4 - movlps (%rsi,%r10,4),%xmm5 - movss 8(%rsi,%r8,4),%xmm2 - movss 8(%rsi,%r10,4),%xmm6 - - movhps (%rsi,%r9,4),%xmm4 - movhps (%rsi,%r11,4),%xmm5 - - movss 8(%rsi,%r9,4),%xmm0 - movss 8(%rsi,%r11,4),%xmm1 - - shufps $0,%xmm0,%xmm2 - shufps $0,%xmm1,%xmm6 - - movaps %xmm4,%xmm0 - movaps %xmm4,%xmm1 - - shufps $136,%xmm6,%xmm2 ## 10001000 - - shufps $136,%xmm5,%xmm0 ## 10001000 - shufps $221,%xmm5,%xmm1 ## 11011101 - - ## calc dr - subps nb410_ix(%rsp),%xmm0 - subps nb410_iy(%rsp),%xmm1 - subps nb410_iz(%rsp),%xmm2 - - ## store dr - movaps %xmm0,nb410_dx(%rsp) - movaps %xmm1,nb410_dy(%rsp) - movaps %xmm2,nb410_dz(%rsp) - - ## square it - mulps %xmm0,%xmm0 - mulps %xmm1,%xmm1 - mulps %xmm2,%xmm2 - addps %xmm1,%xmm0 - addps %xmm2,%xmm0 - movaps %xmm0,%xmm4 - ## rsq in xmm4 - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb410_three(%rsp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb410_half(%rsp),%xmm0 - subps %xmm5,%xmm1 ## 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - movaps %xmm4,nb410_r(%rsp) - mulps nb410_gbscale(%rsp),%xmm4 - - ## truncate and convert to integers - cvttps2dq %xmm4,%xmm5 - - ## convert back to float - cvtdq2ps %xmm5,%xmm6 - - ## multiply by 4 - pslld $2,%xmm5 - - ## move to integer registers - movhlps %xmm5,%xmm7 - movd %xmm5,%r12d - movd %xmm7,%r14d - pshufd $1,%xmm5,%xmm5 - pshufd $1,%xmm7,%xmm7 - movd %xmm5,%r13d - movd %xmm7,%r15d - - ## calculate eps - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ##eps - - movq nb410_GBtab(%rbp),%rsi - - movaps %xmm0,%xmm9 ## rinv - mulps %xmm9,%xmm9 ## rinvsq - movaps %xmm9,%xmm10 ## rinvsq - mulps %xmm10,%xmm10 ## rinv4 - mulps %xmm9,%xmm10 ## rinv6 - movaps %xmm10,%xmm11 - mulps %xmm11,%xmm11 ## rinv12 - - ## load table data - movlps (%rsi,%r12,4),%xmm5 - movlps (%rsi,%r14,4),%xmm7 - movhps (%rsi,%r13,4),%xmm5 - movhps (%rsi,%r15,4),%xmm7 - - movaps %xmm5,%xmm4 - shufps $136,%xmm7,%xmm4 ## 10001000 - shufps $221,%xmm7,%xmm5 ## 11011101 - - mulps nb410_c6(%rsp),%xmm10 ## vvdw6=c6*rinv6 - mulps nb410_c12(%rsp),%xmm11 ## vvdw12=c12*rinv12 - - movaps %xmm11,%xmm9 - subps %xmm10,%xmm11 ## Vvdw=Vvdw12-Vvdw6 - - ## add potential to vvdwtot - addps nb410_Vvdwtot(%rsp),%xmm11 - movaps %xmm11,nb410_Vvdwtot(%rsp) - - movlps 8(%rsi,%r12,4),%xmm7 - movlps 8(%rsi,%r14,4),%xmm8 - movhps 8(%rsi,%r13,4),%xmm7 - movhps 8(%rsi,%r15,4),%xmm8 - - movaps %xmm7,%xmm6 - - shufps $136,%xmm8,%xmm6 ## 10001000 - shufps $221,%xmm8,%xmm7 ## 11011101 - ## table data ready in xmm4-xmm7 - - mulps %xmm1,%xmm7 ## Heps - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm1,%xmm7 ## Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - addps %xmm7,%xmm7 ## two*Heps2 - movaps nb410_qq(%rsp),%xmm3 - addps %xmm6,%xmm7 - addps %xmm5,%xmm7 ## xmm7=FF - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps %xmm3,%xmm5 ## vcoul=qq*VV - mulps %xmm7,%xmm3 ## fijC=FF*qq - ## at this point xmm5 contains vcoul and xmm3 fijC - - ## LJ forces - mulps nb410_six(%rsp),%xmm10 - mulps nb410_twelve(%rsp),%xmm9 - subps %xmm10,%xmm9 - mulps %xmm0,%xmm9 ## (12*vnb12-6*vnb6)*rinv - - movq nb410_dvda(%rbp),%rsi - - ## Calculate dVda - xorps %xmm7,%xmm7 - mulps nb410_gbscale(%rsp),%xmm3 - movaps %xmm3,%xmm6 - mulps nb410_r(%rsp),%xmm6 - addps %xmm5,%xmm6 - - ## increment vctot (sum in xmm12) - addps %xmm5,%xmm12 - - ## xmm6=(vcoul+fijC*r) - subps %xmm6,%xmm7 - movaps %xmm7,%xmm6 - - ## update dvdasum - addps nb410_dvdasum(%rsp),%xmm7 - movaps %xmm7,nb410_dvdasum(%rsp) - - ## update j atoms dvdaj - movhlps %xmm6,%xmm7 - movaps %xmm6,%xmm5 - movaps %xmm7,%xmm4 - shufps $0x1,%xmm5,%xmm5 - shufps $0x1,%xmm4,%xmm4 - - ## xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4 - addss (%rsi,%rax,4),%xmm6 - addss (%rsi,%rbx,4),%xmm5 - addss (%rsi,%rcx,4),%xmm7 - addss (%rsi,%rdx,4),%xmm4 - movss %xmm6,(%rsi,%rax,4) - movss %xmm5,(%rsi,%rbx,4) - movss %xmm7,(%rsi,%rcx,4) - movss %xmm4,(%rsi,%rdx,4) - - subps %xmm3,%xmm9 - mulps %xmm0,%xmm9 ## fscal - - movaps %xmm9,%xmm10 - movaps %xmm9,%xmm11 - - mulps nb410_dx(%rsp),%xmm9 - mulps nb410_dy(%rsp),%xmm10 - mulps nb410_dz(%rsp),%xmm11 - - ## accumulate i forces - addps %xmm9,%xmm13 - addps %xmm10,%xmm14 - addps %xmm11,%xmm15 - - movq nb410_faction(%rbp),%rsi - ## the fj's - start by accumulating x & y forces from memory - movlps (%rsi,%r8,4),%xmm0 ## x1 y1 - - - movlps (%rsi,%r10,4),%xmm1 ## x3 y3 - - - movhps (%rsi,%r9,4),%xmm0 ## x1 y1 x2 y2 - movhps (%rsi,%r11,4),%xmm1 ## x3 y3 x4 y4 - - movaps %xmm9,%xmm8 - unpcklps %xmm10,%xmm9 ## x1 y1 x2 y2 - unpckhps %xmm10,%xmm8 ## x3 y3 x4 y4 - - ## update fjx and fjy - addps %xmm9,%xmm0 - addps %xmm8,%xmm1 - - movlps %xmm0,(%rsi,%r8,4) - movlps %xmm1,(%rsi,%r10,4) - movhps %xmm0,(%rsi,%r9,4) - movhps %xmm1,(%rsi,%r11,4) - - ## xmm11: fjz1 fjz2 fjz3 fjz4 - pshufd $1,%xmm11,%xmm10 ## fjz2 - - - - movhlps %xmm11,%xmm9 ## fjz3 - - - - pshufd $3,%xmm11,%xmm8 ## fjz4 - - - - - addss 8(%rsi,%r8,4),%xmm11 - addss 8(%rsi,%r9,4),%xmm10 - addss 8(%rsi,%r10,4),%xmm9 - addss 8(%rsi,%r11,4),%xmm8 - movss %xmm11,8(%rsi,%r8,4) - movss %xmm10,8(%rsi,%r9,4) - movss %xmm9,8(%rsi,%r10,4) - movss %xmm8,8(%rsi,%r11,4) - - ## should we do one more iteration? - subl $4,nb410_innerk(%rsp) - jl _nb_kernel410_x86_64_sse.nb410_finish_inner - jmp _nb_kernel410_x86_64_sse.nb410_unroll_loop -_nb_kernel410_x86_64_sse.nb410_finish_inner: - ## check if at least two particles remain - addl $4,nb410_innerk(%rsp) - movl nb410_innerk(%rsp),%edx - andl $2,%edx - jnz _nb_kernel410_x86_64_sse.nb410_dopair - jmp _nb_kernel410_x86_64_sse.nb410_checksingle -_nb_kernel410_x86_64_sse.nb410_dopair: - movq nb410_innerjjnr(%rsp),%rcx - - movl (%rcx),%eax - movl 4(%rcx),%ebx - addq $8,nb410_innerjjnr(%rsp) - - ## load isaj - movq nb410_invsqrta(%rbp),%rsi - movss (%rsi,%rax,4),%xmm2 - movss (%rsi,%rbx,4),%xmm6 - unpcklps %xmm6,%xmm2 - - mulps nb410_isai(%rsp),%xmm2 - - movaps %xmm2,nb410_isaprod(%rsp) - movaps %xmm2,%xmm1 - mulps nb410_gbtsc(%rsp),%xmm1 - movaps %xmm1,nb410_gbscale(%rsp) - - mulps nb410_iq(%rsp),%xmm2 - movq nb410_charge(%rbp),%rsi ## base of charge[] - movss (%rsi,%rax,4),%xmm3 - movss (%rsi,%rbx,4),%xmm6 - unpcklps %xmm6,%xmm3 - - - mulps %xmm2,%xmm3 - movaps %xmm3,nb410_qq(%rsp) - - ## vdw parameters - movq nb410_type(%rbp),%rsi - movl (%rsi,%rax,4),%r12d - movl (%rsi,%rbx,4),%r13d - shll %r12d - shll %r13d - movl nb410_ntia(%rsp),%edi - addl %edi,%r12d - addl %edi,%r13d - - movq nb410_vdwparam(%rbp),%rsi - movlps (%rsi,%r12,4),%xmm3 - movhps (%rsi,%r13,4),%xmm3 - - xorps %xmm7,%xmm7 - movaps %xmm3,%xmm0 - shufps $136,%xmm7,%xmm0 ## 10001000 - shufps $221,%xmm7,%xmm3 ## 11011101 - - movaps %xmm0,nb410_c6(%rsp) - movaps %xmm3,nb410_c12(%rsp) - - movq nb410_pos(%rbp),%rsi ## base of pos[] - - lea (%rax,%rax,2),%r8 ## j3 - lea (%rbx,%rbx,2),%r9 - - ## move four coordinates to xmm0-xmm2 - movlps (%rsi,%r8,4),%xmm4 ## x1 y1 - - - movlps (%rsi,%r9,4),%xmm5 ## x2 y2 - - - - movss 8(%rsi,%r8,4),%xmm6 ## z1 - - - - movss 8(%rsi,%r9,4),%xmm7 ## z2 - - - - - unpcklps %xmm5,%xmm4 ## x1 x2 y1 y2 - movhlps %xmm4,%xmm5 ## y1 y2 - - - unpcklps %xmm7,%xmm6 ## z1 z2 - - - - ## calc dr - subps nb410_ix(%rsp),%xmm4 - subps nb410_iy(%rsp),%xmm5 - subps nb410_iz(%rsp),%xmm6 - - ## store dr - movaps %xmm4,nb410_dx(%rsp) - movaps %xmm5,nb410_dy(%rsp) - movaps %xmm6,nb410_dz(%rsp) - - ## square it - mulps %xmm4,%xmm4 - mulps %xmm5,%xmm5 - mulps %xmm6,%xmm6 - addps %xmm5,%xmm4 - addps %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb410_three(%rsp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb410_half(%rsp),%xmm0 - subps %xmm5,%xmm1 ## 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - movaps %xmm4,nb410_r(%rsp) - mulps nb410_gbscale(%rsp),%xmm4 - - ## truncate and convert to integers - cvttps2dq %xmm4,%xmm5 - - ## convert back to float - cvtdq2ps %xmm5,%xmm6 - - ## multiply by 4 - pslld $2,%xmm5 - - ## move to integer registers - movd %xmm5,%r12d - pshufd $1,%xmm5,%xmm5 - movd %xmm5,%r13d - - ## calculate eps - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ##eps - - movq nb410_GBtab(%rbp),%rsi - - movaps %xmm0,%xmm9 ## rinv - mulps %xmm9,%xmm9 ## rinvsq - movaps %xmm9,%xmm10 ## rinvsq - mulps %xmm10,%xmm10 ## rinv4 - mulps %xmm9,%xmm10 ## rinv6 - movaps %xmm10,%xmm11 - mulps %xmm11,%xmm11 ## rinv12 - - ## load table data - movlps (%rsi,%r12,4),%xmm4 ## Y1 F1 - movlps (%rsi,%r13,4),%xmm5 ## Y2 F2 - unpcklps %xmm5,%xmm4 ## Y1 Y2 F1 F2 - movhlps %xmm4,%xmm5 ## F1 F2 - - mulps nb410_c6(%rsp),%xmm10 ## vvdw6=c6*rinv6 - mulps nb410_c12(%rsp),%xmm11 ## vvdw12=c12*rinv12 - - movaps %xmm11,%xmm9 - subps %xmm10,%xmm11 ## Vvdw=Vvdw12-Vvdw6 - - ## add potential to vvdwtot - addps nb410_Vvdwtot(%rsp),%xmm11 - movlps %xmm11,nb410_Vvdwtot(%rsp) - - movlps 8(%rsi,%r12,4),%xmm6 ## G1 H1 - movlps 8(%rsi,%r13,4),%xmm7 ## G2 H2 - unpcklps %xmm7,%xmm6 ## G1 G2 - movhlps %xmm6,%xmm7 ## H1 H2 - ## table data ready in xmm4-xmm7 - - mulps %xmm1,%xmm7 ## Heps - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm1,%xmm7 ## Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - addps %xmm7,%xmm7 ## two*Heps2 - movaps nb410_qq(%rsp),%xmm3 - - addps %xmm6,%xmm7 - addps %xmm5,%xmm7 ## xmm7=FF - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps %xmm3,%xmm5 ## vcoul=qq*VV - mulps %xmm7,%xmm3 ## fijC=FF*qq - ## at this point xmm5 contains vcoul and xmm3 fijC - - ## LJ forces - mulps nb410_six(%rsp),%xmm10 - mulps nb410_twelve(%rsp),%xmm9 - subps %xmm10,%xmm9 - mulps %xmm0,%xmm9 ## (12*vnb12-6*vnb6)*rinv - - ## zero upper part of vcoul - xorps %xmm2,%xmm2 - movlhps %xmm2,%xmm5 - - movq nb410_dvda(%rbp),%rsi - - ## Calculate dVda - xorps %xmm7,%xmm7 - mulps nb410_gbscale(%rsp),%xmm3 - movaps %xmm3,%xmm6 - mulps nb410_r(%rsp),%xmm6 - addps %xmm5,%xmm6 - - xorps %xmm4,%xmm4 - ## increment vctot (sum in xmm12) - addps %xmm5,%xmm12 - - ## xmm6=(vcoul+fijC*r) - subps %xmm6,%xmm7 - movaps %xmm7,%xmm6 - - ## zero upper half of dvda - movlhps %xmm4,%xmm7 - - ## update dvdasum - addps nb410_dvdasum(%rsp),%xmm7 - movaps %xmm7,nb410_dvdasum(%rsp) - - ## update j atoms dvdaj - movaps %xmm6,%xmm5 - shufps $0x1,%xmm5,%xmm5 - - ## xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4 - addss (%rsi,%rax,4),%xmm6 - addss (%rsi,%rbx,4),%xmm5 - movss %xmm6,(%rsi,%rax,4) - movss %xmm5,(%rsi,%rbx,4) - - xorps %xmm7,%xmm7 - - subps %xmm3,%xmm9 - mulps %xmm0,%xmm9 ## fscal - - movaps %xmm9,%xmm10 - movaps %xmm9,%xmm11 - - mulps nb410_dx(%rsp),%xmm9 - mulps nb410_dy(%rsp),%xmm10 - mulps nb410_dz(%rsp),%xmm11 - - movlhps %xmm7,%xmm9 - movlhps %xmm7,%xmm10 - movlhps %xmm7,%xmm11 - - ## accumulate i forces - addps %xmm9,%xmm13 - addps %xmm10,%xmm14 - addps %xmm11,%xmm15 - - movq nb410_faction(%rbp),%rsi - ## the fj's - start by accumulating x & y forces from memory - movlps (%rsi,%r8,4),%xmm0 ## x1 y1 - - - movhps (%rsi,%r9,4),%xmm0 ## x1 y1 x2 y2 - - unpcklps %xmm10,%xmm9 ## x1 y1 x2 y2 - addps %xmm9,%xmm0 - - movlps %xmm0,(%rsi,%r8,4) - movhps %xmm0,(%rsi,%r9,4) - - ## z forces - pshufd $1,%xmm11,%xmm8 - addss 8(%rsi,%r8,4),%xmm11 - addss 8(%rsi,%r9,4),%xmm8 - movss %xmm11,8(%rsi,%r8,4) - movss %xmm8,8(%rsi,%r9,4) - -_nb_kernel410_x86_64_sse.nb410_checksingle: - movl nb410_innerk(%rsp),%edx - andl $1,%edx - jnz _nb_kernel410_x86_64_sse.nb410_dosingle - jmp _nb_kernel410_x86_64_sse.nb410_updateouterdata -_nb_kernel410_x86_64_sse.nb410_dosingle: - movq nb410_charge(%rbp),%rsi - movq nb410_invsqrta(%rbp),%rdx - movq nb410_pos(%rbp),%rdi - movq nb410_innerjjnr(%rsp),%rcx - movl (%rcx),%eax - - ## load isaj - movq nb410_invsqrta(%rbp),%rsi - movss (%rsi,%rax,4),%xmm3 - movaps nb410_isai(%rsp),%xmm2 - mulss %xmm3,%xmm2 - - movss %xmm2,nb410_isaprod(%rsp) - movaps %xmm2,%xmm1 - mulss nb410_gbtsc(%rsp),%xmm1 - movss %xmm1,nb410_gbscale(%rsp) - - mulss nb410_iq(%rsp),%xmm2 - movq nb410_charge(%rbp),%rsi ## base of charge[] - - movss (%rsi,%rax,4),%xmm3 - mulss %xmm2,%xmm3 - movss %xmm3,nb410_qq(%rsp) - - ## vdw parameters - movq nb410_type(%rbp),%rsi - movl (%rsi,%rax,4),%r12d - shll %r12d - movl nb410_ntia(%rsp),%edi - addl %edi,%r12d - - movq nb410_vdwparam(%rbp),%rsi - movss (%rsi,%r12,4),%xmm0 - movss 4(%rsi,%r12,4),%xmm3 - movaps %xmm0,nb410_c6(%rsp) - movaps %xmm3,nb410_c12(%rsp) - - movq nb410_pos(%rbp),%rsi ## base of pos[] - - lea (%rax,%rax,2),%r8 ## jnr - - ## move four coordinates to xmm0-xmm2 - movss (%rsi,%r8,4),%xmm4 - movss 4(%rsi,%r8,4),%xmm5 - movss 8(%rsi,%r8,4),%xmm6 - - ## calc dr - subss nb410_ix(%rsp),%xmm4 - subss nb410_iy(%rsp),%xmm5 - subss nb410_iz(%rsp),%xmm6 - - ## store dr - movaps %xmm4,nb410_dx(%rsp) - movaps %xmm5,nb410_dy(%rsp) - movaps %xmm6,nb410_dz(%rsp) - - ## square it - mulss %xmm4,%xmm4 - mulss %xmm5,%xmm5 - mulss %xmm6,%xmm6 - addss %xmm5,%xmm4 - addss %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtss %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulss %xmm5,%xmm5 - movaps nb410_three(%rsp),%xmm1 - mulss %xmm4,%xmm5 ## rsq*lu*lu - movaps nb410_half(%rsp),%xmm0 - subss %xmm5,%xmm1 ## 30-rsq*lu*lu - mulss %xmm2,%xmm1 - mulss %xmm1,%xmm0 ## xmm0=rinv - mulss %xmm0,%xmm4 ## xmm4=r - movaps %xmm4,nb410_r(%rsp) - mulss nb410_gbscale(%rsp),%xmm4 - - ## truncate and convert to integers - cvttss2si %xmm4,%r12d - - ## convert back to float - cvtsi2ss %r12d,%xmm6 - - ## multiply by 4 - shll $2,%r12d - - ## calculate eps - subss %xmm6,%xmm4 - movaps %xmm4,%xmm1 ##eps - - movq nb410_GBtab(%rbp),%rsi - - movaps %xmm0,%xmm9 ## rinv - mulss %xmm9,%xmm9 ## rinvsq - movaps %xmm9,%xmm10 ## rinvsq - mulss %xmm10,%xmm10 ## rinv4 - mulss %xmm9,%xmm10 ## rinv6 - movaps %xmm10,%xmm11 - mulss %xmm11,%xmm11 ## rinv12 - - ## load table data - movss (%rsi,%r12,4),%xmm4 - movss 4(%rsi,%r12,4),%xmm5 - movss 8(%rsi,%r12,4),%xmm6 - movss 12(%rsi,%r12,4),%xmm7 - ## table data ready in xmm4-xmm7 - - mulss nb410_c6(%rsp),%xmm10 ## vvdw6=c6*rinv6 - mulss nb410_c12(%rsp),%xmm11 ## vvdw12=c12*rinv12 - - movaps %xmm11,%xmm9 - subss %xmm10,%xmm11 ## Vvdw=Vvdw12-Vvdw6 - - ## add potential to vvdwtot - addss nb410_Vvdwtot(%rsp),%xmm11 - movss %xmm11,nb410_Vvdwtot(%rsp) - - mulss %xmm1,%xmm7 ## Heps - mulss %xmm1,%xmm6 ## xmm6=Geps - mulss %xmm1,%xmm7 ## Heps2 - addss %xmm6,%xmm5 - addss %xmm7,%xmm5 ## xmm5=Fp - addss %xmm7,%xmm7 ## two*Heps2 - movss nb410_qq(%rsp),%xmm3 - addss %xmm6,%xmm7 - addss %xmm5,%xmm7 ## xmm7=FF - mulss %xmm1,%xmm5 ## xmm5=eps*Fp - addss %xmm4,%xmm5 ## xmm5=VV - mulss %xmm3,%xmm5 ## vcoul=qq*VV - mulss %xmm7,%xmm3 ## fijC=FF*qq - ## at this point xmm5 contains vcoul and xmm3 fijC - - ## LJ forces - mulss nb410_six(%rsp),%xmm10 - mulss nb410_twelve(%rsp),%xmm9 - subss %xmm10,%xmm9 - mulss %xmm0,%xmm9 ## (12*vnb12-6*vnb6)*rinv - - movq nb410_dvda(%rbp),%rsi - - ## Calculate dVda - xorps %xmm7,%xmm7 - mulss nb410_gbscale(%rsp),%xmm3 - movaps %xmm3,%xmm6 - mulss nb410_r(%rsp),%xmm6 - addss %xmm5,%xmm6 - - ## increment vctot (sum in xmm12) - addss %xmm5,%xmm12 - - ## xmm6=(vcoul+fijC*r) - subss %xmm6,%xmm7 - movaps %xmm7,%xmm6 - - ## update dvdasum - addss nb410_dvdasum(%rsp),%xmm7 - movss %xmm7,nb410_dvdasum(%rsp) - - ## update j atoms dvdaj - addss (%rsi,%rax,4),%xmm6 - movss %xmm6,(%rsi,%rax,4) - - subss %xmm3,%xmm9 - mulss %xmm0,%xmm9 ## fscal - - movaps %xmm9,%xmm10 - movaps %xmm9,%xmm11 - - mulss nb410_dx(%rsp),%xmm9 - mulss nb410_dy(%rsp),%xmm10 - mulss nb410_dz(%rsp),%xmm11 - - ## accumulate i forces - addss %xmm9,%xmm13 - addss %xmm10,%xmm14 - addss %xmm11,%xmm15 - - movq nb410_faction(%rbp),%rsi - ## add to j forces - addss (%rsi,%r8,4),%xmm9 - addss 4(%rsi,%r8,4),%xmm10 - addss 8(%rsi,%r8,4),%xmm11 - movss %xmm9,(%rsi,%r8,4) - movss %xmm10,4(%rsi,%r8,4) - movss %xmm11,8(%rsi,%r8,4) - -_nb_kernel410_x86_64_sse.nb410_updateouterdata: - movl nb410_ii3(%rsp),%ecx - movq nb410_faction(%rbp),%rdi - movq nb410_fshift(%rbp),%rsi - movl nb410_is3(%rsp),%edx - - ## accumulate i forces in xmm13, xmm14, xmm15 - movhlps %xmm13,%xmm0 - movhlps %xmm14,%xmm1 - movhlps %xmm15,%xmm2 - addps %xmm13,%xmm0 - addps %xmm14,%xmm1 - addps %xmm15,%xmm2 - movaps %xmm0,%xmm3 - movaps %xmm1,%xmm4 - movaps %xmm2,%xmm5 - shufps $1,%xmm3,%xmm3 - shufps $1,%xmm4,%xmm4 - shufps $1,%xmm5,%xmm5 - addss %xmm3,%xmm0 - addss %xmm4,%xmm1 - addss %xmm5,%xmm2 ## xmm0-xmm2 has single force in pos0 - - - ## increment i force - movss (%rdi,%rcx,4),%xmm3 - movss 4(%rdi,%rcx,4),%xmm4 - movss 8(%rdi,%rcx,4),%xmm5 - subss %xmm0,%xmm3 - subss %xmm1,%xmm4 - subss %xmm2,%xmm5 - movss %xmm3,(%rdi,%rcx,4) - movss %xmm4,4(%rdi,%rcx,4) - movss %xmm5,8(%rdi,%rcx,4) - - ## increment fshift force - movss (%rsi,%rdx,4),%xmm3 - movss 4(%rsi,%rdx,4),%xmm4 - movss 8(%rsi,%rdx,4),%xmm5 - subss %xmm0,%xmm3 - subss %xmm1,%xmm4 - subss %xmm2,%xmm5 - movss %xmm3,(%rsi,%rdx,4) - movss %xmm4,4(%rsi,%rdx,4) - movss %xmm5,8(%rsi,%rdx,4) - - ## get n from stack - movl nb410_n(%rsp),%esi - ## get group index for i particle - movq nb410_gid(%rbp),%rdx ## base of gid[] - movl (%rdx,%rsi,4),%edx ## ggid=gid[n] - - ## accumulate total potential energy and update it - ## accumulate - movhlps %xmm12,%xmm6 - addps %xmm6,%xmm12 ## pos 0-1 in xmm12 have the sum now - movaps %xmm12,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm12 - - ## add earlier value from mem - movq nb410_Vc(%rbp),%rax - addss (%rax,%rdx,4),%xmm12 - ## move back to mem - movss %xmm12,(%rax,%rdx,4) - - ## accumulate total lj energy and update it - movaps nb410_Vvdwtot(%rsp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - ## add earlier value from mem - movq nb410_Vvdw(%rbp),%rax - addss (%rax,%rdx,4),%xmm7 - ## move back to mem - movss %xmm7,(%rax,%rdx,4) - - ## accumulate dVda and update it - movaps nb410_dvdasum(%rsp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - movl nb410_ii(%rsp),%edx - movq nb410_dvda(%rbp),%rax - addss (%rax,%rdx,4),%xmm7 - movss %xmm7,(%rax,%rdx,4) - - ## finish if last - movl nb410_nn1(%rsp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel410_x86_64_sse.nb410_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb410_n(%rsp) - jmp _nb_kernel410_x86_64_sse.nb410_outer -_nb_kernel410_x86_64_sse.nb410_outerend: - ## check if more outer neighborlists remain - movl nb410_nri(%rsp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel410_x86_64_sse.nb410_end - ## non-zero, do one more workunit - jmp _nb_kernel410_x86_64_sse.nb410_threadloop -_nb_kernel410_x86_64_sse.nb410_end: - - movl nb410_nouter(%rsp),%eax - movl nb410_ninner(%rsp),%ebx - movq nb410_outeriter(%rbp),%rcx - movq nb410_inneriter(%rbp),%rdx - movl %eax,(%rcx) - movl %ebx,(%rdx) - - addq $568,%rsp - emms - - - pop %r15 - pop %r14 - pop %r13 - pop %r12 - - pop %rbx - pop %rbp - ret - - - -.globl nb_kernel410nf_x86_64_sse -.globl _nb_kernel410nf_x86_64_sse -nb_kernel410nf_x86_64_sse: -_nb_kernel410nf_x86_64_sse: -## Room for return address and rbp (16 bytes) -.set nb410nf_fshift, 16 -.set nb410nf_gid, 24 -.set nb410nf_pos, 32 -.set nb410nf_faction, 40 -.set nb410nf_charge, 48 -.set nb410nf_p_facel, 56 -.set nb410nf_argkrf, 64 -.set nb410nf_argcrf, 72 -.set nb410nf_Vc, 80 -.set nb410nf_type, 88 -.set nb410nf_p_ntype, 96 -.set nb410nf_vdwparam, 104 -.set nb410nf_Vvdw, 112 -.set nb410nf_p_tabscale, 120 -.set nb410nf_VFtab, 128 -.set nb410nf_invsqrta, 136 -.set nb410nf_dvda, 144 -.set nb410nf_p_gbtabscale, 152 -.set nb410nf_GBtab, 160 -.set nb410nf_p_nthreads, 168 -.set nb410nf_count, 176 -.set nb410nf_mtx, 184 -.set nb410nf_outeriter, 192 -.set nb410nf_inneriter, 200 -.set nb410nf_work, 208 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse use -.set nb410nf_ix, 0 -.set nb410nf_iy, 16 -.set nb410nf_iz, 32 -.set nb410nf_iq, 48 -.set nb410nf_gbtsc, 64 -.set nb410nf_qq, 80 -.set nb410nf_c6, 96 -.set nb410nf_c12, 112 -.set nb410nf_vctot, 128 -.set nb410nf_Vvdwtot, 144 -.set nb410nf_half, 160 -.set nb410nf_three, 176 -.set nb410nf_isai, 192 -.set nb410nf_isaprod, 208 -.set nb410nf_gbscale, 224 -.set nb410nf_nri, 240 -.set nb410nf_iinr, 248 -.set nb410nf_jindex, 256 -.set nb410nf_jjnr, 264 -.set nb410nf_shift, 272 -.set nb410nf_shiftvec, 280 -.set nb410nf_facel, 288 -.set nb410nf_innerjjnr, 296 -.set nb410nf_is3, 304 -.set nb410nf_ii3, 308 -.set nb410nf_ntia, 312 -.set nb410nf_innerk, 316 -.set nb410nf_n, 320 -.set nb410nf_nn1, 324 -.set nb410nf_ntype, 328 -.set nb410nf_nouter, 332 -.set nb410nf_ninner, 336 - - push %rbp - movq %rsp,%rbp - push %rbx - - - emms - - push %r12 - push %r13 - push %r14 - push %r15 - - subq $360,%rsp ## local variable stack space (n*16+8) - - ## zero 32-bit iteration counters - movl $0,%eax - movl %eax,nb410nf_nouter(%rsp) - movl %eax,nb410nf_ninner(%rsp) - - movl (%rdi),%edi - movl %edi,nb410nf_nri(%rsp) - movq %rsi,nb410nf_iinr(%rsp) - movq %rdx,nb410nf_jindex(%rsp) - movq %rcx,nb410nf_jjnr(%rsp) - movq %r8,nb410nf_shift(%rsp) - movq %r9,nb410nf_shiftvec(%rsp) - movq nb410nf_p_ntype(%rbp),%rdi - movl (%rdi),%edi - movl %edi,nb410nf_ntype(%rsp) - movq nb410nf_p_facel(%rbp),%rsi - movss (%rsi),%xmm0 - movss %xmm0,nb410nf_facel(%rsp) - - movq nb410nf_p_gbtabscale(%rbp),%rbx - movss (%rbx),%xmm4 - shufps $0,%xmm4,%xmm4 - movaps %xmm4,nb410nf_gbtsc(%rsp) - - - ## create constant floating-point factors on stack - movl $0x3f000000,%eax ## half in IEEE (hex) - movl %eax,nb410nf_half(%rsp) - movss nb410nf_half(%rsp),%xmm1 - shufps $0,%xmm1,%xmm1 ## splat to all elements - movaps %xmm1,%xmm2 - addps %xmm2,%xmm2 ## one - movaps %xmm2,%xmm3 - addps %xmm2,%xmm2 ## two - addps %xmm2,%xmm3 ## three - movaps %xmm1,nb410nf_half(%rsp) - movaps %xmm3,nb410nf_three(%rsp) - -_nb_kernel410nf_x86_64_sse.nb410nf_threadloop: - movq nb410nf_count(%rbp),%rsi ## pointer to sync counter - movl (%rsi),%eax -_nb_kernel410nf_x86_64_sse.nb410nf_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%rsi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel410nf_x86_64_sse.nb410nf_spinlock - - ## if(nn1>nri) nn1=nri - movl nb410nf_nri(%rsp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb410nf_n(%rsp) - movl %ebx,nb410nf_nn1(%rsp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel410nf_x86_64_sse.nb410nf_outerstart - jmp _nb_kernel410nf_x86_64_sse.nb410nf_end - -_nb_kernel410nf_x86_64_sse.nb410nf_outerstart: - ## ebx contains number of outer iterations - addl nb410nf_nouter(%rsp),%ebx - movl %ebx,nb410nf_nouter(%rsp) - -_nb_kernel410nf_x86_64_sse.nb410nf_outer: - movq nb410nf_shift(%rsp),%rax ## rax = pointer into shift[] - movl (%rax,%rsi,4),%ebx ## ebx=shift[n] - - lea (%rbx,%rbx,2),%rbx ## rbx=3*is - movl %ebx,nb410nf_is3(%rsp) ## store is3 - - movq nb410nf_shiftvec(%rsp),%rax ## rax = base of shiftvec[] - - movss (%rax,%rbx,4),%xmm0 - movss 4(%rax,%rbx,4),%xmm1 - movss 8(%rax,%rbx,4),%xmm2 - - movq nb410nf_iinr(%rsp),%rcx ## rcx = pointer into iinr[] - movl (%rcx,%rsi,4),%ebx ## ebx =ii - - movq nb410nf_charge(%rbp),%rdx - movss (%rdx,%rbx,4),%xmm3 - mulss nb410nf_facel(%rsp),%xmm3 - shufps $0,%xmm3,%xmm3 - - movq nb410nf_invsqrta(%rbp),%rdx ## load invsqrta[ii] - movss (%rdx,%rbx,4),%xmm4 - shufps $0,%xmm4,%xmm4 - - movq nb410nf_type(%rbp),%rdx - movl (%rdx,%rbx,4),%edx - imull nb410nf_ntype(%rsp),%edx - shll %edx - movl %edx,nb410nf_ntia(%rsp) - - lea (%rbx,%rbx,2),%rbx ## rbx = 3*ii=ii3 - movq nb410nf_pos(%rbp),%rax ## rax = base of pos[] - - addss (%rax,%rbx,4),%xmm0 - addss 4(%rax,%rbx,4),%xmm1 - addss 8(%rax,%rbx,4),%xmm2 - - movaps %xmm3,nb410nf_iq(%rsp) - movaps %xmm4,nb410nf_isai(%rsp) - - shufps $0,%xmm0,%xmm0 - shufps $0,%xmm1,%xmm1 - shufps $0,%xmm2,%xmm2 - - movaps %xmm0,nb410nf_ix(%rsp) - movaps %xmm1,nb410nf_iy(%rsp) - movaps %xmm2,nb410nf_iz(%rsp) - - movl %ebx,nb410nf_ii3(%rsp) - - ## clear vctot - xorps %xmm4,%xmm4 - movaps %xmm4,nb410nf_vctot(%rsp) - movaps %xmm4,nb410nf_Vvdwtot(%rsp) - - movq nb410nf_jindex(%rsp),%rax - movl (%rax,%rsi,4),%ecx ## jindex[n] - movl 4(%rax,%rsi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movq nb410nf_pos(%rbp),%rsi - movq nb410nf_faction(%rbp),%rdi - movq nb410nf_jjnr(%rsp),%rax - shll $2,%ecx - addq %rcx,%rax - movq %rax,nb410nf_innerjjnr(%rsp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $4,%edx - addl nb410nf_ninner(%rsp),%ecx - movl %ecx,nb410nf_ninner(%rsp) - addl $0,%edx - movl %edx,nb410nf_innerk(%rsp) ## number of innerloop atoms - jge _nb_kernel410nf_x86_64_sse.nb410nf_unroll_loop - jmp _nb_kernel410nf_x86_64_sse.nb410nf_finish_inner -_nb_kernel410nf_x86_64_sse.nb410nf_unroll_loop: - ## quad-unroll innerloop here - movq nb410nf_innerjjnr(%rsp),%rdx ## pointer to jjnr[k] - movl (%rdx),%eax - movl 4(%rdx),%ebx - movl 8(%rdx),%ecx - movl 12(%rdx),%edx ## eax-edx=jnr1-4 - addq $16,nb410nf_innerjjnr(%rsp) ## advance pointer (unrolled 4) - - ## load isa2 - movq nb410nf_invsqrta(%rbp),%rsi - movss (%rsi,%rax,4),%xmm3 - movss (%rsi,%rcx,4),%xmm4 - movss (%rsi,%rbx,4),%xmm6 - movss (%rsi,%rdx,4),%xmm7 - movaps nb410nf_isai(%rsp),%xmm2 - shufps $0,%xmm6,%xmm3 - shufps $0,%xmm7,%xmm4 - shufps $136,%xmm4,%xmm3 ## 10001000 ;# all charges in xmm3 - mulps %xmm3,%xmm2 - - movaps %xmm2,nb410nf_isaprod(%rsp) - movaps %xmm2,%xmm1 - mulps nb410nf_gbtsc(%rsp),%xmm1 - movaps %xmm1,nb410nf_gbscale(%rsp) - - movq nb410nf_charge(%rbp),%rsi ## base of charge[] - - movss (%rsi,%rax,4),%xmm3 - movss (%rsi,%rcx,4),%xmm4 - movss (%rsi,%rbx,4),%xmm6 - movss (%rsi,%rdx,4),%xmm7 - - mulps nb410nf_iq(%rsp),%xmm2 - shufps $0,%xmm6,%xmm3 - shufps $0,%xmm7,%xmm4 - shufps $136,%xmm4,%xmm3 ## 10001000 ;# all charges in xmm3 - mulps %xmm2,%xmm3 - movaps %xmm3,nb410nf_qq(%rsp) - - movd %eax,%mm0 - movd %ebx,%mm1 - movd %ecx,%mm2 - movd %edx,%mm3 - - movq nb410nf_type(%rbp),%rsi - movl (%rsi,%rax,4),%eax - movl (%rsi,%rbx,4),%ebx - movl (%rsi,%rcx,4),%ecx - movl (%rsi,%rdx,4),%edx - movq nb410nf_vdwparam(%rbp),%rsi - shll %eax - shll %ebx - shll %ecx - shll %edx - movl nb410nf_ntia(%rsp),%edi - addl %edi,%eax - addl %edi,%ebx - addl %edi,%ecx - addl %edi,%edx - - movlps (%rsi,%rax,4),%xmm6 - movlps (%rsi,%rcx,4),%xmm7 - movhps (%rsi,%rbx,4),%xmm6 - movhps (%rsi,%rdx,4),%xmm7 - - movaps %xmm6,%xmm4 - shufps $136,%xmm7,%xmm4 ## 10001000 - shufps $221,%xmm7,%xmm6 ## 11011101 - - movd %mm0,%eax - movd %mm1,%ebx - movd %mm2,%ecx - movd %mm3,%edx - - movaps %xmm4,nb410nf_c6(%rsp) - movaps %xmm6,nb410nf_c12(%rsp) - - movq nb410nf_pos(%rbp),%rsi ## base of pos[] - - lea (%rax,%rax,2),%rax ## replace jnr with j3 - lea (%rbx,%rbx,2),%rbx - - lea (%rcx,%rcx,2),%rcx ## replace jnr with j3 - lea (%rdx,%rdx,2),%rdx - - ## move four coordinates to xmm0-xmm2 - - movlps (%rsi,%rax,4),%xmm4 - movlps (%rsi,%rcx,4),%xmm5 - movss 8(%rsi,%rax,4),%xmm2 - movss 8(%rsi,%rcx,4),%xmm6 - - movhps (%rsi,%rbx,4),%xmm4 - movhps (%rsi,%rdx,4),%xmm5 - - movss 8(%rsi,%rbx,4),%xmm0 - movss 8(%rsi,%rdx,4),%xmm1 - - shufps $0,%xmm0,%xmm2 - shufps $0,%xmm1,%xmm6 - - movaps %xmm4,%xmm0 - movaps %xmm4,%xmm1 - - shufps $136,%xmm6,%xmm2 ## 10001000 - - shufps $136,%xmm5,%xmm0 ## 10001000 - shufps $221,%xmm5,%xmm1 ## 11011101 - - ## move ix-iz to xmm4-xmm6 - movaps nb410nf_ix(%rsp),%xmm4 - movaps nb410nf_iy(%rsp),%xmm5 - movaps nb410nf_iz(%rsp),%xmm6 - - ## calc dr - subps %xmm0,%xmm4 - subps %xmm1,%xmm5 - subps %xmm2,%xmm6 - - ## square it - mulps %xmm4,%xmm4 - mulps %xmm5,%xmm5 - mulps %xmm6,%xmm6 - addps %xmm5,%xmm4 - addps %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb410nf_three(%rsp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb410nf_half(%rsp),%xmm0 - subps %xmm5,%xmm1 ## 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - mulps nb410nf_gbscale(%rsp),%xmm4 - - movhlps %xmm4,%xmm5 - cvttps2pi %xmm4,%mm6 - cvttps2pi %xmm5,%mm7 ## mm6/mm7 contain lu indices - cvtpi2ps %mm6,%xmm6 - cvtpi2ps %mm7,%xmm5 - movlhps %xmm5,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - pslld $2,%mm6 - pslld $2,%mm7 - - movd %eax,%mm0 - movd %ebx,%mm1 - movd %ecx,%mm2 - movd %edx,%mm3 - - movq nb410nf_GBtab(%rbp),%rsi - movd %mm6,%eax - psrlq $32,%mm6 - movd %mm7,%ecx - psrlq $32,%mm7 - movd %mm6,%ebx - movd %mm7,%edx - - ## load coulomb table - movaps (%rsi,%rax,4),%xmm4 - movaps (%rsi,%rbx,4),%xmm5 - movaps (%rsi,%rcx,4),%xmm6 - movaps (%rsi,%rdx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm6,%xmm3 - shufps $0xEE,%xmm7,%xmm3 - shufps $0x44,%xmm7,%xmm6 - movaps %xmm4,%xmm7 - shufps $0xEE,%xmm5,%xmm7 - shufps $0x44,%xmm5,%xmm4 - movaps %xmm4,%xmm5 - shufps $0xDD,%xmm6,%xmm5 - shufps $0x88,%xmm6,%xmm4 - movaps %xmm7,%xmm6 - shufps $0x88,%xmm3,%xmm6 - shufps $0xDD,%xmm3,%xmm7 - ## coulomb table ready, in xmm4-xmm7 - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - movaps nb410nf_qq(%rsp),%xmm3 - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps %xmm3,%xmm5 ## vcoul=qq*VV - ## update vctot - addps nb410nf_vctot(%rsp),%xmm5 - movaps %xmm5,nb410nf_vctot(%rsp) - - ## L-J - movaps %xmm0,%xmm4 - mulps %xmm0,%xmm4 ## xmm4=rinvsq - - movaps %xmm4,%xmm6 - mulps %xmm4,%xmm6 - - mulps %xmm4,%xmm6 ## xmm6=rinvsix - movaps %xmm6,%xmm4 - mulps %xmm4,%xmm4 ## xmm4=rinvtwelve - mulps nb410nf_c6(%rsp),%xmm6 - mulps nb410nf_c12(%rsp),%xmm4 - movaps nb410nf_Vvdwtot(%rsp),%xmm7 - addps %xmm4,%xmm7 - subps %xmm6,%xmm7 - movaps %xmm7,nb410nf_Vvdwtot(%rsp) - - ## should we do one more iteration? - subl $4,nb410nf_innerk(%rsp) - jl _nb_kernel410nf_x86_64_sse.nb410nf_finish_inner - jmp _nb_kernel410nf_x86_64_sse.nb410nf_unroll_loop -_nb_kernel410nf_x86_64_sse.nb410nf_finish_inner: - ## check if at least two particles remain - addl $4,nb410nf_innerk(%rsp) - movl nb410nf_innerk(%rsp),%edx - andl $2,%edx - jnz _nb_kernel410nf_x86_64_sse.nb410nf_dopair - jmp _nb_kernel410nf_x86_64_sse.nb410nf_checksingle -_nb_kernel410nf_x86_64_sse.nb410nf_dopair: - movq nb410nf_innerjjnr(%rsp),%rcx - movl (%rcx),%eax - movl 4(%rcx),%ebx - addq $8,nb410nf_innerjjnr(%rsp) - - xorps %xmm2,%xmm2 - movaps %xmm2,%xmm6 - - ## load isa2 - movq nb410nf_invsqrta(%rbp),%rsi - movss (%rsi,%rax,4),%xmm2 - movss (%rsi,%rbx,4),%xmm3 - unpcklps %xmm3,%xmm2 ## isa2 in xmm3(0,1) - mulps nb410nf_isai(%rsp),%xmm2 - movaps %xmm2,nb410nf_isaprod(%rsp) - movaps %xmm2,%xmm1 - mulps nb410nf_gbtsc(%rsp),%xmm1 - movaps %xmm1,nb410nf_gbscale(%rsp) - - movq nb410nf_charge(%rbp),%rsi ## base of charge[] - movss (%rsi,%rax,4),%xmm3 - movss (%rsi,%rbx,4),%xmm6 - unpcklps %xmm6,%xmm3 ## 00001000 ;# xmm3(0,1) has the charges - - mulps nb410nf_iq(%rsp),%xmm2 - mulps %xmm2,%xmm3 - movaps %xmm3,nb410nf_qq(%rsp) - - movq nb410nf_type(%rbp),%rsi - movl %eax,%ecx - movl %ebx,%edx - movl (%rsi,%rcx,4),%ecx - movl (%rsi,%rdx,4),%edx - movq nb410nf_vdwparam(%rbp),%rsi - shll %ecx - shll %edx - movl nb410nf_ntia(%rsp),%edi - addl %edi,%ecx - addl %edi,%edx - movlps (%rsi,%rcx,4),%xmm6 - movhps (%rsi,%rdx,4),%xmm6 - movq nb410nf_pos(%rbp),%rdi - - movaps %xmm6,%xmm4 - shufps $8,%xmm4,%xmm4 ## 00001000 - shufps $13,%xmm6,%xmm6 ## 00001101 - movlhps %xmm7,%xmm4 - movlhps %xmm7,%xmm6 - - movaps %xmm4,nb410nf_c6(%rsp) - movaps %xmm6,nb410nf_c12(%rsp) - - lea (%rax,%rax,2),%rax - lea (%rbx,%rbx,2),%rbx - ## move coordinates to xmm0-xmm2 - movlps (%rdi,%rax,4),%xmm1 - movss 8(%rdi,%rax,4),%xmm2 - movhps (%rdi,%rbx,4),%xmm1 - movss 8(%rdi,%rbx,4),%xmm0 - - movlhps %xmm7,%xmm3 - - shufps $0,%xmm0,%xmm2 - - movaps %xmm1,%xmm0 - - shufps $136,%xmm2,%xmm2 ## 10001000 - - shufps $136,%xmm0,%xmm0 ## 10001000 - shufps $221,%xmm1,%xmm1 ## 11011101 - - movq nb410nf_faction(%rbp),%rdi - ## move ix-iz to xmm4-xmm6 - xorps %xmm7,%xmm7 - - movaps nb410nf_ix(%rsp),%xmm4 - movaps nb410nf_iy(%rsp),%xmm5 - movaps nb410nf_iz(%rsp),%xmm6 - - ## calc dr - subps %xmm0,%xmm4 - subps %xmm1,%xmm5 - subps %xmm2,%xmm6 - - ## square it - mulps %xmm4,%xmm4 - mulps %xmm5,%xmm5 - mulps %xmm6,%xmm6 - addps %xmm5,%xmm4 - addps %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb410nf_three(%rsp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb410nf_half(%rsp),%xmm0 - subps %xmm5,%xmm1 ## 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - mulps nb410nf_gbscale(%rsp),%xmm4 - - cvttps2pi %xmm4,%mm6 ## mm6 contain lu indices - cvtpi2ps %mm6,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - - pslld $2,%mm6 - - movq nb410nf_GBtab(%rbp),%rsi - movd %mm6,%ecx - psrlq $32,%mm6 - movd %mm6,%edx - - ## load coulomb table - movaps (%rsi,%rcx,4),%xmm4 - movaps (%rsi,%rdx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm4,%xmm6 - unpcklps %xmm7,%xmm4 ## Y1 Y2 F1 F2 - unpckhps %xmm7,%xmm6 ## G1 G2 H1 H2 - movhlps %xmm4,%xmm5 ## F1 F2 - movhlps %xmm6,%xmm7 ## H1 H2 - ## coulomb table ready, in xmm4-xmm7 - - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - movaps nb410nf_qq(%rsp),%xmm3 - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps %xmm3,%xmm5 ## vcoul=qq*VV - - addps nb410nf_vctot(%rsp),%xmm5 - movaps %xmm5,nb410nf_vctot(%rsp) - - ## L-J - movaps %xmm0,%xmm4 - mulps %xmm0,%xmm4 ## xmm4=rinvsq - - ## at this point mm5 contains vcoul and mm3 fijC - ## increment vcoul - then we can get rid of mm5 - ## update vctot - - movaps %xmm4,%xmm6 - mulps %xmm4,%xmm6 - - mulps %xmm4,%xmm6 ## xmm6=rinvsix - movaps %xmm6,%xmm4 - mulps %xmm4,%xmm4 ## xmm4=rinvtwelve - mulps nb410nf_c6(%rsp),%xmm6 - mulps nb410nf_c12(%rsp),%xmm4 - movaps nb410nf_Vvdwtot(%rsp),%xmm7 - addps %xmm4,%xmm7 - subps %xmm6,%xmm7 - movaps %xmm7,nb410nf_Vvdwtot(%rsp) - -_nb_kernel410nf_x86_64_sse.nb410nf_checksingle: - movl nb410nf_innerk(%rsp),%edx - andl $1,%edx - jnz _nb_kernel410nf_x86_64_sse.nb410nf_dosingle - jmp _nb_kernel410nf_x86_64_sse.nb410nf_updateouterdata -_nb_kernel410nf_x86_64_sse.nb410nf_dosingle: - movq nb410nf_charge(%rbp),%rsi - movq nb410nf_invsqrta(%rbp),%rdx - movq nb410nf_pos(%rbp),%rdi - movq nb410nf_innerjjnr(%rsp),%rcx - movl (%rcx),%eax - xorps %xmm2,%xmm2 - movaps %xmm2,%xmm6 - movss (%rdx,%rax,4),%xmm2 ## isa2 - mulss nb410nf_isai(%rsp),%xmm2 - movss %xmm2,nb410nf_isaprod(%rsp) - movss %xmm2,%xmm1 - mulss nb410nf_gbtsc(%rsp),%xmm1 - movss %xmm1,nb410nf_gbscale(%rsp) - - mulss nb410nf_iq(%rsp),%xmm2 - movss (%rsi,%rax,4),%xmm6 ## xmm6(0) has the charge - mulss %xmm2,%xmm6 - movss %xmm6,nb410nf_qq(%rsp) - - movq nb410nf_type(%rbp),%rsi - movl %eax,%ecx - movl (%rsi,%rcx,4),%ecx - movq nb410nf_vdwparam(%rbp),%rsi - shll %ecx - addl nb410nf_ntia(%rsp),%ecx - movlps (%rsi,%rcx,4),%xmm6 - movaps %xmm6,%xmm4 - shufps $252,%xmm4,%xmm4 ## 11111100 - shufps $253,%xmm6,%xmm6 ## 11111101 - - movaps %xmm4,nb410nf_c6(%rsp) - movaps %xmm6,nb410nf_c12(%rsp) - - lea (%rax,%rax,2),%rax - - ## move coordinates to xmm0-xmm2 - movss (%rdi,%rax,4),%xmm0 - movss 4(%rdi,%rax,4),%xmm1 - movss 8(%rdi,%rax,4),%xmm2 - - movaps nb410nf_ix(%rsp),%xmm4 - movaps nb410nf_iy(%rsp),%xmm5 - movaps nb410nf_iz(%rsp),%xmm6 - - ## calc dr - subss %xmm0,%xmm4 - subss %xmm1,%xmm5 - subss %xmm2,%xmm6 - - ## square it - mulss %xmm4,%xmm4 - mulss %xmm5,%xmm5 - mulss %xmm6,%xmm6 - addss %xmm5,%xmm4 - addss %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtss %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulss %xmm5,%xmm5 - movss nb410nf_three(%rsp),%xmm1 - mulss %xmm4,%xmm5 ## rsq*lu*lu - movss nb410nf_half(%rsp),%xmm0 - subss %xmm5,%xmm1 ## 30-rsq*lu*lu - mulss %xmm2,%xmm1 - mulss %xmm1,%xmm0 ## xmm0=rinv - - mulss %xmm0,%xmm4 ## xmm4=r - mulss nb410nf_gbscale(%rsp),%xmm4 - - cvttss2si %xmm4,%ebx ## mm6 contain lu indices - cvtsi2ss %ebx,%xmm6 - subss %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulss %xmm2,%xmm2 ## xmm2=eps2 - - shll $2,%ebx - movq nb410nf_GBtab(%rbp),%rsi - - movaps (%rsi,%rbx,4),%xmm4 - movhlps %xmm4,%xmm6 - movaps %xmm4,%xmm5 - movaps %xmm6,%xmm7 - shufps $1,%xmm5,%xmm5 - shufps $1,%xmm7,%xmm7 - ## table ready in xmm4-xmm7 - - mulss %xmm1,%xmm6 ## xmm6=Geps - mulss %xmm2,%xmm7 ## xmm7=Heps2 - addss %xmm6,%xmm5 - addss %xmm7,%xmm5 ## xmm5=Fp - movss nb410nf_qq(%rsp),%xmm3 - mulss %xmm1,%xmm5 ## xmm5=eps*Fp - addss %xmm4,%xmm5 ## xmm5=VV - mulss %xmm3,%xmm5 ## vcoul=qq*VV - addss nb410nf_vctot(%rsp),%xmm5 - movss %xmm5,nb410nf_vctot(%rsp) - - ## L-J - movaps %xmm0,%xmm4 - mulss %xmm0,%xmm4 ## xmm4=rinvsq - - movaps %xmm4,%xmm6 - mulss %xmm4,%xmm6 - - mulss %xmm4,%xmm6 ## xmm6=rinvsix - movaps %xmm6,%xmm4 - mulss %xmm4,%xmm4 ## xmm4=rinvtwelve - mulss nb410nf_c6(%rsp),%xmm6 - mulss nb410nf_c12(%rsp),%xmm4 - movss nb410nf_Vvdwtot(%rsp),%xmm7 - addps %xmm4,%xmm7 - subps %xmm6,%xmm7 - movss %xmm7,nb410nf_Vvdwtot(%rsp) - -_nb_kernel410nf_x86_64_sse.nb410nf_updateouterdata: - ## get n from stack - movl nb410nf_n(%rsp),%esi - ## get group index for i particle - movq nb410nf_gid(%rbp),%rdx ## base of gid[] - movl (%rdx,%rsi,4),%edx ## ggid=gid[n] - - ## accumulate total potential energy and update it - movaps nb410nf_vctot(%rsp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - ## add earlier value from mem - movq nb410nf_Vc(%rbp),%rax - addss (%rax,%rdx,4),%xmm7 - ## move back to mem - movss %xmm7,(%rax,%rdx,4) - - ## accumulate total lj energy and update it - movaps nb410nf_Vvdwtot(%rsp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - ## add earlier value from mem - movq nb410nf_Vvdw(%rbp),%rax - addss (%rax,%rdx,4),%xmm7 - ## move back to mem - movss %xmm7,(%rax,%rdx,4) - - ## finish if last - movl nb410nf_nn1(%rsp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel410nf_x86_64_sse.nb410nf_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb410nf_n(%rsp) - jmp _nb_kernel410nf_x86_64_sse.nb410nf_outer -_nb_kernel410nf_x86_64_sse.nb410nf_outerend: - ## check if more outer neighborlists remain - movl nb410nf_nri(%rsp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel410nf_x86_64_sse.nb410nf_end - ## non-zero, do one more workunit - jmp _nb_kernel410nf_x86_64_sse.nb410nf_threadloop -_nb_kernel410nf_x86_64_sse.nb410nf_end: - - movl nb410nf_nouter(%rsp),%eax - movl nb410nf_ninner(%rsp),%ebx - movq nb410nf_outeriter(%rbp),%rcx - movq nb410nf_inneriter(%rbp),%rdx - movl %eax,(%rcx) - movl %ebx,(%rdx) - - addq $360,%rsp - emms - - - pop %r15 - pop %r14 - pop %r13 - pop %r12 - - pop %rbx - pop %rbp - ret - - - - diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.c b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.c index 8b82656348..a6b97565f0 100644 --- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.c +++ b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.c @@ -22,25 +22,14 @@ #include #include +#include + /* get gmx_gbdata_t */ #include "../nb_kerneltype.h" #include "nb_kernel430_x86_64_sse.h" -/* to extract single integers from a __m128i datatype */ -#define _mm_extract_epi32(x, imm) \ -_mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm))) -static inline __m128 -my_invrsq_ps(__m128 x) -{ - const __m128 three = (const __m128) {3.0f, 3.0f, 3.0f, 3.0f}; - const __m128 half = (const __m128) {0.5f, 0.5f, 0.5f, 0.5f}; - - __m128 t1 = _mm_rsqrt_ps(x); - - return (__m128) _mm_mul_ps(half,_mm_mul_ps(t1,_mm_sub_ps(three,_mm_mul_ps(x,_mm_mul_ps(t1,t1))))); -} void nb_kernel430_x86_64_sse(int * p_nri, int * iinr, @@ -101,15 +90,15 @@ void nb_kernel430_x86_64_sse(int * p_nri, __m128 fac_sse,tabscale_sse,gbtabscale_sse; __m128i n0, nnn; - const __m128 neg = {-1.0f,-1.0f,-1.0f,-1.0f}; - const __m128 zero = {0.0f,0.0f,0.0f,0.0f}; - const __m128 half = {0.5f,0.5f,0.5f,0.5f}; - const __m128 two = {2.0f,2.0f,2.0f,2.0f}; - const __m128 three = {3.0f,3.0f,3.0f,3.0f}; - const __m128 six = {6.0f,6.0f,6.0f,6.0f}; - const __m128 twelwe = {12.0f,12.0f,12.0f,12.0f}; + const __m128 neg = _mm_set1_ps(-1.0f); + const __m128 zero = _mm_set1_ps(0.0f); + const __m128 half = _mm_set1_ps(0.5f); + const __m128 two = _mm_set1_ps(2.0f); + const __m128 three = _mm_set1_ps(3.0f); + const __m128 six = _mm_set1_ps(6.0f); + const __m128 twelwe = _mm_set1_ps(12.0f); - __m128i four = _mm_set_epi32(4,4,4,4); + __m128i four = _mm_set1_epi32(4); __m128i maski = _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff); __m128i mask = _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff); @@ -224,7 +213,7 @@ void nb_kernel430_x86_64_sse(int * p_nri, rsq = _mm_add_ps(t1,t2); rsq = _mm_add_ps(rsq,t3); - rinv = my_invrsq_ps(rsq); + rinv = gmx_mm_invsqrt_ps(rsq); xmm1 = _mm_load_ss(invsqrta+jnr); xmm2 = _mm_load_ss(invsqrta+jnr2); @@ -292,10 +281,10 @@ void nb_kernel430_x86_64_sse(int * p_nri, nnn = _mm_slli_epi32(n0,2); /* the tables are 16-byte aligned, so we can use _mm_load_ps */ - xmm1 = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */ - xmm2 = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */ - xmm3 = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */ - xmm4 = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */ + xmm1 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */ + xmm2 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */ + xmm3 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */ + xmm4 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */ /* transpose 4*4 */ xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */ @@ -355,10 +344,10 @@ void nb_kernel430_x86_64_sse(int * p_nri, nnn = _mm_slli_epi32(n0,3); /* Tabulated VdW interaction - disperion */ - xmm1 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */ - xmm2 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */ - xmm3 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */ - xmm4 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */ + xmm1 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */ + xmm2 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */ + xmm3 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */ + xmm4 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */ /* transpose 4*4 */ xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */ @@ -387,10 +376,10 @@ void nb_kernel430_x86_64_sse(int * p_nri, /* Tabulated VdW interaction - repulsion */ nnn = _mm_add_epi32(nnn,four); - xmm1 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */ - xmm2 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */ - xmm3 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */ - xmm4 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */ + xmm1 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */ + xmm2 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */ + xmm3 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */ + xmm4 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */ /* transpose 4*4 */ xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */ @@ -620,15 +609,15 @@ void nb_kernel430_x86_64_sse(int * p_nri, mask = _mm_set_epi32(0,0xffffffff,0xffffffff,0xffffffff); } - jx = _mm_and_ps( (__m128) mask, xmm6); - jy = _mm_and_ps( (__m128) mask, xmm4); - jz = _mm_and_ps( (__m128) mask, xmm5); + jx = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm6); + jy = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm4); + jz = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm5); - c6 = _mm_and_ps( (__m128) mask, c6); - c12 = _mm_and_ps( (__m128) mask, c12); - dvdaj = _mm_and_ps( (__m128) mask, dvdaj); - isaj = _mm_and_ps( (__m128) mask, isaj); - q = _mm_and_ps( (__m128) mask, q); + c6 = _mm_and_ps( gmx_mm_castsi128_ps(mask), c6); + c12 = _mm_and_ps( gmx_mm_castsi128_ps(mask), c12); + dvdaj = _mm_and_ps( gmx_mm_castsi128_ps(mask), dvdaj); + isaj = _mm_and_ps( gmx_mm_castsi128_ps(mask), isaj); + q = _mm_and_ps( gmx_mm_castsi128_ps(mask), q); dx1 = _mm_sub_ps(ix,jx); dy1 = _mm_sub_ps(iy,jy); @@ -641,7 +630,7 @@ void nb_kernel430_x86_64_sse(int * p_nri, rsq = _mm_add_ps(t1,t2); rsq = _mm_add_ps(rsq,t3); - rinv = my_invrsq_ps(rsq); + rinv = gmx_mm_invsqrt_ps(rsq); isaprod = _mm_mul_ps(isai,isaj); qq = _mm_mul_ps(iq,q); @@ -664,10 +653,10 @@ void nb_kernel430_x86_64_sse(int * p_nri, nnn = _mm_slli_epi32(n0,2); /* the tables are 16-byte aligned, so we can use _mm_load_ps */ - xmm1 = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */ - xmm2 = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */ - xmm3 = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */ - xmm4 = _mm_load_ps(GBtab+(_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */ + xmm1 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */ + xmm2 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */ + xmm3 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */ + xmm4 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */ /* transpose 4*4 */ xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */ @@ -705,8 +694,8 @@ void nb_kernel430_x86_64_sse(int * p_nri, xmm1 = _mm_mul_ps(xmm1,isaj); dvdaj = _mm_add_ps(dvdaj,xmm1); - vcoul = _mm_and_ps( (__m128) mask, vcoul); - vgb = _mm_and_ps( (__m128) mask, vgb); + vcoul = _mm_and_ps( gmx_mm_castsi128_ps(mask), vcoul); + vgb = _mm_and_ps( gmx_mm_castsi128_ps(mask), vgb); vctot = _mm_add_ps(vctot,vcoul); vgbtot = _mm_add_ps(vgbtot,vgb); @@ -720,10 +709,10 @@ void nb_kernel430_x86_64_sse(int * p_nri, nnn = _mm_slli_epi32(n0,3); /* Tabulated VdW interaction - disperion */ - xmm1 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */ - xmm2 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */ - xmm3 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */ - xmm4 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */ + xmm1 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */ + xmm2 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */ + xmm3 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */ + xmm4 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */ /* transpose 4*4 */ xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */ @@ -752,10 +741,10 @@ void nb_kernel430_x86_64_sse(int * p_nri, /* Tabulated VdW interaction - repulsion */ nnn = _mm_add_epi32(nnn,four); - xmm1 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */ - xmm2 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */ - xmm3 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */ - xmm4 = _mm_load_ps(VFtab+(_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */ + xmm1 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */ + xmm2 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */ + xmm3 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */ + xmm4 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */ /* transpose 4*4 */ xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */ @@ -892,9 +881,9 @@ void nb_kernel430_x86_64_sse(int * p_nri, _mm_store_ss(faction+j33+2,xmm7); } - t1 = _mm_and_ps( (__m128) mask, t1); - t2 = _mm_and_ps( (__m128) mask, t2); - t3 = _mm_and_ps( (__m128) mask, t3); + t1 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t1); + t2 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t2); + t3 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t3); fix = _mm_add_ps(fix,t1); fiy = _mm_add_ps(fiy,t2); @@ -919,7 +908,7 @@ void nb_kernel430_x86_64_sse(int * p_nri, xmm2 = _mm_unpacklo_ps(fix,fiy); /* fx, fy, - - */ xmm2 = _mm_movelh_ps(xmm2,fiz); - xmm2 = _mm_and_ps( (__m128) maski, xmm2); + xmm2 = _mm_and_ps( gmx_mm_castsi128_ps(maski), xmm2); /* load i force from memory */ xmm4 = _mm_loadl_pi(xmm4, (__m64 *) (faction+ii3)); diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.intel_syntax.s deleted file mode 100644 index e3ee63bb60..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.intel_syntax.s +++ /dev/null @@ -1,2330 +0,0 @@ -;# -;# -;# Gromacs 4.0 Copyright (c) 1991-2003 -;# David van der Spoel, Erik Lindahl -;# -;# This program is free software; you can redistribute it and/or -;# modify it under the terms of the GNU General Public License -;# as published by the Free Software Foundation; either version 2 -;# of the License, or (at your option) any later version. -;# -;# To help us fund GROMACS development, we humbly ask that you cite -;# the research papers on the package. Check out http://www.gromacs.org -;# -;# And Hey: -;# Gnomes, ROck Monsters And Chili Sauce -;# - -;# These files require GNU binutils 2.10 or later, since we -;# use intel syntax for portability, or a recent version -;# of NASM that understands Extended 3DNow and SSE2 instructions. -;# (NASM is normally only used with MS Visual C++). -;# Since NASM and gnu as disagree on some definitions and use -;# completely different preprocessing options I have to introduce a -;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86. -;# Gnu as treats ';' as a line break, i.e. ignores it. This is the -;# reason why all comments need both symbols... -;# The source is written for GNU as, with intel syntax. When you use -;# NASM we redefine a couple of things. The false if-statement around -;# the following code is seen by GNU as, but NASM doesn't see it, so -;# the code inside is read by NASM but not gcc. - -; .if 0 # block below only read by NASM -%define .section section -%define .long dd -%define .align align -%define .globl global -;# NASM only wants 'dword', not 'dword ptr'. -%define ptr -%macro .equiv 2 - %1 equ %2 -%endmacro -; .endif # End of NASM-specific block -; .intel_syntax noprefix # Line only read by gnu as - - - - - -.globl nb_kernel430_x86_64_sse -.globl _nb_kernel430_x86_64_sse -nb_kernel430_x86_64_sse: -_nb_kernel430_x86_64_sse: -;# Room for return address and rbp (16 bytes) -.equiv nb430_fshift, 16 -.equiv nb430_gid, 24 -.equiv nb430_pos, 32 -.equiv nb430_faction, 40 -.equiv nb430_charge, 48 -.equiv nb430_p_facel, 56 -.equiv nb430_argkrf, 64 -.equiv nb430_argcrf, 72 -.equiv nb430_Vc, 80 -.equiv nb430_type, 88 -.equiv nb430_p_ntype, 96 -.equiv nb430_vdwparam, 104 -.equiv nb430_Vvdw, 112 -.equiv nb430_p_tabscale, 120 -.equiv nb430_VFtab, 128 -.equiv nb430_invsqrta, 136 -.equiv nb430_dvda, 144 -.equiv nb430_p_gbtabscale, 152 -.equiv nb430_GBtab, 160 -.equiv nb430_p_nthreads, 168 -.equiv nb430_count, 176 -.equiv nb430_mtx, 184 -.equiv nb430_outeriter, 192 -.equiv nb430_inneriter, 200 -.equiv nb430_work, 208 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse use -.equiv nb430_ix, 0 -.equiv nb430_iy, 16 -.equiv nb430_iz, 32 -.equiv nb430_iq, 48 -.equiv nb430_dx, 64 -.equiv nb430_dy, 80 -.equiv nb430_dz, 96 -.equiv nb430_eps, 112 -.equiv nb430_gbtsc, 128 -.equiv nb430_tsc, 144 -.equiv nb430_qq, 160 -.equiv nb430_c6, 176 -.equiv nb430_c12, 192 -.equiv nb430_epsgb, 208 -.equiv nb430_vctot, 224 -.equiv nb430_Vvdwtot, 240 -.equiv nb430_fix, 256 -.equiv nb430_fiy, 272 -.equiv nb430_fiz, 288 -.equiv nb430_half, 304 -.equiv nb430_three, 320 -.equiv nb430_r, 336 -.equiv nb430_isai, 352 -.equiv nb430_isaprod, 368 -.equiv nb430_dvdasum, 384 -.equiv nb430_gbscale, 400 -.equiv nb430_rinv, 416 -.equiv nb430_nri, 432 -.equiv nb430_iinr, 440 -.equiv nb430_jindex, 448 -.equiv nb430_jjnr, 456 -.equiv nb430_shift, 464 -.equiv nb430_shiftvec, 472 -.equiv nb430_facel, 480 -.equiv nb430_innerjjnr, 488 -.equiv nb430_ii, 496 -.equiv nb430_is3, 500 -.equiv nb430_ii3, 504 -.equiv nb430_ntia, 508 -.equiv nb430_innerk, 512 -.equiv nb430_n, 516 -.equiv nb430_nn1, 520 -.equiv nb430_ntype, 524 -.equiv nb430_nouter, 528 -.equiv nb430_ninner, 532 - - push rbp - mov rbp, rsp - push rbx - - - emms - - push r12 - push r13 - push r14 - push r15 - - sub rsp, 552 ;# local variable stack space (n*16+8) - - ;# zero 32-bit iteration counters - mov eax, 0 - mov [rsp + nb430_nouter], eax - mov [rsp + nb430_ninner], eax - - - - mov edi, [rdi] - mov [rsp + nb430_nri], edi - mov [rsp + nb430_iinr], rsi - mov [rsp + nb430_jindex], rdx - mov [rsp + nb430_jjnr], rcx - mov [rsp + nb430_shift], r8 - mov [rsp + nb430_shiftvec], r9 - mov rdi, [rbp + nb430_p_ntype] - mov edi, [rdi] - mov [rsp + nb430_ntype], edi - mov rsi, [rbp + nb430_p_facel] - movss xmm0, [rsi] - movss [rsp + nb430_facel], xmm0 - - mov rax, [rbp + nb430_p_tabscale] - movss xmm3, [rax] - shufps xmm3, xmm3, 0 - movaps [rsp + nb430_tsc], xmm3 - - mov rbx, [rbp + nb430_p_gbtabscale] - movss xmm4, [rbx] - shufps xmm4, xmm4, 0 - movaps [rsp + nb430_gbtsc], xmm4 - - - ;# create constant floating-point factors on stack - mov eax, 0x3f000000 ;# half in IEEE (hex) - mov [rsp + nb430_half], eax - movss xmm1, [rsp + nb430_half] - shufps xmm1, xmm1, 0 ;# splat to all elements - movaps xmm2, xmm1 - addps xmm2, xmm2 ;# one - movaps xmm3, xmm2 - addps xmm2, xmm2 ;# two - addps xmm3, xmm2 ;# three - movaps [rsp + nb430_half], xmm1 - movaps [rsp + nb430_three], xmm3 - -.nb430_threadloop: - mov rsi, [rbp + nb430_count] ;# pointer to sync counter - mov eax, [rsi] -.nb430_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb430_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [rsp + nb430_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [rsp + nb430_n], eax - mov [rsp + nb430_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb430_outerstart - jmp .nb430_end - -.nb430_outerstart: - ;# ebx contains number of outer iterations - add ebx, [rsp + nb430_nouter] - mov [rsp + nb430_nouter], ebx - -.nb430_outer: - mov rax, [rsp + nb430_shift] ;# rax = pointer into shift[] - mov ebx, [rax + rsi*4] ;# ebx=shift[n] - - lea rbx, [rbx + rbx*2] ;# rbx=3*is - mov [rsp + nb430_is3],ebx ;# store is3 - - mov rax, [rsp + nb430_shiftvec] ;# rax = base of shiftvec[] - - movss xmm0, [rax + rbx*4] - movss xmm1, [rax + rbx*4 + 4] - movss xmm2, [rax + rbx*4 + 8] - - mov rcx, [rsp + nb430_iinr] ;# rcx = pointer into iinr[] - mov ebx, [rcx + rsi*4] ;# ebx =ii - mov [rsp + nb430_ii], ebx - - mov rdx, [rbp + nb430_charge] - movss xmm3, [rdx + rbx*4] - mulss xmm3, [rsp + nb430_facel] - shufps xmm3, xmm3, 0 - - mov rdx, [rbp + nb430_invsqrta] ;# load invsqrta[ii] - movss xmm4, [rdx + rbx*4] - shufps xmm4, xmm4, 0 - - mov rdx, [rbp + nb430_type] - mov edx, [rdx + rbx*4] - imul edx, [rsp + nb430_ntype] - shl edx, 1 - mov [rsp + nb430_ntia], edx - - lea rbx, [rbx + rbx*2] ;# rbx = 3*ii=ii3 - mov rax, [rbp + nb430_pos] ;# rax = base of pos[] - - addss xmm0, [rax + rbx*4] - addss xmm1, [rax + rbx*4 + 4] - addss xmm2, [rax + rbx*4 + 8] - - movaps [rsp + nb430_iq], xmm3 - movaps [rsp + nb430_isai], xmm4 - - shufps xmm0, xmm0, 0 - shufps xmm1, xmm1, 0 - shufps xmm2, xmm2, 0 - - movaps [rsp + nb430_ix], xmm0 - movaps [rsp + nb430_iy], xmm1 - movaps [rsp + nb430_iz], xmm2 - - mov [rsp + nb430_ii3], ebx - - ;# clear vctot and i forces - xorps xmm4, xmm4 - movaps [rsp + nb430_vctot], xmm4 - movaps [rsp + nb430_Vvdwtot], xmm4 - movaps [rsp + nb430_dvdasum], xmm4 - movaps [rsp + nb430_fix], xmm4 - movaps [rsp + nb430_fiy], xmm4 - movaps [rsp + nb430_fiz], xmm4 - - mov rax, [rsp + nb430_jindex] - mov ecx, [rax + rsi*4] ;# jindex[n] - mov edx, [rax + rsi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov rsi, [rbp + nb430_pos] - mov rdi, [rbp + nb430_faction] - mov rax, [rsp + nb430_jjnr] - shl ecx, 2 - add rax, rcx - mov [rsp + nb430_innerjjnr], rax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 4 - add ecx, [rsp + nb430_ninner] - mov [rsp + nb430_ninner], ecx - add edx, 0 - mov [rsp + nb430_innerk], edx ;# number of innerloop atoms - - jge .nb430_unroll_loop - jmp .nb430_finish_inner -.nb430_unroll_loop: - ;# quad-unroll innerloop here - mov rdx, [rsp + nb430_innerjjnr] ;# pointer to jjnr[k] - mov eax, [rdx] - mov ebx, [rdx + 4] - mov ecx, [rdx + 8] - mov edx, [rdx + 12] ;# eax-edx=jnr1-4 - - add qword ptr [rsp + nb430_innerjjnr], 16 ;# advance pointer (unrolled 4) - - ;# load isaj - mov rsi, [rbp + nb430_invsqrta] - movss xmm3, [rsi + rax*4] - movss xmm4, [rsi + rcx*4] - movss xmm6, [rsi + rbx*4] - movss xmm7, [rsi + rdx*4] - movaps xmm2, [rsp + nb430_isai] - shufps xmm3, xmm6, 0 - shufps xmm4, xmm7, 0 - shufps xmm3, xmm4, 136 ;# 10001000 ;# all isaj in xmm3 - mulps xmm2, xmm3 - - movaps [rsp + nb430_isaprod], xmm2 - movaps xmm1, xmm2 - mulps xmm1, [rsp + nb430_gbtsc] - movaps [rsp + nb430_gbscale], xmm1 - - mov rsi, [rbp + nb430_charge] ;# base of charge[] - - movss xmm3, [rsi + rax*4] - movss xmm4, [rsi + rcx*4] - movss xmm6, [rsi + rbx*4] - movss xmm7, [rsi + rdx*4] - - mulps xmm2, [rsp + nb430_iq] - shufps xmm3, xmm6, 0 - shufps xmm4, xmm7, 0 - shufps xmm3, xmm4, 136 ;# 10001000 ;# all charges in xmm3 - mulps xmm3, xmm2 - movaps [rsp + nb430_qq], xmm3 - - ;# vdw parameters - mov rsi, [rbp + nb430_type] - mov r12d, [rsi + rax*4] - mov r13d, [rsi + rbx*4] - mov r14d, [rsi + rcx*4] - mov r15d, [rsi + rdx*4] - shl r12d, 1 - shl r13d, 1 - shl r14d, 1 - shl r15d, 1 - mov edi, [rsp + nb430_ntia] - add r12d, edi - add r13d, edi - add r14d, edi - add r15d, edi - - mov rsi, [rbp + nb430_vdwparam] - movlps xmm3, [rsi + r12*4] - movlps xmm7, [rsi + r14*4] - movhps xmm3, [rsi + r13*4] - movhps xmm7, [rsi + r15*4] - - movaps xmm0, xmm3 - shufps xmm0, xmm7, 136 ;# 10001000 - shufps xmm3, xmm7, 221 ;# 11011101 - - movaps [rsp + nb430_c6], xmm0 - movaps [rsp + nb430_c12], xmm3 - - mov rsi, [rbp + nb430_pos] ;# base of pos[] - - lea r8, [rax + rax*2] ;# jnr - lea r9, [rbx + rbx*2] - lea r10, [rcx + rcx*2] - lea r11, [rdx + rdx*2] - - ;# move four coordinates to xmm0-xmm2 - movlps xmm4, [rsi + r8*4] - movlps xmm5, [rsi + r10*4] - movss xmm2, [rsi + r8*4 + 8] - movss xmm6, [rsi + r10*4 + 8] - - movhps xmm4, [rsi + r9*4] - movhps xmm5, [rsi + r11*4] - - movss xmm0, [rsi + r9*4 + 8] - movss xmm1, [rsi + r11*4 + 8] - - shufps xmm2, xmm0, 0 - shufps xmm6, xmm1, 0 - - movaps xmm0, xmm4 - movaps xmm1, xmm4 - - shufps xmm2, xmm6, 136 ;# 10001000 - - shufps xmm0, xmm5, 136 ;# 10001000 - shufps xmm1, xmm5, 221 ;# 11011101 - - ;# calc dr - subps xmm0, [rsp + nb430_ix] - subps xmm1, [rsp + nb430_iy] - subps xmm2, [rsp + nb430_iz] - - ;# store dr - movaps [rsp + nb430_dx], xmm0 - movaps [rsp + nb430_dy], xmm1 - movaps [rsp + nb430_dz], xmm2 - - movd mm0, r8 ;# store j3 - movd mm1, r9 - movd mm2, r10 - movd mm3, r11 - - ;# square it - mulps xmm0,xmm0 - mulps xmm1,xmm1 - mulps xmm2,xmm2 - addps xmm0, xmm1 - addps xmm0, xmm2 - movaps xmm4, xmm0 - ;# rsq in xmm4 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [rsp + nb430_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [rsp + nb430_half] - subps xmm1, xmm5 ;# 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - movaps [rsp + nb430_r], xmm4 - movaps [rsp + nb430_rinv], xmm0 - - movaps xmm8, xmm4 ;# r - mulps xmm4, [rsp + nb430_gbscale] ;# rgbtab - mulps xmm8, [rsp + nb430_tsc] ;# rtab - - ;# truncate and convert to integers - cvttps2dq xmm5, xmm4 ;# gb - cvttps2dq xmm9, xmm8 ;# lj - - ;# convert back to float - cvtdq2ps xmm6, xmm5 ;# gb - cvtdq2ps xmm10, xmm9 ;# lj - - ;# multiply by 4 and 8, respectively - pslld xmm5, 2 ;# gb - pslld xmm9, 3 ;# lj - - ;# move to integer registers - movhlps xmm7, xmm5 ;# gb - movhlps xmm11, xmm9 ;# lj - movd r8d, xmm5 ;# gb - movd r12d, xmm9 ;# lj - movd r10d, xmm7 ;# gb - movd r14d, xmm11 ;# lj - pshufd xmm5, xmm5, 1 ;# gb - pshufd xmm9, xmm9, 1 ;# lj - pshufd xmm7, xmm7, 1 ;# gb - pshufd xmm11, xmm11, 1 ;# lj - movd r9d, xmm5 ;# gb - movd r13d, xmm9 ;# lj - movd r11d, xmm7 ;# gb - movd r15d, xmm11 ;# lj - ;# GB indices: r8-r11 LJ indices: r12-r15 - - ;# calculate eps - subps xmm4, xmm6 ;# gb - subps xmm8, xmm10 ;# lj - movaps [rsp + nb430_epsgb], xmm4 ;# gb eps - movaps [rsp + nb430_eps], xmm8 ;# lj eps - - mov rsi, [rbp + nb430_GBtab] - mov rdi, [rbp + nb430_VFtab] - - ;# load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11 - movlps xmm1, [rsi + r8*4] ;# Y1c F1c - movlps xmm5, [rdi + r12*4] ;# Y1d F1d - movlps xmm9, [rdi + r12*4 + 16] ;# Y1r F1r - - movlps xmm3, [rsi + r10*4] ;# Y3c F3c - movlps xmm7, [rdi + r14*4] ;# Y3d F3d - movlps xmm11, [rdi + r14*4 + 16] ;# Y3r F3r - - movhps xmm1, [rsi + r9*4] ;# Y1c F1c Y2c F2c - movhps xmm5, [rdi + r13*4] ;# Y1d F1d Y2d F2d - movhps xmm9, [rdi + r13*4 + 16] ;# Y1r F1r Y2r F2r - - movhps xmm3, [rsi + r11*4] ;# Y3c F3c Y4c F4c - movhps xmm7, [rdi + r15*4] ;# Y3d F3d Y4d F4d - movhps xmm11, [rdi + r15*4 + 16] ;# Y3r F3r Y4r F4r - - movaps xmm0, xmm1 - movaps xmm4, xmm5 - movaps xmm8, xmm9 - shufps xmm0, xmm3, 136 ;# 10001000 => Y1c Y2c Y3c Y4c - shufps xmm4, xmm7, 136 ;# 10001000 => Y1d Y2d Y3d Y4d - shufps xmm8, xmm11, 136 ;# 10001000 => Y1r Y2r Y3r Y4r - shufps xmm1, xmm3, 221 ;# 11011101 => F1c F2c F3c F4c - shufps xmm5, xmm7, 221 ;# 11011101 => F1d F2d F3d F4d - shufps xmm9, xmm11, 221 ;# 11011101 => F1r F2r F3r F4r - - movlps xmm3, [rsi + r8*4 + 8] ;# G1c H1c - movlps xmm7, [rdi + r12*4 + 8] ;# G1d H1d - movlps xmm11, [rdi + r12*4 + 24] ;# G1r H1r - - movlps xmm12, [rsi + r10*4 + 8] ;# G3c H3c - movlps xmm13, [rdi + r14*4 + 8] ;# G3d H3d - movlps xmm14, [rdi + r14*4 + 24] ;# G3r H3r - - movhps xmm3, [rsi + r9*4 + 8] ;# G1c H1c G2c H2c - movhps xmm7, [rdi + r13*4 + 8] ;# G1d H1d G2d H2d - movhps xmm11, [rdi + r13*4 + 24] ;# G1r H1r G2r H2r - - movhps xmm12, [rsi + r11*4 + 8] ;# G3c H3c G4c H4c - movhps xmm13, [rdi + r15*4 + 8] ;# G3d H3d G4d H4d - movhps xmm14, [rdi + r15*4 + 24] ;# G3r H3r G4r H4r - movaps xmm2, xmm3 - movaps xmm6, xmm7 - movaps xmm10, xmm11 - - shufps xmm2, xmm12, 136 ;# 10001000 => G1c G2c G3c G4c - shufps xmm6, xmm13, 136 ;# 10001000 => G1d G2d G3d G4d - shufps xmm10, xmm14, 136 ;# 10001000 => G1r G2r G3r G4r - shufps xmm3, xmm12, 221 ;# 11011101 => H1c H2c H3c H4c - shufps xmm7, xmm13, 221 ;# 11011101 => H1d H2d H3d H4d - shufps xmm11, xmm14, 221 ;# 11011101 => H1r H2r H3r H4r - ;# table data ready. Coul in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11 - - movaps xmm12, [rsp + nb430_epsgb] - movaps xmm13, [rsp + nb430_eps] - - mulps xmm3, xmm12 ;# Heps - mulps xmm7, xmm13 - mulps xmm11, xmm13 - mulps xmm2, xmm12 ;# Geps - mulps xmm6, xmm13 - mulps xmm10, xmm13 - mulps xmm3, xmm12 ;# Heps2 - mulps xmm7, xmm13 - mulps xmm11, xmm13 - - addps xmm1, xmm2 ;# F+Geps - addps xmm5, xmm6 - addps xmm9, xmm10 - addps xmm1, xmm3 ;# F+Geps+Heps2 = Fp - addps xmm5, xmm7 - addps xmm9, xmm11 - addps xmm3, xmm3 ;# 2*Heps2 - addps xmm7, xmm7 - addps xmm11, xmm11 - addps xmm3, xmm2 ;# 2*Heps2+Geps - addps xmm7, xmm6 - addps xmm11, xmm10 - addps xmm3, xmm1 ;# FF = Fp + 2*Heps2 + Geps - addps xmm7, xmm5 - addps xmm11, xmm9 - mulps xmm1, xmm12 ;# eps*Fp - mulps xmm5, xmm13 - mulps xmm9, xmm13 - addps xmm1, xmm0 ;# VV - addps xmm5, xmm4 - addps xmm9, xmm8 - mulps xmm1, [rsp + nb430_qq] ;# VV*qq = vcoul - mulps xmm5, [rsp + nb430_c6] ;# vnb6 - mulps xmm9, [rsp + nb430_c12] ;# vnb12 - mulps xmm3, [rsp + nb430_qq] ;# FF*qq = fij - mulps xmm7, [rsp + nb430_c6] ;# fijD - mulps xmm11, [rsp + nb430_c12] ;#fijR - - addps xmm11, xmm7 ;# fijD+fijR - mulps xmm11, [rsp + nb430_tsc] ;# (fijD+fijR)*tabscale - - ;# accumulate Vvdwtot - addps xmm5, [rsp + nb430_Vvdwtot] - addps xmm5, xmm9 - movaps [rsp + nb430_Vvdwtot], xmm5 - - mov rsi, [rbp + nb430_dvda] - - ;# Calculate dVda - mulps xmm3, [rsp + nb430_gbscale] ;# fijC=qq*FF*gbscale - movaps xmm6, xmm3 - mulps xmm6, [rsp + nb430_r] - addps xmm6, xmm1 ;# vcoul+fijC*r - - addps xmm3, xmm11 ;# fijC+fijD+fijR - - ;# increment vctot - addps xmm1, [rsp + nb430_vctot] - movaps [rsp + nb430_vctot], xmm1 - - ;# xmm6=(vcoul+fijC*r) - xorps xmm7, xmm7 - subps xmm7, xmm6 - movaps xmm6, xmm7 - - ;# update dvdasum - addps xmm7, [rsp + nb430_dvdasum] - movaps [rsp + nb430_dvdasum], xmm7 - - ;# update j atoms dvdaj - movhlps xmm7, xmm6 - movaps xmm5, xmm6 - movaps xmm4, xmm7 - shufps xmm5, xmm5, 0x1 - shufps xmm4, xmm4, 0x1 - - ;# xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4 - addss xmm6, [rsi + rax*4] - addss xmm5, [rsi + rbx*4] - addss xmm7, [rsi + rcx*4] - addss xmm4, [rsi + rdx*4] - movss [rsi + rax*4], xmm6 - movss [rsi + rbx*4], xmm5 - movss [rsi + rcx*4], xmm7 - movss [rsi + rdx*4], xmm4 - - xorps xmm4, xmm4 - mulps xmm3, [rsp + nb430_rinv] - subps xmm4, xmm3 - - movd r8, mm0 ;# fetch j3 - movd r9, mm1 - movd r10, mm2 - movd r11, mm3 - - movaps xmm9, xmm4 - movaps xmm10, xmm4 - movaps xmm11, xmm4 - - mulps xmm9, [rsp + nb430_dx] - mulps xmm10, [rsp + nb430_dy] - mulps xmm11, [rsp + nb430_dz] - - ;# accumulate i forces - movaps xmm12, [rsp + nb430_fix] - movaps xmm13, [rsp + nb430_fiy] - movaps xmm14, [rsp + nb430_fiz] - addps xmm12, xmm9 - addps xmm13, xmm10 - addps xmm14, xmm11 - movaps [rsp + nb430_fix], xmm12 - movaps [rsp + nb430_fiy], xmm13 - movaps [rsp + nb430_fiz], xmm14 - - mov rsi, [rbp + nb430_faction] - ;# the fj's - start by accumulating x & y forces from memory - movlps xmm0, [rsi + r8*4] ;# x1 y1 - - - movlps xmm1, [rsi + r10*4] ;# x3 y3 - - - movhps xmm0, [rsi + r9*4] ;# x1 y1 x2 y2 - movhps xmm1, [rsi + r11*4] ;# x3 y3 x4 y4 - - movaps xmm8, xmm9 - unpcklps xmm9, xmm10 ;# x1 y1 x2 y2 - unpckhps xmm8, xmm10 ;# x3 y3 x4 y4 - - ;# update fjx and fjy - addps xmm0, xmm9 - addps xmm1, xmm8 - - movlps [rsi + r8*4], xmm0 - movlps [rsi + r10*4], xmm1 - movhps [rsi + r9*4], xmm0 - movhps [rsi + r11*4], xmm1 - - ;# xmm11: fjz1 fjz2 fjz3 fjz4 - pshufd xmm10, xmm11, 1 ;# fjz2 - - - - movhlps xmm9, xmm11 ;# fjz3 - - - - pshufd xmm8, xmm11, 3 ;# fjz4 - - - - - addss xmm11, [rsi + r8*4 + 8] - addss xmm10, [rsi + r9*4 + 8] - addss xmm9, [rsi + r10*4 + 8] - addss xmm8, [rsi + r11*4 + 8] - movss [rsi + r8*4 + 8], xmm11 - movss [rsi + r9*4 + 8], xmm10 - movss [rsi + r10*4 + 8], xmm9 - movss [rsi + r11*4 + 8], xmm8 - - ;# should we do one more iteration? - sub dword ptr [rsp + nb430_innerk], 4 - jl .nb430_finish_inner - jmp .nb430_unroll_loop -.nb430_finish_inner: - ;# check if at least two particles remain - add dword ptr [rsp + nb430_innerk], 4 - mov edx, [rsp + nb430_innerk] - and edx, 2 - jnz .nb430_dopair - jmp .nb430_checksingle -.nb430_dopair: - mov rcx, [rsp + nb430_innerjjnr] - - mov eax, [rcx] - mov ebx, [rcx + 4] - add qword ptr [rsp + nb430_innerjjnr], 8 - - ;# load isaj - mov rsi, [rbp + nb430_invsqrta] - movss xmm3, [rsi + rax*4] - movss xmm6, [rsi + rbx*4] - movaps xmm2, [rsp + nb430_isai] - unpcklps xmm3, xmm6 - mulps xmm2, xmm3 - movaps [rsp + nb430_isaprod], xmm2 - - movaps xmm1, xmm2 - mulps xmm1, [rsp + nb430_gbtsc] - movaps [rsp + nb430_gbscale], xmm1 - - mov rsi, [rbp + nb430_charge] ;# base of charge[] - - movss xmm3, [rsi + rax*4] - movss xmm6, [rsi + rbx*4] - unpcklps xmm3, xmm6 - mulps xmm2, [rsp + nb430_iq] - mulps xmm3, xmm2 - movaps [rsp + nb430_qq], xmm3 - - ;# vdw parameters - mov rsi, [rbp + nb430_type] - mov r12d, [rsi + rax*4] - mov r13d, [rsi + rbx*4] - shl r12d, 1 - shl r13d, 1 - mov edi, [rsp + nb430_ntia] - add r12d, edi - add r13d, edi - - mov rsi, [rbp + nb430_vdwparam] - movlps xmm3, [rsi + r12*4] - movhps xmm3, [rsi + r13*4] - - xorps xmm7, xmm7 - movaps xmm0, xmm3 - shufps xmm0, xmm7, 136 ;# 10001000 - shufps xmm3, xmm7, 221 ;# 11011101 - - movaps [rsp + nb430_c6], xmm0 - movaps [rsp + nb430_c12], xmm3 - - mov rsi, [rbp + nb430_pos] ;# base of pos[] - - lea r8, [rax + rax*2] ;# j3 - lea r9, [rbx + rbx*2] - - ;# move four coordinates to xmm0-xmm2 - movlps xmm0, [rsi + r8*4] ;# x1 y1 - - - movlps xmm1, [rsi + r9*4] ;# x2 y2 - - - - movss xmm2, [rsi + r8*4 + 8] ;# z1 - - - - movss xmm7, [rsi + r9*4 + 8] ;# z2 - - - - - unpcklps xmm0, xmm1 ;# x1 x2 y1 y2 - movhlps xmm1, xmm0 ;# y1 y2 - - - unpcklps xmm2, xmm7 ;# z1 z2 - - - - ;# calc dr - subps xmm0, [rsp + nb430_ix] - subps xmm1, [rsp + nb430_iy] - subps xmm2, [rsp + nb430_iz] - - ;# store dr - movaps [rsp + nb430_dx], xmm0 - movaps [rsp + nb430_dy], xmm1 - movaps [rsp + nb430_dz], xmm2 - - ;# square it - mulps xmm0,xmm0 - mulps xmm1,xmm1 - mulps xmm2,xmm2 - addps xmm0, xmm1 - addps xmm0, xmm2 - movaps xmm4, xmm0 - ;# rsq in xmm4 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [rsp + nb430_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [rsp + nb430_half] - subps xmm1, xmm5 ;# 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - movaps [rsp + nb430_r], xmm4 - movaps [rsp + nb430_rinv], xmm0 - - movaps xmm8, xmm4 ;# r - mulps xmm4, [rsp + nb430_gbscale] ;# rgbtab - mulps xmm8, [rsp + nb430_tsc] ;# rtab - - ;# truncate and convert to integers - cvttps2dq xmm5, xmm4 ;# gb - cvttps2dq xmm9, xmm8 ;# lj - - ;# convert back to float - cvtdq2ps xmm6, xmm5 ;# gb - cvtdq2ps xmm10, xmm9 ;# lj - - ;# multiply by 4 and 8, respectively - pslld xmm5, 2 ;# gb - pslld xmm9, 3 ;# lj - - ;# move to integer registers - movd r12d, xmm5 ;# gb - movd r14d, xmm9 ;# lj - pshufd xmm5, xmm5, 1 ;# gb - pshufd xmm9, xmm9, 1 ;# lj - movd r13d, xmm5 ;# gb - movd r15d, xmm9 ;# lj - ;# GB indices: r12-r13 LJ indices: r14-r15 - - ;# calculate eps - subps xmm4, xmm6 ;# gb - subps xmm8, xmm10 ;# lj - movaps [rsp + nb430_epsgb], xmm4 ;# gb eps - movaps [rsp + nb430_eps], xmm8 ;# lj eps - - mov rsi, [rbp + nb430_GBtab] - mov rdi, [rbp + nb430_VFtab] - - ;# load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11 - movlps xmm0, [rsi + r12*4] ;# Y1c F1c - movlps xmm1, [rsi + r13*4] ;# Y2c F2c - movlps xmm4, [rdi + r14*4] ;# Y1d F1d - movlps xmm5, [rdi + r15*4] ;# Y2d F2d - movlps xmm8, [rdi + r14*4 + 16] ;# Y1r F1r - movlps xmm9, [rdi + r15*4 + 16] ;# Y2r F2r - - unpcklps xmm0, xmm1 - movhlps xmm1, xmm0 - unpcklps xmm4, xmm5 - movhlps xmm5, xmm4 - unpcklps xmm8, xmm9 - movhlps xmm9, xmm8 - movlps xmm2, [rsi + r12*4 + 8] ;# G1c H1c - movlps xmm3, [rsi + r13*4 + 8] ;# G2c H2c - movlps xmm6, [rdi + r14*4 + 8] ;# G1d H1d - movlps xmm7, [rdi + r15*4 + 8] ;# G2d H2d - movlps xmm10, [rdi + r14*4 + 24] ;# G1r H1r - movlps xmm11, [rdi + r15*4 + 24] ;# G2r H2r - unpcklps xmm2, xmm3 - movhlps xmm3, xmm2 - unpcklps xmm6, xmm7 - movhlps xmm7, xmm6 - unpcklps xmm10, xmm11 - movhlps xmm11, xmm10 - ;# table data ready. Coul in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11 - - movaps xmm12, [rsp + nb430_epsgb] - movaps xmm13, [rsp + nb430_eps] - - mulps xmm3, xmm12 ;# Heps - mulps xmm7, xmm13 - mulps xmm11, xmm13 - mulps xmm2, xmm12 ;# Geps - mulps xmm6, xmm13 - mulps xmm10, xmm13 - mulps xmm3, xmm12 ;# Heps2 - mulps xmm7, xmm13 - mulps xmm11, xmm13 - - addps xmm1, xmm2 ;# F+Geps - addps xmm5, xmm6 - addps xmm9, xmm10 - addps xmm1, xmm3 ;# F+Geps+Heps2 = Fp - addps xmm5, xmm7 - addps xmm9, xmm11 - addps xmm3, xmm3 ;# 2*Heps2 - addps xmm7, xmm7 - addps xmm11, xmm11 - addps xmm3, xmm2 ;# 2*Heps2+Geps - addps xmm7, xmm6 - addps xmm11, xmm10 - addps xmm3, xmm1 ;# FF = Fp + 2*Heps2 + Geps - addps xmm7, xmm5 - addps xmm11, xmm9 - mulps xmm1, xmm12 ;# eps*Fp - mulps xmm5, xmm13 - mulps xmm9, xmm13 - addps xmm1, xmm0 ;# VV - addps xmm5, xmm4 - addps xmm9, xmm8 - mulps xmm1, [rsp + nb430_qq] ;# VV*qq = vcoul - mulps xmm5, [rsp + nb430_c6] ;# vnb6 - mulps xmm9, [rsp + nb430_c12] ;# vnb12 - mulps xmm3, [rsp + nb430_qq] ;# FF*qq = fij - mulps xmm7, [rsp + nb430_c6] ;# fijD - mulps xmm11, [rsp + nb430_c12] ;#fijR - - addps xmm11, xmm7 ;# fijD+fijR - mulps xmm11, [rsp + nb430_tsc] ;# (fijD+fijR)*tabscale - - ;# accumulate Vvdwtot - addps xmm5, [rsp + nb430_Vvdwtot] - addps xmm5, xmm9 - movlps [rsp + nb430_Vvdwtot], xmm5 - - mov rsi, [rbp + nb430_dvda] - - ;# Calculate dVda - mulps xmm3, [rsp + nb430_gbscale] ;# fijC=qq*FF*gbscale - movaps xmm6, xmm3 - mulps xmm6, [rsp + nb430_r] - addps xmm6, xmm1 ;# vcoul+fijC*r - - addps xmm3, xmm11 ;# fijC+fijD+fijR - - ;# increment vctot - addps xmm1, [rsp + nb430_vctot] - movlps [rsp + nb430_vctot], xmm1 - - ;# xmm6=(vcoul+fijC*r) - xorps xmm7, xmm7 - subps xmm7, xmm6 - movaps xmm6, xmm7 - - ;# update dvdasum - addps xmm7, [rsp + nb430_dvdasum] - movlps [rsp + nb430_dvdasum], xmm7 - - ;# update j atoms dvdaj - movaps xmm5, xmm6 - shufps xmm5, xmm5, 0x1 - - ;# xmm6=dvdaj1 xmm5=dvdaj2 - addss xmm6, [rsi + rax*4] - addss xmm5, [rsi + rbx*4] - movss [rsi + rax*4], xmm6 - movss [rsi + rbx*4], xmm5 - - xorps xmm4, xmm4 - mulps xmm3, [rsp + nb430_rinv] - subps xmm4, xmm3 - - movaps xmm9, xmm4 - movaps xmm10, xmm4 - movaps xmm11, xmm4 - - mulps xmm9, [rsp + nb430_dx] - mulps xmm10, [rsp + nb430_dy] - mulps xmm11, [rsp + nb430_dz] - - - ;# accumulate i forces - movaps xmm12, [rsp + nb430_fix] - movaps xmm13, [rsp + nb430_fiy] - movaps xmm14, [rsp + nb430_fiz] - addps xmm12, xmm9 - addps xmm13, xmm10 - addps xmm14, xmm11 - movlps [rsp + nb430_fix], xmm12 - movlps [rsp + nb430_fiy], xmm13 - movlps [rsp + nb430_fiz], xmm14 - - mov rsi, [rbp + nb430_faction] - ;# the fj's - start by accumulating x & y forces from memory - movlps xmm0, [rsi + r8*4] ;# x1 y1 - - - movhps xmm0, [rsi + r9*4] ;# x1 y1 x2 y2 - - unpcklps xmm9, xmm10 ;# x1 y1 x2 y2 - addps xmm0, xmm9 - - movlps [rsi + r8*4], xmm0 - movhps [rsi + r9*4], xmm0 - - ;# z forces - pshufd xmm8, xmm11, 1 - addss xmm11, [rsi + r8*4 + 8] - addss xmm8, [rsi + r9*4 + 8] - movss [rsi + r8*4 + 8], xmm11 - movss [rsi + r9*4 + 8], xmm8 - -.nb430_checksingle: - mov edx, [rsp + nb430_innerk] - and edx, 1 - jnz .nb430_dosingle - jmp .nb430_updateouterdata -.nb430_dosingle: - mov rsi, [rbp + nb430_charge] - mov rdx, [rbp + nb430_invsqrta] - mov rdi, [rbp + nb430_pos] - mov rcx, [rsp + nb430_innerjjnr] - mov eax, [rcx] - - ;# load isaj - mov rsi, [rbp + nb430_invsqrta] - movss xmm3, [rsi + rax*4] - movaps xmm2, [rsp + nb430_isai] - mulss xmm2, xmm3 - movaps [rsp + nb430_isaprod], xmm2 - - movaps xmm1, xmm2 - mulss xmm1, [rsp + nb430_gbtsc] - movaps [rsp + nb430_gbscale], xmm1 - - mov rsi, [rbp + nb430_charge] ;# base of charge[] - - movss xmm3, [rsi + rax*4] - mulss xmm2, [rsp + nb430_iq] - mulss xmm3, xmm2 - movaps [rsp + nb430_qq], xmm3 - - ;# vdw parameters - mov rsi, [rbp + nb430_type] - mov r12d, [rsi + rax*4] - shl r12d, 1 - mov edi, [rsp + nb430_ntia] - add r12d, edi - - mov rsi, [rbp + nb430_vdwparam] - movss xmm0, [rsi + r12*4] - movss xmm3, [rsi + r12*4 + 4] - movaps [rsp + nb430_c6], xmm0 - movaps [rsp + nb430_c12], xmm3 - - mov rsi, [rbp + nb430_pos] ;# base of pos[] - - lea r8, [rax + rax*2] ;# j3 - - ;# move four coordinates to xmm0-xmm2 - movss xmm0, [rsi + r8*4] - movss xmm1, [rsi + r8*4 + 4] - movss xmm2, [rsi + r8*4 + 8] - - ;# calc dr - subss xmm0, [rsp + nb430_ix] - subss xmm1, [rsp + nb430_iy] - subss xmm2, [rsp + nb430_iz] - - ;# store dr - movaps [rsp + nb430_dx], xmm0 - movaps [rsp + nb430_dy], xmm1 - movaps [rsp + nb430_dz], xmm2 - - ;# square it - mulss xmm0,xmm0 - mulss xmm1,xmm1 - mulss xmm2,xmm2 - addss xmm0, xmm1 - addss xmm0, xmm2 - movaps xmm4, xmm0 - ;# rsq in xmm4 - - rsqrtss xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulss xmm5, xmm5 - movaps xmm1, [rsp + nb430_three] - mulss xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [rsp + nb430_half] - subss xmm1, xmm5 ;# 30-rsq*lu*lu - mulss xmm1, xmm2 - mulss xmm0, xmm1 ;# xmm0=rinv - mulss xmm4, xmm0 ;# xmm4=r - movaps [rsp + nb430_r], xmm4 - movaps [rsp + nb430_rinv], xmm0 - - movaps xmm8, xmm4 ;# r - mulss xmm4, [rsp + nb430_gbscale] ;# rgbtab - mulss xmm8, [rsp + nb430_tsc] ;# rtab - - ;# truncate and convert to integers - cvttss2si r12d, xmm4 ;# gb - cvttss2si r14d, xmm8 ;# lj - - ;# convert back to float - cvtsi2ss xmm6, r12d ;# gb - cvtsi2ss xmm10, r14d ;# lj - - ;# multiply by 4 and 8, respectively - shl r12d, 2 ;# gb - shl r14d, 3 ;# lj - - ;# GB index: r12 LJ indices: r14 - - ;# calculate eps - subss xmm4, xmm6 ;# gb - subss xmm8, xmm10 ;# lj - movaps [rsp + nb430_epsgb], xmm4 ;# gb eps - movaps [rsp + nb430_eps], xmm8 ;# lj eps - - mov rsi, [rbp + nb430_GBtab] - mov rdi, [rbp + nb430_VFtab] - - ;# load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11 - movss xmm0, [rsi + r12*4] - movss xmm1, [rsi + r12*4 + 4] - movss xmm2, [rsi + r12*4 + 8] - movss xmm3, [rsi + r12*4 + 12] - movss xmm4, [rdi + r14*4] - movss xmm5, [rdi + r14*4 + 4] - movss xmm6, [rdi + r14*4 + 8] - movss xmm7, [rdi + r14*4 + 12] - movss xmm8, [rdi + r14*4 + 16] - movss xmm9, [rdi + r14*4 + 20] - movss xmm10, [rdi + r14*4 + 24] - movss xmm11, [rdi + r14*4 + 28] - ;# table data ready. Coul in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11 - - movaps xmm12, [rsp + nb430_epsgb] - movaps xmm13, [rsp + nb430_eps] - - mulss xmm3, xmm12 ;# Heps - mulss xmm7, xmm13 - mulss xmm11, xmm13 - mulss xmm2, xmm12 ;# Geps - mulss xmm6, xmm13 - mulss xmm10, xmm13 - mulss xmm3, xmm12 ;# Heps2 - mulss xmm7, xmm13 - mulss xmm11, xmm13 - - addss xmm1, xmm2 ;# F+Geps - addss xmm5, xmm6 - addss xmm9, xmm10 - addss xmm1, xmm3 ;# F+Geps+Heps2 = Fp - addss xmm5, xmm7 - addss xmm9, xmm11 - addss xmm3, xmm3 ;# 2*Heps2 - addss xmm7, xmm7 - addss xmm11, xmm11 - addss xmm3, xmm2 ;# 2*Heps2+Geps - addss xmm7, xmm6 - addss xmm11, xmm10 - addss xmm3, xmm1 ;# FF = Fp + 2*Heps2 + Geps - addss xmm7, xmm5 - addss xmm11, xmm9 - mulss xmm1, xmm12 ;# eps*Fp - mulss xmm5, xmm13 - mulss xmm9, xmm13 - addss xmm1, xmm0 ;# VV - addss xmm5, xmm4 - addss xmm9, xmm8 - mulss xmm1, [rsp + nb430_qq] ;# VV*qq = vcoul - mulss xmm5, [rsp + nb430_c6] ;# vnb6 - mulss xmm9, [rsp + nb430_c12] ;# vnb12 - mulss xmm3, [rsp + nb430_qq] ;# FF*qq = fij - mulss xmm7, [rsp + nb430_c6] ;# fijD - mulss xmm11, [rsp + nb430_c12] ;#fijR - - addss xmm11, xmm7 ;# fijD+fijR - mulss xmm11, [rsp + nb430_tsc] ;# (fijD+fijR)*tabscale - - ;# accumulate Vvdwtot - addss xmm5, [rsp + nb430_Vvdwtot] - addss xmm5, xmm9 - movss [rsp + nb430_Vvdwtot], xmm5 - - mov rsi, [rbp + nb430_dvda] - - ;# Calculate dVda - mulss xmm3, [rsp + nb430_gbscale] ;# fijC=qq*FF*gbscale - movaps xmm6, xmm3 - mulss xmm6, [rsp + nb430_r] - addss xmm6, xmm1 ;# vcoul+fijC*r - - addss xmm3, xmm11 ;# fijC+fijD+fijR - - ;# increment vctot - addss xmm1, [rsp + nb430_vctot] - movss [rsp + nb430_vctot], xmm1 - - ;# xmm6=(vcoul+fijC*r) - xorps xmm7, xmm7 - subss xmm7, xmm6 - movaps xmm6, xmm7 - - ;# update dvdasum - addss xmm7, [rsp + nb430_dvdasum] - movss [rsp + nb430_dvdasum], xmm7 - - ;# update j atoms dvdaj - - ;# xmm6=dvdaj1 - addss xmm6, [rsi + rax*4] - movss [rsi + rax*4], xmm6 - - xorps xmm4, xmm4 - mulss xmm3, [rsp + nb430_rinv] - subss xmm4, xmm3 - - movss xmm9, xmm4 - movss xmm10, xmm4 - movss xmm11, xmm4 - - mulss xmm9, [rsp + nb430_dx] - mulss xmm10, [rsp + nb430_dy] - mulss xmm11, [rsp + nb430_dz] - - ;# accumulate i forces - movaps xmm12, [rsp + nb430_fix] - movaps xmm13, [rsp + nb430_fiy] - movaps xmm14, [rsp + nb430_fiz] - addss xmm12, xmm9 - addss xmm13, xmm10 - addss xmm14, xmm11 - movss [rsp + nb430_fix], xmm12 - movss [rsp + nb430_fiy], xmm13 - movss [rsp + nb430_fiz], xmm14 - - mov rsi, [rbp + nb430_faction] - ;# add to j forces - addss xmm9, [rsi + r8*4] - addss xmm10, [rsi + r8*4 + 4] - addss xmm11, [rsi + r8*4 + 8] - movss [rsi + r8*4], xmm9 - movss [rsi + r8*4 + 4], xmm10 - movss [rsi + r8*4 + 8], xmm11 - -.nb430_updateouterdata: - mov ecx, [rsp + nb430_ii3] - mov rdi, [rbp + nb430_faction] - mov rsi, [rbp + nb430_fshift] - mov edx, [rsp + nb430_is3] - - ;# accumulate i forces in xmm0, xmm1, xmm2 - movaps xmm0, [rsp + nb430_fix] - movaps xmm1, [rsp + nb430_fiy] - movaps xmm2, [rsp + nb430_fiz] - - movhlps xmm3, xmm0 - movhlps xmm4, xmm1 - movhlps xmm5, xmm2 - addps xmm0, xmm3 - addps xmm1, xmm4 - addps xmm2, xmm5 ;# sum is in 1/2 in xmm0-xmm2 - - movaps xmm3, xmm0 - movaps xmm4, xmm1 - movaps xmm5, xmm2 - - shufps xmm3, xmm3, 1 - shufps xmm4, xmm4, 1 - shufps xmm5, xmm5, 1 - addss xmm0, xmm3 - addss xmm1, xmm4 - addss xmm2, xmm5 ;# xmm0-xmm2 has single force in pos0 - - ;# increment i force - movss xmm3, [rdi + rcx*4] - movss xmm4, [rdi + rcx*4 + 4] - movss xmm5, [rdi + rcx*4 + 8] - subss xmm3, xmm0 - subss xmm4, xmm1 - subss xmm5, xmm2 - movss [rdi + rcx*4], xmm3 - movss [rdi + rcx*4 + 4], xmm4 - movss [rdi + rcx*4 + 8], xmm5 - - ;# increment fshift force - movss xmm3, [rsi + rdx*4] - movss xmm4, [rsi + rdx*4 + 4] - movss xmm5, [rsi + rdx*4 + 8] - subss xmm3, xmm0 - subss xmm4, xmm1 - subss xmm5, xmm2 - movss [rsi + rdx*4], xmm3 - movss [rsi + rdx*4 + 4], xmm4 - movss [rsi + rdx*4 + 8], xmm5 - - ;# get n from stack - mov esi, [rsp + nb430_n] - ;# get group index for i particle - mov rdx, [rbp + nb430_gid] ;# base of gid[] - mov edx, [rdx + rsi*4] ;# ggid=gid[n] - - ;# accumulate total potential energy and update it - movaps xmm7, [rsp + nb430_vctot] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - ;# add earlier value from mem - mov rax, [rbp + nb430_Vc] - addss xmm7, [rax + rdx*4] - ;# move back to mem - movss [rax + rdx*4], xmm7 - - ;# accumulate total lj energy and update it - movaps xmm7, [rsp + nb430_Vvdwtot] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - ;# add earlier value from mem - mov rax, [rbp + nb430_Vvdw] - addss xmm7, [rax + rdx*4] - ;# move back to mem - movss [rax + rdx*4], xmm7 - - ;# accumulate dVda and update it - movaps xmm7, [rsp + nb430_dvdasum] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - mov edx, [rsp + nb430_ii] - mov rax, [rbp + nb430_dvda] - addss xmm7, [rax + rdx*4] - movss [rax + rdx*4], xmm7 - - ;# finish if last - mov ecx, [rsp + nb430_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb430_outerend - - ;# not last, iterate outer loop once more! - mov [rsp + nb430_n], esi - jmp .nb430_outer -.nb430_outerend: - ;# check if more outer neighborlists remain - mov ecx, [rsp + nb430_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb430_end - ;# non-zero, do one more workunit - jmp .nb430_threadloop -.nb430_end: - mov eax, [rsp + nb430_nouter] - mov ebx, [rsp + nb430_ninner] - mov rcx, [rbp + nb430_outeriter] - mov rdx, [rbp + nb430_inneriter] - mov [rcx], eax - mov [rdx], ebx - - add rsp, 552 - emms - - - pop r15 - pop r14 - pop r13 - pop r12 - - pop rbx - pop rbp - ret - - - - - -.globl nb_kernel430nf_x86_64_sse -.globl _nb_kernel430nf_x86_64_sse -nb_kernel430nf_x86_64_sse: -_nb_kernel430nf_x86_64_sse: -;# Room for return address and rbp (16 bytes) -.equiv nb430nf_fshift, 16 -.equiv nb430nf_gid, 24 -.equiv nb430nf_pos, 32 -.equiv nb430nf_faction, 40 -.equiv nb430nf_charge, 48 -.equiv nb430nf_p_facel, 56 -.equiv nb430nf_argkrf, 64 -.equiv nb430nf_argcrf, 72 -.equiv nb430nf_Vc, 80 -.equiv nb430nf_type, 88 -.equiv nb430nf_p_ntype, 96 -.equiv nb430nf_vdwparam, 104 -.equiv nb430nf_Vvdw, 112 -.equiv nb430nf_p_tabscale, 120 -.equiv nb430nf_VFtab, 128 -.equiv nb430nf_invsqrta, 136 -.equiv nb430nf_dvda, 144 -.equiv nb430nf_p_gbtabscale, 152 -.equiv nb430nf_GBtab, 160 -.equiv nb430nf_p_nthreads, 168 -.equiv nb430nf_count, 176 -.equiv nb430nf_mtx, 184 -.equiv nb430nf_outeriter, 192 -.equiv nb430nf_inneriter, 200 -.equiv nb430nf_work, 208 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse use -.equiv nb430nf_ix, 0 -.equiv nb430nf_iy, 16 -.equiv nb430nf_iz, 32 -.equiv nb430nf_iq, 48 -.equiv nb430nf_gbtsc, 64 -.equiv nb430nf_tsc, 80 -.equiv nb430nf_qq, 96 -.equiv nb430nf_c6, 112 -.equiv nb430nf_c12, 128 -.equiv nb430nf_vctot, 144 -.equiv nb430nf_Vvdwtot, 160 -.equiv nb430nf_half, 176 -.equiv nb430nf_three, 192 -.equiv nb430nf_isai, 208 -.equiv nb430nf_isaprod, 224 -.equiv nb430nf_gbscale, 240 -.equiv nb430nf_r, 256 -.equiv nb430nf_nri, 272 -.equiv nb430nf_iinr, 280 -.equiv nb430nf_jindex, 288 -.equiv nb430nf_jjnr, 296 -.equiv nb430nf_shift, 304 -.equiv nb430nf_shiftvec, 312 -.equiv nb430nf_facel, 320 -.equiv nb430nf_innerjjnr, 328 -.equiv nb430nf_is3, 336 -.equiv nb430nf_ii3, 340 -.equiv nb430nf_ntia, 344 -.equiv nb430nf_innerk, 348 -.equiv nb430nf_n, 352 -.equiv nb430nf_nn1, 356 -.equiv nb430nf_ntype, 360 -.equiv nb430nf_nouter, 364 -.equiv nb430nf_ninner, 368 - - push rbp - mov rbp, rsp - push rbx - - - emms - - push r12 - push r13 - push r14 - push r15 - - sub rsp, 392 ;# local variable stack space (n*16+8) - - ;# zero 32-bit iteration counters - mov eax, 0 - mov [rsp + nb430nf_nouter], eax - mov [rsp + nb430nf_ninner], eax - - mov edi, [rdi] - mov [rsp + nb430nf_nri], edi - mov [rsp + nb430nf_iinr], rsi - mov [rsp + nb430nf_jindex], rdx - mov [rsp + nb430nf_jjnr], rcx - mov [rsp + nb430nf_shift], r8 - mov [rsp + nb430nf_shiftvec], r9 - mov rdi, [rbp + nb430nf_p_ntype] - mov edi, [rdi] - mov [rsp + nb430nf_ntype], edi - mov rsi, [rbp + nb430nf_p_facel] - movss xmm0, [rsi] - movss [rsp + nb430nf_facel], xmm0 - - mov rax, [rbp + nb430nf_p_tabscale] - movss xmm3, [rax] - shufps xmm3, xmm3, 0 - movaps [rsp + nb430nf_tsc], xmm3 - - mov rbx, [rbp + nb430nf_p_gbtabscale] - movss xmm4, [rbx] - shufps xmm4, xmm4, 0 - movaps [rsp + nb430nf_gbtsc], xmm4 - - ;# create constant floating-point factors on stack - mov eax, 0x3f000000 ;# half in IEEE (hex) - mov [rsp + nb430nf_half], eax - movss xmm1, [rsp + nb430nf_half] - shufps xmm1, xmm1, 0 ;# splat to all elements - movaps xmm2, xmm1 - addps xmm2, xmm2 ;# one - movaps xmm3, xmm2 - addps xmm2, xmm2 ;# two - addps xmm3, xmm2 ;# three - movaps [rsp + nb430nf_half], xmm1 - movaps [rsp + nb430nf_three], xmm3 - -.nb430nf_threadloop: - mov rsi, [rbp + nb430nf_count] ;# pointer to sync counter - mov eax, [rsi] -.nb430nf_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb430nf_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [rsp + nb430nf_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [rsp + nb430nf_n], eax - mov [rsp + nb430nf_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb430nf_outerstart - jmp .nb430nf_end - -.nb430nf_outerstart: - ;# ebx contains number of outer iterations - add ebx, [rsp + nb430nf_nouter] - mov [rsp + nb430nf_nouter], ebx - -.nb430nf_outer: - mov rax, [rsp + nb430nf_shift] ;# rax = pointer into shift[] - mov ebx, [rax + rsi*4] ;# ebx=shift[n] - - lea rbx, [rbx + rbx*2] ;# rbx=3*is - mov [rsp + nb430nf_is3],ebx ;# store is3 - - mov rax, [rsp + nb430nf_shiftvec] ;# rax = base of shiftvec[] - - movss xmm0, [rax + rbx*4] - movss xmm1, [rax + rbx*4 + 4] - movss xmm2, [rax + rbx*4 + 8] - - mov rcx, [rsp + nb430nf_iinr] ;# rcx = pointer into iinr[] - mov ebx, [rcx + rsi*4] ;# ebx =ii - - mov rdx, [rbp + nb430nf_charge] - movss xmm3, [rdx + rbx*4] - mulss xmm3, [rsp + nb430nf_facel] - shufps xmm3, xmm3, 0 - - mov rdx, [rbp + nb430nf_invsqrta] ;# load invsqrta[ii] - movss xmm4, [rdx + rbx*4] - shufps xmm4, xmm4, 0 - - mov rdx, [rbp + nb430nf_type] - mov edx, [rdx + rbx*4] - imul edx, [rsp + nb430nf_ntype] - shl edx, 1 - mov [rsp + nb430nf_ntia], edx - - lea rbx, [rbx + rbx*2] ;# rbx = 3*ii=ii3 - mov rax, [rbp + nb430nf_pos] ;# rax = base of pos[] - - addss xmm0, [rax + rbx*4] - addss xmm1, [rax + rbx*4 + 4] - addss xmm2, [rax + rbx*4 + 8] - - movaps [rsp + nb430nf_iq], xmm3 - movaps [rsp + nb430nf_isai], xmm4 - - shufps xmm0, xmm0, 0 - shufps xmm1, xmm1, 0 - shufps xmm2, xmm2, 0 - - movaps [rsp + nb430nf_ix], xmm0 - movaps [rsp + nb430nf_iy], xmm1 - movaps [rsp + nb430nf_iz], xmm2 - - mov [rsp + nb430nf_ii3], ebx - - ;# clear vctot - xorps xmm4, xmm4 - movaps [rsp + nb430nf_vctot], xmm4 - movaps [rsp + nb430nf_Vvdwtot], xmm4 - - mov rax, [rsp + nb430nf_jindex] - mov ecx, [rax + rsi*4] ;# jindex[n] - mov edx, [rax + rsi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov rsi, [rbp + nb430nf_pos] - mov rdi, [rbp + nb430nf_faction] - mov rax, [rsp + nb430nf_jjnr] - shl ecx, 2 - add rax, rcx - mov [rsp + nb430nf_innerjjnr], rax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 4 - add ecx, [rsp + nb430nf_ninner] - mov [rsp + nb430nf_ninner], ecx - add edx, 0 - mov [rsp + nb430nf_innerk], edx ;# number of innerloop atoms - jge .nb430nf_unroll_loop - jmp .nb430nf_finish_inner -.nb430nf_unroll_loop: - ;# quad-unroll innerloop here - mov rdx, [rsp + nb430nf_innerjjnr] ;# pointer to jjnr[k] - mov eax, [rdx] - mov ebx, [rdx + 4] - mov ecx, [rdx + 8] - mov edx, [rdx + 12] ;# eax-edx=jnr1-4 - add qword ptr [rsp + nb430nf_innerjjnr], 16 ;# advance pointer (unrolled 4) - - ;# load isa2 - mov rsi, [rbp + nb430nf_invsqrta] - movss xmm3, [rsi + rax*4] - movss xmm4, [rsi + rcx*4] - movss xmm6, [rsi + rbx*4] - movss xmm7, [rsi + rdx*4] - movaps xmm2, [rsp + nb430nf_isai] - shufps xmm3, xmm6, 0 - shufps xmm4, xmm7, 0 - shufps xmm3, xmm4, 136 ;# 10001000 ;# all charges in xmm3 - mulps xmm2, xmm3 - - movaps [rsp + nb430nf_isaprod], xmm2 - movaps xmm1, xmm2 - mulps xmm1, [rsp + nb430nf_gbtsc] - movaps [rsp + nb430nf_gbscale], xmm1 - - mov rsi, [rbp + nb430nf_charge] ;# base of charge[] - - movss xmm3, [rsi + rax*4] - movss xmm4, [rsi + rcx*4] - movss xmm6, [rsi + rbx*4] - movss xmm7, [rsi + rdx*4] - - mulps xmm2, [rsp + nb430nf_iq] - shufps xmm3, xmm6, 0 - shufps xmm4, xmm7, 0 - shufps xmm3, xmm4, 136 ;# 10001000 ;# all charges in xmm3 - mulps xmm3, xmm2 - movaps [rsp + nb430nf_qq], xmm3 - - movd mm0, eax ;# use mmx registers as temp storage - movd mm1, ebx - movd mm2, ecx - movd mm3, edx - - mov rsi, [rbp + nb430nf_type] - mov eax, [rsi + rax*4] - mov ebx, [rsi + rbx*4] - mov ecx, [rsi + rcx*4] - mov edx, [rsi + rdx*4] - mov rsi, [rbp + nb430nf_vdwparam] - shl eax, 1 - shl ebx, 1 - shl ecx, 1 - shl edx, 1 - mov edi, [rsp + nb430nf_ntia] - add eax, edi - add ebx, edi - add ecx, edi - add edx, edi - - movlps xmm6, [rsi + rax*4] - movlps xmm7, [rsi + rcx*4] - movhps xmm6, [rsi + rbx*4] - movhps xmm7, [rsi + rdx*4] - - movaps xmm4, xmm6 - shufps xmm4, xmm7, 136 ;# 10001000 - shufps xmm6, xmm7, 221 ;# 11011101 - - movd eax, mm0 - movd ebx, mm1 - movd ecx, mm2 - movd edx, mm3 - - movaps [rsp + nb430nf_c6], xmm4 - movaps [rsp + nb430nf_c12], xmm6 - - mov rsi, [rbp + nb430nf_pos] ;# base of pos[] - - lea rax, [rax + rax*2] ;# replace jnr with j3 - lea rbx, [rbx + rbx*2] - - lea rcx, [rcx + rcx*2] ;# replace jnr with j3 - lea rdx, [rdx + rdx*2] - - ;# move four coordinates to xmm0-xmm2 - - movlps xmm4, [rsi + rax*4] - movlps xmm5, [rsi + rcx*4] - movss xmm2, [rsi + rax*4 + 8] - movss xmm6, [rsi + rcx*4 + 8] - - movhps xmm4, [rsi + rbx*4] - movhps xmm5, [rsi + rdx*4] - - movss xmm0, [rsi + rbx*4 + 8] - movss xmm1, [rsi + rdx*4 + 8] - - shufps xmm2, xmm0, 0 - shufps xmm6, xmm1, 0 - - movaps xmm0, xmm4 - movaps xmm1, xmm4 - - shufps xmm2, xmm6, 136 ;# 10001000 - - shufps xmm0, xmm5, 136 ;# 10001000 - shufps xmm1, xmm5, 221 ;# 11011101 - - ;# move ix-iz to xmm4-xmm6 - movaps xmm4, [rsp + nb430nf_ix] - movaps xmm5, [rsp + nb430nf_iy] - movaps xmm6, [rsp + nb430nf_iz] - - ;# calc dr - subps xmm4, xmm0 - subps xmm5, xmm1 - subps xmm6, xmm2 - - ;# square it - mulps xmm4,xmm4 - mulps xmm5,xmm5 - mulps xmm6,xmm6 - addps xmm4, xmm5 - addps xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [rsp + nb430nf_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [rsp + nb430nf_half] - subps xmm1, xmm5 ;# 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - movaps [rsp + nb430nf_r], xmm4 - mulps xmm4, [rsp + nb430nf_gbscale] - - movhlps xmm5, xmm4 - cvttps2pi mm6, xmm4 - cvttps2pi mm7, xmm5 ;# mm6/mm7 contain lu indices - cvtpi2ps xmm6, mm6 - cvtpi2ps xmm5, mm7 - movlhps xmm6, xmm5 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - pslld mm6, 2 - pslld mm7, 2 - - movd mm0, eax - movd mm1, ebx - movd mm2, ecx - movd mm3, edx - - mov rsi, [rbp + nb430nf_GBtab] - movd eax, mm6 - psrlq mm6, 32 - movd ecx, mm7 - psrlq mm7, 32 - movd ebx, mm6 - movd edx, mm7 - - ;# load coulomb table - movaps xmm4, [rsi + rax*4] - movaps xmm5, [rsi + rbx*4] - movaps xmm6, [rsi + rcx*4] - movaps xmm7, [rsi + rdx*4] - ;# transpose, using xmm3 for scratch - movaps xmm3, xmm6 - shufps xmm3, xmm7, 0xEE - shufps xmm6, xmm7, 0x44 - movaps xmm7, xmm4 - shufps xmm7, xmm5, 0xEE - shufps xmm4, xmm5, 0x44 - movaps xmm5, xmm4 - shufps xmm5, xmm6, 0xDD - shufps xmm4, xmm6, 0x88 - movaps xmm6, xmm7 - shufps xmm6, xmm3, 0x88 - shufps xmm7, xmm3, 0xDD - ;# coulomb table ready, in xmm4-xmm7 - - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - movaps xmm3, [rsp + nb430nf_qq] - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, xmm3 ;# vcoul=qq*VV - addps xmm5, [rsp + nb430nf_vctot] - movaps [rsp + nb430nf_vctot], xmm5 - - - movaps xmm4, [rsp + nb430nf_r] - mulps xmm4, [rsp + nb430nf_tsc] - - movhlps xmm5, xmm4 - cvttps2pi mm6, xmm4 - cvttps2pi mm7, xmm5 ;# mm6/mm7 contain lu indices - cvtpi2ps xmm6, mm6 - cvtpi2ps xmm5, mm7 - movlhps xmm6, xmm5 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - pslld mm6, 3 - pslld mm7, 3 - - mov rsi, [rbp + nb430nf_VFtab] - movd eax, mm6 - psrlq mm6, 32 - movd ecx, mm7 - psrlq mm7, 32 - movd ebx, mm6 - movd edx, mm7 - - ;# dispersion - movaps xmm4, [rsi + rax*4] - movaps xmm5, [rsi + rbx*4] - movaps xmm6, [rsi + rcx*4] - movaps xmm7, [rsi + rdx*4] - ;# transpose, using xmm3 for scratch - movaps xmm3, xmm6 - shufps xmm3, xmm7, 0xEE - shufps xmm6, xmm7, 0x44 - movaps xmm7, xmm4 - shufps xmm7, xmm5, 0xEE - shufps xmm4, xmm5, 0x44 - movaps xmm5, xmm4 - shufps xmm5, xmm6, 0xDD - shufps xmm4, xmm6, 0x88 - movaps xmm6, xmm7 - shufps xmm6, xmm3, 0x88 - shufps xmm7, xmm3, 0xDD - ;# dispersion table ready, in xmm4-xmm7 - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, [rsp + nb430nf_c6] ;# Vvdw6 - addps xmm5, [rsp + nb430nf_Vvdwtot] - movaps [rsp + nb430nf_Vvdwtot], xmm5 - - ;# repulsion - movaps xmm4, [rsi + rax*4 + 16] - movaps xmm5, [rsi + rbx*4 + 16] - movaps xmm6, [rsi + rcx*4 + 16] - movaps xmm7, [rsi + rdx*4 + 16] - ;# transpose, using xmm3 for scratch - movaps xmm3, xmm6 - shufps xmm3, xmm7, 0xEE - shufps xmm6, xmm7, 0x44 - movaps xmm7, xmm4 - shufps xmm7, xmm5, 0xEE - shufps xmm4, xmm5, 0x44 - movaps xmm5, xmm4 - shufps xmm5, xmm6, 0xDD - shufps xmm4, xmm6, 0x88 - movaps xmm6, xmm7 - shufps xmm6, xmm3, 0x88 - shufps xmm7, xmm3, 0xDD - ;# table ready, in xmm4-xmm7 - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - - mulps xmm5, [rsp + nb430nf_c12] ;# Vvdw12 - addps xmm5, [rsp + nb430nf_Vvdwtot] - movaps [rsp + nb430nf_Vvdwtot], xmm5 - - ;# should we do one more iteration? - sub dword ptr [rsp + nb430nf_innerk], 4 - jl .nb430nf_finish_inner - jmp .nb430nf_unroll_loop -.nb430nf_finish_inner: - ;# check if at least two particles remain - add dword ptr [rsp + nb430nf_innerk], 4 - mov edx, [rsp + nb430nf_innerk] - and edx, 2 - jnz .nb430nf_dopair - jmp .nb430nf_checksingle -.nb430nf_dopair: - - mov rcx, [rsp + nb430nf_innerjjnr] - - mov eax, [rcx] - mov ebx, [rcx + 4] - add qword ptr [rsp + nb430nf_innerjjnr], 8 - - xorps xmm2, xmm2 - movaps xmm6, xmm2 - - ;# load isa2 - mov rsi, [rbp + nb430nf_invsqrta] - movss xmm2, [rsi + rax*4] - movss xmm3, [rsi + rbx*4] - unpcklps xmm2, xmm3 ;# isa2 in xmm3(0,1) - mulps xmm2, [rsp + nb430nf_isai] - movaps [rsp + nb430nf_isaprod], xmm2 - movaps xmm1, xmm2 - mulps xmm1, [rsp + nb430nf_gbtsc] - movaps [rsp + nb430nf_gbscale], xmm1 - - mov rsi, [rbp + nb430nf_charge] ;# base of charge[] - movss xmm3, [rsi + rax*4] - movss xmm6, [rsi + rbx*4] - unpcklps xmm3, xmm6 ;# 00001000 ;# xmm3(0,1) has the charges - - mulps xmm2, [rsp + nb430nf_iq] - mulps xmm3, xmm2 - movaps [rsp + nb430nf_qq], xmm3 - - mov rsi, [rbp + nb430nf_type] - mov ecx, eax - mov edx, ebx - mov ecx, [rsi + rcx*4] - mov edx, [rsi + rdx*4] - mov rsi, [rbp + nb430nf_vdwparam] - shl ecx, 1 - shl edx, 1 - mov edi, [rsp + nb430nf_ntia] - add ecx, edi - add edx, edi - movlps xmm6, [rsi + rcx*4] - movhps xmm6, [rsi + rdx*4] - mov rdi, [rbp + nb430nf_pos] - - movaps xmm4, xmm6 - shufps xmm4, xmm4, 8 ;# 00001000 - shufps xmm6, xmm6, 13 ;# 00001101 - movlhps xmm4, xmm7 - movlhps xmm6, xmm7 - - movaps [rsp + nb430nf_c6], xmm4 - movaps [rsp + nb430nf_c12], xmm6 - - lea rax, [rax + rax*2] - lea rbx, [rbx + rbx*2] - ;# move coordinates to xmm0-xmm2 - movlps xmm1, [rdi + rax*4] - movss xmm2, [rdi + rax*4 + 8] - movhps xmm1, [rdi + rbx*4] - movss xmm0, [rdi + rbx*4 + 8] - - movlhps xmm3, xmm7 - - shufps xmm2, xmm0, 0 - - movaps xmm0, xmm1 - - shufps xmm2, xmm2, 136 ;# 10001000 - - shufps xmm0, xmm0, 136 ;# 10001000 - shufps xmm1, xmm1, 221 ;# 11011101 - - mov rdi, [rbp + nb430nf_faction] - ;# move ix-iz to xmm4-xmm6 - xorps xmm7, xmm7 - - movaps xmm4, [rsp + nb430nf_ix] - movaps xmm5, [rsp + nb430nf_iy] - movaps xmm6, [rsp + nb430nf_iz] - - ;# calc dr - subps xmm4, xmm0 - subps xmm5, xmm1 - subps xmm6, xmm2 - - ;# square it - mulps xmm4,xmm4 - mulps xmm5,xmm5 - mulps xmm6,xmm6 - addps xmm4, xmm5 - addps xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtps xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulps xmm5, xmm5 - movaps xmm1, [rsp + nb430nf_three] - mulps xmm5, xmm4 ;# rsq*lu*lu - movaps xmm0, [rsp + nb430nf_half] - subps xmm1, xmm5 ;# 30-rsq*lu*lu - mulps xmm1, xmm2 - mulps xmm0, xmm1 ;# xmm0=rinv - mulps xmm4, xmm0 ;# xmm4=r - movaps [rsp + nb430nf_r], xmm4 - mulps xmm4, [rsp + nb430nf_gbscale] - - cvttps2pi mm6, xmm4 ;# mm6 contain lu indices - cvtpi2ps xmm6, mm6 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - - pslld mm6, 2 - - mov rsi, [rbp + nb430nf_GBtab] - movd ecx, mm6 - psrlq mm6, 32 - movd edx, mm6 - - ;# load coulomb table - movaps xmm4, [rsi + rcx*4] - movaps xmm7, [rsi + rdx*4] - ;# transpose, using xmm3 for scratch - movaps xmm6, xmm4 - unpcklps xmm4, xmm7 ;# Y1 Y2 F1 F2 - unpckhps xmm6, xmm7 ;# G1 G2 H1 H2 - movhlps xmm5, xmm4 ;# F1 F2 - movhlps xmm7, xmm6 ;# H1 H2 - ;# coulomb table ready, in xmm4-xmm7 - - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - movaps xmm3, [rsp + nb430nf_qq] - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - mulps xmm5, xmm3 ;# vcoul=qq*VV - addps xmm5, [rsp + nb430nf_vctot] - movaps [rsp + nb430nf_vctot], xmm5 - - movaps xmm4, [rsp + nb430nf_r] - mulps xmm4, [rsp + nb430nf_tsc] - - cvttps2pi mm6, xmm4 - cvtpi2ps xmm6, mm6 - subps xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulps xmm2, xmm2 ;# xmm2=eps2 - pslld mm6, 3 - - mov rsi, [rbp + nb430nf_VFtab] - movd ecx, mm6 - psrlq mm6, 32 - movd edx, mm6 - - ;# dispersion - movaps xmm4, [rsi + rcx*4] - movaps xmm7, [rsi + rdx*4] - ;# transpose, using xmm3 for scratch - movaps xmm6, xmm4 - unpcklps xmm4, xmm7 ;# Y1 Y2 F1 F2 - unpckhps xmm6, xmm7 ;# G1 G2 H1 H2 - movhlps xmm5, xmm4 ;# F1 F2 - movhlps xmm7, xmm6 ;# H1 H2 - ;# dispersion table ready, in xmm4-xmm7 - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - - mulps xmm5, [rsp + nb430nf_c6] ;# Vvdw6 - addps xmm5, [rsp + nb430nf_Vvdwtot] - movaps [rsp + nb430nf_Vvdwtot], xmm5 - - ;# repulsion - movaps xmm4, [rsi + rcx*4 + 16] - movaps xmm7, [rsi + rdx*4 + 16] - ;# transpose, using xmm3 for scratch - movaps xmm6, xmm4 - unpcklps xmm4, xmm7 ;# Y1 Y2 F1 F2 - unpckhps xmm6, xmm7 ;# G1 G2 H1 H2 - movhlps xmm5, xmm4 ;# F1 F2 - movhlps xmm7, xmm6 ;# H1 H2 - ;# table ready, in xmm4-xmm7 - mulps xmm6, xmm1 ;# xmm6=Geps - mulps xmm7, xmm2 ;# xmm7=Heps2 - addps xmm5, xmm6 - addps xmm5, xmm7 ;# xmm5=Fp - mulps xmm5, xmm1 ;# xmm5=eps*Fp - addps xmm5, xmm4 ;# xmm5=VV - - mulps xmm5, [rsp + nb430nf_c12] ;# Vvdw12 - - addps xmm5, [rsp + nb430nf_Vvdwtot] - movaps [rsp + nb430nf_Vvdwtot], xmm5 -.nb430nf_checksingle: - mov edx, [rsp + nb430nf_innerk] - and edx, 1 - jnz .nb430nf_dosingle - jmp .nb430nf_updateouterdata -.nb430nf_dosingle: - mov rsi, [rbp + nb430nf_charge] - mov rdx, [rbp + nb430nf_invsqrta] - mov rdi, [rbp + nb430nf_pos] - mov rcx, [rsp + nb430nf_innerjjnr] - mov eax, [rcx] - xorps xmm2, xmm2 - movaps xmm6, xmm2 - movss xmm2, [rdx + rax*4] ;# isa2 - mulss xmm2, [rsp + nb430nf_isai] - movss [rsp + nb430nf_isaprod], xmm2 - movss xmm1, xmm2 - mulss xmm1, [rsp + nb430nf_gbtsc] - movss [rsp + nb430nf_gbscale], xmm1 - - mulss xmm2, [rsp + nb430nf_iq] - movss xmm6, [rsi + rax*4] ;# xmm6(0) has the charge - mulss xmm6, xmm2 - movss [rsp + nb430nf_qq], xmm6 - - mov rsi, [rbp + nb430nf_type] - mov ecx, eax - mov ecx, [rsi + rcx*4] - mov rsi, [rbp + nb430nf_vdwparam] - shl ecx, 1 - add ecx, [rsp + nb430nf_ntia] - movlps xmm6, [rsi + rcx*4] - movaps xmm4, xmm6 - shufps xmm4, xmm4, 252 ;# 11111100 - shufps xmm6, xmm6, 253 ;# 11111101 - - movss [rsp + nb430nf_c6], xmm4 - movss [rsp + nb430nf_c12], xmm6 - - lea rax, [rax + rax*2] - - ;# move coordinates to xmm0-xmm2 - movss xmm0, [rdi + rax*4] - movss xmm1, [rdi + rax*4 + 4] - movss xmm2, [rdi + rax*4 + 8] - - movss xmm4, [rsp + nb430nf_ix] - movss xmm5, [rsp + nb430nf_iy] - movss xmm6, [rsp + nb430nf_iz] - - ;# calc dr - subss xmm4, xmm0 - subss xmm5, xmm1 - subss xmm6, xmm2 - - ;# square it - mulss xmm4,xmm4 - mulss xmm5,xmm5 - mulss xmm6,xmm6 - addss xmm4, xmm5 - addss xmm4, xmm6 - ;# rsq in xmm4 - - rsqrtss xmm5, xmm4 - ;# lookup seed in xmm5 - movaps xmm2, xmm5 - mulss xmm5, xmm5 - movss xmm1, [rsp + nb430nf_three] - mulss xmm5, xmm4 ;# rsq*lu*lu - movss xmm0, [rsp + nb430nf_half] - subss xmm1, xmm5 ;# 30-rsq*lu*lu - mulss xmm1, xmm2 - mulss xmm0, xmm1 ;# xmm0=rinv - - mulss xmm4, xmm0 ;# xmm4=r - movaps [rsp + nb430nf_r], xmm4 - mulss xmm4, [rsp + nb430nf_gbscale] - - cvttss2si ebx, xmm4 ;# mm6 contain lu indices - cvtsi2ss xmm6, ebx - subss xmm4, xmm6 - movaps xmm1, xmm4 ;# xmm1=eps - movaps xmm2, xmm1 - mulss xmm2, xmm2 ;# xmm2=eps2 - - shl ebx, 2 - - mov rsi, [rbp + nb430nf_GBtab] - - movaps xmm4, [rsi + rbx*4] - movhlps xmm6, xmm4 - movaps xmm5, xmm4 - movaps xmm7, xmm6 - shufps xmm5, xmm5, 1 - shufps xmm7, xmm7, 1 - ;# table ready in xmm4-xmm7 - - mulss xmm6, xmm1 ;# xmm6=Geps - mulss xmm7, xmm2 ;# xmm7=Heps2 - addss xmm5, xmm6 - addss xmm5, xmm7 ;# xmm5=Fp - movss xmm3, [rsp + nb430nf_qq] - mulss xmm5, xmm1 ;# xmm5=eps*Fp - addss xmm5, xmm4 ;# xmm5=VV - mulss xmm5, xmm3 ;# vcoul=qq*VV - addss xmm5, [rsp + nb430nf_vctot] - movss [rsp + nb430nf_vctot], xmm5 - - movss xmm4, [rsp + nb430nf_r] - mulps xmm4, [rsp + nb430nf_tsc] - - cvttss2si ebx, xmm4 - cvtsi2ss xmm6, ebx - subss xmm4, xmm6 - movss xmm1, xmm4 ;# xmm1=eps - movss xmm2, xmm1 - mulss xmm2, xmm2 ;# xmm2=eps2 - - shl ebx, 3 - mov rsi, [rbp + nb430nf_VFtab] - - ;# dispersion - movaps xmm4, [rsi + rbx*4] - movhlps xmm6, xmm4 - movaps xmm5, xmm4 - movaps xmm7, xmm6 - shufps xmm5, xmm5, 1 - shufps xmm7, xmm7, 1 - ;# table ready in xmm4-xmm7 - - mulss xmm6, xmm1 ;# xmm6=Geps - mulss xmm7, xmm2 ;# xmm7=Heps2 - addss xmm5, xmm6 - addss xmm5, xmm7 ;# xmm5=Fp - mulss xmm5, xmm1 ;# xmm5=eps*Fp - addss xmm5, xmm4 ;# xmm5=VV - mulss xmm5, [rsp + nb430nf_c6] ;# Vvdw6 - addss xmm5, [rsp + nb430nf_Vvdwtot] - movss [rsp + nb430nf_Vvdwtot], xmm5 - - ;# repulsion - movaps xmm4, [rsi + rbx*4 + 16] - movhlps xmm6, xmm4 - movaps xmm5, xmm4 - movaps xmm7, xmm6 - shufps xmm5, xmm5, 1 - shufps xmm7, xmm7, 1 - ;# table ready in xmm4-xmm7 - - mulss xmm6, xmm1 ;# xmm6=Geps - mulss xmm7, xmm2 ;# xmm7=Heps2 - addss xmm5, xmm6 - addss xmm5, xmm7 ;# xmm5=Fp - mulss xmm5, xmm1 ;# xmm5=eps*Fp - addss xmm5, xmm4 ;# xmm5=VV - - mulss xmm5, [rsp + nb430nf_c12] ;# Vvdw12 - - addss xmm5, [rsp + nb430nf_Vvdwtot] - movss [rsp + nb430nf_Vvdwtot], xmm5 - -.nb430nf_updateouterdata: - ;# get n from stack - mov esi, [rsp + nb430nf_n] - ;# get group index for i particle - mov rdx, [rbp + nb430nf_gid] ;# base of gid[] - mov edx, [rdx + rsi*4] ;# ggid=gid[n] - - ;# accumulate total potential energy and update it - movaps xmm7, [rsp + nb430nf_vctot] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - ;# add earlier value from mem - mov rax, [rbp + nb430nf_Vc] - addss xmm7, [rax + rdx*4] - ;# move back to mem - movss [rax + rdx*4], xmm7 - - ;# accumulate total lj energy and update it - movaps xmm7, [rsp + nb430nf_Vvdwtot] - ;# accumulate - movhlps xmm6, xmm7 - addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now - movaps xmm6, xmm7 - shufps xmm6, xmm6, 1 - addss xmm7, xmm6 - - ;# add earlier value from mem - mov rax, [rbp + nb430nf_Vvdw] - addss xmm7, [rax + rdx*4] - ;# move back to mem - movss [rax + rdx*4], xmm7 - - ;# finish if last - mov ecx, [rsp + nb430nf_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb430nf_outerend - - ;# not last, iterate outer loop once more! - mov [rsp + nb430nf_n], esi - jmp .nb430nf_outer -.nb430nf_outerend: - ;# check if more outer neighborlists remain - mov ecx, [rsp + nb430nf_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb430nf_end - ;# non-zero, do one more workunit - jmp .nb430nf_threadloop -.nb430nf_end: - - mov eax, [rsp + nb430nf_nouter] - mov ebx, [rsp + nb430nf_ninner] - mov rcx, [rbp + nb430nf_outeriter] - mov rdx, [rbp + nb430nf_inneriter] - mov [rcx], eax - mov [rdx], ebx - - add rsp, 392 - emms - - - pop r15 - pop r14 - pop r13 - pop r12 - - pop rbx - pop rbp - ret - - - - - diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.s deleted file mode 100644 index b25797c2a4..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel430_x86_64_sse.s +++ /dev/null @@ -1,2306 +0,0 @@ -## -## -## Gromacs 4.0 Copyright (c) 1991-2003 -## David van der Spoel, Erik Lindahl -## -## This program is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License -## as published by the Free Software Foundation; either version 2 -## of the License, or (at your option) any later version. -## -## To help us fund GROMACS development, we humbly ask that you cite -## the research papers on the package. Check out http://www.gromacs.org -## -## And Hey: -## Gnomes, ROck Monsters And Chili Sauce -## - - - - - - - -.globl nb_kernel430_x86_64_sse -.globl _nb_kernel430_x86_64_sse -nb_kernel430_x86_64_sse: -_nb_kernel430_x86_64_sse: -## Room for return address and rbp (16 bytes) -.set nb430_fshift, 16 -.set nb430_gid, 24 -.set nb430_pos, 32 -.set nb430_faction, 40 -.set nb430_charge, 48 -.set nb430_p_facel, 56 -.set nb430_argkrf, 64 -.set nb430_argcrf, 72 -.set nb430_Vc, 80 -.set nb430_type, 88 -.set nb430_p_ntype, 96 -.set nb430_vdwparam, 104 -.set nb430_Vvdw, 112 -.set nb430_p_tabscale, 120 -.set nb430_VFtab, 128 -.set nb430_invsqrta, 136 -.set nb430_dvda, 144 -.set nb430_p_gbtabscale, 152 -.set nb430_GBtab, 160 -.set nb430_p_nthreads, 168 -.set nb430_count, 176 -.set nb430_mtx, 184 -.set nb430_outeriter, 192 -.set nb430_inneriter, 200 -.set nb430_work, 208 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse use -.set nb430_ix, 0 -.set nb430_iy, 16 -.set nb430_iz, 32 -.set nb430_iq, 48 -.set nb430_dx, 64 -.set nb430_dy, 80 -.set nb430_dz, 96 -.set nb430_eps, 112 -.set nb430_gbtsc, 128 -.set nb430_tsc, 144 -.set nb430_qq, 160 -.set nb430_c6, 176 -.set nb430_c12, 192 -.set nb430_epsgb, 208 -.set nb430_vctot, 224 -.set nb430_Vvdwtot, 240 -.set nb430_fix, 256 -.set nb430_fiy, 272 -.set nb430_fiz, 288 -.set nb430_half, 304 -.set nb430_three, 320 -.set nb430_r, 336 -.set nb430_isai, 352 -.set nb430_isaprod, 368 -.set nb430_dvdasum, 384 -.set nb430_gbscale, 400 -.set nb430_rinv, 416 -.set nb430_nri, 432 -.set nb430_iinr, 440 -.set nb430_jindex, 448 -.set nb430_jjnr, 456 -.set nb430_shift, 464 -.set nb430_shiftvec, 472 -.set nb430_facel, 480 -.set nb430_innerjjnr, 488 -.set nb430_ii, 496 -.set nb430_is3, 500 -.set nb430_ii3, 504 -.set nb430_ntia, 508 -.set nb430_innerk, 512 -.set nb430_n, 516 -.set nb430_nn1, 520 -.set nb430_ntype, 524 -.set nb430_nouter, 528 -.set nb430_ninner, 532 - - push %rbp - movq %rsp,%rbp - push %rbx - - - emms - - push %r12 - push %r13 - push %r14 - push %r15 - - subq $552,%rsp ## local variable stack space (n*16+8) - - ## zero 32-bit iteration counters - movl $0,%eax - movl %eax,nb430_nouter(%rsp) - movl %eax,nb430_ninner(%rsp) - - - - movl (%rdi),%edi - movl %edi,nb430_nri(%rsp) - movq %rsi,nb430_iinr(%rsp) - movq %rdx,nb430_jindex(%rsp) - movq %rcx,nb430_jjnr(%rsp) - movq %r8,nb430_shift(%rsp) - movq %r9,nb430_shiftvec(%rsp) - movq nb430_p_ntype(%rbp),%rdi - movl (%rdi),%edi - movl %edi,nb430_ntype(%rsp) - movq nb430_p_facel(%rbp),%rsi - movss (%rsi),%xmm0 - movss %xmm0,nb430_facel(%rsp) - - movq nb430_p_tabscale(%rbp),%rax - movss (%rax),%xmm3 - shufps $0,%xmm3,%xmm3 - movaps %xmm3,nb430_tsc(%rsp) - - movq nb430_p_gbtabscale(%rbp),%rbx - movss (%rbx),%xmm4 - shufps $0,%xmm4,%xmm4 - movaps %xmm4,nb430_gbtsc(%rsp) - - - ## create constant floating-point factors on stack - movl $0x3f000000,%eax ## half in IEEE (hex) - movl %eax,nb430_half(%rsp) - movss nb430_half(%rsp),%xmm1 - shufps $0,%xmm1,%xmm1 ## splat to all elements - movaps %xmm1,%xmm2 - addps %xmm2,%xmm2 ## one - movaps %xmm2,%xmm3 - addps %xmm2,%xmm2 ## two - addps %xmm2,%xmm3 ## three - movaps %xmm1,nb430_half(%rsp) - movaps %xmm3,nb430_three(%rsp) - -_nb_kernel430_x86_64_sse.nb430_threadloop: - movq nb430_count(%rbp),%rsi ## pointer to sync counter - movl (%rsi),%eax -_nb_kernel430_x86_64_sse.nb430_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%rsi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel430_x86_64_sse.nb430_spinlock - - ## if(nn1>nri) nn1=nri - movl nb430_nri(%rsp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb430_n(%rsp) - movl %ebx,nb430_nn1(%rsp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel430_x86_64_sse.nb430_outerstart - jmp _nb_kernel430_x86_64_sse.nb430_end - -_nb_kernel430_x86_64_sse.nb430_outerstart: - ## ebx contains number of outer iterations - addl nb430_nouter(%rsp),%ebx - movl %ebx,nb430_nouter(%rsp) - -_nb_kernel430_x86_64_sse.nb430_outer: - movq nb430_shift(%rsp),%rax ## rax = pointer into shift[] - movl (%rax,%rsi,4),%ebx ## ebx=shift[n] - - lea (%rbx,%rbx,2),%rbx ## rbx=3*is - movl %ebx,nb430_is3(%rsp) ## store is3 - - movq nb430_shiftvec(%rsp),%rax ## rax = base of shiftvec[] - - movss (%rax,%rbx,4),%xmm0 - movss 4(%rax,%rbx,4),%xmm1 - movss 8(%rax,%rbx,4),%xmm2 - - movq nb430_iinr(%rsp),%rcx ## rcx = pointer into iinr[] - movl (%rcx,%rsi,4),%ebx ## ebx =ii - movl %ebx,nb430_ii(%rsp) - - movq nb430_charge(%rbp),%rdx - movss (%rdx,%rbx,4),%xmm3 - mulss nb430_facel(%rsp),%xmm3 - shufps $0,%xmm3,%xmm3 - - movq nb430_invsqrta(%rbp),%rdx ## load invsqrta[ii] - movss (%rdx,%rbx,4),%xmm4 - shufps $0,%xmm4,%xmm4 - - movq nb430_type(%rbp),%rdx - movl (%rdx,%rbx,4),%edx - imull nb430_ntype(%rsp),%edx - shll %edx - movl %edx,nb430_ntia(%rsp) - - lea (%rbx,%rbx,2),%rbx ## rbx = 3*ii=ii3 - movq nb430_pos(%rbp),%rax ## rax = base of pos[] - - addss (%rax,%rbx,4),%xmm0 - addss 4(%rax,%rbx,4),%xmm1 - addss 8(%rax,%rbx,4),%xmm2 - - movaps %xmm3,nb430_iq(%rsp) - movaps %xmm4,nb430_isai(%rsp) - - shufps $0,%xmm0,%xmm0 - shufps $0,%xmm1,%xmm1 - shufps $0,%xmm2,%xmm2 - - movaps %xmm0,nb430_ix(%rsp) - movaps %xmm1,nb430_iy(%rsp) - movaps %xmm2,nb430_iz(%rsp) - - movl %ebx,nb430_ii3(%rsp) - - ## clear vctot and i forces - xorps %xmm4,%xmm4 - movaps %xmm4,nb430_vctot(%rsp) - movaps %xmm4,nb430_Vvdwtot(%rsp) - movaps %xmm4,nb430_dvdasum(%rsp) - movaps %xmm4,nb430_fix(%rsp) - movaps %xmm4,nb430_fiy(%rsp) - movaps %xmm4,nb430_fiz(%rsp) - - movq nb430_jindex(%rsp),%rax - movl (%rax,%rsi,4),%ecx ## jindex[n] - movl 4(%rax,%rsi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movq nb430_pos(%rbp),%rsi - movq nb430_faction(%rbp),%rdi - movq nb430_jjnr(%rsp),%rax - shll $2,%ecx - addq %rcx,%rax - movq %rax,nb430_innerjjnr(%rsp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $4,%edx - addl nb430_ninner(%rsp),%ecx - movl %ecx,nb430_ninner(%rsp) - addl $0,%edx - movl %edx,nb430_innerk(%rsp) ## number of innerloop atoms - - jge _nb_kernel430_x86_64_sse.nb430_unroll_loop - jmp _nb_kernel430_x86_64_sse.nb430_finish_inner -_nb_kernel430_x86_64_sse.nb430_unroll_loop: - ## quad-unroll innerloop here - movq nb430_innerjjnr(%rsp),%rdx ## pointer to jjnr[k] - movl (%rdx),%eax - movl 4(%rdx),%ebx - movl 8(%rdx),%ecx - movl 12(%rdx),%edx ## eax-edx=jnr1-4 - - addq $16,nb430_innerjjnr(%rsp) ## advance pointer (unrolled 4) - - ## load isaj - movq nb430_invsqrta(%rbp),%rsi - movss (%rsi,%rax,4),%xmm3 - movss (%rsi,%rcx,4),%xmm4 - movss (%rsi,%rbx,4),%xmm6 - movss (%rsi,%rdx,4),%xmm7 - movaps nb430_isai(%rsp),%xmm2 - shufps $0,%xmm6,%xmm3 - shufps $0,%xmm7,%xmm4 - shufps $136,%xmm4,%xmm3 ## 10001000 ;# all isaj in xmm3 - mulps %xmm3,%xmm2 - - movaps %xmm2,nb430_isaprod(%rsp) - movaps %xmm2,%xmm1 - mulps nb430_gbtsc(%rsp),%xmm1 - movaps %xmm1,nb430_gbscale(%rsp) - - movq nb430_charge(%rbp),%rsi ## base of charge[] - - movss (%rsi,%rax,4),%xmm3 - movss (%rsi,%rcx,4),%xmm4 - movss (%rsi,%rbx,4),%xmm6 - movss (%rsi,%rdx,4),%xmm7 - - mulps nb430_iq(%rsp),%xmm2 - shufps $0,%xmm6,%xmm3 - shufps $0,%xmm7,%xmm4 - shufps $136,%xmm4,%xmm3 ## 10001000 ;# all charges in xmm3 - mulps %xmm2,%xmm3 - movaps %xmm3,nb430_qq(%rsp) - - ## vdw parameters - movq nb430_type(%rbp),%rsi - movl (%rsi,%rax,4),%r12d - movl (%rsi,%rbx,4),%r13d - movl (%rsi,%rcx,4),%r14d - movl (%rsi,%rdx,4),%r15d - shll %r12d - shll %r13d - shll %r14d - shll %r15d - movl nb430_ntia(%rsp),%edi - addl %edi,%r12d - addl %edi,%r13d - addl %edi,%r14d - addl %edi,%r15d - - movq nb430_vdwparam(%rbp),%rsi - movlps (%rsi,%r12,4),%xmm3 - movlps (%rsi,%r14,4),%xmm7 - movhps (%rsi,%r13,4),%xmm3 - movhps (%rsi,%r15,4),%xmm7 - - movaps %xmm3,%xmm0 - shufps $136,%xmm7,%xmm0 ## 10001000 - shufps $221,%xmm7,%xmm3 ## 11011101 - - movaps %xmm0,nb430_c6(%rsp) - movaps %xmm3,nb430_c12(%rsp) - - movq nb430_pos(%rbp),%rsi ## base of pos[] - - lea (%rax,%rax,2),%r8 ## jnr - lea (%rbx,%rbx,2),%r9 - lea (%rcx,%rcx,2),%r10 - lea (%rdx,%rdx,2),%r11 - - ## move four coordinates to xmm0-xmm2 - movlps (%rsi,%r8,4),%xmm4 - movlps (%rsi,%r10,4),%xmm5 - movss 8(%rsi,%r8,4),%xmm2 - movss 8(%rsi,%r10,4),%xmm6 - - movhps (%rsi,%r9,4),%xmm4 - movhps (%rsi,%r11,4),%xmm5 - - movss 8(%rsi,%r9,4),%xmm0 - movss 8(%rsi,%r11,4),%xmm1 - - shufps $0,%xmm0,%xmm2 - shufps $0,%xmm1,%xmm6 - - movaps %xmm4,%xmm0 - movaps %xmm4,%xmm1 - - shufps $136,%xmm6,%xmm2 ## 10001000 - - shufps $136,%xmm5,%xmm0 ## 10001000 - shufps $221,%xmm5,%xmm1 ## 11011101 - - ## calc dr - subps nb430_ix(%rsp),%xmm0 - subps nb430_iy(%rsp),%xmm1 - subps nb430_iz(%rsp),%xmm2 - - ## store dr - movaps %xmm0,nb430_dx(%rsp) - movaps %xmm1,nb430_dy(%rsp) - movaps %xmm2,nb430_dz(%rsp) - - movd %r8,%mm0 ## store j3 - movd %r9,%mm1 - movd %r10,%mm2 - movd %r11,%mm3 - - ## square it - mulps %xmm0,%xmm0 - mulps %xmm1,%xmm1 - mulps %xmm2,%xmm2 - addps %xmm1,%xmm0 - addps %xmm2,%xmm0 - movaps %xmm0,%xmm4 - ## rsq in xmm4 - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb430_three(%rsp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb430_half(%rsp),%xmm0 - subps %xmm5,%xmm1 ## 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - movaps %xmm4,nb430_r(%rsp) - movaps %xmm0,nb430_rinv(%rsp) - - movaps %xmm4,%xmm8 ## r - mulps nb430_gbscale(%rsp),%xmm4 ## rgbtab - mulps nb430_tsc(%rsp),%xmm8 ## rtab - - ## truncate and convert to integers - cvttps2dq %xmm4,%xmm5 ## gb - cvttps2dq %xmm8,%xmm9 ## lj - - ## convert back to float - cvtdq2ps %xmm5,%xmm6 ## gb - cvtdq2ps %xmm9,%xmm10 ## lj - - ## multiply by 4 and 8, respectively - pslld $2,%xmm5 ## gb - pslld $3,%xmm9 ## lj - - ## move to integer registers - movhlps %xmm5,%xmm7 ## gb - movhlps %xmm9,%xmm11 ## lj - movd %xmm5,%r8d ## gb - movd %xmm9,%r12d ## lj - movd %xmm7,%r10d ## gb - movd %xmm11,%r14d ## lj - pshufd $1,%xmm5,%xmm5 ## gb - pshufd $1,%xmm9,%xmm9 ## lj - pshufd $1,%xmm7,%xmm7 ## gb - pshufd $1,%xmm11,%xmm11 ## lj - movd %xmm5,%r9d ## gb - movd %xmm9,%r13d ## lj - movd %xmm7,%r11d ## gb - movd %xmm11,%r15d ## lj - ## GB indices: r8-r11 LJ indices: r12-r15 - - ## calculate eps - subps %xmm6,%xmm4 ## gb - subps %xmm10,%xmm8 ## lj - movaps %xmm4,nb430_epsgb(%rsp) ## gb eps - movaps %xmm8,nb430_eps(%rsp) ## lj eps - - movq nb430_GBtab(%rbp),%rsi - movq nb430_VFtab(%rbp),%rdi - - ## load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11 - movlps (%rsi,%r8,4),%xmm1 ## Y1c F1c - movlps (%rdi,%r12,4),%xmm5 ## Y1d F1d - movlps 16(%rdi,%r12,4),%xmm9 ## Y1r F1r - - movlps (%rsi,%r10,4),%xmm3 ## Y3c F3c - movlps (%rdi,%r14,4),%xmm7 ## Y3d F3d - movlps 16(%rdi,%r14,4),%xmm11 ## Y3r F3r - - movhps (%rsi,%r9,4),%xmm1 ## Y1c F1c Y2c F2c - movhps (%rdi,%r13,4),%xmm5 ## Y1d F1d Y2d F2d - movhps 16(%rdi,%r13,4),%xmm9 ## Y1r F1r Y2r F2r - - movhps (%rsi,%r11,4),%xmm3 ## Y3c F3c Y4c F4c - movhps (%rdi,%r15,4),%xmm7 ## Y3d F3d Y4d F4d - movhps 16(%rdi,%r15,4),%xmm11 ## Y3r F3r Y4r F4r - - movaps %xmm1,%xmm0 - movaps %xmm5,%xmm4 - movaps %xmm9,%xmm8 - shufps $136,%xmm3,%xmm0 ## 10001000 => Y1c Y2c Y3c Y4c - shufps $136,%xmm7,%xmm4 ## 10001000 => Y1d Y2d Y3d Y4d - shufps $136,%xmm11,%xmm8 ## 10001000 => Y1r Y2r Y3r Y4r - shufps $221,%xmm3,%xmm1 ## 11011101 => F1c F2c F3c F4c - shufps $221,%xmm7,%xmm5 ## 11011101 => F1d F2d F3d F4d - shufps $221,%xmm11,%xmm9 ## 11011101 => F1r F2r F3r F4r - - movlps 8(%rsi,%r8,4),%xmm3 ## G1c H1c - movlps 8(%rdi,%r12,4),%xmm7 ## G1d H1d - movlps 24(%rdi,%r12,4),%xmm11 ## G1r H1r - - movlps 8(%rsi,%r10,4),%xmm12 ## G3c H3c - movlps 8(%rdi,%r14,4),%xmm13 ## G3d H3d - movlps 24(%rdi,%r14,4),%xmm14 ## G3r H3r - - movhps 8(%rsi,%r9,4),%xmm3 ## G1c H1c G2c H2c - movhps 8(%rdi,%r13,4),%xmm7 ## G1d H1d G2d H2d - movhps 24(%rdi,%r13,4),%xmm11 ## G1r H1r G2r H2r - - movhps 8(%rsi,%r11,4),%xmm12 ## G3c H3c G4c H4c - movhps 8(%rdi,%r15,4),%xmm13 ## G3d H3d G4d H4d - movhps 24(%rdi,%r15,4),%xmm14 ## G3r H3r G4r H4r - movaps %xmm3,%xmm2 - movaps %xmm7,%xmm6 - movaps %xmm11,%xmm10 - - shufps $136,%xmm12,%xmm2 ## 10001000 => G1c G2c G3c G4c - shufps $136,%xmm13,%xmm6 ## 10001000 => G1d G2d G3d G4d - shufps $136,%xmm14,%xmm10 ## 10001000 => G1r G2r G3r G4r - shufps $221,%xmm12,%xmm3 ## 11011101 => H1c H2c H3c H4c - shufps $221,%xmm13,%xmm7 ## 11011101 => H1d H2d H3d H4d - shufps $221,%xmm14,%xmm11 ## 11011101 => H1r H2r H3r H4r - ## table data ready. Coul in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11 - - movaps nb430_epsgb(%rsp),%xmm12 - movaps nb430_eps(%rsp),%xmm13 - - mulps %xmm12,%xmm3 ## Heps - mulps %xmm13,%xmm7 - mulps %xmm13,%xmm11 - mulps %xmm12,%xmm2 ## Geps - mulps %xmm13,%xmm6 - mulps %xmm13,%xmm10 - mulps %xmm12,%xmm3 ## Heps2 - mulps %xmm13,%xmm7 - mulps %xmm13,%xmm11 - - addps %xmm2,%xmm1 ## F+Geps - addps %xmm6,%xmm5 - addps %xmm10,%xmm9 - addps %xmm3,%xmm1 ## F+Geps+Heps2 = Fp - addps %xmm7,%xmm5 - addps %xmm11,%xmm9 - addps %xmm3,%xmm3 ## 2*Heps2 - addps %xmm7,%xmm7 - addps %xmm11,%xmm11 - addps %xmm2,%xmm3 ## 2*Heps2+Geps - addps %xmm6,%xmm7 - addps %xmm10,%xmm11 - addps %xmm1,%xmm3 ## FF = Fp + 2*Heps2 + Geps - addps %xmm5,%xmm7 - addps %xmm9,%xmm11 - mulps %xmm12,%xmm1 ## eps*Fp - mulps %xmm13,%xmm5 - mulps %xmm13,%xmm9 - addps %xmm0,%xmm1 ## VV - addps %xmm4,%xmm5 - addps %xmm8,%xmm9 - mulps nb430_qq(%rsp),%xmm1 ## VV*qq = vcoul - mulps nb430_c6(%rsp),%xmm5 ## vnb6 - mulps nb430_c12(%rsp),%xmm9 ## vnb12 - mulps nb430_qq(%rsp),%xmm3 ## FF*qq = fij - mulps nb430_c6(%rsp),%xmm7 ## fijD - mulps nb430_c12(%rsp),%xmm11 ##fijR - - addps %xmm7,%xmm11 ## fijD+fijR - mulps nb430_tsc(%rsp),%xmm11 ## (fijD+fijR)*tabscale - - ## accumulate Vvdwtot - addps nb430_Vvdwtot(%rsp),%xmm5 - addps %xmm9,%xmm5 - movaps %xmm5,nb430_Vvdwtot(%rsp) - - movq nb430_dvda(%rbp),%rsi - - ## Calculate dVda - mulps nb430_gbscale(%rsp),%xmm3 ## fijC=qq*FF*gbscale - movaps %xmm3,%xmm6 - mulps nb430_r(%rsp),%xmm6 - addps %xmm1,%xmm6 ## vcoul+fijC*r - - addps %xmm11,%xmm3 ## fijC+fijD+fijR - - ## increment vctot - addps nb430_vctot(%rsp),%xmm1 - movaps %xmm1,nb430_vctot(%rsp) - - ## xmm6=(vcoul+fijC*r) - xorps %xmm7,%xmm7 - subps %xmm6,%xmm7 - movaps %xmm7,%xmm6 - - ## update dvdasum - addps nb430_dvdasum(%rsp),%xmm7 - movaps %xmm7,nb430_dvdasum(%rsp) - - ## update j atoms dvdaj - movhlps %xmm6,%xmm7 - movaps %xmm6,%xmm5 - movaps %xmm7,%xmm4 - shufps $0x1,%xmm5,%xmm5 - shufps $0x1,%xmm4,%xmm4 - - ## xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4 - addss (%rsi,%rax,4),%xmm6 - addss (%rsi,%rbx,4),%xmm5 - addss (%rsi,%rcx,4),%xmm7 - addss (%rsi,%rdx,4),%xmm4 - movss %xmm6,(%rsi,%rax,4) - movss %xmm5,(%rsi,%rbx,4) - movss %xmm7,(%rsi,%rcx,4) - movss %xmm4,(%rsi,%rdx,4) - - xorps %xmm4,%xmm4 - mulps nb430_rinv(%rsp),%xmm3 - subps %xmm3,%xmm4 - - movd %mm0,%r8 ## fetch j3 - movd %mm1,%r9 - movd %mm2,%r10 - movd %mm3,%r11 - - movaps %xmm4,%xmm9 - movaps %xmm4,%xmm10 - movaps %xmm4,%xmm11 - - mulps nb430_dx(%rsp),%xmm9 - mulps nb430_dy(%rsp),%xmm10 - mulps nb430_dz(%rsp),%xmm11 - - ## accumulate i forces - movaps nb430_fix(%rsp),%xmm12 - movaps nb430_fiy(%rsp),%xmm13 - movaps nb430_fiz(%rsp),%xmm14 - addps %xmm9,%xmm12 - addps %xmm10,%xmm13 - addps %xmm11,%xmm14 - movaps %xmm12,nb430_fix(%rsp) - movaps %xmm13,nb430_fiy(%rsp) - movaps %xmm14,nb430_fiz(%rsp) - - movq nb430_faction(%rbp),%rsi - ## the fj's - start by accumulating x & y forces from memory - movlps (%rsi,%r8,4),%xmm0 ## x1 y1 - - - movlps (%rsi,%r10,4),%xmm1 ## x3 y3 - - - movhps (%rsi,%r9,4),%xmm0 ## x1 y1 x2 y2 - movhps (%rsi,%r11,4),%xmm1 ## x3 y3 x4 y4 - - movaps %xmm9,%xmm8 - unpcklps %xmm10,%xmm9 ## x1 y1 x2 y2 - unpckhps %xmm10,%xmm8 ## x3 y3 x4 y4 - - ## update fjx and fjy - addps %xmm9,%xmm0 - addps %xmm8,%xmm1 - - movlps %xmm0,(%rsi,%r8,4) - movlps %xmm1,(%rsi,%r10,4) - movhps %xmm0,(%rsi,%r9,4) - movhps %xmm1,(%rsi,%r11,4) - - ## xmm11: fjz1 fjz2 fjz3 fjz4 - pshufd $1,%xmm11,%xmm10 ## fjz2 - - - - movhlps %xmm11,%xmm9 ## fjz3 - - - - pshufd $3,%xmm11,%xmm8 ## fjz4 - - - - - addss 8(%rsi,%r8,4),%xmm11 - addss 8(%rsi,%r9,4),%xmm10 - addss 8(%rsi,%r10,4),%xmm9 - addss 8(%rsi,%r11,4),%xmm8 - movss %xmm11,8(%rsi,%r8,4) - movss %xmm10,8(%rsi,%r9,4) - movss %xmm9,8(%rsi,%r10,4) - movss %xmm8,8(%rsi,%r11,4) - - ## should we do one more iteration? - subl $4,nb430_innerk(%rsp) - jl _nb_kernel430_x86_64_sse.nb430_finish_inner - jmp _nb_kernel430_x86_64_sse.nb430_unroll_loop -_nb_kernel430_x86_64_sse.nb430_finish_inner: - ## check if at least two particles remain - addl $4,nb430_innerk(%rsp) - movl nb430_innerk(%rsp),%edx - andl $2,%edx - jnz _nb_kernel430_x86_64_sse.nb430_dopair - jmp _nb_kernel430_x86_64_sse.nb430_checksingle -_nb_kernel430_x86_64_sse.nb430_dopair: - movq nb430_innerjjnr(%rsp),%rcx - - movl (%rcx),%eax - movl 4(%rcx),%ebx - addq $8,nb430_innerjjnr(%rsp) - - ## load isaj - movq nb430_invsqrta(%rbp),%rsi - movss (%rsi,%rax,4),%xmm3 - movss (%rsi,%rbx,4),%xmm6 - movaps nb430_isai(%rsp),%xmm2 - unpcklps %xmm6,%xmm3 - mulps %xmm3,%xmm2 - movaps %xmm2,nb430_isaprod(%rsp) - - movaps %xmm2,%xmm1 - mulps nb430_gbtsc(%rsp),%xmm1 - movaps %xmm1,nb430_gbscale(%rsp) - - movq nb430_charge(%rbp),%rsi ## base of charge[] - - movss (%rsi,%rax,4),%xmm3 - movss (%rsi,%rbx,4),%xmm6 - unpcklps %xmm6,%xmm3 - mulps nb430_iq(%rsp),%xmm2 - mulps %xmm2,%xmm3 - movaps %xmm3,nb430_qq(%rsp) - - ## vdw parameters - movq nb430_type(%rbp),%rsi - movl (%rsi,%rax,4),%r12d - movl (%rsi,%rbx,4),%r13d - shll %r12d - shll %r13d - movl nb430_ntia(%rsp),%edi - addl %edi,%r12d - addl %edi,%r13d - - movq nb430_vdwparam(%rbp),%rsi - movlps (%rsi,%r12,4),%xmm3 - movhps (%rsi,%r13,4),%xmm3 - - xorps %xmm7,%xmm7 - movaps %xmm3,%xmm0 - shufps $136,%xmm7,%xmm0 ## 10001000 - shufps $221,%xmm7,%xmm3 ## 11011101 - - movaps %xmm0,nb430_c6(%rsp) - movaps %xmm3,nb430_c12(%rsp) - - movq nb430_pos(%rbp),%rsi ## base of pos[] - - lea (%rax,%rax,2),%r8 ## j3 - lea (%rbx,%rbx,2),%r9 - - ## move four coordinates to xmm0-xmm2 - movlps (%rsi,%r8,4),%xmm0 ## x1 y1 - - - movlps (%rsi,%r9,4),%xmm1 ## x2 y2 - - - - movss 8(%rsi,%r8,4),%xmm2 ## z1 - - - - movss 8(%rsi,%r9,4),%xmm7 ## z2 - - - - - unpcklps %xmm1,%xmm0 ## x1 x2 y1 y2 - movhlps %xmm0,%xmm1 ## y1 y2 - - - unpcklps %xmm7,%xmm2 ## z1 z2 - - - - ## calc dr - subps nb430_ix(%rsp),%xmm0 - subps nb430_iy(%rsp),%xmm1 - subps nb430_iz(%rsp),%xmm2 - - ## store dr - movaps %xmm0,nb430_dx(%rsp) - movaps %xmm1,nb430_dy(%rsp) - movaps %xmm2,nb430_dz(%rsp) - - ## square it - mulps %xmm0,%xmm0 - mulps %xmm1,%xmm1 - mulps %xmm2,%xmm2 - addps %xmm1,%xmm0 - addps %xmm2,%xmm0 - movaps %xmm0,%xmm4 - ## rsq in xmm4 - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb430_three(%rsp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb430_half(%rsp),%xmm0 - subps %xmm5,%xmm1 ## 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - movaps %xmm4,nb430_r(%rsp) - movaps %xmm0,nb430_rinv(%rsp) - - movaps %xmm4,%xmm8 ## r - mulps nb430_gbscale(%rsp),%xmm4 ## rgbtab - mulps nb430_tsc(%rsp),%xmm8 ## rtab - - ## truncate and convert to integers - cvttps2dq %xmm4,%xmm5 ## gb - cvttps2dq %xmm8,%xmm9 ## lj - - ## convert back to float - cvtdq2ps %xmm5,%xmm6 ## gb - cvtdq2ps %xmm9,%xmm10 ## lj - - ## multiply by 4 and 8, respectively - pslld $2,%xmm5 ## gb - pslld $3,%xmm9 ## lj - - ## move to integer registers - movd %xmm5,%r12d ## gb - movd %xmm9,%r14d ## lj - pshufd $1,%xmm5,%xmm5 ## gb - pshufd $1,%xmm9,%xmm9 ## lj - movd %xmm5,%r13d ## gb - movd %xmm9,%r15d ## lj - ## GB indices: r12-r13 LJ indices: r14-r15 - - ## calculate eps - subps %xmm6,%xmm4 ## gb - subps %xmm10,%xmm8 ## lj - movaps %xmm4,nb430_epsgb(%rsp) ## gb eps - movaps %xmm8,nb430_eps(%rsp) ## lj eps - - movq nb430_GBtab(%rbp),%rsi - movq nb430_VFtab(%rbp),%rdi - - ## load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11 - movlps (%rsi,%r12,4),%xmm0 ## Y1c F1c - movlps (%rsi,%r13,4),%xmm1 ## Y2c F2c - movlps (%rdi,%r14,4),%xmm4 ## Y1d F1d - movlps (%rdi,%r15,4),%xmm5 ## Y2d F2d - movlps 16(%rdi,%r14,4),%xmm8 ## Y1r F1r - movlps 16(%rdi,%r15,4),%xmm9 ## Y2r F2r - - unpcklps %xmm1,%xmm0 - movhlps %xmm0,%xmm1 - unpcklps %xmm5,%xmm4 - movhlps %xmm4,%xmm5 - unpcklps %xmm9,%xmm8 - movhlps %xmm8,%xmm9 - movlps 8(%rsi,%r12,4),%xmm2 ## G1c H1c - movlps 8(%rsi,%r13,4),%xmm3 ## G2c H2c - movlps 8(%rdi,%r14,4),%xmm6 ## G1d H1d - movlps 8(%rdi,%r15,4),%xmm7 ## G2d H2d - movlps 24(%rdi,%r14,4),%xmm10 ## G1r H1r - movlps 24(%rdi,%r15,4),%xmm11 ## G2r H2r - unpcklps %xmm3,%xmm2 - movhlps %xmm2,%xmm3 - unpcklps %xmm7,%xmm6 - movhlps %xmm6,%xmm7 - unpcklps %xmm11,%xmm10 - movhlps %xmm10,%xmm11 - ## table data ready. Coul in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11 - - movaps nb430_epsgb(%rsp),%xmm12 - movaps nb430_eps(%rsp),%xmm13 - - mulps %xmm12,%xmm3 ## Heps - mulps %xmm13,%xmm7 - mulps %xmm13,%xmm11 - mulps %xmm12,%xmm2 ## Geps - mulps %xmm13,%xmm6 - mulps %xmm13,%xmm10 - mulps %xmm12,%xmm3 ## Heps2 - mulps %xmm13,%xmm7 - mulps %xmm13,%xmm11 - - addps %xmm2,%xmm1 ## F+Geps - addps %xmm6,%xmm5 - addps %xmm10,%xmm9 - addps %xmm3,%xmm1 ## F+Geps+Heps2 = Fp - addps %xmm7,%xmm5 - addps %xmm11,%xmm9 - addps %xmm3,%xmm3 ## 2*Heps2 - addps %xmm7,%xmm7 - addps %xmm11,%xmm11 - addps %xmm2,%xmm3 ## 2*Heps2+Geps - addps %xmm6,%xmm7 - addps %xmm10,%xmm11 - addps %xmm1,%xmm3 ## FF = Fp + 2*Heps2 + Geps - addps %xmm5,%xmm7 - addps %xmm9,%xmm11 - mulps %xmm12,%xmm1 ## eps*Fp - mulps %xmm13,%xmm5 - mulps %xmm13,%xmm9 - addps %xmm0,%xmm1 ## VV - addps %xmm4,%xmm5 - addps %xmm8,%xmm9 - mulps nb430_qq(%rsp),%xmm1 ## VV*qq = vcoul - mulps nb430_c6(%rsp),%xmm5 ## vnb6 - mulps nb430_c12(%rsp),%xmm9 ## vnb12 - mulps nb430_qq(%rsp),%xmm3 ## FF*qq = fij - mulps nb430_c6(%rsp),%xmm7 ## fijD - mulps nb430_c12(%rsp),%xmm11 ##fijR - - addps %xmm7,%xmm11 ## fijD+fijR - mulps nb430_tsc(%rsp),%xmm11 ## (fijD+fijR)*tabscale - - ## accumulate Vvdwtot - addps nb430_Vvdwtot(%rsp),%xmm5 - addps %xmm9,%xmm5 - movlps %xmm5,nb430_Vvdwtot(%rsp) - - movq nb430_dvda(%rbp),%rsi - - ## Calculate dVda - mulps nb430_gbscale(%rsp),%xmm3 ## fijC=qq*FF*gbscale - movaps %xmm3,%xmm6 - mulps nb430_r(%rsp),%xmm6 - addps %xmm1,%xmm6 ## vcoul+fijC*r - - addps %xmm11,%xmm3 ## fijC+fijD+fijR - - ## increment vctot - addps nb430_vctot(%rsp),%xmm1 - movlps %xmm1,nb430_vctot(%rsp) - - ## xmm6=(vcoul+fijC*r) - xorps %xmm7,%xmm7 - subps %xmm6,%xmm7 - movaps %xmm7,%xmm6 - - ## update dvdasum - addps nb430_dvdasum(%rsp),%xmm7 - movlps %xmm7,nb430_dvdasum(%rsp) - - ## update j atoms dvdaj - movaps %xmm6,%xmm5 - shufps $0x1,%xmm5,%xmm5 - - ## xmm6=dvdaj1 xmm5=dvdaj2 - addss (%rsi,%rax,4),%xmm6 - addss (%rsi,%rbx,4),%xmm5 - movss %xmm6,(%rsi,%rax,4) - movss %xmm5,(%rsi,%rbx,4) - - xorps %xmm4,%xmm4 - mulps nb430_rinv(%rsp),%xmm3 - subps %xmm3,%xmm4 - - movaps %xmm4,%xmm9 - movaps %xmm4,%xmm10 - movaps %xmm4,%xmm11 - - mulps nb430_dx(%rsp),%xmm9 - mulps nb430_dy(%rsp),%xmm10 - mulps nb430_dz(%rsp),%xmm11 - - - ## accumulate i forces - movaps nb430_fix(%rsp),%xmm12 - movaps nb430_fiy(%rsp),%xmm13 - movaps nb430_fiz(%rsp),%xmm14 - addps %xmm9,%xmm12 - addps %xmm10,%xmm13 - addps %xmm11,%xmm14 - movlps %xmm12,nb430_fix(%rsp) - movlps %xmm13,nb430_fiy(%rsp) - movlps %xmm14,nb430_fiz(%rsp) - - movq nb430_faction(%rbp),%rsi - ## the fj's - start by accumulating x & y forces from memory - movlps (%rsi,%r8,4),%xmm0 ## x1 y1 - - - movhps (%rsi,%r9,4),%xmm0 ## x1 y1 x2 y2 - - unpcklps %xmm10,%xmm9 ## x1 y1 x2 y2 - addps %xmm9,%xmm0 - - movlps %xmm0,(%rsi,%r8,4) - movhps %xmm0,(%rsi,%r9,4) - - ## z forces - pshufd $1,%xmm11,%xmm8 - addss 8(%rsi,%r8,4),%xmm11 - addss 8(%rsi,%r9,4),%xmm8 - movss %xmm11,8(%rsi,%r8,4) - movss %xmm8,8(%rsi,%r9,4) - -_nb_kernel430_x86_64_sse.nb430_checksingle: - movl nb430_innerk(%rsp),%edx - andl $1,%edx - jnz _nb_kernel430_x86_64_sse.nb430_dosingle - jmp _nb_kernel430_x86_64_sse.nb430_updateouterdata -_nb_kernel430_x86_64_sse.nb430_dosingle: - movq nb430_charge(%rbp),%rsi - movq nb430_invsqrta(%rbp),%rdx - movq nb430_pos(%rbp),%rdi - movq nb430_innerjjnr(%rsp),%rcx - movl (%rcx),%eax - - ## load isaj - movq nb430_invsqrta(%rbp),%rsi - movss (%rsi,%rax,4),%xmm3 - movaps nb430_isai(%rsp),%xmm2 - mulss %xmm3,%xmm2 - movaps %xmm2,nb430_isaprod(%rsp) - - movaps %xmm2,%xmm1 - mulss nb430_gbtsc(%rsp),%xmm1 - movaps %xmm1,nb430_gbscale(%rsp) - - movq nb430_charge(%rbp),%rsi ## base of charge[] - - movss (%rsi,%rax,4),%xmm3 - mulss nb430_iq(%rsp),%xmm2 - mulss %xmm2,%xmm3 - movaps %xmm3,nb430_qq(%rsp) - - ## vdw parameters - movq nb430_type(%rbp),%rsi - movl (%rsi,%rax,4),%r12d - shll %r12d - movl nb430_ntia(%rsp),%edi - addl %edi,%r12d - - movq nb430_vdwparam(%rbp),%rsi - movss (%rsi,%r12,4),%xmm0 - movss 4(%rsi,%r12,4),%xmm3 - movaps %xmm0,nb430_c6(%rsp) - movaps %xmm3,nb430_c12(%rsp) - - movq nb430_pos(%rbp),%rsi ## base of pos[] - - lea (%rax,%rax,2),%r8 ## j3 - - ## move four coordinates to xmm0-xmm2 - movss (%rsi,%r8,4),%xmm0 - movss 4(%rsi,%r8,4),%xmm1 - movss 8(%rsi,%r8,4),%xmm2 - - ## calc dr - subss nb430_ix(%rsp),%xmm0 - subss nb430_iy(%rsp),%xmm1 - subss nb430_iz(%rsp),%xmm2 - - ## store dr - movaps %xmm0,nb430_dx(%rsp) - movaps %xmm1,nb430_dy(%rsp) - movaps %xmm2,nb430_dz(%rsp) - - ## square it - mulss %xmm0,%xmm0 - mulss %xmm1,%xmm1 - mulss %xmm2,%xmm2 - addss %xmm1,%xmm0 - addss %xmm2,%xmm0 - movaps %xmm0,%xmm4 - ## rsq in xmm4 - - rsqrtss %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulss %xmm5,%xmm5 - movaps nb430_three(%rsp),%xmm1 - mulss %xmm4,%xmm5 ## rsq*lu*lu - movaps nb430_half(%rsp),%xmm0 - subss %xmm5,%xmm1 ## 30-rsq*lu*lu - mulss %xmm2,%xmm1 - mulss %xmm1,%xmm0 ## xmm0=rinv - mulss %xmm0,%xmm4 ## xmm4=r - movaps %xmm4,nb430_r(%rsp) - movaps %xmm0,nb430_rinv(%rsp) - - movaps %xmm4,%xmm8 ## r - mulss nb430_gbscale(%rsp),%xmm4 ## rgbtab - mulss nb430_tsc(%rsp),%xmm8 ## rtab - - ## truncate and convert to integers - cvttss2si %xmm4,%r12d ## gb - cvttss2si %xmm8,%r14d ## lj - - ## convert back to float - cvtsi2ss %r12d,%xmm6 ## gb - cvtsi2ss %r14d,%xmm10 ## lj - - ## multiply by 4 and 8, respectively - shll $2,%r12d ## gb - shll $3,%r14d ## lj - - ## GB index: r12 LJ indices: r14 - - ## calculate eps - subss %xmm6,%xmm4 ## gb - subss %xmm10,%xmm8 ## lj - movaps %xmm4,nb430_epsgb(%rsp) ## gb eps - movaps %xmm8,nb430_eps(%rsp) ## lj eps - - movq nb430_GBtab(%rbp),%rsi - movq nb430_VFtab(%rbp),%rdi - - ## load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11 - movss (%rsi,%r12,4),%xmm0 - movss 4(%rsi,%r12,4),%xmm1 - movss 8(%rsi,%r12,4),%xmm2 - movss 12(%rsi,%r12,4),%xmm3 - movss (%rdi,%r14,4),%xmm4 - movss 4(%rdi,%r14,4),%xmm5 - movss 8(%rdi,%r14,4),%xmm6 - movss 12(%rdi,%r14,4),%xmm7 - movss 16(%rdi,%r14,4),%xmm8 - movss 20(%rdi,%r14,4),%xmm9 - movss 24(%rdi,%r14,4),%xmm10 - movss 28(%rdi,%r14,4),%xmm11 - ## table data ready. Coul in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11 - - movaps nb430_epsgb(%rsp),%xmm12 - movaps nb430_eps(%rsp),%xmm13 - - mulss %xmm12,%xmm3 ## Heps - mulss %xmm13,%xmm7 - mulss %xmm13,%xmm11 - mulss %xmm12,%xmm2 ## Geps - mulss %xmm13,%xmm6 - mulss %xmm13,%xmm10 - mulss %xmm12,%xmm3 ## Heps2 - mulss %xmm13,%xmm7 - mulss %xmm13,%xmm11 - - addss %xmm2,%xmm1 ## F+Geps - addss %xmm6,%xmm5 - addss %xmm10,%xmm9 - addss %xmm3,%xmm1 ## F+Geps+Heps2 = Fp - addss %xmm7,%xmm5 - addss %xmm11,%xmm9 - addss %xmm3,%xmm3 ## 2*Heps2 - addss %xmm7,%xmm7 - addss %xmm11,%xmm11 - addss %xmm2,%xmm3 ## 2*Heps2+Geps - addss %xmm6,%xmm7 - addss %xmm10,%xmm11 - addss %xmm1,%xmm3 ## FF = Fp + 2*Heps2 + Geps - addss %xmm5,%xmm7 - addss %xmm9,%xmm11 - mulss %xmm12,%xmm1 ## eps*Fp - mulss %xmm13,%xmm5 - mulss %xmm13,%xmm9 - addss %xmm0,%xmm1 ## VV - addss %xmm4,%xmm5 - addss %xmm8,%xmm9 - mulss nb430_qq(%rsp),%xmm1 ## VV*qq = vcoul - mulss nb430_c6(%rsp),%xmm5 ## vnb6 - mulss nb430_c12(%rsp),%xmm9 ## vnb12 - mulss nb430_qq(%rsp),%xmm3 ## FF*qq = fij - mulss nb430_c6(%rsp),%xmm7 ## fijD - mulss nb430_c12(%rsp),%xmm11 ##fijR - - addss %xmm7,%xmm11 ## fijD+fijR - mulss nb430_tsc(%rsp),%xmm11 ## (fijD+fijR)*tabscale - - ## accumulate Vvdwtot - addss nb430_Vvdwtot(%rsp),%xmm5 - addss %xmm9,%xmm5 - movss %xmm5,nb430_Vvdwtot(%rsp) - - movq nb430_dvda(%rbp),%rsi - - ## Calculate dVda - mulss nb430_gbscale(%rsp),%xmm3 ## fijC=qq*FF*gbscale - movaps %xmm3,%xmm6 - mulss nb430_r(%rsp),%xmm6 - addss %xmm1,%xmm6 ## vcoul+fijC*r - - addss %xmm11,%xmm3 ## fijC+fijD+fijR - - ## increment vctot - addss nb430_vctot(%rsp),%xmm1 - movss %xmm1,nb430_vctot(%rsp) - - ## xmm6=(vcoul+fijC*r) - xorps %xmm7,%xmm7 - subss %xmm6,%xmm7 - movaps %xmm7,%xmm6 - - ## update dvdasum - addss nb430_dvdasum(%rsp),%xmm7 - movss %xmm7,nb430_dvdasum(%rsp) - - ## update j atoms dvdaj - - ## xmm6=dvdaj1 - addss (%rsi,%rax,4),%xmm6 - movss %xmm6,(%rsi,%rax,4) - - xorps %xmm4,%xmm4 - mulss nb430_rinv(%rsp),%xmm3 - subss %xmm3,%xmm4 - - movss %xmm4,%xmm9 - movss %xmm4,%xmm10 - movss %xmm4,%xmm11 - - mulss nb430_dx(%rsp),%xmm9 - mulss nb430_dy(%rsp),%xmm10 - mulss nb430_dz(%rsp),%xmm11 - - ## accumulate i forces - movaps nb430_fix(%rsp),%xmm12 - movaps nb430_fiy(%rsp),%xmm13 - movaps nb430_fiz(%rsp),%xmm14 - addss %xmm9,%xmm12 - addss %xmm10,%xmm13 - addss %xmm11,%xmm14 - movss %xmm12,nb430_fix(%rsp) - movss %xmm13,nb430_fiy(%rsp) - movss %xmm14,nb430_fiz(%rsp) - - movq nb430_faction(%rbp),%rsi - ## add to j forces - addss (%rsi,%r8,4),%xmm9 - addss 4(%rsi,%r8,4),%xmm10 - addss 8(%rsi,%r8,4),%xmm11 - movss %xmm9,(%rsi,%r8,4) - movss %xmm10,4(%rsi,%r8,4) - movss %xmm11,8(%rsi,%r8,4) - -_nb_kernel430_x86_64_sse.nb430_updateouterdata: - movl nb430_ii3(%rsp),%ecx - movq nb430_faction(%rbp),%rdi - movq nb430_fshift(%rbp),%rsi - movl nb430_is3(%rsp),%edx - - ## accumulate i forces in xmm0, xmm1, xmm2 - movaps nb430_fix(%rsp),%xmm0 - movaps nb430_fiy(%rsp),%xmm1 - movaps nb430_fiz(%rsp),%xmm2 - - movhlps %xmm0,%xmm3 - movhlps %xmm1,%xmm4 - movhlps %xmm2,%xmm5 - addps %xmm3,%xmm0 - addps %xmm4,%xmm1 - addps %xmm5,%xmm2 ## sum is in 1/2 in xmm0-xmm2 - - movaps %xmm0,%xmm3 - movaps %xmm1,%xmm4 - movaps %xmm2,%xmm5 - - shufps $1,%xmm3,%xmm3 - shufps $1,%xmm4,%xmm4 - shufps $1,%xmm5,%xmm5 - addss %xmm3,%xmm0 - addss %xmm4,%xmm1 - addss %xmm5,%xmm2 ## xmm0-xmm2 has single force in pos0 - - ## increment i force - movss (%rdi,%rcx,4),%xmm3 - movss 4(%rdi,%rcx,4),%xmm4 - movss 8(%rdi,%rcx,4),%xmm5 - subss %xmm0,%xmm3 - subss %xmm1,%xmm4 - subss %xmm2,%xmm5 - movss %xmm3,(%rdi,%rcx,4) - movss %xmm4,4(%rdi,%rcx,4) - movss %xmm5,8(%rdi,%rcx,4) - - ## increment fshift force - movss (%rsi,%rdx,4),%xmm3 - movss 4(%rsi,%rdx,4),%xmm4 - movss 8(%rsi,%rdx,4),%xmm5 - subss %xmm0,%xmm3 - subss %xmm1,%xmm4 - subss %xmm2,%xmm5 - movss %xmm3,(%rsi,%rdx,4) - movss %xmm4,4(%rsi,%rdx,4) - movss %xmm5,8(%rsi,%rdx,4) - - ## get n from stack - movl nb430_n(%rsp),%esi - ## get group index for i particle - movq nb430_gid(%rbp),%rdx ## base of gid[] - movl (%rdx,%rsi,4),%edx ## ggid=gid[n] - - ## accumulate total potential energy and update it - movaps nb430_vctot(%rsp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - ## add earlier value from mem - movq nb430_Vc(%rbp),%rax - addss (%rax,%rdx,4),%xmm7 - ## move back to mem - movss %xmm7,(%rax,%rdx,4) - - ## accumulate total lj energy and update it - movaps nb430_Vvdwtot(%rsp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - ## add earlier value from mem - movq nb430_Vvdw(%rbp),%rax - addss (%rax,%rdx,4),%xmm7 - ## move back to mem - movss %xmm7,(%rax,%rdx,4) - - ## accumulate dVda and update it - movaps nb430_dvdasum(%rsp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - movl nb430_ii(%rsp),%edx - movq nb430_dvda(%rbp),%rax - addss (%rax,%rdx,4),%xmm7 - movss %xmm7,(%rax,%rdx,4) - - ## finish if last - movl nb430_nn1(%rsp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel430_x86_64_sse.nb430_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb430_n(%rsp) - jmp _nb_kernel430_x86_64_sse.nb430_outer -_nb_kernel430_x86_64_sse.nb430_outerend: - ## check if more outer neighborlists remain - movl nb430_nri(%rsp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel430_x86_64_sse.nb430_end - ## non-zero, do one more workunit - jmp _nb_kernel430_x86_64_sse.nb430_threadloop -_nb_kernel430_x86_64_sse.nb430_end: - movl nb430_nouter(%rsp),%eax - movl nb430_ninner(%rsp),%ebx - movq nb430_outeriter(%rbp),%rcx - movq nb430_inneriter(%rbp),%rdx - movl %eax,(%rcx) - movl %ebx,(%rdx) - - addq $552,%rsp - emms - - - pop %r15 - pop %r14 - pop %r13 - pop %r12 - - pop %rbx - pop %rbp - ret - - - - - -.globl nb_kernel430nf_x86_64_sse -.globl _nb_kernel430nf_x86_64_sse -nb_kernel430nf_x86_64_sse: -_nb_kernel430nf_x86_64_sse: -## Room for return address and rbp (16 bytes) -.set nb430nf_fshift, 16 -.set nb430nf_gid, 24 -.set nb430nf_pos, 32 -.set nb430nf_faction, 40 -.set nb430nf_charge, 48 -.set nb430nf_p_facel, 56 -.set nb430nf_argkrf, 64 -.set nb430nf_argcrf, 72 -.set nb430nf_Vc, 80 -.set nb430nf_type, 88 -.set nb430nf_p_ntype, 96 -.set nb430nf_vdwparam, 104 -.set nb430nf_Vvdw, 112 -.set nb430nf_p_tabscale, 120 -.set nb430nf_VFtab, 128 -.set nb430nf_invsqrta, 136 -.set nb430nf_dvda, 144 -.set nb430nf_p_gbtabscale, 152 -.set nb430nf_GBtab, 160 -.set nb430nf_p_nthreads, 168 -.set nb430nf_count, 176 -.set nb430nf_mtx, 184 -.set nb430nf_outeriter, 192 -.set nb430nf_inneriter, 200 -.set nb430nf_work, 208 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse use -.set nb430nf_ix, 0 -.set nb430nf_iy, 16 -.set nb430nf_iz, 32 -.set nb430nf_iq, 48 -.set nb430nf_gbtsc, 64 -.set nb430nf_tsc, 80 -.set nb430nf_qq, 96 -.set nb430nf_c6, 112 -.set nb430nf_c12, 128 -.set nb430nf_vctot, 144 -.set nb430nf_Vvdwtot, 160 -.set nb430nf_half, 176 -.set nb430nf_three, 192 -.set nb430nf_isai, 208 -.set nb430nf_isaprod, 224 -.set nb430nf_gbscale, 240 -.set nb430nf_r, 256 -.set nb430nf_nri, 272 -.set nb430nf_iinr, 280 -.set nb430nf_jindex, 288 -.set nb430nf_jjnr, 296 -.set nb430nf_shift, 304 -.set nb430nf_shiftvec, 312 -.set nb430nf_facel, 320 -.set nb430nf_innerjjnr, 328 -.set nb430nf_is3, 336 -.set nb430nf_ii3, 340 -.set nb430nf_ntia, 344 -.set nb430nf_innerk, 348 -.set nb430nf_n, 352 -.set nb430nf_nn1, 356 -.set nb430nf_ntype, 360 -.set nb430nf_nouter, 364 -.set nb430nf_ninner, 368 - - push %rbp - movq %rsp,%rbp - push %rbx - - - emms - - push %r12 - push %r13 - push %r14 - push %r15 - - subq $392,%rsp ## local variable stack space (n*16+8) - - ## zero 32-bit iteration counters - movl $0,%eax - movl %eax,nb430nf_nouter(%rsp) - movl %eax,nb430nf_ninner(%rsp) - - movl (%rdi),%edi - movl %edi,nb430nf_nri(%rsp) - movq %rsi,nb430nf_iinr(%rsp) - movq %rdx,nb430nf_jindex(%rsp) - movq %rcx,nb430nf_jjnr(%rsp) - movq %r8,nb430nf_shift(%rsp) - movq %r9,nb430nf_shiftvec(%rsp) - movq nb430nf_p_ntype(%rbp),%rdi - movl (%rdi),%edi - movl %edi,nb430nf_ntype(%rsp) - movq nb430nf_p_facel(%rbp),%rsi - movss (%rsi),%xmm0 - movss %xmm0,nb430nf_facel(%rsp) - - movq nb430nf_p_tabscale(%rbp),%rax - movss (%rax),%xmm3 - shufps $0,%xmm3,%xmm3 - movaps %xmm3,nb430nf_tsc(%rsp) - - movq nb430nf_p_gbtabscale(%rbp),%rbx - movss (%rbx),%xmm4 - shufps $0,%xmm4,%xmm4 - movaps %xmm4,nb430nf_gbtsc(%rsp) - - ## create constant floating-point factors on stack - movl $0x3f000000,%eax ## half in IEEE (hex) - movl %eax,nb430nf_half(%rsp) - movss nb430nf_half(%rsp),%xmm1 - shufps $0,%xmm1,%xmm1 ## splat to all elements - movaps %xmm1,%xmm2 - addps %xmm2,%xmm2 ## one - movaps %xmm2,%xmm3 - addps %xmm2,%xmm2 ## two - addps %xmm2,%xmm3 ## three - movaps %xmm1,nb430nf_half(%rsp) - movaps %xmm3,nb430nf_three(%rsp) - -_nb_kernel430nf_x86_64_sse.nb430nf_threadloop: - movq nb430nf_count(%rbp),%rsi ## pointer to sync counter - movl (%rsi),%eax -_nb_kernel430nf_x86_64_sse.nb430nf_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%rsi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel430nf_x86_64_sse.nb430nf_spinlock - - ## if(nn1>nri) nn1=nri - movl nb430nf_nri(%rsp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb430nf_n(%rsp) - movl %ebx,nb430nf_nn1(%rsp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel430nf_x86_64_sse.nb430nf_outerstart - jmp _nb_kernel430nf_x86_64_sse.nb430nf_end - -_nb_kernel430nf_x86_64_sse.nb430nf_outerstart: - ## ebx contains number of outer iterations - addl nb430nf_nouter(%rsp),%ebx - movl %ebx,nb430nf_nouter(%rsp) - -_nb_kernel430nf_x86_64_sse.nb430nf_outer: - movq nb430nf_shift(%rsp),%rax ## rax = pointer into shift[] - movl (%rax,%rsi,4),%ebx ## ebx=shift[n] - - lea (%rbx,%rbx,2),%rbx ## rbx=3*is - movl %ebx,nb430nf_is3(%rsp) ## store is3 - - movq nb430nf_shiftvec(%rsp),%rax ## rax = base of shiftvec[] - - movss (%rax,%rbx,4),%xmm0 - movss 4(%rax,%rbx,4),%xmm1 - movss 8(%rax,%rbx,4),%xmm2 - - movq nb430nf_iinr(%rsp),%rcx ## rcx = pointer into iinr[] - movl (%rcx,%rsi,4),%ebx ## ebx =ii - - movq nb430nf_charge(%rbp),%rdx - movss (%rdx,%rbx,4),%xmm3 - mulss nb430nf_facel(%rsp),%xmm3 - shufps $0,%xmm3,%xmm3 - - movq nb430nf_invsqrta(%rbp),%rdx ## load invsqrta[ii] - movss (%rdx,%rbx,4),%xmm4 - shufps $0,%xmm4,%xmm4 - - movq nb430nf_type(%rbp),%rdx - movl (%rdx,%rbx,4),%edx - imull nb430nf_ntype(%rsp),%edx - shll %edx - movl %edx,nb430nf_ntia(%rsp) - - lea (%rbx,%rbx,2),%rbx ## rbx = 3*ii=ii3 - movq nb430nf_pos(%rbp),%rax ## rax = base of pos[] - - addss (%rax,%rbx,4),%xmm0 - addss 4(%rax,%rbx,4),%xmm1 - addss 8(%rax,%rbx,4),%xmm2 - - movaps %xmm3,nb430nf_iq(%rsp) - movaps %xmm4,nb430nf_isai(%rsp) - - shufps $0,%xmm0,%xmm0 - shufps $0,%xmm1,%xmm1 - shufps $0,%xmm2,%xmm2 - - movaps %xmm0,nb430nf_ix(%rsp) - movaps %xmm1,nb430nf_iy(%rsp) - movaps %xmm2,nb430nf_iz(%rsp) - - movl %ebx,nb430nf_ii3(%rsp) - - ## clear vctot - xorps %xmm4,%xmm4 - movaps %xmm4,nb430nf_vctot(%rsp) - movaps %xmm4,nb430nf_Vvdwtot(%rsp) - - movq nb430nf_jindex(%rsp),%rax - movl (%rax,%rsi,4),%ecx ## jindex[n] - movl 4(%rax,%rsi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movq nb430nf_pos(%rbp),%rsi - movq nb430nf_faction(%rbp),%rdi - movq nb430nf_jjnr(%rsp),%rax - shll $2,%ecx - addq %rcx,%rax - movq %rax,nb430nf_innerjjnr(%rsp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $4,%edx - addl nb430nf_ninner(%rsp),%ecx - movl %ecx,nb430nf_ninner(%rsp) - addl $0,%edx - movl %edx,nb430nf_innerk(%rsp) ## number of innerloop atoms - jge _nb_kernel430nf_x86_64_sse.nb430nf_unroll_loop - jmp _nb_kernel430nf_x86_64_sse.nb430nf_finish_inner -_nb_kernel430nf_x86_64_sse.nb430nf_unroll_loop: - ## quad-unroll innerloop here - movq nb430nf_innerjjnr(%rsp),%rdx ## pointer to jjnr[k] - movl (%rdx),%eax - movl 4(%rdx),%ebx - movl 8(%rdx),%ecx - movl 12(%rdx),%edx ## eax-edx=jnr1-4 - addq $16,nb430nf_innerjjnr(%rsp) ## advance pointer (unrolled 4) - - ## load isa2 - movq nb430nf_invsqrta(%rbp),%rsi - movss (%rsi,%rax,4),%xmm3 - movss (%rsi,%rcx,4),%xmm4 - movss (%rsi,%rbx,4),%xmm6 - movss (%rsi,%rdx,4),%xmm7 - movaps nb430nf_isai(%rsp),%xmm2 - shufps $0,%xmm6,%xmm3 - shufps $0,%xmm7,%xmm4 - shufps $136,%xmm4,%xmm3 ## 10001000 ;# all charges in xmm3 - mulps %xmm3,%xmm2 - - movaps %xmm2,nb430nf_isaprod(%rsp) - movaps %xmm2,%xmm1 - mulps nb430nf_gbtsc(%rsp),%xmm1 - movaps %xmm1,nb430nf_gbscale(%rsp) - - movq nb430nf_charge(%rbp),%rsi ## base of charge[] - - movss (%rsi,%rax,4),%xmm3 - movss (%rsi,%rcx,4),%xmm4 - movss (%rsi,%rbx,4),%xmm6 - movss (%rsi,%rdx,4),%xmm7 - - mulps nb430nf_iq(%rsp),%xmm2 - shufps $0,%xmm6,%xmm3 - shufps $0,%xmm7,%xmm4 - shufps $136,%xmm4,%xmm3 ## 10001000 ;# all charges in xmm3 - mulps %xmm2,%xmm3 - movaps %xmm3,nb430nf_qq(%rsp) - - movd %eax,%mm0 ## use mmx registers as temp storage - movd %ebx,%mm1 - movd %ecx,%mm2 - movd %edx,%mm3 - - movq nb430nf_type(%rbp),%rsi - movl (%rsi,%rax,4),%eax - movl (%rsi,%rbx,4),%ebx - movl (%rsi,%rcx,4),%ecx - movl (%rsi,%rdx,4),%edx - movq nb430nf_vdwparam(%rbp),%rsi - shll %eax - shll %ebx - shll %ecx - shll %edx - movl nb430nf_ntia(%rsp),%edi - addl %edi,%eax - addl %edi,%ebx - addl %edi,%ecx - addl %edi,%edx - - movlps (%rsi,%rax,4),%xmm6 - movlps (%rsi,%rcx,4),%xmm7 - movhps (%rsi,%rbx,4),%xmm6 - movhps (%rsi,%rdx,4),%xmm7 - - movaps %xmm6,%xmm4 - shufps $136,%xmm7,%xmm4 ## 10001000 - shufps $221,%xmm7,%xmm6 ## 11011101 - - movd %mm0,%eax - movd %mm1,%ebx - movd %mm2,%ecx - movd %mm3,%edx - - movaps %xmm4,nb430nf_c6(%rsp) - movaps %xmm6,nb430nf_c12(%rsp) - - movq nb430nf_pos(%rbp),%rsi ## base of pos[] - - lea (%rax,%rax,2),%rax ## replace jnr with j3 - lea (%rbx,%rbx,2),%rbx - - lea (%rcx,%rcx,2),%rcx ## replace jnr with j3 - lea (%rdx,%rdx,2),%rdx - - ## move four coordinates to xmm0-xmm2 - - movlps (%rsi,%rax,4),%xmm4 - movlps (%rsi,%rcx,4),%xmm5 - movss 8(%rsi,%rax,4),%xmm2 - movss 8(%rsi,%rcx,4),%xmm6 - - movhps (%rsi,%rbx,4),%xmm4 - movhps (%rsi,%rdx,4),%xmm5 - - movss 8(%rsi,%rbx,4),%xmm0 - movss 8(%rsi,%rdx,4),%xmm1 - - shufps $0,%xmm0,%xmm2 - shufps $0,%xmm1,%xmm6 - - movaps %xmm4,%xmm0 - movaps %xmm4,%xmm1 - - shufps $136,%xmm6,%xmm2 ## 10001000 - - shufps $136,%xmm5,%xmm0 ## 10001000 - shufps $221,%xmm5,%xmm1 ## 11011101 - - ## move ix-iz to xmm4-xmm6 - movaps nb430nf_ix(%rsp),%xmm4 - movaps nb430nf_iy(%rsp),%xmm5 - movaps nb430nf_iz(%rsp),%xmm6 - - ## calc dr - subps %xmm0,%xmm4 - subps %xmm1,%xmm5 - subps %xmm2,%xmm6 - - ## square it - mulps %xmm4,%xmm4 - mulps %xmm5,%xmm5 - mulps %xmm6,%xmm6 - addps %xmm5,%xmm4 - addps %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb430nf_three(%rsp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb430nf_half(%rsp),%xmm0 - subps %xmm5,%xmm1 ## 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - movaps %xmm4,nb430nf_r(%rsp) - mulps nb430nf_gbscale(%rsp),%xmm4 - - movhlps %xmm4,%xmm5 - cvttps2pi %xmm4,%mm6 - cvttps2pi %xmm5,%mm7 ## mm6/mm7 contain lu indices - cvtpi2ps %mm6,%xmm6 - cvtpi2ps %mm7,%xmm5 - movlhps %xmm5,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - pslld $2,%mm6 - pslld $2,%mm7 - - movd %eax,%mm0 - movd %ebx,%mm1 - movd %ecx,%mm2 - movd %edx,%mm3 - - movq nb430nf_GBtab(%rbp),%rsi - movd %mm6,%eax - psrlq $32,%mm6 - movd %mm7,%ecx - psrlq $32,%mm7 - movd %mm6,%ebx - movd %mm7,%edx - - ## load coulomb table - movaps (%rsi,%rax,4),%xmm4 - movaps (%rsi,%rbx,4),%xmm5 - movaps (%rsi,%rcx,4),%xmm6 - movaps (%rsi,%rdx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm6,%xmm3 - shufps $0xEE,%xmm7,%xmm3 - shufps $0x44,%xmm7,%xmm6 - movaps %xmm4,%xmm7 - shufps $0xEE,%xmm5,%xmm7 - shufps $0x44,%xmm5,%xmm4 - movaps %xmm4,%xmm5 - shufps $0xDD,%xmm6,%xmm5 - shufps $0x88,%xmm6,%xmm4 - movaps %xmm7,%xmm6 - shufps $0x88,%xmm3,%xmm6 - shufps $0xDD,%xmm3,%xmm7 - ## coulomb table ready, in xmm4-xmm7 - - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - movaps nb430nf_qq(%rsp),%xmm3 - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps %xmm3,%xmm5 ## vcoul=qq*VV - addps nb430nf_vctot(%rsp),%xmm5 - movaps %xmm5,nb430nf_vctot(%rsp) - - - movaps nb430nf_r(%rsp),%xmm4 - mulps nb430nf_tsc(%rsp),%xmm4 - - movhlps %xmm4,%xmm5 - cvttps2pi %xmm4,%mm6 - cvttps2pi %xmm5,%mm7 ## mm6/mm7 contain lu indices - cvtpi2ps %mm6,%xmm6 - cvtpi2ps %mm7,%xmm5 - movlhps %xmm5,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - pslld $3,%mm6 - pslld $3,%mm7 - - movq nb430nf_VFtab(%rbp),%rsi - movd %mm6,%eax - psrlq $32,%mm6 - movd %mm7,%ecx - psrlq $32,%mm7 - movd %mm6,%ebx - movd %mm7,%edx - - ## dispersion - movaps (%rsi,%rax,4),%xmm4 - movaps (%rsi,%rbx,4),%xmm5 - movaps (%rsi,%rcx,4),%xmm6 - movaps (%rsi,%rdx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm6,%xmm3 - shufps $0xEE,%xmm7,%xmm3 - shufps $0x44,%xmm7,%xmm6 - movaps %xmm4,%xmm7 - shufps $0xEE,%xmm5,%xmm7 - shufps $0x44,%xmm5,%xmm4 - movaps %xmm4,%xmm5 - shufps $0xDD,%xmm6,%xmm5 - shufps $0x88,%xmm6,%xmm4 - movaps %xmm7,%xmm6 - shufps $0x88,%xmm3,%xmm6 - shufps $0xDD,%xmm3,%xmm7 - ## dispersion table ready, in xmm4-xmm7 - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps nb430nf_c6(%rsp),%xmm5 ## Vvdw6 - addps nb430nf_Vvdwtot(%rsp),%xmm5 - movaps %xmm5,nb430nf_Vvdwtot(%rsp) - - ## repulsion - movaps 16(%rsi,%rax,4),%xmm4 - movaps 16(%rsi,%rbx,4),%xmm5 - movaps 16(%rsi,%rcx,4),%xmm6 - movaps 16(%rsi,%rdx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm6,%xmm3 - shufps $0xEE,%xmm7,%xmm3 - shufps $0x44,%xmm7,%xmm6 - movaps %xmm4,%xmm7 - shufps $0xEE,%xmm5,%xmm7 - shufps $0x44,%xmm5,%xmm4 - movaps %xmm4,%xmm5 - shufps $0xDD,%xmm6,%xmm5 - shufps $0x88,%xmm6,%xmm4 - movaps %xmm7,%xmm6 - shufps $0x88,%xmm3,%xmm6 - shufps $0xDD,%xmm3,%xmm7 - ## table ready, in xmm4-xmm7 - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - - mulps nb430nf_c12(%rsp),%xmm5 ## Vvdw12 - addps nb430nf_Vvdwtot(%rsp),%xmm5 - movaps %xmm5,nb430nf_Vvdwtot(%rsp) - - ## should we do one more iteration? - subl $4,nb430nf_innerk(%rsp) - jl _nb_kernel430nf_x86_64_sse.nb430nf_finish_inner - jmp _nb_kernel430nf_x86_64_sse.nb430nf_unroll_loop -_nb_kernel430nf_x86_64_sse.nb430nf_finish_inner: - ## check if at least two particles remain - addl $4,nb430nf_innerk(%rsp) - movl nb430nf_innerk(%rsp),%edx - andl $2,%edx - jnz _nb_kernel430nf_x86_64_sse.nb430nf_dopair - jmp _nb_kernel430nf_x86_64_sse.nb430nf_checksingle -_nb_kernel430nf_x86_64_sse.nb430nf_dopair: - - movq nb430nf_innerjjnr(%rsp),%rcx - - movl (%rcx),%eax - movl 4(%rcx),%ebx - addq $8,nb430nf_innerjjnr(%rsp) - - xorps %xmm2,%xmm2 - movaps %xmm2,%xmm6 - - ## load isa2 - movq nb430nf_invsqrta(%rbp),%rsi - movss (%rsi,%rax,4),%xmm2 - movss (%rsi,%rbx,4),%xmm3 - unpcklps %xmm3,%xmm2 ## isa2 in xmm3(0,1) - mulps nb430nf_isai(%rsp),%xmm2 - movaps %xmm2,nb430nf_isaprod(%rsp) - movaps %xmm2,%xmm1 - mulps nb430nf_gbtsc(%rsp),%xmm1 - movaps %xmm1,nb430nf_gbscale(%rsp) - - movq nb430nf_charge(%rbp),%rsi ## base of charge[] - movss (%rsi,%rax,4),%xmm3 - movss (%rsi,%rbx,4),%xmm6 - unpcklps %xmm6,%xmm3 ## 00001000 ;# xmm3(0,1) has the charges - - mulps nb430nf_iq(%rsp),%xmm2 - mulps %xmm2,%xmm3 - movaps %xmm3,nb430nf_qq(%rsp) - - movq nb430nf_type(%rbp),%rsi - movl %eax,%ecx - movl %ebx,%edx - movl (%rsi,%rcx,4),%ecx - movl (%rsi,%rdx,4),%edx - movq nb430nf_vdwparam(%rbp),%rsi - shll %ecx - shll %edx - movl nb430nf_ntia(%rsp),%edi - addl %edi,%ecx - addl %edi,%edx - movlps (%rsi,%rcx,4),%xmm6 - movhps (%rsi,%rdx,4),%xmm6 - movq nb430nf_pos(%rbp),%rdi - - movaps %xmm6,%xmm4 - shufps $8,%xmm4,%xmm4 ## 00001000 - shufps $13,%xmm6,%xmm6 ## 00001101 - movlhps %xmm7,%xmm4 - movlhps %xmm7,%xmm6 - - movaps %xmm4,nb430nf_c6(%rsp) - movaps %xmm6,nb430nf_c12(%rsp) - - lea (%rax,%rax,2),%rax - lea (%rbx,%rbx,2),%rbx - ## move coordinates to xmm0-xmm2 - movlps (%rdi,%rax,4),%xmm1 - movss 8(%rdi,%rax,4),%xmm2 - movhps (%rdi,%rbx,4),%xmm1 - movss 8(%rdi,%rbx,4),%xmm0 - - movlhps %xmm7,%xmm3 - - shufps $0,%xmm0,%xmm2 - - movaps %xmm1,%xmm0 - - shufps $136,%xmm2,%xmm2 ## 10001000 - - shufps $136,%xmm0,%xmm0 ## 10001000 - shufps $221,%xmm1,%xmm1 ## 11011101 - - movq nb430nf_faction(%rbp),%rdi - ## move ix-iz to xmm4-xmm6 - xorps %xmm7,%xmm7 - - movaps nb430nf_ix(%rsp),%xmm4 - movaps nb430nf_iy(%rsp),%xmm5 - movaps nb430nf_iz(%rsp),%xmm6 - - ## calc dr - subps %xmm0,%xmm4 - subps %xmm1,%xmm5 - subps %xmm2,%xmm6 - - ## square it - mulps %xmm4,%xmm4 - mulps %xmm5,%xmm5 - mulps %xmm6,%xmm6 - addps %xmm5,%xmm4 - addps %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtps %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulps %xmm5,%xmm5 - movaps nb430nf_three(%rsp),%xmm1 - mulps %xmm4,%xmm5 ## rsq*lu*lu - movaps nb430nf_half(%rsp),%xmm0 - subps %xmm5,%xmm1 ## 30-rsq*lu*lu - mulps %xmm2,%xmm1 - mulps %xmm1,%xmm0 ## xmm0=rinv - mulps %xmm0,%xmm4 ## xmm4=r - movaps %xmm4,nb430nf_r(%rsp) - mulps nb430nf_gbscale(%rsp),%xmm4 - - cvttps2pi %xmm4,%mm6 ## mm6 contain lu indices - cvtpi2ps %mm6,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - - pslld $2,%mm6 - - movq nb430nf_GBtab(%rbp),%rsi - movd %mm6,%ecx - psrlq $32,%mm6 - movd %mm6,%edx - - ## load coulomb table - movaps (%rsi,%rcx,4),%xmm4 - movaps (%rsi,%rdx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm4,%xmm6 - unpcklps %xmm7,%xmm4 ## Y1 Y2 F1 F2 - unpckhps %xmm7,%xmm6 ## G1 G2 H1 H2 - movhlps %xmm4,%xmm5 ## F1 F2 - movhlps %xmm6,%xmm7 ## H1 H2 - ## coulomb table ready, in xmm4-xmm7 - - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - movaps nb430nf_qq(%rsp),%xmm3 - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - mulps %xmm3,%xmm5 ## vcoul=qq*VV - addps nb430nf_vctot(%rsp),%xmm5 - movaps %xmm5,nb430nf_vctot(%rsp) - - movaps nb430nf_r(%rsp),%xmm4 - mulps nb430nf_tsc(%rsp),%xmm4 - - cvttps2pi %xmm4,%mm6 - cvtpi2ps %mm6,%xmm6 - subps %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulps %xmm2,%xmm2 ## xmm2=eps2 - pslld $3,%mm6 - - movq nb430nf_VFtab(%rbp),%rsi - movd %mm6,%ecx - psrlq $32,%mm6 - movd %mm6,%edx - - ## dispersion - movaps (%rsi,%rcx,4),%xmm4 - movaps (%rsi,%rdx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm4,%xmm6 - unpcklps %xmm7,%xmm4 ## Y1 Y2 F1 F2 - unpckhps %xmm7,%xmm6 ## G1 G2 H1 H2 - movhlps %xmm4,%xmm5 ## F1 F2 - movhlps %xmm6,%xmm7 ## H1 H2 - ## dispersion table ready, in xmm4-xmm7 - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - - mulps nb430nf_c6(%rsp),%xmm5 ## Vvdw6 - addps nb430nf_Vvdwtot(%rsp),%xmm5 - movaps %xmm5,nb430nf_Vvdwtot(%rsp) - - ## repulsion - movaps 16(%rsi,%rcx,4),%xmm4 - movaps 16(%rsi,%rdx,4),%xmm7 - ## transpose, using xmm3 for scratch - movaps %xmm4,%xmm6 - unpcklps %xmm7,%xmm4 ## Y1 Y2 F1 F2 - unpckhps %xmm7,%xmm6 ## G1 G2 H1 H2 - movhlps %xmm4,%xmm5 ## F1 F2 - movhlps %xmm6,%xmm7 ## H1 H2 - ## table ready, in xmm4-xmm7 - mulps %xmm1,%xmm6 ## xmm6=Geps - mulps %xmm2,%xmm7 ## xmm7=Heps2 - addps %xmm6,%xmm5 - addps %xmm7,%xmm5 ## xmm5=Fp - mulps %xmm1,%xmm5 ## xmm5=eps*Fp - addps %xmm4,%xmm5 ## xmm5=VV - - mulps nb430nf_c12(%rsp),%xmm5 ## Vvdw12 - - addps nb430nf_Vvdwtot(%rsp),%xmm5 - movaps %xmm5,nb430nf_Vvdwtot(%rsp) -_nb_kernel430nf_x86_64_sse.nb430nf_checksingle: - movl nb430nf_innerk(%rsp),%edx - andl $1,%edx - jnz _nb_kernel430nf_x86_64_sse.nb430nf_dosingle - jmp _nb_kernel430nf_x86_64_sse.nb430nf_updateouterdata -_nb_kernel430nf_x86_64_sse.nb430nf_dosingle: - movq nb430nf_charge(%rbp),%rsi - movq nb430nf_invsqrta(%rbp),%rdx - movq nb430nf_pos(%rbp),%rdi - movq nb430nf_innerjjnr(%rsp),%rcx - movl (%rcx),%eax - xorps %xmm2,%xmm2 - movaps %xmm2,%xmm6 - movss (%rdx,%rax,4),%xmm2 ## isa2 - mulss nb430nf_isai(%rsp),%xmm2 - movss %xmm2,nb430nf_isaprod(%rsp) - movss %xmm2,%xmm1 - mulss nb430nf_gbtsc(%rsp),%xmm1 - movss %xmm1,nb430nf_gbscale(%rsp) - - mulss nb430nf_iq(%rsp),%xmm2 - movss (%rsi,%rax,4),%xmm6 ## xmm6(0) has the charge - mulss %xmm2,%xmm6 - movss %xmm6,nb430nf_qq(%rsp) - - movq nb430nf_type(%rbp),%rsi - movl %eax,%ecx - movl (%rsi,%rcx,4),%ecx - movq nb430nf_vdwparam(%rbp),%rsi - shll %ecx - addl nb430nf_ntia(%rsp),%ecx - movlps (%rsi,%rcx,4),%xmm6 - movaps %xmm6,%xmm4 - shufps $252,%xmm4,%xmm4 ## 11111100 - shufps $253,%xmm6,%xmm6 ## 11111101 - - movss %xmm4,nb430nf_c6(%rsp) - movss %xmm6,nb430nf_c12(%rsp) - - lea (%rax,%rax,2),%rax - - ## move coordinates to xmm0-xmm2 - movss (%rdi,%rax,4),%xmm0 - movss 4(%rdi,%rax,4),%xmm1 - movss 8(%rdi,%rax,4),%xmm2 - - movss nb430nf_ix(%rsp),%xmm4 - movss nb430nf_iy(%rsp),%xmm5 - movss nb430nf_iz(%rsp),%xmm6 - - ## calc dr - subss %xmm0,%xmm4 - subss %xmm1,%xmm5 - subss %xmm2,%xmm6 - - ## square it - mulss %xmm4,%xmm4 - mulss %xmm5,%xmm5 - mulss %xmm6,%xmm6 - addss %xmm5,%xmm4 - addss %xmm6,%xmm4 - ## rsq in xmm4 - - rsqrtss %xmm4,%xmm5 - ## lookup seed in xmm5 - movaps %xmm5,%xmm2 - mulss %xmm5,%xmm5 - movss nb430nf_three(%rsp),%xmm1 - mulss %xmm4,%xmm5 ## rsq*lu*lu - movss nb430nf_half(%rsp),%xmm0 - subss %xmm5,%xmm1 ## 30-rsq*lu*lu - mulss %xmm2,%xmm1 - mulss %xmm1,%xmm0 ## xmm0=rinv - - mulss %xmm0,%xmm4 ## xmm4=r - movaps %xmm4,nb430nf_r(%rsp) - mulss nb430nf_gbscale(%rsp),%xmm4 - - cvttss2si %xmm4,%ebx ## mm6 contain lu indices - cvtsi2ss %ebx,%xmm6 - subss %xmm6,%xmm4 - movaps %xmm4,%xmm1 ## xmm1=eps - movaps %xmm1,%xmm2 - mulss %xmm2,%xmm2 ## xmm2=eps2 - - shll $2,%ebx - - movq nb430nf_GBtab(%rbp),%rsi - - movaps (%rsi,%rbx,4),%xmm4 - movhlps %xmm4,%xmm6 - movaps %xmm4,%xmm5 - movaps %xmm6,%xmm7 - shufps $1,%xmm5,%xmm5 - shufps $1,%xmm7,%xmm7 - ## table ready in xmm4-xmm7 - - mulss %xmm1,%xmm6 ## xmm6=Geps - mulss %xmm2,%xmm7 ## xmm7=Heps2 - addss %xmm6,%xmm5 - addss %xmm7,%xmm5 ## xmm5=Fp - movss nb430nf_qq(%rsp),%xmm3 - mulss %xmm1,%xmm5 ## xmm5=eps*Fp - addss %xmm4,%xmm5 ## xmm5=VV - mulss %xmm3,%xmm5 ## vcoul=qq*VV - addss nb430nf_vctot(%rsp),%xmm5 - movss %xmm5,nb430nf_vctot(%rsp) - - movss nb430nf_r(%rsp),%xmm4 - mulps nb430nf_tsc(%rsp),%xmm4 - - cvttss2si %xmm4,%ebx - cvtsi2ss %ebx,%xmm6 - subss %xmm6,%xmm4 - movss %xmm4,%xmm1 ## xmm1=eps - movss %xmm1,%xmm2 - mulss %xmm2,%xmm2 ## xmm2=eps2 - - shll $3,%ebx - movq nb430nf_VFtab(%rbp),%rsi - - ## dispersion - movaps (%rsi,%rbx,4),%xmm4 - movhlps %xmm4,%xmm6 - movaps %xmm4,%xmm5 - movaps %xmm6,%xmm7 - shufps $1,%xmm5,%xmm5 - shufps $1,%xmm7,%xmm7 - ## table ready in xmm4-xmm7 - - mulss %xmm1,%xmm6 ## xmm6=Geps - mulss %xmm2,%xmm7 ## xmm7=Heps2 - addss %xmm6,%xmm5 - addss %xmm7,%xmm5 ## xmm5=Fp - mulss %xmm1,%xmm5 ## xmm5=eps*Fp - addss %xmm4,%xmm5 ## xmm5=VV - mulss nb430nf_c6(%rsp),%xmm5 ## Vvdw6 - addss nb430nf_Vvdwtot(%rsp),%xmm5 - movss %xmm5,nb430nf_Vvdwtot(%rsp) - - ## repulsion - movaps 16(%rsi,%rbx,4),%xmm4 - movhlps %xmm4,%xmm6 - movaps %xmm4,%xmm5 - movaps %xmm6,%xmm7 - shufps $1,%xmm5,%xmm5 - shufps $1,%xmm7,%xmm7 - ## table ready in xmm4-xmm7 - - mulss %xmm1,%xmm6 ## xmm6=Geps - mulss %xmm2,%xmm7 ## xmm7=Heps2 - addss %xmm6,%xmm5 - addss %xmm7,%xmm5 ## xmm5=Fp - mulss %xmm1,%xmm5 ## xmm5=eps*Fp - addss %xmm4,%xmm5 ## xmm5=VV - - mulss nb430nf_c12(%rsp),%xmm5 ## Vvdw12 - - addss nb430nf_Vvdwtot(%rsp),%xmm5 - movss %xmm5,nb430nf_Vvdwtot(%rsp) - -_nb_kernel430nf_x86_64_sse.nb430nf_updateouterdata: - ## get n from stack - movl nb430nf_n(%rsp),%esi - ## get group index for i particle - movq nb430nf_gid(%rbp),%rdx ## base of gid[] - movl (%rdx,%rsi,4),%edx ## ggid=gid[n] - - ## accumulate total potential energy and update it - movaps nb430nf_vctot(%rsp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - ## add earlier value from mem - movq nb430nf_Vc(%rbp),%rax - addss (%rax,%rdx,4),%xmm7 - ## move back to mem - movss %xmm7,(%rax,%rdx,4) - - ## accumulate total lj energy and update it - movaps nb430nf_Vvdwtot(%rsp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now - movaps %xmm7,%xmm6 - shufps $1,%xmm6,%xmm6 - addss %xmm6,%xmm7 - - ## add earlier value from mem - movq nb430nf_Vvdw(%rbp),%rax - addss (%rax,%rdx,4),%xmm7 - ## move back to mem - movss %xmm7,(%rax,%rdx,4) - - ## finish if last - movl nb430nf_nn1(%rsp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel430nf_x86_64_sse.nb430nf_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb430nf_n(%rsp) - jmp _nb_kernel430nf_x86_64_sse.nb430nf_outer -_nb_kernel430nf_x86_64_sse.nb430nf_outerend: - ## check if more outer neighborlists remain - movl nb430nf_nri(%rsp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel430nf_x86_64_sse.nb430nf_end - ## non-zero, do one more workunit - jmp _nb_kernel430nf_x86_64_sse.nb430nf_threadloop -_nb_kernel430nf_x86_64_sse.nb430nf_end: - - movl nb430nf_nouter(%rsp),%eax - movl nb430nf_ninner(%rsp),%ebx - movq nb430nf_outeriter(%rbp),%rcx - movq nb430nf_inneriter(%rbp),%rdx - movl %eax,(%rcx) - movl %ebx,(%rdx) - - addq $392,%rsp - emms - - - pop %r15 - pop %r14 - pop %r13 - pop %r12 - - pop %rbx - pop %rbp - ret - - - - - - diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/Makefile.am b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/Makefile.am index 5e515024c4..260af98c5c 100644 --- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/Makefile.am +++ b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/Makefile.am @@ -64,30 +64,30 @@ libnb_kernel_x86_64_sse2_la_SOURCES = \ EXTRA_DIST = \ - nb_kernel010_x86_64_sse2.intel_syntax.s nb_kernel030_x86_64_sse2.intel_syntax.s \ - nb_kernel100_x86_64_sse2.intel_syntax.s nb_kernel101_x86_64_sse2.intel_syntax.s \ - nb_kernel102_x86_64_sse2.intel_syntax.s nb_kernel103_x86_64_sse2.intel_syntax.s \ - nb_kernel104_x86_64_sse2.intel_syntax.s nb_kernel110_x86_64_sse2.intel_syntax.s \ - nb_kernel111_x86_64_sse2.intel_syntax.s nb_kernel112_x86_64_sse2.intel_syntax.s \ - nb_kernel113_x86_64_sse2.intel_syntax.s nb_kernel114_x86_64_sse2.intel_syntax.s \ - nb_kernel130_x86_64_sse2.intel_syntax.s nb_kernel131_x86_64_sse2.intel_syntax.s \ - nb_kernel132_x86_64_sse2.intel_syntax.s nb_kernel133_x86_64_sse2.intel_syntax.s \ - nb_kernel134_x86_64_sse2.intel_syntax.s nb_kernel200_x86_64_sse2.intel_syntax.s \ - nb_kernel201_x86_64_sse2.intel_syntax.s nb_kernel202_x86_64_sse2.intel_syntax.s \ - nb_kernel203_x86_64_sse2.intel_syntax.s nb_kernel204_x86_64_sse2.intel_syntax.s \ - nb_kernel210_x86_64_sse2.intel_syntax.s nb_kernel211_x86_64_sse2.intel_syntax.s \ - nb_kernel212_x86_64_sse2.intel_syntax.s nb_kernel213_x86_64_sse2.intel_syntax.s \ - nb_kernel214_x86_64_sse2.intel_syntax.s nb_kernel230_x86_64_sse2.intel_syntax.s \ - nb_kernel231_x86_64_sse2.intel_syntax.s nb_kernel232_x86_64_sse2.intel_syntax.s \ - nb_kernel233_x86_64_sse2.intel_syntax.s nb_kernel234_x86_64_sse2.intel_syntax.s \ - nb_kernel300_x86_64_sse2.intel_syntax.s nb_kernel301_x86_64_sse2.intel_syntax.s \ - nb_kernel302_x86_64_sse2.intel_syntax.s nb_kernel303_x86_64_sse2.intel_syntax.s \ - nb_kernel304_x86_64_sse2.intel_syntax.s nb_kernel310_x86_64_sse2.intel_syntax.s \ - nb_kernel311_x86_64_sse2.intel_syntax.s nb_kernel312_x86_64_sse2.intel_syntax.s \ - nb_kernel313_x86_64_sse2.intel_syntax.s nb_kernel314_x86_64_sse2.intel_syntax.s \ - nb_kernel330_x86_64_sse2.intel_syntax.s nb_kernel331_x86_64_sse2.intel_syntax.s \ - nb_kernel332_x86_64_sse2.intel_syntax.s nb_kernel333_x86_64_sse2.intel_syntax.s \ - nb_kernel334_x86_64_sse2.intel_syntax.s nb_kernel400_x86_64_sse2.intel_syntax.s \ - nb_kernel410_x86_64_sse2.intel_syntax.s nb_kernel430_x86_64_sse2.intel_syntax.s \ - nb_kernel_x86_64_sse2_test.intel_syntax.s + nb_kernel010_x86_64_sse2_intel_syntax.s nb_kernel030_x86_64_sse2_intel_syntax.s \ + nb_kernel100_x86_64_sse2_intel_syntax.s nb_kernel101_x86_64_sse2_intel_syntax.s \ + nb_kernel102_x86_64_sse2_intel_syntax.s nb_kernel103_x86_64_sse2_intel_syntax.s \ + nb_kernel104_x86_64_sse2_intel_syntax.s nb_kernel110_x86_64_sse2_intel_syntax.s \ + nb_kernel111_x86_64_sse2_intel_syntax.s nb_kernel112_x86_64_sse2_intel_syntax.s \ + nb_kernel113_x86_64_sse2_intel_syntax.s nb_kernel114_x86_64_sse2_intel_syntax.s \ + nb_kernel130_x86_64_sse2_intel_syntax.s nb_kernel131_x86_64_sse2_intel_syntax.s \ + nb_kernel132_x86_64_sse2_intel_syntax.s nb_kernel133_x86_64_sse2_intel_syntax.s \ + nb_kernel134_x86_64_sse2_intel_syntax.s nb_kernel200_x86_64_sse2_intel_syntax.s \ + nb_kernel201_x86_64_sse2_intel_syntax.s nb_kernel202_x86_64_sse2_intel_syntax.s \ + nb_kernel203_x86_64_sse2_intel_syntax.s nb_kernel204_x86_64_sse2_intel_syntax.s \ + nb_kernel210_x86_64_sse2_intel_syntax.s nb_kernel211_x86_64_sse2_intel_syntax.s \ + nb_kernel212_x86_64_sse2_intel_syntax.s nb_kernel213_x86_64_sse2_intel_syntax.s \ + nb_kernel214_x86_64_sse2_intel_syntax.s nb_kernel230_x86_64_sse2_intel_syntax.s \ + nb_kernel231_x86_64_sse2_intel_syntax.s nb_kernel232_x86_64_sse2_intel_syntax.s \ + nb_kernel233_x86_64_sse2_intel_syntax.s nb_kernel234_x86_64_sse2_intel_syntax.s \ + nb_kernel300_x86_64_sse2_intel_syntax.s nb_kernel301_x86_64_sse2_intel_syntax.s \ + nb_kernel302_x86_64_sse2_intel_syntax.s nb_kernel303_x86_64_sse2_intel_syntax.s \ + nb_kernel304_x86_64_sse2_intel_syntax.s nb_kernel310_x86_64_sse2_intel_syntax.s \ + nb_kernel311_x86_64_sse2_intel_syntax.s nb_kernel312_x86_64_sse2_intel_syntax.s \ + nb_kernel313_x86_64_sse2_intel_syntax.s nb_kernel314_x86_64_sse2_intel_syntax.s \ + nb_kernel330_x86_64_sse2_intel_syntax.s nb_kernel331_x86_64_sse2_intel_syntax.s \ + nb_kernel332_x86_64_sse2_intel_syntax.s nb_kernel333_x86_64_sse2_intel_syntax.s \ + nb_kernel334_x86_64_sse2_intel_syntax.s nb_kernel400_x86_64_sse2_intel_syntax.s \ + nb_kernel410_x86_64_sse2_intel_syntax.s nb_kernel430_x86_64_sse2_intel_syntax.s \ + nb_kernel_x86_64_sse2_test_intel_syntax.s diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel400_x86_64_sse2.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel400_x86_64_sse2.intel_syntax.s deleted file mode 100644 index cdc2d9f689..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel400_x86_64_sse2.intel_syntax.s +++ /dev/null @@ -1,1236 +0,0 @@ -;# -;# -;# Gromacs 4.0 Copyright (c) 1991-2003 -;# David van der Spoel, Erik Lindahl -;# -;# This program is free software; you can redistribute it and/or -;# modify it under the terms of the GNU General Public License -;# as published by the Free Software Foundation; either version 2 -;# of the License, or (at your option) any later version. -;# -;# To help us fund GROMACS development, we humbly ask that you cite -;# the research papers on the package. Check out http://www.gromacs.org -;# -;# And Hey: -;# Gnomes, ROck Monsters And Chili Sauce -;# - -;# These files require GNU binutils 2.10 or later, since we -;# use intel syntax for portability, or a recent version -;# of NASM that understands Extended 3DNow and SSE2 instructions. -;# (NASM is normally only used with MS Visual C++). -;# Since NASM and gnu as disagree on some definitions and use -;# completely different preprocessing options I have to introduce a -;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86. -;# Gnu as treats ';' as a line break, i.e. ignores it. This is the -;# reason why all comments need both symbols... -;# The source is written for GNU as, with intel syntax. When you use -;# NASM we redefine a couple of things. The false if-statement around -;# the following code is seen by GNU as, but NASM doesn't see it, so -;# the code inside is read by NASM but not gcc. -; .if 0 # block below only read by NASM -%define .section section -%define .long dd -%define .align align -%define .globl global -;# NASM only wants 'dword', not 'dword ptr'. -%define ptr -%macro .equiv 2 - %1 equ %2 -%endmacro -; .endif # End of NASM-specific block -; .intel_syntax noprefix # Line only read by gnu as - - - - -.globl nb_kernel400_x86_64_sse2 -.globl _nb_kernel400_x86_64_sse2 -nb_kernel400_x86_64_sse2: -_nb_kernel400_x86_64_sse2: -;# Room for return address and rbp (16 bytes) -.equiv nb400_fshift, 16 -.equiv nb400_gid, 24 -.equiv nb400_pos, 32 -.equiv nb400_faction, 40 -.equiv nb400_charge, 48 -.equiv nb400_p_facel, 56 -.equiv nb400_argkrf, 64 -.equiv nb400_argcrf, 72 -.equiv nb400_Vc, 80 -.equiv nb400_type, 88 -.equiv nb400_p_ntype, 96 -.equiv nb400_vdwparam, 104 -.equiv nb400_Vvdw, 112 -.equiv nb400_p_tabscale, 120 -.equiv nb400_VFtab, 128 -.equiv nb400_invsqrta, 136 -.equiv nb400_dvda, 144 -.equiv nb400_p_gbtabscale, 152 -.equiv nb400_GBtab, 160 -.equiv nb400_p_nthreads, 168 -.equiv nb400_count, 176 -.equiv nb400_mtx, 184 -.equiv nb400_outeriter, 192 -.equiv nb400_inneriter, 200 -.equiv nb400_work, 208 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse2 use -.equiv nb400_ix, 0 -.equiv nb400_iy, 16 -.equiv nb400_iz, 32 -.equiv nb400_iq, 48 -.equiv nb400_dx, 64 -.equiv nb400_dy, 80 -.equiv nb400_dz, 96 -.equiv nb400_two, 112 -.equiv nb400_gbtsc, 128 -.equiv nb400_qq, 144 -.equiv nb400_r, 160 -.equiv nb400_vctot, 176 -.equiv nb400_fix, 192 -.equiv nb400_fiy, 208 -.equiv nb400_fiz, 224 -.equiv nb400_half, 240 -.equiv nb400_three, 256 -.equiv nb400_isai, 272 -.equiv nb400_isaprod, 288 -.equiv nb400_dvdasum, 304 -.equiv nb400_gbscale, 320 -.equiv nb400_nri, 336 -.equiv nb400_iinr, 344 -.equiv nb400_jindex, 352 -.equiv nb400_jjnr, 360 -.equiv nb400_shift, 368 -.equiv nb400_shiftvec, 376 -.equiv nb400_facel, 384 -.equiv nb400_innerjjnr, 392 -.equiv nb400_is3, 400 -.equiv nb400_ii3, 404 -.equiv nb400_ii, 408 -.equiv nb400_innerk, 412 -.equiv nb400_n, 416 -.equiv nb400_nn1, 420 -.equiv nb400_nouter, 424 -.equiv nb400_ninner, 428 - push rbp - mov rbp, rsp - push rbx - - - emms - - push r12 - push r13 - push r14 - push r15 - - sub rsp, 440 ;# local variable stack space (n*16+8) - - ;# zero 32-bit iteration counters - mov eax, 0 - mov [rsp + nb400_nouter], eax - mov [rsp + nb400_ninner], eax - - mov edi, [rdi] - mov [rsp + nb400_nri], edi - mov [rsp + nb400_iinr], rsi - mov [rsp + nb400_jindex], rdx - mov [rsp + nb400_jjnr], rcx - mov [rsp + nb400_shift], r8 - mov [rsp + nb400_shiftvec], r9 - mov rsi, [rbp + nb400_p_facel] - movsd xmm0, [rsi] - movsd [rsp + nb400_facel], xmm0 - - mov rbx, [rbp + nb400_p_gbtabscale] - movsd xmm4, [rbx] - shufpd xmm4, xmm4, 0 - movapd [rsp + nb400_gbtsc], xmm4 - - ;# create constant floating-point factors on stack - mov eax, 0x00000000 ;# lower half of double half IEEE (hex) - mov ebx, 0x3fe00000 - mov [rsp + nb400_half], eax - mov [rsp + nb400_half+4], ebx - movsd xmm1, [rsp + nb400_half] - shufpd xmm1, xmm1, 0 ;# splat to all elements - movapd xmm3, xmm1 - addpd xmm3, xmm3 ;# one - movapd xmm2, xmm3 - addpd xmm2, xmm2 ;# two - addpd xmm3, xmm2 ;# three - movapd [rsp + nb400_half], xmm1 - movapd [rsp + nb400_two], xmm2 - movapd [rsp + nb400_three], xmm3 - -.nb400_threadloop: - mov rsi, [rbp + nb400_count] ;# pointer to sync counter - mov eax, [rsi] -.nb400_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb400_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [rsp + nb400_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [rsp + nb400_n], eax - mov [rsp + nb400_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb400_outerstart - jmp .nb400_end - -.nb400_outerstart: - ;# ebx contains number of outer iterations - add ebx, [rsp + nb400_nouter] - mov [rsp + nb400_nouter], ebx - -.nb400_outer: - mov rax, [rsp + nb400_shift] ;# rax = pointer into shift[] - mov ebx, [rax+rsi*4] ;# rbx=shift[n] - - lea rbx, [rbx + rbx*2] ;# rbx=3*is - mov [rsp + nb400_is3],ebx ;# store is3 - - mov rax, [rsp + nb400_shiftvec] ;# rax = base of shiftvec[] - - movsd xmm0, [rax + rbx*8] - movsd xmm1, [rax + rbx*8 + 8] - movsd xmm2, [rax + rbx*8 + 16] - - mov rcx, [rsp + nb400_iinr] ;# rcx = pointer into iinr[] - mov ebx, [rcx+rsi*4] ;# ebx =ii - mov [rsp + nb400_ii], ebx - - mov rdx, [rbp + nb400_charge] - movsd xmm3, [rdx + rbx*8] - mulsd xmm3, [rsp + nb400_facel] - shufpd xmm3, xmm3, 0 - - mov rdx, [rbp + nb400_invsqrta] ;# load invsqrta[ii] - movsd xmm4, [rdx + rbx*8] - shufpd xmm4, xmm4, 0 - - lea rbx, [rbx + rbx*2] ;# rbx = 3*ii=ii3 - mov rax, [rbp + nb400_pos] ;# rax = base of pos[] - - addsd xmm0, [rax + rbx*8] - addsd xmm1, [rax + rbx*8 + 8] - addsd xmm2, [rax + rbx*8 + 16] - - movapd [rsp + nb400_iq], xmm3 - movapd [rsp + nb400_isai], xmm4 - - shufpd xmm0, xmm0, 0 - shufpd xmm1, xmm1, 0 - shufpd xmm2, xmm2, 0 - - movapd [rsp + nb400_ix], xmm0 - movapd [rsp + nb400_iy], xmm1 - movapd [rsp + nb400_iz], xmm2 - - mov [rsp + nb400_ii3], ebx - - ;# clear vctot and i forces - xorpd xmm4, xmm4 - movapd xmm8, xmm4 - movapd xmm12, xmm4 - movapd xmm13, xmm4 - movapd xmm14, xmm4 - movapd xmm15, xmm4 - - mov rax, [rsp + nb400_jindex] - mov ecx, [rax + rsi*4] ;# jindex[n] - mov edx, [rax + rsi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov rsi, [rbp + nb400_pos] - mov rdi, [rbp + nb400_faction] - mov rax, [rsp + nb400_jjnr] - shl ecx, 2 - add rax, rcx - mov [rsp + nb400_innerjjnr], rax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 2 - add ecx, [rsp + nb400_ninner] - mov [rsp + nb400_ninner], ecx - add edx, 0 - mov [rsp + nb400_innerk], edx ;# number of innerloop atoms - jge .nb400_unroll_loop - jmp .nb400_checksingle -.nb400_unroll_loop: - ;# twice unrolled innerloop here - mov rdx, [rsp + nb400_innerjjnr] ;# pointer to jjnr[k] - mov r12d, [rdx] - mov r13d, [rdx + 4] - add qword ptr [rsp + nb400_innerjjnr], 8 ;# advance pointer (unrolled 2) - - mov rsi, [rbp + nb400_pos] ;# base of pos[] - - lea r8, [r12 + r12*2] ;# j3 - lea r9, [r13 + r13*2] - - ;# move two coordinates to xmm4-xmm6 - movlpd xmm4, [rsi + r8*8] - movlpd xmm5, [rsi + r8*8 + 8] - movlpd xmm6, [rsi + r8*8 + 16] - movhpd xmm4, [rsi + r9*8] - movhpd xmm5, [rsi + r9*8 + 8] - movhpd xmm6, [rsi + r9*8 + 16] - - ;# calc dr - subpd xmm4, [rsp + nb400_ix] - subpd xmm5, [rsp + nb400_iy] - subpd xmm6, [rsp + nb400_iz] - - - ;# store dr - movapd xmm9, xmm4 - movapd xmm10, xmm5 - movapd xmm11, xmm6 - - ;# square it - mulpd xmm4,xmm4 - mulpd xmm5,xmm5 - mulpd xmm6,xmm6 - addpd xmm4, xmm5 - addpd xmm4, xmm6 - ;# rsq in xmm4 - - mov rsi, [rbp + nb400_invsqrta] - movlpd xmm3, [rsi + r12*8] - - cvtpd2ps xmm5, xmm4 - rsqrtps xmm5, xmm5 - cvtps2pd xmm2, xmm5 ;# lu in low xmm2 - - movhpd xmm3, [rsi + r13*8] - mulpd xmm3, [rsp + nb400_isai] - movapd [rsp + nb400_isaprod], xmm3 - movapd xmm6, xmm3 - mulpd xmm3, [rsp + nb400_gbtsc] - movapd [rsp + nb400_gbscale], xmm3 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulpd xmm2, xmm2 ;# lu*lu - movapd xmm1, [rsp + nb400_three] - mulpd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb400_half] - subpd xmm1, xmm2 ;# 30-rsq*lu*lu - mulpd xmm1, xmm5 - mulpd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - mov rsi, [rbp + nb400_charge] ;# base of charge[] - movlpd xmm3, [rsi + r12*8] - - movapd xmm5, xmm1 ;# copy of lu - mulpd xmm1, xmm1 ;# lu*lu - movapd xmm2, [rsp + nb400_three] - mulpd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb400_half] - subpd xmm2, xmm1 ;# 30-rsq*lu*lu - mulpd xmm2, xmm5 - mulpd xmm0, xmm2 ;# xmm0=iter2 of rinv (new lu) - mulpd xmm4, xmm0 ;# xmm4=r - - mulpd xmm6, [rsp + nb400_iq] - movhpd xmm3, [rsi + r13*8] - mulpd xmm3, xmm6 - movapd [rsp + nb400_qq], xmm3 - - - movapd [rsp + nb400_r], xmm4 - mulpd xmm4, [rsp + nb400_gbscale] - - cvttpd2pi mm6, xmm4 ;# mm6 = lu idx - cvtpi2pd xmm5, mm6 - subpd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - - pslld mm6, 2 ;# idx *= 4 - - mov rsi, [rbp + nb400_GBtab] - movd r10d, mm6 - psrlq mm6, 32 - movd r11d, mm6 ;# indices in r10/r11 - - movapd xmm4, [rsi + r10*8] ;# Y1 F1 - movapd xmm3, [rsi + r11*8] ;# Y2 F2 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 Y2 - unpckhpd xmm5, xmm3 ;# F1 F2 - - movapd xmm6, [rsi + r10*8 + 16] ;# G1 H1 - movapd xmm3, [rsi + r11*8 + 16] ;# G2 H2 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 G2 - unpckhpd xmm7, xmm3 ;# H1 H2 - ;# coulomb table ready, in xmm4-xmm7 - - mulpd xmm7, xmm1 ;# xmm7=Heps - mulpd xmm6, xmm1 ;# xmm6=Geps - mulpd xmm7, xmm1 ;# xmm7=Heps2 - addpd xmm5, xmm6 - addpd xmm5, xmm7 ;# xmm5=Fp - addpd xmm7, xmm7 ;# two*Heps2 - movapd xmm3, [rsp + nb400_qq] - addpd xmm7, xmm6 - addpd xmm7, xmm5 ;# xmm7=FF - mulpd xmm5, xmm1 ;# xmm5=eps*Fp - addpd xmm5, xmm4 ;# xmm5=VV - mulpd xmm5, xmm3 ;# vcoul=qq*VV - mulpd xmm3, xmm7 ;# fijC=FF*qq - - mov rsi, [rbp + nb400_dvda] - - ;# Calculate dVda - xorpd xmm7, xmm7 - mulpd xmm3, [rsp + nb400_gbscale] - movapd xmm6, xmm3 - mulpd xmm6, [rsp + nb400_r] - addpd xmm6, xmm5 - - ;# update vctot - addpd xmm12, xmm5 - - ;# xmm6=(vcoul+fijC*r) - subpd xmm7, xmm6 - movapd xmm6, xmm7 - - ;# update dvdasum - addpd xmm8, xmm7 - - ;# update j atoms dvdaj - movhlps xmm7, xmm6 - addsd xmm6, [rsi + r12*8] - addsd xmm7, [rsi + r13*8] - movsd [rsi + r12*8], xmm6 - movsd [rsi + r13*8], xmm7 - - ;# the fj's - start by accumulating forces from memory - mov rdi, [rbp + nb400_faction] - movlpd xmm5, [rdi + r8*8] - movlpd xmm6, [rdi + r8*8 + 8] - movlpd xmm7, [rdi + r8*8 + 16] - movhpd xmm5, [rdi + r9*8] - movhpd xmm6, [rdi + r9*8 + 8] - movhpd xmm7, [rdi + r9*8 + 16] - - xorpd xmm4, xmm4 - - mulpd xmm3, xmm0 - subpd xmm4, xmm3 - - mov rdi, [rbp + nb400_faction] - mulpd xmm9, xmm4 - mulpd xmm10, xmm4 - mulpd xmm11, xmm4 - - addpd xmm5, xmm9 - addpd xmm6, xmm10 - addpd xmm7, xmm11 - - ;# now update f_i - addpd xmm13, xmm9 - addpd xmm14, xmm10 - addpd xmm15, xmm11 - - movlpd [rdi + r8*8], xmm5 - movlpd [rdi + r8*8 + 8], xmm6 - movlpd [rdi + r8*8 + 16], xmm7 - movhpd [rdi + r9*8], xmm5 - movhpd [rdi + r9*8 + 8], xmm6 - movhpd [rdi + r9*8 + 16], xmm7 - - ;# should we do one more iteration? - sub dword ptr [rsp + nb400_innerk], 2 - jl .nb400_checksingle - jmp .nb400_unroll_loop -.nb400_checksingle: - mov edx, [rsp + nb400_innerk] - and edx, 1 - jnz .nb400_dosingle - jmp .nb400_updateouterdata -.nb400_dosingle: - mov rsi, [rbp + nb400_charge] - mov rdx, [rbp + nb400_invsqrta] - mov rdi, [rbp + nb400_pos] - mov rcx, [rsp + nb400_innerjjnr] - mov eax, [rcx] - - ;# load isaj - mov rsi, [rbp + nb400_invsqrta] - movsd xmm2, [rsi + rax*8] - mulsd xmm2, [rsp + nb400_isai] - movapd [rsp + nb400_isaprod], xmm2 - movapd xmm1, xmm2 - mulsd xmm1, [rsp + nb400_gbtsc] - movapd [rsp + nb400_gbscale], xmm1 - - mulsd xmm2, [rsp + nb400_iq] - mov rsi, [rbp + nb400_charge] ;# base of charge[] - movsd xmm3, [rsi + rax*8] - mulsd xmm3, xmm2 - movapd [rsp + nb400_qq], xmm3 - - mov rsi, [rbp + nb400_pos] ;# base of pos[] - - lea r8, [rax + rax*2] ;# j3 - - ;# move coordinate to xmm4-xmm6 - movsd xmm4, [rsi + r8*8] - movsd xmm5, [rsi + r8*8 + 8] - movsd xmm6, [rsi + r8*8 + 16] - - mov rdi, [rbp + nb400_faction] - - ;# calc dr - subsd xmm4, [rsp + nb400_ix] - subsd xmm5, [rsp + nb400_iy] - subsd xmm6, [rsp + nb400_iz] - - ;# store dr - movapd xmm9, xmm4 - movapd xmm10, xmm5 - movapd xmm11, xmm6 - - ;# square it - mulsd xmm4,xmm4 - mulsd xmm5,xmm5 - mulsd xmm6,xmm6 - addsd xmm4, xmm5 - addsd xmm4, xmm6 - ;# rsq in xmm4 - - cvtsd2ss xmm5, xmm4 - rsqrtss xmm5, xmm5 - cvtss2sd xmm2, xmm5 ;# lu in low xmm2 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulsd xmm2, xmm2 ;# lu*lu - movapd xmm1, [rsp + nb400_three] - mulsd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb400_half] - subsd xmm1, xmm2 ;# 30-rsq*lu*lu - mulsd xmm1, xmm5 - mulsd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - movapd xmm5, xmm1 ;# copy of lu - mulsd xmm1, xmm1 ;# lu*lu - movapd xmm2, [rsp + nb400_three] - mulsd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb400_half] - subsd xmm2, xmm1 ;# 30-rsq*lu*lu - mulsd xmm2, xmm5 - mulsd xmm0, xmm2 ;# xmm0=iter2 of rinv (new lu) - mulsd xmm4, xmm0 ;# xmm4=r - - movapd [rsp + nb400_r], xmm4 - mulsd xmm4, [rsp + nb400_gbscale] - - cvttsd2si r10d, xmm4 ;# mm6 = lu idx - cvtsi2sd xmm5, r10d - subsd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - - shl r10d, 2 ;# idx *= 4 - - mov rsi, [rbp + nb400_GBtab] - - movapd xmm4, [rsi + r10*8] ;# Y1 F1 - movhlps xmm5, xmm4 - movapd xmm6, [rsi + r10*8 + 16] ;# G1 H1 - movhlps xmm7, xmm6 - ;# coulomb table ready, in xmm4-xmm7 - - mulsd xmm7, xmm1 ;# xmm7=Heps - mulsd xmm6, xmm1 ;# xmm6=Geps - mulsd xmm7, xmm1 ;# xmm7=Heps2 - addsd xmm5, xmm6 - addsd xmm5, xmm7 ;# xmm5=Fp - addsd xmm7, xmm7 ;# two*Heps2 - movapd xmm3, [rsp + nb400_qq] - addsd xmm7, xmm6 - addsd xmm7, xmm5 ;# xmm7=FF - mulsd xmm5, xmm1 ;# xmm5=eps*Fp - addsd xmm5, xmm4 ;# xmm5=VV - mulsd xmm5, xmm3 ;# vcoul=qq*VV - mulsd xmm3, xmm7 ;# fijC=FF*qq - - mov rsi, [rbp + nb400_dvda] - - ;# Calculate dVda - xorpd xmm7, xmm7 - mulsd xmm3, [rsp + nb400_gbscale] - movapd xmm6, xmm3 - mulsd xmm6, [rsp + nb400_r] - addsd xmm6, xmm5 - - ;# update vctot - addsd xmm12, xmm5 - - ;# xmm6=(vcoul+fijC*r) - subsd xmm7, xmm6 - movapd xmm6, xmm7 - - ;# update dvdasum - addsd xmm8, xmm7 - - ;# update j atoms dvdaj - addsd xmm6, [rsi + rax*8] - movsd [rsi + rax*8], xmm6 - - xorpd xmm4, xmm4 - - mulsd xmm3, xmm0 - subsd xmm4, xmm3 - - mov rdi, [rbp + nb400_faction] - mulsd xmm9, xmm4 - mulsd xmm10, xmm4 - mulsd xmm11, xmm4 - - ;# now update f_i - addsd xmm13, xmm9 - addsd xmm14, xmm10 - addsd xmm15, xmm11 - - ;# the fj's - start by accumulating forces from memory - mov rdi, [rbp + nb400_faction] - addsd xmm9, [rdi + r8*8] - addsd xmm10, [rdi + r8*8 + 8] - addsd xmm11, [rdi + r8*8 + 16] - movsd [rdi + r8*8], xmm9 - movsd [rdi + r8*8 + 8], xmm10 - movsd [rdi + r8*8 + 16], xmm11 - -.nb400_updateouterdata: - mov ecx, [rsp + nb400_ii3] - mov rdi, [rbp + nb400_faction] - mov rsi, [rbp + nb400_fshift] - mov edx, [rsp + nb400_is3] - - ;# accumulate i forces in xmm13, xmm14, xmm15 - movhlps xmm3, xmm13 - movhlps xmm4, xmm14 - movhlps xmm5, xmm15 - addsd xmm13, xmm3 - addsd xmm14, xmm4 - addsd xmm15, xmm5 ;# sum is in low xmm13-xmm15 - - ;# increment i force - movsd xmm3, [rdi + rcx*8] - movsd xmm4, [rdi + rcx*8 + 8] - movsd xmm5, [rdi + rcx*8 + 16] - subsd xmm3, xmm13 - subsd xmm4, xmm14 - subsd xmm5, xmm15 - movsd [rdi + rcx*8], xmm3 - movsd [rdi + rcx*8 + 8], xmm4 - movsd [rdi + rcx*8 + 16], xmm5 - - ;# increment fshift force - movsd xmm3, [rsi + rdx*8] - movsd xmm4, [rsi + rdx*8 + 8] - movsd xmm5, [rsi + rdx*8 + 16] - subsd xmm3, xmm13 - subsd xmm4, xmm14 - subsd xmm5, xmm15 - movsd [rsi + rdx*8], xmm3 - movsd [rsi + rdx*8 + 8], xmm4 - movsd [rsi + rdx*8 + 16], xmm5 - - ;# get n from stack - mov esi, [rsp + nb400_n] - ;# get group index for i particle - mov rdx, [rbp + nb400_gid] ;# base of gid[] - mov edx, [rdx + rsi*4] ;# ggid=gid[n] - - ;# accumulate total coulomb energy and update it - movhlps xmm6, xmm12 - addsd xmm12, xmm6 ;# low xmm12 have the sum now - - ;# add earlier value from mem - mov rax, [rbp + nb400_Vc] - addsd xmm12, [rax + rdx*8] - ;# move back to mem - movsd [rax + rdx*8], xmm12 - - ;# accumulate dVda and update it - movhlps xmm6, xmm8 - addsd xmm8, xmm6 ;# low xmm8 has the sum now - - mov edx, [rsp + nb400_ii] - mov rax, [rbp + nb400_dvda] - addsd xmm8, [rax + rdx*8] - movsd [rax + rdx*8], xmm8 - - ;# finish if last - mov ecx, [rsp + nb400_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb400_outerend - - ;# not last, iterate outer loop once more! - mov [rsp + nb400_n], esi - jmp .nb400_outer -.nb400_outerend: - ;# check if more outer neighborlists remain - mov ecx, [rsp + nb400_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb400_end - ;# non-zero, do one more workunit - jmp .nb400_threadloop -.nb400_end: - mov eax, [rsp + nb400_nouter] - mov ebx, [rsp + nb400_ninner] - mov rcx, [rbp + nb400_outeriter] - mov rdx, [rbp + nb400_inneriter] - mov [rcx], eax - mov [rdx], ebx - - add rsp, 440 - emms - - - pop r15 - pop r14 - pop r13 - pop r12 - - pop rbx - pop rbp - ret - - - - - - - -.globl nb_kernel400nf_x86_64_sse2 -.globl _nb_kernel400nf_x86_64_sse2 -nb_kernel400nf_x86_64_sse2: -_nb_kernel400nf_x86_64_sse2: -.equiv nb400nf_fshift, 16 -.equiv nb400nf_gid, 24 -.equiv nb400nf_pos, 32 -.equiv nb400nf_faction, 40 -.equiv nb400nf_charge, 48 -.equiv nb400nf_p_facel, 56 -.equiv nb400nf_argkrf, 64 -.equiv nb400nf_argcrf, 72 -.equiv nb400nf_Vc, 80 -.equiv nb400nf_type, 88 -.equiv nb400nf_p_ntype, 96 -.equiv nb400nf_vdwparam, 104 -.equiv nb400nf_Vvdw, 112 -.equiv nb400nf_p_tabscale, 120 -.equiv nb400nf_VFtab, 128 -.equiv nb400nf_invsqrta, 136 -.equiv nb400nf_dvda, 144 -.equiv nb400nf_p_gbtabscale, 152 -.equiv nb400nf_GBtab, 160 -.equiv nb400nf_p_nthreads, 168 -.equiv nb400nf_count, 176 -.equiv nb400nf_mtx, 184 -.equiv nb400nf_outeriter, 192 -.equiv nb400nf_inneriter, 200 -.equiv nb400nf_work, 208 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse2 use -.equiv nb400nf_ix, 0 -.equiv nb400nf_iy, 16 -.equiv nb400nf_iz, 32 -.equiv nb400nf_iq, 48 -.equiv nb400nf_gbtsc, 64 -.equiv nb400nf_qq, 80 -.equiv nb400nf_vctot, 96 -.equiv nb400nf_half, 112 -.equiv nb400nf_three, 128 -.equiv nb400nf_isai, 144 -.equiv nb400nf_isaprod, 160 -.equiv nb400nf_gbscale, 176 -.equiv nb400nf_nri, 192 -.equiv nb400nf_iinr, 200 -.equiv nb400nf_jindex, 208 -.equiv nb400nf_jjnr, 216 -.equiv nb400nf_shift, 224 -.equiv nb400nf_shiftvec, 232 -.equiv nb400nf_facel, 240 -.equiv nb400nf_innerjjnr, 248 -.equiv nb400nf_is3, 256 -.equiv nb400nf_ii3, 260 -.equiv nb400nf_innerk, 264 -.equiv nb400nf_n, 268 -.equiv nb400nf_nn1, 272 -.equiv nb400nf_nouter, 276 -.equiv nb400nf_ninner, 280 - push rbp - mov rbp, rsp - push rbx - - - emms - - push r12 - push r13 - push r14 - push r15 - - sub rsp, 296 ;# local variable stack space (n*16+8) - - ;# zero 32-bit iteration counters - mov eax, 0 - mov [rsp + nb400nf_nouter], eax - mov [rsp + nb400nf_ninner], eax - - mov edi, [rdi] - mov [rsp + nb400nf_nri], edi - mov [rsp + nb400nf_iinr], rsi - mov [rsp + nb400nf_jindex], rdx - mov [rsp + nb400nf_jjnr], rcx - mov [rsp + nb400nf_shift], r8 - mov [rsp + nb400nf_shiftvec], r9 - mov rsi, [rbp + nb400nf_p_facel] - movsd xmm0, [rsi] - movsd [rsp + nb400nf_facel], xmm0 - - mov rbx, [rbp + nb400nf_p_gbtabscale] - movsd xmm4, [rbx] - shufpd xmm4, xmm4, 0 - movapd [rsp + nb400nf_gbtsc], xmm4 - - ;# create constant floating-point factors on stack - mov eax, 0x00000000 ;# lower half of double half IEEE (hex) - mov ebx, 0x3fe00000 - mov [rsp + nb400nf_half], eax - mov [rsp + nb400nf_half+4], ebx - movsd xmm1, [rsp + nb400nf_half] - shufpd xmm1, xmm1, 0 ;# splat to all elements - movapd xmm3, xmm1 - addpd xmm3, xmm3 ;# one - movapd xmm2, xmm3 - addpd xmm2, xmm2 ;# two - addpd xmm3, xmm2 ;# three - movapd [rsp + nb400nf_half], xmm1 - movapd [rsp + nb400nf_three], xmm3 - -.nb400nf_threadloop: - mov rsi, [rbp + nb400nf_count] ;# pointer to sync counter - mov eax, [rsi] -.nb400nf_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb400nf_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [rsp + nb400nf_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [rsp + nb400nf_n], eax - mov [rsp + nb400nf_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb400nf_outerstart - jmp .nb400nf_end - -.nb400nf_outerstart: - ;# ebx contains number of outer iterations - add ebx, [rsp + nb400nf_nouter] - mov [rsp + nb400nf_nouter], ebx - -.nb400nf_outer: - mov rax, [rsp + nb400nf_shift] ;# rax = pointer into shift[] - mov ebx, [rax+rsi*4] ;# rbx=shift[n] - - lea rbx, [rbx + rbx*2] ;# rbx=3*is - mov [rsp + nb400nf_is3],ebx ;# store is3 - - mov rax, [rsp + nb400nf_shiftvec] ;# rax = base of shiftvec[] - - movsd xmm0, [rax + rbx*8] - movsd xmm1, [rax + rbx*8 + 8] - movsd xmm2, [rax + rbx*8 + 16] - - mov rcx, [rsp + nb400nf_iinr] ;# rcx = pointer into iinr[] - mov ebx, [rcx+rsi*4] ;# ebx =ii - - mov rdx, [rbp + nb400nf_charge] - movsd xmm3, [rdx + rbx*8] - mulsd xmm3, [rsp + nb400nf_facel] - shufpd xmm3, xmm3, 0 - - mov rdx, [rbp + nb400nf_invsqrta] ;# load invsqrta[ii] - movsd xmm4, [rdx + rbx*8] - shufpd xmm4, xmm4, 0 - - lea rbx, [rbx + rbx*2] ;# rbx = 3*ii=ii3 - mov rax, [rbp + nb400nf_pos] ;# rax = base of pos[] - - addsd xmm0, [rax + rbx*8] - addsd xmm1, [rax + rbx*8 + 8] - addsd xmm2, [rax + rbx*8 + 16] - - movapd [rsp + nb400nf_iq], xmm3 - movapd [rsp + nb400nf_isai], xmm4 - - shufpd xmm0, xmm0, 0 - shufpd xmm1, xmm1, 0 - shufpd xmm2, xmm2, 0 - - movapd [rsp + nb400nf_ix], xmm0 - movapd [rsp + nb400nf_iy], xmm1 - movapd [rsp + nb400nf_iz], xmm2 - - mov [rsp + nb400nf_ii3], ebx - - ;# clear vctot - xorpd xmm4, xmm4 - movapd [rsp + nb400nf_vctot], xmm4 - - mov rax, [rsp + nb400nf_jindex] - mov ecx, [rax + rsi*4] ;# jindex[n] - mov edx, [rax + rsi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov rsi, [rbp + nb400nf_pos] - mov rdi, [rbp + nb400nf_faction] - mov rax, [rsp + nb400nf_jjnr] - shl ecx, 2 - add rax, rcx - mov [rsp + nb400nf_innerjjnr], rax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 2 - add ecx, [rsp + nb400nf_ninner] - mov [rsp + nb400nf_ninner], ecx - add edx, 0 - mov [rsp + nb400nf_innerk], edx ;# number of innerloop atoms - jge .nb400nf_unroll_loop - jmp .nb400nf_checksingle -.nb400nf_unroll_loop: - ;# twice unrolled innerloop here - mov rdx, [rsp + nb400nf_innerjjnr] ;# pointer to jjnr[k] - mov eax, [rdx] - mov ebx, [rdx + 4] - add qword ptr [rsp + nb400nf_innerjjnr], 8 ;# advance pointer (unrolled 2) - - ;# load isa2 - mov rsi, [rbp + nb400nf_invsqrta] - movlpd xmm2, [rsi + rax*8] - movhpd xmm2, [rsi + rbx*8] - mulpd xmm2, [rsp + nb400nf_isai] - movapd [rsp + nb400nf_isaprod], xmm2 - movapd xmm1, xmm2 - mulpd xmm1, [rsp + nb400nf_gbtsc] - movapd [rsp + nb400nf_gbscale], xmm1 - - mov rsi, [rbp + nb400nf_charge] ;# base of charge[] - movlpd xmm3, [rsi + rax*8] - movhpd xmm3, [rsi + rbx*8] - - mulpd xmm2, [rsp + nb400nf_iq] - mulpd xmm3, xmm2 - movapd [rsp + nb400nf_qq], xmm3 - - mov rsi, [rbp + nb400nf_pos] ;# base of pos[] - - lea rax, [rax + rax*2] ;# replace jnr with j3 - lea rbx, [rbx + rbx*2] - - ;# move two coordinates to xmm0-xmm2 - movlpd xmm0, [rsi + rax*8] - movlpd xmm1, [rsi + rax*8 + 8] - movlpd xmm2, [rsi + rax*8 + 16] - movhpd xmm0, [rsi + rbx*8] - movhpd xmm1, [rsi + rbx*8 + 8] - movhpd xmm2, [rsi + rbx*8 + 16] - - mov rdi, [rbp + nb400nf_faction] - - ;# move nb400nf_ix-iz to xmm4-xmm6 - movapd xmm4, [rsp + nb400nf_ix] - movapd xmm5, [rsp + nb400nf_iy] - movapd xmm6, [rsp + nb400nf_iz] - - ;# calc dr - subpd xmm4, xmm0 - subpd xmm5, xmm1 - subpd xmm6, xmm2 - - ;# square it - mulpd xmm4,xmm4 - mulpd xmm5,xmm5 - mulpd xmm6,xmm6 - addpd xmm4, xmm5 - addpd xmm4, xmm6 - ;# rsq in xmm4 - - cvtpd2ps xmm5, xmm4 - rsqrtps xmm5, xmm5 - cvtps2pd xmm2, xmm5 ;# lu in low xmm2 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulpd xmm2, xmm2 ;# lu*lu - movapd xmm1, [rsp + nb400nf_three] - mulpd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb400nf_half] - subpd xmm1, xmm2 ;# 30-rsq*lu*lu - mulpd xmm1, xmm5 - mulpd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - movapd xmm5, xmm1 ;# copy of lu - mulpd xmm1, xmm1 ;# lu*lu - movapd xmm2, [rsp + nb400nf_three] - mulpd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb400nf_half] - subpd xmm2, xmm1 ;# 30-rsq*lu*lu - mulpd xmm2, xmm5 - mulpd xmm0, xmm2 ;# xmm0=iter2 of rinv (new lu) - mulpd xmm4, xmm0 ;# xmm4=r - mulpd xmm4, [rsp + nb400nf_gbscale] - - cvttpd2pi mm6, xmm4 ;# mm6 = lu idx - cvtpi2pd xmm5, mm6 - subpd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - movapd xmm2, xmm1 - mulpd xmm2, xmm2 ;# xmm2=eps2 - - pslld mm6, 2 ;# idx *= 4 - - movd mm0, eax - movd mm1, ebx - - mov rsi, [rbp + nb400nf_GBtab] - movd eax, mm6 - psrlq mm6, 32 - movd ebx, mm6 ;# indices in eax/ebx - - movapd xmm4, [rsi + rax*8] ;# Y1 F1 - movapd xmm3, [rsi + rbx*8] ;# Y2 F2 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 Y2 - unpckhpd xmm5, xmm3 ;# F1 F2 - - movapd xmm6, [rsi + rax*8 + 16] ;# G1 H1 - movapd xmm3, [rsi + rbx*8 + 16] ;# G2 H2 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 G2 - unpckhpd xmm7, xmm3 ;# H1 H2 - ;# coulomb table ready, in xmm4-xmm7 - mulpd xmm6, xmm1 ;# xmm6=Geps - mulpd xmm7, xmm2 ;# xmm7=Heps2 - addpd xmm5, xmm6 - addpd xmm5, xmm7 ;# xmm5=Fp - movapd xmm3, [rsp + nb400nf_qq] - mulpd xmm5, xmm1 ;# xmm5=eps*Fp - addpd xmm5, xmm4 ;# xmm5=VV - mulpd xmm5, xmm3 ;# vcoul=qq*VV - addpd xmm5, [rsp + nb400nf_vctot] - movapd [rsp + nb400nf_vctot], xmm5 - - ;# should we do one more iteration? - sub dword ptr [rsp + nb400nf_innerk], 2 - jl .nb400nf_checksingle - jmp .nb400nf_unroll_loop -.nb400nf_checksingle: - mov edx, [rsp + nb400nf_innerk] - and edx, 1 - jnz .nb400nf_dosingle - jmp .nb400nf_updateouterdata -.nb400nf_dosingle: - mov rsi, [rbp + nb400nf_charge] - mov rdx, [rbp + nb400nf_invsqrta] - mov rdi, [rbp + nb400nf_pos] - mov rcx, [rsp + nb400nf_innerjjnr] - mov eax, [rcx] - xorpd xmm6, xmm6 - movapd xmm7, xmm6 - movsd xmm7, [rdx + rax*8] - movlpd xmm6, [rsi + rax*8] ;# xmm6(0) has the charge - mulsd xmm7, [rsp + nb400nf_isai] - movapd [rsp + nb400nf_isaprod], xmm7 - movapd xmm1, xmm7 - mulpd xmm1, [rsp + nb400nf_gbtsc] - movapd [rsp + nb400nf_gbscale], xmm1 - - mulsd xmm7, [rsp + nb400nf_iq] - mulsd xmm6, xmm7 - movapd [rsp + nb400nf_qq], xmm6 - - lea rax, [rax + rax*2] - - ;# move coordinates to xmm0-xmm2 - movlpd xmm0, [rdi + rax*8] - movlpd xmm1, [rdi + rax*8 + 8] - movlpd xmm2, [rdi + rax*8 + 16] - - ;# move nb400nf_ix-iz to xmm4-xmm6 - movapd xmm4, [rsp + nb400nf_ix] - movapd xmm5, [rsp + nb400nf_iy] - movapd xmm6, [rsp + nb400nf_iz] - - ;# calc dr - subsd xmm4, xmm0 - subsd xmm5, xmm1 - subsd xmm6, xmm2 - - ;# square it - mulsd xmm4,xmm4 - mulsd xmm5,xmm5 - mulsd xmm6,xmm6 - addsd xmm4, xmm5 - addsd xmm4, xmm6 - ;# rsq in xmm4 - - cvtsd2ss xmm5, xmm4 - rsqrtss xmm5, xmm5 - cvtss2sd xmm2, xmm5 ;# lu in low xmm2 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulsd xmm2, xmm2 ;# lu*lu - movapd xmm1, [rsp + nb400nf_three] - mulsd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb400nf_half] - subsd xmm1, xmm2 ;# 30-rsq*lu*lu - mulsd xmm1, xmm5 - mulsd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - movapd xmm5, xmm1 ;# copy of lu - mulsd xmm1, xmm1 ;# lu*lu - movapd xmm2, [rsp + nb400nf_three] - mulsd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb400nf_half] - subsd xmm2, xmm1 ;# 30-rsq*lu*lu - mulsd xmm2, xmm5 - mulsd xmm0, xmm2 ;# xmm0=iter2 of rinv (new lu) - - mulsd xmm4, xmm0 ;# xmm4=r - mulsd xmm4, [rsp + nb400nf_gbscale] - - movd mm0, eax - - cvttsd2si eax, xmm4 ;# mm6 = lu idx - cvtsi2sd xmm5, eax - subsd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - movapd xmm2, xmm1 - mulsd xmm2, xmm2 ;# xmm2=eps2 - - shl eax, 2 ;# idx *= 4 - - mov rsi, [rbp + nb400nf_GBtab] - - ;# Coulomb - movapd xmm4, [rsi + rax*8] ;# Y1 F1 - xorpd xmm3, xmm3 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 - unpckhpd xmm5, xmm3 ;# F1 - - movapd xmm6, [rsi + rax*8 + 16] ;# G1 H1 - xorpd xmm3, xmm3 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 - unpckhpd xmm7, xmm3 ;# H1 - ;# table ready in xmm4-xmm7 - - mulsd xmm6, xmm1 ;# xmm6=Geps - mulsd xmm7, xmm2 ;# xmm7=Heps2 - addsd xmm5, xmm6 - addsd xmm5, xmm7 ;# xmm5=Fp - movapd xmm3, [rsp + nb400nf_qq] - mulsd xmm5, xmm1 ;# xmm5=eps*Fp - addsd xmm5, xmm4 ;# xmm5=VV - mulsd xmm5, xmm3 ;# vcoul=qq*VV - addsd xmm5, [rsp + nb400nf_vctot] - movsd [rsp + nb400nf_vctot], xmm5 - -.nb400nf_updateouterdata: - ;# get n from stack - mov esi, [rsp + nb400nf_n] - ;# get group index for i particle - mov rdx, [rbp + nb400nf_gid] ;# base of gid[] - mov edx, [rdx + rsi*4] ;# ggid=gid[n] - - ;# accumulate total potential energy and update it - movapd xmm7, [rsp + nb400nf_vctot] - ;# accumulate - movhlps xmm6, xmm7 - addsd xmm7, xmm6 ;# low xmm7 has the sum now - - ;# add earlier value from mem - mov rax, [rbp + nb400nf_Vc] - addsd xmm7, [rax + rdx*8] - ;# move back to mem - movsd [rax + rdx*8], xmm7 - - ;# finish if last - mov ecx, [rsp + nb400nf_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb400nf_outerend - - ;# not last, iterate outer loop once more! - mov [rsp + nb400nf_n], esi - jmp .nb400nf_outer -.nb400nf_outerend: - ;# check if more outer neighborlists remain - mov ecx, [rsp + nb400nf_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb400nf_end - ;# non-zero, do one more workunit - jmp .nb400nf_threadloop -.nb400nf_end: - - mov eax, [rsp + nb400nf_nouter] - mov ebx, [rsp + nb400nf_ninner] - mov rcx, [rbp + nb400nf_outeriter] - mov rdx, [rbp + nb400nf_inneriter] - mov [rcx], eax - mov [rdx], ebx - - add rsp, 296 - emms - - - pop r15 - pop r14 - pop r13 - pop r12 - - pop rbx - pop rbp - ret - - - - diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel400_x86_64_sse2.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel400_x86_64_sse2.s deleted file mode 100644 index b75ce2037c..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel400_x86_64_sse2.s +++ /dev/null @@ -1,1212 +0,0 @@ -## -## -## Gromacs 4.0 Copyright (c) 1991-2003 -## David van der Spoel, Erik Lindahl -## -## This program is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License -## as published by the Free Software Foundation; either version 2 -## of the License, or (at your option) any later version. -## -## To help us fund GROMACS development, we humbly ask that you cite -## the research papers on the package. Check out http://www.gromacs.org -## -## And Hey: -## Gnomes, ROck Monsters And Chili Sauce -## - - - - - -.globl nb_kernel400_x86_64_sse2 -.globl _nb_kernel400_x86_64_sse2 -nb_kernel400_x86_64_sse2: -_nb_kernel400_x86_64_sse2: -## Room for return address and rbp (16 bytes) -.set nb400_fshift, 16 -.set nb400_gid, 24 -.set nb400_pos, 32 -.set nb400_faction, 40 -.set nb400_charge, 48 -.set nb400_p_facel, 56 -.set nb400_argkrf, 64 -.set nb400_argcrf, 72 -.set nb400_Vc, 80 -.set nb400_type, 88 -.set nb400_p_ntype, 96 -.set nb400_vdwparam, 104 -.set nb400_Vvdw, 112 -.set nb400_p_tabscale, 120 -.set nb400_VFtab, 128 -.set nb400_invsqrta, 136 -.set nb400_dvda, 144 -.set nb400_p_gbtabscale, 152 -.set nb400_GBtab, 160 -.set nb400_p_nthreads, 168 -.set nb400_count, 176 -.set nb400_mtx, 184 -.set nb400_outeriter, 192 -.set nb400_inneriter, 200 -.set nb400_work, 208 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse2 use -.set nb400_ix, 0 -.set nb400_iy, 16 -.set nb400_iz, 32 -.set nb400_iq, 48 -.set nb400_dx, 64 -.set nb400_dy, 80 -.set nb400_dz, 96 -.set nb400_two, 112 -.set nb400_gbtsc, 128 -.set nb400_qq, 144 -.set nb400_r, 160 -.set nb400_vctot, 176 -.set nb400_fix, 192 -.set nb400_fiy, 208 -.set nb400_fiz, 224 -.set nb400_half, 240 -.set nb400_three, 256 -.set nb400_isai, 272 -.set nb400_isaprod, 288 -.set nb400_dvdasum, 304 -.set nb400_gbscale, 320 -.set nb400_nri, 336 -.set nb400_iinr, 344 -.set nb400_jindex, 352 -.set nb400_jjnr, 360 -.set nb400_shift, 368 -.set nb400_shiftvec, 376 -.set nb400_facel, 384 -.set nb400_innerjjnr, 392 -.set nb400_is3, 400 -.set nb400_ii3, 404 -.set nb400_ii, 408 -.set nb400_innerk, 412 -.set nb400_n, 416 -.set nb400_nn1, 420 -.set nb400_nouter, 424 -.set nb400_ninner, 428 - push %rbp - movq %rsp,%rbp - push %rbx - - - emms - - push %r12 - push %r13 - push %r14 - push %r15 - - subq $440,%rsp ## local variable stack space (n*16+8) - - ## zero 32-bit iteration counters - movl $0,%eax - movl %eax,nb400_nouter(%rsp) - movl %eax,nb400_ninner(%rsp) - - movl (%rdi),%edi - movl %edi,nb400_nri(%rsp) - movq %rsi,nb400_iinr(%rsp) - movq %rdx,nb400_jindex(%rsp) - movq %rcx,nb400_jjnr(%rsp) - movq %r8,nb400_shift(%rsp) - movq %r9,nb400_shiftvec(%rsp) - movq nb400_p_facel(%rbp),%rsi - movsd (%rsi),%xmm0 - movsd %xmm0,nb400_facel(%rsp) - - movq nb400_p_gbtabscale(%rbp),%rbx - movsd (%rbx),%xmm4 - shufpd $0,%xmm4,%xmm4 - movapd %xmm4,nb400_gbtsc(%rsp) - - ## create constant floating-point factors on stack - movl $0x00000000,%eax ## lower half of double half IEEE (hex) - movl $0x3fe00000,%ebx - movl %eax,nb400_half(%rsp) - movl %ebx,nb400_half+4(%rsp) - movsd nb400_half(%rsp),%xmm1 - shufpd $0,%xmm1,%xmm1 ## splat to all elements - movapd %xmm1,%xmm3 - addpd %xmm3,%xmm3 ## one - movapd %xmm3,%xmm2 - addpd %xmm2,%xmm2 ## two - addpd %xmm2,%xmm3 ## three - movapd %xmm1,nb400_half(%rsp) - movapd %xmm2,nb400_two(%rsp) - movapd %xmm3,nb400_three(%rsp) - -_nb_kernel400_x86_64_sse2.nb400_threadloop: - movq nb400_count(%rbp),%rsi ## pointer to sync counter - movl (%rsi),%eax -_nb_kernel400_x86_64_sse2.nb400_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%rsi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel400_x86_64_sse2.nb400_spinlock - - ## if(nn1>nri) nn1=nri - movl nb400_nri(%rsp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb400_n(%rsp) - movl %ebx,nb400_nn1(%rsp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel400_x86_64_sse2.nb400_outerstart - jmp _nb_kernel400_x86_64_sse2.nb400_end - -_nb_kernel400_x86_64_sse2.nb400_outerstart: - ## ebx contains number of outer iterations - addl nb400_nouter(%rsp),%ebx - movl %ebx,nb400_nouter(%rsp) - -_nb_kernel400_x86_64_sse2.nb400_outer: - movq nb400_shift(%rsp),%rax ## rax = pointer into shift[] - movl (%rax,%rsi,4),%ebx ## rbx=shift[n] - - lea (%rbx,%rbx,2),%rbx ## rbx=3*is - movl %ebx,nb400_is3(%rsp) ## store is3 - - movq nb400_shiftvec(%rsp),%rax ## rax = base of shiftvec[] - - movsd (%rax,%rbx,8),%xmm0 - movsd 8(%rax,%rbx,8),%xmm1 - movsd 16(%rax,%rbx,8),%xmm2 - - movq nb400_iinr(%rsp),%rcx ## rcx = pointer into iinr[] - movl (%rcx,%rsi,4),%ebx ## ebx =ii - movl %ebx,nb400_ii(%rsp) - - movq nb400_charge(%rbp),%rdx - movsd (%rdx,%rbx,8),%xmm3 - mulsd nb400_facel(%rsp),%xmm3 - shufpd $0,%xmm3,%xmm3 - - movq nb400_invsqrta(%rbp),%rdx ## load invsqrta[ii] - movsd (%rdx,%rbx,8),%xmm4 - shufpd $0,%xmm4,%xmm4 - - lea (%rbx,%rbx,2),%rbx ## rbx = 3*ii=ii3 - movq nb400_pos(%rbp),%rax ## rax = base of pos[] - - addsd (%rax,%rbx,8),%xmm0 - addsd 8(%rax,%rbx,8),%xmm1 - addsd 16(%rax,%rbx,8),%xmm2 - - movapd %xmm3,nb400_iq(%rsp) - movapd %xmm4,nb400_isai(%rsp) - - shufpd $0,%xmm0,%xmm0 - shufpd $0,%xmm1,%xmm1 - shufpd $0,%xmm2,%xmm2 - - movapd %xmm0,nb400_ix(%rsp) - movapd %xmm1,nb400_iy(%rsp) - movapd %xmm2,nb400_iz(%rsp) - - movl %ebx,nb400_ii3(%rsp) - - ## clear vctot and i forces - xorpd %xmm4,%xmm4 - movapd %xmm4,%xmm8 - movapd %xmm4,%xmm12 - movapd %xmm4,%xmm13 - movapd %xmm4,%xmm14 - movapd %xmm4,%xmm15 - - movq nb400_jindex(%rsp),%rax - movl (%rax,%rsi,4),%ecx ## jindex[n] - movl 4(%rax,%rsi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movq nb400_pos(%rbp),%rsi - movq nb400_faction(%rbp),%rdi - movq nb400_jjnr(%rsp),%rax - shll $2,%ecx - addq %rcx,%rax - movq %rax,nb400_innerjjnr(%rsp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $2,%edx - addl nb400_ninner(%rsp),%ecx - movl %ecx,nb400_ninner(%rsp) - addl $0,%edx - movl %edx,nb400_innerk(%rsp) ## number of innerloop atoms - jge _nb_kernel400_x86_64_sse2.nb400_unroll_loop - jmp _nb_kernel400_x86_64_sse2.nb400_checksingle -_nb_kernel400_x86_64_sse2.nb400_unroll_loop: - ## twice unrolled innerloop here - movq nb400_innerjjnr(%rsp),%rdx ## pointer to jjnr[k] - movl (%rdx),%r12d - movl 4(%rdx),%r13d - addq $8,nb400_innerjjnr(%rsp) ## advance pointer (unrolled 2) - - movq nb400_pos(%rbp),%rsi ## base of pos[] - - lea (%r12,%r12,2),%r8 ## j3 - lea (%r13,%r13,2),%r9 - - ## move two coordinates to xmm4-xmm6 - movlpd (%rsi,%r8,8),%xmm4 - movlpd 8(%rsi,%r8,8),%xmm5 - movlpd 16(%rsi,%r8,8),%xmm6 - movhpd (%rsi,%r9,8),%xmm4 - movhpd 8(%rsi,%r9,8),%xmm5 - movhpd 16(%rsi,%r9,8),%xmm6 - - ## calc dr - subpd nb400_ix(%rsp),%xmm4 - subpd nb400_iy(%rsp),%xmm5 - subpd nb400_iz(%rsp),%xmm6 - - - ## store dr - movapd %xmm4,%xmm9 - movapd %xmm5,%xmm10 - movapd %xmm6,%xmm11 - - ## square it - mulpd %xmm4,%xmm4 - mulpd %xmm5,%xmm5 - mulpd %xmm6,%xmm6 - addpd %xmm5,%xmm4 - addpd %xmm6,%xmm4 - ## rsq in xmm4 - - movq nb400_invsqrta(%rbp),%rsi - movlpd (%rsi,%r12,8),%xmm3 - - cvtpd2ps %xmm4,%xmm5 - rsqrtps %xmm5,%xmm5 - cvtps2pd %xmm5,%xmm2 ## lu in low xmm2 - - movhpd (%rsi,%r13,8),%xmm3 - mulpd nb400_isai(%rsp),%xmm3 - movapd %xmm3,nb400_isaprod(%rsp) - movapd %xmm3,%xmm6 - mulpd nb400_gbtsc(%rsp),%xmm3 - movapd %xmm3,nb400_gbscale(%rsp) - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulpd %xmm2,%xmm2 ## lu*lu - movapd nb400_three(%rsp),%xmm1 - mulpd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb400_half(%rsp),%xmm0 - subpd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm1 - mulpd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - movq nb400_charge(%rbp),%rsi ## base of charge[] - movlpd (%rsi,%r12,8),%xmm3 - - movapd %xmm1,%xmm5 ## copy of lu - mulpd %xmm1,%xmm1 ## lu*lu - movapd nb400_three(%rsp),%xmm2 - mulpd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb400_half(%rsp),%xmm0 - subpd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm2 - mulpd %xmm2,%xmm0 ## xmm0=iter2 of rinv (new lu) - mulpd %xmm0,%xmm4 ## xmm4=r - - mulpd nb400_iq(%rsp),%xmm6 - movhpd (%rsi,%r13,8),%xmm3 - mulpd %xmm6,%xmm3 - movapd %xmm3,nb400_qq(%rsp) - - - movapd %xmm4,nb400_r(%rsp) - mulpd nb400_gbscale(%rsp),%xmm4 - - cvttpd2pi %xmm4,%mm6 ## mm6 = lu idx - cvtpi2pd %mm6,%xmm5 - subpd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - - pslld $2,%mm6 ## idx *= 4 - - movq nb400_GBtab(%rbp),%rsi - movd %mm6,%r10d - psrlq $32,%mm6 - movd %mm6,%r11d ## indices in r10/r11 - - movapd (%rsi,%r10,8),%xmm4 ## Y1 F1 - movapd (%rsi,%r11,8),%xmm3 ## Y2 F2 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 Y2 - unpckhpd %xmm3,%xmm5 ## F1 F2 - - movapd 16(%rsi,%r10,8),%xmm6 ## G1 H1 - movapd 16(%rsi,%r11,8),%xmm3 ## G2 H2 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 G2 - unpckhpd %xmm3,%xmm7 ## H1 H2 - ## coulomb table ready, in xmm4-xmm7 - - mulpd %xmm1,%xmm7 ## xmm7=Heps - mulpd %xmm1,%xmm6 ## xmm6=Geps - mulpd %xmm1,%xmm7 ## xmm7=Heps2 - addpd %xmm6,%xmm5 - addpd %xmm7,%xmm5 ## xmm5=Fp - addpd %xmm7,%xmm7 ## two*Heps2 - movapd nb400_qq(%rsp),%xmm3 - addpd %xmm6,%xmm7 - addpd %xmm5,%xmm7 ## xmm7=FF - mulpd %xmm1,%xmm5 ## xmm5=eps*Fp - addpd %xmm4,%xmm5 ## xmm5=VV - mulpd %xmm3,%xmm5 ## vcoul=qq*VV - mulpd %xmm7,%xmm3 ## fijC=FF*qq - - movq nb400_dvda(%rbp),%rsi - - ## Calculate dVda - xorpd %xmm7,%xmm7 - mulpd nb400_gbscale(%rsp),%xmm3 - movapd %xmm3,%xmm6 - mulpd nb400_r(%rsp),%xmm6 - addpd %xmm5,%xmm6 - - ## update vctot - addpd %xmm5,%xmm12 - - ## xmm6=(vcoul+fijC*r) - subpd %xmm6,%xmm7 - movapd %xmm7,%xmm6 - - ## update dvdasum - addpd %xmm7,%xmm8 - - ## update j atoms dvdaj - movhlps %xmm6,%xmm7 - addsd (%rsi,%r12,8),%xmm6 - addsd (%rsi,%r13,8),%xmm7 - movsd %xmm6,(%rsi,%r12,8) - movsd %xmm7,(%rsi,%r13,8) - - ## the fj's - start by accumulating forces from memory - movq nb400_faction(%rbp),%rdi - movlpd (%rdi,%r8,8),%xmm5 - movlpd 8(%rdi,%r8,8),%xmm6 - movlpd 16(%rdi,%r8,8),%xmm7 - movhpd (%rdi,%r9,8),%xmm5 - movhpd 8(%rdi,%r9,8),%xmm6 - movhpd 16(%rdi,%r9,8),%xmm7 - - xorpd %xmm4,%xmm4 - - mulpd %xmm0,%xmm3 - subpd %xmm3,%xmm4 - - movq nb400_faction(%rbp),%rdi - mulpd %xmm4,%xmm9 - mulpd %xmm4,%xmm10 - mulpd %xmm4,%xmm11 - - addpd %xmm9,%xmm5 - addpd %xmm10,%xmm6 - addpd %xmm11,%xmm7 - - ## now update f_i - addpd %xmm9,%xmm13 - addpd %xmm10,%xmm14 - addpd %xmm11,%xmm15 - - movlpd %xmm5,(%rdi,%r8,8) - movlpd %xmm6,8(%rdi,%r8,8) - movlpd %xmm7,16(%rdi,%r8,8) - movhpd %xmm5,(%rdi,%r9,8) - movhpd %xmm6,8(%rdi,%r9,8) - movhpd %xmm7,16(%rdi,%r9,8) - - ## should we do one more iteration? - subl $2,nb400_innerk(%rsp) - jl _nb_kernel400_x86_64_sse2.nb400_checksingle - jmp _nb_kernel400_x86_64_sse2.nb400_unroll_loop -_nb_kernel400_x86_64_sse2.nb400_checksingle: - movl nb400_innerk(%rsp),%edx - andl $1,%edx - jnz _nb_kernel400_x86_64_sse2.nb400_dosingle - jmp _nb_kernel400_x86_64_sse2.nb400_updateouterdata -_nb_kernel400_x86_64_sse2.nb400_dosingle: - movq nb400_charge(%rbp),%rsi - movq nb400_invsqrta(%rbp),%rdx - movq nb400_pos(%rbp),%rdi - movq nb400_innerjjnr(%rsp),%rcx - movl (%rcx),%eax - - ## load isaj - movq nb400_invsqrta(%rbp),%rsi - movsd (%rsi,%rax,8),%xmm2 - mulsd nb400_isai(%rsp),%xmm2 - movapd %xmm2,nb400_isaprod(%rsp) - movapd %xmm2,%xmm1 - mulsd nb400_gbtsc(%rsp),%xmm1 - movapd %xmm1,nb400_gbscale(%rsp) - - mulsd nb400_iq(%rsp),%xmm2 - movq nb400_charge(%rbp),%rsi ## base of charge[] - movsd (%rsi,%rax,8),%xmm3 - mulsd %xmm2,%xmm3 - movapd %xmm3,nb400_qq(%rsp) - - movq nb400_pos(%rbp),%rsi ## base of pos[] - - lea (%rax,%rax,2),%r8 ## j3 - - ## move coordinate to xmm4-xmm6 - movsd (%rsi,%r8,8),%xmm4 - movsd 8(%rsi,%r8,8),%xmm5 - movsd 16(%rsi,%r8,8),%xmm6 - - movq nb400_faction(%rbp),%rdi - - ## calc dr - subsd nb400_ix(%rsp),%xmm4 - subsd nb400_iy(%rsp),%xmm5 - subsd nb400_iz(%rsp),%xmm6 - - ## store dr - movapd %xmm4,%xmm9 - movapd %xmm5,%xmm10 - movapd %xmm6,%xmm11 - - ## square it - mulsd %xmm4,%xmm4 - mulsd %xmm5,%xmm5 - mulsd %xmm6,%xmm6 - addsd %xmm5,%xmm4 - addsd %xmm6,%xmm4 - ## rsq in xmm4 - - cvtsd2ss %xmm4,%xmm5 - rsqrtss %xmm5,%xmm5 - cvtss2sd %xmm5,%xmm2 ## lu in low xmm2 - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulsd %xmm2,%xmm2 ## lu*lu - movapd nb400_three(%rsp),%xmm1 - mulsd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb400_half(%rsp),%xmm0 - subsd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm1 - mulsd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - movapd %xmm1,%xmm5 ## copy of lu - mulsd %xmm1,%xmm1 ## lu*lu - movapd nb400_three(%rsp),%xmm2 - mulsd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb400_half(%rsp),%xmm0 - subsd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm2 - mulsd %xmm2,%xmm0 ## xmm0=iter2 of rinv (new lu) - mulsd %xmm0,%xmm4 ## xmm4=r - - movapd %xmm4,nb400_r(%rsp) - mulsd nb400_gbscale(%rsp),%xmm4 - - cvttsd2si %xmm4,%r10d ## mm6 = lu idx - cvtsi2sd %r10d,%xmm5 - subsd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - - shll $2,%r10d ## idx *= 4 - - movq nb400_GBtab(%rbp),%rsi - - movapd (%rsi,%r10,8),%xmm4 ## Y1 F1 - movhlps %xmm4,%xmm5 - movapd 16(%rsi,%r10,8),%xmm6 ## G1 H1 - movhlps %xmm6,%xmm7 - ## coulomb table ready, in xmm4-xmm7 - - mulsd %xmm1,%xmm7 ## xmm7=Heps - mulsd %xmm1,%xmm6 ## xmm6=Geps - mulsd %xmm1,%xmm7 ## xmm7=Heps2 - addsd %xmm6,%xmm5 - addsd %xmm7,%xmm5 ## xmm5=Fp - addsd %xmm7,%xmm7 ## two*Heps2 - movapd nb400_qq(%rsp),%xmm3 - addsd %xmm6,%xmm7 - addsd %xmm5,%xmm7 ## xmm7=FF - mulsd %xmm1,%xmm5 ## xmm5=eps*Fp - addsd %xmm4,%xmm5 ## xmm5=VV - mulsd %xmm3,%xmm5 ## vcoul=qq*VV - mulsd %xmm7,%xmm3 ## fijC=FF*qq - - movq nb400_dvda(%rbp),%rsi - - ## Calculate dVda - xorpd %xmm7,%xmm7 - mulsd nb400_gbscale(%rsp),%xmm3 - movapd %xmm3,%xmm6 - mulsd nb400_r(%rsp),%xmm6 - addsd %xmm5,%xmm6 - - ## update vctot - addsd %xmm5,%xmm12 - - ## xmm6=(vcoul+fijC*r) - subsd %xmm6,%xmm7 - movapd %xmm7,%xmm6 - - ## update dvdasum - addsd %xmm7,%xmm8 - - ## update j atoms dvdaj - addsd (%rsi,%rax,8),%xmm6 - movsd %xmm6,(%rsi,%rax,8) - - xorpd %xmm4,%xmm4 - - mulsd %xmm0,%xmm3 - subsd %xmm3,%xmm4 - - movq nb400_faction(%rbp),%rdi - mulsd %xmm4,%xmm9 - mulsd %xmm4,%xmm10 - mulsd %xmm4,%xmm11 - - ## now update f_i - addsd %xmm9,%xmm13 - addsd %xmm10,%xmm14 - addsd %xmm11,%xmm15 - - ## the fj's - start by accumulating forces from memory - movq nb400_faction(%rbp),%rdi - addsd (%rdi,%r8,8),%xmm9 - addsd 8(%rdi,%r8,8),%xmm10 - addsd 16(%rdi,%r8,8),%xmm11 - movsd %xmm9,(%rdi,%r8,8) - movsd %xmm10,8(%rdi,%r8,8) - movsd %xmm11,16(%rdi,%r8,8) - -_nb_kernel400_x86_64_sse2.nb400_updateouterdata: - movl nb400_ii3(%rsp),%ecx - movq nb400_faction(%rbp),%rdi - movq nb400_fshift(%rbp),%rsi - movl nb400_is3(%rsp),%edx - - ## accumulate i forces in xmm13, xmm14, xmm15 - movhlps %xmm13,%xmm3 - movhlps %xmm14,%xmm4 - movhlps %xmm15,%xmm5 - addsd %xmm3,%xmm13 - addsd %xmm4,%xmm14 - addsd %xmm5,%xmm15 ## sum is in low xmm13-xmm15 - - ## increment i force - movsd (%rdi,%rcx,8),%xmm3 - movsd 8(%rdi,%rcx,8),%xmm4 - movsd 16(%rdi,%rcx,8),%xmm5 - subsd %xmm13,%xmm3 - subsd %xmm14,%xmm4 - subsd %xmm15,%xmm5 - movsd %xmm3,(%rdi,%rcx,8) - movsd %xmm4,8(%rdi,%rcx,8) - movsd %xmm5,16(%rdi,%rcx,8) - - ## increment fshift force - movsd (%rsi,%rdx,8),%xmm3 - movsd 8(%rsi,%rdx,8),%xmm4 - movsd 16(%rsi,%rdx,8),%xmm5 - subsd %xmm13,%xmm3 - subsd %xmm14,%xmm4 - subsd %xmm15,%xmm5 - movsd %xmm3,(%rsi,%rdx,8) - movsd %xmm4,8(%rsi,%rdx,8) - movsd %xmm5,16(%rsi,%rdx,8) - - ## get n from stack - movl nb400_n(%rsp),%esi - ## get group index for i particle - movq nb400_gid(%rbp),%rdx ## base of gid[] - movl (%rdx,%rsi,4),%edx ## ggid=gid[n] - - ## accumulate total coulomb energy and update it - movhlps %xmm12,%xmm6 - addsd %xmm6,%xmm12 ## low xmm12 have the sum now - - ## add earlier value from mem - movq nb400_Vc(%rbp),%rax - addsd (%rax,%rdx,8),%xmm12 - ## move back to mem - movsd %xmm12,(%rax,%rdx,8) - - ## accumulate dVda and update it - movhlps %xmm8,%xmm6 - addsd %xmm6,%xmm8 ## low xmm8 has the sum now - - movl nb400_ii(%rsp),%edx - movq nb400_dvda(%rbp),%rax - addsd (%rax,%rdx,8),%xmm8 - movsd %xmm8,(%rax,%rdx,8) - - ## finish if last - movl nb400_nn1(%rsp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel400_x86_64_sse2.nb400_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb400_n(%rsp) - jmp _nb_kernel400_x86_64_sse2.nb400_outer -_nb_kernel400_x86_64_sse2.nb400_outerend: - ## check if more outer neighborlists remain - movl nb400_nri(%rsp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel400_x86_64_sse2.nb400_end - ## non-zero, do one more workunit - jmp _nb_kernel400_x86_64_sse2.nb400_threadloop -_nb_kernel400_x86_64_sse2.nb400_end: - movl nb400_nouter(%rsp),%eax - movl nb400_ninner(%rsp),%ebx - movq nb400_outeriter(%rbp),%rcx - movq nb400_inneriter(%rbp),%rdx - movl %eax,(%rcx) - movl %ebx,(%rdx) - - addq $440,%rsp - emms - - - pop %r15 - pop %r14 - pop %r13 - pop %r12 - - pop %rbx - pop %rbp - ret - - - - - - - -.globl nb_kernel400nf_x86_64_sse2 -.globl _nb_kernel400nf_x86_64_sse2 -nb_kernel400nf_x86_64_sse2: -_nb_kernel400nf_x86_64_sse2: -.set nb400nf_fshift, 16 -.set nb400nf_gid, 24 -.set nb400nf_pos, 32 -.set nb400nf_faction, 40 -.set nb400nf_charge, 48 -.set nb400nf_p_facel, 56 -.set nb400nf_argkrf, 64 -.set nb400nf_argcrf, 72 -.set nb400nf_Vc, 80 -.set nb400nf_type, 88 -.set nb400nf_p_ntype, 96 -.set nb400nf_vdwparam, 104 -.set nb400nf_Vvdw, 112 -.set nb400nf_p_tabscale, 120 -.set nb400nf_VFtab, 128 -.set nb400nf_invsqrta, 136 -.set nb400nf_dvda, 144 -.set nb400nf_p_gbtabscale, 152 -.set nb400nf_GBtab, 160 -.set nb400nf_p_nthreads, 168 -.set nb400nf_count, 176 -.set nb400nf_mtx, 184 -.set nb400nf_outeriter, 192 -.set nb400nf_inneriter, 200 -.set nb400nf_work, 208 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse2 use -.set nb400nf_ix, 0 -.set nb400nf_iy, 16 -.set nb400nf_iz, 32 -.set nb400nf_iq, 48 -.set nb400nf_gbtsc, 64 -.set nb400nf_qq, 80 -.set nb400nf_vctot, 96 -.set nb400nf_half, 112 -.set nb400nf_three, 128 -.set nb400nf_isai, 144 -.set nb400nf_isaprod, 160 -.set nb400nf_gbscale, 176 -.set nb400nf_nri, 192 -.set nb400nf_iinr, 200 -.set nb400nf_jindex, 208 -.set nb400nf_jjnr, 216 -.set nb400nf_shift, 224 -.set nb400nf_shiftvec, 232 -.set nb400nf_facel, 240 -.set nb400nf_innerjjnr, 248 -.set nb400nf_is3, 256 -.set nb400nf_ii3, 260 -.set nb400nf_innerk, 264 -.set nb400nf_n, 268 -.set nb400nf_nn1, 272 -.set nb400nf_nouter, 276 -.set nb400nf_ninner, 280 - push %rbp - movq %rsp,%rbp - push %rbx - - - emms - - push %r12 - push %r13 - push %r14 - push %r15 - - subq $296,%rsp ## local variable stack space (n*16+8) - - ## zero 32-bit iteration counters - movl $0,%eax - movl %eax,nb400nf_nouter(%rsp) - movl %eax,nb400nf_ninner(%rsp) - - movl (%rdi),%edi - movl %edi,nb400nf_nri(%rsp) - movq %rsi,nb400nf_iinr(%rsp) - movq %rdx,nb400nf_jindex(%rsp) - movq %rcx,nb400nf_jjnr(%rsp) - movq %r8,nb400nf_shift(%rsp) - movq %r9,nb400nf_shiftvec(%rsp) - movq nb400nf_p_facel(%rbp),%rsi - movsd (%rsi),%xmm0 - movsd %xmm0,nb400nf_facel(%rsp) - - movq nb400nf_p_gbtabscale(%rbp),%rbx - movsd (%rbx),%xmm4 - shufpd $0,%xmm4,%xmm4 - movapd %xmm4,nb400nf_gbtsc(%rsp) - - ## create constant floating-point factors on stack - movl $0x00000000,%eax ## lower half of double half IEEE (hex) - movl $0x3fe00000,%ebx - movl %eax,nb400nf_half(%rsp) - movl %ebx,nb400nf_half+4(%rsp) - movsd nb400nf_half(%rsp),%xmm1 - shufpd $0,%xmm1,%xmm1 ## splat to all elements - movapd %xmm1,%xmm3 - addpd %xmm3,%xmm3 ## one - movapd %xmm3,%xmm2 - addpd %xmm2,%xmm2 ## two - addpd %xmm2,%xmm3 ## three - movapd %xmm1,nb400nf_half(%rsp) - movapd %xmm3,nb400nf_three(%rsp) - -_nb_kernel400nf_x86_64_sse2.nb400nf_threadloop: - movq nb400nf_count(%rbp),%rsi ## pointer to sync counter - movl (%rsi),%eax -_nb_kernel400nf_x86_64_sse2.nb400nf_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%rsi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel400nf_x86_64_sse2.nb400nf_spinlock - - ## if(nn1>nri) nn1=nri - movl nb400nf_nri(%rsp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb400nf_n(%rsp) - movl %ebx,nb400nf_nn1(%rsp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel400nf_x86_64_sse2.nb400nf_outerstart - jmp _nb_kernel400nf_x86_64_sse2.nb400nf_end - -_nb_kernel400nf_x86_64_sse2.nb400nf_outerstart: - ## ebx contains number of outer iterations - addl nb400nf_nouter(%rsp),%ebx - movl %ebx,nb400nf_nouter(%rsp) - -_nb_kernel400nf_x86_64_sse2.nb400nf_outer: - movq nb400nf_shift(%rsp),%rax ## rax = pointer into shift[] - movl (%rax,%rsi,4),%ebx ## rbx=shift[n] - - lea (%rbx,%rbx,2),%rbx ## rbx=3*is - movl %ebx,nb400nf_is3(%rsp) ## store is3 - - movq nb400nf_shiftvec(%rsp),%rax ## rax = base of shiftvec[] - - movsd (%rax,%rbx,8),%xmm0 - movsd 8(%rax,%rbx,8),%xmm1 - movsd 16(%rax,%rbx,8),%xmm2 - - movq nb400nf_iinr(%rsp),%rcx ## rcx = pointer into iinr[] - movl (%rcx,%rsi,4),%ebx ## ebx =ii - - movq nb400nf_charge(%rbp),%rdx - movsd (%rdx,%rbx,8),%xmm3 - mulsd nb400nf_facel(%rsp),%xmm3 - shufpd $0,%xmm3,%xmm3 - - movq nb400nf_invsqrta(%rbp),%rdx ## load invsqrta[ii] - movsd (%rdx,%rbx,8),%xmm4 - shufpd $0,%xmm4,%xmm4 - - lea (%rbx,%rbx,2),%rbx ## rbx = 3*ii=ii3 - movq nb400nf_pos(%rbp),%rax ## rax = base of pos[] - - addsd (%rax,%rbx,8),%xmm0 - addsd 8(%rax,%rbx,8),%xmm1 - addsd 16(%rax,%rbx,8),%xmm2 - - movapd %xmm3,nb400nf_iq(%rsp) - movapd %xmm4,nb400nf_isai(%rsp) - - shufpd $0,%xmm0,%xmm0 - shufpd $0,%xmm1,%xmm1 - shufpd $0,%xmm2,%xmm2 - - movapd %xmm0,nb400nf_ix(%rsp) - movapd %xmm1,nb400nf_iy(%rsp) - movapd %xmm2,nb400nf_iz(%rsp) - - movl %ebx,nb400nf_ii3(%rsp) - - ## clear vctot - xorpd %xmm4,%xmm4 - movapd %xmm4,nb400nf_vctot(%rsp) - - movq nb400nf_jindex(%rsp),%rax - movl (%rax,%rsi,4),%ecx ## jindex[n] - movl 4(%rax,%rsi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movq nb400nf_pos(%rbp),%rsi - movq nb400nf_faction(%rbp),%rdi - movq nb400nf_jjnr(%rsp),%rax - shll $2,%ecx - addq %rcx,%rax - movq %rax,nb400nf_innerjjnr(%rsp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $2,%edx - addl nb400nf_ninner(%rsp),%ecx - movl %ecx,nb400nf_ninner(%rsp) - addl $0,%edx - movl %edx,nb400nf_innerk(%rsp) ## number of innerloop atoms - jge _nb_kernel400nf_x86_64_sse2.nb400nf_unroll_loop - jmp _nb_kernel400nf_x86_64_sse2.nb400nf_checksingle -_nb_kernel400nf_x86_64_sse2.nb400nf_unroll_loop: - ## twice unrolled innerloop here - movq nb400nf_innerjjnr(%rsp),%rdx ## pointer to jjnr[k] - movl (%rdx),%eax - movl 4(%rdx),%ebx - addq $8,nb400nf_innerjjnr(%rsp) ## advance pointer (unrolled 2) - - ## load isa2 - movq nb400nf_invsqrta(%rbp),%rsi - movlpd (%rsi,%rax,8),%xmm2 - movhpd (%rsi,%rbx,8),%xmm2 - mulpd nb400nf_isai(%rsp),%xmm2 - movapd %xmm2,nb400nf_isaprod(%rsp) - movapd %xmm2,%xmm1 - mulpd nb400nf_gbtsc(%rsp),%xmm1 - movapd %xmm1,nb400nf_gbscale(%rsp) - - movq nb400nf_charge(%rbp),%rsi ## base of charge[] - movlpd (%rsi,%rax,8),%xmm3 - movhpd (%rsi,%rbx,8),%xmm3 - - mulpd nb400nf_iq(%rsp),%xmm2 - mulpd %xmm2,%xmm3 - movapd %xmm3,nb400nf_qq(%rsp) - - movq nb400nf_pos(%rbp),%rsi ## base of pos[] - - lea (%rax,%rax,2),%rax ## replace jnr with j3 - lea (%rbx,%rbx,2),%rbx - - ## move two coordinates to xmm0-xmm2 - movlpd (%rsi,%rax,8),%xmm0 - movlpd 8(%rsi,%rax,8),%xmm1 - movlpd 16(%rsi,%rax,8),%xmm2 - movhpd (%rsi,%rbx,8),%xmm0 - movhpd 8(%rsi,%rbx,8),%xmm1 - movhpd 16(%rsi,%rbx,8),%xmm2 - - movq nb400nf_faction(%rbp),%rdi - - ## move nb400nf_ix-iz to xmm4-xmm6 - movapd nb400nf_ix(%rsp),%xmm4 - movapd nb400nf_iy(%rsp),%xmm5 - movapd nb400nf_iz(%rsp),%xmm6 - - ## calc dr - subpd %xmm0,%xmm4 - subpd %xmm1,%xmm5 - subpd %xmm2,%xmm6 - - ## square it - mulpd %xmm4,%xmm4 - mulpd %xmm5,%xmm5 - mulpd %xmm6,%xmm6 - addpd %xmm5,%xmm4 - addpd %xmm6,%xmm4 - ## rsq in xmm4 - - cvtpd2ps %xmm4,%xmm5 - rsqrtps %xmm5,%xmm5 - cvtps2pd %xmm5,%xmm2 ## lu in low xmm2 - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulpd %xmm2,%xmm2 ## lu*lu - movapd nb400nf_three(%rsp),%xmm1 - mulpd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb400nf_half(%rsp),%xmm0 - subpd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm1 - mulpd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - movapd %xmm1,%xmm5 ## copy of lu - mulpd %xmm1,%xmm1 ## lu*lu - movapd nb400nf_three(%rsp),%xmm2 - mulpd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb400nf_half(%rsp),%xmm0 - subpd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm2 - mulpd %xmm2,%xmm0 ## xmm0=iter2 of rinv (new lu) - mulpd %xmm0,%xmm4 ## xmm4=r - mulpd nb400nf_gbscale(%rsp),%xmm4 - - cvttpd2pi %xmm4,%mm6 ## mm6 = lu idx - cvtpi2pd %mm6,%xmm5 - subpd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - movapd %xmm1,%xmm2 - mulpd %xmm2,%xmm2 ## xmm2=eps2 - - pslld $2,%mm6 ## idx *= 4 - - movd %eax,%mm0 - movd %ebx,%mm1 - - movq nb400nf_GBtab(%rbp),%rsi - movd %mm6,%eax - psrlq $32,%mm6 - movd %mm6,%ebx ## indices in eax/ebx - - movapd (%rsi,%rax,8),%xmm4 ## Y1 F1 - movapd (%rsi,%rbx,8),%xmm3 ## Y2 F2 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 Y2 - unpckhpd %xmm3,%xmm5 ## F1 F2 - - movapd 16(%rsi,%rax,8),%xmm6 ## G1 H1 - movapd 16(%rsi,%rbx,8),%xmm3 ## G2 H2 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 G2 - unpckhpd %xmm3,%xmm7 ## H1 H2 - ## coulomb table ready, in xmm4-xmm7 - mulpd %xmm1,%xmm6 ## xmm6=Geps - mulpd %xmm2,%xmm7 ## xmm7=Heps2 - addpd %xmm6,%xmm5 - addpd %xmm7,%xmm5 ## xmm5=Fp - movapd nb400nf_qq(%rsp),%xmm3 - mulpd %xmm1,%xmm5 ## xmm5=eps*Fp - addpd %xmm4,%xmm5 ## xmm5=VV - mulpd %xmm3,%xmm5 ## vcoul=qq*VV - addpd nb400nf_vctot(%rsp),%xmm5 - movapd %xmm5,nb400nf_vctot(%rsp) - - ## should we do one more iteration? - subl $2,nb400nf_innerk(%rsp) - jl _nb_kernel400nf_x86_64_sse2.nb400nf_checksingle - jmp _nb_kernel400nf_x86_64_sse2.nb400nf_unroll_loop -_nb_kernel400nf_x86_64_sse2.nb400nf_checksingle: - movl nb400nf_innerk(%rsp),%edx - andl $1,%edx - jnz _nb_kernel400nf_x86_64_sse2.nb400nf_dosingle - jmp _nb_kernel400nf_x86_64_sse2.nb400nf_updateouterdata -_nb_kernel400nf_x86_64_sse2.nb400nf_dosingle: - movq nb400nf_charge(%rbp),%rsi - movq nb400nf_invsqrta(%rbp),%rdx - movq nb400nf_pos(%rbp),%rdi - movq nb400nf_innerjjnr(%rsp),%rcx - movl (%rcx),%eax - xorpd %xmm6,%xmm6 - movapd %xmm6,%xmm7 - movsd (%rdx,%rax,8),%xmm7 - movlpd (%rsi,%rax,8),%xmm6 ## xmm6(0) has the charge - mulsd nb400nf_isai(%rsp),%xmm7 - movapd %xmm7,nb400nf_isaprod(%rsp) - movapd %xmm7,%xmm1 - mulpd nb400nf_gbtsc(%rsp),%xmm1 - movapd %xmm1,nb400nf_gbscale(%rsp) - - mulsd nb400nf_iq(%rsp),%xmm7 - mulsd %xmm7,%xmm6 - movapd %xmm6,nb400nf_qq(%rsp) - - lea (%rax,%rax,2),%rax - - ## move coordinates to xmm0-xmm2 - movlpd (%rdi,%rax,8),%xmm0 - movlpd 8(%rdi,%rax,8),%xmm1 - movlpd 16(%rdi,%rax,8),%xmm2 - - ## move nb400nf_ix-iz to xmm4-xmm6 - movapd nb400nf_ix(%rsp),%xmm4 - movapd nb400nf_iy(%rsp),%xmm5 - movapd nb400nf_iz(%rsp),%xmm6 - - ## calc dr - subsd %xmm0,%xmm4 - subsd %xmm1,%xmm5 - subsd %xmm2,%xmm6 - - ## square it - mulsd %xmm4,%xmm4 - mulsd %xmm5,%xmm5 - mulsd %xmm6,%xmm6 - addsd %xmm5,%xmm4 - addsd %xmm6,%xmm4 - ## rsq in xmm4 - - cvtsd2ss %xmm4,%xmm5 - rsqrtss %xmm5,%xmm5 - cvtss2sd %xmm5,%xmm2 ## lu in low xmm2 - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulsd %xmm2,%xmm2 ## lu*lu - movapd nb400nf_three(%rsp),%xmm1 - mulsd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb400nf_half(%rsp),%xmm0 - subsd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm1 - mulsd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - movapd %xmm1,%xmm5 ## copy of lu - mulsd %xmm1,%xmm1 ## lu*lu - movapd nb400nf_three(%rsp),%xmm2 - mulsd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb400nf_half(%rsp),%xmm0 - subsd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm2 - mulsd %xmm2,%xmm0 ## xmm0=iter2 of rinv (new lu) - - mulsd %xmm0,%xmm4 ## xmm4=r - mulsd nb400nf_gbscale(%rsp),%xmm4 - - movd %eax,%mm0 - - cvttsd2si %xmm4,%eax ## mm6 = lu idx - cvtsi2sd %eax,%xmm5 - subsd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - movapd %xmm1,%xmm2 - mulsd %xmm2,%xmm2 ## xmm2=eps2 - - shll $2,%eax ## idx *= 4 - - movq nb400nf_GBtab(%rbp),%rsi - - ## Coulomb - movapd (%rsi,%rax,8),%xmm4 ## Y1 F1 - xorpd %xmm3,%xmm3 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 - unpckhpd %xmm3,%xmm5 ## F1 - - movapd 16(%rsi,%rax,8),%xmm6 ## G1 H1 - xorpd %xmm3,%xmm3 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 - unpckhpd %xmm3,%xmm7 ## H1 - ## table ready in xmm4-xmm7 - - mulsd %xmm1,%xmm6 ## xmm6=Geps - mulsd %xmm2,%xmm7 ## xmm7=Heps2 - addsd %xmm6,%xmm5 - addsd %xmm7,%xmm5 ## xmm5=Fp - movapd nb400nf_qq(%rsp),%xmm3 - mulsd %xmm1,%xmm5 ## xmm5=eps*Fp - addsd %xmm4,%xmm5 ## xmm5=VV - mulsd %xmm3,%xmm5 ## vcoul=qq*VV - addsd nb400nf_vctot(%rsp),%xmm5 - movsd %xmm5,nb400nf_vctot(%rsp) - -_nb_kernel400nf_x86_64_sse2.nb400nf_updateouterdata: - ## get n from stack - movl nb400nf_n(%rsp),%esi - ## get group index for i particle - movq nb400nf_gid(%rbp),%rdx ## base of gid[] - movl (%rdx,%rsi,4),%edx ## ggid=gid[n] - - ## accumulate total potential energy and update it - movapd nb400nf_vctot(%rsp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addsd %xmm6,%xmm7 ## low xmm7 has the sum now - - ## add earlier value from mem - movq nb400nf_Vc(%rbp),%rax - addsd (%rax,%rdx,8),%xmm7 - ## move back to mem - movsd %xmm7,(%rax,%rdx,8) - - ## finish if last - movl nb400nf_nn1(%rsp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel400nf_x86_64_sse2.nb400nf_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb400nf_n(%rsp) - jmp _nb_kernel400nf_x86_64_sse2.nb400nf_outer -_nb_kernel400nf_x86_64_sse2.nb400nf_outerend: - ## check if more outer neighborlists remain - movl nb400nf_nri(%rsp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel400nf_x86_64_sse2.nb400nf_end - ## non-zero, do one more workunit - jmp _nb_kernel400nf_x86_64_sse2.nb400nf_threadloop -_nb_kernel400nf_x86_64_sse2.nb400nf_end: - - movl nb400nf_nouter(%rsp),%eax - movl nb400nf_ninner(%rsp),%ebx - movq nb400nf_outeriter(%rbp),%rcx - movq nb400nf_inneriter(%rbp),%rdx - movl %eax,(%rcx) - movl %ebx,(%rdx) - - addq $296,%rsp - emms - - - pop %r15 - pop %r14 - pop %r13 - pop %r12 - - pop %rbx - pop %rbp - ret - - - - - diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel410_x86_64_sse2.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel410_x86_64_sse2.intel_syntax.s deleted file mode 100644 index 72340a98f9..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel410_x86_64_sse2.intel_syntax.s +++ /dev/null @@ -1,1488 +0,0 @@ -;# -;# -;# Gromacs 4.0 Copyright (c) 1991-2003 -;# David van der Spoel, Erik Lindahl -;# -;# This program is free software; you can redistribute it and/or -;# modify it under the terms of the GNU General Public License -;# as published by the Free Software Foundation; either version 2 -;# of the License, or (at your option) any later version. -;# -;# To help us fund GROMACS development, we humbly ask that you cite -;# the research papers on the package. Check out http://www.gromacs.org -;# -;# And Hey: -;# Gnomes, ROck Monsters And Chili Sauce -;# - -;# These files require GNU binutils 2.10 or later, since we -;# use intel syntax for portability, or a recent version -;# of NASM that understands Extended 3DNow and SSE2 instructions. -;# (NASM is normally only used with MS Visual C++). -;# Since NASM and gnu as disagree on some definitions and use -;# completely different preprocessing options I have to introduce a -;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86. -;# Gnu as treats ';' as a line break, i.e. ignores it. This is the -;# reason why all comments need both symbols... -;# The source is written for GNU as, with intel syntax. When you use -;# NASM we redefine a couple of things. The false if-statement around -;# the following code is seen by GNU as, but NASM doesn't see it, so -;# the code inside is read by NASM but not gcc. - -; .if 0 # block below only read by NASM -%define .section section -%define .long dd -%define .align align -%define .globl global -;# NASM only wants 'dword', not 'dword ptr'. -%define ptr -%macro .equiv 2 - %1 equ %2 -%endmacro -; .endif # End of NASM-specific block -; .intel_syntax noprefix # Line only read by gnu as - - - - - -.globl nb_kernel410_x86_64_sse2 -.globl _nb_kernel410_x86_64_sse2 -nb_kernel410_x86_64_sse2: -_nb_kernel410_x86_64_sse2: -;# Room for return address and rbp (16 bytes) -.equiv nb410_fshift, 16 -.equiv nb410_gid, 24 -.equiv nb410_pos, 32 -.equiv nb410_faction, 40 -.equiv nb410_charge, 48 -.equiv nb410_p_facel, 56 -.equiv nb410_argkrf, 64 -.equiv nb410_argcrf, 72 -.equiv nb410_Vc, 80 -.equiv nb410_type, 88 -.equiv nb410_p_ntype, 96 -.equiv nb410_vdwparam, 104 -.equiv nb410_Vvdw, 112 -.equiv nb410_p_tabscale, 120 -.equiv nb410_VFtab, 128 -.equiv nb410_invsqrta, 136 -.equiv nb410_dvda, 144 -.equiv nb410_p_gbtabscale, 152 -.equiv nb410_GBtab, 160 -.equiv nb410_p_nthreads, 168 -.equiv nb410_count, 176 -.equiv nb410_mtx, 184 -.equiv nb410_outeriter, 192 -.equiv nb410_inneriter, 200 -.equiv nb410_work, 208 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse2 use -.equiv nb410_ix, 0 -.equiv nb410_iy, 16 -.equiv nb410_iz, 32 -.equiv nb410_iq, 48 -.equiv nb410_dx, 64 -.equiv nb410_dy, 80 -.equiv nb410_dz, 96 -.equiv nb410_two, 112 -.equiv nb410_six, 128 -.equiv nb410_twelve, 144 -.equiv nb410_gbtsc, 160 -.equiv nb410_qq, 176 -.equiv nb410_c6, 192 -.equiv nb410_c12, 208 -.equiv nb410_fscal, 224 -.equiv nb410_vctot, 240 -.equiv nb410_Vvdwtot, 256 -.equiv nb410_fix, 272 -.equiv nb410_fiy, 288 -.equiv nb410_fiz, 304 -.equiv nb410_half, 320 -.equiv nb410_three, 336 -.equiv nb410_r, 352 -.equiv nb410_isai, 368 -.equiv nb410_isaprod, 384 -.equiv nb410_dvdasum, 400 -.equiv nb410_gbscale, 416 -.equiv nb410_nri, 432 -.equiv nb410_iinr, 440 -.equiv nb410_jindex, 448 -.equiv nb410_jjnr, 456 -.equiv nb410_shift, 464 -.equiv nb410_shiftvec, 472 -.equiv nb410_facel, 480 -.equiv nb410_innerjjnr, 488 -.equiv nb410_ii, 496 -.equiv nb410_is3, 500 -.equiv nb410_ii3, 504 -.equiv nb410_ntia, 508 -.equiv nb410_innerk, 512 -.equiv nb410_n, 516 -.equiv nb410_nn1, 520 -.equiv nb410_ntype, 524 -.equiv nb410_nouter, 528 -.equiv nb410_ninner, 532 - push rbp - mov rbp, rsp - push rbx - - - emms - - push r12 - push r13 - push r14 - push r15 - - sub rsp, 552 ;# local variable stack space (n*16+8) - - ;# zero 32-bit iteration counters - mov eax, 0 - mov [rsp + nb410_nouter], eax - mov [rsp + nb410_ninner], eax - - mov edi, [rdi] - mov [rsp + nb410_nri], edi - mov [rsp + nb410_iinr], rsi - mov [rsp + nb410_jindex], rdx - mov [rsp + nb410_jjnr], rcx - mov [rsp + nb410_shift], r8 - mov [rsp + nb410_shiftvec], r9 - mov rdi, [rbp + nb410_p_ntype] - mov edi, [rdi] - mov [rsp + nb410_ntype], edi - mov rsi, [rbp + nb410_p_facel] - movsd xmm0, [rsi] - movsd [rsp + nb410_facel], xmm0 - - mov rbx, [rbp + nb410_p_gbtabscale] - movsd xmm4, [rbx] - shufpd xmm4, xmm4, 0 - movapd [rsp + nb410_gbtsc], xmm4 - - ;# create constant floating-point factors on stack - mov eax, 0x00000000 ;# lower half of double half IEEE (hex) - mov ebx, 0x3fe00000 - mov [rsp + nb410_half], eax - mov [rsp + nb410_half+4], ebx - movsd xmm1, [rsp + nb410_half] - shufpd xmm1, xmm1, 0 ;# splat to all elements - movapd xmm3, xmm1 - addpd xmm3, xmm3 ;# one - movapd xmm2, xmm3 - addpd xmm2, xmm2 ;# two - addpd xmm3, xmm2 ;# three - movapd xmm4, xmm3 - addpd xmm4, xmm4 ;# six - movapd xmm5, xmm4 - addpd xmm5, xmm5 ;# twelve - movapd [rsp + nb410_half], xmm1 - movapd [rsp + nb410_two], xmm2 - movapd [rsp + nb410_three], xmm3 - movapd [rsp + nb410_six], xmm4 - movapd [rsp + nb410_twelve], xmm5 - -.nb410_threadloop: - mov rsi, [rbp + nb410_count] ;# pointer to sync counter - mov eax, [rsi] -.nb410_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb410_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [rsp + nb410_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [rsp + nb410_n], eax - mov [rsp + nb410_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb410_outerstart - jmp .nb410_end - -.nb410_outerstart: - ;# ebx contains number of outer iterations - add ebx, [rsp + nb410_nouter] - mov [rsp + nb410_nouter], ebx - -.nb410_outer: - mov rax, [rsp + nb410_shift] ;# rax = pointer into shift[] - mov ebx, [rax+rsi*4] ;# rbx=shift[n] - - lea rbx, [rbx + rbx*2] ;# rbx=3*is - mov [rsp + nb410_is3],ebx ;# store is3 - - mov rax, [rsp + nb410_shiftvec] ;# rax = base of shiftvec[] - - movsd xmm0, [rax + rbx*8] - movsd xmm1, [rax + rbx*8 + 8] - movsd xmm2, [rax + rbx*8 + 16] - - mov rcx, [rsp + nb410_iinr] ;# rcx = pointer into iinr[] - mov ebx, [rcx+rsi*4] ;# ebx =ii - mov [rsp + nb410_ii], ebx - - mov rdx, [rbp + nb410_charge] - movsd xmm3, [rdx + rbx*8] - mulsd xmm3, [rsp + nb410_facel] - shufpd xmm3, xmm3, 0 - - mov rdx, [rbp + nb410_invsqrta] ;# load invsqrta[ii] - movsd xmm4, [rdx + rbx*8] - shufpd xmm4, xmm4, 0 - - mov rdx, [rbp + nb410_type] - mov edx, [rdx + rbx*4] - imul edx, [rsp + nb410_ntype] - shl edx, 1 - mov [rsp + nb410_ntia], edx - - lea rbx, [rbx + rbx*2] ;# rbx = 3*ii=ii3 - mov rax, [rbp + nb410_pos] ;# rax = base of pos[] - - addsd xmm0, [rax + rbx*8] - addsd xmm1, [rax + rbx*8 + 8] - addsd xmm2, [rax + rbx*8 + 16] - - movapd [rsp + nb410_iq], xmm3 - movapd [rsp + nb410_isai], xmm4 - - shufpd xmm0, xmm0, 0 - shufpd xmm1, xmm1, 0 - shufpd xmm2, xmm2, 0 - - movapd [rsp + nb410_ix], xmm0 - movapd [rsp + nb410_iy], xmm1 - movapd [rsp + nb410_iz], xmm2 - - mov [rsp + nb410_ii3], ebx - - ;# clear vctot and i forces - xorpd xmm13, xmm13 - movapd xmm12, xmm13 - movapd [rsp + nb410_Vvdwtot], xmm13 - movapd [rsp + nb410_dvdasum], xmm13 - movapd xmm14, xmm13 - movapd xmm15, xmm13 - - mov rax, [rsp + nb410_jindex] - mov ecx, [rax + rsi*4] ;# jindex[n] - mov edx, [rax + rsi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov rsi, [rbp + nb410_pos] - mov rdi, [rbp + nb410_faction] - mov rax, [rsp + nb410_jjnr] - shl ecx, 2 - add rax, rcx - mov [rsp + nb410_innerjjnr], rax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 2 - add ecx, [rsp + nb410_ninner] - mov [rsp + nb410_ninner], ecx - add edx, 0 - mov [rsp + nb410_innerk], edx ;# number of innerloop atoms - jge .nb410_unroll_loop - jmp .nb410_checksingle -.nb410_unroll_loop: - ;# twice unrolled innerloop here - mov rdx, [rsp + nb410_innerjjnr] ;# pointer to jjnr[k] - mov r14d, [rdx] - mov r15d, [rdx + 4] - add qword ptr [rsp + nb410_innerjjnr], 8 ;# advance pointer (unrolled 2) - - mov rsi, [rbp + nb410_pos] ;# base of pos[] - - lea r10, [r14 + r14*2] ;# replace jnr with j3 - lea r11, [r15 + r15*2] - - ;# move two coordinates to xmm4-xmm6 - movlpd xmm4, [rsi + r10*8] - movlpd xmm5, [rsi + r10*8 + 8] - movlpd xmm6, [rsi + r10*8 + 16] - movhpd xmm4, [rsi + r11*8] - movhpd xmm5, [rsi + r11*8 + 8] - movhpd xmm6, [rsi + r11*8 + 16] - - ;# calc dr - subpd xmm4, [rsp + nb410_ix] - subpd xmm5, [rsp + nb410_iy] - subpd xmm6, [rsp + nb410_iz] - - ;# store dr - movapd [rsp + nb410_dx], xmm4 - movapd [rsp + nb410_dy], xmm5 - movapd [rsp + nb410_dz], xmm6 - - ;# load isaj - mov rsi, [rbp + nb410_invsqrta] - - ;# square it - mulpd xmm4,xmm4 - mulpd xmm5,xmm5 - mulpd xmm6,xmm6 - addpd xmm4, xmm5 - addpd xmm4, xmm6 - ;# rsq in xmm4 - - movlpd xmm3, [rsi + r14*8] - movhpd xmm3, [rsi + r15*8] - - mov rdi, [rbp + nb410_type] - mov r8d, [rdi + r14*4] - mov r9d, [rdi + r15*4] - - cvtpd2ps xmm5, xmm4 - rsqrtps xmm5, xmm5 - cvtps2pd xmm2, xmm5 ;# lu in low xmm2 - - mulpd xmm3, [rsp + nb410_isai] - movapd [rsp + nb410_isaprod], xmm3 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulpd xmm2, xmm2 ;# lu*lu - movapd xmm1, [rsp + nb410_three] - mulpd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb410_half] - subpd xmm1, xmm2 ;# 30-rsq*lu*lu - mulpd xmm1, xmm5 - mulpd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - movapd xmm6, xmm3 - mulpd xmm6, [rsp + nb410_gbtsc] - movapd [rsp + nb410_gbscale], xmm6 - - movapd xmm5, xmm1 ;# copy of lu - mulpd xmm1, xmm1 ;# lu*lu - movapd xmm2, [rsp + nb410_three] - mulpd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb410_half] - subpd xmm2, xmm1 ;# 30-rsq*lu*lu - mulpd xmm2, xmm5 - mulpd xmm0, xmm2 ;# xmm0=rinv - - mulpd xmm3, [rsp + nb410_iq] - mov rsi, [rbp + nb410_charge] ;# base of charge[] - movlpd xmm6, [rsi + r14*8] - movhpd xmm6, [rsi + r15*8] - mulpd xmm6, xmm3 - movapd [rsp + nb410_qq], xmm6 - - mulpd xmm4, xmm0 ;# xmm4=r - movapd [rsp + nb410_r], xmm4 - mulpd xmm4, [rsp + nb410_gbscale] - mov edi, [rsp + nb410_ntia] - - cvttpd2pi mm6, xmm4 ;# mm6 = lu idx - shl r8d, 1 - shl r9d, 1 - add r8d, edi - add r9d, edi - - cvtpi2pd xmm5, mm6 - subpd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - movapd xmm2, xmm1 - mulpd xmm2, xmm2 ;# xmm2=eps2 - mov rdi, [rbp + nb410_vdwparam] - - pslld mm6, 2 ;# idx *= 4 - - mov rsi, [rbp + nb410_GBtab] - movd r12d, mm6 - psrlq mm6, 32 - movd r13d, mm6 ;# indices in r12/r13 - - movlpd xmm6, [rdi + r8*8] - movlpd xmm7, [rdi + r8*8 + 8] - - movapd xmm9, xmm0 ;# rinv - mulpd xmm9, xmm9 ;# rinvsq - movapd xmm10, xmm9 ;# rinvsq - mulpd xmm10, xmm10 ;# rinv4 - mulpd xmm10, xmm9 ;# rinv6 - movapd xmm11, xmm10 - mulpd xmm11, xmm11 ;# rinv12 - - - movhpd xmm6, [rdi + r9*8] - movhpd xmm7, [rdi + r9*8 + 8] - - ;# load table data - movapd xmm4, [rsi + r12*8] ;# Y1 F1 - movapd xmm3, [rsi + r13*8] ;# Y2 F2 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 Y2 - unpckhpd xmm5, xmm3 ;# F1 F2 - - mulpd xmm10, xmm6 ;# vvdw6=c6*rinv6 - mulpd xmm11, xmm7 ;# vvdw12=c12*rinv12 - - movapd xmm9, xmm11 - subpd xmm11, xmm10 ;# Vvdw=Vvdw12-Vvdw6 - - ;# add potential to vvdwtot - addpd xmm11, [rsp + nb410_Vvdwtot] - movapd [rsp + nb410_Vvdwtot], xmm11 - - movapd xmm6, [rsi + r12*8 + 16] ;# G1 H1 - movapd xmm3, [rsi + r13*8 + 16] ;# G2 H2 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 G2 - unpckhpd xmm7, xmm3 ;# H1 H2 - ;# coulomb table ready, in xmm4-xmm7 - - mulpd xmm7, xmm1 ;# xmm7=Heps - mulpd xmm6, xmm1 ;# xmm6=Geps - mulpd xmm7, xmm1 ;# xmm7=Heps2 - addpd xmm5, xmm6 - addpd xmm5, xmm7 ;# xmm5=Fp - mulpd xmm7, [rsp + nb410_two] ;# two*Heps2 - movapd xmm3, [rsp + nb410_qq] - addpd xmm7, xmm6 - addpd xmm7, xmm5 ;# xmm7=FF - mulpd xmm5, xmm1 ;# xmm5=eps*Fp - addpd xmm5, xmm4 ;# xmm5=VV - mulpd xmm5, xmm3 ;# vcoul=qq*VV - mulpd xmm3, xmm7 ;# fijC=FF*qq - - ;# LJ forces - mulpd xmm10, [rsp + nb410_six] - mulpd xmm9, [rsp + nb410_twelve] - subpd xmm9, xmm10 - mulpd xmm9, xmm0 ;# (12*vnb12-6*vnb6)*rinv - - mov rsi, [rbp + nb410_dvda] - - ;# Calculate dVda - xorpd xmm7, xmm7 - mulpd xmm3, [rsp + nb410_gbscale] - movapd xmm6, xmm3 - mulpd xmm6, [rsp + nb410_r] - addpd xmm6, xmm5 - - ;# update vctot - addpd xmm12, xmm5 - - ;# xmm6=(vcoul+fijC*r) - subpd xmm7, xmm6 - movapd xmm6, xmm7 - - mov rdi, [rbp + nb410_faction] - ;# the fj's - start by accumulating forces from memory - movlpd xmm2, [rdi + r10*8] - movlpd xmm4, [rdi + r10*8 + 8] - movlpd xmm5, [rdi + r10*8 + 16] - - ;# update dvdasum - addpd xmm7, [rsp + nb410_dvdasum] - movapd [rsp + nb410_dvdasum], xmm7 - - ;# update j atoms dvdaj - movhlps xmm7, xmm6 - addsd xmm6, [rsi + r14*8] - addsd xmm7, [rsi + r15*8] - movsd [rsi + r14*8], xmm6 - movsd [rsi + r15*8], xmm7 - - movhpd xmm2, [rdi + r11*8] - movhpd xmm4, [rdi + r11*8 + 8] - movhpd xmm5, [rdi + r11*8 + 16] - - subpd xmm9, xmm3 - mulpd xmm9, xmm0 ;# fscal - - movapd xmm10, xmm9 - movapd xmm11, xmm9 - - mulpd xmm9, [rsp + nb410_dx] - mulpd xmm10, [rsp + nb410_dy] - mulpd xmm11, [rsp + nb410_dz] - - addpd xmm2, xmm9 - addpd xmm4, xmm10 - addpd xmm5, xmm11 - - movlpd [rdi + r10*8], xmm2 - movlpd [rdi + r10*8 + 8], xmm4 - movlpd [rdi + r10*8 + 16], xmm5 - - ;# accumulate i forces - addpd xmm13, xmm9 - addpd xmm14, xmm10 - addpd xmm15, xmm11 - - movhpd [rdi + r11*8], xmm2 - movhpd [rdi + r11*8 + 8], xmm4 - movhpd [rdi + r11*8 + 16], xmm5 - - ;# should we do one more iteration? - sub dword ptr [rsp + nb410_innerk], 2 - jl .nb410_checksingle - jmp .nb410_unroll_loop -.nb410_checksingle: - mov edx, [rsp + nb410_innerk] - and edx, 1 - jnz .nb410_dosingle - jmp .nb410_updateouterdata -.nb410_dosingle: - mov rsi, [rbp + nb410_charge] - mov rdx, [rbp + nb410_invsqrta] - mov rdi, [rbp + nb410_pos] - mov rcx, [rsp + nb410_innerjjnr] - mov eax, [rcx] - - ;# load isaj - mov rsi, [rbp + nb410_invsqrta] - movsd xmm2, [rsi + rax*8] - mulsd xmm2, [rsp + nb410_isai] - movapd [rsp + nb410_isaprod], xmm2 - movapd xmm1, xmm2 - mulsd xmm1, [rsp + nb410_gbtsc] - movapd [rsp + nb410_gbscale], xmm1 - - mulsd xmm2, [rsp + nb410_iq] - mov rsi, [rbp + nb410_charge] ;# base of charge[] - movsd xmm3, [rsi + rax*8] - mulsd xmm3, xmm2 - movapd [rsp + nb410_qq], xmm3 - - mov rsi, [rbp + nb410_type] - mov r8d, [rsi + rax*4] - mov rsi, [rbp + nb410_vdwparam] - shl r8d, 1 - mov edi, [rsp + nb410_ntia] - add r8d, edi - - movsd xmm4, [rsi + r8*8] - movsd xmm6, [rsi + r8*8 + 8] - movapd [rsp + nb410_c6], xmm4 - movapd [rsp + nb410_c12], xmm6 - - mov rsi, [rbp + nb410_pos] ;# base of pos[] - - lea r10, [rax + rax*2] ;# replace jnr with j3 - - ;# move two coordinates to xmm4-xmm6 - movsd xmm4, [rsi + r10*8] - movsd xmm5, [rsi + r10*8 + 8] - movsd xmm6, [rsi + r10*8 + 16] - - ;# calc dr - subsd xmm4, [rsp + nb410_ix] - subsd xmm5, [rsp + nb410_iy] - subsd xmm6, [rsp + nb410_iz] - - ;# store dr - movapd [rsp + nb410_dx], xmm4 - movapd [rsp + nb410_dy], xmm5 - movapd [rsp + nb410_dz], xmm6 - - ;# square it - mulsd xmm4,xmm4 - mulsd xmm5,xmm5 - mulsd xmm6,xmm6 - addsd xmm4, xmm5 - addsd xmm4, xmm6 - ;# rsq in xmm4 - - cvtsd2ss xmm5, xmm4 - rsqrtss xmm5, xmm5 - cvtss2sd xmm2, xmm5 ;# lu in low xmm2 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulsd xmm2, xmm2 ;# lu*lu - movapd xmm1, [rsp + nb410_three] - mulsd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb410_half] - subsd xmm1, xmm2 ;# 30-rsq*lu*lu - mulsd xmm1, xmm5 - mulsd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - movapd xmm5, xmm1 ;# copy of lu - mulsd xmm1, xmm1 ;# lu*lu - movapd xmm2, [rsp + nb410_three] - mulsd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb410_half] - subsd xmm2, xmm1 ;# 30-rsq*lu*lu - mulsd xmm2, xmm5 - mulsd xmm0, xmm2 ;# xmm0=rinv - - mulsd xmm4, xmm0 ;# xmm4=r - movapd [rsp + nb410_r], xmm4 - mulsd xmm4, [rsp + nb410_gbscale] - - cvttsd2si r12d, xmm4 ;# mm6 = lu idx - cvtsi2sd xmm5, r12d - subsd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - movapd xmm2, xmm1 - mulsd xmm2, xmm2 ;# xmm2=eps2 - - shl r12d, 2 ;# idx *= 4 - - mov rsi, [rbp + nb410_GBtab] - - movapd xmm9, xmm0 ;# rinv - mulsd xmm9, xmm9 ;# rinvsq - movapd xmm10, xmm9 ;# rinvsq - mulsd xmm10, xmm10 ;# rinv4 - mulsd xmm10, xmm9 ;# rinv6 - movapd xmm11, xmm10 - mulsd xmm11, xmm11 ;# rinv12 - - ;# load table data - movapd xmm4, [rsi + r12*8] ;# Y1 F1 - movhlps xmm5, xmm4 - - mulsd xmm10, [rsp + nb410_c6] ;# vvdw6=c6*rinv6 - mulsd xmm11, [rsp + nb410_c12] ;# vvdw12=c12*rinv12 - - movapd xmm9, xmm11 - subsd xmm11, xmm10 ;# Vvdw=Vvdw12-Vvdw6 - - ;# add potential to vvdwtot - addsd xmm11, [rsp + nb410_Vvdwtot] - movsd [rsp + nb410_Vvdwtot], xmm11 - - movapd xmm6, [rsi + r12*8 + 16] ;# G1 H1 - movhlps xmm7, xmm6 - ;# coulomb table ready, in xmm4-xmm7 - - mulsd xmm7, xmm1 ;# xmm7=Heps - mulsd xmm6, xmm1 ;# xmm6=Geps - mulsd xmm7, xmm1 ;# xmm7=Heps2 - addsd xmm5, xmm6 - addsd xmm5, xmm7 ;# xmm5=Fp - mulsd xmm7, [rsp + nb410_two] ;# two*Heps2 - movapd xmm3, [rsp + nb410_qq] - addsd xmm7, xmm6 - addsd xmm7, xmm5 ;# xmm7=FF - mulsd xmm5, xmm1 ;# xmm5=eps*Fp - addsd xmm5, xmm4 ;# xmm5=VV - mulsd xmm5, xmm3 ;# vcoul=qq*VV - mulsd xmm3, xmm7 ;# fijC=FF*qq - - ;# LJ forces - mulsd xmm10, [rsp + nb410_six] - mulsd xmm9, [rsp + nb410_twelve] - subsd xmm9, xmm10 - mulsd xmm9, xmm0 ;# (12*vnb12-6*vnb6)*rinv - - mov rsi, [rbp + nb410_dvda] - - ;# Calculate dVda - xorpd xmm7, xmm7 - mulsd xmm3, [rsp + nb410_gbscale] - movapd xmm6, xmm3 - mulsd xmm6, [rsp + nb410_r] - addsd xmm6, xmm5 - - ;# update vctot - addsd xmm12, xmm5 - - ;# xmm6=(vcoul+fijC*r) - subsd xmm7, xmm6 - movapd xmm6, xmm7 - - ;# update dvdasum - addsd xmm7, [rsp + nb410_dvdasum] - movsd [rsp + nb410_dvdasum], xmm7 - - ;# update j atoms dvdaj - movhlps xmm7, xmm6 - addsd xmm6, [rsi + rax*8] - addsd xmm7, [rsi + rbx*8] - movsd [rsi + rax*8], xmm6 - movsd [rsi + rbx*8], xmm7 - - subsd xmm9, xmm3 - mulsd xmm9, xmm0 ;# fscal - - movapd xmm10, xmm9 - movapd xmm11, xmm9 - - mulsd xmm9, [rsp + nb410_dx] - mulsd xmm10, [rsp + nb410_dy] - mulsd xmm11, [rsp + nb410_dz] - - ;# accumulate i forces - addsd xmm13, xmm9 - addsd xmm14, xmm10 - addsd xmm15, xmm11 - - mov rdi, [rbp + nb410_faction] - ;# the fj's - start by accumulating forces from memory - addsd xmm9, [rdi + r10*8] - addsd xmm10, [rdi + r10*8 + 8] - addsd xmm11, [rdi + r10*8 + 16] - movsd [rdi + r10*8], xmm9 - movsd [rdi + r10*8 + 8], xmm10 - movsd [rdi + r10*8 + 16], xmm11 - -.nb410_updateouterdata: - mov ecx, [rsp + nb410_ii3] - mov rdi, [rbp + nb410_faction] - mov rsi, [rbp + nb410_fshift] - mov edx, [rsp + nb410_is3] - - ;# accumulate i forces in xmm13, xmm14, xmm15 - movhlps xmm3, xmm13 - movhlps xmm4, xmm14 - movhlps xmm5, xmm15 - addsd xmm13, xmm3 - addsd xmm14, xmm4 - addsd xmm15, xmm5 ;# sum is in low xmm13-xmm15 - - ;# increment i force - movsd xmm3, [rdi + rcx*8] - movsd xmm4, [rdi + rcx*8 + 8] - movsd xmm5, [rdi + rcx*8 + 16] - subsd xmm3, xmm13 - subsd xmm4, xmm14 - subsd xmm5, xmm15 - movsd [rdi + rcx*8], xmm3 - movsd [rdi + rcx*8 + 8], xmm4 - movsd [rdi + rcx*8 + 16], xmm5 - - ;# increment fshift force - movsd xmm3, [rsi + rdx*8] - movsd xmm4, [rsi + rdx*8 + 8] - movsd xmm5, [rsi + rdx*8 + 16] - subsd xmm3, xmm13 - subsd xmm4, xmm14 - subsd xmm5, xmm15 - movsd [rsi + rdx*8], xmm3 - movsd [rsi + rdx*8 + 8], xmm4 - movsd [rsi + rdx*8 + 16], xmm5 - - ;# get n from stack - mov esi, [rsp + nb410_n] - ;# get group index for i particle - mov rdx, [rbp + nb410_gid] ;# base of gid[] - mov edx, [rdx + rsi*4] ;# ggid=gid[n] - - ;# accumulate total potential energy and update it - movhlps xmm6, xmm12 - addsd xmm12, xmm6 ;# low xmm12 has the sum now - - ;# add earlier value from mem - mov rax, [rbp + nb410_Vc] - addsd xmm12, [rax + rdx*8] - ;# move back to mem - movsd [rax + rdx*8], xmm12 - - ;# accumulate total lj energy and update it - movapd xmm7, [rsp + nb410_Vvdwtot] - ;# accumulate - movhlps xmm6, xmm7 - addsd xmm7, xmm6 ;# low xmm7 has the sum now - - ;# add earlier value from mem - mov rax, [rbp + nb410_Vvdw] - addsd xmm7, [rax + rdx*8] - ;# move back to mem - movsd [rax + rdx*8], xmm7 - - ;# accumulate dVda and update it - movapd xmm7, [rsp + nb410_dvdasum] - ;# accumulate - movhlps xmm6, xmm7 - addsd xmm7, xmm6 ;# low xmm7 has the sum now - - mov edx, [rsp + nb410_ii] - mov rax, [rbp + nb410_dvda] - addsd xmm7, [rax + rdx*8] - movsd [rax + rdx*8], xmm7 - - ;# finish if last - mov ecx, [rsp + nb410_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb410_outerend - - ;# not last, iterate outer loop once more! - mov [rsp + nb410_n], esi - jmp .nb410_outer -.nb410_outerend: - ;# check if more outer neighborlists remain - mov ecx, [rsp + nb410_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb410_end - ;# non-zero, do one more workunit - jmp .nb410_threadloop -.nb410_end: - mov eax, [rsp + nb410_nouter] - mov ebx, [rsp + nb410_ninner] - mov rcx, [rbp + nb410_outeriter] - mov rdx, [rbp + nb410_inneriter] - mov [rcx], eax - mov [rdx], ebx - - add rsp, 552 - emms - - - pop r15 - pop r14 - pop r13 - pop r12 - - pop rbx - pop rbp - ret - - - - - - - - - -.globl nb_kernel410nf_x86_64_sse2 -.globl _nb_kernel410nf_x86_64_sse2 -nb_kernel410nf_x86_64_sse2: -_nb_kernel410nf_x86_64_sse2: -;# Room for return address and rbp (16 bytes) -.equiv nb410nf_fshift, 16 -.equiv nb410nf_gid, 24 -.equiv nb410nf_pos, 32 -.equiv nb410nf_faction, 40 -.equiv nb410nf_charge, 48 -.equiv nb410nf_p_facel, 56 -.equiv nb410nf_argkrf, 64 -.equiv nb410nf_argcrf, 72 -.equiv nb410nf_Vc, 80 -.equiv nb410nf_type, 88 -.equiv nb410nf_p_ntype, 96 -.equiv nb410nf_vdwparam, 104 -.equiv nb410nf_Vvdw, 112 -.equiv nb410nf_p_tabscale, 120 -.equiv nb410nf_VFtab, 128 -.equiv nb410nf_invsqrta, 136 -.equiv nb410nf_dvda, 144 -.equiv nb410nf_p_gbtabscale, 152 -.equiv nb410nf_GBtab, 160 -.equiv nb410nf_p_nthreads, 168 -.equiv nb410nf_count, 176 -.equiv nb410nf_mtx, 184 -.equiv nb410nf_outeriter, 192 -.equiv nb410nf_inneriter, 200 -.equiv nb410nf_work, 208 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse2 use -.equiv nb410nf_ix, 0 -.equiv nb410nf_iy, 16 -.equiv nb410nf_iz, 32 -.equiv nb410nf_iq, 48 -.equiv nb410nf_two, 64 -.equiv nb410nf_gbtsc, 80 -.equiv nb410nf_qq, 96 -.equiv nb410nf_c6, 112 -.equiv nb410nf_c12, 128 -.equiv nb410nf_vctot, 144 -.equiv nb410nf_Vvdwtot, 160 -.equiv nb410nf_half, 176 -.equiv nb410nf_three, 192 -.equiv nb410nf_r, 208 -.equiv nb410nf_isai, 224 -.equiv nb410nf_isaprod, 240 -.equiv nb410nf_gbscale, 256 -.equiv nb410nf_nri, 272 -.equiv nb410nf_iinr, 280 -.equiv nb410nf_jindex, 288 -.equiv nb410nf_jjnr, 296 -.equiv nb410nf_shift, 304 -.equiv nb410nf_shiftvec, 312 -.equiv nb410nf_facel, 320 -.equiv nb410nf_innerjjnr, 328 -.equiv nb410nf_ii, 336 -.equiv nb410nf_is3, 340 -.equiv nb410nf_ii3, 344 -.equiv nb410nf_ntia, 348 -.equiv nb410nf_innerk, 352 -.equiv nb410nf_n, 356 -.equiv nb410nf_nn1, 360 -.equiv nb410nf_ntype, 364 -.equiv nb410nf_nouter, 368 -.equiv nb410nf_ninner, 372 - push rbp - mov rbp, rsp - push rbx - - - emms - - push r12 - push r13 - push r14 - push r15 - - sub rsp, 392 ;# local variable stack space (n*16+8) - - ;# zero 32-bit iteration counters - mov eax, 0 - mov [rsp + nb410nf_nouter], eax - mov [rsp + nb410nf_ninner], eax - - mov edi, [rdi] - mov [rsp + nb410nf_nri], edi - mov [rsp + nb410nf_iinr], rsi - mov [rsp + nb410nf_jindex], rdx - mov [rsp + nb410nf_jjnr], rcx - mov [rsp + nb410nf_shift], r8 - mov [rsp + nb410nf_shiftvec], r9 - mov rdi, [rbp + nb410nf_p_ntype] - mov edi, [rdi] - mov [rsp + nb410nf_ntype], edi - mov rsi, [rbp + nb410nf_p_facel] - movsd xmm0, [rsi] - movsd [rsp + nb410nf_facel], xmm0 - - mov rbx, [rbp + nb410nf_p_gbtabscale] - movsd xmm4, [rbx] - shufpd xmm4, xmm4, 0 - movapd [rsp + nb410nf_gbtsc], xmm4 - - ;# create constant floating-point factors on stack - mov eax, 0x00000000 ;# lower half of double half IEEE (hex) - mov ebx, 0x3fe00000 - mov [rsp + nb410nf_half], eax - mov [rsp + nb410nf_half+4], ebx - movsd xmm1, [rsp + nb410nf_half] - shufpd xmm1, xmm1, 0 ;# splat to all elements - movapd xmm3, xmm1 - addpd xmm3, xmm3 ;# one - movapd xmm2, xmm3 - addpd xmm2, xmm2 ;# two - addpd xmm3, xmm2 ;# three - movapd [rsp + nb410nf_half], xmm1 - movapd [rsp + nb410nf_two], xmm2 - movapd [rsp + nb410nf_three], xmm3 - -.nb410nf_threadloop: - mov rsi, [rbp + nb410nf_count] ;# pointer to sync counter - mov eax, [rsi] -.nb410nf_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb410nf_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [rsp + nb410nf_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [rsp + nb410nf_n], eax - mov [rsp + nb410nf_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb410nf_outerstart - jmp .nb410nf_end - -.nb410nf_outerstart: - ;# ebx contains number of outer iterations - add ebx, [rsp + nb410nf_nouter] - mov [rsp + nb410nf_nouter], ebx - -.nb410nf_outer: - mov rax, [rsp + nb410nf_shift] ;# rax = pointer into shift[] - mov ebx, [rax+rsi*4] ;# rbx=shift[n] - - lea rbx, [rbx + rbx*2] ;# rbx=3*is - mov [rsp + nb410nf_is3],ebx ;# store is3 - - mov rax, [rsp + nb410nf_shiftvec] ;# rax = base of shiftvec[] - - movsd xmm0, [rax + rbx*8] - movsd xmm1, [rax + rbx*8 + 8] - movsd xmm2, [rax + rbx*8 + 16] - - mov rcx, [rsp + nb410nf_iinr] ;# rcx = pointer into iinr[] - mov ebx, [rcx+rsi*4] ;# ebx =ii - mov [rsp + nb410nf_ii], ebx - - mov rdx, [rbp + nb410nf_charge] - movsd xmm3, [rdx + rbx*8] - mulsd xmm3, [rsp + nb410nf_facel] - shufpd xmm3, xmm3, 0 - - mov rdx, [rbp + nb410nf_invsqrta] ;# load invsqrta[ii] - movsd xmm4, [rdx + rbx*8] - shufpd xmm4, xmm4, 0 - - mov rdx, [rbp + nb410nf_type] - mov edx, [rdx + rbx*4] - imul edx, [rsp + nb410nf_ntype] - shl edx, 1 - mov [rsp + nb410nf_ntia], edx - - lea rbx, [rbx + rbx*2] ;# rbx = 3*ii=ii3 - mov rax, [rbp + nb410nf_pos] ;# rax = base of pos[] - - addsd xmm0, [rax + rbx*8] - addsd xmm1, [rax + rbx*8 + 8] - addsd xmm2, [rax + rbx*8 + 16] - - movapd [rsp + nb410nf_iq], xmm3 - movapd [rsp + nb410nf_isai], xmm4 - - shufpd xmm0, xmm0, 0 - shufpd xmm1, xmm1, 0 - shufpd xmm2, xmm2, 0 - - movapd [rsp + nb410nf_ix], xmm0 - movapd [rsp + nb410nf_iy], xmm1 - movapd [rsp + nb410nf_iz], xmm2 - - mov [rsp + nb410nf_ii3], ebx - - ;# clear vctot and Vvdwtot - xorpd xmm4, xmm4 - movapd [rsp + nb410nf_vctot], xmm4 - movapd [rsp + nb410nf_Vvdwtot], xmm4 - - mov rax, [rsp + nb410nf_jindex] - mov ecx, [rax + rsi*4] ;# jindex[n] - mov edx, [rax + rsi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov rsi, [rbp + nb410nf_pos] - mov rdi, [rbp + nb410nf_faction] - mov rax, [rsp + nb410nf_jjnr] - shl ecx, 2 - add rax, rcx - mov [rsp + nb410nf_innerjjnr], rax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 2 - add ecx, [rsp + nb410nf_ninner] - mov [rsp + nb410nf_ninner], ecx - add edx, 0 - mov [rsp + nb410nf_innerk], edx ;# number of innerloop atoms - jge .nb410nf_unroll_loop - jmp .nb410nf_checksingle -.nb410nf_unroll_loop: - ;# twice unrolled innerloop here - mov rdx, [rsp + nb410nf_innerjjnr] ;# pointer to jjnr[k] - mov eax, [rdx] - mov ebx, [rdx + 4] - add qword ptr [rsp + nb410nf_innerjjnr], 8 ;# advance pointer (unrolled 2) - - ;# load isaj - mov rsi, [rbp + nb410nf_invsqrta] - movlpd xmm2, [rsi + rax*8] - movhpd xmm2, [rsi + rbx*8] - mulpd xmm2, [rsp + nb410nf_isai] - movapd [rsp + nb410nf_isaprod], xmm2 - movapd xmm1, xmm2 - mulpd xmm1, [rsp + nb410nf_gbtsc] - movapd [rsp + nb410nf_gbscale], xmm1 - - mov rsi, [rbp + nb410nf_charge] ;# base of charge[] - movlpd xmm3, [rsi + rax*8] - movhpd xmm3, [rsi + rbx*8] - - mulpd xmm2, [rsp + nb410nf_iq] - mulpd xmm3, xmm2 - movapd [rsp + nb410nf_qq], xmm3 - - movd mm0, eax ;# use mmx registers as temp storage - movd mm1, ebx - - mov rsi, [rbp + nb410nf_type] - mov eax, [rsi + rax*4] - mov ebx, [rsi + rbx*4] - mov rsi, [rbp + nb410nf_vdwparam] - shl eax, 1 - shl ebx, 1 - mov edi, [rsp + nb410nf_ntia] - add eax, edi - add ebx, edi - - movlpd xmm6, [rsi + rax*8] ;# c6a - movlpd xmm7, [rsi + rbx*8] ;# c6b - movhpd xmm6, [rsi + rax*8 + 8] ;# c6a c12a - movhpd xmm7, [rsi + rbx*8 + 8] ;# c6b c12b - - movapd xmm4, xmm6 - unpcklpd xmm4, xmm7 - unpckhpd xmm6, xmm7 - - movd eax, mm0 - movd ebx, mm1 - movapd [rsp + nb410nf_c6], xmm4 - movapd [rsp + nb410nf_c12], xmm6 - - mov rsi, [rbp + nb410nf_pos] ;# base of pos[] - - movd mm2, eax - movd mm3, ebx - lea rax, [rax + rax*2] ;# replace jnr with j3 - lea rbx, [rbx + rbx*2] - - ;# move two coordinates to xmm0-xmm2 - movlpd xmm0, [rsi + rax*8] - movlpd xmm1, [rsi + rax*8 + 8] - movlpd xmm2, [rsi + rax*8 + 16] - movhpd xmm0, [rsi + rbx*8] - movhpd xmm1, [rsi + rbx*8 + 8] - movhpd xmm2, [rsi + rbx*8 + 16] - - ;# move ix-iz to xmm4-xmm6 - movapd xmm4, [rsp + nb410nf_ix] - movapd xmm5, [rsp + nb410nf_iy] - movapd xmm6, [rsp + nb410nf_iz] - - ;# calc dr - subpd xmm4, xmm0 - subpd xmm5, xmm1 - subpd xmm6, xmm2 - - ;# square dr - mulpd xmm4,xmm4 - mulpd xmm5,xmm5 - mulpd xmm6,xmm6 - addpd xmm4, xmm5 - addpd xmm4, xmm6 - ;# rsq in xmm4 - - cvtpd2ps xmm5, xmm4 - rsqrtps xmm5, xmm5 - cvtps2pd xmm2, xmm5 ;# lu in low xmm2 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulpd xmm2, xmm2 ;# lu*lu - movapd xmm1, [rsp + nb410nf_three] - mulpd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb410nf_half] - subpd xmm1, xmm2 ;# 30-rsq*lu*lu - mulpd xmm1, xmm5 - mulpd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - movapd xmm5, xmm1 ;# copy of lu - mulpd xmm1, xmm1 ;# lu*lu - movapd xmm2, [rsp + nb410nf_three] - mulpd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb410nf_half] - subpd xmm2, xmm1 ;# 30-rsq*lu*lu - mulpd xmm2, xmm5 - mulpd xmm0, xmm2 ;# xmm0=rinv - - mulpd xmm4, xmm0 ;# xmm4=r - movapd [rsp + nb410nf_r], xmm4 - mulpd xmm4, [rsp + nb410nf_gbscale] - - cvttpd2pi mm6, xmm4 ;# mm6 = lu idx - cvtpi2pd xmm5, mm6 - subpd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - movapd xmm2, xmm1 - mulpd xmm2, xmm2 ;# xmm2=eps2 - - pslld mm6, 2 ;# idx *= 4 - - movd mm0, eax - movd mm1, ebx - - mov rsi, [rbp + nb410nf_GBtab] - movd eax, mm6 - psrlq mm6, 32 - movd ebx, mm6 ;# indices in eax/ebx - - movapd xmm4, [rsi + rax*8] ;# Y1 F1 - movapd xmm3, [rsi + rbx*8] ;# Y2 F2 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 Y2 - unpckhpd xmm5, xmm3 ;# F1 F2 - - movapd xmm6, [rsi + rax*8 + 16] ;# G1 H1 - movapd xmm3, [rsi + rbx*8 + 16] ;# G2 H2 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 G2 - unpckhpd xmm7, xmm3 ;# H1 H2 - ;# coulomb table ready, in xmm4-xmm7 - mulpd xmm6, xmm1 ;# xmm6=Geps - mulpd xmm7, xmm2 ;# xmm7=Heps2 - addpd xmm5, xmm6 - addpd xmm5, xmm7 ;# xmm5=Fp - movapd xmm3, [rsp + nb410nf_qq] - mulpd xmm5, xmm1 ;# xmm5=eps*Fp - addpd xmm5, xmm4 ;# xmm5=VV - mulpd xmm5, xmm3 ;# vcoul=qq*VV - - addpd xmm5, [rsp + nb410nf_vctot] - movapd [rsp + nb410nf_vctot], xmm5 - - ;# L-J - movapd xmm4, xmm0 - mulpd xmm4, xmm0 ;# xmm4=rinvsq - - movapd xmm6, xmm4 - mulpd xmm6, xmm4 - - mulpd xmm6, xmm4 ;# xmm6=rinvsix - movapd xmm4, xmm6 - mulpd xmm4, xmm4 ;# xmm4=rinvtwelve - mulpd xmm6, [rsp + nb410nf_c6] - mulpd xmm4, [rsp + nb410nf_c12] - movapd xmm7, [rsp + nb410nf_Vvdwtot] - addpd xmm7, xmm4 - subpd xmm7, xmm6 - movapd [rsp + nb410nf_Vvdwtot], xmm7 - - ;# should we do one more iteration? - sub dword ptr [rsp + nb410nf_innerk], 2 - jl .nb410nf_checksingle - jmp .nb410nf_unroll_loop -.nb410nf_checksingle: - mov edx, [rsp + nb410nf_innerk] - and edx, 1 - jnz .nb410nf_dosingle - jmp .nb410nf_updateouterdata -.nb410nf_dosingle: - mov rsi, [rbp + nb410nf_charge] - mov rdx, [rbp + nb410nf_invsqrta] - mov rdi, [rbp + nb410nf_pos] - mov rcx, [rsp + nb410nf_innerjjnr] - mov eax, [rcx] - - xorpd xmm6, xmm6 - movapd xmm7, xmm6 - movsd xmm7, [rdx + rax*8] - movlpd xmm6, [rsi + rax*8] ;# xmm6(0) has the charge - mulsd xmm7, [rsp + nb410nf_isai] - movapd [rsp + nb410nf_isaprod], xmm7 - movapd xmm1, xmm7 - mulpd xmm1, [rsp + nb410nf_gbtsc] - movapd [rsp + nb410nf_gbscale], xmm1 - - mulsd xmm7, [rsp + nb410nf_iq] - mulsd xmm6, xmm7 - movapd [rsp + nb410nf_qq], xmm6 - - movd mm0, eax ;# use mmx registers as temp storage - mov rsi, [rbp + nb410nf_type] - mov eax, [rsi + rax*4] - mov rsi, [rbp + nb410nf_vdwparam] - shl eax, 1 - mov edi, [rsp + nb410nf_ntia] - add eax, edi - - movlpd xmm6, [rsi + rax*8] ;# c6a - movhpd xmm6, [rsi + rax*8 + 8] ;# c6a c12a - - xorpd xmm7, xmm7 - movapd xmm4, xmm6 - unpcklpd xmm4, xmm7 - unpckhpd xmm6, xmm7 - - movd eax, mm0 - movapd [rsp + nb410nf_c6], xmm4 - movapd [rsp + nb410nf_c12], xmm6 - - mov rsi, [rbp + nb410nf_pos] ;# base of pos[] - - movd mm2, eax - lea rax, [rax + rax*2] ;# replace jnr with j3 - - ;# move coordinates to xmm0-xmm2 - movlpd xmm0, [rsi + rax*8] - movlpd xmm1, [rsi + rax*8 + 8] - movlpd xmm2, [rsi + rax*8 + 16] - - ;# move ix-iz to xmm4-xmm6 - movapd xmm4, [rsp + nb410nf_ix] - movapd xmm5, [rsp + nb410nf_iy] - movapd xmm6, [rsp + nb410nf_iz] - - ;# calc dr - subsd xmm4, xmm0 - subsd xmm5, xmm1 - subsd xmm6, xmm2 - - ;# square it - mulsd xmm4,xmm4 - mulsd xmm5,xmm5 - mulsd xmm6,xmm6 - addsd xmm4, xmm5 - addsd xmm4, xmm6 - ;# rsq in xmm4 - - cvtsd2ss xmm5, xmm4 - rsqrtss xmm5, xmm5 - cvtss2sd xmm2, xmm5 ;# lu in low xmm2 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulsd xmm2, xmm2 ;# lu*lu - movapd xmm1, [rsp + nb410nf_three] - mulsd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb410nf_half] - subsd xmm1, xmm2 ;# 30-rsq*lu*lu - mulsd xmm1, xmm5 - mulsd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - movapd xmm5, xmm1 ;# copy of lu - mulsd xmm1, xmm1 ;# lu*lu - movapd xmm2, [rsp + nb410nf_three] - mulsd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb410nf_half] - subsd xmm2, xmm1 ;# 30-rsq*lu*lu - mulsd xmm2, xmm5 - mulsd xmm0, xmm2 ;# xmm0=rinv - - mulsd xmm4, xmm0 ;# xmm4=r - movapd [rsp + nb410nf_r], xmm4 - mulsd xmm4, [rsp + nb410nf_gbscale] - - movd mm0, eax - cvttsd2si eax, xmm4 ;# mm6 = lu idx - cvtsi2sd xmm5, eax - subsd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - movapd xmm2, xmm1 - mulsd xmm2, xmm2 ;# xmm2=eps2 - - shl eax, 2 ;# idx *= 4 - - mov rsi, [rbp + nb410nf_GBtab] - - movapd xmm4, [rsi + rax*8] ;# Y1 F1 - xorpd xmm3, xmm3 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 - unpckhpd xmm5, xmm3 ;# F1 - - movapd xmm6, [rsi + rax*8 + 16] ;# G1 H1 - xorpd xmm3, xmm3 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 - unpckhpd xmm7, xmm3 ;# H1 - ;# coulomb table ready, in xmm4-xmm7 - mulsd xmm6, xmm1 ;# xmm6=Geps - mulsd xmm7, xmm2 ;# xmm7=Heps2 - addsd xmm5, xmm6 - addsd xmm5, xmm7 ;# xmm5=Fp - movapd xmm3, [rsp + nb410nf_qq] - mulsd xmm5, xmm1 ;# xmm5=eps*Fp - addsd xmm5, xmm4 ;# xmm5=VV - mulsd xmm5, xmm3 ;# vcoul=qq*VV - - addsd xmm5, [rsp + nb410nf_vctot] - movsd [rsp + nb410nf_vctot], xmm5 - - ;# L-J - movapd xmm4, xmm0 - mulsd xmm4, xmm0 ;# xmm4=rinvsq - - - movapd xmm6, xmm4 - mulsd xmm6, xmm4 - - mulsd xmm6, xmm4 ;# xmm6=rinvsix - movapd xmm4, xmm6 - mulsd xmm4, xmm4 ;# xmm4=rinvtwelve - mulsd xmm6, [rsp + nb410nf_c6] - mulsd xmm4, [rsp + nb410nf_c12] - movapd xmm7, [rsp + nb410nf_Vvdwtot] - addsd xmm7, xmm4 - subsd xmm7, xmm6 - movlpd [rsp + nb410nf_Vvdwtot], xmm7 - -.nb410nf_updateouterdata: - mov ecx, [rsp + nb410nf_ii3] - mov edx, [rsp + nb410nf_is3] - - ;# get n from stack - mov esi, [rsp + nb410nf_n] - ;# get group index for i particle - mov rdx, [rbp + nb410nf_gid] ;# base of gid[] - mov edx, [rdx + rsi*4] ;# ggid=gid[n] - - ;# accumulate total potential energy and update it - movapd xmm7, [rsp + nb410nf_vctot] - ;# accumulate - movhlps xmm6, xmm7 - addsd xmm7, xmm6 ;# low xmm7 has the sum now - - ;# add earlier value from mem - mov rax, [rbp + nb410nf_Vc] - addsd xmm7, [rax + rdx*8] - ;# move back to mem - movsd [rax + rdx*8], xmm7 - - ;# accumulate total lj energy and update it - movapd xmm7, [rsp + nb410nf_Vvdwtot] - ;# accumulate - movhlps xmm6, xmm7 - addsd xmm7, xmm6 ;# low xmm7 has the sum now - - ;# add earlier value from mem - mov rax, [rbp + nb410nf_Vvdw] - addsd xmm7, [rax + rdx*8] - ;# move back to mem - movsd [rax + rdx*8], xmm7 - - ;# finish if last - mov ecx, [rsp + nb410nf_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb410nf_outerend - - ;# not last, iterate outer loop once more! - mov [rsp + nb410nf_n], esi - jmp .nb410nf_outer -.nb410nf_outerend: - ;# check if more outer neighborlists remain - mov ecx, [rsp + nb410nf_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb410nf_end - ;# non-zero, do one more workunit - jmp .nb410nf_threadloop -.nb410nf_end: - mov eax, [rsp + nb410nf_nouter] - mov ebx, [rsp + nb410nf_ninner] - mov rcx, [rbp + nb410nf_outeriter] - mov rdx, [rbp + nb410nf_inneriter] - mov [rcx], eax - mov [rdx], ebx - - add rsp, 392 - emms - - - pop r15 - pop r14 - pop r13 - pop r12 - - pop rbx - pop rbp - ret - - diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel410_x86_64_sse2.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel410_x86_64_sse2.s deleted file mode 100644 index 21eb0180b4..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel410_x86_64_sse2.s +++ /dev/null @@ -1,1464 +0,0 @@ -## -## -## Gromacs 4.0 Copyright (c) 1991-2003 -## David van der Spoel, Erik Lindahl -## -## This program is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License -## as published by the Free Software Foundation; either version 2 -## of the License, or (at your option) any later version. -## -## To help us fund GROMACS development, we humbly ask that you cite -## the research papers on the package. Check out http://www.gromacs.org -## -## And Hey: -## Gnomes, ROck Monsters And Chili Sauce -## - - - - - - - -.globl nb_kernel410_x86_64_sse2 -.globl _nb_kernel410_x86_64_sse2 -nb_kernel410_x86_64_sse2: -_nb_kernel410_x86_64_sse2: -## Room for return address and rbp (16 bytes) -.set nb410_fshift, 16 -.set nb410_gid, 24 -.set nb410_pos, 32 -.set nb410_faction, 40 -.set nb410_charge, 48 -.set nb410_p_facel, 56 -.set nb410_argkrf, 64 -.set nb410_argcrf, 72 -.set nb410_Vc, 80 -.set nb410_type, 88 -.set nb410_p_ntype, 96 -.set nb410_vdwparam, 104 -.set nb410_Vvdw, 112 -.set nb410_p_tabscale, 120 -.set nb410_VFtab, 128 -.set nb410_invsqrta, 136 -.set nb410_dvda, 144 -.set nb410_p_gbtabscale, 152 -.set nb410_GBtab, 160 -.set nb410_p_nthreads, 168 -.set nb410_count, 176 -.set nb410_mtx, 184 -.set nb410_outeriter, 192 -.set nb410_inneriter, 200 -.set nb410_work, 208 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse2 use -.set nb410_ix, 0 -.set nb410_iy, 16 -.set nb410_iz, 32 -.set nb410_iq, 48 -.set nb410_dx, 64 -.set nb410_dy, 80 -.set nb410_dz, 96 -.set nb410_two, 112 -.set nb410_six, 128 -.set nb410_twelve, 144 -.set nb410_gbtsc, 160 -.set nb410_qq, 176 -.set nb410_c6, 192 -.set nb410_c12, 208 -.set nb410_fscal, 224 -.set nb410_vctot, 240 -.set nb410_Vvdwtot, 256 -.set nb410_fix, 272 -.set nb410_fiy, 288 -.set nb410_fiz, 304 -.set nb410_half, 320 -.set nb410_three, 336 -.set nb410_r, 352 -.set nb410_isai, 368 -.set nb410_isaprod, 384 -.set nb410_dvdasum, 400 -.set nb410_gbscale, 416 -.set nb410_nri, 432 -.set nb410_iinr, 440 -.set nb410_jindex, 448 -.set nb410_jjnr, 456 -.set nb410_shift, 464 -.set nb410_shiftvec, 472 -.set nb410_facel, 480 -.set nb410_innerjjnr, 488 -.set nb410_ii, 496 -.set nb410_is3, 500 -.set nb410_ii3, 504 -.set nb410_ntia, 508 -.set nb410_innerk, 512 -.set nb410_n, 516 -.set nb410_nn1, 520 -.set nb410_ntype, 524 -.set nb410_nouter, 528 -.set nb410_ninner, 532 - push %rbp - movq %rsp,%rbp - push %rbx - - - emms - - push %r12 - push %r13 - push %r14 - push %r15 - - subq $552,%rsp ## local variable stack space (n*16+8) - - ## zero 32-bit iteration counters - movl $0,%eax - movl %eax,nb410_nouter(%rsp) - movl %eax,nb410_ninner(%rsp) - - movl (%rdi),%edi - movl %edi,nb410_nri(%rsp) - movq %rsi,nb410_iinr(%rsp) - movq %rdx,nb410_jindex(%rsp) - movq %rcx,nb410_jjnr(%rsp) - movq %r8,nb410_shift(%rsp) - movq %r9,nb410_shiftvec(%rsp) - movq nb410_p_ntype(%rbp),%rdi - movl (%rdi),%edi - movl %edi,nb410_ntype(%rsp) - movq nb410_p_facel(%rbp),%rsi - movsd (%rsi),%xmm0 - movsd %xmm0,nb410_facel(%rsp) - - movq nb410_p_gbtabscale(%rbp),%rbx - movsd (%rbx),%xmm4 - shufpd $0,%xmm4,%xmm4 - movapd %xmm4,nb410_gbtsc(%rsp) - - ## create constant floating-point factors on stack - movl $0x00000000,%eax ## lower half of double half IEEE (hex) - movl $0x3fe00000,%ebx - movl %eax,nb410_half(%rsp) - movl %ebx,nb410_half+4(%rsp) - movsd nb410_half(%rsp),%xmm1 - shufpd $0,%xmm1,%xmm1 ## splat to all elements - movapd %xmm1,%xmm3 - addpd %xmm3,%xmm3 ## one - movapd %xmm3,%xmm2 - addpd %xmm2,%xmm2 ## two - addpd %xmm2,%xmm3 ## three - movapd %xmm3,%xmm4 - addpd %xmm4,%xmm4 ## six - movapd %xmm4,%xmm5 - addpd %xmm5,%xmm5 ## twelve - movapd %xmm1,nb410_half(%rsp) - movapd %xmm2,nb410_two(%rsp) - movapd %xmm3,nb410_three(%rsp) - movapd %xmm4,nb410_six(%rsp) - movapd %xmm5,nb410_twelve(%rsp) - -_nb_kernel410_x86_64_sse2.nb410_threadloop: - movq nb410_count(%rbp),%rsi ## pointer to sync counter - movl (%rsi),%eax -_nb_kernel410_x86_64_sse2.nb410_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%rsi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel410_x86_64_sse2.nb410_spinlock - - ## if(nn1>nri) nn1=nri - movl nb410_nri(%rsp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb410_n(%rsp) - movl %ebx,nb410_nn1(%rsp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel410_x86_64_sse2.nb410_outerstart - jmp _nb_kernel410_x86_64_sse2.nb410_end - -_nb_kernel410_x86_64_sse2.nb410_outerstart: - ## ebx contains number of outer iterations - addl nb410_nouter(%rsp),%ebx - movl %ebx,nb410_nouter(%rsp) - -_nb_kernel410_x86_64_sse2.nb410_outer: - movq nb410_shift(%rsp),%rax ## rax = pointer into shift[] - movl (%rax,%rsi,4),%ebx ## rbx=shift[n] - - lea (%rbx,%rbx,2),%rbx ## rbx=3*is - movl %ebx,nb410_is3(%rsp) ## store is3 - - movq nb410_shiftvec(%rsp),%rax ## rax = base of shiftvec[] - - movsd (%rax,%rbx,8),%xmm0 - movsd 8(%rax,%rbx,8),%xmm1 - movsd 16(%rax,%rbx,8),%xmm2 - - movq nb410_iinr(%rsp),%rcx ## rcx = pointer into iinr[] - movl (%rcx,%rsi,4),%ebx ## ebx =ii - movl %ebx,nb410_ii(%rsp) - - movq nb410_charge(%rbp),%rdx - movsd (%rdx,%rbx,8),%xmm3 - mulsd nb410_facel(%rsp),%xmm3 - shufpd $0,%xmm3,%xmm3 - - movq nb410_invsqrta(%rbp),%rdx ## load invsqrta[ii] - movsd (%rdx,%rbx,8),%xmm4 - shufpd $0,%xmm4,%xmm4 - - movq nb410_type(%rbp),%rdx - movl (%rdx,%rbx,4),%edx - imull nb410_ntype(%rsp),%edx - shll %edx - movl %edx,nb410_ntia(%rsp) - - lea (%rbx,%rbx,2),%rbx ## rbx = 3*ii=ii3 - movq nb410_pos(%rbp),%rax ## rax = base of pos[] - - addsd (%rax,%rbx,8),%xmm0 - addsd 8(%rax,%rbx,8),%xmm1 - addsd 16(%rax,%rbx,8),%xmm2 - - movapd %xmm3,nb410_iq(%rsp) - movapd %xmm4,nb410_isai(%rsp) - - shufpd $0,%xmm0,%xmm0 - shufpd $0,%xmm1,%xmm1 - shufpd $0,%xmm2,%xmm2 - - movapd %xmm0,nb410_ix(%rsp) - movapd %xmm1,nb410_iy(%rsp) - movapd %xmm2,nb410_iz(%rsp) - - movl %ebx,nb410_ii3(%rsp) - - ## clear vctot and i forces - xorpd %xmm13,%xmm13 - movapd %xmm13,%xmm12 - movapd %xmm13,nb410_Vvdwtot(%rsp) - movapd %xmm13,nb410_dvdasum(%rsp) - movapd %xmm13,%xmm14 - movapd %xmm13,%xmm15 - - movq nb410_jindex(%rsp),%rax - movl (%rax,%rsi,4),%ecx ## jindex[n] - movl 4(%rax,%rsi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movq nb410_pos(%rbp),%rsi - movq nb410_faction(%rbp),%rdi - movq nb410_jjnr(%rsp),%rax - shll $2,%ecx - addq %rcx,%rax - movq %rax,nb410_innerjjnr(%rsp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $2,%edx - addl nb410_ninner(%rsp),%ecx - movl %ecx,nb410_ninner(%rsp) - addl $0,%edx - movl %edx,nb410_innerk(%rsp) ## number of innerloop atoms - jge _nb_kernel410_x86_64_sse2.nb410_unroll_loop - jmp _nb_kernel410_x86_64_sse2.nb410_checksingle -_nb_kernel410_x86_64_sse2.nb410_unroll_loop: - ## twice unrolled innerloop here - movq nb410_innerjjnr(%rsp),%rdx ## pointer to jjnr[k] - movl (%rdx),%r14d - movl 4(%rdx),%r15d - addq $8,nb410_innerjjnr(%rsp) ## advance pointer (unrolled 2) - - movq nb410_pos(%rbp),%rsi ## base of pos[] - - lea (%r14,%r14,2),%r10 ## replace jnr with j3 - lea (%r15,%r15,2),%r11 - - ## move two coordinates to xmm4-xmm6 - movlpd (%rsi,%r10,8),%xmm4 - movlpd 8(%rsi,%r10,8),%xmm5 - movlpd 16(%rsi,%r10,8),%xmm6 - movhpd (%rsi,%r11,8),%xmm4 - movhpd 8(%rsi,%r11,8),%xmm5 - movhpd 16(%rsi,%r11,8),%xmm6 - - ## calc dr - subpd nb410_ix(%rsp),%xmm4 - subpd nb410_iy(%rsp),%xmm5 - subpd nb410_iz(%rsp),%xmm6 - - ## store dr - movapd %xmm4,nb410_dx(%rsp) - movapd %xmm5,nb410_dy(%rsp) - movapd %xmm6,nb410_dz(%rsp) - - ## load isaj - movq nb410_invsqrta(%rbp),%rsi - - ## square it - mulpd %xmm4,%xmm4 - mulpd %xmm5,%xmm5 - mulpd %xmm6,%xmm6 - addpd %xmm5,%xmm4 - addpd %xmm6,%xmm4 - ## rsq in xmm4 - - movlpd (%rsi,%r14,8),%xmm3 - movhpd (%rsi,%r15,8),%xmm3 - - movq nb410_type(%rbp),%rdi - movl (%rdi,%r14,4),%r8d - movl (%rdi,%r15,4),%r9d - - cvtpd2ps %xmm4,%xmm5 - rsqrtps %xmm5,%xmm5 - cvtps2pd %xmm5,%xmm2 ## lu in low xmm2 - - mulpd nb410_isai(%rsp),%xmm3 - movapd %xmm3,nb410_isaprod(%rsp) - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulpd %xmm2,%xmm2 ## lu*lu - movapd nb410_three(%rsp),%xmm1 - mulpd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb410_half(%rsp),%xmm0 - subpd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm1 - mulpd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - movapd %xmm3,%xmm6 - mulpd nb410_gbtsc(%rsp),%xmm6 - movapd %xmm6,nb410_gbscale(%rsp) - - movapd %xmm1,%xmm5 ## copy of lu - mulpd %xmm1,%xmm1 ## lu*lu - movapd nb410_three(%rsp),%xmm2 - mulpd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb410_half(%rsp),%xmm0 - subpd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm2 - mulpd %xmm2,%xmm0 ## xmm0=rinv - - mulpd nb410_iq(%rsp),%xmm3 - movq nb410_charge(%rbp),%rsi ## base of charge[] - movlpd (%rsi,%r14,8),%xmm6 - movhpd (%rsi,%r15,8),%xmm6 - mulpd %xmm3,%xmm6 - movapd %xmm6,nb410_qq(%rsp) - - mulpd %xmm0,%xmm4 ## xmm4=r - movapd %xmm4,nb410_r(%rsp) - mulpd nb410_gbscale(%rsp),%xmm4 - movl nb410_ntia(%rsp),%edi - - cvttpd2pi %xmm4,%mm6 ## mm6 = lu idx - shll %r8d - shll %r9d - addl %edi,%r8d - addl %edi,%r9d - - cvtpi2pd %mm6,%xmm5 - subpd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - movapd %xmm1,%xmm2 - mulpd %xmm2,%xmm2 ## xmm2=eps2 - movq nb410_vdwparam(%rbp),%rdi - - pslld $2,%mm6 ## idx *= 4 - - movq nb410_GBtab(%rbp),%rsi - movd %mm6,%r12d - psrlq $32,%mm6 - movd %mm6,%r13d ## indices in r12/r13 - - movlpd (%rdi,%r8,8),%xmm6 - movlpd 8(%rdi,%r8,8),%xmm7 - - movapd %xmm0,%xmm9 ## rinv - mulpd %xmm9,%xmm9 ## rinvsq - movapd %xmm9,%xmm10 ## rinvsq - mulpd %xmm10,%xmm10 ## rinv4 - mulpd %xmm9,%xmm10 ## rinv6 - movapd %xmm10,%xmm11 - mulpd %xmm11,%xmm11 ## rinv12 - - - movhpd (%rdi,%r9,8),%xmm6 - movhpd 8(%rdi,%r9,8),%xmm7 - - ## load table data - movapd (%rsi,%r12,8),%xmm4 ## Y1 F1 - movapd (%rsi,%r13,8),%xmm3 ## Y2 F2 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 Y2 - unpckhpd %xmm3,%xmm5 ## F1 F2 - - mulpd %xmm6,%xmm10 ## vvdw6=c6*rinv6 - mulpd %xmm7,%xmm11 ## vvdw12=c12*rinv12 - - movapd %xmm11,%xmm9 - subpd %xmm10,%xmm11 ## Vvdw=Vvdw12-Vvdw6 - - ## add potential to vvdwtot - addpd nb410_Vvdwtot(%rsp),%xmm11 - movapd %xmm11,nb410_Vvdwtot(%rsp) - - movapd 16(%rsi,%r12,8),%xmm6 ## G1 H1 - movapd 16(%rsi,%r13,8),%xmm3 ## G2 H2 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 G2 - unpckhpd %xmm3,%xmm7 ## H1 H2 - ## coulomb table ready, in xmm4-xmm7 - - mulpd %xmm1,%xmm7 ## xmm7=Heps - mulpd %xmm1,%xmm6 ## xmm6=Geps - mulpd %xmm1,%xmm7 ## xmm7=Heps2 - addpd %xmm6,%xmm5 - addpd %xmm7,%xmm5 ## xmm5=Fp - mulpd nb410_two(%rsp),%xmm7 ## two*Heps2 - movapd nb410_qq(%rsp),%xmm3 - addpd %xmm6,%xmm7 - addpd %xmm5,%xmm7 ## xmm7=FF - mulpd %xmm1,%xmm5 ## xmm5=eps*Fp - addpd %xmm4,%xmm5 ## xmm5=VV - mulpd %xmm3,%xmm5 ## vcoul=qq*VV - mulpd %xmm7,%xmm3 ## fijC=FF*qq - - ## LJ forces - mulpd nb410_six(%rsp),%xmm10 - mulpd nb410_twelve(%rsp),%xmm9 - subpd %xmm10,%xmm9 - mulpd %xmm0,%xmm9 ## (12*vnb12-6*vnb6)*rinv - - movq nb410_dvda(%rbp),%rsi - - ## Calculate dVda - xorpd %xmm7,%xmm7 - mulpd nb410_gbscale(%rsp),%xmm3 - movapd %xmm3,%xmm6 - mulpd nb410_r(%rsp),%xmm6 - addpd %xmm5,%xmm6 - - ## update vctot - addpd %xmm5,%xmm12 - - ## xmm6=(vcoul+fijC*r) - subpd %xmm6,%xmm7 - movapd %xmm7,%xmm6 - - movq nb410_faction(%rbp),%rdi - ## the fj's - start by accumulating forces from memory - movlpd (%rdi,%r10,8),%xmm2 - movlpd 8(%rdi,%r10,8),%xmm4 - movlpd 16(%rdi,%r10,8),%xmm5 - - ## update dvdasum - addpd nb410_dvdasum(%rsp),%xmm7 - movapd %xmm7,nb410_dvdasum(%rsp) - - ## update j atoms dvdaj - movhlps %xmm6,%xmm7 - addsd (%rsi,%r14,8),%xmm6 - addsd (%rsi,%r15,8),%xmm7 - movsd %xmm6,(%rsi,%r14,8) - movsd %xmm7,(%rsi,%r15,8) - - movhpd (%rdi,%r11,8),%xmm2 - movhpd 8(%rdi,%r11,8),%xmm4 - movhpd 16(%rdi,%r11,8),%xmm5 - - subpd %xmm3,%xmm9 - mulpd %xmm0,%xmm9 ## fscal - - movapd %xmm9,%xmm10 - movapd %xmm9,%xmm11 - - mulpd nb410_dx(%rsp),%xmm9 - mulpd nb410_dy(%rsp),%xmm10 - mulpd nb410_dz(%rsp),%xmm11 - - addpd %xmm9,%xmm2 - addpd %xmm10,%xmm4 - addpd %xmm11,%xmm5 - - movlpd %xmm2,(%rdi,%r10,8) - movlpd %xmm4,8(%rdi,%r10,8) - movlpd %xmm5,16(%rdi,%r10,8) - - ## accumulate i forces - addpd %xmm9,%xmm13 - addpd %xmm10,%xmm14 - addpd %xmm11,%xmm15 - - movhpd %xmm2,(%rdi,%r11,8) - movhpd %xmm4,8(%rdi,%r11,8) - movhpd %xmm5,16(%rdi,%r11,8) - - ## should we do one more iteration? - subl $2,nb410_innerk(%rsp) - jl _nb_kernel410_x86_64_sse2.nb410_checksingle - jmp _nb_kernel410_x86_64_sse2.nb410_unroll_loop -_nb_kernel410_x86_64_sse2.nb410_checksingle: - movl nb410_innerk(%rsp),%edx - andl $1,%edx - jnz _nb_kernel410_x86_64_sse2.nb410_dosingle - jmp _nb_kernel410_x86_64_sse2.nb410_updateouterdata -_nb_kernel410_x86_64_sse2.nb410_dosingle: - movq nb410_charge(%rbp),%rsi - movq nb410_invsqrta(%rbp),%rdx - movq nb410_pos(%rbp),%rdi - movq nb410_innerjjnr(%rsp),%rcx - movl (%rcx),%eax - - ## load isaj - movq nb410_invsqrta(%rbp),%rsi - movsd (%rsi,%rax,8),%xmm2 - mulsd nb410_isai(%rsp),%xmm2 - movapd %xmm2,nb410_isaprod(%rsp) - movapd %xmm2,%xmm1 - mulsd nb410_gbtsc(%rsp),%xmm1 - movapd %xmm1,nb410_gbscale(%rsp) - - mulsd nb410_iq(%rsp),%xmm2 - movq nb410_charge(%rbp),%rsi ## base of charge[] - movsd (%rsi,%rax,8),%xmm3 - mulsd %xmm2,%xmm3 - movapd %xmm3,nb410_qq(%rsp) - - movq nb410_type(%rbp),%rsi - movl (%rsi,%rax,4),%r8d - movq nb410_vdwparam(%rbp),%rsi - shll %r8d - movl nb410_ntia(%rsp),%edi - addl %edi,%r8d - - movsd (%rsi,%r8,8),%xmm4 - movsd 8(%rsi,%r8,8),%xmm6 - movapd %xmm4,nb410_c6(%rsp) - movapd %xmm6,nb410_c12(%rsp) - - movq nb410_pos(%rbp),%rsi ## base of pos[] - - lea (%rax,%rax,2),%r10 ## replace jnr with j3 - - ## move two coordinates to xmm4-xmm6 - movsd (%rsi,%r10,8),%xmm4 - movsd 8(%rsi,%r10,8),%xmm5 - movsd 16(%rsi,%r10,8),%xmm6 - - ## calc dr - subsd nb410_ix(%rsp),%xmm4 - subsd nb410_iy(%rsp),%xmm5 - subsd nb410_iz(%rsp),%xmm6 - - ## store dr - movapd %xmm4,nb410_dx(%rsp) - movapd %xmm5,nb410_dy(%rsp) - movapd %xmm6,nb410_dz(%rsp) - - ## square it - mulsd %xmm4,%xmm4 - mulsd %xmm5,%xmm5 - mulsd %xmm6,%xmm6 - addsd %xmm5,%xmm4 - addsd %xmm6,%xmm4 - ## rsq in xmm4 - - cvtsd2ss %xmm4,%xmm5 - rsqrtss %xmm5,%xmm5 - cvtss2sd %xmm5,%xmm2 ## lu in low xmm2 - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulsd %xmm2,%xmm2 ## lu*lu - movapd nb410_three(%rsp),%xmm1 - mulsd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb410_half(%rsp),%xmm0 - subsd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm1 - mulsd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - movapd %xmm1,%xmm5 ## copy of lu - mulsd %xmm1,%xmm1 ## lu*lu - movapd nb410_three(%rsp),%xmm2 - mulsd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb410_half(%rsp),%xmm0 - subsd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm2 - mulsd %xmm2,%xmm0 ## xmm0=rinv - - mulsd %xmm0,%xmm4 ## xmm4=r - movapd %xmm4,nb410_r(%rsp) - mulsd nb410_gbscale(%rsp),%xmm4 - - cvttsd2si %xmm4,%r12d ## mm6 = lu idx - cvtsi2sd %r12d,%xmm5 - subsd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - movapd %xmm1,%xmm2 - mulsd %xmm2,%xmm2 ## xmm2=eps2 - - shll $2,%r12d ## idx *= 4 - - movq nb410_GBtab(%rbp),%rsi - - movapd %xmm0,%xmm9 ## rinv - mulsd %xmm9,%xmm9 ## rinvsq - movapd %xmm9,%xmm10 ## rinvsq - mulsd %xmm10,%xmm10 ## rinv4 - mulsd %xmm9,%xmm10 ## rinv6 - movapd %xmm10,%xmm11 - mulsd %xmm11,%xmm11 ## rinv12 - - ## load table data - movapd (%rsi,%r12,8),%xmm4 ## Y1 F1 - movhlps %xmm4,%xmm5 - - mulsd nb410_c6(%rsp),%xmm10 ## vvdw6=c6*rinv6 - mulsd nb410_c12(%rsp),%xmm11 ## vvdw12=c12*rinv12 - - movapd %xmm11,%xmm9 - subsd %xmm10,%xmm11 ## Vvdw=Vvdw12-Vvdw6 - - ## add potential to vvdwtot - addsd nb410_Vvdwtot(%rsp),%xmm11 - movsd %xmm11,nb410_Vvdwtot(%rsp) - - movapd 16(%rsi,%r12,8),%xmm6 ## G1 H1 - movhlps %xmm6,%xmm7 - ## coulomb table ready, in xmm4-xmm7 - - mulsd %xmm1,%xmm7 ## xmm7=Heps - mulsd %xmm1,%xmm6 ## xmm6=Geps - mulsd %xmm1,%xmm7 ## xmm7=Heps2 - addsd %xmm6,%xmm5 - addsd %xmm7,%xmm5 ## xmm5=Fp - mulsd nb410_two(%rsp),%xmm7 ## two*Heps2 - movapd nb410_qq(%rsp),%xmm3 - addsd %xmm6,%xmm7 - addsd %xmm5,%xmm7 ## xmm7=FF - mulsd %xmm1,%xmm5 ## xmm5=eps*Fp - addsd %xmm4,%xmm5 ## xmm5=VV - mulsd %xmm3,%xmm5 ## vcoul=qq*VV - mulsd %xmm7,%xmm3 ## fijC=FF*qq - - ## LJ forces - mulsd nb410_six(%rsp),%xmm10 - mulsd nb410_twelve(%rsp),%xmm9 - subsd %xmm10,%xmm9 - mulsd %xmm0,%xmm9 ## (12*vnb12-6*vnb6)*rinv - - movq nb410_dvda(%rbp),%rsi - - ## Calculate dVda - xorpd %xmm7,%xmm7 - mulsd nb410_gbscale(%rsp),%xmm3 - movapd %xmm3,%xmm6 - mulsd nb410_r(%rsp),%xmm6 - addsd %xmm5,%xmm6 - - ## update vctot - addsd %xmm5,%xmm12 - - ## xmm6=(vcoul+fijC*r) - subsd %xmm6,%xmm7 - movapd %xmm7,%xmm6 - - ## update dvdasum - addsd nb410_dvdasum(%rsp),%xmm7 - movsd %xmm7,nb410_dvdasum(%rsp) - - ## update j atoms dvdaj - movhlps %xmm6,%xmm7 - addsd (%rsi,%rax,8),%xmm6 - addsd (%rsi,%rbx,8),%xmm7 - movsd %xmm6,(%rsi,%rax,8) - movsd %xmm7,(%rsi,%rbx,8) - - subsd %xmm3,%xmm9 - mulsd %xmm0,%xmm9 ## fscal - - movapd %xmm9,%xmm10 - movapd %xmm9,%xmm11 - - mulsd nb410_dx(%rsp),%xmm9 - mulsd nb410_dy(%rsp),%xmm10 - mulsd nb410_dz(%rsp),%xmm11 - - ## accumulate i forces - addsd %xmm9,%xmm13 - addsd %xmm10,%xmm14 - addsd %xmm11,%xmm15 - - movq nb410_faction(%rbp),%rdi - ## the fj's - start by accumulating forces from memory - addsd (%rdi,%r10,8),%xmm9 - addsd 8(%rdi,%r10,8),%xmm10 - addsd 16(%rdi,%r10,8),%xmm11 - movsd %xmm9,(%rdi,%r10,8) - movsd %xmm10,8(%rdi,%r10,8) - movsd %xmm11,16(%rdi,%r10,8) - -_nb_kernel410_x86_64_sse2.nb410_updateouterdata: - movl nb410_ii3(%rsp),%ecx - movq nb410_faction(%rbp),%rdi - movq nb410_fshift(%rbp),%rsi - movl nb410_is3(%rsp),%edx - - ## accumulate i forces in xmm13, xmm14, xmm15 - movhlps %xmm13,%xmm3 - movhlps %xmm14,%xmm4 - movhlps %xmm15,%xmm5 - addsd %xmm3,%xmm13 - addsd %xmm4,%xmm14 - addsd %xmm5,%xmm15 ## sum is in low xmm13-xmm15 - - ## increment i force - movsd (%rdi,%rcx,8),%xmm3 - movsd 8(%rdi,%rcx,8),%xmm4 - movsd 16(%rdi,%rcx,8),%xmm5 - subsd %xmm13,%xmm3 - subsd %xmm14,%xmm4 - subsd %xmm15,%xmm5 - movsd %xmm3,(%rdi,%rcx,8) - movsd %xmm4,8(%rdi,%rcx,8) - movsd %xmm5,16(%rdi,%rcx,8) - - ## increment fshift force - movsd (%rsi,%rdx,8),%xmm3 - movsd 8(%rsi,%rdx,8),%xmm4 - movsd 16(%rsi,%rdx,8),%xmm5 - subsd %xmm13,%xmm3 - subsd %xmm14,%xmm4 - subsd %xmm15,%xmm5 - movsd %xmm3,(%rsi,%rdx,8) - movsd %xmm4,8(%rsi,%rdx,8) - movsd %xmm5,16(%rsi,%rdx,8) - - ## get n from stack - movl nb410_n(%rsp),%esi - ## get group index for i particle - movq nb410_gid(%rbp),%rdx ## base of gid[] - movl (%rdx,%rsi,4),%edx ## ggid=gid[n] - - ## accumulate total potential energy and update it - movhlps %xmm12,%xmm6 - addsd %xmm6,%xmm12 ## low xmm12 has the sum now - - ## add earlier value from mem - movq nb410_Vc(%rbp),%rax - addsd (%rax,%rdx,8),%xmm12 - ## move back to mem - movsd %xmm12,(%rax,%rdx,8) - - ## accumulate total lj energy and update it - movapd nb410_Vvdwtot(%rsp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addsd %xmm6,%xmm7 ## low xmm7 has the sum now - - ## add earlier value from mem - movq nb410_Vvdw(%rbp),%rax - addsd (%rax,%rdx,8),%xmm7 - ## move back to mem - movsd %xmm7,(%rax,%rdx,8) - - ## accumulate dVda and update it - movapd nb410_dvdasum(%rsp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addsd %xmm6,%xmm7 ## low xmm7 has the sum now - - movl nb410_ii(%rsp),%edx - movq nb410_dvda(%rbp),%rax - addsd (%rax,%rdx,8),%xmm7 - movsd %xmm7,(%rax,%rdx,8) - - ## finish if last - movl nb410_nn1(%rsp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel410_x86_64_sse2.nb410_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb410_n(%rsp) - jmp _nb_kernel410_x86_64_sse2.nb410_outer -_nb_kernel410_x86_64_sse2.nb410_outerend: - ## check if more outer neighborlists remain - movl nb410_nri(%rsp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel410_x86_64_sse2.nb410_end - ## non-zero, do one more workunit - jmp _nb_kernel410_x86_64_sse2.nb410_threadloop -_nb_kernel410_x86_64_sse2.nb410_end: - movl nb410_nouter(%rsp),%eax - movl nb410_ninner(%rsp),%ebx - movq nb410_outeriter(%rbp),%rcx - movq nb410_inneriter(%rbp),%rdx - movl %eax,(%rcx) - movl %ebx,(%rdx) - - addq $552,%rsp - emms - - - pop %r15 - pop %r14 - pop %r13 - pop %r12 - - pop %rbx - pop %rbp - ret - - - - - - - - - -.globl nb_kernel410nf_x86_64_sse2 -.globl _nb_kernel410nf_x86_64_sse2 -nb_kernel410nf_x86_64_sse2: -_nb_kernel410nf_x86_64_sse2: -## Room for return address and rbp (16 bytes) -.set nb410nf_fshift, 16 -.set nb410nf_gid, 24 -.set nb410nf_pos, 32 -.set nb410nf_faction, 40 -.set nb410nf_charge, 48 -.set nb410nf_p_facel, 56 -.set nb410nf_argkrf, 64 -.set nb410nf_argcrf, 72 -.set nb410nf_Vc, 80 -.set nb410nf_type, 88 -.set nb410nf_p_ntype, 96 -.set nb410nf_vdwparam, 104 -.set nb410nf_Vvdw, 112 -.set nb410nf_p_tabscale, 120 -.set nb410nf_VFtab, 128 -.set nb410nf_invsqrta, 136 -.set nb410nf_dvda, 144 -.set nb410nf_p_gbtabscale, 152 -.set nb410nf_GBtab, 160 -.set nb410nf_p_nthreads, 168 -.set nb410nf_count, 176 -.set nb410nf_mtx, 184 -.set nb410nf_outeriter, 192 -.set nb410nf_inneriter, 200 -.set nb410nf_work, 208 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse2 use -.set nb410nf_ix, 0 -.set nb410nf_iy, 16 -.set nb410nf_iz, 32 -.set nb410nf_iq, 48 -.set nb410nf_two, 64 -.set nb410nf_gbtsc, 80 -.set nb410nf_qq, 96 -.set nb410nf_c6, 112 -.set nb410nf_c12, 128 -.set nb410nf_vctot, 144 -.set nb410nf_Vvdwtot, 160 -.set nb410nf_half, 176 -.set nb410nf_three, 192 -.set nb410nf_r, 208 -.set nb410nf_isai, 224 -.set nb410nf_isaprod, 240 -.set nb410nf_gbscale, 256 -.set nb410nf_nri, 272 -.set nb410nf_iinr, 280 -.set nb410nf_jindex, 288 -.set nb410nf_jjnr, 296 -.set nb410nf_shift, 304 -.set nb410nf_shiftvec, 312 -.set nb410nf_facel, 320 -.set nb410nf_innerjjnr, 328 -.set nb410nf_ii, 336 -.set nb410nf_is3, 340 -.set nb410nf_ii3, 344 -.set nb410nf_ntia, 348 -.set nb410nf_innerk, 352 -.set nb410nf_n, 356 -.set nb410nf_nn1, 360 -.set nb410nf_ntype, 364 -.set nb410nf_nouter, 368 -.set nb410nf_ninner, 372 - push %rbp - movq %rsp,%rbp - push %rbx - - - emms - - push %r12 - push %r13 - push %r14 - push %r15 - - subq $392,%rsp ## local variable stack space (n*16+8) - - ## zero 32-bit iteration counters - movl $0,%eax - movl %eax,nb410nf_nouter(%rsp) - movl %eax,nb410nf_ninner(%rsp) - - movl (%rdi),%edi - movl %edi,nb410nf_nri(%rsp) - movq %rsi,nb410nf_iinr(%rsp) - movq %rdx,nb410nf_jindex(%rsp) - movq %rcx,nb410nf_jjnr(%rsp) - movq %r8,nb410nf_shift(%rsp) - movq %r9,nb410nf_shiftvec(%rsp) - movq nb410nf_p_ntype(%rbp),%rdi - movl (%rdi),%edi - movl %edi,nb410nf_ntype(%rsp) - movq nb410nf_p_facel(%rbp),%rsi - movsd (%rsi),%xmm0 - movsd %xmm0,nb410nf_facel(%rsp) - - movq nb410nf_p_gbtabscale(%rbp),%rbx - movsd (%rbx),%xmm4 - shufpd $0,%xmm4,%xmm4 - movapd %xmm4,nb410nf_gbtsc(%rsp) - - ## create constant floating-point factors on stack - movl $0x00000000,%eax ## lower half of double half IEEE (hex) - movl $0x3fe00000,%ebx - movl %eax,nb410nf_half(%rsp) - movl %ebx,nb410nf_half+4(%rsp) - movsd nb410nf_half(%rsp),%xmm1 - shufpd $0,%xmm1,%xmm1 ## splat to all elements - movapd %xmm1,%xmm3 - addpd %xmm3,%xmm3 ## one - movapd %xmm3,%xmm2 - addpd %xmm2,%xmm2 ## two - addpd %xmm2,%xmm3 ## three - movapd %xmm1,nb410nf_half(%rsp) - movapd %xmm2,nb410nf_two(%rsp) - movapd %xmm3,nb410nf_three(%rsp) - -_nb_kernel410nf_x86_64_sse2.nb410nf_threadloop: - movq nb410nf_count(%rbp),%rsi ## pointer to sync counter - movl (%rsi),%eax -_nb_kernel410nf_x86_64_sse2.nb410nf_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%rsi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel410nf_x86_64_sse2.nb410nf_spinlock - - ## if(nn1>nri) nn1=nri - movl nb410nf_nri(%rsp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb410nf_n(%rsp) - movl %ebx,nb410nf_nn1(%rsp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel410nf_x86_64_sse2.nb410nf_outerstart - jmp _nb_kernel410nf_x86_64_sse2.nb410nf_end - -_nb_kernel410nf_x86_64_sse2.nb410nf_outerstart: - ## ebx contains number of outer iterations - addl nb410nf_nouter(%rsp),%ebx - movl %ebx,nb410nf_nouter(%rsp) - -_nb_kernel410nf_x86_64_sse2.nb410nf_outer: - movq nb410nf_shift(%rsp),%rax ## rax = pointer into shift[] - movl (%rax,%rsi,4),%ebx ## rbx=shift[n] - - lea (%rbx,%rbx,2),%rbx ## rbx=3*is - movl %ebx,nb410nf_is3(%rsp) ## store is3 - - movq nb410nf_shiftvec(%rsp),%rax ## rax = base of shiftvec[] - - movsd (%rax,%rbx,8),%xmm0 - movsd 8(%rax,%rbx,8),%xmm1 - movsd 16(%rax,%rbx,8),%xmm2 - - movq nb410nf_iinr(%rsp),%rcx ## rcx = pointer into iinr[] - movl (%rcx,%rsi,4),%ebx ## ebx =ii - movl %ebx,nb410nf_ii(%rsp) - - movq nb410nf_charge(%rbp),%rdx - movsd (%rdx,%rbx,8),%xmm3 - mulsd nb410nf_facel(%rsp),%xmm3 - shufpd $0,%xmm3,%xmm3 - - movq nb410nf_invsqrta(%rbp),%rdx ## load invsqrta[ii] - movsd (%rdx,%rbx,8),%xmm4 - shufpd $0,%xmm4,%xmm4 - - movq nb410nf_type(%rbp),%rdx - movl (%rdx,%rbx,4),%edx - imull nb410nf_ntype(%rsp),%edx - shll %edx - movl %edx,nb410nf_ntia(%rsp) - - lea (%rbx,%rbx,2),%rbx ## rbx = 3*ii=ii3 - movq nb410nf_pos(%rbp),%rax ## rax = base of pos[] - - addsd (%rax,%rbx,8),%xmm0 - addsd 8(%rax,%rbx,8),%xmm1 - addsd 16(%rax,%rbx,8),%xmm2 - - movapd %xmm3,nb410nf_iq(%rsp) - movapd %xmm4,nb410nf_isai(%rsp) - - shufpd $0,%xmm0,%xmm0 - shufpd $0,%xmm1,%xmm1 - shufpd $0,%xmm2,%xmm2 - - movapd %xmm0,nb410nf_ix(%rsp) - movapd %xmm1,nb410nf_iy(%rsp) - movapd %xmm2,nb410nf_iz(%rsp) - - movl %ebx,nb410nf_ii3(%rsp) - - ## clear vctot and Vvdwtot - xorpd %xmm4,%xmm4 - movapd %xmm4,nb410nf_vctot(%rsp) - movapd %xmm4,nb410nf_Vvdwtot(%rsp) - - movq nb410nf_jindex(%rsp),%rax - movl (%rax,%rsi,4),%ecx ## jindex[n] - movl 4(%rax,%rsi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movq nb410nf_pos(%rbp),%rsi - movq nb410nf_faction(%rbp),%rdi - movq nb410nf_jjnr(%rsp),%rax - shll $2,%ecx - addq %rcx,%rax - movq %rax,nb410nf_innerjjnr(%rsp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $2,%edx - addl nb410nf_ninner(%rsp),%ecx - movl %ecx,nb410nf_ninner(%rsp) - addl $0,%edx - movl %edx,nb410nf_innerk(%rsp) ## number of innerloop atoms - jge _nb_kernel410nf_x86_64_sse2.nb410nf_unroll_loop - jmp _nb_kernel410nf_x86_64_sse2.nb410nf_checksingle -_nb_kernel410nf_x86_64_sse2.nb410nf_unroll_loop: - ## twice unrolled innerloop here - movq nb410nf_innerjjnr(%rsp),%rdx ## pointer to jjnr[k] - movl (%rdx),%eax - movl 4(%rdx),%ebx - addq $8,nb410nf_innerjjnr(%rsp) ## advance pointer (unrolled 2) - - ## load isaj - movq nb410nf_invsqrta(%rbp),%rsi - movlpd (%rsi,%rax,8),%xmm2 - movhpd (%rsi,%rbx,8),%xmm2 - mulpd nb410nf_isai(%rsp),%xmm2 - movapd %xmm2,nb410nf_isaprod(%rsp) - movapd %xmm2,%xmm1 - mulpd nb410nf_gbtsc(%rsp),%xmm1 - movapd %xmm1,nb410nf_gbscale(%rsp) - - movq nb410nf_charge(%rbp),%rsi ## base of charge[] - movlpd (%rsi,%rax,8),%xmm3 - movhpd (%rsi,%rbx,8),%xmm3 - - mulpd nb410nf_iq(%rsp),%xmm2 - mulpd %xmm2,%xmm3 - movapd %xmm3,nb410nf_qq(%rsp) - - movd %eax,%mm0 ## use mmx registers as temp storage - movd %ebx,%mm1 - - movq nb410nf_type(%rbp),%rsi - movl (%rsi,%rax,4),%eax - movl (%rsi,%rbx,4),%ebx - movq nb410nf_vdwparam(%rbp),%rsi - shll %eax - shll %ebx - movl nb410nf_ntia(%rsp),%edi - addl %edi,%eax - addl %edi,%ebx - - movlpd (%rsi,%rax,8),%xmm6 ## c6a - movlpd (%rsi,%rbx,8),%xmm7 ## c6b - movhpd 8(%rsi,%rax,8),%xmm6 ## c6a c12a - movhpd 8(%rsi,%rbx,8),%xmm7 ## c6b c12b - - movapd %xmm6,%xmm4 - unpcklpd %xmm7,%xmm4 - unpckhpd %xmm7,%xmm6 - - movd %mm0,%eax - movd %mm1,%ebx - movapd %xmm4,nb410nf_c6(%rsp) - movapd %xmm6,nb410nf_c12(%rsp) - - movq nb410nf_pos(%rbp),%rsi ## base of pos[] - - movd %eax,%mm2 - movd %ebx,%mm3 - lea (%rax,%rax,2),%rax ## replace jnr with j3 - lea (%rbx,%rbx,2),%rbx - - ## move two coordinates to xmm0-xmm2 - movlpd (%rsi,%rax,8),%xmm0 - movlpd 8(%rsi,%rax,8),%xmm1 - movlpd 16(%rsi,%rax,8),%xmm2 - movhpd (%rsi,%rbx,8),%xmm0 - movhpd 8(%rsi,%rbx,8),%xmm1 - movhpd 16(%rsi,%rbx,8),%xmm2 - - ## move ix-iz to xmm4-xmm6 - movapd nb410nf_ix(%rsp),%xmm4 - movapd nb410nf_iy(%rsp),%xmm5 - movapd nb410nf_iz(%rsp),%xmm6 - - ## calc dr - subpd %xmm0,%xmm4 - subpd %xmm1,%xmm5 - subpd %xmm2,%xmm6 - - ## square dr - mulpd %xmm4,%xmm4 - mulpd %xmm5,%xmm5 - mulpd %xmm6,%xmm6 - addpd %xmm5,%xmm4 - addpd %xmm6,%xmm4 - ## rsq in xmm4 - - cvtpd2ps %xmm4,%xmm5 - rsqrtps %xmm5,%xmm5 - cvtps2pd %xmm5,%xmm2 ## lu in low xmm2 - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulpd %xmm2,%xmm2 ## lu*lu - movapd nb410nf_three(%rsp),%xmm1 - mulpd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb410nf_half(%rsp),%xmm0 - subpd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm1 - mulpd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - movapd %xmm1,%xmm5 ## copy of lu - mulpd %xmm1,%xmm1 ## lu*lu - movapd nb410nf_three(%rsp),%xmm2 - mulpd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb410nf_half(%rsp),%xmm0 - subpd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm2 - mulpd %xmm2,%xmm0 ## xmm0=rinv - - mulpd %xmm0,%xmm4 ## xmm4=r - movapd %xmm4,nb410nf_r(%rsp) - mulpd nb410nf_gbscale(%rsp),%xmm4 - - cvttpd2pi %xmm4,%mm6 ## mm6 = lu idx - cvtpi2pd %mm6,%xmm5 - subpd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - movapd %xmm1,%xmm2 - mulpd %xmm2,%xmm2 ## xmm2=eps2 - - pslld $2,%mm6 ## idx *= 4 - - movd %eax,%mm0 - movd %ebx,%mm1 - - movq nb410nf_GBtab(%rbp),%rsi - movd %mm6,%eax - psrlq $32,%mm6 - movd %mm6,%ebx ## indices in eax/ebx - - movapd (%rsi,%rax,8),%xmm4 ## Y1 F1 - movapd (%rsi,%rbx,8),%xmm3 ## Y2 F2 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 Y2 - unpckhpd %xmm3,%xmm5 ## F1 F2 - - movapd 16(%rsi,%rax,8),%xmm6 ## G1 H1 - movapd 16(%rsi,%rbx,8),%xmm3 ## G2 H2 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 G2 - unpckhpd %xmm3,%xmm7 ## H1 H2 - ## coulomb table ready, in xmm4-xmm7 - mulpd %xmm1,%xmm6 ## xmm6=Geps - mulpd %xmm2,%xmm7 ## xmm7=Heps2 - addpd %xmm6,%xmm5 - addpd %xmm7,%xmm5 ## xmm5=Fp - movapd nb410nf_qq(%rsp),%xmm3 - mulpd %xmm1,%xmm5 ## xmm5=eps*Fp - addpd %xmm4,%xmm5 ## xmm5=VV - mulpd %xmm3,%xmm5 ## vcoul=qq*VV - - addpd nb410nf_vctot(%rsp),%xmm5 - movapd %xmm5,nb410nf_vctot(%rsp) - - ## L-J - movapd %xmm0,%xmm4 - mulpd %xmm0,%xmm4 ## xmm4=rinvsq - - movapd %xmm4,%xmm6 - mulpd %xmm4,%xmm6 - - mulpd %xmm4,%xmm6 ## xmm6=rinvsix - movapd %xmm6,%xmm4 - mulpd %xmm4,%xmm4 ## xmm4=rinvtwelve - mulpd nb410nf_c6(%rsp),%xmm6 - mulpd nb410nf_c12(%rsp),%xmm4 - movapd nb410nf_Vvdwtot(%rsp),%xmm7 - addpd %xmm4,%xmm7 - subpd %xmm6,%xmm7 - movapd %xmm7,nb410nf_Vvdwtot(%rsp) - - ## should we do one more iteration? - subl $2,nb410nf_innerk(%rsp) - jl _nb_kernel410nf_x86_64_sse2.nb410nf_checksingle - jmp _nb_kernel410nf_x86_64_sse2.nb410nf_unroll_loop -_nb_kernel410nf_x86_64_sse2.nb410nf_checksingle: - movl nb410nf_innerk(%rsp),%edx - andl $1,%edx - jnz _nb_kernel410nf_x86_64_sse2.nb410nf_dosingle - jmp _nb_kernel410nf_x86_64_sse2.nb410nf_updateouterdata -_nb_kernel410nf_x86_64_sse2.nb410nf_dosingle: - movq nb410nf_charge(%rbp),%rsi - movq nb410nf_invsqrta(%rbp),%rdx - movq nb410nf_pos(%rbp),%rdi - movq nb410nf_innerjjnr(%rsp),%rcx - movl (%rcx),%eax - - xorpd %xmm6,%xmm6 - movapd %xmm6,%xmm7 - movsd (%rdx,%rax,8),%xmm7 - movlpd (%rsi,%rax,8),%xmm6 ## xmm6(0) has the charge - mulsd nb410nf_isai(%rsp),%xmm7 - movapd %xmm7,nb410nf_isaprod(%rsp) - movapd %xmm7,%xmm1 - mulpd nb410nf_gbtsc(%rsp),%xmm1 - movapd %xmm1,nb410nf_gbscale(%rsp) - - mulsd nb410nf_iq(%rsp),%xmm7 - mulsd %xmm7,%xmm6 - movapd %xmm6,nb410nf_qq(%rsp) - - movd %eax,%mm0 ## use mmx registers as temp storage - movq nb410nf_type(%rbp),%rsi - movl (%rsi,%rax,4),%eax - movq nb410nf_vdwparam(%rbp),%rsi - shll %eax - movl nb410nf_ntia(%rsp),%edi - addl %edi,%eax - - movlpd (%rsi,%rax,8),%xmm6 ## c6a - movhpd 8(%rsi,%rax,8),%xmm6 ## c6a c12a - - xorpd %xmm7,%xmm7 - movapd %xmm6,%xmm4 - unpcklpd %xmm7,%xmm4 - unpckhpd %xmm7,%xmm6 - - movd %mm0,%eax - movapd %xmm4,nb410nf_c6(%rsp) - movapd %xmm6,nb410nf_c12(%rsp) - - movq nb410nf_pos(%rbp),%rsi ## base of pos[] - - movd %eax,%mm2 - lea (%rax,%rax,2),%rax ## replace jnr with j3 - - ## move coordinates to xmm0-xmm2 - movlpd (%rsi,%rax,8),%xmm0 - movlpd 8(%rsi,%rax,8),%xmm1 - movlpd 16(%rsi,%rax,8),%xmm2 - - ## move ix-iz to xmm4-xmm6 - movapd nb410nf_ix(%rsp),%xmm4 - movapd nb410nf_iy(%rsp),%xmm5 - movapd nb410nf_iz(%rsp),%xmm6 - - ## calc dr - subsd %xmm0,%xmm4 - subsd %xmm1,%xmm5 - subsd %xmm2,%xmm6 - - ## square it - mulsd %xmm4,%xmm4 - mulsd %xmm5,%xmm5 - mulsd %xmm6,%xmm6 - addsd %xmm5,%xmm4 - addsd %xmm6,%xmm4 - ## rsq in xmm4 - - cvtsd2ss %xmm4,%xmm5 - rsqrtss %xmm5,%xmm5 - cvtss2sd %xmm5,%xmm2 ## lu in low xmm2 - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulsd %xmm2,%xmm2 ## lu*lu - movapd nb410nf_three(%rsp),%xmm1 - mulsd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb410nf_half(%rsp),%xmm0 - subsd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm1 - mulsd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - movapd %xmm1,%xmm5 ## copy of lu - mulsd %xmm1,%xmm1 ## lu*lu - movapd nb410nf_three(%rsp),%xmm2 - mulsd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb410nf_half(%rsp),%xmm0 - subsd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm2 - mulsd %xmm2,%xmm0 ## xmm0=rinv - - mulsd %xmm0,%xmm4 ## xmm4=r - movapd %xmm4,nb410nf_r(%rsp) - mulsd nb410nf_gbscale(%rsp),%xmm4 - - movd %eax,%mm0 - cvttsd2si %xmm4,%eax ## mm6 = lu idx - cvtsi2sd %eax,%xmm5 - subsd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - movapd %xmm1,%xmm2 - mulsd %xmm2,%xmm2 ## xmm2=eps2 - - shll $2,%eax ## idx *= 4 - - movq nb410nf_GBtab(%rbp),%rsi - - movapd (%rsi,%rax,8),%xmm4 ## Y1 F1 - xorpd %xmm3,%xmm3 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 - unpckhpd %xmm3,%xmm5 ## F1 - - movapd 16(%rsi,%rax,8),%xmm6 ## G1 H1 - xorpd %xmm3,%xmm3 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 - unpckhpd %xmm3,%xmm7 ## H1 - ## coulomb table ready, in xmm4-xmm7 - mulsd %xmm1,%xmm6 ## xmm6=Geps - mulsd %xmm2,%xmm7 ## xmm7=Heps2 - addsd %xmm6,%xmm5 - addsd %xmm7,%xmm5 ## xmm5=Fp - movapd nb410nf_qq(%rsp),%xmm3 - mulsd %xmm1,%xmm5 ## xmm5=eps*Fp - addsd %xmm4,%xmm5 ## xmm5=VV - mulsd %xmm3,%xmm5 ## vcoul=qq*VV - - addsd nb410nf_vctot(%rsp),%xmm5 - movsd %xmm5,nb410nf_vctot(%rsp) - - ## L-J - movapd %xmm0,%xmm4 - mulsd %xmm0,%xmm4 ## xmm4=rinvsq - - - movapd %xmm4,%xmm6 - mulsd %xmm4,%xmm6 - - mulsd %xmm4,%xmm6 ## xmm6=rinvsix - movapd %xmm6,%xmm4 - mulsd %xmm4,%xmm4 ## xmm4=rinvtwelve - mulsd nb410nf_c6(%rsp),%xmm6 - mulsd nb410nf_c12(%rsp),%xmm4 - movapd nb410nf_Vvdwtot(%rsp),%xmm7 - addsd %xmm4,%xmm7 - subsd %xmm6,%xmm7 - movlpd %xmm7,nb410nf_Vvdwtot(%rsp) - -_nb_kernel410nf_x86_64_sse2.nb410nf_updateouterdata: - movl nb410nf_ii3(%rsp),%ecx - movl nb410nf_is3(%rsp),%edx - - ## get n from stack - movl nb410nf_n(%rsp),%esi - ## get group index for i particle - movq nb410nf_gid(%rbp),%rdx ## base of gid[] - movl (%rdx,%rsi,4),%edx ## ggid=gid[n] - - ## accumulate total potential energy and update it - movapd nb410nf_vctot(%rsp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addsd %xmm6,%xmm7 ## low xmm7 has the sum now - - ## add earlier value from mem - movq nb410nf_Vc(%rbp),%rax - addsd (%rax,%rdx,8),%xmm7 - ## move back to mem - movsd %xmm7,(%rax,%rdx,8) - - ## accumulate total lj energy and update it - movapd nb410nf_Vvdwtot(%rsp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addsd %xmm6,%xmm7 ## low xmm7 has the sum now - - ## add earlier value from mem - movq nb410nf_Vvdw(%rbp),%rax - addsd (%rax,%rdx,8),%xmm7 - ## move back to mem - movsd %xmm7,(%rax,%rdx,8) - - ## finish if last - movl nb410nf_nn1(%rsp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel410nf_x86_64_sse2.nb410nf_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb410nf_n(%rsp) - jmp _nb_kernel410nf_x86_64_sse2.nb410nf_outer -_nb_kernel410nf_x86_64_sse2.nb410nf_outerend: - ## check if more outer neighborlists remain - movl nb410nf_nri(%rsp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel410nf_x86_64_sse2.nb410nf_end - ## non-zero, do one more workunit - jmp _nb_kernel410nf_x86_64_sse2.nb410nf_threadloop -_nb_kernel410nf_x86_64_sse2.nb410nf_end: - movl nb410nf_nouter(%rsp),%eax - movl nb410nf_ninner(%rsp),%ebx - movq nb410nf_outeriter(%rbp),%rcx - movq nb410nf_inneriter(%rbp),%rdx - movl %eax,(%rcx) - movl %ebx,(%rdx) - - addq $392,%rsp - emms - - - pop %r15 - pop %r14 - pop %r13 - pop %r12 - - pop %rbx - pop %rbp - ret - - - diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel430_x86_64_sse2.intel_syntax.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel430_x86_64_sse2.intel_syntax.s deleted file mode 100644 index 42ca37e0c3..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel430_x86_64_sse2.intel_syntax.s +++ /dev/null @@ -1,1664 +0,0 @@ -;# -;# -;# Gromacs 4.0 Copyright (c) 1991-2003 -;# David van der Spoel, Erik Lindahl -;# -;# This program is free software; you can redistribute it and/or -;# modify it under the terms of the GNU General Public License -;# as published by the Free Software Foundation; either version 2 -;# of the License, or (at your option) any later version. -;# -;# To help us fund GROMACS development, we humbly ask that you cite -;# the research papers on the package. Check out http://www.gromacs.org -;# -;# And Hey: -;# Gnomes, ROck Monsters And Chili Sauce -;# - -;# These files require GNU binutils 2.10 or later, since we -;# use intel syntax for portability, or a recent version -;# of NASM that understands Extended 3DNow and SSE2 instructions. -;# (NASM is normally only used with MS Visual C++). -;# Since NASM and gnu as disagree on some definitions and use -;# completely different preprocessing options I have to introduce a -;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86. -;# Gnu as treats ';' as a line break, i.e. ignores it. This is the -;# reason why all comments need both symbols... -;# The source is written for GNU as, with intel syntax. When you use -;# NASM we redefine a couple of things. The false if-statement around -;# the following code is seen by GNU as, but NASM doesn't see it, so -;# the code inside is read by NASM but not gcc. - -; .if 0 # block below only read by NASM -%define .section section -%define .long dd -%define .align align -%define .globl global -;# NASM only wants 'dword', not 'dword ptr'. -%define ptr -%macro .equiv 2 - %1 equ %2 -%endmacro -; .endif # End of NASM-specific block -; .intel_syntax noprefix # Line only read by gnu as - - -.globl nb_kernel430_x86_64_sse2 -.globl _nb_kernel430_x86_64_sse2 -nb_kernel430_x86_64_sse2: -_nb_kernel430_x86_64_sse2: -;# Room for return address and rbp (16 bytes) -.equiv nb430_fshift, 16 -.equiv nb430_gid, 24 -.equiv nb430_pos, 32 -.equiv nb430_faction, 40 -.equiv nb430_charge, 48 -.equiv nb430_p_facel, 56 -.equiv nb430_argkrf, 64 -.equiv nb430_argcrf, 72 -.equiv nb430_Vc, 80 -.equiv nb430_type, 88 -.equiv nb430_p_ntype, 96 -.equiv nb430_vdwparam, 104 -.equiv nb430_Vvdw, 112 -.equiv nb430_p_tabscale, 120 -.equiv nb430_VFtab, 128 -.equiv nb430_invsqrta, 136 -.equiv nb430_dvda, 144 -.equiv nb430_p_gbtabscale, 152 -.equiv nb430_GBtab, 160 -.equiv nb430_p_nthreads, 168 -.equiv nb430_count, 176 -.equiv nb430_mtx, 184 -.equiv nb430_outeriter, 192 -.equiv nb430_inneriter, 200 -.equiv nb430_work, 208 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse2 use -.equiv nb430_ix, 0 -.equiv nb430_iy, 16 -.equiv nb430_iz, 32 -.equiv nb430_iq, 48 -.equiv nb430_dx, 64 -.equiv nb430_dy, 80 -.equiv nb430_dz, 96 -.equiv nb430_eps, 112 -.equiv nb430_gbtsc, 128 -.equiv nb430_tsc, 144 -.equiv nb430_qq, 160 -.equiv nb430_c6, 176 -.equiv nb430_c12, 192 -.equiv nb430_epsgb, 208 -.equiv nb430_vctot, 224 -.equiv nb430_Vvdwtot, 240 -.equiv nb430_fix, 256 -.equiv nb430_fiy, 272 -.equiv nb430_fiz, 288 -.equiv nb430_half, 304 -.equiv nb430_three, 320 -.equiv nb430_r, 336 -.equiv nb430_isai, 352 -.equiv nb430_isaprod, 368 -.equiv nb430_dvdasum, 384 -.equiv nb430_gbscale, 400 -.equiv nb430_rinv, 416 -.equiv nb430_nri, 432 -.equiv nb430_iinr, 440 -.equiv nb430_jindex, 448 -.equiv nb430_jjnr, 456 -.equiv nb430_shift, 464 -.equiv nb430_shiftvec, 472 -.equiv nb430_facel, 480 -.equiv nb430_innerjjnr, 488 -.equiv nb430_ii, 496 -.equiv nb430_is3, 500 -.equiv nb430_ii3, 504 -.equiv nb430_ntia, 508 -.equiv nb430_innerk, 512 -.equiv nb430_n, 516 -.equiv nb430_nn1, 520 -.equiv nb430_ntype, 524 -.equiv nb430_nouter, 528 -.equiv nb430_ninner, 532 - - push rbp - mov rbp, rsp - push rbx - - - emms - - push r12 - push r13 - push r14 - push r15 - - sub rsp, 536 ;# local variable stack space (n*16+8) - - ;# zero 32-bit iteration counters - mov eax, 0 - mov [rsp + nb430_nouter], eax - mov [rsp + nb430_ninner], eax - - mov edi, [rdi] - mov [rsp + nb430_nri], edi - mov [rsp + nb430_iinr], rsi - mov [rsp + nb430_jindex], rdx - mov [rsp + nb430_jjnr], rcx - mov [rsp + nb430_shift], r8 - mov [rsp + nb430_shiftvec], r9 - mov rdi, [rbp + nb430_p_ntype] - mov edi, [rdi] - mov [rsp + nb430_ntype], edi - mov rsi, [rbp + nb430_p_facel] - movsd xmm0, [rsi] - movsd [rsp + nb430_facel], xmm0 - - mov rax, [rbp + nb430_p_tabscale] - movsd xmm3, [rax] - shufpd xmm3, xmm3, 0 - movapd [rsp + nb430_tsc], xmm3 - - mov rbx, [rbp + nb430_p_gbtabscale] - movsd xmm4, [rbx] - shufpd xmm4, xmm4, 0 - movapd [rsp + nb430_gbtsc], xmm4 - - ;# create constant floating-point factors on stack - mov eax, 0x00000000 ;# lower half of double half IEEE (hex) - mov ebx, 0x3fe00000 - mov [rsp + nb430_half], eax - mov [rsp + nb430_half+4], ebx - movsd xmm1, [rsp + nb430_half] - shufpd xmm1, xmm1, 0 ;# splat to all elements - movapd xmm3, xmm1 - addpd xmm3, xmm3 ;# one - movapd xmm2, xmm3 - addpd xmm2, xmm2 ;# two - addpd xmm3, xmm2 ;# three - movapd [rsp + nb430_half], xmm1 - movapd [rsp + nb430_three], xmm3 - -.nb430_threadloop: - mov rsi, [rbp + nb430_count] ;# pointer to sync counter - mov eax, [rsi] -.nb430_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb430_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [rsp + nb430_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [rsp + nb430_n], eax - mov [rsp + nb430_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb430_outerstart - jmp .nb430_end - -.nb430_outerstart: - ;# ebx contains number of outer iterations - add ebx, [rsp + nb430_nouter] - mov [rsp + nb430_nouter], ebx - -.nb430_outer: - mov rax, [rsp + nb430_shift] ;# rax = pointer into shift[] - mov ebx, [rax+rsi*4] ;# rbx=shift[n] - - lea rbx, [rbx + rbx*2] ;# rbx=3*is - mov [rsp + nb430_is3],ebx ;# store is3 - - mov rax, [rsp + nb430_shiftvec] ;# rax = base of shiftvec[] - - movsd xmm0, [rax + rbx*8] - movsd xmm1, [rax + rbx*8 + 8] - movsd xmm2, [rax + rbx*8 + 16] - - mov rcx, [rsp + nb430_iinr] ;# rcx = pointer into iinr[] - mov ebx, [rcx+rsi*4] ;# ebx =ii - mov [rsp + nb430_ii], ebx - - mov rdx, [rbp + nb430_charge] - movsd xmm3, [rdx + rbx*8] - mulsd xmm3, [rsp + nb430_facel] - shufpd xmm3, xmm3, 0 - - mov rdx, [rbp + nb430_invsqrta] ;# load invsqrta[ii] - movsd xmm4, [rdx + rbx*8] - shufpd xmm4, xmm4, 0 - - mov rdx, [rbp + nb430_type] - mov edx, [rdx + rbx*4] - imul edx, [rsp + nb430_ntype] - shl edx, 1 - mov [rsp + nb430_ntia], edx - - lea rbx, [rbx + rbx*2] ;# rbx = 3*ii=ii3 - mov rax, [rbp + nb430_pos] ;# rax = base of pos[] - - addsd xmm0, [rax + rbx*8] - addsd xmm1, [rax + rbx*8 + 8] - addsd xmm2, [rax + rbx*8 + 16] - - movapd [rsp + nb430_iq], xmm3 - movapd [rsp + nb430_isai], xmm4 - - shufpd xmm0, xmm0, 0 - shufpd xmm1, xmm1, 0 - shufpd xmm2, xmm2, 0 - - movapd [rsp + nb430_ix], xmm0 - movapd [rsp + nb430_iy], xmm1 - movapd [rsp + nb430_iz], xmm2 - - mov [rsp + nb430_ii3], ebx - - ;# clear vctot and i forces - xorpd xmm4, xmm4 - movapd [rsp + nb430_vctot], xmm4 - movapd [rsp + nb430_Vvdwtot], xmm4 - movapd [rsp + nb430_dvdasum], xmm4 - movapd [rsp + nb430_fix], xmm4 - movapd [rsp + nb430_fiy], xmm4 - movapd [rsp + nb430_fiz], xmm4 - - mov rax, [rsp + nb430_jindex] - mov ecx, [rax + rsi*4] ;# jindex[n] - mov edx, [rax + rsi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov rsi, [rbp + nb430_pos] - mov rdi, [rbp + nb430_faction] - mov rax, [rsp + nb430_jjnr] - shl ecx, 2 - add rax, rcx - mov [rsp + nb430_innerjjnr], rax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 2 - add ecx, [rsp + nb430_ninner] - mov [rsp + nb430_ninner], ecx - add edx, 0 - mov [rsp + nb430_innerk], edx ;# number of innerloop atoms - jge .nb430_unroll_loop - jmp .nb430_checksingle -.nb430_unroll_loop: - ;# twice unrolled innerloop here - mov rdx, [rsp + nb430_innerjjnr] ;# pointer to jjnr[k] - mov eax, [rdx] - mov ebx, [rdx + 4] - add qword ptr [rsp + nb430_innerjjnr], 8 ;# advance pointer (unrolled 2) - - - mov rsi, [rbp + nb430_pos] ;# base of pos[] - - lea r10, [rax + rax*2] ;# j3 - lea r11, [rbx + rbx*2] - - ;# move two coordinates to xmm4-xmm6 - movlpd xmm4, [rsi + r10*8] - movlpd xmm5, [rsi + r10*8 + 8] - movlpd xmm6, [rsi + r10*8 + 16] - movhpd xmm4, [rsi + r11*8] - movhpd xmm5, [rsi + r11*8 + 8] - movhpd xmm6, [rsi + r11*8 + 16] - - ;# calc dr - subpd xmm4, [rsp + nb430_ix] - subpd xmm5, [rsp + nb430_iy] - subpd xmm6, [rsp + nb430_iz] - - ;# store dr - movapd [rsp + nb430_dx], xmm4 - movapd [rsp + nb430_dy], xmm5 - movapd [rsp + nb430_dz], xmm6 - - ;# square it - mulpd xmm4,xmm4 - mulpd xmm5,xmm5 - mulpd xmm6,xmm6 - addpd xmm4, xmm5 - addpd xmm4, xmm6 - ;# rsq in xmm4 - - ;# load isaj - mov rsi, [rbp + nb430_invsqrta] - movlpd xmm3, [rsi + rax*8] - movhpd xmm3, [rsi + rbx*8] - mulpd xmm3, [rsp + nb430_isai] - movapd [rsp + nb430_isaprod], xmm3 - movapd xmm6, xmm3 - mulpd xmm3, [rsp + nb430_gbtsc] - movapd [rsp + nb430_gbscale], xmm3 - - ;#invsqrt - cvtpd2ps xmm5, xmm4 - rsqrtps xmm5, xmm5 - cvtps2pd xmm2, xmm5 ;# lu in low xmm2 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulpd xmm2, xmm2 ;# lu*lu - movapd xmm1, [rsp + nb430_three] - mulpd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb430_half] - subpd xmm1, xmm2 ;# 30-rsq*lu*lu - mulpd xmm1, xmm5 - mulpd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - mulpd xmm6, [rsp + nb430_iq] - mov rsi, [rbp + nb430_charge] ;# base of charge[] - movlpd xmm3, [rsi + rax*8] - movhpd xmm3, [rsi + rbx*8] - mulpd xmm3, xmm6 - movapd [rsp + nb430_qq], xmm3 - - movapd xmm5, xmm1 ;# copy of lu - mulpd xmm1, xmm1 ;# lu*lu - movapd xmm2, [rsp + nb430_three] - mulpd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb430_half] - subpd xmm2, xmm1 ;# 30-rsq*lu*lu - mulpd xmm2, xmm5 - mulpd xmm0, xmm2 ;# xmm0=iter2 of rinv - mulpd xmm4, xmm0 ;# xmm4=r - movapd [rsp + nb430_r], xmm4 - movapd [rsp + nb430_rinv], xmm0 - - mov rsi, [rbp + nb430_type] - mov r8d, [rsi + rax*4] - mov r9d, [rsi + rbx*4] - shl r8d, 1 - shl r9d, 1 - mov edi, [rsp + nb430_ntia] - add r8d, edi - add r9d, edi - - movapd xmm8, xmm4 ;# r - mulpd xmm4, [rsp + nb430_gbscale] - mulpd xmm8, [rsp + nb430_tsc] - - ;# truncate and convert to integers - cvttpd2pi mm0, xmm4 ;# gb - cvttpd2pi mm1, xmm8 ;# lj - - ;# convert back to float - cvtpi2pd xmm6, mm0 ;# gb - cvtpi2pd xmm10, mm1 ;# lj - - ;# multiply by 4 and 8, respectively - pslld mm0, 2 ;# gb - pslld mm1, 3 ;# lj - - ;# move to integer registers - movd r12d, mm0 ;# gb - movd r14d, mm1 ;# lj - psrlq mm0, 32 - psrlq mm1, 32 - movd r13d, mm0 ;# gb - movd r15d, mm1 ;# lj - ;# GB indices: r10-11 LJ indices: r12-r13 - - ;# calculate eps - subpd xmm4, xmm6 ;# gb - subpd xmm8, xmm10 ;# lj - movapd [rsp + nb430_epsgb], xmm4 ;# gb eps - movapd [rsp + nb430_eps], xmm8 ;# lj eps - - mov rsi, [rbp + nb430_GBtab] - mov rdi, [rbp + nb430_VFtab] - - ;# load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11 - movapd xmm0, [rsi + r12*8] ;# Y1c F1c - movapd xmm12, [rsi + r13*8] ;# Y2c F2c - movapd xmm4, [rdi + r14*8] ;# Y1d F1d - movapd xmm13, [rdi + r15*8] ;# Y2d F2d - movapd xmm8, [rdi + r14*8 + 32] ;# Y1r F1r - movapd xmm14, [rdi + r15*8 + 32] ;# Y2r F2r - movapd xmm1, xmm0 - movapd xmm5, xmm4 - movapd xmm9, xmm8 - unpcklpd xmm0, xmm12 ;# Y1c Y2c - unpckhpd xmm1, xmm12 ;# F1c F2c - unpcklpd xmm4, xmm13 ;# Y1d Y2d - unpckhpd xmm5, xmm13 ;# F1d F2d - unpcklpd xmm8, xmm14 ;# Y1r Y2r - unpckhpd xmm9, xmm14 ;# F1r F2r - - movapd xmm2, [rsi + r12*8 + 16] ;# G1c H1c - movapd xmm12, [rsi + r13*8 + 16] ;# G2c H2c - movapd xmm6, [rdi + r14*8 + 16] ;# G1d H1d - movapd xmm13, [rdi + r15*8 + 16] ;# G2d H2d - movapd xmm10, [rdi + r14*8 + 48] ;# G1r H1r - movapd xmm14, [rdi + r15*8 + 48] ;# G2r H2r - movapd xmm3, xmm2 - movapd xmm7, xmm6 - movapd xmm11, xmm10 - unpcklpd xmm2, xmm12 ;# G1c G2c - unpckhpd xmm3, xmm12 ;# H1c H2c - unpcklpd xmm6, xmm13 ;# G1d G2d - unpckhpd xmm7, xmm13 ;# H1d H2d - unpcklpd xmm10, xmm14 ;# G1r G2r - unpckhpd xmm11, xmm14 ;# H1r H2r - ;# table data ready. Coul GB in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11 - mov rdi, [rbp + nb430_vdwparam] - - movapd xmm12, [rsp + nb430_epsgb] - movapd xmm13, [rsp + nb430_eps] - - mulpd xmm3, xmm12 ;# Heps - mulpd xmm7, xmm13 - mulpd xmm11, xmm13 - mulpd xmm2, xmm12 ;# Geps - mulpd xmm6, xmm13 - mulpd xmm10, xmm13 - mulpd xmm3, xmm12 ;# Heps2 - mulpd xmm7, xmm13 - mulpd xmm11, xmm13 - - movlpd xmm14, [rdi + r8*8] - movlpd xmm15, [rdi + r8*8 + 8] - - addpd xmm1, xmm2 ;# F+Geps - addpd xmm5, xmm6 - addpd xmm9, xmm10 - addpd xmm1, xmm3 ;# F+Geps+Heps2 = Fp - addpd xmm5, xmm7 - addpd xmm9, xmm11 - addpd xmm3, xmm3 ;# 2*Heps2 - addpd xmm7, xmm7 - addpd xmm11, xmm11 - movhpd xmm14, [rdi + r9*8] - movhpd xmm15, [rdi + r9*8 + 8] - - addpd xmm3, xmm2 ;# 2*Heps2+Geps - addpd xmm7, xmm6 - addpd xmm11, xmm10 - addpd xmm3, xmm1 ;# FF = Fp + 2*Heps2 + Geps - addpd xmm7, xmm5 - addpd xmm11, xmm9 - mulpd xmm1, xmm12 ;# eps*Fp - mulpd xmm5, xmm13 - mulpd xmm9, xmm13 - addpd xmm1, xmm0 ;# VV - addpd xmm5, xmm4 - addpd xmm9, xmm8 - mulpd xmm1, [rsp + nb430_qq] ;# VV*qq = vcoul - mulpd xmm5, xmm14 ;# vnb6 - mulpd xmm9, xmm15 ;# vnb12 - mulpd xmm3, [rsp + nb430_qq] ;# FF*qq = fij - mulpd xmm7, xmm14 ;# fijD - mulpd xmm11, xmm15 ;#fijR - - addpd xmm11, xmm7 ;# fijD+fijR - mulpd xmm11, [rsp + nb430_tsc] ;# (fijD+fijR)*tabscale - - ;# accumulate Vvdwtot - addpd xmm5, [rsp + nb430_Vvdwtot] - addpd xmm5, xmm9 - movapd [rsp + nb430_Vvdwtot], xmm5 - - mov rsi, [rbp + nb430_dvda] - - ;# Calculate dVda - mulpd xmm3, [rsp + nb430_gbscale] ;# fijC=qq*FF*gbscale - movapd xmm6, xmm3 - mulpd xmm6, [rsp + nb430_r] - addpd xmm6, xmm1 ;# vcoul+fijC*r - - addpd xmm3, xmm11 ;# fijC+fijD+fijR - - ;# increment vctot - addpd xmm1, [rsp + nb430_vctot] - movapd [rsp + nb430_vctot], xmm1 - - ;# xmm6=(vcoul+fijC*r) - xorpd xmm7, xmm7 - subpd xmm7, xmm6 - movapd xmm6, xmm7 - - ;# the fj's - start by combiningg forces from memory - mov rdi, [rbp + nb430_faction] - movlpd xmm0, [rdi + r10*8] - movlpd xmm1, [rdi + r10*8 + 8] - movlpd xmm2, [rdi + r10*8 + 16] - movhpd xmm0, [rdi + r11*8] - movhpd xmm1, [rdi + r11*8 + 8] - movhpd xmm2, [rdi + r11*8 + 16] - - ;# update dvdasum - addpd xmm7, [rsp + nb430_dvdasum] - movapd [rsp + nb430_dvdasum], xmm7 - - ;# update j atoms dvdaj - movhlps xmm7, xmm6 - addsd xmm6, [rsi + rax*8] - addsd xmm7, [rsi + rbx*8] - movsd [rsi + rax*8], xmm6 - movsd [rsi + rbx*8], xmm7 - - xorpd xmm4, xmm4 - mulpd xmm3, [rsp + nb430_rinv] - subpd xmm4, xmm3 - - movapd xmm9, xmm4 - movapd xmm10, xmm4 - movapd xmm11, xmm4 - - mulpd xmm9, [rsp + nb430_dx] - mulpd xmm10, [rsp + nb430_dy] - mulpd xmm11, [rsp + nb430_dz] - - addpd xmm0, xmm9 - addpd xmm1, xmm10 - addpd xmm2, xmm11 - - ;# accumulate i forces - addpd xmm9, [rsp + nb430_fix] - addpd xmm10, [rsp + nb430_fiy] - addpd xmm11, [rsp + nb430_fiz] - - movlpd [rdi + r10*8], xmm0 - movlpd [rdi + r10*8 + 8], xmm1 - movlpd [rdi + r10*8 + 16], xmm2 - - movapd [rsp + nb430_fix], xmm9 - movapd [rsp + nb430_fiy], xmm10 - movapd [rsp + nb430_fiz], xmm11 - - movhpd [rdi + r11*8], xmm0 - movhpd [rdi + r11*8 + 8], xmm1 - movhpd [rdi + r11*8 + 16], xmm2 - - ;# should we do one more iteration? - sub dword ptr [rsp + nb430_innerk], 2 - jl .nb430_checksingle - jmp .nb430_unroll_loop -.nb430_checksingle: - mov edx, [rsp + nb430_innerk] - and edx, 1 - jnz .nb430_dosingle - jmp .nb430_updateouterdata -.nb430_dosingle: - mov rsi, [rbp + nb430_charge] - mov rdx, [rbp + nb430_invsqrta] - mov rdi, [rbp + nb430_pos] - mov rcx, [rsp + nb430_innerjjnr] - mov eax, [rcx] - - ;# load isaj - mov rsi, [rbp + nb430_invsqrta] - movsd xmm2, [rsi + rax*8] - mulsd xmm2, [rsp + nb430_isai] - movapd [rsp + nb430_isaprod], xmm2 - movapd xmm1, xmm2 - mulsd xmm1, [rsp + nb430_gbtsc] - movapd [rsp + nb430_gbscale], xmm1 - - mulsd xmm2, [rsp + nb430_iq] - mov rsi, [rbp + nb430_charge] ;# base of charge[] - movsd xmm3, [rsi + rax*8] - mulsd xmm3, xmm2 - movapd [rsp + nb430_qq], xmm3 - - mov rsi, [rbp + nb430_type] - mov r8d, [rsi + rax*4] - mov rsi, [rbp + nb430_vdwparam] - shl r8d, 1 - mov edi, [rsp + nb430_ntia] - add r8d, edi - - movsd xmm4, [rsi + r8*8] - movsd xmm6, [rsi + r8*8 + 8] - movapd [rsp + nb430_c6], xmm4 - movapd [rsp + nb430_c12], xmm6 - - mov rsi, [rbp + nb430_pos] ;# base of pos[] - - lea r10, [rax + rax*2] ;# j3 - - ;# move coordinate to xmm4-xmm6 - movsd xmm4, [rsi + r10*8] - movsd xmm5, [rsi + r10*8 + 8] - movsd xmm6, [rsi + r10*8 + 16] - - mov rdi, [rbp + nb430_faction] - - ;# calc dr - subsd xmm4, [rsp + nb430_ix] - subsd xmm5, [rsp + nb430_iy] - subsd xmm6, [rsp + nb430_iz] - - ;# store dr - movapd [rsp + nb430_dx], xmm4 - movapd [rsp + nb430_dy], xmm5 - movapd [rsp + nb430_dz], xmm6 - - ;# square it - mulsd xmm4,xmm4 - mulsd xmm5,xmm5 - mulsd xmm6,xmm6 - addsd xmm4, xmm5 - addsd xmm4, xmm6 - ;# rsq in xmm4 - - cvtsd2ss xmm5, xmm4 - rsqrtss xmm5, xmm5 - cvtss2sd xmm2, xmm5 ;# lu in low xmm2 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulsd xmm2, xmm2 ;# lu*lu - movapd xmm1, [rsp + nb430_three] - mulsd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb430_half] - subsd xmm1, xmm2 ;# 30-rsq*lu*lu - mulsd xmm1, xmm5 - mulsd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - movapd xmm5, xmm1 ;# copy of lu - mulsd xmm1, xmm1 ;# lu*lu - movapd xmm2, [rsp + nb430_three] - mulsd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb430_half] - subsd xmm2, xmm1 ;# 30-rsq*lu*lu - mulsd xmm2, xmm5 - mulsd xmm0, xmm2 ;# xmm0=iter2 of rinv - mulsd xmm4, xmm0 ;# xmm4=r - movapd [rsp + nb430_r], xmm4 - movapd [rsp + nb430_rinv], xmm0 - - movapd xmm8, xmm4 ;# r - mulsd xmm4, [rsp + nb430_gbscale] - mulsd xmm8, [rsp + nb430_tsc] - - ;# truncate and convert to integers - cvttsd2si r12d, xmm4 ;# gb - cvttsd2si r14d, xmm8 ;# lj - - ;# convert back to float - cvtsi2sd xmm6, r12d ;# gb - cvtsi2sd xmm10, r14d ;# lj - - ;# multiply by 4 and 8, respectively - shl r12d, 2 ;# gb - shl r14d, 3 ;# lj - - ;# GB indices: r10 LJ indices: r12 - - ;# calculate eps - subsd xmm4, xmm6 ;# gb - subsd xmm8, xmm10 ;# lj - movapd [rsp + nb430_epsgb], xmm4 ;# gb eps - movapd [rsp + nb430_eps], xmm8 ;# lj eps - - mov rsi, [rbp + nb430_GBtab] - mov rdi, [rbp + nb430_VFtab] - - ;# load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11 - movapd xmm0, [rsi + r12*8] ;# Y1c F1c - movapd xmm4, [rdi + r14*8] ;# Y1d F1d - movapd xmm8, [rdi + r14*8 + 32] ;# Y1r F1r - movhlps xmm1, xmm0 - movhlps xmm5, xmm4 - movhlps xmm9, xmm8 - - movapd xmm2, [rsi + r12*8 + 16] ;# G1c H1c - movapd xmm6, [rdi + r14*8 + 16] ;# G1d H1d - movapd xmm10, [rdi + r14*8 + 48] ;# G1r H1r - movhlps xmm3, xmm2 - movhlps xmm7, xmm6 - movhlps xmm11, xmm10 - ;# table data ready. Coul GB in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11 - - movapd xmm12, [rsp + nb430_epsgb] - movapd xmm13, [rsp + nb430_eps] - - mulsd xmm3, xmm12 ;# Heps - mulsd xmm7, xmm13 - mulsd xmm11, xmm13 - mulsd xmm2, xmm12 ;# Geps - mulsd xmm6, xmm13 - mulsd xmm10, xmm13 - mulsd xmm3, xmm12 ;# Heps2 - mulsd xmm7, xmm13 - mulsd xmm11, xmm13 - - addsd xmm1, xmm2 ;# F+Geps - addsd xmm5, xmm6 - addsd xmm9, xmm10 - addsd xmm1, xmm3 ;# F+Geps+Heps2 = Fp - addsd xmm5, xmm7 - addsd xmm9, xmm11 - addsd xmm3, xmm3 ;# 2*Heps2 - addsd xmm7, xmm7 - addsd xmm11, xmm11 - addsd xmm3, xmm2 ;# 2*Heps2+Geps - addsd xmm7, xmm6 - addsd xmm11, xmm10 - addsd xmm3, xmm1 ;# FF = Fp + 2*Heps2 + Geps - addsd xmm7, xmm5 - addsd xmm11, xmm9 - mulsd xmm1, xmm12 ;# eps*Fp - mulsd xmm5, xmm13 - mulsd xmm9, xmm13 - addsd xmm1, xmm0 ;# VV - addsd xmm5, xmm4 - addsd xmm9, xmm8 - mulsd xmm1, [rsp + nb430_qq] ;# VV*qq = vcoul - mulsd xmm5, [rsp + nb430_c6] ;# vnb6 - mulsd xmm9, [rsp + nb430_c12] ;# vnb12 - mulsd xmm3, [rsp + nb430_qq] ;# FF*qq = fij - mulsd xmm7, [rsp + nb430_c6] ;# fijD - mulsd xmm11, [rsp + nb430_c12] ;#fijR - - addsd xmm11, xmm7 ;# fijD+fijR - mulsd xmm11, [rsp + nb430_tsc] ;# (fijD+fijR)*tabscale - - ;# accumulate Vvdwtot - addsd xmm5, [rsp + nb430_Vvdwtot] - addsd xmm5, xmm9 - movsd [rsp + nb430_Vvdwtot], xmm5 - - mov rsi, [rbp + nb430_dvda] - - ;# Calculate dVda - mulsd xmm3, [rsp + nb430_gbscale] ;# fijC=qq*FF*gbscale - movapd xmm6, xmm3 - mulsd xmm6, [rsp + nb430_r] - addsd xmm6, xmm1 ;# vcoul+fijC*r - - addsd xmm3, xmm11 ;# fijC+fijD+fijR - - ;# increment vctot - addsd xmm1, [rsp + nb430_vctot] - movsd [rsp + nb430_vctot], xmm1 - - ;# xmm6=(vcoul+fijC*r) - xorpd xmm7, xmm7 - subsd xmm7, xmm6 - movapd xmm6, xmm7 - - ;# update dvdasum - addsd xmm7, [rsp + nb430_dvdasum] - movsd [rsp + nb430_dvdasum], xmm7 - - ;# update j atoms dvdaj - addsd xmm6, [rsi + rax*8] - movsd [rsi + rax*8], xmm6 - - xorpd xmm4, xmm4 - mulsd xmm3, [rsp + nb430_rinv] - subsd xmm4, xmm3 - - movapd xmm9, xmm4 - movapd xmm10, xmm4 - movapd xmm11, xmm4 - - mulsd xmm9, [rsp + nb430_dx] - mulsd xmm10, [rsp + nb430_dy] - mulsd xmm11, [rsp + nb430_dz] - - movapd xmm3, xmm9 - movapd xmm4, xmm10 - movapd xmm5, xmm11 - - ;# accumulate i forces - addsd xmm9, [rsp + nb430_fix] - addsd xmm10, [rsp + nb430_fiy] - addsd xmm11, [rsp + nb430_fiz] - movsd [rsp + nb430_fix], xmm9 - movsd [rsp + nb430_fiy], xmm10 - movsd [rsp + nb430_fiz], xmm11 - - mov rdi, [rbp + nb430_faction] - ;# the fj's - start by accumulating forces from memory - addsd xmm3, [rdi + r10*8] - addsd xmm4, [rdi + r10*8 + 8] - addsd xmm5, [rdi + r10*8 + 16] - movsd [rdi + r10*8], xmm3 - movsd [rdi + r10*8 + 8], xmm4 - movsd [rdi + r10*8 + 16], xmm5 - -.nb430_updateouterdata: - mov ecx, [rsp + nb430_ii3] - mov rdi, [rbp + nb430_faction] - mov rsi, [rbp + nb430_fshift] - mov edx, [rsp + nb430_is3] - - ;# accumulate i forces in xmm0, xmm1, xmm2 - movapd xmm0, [rsp + nb430_fix] - movapd xmm1, [rsp + nb430_fiy] - movapd xmm2, [rsp + nb430_fiz] - - movhlps xmm3, xmm0 - movhlps xmm4, xmm1 - movhlps xmm5, xmm2 - addsd xmm0, xmm3 - addsd xmm1, xmm4 - addsd xmm2, xmm5 ;# sum is in low xmm0-xmm2 - - ;# increment i force - movsd xmm3, [rdi + rcx*8] - movsd xmm4, [rdi + rcx*8 + 8] - movsd xmm5, [rdi + rcx*8 + 16] - subsd xmm3, xmm0 - subsd xmm4, xmm1 - subsd xmm5, xmm2 - movsd [rdi + rcx*8], xmm3 - movsd [rdi + rcx*8 + 8], xmm4 - movsd [rdi + rcx*8 + 16], xmm5 - - ;# increment fshift force - movsd xmm3, [rsi + rdx*8] - movsd xmm4, [rsi + rdx*8 + 8] - movsd xmm5, [rsi + rdx*8 + 16] - subsd xmm3, xmm0 - subsd xmm4, xmm1 - subsd xmm5, xmm2 - movsd [rsi + rdx*8], xmm3 - movsd [rsi + rdx*8 + 8], xmm4 - movsd [rsi + rdx*8 + 16], xmm5 - - ;# get n from stack - mov esi, [rsp + nb430_n] - ;# get group index for i particle - mov rdx, [rbp + nb430_gid] ;# base of gid[] - mov edx, [rdx + rsi*4] ;# ggid=gid[n] - - ;# accumulate total potential energy and update it - movapd xmm7, [rsp + nb430_vctot] - ;# accumulate - movhlps xmm6, xmm7 - addsd xmm7, xmm6 ;# low xmm7 has the sum now - - ;# add earlier value from mem - mov rax, [rbp + nb430_Vc] - addsd xmm7, [rax + rdx*8] - ;# move back to mem - movsd [rax + rdx*8], xmm7 - - ;# accumulate total lj energy and update it - movapd xmm7, [rsp + nb430_Vvdwtot] - ;# accumulate - movhlps xmm6, xmm7 - addsd xmm7, xmm6 ;# low xmm7 has the sum now - - ;# add earlier value from mem - mov rax, [rbp + nb430_Vvdw] - addsd xmm7, [rax + rdx*8] - ;# move back to mem - movsd [rax + rdx*8], xmm7 - - ;# accumulate dVda and update it - movapd xmm7, [rsp + nb430_dvdasum] - ;# accumulate - movhlps xmm6, xmm7 - addsd xmm7, xmm6 ;# low xmm7 has the sum now - - mov edx, [rsp + nb430_ii] - mov rax, [rbp + nb430_dvda] - addsd xmm7, [rax + rdx*8] - movsd [rax + rdx*8], xmm7 - - ;# finish if last - mov ecx, [rsp + nb430_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb430_outerend - - ;# not last, iterate outer loop once more! - mov [rsp + nb430_n], esi - jmp .nb430_outer -.nb430_outerend: - ;# check if more outer neighborlists remain - mov ecx, [rsp + nb430_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb430_end - ;# non-zero, do one more workunit - jmp .nb430_threadloop -.nb430_end: - mov eax, [rsp + nb430_nouter] - mov ebx, [rsp + nb430_ninner] - mov rcx, [rbp + nb430_outeriter] - mov rdx, [rbp + nb430_inneriter] - mov [rcx], eax - mov [rdx], ebx - - add rsp, 536 - emms - - - pop r15 - pop r14 - pop r13 - pop r12 - - pop rbx - pop rbp - ret - - - - - - -.globl nb_kernel430nf_x86_64_sse2 -.globl _nb_kernel430nf_x86_64_sse2 -nb_kernel430nf_x86_64_sse2: -_nb_kernel430nf_x86_64_sse2: -;# Room for return address and rbp (16 bytes) -.equiv nb430nf_fshift, 16 -.equiv nb430nf_gid, 24 -.equiv nb430nf_pos, 32 -.equiv nb430nf_faction, 40 -.equiv nb430nf_charge, 48 -.equiv nb430nf_p_facel, 56 -.equiv nb430nf_argkrf, 64 -.equiv nb430nf_argcrf, 72 -.equiv nb430nf_Vc, 80 -.equiv nb430nf_type, 88 -.equiv nb430nf_p_ntype, 96 -.equiv nb430nf_vdwparam, 104 -.equiv nb430nf_Vvdw, 112 -.equiv nb430nf_p_tabscale, 120 -.equiv nb430nf_VFtab, 128 -.equiv nb430nf_invsqrta, 136 -.equiv nb430nf_dvda, 144 -.equiv nb430nf_p_gbtabscale, 152 -.equiv nb430nf_GBtab, 160 -.equiv nb430nf_p_nthreads, 168 -.equiv nb430nf_count, 176 -.equiv nb430nf_mtx, 184 -.equiv nb430nf_outeriter, 192 -.equiv nb430nf_inneriter, 200 -.equiv nb430nf_work, 208 - ;# stack offsets for local variables - ;# bottom of stack is cache-aligned for sse2 use -.equiv nb430nf_ix, 0 -.equiv nb430nf_iy, 16 -.equiv nb430nf_iz, 32 -.equiv nb430nf_iq, 48 -.equiv nb430nf_gbtsc, 64 -.equiv nb430nf_tsc, 80 -.equiv nb430nf_qq, 96 -.equiv nb430nf_c6, 112 -.equiv nb430nf_c12, 128 -.equiv nb430nf_vctot, 144 -.equiv nb430nf_Vvdwtot, 160 -.equiv nb430nf_half, 176 -.equiv nb430nf_three, 192 -.equiv nb430nf_r, 208 -.equiv nb430nf_isai, 224 -.equiv nb430nf_isaprod, 240 -.equiv nb430nf_gbscale, 256 -.equiv nb430nf_nri, 272 -.equiv nb430nf_iinr, 280 -.equiv nb430nf_jindex, 288 -.equiv nb430nf_jjnr, 296 -.equiv nb430nf_shift, 304 -.equiv nb430nf_shiftvec, 312 -.equiv nb430nf_facel, 320 -.equiv nb430nf_innerjjnr, 328 -.equiv nb430nf_is3, 336 -.equiv nb430nf_ii3, 340 -.equiv nb430nf_ntia, 344 -.equiv nb430nf_innerk, 348 -.equiv nb430nf_n, 352 -.equiv nb430nf_nn1, 356 -.equiv nb430nf_ntype, 360 -.equiv nb430nf_nouter, 364 -.equiv nb430nf_ninner, 368 - push rbp - mov rbp, rsp - push rbx - - emms - - push r12 - push r13 - push r14 - push r15 - - sub rsp, 392 ;# local variable stack space (n*16+8) - - ;# zero 32-bit iteration counters - mov eax, 0 - mov [rsp + nb430nf_nouter], eax - mov [rsp + nb430nf_ninner], eax - - mov edi, [rdi] - mov [rsp + nb430nf_nri], edi - mov [rsp + nb430nf_iinr], rsi - mov [rsp + nb430nf_jindex], rdx - mov [rsp + nb430nf_jjnr], rcx - mov [rsp + nb430nf_shift], r8 - mov [rsp + nb430nf_shiftvec], r9 - mov rdi, [rbp + nb430nf_p_ntype] - mov edi, [rdi] - mov [rsp + nb430nf_ntype], edi - mov rsi, [rbp + nb430nf_p_facel] - movsd xmm0, [rsi] - movsd [rsp + nb430nf_facel], xmm0 - - mov rax, [rbp + nb430nf_p_tabscale] - movsd xmm3, [rax] - shufpd xmm3, xmm3, 0 - movapd [rsp + nb430nf_tsc], xmm3 - - mov rbx, [rbp + nb430nf_p_gbtabscale] - movsd xmm4, [rbx] - shufpd xmm4, xmm4, 0 - movapd [rsp + nb430nf_gbtsc], xmm4 - - ;# create constant floating-point factors on stack - mov eax, 0x00000000 ;# lower half of double half IEEE (hex) - mov ebx, 0x3fe00000 - mov [rsp + nb430nf_half], eax - mov [rsp + nb430nf_half+4], ebx - movsd xmm1, [rsp + nb430nf_half] - shufpd xmm1, xmm1, 0 ;# splat to all elements - movapd xmm3, xmm1 - addpd xmm3, xmm3 ;# one - movapd xmm2, xmm3 - addpd xmm2, xmm2 ;# two - addpd xmm3, xmm2 ;# three - movapd [rsp + nb430nf_half], xmm1 - movapd [rsp + nb430nf_three], xmm3 - -.nb430nf_threadloop: - mov rsi, [rbp + nb430nf_count] ;# pointer to sync counter - mov eax, [rsi] -.nb430nf_spinlock: - mov ebx, eax ;# ebx=*count=nn0 - add ebx, 1 ;# ebx=nn1=nn0+10 - lock - cmpxchg [esi], ebx ;# write nn1 to *counter, - ;# if it hasnt changed. - ;# or reread *counter to eax. - pause ;# -> better p4 performance - jnz .nb430nf_spinlock - - ;# if(nn1>nri) nn1=nri - mov ecx, [rsp + nb430nf_nri] - mov edx, ecx - sub ecx, ebx - cmovle ebx, edx ;# if(nn1>nri) nn1=nri - ;# Cleared the spinlock if we got here. - ;# eax contains nn0, ebx contains nn1. - mov [rsp + nb430nf_n], eax - mov [rsp + nb430nf_nn1], ebx - sub ebx, eax ;# calc number of outer lists - mov esi, eax ;# copy n to esi - jg .nb430nf_outerstart - jmp .nb430nf_end - -.nb430nf_outerstart: - ;# ebx contains number of outer iterations - add ebx, [rsp + nb430nf_nouter] - mov [rsp + nb430nf_nouter], ebx - -.nb430nf_outer: - mov rax, [rsp + nb430nf_shift] ;# rax = pointer into shift[] - mov ebx, [rax+rsi*4] ;# rbx=shift[n] - - lea rbx, [rbx + rbx*2] ;# rbx=3*is - mov [rsp + nb430nf_is3],ebx ;# store is3 - - mov rax, [rsp + nb430nf_shiftvec] ;# rax = base of shiftvec[] - - movsd xmm0, [rax + rbx*8] - movsd xmm1, [rax + rbx*8 + 8] - movsd xmm2, [rax + rbx*8 + 16] - - mov rcx, [rsp + nb430nf_iinr] ;# rcx = pointer into iinr[] - mov ebx, [rcx+rsi*4] ;# ebx =ii - - mov rdx, [rbp + nb430nf_charge] - movsd xmm3, [rdx + rbx*8] - mulsd xmm3, [rsp + nb430nf_facel] - shufpd xmm3, xmm3, 0 - - mov rdx, [rbp + nb430nf_invsqrta] ;# load invsqrta[ii] - movsd xmm4, [rdx + rbx*8] - shufpd xmm4, xmm4, 0 - - mov rdx, [rbp + nb430nf_type] - mov edx, [rdx + rbx*4] - imul edx, [rsp + nb430nf_ntype] - shl edx, 1 - mov [rsp + nb430nf_ntia], edx - - lea rbx, [rbx + rbx*2] ;# rbx = 3*ii=ii3 - mov rax, [rbp + nb430nf_pos] ;# rax = base of pos[] - - addsd xmm0, [rax + rbx*8] - addsd xmm1, [rax + rbx*8 + 8] - addsd xmm2, [rax + rbx*8 + 16] - - movapd [rsp + nb430nf_iq], xmm3 - movapd [rsp + nb430nf_isai], xmm4 - - shufpd xmm0, xmm0, 0 - shufpd xmm1, xmm1, 0 - shufpd xmm2, xmm2, 0 - - movapd [rsp + nb430nf_ix], xmm0 - movapd [rsp + nb430nf_iy], xmm1 - movapd [rsp + nb430nf_iz], xmm2 - - mov [rsp + nb430nf_ii3], ebx - - ;# clear vctot - xorpd xmm4, xmm4 - movapd [rsp + nb430nf_vctot], xmm4 - movapd [rsp + nb430nf_Vvdwtot], xmm4 - - mov rax, [rsp + nb430nf_jindex] - mov ecx, [rax + rsi*4] ;# jindex[n] - mov edx, [rax + rsi*4 + 4] ;# jindex[n+1] - sub edx, ecx ;# number of innerloop atoms - - mov rsi, [rbp + nb430nf_pos] - mov rdi, [rbp + nb430nf_faction] - mov rax, [rsp + nb430nf_jjnr] - shl ecx, 2 - add rax, rcx - mov [rsp + nb430nf_innerjjnr], rax ;# pointer to jjnr[nj0] - mov ecx, edx - sub edx, 2 - add ecx, [rsp + nb430nf_ninner] - mov [rsp + nb430nf_ninner], ecx - add edx, 0 - mov [rsp + nb430nf_innerk], edx ;# number of innerloop atoms - jge .nb430nf_unroll_loop - jmp .nb430nf_checksingle -.nb430nf_unroll_loop: - ;# twice unrolled innerloop here - mov rdx, [rsp + nb430nf_innerjjnr] ;# pointer to jjnr[k] - mov eax, [rdx] - mov ebx, [rdx + 4] - add qword ptr [rsp + nb430nf_innerjjnr], 8 ;# advance pointer (unrolled 2) - - ;# load isaj - mov rsi, [rbp + nb430nf_invsqrta] - movlpd xmm2, [rsi + rax*8] - movhpd xmm2, [rsi + rbx*8] - mulpd xmm2, [rsp + nb430nf_isai] - movapd [rsp + nb430nf_isaprod], xmm2 - movapd xmm1, xmm2 - mulpd xmm1, [rsp + nb430nf_gbtsc] - movapd [rsp + nb430nf_gbscale], xmm1 - - mov rsi, [rbp + nb430nf_charge] ;# base of charge[] - movlpd xmm3, [rsi + rax*8] - movhpd xmm3, [rsi + rbx*8] - - mulpd xmm2, [rsp + nb430nf_iq] - mulpd xmm3, xmm2 - movapd [rsp + nb430nf_qq], xmm3 - - mov rsi, [rbp + nb430nf_type] - mov ecx, [rsi + rax*4] - mov edx, [rsi + rbx*4] - mov rsi, [rbp + nb430nf_vdwparam] - shl ecx, 1 - shl edx, 1 - mov edi, [rsp + nb430nf_ntia] - add ecx, edi - add edx, edi - - movlpd xmm6, [rsi + rcx*8] ;# c6a - movlpd xmm7, [rsi + rdx*8] ;# c6b - movhpd xmm6, [rsi + rcx*8 + 8] ;# c6a c12a - movhpd xmm7, [rsi + rdx*8 + 8] ;# c6b c12b - - movapd xmm4, xmm6 - unpcklpd xmm4, xmm7 - unpckhpd xmm6, xmm7 - - movapd [rsp + nb430nf_c6], xmm4 - movapd [rsp + nb430nf_c12], xmm6 - - mov rsi, [rbp + nb430nf_pos] ;# base of pos[] - - lea rax, [rax + rax*2] ;# replace jnr with j3 - lea rbx, [rbx + rbx*2] - - ;# move two coordinates to xmm0-xmm2 - movlpd xmm0, [rsi + rax*8] - movlpd xmm1, [rsi + rax*8 + 8] - movlpd xmm2, [rsi + rax*8 + 16] - movhpd xmm0, [rsi + rbx*8] - movhpd xmm1, [rsi + rbx*8 + 8] - movhpd xmm2, [rsi + rbx*8 + 16] - - mov rdi, [rbp + nb430nf_faction] - - ;# move nb430nf_ix-iz to xmm4-xmm6 - movapd xmm4, [rsp + nb430nf_ix] - movapd xmm5, [rsp + nb430nf_iy] - movapd xmm6, [rsp + nb430nf_iz] - - ;# calc dr - subpd xmm4, xmm0 - subpd xmm5, xmm1 - subpd xmm6, xmm2 - - ;# square it - mulpd xmm4,xmm4 - mulpd xmm5,xmm5 - mulpd xmm6,xmm6 - addpd xmm4, xmm5 - addpd xmm4, xmm6 - ;# rsq in xmm4 - - cvtpd2ps xmm5, xmm4 - rsqrtps xmm5, xmm5 - cvtps2pd xmm2, xmm5 ;# lu in low xmm2 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulpd xmm2, xmm2 ;# lu*lu - movapd xmm1, [rsp + nb430nf_three] - mulpd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb430nf_half] - subpd xmm1, xmm2 ;# 30-rsq*lu*lu - mulpd xmm1, xmm5 - mulpd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - movapd xmm5, xmm1 ;# copy of lu - mulpd xmm1, xmm1 ;# lu*lu - movapd xmm2, [rsp + nb430nf_three] - mulpd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb430nf_half] - subpd xmm2, xmm1 ;# 30-rsq*lu*lu - mulpd xmm2, xmm5 - mulpd xmm0, xmm2 ;# xmm0=iter2 of rinv - mulpd xmm4, xmm0 ;# xmm4=r - movapd [rsp + nb430nf_r], xmm4 - mulpd xmm4, [rsp + nb430nf_gbscale] - - cvttpd2pi mm6, xmm4 ;# mm6 = lu idx - cvtpi2pd xmm5, mm6 - subpd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - movapd xmm2, xmm1 - mulpd xmm2, xmm2 ;# xmm2=eps2 - - pslld mm6, 2 ;# idx *= 4 - - mov rsi, [rbp + nb430nf_GBtab] - movd ecx, mm6 - psrlq mm6, 32 - movd edx, mm6 ;# indices in eax/ebx - - ;# Coulomb - movapd xmm4, [rsi + rcx*8] ;# Y1 F1 - movapd xmm3, [rsi + rdx*8] ;# Y2 F2 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 Y2 - unpckhpd xmm5, xmm3 ;# F1 F2 - - movapd xmm6, [rsi + rcx*8 + 16] ;# G1 H1 - movapd xmm3, [rsi + rdx*8 + 16] ;# G2 H2 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 G2 - unpckhpd xmm7, xmm3 ;# H1 H2 - ;# coulomb table ready, in xmm4-xmm7 - mulpd xmm6, xmm1 ;# xmm6=Geps - mulpd xmm7, xmm2 ;# xmm7=Heps2 - addpd xmm5, xmm6 - addpd xmm5, xmm7 ;# xmm5=Fp - movapd xmm3, [rsp + nb430nf_qq] - mulpd xmm5, xmm1 ;# xmm5=eps*Fp - addpd xmm5, xmm4 ;# xmm5=VV - mulpd xmm5, xmm3 ;# vcoul=qq*VV - addpd xmm5, [rsp + nb430nf_vctot] - movapd [rsp + nb430nf_vctot], xmm5 - - movapd xmm4, [rsp + nb430nf_r] - mulpd xmm4, [rsp + nb430nf_tsc] - cvttpd2pi mm6, xmm4 ;# mm6 = lu idx - cvtpi2pd xmm5, mm6 - subpd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - movapd xmm2, xmm1 - mulpd xmm2, xmm2 ;# xmm2=eps2 - - pslld mm6, 3 ;# idx *= 8 - - mov rsi, [rbp + nb430nf_VFtab] - - movd ecx, mm6 - psrlq mm6, 32 - movd edx, mm6 ;# indices in eax/ebx - - ;# Dispersion - movapd xmm4, [rsi + rcx*8] ;# Y1 F1 - movapd xmm3, [rsi + rdx*8] ;# Y2 F2 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 Y2 - unpckhpd xmm5, xmm3 ;# F1 F2 - - movapd xmm6, [rsi + rcx*8 + 16] ;# G1 H1 - movapd xmm3, [rsi + rdx*8 + 16] ;# G2 H2 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 G2 - unpckhpd xmm7, xmm3 ;# H1 H2 - ;# Dispersion table ready, in xmm4-xmm7 - mulpd xmm6, xmm1 ;# xmm6=Geps - mulpd xmm7, xmm2 ;# xmm7=Heps2 - addpd xmm5, xmm6 - addpd xmm5, xmm7 ;# xmm5=Fp - mulpd xmm5, xmm1 ;# xmm5=eps*Fp - addpd xmm5, xmm4 ;# xmm5=VV - - mulpd xmm5, [rsp + nb430nf_c6] ;# Vvdw6 - addpd xmm5, [rsp + nb430nf_Vvdwtot] - movapd [rsp + nb430nf_Vvdwtot], xmm5 - - ;# Repulsion - movapd xmm4, [rsi + rcx*8 + 32] ;# Y1 F1 - movapd xmm3, [rsi + rdx*8 + 32] ;# Y2 F2 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 Y2 - unpckhpd xmm5, xmm3 ;# F1 F2 - - movapd xmm6, [rsi + rcx*8 + 48] ;# G1 H1 - movapd xmm3, [rsi + rdx*8 + 48] ;# G2 H2 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 G2 - unpckhpd xmm7, xmm3 ;# H1 H2 - ;# Dispersion table ready, in xmm4-xmm7 - mulpd xmm6, xmm1 ;# xmm6=Geps - mulpd xmm7, xmm2 ;# xmm7=Heps2 - addpd xmm5, xmm6 - addpd xmm5, xmm7 ;# xmm5=Fp - mulpd xmm5, xmm1 ;# xmm5=eps*Fp - addpd xmm5, xmm4 ;# xmm5=VV - - mulpd xmm5, [rsp + nb430nf_c12] ;# Vvdw12 - addpd xmm5, [rsp + nb430nf_Vvdwtot] - movapd [rsp + nb430nf_Vvdwtot], xmm5 - xorpd xmm4, xmm4 - - ;# should we do one more iteration? - sub dword ptr [rsp + nb430nf_innerk], 2 - jl .nb430nf_checksingle - jmp .nb430nf_unroll_loop -.nb430nf_checksingle: - mov edx, [rsp + nb430nf_innerk] - and edx, 1 - jnz .nb430nf_dosingle - jmp .nb430nf_updateouterdata -.nb430nf_dosingle: - mov rsi, [rbp + nb430nf_charge] - mov rdx, [rbp + nb430nf_invsqrta] - mov rdi, [rbp + nb430nf_pos] - mov rcx, [rsp + nb430nf_innerjjnr] - mov eax, [rcx] - - xorpd xmm6, xmm6 - movapd xmm7, xmm6 - movsd xmm7, [rdx + rax*8] - movlpd xmm6, [rsi + rax*8] ;# xmm6(0) has the charge - mulsd xmm7, [rsp + nb430nf_isai] - movapd [rsp + nb430nf_isaprod], xmm7 - movapd xmm1, xmm7 - mulpd xmm1, [rsp + nb430nf_gbtsc] - movapd [rsp + nb430nf_gbscale], xmm1 - - mulsd xmm7, [rsp + nb430nf_iq] - mulsd xmm6, xmm7 - movapd [rsp + nb430nf_qq], xmm6 - - mov rsi, [rbp + nb430nf_type] - mov edx, [rsi + rax*4] - mov rsi, [rbp + nb430nf_vdwparam] - shl edx, 1 - mov edi, [rsp + nb430nf_ntia] - add edx, edi - - movlpd xmm6, [rsi + rdx*8] ;# c6a - movhpd xmm6, [rsi + rdx*8 + 8] ;# c6a c12a - - xorpd xmm7, xmm7 - movapd xmm4, xmm6 - unpcklpd xmm4, xmm7 - unpckhpd xmm6, xmm7 - - movapd [rsp + nb430nf_c6], xmm4 - movapd [rsp + nb430nf_c12], xmm6 - - mov rsi, [rbp + nb430nf_pos] ;# base of pos[] - - lea rax, [rax + rax*2] ;# replace jnr with j3 - - ;# move two coordinates to xmm0-xmm2 - movlpd xmm0, [rsi + rax*8] - movlpd xmm1, [rsi + rax*8 + 8] - movlpd xmm2, [rsi + rax*8 + 16] - - mov rdi, [rbp + nb430nf_faction] - - ;# move nb430nf_ix-iz to xmm4-xmm6 - movapd xmm4, [rsp + nb430nf_ix] - movapd xmm5, [rsp + nb430nf_iy] - movapd xmm6, [rsp + nb430nf_iz] - - ;# calc dr - subsd xmm4, xmm0 - subsd xmm5, xmm1 - subsd xmm6, xmm2 - - ;# square it - mulsd xmm4,xmm4 - mulsd xmm5,xmm5 - mulsd xmm6,xmm6 - addsd xmm4, xmm5 - addsd xmm4, xmm6 - ;# rsq in xmm4 - - cvtsd2ss xmm5, xmm4 - rsqrtss xmm5, xmm5 - cvtss2sd xmm2, xmm5 ;# lu in low xmm2 - - ;# lookup seed in xmm2 - movapd xmm5, xmm2 ;# copy of lu - mulsd xmm2, xmm2 ;# lu*lu - movapd xmm1, [rsp + nb430nf_three] - mulsd xmm2, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb430nf_half] - subsd xmm1, xmm2 ;# 30-rsq*lu*lu - mulsd xmm1, xmm5 - mulsd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu) - - movapd xmm5, xmm1 ;# copy of lu - mulsd xmm1, xmm1 ;# lu*lu - movapd xmm2, [rsp + nb430nf_three] - mulsd xmm1, xmm4 ;# rsq*lu*lu - movapd xmm0, [rsp + nb430nf_half] - subsd xmm2, xmm1 ;# 30-rsq*lu*lu - mulsd xmm2, xmm5 - mulsd xmm0, xmm2 ;# xmm0=iter2 of rinv (new lu) - mulsd xmm4, xmm0 ;# xmm4=r - movsd [rsp + nb430nf_r], xmm4 - mulsd xmm4, [rsp + nb430nf_gbscale] - - cvttsd2si edx, xmm4 ;# mm6 = lu idx - cvtsi2sd xmm5, edx - subsd xmm4, xmm5 - movapd xmm1, xmm4 ;# xmm1=eps - movapd xmm2, xmm1 - mulsd xmm2, xmm2 ;# xmm2=eps2 - - shl edx, 2 ;# idx *= 4 - mov rsi, [rbp + nb430nf_GBtab] - - ;# Coulomb - movapd xmm4, [rsi + rdx*8] ;# Y1 F1 - xorpd xmm3, xmm3 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 - unpckhpd xmm5, xmm3 ;# F1 - - movapd xmm6, [rsi + rdx*8 + 16] ;# G1 H1 - xorpd xmm3, xmm3 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 - unpckhpd xmm7, xmm3 ;# H1 - ;# coulomb table ready, in xmm4-xmm7 - mulsd xmm6, xmm1 ;# xmm6=Geps - mulsd xmm7, xmm2 ;# xmm7=Heps2 - addsd xmm5, xmm6 - addsd xmm5, xmm7 ;# xmm5=Fp - movapd xmm3, [rsp + nb430nf_qq] - mulsd xmm5, xmm1 ;# xmm5=eps*Fp - addsd xmm5, xmm4 ;# xmm5=VV - mulsd xmm5, xmm3 ;# vcoul=qq*VV - addsd xmm5, [rsp + nb430nf_vctot] - movsd [rsp + nb430nf_vctot], xmm5 - - movsd xmm4, [rsp + nb430nf_r] - mulsd xmm4, [rsp + nb430nf_tsc] - cvttsd2si edx, xmm4 ;# mm6 = lu idx - cvtsi2sd xmm5, edx - subsd xmm4, xmm5 - movsd xmm1, xmm4 ;# xmm1=eps - movsd xmm2, xmm1 - mulsd xmm2, xmm2 ;# xmm2=eps2 - - shl edx, 3 - - mov rsi, [rbp + nb430nf_VFtab] - - ;# Dispersion - movapd xmm4, [rsi + rdx*8] ;# Y1 F1 - xorpd xmm3, xmm3 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 - unpckhpd xmm5, xmm3 ;# F1 - - movapd xmm6, [rsi + rdx*8 + 16] ;# G1 H1 - xorpd xmm3, xmm3 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 - unpckhpd xmm7, xmm3 ;# H1 - ;# Dispersion table ready, in xmm4-xmm7 - mulsd xmm6, xmm1 ;# xmm6=Geps - mulsd xmm7, xmm2 ;# xmm7=Heps2 - addsd xmm5, xmm6 - addsd xmm5, xmm7 ;# xmm5=Fp - mulsd xmm5, xmm1 ;# xmm5=eps*Fp - addsd xmm5, xmm4 ;# xmm5=VV - - mulsd xmm5, [rsp + nb430nf_c6] ;# Vvdw6 - addsd xmm5, [rsp + nb430nf_Vvdwtot] - movlpd [rsp + nb430nf_Vvdwtot], xmm5 - - ;# Repulsion - movapd xmm4, [rsi + rdx*8 + 32] ;# Y1 F1 - xorpd xmm3, xmm3 - movapd xmm5, xmm4 - unpcklpd xmm4, xmm3 ;# Y1 - unpckhpd xmm5, xmm3 ;# F1 - - movapd xmm6, [rsi + rdx*8 + 48] ;# G1 H1 - xorpd xmm3, xmm3 - movapd xmm7, xmm6 - unpcklpd xmm6, xmm3 ;# G1 - unpckhpd xmm7, xmm3 ;# H1 - ;# Dispersion table ready, in xmm4-xmm7 - mulsd xmm6, xmm1 ;# xmm6=Geps - mulsd xmm7, xmm2 ;# xmm7=Heps2 - addsd xmm5, xmm6 - addsd xmm5, xmm7 ;# xmm5=Fp - mulsd xmm5, xmm1 ;# xmm5=eps*Fp - addsd xmm5, xmm4 ;# xmm5=VV - mulsd xmm5, [rsp + nb430nf_c12] ;# Vvdw12 - addsd xmm5, [rsp + nb430nf_Vvdwtot] - movlpd [rsp + nb430nf_Vvdwtot], xmm5 -.nb430nf_updateouterdata: - ;# get n from stack - mov esi, [rsp + nb430nf_n] - ;# get group index for i particle - mov rdx, [rbp + nb430nf_gid] ;# base of gid[] - mov edx, [rdx + rsi*4] ;# ggid=gid[n] - - ;# accumulate total potential energy and update it - movapd xmm7, [rsp + nb430nf_vctot] - ;# accumulate - movhlps xmm6, xmm7 - addsd xmm7, xmm6 ;# low xmm7 has the sum now - - ;# add earlier value from mem - mov rax, [rbp + nb430nf_Vc] - addsd xmm7, [rax + rdx*8] - ;# move back to mem - movsd [rax + rdx*8], xmm7 - - ;# accumulate total lj energy and update it - movapd xmm7, [rsp + nb430nf_Vvdwtot] - ;# accumulate - movhlps xmm6, xmm7 - addsd xmm7, xmm6 ;# low xmm7 has the sum now - - ;# add earlier value from mem - mov rax, [rbp + nb430nf_Vvdw] - addsd xmm7, [rax + rdx*8] - ;# move back to mem - movsd [rax + rdx*8], xmm7 - - ;# finish if last - mov ecx, [rsp + nb430nf_nn1] - ;# esi already loaded with n - inc esi - sub ecx, esi - jz .nb430nf_outerend - - ;# not last, iterate outer loop once more! - mov [rsp + nb430nf_n], esi - jmp .nb430nf_outer -.nb430nf_outerend: - ;# check if more outer neighborlists remain - mov ecx, [rsp + nb430nf_nri] - ;# esi already loaded with n above - sub ecx, esi - jz .nb430nf_end - ;# non-zero, do one more workunit - jmp .nb430nf_threadloop -.nb430nf_end: - mov eax, [rsp + nb430nf_nouter] - mov ebx, [rsp + nb430nf_ninner] - mov rcx, [rbp + nb430nf_outeriter] - mov rdx, [rbp + nb430nf_inneriter] - mov [rcx], eax - mov [rdx], ebx - - add rsp, 392 - emms - - - pop r15 - pop r14 - pop r13 - pop r12 - - pop rbx - pop rbp - ret - - - diff --git a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel430_x86_64_sse2.s b/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel430_x86_64_sse2.s deleted file mode 100644 index 56360fe61e..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_x86_64_sse2/nb_kernel430_x86_64_sse2.s +++ /dev/null @@ -1,1640 +0,0 @@ -## -## -## Gromacs 4.0 Copyright (c) 1991-2003 -## David van der Spoel, Erik Lindahl -## -## This program is free software; you can redistribute it and/or -## modify it under the terms of the GNU General Public License -## as published by the Free Software Foundation; either version 2 -## of the License, or (at your option) any later version. -## -## To help us fund GROMACS development, we humbly ask that you cite -## the research papers on the package. Check out http://www.gromacs.org -## -## And Hey: -## Gnomes, ROck Monsters And Chili Sauce -## - - - - -.globl nb_kernel430_x86_64_sse2 -.globl _nb_kernel430_x86_64_sse2 -nb_kernel430_x86_64_sse2: -_nb_kernel430_x86_64_sse2: -## Room for return address and rbp (16 bytes) -.set nb430_fshift, 16 -.set nb430_gid, 24 -.set nb430_pos, 32 -.set nb430_faction, 40 -.set nb430_charge, 48 -.set nb430_p_facel, 56 -.set nb430_argkrf, 64 -.set nb430_argcrf, 72 -.set nb430_Vc, 80 -.set nb430_type, 88 -.set nb430_p_ntype, 96 -.set nb430_vdwparam, 104 -.set nb430_Vvdw, 112 -.set nb430_p_tabscale, 120 -.set nb430_VFtab, 128 -.set nb430_invsqrta, 136 -.set nb430_dvda, 144 -.set nb430_p_gbtabscale, 152 -.set nb430_GBtab, 160 -.set nb430_p_nthreads, 168 -.set nb430_count, 176 -.set nb430_mtx, 184 -.set nb430_outeriter, 192 -.set nb430_inneriter, 200 -.set nb430_work, 208 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse2 use -.set nb430_ix, 0 -.set nb430_iy, 16 -.set nb430_iz, 32 -.set nb430_iq, 48 -.set nb430_dx, 64 -.set nb430_dy, 80 -.set nb430_dz, 96 -.set nb430_eps, 112 -.set nb430_gbtsc, 128 -.set nb430_tsc, 144 -.set nb430_qq, 160 -.set nb430_c6, 176 -.set nb430_c12, 192 -.set nb430_epsgb, 208 -.set nb430_vctot, 224 -.set nb430_Vvdwtot, 240 -.set nb430_fix, 256 -.set nb430_fiy, 272 -.set nb430_fiz, 288 -.set nb430_half, 304 -.set nb430_three, 320 -.set nb430_r, 336 -.set nb430_isai, 352 -.set nb430_isaprod, 368 -.set nb430_dvdasum, 384 -.set nb430_gbscale, 400 -.set nb430_rinv, 416 -.set nb430_nri, 432 -.set nb430_iinr, 440 -.set nb430_jindex, 448 -.set nb430_jjnr, 456 -.set nb430_shift, 464 -.set nb430_shiftvec, 472 -.set nb430_facel, 480 -.set nb430_innerjjnr, 488 -.set nb430_ii, 496 -.set nb430_is3, 500 -.set nb430_ii3, 504 -.set nb430_ntia, 508 -.set nb430_innerk, 512 -.set nb430_n, 516 -.set nb430_nn1, 520 -.set nb430_ntype, 524 -.set nb430_nouter, 528 -.set nb430_ninner, 532 - - push %rbp - movq %rsp,%rbp - push %rbx - - - emms - - push %r12 - push %r13 - push %r14 - push %r15 - - subq $536,%rsp ## local variable stack space (n*16+8) - - ## zero 32-bit iteration counters - movl $0,%eax - movl %eax,nb430_nouter(%rsp) - movl %eax,nb430_ninner(%rsp) - - movl (%rdi),%edi - movl %edi,nb430_nri(%rsp) - movq %rsi,nb430_iinr(%rsp) - movq %rdx,nb430_jindex(%rsp) - movq %rcx,nb430_jjnr(%rsp) - movq %r8,nb430_shift(%rsp) - movq %r9,nb430_shiftvec(%rsp) - movq nb430_p_ntype(%rbp),%rdi - movl (%rdi),%edi - movl %edi,nb430_ntype(%rsp) - movq nb430_p_facel(%rbp),%rsi - movsd (%rsi),%xmm0 - movsd %xmm0,nb430_facel(%rsp) - - movq nb430_p_tabscale(%rbp),%rax - movsd (%rax),%xmm3 - shufpd $0,%xmm3,%xmm3 - movapd %xmm3,nb430_tsc(%rsp) - - movq nb430_p_gbtabscale(%rbp),%rbx - movsd (%rbx),%xmm4 - shufpd $0,%xmm4,%xmm4 - movapd %xmm4,nb430_gbtsc(%rsp) - - ## create constant floating-point factors on stack - movl $0x00000000,%eax ## lower half of double half IEEE (hex) - movl $0x3fe00000,%ebx - movl %eax,nb430_half(%rsp) - movl %ebx,nb430_half+4(%rsp) - movsd nb430_half(%rsp),%xmm1 - shufpd $0,%xmm1,%xmm1 ## splat to all elements - movapd %xmm1,%xmm3 - addpd %xmm3,%xmm3 ## one - movapd %xmm3,%xmm2 - addpd %xmm2,%xmm2 ## two - addpd %xmm2,%xmm3 ## three - movapd %xmm1,nb430_half(%rsp) - movapd %xmm3,nb430_three(%rsp) - -_nb_kernel430_x86_64_sse2.nb430_threadloop: - movq nb430_count(%rbp),%rsi ## pointer to sync counter - movl (%rsi),%eax -_nb_kernel430_x86_64_sse2.nb430_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%rsi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel430_x86_64_sse2.nb430_spinlock - - ## if(nn1>nri) nn1=nri - movl nb430_nri(%rsp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb430_n(%rsp) - movl %ebx,nb430_nn1(%rsp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel430_x86_64_sse2.nb430_outerstart - jmp _nb_kernel430_x86_64_sse2.nb430_end - -_nb_kernel430_x86_64_sse2.nb430_outerstart: - ## ebx contains number of outer iterations - addl nb430_nouter(%rsp),%ebx - movl %ebx,nb430_nouter(%rsp) - -_nb_kernel430_x86_64_sse2.nb430_outer: - movq nb430_shift(%rsp),%rax ## rax = pointer into shift[] - movl (%rax,%rsi,4),%ebx ## rbx=shift[n] - - lea (%rbx,%rbx,2),%rbx ## rbx=3*is - movl %ebx,nb430_is3(%rsp) ## store is3 - - movq nb430_shiftvec(%rsp),%rax ## rax = base of shiftvec[] - - movsd (%rax,%rbx,8),%xmm0 - movsd 8(%rax,%rbx,8),%xmm1 - movsd 16(%rax,%rbx,8),%xmm2 - - movq nb430_iinr(%rsp),%rcx ## rcx = pointer into iinr[] - movl (%rcx,%rsi,4),%ebx ## ebx =ii - movl %ebx,nb430_ii(%rsp) - - movq nb430_charge(%rbp),%rdx - movsd (%rdx,%rbx,8),%xmm3 - mulsd nb430_facel(%rsp),%xmm3 - shufpd $0,%xmm3,%xmm3 - - movq nb430_invsqrta(%rbp),%rdx ## load invsqrta[ii] - movsd (%rdx,%rbx,8),%xmm4 - shufpd $0,%xmm4,%xmm4 - - movq nb430_type(%rbp),%rdx - movl (%rdx,%rbx,4),%edx - imull nb430_ntype(%rsp),%edx - shll %edx - movl %edx,nb430_ntia(%rsp) - - lea (%rbx,%rbx,2),%rbx ## rbx = 3*ii=ii3 - movq nb430_pos(%rbp),%rax ## rax = base of pos[] - - addsd (%rax,%rbx,8),%xmm0 - addsd 8(%rax,%rbx,8),%xmm1 - addsd 16(%rax,%rbx,8),%xmm2 - - movapd %xmm3,nb430_iq(%rsp) - movapd %xmm4,nb430_isai(%rsp) - - shufpd $0,%xmm0,%xmm0 - shufpd $0,%xmm1,%xmm1 - shufpd $0,%xmm2,%xmm2 - - movapd %xmm0,nb430_ix(%rsp) - movapd %xmm1,nb430_iy(%rsp) - movapd %xmm2,nb430_iz(%rsp) - - movl %ebx,nb430_ii3(%rsp) - - ## clear vctot and i forces - xorpd %xmm4,%xmm4 - movapd %xmm4,nb430_vctot(%rsp) - movapd %xmm4,nb430_Vvdwtot(%rsp) - movapd %xmm4,nb430_dvdasum(%rsp) - movapd %xmm4,nb430_fix(%rsp) - movapd %xmm4,nb430_fiy(%rsp) - movapd %xmm4,nb430_fiz(%rsp) - - movq nb430_jindex(%rsp),%rax - movl (%rax,%rsi,4),%ecx ## jindex[n] - movl 4(%rax,%rsi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movq nb430_pos(%rbp),%rsi - movq nb430_faction(%rbp),%rdi - movq nb430_jjnr(%rsp),%rax - shll $2,%ecx - addq %rcx,%rax - movq %rax,nb430_innerjjnr(%rsp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $2,%edx - addl nb430_ninner(%rsp),%ecx - movl %ecx,nb430_ninner(%rsp) - addl $0,%edx - movl %edx,nb430_innerk(%rsp) ## number of innerloop atoms - jge _nb_kernel430_x86_64_sse2.nb430_unroll_loop - jmp _nb_kernel430_x86_64_sse2.nb430_checksingle -_nb_kernel430_x86_64_sse2.nb430_unroll_loop: - ## twice unrolled innerloop here - movq nb430_innerjjnr(%rsp),%rdx ## pointer to jjnr[k] - movl (%rdx),%eax - movl 4(%rdx),%ebx - addq $8,nb430_innerjjnr(%rsp) ## advance pointer (unrolled 2) - - - movq nb430_pos(%rbp),%rsi ## base of pos[] - - lea (%rax,%rax,2),%r10 ## j3 - lea (%rbx,%rbx,2),%r11 - - ## move two coordinates to xmm4-xmm6 - movlpd (%rsi,%r10,8),%xmm4 - movlpd 8(%rsi,%r10,8),%xmm5 - movlpd 16(%rsi,%r10,8),%xmm6 - movhpd (%rsi,%r11,8),%xmm4 - movhpd 8(%rsi,%r11,8),%xmm5 - movhpd 16(%rsi,%r11,8),%xmm6 - - ## calc dr - subpd nb430_ix(%rsp),%xmm4 - subpd nb430_iy(%rsp),%xmm5 - subpd nb430_iz(%rsp),%xmm6 - - ## store dr - movapd %xmm4,nb430_dx(%rsp) - movapd %xmm5,nb430_dy(%rsp) - movapd %xmm6,nb430_dz(%rsp) - - ## square it - mulpd %xmm4,%xmm4 - mulpd %xmm5,%xmm5 - mulpd %xmm6,%xmm6 - addpd %xmm5,%xmm4 - addpd %xmm6,%xmm4 - ## rsq in xmm4 - - ## load isaj - movq nb430_invsqrta(%rbp),%rsi - movlpd (%rsi,%rax,8),%xmm3 - movhpd (%rsi,%rbx,8),%xmm3 - mulpd nb430_isai(%rsp),%xmm3 - movapd %xmm3,nb430_isaprod(%rsp) - movapd %xmm3,%xmm6 - mulpd nb430_gbtsc(%rsp),%xmm3 - movapd %xmm3,nb430_gbscale(%rsp) - - ##invsqrt - cvtpd2ps %xmm4,%xmm5 - rsqrtps %xmm5,%xmm5 - cvtps2pd %xmm5,%xmm2 ## lu in low xmm2 - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulpd %xmm2,%xmm2 ## lu*lu - movapd nb430_three(%rsp),%xmm1 - mulpd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb430_half(%rsp),%xmm0 - subpd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm1 - mulpd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - mulpd nb430_iq(%rsp),%xmm6 - movq nb430_charge(%rbp),%rsi ## base of charge[] - movlpd (%rsi,%rax,8),%xmm3 - movhpd (%rsi,%rbx,8),%xmm3 - mulpd %xmm6,%xmm3 - movapd %xmm3,nb430_qq(%rsp) - - movapd %xmm1,%xmm5 ## copy of lu - mulpd %xmm1,%xmm1 ## lu*lu - movapd nb430_three(%rsp),%xmm2 - mulpd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb430_half(%rsp),%xmm0 - subpd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm2 - mulpd %xmm2,%xmm0 ## xmm0=iter2 of rinv - mulpd %xmm0,%xmm4 ## xmm4=r - movapd %xmm4,nb430_r(%rsp) - movapd %xmm0,nb430_rinv(%rsp) - - movq nb430_type(%rbp),%rsi - movl (%rsi,%rax,4),%r8d - movl (%rsi,%rbx,4),%r9d - shll %r8d - shll %r9d - movl nb430_ntia(%rsp),%edi - addl %edi,%r8d - addl %edi,%r9d - - movapd %xmm4,%xmm8 ## r - mulpd nb430_gbscale(%rsp),%xmm4 - mulpd nb430_tsc(%rsp),%xmm8 - - ## truncate and convert to integers - cvttpd2pi %xmm4,%mm0 ## gb - cvttpd2pi %xmm8,%mm1 ## lj - - ## convert back to float - cvtpi2pd %mm0,%xmm6 ## gb - cvtpi2pd %mm1,%xmm10 ## lj - - ## multiply by 4 and 8, respectively - pslld $2,%mm0 ## gb - pslld $3,%mm1 ## lj - - ## move to integer registers - movd %mm0,%r12d ## gb - movd %mm1,%r14d ## lj - psrlq $32,%mm0 - psrlq $32,%mm1 - movd %mm0,%r13d ## gb - movd %mm1,%r15d ## lj - ## GB indices: r10-11 LJ indices: r12-r13 - - ## calculate eps - subpd %xmm6,%xmm4 ## gb - subpd %xmm10,%xmm8 ## lj - movapd %xmm4,nb430_epsgb(%rsp) ## gb eps - movapd %xmm8,nb430_eps(%rsp) ## lj eps - - movq nb430_GBtab(%rbp),%rsi - movq nb430_VFtab(%rbp),%rdi - - ## load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11 - movapd (%rsi,%r12,8),%xmm0 ## Y1c F1c - movapd (%rsi,%r13,8),%xmm12 ## Y2c F2c - movapd (%rdi,%r14,8),%xmm4 ## Y1d F1d - movapd (%rdi,%r15,8),%xmm13 ## Y2d F2d - movapd 32(%rdi,%r14,8),%xmm8 ## Y1r F1r - movapd 32(%rdi,%r15,8),%xmm14 ## Y2r F2r - movapd %xmm0,%xmm1 - movapd %xmm4,%xmm5 - movapd %xmm8,%xmm9 - unpcklpd %xmm12,%xmm0 ## Y1c Y2c - unpckhpd %xmm12,%xmm1 ## F1c F2c - unpcklpd %xmm13,%xmm4 ## Y1d Y2d - unpckhpd %xmm13,%xmm5 ## F1d F2d - unpcklpd %xmm14,%xmm8 ## Y1r Y2r - unpckhpd %xmm14,%xmm9 ## F1r F2r - - movapd 16(%rsi,%r12,8),%xmm2 ## G1c H1c - movapd 16(%rsi,%r13,8),%xmm12 ## G2c H2c - movapd 16(%rdi,%r14,8),%xmm6 ## G1d H1d - movapd 16(%rdi,%r15,8),%xmm13 ## G2d H2d - movapd 48(%rdi,%r14,8),%xmm10 ## G1r H1r - movapd 48(%rdi,%r15,8),%xmm14 ## G2r H2r - movapd %xmm2,%xmm3 - movapd %xmm6,%xmm7 - movapd %xmm10,%xmm11 - unpcklpd %xmm12,%xmm2 ## G1c G2c - unpckhpd %xmm12,%xmm3 ## H1c H2c - unpcklpd %xmm13,%xmm6 ## G1d G2d - unpckhpd %xmm13,%xmm7 ## H1d H2d - unpcklpd %xmm14,%xmm10 ## G1r G2r - unpckhpd %xmm14,%xmm11 ## H1r H2r - ## table data ready. Coul GB in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11 - movq nb430_vdwparam(%rbp),%rdi - - movapd nb430_epsgb(%rsp),%xmm12 - movapd nb430_eps(%rsp),%xmm13 - - mulpd %xmm12,%xmm3 ## Heps - mulpd %xmm13,%xmm7 - mulpd %xmm13,%xmm11 - mulpd %xmm12,%xmm2 ## Geps - mulpd %xmm13,%xmm6 - mulpd %xmm13,%xmm10 - mulpd %xmm12,%xmm3 ## Heps2 - mulpd %xmm13,%xmm7 - mulpd %xmm13,%xmm11 - - movlpd (%rdi,%r8,8),%xmm14 - movlpd 8(%rdi,%r8,8),%xmm15 - - addpd %xmm2,%xmm1 ## F+Geps - addpd %xmm6,%xmm5 - addpd %xmm10,%xmm9 - addpd %xmm3,%xmm1 ## F+Geps+Heps2 = Fp - addpd %xmm7,%xmm5 - addpd %xmm11,%xmm9 - addpd %xmm3,%xmm3 ## 2*Heps2 - addpd %xmm7,%xmm7 - addpd %xmm11,%xmm11 - movhpd (%rdi,%r9,8),%xmm14 - movhpd 8(%rdi,%r9,8),%xmm15 - - addpd %xmm2,%xmm3 ## 2*Heps2+Geps - addpd %xmm6,%xmm7 - addpd %xmm10,%xmm11 - addpd %xmm1,%xmm3 ## FF = Fp + 2*Heps2 + Geps - addpd %xmm5,%xmm7 - addpd %xmm9,%xmm11 - mulpd %xmm12,%xmm1 ## eps*Fp - mulpd %xmm13,%xmm5 - mulpd %xmm13,%xmm9 - addpd %xmm0,%xmm1 ## VV - addpd %xmm4,%xmm5 - addpd %xmm8,%xmm9 - mulpd nb430_qq(%rsp),%xmm1 ## VV*qq = vcoul - mulpd %xmm14,%xmm5 ## vnb6 - mulpd %xmm15,%xmm9 ## vnb12 - mulpd nb430_qq(%rsp),%xmm3 ## FF*qq = fij - mulpd %xmm14,%xmm7 ## fijD - mulpd %xmm15,%xmm11 ##fijR - - addpd %xmm7,%xmm11 ## fijD+fijR - mulpd nb430_tsc(%rsp),%xmm11 ## (fijD+fijR)*tabscale - - ## accumulate Vvdwtot - addpd nb430_Vvdwtot(%rsp),%xmm5 - addpd %xmm9,%xmm5 - movapd %xmm5,nb430_Vvdwtot(%rsp) - - movq nb430_dvda(%rbp),%rsi - - ## Calculate dVda - mulpd nb430_gbscale(%rsp),%xmm3 ## fijC=qq*FF*gbscale - movapd %xmm3,%xmm6 - mulpd nb430_r(%rsp),%xmm6 - addpd %xmm1,%xmm6 ## vcoul+fijC*r - - addpd %xmm11,%xmm3 ## fijC+fijD+fijR - - ## increment vctot - addpd nb430_vctot(%rsp),%xmm1 - movapd %xmm1,nb430_vctot(%rsp) - - ## xmm6=(vcoul+fijC*r) - xorpd %xmm7,%xmm7 - subpd %xmm6,%xmm7 - movapd %xmm7,%xmm6 - - ## the fj's - start by combiningg forces from memory - movq nb430_faction(%rbp),%rdi - movlpd (%rdi,%r10,8),%xmm0 - movlpd 8(%rdi,%r10,8),%xmm1 - movlpd 16(%rdi,%r10,8),%xmm2 - movhpd (%rdi,%r11,8),%xmm0 - movhpd 8(%rdi,%r11,8),%xmm1 - movhpd 16(%rdi,%r11,8),%xmm2 - - ## update dvdasum - addpd nb430_dvdasum(%rsp),%xmm7 - movapd %xmm7,nb430_dvdasum(%rsp) - - ## update j atoms dvdaj - movhlps %xmm6,%xmm7 - addsd (%rsi,%rax,8),%xmm6 - addsd (%rsi,%rbx,8),%xmm7 - movsd %xmm6,(%rsi,%rax,8) - movsd %xmm7,(%rsi,%rbx,8) - - xorpd %xmm4,%xmm4 - mulpd nb430_rinv(%rsp),%xmm3 - subpd %xmm3,%xmm4 - - movapd %xmm4,%xmm9 - movapd %xmm4,%xmm10 - movapd %xmm4,%xmm11 - - mulpd nb430_dx(%rsp),%xmm9 - mulpd nb430_dy(%rsp),%xmm10 - mulpd nb430_dz(%rsp),%xmm11 - - addpd %xmm9,%xmm0 - addpd %xmm10,%xmm1 - addpd %xmm11,%xmm2 - - ## accumulate i forces - addpd nb430_fix(%rsp),%xmm9 - addpd nb430_fiy(%rsp),%xmm10 - addpd nb430_fiz(%rsp),%xmm11 - - movlpd %xmm0,(%rdi,%r10,8) - movlpd %xmm1,8(%rdi,%r10,8) - movlpd %xmm2,16(%rdi,%r10,8) - - movapd %xmm9,nb430_fix(%rsp) - movapd %xmm10,nb430_fiy(%rsp) - movapd %xmm11,nb430_fiz(%rsp) - - movhpd %xmm0,(%rdi,%r11,8) - movhpd %xmm1,8(%rdi,%r11,8) - movhpd %xmm2,16(%rdi,%r11,8) - - ## should we do one more iteration? - subl $2,nb430_innerk(%rsp) - jl _nb_kernel430_x86_64_sse2.nb430_checksingle - jmp _nb_kernel430_x86_64_sse2.nb430_unroll_loop -_nb_kernel430_x86_64_sse2.nb430_checksingle: - movl nb430_innerk(%rsp),%edx - andl $1,%edx - jnz _nb_kernel430_x86_64_sse2.nb430_dosingle - jmp _nb_kernel430_x86_64_sse2.nb430_updateouterdata -_nb_kernel430_x86_64_sse2.nb430_dosingle: - movq nb430_charge(%rbp),%rsi - movq nb430_invsqrta(%rbp),%rdx - movq nb430_pos(%rbp),%rdi - movq nb430_innerjjnr(%rsp),%rcx - movl (%rcx),%eax - - ## load isaj - movq nb430_invsqrta(%rbp),%rsi - movsd (%rsi,%rax,8),%xmm2 - mulsd nb430_isai(%rsp),%xmm2 - movapd %xmm2,nb430_isaprod(%rsp) - movapd %xmm2,%xmm1 - mulsd nb430_gbtsc(%rsp),%xmm1 - movapd %xmm1,nb430_gbscale(%rsp) - - mulsd nb430_iq(%rsp),%xmm2 - movq nb430_charge(%rbp),%rsi ## base of charge[] - movsd (%rsi,%rax,8),%xmm3 - mulsd %xmm2,%xmm3 - movapd %xmm3,nb430_qq(%rsp) - - movq nb430_type(%rbp),%rsi - movl (%rsi,%rax,4),%r8d - movq nb430_vdwparam(%rbp),%rsi - shll %r8d - movl nb430_ntia(%rsp),%edi - addl %edi,%r8d - - movsd (%rsi,%r8,8),%xmm4 - movsd 8(%rsi,%r8,8),%xmm6 - movapd %xmm4,nb430_c6(%rsp) - movapd %xmm6,nb430_c12(%rsp) - - movq nb430_pos(%rbp),%rsi ## base of pos[] - - lea (%rax,%rax,2),%r10 ## j3 - - ## move coordinate to xmm4-xmm6 - movsd (%rsi,%r10,8),%xmm4 - movsd 8(%rsi,%r10,8),%xmm5 - movsd 16(%rsi,%r10,8),%xmm6 - - movq nb430_faction(%rbp),%rdi - - ## calc dr - subsd nb430_ix(%rsp),%xmm4 - subsd nb430_iy(%rsp),%xmm5 - subsd nb430_iz(%rsp),%xmm6 - - ## store dr - movapd %xmm4,nb430_dx(%rsp) - movapd %xmm5,nb430_dy(%rsp) - movapd %xmm6,nb430_dz(%rsp) - - ## square it - mulsd %xmm4,%xmm4 - mulsd %xmm5,%xmm5 - mulsd %xmm6,%xmm6 - addsd %xmm5,%xmm4 - addsd %xmm6,%xmm4 - ## rsq in xmm4 - - cvtsd2ss %xmm4,%xmm5 - rsqrtss %xmm5,%xmm5 - cvtss2sd %xmm5,%xmm2 ## lu in low xmm2 - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulsd %xmm2,%xmm2 ## lu*lu - movapd nb430_three(%rsp),%xmm1 - mulsd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb430_half(%rsp),%xmm0 - subsd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm1 - mulsd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - movapd %xmm1,%xmm5 ## copy of lu - mulsd %xmm1,%xmm1 ## lu*lu - movapd nb430_three(%rsp),%xmm2 - mulsd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb430_half(%rsp),%xmm0 - subsd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm2 - mulsd %xmm2,%xmm0 ## xmm0=iter2 of rinv - mulsd %xmm0,%xmm4 ## xmm4=r - movapd %xmm4,nb430_r(%rsp) - movapd %xmm0,nb430_rinv(%rsp) - - movapd %xmm4,%xmm8 ## r - mulsd nb430_gbscale(%rsp),%xmm4 - mulsd nb430_tsc(%rsp),%xmm8 - - ## truncate and convert to integers - cvttsd2si %xmm4,%r12d ## gb - cvttsd2si %xmm8,%r14d ## lj - - ## convert back to float - cvtsi2sd %r12d,%xmm6 ## gb - cvtsi2sd %r14d,%xmm10 ## lj - - ## multiply by 4 and 8, respectively - shll $2,%r12d ## gb - shll $3,%r14d ## lj - - ## GB indices: r10 LJ indices: r12 - - ## calculate eps - subsd %xmm6,%xmm4 ## gb - subsd %xmm10,%xmm8 ## lj - movapd %xmm4,nb430_epsgb(%rsp) ## gb eps - movapd %xmm8,nb430_eps(%rsp) ## lj eps - - movq nb430_GBtab(%rbp),%rsi - movq nb430_VFtab(%rbp),%rdi - - ## load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11 - movapd (%rsi,%r12,8),%xmm0 ## Y1c F1c - movapd (%rdi,%r14,8),%xmm4 ## Y1d F1d - movapd 32(%rdi,%r14,8),%xmm8 ## Y1r F1r - movhlps %xmm0,%xmm1 - movhlps %xmm4,%xmm5 - movhlps %xmm8,%xmm9 - - movapd 16(%rsi,%r12,8),%xmm2 ## G1c H1c - movapd 16(%rdi,%r14,8),%xmm6 ## G1d H1d - movapd 48(%rdi,%r14,8),%xmm10 ## G1r H1r - movhlps %xmm2,%xmm3 - movhlps %xmm6,%xmm7 - movhlps %xmm10,%xmm11 - ## table data ready. Coul GB in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11 - - movapd nb430_epsgb(%rsp),%xmm12 - movapd nb430_eps(%rsp),%xmm13 - - mulsd %xmm12,%xmm3 ## Heps - mulsd %xmm13,%xmm7 - mulsd %xmm13,%xmm11 - mulsd %xmm12,%xmm2 ## Geps - mulsd %xmm13,%xmm6 - mulsd %xmm13,%xmm10 - mulsd %xmm12,%xmm3 ## Heps2 - mulsd %xmm13,%xmm7 - mulsd %xmm13,%xmm11 - - addsd %xmm2,%xmm1 ## F+Geps - addsd %xmm6,%xmm5 - addsd %xmm10,%xmm9 - addsd %xmm3,%xmm1 ## F+Geps+Heps2 = Fp - addsd %xmm7,%xmm5 - addsd %xmm11,%xmm9 - addsd %xmm3,%xmm3 ## 2*Heps2 - addsd %xmm7,%xmm7 - addsd %xmm11,%xmm11 - addsd %xmm2,%xmm3 ## 2*Heps2+Geps - addsd %xmm6,%xmm7 - addsd %xmm10,%xmm11 - addsd %xmm1,%xmm3 ## FF = Fp + 2*Heps2 + Geps - addsd %xmm5,%xmm7 - addsd %xmm9,%xmm11 - mulsd %xmm12,%xmm1 ## eps*Fp - mulsd %xmm13,%xmm5 - mulsd %xmm13,%xmm9 - addsd %xmm0,%xmm1 ## VV - addsd %xmm4,%xmm5 - addsd %xmm8,%xmm9 - mulsd nb430_qq(%rsp),%xmm1 ## VV*qq = vcoul - mulsd nb430_c6(%rsp),%xmm5 ## vnb6 - mulsd nb430_c12(%rsp),%xmm9 ## vnb12 - mulsd nb430_qq(%rsp),%xmm3 ## FF*qq = fij - mulsd nb430_c6(%rsp),%xmm7 ## fijD - mulsd nb430_c12(%rsp),%xmm11 ##fijR - - addsd %xmm7,%xmm11 ## fijD+fijR - mulsd nb430_tsc(%rsp),%xmm11 ## (fijD+fijR)*tabscale - - ## accumulate Vvdwtot - addsd nb430_Vvdwtot(%rsp),%xmm5 - addsd %xmm9,%xmm5 - movsd %xmm5,nb430_Vvdwtot(%rsp) - - movq nb430_dvda(%rbp),%rsi - - ## Calculate dVda - mulsd nb430_gbscale(%rsp),%xmm3 ## fijC=qq*FF*gbscale - movapd %xmm3,%xmm6 - mulsd nb430_r(%rsp),%xmm6 - addsd %xmm1,%xmm6 ## vcoul+fijC*r - - addsd %xmm11,%xmm3 ## fijC+fijD+fijR - - ## increment vctot - addsd nb430_vctot(%rsp),%xmm1 - movsd %xmm1,nb430_vctot(%rsp) - - ## xmm6=(vcoul+fijC*r) - xorpd %xmm7,%xmm7 - subsd %xmm6,%xmm7 - movapd %xmm7,%xmm6 - - ## update dvdasum - addsd nb430_dvdasum(%rsp),%xmm7 - movsd %xmm7,nb430_dvdasum(%rsp) - - ## update j atoms dvdaj - addsd (%rsi,%rax,8),%xmm6 - movsd %xmm6,(%rsi,%rax,8) - - xorpd %xmm4,%xmm4 - mulsd nb430_rinv(%rsp),%xmm3 - subsd %xmm3,%xmm4 - - movapd %xmm4,%xmm9 - movapd %xmm4,%xmm10 - movapd %xmm4,%xmm11 - - mulsd nb430_dx(%rsp),%xmm9 - mulsd nb430_dy(%rsp),%xmm10 - mulsd nb430_dz(%rsp),%xmm11 - - movapd %xmm9,%xmm3 - movapd %xmm10,%xmm4 - movapd %xmm11,%xmm5 - - ## accumulate i forces - addsd nb430_fix(%rsp),%xmm9 - addsd nb430_fiy(%rsp),%xmm10 - addsd nb430_fiz(%rsp),%xmm11 - movsd %xmm9,nb430_fix(%rsp) - movsd %xmm10,nb430_fiy(%rsp) - movsd %xmm11,nb430_fiz(%rsp) - - movq nb430_faction(%rbp),%rdi - ## the fj's - start by accumulating forces from memory - addsd (%rdi,%r10,8),%xmm3 - addsd 8(%rdi,%r10,8),%xmm4 - addsd 16(%rdi,%r10,8),%xmm5 - movsd %xmm3,(%rdi,%r10,8) - movsd %xmm4,8(%rdi,%r10,8) - movsd %xmm5,16(%rdi,%r10,8) - -_nb_kernel430_x86_64_sse2.nb430_updateouterdata: - movl nb430_ii3(%rsp),%ecx - movq nb430_faction(%rbp),%rdi - movq nb430_fshift(%rbp),%rsi - movl nb430_is3(%rsp),%edx - - ## accumulate i forces in xmm0, xmm1, xmm2 - movapd nb430_fix(%rsp),%xmm0 - movapd nb430_fiy(%rsp),%xmm1 - movapd nb430_fiz(%rsp),%xmm2 - - movhlps %xmm0,%xmm3 - movhlps %xmm1,%xmm4 - movhlps %xmm2,%xmm5 - addsd %xmm3,%xmm0 - addsd %xmm4,%xmm1 - addsd %xmm5,%xmm2 ## sum is in low xmm0-xmm2 - - ## increment i force - movsd (%rdi,%rcx,8),%xmm3 - movsd 8(%rdi,%rcx,8),%xmm4 - movsd 16(%rdi,%rcx,8),%xmm5 - subsd %xmm0,%xmm3 - subsd %xmm1,%xmm4 - subsd %xmm2,%xmm5 - movsd %xmm3,(%rdi,%rcx,8) - movsd %xmm4,8(%rdi,%rcx,8) - movsd %xmm5,16(%rdi,%rcx,8) - - ## increment fshift force - movsd (%rsi,%rdx,8),%xmm3 - movsd 8(%rsi,%rdx,8),%xmm4 - movsd 16(%rsi,%rdx,8),%xmm5 - subsd %xmm0,%xmm3 - subsd %xmm1,%xmm4 - subsd %xmm2,%xmm5 - movsd %xmm3,(%rsi,%rdx,8) - movsd %xmm4,8(%rsi,%rdx,8) - movsd %xmm5,16(%rsi,%rdx,8) - - ## get n from stack - movl nb430_n(%rsp),%esi - ## get group index for i particle - movq nb430_gid(%rbp),%rdx ## base of gid[] - movl (%rdx,%rsi,4),%edx ## ggid=gid[n] - - ## accumulate total potential energy and update it - movapd nb430_vctot(%rsp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addsd %xmm6,%xmm7 ## low xmm7 has the sum now - - ## add earlier value from mem - movq nb430_Vc(%rbp),%rax - addsd (%rax,%rdx,8),%xmm7 - ## move back to mem - movsd %xmm7,(%rax,%rdx,8) - - ## accumulate total lj energy and update it - movapd nb430_Vvdwtot(%rsp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addsd %xmm6,%xmm7 ## low xmm7 has the sum now - - ## add earlier value from mem - movq nb430_Vvdw(%rbp),%rax - addsd (%rax,%rdx,8),%xmm7 - ## move back to mem - movsd %xmm7,(%rax,%rdx,8) - - ## accumulate dVda and update it - movapd nb430_dvdasum(%rsp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addsd %xmm6,%xmm7 ## low xmm7 has the sum now - - movl nb430_ii(%rsp),%edx - movq nb430_dvda(%rbp),%rax - addsd (%rax,%rdx,8),%xmm7 - movsd %xmm7,(%rax,%rdx,8) - - ## finish if last - movl nb430_nn1(%rsp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel430_x86_64_sse2.nb430_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb430_n(%rsp) - jmp _nb_kernel430_x86_64_sse2.nb430_outer -_nb_kernel430_x86_64_sse2.nb430_outerend: - ## check if more outer neighborlists remain - movl nb430_nri(%rsp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel430_x86_64_sse2.nb430_end - ## non-zero, do one more workunit - jmp _nb_kernel430_x86_64_sse2.nb430_threadloop -_nb_kernel430_x86_64_sse2.nb430_end: - movl nb430_nouter(%rsp),%eax - movl nb430_ninner(%rsp),%ebx - movq nb430_outeriter(%rbp),%rcx - movq nb430_inneriter(%rbp),%rdx - movl %eax,(%rcx) - movl %ebx,(%rdx) - - addq $536,%rsp - emms - - - pop %r15 - pop %r14 - pop %r13 - pop %r12 - - pop %rbx - pop %rbp - ret - - - - - - -.globl nb_kernel430nf_x86_64_sse2 -.globl _nb_kernel430nf_x86_64_sse2 -nb_kernel430nf_x86_64_sse2: -_nb_kernel430nf_x86_64_sse2: -## Room for return address and rbp (16 bytes) -.set nb430nf_fshift, 16 -.set nb430nf_gid, 24 -.set nb430nf_pos, 32 -.set nb430nf_faction, 40 -.set nb430nf_charge, 48 -.set nb430nf_p_facel, 56 -.set nb430nf_argkrf, 64 -.set nb430nf_argcrf, 72 -.set nb430nf_Vc, 80 -.set nb430nf_type, 88 -.set nb430nf_p_ntype, 96 -.set nb430nf_vdwparam, 104 -.set nb430nf_Vvdw, 112 -.set nb430nf_p_tabscale, 120 -.set nb430nf_VFtab, 128 -.set nb430nf_invsqrta, 136 -.set nb430nf_dvda, 144 -.set nb430nf_p_gbtabscale, 152 -.set nb430nf_GBtab, 160 -.set nb430nf_p_nthreads, 168 -.set nb430nf_count, 176 -.set nb430nf_mtx, 184 -.set nb430nf_outeriter, 192 -.set nb430nf_inneriter, 200 -.set nb430nf_work, 208 - ## stack offsets for local variables - ## bottom of stack is cache-aligned for sse2 use -.set nb430nf_ix, 0 -.set nb430nf_iy, 16 -.set nb430nf_iz, 32 -.set nb430nf_iq, 48 -.set nb430nf_gbtsc, 64 -.set nb430nf_tsc, 80 -.set nb430nf_qq, 96 -.set nb430nf_c6, 112 -.set nb430nf_c12, 128 -.set nb430nf_vctot, 144 -.set nb430nf_Vvdwtot, 160 -.set nb430nf_half, 176 -.set nb430nf_three, 192 -.set nb430nf_r, 208 -.set nb430nf_isai, 224 -.set nb430nf_isaprod, 240 -.set nb430nf_gbscale, 256 -.set nb430nf_nri, 272 -.set nb430nf_iinr, 280 -.set nb430nf_jindex, 288 -.set nb430nf_jjnr, 296 -.set nb430nf_shift, 304 -.set nb430nf_shiftvec, 312 -.set nb430nf_facel, 320 -.set nb430nf_innerjjnr, 328 -.set nb430nf_is3, 336 -.set nb430nf_ii3, 340 -.set nb430nf_ntia, 344 -.set nb430nf_innerk, 348 -.set nb430nf_n, 352 -.set nb430nf_nn1, 356 -.set nb430nf_ntype, 360 -.set nb430nf_nouter, 364 -.set nb430nf_ninner, 368 - push %rbp - movq %rsp,%rbp - push %rbx - - emms - - push %r12 - push %r13 - push %r14 - push %r15 - - subq $392,%rsp ## local variable stack space (n*16+8) - - ## zero 32-bit iteration counters - movl $0,%eax - movl %eax,nb430nf_nouter(%rsp) - movl %eax,nb430nf_ninner(%rsp) - - movl (%rdi),%edi - movl %edi,nb430nf_nri(%rsp) - movq %rsi,nb430nf_iinr(%rsp) - movq %rdx,nb430nf_jindex(%rsp) - movq %rcx,nb430nf_jjnr(%rsp) - movq %r8,nb430nf_shift(%rsp) - movq %r9,nb430nf_shiftvec(%rsp) - movq nb430nf_p_ntype(%rbp),%rdi - movl (%rdi),%edi - movl %edi,nb430nf_ntype(%rsp) - movq nb430nf_p_facel(%rbp),%rsi - movsd (%rsi),%xmm0 - movsd %xmm0,nb430nf_facel(%rsp) - - movq nb430nf_p_tabscale(%rbp),%rax - movsd (%rax),%xmm3 - shufpd $0,%xmm3,%xmm3 - movapd %xmm3,nb430nf_tsc(%rsp) - - movq nb430nf_p_gbtabscale(%rbp),%rbx - movsd (%rbx),%xmm4 - shufpd $0,%xmm4,%xmm4 - movapd %xmm4,nb430nf_gbtsc(%rsp) - - ## create constant floating-point factors on stack - movl $0x00000000,%eax ## lower half of double half IEEE (hex) - movl $0x3fe00000,%ebx - movl %eax,nb430nf_half(%rsp) - movl %ebx,nb430nf_half+4(%rsp) - movsd nb430nf_half(%rsp),%xmm1 - shufpd $0,%xmm1,%xmm1 ## splat to all elements - movapd %xmm1,%xmm3 - addpd %xmm3,%xmm3 ## one - movapd %xmm3,%xmm2 - addpd %xmm2,%xmm2 ## two - addpd %xmm2,%xmm3 ## three - movapd %xmm1,nb430nf_half(%rsp) - movapd %xmm3,nb430nf_three(%rsp) - -_nb_kernel430nf_x86_64_sse2.nb430nf_threadloop: - movq nb430nf_count(%rbp),%rsi ## pointer to sync counter - movl (%rsi),%eax -_nb_kernel430nf_x86_64_sse2.nb430nf_spinlock: - movl %eax,%ebx ## ebx=*count=nn0 - addl $1,%ebx ## ebx=nn1=nn0+10 - lock - cmpxchgl %ebx,(%rsi) ## write nn1 to *counter, - ## if it hasnt changed. - ## or reread *counter to eax. - pause ## -> better p4 performance - jnz _nb_kernel430nf_x86_64_sse2.nb430nf_spinlock - - ## if(nn1>nri) nn1=nri - movl nb430nf_nri(%rsp),%ecx - movl %ecx,%edx - subl %ebx,%ecx - cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri - ## Cleared the spinlock if we got here. - ## eax contains nn0, ebx contains nn1. - movl %eax,nb430nf_n(%rsp) - movl %ebx,nb430nf_nn1(%rsp) - subl %eax,%ebx ## calc number of outer lists - movl %eax,%esi ## copy n to esi - jg _nb_kernel430nf_x86_64_sse2.nb430nf_outerstart - jmp _nb_kernel430nf_x86_64_sse2.nb430nf_end - -_nb_kernel430nf_x86_64_sse2.nb430nf_outerstart: - ## ebx contains number of outer iterations - addl nb430nf_nouter(%rsp),%ebx - movl %ebx,nb430nf_nouter(%rsp) - -_nb_kernel430nf_x86_64_sse2.nb430nf_outer: - movq nb430nf_shift(%rsp),%rax ## rax = pointer into shift[] - movl (%rax,%rsi,4),%ebx ## rbx=shift[n] - - lea (%rbx,%rbx,2),%rbx ## rbx=3*is - movl %ebx,nb430nf_is3(%rsp) ## store is3 - - movq nb430nf_shiftvec(%rsp),%rax ## rax = base of shiftvec[] - - movsd (%rax,%rbx,8),%xmm0 - movsd 8(%rax,%rbx,8),%xmm1 - movsd 16(%rax,%rbx,8),%xmm2 - - movq nb430nf_iinr(%rsp),%rcx ## rcx = pointer into iinr[] - movl (%rcx,%rsi,4),%ebx ## ebx =ii - - movq nb430nf_charge(%rbp),%rdx - movsd (%rdx,%rbx,8),%xmm3 - mulsd nb430nf_facel(%rsp),%xmm3 - shufpd $0,%xmm3,%xmm3 - - movq nb430nf_invsqrta(%rbp),%rdx ## load invsqrta[ii] - movsd (%rdx,%rbx,8),%xmm4 - shufpd $0,%xmm4,%xmm4 - - movq nb430nf_type(%rbp),%rdx - movl (%rdx,%rbx,4),%edx - imull nb430nf_ntype(%rsp),%edx - shll %edx - movl %edx,nb430nf_ntia(%rsp) - - lea (%rbx,%rbx,2),%rbx ## rbx = 3*ii=ii3 - movq nb430nf_pos(%rbp),%rax ## rax = base of pos[] - - addsd (%rax,%rbx,8),%xmm0 - addsd 8(%rax,%rbx,8),%xmm1 - addsd 16(%rax,%rbx,8),%xmm2 - - movapd %xmm3,nb430nf_iq(%rsp) - movapd %xmm4,nb430nf_isai(%rsp) - - shufpd $0,%xmm0,%xmm0 - shufpd $0,%xmm1,%xmm1 - shufpd $0,%xmm2,%xmm2 - - movapd %xmm0,nb430nf_ix(%rsp) - movapd %xmm1,nb430nf_iy(%rsp) - movapd %xmm2,nb430nf_iz(%rsp) - - movl %ebx,nb430nf_ii3(%rsp) - - ## clear vctot - xorpd %xmm4,%xmm4 - movapd %xmm4,nb430nf_vctot(%rsp) - movapd %xmm4,nb430nf_Vvdwtot(%rsp) - - movq nb430nf_jindex(%rsp),%rax - movl (%rax,%rsi,4),%ecx ## jindex[n] - movl 4(%rax,%rsi,4),%edx ## jindex[n+1] - subl %ecx,%edx ## number of innerloop atoms - - movq nb430nf_pos(%rbp),%rsi - movq nb430nf_faction(%rbp),%rdi - movq nb430nf_jjnr(%rsp),%rax - shll $2,%ecx - addq %rcx,%rax - movq %rax,nb430nf_innerjjnr(%rsp) ## pointer to jjnr[nj0] - movl %edx,%ecx - subl $2,%edx - addl nb430nf_ninner(%rsp),%ecx - movl %ecx,nb430nf_ninner(%rsp) - addl $0,%edx - movl %edx,nb430nf_innerk(%rsp) ## number of innerloop atoms - jge _nb_kernel430nf_x86_64_sse2.nb430nf_unroll_loop - jmp _nb_kernel430nf_x86_64_sse2.nb430nf_checksingle -_nb_kernel430nf_x86_64_sse2.nb430nf_unroll_loop: - ## twice unrolled innerloop here - movq nb430nf_innerjjnr(%rsp),%rdx ## pointer to jjnr[k] - movl (%rdx),%eax - movl 4(%rdx),%ebx - addq $8,nb430nf_innerjjnr(%rsp) ## advance pointer (unrolled 2) - - ## load isaj - movq nb430nf_invsqrta(%rbp),%rsi - movlpd (%rsi,%rax,8),%xmm2 - movhpd (%rsi,%rbx,8),%xmm2 - mulpd nb430nf_isai(%rsp),%xmm2 - movapd %xmm2,nb430nf_isaprod(%rsp) - movapd %xmm2,%xmm1 - mulpd nb430nf_gbtsc(%rsp),%xmm1 - movapd %xmm1,nb430nf_gbscale(%rsp) - - movq nb430nf_charge(%rbp),%rsi ## base of charge[] - movlpd (%rsi,%rax,8),%xmm3 - movhpd (%rsi,%rbx,8),%xmm3 - - mulpd nb430nf_iq(%rsp),%xmm2 - mulpd %xmm2,%xmm3 - movapd %xmm3,nb430nf_qq(%rsp) - - movq nb430nf_type(%rbp),%rsi - movl (%rsi,%rax,4),%ecx - movl (%rsi,%rbx,4),%edx - movq nb430nf_vdwparam(%rbp),%rsi - shll %ecx - shll %edx - movl nb430nf_ntia(%rsp),%edi - addl %edi,%ecx - addl %edi,%edx - - movlpd (%rsi,%rcx,8),%xmm6 ## c6a - movlpd (%rsi,%rdx,8),%xmm7 ## c6b - movhpd 8(%rsi,%rcx,8),%xmm6 ## c6a c12a - movhpd 8(%rsi,%rdx,8),%xmm7 ## c6b c12b - - movapd %xmm6,%xmm4 - unpcklpd %xmm7,%xmm4 - unpckhpd %xmm7,%xmm6 - - movapd %xmm4,nb430nf_c6(%rsp) - movapd %xmm6,nb430nf_c12(%rsp) - - movq nb430nf_pos(%rbp),%rsi ## base of pos[] - - lea (%rax,%rax,2),%rax ## replace jnr with j3 - lea (%rbx,%rbx,2),%rbx - - ## move two coordinates to xmm0-xmm2 - movlpd (%rsi,%rax,8),%xmm0 - movlpd 8(%rsi,%rax,8),%xmm1 - movlpd 16(%rsi,%rax,8),%xmm2 - movhpd (%rsi,%rbx,8),%xmm0 - movhpd 8(%rsi,%rbx,8),%xmm1 - movhpd 16(%rsi,%rbx,8),%xmm2 - - movq nb430nf_faction(%rbp),%rdi - - ## move nb430nf_ix-iz to xmm4-xmm6 - movapd nb430nf_ix(%rsp),%xmm4 - movapd nb430nf_iy(%rsp),%xmm5 - movapd nb430nf_iz(%rsp),%xmm6 - - ## calc dr - subpd %xmm0,%xmm4 - subpd %xmm1,%xmm5 - subpd %xmm2,%xmm6 - - ## square it - mulpd %xmm4,%xmm4 - mulpd %xmm5,%xmm5 - mulpd %xmm6,%xmm6 - addpd %xmm5,%xmm4 - addpd %xmm6,%xmm4 - ## rsq in xmm4 - - cvtpd2ps %xmm4,%xmm5 - rsqrtps %xmm5,%xmm5 - cvtps2pd %xmm5,%xmm2 ## lu in low xmm2 - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulpd %xmm2,%xmm2 ## lu*lu - movapd nb430nf_three(%rsp),%xmm1 - mulpd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb430nf_half(%rsp),%xmm0 - subpd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm1 - mulpd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - movapd %xmm1,%xmm5 ## copy of lu - mulpd %xmm1,%xmm1 ## lu*lu - movapd nb430nf_three(%rsp),%xmm2 - mulpd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb430nf_half(%rsp),%xmm0 - subpd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulpd %xmm5,%xmm2 - mulpd %xmm2,%xmm0 ## xmm0=iter2 of rinv - mulpd %xmm0,%xmm4 ## xmm4=r - movapd %xmm4,nb430nf_r(%rsp) - mulpd nb430nf_gbscale(%rsp),%xmm4 - - cvttpd2pi %xmm4,%mm6 ## mm6 = lu idx - cvtpi2pd %mm6,%xmm5 - subpd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - movapd %xmm1,%xmm2 - mulpd %xmm2,%xmm2 ## xmm2=eps2 - - pslld $2,%mm6 ## idx *= 4 - - movq nb430nf_GBtab(%rbp),%rsi - movd %mm6,%ecx - psrlq $32,%mm6 - movd %mm6,%edx ## indices in eax/ebx - - ## Coulomb - movapd (%rsi,%rcx,8),%xmm4 ## Y1 F1 - movapd (%rsi,%rdx,8),%xmm3 ## Y2 F2 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 Y2 - unpckhpd %xmm3,%xmm5 ## F1 F2 - - movapd 16(%rsi,%rcx,8),%xmm6 ## G1 H1 - movapd 16(%rsi,%rdx,8),%xmm3 ## G2 H2 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 G2 - unpckhpd %xmm3,%xmm7 ## H1 H2 - ## coulomb table ready, in xmm4-xmm7 - mulpd %xmm1,%xmm6 ## xmm6=Geps - mulpd %xmm2,%xmm7 ## xmm7=Heps2 - addpd %xmm6,%xmm5 - addpd %xmm7,%xmm5 ## xmm5=Fp - movapd nb430nf_qq(%rsp),%xmm3 - mulpd %xmm1,%xmm5 ## xmm5=eps*Fp - addpd %xmm4,%xmm5 ## xmm5=VV - mulpd %xmm3,%xmm5 ## vcoul=qq*VV - addpd nb430nf_vctot(%rsp),%xmm5 - movapd %xmm5,nb430nf_vctot(%rsp) - - movapd nb430nf_r(%rsp),%xmm4 - mulpd nb430nf_tsc(%rsp),%xmm4 - cvttpd2pi %xmm4,%mm6 ## mm6 = lu idx - cvtpi2pd %mm6,%xmm5 - subpd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - movapd %xmm1,%xmm2 - mulpd %xmm2,%xmm2 ## xmm2=eps2 - - pslld $3,%mm6 ## idx *= 8 - - movq nb430nf_VFtab(%rbp),%rsi - - movd %mm6,%ecx - psrlq $32,%mm6 - movd %mm6,%edx ## indices in eax/ebx - - ## Dispersion - movapd (%rsi,%rcx,8),%xmm4 ## Y1 F1 - movapd (%rsi,%rdx,8),%xmm3 ## Y2 F2 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 Y2 - unpckhpd %xmm3,%xmm5 ## F1 F2 - - movapd 16(%rsi,%rcx,8),%xmm6 ## G1 H1 - movapd 16(%rsi,%rdx,8),%xmm3 ## G2 H2 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 G2 - unpckhpd %xmm3,%xmm7 ## H1 H2 - ## Dispersion table ready, in xmm4-xmm7 - mulpd %xmm1,%xmm6 ## xmm6=Geps - mulpd %xmm2,%xmm7 ## xmm7=Heps2 - addpd %xmm6,%xmm5 - addpd %xmm7,%xmm5 ## xmm5=Fp - mulpd %xmm1,%xmm5 ## xmm5=eps*Fp - addpd %xmm4,%xmm5 ## xmm5=VV - - mulpd nb430nf_c6(%rsp),%xmm5 ## Vvdw6 - addpd nb430nf_Vvdwtot(%rsp),%xmm5 - movapd %xmm5,nb430nf_Vvdwtot(%rsp) - - ## Repulsion - movapd 32(%rsi,%rcx,8),%xmm4 ## Y1 F1 - movapd 32(%rsi,%rdx,8),%xmm3 ## Y2 F2 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 Y2 - unpckhpd %xmm3,%xmm5 ## F1 F2 - - movapd 48(%rsi,%rcx,8),%xmm6 ## G1 H1 - movapd 48(%rsi,%rdx,8),%xmm3 ## G2 H2 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 G2 - unpckhpd %xmm3,%xmm7 ## H1 H2 - ## Dispersion table ready, in xmm4-xmm7 - mulpd %xmm1,%xmm6 ## xmm6=Geps - mulpd %xmm2,%xmm7 ## xmm7=Heps2 - addpd %xmm6,%xmm5 - addpd %xmm7,%xmm5 ## xmm5=Fp - mulpd %xmm1,%xmm5 ## xmm5=eps*Fp - addpd %xmm4,%xmm5 ## xmm5=VV - - mulpd nb430nf_c12(%rsp),%xmm5 ## Vvdw12 - addpd nb430nf_Vvdwtot(%rsp),%xmm5 - movapd %xmm5,nb430nf_Vvdwtot(%rsp) - xorpd %xmm4,%xmm4 - - ## should we do one more iteration? - subl $2,nb430nf_innerk(%rsp) - jl _nb_kernel430nf_x86_64_sse2.nb430nf_checksingle - jmp _nb_kernel430nf_x86_64_sse2.nb430nf_unroll_loop -_nb_kernel430nf_x86_64_sse2.nb430nf_checksingle: - movl nb430nf_innerk(%rsp),%edx - andl $1,%edx - jnz _nb_kernel430nf_x86_64_sse2.nb430nf_dosingle - jmp _nb_kernel430nf_x86_64_sse2.nb430nf_updateouterdata -_nb_kernel430nf_x86_64_sse2.nb430nf_dosingle: - movq nb430nf_charge(%rbp),%rsi - movq nb430nf_invsqrta(%rbp),%rdx - movq nb430nf_pos(%rbp),%rdi - movq nb430nf_innerjjnr(%rsp),%rcx - movl (%rcx),%eax - - xorpd %xmm6,%xmm6 - movapd %xmm6,%xmm7 - movsd (%rdx,%rax,8),%xmm7 - movlpd (%rsi,%rax,8),%xmm6 ## xmm6(0) has the charge - mulsd nb430nf_isai(%rsp),%xmm7 - movapd %xmm7,nb430nf_isaprod(%rsp) - movapd %xmm7,%xmm1 - mulpd nb430nf_gbtsc(%rsp),%xmm1 - movapd %xmm1,nb430nf_gbscale(%rsp) - - mulsd nb430nf_iq(%rsp),%xmm7 - mulsd %xmm7,%xmm6 - movapd %xmm6,nb430nf_qq(%rsp) - - movq nb430nf_type(%rbp),%rsi - movl (%rsi,%rax,4),%edx - movq nb430nf_vdwparam(%rbp),%rsi - shll %edx - movl nb430nf_ntia(%rsp),%edi - addl %edi,%edx - - movlpd (%rsi,%rdx,8),%xmm6 ## c6a - movhpd 8(%rsi,%rdx,8),%xmm6 ## c6a c12a - - xorpd %xmm7,%xmm7 - movapd %xmm6,%xmm4 - unpcklpd %xmm7,%xmm4 - unpckhpd %xmm7,%xmm6 - - movapd %xmm4,nb430nf_c6(%rsp) - movapd %xmm6,nb430nf_c12(%rsp) - - movq nb430nf_pos(%rbp),%rsi ## base of pos[] - - lea (%rax,%rax,2),%rax ## replace jnr with j3 - - ## move two coordinates to xmm0-xmm2 - movlpd (%rsi,%rax,8),%xmm0 - movlpd 8(%rsi,%rax,8),%xmm1 - movlpd 16(%rsi,%rax,8),%xmm2 - - movq nb430nf_faction(%rbp),%rdi - - ## move nb430nf_ix-iz to xmm4-xmm6 - movapd nb430nf_ix(%rsp),%xmm4 - movapd nb430nf_iy(%rsp),%xmm5 - movapd nb430nf_iz(%rsp),%xmm6 - - ## calc dr - subsd %xmm0,%xmm4 - subsd %xmm1,%xmm5 - subsd %xmm2,%xmm6 - - ## square it - mulsd %xmm4,%xmm4 - mulsd %xmm5,%xmm5 - mulsd %xmm6,%xmm6 - addsd %xmm5,%xmm4 - addsd %xmm6,%xmm4 - ## rsq in xmm4 - - cvtsd2ss %xmm4,%xmm5 - rsqrtss %xmm5,%xmm5 - cvtss2sd %xmm5,%xmm2 ## lu in low xmm2 - - ## lookup seed in xmm2 - movapd %xmm2,%xmm5 ## copy of lu - mulsd %xmm2,%xmm2 ## lu*lu - movapd nb430nf_three(%rsp),%xmm1 - mulsd %xmm4,%xmm2 ## rsq*lu*lu - movapd nb430nf_half(%rsp),%xmm0 - subsd %xmm2,%xmm1 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm1 - mulsd %xmm0,%xmm1 ## xmm0=iter1 of rinv (new lu) - - movapd %xmm1,%xmm5 ## copy of lu - mulsd %xmm1,%xmm1 ## lu*lu - movapd nb430nf_three(%rsp),%xmm2 - mulsd %xmm4,%xmm1 ## rsq*lu*lu - movapd nb430nf_half(%rsp),%xmm0 - subsd %xmm1,%xmm2 ## 30-rsq*lu*lu - mulsd %xmm5,%xmm2 - mulsd %xmm2,%xmm0 ## xmm0=iter2 of rinv (new lu) - mulsd %xmm0,%xmm4 ## xmm4=r - movsd %xmm4,nb430nf_r(%rsp) - mulsd nb430nf_gbscale(%rsp),%xmm4 - - cvttsd2si %xmm4,%edx ## mm6 = lu idx - cvtsi2sd %edx,%xmm5 - subsd %xmm5,%xmm4 - movapd %xmm4,%xmm1 ## xmm1=eps - movapd %xmm1,%xmm2 - mulsd %xmm2,%xmm2 ## xmm2=eps2 - - shll $2,%edx ## idx *= 4 - movq nb430nf_GBtab(%rbp),%rsi - - ## Coulomb - movapd (%rsi,%rdx,8),%xmm4 ## Y1 F1 - xorpd %xmm3,%xmm3 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 - unpckhpd %xmm3,%xmm5 ## F1 - - movapd 16(%rsi,%rdx,8),%xmm6 ## G1 H1 - xorpd %xmm3,%xmm3 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 - unpckhpd %xmm3,%xmm7 ## H1 - ## coulomb table ready, in xmm4-xmm7 - mulsd %xmm1,%xmm6 ## xmm6=Geps - mulsd %xmm2,%xmm7 ## xmm7=Heps2 - addsd %xmm6,%xmm5 - addsd %xmm7,%xmm5 ## xmm5=Fp - movapd nb430nf_qq(%rsp),%xmm3 - mulsd %xmm1,%xmm5 ## xmm5=eps*Fp - addsd %xmm4,%xmm5 ## xmm5=VV - mulsd %xmm3,%xmm5 ## vcoul=qq*VV - addsd nb430nf_vctot(%rsp),%xmm5 - movsd %xmm5,nb430nf_vctot(%rsp) - - movsd nb430nf_r(%rsp),%xmm4 - mulsd nb430nf_tsc(%rsp),%xmm4 - cvttsd2si %xmm4,%edx ## mm6 = lu idx - cvtsi2sd %edx,%xmm5 - subsd %xmm5,%xmm4 - movsd %xmm4,%xmm1 ## xmm1=eps - movsd %xmm1,%xmm2 - mulsd %xmm2,%xmm2 ## xmm2=eps2 - - shll $3,%edx - - movq nb430nf_VFtab(%rbp),%rsi - - ## Dispersion - movapd (%rsi,%rdx,8),%xmm4 ## Y1 F1 - xorpd %xmm3,%xmm3 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 - unpckhpd %xmm3,%xmm5 ## F1 - - movapd 16(%rsi,%rdx,8),%xmm6 ## G1 H1 - xorpd %xmm3,%xmm3 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 - unpckhpd %xmm3,%xmm7 ## H1 - ## Dispersion table ready, in xmm4-xmm7 - mulsd %xmm1,%xmm6 ## xmm6=Geps - mulsd %xmm2,%xmm7 ## xmm7=Heps2 - addsd %xmm6,%xmm5 - addsd %xmm7,%xmm5 ## xmm5=Fp - mulsd %xmm1,%xmm5 ## xmm5=eps*Fp - addsd %xmm4,%xmm5 ## xmm5=VV - - mulsd nb430nf_c6(%rsp),%xmm5 ## Vvdw6 - addsd nb430nf_Vvdwtot(%rsp),%xmm5 - movlpd %xmm5,nb430nf_Vvdwtot(%rsp) - - ## Repulsion - movapd 32(%rsi,%rdx,8),%xmm4 ## Y1 F1 - xorpd %xmm3,%xmm3 - movapd %xmm4,%xmm5 - unpcklpd %xmm3,%xmm4 ## Y1 - unpckhpd %xmm3,%xmm5 ## F1 - - movapd 48(%rsi,%rdx,8),%xmm6 ## G1 H1 - xorpd %xmm3,%xmm3 - movapd %xmm6,%xmm7 - unpcklpd %xmm3,%xmm6 ## G1 - unpckhpd %xmm3,%xmm7 ## H1 - ## Dispersion table ready, in xmm4-xmm7 - mulsd %xmm1,%xmm6 ## xmm6=Geps - mulsd %xmm2,%xmm7 ## xmm7=Heps2 - addsd %xmm6,%xmm5 - addsd %xmm7,%xmm5 ## xmm5=Fp - mulsd %xmm1,%xmm5 ## xmm5=eps*Fp - addsd %xmm4,%xmm5 ## xmm5=VV - mulsd nb430nf_c12(%rsp),%xmm5 ## Vvdw12 - addsd nb430nf_Vvdwtot(%rsp),%xmm5 - movlpd %xmm5,nb430nf_Vvdwtot(%rsp) -_nb_kernel430nf_x86_64_sse2.nb430nf_updateouterdata: - ## get n from stack - movl nb430nf_n(%rsp),%esi - ## get group index for i particle - movq nb430nf_gid(%rbp),%rdx ## base of gid[] - movl (%rdx,%rsi,4),%edx ## ggid=gid[n] - - ## accumulate total potential energy and update it - movapd nb430nf_vctot(%rsp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addsd %xmm6,%xmm7 ## low xmm7 has the sum now - - ## add earlier value from mem - movq nb430nf_Vc(%rbp),%rax - addsd (%rax,%rdx,8),%xmm7 - ## move back to mem - movsd %xmm7,(%rax,%rdx,8) - - ## accumulate total lj energy and update it - movapd nb430nf_Vvdwtot(%rsp),%xmm7 - ## accumulate - movhlps %xmm7,%xmm6 - addsd %xmm6,%xmm7 ## low xmm7 has the sum now - - ## add earlier value from mem - movq nb430nf_Vvdw(%rbp),%rax - addsd (%rax,%rdx,8),%xmm7 - ## move back to mem - movsd %xmm7,(%rax,%rdx,8) - - ## finish if last - movl nb430nf_nn1(%rsp),%ecx - ## esi already loaded with n - incl %esi - subl %esi,%ecx - jz _nb_kernel430nf_x86_64_sse2.nb430nf_outerend - - ## not last, iterate outer loop once more! - movl %esi,nb430nf_n(%rsp) - jmp _nb_kernel430nf_x86_64_sse2.nb430nf_outer -_nb_kernel430nf_x86_64_sse2.nb430nf_outerend: - ## check if more outer neighborlists remain - movl nb430nf_nri(%rsp),%ecx - ## esi already loaded with n above - subl %esi,%ecx - jz _nb_kernel430nf_x86_64_sse2.nb430nf_end - ## non-zero, do one more workunit - jmp _nb_kernel430nf_x86_64_sse2.nb430nf_threadloop -_nb_kernel430nf_x86_64_sse2.nb430nf_end: - movl nb430nf_nouter(%rsp),%eax - movl nb430nf_ninner(%rsp),%ebx - movq nb430nf_outeriter(%rbp),%rcx - movq nb430nf_inneriter(%rbp),%rdx - movl %eax,(%rcx) - movl %ebx,(%rdx) - - addq $392,%rsp - emms - - - pop %r15 - pop %r14 - pop %r13 - pop %r12 - - pop %rbx - pop %rbp - ret - - - - diff --git a/src/tools/CMakeLists.txt b/src/tools/CMakeLists.txt index 92c4360e7d..2c7b1f0069 100644 --- a/src/tools/CMakeLists.txt +++ b/src/tools/CMakeLists.txt @@ -32,7 +32,7 @@ add_library(gmxana gmx_membed.c ) -target_link_libraries(gmxana gmx) +target_link_libraries(gmxana md gmx) set_target_properties(gmxana PROPERTIES OUTPUT_NAME "gmxana${GMX_BINARY_SUFFIX}") # List of programs with single corresponding .c source file, -- 2.11.4.GIT