From 0f26a48719fe046a3aeab2048be302721cd7bad0 Mon Sep 17 00:00:00 2001
From: Berk Hess
Date: Mon, 10 Dec 2012 18:20:24 +0100
Subject: [PATCH] implemented nbnxn AVX-256 2x(N+N) kernels

With AVX-256, the nbnxn kernels are faster with a 2x(4+4), i.e. 4x4,
setup than with 4x8. To reduce the number of nbnxn kernels, there are no
longer separate simd128 and simd256 kernels; instead there are simd_4xn
kernels and, for AVX-256, additionally simd_2xnn kernels. The nbnxn
search code is simplified significantly, as it no longer has to handle
128-bit and 256-bit SIMD simultaneously. The SIMD pair-search and kernel
configuration is now separated from the SSE SIMD bounding-box code
configuration. Additionally, the label x86_simd in all nbnxn files is
replaced by simd only, in preparation for non-x86 SIMD acceleration.

Change-Id: If03cf569cf918dcec675fbcb43e1ff3a30528f74
---
 include/gmx_x86_avx_256.h | 17 +
 include/gmx_x86_simd_macros.h | 21 +-
 include/types/nb_verlet.h | 80 +++--
 src/kernel/calc_verletbuf.c | 13 +-
 src/kernel/pme_loadbal.c | 3 +-
 src/mdlib/forcerec.c | 81 ++---
 src/mdlib/nbnxn_atomdata.c | 30 +-
 src/mdlib/nbnxn_internal.h | 48 +--
 ...rnel_x86_simd128.c => nbnxn_kernel_simd_2xnn.c} | 69 ++--
 ...rnel_x86_simd256.h => nbnxn_kernel_simd_2xnn.h} | 29 +-
 ...ncludes.h => nbnxn_kernel_simd_2xnn_includes.h} | 18 +-
 ...simd_inner.h => nbnxn_kernel_simd_2xnn_inner.h} | 369 +++------------------
 ...simd_outer.h => nbnxn_kernel_simd_2xnn_outer.h} | 198 ++++-------
 ...ernel_x86_simd256.c => nbnxn_kernel_simd_4xn.c} | 69 ++--
 ...ernel_x86_simd128.h => nbnxn_kernel_simd_4xn.h} | 28 +-
 ...includes.h => nbnxn_kernel_simd_4xn_includes.h} | 18 +-
 ..._simd_inner.h => nbnxn_kernel_simd_4xn_inner.h} | 11 +-
 ..._simd_outer.h => nbnxn_kernel_simd_4xn_outer.h} | 52 ++-
 ..._x86_simd_utils.h => nbnxn_kernel_simd_utils.h} | 61 +++-
 src/mdlib/nbnxn_search.c | 217 ++++------
 src/mdlib/nbnxn_search_simd_2xnn.h | 262 +++++++++++++++
 ...n_search_x86_simd.h => nbnxn_search_simd_4xn.h} | 122 +++---
 src/mdlib/sim_util.c | 66 ++--
 23 files changed, 960 insertions(+), 922 deletions(-)
 rename src/mdlib/nbnxn_kernels/{nbnxn_kernel_x86_simd128.c => nbnxn_kernel_simd_2xnn.c} (83%)
 rename src/mdlib/nbnxn_kernels/{nbnxn_kernel_x86_simd256.h => nbnxn_kernel_simd_2xnn.h} (69%)
 copy src/mdlib/nbnxn_kernels/{nbnxn_kernel_x86_simd_includes.h => nbnxn_kernel_simd_2xnn_includes.h} (86%)
 copy src/mdlib/nbnxn_kernels/{nbnxn_kernel_x86_simd_inner.h => nbnxn_kernel_simd_2xnn_inner.h} (58%)
 copy src/mdlib/nbnxn_kernels/{nbnxn_kernel_x86_simd_outer.h => nbnxn_kernel_simd_2xnn_outer.h} (66%)
 rename src/mdlib/nbnxn_kernels/{nbnxn_kernel_x86_simd256.c => nbnxn_kernel_simd_4xn.c} (84%)
 rename src/mdlib/nbnxn_kernels/{nbnxn_kernel_x86_simd128.h => nbnxn_kernel_simd_4xn.h} (71%)
 rename src/mdlib/nbnxn_kernels/{nbnxn_kernel_x86_simd_includes.h => nbnxn_kernel_simd_4xn_includes.h} (86%)
 rename src/mdlib/nbnxn_kernels/{nbnxn_kernel_x86_simd_inner.h => nbnxn_kernel_simd_4xn_inner.h} (99%)
 rename src/mdlib/nbnxn_kernels/{nbnxn_kernel_x86_simd_outer.h => nbnxn_kernel_simd_4xn_outer.h} (95%)
 rename src/mdlib/nbnxn_kernels/{nbnxn_kernel_x86_simd_utils.h => nbnxn_kernel_simd_utils.h} (90%)
 create mode 100644 src/mdlib/nbnxn_search_simd_2xnn.h
 rename src/mdlib/{nbnxn_search_x86_simd.h => nbnxn_search_simd_4xn.h} (77%)

diff --git a/include/gmx_x86_avx_256.h b/include/gmx_x86_avx_256.h
index 1e444f46e9..9f266e834f 100644
--- a/include/gmx_x86_avx_256.h
+++ b/include/gmx_x86_avx_256.h
@@ -128,6 +128,16 @@ gmx_mm256_set_m128(__m128 hi, __m128 lo)
 }
+static gmx_inline __m256
+gmx_mm256_load4_ps(float const * p)
+{
+    __m128 
a; + + a = _mm_load_ps(p); + return _mm256_insertf128_ps(_mm256_castps128_ps256(a), a, 0x1); +} + + static __m256d gmx_mm256_unpack128lo_pd(__m256d xmm1, __m256d xmm2) { @@ -147,6 +157,13 @@ gmx_mm256_set_m128d(__m128d hi, __m128d lo) } +static __m128 gmx_mm256_sum4h_m128(__m256 x, __m256 y) +{ + __m256 sum; + + sum = _mm256_add_ps(x,y); + return _mm_add_ps(_mm256_castps256_ps128(sum),_mm256_extractf128_ps(sum,0x1)); +} static void diff --git a/include/gmx_x86_simd_macros.h b/include/gmx_x86_simd_macros.h index fd85c6e5e6..e1b1403c24 100644 --- a/include/gmx_x86_simd_macros.h +++ b/include/gmx_x86_simd_macros.h @@ -91,11 +91,11 @@ */ #if !defined GMX_MM128_HERE && !defined GMX_MM256_HERE -"You should define GMX_MM128_HERE or GMX_MM256_HERE" +#error "You should define GMX_MM128_HERE or GMX_MM256_HERE" #endif #if defined GMX_MM128_HERE && defined GMX_MM256_HERE -"You should not define both GMX_MM128_HERE and GMX_MM256_HERE" +#error "You should not define both GMX_MM128_HERE and GMX_MM256_HERE" #endif #ifdef GMX_MM128_HERE @@ -235,6 +235,23 @@ #define gmx_pmecorrF_pr gmx_mm256_pmecorrF_ps #define gmx_pmecorrV_pr gmx_mm256_pmecorrV_ps +#define gmx_loaddh_pr gmx_mm256_load4_ps + +/* Half SIMD-width type */ +#define gmx_mm_hpr __m128 + +/* Half SIMD-width macros */ +#define gmx_load_hpr _mm_load_ps +#define gmx_load1_hpr(x) _mm_set1_ps((x)[0]) +#define gmx_store_hpr _mm_store_ps +#define gmx_add_hpr _mm_add_ps +#define gmx_sub_hpr _mm_sub_ps + +#define gmx_sum4_hpr gmx_mm256_sum4h_m128 + +/* Conversion between half and full SIMD-width */ +#define gmx_2hpr_to_pr gmx_mm256_set_m128 + #else #include "gmx_x86_simd_double.h" diff --git a/include/types/nb_verlet.h b/include/types/nb_verlet.h index 32dd9432d7..8a33375c03 100644 --- a/include/types/nb_verlet.h +++ b/include/types/nb_verlet.h @@ -46,48 +46,66 @@ extern "C" { #endif -/*! Nonbonded NxN kernel types: plain C, SSE/AVX, GPU CUDA, GPU emulation, etc */ -enum { nbkNotSet = 0, - nbk4x4_PlainC, - nbk4xN_X86_SIMD128, - nbk4xN_X86_SIMD256, - nbk8x8x8_CUDA, - nbk8x8x8_PlainC }; +#ifdef GMX_X86_SSE2 +/* Use SIMD accelerated nbnxn search and kernels */ +#define GMX_NBNXN_SIMD + +#ifdef GMX_X86_AVX_256 +/* Comment out this define to use AVX-128 kernels with AVX-256 acceleration */ +#define GMX_NBNXN_SIMD_BITWIDTH 256 +#else +#define GMX_NBNXN_SIMD_BITWIDTH 128 +#endif + +/* The nbnxn SIMD 4xN and 2x(N+N) kernels can be added independently. + * Currently the 2xNN SIMD kernels only make sense and are only implemented + * with AVX-256 in single precision using a 4x4 cluster setup instead of 4x8. + */ +#define GMX_NBNXN_SIMD_4XN +#if GMX_NBNXN_SIMD_BITWIDTH == 256 && !defined GMX_DOUBLE +#define GMX_NBNXN_SIMD_2XNN +#endif + +#endif + + +/*! Nonbonded NxN kernel types: plain C, CPU SIMD, GPU CUDA, GPU emulation */ +typedef enum +{ + nbnxnkNotSet = 0, + nbnxnk4x4_PlainC, + nbnxnk4xN_SIMD_4xN, + nbnxnk4xN_SIMD_2xNN, + nbnxnk8x8x8_CUDA, + nbnxnk8x8x8_PlainC, + nbnxnkNR +} nbnxn_kernel_type; /* Note that _mm_... intrinsics can be converted to either SSE or AVX * depending on compiler flags. 
* For gcc we check for __AVX__ * At least a check for icc should be added (if there is a macro) */ -static const char *nbk_name[] = - { "not set", "plain C 4x4", -#if !(defined GMX_X86_AVX_256 || defined GMX_X86_AVX128_FMA || defined __AVX__) +static const char *nbnxn_kernel_name[nbnxnkNR] = + { "not set", "plain C", +#if !(defined GMX_X86_SSE2) + "not available", "not available", +#else +#if GMX_NBNXN_SIMD_BITWIDTH == 128 +#if !(defined GMX_X86_AVX_128_FMA || defined __AVX__) #ifndef GMX_X86_SSE4_1 -#ifndef GMX_DOUBLE - "SSE2 4x4", + "SSE2", "SSE2", #else - "SSE2 4x2", + "SSE4.1", "SSE4.1", #endif #else -#ifndef GMX_DOUBLE - "SSE4.1 4x4", -#else - "SSE4.1 4x2", + "AVX-128", "AVX-128", #endif -#endif -#else -#ifndef GMX_DOUBLE - "AVX-128 4x4", #else - "AVX-128 4x2", + "AVX-256", "AVX-256", #endif #endif -#ifndef GMX_DOUBLE - "AVX-256 4x8", -#else - "AVX-256 4x4", -#endif - "CUDA 8x8x8", "plain C 8x8x8" }; + "CUDA", "plain C" }; enum { ewaldexclTable, ewaldexclAnalytical }; @@ -119,9 +137,9 @@ typedef struct { /* non-bonded data structure with Verlet-type cut-off */ typedef struct { - nbnxn_search_t nbs; /* n vs n atom pair searching data */ - int ngrp; /* number of interaction groups */ - nonbonded_verlet_group_t grp[2];/* local and non-local interaction group */ + nbnxn_search_t nbs; /* n vs n atom pair searching data */ + int ngrp; /* number of interaction groups */ + nonbonded_verlet_group_t grp[2];/* local and non-local interaction group */ gmx_bool bUseGPU; /* TRUE when GPU acceleration is used */ nbnxn_cuda_ptr_t cu_nbv; /* pointer to CUDA nb verlet data */ diff --git a/src/kernel/calc_verletbuf.c b/src/kernel/calc_verletbuf.c index 8a1f6ef00d..68e3cf628c 100644 --- a/src/kernel/calc_verletbuf.c +++ b/src/kernel/calc_verletbuf.c @@ -78,17 +78,14 @@ void verletbuf_get_list_setup(gmx_bool bGPU, } else { -#ifndef GMX_X86_SSE2 +#ifndef GMX_NBNXN_SIMD list_setup->cluster_size_j = NBNXN_CPU_CLUSTER_I_SIZE; #else - int simd_width; - -#ifdef GMX_X86_AVX_256 - simd_width = 256; -#else - simd_width = 128; + list_setup->cluster_size_j = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8); +#ifdef GMX_NBNXN_SIMD_2XNN + /* We assume the smallest cluster size to be on the safe side */ + list_setup->cluster_size_j /= 2; #endif - list_setup->cluster_size_j = simd_width/(sizeof(real)*8); #endif } } diff --git a/src/kernel/pme_loadbal.c b/src/kernel/pme_loadbal.c index 6367ac25bd..a208f08c4f 100644 --- a/src/kernel/pme_loadbal.c +++ b/src/kernel/pme_loadbal.c @@ -624,7 +624,8 @@ gmx_bool pme_load_balance(pme_load_balancing_t pme_lb, ic->ewaldcoeff = set->ewaldcoeff; bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0); - if (pme_lb->cutoff_scheme == ecutsVERLET && nbv->grp[0].kernel_type == nbk8x8x8_CUDA) + if (pme_lb->cutoff_scheme == ecutsVERLET && + nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA) { nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv,ic); } diff --git a/src/mdlib/forcerec.c b/src/mdlib/forcerec.c index b773dff15f..ce27dda2b7 100644 --- a/src/mdlib/forcerec.c +++ b/src/mdlib/forcerec.c @@ -1403,41 +1403,48 @@ static void init_forcerec_f_threads(t_forcerec *fr,int nenergrp) static void pick_nbnxn_kernel_cpu(FILE *fp, const t_commrec *cr, const gmx_cpuid_t cpuid_info, + const t_inputrec *ir, int *kernel_type, int *ewald_excl) { - *kernel_type = nbk4x4_PlainC; + *kernel_type = nbnxnk4x4_PlainC; *ewald_excl = ewaldexclTable; -#ifdef GMX_X86_SSE2 +#ifdef GMX_NBNXN_SIMD { - /* On Intel Sandy-Bridge AVX-256 kernels are always faster. - * On AMD Bulldozer AVX-256 is much slower than AVX-128. 
- */ - if(gmx_cpuid_feature(cpuid_info, GMX_CPUID_FEATURE_X86_AVX) == 1 && - gmx_cpuid_vendor(cpuid_info) != GMX_CPUID_VENDOR_AMD) - { -#ifdef GMX_X86_AVX_256 - *kernel_type = nbk4xN_X86_SIMD256; -#else - *kernel_type = nbk4xN_X86_SIMD128; +#ifdef GMX_NBNXN_SIMD_4XN + *kernel_type = nbnxnk4xN_SIMD_4xN; #endif - } - else +#ifdef GMX_NBNXN_SIMD_2XNN + /* We expect the 2xNN kernels to be faster in most cases */ + *kernel_type = nbnxnk4xN_SIMD_2xNN; +#endif + +#if defined GMX_NBNXN_SIMD_4XN && defined GMX_X86_AVX_256 + if (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT) { - *kernel_type = nbk4xN_X86_SIMD128; + /* The raw pair rate of the 4x8 kernel is higher than 2x(4+4), + * 10% with HT, 50% without HT, but extra zeros interactions + * can compensate. As we currently don't detect the actual use + * of HT, switch to 4x8 to avoid a potential performance hit. + */ + *kernel_type = nbnxnk4xN_SIMD_4xN; } - - if (getenv("GMX_NBNXN_AVX128") != NULL) +#endif + if (getenv("GMX_NBNXN_SIMD_4XN") != NULL) { - *kernel_type = nbk4xN_X86_SIMD128; +#ifdef GMX_NBNXN_SIMD_2XNN + *kernel_type = nbnxnk4xN_SIMD_4xN; +#else + gmx_fatal(FARGS,"SIMD 4xN kernels requested, but Gromacs has been compiled without support for these kernels"); +#endif } - if (getenv("GMX_NBNXN_AVX256") != NULL) + if (getenv("GMX_NBNXN_SIMD_2XNN") != NULL) { -#ifdef GMX_X86_AVX_256 - *kernel_type = nbk4xN_X86_SIMD256; +#ifdef GMX_NBNXN_SIMD_2XNN + *kernel_type = nbnxnk4xN_SIMD_2xNN; #else - gmx_fatal(FARGS,"You requested AVX-256 nbnxn kernels, but GROMACS was built without AVX support"); + gmx_fatal(FARGS,"SIMD 2x(N+N) kernels requested, but Gromacs has been compiled without support for these kernels"); #endif } @@ -1466,6 +1473,7 @@ static void pick_nbnxn_kernel(FILE *fp, const gmx_hw_info_t *hwinfo, gmx_bool use_cpu_acceleration, gmx_bool *bUseGPU, + const t_inputrec *ir, int *kernel_type, int *ewald_excl, gmx_bool bDoNonbonded) @@ -1475,7 +1483,7 @@ static void pick_nbnxn_kernel(FILE *fp, assert(kernel_type); - *kernel_type = nbkNotSet; + *kernel_type = nbnxnkNotSet; *ewald_excl = ewaldexclTable; bEmulateGPUEnvVarSet = (getenv("GMX_EMULATE_GPU") != NULL); @@ -1521,7 +1529,7 @@ static void pick_nbnxn_kernel(FILE *fp, if (bEmulateGPU) { - *kernel_type = nbk8x8x8_PlainC; + *kernel_type = nbnxnk8x8x8_PlainC; if (bDoNonbonded) { @@ -1530,31 +1538,28 @@ static void pick_nbnxn_kernel(FILE *fp, } else if (bGPU) { - *kernel_type = nbk8x8x8_CUDA; + *kernel_type = nbnxnk8x8x8_CUDA; } - if (*kernel_type == nbkNotSet) + if (*kernel_type == nbnxnkNotSet) { if (use_cpu_acceleration) { - pick_nbnxn_kernel_cpu(fp,cr,hwinfo->cpuid_info, + pick_nbnxn_kernel_cpu(fp,cr,hwinfo->cpuid_info,ir, kernel_type,ewald_excl); } else { - *kernel_type = nbk4x4_PlainC; + *kernel_type = nbnxnk4x4_PlainC; } } if (bDoNonbonded && fp != NULL) { - if (MASTER(cr)) - { - fprintf(stderr,"Using %s non-bonded kernels\n", - nbk_name[*kernel_type]); - } - fprintf(fp,"\nUsing %s non-bonded kernels\n\n", - nbk_name[*kernel_type]); + fprintf(fp,"\nUsing %s %dx%d non-bonded kernels\n\n", + nbnxn_kernel_name[*kernel_type], + nbnxn_kernel_pairlist_simple(*kernel_type) ? 
NBNXN_CPU_CLUSTER_I_SIZE : NBNXN_GPU_CLUSTER_SIZE, + nbnxn_kernel_to_cj_size(*kernel_type)); } } @@ -1754,12 +1759,13 @@ static void init_nb_verlet(FILE *fp, { nbv->grp[i].nbl_lists.nnbl = 0; nbv->grp[i].nbat = NULL; - nbv->grp[i].kernel_type = nbkNotSet; + nbv->grp[i].kernel_type = nbnxnkNotSet; if (i == 0) /* local */ { pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration, &nbv->bUseGPU, + ir, &nbv->grp[i].kernel_type, &nbv->grp[i].ewald_excl, fr->bNonbonded); @@ -1771,6 +1777,7 @@ static void init_nb_verlet(FILE *fp, /* Use GPU for local, select a CPU kernel for non-local */ pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration, NULL, + ir, &nbv->grp[i].kernel_type, &nbv->grp[i].ewald_excl, fr->bNonbonded); @@ -1834,7 +1841,7 @@ static void init_nb_verlet(FILE *fp, for(i=0; ingrp; i++) { - if (nbv->grp[0].kernel_type == nbk8x8x8_CUDA) + if (nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA) { nb_alloc = &pmalloc; nb_free = &pfree; diff --git a/src/mdlib/nbnxn_atomdata.c b/src/mdlib/nbnxn_atomdata.c index aaa3f22c8e..ad231287cf 100644 --- a/src/mdlib/nbnxn_atomdata.c +++ b/src/mdlib/nbnxn_atomdata.c @@ -151,8 +151,8 @@ static void nbnxn_atomdata_output_init(nbnxn_atomdata_output_t *out, ma((void **)&out->Vvdw,out->nV*sizeof(*out->Vvdw)); ma((void **)&out->Vc ,out->nV*sizeof(*out->Vc )); - if (nb_kernel_type == nbk4xN_X86_SIMD128 || - nb_kernel_type == nbk4xN_X86_SIMD256) + if (nb_kernel_type == nbnxnk4xN_SIMD_4xN || + nb_kernel_type == nbnxnk4xN_SIMD_2xNN) { cj_size = nbnxn_kernel_to_cj_size(nb_kernel_type); out->nVS = nenergrp*nenergrp*stride*(cj_size>>1)*cj_size; @@ -598,17 +598,25 @@ void nbnxn_atomdata_init(FILE *fp, nbat->lj_comb = NULL; if (simple) { + int pack_x; + switch (nb_kernel_type) { - case nbk4xN_X86_SIMD128: - nbat->XFormat = nbatX4; - break; - case nbk4xN_X86_SIMD256: -#ifndef GMX_DOUBLE - nbat->XFormat = nbatX8; -#else - nbat->XFormat = nbatX4; -#endif + case nbnxnk4xN_SIMD_4xN: + case nbnxnk4xN_SIMD_2xNN: + pack_x = max(NBNXN_CPU_CLUSTER_I_SIZE, + nbnxn_kernel_to_cj_size(nb_kernel_type)); + switch (pack_x) + { + case 4: + nbat->XFormat = nbatX4; + break; + case 8: + nbat->XFormat = nbatX8; + break; + default: + gmx_incons("Unsupported packing width"); + } break; default: nbat->XFormat = nbatXYZ; diff --git a/src/mdlib/nbnxn_internal.h b/src/mdlib/nbnxn_internal.h index 9d0be66e30..8e9f005b7c 100644 --- a/src/mdlib/nbnxn_internal.h +++ b/src/mdlib/nbnxn_internal.h @@ -95,29 +95,32 @@ typedef struct { int nsubc_tot; /* Total number of subcell, used for printing */ } nbnxn_grid_t; -#ifdef NBNXN_SEARCH_SSE +#ifdef GMX_NBNXN_SIMD +#if GMX_NBNXN_SIMD_BITWIDTH == 128 #define GMX_MM128_HERE +#else +#if GMX_NBNXN_SIMD_BITWIDTH == 256 +#define GMX_MM256_HERE +#else +#error "unsupported GMX_NBNXN_SIMD_BITWIDTH" +#endif +#endif #include "gmx_x86_simd_macros.h" -typedef struct nbnxn_x_ci_x86_simd128 { + +typedef struct nbnxn_x_ci_simd_4xn { /* The i-cluster coordinates for simple search */ gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0; gmx_mm_pr ix_SSE1,iy_SSE1,iz_SSE1; gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2; gmx_mm_pr ix_SSE3,iy_SSE3,iz_SSE3; -} nbnxn_x_ci_x86_simd128_t; -#undef GMX_MM128_HERE -#ifdef GMX_X86_AVX_256 -#define GMX_MM256_HERE -#include "gmx_x86_simd_macros.h" -typedef struct nbnxn_x_ci_x86_simd256 { +} nbnxn_x_ci_simd_4xn_t; + +typedef struct nbnxn_x_ci_simd_2xnn { /* The i-cluster coordinates for simple search */ gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0; - gmx_mm_pr ix_SSE1,iy_SSE1,iz_SSE1; gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2; - gmx_mm_pr ix_SSE3,iy_SSE3,iz_SSE3; -} 
nbnxn_x_ci_x86_simd256_t; -#undef GMX_MM256_HERE -#endif +} nbnxn_x_ci_simd_2xnn_t; + #endif /* Working data for the actual i-supercell during pair search */ @@ -126,11 +129,9 @@ typedef struct nbnxn_list_work { float *bb_ci; /* The bounding boxes, pbc shifted, for each cluster */ real *x_ci; /* The coordinates, pbc shifted, for each atom */ -#ifdef NBNXN_SEARCH_SSE - nbnxn_x_ci_x86_simd128_t *x_ci_x86_simd128; -#ifdef GMX_X86_AVX_256 - nbnxn_x_ci_x86_simd256_t *x_ci_x86_simd256; -#endif +#ifdef GMX_NBNXN_SIMD + nbnxn_x_ci_simd_4xn_t *x_ci_simd_4xn; + nbnxn_x_ci_simd_2xnn_t *x_ci_simd_2xnn; #endif int cj_ind; /* The current cj_ind index for the current list */ int cj4_init; /* The first unitialized cj4 block */ @@ -155,17 +156,18 @@ gmx_icell_set_x_t(int ci, nbnxn_list_work_t *work); static gmx_icell_set_x_t icell_set_x_simple; -#ifdef NBNXN_SEARCH_SSE -static gmx_icell_set_x_t icell_set_x_simple_x86_simd128; -#ifdef GMX_X86_AVX_256 -static gmx_icell_set_x_t icell_set_x_simple_x86_simd256; -#endif +#ifdef GMX_NBNXN_SIMD +static gmx_icell_set_x_t icell_set_x_simple_simd_4xn; +static gmx_icell_set_x_t icell_set_x_simple_simd_2xnn; #endif static gmx_icell_set_x_t icell_set_x_supersub; #ifdef NBNXN_SEARCH_SSE static gmx_icell_set_x_t icell_set_x_supersub_sse8; #endif +#undef GMX_MM128_HERE +#undef GMX_MM256_HERE + /* Local cycle count struct for profiling */ typedef struct { int count; diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.c b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn.c similarity index 83% rename from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.c rename to src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn.c index 8018f65f5e..a294bad9c0 100644 --- a/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.c +++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn.c @@ -49,18 +49,26 @@ #include "../nbnxn_consts.h" #include "nbnxn_kernel_common.h" -#ifdef GMX_X86_SSE2 +#ifdef GMX_NBNXN_SIMD_2XNN -#include "nbnxn_kernel_x86_simd128.h" +#include "nbnxn_kernel_simd_2xnn.h" -/* Include all flavors of the 128-bit SSE or AVX kernel loops */ +/* Include all flavors of the SSE or AVX 2x(N+N) kernel loops */ +#if GMX_NBNXN_SIMD_BITWIDTH == 128 #define GMX_MM128_HERE +#else +#if GMX_NBNXN_SIMD_BITWIDTH == 256 +#define GMX_MM256_HERE +#else +#error "unsupported GMX_NBNXN_SIMD_BITWIDTH" +#endif +#endif /* Analytical reaction-field kernels */ #define CALC_COUL_RF -#include "nbnxn_kernel_x86_simd_includes.h" +#include "nbnxn_kernel_simd_2xnn_includes.h" #undef CALC_COUL_RF @@ -68,11 +76,11 @@ #define CALC_COUL_TAB /* Single cut-off: rcoulomb = rvdw */ -#include "nbnxn_kernel_x86_simd_includes.h" +#include "nbnxn_kernel_simd_2xnn_includes.h" /* Twin cut-off: rcoulomb >= rvdw */ #define VDW_CUTOFF_CHECK -#include "nbnxn_kernel_x86_simd_includes.h" +#include "nbnxn_kernel_simd_2xnn_includes.h" #undef VDW_CUTOFF_CHECK #undef CALC_COUL_TAB @@ -81,11 +89,11 @@ #define CALC_COUL_EWALD /* Single cut-off: rcoulomb = rvdw */ -#include "nbnxn_kernel_x86_simd_includes.h" +#include "nbnxn_kernel_simd_2xnn_includes.h" /* Twin cut-off: rcoulomb >= rvdw */ #define VDW_CUTOFF_CHECK -#include "nbnxn_kernel_x86_simd_includes.h" +#include "nbnxn_kernel_simd_2xnn_includes.h" #undef VDW_CUTOFF_CHECK #undef CALC_COUL_EWALD @@ -109,7 +117,7 @@ typedef void (*p_nbk_func_noener)(const nbnxn_pairlist_t *nbl, enum { coultRF, coultTAB, coultTAB_TWIN, coultEWALD, coultEWALD_TWIN, coultNR }; -#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_ener +#define NBK_FN(elec,ljcomb) 
nbnxn_kernel_simd_2xnn_##elec##_comb_##ljcomb##_ener static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] = { { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) }, { NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) }, @@ -118,7 +126,7 @@ static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] = { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } }; #undef NBK_FN -#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_energrp +#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_2xnn_##elec##_comb_##ljcomb##_energrp static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] = { { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) }, { NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) }, @@ -127,7 +135,7 @@ static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] = { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } }; #undef NBK_FN -#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_noener +#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_2xnn_##elec##_comb_##ljcomb##_noener static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] = { { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) }, { NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) }, @@ -141,15 +149,14 @@ static void reduce_group_energies(int ng,int ng_2log, const real *VSvdw,const real *VSc, real *Vvdw,real *Vc) { + const int simd_width = GMX_X86_SIMD_WIDTH_HERE; + const int unrollj_half = GMX_X86_SIMD_WIDTH_HERE/4; int ng_p2,i,j,j0,j1,c,s; -#define SIMD_WIDTH (GMX_X86_SIMD_WIDTH_HERE) -#define SIMD_WIDTH_HALF (GMX_X86_SIMD_WIDTH_HERE/2) - ng_p2 = (1<>(2*UNROLLJ))); /* Intel Compiler version 12.1.3 20120130 is buggy: use cast. * With gcc we don't need the cast, but it's faster. */ #define cast_cvt(x) _mm256_cvtepi32_ps(_mm256_castps_si256(x)) int_SSE0 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr,mask0)),zero_SSE); - int_SSE1 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr,mask1)),zero_SSE); - int_SSE2 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(masksh_pr,mask0)),zero_SSE); - int_SSE3 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(masksh_pr,mask1)),zero_SSE); -#undef cast_cvt -#else - /* Load integer interaction mask */ - /* With AVX there are no integer operations, - * and there is no int to double conversion, so cast to float - */ - __m256 mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(l_cj[cjind].excl)); -#define cast_cvt(x) _mm256_castps_pd(_mm256_cvtepi32_ps(_mm256_castps_si256(x))) - int_SSE0 = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps,mask0)),zero_SSE); - int_SSE1 = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps,mask1)),zero_SSE); - int_SSE2 = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps,mask2)),zero_SSE); - int_SSE3 = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps,mask3)),zero_SSE); + int_SSE2 = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr,mask2)),zero_SSE); #undef cast_cvt -#endif } #endif -#endif /* load j atom coordinates */ - jxSSE = gmx_load_pr(x+ajx); - jySSE = gmx_load_pr(x+ajy); - jzSSE = gmx_load_pr(x+ajz); - + jxSSE = gmx_loaddh_pr(x+ajx); + jySSE = gmx_loaddh_pr(x+ajy); + jzSSE = gmx_loaddh_pr(x+ajz); + /* Calculate distance */ dx_SSE0 = gmx_sub_pr(ix_SSE0,jxSSE); dy_SSE0 = gmx_sub_pr(iy_SSE0,jySSE); dz_SSE0 = gmx_sub_pr(iz_SSE0,jzSSE); - dx_SSE1 = gmx_sub_pr(ix_SSE1,jxSSE); - dy_SSE1 = gmx_sub_pr(iy_SSE1,jySSE); - dz_SSE1 = gmx_sub_pr(iz_SSE1,jzSSE); dx_SSE2 = gmx_sub_pr(ix_SSE2,jxSSE); dy_SSE2 = gmx_sub_pr(iy_SSE2,jySSE); dz_SSE2 = gmx_sub_pr(iz_SSE2,jzSSE); - dx_SSE3 = gmx_sub_pr(ix_SSE3,jxSSE); - dy_SSE3 = gmx_sub_pr(iy_SSE3,jySSE); - dz_SSE3 = 
gmx_sub_pr(iz_SSE3,jzSSE); - + /* rsq = dx*dx+dy*dy+dz*dz */ rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0); - rsq_SSE1 = gmx_calc_rsq_pr(dx_SSE1,dy_SSE1,dz_SSE1); rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2,dy_SSE2,dz_SSE2); - rsq_SSE3 = gmx_calc_rsq_pr(dx_SSE3,dy_SSE3,dz_SSE3); #ifndef CUTOFF_BLENDV wco_SSE0 = gmx_cmplt_pr(rsq_SSE0,rc2_SSE); - wco_SSE1 = gmx_cmplt_pr(rsq_SSE1,rc2_SSE); wco_SSE2 = gmx_cmplt_pr(rsq_SSE2,rc2_SSE); - wco_SSE3 = gmx_cmplt_pr(rsq_SSE3,rc2_SSE); #endif #ifdef CHECK_EXCLS @@ -368,49 +295,15 @@ if (cj == ci_sh) { wco_SSE0 = gmx_and_pr(wco_SSE0,diag_SSE0); - wco_SSE1 = gmx_and_pr(wco_SSE1,diag_SSE1); wco_SSE2 = gmx_and_pr(wco_SSE2,diag_SSE2); - wco_SSE3 = gmx_and_pr(wco_SSE3,diag_SSE3); } #else -#if UNROLLJ < UNROLLI - if (cj == ci_sh*2) - { - wco_SSE0 = gmx_and_pr(wco_SSE0,diag0_SSE0); - wco_SSE1 = gmx_and_pr(wco_SSE1,diag0_SSE1); - wco_SSE2 = gmx_and_pr(wco_SSE2,diag0_SSE2); - wco_SSE3 = gmx_and_pr(wco_SSE3,diag0_SSE3); - } - if (cj == ci_sh*2 + 1) - { - wco_SSE0 = gmx_and_pr(wco_SSE0,diag1_SSE0); - wco_SSE1 = gmx_and_pr(wco_SSE1,diag1_SSE1); - wco_SSE2 = gmx_and_pr(wco_SSE2,diag1_SSE2); - wco_SSE3 = gmx_and_pr(wco_SSE3,diag1_SSE3); - } -#else - if (cj*2 == ci_sh) - { - wco_SSE0 = gmx_and_pr(wco_SSE0,diag0_SSE0); - wco_SSE1 = gmx_and_pr(wco_SSE1,diag0_SSE1); - wco_SSE2 = gmx_and_pr(wco_SSE2,diag0_SSE2); - wco_SSE3 = gmx_and_pr(wco_SSE3,diag0_SSE3); - } - else if (cj*2 + 1 == ci_sh) - { - wco_SSE0 = gmx_and_pr(wco_SSE0,diag1_SSE0); - wco_SSE1 = gmx_and_pr(wco_SSE1,diag1_SSE1); - wco_SSE2 = gmx_and_pr(wco_SSE2,diag1_SSE2); - wco_SSE3 = gmx_and_pr(wco_SSE3,diag1_SSE3); - } -#endif +#error "only UNROLLJ == UNROLLI currently supported in the joined kernels" #endif #else /* EXCL_FORCES */ /* Remove all excluded atom pairs from the list */ wco_SSE0 = gmx_and_pr(wco_SSE0,int_SSE0); - wco_SSE1 = gmx_and_pr(wco_SSE1,int_SSE1); wco_SSE2 = gmx_and_pr(wco_SSE2,int_SSE2); - wco_SSE3 = gmx_and_pr(wco_SSE3,int_SSE3); #endif #endif @@ -435,72 +328,51 @@ #ifdef CHECK_EXCLS /* For excluded pairs add a small number to avoid r^-6 = NaN */ rsq_SSE0 = gmx_add_pr(rsq_SSE0,gmx_andnot_pr(int_SSE0,avoid_sing_SSE)); - rsq_SSE1 = gmx_add_pr(rsq_SSE1,gmx_andnot_pr(int_SSE1,avoid_sing_SSE)); rsq_SSE2 = gmx_add_pr(rsq_SSE2,gmx_andnot_pr(int_SSE2,avoid_sing_SSE)); - rsq_SSE3 = gmx_add_pr(rsq_SSE3,gmx_andnot_pr(int_SSE3,avoid_sing_SSE)); #endif /* Calculate 1/r */ -#ifndef GMX_DOUBLE rinv_SSE0 = gmx_invsqrt_pr(rsq_SSE0); - rinv_SSE1 = gmx_invsqrt_pr(rsq_SSE1); rinv_SSE2 = gmx_invsqrt_pr(rsq_SSE2); - rinv_SSE3 = gmx_invsqrt_pr(rsq_SSE3); -#else - GMX_MM_INVSQRT2_PD(rsq_SSE0,rsq_SSE1,rinv_SSE0,rinv_SSE1); - GMX_MM_INVSQRT2_PD(rsq_SSE2,rsq_SSE3,rinv_SSE2,rinv_SSE3); -#endif #ifdef CALC_COULOMB /* Load parameters for j atom */ - jq_SSE = gmx_load_pr(q+aj); + jq_SSE = gmx_loaddh_pr(q+aj); qq_SSE0 = gmx_mul_pr(iq_SSE0,jq_SSE); - qq_SSE1 = gmx_mul_pr(iq_SSE1,jq_SSE); qq_SSE2 = gmx_mul_pr(iq_SSE2,jq_SSE); - qq_SSE3 = gmx_mul_pr(iq_SSE3,jq_SSE); #endif #ifdef CALC_LJ #if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C - load_lj_pair_params(nbfp0,type,aj,c6_SSE0,c12_SSE0); - load_lj_pair_params(nbfp1,type,aj,c6_SSE1,c12_SSE1); + load_lj_pair_params2(nbfp0,type,aj,c6_SSE0,c12_SSE0); #ifndef HALF_LJ - load_lj_pair_params(nbfp2,type,aj,c6_SSE2,c12_SSE2); - load_lj_pair_params(nbfp3,type,aj,c6_SSE3,c12_SSE3); + load_lj_pair_params2(nbfp2,type,aj,c6_SSE2,c12_SSE2); #endif #endif /* not defined any LJ rule */ #ifdef LJ_COMB_GEOM - c6s_j_SSE = gmx_load_pr(ljc+aj2+0); - c12s_j_SSE = 
gmx_load_pr(ljc+aj2+STRIDE); + c6s_j_SSE = gmx_loaddh_pr(ljc+aj2+0); + c12s_j_SSE = gmx_loaddh_pr(ljc+aj2+STRIDE); c6_SSE0 = gmx_mul_pr(c6s_SSE0 ,c6s_j_SSE ); - c6_SSE1 = gmx_mul_pr(c6s_SSE1 ,c6s_j_SSE ); #ifndef HALF_LJ c6_SSE2 = gmx_mul_pr(c6s_SSE2 ,c6s_j_SSE ); - c6_SSE3 = gmx_mul_pr(c6s_SSE3 ,c6s_j_SSE ); #endif c12_SSE0 = gmx_mul_pr(c12s_SSE0,c12s_j_SSE); - c12_SSE1 = gmx_mul_pr(c12s_SSE1,c12s_j_SSE); #ifndef HALF_LJ c12_SSE2 = gmx_mul_pr(c12s_SSE2,c12s_j_SSE); - c12_SSE3 = gmx_mul_pr(c12s_SSE3,c12s_j_SSE); #endif #endif /* LJ_COMB_GEOM */ #ifdef LJ_COMB_LB - hsig_j_SSE = gmx_load_pr(ljc+aj2+0); - seps_j_SSE = gmx_load_pr(ljc+aj2+STRIDE); + hsig_j_SSE = gmx_loaddh_pr(ljc+aj2+0); + seps_j_SSE = gmx_loaddh_pr(ljc+aj2+STRIDE); sig_SSE0 = gmx_add_pr(hsig_i_SSE0,hsig_j_SSE); - sig_SSE1 = gmx_add_pr(hsig_i_SSE1,hsig_j_SSE); eps_SSE0 = gmx_mul_pr(seps_i_SSE0,seps_j_SSE); - eps_SSE1 = gmx_mul_pr(seps_i_SSE1,seps_j_SSE); #ifndef HALF_LJ sig_SSE2 = gmx_add_pr(hsig_i_SSE2,hsig_j_SSE); - sig_SSE3 = gmx_add_pr(hsig_i_SSE3,hsig_j_SSE); eps_SSE2 = gmx_mul_pr(seps_i_SSE2,seps_j_SSE); - eps_SSE3 = gmx_mul_pr(seps_i_SSE3,seps_j_SSE); #endif #endif /* LJ_COMB_LB */ @@ -508,21 +380,15 @@ #ifndef CUTOFF_BLENDV rinv_SSE0 = gmx_and_pr(rinv_SSE0,wco_SSE0); - rinv_SSE1 = gmx_and_pr(rinv_SSE1,wco_SSE1); rinv_SSE2 = gmx_and_pr(rinv_SSE2,wco_SSE2); - rinv_SSE3 = gmx_and_pr(rinv_SSE3,wco_SSE3); #else /* We only need to mask for the cut-off: blendv is faster */ rinv_SSE0 = gmx_blendv_pr(rinv_SSE0,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE0)); - rinv_SSE1 = gmx_blendv_pr(rinv_SSE1,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE1)); rinv_SSE2 = gmx_blendv_pr(rinv_SSE2,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE2)); - rinv_SSE3 = gmx_blendv_pr(rinv_SSE3,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE3)); #endif rinvsq_SSE0 = gmx_mul_pr(rinv_SSE0,rinv_SSE0); - rinvsq_SSE1 = gmx_mul_pr(rinv_SSE1,rinv_SSE1); rinvsq_SSE2 = gmx_mul_pr(rinv_SSE2,rinv_SSE2); - rinvsq_SSE3 = gmx_mul_pr(rinv_SSE3,rinv_SSE3); #ifdef CALC_COULOMB /* Note that here we calculate force*r, not the usual force/r. 
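[Annotation, not part of the patch] The gmx_loaddh_pr loads above are the heart of the 2x(N+N) scheme: one 256-bit register holds the four j-atom values duplicated into both 128-bit halves (gmx_mm256_load4_ps from the first hunk), while the i-atom values were loaded in the outer loop with one i-atom broadcast per 128-bit half (the gmx_load2_hpr macro further down in the outer-loop file). The sketch below is illustrative only: the helper names are made up, unaligned loads are used for simplicity, and it assumes AVX-256 in single precision, whereas the kernel itself works on aligned, padded arrays through the gmx_*_pr macros.

/* Hypothetical helpers showing the 2x(4+4) data layout with plain AVX. */
#include <immintrin.h>

/* Duplicate four j-atom values into both 128-bit halves,
 * as gmx_loaddh_pr / gmx_mm256_load4_ps do. */
static __m256 load_j4_dup(const float *j)
{
    __m128 a = _mm_loadu_ps(j);                               /* j0 j1 j2 j3 */
    return _mm256_insertf128_ps(_mm256_castps128_ps256(a), a, 0x1);
}

/* Broadcast i-atom 0 into the low half and i-atom 1 into the high half,
 * as the gmx_load2_hpr macro does in the outer loop. */
static __m256 load_i2_bcast(const float *i)
{
    __m256 lo = _mm256_castps128_ps256(_mm_set1_ps(i[0]));
    return _mm256_insertf128_ps(lo, _mm_set1_ps(i[1]), 0x1);
}

/* A single 256-bit subtraction then yields the x-distances of
 * 2 i-atoms x 4 j-atoms = 8 pairs, i.e. one 2x(4+4) block. */
static __m256 dx_2x4(const float *ix, const float *jx)
{
    return _mm256_sub_ps(load_i2_bcast(ix), load_j4_dup(jx));
}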
@@ -534,29 +400,21 @@ #ifdef EXCL_FORCES /* Only add 1/r for non-excluded atom pairs */ rinv_ex_SSE0 = gmx_and_pr(rinv_SSE0,int_SSE0); - rinv_ex_SSE1 = gmx_and_pr(rinv_SSE1,int_SSE1); rinv_ex_SSE2 = gmx_and_pr(rinv_SSE2,int_SSE2); - rinv_ex_SSE3 = gmx_and_pr(rinv_SSE3,int_SSE3); #else /* No exclusion forces, we always need 1/r */ #define rinv_ex_SSE0 rinv_SSE0 -#define rinv_ex_SSE1 rinv_SSE1 #define rinv_ex_SSE2 rinv_SSE2 -#define rinv_ex_SSE3 rinv_SSE3 #endif #ifdef CALC_COUL_RF /* Electrostatic interactions */ frcoul_SSE0 = gmx_mul_pr(qq_SSE0,gmx_add_pr(rinv_ex_SSE0,gmx_mul_pr(rsq_SSE0,mrc_3_SSE))); - frcoul_SSE1 = gmx_mul_pr(qq_SSE1,gmx_add_pr(rinv_ex_SSE1,gmx_mul_pr(rsq_SSE1,mrc_3_SSE))); frcoul_SSE2 = gmx_mul_pr(qq_SSE2,gmx_add_pr(rinv_ex_SSE2,gmx_mul_pr(rsq_SSE2,mrc_3_SSE))); - frcoul_SSE3 = gmx_mul_pr(qq_SSE3,gmx_add_pr(rinv_ex_SSE3,gmx_mul_pr(rsq_SSE3,mrc_3_SSE))); #ifdef CALC_ENERGIES vcoul_SSE0 = gmx_mul_pr(qq_SSE0,gmx_add_pr(rinv_ex_SSE0,gmx_add_pr(gmx_mul_pr(rsq_SSE0,hrc_3_SSE),moh_rc_SSE))); - vcoul_SSE1 = gmx_mul_pr(qq_SSE1,gmx_add_pr(rinv_ex_SSE1,gmx_add_pr(gmx_mul_pr(rsq_SSE1,hrc_3_SSE),moh_rc_SSE))); vcoul_SSE2 = gmx_mul_pr(qq_SSE2,gmx_add_pr(rinv_ex_SSE2,gmx_add_pr(gmx_mul_pr(rsq_SSE2,hrc_3_SSE),moh_rc_SSE))); - vcoul_SSE3 = gmx_mul_pr(qq_SSE3,gmx_add_pr(rinv_ex_SSE3,gmx_add_pr(gmx_mul_pr(rsq_SSE3,hrc_3_SSE),moh_rc_SSE))); #endif #endif @@ -566,30 +424,20 @@ */ #ifndef CUTOFF_BLENDV brsq_SSE0 = gmx_mul_pr(beta2_SSE,gmx_and_pr(rsq_SSE0,wco_SSE0)); - brsq_SSE1 = gmx_mul_pr(beta2_SSE,gmx_and_pr(rsq_SSE1,wco_SSE1)); brsq_SSE2 = gmx_mul_pr(beta2_SSE,gmx_and_pr(rsq_SSE2,wco_SSE2)); - brsq_SSE3 = gmx_mul_pr(beta2_SSE,gmx_and_pr(rsq_SSE3,wco_SSE3)); #else /* Strangely, putting mul on a separate line is slower (icc 13) */ brsq_SSE0 = gmx_mul_pr(beta2_SSE,gmx_blendv_pr(rsq_SSE0,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE0))); - brsq_SSE1 = gmx_mul_pr(beta2_SSE,gmx_blendv_pr(rsq_SSE1,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE1))); brsq_SSE2 = gmx_mul_pr(beta2_SSE,gmx_blendv_pr(rsq_SSE2,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE2))); - brsq_SSE3 = gmx_mul_pr(beta2_SSE,gmx_blendv_pr(rsq_SSE3,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE3))); #endif ewcorr_SSE0 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE0),beta_SSE); - ewcorr_SSE1 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE1),beta_SSE); ewcorr_SSE2 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE2),beta_SSE); - ewcorr_SSE3 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE3),beta_SSE); frcoul_SSE0 = gmx_mul_pr(qq_SSE0,gmx_add_pr(rinv_ex_SSE0,gmx_mul_pr(ewcorr_SSE0,brsq_SSE0))); - frcoul_SSE1 = gmx_mul_pr(qq_SSE1,gmx_add_pr(rinv_ex_SSE1,gmx_mul_pr(ewcorr_SSE1,brsq_SSE1))); frcoul_SSE2 = gmx_mul_pr(qq_SSE2,gmx_add_pr(rinv_ex_SSE2,gmx_mul_pr(ewcorr_SSE2,brsq_SSE2))); - frcoul_SSE3 = gmx_mul_pr(qq_SSE3,gmx_add_pr(rinv_ex_SSE3,gmx_mul_pr(ewcorr_SSE3,brsq_SSE3))); #ifdef CALC_ENERGIES vc_sub_SSE0 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE0),beta_SSE); - vc_sub_SSE1 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE1),beta_SSE); vc_sub_SSE2 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE2),beta_SSE); - vc_sub_SSE3 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE3),beta_SSE); #endif #endif /* CALC_COUL_EWALD */ @@ -597,35 +445,23 @@ #ifdef CALC_COUL_TAB /* Electrostatic interactions */ r_SSE0 = gmx_mul_pr(rsq_SSE0,rinv_SSE0); - r_SSE1 = gmx_mul_pr(rsq_SSE1,rinv_SSE1); r_SSE2 = gmx_mul_pr(rsq_SSE2,rinv_SSE2); - r_SSE3 = gmx_mul_pr(rsq_SSE3,rinv_SSE3); /* Convert r to scaled table units */ rs_SSE0 = gmx_mul_pr(r_SSE0,invtsp_SSE); - rs_SSE1 = gmx_mul_pr(r_SSE1,invtsp_SSE); rs_SSE2 = gmx_mul_pr(r_SSE2,invtsp_SSE); - rs_SSE3 = 
gmx_mul_pr(r_SSE3,invtsp_SSE); /* Truncate scaled r to an int */ ti_SSE0 = gmx_cvttpr_epi32(rs_SSE0); - ti_SSE1 = gmx_cvttpr_epi32(rs_SSE1); ti_SSE2 = gmx_cvttpr_epi32(rs_SSE2); - ti_SSE3 = gmx_cvttpr_epi32(rs_SSE3); #ifdef GMX_X86_SSE4_1 /* SSE4.1 floor is faster than gmx_cvtepi32_ps int->float cast */ rf_SSE0 = gmx_floor_pr(rs_SSE0); - rf_SSE1 = gmx_floor_pr(rs_SSE1); rf_SSE2 = gmx_floor_pr(rs_SSE2); - rf_SSE3 = gmx_floor_pr(rs_SSE3); #else rf_SSE0 = gmx_cvtepi32_pr(ti_SSE0); - rf_SSE1 = gmx_cvtepi32_pr(ti_SSE1); rf_SSE2 = gmx_cvtepi32_pr(ti_SSE2); - rf_SSE3 = gmx_cvtepi32_pr(ti_SSE3); #endif frac_SSE0 = gmx_sub_pr(rs_SSE0,rf_SSE0); - frac_SSE1 = gmx_sub_pr(rs_SSE1,rf_SSE1); frac_SSE2 = gmx_sub_pr(rs_SSE2,rf_SSE2); - frac_SSE3 = gmx_sub_pr(rs_SSE3,rf_SSE3); /* Load and interpolate table forces and possibly energies. * Force and energy can be combined in one table, stride 4: FDV0 @@ -634,36 +470,24 @@ */ #ifndef CALC_ENERGIES load_table_f(tab_coul_F,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0); - load_table_f(tab_coul_F,ti_SSE1,ti1,ctab0_SSE1,ctab1_SSE1); load_table_f(tab_coul_F,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2); - load_table_f(tab_coul_F,ti_SSE3,ti3,ctab0_SSE3,ctab1_SSE3); #else #ifdef TAB_FDV0 load_table_f_v(tab_coul_F,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0,ctabv_SSE0); - load_table_f_v(tab_coul_F,ti_SSE1,ti1,ctab0_SSE1,ctab1_SSE1,ctabv_SSE1); load_table_f_v(tab_coul_F,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2,ctabv_SSE2); - load_table_f_v(tab_coul_F,ti_SSE3,ti3,ctab0_SSE3,ctab1_SSE3,ctabv_SSE3); #else load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0,ctabv_SSE0); - load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE1,ti1,ctab0_SSE1,ctab1_SSE1,ctabv_SSE1); load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2,ctabv_SSE2); - load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE3,ti3,ctab0_SSE3,ctab1_SSE3,ctabv_SSE3); #endif #endif fsub_SSE0 = gmx_add_pr(ctab0_SSE0,gmx_mul_pr(frac_SSE0,ctab1_SSE0)); - fsub_SSE1 = gmx_add_pr(ctab0_SSE1,gmx_mul_pr(frac_SSE1,ctab1_SSE1)); fsub_SSE2 = gmx_add_pr(ctab0_SSE2,gmx_mul_pr(frac_SSE2,ctab1_SSE2)); - fsub_SSE3 = gmx_add_pr(ctab0_SSE3,gmx_mul_pr(frac_SSE3,ctab1_SSE3)); frcoul_SSE0 = gmx_mul_pr(qq_SSE0,gmx_sub_pr(rinv_ex_SSE0,gmx_mul_pr(fsub_SSE0,r_SSE0))); - frcoul_SSE1 = gmx_mul_pr(qq_SSE1,gmx_sub_pr(rinv_ex_SSE1,gmx_mul_pr(fsub_SSE1,r_SSE1))); frcoul_SSE2 = gmx_mul_pr(qq_SSE2,gmx_sub_pr(rinv_ex_SSE2,gmx_mul_pr(fsub_SSE2,r_SSE2))); - frcoul_SSE3 = gmx_mul_pr(qq_SSE3,gmx_sub_pr(rinv_ex_SSE3,gmx_mul_pr(fsub_SSE3,r_SSE3))); #ifdef CALC_ENERGIES vc_sub_SSE0 = gmx_add_pr(ctabv_SSE0,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE0),gmx_add_pr(ctab0_SSE0,fsub_SSE0))); - vc_sub_SSE1 = gmx_add_pr(ctabv_SSE1,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE1),gmx_add_pr(ctab0_SSE1,fsub_SSE1))); vc_sub_SSE2 = gmx_add_pr(ctabv_SSE2,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE2),gmx_add_pr(ctab0_SSE2,fsub_SSE2))); - vc_sub_SSE3 = gmx_add_pr(ctabv_SSE3,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE3),gmx_add_pr(ctab0_SSE3,fsub_SSE3))); #endif #endif /* CALC_COUL_TAB */ @@ -672,30 +496,21 @@ /* Add Ewald potential shift to vc_sub for convenience */ #ifdef CHECK_EXCLS vc_sub_SSE0 = gmx_add_pr(vc_sub_SSE0,gmx_and_pr(sh_ewald_SSE,int_SSE0)); - vc_sub_SSE1 = gmx_add_pr(vc_sub_SSE1,gmx_and_pr(sh_ewald_SSE,int_SSE1)); vc_sub_SSE2 = gmx_add_pr(vc_sub_SSE2,gmx_and_pr(sh_ewald_SSE,int_SSE2)); - vc_sub_SSE3 = gmx_add_pr(vc_sub_SSE3,gmx_and_pr(sh_ewald_SSE,int_SSE3)); #else vc_sub_SSE0 = gmx_add_pr(vc_sub_SSE0,sh_ewald_SSE); - vc_sub_SSE1 = gmx_add_pr(vc_sub_SSE1,sh_ewald_SSE); vc_sub_SSE2 = 
gmx_add_pr(vc_sub_SSE2,sh_ewald_SSE); - vc_sub_SSE3 = gmx_add_pr(vc_sub_SSE3,sh_ewald_SSE); #endif #endif vcoul_SSE0 = gmx_mul_pr(qq_SSE0,gmx_sub_pr(rinv_ex_SSE0,vc_sub_SSE0)); - vcoul_SSE1 = gmx_mul_pr(qq_SSE1,gmx_sub_pr(rinv_ex_SSE1,vc_sub_SSE1)); vcoul_SSE2 = gmx_mul_pr(qq_SSE2,gmx_sub_pr(rinv_ex_SSE2,vc_sub_SSE2)); - vcoul_SSE3 = gmx_mul_pr(qq_SSE3,gmx_sub_pr(rinv_ex_SSE3,vc_sub_SSE3)); - #endif #ifdef CALC_ENERGIES /* Mask energy for cut-off and diagonal */ vcoul_SSE0 = gmx_and_pr(vcoul_SSE0,wco_SSE0); - vcoul_SSE1 = gmx_and_pr(vcoul_SSE1,wco_SSE1); vcoul_SSE2 = gmx_and_pr(vcoul_SSE2,wco_SSE2); - vcoul_SSE3 = gmx_and_pr(vcoul_SSE3,wco_SSE3); #endif #endif /* CALC_COULOMB */ @@ -705,128 +520,92 @@ #ifdef VDW_CUTOFF_CHECK wco_vdw_SSE0 = gmx_cmplt_pr(rsq_SSE0,rcvdw2_SSE); - wco_vdw_SSE1 = gmx_cmplt_pr(rsq_SSE1,rcvdw2_SSE); #ifndef HALF_LJ wco_vdw_SSE2 = gmx_cmplt_pr(rsq_SSE2,rcvdw2_SSE); - wco_vdw_SSE3 = gmx_cmplt_pr(rsq_SSE3,rcvdw2_SSE); #endif #else /* Same cut-off for Coulomb and VdW, reuse the registers */ #define wco_vdw_SSE0 wco_SSE0 -#define wco_vdw_SSE1 wco_SSE1 #define wco_vdw_SSE2 wco_SSE2 -#define wco_vdw_SSE3 wco_SSE3 #endif #ifndef LJ_COMB_LB rinvsix_SSE0 = gmx_mul_pr(rinvsq_SSE0,gmx_mul_pr(rinvsq_SSE0,rinvsq_SSE0)); - rinvsix_SSE1 = gmx_mul_pr(rinvsq_SSE1,gmx_mul_pr(rinvsq_SSE1,rinvsq_SSE1)); #ifdef EXCL_FORCES rinvsix_SSE0 = gmx_and_pr(rinvsix_SSE0,int_SSE0); - rinvsix_SSE1 = gmx_and_pr(rinvsix_SSE1,int_SSE1); #endif #ifndef HALF_LJ rinvsix_SSE2 = gmx_mul_pr(rinvsq_SSE2,gmx_mul_pr(rinvsq_SSE2,rinvsq_SSE2)); - rinvsix_SSE3 = gmx_mul_pr(rinvsq_SSE3,gmx_mul_pr(rinvsq_SSE3,rinvsq_SSE3)); #ifdef EXCL_FORCES rinvsix_SSE2 = gmx_and_pr(rinvsix_SSE2,int_SSE2); - rinvsix_SSE3 = gmx_and_pr(rinvsix_SSE3,int_SSE3); #endif #endif #ifdef VDW_CUTOFF_CHECK rinvsix_SSE0 = gmx_and_pr(rinvsix_SSE0,wco_vdw_SSE0); - rinvsix_SSE1 = gmx_and_pr(rinvsix_SSE1,wco_vdw_SSE1); #ifndef HALF_LJ rinvsix_SSE2 = gmx_and_pr(rinvsix_SSE2,wco_vdw_SSE2); - rinvsix_SSE3 = gmx_and_pr(rinvsix_SSE3,wco_vdw_SSE3); #endif #endif FrLJ6_SSE0 = gmx_mul_pr(c6_SSE0,rinvsix_SSE0); - FrLJ6_SSE1 = gmx_mul_pr(c6_SSE1,rinvsix_SSE1); #ifndef HALF_LJ FrLJ6_SSE2 = gmx_mul_pr(c6_SSE2,rinvsix_SSE2); - FrLJ6_SSE3 = gmx_mul_pr(c6_SSE3,rinvsix_SSE3); #endif FrLJ12_SSE0 = gmx_mul_pr(c12_SSE0,gmx_mul_pr(rinvsix_SSE0,rinvsix_SSE0)); - FrLJ12_SSE1 = gmx_mul_pr(c12_SSE1,gmx_mul_pr(rinvsix_SSE1,rinvsix_SSE1)); #ifndef HALF_LJ FrLJ12_SSE2 = gmx_mul_pr(c12_SSE2,gmx_mul_pr(rinvsix_SSE2,rinvsix_SSE2)); - FrLJ12_SSE3 = gmx_mul_pr(c12_SSE3,gmx_mul_pr(rinvsix_SSE3,rinvsix_SSE3)); #endif #endif /* not LJ_COMB_LB */ #ifdef LJ_COMB_LB sir_SSE0 = gmx_mul_pr(sig_SSE0,rinv_SSE0); - sir_SSE1 = gmx_mul_pr(sig_SSE1,rinv_SSE1); #ifndef HALF_LJ sir_SSE2 = gmx_mul_pr(sig_SSE2,rinv_SSE2); - sir_SSE3 = gmx_mul_pr(sig_SSE3,rinv_SSE3); #endif sir2_SSE0 = gmx_mul_pr(sir_SSE0,sir_SSE0); - sir2_SSE1 = gmx_mul_pr(sir_SSE1,sir_SSE1); #ifndef HALF_LJ sir2_SSE2 = gmx_mul_pr(sir_SSE2,sir_SSE2); - sir2_SSE3 = gmx_mul_pr(sir_SSE3,sir_SSE3); #endif sir6_SSE0 = gmx_mul_pr(sir2_SSE0,gmx_mul_pr(sir2_SSE0,sir2_SSE0)); - sir6_SSE1 = gmx_mul_pr(sir2_SSE1,gmx_mul_pr(sir2_SSE1,sir2_SSE1)); #ifdef EXCL_FORCES sir6_SSE0 = gmx_and_pr(sir6_SSE0,int_SSE0); - sir6_SSE1 = gmx_and_pr(sir6_SSE1,int_SSE1); #endif #ifndef HALF_LJ sir6_SSE2 = gmx_mul_pr(sir2_SSE2,gmx_mul_pr(sir2_SSE2,sir2_SSE2)); - sir6_SSE3 = gmx_mul_pr(sir2_SSE3,gmx_mul_pr(sir2_SSE3,sir2_SSE3)); #ifdef EXCL_FORCES sir6_SSE2 = gmx_and_pr(sir6_SSE2,int_SSE2); - sir6_SSE3 = gmx_and_pr(sir6_SSE3,int_SSE3); #endif #endif #ifdef 
VDW_CUTOFF_CHECK sir6_SSE0 = gmx_and_pr(sir6_SSE0,wco_vdw_SSE0); - sir6_SSE1 = gmx_and_pr(sir6_SSE1,wco_vdw_SSE1); #ifndef HALF_LJ sir6_SSE2 = gmx_and_pr(sir6_SSE2,wco_vdw_SSE2); - sir6_SSE3 = gmx_and_pr(sir6_SSE3,wco_vdw_SSE3); #endif #endif FrLJ6_SSE0 = gmx_mul_pr(eps_SSE0,sir6_SSE0); - FrLJ6_SSE1 = gmx_mul_pr(eps_SSE1,sir6_SSE1); #ifndef HALF_LJ FrLJ6_SSE2 = gmx_mul_pr(eps_SSE2,sir6_SSE2); - FrLJ6_SSE3 = gmx_mul_pr(eps_SSE3,sir6_SSE3); #endif FrLJ12_SSE0 = gmx_mul_pr(FrLJ6_SSE0,sir6_SSE0); - FrLJ12_SSE1 = gmx_mul_pr(FrLJ6_SSE1,sir6_SSE1); #ifndef HALF_LJ FrLJ12_SSE2 = gmx_mul_pr(FrLJ6_SSE2,sir6_SSE2); - FrLJ12_SSE3 = gmx_mul_pr(FrLJ6_SSE3,sir6_SSE3); #endif #if defined CALC_ENERGIES /* We need C6 and C12 to calculate the LJ potential shift */ sig2_SSE0 = gmx_mul_pr(sig_SSE0,sig_SSE0); - sig2_SSE1 = gmx_mul_pr(sig_SSE1,sig_SSE1); #ifndef HALF_LJ sig2_SSE2 = gmx_mul_pr(sig_SSE2,sig_SSE2); - sig2_SSE3 = gmx_mul_pr(sig_SSE3,sig_SSE3); #endif sig6_SSE0 = gmx_mul_pr(sig2_SSE0,gmx_mul_pr(sig2_SSE0,sig2_SSE0)); - sig6_SSE1 = gmx_mul_pr(sig2_SSE1,gmx_mul_pr(sig2_SSE1,sig2_SSE1)); #ifndef HALF_LJ sig6_SSE2 = gmx_mul_pr(sig2_SSE2,gmx_mul_pr(sig2_SSE2,sig2_SSE2)); - sig6_SSE3 = gmx_mul_pr(sig2_SSE3,gmx_mul_pr(sig2_SSE3,sig2_SSE3)); #endif c6_SSE0 = gmx_mul_pr(eps_SSE0,sig6_SSE0); - c6_SSE1 = gmx_mul_pr(eps_SSE1,sig6_SSE1); #ifndef HALF_LJ c6_SSE2 = gmx_mul_pr(eps_SSE2,sig6_SSE2); - c6_SSE3 = gmx_mul_pr(eps_SSE3,sig6_SSE3); #endif c12_SSE0 = gmx_mul_pr(c6_SSE0,sig6_SSE0); - c12_SSE1 = gmx_mul_pr(c6_SSE1,sig6_SSE1); #ifndef HALF_LJ c12_SSE2 = gmx_mul_pr(c6_SSE2,sig6_SSE2); - c12_SSE3 = gmx_mul_pr(c6_SSE3,sig6_SSE3); #endif #endif #endif /* LJ_COMB_LB */ @@ -862,66 +641,52 @@ #ifdef CALC_COULOMB #ifndef ENERGY_GROUPS - vctotSSE = gmx_add_pr(vctotSSE, gmx_sum4_pr(vcoul_SSE0,vcoul_SSE1,vcoul_SSE2,vcoul_SSE3)); + vctotSSE = gmx_add_pr(vctotSSE, gmx_add_pr(vcoul_SSE0,vcoul_SSE2)); #else - add_ener_grp(vcoul_SSE0,vctp[0],egp_jj); - add_ener_grp(vcoul_SSE1,vctp[1],egp_jj); - add_ener_grp(vcoul_SSE2,vctp[2],egp_jj); - add_ener_grp(vcoul_SSE3,vctp[3],egp_jj); + add_ener_grp_halves(vcoul_SSE0,vctp[0],vctp[1],egp_jj); + add_ener_grp_halves(vcoul_SSE2,vctp[2],vctp[3],egp_jj); #endif #endif #ifdef CALC_LJ /* Calculate the LJ energies */ VLJ6_SSE0 = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE0,gmx_mul_pr(c6_SSE0,sh_invrc6_SSE))); - VLJ6_SSE1 = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE1,gmx_mul_pr(c6_SSE1,sh_invrc6_SSE))); #ifndef HALF_LJ VLJ6_SSE2 = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE2,gmx_mul_pr(c6_SSE2,sh_invrc6_SSE))); - VLJ6_SSE3 = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE3,gmx_mul_pr(c6_SSE3,sh_invrc6_SSE))); #endif VLJ12_SSE0 = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE0,gmx_mul_pr(c12_SSE0,sh_invrc12_SSE))); - VLJ12_SSE1 = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE1,gmx_mul_pr(c12_SSE1,sh_invrc12_SSE))); #ifndef HALF_LJ VLJ12_SSE2 = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE2,gmx_mul_pr(c12_SSE2,sh_invrc12_SSE))); - VLJ12_SSE3 = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE3,gmx_mul_pr(c12_SSE3,sh_invrc12_SSE))); #endif VLJ_SSE0 = gmx_sub_pr(VLJ12_SSE0,VLJ6_SSE0); - VLJ_SSE1 = gmx_sub_pr(VLJ12_SSE1,VLJ6_SSE1); #ifndef HALF_LJ VLJ_SSE2 = gmx_sub_pr(VLJ12_SSE2,VLJ6_SSE2); - VLJ_SSE3 = gmx_sub_pr(VLJ12_SSE3,VLJ6_SSE3); #endif /* The potential shift should be removed for pairs beyond cut-off */ VLJ_SSE0 = gmx_and_pr(VLJ_SSE0,wco_vdw_SSE0); - VLJ_SSE1 = gmx_and_pr(VLJ_SSE1,wco_vdw_SSE1); #ifndef HALF_LJ VLJ_SSE2 = gmx_and_pr(VLJ_SSE2,wco_vdw_SSE2); - VLJ_SSE3 = gmx_and_pr(VLJ_SSE3,wco_vdw_SSE3); #endif 
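/* [Annotation, not part of the patch] With two i-atoms sharing each register,
 * the per-energy-group accumulation in this energy section goes through
 * add_ener_grp_halves() instead of add_ener_grp(): presumably the low 128-bit
 * half is binned into the first i-atom's group buffer (vctp[0]/vvdwtp[0], ...)
 * and the high half into the second's, so energies land in the right groups. */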
#ifdef CHECK_EXCLS /* The potential shift should be removed for excluded pairs */ VLJ_SSE0 = gmx_and_pr(VLJ_SSE0,int_SSE0); - VLJ_SSE1 = gmx_and_pr(VLJ_SSE1,int_SSE1); #ifndef HALF_LJ VLJ_SSE2 = gmx_and_pr(VLJ_SSE2,int_SSE2); - VLJ_SSE3 = gmx_and_pr(VLJ_SSE3,int_SSE3); #endif #endif #ifndef ENERGY_GROUPS VvdwtotSSE = gmx_add_pr(VvdwtotSSE, #ifndef HALF_LJ - gmx_sum4_pr(VLJ_SSE0,VLJ_SSE1,VLJ_SSE2,VLJ_SSE3) + gmx_add_pr(VLJ_SSE0,VLJ_SSE2) #else - gmx_add_pr(VLJ_SSE0,VLJ_SSE1) + VLJ_SSE0 #endif ); #else - add_ener_grp(VLJ_SSE0,vvdwtp[0],egp_jj); - add_ener_grp(VLJ_SSE1,vvdwtp[1],egp_jj); + add_ener_grp_halves(VLJ_SSE0,vvdwtp[0],vvdwtp[1],egp_jj); #ifndef HALF_LJ - add_ener_grp(VLJ_SSE2,vvdwtp[2],egp_jj); - add_ener_grp(VLJ_SSE3,vvdwtp[3],egp_jj); + add_ener_grp_halves(VLJ_SSE2,vvdwtp[2],vvdwtp[3],egp_jj); #endif #endif #endif /* CALC_LJ */ @@ -935,16 +700,8 @@ ( #endif gmx_sub_pr(FrLJ12_SSE0,FrLJ6_SSE0))); - fscal_SSE1 = gmx_mul_pr(rinvsq_SSE1, -#ifdef CALC_COULOMB - gmx_add_pr(frcoul_SSE1, -#else - ( -#endif - gmx_sub_pr(FrLJ12_SSE1,FrLJ6_SSE1))); #else fscal_SSE0 = gmx_mul_pr(rinvsq_SSE0,frcoul_SSE0); - fscal_SSE1 = gmx_mul_pr(rinvsq_SSE1,frcoul_SSE1); #endif /* CALC_LJ */ #if defined CALC_LJ && !defined HALF_LJ fscal_SSE2 = gmx_mul_pr(rinvsq_SSE2, @@ -954,65 +711,41 @@ ( #endif gmx_sub_pr(FrLJ12_SSE2,FrLJ6_SSE2))); - fscal_SSE3 = gmx_mul_pr(rinvsq_SSE3, -#ifdef CALC_COULOMB - gmx_add_pr(frcoul_SSE3, -#else - ( -#endif - gmx_sub_pr(FrLJ12_SSE3,FrLJ6_SSE3))); #else /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */ fscal_SSE2 = gmx_mul_pr(rinvsq_SSE2,frcoul_SSE2); - fscal_SSE3 = gmx_mul_pr(rinvsq_SSE3,frcoul_SSE3); #endif - + /* Calculate temporary vectorial force */ tx_SSE0 = gmx_mul_pr(fscal_SSE0,dx_SSE0); - tx_SSE1 = gmx_mul_pr(fscal_SSE1,dx_SSE1); tx_SSE2 = gmx_mul_pr(fscal_SSE2,dx_SSE2); - tx_SSE3 = gmx_mul_pr(fscal_SSE3,dx_SSE3); ty_SSE0 = gmx_mul_pr(fscal_SSE0,dy_SSE0); - ty_SSE1 = gmx_mul_pr(fscal_SSE1,dy_SSE1); ty_SSE2 = gmx_mul_pr(fscal_SSE2,dy_SSE2); - ty_SSE3 = gmx_mul_pr(fscal_SSE3,dy_SSE3); tz_SSE0 = gmx_mul_pr(fscal_SSE0,dz_SSE0); - tz_SSE1 = gmx_mul_pr(fscal_SSE1,dz_SSE1); tz_SSE2 = gmx_mul_pr(fscal_SSE2,dz_SSE2); - tz_SSE3 = gmx_mul_pr(fscal_SSE3,dz_SSE3); - + /* Increment i atom force */ fix_SSE0 = gmx_add_pr(fix_SSE0,tx_SSE0); - fix_SSE1 = gmx_add_pr(fix_SSE1,tx_SSE1); fix_SSE2 = gmx_add_pr(fix_SSE2,tx_SSE2); - fix_SSE3 = gmx_add_pr(fix_SSE3,tx_SSE3); fiy_SSE0 = gmx_add_pr(fiy_SSE0,ty_SSE0); - fiy_SSE1 = gmx_add_pr(fiy_SSE1,ty_SSE1); fiy_SSE2 = gmx_add_pr(fiy_SSE2,ty_SSE2); - fiy_SSE3 = gmx_add_pr(fiy_SSE3,ty_SSE3); fiz_SSE0 = gmx_add_pr(fiz_SSE0,tz_SSE0); - fiz_SSE1 = gmx_add_pr(fiz_SSE1,tz_SSE1); fiz_SSE2 = gmx_add_pr(fiz_SSE2,tz_SSE2); - fiz_SSE3 = gmx_add_pr(fiz_SSE3,tz_SSE3); - + /* Decrement j atom force */ - gmx_store_pr(f+ajx, - gmx_sub_pr( gmx_load_pr(f+ajx), gmx_sum4_pr(tx_SSE0,tx_SSE1,tx_SSE2,tx_SSE3) )); - gmx_store_pr(f+ajy, - gmx_sub_pr( gmx_load_pr(f+ajy), gmx_sum4_pr(ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3) )); - gmx_store_pr(f+ajz, - gmx_sub_pr( gmx_load_pr(f+ajz), gmx_sum4_pr(tz_SSE0,tz_SSE1,tz_SSE2,tz_SSE3) )); + gmx_store_hpr(f+ajx, + gmx_sub_hpr( gmx_load_hpr(f+ajx), gmx_sum4_hpr(tx_SSE0,tx_SSE2) )); + gmx_store_hpr(f+ajy, + gmx_sub_hpr( gmx_load_hpr(f+ajy), gmx_sum4_hpr(ty_SSE0,ty_SSE2) )); + gmx_store_hpr(f+ajz, + gmx_sub_hpr( gmx_load_hpr(f+ajz), gmx_sum4_hpr(tz_SSE0,tz_SSE2) )); } #undef rinv_ex_SSE0 -#undef rinv_ex_SSE1 #undef rinv_ex_SSE2 -#undef rinv_ex_SSE3 #undef wco_vdw_SSE0 -#undef wco_vdw_SSE1 #undef wco_vdw_SSE2 -#undef wco_vdw_SSE3 
#undef CUTOFF_BLENDV diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_outer.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h similarity index 66% copy from src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_outer.h copy to src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h index 0644f9776c..d36a99f15c 100644 --- a/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_outer.h +++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h @@ -41,13 +41,13 @@ #define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3]) #define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE -#define UNROLLJ GMX_X86_SIMD_WIDTH_HERE +#define UNROLLJ (GMX_X86_SIMD_WIDTH_HERE/2) #if defined GMX_MM128_HERE || defined GMX_DOUBLE #define STRIDE 4 #endif #if defined GMX_MM256_HERE && !defined GMX_DOUBLE -#define STRIDE 8 +#define STRIDE 4 #endif #ifdef GMX_MM128_HERE @@ -74,7 +74,7 @@ #define SIMD_MASK_ALL 0xffffffff -#include "nbnxn_kernel_x86_simd_utils.h" +#include "nbnxn_kernel_simd_utils.h" /* All functionality defines are set here, except for: * CALC_ENERGIES, ENERGY_GROUPS which are defined before. @@ -91,51 +91,49 @@ /* Assumes all LJ parameters are identical */ /* #define FIX_LJ_C */ -#define NBK_FUNC_NAME_C_LJC(b,s,c,ljc,e) b##_##s##_##c##_comb_##ljc##_##e +/* The NBK_FUNC_NAME... macros below generate the whole zoo of kernels names + * with all combinations off electrostatics (coul), LJ combination rules (ljc) + * and energy calculations (ene), depending on the defines set. + */ + +#define NBK_FUNC_NAME_C_LJC(base,coul,ljc,ene) base##_##coul##_comb_##ljc##_##ene #if defined LJ_COMB_GEOM -#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,geom,e) +#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,geom,ene) #else #if defined LJ_COMB_LB -#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,lb,e) +#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,lb,ene) #else -#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,none,e) +#define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,none,ene) #endif #endif #ifdef CALC_COUL_RF -#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,rf,e) +#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,rf,ene) #endif #ifdef CALC_COUL_TAB #ifndef VDW_CUTOFF_CHECK -#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab,e) +#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,tab,ene) #else -#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab_twin,e) +#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,tab_twin,ene) #endif #endif #ifdef CALC_COUL_EWALD #ifndef VDW_CUTOFF_CHECK -#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,ewald,e) +#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,ewald,ene) #else -#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,ewald_twin,e) -#endif -#endif - -#ifdef GMX_MM128_HERE -#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd128,e) +#define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,ewald_twin,ene) #endif -#ifdef GMX_MM256_HERE -#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd256,e) #endif static void #ifndef CALC_ENERGIES -NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,noener) +NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,noener) #else #ifndef ENERGY_GROUPS -NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,ener) +NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,ener) #else -NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp) +NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp) #endif #endif #undef NBK_FUNC_NAME @@ -187,13 +185,9 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp) gmx_mm_pr shY_SSE; gmx_mm_pr shZ_SSE; 
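/* [Annotation, not part of the patch] In the 2x(N+N) outer loop each
 * gmx_mm_pr register pairs two i-atoms, one per 128-bit half, so only the
 * *_SSE0 (i-atoms 0 and 1) and *_SSE2 (i-atoms 2 and 3) register sets are
 * kept; the *_SSE1 and *_SSE3 sets of the 4xN kernel are removed just below. */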
gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0; - gmx_mm_pr ix_SSE1,iy_SSE1,iz_SSE1; gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2; - gmx_mm_pr ix_SSE3,iy_SSE3,iz_SSE3; gmx_mm_pr fix_SSE0,fiy_SSE0,fiz_SSE0; - gmx_mm_pr fix_SSE1,fiy_SSE1,fiz_SSE1; gmx_mm_pr fix_SSE2,fiy_SSE2,fiz_SSE2; - gmx_mm_pr fix_SSE3,fiy_SSE3,fiz_SSE3; #if UNROLLJ >= 4 #ifndef GMX_DOUBLE __m128 fix_SSE,fiy_SSE,fiz_SSE; @@ -205,66 +199,12 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp) __m128d fix2_SSE,fiy2_SSE,fiz2_SSE; #endif -#ifndef GMX_MM256_HERE -#ifndef GMX_DOUBLE - __m128i mask0 = _mm_set_epi32( 0x0008, 0x0004, 0x0002, 0x0001 ); - __m128i mask1 = _mm_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010 ); - __m128i mask2 = _mm_set_epi32( 0x0800, 0x0400, 0x0200, 0x0100 ); - __m128i mask3 = _mm_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000 ); -#else - /* For double precision we need to set two 32bit ints for one double */ - __m128i mask0 = _mm_set_epi32( 0x0002, 0x0002, 0x0001, 0x0001 ); - __m128i mask1 = _mm_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004 ); - __m128i mask2 = _mm_set_epi32( 0x0020, 0x0020, 0x0010, 0x0010 ); - __m128i mask3 = _mm_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040 ); -#endif -#else /* AVX: use floating point masks, as there are no integer instructions */ -#ifndef GMX_DOUBLE gmx_mm_pr mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 )); - gmx_mm_pr mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 )); -#else - /* There is no 256-bit int to double conversion, so we use float here */ - __m256 mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004, 0x0002, 0x0002, 0x0001, 0x0001 )); - __m256 mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040, 0x0020, 0x0020, 0x0010, 0x0010 )); - __m256 mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0800, 0x0800, 0x0400, 0x0400, 0x0200, 0x0200, 0x0100, 0x0100 )); - __m256 mask3 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x8000, 0x4000, 0x4000, 0x2000, 0x2000, 0x1000, 0x1000 )); -#endif -#endif + gmx_mm_pr mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 )); -#ifndef GMX_MM256_HERE -#ifndef GMX_DOUBLE - __m128 diag_SSE0 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 )); - __m128 diag_SSE1 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 )); - __m128 diag_SSE2 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000 )); - __m128 diag_SSE3 = gmx_mm_castsi128_pr( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); -#else - __m128d diag0_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 )); - __m128d diag0_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); - __m128d diag0_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); - __m128d diag0_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); - __m128d diag1_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff )); - __m128d diag1_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff )); - __m128d diag1_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 )); - __m128d diag1_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 
0x00000000, 0x00000000 )); -#endif -#else /* GMX_MM256_HERE */ -#ifndef GMX_DOUBLE - gmx_mm_pr diag0_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 )); - gmx_mm_pr diag0_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 )); - gmx_mm_pr diag0_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 )); - gmx_mm_pr diag0_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); - gmx_mm_pr diag1_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); - gmx_mm_pr diag1_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); - gmx_mm_pr diag1_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); - gmx_mm_pr diag1_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); -#else - gmx_mm_pr diag_SSE0 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 )); - gmx_mm_pr diag_SSE1 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); - gmx_mm_pr diag_SSE2 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); - gmx_mm_pr diag_SSE3 = _mm256_castsi256_pd( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 )); -#endif -#endif + gmx_mm_pr diag_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 )); + gmx_mm_pr diag_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 )); #ifndef GMX_MM256_HERE __m128i zeroi_SSE = _mm_setzero_si128(); @@ -275,9 +215,7 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp) gmx_mm_pr one_SSE=gmx_set1_pr(1.0); gmx_mm_pr iq_SSE0=gmx_setzero_pr(); - gmx_mm_pr iq_SSE1=gmx_setzero_pr(); gmx_mm_pr iq_SSE2=gmx_setzero_pr(); - gmx_mm_pr iq_SSE3=gmx_setzero_pr(); gmx_mm_pr mrc_3_SSE; #ifdef CALC_ENERGIES gmx_mm_pr hrc_3_SSE,moh_rc_SSE; @@ -292,9 +230,7 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp) #endif #ifdef GMX_MM256_HERE int ti0_array[2*UNROLLJ-1],*ti0; - int ti1_array[2*UNROLLJ-1],*ti1; int ti2_array[2*UNROLLJ-1],*ti2; - int ti3_array[2*UNROLLJ-1],*ti3; #endif #ifdef CALC_ENERGIES gmx_mm_pr mhalfsp_SSE; @@ -313,17 +249,13 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp) const real *ljc; gmx_mm_pr hsig_i_SSE0,seps_i_SSE0; - gmx_mm_pr hsig_i_SSE1,seps_i_SSE1; gmx_mm_pr hsig_i_SSE2,seps_i_SSE2; - gmx_mm_pr hsig_i_SSE3,seps_i_SSE3; #else #ifdef FIX_LJ_C real pvdw_array[2*UNROLLI*UNROLLJ+3]; real *pvdw_c6,*pvdw_c12; gmx_mm_pr c6_SSE0,c12_SSE0; - gmx_mm_pr c6_SSE1,c12_SSE1; gmx_mm_pr c6_SSE2,c12_SSE2; - gmx_mm_pr c6_SSE3,c12_SSE3; #endif #ifdef LJ_COMB_GEOM @@ -380,9 +312,7 @@ 
NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp) #ifdef GMX_MM256_HERE /* Generate aligned table pointers */ ti0 = (int *)(((size_t)(ti0_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1)))); - ti1 = (int *)(((size_t)(ti1_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1)))); ti2 = (int *)(((size_t)(ti2_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1)))); - ti3 = (int *)(((size_t)(ti3_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1)))); #endif invtsp_SSE = gmx_set1_pr(ic->tabq_scale); @@ -571,55 +501,45 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp) } #endif - /* Load i atom data */ +#define gmx_load2_hpr(x) _mm256_insertf128_ps(gmx_load1_pr(x),gmx_load1_hpr(x+1),1) + + /* Load i atom data */ sciy = scix + STRIDE; sciz = sciy + STRIDE; - ix_SSE0 = gmx_add_pr(gmx_load1_pr(x+scix) ,shX_SSE); - ix_SSE1 = gmx_add_pr(gmx_load1_pr(x+scix+1),shX_SSE); - ix_SSE2 = gmx_add_pr(gmx_load1_pr(x+scix+2),shX_SSE); - ix_SSE3 = gmx_add_pr(gmx_load1_pr(x+scix+3),shX_SSE); - iy_SSE0 = gmx_add_pr(gmx_load1_pr(x+sciy) ,shY_SSE); - iy_SSE1 = gmx_add_pr(gmx_load1_pr(x+sciy+1),shY_SSE); - iy_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciy+2),shY_SSE); - iy_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciy+3),shY_SSE); - iz_SSE0 = gmx_add_pr(gmx_load1_pr(x+sciz) ,shZ_SSE); - iz_SSE1 = gmx_add_pr(gmx_load1_pr(x+sciz+1),shZ_SSE); - iz_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciz+2),shZ_SSE); - iz_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciz+3),shZ_SSE); + ix_SSE0 = gmx_add_pr(gmx_load2_hpr(x+scix) ,shX_SSE); + ix_SSE2 = gmx_add_pr(gmx_load2_hpr(x+scix+2),shX_SSE); + iy_SSE0 = gmx_add_pr(gmx_load2_hpr(x+sciy) ,shY_SSE); + iy_SSE2 = gmx_add_pr(gmx_load2_hpr(x+sciy+2),shY_SSE); + iz_SSE0 = gmx_add_pr(gmx_load2_hpr(x+sciz) ,shZ_SSE); + iz_SSE2 = gmx_add_pr(gmx_load2_hpr(x+sciz+2),shZ_SSE); /* With half_LJ we currently always calculate Coulomb interactions */ if (do_coul || half_LJ) { - iq_SSE0 = gmx_set1_pr(facel*q[sci]); - iq_SSE1 = gmx_set1_pr(facel*q[sci+1]); - iq_SSE2 = gmx_set1_pr(facel*q[sci+2]); - iq_SSE3 = gmx_set1_pr(facel*q[sci+3]); + gmx_mm_pr facel_SSE; + + facel_SSE = gmx_set1_pr(facel); + + iq_SSE0 = gmx_mul_pr(facel_SSE,gmx_load2_hpr(q+sci)); + iq_SSE2 = gmx_mul_pr(facel_SSE,gmx_load2_hpr(q+sci+2)); } #ifdef LJ_COMB_LB - hsig_i_SSE0 = gmx_load1_pr(ljc+sci2+0); - hsig_i_SSE1 = gmx_load1_pr(ljc+sci2+1); - hsig_i_SSE2 = gmx_load1_pr(ljc+sci2+2); - hsig_i_SSE3 = gmx_load1_pr(ljc+sci2+3); - seps_i_SSE0 = gmx_load1_pr(ljc+sci2+STRIDE+0); - seps_i_SSE1 = gmx_load1_pr(ljc+sci2+STRIDE+1); - seps_i_SSE2 = gmx_load1_pr(ljc+sci2+STRIDE+2); - seps_i_SSE3 = gmx_load1_pr(ljc+sci2+STRIDE+3); + hsig_i_SSE0 = gmx_load2_hpr(ljc+sci2+0); + hsig_i_SSE2 = gmx_load2_hpr(ljc+sci2+2); + seps_i_SSE0 = gmx_load2_hpr(ljc+sci2+STRIDE+0); + seps_i_SSE2 = gmx_load2_hpr(ljc+sci2+STRIDE+2); #else #ifdef LJ_COMB_GEOM - c6s_SSE0 = gmx_load1_pr(ljc+sci2+0); - c6s_SSE1 = gmx_load1_pr(ljc+sci2+1); + c6s_SSE0 = gmx_load2_hpr(ljc+sci2+0); if (!half_LJ) { - c6s_SSE2 = gmx_load1_pr(ljc+sci2+2); - c6s_SSE3 = gmx_load1_pr(ljc+sci2+3); + c6s_SSE2 = gmx_load2_hpr(ljc+sci2+2); } - c12s_SSE0 = gmx_load1_pr(ljc+sci2+STRIDE+0); - c12s_SSE1 = gmx_load1_pr(ljc+sci2+STRIDE+1); + c12s_SSE0 = gmx_load2_hpr(ljc+sci2+STRIDE+0); if (!half_LJ) { - c12s_SSE2 = gmx_load1_pr(ljc+sci2+STRIDE+2); - c12s_SSE3 = gmx_load1_pr(ljc+sci2+STRIDE+3); + c12s_SSE2 = gmx_load2_hpr(ljc+sci2+STRIDE+2); } #else nbfp0 = nbfp_ptr + type[sci ]*nbat->ntype*nbfp_stride; @@ -638,17 +558,11 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp) /* Clear i atom forces */ fix_SSE0 = gmx_setzero_pr(); 
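The hunk above replaces four single-atom gmx_load1_pr broadcasts per coordinate with two gmx_load2_hpr loads, so that i-atoms 0 and 1 share one 256-bit register and i-atoms 2 and 3 share another, each atom's value broadcast across one 128-bit half. The following standalone sketch, which is not part of the patch, shows that lane layout with plain AVX intrinsics, assuming that gmx_load1_pr and gmx_load1_hpr broadcast a single real as their names suggest (the helper name load2_broadcast is invented for illustration); compile with -mavx.

#include <immintrin.h>
#include <stdio.h>

/* Broadcast x[0] across the low 128-bit lane and x[1] across the high lane,
 * mirroring the gmx_load2_hpr pattern used for the 2x(N+N) i-atom data. */
static __m256 load2_broadcast(const float *x)
{
    __m128 lo = _mm_set1_ps(x[0]);
    __m128 hi = _mm_set1_ps(x[1]);

    return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
}

int main(void)
{
    float xi[2] = { 1.0f, 2.0f };
    float out[8];
    int   i;

    _mm256_storeu_ps(out, load2_broadcast(xi));
    for (i = 0; i < 8; i++)
    {
        printf("%g ", out[i]);   /* expected: 1 1 1 1 2 2 2 2 */
    }
    printf("\n");

    return 0;
}

With this layout a single 256-bit operation processes the interactions of two i-atoms with the same four j-atoms, which is why only the _SSE0 and _SSE2 variables survive in the 2xnn kernel while the _SSE1 and _SSE3 ones are removed.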
- fix_SSE1 = gmx_setzero_pr(); fix_SSE2 = gmx_setzero_pr(); - fix_SSE3 = gmx_setzero_pr(); fiy_SSE0 = gmx_setzero_pr(); - fiy_SSE1 = gmx_setzero_pr(); fiy_SSE2 = gmx_setzero_pr(); - fiy_SSE3 = gmx_setzero_pr(); fiz_SSE0 = gmx_setzero_pr(); - fiz_SSE1 = gmx_setzero_pr(); fiz_SSE2 = gmx_setzero_pr(); - fiz_SSE3 = gmx_setzero_pr(); cjind = cjind0; @@ -661,13 +575,13 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp) #define CHECK_EXCLS while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL) { -#include "nbnxn_kernel_x86_simd_inner.h" +#include "nbnxn_kernel_simd_2xnn_inner.h" cjind++; } #undef CHECK_EXCLS for(; (cjindcj[cjind].excl != SIMD_MASK_ALL) { -#include "nbnxn_kernel_x86_simd_inner.h" +#include "nbnxn_kernel_simd_2xnn_inner.h" cjind++; } #undef CHECK_EXCLS for(; (cjindcj[cjind].excl != SIMD_MASK_ALL) { -#include "nbnxn_kernel_x86_simd_inner.h" +#include "nbnxn_kernel_simd_2xnn_inner.h" cjind++; } #undef CHECK_EXCLS for(; (cjind= rvdw */ #define VDW_CUTOFF_CHECK -#include "nbnxn_kernel_x86_simd_includes.h" +#include "nbnxn_kernel_simd_4xn_includes.h" #undef VDW_CUTOFF_CHECK #undef CALC_COUL_TAB @@ -81,11 +89,11 @@ #define CALC_COUL_EWALD /* Single cut-off: rcoulomb = rvdw */ -#include "nbnxn_kernel_x86_simd_includes.h" +#include "nbnxn_kernel_simd_4xn_includes.h" /* Twin cut-off: rcoulomb >= rvdw */ #define VDW_CUTOFF_CHECK -#include "nbnxn_kernel_x86_simd_includes.h" +#include "nbnxn_kernel_simd_4xn_includes.h" #undef VDW_CUTOFF_CHECK #undef CALC_COUL_EWALD @@ -109,7 +117,7 @@ typedef void (*p_nbk_func_noener)(const nbnxn_pairlist_t *nbl, enum { coultRF, coultTAB, coultTAB_TWIN, coultEWALD, coultEWALD_TWIN, coultNR }; -#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_ener +#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_4xn_##elec##_comb_##ljcomb##_ener static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] = { { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) }, { NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) }, @@ -118,7 +126,7 @@ static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] = { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } }; #undef NBK_FN -#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_energrp +#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_4xn_##elec##_comb_##ljcomb##_energrp static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] = { { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) }, { NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) }, @@ -127,7 +135,7 @@ static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] = { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } }; #undef NBK_FN -#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_noener +#define NBK_FN(elec,ljcomb) nbnxn_kernel_simd_4xn_##elec##_comb_##ljcomb##_noener static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] = { { NBK_FN(rf ,geom), NBK_FN(rf ,lb), NBK_FN(rf ,none) }, { NBK_FN(tab ,geom), NBK_FN(tab ,lb), NBK_FN(tab ,none) }, @@ -141,15 +149,14 @@ static void reduce_group_energies(int ng,int ng_2log, const real *VSvdw,const real *VSc, real *Vvdw,real *Vc) { + const int simd_width = GMX_X86_SIMD_WIDTH_HERE; + const int unrollj_half = GMX_X86_SIMD_WIDTH_HERE/2; int ng_p2,i,j,j0,j1,c,s; -#define SIMD_WIDTH (GMX_X86_SIMD_WIDTH_HERE) -#define SIMD_WIDTH_HALF (GMX_X86_SIMD_WIDTH_HERE/2) - ng_p2 = (1<cj[cjind].excl != SIMD_MASK_ALL) { -#include "nbnxn_kernel_x86_simd_inner.h" +#include "nbnxn_kernel_simd_4xn_inner.h" cjind++; } #undef CHECK_EXCLS for(; 
(cjindcj[cjind].excl != SIMD_MASK_ALL) { -#include "nbnxn_kernel_x86_simd_inner.h" +#include "nbnxn_kernel_simd_4xn_inner.h" cjind++; } #undef CHECK_EXCLS for(; (cjindcj[cjind].excl != SIMD_MASK_ALL) { -#include "nbnxn_kernel_x86_simd_inner.h" +#include "nbnxn_kernel_simd_4xn_inner.h" cjind++; } #undef CHECK_EXCLS for(; (cjind + #if defined NBNXN_SEARCH_SSE_SINGLE && GPU_NSUBCELL == 8 #define NBNXN_8BB_SSE #endif @@ -94,6 +94,9 @@ #define STRIDE_8BB 4 #define STRIDE_8BB_2LOG 2 +#endif /* NBNXN_SEARCH_SSE */ + +#ifdef GMX_NBNXN_SIMD /* The functions below are macros as they are performance sensitive */ @@ -119,27 +122,37 @@ #define X_IND_CJ_J8(cj) ((cj)*STRIDE_P8) /* The j-cluster size is matched to the SIMD width */ -#ifndef GMX_DOUBLE -/* 128 bits can hold 4 floats */ -#define CI_TO_CJ_S128(ci) CI_TO_CJ_J4(ci) -#define X_IND_CI_S128(ci) X_IND_CI_J4(ci) -#define X_IND_CJ_S128(cj) X_IND_CJ_J4(cj) -/* 256 bits can hold 8 floats */ -#define CI_TO_CJ_S256(ci) CI_TO_CJ_J8(ci) -#define X_IND_CI_S256(ci) X_IND_CI_J8(ci) -#define X_IND_CJ_S256(cj) X_IND_CJ_J8(cj) +#if GMX_NBNXN_SIMD_BITWIDTH == 128 +#ifdef GMX_DOUBLE +#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J2(ci) +#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J2(ci) +#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J2(cj) +#else +#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J4(ci) +#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J4(ci) +#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J4(cj) +#endif +#else +#if GMX_NBNXN_SIMD_BITWIDTH == 256 +#ifdef GMX_DOUBLE +#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J4(ci) +#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J4(ci) +#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J4(cj) +#else +#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J8(ci) +#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J8(ci) +#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J8(cj) +/* Half SIMD with j-cluster size */ +#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J4(ci) +#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J4(ci) +#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J4(cj) +#endif #else -/* 128 bits can hold 2 doubles */ -#define CI_TO_CJ_S128(ci) CI_TO_CJ_J2(ci) -#define X_IND_CI_S128(ci) X_IND_CI_J2(ci) -#define X_IND_CJ_S128(cj) X_IND_CJ_J2(cj) -/* 256 bits can hold 4 doubles */ -#define CI_TO_CJ_S256(ci) CI_TO_CJ_J4(ci) -#define X_IND_CI_S256(ci) X_IND_CI_J4(ci) -#define X_IND_CJ_S256(cj) X_IND_CJ_J4(cj) +#error "unsupported GMX_NBNXN_SIMD_WIDTH" +#endif #endif -#endif /* NBNXN_SEARCH_SSE */ +#endif /* GMX_NBNXN_SIMD */ /* Interaction masks for 4xN atom interactions. @@ -253,12 +266,12 @@ static int nbnxn_kernel_to_ci_size(int nb_kernel_type) { switch (nb_kernel_type) { - case nbk4x4_PlainC: - case nbk4xN_X86_SIMD128: - case nbk4xN_X86_SIMD256: + case nbnxnk4x4_PlainC: + case nbnxnk4xN_SIMD_4xN: + case nbnxnk4xN_SIMD_2xNN: return NBNXN_CPU_CLUSTER_I_SIZE; - case nbk8x8x8_CUDA: - case nbk8x8x8_PlainC: + case nbnxnk8x8x8_CUDA: + case nbnxnk8x8x8_PlainC: /* The cluster size for super/sub lists is only set here. * Any value should work for the pair-search and atomdata code. * The kernels, of course, might require a particular value. 
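The j-cluster size follows directly from the SIMD register width and the chosen kernel layout: the 4xN kernels use one full register of reals per j-cluster, while the 2x(N+N) kernels pack two i-atoms per register and therefore use half that j-cluster size, as computed by nbnxn_kernel_to_cj_size() in the next hunk. Below is a small illustrative sketch of that arithmetic, assuming the width-in-reals convention GMX_NBNXN_SIMD_BITWIDTH/(8*sizeof(real)); the function names cj_size_4xn and cj_size_2xnn are invented for illustration.

#include <stdio.h>

/* Number of reals per SIMD register for a given register width in bits */
static int simd_width_reals(int simd_bits, int sizeof_real)
{
    return simd_bits / (8 * sizeof_real);
}

/* 4xN: one j-cluster per full register */
static int cj_size_4xn(int simd_bits, int sizeof_real)
{
    return simd_width_reals(simd_bits, sizeof_real);
}

/* 2x(N+N): two i-atoms per register, so half the j-cluster size */
static int cj_size_2xnn(int simd_bits, int sizeof_real)
{
    return simd_width_reals(simd_bits, sizeof_real) / 2;
}

int main(void)
{
    /* AVX-256, single precision: 4xN gives a 4x8 kernel,
     * 2x(N+N) gives 2x(4+4), i.e. an effective 4x4 setup */
    printf("AVX-256 single: 4xN cj=%d, 2xNN cj=%d\n",
           cj_size_4xn(256, 4), cj_size_2xnn(256, 4));

    /* 128-bit SIMD, double precision: 4xN gives 4x2 */
    printf("128-bit double: 4xN cj=%d\n", cj_size_4xn(128, 8));

    return 0;
}

These values match the CI_TO_CJ_SIMD_4XN/_2XNN macro choices above: J2 for 128-bit double, J4 for 128-bit single and 256-bit double, J8 for 256-bit single, and J4 for the half-width 2xNN case.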
@@ -273,24 +286,33 @@ static int nbnxn_kernel_to_ci_size(int nb_kernel_type) int nbnxn_kernel_to_cj_size(int nb_kernel_type) { + int nbnxn_simd_width=0; + int cj_size=0; + +#ifdef GMX_NBNXN_SIMD + nbnxn_simd_width = GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8); +#endif + switch (nb_kernel_type) { - case nbk4x4_PlainC: - return NBNXN_CPU_CLUSTER_I_SIZE; - case nbk4xN_X86_SIMD128: - /* Number of reals that fit in SIMD (128 bits = 16 bytes) */ - return 16/sizeof(real); - case nbk4xN_X86_SIMD256: - /* Number of reals that fit in SIMD (256 bits = 32 bytes) */ - return 32/sizeof(real); - case nbk8x8x8_CUDA: - case nbk8x8x8_PlainC: - return nbnxn_kernel_to_ci_size(nb_kernel_type); + case nbnxnk4x4_PlainC: + cj_size = NBNXN_CPU_CLUSTER_I_SIZE; + break; + case nbnxnk4xN_SIMD_4xN: + cj_size = nbnxn_simd_width; + break; + case nbnxnk4xN_SIMD_2xNN: + cj_size = nbnxn_simd_width/2; + break; + case nbnxnk8x8x8_CUDA: + case nbnxnk8x8x8_PlainC: + cj_size = nbnxn_kernel_to_ci_size(nb_kernel_type); + break; default: gmx_incons("unknown kernel type"); } - return 0; + return cj_size; } static int ci_to_cj(int na_cj_2log,int ci) @@ -307,20 +329,20 @@ static int ci_to_cj(int na_cj_2log,int ci) gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type) { - if (nb_kernel_type == nbkNotSet) + if (nb_kernel_type == nbnxnkNotSet) { gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list."); } switch (nb_kernel_type) { - case nbk8x8x8_CUDA: - case nbk8x8x8_PlainC: + case nbnxnk8x8x8_CUDA: + case nbnxnk8x8x8_PlainC: return FALSE; - case nbk4x4_PlainC: - case nbk4xN_X86_SIMD128: - case nbk4xN_X86_SIMD256: + case nbnxnk4x4_PlainC: + case nbnxnk4xN_SIMD_4xN: + case nbnxnk4xN_SIMD_2xNN: return TRUE; default: @@ -2360,18 +2382,16 @@ static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl, snew(nbl->work,1); #ifdef NBNXN_BBXXXX - snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,16); + snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,32); #else - snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,16); -#endif - snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,16); -#ifdef NBNXN_SEARCH_SSE - snew_aligned(nbl->work->x_ci_x86_simd128,1,16); -#ifdef GMX_X86_AVX_256 - snew_aligned(nbl->work->x_ci_x86_simd256,1,32); + snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,32); #endif + snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,32); +#ifdef GMX_NBNXN_SIMD + snew_aligned(nbl->work->x_ci_simd_4xn,1,32); + snew_aligned(nbl->work->x_ci_simd_2xnn,1,32); #endif - snew_aligned(nbl->work->d2,GPU_NSUBCELL,16); + snew_aligned(nbl->work->d2,GPU_NSUBCELL,32); } void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list, @@ -2626,7 +2646,6 @@ static unsigned int get_imask(gmx_bool rdiag,int ci,int cj) return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL); } -#ifdef NBNXN_SEARCH_SSE /* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists */ static unsigned int get_imask_x86_simd128(gmx_bool rdiag,int ci,int cj) { @@ -2639,7 +2658,6 @@ static unsigned int get_imask_x86_simd128(gmx_bool rdiag,int ci,int cj) #endif } -#ifdef GMX_X86_AVX_256 /* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists */ static unsigned int get_imask_x86_simd256(gmx_bool rdiag,int ci,int cj) { @@ -2647,12 +2665,23 @@ static unsigned int get_imask_x86_simd256(gmx_bool rdiag,int ci,int cj) return (rdiag && ci == cj*2 ? NBNXN_INT_MASK_DIAG_J8_0 : (rdiag && ci == cj*2+1 ? 
NBNXN_INT_MASK_DIAG_J8_1 : NBNXN_INT_MASK_ALL)); -#else /* cj-size = 2 */ +#else /* cj-size = 4 */ return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL); #endif } + +#ifdef GMX_NBNXN_SIMD +#if GMX_NBNXN_SIMD_BITWIDTH == 128 +#define get_imask_x86_simd_4xn get_imask_x86_simd128 +#else +#if GMX_NBNXN_SIMD_BITWIDTH == 256 +#define get_imask_x86_simd_4xn get_imask_x86_simd256 +#define get_imask_x86_simd_2xnn get_imask_x86_simd128 +#else +#error "unsupported GMX_NBNXN_SIMD_BITWIDTH" +#endif +#endif #endif -#endif /* NBNXN_SEARCH_SSE */ /* Plain C code for making a pair list of cell ci vs cell cjf-cjl. * Checks bounding box distances and possibly atom pair distances. @@ -2773,23 +2802,11 @@ static void make_cluster_list_simple(const nbnxn_grid_t *gridj, } } -#ifdef NBNXN_SEARCH_SSE -/* Include make_cluster_list_x86_simd128/256 */ -#define GMX_MM128_HERE -#include "gmx_x86_simd_macros.h" -#define STRIDE_S PACK_X4 -#include "nbnxn_search_x86_simd.h" -#undef STRIDE_S -#undef GMX_MM128_HERE -#ifdef GMX_X86_AVX_256 -/* Include make_cluster_list_x86_simd128/256 */ -#define GMX_MM256_HERE -#include "gmx_x86_simd_macros.h" -#define STRIDE_S GMX_X86_SIMD_WIDTH_HERE -#include "nbnxn_search_x86_simd.h" -#undef STRIDE_S -#undef GMX_MM256_HERE +#ifdef GMX_NBNXN_SIMD_4XN +#include "nbnxn_search_simd_4xn.h" #endif +#ifdef GMX_NBNXN_SIMD_2XNN +#include "nbnxn_search_simd_2xnn.h" #endif /* Plain C or SSE code for making a pair list of super-cell sci vs scj. @@ -4495,7 +4512,7 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs, switch (nb_kernel_type) { - case nbk4x4_PlainC: + case nbnxnk4x4_PlainC: check_subcell_list_space_simple(nbl,cl-cf+1); make_cluster_list_simple(gridj, @@ -4505,30 +4522,30 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs, rl2,rbb2, &ndistc); break; -#ifdef NBNXN_SEARCH_SSE - case nbk4xN_X86_SIMD128: +#ifdef GMX_NBNXN_SIMD_4XN + case nbnxnk4xN_SIMD_4xN: check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2); - make_cluster_list_x86_simd128(gridj, - nbl,ci,cf,cl, - (gridi == gridj && shift == CENTRAL), - nbat->x, - rl2,rbb2, - &ndistc); + make_cluster_list_simd_4xn(gridj, + nbl,ci,cf,cl, + (gridi == gridj && shift == CENTRAL), + nbat->x, + rl2,rbb2, + &ndistc); break; -#ifdef GMX_X86_AVX_256 - case nbk4xN_X86_SIMD256: +#endif +#ifdef GMX_NBNXN_SIMD_2XNN + case nbnxnk4xN_SIMD_2xNN: check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2); - make_cluster_list_x86_simd256(gridj, - nbl,ci,cf,cl, - (gridi == gridj && shift == CENTRAL), - nbat->x, - rl2,rbb2, - &ndistc); + make_cluster_list_simd_2xnn(gridj, + nbl,ci,cf,cl, + (gridi == gridj && shift == CENTRAL), + nbat->x, + rl2,rbb2, + &ndistc); break; #endif -#endif - case nbk8x8x8_PlainC: - case nbk8x8x8_CUDA: + case nbnxnk8x8x8_PlainC: + case nbnxnk8x8x8_CUDA: check_subcell_list_space_supersub(nbl,cl-cf+1); for(cj=cf; cj<=cl; cj++) { @@ -4728,15 +4745,15 @@ void nbnxn_make_pairlist(const nbnxn_search_t nbs, { switch (nb_kernel_type) { -#ifdef NBNXN_SEARCH_SSE - case nbk4xN_X86_SIMD128: - nbs->icell_set_x = icell_set_x_x86_simd128; - break; -#ifdef GMX_X86_AVX_256 - case nbk4xN_X86_SIMD256: - nbs->icell_set_x = icell_set_x_x86_simd256; +#ifdef GMX_NBNXN_SIMD_4XN + case nbnxnk4xN_SIMD_4xN: + nbs->icell_set_x = icell_set_x_simd_4xn; break; #endif +#ifdef GMX_NBNXN_SIMD_2XNN + case nbnxnk4xN_SIMD_2xNN: + nbs->icell_set_x = icell_set_x_simd_2xnn; + break; #endif default: nbs->icell_set_x = icell_set_x_simple; diff --git a/src/mdlib/nbnxn_search_simd_2xnn.h 
b/src/mdlib/nbnxn_search_simd_2xnn.h new file mode 100644 index 0000000000..baf6f7bbb9 --- /dev/null +++ b/src/mdlib/nbnxn_search_simd_2xnn.h @@ -0,0 +1,262 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS development team, + * check out http://www.gromacs.org for more information. + * Copyright (c) 2012, by the GROMACS development team, led by + * David van der Spoel, Berk Hess, Erik Lindahl, and including many + * others, as listed in the AUTHORS file in the top-level source + * directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. 
+ */ + +#if GMX_NBNXN_SIMD_BITWIDTH == 128 +#define GMX_MM128_HERE +#else +#if GMX_NBNXN_SIMD_BITWIDTH == 256 +#define GMX_MM256_HERE +#else +#error "unsupported GMX_NBNXN_SIMD_BITWIDTH" +#endif +#endif +#include "gmx_x86_simd_macros.h" + +#if GMX_X86_SIMD_WIDTH_HERE >= 2*NBNXN_CPU_CLUSTER_I_SIZE +#define STRIDE_S (GMX_X86_SIMD_WIDTH_HERE/2) +#else +#define STRIDE_S NBNXN_CPU_CLUSTER_I_SIZE +#endif + +static gmx_inline gmx_mm_pr gmx_load_hpr_hilo_pr(const real *a) +{ + gmx_mm_hpr a_SSE; + + a_SSE = _mm_load_ps(a); + + return gmx_2hpr_to_pr(a_SSE,a_SSE); +} + +static gmx_inline gmx_mm_pr gmx_set_2real_shift_pr(const real *a,real shift) +{ + gmx_mm_hpr a0,a1; + + a0 = _mm_set1_ps(a[0] + shift); + a1 = _mm_set1_ps(a[1] + shift); + + return gmx_2hpr_to_pr(a1,a0); +} + +/* Copies PBC shifted i-cell packed atom coordinates to working array */ +static gmx_inline void +icell_set_x_simd_2xnn(int ci, + real shx,real shy,real shz, + int na_c, + int stride,const real *x, + nbnxn_list_work_t *work) +{ + int ia; + nbnxn_x_ci_simd_2xnn_t *x_ci; + + x_ci = work->x_ci_simd_2xnn; + + ia = X_IND_CI_SIMD_2XNN(ci); + + x_ci->ix_SSE0 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 0, shx); + x_ci->iy_SSE0 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 0, shy); + x_ci->iz_SSE0 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 0, shz); + x_ci->ix_SSE2 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 2, shx); + x_ci->iy_SSE2 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 2, shy); + x_ci->iz_SSE2 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 2, shz); +} + +/* SIMD code for making a pair list of cell ci vs cell cjf-cjl + * for coordinates in packed format. + * Checks bouding box distances and possibly atom pair distances. + * This is an accelerated version of make_cluster_list_simple. + */ +static gmx_inline void +make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj, + nbnxn_pairlist_t *nbl, + int ci,int cjf,int cjl, + gmx_bool remove_sub_diag, + const real *x_j, + real rl2,float rbb2, + int *ndistc) +{ + const nbnxn_x_ci_simd_2xnn_t *work; + const float *bb_ci; + + gmx_mm_pr jx_SSE,jy_SSE,jz_SSE; + + gmx_mm_pr dx_SSE0,dy_SSE0,dz_SSE0; + gmx_mm_pr dx_SSE2,dy_SSE2,dz_SSE2; + + gmx_mm_pr rsq_SSE0; + gmx_mm_pr rsq_SSE2; + + gmx_mm_pr wco_SSE0; + gmx_mm_pr wco_SSE2; + gmx_mm_pr wco_any_SSE; + + gmx_mm_pr rc2_SSE; + + gmx_bool InRange; + float d2; + int xind_f,xind_l,cj; + + cjf = CI_TO_CJ_SIMD_2XNN(cjf); + cjl = CI_TO_CJ_SIMD_2XNN(cjl+1) - 1; + + work = nbl->work->x_ci_simd_2xnn; + + bb_ci = nbl->work->bb_ci; + + rc2_SSE = gmx_set1_pr(rl2); + + InRange = FALSE; + while (!InRange && cjf <= cjl) + { + d2 = subc_bb_dist2_sse(4,0,bb_ci,cjf,gridj->bbj); + *ndistc += 2; + + /* Check if the distance is within the distance where + * we use only the bounding box distance rbb, + * or within the cut-off and there is at least one atom pair + * within the cut-off. 
+ */ + if (d2 < rbb2) + { + InRange = TRUE; + } + else if (d2 < rl2) + { + xind_f = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjf); + + jx_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+0*STRIDE_S); + jy_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+1*STRIDE_S); + jz_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+2*STRIDE_S); + + /* Calculate distance */ + dx_SSE0 = gmx_sub_pr(work->ix_SSE0,jx_SSE); + dy_SSE0 = gmx_sub_pr(work->iy_SSE0,jy_SSE); + dz_SSE0 = gmx_sub_pr(work->iz_SSE0,jz_SSE); + dx_SSE2 = gmx_sub_pr(work->ix_SSE2,jx_SSE); + dy_SSE2 = gmx_sub_pr(work->iy_SSE2,jy_SSE); + dz_SSE2 = gmx_sub_pr(work->iz_SSE2,jz_SSE); + + /* rsq = dx*dx+dy*dy+dz*dz */ + rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0); + rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2,dy_SSE2,dz_SSE2); + + wco_SSE0 = gmx_cmplt_pr(rsq_SSE0,rc2_SSE); + wco_SSE2 = gmx_cmplt_pr(rsq_SSE2,rc2_SSE); + + wco_any_SSE = gmx_or_pr(wco_SSE0,wco_SSE2); + + InRange = gmx_movemask_pr(wco_any_SSE); + + *ndistc += 2*GMX_X86_SIMD_WIDTH_HERE; + } + if (!InRange) + { + cjf++; + } + } + if (!InRange) + { + return; + } + + InRange = FALSE; + while (!InRange && cjl > cjf) + { + d2 = subc_bb_dist2_sse(4,0,bb_ci,cjl,gridj->bbj); + *ndistc += 2; + + /* Check if the distance is within the distance where + * we use only the bounding box distance rbb, + * or within the cut-off and there is at least one atom pair + * within the cut-off. + */ + if (d2 < rbb2) + { + InRange = TRUE; + } + else if (d2 < rl2) + { + xind_l = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjl); + + jx_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+0*STRIDE_S); + jy_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+1*STRIDE_S); + jz_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+2*STRIDE_S); + + /* Calculate distance */ + dx_SSE0 = gmx_sub_pr(work->ix_SSE0,jx_SSE); + dy_SSE0 = gmx_sub_pr(work->iy_SSE0,jy_SSE); + dz_SSE0 = gmx_sub_pr(work->iz_SSE0,jz_SSE); + dx_SSE2 = gmx_sub_pr(work->ix_SSE2,jx_SSE); + dy_SSE2 = gmx_sub_pr(work->iy_SSE2,jy_SSE); + dz_SSE2 = gmx_sub_pr(work->iz_SSE2,jz_SSE); + + /* rsq = dx*dx+dy*dy+dz*dz */ + rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0); + rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2,dy_SSE2,dz_SSE2); + + wco_SSE0 = gmx_cmplt_pr(rsq_SSE0,rc2_SSE); + wco_SSE2 = gmx_cmplt_pr(rsq_SSE2,rc2_SSE); + + wco_any_SSE = gmx_or_pr(wco_SSE0,wco_SSE2); + + InRange = gmx_movemask_pr(wco_any_SSE); + + *ndistc += 2*GMX_X86_SIMD_WIDTH_HERE; + } + if (!InRange) + { + cjl--; + } + } + + if (cjf <= cjl) + { + for(cj=cjf; cj<=cjl; cj++) + { + /* Store cj and the interaction mask */ + nbl->cj[nbl->ncj].cj = CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cj; + nbl->cj[nbl->ncj].excl = get_imask_x86_simd_2xnn(remove_sub_diag,ci,cj); + nbl->ncj++; + } + /* Increase the closing index in i super-cell list */ + nbl->ci[nbl->nci].cj_ind_end = nbl->ncj; + } +} + +#undef STRIDE_S +#undef GMX_MM128_HERE +#undef GMX_MM256_HERE diff --git a/src/mdlib/nbnxn_search_x86_simd.h b/src/mdlib/nbnxn_search_simd_4xn.h similarity index 77% rename from src/mdlib/nbnxn_search_x86_simd.h rename to src/mdlib/nbnxn_search_simd_4xn.h index a6af973d8a..d754d07bc0 100644 --- a/src/mdlib/nbnxn_search_x86_simd.h +++ b/src/mdlib/nbnxn_search_simd_4xn.h @@ -36,40 +36,37 @@ * the research papers on the package. Check out http://www.gromacs.org. */ -/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file. - * gmx_sse_or_avh.h should be included before including this file. 
- */ - -/* Copies PBC shifted i-cell packed atom coordinates to working array */ -#ifdef GMX_MM128_HERE -static void icell_set_x_x86_simd128 +#if GMX_NBNXN_SIMD_BITWIDTH == 128 +#define GMX_MM128_HERE #else -#ifdef GMX_MM256_HERE -static void icell_set_x_x86_simd256 +#if GMX_NBNXN_SIMD_BITWIDTH == 256 +#define GMX_MM256_HERE #else -"error: GMX_MM128_HERE or GMX_MM256_HERE not defined" +#error "unsupported GMX_NBNXN_SIMD_BITWIDTH" #endif #endif - (int ci, - real shx,real shy,real shz, - int na_c, - int stride,const real *x, - nbnxn_list_work_t *work) -{ - int ia; -#ifdef GMX_MM128_HERE - nbnxn_x_ci_x86_simd128_t *x_ci; - - x_ci = work->x_ci_x86_simd128; +#include "gmx_x86_simd_macros.h" - ia = X_IND_CI_S128(ci); +#if GMX_X86_SIMD_WIDTH_HERE >= NBNXN_CPU_CLUSTER_I_SIZE +#define STRIDE_S (GMX_X86_SIMD_WIDTH_HERE) #else - nbnxn_x_ci_x86_simd256_t *x_ci; +#define STRIDE_S NBNXN_CPU_CLUSTER_I_SIZE +#endif + +/* Copies PBC shifted i-cell packed atom coordinates to working array */ +static gmx_inline void +icell_set_x_simd_4xn(int ci, + real shx,real shy,real shz, + int na_c, + int stride,const real *x, + nbnxn_list_work_t *work) +{ + int ia; + nbnxn_x_ci_simd_4xn_t *x_ci; - x_ci = work->x_ci_x86_simd256; + x_ci = work->x_ci_simd_4xn; - ia = X_IND_CI_S256(ci); -#endif + ia = X_IND_CI_SIMD_4XN(ci); x_ci->ix_SSE0 = gmx_set1_pr(x[ia + 0*STRIDE_S ] + shx); x_ci->iy_SSE0 = gmx_set1_pr(x[ia + 1*STRIDE_S ] + shy); @@ -85,34 +82,21 @@ static void icell_set_x_x86_simd256 x_ci->iz_SSE3 = gmx_set1_pr(x[ia + 2*STRIDE_S + 3] + shz); } -/* SSE or AVX code for making a pair list of cell ci vs cell cjf-cjl +/* SIMD code for making a pair list of cell ci vs cell cjf-cjl * for coordinates in packed format. * Checks bouding box distances and possibly atom pair distances. * This is an accelerated version of make_cluster_list_simple. 
*/ -#ifdef GMX_MM128_HERE -static void make_cluster_list_x86_simd128 -#else -#ifdef GMX_MM256_HERE -static void make_cluster_list_x86_simd256 -#else -"error: GMX_MM128_HERE or GMX_MM256_HERE not defined" -#endif -#endif - (const nbnxn_grid_t *gridj, - nbnxn_pairlist_t *nbl, - int ci,int cjf,int cjl, - gmx_bool remove_sub_diag, - const real *x_j, - real rl2,float rbb2, - int *ndistc) +static gmx_inline void +make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj, + nbnxn_pairlist_t *nbl, + int ci,int cjf,int cjl, + gmx_bool remove_sub_diag, + const real *x_j, + real rl2,float rbb2, + int *ndistc) { -#ifdef GMX_MM128_HERE - const nbnxn_x_ci_x86_simd128_t *work; -#else - const nbnxn_x_ci_x86_simd256_t *work; -#endif - + const nbnxn_x_ci_simd_4xn_t *work; const float *bb_ci; gmx_mm_pr jx_SSE,jy_SSE,jz_SSE; @@ -139,17 +123,10 @@ static void make_cluster_list_x86_simd256 float d2; int xind_f,xind_l,cj; -#ifdef GMX_MM128_HERE - cjf = CI_TO_CJ_S128(cjf); - cjl = CI_TO_CJ_S128(cjl+1) - 1; - - work = nbl->work->x_ci_x86_simd128; -#else - cjf = CI_TO_CJ_S256(cjf); - cjl = CI_TO_CJ_S256(cjl+1) - 1; + cjf = CI_TO_CJ_SIMD_4XN(cjf); + cjl = CI_TO_CJ_SIMD_4XN(cjl+1) - 1; - work = nbl->work->x_ci_x86_simd256; -#endif + work = nbl->work->x_ci_simd_4xn; bb_ci = nbl->work->bb_ci; @@ -172,11 +149,8 @@ static void make_cluster_list_x86_simd256 } else if (d2 < rl2) { -#ifdef GMX_MM128_HERE - xind_f = X_IND_CJ_S128(CI_TO_CJ_S128(gridj->cell0) + cjf); -#else - xind_f = X_IND_CJ_S256(CI_TO_CJ_S256(gridj->cell0) + cjf); -#endif + xind_f = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjf); + jx_SSE = gmx_load_pr(x_j+xind_f+0*STRIDE_S); jy_SSE = gmx_load_pr(x_j+xind_f+1*STRIDE_S); jz_SSE = gmx_load_pr(x_j+xind_f+2*STRIDE_S); @@ -242,11 +216,8 @@ static void make_cluster_list_x86_simd256 } else if (d2 < rl2) { -#ifdef GMX_MM128_HERE - xind_l = X_IND_CJ_S128(CI_TO_CJ_S128(gridj->cell0) + cjl); -#else - xind_l = X_IND_CJ_S256(CI_TO_CJ_S256(gridj->cell0) + cjl); -#endif + xind_l = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjl); + jx_SSE = gmx_load_pr(x_j+xind_l+0*STRIDE_S); jy_SSE = gmx_load_pr(x_j+xind_l+1*STRIDE_S); jz_SSE = gmx_load_pr(x_j+xind_l+2*STRIDE_S); @@ -295,16 +266,15 @@ static void make_cluster_list_x86_simd256 for(cj=cjf; cj<=cjl; cj++) { /* Store cj and the interaction mask */ -#ifdef GMX_MM128_HERE - nbl->cj[nbl->ncj].cj = CI_TO_CJ_S128(gridj->cell0) + cj; - nbl->cj[nbl->ncj].excl = get_imask_x86_simd128(remove_sub_diag,ci,cj); -#else - nbl->cj[nbl->ncj].cj = CI_TO_CJ_S256(gridj->cell0) + cj; - nbl->cj[nbl->ncj].excl = get_imask_x86_simd256(remove_sub_diag,ci,cj); -#endif + nbl->cj[nbl->ncj].cj = CI_TO_CJ_SIMD_4XN(gridj->cell0) + cj; + nbl->cj[nbl->ncj].excl = get_imask_x86_simd_4xn(remove_sub_diag,ci,cj); nbl->ncj++; } /* Increase the closing index in i super-cell list */ nbl->ci[nbl->nci].cj_ind_end = nbl->ncj; } } + +#undef STRIDE_S +#undef GMX_MM128_HERE +#undef GMX_MM256_HERE diff --git a/src/mdlib/sim_util.c b/src/mdlib/sim_util.c index b57d35cf4d..eb9f636140 100644 --- a/src/mdlib/sim_util.c +++ b/src/mdlib/sim_util.c @@ -93,8 +93,8 @@ #include "nbnxn_atomdata.h" #include "nbnxn_search.h" #include "nbnxn_kernels/nbnxn_kernel_ref.h" -#include "nbnxn_kernels/nbnxn_kernel_x86_simd128.h" -#include "nbnxn_kernels/nbnxn_kernel_x86_simd256.h" +#include "nbnxn_kernels/nbnxn_kernel_simd_4xn.h" +#include "nbnxn_kernels/nbnxn_kernel_simd_2xnn.h" #include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h" #ifdef GMX_LIB_MPI @@ -620,13 +620,13 @@ static void do_nb_verlet(t_forcerec *fr, gmx_incons("Invalid 
cut-off scheme passed!"); } - if (nbvg->kernel_type != nbk8x8x8_CUDA) + if (nbvg->kernel_type != nbnxnk8x8x8_CUDA) { wallcycle_sub_start(wcycle, ewcsNONBONDED); } switch (nbvg->kernel_type) { - case nbk4x4_PlainC: + case nbnxnk4x4_PlainC: nbnxn_kernel_ref(&nbvg->nbl_lists, nbvg->nbat, ic, fr->shift_vec, @@ -639,38 +639,38 @@ static void do_nb_verlet(t_forcerec *fr, enerd->grpp.ener[egLJSR]); break; - case nbk4xN_X86_SIMD128: - nbnxn_kernel_x86_simd128(&nbvg->nbl_lists, - nbvg->nbat, ic, - nbvg->ewald_excl, - fr->shift_vec, - flags, - clearF, - fr->fshift[0], - enerd->grpp.ener[egCOULSR], - fr->bBHAM ? - enerd->grpp.ener[egBHAMSR] : - enerd->grpp.ener[egLJSR]); + case nbnxnk4xN_SIMD_4xN: + nbnxn_kernel_simd_4xn(&nbvg->nbl_lists, + nbvg->nbat, ic, + nbvg->ewald_excl, + fr->shift_vec, + flags, + clearF, + fr->fshift[0], + enerd->grpp.ener[egCOULSR], + fr->bBHAM ? + enerd->grpp.ener[egBHAMSR] : + enerd->grpp.ener[egLJSR]); break; - case nbk4xN_X86_SIMD256: - nbnxn_kernel_x86_simd256(&nbvg->nbl_lists, - nbvg->nbat, ic, - nbvg->ewald_excl, - fr->shift_vec, - flags, - clearF, - fr->fshift[0], - enerd->grpp.ener[egCOULSR], - fr->bBHAM ? - enerd->grpp.ener[egBHAMSR] : - enerd->grpp.ener[egLJSR]); + case nbnxnk4xN_SIMD_2xNN: + nbnxn_kernel_simd_2xnn(&nbvg->nbl_lists, + nbvg->nbat, ic, + nbvg->ewald_excl, + fr->shift_vec, + flags, + clearF, + fr->fshift[0], + enerd->grpp.ener[egCOULSR], + fr->bBHAM ? + enerd->grpp.ener[egBHAMSR] : + enerd->grpp.ener[egLJSR]); break; - case nbk8x8x8_CUDA: + case nbnxnk8x8x8_CUDA: nbnxn_cuda_launch_kernel(fr->nbv->cu_nbv, nbvg->nbat, flags, ilocality); break; - case nbk8x8x8_PlainC: + case nbnxnk8x8x8_PlainC: nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0], nbvg->nbat, ic, fr->shift_vec, @@ -688,7 +688,7 @@ static void do_nb_verlet(t_forcerec *fr, gmx_incons("Invalid nonbonded kernel type passed!"); } - if (nbvg->kernel_type != nbk8x8x8_CUDA) + if (nbvg->kernel_type != nbnxnk8x8x8_CUDA) { wallcycle_sub_stop(wcycle, ewcsNONBONDED); } @@ -785,7 +785,7 @@ void do_force_cutsVERLET(FILE *fplog,t_commrec *cr, bDoForces = (flags & GMX_FORCE_FORCES); bSepLRF = (bDoLongRange && bDoForces && (flags & GMX_FORCE_SEPLRF)); bUseGPU = fr->nbv->bUseGPU; - bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbk8x8x8_PlainC); + bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbnxnk8x8x8_PlainC); if (bStateChanged) { @@ -1000,7 +1000,7 @@ void do_force_cutsVERLET(FILE *fplog,t_commrec *cr, wallcycle_sub_stop(wcycle,ewcsNBS_SEARCH_NONLOCAL); - if (nbv->grp[eintNonlocal].kernel_type == nbk8x8x8_CUDA) + if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_CUDA) { /* initialize non-local pair-list on the GPU */ nbnxn_cuda_init_pairlist(nbv->cu_nbv, -- 2.11.4.GIT
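Both new search headers, nbnxn_search_simd_2xnn.h and nbnxn_search_simd_4xn.h, build the cluster pair list the same way: the candidate j-cluster range [cjf, cjl] is first trimmed from the front and then from the back using only the cheap bounding-box distance from subc_bb_dist2_sse(), and the SIMD atom-pair distance test is consulted only when a box distance falls between rbb2 and rl2; every j-cluster left in between is then stored with its interaction mask. A scalar sketch of the front-trimming idea, using made-up 1D bounding boxes and a stand-in for the SIMD distance check (bb_dist2, pair_in_range and the data are illustrative, not GROMACS code):

#include <stdio.h>
#include <stdbool.h>

#define NCLUSTER 8

/* Squared gap between two 1D bounding boxes bb[i]=[lo,hi] and bb[j]=[lo,hi] */
static double bb_dist2(const double bb[][2], int i, int j)
{
    double d = 0.0;

    if (bb[j][0] > bb[i][1]) d = bb[j][0] - bb[i][1];
    if (bb[i][0] > bb[j][1]) d = bb[i][0] - bb[j][1];

    return d*d;
}

/* Stand-in for the SIMD atom-pair check (any rsq < rl2 in the cluster pair) */
static bool pair_in_range(const double bb[][2], int i, int j, double rl2)
{
    return bb_dist2(bb, i, j) < rl2;
}

int main(void)
{
    /* 1D bounding boxes [lo, hi]; index 4 plays the role of the i-cluster */
    double bb[NCLUSTER][2] = {
        {0.0, 1.0}, {0.5, 1.5}, {1.2, 2.2}, {2.0, 3.0},
        {3.5, 4.5}, {5.0, 6.0}, {6.5, 7.5}, {8.0, 9.0}
    };
    double rbb2 = 0.25, rl2 = 1.0;
    int    ci = 4, cjf = 0, cjl = NCLUSTER - 1;
    bool   in_range = false;

    /* Trim from the front; the real code repeats this from the back (cjl--) */
    while (!in_range && cjf <= cjl)
    {
        double d2 = bb_dist2(bb, ci, cjf);

        if (d2 < rbb2)
        {
            in_range = true;                             /* certainly in range */
        }
        else if (d2 < rl2)
        {
            in_range = pair_in_range(bb, ci, cjf, rl2);  /* expensive check */
        }
        if (!in_range)
        {
            cjf++;                                       /* prune this j-cluster */
        }
    }
    printf("first j-cluster kept: %d\n", cjf);           /* prints 3 here */

    return 0;
}

The same structure, with gmx_sub_pr/gmx_calc_rsq_pr/gmx_cmplt_pr/gmx_movemask_pr in place of the scalar check, is what make_cluster_list_simd_2xnn and make_cluster_list_simd_4xn use above, the 2xnn variant testing only two distance registers (_SSE0 and _SSE2) per j-cluster instead of four.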