From bf670b872db5709dc6d95542e75311feb39fe3c4 Mon Sep 17 00:00:00 2001 From: Berk Hess Date: Tue, 16 Oct 2012 10:42:10 +0200 Subject: [PATCH] made pair-search with GPU 20% faster Also replaced more numbers by symbolic constants. Change-Id: Ida0ef5c46b3fd1d6b4d8006598194e19a943b660 --- src/mdlib/nbnxn_search.c | 334 +++++++++++++++++++++++------------------------ 1 file changed, 166 insertions(+), 168 deletions(-) diff --git a/src/mdlib/nbnxn_search.c b/src/mdlib/nbnxn_search.c index 477210fb6d..68617556a7 100644 --- a/src/mdlib/nbnxn_search.c +++ b/src/mdlib/nbnxn_search.c @@ -63,9 +63,11 @@ #define NBNXN_8BB_SSE #endif -/* The width of SSE with single precision, used for bounding boxes */ -#define SSE_F_WIDTH 4 -#define SSE_F_WIDTH_2LOG 2 +/* The width of SSE/AVX128 with single precision for bounding boxes with GPU. + * Here AVX-256 turns out to be slightly slower than AVX-128. + */ +#define STRIDE_8BB 4 +#define STRIDE_8BB_2LOG 2 #endif /* NBNXN_SEARCH_SSE */ @@ -168,7 +170,7 @@ /* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */ #define NBNXN_BBXXXX /* Size of bounding box corners quadruplet */ -#define NNBSBB_XXXX (NNBSBB_D*DIM*SSE_F_WIDTH) +#define NNBSBB_XXXX (NNBSBB_D*DIM*STRIDE_8BB) #endif /* We shift the i-particles backward for PBC. @@ -254,16 +256,6 @@ static gmx_icell_set_x_t icell_set_x_supersub; static gmx_icell_set_x_t icell_set_x_supersub_sse8; #endif -/* Function type for checking if sub-cells are within range */ -typedef gmx_bool -gmx_subcell_in_range_t(int na_c, - int si,const real *x_or_bb_i, - int csj,int stride,const real *x_or_bb_j, - real rl2); - -static gmx_subcell_in_range_t subc_in_range_x; -static gmx_subcell_in_range_t subc_in_range_sse8; - /* Local cycle count struct for profiling */ typedef struct { int count; @@ -359,8 +351,6 @@ typedef struct nbnxn_search { gmx_icell_set_x_t *icell_set_x; /* Function for setting i-coords */ - gmx_subcell_in_range_t *subc_dc; /* Function for sub-cell range check */ - int nthread_max; /* Maximum number of threads for pair-search */ nbnxn_search_work_t *work; /* Work array, size nthread_max */ } nbnxn_search_t_t; @@ -569,23 +559,6 @@ void nbnxn_init_search(nbnxn_search_t * nbs_ptr, nbs->a = NULL; nbs->a_nalloc = 0; - /* nbs->subc_dc is only used with super/sub setup */ -#ifdef NBNXN_8BB_SSE - nbs->subc_dc = subc_in_range_sse8; -#else - if (getenv("GMX_NBNXN_BB") != NULL) - { - /* Use only bounding box sub cell pair distances, - * fast, but produces slightly more sub cell pairs. - */ - nbs->subc_dc = NULL; - } - else - { - nbs->subc_dc = subc_in_range_x; - } -#endif - nbs->nthread_max = nthread_max; /* Initialize the work data structures for each thread */ @@ -699,7 +672,7 @@ static int set_grid_size_xy(const nbnxn_search_t nbs, srenew(grid->nsubc,grid->nc_nalloc); srenew(grid->bbcz,grid->nc_nalloc*NNBSBB_D); #ifdef NBNXN_8BB_SSE - bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/SSE_F_WIDTH*NNBSBB_XXXX; + bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX; #else bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL*NNBSBB_B; #endif @@ -997,12 +970,12 @@ static void calc_bounding_box_xxxx(int na,int stride,const real *x,float *bb) i += stride; } /* Note: possible double to float conversion here */ - bb[ 0] = R2F_D(xl); - bb[ 4] = R2F_D(yl); - bb[ 8] = R2F_D(zl); - bb[12] = R2F_U(xh); - bb[16] = R2F_U(yh); - bb[20] = R2F_U(zh); + bb[0*STRIDE_8BB] = R2F_D(xl); + bb[1*STRIDE_8BB] = R2F_D(yl); + bb[2*STRIDE_8BB] = R2F_D(zl); + bb[3*STRIDE_8BB] = R2F_U(xh); + bb[4*STRIDE_8BB] = R2F_U(yh); + bb[5*STRIDE_8BB] = R2F_U(zh); } #endif /* NBNXN_SEARCH_SSE */ @@ -1038,12 +1011,12 @@ static void calc_bounding_box_xxxx_sse(int na,const float *x, { calc_bounding_box_sse(na,x,bb_work); - bb[ 0] = bb_work[BBL_X]; - bb[ 4] = bb_work[BBL_Y]; - bb[ 8] = bb_work[BBL_Z]; - bb[12] = bb_work[BBU_X]; - bb[16] = bb_work[BBU_Y]; - bb[20] = bb_work[BBU_Z]; + bb[0*STRIDE_8BB] = bb_work[BBL_X]; + bb[1*STRIDE_8BB] = bb_work[BBL_Y]; + bb[2*STRIDE_8BB] = bb_work[BBL_Z]; + bb[3*STRIDE_8BB] = bb_work[BBU_X]; + bb[4*STRIDE_8BB] = bb_work[BBU_Y]; + bb[5*STRIDE_8BB] = bb_work[BBU_Z]; } #endif /* NBNXN_SEARCH_SSE_SINGLE */ @@ -1127,18 +1100,18 @@ static void print_bbsizes_supersub(FILE *fp, for(c=0; cnc; c++) { #ifdef NBNXN_BBXXXX - for(s=0; snsubc[c]; s+=SSE_F_WIDTH) + for(s=0; snsubc[c]; s+=STRIDE_8BB) { int cs_w,i,d; - cs_w = (c*GPU_NSUBCELL + s)/SSE_F_WIDTH; - for(i=0; ibb[cs_w*NNBSBB_XXXX+(DIM+d)*SSE_F_WIDTH+i] - - grid->bb[cs_w*NNBSBB_XXXX+ d *SSE_F_WIDTH+i]; + grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_8BB+i] - + grid->bb[cs_w*NNBSBB_XXXX+ d *STRIDE_8BB+i]; } } } @@ -1508,8 +1481,8 @@ void fill_cell(const nbnxn_search_t nbs, */ bb_ptr = grid->bb + - ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+SSE_F_WIDTH_2LOG))*NNBSBB_XXXX + - (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (SSE_F_WIDTH-1)); + ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_8BB_2LOG))*NNBSBB_XXXX + + (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_8BB-1)); #ifdef NBNXN_SEARCH_SSE_SINGLE if (nbat->XFormat == nbatXYZQ) @@ -1527,9 +1500,9 @@ void fill_cell(const nbnxn_search_t nbs, { fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n", sx,sy,sz, - bb_ptr[0],bb_ptr[12], - bb_ptr[4],bb_ptr[16], - bb_ptr[8],bb_ptr[20]); + bb_ptr[0*STRIDE_8BB],bb_ptr[3*STRIDE_8BB], + bb_ptr[1*STRIDE_8BB],bb_ptr[4*STRIDE_8BB], + bb_ptr[2*STRIDE_8BB],bb_ptr[5*STRIDE_8BB]); } } #endif @@ -2483,74 +2456,83 @@ static float subc_bb_dist2_sse(int na_c, #endif } +/* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2 */ +#define SUBC_BB_DIST2_SSE_XXXX_INNER(si,bb_i,d2) \ +{ \ + int shi; \ + \ + __m128 dx_0,dy_0,dz_0; \ + __m128 dx_1,dy_1,dz_1; \ + \ + __m128 mx,my,mz; \ + __m128 m0x,m0y,m0z; \ + \ + __m128 d2x,d2y,d2z; \ + __m128 d2s,d2t; \ + \ + shi = si*NNBSBB_D*DIM; \ + \ + xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_8BB); \ + yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_8BB); \ + zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_8BB); \ + xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_8BB); \ + yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_8BB); \ + zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_8BB); \ + \ + dx_0 = _mm_sub_ps(xi_l,xj_h); \ + dy_0 = _mm_sub_ps(yi_l,yj_h); \ + dz_0 = _mm_sub_ps(zi_l,zj_h); \ + \ + dx_1 = _mm_sub_ps(xj_l,xi_h); \ + dy_1 = _mm_sub_ps(yj_l,yi_h); \ + dz_1 = _mm_sub_ps(zj_l,zi_h); \ + \ + mx = _mm_max_ps(dx_0,dx_1); \ + my = _mm_max_ps(dy_0,dy_1); \ + mz = _mm_max_ps(dz_0,dz_1); \ + \ + m0x = _mm_max_ps(mx,zero); \ + m0y = _mm_max_ps(my,zero); \ + m0z = _mm_max_ps(mz,zero); \ + \ + d2x = _mm_mul_ps(m0x,m0x); \ + d2y = _mm_mul_ps(m0y,m0y); \ + d2z = _mm_mul_ps(m0z,m0z); \ + \ + d2s = _mm_add_ps(d2x,d2y); \ + d2t = _mm_add_ps(d2s,d2z); \ + \ + _mm_store_ps(d2+si,d2t); \ +} + /* SSE code for nsi bb distances for bb format xxxxyyyyzzzz */ static void subc_bb_dist2_sse_xxxx(const float *bb_j, int nsi,const float *bb_i, float *d2) { - int si; - int shi; - __m128 xj_l,yj_l,zj_l; __m128 xj_h,yj_h,zj_h; __m128 xi_l,yi_l,zi_l; __m128 xi_h,yi_h,zi_h; - __m128 dx_0,dy_0,dz_0; - __m128 dx_1,dy_1,dz_1; - - __m128 mx,my,mz; - __m128 m0x,m0y,m0z; - - __m128 d2x,d2y,d2z; - __m128 d2s,d2t; - __m128 zero; zero = _mm_setzero_ps(); - xj_l = _mm_load1_ps(bb_j+0*SSE_F_WIDTH); - yj_l = _mm_load1_ps(bb_j+1*SSE_F_WIDTH); - zj_l = _mm_load1_ps(bb_j+2*SSE_F_WIDTH); - xj_h = _mm_load1_ps(bb_j+3*SSE_F_WIDTH); - yj_h = _mm_load1_ps(bb_j+4*SSE_F_WIDTH); - zj_h = _mm_load1_ps(bb_j+5*SSE_F_WIDTH); + xj_l = _mm_set1_ps(bb_j[0*STRIDE_8BB]); + yj_l = _mm_set1_ps(bb_j[1*STRIDE_8BB]); + zj_l = _mm_set1_ps(bb_j[2*STRIDE_8BB]); + xj_h = _mm_set1_ps(bb_j[3*STRIDE_8BB]); + yj_h = _mm_set1_ps(bb_j[4*STRIDE_8BB]); + zj_h = _mm_set1_ps(bb_j[5*STRIDE_8BB]); - for(si=0; siwork,1); #ifdef NBNXN_BBXXXX - snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/SSE_F_WIDTH*NNBSBB_XXXX,16); + snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,16); #else snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,16); #endif @@ -3283,15 +3266,15 @@ static void make_cluster_list_simple(const nbnxn_grid_t *gridj, /* Plain C or SSE code for making a pair list of super-cell sci vs scj. * Checks bounding box distances and possibly atom pair distances. */ -static void make_cluster_list(const nbnxn_search_t nbs, - const nbnxn_grid_t *gridi, - const nbnxn_grid_t *gridj, - nbnxn_pairlist_t *nbl, - int sci,int scj, - gmx_bool sci_equals_scj, - int stride,const real *x, - real rl2,float rbb2, - int *ndistc) +static void make_cluster_list_supersub(const nbnxn_search_t nbs, + const nbnxn_grid_t *gridi, + const nbnxn_grid_t *gridj, + nbnxn_pairlist_t *nbl, + int sci,int scj, + gmx_bool sci_equals_scj, + int stride,const real *x, + real rl2,float rbb2, + int *ndistc) { int na_c; int npair; @@ -3340,14 +3323,20 @@ static void make_cluster_list(const nbnxn_search_t nbs, #ifdef NBNXN_BBXXXX /* Determine all ci1 bb distances in one call with SSE */ - subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>SSE_F_WIDTH_2LOG)*NNBSBB_XXXX+(cj & (SSE_F_WIDTH-1)), + subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_8BB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_8BB-1)), ci1,bb_ci,d2l); *ndistc += na_c*2; #endif npair = 0; - for(ci=0; cibb); @@ -3363,7 +3352,7 @@ static void make_cluster_list(const nbnxn_search_t nbs, */ *ndistc += na_c*na_c; if (d2 < rbb2 || - (d2 < rl2 && nbs->subc_dc(na_c,ci,x_ci,cj_gl,stride,x,rl2))) + (d2 < rl2 && subc_in_range_x(na_c,ci,x_ci,cj_gl,stride,x,rl2))) #else /* Check if the distance between the two bounding boxes * in within the pair-list cut-off. @@ -3388,7 +3377,14 @@ static void make_cluster_list(const nbnxn_search_t nbs, */ if (npair == 1 && d2l[ci_last] >= rbb2) { - if (!nbs->subc_dc(na_c,ci_last,x_ci,cj_gl,stride,x,rl2)) + /* Avoid using function pointers here, as it's slower */ + if ( +#ifdef NBNXN_8BB_SSE + !subc_in_range_sse8 +#else + !subc_in_range_x +#endif + (na_c,ci_last,x_ci,cj_gl,stride,x,rl2)) { imask &= ~(1U << (cj_offset*GPU_NSUBCELL+ci_last)); npair--; @@ -4007,17 +4003,17 @@ static void set_icell_bb_supersub(const float *bb,int ci, int ia,m,i; #ifdef NBNXN_BBXXXX - ia = ci*(GPU_NSUBCELL>>SSE_F_WIDTH_2LOG)*NNBSBB_XXXX; - for(m=0; m<(GPU_NSUBCELL>>SSE_F_WIDTH_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX) + ia = ci*(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX; + for(m=0; m<(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX) { - for(i=0; ixstride,nbat->x, - rl2,rbb2, - &ndistc); + make_cluster_list_supersub(nbs,gridi,gridj, + nbl,ci,cj, + (gridi == gridj && shift == CENTRAL && ci == cj), + nbat->xstride,nbat->x, + rl2,rbb2, + &ndistc); } break; } -- 2.11.4.GIT