1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This source code is part of
8 * GROningen MAchine for Chemical Simulations
10 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
11 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
12 * Copyright (c) 2001-2012, The GROMACS development team,
13 * check out http://www.gromacs.org for more information.
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License
17 * as published by the Free Software Foundation; either version 2
18 * of the License, or (at your option) any later version.
20 * If you want to redistribute modifications, please consider that
21 * scientific software is very special. Version control is crucial -
22 * bugs must be traceable. We will be happy to consider code for
23 * inclusion in the official distribution, but derived work must not
24 * be called official GROMACS. Details are found in the README & COPYING
25 * files - if they are missing, get the official version at www.gromacs.org.
27 * To help us fund GROMACS development, we humbly ask that you cite
28 * the papers on the package - you can find them in the top README file.
30 * For more info, check our website at http://www.gromacs.org
45 #include "nbnxn_search.h"
46 #include "nbnxn_consts.h"
47 #include "gmx_cyclecounter.h"
49 #include "gmx_omp_nthreads.h"
53 #define NBNXN_SEARCH_SSE
56 #define NBNXN_SEARCH_SSE_SINGLE
57 #include "gmx_x86_simd_single.h"
59 #include "gmx_x86_simd_double.h"
62 #if defined NBNXN_SEARCH_SSE_SINGLE && GPU_NSUBCELL == 8
66 /* The width of SSE with single precision, used for bounding boxes */
68 #define SSE_F_WIDTH_2LOG 2
70 #endif /* NBNXN_SEARCH_SSE */
72 /* Pair search box lower and upper corner in x,y,z.
73 * Store this in 4 iso 3 reals, which is useful with SSE.
74 * To avoid complicating the code we also use 4 without SSE.
77 #define NNBSBB_B (2*NNBSBB_C)
78 /* Pair search box lower and upper bound in z only. */
80 /* Pair search box lower and upper corner x,y,z indices */
88 /* Strides for x/f with xyz and xyzq coordinate (and charge) storage */
91 /* Size of packs of x, y or z with SSE/AVX packed coords/forces */
94 /* Strides for a pack of 4 and 8 coordinates/forces */
95 #define STRIDE_P4 (DIM*PACK_X4)
96 #define STRIDE_P8 (DIM*PACK_X8)
98 /* Index of atom a into the SSE/AVX coordinate/force array */
99 #define X4_IND_A(a) (STRIDE_P4*((a) >> 2) + ((a) & (PACK_X4 - 1)))
100 #define X8_IND_A(a) (STRIDE_P8*((a) >> 3) + ((a) & (PACK_X8 - 1)))
103 #ifdef NBNXN_SEARCH_SSE
105 /* The functions below are macros as they are performance sensitive */
107 /* 4x4 list, pack=4: no complex conversion required */
108 /* i-cluster to j-cluster conversion */
109 #define CI_TO_CJ_J4(ci) (ci)
110 /* cluster index to coordinate array index conversion */
111 #define X_IND_CI_J4(ci) ((ci)*STRIDE_P4)
112 #define X_IND_CJ_J4(cj) ((cj)*STRIDE_P4)
114 /* 4x2 list, pack=4: j-cluster size is half the packing width */
115 /* i-cluster to j-cluster conversion */
116 #define CI_TO_CJ_J2(ci) ((ci)<<1)
117 /* cluster index to coordinate array index conversion */
118 #define X_IND_CI_J2(ci) ((ci)*STRIDE_P4)
119 #define X_IND_CJ_J2(cj) (((cj)>>1)*STRIDE_P4 + ((cj) & 1)*(PACK_X4>>1))
121 /* 4x8 list, pack=8: i-cluster size is half the packing width */
122 /* i-cluster to j-cluster conversion */
123 #define CI_TO_CJ_J8(ci) ((ci)>>1)
124 /* cluster index to coordinate array index conversion */
125 #define X_IND_CI_J8(ci) (((ci)>>1)*STRIDE_P8 + ((ci) & 1)*(PACK_X8>>1))
126 #define X_IND_CJ_J8(cj) ((cj)*STRIDE_P8)
128 /* The j-cluster size is matched to the SIMD width */
130 /* 128 bits can hold 4 floats */
131 #define CI_TO_CJ_S128(ci) CI_TO_CJ_J4(ci)
132 #define X_IND_CI_S128(ci) X_IND_CI_J4(ci)
133 #define X_IND_CJ_S128(cj) X_IND_CJ_J4(cj)
134 /* 256 bits can hold 8 floats */
135 #define CI_TO_CJ_S256(ci) CI_TO_CJ_J8(ci)
136 #define X_IND_CI_S256(ci) X_IND_CI_J8(ci)
137 #define X_IND_CJ_S256(cj) X_IND_CJ_J8(cj)
139 /* 128 bits can hold 2 doubles */
140 #define CI_TO_CJ_S128(ci) CI_TO_CJ_J2(ci)
141 #define X_IND_CI_S128(ci) X_IND_CI_J2(ci)
142 #define X_IND_CJ_S128(cj) X_IND_CJ_J2(cj)
143 /* 256 bits can hold 4 doubles */
144 #define CI_TO_CJ_S256(ci) CI_TO_CJ_J4(ci)
145 #define X_IND_CI_S256(ci) X_IND_CI_J4(ci)
146 #define X_IND_CJ_S256(cj) X_IND_CJ_J4(cj)
149 #endif /* NBNXN_SEARCH_SSE */
152 /* Interaction masks for 4xN atom interactions.
153 * Bit i*CJ_SIZE + j tells if atom i and j interact.
155 /* All interaction mask is the same for all kernels */
156 #define NBNXN_INT_MASK_ALL 0xffffffff
157 /* 4x4 kernel diagonal mask */
158 #define NBNXN_INT_MASK_DIAG 0x08ce
159 /* 4x2 kernel diagonal masks */
160 #define NBNXN_INT_MASK_DIAG_J2_0 0x0002
161 #define NBNXN_INT_MASK_DIAG_J2_1 0x002F
162 /* 4x8 kernel diagonal masks */
163 #define NBNXN_INT_MASK_DIAG_J8_0 0xf0f8fcfe
164 #define NBNXN_INT_MASK_DIAG_J8_1 0x0080c0e0
167 #ifdef NBNXN_SEARCH_SSE
168 /* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
170 /* Size of bounding box corners quadruplet */
171 #define NNBSBB_XXXX (NNBSBB_D*DIM*SSE_F_WIDTH)
174 /* We shift the i-particles backward for PBC.
175 * This leads to more conditionals than shifting forward.
176 * We do this to get more balanced pair lists.
178 #define NBNXN_SHIFT_BACKWARD
181 /* This define is a lazy way to avoid interdependence of the grid
182 * and searching data structures.
184 #define NBNXN_NA_SC_MAX (GPU_NSUBCELL*NBNXN_GPU_CLUSTER_SIZE)
186 #ifdef NBNXN_SEARCH_SSE
187 #define GMX_MM128_HERE
188 #include "gmx_x86_simd_macros.h"
189 typedef struct nbnxn_x_ci_x86_simd128
{
190 /* The i-cluster coordinates for simple search */
191 gmx_mm_pr ix_SSE0
,iy_SSE0
,iz_SSE0
;
192 gmx_mm_pr ix_SSE1
,iy_SSE1
,iz_SSE1
;
193 gmx_mm_pr ix_SSE2
,iy_SSE2
,iz_SSE2
;
194 gmx_mm_pr ix_SSE3
,iy_SSE3
,iz_SSE3
;
195 } nbnxn_x_ci_x86_simd128_t
;
196 #undef GMX_MM128_HERE
197 #ifdef GMX_X86_AVX_256
198 #define GMX_MM256_HERE
199 #include "gmx_x86_simd_macros.h"
200 typedef struct nbnxn_x_ci_x86_simd256
{
201 /* The i-cluster coordinates for simple search */
202 gmx_mm_pr ix_SSE0
,iy_SSE0
,iz_SSE0
;
203 gmx_mm_pr ix_SSE1
,iy_SSE1
,iz_SSE1
;
204 gmx_mm_pr ix_SSE2
,iy_SSE2
,iz_SSE2
;
205 gmx_mm_pr ix_SSE3
,iy_SSE3
,iz_SSE3
;
206 } nbnxn_x_ci_x86_simd256_t
;
207 #undef GMX_MM256_HERE
211 /* Working data for the actual i-supercell during pair search */
212 typedef struct nbnxn_list_work
{
213 gmx_cache_protect_t cp0
; /* Protect cache between threads */
215 float *bb_ci
; /* The bounding boxes, pbc shifted, for each cluster */
216 real
*x_ci
; /* The coordinates, pbc shifted, for each atom */
217 #ifdef NBNXN_SEARCH_SSE
218 nbnxn_x_ci_x86_simd128_t
*x_ci_x86_simd128
;
219 #ifdef GMX_X86_AVX_256
220 nbnxn_x_ci_x86_simd256_t
*x_ci_x86_simd256
;
223 int cj_ind
; /* The current cj_ind index for the current list */
224 int cj4_init
; /* The first unitialized cj4 block */
226 float *d2
; /* Bounding box distance work array */
228 nbnxn_cj_t
*cj
; /* The j-cell list */
229 int cj_nalloc
; /* Allocation size of cj */
231 int ncj_noq
; /* Nr. of cluster pairs without Coul for flop count */
232 int ncj_hlj
; /* Nr. of cluster pairs with 1/2 LJ for flop count */
234 gmx_cache_protect_t cp1
; /* Protect cache between threads */
237 /* Function type for setting the i-atom coordinate working data */
239 gmx_icell_set_x_t(int ci
,
240 real shx
,real shy
,real shz
,
242 int stride
,const real
*x
,
243 nbnxn_list_work_t
*work
);
245 static gmx_icell_set_x_t icell_set_x_simple
;
246 #ifdef NBNXN_SEARCH_SSE
247 static gmx_icell_set_x_t icell_set_x_simple_x86_simd128
;
248 #ifdef GMX_X86_AVX_256
249 static gmx_icell_set_x_t icell_set_x_simple_x86_simd256
;
252 static gmx_icell_set_x_t icell_set_x_supersub
;
253 #ifdef NBNXN_SEARCH_SSE
254 static gmx_icell_set_x_t icell_set_x_supersub_sse8
;
257 /* Function type for checking if sub-cells are within range */
259 gmx_subcell_in_range_t(int na_c
,
260 int si
,const real
*x_or_bb_i
,
261 int csj
,int stride
,const real
*x_or_bb_j
,
264 static gmx_subcell_in_range_t subc_in_range_x
;
265 static gmx_subcell_in_range_t subc_in_range_sse8
;
267 /* Local cycle count struct for profiling */
274 /* Local cycle count enum for profiling */
275 enum { enbsCCgrid
, enbsCCsearch
, enbsCCcombine
, enbsCCreducef
, enbsCCnr
};
277 /* A pair-search grid struct for one domain decomposition zone */
279 rvec c0
; /* The lower corner of the (local) grid */
280 rvec c1
; /* The upper corner of the (local) grid */
281 real atom_density
; /* The atom number density for the local grid */
283 gmx_bool bSimple
; /* Is this grid simple or super/sub */
284 int na_c
; /* Number of atoms per cluster */
285 int na_cj
; /* Number of atoms for list j-clusters */
286 int na_sc
; /* Number of atoms per super-cluster */
287 int na_c_2log
; /* 2log of na_c */
289 int ncx
; /* Number of (super-)cells along x */
290 int ncy
; /* Number of (super-)cells along y */
291 int nc
; /* Total number of (super-)cells */
293 real sx
; /* x-size of a (super-)cell */
294 real sy
; /* y-size of a (super-)cell */
295 real inv_sx
; /* 1/sx */
296 real inv_sy
; /* 1/sy */
298 int cell0
; /* Index in nbs->cell corresponding to cell 0 */
300 int *cxy_na
; /* The number of atoms for each column in x,y */
301 int *cxy_ind
; /* Grid (super)cell index, offset from cell0 */
302 int cxy_nalloc
; /* Allocation size for cxy_na and cxy_ind */
304 int *nsubc
; /* The number of sub cells for each super cell */
305 float *bbcz
; /* Bounding boxes in z for the super cells */
306 float *bb
; /* 3D bounding boxes for the sub cells */
307 float *bbj
; /* 3D j-b.boxes for SSE-double or AVX-single */
308 int *flags
; /* Flag for the super cells */
309 int nc_nalloc
; /* Allocation size for the pointers above */
311 float *bbcz_simple
; /* bbcz for simple grid converted from super */
312 float *bb_simple
; /* bb for simple grid converted from super */
313 int *flags_simple
; /* flags for simple grid converted from super */
314 int nc_nalloc_simple
; /* Allocation size for the pointers above */
316 int nsubc_tot
; /* Total number of subcell, used for printing */
319 /* Thread-local work struct, contains part of nbnxn_grid_t */
321 gmx_cache_protect_t cp0
;
327 int sort_work_nalloc
;
329 int ndistc
; /* Number of distance checks for flop counting */
331 nbnxn_cycle_t cc
[enbsCCnr
];
333 gmx_cache_protect_t cp1
;
334 } nbnxn_search_work_t
;
336 /* Main pair-search struct, contains the grid(s), not the pair-list(s) */
337 typedef struct nbnxn_search
{
338 int ePBC
; /* PBC type enum */
339 matrix box
; /* The periodic unit-cell */
341 gmx_bool DomDec
; /* Are we doing domain decomposition? */
342 ivec dd_dim
; /* Are we doing DD in x,y,z? */
343 gmx_domdec_zones_t
*zones
; /* The domain decomposition zones */
345 int ngrid
; /* The number of grids, equal to #DD-zones */
346 nbnxn_grid_t
*grid
; /* Array of grids, size ngrid */
347 int *cell
; /* Actual allocated cell array for all grids */
348 int cell_nalloc
; /* Allocation size of cell */
349 int *a
; /* Atom index for grid, the inverse of cell */
350 int a_nalloc
; /* Allocation size of a */
352 int natoms_local
; /* The local atoms run from 0 to natoms_local */
353 int natoms_nonlocal
; /* The non-local atoms run from natoms_local
354 * to natoms_nonlocal */
356 gmx_bool print_cycles
;
358 nbnxn_cycle_t cc
[enbsCCnr
];
360 gmx_icell_set_x_t
*icell_set_x
; /* Function for setting i-coords */
362 gmx_subcell_in_range_t
*subc_dc
; /* Function for sub-cell range check */
364 int nthread_max
; /* Maximum number of threads for pair-search */
365 nbnxn_search_work_t
*work
; /* Work array, size nthread_max */
369 static void nbs_cycle_clear(nbnxn_cycle_t
*cc
)
373 for(i
=0; i
<enbsCCnr
; i
++)
380 static void nbs_cycle_start(nbnxn_cycle_t
*cc
)
382 cc
->start
= gmx_cycles_read();
385 static void nbs_cycle_stop(nbnxn_cycle_t
*cc
)
387 cc
->c
+= gmx_cycles_read() - cc
->start
;
391 static double Mcyc_av(const nbnxn_cycle_t
*cc
)
393 return (double)cc
->c
*1e-6/cc
->count
;
396 static void nbs_cycle_print(FILE *fp
,const nbnxn_search_t nbs
)
402 fprintf(fp
,"ns %4d grid %4.1f search %4.1f red.f %5.3f",
403 nbs
->cc
[enbsCCgrid
].count
,
404 Mcyc_av(&nbs
->cc
[enbsCCgrid
]),
405 Mcyc_av(&nbs
->cc
[enbsCCsearch
]),
406 Mcyc_av(&nbs
->cc
[enbsCCreducef
]));
408 if (nbs
->nthread_max
> 1)
410 if (nbs
->cc
[enbsCCcombine
].count
> 0)
412 fprintf(fp
," comb %5.2f",
413 Mcyc_av(&nbs
->cc
[enbsCCcombine
]));
415 fprintf(fp
," s. th");
416 for(t
=0; t
<nbs
->nthread_max
; t
++)
419 Mcyc_av(&nbs
->work
[t
].cc
[enbsCCsearch
]));
425 static void nbnxn_grid_init(nbnxn_grid_t
* grid
)
428 grid
->cxy_ind
= NULL
;
429 grid
->cxy_nalloc
= 0;
435 static int get_2log(int n
)
440 while ((1<<log2
) < n
)
446 gmx_fatal(FARGS
,"nbnxn na_c (%d) is not a power of 2",n
);
452 static int kernel_to_ci_size(int nb_kernel_type
)
454 switch (nb_kernel_type
)
457 case nbk4xN_X86_SIMD128
:
458 case nbk4xN_X86_SIMD256
:
459 return NBNXN_CPU_CLUSTER_I_SIZE
;
461 case nbk8x8x8_PlainC
:
462 /* The cluster size for super/sub lists is only set here.
463 * Any value should work for the pair-search and atomdata code.
464 * The kernels, of course, might require a particular value.
466 return NBNXN_GPU_CLUSTER_SIZE
;
468 gmx_incons("unknown kernel type");
474 static int kernel_to_cj_size(int nb_kernel_type
)
476 switch (nb_kernel_type
)
479 return NBNXN_CPU_CLUSTER_I_SIZE
;
480 case nbk4xN_X86_SIMD128
:
481 /* Number of reals that fit in SIMD (128 bits = 16 bytes) */
482 return 16/sizeof(real
);
483 case nbk4xN_X86_SIMD256
:
484 /* Number of reals that fit in SIMD (256 bits = 32 bytes) */
485 return 32/sizeof(real
);
487 case nbk8x8x8_PlainC
:
488 return kernel_to_ci_size(nb_kernel_type
);
490 gmx_incons("unknown kernel type");
496 static int ci_to_cj(int na_cj_2log
,int ci
)
500 case 2: return ci
; break;
501 case 1: return (ci
<<1); break;
502 case 3: return (ci
>>1); break;
508 gmx_bool
nbnxn_kernel_pairlist_simple(int nb_kernel_type
)
510 if (nb_kernel_type
== nbkNotSet
)
512 gmx_fatal(FARGS
, "Non-bonded kernel type not set for Verlet-style pair-list.");
515 switch (nb_kernel_type
)
518 case nbk8x8x8_PlainC
:
522 case nbk4xN_X86_SIMD128
:
523 case nbk4xN_X86_SIMD256
:
527 gmx_incons("Invalid nonbonded kernel type passed!");
532 void nbnxn_init_search(nbnxn_search_t
* nbs_ptr
,
534 gmx_domdec_zones_t
*zones
,
543 nbs
->DomDec
= (n_dd_cells
!= NULL
);
545 clear_ivec(nbs
->dd_dim
);
553 if ((*n_dd_cells
)[d
] > 1)
556 /* Each grid matches a DD zone */
562 snew(nbs
->grid
,nbs
->ngrid
);
563 for(g
=0; g
<nbs
->ngrid
; g
++)
565 nbnxn_grid_init(&nbs
->grid
[g
]);
568 nbs
->cell_nalloc
= 0;
572 /* nbs->subc_dc is only used with super/sub setup */
574 nbs
->subc_dc
= subc_in_range_sse8
;
576 if (getenv("GMX_NBNXN_BB") != NULL
)
578 /* Use only bounding box sub cell pair distances,
579 * fast, but produces slightly more sub cell pairs.
585 nbs
->subc_dc
= subc_in_range_x
;
589 nbs
->nthread_max
= nthread_max
;
591 /* Initialize the work data structures for each thread */
592 snew(nbs
->work
,nbs
->nthread_max
);
593 for(t
=0; t
<nbs
->nthread_max
; t
++)
595 nbs
->work
[t
].cxy_na
= NULL
;
596 nbs
->work
[t
].cxy_na_nalloc
= 0;
597 nbs
->work
[t
].sort_work
= NULL
;
598 nbs
->work
[t
].sort_work_nalloc
= 0;
601 /* Initialize detailed nbsearch cycle counting */
602 nbs
->print_cycles
= (getenv("GMX_NBNXN_CYCLE") != 0);
603 nbs
->search_count
= 0;
604 nbs_cycle_clear(nbs
->cc
);
605 for(t
=0; t
<nbs
->nthread_max
; t
++)
607 nbs_cycle_clear(nbs
->work
[t
].cc
);
611 static real
grid_atom_density(int n
,rvec corner0
,rvec corner1
)
615 rvec_sub(corner1
,corner0
,size
);
617 return n
/(size
[XX
]*size
[YY
]*size
[ZZ
]);
620 static int set_grid_size_xy(const nbnxn_search_t nbs
,
622 int n
,rvec corner0
,rvec corner1
,
628 real adens
,tlen
,tlen_x
,tlen_y
,nc_max
;
631 rvec_sub(corner1
,corner0
,size
);
635 /* target cell length */
638 /* To minimize the zero interactions, we should make
639 * the largest of the i/j cell cubic.
641 na_c
= max(grid
->na_c
,grid
->na_cj
);
643 /* Approximately cubic cells */
644 tlen
= pow(na_c
/atom_density
,1.0/3.0);
650 /* Approximately cubic sub cells */
651 tlen
= pow(grid
->na_c
/atom_density
,1.0/3.0);
652 tlen_x
= tlen
*GPU_NSUBCELL_X
;
653 tlen_y
= tlen
*GPU_NSUBCELL_Y
;
655 /* We round ncx and ncy down, because we get less cell pairs
656 * in the nbsist when the fixed cell dimensions (x,y) are
657 * larger than the variable one (z) than the other way around.
659 grid
->ncx
= max(1,(int)(size
[XX
]/tlen_x
));
660 grid
->ncy
= max(1,(int)(size
[YY
]/tlen_y
));
668 /* We need one additional cell entry for particles moved by DD */
669 if (grid
->ncx
*grid
->ncy
+1 > grid
->cxy_nalloc
)
671 grid
->cxy_nalloc
= over_alloc_large(grid
->ncx
*grid
->ncy
+1);
672 srenew(grid
->cxy_na
,grid
->cxy_nalloc
);
673 srenew(grid
->cxy_ind
,grid
->cxy_nalloc
+1);
675 for(t
=0; t
<nbs
->nthread_max
; t
++)
677 if (grid
->ncx
*grid
->ncy
+1 > nbs
->work
[t
].cxy_na_nalloc
)
679 nbs
->work
[t
].cxy_na_nalloc
= over_alloc_large(grid
->ncx
*grid
->ncy
+1);
680 srenew(nbs
->work
[t
].cxy_na
,nbs
->work
[t
].cxy_na_nalloc
);
684 /* Worst case scenario of 1 atom in each last cell */
685 if (grid
->na_cj
<= grid
->na_c
)
687 nc_max
= n
/grid
->na_sc
+ grid
->ncx
*grid
->ncy
;
691 nc_max
= n
/grid
->na_sc
+ grid
->ncx
*grid
->ncy
*grid
->na_cj
/grid
->na_c
;
694 if (nc_max
> grid
->nc_nalloc
)
698 grid
->nc_nalloc
= over_alloc_large(nc_max
);
699 srenew(grid
->nsubc
,grid
->nc_nalloc
);
700 srenew(grid
->bbcz
,grid
->nc_nalloc
*NNBSBB_D
);
702 bb_nalloc
= grid
->nc_nalloc
*GPU_NSUBCELL
/SSE_F_WIDTH
*NNBSBB_XXXX
;
704 bb_nalloc
= grid
->nc_nalloc
*GPU_NSUBCELL
*NNBSBB_B
;
706 sfree_aligned(grid
->bb
);
707 /* This snew also zeros the contents, this avoid possible
708 * floating exceptions in SSE with the unused bb elements.
710 snew_aligned(grid
->bb
,bb_nalloc
,16);
714 if (grid
->na_cj
== grid
->na_c
)
716 grid
->bbj
= grid
->bb
;
720 sfree_aligned(grid
->bbj
);
721 snew_aligned(grid
->bbj
,bb_nalloc
*grid
->na_c
/grid
->na_cj
,16);
725 srenew(grid
->flags
,grid
->nc_nalloc
);
728 copy_rvec(corner0
,grid
->c0
);
729 copy_rvec(corner1
,grid
->c1
);
730 grid
->sx
= size
[XX
]/grid
->ncx
;
731 grid
->sy
= size
[YY
]/grid
->ncy
;
732 grid
->inv_sx
= 1/grid
->sx
;
733 grid
->inv_sy
= 1/grid
->sy
;
738 #define SORT_GRID_OVERSIZE 2
739 #define SGSF (SORT_GRID_OVERSIZE + 1)
741 static void sort_atoms(int dim
,gmx_bool Backwards
,
742 int *a
,int n
,rvec
*x
,
743 real h0
,real invh
,int nsort
,int *sort
)
755 /* For small oversize factors clearing the whole area is fastest.
756 * For large oversize we should clear the used elements after use.
758 for(i
=0; i
<nsort
; i
++)
762 /* Sort the particles using a simple index sort */
765 /* The cast takes care of float-point rounding effects below zero.
766 * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
767 * times the box height out of the box.
769 zi
= (int)((x
[a
[i
]][dim
] - h0
)*invh
);
771 #ifdef DEBUG_NBNXN_GRIDDING
772 if (zi
< 0 || zi
>= nsort
)
774 gmx_fatal(FARGS
,"(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d\n",
775 a
[i
],'x'+dim
,x
[a
[i
]][dim
],h0
,invh
,zi
,nsort
);
779 /* Ideally this particle should go in sort cell zi,
780 * but that might already be in use,
781 * in that case find the first empty cell higher up
789 /* We have multiple atoms in the same sorting slot.
790 * Sort on real z for minimal bounding box size.
791 * There is an extra check for identical z to ensure
792 * well-defined output order, independent of input order
793 * to ensure binary reproducibility after restarts.
795 while(sort
[zi
] >= 0 && ( x
[a
[i
]][dim
] > x
[sort
[zi
]][dim
] ||
796 (x
[a
[i
]][dim
] == x
[sort
[zi
]][dim
] &&
804 /* Shift all elements by one slot until we find an empty slot */
807 while (sort
[zim
] >= 0)
823 for(zi
=0; zi
<nsort
; zi
++)
833 for(zi
=nsort
-1; zi
>=0; zi
--)
843 gmx_incons("Lost particles while sorting");
848 #define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
849 #define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
855 /* Coordinate order x,y,z, bb order xyz0 */
856 static void calc_bounding_box(int na
,int stride
,const real
*x
,float *bb
)
859 real xl
,xh
,yl
,yh
,zl
,zh
;
871 xl
= min(xl
,x
[i
+XX
]);
872 xh
= max(xh
,x
[i
+XX
]);
873 yl
= min(yl
,x
[i
+YY
]);
874 yh
= max(yh
,x
[i
+YY
]);
875 zl
= min(zl
,x
[i
+ZZ
]);
876 zh
= max(zh
,x
[i
+ZZ
]);
879 /* Note: possible double to float conversion here */
880 bb
[BBL_X
] = R2F_D(xl
);
881 bb
[BBL_Y
] = R2F_D(yl
);
882 bb
[BBL_Z
] = R2F_D(zl
);
883 bb
[BBU_X
] = R2F_U(xh
);
884 bb
[BBU_Y
] = R2F_U(yh
);
885 bb
[BBU_Z
] = R2F_U(zh
);
888 /* Packed coordinates, bb order xyz0 */
889 static void calc_bounding_box_x_x4(int na
,const real
*x
,float *bb
)
892 real xl
,xh
,yl
,yh
,zl
,zh
;
902 xl
= min(xl
,x
[j
+XX
*PACK_X4
]);
903 xh
= max(xh
,x
[j
+XX
*PACK_X4
]);
904 yl
= min(yl
,x
[j
+YY
*PACK_X4
]);
905 yh
= max(yh
,x
[j
+YY
*PACK_X4
]);
906 zl
= min(zl
,x
[j
+ZZ
*PACK_X4
]);
907 zh
= max(zh
,x
[j
+ZZ
*PACK_X4
]);
909 /* Note: possible double to float conversion here */
910 bb
[BBL_X
] = R2F_D(xl
);
911 bb
[BBL_Y
] = R2F_D(yl
);
912 bb
[BBL_Z
] = R2F_D(zl
);
913 bb
[BBU_X
] = R2F_U(xh
);
914 bb
[BBU_Y
] = R2F_U(yh
);
915 bb
[BBU_Z
] = R2F_U(zh
);
918 /* Packed coordinates, bb order xyz0 */
919 static void calc_bounding_box_x_x8(int na
,const real
*x
,float *bb
)
922 real xl
,xh
,yl
,yh
,zl
,zh
;
932 xl
= min(xl
,x
[j
+XX
*PACK_X8
]);
933 xh
= max(xh
,x
[j
+XX
*PACK_X8
]);
934 yl
= min(yl
,x
[j
+YY
*PACK_X8
]);
935 yh
= max(yh
,x
[j
+YY
*PACK_X8
]);
936 zl
= min(zl
,x
[j
+ZZ
*PACK_X8
]);
937 zh
= max(zh
,x
[j
+ZZ
*PACK_X8
]);
939 /* Note: possible double to float conversion here */
940 bb
[BBL_X
] = R2F_D(xl
);
941 bb
[BBL_Y
] = R2F_D(yl
);
942 bb
[BBL_Z
] = R2F_D(zl
);
943 bb
[BBU_X
] = R2F_U(xh
);
944 bb
[BBU_Y
] = R2F_U(yh
);
945 bb
[BBU_Z
] = R2F_U(zh
);
948 #ifdef NBNXN_SEARCH_SSE
950 /* Packed coordinates, bb order xyz0 */
951 static void calc_bounding_box_x_x4_halves(int na
,const real
*x
,
952 float *bb
,float *bbj
)
954 calc_bounding_box_x_x4(min(na
,2),x
,bbj
);
958 calc_bounding_box_x_x4(min(na
-2,2),x
+(PACK_X4
>>1),bbj
+NNBSBB_B
);
962 /* Set the "empty" bounding box to the same as the first one,
963 * so we don't need to treat special cases in the rest of the code.
965 _mm_store_ps(bbj
+NNBSBB_B
,_mm_load_ps(bbj
));
966 _mm_store_ps(bbj
+NNBSBB_B
+NNBSBB_C
,_mm_load_ps(bbj
+NNBSBB_C
));
969 _mm_store_ps(bb
,_mm_min_ps(_mm_load_ps(bbj
),
970 _mm_load_ps(bbj
+NNBSBB_B
)));
971 _mm_store_ps(bb
+NNBSBB_C
,_mm_max_ps(_mm_load_ps(bbj
+NNBSBB_C
),
972 _mm_load_ps(bbj
+NNBSBB_B
+NNBSBB_C
)));
975 /* Coordinate order xyz, bb order xxxxyyyyzzzz */
976 static void calc_bounding_box_xxxx(int na
,int stride
,const real
*x
,float *bb
)
979 real xl
,xh
,yl
,yh
,zl
,zh
;
991 xl
= min(xl
,x
[i
+XX
]);
992 xh
= max(xh
,x
[i
+XX
]);
993 yl
= min(yl
,x
[i
+YY
]);
994 yh
= max(yh
,x
[i
+YY
]);
995 zl
= min(zl
,x
[i
+ZZ
]);
996 zh
= max(zh
,x
[i
+ZZ
]);
999 /* Note: possible double to float conversion here */
1008 #endif /* NBNXN_SEARCH_SSE */
1010 #ifdef NBNXN_SEARCH_SSE_SINGLE
1012 /* Coordinate order xyz?, bb order xyz0 */
1013 static void calc_bounding_box_sse(int na
,const float *x
,float *bb
)
1015 __m128 bb_0_SSE
,bb_1_SSE
;
1020 bb_0_SSE
= _mm_load_ps(x
);
1021 bb_1_SSE
= bb_0_SSE
;
1025 x_SSE
= _mm_load_ps(x
+i
*NNBSBB_C
);
1026 bb_0_SSE
= _mm_min_ps(bb_0_SSE
,x_SSE
);
1027 bb_1_SSE
= _mm_max_ps(bb_1_SSE
,x_SSE
);
1030 _mm_store_ps(bb
,bb_0_SSE
);
1031 _mm_store_ps(bb
+4,bb_1_SSE
);
1034 /* Coordinate order xyz?, bb order xxxxyyyyzzzz */
1035 static void calc_bounding_box_xxxx_sse(int na
,const float *x
,
1039 calc_bounding_box_sse(na
,x
,bb_work
);
1041 bb
[ 0] = bb_work
[BBL_X
];
1042 bb
[ 4] = bb_work
[BBL_Y
];
1043 bb
[ 8] = bb_work
[BBL_Z
];
1044 bb
[12] = bb_work
[BBU_X
];
1045 bb
[16] = bb_work
[BBU_Y
];
1046 bb
[20] = bb_work
[BBU_Z
];
1049 #endif /* NBNXN_SEARCH_SSE_SINGLE */
1051 #ifdef NBNXN_SEARCH_SSE
1053 /* Combines pairs of consecutive bounding boxes */
1054 static void combine_bounding_box_pairs(nbnxn_grid_t
*grid
,const float *bb
)
1057 __m128 min_SSE
,max_SSE
;
1059 for(i
=0; i
<grid
->ncx
*grid
->ncy
; i
++)
1061 /* Starting bb in a column is expected to be 2-aligned */
1062 sc2
= grid
->cxy_ind
[i
]>>1;
1063 /* For odd numbers skip the last bb here */
1064 nc2
= (grid
->cxy_na
[i
]+3)>>(2+1);
1065 for(c2
=sc2
; c2
<sc2
+nc2
; c2
++)
1067 min_SSE
= _mm_min_ps(_mm_load_ps(bb
+(c2
*4+0)*NNBSBB_C
),
1068 _mm_load_ps(bb
+(c2
*4+2)*NNBSBB_C
));
1069 max_SSE
= _mm_max_ps(_mm_load_ps(bb
+(c2
*4+1)*NNBSBB_C
),
1070 _mm_load_ps(bb
+(c2
*4+3)*NNBSBB_C
));
1071 _mm_store_ps(grid
->bbj
+(c2
*2+0)*NNBSBB_C
,min_SSE
);
1072 _mm_store_ps(grid
->bbj
+(c2
*2+1)*NNBSBB_C
,max_SSE
);
1074 if (((grid
->cxy_na
[i
]+3)>>2) & 1)
1076 /* Copy the last bb for odd bb count in this column */
1077 for(j
=0; j
<NNBSBB_C
; j
++)
1079 grid
->bbj
[(c2
*2+0)*NNBSBB_C
+j
] = bb
[(c2
*4+0)*NNBSBB_C
+j
];
1080 grid
->bbj
[(c2
*2+1)*NNBSBB_C
+j
] = bb
[(c2
*4+1)*NNBSBB_C
+j
];
1089 /* Prints the average bb size, used for debug output */
1090 static void print_bbsizes_simple(FILE *fp
,
1091 const nbnxn_search_t nbs
,
1092 const nbnxn_grid_t
*grid
)
1098 for(c
=0; c
<grid
->nc
; c
++)
1100 for(d
=0; d
<DIM
; d
++)
1102 ba
[d
] += grid
->bb
[c
*NNBSBB_B
+NNBSBB_C
+d
] - grid
->bb
[c
*NNBSBB_B
+d
];
1105 dsvmul(1.0/grid
->nc
,ba
,ba
);
1107 fprintf(fp
,"ns bb: %4.2f %4.2f %4.2f %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
1108 nbs
->box
[XX
][XX
]/grid
->ncx
,
1109 nbs
->box
[YY
][YY
]/grid
->ncy
,
1110 nbs
->box
[ZZ
][ZZ
]*grid
->ncx
*grid
->ncy
/grid
->nc
,
1111 ba
[XX
],ba
[YY
],ba
[ZZ
],
1112 ba
[XX
]*grid
->ncx
/nbs
->box
[XX
][XX
],
1113 ba
[YY
]*grid
->ncy
/nbs
->box
[YY
][YY
],
1114 ba
[ZZ
]*grid
->nc
/(grid
->ncx
*grid
->ncy
*nbs
->box
[ZZ
][ZZ
]));
1117 /* Prints the average bb size, used for debug output */
1118 static void print_bbsizes_supersub(FILE *fp
,
1119 const nbnxn_search_t nbs
,
1120 const nbnxn_grid_t
*grid
)
1127 for(c
=0; c
<grid
->nc
; c
++)
1130 for(s
=0; s
<grid
->nsubc
[c
]; s
+=SSE_F_WIDTH
)
1134 cs_w
= (c
*GPU_NSUBCELL
+ s
)/SSE_F_WIDTH
;
1135 for(i
=0; i
<SSE_F_WIDTH
; i
++)
1137 for(d
=0; d
<DIM
; d
++)
1140 grid
->bb
[cs_w
*NNBSBB_XXXX
+(DIM
+d
)*SSE_F_WIDTH
+i
] -
1141 grid
->bb
[cs_w
*NNBSBB_XXXX
+ d
*SSE_F_WIDTH
+i
];
1146 for(s
=0; s
<grid
->nsubc
[c
]; s
++)
1150 cs
= c
*GPU_NSUBCELL
+ s
;
1151 for(d
=0; d
<DIM
; d
++)
1154 grid
->bb
[cs
*NNBSBB_B
+NNBSBB_C
+d
] -
1155 grid
->bb
[cs
*NNBSBB_B
+d
];
1159 ns
+= grid
->nsubc
[c
];
1161 dsvmul(1.0/ns
,ba
,ba
);
1163 fprintf(fp
,"ns bb: %4.2f %4.2f %4.2f %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
1164 nbs
->box
[XX
][XX
]/(grid
->ncx
*GPU_NSUBCELL_X
),
1165 nbs
->box
[YY
][YY
]/(grid
->ncy
*GPU_NSUBCELL_Y
),
1166 nbs
->box
[ZZ
][ZZ
]*grid
->ncx
*grid
->ncy
/(grid
->nc
*GPU_NSUBCELL_Z
),
1167 ba
[XX
],ba
[YY
],ba
[ZZ
],
1168 ba
[XX
]*grid
->ncx
*GPU_NSUBCELL_X
/nbs
->box
[XX
][XX
],
1169 ba
[YY
]*grid
->ncy
*GPU_NSUBCELL_Y
/nbs
->box
[YY
][YY
],
1170 ba
[ZZ
]*grid
->nc
*GPU_NSUBCELL_Z
/(grid
->ncx
*grid
->ncy
*nbs
->box
[ZZ
][ZZ
]));
1173 static void copy_int_to_nbat_int(const int *a
,int na
,int na_round
,
1174 const int *in
,int fill
,int *innb
)
1181 innb
[j
++] = in
[a
[i
]];
1183 /* Complete the partially filled last cell with fill */
1184 for(; i
<na_round
; i
++)
1190 static void clear_nbat_real(int na
,int nbatFormat
,real
*xnb
,int a0
)
1199 for(d
=0; d
<DIM
; d
++)
1201 xnb
[(a0
+a
)*STRIDE_XYZ
+d
] = 0;
1208 for(d
=0; d
<DIM
; d
++)
1210 xnb
[(a0
+a
)*STRIDE_XYZQ
+d
] = 0;
1216 c
= a0
& (PACK_X4
-1);
1219 xnb
[j
+XX
*PACK_X4
] = 0;
1220 xnb
[j
+YY
*PACK_X4
] = 0;
1221 xnb
[j
+ZZ
*PACK_X4
] = 0;
1226 j
+= (DIM
-1)*PACK_X4
;
1233 c
= a0
& (PACK_X8
-1);
1236 xnb
[j
+XX
*PACK_X8
] = 0;
1237 xnb
[j
+YY
*PACK_X8
] = 0;
1238 xnb
[j
+ZZ
*PACK_X8
] = 0;
1243 j
+= (DIM
-1)*PACK_X8
;
1251 static void copy_rvec_to_nbat_real(const int *a
,int na
,int na_round
,
1252 rvec
*x
,int nbatFormat
,real
*xnb
,int a0
,
1253 int cx
,int cy
,int cz
)
1257 /* We might need to place filler particles to fill up the cell to na_round.
1258 * The coefficients (LJ and q) for such particles are zero.
1259 * But we might still get NaN as 0*NaN when distances are too small.
1260 * We hope that -107 nm is far away enough from to zero
1261 * to avoid accidental short distances to particles shifted down for pbc.
1263 #define NBAT_FAR_AWAY 107
1271 xnb
[j
++] = x
[a
[i
]][XX
];
1272 xnb
[j
++] = x
[a
[i
]][YY
];
1273 xnb
[j
++] = x
[a
[i
]][ZZ
];
1275 /* Complete the partially filled last cell with copies of the last element.
1276 * This simplifies the bounding box calculation and avoid
1277 * numerical issues with atoms that are coincidentally close.
1279 for(; i
<na_round
; i
++)
1281 xnb
[j
++] = -NBAT_FAR_AWAY
*(1 + cx
);
1282 xnb
[j
++] = -NBAT_FAR_AWAY
*(1 + cy
);
1283 xnb
[j
++] = -NBAT_FAR_AWAY
*(1 + cz
+ i
);
1290 xnb
[j
++] = x
[a
[i
]][XX
];
1291 xnb
[j
++] = x
[a
[i
]][YY
];
1292 xnb
[j
++] = x
[a
[i
]][ZZ
];
1295 /* Complete the partially filled last cell with particles far apart */
1296 for(; i
<na_round
; i
++)
1298 xnb
[j
++] = -NBAT_FAR_AWAY
*(1 + cx
);
1299 xnb
[j
++] = -NBAT_FAR_AWAY
*(1 + cy
);
1300 xnb
[j
++] = -NBAT_FAR_AWAY
*(1 + cz
+ i
);
1306 c
= a0
& (PACK_X4
-1);
1309 xnb
[j
+XX
*PACK_X4
] = x
[a
[i
]][XX
];
1310 xnb
[j
+YY
*PACK_X4
] = x
[a
[i
]][YY
];
1311 xnb
[j
+ZZ
*PACK_X4
] = x
[a
[i
]][ZZ
];
1316 j
+= (DIM
-1)*PACK_X4
;
1320 /* Complete the partially filled last cell with particles far apart */
1321 for(; i
<na_round
; i
++)
1323 xnb
[j
+XX
*PACK_X4
] = -NBAT_FAR_AWAY
*(1 + cx
);
1324 xnb
[j
+YY
*PACK_X4
] = -NBAT_FAR_AWAY
*(1 + cy
);
1325 xnb
[j
+ZZ
*PACK_X4
] = -NBAT_FAR_AWAY
*(1 + cz
+ i
);
1330 j
+= (DIM
-1)*PACK_X4
;
1337 c
= a0
& (PACK_X8
- 1);
1340 xnb
[j
+XX
*PACK_X8
] = x
[a
[i
]][XX
];
1341 xnb
[j
+YY
*PACK_X8
] = x
[a
[i
]][YY
];
1342 xnb
[j
+ZZ
*PACK_X8
] = x
[a
[i
]][ZZ
];
1347 j
+= (DIM
-1)*PACK_X8
;
1351 /* Complete the partially filled last cell with particles far apart */
1352 for(; i
<na_round
; i
++)
1354 xnb
[j
+XX
*PACK_X8
] = -NBAT_FAR_AWAY
*(1 + cx
);
1355 xnb
[j
+YY
*PACK_X8
] = -NBAT_FAR_AWAY
*(1 + cy
);
1356 xnb
[j
+ZZ
*PACK_X8
] = -NBAT_FAR_AWAY
*(1 + cz
+ i
);
1361 j
+= (DIM
-1)*PACK_X8
;
1367 gmx_incons("Unsupported stride");
1371 /* Potentially sorts atoms on LJ coefficients !=0 and ==0.
1372 * Also sets interaction flags.
1374 void sort_on_lj(nbnxn_atomdata_t
*nbat
,int na_c
,
1375 int a0
,int a1
,const int *atinfo
,
1379 int subc
,s
,a
,n1
,n2
,a_lj_max
,i
,j
;
1380 int sort1
[NBNXN_NA_SC_MAX
/GPU_NSUBCELL
];
1381 int sort2
[NBNXN_NA_SC_MAX
/GPU_NSUBCELL
];
1387 for(s
=a0
; s
<a1
; s
+=na_c
)
1389 /* Make lists for this (sub-)cell on atoms with and without LJ */
1394 for(a
=s
; a
<min(s
+na_c
,a1
); a
++)
1396 haveQ
= haveQ
|| GET_CGINFO_HAS_Q(atinfo
[order
[a
]]);
1398 if (GET_CGINFO_HAS_VDW(atinfo
[order
[a
]]))
1400 sort1
[n1
++] = order
[a
];
1405 sort2
[n2
++] = order
[a
];
1409 /* If we don't have atom with LJ, there's nothing to sort */
1412 *flags
|= NBNXN_CI_DO_LJ(subc
);
1416 /* Only sort when strictly necessary. Ordering particles
1417 * Ordering particles can lead to less accurate summation
1418 * due to rounding, both for LJ and Coulomb interactions.
1420 if (2*(a_lj_max
- s
) >= na_c
)
1424 order
[a0
+i
] = sort1
[i
];
1428 order
[a0
+n1
+j
] = sort2
[j
];
1432 *flags
|= NBNXN_CI_HALF_LJ(subc
);
1437 *flags
|= NBNXN_CI_DO_COUL(subc
);
1443 /* Fill a pair search cell with atoms.
1444 * Potentially sorts atoms and sets the interaction flags.
1446 void fill_cell(const nbnxn_search_t nbs
,
1448 nbnxn_atomdata_t
*nbat
,
1452 int sx
,int sy
, int sz
,
1463 sort_on_lj(nbat
,grid
->na_c
,a0
,a1
,atinfo
,nbs
->a
,
1464 grid
->flags
+(a0
>>grid
->na_c_2log
)-grid
->cell0
);
1467 /* Now we have sorted the atoms, set the cell indices */
1468 for(a
=a0
; a
<a1
; a
++)
1470 nbs
->cell
[nbs
->a
[a
]] = a
;
1473 copy_rvec_to_nbat_real(nbs
->a
+a0
,a1
-a0
,grid
->na_c
,x
,
1474 nbat
->XFormat
,nbat
->x
,a0
,
1477 if (nbat
->XFormat
== nbatX4
)
1479 /* Store the bounding boxes as xyz.xyz. */
1480 offset
= ((a0
- grid
->cell0
*grid
->na_sc
)>>grid
->na_c_2log
)*NNBSBB_B
;
1481 bb_ptr
= grid
->bb
+ offset
;
1483 #if defined GMX_DOUBLE && defined NBNXN_SEARCH_SSE
1484 if (2*grid
->na_cj
== grid
->na_c
)
1486 calc_bounding_box_x_x4_halves(na
,nbat
->x
+X4_IND_A(a0
),bb_ptr
,
1487 grid
->bbj
+offset
*2);
1492 calc_bounding_box_x_x4(na
,nbat
->x
+X4_IND_A(a0
),bb_ptr
);
1495 else if (nbat
->XFormat
== nbatX8
)
1497 /* Store the bounding boxes as xyz.xyz. */
1498 offset
= ((a0
- grid
->cell0
*grid
->na_sc
)>>grid
->na_c_2log
)*NNBSBB_B
;
1499 bb_ptr
= grid
->bb
+ offset
;
1501 calc_bounding_box_x_x8(na
,nbat
->x
+X8_IND_A(a0
),bb_ptr
);
1504 else if (!grid
->bSimple
)
1506 /* Store the bounding boxes in a format convenient
1507 * for SSE calculations: xxxxyyyyzzzz...
1511 ((a0
-grid
->cell0
*grid
->na_sc
)>>(grid
->na_c_2log
+SSE_F_WIDTH_2LOG
))*NNBSBB_XXXX
+
1512 (((a0
-grid
->cell0
*grid
->na_sc
)>>grid
->na_c_2log
) & (SSE_F_WIDTH
-1));
1514 #ifdef NBNXN_SEARCH_SSE_SINGLE
1515 if (nbat
->XFormat
== nbatXYZQ
)
1517 calc_bounding_box_xxxx_sse(na
,nbat
->x
+a0
*nbat
->xstride
,
1523 calc_bounding_box_xxxx(na
,nbat
->xstride
,nbat
->x
+a0
*nbat
->xstride
,
1528 fprintf(debug
,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
1530 bb_ptr
[0],bb_ptr
[12],
1531 bb_ptr
[4],bb_ptr
[16],
1532 bb_ptr
[8],bb_ptr
[20]);
1538 /* Store the bounding boxes as xyz.xyz. */
1539 bb_ptr
= grid
->bb
+((a0
-grid
->cell0
*grid
->na_sc
)>>grid
->na_c_2log
)*NNBSBB_B
;
1541 calc_bounding_box(na
,nbat
->xstride
,nbat
->x
+a0
*nbat
->xstride
,
1547 bbo
= (a0
- grid
->cell0
*grid
->na_sc
)/grid
->na_c
;
1548 fprintf(debug
,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
1550 (grid
->bb
+bbo
*NNBSBB_B
)[BBL_X
],
1551 (grid
->bb
+bbo
*NNBSBB_B
)[BBU_X
],
1552 (grid
->bb
+bbo
*NNBSBB_B
)[BBL_Y
],
1553 (grid
->bb
+bbo
*NNBSBB_B
)[BBU_Y
],
1554 (grid
->bb
+bbo
*NNBSBB_B
)[BBL_Z
],
1555 (grid
->bb
+bbo
*NNBSBB_B
)[BBU_Z
]);
1560 /* Spatially sort the atoms within one grid column */
1561 static void sort_columns_simple(const nbnxn_search_t nbs
,
1567 nbnxn_atomdata_t
*nbat
,
1568 int cxy_start
,int cxy_end
,
1572 int cx
,cy
,cz
,ncz
,cfilled
,c
;
1578 fprintf(debug
,"cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1579 grid
->cell0
,cxy_start
,cxy_end
,a0
,a1
);
1582 /* Sort the atoms within each x,y column in 3 dimensions */
1583 for(cxy
=cxy_start
; cxy
<cxy_end
; cxy
++)
1586 cy
= cxy
- cx
*grid
->ncy
;
1588 na
= grid
->cxy_na
[cxy
];
1589 ncz
= grid
->cxy_ind
[cxy
+1] - grid
->cxy_ind
[cxy
];
1590 ash
= (grid
->cell0
+ grid
->cxy_ind
[cxy
])*grid
->na_sc
;
1592 /* Sort the atoms within each x,y column on z coordinate */
1593 sort_atoms(ZZ
,FALSE
,
1596 ncz
*grid
->na_sc
*SORT_GRID_OVERSIZE
/nbs
->box
[ZZ
][ZZ
],
1597 ncz
*grid
->na_sc
*SGSF
,sort_work
);
1599 /* Fill the ncz cells in this column */
1600 cfilled
= grid
->cxy_ind
[cxy
];
1601 for(cz
=0; cz
<ncz
; cz
++)
1603 c
= grid
->cxy_ind
[cxy
] + cz
;
1605 ash_c
= ash
+ cz
*grid
->na_sc
;
1606 na_c
= min(grid
->na_sc
,na
-(ash_c
-ash
));
1608 fill_cell(nbs
,grid
,nbat
,
1609 ash_c
,ash_c
+na_c
,atinfo
,x
,
1610 grid
->na_sc
*cx
+ (dd_zone
>> 2),
1611 grid
->na_sc
*cy
+ (dd_zone
& 3),
1615 /* This copy to bbcz is not really necessary.
1616 * But it allows to use the same grid search code
1617 * for the simple and supersub cell setups.
1623 grid
->bbcz
[c
*NNBSBB_D
] = grid
->bb
[cfilled
*NNBSBB_B
+2];
1624 grid
->bbcz
[c
*NNBSBB_D
+1] = grid
->bb
[cfilled
*NNBSBB_B
+6];
1627 /* Set the unused atom indices to -1 */
1628 for(ind
=na
; ind
<ncz
*grid
->na_sc
; ind
++)
1630 nbs
->a
[ash
+ind
] = -1;
1635 /* Spatially sort the atoms within one grid column */
1636 static void sort_columns_supersub(const nbnxn_search_t nbs
,
1642 nbnxn_atomdata_t
*nbat
,
1643 int cxy_start
,int cxy_end
,
1647 int cx
,cy
,cz
=-1,c
=-1,ncz
;
1648 int na
,ash
,na_c
,ind
,a
;
1649 int subdiv_z
,sub_z
,na_z
,ash_z
;
1650 int subdiv_y
,sub_y
,na_y
,ash_y
;
1651 int subdiv_x
,sub_x
,na_x
,ash_x
;
1653 /* cppcheck-suppress unassignedVariable */
1654 float bb_work_array
[NNBSBB_B
+3],*bb_work_align
;
1656 bb_work_align
= (float *)(((size_t)(bb_work_array
+3)) & (~((size_t)15)));
1660 fprintf(debug
,"cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1661 grid
->cell0
,cxy_start
,cxy_end
,a0
,a1
);
1664 subdiv_x
= grid
->na_c
;
1665 subdiv_y
= GPU_NSUBCELL_X
*subdiv_x
;
1666 subdiv_z
= GPU_NSUBCELL_Y
*subdiv_y
;
1668 /* Sort the atoms within each x,y column in 3 dimensions */
1669 for(cxy
=cxy_start
; cxy
<cxy_end
; cxy
++)
1672 cy
= cxy
- cx
*grid
->ncy
;
1674 na
= grid
->cxy_na
[cxy
];
1675 ncz
= grid
->cxy_ind
[cxy
+1] - grid
->cxy_ind
[cxy
];
1676 ash
= (grid
->cell0
+ grid
->cxy_ind
[cxy
])*grid
->na_sc
;
1678 /* Sort the atoms within each x,y column on z coordinate */
1679 sort_atoms(ZZ
,FALSE
,
1682 ncz
*grid
->na_sc
*SORT_GRID_OVERSIZE
/nbs
->box
[ZZ
][ZZ
],
1683 ncz
*grid
->na_sc
*SGSF
,sort_work
);
1685 /* This loop goes over the supercells and subcells along z at once */
1686 for(sub_z
=0; sub_z
<ncz
*GPU_NSUBCELL_Z
; sub_z
++)
1688 ash_z
= ash
+ sub_z
*subdiv_z
;
1689 na_z
= min(subdiv_z
,na
-(ash_z
-ash
));
1691 /* We have already sorted on z */
1693 if (sub_z
% GPU_NSUBCELL_Z
== 0)
1695 cz
= sub_z
/GPU_NSUBCELL_Z
;
1696 c
= grid
->cxy_ind
[cxy
] + cz
;
1698 /* The number of atoms in this supercell */
1699 na_c
= min(grid
->na_sc
,na
-(ash_z
-ash
));
1701 grid
->nsubc
[c
] = min(GPU_NSUBCELL
,(na_c
+grid
->na_c
-1)/grid
->na_c
);
1703 /* Store the z-boundaries of the super cell */
1704 grid
->bbcz
[c
*NNBSBB_D
] = x
[nbs
->a
[ash_z
]][ZZ
];
1705 grid
->bbcz
[c
*NNBSBB_D
+1] = x
[nbs
->a
[ash_z
+na_c
-1]][ZZ
];
1708 #if GPU_NSUBCELL_Y > 1
1709 /* Sort the atoms along y */
1710 sort_atoms(YY
,(sub_z
& 1),
1711 nbs
->a
+ash_z
,na_z
,x
,
1712 grid
->c0
[YY
]+cy
*grid
->sy
,grid
->inv_sy
,
1713 subdiv_y
*SGSF
,sort_work
);
1716 for(sub_y
=0; sub_y
<GPU_NSUBCELL_Y
; sub_y
++)
1718 ash_y
= ash_z
+ sub_y
*subdiv_y
;
1719 na_y
= min(subdiv_y
,na
-(ash_y
-ash
));
1721 #if GPU_NSUBCELL_X > 1
1722 /* Sort the atoms along x */
1723 sort_atoms(XX
,((cz
*GPU_NSUBCELL_Y
+ sub_y
) & 1),
1724 nbs
->a
+ash_y
,na_y
,x
,
1725 grid
->c0
[XX
]+cx
*grid
->sx
,grid
->inv_sx
,
1726 subdiv_x
*SGSF
,sort_work
);
1729 for(sub_x
=0; sub_x
<GPU_NSUBCELL_X
; sub_x
++)
1731 ash_x
= ash_y
+ sub_x
*subdiv_x
;
1732 na_x
= min(subdiv_x
,na
-(ash_x
-ash
));
1734 fill_cell(nbs
,grid
,nbat
,
1735 ash_x
,ash_x
+na_x
,atinfo
,x
,
1736 grid
->na_c
*(cx
*GPU_NSUBCELL_X
+sub_x
) + (dd_zone
>> 2),
1737 grid
->na_c
*(cy
*GPU_NSUBCELL_Y
+sub_y
) + (dd_zone
& 3),
1744 /* Set the unused atom indices to -1 */
1745 for(ind
=na
; ind
<ncz
*grid
->na_sc
; ind
++)
1747 nbs
->a
[ash
+ind
] = -1;
1752 /* Determine in which grid column atoms should go */
1753 static void calc_column_indices(nbnxn_grid_t
*grid
,
1755 rvec
*x
,const int *move
,
1756 int thread
,int nthread
,
1763 /* We add one extra cell for particles which moved during DD */
1764 for(i
=0; i
<grid
->ncx
*grid
->ncy
+1; i
++)
1769 n0
= a0
+ (int)((thread
+0)*(a1
- a0
))/nthread
;
1770 n1
= a0
+ (int)((thread
+1)*(a1
- a0
))/nthread
;
1771 for(i
=n0
; i
<n1
; i
++)
1773 if (move
== NULL
|| move
[i
] >= 0)
1775 /* We need to be careful with rounding,
1776 * particles might be a few bits outside the local box.
1777 * The int cast takes care of the lower bound,
1778 * we need to explicitly take care of the upper bound.
1780 cx
= (int)((x
[i
][XX
] - grid
->c0
[XX
])*grid
->inv_sx
);
1781 if (cx
== grid
->ncx
)
1785 cy
= (int)((x
[i
][YY
] - grid
->c0
[YY
])*grid
->inv_sy
);
1786 if (cy
== grid
->ncy
)
1790 /* For the moment cell contains only the, grid local,
1791 * x and y indices, not z.
1793 cell
[i
] = cx
*grid
->ncy
+ cy
;
1795 #ifdef DEBUG_NBNXN_GRIDDING
1796 if (cell
[i
] < 0 || cell
[i
] >= grid
->ncx
*grid
->ncy
)
1799 "grid cell cx %d cy %d out of range (max %d %d)",
1800 cx
,cy
,grid
->ncx
,grid
->ncy
);
1806 /* Put this moved particle after the end of the grid,
1807 * so we can process it later without using conditionals.
1809 cell
[i
] = grid
->ncx
*grid
->ncy
;
1816 /* Determine in which grid cells the atoms should go */
1817 static void calc_cell_indices(const nbnxn_search_t nbs
,
1824 nbnxn_atomdata_t
*nbat
)
1827 int cx
,cy
,cxy
,ncz_max
,ncz
;
1829 int *cxy_na
,cxy_na_i
;
1831 nthread
= gmx_omp_nthreads_get(emntPairsearch
);
1833 #pragma omp parallel for num_threads(nthread) schedule(static)
1834 for(thread
=0; thread
<nthread
; thread
++)
1836 calc_column_indices(grid
,a0
,a1
,x
,move
,thread
,nthread
,
1837 nbs
->cell
,nbs
->work
[thread
].cxy_na
);
1840 /* Make the cell index as a function of x and y */
1843 grid
->cxy_ind
[0] = 0;
1844 for(i
=0; i
<grid
->ncx
*grid
->ncy
+1; i
++)
1846 /* We set ncz_max at the beginning of the loop iso at the end
1847 * to skip i=grid->ncx*grid->ncy which are moved particles
1848 * that do not need to be ordered on the grid.
1854 cxy_na_i
= nbs
->work
[0].cxy_na
[i
];
1855 for(thread
=1; thread
<nthread
; thread
++)
1857 cxy_na_i
+= nbs
->work
[thread
].cxy_na
[i
];
1859 ncz
= (cxy_na_i
+ grid
->na_sc
- 1)/grid
->na_sc
;
1860 if (nbat
->XFormat
== nbatX8
)
1862 /* Make the number of cell a multiple of 2 */
1863 ncz
= (ncz
+ 1) & ~1;
1865 grid
->cxy_ind
[i
+1] = grid
->cxy_ind
[i
] + ncz
;
1866 /* Clear cxy_na, so we can reuse the array below */
1867 grid
->cxy_na
[i
] = 0;
1869 grid
->nc
= grid
->cxy_ind
[grid
->ncx
*grid
->ncy
] - grid
->cxy_ind
[0];
1871 nbat
->natoms
= (grid
->cell0
+ grid
->nc
)*grid
->na_sc
;
1875 fprintf(debug
,"ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1876 grid
->na_sc
,grid
->na_c
,grid
->nc
,
1877 grid
->ncx
,grid
->ncy
,grid
->nc
/((double)(grid
->ncx
*grid
->ncy
)),
1882 for(cy
=0; cy
<grid
->ncy
; cy
++)
1884 for(cx
=0; cx
<grid
->ncx
; cx
++)
1886 fprintf(debug
," %2d",grid
->cxy_ind
[i
+1]-grid
->cxy_ind
[i
]);
1889 fprintf(debug
,"\n");
1894 /* Make sure the work array for sorting is large enough */
1895 if (ncz_max
*grid
->na_sc
*SGSF
> nbs
->work
[0].sort_work_nalloc
)
1897 for(thread
=0; thread
<nbs
->nthread_max
; thread
++)
1899 nbs
->work
[thread
].sort_work_nalloc
=
1900 over_alloc_large(ncz_max
*grid
->na_sc
*SGSF
);
1901 srenew(nbs
->work
[thread
].sort_work
,
1902 nbs
->work
[thread
].sort_work_nalloc
);
1906 /* Now we know the dimensions we can fill the grid.
1907 * This is the first, unsorted fill. We sort the columns after this.
1909 for(i
=a0
; i
<a1
; i
++)
1911 /* At this point nbs->cell contains the local grid x,y indices */
1913 nbs
->a
[(grid
->cell0
+ grid
->cxy_ind
[cxy
])*grid
->na_sc
+ grid
->cxy_na
[cxy
]++] = i
;
1916 /* Set the cell indices for the moved particles */
1917 n0
= grid
->nc
*grid
->na_sc
;
1918 n1
= grid
->nc
*grid
->na_sc
+grid
->cxy_na
[grid
->ncx
*grid
->ncy
];
1919 for(i
=n0
; i
<n1
; i
++)
1921 nbs
->cell
[nbs
->a
[i
]] = i
;
1924 /* Sort the super-cell columns along z into the sub-cells. */
1925 #pragma omp parallel for num_threads(nbs->nthread_max) schedule(static)
1926 for(thread
=0; thread
<nbs
->nthread_max
; thread
++)
1930 sort_columns_simple(nbs
,dd_zone
,grid
,a0
,a1
,atinfo
,x
,nbat
,
1931 ((thread
+0)*grid
->ncx
*grid
->ncy
)/nthread
,
1932 ((thread
+1)*grid
->ncx
*grid
->ncy
)/nthread
,
1933 nbs
->work
[thread
].sort_work
);
1937 sort_columns_supersub(nbs
,dd_zone
,grid
,a0
,a1
,atinfo
,x
,nbat
,
1938 ((thread
+0)*grid
->ncx
*grid
->ncy
)/nthread
,
1939 ((thread
+1)*grid
->ncx
*grid
->ncy
)/nthread
,
1940 nbs
->work
[thread
].sort_work
);
1944 #ifdef NBNXN_SEARCH_SSE
1945 if (grid
->bSimple
&& nbat
->XFormat
== nbatX8
)
1947 combine_bounding_box_pairs(grid
,grid
->bb
);
1953 grid
->nsubc_tot
= 0;
1954 for(i
=0; i
<grid
->nc
; i
++)
1956 grid
->nsubc_tot
+= grid
->nsubc
[i
];
1964 print_bbsizes_simple(debug
,nbs
,grid
);
1968 fprintf(debug
,"ns non-zero sub-cells: %d average atoms %.2f\n",
1969 grid
->nsubc_tot
,(a1
-a0
)/(double)grid
->nsubc_tot
);
1971 print_bbsizes_supersub(debug
,nbs
,grid
);
1976 /* Reallocation wrapper function for nbnxn data structures */
1977 static void nb_realloc_void(void **ptr
,
1978 int nbytes_copy
,int nbytes_new
,
1979 gmx_nbat_alloc_t
*ma
,
1980 gmx_nbat_free_t
*mf
)
1984 ma(&ptr_new
,nbytes_new
);
1986 if (nbytes_new
> 0 && ptr_new
== NULL
)
1988 gmx_fatal(FARGS
, "Allocation of %d bytes failed", nbytes_new
);
1991 if (nbytes_copy
> 0)
1993 if (nbytes_new
< nbytes_copy
)
1995 gmx_incons("In nb_realloc_void: new size less than copy size");
1997 memcpy(ptr_new
,*ptr
,nbytes_copy
);
2006 /* NOTE: does not preserve the contents! */
2007 static void nb_realloc_int(int **ptr
,int n
,
2008 gmx_nbat_alloc_t
*ma
,
2009 gmx_nbat_free_t
*mf
)
2015 ma((void **)ptr
,n
*sizeof(**ptr
));
2018 /* NOTE: does not preserve the contents! */
2019 static void nb_realloc_real(real
**ptr
,int n
,
2020 gmx_nbat_alloc_t
*ma
,
2021 gmx_nbat_free_t
*mf
)
2027 ma((void **)ptr
,n
*sizeof(**ptr
));
2030 /* Reallocate the nbnxn_atomdata_t for a size of n atoms */
2031 static void nbnxn_atomdata_realloc(nbnxn_atomdata_t
*nbat
,int n
)
2035 nb_realloc_void((void **)&nbat
->type
,
2036 nbat
->natoms
*sizeof(*nbat
->type
),
2037 n
*sizeof(*nbat
->type
),
2038 nbat
->alloc
,nbat
->free
);
2039 nb_realloc_void((void **)&nbat
->lj_comb
,
2040 nbat
->natoms
*2*sizeof(*nbat
->lj_comb
),
2041 n
*2*sizeof(*nbat
->lj_comb
),
2042 nbat
->alloc
,nbat
->free
);
2043 if (nbat
->XFormat
!= nbatXYZQ
)
2045 nb_realloc_void((void **)&nbat
->q
,
2046 nbat
->natoms
*sizeof(*nbat
->q
),
2048 nbat
->alloc
,nbat
->free
);
2050 if (nbat
->nenergrp
> 1)
2052 nb_realloc_void((void **)&nbat
->energrp
,
2053 nbat
->natoms
/nbat
->na_c
*sizeof(*nbat
->energrp
),
2054 n
/nbat
->na_c
*sizeof(*nbat
->energrp
),
2055 nbat
->alloc
,nbat
->free
);
2057 nb_realloc_void((void **)&nbat
->x
,
2058 nbat
->natoms
*nbat
->xstride
*sizeof(*nbat
->x
),
2059 n
*nbat
->xstride
*sizeof(*nbat
->x
),
2060 nbat
->alloc
,nbat
->free
);
2061 for(t
=0; t
<nbat
->nout
; t
++)
2063 /* Allocate one element extra for possible signaling with CUDA */
2064 nb_realloc_void((void **)&nbat
->out
[t
].f
,
2065 nbat
->natoms
*nbat
->fstride
*sizeof(*nbat
->out
[t
].f
),
2066 n
*nbat
->fstride
*sizeof(*nbat
->out
[t
].f
),
2067 nbat
->alloc
,nbat
->free
);
2072 /* Sets up a grid and puts the atoms on the grid.
2073 * This function only operates on one domain of the domain decompostion.
2074 * Note that without domain decomposition there is only one domain.
2076 void nbnxn_put_on_grid(nbnxn_search_t nbs
,
2077 int ePBC
,matrix box
,
2079 rvec corner0
,rvec corner1
,
2084 int nmoved
,int *move
,
2086 nbnxn_atomdata_t
*nbat
)
2090 int nc_max_grid
,nc_max
;
2092 grid
= &nbs
->grid
[dd_zone
];
2094 nbs_cycle_start(&nbs
->cc
[enbsCCgrid
]);
2096 grid
->bSimple
= nbnxn_kernel_pairlist_simple(nb_kernel_type
);
2098 grid
->na_c
= kernel_to_ci_size(nb_kernel_type
);
2099 grid
->na_cj
= kernel_to_cj_size(nb_kernel_type
);
2100 grid
->na_sc
= (grid
->bSimple
? 1 : GPU_NSUBCELL
)*grid
->na_c
;
2101 grid
->na_c_2log
= get_2log(grid
->na_c
);
2103 nbat
->na_c
= grid
->na_c
;
2112 (nbs
->grid
[dd_zone
-1].cell0
+ nbs
->grid
[dd_zone
-1].nc
)*
2113 nbs
->grid
[dd_zone
-1].na_sc
/grid
->na_sc
;
2121 copy_mat(box
,nbs
->box
);
2123 if (atom_density
>= 0)
2125 grid
->atom_density
= atom_density
;
2129 grid
->atom_density
= grid_atom_density(n
-nmoved
,corner0
,corner1
);
2134 nbs
->natoms_local
= a1
- nmoved
;
2135 /* We assume that nbnxn_put_on_grid is called first
2136 * for the local atoms (dd_zone=0).
2138 nbs
->natoms_nonlocal
= a1
- nmoved
;
2142 nbs
->natoms_nonlocal
= max(nbs
->natoms_nonlocal
,a1
);
2145 nc_max_grid
= set_grid_size_xy(nbs
,grid
,n
-nmoved
,corner0
,corner1
,
2146 nbs
->grid
[0].atom_density
,
2149 nc_max
= grid
->cell0
+ nc_max_grid
;
2151 if (a1
> nbs
->cell_nalloc
)
2153 nbs
->cell_nalloc
= over_alloc_large(a1
);
2154 srenew(nbs
->cell
,nbs
->cell_nalloc
);
2157 /* To avoid conditionals we store the moved particles at the end of a,
2158 * make sure we have enough space.
2160 if (nc_max
*grid
->na_sc
+ nmoved
> nbs
->a_nalloc
)
2162 nbs
->a_nalloc
= over_alloc_large(nc_max
*grid
->na_sc
+ nmoved
);
2163 srenew(nbs
->a
,nbs
->a_nalloc
);
2166 if (nc_max
*grid
->na_sc
> nbat
->nalloc
)
2168 nbnxn_atomdata_realloc(nbat
,nc_max
*grid
->na_sc
);
2171 calc_cell_indices(nbs
,dd_zone
,grid
,a0
,a1
,atinfo
,x
,move
,nbat
);
2175 nbat
->natoms_local
= nbat
->natoms
;
2178 nbs_cycle_stop(&nbs
->cc
[enbsCCgrid
]);
2181 /* Calls nbnxn_put_on_grid for all non-local domains */
2182 void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs
,
2183 const gmx_domdec_zones_t
*zones
,
2187 nbnxn_atomdata_t
*nbat
)
2192 for(zone
=1; zone
<zones
->n
; zone
++)
2194 for(d
=0; d
<DIM
; d
++)
2196 c0
[d
] = zones
->size
[zone
].bb_x0
[d
];
2197 c1
[d
] = zones
->size
[zone
].bb_x1
[d
];
2200 nbnxn_put_on_grid(nbs
,nbs
->ePBC
,NULL
,
2202 zones
->cg_range
[zone
],
2203 zones
->cg_range
[zone
+1],
2213 /* Add simple grid type information to the local super/sub grid */
2214 void nbnxn_grid_add_simple(nbnxn_search_t nbs
,
2215 nbnxn_atomdata_t
*nbat
)
2221 grid
= &nbs
->grid
[0];
2225 gmx_incons("nbnxn_grid_simple called with a simple grid");
2228 ncd
= grid
->na_sc
/NBNXN_CPU_CLUSTER_I_SIZE
;
2230 if (grid
->nc
*ncd
> grid
->nc_nalloc_simple
)
2232 grid
->nc_nalloc_simple
= over_alloc_large(grid
->nc
*ncd
);
2233 srenew(grid
->bbcz_simple
,grid
->nc_nalloc_simple
*NNBSBB_D
);
2234 srenew(grid
->bb_simple
,grid
->nc_nalloc_simple
*NNBSBB_B
);
2235 srenew(grid
->flags_simple
,grid
->nc_nalloc_simple
);
2238 sfree_aligned(grid
->bbj
);
2239 snew_aligned(grid
->bbj
,grid
->nc_nalloc_simple
/2,16);
2243 bbcz
= grid
->bbcz_simple
;
2244 bb
= grid
->bb_simple
;
2246 #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
2247 for(sc
=0; sc
<grid
->nc
; sc
++)
2251 for(c
=0; c
<ncd
; c
++)
2255 na
= NBNXN_CPU_CLUSTER_I_SIZE
;
2257 nbat
->type
[tx
*NBNXN_CPU_CLUSTER_I_SIZE
+na
-1] == nbat
->ntype
-1)
2264 switch (nbat
->XFormat
)
2267 /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
2268 calc_bounding_box_x_x4(na
,nbat
->x
+tx
*STRIDE_P4
,
2272 /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
2273 calc_bounding_box_x_x8(na
,nbat
->x
+X8_IND_A(tx
*NBNXN_CPU_CLUSTER_I_SIZE
),
2277 calc_bounding_box(na
,nbat
->xstride
,
2278 nbat
->x
+tx
*NBNXN_CPU_CLUSTER_I_SIZE
*nbat
->xstride
,
2282 bbcz
[tx
*NNBSBB_D
+0] = bb
[tx
*NNBSBB_B
+ZZ
];
2283 bbcz
[tx
*NNBSBB_D
+1] = bb
[tx
*NNBSBB_B
+NNBSBB_C
+ZZ
];
2285 /* No interaction optimization yet here */
2286 grid
->flags_simple
[tx
] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
2290 grid
->flags_simple
[tx
] = 0;
2295 #ifdef NBNXN_SEARCH_SSE
2296 if (grid
->bSimple
&& nbat
->XFormat
== nbatX8
)
2298 combine_bounding_box_pairs(grid
,grid
->bb_simple
);
2303 void nbnxn_get_ncells(nbnxn_search_t nbs
,int *ncx
,int *ncy
)
2305 *ncx
= nbs
->grid
[0].ncx
;
2306 *ncy
= nbs
->grid
[0].ncy
;
2309 void nbnxn_get_atomorder(nbnxn_search_t nbs
,int **a
,int *n
)
2311 const nbnxn_grid_t
*grid
;
2313 grid
= &nbs
->grid
[0];
2315 /* Return the atom order for the home cell (index 0) */
2318 *n
= grid
->cxy_ind
[grid
->ncx
*grid
->ncy
]*grid
->na_sc
;
2321 void nbnxn_set_atomorder(nbnxn_search_t nbs
)
2324 int ao
,cx
,cy
,cxy
,cz
,j
;
2326 /* Set the atom order for the home cell (index 0) */
2327 grid
= &nbs
->grid
[0];
2330 for(cx
=0; cx
<grid
->ncx
; cx
++)
2332 for(cy
=0; cy
<grid
->ncy
; cy
++)
2334 cxy
= cx
*grid
->ncy
+ cy
;
2335 j
= grid
->cxy_ind
[cxy
]*grid
->na_sc
;
2336 for(cz
=0; cz
<grid
->cxy_na
[cxy
]; cz
++)
2347 /* Determines the cell range along one dimension that
2348 * the bounding box b0 - b1 sees.
2350 static void get_cell_range(real b0
,real b1
,
2351 int nc
,real c0
,real s
,real invs
,
2352 real d2
,real r2
,int *cf
,int *cl
)
2354 *cf
= max((int)((b0
- c0
)*invs
),0);
2356 while (*cf
> 0 && d2
+ sqr((b0
- c0
) - (*cf
-1+1)*s
) < r2
)
2361 *cl
= min((int)((b1
- c0
)*invs
),nc
-1);
2362 while (*cl
< nc
-1 && d2
+ sqr((*cl
+1)*s
- (b1
- c0
)) < r2
)
2368 /* Reference code calculating the distance^2 between two bounding boxes */
2369 static float box_dist2(float bx0
,float bx1
,float by0
,
2370 float by1
,float bz0
,float bz1
,
2378 dl
= bx0
- bb
[BBU_X
];
2379 dh
= bb
[BBL_X
] - bx1
;
2384 dl
= by0
- bb
[BBU_Y
];
2385 dh
= bb
[BBL_Y
] - by1
;
2390 dl
= bz0
- bb
[BBU_Z
];
2391 dh
= bb
[BBL_Z
] - bz1
;
2399 /* Plain C code calculating the distance^2 between two bounding boxes */
2400 static float subc_bb_dist2(int si
,const float *bb_i_ci
,
2401 int csj
,const float *bb_j_all
)
2403 const float *bb_i
,*bb_j
;
2407 bb_i
= bb_i_ci
+ si
*NNBSBB_B
;
2408 bb_j
= bb_j_all
+ csj
*NNBSBB_B
;
2412 dl
= bb_i
[BBL_X
] - bb_j
[BBU_X
];
2413 dh
= bb_j
[BBL_X
] - bb_i
[BBU_X
];
2418 dl
= bb_i
[BBL_Y
] - bb_j
[BBU_Y
];
2419 dh
= bb_j
[BBL_Y
] - bb_i
[BBU_Y
];
2424 dl
= bb_i
[BBL_Z
] - bb_j
[BBU_Z
];
2425 dh
= bb_j
[BBL_Z
] - bb_i
[BBU_Z
];
2433 #ifdef NBNXN_SEARCH_SSE
2435 /* SSE code for bb distance for bb format xyz0 */
2436 static float subc_bb_dist2_sse(int na_c
,
2437 int si
,const float *bb_i_ci
,
2438 int csj
,const float *bb_j_all
)
2440 const float *bb_i
,*bb_j
;
2442 __m128 bb_i_SSE0
,bb_i_SSE1
;
2443 __m128 bb_j_SSE0
,bb_j_SSE1
;
2449 #ifndef GMX_X86_SSE4_1
2450 float d2_array
[7],*d2_align
;
2452 d2_align
= (float *)(((size_t)(d2_array
+3)) & (~((size_t)15)));
2457 bb_i
= bb_i_ci
+ si
*NNBSBB_B
;
2458 bb_j
= bb_j_all
+ csj
*NNBSBB_B
;
2460 bb_i_SSE0
= _mm_load_ps(bb_i
);
2461 bb_i_SSE1
= _mm_load_ps(bb_i
+NNBSBB_C
);
2462 bb_j_SSE0
= _mm_load_ps(bb_j
);
2463 bb_j_SSE1
= _mm_load_ps(bb_j
+NNBSBB_C
);
2465 dl_SSE
= _mm_sub_ps(bb_i_SSE0
,bb_j_SSE1
);
2466 dh_SSE
= _mm_sub_ps(bb_j_SSE0
,bb_i_SSE1
);
2468 dm_SSE
= _mm_max_ps(dl_SSE
,dh_SSE
);
2469 dm0_SSE
= _mm_max_ps(dm_SSE
,_mm_setzero_ps());
2470 #ifndef GMX_X86_SSE4_1
2471 d2_SSE
= _mm_mul_ps(dm0_SSE
,dm0_SSE
);
2473 _mm_store_ps(d2_align
,d2_SSE
);
2475 return d2_align
[0] + d2_align
[1] + d2_align
[2];
2477 /* SSE4.1 dot product of components 0,1,2 */
2478 d2_SSE
= _mm_dp_ps(dm0_SSE
,dm0_SSE
,0x71);
2480 _mm_store_ss(&d2
,d2_SSE
);
2486 /* SSE code for nsi bb distances for bb format xxxxyyyyzzzz */
2487 static void subc_bb_dist2_sse_xxxx(const float *bb_j
,
2488 int nsi
,const float *bb_i
,
2494 __m128 xj_l
,yj_l
,zj_l
;
2495 __m128 xj_h
,yj_h
,zj_h
;
2496 __m128 xi_l
,yi_l
,zi_l
;
2497 __m128 xi_h
,yi_h
,zi_h
;
2499 __m128 dx_0
,dy_0
,dz_0
;
2500 __m128 dx_1
,dy_1
,dz_1
;
2510 zero
= _mm_setzero_ps();
2512 xj_l
= _mm_load1_ps(bb_j
+0*SSE_F_WIDTH
);
2513 yj_l
= _mm_load1_ps(bb_j
+1*SSE_F_WIDTH
);
2514 zj_l
= _mm_load1_ps(bb_j
+2*SSE_F_WIDTH
);
2515 xj_h
= _mm_load1_ps(bb_j
+3*SSE_F_WIDTH
);
2516 yj_h
= _mm_load1_ps(bb_j
+4*SSE_F_WIDTH
);
2517 zj_h
= _mm_load1_ps(bb_j
+5*SSE_F_WIDTH
);
2519 for(si
=0; si
<nsi
; si
+=SSE_F_WIDTH
)
2521 shi
= si
*NNBSBB_D
*DIM
;
2523 xi_l
= _mm_load_ps(bb_i
+shi
+0*SSE_F_WIDTH
);
2524 yi_l
= _mm_load_ps(bb_i
+shi
+1*SSE_F_WIDTH
);
2525 zi_l
= _mm_load_ps(bb_i
+shi
+2*SSE_F_WIDTH
);
2526 xi_h
= _mm_load_ps(bb_i
+shi
+3*SSE_F_WIDTH
);
2527 yi_h
= _mm_load_ps(bb_i
+shi
+4*SSE_F_WIDTH
);
2528 zi_h
= _mm_load_ps(bb_i
+shi
+5*SSE_F_WIDTH
);
2530 dx_0
= _mm_sub_ps(xi_l
,xj_h
);
2531 dy_0
= _mm_sub_ps(yi_l
,yj_h
);
2532 dz_0
= _mm_sub_ps(zi_l
,zj_h
);
2534 dx_1
= _mm_sub_ps(xj_l
,xi_h
);
2535 dy_1
= _mm_sub_ps(yj_l
,yi_h
);
2536 dz_1
= _mm_sub_ps(zj_l
,zi_h
);
2538 mx
= _mm_max_ps(dx_0
,dx_1
);
2539 my
= _mm_max_ps(dy_0
,dy_1
);
2540 mz
= _mm_max_ps(dz_0
,dz_1
);
2542 m0x
= _mm_max_ps(mx
,zero
);
2543 m0y
= _mm_max_ps(my
,zero
);
2544 m0z
= _mm_max_ps(mz
,zero
);
2546 d2x
= _mm_mul_ps(m0x
,m0x
);
2547 d2y
= _mm_mul_ps(m0y
,m0y
);
2548 d2z
= _mm_mul_ps(m0z
,m0z
);
2550 d2s
= _mm_add_ps(d2x
,d2y
);
2551 d2t
= _mm_add_ps(d2s
,d2z
);
2553 _mm_store_ps(d2
+si
,d2t
);
2557 #endif /* NBNXN_SEARCH_SSE */
2559 /* Plain C function which determines if any atom pair between two cells
2560 * is within distance sqrt(rl2).
2562 static gmx_bool
subc_in_range_x(int na_c
,
2563 int si
,const real
*x_i
,
2564 int csj
,int stride
,const real
*x_j
,
2570 for(i
=0; i
<na_c
; i
++)
2572 i0
= (si
*na_c
+ i
)*DIM
;
2573 for(j
=0; j
<na_c
; j
++)
2575 j0
= (csj
*na_c
+ j
)*stride
;
2577 d2
= sqr(x_i
[i0
] - x_j
[j0
]) +
2578 sqr(x_i
[i0
+1] - x_j
[j0
+1]) +
2579 sqr(x_i
[i0
+2] - x_j
[j0
+2]);
2591 /* SSE function which determines if any atom pair between two cells,
2592 * both with 8 atoms, is within distance sqrt(rl2).
2594 static gmx_bool
subc_in_range_sse8(int na_c
,
2595 int si
,const real
*x_i
,
2596 int csj
,int stride
,const real
*x_j
,
2599 #ifdef NBNXN_SEARCH_SSE_SINGLE
2600 __m128 ix_SSE0
,iy_SSE0
,iz_SSE0
;
2601 __m128 ix_SSE1
,iy_SSE1
,iz_SSE1
;
2602 __m128 jx0_SSE
,jy0_SSE
,jz0_SSE
;
2603 __m128 jx1_SSE
,jy1_SSE
,jz1_SSE
;
2605 __m128 dx_SSE0
,dy_SSE0
,dz_SSE0
;
2606 __m128 dx_SSE1
,dy_SSE1
,dz_SSE1
;
2607 __m128 dx_SSE2
,dy_SSE2
,dz_SSE2
;
2608 __m128 dx_SSE3
,dy_SSE3
,dz_SSE3
;
2619 __m128 wco_any_SSE01
,wco_any_SSE23
,wco_any_SSE
;
2626 rc2_SSE
= _mm_set1_ps(rl2
);
2629 ix_SSE0
= _mm_load_ps(x_i
+(si
*na_c_sse
*DIM
+0)*SSE_F_WIDTH
);
2630 iy_SSE0
= _mm_load_ps(x_i
+(si
*na_c_sse
*DIM
+1)*SSE_F_WIDTH
);
2631 iz_SSE0
= _mm_load_ps(x_i
+(si
*na_c_sse
*DIM
+2)*SSE_F_WIDTH
);
2632 ix_SSE1
= _mm_load_ps(x_i
+(si
*na_c_sse
*DIM
+3)*SSE_F_WIDTH
);
2633 iy_SSE1
= _mm_load_ps(x_i
+(si
*na_c_sse
*DIM
+4)*SSE_F_WIDTH
);
2634 iz_SSE1
= _mm_load_ps(x_i
+(si
*na_c_sse
*DIM
+5)*SSE_F_WIDTH
);
2636 /* We loop from the outer to the inner particles to maximize
2637 * the chance that we find a pair in range quickly and return.
2643 jx0_SSE
= _mm_load1_ps(x_j
+j0
*stride
+0);
2644 jy0_SSE
= _mm_load1_ps(x_j
+j0
*stride
+1);
2645 jz0_SSE
= _mm_load1_ps(x_j
+j0
*stride
+2);
2647 jx1_SSE
= _mm_load1_ps(x_j
+j1
*stride
+0);
2648 jy1_SSE
= _mm_load1_ps(x_j
+j1
*stride
+1);
2649 jz1_SSE
= _mm_load1_ps(x_j
+j1
*stride
+2);
2651 /* Calculate distance */
2652 dx_SSE0
= _mm_sub_ps(ix_SSE0
,jx0_SSE
);
2653 dy_SSE0
= _mm_sub_ps(iy_SSE0
,jy0_SSE
);
2654 dz_SSE0
= _mm_sub_ps(iz_SSE0
,jz0_SSE
);
2655 dx_SSE1
= _mm_sub_ps(ix_SSE1
,jx0_SSE
);
2656 dy_SSE1
= _mm_sub_ps(iy_SSE1
,jy0_SSE
);
2657 dz_SSE1
= _mm_sub_ps(iz_SSE1
,jz0_SSE
);
2658 dx_SSE2
= _mm_sub_ps(ix_SSE0
,jx1_SSE
);
2659 dy_SSE2
= _mm_sub_ps(iy_SSE0
,jy1_SSE
);
2660 dz_SSE2
= _mm_sub_ps(iz_SSE0
,jz1_SSE
);
2661 dx_SSE3
= _mm_sub_ps(ix_SSE1
,jx1_SSE
);
2662 dy_SSE3
= _mm_sub_ps(iy_SSE1
,jy1_SSE
);
2663 dz_SSE3
= _mm_sub_ps(iz_SSE1
,jz1_SSE
);
2665 /* rsq = dx*dx+dy*dy+dz*dz */
2666 rsq_SSE0
= gmx_mm_calc_rsq_ps(dx_SSE0
,dy_SSE0
,dz_SSE0
);
2667 rsq_SSE1
= gmx_mm_calc_rsq_ps(dx_SSE1
,dy_SSE1
,dz_SSE1
);
2668 rsq_SSE2
= gmx_mm_calc_rsq_ps(dx_SSE2
,dy_SSE2
,dz_SSE2
);
2669 rsq_SSE3
= gmx_mm_calc_rsq_ps(dx_SSE3
,dy_SSE3
,dz_SSE3
);
2671 wco_SSE0
= _mm_cmplt_ps(rsq_SSE0
,rc2_SSE
);
2672 wco_SSE1
= _mm_cmplt_ps(rsq_SSE1
,rc2_SSE
);
2673 wco_SSE2
= _mm_cmplt_ps(rsq_SSE2
,rc2_SSE
);
2674 wco_SSE3
= _mm_cmplt_ps(rsq_SSE3
,rc2_SSE
);
2676 wco_any_SSE01
= _mm_or_ps(wco_SSE0
,wco_SSE1
);
2677 wco_any_SSE23
= _mm_or_ps(wco_SSE2
,wco_SSE3
);
2678 wco_any_SSE
= _mm_or_ps(wco_any_SSE01
,wco_any_SSE23
);
2680 if (_mm_movemask_ps(wco_any_SSE
))
2692 gmx_incons("SSE function called without SSE support");
2698 /* Returns the j sub-cell for index cj_ind */
2699 static int nbl_cj(const nbnxn_pairlist_t
*nbl
,int cj_ind
)
2701 return nbl
->cj4
[cj_ind
>>2].cj
[cj_ind
& 3];
2704 /* Returns the i-interaction mask of the j sub-cell for index cj_ind */
2705 static unsigned nbl_imask0(const nbnxn_pairlist_t
*nbl
,int cj_ind
)
2707 return nbl
->cj4
[cj_ind
>>2].imei
[0].imask
;
2710 /* Ensures there is enough space for extra extra exclusion masks */
2711 static void check_excl_space(nbnxn_pairlist_t
*nbl
,int extra
)
2713 if (nbl
->nexcl
+extra
> nbl
->excl_nalloc
)
2715 nbl
->excl_nalloc
= over_alloc_small(nbl
->nexcl
+extra
);
2716 nb_realloc_void((void **)&nbl
->excl
,
2717 nbl
->nexcl
*sizeof(*nbl
->excl
),
2718 nbl
->excl_nalloc
*sizeof(*nbl
->excl
),
2719 nbl
->alloc
,nbl
->free
);
2723 /* Ensures there is enough space for ncell extra j-cells in the list */
2724 static void check_subcell_list_space_simple(nbnxn_pairlist_t
*nbl
,
2729 cj_max
= nbl
->ncj
+ ncell
;
2731 if (cj_max
> nbl
->cj_nalloc
)
2733 nbl
->cj_nalloc
= over_alloc_small(cj_max
);
2734 nb_realloc_void((void **)&nbl
->cj
,
2735 nbl
->ncj
*sizeof(*nbl
->cj
),
2736 nbl
->cj_nalloc
*sizeof(*nbl
->cj
),
2737 nbl
->alloc
,nbl
->free
);
2741 /* Ensures there is enough space for ncell extra j-subcells in the list */
2742 static void check_subcell_list_space_supersub(nbnxn_pairlist_t
*nbl
,
2745 int ncj4_max
,j4
,j
,w
,t
;
2748 #define WARP_SIZE 32
2750 /* We can have maximally nsupercell*GPU_NSUBCELL sj lists */
2751 /* We can store 4 j-subcell - i-supercell pairs in one struct.
2752 * since we round down, we need one extra entry.
2754 ncj4_max
= ((nbl
->work
->cj_ind
+ nsupercell
*GPU_NSUBCELL
+ 4-1) >> 2);
2756 if (ncj4_max
> nbl
->cj4_nalloc
)
2758 nbl
->cj4_nalloc
= over_alloc_small(ncj4_max
);
2759 nb_realloc_void((void **)&nbl
->cj4
,
2760 nbl
->work
->cj4_init
*sizeof(*nbl
->cj4
),
2761 nbl
->cj4_nalloc
*sizeof(*nbl
->cj4
),
2762 nbl
->alloc
,nbl
->free
);
2765 if (ncj4_max
> nbl
->work
->cj4_init
)
2767 for(j4
=nbl
->work
->cj4_init
; j4
<ncj4_max
; j4
++)
2769 /* No i-subcells and no excl's in the list initially */
2770 for(w
=0; w
<NWARP
; w
++)
2772 nbl
->cj4
[j4
].imei
[w
].imask
= 0U;
2773 nbl
->cj4
[j4
].imei
[w
].excl_ind
= 0;
2777 nbl
->work
->cj4_init
= ncj4_max
;
2781 /* Default nbnxn allocation routine, allocates 32 byte aligned,
2782 * which works for plain C and aligned SSE and AVX loads/stores.
2784 static void nbnxn_alloc_aligned(void **ptr
,size_t nbytes
)
2786 *ptr
= save_malloc_aligned("ptr",__FILE__
,__LINE__
,nbytes
,1,32);
2789 /* Free function for memory allocated with nbnxn_alloc_aligned */
2790 static void nbnxn_free_aligned(void *ptr
)
2795 /* Set all excl masks for one GPU warp no exclusions */
2796 static void set_no_excls(nbnxn_excl_t
*excl
)
2800 for(t
=0; t
<WARP_SIZE
; t
++)
2802 /* Turn all interaction bits on */
2803 excl
->pair
[t
] = NBNXN_INT_MASK_ALL
;
2807 /* Initializes a single nbnxn_pairlist_t data structure */
2808 static void nbnxn_init_pairlist(nbnxn_pairlist_t
*nbl
,
2810 gmx_nbat_alloc_t
*alloc
,
2811 gmx_nbat_free_t
*free
)
2815 nbl
->alloc
= nbnxn_alloc_aligned
;
2823 nbl
->free
= nbnxn_free_aligned
;
2830 nbl
->bSimple
= bSimple
;
2841 /* We need one element extra in sj, so alloc initially with 1 */
2842 nbl
->cj4_nalloc
= 0;
2849 nbl
->excl_nalloc
= 0;
2851 check_excl_space(nbl
,1);
2853 set_no_excls(&nbl
->excl
[0]);
2858 snew_aligned(nbl
->work
->bb_ci
,GPU_NSUBCELL
/SSE_F_WIDTH
*NNBSBB_XXXX
,16);
2860 snew_aligned(nbl
->work
->bb_ci
,GPU_NSUBCELL
*NNBSBB_B
,16);
2862 snew_aligned(nbl
->work
->x_ci
,NBNXN_NA_SC_MAX
*DIM
,16);
2863 #ifdef NBNXN_SEARCH_SSE
2864 snew_aligned(nbl
->work
->x_ci_x86_simd128
,1,16);
2865 #ifdef GMX_X86_AVX_256
2866 snew_aligned(nbl
->work
->x_ci_x86_simd256
,1,32);
2869 snew_aligned(nbl
->work
->d2
,GPU_NSUBCELL
,16);
2872 void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t
*nbl_list
,
2873 gmx_bool bSimple
, gmx_bool bCombined
,
2874 gmx_nbat_alloc_t
*alloc
,
2875 gmx_nbat_free_t
*free
)
2879 nbl_list
->bSimple
= bSimple
;
2880 nbl_list
->bCombined
= bCombined
;
2882 nbl_list
->nnbl
= gmx_omp_nthreads_get(emntNonbonded
);
2884 snew(nbl_list
->nbl
,nbl_list
->nnbl
);
2885 /* Execute in order to avoid memory interleaving between threads */
2886 #pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
2887 for(i
=0; i
<nbl_list
->nnbl
; i
++)
2889 /* Allocate the nblist data structure locally on each thread
2890 * to optimize memory access for NUMA architectures.
2892 snew(nbl_list
->nbl
[i
],1);
2894 /* Only list 0 is used on the GPU, use normal allocation for i>0 */
2897 nbnxn_init_pairlist(nbl_list
->nbl
[i
],nbl_list
->bSimple
,alloc
,free
);
2901 nbnxn_init_pairlist(nbl_list
->nbl
[i
],nbl_list
->bSimple
,NULL
,NULL
);
2906 /* Print statistics of a pair list, used for debug output */
2907 static void print_nblist_statistics_simple(FILE *fp
,const nbnxn_pairlist_t
*nbl
,
2908 const nbnxn_search_t nbs
,real rl
)
2910 const nbnxn_grid_t
*grid
;
2915 /* This code only produces correct statistics with domain decomposition */
2916 grid
= &nbs
->grid
[0];
2918 fprintf(fp
,"nbl nci %d ncj %d\n",
2920 fprintf(fp
,"nbl na_sc %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
2921 nbl
->na_sc
,rl
,nbl
->ncj
,nbl
->ncj
/(double)grid
->nc
,
2922 nbl
->ncj
/(double)grid
->nc
*grid
->na_sc
,
2923 nbl
->ncj
/(double)grid
->nc
*grid
->na_sc
/(0.5*4.0/3.0*M_PI
*rl
*rl
*rl
*grid
->nc
*grid
->na_sc
/det(nbs
->box
)));
2925 fprintf(fp
,"nbl average j cell list length %.1f\n",
2926 0.25*nbl
->ncj
/(double)nbl
->nci
);
2928 for(s
=0; s
<SHIFTS
; s
++)
2933 for(i
=0; i
<nbl
->nci
; i
++)
2935 cs
[nbl
->ci
[i
].shift
& NBNXN_CI_SHIFT
] +=
2936 nbl
->ci
[i
].cj_ind_end
- nbl
->ci
[i
].cj_ind_start
;
2938 j
= nbl
->ci
[i
].cj_ind_start
;
2939 while (j
< nbl
->ci
[i
].cj_ind_end
&&
2940 nbl
->cj
[j
].excl
!= NBNXN_INT_MASK_ALL
)
2946 fprintf(fp
,"nbl cell pairs, total: %d excl: %d %.1f%%\n",
2947 nbl
->ncj
,npexcl
,100*npexcl
/(double)nbl
->ncj
);
2948 for(s
=0; s
<SHIFTS
; s
++)
2952 fprintf(fp
,"nbl shift %2d ncj %3d\n",s
,cs
[s
]);
2957 /* Print statistics of a pair lists, used for debug output */
2958 static void print_nblist_statistics_supersub(FILE *fp
,const nbnxn_pairlist_t
*nbl
,
2959 const nbnxn_search_t nbs
,real rl
)
2961 const nbnxn_grid_t
*grid
;
2963 int c
[GPU_NSUBCELL
+1];
2965 /* This code only produces correct statistics with domain decomposition */
2966 grid
= &nbs
->grid
[0];
2968 fprintf(fp
,"nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
2969 nbl
->nsci
,nbl
->ncj4
,nbl
->nci_tot
,nbl
->nexcl
);
2970 fprintf(fp
,"nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
2971 nbl
->na_ci
,rl
,nbl
->nci_tot
,nbl
->nci_tot
/(double)grid
->nsubc_tot
,
2972 nbl
->nci_tot
/(double)grid
->nsubc_tot
*grid
->na_c
,
2973 nbl
->nci_tot
/(double)grid
->nsubc_tot
*grid
->na_c
/(0.5*4.0/3.0*M_PI
*rl
*rl
*rl
*grid
->nsubc_tot
*grid
->na_c
/det(nbs
->box
)));
2975 fprintf(fp
,"nbl average j super cell list length %.1f\n",
2976 0.25*nbl
->ncj4
/(double)nbl
->nsci
);
2977 fprintf(fp
,"nbl average i sub cell list length %.1f\n",
2978 nbl
->nci_tot
/(0.25*nbl
->ncj4
));
2980 for(si
=0; si
<=GPU_NSUBCELL
; si
++)
2984 for(i
=0; i
<nbl
->nsci
; i
++)
2986 for(j4
=nbl
->sci
[i
].cj4_ind_start
; j4
<nbl
->sci
[i
].cj4_ind_end
; j4
++)
2991 for(si
=0; si
<GPU_NSUBCELL
; si
++)
2993 if (nbl
->cj4
[j4
].imei
[0].imask
& (1U << (j
*GPU_NSUBCELL
+ si
)))
3002 for(b
=0; b
<=GPU_NSUBCELL
; b
++)
3004 fprintf(fp
,"nbl j-list #i-subcell %d %7d %4.1f\n",
3005 b
,c
[b
],100.0*c
[b
]/(double)(nbl
->ncj4
*NBNXN_GPU_JGROUP_SIZE
));
3009 /* Print the full pair list, used for debug output */
3010 static void print_supersub_nsp(const char *fn
,
3011 const nbnxn_pairlist_t
*nbl
,
3018 sprintf(buf
,"%s_%s.xvg",fn
,NONLOCAL_I(iloc
) ? "nl" : "l");
3019 fp
= ffopen(buf
,"w");
3021 for(i
=0; i
<nbl
->nci
; i
++)
3024 for(j4
=nbl
->sci
[i
].cj4_ind_start
; j4
<nbl
->sci
[i
].cj4_ind_end
; j4
++)
3026 for(p
=0; p
<NBNXN_GPU_JGROUP_SIZE
*GPU_NSUBCELL
; p
++)
3028 nsp
+= (nbl
->cj4
[j4
].imei
[0].imask
>> p
) & 1;
3031 fprintf(fp
,"%4d %3d %3d\n",
3034 nbl
->sci
[i
].cj4_ind_end
-nbl
->sci
[i
].cj4_ind_start
);
3040 /* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp */
3041 static void low_get_nbl_exclusions(nbnxn_pairlist_t
*nbl
,int cj4
,
3042 int warp
,nbnxn_excl_t
**excl
)
3044 if (nbl
->cj4
[cj4
].imei
[warp
].excl_ind
== 0)
3046 /* No exclusions set, make a new list entry */
3047 nbl
->cj4
[cj4
].imei
[warp
].excl_ind
= nbl
->nexcl
;
3049 *excl
= &nbl
->excl
[nbl
->cj4
[cj4
].imei
[warp
].excl_ind
];
3050 set_no_excls(*excl
);
3054 /* We already have some exclusions, new ones can be added to the list */
3055 *excl
= &nbl
->excl
[nbl
->cj4
[cj4
].imei
[warp
].excl_ind
];
3059 /* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp,
3060 * allocates extra memory, if necessary.
3062 static void get_nbl_exclusions_1(nbnxn_pairlist_t
*nbl
,int cj4
,
3063 int warp
,nbnxn_excl_t
**excl
)
3065 if (nbl
->cj4
[cj4
].imei
[warp
].excl_ind
== 0)
3067 /* We need to make a new list entry, check if we have space */
3068 check_excl_space(nbl
,1);
3070 low_get_nbl_exclusions(nbl
,cj4
,warp
,excl
);
3073 /* Returns pointers to the exclusion mask for cj4-unit cj4 for both warps,
3074 * allocates extra memory, if necessary.
3076 static void get_nbl_exclusions_2(nbnxn_pairlist_t
*nbl
,int cj4
,
3077 nbnxn_excl_t
**excl_w0
,
3078 nbnxn_excl_t
**excl_w1
)
3080 /* Check for space we might need */
3081 check_excl_space(nbl
,2);
3083 low_get_nbl_exclusions(nbl
,cj4
,0,excl_w0
);
3084 low_get_nbl_exclusions(nbl
,cj4
,1,excl_w1
);
3087 /* Sets the self exclusions i=j and pair exclusions i>j */
3088 static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t
*nbl
,
3089 int cj4_ind
,int sj_offset
,
3092 nbnxn_excl_t
*excl
[2];
3095 /* Here we only set the set self and double pair exclusions */
3097 get_nbl_exclusions_2(nbl
,cj4_ind
,&excl
[0],&excl
[1]);
3099 /* Only minor < major bits set */
3100 for(ej
=0; ej
<nbl
->na_ci
; ej
++)
3103 for(ei
=ej
; ei
<nbl
->na_ci
; ei
++)
3105 excl
[w
]->pair
[(ej
&(4-1))*nbl
->na_ci
+ei
] &=
3106 ~(1U << (sj_offset
*GPU_NSUBCELL
+si
));
3111 /* Returns a diagonal or off-diagonal interaction mask for plain C lists */
3112 static unsigned int get_imask(gmx_bool rdiag
,int ci
,int cj
)
3114 return (rdiag
&& ci
== cj
? NBNXN_INT_MASK_DIAG
: NBNXN_INT_MASK_ALL
);
3117 #ifdef NBNXN_SEARCH_SSE
3118 /* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists */
3119 static unsigned int get_imask_x86_simd128(gmx_bool rdiag
,int ci
,int cj
)
3121 #ifndef GMX_DOUBLE /* cj-size = 4 */
3122 return (rdiag
&& ci
== cj
? NBNXN_INT_MASK_DIAG
: NBNXN_INT_MASK_ALL
);
3123 #else /* cj-size = 2 */
3124 return (rdiag
&& ci
*2 == cj
? NBNXN_INT_MASK_DIAG_J2_0
:
3125 (rdiag
&& ci
*2+1 == cj
? NBNXN_INT_MASK_DIAG_J2_1
:
3126 NBNXN_INT_MASK_ALL
));
3130 #ifdef GMX_X86_AVX_256
3131 /* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists */
3132 static unsigned int get_imask_x86_simd256(gmx_bool rdiag
,int ci
,int cj
)
3134 #ifndef GMX_DOUBLE /* cj-size = 8 */
3135 return (rdiag
&& ci
== cj
*2 ? NBNXN_INT_MASK_DIAG_J8_0
:
3136 (rdiag
&& ci
== cj
*2+1 ? NBNXN_INT_MASK_DIAG_J8_1
:
3137 NBNXN_INT_MASK_ALL
));
3138 #else /* cj-size = 2 */
3139 return (rdiag
&& ci
== cj
? NBNXN_INT_MASK_DIAG
: NBNXN_INT_MASK_ALL
);
3143 #endif /* NBNXN_SEARCH_SSE */
3145 /* Plain C code for making a pair list of cell ci vs cell cjf-cjl.
3146 * Checks bounding box distances and possibly atom pair distances.
3148 static void make_cluster_list_simple(const nbnxn_grid_t
*gridj
,
3149 nbnxn_pairlist_t
*nbl
,
3150 int ci
,int cjf
,int cjl
,
3151 gmx_bool remove_sub_diag
,
3153 real rl2
,float rbb2
,
3156 const nbnxn_list_work_t
*work
;
3163 int cjf_gl
,cjl_gl
,cj
;
3167 bb_ci
= nbl
->work
->bb_ci
;
3168 x_ci
= nbl
->work
->x_ci
;
3171 while (!InRange
&& cjf
<= cjl
)
3173 d2
= subc_bb_dist2(0,bb_ci
,cjf
,gridj
->bb
);
3176 /* Check if the distance is within the distance where
3177 * we use only the bounding box distance rbb,
3178 * or within the cut-off and there is at least one atom pair
3179 * within the cut-off.
3189 cjf_gl
= gridj
->cell0
+ cjf
;
3190 for(i
=0; i
<NBNXN_CPU_CLUSTER_I_SIZE
&& !InRange
; i
++)
3192 for(j
=0; j
<NBNXN_CPU_CLUSTER_I_SIZE
; j
++)
3194 InRange
= InRange
||
3195 (sqr(x_ci
[i
*STRIDE_XYZ
+XX
] - x_j
[(cjf_gl
*NBNXN_CPU_CLUSTER_I_SIZE
+j
)*STRIDE_XYZ
+XX
]) +
3196 sqr(x_ci
[i
*STRIDE_XYZ
+YY
] - x_j
[(cjf_gl
*NBNXN_CPU_CLUSTER_I_SIZE
+j
)*STRIDE_XYZ
+YY
]) +
3197 sqr(x_ci
[i
*STRIDE_XYZ
+ZZ
] - x_j
[(cjf_gl
*NBNXN_CPU_CLUSTER_I_SIZE
+j
)*STRIDE_XYZ
+ZZ
]) < rl2
);
3200 *ndistc
+= NBNXN_CPU_CLUSTER_I_SIZE
*NBNXN_CPU_CLUSTER_I_SIZE
;
3213 while (!InRange
&& cjl
> cjf
)
3215 d2
= subc_bb_dist2(0,bb_ci
,cjl
,gridj
->bb
);
3218 /* Check if the distance is within the distance where
3219 * we use only the bounding box distance rbb,
3220 * or within the cut-off and there is at least one atom pair
3221 * within the cut-off.
3231 cjl_gl
= gridj
->cell0
+ cjl
;
3232 for(i
=0; i
<NBNXN_CPU_CLUSTER_I_SIZE
&& !InRange
; i
++)
3234 for(j
=0; j
<NBNXN_CPU_CLUSTER_I_SIZE
; j
++)
3236 InRange
= InRange
||
3237 (sqr(x_ci
[i
*STRIDE_XYZ
+XX
] - x_j
[(cjl_gl
*NBNXN_CPU_CLUSTER_I_SIZE
+j
)*STRIDE_XYZ
+XX
]) +
3238 sqr(x_ci
[i
*STRIDE_XYZ
+YY
] - x_j
[(cjl_gl
*NBNXN_CPU_CLUSTER_I_SIZE
+j
)*STRIDE_XYZ
+YY
]) +
3239 sqr(x_ci
[i
*STRIDE_XYZ
+ZZ
] - x_j
[(cjl_gl
*NBNXN_CPU_CLUSTER_I_SIZE
+j
)*STRIDE_XYZ
+ZZ
]) < rl2
);
3242 *ndistc
+= NBNXN_CPU_CLUSTER_I_SIZE
*NBNXN_CPU_CLUSTER_I_SIZE
;
3252 for(cj
=cjf
; cj
<=cjl
; cj
++)
3254 /* Store cj and the interaction mask */
3255 nbl
->cj
[nbl
->ncj
].cj
= gridj
->cell0
+ cj
;
3256 nbl
->cj
[nbl
->ncj
].excl
= get_imask(remove_sub_diag
,ci
,cj
);
3259 /* Increase the closing index in i super-cell list */
3260 nbl
->ci
[nbl
->nci
].cj_ind_end
= nbl
->ncj
;
3264 #ifdef NBNXN_SEARCH_SSE
3265 /* Include make_cluster_list_x86_simd128/256 */
3266 #define GMX_MM128_HERE
3267 #include "gmx_x86_simd_macros.h"
3268 #define STRIDE_S PACK_X4
3269 #include "nbnxn_search_x86_simd.h"
3271 #undef GMX_MM128_HERE
3272 #ifdef GMX_X86_AVX_256
3273 /* Include make_cluster_list_x86_simd128/256 */
3274 #define GMX_MM256_HERE
3275 #include "gmx_x86_simd_macros.h"
3276 #define STRIDE_S GMX_X86_SIMD_WIDTH_HERE
3277 #include "nbnxn_search_x86_simd.h"
3279 #undef GMX_MM256_HERE
3283 /* Plain C or SSE code for making a pair list of super-cell sci vs scj.
3284 * Checks bounding box distances and possibly atom pair distances.
3286 static void make_cluster_list(const nbnxn_search_t nbs
,
3287 const nbnxn_grid_t
*gridi
,
3288 const nbnxn_grid_t
*gridj
,
3289 nbnxn_pairlist_t
*nbl
,
3291 gmx_bool sci_equals_scj
,
3292 int stride
,const real
*x
,
3293 real rl2
,float rbb2
,
3298 int cjo
,ci1
,ci
,cj
,cj_gl
;
3299 int cj4_ind
,cj_offset
;
3306 #define PRUNE_LIST_CPU_ONE
3307 #ifdef PRUNE_LIST_CPU_ONE
3311 d2l
= nbl
->work
->d2
;
3313 bb_ci
= nbl
->work
->bb_ci
;
3314 x_ci
= nbl
->work
->x_ci
;
3318 for(cjo
=0; cjo
<gridj
->nsubc
[scj
]; cjo
++)
3320 cj4_ind
= (nbl
->work
->cj_ind
>> 2);
3321 cj_offset
= nbl
->work
->cj_ind
- cj4_ind
*NBNXN_GPU_JGROUP_SIZE
;
3322 cj4
= &nbl
->cj4
[cj4_ind
];
3324 cj
= scj
*GPU_NSUBCELL
+ cjo
;
3326 cj_gl
= gridj
->cell0
*GPU_NSUBCELL
+ cj
;
3328 /* Initialize this j-subcell i-subcell list */
3329 cj4
->cj
[cj_offset
] = cj_gl
;
3338 ci1
= gridi
->nsubc
[sci
];
3342 /* Determine all ci1 bb distances in one call with SSE */
3343 subc_bb_dist2_sse_xxxx(gridj
->bb
+(cj
>>SSE_F_WIDTH_2LOG
)*NNBSBB_XXXX
+(cj
& (SSE_F_WIDTH
-1)),
3349 for(ci
=0; ci
<ci1
; ci
++)
3351 #ifndef NBNXN_BBXXXX
3352 /* Determine the bb distance between ci and cj */
3353 d2l
[ci
] = subc_bb_dist2(ci
,bb_ci
,cj
,gridj
->bb
);
3358 #ifdef PRUNE_LIST_CPU_ALL
3359 /* Check if the distance is within the distance where
3360 * we use only the bounding box distance rbb,
3361 * or within the cut-off and there is at least one atom pair
3362 * within the cut-off. This check is very costly.
3364 *ndistc
+= na_c
*na_c
;
3366 (d2
< rl2
&& nbs
->subc_dc(na_c
,ci
,x_ci
,cj_gl
,stride
,x
,rl2
)))
3368 /* Check if the distance between the two bounding boxes
3369 * in within the pair-list cut-off.
3374 /* Flag this i-subcell to be taken into account */
3375 imask
|= (1U << (cj_offset
*GPU_NSUBCELL
+ci
));
3377 #ifdef PRUNE_LIST_CPU_ONE
3385 #ifdef PRUNE_LIST_CPU_ONE
3386 /* If we only found 1 pair, check if any atoms are actually
3387 * within the cut-off, so we could get rid of it.
3389 if (npair
== 1 && d2l
[ci_last
] >= rbb2
)
3391 if (!nbs
->subc_dc(na_c
,ci_last
,x_ci
,cj_gl
,stride
,x
,rl2
))
3393 imask
&= ~(1U << (cj_offset
*GPU_NSUBCELL
+ci_last
));
3401 /* We have a useful sj entry, close it now */
3403 /* Set the exclucions for the ci== sj entry.
3404 * Here we don't bother to check if this entry is actually flagged,
3405 * as it will nearly always be in the list.
3409 set_self_and_newton_excls_supersub(nbl
,cj4_ind
,cj_offset
,cjo
);
3412 /* Copy the cluster interaction mask to the list */
3413 for(w
=0; w
<NWARP
; w
++)
3415 cj4
->imei
[w
].imask
|= imask
;
3418 nbl
->work
->cj_ind
++;
3420 /* Keep the count */
3421 nbl
->nci_tot
+= npair
;
3423 /* Increase the closing index in i super-cell list */
3424 nbl
->sci
[nbl
->nsci
].cj4_ind_end
= ((nbl
->work
->cj_ind
+4-1)>>2);
3429 /* Set all atom-pair exclusions from the topology stored in excl
3430 * as masks in the pair-list for simple list i-entry nbl_ci
3432 static void set_ci_top_excls(const nbnxn_search_t nbs
,
3433 nbnxn_pairlist_t
*nbl
,
3434 gmx_bool diagRemoved
,
3437 const nbnxn_ci_t
*nbl_ci
,
3438 const t_blocka
*excl
)
3442 int cj_ind_first
,cj_ind_last
;
3443 int cj_first
,cj_last
;
3445 int i
,ai
,aj
,si
,eind
,ge
,se
;
3446 int found
,cj_ind_0
,cj_ind_1
,cj_ind_m
;
3450 nbnxn_excl_t
*nbl_excl
;
3451 int inner_i
,inner_e
;
3455 if (nbl_ci
->cj_ind_end
== nbl_ci
->cj_ind_start
)
3463 cj_ind_first
= nbl_ci
->cj_ind_start
;
3464 cj_ind_last
= nbl
->ncj
- 1;
3466 cj_first
= nbl
->cj
[cj_ind_first
].cj
;
3467 cj_last
= nbl
->cj
[cj_ind_last
].cj
;
3469 /* Determine how many contiguous j-cells we have starting
3470 * from the first i-cell. This number can be used to directly
3471 * calculate j-cell indices for excluded atoms.
3474 if (na_ci_2log
== na_cj_2log
)
3476 while (cj_ind_first
+ ndirect
<= cj_ind_last
&&
3477 nbl
->cj
[cj_ind_first
+ndirect
].cj
== ci
+ ndirect
)
3482 #ifdef NBNXN_SEARCH_SSE
3485 while (cj_ind_first
+ ndirect
<= cj_ind_last
&&
3486 nbl
->cj
[cj_ind_first
+ndirect
].cj
== ci_to_cj(na_cj_2log
,ci
) + ndirect
)
3493 /* Loop over the atoms in the i super-cell */
3494 for(i
=0; i
<nbl
->na_sc
; i
++)
3496 ai
= nbs
->a
[ci
*nbl
->na_sc
+i
];
3499 si
= (i
>>na_ci_2log
);
3501 /* Loop over the topology-based exclusions for this i-atom */
3502 for(eind
=excl
->index
[ai
]; eind
<excl
->index
[ai
+1]; eind
++)
3508 /* The self exclusion are already set, save some time */
3514 /* Without shifts we only calculate interactions j>i
3515 * for one-way pair-lists.
3517 if (diagRemoved
&& ge
<= ci
*nbl
->na_sc
+ i
)
3522 se
= (ge
>> na_cj_2log
);
3524 /* Could the cluster se be in our list? */
3525 if (se
>= cj_first
&& se
<= cj_last
)
3527 if (se
< cj_first
+ ndirect
)
3529 /* We can calculate cj_ind directly from se */
3530 found
= cj_ind_first
+ se
- cj_first
;
3534 /* Search for se using bisection */
3536 cj_ind_0
= cj_ind_first
+ ndirect
;
3537 cj_ind_1
= cj_ind_last
+ 1;
3538 while (found
== -1 && cj_ind_0
< cj_ind_1
)
3540 cj_ind_m
= (cj_ind_0
+ cj_ind_1
)>>1;
3542 cj_m
= nbl
->cj
[cj_ind_m
].cj
;
3550 cj_ind_1
= cj_ind_m
;
3554 cj_ind_0
= cj_ind_m
+ 1;
3561 inner_i
= i
- (si
<< na_ci_2log
);
3562 inner_e
= ge
- (se
<< na_cj_2log
);
3564 nbl
->cj
[found
].excl
&= ~(1U<<((inner_i
<<na_cj_2log
) + inner_e
));
3572 /* Set all atom-pair exclusions from the topology stored in excl
3573 * as masks in the pair-list for i-super-cell entry nbl_sci
3575 static void set_sci_top_excls(const nbnxn_search_t nbs
,
3576 nbnxn_pairlist_t
*nbl
,
3577 gmx_bool diagRemoved
,
3579 const nbnxn_sci_t
*nbl_sci
,
3580 const t_blocka
*excl
)
3585 int cj_ind_first
,cj_ind_last
;
3586 int cj_first
,cj_last
;
3588 int i
,ai
,aj
,si
,eind
,ge
,se
;
3589 int found
,cj_ind_0
,cj_ind_1
,cj_ind_m
;
3593 nbnxn_excl_t
*nbl_excl
;
3594 int inner_i
,inner_e
,w
;
3600 if (nbl_sci
->cj4_ind_end
== nbl_sci
->cj4_ind_start
)
3608 cj_ind_first
= nbl_sci
->cj4_ind_start
*NBNXN_GPU_JGROUP_SIZE
;
3609 cj_ind_last
= nbl
->work
->cj_ind
- 1;
3611 cj_first
= nbl
->cj4
[nbl_sci
->cj4_ind_start
].cj
[0];
3612 cj_last
= nbl_cj(nbl
,cj_ind_last
);
3614 /* Determine how many contiguous j-clusters we have starting
3615 * from the first i-cluster. This number can be used to directly
3616 * calculate j-cluster indices for excluded atoms.
3619 while (cj_ind_first
+ ndirect
<= cj_ind_last
&&
3620 nbl_cj(nbl
,cj_ind_first
+ndirect
) == sci
*GPU_NSUBCELL
+ ndirect
)
3625 /* Loop over the atoms in the i super-cell */
3626 for(i
=0; i
<nbl
->na_sc
; i
++)
3628 ai
= nbs
->a
[sci
*nbl
->na_sc
+i
];
3631 si
= (i
>>na_c_2log
);
3633 /* Loop over the topology-based exclusions for this i-atom */
3634 for(eind
=excl
->index
[ai
]; eind
<excl
->index
[ai
+1]; eind
++)
3640 /* The self exclusion are already set, save some time */
3646 /* Without shifts we only calculate interactions j>i
3647 * for one-way pair-lists.
3649 if (diagRemoved
&& ge
<= sci
*nbl
->na_sc
+ i
)
3655 /* Could the cluster se be in our list? */
3656 if (se
>= cj_first
&& se
<= cj_last
)
3658 if (se
< cj_first
+ ndirect
)
3660 /* We can calculate cj_ind directly from se */
3661 found
= cj_ind_first
+ se
- cj_first
;
3665 /* Search for se using bisection */
3667 cj_ind_0
= cj_ind_first
+ ndirect
;
3668 cj_ind_1
= cj_ind_last
+ 1;
3669 while (found
== -1 && cj_ind_0
< cj_ind_1
)
3671 cj_ind_m
= (cj_ind_0
+ cj_ind_1
)>>1;
3673 cj_m
= nbl_cj(nbl
,cj_ind_m
);
3681 cj_ind_1
= cj_ind_m
;
3685 cj_ind_0
= cj_ind_m
+ 1;
3692 inner_i
= i
- si
*na_c
;
3693 inner_e
= ge
- se
*na_c
;
3695 /* Macro for getting the index of atom a within a cluster */
3696 #define AMODI(a) ((a) & (NBNXN_CPU_CLUSTER_I_SIZE - 1))
3697 /* Macro for converting an atom number to a cluster number */
3698 #define A2CI(a) ((a) >> NBNXN_CPU_CLUSTER_I_SIZE_2LOG)
3700 if (nbl_imask0(nbl
,found
) & (1U << (AMODI(found
)*GPU_NSUBCELL
+ si
)))
3704 get_nbl_exclusions_1(nbl
,A2CI(found
),w
,&nbl_excl
);
3706 nbl_excl
->pair
[AMODI(inner_e
)*nbl
->na_ci
+inner_i
] &=
3707 ~(1U << (AMODI(found
)*GPU_NSUBCELL
+ si
));
3719 /* Reallocate the simple ci list for at least n entries */
3720 static void nb_realloc_ci(nbnxn_pairlist_t
*nbl
,int n
)
3722 nbl
->ci_nalloc
= over_alloc_small(n
);
3723 nb_realloc_void((void **)&nbl
->ci
,
3724 nbl
->nci
*sizeof(*nbl
->ci
),
3725 nbl
->ci_nalloc
*sizeof(*nbl
->ci
),
3726 nbl
->alloc
,nbl
->free
);
3729 /* Reallocate the super-cell sci list for at least n entries */
3730 static void nb_realloc_sci(nbnxn_pairlist_t
*nbl
,int n
)
3732 nbl
->sci_nalloc
= over_alloc_small(n
);
3733 nb_realloc_void((void **)&nbl
->sci
,
3734 nbl
->nsci
*sizeof(*nbl
->sci
),
3735 nbl
->sci_nalloc
*sizeof(*nbl
->sci
),
3736 nbl
->alloc
,nbl
->free
);
3739 /* Make a new ci entry at index nbl->nci */
3740 static void new_ci_entry(nbnxn_pairlist_t
*nbl
,int ci
,int shift
,int flags
,
3741 nbnxn_list_work_t
*work
)
3743 if (nbl
->nci
+ 1 > nbl
->ci_nalloc
)
3745 nb_realloc_ci(nbl
,nbl
->nci
+1);
3747 nbl
->ci
[nbl
->nci
].ci
= ci
;
3748 nbl
->ci
[nbl
->nci
].shift
= shift
;
3749 /* Store the interaction flags along with the shift */
3750 nbl
->ci
[nbl
->nci
].shift
|= flags
;
3751 nbl
->ci
[nbl
->nci
].cj_ind_start
= nbl
->ncj
;
3752 nbl
->ci
[nbl
->nci
].cj_ind_end
= nbl
->ncj
;
3755 /* Make a new sci entry at index nbl->nsci */
3756 static void new_sci_entry(nbnxn_pairlist_t
*nbl
,int sci
,int shift
,int flags
,
3757 nbnxn_list_work_t
*work
)
3759 if (nbl
->nsci
+ 1 > nbl
->sci_nalloc
)
3761 nb_realloc_sci(nbl
,nbl
->nsci
+1);
3763 nbl
->sci
[nbl
->nsci
].sci
= sci
;
3764 nbl
->sci
[nbl
->nsci
].shift
= shift
;
3765 nbl
->sci
[nbl
->nsci
].cj4_ind_start
= nbl
->ncj4
;
3766 nbl
->sci
[nbl
->nsci
].cj4_ind_end
= nbl
->ncj4
;
3769 /* Sort the simple j-list cj on exclusions.
3770 * Entries with exclusions will all be sorted to the beginning of the list.
3772 static void sort_cj_excl(nbnxn_cj_t
*cj
,int ncj
,
3773 nbnxn_list_work_t
*work
)
3777 if (ncj
> work
->cj_nalloc
)
3779 work
->cj_nalloc
= over_alloc_large(ncj
);
3780 srenew(work
->cj
,work
->cj_nalloc
);
3783 /* Make a list of the j-cells involving exclusions */
3785 for(j
=0; j
<ncj
; j
++)
3787 if (cj
[j
].excl
!= NBNXN_INT_MASK_ALL
)
3789 work
->cj
[jnew
++] = cj
[j
];
3792 /* Check if there are exclusions at all or not just the first entry */
3793 if (!((jnew
== 0) ||
3794 (jnew
== 1 && cj
[0].excl
!= NBNXN_INT_MASK_ALL
)))
3796 for(j
=0; j
<ncj
; j
++)
3798 if (cj
[j
].excl
== NBNXN_INT_MASK_ALL
)
3800 work
->cj
[jnew
++] = cj
[j
];
3803 for(j
=0; j
<ncj
; j
++)
3805 cj
[j
] = work
->cj
[j
];
3810 /* Close this simple list i entry */
3811 static void close_ci_entry_simple(nbnxn_pairlist_t
*nbl
)
3815 /* All content of the new ci entry have already been filled correctly,
3816 * we only need to increase the count here (for non empty lists).
3818 jlen
= nbl
->ci
[nbl
->nci
].cj_ind_end
- nbl
->ci
[nbl
->nci
].cj_ind_start
;
3821 sort_cj_excl(nbl
->cj
+nbl
->ci
[nbl
->nci
].cj_ind_start
,jlen
,nbl
->work
);
3823 if (nbl
->ci
[nbl
->nci
].shift
& NBNXN_CI_HALF_LJ(0))
3825 nbl
->work
->ncj_hlj
+= jlen
;
3827 else if (!(nbl
->ci
[nbl
->nci
].shift
& NBNXN_CI_DO_COUL(0)))
3829 nbl
->work
->ncj_noq
+= jlen
;
3836 /* Split sci entry for load balancing on the GPU.
3837 * As we only now the current count on our own thread,
3838 * we will need to estimate the current total amount of i-entries.
3839 * As the lists get concatenated later, this estimate depends
3840 * both on nthread and our own thread index thread.
3842 static void split_sci_entry(nbnxn_pairlist_t
*nbl
,
3843 int nsp_max_av
,gmx_bool progBal
,int nc_bal
,
3844 int thread
,int nthread
)
3848 int cj4_start
,cj4_end
,j4len
,cj4
;
3850 int nsp
,nsp_sci
,nsp_cj4
,nsp_cj4_e
,nsp_cj4_p
;
3853 /* Estimate the total numbers of ci's of the nblist combined
3854 * over all threads using the target number of ci's.
3856 nsci_est
= nc_bal
*thread
/nthread
+ nbl
->nsci
;
3859 /* The first ci blocks should be larger, to avoid overhead.
3860 * The last ci blocks should be smaller, to improve load balancing.
3863 nsp_max_av
*nc_bal
*3/(2*(nsci_est
- 1 + nc_bal
)));
3867 nsp_max
= nsp_max_av
;
3870 cj4_start
= nbl
->sci
[nbl
->nsci
-1].cj4_ind_start
;
3871 cj4_end
= nbl
->sci
[nbl
->nsci
-1].cj4_ind_end
;
3872 j4len
= cj4_end
- cj4_start
;
3874 if (j4len
> 1 && j4len
*GPU_NSUBCELL
*NBNXN_GPU_JGROUP_SIZE
> nsp_max
)
3876 /* Remove the last ci entry and process the cj4's again */
3885 while (cj4
< cj4_end
)
3887 nsp_cj4_p
= nsp_cj4
;
3889 for(p
=0; p
<GPU_NSUBCELL
*NBNXN_GPU_JGROUP_SIZE
; p
++)
3891 nsp_cj4
+= (nbl
->cj4
[cj4
].imei
[0].imask
>> p
) & 1;
3895 if (nsp
> nsp_max
&& nsp
> nsp_cj4
)
3897 nbl
->sci
[sci
].cj4_ind_end
= cj4
;
3900 if (nbl
->nsci
+1 > nbl
->sci_nalloc
)
3902 nb_realloc_sci(nbl
,nbl
->nsci
+1);
3904 nbl
->sci
[sci
].sci
= nbl
->sci
[nbl
->nsci
-1].sci
;
3905 nbl
->sci
[sci
].shift
= nbl
->sci
[nbl
->nsci
-1].shift
;
3906 nbl
->sci
[sci
].cj4_ind_start
= cj4
;
3907 nsp_sci
= nsp
- nsp_cj4
;
3908 nsp_cj4_e
= nsp_cj4_p
;
3915 /* Put the remaining cj4's in a new ci entry */
3916 nbl
->sci
[sci
].cj4_ind_end
= cj4_end
;
3918 /* Possibly balance out the last two ci's
3919 * by moving the last cj4 of the second last ci.
3921 if (nsp_sci
- nsp_cj4_e
>= nsp
+ nsp_cj4_e
)
3923 nbl
->sci
[sci
-1].cj4_ind_end
--;
3924 nbl
->sci
[sci
].cj4_ind_start
--;
3932 /* Clost this super/sub list i entry */
3933 static void close_ci_entry_supersub(nbnxn_pairlist_t
*nbl
,
3935 gmx_bool progBal
,int nc_bal
,
3936 int thread
,int nthread
)
3941 /* All content of the new ci entry have already been filled correctly,
3942 * we only need to increase the count here (for non empty lists).
3944 j4len
= nbl
->sci
[nbl
->nsci
].cj4_ind_end
- nbl
->sci
[nbl
->nsci
].cj4_ind_start
;
3947 /* We can only have complete blocks of 4 j-entries in a list,
3948 * so round the count up before closing.
3950 nbl
->ncj4
= ((nbl
->work
->cj_ind
+ 4-1) >> 2);
3951 nbl
->work
->cj_ind
= nbl
->ncj4
*NBNXN_GPU_JGROUP_SIZE
;
3957 split_sci_entry(nbl
,nsp_max_av
,progBal
,nc_bal
,thread
,nthread
);
3962 /* Syncs the working array before adding another grid pair to the list */
3963 static void sync_work(nbnxn_pairlist_t
*nbl
)
3967 nbl
->work
->cj_ind
= nbl
->ncj4
*NBNXN_GPU_JGROUP_SIZE
;
3968 nbl
->work
->cj4_init
= nbl
->ncj4
;
3972 /* Clears an nbnxn_pairlist_t data structure */
3973 static void clear_pairlist(nbnxn_pairlist_t
*nbl
)
3982 nbl
->work
->ncj_noq
= 0;
3983 nbl
->work
->ncj_hlj
= 0;
3986 /* Sets a simple list i-cell bounding box, including PBC shift */
3987 static void set_icell_bb_simple(const float *bb
,int ci
,
3988 real shx
,real shy
,real shz
,
3994 bb_ci
[BBL_X
] = bb
[ia
+BBL_X
] + shx
;
3995 bb_ci
[BBL_Y
] = bb
[ia
+BBL_Y
] + shy
;
3996 bb_ci
[BBL_Z
] = bb
[ia
+BBL_Z
] + shz
;
3997 bb_ci
[BBU_X
] = bb
[ia
+BBU_X
] + shx
;
3998 bb_ci
[BBU_Y
] = bb
[ia
+BBU_Y
] + shy
;
3999 bb_ci
[BBU_Z
] = bb
[ia
+BBU_Z
] + shz
;
4002 /* Sets a super-cell and sub cell bounding boxes, including PBC shift */
4003 static void set_icell_bb_supersub(const float *bb
,int ci
,
4004 real shx
,real shy
,real shz
,
4010 ia
= ci
*(GPU_NSUBCELL
>>SSE_F_WIDTH_2LOG
)*NNBSBB_XXXX
;
4011 for(m
=0; m
<(GPU_NSUBCELL
>>SSE_F_WIDTH_2LOG
)*NNBSBB_XXXX
; m
+=NNBSBB_XXXX
)
4013 for(i
=0; i
<SSE_F_WIDTH
; i
++)
4015 bb_ci
[m
+ 0+i
] = bb
[ia
+m
+ 0+i
] + shx
;
4016 bb_ci
[m
+ 4+i
] = bb
[ia
+m
+ 4+i
] + shy
;
4017 bb_ci
[m
+ 8+i
] = bb
[ia
+m
+ 8+i
] + shz
;
4018 bb_ci
[m
+12+i
] = bb
[ia
+m
+12+i
] + shx
;
4019 bb_ci
[m
+16+i
] = bb
[ia
+m
+16+i
] + shy
;
4020 bb_ci
[m
+20+i
] = bb
[ia
+m
+20+i
] + shz
;
4024 ia
= ci
*GPU_NSUBCELL
*NNBSBB_B
;
4025 for(i
=0; i
<GPU_NSUBCELL
*NNBSBB_B
; i
+=NNBSBB_B
)
4027 bb_ci
[BBL_X
] = bb
[ia
+BBL_X
] + shx
;
4028 bb_ci
[BBL_Y
] = bb
[ia
+BBL_Y
] + shy
;
4029 bb_ci
[BBL_Z
] = bb
[ia
+BBL_Z
] + shz
;
4030 bb_ci
[BBU_X
] = bb
[ia
+BBU_X
] + shx
;
4031 bb_ci
[BBU_Y
] = bb
[ia
+BBU_Y
] + shy
;
4032 bb_ci
[BBU_Z
] = bb
[ia
+BBU_Z
] + shz
;
4037 /* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
4038 static void icell_set_x_simple(int ci
,
4039 real shx
,real shy
,real shz
,
4041 int stride
,const real
*x
,
4042 nbnxn_list_work_t
*work
)
4046 ia
= ci
*NBNXN_CPU_CLUSTER_I_SIZE
;
4048 for(i
=0; i
<NBNXN_CPU_CLUSTER_I_SIZE
; i
++)
4050 work
->x_ci
[i
*STRIDE_XYZ
+XX
] = x
[(ia
+i
)*stride
+XX
] + shx
;
4051 work
->x_ci
[i
*STRIDE_XYZ
+YY
] = x
[(ia
+i
)*stride
+YY
] + shy
;
4052 work
->x_ci
[i
*STRIDE_XYZ
+ZZ
] = x
[(ia
+i
)*stride
+ZZ
] + shz
;
4056 /* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
4057 static void icell_set_x_supersub(int ci
,
4058 real shx
,real shy
,real shz
,
4060 int stride
,const real
*x
,
4061 nbnxn_list_work_t
*work
)
4068 ia
= ci
*GPU_NSUBCELL
*na_c
;
4069 for(i
=0; i
<GPU_NSUBCELL
*na_c
; i
++)
4071 x_ci
[i
*DIM
+ XX
] = x
[(ia
+i
)*stride
+ XX
] + shx
;
4072 x_ci
[i
*DIM
+ YY
] = x
[(ia
+i
)*stride
+ YY
] + shy
;
4073 x_ci
[i
*DIM
+ ZZ
] = x
[(ia
+i
)*stride
+ ZZ
] + shz
;
4077 #ifdef NBNXN_SEARCH_SSE
4078 /* Copies PBC shifted super-cell packed atom coordinates to working array */
4079 static void icell_set_x_supersub_sse8(int ci
,
4080 real shx
,real shy
,real shz
,
4082 int stride
,const real
*x
,
4083 nbnxn_list_work_t
*work
)
4090 for(si
=0; si
<GPU_NSUBCELL
; si
++)
4092 for(i
=0; i
<na_c
; i
+=SSE_F_WIDTH
)
4095 ia
= ci
*GPU_NSUBCELL
*na_c
+ io
;
4096 for(j
=0; j
<SSE_F_WIDTH
; j
++)
4098 x_ci
[io
*DIM
+ j
+ XX
*SSE_F_WIDTH
] = x
[(ia
+j
)*stride
+XX
] + shx
;
4099 x_ci
[io
*DIM
+ j
+ YY
*SSE_F_WIDTH
] = x
[(ia
+j
)*stride
+YY
] + shy
;
4100 x_ci
[io
*DIM
+ j
+ ZZ
*SSE_F_WIDTH
] = x
[(ia
+j
)*stride
+ZZ
] + shz
;
4107 static real nbnxn_rlist_inc_nonloc_fac
= 0.6;
4109 /* Due to the cluster size the effective pair-list is longer than
4110 * that of a simple atom pair-list. This function gives the extra distance.
4112 real
nbnxn_get_rlist_effective_inc(int cluster_size
,real atom_density
)
4114 return ((0.5 + nbnxn_rlist_inc_nonloc_fac
)*sqr(((cluster_size
) - 1.0)/(cluster_size
))*pow((cluster_size
)/(atom_density
),1.0/3.0));
4117 /* Estimates the interaction volume^2 for non-local interactions */
4118 static real
nonlocal_vol2(const gmx_domdec_zones_t
*zones
,rvec ls
,real r
)
4127 /* Here we simply add up the volumes of 1, 2 or 3 1D decomposition
4128 * not home interaction volume^2. As these volumes are not additive,
4129 * this is an overestimate, but it would only be significant in the limit
4130 * of small cells, where we anyhow need to split the lists into
4131 * as small parts as possible.
4134 for(z
=0; z
<zones
->n
; z
++)
4136 if (zones
->shift
[z
][XX
] + zones
->shift
[z
][YY
] + zones
->shift
[z
][ZZ
] == 1)
4141 for(d
=0; d
<DIM
; d
++)
4143 if (zones
->shift
[z
][d
] == 0)
4147 za
*= zones
->size
[z
].x1
[d
] - zones
->size
[z
].x0
[d
];
4151 /* 4 octants of a sphere */
4152 vold_est
= 0.25*M_PI
*r
*r
*r
*r
;
4153 /* 4 quarter pie slices on the edges */
4154 vold_est
+= 4*cl
*M_PI
/6.0*r
*r
*r
;
4155 /* One rectangular volume on a face */
4156 vold_est
+= ca
*0.5*r
*r
;
4158 vol2_est_tot
+= vold_est
*za
;
4162 return vol2_est_tot
;
4165 /* Estimates the average size of a full j-list for super/sub setup */
4166 static int get_nsubpair_max(const nbnxn_search_t nbs
,
4169 int min_ci_balanced
)
4171 const nbnxn_grid_t
*grid
;
4173 real xy_diag2
,r_eff_sup
,vol_est
,nsp_est
,nsp_est_nl
;
4176 grid
= &nbs
->grid
[0];
4178 ls
[XX
] = (grid
->c1
[XX
] - grid
->c0
[XX
])/(grid
->ncx
*GPU_NSUBCELL_X
);
4179 ls
[YY
] = (grid
->c1
[YY
] - grid
->c0
[YY
])/(grid
->ncy
*GPU_NSUBCELL_Y
);
4180 ls
[ZZ
] = (grid
->c1
[ZZ
] - grid
->c0
[ZZ
])*grid
->ncx
*grid
->ncy
/(grid
->nc
*GPU_NSUBCELL_Z
);
4182 /* The average squared length of the diagonal of a sub cell */
4183 xy_diag2
= ls
[XX
]*ls
[XX
] + ls
[YY
]*ls
[YY
] + ls
[ZZ
]*ls
[ZZ
];
4185 /* The formulas below are a heuristic estimate of the average nsj per si*/
4186 r_eff_sup
= rlist
+ nbnxn_rlist_inc_nonloc_fac
*sqr((grid
->na_c
- 1.0)/grid
->na_c
)*sqrt(xy_diag2
/3);
4188 if (!nbs
->DomDec
|| nbs
->zones
->n
== 1)
4195 sqr(grid
->atom_density
/grid
->na_c
)*
4196 nonlocal_vol2(nbs
->zones
,ls
,r_eff_sup
);
4201 /* Sub-cell interacts with itself */
4202 vol_est
= ls
[XX
]*ls
[YY
]*ls
[ZZ
];
4203 /* 6/2 rectangular volume on the faces */
4204 vol_est
+= (ls
[XX
]*ls
[YY
] + ls
[XX
]*ls
[ZZ
] + ls
[YY
]*ls
[ZZ
])*r_eff_sup
;
4205 /* 12/2 quarter pie slices on the edges */
4206 vol_est
+= 2*(ls
[XX
] + ls
[YY
] + ls
[ZZ
])*0.25*M_PI
*sqr(r_eff_sup
);
4207 /* 4 octants of a sphere */
4208 vol_est
+= 0.5*4.0/3.0*M_PI
*pow(r_eff_sup
,3);
4210 nsp_est
= grid
->nsubc_tot
*vol_est
*grid
->atom_density
/grid
->na_c
;
4212 /* Subtract the non-local pair count */
4213 nsp_est
-= nsp_est_nl
;
4217 fprintf(debug
,"nsp_est local %5.1f non-local %5.1f\n",
4218 nsp_est
,nsp_est_nl
);
4223 nsp_est
= nsp_est_nl
;
4226 if (min_ci_balanced
<= 0 || grid
->nc
>= min_ci_balanced
|| grid
->nc
== 0)
4228 /* We don't need to worry */
4233 /* Thus the (average) maximum j-list size should be as follows */
4234 nsubpair_max
= max(1,(int)(nsp_est
/min_ci_balanced
+0.5));
4236 /* Since the target value is a maximum (this avoid high outliers,
4237 * which lead to load imbalance), not average, we get more lists
4238 * than we ask for (to compensate we need to add GPU_NSUBCELL*4/4).
4239 * But more importantly, the optimal GPU performance moves
4240 * to lower number of block for very small blocks.
4241 * To compensate we add the maximum pair count per cj4.
4243 nsubpair_max
+= GPU_NSUBCELL
*NBNXN_CPU_CLUSTER_I_SIZE
;
4248 fprintf(debug
,"nbl nsp estimate %.1f, nsubpair_max %d\n",
4249 nsp_est
,nsubpair_max
);
4252 return nsubpair_max
;
4255 /* Debug list print function */
4256 static void print_nblist_ci_cj(FILE *fp
,const nbnxn_pairlist_t
*nbl
)
4260 for(i
=0; i
<nbl
->nci
; i
++)
4262 fprintf(fp
,"ci %4d shift %2d ncj %3d\n",
4263 nbl
->ci
[i
].ci
,nbl
->ci
[i
].shift
,
4264 nbl
->ci
[i
].cj_ind_end
- nbl
->ci
[i
].cj_ind_start
);
4266 for(j
=nbl
->ci
[i
].cj_ind_start
; j
<nbl
->ci
[i
].cj_ind_end
; j
++)
4268 fprintf(fp
," cj %5d imask %x\n",
4275 /* Debug list print function */
4276 static void print_nblist_sci_cj(FILE *fp
,const nbnxn_pairlist_t
*nbl
)
4280 for(i
=0; i
<nbl
->nsci
; i
++)
4282 fprintf(fp
,"ci %4d shift %2d ncj4 %2d\n",
4283 nbl
->sci
[i
].sci
,nbl
->sci
[i
].shift
,
4284 nbl
->sci
[i
].cj4_ind_end
- nbl
->sci
[i
].cj4_ind_start
);
4286 for(j4
=nbl
->sci
[i
].cj4_ind_start
; j4
<nbl
->sci
[i
].cj4_ind_end
; j4
++)
4290 fprintf(fp
," sj %5d imask %x\n",
4292 nbl
->cj4
[j4
].imei
[0].imask
);
4298 /* Combine pair lists *nbl generated on multiple threads nblc */
4299 static void combine_nblists(int nnbl
,nbnxn_pairlist_t
**nbl
,
4300 nbnxn_pairlist_t
*nblc
)
4302 int nsci
,ncj4
,nexcl
;
4307 gmx_incons("combine_nblists does not support simple lists");
4312 nexcl
= nblc
->nexcl
;
4313 for(i
=0; i
<nnbl
; i
++)
4315 nsci
+= nbl
[i
]->nsci
;
4316 ncj4
+= nbl
[i
]->ncj4
;
4317 nexcl
+= nbl
[i
]->nexcl
;
4320 if (nsci
> nblc
->sci_nalloc
)
4322 nb_realloc_sci(nblc
,nsci
);
4324 if (ncj4
> nblc
->cj4_nalloc
)
4326 nblc
->cj4_nalloc
= over_alloc_small(ncj4
);
4327 nb_realloc_void((void **)&nblc
->cj4
,
4328 nblc
->ncj4
*sizeof(*nblc
->cj4
),
4329 nblc
->cj4_nalloc
*sizeof(*nblc
->cj4
),
4330 nblc
->alloc
,nblc
->free
);
4332 if (nexcl
> nblc
->excl_nalloc
)
4334 nblc
->excl_nalloc
= over_alloc_small(nexcl
);
4335 nb_realloc_void((void **)&nblc
->excl
,
4336 nblc
->nexcl
*sizeof(*nblc
->excl
),
4337 nblc
->excl_nalloc
*sizeof(*nblc
->excl
),
4338 nblc
->alloc
,nblc
->free
);
4341 /* Each thread should copy its own data to the combined arrays,
4342 * as otherwise data will go back and forth between different caches.
4344 #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
4345 for(n
=0; n
<nnbl
; n
++)
4352 const nbnxn_pairlist_t
*nbli
;
4354 /* Determine the offset in the combined data for our thread */
4355 sci_offset
= nblc
->nsci
;
4356 cj4_offset
= nblc
->ncj4
;
4357 ci_offset
= nblc
->nci_tot
;
4358 excl_offset
= nblc
->nexcl
;
4362 sci_offset
+= nbl
[i
]->nsci
;
4363 cj4_offset
+= nbl
[i
]->ncj4
;
4364 ci_offset
+= nbl
[i
]->nci_tot
;
4365 excl_offset
+= nbl
[i
]->nexcl
;
4370 for(i
=0; i
<nbli
->nsci
; i
++)
4372 nblc
->sci
[sci_offset
+i
] = nbli
->sci
[i
];
4373 nblc
->sci
[sci_offset
+i
].cj4_ind_start
+= cj4_offset
;
4374 nblc
->sci
[sci_offset
+i
].cj4_ind_end
+= cj4_offset
;
4377 for(j4
=0; j4
<nbli
->ncj4
; j4
++)
4379 nblc
->cj4
[cj4_offset
+j4
] = nbli
->cj4
[j4
];
4380 nblc
->cj4
[cj4_offset
+j4
].imei
[0].excl_ind
+= excl_offset
;
4381 nblc
->cj4
[cj4_offset
+j4
].imei
[1].excl_ind
+= excl_offset
;
4384 for(j4
=0; j4
<nbli
->nexcl
; j4
++)
4386 nblc
->excl
[excl_offset
+j4
] = nbli
->excl
[j4
];
4390 for(n
=0; n
<nnbl
; n
++)
4392 nblc
->nsci
+= nbl
[n
]->nsci
;
4393 nblc
->ncj4
+= nbl
[n
]->ncj4
;
4394 nblc
->nci_tot
+= nbl
[n
]->nci_tot
;
4395 nblc
->nexcl
+= nbl
[n
]->nexcl
;
4399 /* Returns the next ci to be processes by our thread */
4400 static gmx_bool
next_ci(const nbnxn_grid_t
*grid
,
4402 int nth
,int ci_block
,
4403 int *ci_x
,int *ci_y
,
4409 if (*ci_b
== ci_block
)
4411 /* Jump to the next block assigned to this task */
4412 *ci
+= (nth
- 1)*ci_block
;
4416 if (*ci
>= grid
->nc
*conv
)
4421 while (*ci
>= grid
->cxy_ind
[*ci_x
*grid
->ncy
+ *ci_y
+ 1]*conv
)
4424 if (*ci_y
== grid
->ncy
)
4434 /* Returns the distance^2 for which we put cell pairs in the list
4435 * without checking atom pair distances. This is usually < rlist^2.
4437 static float boundingbox_only_distance2(const nbnxn_grid_t
*gridi
,
4438 const nbnxn_grid_t
*gridj
,
4442 /* If the distance between two sub-cell bounding boxes is less
4443 * than this distance, do not check the distance between
4444 * all particle pairs in the sub-cell, since then it is likely
4445 * that the box pair has atom pairs within the cut-off.
4446 * We use the nblist cut-off minus 0.5 times the average x/y diagonal
4447 * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
4448 * Using more than 0.5 gains at most 0.5%.
4449 * If forces are calculated more than twice, the performance gain
4450 * in the force calculation outweighs the cost of checking.
4451 * Note that with subcell lists, the atom-pair distance check
4452 * is only performed when only 1 out of 8 sub-cells in within range,
4453 * this is because the GPU is much faster than the cpu.
4458 bbx
= 0.5*(gridi
->sx
+ gridj
->sx
);
4459 bby
= 0.5*(gridi
->sy
+ gridj
->sy
);
4462 bbx
/= GPU_NSUBCELL_X
;
4463 bby
/= GPU_NSUBCELL_Y
;
4466 rbb2
= sqr(max(0,rlist
- 0.5*sqrt(bbx
*bbx
+ bby
*bby
)));
4471 return (float)((1+GMX_FLOAT_EPS
)*rbb2
);
4475 /* Generates the part of pair-list nbl assigned to our thread */
4476 static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs
,
4477 const nbnxn_grid_t
*gridi
,
4478 const nbnxn_grid_t
*gridj
,
4479 nbnxn_search_work_t
*work
,
4480 const nbnxn_atomdata_t
*nbat
,
4481 const t_blocka
*excl
,
4486 int min_ci_balanced
,
4488 nbnxn_pairlist_t
*nbl
)
4495 int ci_block
,ci_b
,ci
,ci_x
,ci_y
,ci_xy
,cj
;
4502 const float *bb_i
,*bbcz_i
,*bbcz_j
;
4504 real bx0
,bx1
,by0
,by1
,bz0
,bz1
;
4506 real d2cx
,d2z
,d2z_cx
,d2z_cy
,d2zx
,d2zxy
,d2xy
;
4507 int cxf
,cxl
,cyf
,cyf_x
,cyl
;
4513 nbs_cycle_start(&work
->cc
[enbsCCsearch
]);
4515 if (gridj
->bSimple
!= nbl
->bSimple
)
4517 gmx_incons("Grid incompatible with pair-list");
4522 nbl
->na_sc
= gridj
->na_sc
;
4523 nbl
->na_ci
= gridj
->na_c
;
4524 nbl
->na_cj
= kernel_to_cj_size(nb_kernel_type
);
4525 na_cj_2log
= get_2log(nbl
->na_cj
);
4529 copy_mat(nbs
->box
,box
);
4531 rl2
= nbl
->rlist
*nbl
->rlist
;
4533 rbb2
= boundingbox_only_distance2(gridi
,gridj
,nbl
->rlist
,nbl
->bSimple
);
4537 fprintf(debug
,"nbl bounding box only distance %f\n",sqrt(rbb2
));
4540 /* Set the shift range */
4541 for(d
=0; d
<DIM
; d
++)
4543 /* Check if we need periodicity shifts.
4544 * Without PBC or with domain decomposition we don't need them.
4546 if (d
>= ePBC2npbcdim(nbs
->ePBC
) || nbs
->dd_dim
[d
])
4553 box
[XX
][XX
] - fabs(box
[YY
][XX
]) - fabs(box
[ZZ
][XX
]) < sqrt(rl2
))
4564 if (nbl
->bSimple
&& !gridi
->bSimple
)
4566 conv_i
= gridi
->na_sc
/gridj
->na_sc
;
4567 bb_i
= gridi
->bb_simple
;
4568 bbcz_i
= gridi
->bbcz_simple
;
4569 flags_i
= gridi
->flags_simple
;
4575 bbcz_i
= gridi
->bbcz
;
4576 flags_i
= gridi
->flags
;
4578 cell0_i
= gridi
->cell0
*conv_i
;
4580 bbcz_j
= gridj
->bbcz
;
4584 #define CI_BLOCK_ENUM 5
4585 #define CI_BLOCK_DENOM 11
4586 /* Here we decide how to distribute the blocks over the threads.
4587 * We use prime numbers to try to avoid that the grid size becomes
4588 * a multiple of the number of threads, which would lead to some
4589 * threads getting "inner" pairs and others getting boundary pairs,
4590 * which in turns will lead to load imbalance between threads.
4591 * Set the block size as 5/11/ntask times the average number of cells
4592 * in a y,z slab. This should ensure a quite uniform distribution
4593 * of the grid parts of the different thread along all three grid
4594 * zone boundaries with 3D domain decomposition. At the same time
4595 * the blocks will not become too small.
4597 ci_block
= (gridi
->nc
*CI_BLOCK_ENUM
)/(CI_BLOCK_DENOM
*gridi
->ncx
*nth
);
4599 /* Ensure the blocks are not too small: avoids cache invalidation */
4600 if (ci_block
*gridi
->na_sc
< 16)
4602 ci_block
= (16 + gridi
->na_sc
- 1)/gridi
->na_sc
;
4605 /* Without domain decomposition
4606 * or with less than 3 blocks per task, divide in nth blocks.
4608 if (!nbs
->DomDec
|| ci_block
*3*nth
> gridi
->nc
)
4610 ci_block
= (gridi
->nc
+ nth
- 1)/nth
;
4615 /* Blocks of the conversion factor - 1 give a large repeat count
4616 * combined with a small block size. This should result in good
4617 * load balancing for both small and large domains.
4619 ci_block
= conv_i
- 1;
4623 fprintf(debug
,"nbl nc_i %d col.av. %.1f ci_block %d\n",
4624 gridi
->nc
,gridi
->nc
/(double)(gridi
->ncx
*gridi
->ncy
),ci_block
);
4631 ci
= th
*ci_block
- 1;
4634 while (next_ci(gridi
,conv_i
,nth
,ci_block
,&ci_x
,&ci_y
,&ci_b
,&ci
))
4636 if (nbl
->bSimple
&& flags_i
[ci
] == 0)
4642 if (gridj
!= gridi
&& shp
[XX
] == 0)
4646 bx1
= bb_i
[ci
*NNBSBB_B
+NNBSBB_C
+XX
];
4650 bx1
= gridi
->c0
[XX
] + (ci_x
+1)*gridi
->sx
;
4652 if (bx1
< gridj
->c0
[XX
])
4654 d2cx
= sqr(gridj
->c0
[XX
] - bx1
);
4663 ci_xy
= ci_x
*gridi
->ncy
+ ci_y
;
4665 /* Loop over shift vectors in three dimensions */
4666 for (tz
=-shp
[ZZ
]; tz
<=shp
[ZZ
]; tz
++)
4668 shz
= tz
*box
[ZZ
][ZZ
];
4670 bz0
= bbcz_i
[ci
*NNBSBB_D
] + shz
;
4671 bz1
= bbcz_i
[ci
*NNBSBB_D
+1] + shz
;
4683 d2z
= sqr(bz0
- box
[ZZ
][ZZ
]);
4686 d2z_cx
= d2z
+ d2cx
;
4694 bz1
/((real
)(gridi
->cxy_ind
[ci_xy
+1] - gridi
->cxy_ind
[ci_xy
]));
4699 /* The check with bz1_frac close to or larger than 1 comes later */
4701 for (ty
=-shp
[YY
]; ty
<=shp
[YY
]; ty
++)
4703 shy
= ty
*box
[YY
][YY
] + tz
*box
[ZZ
][YY
];
4707 by0
= bb_i
[ci
*NNBSBB_B
+YY
] + shy
;
4708 by1
= bb_i
[ci
*NNBSBB_B
+NNBSBB_C
+YY
] + shy
;
4712 by0
= gridi
->c0
[YY
] + (ci_y
)*gridi
->sy
+ shy
;
4713 by1
= gridi
->c0
[YY
] + (ci_y
+1)*gridi
->sy
+ shy
;
4716 get_cell_range(by0
,by1
,
4717 gridj
->ncy
,gridj
->c0
[YY
],gridj
->sy
,gridj
->inv_sy
,
4727 if (by1
< gridj
->c0
[YY
])
4729 d2z_cy
+= sqr(gridj
->c0
[YY
] - by1
);
4731 else if (by0
> gridj
->c1
[YY
])
4733 d2z_cy
+= sqr(by0
- gridj
->c1
[YY
]);
4736 for (tx
=-shp
[XX
]; tx
<=shp
[XX
]; tx
++)
4738 shift
= XYZ2IS(tx
,ty
,tz
);
4740 #ifdef NBNXN_SHIFT_BACKWARD
4741 if (gridi
== gridj
&& shift
> CENTRAL
)
4747 shx
= tx
*box
[XX
][XX
] + ty
*box
[YY
][XX
] + tz
*box
[ZZ
][XX
];
4751 bx0
= bb_i
[ci
*NNBSBB_B
+XX
] + shx
;
4752 bx1
= bb_i
[ci
*NNBSBB_B
+NNBSBB_C
+XX
] + shx
;
4756 bx0
= gridi
->c0
[XX
] + (ci_x
)*gridi
->sx
+ shx
;
4757 bx1
= gridi
->c0
[XX
] + (ci_x
+1)*gridi
->sx
+ shx
;
4760 get_cell_range(bx0
,bx1
,
4761 gridj
->ncx
,gridj
->c0
[XX
],gridj
->sx
,gridj
->inv_sx
,
4772 new_ci_entry(nbl
,cell0_i
+ci
,shift
,flags_i
[ci
],
4777 new_sci_entry(nbl
,cell0_i
+ci
,shift
,flags_i
[ci
],
4781 #ifndef NBNXN_SHIFT_BACKWARD
4784 if (shift
== CENTRAL
&& gridi
== gridj
&&
4788 /* Leave the pairs with i > j.
4789 * x is the major index, so skip half of it.
4796 set_icell_bb_simple(bb_i
,ci
,shx
,shy
,shz
,
4801 set_icell_bb_supersub(bb_i
,ci
,shx
,shy
,shz
,
4805 nbs
->icell_set_x(cell0_i
+ci
,shx
,shy
,shz
,
4806 gridi
->na_c
,nbat
->xstride
,nbat
->x
,
4809 for(cx
=cxf
; cx
<=cxl
; cx
++)
4812 if (gridj
->c0
[XX
] + cx
*gridj
->sx
> bx1
)
4814 d2zx
+= sqr(gridj
->c0
[XX
] + cx
*gridj
->sx
- bx1
);
4816 else if (gridj
->c0
[XX
] + (cx
+1)*gridj
->sx
< bx0
)
4818 d2zx
+= sqr(gridj
->c0
[XX
] + (cx
+1)*gridj
->sx
- bx0
);
4821 #ifndef NBNXN_SHIFT_BACKWARD
4822 if (gridi
== gridj
&&
4823 cx
== 0 && cyf
< ci_y
)
4825 if (gridi
== gridj
&&
4826 cx
== 0 && shift
== CENTRAL
&& cyf
< ci_y
)
4829 /* Leave the pairs with i > j.
4830 * Skip half of y when i and j have the same x.
4839 for(cy
=cyf_x
; cy
<=cyl
; cy
++)
4841 c0
= gridj
->cxy_ind
[cx
*gridj
->ncy
+cy
];
4842 c1
= gridj
->cxy_ind
[cx
*gridj
->ncy
+cy
+1];
4843 #ifdef NBNXN_SHIFT_BACKWARD
4844 if (gridi
== gridj
&&
4845 shift
== CENTRAL
&& c0
< ci
)
4852 if (gridj
->c0
[YY
] + cy
*gridj
->sy
> by1
)
4854 d2zxy
+= sqr(gridj
->c0
[YY
] + cy
*gridj
->sy
- by1
);
4856 else if (gridj
->c0
[YY
] + (cy
+1)*gridj
->sy
< by0
)
4858 d2zxy
+= sqr(gridj
->c0
[YY
] + (cy
+1)*gridj
->sy
- by0
);
4860 if (c1
> c0
&& d2zxy
< rl2
)
4862 cs
= c0
+ (int)(bz1_frac
*(c1
- c0
));
4870 /* Find the lowest cell that can possibly
4875 (bbcz_j
[cf
*NNBSBB_D
+1] >= bz0
||
4876 d2xy
+ sqr(bbcz_j
[cf
*NNBSBB_D
+1] - bz0
) < rl2
))
4881 /* Find the highest cell that can possibly
4886 (bbcz_j
[cl
*NNBSBB_D
] <= bz1
||
4887 d2xy
+ sqr(bbcz_j
[cl
*NNBSBB_D
] - bz1
) < rl2
))
4892 #ifdef NBNXN_REFCODE
4894 /* Simple reference code */
4898 for(k
=c0
; k
<c1
; k
++)
4900 if (box_dist2(bx0
,bx1
,by0
,by1
,bz0
,bz1
,
4901 bb
+k
*NNBSBB_B
) < rl2
&&
4906 if (box_dist2(bx0
,bx1
,by0
,by1
,bz0
,bz1
,
4907 bb
+k
*NNBSBB_B
) < rl2
&&
4918 /* We want each atom/cell pair only once,
4919 * only use cj >= ci.
4921 #ifndef NBNXN_SHIFT_BACKWARD
4924 if (shift
== CENTRAL
)
4933 switch (nb_kernel_type
)
4936 check_subcell_list_space_simple(nbl
,cl
-cf
+1);
4938 make_cluster_list_simple(gridj
,
4940 (gridi
== gridj
&& shift
== CENTRAL
),
4945 #ifdef NBNXN_SEARCH_SSE
4946 case nbk4xN_X86_SIMD128
:
4947 check_subcell_list_space_simple(nbl
,ci_to_cj(na_cj_2log
,cl
-cf
)+2);
4948 make_cluster_list_x86_simd128(gridj
,
4950 (gridi
== gridj
&& shift
== CENTRAL
),
4955 #ifdef GMX_X86_AVX_256
4956 case nbk4xN_X86_SIMD256
:
4957 check_subcell_list_space_simple(nbl
,ci_to_cj(na_cj_2log
,cl
-cf
)+2);
4958 make_cluster_list_x86_simd256(gridj
,
4960 (gridi
== gridj
&& shift
== CENTRAL
),
4967 case nbk8x8x8_PlainC
:
4969 check_subcell_list_space_supersub(nbl
,cl
-cf
+1);
4970 for(cj
=cf
; cj
<=cl
; cj
++)
4972 make_cluster_list(nbs
,gridi
,gridj
,
4974 (gridi
== gridj
&& shift
== CENTRAL
&& ci
== cj
),
4975 nbat
->xstride
,nbat
->x
,
4981 ncpcheck
+= cl
- cf
+ 1;
4987 /* Set the exclusions for this ci list */
4990 set_ci_top_excls(nbs
,
4992 shift
== CENTRAL
&& gridi
== gridj
,
4995 &(nbl
->ci
[nbl
->nci
]),
5000 set_sci_top_excls(nbs
,
5002 shift
== CENTRAL
&& gridi
== gridj
,
5004 &(nbl
->sci
[nbl
->nsci
]),
5008 /* Close this ci list */
5011 close_ci_entry_simple(nbl
);
5015 close_ci_entry_supersub(nbl
,
5017 progBal
,min_ci_balanced
,
5025 work
->ndistc
= ndistc
;
5027 nbs_cycle_stop(&work
->cc
[enbsCCsearch
]);
5031 fprintf(debug
,"number of distance checks %d\n",ndistc
);
5032 fprintf(debug
,"ncpcheck %s %d\n",gridi
==gridj
? "local" : "non-local",
5037 print_nblist_statistics_simple(debug
,nbl
,nbs
,rlist
);
5041 print_nblist_statistics_supersub(debug
,nbl
,nbs
,rlist
);
5047 /* Make a local or non-local pair-list, depending on iloc */
5048 void nbnxn_make_pairlist(const nbnxn_search_t nbs
,
5049 const nbnxn_atomdata_t
*nbat
,
5050 const t_blocka
*excl
,
5052 int min_ci_balanced
,
5053 nbnxn_pairlist_set_t
*nbl_list
,
5058 const nbnxn_grid_t
*gridi
,*gridj
;
5059 int nzi
,zi
,zj0
,zj1
,zj
;
5063 nbnxn_pairlist_t
**nbl
;
5064 gmx_bool CombineNBLists
;
5065 int np_tot
,np_noq
,np_hlj
,nap
;
5067 nnbl
= nbl_list
->nnbl
;
5068 nbl
= nbl_list
->nbl
;
5069 CombineNBLists
= nbl_list
->bCombined
;
5073 fprintf(debug
,"ns making %d nblists\n", nnbl
);
5076 if (nbl_list
->bSimple
)
5078 switch (nb_kernel_type
)
5080 #ifdef NBNXN_SEARCH_SSE
5081 case nbk4xN_X86_SIMD128
:
5082 nbs
->icell_set_x
= icell_set_x_x86_simd128
;
5084 #ifdef GMX_X86_AVX_256
5085 case nbk4xN_X86_SIMD256
:
5086 nbs
->icell_set_x
= icell_set_x_x86_simd256
;
5091 nbs
->icell_set_x
= icell_set_x_simple
;
5097 #ifdef NBNXN_SEARCH_SSE
5098 nbs
->icell_set_x
= icell_set_x_supersub_sse8
;
5100 nbs
->icell_set_x
= icell_set_x_supersub
;
5106 /* Only zone (grid) 0 vs 0 */
5113 nzi
= nbs
->zones
->nizone
;
5116 if (!nbl_list
->bSimple
&& min_ci_balanced
> 0)
5118 nsubpair_max
= get_nsubpair_max(nbs
,iloc
,rlist
,min_ci_balanced
);
5125 /* Clear all pair-lists */
5126 for(th
=0; th
<nnbl
; th
++)
5128 clear_pairlist(nbl
[th
]);
5131 for(zi
=0; zi
<nzi
; zi
++)
5133 gridi
= &nbs
->grid
[zi
];
5135 if (NONLOCAL_I(iloc
))
5137 zj0
= nbs
->zones
->izone
[zi
].j0
;
5138 zj1
= nbs
->zones
->izone
[zi
].j1
;
5144 for(zj
=zj0
; zj
<zj1
; zj
++)
5146 gridj
= &nbs
->grid
[zj
];
5150 fprintf(debug
,"ns search grid %d vs %d\n",zi
,zj
);
5153 nbs_cycle_start(&nbs
->cc
[enbsCCsearch
]);
5155 #pragma omp parallel for num_threads(nnbl) schedule(static)
5156 for(th
=0; th
<nnbl
; th
++)
5158 if (CombineNBLists
&& th
> 0)
5160 clear_pairlist(nbl
[th
]);
5163 /* Divide the i super cell equally over the nblists */
5164 nbnxn_make_pairlist_part(nbs
,gridi
,gridj
,
5165 &nbs
->work
[th
],nbat
,excl
,
5169 (LOCAL_I(iloc
) || nbs
->zones
->n
<= 2),
5174 nbs_cycle_stop(&nbs
->cc
[enbsCCsearch
]);
5179 for(th
=0; th
<nnbl
; th
++)
5181 inc_nrnb(nrnb
,eNR_NBNXN_DIST2
,nbs
->work
[th
].ndistc
);
5183 if (nbl_list
->bSimple
)
5185 np_tot
+= nbl
[th
]->ncj
;
5186 np_noq
+= nbl
[th
]->work
->ncj_noq
;
5187 np_hlj
+= nbl
[th
]->work
->ncj_hlj
;
5191 /* This count ignores potential subsequent pair pruning */
5192 np_tot
+= nbl
[th
]->nci_tot
;
5195 nap
= nbl
[0]->na_ci
*nbl
[0]->na_cj
;
5196 nbl_list
->natpair_ljq
= (np_tot
- np_noq
)*nap
- np_hlj
*nap
/2;
5197 nbl_list
->natpair_lj
= np_noq
*nap
;
5198 nbl_list
->natpair_q
= np_hlj
*nap
/2;
5200 if (CombineNBLists
&& nnbl
> 1)
5202 nbs_cycle_start(&nbs
->cc
[enbsCCcombine
]);
5204 combine_nblists(nnbl
-1,nbl
+1,nbl
[0]);
5206 nbs_cycle_stop(&nbs
->cc
[enbsCCcombine
]);
5213 print_supersub_nsp("nsubpair",nbl[0],iloc);
5216 /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
5219 nbs
->search_count
++;
5221 if (nbs
->print_cycles
&&
5222 (!nbs
->DomDec
|| (nbs
->DomDec
&& !LOCAL_I(iloc
))) &&
5223 nbs
->search_count
% 100 == 0)
5225 nbs_cycle_print(stderr
,nbs
);
5228 if (debug
&& (CombineNBLists
&& nnbl
> 1))
5230 if (nbl
[0]->bSimple
)
5232 print_nblist_statistics_simple(debug
,nbl
[0],nbs
,rlist
);
5236 print_nblist_statistics_supersub(debug
,nbl
[0],nbs
,rlist
);
5242 if (nbl
[0]->bSimple
)
5244 print_nblist_ci_cj(debug
,nbl
[0]);
5248 print_nblist_sci_cj(debug
,nbl
[0]);
5253 /* Initializes an nbnxn_atomdata_output_t data structure */
5254 static void nbnxn_atomdata_output_init(nbnxn_atomdata_output_t
*out
,
5256 int nenergrp
,int stride
,
5257 gmx_nbat_alloc_t
*ma
)
5262 ma((void **)&out
->fshift
,SHIFTS
*DIM
*sizeof(*out
->fshift
));
5263 out
->nV
= nenergrp
*nenergrp
;
5264 ma((void **)&out
->Vvdw
,out
->nV
*sizeof(*out
->Vvdw
));
5265 ma((void **)&out
->Vc
,out
->nV
*sizeof(*out
->Vc
));
5267 if (nb_kernel_type
== nbk4xN_X86_SIMD128
||
5268 nb_kernel_type
== nbk4xN_X86_SIMD256
)
5270 cj_size
= kernel_to_cj_size(nb_kernel_type
);
5271 out
->nVS
= nenergrp
*nenergrp
*stride
*(cj_size
>>1)*cj_size
;
5272 ma((void **)&out
->VSvdw
,out
->nVS
*sizeof(*out
->VSvdw
));
5273 ma((void **)&out
->VSc
,out
->nVS
*sizeof(*out
->VSc
));
5281 /* Determines the combination rule (or none) to be used, stores it,
5282 * and sets the LJ parameters required with the rule.
5284 static void set_combination_rule_data(nbnxn_atomdata_t
*nbat
)
5291 switch (nbat
->comb_rule
)
5294 nbat
->comb_rule
= ljcrGEOM
;
5298 /* Copy the diagonal from the nbfp matrix */
5299 nbat
->nbfp_comb
[i
*2 ] = sqrt(nbat
->nbfp
[(i
*nt
+i
)*2 ]);
5300 nbat
->nbfp_comb
[i
*2+1] = sqrt(nbat
->nbfp
[(i
*nt
+i
)*2+1]);
5306 /* Get 6*C6 and 12*C12 from the diagonal of the nbfp matrix */
5307 c6
= nbat
->nbfp
[(i
*nt
+i
)*2 ];
5308 c12
= nbat
->nbfp
[(i
*nt
+i
)*2+1];
5309 if (c6
> 0 && c12
> 0)
5311 /* We store 0.5*2^1/6*sigma and sqrt(4*3*eps),
5312 * so we get 6*C6 and 12*C12 after combining.
5314 nbat
->nbfp_comb
[i
*2 ] = 0.5*pow(c12
/c6
,1.0/6.0);
5315 nbat
->nbfp_comb
[i
*2+1] = sqrt(c6
*c6
/c12
);
5319 nbat
->nbfp_comb
[i
*2 ] = 0;
5320 nbat
->nbfp_comb
[i
*2+1] = 0;
5325 /* In nbfp_s4 we use a stride of 4 for storing two parameters */
5326 nbat
->alloc((void **)&nbat
->nbfp_s4
,nt
*nt
*4*sizeof(*nbat
->nbfp_s4
));
5331 nbat
->nbfp_s4
[(i
*nt
+j
)*4+0] = nbat
->nbfp
[(i
*nt
+j
)*2+0];
5332 nbat
->nbfp_s4
[(i
*nt
+j
)*4+1] = nbat
->nbfp
[(i
*nt
+j
)*2+1];
5333 nbat
->nbfp_s4
[(i
*nt
+j
)*4+2] = 0;
5334 nbat
->nbfp_s4
[(i
*nt
+j
)*4+3] = 0;
5339 gmx_incons("Unknown combination rule");
5344 /* Initializes an nbnxn_atomdata_t data structure */
5345 void nbnxn_atomdata_init(FILE *fp
,
5346 nbnxn_atomdata_t
*nbat
,
5348 int ntype
,const real
*nbfp
,
5351 gmx_nbat_alloc_t
*alloc
,
5352 gmx_nbat_free_t
*free
)
5357 gmx_bool simple
,bCombGeom
,bCombLB
;
5361 nbat
->alloc
= nbnxn_alloc_aligned
;
5365 nbat
->alloc
= alloc
;
5369 nbat
->free
= nbnxn_free_aligned
;
5378 fprintf(debug
,"There are %d atom types in the system, adding one for nbnxn_atomdata_t\n",ntype
);
5380 nbat
->ntype
= ntype
+ 1;
5381 nbat
->alloc((void **)&nbat
->nbfp
,
5382 nbat
->ntype
*nbat
->ntype
*2*sizeof(*nbat
->nbfp
));
5383 nbat
->alloc((void **)&nbat
->nbfp_comb
,nbat
->ntype
*2*sizeof(*nbat
->nbfp_comb
));
5385 /* A tolerance of 1e-5 seems reasonable for (possibly hand-typed)
5386 * force-field floating point parameters.
5389 ptr
= getenv("GMX_LJCOMB_TOL");
5394 sscanf(ptr
,"%lf",&dbl
);
5400 /* Temporarily fill nbat->nbfp_comb with sigma and epsilon
5401 * to check for the LB rule.
5403 for(i
=0; i
<ntype
; i
++)
5405 c6
= nbfp
[(i
*ntype
+i
)*2 ];
5406 c12
= nbfp
[(i
*ntype
+i
)*2+1];
5407 if (c6
> 0 && c12
> 0)
5409 nbat
->nbfp_comb
[i
*2 ] = pow(c12
/c6
,1.0/6.0);
5410 nbat
->nbfp_comb
[i
*2+1] = 0.25*c6
*c6
/c12
;
5412 else if (c6
== 0 && c12
== 0)
5414 nbat
->nbfp_comb
[i
*2 ] = 0;
5415 nbat
->nbfp_comb
[i
*2+1] = 0;
5419 /* Can not use LB rule with only dispersion or repulsion */
5424 for(i
=0; i
<nbat
->ntype
; i
++)
5426 for(j
=0; j
<nbat
->ntype
; j
++)
5428 if (i
< ntype
&& j
< ntype
)
5430 /* We store the prefactor in the derivative of the potential
5431 * in the parameter to avoid multiplications in the inner loop.
5433 c6
= nbfp
[(i
*ntype
+j
)*2 ];
5434 c12
= nbfp
[(i
*ntype
+j
)*2+1];
5435 nbat
->nbfp
[(i
*nbat
->ntype
+j
)*2 ] = 6.0*c6
;
5436 nbat
->nbfp
[(i
*nbat
->ntype
+j
)*2+1] = 12.0*c12
;
5438 bCombGeom
= bCombGeom
&&
5439 gmx_within_tol(c6
*c6
,nbfp
[(i
*ntype
+i
)*2 ]*nbfp
[(j
*ntype
+j
)*2 ],tol
) &&
5440 gmx_within_tol(c12
*c12
,nbfp
[(i
*ntype
+i
)*2+1]*nbfp
[(j
*ntype
+j
)*2+1],tol
);
5442 bCombLB
= bCombLB
&&
5443 ((c6
== 0 && c12
== 0 &&
5444 (nbat
->nbfp_comb
[i
*2+1] == 0 || nbat
->nbfp_comb
[j
*2+1] == 0)) ||
5445 (c6
> 0 && c12
> 0 &&
5446 gmx_within_tol(pow(c12
/c6
,1.0/6.0),0.5*(nbat
->nbfp_comb
[i
*2]+nbat
->nbfp_comb
[j
*2]),tol
) &&
5447 gmx_within_tol(0.25*c6
*c6
/c12
,sqrt(nbat
->nbfp_comb
[i
*2+1]*nbat
->nbfp_comb
[j
*2+1]),tol
)));
5451 /* Add zero parameters for the additional dummy atom type */
5452 nbat
->nbfp
[(i
*nbat
->ntype
+j
)*2 ] = 0;
5453 nbat
->nbfp
[(i
*nbat
->ntype
+j
)*2+1] = 0;
5459 fprintf(debug
,"Combination rules: geometric %d Lorentz-Berthelot %d\n",
5463 simple
= nbnxn_kernel_pairlist_simple(nb_kernel_type
);
5467 /* We prefer the geometic combination rule,
5468 * as that gives a slightly faster kernel than the LB rule.
5472 nbat
->comb_rule
= ljcrGEOM
;
5476 nbat
->comb_rule
= ljcrLB
;
5480 nbat
->comb_rule
= ljcrNONE
;
5482 nbat
->free(nbat
->nbfp_comb
);
5487 if (nbat
->comb_rule
== ljcrNONE
)
5489 fprintf(fp
,"Using full Lennard-Jones parameter combination matrix\n\n");
5493 fprintf(fp
,"Using %s Lennard-Jones combination rule\n\n",
5494 nbat
->comb_rule
==ljcrGEOM
? "geometric" : "Lorentz-Berthelot");
5498 set_combination_rule_data(nbat
);
5502 nbat
->comb_rule
= ljcrNONE
;
5504 nbat
->free(nbat
->nbfp_comb
);
5509 nbat
->lj_comb
= NULL
;
5512 switch (nb_kernel_type
)
5514 case nbk4xN_X86_SIMD128
:
5515 nbat
->XFormat
= nbatX4
;
5517 case nbk4xN_X86_SIMD256
:
5519 nbat
->XFormat
= nbatX8
;
5521 nbat
->XFormat
= nbatX4
;
5525 nbat
->XFormat
= nbatXYZ
;
5529 nbat
->FFormat
= nbat
->XFormat
;
5533 nbat
->XFormat
= nbatXYZQ
;
5534 nbat
->FFormat
= nbatXYZ
;
5537 nbat
->nenergrp
= n_energygroups
;
5540 /* Energy groups not supported yet for super-sub lists */
5543 /* Temporary storage goes is #grp^3*8 real, so limit to 64 */
5544 if (nbat
->nenergrp
> 64)
5546 gmx_fatal(FARGS
,"With NxN kernels not more than 64 energy groups are supported\n");
5549 while (nbat
->nenergrp
> (1<<nbat
->neg_2log
))
5553 nbat
->energrp
= NULL
;
5554 nbat
->alloc((void **)&nbat
->shift_vec
,SHIFTS
*sizeof(*nbat
->shift_vec
));
5555 nbat
->xstride
= (nbat
->XFormat
== nbatXYZQ
? STRIDE_XYZQ
: DIM
);
5556 nbat
->fstride
= (nbat
->FFormat
== nbatXYZQ
? STRIDE_XYZQ
: DIM
);
5559 snew(nbat
->out
,nbat
->nout
);
5561 for(i
=0; i
<nbat
->nout
; i
++)
5563 nbnxn_atomdata_output_init(&nbat
->out
[i
],
5565 nbat
->nenergrp
,1<<nbat
->neg_2log
,
5570 static void copy_lj_to_nbat_lj_comb_x4(const real
*ljparam_type
,
5571 const int *type
,int na
,
5576 /* The LJ params follow the combination rule:
5577 * copy the params for the type array to the atom array.
5579 for(is
=0; is
<na
; is
+=PACK_X4
)
5581 for(k
=0; k
<PACK_X4
; k
++)
5584 ljparam_at
[is
*2 +k
] = ljparam_type
[type
[i
]*2 ];
5585 ljparam_at
[is
*2+PACK_X4
+k
] = ljparam_type
[type
[i
]*2+1];
5590 static void copy_lj_to_nbat_lj_comb_x8(const real
*ljparam_type
,
5591 const int *type
,int na
,
5596 /* The LJ params follow the combination rule:
5597 * copy the params for the type array to the atom array.
5599 for(is
=0; is
<na
; is
+=PACK_X8
)
5601 for(k
=0; k
<PACK_X8
; k
++)
5604 ljparam_at
[is
*2 +k
] = ljparam_type
[type
[i
]*2 ];
5605 ljparam_at
[is
*2+PACK_X8
+k
] = ljparam_type
[type
[i
]*2+1];
5610 /* Sets the atom type and LJ data in nbnxn_atomdata_t */
5611 static void nbnxn_atomdata_set_atomtypes(nbnxn_atomdata_t
*nbat
,
5613 const nbnxn_search_t nbs
,
5617 const nbnxn_grid_t
*grid
;
5619 for(g
=0; g
<ngrid
; g
++)
5621 grid
= &nbs
->grid
[g
];
5623 /* Loop over all columns and copy and fill */
5624 for(i
=0; i
<grid
->ncx
*grid
->ncy
; i
++)
5626 ncz
= grid
->cxy_ind
[i
+1] - grid
->cxy_ind
[i
];
5627 ash
= (grid
->cell0
+ grid
->cxy_ind
[i
])*grid
->na_sc
;
5629 copy_int_to_nbat_int(nbs
->a
+ash
,grid
->cxy_na
[i
],ncz
*grid
->na_sc
,
5630 type
,nbat
->ntype
-1,nbat
->type
+ash
);
5632 if (nbat
->comb_rule
!= ljcrNONE
)
5634 if (nbat
->XFormat
== nbatX4
)
5636 copy_lj_to_nbat_lj_comb_x4(nbat
->nbfp_comb
,
5637 nbat
->type
+ash
,ncz
*grid
->na_sc
,
5638 nbat
->lj_comb
+ash
*2);
5640 else if (nbat
->XFormat
== nbatX8
)
5642 copy_lj_to_nbat_lj_comb_x8(nbat
->nbfp_comb
,
5643 nbat
->type
+ash
,ncz
*grid
->na_sc
,
5644 nbat
->lj_comb
+ash
*2);
5651 /* Sets the charges in nbnxn_atomdata_t *nbat */
5652 static void nbnxn_atomdata_set_charges(nbnxn_atomdata_t
*nbat
,
5654 const nbnxn_search_t nbs
,
5657 int g
,cxy
,ncz
,ash
,na
,na_round
,i
,j
;
5659 const nbnxn_grid_t
*grid
;
5661 for(g
=0; g
<ngrid
; g
++)
5663 grid
= &nbs
->grid
[g
];
5665 /* Loop over all columns and copy and fill */
5666 for(cxy
=0; cxy
<grid
->ncx
*grid
->ncy
; cxy
++)
5668 ash
= (grid
->cell0
+ grid
->cxy_ind
[cxy
])*grid
->na_sc
;
5669 na
= grid
->cxy_na
[cxy
];
5670 na_round
= (grid
->cxy_ind
[cxy
+1] - grid
->cxy_ind
[cxy
])*grid
->na_sc
;
5672 if (nbat
->XFormat
== nbatXYZQ
)
5674 q
= nbat
->x
+ ash
*STRIDE_XYZQ
+ ZZ
+ 1;
5677 *q
= charge
[nbs
->a
[ash
+i
]];
5680 /* Complete the partially filled last cell with zeros */
5681 for(; i
<na_round
; i
++)
5692 *q
= charge
[nbs
->a
[ash
+i
]];
5695 /* Complete the partially filled last cell with zeros */
5696 for(; i
<na_round
; i
++)
5706 /* Copies the energy group indices to a reordered and packed array */
5707 static void copy_egp_to_nbat_egps(const int *a
,int na
,int na_round
,
5708 int na_c
,int bit_shift
,
5709 const int *in
,int *innb
)
5715 for(i
=0; i
<na
; i
+=na_c
)
5717 /* Store na_c energy groups number into one int */
5719 for(sa
=0; sa
<na_c
; sa
++)
5724 comb
|= (GET_CGINFO_GID(in
[at
]) << (sa
*bit_shift
));
5729 /* Complete the partially filled last cell with fill */
5730 for(; i
<na_round
; i
+=na_c
)
5736 /* Set the energy group indices for atoms in nbnxn_atomdata_t */
5737 static void nbnxn_atomdata_set_energygroups(nbnxn_atomdata_t
*nbat
,
5739 const nbnxn_search_t nbs
,
5743 const nbnxn_grid_t
*grid
;
5745 for(g
=0; g
<ngrid
; g
++)
5747 grid
= &nbs
->grid
[g
];
5749 /* Loop over all columns and copy and fill */
5750 for(i
=0; i
<grid
->ncx
*grid
->ncy
; i
++)
5752 ncz
= grid
->cxy_ind
[i
+1] - grid
->cxy_ind
[i
];
5753 ash
= (grid
->cell0
+ grid
->cxy_ind
[i
])*grid
->na_sc
;
5755 copy_egp_to_nbat_egps(nbs
->a
+ash
,grid
->cxy_na
[i
],ncz
*grid
->na_sc
,
5756 nbat
->na_c
,nbat
->neg_2log
,
5757 atinfo
,nbat
->energrp
+(ash
>>grid
->na_c_2log
));
5762 /* Sets all required atom parameter data in nbnxn_atomdata_t */
5763 void nbnxn_atomdata_set(nbnxn_atomdata_t
*nbat
,
5765 const nbnxn_search_t nbs
,
5766 const t_mdatoms
*mdatoms
,
5771 if (locality
== eatLocal
)
5780 nbnxn_atomdata_set_atomtypes(nbat
,ngrid
,nbs
,mdatoms
->typeA
);
5782 nbnxn_atomdata_set_charges(nbat
,ngrid
,nbs
,mdatoms
->chargeA
);
5784 if (nbat
->nenergrp
> 1)
5786 nbnxn_atomdata_set_energygroups(nbat
,ngrid
,nbs
,atinfo
);
5790 /* Copies the shift vector array to nbnxn_atomdata_t */
5791 void nbnxn_atomdata_copy_shiftvec(gmx_bool bDynamicBox
,
5793 nbnxn_atomdata_t
*nbat
)
5797 nbat
->bDynamicBox
= bDynamicBox
;
5798 for(i
=0; i
<SHIFTS
; i
++)
5800 copy_rvec(shift_vec
[i
],nbat
->shift_vec
[i
]);
5804 /* Copies (and reorders) the coordinates to nbnxn_atomdata_t */
5805 void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search_t nbs
,
5809 nbnxn_atomdata_t
*nbat
)
5832 nbat
->natoms_local
= nbs
->grid
[0].nc
*nbs
->grid
[0].na_sc
;
5835 nth
= gmx_omp_nthreads_get(emntPairsearch
);
5837 #pragma omp parallel for num_threads(nth) schedule(static)
5838 for(th
=0; th
<nth
; th
++)
5842 for(g
=g0
; g
<g1
; g
++)
5844 const nbnxn_grid_t
*grid
;
5847 grid
= &nbs
->grid
[g
];
5849 cxy0
= (grid
->ncx
*grid
->ncy
* th
+nth
-1)/nth
;
5850 cxy1
= (grid
->ncx
*grid
->ncy
*(th
+1)+nth
-1)/nth
;
5852 for(cxy
=cxy0
; cxy
<cxy1
; cxy
++)
5856 na
= grid
->cxy_na
[cxy
];
5857 ash
= (grid
->cell0
+ grid
->cxy_ind
[cxy
])*grid
->na_sc
;
5859 if (g
== 0 && FillLocal
)
5862 (grid
->cxy_ind
[cxy
+1] - grid
->cxy_ind
[cxy
])*grid
->na_sc
;
5866 /* We fill only the real particle locations.
5867 * We assume the filling entries at the end have been
5868 * properly set before during ns.
5872 copy_rvec_to_nbat_real(nbs
->a
+ash
,na
,na_fill
,x
,
5873 nbat
->XFormat
,nbat
->x
,ash
,
5880 /* Add part of the force array(s) from nbnxn_atomdata_t to f */
5882 nbnxn_atomdata_add_nbat_f_to_f_part(const nbnxn_search_t nbs
,
5883 const nbnxn_atomdata_t
*nbat
,
5884 nbnxn_atomdata_output_t
*out
,
5895 /* Loop over all columns and copy and fill */
5896 switch (nbat
->FFormat
)
5904 for(a
=a0
; a
<a1
; a
++)
5906 i
= cell
[a
]*nbat
->fstride
;
5909 f
[a
][YY
] += fnb
[i
+1];
5910 f
[a
][ZZ
] += fnb
[i
+2];
5915 for(a
=a0
; a
<a1
; a
++)
5917 i
= cell
[a
]*nbat
->fstride
;
5919 for(fa
=0; fa
<nfa
; fa
++)
5921 f
[a
][XX
] += out
[fa
].f
[i
];
5922 f
[a
][YY
] += out
[fa
].f
[i
+1];
5923 f
[a
][ZZ
] += out
[fa
].f
[i
+2];
5933 for(a
=a0
; a
<a1
; a
++)
5935 i
= X4_IND_A(cell
[a
]);
5937 f
[a
][XX
] += fnb
[i
+XX
*PACK_X4
];
5938 f
[a
][YY
] += fnb
[i
+YY
*PACK_X4
];
5939 f
[a
][ZZ
] += fnb
[i
+ZZ
*PACK_X4
];
5944 for(a
=a0
; a
<a1
; a
++)
5946 i
= X4_IND_A(cell
[a
]);
5948 for(fa
=0; fa
<nfa
; fa
++)
5950 f
[a
][XX
] += out
[fa
].f
[i
+XX
*PACK_X4
];
5951 f
[a
][YY
] += out
[fa
].f
[i
+YY
*PACK_X4
];
5952 f
[a
][ZZ
] += out
[fa
].f
[i
+ZZ
*PACK_X4
];
5962 for(a
=a0
; a
<a1
; a
++)
5964 i
= X8_IND_A(cell
[a
]);
5966 f
[a
][XX
] += fnb
[i
+XX
*PACK_X8
];
5967 f
[a
][YY
] += fnb
[i
+YY
*PACK_X8
];
5968 f
[a
][ZZ
] += fnb
[i
+ZZ
*PACK_X8
];
5973 for(a
=a0
; a
<a1
; a
++)
5975 i
= X8_IND_A(cell
[a
]);
5977 for(fa
=0; fa
<nfa
; fa
++)
5979 f
[a
][XX
] += out
[fa
].f
[i
+XX
*PACK_X8
];
5980 f
[a
][YY
] += out
[fa
].f
[i
+YY
*PACK_X8
];
5981 f
[a
][ZZ
] += out
[fa
].f
[i
+ZZ
*PACK_X8
];
5989 /* Add the force array(s) from nbnxn_atomdata_t to f */
5990 void nbnxn_atomdata_add_nbat_f_to_f(const nbnxn_search_t nbs
,
5992 const nbnxn_atomdata_t
*nbat
,
5998 nbs_cycle_start(&nbs
->cc
[enbsCCreducef
]);
6004 na
= nbs
->natoms_nonlocal
;
6008 na
= nbs
->natoms_local
;
6011 a0
= nbs
->natoms_local
;
6012 na
= nbs
->natoms_nonlocal
- nbs
->natoms_local
;
6016 nth
= gmx_omp_nthreads_get(emntNonbonded
);
6017 #pragma omp parallel for num_threads(nth) schedule(static)
6018 for(th
=0; th
<nth
; th
++)
6020 nbnxn_atomdata_add_nbat_f_to_f_part(nbs
,nbat
,
6028 nbs_cycle_stop(&nbs
->cc
[enbsCCreducef
]);
6031 /* Adds the shift forces from nbnxn_atomdata_t to fshift */
6032 void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t
*nbat
,
6035 const nbnxn_atomdata_output_t
*out
;
6042 for(s
=0; s
<SHIFTS
; s
++)
6045 for(th
=0; th
<nbat
->nout
; th
++)
6047 sum
[XX
] += out
[th
].fshift
[s
*DIM
+XX
];
6048 sum
[YY
] += out
[th
].fshift
[s
*DIM
+YY
];
6049 sum
[ZZ
] += out
[th
].fshift
[s
*DIM
+ZZ
];
6051 rvec_inc(fshift
[s
],sum
);