1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This source code is part of
8 * GROningen MAchine for Chemical Simulations
10 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
11 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
12 * Copyright (c) 2001-2012, The GROMACS development team,
13 * check out http://www.gromacs.org for more information.
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License
17 * as published by the Free Software Foundation; either version 2
18 * of the License, or (at your option) any later version.
20 * If you want to redistribute modifications, please consider that
21 * scientific software is very special. Version control is crucial -
22 * bugs must be traceable. We will be happy to consider code for
23 * inclusion in the official distribution, but derived work must not
24 * be called official GROMACS. Details are found in the README & COPYING
25 * files - if they are missing, get the official version at www.gromacs.org.
27 * To help us fund GROMACS development, we humbly ask that you cite
28 * the papers on the package - you can find them in the top README file.
30 * For more info, check our website at http://www.gromacs.org
45 #include "nbnxn_search.h"
46 #include "nbnxn_consts.h"
47 #include "gmx_cyclecounter.h"
49 #include "gmx_omp_nthreads.h"
53 #define NBNXN_SEARCH_SSE
56 #define NBNXN_SEARCH_SSE_SINGLE
57 #include "gmx_x86_simd_single.h"
59 #include "gmx_x86_simd_double.h"
62 #if defined NBNXN_SEARCH_SSE_SINGLE && GPU_NSUBCELL == 8
66 /* The width of SSE/AVX128 with single precision for bounding boxes with GPU.
67 * Here AVX-256 turns out to be slightly slower than AVX-128.
70 #define STRIDE_8BB_2LOG 2
72 #endif /* NBNXN_SEARCH_SSE */
74 /* Pair search box lower and upper corner in x,y,z.
75 * Store this in 4 iso 3 reals, which is useful with SSE.
76 * To avoid complicating the code we also use 4 without SSE.
79 #define NNBSBB_B (2*NNBSBB_C)
80 /* Pair search box lower and upper bound in z only. */
82 /* Pair search box lower and upper corner x,y,z indices */
90 /* Strides for x/f with xyz and xyzq coordinate (and charge) storage */
93 /* Size of packs of x, y or z with SSE/AVX packed coords/forces */
96 /* Strides for a pack of 4 and 8 coordinates/forces */
97 #define STRIDE_P4 (DIM*PACK_X4)
98 #define STRIDE_P8 (DIM*PACK_X8)
100 /* Index of atom a into the SSE/AVX coordinate/force array */
101 #define X4_IND_A(a) (STRIDE_P4*((a) >> 2) + ((a) & (PACK_X4 - 1)))
102 #define X8_IND_A(a) (STRIDE_P8*((a) >> 3) + ((a) & (PACK_X8 - 1)))
105 #ifdef NBNXN_SEARCH_SSE
107 /* The functions below are macros as they are performance sensitive */
109 /* 4x4 list, pack=4: no complex conversion required */
110 /* i-cluster to j-cluster conversion */
111 #define CI_TO_CJ_J4(ci) (ci)
112 /* cluster index to coordinate array index conversion */
113 #define X_IND_CI_J4(ci) ((ci)*STRIDE_P4)
114 #define X_IND_CJ_J4(cj) ((cj)*STRIDE_P4)
116 /* 4x2 list, pack=4: j-cluster size is half the packing width */
117 /* i-cluster to j-cluster conversion */
118 #define CI_TO_CJ_J2(ci) ((ci)<<1)
119 /* cluster index to coordinate array index conversion */
120 #define X_IND_CI_J2(ci) ((ci)*STRIDE_P4)
121 #define X_IND_CJ_J2(cj) (((cj)>>1)*STRIDE_P4 + ((cj) & 1)*(PACK_X4>>1))
123 /* 4x8 list, pack=8: i-cluster size is half the packing width */
124 /* i-cluster to j-cluster conversion */
125 #define CI_TO_CJ_J8(ci) ((ci)>>1)
126 /* cluster index to coordinate array index conversion */
127 #define X_IND_CI_J8(ci) (((ci)>>1)*STRIDE_P8 + ((ci) & 1)*(PACK_X8>>1))
128 #define X_IND_CJ_J8(cj) ((cj)*STRIDE_P8)
130 /* The j-cluster size is matched to the SIMD width */
132 /* 128 bits can hold 4 floats */
133 #define CI_TO_CJ_S128(ci) CI_TO_CJ_J4(ci)
134 #define X_IND_CI_S128(ci) X_IND_CI_J4(ci)
135 #define X_IND_CJ_S128(cj) X_IND_CJ_J4(cj)
136 /* 256 bits can hold 8 floats */
137 #define CI_TO_CJ_S256(ci) CI_TO_CJ_J8(ci)
138 #define X_IND_CI_S256(ci) X_IND_CI_J8(ci)
139 #define X_IND_CJ_S256(cj) X_IND_CJ_J8(cj)
141 /* 128 bits can hold 2 doubles */
142 #define CI_TO_CJ_S128(ci) CI_TO_CJ_J2(ci)
143 #define X_IND_CI_S128(ci) X_IND_CI_J2(ci)
144 #define X_IND_CJ_S128(cj) X_IND_CJ_J2(cj)
145 /* 256 bits can hold 4 doubles */
146 #define CI_TO_CJ_S256(ci) CI_TO_CJ_J4(ci)
147 #define X_IND_CI_S256(ci) X_IND_CI_J4(ci)
148 #define X_IND_CJ_S256(cj) X_IND_CJ_J4(cj)
151 #endif /* NBNXN_SEARCH_SSE */
154 /* Interaction masks for 4xN atom interactions.
155 * Bit i*CJ_SIZE + j tells if atom i and j interact.
157 /* All interaction mask is the same for all kernels */
158 #define NBNXN_INT_MASK_ALL 0xffffffff
159 /* 4x4 kernel diagonal mask */
160 #define NBNXN_INT_MASK_DIAG 0x08ce
161 /* 4x2 kernel diagonal masks */
162 #define NBNXN_INT_MASK_DIAG_J2_0 0x0002
163 #define NBNXN_INT_MASK_DIAG_J2_1 0x002F
164 /* 4x8 kernel diagonal masks */
165 #define NBNXN_INT_MASK_DIAG_J8_0 0xf0f8fcfe
166 #define NBNXN_INT_MASK_DIAG_J8_1 0x0080c0e0
169 #ifdef NBNXN_SEARCH_SSE
170 /* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
172 /* Size of bounding box corners quadruplet */
173 #define NNBSBB_XXXX (NNBSBB_D*DIM*STRIDE_8BB)
176 /* We shift the i-particles backward for PBC.
177 * This leads to more conditionals than shifting forward.
178 * We do this to get more balanced pair lists.
180 #define NBNXN_SHIFT_BACKWARD
183 /* This define is a lazy way to avoid interdependence of the grid
184 * and searching data structures.
186 #define NBNXN_NA_SC_MAX (GPU_NSUBCELL*NBNXN_GPU_CLUSTER_SIZE)
188 #ifdef NBNXN_SEARCH_SSE
189 #define GMX_MM128_HERE
190 #include "gmx_x86_simd_macros.h"
191 typedef struct nbnxn_x_ci_x86_simd128
{
192 /* The i-cluster coordinates for simple search */
193 gmx_mm_pr ix_SSE0
,iy_SSE0
,iz_SSE0
;
194 gmx_mm_pr ix_SSE1
,iy_SSE1
,iz_SSE1
;
195 gmx_mm_pr ix_SSE2
,iy_SSE2
,iz_SSE2
;
196 gmx_mm_pr ix_SSE3
,iy_SSE3
,iz_SSE3
;
197 } nbnxn_x_ci_x86_simd128_t
;
198 #undef GMX_MM128_HERE
199 #ifdef GMX_X86_AVX_256
200 #define GMX_MM256_HERE
201 #include "gmx_x86_simd_macros.h"
202 typedef struct nbnxn_x_ci_x86_simd256
{
203 /* The i-cluster coordinates for simple search */
204 gmx_mm_pr ix_SSE0
,iy_SSE0
,iz_SSE0
;
205 gmx_mm_pr ix_SSE1
,iy_SSE1
,iz_SSE1
;
206 gmx_mm_pr ix_SSE2
,iy_SSE2
,iz_SSE2
;
207 gmx_mm_pr ix_SSE3
,iy_SSE3
,iz_SSE3
;
208 } nbnxn_x_ci_x86_simd256_t
;
209 #undef GMX_MM256_HERE
213 /* Working data for the actual i-supercell during pair search */
214 typedef struct nbnxn_list_work
{
215 gmx_cache_protect_t cp0
; /* Protect cache between threads */
217 float *bb_ci
; /* The bounding boxes, pbc shifted, for each cluster */
218 real
*x_ci
; /* The coordinates, pbc shifted, for each atom */
219 #ifdef NBNXN_SEARCH_SSE
220 nbnxn_x_ci_x86_simd128_t
*x_ci_x86_simd128
;
221 #ifdef GMX_X86_AVX_256
222 nbnxn_x_ci_x86_simd256_t
*x_ci_x86_simd256
;
225 int cj_ind
; /* The current cj_ind index for the current list */
226 int cj4_init
; /* The first unitialized cj4 block */
228 float *d2
; /* Bounding box distance work array */
230 nbnxn_cj_t
*cj
; /* The j-cell list */
231 int cj_nalloc
; /* Allocation size of cj */
233 int ncj_noq
; /* Nr. of cluster pairs without Coul for flop count */
234 int ncj_hlj
; /* Nr. of cluster pairs with 1/2 LJ for flop count */
236 gmx_cache_protect_t cp1
; /* Protect cache between threads */
239 /* Function type for setting the i-atom coordinate working data */
241 gmx_icell_set_x_t(int ci
,
242 real shx
,real shy
,real shz
,
244 int stride
,const real
*x
,
245 nbnxn_list_work_t
*work
);
247 static gmx_icell_set_x_t icell_set_x_simple
;
248 #ifdef NBNXN_SEARCH_SSE
249 static gmx_icell_set_x_t icell_set_x_simple_x86_simd128
;
250 #ifdef GMX_X86_AVX_256
251 static gmx_icell_set_x_t icell_set_x_simple_x86_simd256
;
254 static gmx_icell_set_x_t icell_set_x_supersub
;
255 #ifdef NBNXN_SEARCH_SSE
256 static gmx_icell_set_x_t icell_set_x_supersub_sse8
;
259 /* Local cycle count struct for profiling */
266 /* Local cycle count enum for profiling */
267 enum { enbsCCgrid
, enbsCCsearch
, enbsCCcombine
, enbsCCreducef
, enbsCCnr
};
269 /* A pair-search grid struct for one domain decomposition zone */
271 rvec c0
; /* The lower corner of the (local) grid */
272 rvec c1
; /* The upper corner of the (local) grid */
273 real atom_density
; /* The atom number density for the local grid */
275 gmx_bool bSimple
; /* Is this grid simple or super/sub */
276 int na_c
; /* Number of atoms per cluster */
277 int na_cj
; /* Number of atoms for list j-clusters */
278 int na_sc
; /* Number of atoms per super-cluster */
279 int na_c_2log
; /* 2log of na_c */
281 int ncx
; /* Number of (super-)cells along x */
282 int ncy
; /* Number of (super-)cells along y */
283 int nc
; /* Total number of (super-)cells */
285 real sx
; /* x-size of a (super-)cell */
286 real sy
; /* y-size of a (super-)cell */
287 real inv_sx
; /* 1/sx */
288 real inv_sy
; /* 1/sy */
290 int cell0
; /* Index in nbs->cell corresponding to cell 0 */
292 int *cxy_na
; /* The number of atoms for each column in x,y */
293 int *cxy_ind
; /* Grid (super)cell index, offset from cell0 */
294 int cxy_nalloc
; /* Allocation size for cxy_na and cxy_ind */
296 int *nsubc
; /* The number of sub cells for each super cell */
297 float *bbcz
; /* Bounding boxes in z for the super cells */
298 float *bb
; /* 3D bounding boxes for the sub cells */
299 float *bbj
; /* 3D j-b.boxes for SSE-double or AVX-single */
300 int *flags
; /* Flag for the super cells */
301 int nc_nalloc
; /* Allocation size for the pointers above */
303 float *bbcz_simple
; /* bbcz for simple grid converted from super */
304 float *bb_simple
; /* bb for simple grid converted from super */
305 int *flags_simple
; /* flags for simple grid converted from super */
306 int nc_nalloc_simple
; /* Allocation size for the pointers above */
308 int nsubc_tot
; /* Total number of subcell, used for printing */
311 /* Thread-local work struct, contains part of nbnxn_grid_t */
313 gmx_cache_protect_t cp0
;
319 int sort_work_nalloc
;
321 int ndistc
; /* Number of distance checks for flop counting */
323 nbnxn_cycle_t cc
[enbsCCnr
];
325 gmx_cache_protect_t cp1
;
326 } nbnxn_search_work_t
;
328 /* Main pair-search struct, contains the grid(s), not the pair-list(s) */
329 typedef struct nbnxn_search
{
330 int ePBC
; /* PBC type enum */
331 matrix box
; /* The periodic unit-cell */
333 gmx_bool DomDec
; /* Are we doing domain decomposition? */
334 ivec dd_dim
; /* Are we doing DD in x,y,z? */
335 gmx_domdec_zones_t
*zones
; /* The domain decomposition zones */
337 int ngrid
; /* The number of grids, equal to #DD-zones */
338 nbnxn_grid_t
*grid
; /* Array of grids, size ngrid */
339 int *cell
; /* Actual allocated cell array for all grids */
340 int cell_nalloc
; /* Allocation size of cell */
341 int *a
; /* Atom index for grid, the inverse of cell */
342 int a_nalloc
; /* Allocation size of a */
344 int natoms_local
; /* The local atoms run from 0 to natoms_local */
345 int natoms_nonlocal
; /* The non-local atoms run from natoms_local
346 * to natoms_nonlocal */
348 gmx_bool print_cycles
;
350 nbnxn_cycle_t cc
[enbsCCnr
];
352 gmx_icell_set_x_t
*icell_set_x
; /* Function for setting i-coords */
354 int nthread_max
; /* Maximum number of threads for pair-search */
355 nbnxn_search_work_t
*work
; /* Work array, size nthread_max */
359 static void nbs_cycle_clear(nbnxn_cycle_t
*cc
)
363 for(i
=0; i
<enbsCCnr
; i
++)
370 static void nbs_cycle_start(nbnxn_cycle_t
*cc
)
372 cc
->start
= gmx_cycles_read();
375 static void nbs_cycle_stop(nbnxn_cycle_t
*cc
)
377 cc
->c
+= gmx_cycles_read() - cc
->start
;
381 static double Mcyc_av(const nbnxn_cycle_t
*cc
)
383 return (double)cc
->c
*1e-6/cc
->count
;
386 static void nbs_cycle_print(FILE *fp
,const nbnxn_search_t nbs
)
392 fprintf(fp
,"ns %4d grid %4.1f search %4.1f red.f %5.3f",
393 nbs
->cc
[enbsCCgrid
].count
,
394 Mcyc_av(&nbs
->cc
[enbsCCgrid
]),
395 Mcyc_av(&nbs
->cc
[enbsCCsearch
]),
396 Mcyc_av(&nbs
->cc
[enbsCCreducef
]));
398 if (nbs
->nthread_max
> 1)
400 if (nbs
->cc
[enbsCCcombine
].count
> 0)
402 fprintf(fp
," comb %5.2f",
403 Mcyc_av(&nbs
->cc
[enbsCCcombine
]));
405 fprintf(fp
," s. th");
406 for(t
=0; t
<nbs
->nthread_max
; t
++)
409 Mcyc_av(&nbs
->work
[t
].cc
[enbsCCsearch
]));
415 static void nbnxn_grid_init(nbnxn_grid_t
* grid
)
418 grid
->cxy_ind
= NULL
;
419 grid
->cxy_nalloc
= 0;
425 static int get_2log(int n
)
430 while ((1<<log2
) < n
)
436 gmx_fatal(FARGS
,"nbnxn na_c (%d) is not a power of 2",n
);
442 static int kernel_to_ci_size(int nb_kernel_type
)
444 switch (nb_kernel_type
)
447 case nbk4xN_X86_SIMD128
:
448 case nbk4xN_X86_SIMD256
:
449 return NBNXN_CPU_CLUSTER_I_SIZE
;
451 case nbk8x8x8_PlainC
:
452 /* The cluster size for super/sub lists is only set here.
453 * Any value should work for the pair-search and atomdata code.
454 * The kernels, of course, might require a particular value.
456 return NBNXN_GPU_CLUSTER_SIZE
;
458 gmx_incons("unknown kernel type");
464 static int kernel_to_cj_size(int nb_kernel_type
)
466 switch (nb_kernel_type
)
469 return NBNXN_CPU_CLUSTER_I_SIZE
;
470 case nbk4xN_X86_SIMD128
:
471 /* Number of reals that fit in SIMD (128 bits = 16 bytes) */
472 return 16/sizeof(real
);
473 case nbk4xN_X86_SIMD256
:
474 /* Number of reals that fit in SIMD (256 bits = 32 bytes) */
475 return 32/sizeof(real
);
477 case nbk8x8x8_PlainC
:
478 return kernel_to_ci_size(nb_kernel_type
);
480 gmx_incons("unknown kernel type");
486 static int ci_to_cj(int na_cj_2log
,int ci
)
490 case 2: return ci
; break;
491 case 1: return (ci
<<1); break;
492 case 3: return (ci
>>1); break;
498 gmx_bool
nbnxn_kernel_pairlist_simple(int nb_kernel_type
)
500 if (nb_kernel_type
== nbkNotSet
)
502 gmx_fatal(FARGS
, "Non-bonded kernel type not set for Verlet-style pair-list.");
505 switch (nb_kernel_type
)
508 case nbk8x8x8_PlainC
:
512 case nbk4xN_X86_SIMD128
:
513 case nbk4xN_X86_SIMD256
:
517 gmx_incons("Invalid nonbonded kernel type passed!");
522 void nbnxn_init_search(nbnxn_search_t
* nbs_ptr
,
524 gmx_domdec_zones_t
*zones
,
533 nbs
->DomDec
= (n_dd_cells
!= NULL
);
535 clear_ivec(nbs
->dd_dim
);
543 if ((*n_dd_cells
)[d
] > 1)
546 /* Each grid matches a DD zone */
552 snew(nbs
->grid
,nbs
->ngrid
);
553 for(g
=0; g
<nbs
->ngrid
; g
++)
555 nbnxn_grid_init(&nbs
->grid
[g
]);
558 nbs
->cell_nalloc
= 0;
562 nbs
->nthread_max
= nthread_max
;
564 /* Initialize the work data structures for each thread */
565 snew(nbs
->work
,nbs
->nthread_max
);
566 for(t
=0; t
<nbs
->nthread_max
; t
++)
568 nbs
->work
[t
].cxy_na
= NULL
;
569 nbs
->work
[t
].cxy_na_nalloc
= 0;
570 nbs
->work
[t
].sort_work
= NULL
;
571 nbs
->work
[t
].sort_work_nalloc
= 0;
574 /* Initialize detailed nbsearch cycle counting */
575 nbs
->print_cycles
= (getenv("GMX_NBNXN_CYCLE") != 0);
576 nbs
->search_count
= 0;
577 nbs_cycle_clear(nbs
->cc
);
578 for(t
=0; t
<nbs
->nthread_max
; t
++)
580 nbs_cycle_clear(nbs
->work
[t
].cc
);
584 static real
grid_atom_density(int n
,rvec corner0
,rvec corner1
)
588 rvec_sub(corner1
,corner0
,size
);
590 return n
/(size
[XX
]*size
[YY
]*size
[ZZ
]);
593 static int set_grid_size_xy(const nbnxn_search_t nbs
,
595 int n
,rvec corner0
,rvec corner1
,
601 real adens
,tlen
,tlen_x
,tlen_y
,nc_max
;
604 rvec_sub(corner1
,corner0
,size
);
608 /* target cell length */
611 /* To minimize the zero interactions, we should make
612 * the largest of the i/j cell cubic.
614 na_c
= max(grid
->na_c
,grid
->na_cj
);
616 /* Approximately cubic cells */
617 tlen
= pow(na_c
/atom_density
,1.0/3.0);
623 /* Approximately cubic sub cells */
624 tlen
= pow(grid
->na_c
/atom_density
,1.0/3.0);
625 tlen_x
= tlen
*GPU_NSUBCELL_X
;
626 tlen_y
= tlen
*GPU_NSUBCELL_Y
;
628 /* We round ncx and ncy down, because we get less cell pairs
629 * in the nbsist when the fixed cell dimensions (x,y) are
630 * larger than the variable one (z) than the other way around.
632 grid
->ncx
= max(1,(int)(size
[XX
]/tlen_x
));
633 grid
->ncy
= max(1,(int)(size
[YY
]/tlen_y
));
641 /* We need one additional cell entry for particles moved by DD */
642 if (grid
->ncx
*grid
->ncy
+1 > grid
->cxy_nalloc
)
644 grid
->cxy_nalloc
= over_alloc_large(grid
->ncx
*grid
->ncy
+1);
645 srenew(grid
->cxy_na
,grid
->cxy_nalloc
);
646 srenew(grid
->cxy_ind
,grid
->cxy_nalloc
+1);
648 for(t
=0; t
<nbs
->nthread_max
; t
++)
650 if (grid
->ncx
*grid
->ncy
+1 > nbs
->work
[t
].cxy_na_nalloc
)
652 nbs
->work
[t
].cxy_na_nalloc
= over_alloc_large(grid
->ncx
*grid
->ncy
+1);
653 srenew(nbs
->work
[t
].cxy_na
,nbs
->work
[t
].cxy_na_nalloc
);
657 /* Worst case scenario of 1 atom in each last cell */
658 if (grid
->na_cj
<= grid
->na_c
)
660 nc_max
= n
/grid
->na_sc
+ grid
->ncx
*grid
->ncy
;
664 nc_max
= n
/grid
->na_sc
+ grid
->ncx
*grid
->ncy
*grid
->na_cj
/grid
->na_c
;
667 if (nc_max
> grid
->nc_nalloc
)
671 grid
->nc_nalloc
= over_alloc_large(nc_max
);
672 srenew(grid
->nsubc
,grid
->nc_nalloc
);
673 srenew(grid
->bbcz
,grid
->nc_nalloc
*NNBSBB_D
);
675 bb_nalloc
= grid
->nc_nalloc
*GPU_NSUBCELL
/STRIDE_8BB
*NNBSBB_XXXX
;
677 bb_nalloc
= grid
->nc_nalloc
*GPU_NSUBCELL
*NNBSBB_B
;
679 sfree_aligned(grid
->bb
);
680 /* This snew also zeros the contents, this avoid possible
681 * floating exceptions in SSE with the unused bb elements.
683 snew_aligned(grid
->bb
,bb_nalloc
,16);
687 if (grid
->na_cj
== grid
->na_c
)
689 grid
->bbj
= grid
->bb
;
693 sfree_aligned(grid
->bbj
);
694 snew_aligned(grid
->bbj
,bb_nalloc
*grid
->na_c
/grid
->na_cj
,16);
698 srenew(grid
->flags
,grid
->nc_nalloc
);
701 copy_rvec(corner0
,grid
->c0
);
702 copy_rvec(corner1
,grid
->c1
);
703 grid
->sx
= size
[XX
]/grid
->ncx
;
704 grid
->sy
= size
[YY
]/grid
->ncy
;
705 grid
->inv_sx
= 1/grid
->sx
;
706 grid
->inv_sy
= 1/grid
->sy
;
711 #define SORT_GRID_OVERSIZE 2
712 #define SGSF (SORT_GRID_OVERSIZE + 1)
714 static void sort_atoms(int dim
,gmx_bool Backwards
,
715 int *a
,int n
,rvec
*x
,
716 real h0
,real invh
,int nsort
,int *sort
)
728 /* For small oversize factors clearing the whole area is fastest.
729 * For large oversize we should clear the used elements after use.
731 for(i
=0; i
<nsort
; i
++)
735 /* Sort the particles using a simple index sort */
738 /* The cast takes care of float-point rounding effects below zero.
739 * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
740 * times the box height out of the box.
742 zi
= (int)((x
[a
[i
]][dim
] - h0
)*invh
);
744 #ifdef DEBUG_NBNXN_GRIDDING
745 if (zi
< 0 || zi
>= nsort
)
747 gmx_fatal(FARGS
,"(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d\n",
748 a
[i
],'x'+dim
,x
[a
[i
]][dim
],h0
,invh
,zi
,nsort
);
752 /* Ideally this particle should go in sort cell zi,
753 * but that might already be in use,
754 * in that case find the first empty cell higher up
762 /* We have multiple atoms in the same sorting slot.
763 * Sort on real z for minimal bounding box size.
764 * There is an extra check for identical z to ensure
765 * well-defined output order, independent of input order
766 * to ensure binary reproducibility after restarts.
768 while(sort
[zi
] >= 0 && ( x
[a
[i
]][dim
] > x
[sort
[zi
]][dim
] ||
769 (x
[a
[i
]][dim
] == x
[sort
[zi
]][dim
] &&
777 /* Shift all elements by one slot until we find an empty slot */
780 while (sort
[zim
] >= 0)
796 for(zi
=0; zi
<nsort
; zi
++)
806 for(zi
=nsort
-1; zi
>=0; zi
--)
816 gmx_incons("Lost particles while sorting");
821 #define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
822 #define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
828 /* Coordinate order x,y,z, bb order xyz0 */
829 static void calc_bounding_box(int na
,int stride
,const real
*x
,float *bb
)
832 real xl
,xh
,yl
,yh
,zl
,zh
;
844 xl
= min(xl
,x
[i
+XX
]);
845 xh
= max(xh
,x
[i
+XX
]);
846 yl
= min(yl
,x
[i
+YY
]);
847 yh
= max(yh
,x
[i
+YY
]);
848 zl
= min(zl
,x
[i
+ZZ
]);
849 zh
= max(zh
,x
[i
+ZZ
]);
852 /* Note: possible double to float conversion here */
853 bb
[BBL_X
] = R2F_D(xl
);
854 bb
[BBL_Y
] = R2F_D(yl
);
855 bb
[BBL_Z
] = R2F_D(zl
);
856 bb
[BBU_X
] = R2F_U(xh
);
857 bb
[BBU_Y
] = R2F_U(yh
);
858 bb
[BBU_Z
] = R2F_U(zh
);
861 /* Packed coordinates, bb order xyz0 */
862 static void calc_bounding_box_x_x4(int na
,const real
*x
,float *bb
)
865 real xl
,xh
,yl
,yh
,zl
,zh
;
875 xl
= min(xl
,x
[j
+XX
*PACK_X4
]);
876 xh
= max(xh
,x
[j
+XX
*PACK_X4
]);
877 yl
= min(yl
,x
[j
+YY
*PACK_X4
]);
878 yh
= max(yh
,x
[j
+YY
*PACK_X4
]);
879 zl
= min(zl
,x
[j
+ZZ
*PACK_X4
]);
880 zh
= max(zh
,x
[j
+ZZ
*PACK_X4
]);
882 /* Note: possible double to float conversion here */
883 bb
[BBL_X
] = R2F_D(xl
);
884 bb
[BBL_Y
] = R2F_D(yl
);
885 bb
[BBL_Z
] = R2F_D(zl
);
886 bb
[BBU_X
] = R2F_U(xh
);
887 bb
[BBU_Y
] = R2F_U(yh
);
888 bb
[BBU_Z
] = R2F_U(zh
);
891 /* Packed coordinates, bb order xyz0 */
892 static void calc_bounding_box_x_x8(int na
,const real
*x
,float *bb
)
895 real xl
,xh
,yl
,yh
,zl
,zh
;
905 xl
= min(xl
,x
[j
+XX
*PACK_X8
]);
906 xh
= max(xh
,x
[j
+XX
*PACK_X8
]);
907 yl
= min(yl
,x
[j
+YY
*PACK_X8
]);
908 yh
= max(yh
,x
[j
+YY
*PACK_X8
]);
909 zl
= min(zl
,x
[j
+ZZ
*PACK_X8
]);
910 zh
= max(zh
,x
[j
+ZZ
*PACK_X8
]);
912 /* Note: possible double to float conversion here */
913 bb
[BBL_X
] = R2F_D(xl
);
914 bb
[BBL_Y
] = R2F_D(yl
);
915 bb
[BBL_Z
] = R2F_D(zl
);
916 bb
[BBU_X
] = R2F_U(xh
);
917 bb
[BBU_Y
] = R2F_U(yh
);
918 bb
[BBU_Z
] = R2F_U(zh
);
921 #ifdef NBNXN_SEARCH_SSE
923 /* Packed coordinates, bb order xyz0 */
924 static void calc_bounding_box_x_x4_halves(int na
,const real
*x
,
925 float *bb
,float *bbj
)
927 calc_bounding_box_x_x4(min(na
,2),x
,bbj
);
931 calc_bounding_box_x_x4(min(na
-2,2),x
+(PACK_X4
>>1),bbj
+NNBSBB_B
);
935 /* Set the "empty" bounding box to the same as the first one,
936 * so we don't need to treat special cases in the rest of the code.
938 _mm_store_ps(bbj
+NNBSBB_B
,_mm_load_ps(bbj
));
939 _mm_store_ps(bbj
+NNBSBB_B
+NNBSBB_C
,_mm_load_ps(bbj
+NNBSBB_C
));
942 _mm_store_ps(bb
,_mm_min_ps(_mm_load_ps(bbj
),
943 _mm_load_ps(bbj
+NNBSBB_B
)));
944 _mm_store_ps(bb
+NNBSBB_C
,_mm_max_ps(_mm_load_ps(bbj
+NNBSBB_C
),
945 _mm_load_ps(bbj
+NNBSBB_B
+NNBSBB_C
)));
948 /* Coordinate order xyz, bb order xxxxyyyyzzzz */
949 static void calc_bounding_box_xxxx(int na
,int stride
,const real
*x
,float *bb
)
952 real xl
,xh
,yl
,yh
,zl
,zh
;
964 xl
= min(xl
,x
[i
+XX
]);
965 xh
= max(xh
,x
[i
+XX
]);
966 yl
= min(yl
,x
[i
+YY
]);
967 yh
= max(yh
,x
[i
+YY
]);
968 zl
= min(zl
,x
[i
+ZZ
]);
969 zh
= max(zh
,x
[i
+ZZ
]);
972 /* Note: possible double to float conversion here */
973 bb
[0*STRIDE_8BB
] = R2F_D(xl
);
974 bb
[1*STRIDE_8BB
] = R2F_D(yl
);
975 bb
[2*STRIDE_8BB
] = R2F_D(zl
);
976 bb
[3*STRIDE_8BB
] = R2F_U(xh
);
977 bb
[4*STRIDE_8BB
] = R2F_U(yh
);
978 bb
[5*STRIDE_8BB
] = R2F_U(zh
);
981 #endif /* NBNXN_SEARCH_SSE */
983 #ifdef NBNXN_SEARCH_SSE_SINGLE
985 /* Coordinate order xyz?, bb order xyz0 */
986 static void calc_bounding_box_sse(int na
,const float *x
,float *bb
)
988 __m128 bb_0_SSE
,bb_1_SSE
;
993 bb_0_SSE
= _mm_load_ps(x
);
998 x_SSE
= _mm_load_ps(x
+i
*NNBSBB_C
);
999 bb_0_SSE
= _mm_min_ps(bb_0_SSE
,x_SSE
);
1000 bb_1_SSE
= _mm_max_ps(bb_1_SSE
,x_SSE
);
1003 _mm_store_ps(bb
,bb_0_SSE
);
1004 _mm_store_ps(bb
+4,bb_1_SSE
);
1007 /* Coordinate order xyz?, bb order xxxxyyyyzzzz */
1008 static void calc_bounding_box_xxxx_sse(int na
,const float *x
,
1012 calc_bounding_box_sse(na
,x
,bb_work
);
1014 bb
[0*STRIDE_8BB
] = bb_work
[BBL_X
];
1015 bb
[1*STRIDE_8BB
] = bb_work
[BBL_Y
];
1016 bb
[2*STRIDE_8BB
] = bb_work
[BBL_Z
];
1017 bb
[3*STRIDE_8BB
] = bb_work
[BBU_X
];
1018 bb
[4*STRIDE_8BB
] = bb_work
[BBU_Y
];
1019 bb
[5*STRIDE_8BB
] = bb_work
[BBU_Z
];
1022 #endif /* NBNXN_SEARCH_SSE_SINGLE */
1024 #ifdef NBNXN_SEARCH_SSE
1026 /* Combines pairs of consecutive bounding boxes */
1027 static void combine_bounding_box_pairs(nbnxn_grid_t
*grid
,const float *bb
)
1030 __m128 min_SSE
,max_SSE
;
1032 for(i
=0; i
<grid
->ncx
*grid
->ncy
; i
++)
1034 /* Starting bb in a column is expected to be 2-aligned */
1035 sc2
= grid
->cxy_ind
[i
]>>1;
1036 /* For odd numbers skip the last bb here */
1037 nc2
= (grid
->cxy_na
[i
]+3)>>(2+1);
1038 for(c2
=sc2
; c2
<sc2
+nc2
; c2
++)
1040 min_SSE
= _mm_min_ps(_mm_load_ps(bb
+(c2
*4+0)*NNBSBB_C
),
1041 _mm_load_ps(bb
+(c2
*4+2)*NNBSBB_C
));
1042 max_SSE
= _mm_max_ps(_mm_load_ps(bb
+(c2
*4+1)*NNBSBB_C
),
1043 _mm_load_ps(bb
+(c2
*4+3)*NNBSBB_C
));
1044 _mm_store_ps(grid
->bbj
+(c2
*2+0)*NNBSBB_C
,min_SSE
);
1045 _mm_store_ps(grid
->bbj
+(c2
*2+1)*NNBSBB_C
,max_SSE
);
1047 if (((grid
->cxy_na
[i
]+3)>>2) & 1)
1049 /* Copy the last bb for odd bb count in this column */
1050 for(j
=0; j
<NNBSBB_C
; j
++)
1052 grid
->bbj
[(c2
*2+0)*NNBSBB_C
+j
] = bb
[(c2
*4+0)*NNBSBB_C
+j
];
1053 grid
->bbj
[(c2
*2+1)*NNBSBB_C
+j
] = bb
[(c2
*4+1)*NNBSBB_C
+j
];
1062 /* Prints the average bb size, used for debug output */
1063 static void print_bbsizes_simple(FILE *fp
,
1064 const nbnxn_search_t nbs
,
1065 const nbnxn_grid_t
*grid
)
1071 for(c
=0; c
<grid
->nc
; c
++)
1073 for(d
=0; d
<DIM
; d
++)
1075 ba
[d
] += grid
->bb
[c
*NNBSBB_B
+NNBSBB_C
+d
] - grid
->bb
[c
*NNBSBB_B
+d
];
1078 dsvmul(1.0/grid
->nc
,ba
,ba
);
1080 fprintf(fp
,"ns bb: %4.2f %4.2f %4.2f %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
1081 nbs
->box
[XX
][XX
]/grid
->ncx
,
1082 nbs
->box
[YY
][YY
]/grid
->ncy
,
1083 nbs
->box
[ZZ
][ZZ
]*grid
->ncx
*grid
->ncy
/grid
->nc
,
1084 ba
[XX
],ba
[YY
],ba
[ZZ
],
1085 ba
[XX
]*grid
->ncx
/nbs
->box
[XX
][XX
],
1086 ba
[YY
]*grid
->ncy
/nbs
->box
[YY
][YY
],
1087 ba
[ZZ
]*grid
->nc
/(grid
->ncx
*grid
->ncy
*nbs
->box
[ZZ
][ZZ
]));
1090 /* Prints the average bb size, used for debug output */
1091 static void print_bbsizes_supersub(FILE *fp
,
1092 const nbnxn_search_t nbs
,
1093 const nbnxn_grid_t
*grid
)
1100 for(c
=0; c
<grid
->nc
; c
++)
1103 for(s
=0; s
<grid
->nsubc
[c
]; s
+=STRIDE_8BB
)
1107 cs_w
= (c
*GPU_NSUBCELL
+ s
)/STRIDE_8BB
;
1108 for(i
=0; i
<STRIDE_8BB
; i
++)
1110 for(d
=0; d
<DIM
; d
++)
1113 grid
->bb
[cs_w
*NNBSBB_XXXX
+(DIM
+d
)*STRIDE_8BB
+i
] -
1114 grid
->bb
[cs_w
*NNBSBB_XXXX
+ d
*STRIDE_8BB
+i
];
1119 for(s
=0; s
<grid
->nsubc
[c
]; s
++)
1123 cs
= c
*GPU_NSUBCELL
+ s
;
1124 for(d
=0; d
<DIM
; d
++)
1127 grid
->bb
[cs
*NNBSBB_B
+NNBSBB_C
+d
] -
1128 grid
->bb
[cs
*NNBSBB_B
+d
];
1132 ns
+= grid
->nsubc
[c
];
1134 dsvmul(1.0/ns
,ba
,ba
);
1136 fprintf(fp
,"ns bb: %4.2f %4.2f %4.2f %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
1137 nbs
->box
[XX
][XX
]/(grid
->ncx
*GPU_NSUBCELL_X
),
1138 nbs
->box
[YY
][YY
]/(grid
->ncy
*GPU_NSUBCELL_Y
),
1139 nbs
->box
[ZZ
][ZZ
]*grid
->ncx
*grid
->ncy
/(grid
->nc
*GPU_NSUBCELL_Z
),
1140 ba
[XX
],ba
[YY
],ba
[ZZ
],
1141 ba
[XX
]*grid
->ncx
*GPU_NSUBCELL_X
/nbs
->box
[XX
][XX
],
1142 ba
[YY
]*grid
->ncy
*GPU_NSUBCELL_Y
/nbs
->box
[YY
][YY
],
1143 ba
[ZZ
]*grid
->nc
*GPU_NSUBCELL_Z
/(grid
->ncx
*grid
->ncy
*nbs
->box
[ZZ
][ZZ
]));
1146 static void copy_int_to_nbat_int(const int *a
,int na
,int na_round
,
1147 const int *in
,int fill
,int *innb
)
1154 innb
[j
++] = in
[a
[i
]];
1156 /* Complete the partially filled last cell with fill */
1157 for(; i
<na_round
; i
++)
1163 static void clear_nbat_real(int na
,int nbatFormat
,real
*xnb
,int a0
)
1172 for(d
=0; d
<DIM
; d
++)
1174 xnb
[(a0
+a
)*STRIDE_XYZ
+d
] = 0;
1181 for(d
=0; d
<DIM
; d
++)
1183 xnb
[(a0
+a
)*STRIDE_XYZQ
+d
] = 0;
1189 c
= a0
& (PACK_X4
-1);
1192 xnb
[j
+XX
*PACK_X4
] = 0;
1193 xnb
[j
+YY
*PACK_X4
] = 0;
1194 xnb
[j
+ZZ
*PACK_X4
] = 0;
1199 j
+= (DIM
-1)*PACK_X4
;
1206 c
= a0
& (PACK_X8
-1);
1209 xnb
[j
+XX
*PACK_X8
] = 0;
1210 xnb
[j
+YY
*PACK_X8
] = 0;
1211 xnb
[j
+ZZ
*PACK_X8
] = 0;
1216 j
+= (DIM
-1)*PACK_X8
;
1224 static void copy_rvec_to_nbat_real(const int *a
,int na
,int na_round
,
1225 rvec
*x
,int nbatFormat
,real
*xnb
,int a0
,
1226 int cx
,int cy
,int cz
)
1230 /* We might need to place filler particles to fill up the cell to na_round.
1231 * The coefficients (LJ and q) for such particles are zero.
1232 * But we might still get NaN as 0*NaN when distances are too small.
1233 * We hope that -107 nm is far away enough from to zero
1234 * to avoid accidental short distances to particles shifted down for pbc.
1236 #define NBAT_FAR_AWAY 107
1244 xnb
[j
++] = x
[a
[i
]][XX
];
1245 xnb
[j
++] = x
[a
[i
]][YY
];
1246 xnb
[j
++] = x
[a
[i
]][ZZ
];
1248 /* Complete the partially filled last cell with copies of the last element.
1249 * This simplifies the bounding box calculation and avoid
1250 * numerical issues with atoms that are coincidentally close.
1252 for(; i
<na_round
; i
++)
1254 xnb
[j
++] = -NBAT_FAR_AWAY
*(1 + cx
);
1255 xnb
[j
++] = -NBAT_FAR_AWAY
*(1 + cy
);
1256 xnb
[j
++] = -NBAT_FAR_AWAY
*(1 + cz
+ i
);
1263 xnb
[j
++] = x
[a
[i
]][XX
];
1264 xnb
[j
++] = x
[a
[i
]][YY
];
1265 xnb
[j
++] = x
[a
[i
]][ZZ
];
1268 /* Complete the partially filled last cell with particles far apart */
1269 for(; i
<na_round
; i
++)
1271 xnb
[j
++] = -NBAT_FAR_AWAY
*(1 + cx
);
1272 xnb
[j
++] = -NBAT_FAR_AWAY
*(1 + cy
);
1273 xnb
[j
++] = -NBAT_FAR_AWAY
*(1 + cz
+ i
);
1279 c
= a0
& (PACK_X4
-1);
1282 xnb
[j
+XX
*PACK_X4
] = x
[a
[i
]][XX
];
1283 xnb
[j
+YY
*PACK_X4
] = x
[a
[i
]][YY
];
1284 xnb
[j
+ZZ
*PACK_X4
] = x
[a
[i
]][ZZ
];
1289 j
+= (DIM
-1)*PACK_X4
;
1293 /* Complete the partially filled last cell with particles far apart */
1294 for(; i
<na_round
; i
++)
1296 xnb
[j
+XX
*PACK_X4
] = -NBAT_FAR_AWAY
*(1 + cx
);
1297 xnb
[j
+YY
*PACK_X4
] = -NBAT_FAR_AWAY
*(1 + cy
);
1298 xnb
[j
+ZZ
*PACK_X4
] = -NBAT_FAR_AWAY
*(1 + cz
+ i
);
1303 j
+= (DIM
-1)*PACK_X4
;
1310 c
= a0
& (PACK_X8
- 1);
1313 xnb
[j
+XX
*PACK_X8
] = x
[a
[i
]][XX
];
1314 xnb
[j
+YY
*PACK_X8
] = x
[a
[i
]][YY
];
1315 xnb
[j
+ZZ
*PACK_X8
] = x
[a
[i
]][ZZ
];
1320 j
+= (DIM
-1)*PACK_X8
;
1324 /* Complete the partially filled last cell with particles far apart */
1325 for(; i
<na_round
; i
++)
1327 xnb
[j
+XX
*PACK_X8
] = -NBAT_FAR_AWAY
*(1 + cx
);
1328 xnb
[j
+YY
*PACK_X8
] = -NBAT_FAR_AWAY
*(1 + cy
);
1329 xnb
[j
+ZZ
*PACK_X8
] = -NBAT_FAR_AWAY
*(1 + cz
+ i
);
1334 j
+= (DIM
-1)*PACK_X8
;
1340 gmx_incons("Unsupported stride");
1344 /* Potentially sorts atoms on LJ coefficients !=0 and ==0.
1345 * Also sets interaction flags.
1347 void sort_on_lj(nbnxn_atomdata_t
*nbat
,int na_c
,
1348 int a0
,int a1
,const int *atinfo
,
1352 int subc
,s
,a
,n1
,n2
,a_lj_max
,i
,j
;
1353 int sort1
[NBNXN_NA_SC_MAX
/GPU_NSUBCELL
];
1354 int sort2
[NBNXN_NA_SC_MAX
/GPU_NSUBCELL
];
1360 for(s
=a0
; s
<a1
; s
+=na_c
)
1362 /* Make lists for this (sub-)cell on atoms with and without LJ */
1367 for(a
=s
; a
<min(s
+na_c
,a1
); a
++)
1369 haveQ
= haveQ
|| GET_CGINFO_HAS_Q(atinfo
[order
[a
]]);
1371 if (GET_CGINFO_HAS_VDW(atinfo
[order
[a
]]))
1373 sort1
[n1
++] = order
[a
];
1378 sort2
[n2
++] = order
[a
];
1382 /* If we don't have atom with LJ, there's nothing to sort */
1385 *flags
|= NBNXN_CI_DO_LJ(subc
);
1389 /* Only sort when strictly necessary. Ordering particles
1390 * Ordering particles can lead to less accurate summation
1391 * due to rounding, both for LJ and Coulomb interactions.
1393 if (2*(a_lj_max
- s
) >= na_c
)
1397 order
[a0
+i
] = sort1
[i
];
1401 order
[a0
+n1
+j
] = sort2
[j
];
1405 *flags
|= NBNXN_CI_HALF_LJ(subc
);
1410 *flags
|= NBNXN_CI_DO_COUL(subc
);
1416 /* Fill a pair search cell with atoms.
1417 * Potentially sorts atoms and sets the interaction flags.
1419 void fill_cell(const nbnxn_search_t nbs
,
1421 nbnxn_atomdata_t
*nbat
,
1425 int sx
,int sy
, int sz
,
1436 sort_on_lj(nbat
,grid
->na_c
,a0
,a1
,atinfo
,nbs
->a
,
1437 grid
->flags
+(a0
>>grid
->na_c_2log
)-grid
->cell0
);
1440 /* Now we have sorted the atoms, set the cell indices */
1441 for(a
=a0
; a
<a1
; a
++)
1443 nbs
->cell
[nbs
->a
[a
]] = a
;
1446 copy_rvec_to_nbat_real(nbs
->a
+a0
,a1
-a0
,grid
->na_c
,x
,
1447 nbat
->XFormat
,nbat
->x
,a0
,
1450 if (nbat
->XFormat
== nbatX4
)
1452 /* Store the bounding boxes as xyz.xyz. */
1453 offset
= ((a0
- grid
->cell0
*grid
->na_sc
)>>grid
->na_c_2log
)*NNBSBB_B
;
1454 bb_ptr
= grid
->bb
+ offset
;
1456 #if defined GMX_DOUBLE && defined NBNXN_SEARCH_SSE
1457 if (2*grid
->na_cj
== grid
->na_c
)
1459 calc_bounding_box_x_x4_halves(na
,nbat
->x
+X4_IND_A(a0
),bb_ptr
,
1460 grid
->bbj
+offset
*2);
1465 calc_bounding_box_x_x4(na
,nbat
->x
+X4_IND_A(a0
),bb_ptr
);
1468 else if (nbat
->XFormat
== nbatX8
)
1470 /* Store the bounding boxes as xyz.xyz. */
1471 offset
= ((a0
- grid
->cell0
*grid
->na_sc
)>>grid
->na_c_2log
)*NNBSBB_B
;
1472 bb_ptr
= grid
->bb
+ offset
;
1474 calc_bounding_box_x_x8(na
,nbat
->x
+X8_IND_A(a0
),bb_ptr
);
1477 else if (!grid
->bSimple
)
1479 /* Store the bounding boxes in a format convenient
1480 * for SSE calculations: xxxxyyyyzzzz...
1484 ((a0
-grid
->cell0
*grid
->na_sc
)>>(grid
->na_c_2log
+STRIDE_8BB_2LOG
))*NNBSBB_XXXX
+
1485 (((a0
-grid
->cell0
*grid
->na_sc
)>>grid
->na_c_2log
) & (STRIDE_8BB
-1));
1487 #ifdef NBNXN_SEARCH_SSE_SINGLE
1488 if (nbat
->XFormat
== nbatXYZQ
)
1490 calc_bounding_box_xxxx_sse(na
,nbat
->x
+a0
*nbat
->xstride
,
1496 calc_bounding_box_xxxx(na
,nbat
->xstride
,nbat
->x
+a0
*nbat
->xstride
,
1501 fprintf(debug
,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
1503 bb_ptr
[0*STRIDE_8BB
],bb_ptr
[3*STRIDE_8BB
],
1504 bb_ptr
[1*STRIDE_8BB
],bb_ptr
[4*STRIDE_8BB
],
1505 bb_ptr
[2*STRIDE_8BB
],bb_ptr
[5*STRIDE_8BB
]);
1511 /* Store the bounding boxes as xyz.xyz. */
1512 bb_ptr
= grid
->bb
+((a0
-grid
->cell0
*grid
->na_sc
)>>grid
->na_c_2log
)*NNBSBB_B
;
1514 calc_bounding_box(na
,nbat
->xstride
,nbat
->x
+a0
*nbat
->xstride
,
1520 bbo
= (a0
- grid
->cell0
*grid
->na_sc
)/grid
->na_c
;
1521 fprintf(debug
,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
1523 (grid
->bb
+bbo
*NNBSBB_B
)[BBL_X
],
1524 (grid
->bb
+bbo
*NNBSBB_B
)[BBU_X
],
1525 (grid
->bb
+bbo
*NNBSBB_B
)[BBL_Y
],
1526 (grid
->bb
+bbo
*NNBSBB_B
)[BBU_Y
],
1527 (grid
->bb
+bbo
*NNBSBB_B
)[BBL_Z
],
1528 (grid
->bb
+bbo
*NNBSBB_B
)[BBU_Z
]);
1533 /* Spatially sort the atoms within one grid column */
1534 static void sort_columns_simple(const nbnxn_search_t nbs
,
1540 nbnxn_atomdata_t
*nbat
,
1541 int cxy_start
,int cxy_end
,
1545 int cx
,cy
,cz
,ncz
,cfilled
,c
;
1551 fprintf(debug
,"cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1552 grid
->cell0
,cxy_start
,cxy_end
,a0
,a1
);
1555 /* Sort the atoms within each x,y column in 3 dimensions */
1556 for(cxy
=cxy_start
; cxy
<cxy_end
; cxy
++)
1559 cy
= cxy
- cx
*grid
->ncy
;
1561 na
= grid
->cxy_na
[cxy
];
1562 ncz
= grid
->cxy_ind
[cxy
+1] - grid
->cxy_ind
[cxy
];
1563 ash
= (grid
->cell0
+ grid
->cxy_ind
[cxy
])*grid
->na_sc
;
1565 /* Sort the atoms within each x,y column on z coordinate */
1566 sort_atoms(ZZ
,FALSE
,
1569 ncz
*grid
->na_sc
*SORT_GRID_OVERSIZE
/nbs
->box
[ZZ
][ZZ
],
1570 ncz
*grid
->na_sc
*SGSF
,sort_work
);
1572 /* Fill the ncz cells in this column */
1573 cfilled
= grid
->cxy_ind
[cxy
];
1574 for(cz
=0; cz
<ncz
; cz
++)
1576 c
= grid
->cxy_ind
[cxy
] + cz
;
1578 ash_c
= ash
+ cz
*grid
->na_sc
;
1579 na_c
= min(grid
->na_sc
,na
-(ash_c
-ash
));
1581 fill_cell(nbs
,grid
,nbat
,
1582 ash_c
,ash_c
+na_c
,atinfo
,x
,
1583 grid
->na_sc
*cx
+ (dd_zone
>> 2),
1584 grid
->na_sc
*cy
+ (dd_zone
& 3),
1588 /* This copy to bbcz is not really necessary.
1589 * But it allows to use the same grid search code
1590 * for the simple and supersub cell setups.
1596 grid
->bbcz
[c
*NNBSBB_D
] = grid
->bb
[cfilled
*NNBSBB_B
+2];
1597 grid
->bbcz
[c
*NNBSBB_D
+1] = grid
->bb
[cfilled
*NNBSBB_B
+6];
1600 /* Set the unused atom indices to -1 */
1601 for(ind
=na
; ind
<ncz
*grid
->na_sc
; ind
++)
1603 nbs
->a
[ash
+ind
] = -1;
1608 /* Spatially sort the atoms within one grid column */
1609 static void sort_columns_supersub(const nbnxn_search_t nbs
,
1615 nbnxn_atomdata_t
*nbat
,
1616 int cxy_start
,int cxy_end
,
1620 int cx
,cy
,cz
=-1,c
=-1,ncz
;
1621 int na
,ash
,na_c
,ind
,a
;
1622 int subdiv_z
,sub_z
,na_z
,ash_z
;
1623 int subdiv_y
,sub_y
,na_y
,ash_y
;
1624 int subdiv_x
,sub_x
,na_x
,ash_x
;
1626 /* cppcheck-suppress unassignedVariable */
1627 float bb_work_array
[NNBSBB_B
+3],*bb_work_align
;
1629 bb_work_align
= (float *)(((size_t)(bb_work_array
+3)) & (~((size_t)15)));
1633 fprintf(debug
,"cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1634 grid
->cell0
,cxy_start
,cxy_end
,a0
,a1
);
1637 subdiv_x
= grid
->na_c
;
1638 subdiv_y
= GPU_NSUBCELL_X
*subdiv_x
;
1639 subdiv_z
= GPU_NSUBCELL_Y
*subdiv_y
;
1641 /* Sort the atoms within each x,y column in 3 dimensions */
1642 for(cxy
=cxy_start
; cxy
<cxy_end
; cxy
++)
1645 cy
= cxy
- cx
*grid
->ncy
;
1647 na
= grid
->cxy_na
[cxy
];
1648 ncz
= grid
->cxy_ind
[cxy
+1] - grid
->cxy_ind
[cxy
];
1649 ash
= (grid
->cell0
+ grid
->cxy_ind
[cxy
])*grid
->na_sc
;
1651 /* Sort the atoms within each x,y column on z coordinate */
1652 sort_atoms(ZZ
,FALSE
,
1655 ncz
*grid
->na_sc
*SORT_GRID_OVERSIZE
/nbs
->box
[ZZ
][ZZ
],
1656 ncz
*grid
->na_sc
*SGSF
,sort_work
);
1658 /* This loop goes over the supercells and subcells along z at once */
1659 for(sub_z
=0; sub_z
<ncz
*GPU_NSUBCELL_Z
; sub_z
++)
1661 ash_z
= ash
+ sub_z
*subdiv_z
;
1662 na_z
= min(subdiv_z
,na
-(ash_z
-ash
));
1664 /* We have already sorted on z */
1666 if (sub_z
% GPU_NSUBCELL_Z
== 0)
1668 cz
= sub_z
/GPU_NSUBCELL_Z
;
1669 c
= grid
->cxy_ind
[cxy
] + cz
;
1671 /* The number of atoms in this supercell */
1672 na_c
= min(grid
->na_sc
,na
-(ash_z
-ash
));
1674 grid
->nsubc
[c
] = min(GPU_NSUBCELL
,(na_c
+grid
->na_c
-1)/grid
->na_c
);
1676 /* Store the z-boundaries of the super cell */
1677 grid
->bbcz
[c
*NNBSBB_D
] = x
[nbs
->a
[ash_z
]][ZZ
];
1678 grid
->bbcz
[c
*NNBSBB_D
+1] = x
[nbs
->a
[ash_z
+na_c
-1]][ZZ
];
1681 #if GPU_NSUBCELL_Y > 1
1682 /* Sort the atoms along y */
1683 sort_atoms(YY
,(sub_z
& 1),
1684 nbs
->a
+ash_z
,na_z
,x
,
1685 grid
->c0
[YY
]+cy
*grid
->sy
,grid
->inv_sy
,
1686 subdiv_y
*SGSF
,sort_work
);
1689 for(sub_y
=0; sub_y
<GPU_NSUBCELL_Y
; sub_y
++)
1691 ash_y
= ash_z
+ sub_y
*subdiv_y
;
1692 na_y
= min(subdiv_y
,na
-(ash_y
-ash
));
1694 #if GPU_NSUBCELL_X > 1
1695 /* Sort the atoms along x */
1696 sort_atoms(XX
,((cz
*GPU_NSUBCELL_Y
+ sub_y
) & 1),
1697 nbs
->a
+ash_y
,na_y
,x
,
1698 grid
->c0
[XX
]+cx
*grid
->sx
,grid
->inv_sx
,
1699 subdiv_x
*SGSF
,sort_work
);
1702 for(sub_x
=0; sub_x
<GPU_NSUBCELL_X
; sub_x
++)
1704 ash_x
= ash_y
+ sub_x
*subdiv_x
;
1705 na_x
= min(subdiv_x
,na
-(ash_x
-ash
));
1707 fill_cell(nbs
,grid
,nbat
,
1708 ash_x
,ash_x
+na_x
,atinfo
,x
,
1709 grid
->na_c
*(cx
*GPU_NSUBCELL_X
+sub_x
) + (dd_zone
>> 2),
1710 grid
->na_c
*(cy
*GPU_NSUBCELL_Y
+sub_y
) + (dd_zone
& 3),
1717 /* Set the unused atom indices to -1 */
1718 for(ind
=na
; ind
<ncz
*grid
->na_sc
; ind
++)
1720 nbs
->a
[ash
+ind
] = -1;
1725 /* Determine in which grid column atoms should go */
1726 static void calc_column_indices(nbnxn_grid_t
*grid
,
1728 rvec
*x
,const int *move
,
1729 int thread
,int nthread
,
1736 /* We add one extra cell for particles which moved during DD */
1737 for(i
=0; i
<grid
->ncx
*grid
->ncy
+1; i
++)
1742 n0
= a0
+ (int)((thread
+0)*(a1
- a0
))/nthread
;
1743 n1
= a0
+ (int)((thread
+1)*(a1
- a0
))/nthread
;
1744 for(i
=n0
; i
<n1
; i
++)
1746 if (move
== NULL
|| move
[i
] >= 0)
1748 /* We need to be careful with rounding,
1749 * particles might be a few bits outside the local box.
1750 * The int cast takes care of the lower bound,
1751 * we need to explicitly take care of the upper bound.
1753 cx
= (int)((x
[i
][XX
] - grid
->c0
[XX
])*grid
->inv_sx
);
1754 if (cx
== grid
->ncx
)
1758 cy
= (int)((x
[i
][YY
] - grid
->c0
[YY
])*grid
->inv_sy
);
1759 if (cy
== grid
->ncy
)
1763 /* For the moment cell contains only the, grid local,
1764 * x and y indices, not z.
1766 cell
[i
] = cx
*grid
->ncy
+ cy
;
1768 #ifdef DEBUG_NBNXN_GRIDDING
1769 if (cell
[i
] < 0 || cell
[i
] >= grid
->ncx
*grid
->ncy
)
1772 "grid cell cx %d cy %d out of range (max %d %d)",
1773 cx
,cy
,grid
->ncx
,grid
->ncy
);
1779 /* Put this moved particle after the end of the grid,
1780 * so we can process it later without using conditionals.
1782 cell
[i
] = grid
->ncx
*grid
->ncy
;
1789 /* Determine in which grid cells the atoms should go */
1790 static void calc_cell_indices(const nbnxn_search_t nbs
,
1797 nbnxn_atomdata_t
*nbat
)
1800 int cx
,cy
,cxy
,ncz_max
,ncz
;
1802 int *cxy_na
,cxy_na_i
;
1804 nthread
= gmx_omp_nthreads_get(emntPairsearch
);
1806 #pragma omp parallel for num_threads(nthread) schedule(static)
1807 for(thread
=0; thread
<nthread
; thread
++)
1809 calc_column_indices(grid
,a0
,a1
,x
,move
,thread
,nthread
,
1810 nbs
->cell
,nbs
->work
[thread
].cxy_na
);
1813 /* Make the cell index as a function of x and y */
1816 grid
->cxy_ind
[0] = 0;
1817 for(i
=0; i
<grid
->ncx
*grid
->ncy
+1; i
++)
1819 /* We set ncz_max at the beginning of the loop iso at the end
1820 * to skip i=grid->ncx*grid->ncy which are moved particles
1821 * that do not need to be ordered on the grid.
1827 cxy_na_i
= nbs
->work
[0].cxy_na
[i
];
1828 for(thread
=1; thread
<nthread
; thread
++)
1830 cxy_na_i
+= nbs
->work
[thread
].cxy_na
[i
];
1832 ncz
= (cxy_na_i
+ grid
->na_sc
- 1)/grid
->na_sc
;
1833 if (nbat
->XFormat
== nbatX8
)
1835 /* Make the number of cell a multiple of 2 */
1836 ncz
= (ncz
+ 1) & ~1;
1838 grid
->cxy_ind
[i
+1] = grid
->cxy_ind
[i
] + ncz
;
1839 /* Clear cxy_na, so we can reuse the array below */
1840 grid
->cxy_na
[i
] = 0;
1842 grid
->nc
= grid
->cxy_ind
[grid
->ncx
*grid
->ncy
] - grid
->cxy_ind
[0];
1844 nbat
->natoms
= (grid
->cell0
+ grid
->nc
)*grid
->na_sc
;
1848 fprintf(debug
,"ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1849 grid
->na_sc
,grid
->na_c
,grid
->nc
,
1850 grid
->ncx
,grid
->ncy
,grid
->nc
/((double)(grid
->ncx
*grid
->ncy
)),
1855 for(cy
=0; cy
<grid
->ncy
; cy
++)
1857 for(cx
=0; cx
<grid
->ncx
; cx
++)
1859 fprintf(debug
," %2d",grid
->cxy_ind
[i
+1]-grid
->cxy_ind
[i
]);
1862 fprintf(debug
,"\n");
1867 /* Make sure the work array for sorting is large enough */
1868 if (ncz_max
*grid
->na_sc
*SGSF
> nbs
->work
[0].sort_work_nalloc
)
1870 for(thread
=0; thread
<nbs
->nthread_max
; thread
++)
1872 nbs
->work
[thread
].sort_work_nalloc
=
1873 over_alloc_large(ncz_max
*grid
->na_sc
*SGSF
);
1874 srenew(nbs
->work
[thread
].sort_work
,
1875 nbs
->work
[thread
].sort_work_nalloc
);
1879 /* Now we know the dimensions we can fill the grid.
1880 * This is the first, unsorted fill. We sort the columns after this.
1882 for(i
=a0
; i
<a1
; i
++)
1884 /* At this point nbs->cell contains the local grid x,y indices */
1886 nbs
->a
[(grid
->cell0
+ grid
->cxy_ind
[cxy
])*grid
->na_sc
+ grid
->cxy_na
[cxy
]++] = i
;
1889 /* Set the cell indices for the moved particles */
1890 n0
= grid
->nc
*grid
->na_sc
;
1891 n1
= grid
->nc
*grid
->na_sc
+grid
->cxy_na
[grid
->ncx
*grid
->ncy
];
1892 for(i
=n0
; i
<n1
; i
++)
1894 nbs
->cell
[nbs
->a
[i
]] = i
;
1897 /* Sort the super-cell columns along z into the sub-cells. */
1898 #pragma omp parallel for num_threads(nbs->nthread_max) schedule(static)
1899 for(thread
=0; thread
<nbs
->nthread_max
; thread
++)
1903 sort_columns_simple(nbs
,dd_zone
,grid
,a0
,a1
,atinfo
,x
,nbat
,
1904 ((thread
+0)*grid
->ncx
*grid
->ncy
)/nthread
,
1905 ((thread
+1)*grid
->ncx
*grid
->ncy
)/nthread
,
1906 nbs
->work
[thread
].sort_work
);
1910 sort_columns_supersub(nbs
,dd_zone
,grid
,a0
,a1
,atinfo
,x
,nbat
,
1911 ((thread
+0)*grid
->ncx
*grid
->ncy
)/nthread
,
1912 ((thread
+1)*grid
->ncx
*grid
->ncy
)/nthread
,
1913 nbs
->work
[thread
].sort_work
);
1917 #ifdef NBNXN_SEARCH_SSE
1918 if (grid
->bSimple
&& nbat
->XFormat
== nbatX8
)
1920 combine_bounding_box_pairs(grid
,grid
->bb
);
1926 grid
->nsubc_tot
= 0;
1927 for(i
=0; i
<grid
->nc
; i
++)
1929 grid
->nsubc_tot
+= grid
->nsubc
[i
];
1937 print_bbsizes_simple(debug
,nbs
,grid
);
1941 fprintf(debug
,"ns non-zero sub-cells: %d average atoms %.2f\n",
1942 grid
->nsubc_tot
,(a1
-a0
)/(double)grid
->nsubc_tot
);
1944 print_bbsizes_supersub(debug
,nbs
,grid
);
1949 /* Reallocation wrapper function for nbnxn data structures */
1950 static void nb_realloc_void(void **ptr
,
1951 int nbytes_copy
,int nbytes_new
,
1952 gmx_nbat_alloc_t
*ma
,
1953 gmx_nbat_free_t
*mf
)
1957 ma(&ptr_new
,nbytes_new
);
1959 if (nbytes_new
> 0 && ptr_new
== NULL
)
1961 gmx_fatal(FARGS
, "Allocation of %d bytes failed", nbytes_new
);
1964 if (nbytes_copy
> 0)
1966 if (nbytes_new
< nbytes_copy
)
1968 gmx_incons("In nb_realloc_void: new size less than copy size");
1970 memcpy(ptr_new
,*ptr
,nbytes_copy
);
1979 /* NOTE: does not preserve the contents! */
1980 static void nb_realloc_int(int **ptr
,int n
,
1981 gmx_nbat_alloc_t
*ma
,
1982 gmx_nbat_free_t
*mf
)
1988 ma((void **)ptr
,n
*sizeof(**ptr
));
1991 /* NOTE: does not preserve the contents! */
1992 static void nb_realloc_real(real
**ptr
,int n
,
1993 gmx_nbat_alloc_t
*ma
,
1994 gmx_nbat_free_t
*mf
)
2000 ma((void **)ptr
,n
*sizeof(**ptr
));
2003 /* Reallocate the nbnxn_atomdata_t for a size of n atoms */
2004 static void nbnxn_atomdata_realloc(nbnxn_atomdata_t
*nbat
,int n
)
2008 nb_realloc_void((void **)&nbat
->type
,
2009 nbat
->natoms
*sizeof(*nbat
->type
),
2010 n
*sizeof(*nbat
->type
),
2011 nbat
->alloc
,nbat
->free
);
2012 nb_realloc_void((void **)&nbat
->lj_comb
,
2013 nbat
->natoms
*2*sizeof(*nbat
->lj_comb
),
2014 n
*2*sizeof(*nbat
->lj_comb
),
2015 nbat
->alloc
,nbat
->free
);
2016 if (nbat
->XFormat
!= nbatXYZQ
)
2018 nb_realloc_void((void **)&nbat
->q
,
2019 nbat
->natoms
*sizeof(*nbat
->q
),
2021 nbat
->alloc
,nbat
->free
);
2023 if (nbat
->nenergrp
> 1)
2025 nb_realloc_void((void **)&nbat
->energrp
,
2026 nbat
->natoms
/nbat
->na_c
*sizeof(*nbat
->energrp
),
2027 n
/nbat
->na_c
*sizeof(*nbat
->energrp
),
2028 nbat
->alloc
,nbat
->free
);
2030 nb_realloc_void((void **)&nbat
->x
,
2031 nbat
->natoms
*nbat
->xstride
*sizeof(*nbat
->x
),
2032 n
*nbat
->xstride
*sizeof(*nbat
->x
),
2033 nbat
->alloc
,nbat
->free
);
2034 for(t
=0; t
<nbat
->nout
; t
++)
2036 /* Allocate one element extra for possible signaling with CUDA */
2037 nb_realloc_void((void **)&nbat
->out
[t
].f
,
2038 nbat
->natoms
*nbat
->fstride
*sizeof(*nbat
->out
[t
].f
),
2039 n
*nbat
->fstride
*sizeof(*nbat
->out
[t
].f
),
2040 nbat
->alloc
,nbat
->free
);
2045 /* Sets up a grid and puts the atoms on the grid.
2046 * This function only operates on one domain of the domain decompostion.
2047 * Note that without domain decomposition there is only one domain.
2049 void nbnxn_put_on_grid(nbnxn_search_t nbs
,
2050 int ePBC
,matrix box
,
2052 rvec corner0
,rvec corner1
,
2057 int nmoved
,int *move
,
2059 nbnxn_atomdata_t
*nbat
)
2063 int nc_max_grid
,nc_max
;
2065 grid
= &nbs
->grid
[dd_zone
];
2067 nbs_cycle_start(&nbs
->cc
[enbsCCgrid
]);
2069 grid
->bSimple
= nbnxn_kernel_pairlist_simple(nb_kernel_type
);
2071 grid
->na_c
= kernel_to_ci_size(nb_kernel_type
);
2072 grid
->na_cj
= kernel_to_cj_size(nb_kernel_type
);
2073 grid
->na_sc
= (grid
->bSimple
? 1 : GPU_NSUBCELL
)*grid
->na_c
;
2074 grid
->na_c_2log
= get_2log(grid
->na_c
);
2076 nbat
->na_c
= grid
->na_c
;
2085 (nbs
->grid
[dd_zone
-1].cell0
+ nbs
->grid
[dd_zone
-1].nc
)*
2086 nbs
->grid
[dd_zone
-1].na_sc
/grid
->na_sc
;
2094 copy_mat(box
,nbs
->box
);
2096 if (atom_density
>= 0)
2098 grid
->atom_density
= atom_density
;
2102 grid
->atom_density
= grid_atom_density(n
-nmoved
,corner0
,corner1
);
2107 nbs
->natoms_local
= a1
- nmoved
;
2108 /* We assume that nbnxn_put_on_grid is called first
2109 * for the local atoms (dd_zone=0).
2111 nbs
->natoms_nonlocal
= a1
- nmoved
;
2115 nbs
->natoms_nonlocal
= max(nbs
->natoms_nonlocal
,a1
);
2118 nc_max_grid
= set_grid_size_xy(nbs
,grid
,n
-nmoved
,corner0
,corner1
,
2119 nbs
->grid
[0].atom_density
,
2122 nc_max
= grid
->cell0
+ nc_max_grid
;
2124 if (a1
> nbs
->cell_nalloc
)
2126 nbs
->cell_nalloc
= over_alloc_large(a1
);
2127 srenew(nbs
->cell
,nbs
->cell_nalloc
);
2130 /* To avoid conditionals we store the moved particles at the end of a,
2131 * make sure we have enough space.
2133 if (nc_max
*grid
->na_sc
+ nmoved
> nbs
->a_nalloc
)
2135 nbs
->a_nalloc
= over_alloc_large(nc_max
*grid
->na_sc
+ nmoved
);
2136 srenew(nbs
->a
,nbs
->a_nalloc
);
2139 if (nc_max
*grid
->na_sc
> nbat
->nalloc
)
2141 nbnxn_atomdata_realloc(nbat
,nc_max
*grid
->na_sc
);
2144 calc_cell_indices(nbs
,dd_zone
,grid
,a0
,a1
,atinfo
,x
,move
,nbat
);
2148 nbat
->natoms_local
= nbat
->natoms
;
2151 nbs_cycle_stop(&nbs
->cc
[enbsCCgrid
]);
2154 /* Calls nbnxn_put_on_grid for all non-local domains */
2155 void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs
,
2156 const gmx_domdec_zones_t
*zones
,
2160 nbnxn_atomdata_t
*nbat
)
2165 for(zone
=1; zone
<zones
->n
; zone
++)
2167 for(d
=0; d
<DIM
; d
++)
2169 c0
[d
] = zones
->size
[zone
].bb_x0
[d
];
2170 c1
[d
] = zones
->size
[zone
].bb_x1
[d
];
2173 nbnxn_put_on_grid(nbs
,nbs
->ePBC
,NULL
,
2175 zones
->cg_range
[zone
],
2176 zones
->cg_range
[zone
+1],
2186 /* Add simple grid type information to the local super/sub grid */
2187 void nbnxn_grid_add_simple(nbnxn_search_t nbs
,
2188 nbnxn_atomdata_t
*nbat
)
2194 grid
= &nbs
->grid
[0];
2198 gmx_incons("nbnxn_grid_simple called with a simple grid");
2201 ncd
= grid
->na_sc
/NBNXN_CPU_CLUSTER_I_SIZE
;
2203 if (grid
->nc
*ncd
> grid
->nc_nalloc_simple
)
2205 grid
->nc_nalloc_simple
= over_alloc_large(grid
->nc
*ncd
);
2206 srenew(grid
->bbcz_simple
,grid
->nc_nalloc_simple
*NNBSBB_D
);
2207 srenew(grid
->bb_simple
,grid
->nc_nalloc_simple
*NNBSBB_B
);
2208 srenew(grid
->flags_simple
,grid
->nc_nalloc_simple
);
2211 sfree_aligned(grid
->bbj
);
2212 snew_aligned(grid
->bbj
,grid
->nc_nalloc_simple
/2,16);
2216 bbcz
= grid
->bbcz_simple
;
2217 bb
= grid
->bb_simple
;
2219 #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
2220 for(sc
=0; sc
<grid
->nc
; sc
++)
2224 for(c
=0; c
<ncd
; c
++)
2228 na
= NBNXN_CPU_CLUSTER_I_SIZE
;
2230 nbat
->type
[tx
*NBNXN_CPU_CLUSTER_I_SIZE
+na
-1] == nbat
->ntype
-1)
2237 switch (nbat
->XFormat
)
2240 /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
2241 calc_bounding_box_x_x4(na
,nbat
->x
+tx
*STRIDE_P4
,
2245 /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
2246 calc_bounding_box_x_x8(na
,nbat
->x
+X8_IND_A(tx
*NBNXN_CPU_CLUSTER_I_SIZE
),
2250 calc_bounding_box(na
,nbat
->xstride
,
2251 nbat
->x
+tx
*NBNXN_CPU_CLUSTER_I_SIZE
*nbat
->xstride
,
2255 bbcz
[tx
*NNBSBB_D
+0] = bb
[tx
*NNBSBB_B
+ZZ
];
2256 bbcz
[tx
*NNBSBB_D
+1] = bb
[tx
*NNBSBB_B
+NNBSBB_C
+ZZ
];
2258 /* No interaction optimization yet here */
2259 grid
->flags_simple
[tx
] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
2263 grid
->flags_simple
[tx
] = 0;
2268 #ifdef NBNXN_SEARCH_SSE
2269 if (grid
->bSimple
&& nbat
->XFormat
== nbatX8
)
2271 combine_bounding_box_pairs(grid
,grid
->bb_simple
);
2276 void nbnxn_get_ncells(nbnxn_search_t nbs
,int *ncx
,int *ncy
)
2278 *ncx
= nbs
->grid
[0].ncx
;
2279 *ncy
= nbs
->grid
[0].ncy
;
2282 void nbnxn_get_atomorder(nbnxn_search_t nbs
,int **a
,int *n
)
2284 const nbnxn_grid_t
*grid
;
2286 grid
= &nbs
->grid
[0];
2288 /* Return the atom order for the home cell (index 0) */
2291 *n
= grid
->cxy_ind
[grid
->ncx
*grid
->ncy
]*grid
->na_sc
;
2294 void nbnxn_set_atomorder(nbnxn_search_t nbs
)
2297 int ao
,cx
,cy
,cxy
,cz
,j
;
2299 /* Set the atom order for the home cell (index 0) */
2300 grid
= &nbs
->grid
[0];
2303 for(cx
=0; cx
<grid
->ncx
; cx
++)
2305 for(cy
=0; cy
<grid
->ncy
; cy
++)
2307 cxy
= cx
*grid
->ncy
+ cy
;
2308 j
= grid
->cxy_ind
[cxy
]*grid
->na_sc
;
2309 for(cz
=0; cz
<grid
->cxy_na
[cxy
]; cz
++)
2320 /* Determines the cell range along one dimension that
2321 * the bounding box b0 - b1 sees.
2323 static void get_cell_range(real b0
,real b1
,
2324 int nc
,real c0
,real s
,real invs
,
2325 real d2
,real r2
,int *cf
,int *cl
)
2327 *cf
= max((int)((b0
- c0
)*invs
),0);
2329 while (*cf
> 0 && d2
+ sqr((b0
- c0
) - (*cf
-1+1)*s
) < r2
)
2334 *cl
= min((int)((b1
- c0
)*invs
),nc
-1);
2335 while (*cl
< nc
-1 && d2
+ sqr((*cl
+1)*s
- (b1
- c0
)) < r2
)
2341 /* Reference code calculating the distance^2 between two bounding boxes */
2342 static float box_dist2(float bx0
,float bx1
,float by0
,
2343 float by1
,float bz0
,float bz1
,
2351 dl
= bx0
- bb
[BBU_X
];
2352 dh
= bb
[BBL_X
] - bx1
;
2357 dl
= by0
- bb
[BBU_Y
];
2358 dh
= bb
[BBL_Y
] - by1
;
2363 dl
= bz0
- bb
[BBU_Z
];
2364 dh
= bb
[BBL_Z
] - bz1
;
2372 /* Plain C code calculating the distance^2 between two bounding boxes */
2373 static float subc_bb_dist2(int si
,const float *bb_i_ci
,
2374 int csj
,const float *bb_j_all
)
2376 const float *bb_i
,*bb_j
;
2380 bb_i
= bb_i_ci
+ si
*NNBSBB_B
;
2381 bb_j
= bb_j_all
+ csj
*NNBSBB_B
;
2385 dl
= bb_i
[BBL_X
] - bb_j
[BBU_X
];
2386 dh
= bb_j
[BBL_X
] - bb_i
[BBU_X
];
2391 dl
= bb_i
[BBL_Y
] - bb_j
[BBU_Y
];
2392 dh
= bb_j
[BBL_Y
] - bb_i
[BBU_Y
];
2397 dl
= bb_i
[BBL_Z
] - bb_j
[BBU_Z
];
2398 dh
= bb_j
[BBL_Z
] - bb_i
[BBU_Z
];
2406 #ifdef NBNXN_SEARCH_SSE
2408 /* SSE code for bb distance for bb format xyz0 */
2409 static float subc_bb_dist2_sse(int na_c
,
2410 int si
,const float *bb_i_ci
,
2411 int csj
,const float *bb_j_all
)
2413 const float *bb_i
,*bb_j
;
2415 __m128 bb_i_SSE0
,bb_i_SSE1
;
2416 __m128 bb_j_SSE0
,bb_j_SSE1
;
2422 #ifndef GMX_X86_SSE4_1
2423 float d2_array
[7],*d2_align
;
2425 d2_align
= (float *)(((size_t)(d2_array
+3)) & (~((size_t)15)));
2430 bb_i
= bb_i_ci
+ si
*NNBSBB_B
;
2431 bb_j
= bb_j_all
+ csj
*NNBSBB_B
;
2433 bb_i_SSE0
= _mm_load_ps(bb_i
);
2434 bb_i_SSE1
= _mm_load_ps(bb_i
+NNBSBB_C
);
2435 bb_j_SSE0
= _mm_load_ps(bb_j
);
2436 bb_j_SSE1
= _mm_load_ps(bb_j
+NNBSBB_C
);
2438 dl_SSE
= _mm_sub_ps(bb_i_SSE0
,bb_j_SSE1
);
2439 dh_SSE
= _mm_sub_ps(bb_j_SSE0
,bb_i_SSE1
);
2441 dm_SSE
= _mm_max_ps(dl_SSE
,dh_SSE
);
2442 dm0_SSE
= _mm_max_ps(dm_SSE
,_mm_setzero_ps());
2443 #ifndef GMX_X86_SSE4_1
2444 d2_SSE
= _mm_mul_ps(dm0_SSE
,dm0_SSE
);
2446 _mm_store_ps(d2_align
,d2_SSE
);
2448 return d2_align
[0] + d2_align
[1] + d2_align
[2];
2450 /* SSE4.1 dot product of components 0,1,2 */
2451 d2_SSE
= _mm_dp_ps(dm0_SSE
,dm0_SSE
,0x71);
2453 _mm_store_ss(&d2
,d2_SSE
);
2459 /* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2 */
2460 #define SUBC_BB_DIST2_SSE_XXXX_INNER(si,bb_i,d2) \
2464 __m128 dx_0,dy_0,dz_0; \
2465 __m128 dx_1,dy_1,dz_1; \
2468 __m128 m0x,m0y,m0z; \
2470 __m128 d2x,d2y,d2z; \
2473 shi = si*NNBSBB_D*DIM; \
2475 xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_8BB); \
2476 yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_8BB); \
2477 zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_8BB); \
2478 xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_8BB); \
2479 yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_8BB); \
2480 zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_8BB); \
2482 dx_0 = _mm_sub_ps(xi_l,xj_h); \
2483 dy_0 = _mm_sub_ps(yi_l,yj_h); \
2484 dz_0 = _mm_sub_ps(zi_l,zj_h); \
2486 dx_1 = _mm_sub_ps(xj_l,xi_h); \
2487 dy_1 = _mm_sub_ps(yj_l,yi_h); \
2488 dz_1 = _mm_sub_ps(zj_l,zi_h); \
2490 mx = _mm_max_ps(dx_0,dx_1); \
2491 my = _mm_max_ps(dy_0,dy_1); \
2492 mz = _mm_max_ps(dz_0,dz_1); \
2494 m0x = _mm_max_ps(mx,zero); \
2495 m0y = _mm_max_ps(my,zero); \
2496 m0z = _mm_max_ps(mz,zero); \
2498 d2x = _mm_mul_ps(m0x,m0x); \
2499 d2y = _mm_mul_ps(m0y,m0y); \
2500 d2z = _mm_mul_ps(m0z,m0z); \
2502 d2s = _mm_add_ps(d2x,d2y); \
2503 d2t = _mm_add_ps(d2s,d2z); \
2505 _mm_store_ps(d2+si,d2t); \
2508 /* SSE code for nsi bb distances for bb format xxxxyyyyzzzz */
2509 static void subc_bb_dist2_sse_xxxx(const float *bb_j
,
2510 int nsi
,const float *bb_i
,
2513 __m128 xj_l
,yj_l
,zj_l
;
2514 __m128 xj_h
,yj_h
,zj_h
;
2515 __m128 xi_l
,yi_l
,zi_l
;
2516 __m128 xi_h
,yi_h
,zi_h
;
2520 zero
= _mm_setzero_ps();
2522 xj_l
= _mm_set1_ps(bb_j
[0*STRIDE_8BB
]);
2523 yj_l
= _mm_set1_ps(bb_j
[1*STRIDE_8BB
]);
2524 zj_l
= _mm_set1_ps(bb_j
[2*STRIDE_8BB
]);
2525 xj_h
= _mm_set1_ps(bb_j
[3*STRIDE_8BB
]);
2526 yj_h
= _mm_set1_ps(bb_j
[4*STRIDE_8BB
]);
2527 zj_h
= _mm_set1_ps(bb_j
[5*STRIDE_8BB
]);
2529 /* Here we "loop" over si (0,STRIDE_8BB) from 0 to nsi with step STRIDE_8BB.
2530 * But as we know the number of iterations is 1 or 2, we unroll manually.
2532 SUBC_BB_DIST2_SSE_XXXX_INNER(0,bb_i
,d2
);
2533 if (STRIDE_8BB
< nsi
)
2535 SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_8BB
,bb_i
,d2
);
2539 #endif /* NBNXN_SEARCH_SSE */
2541 /* Plain C function which determines if any atom pair between two cells
2542 * is within distance sqrt(rl2).
2544 static gmx_bool
subc_in_range_x(int na_c
,
2545 int si
,const real
*x_i
,
2546 int csj
,int stride
,const real
*x_j
,
2552 for(i
=0; i
<na_c
; i
++)
2554 i0
= (si
*na_c
+ i
)*DIM
;
2555 for(j
=0; j
<na_c
; j
++)
2557 j0
= (csj
*na_c
+ j
)*stride
;
2559 d2
= sqr(x_i
[i0
] - x_j
[j0
]) +
2560 sqr(x_i
[i0
+1] - x_j
[j0
+1]) +
2561 sqr(x_i
[i0
+2] - x_j
[j0
+2]);
2573 /* SSE function which determines if any atom pair between two cells,
2574 * both with 8 atoms, is within distance sqrt(rl2).
2576 static gmx_bool
subc_in_range_sse8(int na_c
,
2577 int si
,const real
*x_i
,
2578 int csj
,int stride
,const real
*x_j
,
2581 #ifdef NBNXN_SEARCH_SSE_SINGLE
2582 __m128 ix_SSE0
,iy_SSE0
,iz_SSE0
;
2583 __m128 ix_SSE1
,iy_SSE1
,iz_SSE1
;
2590 rc2_SSE
= _mm_set1_ps(rl2
);
2592 na_c_sse
= NBNXN_GPU_CLUSTER_SIZE
/STRIDE_8BB
;
2593 ix_SSE0
= _mm_load_ps(x_i
+(si
*na_c_sse
*DIM
+0)*STRIDE_8BB
);
2594 iy_SSE0
= _mm_load_ps(x_i
+(si
*na_c_sse
*DIM
+1)*STRIDE_8BB
);
2595 iz_SSE0
= _mm_load_ps(x_i
+(si
*na_c_sse
*DIM
+2)*STRIDE_8BB
);
2596 ix_SSE1
= _mm_load_ps(x_i
+(si
*na_c_sse
*DIM
+3)*STRIDE_8BB
);
2597 iy_SSE1
= _mm_load_ps(x_i
+(si
*na_c_sse
*DIM
+4)*STRIDE_8BB
);
2598 iz_SSE1
= _mm_load_ps(x_i
+(si
*na_c_sse
*DIM
+5)*STRIDE_8BB
);
2600 /* We loop from the outer to the inner particles to maximize
2601 * the chance that we find a pair in range quickly and return.
2607 __m128 jx0_SSE
,jy0_SSE
,jz0_SSE
;
2608 __m128 jx1_SSE
,jy1_SSE
,jz1_SSE
;
2610 __m128 dx_SSE0
,dy_SSE0
,dz_SSE0
;
2611 __m128 dx_SSE1
,dy_SSE1
,dz_SSE1
;
2612 __m128 dx_SSE2
,dy_SSE2
,dz_SSE2
;
2613 __m128 dx_SSE3
,dy_SSE3
,dz_SSE3
;
2624 __m128 wco_any_SSE01
,wco_any_SSE23
,wco_any_SSE
;
2626 jx0_SSE
= _mm_load1_ps(x_j
+j0
*stride
+0);
2627 jy0_SSE
= _mm_load1_ps(x_j
+j0
*stride
+1);
2628 jz0_SSE
= _mm_load1_ps(x_j
+j0
*stride
+2);
2630 jx1_SSE
= _mm_load1_ps(x_j
+j1
*stride
+0);
2631 jy1_SSE
= _mm_load1_ps(x_j
+j1
*stride
+1);
2632 jz1_SSE
= _mm_load1_ps(x_j
+j1
*stride
+2);
2634 /* Calculate distance */
2635 dx_SSE0
= _mm_sub_ps(ix_SSE0
,jx0_SSE
);
2636 dy_SSE0
= _mm_sub_ps(iy_SSE0
,jy0_SSE
);
2637 dz_SSE0
= _mm_sub_ps(iz_SSE0
,jz0_SSE
);
2638 dx_SSE1
= _mm_sub_ps(ix_SSE1
,jx0_SSE
);
2639 dy_SSE1
= _mm_sub_ps(iy_SSE1
,jy0_SSE
);
2640 dz_SSE1
= _mm_sub_ps(iz_SSE1
,jz0_SSE
);
2641 dx_SSE2
= _mm_sub_ps(ix_SSE0
,jx1_SSE
);
2642 dy_SSE2
= _mm_sub_ps(iy_SSE0
,jy1_SSE
);
2643 dz_SSE2
= _mm_sub_ps(iz_SSE0
,jz1_SSE
);
2644 dx_SSE3
= _mm_sub_ps(ix_SSE1
,jx1_SSE
);
2645 dy_SSE3
= _mm_sub_ps(iy_SSE1
,jy1_SSE
);
2646 dz_SSE3
= _mm_sub_ps(iz_SSE1
,jz1_SSE
);
2648 /* rsq = dx*dx+dy*dy+dz*dz */
2649 rsq_SSE0
= gmx_mm_calc_rsq_ps(dx_SSE0
,dy_SSE0
,dz_SSE0
);
2650 rsq_SSE1
= gmx_mm_calc_rsq_ps(dx_SSE1
,dy_SSE1
,dz_SSE1
);
2651 rsq_SSE2
= gmx_mm_calc_rsq_ps(dx_SSE2
,dy_SSE2
,dz_SSE2
);
2652 rsq_SSE3
= gmx_mm_calc_rsq_ps(dx_SSE3
,dy_SSE3
,dz_SSE3
);
2654 wco_SSE0
= _mm_cmplt_ps(rsq_SSE0
,rc2_SSE
);
2655 wco_SSE1
= _mm_cmplt_ps(rsq_SSE1
,rc2_SSE
);
2656 wco_SSE2
= _mm_cmplt_ps(rsq_SSE2
,rc2_SSE
);
2657 wco_SSE3
= _mm_cmplt_ps(rsq_SSE3
,rc2_SSE
);
2659 wco_any_SSE01
= _mm_or_ps(wco_SSE0
,wco_SSE1
);
2660 wco_any_SSE23
= _mm_or_ps(wco_SSE2
,wco_SSE3
);
2661 wco_any_SSE
= _mm_or_ps(wco_any_SSE01
,wco_any_SSE23
);
2663 if (_mm_movemask_ps(wco_any_SSE
))
2675 gmx_incons("SSE function called without SSE support");
2681 /* Returns the j sub-cell for index cj_ind */
2682 static int nbl_cj(const nbnxn_pairlist_t
*nbl
,int cj_ind
)
2684 return nbl
->cj4
[cj_ind
>>2].cj
[cj_ind
& 3];
2687 /* Returns the i-interaction mask of the j sub-cell for index cj_ind */
2688 static unsigned nbl_imask0(const nbnxn_pairlist_t
*nbl
,int cj_ind
)
2690 return nbl
->cj4
[cj_ind
>>2].imei
[0].imask
;
2693 /* Ensures there is enough space for extra extra exclusion masks */
2694 static void check_excl_space(nbnxn_pairlist_t
*nbl
,int extra
)
2696 if (nbl
->nexcl
+extra
> nbl
->excl_nalloc
)
2698 nbl
->excl_nalloc
= over_alloc_small(nbl
->nexcl
+extra
);
2699 nb_realloc_void((void **)&nbl
->excl
,
2700 nbl
->nexcl
*sizeof(*nbl
->excl
),
2701 nbl
->excl_nalloc
*sizeof(*nbl
->excl
),
2702 nbl
->alloc
,nbl
->free
);
2706 /* Ensures there is enough space for ncell extra j-cells in the list */
2707 static void check_subcell_list_space_simple(nbnxn_pairlist_t
*nbl
,
2712 cj_max
= nbl
->ncj
+ ncell
;
2714 if (cj_max
> nbl
->cj_nalloc
)
2716 nbl
->cj_nalloc
= over_alloc_small(cj_max
);
2717 nb_realloc_void((void **)&nbl
->cj
,
2718 nbl
->ncj
*sizeof(*nbl
->cj
),
2719 nbl
->cj_nalloc
*sizeof(*nbl
->cj
),
2720 nbl
->alloc
,nbl
->free
);
2724 /* Ensures there is enough space for ncell extra j-subcells in the list */
2725 static void check_subcell_list_space_supersub(nbnxn_pairlist_t
*nbl
,
2728 int ncj4_max
,j4
,j
,w
,t
;
2731 #define WARP_SIZE 32
2733 /* We can have maximally nsupercell*GPU_NSUBCELL sj lists */
2734 /* We can store 4 j-subcell - i-supercell pairs in one struct.
2735 * since we round down, we need one extra entry.
2737 ncj4_max
= ((nbl
->work
->cj_ind
+ nsupercell
*GPU_NSUBCELL
+ 4-1) >> 2);
2739 if (ncj4_max
> nbl
->cj4_nalloc
)
2741 nbl
->cj4_nalloc
= over_alloc_small(ncj4_max
);
2742 nb_realloc_void((void **)&nbl
->cj4
,
2743 nbl
->work
->cj4_init
*sizeof(*nbl
->cj4
),
2744 nbl
->cj4_nalloc
*sizeof(*nbl
->cj4
),
2745 nbl
->alloc
,nbl
->free
);
2748 if (ncj4_max
> nbl
->work
->cj4_init
)
2750 for(j4
=nbl
->work
->cj4_init
; j4
<ncj4_max
; j4
++)
2752 /* No i-subcells and no excl's in the list initially */
2753 for(w
=0; w
<NWARP
; w
++)
2755 nbl
->cj4
[j4
].imei
[w
].imask
= 0U;
2756 nbl
->cj4
[j4
].imei
[w
].excl_ind
= 0;
2760 nbl
->work
->cj4_init
= ncj4_max
;
2764 /* Default nbnxn allocation routine, allocates 32 byte aligned,
2765 * which works for plain C and aligned SSE and AVX loads/stores.
2767 static void nbnxn_alloc_aligned(void **ptr
,size_t nbytes
)
2769 *ptr
= save_malloc_aligned("ptr",__FILE__
,__LINE__
,nbytes
,1,32);
2772 /* Free function for memory allocated with nbnxn_alloc_aligned */
2773 static void nbnxn_free_aligned(void *ptr
)
2778 /* Set all excl masks for one GPU warp no exclusions */
2779 static void set_no_excls(nbnxn_excl_t
*excl
)
2783 for(t
=0; t
<WARP_SIZE
; t
++)
2785 /* Turn all interaction bits on */
2786 excl
->pair
[t
] = NBNXN_INT_MASK_ALL
;
2790 /* Initializes a single nbnxn_pairlist_t data structure */
2791 static void nbnxn_init_pairlist(nbnxn_pairlist_t
*nbl
,
2793 gmx_nbat_alloc_t
*alloc
,
2794 gmx_nbat_free_t
*free
)
2798 nbl
->alloc
= nbnxn_alloc_aligned
;
2806 nbl
->free
= nbnxn_free_aligned
;
2813 nbl
->bSimple
= bSimple
;
2824 /* We need one element extra in sj, so alloc initially with 1 */
2825 nbl
->cj4_nalloc
= 0;
2832 nbl
->excl_nalloc
= 0;
2834 check_excl_space(nbl
,1);
2836 set_no_excls(&nbl
->excl
[0]);
2841 snew_aligned(nbl
->work
->bb_ci
,GPU_NSUBCELL
/STRIDE_8BB
*NNBSBB_XXXX
,16);
2843 snew_aligned(nbl
->work
->bb_ci
,GPU_NSUBCELL
*NNBSBB_B
,16);
2845 snew_aligned(nbl
->work
->x_ci
,NBNXN_NA_SC_MAX
*DIM
,16);
2846 #ifdef NBNXN_SEARCH_SSE
2847 snew_aligned(nbl
->work
->x_ci_x86_simd128
,1,16);
2848 #ifdef GMX_X86_AVX_256
2849 snew_aligned(nbl
->work
->x_ci_x86_simd256
,1,32);
2852 snew_aligned(nbl
->work
->d2
,GPU_NSUBCELL
,16);
2855 void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t
*nbl_list
,
2856 gmx_bool bSimple
, gmx_bool bCombined
,
2857 gmx_nbat_alloc_t
*alloc
,
2858 gmx_nbat_free_t
*free
)
2862 nbl_list
->bSimple
= bSimple
;
2863 nbl_list
->bCombined
= bCombined
;
2865 nbl_list
->nnbl
= gmx_omp_nthreads_get(emntNonbonded
);
2867 snew(nbl_list
->nbl
,nbl_list
->nnbl
);
2868 /* Execute in order to avoid memory interleaving between threads */
2869 #pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
2870 for(i
=0; i
<nbl_list
->nnbl
; i
++)
2872 /* Allocate the nblist data structure locally on each thread
2873 * to optimize memory access for NUMA architectures.
2875 snew(nbl_list
->nbl
[i
],1);
2877 /* Only list 0 is used on the GPU, use normal allocation for i>0 */
2880 nbnxn_init_pairlist(nbl_list
->nbl
[i
],nbl_list
->bSimple
,alloc
,free
);
2884 nbnxn_init_pairlist(nbl_list
->nbl
[i
],nbl_list
->bSimple
,NULL
,NULL
);
2889 /* Print statistics of a pair list, used for debug output */
2890 static void print_nblist_statistics_simple(FILE *fp
,const nbnxn_pairlist_t
*nbl
,
2891 const nbnxn_search_t nbs
,real rl
)
2893 const nbnxn_grid_t
*grid
;
2898 /* This code only produces correct statistics with domain decomposition */
2899 grid
= &nbs
->grid
[0];
2901 fprintf(fp
,"nbl nci %d ncj %d\n",
2903 fprintf(fp
,"nbl na_sc %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
2904 nbl
->na_sc
,rl
,nbl
->ncj
,nbl
->ncj
/(double)grid
->nc
,
2905 nbl
->ncj
/(double)grid
->nc
*grid
->na_sc
,
2906 nbl
->ncj
/(double)grid
->nc
*grid
->na_sc
/(0.5*4.0/3.0*M_PI
*rl
*rl
*rl
*grid
->nc
*grid
->na_sc
/det(nbs
->box
)));
2908 fprintf(fp
,"nbl average j cell list length %.1f\n",
2909 0.25*nbl
->ncj
/(double)nbl
->nci
);
2911 for(s
=0; s
<SHIFTS
; s
++)
2916 for(i
=0; i
<nbl
->nci
; i
++)
2918 cs
[nbl
->ci
[i
].shift
& NBNXN_CI_SHIFT
] +=
2919 nbl
->ci
[i
].cj_ind_end
- nbl
->ci
[i
].cj_ind_start
;
2921 j
= nbl
->ci
[i
].cj_ind_start
;
2922 while (j
< nbl
->ci
[i
].cj_ind_end
&&
2923 nbl
->cj
[j
].excl
!= NBNXN_INT_MASK_ALL
)
2929 fprintf(fp
,"nbl cell pairs, total: %d excl: %d %.1f%%\n",
2930 nbl
->ncj
,npexcl
,100*npexcl
/(double)nbl
->ncj
);
2931 for(s
=0; s
<SHIFTS
; s
++)
2935 fprintf(fp
,"nbl shift %2d ncj %3d\n",s
,cs
[s
]);
2940 /* Print statistics of a pair lists, used for debug output */
2941 static void print_nblist_statistics_supersub(FILE *fp
,const nbnxn_pairlist_t
*nbl
,
2942 const nbnxn_search_t nbs
,real rl
)
2944 const nbnxn_grid_t
*grid
;
2946 int c
[GPU_NSUBCELL
+1];
2948 /* This code only produces correct statistics with domain decomposition */
2949 grid
= &nbs
->grid
[0];
2951 fprintf(fp
,"nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
2952 nbl
->nsci
,nbl
->ncj4
,nbl
->nci_tot
,nbl
->nexcl
);
2953 fprintf(fp
,"nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
2954 nbl
->na_ci
,rl
,nbl
->nci_tot
,nbl
->nci_tot
/(double)grid
->nsubc_tot
,
2955 nbl
->nci_tot
/(double)grid
->nsubc_tot
*grid
->na_c
,
2956 nbl
->nci_tot
/(double)grid
->nsubc_tot
*grid
->na_c
/(0.5*4.0/3.0*M_PI
*rl
*rl
*rl
*grid
->nsubc_tot
*grid
->na_c
/det(nbs
->box
)));
2958 fprintf(fp
,"nbl average j super cell list length %.1f\n",
2959 0.25*nbl
->ncj4
/(double)nbl
->nsci
);
2960 fprintf(fp
,"nbl average i sub cell list length %.1f\n",
2961 nbl
->nci_tot
/(0.25*nbl
->ncj4
));
2963 for(si
=0; si
<=GPU_NSUBCELL
; si
++)
2967 for(i
=0; i
<nbl
->nsci
; i
++)
2969 for(j4
=nbl
->sci
[i
].cj4_ind_start
; j4
<nbl
->sci
[i
].cj4_ind_end
; j4
++)
2974 for(si
=0; si
<GPU_NSUBCELL
; si
++)
2976 if (nbl
->cj4
[j4
].imei
[0].imask
& (1U << (j
*GPU_NSUBCELL
+ si
)))
2985 for(b
=0; b
<=GPU_NSUBCELL
; b
++)
2987 fprintf(fp
,"nbl j-list #i-subcell %d %7d %4.1f\n",
2988 b
,c
[b
],100.0*c
[b
]/(double)(nbl
->ncj4
*NBNXN_GPU_JGROUP_SIZE
));
2992 /* Print the full pair list, used for debug output */
2993 static void print_supersub_nsp(const char *fn
,
2994 const nbnxn_pairlist_t
*nbl
,
3001 sprintf(buf
,"%s_%s.xvg",fn
,NONLOCAL_I(iloc
) ? "nl" : "l");
3002 fp
= ffopen(buf
,"w");
3004 for(i
=0; i
<nbl
->nci
; i
++)
3007 for(j4
=nbl
->sci
[i
].cj4_ind_start
; j4
<nbl
->sci
[i
].cj4_ind_end
; j4
++)
3009 for(p
=0; p
<NBNXN_GPU_JGROUP_SIZE
*GPU_NSUBCELL
; p
++)
3011 nsp
+= (nbl
->cj4
[j4
].imei
[0].imask
>> p
) & 1;
3014 fprintf(fp
,"%4d %3d %3d\n",
3017 nbl
->sci
[i
].cj4_ind_end
-nbl
->sci
[i
].cj4_ind_start
);
3023 /* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp */
3024 static void low_get_nbl_exclusions(nbnxn_pairlist_t
*nbl
,int cj4
,
3025 int warp
,nbnxn_excl_t
**excl
)
3027 if (nbl
->cj4
[cj4
].imei
[warp
].excl_ind
== 0)
3029 /* No exclusions set, make a new list entry */
3030 nbl
->cj4
[cj4
].imei
[warp
].excl_ind
= nbl
->nexcl
;
3032 *excl
= &nbl
->excl
[nbl
->cj4
[cj4
].imei
[warp
].excl_ind
];
3033 set_no_excls(*excl
);
3037 /* We already have some exclusions, new ones can be added to the list */
3038 *excl
= &nbl
->excl
[nbl
->cj4
[cj4
].imei
[warp
].excl_ind
];
3042 /* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp,
3043 * allocates extra memory, if necessary.
3045 static void get_nbl_exclusions_1(nbnxn_pairlist_t
*nbl
,int cj4
,
3046 int warp
,nbnxn_excl_t
**excl
)
3048 if (nbl
->cj4
[cj4
].imei
[warp
].excl_ind
== 0)
3050 /* We need to make a new list entry, check if we have space */
3051 check_excl_space(nbl
,1);
3053 low_get_nbl_exclusions(nbl
,cj4
,warp
,excl
);
3056 /* Returns pointers to the exclusion mask for cj4-unit cj4 for both warps,
3057 * allocates extra memory, if necessary.
3059 static void get_nbl_exclusions_2(nbnxn_pairlist_t
*nbl
,int cj4
,
3060 nbnxn_excl_t
**excl_w0
,
3061 nbnxn_excl_t
**excl_w1
)
3063 /* Check for space we might need */
3064 check_excl_space(nbl
,2);
3066 low_get_nbl_exclusions(nbl
,cj4
,0,excl_w0
);
3067 low_get_nbl_exclusions(nbl
,cj4
,1,excl_w1
);
3070 /* Sets the self exclusions i=j and pair exclusions i>j */
3071 static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t
*nbl
,
3072 int cj4_ind
,int sj_offset
,
3075 nbnxn_excl_t
*excl
[2];
3078 /* Here we only set the set self and double pair exclusions */
3080 get_nbl_exclusions_2(nbl
,cj4_ind
,&excl
[0],&excl
[1]);
3082 /* Only minor < major bits set */
3083 for(ej
=0; ej
<nbl
->na_ci
; ej
++)
3086 for(ei
=ej
; ei
<nbl
->na_ci
; ei
++)
3088 excl
[w
]->pair
[(ej
&(4-1))*nbl
->na_ci
+ei
] &=
3089 ~(1U << (sj_offset
*GPU_NSUBCELL
+si
));
3094 /* Returns a diagonal or off-diagonal interaction mask for plain C lists */
3095 static unsigned int get_imask(gmx_bool rdiag
,int ci
,int cj
)
3097 return (rdiag
&& ci
== cj
? NBNXN_INT_MASK_DIAG
: NBNXN_INT_MASK_ALL
);
3100 #ifdef NBNXN_SEARCH_SSE
3101 /* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists */
3102 static unsigned int get_imask_x86_simd128(gmx_bool rdiag
,int ci
,int cj
)
3104 #ifndef GMX_DOUBLE /* cj-size = 4 */
3105 return (rdiag
&& ci
== cj
? NBNXN_INT_MASK_DIAG
: NBNXN_INT_MASK_ALL
);
3106 #else /* cj-size = 2 */
3107 return (rdiag
&& ci
*2 == cj
? NBNXN_INT_MASK_DIAG_J2_0
:
3108 (rdiag
&& ci
*2+1 == cj
? NBNXN_INT_MASK_DIAG_J2_1
:
3109 NBNXN_INT_MASK_ALL
));
3113 #ifdef GMX_X86_AVX_256
3114 /* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists */
3115 static unsigned int get_imask_x86_simd256(gmx_bool rdiag
,int ci
,int cj
)
3117 #ifndef GMX_DOUBLE /* cj-size = 8 */
3118 return (rdiag
&& ci
== cj
*2 ? NBNXN_INT_MASK_DIAG_J8_0
:
3119 (rdiag
&& ci
== cj
*2+1 ? NBNXN_INT_MASK_DIAG_J8_1
:
3120 NBNXN_INT_MASK_ALL
));
3121 #else /* cj-size = 2 */
3122 return (rdiag
&& ci
== cj
? NBNXN_INT_MASK_DIAG
: NBNXN_INT_MASK_ALL
);
3126 #endif /* NBNXN_SEARCH_SSE */
3128 /* Plain C code for making a pair list of cell ci vs cell cjf-cjl.
3129 * Checks bounding box distances and possibly atom pair distances.
3131 static void make_cluster_list_simple(const nbnxn_grid_t
*gridj
,
3132 nbnxn_pairlist_t
*nbl
,
3133 int ci
,int cjf
,int cjl
,
3134 gmx_bool remove_sub_diag
,
3136 real rl2
,float rbb2
,
3139 const nbnxn_list_work_t
*work
;
3146 int cjf_gl
,cjl_gl
,cj
;
3150 bb_ci
= nbl
->work
->bb_ci
;
3151 x_ci
= nbl
->work
->x_ci
;
3154 while (!InRange
&& cjf
<= cjl
)
3156 d2
= subc_bb_dist2(0,bb_ci
,cjf
,gridj
->bb
);
3159 /* Check if the distance is within the distance where
3160 * we use only the bounding box distance rbb,
3161 * or within the cut-off and there is at least one atom pair
3162 * within the cut-off.
3172 cjf_gl
= gridj
->cell0
+ cjf
;
3173 for(i
=0; i
<NBNXN_CPU_CLUSTER_I_SIZE
&& !InRange
; i
++)
3175 for(j
=0; j
<NBNXN_CPU_CLUSTER_I_SIZE
; j
++)
3177 InRange
= InRange
||
3178 (sqr(x_ci
[i
*STRIDE_XYZ
+XX
] - x_j
[(cjf_gl
*NBNXN_CPU_CLUSTER_I_SIZE
+j
)*STRIDE_XYZ
+XX
]) +
3179 sqr(x_ci
[i
*STRIDE_XYZ
+YY
] - x_j
[(cjf_gl
*NBNXN_CPU_CLUSTER_I_SIZE
+j
)*STRIDE_XYZ
+YY
]) +
3180 sqr(x_ci
[i
*STRIDE_XYZ
+ZZ
] - x_j
[(cjf_gl
*NBNXN_CPU_CLUSTER_I_SIZE
+j
)*STRIDE_XYZ
+ZZ
]) < rl2
);
3183 *ndistc
+= NBNXN_CPU_CLUSTER_I_SIZE
*NBNXN_CPU_CLUSTER_I_SIZE
;
3196 while (!InRange
&& cjl
> cjf
)
3198 d2
= subc_bb_dist2(0,bb_ci
,cjl
,gridj
->bb
);
3201 /* Check if the distance is within the distance where
3202 * we use only the bounding box distance rbb,
3203 * or within the cut-off and there is at least one atom pair
3204 * within the cut-off.
3214 cjl_gl
= gridj
->cell0
+ cjl
;
3215 for(i
=0; i
<NBNXN_CPU_CLUSTER_I_SIZE
&& !InRange
; i
++)
3217 for(j
=0; j
<NBNXN_CPU_CLUSTER_I_SIZE
; j
++)
3219 InRange
= InRange
||
3220 (sqr(x_ci
[i
*STRIDE_XYZ
+XX
] - x_j
[(cjl_gl
*NBNXN_CPU_CLUSTER_I_SIZE
+j
)*STRIDE_XYZ
+XX
]) +
3221 sqr(x_ci
[i
*STRIDE_XYZ
+YY
] - x_j
[(cjl_gl
*NBNXN_CPU_CLUSTER_I_SIZE
+j
)*STRIDE_XYZ
+YY
]) +
3222 sqr(x_ci
[i
*STRIDE_XYZ
+ZZ
] - x_j
[(cjl_gl
*NBNXN_CPU_CLUSTER_I_SIZE
+j
)*STRIDE_XYZ
+ZZ
]) < rl2
);
3225 *ndistc
+= NBNXN_CPU_CLUSTER_I_SIZE
*NBNXN_CPU_CLUSTER_I_SIZE
;
3235 for(cj
=cjf
; cj
<=cjl
; cj
++)
3237 /* Store cj and the interaction mask */
3238 nbl
->cj
[nbl
->ncj
].cj
= gridj
->cell0
+ cj
;
3239 nbl
->cj
[nbl
->ncj
].excl
= get_imask(remove_sub_diag
,ci
,cj
);
3242 /* Increase the closing index in i super-cell list */
3243 nbl
->ci
[nbl
->nci
].cj_ind_end
= nbl
->ncj
;
3247 #ifdef NBNXN_SEARCH_SSE
3248 /* Include make_cluster_list_x86_simd128/256 */
3249 #define GMX_MM128_HERE
3250 #include "gmx_x86_simd_macros.h"
3251 #define STRIDE_S PACK_X4
3252 #include "nbnxn_search_x86_simd.h"
3254 #undef GMX_MM128_HERE
3255 #ifdef GMX_X86_AVX_256
3256 /* Include make_cluster_list_x86_simd128/256 */
3257 #define GMX_MM256_HERE
3258 #include "gmx_x86_simd_macros.h"
3259 #define STRIDE_S GMX_X86_SIMD_WIDTH_HERE
3260 #include "nbnxn_search_x86_simd.h"
3262 #undef GMX_MM256_HERE
3266 /* Plain C or SSE code for making a pair list of super-cell sci vs scj.
3267 * Checks bounding box distances and possibly atom pair distances.
3269 static void make_cluster_list_supersub(const nbnxn_search_t nbs
,
3270 const nbnxn_grid_t
*gridi
,
3271 const nbnxn_grid_t
*gridj
,
3272 nbnxn_pairlist_t
*nbl
,
3274 gmx_bool sci_equals_scj
,
3275 int stride
,const real
*x
,
3276 real rl2
,float rbb2
,
3281 int cjo
,ci1
,ci
,cj
,cj_gl
;
3282 int cj4_ind
,cj_offset
;
3289 #define PRUNE_LIST_CPU_ONE
3290 #ifdef PRUNE_LIST_CPU_ONE
3294 d2l
= nbl
->work
->d2
;
3296 bb_ci
= nbl
->work
->bb_ci
;
3297 x_ci
= nbl
->work
->x_ci
;
3301 for(cjo
=0; cjo
<gridj
->nsubc
[scj
]; cjo
++)
3303 cj4_ind
= (nbl
->work
->cj_ind
>> 2);
3304 cj_offset
= nbl
->work
->cj_ind
- cj4_ind
*NBNXN_GPU_JGROUP_SIZE
;
3305 cj4
= &nbl
->cj4
[cj4_ind
];
3307 cj
= scj
*GPU_NSUBCELL
+ cjo
;
3309 cj_gl
= gridj
->cell0
*GPU_NSUBCELL
+ cj
;
3311 /* Initialize this j-subcell i-subcell list */
3312 cj4
->cj
[cj_offset
] = cj_gl
;
3321 ci1
= gridi
->nsubc
[sci
];
3325 /* Determine all ci1 bb distances in one call with SSE */
3326 subc_bb_dist2_sse_xxxx(gridj
->bb
+(cj
>>STRIDE_8BB_2LOG
)*NNBSBB_XXXX
+(cj
& (STRIDE_8BB
-1)),
3332 /* We use a fixed upper-bound instead of ci1 to help optimization */
3333 for(ci
=0; ci
<GPU_NSUBCELL
; ci
++)
3340 #ifndef NBNXN_BBXXXX
3341 /* Determine the bb distance between ci and cj */
3342 d2l
[ci
] = subc_bb_dist2(ci
,bb_ci
,cj
,gridj
->bb
);
3347 #ifdef PRUNE_LIST_CPU_ALL
3348 /* Check if the distance is within the distance where
3349 * we use only the bounding box distance rbb,
3350 * or within the cut-off and there is at least one atom pair
3351 * within the cut-off. This check is very costly.
3353 *ndistc
+= na_c
*na_c
;
3355 (d2
< rl2
&& subc_in_range_x(na_c
,ci
,x_ci
,cj_gl
,stride
,x
,rl2
)))
3357 /* Check if the distance between the two bounding boxes
3358 * in within the pair-list cut-off.
3363 /* Flag this i-subcell to be taken into account */
3364 imask
|= (1U << (cj_offset
*GPU_NSUBCELL
+ci
));
3366 #ifdef PRUNE_LIST_CPU_ONE
3374 #ifdef PRUNE_LIST_CPU_ONE
3375 /* If we only found 1 pair, check if any atoms are actually
3376 * within the cut-off, so we could get rid of it.
3378 if (npair
== 1 && d2l
[ci_last
] >= rbb2
)
3380 /* Avoid using function pointers here, as it's slower */
3382 #ifdef NBNXN_8BB_SSE
3387 (na_c
,ci_last
,x_ci
,cj_gl
,stride
,x
,rl2
))
3389 imask
&= ~(1U << (cj_offset
*GPU_NSUBCELL
+ci_last
));
3397 /* We have a useful sj entry, close it now */
3399 /* Set the exclucions for the ci== sj entry.
3400 * Here we don't bother to check if this entry is actually flagged,
3401 * as it will nearly always be in the list.
3405 set_self_and_newton_excls_supersub(nbl
,cj4_ind
,cj_offset
,cjo
);
3408 /* Copy the cluster interaction mask to the list */
3409 for(w
=0; w
<NWARP
; w
++)
3411 cj4
->imei
[w
].imask
|= imask
;
3414 nbl
->work
->cj_ind
++;
3416 /* Keep the count */
3417 nbl
->nci_tot
+= npair
;
3419 /* Increase the closing index in i super-cell list */
3420 nbl
->sci
[nbl
->nsci
].cj4_ind_end
= ((nbl
->work
->cj_ind
+4-1)>>2);
3425 /* Set all atom-pair exclusions from the topology stored in excl
3426 * as masks in the pair-list for simple list i-entry nbl_ci
3428 static void set_ci_top_excls(const nbnxn_search_t nbs
,
3429 nbnxn_pairlist_t
*nbl
,
3430 gmx_bool diagRemoved
,
3433 const nbnxn_ci_t
*nbl_ci
,
3434 const t_blocka
*excl
)
3438 int cj_ind_first
,cj_ind_last
;
3439 int cj_first
,cj_last
;
3441 int i
,ai
,aj
,si
,eind
,ge
,se
;
3442 int found
,cj_ind_0
,cj_ind_1
,cj_ind_m
;
3446 nbnxn_excl_t
*nbl_excl
;
3447 int inner_i
,inner_e
;
3451 if (nbl_ci
->cj_ind_end
== nbl_ci
->cj_ind_start
)
3459 cj_ind_first
= nbl_ci
->cj_ind_start
;
3460 cj_ind_last
= nbl
->ncj
- 1;
3462 cj_first
= nbl
->cj
[cj_ind_first
].cj
;
3463 cj_last
= nbl
->cj
[cj_ind_last
].cj
;
3465 /* Determine how many contiguous j-cells we have starting
3466 * from the first i-cell. This number can be used to directly
3467 * calculate j-cell indices for excluded atoms.
3470 if (na_ci_2log
== na_cj_2log
)
3472 while (cj_ind_first
+ ndirect
<= cj_ind_last
&&
3473 nbl
->cj
[cj_ind_first
+ndirect
].cj
== ci
+ ndirect
)
3478 #ifdef NBNXN_SEARCH_SSE
3481 while (cj_ind_first
+ ndirect
<= cj_ind_last
&&
3482 nbl
->cj
[cj_ind_first
+ndirect
].cj
== ci_to_cj(na_cj_2log
,ci
) + ndirect
)
3489 /* Loop over the atoms in the i super-cell */
3490 for(i
=0; i
<nbl
->na_sc
; i
++)
3492 ai
= nbs
->a
[ci
*nbl
->na_sc
+i
];
3495 si
= (i
>>na_ci_2log
);
3497 /* Loop over the topology-based exclusions for this i-atom */
3498 for(eind
=excl
->index
[ai
]; eind
<excl
->index
[ai
+1]; eind
++)
3504 /* The self exclusion are already set, save some time */
3510 /* Without shifts we only calculate interactions j>i
3511 * for one-way pair-lists.
3513 if (diagRemoved
&& ge
<= ci
*nbl
->na_sc
+ i
)
3518 se
= (ge
>> na_cj_2log
);
3520 /* Could the cluster se be in our list? */
3521 if (se
>= cj_first
&& se
<= cj_last
)
3523 if (se
< cj_first
+ ndirect
)
3525 /* We can calculate cj_ind directly from se */
3526 found
= cj_ind_first
+ se
- cj_first
;
3530 /* Search for se using bisection */
3532 cj_ind_0
= cj_ind_first
+ ndirect
;
3533 cj_ind_1
= cj_ind_last
+ 1;
3534 while (found
== -1 && cj_ind_0
< cj_ind_1
)
3536 cj_ind_m
= (cj_ind_0
+ cj_ind_1
)>>1;
3538 cj_m
= nbl
->cj
[cj_ind_m
].cj
;
3546 cj_ind_1
= cj_ind_m
;
3550 cj_ind_0
= cj_ind_m
+ 1;
3557 inner_i
= i
- (si
<< na_ci_2log
);
3558 inner_e
= ge
- (se
<< na_cj_2log
);
3560 nbl
->cj
[found
].excl
&= ~(1U<<((inner_i
<<na_cj_2log
) + inner_e
));
3568 /* Set all atom-pair exclusions from the topology stored in excl
3569 * as masks in the pair-list for i-super-cell entry nbl_sci
3571 static void set_sci_top_excls(const nbnxn_search_t nbs
,
3572 nbnxn_pairlist_t
*nbl
,
3573 gmx_bool diagRemoved
,
3575 const nbnxn_sci_t
*nbl_sci
,
3576 const t_blocka
*excl
)
3581 int cj_ind_first
,cj_ind_last
;
3582 int cj_first
,cj_last
;
3584 int i
,ai
,aj
,si
,eind
,ge
,se
;
3585 int found
,cj_ind_0
,cj_ind_1
,cj_ind_m
;
3589 nbnxn_excl_t
*nbl_excl
;
3590 int inner_i
,inner_e
,w
;
3596 if (nbl_sci
->cj4_ind_end
== nbl_sci
->cj4_ind_start
)
3604 cj_ind_first
= nbl_sci
->cj4_ind_start
*NBNXN_GPU_JGROUP_SIZE
;
3605 cj_ind_last
= nbl
->work
->cj_ind
- 1;
3607 cj_first
= nbl
->cj4
[nbl_sci
->cj4_ind_start
].cj
[0];
3608 cj_last
= nbl_cj(nbl
,cj_ind_last
);
3610 /* Determine how many contiguous j-clusters we have starting
3611 * from the first i-cluster. This number can be used to directly
3612 * calculate j-cluster indices for excluded atoms.
3615 while (cj_ind_first
+ ndirect
<= cj_ind_last
&&
3616 nbl_cj(nbl
,cj_ind_first
+ndirect
) == sci
*GPU_NSUBCELL
+ ndirect
)
3621 /* Loop over the atoms in the i super-cell */
3622 for(i
=0; i
<nbl
->na_sc
; i
++)
3624 ai
= nbs
->a
[sci
*nbl
->na_sc
+i
];
3627 si
= (i
>>na_c_2log
);
3629 /* Loop over the topology-based exclusions for this i-atom */
3630 for(eind
=excl
->index
[ai
]; eind
<excl
->index
[ai
+1]; eind
++)
3636 /* The self exclusion are already set, save some time */
3642 /* Without shifts we only calculate interactions j>i
3643 * for one-way pair-lists.
3645 if (diagRemoved
&& ge
<= sci
*nbl
->na_sc
+ i
)
3651 /* Could the cluster se be in our list? */
3652 if (se
>= cj_first
&& se
<= cj_last
)
3654 if (se
< cj_first
+ ndirect
)
3656 /* We can calculate cj_ind directly from se */
3657 found
= cj_ind_first
+ se
- cj_first
;
3661 /* Search for se using bisection */
3663 cj_ind_0
= cj_ind_first
+ ndirect
;
3664 cj_ind_1
= cj_ind_last
+ 1;
3665 while (found
== -1 && cj_ind_0
< cj_ind_1
)
3667 cj_ind_m
= (cj_ind_0
+ cj_ind_1
)>>1;
3669 cj_m
= nbl_cj(nbl
,cj_ind_m
);
3677 cj_ind_1
= cj_ind_m
;
3681 cj_ind_0
= cj_ind_m
+ 1;
3688 inner_i
= i
- si
*na_c
;
3689 inner_e
= ge
- se
*na_c
;
3691 /* Macro for getting the index of atom a within a cluster */
3692 #define AMODI(a) ((a) & (NBNXN_CPU_CLUSTER_I_SIZE - 1))
3693 /* Macro for converting an atom number to a cluster number */
3694 #define A2CI(a) ((a) >> NBNXN_CPU_CLUSTER_I_SIZE_2LOG)
3696 if (nbl_imask0(nbl
,found
) & (1U << (AMODI(found
)*GPU_NSUBCELL
+ si
)))
3700 get_nbl_exclusions_1(nbl
,A2CI(found
),w
,&nbl_excl
);
3702 nbl_excl
->pair
[AMODI(inner_e
)*nbl
->na_ci
+inner_i
] &=
3703 ~(1U << (AMODI(found
)*GPU_NSUBCELL
+ si
));
3715 /* Reallocate the simple ci list for at least n entries */
3716 static void nb_realloc_ci(nbnxn_pairlist_t
*nbl
,int n
)
3718 nbl
->ci_nalloc
= over_alloc_small(n
);
3719 nb_realloc_void((void **)&nbl
->ci
,
3720 nbl
->nci
*sizeof(*nbl
->ci
),
3721 nbl
->ci_nalloc
*sizeof(*nbl
->ci
),
3722 nbl
->alloc
,nbl
->free
);
3725 /* Reallocate the super-cell sci list for at least n entries */
3726 static void nb_realloc_sci(nbnxn_pairlist_t
*nbl
,int n
)
3728 nbl
->sci_nalloc
= over_alloc_small(n
);
3729 nb_realloc_void((void **)&nbl
->sci
,
3730 nbl
->nsci
*sizeof(*nbl
->sci
),
3731 nbl
->sci_nalloc
*sizeof(*nbl
->sci
),
3732 nbl
->alloc
,nbl
->free
);
3735 /* Make a new ci entry at index nbl->nci */
3736 static void new_ci_entry(nbnxn_pairlist_t
*nbl
,int ci
,int shift
,int flags
,
3737 nbnxn_list_work_t
*work
)
3739 if (nbl
->nci
+ 1 > nbl
->ci_nalloc
)
3741 nb_realloc_ci(nbl
,nbl
->nci
+1);
3743 nbl
->ci
[nbl
->nci
].ci
= ci
;
3744 nbl
->ci
[nbl
->nci
].shift
= shift
;
3745 /* Store the interaction flags along with the shift */
3746 nbl
->ci
[nbl
->nci
].shift
|= flags
;
3747 nbl
->ci
[nbl
->nci
].cj_ind_start
= nbl
->ncj
;
3748 nbl
->ci
[nbl
->nci
].cj_ind_end
= nbl
->ncj
;
3751 /* Make a new sci entry at index nbl->nsci */
3752 static void new_sci_entry(nbnxn_pairlist_t
*nbl
,int sci
,int shift
,int flags
,
3753 nbnxn_list_work_t
*work
)
3755 if (nbl
->nsci
+ 1 > nbl
->sci_nalloc
)
3757 nb_realloc_sci(nbl
,nbl
->nsci
+1);
3759 nbl
->sci
[nbl
->nsci
].sci
= sci
;
3760 nbl
->sci
[nbl
->nsci
].shift
= shift
;
3761 nbl
->sci
[nbl
->nsci
].cj4_ind_start
= nbl
->ncj4
;
3762 nbl
->sci
[nbl
->nsci
].cj4_ind_end
= nbl
->ncj4
;
3765 /* Sort the simple j-list cj on exclusions.
3766 * Entries with exclusions will all be sorted to the beginning of the list.
3768 static void sort_cj_excl(nbnxn_cj_t
*cj
,int ncj
,
3769 nbnxn_list_work_t
*work
)
3773 if (ncj
> work
->cj_nalloc
)
3775 work
->cj_nalloc
= over_alloc_large(ncj
);
3776 srenew(work
->cj
,work
->cj_nalloc
);
3779 /* Make a list of the j-cells involving exclusions */
3781 for(j
=0; j
<ncj
; j
++)
3783 if (cj
[j
].excl
!= NBNXN_INT_MASK_ALL
)
3785 work
->cj
[jnew
++] = cj
[j
];
3788 /* Check if there are exclusions at all or not just the first entry */
3789 if (!((jnew
== 0) ||
3790 (jnew
== 1 && cj
[0].excl
!= NBNXN_INT_MASK_ALL
)))
3792 for(j
=0; j
<ncj
; j
++)
3794 if (cj
[j
].excl
== NBNXN_INT_MASK_ALL
)
3796 work
->cj
[jnew
++] = cj
[j
];
3799 for(j
=0; j
<ncj
; j
++)
3801 cj
[j
] = work
->cj
[j
];
3806 /* Close this simple list i entry */
3807 static void close_ci_entry_simple(nbnxn_pairlist_t
*nbl
)
3811 /* All content of the new ci entry have already been filled correctly,
3812 * we only need to increase the count here (for non empty lists).
3814 jlen
= nbl
->ci
[nbl
->nci
].cj_ind_end
- nbl
->ci
[nbl
->nci
].cj_ind_start
;
3817 sort_cj_excl(nbl
->cj
+nbl
->ci
[nbl
->nci
].cj_ind_start
,jlen
,nbl
->work
);
3819 if (nbl
->ci
[nbl
->nci
].shift
& NBNXN_CI_HALF_LJ(0))
3821 nbl
->work
->ncj_hlj
+= jlen
;
3823 else if (!(nbl
->ci
[nbl
->nci
].shift
& NBNXN_CI_DO_COUL(0)))
3825 nbl
->work
->ncj_noq
+= jlen
;
3832 /* Split sci entry for load balancing on the GPU.
3833 * As we only now the current count on our own thread,
3834 * we will need to estimate the current total amount of i-entries.
3835 * As the lists get concatenated later, this estimate depends
3836 * both on nthread and our own thread index thread.
3838 static void split_sci_entry(nbnxn_pairlist_t
*nbl
,
3839 int nsp_max_av
,gmx_bool progBal
,int nc_bal
,
3840 int thread
,int nthread
)
3844 int cj4_start
,cj4_end
,j4len
,cj4
;
3846 int nsp
,nsp_sci
,nsp_cj4
,nsp_cj4_e
,nsp_cj4_p
;
3849 /* Estimate the total numbers of ci's of the nblist combined
3850 * over all threads using the target number of ci's.
3852 nsci_est
= nc_bal
*thread
/nthread
+ nbl
->nsci
;
3855 /* The first ci blocks should be larger, to avoid overhead.
3856 * The last ci blocks should be smaller, to improve load balancing.
3859 nsp_max_av
*nc_bal
*3/(2*(nsci_est
- 1 + nc_bal
)));
3863 nsp_max
= nsp_max_av
;
3866 cj4_start
= nbl
->sci
[nbl
->nsci
-1].cj4_ind_start
;
3867 cj4_end
= nbl
->sci
[nbl
->nsci
-1].cj4_ind_end
;
3868 j4len
= cj4_end
- cj4_start
;
3870 if (j4len
> 1 && j4len
*GPU_NSUBCELL
*NBNXN_GPU_JGROUP_SIZE
> nsp_max
)
3872 /* Remove the last ci entry and process the cj4's again */
3881 while (cj4
< cj4_end
)
3883 nsp_cj4_p
= nsp_cj4
;
3885 for(p
=0; p
<GPU_NSUBCELL
*NBNXN_GPU_JGROUP_SIZE
; p
++)
3887 nsp_cj4
+= (nbl
->cj4
[cj4
].imei
[0].imask
>> p
) & 1;
3891 if (nsp
> nsp_max
&& nsp
> nsp_cj4
)
3893 nbl
->sci
[sci
].cj4_ind_end
= cj4
;
3896 if (nbl
->nsci
+1 > nbl
->sci_nalloc
)
3898 nb_realloc_sci(nbl
,nbl
->nsci
+1);
3900 nbl
->sci
[sci
].sci
= nbl
->sci
[nbl
->nsci
-1].sci
;
3901 nbl
->sci
[sci
].shift
= nbl
->sci
[nbl
->nsci
-1].shift
;
3902 nbl
->sci
[sci
].cj4_ind_start
= cj4
;
3903 nsp_sci
= nsp
- nsp_cj4
;
3904 nsp_cj4_e
= nsp_cj4_p
;
3911 /* Put the remaining cj4's in a new ci entry */
3912 nbl
->sci
[sci
].cj4_ind_end
= cj4_end
;
3914 /* Possibly balance out the last two ci's
3915 * by moving the last cj4 of the second last ci.
3917 if (nsp_sci
- nsp_cj4_e
>= nsp
+ nsp_cj4_e
)
3919 nbl
->sci
[sci
-1].cj4_ind_end
--;
3920 nbl
->sci
[sci
].cj4_ind_start
--;
3928 /* Clost this super/sub list i entry */
3929 static void close_ci_entry_supersub(nbnxn_pairlist_t
*nbl
,
3931 gmx_bool progBal
,int nc_bal
,
3932 int thread
,int nthread
)
3937 /* All content of the new ci entry have already been filled correctly,
3938 * we only need to increase the count here (for non empty lists).
3940 j4len
= nbl
->sci
[nbl
->nsci
].cj4_ind_end
- nbl
->sci
[nbl
->nsci
].cj4_ind_start
;
3943 /* We can only have complete blocks of 4 j-entries in a list,
3944 * so round the count up before closing.
3946 nbl
->ncj4
= ((nbl
->work
->cj_ind
+ 4-1) >> 2);
3947 nbl
->work
->cj_ind
= nbl
->ncj4
*NBNXN_GPU_JGROUP_SIZE
;
3953 split_sci_entry(nbl
,nsp_max_av
,progBal
,nc_bal
,thread
,nthread
);
3958 /* Syncs the working array before adding another grid pair to the list */
3959 static void sync_work(nbnxn_pairlist_t
*nbl
)
3963 nbl
->work
->cj_ind
= nbl
->ncj4
*NBNXN_GPU_JGROUP_SIZE
;
3964 nbl
->work
->cj4_init
= nbl
->ncj4
;
3968 /* Clears an nbnxn_pairlist_t data structure */
3969 static void clear_pairlist(nbnxn_pairlist_t
*nbl
)
3978 nbl
->work
->ncj_noq
= 0;
3979 nbl
->work
->ncj_hlj
= 0;
3982 /* Sets a simple list i-cell bounding box, including PBC shift */
3983 static void set_icell_bb_simple(const float *bb
,int ci
,
3984 real shx
,real shy
,real shz
,
3990 bb_ci
[BBL_X
] = bb
[ia
+BBL_X
] + shx
;
3991 bb_ci
[BBL_Y
] = bb
[ia
+BBL_Y
] + shy
;
3992 bb_ci
[BBL_Z
] = bb
[ia
+BBL_Z
] + shz
;
3993 bb_ci
[BBU_X
] = bb
[ia
+BBU_X
] + shx
;
3994 bb_ci
[BBU_Y
] = bb
[ia
+BBU_Y
] + shy
;
3995 bb_ci
[BBU_Z
] = bb
[ia
+BBU_Z
] + shz
;
3998 /* Sets a super-cell and sub cell bounding boxes, including PBC shift */
3999 static void set_icell_bb_supersub(const float *bb
,int ci
,
4000 real shx
,real shy
,real shz
,
4006 ia
= ci
*(GPU_NSUBCELL
>>STRIDE_8BB_2LOG
)*NNBSBB_XXXX
;
4007 for(m
=0; m
<(GPU_NSUBCELL
>>STRIDE_8BB_2LOG
)*NNBSBB_XXXX
; m
+=NNBSBB_XXXX
)
4009 for(i
=0; i
<STRIDE_8BB
; i
++)
4011 bb_ci
[m
+0*STRIDE_8BB
+i
] = bb
[ia
+m
+0*STRIDE_8BB
+i
] + shx
;
4012 bb_ci
[m
+1*STRIDE_8BB
+i
] = bb
[ia
+m
+1*STRIDE_8BB
+i
] + shy
;
4013 bb_ci
[m
+2*STRIDE_8BB
+i
] = bb
[ia
+m
+2*STRIDE_8BB
+i
] + shz
;
4014 bb_ci
[m
+3*STRIDE_8BB
+i
] = bb
[ia
+m
+3*STRIDE_8BB
+i
] + shx
;
4015 bb_ci
[m
+4*STRIDE_8BB
+i
] = bb
[ia
+m
+4*STRIDE_8BB
+i
] + shy
;
4016 bb_ci
[m
+5*STRIDE_8BB
+i
] = bb
[ia
+m
+5*STRIDE_8BB
+i
] + shz
;
4020 ia
= ci
*GPU_NSUBCELL
*NNBSBB_B
;
4021 for(i
=0; i
<GPU_NSUBCELL
*NNBSBB_B
; i
+=NNBSBB_B
)
4023 bb_ci
[BBL_X
] = bb
[ia
+BBL_X
] + shx
;
4024 bb_ci
[BBL_Y
] = bb
[ia
+BBL_Y
] + shy
;
4025 bb_ci
[BBL_Z
] = bb
[ia
+BBL_Z
] + shz
;
4026 bb_ci
[BBU_X
] = bb
[ia
+BBU_X
] + shx
;
4027 bb_ci
[BBU_Y
] = bb
[ia
+BBU_Y
] + shy
;
4028 bb_ci
[BBU_Z
] = bb
[ia
+BBU_Z
] + shz
;
4033 /* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
4034 static void icell_set_x_simple(int ci
,
4035 real shx
,real shy
,real shz
,
4037 int stride
,const real
*x
,
4038 nbnxn_list_work_t
*work
)
4042 ia
= ci
*NBNXN_CPU_CLUSTER_I_SIZE
;
4044 for(i
=0; i
<NBNXN_CPU_CLUSTER_I_SIZE
; i
++)
4046 work
->x_ci
[i
*STRIDE_XYZ
+XX
] = x
[(ia
+i
)*stride
+XX
] + shx
;
4047 work
->x_ci
[i
*STRIDE_XYZ
+YY
] = x
[(ia
+i
)*stride
+YY
] + shy
;
4048 work
->x_ci
[i
*STRIDE_XYZ
+ZZ
] = x
[(ia
+i
)*stride
+ZZ
] + shz
;
4052 /* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
4053 static void icell_set_x_supersub(int ci
,
4054 real shx
,real shy
,real shz
,
4056 int stride
,const real
*x
,
4057 nbnxn_list_work_t
*work
)
4064 ia
= ci
*GPU_NSUBCELL
*na_c
;
4065 for(i
=0; i
<GPU_NSUBCELL
*na_c
; i
++)
4067 x_ci
[i
*DIM
+ XX
] = x
[(ia
+i
)*stride
+ XX
] + shx
;
4068 x_ci
[i
*DIM
+ YY
] = x
[(ia
+i
)*stride
+ YY
] + shy
;
4069 x_ci
[i
*DIM
+ ZZ
] = x
[(ia
+i
)*stride
+ ZZ
] + shz
;
4073 #ifdef NBNXN_SEARCH_SSE
4074 /* Copies PBC shifted super-cell packed atom coordinates to working array */
4075 static void icell_set_x_supersub_sse8(int ci
,
4076 real shx
,real shy
,real shz
,
4078 int stride
,const real
*x
,
4079 nbnxn_list_work_t
*work
)
4086 for(si
=0; si
<GPU_NSUBCELL
; si
++)
4088 for(i
=0; i
<na_c
; i
+=STRIDE_8BB
)
4091 ia
= ci
*GPU_NSUBCELL
*na_c
+ io
;
4092 for(j
=0; j
<STRIDE_8BB
; j
++)
4094 x_ci
[io
*DIM
+ j
+ XX
*STRIDE_8BB
] = x
[(ia
+j
)*stride
+XX
] + shx
;
4095 x_ci
[io
*DIM
+ j
+ YY
*STRIDE_8BB
] = x
[(ia
+j
)*stride
+YY
] + shy
;
4096 x_ci
[io
*DIM
+ j
+ ZZ
*STRIDE_8BB
] = x
[(ia
+j
)*stride
+ZZ
] + shz
;
4103 static real nbnxn_rlist_inc_nonloc_fac
= 0.6;
4105 /* Due to the cluster size the effective pair-list is longer than
4106 * that of a simple atom pair-list. This function gives the extra distance.
4108 real
nbnxn_get_rlist_effective_inc(int cluster_size
,real atom_density
)
4110 return ((0.5 + nbnxn_rlist_inc_nonloc_fac
)*sqr(((cluster_size
) - 1.0)/(cluster_size
))*pow((cluster_size
)/(atom_density
),1.0/3.0));
4113 /* Estimates the interaction volume^2 for non-local interactions */
4114 static real
nonlocal_vol2(const gmx_domdec_zones_t
*zones
,rvec ls
,real r
)
4123 /* Here we simply add up the volumes of 1, 2 or 3 1D decomposition
4124 * not home interaction volume^2. As these volumes are not additive,
4125 * this is an overestimate, but it would only be significant in the limit
4126 * of small cells, where we anyhow need to split the lists into
4127 * as small parts as possible.
4130 for(z
=0; z
<zones
->n
; z
++)
4132 if (zones
->shift
[z
][XX
] + zones
->shift
[z
][YY
] + zones
->shift
[z
][ZZ
] == 1)
4137 for(d
=0; d
<DIM
; d
++)
4139 if (zones
->shift
[z
][d
] == 0)
4143 za
*= zones
->size
[z
].x1
[d
] - zones
->size
[z
].x0
[d
];
4147 /* 4 octants of a sphere */
4148 vold_est
= 0.25*M_PI
*r
*r
*r
*r
;
4149 /* 4 quarter pie slices on the edges */
4150 vold_est
+= 4*cl
*M_PI
/6.0*r
*r
*r
;
4151 /* One rectangular volume on a face */
4152 vold_est
+= ca
*0.5*r
*r
;
4154 vol2_est_tot
+= vold_est
*za
;
4158 return vol2_est_tot
;
4161 /* Estimates the average size of a full j-list for super/sub setup */
4162 static int get_nsubpair_max(const nbnxn_search_t nbs
,
4165 int min_ci_balanced
)
4167 const nbnxn_grid_t
*grid
;
4169 real xy_diag2
,r_eff_sup
,vol_est
,nsp_est
,nsp_est_nl
;
4172 grid
= &nbs
->grid
[0];
4174 ls
[XX
] = (grid
->c1
[XX
] - grid
->c0
[XX
])/(grid
->ncx
*GPU_NSUBCELL_X
);
4175 ls
[YY
] = (grid
->c1
[YY
] - grid
->c0
[YY
])/(grid
->ncy
*GPU_NSUBCELL_Y
);
4176 ls
[ZZ
] = (grid
->c1
[ZZ
] - grid
->c0
[ZZ
])*grid
->ncx
*grid
->ncy
/(grid
->nc
*GPU_NSUBCELL_Z
);
4178 /* The average squared length of the diagonal of a sub cell */
4179 xy_diag2
= ls
[XX
]*ls
[XX
] + ls
[YY
]*ls
[YY
] + ls
[ZZ
]*ls
[ZZ
];
4181 /* The formulas below are a heuristic estimate of the average nsj per si*/
4182 r_eff_sup
= rlist
+ nbnxn_rlist_inc_nonloc_fac
*sqr((grid
->na_c
- 1.0)/grid
->na_c
)*sqrt(xy_diag2
/3);
4184 if (!nbs
->DomDec
|| nbs
->zones
->n
== 1)
4191 sqr(grid
->atom_density
/grid
->na_c
)*
4192 nonlocal_vol2(nbs
->zones
,ls
,r_eff_sup
);
4197 /* Sub-cell interacts with itself */
4198 vol_est
= ls
[XX
]*ls
[YY
]*ls
[ZZ
];
4199 /* 6/2 rectangular volume on the faces */
4200 vol_est
+= (ls
[XX
]*ls
[YY
] + ls
[XX
]*ls
[ZZ
] + ls
[YY
]*ls
[ZZ
])*r_eff_sup
;
4201 /* 12/2 quarter pie slices on the edges */
4202 vol_est
+= 2*(ls
[XX
] + ls
[YY
] + ls
[ZZ
])*0.25*M_PI
*sqr(r_eff_sup
);
4203 /* 4 octants of a sphere */
4204 vol_est
+= 0.5*4.0/3.0*M_PI
*pow(r_eff_sup
,3);
4206 nsp_est
= grid
->nsubc_tot
*vol_est
*grid
->atom_density
/grid
->na_c
;
4208 /* Subtract the non-local pair count */
4209 nsp_est
-= nsp_est_nl
;
4213 fprintf(debug
,"nsp_est local %5.1f non-local %5.1f\n",
4214 nsp_est
,nsp_est_nl
);
4219 nsp_est
= nsp_est_nl
;
4222 if (min_ci_balanced
<= 0 || grid
->nc
>= min_ci_balanced
|| grid
->nc
== 0)
4224 /* We don't need to worry */
4229 /* Thus the (average) maximum j-list size should be as follows */
4230 nsubpair_max
= max(1,(int)(nsp_est
/min_ci_balanced
+0.5));
4232 /* Since the target value is a maximum (this avoid high outliers,
4233 * which lead to load imbalance), not average, we get more lists
4234 * than we ask for (to compensate we need to add GPU_NSUBCELL*4/4).
4235 * But more importantly, the optimal GPU performance moves
4236 * to lower number of block for very small blocks.
4237 * To compensate we add the maximum pair count per cj4.
4239 nsubpair_max
+= GPU_NSUBCELL
*NBNXN_CPU_CLUSTER_I_SIZE
;
4244 fprintf(debug
,"nbl nsp estimate %.1f, nsubpair_max %d\n",
4245 nsp_est
,nsubpair_max
);
4248 return nsubpair_max
;
4251 /* Debug list print function */
4252 static void print_nblist_ci_cj(FILE *fp
,const nbnxn_pairlist_t
*nbl
)
4256 for(i
=0; i
<nbl
->nci
; i
++)
4258 fprintf(fp
,"ci %4d shift %2d ncj %3d\n",
4259 nbl
->ci
[i
].ci
,nbl
->ci
[i
].shift
,
4260 nbl
->ci
[i
].cj_ind_end
- nbl
->ci
[i
].cj_ind_start
);
4262 for(j
=nbl
->ci
[i
].cj_ind_start
; j
<nbl
->ci
[i
].cj_ind_end
; j
++)
4264 fprintf(fp
," cj %5d imask %x\n",
4271 /* Debug list print function */
4272 static void print_nblist_sci_cj(FILE *fp
,const nbnxn_pairlist_t
*nbl
)
4276 for(i
=0; i
<nbl
->nsci
; i
++)
4278 fprintf(fp
,"ci %4d shift %2d ncj4 %2d\n",
4279 nbl
->sci
[i
].sci
,nbl
->sci
[i
].shift
,
4280 nbl
->sci
[i
].cj4_ind_end
- nbl
->sci
[i
].cj4_ind_start
);
4282 for(j4
=nbl
->sci
[i
].cj4_ind_start
; j4
<nbl
->sci
[i
].cj4_ind_end
; j4
++)
4286 fprintf(fp
," sj %5d imask %x\n",
4288 nbl
->cj4
[j4
].imei
[0].imask
);
4294 /* Combine pair lists *nbl generated on multiple threads nblc */
4295 static void combine_nblists(int nnbl
,nbnxn_pairlist_t
**nbl
,
4296 nbnxn_pairlist_t
*nblc
)
4298 int nsci
,ncj4
,nexcl
;
4303 gmx_incons("combine_nblists does not support simple lists");
4308 nexcl
= nblc
->nexcl
;
4309 for(i
=0; i
<nnbl
; i
++)
4311 nsci
+= nbl
[i
]->nsci
;
4312 ncj4
+= nbl
[i
]->ncj4
;
4313 nexcl
+= nbl
[i
]->nexcl
;
4316 if (nsci
> nblc
->sci_nalloc
)
4318 nb_realloc_sci(nblc
,nsci
);
4320 if (ncj4
> nblc
->cj4_nalloc
)
4322 nblc
->cj4_nalloc
= over_alloc_small(ncj4
);
4323 nb_realloc_void((void **)&nblc
->cj4
,
4324 nblc
->ncj4
*sizeof(*nblc
->cj4
),
4325 nblc
->cj4_nalloc
*sizeof(*nblc
->cj4
),
4326 nblc
->alloc
,nblc
->free
);
4328 if (nexcl
> nblc
->excl_nalloc
)
4330 nblc
->excl_nalloc
= over_alloc_small(nexcl
);
4331 nb_realloc_void((void **)&nblc
->excl
,
4332 nblc
->nexcl
*sizeof(*nblc
->excl
),
4333 nblc
->excl_nalloc
*sizeof(*nblc
->excl
),
4334 nblc
->alloc
,nblc
->free
);
4337 /* Each thread should copy its own data to the combined arrays,
4338 * as otherwise data will go back and forth between different caches.
4340 #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
4341 for(n
=0; n
<nnbl
; n
++)
4348 const nbnxn_pairlist_t
*nbli
;
4350 /* Determine the offset in the combined data for our thread */
4351 sci_offset
= nblc
->nsci
;
4352 cj4_offset
= nblc
->ncj4
;
4353 ci_offset
= nblc
->nci_tot
;
4354 excl_offset
= nblc
->nexcl
;
4358 sci_offset
+= nbl
[i
]->nsci
;
4359 cj4_offset
+= nbl
[i
]->ncj4
;
4360 ci_offset
+= nbl
[i
]->nci_tot
;
4361 excl_offset
+= nbl
[i
]->nexcl
;
4366 for(i
=0; i
<nbli
->nsci
; i
++)
4368 nblc
->sci
[sci_offset
+i
] = nbli
->sci
[i
];
4369 nblc
->sci
[sci_offset
+i
].cj4_ind_start
+= cj4_offset
;
4370 nblc
->sci
[sci_offset
+i
].cj4_ind_end
+= cj4_offset
;
4373 for(j4
=0; j4
<nbli
->ncj4
; j4
++)
4375 nblc
->cj4
[cj4_offset
+j4
] = nbli
->cj4
[j4
];
4376 nblc
->cj4
[cj4_offset
+j4
].imei
[0].excl_ind
+= excl_offset
;
4377 nblc
->cj4
[cj4_offset
+j4
].imei
[1].excl_ind
+= excl_offset
;
4380 for(j4
=0; j4
<nbli
->nexcl
; j4
++)
4382 nblc
->excl
[excl_offset
+j4
] = nbli
->excl
[j4
];
4386 for(n
=0; n
<nnbl
; n
++)
4388 nblc
->nsci
+= nbl
[n
]->nsci
;
4389 nblc
->ncj4
+= nbl
[n
]->ncj4
;
4390 nblc
->nci_tot
+= nbl
[n
]->nci_tot
;
4391 nblc
->nexcl
+= nbl
[n
]->nexcl
;
4395 /* Returns the next ci to be processes by our thread */
4396 static gmx_bool
next_ci(const nbnxn_grid_t
*grid
,
4398 int nth
,int ci_block
,
4399 int *ci_x
,int *ci_y
,
4405 if (*ci_b
== ci_block
)
4407 /* Jump to the next block assigned to this task */
4408 *ci
+= (nth
- 1)*ci_block
;
4412 if (*ci
>= grid
->nc
*conv
)
4417 while (*ci
>= grid
->cxy_ind
[*ci_x
*grid
->ncy
+ *ci_y
+ 1]*conv
)
4420 if (*ci_y
== grid
->ncy
)
4430 /* Returns the distance^2 for which we put cell pairs in the list
4431 * without checking atom pair distances. This is usually < rlist^2.
4433 static float boundingbox_only_distance2(const nbnxn_grid_t
*gridi
,
4434 const nbnxn_grid_t
*gridj
,
4438 /* If the distance between two sub-cell bounding boxes is less
4439 * than this distance, do not check the distance between
4440 * all particle pairs in the sub-cell, since then it is likely
4441 * that the box pair has atom pairs within the cut-off.
4442 * We use the nblist cut-off minus 0.5 times the average x/y diagonal
4443 * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
4444 * Using more than 0.5 gains at most 0.5%.
4445 * If forces are calculated more than twice, the performance gain
4446 * in the force calculation outweighs the cost of checking.
4447 * Note that with subcell lists, the atom-pair distance check
4448 * is only performed when only 1 out of 8 sub-cells in within range,
4449 * this is because the GPU is much faster than the cpu.
4454 bbx
= 0.5*(gridi
->sx
+ gridj
->sx
);
4455 bby
= 0.5*(gridi
->sy
+ gridj
->sy
);
4458 bbx
/= GPU_NSUBCELL_X
;
4459 bby
/= GPU_NSUBCELL_Y
;
4462 rbb2
= sqr(max(0,rlist
- 0.5*sqrt(bbx
*bbx
+ bby
*bby
)));
4467 return (float)((1+GMX_FLOAT_EPS
)*rbb2
);
4471 /* Generates the part of pair-list nbl assigned to our thread */
4472 static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs
,
4473 const nbnxn_grid_t
*gridi
,
4474 const nbnxn_grid_t
*gridj
,
4475 nbnxn_search_work_t
*work
,
4476 const nbnxn_atomdata_t
*nbat
,
4477 const t_blocka
*excl
,
4482 int min_ci_balanced
,
4484 nbnxn_pairlist_t
*nbl
)
4491 int ci_block
,ci_b
,ci
,ci_x
,ci_y
,ci_xy
,cj
;
4498 const float *bb_i
,*bbcz_i
,*bbcz_j
;
4500 real bx0
,bx1
,by0
,by1
,bz0
,bz1
;
4502 real d2cx
,d2z
,d2z_cx
,d2z_cy
,d2zx
,d2zxy
,d2xy
;
4503 int cxf
,cxl
,cyf
,cyf_x
,cyl
;
4509 nbs_cycle_start(&work
->cc
[enbsCCsearch
]);
4511 if (gridj
->bSimple
!= nbl
->bSimple
)
4513 gmx_incons("Grid incompatible with pair-list");
4518 nbl
->na_sc
= gridj
->na_sc
;
4519 nbl
->na_ci
= gridj
->na_c
;
4520 nbl
->na_cj
= kernel_to_cj_size(nb_kernel_type
);
4521 na_cj_2log
= get_2log(nbl
->na_cj
);
4525 copy_mat(nbs
->box
,box
);
4527 rl2
= nbl
->rlist
*nbl
->rlist
;
4529 rbb2
= boundingbox_only_distance2(gridi
,gridj
,nbl
->rlist
,nbl
->bSimple
);
4533 fprintf(debug
,"nbl bounding box only distance %f\n",sqrt(rbb2
));
4536 /* Set the shift range */
4537 for(d
=0; d
<DIM
; d
++)
4539 /* Check if we need periodicity shifts.
4540 * Without PBC or with domain decomposition we don't need them.
4542 if (d
>= ePBC2npbcdim(nbs
->ePBC
) || nbs
->dd_dim
[d
])
4549 box
[XX
][XX
] - fabs(box
[YY
][XX
]) - fabs(box
[ZZ
][XX
]) < sqrt(rl2
))
4560 if (nbl
->bSimple
&& !gridi
->bSimple
)
4562 conv_i
= gridi
->na_sc
/gridj
->na_sc
;
4563 bb_i
= gridi
->bb_simple
;
4564 bbcz_i
= gridi
->bbcz_simple
;
4565 flags_i
= gridi
->flags_simple
;
4571 bbcz_i
= gridi
->bbcz
;
4572 flags_i
= gridi
->flags
;
4574 cell0_i
= gridi
->cell0
*conv_i
;
4576 bbcz_j
= gridj
->bbcz
;
4580 #define CI_BLOCK_ENUM 5
4581 #define CI_BLOCK_DENOM 11
4582 /* Here we decide how to distribute the blocks over the threads.
4583 * We use prime numbers to try to avoid that the grid size becomes
4584 * a multiple of the number of threads, which would lead to some
4585 * threads getting "inner" pairs and others getting boundary pairs,
4586 * which in turns will lead to load imbalance between threads.
4587 * Set the block size as 5/11/ntask times the average number of cells
4588 * in a y,z slab. This should ensure a quite uniform distribution
4589 * of the grid parts of the different thread along all three grid
4590 * zone boundaries with 3D domain decomposition. At the same time
4591 * the blocks will not become too small.
4593 ci_block
= (gridi
->nc
*CI_BLOCK_ENUM
)/(CI_BLOCK_DENOM
*gridi
->ncx
*nth
);
4595 /* Ensure the blocks are not too small: avoids cache invalidation */
4596 if (ci_block
*gridi
->na_sc
< 16)
4598 ci_block
= (16 + gridi
->na_sc
- 1)/gridi
->na_sc
;
4601 /* Without domain decomposition
4602 * or with less than 3 blocks per task, divide in nth blocks.
4604 if (!nbs
->DomDec
|| ci_block
*3*nth
> gridi
->nc
)
4606 ci_block
= (gridi
->nc
+ nth
- 1)/nth
;
4611 /* Blocks of the conversion factor - 1 give a large repeat count
4612 * combined with a small block size. This should result in good
4613 * load balancing for both small and large domains.
4615 ci_block
= conv_i
- 1;
4619 fprintf(debug
,"nbl nc_i %d col.av. %.1f ci_block %d\n",
4620 gridi
->nc
,gridi
->nc
/(double)(gridi
->ncx
*gridi
->ncy
),ci_block
);
4627 ci
= th
*ci_block
- 1;
4630 while (next_ci(gridi
,conv_i
,nth
,ci_block
,&ci_x
,&ci_y
,&ci_b
,&ci
))
4632 if (nbl
->bSimple
&& flags_i
[ci
] == 0)
4638 if (gridj
!= gridi
&& shp
[XX
] == 0)
4642 bx1
= bb_i
[ci
*NNBSBB_B
+NNBSBB_C
+XX
];
4646 bx1
= gridi
->c0
[XX
] + (ci_x
+1)*gridi
->sx
;
4648 if (bx1
< gridj
->c0
[XX
])
4650 d2cx
= sqr(gridj
->c0
[XX
] - bx1
);
4659 ci_xy
= ci_x
*gridi
->ncy
+ ci_y
;
4661 /* Loop over shift vectors in three dimensions */
4662 for (tz
=-shp
[ZZ
]; tz
<=shp
[ZZ
]; tz
++)
4664 shz
= tz
*box
[ZZ
][ZZ
];
4666 bz0
= bbcz_i
[ci
*NNBSBB_D
] + shz
;
4667 bz1
= bbcz_i
[ci
*NNBSBB_D
+1] + shz
;
4679 d2z
= sqr(bz0
- box
[ZZ
][ZZ
]);
4682 d2z_cx
= d2z
+ d2cx
;
4690 bz1
/((real
)(gridi
->cxy_ind
[ci_xy
+1] - gridi
->cxy_ind
[ci_xy
]));
4695 /* The check with bz1_frac close to or larger than 1 comes later */
4697 for (ty
=-shp
[YY
]; ty
<=shp
[YY
]; ty
++)
4699 shy
= ty
*box
[YY
][YY
] + tz
*box
[ZZ
][YY
];
4703 by0
= bb_i
[ci
*NNBSBB_B
+YY
] + shy
;
4704 by1
= bb_i
[ci
*NNBSBB_B
+NNBSBB_C
+YY
] + shy
;
4708 by0
= gridi
->c0
[YY
] + (ci_y
)*gridi
->sy
+ shy
;
4709 by1
= gridi
->c0
[YY
] + (ci_y
+1)*gridi
->sy
+ shy
;
4712 get_cell_range(by0
,by1
,
4713 gridj
->ncy
,gridj
->c0
[YY
],gridj
->sy
,gridj
->inv_sy
,
4723 if (by1
< gridj
->c0
[YY
])
4725 d2z_cy
+= sqr(gridj
->c0
[YY
] - by1
);
4727 else if (by0
> gridj
->c1
[YY
])
4729 d2z_cy
+= sqr(by0
- gridj
->c1
[YY
]);
4732 for (tx
=-shp
[XX
]; tx
<=shp
[XX
]; tx
++)
4734 shift
= XYZ2IS(tx
,ty
,tz
);
4736 #ifdef NBNXN_SHIFT_BACKWARD
4737 if (gridi
== gridj
&& shift
> CENTRAL
)
4743 shx
= tx
*box
[XX
][XX
] + ty
*box
[YY
][XX
] + tz
*box
[ZZ
][XX
];
4747 bx0
= bb_i
[ci
*NNBSBB_B
+XX
] + shx
;
4748 bx1
= bb_i
[ci
*NNBSBB_B
+NNBSBB_C
+XX
] + shx
;
4752 bx0
= gridi
->c0
[XX
] + (ci_x
)*gridi
->sx
+ shx
;
4753 bx1
= gridi
->c0
[XX
] + (ci_x
+1)*gridi
->sx
+ shx
;
4756 get_cell_range(bx0
,bx1
,
4757 gridj
->ncx
,gridj
->c0
[XX
],gridj
->sx
,gridj
->inv_sx
,
4768 new_ci_entry(nbl
,cell0_i
+ci
,shift
,flags_i
[ci
],
4773 new_sci_entry(nbl
,cell0_i
+ci
,shift
,flags_i
[ci
],
4777 #ifndef NBNXN_SHIFT_BACKWARD
4780 if (shift
== CENTRAL
&& gridi
== gridj
&&
4784 /* Leave the pairs with i > j.
4785 * x is the major index, so skip half of it.
4792 set_icell_bb_simple(bb_i
,ci
,shx
,shy
,shz
,
4797 set_icell_bb_supersub(bb_i
,ci
,shx
,shy
,shz
,
4801 nbs
->icell_set_x(cell0_i
+ci
,shx
,shy
,shz
,
4802 gridi
->na_c
,nbat
->xstride
,nbat
->x
,
4805 for(cx
=cxf
; cx
<=cxl
; cx
++)
4808 if (gridj
->c0
[XX
] + cx
*gridj
->sx
> bx1
)
4810 d2zx
+= sqr(gridj
->c0
[XX
] + cx
*gridj
->sx
- bx1
);
4812 else if (gridj
->c0
[XX
] + (cx
+1)*gridj
->sx
< bx0
)
4814 d2zx
+= sqr(gridj
->c0
[XX
] + (cx
+1)*gridj
->sx
- bx0
);
4817 #ifndef NBNXN_SHIFT_BACKWARD
4818 if (gridi
== gridj
&&
4819 cx
== 0 && cyf
< ci_y
)
4821 if (gridi
== gridj
&&
4822 cx
== 0 && shift
== CENTRAL
&& cyf
< ci_y
)
4825 /* Leave the pairs with i > j.
4826 * Skip half of y when i and j have the same x.
4835 for(cy
=cyf_x
; cy
<=cyl
; cy
++)
4837 c0
= gridj
->cxy_ind
[cx
*gridj
->ncy
+cy
];
4838 c1
= gridj
->cxy_ind
[cx
*gridj
->ncy
+cy
+1];
4839 #ifdef NBNXN_SHIFT_BACKWARD
4840 if (gridi
== gridj
&&
4841 shift
== CENTRAL
&& c0
< ci
)
4848 if (gridj
->c0
[YY
] + cy
*gridj
->sy
> by1
)
4850 d2zxy
+= sqr(gridj
->c0
[YY
] + cy
*gridj
->sy
- by1
);
4852 else if (gridj
->c0
[YY
] + (cy
+1)*gridj
->sy
< by0
)
4854 d2zxy
+= sqr(gridj
->c0
[YY
] + (cy
+1)*gridj
->sy
- by0
);
4856 if (c1
> c0
&& d2zxy
< rl2
)
4858 cs
= c0
+ (int)(bz1_frac
*(c1
- c0
));
4866 /* Find the lowest cell that can possibly
4871 (bbcz_j
[cf
*NNBSBB_D
+1] >= bz0
||
4872 d2xy
+ sqr(bbcz_j
[cf
*NNBSBB_D
+1] - bz0
) < rl2
))
4877 /* Find the highest cell that can possibly
4882 (bbcz_j
[cl
*NNBSBB_D
] <= bz1
||
4883 d2xy
+ sqr(bbcz_j
[cl
*NNBSBB_D
] - bz1
) < rl2
))
4888 #ifdef NBNXN_REFCODE
4890 /* Simple reference code, for debugging,
4891 * overrides the more complex code above.
4896 for(k
=c0
; k
<c1
; k
++)
4898 if (box_dist2(bx0
,bx1
,by0
,by1
,bz0
,bz1
,
4899 bb
+k
*NNBSBB_B
) < rl2
&&
4904 if (box_dist2(bx0
,bx1
,by0
,by1
,bz0
,bz1
,
4905 bb
+k
*NNBSBB_B
) < rl2
&&
4916 /* We want each atom/cell pair only once,
4917 * only use cj >= ci.
4919 #ifndef NBNXN_SHIFT_BACKWARD
4922 if (shift
== CENTRAL
)
4931 switch (nb_kernel_type
)
4934 check_subcell_list_space_simple(nbl
,cl
-cf
+1);
4936 make_cluster_list_simple(gridj
,
4938 (gridi
== gridj
&& shift
== CENTRAL
),
4943 #ifdef NBNXN_SEARCH_SSE
4944 case nbk4xN_X86_SIMD128
:
4945 check_subcell_list_space_simple(nbl
,ci_to_cj(na_cj_2log
,cl
-cf
)+2);
4946 make_cluster_list_x86_simd128(gridj
,
4948 (gridi
== gridj
&& shift
== CENTRAL
),
4953 #ifdef GMX_X86_AVX_256
4954 case nbk4xN_X86_SIMD256
:
4955 check_subcell_list_space_simple(nbl
,ci_to_cj(na_cj_2log
,cl
-cf
)+2);
4956 make_cluster_list_x86_simd256(gridj
,
4958 (gridi
== gridj
&& shift
== CENTRAL
),
4965 case nbk8x8x8_PlainC
:
4967 check_subcell_list_space_supersub(nbl
,cl
-cf
+1);
4968 for(cj
=cf
; cj
<=cl
; cj
++)
4970 make_cluster_list_supersub(nbs
,gridi
,gridj
,
4972 (gridi
== gridj
&& shift
== CENTRAL
&& ci
== cj
),
4973 nbat
->xstride
,nbat
->x
,
4979 ncpcheck
+= cl
- cf
+ 1;
4985 /* Set the exclusions for this ci list */
4988 set_ci_top_excls(nbs
,
4990 shift
== CENTRAL
&& gridi
== gridj
,
4993 &(nbl
->ci
[nbl
->nci
]),
4998 set_sci_top_excls(nbs
,
5000 shift
== CENTRAL
&& gridi
== gridj
,
5002 &(nbl
->sci
[nbl
->nsci
]),
5006 /* Close this ci list */
5009 close_ci_entry_simple(nbl
);
5013 close_ci_entry_supersub(nbl
,
5015 progBal
,min_ci_balanced
,
5023 work
->ndistc
= ndistc
;
5025 nbs_cycle_stop(&work
->cc
[enbsCCsearch
]);
5029 fprintf(debug
,"number of distance checks %d\n",ndistc
);
5030 fprintf(debug
,"ncpcheck %s %d\n",gridi
==gridj
? "local" : "non-local",
5035 print_nblist_statistics_simple(debug
,nbl
,nbs
,rlist
);
5039 print_nblist_statistics_supersub(debug
,nbl
,nbs
,rlist
);
5045 /* Make a local or non-local pair-list, depending on iloc */
5046 void nbnxn_make_pairlist(const nbnxn_search_t nbs
,
5047 const nbnxn_atomdata_t
*nbat
,
5048 const t_blocka
*excl
,
5050 int min_ci_balanced
,
5051 nbnxn_pairlist_set_t
*nbl_list
,
5056 const nbnxn_grid_t
*gridi
,*gridj
;
5057 int nzi
,zi
,zj0
,zj1
,zj
;
5061 nbnxn_pairlist_t
**nbl
;
5062 gmx_bool CombineNBLists
;
5063 int np_tot
,np_noq
,np_hlj
,nap
;
5065 nnbl
= nbl_list
->nnbl
;
5066 nbl
= nbl_list
->nbl
;
5067 CombineNBLists
= nbl_list
->bCombined
;
5071 fprintf(debug
,"ns making %d nblists\n", nnbl
);
5074 if (nbl_list
->bSimple
)
5076 switch (nb_kernel_type
)
5078 #ifdef NBNXN_SEARCH_SSE
5079 case nbk4xN_X86_SIMD128
:
5080 nbs
->icell_set_x
= icell_set_x_x86_simd128
;
5082 #ifdef GMX_X86_AVX_256
5083 case nbk4xN_X86_SIMD256
:
5084 nbs
->icell_set_x
= icell_set_x_x86_simd256
;
5089 nbs
->icell_set_x
= icell_set_x_simple
;
5095 #ifdef NBNXN_SEARCH_SSE
5096 nbs
->icell_set_x
= icell_set_x_supersub_sse8
;
5098 nbs
->icell_set_x
= icell_set_x_supersub
;
5104 /* Only zone (grid) 0 vs 0 */
5111 nzi
= nbs
->zones
->nizone
;
5114 if (!nbl_list
->bSimple
&& min_ci_balanced
> 0)
5116 nsubpair_max
= get_nsubpair_max(nbs
,iloc
,rlist
,min_ci_balanced
);
5123 /* Clear all pair-lists */
5124 for(th
=0; th
<nnbl
; th
++)
5126 clear_pairlist(nbl
[th
]);
5129 for(zi
=0; zi
<nzi
; zi
++)
5131 gridi
= &nbs
->grid
[zi
];
5133 if (NONLOCAL_I(iloc
))
5135 zj0
= nbs
->zones
->izone
[zi
].j0
;
5136 zj1
= nbs
->zones
->izone
[zi
].j1
;
5142 for(zj
=zj0
; zj
<zj1
; zj
++)
5144 gridj
= &nbs
->grid
[zj
];
5148 fprintf(debug
,"ns search grid %d vs %d\n",zi
,zj
);
5151 nbs_cycle_start(&nbs
->cc
[enbsCCsearch
]);
5153 #pragma omp parallel for num_threads(nnbl) schedule(static)
5154 for(th
=0; th
<nnbl
; th
++)
5156 if (CombineNBLists
&& th
> 0)
5158 clear_pairlist(nbl
[th
]);
5161 /* Divide the i super cell equally over the nblists */
5162 nbnxn_make_pairlist_part(nbs
,gridi
,gridj
,
5163 &nbs
->work
[th
],nbat
,excl
,
5167 (LOCAL_I(iloc
) || nbs
->zones
->n
<= 2),
5172 nbs_cycle_stop(&nbs
->cc
[enbsCCsearch
]);
5177 for(th
=0; th
<nnbl
; th
++)
5179 inc_nrnb(nrnb
,eNR_NBNXN_DIST2
,nbs
->work
[th
].ndistc
);
5181 if (nbl_list
->bSimple
)
5183 np_tot
+= nbl
[th
]->ncj
;
5184 np_noq
+= nbl
[th
]->work
->ncj_noq
;
5185 np_hlj
+= nbl
[th
]->work
->ncj_hlj
;
5189 /* This count ignores potential subsequent pair pruning */
5190 np_tot
+= nbl
[th
]->nci_tot
;
5193 nap
= nbl
[0]->na_ci
*nbl
[0]->na_cj
;
5194 nbl_list
->natpair_ljq
= (np_tot
- np_noq
)*nap
- np_hlj
*nap
/2;
5195 nbl_list
->natpair_lj
= np_noq
*nap
;
5196 nbl_list
->natpair_q
= np_hlj
*nap
/2;
5198 if (CombineNBLists
&& nnbl
> 1)
5200 nbs_cycle_start(&nbs
->cc
[enbsCCcombine
]);
5202 combine_nblists(nnbl
-1,nbl
+1,nbl
[0]);
5204 nbs_cycle_stop(&nbs
->cc
[enbsCCcombine
]);
5211 print_supersub_nsp("nsubpair",nbl[0],iloc);
5214 /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
5217 nbs
->search_count
++;
5219 if (nbs
->print_cycles
&&
5220 (!nbs
->DomDec
|| (nbs
->DomDec
&& !LOCAL_I(iloc
))) &&
5221 nbs
->search_count
% 100 == 0)
5223 nbs_cycle_print(stderr
,nbs
);
5226 if (debug
&& (CombineNBLists
&& nnbl
> 1))
5228 if (nbl
[0]->bSimple
)
5230 print_nblist_statistics_simple(debug
,nbl
[0],nbs
,rlist
);
5234 print_nblist_statistics_supersub(debug
,nbl
[0],nbs
,rlist
);
5240 if (nbl
[0]->bSimple
)
5242 print_nblist_ci_cj(debug
,nbl
[0]);
5246 print_nblist_sci_cj(debug
,nbl
[0]);
5251 /* Initializes an nbnxn_atomdata_output_t data structure */
5252 static void nbnxn_atomdata_output_init(nbnxn_atomdata_output_t
*out
,
5254 int nenergrp
,int stride
,
5255 gmx_nbat_alloc_t
*ma
)
5260 ma((void **)&out
->fshift
,SHIFTS
*DIM
*sizeof(*out
->fshift
));
5261 out
->nV
= nenergrp
*nenergrp
;
5262 ma((void **)&out
->Vvdw
,out
->nV
*sizeof(*out
->Vvdw
));
5263 ma((void **)&out
->Vc
,out
->nV
*sizeof(*out
->Vc
));
5265 if (nb_kernel_type
== nbk4xN_X86_SIMD128
||
5266 nb_kernel_type
== nbk4xN_X86_SIMD256
)
5268 cj_size
= kernel_to_cj_size(nb_kernel_type
);
5269 out
->nVS
= nenergrp
*nenergrp
*stride
*(cj_size
>>1)*cj_size
;
5270 ma((void **)&out
->VSvdw
,out
->nVS
*sizeof(*out
->VSvdw
));
5271 ma((void **)&out
->VSc
,out
->nVS
*sizeof(*out
->VSc
));
5279 /* Determines the combination rule (or none) to be used, stores it,
5280 * and sets the LJ parameters required with the rule.
5282 static void set_combination_rule_data(nbnxn_atomdata_t
*nbat
)
5289 switch (nbat
->comb_rule
)
5292 nbat
->comb_rule
= ljcrGEOM
;
5296 /* Copy the diagonal from the nbfp matrix */
5297 nbat
->nbfp_comb
[i
*2 ] = sqrt(nbat
->nbfp
[(i
*nt
+i
)*2 ]);
5298 nbat
->nbfp_comb
[i
*2+1] = sqrt(nbat
->nbfp
[(i
*nt
+i
)*2+1]);
5304 /* Get 6*C6 and 12*C12 from the diagonal of the nbfp matrix */
5305 c6
= nbat
->nbfp
[(i
*nt
+i
)*2 ];
5306 c12
= nbat
->nbfp
[(i
*nt
+i
)*2+1];
5307 if (c6
> 0 && c12
> 0)
5309 /* We store 0.5*2^1/6*sigma and sqrt(4*3*eps),
5310 * so we get 6*C6 and 12*C12 after combining.
5312 nbat
->nbfp_comb
[i
*2 ] = 0.5*pow(c12
/c6
,1.0/6.0);
5313 nbat
->nbfp_comb
[i
*2+1] = sqrt(c6
*c6
/c12
);
5317 nbat
->nbfp_comb
[i
*2 ] = 0;
5318 nbat
->nbfp_comb
[i
*2+1] = 0;
5323 /* In nbfp_s4 we use a stride of 4 for storing two parameters */
5324 nbat
->alloc((void **)&nbat
->nbfp_s4
,nt
*nt
*4*sizeof(*nbat
->nbfp_s4
));
5329 nbat
->nbfp_s4
[(i
*nt
+j
)*4+0] = nbat
->nbfp
[(i
*nt
+j
)*2+0];
5330 nbat
->nbfp_s4
[(i
*nt
+j
)*4+1] = nbat
->nbfp
[(i
*nt
+j
)*2+1];
5331 nbat
->nbfp_s4
[(i
*nt
+j
)*4+2] = 0;
5332 nbat
->nbfp_s4
[(i
*nt
+j
)*4+3] = 0;
5337 gmx_incons("Unknown combination rule");
5342 /* Initializes an nbnxn_atomdata_t data structure */
5343 void nbnxn_atomdata_init(FILE *fp
,
5344 nbnxn_atomdata_t
*nbat
,
5346 int ntype
,const real
*nbfp
,
5349 gmx_nbat_alloc_t
*alloc
,
5350 gmx_nbat_free_t
*free
)
5355 gmx_bool simple
,bCombGeom
,bCombLB
;
5359 nbat
->alloc
= nbnxn_alloc_aligned
;
5363 nbat
->alloc
= alloc
;
5367 nbat
->free
= nbnxn_free_aligned
;
5376 fprintf(debug
,"There are %d atom types in the system, adding one for nbnxn_atomdata_t\n",ntype
);
5378 nbat
->ntype
= ntype
+ 1;
5379 nbat
->alloc((void **)&nbat
->nbfp
,
5380 nbat
->ntype
*nbat
->ntype
*2*sizeof(*nbat
->nbfp
));
5381 nbat
->alloc((void **)&nbat
->nbfp_comb
,nbat
->ntype
*2*sizeof(*nbat
->nbfp_comb
));
5383 /* A tolerance of 1e-5 seems reasonable for (possibly hand-typed)
5384 * force-field floating point parameters.
5387 ptr
= getenv("GMX_LJCOMB_TOL");
5392 sscanf(ptr
,"%lf",&dbl
);
5398 /* Temporarily fill nbat->nbfp_comb with sigma and epsilon
5399 * to check for the LB rule.
5401 for(i
=0; i
<ntype
; i
++)
5403 c6
= nbfp
[(i
*ntype
+i
)*2 ];
5404 c12
= nbfp
[(i
*ntype
+i
)*2+1];
5405 if (c6
> 0 && c12
> 0)
5407 nbat
->nbfp_comb
[i
*2 ] = pow(c12
/c6
,1.0/6.0);
5408 nbat
->nbfp_comb
[i
*2+1] = 0.25*c6
*c6
/c12
;
5410 else if (c6
== 0 && c12
== 0)
5412 nbat
->nbfp_comb
[i
*2 ] = 0;
5413 nbat
->nbfp_comb
[i
*2+1] = 0;
5417 /* Can not use LB rule with only dispersion or repulsion */
5422 for(i
=0; i
<nbat
->ntype
; i
++)
5424 for(j
=0; j
<nbat
->ntype
; j
++)
5426 if (i
< ntype
&& j
< ntype
)
5428 /* We store the prefactor in the derivative of the potential
5429 * in the parameter to avoid multiplications in the inner loop.
5431 c6
= nbfp
[(i
*ntype
+j
)*2 ];
5432 c12
= nbfp
[(i
*ntype
+j
)*2+1];
5433 nbat
->nbfp
[(i
*nbat
->ntype
+j
)*2 ] = 6.0*c6
;
5434 nbat
->nbfp
[(i
*nbat
->ntype
+j
)*2+1] = 12.0*c12
;
5436 bCombGeom
= bCombGeom
&&
5437 gmx_within_tol(c6
*c6
,nbfp
[(i
*ntype
+i
)*2 ]*nbfp
[(j
*ntype
+j
)*2 ],tol
) &&
5438 gmx_within_tol(c12
*c12
,nbfp
[(i
*ntype
+i
)*2+1]*nbfp
[(j
*ntype
+j
)*2+1],tol
);
5440 bCombLB
= bCombLB
&&
5441 ((c6
== 0 && c12
== 0 &&
5442 (nbat
->nbfp_comb
[i
*2+1] == 0 || nbat
->nbfp_comb
[j
*2+1] == 0)) ||
5443 (c6
> 0 && c12
> 0 &&
5444 gmx_within_tol(pow(c12
/c6
,1.0/6.0),0.5*(nbat
->nbfp_comb
[i
*2]+nbat
->nbfp_comb
[j
*2]),tol
) &&
5445 gmx_within_tol(0.25*c6
*c6
/c12
,sqrt(nbat
->nbfp_comb
[i
*2+1]*nbat
->nbfp_comb
[j
*2+1]),tol
)));
5449 /* Add zero parameters for the additional dummy atom type */
5450 nbat
->nbfp
[(i
*nbat
->ntype
+j
)*2 ] = 0;
5451 nbat
->nbfp
[(i
*nbat
->ntype
+j
)*2+1] = 0;
5457 fprintf(debug
,"Combination rules: geometric %d Lorentz-Berthelot %d\n",
5461 simple
= nbnxn_kernel_pairlist_simple(nb_kernel_type
);
5465 /* We prefer the geometic combination rule,
5466 * as that gives a slightly faster kernel than the LB rule.
5470 nbat
->comb_rule
= ljcrGEOM
;
5474 nbat
->comb_rule
= ljcrLB
;
5478 nbat
->comb_rule
= ljcrNONE
;
5480 nbat
->free(nbat
->nbfp_comb
);
5485 if (nbat
->comb_rule
== ljcrNONE
)
5487 fprintf(fp
,"Using full Lennard-Jones parameter combination matrix\n\n");
5491 fprintf(fp
,"Using %s Lennard-Jones combination rule\n\n",
5492 nbat
->comb_rule
==ljcrGEOM
? "geometric" : "Lorentz-Berthelot");
5496 set_combination_rule_data(nbat
);
5500 nbat
->comb_rule
= ljcrNONE
;
5502 nbat
->free(nbat
->nbfp_comb
);
5507 nbat
->lj_comb
= NULL
;
5510 switch (nb_kernel_type
)
5512 case nbk4xN_X86_SIMD128
:
5513 nbat
->XFormat
= nbatX4
;
5515 case nbk4xN_X86_SIMD256
:
5517 nbat
->XFormat
= nbatX8
;
5519 nbat
->XFormat
= nbatX4
;
5523 nbat
->XFormat
= nbatXYZ
;
5527 nbat
->FFormat
= nbat
->XFormat
;
5531 nbat
->XFormat
= nbatXYZQ
;
5532 nbat
->FFormat
= nbatXYZ
;
5535 nbat
->nenergrp
= n_energygroups
;
5538 /* Energy groups not supported yet for super-sub lists */
5541 /* Temporary storage goes is #grp^3*8 real, so limit to 64 */
5542 if (nbat
->nenergrp
> 64)
5544 gmx_fatal(FARGS
,"With NxN kernels not more than 64 energy groups are supported\n");
5547 while (nbat
->nenergrp
> (1<<nbat
->neg_2log
))
5551 nbat
->energrp
= NULL
;
5552 nbat
->alloc((void **)&nbat
->shift_vec
,SHIFTS
*sizeof(*nbat
->shift_vec
));
5553 nbat
->xstride
= (nbat
->XFormat
== nbatXYZQ
? STRIDE_XYZQ
: DIM
);
5554 nbat
->fstride
= (nbat
->FFormat
== nbatXYZQ
? STRIDE_XYZQ
: DIM
);
5557 snew(nbat
->out
,nbat
->nout
);
5559 for(i
=0; i
<nbat
->nout
; i
++)
5561 nbnxn_atomdata_output_init(&nbat
->out
[i
],
5563 nbat
->nenergrp
,1<<nbat
->neg_2log
,
5568 static void copy_lj_to_nbat_lj_comb_x4(const real
*ljparam_type
,
5569 const int *type
,int na
,
5574 /* The LJ params follow the combination rule:
5575 * copy the params for the type array to the atom array.
5577 for(is
=0; is
<na
; is
+=PACK_X4
)
5579 for(k
=0; k
<PACK_X4
; k
++)
5582 ljparam_at
[is
*2 +k
] = ljparam_type
[type
[i
]*2 ];
5583 ljparam_at
[is
*2+PACK_X4
+k
] = ljparam_type
[type
[i
]*2+1];
5588 static void copy_lj_to_nbat_lj_comb_x8(const real
*ljparam_type
,
5589 const int *type
,int na
,
5594 /* The LJ params follow the combination rule:
5595 * copy the params for the type array to the atom array.
5597 for(is
=0; is
<na
; is
+=PACK_X8
)
5599 for(k
=0; k
<PACK_X8
; k
++)
5602 ljparam_at
[is
*2 +k
] = ljparam_type
[type
[i
]*2 ];
5603 ljparam_at
[is
*2+PACK_X8
+k
] = ljparam_type
[type
[i
]*2+1];
5608 /* Sets the atom type and LJ data in nbnxn_atomdata_t */
5609 static void nbnxn_atomdata_set_atomtypes(nbnxn_atomdata_t
*nbat
,
5611 const nbnxn_search_t nbs
,
5615 const nbnxn_grid_t
*grid
;
5617 for(g
=0; g
<ngrid
; g
++)
5619 grid
= &nbs
->grid
[g
];
5621 /* Loop over all columns and copy and fill */
5622 for(i
=0; i
<grid
->ncx
*grid
->ncy
; i
++)
5624 ncz
= grid
->cxy_ind
[i
+1] - grid
->cxy_ind
[i
];
5625 ash
= (grid
->cell0
+ grid
->cxy_ind
[i
])*grid
->na_sc
;
5627 copy_int_to_nbat_int(nbs
->a
+ash
,grid
->cxy_na
[i
],ncz
*grid
->na_sc
,
5628 type
,nbat
->ntype
-1,nbat
->type
+ash
);
5630 if (nbat
->comb_rule
!= ljcrNONE
)
5632 if (nbat
->XFormat
== nbatX4
)
5634 copy_lj_to_nbat_lj_comb_x4(nbat
->nbfp_comb
,
5635 nbat
->type
+ash
,ncz
*grid
->na_sc
,
5636 nbat
->lj_comb
+ash
*2);
5638 else if (nbat
->XFormat
== nbatX8
)
5640 copy_lj_to_nbat_lj_comb_x8(nbat
->nbfp_comb
,
5641 nbat
->type
+ash
,ncz
*grid
->na_sc
,
5642 nbat
->lj_comb
+ash
*2);
5649 /* Sets the charges in nbnxn_atomdata_t *nbat */
5650 static void nbnxn_atomdata_set_charges(nbnxn_atomdata_t
*nbat
,
5652 const nbnxn_search_t nbs
,
5655 int g
,cxy
,ncz
,ash
,na
,na_round
,i
,j
;
5657 const nbnxn_grid_t
*grid
;
5659 for(g
=0; g
<ngrid
; g
++)
5661 grid
= &nbs
->grid
[g
];
5663 /* Loop over all columns and copy and fill */
5664 for(cxy
=0; cxy
<grid
->ncx
*grid
->ncy
; cxy
++)
5666 ash
= (grid
->cell0
+ grid
->cxy_ind
[cxy
])*grid
->na_sc
;
5667 na
= grid
->cxy_na
[cxy
];
5668 na_round
= (grid
->cxy_ind
[cxy
+1] - grid
->cxy_ind
[cxy
])*grid
->na_sc
;
5670 if (nbat
->XFormat
== nbatXYZQ
)
5672 q
= nbat
->x
+ ash
*STRIDE_XYZQ
+ ZZ
+ 1;
5675 *q
= charge
[nbs
->a
[ash
+i
]];
5678 /* Complete the partially filled last cell with zeros */
5679 for(; i
<na_round
; i
++)
5690 *q
= charge
[nbs
->a
[ash
+i
]];
5693 /* Complete the partially filled last cell with zeros */
5694 for(; i
<na_round
; i
++)
5704 /* Copies the energy group indices to a reordered and packed array */
5705 static void copy_egp_to_nbat_egps(const int *a
,int na
,int na_round
,
5706 int na_c
,int bit_shift
,
5707 const int *in
,int *innb
)
5713 for(i
=0; i
<na
; i
+=na_c
)
5715 /* Store na_c energy groups number into one int */
5717 for(sa
=0; sa
<na_c
; sa
++)
5722 comb
|= (GET_CGINFO_GID(in
[at
]) << (sa
*bit_shift
));
5727 /* Complete the partially filled last cell with fill */
5728 for(; i
<na_round
; i
+=na_c
)
5734 /* Set the energy group indices for atoms in nbnxn_atomdata_t */
5735 static void nbnxn_atomdata_set_energygroups(nbnxn_atomdata_t
*nbat
,
5737 const nbnxn_search_t nbs
,
5741 const nbnxn_grid_t
*grid
;
5743 for(g
=0; g
<ngrid
; g
++)
5745 grid
= &nbs
->grid
[g
];
5747 /* Loop over all columns and copy and fill */
5748 for(i
=0; i
<grid
->ncx
*grid
->ncy
; i
++)
5750 ncz
= grid
->cxy_ind
[i
+1] - grid
->cxy_ind
[i
];
5751 ash
= (grid
->cell0
+ grid
->cxy_ind
[i
])*grid
->na_sc
;
5753 copy_egp_to_nbat_egps(nbs
->a
+ash
,grid
->cxy_na
[i
],ncz
*grid
->na_sc
,
5754 nbat
->na_c
,nbat
->neg_2log
,
5755 atinfo
,nbat
->energrp
+(ash
>>grid
->na_c_2log
));
5760 /* Sets all required atom parameter data in nbnxn_atomdata_t */
5761 void nbnxn_atomdata_set(nbnxn_atomdata_t
*nbat
,
5763 const nbnxn_search_t nbs
,
5764 const t_mdatoms
*mdatoms
,
5769 if (locality
== eatLocal
)
5778 nbnxn_atomdata_set_atomtypes(nbat
,ngrid
,nbs
,mdatoms
->typeA
);
5780 nbnxn_atomdata_set_charges(nbat
,ngrid
,nbs
,mdatoms
->chargeA
);
5782 if (nbat
->nenergrp
> 1)
5784 nbnxn_atomdata_set_energygroups(nbat
,ngrid
,nbs
,atinfo
);
5788 /* Copies the shift vector array to nbnxn_atomdata_t */
5789 void nbnxn_atomdata_copy_shiftvec(gmx_bool bDynamicBox
,
5791 nbnxn_atomdata_t
*nbat
)
5795 nbat
->bDynamicBox
= bDynamicBox
;
5796 for(i
=0; i
<SHIFTS
; i
++)
5798 copy_rvec(shift_vec
[i
],nbat
->shift_vec
[i
]);
5802 /* Copies (and reorders) the coordinates to nbnxn_atomdata_t */
5803 void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search_t nbs
,
5807 nbnxn_atomdata_t
*nbat
)
5830 nbat
->natoms_local
= nbs
->grid
[0].nc
*nbs
->grid
[0].na_sc
;
5833 nth
= gmx_omp_nthreads_get(emntPairsearch
);
5835 #pragma omp parallel for num_threads(nth) schedule(static)
5836 for(th
=0; th
<nth
; th
++)
5840 for(g
=g0
; g
<g1
; g
++)
5842 const nbnxn_grid_t
*grid
;
5845 grid
= &nbs
->grid
[g
];
5847 cxy0
= (grid
->ncx
*grid
->ncy
* th
+nth
-1)/nth
;
5848 cxy1
= (grid
->ncx
*grid
->ncy
*(th
+1)+nth
-1)/nth
;
5850 for(cxy
=cxy0
; cxy
<cxy1
; cxy
++)
5854 na
= grid
->cxy_na
[cxy
];
5855 ash
= (grid
->cell0
+ grid
->cxy_ind
[cxy
])*grid
->na_sc
;
5857 if (g
== 0 && FillLocal
)
5860 (grid
->cxy_ind
[cxy
+1] - grid
->cxy_ind
[cxy
])*grid
->na_sc
;
5864 /* We fill only the real particle locations.
5865 * We assume the filling entries at the end have been
5866 * properly set before during ns.
5870 copy_rvec_to_nbat_real(nbs
->a
+ash
,na
,na_fill
,x
,
5871 nbat
->XFormat
,nbat
->x
,ash
,
5878 /* Add part of the force array(s) from nbnxn_atomdata_t to f */
5880 nbnxn_atomdata_add_nbat_f_to_f_part(const nbnxn_search_t nbs
,
5881 const nbnxn_atomdata_t
*nbat
,
5882 nbnxn_atomdata_output_t
*out
,
5893 /* Loop over all columns and copy and fill */
5894 switch (nbat
->FFormat
)
5902 for(a
=a0
; a
<a1
; a
++)
5904 i
= cell
[a
]*nbat
->fstride
;
5907 f
[a
][YY
] += fnb
[i
+1];
5908 f
[a
][ZZ
] += fnb
[i
+2];
5913 for(a
=a0
; a
<a1
; a
++)
5915 i
= cell
[a
]*nbat
->fstride
;
5917 for(fa
=0; fa
<nfa
; fa
++)
5919 f
[a
][XX
] += out
[fa
].f
[i
];
5920 f
[a
][YY
] += out
[fa
].f
[i
+1];
5921 f
[a
][ZZ
] += out
[fa
].f
[i
+2];
5931 for(a
=a0
; a
<a1
; a
++)
5933 i
= X4_IND_A(cell
[a
]);
5935 f
[a
][XX
] += fnb
[i
+XX
*PACK_X4
];
5936 f
[a
][YY
] += fnb
[i
+YY
*PACK_X4
];
5937 f
[a
][ZZ
] += fnb
[i
+ZZ
*PACK_X4
];
5942 for(a
=a0
; a
<a1
; a
++)
5944 i
= X4_IND_A(cell
[a
]);
5946 for(fa
=0; fa
<nfa
; fa
++)
5948 f
[a
][XX
] += out
[fa
].f
[i
+XX
*PACK_X4
];
5949 f
[a
][YY
] += out
[fa
].f
[i
+YY
*PACK_X4
];
5950 f
[a
][ZZ
] += out
[fa
].f
[i
+ZZ
*PACK_X4
];
5960 for(a
=a0
; a
<a1
; a
++)
5962 i
= X8_IND_A(cell
[a
]);
5964 f
[a
][XX
] += fnb
[i
+XX
*PACK_X8
];
5965 f
[a
][YY
] += fnb
[i
+YY
*PACK_X8
];
5966 f
[a
][ZZ
] += fnb
[i
+ZZ
*PACK_X8
];
5971 for(a
=a0
; a
<a1
; a
++)
5973 i
= X8_IND_A(cell
[a
]);
5975 for(fa
=0; fa
<nfa
; fa
++)
5977 f
[a
][XX
] += out
[fa
].f
[i
+XX
*PACK_X8
];
5978 f
[a
][YY
] += out
[fa
].f
[i
+YY
*PACK_X8
];
5979 f
[a
][ZZ
] += out
[fa
].f
[i
+ZZ
*PACK_X8
];
5987 /* Add the force array(s) from nbnxn_atomdata_t to f */
5988 void nbnxn_atomdata_add_nbat_f_to_f(const nbnxn_search_t nbs
,
5990 const nbnxn_atomdata_t
*nbat
,
5996 nbs_cycle_start(&nbs
->cc
[enbsCCreducef
]);
6002 na
= nbs
->natoms_nonlocal
;
6006 na
= nbs
->natoms_local
;
6009 a0
= nbs
->natoms_local
;
6010 na
= nbs
->natoms_nonlocal
- nbs
->natoms_local
;
6014 nth
= gmx_omp_nthreads_get(emntNonbonded
);
6015 #pragma omp parallel for num_threads(nth) schedule(static)
6016 for(th
=0; th
<nth
; th
++)
6018 nbnxn_atomdata_add_nbat_f_to_f_part(nbs
,nbat
,
6026 nbs_cycle_stop(&nbs
->cc
[enbsCCreducef
]);
6029 /* Adds the shift forces from nbnxn_atomdata_t to fshift */
6030 void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t
*nbat
,
6033 const nbnxn_atomdata_output_t
*out
;
6040 for(s
=0; s
<SHIFTS
; s
++)
6043 for(th
=0; th
<nbat
->nout
; th
++)
6045 sum
[XX
] += out
[th
].fshift
[s
*DIM
+XX
];
6046 sum
[YY
] += out
[th
].fshift
[s
*DIM
+YY
];
6047 sum
[ZZ
] += out
[th
].fshift
[s
*DIM
+ZZ
];
6049 rvec_inc(fshift
[s
],sum
);