/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "types/commrec.h"

#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "nbnxn_consts.h"
/* nbnxn_internal.h includes gromacs/simd/macros.h */
#include "nbnxn_internal.h"
#include "gromacs/simd/vector_operations.h"
#include "nbnxn_atomdata.h"
#include "nbnxn_search.h"
#include "gmx_omp_nthreads.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/utility/smalloc.h"

#ifdef NBNXN_SEARCH_BB_SIMD4
/* Always use 4-wide SIMD for bounding box calculations */

/* Single precision BBs + coordinates, we can also load coordinates with SIMD */
# define NBNXN_SEARCH_SIMD4_FLOAT_X_BB

# if defined NBNXN_SEARCH_SIMD4_FLOAT_X_BB && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8)
/* Store bounding boxes with x, y and z coordinates in packs of 4 */
#  define NBNXN_PBB_SIMD4
# endif

/* The packed bounding box coordinate stride is always set to 4.
 * With AVX we could use 8, but that turns out not to be faster.
 */
# define STRIDE_PBB        4
# define STRIDE_PBB_2LOG   2

#endif /* NBNXN_SEARCH_BB_SIMD4 */

/* The functions below are macros as they are performance sensitive */

/* 4x4 list, pack=4: no complex conversion required */
/* i-cluster to j-cluster conversion */
#define CI_TO_CJ_J4(ci)   (ci)
/* cluster index to coordinate array index conversion */
#define X_IND_CI_J4(ci)  ((ci)*STRIDE_P4)
#define X_IND_CJ_J4(cj)  ((cj)*STRIDE_P4)

/* 4x2 list, pack=4: j-cluster size is half the packing width */
/* i-cluster to j-cluster conversion */
#define CI_TO_CJ_J2(ci)  ((ci)<<1)
/* cluster index to coordinate array index conversion */
#define X_IND_CI_J2(ci)  ((ci)*STRIDE_P4)
#define X_IND_CJ_J2(cj)  (((cj)>>1)*STRIDE_P4 + ((cj) & 1)*(PACK_X4>>1))

/* 4x8 list, pack=8: i-cluster size is half the packing width */
/* i-cluster to j-cluster conversion */
#define CI_TO_CJ_J8(ci)  ((ci)>>1)
/* cluster index to coordinate array index conversion */
#define X_IND_CI_J8(ci)  (((ci)>>1)*STRIDE_P8 + ((ci) & 1)*(PACK_X8>>1))
#define X_IND_CJ_J8(cj)  ((cj)*STRIDE_P8)
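
/* Worked example (illustrative, not part of the original source; it assumes
 * the usual PACK_X4 = 4, so STRIDE_P4 = DIM*PACK_X4 = 12): in the 4x2 layout,
 * j-cluster 5 holds the second half of the pack that starts at index 24, so
 * X_IND_CJ_J2(5) = (5>>1)*12 + (5&1)*(4>>1) = 24 + 2 = 26, i.e. its x
 * coordinates start 2 elements into that pack.
 */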

#ifdef GMX_NBNXN_SIMD

/* The j-cluster size is matched to the SIMD width */
#if GMX_SIMD_REAL_WIDTH == 2
#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J2(ci)
#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J2(ci)
#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J2(cj)
#else
#if GMX_SIMD_REAL_WIDTH == 4
#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J4(ci)
#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J4(ci)
#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J4(cj)
#else
#if GMX_SIMD_REAL_WIDTH == 8
#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J8(ci)
#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J8(ci)
#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J8(cj)
/* Half SIMD with j-cluster size */
#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J4(ci)
#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J4(ci)
#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J4(cj)
#else
#if GMX_SIMD_REAL_WIDTH == 16
#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J8(ci)
#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J8(ci)
#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J8(cj)
#else
#error "unsupported GMX_SIMD_REAL_WIDTH"
#endif
#endif
#endif
#endif

#endif /* GMX_NBNXN_SIMD */

#ifdef NBNXN_SEARCH_BB_SIMD4
/* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
#define NBNXN_BBXXXX
/* Size of bounding box corners quadruplet */
#define NNBSBB_XXXX  (NNBSBB_D*DIM*STRIDE_PBB)
#endif
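
/* (Illustrative expansion, assuming the usual NNBSBB_D = 2 (lower+upper),
 * DIM = 3 and STRIDE_PBB = 4: one quadruplet packs 2*3*4 = 24 floats for a
 * group of 4 bounding boxes.)
 */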

/* We shift the i-particles backward for PBC.
 * This leads to more conditionals than shifting forward.
 * We do this to get more balanced pair lists.
 */
#define NBNXN_SHIFT_BACKWARD

/* This define is a lazy way to avoid interdependence of the grid
 * and searching data structures.
 */
#define NBNXN_NA_SC_MAX (GPU_NSUBCELL*NBNXN_GPU_CLUSTER_SIZE)


static void nbs_cycle_clear(nbnxn_cycle_t *cc)
{
    int i;

    for (i = 0; i < enbsCCnr; i++)
    {
        cc[i].count = 0;
        cc[i].c     = 0;
    }
}

static double Mcyc_av(const nbnxn_cycle_t *cc)
{
    return (double)cc->c*1e-6/cc->count;
}

static void nbs_cycle_print(FILE *fp, const nbnxn_search_t nbs)
{
    int t;

    fprintf(fp, "ns %4d grid %4.1f search %4.1f red.f %5.3f",
            nbs->cc[enbsCCgrid].count,
            Mcyc_av(&nbs->cc[enbsCCgrid]),
            Mcyc_av(&nbs->cc[enbsCCsearch]),
            Mcyc_av(&nbs->cc[enbsCCreducef]));

    if (nbs->nthread_max > 1)
    {
        if (nbs->cc[enbsCCcombine].count > 0)
        {
            fprintf(fp, " comb %5.2f",
                    Mcyc_av(&nbs->cc[enbsCCcombine]));
        }
        fprintf(fp, " s. th");
        for (t = 0; t < nbs->nthread_max; t++)
        {
            fprintf(fp, " %4.1f",
                    Mcyc_av(&nbs->work[t].cc[enbsCCsearch]));
        }
    }
}

static void nbnxn_grid_init(nbnxn_grid_t *grid)
{
    grid->cxy_ind    = NULL;
    grid->cxy_nalloc = 0;
}

static int get_2log(int n)
{
    int log2 = 0;

    while ((1<<log2) < n)
    {
        log2++;
    }
    if ((1<<log2) != n)
    {
        gmx_fatal(FARGS, "nbnxn na_c (%d) is not a power of 2", n);
    }

    return log2;
}
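
/* For example, get_2log(8) walks log2 = 0,1,2,3 until 1<<3 == 8 and returns 3,
 * while get_2log(6) stops at 1<<3 = 8 != 6 and triggers the fatal error above
 * (an illustrative usage note; the cluster sizes passed in practice are
 * powers of 2).
 */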

static int nbnxn_kernel_to_ci_size(int nb_kernel_type)
{
    switch (nb_kernel_type)
    {
        case nbnxnk4x4_PlainC:
        case nbnxnk4xN_SIMD_4xN:
        case nbnxnk4xN_SIMD_2xNN:
            return NBNXN_CPU_CLUSTER_I_SIZE;
        case nbnxnk8x8x8_CUDA:
        case nbnxnk8x8x8_PlainC:
            /* The cluster size for super/sub lists is only set here.
             * Any value should work for the pair-search and atomdata code.
             * The kernels, of course, might require a particular value.
             */
            return NBNXN_GPU_CLUSTER_SIZE;
        default:
            gmx_incons("unknown kernel type");
    }

    return 0;
}

int nbnxn_kernel_to_cj_size(int nb_kernel_type)
{
    int nbnxn_simd_width = 0;
    int cj_size          = 0;

#ifdef GMX_NBNXN_SIMD
    nbnxn_simd_width = GMX_SIMD_REAL_WIDTH;
#endif

    switch (nb_kernel_type)
    {
        case nbnxnk4x4_PlainC:
            cj_size = NBNXN_CPU_CLUSTER_I_SIZE;
            break;
        case nbnxnk4xN_SIMD_4xN:
            cj_size = nbnxn_simd_width;
            break;
        case nbnxnk4xN_SIMD_2xNN:
            cj_size = nbnxn_simd_width/2;
            break;
        case nbnxnk8x8x8_CUDA:
        case nbnxnk8x8x8_PlainC:
            cj_size = nbnxn_kernel_to_ci_size(nb_kernel_type);
            break;
        default:
            gmx_incons("unknown kernel type");
    }

    return cj_size;
}

static int ci_to_cj(int na_cj_2log, int ci)
{
    switch (na_cj_2log)
    {
        case 2: return ci;      break;
        case 1: return (ci<<1); break;
        case 3: return (ci>>1); break;
    }

    return 0;
}
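
/* This mirrors the CI_TO_CJ_J4/J2/J8 macros above: na_cj_2log is the log2 of
 * the j-cluster size. Illustrative example: ci_to_cj(1, 3) returns 3<<1 = 6,
 * because i-cluster 3 (4 atoms) spans j-clusters 6 and 7 when the j-cluster
 * size is 2.
 */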

gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
{
    if (nb_kernel_type == nbnxnkNotSet)
    {
        gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list.");
    }

    switch (nb_kernel_type)
    {
        case nbnxnk8x8x8_CUDA:
        case nbnxnk8x8x8_PlainC:
            return FALSE;

        case nbnxnk4x4_PlainC:
        case nbnxnk4xN_SIMD_4xN:
        case nbnxnk4xN_SIMD_2xNN:
            return TRUE;

        default:
            gmx_incons("Invalid nonbonded kernel type passed!");
            return FALSE;
    }
}

/* Initializes a single nbnxn_pairlist_t data structure */
static void nbnxn_init_pairlist_fep(t_nblist *nl)
{
    nl->type      = GMX_NBLIST_INTERACTION_FREE_ENERGY;
    nl->igeometry = GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE;
    /* The interaction functions are set in the free energy kernel function */
}

void nbnxn_init_search(nbnxn_search_t     *nbs_ptr,
                       ivec               *n_dd_cells,
                       gmx_domdec_zones_t *zones,
                       int                 nthread_max)
{
    nbnxn_search_t nbs;
    int            d, g, t;

    snew(nbs, 1);
    *nbs_ptr = nbs;

    nbs->DomDec = (n_dd_cells != NULL);

    clear_ivec(nbs->dd_dim);
    nbs->ngrid = 1;
    if (nbs->DomDec)
    {
        nbs->zones = zones;

        for (d = 0; d < DIM; d++)
        {
            if ((*n_dd_cells)[d] > 1)
            {
                nbs->dd_dim[d] = 1;
                /* Each grid matches a DD zone */
                nbs->ngrid *= 2;
            }
        }
    }

    snew(nbs->grid, nbs->ngrid);
    for (g = 0; g < nbs->ngrid; g++)
    {
        nbnxn_grid_init(&nbs->grid[g]);
    }
    nbs->cell_nalloc = 0;

    nbs->nthread_max = nthread_max;

    /* Initialize the work data structures for each thread */
    snew(nbs->work, nbs->nthread_max);
    for (t = 0; t < nbs->nthread_max; t++)
    {
        nbs->work[t].cxy_na           = NULL;
        nbs->work[t].cxy_na_nalloc    = 0;
        nbs->work[t].sort_work        = NULL;
        nbs->work[t].sort_work_nalloc = 0;

        snew(nbs->work[t].nbl_fep, 1);
        nbnxn_init_pairlist_fep(nbs->work[t].nbl_fep);
    }

    /* Initialize detailed nbsearch cycle counting */
    nbs->print_cycles = (getenv("GMX_NBNXN_CYCLE") != 0);
    nbs->search_count = 0;
    nbs_cycle_clear(nbs->cc);
    for (t = 0; t < nbs->nthread_max; t++)
    {
        nbs_cycle_clear(nbs->work[t].cc);
    }
}

static real grid_atom_density(int n, rvec corner0, rvec corner1)
{
    rvec size;

    rvec_sub(corner1, corner0, size);

    return n/(size[XX]*size[YY]*size[ZZ]);
}

static int set_grid_size_xy(const nbnxn_search_t nbs,
                            nbnxn_grid_t        *grid,
                            int                  dd_zone,
                            int n, rvec corner0, rvec corner1,
                            real atom_density)
{
    rvec size;
    int  na_c;
    real adens, tlen, tlen_x, tlen_y, nc_max;
    int  t;

    rvec_sub(corner1, corner0, size);

    /* target cell length */
    if (grid->bSimple)
    {
        /* To minimize the zero interactions, we should make
         * the largest of the i/j cell cubic.
         */
        na_c = max(grid->na_c, grid->na_cj);

        /* Approximately cubic cells */
        tlen   = pow(na_c/atom_density, 1.0/3.0);
        tlen_x = tlen;
        tlen_y = tlen;
    }
    else
    {
        /* Approximately cubic sub cells */
        tlen   = pow(grid->na_c/atom_density, 1.0/3.0);
        tlen_x = tlen*GPU_NSUBCELL_X;
        tlen_y = tlen*GPU_NSUBCELL_Y;
    }
    /* We round ncx and ncy down, because we get fewer cell pairs
     * in the nblist when the fixed cell dimensions (x,y) are
     * larger than the variable one (z) than the other way around.
     */
    grid->ncx = max(1, (int)(size[XX]/tlen_x));
    grid->ncy = max(1, (int)(size[YY]/tlen_y));

    grid->sx     = size[XX]/grid->ncx;
    grid->sy     = size[YY]/grid->ncy;
    grid->inv_sx = 1/grid->sx;
    grid->inv_sy = 1/grid->sy;

    if (dd_zone > 0)
    {
        /* This is a non-home zone, add an extra row of cells
         * for particles communicated for bonded interactions.
         * These can be beyond the cut-off. It doesn't matter where
         * they end up on the grid, but for performance it's better
         * if they don't end up in cells that can be within cut-off range.
         */
        grid->ncx++;
        grid->ncy++;
    }

    /* We need one additional cell entry for particles moved by DD */
    if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
    {
        grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
        srenew(grid->cxy_na, grid->cxy_nalloc);
        srenew(grid->cxy_ind, grid->cxy_nalloc+1);
    }
    for (t = 0; t < nbs->nthread_max; t++)
    {
        if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
        {
            nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
            srenew(nbs->work[t].cxy_na, nbs->work[t].cxy_na_nalloc);
        }
    }

    /* Worst case scenario of 1 atom in each last cell */
    if (grid->na_cj <= grid->na_c)
    {
        nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
    }
    else
    {
        nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
    }

    if (nc_max > grid->nc_nalloc)
    {
        grid->nc_nalloc = over_alloc_large(nc_max);
        srenew(grid->nsubc, grid->nc_nalloc);
        srenew(grid->bbcz, grid->nc_nalloc*NNBSBB_D);

        sfree_aligned(grid->bb);
        /* This snew also zeros the contents, this avoids possible
         * floating exceptions in SIMD with the unused bb elements.
         */
        if (grid->bSimple)
        {
            snew_aligned(grid->bb, grid->nc_nalloc, 16);
        }
        else
        {
#ifdef NBNXN_BBXXXX
            int pbb_nalloc;

            pbb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX;
            snew_aligned(grid->pbb, pbb_nalloc, 16);
#else
            snew_aligned(grid->bb, grid->nc_nalloc*GPU_NSUBCELL, 16);
#endif
        }

        if (grid->bSimple)
        {
            if (grid->na_cj == grid->na_c)
            {
                grid->bbj = grid->bb;
            }
            else
            {
                sfree_aligned(grid->bbj);
                snew_aligned(grid->bbj, grid->nc_nalloc*grid->na_c/grid->na_cj, 16);
            }
        }

        srenew(grid->flags, grid->nc_nalloc);
        if (nbs->bFEP)
        {
            srenew(grid->fep, grid->nc_nalloc*grid->na_sc/grid->na_c);
        }
    }

    copy_rvec(corner0, grid->c0);
    copy_rvec(corner1, grid->c1);

    return nc_max;
}

/* We need to sort particles in grid columns on z-coordinate.
 * As particles are very often distributed homogeneously, we use a sorting
 * algorithm similar to pigeonhole sort. We multiply the z-coordinate
 * by a factor, cast to an int and try to store in that hole. If the hole
 * is full, we move this or another particle. A second pass is needed to make
 * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
 * 4 is the optimal value for a homogeneous particle distribution and allows
 * for an O(#particles) sort up till distributions where all particles
 * are concentrated in 1/4 of the space. No NlogN fallback is implemented,
 * as it can be expensive to detect inhomogeneous particle distributions.
 * SGSF is the maximum ratio of holes used, in the worst case all particles
 * end up in the last hole and we need #particles extra holes at the end.
 */
#define SORT_GRID_OVERSIZE 4
#define SGSF (SORT_GRID_OVERSIZE + 1)
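
/* Worked example (illustrative): sorting n = 100 particles with
 * n_per_h = 100 uses at most n_per_h*SORT_GRID_OVERSIZE + n = 500 holes,
 * which is why callers allocate the sort work array with n*SGSF = 5*n
 * elements.
 */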

/* Sort particle index a on coordinates x along dim.
 * Backwards tells if we want decreasing instead of increasing coordinates.
 * h0 is the minimum of the coordinate range.
 * invh is the 1/length of the sorting range.
 * n_per_h (>=n) is the expected average number of particles per 1/invh.
 * sort is the sorting work array.
 * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
 * or easier, allocate at least n*SGSF elements.
 */
static void sort_atoms(int dim, gmx_bool Backwards,
                       int gmx_unused dd_zone,
                       int *a, int n, rvec *x,
                       real h0, real invh, int n_per_h,
                       int *sort)
{
    int nsort, i, c;
    int zi, zim, zi_min, zi_max;
    int cp, tmp;

    if (n <= 1)
    {
        /* Nothing to do */
        return;
    }

    if (n > n_per_h)
    {
        gmx_incons("n > n_per_h");
    }

    /* Transform the inverse range height into the inverse hole height */
    invh *= n_per_h*SORT_GRID_OVERSIZE;

    /* Set nsort to the maximum possible number of holes used.
     * In worst case all n elements end up in the last bin.
     */
    nsort = n_per_h*SORT_GRID_OVERSIZE + n;

    /* Determine the index range used, so we can limit it for the second pass */
    zi_min = INT_MAX;
    zi_max = -1;

    /* Sort the particles using a simple index sort */
    for (i = 0; i < n; i++)
    {
        /* The cast takes care of float-point rounding effects below zero.
         * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
         * times the box height out of the box.
         */
        zi = (int)((x[a[i]][dim] - h0)*invh);

#ifndef NDEBUG
        /* As we can have rounding effects, we use > instead of >= here */
        if (zi < 0 || (dd_zone == 0 && zi > n_per_h*SORT_GRID_OVERSIZE))
        {
            gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
                      a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
                      n_per_h, SORT_GRID_OVERSIZE);
        }
#endif

        /* In a non-local domain, particles communicated for bonded interactions
         * can be far beyond the grid size, which is set by the non-bonded
         * cut-off distance. We sort such particles into the last cell.
         */
        if (zi > n_per_h*SORT_GRID_OVERSIZE)
        {
            zi = n_per_h*SORT_GRID_OVERSIZE;
        }

        /* Ideally this particle should go in sort cell zi,
         * but that might already be in use,
         * in that case find the first empty cell higher up
         */
        if (sort[zi] < 0)
        {
            sort[zi] = a[i];
            zi_min   = min(zi_min, zi);
            zi_max   = max(zi_max, zi);
        }
        else
        {
            /* We have multiple atoms in the same sorting slot.
             * Sort on real z for minimal bounding box size.
             * There is an extra check for identical z to ensure
             * well-defined output order, independent of input order
             * to ensure binary reproducibility after restarts.
             */
            while (sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
                                      (x[a[i]][dim] == x[sort[zi]][dim] &&
                                       a[i] > sort[zi])))
            {
                zi++;
            }

            if (sort[zi] >= 0)
            {
                /* Shift all elements by one slot until we find an empty slot */
                cp  = sort[zi];
                zim = zi + 1;
                while (sort[zim] >= 0)
                {
                    tmp       = sort[zim];
                    sort[zim] = cp;
                    cp        = tmp;
                    zim++;
                }
                sort[zim] = cp;
                zi_max    = max(zi_max, zim);
            }
            sort[zi] = a[i];
            zi_max   = max(zi_max, zi);
        }
    }

    c = 0;
    if (!Backwards)
    {
        for (zi = 0; zi < nsort; zi++)
        {
            if (sort[zi] >= 0)
            {
                a[c++]   = sort[zi];
                sort[zi] = -1;
            }
        }
    }
    else
    {
        for (zi = zi_max; zi >= zi_min; zi--)
        {
            if (sort[zi] >= 0)
            {
                a[c++]   = sort[zi];
                sort[zi] = -1;
            }
        }
    }
    if (c < n)
    {
        gmx_incons("Lost particles while sorting");
    }
}

#define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
#define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
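
/* Explanatory note (not part of the original source): R2F_D rounds a real
 * down and R2F_U rounds up when converting to float, so a double-precision
 * bound never shrinks. E.g. a lower bound of 1.0 becomes (1-GMX_FLOAT_EPS)*1.0
 * while an upper bound becomes (1+GMX_FLOAT_EPS)*1.0, keeping the single
 * precision bounding box conservative.
 */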

/* Coordinate order x,y,z, bb order xyz0 */
static void calc_bounding_box(int na, int stride, const real *x, nbnxn_bb_t *bb)
{
    int  i, j;
    real xl, xh, yl, yh, zl, zh;

    i  = 0;
    xl = x[i+XX];
    xh = x[i+XX];
    yl = x[i+YY];
    yh = x[i+YY];
    zl = x[i+ZZ];
    zh = x[i+ZZ];
    i += stride;
    for (j = 1; j < na; j++)
    {
        xl = min(xl, x[i+XX]);
        xh = max(xh, x[i+XX]);
        yl = min(yl, x[i+YY]);
        yh = max(yh, x[i+YY]);
        zl = min(zl, x[i+ZZ]);
        zh = max(zh, x[i+ZZ]);
        i += stride;
    }
    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);
}

/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4(int na, const real *x, nbnxn_bb_t *bb)
{
    int  j;
    real xl, xh, yl, yh, zl, zh;

    xl = x[XX*PACK_X4];
    xh = x[XX*PACK_X4];
    yl = x[YY*PACK_X4];
    yh = x[YY*PACK_X4];
    zl = x[ZZ*PACK_X4];
    zh = x[ZZ*PACK_X4];
    for (j = 1; j < na; j++)
    {
        xl = min(xl, x[j+XX*PACK_X4]);
        xh = max(xh, x[j+XX*PACK_X4]);
        yl = min(yl, x[j+YY*PACK_X4]);
        yh = max(yh, x[j+YY*PACK_X4]);
        zl = min(zl, x[j+ZZ*PACK_X4]);
        zh = max(zh, x[j+ZZ*PACK_X4]);
    }
    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);
}

/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x8(int na, const real *x, nbnxn_bb_t *bb)
{
    int  j;
    real xl, xh, yl, yh, zl, zh;

    xl = x[XX*PACK_X8];
    xh = x[XX*PACK_X8];
    yl = x[YY*PACK_X8];
    yh = x[YY*PACK_X8];
    zl = x[ZZ*PACK_X8];
    zh = x[ZZ*PACK_X8];
    for (j = 1; j < na; j++)
    {
        xl = min(xl, x[j+XX*PACK_X8]);
        xh = max(xh, x[j+XX*PACK_X8]);
        yl = min(yl, x[j+YY*PACK_X8]);
        yh = max(yh, x[j+YY*PACK_X8]);
        zl = min(zl, x[j+ZZ*PACK_X8]);
        zh = max(zh, x[j+ZZ*PACK_X8]);
    }
    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);
}

/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4_halves(int na, const real *x,
                                          nbnxn_bb_t *bb, nbnxn_bb_t *bbj)
{
    calc_bounding_box_x_x4(min(na, 2), x, bbj);

    if (na > 2)
    {
        calc_bounding_box_x_x4(min(na-2, 2), x+(PACK_X4>>1), bbj+1);
    }
    else
    {
        /* Set the "empty" bounding box to the same as the first one,
         * so we don't need to treat special cases in the rest of the code.
         */
#ifdef NBNXN_SEARCH_BB_SIMD4
        gmx_simd4_store_f(&bbj[1].lower[0], gmx_simd4_load_f(&bbj[0].lower[0]));
        gmx_simd4_store_f(&bbj[1].upper[0], gmx_simd4_load_f(&bbj[0].upper[0]));
#else
        bbj[1] = bbj[0];
#endif
    }

#ifdef NBNXN_SEARCH_BB_SIMD4
    gmx_simd4_store_f(&bb->lower[0],
                      gmx_simd4_min_f(gmx_simd4_load_f(&bbj[0].lower[0]),
                                      gmx_simd4_load_f(&bbj[1].lower[0])));
    gmx_simd4_store_f(&bb->upper[0],
                      gmx_simd4_max_f(gmx_simd4_load_f(&bbj[0].upper[0]),
                                      gmx_simd4_load_f(&bbj[1].upper[0])));
#else
    {
        int i;

        for (i = 0; i < NNBSBB_C; i++)
        {
            bb->lower[i] = min(bbj[0].lower[i], bbj[1].lower[i]);
            bb->upper[i] = max(bbj[0].upper[i], bbj[1].upper[i]);
        }
    }
#endif
}

#ifdef NBNXN_SEARCH_BB_SIMD4

/* Coordinate order xyz, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
{
    int  i, j;
    real xl, xh, yl, yh, zl, zh;

    i  = 0;
    xl = x[i+XX];
    xh = x[i+XX];
    yl = x[i+YY];
    yh = x[i+YY];
    zl = x[i+ZZ];
    zh = x[i+ZZ];
    i += stride;
    for (j = 1; j < na; j++)
    {
        xl = min(xl, x[i+XX]);
        xh = max(xh, x[i+XX]);
        yl = min(yl, x[i+YY]);
        yh = max(yh, x[i+YY]);
        zl = min(zl, x[i+ZZ]);
        zh = max(zh, x[i+ZZ]);
        i += stride;
    }
    /* Note: possible double to float conversion here */
    bb[0*STRIDE_PBB] = R2F_D(xl);
    bb[1*STRIDE_PBB] = R2F_D(yl);
    bb[2*STRIDE_PBB] = R2F_D(zl);
    bb[3*STRIDE_PBB] = R2F_U(xh);
    bb[4*STRIDE_PBB] = R2F_U(yh);
    bb[5*STRIDE_PBB] = R2F_U(zh);
}

#endif /* NBNXN_SEARCH_BB_SIMD4 */

#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB

/* Coordinate order xyz?, bb order xyz0 */
static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
{
    gmx_simd4_float_t bb_0_S, bb_1_S;
    gmx_simd4_float_t x_S;
    int               i;

    bb_0_S = gmx_simd4_load_f(x);
    bb_1_S = bb_0_S;

    for (i = 1; i < na; i++)
    {
        x_S    = gmx_simd4_load_f(x+i*NNBSBB_C);
        bb_0_S = gmx_simd4_min_f(bb_0_S, x_S);
        bb_1_S = gmx_simd4_max_f(bb_1_S, x_S);
    }

    gmx_simd4_store_f(&bb->lower[0], bb_0_S);
    gmx_simd4_store_f(&bb->upper[0], bb_1_S);
}

/* Coordinate order xyz?, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx_simd4(int na, const float *x,
                                         nbnxn_bb_t *bb_work_aligned,
                                         float *bb)
{
    calc_bounding_box_simd4(na, x, bb_work_aligned);

    bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
    bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
    bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z];
    bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X];
    bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y];
    bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];
}

#endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */

/* Combines pairs of consecutive bounding boxes */
static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const nbnxn_bb_t *bb)
{
    int i, j, sc2, nc2, c2;

    for (i = 0; i < grid->ncx*grid->ncy; i++)
    {
        /* Starting bb in a column is expected to be 2-aligned */
        sc2 = grid->cxy_ind[i]>>1;
        /* For odd numbers skip the last bb here */
        nc2 = (grid->cxy_na[i]+3)>>(2+1);
        for (c2 = sc2; c2 < sc2+nc2; c2++)
        {
#ifdef NBNXN_SEARCH_BB_SIMD4
            gmx_simd4_float_t min_S, max_S;

            min_S = gmx_simd4_min_f(gmx_simd4_load_f(&bb[c2*2+0].lower[0]),
                                    gmx_simd4_load_f(&bb[c2*2+1].lower[0]));
            max_S = gmx_simd4_max_f(gmx_simd4_load_f(&bb[c2*2+0].upper[0]),
                                    gmx_simd4_load_f(&bb[c2*2+1].upper[0]));
            gmx_simd4_store_f(&grid->bbj[c2].lower[0], min_S);
            gmx_simd4_store_f(&grid->bbj[c2].upper[0], max_S);
#else
            for (j = 0; j < NNBSBB_C; j++)
            {
                grid->bbj[c2].lower[j] = min(bb[c2*2+0].lower[j],
                                             bb[c2*2+1].lower[j]);
                grid->bbj[c2].upper[j] = max(bb[c2*2+0].upper[j],
                                             bb[c2*2+1].upper[j]);
            }
#endif
        }
        if (((grid->cxy_na[i]+3)>>2) & 1)
        {
            /* The bb count in this column is odd: duplicate the last bb */
            for (j = 0; j < NNBSBB_C; j++)
            {
                grid->bbj[c2].lower[j] = bb[c2*2].lower[j];
                grid->bbj[c2].upper[j] = bb[c2*2].upper[j];
            }
        }
    }
}
998 static void print_bbsizes_simple(FILE *fp
,
999 const nbnxn_search_t nbs
,
1000 const nbnxn_grid_t
*grid
)
1006 for (c
= 0; c
< grid
->nc
; c
++)
1008 for (d
= 0; d
< DIM
; d
++)
1010 ba
[d
] += grid
->bb
[c
].upper
[d
] - grid
->bb
[c
].lower
[d
];
1013 dsvmul(1.0/grid
->nc
, ba
, ba
);
1015 fprintf(fp
, "ns bb: %4.2f %4.2f %4.2f %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
1016 nbs
->box
[XX
][XX
]/grid
->ncx
,
1017 nbs
->box
[YY
][YY
]/grid
->ncy
,
1018 nbs
->box
[ZZ
][ZZ
]*grid
->ncx
*grid
->ncy
/grid
->nc
,
1019 ba
[XX
], ba
[YY
], ba
[ZZ
],
1020 ba
[XX
]*grid
->ncx
/nbs
->box
[XX
][XX
],
1021 ba
[YY
]*grid
->ncy
/nbs
->box
[YY
][YY
],
1022 ba
[ZZ
]*grid
->nc
/(grid
->ncx
*grid
->ncy
*nbs
->box
[ZZ
][ZZ
]));

/* Prints the average bb size, used for debug output */
static void print_bbsizes_supersub(FILE                *fp,
                                   const nbnxn_search_t nbs,
                                   const nbnxn_grid_t  *grid)
{
    int  ns, c, s, i, d, cs_w, cs;
    dvec ba;

    clear_dvec(ba);
    ns = 0;
    for (c = 0; c < grid->nc; c++)
    {
#ifdef NBNXN_BBXXXX
        for (s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
        {
            cs_w = (c*GPU_NSUBCELL + s)/STRIDE_PBB;
            for (i = 0; i < STRIDE_PBB; i++)
            {
                for (d = 0; d < DIM; d++)
                {
                    ba[d] +=
                        grid->pbb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
                        grid->pbb[cs_w*NNBSBB_XXXX+     d *STRIDE_PBB+i];
                }
            }
        }
#else
        for (s = 0; s < grid->nsubc[c]; s++)
        {
            cs = c*GPU_NSUBCELL + s;
            for (d = 0; d < DIM; d++)
            {
                ba[d] += grid->bb[cs].upper[d] - grid->bb[cs].lower[d];
            }
        }
#endif
        ns += grid->nsubc[c];
    }
    dsvmul(1.0/ns, ba, ba);

    fprintf(fp, "ns bb: %4.2f %4.2f %4.2f %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
            nbs->box[XX][XX]/(grid->ncx*GPU_NSUBCELL_X),
            nbs->box[YY][YY]/(grid->ncy*GPU_NSUBCELL_Y),
            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z),
            ba[XX], ba[YY], ba[ZZ],
            ba[XX]*grid->ncx*GPU_NSUBCELL_X/nbs->box[XX][XX],
            ba[YY]*grid->ncy*GPU_NSUBCELL_Y/nbs->box[YY][YY],
            ba[ZZ]*grid->nc*GPU_NSUBCELL_Z/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
}

/* Potentially sorts atoms on LJ coefficients !=0 and ==0.
 * Also sets interaction flags.
 */
void sort_on_lj(int na_c,
                int a0, int a1, const int *atinfo,
                int *order,
                int *flags)
{
    int      subc, s, a, n1, n2, a_lj_max, i, j;
    int      sort1[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
    int      sort2[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
    gmx_bool haveQ, bFEP;

    *flags = 0;

    subc = 0;
    for (s = a0; s < a1; s += na_c)
    {
        /* Make lists for this (sub-)cell on atoms with and without LJ */
        n1       = 0;
        n2       = 0;
        haveQ    = FALSE;
        a_lj_max = -1;
        for (a = s; a < min(s+na_c, a1); a++)
        {
            haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);

            if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
            {
                sort1[n1++] = order[a];
                a_lj_max    = a;
            }
            else
            {
                sort2[n2++] = order[a];
            }
        }

        /* If we don't have atoms with LJ, there's nothing to sort */
        if (n1 > 0)
        {
            *flags |= NBNXN_CI_DO_LJ(subc);

            if (2*n1 <= na_c)
            {
                /* Only sort when strictly necessary. Ordering particles
                 * can lead to less accurate summation due to rounding,
                 * both for LJ and Coulomb interactions.
                 */
                if (2*(a_lj_max - s) >= na_c)
                {
                    for (i = 0; i < n1; i++)
                    {
                        order[a0+i] = sort1[i];
                    }
                    for (j = 0; j < n2; j++)
                    {
                        order[a0+n1+j] = sort2[j];
                    }
                }

                *flags |= NBNXN_CI_HALF_LJ(subc);
            }
        }
        if (haveQ)
        {
            *flags |= NBNXN_CI_DO_COUL(subc);
        }
        subc++;
    }
}

/* Fill a pair search cell with atoms.
 * Potentially sorts atoms and sets the interaction flags.
 */
void fill_cell(const nbnxn_search_t nbs,
               nbnxn_grid_t        *grid,
               nbnxn_atomdata_t    *nbat,
               int a0, int a1,
               const int           *atinfo,
               rvec                *x,
               int sx, int sy, int sz,
               nbnxn_bb_t gmx_unused *bb_work_aligned)
{
    int         na, a;
    int         offset;
    nbnxn_bb_t *bb_ptr;
#ifdef NBNXN_BBXXXX
    float      *pbb_ptr;
#endif

    na = a1 - a0;

    if (grid->bSimple)
    {
        sort_on_lj(grid->na_c, a0, a1, atinfo, nbs->a,
                   grid->flags+(a0>>grid->na_c_2log)-grid->cell0);
    }

    if (nbs->bFEP)
    {
        /* Set the fep flag for perturbed atoms in this (sub-)cell */
        int c, at;

        /* The grid-local cluster/(sub-)cell index */
        c = (a0 >> grid->na_c_2log) - grid->cell0*(grid->bSimple ? 1 : GPU_NSUBCELL);
        grid->fep[c] = 0;
        for (at = a0; at < a1; at++)
        {
            if (nbs->a[at] >= 0 && GET_CGINFO_FEP(atinfo[nbs->a[at]]))
            {
                grid->fep[c] |= (1 << (at - a0));
            }
        }
    }

    /* Now we have sorted the atoms, set the cell indices */
    for (a = a0; a < a1; a++)
    {
        nbs->cell[nbs->a[a]] = a;
    }

    copy_rvec_to_nbat_real(nbs->a+a0, a1-a0, grid->na_c, x,
                           nbat->XFormat, nbat->x, a0,
                           sx, sy, sz);

    if (nbat->XFormat == nbatX4)
    {
        /* Store the bounding boxes as xyz.xyz. */
        offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
        bb_ptr = grid->bb + offset;

#if defined GMX_NBNXN_SIMD && GMX_SIMD_REAL_WIDTH == 2
        if (2*grid->na_cj == grid->na_c)
        {
            calc_bounding_box_x_x4_halves(na, nbat->x+X4_IND_A(a0), bb_ptr,
                                          grid->bbj+offset*2);
        }
        else
#endif
        {
            calc_bounding_box_x_x4(na, nbat->x+X4_IND_A(a0), bb_ptr);
        }
    }
    else if (nbat->XFormat == nbatX8)
    {
        /* Store the bounding boxes as xyz.xyz. */
        offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
        bb_ptr = grid->bb + offset;

        calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(a0), bb_ptr);
    }
#ifdef NBNXN_BBXXXX
    else if (!grid->bSimple)
    {
        /* Store the bounding boxes in a format convenient
         * for SIMD4 calculations: xxxxyyyyzzzz...
         */
        pbb_ptr =
            grid->pbb +
            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));

#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB
        if (nbat->XFormat == nbatXYZQ)
        {
            calc_bounding_box_xxxx_simd4(na, nbat->x+a0*nbat->xstride,
                                         bb_work_aligned, pbb_ptr);
        }
        else
#endif
        {
            calc_bounding_box_xxxx(na, nbat->xstride, nbat->x+a0*nbat->xstride,
                                   pbb_ptr);
        }
        if (gmx_debug_at)
        {
            fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                    sx, sy, sz,
                    pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
                    pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
                    pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);
        }
    }
#endif
    else
    {
        /* Store the bounding boxes as xyz.xyz. */
        bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log);

        calc_bounding_box(na, nbat->xstride, nbat->x+a0*nbat->xstride,
                          bb_ptr);

        if (gmx_debug_at)
        {
            int bbo;

            bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
            fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                    sx, sy, sz,
                    grid->bb[bbo].lower[BB_X],
                    grid->bb[bbo].lower[BB_Y],
                    grid->bb[bbo].lower[BB_Z],
                    grid->bb[bbo].upper[BB_X],
                    grid->bb[bbo].upper[BB_Y],
                    grid->bb[bbo].upper[BB_Z]);
        }
    }
}

/* Spatially sort the atoms within one grid column */
static void sort_columns_simple(const nbnxn_search_t nbs,
                                int                  dd_zone,
                                nbnxn_grid_t        *grid,
                                int a0, int a1,
                                const int           *atinfo,
                                rvec                *x,
                                nbnxn_atomdata_t    *nbat,
                                int cxy_start, int cxy_end,
                                int *sort_work)
{
    int cxy;
    int cx, cy, cz, ncz, cfilled, c;
    int na, ash, ind, a;
    int na_c, ash_c;

    if (debug)
    {
        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
                grid->cell0, cxy_start, cxy_end, a0, a1);
    }

    /* Sort the atoms within each x,y column in 3 dimensions */
    for (cxy = cxy_start; cxy < cxy_end; cxy++)
    {
        cx = cxy/grid->ncy;
        cy = cxy - cx*grid->ncy;

        na  = grid->cxy_na[cxy];
        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ, FALSE, dd_zone,
                   nbs->a+ash, na, x,
                   grid->c0[ZZ],
                   1.0/nbs->box[ZZ][ZZ], ncz*grid->na_sc,
                   sort_work);

        /* Fill the ncz cells in this column */
        cfilled = grid->cxy_ind[cxy];
        for (cz = 0; cz < ncz; cz++)
        {
            c = grid->cxy_ind[cxy] + cz;

            ash_c = ash + cz*grid->na_sc;
            na_c  = min(grid->na_sc, na-(ash_c-ash));

            fill_cell(nbs, grid, nbat,
                      ash_c, ash_c+na_c, atinfo, x,
                      grid->na_sc*cx + (dd_zone >> 2),
                      grid->na_sc*cy + (dd_zone & 3),
                      cz,
                      NULL);

            /* This copy to bbcz is not really necessary.
             * But it allows to use the same grid search code
             * for the simple and supersub cell setups.
             */
            if (na_c > 0)
            {
                cfilled = c;
            }
            grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled].lower[BB_Z];
            grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled].upper[BB_Z];
        }

        /* Set the unused atom indices to -1 */
        for (ind = na; ind < ncz*grid->na_sc; ind++)
        {
            nbs->a[ash+ind] = -1;
        }
    }
}

/* Spatially sort the atoms within one grid column */
static void sort_columns_supersub(const nbnxn_search_t nbs,
                                  int                  dd_zone,
                                  nbnxn_grid_t        *grid,
                                  int a0, int a1,
                                  const int           *atinfo,
                                  rvec                *x,
                                  nbnxn_atomdata_t    *nbat,
                                  int cxy_start, int cxy_end,
                                  int *sort_work)
{
    int        cxy;
    int        cx, cy, cz = -1, c = -1, ncz;
    int        na, ash, na_c, ind, a;
    int        subdiv_z, sub_z, na_z, ash_z;
    int        subdiv_y, sub_y, na_y, ash_y;
    int        subdiv_x, sub_x, na_x, ash_x;

    nbnxn_bb_t bb_work_array[2], *bb_work_aligned;

    bb_work_aligned = (nbnxn_bb_t *)(((size_t)(bb_work_array+1)) & (~((size_t)15)));

    if (debug)
    {
        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
                grid->cell0, cxy_start, cxy_end, a0, a1);
    }

    subdiv_x = grid->na_c;
    subdiv_y = GPU_NSUBCELL_X*subdiv_x;
    subdiv_z = GPU_NSUBCELL_Y*subdiv_y;

    /* Sort the atoms within each x,y column in 3 dimensions */
    for (cxy = cxy_start; cxy < cxy_end; cxy++)
    {
        cx = cxy/grid->ncy;
        cy = cxy - cx*grid->ncy;

        na  = grid->cxy_na[cxy];
        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ, FALSE, dd_zone,
                   nbs->a+ash, na, x,
                   grid->c0[ZZ],
                   1.0/nbs->box[ZZ][ZZ], ncz*grid->na_sc,
                   sort_work);

        /* This loop goes over the supercells and subcells along z at once */
        for (sub_z = 0; sub_z < ncz*GPU_NSUBCELL_Z; sub_z++)
        {
            ash_z = ash + sub_z*subdiv_z;
            na_z  = min(subdiv_z, na-(ash_z-ash));

            /* We have already sorted on z */

            if (sub_z % GPU_NSUBCELL_Z == 0)
            {
                cz = sub_z/GPU_NSUBCELL_Z;
                c  = grid->cxy_ind[cxy] + cz;

                /* The number of atoms in this supercell */
                na_c = min(grid->na_sc, na-(ash_z-ash));

                grid->nsubc[c] = min(GPU_NSUBCELL, (na_c+grid->na_c-1)/grid->na_c);

                /* Store the z-boundaries of the super cell */
                grid->bbcz[c*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
                grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z+na_c-1]][ZZ];
            }

#if GPU_NSUBCELL_Y > 1
            /* Sort the atoms along y */
            sort_atoms(YY, (sub_z & 1), dd_zone,
                       nbs->a+ash_z, na_z, x,
                       grid->c0[YY]+cy*grid->sy,
                       grid->inv_sy, subdiv_z,
                       sort_work);
#endif

            for (sub_y = 0; sub_y < GPU_NSUBCELL_Y; sub_y++)
            {
                ash_y = ash_z + sub_y*subdiv_y;
                na_y  = min(subdiv_y, na-(ash_y-ash));

#if GPU_NSUBCELL_X > 1
                /* Sort the atoms along x */
                sort_atoms(XX, ((cz*GPU_NSUBCELL_Y + sub_y) & 1), dd_zone,
                           nbs->a+ash_y, na_y, x,
                           grid->c0[XX]+cx*grid->sx,
                           grid->inv_sx, subdiv_y,
                           sort_work);
#endif

                for (sub_x = 0; sub_x < GPU_NSUBCELL_X; sub_x++)
                {
                    ash_x = ash_y + sub_x*subdiv_x;
                    na_x  = min(subdiv_x, na-(ash_x-ash));

                    fill_cell(nbs, grid, nbat,
                              ash_x, ash_x+na_x, atinfo, x,
                              grid->na_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2),
                              grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3),
                              grid->na_c*sub_z,
                              bb_work_aligned);
                }
            }
        }

        /* Set the unused atom indices to -1 */
        for (ind = na; ind < ncz*grid->na_sc; ind++)
        {
            nbs->a[ash+ind] = -1;
        }
    }
}

/* Determine in which grid column atoms should go */
static void calc_column_indices(nbnxn_grid_t *grid,
                                int a0, int a1,
                                rvec *x,
                                int dd_zone, const int *move,
                                int thread, int nthread,
                                int *cell,
                                int *cxy_na)
{
    int i, n0, n1;
    int cx, cy;

    /* We add one extra cell for particles which moved during DD */
    for (i = 0; i < grid->ncx*grid->ncy+1; i++)
    {
        cxy_na[i] = 0;
    }

    n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread;
    n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread;
    if (dd_zone == 0)
    {
        /* Home zone */
        for (i = n0; i < n1; i++)
        {
            if (move == NULL || move[i] >= 0)
            {
                /* We need to be careful with rounding,
                 * particles might be a few bits outside the local zone.
                 * The int cast takes care of the lower bound,
                 * we will explicitly take care of the upper bound.
                 */
                cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
                cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);

#ifndef NDEBUG
                if (cx < 0 || cx > grid->ncx ||
                    cy < 0 || cy > grid->ncy)
                {
                    gmx_fatal(FARGS,
                              "grid cell cx %d cy %d out of range (max %d %d)\n"
                              "atom %f %f %f, grid->c0 %f %f",
                              cx, cy, grid->ncx, grid->ncy,
                              x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);
                }
#endif
                /* Take care of potential rounding issues */
                cx = min(cx, grid->ncx - 1);
                cy = min(cy, grid->ncy - 1);

                /* For the moment cell will contain only the, grid local,
                 * x and y indices, not z.
                 */
                cell[i] = cx*grid->ncy + cy;
            }
            else
            {
                /* Put this moved particle after the end of the grid,
                 * so we can process it later without using conditionals.
                 */
                cell[i] = grid->ncx*grid->ncy;
            }

            cxy_na[cell[i]]++;
        }
    }
    else
    {
        /* Non-home zone */
        for (i = n0; i < n1; i++)
        {
            cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
            cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);

            /* For non-home zones there could be particles outside
             * the non-bonded cut-off range, which have been communicated
             * for bonded interactions only. For the result it doesn't
             * matter where these end up on the grid. For performance
             * we put them in an extra row at the border.
             */
            cx = max(cx, 0);
            cx = min(cx, grid->ncx - 1);
            cy = max(cy, 0);
            cy = min(cy, grid->ncy - 1);

            /* For the moment cell will contain only the, grid local,
             * x and y indices, not z.
             */
            cell[i] = cx*grid->ncy + cy;

            cxy_na[cell[i]]++;
        }
    }
}

/* Determine in which grid cells the atoms should go */
static void calc_cell_indices(const nbnxn_search_t nbs,
                              int                  dd_zone,
                              nbnxn_grid_t        *grid,
                              int a0, int a1,
                              const int           *atinfo,
                              rvec                *x,
                              const int           *move,
                              nbnxn_atomdata_t    *nbat)
{
    int   n0, n1, i;
    int   cx, cy, cxy, ncz_max, ncz;
    int   nthread, thread;
    int  *cxy_na, cxy_na_i;

    nthread = gmx_omp_nthreads_get(emntPairsearch);

#pragma omp parallel for num_threads(nthread) schedule(static)
    for (thread = 0; thread < nthread; thread++)
    {
        calc_column_indices(grid, a0, a1, x, dd_zone, move, thread, nthread,
                            nbs->cell, nbs->work[thread].cxy_na);
    }

    /* Make the cell index as a function of x and y */
    ncz_max          = 0;
    ncz              = 0;
    grid->cxy_ind[0] = 0;
    for (i = 0; i < grid->ncx*grid->ncy+1; i++)
    {
        /* We set ncz_max at the beginning of the loop instead of at the end
         * to skip i=grid->ncx*grid->ncy, which holds the moved particles
         * that do not need to be ordered on the grid.
         */
        if (ncz > ncz_max)
        {
            ncz_max = ncz;
        }
        cxy_na_i = nbs->work[0].cxy_na[i];
        for (thread = 1; thread < nthread; thread++)
        {
            cxy_na_i += nbs->work[thread].cxy_na[i];
        }
        ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
        if (nbat->XFormat == nbatX8)
        {
            /* Make the number of cells a multiple of 2 */
            ncz = (ncz + 1) & ~1;
        }
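        /* (Illustrative note: this rounding maps e.g. ncz = 5 to 6 and leaves
         * ncz = 6 unchanged; nbatX8 packs atoms of two cells, so the cell
         * count per column must be even.)
         */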
        grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
        /* Clear cxy_na, so we can reuse the array below */
        grid->cxy_na[i] = 0;
    }
    grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];

    nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;

    if (debug)
    {
        fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
                grid->na_sc, grid->na_c, grid->nc,
                grid->ncx, grid->ncy, grid->nc/((double)(grid->ncx*grid->ncy)),
                ncz_max);
        if (gmx_debug_at)
        {
            i = 0;
            for (cy = 0; cy < grid->ncy; cy++)
            {
                for (cx = 0; cx < grid->ncx; cx++)
                {
                    fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
                    i++;
                }
                fprintf(debug, "\n");
            }
        }
    }

    /* Make sure the work array for sorting is large enough */
    if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
    {
        for (thread = 0; thread < nbs->nthread_max; thread++)
        {
            nbs->work[thread].sort_work_nalloc =
                over_alloc_large(ncz_max*grid->na_sc*SGSF);
            srenew(nbs->work[thread].sort_work,
                   nbs->work[thread].sort_work_nalloc);
            /* When not in use, all elements should be -1 */
            for (i = 0; i < nbs->work[thread].sort_work_nalloc; i++)
            {
                nbs->work[thread].sort_work[i] = -1;
            }
        }
    }

    /* Now we know the dimensions we can fill the grid.
     * This is the first, unsorted fill. We sort the columns after this.
     */
    for (i = a0; i < a1; i++)
    {
        /* At this point nbs->cell contains the local grid x,y indices */
        cxy = nbs->cell[i];
        nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
    }

    /* Set the cell indices for the moved particles */
    n0 = grid->nc*grid->na_sc;
    n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
    if (dd_zone == 0)
    {
        for (i = n0; i < n1; i++)
        {
            nbs->cell[nbs->a[i]] = i;
        }
    }

    /* Sort the super-cell columns along z into the sub-cells. */
#pragma omp parallel for num_threads(nbs->nthread_max) schedule(static)
    for (thread = 0; thread < nbs->nthread_max; thread++)
    {
        if (grid->bSimple)
        {
            sort_columns_simple(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
                                ((thread+0)*grid->ncx*grid->ncy)/nthread,
                                ((thread+1)*grid->ncx*grid->ncy)/nthread,
                                nbs->work[thread].sort_work);
        }
        else
        {
            sort_columns_supersub(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
                                  ((thread+0)*grid->ncx*grid->ncy)/nthread,
                                  ((thread+1)*grid->ncx*grid->ncy)/nthread,
                                  nbs->work[thread].sort_work);
        }
    }

    if (grid->bSimple && nbat->XFormat == nbatX8)
    {
        combine_bounding_box_pairs(grid, grid->bb);
    }

    if (!grid->bSimple)
    {
        grid->nsubc_tot = 0;
        for (i = 0; i < grid->nc; i++)
        {
            grid->nsubc_tot += grid->nsubc[i];
        }
    }

    if (debug)
    {
        if (grid->bSimple)
        {
            print_bbsizes_simple(debug, nbs, grid);
        }
        else
        {
            fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
                    grid->nsubc_tot, (a1-a0)/(double)grid->nsubc_tot);

            print_bbsizes_supersub(debug, nbs, grid);
        }
    }
}

static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
                              int                   natoms)
{
    int b;

    flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
    if (flags->nflag > flags->flag_nalloc)
    {
        flags->flag_nalloc = over_alloc_large(flags->nflag);
        srenew(flags->flag, flags->flag_nalloc);
    }
    for (b = 0; b < flags->nflag; b++)
    {
        flags->flag[b] = 0;
    }
}
1763 * This function only operates on one domain of the domain decompostion.
1764 * Note that without domain decomposition there is only one domain.
1766 void nbnxn_put_on_grid(nbnxn_search_t nbs
,
1767 int ePBC
, matrix box
,
1769 rvec corner0
, rvec corner1
,
1774 int nmoved
, int *move
,
1776 nbnxn_atomdata_t
*nbat
)
1780 int nc_max_grid
, nc_max
;
1782 grid
= &nbs
->grid
[dd_zone
];
1784 nbs_cycle_start(&nbs
->cc
[enbsCCgrid
]);
1786 grid
->bSimple
= nbnxn_kernel_pairlist_simple(nb_kernel_type
);
1788 grid
->na_c
= nbnxn_kernel_to_ci_size(nb_kernel_type
);
1789 grid
->na_cj
= nbnxn_kernel_to_cj_size(nb_kernel_type
);
1790 grid
->na_sc
= (grid
->bSimple
? 1 : GPU_NSUBCELL
)*grid
->na_c
;
1791 grid
->na_c_2log
= get_2log(grid
->na_c
);
1793 nbat
->na_c
= grid
->na_c
;
1802 (nbs
->grid
[dd_zone
-1].cell0
+ nbs
->grid
[dd_zone
-1].nc
)*
1803 nbs
->grid
[dd_zone
-1].na_sc
/grid
->na_sc
;
1811 copy_mat(box
, nbs
->box
);
1813 if (atom_density
>= 0)
1815 grid
->atom_density
= atom_density
;
1819 grid
->atom_density
= grid_atom_density(n
-nmoved
, corner0
, corner1
);
1824 nbs
->natoms_local
= a1
- nmoved
;
1825 /* We assume that nbnxn_put_on_grid is called first
1826 * for the local atoms (dd_zone=0).
1828 nbs
->natoms_nonlocal
= a1
- nmoved
;
1832 nbs
->natoms_nonlocal
= max(nbs
->natoms_nonlocal
, a1
);
1835 nc_max_grid
= set_grid_size_xy(nbs
, grid
,
1836 dd_zone
, n
-nmoved
, corner0
, corner1
,
1837 nbs
->grid
[0].atom_density
);
1839 nc_max
= grid
->cell0
+ nc_max_grid
;
1841 if (a1
> nbs
->cell_nalloc
)
1843 nbs
->cell_nalloc
= over_alloc_large(a1
);
1844 srenew(nbs
->cell
, nbs
->cell_nalloc
);
1847 /* To avoid conditionals we store the moved particles at the end of a,
1848 * make sure we have enough space.
1850 if (nc_max
*grid
->na_sc
+ nmoved
> nbs
->a_nalloc
)
1852 nbs
->a_nalloc
= over_alloc_large(nc_max
*grid
->na_sc
+ nmoved
);
1853 srenew(nbs
->a
, nbs
->a_nalloc
);
1856 /* We need padding up to a multiple of the buffer flag size: simply add */
1857 if (nc_max
*grid
->na_sc
+ NBNXN_BUFFERFLAG_SIZE
> nbat
->nalloc
)
1859 nbnxn_atomdata_realloc(nbat
, nc_max
*grid
->na_sc
+NBNXN_BUFFERFLAG_SIZE
);
1862 calc_cell_indices(nbs
, dd_zone
, grid
, a0
, a1
, atinfo
, x
, move
, nbat
);
1866 nbat
->natoms_local
= nbat
->natoms
;
1869 nbs_cycle_stop(&nbs
->cc
[enbsCCgrid
]);

/* Calls nbnxn_put_on_grid for all non-local domains */
void nbnxn_put_on_grid_nonlocal(nbnxn_search_t            nbs,
                                const gmx_domdec_zones_t *zones,
                                const int                *atinfo,
                                rvec                     *x,
                                int                       nb_kernel_type,
                                nbnxn_atomdata_t         *nbat)
{
    int  zone, d;
    rvec c0, c1;

    for (zone = 1; zone < zones->n; zone++)
    {
        for (d = 0; d < DIM; d++)
        {
            c0[d] = zones->size[zone].bb_x0[d];
            c1[d] = zones->size[zone].bb_x1[d];
        }

        nbnxn_put_on_grid(nbs, nbs->ePBC, NULL,
                          zone, c0, c1,
                          zones->cg_range[zone],
                          zones->cg_range[zone+1],
                          -1,
                          atinfo,
                          x,
                          0, NULL,
                          nb_kernel_type,
                          nbat);
    }
}

/* Add simple grid type information to the local super/sub grid */
void nbnxn_grid_add_simple(nbnxn_search_t    nbs,
                           nbnxn_atomdata_t *nbat)
{
    nbnxn_grid_t *grid;
    float        *bbcz;
    nbnxn_bb_t   *bb;
    int           ncd, sc;

    grid = &nbs->grid[0];

    if (grid->bSimple)
    {
        gmx_incons("nbnxn_grid_simple called with a simple grid");
    }

    ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE;

    if (grid->nc*ncd > grid->nc_nalloc_simple)
    {
        grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd);
        srenew(grid->bbcz_simple, grid->nc_nalloc_simple*NNBSBB_D);
        srenew(grid->bb_simple, grid->nc_nalloc_simple);
        srenew(grid->flags_simple, grid->nc_nalloc_simple);

        sfree_aligned(grid->bbj);
        snew_aligned(grid->bbj, grid->nc_nalloc_simple/2, 16);
    }

    bbcz = grid->bbcz_simple;
    bb   = grid->bb_simple;

#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
    for (sc = 0; sc < grid->nc; sc++)
    {
        int c, tx, na;

        for (c = 0; c < ncd; c++)
        {
            tx = sc*ncd + c;

            na = NBNXN_CPU_CLUSTER_I_SIZE;
            while (na > 0 &&
                   nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE+na-1] == nbat->ntype-1)
            {
                na--;
            }

            if (na > 0)
            {
                switch (nbat->XFormat)
                {
                    case nbatX4:
                        /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
                        calc_bounding_box_x_x4(na, nbat->x+tx*STRIDE_P4,
                                               bb+tx);
                        break;
                    case nbatX8:
                        /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
                        calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE),
                                               bb+tx);
                        break;
                    default:
                        calc_bounding_box(na, nbat->xstride,
                                          nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,
                                          bb+tx);
                        break;
                }
                bbcz[tx*NNBSBB_D+0] = bb[tx].lower[BB_Z];
                bbcz[tx*NNBSBB_D+1] = bb[tx].upper[BB_Z];

                /* No interaction optimization yet here */
                grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
            }
            else
            {
                grid->flags_simple[tx] = 0;
            }
        }
    }

    if (grid->bSimple && nbat->XFormat == nbatX8)
    {
        combine_bounding_box_pairs(grid, grid->bb_simple);
    }
}

void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
{
    *ncx = nbs->grid[0].ncx;
    *ncy = nbs->grid[0].ncy;
}

void nbnxn_get_atomorder(nbnxn_search_t nbs, int **a, int *n)
{
    const nbnxn_grid_t *grid;

    grid = &nbs->grid[0];

    /* Return the atom order for the home cell (index 0) */
    *a = nbs->a;

    *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
}

void nbnxn_set_atomorder(nbnxn_search_t nbs)
{
    nbnxn_grid_t *grid;
    int           ao, cx, cy, cxy, cz, j;

    /* Set the atom order for the home cell (index 0) */
    grid = &nbs->grid[0];

    ao = 0;
    for (cx = 0; cx < grid->ncx; cx++)
    {
        for (cy = 0; cy < grid->ncy; cy++)
        {
            cxy = cx*grid->ncy + cy;
            j   = grid->cxy_ind[cxy]*grid->na_sc;
            for (cz = 0; cz < grid->cxy_na[cxy]; cz++)
            {
                nbs->a[j]     = ao;
                nbs->cell[ao] = j;
                ao++;
                j++;
            }
        }
    }
}

/* Determines the cell range along one dimension that
 * the bounding box b0 - b1 sees.
 */
static void get_cell_range(real b0, real b1,
                           int nc, real c0, real s, real invs,
                           real d2, real r2, int *cf, int *cl)
{
    *cf = max((int)((b0 - c0)*invs), 0);

    while (*cf > 0 && d2 + sqr((b0 - c0) - (*cf-1+1)*s) < r2)
    {
        (*cf)--;
    }

    *cl = min((int)((b1 - c0)*invs), nc-1);
    while (*cl < nc-1 && d2 + sqr((*cl+1)*s - (b1 - c0)) < r2)
    {
        (*cl)++;
    }
}
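
/* Illustrative note (not part of the original source): the two while loops
 * conservatively widen the initial integer estimates. A neighboring cell is
 * kept as long as its closest face is still within the interaction range r2
 * (squared), given the distance d2 already accumulated from the other
 * dimensions.
 */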

/* Reference code calculating the distance^2 between two bounding boxes */
static float box_dist2(float bx0, float bx1, float by0,
                       float by1, float bz0, float bz1,
                       const nbnxn_bb_t *bb)
{
    float d2;
    float dl, dh, dm, dm0;

    d2 = 0;

    dl  = bx0 - bb->upper[BB_X];
    dh  = bb->lower[BB_X] - bx1;
    dm  = max(dl, dh);
    dm0 = max(dm, 0);
    d2 += dm0*dm0;

    dl  = by0 - bb->upper[BB_Y];
    dh  = bb->lower[BB_Y] - by1;
    dm  = max(dl, dh);
    dm0 = max(dm, 0);
    d2 += dm0*dm0;

    dl  = bz0 - bb->upper[BB_Z];
    dh  = bb->lower[BB_Z] - bz1;
    dm  = max(dl, dh);
    dm0 = max(dm, 0);
    d2 += dm0*dm0;

    return d2;
}

/* Plain C code calculating the distance^2 between two bounding boxes */
static float subc_bb_dist2(int si, const nbnxn_bb_t *bb_i_ci,
                           int csj, const nbnxn_bb_t *bb_j_all)
{
    const nbnxn_bb_t *bb_i, *bb_j;
    float             d2;
    float             dl, dh, dm, dm0;

    bb_i = bb_i_ci  +  si;
    bb_j = bb_j_all + csj;

    d2 = 0;

    dl  = bb_i->lower[BB_X] - bb_j->upper[BB_X];
    dh  = bb_j->lower[BB_X] - bb_i->upper[BB_X];
    dm  = max(dl, dh);
    dm0 = max(dm, 0);
    d2 += dm0*dm0;

    dl  = bb_i->lower[BB_Y] - bb_j->upper[BB_Y];
    dh  = bb_j->lower[BB_Y] - bb_i->upper[BB_Y];
    dm  = max(dl, dh);
    dm0 = max(dm, 0);
    d2 += dm0*dm0;

    dl  = bb_i->lower[BB_Z] - bb_j->upper[BB_Z];
    dh  = bb_j->lower[BB_Z] - bb_i->upper[BB_Z];
    dm  = max(dl, dh);
    dm0 = max(dm, 0);
    d2 += dm0*dm0;

    return d2;
}
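
/* Worked example (illustrative): if the two boxes overlap along x, both dl
 * and dh are negative, so dm0 = max(dl, dh, 0) = 0 and x contributes nothing;
 * only dimensions with a genuine gap add (gap)^2 to the returned distance.
 */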

#ifdef NBNXN_SEARCH_BB_SIMD4

/* 4-wide SIMD code for bb distance for bb format xyz0 */
static float subc_bb_dist2_simd4(int si, const nbnxn_bb_t *bb_i_ci,
                                 int csj, const nbnxn_bb_t *bb_j_all)
{
    gmx_simd4_float_t bb_i_S0, bb_i_S1;
    gmx_simd4_float_t bb_j_S0, bb_j_S1;
    gmx_simd4_float_t dl_S;
    gmx_simd4_float_t dh_S;
    gmx_simd4_float_t dm_S;
    gmx_simd4_float_t dm0_S;

    bb_i_S0 = gmx_simd4_load_f(&bb_i_ci[si].lower[0]);
    bb_i_S1 = gmx_simd4_load_f(&bb_i_ci[si].upper[0]);
    bb_j_S0 = gmx_simd4_load_f(&bb_j_all[csj].lower[0]);
    bb_j_S1 = gmx_simd4_load_f(&bb_j_all[csj].upper[0]);

    dl_S  = gmx_simd4_sub_f(bb_i_S0, bb_j_S1);
    dh_S  = gmx_simd4_sub_f(bb_j_S0, bb_i_S1);

    dm_S  = gmx_simd4_max_f(dl_S, dh_S);
    dm0_S = gmx_simd4_max_f(dm_S, gmx_simd4_setzero_f());

    return gmx_simd4_dotproduct3_f(dm0_S, dm0_S);
}

/* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2 */
#define SUBC_BB_DIST2_SIMD4_XXXX_INNER(si, bb_i, d2)       \
    {                                                      \
        int               shi;                             \
                                                           \
        gmx_simd4_float_t dx_0, dy_0, dz_0;                \
        gmx_simd4_float_t dx_1, dy_1, dz_1;                \
                                                           \
        gmx_simd4_float_t mx, my, mz;                      \
        gmx_simd4_float_t m0x, m0y, m0z;                   \
                                                           \
        gmx_simd4_float_t d2x, d2y, d2z;                   \
        gmx_simd4_float_t d2s, d2t;                        \
                                                           \
        shi = si*NNBSBB_D*DIM;                             \
                                                           \
        xi_l = gmx_simd4_load_f(bb_i+shi+0*STRIDE_PBB);    \
        yi_l = gmx_simd4_load_f(bb_i+shi+1*STRIDE_PBB);    \
        zi_l = gmx_simd4_load_f(bb_i+shi+2*STRIDE_PBB);    \
        xi_h = gmx_simd4_load_f(bb_i+shi+3*STRIDE_PBB);    \
        yi_h = gmx_simd4_load_f(bb_i+shi+4*STRIDE_PBB);    \
        zi_h = gmx_simd4_load_f(bb_i+shi+5*STRIDE_PBB);    \
                                                           \
        dx_0 = gmx_simd4_sub_f(xi_l, xj_h);                \
        dy_0 = gmx_simd4_sub_f(yi_l, yj_h);                \
        dz_0 = gmx_simd4_sub_f(zi_l, zj_h);                \
                                                           \
        dx_1 = gmx_simd4_sub_f(xj_l, xi_h);                \
        dy_1 = gmx_simd4_sub_f(yj_l, yi_h);                \
        dz_1 = gmx_simd4_sub_f(zj_l, zi_h);                \
                                                           \
        mx   = gmx_simd4_max_f(dx_0, dx_1);                \
        my   = gmx_simd4_max_f(dy_0, dy_1);                \
        mz   = gmx_simd4_max_f(dz_0, dz_1);                \
                                                           \
        m0x  = gmx_simd4_max_f(mx, zero);                  \
        m0y  = gmx_simd4_max_f(my, zero);                  \
        m0z  = gmx_simd4_max_f(mz, zero);                  \
                                                           \
        d2x  = gmx_simd4_mul_f(m0x, m0x);                  \
        d2y  = gmx_simd4_mul_f(m0y, m0y);                  \
        d2z  = gmx_simd4_mul_f(m0z, m0z);                  \
                                                           \
        d2s  = gmx_simd4_add_f(d2x, d2y);                  \
        d2t  = gmx_simd4_add_f(d2s, d2z);                  \
                                                           \
        gmx_simd4_store_f(d2+si, d2t);                     \
    }
2196 gmx_simd4_store_f(d2+si, d2t); \
2199 /* 4-wide SIMD code for nsi bb distances for bb format xxxxyyyyzzzz */
2200 static void subc_bb_dist2_simd4_xxxx(const float *bb_j
,
2201 int nsi
, const float *bb_i
,
2204 gmx_simd4_float_t xj_l
, yj_l
, zj_l
;
2205 gmx_simd4_float_t xj_h
, yj_h
, zj_h
;
2206 gmx_simd4_float_t xi_l
, yi_l
, zi_l
;
2207 gmx_simd4_float_t xi_h
, yi_h
, zi_h
;
2209 gmx_simd4_float_t zero
;
2211 zero
= gmx_simd4_setzero_f();
2213 xj_l
= gmx_simd4_set1_f(bb_j
[0*STRIDE_PBB
]);
2214 yj_l
= gmx_simd4_set1_f(bb_j
[1*STRIDE_PBB
]);
2215 zj_l
= gmx_simd4_set1_f(bb_j
[2*STRIDE_PBB
]);
2216 xj_h
= gmx_simd4_set1_f(bb_j
[3*STRIDE_PBB
]);
2217 yj_h
= gmx_simd4_set1_f(bb_j
[4*STRIDE_PBB
]);
2218 zj_h
= gmx_simd4_set1_f(bb_j
[5*STRIDE_PBB
]);
2220 /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB.
2221 * But as we know the number of iterations is 1 or 2, we unroll manually.
2223 SUBC_BB_DIST2_SIMD4_XXXX_INNER(0, bb_i
, d2
);
2224 if (STRIDE_PBB
< nsi
)
2226 SUBC_BB_DIST2_SIMD4_XXXX_INNER(STRIDE_PBB
, bb_i
, d2
);
2230 #endif /* NBNXN_SEARCH_BB_SIMD4 */

/* Plain C function which determines if any atom pair between two cells
 * is within distance sqrt(rl2).
 */
static gmx_bool subc_in_range_x(int na_c,
                                int si, const real *x_i,
                                int csj, int stride, const real *x_j,
                                real rl2)
{
    int  i, j, i0, j0;
    real d2;

    for (i = 0; i < na_c; i++)
    {
        i0 = (si*na_c + i)*DIM;
        for (j = 0; j < na_c; j++)
        {
            j0 = (csj*na_c + j)*stride;

            d2 = sqr(x_i[i0  ] - x_j[j0  ]) +
                 sqr(x_i[i0+1] - x_j[j0+1]) +
                 sqr(x_i[i0+2] - x_j[j0+2]);

            if (d2 < rl2)
            {
                return TRUE;
            }
        }
    }

    return FALSE;
}

#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB

/* 4-wide SIMD function which determines if any atom pair between two cells,
 * both with 8 atoms, is within distance sqrt(rl2).
 * Using 8-wide AVX is not faster on Intel Sandy Bridge.
 */
static gmx_bool subc_in_range_simd4(int na_c,
                                    int si, const real *x_i,
                                    int csj, int stride, const real *x_j,
                                    real rl2)
{
    gmx_simd4_real_t ix_S0, iy_S0, iz_S0;
    gmx_simd4_real_t ix_S1, iy_S1, iz_S1;

    gmx_simd4_real_t rc2_S;

    int              dim_stride;
    int              j0, j1;

    rc2_S = gmx_simd4_set1_r(rl2);

    dim_stride = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB*DIM;
    ix_S0      = gmx_simd4_load_r(x_i+(si*dim_stride+0)*STRIDE_PBB);
    iy_S0      = gmx_simd4_load_r(x_i+(si*dim_stride+1)*STRIDE_PBB);
    iz_S0      = gmx_simd4_load_r(x_i+(si*dim_stride+2)*STRIDE_PBB);
    ix_S1      = gmx_simd4_load_r(x_i+(si*dim_stride+3)*STRIDE_PBB);
    iy_S1      = gmx_simd4_load_r(x_i+(si*dim_stride+4)*STRIDE_PBB);
    iz_S1      = gmx_simd4_load_r(x_i+(si*dim_stride+5)*STRIDE_PBB);

    /* We loop from the outer to the inner particles to maximize
     * the chance that we find a pair in range quickly and return.
     */
    j0 = csj*na_c;
    j1 = j0 + na_c - 1;
    while (j0 < j1)
    {
        gmx_simd4_real_t jx0_S, jy0_S, jz0_S;
        gmx_simd4_real_t jx1_S, jy1_S, jz1_S;

        gmx_simd4_real_t dx_S0, dy_S0, dz_S0;
        gmx_simd4_real_t dx_S1, dy_S1, dz_S1;
        gmx_simd4_real_t dx_S2, dy_S2, dz_S2;
        gmx_simd4_real_t dx_S3, dy_S3, dz_S3;

        gmx_simd4_real_t rsq_S0;
        gmx_simd4_real_t rsq_S1;
        gmx_simd4_real_t rsq_S2;
        gmx_simd4_real_t rsq_S3;

        gmx_simd4_bool_t wco_S0;
        gmx_simd4_bool_t wco_S1;
        gmx_simd4_bool_t wco_S2;
        gmx_simd4_bool_t wco_S3;
        gmx_simd4_bool_t wco_any_S01, wco_any_S23, wco_any_S;

        jx0_S = gmx_simd4_set1_r(x_j[j0*stride+0]);
        jy0_S = gmx_simd4_set1_r(x_j[j0*stride+1]);
        jz0_S = gmx_simd4_set1_r(x_j[j0*stride+2]);

        jx1_S = gmx_simd4_set1_r(x_j[j1*stride+0]);
        jy1_S = gmx_simd4_set1_r(x_j[j1*stride+1]);
        jz1_S = gmx_simd4_set1_r(x_j[j1*stride+2]);

        /* Calculate distance */
        dx_S0 = gmx_simd4_sub_r(ix_S0, jx0_S);
        dy_S0 = gmx_simd4_sub_r(iy_S0, jy0_S);
        dz_S0 = gmx_simd4_sub_r(iz_S0, jz0_S);
        dx_S1 = gmx_simd4_sub_r(ix_S1, jx0_S);
        dy_S1 = gmx_simd4_sub_r(iy_S1, jy0_S);
        dz_S1 = gmx_simd4_sub_r(iz_S1, jz0_S);
        dx_S2 = gmx_simd4_sub_r(ix_S0, jx1_S);
        dy_S2 = gmx_simd4_sub_r(iy_S0, jy1_S);
        dz_S2 = gmx_simd4_sub_r(iz_S0, jz1_S);
        dx_S3 = gmx_simd4_sub_r(ix_S1, jx1_S);
        dy_S3 = gmx_simd4_sub_r(iy_S1, jy1_S);
        dz_S3 = gmx_simd4_sub_r(iz_S1, jz1_S);

        /* rsq = dx*dx+dy*dy+dz*dz */
        rsq_S0 = gmx_simd4_calc_rsq_r(dx_S0, dy_S0, dz_S0);
        rsq_S1 = gmx_simd4_calc_rsq_r(dx_S1, dy_S1, dz_S1);
        rsq_S2 = gmx_simd4_calc_rsq_r(dx_S2, dy_S2, dz_S2);
        rsq_S3 = gmx_simd4_calc_rsq_r(dx_S3, dy_S3, dz_S3);

        wco_S0 = gmx_simd4_cmplt_r(rsq_S0, rc2_S);
        wco_S1 = gmx_simd4_cmplt_r(rsq_S1, rc2_S);
        wco_S2 = gmx_simd4_cmplt_r(rsq_S2, rc2_S);
        wco_S3 = gmx_simd4_cmplt_r(rsq_S3, rc2_S);

        wco_any_S01 = gmx_simd4_or_b(wco_S0, wco_S1);
        wco_any_S23 = gmx_simd4_or_b(wco_S2, wco_S3);
        wco_any_S   = gmx_simd4_or_b(wco_any_S01, wco_any_S23);

        if (gmx_simd4_anytrue_b(wco_any_S))
        {
            return TRUE;
        }

        j0++;
        j1--;
    }

    return FALSE;
}

#endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */

/* Returns the j sub-cell for index cj_ind */
static int nbl_cj(const nbnxn_pairlist_t *nbl, int cj_ind)
{
    return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].cj[cj_ind & (NBNXN_GPU_JGROUP_SIZE - 1)];
}
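
/* Illustrative note (assuming NBNXN_GPU_JGROUP_SIZE = 4, i.e. 2log = 2):
 * cj_ind = 13 resolves to cj4[13>>2].cj[13&3] = cj4[3].cj[1], the second
 * j-cluster of the fourth group.
 */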

/* Returns the i-interaction mask of the j sub-cell for index cj_ind */
static unsigned int nbl_imask0(const nbnxn_pairlist_t *nbl, int cj_ind)
{
    return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].imei[0].imask;
}

/* Ensures there is enough space for extra exclusion masks */
static void check_excl_space(nbnxn_pairlist_t *nbl, int extra)
{
    if (nbl->nexcl+extra > nbl->excl_nalloc)
    {
        nbl->excl_nalloc = over_alloc_small(nbl->nexcl+extra);
        nbnxn_realloc_void((void **)&nbl->excl,
                           nbl->nexcl*sizeof(*nbl->excl),
                           nbl->excl_nalloc*sizeof(*nbl->excl),
                           nbl->alloc, nbl->free);
    }
}

/* Ensures there is enough space for ncell extra j-cells in the list */
static void check_subcell_list_space_simple(nbnxn_pairlist_t *nbl,
                                            int               ncell)
{
    int cj_max;

    cj_max = nbl->ncj + ncell;

    if (cj_max > nbl->cj_nalloc)
    {
        nbl->cj_nalloc = over_alloc_small(cj_max);
        nbnxn_realloc_void((void **)&nbl->cj,
                           nbl->ncj*sizeof(*nbl->cj),
                           nbl->cj_nalloc*sizeof(*nbl->cj),
                           nbl->alloc, nbl->free);
    }
}

/* Ensures there is enough space for ncell extra j-subcells in the list */
static void check_subcell_list_space_supersub(nbnxn_pairlist_t *nbl,
                                              int               nsupercell)
{
    int ncj4_max, j4, j, w, t;

#define WARP_SIZE 32

    /* We can have maximally nsupercell*GPU_NSUBCELL sj lists */
    /* We can store 4 j-subcell - i-supercell pairs in one struct.
     * Since we round down, we need one extra entry.
     */
    ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
> nbl
->cj4_nalloc
)
2430 nbl
->cj4_nalloc
= over_alloc_small(ncj4_max
);
2431 nbnxn_realloc_void((void **)&nbl
->cj4
,
2432 nbl
->work
->cj4_init
*sizeof(*nbl
->cj4
),
2433 nbl
->cj4_nalloc
*sizeof(*nbl
->cj4
),
2434 nbl
->alloc
, nbl
->free
);
2437 if (ncj4_max
> nbl
->work
->cj4_init
)
2439 for (j4
= nbl
->work
->cj4_init
; j4
< ncj4_max
; j4
++)
2441 /* No i-subcells and no excl's in the list initially */
2442 for (w
= 0; w
< NWARP
; w
++)
2444 nbl
->cj4
[j4
].imei
[w
].imask
= 0U;
2445 nbl
->cj4
[j4
].imei
[w
].excl_ind
= 0;
2449 nbl
->work
->cj4_init
= ncj4_max
;
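/* Rounding example for ncj4_max above (illustration, assuming
 * NBNXN_GPU_JGROUP_SIZE == 4 and GPU_NSUBCELL == 8): with cj_ind = 6 and
 * nsupercell = 2 we get (6 + 16 + 3) >> 2 = 6 cj4 entries, i.e. the worst
 * case of 16 new sub-cell pairs is rounded up to whole groups of 4.
 */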
/* Set all excl masks for one GPU warp to no exclusions */
static void set_no_excls(nbnxn_excl_t *excl)
{
    int t;

    for (t = 0; t < WARP_SIZE; t++)
    {
        /* Turn all interaction bits on */
        excl->pair[t] = NBNXN_INTERACTION_MASK_ALL;
    }
}
/* Initializes a single nbnxn_pairlist_t data structure */
static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl,
                                gmx_bool          bSimple,
                                nbnxn_alloc_t    *alloc,
                                nbnxn_free_t     *free)
{
    if (alloc == NULL)
    {
        nbl->alloc = nbnxn_alloc_aligned;
    }
    else
    {
        nbl->alloc = alloc;
    }
    if (free == NULL)
    {
        nbl->free = nbnxn_free_aligned;
    }
    else
    {
        nbl->free = free;
    }

    nbl->bSimple = bSimple;

    /* We need one element extra in sj, so alloc initially with 1 */
    nbl->cj4_nalloc = 0;

    nbl->excl        = NULL;
    nbl->excl_nalloc = 0;
    nbl->nexcl       = 0;
    check_excl_space(nbl, 1);
    nbl->nexcl       = 1;
    set_no_excls(&nbl->excl[0]);

    snew(nbl->work, 1);
    if (nbl->bSimple)
    {
        snew_aligned(nbl->work->bb_ci, 1, NBNXN_SEARCH_BB_MEM_ALIGN);
    }
    else
    {
#ifdef NBNXN_BBXXXX
        snew_aligned(nbl->work->pbb_ci, GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX, NBNXN_SEARCH_BB_MEM_ALIGN);
#else
        snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL, NBNXN_SEARCH_BB_MEM_ALIGN);
#endif
    }
    snew_aligned(nbl->work->x_ci, NBNXN_NA_SC_MAX*DIM, NBNXN_SEARCH_BB_MEM_ALIGN);
#ifdef GMX_NBNXN_SIMD
    snew_aligned(nbl->work->x_ci_simd_4xn, 1, NBNXN_MEM_ALIGN);
    snew_aligned(nbl->work->x_ci_simd_2xnn, 1, NBNXN_MEM_ALIGN);
#endif
    snew_aligned(nbl->work->d2, GPU_NSUBCELL, NBNXN_SEARCH_BB_MEM_ALIGN);

    nbl->work->sort            = NULL;
    nbl->work->sort_nalloc     = 0;
    nbl->work->sci_sort        = NULL;
    nbl->work->sci_sort_nalloc = 0;
}
void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
                             gmx_bool bSimple, gmx_bool bCombined,
                             nbnxn_alloc_t *alloc,
                             nbnxn_free_t  *free)
{
    int i;

    nbl_list->bSimple   = bSimple;
    nbl_list->bCombined = bCombined;

    nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);

    if (!nbl_list->bCombined &&
        nbl_list->nnbl > NBNXN_BUFFERFLAG_MAX_THREADS)
    {
        gmx_fatal(FARGS, "%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
                  nbl_list->nnbl, NBNXN_BUFFERFLAG_MAX_THREADS, NBNXN_BUFFERFLAG_MAX_THREADS);
    }

    snew(nbl_list->nbl, nbl_list->nnbl);
    snew(nbl_list->nbl_fep, nbl_list->nnbl);
    /* Execute in order to avoid memory interleaving between threads */
#pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
    for (i = 0; i < nbl_list->nnbl; i++)
    {
        /* Allocate the nblist data structure locally on each thread
         * to optimize memory access for NUMA architectures.
         */
        snew(nbl_list->nbl[i], 1);

        /* Only list 0 is used on the GPU, use normal allocation for i>0 */
        if (i == 0)
        {
            nbnxn_init_pairlist(nbl_list->nbl[i], nbl_list->bSimple, alloc, free);
        }
        else
        {
            nbnxn_init_pairlist(nbl_list->nbl[i], nbl_list->bSimple, NULL, NULL);
        }

        snew(nbl_list->nbl_fep[i], 1);
        nbnxn_init_pairlist_fep(nbl_list->nbl_fep[i]);
    }
}
/* Print statistics of a pair list, used for debug output */
static void print_nblist_statistics_simple(FILE *fp, const nbnxn_pairlist_t *nbl,
                                           const nbnxn_search_t nbs, real rl)
{
    const nbnxn_grid_t *grid;
    int                 cs[SHIFTS];
    int                 npexcl;
    int                 s, i, j;

    /* This code only produces correct statistics with domain decomposition */
    grid = &nbs->grid[0];

    fprintf(fp, "nbl nci %d ncj %d\n",
            nbl->nci, nbl->ncj);
    fprintf(fp, "nbl na_sc %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
            nbl->na_sc, rl, nbl->ncj, nbl->ncj/(double)grid->nc,
            nbl->ncj/(double)grid->nc*grid->na_sc,
            nbl->ncj/(double)grid->nc*grid->na_sc/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nc*grid->na_sc/det(nbs->box)));

    fprintf(fp, "nbl average j cell list length %.1f\n",
            0.25*nbl->ncj/(double)nbl->nci);

    for (s = 0; s < SHIFTS; s++)
    {
        cs[s] = 0;
    }
    npexcl = 0;
    for (i = 0; i < nbl->nci; i++)
    {
        cs[nbl->ci[i].shift & NBNXN_CI_SHIFT] +=
            nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start;

        j = nbl->ci[i].cj_ind_start;
        while (j < nbl->ci[i].cj_ind_end &&
               nbl->cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
        {
            npexcl++;
            j++;
        }
    }
    fprintf(fp, "nbl cell pairs, total: %d excl: %d %.1f%%\n",
            nbl->ncj, npexcl, 100*npexcl/(double)nbl->ncj);
    for (s = 0; s < SHIFTS; s++)
    {
        if (cs[s] > 0)
        {
            fprintf(fp, "nbl shift %2d ncj %3d\n", s, cs[s]);
        }
    }
}
/* Print statistics of a pair list, used for debug output */
static void print_nblist_statistics_supersub(FILE *fp, const nbnxn_pairlist_t *nbl,
                                             const nbnxn_search_t nbs, real rl)
{
    const nbnxn_grid_t *grid;
    int                 i, j4, j, si, b;
    int                 c[GPU_NSUBCELL+1];

    /* This code only produces correct statistics with domain decomposition */
    grid = &nbs->grid[0];

    fprintf(fp, "nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
            nbl->nsci, nbl->ncj4, nbl->nci_tot, nbl->nexcl);
    fprintf(fp, "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
            nbl->na_ci, rl, nbl->nci_tot, nbl->nci_tot/(double)grid->nsubc_tot,
            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c,
            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nsubc_tot*grid->na_c/det(nbs->box)));

    fprintf(fp, "nbl average j super cell list length %.1f\n",
            0.25*nbl->ncj4/(double)nbl->nsci);
    fprintf(fp, "nbl average i sub cell list length %.1f\n",
            nbl->nci_tot/((double)nbl->ncj4));

    for (si = 0; si <= GPU_NSUBCELL; si++)
    {
        c[si] = 0;
    }
    for (i = 0; i < nbl->nsci; i++)
    {
        for (j4 = nbl->sci[i].cj4_ind_start; j4 < nbl->sci[i].cj4_ind_end; j4++)
        {
            for (j = 0; j < NBNXN_GPU_JGROUP_SIZE; j++)
            {
                b = 0;
                for (si = 0; si < GPU_NSUBCELL; si++)
                {
                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
                    {
                        b++;
                    }
                }
                c[b]++;
            }
        }
    }
    for (b = 0; b <= GPU_NSUBCELL; b++)
    {
        fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n",
                b, c[b], 100.0*c[b]/(double)(nbl->ncj4*NBNXN_GPU_JGROUP_SIZE));
    }
}
/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp */
static void low_get_nbl_exclusions(nbnxn_pairlist_t *nbl, int cj4,
                                   int warp, nbnxn_excl_t **excl)
{
    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
    {
        /* No exclusions set, make a new list entry */
        nbl->cj4[cj4].imei[warp].excl_ind = nbl->nexcl;
        nbl->nexcl++;
        *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
        set_no_excls(*excl);
    }
    else
    {
        /* We already have some exclusions, new ones can be added to the list */
        *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
    }
}
/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp,
 * generates a new element and allocates extra memory, if necessary.
 */
static void get_nbl_exclusions_1(nbnxn_pairlist_t *nbl, int cj4,
                                 int warp, nbnxn_excl_t **excl)
{
    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
    {
        /* We need to make a new list entry, check if we have space */
        check_excl_space(nbl, 1);
    }
    low_get_nbl_exclusions(nbl, cj4, warp, excl);
}
/* Returns pointers to the exclusion masks for cj4-unit cj4 for both warps,
 * generates new elements and allocates extra memory, if necessary.
 */
static void get_nbl_exclusions_2(nbnxn_pairlist_t *nbl, int cj4,
                                 nbnxn_excl_t **excl_w0,
                                 nbnxn_excl_t **excl_w1)
{
    /* Check for space we might need */
    check_excl_space(nbl, 2);

    low_get_nbl_exclusions(nbl, cj4, 0, excl_w0);
    low_get_nbl_exclusions(nbl, cj4, 1, excl_w1);
}
/* Sets the self exclusions i=j and pair exclusions i>j */
static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl,
                                               int cj4_ind, int sj_offset,
                                               int si)
{
    nbnxn_excl_t *excl[2];
    int           ei, ej, w;

    /* Here we only set the self and double pair exclusions */

    get_nbl_exclusions_2(nbl, cj4_ind, &excl[0], &excl[1]);

    /* Only minor < major bits set */
    for (ej = 0; ej < nbl->na_ci; ej++)
    {
        w = (ej >> 2);
        for (ei = ej; ei < nbl->na_ci; ei++)
        {
            excl[w]->pair[(ej & (NBNXN_GPU_JGROUP_SIZE-1))*nbl->na_ci + ei] &=
                ~(1U << (sj_offset*GPU_NSUBCELL + si));
        }
    }
}
/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
static unsigned int get_imask(gmx_bool rdiag, int ci, int cj)
{
    return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
}

/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
static unsigned int get_imask_simd_j2(gmx_bool rdiag, int ci, int cj)
{
    return (rdiag && ci*2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0 :
            (rdiag && ci*2+1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1 :
             NBNXN_INTERACTION_MASK_ALL));
}

/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
static unsigned int get_imask_simd_j4(gmx_bool rdiag, int ci, int cj)
{
    return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
}

/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
static unsigned int get_imask_simd_j8(gmx_bool rdiag, int ci, int cj)
{
    return (rdiag && ci == cj*2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0 :
            (rdiag && ci == cj*2+1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1 :
             NBNXN_INTERACTION_MASK_ALL));
}
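/* Rough illustration of the masks above: for a self pair (diagonal cluster
 * pair) the DIAG masks keep only the bits of atom pairs with j > i, so the
 * diagonal and sub-diagonal interactions are not double counted; for all
 * other cluster pairs MASK_ALL enables every atom pair. The _J2_*/_J8_*
 * variants cover the two possible alignments when the j-cluster size is
 * half or twice the i-cluster size.
 */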
#ifdef GMX_NBNXN_SIMD
#if GMX_SIMD_REAL_WIDTH == 2
#define get_imask_simd_4xn  get_imask_simd_j2
#endif
#if GMX_SIMD_REAL_WIDTH == 4
#define get_imask_simd_4xn  get_imask_simd_j4
#endif
#if GMX_SIMD_REAL_WIDTH == 8
#define get_imask_simd_4xn  get_imask_simd_j8
#define get_imask_simd_2xnn get_imask_simd_j4
#endif
#if GMX_SIMD_REAL_WIDTH == 16
#define get_imask_simd_2xnn get_imask_simd_j8
#endif
#endif
/* Plain C code for making a pair list of cell ci vs cells cjf-cjl.
 * Checks bounding box distances and possibly atom pair distances.
 */
static void make_cluster_list_simple(const nbnxn_grid_t *gridj,
                                     nbnxn_pairlist_t *nbl,
                                     int ci, int cjf, int cjl,
                                     gmx_bool remove_sub_diag,
                                     const real *x_j,
                                     real rl2, float rbb2,
                                     int *ndistc)
{
    const nbnxn_list_work_t *work;

    const nbnxn_bb_t        *bb_ci;
    const real              *x_ci;

    gmx_bool                 InRange;
    real                     d2;
    int                      cjf_gl, cjl_gl, cj;

    work = nbl->work;

    bb_ci = nbl->work->bb_ci;
    x_ci  = nbl->work->x_ci;

    InRange = FALSE;
    while (!InRange && cjf <= cjl)
    {
        d2       = subc_bb_dist2(0, bb_ci, cjf, gridj->bb);
        *ndistc += 2;

        /* Check if the distance is within the distance where
         * we use only the bounding box distance rbb,
         * or within the cut-off and there is at least one atom pair
         * within the cut-off.
         */
        if (d2 < rbb2)
        {
            InRange = TRUE;
        }
        else if (d2 < rl2)
        {
            int i, j;

            cjf_gl = gridj->cell0 + cjf;
            for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
            {
                for (j = 0; j < NBNXN_CPU_CLUSTER_I_SIZE; j++)
                {
                    InRange = InRange ||
                        (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
                         sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
                         sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
                }
            }
            *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
        }
        if (!InRange)
        {
            cjf++;
        }
    }
    if (!InRange)
    {
        return;
    }

    InRange = FALSE;
    while (!InRange && cjl > cjf)
    {
        d2       = subc_bb_dist2(0, bb_ci, cjl, gridj->bb);
        *ndistc += 2;

        /* Check if the distance is within the distance where
         * we use only the bounding box distance rbb,
         * or within the cut-off and there is at least one atom pair
         * within the cut-off.
         */
        if (d2 < rbb2)
        {
            InRange = TRUE;
        }
        else if (d2 < rl2)
        {
            int i, j;

            cjl_gl = gridj->cell0 + cjl;
            for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
            {
                for (j = 0; j < NBNXN_CPU_CLUSTER_I_SIZE; j++)
                {
                    InRange = InRange ||
                        (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
                         sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
                         sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
                }
            }
            *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
        }
        if (!InRange)
        {
            cjl--;
        }
    }

    if (cjf <= cjl)
    {
        for (cj = cjf; cj <= cjl; cj++)
        {
            /* Store cj and the interaction mask */
            nbl->cj[nbl->ncj].cj   = gridj->cell0 + cj;
            nbl->cj[nbl->ncj].excl = get_imask(remove_sub_diag, ci, cj);
            nbl->ncj++;
        }
        /* Increase the closing index in the i super-cell list */
        nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
    }
}
#ifdef GMX_NBNXN_SIMD_4XN
#include "nbnxn_search_simd_4xn.h"
#endif
#ifdef GMX_NBNXN_SIMD_2XNN
#include "nbnxn_search_simd_2xnn.h"
#endif
/* Plain C or SIMD4 code for making a pair list of super-cell sci vs scj.
 * Checks bounding box distances and possibly atom pair distances.
 */
static void make_cluster_list_supersub(const nbnxn_grid_t *gridi,
                                       const nbnxn_grid_t *gridj,
                                       nbnxn_pairlist_t *nbl,
                                       int sci, int scj,
                                       gmx_bool sci_equals_scj,
                                       int stride, const real *x,
                                       real rl2, float rbb2,
                                       int *ndistc)
{
    int               na_c;
    int               npair;
    int               cjo, ci1, ci, cj, cj_gl;
    int               cj4_ind, cj_offset;
    unsigned int      imask;
    nbnxn_cj4_t      *cj4;
#ifdef NBNXN_BBXXXX
    const float      *pbb_ci;
#else
    const nbnxn_bb_t *bb_ci;
#endif
    const real       *x_ci;
    float            *d2l, d2;
    int               w;
#define PRUNE_LIST_CPU_ONE
#ifdef PRUNE_LIST_CPU_ONE
    int               ci_last = -1;
#endif

    d2l = nbl->work->d2;

#ifdef NBNXN_BBXXXX
    pbb_ci = nbl->work->pbb_ci;
#else
    bb_ci  = nbl->work->bb_ci;
#endif
    x_ci = nbl->work->x_ci;

    na_c = gridj->na_c;

    for (cjo = 0; cjo < gridj->nsubc[scj]; cjo++)
    {
        cj4_ind   = (nbl->work->cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG);
        cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE;
        cj4       = &nbl->cj4[cj4_ind];

        cj = scj*GPU_NSUBCELL + cjo;

        cj_gl = gridj->cell0*GPU_NSUBCELL + cj;

        /* Initialize this j-subcell i-subcell list */
        cj4->cj[cj_offset] = cj_gl;

        if (sci_equals_scj)
        {
            ci1 = cjo + 1;
        }
        else
        {
            ci1 = gridi->nsubc[sci];
        }

#ifdef NBNXN_BBXXXX
        /* Determine all ci1 bb distances in one call with SIMD4 */
        subc_bb_dist2_simd4_xxxx(gridj->pbb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)),
                                 ci1, pbb_ci, d2l);
        *ndistc += na_c*2;
#endif

        npair = 0;
        imask = 0;
        /* We use a fixed upper-bound instead of ci1 to help optimization */
        for (ci = 0; ci < GPU_NSUBCELL; ci++)
        {
            if (ci == ci1)
            {
                break;
            }

#ifndef NBNXN_BBXXXX
            /* Determine the bb distance between ci and cj */
            d2l[ci]  = subc_bb_dist2(ci, bb_ci, cj, gridj->bb);
            *ndistc += 2;
#endif
            d2 = d2l[ci];

#ifdef PRUNE_LIST_CPU_ALL
            /* Check if the distance is within the distance where
             * we use only the bounding box distance rbb,
             * or within the cut-off and there is at least one atom pair
             * within the cut-off. This check is very costly.
             */
            *ndistc += na_c*na_c;
            if (d2 < rbb2 ||
                (d2 < rl2 &&
#ifdef NBNXN_PBB_SIMD4
                 subc_in_range_simd4
#else
                 subc_in_range_x
#endif
                     (na_c, ci, x_ci, cj_gl, stride, x, rl2)))
#else
            /* Check if the distance between the two bounding boxes
             * is within the pair-list cut-off.
             */
            if (d2 < rl2)
#endif
            {
                /* Flag this i-subcell to be taken into account */
                imask |= (1U << (cj_offset*GPU_NSUBCELL+ci));

#ifdef PRUNE_LIST_CPU_ONE
                ci_last = ci;
#endif

                npair++;
            }
        }

#ifdef PRUNE_LIST_CPU_ONE
        /* If we only found 1 pair, check if any atoms are actually
         * within the cut-off, so we could get rid of it.
         */
        if (npair == 1 && d2l[ci_last] >= rbb2)
        {
            /* Avoid using function pointers here, as it's slower */
            if (
#ifdef NBNXN_PBB_SIMD4
                !subc_in_range_simd4
#else
                !subc_in_range_x
#endif
                    (na_c, ci_last, x_ci, cj_gl, stride, x, rl2))
            {
                imask &= ~(1U << (cj_offset*GPU_NSUBCELL+ci_last));
                npair--;
            }
        }
#endif

        if (npair > 0)
        {
            /* We have a useful sj entry, close it now */

            /* Set the exclusions for the ci == sj entry.
             * Here we don't bother to check if this entry is actually flagged,
             * as it will nearly always be in the list.
             */
            if (sci_equals_scj)
            {
                set_self_and_newton_excls_supersub(nbl, cj4_ind, cj_offset, cjo);
            }

            /* Copy the cluster interaction mask to the list */
            for (w = 0; w < NWARP; w++)
            {
                cj4->imei[w].imask |= imask;
            }

            nbl->work->cj_ind++;

            /* Keep the count */
            nbl->nci_tot += npair;

            /* Increase the closing index in the i super-cell list */
            nbl->sci[nbl->nsci].cj4_ind_end =
                ((nbl->work->cj_ind+NBNXN_GPU_JGROUP_SIZE-1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
        }
    }
}
/* Set all atom-pair exclusions from the topology stored in excl
 * as masks in the pair-list for simple list i-entry nbl_ci
 */
static void set_ci_top_excls(const nbnxn_search_t nbs,
                             nbnxn_pairlist_t    *nbl,
                             gmx_bool             diagRemoved,
                             int                  na_ci_2log,
                             int                  na_cj_2log,
                             const nbnxn_ci_t    *nbl_ci,
                             const t_blocka      *excl)
{
    const int    *cell;
    int           ci;
    int           cj_ind_first, cj_ind_last;
    int           cj_first, cj_last;
    int           ndirect;
    int           i, ai, aj, si, eind, ge, se;
    int           found, cj_ind_0, cj_ind_1, cj_ind_m;
    int           cj_m;
    nbnxn_excl_t *nbl_excl;
    int           inner_i, inner_e;

    cell = nbs->cell;

    if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
    {
        /* Empty list */
        return;
    }

    ci = nbl_ci->ci;

    cj_ind_first = nbl_ci->cj_ind_start;
    cj_ind_last  = nbl->ncj - 1;

    cj_first = nbl->cj[cj_ind_first].cj;
    cj_last  = nbl->cj[cj_ind_last].cj;

    /* Determine how many contiguous j-cells we have starting
     * from the first i-cell. This number can be used to directly
     * calculate j-cell indices for excluded atoms.
     */
    ndirect = 0;
    if (na_ci_2log == na_cj_2log)
    {
        while (cj_ind_first + ndirect <= cj_ind_last &&
               nbl->cj[cj_ind_first+ndirect].cj == ci + ndirect)
        {
            ndirect++;
        }
    }
#ifdef NBNXN_SEARCH_BB_SIMD4
    else
    {
        while (cj_ind_first + ndirect <= cj_ind_last &&
               nbl->cj[cj_ind_first+ndirect].cj == ci_to_cj(na_cj_2log, ci) + ndirect)
        {
            ndirect++;
        }
    }
#endif

    /* Loop over the atoms in the i super-cell */
    for (i = 0; i < nbl->na_sc; i++)
    {
        ai = nbs->a[ci*nbl->na_sc+i];
        if (ai >= 0)
        {
            si = (i>>na_ci_2log);

            /* Loop over the topology-based exclusions for this i-atom */
            for (eind = excl->index[ai]; eind < excl->index[ai+1]; eind++)
            {
                aj = excl->a[eind];

                /* The self exclusions are already set, save some time */
                if (aj == ai)
                {
                    continue;
                }

                ge = cell[aj];

                /* Without shifts we only calculate interactions j>i
                 * for one-way pair-lists.
                 */
                if (diagRemoved && ge <= ci*nbl->na_sc + i)
                {
                    continue;
                }

                se = (ge >> na_cj_2log);

                /* Could the cluster se be in our list? */
                if (se >= cj_first && se <= cj_last)
                {
                    if (se < cj_first + ndirect)
                    {
                        /* We can calculate cj_ind directly from se */
                        found = cj_ind_first + se - cj_first;
                    }
                    else
                    {
                        /* Search for se using bisection */
                        found    = -1;
                        cj_ind_0 = cj_ind_first + ndirect;
                        cj_ind_1 = cj_ind_last + 1;
                        while (found == -1 && cj_ind_0 < cj_ind_1)
                        {
                            cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;

                            cj_m = nbl->cj[cj_ind_m].cj;

                            if (se == cj_m)
                            {
                                found = cj_ind_m;
                            }
                            else if (se < cj_m)
                            {
                                cj_ind_1 = cj_ind_m;
                            }
                            else
                            {
                                cj_ind_0 = cj_ind_m + 1;
                            }
                        }
                    }

                    if (found >= 0)
                    {
                        inner_i = i  - (si << na_ci_2log);
                        inner_e = ge - (se << na_cj_2log);

                        nbl->cj[found].excl &= ~(1U<<((inner_i<<na_cj_2log) + inner_e));
/* The next code line is usually not needed. We do not want to version
 * away the above line, because there is logic that relies on being
 * able to detect easily whether any exclusions exist. */
#if (defined GMX_SIMD_IBM_QPX)
                        nbl->cj[found].interaction_mask_indices[inner_i] &= ~(1U << inner_e);
#endif
                    }
                }
            }
        }
    }
}
/* Add a new i-entry to the FEP list and copy the i-properties */
static gmx_inline void fep_list_new_nri_copy(t_nblist *nlist)
{
    /* Add a new i-entry */
    nlist->nri++;

    assert(nlist->nri < nlist->maxnri);

    /* Duplicate the last i-entry, except for jindex, which continues */
    nlist->iinr[nlist->nri]   = nlist->iinr[nlist->nri-1];
    nlist->shift[nlist->nri]  = nlist->shift[nlist->nri-1];
    nlist->gid[nlist->nri]    = nlist->gid[nlist->nri-1];
    nlist->jindex[nlist->nri] = nlist->nrj;
}
/* For load balancing of the free-energy lists over threads, we set
 * the maximum nrj size of an i-entry to 40. This leads to good
 * load balancing in the worst case scenario of a single perturbed
 * particle on 16 threads, while not introducing significant overhead.
 * Note that half of the perturbed pairs will anyhow end up in very small lists,
 * since non-perturbed i-particles will see few perturbed j-particles.
 */
const int max_nrj_fep = 40;
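/* Illustration of the choice above: a single perturbed particle that
 * interacts with ~600 j-atoms is chopped into ceil(600/40) = 15 i-entries,
 * which balance_fep_lists() below can then spread over up to 15 threads,
 * instead of leaving one thread with all of the FEP work.
 */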
/* Exclude the perturbed pairs from the Verlet list. This is only done to avoid
 * singularities for overlapping particles (0/0), since the charges and
 * LJ parameters have been zeroed in the nbnxn data structure.
 * Simultaneously make a group pair list for the perturbed pairs.
 */
static void make_fep_list(const nbnxn_search_t    nbs,
                          const nbnxn_atomdata_t *nbat,
                          nbnxn_pairlist_t       *nbl,
                          gmx_bool                bDiagRemoved,
                          nbnxn_ci_t             *nbl_ci,
                          const nbnxn_grid_t     *gridi,
                          const nbnxn_grid_t     *gridj,
                          t_nblist               *nlist)
{
    int      ci, cj_ind_start, cj_ind_end, cj_ind, cja, cjr;
    int      nri_max;
    int      ngid, gid_i = 0, gid_j, gid;
    int      egp_shift, egp_mask;
    int      gid_cj = 0;
    int      i, j, ind_i, ind_j, ai, aj;
    int      nri;
    gmx_bool bFEP_i, bFEP_i_all;

    if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
    {
        /* Empty list */
        return;
    }

    ci = nbl_ci->ci;

    cj_ind_start = nbl_ci->cj_ind_start;
    cj_ind_end   = nbl_ci->cj_ind_end;

    /* In the worst case we have alternating energy groups
     * and create #atom-pair lists, which means we need the size
     * of a cluster pair (na_ci*na_cj) times the number of cj's.
     */
    nri_max = nbl->na_ci*nbl->na_cj*(cj_ind_end - cj_ind_start);
    if (nlist->nri + nri_max > nlist->maxnri)
    {
        nlist->maxnri = over_alloc_large(nlist->nri + nri_max);
        reallocate_nblist(nlist);
    }

    ngid = nbat->nenergrp;

    if (ngid*gridj->na_cj > sizeof(gid_cj)*8)
    {
        gmx_fatal(FARGS, "The Verlet scheme with %dx%d kernels and free-energy only supports up to %d energy groups",
                  gridi->na_c, gridj->na_cj, (sizeof(gid_cj)*8)/gridj->na_cj);
    }

    egp_shift = nbat->neg_2log;
    egp_mask  = (1<<nbat->neg_2log) - 1;

    /* Loop over the atoms in the i sub-cell */
    bFEP_i_all = TRUE;
    for (i = 0; i < nbl->na_ci; i++)
    {
        ind_i = ci*nbl->na_ci + i;
        ai    = nbs->a[ind_i];
        if (ai >= 0)
        {
            nri                  = nlist->nri;
            nlist->jindex[nri+1] = nlist->jindex[nri];
            nlist->iinr[nri]     = ai;
            /* The actual energy group pair index is set later */
            nlist->gid[nri]      = 0;
            nlist->shift[nri]    = nbl_ci->shift & NBNXN_CI_SHIFT;

            bFEP_i = gridi->fep[ci - gridi->cell0] & (1 << i);

            bFEP_i_all = bFEP_i_all && bFEP_i;

            if ((nlist->nrj + cj_ind_end - cj_ind_start)*nbl->na_cj > nlist->maxnrj)
            {
                nlist->maxnrj = over_alloc_small((nlist->nrj + cj_ind_end - cj_ind_start)*nbl->na_cj);
                srenew(nlist->jjnr,     nlist->maxnrj);
                srenew(nlist->excl_fep, nlist->maxnrj);
            }

            if (ngid > 1)
            {
                gid_i = (nbat->energrp[ci] >> (egp_shift*i)) & egp_mask;
            }

            for (cj_ind = cj_ind_start; cj_ind < cj_ind_end; cj_ind++)
            {
                unsigned int fep_cj;

                cja = nbl->cj[cj_ind].cj;

                if (gridj->na_cj == gridj->na_c)
                {
                    cjr    = cja - gridj->cell0;
                    fep_cj = gridj->fep[cjr];
                    if (ngid > 1)
                    {
                        gid_cj = nbat->energrp[cja];
                    }
                }
                else if (2*gridj->na_cj == gridj->na_c)
                {
                    cjr    = cja - gridj->cell0*2;
                    /* Extract half of the ci fep/energrp mask */
                    fep_cj = (gridj->fep[cjr>>1] >> ((cjr&1)*gridj->na_cj)) & ((1<<gridj->na_cj) - 1);
                    if (ngid > 1)
                    {
                        gid_cj = nbat->energrp[cja>>1] >> ((cja&1)*gridj->na_cj*egp_shift) & ((1<<(gridj->na_cj*egp_shift)) - 1);
                    }
                }
                else
                {
                    cjr    = cja - (gridj->cell0>>1);
                    /* Combine two ci fep masks/energrp */
                    fep_cj = gridj->fep[cjr*2] + (gridj->fep[cjr*2+1] << gridj->na_c);
                    if (ngid > 1)
                    {
                        gid_cj = nbat->energrp[cja*2] + (nbat->energrp[cja*2+1] << (gridj->na_c*egp_shift));
                    }
                }

                if (bFEP_i || fep_cj != 0)
                {
                    for (j = 0; j < nbl->na_cj; j++)
                    {
                        /* Is this interaction perturbed and not excluded? */
                        ind_j = cja*nbl->na_cj + j;
                        aj    = nbs->a[ind_j];
                        if (aj >= 0 &&
                            (bFEP_i || (fep_cj & (1 << j))) &&
                            (!bDiagRemoved || ind_j >= ind_i))
                        {
                            if (ngid > 1)
                            {
                                gid_j = (gid_cj >> (j*egp_shift)) & egp_mask;
                                gid   = GID(gid_i, gid_j, ngid);

                                if (nlist->nrj > nlist->jindex[nri] &&
                                    nlist->gid[nri] != gid)
                                {
                                    /* Energy group pair changed: new list */
                                    fep_list_new_nri_copy(nlist);
                                    nri = nlist->nri;
                                }
                                nlist->gid[nri] = gid;
                            }

                            if (nlist->nrj - nlist->jindex[nri] >= max_nrj_fep)
                            {
                                fep_list_new_nri_copy(nlist);
                                nri = nlist->nri;
                            }

                            /* Add it to the FEP list */
                            nlist->jjnr[nlist->nrj]     = aj;
                            nlist->excl_fep[nlist->nrj] = (nbl->cj[cj_ind].excl >> (i*nbl->na_cj + j)) & 1;
                            nlist->nrj++;

                            /* Exclude it from the normal list.
                             * Note that the charge has been set to zero,
                             * but we need to avoid 0/0, as perturbed atoms
                             * can be on top of each other.
                             */
                            nbl->cj[cj_ind].excl &= ~(1U << (i*nbl->na_cj + j));
                        }
                    }
                }
            }

            if (nlist->nrj > nlist->jindex[nri])
            {
                /* Actually add this new, non-empty, list */
                nlist->nri++;
                nlist->jindex[nlist->nri] = nlist->nrj;
            }
        }
    }

    if (bFEP_i_all)
    {
        /* All interactions are perturbed, we can skip this entry */
        nbl_ci->cj_ind_end = cj_ind_start;
    }
}
/* Return the index of a j-cluster within its cj4 group */
static gmx_inline int cj_mod_cj4(int cj)
{
    return cj & (NBNXN_GPU_JGROUP_SIZE - 1);
}

/* Convert a j-cluster index to a cj4 group index */
static gmx_inline int cj_to_cj4(int cj)
{
    return cj >> NBNXN_GPU_JGROUP_SIZE_2LOG;
}

/* Return the index of a j-atom within a warp */
static gmx_inline int a_mod_wj(int a)
{
    return a & (NBNXN_GPU_CLUSTER_SIZE/2 - 1);
}
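/* Example for the helpers above (illustration only, assuming the usual
 * NBNXN_GPU_JGROUP_SIZE == 4 and NBNXN_GPU_CLUSTER_SIZE == 8): j-cluster 11
 * is slot cj_mod_cj4(11) = 3 of group cj_to_cj4(11) = 2, and j-atom 5 maps
 * to a_mod_wj(5) = 1, i.e. the second atom within its warp half.
 */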
/* As make_fep_list above, but for super/sub lists. */
static void make_fep_list_supersub(const nbnxn_search_t    nbs,
                                   const nbnxn_atomdata_t *nbat,
                                   nbnxn_pairlist_t       *nbl,
                                   gmx_bool                bDiagRemoved,
                                   const nbnxn_sci_t      *nbl_sci,
                                   real                    shx,
                                   real                    shy,
                                   real                    shz,
                                   real                    rlist_fep2,
                                   const nbnxn_grid_t     *gridi,
                                   const nbnxn_grid_t     *gridj,
                                   t_nblist               *nlist)
{
    int                sci, cj4_ind_start, cj4_ind_end, cj4_ind, gcj, cjr;
    int                nri_max;
    int                c, c_abs;
    int                i, j, ind_i, ind_j, ai, aj;
    int                nri;
    gmx_bool           bFEP_i;
    real               xi, yi, zi;
    const nbnxn_cj4_t *cj4;

    if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start)
    {
        /* Empty list */
        return;
    }

    sci = nbl_sci->sci;

    cj4_ind_start = nbl_sci->cj4_ind_start;
    cj4_ind_end   = nbl_sci->cj4_ind_end;

    /* Here we process one super-cell, max #atoms na_sc, versus a list of
     * cj4 entries, each with max NBNXN_GPU_JGROUP_SIZE cj's, each
     * of size na_cj atoms.
     * On the GPU we don't support energy groups (yet).
     * So for each of the na_sc i-atoms, we need max one FEP list
     * for each max_nrj_fep j-atoms.
     */
    nri_max = nbl->na_sc*nbl->na_cj*(1 + ((cj4_ind_end - cj4_ind_start)*NBNXN_GPU_JGROUP_SIZE)/max_nrj_fep);
    if (nlist->nri + nri_max > nlist->maxnri)
    {
        nlist->maxnri = over_alloc_large(nlist->nri + nri_max);
        reallocate_nblist(nlist);
    }

    /* Loop over the atoms in the i super-cluster */
    for (c = 0; c < GPU_NSUBCELL; c++)
    {
        c_abs = sci*GPU_NSUBCELL + c;

        for (i = 0; i < nbl->na_ci; i++)
        {
            ind_i = c_abs*nbl->na_ci + i;
            ai    = nbs->a[ind_i];
            if (ai >= 0)
            {
                nri                  = nlist->nri;
                nlist->jindex[nri+1] = nlist->jindex[nri];
                nlist->iinr[nri]     = ai;
                /* With GPUs, energy groups are not supported */
                nlist->gid[nri]      = 0;
                nlist->shift[nri]    = nbl_sci->shift & NBNXN_CI_SHIFT;

                bFEP_i = (gridi->fep[c_abs - gridi->cell0*GPU_NSUBCELL] & (1 << i));

                xi = nbat->x[ind_i*nbat->xstride+XX] + shx;
                yi = nbat->x[ind_i*nbat->xstride+YY] + shy;
                zi = nbat->x[ind_i*nbat->xstride+ZZ] + shz;

                if ((nlist->nrj + cj4_ind_end - cj4_ind_start)*NBNXN_GPU_JGROUP_SIZE*nbl->na_cj > nlist->maxnrj)
                {
                    nlist->maxnrj = over_alloc_small((nlist->nrj + cj4_ind_end - cj4_ind_start)*NBNXN_GPU_JGROUP_SIZE*nbl->na_cj);
                    srenew(nlist->jjnr,     nlist->maxnrj);
                    srenew(nlist->excl_fep, nlist->maxnrj);
                }

                for (cj4_ind = cj4_ind_start; cj4_ind < cj4_ind_end; cj4_ind++)
                {
                    cj4 = &nbl->cj4[cj4_ind];

                    for (gcj = 0; gcj < NBNXN_GPU_JGROUP_SIZE; gcj++)
                    {
                        unsigned int fep_cj;

                        if ((cj4->imei[0].imask & (1U << (gcj*GPU_NSUBCELL + c))) == 0)
                        {
                            /* Skip this ci for this cj */
                            continue;
                        }

                        cjr = cj4->cj[gcj] - gridj->cell0*GPU_NSUBCELL;

                        fep_cj = gridj->fep[cjr];

                        if (bFEP_i || fep_cj != 0)
                        {
                            for (j = 0; j < nbl->na_cj; j++)
                            {
                                /* Is this interaction perturbed and not excluded? */
                                ind_j = (gridj->cell0*GPU_NSUBCELL + cjr)*nbl->na_cj + j;
                                aj    = nbs->a[ind_j];
                                if (aj >= 0 &&
                                    (bFEP_i || (fep_cj & (1 << j))) &&
                                    (!bDiagRemoved || ind_j >= ind_i))
                                {
                                    nbnxn_excl_t *excl;
                                    int           excl_pair;
                                    unsigned int  excl_bit;
                                    real          dx, dy, dz;

                                    get_nbl_exclusions_1(nbl, cj4_ind, j>>2, &excl);

                                    excl_pair = a_mod_wj(j)*nbl->na_ci + i;
                                    excl_bit  = (1U << (gcj*GPU_NSUBCELL + c));

                                    dx = nbat->x[ind_j*nbat->xstride+XX] - xi;
                                    dy = nbat->x[ind_j*nbat->xstride+YY] - yi;
                                    dz = nbat->x[ind_j*nbat->xstride+ZZ] - zi;

                                    /* The unpruned GPU list has more than 2/3
                                     * of the atom pairs beyond rlist. Using
                                     * this list will cause a lot of overhead
                                     * in the CPU FEP kernels, especially
                                     * relative to the fast GPU kernels.
                                     * So we prune the FEP list here.
                                     */
                                    if (dx*dx + dy*dy + dz*dz < rlist_fep2)
                                    {
                                        if (nlist->nrj - nlist->jindex[nri] >= max_nrj_fep)
                                        {
                                            fep_list_new_nri_copy(nlist);
                                            nri = nlist->nri;
                                        }

                                        /* Add it to the FEP list */
                                        nlist->jjnr[nlist->nrj]     = aj;
                                        nlist->excl_fep[nlist->nrj] = (excl->pair[excl_pair] & excl_bit) ? 1 : 0;
                                        nlist->nrj++;
                                    }

                                    /* Exclude it from the normal list.
                                     * Note that the charge and LJ parameters have
                                     * been set to zero, but we need to avoid 0/0,
                                     * as perturbed atoms can be on top of each other.
                                     */
                                    excl->pair[excl_pair] &= ~excl_bit;
                                }
                            }

                            /* Note that we could mask out this pair in imask
                             * if all i- and/or all j-particles are perturbed.
                             * But since the perturbed pairs on the CPU will
                             * take an order of magnitude more time, the GPU
                             * will finish before the CPU and there is no gain.
                             */
                        }
                    }
                }

                if (nlist->nrj > nlist->jindex[nri])
                {
                    /* Actually add this new, non-empty, list */
                    nlist->nri++;
                    nlist->jindex[nlist->nri] = nlist->nrj;
                }
            }
        }
    }
}
/* Set all atom-pair exclusions from the topology stored in excl
 * as masks in the pair-list for i-super-cell entry nbl_sci
 */
static void set_sci_top_excls(const nbnxn_search_t nbs,
                              nbnxn_pairlist_t    *nbl,
                              gmx_bool             diagRemoved,
                              int                  na_c_2log,
                              const nbnxn_sci_t   *nbl_sci,
                              const t_blocka      *excl)
{
    const int    *cell;
    int           na_c;
    int           sci;
    int           cj_ind_first, cj_ind_last;
    int           cj_first, cj_last;
    int           ndirect;
    int           i, ai, aj, si, eind, ge, se;
    int           found, cj_ind_0, cj_ind_1, cj_ind_m;
    int           cj_m;
    nbnxn_excl_t *nbl_excl;
    int           inner_i, inner_e, w;

    cell = nbs->cell;

    na_c = nbl->na_ci;

    if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start)
    {
        /* Empty list */
        return;
    }

    sci = nbl_sci->sci;

    cj_ind_first = nbl_sci->cj4_ind_start*NBNXN_GPU_JGROUP_SIZE;
    cj_ind_last  = nbl->work->cj_ind - 1;

    cj_first = nbl->cj4[nbl_sci->cj4_ind_start].cj[0];
    cj_last  = nbl_cj(nbl, cj_ind_last);

    /* Determine how many contiguous j-clusters we have starting
     * from the first i-cluster. This number can be used to directly
     * calculate j-cluster indices for excluded atoms.
     */
    ndirect = 0;
    while (cj_ind_first + ndirect <= cj_ind_last &&
           nbl_cj(nbl, cj_ind_first+ndirect) == sci*GPU_NSUBCELL + ndirect)
    {
        ndirect++;
    }

    /* Loop over the atoms in the i super-cell */
    for (i = 0; i < nbl->na_sc; i++)
    {
        ai = nbs->a[sci*nbl->na_sc+i];
        if (ai >= 0)
        {
            si = (i>>na_c_2log);

            /* Loop over the topology-based exclusions for this i-atom */
            for (eind = excl->index[ai]; eind < excl->index[ai+1]; eind++)
            {
                aj = excl->a[eind];

                /* The self exclusions are already set, save some time */
                if (aj == ai)
                {
                    continue;
                }

                ge = cell[aj];

                /* Without shifts we only calculate interactions j>i
                 * for one-way pair-lists.
                 */
                if (diagRemoved && ge <= sci*nbl->na_sc + i)
                {
                    continue;
                }

                se = ge>>na_c_2log;

                /* Could the cluster se be in our list? */
                if (se >= cj_first && se <= cj_last)
                {
                    if (se < cj_first + ndirect)
                    {
                        /* We can calculate cj_ind directly from se */
                        found = cj_ind_first + se - cj_first;
                    }
                    else
                    {
                        /* Search for se using bisection */
                        found    = -1;
                        cj_ind_0 = cj_ind_first + ndirect;
                        cj_ind_1 = cj_ind_last + 1;
                        while (found == -1 && cj_ind_0 < cj_ind_1)
                        {
                            cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;

                            cj_m = nbl_cj(nbl, cj_ind_m);

                            if (se == cj_m)
                            {
                                found = cj_ind_m;
                            }
                            else if (se < cj_m)
                            {
                                cj_ind_1 = cj_ind_m;
                            }
                            else
                            {
                                cj_ind_0 = cj_ind_m + 1;
                            }
                        }
                    }

                    if (found >= 0)
                    {
                        inner_i = i  - si*na_c;
                        inner_e = ge - se*na_c;

                        if (nbl_imask0(nbl, found) & (1U << (cj_mod_cj4(found)*GPU_NSUBCELL + si)))
                        {
                            w = (inner_e >> 2);

                            get_nbl_exclusions_1(nbl, cj_to_cj4(found), w, &nbl_excl);

                            nbl_excl->pair[a_mod_wj(inner_e)*nbl->na_ci+inner_i] &=
                                ~(1U << (cj_mod_cj4(found)*GPU_NSUBCELL + si));
                        }
                    }
                }
            }
        }
    }
}
/* Reallocate the simple ci list for at least n entries */
static void nb_realloc_ci(nbnxn_pairlist_t *nbl, int n)
{
    nbl->ci_nalloc = over_alloc_small(n);
    nbnxn_realloc_void((void **)&nbl->ci,
                       nbl->nci*sizeof(*nbl->ci),
                       nbl->ci_nalloc*sizeof(*nbl->ci),
                       nbl->alloc, nbl->free);
}
/* Reallocate the super-cell sci list for at least n entries */
static void nb_realloc_sci(nbnxn_pairlist_t *nbl, int n)
{
    nbl->sci_nalloc = over_alloc_small(n);
    nbnxn_realloc_void((void **)&nbl->sci,
                       nbl->nsci*sizeof(*nbl->sci),
                       nbl->sci_nalloc*sizeof(*nbl->sci),
                       nbl->alloc, nbl->free);
}
/* Make a new ci entry at index nbl->nci */
static void new_ci_entry(nbnxn_pairlist_t *nbl, int ci, int shift, int flags)
{
    if (nbl->nci + 1 > nbl->ci_nalloc)
    {
        nb_realloc_ci(nbl, nbl->nci+1);
    }
    nbl->ci[nbl->nci].ci           = ci;
    nbl->ci[nbl->nci].shift        = shift;
    /* Store the interaction flags along with the shift */
    nbl->ci[nbl->nci].shift       |= flags;
    nbl->ci[nbl->nci].cj_ind_start = nbl->ncj;
    nbl->ci[nbl->nci].cj_ind_end   = nbl->ncj;
}
/* Make a new sci entry at index nbl->nsci */
static void new_sci_entry(nbnxn_pairlist_t *nbl, int sci, int shift)
{
    if (nbl->nsci + 1 > nbl->sci_nalloc)
    {
        nb_realloc_sci(nbl, nbl->nsci+1);
    }
    nbl->sci[nbl->nsci].sci           = sci;
    nbl->sci[nbl->nsci].shift         = shift;
    nbl->sci[nbl->nsci].cj4_ind_start = nbl->ncj4;
    nbl->sci[nbl->nsci].cj4_ind_end   = nbl->ncj4;
}
/* Sort the simple j-list cj on exclusions.
 * Entries with exclusions will all be sorted to the beginning of the list.
 */
static void sort_cj_excl(nbnxn_cj_t *cj, int ncj,
                         nbnxn_list_work_t *work)
{
    int jnew, j;

    if (ncj > work->cj_nalloc)
    {
        work->cj_nalloc = over_alloc_large(ncj);
        srenew(work->cj, work->cj_nalloc);
    }

    /* Make a list of the j-cells involving exclusions */
    jnew = 0;
    for (j = 0; j < ncj; j++)
    {
        if (cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
        {
            work->cj[jnew++] = cj[j];
        }
    }
    /* Check if there are exclusions at all or not just the first entry */
    if (!((jnew == 0) ||
          (jnew == 1 && cj[0].excl != NBNXN_INTERACTION_MASK_ALL)))
    {
        for (j = 0; j < ncj; j++)
        {
            if (cj[j].excl == NBNXN_INTERACTION_MASK_ALL)
            {
                work->cj[jnew++] = cj[j];
            }
        }
        for (j = 0; j < ncj; j++)
        {
            cj[j] = work->cj[j];
        }
    }
}
/* Close this simple list i entry */
static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
{
    int jlen;

    /* All content of the new ci entry has already been filled correctly,
     * we only need to increase the count here (for non-empty lists).
     */
    jlen = nbl->ci[nbl->nci].cj_ind_end - nbl->ci[nbl->nci].cj_ind_start;
    if (jlen > 0)
    {
        sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start, jlen, nbl->work);

        /* The counts below are used for non-bonded pair/flop counts
         * and should therefore match the available kernel setups.
         */
        if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
        {
            nbl->work->ncj_noq += jlen;
        }
        else if ((nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0)) ||
                 !(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_LJ(0)))
        {
            nbl->work->ncj_hlj += jlen;
        }

        nbl->nci++;
    }
}
/* Split sci entry for load balancing on the GPU.
 * Splitting ensures we have enough lists to fully utilize the whole GPU.
 * With progBal we generate progressively smaller lists, which improves
 * load balancing. As we only know the current count on our own thread,
 * we will need to estimate the current total amount of i-entries.
 * As the lists get concatenated later, this estimate depends
 * both on nthread and our own thread index.
 */
static void split_sci_entry(nbnxn_pairlist_t *nbl,
                            int nsp_max_av, gmx_bool progBal, int nc_bal,
                            int thread, int nthread)
{
    int nsci_est;
    int nsp_max;
    int cj4_start, cj4_end, j4len, cj4;
    int sci;
    int nsp, nsp_sci, nsp_cj4, nsp_cj4_e, nsp_cj4_p;
    int p;

    if (progBal)
    {
        /* Estimate the total numbers of ci's of the nblist combined
         * over all threads using the target number of ci's.
         */
        nsci_est = nc_bal*thread/nthread + nbl->nsci;

        /* The first ci blocks should be larger, to avoid overhead.
         * The last ci blocks should be smaller, to improve load balancing.
         */
        nsp_max = max(1,
                      nsp_max_av*nc_bal*3/(2*(nsci_est - 1 + nc_bal)));
    }
    else
    {
        nsp_max = nsp_max_av;
    }

    cj4_start = nbl->sci[nbl->nsci-1].cj4_ind_start;
    cj4_end   = nbl->sci[nbl->nsci-1].cj4_ind_end;
    j4len     = cj4_end - cj4_start;

    if (j4len > 1 && j4len*GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE > nsp_max)
    {
        /* Remove the last ci entry and process the cj4's again */
        nbl->nsci -= 1;

        sci       = nbl->nsci;
        nsp       = 0;
        nsp_sci   = 0;
        nsp_cj4_e = 0;
        nsp_cj4   = 0;
        for (cj4 = cj4_start; cj4 < cj4_end; cj4++)
        {
            nsp_cj4_p = nsp_cj4;
            /* Count the number of cluster pairs in this cj4 group */
            nsp_cj4   = 0;
            for (p = 0; p < GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE; p++)
            {
                nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
            }

            if (nsp_cj4 > 0 && nsp + nsp_cj4 > nsp_max)
            {
                /* Split the list at cj4 */
                nbl->sci[sci].cj4_ind_end = cj4;
                /* Create a new sci entry */
                sci++;
                nbl->nsci++;
                if (nbl->nsci+1 > nbl->sci_nalloc)
                {
                    nb_realloc_sci(nbl, nbl->nsci+1);
                }
                nbl->sci[sci].sci           = nbl->sci[nbl->nsci-1].sci;
                nbl->sci[sci].shift         = nbl->sci[nbl->nsci-1].shift;
                nbl->sci[sci].cj4_ind_start = cj4;
                nsp_sci                     = nsp;
                nsp_cj4_e                   = nsp_cj4_p;
                nsp                         = 0;
            }
            nsp += nsp_cj4;
        }

        /* Put the remaining cj4's in the last sci entry */
        nbl->sci[sci].cj4_ind_end = cj4_end;

        /* Possibly balance out the last two sci's
         * by moving the last cj4 of the second last sci.
         */
        if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
        {
            nbl->sci[sci-1].cj4_ind_end--;
            nbl->sci[sci].cj4_ind_start--;
        }

        nbl->nsci++;
    }
}
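/* Illustration of the progressive limit above: with progBal the split size
 * nsp_max_av*nc_bal*3/(2*(nsci_est - 1 + nc_bal)) starts near
 * 1.5*nsp_max_av for the first entries (nsci_est ~ 0) and decays to about
 * 0.75*nsp_max_av once nsci_est reaches nc_bal, so the early lists are long
 * (low overhead) and the late lists short (better load balance).
 */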
/* Close this super/sub list i entry */
static void close_ci_entry_supersub(nbnxn_pairlist_t *nbl,
                                    int nsp_max_av,
                                    gmx_bool progBal, int nc_bal,
                                    int thread, int nthread)
{
    int j4len;

    /* All content of the new ci entry has already been filled correctly,
     * we only need to increase the count here (for non-empty lists).
     */
    j4len = nbl->sci[nbl->nsci].cj4_ind_end - nbl->sci[nbl->nsci].cj4_ind_start;
    if (j4len > 0)
    {
        /* We can only have complete blocks of 4 j-entries in a list,
         * so round the count up before closing.
         */
        nbl->ncj4         = ((nbl->work->cj_ind + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
        nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;

        nbl->nsci++;

        if (nsp_max_av > 0)
        {
            /* Measure the size of the new entry and potentially split it */
            split_sci_entry(nbl, nsp_max_av, progBal, nc_bal, thread, nthread);
        }
    }
}
/* Syncs the working array before adding another grid pair to the list */
static void sync_work(nbnxn_pairlist_t *nbl)
{
    if (!nbl->bSimple)
    {
        nbl->work->cj_ind   = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
        nbl->work->cj4_init = nbl->ncj4;
    }
}
/* Clears an nbnxn_pairlist_t data structure */
static void clear_pairlist(nbnxn_pairlist_t *nbl)
{
    nbl->nci     = 0;
    nbl->nsci    = 0;
    nbl->ncj     = 0;
    nbl->ncj4    = 0;
    nbl->nci_tot = 0;
    nbl->nexcl   = 1;

    nbl->work->ncj_noq = 0;
    nbl->work->ncj_hlj = 0;
}
/* Clears a group scheme pair list */
static void clear_pairlist_fep(t_nblist *nl)
{
    nl->nri = 0;
    nl->nrj = 0;
    if (nl->jindex == NULL)
    {
        snew(nl->jindex, 1);
    }
    nl->jindex[0] = 0;
}
/* Sets a simple list i-cell bounding box, including the PBC shift */
static gmx_inline void set_icell_bb_simple(const nbnxn_bb_t *bb, int ci,
                                           real shx, real shy, real shz,
                                           nbnxn_bb_t *bb_ci)
{
    bb_ci->lower[BB_X] = bb[ci].lower[BB_X] + shx;
    bb_ci->lower[BB_Y] = bb[ci].lower[BB_Y] + shy;
    bb_ci->lower[BB_Z] = bb[ci].lower[BB_Z] + shz;
    bb_ci->upper[BB_X] = bb[ci].upper[BB_X] + shx;
    bb_ci->upper[BB_Y] = bb[ci].upper[BB_Y] + shy;
    bb_ci->upper[BB_Z] = bb[ci].upper[BB_Z] + shz;
}
#ifdef NBNXN_BBXXXX
/* Sets the super-cell and sub-cell bounding boxes, including the PBC shift */
static void set_icell_bbxxxx_supersub(const float *bb, int ci,
                                      real shx, real shy, real shz,
                                      float *bb_ci)
{
    int ia, m, i;

    ia = ci*(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX;
    for (m = 0; m < (GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX; m += NNBSBB_XXXX)
    {
        for (i = 0; i < STRIDE_PBB; i++)
        {
            bb_ci[m+0*STRIDE_PBB+i] = bb[ia+m+0*STRIDE_PBB+i] + shx;
            bb_ci[m+1*STRIDE_PBB+i] = bb[ia+m+1*STRIDE_PBB+i] + shy;
            bb_ci[m+2*STRIDE_PBB+i] = bb[ia+m+2*STRIDE_PBB+i] + shz;
            bb_ci[m+3*STRIDE_PBB+i] = bb[ia+m+3*STRIDE_PBB+i] + shx;
            bb_ci[m+4*STRIDE_PBB+i] = bb[ia+m+4*STRIDE_PBB+i] + shy;
            bb_ci[m+5*STRIDE_PBB+i] = bb[ia+m+5*STRIDE_PBB+i] + shz;
        }
    }
}
#endif
/* Sets the super-cell and sub-cell bounding boxes, including the PBC shift */
static void set_icell_bb_supersub(const nbnxn_bb_t *bb, int ci,
                                  real shx, real shy, real shz,
                                  nbnxn_bb_t *bb_ci)
{
    int i;

    for (i = 0; i < GPU_NSUBCELL; i++)
    {
        set_icell_bb_simple(bb, ci*GPU_NSUBCELL+i,
                            shx, shy, shz,
                            &bb_ci[i]);
    }
}
/* Copies PBC-shifted i-cell atom coordinates x,y,z to the working array */
static void icell_set_x_simple(int ci,
                               real shx, real shy, real shz,
                               int gmx_unused na_c,
                               int stride, const real *x,
                               nbnxn_list_work_t *work)
{
    int ia, i;

    ia = ci*NBNXN_CPU_CLUSTER_I_SIZE;

    for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE; i++)
    {
        work->x_ci[i*STRIDE_XYZ+XX] = x[(ia+i)*stride+XX] + shx;
        work->x_ci[i*STRIDE_XYZ+YY] = x[(ia+i)*stride+YY] + shy;
        work->x_ci[i*STRIDE_XYZ+ZZ] = x[(ia+i)*stride+ZZ] + shz;
    }
}
/* Copies PBC-shifted super-cell atom coordinates x,y,z to the working array */
static void icell_set_x_supersub(int ci,
                                 real shx, real shy, real shz,
                                 int na_c,
                                 int stride, const real *x,
                                 nbnxn_list_work_t *work)
{
    int   ia, i;
    real *x_ci;

    x_ci = work->x_ci;

    ia = ci*GPU_NSUBCELL*na_c;
    for (i = 0; i < GPU_NSUBCELL*na_c; i++)
    {
        x_ci[i*DIM + XX] = x[(ia+i)*stride + XX] + shx;
        x_ci[i*DIM + YY] = x[(ia+i)*stride + YY] + shy;
        x_ci[i*DIM + ZZ] = x[(ia+i)*stride + ZZ] + shz;
    }
}
#ifdef NBNXN_SEARCH_BB_SIMD4
/* Copies PBC-shifted super-cell packed atom coordinates to the working array */
static void icell_set_x_supersub_simd4(int ci,
                                       real shx, real shy, real shz,
                                       int na_c,
                                       int stride, const real *x,
                                       nbnxn_list_work_t *work)
{
    int   si, io, ia, i, j;
    real *x_ci;

    x_ci = work->x_ci;

    for (si = 0; si < GPU_NSUBCELL; si++)
    {
        for (i = 0; i < na_c; i += STRIDE_PBB)
        {
            io = si*na_c + i;
            ia = ci*GPU_NSUBCELL*na_c + io;
            for (j = 0; j < STRIDE_PBB; j++)
            {
                x_ci[io*DIM + j + XX*STRIDE_PBB] = x[(ia+j)*stride+XX] + shx;
                x_ci[io*DIM + j + YY*STRIDE_PBB] = x[(ia+j)*stride+YY] + shy;
                x_ci[io*DIM + j + ZZ*STRIDE_PBB] = x[(ia+j)*stride+ZZ] + shz;
            }
        }
    }
}
#endif
static real minimum_subgrid_size_xy(const nbnxn_grid_t *grid)
{
    if (grid->bSimple)
    {
        return min(grid->sx, grid->sy);
    }
    else
    {
        return min(grid->sx/GPU_NSUBCELL_X, grid->sy/GPU_NSUBCELL_Y);
    }
}
static real effective_buffer_1x1_vs_MxN(const nbnxn_grid_t *gridi,
                                        const nbnxn_grid_t *gridj)
{
    const real eff_1x1_buffer_fac_overest = 0.1;

    /* Determine an atom-pair list cut-off buffer size for atom pairs,
     * to be added to rlist (including buffer) used for MxN.
     * This is for converting an MxN list to a 1x1 list. This means we can't
     * use the normal buffer estimate, as we have an MxN list in which
     * some atom pairs beyond rlist are missing. We want to capture
     * the beneficial effect of buffering by extra pairs just outside rlist,
     * while removing the useless pairs that are further away from rlist.
     * (Also the buffer could have been set manually not using the estimate.)
     * This buffer size is an overestimate.
     * We add 10% of the smallest grid sub-cell dimensions.
     * Note that the z-size differs per cell and we don't use this,
     * so we overestimate.
     * With PME, the 10% value gives a buffer that is somewhat larger
     * than the effective buffer with a tolerance of 0.005 kJ/mol/ps.
     * Smaller tolerances or using RF lead to a smaller effective buffer,
     * so 10% gives a safe overestimate.
     */
    return eff_1x1_buffer_fac_overest*(minimum_subgrid_size_xy(gridi) +
                                       minimum_subgrid_size_xy(gridj));
}
/* Clusters at the cut-off only increase rlist by 60% of their size */
static real nbnxn_rlist_inc_outside_fac = 0.6;

/* Due to the cluster size the effective pair-list is longer than
 * that of a simple atom pair-list. This function gives the extra distance.
 */
real nbnxn_get_rlist_effective_inc(int cluster_size_j, real atom_density)
{
    int  cluster_size_i;
    real vol_inc_i, vol_inc_j;

    /* We should get this from the setup, but currently it's the same for
     * all setups, including GPUs.
     */
    cluster_size_i = NBNXN_CPU_CLUSTER_I_SIZE;

    vol_inc_i = (cluster_size_i - 1)/atom_density;
    vol_inc_j = (cluster_size_j - 1)/atom_density;

    return nbnxn_rlist_inc_outside_fac*pow(vol_inc_i + vol_inc_j, 1.0/3.0);
}
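/* Worked example for the increase above (illustration only): with
 * cluster_size_j = 4 and an atom density of ~100 nm^-3 (roughly water),
 * vol_inc_i = vol_inc_j = 3/100 nm^3, giving an effective rlist increase
 * of 0.6*(0.06)^(1/3) ~= 0.23 nm on top of the nominal cut-off.
 */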
/* Estimates the interaction volume^2 for non-local interactions */
static real nonlocal_vol2(const gmx_domdec_zones_t *zones, rvec ls, real r)
{
    int  z, d;
    real cl, ca, za;
    real vold_est;
    real vol2_est_tot;

    vol2_est_tot = 0;

    /* Here we simply add up the volumes of 1, 2 or 3 1D decomposition
     * not home interaction volume^2. As these volumes are not additive,
     * this is an overestimate, but it would only be significant in the limit
     * of small cells, where we anyhow need to split the lists into
     * as small parts as possible.
     */
    for (z = 0; z < zones->n; z++)
    {
        if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
        {
            cl = 0;
            ca = 1;
            za = 1;
            for (d = 0; d < DIM; d++)
            {
                if (zones->shift[z][d] == 0)
                {
                    cl += 0.5*ls[d];
                    ca *= ls[d];
                    za *= zones->size[z].x1[d] - zones->size[z].x0[d];
                }
            }

            /* 4 octants of a sphere */
            vold_est  = 0.25*M_PI*r*r*r*r;
            /* 4 quarter pie slices on the edges */
            vold_est += 4*cl*M_PI/6.0*r*r*r;
            /* One rectangular volume on a face */
            vold_est += ca*0.5*r*r;

            vol2_est_tot += vold_est*za;
        }
    }

    return vol2_est_tot;
}
/* Estimates the average size of a full j-list for super/sub setup */
static int get_nsubpair_max(const nbnxn_search_t nbs,
                            int                  iloc,
                            real                 rlist,
                            int                  min_ci_balanced)
{
    const nbnxn_grid_t *grid;
    rvec                ls;
    real                xy_diag2, r_eff_sup, vol_est, nsp_est, nsp_est_nl;
    int                 nsubpair_max;

    grid = &nbs->grid[0];

    ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*GPU_NSUBCELL_X);
    ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*GPU_NSUBCELL_Y);
    ls[ZZ] = (grid->c1[ZZ] - grid->c0[ZZ])*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z);

    /* The average squared length of the diagonal of a sub cell */
    xy_diag2 = ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ];

    /* The formulas below are a heuristic estimate of the average nsj per si */
    r_eff_sup = rlist + nbnxn_rlist_inc_outside_fac*sqr((grid->na_c - 1.0)/grid->na_c)*sqrt(xy_diag2/3);

    if (!nbs->DomDec || nbs->zones->n == 1)
    {
        nsp_est_nl = 0;
    }
    else
    {
        nsp_est_nl =
            sqr(grid->atom_density/grid->na_c)*
            nonlocal_vol2(nbs->zones, ls, r_eff_sup);
    }

    if (LOCAL_I(iloc))
    {
        /* Sub-cell interacts with itself */
        vol_est  = ls[XX]*ls[YY]*ls[ZZ];
        /* 6/2 rectangular volume on the faces */
        vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup;
        /* 12/2 quarter pie slices on the edges */
        vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*sqr(r_eff_sup);
        /* 4 octants of a sphere */
        vol_est += 0.5*4.0/3.0*M_PI*pow(r_eff_sup, 3);

        nsp_est = grid->nsubc_tot*vol_est*grid->atom_density/grid->na_c;

        /* Subtract the non-local pair count */
        nsp_est -= nsp_est_nl;

        if (debug)
        {
            fprintf(debug, "nsp_est local %5.1f non-local %5.1f\n",
                    nsp_est, nsp_est_nl);
        }
    }
    else
    {
        nsp_est = nsp_est_nl;
    }

    if (min_ci_balanced <= 0 || grid->nc >= min_ci_balanced || grid->nc == 0)
    {
        /* We don't need to worry */
        nsubpair_max = -1;
    }
    else
    {
        /* Thus the (average) maximum j-list size should be as follows */
        nsubpair_max = max(1, (int)(nsp_est/min_ci_balanced+0.5));

        /* Since the target value is a maximum (this avoids high outliers,
         * which lead to load imbalance), not average, we add half the
         * number of pairs in a cj4 block to get the average about right.
         */
        nsubpair_max += GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE/2;
    }

    if (debug)
    {
        fprintf(debug, "nbl nsp estimate %.1f, nsubpair_max %d\n",
                nsp_est, nsubpair_max);
    }

    return nsubpair_max;
}
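/* Example of the balancing target above (illustration, assuming
 * GPU_NSUBCELL == 8 and NBNXN_GPU_JGROUP_SIZE == 4): with nsp_est = 64000
 * and min_ci_balanced = 1000 we get nsubpair_max = 64 + 8*4/2 = 80
 * sub-cell pairs per sci entry.
 */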
/* Debug list print function */
static void print_nblist_ci_cj(FILE *fp, const nbnxn_pairlist_t *nbl)
{
    int i, j;

    for (i = 0; i < nbl->nci; i++)
    {
        fprintf(fp, "ci %4d  shift %2d  ncj %3d\n",
                nbl->ci[i].ci, nbl->ci[i].shift,
                nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start);

        for (j = nbl->ci[i].cj_ind_start; j < nbl->ci[i].cj_ind_end; j++)
        {
            fprintf(fp, "  cj %5d  imask %x\n",
                    nbl->cj[j].cj,
                    nbl->cj[j].excl);
        }
    }
}
/* Debug list print function */
static void print_nblist_sci_cj(FILE *fp, const nbnxn_pairlist_t *nbl)
{
    int i, j4, j, ncp, si;

    for (i = 0; i < nbl->nsci; i++)
    {
        fprintf(fp, "ci %4d  shift %2d  ncj4 %2d\n",
                nbl->sci[i].sci, nbl->sci[i].shift,
                nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start);

        ncp = 0;
        for (j4 = nbl->sci[i].cj4_ind_start; j4 < nbl->sci[i].cj4_ind_end; j4++)
        {
            for (j = 0; j < NBNXN_GPU_JGROUP_SIZE; j++)
            {
                fprintf(fp, "  sj %5d  imask %x\n",
                        nbl->cj4[j4].cj[j],
                        nbl->cj4[j4].imei[0].imask);
                for (si = 0; si < GPU_NSUBCELL; si++)
                {
                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
                    {
                        ncp++;
                    }
                }
            }
        }
        fprintf(fp, "ci %4d  shift %2d  ncj4 %2d ncp %3d\n",
                nbl->sci[i].sci, nbl->sci[i].shift,
                nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start,
                ncp);
    }
}
/* Combine the pair lists *nbl generated on multiple threads into nblc */
static void combine_nblists(int nnbl, nbnxn_pairlist_t **nbl,
                            nbnxn_pairlist_t *nblc)
{
    int nsci, ncj4, nexcl;
    int n, i;

    if (nblc->bSimple)
    {
        gmx_incons("combine_nblists does not support simple lists");
    }

    nsci  = nblc->nsci;
    ncj4  = nblc->ncj4;
    nexcl = nblc->nexcl;
    for (i = 0; i < nnbl; i++)
    {
        nsci  += nbl[i]->nsci;
        ncj4  += nbl[i]->ncj4;
        nexcl += nbl[i]->nexcl;
    }

    if (nsci > nblc->sci_nalloc)
    {
        nb_realloc_sci(nblc, nsci);
    }
    if (ncj4 > nblc->cj4_nalloc)
    {
        nblc->cj4_nalloc = over_alloc_small(ncj4);
        nbnxn_realloc_void((void **)&nblc->cj4,
                           nblc->ncj4*sizeof(*nblc->cj4),
                           nblc->cj4_nalloc*sizeof(*nblc->cj4),
                           nblc->alloc, nblc->free);
    }
    if (nexcl > nblc->excl_nalloc)
    {
        nblc->excl_nalloc = over_alloc_small(nexcl);
        nbnxn_realloc_void((void **)&nblc->excl,
                           nblc->nexcl*sizeof(*nblc->excl),
                           nblc->excl_nalloc*sizeof(*nblc->excl),
                           nblc->alloc, nblc->free);
    }

    /* Each thread should copy its own data to the combined arrays,
     * as otherwise data will go back and forth between different caches.
     */
#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
    for (n = 0; n < nnbl; n++)
    {
        int                     sci_offset;
        int                     cj4_offset;
        int                     ci_offset;
        int                     excl_offset;
        int                     i, j4;
        const nbnxn_pairlist_t *nbli;

        /* Determine the offset in the combined data for our thread */
        sci_offset  = nblc->nsci;
        cj4_offset  = nblc->ncj4;
        ci_offset   = nblc->nci_tot;
        excl_offset = nblc->nexcl;

        for (i = 0; i < n; i++)
        {
            sci_offset  += nbl[i]->nsci;
            cj4_offset  += nbl[i]->ncj4;
            ci_offset   += nbl[i]->nci_tot;
            excl_offset += nbl[i]->nexcl;
        }

        nbli = nbl[n];

        for (i = 0; i < nbli->nsci; i++)
        {
            nblc->sci[sci_offset+i]                = nbli->sci[i];
            nblc->sci[sci_offset+i].cj4_ind_start += cj4_offset;
            nblc->sci[sci_offset+i].cj4_ind_end   += cj4_offset;
        }

        for (j4 = 0; j4 < nbli->ncj4; j4++)
        {
            nblc->cj4[cj4_offset+j4]                   = nbli->cj4[j4];
            nblc->cj4[cj4_offset+j4].imei[0].excl_ind += excl_offset;
            nblc->cj4[cj4_offset+j4].imei[1].excl_ind += excl_offset;
        }

        for (j4 = 0; j4 < nbli->nexcl; j4++)
        {
            nblc->excl[excl_offset+j4] = nbli->excl[j4];
        }
    }

    for (n = 0; n < nnbl; n++)
    {
        nblc->nsci    += nbl[n]->nsci;
        nblc->ncj4    += nbl[n]->ncj4;
        nblc->nci_tot += nbl[n]->nci_tot;
        nblc->nexcl   += nbl[n]->nexcl;
    }
}
static void balance_fep_lists(const nbnxn_search_t  nbs,
                              nbnxn_pairlist_set_t *nbl_lists)
{
    int       nnbl, th;
    int       nri_tot, nrj_tot, nrj_target;
    int       th_dest;
    t_nblist *nbld;

    nnbl = nbl_lists->nnbl;

    if (nnbl == 1)
    {
        /* Nothing to balance */
        return;
    }

    /* Count the total i-lists and pairs */
    nri_tot = 0;
    nrj_tot = 0;
    for (th = 0; th < nnbl; th++)
    {
        nri_tot += nbl_lists->nbl_fep[th]->nri;
        nrj_tot += nbl_lists->nbl_fep[th]->nrj;
    }

    nrj_target = (nrj_tot + nnbl - 1)/nnbl;

    assert(gmx_omp_nthreads_get(emntNonbonded) == nnbl);

#pragma omp parallel for schedule(static) num_threads(nnbl)
    for (th = 0; th < nnbl; th++)
    {
        t_nblist *nbl;

        nbl = nbs->work[th].nbl_fep;

        /* Note that here we allocate for the total size, instead of
         * a per-thread estimate (which is hard to obtain).
         */
        if (nri_tot > nbl->maxnri)
        {
            nbl->maxnri = over_alloc_large(nri_tot);
            reallocate_nblist(nbl);
        }
        if (nri_tot > nbl->maxnri || nrj_tot > nbl->maxnrj)
        {
            nbl->maxnrj = over_alloc_small(nrj_tot);
            srenew(nbl->jjnr, nbl->maxnrj);
            srenew(nbl->excl_fep, nbl->maxnrj);
        }

        clear_pairlist_fep(nbl);
    }

    /* Loop over the source lists and assign and copy i-entries */
    th_dest = 0;
    nbld    = nbs->work[th_dest].nbl_fep;
    for (th = 0; th < nnbl; th++)
    {
        t_nblist *nbls;
        int       i, j;

        nbls = nbl_lists->nbl_fep[th];

        for (i = 0; i < nbls->nri; i++)
        {
            int nrj;

            /* The number of pairs in this i-entry */
            nrj = nbls->jindex[i+1] - nbls->jindex[i];

            /* Decide if list th_dest is too large and we should proceed
             * to the next destination list.
             */
            if (th_dest+1 < nnbl && nbld->nrj > 0 &&
                nbld->nrj + nrj - nrj_target > nrj_target - nbld->nrj)
            {
                th_dest++;
                nbld = nbs->work[th_dest].nbl_fep;
            }

            nbld->iinr[nbld->nri]  = nbls->iinr[i];
            nbld->gid[nbld->nri]   = nbls->gid[i];
            nbld->shift[nbld->nri] = nbls->shift[i];

            for (j = nbls->jindex[i]; j < nbls->jindex[i+1]; j++)
            {
                nbld->jjnr[nbld->nrj]     = nbls->jjnr[j];
                nbld->excl_fep[nbld->nrj] = nbls->excl_fep[j];
                nbld->nrj++;
            }
            nbld->nri++;
            nbld->jindex[nbld->nri] = nbld->nrj;
        }
    }

    /* Swap the list pointers */
    for (th = 0; th < nnbl; th++)
    {
        t_nblist *nbl_tmp;

        nbl_tmp                = nbl_lists->nbl_fep[th];
        nbl_lists->nbl_fep[th] = nbs->work[th].nbl_fep;
        nbs->work[th].nbl_fep  = nbl_tmp;

        if (debug)
        {
            fprintf(debug, "nbl_fep[%d] nri %4d nrj %4d\n",
                    th,
                    nbl_lists->nbl_fep[th]->nri,
                    nbl_lists->nbl_fep[th]->nrj);
        }
    }
}
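/* Example of the target above: with nrj_tot = 1001 pairs on nnbl = 8
 * threads, nrj_target = (1001 + 7)/8 = 126, and i-entries are assigned to
 * the destination list whose pair count stays closest to that target.
 */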
/* Returns the next ci to be processed by our thread */
static gmx_bool next_ci(const nbnxn_grid_t *grid,
                        int conv,
                        int nth, int ci_block,
                        int *ci_x, int *ci_y,
                        int *ci_b, int *ci)
{
    (*ci_b)++;
    (*ci)++;

    if (*ci_b == ci_block)
    {
        /* Jump to the next block assigned to this task */
        *ci   += (nth - 1)*ci_block;
        *ci_b  = 0;
    }

    if (*ci >= grid->nc*conv)
    {
        return FALSE;
    }

    while (*ci >= grid->cxy_ind[*ci_x*grid->ncy + *ci_y + 1]*conv)
    {
        *ci_y += 1;
        if (*ci_y == grid->ncy)
        {
            *ci_x += 1;
            *ci_y  = 0;
        }
    }

    return TRUE;
}
/* Returns the distance^2 for which we put cell pairs in the list
 * without checking atom pair distances. This is usually < rlist^2.
 */
static float boundingbox_only_distance2(const nbnxn_grid_t *gridi,
                                        const nbnxn_grid_t *gridj,
                                        real                rlist,
                                        gmx_bool            simple)
{
    /* If the distance between two sub-cell bounding boxes is less
     * than this distance, do not check the distance between
     * all particle pairs in the sub-cell, since then it is likely
     * that the box pair has atom pairs within the cut-off.
     * We use the nblist cut-off minus 0.5 times the average x/y diagonal
     * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
     * Using more than 0.5 gains at most 0.5%.
     * If forces are calculated more than twice, the performance gain
     * in the force calculation outweighs the cost of checking.
     * Note that with sub-cell lists, the atom-pair distance check
     * is only performed when only 1 out of 8 sub-cells is within range;
     * this is because the GPU is much faster than the CPU.
     */
    real bbx, bby;
    real rbb2;

    bbx = 0.5*(gridi->sx + gridj->sx);
    bby = 0.5*(gridi->sy + gridj->sy);
    if (!simple)
    {
        bbx /= GPU_NSUBCELL_X;
        bby /= GPU_NSUBCELL_Y;
    }

    rbb2 = sqr(max(0, rlist - 0.5*sqrt(bbx*bbx + bby*bby)));

#ifndef GMX_DOUBLE
    return rbb2;
#else
    return (float)((1+GMX_FLOAT_EPS)*rbb2);
#endif
}
static int get_ci_block_size(const nbnxn_grid_t *gridi,
                             gmx_bool bDomDec, int nth)
{
    const int ci_block_enum      = 5;
    const int ci_block_denom     = 11;
    const int ci_block_min_atoms = 16;
    int       ci_block;

    /* Here we decide how to distribute the blocks over the threads.
     * We use prime numbers to try to avoid that the grid size becomes
     * a multiple of the number of threads, which would lead to some
     * threads getting "inner" pairs and others getting boundary pairs,
     * which in turn leads to load imbalance between threads.
     * Set the block size as 5/11/ntask times the average number of cells
     * in a y,z slab. This should ensure a quite uniform distribution
     * of the grid parts of the different threads along all three grid
     * zone boundaries with 3D domain decomposition. At the same time
     * the blocks will not become too small.
     */
    ci_block = (gridi->nc*ci_block_enum)/(ci_block_denom*gridi->ncx*nth);

    /* Ensure the blocks are not too small: avoids cache invalidation */
    if (ci_block*gridi->na_sc < ci_block_min_atoms)
    {
        ci_block = (ci_block_min_atoms + gridi->na_sc - 1)/gridi->na_sc;
    }

    /* Without domain decomposition
     * or with less than 3 blocks per task, divide in nth blocks.
     */
    if (!bDomDec || ci_block*3*nth > gridi->nc)
    {
        ci_block = (gridi->nc + nth - 1)/nth;
    }

    return ci_block;
}
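
/* Numeric illustration of the 5/11 heuristic above, with assumed values:
 * a grid with nc = 3000 cells, ncx = 15 columns and nth = 8 threads gives
 * ci_block = (3000*5)/(11*15*8) = 11 cells per block, so each thread
 * sweeps the grid in many small, well-interleaved chunks instead of one
 * contiguous slab. Illustrative only.
 */
#if 0
static int example_ci_block(int nc, int ncx, int nth)
{
    return (nc*5)/(11*ncx*nth);
}
#endif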
/* Generates the part of pair-list nbl assigned to our thread */
static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
                                     const nbnxn_grid_t *gridi,
                                     const nbnxn_grid_t *gridj,
                                     nbnxn_search_work_t *work,
                                     const nbnxn_atomdata_t *nbat,
                                     const t_blocka *excl,
                                     real rlist,
                                     int nb_kernel_type,
                                     int ci_block,
                                     gmx_bool bFBufferFlag,
                                     int nsubpair_max,
                                     gmx_bool progBal,
                                     int min_ci_balanced,
                                     int th, int nth,
                                     nbnxn_pairlist_t *nbl,
                                     t_nblist *nbl_fep)
{
    int               na_cj_2log;
    matrix            box;
    real              rl2, rl_fep2 = 0;
    float             rbb2;
    int               d;
    int               ci_b, ci, ci_x, ci_y, ci_xy, cj;
    int               shp[DIM];
    int               tx, ty, tz;
    int               shift;
    real              shx, shy, shz;
    int               conv_i, cell0_i;
    const nbnxn_bb_t *bb_i = NULL;
#ifdef NBNXN_BBXXXX
    const float      *pbb_i = NULL;
#endif
    const float      *bbcz_i, *bbcz_j;
    const int        *flags_i;
    real              bx0, bx1, by0, by1, bz0, bz1;
    real              bz1_frac;
    real              d2cx, d2z, d2z_cx, d2z_cy, d2zx, d2zxy, d2xy;
    int               cxf, cxl, cyf, cyf_x, cyl;
    int               cx, cy;
    int               c0, c1, cs, cf, cl;
    int               ndistc;
    int               ncpcheck;
    int               gridi_flag_shift = 0, gridj_flag_shift = 0;
    unsigned int     *gridj_flag       = NULL;
    int               ncj_old_i, ncj_old_j;

    nbs_cycle_start(&work->cc[enbsCCsearch]);

    if (gridj->bSimple != nbl->bSimple)
    {
        gmx_incons("Grid incompatible with pair-list");
    }

    nbl->na_sc = gridj->na_sc;
    nbl->na_ci = gridj->na_c;
    nbl->na_cj = nbnxn_kernel_to_cj_size(nb_kernel_type);
    na_cj_2log = get_2log(nbl->na_cj);

    nbl->rlist = rlist;

    if (bFBufferFlag)
    {
        /* Determine conversion of clusters to flag blocks */
        gridi_flag_shift = 0;
        while ((nbl->na_ci<<gridi_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
        {
            gridi_flag_shift++;
        }
        gridj_flag_shift = 0;
        while ((nbl->na_cj<<gridj_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
        {
            gridj_flag_shift++;
        }

        gridj_flag = work->buffer_flags.flag;
    }

    copy_mat(nbs->box, box);

    rl2 = nbl->rlist*nbl->rlist;

    if (nbs->bFEP && !nbl->bSimple)
    {
        /* Determine an atom-pair list cut-off distance for FEP atom pairs.
         * We should not simply use rlist, since then we would not have
         * the small, effective buffering of the NxN lists.
         * The buffer is an overestimate, but the resulting cost for pairs
         * beyond rlist is negligible compared to the FEP pairs within rlist.
         */
        rl_fep2 = nbl->rlist + effective_buffer_1x1_vs_MxN(gridi, gridj);

        if (debug)
        {
            fprintf(debug, "nbl_fep atom-pair rlist %f\n", rl_fep2);
        }
        rl_fep2 = rl_fep2*rl_fep2;
    }

    rbb2 = boundingbox_only_distance2(gridi, gridj, nbl->rlist, nbl->bSimple);

    if (debug)
    {
        fprintf(debug, "nbl bounding box only distance %f\n", sqrt(rbb2));
    }

    /* Set the shift range */
    for (d = 0; d < DIM; d++)
    {
        /* Check if we need periodicity shifts.
         * Without PBC or with domain decomposition we don't need them.
         */
        if (d >= ePBC2npbcdim(nbs->ePBC) || nbs->dd_dim[d])
        {
            shp[d] = 0;
        }
        else
        {
            if (d == XX &&
                box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < sqrt(rl2))
            {
                shp[d] = 2;
            }
            else
            {
                shp[d] = 1;
            }
        }
    }

    if (nbl->bSimple && !gridi->bSimple)
    {
        conv_i  = gridi->na_sc/gridj->na_sc;
        bb_i    = gridi->bb_simple;
        bbcz_i  = gridi->bbcz_simple;
        flags_i = gridi->flags_simple;
    }
    else
    {
        conv_i  = 1;
#ifdef NBNXN_BBXXXX
        if (gridi->bSimple)
        {
            bb_i = gridi->bb;
        }
        else
        {
            pbb_i = gridi->pbb;
        }
#else
        /* We use the normal bounding box format for both grid types */
        bb_i  = gridi->bb;
#endif
        bbcz_i  = gridi->bbcz;
        flags_i = gridi->flags;
    }
    cell0_i = gridi->cell0*conv_i;

    bbcz_j = gridj->bbcz;

    if (conv_i != 1)
    {
        /* Blocks of the conversion factor - 1 give a large repeat count
         * combined with a small block size. This should result in good
         * load balancing for both small and large domains.
         */
        ci_block = conv_i - 1;
    }
    if (debug)
    {
        fprintf(debug, "nbl nc_i %d col.av. %.1f ci_block %d\n",
                gridi->nc, gridi->nc/(double)(gridi->ncx*gridi->ncy), ci_block);
    }
    ndistc   = 0;
    ncpcheck = 0;

    /* Initialize ci_b and ci to 1 before where we want them to start,
     * as they will both be incremented in next_ci.
     */
    ci_b = -1;
    ci   = th*ci_block - 1;
    ci_x = 0;
    ci_y = 0;
    while (next_ci(gridi, conv_i, nth, ci_block, &ci_x, &ci_y, &ci_b, &ci))
    {
        if (nbl->bSimple && flags_i[ci] == 0)
        {
            continue;
        }

        ncj_old_i = nbl->ncj;

        d2cx = 0;
        if (gridj != gridi && shp[XX] == 0)
        {
            if (nbl->bSimple)
            {
                bx1 = bb_i[ci].upper[BB_X];
            }
            else
            {
                bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx;
            }
            if (bx1 < gridj->c0[XX])
            {
                d2cx = sqr(gridj->c0[XX] - bx1);

                if (d2cx >= rl2)
                {
                    continue;
                }
            }
        }

        ci_xy = ci_x*gridi->ncy + ci_y;

        /* Loop over shift vectors in three dimensions */
        for (tz = -shp[ZZ]; tz <= shp[ZZ]; tz++)
        {
            shz = tz*box[ZZ][ZZ];

            bz0 = bbcz_i[ci*NNBSBB_D  ] + shz;
            bz1 = bbcz_i[ci*NNBSBB_D+1] + shz;

            if (tz == 0)
            {
                d2z = 0;
            }
            else if (tz < 0)
            {
                d2z = sqr(bz1);
            }
            else
            {
                d2z = sqr(bz0 - box[ZZ][ZZ]);
            }

            d2z_cx = d2z + d2cx;

            if (d2z_cx >= rl2)
            {
                continue;
            }

            bz1_frac =
                bz1/((real)(gridi->cxy_ind[ci_xy+1] - gridi->cxy_ind[ci_xy]));
            if (bz1_frac < 0)
            {
                bz1_frac = 0;
            }
            /* The check with bz1_frac close to or larger than 1 comes later */

            for (ty = -shp[YY]; ty <= shp[YY]; ty++)
            {
                shy = ty*box[YY][YY] + tz*box[ZZ][YY];

                if (nbl->bSimple)
                {
                    by0 = bb_i[ci].lower[BB_Y] + shy;
                    by1 = bb_i[ci].upper[BB_Y] + shy;
                }
                else
                {
                    by0 = gridi->c0[YY] + (ci_y  )*gridi->sy + shy;
                    by1 = gridi->c0[YY] + (ci_y+1)*gridi->sy + shy;
                }

                get_cell_range(by0, by1,
                               gridj->ncy, gridj->c0[YY], gridj->sy, gridj->inv_sy,
                               d2z_cx, rl2,
                               &cyf, &cyl);

                if (cyf > cyl)
                {
                    continue;
                }

                d2z_cy = d2z;
                if (by1 < gridj->c0[YY])
                {
                    d2z_cy += sqr(gridj->c0[YY] - by1);
                }
                else if (by0 > gridj->c1[YY])
                {
                    d2z_cy += sqr(by0 - gridj->c1[YY]);
                }

                for (tx = -shp[XX]; tx <= shp[XX]; tx++)
                {
                    shift = XYZ2IS(tx, ty, tz);

#ifdef NBNXN_SHIFT_BACKWARD
                    if (gridi == gridj && shift > CENTRAL)
                    {
                        continue;
                    }
#endif

                    shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];

                    if (nbl->bSimple)
                    {
                        bx0 = bb_i[ci].lower[BB_X] + shx;
                        bx1 = bb_i[ci].upper[BB_X] + shx;
                    }
                    else
                    {
                        bx0 = gridi->c0[XX] + (ci_x  )*gridi->sx + shx;
                        bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx + shx;
                    }

                    get_cell_range(bx0, bx1,
                                   gridj->ncx, gridj->c0[XX], gridj->sx, gridj->inv_sx,
                                   d2z_cy, rl2,
                                   &cxf, &cxl);

                    if (cxf > cxl)
                    {
                        continue;
                    }
                    if (nbl->bSimple)
                    {
                        new_ci_entry(nbl, cell0_i+ci, shift, flags_i[ci]);
                    }
                    else
                    {
                        new_sci_entry(nbl, cell0_i+ci, shift);
                    }

#ifndef NBNXN_SHIFT_BACKWARD
                    if (cxf < ci_x)
#else
                    if (shift == CENTRAL && gridi == gridj &&
                        cxf < ci_x)
#endif
                    {
                        /* Leave the pairs with i > j.
                         * x is the major index, so skip half of it.
                         */
                        cxf = ci_x;
                    }

                    if (nbl->bSimple)
                    {
                        set_icell_bb_simple(bb_i, ci, shx, shy, shz,
                                            nbl->work->bb_ci);
                    }
                    else
                    {
#ifdef NBNXN_BBXXXX
                        set_icell_bbxxxx_supersub(pbb_i, ci, shx, shy, shz,
                                                  nbl->work->pbb_ci);
#else
                        set_icell_bb_supersub(bb_i, ci, shx, shy, shz,
                                              nbl->work->bb_ci);
#endif
                    }

                    nbs->icell_set_x(cell0_i+ci, shx, shy, shz,
                                     gridi->na_c, nbat->xstride, nbat->x,
                                     nbl->work);

                    for (cx = cxf; cx <= cxl; cx++)
                    {
                        d2zx = d2z;
                        if (gridj->c0[XX] + cx*gridj->sx > bx1)
                        {
                            d2zx += sqr(gridj->c0[XX] + cx*gridj->sx - bx1);
                        }
                        else if (gridj->c0[XX] + (cx+1)*gridj->sx < bx0)
                        {
                            d2zx += sqr(gridj->c0[XX] + (cx+1)*gridj->sx - bx0);
                        }

#ifndef NBNXN_SHIFT_BACKWARD
                        if (gridi == gridj &&
                            cx == 0 && cyf < ci_y)
#else
                        if (gridi == gridj &&
                            cx == 0 && shift == CENTRAL && cyf < ci_y)
#endif
                        {
                            /* Leave the pairs with i > j.
                             * Skip half of y when i and j have the same x.
                             */
                            cyf_x = ci_y;
                        }
                        else
                        {
                            cyf_x = cyf;
                        }

                        for (cy = cyf_x; cy <= cyl; cy++)
                        {
                            c0 = gridj->cxy_ind[cx*gridj->ncy+cy];
                            c1 = gridj->cxy_ind[cx*gridj->ncy+cy+1];
#ifdef NBNXN_SHIFT_BACKWARD
                            if (gridi == gridj &&
                                shift == CENTRAL && c0 < ci)
                            {
                                c0 = ci;
                            }
#endif

                            d2zxy = d2zx;
                            if (gridj->c0[YY] + cy*gridj->sy > by1)
                            {
                                d2zxy += sqr(gridj->c0[YY] + cy*gridj->sy - by1);
                            }
                            else if (gridj->c0[YY] + (cy+1)*gridj->sy < by0)
                            {
                                d2zxy += sqr(gridj->c0[YY] + (cy+1)*gridj->sy - by0);
                            }
                            if (c1 > c0 && d2zxy < rl2)
                            {
                                cs = c0 + (int)(bz1_frac*(c1 - c0));
                                if (cs >= c1)
                                {
                                    cs = c1 - 1;
                                }

                                d2xy = d2zxy - d2z;

                                /* Find the lowest cell that can possibly
                                 * be within range.
                                 */
                                cf = cs;
                                while (cf > c0 &&
                                       (bbcz_j[cf*NNBSBB_D+1] >= bz0 ||
                                        d2xy + sqr(bbcz_j[cf*NNBSBB_D+1] - bz0) < rl2))
                                {
                                    cf--;
                                }

                                /* Find the highest cell that can possibly
                                 * be within range.
                                 */
                                cl = cs;
                                while (cl < c1-1 &&
                                       (bbcz_j[cl*NNBSBB_D] <= bz1 ||
                                        d2xy + sqr(bbcz_j[cl*NNBSBB_D] - bz1) < rl2))
                                {
                                    cl++;
                                }
#ifdef NBNXN_REFCODE
                                {
                                    /* Simple reference code, for debugging,
                                     * overrides the more complex code above.
                                     */
                                    int k;

                                    cf = c1;
                                    cl = -1;
                                    for (k = c0; k < c1; k++)
                                    {
                                        if (box_dist2(bx0, bx1, by0, by1, bz0, bz1, bb+k) < rl2 &&
                                            k < cf)
                                        {
                                            cf = k;
                                        }
                                        if (box_dist2(bx0, bx1, by0, by1, bz0, bz1, bb+k) < rl2 &&
                                            k > cl)
                                        {
                                            cl = k;
                                        }
                                    }
                                }
#endif

                                if (gridi == gridj)
                                {
                                    /* We want each atom/cell pair only once,
                                     * only use cj >= ci.
                                     */
#ifndef NBNXN_SHIFT_BACKWARD
                                    cf = max(cf, ci);
#else
                                    if (shift == CENTRAL)
                                    {
                                        cf = max(cf, ci);
                                    }
#endif
                                }

                                if (cf <= cl)
                                {
                                    /* For f buffer flags with simple lists */
                                    ncj_old_j = nbl->ncj;

                                    switch (nb_kernel_type)
                                    {
                                        case nbnxnk4x4_PlainC:
                                            check_subcell_list_space_simple(nbl, cl-cf+1);

                                            make_cluster_list_simple(gridj,
                                                                     nbl, ci, cf, cl,
                                                                     (gridi == gridj && shift == CENTRAL),
                                                                     nbat->x,
                                                                     rl2, rbb2,
                                                                     &ndistc);
                                            break;
#ifdef GMX_NBNXN_SIMD_4XN
                                        case nbnxnk4xN_SIMD_4xN:
                                            check_subcell_list_space_simple(nbl, ci_to_cj(na_cj_2log, cl-cf)+2);
                                            make_cluster_list_simd_4xn(gridj,
                                                                       nbl, ci, cf, cl,
                                                                       (gridi == gridj && shift == CENTRAL),
                                                                       nbat->x,
                                                                       rl2, rbb2,
                                                                       &ndistc);
                                            break;
#endif
#ifdef GMX_NBNXN_SIMD_2XNN
                                        case nbnxnk4xN_SIMD_2xNN:
                                            check_subcell_list_space_simple(nbl, ci_to_cj(na_cj_2log, cl-cf)+2);
                                            make_cluster_list_simd_2xnn(gridj,
                                                                        nbl, ci, cf, cl,
                                                                        (gridi == gridj && shift == CENTRAL),
                                                                        nbat->x,
                                                                        rl2, rbb2,
                                                                        &ndistc);
                                            break;
#endif
                                        case nbnxnk8x8x8_PlainC:
                                        case nbnxnk8x8x8_CUDA:
                                            check_subcell_list_space_supersub(nbl, cl-cf+1);
                                            for (cj = cf; cj <= cl; cj++)
                                            {
                                                make_cluster_list_supersub(gridi, gridj,
                                                                           nbl, ci, cj,
                                                                           (gridi == gridj && shift == CENTRAL && ci == cj),
                                                                           nbat->xstride, nbat->x,
                                                                           rl2, rbb2,
                                                                           &ndistc);
                                            }
                                            break;
                                    }
                                    ncpcheck += cl - cf + 1;

                                    if (bFBufferFlag && nbl->ncj > ncj_old_j)
                                    {
                                        int cbf, cbl, cb;

                                        cbf = nbl->cj[ncj_old_j].cj >> gridj_flag_shift;
                                        cbl = nbl->cj[nbl->ncj-1].cj >> gridj_flag_shift;
                                        for (cb = cbf; cb <= cbl; cb++)
                                        {
                                            gridj_flag[cb] = 1U<<th;
                                        }
                                    }
                                }
                            }
                        }
                    }

                    /* Set the exclusions for this ci list */
                    if (nbl->bSimple)
                    {
                        set_ci_top_excls(nbs,
                                         nbl,
                                         shift == CENTRAL && gridi == gridj,
                                         gridj->na_c_2log,
                                         na_cj_2log,
                                         &(nbl->ci[nbl->nci]),
                                         excl);

                        if (nbs->bFEP)
                        {
                            make_fep_list(nbs, nbat, nbl,
                                          shift == CENTRAL && gridi == gridj,
                                          &(nbl->ci[nbl->nci]),
                                          gridi, gridj, nbl_fep);
                        }
                    }
                    else
                    {
                        set_sci_top_excls(nbs,
                                          nbl,
                                          shift == CENTRAL && gridi == gridj,
                                          gridj->na_c_2log,
                                          &(nbl->sci[nbl->nsci]),
                                          excl);

                        if (nbs->bFEP)
                        {
                            make_fep_list_supersub(nbs, nbat, nbl,
                                                   shift == CENTRAL && gridi == gridj,
                                                   &(nbl->sci[nbl->nsci]),
                                                   shx, shy, shz,
                                                   rl_fep2,
                                                   gridi, gridj, nbl_fep);
                        }
                    }

                    /* Close this ci list */
                    if (nbl->bSimple)
                    {
                        close_ci_entry_simple(nbl);
                    }
                    else
                    {
                        close_ci_entry_supersub(nbl,
                                                nsubpair_max,
                                                progBal, min_ci_balanced,
                                                th, nth);
                    }
                }
            }
        }

        if (bFBufferFlag && nbl->ncj > ncj_old_i)
        {
            work->buffer_flags.flag[(gridi->cell0+ci)>>gridi_flag_shift] = 1U<<th;
        }
    }

    work->ndistc = ndistc;

    nbs_cycle_stop(&work->cc[enbsCCsearch]);

    if (debug)
    {
        fprintf(debug, "number of distance checks %d\n", ndistc);
        fprintf(debug, "ncpcheck %s %d\n", gridi == gridj ? "local" : "non-local",
                ncpcheck);

        if (nbl->bSimple)
        {
            print_nblist_statistics_simple(debug, nbl, nbs, rlist);
        }
        else
        {
            print_nblist_statistics_supersub(debug, nbl, nbs, rlist);
        }

        if (nbs->bFEP)
        {
            fprintf(debug, "nbl FEP list pairs: %d\n", nbl_fep->nrj);
        }
    }
}
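
/* Standalone sketch of the incremental distance pruning used above:
 * the squared cell-pair distance is accumulated one dimension at a
 * time (z, then x, then y), so whole columns of cells can be skipped
 * as soon as the partial sum already exceeds rlist^2. Illustrative
 * helper with assumed names, not GROMACS API.
 */
#if 0
static float axis_dist2(float lo, float hi, float c0, float c1)
{
    /* Squared distance between interval [lo,hi] and cell [c0,c1]
     * along one axis; zero when the intervals overlap.
     */
    if (c0 > hi)
    {
        return (c0 - hi)*(c0 - hi);
    }
    if (c1 < lo)
    {
        return (lo - c1)*(lo - c1);
    }
    return 0.0f;
}

/* Usage pattern mirroring the loops above:
 *   d2  = axis_dist2(bz0, bz1, ...); if (d2  >= rl2) skip column;
 *   d2 += axis_dist2(bx0, bx1, ...); if (d2  >= rl2) skip column;
 *   d2 += axis_dist2(by0, by1, ...); if (d2  <  rl2) check cells;
 */
#endif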
static void reduce_buffer_flags(const nbnxn_search_t        nbs,
                                int                         nsrc,
                                const nbnxn_buffer_flags_t *dest)
{
    int                 s, b;
    const unsigned int *flag;

    for (s = 0; s < nsrc; s++)
    {
        flag = nbs->work[s].buffer_flags.flag;

        for (b = 0; b < dest->nflag; b++)
        {
            dest->flag[b] |= flag[b];
        }
    }
}
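
/* Minimal sketch of the flag reduction above: each thread owns one bit
 * per output buffer block, and OR-ing the per-thread masks tells the
 * later force reduction which threads touched which block. Illustrative
 * only, not GROMACS API.
 */
#if 0
static void or_flags(unsigned int *dest, const unsigned int *src, int n)
{
    int b;

    for (b = 0; b < n; b++)
    {
        dest[b] |= src[b];
    }
}
#endif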
static void print_reduction_cost(const nbnxn_buffer_flags_t *flags, int nout)
{
    int nelem, nkeep, ncopy, nred, b, c, out;

    nelem = 0;
    nkeep = 0;
    ncopy = 0;
    nred  = 0;
    for (b = 0; b < flags->nflag; b++)
    {
        if (flags->flag[b] == 1)
        {
            /* Only flag 0 is set, no copy or reduction required */
            nelem++;
            nkeep++;
        }
        else if (flags->flag[b] > 0)
        {
            c = 0;
            for (out = 0; out < nout; out++)
            {
                if (flags->flag[b] & (1U<<out))
                {
                    c++;
                }
            }
            nelem += c;
            if (c == 1)
            {
                ncopy++;
            }
            else
            {
                nred += c;
            }
        }
    }

    fprintf(debug, "nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
            flags->nflag, nout,
            nelem/(double)(flags->nflag),
            nkeep/(double)(flags->nflag),
            ncopy/(double)(flags->nflag),
            nred/(double)(flags->nflag));
}
/* Perform a count (linear) sort to sort the smaller lists to the end.
 * This avoids load imbalance on the GPU, as large lists will be
 * scheduled and executed first and the smaller lists later.
 * Load balancing between multi-processors only happens at the end
 * and there smaller lists lead to more effective load balancing.
 * The sorting is done on the cj4 count, not on the actual pair counts.
 * Not only does this make the sort faster, but it also results in
 * better load balancing than using a list sorted on exact load.
 * This function swaps the sci pointers in the pair list to avoid a
 * copy operation.
 */
static void sort_sci(nbnxn_pairlist_t *nbl)
{
    nbnxn_list_work_t *work;
    int                m, i, s, s0, s1;
    nbnxn_sci_t       *sci_sort;

    if (nbl->ncj4 <= nbl->nsci)
    {
        /* nsci = 0 or all sci have size 1, sorting won't change the order */
        return;
    }

    work = nbl->work;

    /* We will distinguish differences up to double the average */
    m = (2*nbl->ncj4)/nbl->nsci;

    if (m + 1 > work->sort_nalloc)
    {
        work->sort_nalloc = over_alloc_large(m + 1);
        srenew(work->sort, work->sort_nalloc);
    }

    if (work->sci_sort_nalloc != nbl->sci_nalloc)
    {
        work->sci_sort_nalloc = nbl->sci_nalloc;
        nbnxn_realloc_void((void **)&work->sci_sort,
                           0,
                           work->sci_sort_nalloc*sizeof(*work->sci_sort),
                           nbl->alloc, nbl->free);
    }

    /* Count the entries of each size */
    for (i = 0; i <= m; i++)
    {
        work->sort[i] = 0;
    }
    for (s = 0; s < nbl->nsci; s++)
    {
        i = min(m, nbl->sci[s].cj4_ind_end - nbl->sci[s].cj4_ind_start);
        work->sort[i]++;
    }
    /* Calculate the offset for each count */
    s0            = work->sort[m];
    work->sort[m] = 0;
    for (i = m - 1; i >= 0; i--)
    {
        s1            = work->sort[i];
        work->sort[i] = work->sort[i + 1] + s0;
        s0            = s1;
    }

    /* Sort entries directly into place */
    sci_sort = work->sci_sort;
    for (s = 0; s < nbl->nsci; s++)
    {
        i = min(m, nbl->sci[s].cj4_ind_end - nbl->sci[s].cj4_ind_start);
        sci_sort[work->sort[i]++] = nbl->sci[s];
    }

    /* Swap the sci pointers so we use the new, sorted list */
    work->sci_sort = nbl->sci;
    nbl->sci       = sci_sort;
}
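
/* Standalone sketch of the counting sort above: bucket the lists by
 * (capped) size, compute offsets so the largest sizes come first, then
 * scatter each entry into place in one pass. All names are
 * illustrative, not GROMACS API.
 */
#if 0
#include <string.h>

static void count_sort_desc(const int *size, int n, int m,
                            int *count /* m+1 ints */, int *order /* n ints */)
{
    int i, s, s0, s1;

    /* Count entries per (capped) size bucket */
    memset(count, 0, (m + 1)*sizeof(*count));
    for (s = 0; s < n; s++)
    {
        count[size[s] < m ? size[s] : m]++;
    }
    /* Offsets: the largest bucket m starts at 0, smaller ones follow */
    s0       = count[m];
    count[m] = 0;
    for (i = m - 1; i >= 0; i--)
    {
        s1       = count[i];
        count[i] = count[i + 1] + s0;
        s0       = s1;
    }
    /* Scatter directly into sorted order, largest first */
    for (s = 0; s < n; s++)
    {
        order[count[size[s] < m ? size[s] : m]++] = s;
    }
}
#endif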
/* Make a local or non-local pair-list, depending on iloc */
void nbnxn_make_pairlist(const nbnxn_search_t  nbs,
                         nbnxn_atomdata_t     *nbat,
                         const t_blocka       *excl,
                         real                  rlist,
                         int                   min_ci_balanced,
                         nbnxn_pairlist_set_t *nbl_list,
                         int                   iloc,
                         int                   nb_kernel_type,
                         t_nrnb               *nrnb)
{
    nbnxn_grid_t      *gridi, *gridj;
    gmx_bool           bGPUCPU;
    int                nzi, zi, zj0, zj1, zj;
    int                nsubpair_max;
    int                th;
    int                nnbl;
    nbnxn_pairlist_t **nbl;
    int                ci_block;
    gmx_bool           CombineNBLists;
    gmx_bool           progBal;
    int                np_tot, np_noq, np_hlj, nap;

    /* Check if we are running hybrid GPU + CPU nbnxn mode */
    bGPUCPU = (!nbs->grid[0].bSimple && nbl_list->bSimple);

    nnbl           = nbl_list->nnbl;
    nbl            = nbl_list->nbl;
    CombineNBLists = nbl_list->bCombined;

    if (debug)
    {
        fprintf(debug, "ns making %d nblists\n", nnbl);
    }

    nbat->bUseBufferFlags = (nbat->nout > 1);
    /* We should re-init the flags before making the first list */
    if (nbat->bUseBufferFlags && (LOCAL_I(iloc) || bGPUCPU))
    {
        init_buffer_flags(&nbat->buffer_flags, nbat->natoms);
    }

    if (nbl_list->bSimple)
    {
        switch (nb_kernel_type)
        {
#ifdef GMX_NBNXN_SIMD_4XN
            case nbnxnk4xN_SIMD_4xN:
                nbs->icell_set_x = icell_set_x_simd_4xn;
                break;
#endif
#ifdef GMX_NBNXN_SIMD_2XNN
            case nbnxnk4xN_SIMD_2xNN:
                nbs->icell_set_x = icell_set_x_simd_2xnn;
                break;
#endif
            default:
                nbs->icell_set_x = icell_set_x_simple;
                break;
        }
    }
    else
    {
#ifdef NBNXN_SEARCH_BB_SIMD4
        nbs->icell_set_x = icell_set_x_supersub_simd4;
#else
        nbs->icell_set_x = icell_set_x_supersub;
#endif
    }

    if (LOCAL_I(iloc))
    {
        /* Only zone (grid) 0 vs 0 */
        nzi = 1;
        zj0 = 0;
        zj1 = 1;
    }
    else
    {
        nzi = nbs->zones->nizone;
    }

    if (!nbl_list->bSimple && min_ci_balanced > 0)
    {
        nsubpair_max = get_nsubpair_max(nbs, iloc, rlist, min_ci_balanced);
    }
    else
    {
        nsubpair_max = 0;
    }

    /* Clear all pair-lists */
    for (th = 0; th < nnbl; th++)
    {
        clear_pairlist(nbl[th]);

        if (nbs->bFEP)
        {
            clear_pairlist_fep(nbl_list->nbl_fep[th]);
        }
    }

    for (zi = 0; zi < nzi; zi++)
    {
        gridi = &nbs->grid[zi];

        if (NONLOCAL_I(iloc))
        {
            zj0 = nbs->zones->izone[zi].j0;
            zj1 = nbs->zones->izone[zi].j1;
            if (zi == 0)
            {
                zj0++;
            }
        }
        for (zj = zj0; zj < zj1; zj++)
        {
            gridj = &nbs->grid[zj];

            if (debug)
            {
                fprintf(debug, "ns search grid %d vs %d\n", zi, zj);
            }

            nbs_cycle_start(&nbs->cc[enbsCCsearch]);

            if (nbl[0]->bSimple && !gridi->bSimple)
            {
                /* Hybrid list, determine blocking later */
                ci_block = 0;
            }
            else
            {
                ci_block = get_ci_block_size(gridi, nbs->DomDec, nnbl);
            }

            /* With GPU: generate progressively smaller lists for
             * load balancing for local only or non-local with 2 zones.
             */
            progBal = (LOCAL_I(iloc) || nbs->zones->n <= 2);
#pragma omp parallel for num_threads(nnbl) schedule(static)
            for (th = 0; th < nnbl; th++)
            {
                /* Re-init the thread-local work flag data before making
                 * the first list (not an elegant conditional).
                 */
                if (nbat->bUseBufferFlags && ((zi == 0 && zj == 0) ||
                                              (bGPUCPU && zi == 0 && zj == 1)))
                {
                    init_buffer_flags(&nbs->work[th].buffer_flags, nbat->natoms);
                }

                if (CombineNBLists && th > 0)
                {
                    clear_pairlist(nbl[th]);
                }

                /* Divide the i super cells equally over the nblists */
                nbnxn_make_pairlist_part(nbs, gridi, gridj,
                                         &nbs->work[th], nbat, excl,
                                         rlist,
                                         nb_kernel_type,
                                         ci_block,
                                         nbat->bUseBufferFlags,
                                         nsubpair_max,
                                         progBal, min_ci_balanced,
                                         th, nnbl,
                                         nbl[th],
                                         nbl_list->nbl_fep[th]);
            }
            nbs_cycle_stop(&nbs->cc[enbsCCsearch]);

            np_tot = 0;
            np_noq = 0;
            np_hlj = 0;
            for (th = 0; th < nnbl; th++)
            {
                inc_nrnb(nrnb, eNR_NBNXN_DIST2, nbs->work[th].ndistc);

                if (nbl_list->bSimple)
                {
                    np_tot += nbl[th]->ncj;
                    np_noq += nbl[th]->work->ncj_noq;
                    np_hlj += nbl[th]->work->ncj_hlj;
                }
                else
                {
                    /* This count ignores potential subsequent pair pruning */
                    np_tot += nbl[th]->nci_tot;
                }
            }
            nap                   = nbl[0]->na_ci*nbl[0]->na_cj;
            nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
            nbl_list->natpair_lj  = np_noq*nap;
            nbl_list->natpair_q   = np_hlj*nap/2;

            if (CombineNBLists && nnbl > 1)
            {
                nbs_cycle_start(&nbs->cc[enbsCCcombine]);

                combine_nblists(nnbl-1, nbl+1, nbl[0]);

                nbs_cycle_stop(&nbs->cc[enbsCCcombine]);
            }
        }
    }

    if (!nbl_list->bSimple)
    {
        /* Sort the entries on size, large ones first */
        if (CombineNBLists || nnbl == 1)
        {
            sort_sci(nbl[0]);
        }
        else
        {
#pragma omp parallel for num_threads(nnbl) schedule(static)
            for (th = 0; th < nnbl; th++)
            {
                sort_sci(nbl[th]);
            }
        }
    }

    if (nbat->bUseBufferFlags)
    {
        reduce_buffer_flags(nbs, nnbl, &nbat->buffer_flags);
    }

    if (nbs->bFEP)
    {
        /* Balance the free-energy lists over all the threads */
        balance_fep_lists(nbs, nbl_list);
    }

    /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
    if (LOCAL_I(iloc))
    {
        nbs->search_count++;
    }
    if (nbs->print_cycles &&
        (!nbs->DomDec || (nbs->DomDec && !LOCAL_I(iloc))) &&
        nbs->search_count % 100 == 0)
    {
        nbs_cycle_print(stderr, nbs);
    }

    if (debug && (CombineNBLists && nnbl > 1))
    {
        if (nbl[0]->bSimple)
        {
            print_nblist_statistics_simple(debug, nbl[0], nbs, rlist);
        }
        else
        {
            print_nblist_statistics_supersub(debug, nbl[0], nbs, rlist);
        }
    }

    if (debug)
    {
        if (gmx_debug_at)
        {
            if (nbl[0]->bSimple)
            {
                print_nblist_ci_cj(debug, nbl[0]);
            }
            else
            {
                print_nblist_sci_cj(debug, nbl[0]);
            }
        }

        if (nbat->bUseBufferFlags)
        {
            print_reduction_cost(&nbat->buffer_flags, nnbl);
        }
    }
}
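
/* Minimal sketch of the threading pattern above: each OpenMP thread
 * builds an independent per-thread result over a statically scheduled,
 * interleaved share of the i-range, and a serial reduction follows.
 * Self-contained illustration only, not GROMACS API.
 */
#if 0
#include <stdio.h>

int main(void)
{
    enum { NNBL = 4, NCI = 1000 };
    int npair[NNBL] = { 0 };
    int th, total = 0;

#pragma omp parallel for num_threads(NNBL) schedule(static)
    for (th = 0; th < NNBL; th++)
    {
        int ci;

        /* Thread th handles i-clusters th, th+NNBL, th+2*NNBL, ... */
        for (ci = th; ci < NCI; ci += NNBL)
        {
            npair[th]++; /* stand-in for appending to list th */
        }
    }
    /* Serial post-processing: combine the per-thread results */
    for (th = 0; th < NNBL; th++)
    {
        total += npair[th];
    }
    printf("total %d\n", total);

    return 0;
}
#endif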