1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustr
4 * This source code is part of
8 * GROningen MAchine for Chemical Simulations
10 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
11 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
12 * Copyright (c) 2001-2012, The GROMACS development team,
13 * check out http://www.gromacs.org for more information.
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License
17 * as published by the Free Software Foundation; either version 2
18 * of the License, or (at your option) any later version.
20 * If you want to redistribute modifications, please consider that
21 * scientific software is very special. Version control is crucial -
22 * bugs must be traceable. We will be happy to consider code for
23 * inclusion in the official distribution, but derived work must not
24 * be called official GROMACS. Details are found in the README & COPYING
25 * files - if they are missing, get the official version at www.gromacs.org.
27 * To help us fund GROMACS development, we humbly ask that you cite
28 * the papers on the package - you can find them in the top README file.
30 * For more info, check our website at http://www.gromacs.org
33 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
36 #ifndef _nbnxn_internal_h
37 #define _nsnxn_internal_h
41 #include "gmx_cyclecounter.h"
49 #define NBNXN_SEARCH_SSE
53 /* Block size for the non-bonded thread force-buffer reduction,
54 * should be a multiple of 2 in case of AVX256.
56 #define NBNXN_CELLBLOCK_SIZE 4
57 #define NBNXN_CELLBLOCK_SIZE_2LOG 2
59 /* We currently store the reduction flags as bits in an unsigned int.
60 * In most cases this limits the number of flags to 32.
61 * The reduction will automatically disable the flagging and do a full
62 * reduction when the flags won't fit, but this will lead to very slow
63 * reduction. As we anyhow don't expect reasonable performance with
64 * more than 32 threads, we put in this hard limit.
65 * You can increase this number, but the reduction will be very slow.
67 #define NBNXN_CELLBLOCK_MAX_THREADS 32
69 /* Flags for telling if threads write to force output buffers */
71 int ncb
; /* The number of cell blocks */
72 gmx_bool bUse
; /* Should we use these flags? */
73 unsigned *flag
; /* Bit i is set when thread i writes to a cell-block */
74 int flag_nalloc
; /* Allocation size of cxy_flag */
75 } nbnxn_cellblock_flags
;
77 /* A pair-search grid struct for one domain decomposition zone */
79 rvec c0
; /* The lower corner of the (local) grid */
80 rvec c1
; /* The upper corner of the (local) grid */
81 real atom_density
; /* The atom number density for the local grid */
83 gmx_bool bSimple
; /* Is this grid simple or super/sub */
84 int na_c
; /* Number of atoms per cluster */
85 int na_cj
; /* Number of atoms for list j-clusters */
86 int na_sc
; /* Number of atoms per super-cluster */
87 int na_c_2log
; /* 2log of na_c */
89 int ncx
; /* Number of (super-)cells along x */
90 int ncy
; /* Number of (super-)cells along y */
91 int nc
; /* Total number of (super-)cells */
93 real sx
; /* x-size of a (super-)cell */
94 real sy
; /* y-size of a (super-)cell */
95 real inv_sx
; /* 1/sx */
96 real inv_sy
; /* 1/sy */
98 int cell0
; /* Index in nbs->cell corresponding to cell 0 */
100 int *cxy_na
; /* The number of atoms for each column in x,y */
101 int *cxy_ind
; /* Grid (super)cell index, offset from cell0 */
102 int cxy_nalloc
; /* Allocation size for cxy_na and cxy_ind */
104 int *nsubc
; /* The number of sub cells for each super cell */
105 float *bbcz
; /* Bounding boxes in z for the super cells */
106 float *bb
; /* 3D bounding boxes for the sub cells */
107 float *bbj
; /* 3D j-b.boxes for SSE-double or AVX-single */
108 int *flags
; /* Flag for the super cells */
109 int nc_nalloc
; /* Allocation size for the pointers above */
111 float *bbcz_simple
; /* bbcz for simple grid converted from super */
112 float *bb_simple
; /* bb for simple grid converted from super */
113 int *flags_simple
; /* flags for simple grid converted from super */
114 int nc_nalloc_simple
; /* Allocation size for the pointers above */
116 nbnxn_cellblock_flags cellblock_flags
; /* Flags for F output buffers */
118 int nsubc_tot
; /* Total number of subcell, used for printing */
121 #ifdef NBNXN_SEARCH_SSE
122 #define GMX_MM128_HERE
123 #include "gmx_x86_simd_macros.h"
124 typedef struct nbnxn_x_ci_x86_simd128
{
125 /* The i-cluster coordinates for simple search */
126 gmx_mm_pr ix_SSE0
,iy_SSE0
,iz_SSE0
;
127 gmx_mm_pr ix_SSE1
,iy_SSE1
,iz_SSE1
;
128 gmx_mm_pr ix_SSE2
,iy_SSE2
,iz_SSE2
;
129 gmx_mm_pr ix_SSE3
,iy_SSE3
,iz_SSE3
;
130 } nbnxn_x_ci_x86_simd128_t
;
131 #undef GMX_MM128_HERE
132 #ifdef GMX_X86_AVX_256
133 #define GMX_MM256_HERE
134 #include "gmx_x86_simd_macros.h"
135 typedef struct nbnxn_x_ci_x86_simd256
{
136 /* The i-cluster coordinates for simple search */
137 gmx_mm_pr ix_SSE0
,iy_SSE0
,iz_SSE0
;
138 gmx_mm_pr ix_SSE1
,iy_SSE1
,iz_SSE1
;
139 gmx_mm_pr ix_SSE2
,iy_SSE2
,iz_SSE2
;
140 gmx_mm_pr ix_SSE3
,iy_SSE3
,iz_SSE3
;
141 } nbnxn_x_ci_x86_simd256_t
;
142 #undef GMX_MM256_HERE
146 /* Working data for the actual i-supercell during pair search */
147 typedef struct nbnxn_list_work
{
148 gmx_cache_protect_t cp0
; /* Protect cache between threads */
150 float *bb_ci
; /* The bounding boxes, pbc shifted, for each cluster */
151 real
*x_ci
; /* The coordinates, pbc shifted, for each atom */
152 #ifdef NBNXN_SEARCH_SSE
153 nbnxn_x_ci_x86_simd128_t
*x_ci_x86_simd128
;
154 #ifdef GMX_X86_AVX_256
155 nbnxn_x_ci_x86_simd256_t
*x_ci_x86_simd256
;
158 int cj_ind
; /* The current cj_ind index for the current list */
159 int cj4_init
; /* The first unitialized cj4 block */
161 float *d2
; /* Bounding box distance work array */
163 nbnxn_cj_t
*cj
; /* The j-cell list */
164 int cj_nalloc
; /* Allocation size of cj */
166 int ncj_noq
; /* Nr. of cluster pairs without Coul for flop count */
167 int ncj_hlj
; /* Nr. of cluster pairs with 1/2 LJ for flop count */
169 gmx_cache_protect_t cp1
; /* Protect cache between threads */
172 /* Function type for setting the i-atom coordinate working data */
174 gmx_icell_set_x_t(int ci
,
175 real shx
,real shy
,real shz
,
177 int stride
,const real
*x
,
178 nbnxn_list_work_t
*work
);
180 static gmx_icell_set_x_t icell_set_x_simple
;
181 #ifdef NBNXN_SEARCH_SSE
182 static gmx_icell_set_x_t icell_set_x_simple_x86_simd128
;
183 #ifdef GMX_X86_AVX_256
184 static gmx_icell_set_x_t icell_set_x_simple_x86_simd256
;
187 static gmx_icell_set_x_t icell_set_x_supersub
;
188 #ifdef NBNXN_SEARCH_SSE
189 static gmx_icell_set_x_t icell_set_x_supersub_sse8
;
192 /* Local cycle count struct for profiling */
199 /* Local cycle count enum for profiling */
200 enum { enbsCCgrid
, enbsCCsearch
, enbsCCcombine
, enbsCCreducef
, enbsCCnr
};
202 /* Thread-local work struct, contains part of nbnxn_grid_t */
204 gmx_cache_protect_t cp0
;
210 int sort_work_nalloc
;
212 nbnxn_cellblock_flags gridi_flags
; /* Flags for i-grid f buffer */
213 nbnxn_cellblock_flags gridj_flags
; /* Flags for j-grid f buffer */
215 int ndistc
; /* Number of distance checks for flop counting */
217 nbnxn_cycle_t cc
[enbsCCnr
];
219 gmx_cache_protect_t cp1
;
220 } nbnxn_search_work_t
;
222 /* Main pair-search struct, contains the grid(s), not the pair-list(s) */
223 typedef struct nbnxn_search
{
224 int ePBC
; /* PBC type enum */
225 matrix box
; /* The periodic unit-cell */
227 gmx_bool DomDec
; /* Are we doing domain decomposition? */
228 ivec dd_dim
; /* Are we doing DD in x,y,z? */
229 gmx_domdec_zones_t
*zones
; /* The domain decomposition zones */
231 int ngrid
; /* The number of grids, equal to #DD-zones */
232 nbnxn_grid_t
*grid
; /* Array of grids, size ngrid */
233 int *cell
; /* Actual allocated cell array for all grids */
234 int cell_nalloc
; /* Allocation size of cell */
235 int *a
; /* Atom index for grid, the inverse of cell */
236 int a_nalloc
; /* Allocation size of a */
238 int natoms_local
; /* The local atoms run from 0 to natoms_local */
239 int natoms_nonlocal
; /* The non-local atoms run from natoms_local
240 * to natoms_nonlocal */
242 gmx_bool print_cycles
;
244 nbnxn_cycle_t cc
[enbsCCnr
];
246 gmx_icell_set_x_t
*icell_set_x
; /* Function for setting i-coords */
248 int nthread_max
; /* Maximum number of threads for pair-search */
249 nbnxn_search_work_t
*work
; /* Work array, size nthread_max */
253 static void nbs_cycle_start(nbnxn_cycle_t
*cc
)
255 cc
->start
= gmx_cycles_read();
258 static void nbs_cycle_stop(nbnxn_cycle_t
*cc
)
260 cc
->c
+= gmx_cycles_read() - cc
->start
;