2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
38 #include "nbnxn_atomdata.h"
48 #include "thread_mpi/atomic.h"
50 #include "gromacs/math/functions.h"
51 #include "gromacs/math/utilities.h"
52 #include "gromacs/math/vec.h"
53 #include "gromacs/mdlib/gmx_omp_nthreads.h"
54 #include "gromacs/mdlib/nb_verlet.h"
55 #include "gromacs/mdlib/nbnxn_consts.h"
56 #include "gromacs/mdlib/nbnxn_internal.h"
57 #include "gromacs/mdlib/nbnxn_search.h"
58 #include "gromacs/mdlib/nbnxn_util.h"
59 #include "gromacs/mdtypes/forcerec.h" // only for GET_CGINFO_*
60 #include "gromacs/mdtypes/mdatom.h"
61 #include "gromacs/pbcutil/ishift.h"
62 #include "gromacs/simd/simd.h"
63 #include "gromacs/timing/wallcycle.h"
64 #include "gromacs/utility/exceptions.h"
65 #include "gromacs/utility/fatalerror.h"
66 #include "gromacs/utility/gmxomp.h"
67 #include "gromacs/utility/logger.h"
68 #include "gromacs/utility/smalloc.h"
69 #include "gromacs/utility/stringutil.h"
71 using namespace gmx
; // TODO: Remove when this file is moved into gmx namespace
73 /* Default nbnxn allocation routine, allocates NBNXN_MEM_ALIGN byte aligned */
74 void nbnxn_alloc_aligned(void **ptr
, size_t nbytes
)
76 *ptr
= save_malloc_aligned("ptr", __FILE__
, __LINE__
, nbytes
, 1, NBNXN_MEM_ALIGN
);
79 /* Free function for memory allocated with nbnxn_alloc_aligned */
80 void nbnxn_free_aligned(void *ptr
)
85 /* Reallocation wrapper function for nbnxn data structures */
86 void nbnxn_realloc_void(void **ptr
,
87 int nbytes_copy
, int nbytes_new
,
93 ma(&ptr_new
, nbytes_new
);
95 if (nbytes_new
> 0 && ptr_new
== nullptr)
97 gmx_fatal(FARGS
, "Allocation of %d bytes failed", nbytes_new
);
102 if (nbytes_new
< nbytes_copy
)
104 gmx_incons("In nbnxn_realloc_void: new size less than copy size");
106 memcpy(ptr_new
, *ptr
, nbytes_copy
);
115 /* Reallocate the nbnxn_atomdata_t for a size of n atoms */
116 void nbnxn_atomdata_realloc(nbnxn_atomdata_t
*nbat
, int n
)
118 GMX_ASSERT(nbat
->nalloc
>= nbat
->natoms
, "We should have at least as many elelements allocated as there are set");
122 nbnxn_realloc_void((void **)&nbat
->type
,
123 nbat
->natoms
*sizeof(*nbat
->type
),
124 n
*sizeof(*nbat
->type
),
125 nbat
->alloc
, nbat
->free
);
126 nbnxn_realloc_void((void **)&nbat
->lj_comb
,
127 nbat
->natoms
*2*sizeof(*nbat
->lj_comb
),
128 n
*2*sizeof(*nbat
->lj_comb
),
129 nbat
->alloc
, nbat
->free
);
130 if (nbat
->XFormat
!= nbatXYZQ
)
132 nbnxn_realloc_void((void **)&nbat
->q
,
133 nbat
->natoms
*sizeof(*nbat
->q
),
135 nbat
->alloc
, nbat
->free
);
137 if (nbat
->nenergrp
> 1)
139 nbnxn_realloc_void((void **)&nbat
->energrp
,
140 nbat
->natoms
/nbat
->na_c
*sizeof(*nbat
->energrp
),
141 n
/nbat
->na_c
*sizeof(*nbat
->energrp
),
142 nbat
->alloc
, nbat
->free
);
144 nbnxn_realloc_void((void **)&nbat
->x
,
145 nbat
->natoms
*nbat
->xstride
*sizeof(*nbat
->x
),
146 n
*nbat
->xstride
*sizeof(*nbat
->x
),
147 nbat
->alloc
, nbat
->free
);
148 for (t
= 0; t
< nbat
->nout
; t
++)
150 /* Allocate one element extra for possible signaling with GPUs */
151 nbnxn_realloc_void((void **)&nbat
->out
[t
].f
,
152 nbat
->natoms
*nbat
->fstride
*sizeof(*nbat
->out
[t
].f
),
153 n
*nbat
->fstride
*sizeof(*nbat
->out
[t
].f
),
154 nbat
->alloc
, nbat
->free
);
159 /* Initializes an nbnxn_atomdata_output_t data structure */
160 static void nbnxn_atomdata_output_init(nbnxn_atomdata_output_t
*out
,
162 int nenergrp
, int stride
,
166 ma((void **)&out
->fshift
, SHIFTS
*DIM
*sizeof(*out
->fshift
));
167 out
->nV
= nenergrp
*nenergrp
;
168 ma((void **)&out
->Vvdw
, out
->nV
*sizeof(*out
->Vvdw
));
169 ma((void **)&out
->Vc
, out
->nV
*sizeof(*out
->Vc
));
171 if (nb_kernel_type
== nbnxnk4xN_SIMD_4xN
||
172 nb_kernel_type
== nbnxnk4xN_SIMD_2xNN
)
174 int cj_size
= nbnxn_kernel_to_cluster_j_size(nb_kernel_type
);
175 out
->nVS
= nenergrp
*nenergrp
*stride
*(cj_size
>>1)*cj_size
;
176 ma((void **)&out
->VSvdw
, out
->nVS
*sizeof(*out
->VSvdw
));
177 ma((void **)&out
->VSc
, out
->nVS
*sizeof(*out
->VSc
));
185 static void copy_int_to_nbat_int(const int *a
, int na
, int na_round
,
186 const int *in
, int fill
, int *innb
)
191 for (i
= 0; i
< na
; i
++)
193 innb
[j
++] = in
[a
[i
]];
195 /* Complete the partially filled last cell with fill */
196 for (; i
< na_round
; i
++)
202 void copy_rvec_to_nbat_real(const int *a
, int na
, int na_round
,
203 const rvec
*x
, int nbatFormat
,
206 /* We complete partially filled cells, can only be the last one in each
207 * column, with coordinates farAway. The actual coordinate value does
208 * not influence the results, since these filler particles do not interact.
209 * Clusters with normal atoms + fillers have a bounding box based only
210 * on the coordinates of the atoms. Clusters with only fillers have as
211 * the bounding box the coordinates of the first filler. Such clusters
212 * are not considered as i-entries, but they are considered as j-entries.
213 * So for performance it is better to have their bounding boxes far away,
214 * such that filler only clusters don't end up in the pair list.
216 const real farAway
= -1000000;
224 for (i
= 0; i
< na
; i
++)
226 xnb
[j
++] = x
[a
[i
]][XX
];
227 xnb
[j
++] = x
[a
[i
]][YY
];
228 xnb
[j
++] = x
[a
[i
]][ZZ
];
230 /* Complete the partially filled last cell with farAway elements */
231 for (; i
< na_round
; i
++)
240 for (i
= 0; i
< na
; i
++)
242 xnb
[j
++] = x
[a
[i
]][XX
];
243 xnb
[j
++] = x
[a
[i
]][YY
];
244 xnb
[j
++] = x
[a
[i
]][ZZ
];
247 /* Complete the partially filled last cell with zeros */
248 for (; i
< na_round
; i
++)
257 j
= atom_to_x_index
<c_packX4
>(a0
);
258 c
= a0
& (c_packX4
-1);
259 for (i
= 0; i
< na
; i
++)
261 xnb
[j
+XX
*c_packX4
] = x
[a
[i
]][XX
];
262 xnb
[j
+YY
*c_packX4
] = x
[a
[i
]][YY
];
263 xnb
[j
+ZZ
*c_packX4
] = x
[a
[i
]][ZZ
];
268 j
+= (DIM
-1)*c_packX4
;
272 /* Complete the partially filled last cell with zeros */
273 for (; i
< na_round
; i
++)
275 xnb
[j
+XX
*c_packX4
] = farAway
;
276 xnb
[j
+YY
*c_packX4
] = farAway
;
277 xnb
[j
+ZZ
*c_packX4
] = farAway
;
282 j
+= (DIM
-1)*c_packX4
;
288 j
= atom_to_x_index
<c_packX8
>(a0
);
289 c
= a0
& (c_packX8
- 1);
290 for (i
= 0; i
< na
; i
++)
292 xnb
[j
+XX
*c_packX8
] = x
[a
[i
]][XX
];
293 xnb
[j
+YY
*c_packX8
] = x
[a
[i
]][YY
];
294 xnb
[j
+ZZ
*c_packX8
] = x
[a
[i
]][ZZ
];
299 j
+= (DIM
-1)*c_packX8
;
303 /* Complete the partially filled last cell with zeros */
304 for (; i
< na_round
; i
++)
306 xnb
[j
+XX
*c_packX8
] = farAway
;
307 xnb
[j
+YY
*c_packX8
] = farAway
;
308 xnb
[j
+ZZ
*c_packX8
] = farAway
;
313 j
+= (DIM
-1)*c_packX8
;
319 gmx_incons("Unsupported nbnxn_atomdata_t format");
323 /* Stores the LJ parameter data in a format convenient for different kernels */
324 static void set_lj_parameter_data(nbnxn_atomdata_t
*nbat
, gmx_bool bSIMD
)
328 int nt
= nbat
->ntype
;
333 /* nbfp_aligned stores two parameters using the stride most suitable
334 * for the present SIMD architecture, as specified by the constant
335 * c_simdBestPairAlignment from the SIMD header.
336 * There's a slight inefficiency in allocating and initializing nbfp_aligned
337 * when it might not be used, but introducing the conditional code is not
340 nbat
->alloc((void **)&nbat
->nbfp_aligned
,
341 nt
*nt
*c_simdBestPairAlignment
*sizeof(*nbat
->nbfp_aligned
));
342 for (int i
= 0; i
< nt
; i
++)
344 for (int j
= 0; j
< nt
; j
++)
346 nbat
->nbfp_aligned
[(i
*nt
+j
)*c_simdBestPairAlignment
+0] = nbat
->nbfp
[(i
*nt
+j
)*2+0];
347 nbat
->nbfp_aligned
[(i
*nt
+j
)*c_simdBestPairAlignment
+1] = nbat
->nbfp
[(i
*nt
+j
)*2+1];
348 nbat
->nbfp_aligned
[(i
*nt
+j
)*c_simdBestPairAlignment
+2] = 0;
349 nbat
->nbfp_aligned
[(i
*nt
+j
)*c_simdBestPairAlignment
+3] = 0;
355 /* We use combination rule data for SIMD combination rule kernels
356 * and with LJ-PME kernels. We then only need parameters per atom type,
357 * not per pair of atom types.
359 switch (nbat
->comb_rule
)
362 nbat
->comb_rule
= ljcrGEOM
;
364 for (int i
= 0; i
< nt
; i
++)
366 /* Store the sqrt of the diagonal from the nbfp matrix */
367 nbat
->nbfp_comb
[i
*2 ] = std::sqrt(nbat
->nbfp
[(i
*nt
+i
)*2 ]);
368 nbat
->nbfp_comb
[i
*2+1] = std::sqrt(nbat
->nbfp
[(i
*nt
+i
)*2+1]);
372 for (int i
= 0; i
< nt
; i
++)
374 /* Get 6*C6 and 12*C12 from the diagonal of the nbfp matrix */
375 c6
= nbat
->nbfp
[(i
*nt
+i
)*2 ];
376 c12
= nbat
->nbfp
[(i
*nt
+i
)*2+1];
377 if (c6
> 0 && c12
> 0)
379 /* We store 0.5*2^1/6*sigma and sqrt(4*3*eps),
380 * so we get 6*C6 and 12*C12 after combining.
382 nbat
->nbfp_comb
[i
*2 ] = 0.5*gmx::sixthroot(c12
/c6
);
383 nbat
->nbfp_comb
[i
*2+1] = std::sqrt(c6
*c6
/c12
);
387 nbat
->nbfp_comb
[i
*2 ] = 0;
388 nbat
->nbfp_comb
[i
*2+1] = 0;
393 /* We always store the full matrix (see code above) */
396 gmx_incons("Unknown combination rule");
402 nbnxn_atomdata_init_simple_exclusion_masks(nbnxn_atomdata_t
*nbat
)
404 const int simd_width
= GMX_SIMD_REAL_WIDTH
;
406 /* Set the diagonal cluster pair exclusion mask setup data.
407 * In the kernel we check 0 < j - i to generate the masks.
408 * Here we store j - i for generating the mask for the first i,
409 * we substract 0.5 to avoid rounding issues.
410 * In the kernel we can subtract 1 to generate the subsequent mask.
412 int simd_4xn_diag_size
;
414 simd_4xn_diag_size
= std::max(NBNXN_CPU_CLUSTER_I_SIZE
, simd_width
);
415 snew_aligned(nbat
->simd_4xn_diagonal_j_minus_i
, simd_4xn_diag_size
, NBNXN_MEM_ALIGN
);
416 for (int j
= 0; j
< simd_4xn_diag_size
; j
++)
418 nbat
->simd_4xn_diagonal_j_minus_i
[j
] = j
- 0.5;
421 snew_aligned(nbat
->simd_2xnn_diagonal_j_minus_i
, simd_width
, NBNXN_MEM_ALIGN
);
422 for (int j
= 0; j
< simd_width
/2; j
++)
424 /* The j-cluster size is half the SIMD width */
425 nbat
->simd_2xnn_diagonal_j_minus_i
[j
] = j
- 0.5;
426 /* The next half of the SIMD width is for i + 1 */
427 nbat
->simd_2xnn_diagonal_j_minus_i
[simd_width
/2+j
] = j
- 1 - 0.5;
430 /* We use up to 32 bits for exclusion masking.
431 * The same masks are used for the 4xN and 2x(N+N) kernels.
432 * The masks are read either into integer SIMD registers or into
433 * real SIMD registers (together with a cast).
434 * In single precision this means the real and integer SIMD registers
437 simd_excl_size
= NBNXN_CPU_CLUSTER_I_SIZE
*simd_width
;
438 #if GMX_DOUBLE && !GMX_SIMD_HAVE_INT32_LOGICAL
439 snew_aligned(nbat
->simd_exclusion_filter64
, simd_excl_size
, NBNXN_MEM_ALIGN
);
441 snew_aligned(nbat
->simd_exclusion_filter
, simd_excl_size
, NBNXN_MEM_ALIGN
);
444 for (int j
= 0; j
< simd_excl_size
; j
++)
446 /* Set the consecutive bits for masking pair exclusions */
447 #if GMX_DOUBLE && !GMX_SIMD_HAVE_INT32_LOGICAL
448 nbat
->simd_exclusion_filter64
[j
] = (1U << j
);
450 nbat
->simd_exclusion_filter
[j
] = (1U << j
);
454 #if !GMX_SIMD_HAVE_LOGICAL && !GMX_SIMD_HAVE_INT32_LOGICAL
455 // If the SIMD implementation has no bitwise logical operation support
456 // whatsoever we cannot use the normal masking. Instead,
457 // we generate a vector of all 2^4 possible ways an i atom
458 // interacts with its 4 j atoms. Each array entry contains
459 // GMX_SIMD_REAL_WIDTH values that are read with a single aligned SIMD load.
460 // Since there is no logical value representation in this case, we use
461 // any nonzero value to indicate 'true', while zero mean 'false'.
462 // This can then be converted to a SIMD boolean internally in the SIMD
463 // module by comparing to zero.
464 // Each array entry encodes how this i atom will interact with the 4 j atoms.
465 // Matching code exists in set_ci_top_excls() to generate indices into this array.
466 // Those indices are used in the kernels.
468 simd_excl_size
= NBNXN_CPU_CLUSTER_I_SIZE
*NBNXN_CPU_CLUSTER_I_SIZE
;
469 const real simdFalse
= 0.0;
470 const real simdTrue
= 1.0;
471 real
*simd_interaction_array
;
473 snew_aligned(simd_interaction_array
, simd_excl_size
* GMX_SIMD_REAL_WIDTH
, NBNXN_MEM_ALIGN
);
474 for (int j
= 0; j
< simd_excl_size
; j
++)
476 int index
= j
* GMX_SIMD_REAL_WIDTH
;
477 for (int i
= 0; i
< GMX_SIMD_REAL_WIDTH
; i
++)
479 simd_interaction_array
[index
+ i
] = (j
& (1 << i
)) ? simdTrue
: simdFalse
;
482 nbat
->simd_interaction_array
= simd_interaction_array
;
487 /* Initializes an nbnxn_atomdata_t data structure */
488 void nbnxn_atomdata_init(const gmx::MDLogger
&mdlog
,
489 nbnxn_atomdata_t
*nbat
,
491 int enbnxninitcombrule
,
492 int ntype
, const real
*nbfp
,
495 nbnxn_alloc_t
*alloc
,
501 gmx_bool simple
, bCombGeom
, bCombLB
, bSIMD
;
503 if (alloc
== nullptr)
505 nbat
->alloc
= nbnxn_alloc_aligned
;
513 nbat
->free
= nbnxn_free_aligned
;
522 fprintf(debug
, "There are %d atom types in the system, adding one for nbnxn_atomdata_t\n", ntype
);
524 nbat
->ntype
= ntype
+ 1;
525 nbat
->alloc((void **)&nbat
->nbfp
,
526 nbat
->ntype
*nbat
->ntype
*2*sizeof(*nbat
->nbfp
));
527 nbat
->alloc((void **)&nbat
->nbfp_comb
, nbat
->ntype
*2*sizeof(*nbat
->nbfp_comb
));
529 /* A tolerance of 1e-5 seems reasonable for (possibly hand-typed)
530 * force-field floating point parameters.
533 ptr
= getenv("GMX_LJCOMB_TOL");
538 sscanf(ptr
, "%lf", &dbl
);
544 /* Temporarily fill nbat->nbfp_comb with sigma and epsilon
545 * to check for the LB rule.
547 for (int i
= 0; i
< ntype
; i
++)
549 c6
= nbfp
[(i
*ntype
+i
)*2 ]/6.0;
550 c12
= nbfp
[(i
*ntype
+i
)*2+1]/12.0;
551 if (c6
> 0 && c12
> 0)
553 nbat
->nbfp_comb
[i
*2 ] = gmx::sixthroot(c12
/c6
);
554 nbat
->nbfp_comb
[i
*2+1] = 0.25*c6
*c6
/c12
;
556 else if (c6
== 0 && c12
== 0)
558 nbat
->nbfp_comb
[i
*2 ] = 0;
559 nbat
->nbfp_comb
[i
*2+1] = 0;
563 /* Can not use LB rule with only dispersion or repulsion */
568 for (int i
= 0; i
< nbat
->ntype
; i
++)
570 for (int j
= 0; j
< nbat
->ntype
; j
++)
572 if (i
< ntype
&& j
< ntype
)
574 /* fr->nbfp has been updated, so that array too now stores c6/c12 including
575 * the 6.0/12.0 prefactors to save 2 flops in the most common case (force-only).
577 c6
= nbfp
[(i
*ntype
+j
)*2 ];
578 c12
= nbfp
[(i
*ntype
+j
)*2+1];
579 nbat
->nbfp
[(i
*nbat
->ntype
+j
)*2 ] = c6
;
580 nbat
->nbfp
[(i
*nbat
->ntype
+j
)*2+1] = c12
;
582 /* Compare 6*C6 and 12*C12 for geometric cobination rule */
583 bCombGeom
= bCombGeom
&&
584 gmx_within_tol(c6
*c6
, nbfp
[(i
*ntype
+i
)*2 ]*nbfp
[(j
*ntype
+j
)*2 ], tol
) &&
585 gmx_within_tol(c12
*c12
, nbfp
[(i
*ntype
+i
)*2+1]*nbfp
[(j
*ntype
+j
)*2+1], tol
);
587 /* Compare C6 and C12 for Lorentz-Berthelot combination rule */
591 ((c6
== 0 && c12
== 0 &&
592 (nbat
->nbfp_comb
[i
*2+1] == 0 || nbat
->nbfp_comb
[j
*2+1] == 0)) ||
593 (c6
> 0 && c12
> 0 &&
594 gmx_within_tol(gmx::sixthroot(c12
/c6
),
595 0.5*(nbat
->nbfp_comb
[i
*2]+nbat
->nbfp_comb
[j
*2]), tol
) &&
596 gmx_within_tol(0.25*c6
*c6
/c12
, std::sqrt(nbat
->nbfp_comb
[i
*2+1]*nbat
->nbfp_comb
[j
*2+1]), tol
)));
600 /* Add zero parameters for the additional dummy atom type */
601 nbat
->nbfp
[(i
*nbat
->ntype
+j
)*2 ] = 0;
602 nbat
->nbfp
[(i
*nbat
->ntype
+j
)*2+1] = 0;
608 fprintf(debug
, "Combination rules: geometric %d Lorentz-Berthelot %d\n",
612 simple
= nbnxn_kernel_pairlist_simple(nb_kernel_type
);
614 switch (enbnxninitcombrule
)
616 case enbnxninitcombruleDETECT
:
617 /* We prefer the geometic combination rule,
618 * as that gives a slightly faster kernel than the LB rule.
622 nbat
->comb_rule
= ljcrGEOM
;
626 nbat
->comb_rule
= ljcrLB
;
630 nbat
->comb_rule
= ljcrNONE
;
632 nbat
->free(nbat
->nbfp_comb
);
637 if (nbat
->comb_rule
== ljcrNONE
)
639 mesg
= "Using full Lennard-Jones parameter combination matrix";
643 mesg
= gmx::formatString("Using %s Lennard-Jones combination rule",
644 nbat
->comb_rule
== ljcrGEOM
? "geometric" : "Lorentz-Berthelot");
646 GMX_LOG(mdlog
.info
).asParagraph().appendText(mesg
);
649 case enbnxninitcombruleGEOM
:
650 nbat
->comb_rule
= ljcrGEOM
;
652 case enbnxninitcombruleLB
:
653 nbat
->comb_rule
= ljcrLB
;
655 case enbnxninitcombruleNONE
:
656 nbat
->comb_rule
= ljcrNONE
;
658 nbat
->free(nbat
->nbfp_comb
);
661 gmx_incons("Unknown enbnxninitcombrule");
664 bSIMD
= (nb_kernel_type
== nbnxnk4xN_SIMD_4xN
||
665 nb_kernel_type
== nbnxnk4xN_SIMD_2xNN
);
667 set_lj_parameter_data(nbat
, bSIMD
);
670 nbat
->type
= nullptr;
671 nbat
->lj_comb
= nullptr;
678 pack_x
= std::max(NBNXN_CPU_CLUSTER_I_SIZE
,
679 nbnxn_kernel_to_cluster_j_size(nb_kernel_type
));
683 nbat
->XFormat
= nbatX4
;
686 nbat
->XFormat
= nbatX8
;
689 gmx_incons("Unsupported packing width");
694 nbat
->XFormat
= nbatXYZ
;
697 nbat
->FFormat
= nbat
->XFormat
;
701 nbat
->XFormat
= nbatXYZQ
;
702 nbat
->FFormat
= nbatXYZ
;
705 nbat
->nenergrp
= n_energygroups
;
708 // We now check for energy groups already when starting mdrun
709 GMX_RELEASE_ASSERT(n_energygroups
== 1, "GPU kernels do not support energy groups");
711 /* Temporary storage goes as #grp^3*simd_width^2/2, so limit to 64 */
712 if (nbat
->nenergrp
> 64)
714 gmx_fatal(FARGS
, "With NxN kernels not more than 64 energy groups are supported\n");
717 while (nbat
->nenergrp
> (1<<nbat
->neg_2log
))
721 nbat
->energrp
= nullptr;
722 nbat
->alloc((void **)&nbat
->shift_vec
, SHIFTS
*sizeof(*nbat
->shift_vec
));
723 nbat
->xstride
= (nbat
->XFormat
== nbatXYZQ
? STRIDE_XYZQ
: DIM
);
724 nbat
->fstride
= (nbat
->FFormat
== nbatXYZQ
? STRIDE_XYZQ
: DIM
);
730 nbnxn_atomdata_init_simple_exclusion_masks(nbat
);
734 /* Initialize the output data structures */
736 snew(nbat
->out
, nbat
->nout
);
738 for (int i
= 0; i
< nbat
->nout
; i
++)
740 nbnxn_atomdata_output_init(&nbat
->out
[i
],
742 nbat
->nenergrp
, 1<<nbat
->neg_2log
,
745 nbat
->buffer_flags
.flag
= nullptr;
746 nbat
->buffer_flags
.flag_nalloc
= 0;
748 nth
= gmx_omp_nthreads_get(emntNonbonded
);
750 ptr
= getenv("GMX_USE_TREEREDUCE");
753 nbat
->bUseTreeReduce
= strtol(ptr
, nullptr, 10);
756 else if (nth
> 8) /*on the CPU we currently don't benefit even at 32*/
758 nbat
->bUseTreeReduce
= 1;
763 nbat
->bUseTreeReduce
= 0;
765 if (nbat
->bUseTreeReduce
)
767 GMX_LOG(mdlog
.info
).asParagraph().appendText("Using tree force reduction");
769 snew(nbat
->syncStep
, nth
);
773 template<int packSize
>
774 static void copy_lj_to_nbat_lj_comb(const real
*ljparam_type
,
775 const int *type
, int na
,
778 /* The LJ params follow the combination rule:
779 * copy the params for the type array to the atom array.
781 for (int is
= 0; is
< na
; is
+= packSize
)
783 for (int k
= 0; k
< packSize
; k
++)
786 ljparam_at
[is
*2 + k
] = ljparam_type
[type
[i
]*2 ];
787 ljparam_at
[is
*2 + packSize
+ k
] = ljparam_type
[type
[i
]*2 + 1];
792 /* Sets the atom type in nbnxn_atomdata_t */
793 static void nbnxn_atomdata_set_atomtypes(nbnxn_atomdata_t
*nbat
,
794 const nbnxn_search
*nbs
,
797 for (const nbnxn_grid_t
&grid
: nbs
->grid
)
799 /* Loop over all columns and copy and fill */
800 for (int i
= 0; i
< grid
.numCells
[XX
]*grid
.numCells
[YY
]; i
++)
802 int ncz
= grid
.cxy_ind
[i
+1] - grid
.cxy_ind
[i
];
803 int ash
= (grid
.cell0
+ grid
.cxy_ind
[i
])*grid
.na_sc
;
805 copy_int_to_nbat_int(nbs
->a
.data() + ash
, grid
.cxy_na
[i
], ncz
*grid
.na_sc
,
806 type
, nbat
->ntype
-1, nbat
->type
+ash
);
811 /* Sets the LJ combination rule parameters in nbnxn_atomdata_t */
812 static void nbnxn_atomdata_set_ljcombparams(nbnxn_atomdata_t
*nbat
,
813 const nbnxn_search
*nbs
)
815 if (nbat
->comb_rule
!= ljcrNONE
)
817 for (const nbnxn_grid_t
&grid
: nbs
->grid
)
819 /* Loop over all columns and copy and fill */
820 for (int i
= 0; i
< grid
.numCells
[XX
]*grid
.numCells
[YY
]; i
++)
822 int ncz
= grid
.cxy_ind
[i
+1] - grid
.cxy_ind
[i
];
823 int ash
= (grid
.cell0
+ grid
.cxy_ind
[i
])*grid
.na_sc
;
825 if (nbat
->XFormat
== nbatX4
)
827 copy_lj_to_nbat_lj_comb
<c_packX4
>(nbat
->nbfp_comb
,
830 nbat
->lj_comb
+ ash
*2);
832 else if (nbat
->XFormat
== nbatX8
)
834 copy_lj_to_nbat_lj_comb
<c_packX8
>(nbat
->nbfp_comb
,
837 nbat
->lj_comb
+ ash
*2);
839 else if (nbat
->XFormat
== nbatXYZQ
)
841 copy_lj_to_nbat_lj_comb
<1>(nbat
->nbfp_comb
,
844 nbat
->lj_comb
+ ash
*2);
851 /* Sets the charges in nbnxn_atomdata_t *nbat */
852 static void nbnxn_atomdata_set_charges(nbnxn_atomdata_t
*nbat
,
853 const nbnxn_search
*nbs
,
856 for (const nbnxn_grid_t
&grid
: nbs
->grid
)
858 /* Loop over all columns and copy and fill */
859 for (int cxy
= 0; cxy
< grid
.numCells
[XX
]*grid
.numCells
[YY
]; cxy
++)
861 int ash
= (grid
.cell0
+ grid
.cxy_ind
[cxy
])*grid
.na_sc
;
862 int na
= grid
.cxy_na
[cxy
];
863 int na_round
= (grid
.cxy_ind
[cxy
+1] - grid
.cxy_ind
[cxy
])*grid
.na_sc
;
865 if (nbat
->XFormat
== nbatXYZQ
)
867 real
*q
= nbat
->x
+ ash
*STRIDE_XYZQ
+ ZZ
+ 1;
869 for (i
= 0; i
< na
; i
++)
871 *q
= charge
[nbs
->a
[ash
+i
]];
874 /* Complete the partially filled last cell with zeros */
875 for (; i
< na_round
; i
++)
883 real
*q
= nbat
->q
+ ash
;
885 for (i
= 0; i
< na
; i
++)
887 *q
= charge
[nbs
->a
[ash
+i
]];
890 /* Complete the partially filled last cell with zeros */
891 for (; i
< na_round
; i
++)
901 /* Set the charges of perturbed atoms in nbnxn_atomdata_t to 0.
902 * This is to automatically remove the RF/PME self term in the nbnxn kernels.
903 * Part of the zero interactions are still calculated in the normal kernels.
904 * All perturbed interactions are calculated in the free energy kernel,
905 * using the original charge and LJ data, not nbnxn_atomdata_t.
907 static void nbnxn_atomdata_mask_fep(nbnxn_atomdata_t
*nbat
,
908 const nbnxn_search
*nbs
)
913 if (nbat
->XFormat
== nbatXYZQ
)
915 q
= nbat
->x
+ ZZ
+ 1;
916 stride_q
= STRIDE_XYZQ
;
924 for (const nbnxn_grid_t
&grid
: nbs
->grid
)
932 nsubc
= c_gpuNumClusterPerCell
;
935 int c_offset
= grid
.cell0
*grid
.na_sc
;
937 /* Loop over all columns and copy and fill */
938 for (int c
= 0; c
< grid
.nc
*nsubc
; c
++)
940 /* Does this cluster contain perturbed particles? */
941 if (grid
.fep
[c
] != 0)
943 for (int i
= 0; i
< grid
.na_c
; i
++)
945 /* Is this a perturbed particle? */
946 if (grid
.fep
[c
] & (1 << i
))
948 int ind
= c_offset
+ c
*grid
.na_c
+ i
;
949 /* Set atom type and charge to non-interacting */
950 nbat
->type
[ind
] = nbat
->ntype
- 1;
959 /* Copies the energy group indices to a reordered and packed array */
960 static void copy_egp_to_nbat_egps(const int *a
, int na
, int na_round
,
961 int na_c
, int bit_shift
,
962 const int *in
, int *innb
)
968 for (i
= 0; i
< na
; i
+= na_c
)
970 /* Store na_c energy group numbers into one int */
972 for (int sa
= 0; sa
< na_c
; sa
++)
977 comb
|= (GET_CGINFO_GID(in
[at
]) << (sa
*bit_shift
));
982 /* Complete the partially filled last cell with fill */
983 for (; i
< na_round
; i
+= na_c
)
989 /* Set the energy group indices for atoms in nbnxn_atomdata_t */
990 static void nbnxn_atomdata_set_energygroups(nbnxn_atomdata_t
*nbat
,
991 const nbnxn_search
*nbs
,
994 if (nbat
->nenergrp
== 1)
999 for (const nbnxn_grid_t
&grid
: nbs
->grid
)
1001 /* Loop over all columns and copy and fill */
1002 for (int i
= 0; i
< grid
.numCells
[XX
]*grid
.numCells
[YY
]; i
++)
1004 int ncz
= grid
.cxy_ind
[i
+1] - grid
.cxy_ind
[i
];
1005 int ash
= (grid
.cell0
+ grid
.cxy_ind
[i
])*grid
.na_sc
;
1007 copy_egp_to_nbat_egps(nbs
->a
.data() + ash
, grid
.cxy_na
[i
], ncz
*grid
.na_sc
,
1008 nbat
->na_c
, nbat
->neg_2log
,
1009 atinfo
, nbat
->energrp
+(ash
>>grid
.na_c_2log
));
1014 /* Sets all required atom parameter data in nbnxn_atomdata_t */
1015 void nbnxn_atomdata_set(nbnxn_atomdata_t
*nbat
,
1016 const nbnxn_search
*nbs
,
1017 const t_mdatoms
*mdatoms
,
1020 nbnxn_atomdata_set_atomtypes(nbat
, nbs
, mdatoms
->typeA
);
1022 nbnxn_atomdata_set_charges(nbat
, nbs
, mdatoms
->chargeA
);
1026 nbnxn_atomdata_mask_fep(nbat
, nbs
);
1029 /* This must be done after masking types for FEP */
1030 nbnxn_atomdata_set_ljcombparams(nbat
, nbs
);
1032 nbnxn_atomdata_set_energygroups(nbat
, nbs
, atinfo
);
1035 /* Copies the shift vector array to nbnxn_atomdata_t */
1036 void nbnxn_atomdata_copy_shiftvec(gmx_bool bDynamicBox
,
1038 nbnxn_atomdata_t
*nbat
)
1042 nbat
->bDynamicBox
= bDynamicBox
;
1043 for (i
= 0; i
< SHIFTS
; i
++)
1045 copy_rvec(shift_vec
[i
], nbat
->shift_vec
[i
]);
1049 /* Copies (and reorders) the coordinates to nbnxn_atomdata_t */
1050 void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search
*nbs
,
1054 nbnxn_atomdata_t
*nbat
,
1055 gmx_wallcycle
*wcycle
)
1057 wallcycle_start(wcycle
, ewcNB_XF_BUF_OPS
);
1058 wallcycle_sub_start(wcycle
, ewcsNB_X_BUF_OPS
);
1067 g1
= nbs
->grid
.size();
1075 g1
= nbs
->grid
.size();
1081 nbat
->natoms_local
= nbs
->grid
[0].nc
*nbs
->grid
[0].na_sc
;
1084 nth
= gmx_omp_nthreads_get(emntPairsearch
);
1086 #pragma omp parallel for num_threads(nth) schedule(static)
1087 for (th
= 0; th
< nth
; th
++)
1091 for (int g
= g0
; g
< g1
; g
++)
1093 const nbnxn_grid_t
&grid
= nbs
->grid
[g
];
1094 const int numCellsXY
= grid
.numCells
[XX
]*grid
.numCells
[YY
];
1096 const int cxy0
= (numCellsXY
* th
+ nth
- 1)/nth
;
1097 const int cxy1
= (numCellsXY
*(th
+ 1) + nth
- 1)/nth
;
1099 for (int cxy
= cxy0
; cxy
< cxy1
; cxy
++)
1101 int na
, ash
, na_fill
;
1103 na
= grid
.cxy_na
[cxy
];
1104 ash
= (grid
.cell0
+ grid
.cxy_ind
[cxy
])*grid
.na_sc
;
1106 if (g
== 0 && FillLocal
)
1109 (grid
.cxy_ind
[cxy
+1] - grid
.cxy_ind
[cxy
])*grid
.na_sc
;
1113 /* We fill only the real particle locations.
1114 * We assume the filling entries at the end have been
1115 * properly set before during pair-list generation.
1119 copy_rvec_to_nbat_real(nbs
->a
.data() + ash
, na
, na_fill
, x
,
1120 nbat
->XFormat
, nbat
->x
, ash
);
1124 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
1127 wallcycle_sub_stop(wcycle
, ewcsNB_X_BUF_OPS
);
1128 wallcycle_stop(wcycle
, ewcNB_XF_BUF_OPS
);
1132 nbnxn_atomdata_clear_reals(real
* gmx_restrict dest
,
1135 for (int i
= i0
; i
< i1
; i
++)
1141 gmx_unused
static void
1142 nbnxn_atomdata_reduce_reals(real
* gmx_restrict dest
,
1144 real
** gmx_restrict src
,
1150 /* The destination buffer contains data, add to it */
1151 for (int i
= i0
; i
< i1
; i
++)
1153 for (int s
= 0; s
< nsrc
; s
++)
1155 dest
[i
] += src
[s
][i
];
1161 /* The destination buffer is unitialized, set it first */
1162 for (int i
= i0
; i
< i1
; i
++)
1164 dest
[i
] = src
[0][i
];
1165 for (int s
= 1; s
< nsrc
; s
++)
1167 dest
[i
] += src
[s
][i
];
1173 gmx_unused
static void
1174 nbnxn_atomdata_reduce_reals_simd(real gmx_unused
* gmx_restrict dest
,
1175 gmx_bool gmx_unused bDestSet
,
1176 real gmx_unused
** gmx_restrict src
,
1177 int gmx_unused nsrc
,
1178 int gmx_unused i0
, int gmx_unused i1
)
1181 /* The SIMD width here is actually independent of that in the kernels,
1182 * but we use the same width for simplicity (usually optimal anyhow).
1184 SimdReal dest_SSE
, src_SSE
;
1188 for (int i
= i0
; i
< i1
; i
+= GMX_SIMD_REAL_WIDTH
)
1190 dest_SSE
= load
<SimdReal
>(dest
+i
);
1191 for (int s
= 0; s
< nsrc
; s
++)
1193 src_SSE
= load
<SimdReal
>(src
[s
]+i
);
1194 dest_SSE
= dest_SSE
+ src_SSE
;
1196 store(dest
+i
, dest_SSE
);
1201 for (int i
= i0
; i
< i1
; i
+= GMX_SIMD_REAL_WIDTH
)
1203 dest_SSE
= load
<SimdReal
>(src
[0]+i
);
1204 for (int s
= 1; s
< nsrc
; s
++)
1206 src_SSE
= load
<SimdReal
>(src
[s
]+i
);
1207 dest_SSE
= dest_SSE
+ src_SSE
;
1209 store(dest
+i
, dest_SSE
);
1215 /* Add part of the force array(s) from nbnxn_atomdata_t to f */
1217 nbnxn_atomdata_add_nbat_f_to_f_part(const nbnxn_search
*nbs
,
1218 const nbnxn_atomdata_t
*nbat
,
1219 nbnxn_atomdata_output_t
*out
,
1224 gmx::ArrayRef
<const int> cell
= nbs
->cell
;
1226 /* Loop over all columns and copy and fill */
1227 switch (nbat
->FFormat
)
1233 const real
*fnb
= out
[0].f
;
1235 for (int a
= a0
; a
< a1
; a
++)
1237 int i
= cell
[a
]*nbat
->fstride
;
1240 f
[a
][YY
] += fnb
[i
+1];
1241 f
[a
][ZZ
] += fnb
[i
+2];
1246 for (int a
= a0
; a
< a1
; a
++)
1248 int i
= cell
[a
]*nbat
->fstride
;
1250 for (int fa
= 0; fa
< nfa
; fa
++)
1252 f
[a
][XX
] += out
[fa
].f
[i
];
1253 f
[a
][YY
] += out
[fa
].f
[i
+1];
1254 f
[a
][ZZ
] += out
[fa
].f
[i
+2];
1262 const real
*fnb
= out
[0].f
;
1264 for (int a
= a0
; a
< a1
; a
++)
1266 int i
= atom_to_x_index
<c_packX4
>(cell
[a
]);
1268 f
[a
][XX
] += fnb
[i
+XX
*c_packX4
];
1269 f
[a
][YY
] += fnb
[i
+YY
*c_packX4
];
1270 f
[a
][ZZ
] += fnb
[i
+ZZ
*c_packX4
];
1275 for (int a
= a0
; a
< a1
; a
++)
1277 int i
= atom_to_x_index
<c_packX4
>(cell
[a
]);
1279 for (int fa
= 0; fa
< nfa
; fa
++)
1281 f
[a
][XX
] += out
[fa
].f
[i
+XX
*c_packX4
];
1282 f
[a
][YY
] += out
[fa
].f
[i
+YY
*c_packX4
];
1283 f
[a
][ZZ
] += out
[fa
].f
[i
+ZZ
*c_packX4
];
1291 const real
*fnb
= out
[0].f
;
1293 for (int a
= a0
; a
< a1
; a
++)
1295 int i
= atom_to_x_index
<c_packX8
>(cell
[a
]);
1297 f
[a
][XX
] += fnb
[i
+XX
*c_packX8
];
1298 f
[a
][YY
] += fnb
[i
+YY
*c_packX8
];
1299 f
[a
][ZZ
] += fnb
[i
+ZZ
*c_packX8
];
1304 for (int a
= a0
; a
< a1
; a
++)
1306 int i
= atom_to_x_index
<c_packX8
>(cell
[a
]);
1308 for (int fa
= 0; fa
< nfa
; fa
++)
1310 f
[a
][XX
] += out
[fa
].f
[i
+XX
*c_packX8
];
1311 f
[a
][YY
] += out
[fa
].f
[i
+YY
*c_packX8
];
1312 f
[a
][ZZ
] += out
[fa
].f
[i
+ZZ
*c_packX8
];
1318 gmx_incons("Unsupported nbnxn_atomdata_t format");
1322 static inline unsigned char reverse_bits(unsigned char b
)
1324 /* http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64BitsDiv */
1325 return (b
* 0x0202020202ULL
& 0x010884422010ULL
) % 1023;
1328 static void nbnxn_atomdata_add_nbat_f_to_f_treereduce(const nbnxn_atomdata_t
*nbat
,
1331 const nbnxn_buffer_flags_t
*flags
= &nbat
->buffer_flags
;
1333 int next_pow2
= 1<<(gmx::log2I(nth
-1)+1);
1335 assert(nbat
->nout
== nth
); /* tree-reduce currently only works for nout==nth */
1337 memset(nbat
->syncStep
, 0, sizeof(*(nbat
->syncStep
))*nth
);
1339 #pragma omp parallel num_threads(nth)
1347 th
= gmx_omp_get_thread_num();
1349 for (group_size
= 2; group_size
< 2*next_pow2
; group_size
*= 2)
1351 int index
[2], group_pos
, partner_pos
, wu
;
1352 int partner_th
= th
^ (group_size
/2);
1357 /* wait on partner thread - replaces full barrier */
1358 int sync_th
, sync_group_size
;
1360 tMPI_Atomic_memory_barrier(); /* gurantee data is saved before marking work as done */
1361 tMPI_Atomic_set(&(nbat
->syncStep
[th
]), group_size
/2); /* mark previous step as completed */
1363 /* find thread to sync with. Equal to partner_th unless nth is not a power of two. */
1364 for (sync_th
= partner_th
, sync_group_size
= group_size
; sync_th
>= nth
&& sync_group_size
> 2; sync_group_size
/= 2)
1366 sync_th
&= ~(sync_group_size
/4);
1368 if (sync_th
< nth
) /* otherwise nothing to sync index[1] will be >=nout */
1370 /* wait on the thread which computed input data in previous step */
1371 while (tMPI_Atomic_get((volatile tMPI_Atomic_t
*)&(nbat
->syncStep
[sync_th
])) < group_size
/2)
1375 /* guarantee that no later load happens before wait loop is finisehd */
1376 tMPI_Atomic_memory_barrier();
1378 #else /* TMPI_ATOMICS */
1383 /* Calculate buffers to sum (result goes into first buffer) */
1384 group_pos
= th
% group_size
;
1385 index
[0] = th
- group_pos
;
1386 index
[1] = index
[0] + group_size
/2;
1388 /* If no second buffer, nothing to do */
1389 if (index
[1] >= nbat
->nout
&& group_size
> 2)
1394 #if NBNXN_BUFFERFLAG_MAX_THREADS > 256
1395 #error reverse_bits assumes max 256 threads
1397 /* Position is permuted so that one of the 2 vectors being added was computed on the same thread in the previous step.
1398 This improves locality and enables to sync with just a single thread between steps (=the levels in the btree).
1399 The permutation which allows this corresponds to reversing the bits of the group position.
1401 group_pos
= reverse_bits(group_pos
)/(256/group_size
);
1403 partner_pos
= group_pos
^ 1;
1405 /* loop over two work-units (own and partner) */
1406 for (wu
= 0; wu
< 2; wu
++)
1410 if (partner_th
< nth
)
1412 break; /* partner exists we don't have to do his work */
1416 group_pos
= partner_pos
;
1420 /* Calculate the cell-block range for our thread */
1421 b0
= (flags
->nflag
* group_pos
)/group_size
;
1422 b1
= (flags
->nflag
*(group_pos
+1))/group_size
;
1424 for (b
= b0
; b
< b1
; b
++)
1426 i0
= b
*NBNXN_BUFFERFLAG_SIZE
*nbat
->fstride
;
1427 i1
= (b
+1)*NBNXN_BUFFERFLAG_SIZE
*nbat
->fstride
;
1429 if (bitmask_is_set(flags
->flag
[b
], index
[1]) || group_size
> 2)
1432 nbnxn_atomdata_reduce_reals_simd
1434 nbnxn_atomdata_reduce_reals
1436 (nbat
->out
[index
[0]].f
,
1437 bitmask_is_set(flags
->flag
[b
], index
[0]) || group_size
> 2,
1438 &(nbat
->out
[index
[1]].f
), 1, i0
, i1
);
1441 else if (!bitmask_is_set(flags
->flag
[b
], index
[0]))
1443 nbnxn_atomdata_clear_reals(nbat
->out
[index
[0]].f
,
1450 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
1455 static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(const nbnxn_atomdata_t
*nbat
,
1458 #pragma omp parallel for num_threads(nth) schedule(static)
1459 for (int th
= 0; th
< nth
; th
++)
1463 const nbnxn_buffer_flags_t
*flags
;
1465 real
*fptr
[NBNXN_BUFFERFLAG_MAX_THREADS
];
1467 flags
= &nbat
->buffer_flags
;
1469 /* Calculate the cell-block range for our thread */
1470 int b0
= (flags
->nflag
* th
)/nth
;
1471 int b1
= (flags
->nflag
*(th
+1))/nth
;
1473 for (int b
= b0
; b
< b1
; b
++)
1475 int i0
= b
*NBNXN_BUFFERFLAG_SIZE
*nbat
->fstride
;
1476 int i1
= (b
+1)*NBNXN_BUFFERFLAG_SIZE
*nbat
->fstride
;
1479 for (int out
= 1; out
< nbat
->nout
; out
++)
1481 if (bitmask_is_set(flags
->flag
[b
], out
))
1483 fptr
[nfptr
++] = nbat
->out
[out
].f
;
1489 nbnxn_atomdata_reduce_reals_simd
1491 nbnxn_atomdata_reduce_reals
1494 bitmask_is_set(flags
->flag
[b
], 0),
1498 else if (!bitmask_is_set(flags
->flag
[b
], 0))
1500 nbnxn_atomdata_clear_reals(nbat
->out
[0].f
,
1505 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
1509 /* Add the force array(s) from nbnxn_atomdata_t to f */
1510 void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search
*nbs
,
1512 const nbnxn_atomdata_t
*nbat
,
1514 gmx_wallcycle
*wcycle
)
1516 wallcycle_start(wcycle
, ewcNB_XF_BUF_OPS
);
1517 wallcycle_sub_start(wcycle
, ewcsNB_F_BUF_OPS
);
1521 nbs_cycle_start(&nbs
->cc
[enbsCCreducef
]);
1527 na
= nbs
->natoms_nonlocal
;
1531 na
= nbs
->natoms_local
;
1534 a0
= nbs
->natoms_local
;
1535 na
= nbs
->natoms_nonlocal
- nbs
->natoms_local
;
1539 int nth
= gmx_omp_nthreads_get(emntNonbonded
);
1543 if (locality
!= eatAll
)
1545 gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
1548 /* Reduce the force thread output buffers into buffer 0, before adding
1549 * them to the, differently ordered, "real" force buffer.
1551 if (nbat
->bUseTreeReduce
)
1553 nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbat
, nth
);
1557 nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbat
, nth
);
1560 #pragma omp parallel for num_threads(nth) schedule(static)
1561 for (int th
= 0; th
< nth
; th
++)
1565 nbnxn_atomdata_add_nbat_f_to_f_part(nbs
, nbat
,
1572 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
1575 nbs_cycle_stop(&nbs
->cc
[enbsCCreducef
]);
1577 wallcycle_sub_stop(wcycle
, ewcsNB_F_BUF_OPS
);
1578 wallcycle_stop(wcycle
, ewcNB_XF_BUF_OPS
);
1581 /* Adds the shift forces from nbnxn_atomdata_t to fshift */
1582 void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t
*nbat
,
1585 const nbnxn_atomdata_output_t
* out
= nbat
->out
;
1587 for (int s
= 0; s
< SHIFTS
; s
++)
1591 for (int th
= 0; th
< nbat
->nout
; th
++)
1593 sum
[XX
] += out
[th
].fshift
[s
*DIM
+XX
];
1594 sum
[YY
] += out
[th
].fshift
[s
*DIM
+YY
];
1595 sum
[ZZ
] += out
[th
].fshift
[s
*DIM
+ZZ
];
1597 rvec_inc(fshift
[s
], sum
);