2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
38 #include "gromacs/gmxlib/nrnb.h"
39 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
40 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
41 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
42 #include "gromacs/math/vectypes.h"
43 #include "gromacs/mdlib/enerdata_utils.h"
44 #include "gromacs/mdlib/force.h"
45 #include "gromacs/mdlib/gmx_omp_nthreads.h"
46 #include "gromacs/mdlib/ppforceworkload.h"
47 #include "gromacs/mdtypes/enerdata.h"
48 #include "gromacs/mdtypes/forceoutput.h"
49 #include "gromacs/mdtypes/inputrec.h"
50 #include "gromacs/mdtypes/interaction_const.h"
51 #include "gromacs/mdtypes/md_enums.h"
52 #include "gromacs/mdtypes/mdatom.h"
53 #include "gromacs/nbnxm/gpu_data_mgmt.h"
54 #include "gromacs/nbnxm/nbnxm.h"
55 #include "gromacs/nbnxm/nbnxm_simd.h"
56 #include "gromacs/nbnxm/kernels_reference/kernel_gpu_ref.h"
57 #include "gromacs/simd/simd.h"
58 #include "gromacs/timing/wallcycle.h"
59 #include "gromacs/utility/gmxassert.h"
60 #include "gromacs/utility/real.h"
62 #include "kernel_common.h"
63 #include "pairlistset.h"
64 #include "pairlistsets.h"
65 #define INCLUDE_KERNELFUNCTION_TABLES
66 #include "gromacs/nbnxm/kernels_reference/kernel_ref.h"
67 #ifdef GMX_NBNXN_SIMD_2XNN
68 #include "gromacs/nbnxm/kernels_simd_2xmm/kernels.h"
70 #ifdef GMX_NBNXN_SIMD_4XN
71 #include "gromacs/nbnxm/kernels_simd_4xm/kernels.h"
73 #undef INCLUDE_FUNCTION_TABLES
75 /*! \brief Clears the energy group output buffers
77 * \param[in,out] out nbnxn kernel output struct
79 static void clearGroupEnergies(nbnxn_atomdata_output_t
*out
)
81 std::fill(out
->Vvdw
.begin(), out
->Vvdw
.end(), 0.0_real
);
82 std::fill(out
->Vc
.begin(), out
->Vc
.end(), 0.0_real
);
83 std::fill(out
->VSvdw
.begin(), out
->VSvdw
.end(), 0.0_real
);
84 std::fill(out
->VSc
.begin(), out
->VSc
.end(), 0.0_real
);
87 /*! \brief Reduce the group-pair energy buffers produced by a SIMD kernel
88 * to single terms in the output buffers.
90 * The SIMD kernels produce a large number of energy buffer in SIMD registers
91 * to avoid scattered reads and writes.
93 * \tparam unrollj The unroll size for j-particles in the SIMD kernel
94 * \param[in] numGroups The number of energy groups
95 * \param[in] numGroups_2log Log2 of numGroups, rounded up
96 * \param[in,out] out Struct with energy buffers
98 template <int unrollj
> static void
99 reduceGroupEnergySimdBuffers(int numGroups
,
101 nbnxn_atomdata_output_t
*out
)
103 const int unrollj_half
= unrollj
/2;
104 /* Energies are stored in SIMD registers with size 2^numGroups_2log */
105 const int numGroupsStorage
= (1 << numGroups_2log
);
107 const real
* gmx_restrict vVdwSimd
= out
->VSvdw
.data();
108 const real
* gmx_restrict vCoulombSimd
= out
->VSc
.data();
109 real
* gmx_restrict vVdw
= out
->Vvdw
.data();
110 real
* gmx_restrict vCoulomb
= out
->Vc
.data();
112 /* The size of the SIMD energy group buffer array is:
113 * numGroups*numGroups*numGroupsStorage*unrollj_half*simd_width
115 for (int i
= 0; i
< numGroups
; i
++)
117 for (int j1
= 0; j1
< numGroups
; j1
++)
119 for (int j0
= 0; j0
< numGroups
; j0
++)
121 int c
= ((i
*numGroups
+ j1
)*numGroupsStorage
+ j0
)*unrollj_half
*unrollj
;
122 for (int s
= 0; s
< unrollj_half
; s
++)
124 vVdw
[i
*numGroups
+ j0
] += vVdwSimd
[c
+ 0];
125 vVdw
[i
*numGroups
+ j1
] += vVdwSimd
[c
+ 1];
126 vCoulomb
[i
*numGroups
+ j0
] += vCoulombSimd
[c
+ 0];
127 vCoulomb
[i
*numGroups
+ j1
] += vCoulombSimd
[c
+ 1];
135 /*! \brief Dispatches the non-bonded N versus M atom cluster CPU kernels.
137 * OpenMP parallelization is performed within this function.
138 * Energy reduction, but not force and shift force reduction, is performed
139 * within this function.
141 * \param[in] pairlistSet Pairlists with local or non-local interactions to compute
142 * \param[in] kernelSetup The non-bonded kernel setup
143 * \param[in,out] nbat The atomdata for the interactions
144 * \param[in] ic Non-bonded interaction constants
145 * \param[in] shiftVectors The PBC shift vectors
146 * \param[in] forceFlags Flags that tell what to compute
147 * \param[in] clearF Enum that tells if to clear the force output buffer
148 * \param[out] vCoulomb Output buffer for Coulomb energies
149 * \param[out] vVdw Output buffer for Van der Waals energies
150 * \param[in] wcycle Pointer to cycle counting data structure.
153 nbnxn_kernel_cpu(const PairlistSet
&pairlistSet
,
154 const Nbnxm::KernelSetup
&kernelSetup
,
155 nbnxn_atomdata_t
*nbat
,
156 const interaction_const_t
&ic
,
158 const gmx::ForceFlags
&forceFlags
,
162 gmx_wallcycle
*wcycle
)
166 if (EEL_RF(ic
.eeltype
) || ic
.eeltype
== eelCUT
)
172 if (kernelSetup
.ewaldExclusionType
== Nbnxm::EwaldExclusionType::Table
)
174 if (ic
.rcoulomb
== ic
.rvdw
)
180 coulkt
= coulktTAB_TWIN
;
185 if (ic
.rcoulomb
== ic
.rvdw
)
187 coulkt
= coulktEWALD
;
191 coulkt
= coulktEWALD_TWIN
;
196 const nbnxn_atomdata_t::Params
&nbatParams
= nbat
->params();
199 if (ic
.vdwtype
== evdwCUT
)
201 switch (ic
.vdw_modifier
)
204 case eintmodPOTSHIFT
:
205 switch (nbatParams
.comb_rule
)
207 case ljcrGEOM
: vdwkt
= vdwktLJCUT_COMBGEOM
; break;
208 case ljcrLB
: vdwkt
= vdwktLJCUT_COMBLB
; break;
209 case ljcrNONE
: vdwkt
= vdwktLJCUT_COMBNONE
; break;
211 GMX_RELEASE_ASSERT(false, "Unknown combination rule");
214 case eintmodFORCESWITCH
:
215 vdwkt
= vdwktLJFORCESWITCH
;
217 case eintmodPOTSWITCH
:
218 vdwkt
= vdwktLJPOTSWITCH
;
221 GMX_RELEASE_ASSERT(false, "Unsupported VdW interaction modifier");
224 else if (ic
.vdwtype
== evdwPME
)
226 if (ic
.ljpme_comb_rule
== eljpmeGEOM
)
228 vdwkt
= vdwktLJEWALDCOMBGEOM
;
232 vdwkt
= vdwktLJEWALDCOMBLB
;
233 /* At setup we (should have) selected the C reference kernel */
234 GMX_RELEASE_ASSERT(kernelSetup
.kernelType
== Nbnxm::KernelType::Cpu4x4_PlainC
, "Only the C reference nbnxn SIMD kernel supports LJ-PME with LB combination rules");
239 GMX_RELEASE_ASSERT(false, "Unsupported VdW interaction type");
242 gmx::ArrayRef
<const NbnxnPairlistCpu
> pairlists
= pairlistSet
.cpuLists();
244 int gmx_unused nthreads
= gmx_omp_nthreads_get(emntNonbonded
);
245 wallcycle_sub_start(wcycle
, ewcsNONBONDED_CLEAR
);
246 #pragma omp parallel for schedule(static) num_threads(nthreads)
247 for (gmx::index nb
= 0; nb
< pairlists
.ssize(); nb
++)
249 // Presently, the kernels do not call C++ code that can throw,
250 // so no need for a try/catch pair in this OpenMP region.
251 nbnxn_atomdata_output_t
*out
= &nbat
->out
[nb
];
253 if (clearF
== enbvClearFYes
)
255 clearForceBuffer(nbat
, nb
);
257 clear_fshift(out
->fshift
.data());
262 wallcycle_sub_stop(wcycle
, ewcsNONBONDED_CLEAR
);
263 wallcycle_sub_start(wcycle
, ewcsNONBONDED_KERNEL
);
266 // TODO: Change to reference
267 const NbnxnPairlistCpu
*pairlist
= &pairlists
[nb
];
269 if (!forceFlags
.computeEnergy
)
271 /* Don't calculate energies */
272 switch (kernelSetup
.kernelType
)
274 case Nbnxm::KernelType::Cpu4x4_PlainC
:
275 nbnxn_kernel_noener_ref
[coulkt
][vdwkt
](pairlist
, nbat
,
280 #ifdef GMX_NBNXN_SIMD_2XNN
281 case Nbnxm::KernelType::Cpu4xN_Simd_2xNN
:
282 nbnxm_kernel_noener_simd_2xmm
[coulkt
][vdwkt
](pairlist
, nbat
,
288 #ifdef GMX_NBNXN_SIMD_4XN
289 case Nbnxm::KernelType::Cpu4xN_Simd_4xN
:
290 nbnxm_kernel_noener_simd_4xm
[coulkt
][vdwkt
](pairlist
, nbat
,
297 GMX_RELEASE_ASSERT(false, "Unsupported kernel architecture");
300 else if (out
->Vvdw
.size() == 1)
302 /* A single energy group (pair) */
306 switch (kernelSetup
.kernelType
)
308 case Nbnxm::KernelType::Cpu4x4_PlainC
:
309 nbnxn_kernel_ener_ref
[coulkt
][vdwkt
](pairlist
, nbat
,
314 #ifdef GMX_NBNXN_SIMD_2XNN
315 case Nbnxm::KernelType::Cpu4xN_Simd_2xNN
:
316 nbnxm_kernel_ener_simd_2xmm
[coulkt
][vdwkt
](pairlist
, nbat
,
322 #ifdef GMX_NBNXN_SIMD_4XN
323 case Nbnxm::KernelType::Cpu4xN_Simd_4xN
:
324 nbnxm_kernel_ener_simd_4xm
[coulkt
][vdwkt
](pairlist
, nbat
,
331 GMX_RELEASE_ASSERT(false, "Unsupported kernel architecture");
336 /* Calculate energy group contributions */
337 clearGroupEnergies(out
);
341 switch (kernelSetup
.kernelType
)
343 case Nbnxm::KernelType::Cpu4x4_PlainC
:
344 unrollj
= c_nbnxnCpuIClusterSize
;
345 nbnxn_kernel_energrp_ref
[coulkt
][vdwkt
](pairlist
, nbat
,
350 #ifdef GMX_NBNXN_SIMD_2XNN
351 case Nbnxm::KernelType::Cpu4xN_Simd_2xNN
:
352 unrollj
= GMX_SIMD_REAL_WIDTH
/2;
353 nbnxm_kernel_energrp_simd_2xmm
[coulkt
][vdwkt
](pairlist
, nbat
,
359 #ifdef GMX_NBNXN_SIMD_4XN
360 case Nbnxm::KernelType::Cpu4xN_Simd_4xN
:
361 unrollj
= GMX_SIMD_REAL_WIDTH
;
362 nbnxm_kernel_energrp_simd_4xm
[coulkt
][vdwkt
](pairlist
, nbat
,
369 GMX_RELEASE_ASSERT(false, "Unsupported kernel architecture");
372 if (kernelSetup
.kernelType
!= Nbnxm::KernelType::Cpu4x4_PlainC
)
377 reduceGroupEnergySimdBuffers
<2>(nbatParams
.nenergrp
,
382 reduceGroupEnergySimdBuffers
<4>(nbatParams
.nenergrp
,
387 reduceGroupEnergySimdBuffers
<8>(nbatParams
.nenergrp
,
392 GMX_RELEASE_ASSERT(false, "Unsupported j-unroll size");
397 wallcycle_sub_stop(wcycle
, ewcsNONBONDED_KERNEL
);
399 if (forceFlags
.computeEnergy
)
401 reduce_energies_over_lists(nbat
, pairlists
.ssize(), vVdw
, vCoulomb
);
405 static void accountFlops(t_nrnb
*nrnb
,
406 const PairlistSet
&pairlistSet
,
407 const nonbonded_verlet_t
&nbv
,
408 const interaction_const_t
&ic
,
409 const gmx::ForceFlags
&forceFlags
)
411 const bool usingGpuKernels
= nbv
.useGpu();
413 int enr_nbnxn_kernel_ljc
;
414 if (EEL_RF(ic
.eeltype
) || ic
.eeltype
== eelCUT
)
416 enr_nbnxn_kernel_ljc
= eNR_NBNXN_LJ_RF
;
418 else if ((!usingGpuKernels
&& nbv
.kernelSetup().ewaldExclusionType
== Nbnxm::EwaldExclusionType::Analytical
) ||
419 (usingGpuKernels
&& Nbnxm::gpu_is_kernel_ewald_analytical(nbv
.gpu_nbv
)))
421 enr_nbnxn_kernel_ljc
= eNR_NBNXN_LJ_EWALD
;
425 enr_nbnxn_kernel_ljc
= eNR_NBNXN_LJ_TAB
;
427 int enr_nbnxn_kernel_lj
= eNR_NBNXN_LJ
;
428 if (forceFlags
.computeEnergy
)
430 /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
431 enr_nbnxn_kernel_ljc
+= 1;
432 enr_nbnxn_kernel_lj
+= 1;
435 inc_nrnb(nrnb
, enr_nbnxn_kernel_ljc
,
436 pairlistSet
.natpair_ljq_
);
437 inc_nrnb(nrnb
, enr_nbnxn_kernel_lj
,
438 pairlistSet
.natpair_lj_
);
439 /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
440 inc_nrnb(nrnb
, enr_nbnxn_kernel_ljc
-eNR_NBNXN_LJ_RF
+eNR_NBNXN_RF
,
441 pairlistSet
.natpair_q_
);
443 if (ic
.vdw_modifier
== eintmodFORCESWITCH
)
445 /* We add up the switch cost separately */
446 inc_nrnb(nrnb
, eNR_NBNXN_ADD_LJ_FSW
+ (forceFlags
.computeEnergy
? 1 : 0),
447 pairlistSet
.natpair_ljq_
+ pairlistSet
.natpair_lj_
);
449 if (ic
.vdw_modifier
== eintmodPOTSWITCH
)
451 /* We add up the switch cost separately */
452 inc_nrnb(nrnb
, eNR_NBNXN_ADD_LJ_PSW
+ (forceFlags
.computeEnergy
? 1 : 0),
453 pairlistSet
.natpair_ljq_
+ pairlistSet
.natpair_lj_
);
455 if (ic
.vdwtype
== evdwPME
)
457 /* We add up the LJ Ewald cost separately */
458 inc_nrnb(nrnb
, eNR_NBNXN_ADD_LJ_EWALD
+ (forceFlags
.computeEnergy
? 1 : 0),
459 pairlistSet
.natpair_ljq_
+ pairlistSet
.natpair_lj_
);
464 nonbonded_verlet_t::dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality
,
465 const interaction_const_t
&ic
,
466 const gmx::ForceFlags
&forceFlags
,
468 const t_forcerec
&fr
,
469 gmx_enerdata_t
*enerd
,
472 const PairlistSet
&pairlistSet
= pairlistSets().pairlistSet(iLocality
);
474 switch (kernelSetup().kernelType
)
476 case Nbnxm::KernelType::Cpu4x4_PlainC
:
477 case Nbnxm::KernelType::Cpu4xN_Simd_4xN
:
478 case Nbnxm::KernelType::Cpu4xN_Simd_2xNN
:
479 nbnxn_kernel_cpu(pairlistSet
,
486 enerd
->grpp
.ener
[egCOULSR
].data(),
488 enerd
->grpp
.ener
[egBHAMSR
].data() :
489 enerd
->grpp
.ener
[egLJSR
].data(),
493 case Nbnxm::KernelType::Gpu8x8x8
:
494 Nbnxm::gpu_launch_kernel(gpu_nbv
, forceFlags
, iLocality
);
497 case Nbnxm::KernelType::Cpu8x8x8_PlainC
:
498 nbnxn_kernel_gpu_ref(pairlistSet
.gpuList(),
504 nbat
->out
[0].fshift
.data(),
505 enerd
->grpp
.ener
[egCOULSR
].data(),
507 enerd
->grpp
.ener
[egBHAMSR
].data() :
508 enerd
->grpp
.ener
[egLJSR
].data());
512 GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
516 accountFlops(nrnb
, pairlistSet
, *this, ic
, forceFlags
);
520 nonbonded_verlet_t::dispatchFreeEnergyKernel(Nbnxm::InteractionLocality iLocality
,
521 const t_forcerec
*fr
,
523 gmx::ForceWithShiftForces
*forceWithShiftForces
,
524 const t_mdatoms
&mdatoms
,
527 gmx_enerdata_t
*enerd
,
528 const gmx::ForceFlags
&forceFlags
,
531 const auto nbl_fep
= pairlistSets().pairlistSet(iLocality
).fepLists();
533 /* When the first list is empty, all are empty and there is nothing to do */
534 if (!pairlistSets().params().haveFep
|| nbl_fep
[0]->nrj
== 0)
540 /* Add short-range interactions */
541 donb_flags
|= GMX_NONBONDED_DO_SR
;
543 /* Currently all group scheme kernels always calculate (shift-)forces */
544 if (forceFlags
.computeForces
)
546 donb_flags
|= GMX_NONBONDED_DO_FORCE
;
548 if (forceFlags
.computeVirial
)
550 donb_flags
|= GMX_NONBONDED_DO_SHIFTFORCE
;
552 if (forceFlags
.computeEnergy
)
554 donb_flags
|= GMX_NONBONDED_DO_POTENTIAL
;
557 nb_kernel_data_t kernel_data
;
558 real dvdl_nb
[efptNR
] = { 0 };
559 kernel_data
.flags
= donb_flags
;
560 kernel_data
.lambda
= lambda
;
561 kernel_data
.dvdl
= dvdl_nb
;
563 kernel_data
.energygrp_elec
= enerd
->grpp
.ener
[egCOULSR
].data();
564 kernel_data
.energygrp_vdw
= enerd
->grpp
.ener
[egLJSR
].data();
566 GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded
) == nbl_fep
.ssize(), "Number of lists should be same as number of NB threads");
568 wallcycle_sub_start(wcycle_
, ewcsNONBONDED_FEP
);
569 #pragma omp parallel for schedule(static) num_threads(nbl_fep.ssize())
570 for (gmx::index th
= 0; th
< nbl_fep
.ssize(); th
++)
574 gmx_nb_free_energy_kernel(nbl_fep
[th
].get(),
575 x
, forceWithShiftForces
,
576 fr
, &mdatoms
, &kernel_data
, nrnb
);
578 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
581 if (fepvals
->sc_alpha
!= 0)
583 enerd
->dvdl_nonlin
[efptVDW
] += dvdl_nb
[efptVDW
];
584 enerd
->dvdl_nonlin
[efptCOUL
] += dvdl_nb
[efptCOUL
];
588 enerd
->dvdl_lin
[efptVDW
] += dvdl_nb
[efptVDW
];
589 enerd
->dvdl_lin
[efptCOUL
] += dvdl_nb
[efptCOUL
];
592 /* If we do foreign lambda and we have soft-core interactions
593 * we have to recalculate the (non-linear) energies contributions.
595 if (fepvals
->n_lambda
> 0 && forceFlags
.computeDhdl
&& fepvals
->sc_alpha
!= 0)
598 kernel_data
.flags
= (donb_flags
& ~(GMX_NONBONDED_DO_FORCE
| GMX_NONBONDED_DO_SHIFTFORCE
)) | GMX_NONBONDED_DO_FOREIGNLAMBDA
;
599 kernel_data
.lambda
= lam_i
;
600 kernel_data
.energygrp_elec
= enerd
->foreign_grpp
.ener
[egCOULSR
].data();
601 kernel_data
.energygrp_vdw
= enerd
->foreign_grpp
.ener
[egLJSR
].data();
602 /* Note that we add to kernel_data.dvdl, but ignore the result */
604 for (size_t i
= 0; i
< enerd
->enerpart_lambda
.size(); i
++)
606 for (int j
= 0; j
< efptNR
; j
++)
608 lam_i
[j
] = (i
== 0 ? lambda
[j
] : fepvals
->all_lambda
[j
][i
-1]);
610 reset_foreign_enerdata(enerd
);
611 #pragma omp parallel for schedule(static) num_threads(nbl_fep.ssize())
612 for (gmx::index th
= 0; th
< nbl_fep
.ssize(); th
++)
616 gmx_nb_free_energy_kernel(nbl_fep
[th
].get(),
617 x
, forceWithShiftForces
,
618 fr
, &mdatoms
, &kernel_data
, nrnb
);
620 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
623 sum_epot(&(enerd
->foreign_grpp
), enerd
->foreign_term
);
624 enerd
->enerpart_lambda
[i
] += enerd
->foreign_term
[F_EPOT
];
627 wallcycle_sub_stop(wcycle_
, ewcsNONBONDED_FEP
);