2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
5 * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Define OpenCL implementation of nbnxm_gpu_data_mgmt.h
39 * \author Anca Hamuraru <anca@streamcomputing.eu>
40 * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
41 * \author Teemu Virolainen <teemu@streamcomputing.eu>
42 * \author Szilárd Páll <pall.szilard@gmail.com>
43 * \ingroup module_nbnxm
55 #include "gromacs/gpu_utils/device_stream_manager.h"
56 #include "gromacs/gpu_utils/gpu_utils.h"
57 #include "gromacs/gpu_utils/oclutils.h"
58 #include "gromacs/hardware/gpu_hw_info.h"
59 #include "gromacs/math/vectypes.h"
60 #include "gromacs/mdlib/force_flags.h"
61 #include "gromacs/mdtypes/interaction_const.h"
62 #include "gromacs/mdtypes/md_enums.h"
63 #include "gromacs/nbnxm/atomdata.h"
64 #include "gromacs/nbnxm/gpu_data_mgmt.h"
65 #include "gromacs/nbnxm/gpu_jit_support.h"
66 #include "gromacs/nbnxm/nbnxm.h"
67 #include "gromacs/nbnxm/nbnxm_gpu.h"
68 #include "gromacs/nbnxm/pairlistsets.h"
69 #include "gromacs/pbcutil/ishift.h"
70 #include "gromacs/timing/gpu_timing.h"
71 #include "gromacs/utility/cstringutil.h"
72 #include "gromacs/utility/fatalerror.h"
73 #include "gromacs/utility/gmxassert.h"
74 #include "gromacs/utility/real.h"
75 #include "gromacs/utility/smalloc.h"
77 #include "nbnxm_ocl_types.h"
82 /*! \brief Copies of values from cl_driver_diagnostics_intel.h,
83 * which isn't guaranteed to be available. */
85 #define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106
86 #define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL 0x1
87 #define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL 0x2
88 #define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL 0x4
91 /*! \brief This parameter should be determined heuristically from the
92 * kernel execution times
94 * This value is best for small systems on a single AMD Radeon R9 290X
95 * (and about 5% faster than 40, which is the default for CUDA
96 * devices). Larger simulation systems were quite insensitive to the
97 * value of this parameter.
99 static unsigned int gpu_min_ci_balanced_factor
= 50;
101 /*! \brief Tabulates the Ewald Coulomb force and initializes the size/scale
102 * and the table GPU array.
104 * If called with an already allocated table, it just re-uploads the
107 static void init_ewald_coulomb_force_table(const EwaldCorrectionTables
& tables
,
109 const DeviceContext
& deviceContext
)
111 if (nbp
->coulomb_tab
!= nullptr)
113 freeDeviceBuffer(&(nbp
->coulomb_tab
));
116 DeviceBuffer
<real
> coulomb_tab
;
118 initParamLookupTable(&coulomb_tab
, nullptr, tables
.tableF
.data(), tables
.tableF
.size(), deviceContext
);
120 nbp
->coulomb_tab
= coulomb_tab
;
121 nbp
->coulomb_tab_scale
= tables
.scale
;
125 /*! \brief Initializes the atomdata structure first time, it only gets filled at
128 static void init_atomdata_first(cl_atomdata_t
* ad
, int ntypes
, const DeviceContext
& deviceContext
)
132 allocateDeviceBuffer(&ad
->shift_vec
, SHIFTS
* DIM
, deviceContext
);
133 ad
->bShiftVecUploaded
= CL_FALSE
;
135 allocateDeviceBuffer(&ad
->fshift
, SHIFTS
* DIM
, deviceContext
);
136 allocateDeviceBuffer(&ad
->e_lj
, 1, deviceContext
);
137 allocateDeviceBuffer(&ad
->e_el
, 1, deviceContext
);
139 /* initialize to nullptr pointers to data that is not allocated here and will
140 need reallocation in nbnxn_gpu_init_atomdata */
144 /* size -1 indicates that the respective array hasn't been initialized yet */
149 /*! \brief Copies all parameters related to the cut-off from ic to nbp
151 static void set_cutoff_parameters(NBParamGpu
* nbp
, const interaction_const_t
* ic
, const PairlistParams
& listParams
)
153 nbp
->ewald_beta
= ic
->ewaldcoeff_q
;
154 nbp
->sh_ewald
= ic
->sh_ewald
;
155 nbp
->epsfac
= ic
->epsfac
;
156 nbp
->two_k_rf
= 2.0 * ic
->k_rf
;
157 nbp
->c_rf
= ic
->c_rf
;
158 nbp
->rvdw_sq
= ic
->rvdw
* ic
->rvdw
;
159 nbp
->rcoulomb_sq
= ic
->rcoulomb
* ic
->rcoulomb
;
160 nbp
->rlistOuter_sq
= listParams
.rlistOuter
* listParams
.rlistOuter
;
161 nbp
->rlistInner_sq
= listParams
.rlistInner
* listParams
.rlistInner
;
162 nbp
->useDynamicPruning
= listParams
.useDynamicPruning
;
164 nbp
->sh_lj_ewald
= ic
->sh_lj_ewald
;
165 nbp
->ewaldcoeff_lj
= ic
->ewaldcoeff_lj
;
167 nbp
->rvdw_switch
= ic
->rvdw_switch
;
168 nbp
->dispersion_shift
= ic
->dispersion_shift
;
169 nbp
->repulsion_shift
= ic
->repulsion_shift
;
170 nbp
->vdw_switch
= ic
->vdw_switch
;
173 /*! \brief Returns the kinds of electrostatics and Vdw OpenCL
174 * kernels that will be used.
176 * Respectively, these values are from enum eelOcl and enum
178 static void map_interaction_types_to_gpu_kernel_flavors(const interaction_const_t
* ic
,
183 if (ic
->vdwtype
== evdwCUT
)
185 switch (ic
->vdw_modifier
)
188 case eintmodPOTSHIFT
:
191 case ljcrNONE
: *gpu_vdwtype
= evdwTypeCUT
; break;
192 case ljcrGEOM
: *gpu_vdwtype
= evdwTypeCUTCOMBGEOM
; break;
193 case ljcrLB
: *gpu_vdwtype
= evdwTypeCUTCOMBLB
; break;
196 "The requested LJ combination rule is not implemented in the "
197 "OpenCL GPU accelerated kernels!");
200 case eintmodFORCESWITCH
: *gpu_vdwtype
= evdwTypeFSWITCH
; break;
201 case eintmodPOTSWITCH
: *gpu_vdwtype
= evdwTypePSWITCH
; break;
204 "The requested VdW interaction modifier is not implemented in the GPU "
205 "accelerated kernels!");
208 else if (ic
->vdwtype
== evdwPME
)
210 if (ic
->ljpme_comb_rule
== ljcrGEOM
)
212 *gpu_vdwtype
= evdwTypeEWALDGEOM
;
216 *gpu_vdwtype
= evdwTypeEWALDLB
;
221 gmx_incons("The requested VdW type is not implemented in the GPU accelerated kernels!");
224 if (ic
->eeltype
== eelCUT
)
226 *gpu_eeltype
= eelTypeCUT
;
228 else if (EEL_RF(ic
->eeltype
))
230 *gpu_eeltype
= eelTypeRF
;
232 else if ((EEL_PME(ic
->eeltype
) || ic
->eeltype
== eelEWALD
))
234 *gpu_eeltype
= nbnxn_gpu_pick_ewald_kernel_type(*ic
);
238 /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
240 "The requested electrostatics type is not implemented in the GPU accelerated "
245 /*! \brief Initializes the nonbonded parameter data structure.
247 static void init_nbparam(NBParamGpu
* nbp
,
248 const interaction_const_t
* ic
,
249 const PairlistParams
& listParams
,
250 const nbnxn_atomdata_t::Params
& nbatParams
,
251 const DeviceContext
& deviceContext
)
253 set_cutoff_parameters(nbp
, ic
, listParams
);
255 map_interaction_types_to_gpu_kernel_flavors(ic
, nbatParams
.comb_rule
, &(nbp
->eeltype
), &(nbp
->vdwtype
));
257 if (ic
->vdwtype
== evdwPME
)
259 if (ic
->ljpme_comb_rule
== ljcrGEOM
)
261 GMX_ASSERT(nbatParams
.comb_rule
== ljcrGEOM
, "Combination rule mismatch!");
265 GMX_ASSERT(nbatParams
.comb_rule
== ljcrLB
, "Combination rule mismatch!");
268 /* generate table for PME */
269 nbp
->coulomb_tab
= nullptr;
270 if (nbp
->eeltype
== eelTypeEWALD_TAB
|| nbp
->eeltype
== eelTypeEWALD_TAB_TWIN
)
272 GMX_RELEASE_ASSERT(ic
->coulombEwaldTables
, "Need valid Coulomb Ewald correction tables");
273 init_ewald_coulomb_force_table(*ic
->coulombEwaldTables
, nbp
, deviceContext
);
277 allocateDeviceBuffer(&nbp
->coulomb_tab
, 1, deviceContext
);
280 const int nnbfp
= 2 * nbatParams
.numTypes
* nbatParams
.numTypes
;
281 const int nnbfp_comb
= 2 * nbatParams
.numTypes
;
284 /* set up LJ parameter lookup table */
285 DeviceBuffer
<real
> nbfp
;
286 initParamLookupTable(&nbfp
, nullptr, nbatParams
.nbfp
.data(), nnbfp
, deviceContext
);
289 if (ic
->vdwtype
== evdwPME
)
291 DeviceBuffer
<float> nbfp_comb
;
292 initParamLookupTable(&nbfp_comb
, nullptr, nbatParams
.nbfp_comb
.data(), nnbfp_comb
, deviceContext
);
293 nbp
->nbfp_comb
= nbfp_comb
;
298 //! This function is documented in the header file
299 void gpu_pme_loadbal_update_param(const nonbonded_verlet_t
* nbv
, const interaction_const_t
* ic
)
301 if (!nbv
|| !nbv
->useGpu())
305 NbnxmGpu
* nb
= nbv
->gpu_nbv
;
306 NBParamGpu
* nbp
= nb
->nbparam
;
308 set_cutoff_parameters(nbp
, ic
, nbv
->pairlistSets().params());
310 nbp
->eeltype
= nbnxn_gpu_pick_ewald_kernel_type(*ic
);
312 GMX_RELEASE_ASSERT(ic
->coulombEwaldTables
, "Need valid Coulomb Ewald correction tables");
313 init_ewald_coulomb_force_table(*ic
->coulombEwaldTables
, nbp
, *nb
->deviceContext_
);
316 /*! \brief Initializes the pair list data structure.
318 static void init_plist(cl_plist_t
* pl
)
320 /* initialize to nullptr pointers to data that is not allocated here and will
321 need reallocation in nbnxn_gpu_init_pairlist */
327 /* size -1 indicates that the respective array hasn't been initialized yet */
334 pl
->imask_nalloc
= -1;
336 pl
->excl_nalloc
= -1;
337 pl
->haveFreshList
= false;
340 /*! \brief Initializes the timings data structure.
342 static void init_timings(gmx_wallclock_gpu_nbnxn_t
* t
)
351 for (i
= 0; i
< 2; i
++)
353 for (j
= 0; j
< 2; j
++)
355 t
->ktime
[i
][j
].t
= 0.0;
356 t
->ktime
[i
][j
].c
= 0;
361 t
->pruneTime
.t
= 0.0;
362 t
->dynamicPruneTime
.c
= 0;
363 t
->dynamicPruneTime
.t
= 0.0;
366 /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
367 static cl_kernel
nbnxn_gpu_create_kernel(NbnxmGpu
* nb
, const char* kernel_name
)
372 kernel
= clCreateKernel(nb
->dev_rundata
->program
, kernel_name
, &cl_error
);
373 if (CL_SUCCESS
!= cl_error
)
375 gmx_fatal(FARGS
, "Failed to create kernel '%s' for GPU #%s: OpenCL error %d", kernel_name
,
376 nb
->deviceContext_
->deviceInfo().device_name
, cl_error
);
382 /*! \brief Clears nonbonded shift force output array and energy outputs on the GPU.
384 static void nbnxn_ocl_clear_e_fshift(NbnxmGpu
* nb
)
388 cl_atomdata_t
* adat
= nb
->atdat
;
389 cl_command_queue ls
= nb
->deviceStreams
[InteractionLocality::Local
]->stream();
391 size_t local_work_size
[3] = { 1, 1, 1 };
392 size_t global_work_size
[3] = { 1, 1, 1 };
394 cl_int shifts
= SHIFTS
* 3;
398 cl_kernel zero_e_fshift
= nb
->kernel_zero_e_fshift
;
400 local_work_size
[0] = 64;
401 // Round the total number of threads up from the array size
402 global_work_size
[0] = ((shifts
+ local_work_size
[0] - 1) / local_work_size
[0]) * local_work_size
[0];
405 cl_error
= clSetKernelArg(zero_e_fshift
, arg_no
++, sizeof(cl_mem
), &(adat
->fshift
));
406 cl_error
|= clSetKernelArg(zero_e_fshift
, arg_no
++, sizeof(cl_mem
), &(adat
->e_lj
));
407 cl_error
|= clSetKernelArg(zero_e_fshift
, arg_no
++, sizeof(cl_mem
), &(adat
->e_el
));
408 cl_error
|= clSetKernelArg(zero_e_fshift
, arg_no
++, sizeof(cl_uint
), &shifts
);
409 GMX_ASSERT(cl_error
== CL_SUCCESS
, ocl_get_error_string(cl_error
).c_str());
411 cl_error
= clEnqueueNDRangeKernel(ls
, zero_e_fshift
, 3, nullptr, global_work_size
,
412 local_work_size
, 0, nullptr, nullptr);
413 GMX_ASSERT(cl_error
== CL_SUCCESS
, ocl_get_error_string(cl_error
).c_str());
416 /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
417 static void nbnxn_gpu_init_kernels(NbnxmGpu
* nb
)
419 /* Init to 0 main kernel arrays */
420 /* They will be later on initialized in select_nbnxn_kernel */
421 // TODO: consider always creating all variants of the kernels here so that there is no
422 // need for late call to clCreateKernel -- if that gives any advantage?
423 memset(nb
->kernel_ener_noprune_ptr
, 0, sizeof(nb
->kernel_ener_noprune_ptr
));
424 memset(nb
->kernel_ener_prune_ptr
, 0, sizeof(nb
->kernel_ener_prune_ptr
));
425 memset(nb
->kernel_noener_noprune_ptr
, 0, sizeof(nb
->kernel_noener_noprune_ptr
));
426 memset(nb
->kernel_noener_prune_ptr
, 0, sizeof(nb
->kernel_noener_prune_ptr
));
428 /* Init pruning kernels
430 * TODO: we could avoid creating kernels if dynamic pruning is turned off,
431 * but ATM that depends on force flags not passed into the initialization.
433 nb
->kernel_pruneonly
[epruneFirst
] = nbnxn_gpu_create_kernel(nb
, "nbnxn_kernel_prune_opencl");
434 nb
->kernel_pruneonly
[epruneRolling
] =
435 nbnxn_gpu_create_kernel(nb
, "nbnxn_kernel_prune_rolling_opencl");
437 /* Init auxiliary kernels */
438 nb
->kernel_zero_e_fshift
= nbnxn_gpu_create_kernel(nb
, "zero_e_fshift");
441 /*! \brief Initializes simulation constant data.
443 * Initializes members of the atomdata and nbparam structs and
444 * clears e/fshift output buffers.
446 static void nbnxn_ocl_init_const(cl_atomdata_t
* atomData
,
447 NBParamGpu
* nbParams
,
448 const interaction_const_t
* ic
,
449 const PairlistParams
& listParams
,
450 const nbnxn_atomdata_t::Params
& nbatParams
,
451 const DeviceContext
& deviceContext
)
453 init_atomdata_first(atomData
, nbatParams
.numTypes
, deviceContext
);
454 init_nbparam(nbParams
, ic
, listParams
, nbatParams
, deviceContext
);
458 //! This function is documented in the header file
459 NbnxmGpu
* gpu_init(const gmx::DeviceStreamManager
& deviceStreamManager
,
460 const interaction_const_t
* ic
,
461 const PairlistParams
& listParams
,
462 const nbnxn_atomdata_t
* nbat
,
463 const bool bLocalAndNonlocal
)
465 GMX_ASSERT(ic
, "Need a valid interaction constants object");
467 auto nb
= new NbnxmGpu();
468 nb
->deviceContext_
= &deviceStreamManager
.context();
470 snew(nb
->nbparam
, 1);
471 snew(nb
->plist
[InteractionLocality::Local
], 1);
472 if (bLocalAndNonlocal
)
474 snew(nb
->plist
[InteractionLocality::NonLocal
], 1);
477 nb
->bUseTwoStreams
= bLocalAndNonlocal
;
479 nb
->timers
= new cl_timers_t();
480 snew(nb
->timings
, 1);
482 /* set device info, just point it to the right GPU among the detected ones */
483 nb
->dev_rundata
= new gmx_device_runtime_data_t();
486 pmalloc(reinterpret_cast<void**>(&nb
->nbst
.e_lj
), sizeof(*nb
->nbst
.e_lj
));
487 pmalloc(reinterpret_cast<void**>(&nb
->nbst
.e_el
), sizeof(*nb
->nbst
.e_el
));
488 pmalloc(reinterpret_cast<void**>(&nb
->nbst
.fshift
), SHIFTS
* sizeof(*nb
->nbst
.fshift
));
490 init_plist(nb
->plist
[InteractionLocality::Local
]);
492 /* OpenCL timing disabled if GMX_DISABLE_GPU_TIMING is defined. */
493 nb
->bDoTime
= (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
495 /* local/non-local GPU streams */
496 GMX_RELEASE_ASSERT(deviceStreamManager
.streamIsValid(gmx::DeviceStreamType::NonBondedLocal
),
497 "Local non-bonded stream should be initialized to use GPU for non-bonded.");
498 nb
->deviceStreams
[InteractionLocality::Local
] =
499 &deviceStreamManager
.stream(gmx::DeviceStreamType::NonBondedLocal
);
501 if (nb
->bUseTwoStreams
)
503 init_plist(nb
->plist
[InteractionLocality::NonLocal
]);
505 GMX_RELEASE_ASSERT(deviceStreamManager
.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal
),
506 "Non-local non-bonded stream should be initialized to use GPU for "
507 "non-bonded with domain decomposition.");
508 nb
->deviceStreams
[InteractionLocality::NonLocal
] =
509 &deviceStreamManager
.stream(gmx::DeviceStreamType::NonBondedNonLocal
);
514 init_timings(nb
->timings
);
517 nbnxn_ocl_init_const(nb
->atdat
, nb
->nbparam
, ic
, listParams
, nbat
->params(), *nb
->deviceContext_
);
519 /* Enable LJ param manual prefetch for AMD or Intel or if we request through env. var.
520 * TODO: decide about NVIDIA
522 nb
->bPrefetchLjParam
= (getenv("GMX_OCL_DISABLE_I_PREFETCH") == nullptr)
523 && ((nb
->deviceContext_
->deviceInfo().deviceVendor
== DeviceVendor::Amd
)
524 || (nb
->deviceContext_
->deviceInfo().deviceVendor
== DeviceVendor::Intel
)
525 || (getenv("GMX_OCL_ENABLE_I_PREFETCH") != nullptr));
527 /* NOTE: in CUDA we pick L1 cache configuration for the nbnxn kernels here,
528 * but sadly this is not supported in OpenCL (yet?). Consider adding it if
529 * it becomes supported.
531 nbnxn_gpu_compile_kernels(nb
);
532 nbnxn_gpu_init_kernels(nb
);
534 /* clear energy and shift force outputs */
535 nbnxn_ocl_clear_e_fshift(nb
);
539 fprintf(debug
, "Initialized OpenCL data structures.\n");
545 /*! \brief Clears the first natoms_clear elements of the GPU nonbonded force output array.
547 static void nbnxn_ocl_clear_f(NbnxmGpu
* nb
, int natoms_clear
)
549 if (natoms_clear
== 0)
554 cl_atomdata_t
* atomData
= nb
->atdat
;
555 const DeviceStream
& localStream
= *nb
->deviceStreams
[InteractionLocality::Local
];
557 clearDeviceBufferAsync(&atomData
->f
, 0, natoms_clear
* DIM
, localStream
);
560 //! This function is documented in the header file
561 void gpu_clear_outputs(NbnxmGpu
* nb
, bool computeVirial
)
563 nbnxn_ocl_clear_f(nb
, nb
->atdat
->natoms
);
564 /* clear shift force array and energies if the outputs were
565 used in the current step */
568 nbnxn_ocl_clear_e_fshift(nb
);
571 /* kick off buffer clearing kernel to ensure concurrency with constraints/update */
572 cl_int gmx_unused cl_error
;
573 cl_error
= clFlush(nb
->deviceStreams
[InteractionLocality::Local
]->stream());
574 GMX_ASSERT(cl_error
== CL_SUCCESS
, ("clFlush failed: " + ocl_get_error_string(cl_error
)).c_str());
577 //! This function is documented in the header file
578 void gpu_init_pairlist(NbnxmGpu
* nb
, const NbnxnPairlistGpu
* h_plist
, const InteractionLocality iloc
)
581 // Timing accumulation should happen only if there was work to do
582 // because getLastRangeTime() gets skipped with empty lists later
583 // which leads to the counter not being reset.
584 bool bDoTime
= (nb
->bDoTime
&& !h_plist
->sci
.empty());
585 const DeviceStream
& deviceStream
= *nb
->deviceStreams
[iloc
];
586 cl_plist_t
* d_plist
= nb
->plist
[iloc
];
588 if (d_plist
->na_c
< 0)
590 d_plist
->na_c
= h_plist
->na_ci
;
594 if (d_plist
->na_c
!= h_plist
->na_ci
)
596 sprintf(sbuf
, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
597 d_plist
->na_c
, h_plist
->na_ci
);
602 gpu_timers_t::Interaction
& iTimers
= nb
->timers
->interaction
[iloc
];
606 iTimers
.pl_h2d
.openTimingRegion(deviceStream
);
607 iTimers
.didPairlistH2D
= true;
610 // TODO most of this function is same in CUDA and OpenCL, move into the header
611 const DeviceContext
& deviceContext
= *nb
->deviceContext_
;
613 reallocateDeviceBuffer(&d_plist
->sci
, h_plist
->sci
.size(), &d_plist
->nsci
, &d_plist
->sci_nalloc
,
615 copyToDeviceBuffer(&d_plist
->sci
, h_plist
->sci
.data(), 0, h_plist
->sci
.size(), deviceStream
,
616 GpuApiCallBehavior::Async
, bDoTime
? iTimers
.pl_h2d
.fetchNextEvent() : nullptr);
618 reallocateDeviceBuffer(&d_plist
->cj4
, h_plist
->cj4
.size(), &d_plist
->ncj4
, &d_plist
->cj4_nalloc
,
620 copyToDeviceBuffer(&d_plist
->cj4
, h_plist
->cj4
.data(), 0, h_plist
->cj4
.size(), deviceStream
,
621 GpuApiCallBehavior::Async
, bDoTime
? iTimers
.pl_h2d
.fetchNextEvent() : nullptr);
623 reallocateDeviceBuffer(&d_plist
->imask
, h_plist
->cj4
.size() * c_nbnxnGpuClusterpairSplit
,
624 &d_plist
->nimask
, &d_plist
->imask_nalloc
, deviceContext
);
626 reallocateDeviceBuffer(&d_plist
->excl
, h_plist
->excl
.size(), &d_plist
->nexcl
,
627 &d_plist
->excl_nalloc
, deviceContext
);
628 copyToDeviceBuffer(&d_plist
->excl
, h_plist
->excl
.data(), 0, h_plist
->excl
.size(), deviceStream
,
629 GpuApiCallBehavior::Async
, bDoTime
? iTimers
.pl_h2d
.fetchNextEvent() : nullptr);
633 iTimers
.pl_h2d
.closeTimingRegion(deviceStream
);
636 /* need to prune the pair list during the next step */
637 d_plist
->haveFreshList
= true;
640 //! This function is documented in the header file
641 void gpu_upload_shiftvec(NbnxmGpu
* nb
, const nbnxn_atomdata_t
* nbatom
)
643 cl_atomdata_t
* adat
= nb
->atdat
;
644 const DeviceStream
& deviceStream
= *nb
->deviceStreams
[InteractionLocality::Local
];
646 /* only if we have a dynamic box */
647 if (nbatom
->bDynamicBox
|| !adat
->bShiftVecUploaded
)
649 GMX_ASSERT(sizeof(float) * DIM
== sizeof(*nbatom
->shift_vec
.data()),
650 "Sizes of host- and device-side shift vectors should be the same.");
651 copyToDeviceBuffer(&adat
->shift_vec
, reinterpret_cast<const float*>(nbatom
->shift_vec
.data()),
652 0, SHIFTS
* DIM
, deviceStream
, GpuApiCallBehavior::Async
, nullptr);
653 adat
->bShiftVecUploaded
= CL_TRUE
;
657 //! This function is documented in the header file
658 void gpu_init_atomdata(NbnxmGpu
* nb
, const nbnxn_atomdata_t
* nbat
)
663 bool bDoTime
= nb
->bDoTime
;
664 cl_timers_t
* timers
= nb
->timers
;
665 cl_atomdata_t
* d_atdat
= nb
->atdat
;
666 const DeviceContext
& deviceContext
= *nb
->deviceContext_
;
667 const DeviceStream
& deviceStream
= *nb
->deviceStreams
[InteractionLocality::Local
];
669 natoms
= nbat
->numAtoms();
674 /* time async copy */
675 timers
->atdat
.openTimingRegion(deviceStream
);
678 /* need to reallocate if we have to copy more atoms than the amount of space
679 available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
680 if (natoms
> d_atdat
->nalloc
)
682 nalloc
= over_alloc_small(natoms
);
684 /* free up first if the arrays have already been initialized */
685 if (d_atdat
->nalloc
!= -1)
687 freeDeviceBuffer(&d_atdat
->f
);
688 freeDeviceBuffer(&d_atdat
->xq
);
689 freeDeviceBuffer(&d_atdat
->lj_comb
);
690 freeDeviceBuffer(&d_atdat
->atom_types
);
694 allocateDeviceBuffer(&d_atdat
->f
, nalloc
* DIM
, deviceContext
);
695 allocateDeviceBuffer(&d_atdat
->xq
, nalloc
* (DIM
+ 1), deviceContext
);
697 if (useLjCombRule(nb
->nbparam
->vdwtype
))
699 // Two Lennard-Jones parameters per atom
700 allocateDeviceBuffer(&d_atdat
->lj_comb
, nalloc
* 2, deviceContext
);
704 allocateDeviceBuffer(&d_atdat
->atom_types
, nalloc
, deviceContext
);
707 d_atdat
->nalloc
= nalloc
;
711 d_atdat
->natoms
= natoms
;
712 d_atdat
->natoms_local
= nbat
->natoms_local
;
714 /* need to clear GPU f output if realloc happened */
717 nbnxn_ocl_clear_f(nb
, nalloc
);
720 if (useLjCombRule(nb
->nbparam
->vdwtype
))
722 GMX_ASSERT(sizeof(float) == sizeof(*nbat
->params().lj_comb
.data()),
723 "Size of the LJ parameters element should be equal to the size of float2.");
724 copyToDeviceBuffer(&d_atdat
->lj_comb
, nbat
->params().lj_comb
.data(), 0, 2 * natoms
,
725 deviceStream
, GpuApiCallBehavior::Async
,
726 bDoTime
? timers
->atdat
.fetchNextEvent() : nullptr);
730 GMX_ASSERT(sizeof(int) == sizeof(*nbat
->params().type
.data()),
731 "Sizes of host- and device-side atom types should be the same.");
732 copyToDeviceBuffer(&d_atdat
->atom_types
, nbat
->params().type
.data(), 0, natoms
, deviceStream
,
733 GpuApiCallBehavior::Async
, bDoTime
? timers
->atdat
.fetchNextEvent() : nullptr);
738 timers
->atdat
.closeTimingRegion(deviceStream
);
741 /* kick off the tasks enqueued above to ensure concurrency with the search */
742 cl_error
= clFlush(deviceStream
.stream());
743 GMX_RELEASE_ASSERT(cl_error
== CL_SUCCESS
,
744 ("clFlush failed: " + ocl_get_error_string(cl_error
)).c_str());
747 /*! \brief Releases an OpenCL kernel pointer */
748 static void free_kernel(cl_kernel
* kernel_ptr
)
750 cl_int gmx_unused cl_error
;
752 GMX_ASSERT(kernel_ptr
, "Need a valid kernel pointer");
756 cl_error
= clReleaseKernel(*kernel_ptr
);
757 GMX_RELEASE_ASSERT(cl_error
== CL_SUCCESS
,
758 ("clReleaseKernel failed: " + ocl_get_error_string(cl_error
)).c_str());
760 *kernel_ptr
= nullptr;
764 /*! \brief Releases a list of OpenCL kernel pointers */
765 static void free_kernels(cl_kernel
* kernels
, int count
)
769 for (i
= 0; i
< count
; i
++)
771 free_kernel(kernels
+ i
);
775 /*! \brief Free the OpenCL program.
777 * The function releases the OpenCL program assuciated with the
778 * device that the calling PP rank is running on.
780 * \param program [in] OpenCL program to release.
782 static void freeGpuProgram(cl_program program
)
786 cl_int cl_error
= clReleaseProgram(program
);
787 GMX_RELEASE_ASSERT(cl_error
== CL_SUCCESS
,
788 ("clReleaseProgram failed: " + ocl_get_error_string(cl_error
)).c_str());
793 //! This function is documented in the header file
794 void gpu_free(NbnxmGpu
* nb
)
802 int kernel_count
= sizeof(nb
->kernel_ener_noprune_ptr
) / sizeof(nb
->kernel_ener_noprune_ptr
[0][0]);
803 free_kernels(nb
->kernel_ener_noprune_ptr
[0], kernel_count
);
805 kernel_count
= sizeof(nb
->kernel_ener_prune_ptr
) / sizeof(nb
->kernel_ener_prune_ptr
[0][0]);
806 free_kernels(nb
->kernel_ener_prune_ptr
[0], kernel_count
);
808 kernel_count
= sizeof(nb
->kernel_noener_noprune_ptr
) / sizeof(nb
->kernel_noener_noprune_ptr
[0][0]);
809 free_kernels(nb
->kernel_noener_noprune_ptr
[0], kernel_count
);
811 kernel_count
= sizeof(nb
->kernel_noener_prune_ptr
) / sizeof(nb
->kernel_noener_prune_ptr
[0][0]);
812 free_kernels(nb
->kernel_noener_prune_ptr
[0], kernel_count
);
814 free_kernel(&(nb
->kernel_zero_e_fshift
));
817 freeDeviceBuffer(&(nb
->atdat
->xq
));
818 freeDeviceBuffer(&(nb
->atdat
->f
));
819 freeDeviceBuffer(&(nb
->atdat
->e_lj
));
820 freeDeviceBuffer(&(nb
->atdat
->e_el
));
821 freeDeviceBuffer(&(nb
->atdat
->fshift
));
822 freeDeviceBuffer(&(nb
->atdat
->lj_comb
));
823 freeDeviceBuffer(&(nb
->atdat
->atom_types
));
824 freeDeviceBuffer(&(nb
->atdat
->shift_vec
));
828 freeDeviceBuffer(&(nb
->nbparam
->nbfp
));
829 freeDeviceBuffer(&(nb
->nbparam
->nbfp_comb
));
830 freeDeviceBuffer(&(nb
->nbparam
->coulomb_tab
));
834 auto* plist
= nb
->plist
[InteractionLocality::Local
];
835 freeDeviceBuffer(&plist
->sci
);
836 freeDeviceBuffer(&plist
->cj4
);
837 freeDeviceBuffer(&plist
->imask
);
838 freeDeviceBuffer(&plist
->excl
);
840 if (nb
->bUseTwoStreams
)
842 auto* plist_nl
= nb
->plist
[InteractionLocality::NonLocal
];
843 freeDeviceBuffer(&plist_nl
->sci
);
844 freeDeviceBuffer(&plist_nl
->cj4
);
845 freeDeviceBuffer(&plist_nl
->imask
);
846 freeDeviceBuffer(&plist_nl
->excl
);
851 pfree(nb
->nbst
.e_lj
);
852 nb
->nbst
.e_lj
= nullptr;
854 pfree(nb
->nbst
.e_el
);
855 nb
->nbst
.e_el
= nullptr;
857 pfree(nb
->nbst
.fshift
);
858 nb
->nbst
.fshift
= nullptr;
860 /* Free other events */
861 if (nb
->nonlocal_done
)
863 clReleaseEvent(nb
->nonlocal_done
);
864 nb
->nonlocal_done
= nullptr;
866 if (nb
->misc_ops_and_local_H2D_done
)
868 clReleaseEvent(nb
->misc_ops_and_local_H2D_done
);
869 nb
->misc_ops_and_local_H2D_done
= nullptr;
872 freeGpuProgram(nb
->dev_rundata
->program
);
873 delete nb
->dev_rundata
;
875 /* Free timers and timings */
882 fprintf(debug
, "Cleaned up OpenCL data structures.\n");
886 //! This function is documented in the header file
887 gmx_wallclock_gpu_nbnxn_t
* gpu_get_timings(NbnxmGpu
* nb
)
889 return (nb
!= nullptr && nb
->bDoTime
) ? nb
->timings
: nullptr;
892 //! This function is documented in the header file
893 void gpu_reset_timings(nonbonded_verlet_t
* nbv
)
895 if (nbv
->gpu_nbv
&& nbv
->gpu_nbv
->bDoTime
)
897 init_timings(nbv
->gpu_nbv
->timings
);
901 //! This function is documented in the header file
902 int gpu_min_ci_balanced(NbnxmGpu
* nb
)
904 return nb
!= nullptr ? gpu_min_ci_balanced_factor
* nb
->deviceContext_
->deviceInfo().compute_units
: 0;
907 //! This function is documented in the header file
908 gmx_bool
gpu_is_kernel_ewald_analytical(const NbnxmGpu
* nb
)
910 return ((nb
->nbparam
->eeltype
== eelTypeEWALD_ANA
) || (nb
->nbparam
->eeltype
== eelTypeEWALD_ANA_TWIN
));