2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
5 * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Define OpenCL implementation of nbnxm_gpu_data_mgmt.h
39 * \author Anca Hamuraru <anca@streamcomputing.eu>
40 * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
41 * \author Teemu Virolainen <teemu@streamcomputing.eu>
42 * \author Szilárd Páll <pall.szilard@gmail.com>
43 * \ingroup module_nbnxm
55 #include "gromacs/gpu_utils/device_stream_manager.h"
56 #include "gromacs/gpu_utils/oclutils.h"
57 #include "gromacs/hardware/device_information.h"
58 #include "gromacs/hardware/gpu_hw_info.h"
59 #include "gromacs/math/vectypes.h"
60 #include "gromacs/mdlib/force_flags.h"
61 #include "gromacs/mdtypes/interaction_const.h"
62 #include "gromacs/mdtypes/md_enums.h"
63 #include "gromacs/nbnxm/atomdata.h"
64 #include "gromacs/nbnxm/gpu_data_mgmt.h"
65 #include "gromacs/nbnxm/gpu_jit_support.h"
66 #include "gromacs/nbnxm/nbnxm.h"
67 #include "gromacs/nbnxm/nbnxm_gpu.h"
68 #include "gromacs/nbnxm/nbnxm_gpu_data_mgmt.h"
69 #include "gromacs/nbnxm/pairlistsets.h"
70 #include "gromacs/pbcutil/ishift.h"
71 #include "gromacs/timing/gpu_timing.h"
72 #include "gromacs/utility/cstringutil.h"
73 #include "gromacs/utility/fatalerror.h"
74 #include "gromacs/utility/gmxassert.h"
75 #include "gromacs/utility/real.h"
76 #include "gromacs/utility/smalloc.h"
78 #include "nbnxm_ocl_types.h"
83 /*! \brief Copies of values from cl_driver_diagnostics_intel.h,
84 * which isn't guaranteed to be available. */
86 #define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106
87 #define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL 0x1
88 #define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL 0x2
89 #define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL 0x4
92 /*! \brief This parameter should be determined heuristically from the
93 * kernel execution times
95 * This value is best for small systems on a single AMD Radeon R9 290X
96 * (and about 5% faster than 40, which is the default for CUDA
97 * devices). Larger simulation systems were quite insensitive to the
98 * value of this parameter.
100 static unsigned int gpu_min_ci_balanced_factor
= 50;
103 /*! \brief Initializes the atomdata structure first time, it only gets filled at
106 static void init_atomdata_first(cl_atomdata_t
* ad
, int ntypes
, const DeviceContext
& deviceContext
)
110 allocateDeviceBuffer(&ad
->shift_vec
, SHIFTS
* DIM
, deviceContext
);
111 ad
->bShiftVecUploaded
= CL_FALSE
;
113 allocateDeviceBuffer(&ad
->fshift
, SHIFTS
* DIM
, deviceContext
);
114 allocateDeviceBuffer(&ad
->e_lj
, 1, deviceContext
);
115 allocateDeviceBuffer(&ad
->e_el
, 1, deviceContext
);
117 /* initialize to nullptr pointers to data that is not allocated here and will
118 need reallocation in nbnxn_gpu_init_atomdata */
122 /* size -1 indicates that the respective array hasn't been initialized yet */
127 /*! \brief Returns the kinds of electrostatics and Vdw OpenCL
128 * kernels that will be used.
130 * Respectively, these values are from enum eelOcl and enum
132 static void map_interaction_types_to_gpu_kernel_flavors(const interaction_const_t
* ic
,
137 if (ic
->vdwtype
== evdwCUT
)
139 switch (ic
->vdw_modifier
)
142 case eintmodPOTSHIFT
:
145 case ljcrNONE
: *gpu_vdwtype
= evdwTypeCUT
; break;
146 case ljcrGEOM
: *gpu_vdwtype
= evdwTypeCUTCOMBGEOM
; break;
147 case ljcrLB
: *gpu_vdwtype
= evdwTypeCUTCOMBLB
; break;
150 "The requested LJ combination rule is not implemented in the "
151 "OpenCL GPU accelerated kernels!");
154 case eintmodFORCESWITCH
: *gpu_vdwtype
= evdwTypeFSWITCH
; break;
155 case eintmodPOTSWITCH
: *gpu_vdwtype
= evdwTypePSWITCH
; break;
158 "The requested VdW interaction modifier is not implemented in the GPU "
159 "accelerated kernels!");
162 else if (ic
->vdwtype
== evdwPME
)
164 if (ic
->ljpme_comb_rule
== ljcrGEOM
)
166 *gpu_vdwtype
= evdwTypeEWALDGEOM
;
170 *gpu_vdwtype
= evdwTypeEWALDLB
;
175 gmx_incons("The requested VdW type is not implemented in the GPU accelerated kernels!");
178 if (ic
->eeltype
== eelCUT
)
180 *gpu_eeltype
= eelTypeCUT
;
182 else if (EEL_RF(ic
->eeltype
))
184 *gpu_eeltype
= eelTypeRF
;
186 else if ((EEL_PME(ic
->eeltype
) || ic
->eeltype
== eelEWALD
))
188 *gpu_eeltype
= nbnxn_gpu_pick_ewald_kernel_type(*ic
);
192 /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
194 "The requested electrostatics type is not implemented in the GPU accelerated "
199 /*! \brief Initializes the nonbonded parameter data structure.
201 static void init_nbparam(NBParamGpu
* nbp
,
202 const interaction_const_t
* ic
,
203 const PairlistParams
& listParams
,
204 const nbnxn_atomdata_t::Params
& nbatParams
,
205 const DeviceContext
& deviceContext
)
207 set_cutoff_parameters(nbp
, ic
, listParams
);
209 map_interaction_types_to_gpu_kernel_flavors(ic
, nbatParams
.comb_rule
, &(nbp
->eeltype
), &(nbp
->vdwtype
));
211 if (ic
->vdwtype
== evdwPME
)
213 if (ic
->ljpme_comb_rule
== ljcrGEOM
)
215 GMX_ASSERT(nbatParams
.comb_rule
== ljcrGEOM
, "Combination rule mismatch!");
219 GMX_ASSERT(nbatParams
.comb_rule
== ljcrLB
, "Combination rule mismatch!");
222 /* generate table for PME */
223 nbp
->coulomb_tab
= nullptr;
224 if (nbp
->eeltype
== eelTypeEWALD_TAB
|| nbp
->eeltype
== eelTypeEWALD_TAB_TWIN
)
226 GMX_RELEASE_ASSERT(ic
->coulombEwaldTables
, "Need valid Coulomb Ewald correction tables");
227 init_ewald_coulomb_force_table(*ic
->coulombEwaldTables
, nbp
, deviceContext
);
231 allocateDeviceBuffer(&nbp
->coulomb_tab
, 1, deviceContext
);
234 const int nnbfp
= 2 * nbatParams
.numTypes
* nbatParams
.numTypes
;
235 const int nnbfp_comb
= 2 * nbatParams
.numTypes
;
238 /* set up LJ parameter lookup table */
239 DeviceBuffer
<real
> nbfp
;
240 initParamLookupTable(&nbfp
, nullptr, nbatParams
.nbfp
.data(), nnbfp
, deviceContext
);
243 if (ic
->vdwtype
== evdwPME
)
245 DeviceBuffer
<float> nbfp_comb
;
246 initParamLookupTable(&nbfp_comb
, nullptr, nbatParams
.nbfp_comb
.data(), nnbfp_comb
, deviceContext
);
247 nbp
->nbfp_comb
= nbfp_comb
;
252 //! This function is documented in the header file
253 void gpu_pme_loadbal_update_param(const nonbonded_verlet_t
* nbv
, const interaction_const_t
* ic
)
255 if (!nbv
|| !nbv
->useGpu())
259 NbnxmGpu
* nb
= nbv
->gpu_nbv
;
260 NBParamGpu
* nbp
= nb
->nbparam
;
262 set_cutoff_parameters(nbp
, ic
, nbv
->pairlistSets().params());
264 nbp
->eeltype
= nbnxn_gpu_pick_ewald_kernel_type(*ic
);
266 GMX_RELEASE_ASSERT(ic
->coulombEwaldTables
, "Need valid Coulomb Ewald correction tables");
267 init_ewald_coulomb_force_table(*ic
->coulombEwaldTables
, nbp
, *nb
->deviceContext_
);
270 /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
271 static cl_kernel
nbnxn_gpu_create_kernel(NbnxmGpu
* nb
, const char* kernel_name
)
276 kernel
= clCreateKernel(nb
->dev_rundata
->program
, kernel_name
, &cl_error
);
277 if (CL_SUCCESS
!= cl_error
)
279 gmx_fatal(FARGS
, "Failed to create kernel '%s' for GPU #%s: OpenCL error %d", kernel_name
,
280 nb
->deviceContext_
->deviceInfo().device_name
, cl_error
);
286 /*! \brief Clears nonbonded shift force output array and energy outputs on the GPU.
288 static void nbnxn_ocl_clear_e_fshift(NbnxmGpu
* nb
)
292 cl_atomdata_t
* adat
= nb
->atdat
;
293 cl_command_queue ls
= nb
->deviceStreams
[InteractionLocality::Local
]->stream();
295 size_t local_work_size
[3] = { 1, 1, 1 };
296 size_t global_work_size
[3] = { 1, 1, 1 };
298 cl_int shifts
= SHIFTS
* 3;
302 cl_kernel zero_e_fshift
= nb
->kernel_zero_e_fshift
;
304 local_work_size
[0] = 64;
305 // Round the total number of threads up from the array size
306 global_work_size
[0] = ((shifts
+ local_work_size
[0] - 1) / local_work_size
[0]) * local_work_size
[0];
309 cl_error
= clSetKernelArg(zero_e_fshift
, arg_no
++, sizeof(cl_mem
), &(adat
->fshift
));
310 cl_error
|= clSetKernelArg(zero_e_fshift
, arg_no
++, sizeof(cl_mem
), &(adat
->e_lj
));
311 cl_error
|= clSetKernelArg(zero_e_fshift
, arg_no
++, sizeof(cl_mem
), &(adat
->e_el
));
312 cl_error
|= clSetKernelArg(zero_e_fshift
, arg_no
++, sizeof(cl_uint
), &shifts
);
313 GMX_ASSERT(cl_error
== CL_SUCCESS
, ocl_get_error_string(cl_error
).c_str());
315 cl_error
= clEnqueueNDRangeKernel(ls
, zero_e_fshift
, 3, nullptr, global_work_size
,
316 local_work_size
, 0, nullptr, nullptr);
317 GMX_ASSERT(cl_error
== CL_SUCCESS
, ocl_get_error_string(cl_error
).c_str());
320 /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
321 static void nbnxn_gpu_init_kernels(NbnxmGpu
* nb
)
323 /* Init to 0 main kernel arrays */
324 /* They will be later on initialized in select_nbnxn_kernel */
325 // TODO: consider always creating all variants of the kernels here so that there is no
326 // need for late call to clCreateKernel -- if that gives any advantage?
327 memset(nb
->kernel_ener_noprune_ptr
, 0, sizeof(nb
->kernel_ener_noprune_ptr
));
328 memset(nb
->kernel_ener_prune_ptr
, 0, sizeof(nb
->kernel_ener_prune_ptr
));
329 memset(nb
->kernel_noener_noprune_ptr
, 0, sizeof(nb
->kernel_noener_noprune_ptr
));
330 memset(nb
->kernel_noener_prune_ptr
, 0, sizeof(nb
->kernel_noener_prune_ptr
));
332 /* Init pruning kernels
334 * TODO: we could avoid creating kernels if dynamic pruning is turned off,
335 * but ATM that depends on force flags not passed into the initialization.
337 nb
->kernel_pruneonly
[epruneFirst
] = nbnxn_gpu_create_kernel(nb
, "nbnxn_kernel_prune_opencl");
338 nb
->kernel_pruneonly
[epruneRolling
] =
339 nbnxn_gpu_create_kernel(nb
, "nbnxn_kernel_prune_rolling_opencl");
341 /* Init auxiliary kernels */
342 nb
->kernel_zero_e_fshift
= nbnxn_gpu_create_kernel(nb
, "zero_e_fshift");
345 /*! \brief Initializes simulation constant data.
347 * Initializes members of the atomdata and nbparam structs and
348 * clears e/fshift output buffers.
350 static void nbnxn_ocl_init_const(cl_atomdata_t
* atomData
,
351 NBParamGpu
* nbParams
,
352 const interaction_const_t
* ic
,
353 const PairlistParams
& listParams
,
354 const nbnxn_atomdata_t::Params
& nbatParams
,
355 const DeviceContext
& deviceContext
)
357 init_atomdata_first(atomData
, nbatParams
.numTypes
, deviceContext
);
358 init_nbparam(nbParams
, ic
, listParams
, nbatParams
, deviceContext
);
362 //! This function is documented in the header file
363 NbnxmGpu
* gpu_init(const gmx::DeviceStreamManager
& deviceStreamManager
,
364 const interaction_const_t
* ic
,
365 const PairlistParams
& listParams
,
366 const nbnxn_atomdata_t
* nbat
,
367 const bool bLocalAndNonlocal
)
369 GMX_ASSERT(ic
, "Need a valid interaction constants object");
371 auto nb
= new NbnxmGpu();
372 nb
->deviceContext_
= &deviceStreamManager
.context();
374 snew(nb
->nbparam
, 1);
375 snew(nb
->plist
[InteractionLocality::Local
], 1);
376 if (bLocalAndNonlocal
)
378 snew(nb
->plist
[InteractionLocality::NonLocal
], 1);
381 nb
->bUseTwoStreams
= bLocalAndNonlocal
;
383 nb
->timers
= new cl_timers_t();
384 snew(nb
->timings
, 1);
386 /* set device info, just point it to the right GPU among the detected ones */
387 nb
->dev_rundata
= new gmx_device_runtime_data_t();
390 pmalloc(reinterpret_cast<void**>(&nb
->nbst
.e_lj
), sizeof(*nb
->nbst
.e_lj
));
391 pmalloc(reinterpret_cast<void**>(&nb
->nbst
.e_el
), sizeof(*nb
->nbst
.e_el
));
392 pmalloc(reinterpret_cast<void**>(&nb
->nbst
.fshift
), SHIFTS
* sizeof(*nb
->nbst
.fshift
));
394 init_plist(nb
->plist
[InteractionLocality::Local
]);
396 /* OpenCL timing disabled if GMX_DISABLE_GPU_TIMING is defined. */
397 nb
->bDoTime
= (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
399 /* local/non-local GPU streams */
400 GMX_RELEASE_ASSERT(deviceStreamManager
.streamIsValid(gmx::DeviceStreamType::NonBondedLocal
),
401 "Local non-bonded stream should be initialized to use GPU for non-bonded.");
402 nb
->deviceStreams
[InteractionLocality::Local
] =
403 &deviceStreamManager
.stream(gmx::DeviceStreamType::NonBondedLocal
);
405 if (nb
->bUseTwoStreams
)
407 init_plist(nb
->plist
[InteractionLocality::NonLocal
]);
409 GMX_RELEASE_ASSERT(deviceStreamManager
.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal
),
410 "Non-local non-bonded stream should be initialized to use GPU for "
411 "non-bonded with domain decomposition.");
412 nb
->deviceStreams
[InteractionLocality::NonLocal
] =
413 &deviceStreamManager
.stream(gmx::DeviceStreamType::NonBondedNonLocal
);
418 init_timings(nb
->timings
);
421 nbnxn_ocl_init_const(nb
->atdat
, nb
->nbparam
, ic
, listParams
, nbat
->params(), *nb
->deviceContext_
);
423 /* Enable LJ param manual prefetch for AMD or Intel or if we request through env. var.
424 * TODO: decide about NVIDIA
426 nb
->bPrefetchLjParam
= (getenv("GMX_OCL_DISABLE_I_PREFETCH") == nullptr)
427 && ((nb
->deviceContext_
->deviceInfo().deviceVendor
== DeviceVendor::Amd
)
428 || (nb
->deviceContext_
->deviceInfo().deviceVendor
== DeviceVendor::Intel
)
429 || (getenv("GMX_OCL_ENABLE_I_PREFETCH") != nullptr));
431 /* NOTE: in CUDA we pick L1 cache configuration for the nbnxn kernels here,
432 * but sadly this is not supported in OpenCL (yet?). Consider adding it if
433 * it becomes supported.
435 nbnxn_gpu_compile_kernels(nb
);
436 nbnxn_gpu_init_kernels(nb
);
438 /* clear energy and shift force outputs */
439 nbnxn_ocl_clear_e_fshift(nb
);
443 fprintf(debug
, "Initialized OpenCL data structures.\n");
449 /*! \brief Clears the first natoms_clear elements of the GPU nonbonded force output array.
451 static void nbnxn_ocl_clear_f(NbnxmGpu
* nb
, int natoms_clear
)
453 if (natoms_clear
== 0)
458 cl_atomdata_t
* atomData
= nb
->atdat
;
459 const DeviceStream
& localStream
= *nb
->deviceStreams
[InteractionLocality::Local
];
461 clearDeviceBufferAsync(&atomData
->f
, 0, natoms_clear
* DIM
, localStream
);
464 //! This function is documented in the header file
465 void gpu_clear_outputs(NbnxmGpu
* nb
, bool computeVirial
)
467 nbnxn_ocl_clear_f(nb
, nb
->atdat
->natoms
);
468 /* clear shift force array and energies if the outputs were
469 used in the current step */
472 nbnxn_ocl_clear_e_fshift(nb
);
475 /* kick off buffer clearing kernel to ensure concurrency with constraints/update */
476 cl_int gmx_unused cl_error
;
477 cl_error
= clFlush(nb
->deviceStreams
[InteractionLocality::Local
]->stream());
478 GMX_ASSERT(cl_error
== CL_SUCCESS
, ("clFlush failed: " + ocl_get_error_string(cl_error
)).c_str());
481 //! This function is documented in the header file
482 void gpu_init_pairlist(NbnxmGpu
* nb
, const NbnxnPairlistGpu
* h_plist
, const InteractionLocality iloc
)
485 // Timing accumulation should happen only if there was work to do
486 // because getLastRangeTime() gets skipped with empty lists later
487 // which leads to the counter not being reset.
488 bool bDoTime
= (nb
->bDoTime
&& !h_plist
->sci
.empty());
489 const DeviceStream
& deviceStream
= *nb
->deviceStreams
[iloc
];
490 gpu_plist
* d_plist
= nb
->plist
[iloc
];
492 if (d_plist
->na_c
< 0)
494 d_plist
->na_c
= h_plist
->na_ci
;
498 if (d_plist
->na_c
!= h_plist
->na_ci
)
500 sprintf(sbuf
, "In init_plist: the #atoms per cell has changed (from %d to %d)",
501 d_plist
->na_c
, h_plist
->na_ci
);
506 gpu_timers_t::Interaction
& iTimers
= nb
->timers
->interaction
[iloc
];
510 iTimers
.pl_h2d
.openTimingRegion(deviceStream
);
511 iTimers
.didPairlistH2D
= true;
514 // TODO most of this function is same in CUDA and OpenCL, move into the header
515 const DeviceContext
& deviceContext
= *nb
->deviceContext_
;
517 reallocateDeviceBuffer(&d_plist
->sci
, h_plist
->sci
.size(), &d_plist
->nsci
, &d_plist
->sci_nalloc
,
519 copyToDeviceBuffer(&d_plist
->sci
, h_plist
->sci
.data(), 0, h_plist
->sci
.size(), deviceStream
,
520 GpuApiCallBehavior::Async
, bDoTime
? iTimers
.pl_h2d
.fetchNextEvent() : nullptr);
522 reallocateDeviceBuffer(&d_plist
->cj4
, h_plist
->cj4
.size(), &d_plist
->ncj4
, &d_plist
->cj4_nalloc
,
524 copyToDeviceBuffer(&d_plist
->cj4
, h_plist
->cj4
.data(), 0, h_plist
->cj4
.size(), deviceStream
,
525 GpuApiCallBehavior::Async
, bDoTime
? iTimers
.pl_h2d
.fetchNextEvent() : nullptr);
527 reallocateDeviceBuffer(&d_plist
->imask
, h_plist
->cj4
.size() * c_nbnxnGpuClusterpairSplit
,
528 &d_plist
->nimask
, &d_plist
->imask_nalloc
, deviceContext
);
530 reallocateDeviceBuffer(&d_plist
->excl
, h_plist
->excl
.size(), &d_plist
->nexcl
,
531 &d_plist
->excl_nalloc
, deviceContext
);
532 copyToDeviceBuffer(&d_plist
->excl
, h_plist
->excl
.data(), 0, h_plist
->excl
.size(), deviceStream
,
533 GpuApiCallBehavior::Async
, bDoTime
? iTimers
.pl_h2d
.fetchNextEvent() : nullptr);
537 iTimers
.pl_h2d
.closeTimingRegion(deviceStream
);
540 /* need to prune the pair list during the next step */
541 d_plist
->haveFreshList
= true;
544 //! This function is documented in the header file
545 void gpu_upload_shiftvec(NbnxmGpu
* nb
, const nbnxn_atomdata_t
* nbatom
)
547 cl_atomdata_t
* adat
= nb
->atdat
;
548 const DeviceStream
& deviceStream
= *nb
->deviceStreams
[InteractionLocality::Local
];
550 /* only if we have a dynamic box */
551 if (nbatom
->bDynamicBox
|| !adat
->bShiftVecUploaded
)
553 GMX_ASSERT(sizeof(float) * DIM
== sizeof(*nbatom
->shift_vec
.data()),
554 "Sizes of host- and device-side shift vectors should be the same.");
555 copyToDeviceBuffer(&adat
->shift_vec
, reinterpret_cast<const float*>(nbatom
->shift_vec
.data()),
556 0, SHIFTS
* DIM
, deviceStream
, GpuApiCallBehavior::Async
, nullptr);
557 adat
->bShiftVecUploaded
= CL_TRUE
;
561 //! This function is documented in the header file
562 void gpu_init_atomdata(NbnxmGpu
* nb
, const nbnxn_atomdata_t
* nbat
)
567 bool bDoTime
= nb
->bDoTime
;
568 cl_timers_t
* timers
= nb
->timers
;
569 cl_atomdata_t
* d_atdat
= nb
->atdat
;
570 const DeviceContext
& deviceContext
= *nb
->deviceContext_
;
571 const DeviceStream
& deviceStream
= *nb
->deviceStreams
[InteractionLocality::Local
];
573 natoms
= nbat
->numAtoms();
578 /* time async copy */
579 timers
->atdat
.openTimingRegion(deviceStream
);
582 /* need to reallocate if we have to copy more atoms than the amount of space
583 available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
584 if (natoms
> d_atdat
->nalloc
)
586 nalloc
= over_alloc_small(natoms
);
588 /* free up first if the arrays have already been initialized */
589 if (d_atdat
->nalloc
!= -1)
591 freeDeviceBuffer(&d_atdat
->f
);
592 freeDeviceBuffer(&d_atdat
->xq
);
593 freeDeviceBuffer(&d_atdat
->lj_comb
);
594 freeDeviceBuffer(&d_atdat
->atom_types
);
598 allocateDeviceBuffer(&d_atdat
->f
, nalloc
* DIM
, deviceContext
);
599 allocateDeviceBuffer(&d_atdat
->xq
, nalloc
* (DIM
+ 1), deviceContext
);
601 if (useLjCombRule(nb
->nbparam
->vdwtype
))
603 // Two Lennard-Jones parameters per atom
604 allocateDeviceBuffer(&d_atdat
->lj_comb
, nalloc
* 2, deviceContext
);
608 allocateDeviceBuffer(&d_atdat
->atom_types
, nalloc
, deviceContext
);
611 d_atdat
->nalloc
= nalloc
;
615 d_atdat
->natoms
= natoms
;
616 d_atdat
->natoms_local
= nbat
->natoms_local
;
618 /* need to clear GPU f output if realloc happened */
621 nbnxn_ocl_clear_f(nb
, nalloc
);
624 if (useLjCombRule(nb
->nbparam
->vdwtype
))
626 GMX_ASSERT(sizeof(float) == sizeof(*nbat
->params().lj_comb
.data()),
627 "Size of the LJ parameters element should be equal to the size of float2.");
628 copyToDeviceBuffer(&d_atdat
->lj_comb
, nbat
->params().lj_comb
.data(), 0, 2 * natoms
,
629 deviceStream
, GpuApiCallBehavior::Async
,
630 bDoTime
? timers
->atdat
.fetchNextEvent() : nullptr);
634 GMX_ASSERT(sizeof(int) == sizeof(*nbat
->params().type
.data()),
635 "Sizes of host- and device-side atom types should be the same.");
636 copyToDeviceBuffer(&d_atdat
->atom_types
, nbat
->params().type
.data(), 0, natoms
, deviceStream
,
637 GpuApiCallBehavior::Async
, bDoTime
? timers
->atdat
.fetchNextEvent() : nullptr);
642 timers
->atdat
.closeTimingRegion(deviceStream
);
645 /* kick off the tasks enqueued above to ensure concurrency with the search */
646 cl_error
= clFlush(deviceStream
.stream());
647 GMX_RELEASE_ASSERT(cl_error
== CL_SUCCESS
,
648 ("clFlush failed: " + ocl_get_error_string(cl_error
)).c_str());
651 /*! \brief Releases an OpenCL kernel pointer */
652 static void free_kernel(cl_kernel
* kernel_ptr
)
654 cl_int gmx_unused cl_error
;
656 GMX_ASSERT(kernel_ptr
, "Need a valid kernel pointer");
660 cl_error
= clReleaseKernel(*kernel_ptr
);
661 GMX_RELEASE_ASSERT(cl_error
== CL_SUCCESS
,
662 ("clReleaseKernel failed: " + ocl_get_error_string(cl_error
)).c_str());
664 *kernel_ptr
= nullptr;
668 /*! \brief Releases a list of OpenCL kernel pointers */
669 static void free_kernels(cl_kernel
* kernels
, int count
)
673 for (i
= 0; i
< count
; i
++)
675 free_kernel(kernels
+ i
);
679 /*! \brief Free the OpenCL program.
681 * The function releases the OpenCL program assuciated with the
682 * device that the calling PP rank is running on.
684 * \param program [in] OpenCL program to release.
686 static void freeGpuProgram(cl_program program
)
690 cl_int cl_error
= clReleaseProgram(program
);
691 GMX_RELEASE_ASSERT(cl_error
== CL_SUCCESS
,
692 ("clReleaseProgram failed: " + ocl_get_error_string(cl_error
)).c_str());
697 //! This function is documented in the header file
698 void gpu_free(NbnxmGpu
* nb
)
706 int kernel_count
= sizeof(nb
->kernel_ener_noprune_ptr
) / sizeof(nb
->kernel_ener_noprune_ptr
[0][0]);
707 free_kernels(nb
->kernel_ener_noprune_ptr
[0], kernel_count
);
709 kernel_count
= sizeof(nb
->kernel_ener_prune_ptr
) / sizeof(nb
->kernel_ener_prune_ptr
[0][0]);
710 free_kernels(nb
->kernel_ener_prune_ptr
[0], kernel_count
);
712 kernel_count
= sizeof(nb
->kernel_noener_noprune_ptr
) / sizeof(nb
->kernel_noener_noprune_ptr
[0][0]);
713 free_kernels(nb
->kernel_noener_noprune_ptr
[0], kernel_count
);
715 kernel_count
= sizeof(nb
->kernel_noener_prune_ptr
) / sizeof(nb
->kernel_noener_prune_ptr
[0][0]);
716 free_kernels(nb
->kernel_noener_prune_ptr
[0], kernel_count
);
718 free_kernel(&(nb
->kernel_zero_e_fshift
));
721 freeDeviceBuffer(&(nb
->atdat
->xq
));
722 freeDeviceBuffer(&(nb
->atdat
->f
));
723 freeDeviceBuffer(&(nb
->atdat
->e_lj
));
724 freeDeviceBuffer(&(nb
->atdat
->e_el
));
725 freeDeviceBuffer(&(nb
->atdat
->fshift
));
726 freeDeviceBuffer(&(nb
->atdat
->lj_comb
));
727 freeDeviceBuffer(&(nb
->atdat
->atom_types
));
728 freeDeviceBuffer(&(nb
->atdat
->shift_vec
));
732 freeDeviceBuffer(&(nb
->nbparam
->nbfp
));
733 freeDeviceBuffer(&(nb
->nbparam
->nbfp_comb
));
734 freeDeviceBuffer(&(nb
->nbparam
->coulomb_tab
));
738 auto* plist
= nb
->plist
[InteractionLocality::Local
];
739 freeDeviceBuffer(&plist
->sci
);
740 freeDeviceBuffer(&plist
->cj4
);
741 freeDeviceBuffer(&plist
->imask
);
742 freeDeviceBuffer(&plist
->excl
);
744 if (nb
->bUseTwoStreams
)
746 auto* plist_nl
= nb
->plist
[InteractionLocality::NonLocal
];
747 freeDeviceBuffer(&plist_nl
->sci
);
748 freeDeviceBuffer(&plist_nl
->cj4
);
749 freeDeviceBuffer(&plist_nl
->imask
);
750 freeDeviceBuffer(&plist_nl
->excl
);
755 pfree(nb
->nbst
.e_lj
);
756 nb
->nbst
.e_lj
= nullptr;
758 pfree(nb
->nbst
.e_el
);
759 nb
->nbst
.e_el
= nullptr;
761 pfree(nb
->nbst
.fshift
);
762 nb
->nbst
.fshift
= nullptr;
764 /* Free other events */
765 if (nb
->nonlocal_done
)
767 clReleaseEvent(nb
->nonlocal_done
);
768 nb
->nonlocal_done
= nullptr;
770 if (nb
->misc_ops_and_local_H2D_done
)
772 clReleaseEvent(nb
->misc_ops_and_local_H2D_done
);
773 nb
->misc_ops_and_local_H2D_done
= nullptr;
776 freeGpuProgram(nb
->dev_rundata
->program
);
777 delete nb
->dev_rundata
;
779 /* Free timers and timings */
786 fprintf(debug
, "Cleaned up OpenCL data structures.\n");
790 //! This function is documented in the header file
791 gmx_wallclock_gpu_nbnxn_t
* gpu_get_timings(NbnxmGpu
* nb
)
793 return (nb
!= nullptr && nb
->bDoTime
) ? nb
->timings
: nullptr;
796 //! This function is documented in the header file
797 void gpu_reset_timings(nonbonded_verlet_t
* nbv
)
799 if (nbv
->gpu_nbv
&& nbv
->gpu_nbv
->bDoTime
)
801 init_timings(nbv
->gpu_nbv
->timings
);
805 //! This function is documented in the header file
806 int gpu_min_ci_balanced(NbnxmGpu
* nb
)
808 return nb
!= nullptr ? gpu_min_ci_balanced_factor
* nb
->deviceContext_
->deviceInfo().compute_units
: 0;
811 //! This function is documented in the header file
812 gmx_bool
gpu_is_kernel_ewald_analytical(const NbnxmGpu
* nb
)
814 return ((nb
->nbparam
->eeltype
== eelTypeEWALD_ANA
) || (nb
->nbparam
->eeltype
== eelTypeEWALD_ANA_TWIN
));