/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief This file contains internal function implementations
 * for performing the PME calculations on GPU.
 *
 * Note that this file is compiled as regular C++ source in OpenCL builds, but
 * it is treated as CUDA source in CUDA-enabled GPU builds.
 *
 * \author Aleksei Iupinov <a.yupinov@gmail.com>
 * \ingroup module_ewald
 */
#include "pme_gpu_internal.h"

#include "gromacs/ewald/ewald_utils.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/math/invertmatrix.h"
#include "gromacs/math/units.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"

#if GMX_GPU == GMX_GPU_CUDA
#    include "gromacs/gpu_utils/pmalloc_cuda.h"
#elif GMX_GPU == GMX_GPU_OPENCL
#    include "gromacs/gpu_utils/gmxopencl.h"
#endif

#include "gromacs/ewald/pme.h"

#include "pme_gpu_3dfft.h"
#include "pme_gpu_calculate_splines.h"
#include "pme_gpu_constants.h"
#include "pme_gpu_program_impl.h"
#include "pme_gpu_timings.h"
#include "pme_gpu_types.h"
#include "pme_gpu_types_host.h"
#include "pme_gpu_types_host_impl.h"
#include "pme_internal.h"
#include "pme_solve.h"
/*! \brief
 * Atom limit above which it is advantageous to turn on the
 * recalculating of the splines in the gather and to use fewer threads per atom in the spline and spread
 */
constexpr int c_pmeGpuPerformanceAtomLimit = 23000;
/*! \internal \brief
 * Wrapper for getting a pointer to the plain C++ part of the GPU kernel parameters structure.
 *
 * \param[in] pmeGpu  The PME GPU structure.
 * \returns The pointer to the kernel parameters.
 */
static PmeGpuKernelParamsBase* pme_gpu_get_kernel_params_base_ptr(const PmeGpu* pmeGpu)
{
    // reinterpret_cast is needed because the derived CUDA structure is not known in this file
    auto* kernelParamsPtr = reinterpret_cast<PmeGpuKernelParamsBase*>(pmeGpu->kernelParams.get());
    return kernelParamsPtr;
}

int pme_gpu_get_atom_data_alignment(const PmeGpu* /*unused*/)
{
    // TODO: this can be simplified, as c_pmeAtomDataAlignment is now constant
    return c_pmeAtomDataAlignment;
}

int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu)
{
    if (pmeGpu->settings.useOrderThreadsPerAtom)
    {
        return pmeGpu->programHandle_->impl_->warpSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom;
    }
    else
    {
        return pmeGpu->programHandle_->impl_->warpSize / c_pmeSpreadGatherThreadsPerAtom;
    }
}
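// Worked example (added for clarity, numbers assumed rather than taken from this file): with a
// CUDA warp of 32 threads, 16 spread/gather threads per atom gives 32 / 16 = 2 atoms per warp,
// while the order-threads-per-atom variant (4 threads per atom) gives 32 / 4 = 8 atoms per warp.
// The actual result depends on the device warp size reported by the program handle.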
void pme_gpu_synchronize(const PmeGpu* pmeGpu)
{
    pmeGpu->archSpecific->pmeStream_.synchronize();
}

void pme_gpu_alloc_energy_virial(PmeGpu* pmeGpu)
{
    const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float);
    allocateDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy, c_virialAndEnergyCount,
                         pmeGpu->archSpecific->deviceContext_);
    pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_virialAndEnergy), energyAndVirialSize);
}
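// Note (added): c_virialAndEnergyCount sizes both the device buffer and the pinned host staging
// buffer; pme_gpu_getEnergyAndVirial() below unpacks it as six independent virial components
// followed by one energy value.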
void pme_gpu_free_energy_virial(PmeGpu* pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy);
    pfree(pmeGpu->staging.h_virialAndEnergy);
    pmeGpu->staging.h_virialAndEnergy = nullptr;
}

void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu)
{
    clearDeviceBufferAsync(&pmeGpu->kernelParams->constants.d_virialAndEnergy, 0,
                           c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream_);
}
void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu)
{
    const int splineValuesOffset[DIM] = { 0, pmeGpu->kernelParams->grid.realGridSize[XX],
                                          pmeGpu->kernelParams->grid.realGridSize[XX]
                                                  + pmeGpu->kernelParams->grid.realGridSize[YY] };
    memcpy(&pmeGpu->kernelParams->grid.splineValuesOffset, &splineValuesOffset, sizeof(splineValuesOffset));

    const int newSplineValuesSize = pmeGpu->kernelParams->grid.realGridSize[XX]
                                    + pmeGpu->kernelParams->grid.realGridSize[YY]
                                    + pmeGpu->kernelParams->grid.realGridSize[ZZ];
    const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize);
    reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, newSplineValuesSize,
                           &pmeGpu->archSpecific->splineValuesSize,
                           &pmeGpu->archSpecific->splineValuesSizeAlloc,
                           pmeGpu->archSpecific->deviceContext_);
    if (shouldRealloc)
    {
        /* Reallocate the host buffer */
        pfree(pmeGpu->staging.h_splineModuli);
        pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_splineModuli),
                newSplineValuesSize * sizeof(float));
    }
    for (int i = 0; i < DIM; i++)
    {
        memcpy(pmeGpu->staging.h_splineModuli + splineValuesOffset[i],
               pmeGpu->common->bsp_mod[i].data(), pmeGpu->common->bsp_mod[i].size() * sizeof(float));
    }
    /* TODO: pin original buffer instead! */
    copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, pmeGpu->staging.h_splineModuli,
                       0, newSplineValuesSize, pmeGpu->archSpecific->pmeStream_,
                       pmeGpu->settings.transferKind, nullptr);
}

void pme_gpu_free_bspline_values(const PmeGpu* pmeGpu)
{
    pfree(pmeGpu->staging.h_splineModuli);
    freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli);
}
void pme_gpu_realloc_forces(PmeGpu* pmeGpu)
{
    const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
    GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, newForcesSize,
                           &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc,
                           pmeGpu->archSpecific->deviceContext_);
    pmeGpu->staging.h_forces.reserveWithPadding(pmeGpu->nAtomsAlloc);
    pmeGpu->staging.h_forces.resizeWithPadding(pmeGpu->kernelParams->atoms.nAtoms);
}

void pme_gpu_free_forces(const PmeGpu* pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces);
}

void pme_gpu_copy_input_forces(PmeGpu* pmeGpu)
{
    GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
    float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
    copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, h_forcesFloat, 0,
                       DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream_,
                       pmeGpu->settings.transferKind, nullptr);
}

void pme_gpu_copy_output_forces(PmeGpu* pmeGpu)
{
    GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
    float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
    copyFromDeviceBuffer(h_forcesFloat, &pmeGpu->kernelParams->atoms.d_forces, 0,
                         DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream_,
                         pmeGpu->settings.transferKind, nullptr);
}
void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_coefficients)
{
    GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU");
    const size_t newCoefficientsSize = pmeGpu->nAtomsAlloc;
    GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, newCoefficientsSize,
                           &pmeGpu->archSpecific->coefficientsSize,
                           &pmeGpu->archSpecific->coefficientsSizeAlloc,
                           pmeGpu->archSpecific->deviceContext_);
    copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients,
                       const_cast<float*>(h_coefficients), 0, pmeGpu->kernelParams->atoms.nAtoms,
                       pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
    if (c_usePadding)
    {
        const size_t paddingIndex = pmeGpu->kernelParams->atoms.nAtoms;
        const size_t paddingCount = pmeGpu->nAtomsAlloc - paddingIndex;
        if (paddingCount > 0)
        {
            clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients, paddingIndex,
                                   paddingCount, pmeGpu->archSpecific->pmeStream_);
        }
    }
}

void pme_gpu_free_coefficients(const PmeGpu* pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients);
}
void pme_gpu_realloc_spline_data(PmeGpu* pmeGpu)
{
    const int    order             = pmeGpu->common->pme_order;
    const int    alignment         = pme_gpu_get_atoms_per_warp(pmeGpu);
    const size_t nAtomsPadded      = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const int    newSplineDataSize = DIM * order * nAtomsPadded;
    GMX_ASSERT(newSplineDataSize > 0, "Bad number of atoms in PME GPU");
    /* Two arrays of the same size */
    const bool shouldRealloc        = (newSplineDataSize > pmeGpu->archSpecific->splineDataSize);
    int        currentSizeTemp      = pmeGpu->archSpecific->splineDataSize;
    int        currentSizeTempAlloc = pmeGpu->archSpecific->splineDataSizeAlloc;
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize, &currentSizeTemp,
                           &currentSizeTempAlloc, pmeGpu->archSpecific->deviceContext_);
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta, newSplineDataSize,
                           &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc,
                           pmeGpu->archSpecific->deviceContext_);
    // the host side reallocation
    if (shouldRealloc)
    {
        pfree(pmeGpu->staging.h_theta);
        pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_theta), newSplineDataSize * sizeof(float));
        pfree(pmeGpu->staging.h_dtheta);
        pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_dtheta), newSplineDataSize * sizeof(float));
    }
}
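// Illustration of the padding arithmetic above (assumed numbers, not from this file): with
// nAtomsAlloc = 100 and alignment = 8 atoms per warp, nAtomsPadded = ((100 + 7) / 8) * 8 = 104,
// so newSplineDataSize = DIM * order * nAtomsPadded = 3 * 4 * 104 = 1248 floats per spline array.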
void pme_gpu_free_spline_data(const PmeGpu* pmeGpu)
{
    /* Two arrays of the same size */
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta);
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta);
    pfree(pmeGpu->staging.h_theta);
    pfree(pmeGpu->staging.h_dtheta);
}

void pme_gpu_realloc_grid_indices(PmeGpu* pmeGpu)
{
    const size_t newIndicesSize = DIM * pmeGpu->nAtomsAlloc;
    GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices, newIndicesSize,
                           &pmeGpu->archSpecific->gridlineIndicesSize,
                           &pmeGpu->archSpecific->gridlineIndicesSizeAlloc,
                           pmeGpu->archSpecific->deviceContext_);
    pfree(pmeGpu->staging.h_gridlineIndices);
    pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_gridlineIndices), newIndicesSize * sizeof(int));
}

void pme_gpu_free_grid_indices(const PmeGpu* pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices);
    pfree(pmeGpu->staging.h_gridlineIndices);
}
void pme_gpu_realloc_grids(PmeGpu* pmeGpu)
{
    auto*     kernelParamsPtr = pmeGpu->kernelParams.get();
    const int newRealGridSize = kernelParamsPtr->grid.realGridSizePadded[XX]
                                * kernelParamsPtr->grid.realGridSizePadded[YY]
                                * kernelParamsPtr->grid.realGridSizePadded[ZZ];
    const int newComplexGridSize = kernelParamsPtr->grid.complexGridSizePadded[XX]
                                   * kernelParamsPtr->grid.complexGridSizePadded[YY]
                                   * kernelParamsPtr->grid.complexGridSizePadded[ZZ] * 2;
    // Multiplied by 2 because we count complex grid size for complex numbers, but all allocations/pointers are float
    if (pmeGpu->archSpecific->performOutOfPlaceFFT)
    {
        /* 2 separate grids */
        reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, newComplexGridSize,
                               &pmeGpu->archSpecific->complexGridSize,
                               &pmeGpu->archSpecific->complexGridSizeAlloc,
                               pmeGpu->archSpecific->deviceContext_);
        reallocateDeviceBuffer(
                &kernelParamsPtr->grid.d_realGrid, newRealGridSize, &pmeGpu->archSpecific->realGridSize,
                &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->deviceContext_);
    }
    else
    {
        /* A single buffer so that any grid will fit */
        const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
        reallocateDeviceBuffer(
                &kernelParamsPtr->grid.d_realGrid, newGridsSize, &pmeGpu->archSpecific->realGridSize,
                &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->deviceContext_);
        kernelParamsPtr->grid.d_fourierGrid   = kernelParamsPtr->grid.d_realGrid;
        pmeGpu->archSpecific->complexGridSize = pmeGpu->archSpecific->realGridSize;
        // the size might get used later for copying the grid
    }
}

void pme_gpu_free_grids(const PmeGpu* pmeGpu)
{
    if (pmeGpu->archSpecific->performOutOfPlaceFFT)
    {
        freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_fourierGrid);
    }
    freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid);
}

void pme_gpu_clear_grids(const PmeGpu* pmeGpu)
{
    clearDeviceBufferAsync(&pmeGpu->kernelParams->grid.d_realGrid, 0,
                           pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream_);
}
void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu* pmeGpu)
{
    pme_gpu_free_fract_shifts(pmeGpu);

    auto* kernelParamsPtr = pmeGpu->kernelParams.get();

    const int nx                  = kernelParamsPtr->grid.realGridSize[XX];
    const int ny                  = kernelParamsPtr->grid.realGridSize[YY];
    const int nz                  = kernelParamsPtr->grid.realGridSize[ZZ];
    const int cellCount           = c_pmeNeighborUnitcellCount;
    const int gridDataOffset[DIM] = { 0, cellCount * nx, cellCount * (nx + ny) };

    memcpy(kernelParamsPtr->grid.tablesOffsets, &gridDataOffset, sizeof(gridDataOffset));

    const int newFractShiftsSize = cellCount * (nx + ny + nz);

#if GMX_GPU == GMX_GPU_CUDA
    initParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable, kernelParamsPtr->fractShiftsTableTexture,
                         pmeGpu->common->fsh.data(), newFractShiftsSize);

    initParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
                         kernelParamsPtr->gridlineIndicesTableTexture, pmeGpu->common->nn.data(),
                         newFractShiftsSize);
#elif GMX_GPU == GMX_GPU_OPENCL
    // No dedicated texture routines....
    allocateDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, newFractShiftsSize,
                         pmeGpu->archSpecific->deviceContext_);
    allocateDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, newFractShiftsSize,
                         pmeGpu->archSpecific->deviceContext_);
    copyToDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, pmeGpu->common->fsh.data(), 0,
                       newFractShiftsSize, pmeGpu->archSpecific->pmeStream_,
                       GpuApiCallBehavior::Async, nullptr);
    copyToDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, pmeGpu->common->nn.data(), 0,
                       newFractShiftsSize, pmeGpu->archSpecific->pmeStream_,
                       GpuApiCallBehavior::Async, nullptr);
#endif
}

void pme_gpu_free_fract_shifts(const PmeGpu* pmeGpu)
{
    auto* kernelParamsPtr = pmeGpu->kernelParams.get();
#if GMX_GPU == GMX_GPU_CUDA
    destroyParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
                            kernelParamsPtr->fractShiftsTableTexture);
    destroyParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
                            kernelParamsPtr->gridlineIndicesTableTexture);
#elif GMX_GPU == GMX_GPU_OPENCL
    freeDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable);
    freeDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable);
#endif
}

bool pme_gpu_stream_query(const PmeGpu* pmeGpu)
{
    return haveStreamTasksCompleted(pmeGpu->archSpecific->pmeStream_);
}
void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, float* h_grid)
{
    copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid, h_grid, 0, pmeGpu->archSpecific->realGridSize,
                       pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
}

void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid)
{
    copyFromDeviceBuffer(h_grid, &pmeGpu->kernelParams->grid.d_realGrid, 0,
                         pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream_,
                         pmeGpu->settings.transferKind, nullptr);
    pmeGpu->archSpecific->syncSpreadGridD2H.markEvent(pmeGpu->archSpecific->pmeStream_);
}

void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu)
{
    const int    alignment       = pme_gpu_get_atoms_per_warp(pmeGpu);
    const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const size_t splinesCount    = DIM * nAtomsPadded * pmeGpu->common->pme_order;
    auto*        kernelParamsPtr = pmeGpu->kernelParams.get();
    copyFromDeviceBuffer(pmeGpu->staging.h_dtheta, &kernelParamsPtr->atoms.d_dtheta, 0, splinesCount,
                         pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
    copyFromDeviceBuffer(pmeGpu->staging.h_theta, &kernelParamsPtr->atoms.d_theta, 0, splinesCount,
                         pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
    copyFromDeviceBuffer(pmeGpu->staging.h_gridlineIndices, &kernelParamsPtr->atoms.d_gridlineIndices,
                         0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream_,
                         pmeGpu->settings.transferKind, nullptr);
}

void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu)
{
    const int    alignment       = pme_gpu_get_atoms_per_warp(pmeGpu);
    const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const size_t splinesCount    = DIM * nAtomsPadded * pmeGpu->common->pme_order;
    auto*        kernelParamsPtr = pmeGpu->kernelParams.get();
    if (c_usePadding)
    {
        // TODO: could clear only the padding and not the whole thing, but this is a test-exclusive code anyway
        clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_gridlineIndices, 0,
                               pmeGpu->nAtomsAlloc * DIM, pmeGpu->archSpecific->pmeStream_);
        clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_dtheta, 0,
                               pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
                               pmeGpu->archSpecific->pmeStream_);
        clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_theta, 0,
                               pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
                               pmeGpu->archSpecific->pmeStream_);
    }
    copyToDeviceBuffer(&kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, 0, splinesCount,
                       pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
    copyToDeviceBuffer(&kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, 0, splinesCount,
                       pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
    copyToDeviceBuffer(&kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices,
                       0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream_,
                       pmeGpu->settings.transferKind, nullptr);
}

void pme_gpu_sync_spread_grid(const PmeGpu* pmeGpu)
{
    pmeGpu->archSpecific->syncSpreadGridD2H.waitForEvent();
}
void pme_gpu_init_internal(PmeGpu* pmeGpu)
{
#if GMX_GPU == GMX_GPU_CUDA
    // Prepare to use the device that this PME task was assigned earlier.
    // Other entities, such as CUDA timing events, are known to implicitly use the device context.
    CU_RET_ERR(cudaSetDevice(pmeGpu->deviceInfo->id), "Switching to PME CUDA device");
#endif

    /* Allocate the target-specific structures */
    pmeGpu->archSpecific.reset(new PmeGpuSpecific(pmeGpu->programHandle_->impl_->deviceContext_));
    pmeGpu->kernelParams.reset(new PmeGpuKernelParams());

    pmeGpu->archSpecific->performOutOfPlaceFFT = true;
    /* This should give better performance, according to the cuFFT documentation.
     * The performance seems to be the same though.
     * TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7).
     */

    // timing enabling - TODO put this in gpu_utils (even though generally this is just option handling?) and reuse in NB
    if (GMX_GPU == GMX_GPU_CUDA)
    {
        /* WARNING: CUDA timings are incorrect with multiple streams.
         * This is the main reason why they are disabled by default.
         */
        // TODO: Consider turning on by default when we can detect nr of streams.
        pmeGpu->archSpecific->useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
    }
    else if (GMX_GPU == GMX_GPU_OPENCL)
    {
        pmeGpu->archSpecific->useTiming = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
    }

#if GMX_GPU == GMX_GPU_CUDA
    pmeGpu->maxGridWidthX = pmeGpu->deviceInfo->prop.maxGridSize[0];
#elif GMX_GPU == GMX_GPU_OPENCL
    pmeGpu->maxGridWidthX = INT32_MAX / 2;
    // TODO: is there no really global work size limit in OpenCL?
#endif

    /* Creating a PME GPU stream:
     * - default high priority with CUDA
     * - no priorities implemented yet with OpenCL; see #2532
     */
    pmeGpu->archSpecific->pmeStream_.init(*pmeGpu->deviceInfo, pmeGpu->archSpecific->deviceContext_,
                                          DeviceStreamPriority::High, pmeGpu->archSpecific->useTiming);
}
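// Usage note (added): the timing toggles above are the environment variables referenced in the
// code, e.g. running "GMX_ENABLE_GPU_TIMING=1 gmx mdrun ..." enables PME GPU timings on CUDA
// builds, while setting GMX_DISABLE_GPU_TIMING turns them off on OpenCL builds, where they are
// otherwise on by default.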
void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu)
{
    if (pme_gpu_settings(pmeGpu).performGPUFFT)
    {
        pmeGpu->archSpecific->fftSetup.resize(0);
        for (int i = 0; i < pmeGpu->common->ngrids; i++)
        {
            pmeGpu->archSpecific->fftSetup.push_back(std::make_unique<GpuParallel3dFft>(pmeGpu));
        }
    }
}

void pme_gpu_destroy_3dfft(const PmeGpu* pmeGpu)
{
    pmeGpu->archSpecific->fftSetup.resize(0);
}
int getSplineParamFullIndex(int order, int splineIndex, int dimIndex, int atomIndex, int atomsPerWarp)
{
    if (order != c_pmeGpuOrder)
    {
        throw order;
    }
    constexpr int fixedOrder = c_pmeGpuOrder;
    GMX_UNUSED_VALUE(fixedOrder);

    const int atomWarpIndex = atomIndex % atomsPerWarp;
    const int warpIndex     = atomIndex / atomsPerWarp;
    int       indexBase, result;
    switch (atomsPerWarp)
    {
        case 1:
            indexBase = getSplineParamIndexBase<fixedOrder, 1>(warpIndex, atomWarpIndex);
            result    = getSplineParamIndex<fixedOrder, 1>(indexBase, dimIndex, splineIndex);
            break;

        case 2:
            indexBase = getSplineParamIndexBase<fixedOrder, 2>(warpIndex, atomWarpIndex);
            result    = getSplineParamIndex<fixedOrder, 2>(indexBase, dimIndex, splineIndex);
            break;

        case 4:
            indexBase = getSplineParamIndexBase<fixedOrder, 4>(warpIndex, atomWarpIndex);
            result    = getSplineParamIndex<fixedOrder, 4>(indexBase, dimIndex, splineIndex);
            break;

        case 8:
            indexBase = getSplineParamIndexBase<fixedOrder, 8>(warpIndex, atomWarpIndex);
            result    = getSplineParamIndex<fixedOrder, 8>(indexBase, dimIndex, splineIndex);
            break;

        default:
            GMX_THROW(gmx::NotImplementedError(
                    gmx::formatString("Test function call not unrolled for atomsPerWarp = %d in "
                                      "getSplineParamFullIndex",
                                      atomsPerWarp)));
    }
    return result;
}
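// Example (added, illustrative numbers only): with atomsPerWarp = 2 and PME order 4, atomIndex = 5
// gives warpIndex = 5 / 2 = 2 and atomWarpIndex = 5 % 2 = 1; getSplineParamIndexBase/
// getSplineParamIndex then turn (warp, atom-in-warp, dimension, spline index) into the flat
// offset of the warp-interleaved GPU spline layout.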
void pme_gpu_getEnergyAndVirial(const gmx_pme_t& pme, PmeOutput* output)
{
    const PmeGpu* pmeGpu = pme.gpu;
    for (int j = 0; j < c_virialAndEnergyCount; j++)
    {
        GMX_ASSERT(std::isfinite(pmeGpu->staging.h_virialAndEnergy[j]),
                   "PME GPU produces incorrect energy/virial.");
    }

    unsigned int j                 = 0;
    output->coulombVirial_[XX][XX] = 0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
    output->coulombVirial_[YY][YY] = 0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
    output->coulombVirial_[ZZ][ZZ] = 0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
    output->coulombVirial_[XX][YY] = output->coulombVirial_[YY][XX] =
            0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
    output->coulombVirial_[XX][ZZ] = output->coulombVirial_[ZZ][XX] =
            0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
    output->coulombVirial_[YY][ZZ] = output->coulombVirial_[ZZ][YY] =
            0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
    output->coulombEnergy_ = 0.5F * pmeGpu->staging.h_virialAndEnergy[j++];
}

/*! \brief Sets the force-related members in \p output
 *
 * \param[in]   pmeGpu      PME GPU data structure
 * \param[out]  output      Pointer to PME output data structure
 */
static void pme_gpu_getForceOutput(PmeGpu* pmeGpu, PmeOutput* output)
{
    output->haveForceOutput_ = !pmeGpu->settings.useGpuForceReduction;
    if (output->haveForceOutput_)
    {
        output->forces_ = pmeGpu->staging.h_forces;
    }
}
PmeOutput pme_gpu_getOutput(const gmx_pme_t& pme, const bool computeEnergyAndVirial)
{
    PmeGpu* pmeGpu = pme.gpu;

    PmeOutput output;

    pme_gpu_getForceOutput(pmeGpu, &output);

    if (computeEnergyAndVirial)
    {
        if (pme_gpu_settings(pmeGpu).performGPUSolve)
        {
            pme_gpu_getEnergyAndVirial(pme, &output);
        }
        else
        {
            get_pme_ener_vir_q(pme.solve_work, pme.nthread, &output);
        }
    }
    return output;
}
void pme_gpu_update_input_box(PmeGpu gmx_unused* pmeGpu, const matrix gmx_unused box)
{
#if GMX_DOUBLE
    GMX_THROW(gmx::NotImplementedError("PME is implemented for single-precision only on GPU"));
#else
    matrix scaledBox;
    pmeGpu->common->boxScaler->scaleBox(box, scaledBox);
    auto* kernelParamsPtr              = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
    kernelParamsPtr->current.boxVolume = scaledBox[XX][XX] * scaledBox[YY][YY] * scaledBox[ZZ][ZZ];
    GMX_ASSERT(kernelParamsPtr->current.boxVolume != 0.0F, "Zero volume of the unit cell");
    matrix recipBox;
    gmx::invertBoxMatrix(scaledBox, recipBox);

    /* The GPU recipBox is transposed as compared to the CPU recipBox.
     * Spread uses matrix columns (while solve and gather use rows).
     * There is no particular reason for this; it might be further rethought/optimized for better access patterns.
     */
    const real newRecipBox[DIM][DIM] = { { recipBox[XX][XX], recipBox[YY][XX], recipBox[ZZ][XX] },
                                         { 0.0, recipBox[YY][YY], recipBox[ZZ][YY] },
                                         { 0.0, 0.0, recipBox[ZZ][ZZ] } };
    memcpy(kernelParamsPtr->current.recipBox, newRecipBox, sizeof(matrix));
#endif
}
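// Clarification (added): for a rectangular box diag(Lx, Ly, Lz) the inverted box is
// diag(1/Lx, 1/Ly, 1/Lz), so the transposition above changes nothing; only for triclinic boxes do
// the off-diagonal elements of recipBox move from its lower triangle into the upper triangle of
// the matrix stored for the GPU kernels.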
/*! \brief \libinternal
 * (Re-)initializes all the PME GPU data related to the grid size and cut-off.
 *
 * \param[in] pmeGpu            The PME GPU structure.
 */
static void pme_gpu_reinit_grids(PmeGpu* pmeGpu)
{
    auto* kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
    kernelParamsPtr->grid.ewaldFactor =
            (M_PI * M_PI) / (pmeGpu->common->ewaldcoeff_q * pmeGpu->common->ewaldcoeff_q);

    /* The grid size variants */
    for (int i = 0; i < DIM; i++)
    {
        kernelParamsPtr->grid.realGridSize[i] = pmeGpu->common->nk[i];
        kernelParamsPtr->grid.realGridSizeFP[i] =
                static_cast<float>(kernelParamsPtr->grid.realGridSize[i]);
        kernelParamsPtr->grid.realGridSizePadded[i] = kernelParamsPtr->grid.realGridSize[i];

        // The complex grid currently uses no padding;
        // if it starts to do so, then another test should be added for that
        kernelParamsPtr->grid.complexGridSize[i]       = kernelParamsPtr->grid.realGridSize[i];
        kernelParamsPtr->grid.complexGridSizePadded[i] = kernelParamsPtr->grid.realGridSize[i];
    }
    /* FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */
    if (!pme_gpu_settings(pmeGpu).performGPUFFT)
    {
        // This allows for GPU spreading grid and CPU fftgrid to have the same layout, so that we can copy the data directly
        kernelParamsPtr->grid.realGridSizePadded[ZZ] =
                (kernelParamsPtr->grid.realGridSize[ZZ] / 2 + 1) * 2;
    }

    /* GPU FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */
    kernelParamsPtr->grid.complexGridSize[ZZ] /= 2;
    kernelParamsPtr->grid.complexGridSize[ZZ]++;
    kernelParamsPtr->grid.complexGridSizePadded[ZZ] = kernelParamsPtr->grid.complexGridSize[ZZ];

    pme_gpu_realloc_and_copy_fract_shifts(pmeGpu);
    pme_gpu_realloc_and_copy_bspline_values(pmeGpu);
    pme_gpu_realloc_grids(pmeGpu);
    pme_gpu_reinit_3dfft(pmeGpu);
}
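// Worked example (added, assumed grid dimensions): for nk = {64, 64, 64} with GPU FFT the complex
// grid minor dimension becomes 64 / 2 + 1 = 33 complex values, while with CPU FFT the padded real
// minor dimension becomes (64 / 2 + 1) * 2 = 66 floats so the CPU fftgrid layout can be copied
// directly.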
/* Several GPU functions that refer to the CPU PME data live here.
 * We would like to keep these away from the GPU-framework specific code for clarity,
 * as well as compilation issues with MPI.
 */

/*! \brief \libinternal
 * Copies everything useful from the PME CPU to the PME GPU structure.
 * The goal is to minimize interaction with the PME CPU structure in the GPU code.
 *
 * \param[in] pme         The PME structure.
 */
static void pme_gpu_copy_common_data_from(const gmx_pme_t* pme)
{
    /* TODO: Consider refactoring the CPU PME code to use the same structure,
     * so that this function becomes 2 lines */
    PmeGpu* pmeGpu               = pme->gpu;
    pmeGpu->common->ngrids       = pme->ngrids;
    pmeGpu->common->epsilon_r    = pme->epsilon_r;
    pmeGpu->common->ewaldcoeff_q = pme->ewaldcoeff_q;
    pmeGpu->common->nk[XX]       = pme->nkx;
    pmeGpu->common->nk[YY]       = pme->nky;
    pmeGpu->common->nk[ZZ]       = pme->nkz;
    pmeGpu->common->pme_order    = pme->pme_order;
    if (pmeGpu->common->pme_order != c_pmeGpuOrder)
    {
        GMX_THROW(gmx::NotImplementedError("pme_order != 4 is not implemented!"));
    }
    for (int i = 0; i < DIM; i++)
    {
        pmeGpu->common->bsp_mod[i].assign(pme->bsp_mod[i], pme->bsp_mod[i] + pmeGpu->common->nk[i]);
    }
    const int cellCount = c_pmeNeighborUnitcellCount;
    pmeGpu->common->fsh.resize(0);
    pmeGpu->common->fsh.insert(pmeGpu->common->fsh.end(), pme->fshx, pme->fshx + cellCount * pme->nkx);
    pmeGpu->common->fsh.insert(pmeGpu->common->fsh.end(), pme->fshy, pme->fshy + cellCount * pme->nky);
    pmeGpu->common->fsh.insert(pmeGpu->common->fsh.end(), pme->fshz, pme->fshz + cellCount * pme->nkz);
    pmeGpu->common->nn.resize(0);
    pmeGpu->common->nn.insert(pmeGpu->common->nn.end(), pme->nnx, pme->nnx + cellCount * pme->nkx);
    pmeGpu->common->nn.insert(pmeGpu->common->nn.end(), pme->nny, pme->nny + cellCount * pme->nky);
    pmeGpu->common->nn.insert(pmeGpu->common->nn.end(), pme->nnz, pme->nnz + cellCount * pme->nkz);
    pmeGpu->common->runMode       = pme->runMode;
    pmeGpu->common->isRankPmeOnly = !pme->bPPnode;
    pmeGpu->common->boxScaler     = pme->boxScaler;
}
/*! \libinternal \brief
 * Uses heuristics to select the best performing PME gather and scatter kernels.
 *
 * \param[in,out] pmeGpu         The PME GPU structure.
 */
static void pme_gpu_select_best_performing_pme_spreadgather_kernels(PmeGpu* pmeGpu)
{
    if (pmeGpu->kernelParams->atoms.nAtoms > c_pmeGpuPerformanceAtomLimit && (GMX_GPU == GMX_GPU_CUDA))
    {
        pmeGpu->settings.useOrderThreadsPerAtom = true;
        pmeGpu->settings.recalculateSplines     = true;
    }
    else
    {
        pmeGpu->settings.useOrderThreadsPerAtom = false;
        pmeGpu->settings.recalculateSplines     = false;
    }
}
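// Effect of the heuristic (restated for clarity): on CUDA, a system above
// c_pmeGpuPerformanceAtomLimit (e.g. ~50k atoms) uses order threads per atom and recalculates the
// splines in the gather kernel, while a smaller system (or any OpenCL build) keeps order*order
// threads per atom and reuses the spline data stored during spreading.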
/*! \libinternal \brief
 * Initializes the PME GPU data at the beginning of the run.
 * TODO: this should become PmeGpu::PmeGpu()
 *
 * \param[in,out] pme            The PME structure.
 * \param[in,out] deviceInfo     The GPU device information structure.
 * \param[in]     pmeGpuProgram  The handle to the program/kernel data created outside (e.g. in unit tests/runner)
 */
static void pme_gpu_init(gmx_pme_t* pme, const DeviceInformation* deviceInfo, const PmeGpuProgram* pmeGpuProgram)
{
    GMX_ASSERT(deviceInfo != nullptr,
               "Device information can not be nullptr when GPU is used for PME.");
    pme->gpu       = new PmeGpu();
    PmeGpu* pmeGpu = pme->gpu;
    changePinningPolicy(&pmeGpu->staging.h_forces, pme_get_pinning_policy());
    pmeGpu->common = std::make_shared<PmeShared>();

    /* These settings are set here for the whole run; dynamic ones are set in pme_gpu_reinit() */
    /* A convenience variable. */
    pmeGpu->settings.useDecomposition = (pme->nnodes != 1);
    /* TODO: CPU gather with GPU spread is broken due to different theta/dtheta layout. */
    pmeGpu->settings.performGPUGather = true;
    // By default GPU-side reduction is off (explicitly set here for tests, otherwise reset per-step)
    pmeGpu->settings.useGpuForceReduction = false;

    pme_gpu_set_testing(pmeGpu, false);

    pmeGpu->deviceInfo = deviceInfo;
    GMX_ASSERT(pmeGpuProgram != nullptr, "GPU kernels must be already compiled");
    pmeGpu->programHandle_ = pmeGpuProgram;

    pmeGpu->initializedClfftLibrary_ = std::make_unique<gmx::ClfftInitializer>();

    pme_gpu_init_internal(pmeGpu);
    pme_gpu_alloc_energy_virial(pmeGpu);

    pme_gpu_copy_common_data_from(pme);

    GMX_ASSERT(pmeGpu->common->epsilon_r != 0.0F, "PME GPU: bad electrostatic coefficient");

    auto* kernelParamsPtr               = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
    kernelParamsPtr->constants.elFactor = ONE_4PI_EPS0 / pmeGpu->common->epsilon_r;
}
void pme_gpu_transform_spline_atom_data(const PmeGpu*      pmeGpu,
                                        const PmeAtomComm* atc,
                                        PmeSplineDataType  type,
                                        int                dimIndex,
                                        PmeLayoutTransform transform)
{
    // The GPU atom spline data is laid out in a different way currently than the CPU one.
    // This function converts the data from GPU to CPU layout (in the host memory).
    // It is only intended for testing purposes so far.
    // Ideally we should use similar layouts on CPU and GPU if we care about mixed modes and their
    // performance (e.g. spreading on GPU, gathering on CPU).
    GMX_RELEASE_ASSERT(atc->nthread == 1, "Only the serial PME data layout is supported");
    const uintmax_t threadIndex  = 0;
    const auto      atomCount    = pme_gpu_get_kernel_params_base_ptr(pmeGpu)->atoms.nAtoms;
    const auto      atomsPerWarp = pme_gpu_get_atoms_per_warp(pmeGpu);
    const auto      pmeOrder     = pmeGpu->common->pme_order;
    GMX_ASSERT(pmeOrder == c_pmeGpuOrder, "Only PME order 4 is implemented");

    real*  cpuSplineBuffer;
    float* h_splineBuffer;
    switch (type)
    {
        case PmeSplineDataType::Values:
            cpuSplineBuffer = atc->spline[threadIndex].theta.coefficients[dimIndex];
            h_splineBuffer  = pmeGpu->staging.h_theta;
            break;

        case PmeSplineDataType::Derivatives:
            cpuSplineBuffer = atc->spline[threadIndex].dtheta.coefficients[dimIndex];
            h_splineBuffer  = pmeGpu->staging.h_dtheta;
            break;

        default: GMX_THROW(gmx::InternalError("Unknown spline data type"));
    }

    for (auto atomIndex = 0; atomIndex < atomCount; atomIndex++)
    {
        for (auto orderIndex = 0; orderIndex < pmeOrder; orderIndex++)
        {
            const auto gpuValueIndex =
                    getSplineParamFullIndex(pmeOrder, orderIndex, dimIndex, atomIndex, atomsPerWarp);
            const auto cpuValueIndex = atomIndex * pmeOrder + orderIndex;
            GMX_ASSERT(cpuValueIndex < atomCount * pmeOrder,
                       "Atom spline data index out of bounds (while transforming GPU data layout "
                       "for host)");
            switch (transform)
            {
                case PmeLayoutTransform::GpuToHost:
                    cpuSplineBuffer[cpuValueIndex] = h_splineBuffer[gpuValueIndex];
                    break;

                case PmeLayoutTransform::HostToGpu:
                    h_splineBuffer[gpuValueIndex] = cpuSplineBuffer[cpuValueIndex];
                    break;

                default: GMX_THROW(gmx::InternalError("Unknown layout transform"));
            }
        }
    }
}

void pme_gpu_get_real_grid_sizes(const PmeGpu* pmeGpu, gmx::IVec* gridSize, gmx::IVec* paddedGridSize)
{
    GMX_ASSERT(gridSize != nullptr, "");
    GMX_ASSERT(paddedGridSize != nullptr, "");
    GMX_ASSERT(pmeGpu != nullptr, "");
    auto* kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
    for (int i = 0; i < DIM; i++)
    {
        (*gridSize)[i]       = kernelParamsPtr->grid.realGridSize[i];
        (*paddedGridSize)[i] = kernelParamsPtr->grid.realGridSizePadded[i];
    }
}
void pme_gpu_reinit(gmx_pme_t* pme, const DeviceInformation* deviceInfo, const PmeGpuProgram* pmeGpuProgram)
{
    GMX_ASSERT(pme != nullptr, "Need valid PME object");
    if (pme->runMode == PmeRunMode::CPU)
    {
        GMX_ASSERT(pme->gpu == nullptr, "Should not have PME GPU object");
        return;
    }

    if (!pme->gpu)
    {
        /* First-time initialization */
        pme_gpu_init(pme, deviceInfo, pmeGpuProgram);
    }
    else
    {
        /* After this call nothing in the GPU code should refer to the gmx_pme_t *pme itself - until the next pme_gpu_reinit */
        pme_gpu_copy_common_data_from(pme);
    }
    /* GPU FFT will only get used for a single rank.*/
    pme->gpu->settings.performGPUFFT =
            (pme->gpu->common->runMode == PmeRunMode::GPU) && !pme->gpu->settings.useDecomposition;
    pme->gpu->settings.performGPUSolve = (pme->gpu->common->runMode == PmeRunMode::GPU);

    /* Reinit active timers */
    pme_gpu_reinit_timings(pme->gpu);

    pme_gpu_reinit_grids(pme->gpu);
    // Note: if timing the reinit launch overhead becomes more relevant
    // (e.g. with regular PP-PME re-balancing), we should pass wcycle here.
    pme_gpu_reinit_computation(pme, nullptr);
    /* Clear the previous box - doesn't hurt, and forces the PME CPU recipbox
     * update for mixed mode on grid switch. TODO: use shared recipbox field.
     */
    std::memset(pme->gpu->common->previousBox, 0, sizeof(pme->gpu->common->previousBox));
}
void pme_gpu_destroy(PmeGpu* pmeGpu)
{
    /* Free lots of data */
    pme_gpu_free_energy_virial(pmeGpu);
    pme_gpu_free_bspline_values(pmeGpu);
    pme_gpu_free_forces(pmeGpu);
    pme_gpu_free_coefficients(pmeGpu);
    pme_gpu_free_spline_data(pmeGpu);
    pme_gpu_free_grid_indices(pmeGpu);
    pme_gpu_free_fract_shifts(pmeGpu);
    pme_gpu_free_grids(pmeGpu);

    pme_gpu_destroy_3dfft(pmeGpu);

    delete pmeGpu;
}

void pme_gpu_reinit_atoms(PmeGpu* pmeGpu, const int nAtoms, const real* charges)
{
    auto* kernelParamsPtr         = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
    kernelParamsPtr->atoms.nAtoms = nAtoms;
    const int alignment           = pme_gpu_get_atom_data_alignment(pmeGpu);
    pmeGpu->nAtomsPadded          = ((nAtoms + alignment - 1) / alignment) * alignment;
    const int  nAtomsAlloc        = c_usePadding ? pmeGpu->nAtomsPadded : nAtoms;
    const bool haveToRealloc =
            (pmeGpu->nAtomsAlloc < nAtomsAlloc); /* This check might be redundant, but is logical */
    pmeGpu->nAtomsAlloc = nAtomsAlloc;

#if GMX_DOUBLE
    GMX_RELEASE_ASSERT(false, "Only single precision supported");
    GMX_UNUSED_VALUE(charges);
#else
    pme_gpu_realloc_and_copy_input_coefficients(pmeGpu, reinterpret_cast<const float*>(charges));
    /* Could also be checked for haveToRealloc, but the copy always needs to be performed */
#endif

    if (haveToRealloc)
    {
        pme_gpu_realloc_forces(pmeGpu);
        pme_gpu_realloc_spline_data(pmeGpu);
        pme_gpu_realloc_grid_indices(pmeGpu);
    }
    pme_gpu_select_best_performing_pme_spreadgather_kernels(pmeGpu);
}
/*! \brief
 * Returns raw timing event from the corresponding GpuRegionTimer (if timings are enabled).
 * In CUDA result can be nullptr stub, per GpuRegionTimer implementation.
 *
 * \param[in] pmeGpu         The PME GPU data structure.
 * \param[in] PMEStageId     The PME GPU stage gtPME_ index from the enum in src/gromacs/timing/gpu_timing.h
 */
static CommandEvent* pme_gpu_fetch_timing_event(const PmeGpu* pmeGpu, size_t PMEStageId)
{
    CommandEvent* timingEvent = nullptr;
    if (pme_gpu_timings_enabled(pmeGpu))
    {
        GMX_ASSERT(PMEStageId < pmeGpu->archSpecific->timingEvents.size(),
                   "Wrong PME GPU timing event index");
        timingEvent = pmeGpu->archSpecific->timingEvents[PMEStageId].fetchNextEvent();
    }
    return timingEvent;
}

void pme_gpu_3dfft(const PmeGpu* pmeGpu, gmx_fft_direction dir, int grid_index)
{
    int timerId = (dir == GMX_FFT_REAL_TO_COMPLEX) ? gtPME_FFT_R2C : gtPME_FFT_C2R;

    pme_gpu_start_timing(pmeGpu, timerId);
    pmeGpu->archSpecific->fftSetup[grid_index]->perform3dFft(
            dir, pme_gpu_fetch_timing_event(pmeGpu, timerId));
    pme_gpu_stop_timing(pmeGpu, timerId);
}
/*! \brief
 * Given possibly large \p blockCount, returns a compact 1D or 2D grid for kernel scheduling,
 * to minimize number of unused blocks.
 */
std::pair<int, int> inline pmeGpuCreateGrid(const PmeGpu* pmeGpu, int blockCount)
{
    // How many maximum widths in X do we need (hopefully just one)
    const int minRowCount = (blockCount + pmeGpu->maxGridWidthX - 1) / pmeGpu->maxGridWidthX;
    // Trying to make things even
    const int colCount = (blockCount + minRowCount - 1) / minRowCount;
    GMX_ASSERT((colCount * minRowCount - blockCount) >= 0, "pmeGpuCreateGrid: totally wrong");
    GMX_ASSERT((colCount * minRowCount - blockCount) < minRowCount,
               "pmeGpuCreateGrid: excessive blocks");
    return std::pair<int, int>(colCount, minRowCount);
}
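// Example (added, illustrative numbers): with maxGridWidthX = 65535 and blockCount = 100000,
// minRowCount = (100000 + 65534) / 65535 = 2 and colCount = (100000 + 1) / 2 = 50000, i.e. a
// 50000 x 2 launch grid whose unused-block count (colCount * minRowCount - blockCount) stays
// below minRowCount, as the asserts above require.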
/*! \brief
 * Returns a pointer to appropriate spline and spread kernel based on the input bool values
 *
 * \param[in]  pmeGpu                   The PME GPU structure.
 * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
 * \param[in]  writeSplinesToGlobal     bool controlling if we should write spline data to global memory
 *
 * \return Pointer to CUDA kernel
 */
static auto selectSplineAndSpreadKernelPtr(const PmeGpu* pmeGpu, bool useOrderThreadsPerAtom, bool writeSplinesToGlobal)
{
    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
    if (writeSplinesToGlobal)
    {
        if (useOrderThreadsPerAtom)
        {
            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesThPerAtom4;
        }
        else
        {
            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplines;
        }
    }
    else
    {
        if (useOrderThreadsPerAtom)
        {
            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4;
        }
        else
        {
            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel;
        }
    }

    return kernelPtr;
}

/*! \brief
 * Returns a pointer to appropriate spline kernel based on the input bool values
 *
 * \param[in]  pmeGpu                   The PME GPU structure.
 * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
 * \param[in]  writeSplinesToGlobal     bool controlling if we should write spline data to global memory
 *
 * \return Pointer to CUDA kernel
 */
static auto selectSplineKernelPtr(const PmeGpu* pmeGpu, bool useOrderThreadsPerAtom, bool gmx_unused writeSplinesToGlobal)
{
    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
    GMX_ASSERT(
            writeSplinesToGlobal,
            "Spline data should always be written to global memory when just calculating splines");

    if (useOrderThreadsPerAtom)
    {
        kernelPtr = pmeGpu->programHandle_->impl_->splineKernelThPerAtom4;
    }
    else
    {
        kernelPtr = pmeGpu->programHandle_->impl_->splineKernel;
    }

    return kernelPtr;
}

/*! \brief
 * Returns a pointer to appropriate spread kernel based on the input bool values
 *
 * \param[in]  pmeGpu                   The PME GPU structure.
 * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
 * \param[in]  writeSplinesToGlobal     bool controlling if we should write spline data to global memory
 *
 * \return Pointer to CUDA kernel
 */
static auto selectSpreadKernelPtr(const PmeGpu* pmeGpu, bool useOrderThreadsPerAtom, bool writeSplinesToGlobal)
{
    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
    if (writeSplinesToGlobal)
    {
        if (useOrderThreadsPerAtom)
        {
            kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelThPerAtom4;
        }
        else
        {
            kernelPtr = pmeGpu->programHandle_->impl_->spreadKernel;
        }
    }
    else
    {
        /* if we are not saving the spline data we need to recalculate it
           using the spline and spread Kernel */
        if (useOrderThreadsPerAtom)
        {
            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4;
        }
        else
        {
            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel;
        }
    }

    return kernelPtr;
}
void pme_gpu_spread(const PmeGpu*         pmeGpu,
                    GpuEventSynchronizer* xReadyOnDevice,
                    int gmx_unused        gridIndex,
                    real*                 h_grid,
                    bool                  computeSplines,
                    bool                  spreadCharges)
{
    GMX_ASSERT(computeSplines || spreadCharges,
               "PME spline/spread kernel has invalid input (nothing to do)");
    const auto* kernelParamsPtr = pmeGpu->kernelParams.get();
    GMX_ASSERT(kernelParamsPtr->atoms.nAtoms > 0, "No atom data in PME GPU spread");

    const size_t blockSize = pmeGpu->programHandle_->impl_->spreadWorkGroupSize;

    const int order = pmeGpu->common->pme_order;
    GMX_ASSERT(order == c_pmeGpuOrder, "Only PME order 4 is implemented");
    const bool writeGlobal            = pmeGpu->settings.copyAllOutputs;
    const bool useOrderThreadsPerAtom = pmeGpu->settings.useOrderThreadsPerAtom;
    const bool recalculateSplines     = pmeGpu->settings.recalculateSplines;
#if GMX_GPU == GMX_GPU_OPENCL
    GMX_ASSERT(!useOrderThreadsPerAtom, "Only 16 threads per atom supported in OpenCL");
    GMX_ASSERT(!recalculateSplines, "Recalculating splines not supported in OpenCL");
#endif
    const int atomsPerBlock = useOrderThreadsPerAtom ? blockSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom
                                                     : blockSize / c_pmeSpreadGatherThreadsPerAtom;

    // TODO: pick smaller block size in runtime if needed
    // (e.g. on 660 Ti where 50% occupancy is ~25% faster than 100% occupancy with RNAse (~17.8k atoms))
    // If doing so, change atomsPerBlock in the kernels as well.
    // TODO: test varying block sizes on modern arch-s as well
    // TODO: also consider using cudaFuncSetCacheConfig() for preferring shared memory on older architectures
    // (for spline data mostly)
    GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock),
               "inconsistent atom data padding vs. spreading block size");

    // Ensure that coordinates are ready on the device before launching spread;
    // only needed with CUDA on PP+PME ranks, not on separate PME ranks, in unit tests
    // nor in OpenCL as these cases use a single stream (hence xReadyOnDevice == nullptr).
    GMX_ASSERT(xReadyOnDevice != nullptr || (GMX_GPU != GMX_GPU_CUDA)
                       || pmeGpu->common->isRankPmeOnly || pme_gpu_settings(pmeGpu).copyAllOutputs,
               "Need a valid coordinate synchronizer on PP+PME ranks with CUDA.");
    if (xReadyOnDevice)
    {
        xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream_);
    }

    const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
    auto      dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);

    KernelLaunchConfig config;
    config.blockSize[0] = order;
    config.blockSize[1] = useOrderThreadsPerAtom ? 1 : order;
    config.blockSize[2] = atomsPerBlock;
    config.gridSize[0]  = dimGrid.first;
    config.gridSize[1]  = dimGrid.second;
    config.stream       = pmeGpu->archSpecific->pmeStream_.stream();

    int                                timingId;
    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
    if (computeSplines)
    {
        if (spreadCharges)
        {
            timingId  = gtPME_SPLINEANDSPREAD;
            kernelPtr = selectSplineAndSpreadKernelPtr(pmeGpu, useOrderThreadsPerAtom,
                                                       writeGlobal || (!recalculateSplines));
        }
        else
        {
            timingId  = gtPME_SPLINE;
            kernelPtr = selectSplineKernelPtr(pmeGpu, useOrderThreadsPerAtom,
                                              writeGlobal || (!recalculateSplines));
        }
    }
    else
    {
        timingId  = gtPME_SPREAD;
        kernelPtr = selectSpreadKernelPtr(pmeGpu, useOrderThreadsPerAtom,
                                          writeGlobal || (!recalculateSplines));
    }

    pme_gpu_start_timing(pmeGpu, timingId);
    auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
#if c_canEmbedBuffers
    const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
#else
    const auto kernelArgs = prepareGpuKernelArguments(
            kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->atoms.d_theta,
            &kernelParamsPtr->atoms.d_dtheta, &kernelParamsPtr->atoms.d_gridlineIndices,
            &kernelParamsPtr->grid.d_realGrid, &kernelParamsPtr->grid.d_fractShiftsTable,
            &kernelParamsPtr->grid.d_gridlineIndicesTable, &kernelParamsPtr->atoms.d_coefficients,
            &kernelParamsPtr->atoms.d_coordinates);
#endif

    launchGpuKernel(kernelPtr, config, timingEvent, "PME spline/spread", kernelArgs);
    pme_gpu_stop_timing(pmeGpu, timingId);

    const auto& settings     = pmeGpu->settings;
    const bool  copyBackGrid = spreadCharges && (!settings.performGPUFFT || settings.copyAllOutputs);
    if (copyBackGrid)
    {
        pme_gpu_copy_output_spread_grid(pmeGpu, h_grid);
    }
    const bool copyBackAtomData =
            computeSplines && (!settings.performGPUGather || settings.copyAllOutputs);
    if (copyBackAtomData)
    {
        pme_gpu_copy_output_spread_atom_data(pmeGpu);
    }
}
* pmeGpu
, t_complex
* h_grid
, GridOrdering gridOrdering
, bool computeEnergyAndVirial
)
1244 const auto& settings
= pmeGpu
->settings
;
1245 const bool copyInputAndOutputGrid
= !settings
.performGPUFFT
|| settings
.copyAllOutputs
;
1247 auto* kernelParamsPtr
= pmeGpu
->kernelParams
.get();
1249 float* h_gridFloat
= reinterpret_cast<float*>(h_grid
);
1250 if (copyInputAndOutputGrid
)
1252 copyToDeviceBuffer(&kernelParamsPtr
->grid
.d_fourierGrid
, h_gridFloat
, 0,
1253 pmeGpu
->archSpecific
->complexGridSize
, pmeGpu
->archSpecific
->pmeStream_
,
1254 pmeGpu
->settings
.transferKind
, nullptr);
1257 int majorDim
= -1, middleDim
= -1, minorDim
= -1;
1258 switch (gridOrdering
)
1260 case GridOrdering::YZX
:
1266 case GridOrdering::XYZ
:
1272 default: GMX_ASSERT(false, "Implement grid ordering here and below for the kernel launch");
1275 const int maxBlockSize
= pmeGpu
->programHandle_
->impl_
->solveMaxWorkGroupSize
;
1277 const int gridLineSize
= pmeGpu
->kernelParams
->grid
.complexGridSize
[minorDim
];
1278 const int gridLinesPerBlock
= std::max(maxBlockSize
/ gridLineSize
, 1);
1279 const int blocksPerGridLine
= (gridLineSize
+ maxBlockSize
- 1) / maxBlockSize
;
1281 if (blocksPerGridLine
== 1)
1283 cellsPerBlock
= gridLineSize
* gridLinesPerBlock
;
1287 cellsPerBlock
= (gridLineSize
+ blocksPerGridLine
- 1) / blocksPerGridLine
;
1289 const int warpSize
= pmeGpu
->programHandle_
->impl_
->warpSize
;
1290 const int blockSize
= (cellsPerBlock
+ warpSize
- 1) / warpSize
* warpSize
;
1292 static_assert(GMX_GPU
!= GMX_GPU_CUDA
|| c_solveMaxWarpsPerBlock
/ 2 >= 4,
1293 "The CUDA solve energy kernels needs at least 4 warps. "
1294 "Here we launch at least half of the max warps.");
1296 KernelLaunchConfig config
;
1297 config
.blockSize
[0] = blockSize
;
1298 config
.gridSize
[0] = blocksPerGridLine
;
1299 // rounding up to full warps so that shuffle operations produce defined results
1300 config
.gridSize
[1] = (pmeGpu
->kernelParams
->grid
.complexGridSize
[middleDim
] + gridLinesPerBlock
- 1)
1301 / gridLinesPerBlock
;
1302 config
.gridSize
[2] = pmeGpu
->kernelParams
->grid
.complexGridSize
[majorDim
];
1303 config
.stream
= pmeGpu
->archSpecific
->pmeStream_
.stream();
1305 int timingId
= gtPME_SOLVE
;
1306 PmeGpuProgramImpl::PmeKernelHandle kernelPtr
= nullptr;
1307 if (gridOrdering
== GridOrdering::YZX
)
1309 kernelPtr
= computeEnergyAndVirial
? pmeGpu
->programHandle_
->impl_
->solveYZXEnergyKernel
1310 : pmeGpu
->programHandle_
->impl_
->solveYZXKernel
;
1312 else if (gridOrdering
== GridOrdering::XYZ
)
1314 kernelPtr
= computeEnergyAndVirial
? pmeGpu
->programHandle_
->impl_
->solveXYZEnergyKernel
1315 : pmeGpu
->programHandle_
->impl_
->solveXYZKernel
;
1318 pme_gpu_start_timing(pmeGpu
, timingId
);
1319 auto* timingEvent
= pme_gpu_fetch_timing_event(pmeGpu
, timingId
);
1320 #if c_canEmbedBuffers
1321 const auto kernelArgs
= prepareGpuKernelArguments(kernelPtr
, config
, kernelParamsPtr
);
1323 const auto kernelArgs
= prepareGpuKernelArguments(
1324 kernelPtr
, config
, kernelParamsPtr
, &kernelParamsPtr
->grid
.d_splineModuli
,
1325 &kernelParamsPtr
->constants
.d_virialAndEnergy
, &kernelParamsPtr
->grid
.d_fourierGrid
);
1327 launchGpuKernel(kernelPtr
, config
, timingEvent
, "PME solve", kernelArgs
);
1328 pme_gpu_stop_timing(pmeGpu
, timingId
);
1330 if (computeEnergyAndVirial
)
1332 copyFromDeviceBuffer(pmeGpu
->staging
.h_virialAndEnergy
,
1333 &kernelParamsPtr
->constants
.d_virialAndEnergy
, 0, c_virialAndEnergyCount
,
1334 pmeGpu
->archSpecific
->pmeStream_
, pmeGpu
->settings
.transferKind
, nullptr);
1337 if (copyInputAndOutputGrid
)
1339 copyFromDeviceBuffer(h_gridFloat
, &kernelParamsPtr
->grid
.d_fourierGrid
, 0,
1340 pmeGpu
->archSpecific
->complexGridSize
, pmeGpu
->archSpecific
->pmeStream_
,
1341 pmeGpu
->settings
.transferKind
, nullptr);
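// Decomposition example (added; maxBlockSize = 256 and a complex minor dimension of 33 are
// assumed values): blocksPerGridLine = (33 + 255) / 256 = 1, gridLinesPerBlock = max(256 / 33, 1)
// = 7, cellsPerBlock = 33 * 7 = 231, and with warpSize = 32 the launch block size rounds up to
// 256 threads.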
/*! \brief
 * Returns a pointer to appropriate gather kernel based on the input values
 *
 * \param[in]  pmeGpu                   The PME GPU structure.
 * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
 * \param[in]  readSplinesFromGlobal    bool controlling if we should read spline data from global memory
 *
 * \return Pointer to CUDA kernel
 */
inline auto selectGatherKernelPtr(const PmeGpu* pmeGpu, bool useOrderThreadsPerAtom, bool readSplinesFromGlobal)
{
    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;

    if (readSplinesFromGlobal)
    {
        if (useOrderThreadsPerAtom)
        {
            kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesThPerAtom4;
        }
        else
        {
            kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplines;
        }
    }
    else
    {
        if (useOrderThreadsPerAtom)
        {
            kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelThPerAtom4;
        }
        else
        {
            kernelPtr = pmeGpu->programHandle_->impl_->gatherKernel;
        }
    }

    return kernelPtr;
}
void pme_gpu_gather(PmeGpu* pmeGpu, const float* h_grid)
{
    const auto& settings = pmeGpu->settings;
    if (!settings.performGPUFFT || settings.copyAllOutputs)
    {
        pme_gpu_copy_input_gather_grid(pmeGpu, const_cast<float*>(h_grid));
    }

    if (settings.copyAllOutputs)
    {
        pme_gpu_copy_input_gather_atom_data(pmeGpu);
    }

    /* Set if we have unit tests */
    const bool   readGlobal             = pmeGpu->settings.copyAllOutputs;
    const size_t blockSize              = pmeGpu->programHandle_->impl_->gatherWorkGroupSize;
    const bool   useOrderThreadsPerAtom = pmeGpu->settings.useOrderThreadsPerAtom;
    const bool   recalculateSplines     = pmeGpu->settings.recalculateSplines;
#if GMX_GPU == GMX_GPU_OPENCL
    GMX_ASSERT(!useOrderThreadsPerAtom, "Only 16 threads per atom supported in OpenCL");
    GMX_ASSERT(!recalculateSplines, "Recalculating splines not supported in OpenCL");
#endif
    const int atomsPerBlock = useOrderThreadsPerAtom ? blockSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom
                                                     : blockSize / c_pmeSpreadGatherThreadsPerAtom;

    GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock),
               "inconsistent atom data padding vs. gathering block size");

    const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
    auto      dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);

    const int order = pmeGpu->common->pme_order;
    GMX_ASSERT(order == c_pmeGpuOrder, "Only PME order 4 is implemented");

    KernelLaunchConfig config;
    config.blockSize[0] = order;
    config.blockSize[1] = useOrderThreadsPerAtom ? 1 : order;
    config.blockSize[2] = atomsPerBlock;
    config.gridSize[0]  = dimGrid.first;
    config.gridSize[1]  = dimGrid.second;
    config.stream       = pmeGpu->archSpecific->pmeStream_.stream();

    // TODO test different cache configs

    int                                timingId = gtPME_GATHER;
    PmeGpuProgramImpl::PmeKernelHandle kernelPtr =
            selectGatherKernelPtr(pmeGpu, useOrderThreadsPerAtom, readGlobal || (!recalculateSplines));
    // TODO design kernel selection getters and make PmeGpu a friend of PmeGpuProgramImpl

    pme_gpu_start_timing(pmeGpu, timingId);
    auto*       timingEvent     = pme_gpu_fetch_timing_event(pmeGpu, timingId);
    const auto* kernelParamsPtr = pmeGpu->kernelParams.get();
#if c_canEmbedBuffers
    const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
#else
    const auto kernelArgs = prepareGpuKernelArguments(
            kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->atoms.d_coefficients,
            &kernelParamsPtr->grid.d_realGrid, &kernelParamsPtr->atoms.d_theta,
            &kernelParamsPtr->atoms.d_dtheta, &kernelParamsPtr->atoms.d_gridlineIndices,
            &kernelParamsPtr->atoms.d_forces);
#endif
    launchGpuKernel(kernelPtr, config, timingEvent, "PME gather", kernelArgs);
    pme_gpu_stop_timing(pmeGpu, timingId);

    if (pmeGpu->settings.useGpuForceReduction)
    {
        pmeGpu->archSpecific->pmeForcesReady.markEvent(pmeGpu->archSpecific->pmeStream_);
    }
    else
    {
        pme_gpu_copy_output_forces(pmeGpu);
    }
}
void* pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
{
    if (pmeGpu && pmeGpu->kernelParams)
    {
        return pmeGpu->kernelParams->atoms.d_forces;
    }
    else
    {
        return nullptr;
    }
}

void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBuffer<gmx::RVec> d_x)
{
    GMX_ASSERT(pmeGpu && pmeGpu->kernelParams,
               "PME GPU device buffer can not be set in non-GPU builds or before the GPU PME was "
               "initialized.");

    GMX_ASSERT(checkDeviceBuffer(d_x, pmeGpu->kernelParams->atoms.nAtoms),
               "The device-side buffer can not be set.");

    pmeGpu->kernelParams->atoms.d_coordinates = d_x;
}

const DeviceStream* pme_gpu_get_stream(const PmeGpu* pmeGpu)
{
    if (pmeGpu)
    {
        return &pmeGpu->archSpecific->pmeStream_;
    }
    else
    {
        return nullptr;
    }
}

GpuEventSynchronizer* pme_gpu_get_forces_ready_synchronizer(const PmeGpu* pmeGpu)
{
    if (pmeGpu && pmeGpu->kernelParams)
    {
        return &pmeGpu->archSpecific->pmeForcesReady;
    }
    else
    {
        return nullptr;
    }
}