/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief This file contains internal function implementations
 * for performing the PME calculations on GPU.
 *
 * Note that this file is compiled as regular C++ source in OpenCL builds, but
 * it is treated as CUDA source in CUDA-enabled GPU builds.
 *
 * \author Aleksei Iupinov <a.yupinov@gmail.com>
 * \ingroup module_ewald
 */
#include "pme-gpu-internal.h"

#include "gromacs/ewald/ewald-utils.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/math/invertmatrix.h"
#include "gromacs/math/units.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"

#if GMX_GPU == GMX_GPU_CUDA
#include "gromacs/gpu_utils/pmalloc_cuda.h"
#elif GMX_GPU == GMX_GPU_OPENCL
#include "gromacs/gpu_utils/gmxopencl.h"
#endif

#include "pme-gpu-3dfft.h"
#include "pme-gpu-constants.h"
#include "pme-gpu-program-impl.h"
#include "pme-gpu-timings.h"
#include "pme-gpu-types.h"
#include "pme-gpu-types-host.h"
#include "pme-gpu-types-host-impl.h"
#include "pme-gpu-utils.h"
#include "pme-internal.h"
/*! \internal \brief
 * Wrapper for getting a pointer to the plain C++ part of the GPU kernel parameters structure.
 *
 * \param[in] pmeGpu  The PME GPU structure.
 * \returns The pointer to the kernel parameters.
 */
static PmeGpuKernelParamsBase *pme_gpu_get_kernel_params_base_ptr(const PmeGpu *pmeGpu)
{
    // reinterpret_cast is needed because the derived CUDA structure is not known in this file
    auto *kernelParamsPtr = reinterpret_cast<PmeGpuKernelParamsBase *>(pmeGpu->kernelParams.get());
    return kernelParamsPtr;
}
int pme_gpu_get_atom_data_alignment(const PmeGpu *)
{
    // TODO: this can be simplified, as PME_ATOM_DATA_ALIGNMENT is now constant
    return PME_ATOM_DATA_ALIGNMENT;
}
int pme_gpu_get_atoms_per_warp(const PmeGpu *pmeGpu)
{
#if GMX_GPU == GMX_GPU_CUDA
    const int order = pmeGpu->common->pme_order;
    GMX_ASSERT(order > 0, "Invalid PME order");
    return PME_SPREADGATHER_ATOMS_PER_WARP;
#else
    GMX_THROW(gmx::NotImplementedError("Atom alignment per warp has to be deduced dynamically for OpenCL"));
    GMX_UNUSED_VALUE(pmeGpu);
#endif
}
void pme_gpu_synchronize(const PmeGpu *pmeGpu)
{
    gpuStreamSynchronize(pmeGpu->archSpecific->pmeStream);
}
void pme_gpu_alloc_energy_virial(const PmeGpu *pmeGpu)
{
    const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float);
    allocateDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy, c_virialAndEnergyCount, pmeGpu->archSpecific->context);
    pmalloc((void **)&pmeGpu->staging.h_virialAndEnergy, energyAndVirialSize);
}
void pme_gpu_free_energy_virial(PmeGpu *pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy);
    pfree(pmeGpu->staging.h_virialAndEnergy);
    pmeGpu->staging.h_virialAndEnergy = nullptr;
}
void pme_gpu_clear_energy_virial(const PmeGpu *pmeGpu)
{
    clearDeviceBufferAsync(&pmeGpu->kernelParams->constants.d_virialAndEnergy, 0,
                           c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream);
}
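/* The B-spline moduli for the three grid dimensions are packed back to back into a single
 * device buffer below; splineValuesOffset records where each dimension starts (0, nx, nx + ny).
 */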
void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *pmeGpu)
{
    const int splineValuesOffset[DIM] = {
        0,
        pmeGpu->kernelParams->grid.realGridSize[XX],
        pmeGpu->kernelParams->grid.realGridSize[XX] + pmeGpu->kernelParams->grid.realGridSize[YY]
    };
    memcpy((void *)&pmeGpu->kernelParams->grid.splineValuesOffset, &splineValuesOffset, sizeof(splineValuesOffset));

    const int newSplineValuesSize = pmeGpu->kernelParams->grid.realGridSize[XX] +
        pmeGpu->kernelParams->grid.realGridSize[YY] +
        pmeGpu->kernelParams->grid.realGridSize[ZZ];
    const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize);
    reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, newSplineValuesSize,
                           &pmeGpu->archSpecific->splineValuesSize, &pmeGpu->archSpecific->splineValuesSizeAlloc, pmeGpu->archSpecific->context);
    if (shouldRealloc)
    {
        /* Reallocate the host buffer */
        pfree(pmeGpu->staging.h_splineModuli);
        pmalloc((void **)&pmeGpu->staging.h_splineModuli, newSplineValuesSize * sizeof(float));
    }
    for (int i = 0; i < DIM; i++)
    {
        memcpy(pmeGpu->staging.h_splineModuli + splineValuesOffset[i], pmeGpu->common->bsp_mod[i].data(), pmeGpu->common->bsp_mod[i].size() * sizeof(float));
    }
    /* TODO: pin original buffer instead! */
    copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, pmeGpu->staging.h_splineModuli,
                       0, newSplineValuesSize,
                       pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
}
void pme_gpu_free_bspline_values(const PmeGpu *pmeGpu)
{
    pfree(pmeGpu->staging.h_splineModuli);
    freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli);
}
void pme_gpu_realloc_forces(PmeGpu *pmeGpu)
{
    const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
    GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, newForcesSize,
                           &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc, pmeGpu->archSpecific->context);
    pmeGpu->staging.h_forces.reserve(pmeGpu->nAtomsAlloc);
    pmeGpu->staging.h_forces.resize(pmeGpu->kernelParams->atoms.nAtoms);
}
void pme_gpu_free_forces(const PmeGpu *pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces);
}
void pme_gpu_copy_input_forces(PmeGpu *pmeGpu)
{
    GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
    float *h_forcesFloat = reinterpret_cast<float *>(pmeGpu->staging.h_forces.data());
    copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, h_forcesFloat,
                       0, DIM * pmeGpu->kernelParams->atoms.nAtoms,
                       pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
}
void pme_gpu_copy_output_forces(PmeGpu *pmeGpu)
{
    GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
    float *h_forcesFloat = reinterpret_cast<float *>(pmeGpu->staging.h_forces.data());
    copyFromDeviceBuffer(h_forcesFloat, &pmeGpu->kernelParams->atoms.d_forces,
                         0, DIM * pmeGpu->kernelParams->atoms.nAtoms,
                         pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
}
void pme_gpu_realloc_coordinates(const PmeGpu *pmeGpu)
{
    const size_t newCoordinatesSize = pmeGpu->nAtomsAlloc * DIM;
    GMX_ASSERT(newCoordinatesSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates, newCoordinatesSize,
                           &pmeGpu->archSpecific->coordinatesSize, &pmeGpu->archSpecific->coordinatesSizeAlloc, pmeGpu->archSpecific->context);
    const size_t paddingIndex = DIM * pmeGpu->kernelParams->atoms.nAtoms;
    const size_t paddingCount = DIM * pmeGpu->nAtomsAlloc - paddingIndex;
    if (paddingCount > 0)
    {
        clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coordinates, paddingIndex,
                               paddingCount, pmeGpu->archSpecific->pmeStream);
    }
}
void pme_gpu_copy_input_coordinates(const PmeGpu *pmeGpu, const rvec *h_coordinates)
{
    GMX_ASSERT(h_coordinates, "Bad host-side coordinate buffer in PME GPU");
#if GMX_DOUBLE
    GMX_RELEASE_ASSERT(false, "Only single precision is supported");
    GMX_UNUSED_VALUE(h_coordinates);
#else
    const float *h_coordinatesFloat = reinterpret_cast<const float *>(h_coordinates);
    copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates, h_coordinatesFloat,
                       0, pmeGpu->kernelParams->atoms.nAtoms * DIM,
                       pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
#endif
}
void pme_gpu_free_coordinates(const PmeGpu *pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates);
}
void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu *pmeGpu, const float *h_coefficients)
{
    GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU");
    const size_t newCoefficientsSize = pmeGpu->nAtomsAlloc;
    GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, newCoefficientsSize,
                           &pmeGpu->archSpecific->coefficientsSize, &pmeGpu->archSpecific->coefficientsSizeAlloc, pmeGpu->archSpecific->context);
    copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, const_cast<float *>(h_coefficients),
                       0, pmeGpu->kernelParams->atoms.nAtoms,
                       pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
    const size_t paddingIndex = pmeGpu->kernelParams->atoms.nAtoms;
    const size_t paddingCount = pmeGpu->nAtomsAlloc - paddingIndex;
    if (paddingCount > 0)
    {
        clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients, paddingIndex,
                               paddingCount, pmeGpu->archSpecific->pmeStream);
    }
}
void pme_gpu_free_coefficients(const PmeGpu *pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients);
}
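/* The atom spline parameters (theta/dtheta) below are sized for the allocated atom count rounded
 * up to a whole multiple of the atoms-per-warp value, since the GPU layout addresses spline data
 * per warp; see pme_gpu_get_atoms_per_warp() and getSplineParamFullIndex().
 */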
void pme_gpu_realloc_spline_data(const PmeGpu *pmeGpu)
{
    const int    order             = pmeGpu->common->pme_order;
    const int    alignment         = pme_gpu_get_atoms_per_warp(pmeGpu);
    const size_t nAtomsPadded      = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const int    newSplineDataSize = DIM * order * nAtomsPadded;
    GMX_ASSERT(newSplineDataSize > 0, "Bad number of atoms in PME GPU");
    /* Two arrays of the same size */
    const bool shouldRealloc        = (newSplineDataSize > pmeGpu->archSpecific->splineDataSize);
    int        currentSizeTemp      = pmeGpu->archSpecific->splineDataSize;
    int        currentSizeTempAlloc = pmeGpu->archSpecific->splineDataSizeAlloc;
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize,
                           &currentSizeTemp, &currentSizeTempAlloc, pmeGpu->archSpecific->context);
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta, newSplineDataSize,
                           &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc, pmeGpu->archSpecific->context);
    // the host side reallocation
    if (shouldRealloc)
    {
        pfree(pmeGpu->staging.h_theta);
        pmalloc((void **)&pmeGpu->staging.h_theta, newSplineDataSize * sizeof(float));
        pfree(pmeGpu->staging.h_dtheta);
        pmalloc((void **)&pmeGpu->staging.h_dtheta, newSplineDataSize * sizeof(float));
    }
}
void pme_gpu_free_spline_data(const PmeGpu *pmeGpu)
{
    /* Two arrays of the same size */
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta);
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta);
    pfree(pmeGpu->staging.h_theta);
    pfree(pmeGpu->staging.h_dtheta);
}
void pme_gpu_realloc_grid_indices(const PmeGpu *pmeGpu)
{
    const size_t newIndicesSize = DIM * pmeGpu->nAtomsAlloc;
    GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices, newIndicesSize,
                           &pmeGpu->archSpecific->gridlineIndicesSize, &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, pmeGpu->archSpecific->context);
    pfree(pmeGpu->staging.h_gridlineIndices);
    pmalloc((void **)&pmeGpu->staging.h_gridlineIndices, newIndicesSize * sizeof(int));
}
void pme_gpu_free_grid_indices(const PmeGpu *pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices);
    pfree(pmeGpu->staging.h_gridlineIndices);
}
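/* With out-of-place FFT the real-space and Fourier-space grids are separate device allocations;
 * with in-place FFT a single buffer large enough for either representation is used, and
 * d_fourierGrid simply aliases d_realGrid.
 */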
void pme_gpu_realloc_grids(PmeGpu *pmeGpu)
{
    auto     *kernelParamsPtr    = pmeGpu->kernelParams.get();
    const int newRealGridSize    = kernelParamsPtr->grid.realGridSizePadded[XX] *
        kernelParamsPtr->grid.realGridSizePadded[YY] *
        kernelParamsPtr->grid.realGridSizePadded[ZZ];
    const int newComplexGridSize = kernelParamsPtr->grid.complexGridSizePadded[XX] *
        kernelParamsPtr->grid.complexGridSizePadded[YY] *
        kernelParamsPtr->grid.complexGridSizePadded[ZZ] * 2;
    // Multiplied by 2 because we count complex grid size for complex numbers, but all allocations/pointers are float
    if (pmeGpu->archSpecific->performOutOfPlaceFFT)
    {
        /* 2 separate grids */
        reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, newComplexGridSize,
                               &pmeGpu->archSpecific->complexGridSize, &pmeGpu->archSpecific->complexGridSizeAlloc, pmeGpu->archSpecific->context);
        reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newRealGridSize,
                               &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->context);
    }
    else
    {
        /* A single buffer so that any grid will fit */
        const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
        reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newGridsSize,
                               &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->context);
        kernelParamsPtr->grid.d_fourierGrid   = kernelParamsPtr->grid.d_realGrid;
        pmeGpu->archSpecific->complexGridSize = pmeGpu->archSpecific->realGridSize;
        // the size might get used later for copying the grid
    }
}
void pme_gpu_free_grids(const PmeGpu *pmeGpu)
{
    if (pmeGpu->archSpecific->performOutOfPlaceFFT)
    {
        freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_fourierGrid);
    }
    freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid);
}
void pme_gpu_clear_grids(const PmeGpu *pmeGpu)
{
    clearDeviceBufferAsync(&pmeGpu->kernelParams->grid.d_realGrid, 0,
                           pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream);
}
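/* The fractional shift and gridline index lookup tables for X, Y and Z are concatenated into
 * single arrays of cellCount * (nx + ny + nz) elements; tablesOffsets records where each
 * dimension starts. CUDA accesses the tables through textures, OpenCL through plain device buffers.
 */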
void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu *pmeGpu)
{
    pme_gpu_free_fract_shifts(pmeGpu);

    auto     *kernelParamsPtr = pmeGpu->kernelParams.get();

    const int nx                  = kernelParamsPtr->grid.realGridSize[XX];
    const int ny                  = kernelParamsPtr->grid.realGridSize[YY];
    const int nz                  = kernelParamsPtr->grid.realGridSize[ZZ];
    const int cellCount           = c_pmeNeighborUnitcellCount;
    const int gridDataOffset[DIM] = {0, cellCount * nx, cellCount * (nx + ny)};

    memcpy(kernelParamsPtr->grid.tablesOffsets, &gridDataOffset, sizeof(gridDataOffset));

    const int newFractShiftsSize = cellCount * (nx + ny + nz);

#if GMX_GPU == GMX_GPU_CUDA
    initParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
                         kernelParamsPtr->fractShiftsTableTexture,
                         pmeGpu->common->fsh.data(),
                         newFractShiftsSize,
                         pmeGpu->deviceInfo);
    initParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
                         kernelParamsPtr->gridlineIndicesTableTexture,
                         pmeGpu->common->nn.data(),
                         newFractShiftsSize,
                         pmeGpu->deviceInfo);
#elif GMX_GPU == GMX_GPU_OPENCL
    // No dedicated texture routines....
    allocateDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, newFractShiftsSize, pmeGpu->archSpecific->context);
    allocateDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, newFractShiftsSize, pmeGpu->archSpecific->context);
    copyToDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, pmeGpu->common->fsh.data(),
                       0, newFractShiftsSize,
                       pmeGpu->archSpecific->pmeStream, GpuApiCallBehavior::Async, nullptr);
    copyToDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, pmeGpu->common->nn.data(),
                       0, newFractShiftsSize,
                       pmeGpu->archSpecific->pmeStream, GpuApiCallBehavior::Async, nullptr);
#endif
}
void pme_gpu_free_fract_shifts(const PmeGpu *pmeGpu)
{
    auto *kernelParamsPtr = pmeGpu->kernelParams.get();
#if GMX_GPU == GMX_GPU_CUDA
    destroyParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
                            kernelParamsPtr->fractShiftsTableTexture,
                            pmeGpu->deviceInfo);
    destroyParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
                            kernelParamsPtr->gridlineIndicesTableTexture,
                            pmeGpu->deviceInfo);
#elif GMX_GPU == GMX_GPU_OPENCL
    freeDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable);
    freeDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable);
#endif
}
bool pme_gpu_stream_query(const PmeGpu *pmeGpu)
{
    return haveStreamTasksCompleted(pmeGpu->archSpecific->pmeStream);
}
void pme_gpu_copy_input_gather_grid(const PmeGpu *pmeGpu, float *h_grid)
{
    copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid, h_grid,
                       0, pmeGpu->archSpecific->realGridSize,
                       pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
}
void pme_gpu_copy_output_spread_grid(const PmeGpu *pmeGpu, float *h_grid)
{
    copyFromDeviceBuffer(h_grid, &pmeGpu->kernelParams->grid.d_realGrid,
                         0, pmeGpu->archSpecific->realGridSize,
                         pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
    pmeGpu->archSpecific->syncSpreadGridD2H.markEvent(pmeGpu->archSpecific->pmeStream);
}
void pme_gpu_copy_output_spread_atom_data(const PmeGpu *pmeGpu)
{
    const int    alignment       = pme_gpu_get_atoms_per_warp(pmeGpu);
    const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const size_t splinesCount    = DIM * nAtomsPadded * pmeGpu->common->pme_order;
    auto        *kernelParamsPtr = pmeGpu->kernelParams.get();
    copyFromDeviceBuffer(pmeGpu->staging.h_dtheta, &kernelParamsPtr->atoms.d_dtheta,
                         0, splinesCount,
                         pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
    copyFromDeviceBuffer(pmeGpu->staging.h_theta, &kernelParamsPtr->atoms.d_theta,
                         0, splinesCount,
                         pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
    copyFromDeviceBuffer(pmeGpu->staging.h_gridlineIndices, &kernelParamsPtr->atoms.d_gridlineIndices,
                         0, kernelParamsPtr->atoms.nAtoms * DIM,
                         pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
}
void pme_gpu_copy_input_gather_atom_data(const PmeGpu *pmeGpu)
{
    const int    alignment       = pme_gpu_get_atoms_per_warp(pmeGpu);
    const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const size_t splinesCount    = DIM * nAtomsPadded * pmeGpu->common->pme_order;
    auto        *kernelParamsPtr = pmeGpu->kernelParams.get();

    // TODO: could clear only the padding and not the whole thing, but this is a test-exclusive code anyway
    clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_gridlineIndices, 0,
                           pmeGpu->nAtomsAlloc * DIM, pmeGpu->archSpecific->pmeStream);
    clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_dtheta, 0,
                           pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM, pmeGpu->archSpecific->pmeStream);
    clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_theta, 0,
                           pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM, pmeGpu->archSpecific->pmeStream);
    copyToDeviceBuffer(&kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta,
                       0, splinesCount,
                       pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
    copyToDeviceBuffer(&kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta,
                       0, splinesCount,
                       pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
    copyToDeviceBuffer(&kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices,
                       0, kernelParamsPtr->atoms.nAtoms * DIM,
                       pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
}
void pme_gpu_sync_spread_grid(const PmeGpu *pmeGpu)
{
    pmeGpu->archSpecific->syncSpreadGridD2H.waitForEvent();
}
void pme_gpu_init_internal(PmeGpu *pmeGpu)
{
    /* Allocate the target-specific structures */
    pmeGpu->archSpecific.reset(new PmeGpuSpecific());
    pmeGpu->kernelParams.reset(new PmeGpuKernelParams());

    pmeGpu->archSpecific->performOutOfPlaceFFT = true;
    /* This should give better performance, according to the cuFFT documentation.
     * The performance seems to be the same though.
     * TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7).
     */

    // timing enabling - TODO put this in gpu_utils (even though generally this is just option handling?) and reuse in NB
    if (GMX_GPU == GMX_GPU_CUDA)
    {
        /* WARNING: CUDA timings are incorrect with multiple streams.
         * This is the main reason why they are disabled by default.
         */
        // TODO: Consider turning on by default when we can detect nr of streams.
        pmeGpu->archSpecific->useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
    }
    else if (GMX_GPU == GMX_GPU_OPENCL)
    {
        pmeGpu->archSpecific->useTiming = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
    }

    // TODO: this is just a convenient reuse because programHandle_ currently is in charge of creating context
    pmeGpu->archSpecific->context = pmeGpu->programHandle_->impl_->context;

#if GMX_GPU == GMX_GPU_CUDA
    // Prepare to use the device that this PME task was assigned earlier.
    CU_RET_ERR(cudaSetDevice(pmeGpu->deviceInfo->id), "Switching to PME CUDA device");
#endif

#if GMX_GPU == GMX_GPU_CUDA
    pmeGpu->maxGridWidthX = pmeGpu->deviceInfo->prop.maxGridSize[0];
#elif GMX_GPU == GMX_GPU_OPENCL
    // TODO we'll need work size checks for OpenCL too
#endif

    /* Creating a PME GPU stream:
     * - default high priority with CUDA
     * - no priorities implemented yet with OpenCL; see #2532
     */
#if GMX_GPU == GMX_GPU_CUDA
    cudaError_t stat;
    int         highest_priority, lowest_priority;
    stat = cudaDeviceGetStreamPriorityRange(&lowest_priority, &highest_priority);
    CU_RET_ERR(stat, "PME cudaDeviceGetStreamPriorityRange failed");
    stat = cudaStreamCreateWithPriority(&pmeGpu->archSpecific->pmeStream,
                                        cudaStreamDefault, //cudaStreamNonBlocking,
                                        highest_priority);
    CU_RET_ERR(stat, "cudaStreamCreateWithPriority on the PME stream failed");
#elif GMX_GPU == GMX_GPU_OPENCL
    cl_command_queue_properties queueProperties = pmeGpu->archSpecific->useTiming ? CL_QUEUE_PROFILING_ENABLE : 0;
    cl_device_id                device_id       = pmeGpu->deviceInfo->ocl_gpu_id.ocl_device_id;
    cl_int                      clError;
    pmeGpu->archSpecific->pmeStream = clCreateCommandQueue(pmeGpu->archSpecific->context,
                                                           device_id, queueProperties, &clError);
    if (clError != CL_SUCCESS)
    {
        GMX_THROW(gmx::InternalError("Failed to create PME command queue"));
    }
#endif
}
void pme_gpu_destroy_specific(const PmeGpu *pmeGpu)
{
#if GMX_GPU == GMX_GPU_CUDA
    /* Destroy the CUDA stream */
    cudaError_t stat = cudaStreamDestroy(pmeGpu->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME cudaStreamDestroy error");
#elif GMX_GPU == GMX_GPU_OPENCL
    cl_int clError = clReleaseCommandQueue(pmeGpu->archSpecific->pmeStream);
    if (clError != CL_SUCCESS)
    {
        gmx_warning("Failed to destroy PME command queue");
    }
#endif
}
void pme_gpu_reinit_3dfft(const PmeGpu *pmeGpu)
{
    if (pme_gpu_performs_FFT(pmeGpu))
    {
        pmeGpu->archSpecific->fftSetup.resize(0);
        for (int i = 0; i < pmeGpu->common->ngrids; i++)
        {
            pmeGpu->archSpecific->fftSetup.push_back(std::unique_ptr<GpuParallel3dFft>(new GpuParallel3dFft(pmeGpu)));
        }
    }
}
void pme_gpu_destroy_3dfft(const PmeGpu *pmeGpu)
{
    pmeGpu->archSpecific->fftSetup.resize(0);
}
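/* Maps a (spline index, dimension, warp, atom-within-warp) tuple onto the flat index used by
 * the warp-based GPU layout of the atom spline parameters; only the fixed PME order 4 is
 * handled here.
 */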
int getSplineParamFullIndex(int order, int splineIndex, int dimIndex, int warpIndex, int atomWarpIndex)
{
    constexpr int fixedOrder = 4;
    GMX_UNUSED_VALUE(fixedOrder);
    const int     indexBase  = getSplineParamIndexBase<fixedOrder>(warpIndex, atomWarpIndex);
    return getSplineParamIndex<fixedOrder>(indexBase, dimIndex, splineIndex);
}
gmx::ArrayRef<gmx::RVec> pme_gpu_get_forces(PmeGpu *pmeGpu)
{
    return pmeGpu->staging.h_forces;
}
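/* The host-side staging buffer holds c_virialAndEnergyCount floats produced by the solve kernel,
 * read out below in the order xx, yy, zz, xy, xz, yz, energy.
 */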
void pme_gpu_get_energy_virial(const PmeGpu *pmeGpu, real *energy, matrix virial)
{
    for (int j = 0; j < c_virialAndEnergyCount; j++)
    {
        GMX_ASSERT(std::isfinite(pmeGpu->staging.h_virialAndEnergy[j]), "PME GPU produces incorrect energy/virial.");
    }

    GMX_ASSERT(energy, "Invalid energy output pointer in PME GPU");
    unsigned int j = 0;
    virial[XX][XX] = 0.25f * pmeGpu->staging.h_virialAndEnergy[j++];
    virial[YY][YY] = 0.25f * pmeGpu->staging.h_virialAndEnergy[j++];
    virial[ZZ][ZZ] = 0.25f * pmeGpu->staging.h_virialAndEnergy[j++];
    virial[XX][YY] = virial[YY][XX] = 0.25f * pmeGpu->staging.h_virialAndEnergy[j++];
    virial[XX][ZZ] = virial[ZZ][XX] = 0.25f * pmeGpu->staging.h_virialAndEnergy[j++];
    virial[YY][ZZ] = virial[ZZ][YY] = 0.25f * pmeGpu->staging.h_virialAndEnergy[j++];
    *energy        = 0.5f * pmeGpu->staging.h_virialAndEnergy[j++];
}
void pme_gpu_update_input_box(PmeGpu gmx_unused       *pmeGpu,
                              const matrix gmx_unused  box)
{
#if GMX_DOUBLE
    GMX_THROW(gmx::NotImplementedError("PME is implemented for single-precision only on GPU"));
#else
    matrix scaledBox;
    pmeGpu->common->boxScaler->scaleBox(box, scaledBox);
    auto  *kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
    kernelParamsPtr->current.boxVolume = scaledBox[XX][XX] * scaledBox[YY][YY] * scaledBox[ZZ][ZZ];
    GMX_ASSERT(kernelParamsPtr->current.boxVolume != 0.0f, "Zero volume of the unit cell");
    matrix recipBox;
    gmx::invertBoxMatrix(scaledBox, recipBox);

    /* The GPU recipBox is transposed as compared to the CPU recipBox.
     * Spread uses matrix columns (while solve and gather use rows).
     * There is no particular reason for this; it might be further rethought/optimized for better access patterns.
     */
    const real newRecipBox[DIM][DIM] =
    {
        {recipBox[XX][XX], recipBox[YY][XX], recipBox[ZZ][XX]},
        {             0.0, recipBox[YY][YY], recipBox[ZZ][YY]},
        {             0.0,              0.0, recipBox[ZZ][ZZ]}
    };
    memcpy(kernelParamsPtr->current.recipBox, newRecipBox, sizeof(matrix));
#endif
}
/*! \brief \libinternal
 * (Re-)initializes all the PME GPU data related to the grid size and cut-off.
 *
 * \param[in] pmeGpu            The PME GPU structure.
 */
static void pme_gpu_reinit_grids(PmeGpu *pmeGpu)
{
    auto *kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
    kernelParamsPtr->grid.ewaldFactor = (M_PI * M_PI) / (pmeGpu->common->ewaldcoeff_q * pmeGpu->common->ewaldcoeff_q);

    /* The grid size variants */
    for (int i = 0; i < DIM; i++)
    {
        kernelParamsPtr->grid.realGridSize[i]       = pmeGpu->common->nk[i];
        kernelParamsPtr->grid.realGridSizeFP[i]     = (float)kernelParamsPtr->grid.realGridSize[i];
        kernelParamsPtr->grid.realGridSizePadded[i] = kernelParamsPtr->grid.realGridSize[i];

        // The complex grid currently uses no padding;
        // if it starts to do so, then another test should be added for that
        kernelParamsPtr->grid.complexGridSize[i]       = kernelParamsPtr->grid.realGridSize[i];
        kernelParamsPtr->grid.complexGridSizePadded[i] = kernelParamsPtr->grid.realGridSize[i];
    }
    /* FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */
    if (!pme_gpu_performs_FFT(pmeGpu))
    {
        // This allows for GPU spreading grid and CPU fftgrid to have the same layout, so that we can copy the data directly
        kernelParamsPtr->grid.realGridSizePadded[ZZ] = (kernelParamsPtr->grid.realGridSize[ZZ] / 2 + 1) * 2;
    }

    /* GPU FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */
    kernelParamsPtr->grid.complexGridSize[ZZ] /= 2;
    kernelParamsPtr->grid.complexGridSize[ZZ]++;
    kernelParamsPtr->grid.complexGridSizePadded[ZZ] = kernelParamsPtr->grid.complexGridSize[ZZ];

    pme_gpu_realloc_and_copy_fract_shifts(pmeGpu);
    pme_gpu_realloc_and_copy_bspline_values(pmeGpu);
    pme_gpu_realloc_grids(pmeGpu);
    pme_gpu_reinit_3dfft(pmeGpu);
}
/* Several GPU functions that refer to the CPU PME data live here.
 * We would like to keep these away from the GPU-framework specific code for clarity,
 * as well as compilation issues with MPI.
 */
/*! \brief \libinternal
 * Copies everything useful from the PME CPU to the PME GPU structure.
 * The goal is to minimize interaction with the PME CPU structure in the GPU code.
 *
 * \param[in] pme         The PME structure.
 */
static void pme_gpu_copy_common_data_from(const gmx_pme_t *pme)
{
    /* TODO: Consider refactoring the CPU PME code to use the same structure,
     * so that this function becomes 2 lines */
    PmeGpu *pmeGpu = pme->gpu;
    pmeGpu->common->ngrids       = pme->ngrids;
    pmeGpu->common->epsilon_r    = pme->epsilon_r;
    pmeGpu->common->ewaldcoeff_q = pme->ewaldcoeff_q;
    pmeGpu->common->nk[XX]       = pme->nkx;
    pmeGpu->common->nk[YY]       = pme->nky;
    pmeGpu->common->nk[ZZ]       = pme->nkz;
    pmeGpu->common->pme_order    = pme->pme_order;
    for (int i = 0; i < DIM; i++)
    {
        pmeGpu->common->bsp_mod[i].assign(pme->bsp_mod[i], pme->bsp_mod[i] + pmeGpu->common->nk[i]);
    }
    const int cellCount = c_pmeNeighborUnitcellCount;
    pmeGpu->common->fsh.resize(0);
    pmeGpu->common->fsh.insert(pmeGpu->common->fsh.end(), pme->fshx, pme->fshx + cellCount * pme->nkx);
    pmeGpu->common->fsh.insert(pmeGpu->common->fsh.end(), pme->fshy, pme->fshy + cellCount * pme->nky);
    pmeGpu->common->fsh.insert(pmeGpu->common->fsh.end(), pme->fshz, pme->fshz + cellCount * pme->nkz);
    pmeGpu->common->nn.resize(0);
    pmeGpu->common->nn.insert(pmeGpu->common->nn.end(), pme->nnx, pme->nnx + cellCount * pme->nkx);
    pmeGpu->common->nn.insert(pmeGpu->common->nn.end(), pme->nny, pme->nny + cellCount * pme->nky);
    pmeGpu->common->nn.insert(pmeGpu->common->nn.end(), pme->nnz, pme->nnz + cellCount * pme->nkz);
    pmeGpu->common->runMode   = pme->runMode;
    pmeGpu->common->boxScaler = pme->boxScaler;
}
/*! \libinternal \brief
 * Initializes the PME GPU data at the beginning of the run.
 * TODO: this should become PmeGpu::PmeGpu()
 *
 * \param[in,out] pme            The PME structure.
 * \param[in,out] gpuInfo        The GPU information structure.
 * \param[in]     pmeGpuProgram  The handle to the program/kernel data created outside (e.g. in unit tests/runner)
 */
static void pme_gpu_init(gmx_pme_t           *pme,
                         gmx_device_info_t   *gpuInfo,
                         PmeGpuProgramHandle  pmeGpuProgram)
{
    pme->gpu       = new PmeGpu();
    PmeGpu *pmeGpu = pme->gpu;
    changePinningPolicy(&pmeGpu->staging.h_forces, gmx::PinningPolicy::CanBePinned);
    pmeGpu->common = std::shared_ptr<PmeShared>(new PmeShared());

    /* These settings are set here for the whole run; dynamic ones are set in pme_gpu_reinit() */
    /* A convenience variable. */
    pmeGpu->settings.useDecomposition = (pme->nnodes == 1);
    /* TODO: CPU gather with GPU spread is broken due to different theta/dtheta layout. */
    pmeGpu->settings.performGPUGather = true;

    pme_gpu_set_testing(pmeGpu, false);

    pmeGpu->deviceInfo = gpuInfo;
    GMX_ASSERT(pmeGpuProgram != nullptr, "GPU kernels must be already compiled");
    pmeGpu->programHandle_ = pmeGpuProgram;

    pme_gpu_init_internal(pmeGpu);
    pme_gpu_alloc_energy_virial(pmeGpu);

    pme_gpu_copy_common_data_from(pme);

    GMX_ASSERT(pmeGpu->common->epsilon_r != 0.0f, "PME GPU: bad electrostatic coefficient");

    auto *kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
    kernelParamsPtr->constants.elFactor = ONE_4PI_EPS0 / pmeGpu->common->epsilon_r;
}
void pme_gpu_transform_spline_atom_data(const PmeGpu *pmeGpu, const pme_atomcomm_t *atc,
                                        PmeSplineDataType type, int dimIndex, PmeLayoutTransform transform)
{
    // The GPU atom spline data is laid out in a different way currently than the CPU one.
    // This function converts the data from GPU to CPU layout (in the host memory).
    // It is only intended for testing purposes so far.
    // Ideally we should use similar layouts on CPU and GPU if we care about mixed modes and their performance
    // (e.g. spreading on GPU, gathering on CPU).
    GMX_RELEASE_ASSERT(atc->nthread == 1, "Only the serial PME data layout is supported");
    const uintmax_t threadIndex  = 0;
    const auto      atomCount    = pme_gpu_get_kernel_params_base_ptr(pmeGpu)->atoms.nAtoms;
    const auto      atomsPerWarp = pme_gpu_get_atoms_per_warp(pmeGpu);
    const auto      pmeOrder     = pmeGpu->common->pme_order;

    real  *cpuSplineBuffer;
    float *h_splineBuffer;
    switch (type)
    {
        case PmeSplineDataType::Values:
            cpuSplineBuffer = atc->spline[threadIndex].theta[dimIndex];
            h_splineBuffer  = pmeGpu->staging.h_theta;
            break;

        case PmeSplineDataType::Derivatives:
            cpuSplineBuffer = atc->spline[threadIndex].dtheta[dimIndex];
            h_splineBuffer  = pmeGpu->staging.h_dtheta;
            break;

        default:
            GMX_THROW(gmx::InternalError("Unknown spline data type"));
    }

    for (auto atomIndex = 0; atomIndex < atomCount; atomIndex++)
    {
        auto atomWarpIndex = atomIndex % atomsPerWarp;
        auto warpIndex     = atomIndex / atomsPerWarp;
        for (auto orderIndex = 0; orderIndex < pmeOrder; orderIndex++)
        {
            const auto gpuValueIndex = getSplineParamFullIndex(pmeOrder, orderIndex, dimIndex, warpIndex, atomWarpIndex);
            const auto cpuValueIndex = atomIndex * pmeOrder + orderIndex;
            GMX_ASSERT(cpuValueIndex < atomCount * pmeOrder, "Atom spline data index out of bounds (while transforming GPU data layout for host)");
            switch (transform)
            {
                case PmeLayoutTransform::GpuToHost:
                    cpuSplineBuffer[cpuValueIndex] = h_splineBuffer[gpuValueIndex];
                    break;

                case PmeLayoutTransform::HostToGpu:
                    h_splineBuffer[gpuValueIndex] = cpuSplineBuffer[cpuValueIndex];
                    break;

                default:
                    GMX_THROW(gmx::InternalError("Unknown layout transform"));
            }
        }
    }
}
void pme_gpu_get_real_grid_sizes(const PmeGpu *pmeGpu, gmx::IVec *gridSize, gmx::IVec *paddedGridSize)
{
    GMX_ASSERT(gridSize != nullptr, "");
    GMX_ASSERT(paddedGridSize != nullptr, "");
    GMX_ASSERT(pmeGpu != nullptr, "");
    auto *kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
    for (int i = 0; i < DIM; i++)
    {
        (*gridSize)[i]       = kernelParamsPtr->grid.realGridSize[i];
        (*paddedGridSize)[i] = kernelParamsPtr->grid.realGridSizePadded[i];
    }
}
void pme_gpu_reinit(gmx_pme_t           *pme,
                    gmx_device_info_t   *gpuInfo,
                    PmeGpuProgramHandle  pmeGpuProgram)
{
    if (!pme_gpu_active(pme))
    {
        return;
    }

    if (!pme->gpu)
    {
        /* First-time initialization */
        pme_gpu_init(pme, gpuInfo, pmeGpuProgram);
    }
    else
    {
        /* After this call nothing in the GPU code should refer to the gmx_pme_t *pme itself - until the next pme_gpu_reinit */
        pme_gpu_copy_common_data_from(pme);
    }
    /* GPU FFT will only get used for a single rank. */
    pme->gpu->settings.performGPUFFT   = (pme->gpu->common->runMode == PmeRunMode::GPU) && !pme_gpu_uses_dd(pme->gpu);
    pme->gpu->settings.performGPUSolve = (pme->gpu->common->runMode == PmeRunMode::GPU);

    /* Reinit active timers */
    pme_gpu_reinit_timings(pme->gpu);

    pme_gpu_reinit_grids(pme->gpu);
    // Note: if timing the reinit launch overhead becomes more relevant
    // (e.g. with regular PP-PME re-balancing), we should pass wcycle here.
    pme_gpu_reinit_computation(pme, nullptr);
    /* Clear the previous box - doesn't hurt, and forces the PME CPU recipbox
     * update for mixed mode on grid switch. TODO: use shared recipbox field.
     */
    std::memset(pme->gpu->common->previousBox, 0, sizeof(pme->gpu->common->previousBox));
}
void pme_gpu_destroy(PmeGpu *pmeGpu)
{
    /* Free lots of data */
    pme_gpu_free_energy_virial(pmeGpu);
    pme_gpu_free_bspline_values(pmeGpu);
    pme_gpu_free_forces(pmeGpu);
    pme_gpu_free_coordinates(pmeGpu);
    pme_gpu_free_coefficients(pmeGpu);
    pme_gpu_free_spline_data(pmeGpu);
    pme_gpu_free_grid_indices(pmeGpu);
    pme_gpu_free_fract_shifts(pmeGpu);
    pme_gpu_free_grids(pmeGpu);

    pme_gpu_destroy_3dfft(pmeGpu);

    /* Free the GPU-framework specific data last */
    pme_gpu_destroy_specific(pmeGpu);
}
void pme_gpu_reinit_atoms(PmeGpu *pmeGpu, const int nAtoms, const real *charges)
{
    auto      *kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
    kernelParamsPtr->atoms.nAtoms = nAtoms;
    const int  alignment = pme_gpu_get_atom_data_alignment(pmeGpu);
    pmeGpu->nAtomsPadded = ((nAtoms + alignment - 1) / alignment) * alignment;
    const int  nAtomsAlloc   = c_usePadding ? pmeGpu->nAtomsPadded : nAtoms;
    const bool haveToRealloc = (pmeGpu->nAtomsAlloc < nAtomsAlloc); /* This check might be redundant, but is logical */
    pmeGpu->nAtomsAlloc = nAtomsAlloc;

#if GMX_DOUBLE
    GMX_RELEASE_ASSERT(false, "Only single precision supported");
    GMX_UNUSED_VALUE(charges);
#else
    pme_gpu_realloc_and_copy_input_coefficients(pmeGpu, reinterpret_cast<const float *>(charges));
    /* Could also be checked for haveToRealloc, but the copy always needs to be performed */
#endif

    if (haveToRealloc)
    {
        pme_gpu_realloc_coordinates(pmeGpu);
        pme_gpu_realloc_forces(pmeGpu);
        pme_gpu_realloc_spline_data(pmeGpu);
        pme_gpu_realloc_grid_indices(pmeGpu);
    }
}
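/* Runs the GPU 3D FFT for one grid in the requested direction, bracketed by the matching
 * R2C/C2R timing events.
 */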
void pme_gpu_3dfft(const PmeGpu *pmeGpu, gmx_fft_direction dir, int grid_index)
{
    int   timerId = (dir == GMX_FFT_REAL_TO_COMPLEX) ? gtPME_FFT_R2C : gtPME_FFT_C2R;

    auto *timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timerId);
    pme_gpu_start_timing(pmeGpu, timerId);
    pmeGpu->archSpecific->fftSetup[grid_index]->perform3dFft(dir, timingEvent);
    pme_gpu_stop_timing(pmeGpu, timerId);
}