/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief This file contains internal CUDA function implementations
 * for performing the PME calculations on GPU.
 *
 * \author Aleksei Iupinov <a.yupinov@gmail.com>
 */
50 #include "gromacs/gpu_utils/cudautils.cuh"
51 #include "gromacs/gpu_utils/pmalloc_cuda.h"
52 #include "gromacs/utility/gmxassert.h"
53 #include "gromacs/utility/smalloc.h"
56 #include "pme-3dfft.cuh"
int pme_gpu_get_atom_data_alignment(const pme_gpu_t *pmeGPU)
{
    const int order = pmeGPU->common->pme_order;
    GMX_ASSERT(order > 0, "Invalid PME order");
    return PME_ATOM_DATA_ALIGNMENT;
}

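/* Returns the number of atoms processed together by one warp in the
 * spread/gather kernels; like the data alignment above, this is currently a
 * compile-time constant.
 */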
int pme_gpu_get_atoms_per_warp(const pme_gpu_t *pmeGPU)
{
    const int order = pmeGPU->common->pme_order;
    GMX_ASSERT(order > 0, "Invalid PME order");
    return PME_SPREADGATHER_ATOMS_PER_WARP;
}

void pme_gpu_synchronize(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaStreamSynchronize(pmeGPU->archSpecific->pmeStream);
    CU_RET_ERR(stat, "Failed to synchronize the PME GPU stream!");
}

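/* Allocates the device buffer for the reduced virial and energy, plus the
 * matching pinned host staging buffer (c_virialAndEnergyCount floats each).
 */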
void pme_gpu_alloc_energy_virial(const pme_gpu_t *pmeGPU)
{
    const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float);
    cudaError_t  stat                = cudaMalloc((void **)&pmeGPU->kernelParams->constants.d_virialAndEnergy, energyAndVirialSize);
    CU_RET_ERR(stat, "cudaMalloc failed on PME energy and virial");
    pmalloc((void **)&pmeGPU->staging.h_virialAndEnergy, energyAndVirialSize);
}

void pme_gpu_free_energy_virial(pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaFree(pmeGPU->kernelParams->constants.d_virialAndEnergy);
    CU_RET_ERR(stat, "cudaFree failed on PME energy and virial");
    pmeGPU->kernelParams->constants.d_virialAndEnergy = nullptr;
    pfree(pmeGPU->staging.h_virialAndEnergy);
    pmeGPU->staging.h_virialAndEnergy = nullptr;
}

void pme_gpu_clear_energy_virial(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->constants.d_virialAndEnergy, 0,
                                       c_virialAndEnergyCount * sizeof(float), pmeGPU->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME energy/virial cudaMemsetAsync error");
}

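/* (Re-)allocates the device B-spline moduli buffer and copies the three
 * per-dimension moduli arrays into it back-to-back; the per-dimension
 * offsets are stored in the kernel parameters. The host staging buffer is
 * only reallocated when the required size grows.
 */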
void pme_gpu_realloc_and_copy_bspline_values(const pme_gpu_t *pmeGPU)
{
    const int splineValuesOffset[DIM] = {
        0,
        pmeGPU->kernelParams->grid.realGridSize[XX],
        pmeGPU->kernelParams->grid.realGridSize[XX] + pmeGPU->kernelParams->grid.realGridSize[YY]
    };
    memcpy((void *)&pmeGPU->kernelParams->grid.splineValuesOffset, &splineValuesOffset, sizeof(splineValuesOffset));

    const int newSplineValuesSize = pmeGPU->kernelParams->grid.realGridSize[XX] +
                                    pmeGPU->kernelParams->grid.realGridSize[YY] +
                                    pmeGPU->kernelParams->grid.realGridSize[ZZ];
    const bool shouldRealloc = (newSplineValuesSize > pmeGPU->archSpecific->splineValuesSize);
    cu_realloc_buffered((void **)&pmeGPU->kernelParams->grid.d_splineModuli, nullptr, sizeof(float),
                        &pmeGPU->archSpecific->splineValuesSize, &pmeGPU->archSpecific->splineValuesSizeAlloc,
                        newSplineValuesSize, pmeGPU->archSpecific->pmeStream, true);
    if (shouldRealloc)
    {
        /* Reallocate the host buffer */
        pfree(pmeGPU->staging.h_splineModuli);
        pmalloc((void **)&pmeGPU->staging.h_splineModuli, newSplineValuesSize * sizeof(float));
    }
    for (int i = 0; i < DIM; i++)
    {
        memcpy(pmeGPU->staging.h_splineModuli + splineValuesOffset[i], pmeGPU->common->bsp_mod[i].data(), pmeGPU->common->bsp_mod[i].size() * sizeof(float));
    }
    /* TODO: pin original buffer instead! */
    cu_copy_H2D_async(pmeGPU->kernelParams->grid.d_splineModuli, pmeGPU->staging.h_splineModuli,
                      newSplineValuesSize * sizeof(float), pmeGPU->archSpecific->pmeStream);
}

void pme_gpu_free_bspline_values(const pme_gpu_t *pmeGPU)
{
    pfree(pmeGPU->staging.h_splineModuli);
    cu_free_buffered(pmeGPU->kernelParams->grid.d_splineModuli, &pmeGPU->archSpecific->splineValuesSize,
                     &pmeGPU->archSpecific->splineValuesSizeAlloc);
}

void pme_gpu_realloc_forces(const pme_gpu_t *pmeGPU)
{
    const size_t newForcesSize = pmeGPU->nAtomsAlloc * DIM;
    GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
    cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_forces, nullptr, sizeof(float),
                        &pmeGPU->archSpecific->forcesSize, &pmeGPU->archSpecific->forcesSizeAlloc,
                        newForcesSize, pmeGPU->archSpecific->pmeStream, true);
}

void pme_gpu_free_forces(const pme_gpu_t *pmeGPU)
{
    cu_free_buffered(pmeGPU->kernelParams->atoms.d_forces, &pmeGPU->archSpecific->forcesSize, &pmeGPU->archSpecific->forcesSizeAlloc);
}

void pme_gpu_copy_input_forces(const pme_gpu_t *pmeGPU, const float *h_forces)
{
    GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU");
    const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
    GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
    cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_forces, const_cast<float *>(h_forces), forcesSize, pmeGPU->archSpecific->pmeStream);
}

void pme_gpu_copy_output_forces(const pme_gpu_t *pmeGPU, float *h_forces)
{
    GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU");
    const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
    GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
    cu_copy_D2H_async(h_forces, pmeGPU->kernelParams->atoms.d_forces, forcesSize, pmeGPU->archSpecific->pmeStream);
    cudaError_t stat = cudaEventRecord(pmeGPU->archSpecific->syncForcesD2H, pmeGPU->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME gather forces synchronization failure");
}

void pme_gpu_sync_output_forces(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncForcesD2H);
    CU_RET_ERR(stat, "Error while waiting for the PME GPU forces");
}

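/* (Re-)allocates the device coordinate buffer. When padding is enabled, the
 * region between nAtoms and nAtomsAlloc is zeroed, presumably so that the
 * spread kernels can safely read past the last real atom.
 */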
void pme_gpu_realloc_coordinates(const pme_gpu_t *pmeGPU)
{
    const size_t newCoordinatesSize = pmeGPU->nAtomsAlloc * DIM;
    GMX_ASSERT(newCoordinatesSize > 0, "Bad number of atoms in PME GPU");
    cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_coordinates, nullptr, sizeof(float),
                        &pmeGPU->archSpecific->coordinatesSize, &pmeGPU->archSpecific->coordinatesSizeAlloc,
                        newCoordinatesSize, pmeGPU->archSpecific->pmeStream, true);
    if (c_usePadding)
    {
        const size_t paddingIndex = DIM * pmeGPU->kernelParams->atoms.nAtoms;
        const size_t paddingCount = DIM * pmeGPU->nAtomsAlloc - paddingIndex;
        if (paddingCount > 0)
        {
            cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->atoms.d_coordinates + paddingIndex, 0, paddingCount * sizeof(float), pmeGPU->archSpecific->pmeStream);
            CU_RET_ERR(stat, "PME failed to clear the padded coordinates");
        }
    }
}

void pme_gpu_copy_input_coordinates(const pme_gpu_t *pmeGPU, const rvec *h_coordinates)
{
    GMX_ASSERT(h_coordinates, "Bad host-side coordinate buffer in PME GPU");
#if GMX_DOUBLE
    GMX_RELEASE_ASSERT(false, "Only single precision is supported");
    GMX_UNUSED_VALUE(h_coordinates);
#else
    cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_coordinates, const_cast<rvec *>(h_coordinates),
                      pmeGPU->kernelParams->atoms.nAtoms * sizeof(rvec), pmeGPU->archSpecific->pmeStream);
#endif
}

void pme_gpu_free_coordinates(const pme_gpu_t *pmeGPU)
{
    cu_free_buffered(pmeGPU->kernelParams->atoms.d_coordinates, &pmeGPU->archSpecific->coordinatesSize, &pmeGPU->archSpecific->coordinatesSizeAlloc);
}

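/* (Re-)allocates the device charge buffer and copies the host charges into
 * it. When padding is enabled, the padded tail is zeroed so that spreading
 * padded atoms contributes nothing to the grid.
 */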
void pme_gpu_realloc_and_copy_input_coefficients(const pme_gpu_t *pmeGPU, const float *h_coefficients)
{
    GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU");
    const size_t newCoefficientsSize = pmeGPU->nAtomsAlloc;
    GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
    cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_coefficients, nullptr, sizeof(float),
                        &pmeGPU->archSpecific->coefficientsSize, &pmeGPU->archSpecific->coefficientsSizeAlloc,
                        newCoefficientsSize, pmeGPU->archSpecific->pmeStream, true);
    cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_coefficients, const_cast<float *>(h_coefficients),
                      pmeGPU->kernelParams->atoms.nAtoms * sizeof(float), pmeGPU->archSpecific->pmeStream);
    if (c_usePadding)
    {
        const size_t paddingIndex = pmeGPU->kernelParams->atoms.nAtoms;
        const size_t paddingCount = pmeGPU->nAtomsAlloc - paddingIndex;
        if (paddingCount > 0)
        {
            cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->atoms.d_coefficients + paddingIndex, 0, paddingCount * sizeof(float), pmeGPU->archSpecific->pmeStream);
            CU_RET_ERR(stat, "PME failed to clear the padded charges");
        }
    }
}

void pme_gpu_free_coefficients(const pme_gpu_t *pmeGPU)
{
    cu_free_buffered(pmeGPU->kernelParams->atoms.d_coefficients, &pmeGPU->archSpecific->coefficientsSize, &pmeGPU->archSpecific->coefficientsSizeAlloc);
}

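/* (Re-)allocates the two equally-sized spline parameter arrays (theta and
 * dtheta) on both device and host. The atom count is first rounded up to a
 * multiple of the per-warp atom count; e.g. with an illustrative 4 atoms per
 * warp, 10 allocated atoms pad to 12, giving DIM * order * 12 floats per array.
 */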
void pme_gpu_realloc_spline_data(const pme_gpu_t *pmeGPU)
{
    const int    order             = pmeGPU->common->pme_order;
    const int    alignment         = pme_gpu_get_atoms_per_warp(pmeGPU);
    const size_t nAtomsPadded      = ((pmeGPU->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const int    newSplineDataSize = DIM * order * nAtomsPadded;
    GMX_ASSERT(newSplineDataSize > 0, "Bad number of atoms in PME GPU");
    /* Two arrays of the same size */
    const bool shouldRealloc        = (newSplineDataSize > pmeGPU->archSpecific->splineDataSize);
    int        currentSizeTemp      = pmeGPU->archSpecific->splineDataSize;
    int        currentSizeTempAlloc = pmeGPU->archSpecific->splineDataSizeAlloc;
    cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_theta, nullptr, sizeof(float),
                        &currentSizeTemp, &currentSizeTempAlloc, newSplineDataSize, pmeGPU->archSpecific->pmeStream, true);
    cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_dtheta, nullptr, sizeof(float),
                        &pmeGPU->archSpecific->splineDataSize, &pmeGPU->archSpecific->splineDataSizeAlloc, newSplineDataSize, pmeGPU->archSpecific->pmeStream, true);
    // the host side reallocation
    if (shouldRealloc)
    {
        pfree(pmeGPU->staging.h_theta);
        pmalloc((void **)&pmeGPU->staging.h_theta, newSplineDataSize * sizeof(float));
        pfree(pmeGPU->staging.h_dtheta);
        pmalloc((void **)&pmeGPU->staging.h_dtheta, newSplineDataSize * sizeof(float));
    }
}

void pme_gpu_free_spline_data(const pme_gpu_t *pmeGPU)
{
    /* Two arrays of the same size */
    cu_free_buffered(pmeGPU->kernelParams->atoms.d_theta);
    cu_free_buffered(pmeGPU->kernelParams->atoms.d_dtheta, &pmeGPU->archSpecific->splineDataSize, &pmeGPU->archSpecific->splineDataSizeAlloc);
    pfree(pmeGPU->staging.h_theta);
    pfree(pmeGPU->staging.h_dtheta);
}

void pme_gpu_realloc_grid_indices(const pme_gpu_t *pmeGPU)
{
    const size_t newIndicesSize = DIM * pmeGPU->nAtomsAlloc;
    GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
    cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_gridlineIndices, nullptr, sizeof(int),
                        &pmeGPU->archSpecific->gridlineIndicesSize, &pmeGPU->archSpecific->gridlineIndicesSizeAlloc,
                        newIndicesSize, pmeGPU->archSpecific->pmeStream, true);
    pfree(pmeGPU->staging.h_gridlineIndices);
    pmalloc((void **)&pmeGPU->staging.h_gridlineIndices, newIndicesSize * sizeof(int));
}

void pme_gpu_free_grid_indices(const pme_gpu_t *pmeGPU)
{
    cu_free_buffered(pmeGPU->kernelParams->atoms.d_gridlineIndices, &pmeGPU->archSpecific->gridlineIndicesSize, &pmeGPU->archSpecific->gridlineIndicesSizeAlloc);
    pfree(pmeGPU->staging.h_gridlineIndices);
}

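/* (Re-)allocates the real-space and Fourier-space grids. With out-of-place
 * FFT these are two separate buffers; otherwise a single buffer sized to fit
 * the larger of the two is shared by both pointers.
 */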
void pme_gpu_realloc_grids(pme_gpu_t *pmeGPU)
{
    auto     *kernelParamsPtr = pmeGPU->kernelParams.get();
    const int newRealGridSize = kernelParamsPtr->grid.realGridSizePadded[XX] *
                                kernelParamsPtr->grid.realGridSizePadded[YY] *
                                kernelParamsPtr->grid.realGridSizePadded[ZZ];
    const int newComplexGridSize = kernelParamsPtr->grid.complexGridSizePadded[XX] *
                                   kernelParamsPtr->grid.complexGridSizePadded[YY] *
                                   kernelParamsPtr->grid.complexGridSizePadded[ZZ] * 2;
    // Multiplied by 2 because we count the complex grid size in complex numbers, but all allocations/pointers are float
    if (pmeGPU->archSpecific->performOutOfPlaceFFT)
    {
        /* 2 separate grids */
        cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_fourierGrid, nullptr, sizeof(float),
                            &pmeGPU->archSpecific->complexGridSize, &pmeGPU->archSpecific->complexGridSizeAlloc,
                            newComplexGridSize, pmeGPU->archSpecific->pmeStream, true);
        cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_realGrid, nullptr, sizeof(float),
                            &pmeGPU->archSpecific->realGridSize, &pmeGPU->archSpecific->realGridSizeAlloc,
                            newRealGridSize, pmeGPU->archSpecific->pmeStream, true);
    }
    else
    {
        /* A single buffer so that any grid will fit */
        const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
        cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_realGrid, nullptr, sizeof(float),
                            &pmeGPU->archSpecific->realGridSize, &pmeGPU->archSpecific->realGridSizeAlloc,
                            newGridsSize, pmeGPU->archSpecific->pmeStream, true);
        kernelParamsPtr->grid.d_fourierGrid   = kernelParamsPtr->grid.d_realGrid;
        pmeGPU->archSpecific->complexGridSize = pmeGPU->archSpecific->realGridSize;
        // the size might get used later for copying the grid
    }
}

void pme_gpu_free_grids(const pme_gpu_t *pmeGPU)
{
    if (pmeGPU->archSpecific->performOutOfPlaceFFT)
    {
        cu_free_buffered(pmeGPU->kernelParams->grid.d_fourierGrid);
    }
    cu_free_buffered(pmeGPU->kernelParams->grid.d_realGrid,
                     &pmeGPU->archSpecific->realGridSize, &pmeGPU->archSpecific->realGridSizeAlloc);
}

void pme_gpu_clear_grids(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->grid.d_realGrid, 0,
                                       pmeGPU->archSpecific->realGridSize * sizeof(float), pmeGPU->archSpecific->pmeStream);
    /* Should the complex grid be cleared in some weird case? */
    CU_RET_ERR(stat, "cudaMemsetAsync on the PME grid error");
}

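/* Frees and re-creates the fractional shift and gridline index lookup
 * tables, laying out the X, Y and Z tables back-to-back in single device
 * buffers (optionally accessed through textures).
 */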
void pme_gpu_realloc_and_copy_fract_shifts(pme_gpu_t *pmeGPU)
{
    pme_gpu_free_fract_shifts(pmeGPU);

    auto *kernelParamsPtr = pmeGPU->kernelParams.get();

    const int nx                  = kernelParamsPtr->grid.realGridSize[XX];
    const int ny                  = kernelParamsPtr->grid.realGridSize[YY];
    const int nz                  = kernelParamsPtr->grid.realGridSize[ZZ];
    const int cellCount           = c_pmeNeighborUnitcellCount;
    const int gridDataOffset[DIM] = {0, cellCount * nx, cellCount * (nx + ny)};

    memcpy(kernelParamsPtr->grid.tablesOffsets, &gridDataOffset, sizeof(gridDataOffset));

    const int newFractShiftsSize = cellCount * (nx + ny + nz);

    initParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
                         kernelParamsPtr->fractShiftsTableTexture,
                         &pme_gpu_get_fract_shifts_texref(),
                         pmeGPU->common->fsh.data(),
                         newFractShiftsSize,
                         pmeGPU->deviceInfo);

    initParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
                         kernelParamsPtr->gridlineIndicesTableTexture,
                         &pme_gpu_get_gridline_texref(),
                         pmeGPU->common->nn.data(),
                         newFractShiftsSize,
                         pmeGPU->deviceInfo);
}

void pme_gpu_free_fract_shifts(const pme_gpu_t *pmeGPU)
{
    auto *kernelParamsPtr = pmeGPU->kernelParams.get();
    destroyParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
                            kernelParamsPtr->fractShiftsTableTexture,
                            &pme_gpu_get_fract_shifts_texref(),
                            pmeGPU->deviceInfo);
    destroyParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
                            kernelParamsPtr->gridlineIndicesTableTexture,
                            &pme_gpu_get_gridline_texref(),
                            pmeGPU->deviceInfo);
}

void pme_gpu_sync_output_energy_virial(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncEnerVirD2H);
    CU_RET_ERR(stat, "Error while waiting for PME solve output");

    for (int j = 0; j < c_virialAndEnergyCount; j++)
    {
        GMX_ASSERT(std::isfinite(pmeGPU->staging.h_virialAndEnergy[j]), "PME GPU produces incorrect energy/virial.");
    }
}

void pme_gpu_copy_input_gather_grid(const pme_gpu_t *pmeGpu, float *h_grid)
{
    const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
    cu_copy_H2D_async(pmeGpu->kernelParams->grid.d_realGrid, h_grid, gridSize, pmeGpu->archSpecific->pmeStream);
}

void pme_gpu_copy_output_spread_grid(const pme_gpu_t *pmeGpu, float *h_grid)
{
    const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
    cu_copy_D2H_async(h_grid, pmeGpu->kernelParams->grid.d_realGrid, gridSize, pmeGpu->archSpecific->pmeStream);
    cudaError_t stat = cudaEventRecord(pmeGpu->archSpecific->syncSpreadGridD2H, pmeGpu->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME spread grid sync event record failure");
}

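/* Copies the spread output (spline values, spline derivatives, and gridline
 * indices) from the device to the host staging buffers, and records an event
 * so the host can later wait for the transfers.
 */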
void pme_gpu_copy_output_spread_atom_data(const pme_gpu_t *pmeGpu)
{
    const int    alignment       = pme_gpu_get_atoms_per_warp(pmeGpu);
    const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const size_t splinesSize     = DIM * nAtomsPadded * pmeGpu->common->pme_order * sizeof(float);
    auto        *kernelParamsPtr = pmeGpu->kernelParams.get();
    cu_copy_D2H_async(pmeGpu->staging.h_dtheta, kernelParamsPtr->atoms.d_dtheta, splinesSize, pmeGpu->archSpecific->pmeStream);
    cu_copy_D2H_async(pmeGpu->staging.h_theta, kernelParamsPtr->atoms.d_theta, splinesSize, pmeGpu->archSpecific->pmeStream);
    cu_copy_D2H_async(pmeGpu->staging.h_gridlineIndices, kernelParamsPtr->atoms.d_gridlineIndices,
                      kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->archSpecific->pmeStream);
    cudaError_t stat = cudaEventRecord(pmeGpu->archSpecific->syncSplineAtomDataD2H, pmeGpu->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME spread atom data sync event record failure");
}

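/* Uploads host staging spline data to the device for a gather that reuses
 * spread results computed elsewhere. When padding is enabled, the device
 * buffers are cleared first so the padded regions are well defined.
 */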
void pme_gpu_copy_input_gather_atom_data(const pme_gpu_t *pmeGpu)
{
    const int    alignment       = pme_gpu_get_atoms_per_warp(pmeGpu);
    const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const size_t splinesSize     = DIM * nAtomsPadded * pmeGpu->common->pme_order * sizeof(float);
    auto        *kernelParamsPtr = pmeGpu->kernelParams.get();
    if (c_usePadding)
    {
        const size_t gridlineIndicesSizePerAtom = DIM * sizeof(int);
        const size_t splineDataSizePerAtom      = pmeGpu->common->pme_order * DIM * sizeof(float);
        // TODO: could clear only the padding and not the whole thing, but this is a test-exclusive code anyway
        CU_RET_ERR(cudaMemsetAsync(kernelParamsPtr->atoms.d_gridlineIndices, 0, pmeGpu->nAtomsAlloc * gridlineIndicesSizePerAtom, pmeGpu->archSpecific->pmeStream),
                   "PME failed to clear the gridline indices");
        CU_RET_ERR(cudaMemsetAsync(kernelParamsPtr->atoms.d_dtheta, 0, pmeGpu->nAtomsAlloc * splineDataSizePerAtom, pmeGpu->archSpecific->pmeStream),
                   "PME failed to clear the spline derivatives");
        CU_RET_ERR(cudaMemsetAsync(kernelParamsPtr->atoms.d_theta, 0, pmeGpu->nAtomsAlloc * splineDataSizePerAtom, pmeGpu->archSpecific->pmeStream),
                   "PME failed to clear the spline values");
    }
    cu_copy_H2D_async(kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, splinesSize, pmeGpu->archSpecific->pmeStream);
    cu_copy_H2D_async(kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, splinesSize, pmeGpu->archSpecific->pmeStream);
    cu_copy_H2D_async(kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices,
                      kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->archSpecific->pmeStream);
}

void pme_gpu_sync_spread_grid(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncSpreadGridD2H);
    CU_RET_ERR(stat, "Error while waiting for the PME GPU spread grid to be copied to the host");
}

void pme_gpu_sync_spline_atom_data(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncSplineAtomDataD2H);
    CU_RET_ERR(stat, "Error while waiting for the PME GPU atom data to be copied to the host");
}

void pme_gpu_sync_solve_grid(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncSolveGridD2H);
    CU_RET_ERR(stat, "Error while waiting for the PME GPU solve grid to be copied to the host");
    // TODO: should this check for pme_gpu_performs_solve(pmeGPU)?
}

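/* Allocates the GPU-specific structures and creates the (prioritized, where
 * supported) PME CUDA stream; also selects out-of-place FFT and decides
 * whether GPU timing events are enabled.
 */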
void pme_gpu_init_internal(pme_gpu_t *pmeGPU)
{
    /* Allocate the target-specific structures */
    pmeGPU->archSpecific.reset(new pme_gpu_specific_t());
    pmeGPU->kernelParams.reset(new pme_gpu_kernel_params_t());

    pmeGPU->archSpecific->performOutOfPlaceFFT = true;
    /* This should give better performance, according to the cuFFT documentation.
     * The performance seems to be the same though.
     * TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7).
     */

    pmeGPU->archSpecific->useTiming = (getenv("GMX_DISABLE_CUDA_TIMING") == nullptr) &&
                                      (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
    /* TODO: multiple CUDA streams on same GPU cause nonsense cudaEvent_t timings.
     * This should probably also check for gpuId exclusivity?
     */

    /* Creating a PME CUDA stream */
    cudaError_t stat;
    int         highest_priority, lowest_priority;
    stat = cudaDeviceGetStreamPriorityRange(&lowest_priority, &highest_priority);
    CU_RET_ERR(stat, "PME cudaDeviceGetStreamPriorityRange failed");
    stat = cudaStreamCreateWithPriority(&pmeGPU->archSpecific->pmeStream,
                                        cudaStreamDefault, //cudaStreamNonBlocking,
                                        highest_priority);
    CU_RET_ERR(stat, "cudaStreamCreateWithPriority on the PME stream failed");
}

void pme_gpu_destroy_specific(const pme_gpu_t *pmeGPU)
{
    /* Destroy the CUDA stream */
    cudaError_t stat = cudaStreamDestroy(pmeGPU->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME cudaStreamDestroy error");
}

void pme_gpu_init_sync_events(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat;
    const auto  eventFlags = cudaEventDisableTiming;
    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncEnerVirD2H, eventFlags);
    CU_RET_ERR(stat, "cudaEventCreate on syncEnerVirD2H failed");
    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncForcesD2H, eventFlags);
    CU_RET_ERR(stat, "cudaEventCreate on syncForcesD2H failed");
    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSpreadGridD2H, eventFlags);
    CU_RET_ERR(stat, "cudaEventCreate on syncSpreadGridD2H failed");
    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSplineAtomDataD2H, eventFlags);
    CU_RET_ERR(stat, "cudaEventCreate on syncSplineAtomDataD2H failed");
    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSolveGridD2H, eventFlags);
    CU_RET_ERR(stat, "cudaEventCreate on syncSolveGridD2H failed");
}

void pme_gpu_destroy_sync_events(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat;
    stat = cudaEventDestroy(pmeGPU->archSpecific->syncEnerVirD2H);
    CU_RET_ERR(stat, "cudaEventDestroy failed on syncEnerVirD2H");
    stat = cudaEventDestroy(pmeGPU->archSpecific->syncForcesD2H);
    CU_RET_ERR(stat, "cudaEventDestroy failed on syncForcesD2H");
    stat = cudaEventDestroy(pmeGPU->archSpecific->syncSpreadGridD2H);
    CU_RET_ERR(stat, "cudaEventDestroy failed on syncSpreadGridD2H");
    stat = cudaEventDestroy(pmeGPU->archSpecific->syncSplineAtomDataD2H);
    CU_RET_ERR(stat, "cudaEventDestroy failed on syncSplineAtomDataD2H");
    stat = cudaEventDestroy(pmeGPU->archSpecific->syncSolveGridD2H);
    CU_RET_ERR(stat, "cudaEventDestroy failed on syncSolveGridD2H");
}

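/* (Re-)creates the GPU 3D-FFT setups, one per grid, but only when this rank
 * actually performs the FFT on the GPU.
 */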
void pme_gpu_reinit_3dfft(const pme_gpu_t *pmeGPU)
{
    if (pme_gpu_performs_FFT(pmeGPU))
    {
        pmeGPU->archSpecific->fftSetup.resize(0);
        for (int i = 0; i < pmeGPU->common->ngrids; i++)
        {
            pmeGPU->archSpecific->fftSetup.push_back(std::unique_ptr<GpuParallel3dFft>(new GpuParallel3dFft(pmeGPU)));
        }
    }
}

void pme_gpu_destroy_3dfft(const pme_gpu_t *pmeGPU)
{
    pmeGPU->archSpecific->fftSetup.resize(0);
}