/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief This file contains internal CUDA function implementations
 * for performing the PME calculations on GPU.
 *
 * \author Aleksei Iupinov <a.yupinov@gmail.com>
 */
50 #include "gromacs/gpu_utils/cudautils.cuh"
51 #include "gromacs/gpu_utils/pmalloc_cuda.h"
52 #include "gromacs/utility/gmxassert.h"
53 #include "gromacs/utility/smalloc.h"
56 #include "pme-3dfft.cuh"
int pme_gpu_get_atom_data_alignment(const pme_gpu_t *pmeGPU)
{
    const int order = pmeGPU->common->pme_order;
    GMX_ASSERT(order > 0, "Invalid PME order");
    return PME_ATOM_DATA_ALIGNMENT;
}

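/* Returns the number of atoms processed together by one warp in the
 * spread/gather kernels; like the data alignment above, this is currently a
 * compile-time constant.
 */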
int pme_gpu_get_atoms_per_warp(const pme_gpu_t *pmeGPU)
{
    const int order = pmeGPU->common->pme_order;
    GMX_ASSERT(order > 0, "Invalid PME order");
    return PME_SPREADGATHER_ATOMS_PER_WARP;
}

void pme_gpu_synchronize(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaStreamSynchronize(pmeGPU->archSpecific->pmeStream);
    CU_RET_ERR(stat, "Failed to synchronize the PME GPU stream!");
}

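/* Allocates the device buffer for the reduced virial and energy, plus the
 * matching pinned host staging buffer (c_virialAndEnergyCount floats each).
 */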
void pme_gpu_alloc_energy_virial(const pme_gpu_t *pmeGPU)
{
    const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float);
    cudaError_t  stat                = cudaMalloc((void **)&pmeGPU->kernelParams->constants.d_virialAndEnergy, energyAndVirialSize);
    CU_RET_ERR(stat, "cudaMalloc failed on PME energy and virial");
    pmalloc((void **)&pmeGPU->staging.h_virialAndEnergy, energyAndVirialSize);
}

void pme_gpu_free_energy_virial(pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaFree(pmeGPU->kernelParams->constants.d_virialAndEnergy);
    CU_RET_ERR(stat, "cudaFree failed on PME energy and virial");
    pmeGPU->kernelParams->constants.d_virialAndEnergy = nullptr;
    pfree(pmeGPU->staging.h_virialAndEnergy);
    pmeGPU->staging.h_virialAndEnergy = nullptr;
}

void pme_gpu_clear_energy_virial(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->constants.d_virialAndEnergy, 0,
                                       c_virialAndEnergyCount * sizeof(float), pmeGPU->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME energy/virial cudaMemsetAsync error");
}

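/* (Re-)allocates the device B-spline moduli buffer and copies the three
 * per-dimension moduli arrays into it back-to-back; the per-dimension
 * offsets are stored in the kernel parameters. The host staging buffer is
 * only reallocated when the required size grows.
 */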
void pme_gpu_realloc_and_copy_bspline_values(const pme_gpu_t *pmeGPU)
{
    const int splineValuesOffset[DIM] = {
        0,
        pmeGPU->kernelParams->grid.realGridSize[XX],
        pmeGPU->kernelParams->grid.realGridSize[XX] + pmeGPU->kernelParams->grid.realGridSize[YY]
    };
    memcpy((void *)&pmeGPU->kernelParams->grid.splineValuesOffset, &splineValuesOffset, sizeof(splineValuesOffset));

    const int newSplineValuesSize = pmeGPU->kernelParams->grid.realGridSize[XX] +
                                    pmeGPU->kernelParams->grid.realGridSize[YY] +
                                    pmeGPU->kernelParams->grid.realGridSize[ZZ];
    const bool shouldRealloc = (newSplineValuesSize > pmeGPU->archSpecific->splineValuesSize);
    cu_realloc_buffered((void **)&pmeGPU->kernelParams->grid.d_splineModuli, nullptr, sizeof(float),
                        &pmeGPU->archSpecific->splineValuesSize, &pmeGPU->archSpecific->splineValuesSizeAlloc,
                        newSplineValuesSize, pmeGPU->archSpecific->pmeStream, true);
    if (shouldRealloc)
    {
        /* Reallocate the host buffer */
        pfree(pmeGPU->staging.h_splineModuli);
        pmalloc((void **)&pmeGPU->staging.h_splineModuli, newSplineValuesSize * sizeof(float));
    }
    for (int i = 0; i < DIM; i++)
    {
        memcpy(pmeGPU->staging.h_splineModuli + splineValuesOffset[i], pmeGPU->common->bsp_mod[i].data(), pmeGPU->common->bsp_mod[i].size() * sizeof(float));
    }
    /* TODO: pin original buffer instead! */
    cu_copy_H2D_async(pmeGPU->kernelParams->grid.d_splineModuli, pmeGPU->staging.h_splineModuli,
                      newSplineValuesSize * sizeof(float), pmeGPU->archSpecific->pmeStream);
}

void pme_gpu_free_bspline_values(const pme_gpu_t *pmeGPU)
{
    pfree(pmeGPU->staging.h_splineModuli);
    cu_free_buffered(pmeGPU->kernelParams->grid.d_splineModuli, &pmeGPU->archSpecific->splineValuesSize,
                     &pmeGPU->archSpecific->splineValuesSizeAlloc);
}

void pme_gpu_realloc_forces(const pme_gpu_t *pmeGPU)
{
    const size_t newForcesSize = pmeGPU->nAtomsAlloc * DIM;
    GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
    cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_forces, nullptr, sizeof(float),
                        &pmeGPU->archSpecific->forcesSize, &pmeGPU->archSpecific->forcesSizeAlloc,
                        newForcesSize, pmeGPU->archSpecific->pmeStream, true);
}

void pme_gpu_free_forces(const pme_gpu_t *pmeGPU)
{
    cu_free_buffered(pmeGPU->kernelParams->atoms.d_forces, &pmeGPU->archSpecific->forcesSize, &pmeGPU->archSpecific->forcesSizeAlloc);
}

void pme_gpu_copy_input_forces(const pme_gpu_t *pmeGPU, const float *h_forces)
{
    GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU");
    const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
    GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
    cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_forces, const_cast<float *>(h_forces), forcesSize, pmeGPU->archSpecific->pmeStream);
}

void pme_gpu_copy_output_forces(const pme_gpu_t *pmeGPU, float *h_forces)
{
    GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU");
    const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
    GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
    cu_copy_D2H_async(h_forces, pmeGPU->kernelParams->atoms.d_forces, forcesSize, pmeGPU->archSpecific->pmeStream);
    cudaError_t stat = cudaEventRecord(pmeGPU->archSpecific->syncForcesD2H, pmeGPU->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME gather forces synchronization failure");
}

void pme_gpu_sync_output_forces(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncForcesD2H);
    CU_RET_ERR(stat, "Error while waiting for the PME GPU forces");
}

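/* (Re-)allocates the device coordinate buffer. When padding is enabled, the
 * region between nAtoms and nAtomsAlloc is zeroed, presumably so that the
 * spread kernels can safely read past the last real atom.
 */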
void pme_gpu_realloc_coordinates(const pme_gpu_t *pmeGPU)
{
    const size_t newCoordinatesSize = pmeGPU->nAtomsAlloc * DIM;
    GMX_ASSERT(newCoordinatesSize > 0, "Bad number of atoms in PME GPU");
    cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_coordinates, nullptr, sizeof(float),
                        &pmeGPU->archSpecific->coordinatesSize, &pmeGPU->archSpecific->coordinatesSizeAlloc,
                        newCoordinatesSize, pmeGPU->archSpecific->pmeStream, true);
    if (c_usePadding)
    {
        const size_t paddingIndex = DIM * pmeGPU->kernelParams->atoms.nAtoms;
        const size_t paddingCount = DIM * pmeGPU->nAtomsAlloc - paddingIndex;
        if (paddingCount > 0)
        {
            cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->atoms.d_coordinates + paddingIndex, 0, paddingCount * sizeof(float), pmeGPU->archSpecific->pmeStream);
            CU_RET_ERR(stat, "PME failed to clear the padded coordinates");
        }
    }
}

void pme_gpu_copy_input_coordinates(const pme_gpu_t *pmeGPU, const rvec *h_coordinates)
{
    GMX_ASSERT(h_coordinates, "Bad host-side coordinate buffer in PME GPU");
#if GMX_DOUBLE
    GMX_RELEASE_ASSERT(false, "Only single precision is supported");
    GMX_UNUSED_VALUE(h_coordinates);
#else
    cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_coordinates, const_cast<rvec *>(h_coordinates),
                      pmeGPU->kernelParams->atoms.nAtoms * sizeof(rvec), pmeGPU->archSpecific->pmeStream);
#endif
}

void pme_gpu_free_coordinates(const pme_gpu_t *pmeGPU)
{
    cu_free_buffered(pmeGPU->kernelParams->atoms.d_coordinates, &pmeGPU->archSpecific->coordinatesSize, &pmeGPU->archSpecific->coordinatesSizeAlloc);
}

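/* (Re-)allocates the device charge buffer and copies the host charges into
 * it. When padding is enabled, the padded tail is zeroed so that spreading
 * padded atoms contributes nothing to the grid.
 */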
void pme_gpu_realloc_and_copy_input_coefficients(const pme_gpu_t *pmeGPU, const float *h_coefficients)
{
    GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU");
    const size_t newCoefficientsSize = pmeGPU->nAtomsAlloc;
    GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
    cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_coefficients, nullptr, sizeof(float),
                        &pmeGPU->archSpecific->coefficientsSize, &pmeGPU->archSpecific->coefficientsSizeAlloc,
                        newCoefficientsSize, pmeGPU->archSpecific->pmeStream, true);
    cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_coefficients, const_cast<float *>(h_coefficients),
                      pmeGPU->kernelParams->atoms.nAtoms * sizeof(float), pmeGPU->archSpecific->pmeStream);
    if (c_usePadding)
    {
        const size_t paddingIndex = pmeGPU->kernelParams->atoms.nAtoms;
        const size_t paddingCount = pmeGPU->nAtomsAlloc - paddingIndex;
        if (paddingCount > 0)
        {
            cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->atoms.d_coefficients + paddingIndex, 0, paddingCount * sizeof(float), pmeGPU->archSpecific->pmeStream);
            CU_RET_ERR(stat, "PME failed to clear the padded charges");
        }
    }
}

void pme_gpu_free_coefficients(const pme_gpu_t *pmeGPU)
{
    cu_free_buffered(pmeGPU->kernelParams->atoms.d_coefficients, &pmeGPU->archSpecific->coefficientsSize, &pmeGPU->archSpecific->coefficientsSizeAlloc);
}

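/* (Re-)allocates the two equally-sized spline parameter arrays (theta and
 * dtheta) on both device and host. The atom count is first rounded up to a
 * multiple of the per-warp atom count; e.g. with an illustrative 4 atoms per
 * warp, 10 allocated atoms pad to 12, giving DIM * order * 12 floats per array.
 */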
void pme_gpu_realloc_spline_data(const pme_gpu_t *pmeGPU)
{
    const int    order             = pmeGPU->common->pme_order;
    const int    alignment         = pme_gpu_get_atoms_per_warp(pmeGPU);
    const size_t nAtomsPadded      = ((pmeGPU->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const int    newSplineDataSize = DIM * order * nAtomsPadded;
    GMX_ASSERT(newSplineDataSize > 0, "Bad number of atoms in PME GPU");
    /* Two arrays of the same size */
    const bool shouldRealloc        = (newSplineDataSize > pmeGPU->archSpecific->splineDataSize);
    int        currentSizeTemp      = pmeGPU->archSpecific->splineDataSize;
    int        currentSizeTempAlloc = pmeGPU->archSpecific->splineDataSizeAlloc;
    cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_theta, nullptr, sizeof(float),
                        &currentSizeTemp, &currentSizeTempAlloc, newSplineDataSize, pmeGPU->archSpecific->pmeStream, true);
    cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_dtheta, nullptr, sizeof(float),
                        &pmeGPU->archSpecific->splineDataSize, &pmeGPU->archSpecific->splineDataSizeAlloc, newSplineDataSize, pmeGPU->archSpecific->pmeStream, true);
    // the host side reallocation
    if (shouldRealloc)
    {
        pfree(pmeGPU->staging.h_theta);
        pmalloc((void **)&pmeGPU->staging.h_theta, newSplineDataSize * sizeof(float));
        pfree(pmeGPU->staging.h_dtheta);
        pmalloc((void **)&pmeGPU->staging.h_dtheta, newSplineDataSize * sizeof(float));
    }
}

void pme_gpu_free_spline_data(const pme_gpu_t *pmeGPU)
{
    /* Two arrays of the same size */
    cu_free_buffered(pmeGPU->kernelParams->atoms.d_theta);
    cu_free_buffered(pmeGPU->kernelParams->atoms.d_dtheta, &pmeGPU->archSpecific->splineDataSize, &pmeGPU->archSpecific->splineDataSizeAlloc);
    pfree(pmeGPU->staging.h_theta);
    pfree(pmeGPU->staging.h_dtheta);
}

void pme_gpu_realloc_grid_indices(const pme_gpu_t *pmeGPU)
{
    const size_t newIndicesSize = DIM * pmeGPU->nAtomsAlloc;
    GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
    cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_gridlineIndices, nullptr, sizeof(int),
                        &pmeGPU->archSpecific->gridlineIndicesSize, &pmeGPU->archSpecific->gridlineIndicesSizeAlloc,
                        newIndicesSize, pmeGPU->archSpecific->pmeStream, true);
    pfree(pmeGPU->staging.h_gridlineIndices);
    pmalloc((void **)&pmeGPU->staging.h_gridlineIndices, newIndicesSize * sizeof(int));
}

void pme_gpu_free_grid_indices(const pme_gpu_t *pmeGPU)
{
    cu_free_buffered(pmeGPU->kernelParams->atoms.d_gridlineIndices, &pmeGPU->archSpecific->gridlineIndicesSize, &pmeGPU->archSpecific->gridlineIndicesSizeAlloc);
    pfree(pmeGPU->staging.h_gridlineIndices);
}

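/* (Re-)allocates the real-space and Fourier-space grids. With out-of-place
 * FFT these are two separate buffers; otherwise a single buffer sized to fit
 * the larger of the two is shared by both pointers.
 */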
void pme_gpu_realloc_grids(pme_gpu_t *pmeGPU)
{
    auto     *kernelParamsPtr = pmeGPU->kernelParams.get();
    const int newRealGridSize = kernelParamsPtr->grid.realGridSizePadded[XX] *
                                kernelParamsPtr->grid.realGridSizePadded[YY] *
                                kernelParamsPtr->grid.realGridSizePadded[ZZ];
    const int newComplexGridSize = kernelParamsPtr->grid.complexGridSizePadded[XX] *
                                   kernelParamsPtr->grid.complexGridSizePadded[YY] *
                                   kernelParamsPtr->grid.complexGridSizePadded[ZZ] * 2;
    // Multiplied by 2 because we count the complex grid size in complex numbers, but all allocations/pointers are float
    if (pmeGPU->archSpecific->performOutOfPlaceFFT)
    {
        /* 2 separate grids */
        cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_fourierGrid, nullptr, sizeof(float),
                            &pmeGPU->archSpecific->complexGridSize, &pmeGPU->archSpecific->complexGridSizeAlloc,
                            newComplexGridSize, pmeGPU->archSpecific->pmeStream, true);
        cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_realGrid, nullptr, sizeof(float),
                            &pmeGPU->archSpecific->realGridSize, &pmeGPU->archSpecific->realGridSizeAlloc,
                            newRealGridSize, pmeGPU->archSpecific->pmeStream, true);
    }
    else
    {
        /* A single buffer so that any grid will fit */
        const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
        cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_realGrid, nullptr, sizeof(float),
                            &pmeGPU->archSpecific->realGridSize, &pmeGPU->archSpecific->realGridSizeAlloc,
                            newGridsSize, pmeGPU->archSpecific->pmeStream, true);
        kernelParamsPtr->grid.d_fourierGrid   = kernelParamsPtr->grid.d_realGrid;
        pmeGPU->archSpecific->complexGridSize = pmeGPU->archSpecific->realGridSize;
        // the size might get used later for copying the grid
    }
}

void pme_gpu_free_grids(const pme_gpu_t *pmeGPU)
{
    if (pmeGPU->archSpecific->performOutOfPlaceFFT)
    {
        cu_free_buffered(pmeGPU->kernelParams->grid.d_fourierGrid);
    }
    cu_free_buffered(pmeGPU->kernelParams->grid.d_realGrid,
                     &pmeGPU->archSpecific->realGridSize, &pmeGPU->archSpecific->realGridSizeAlloc);
}

void pme_gpu_clear_grids(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->grid.d_realGrid, 0,
                                       pmeGPU->archSpecific->realGridSize * sizeof(float), pmeGPU->archSpecific->pmeStream);
    /* Should the complex grid be cleared in some weird case? */
    CU_RET_ERR(stat, "cudaMemsetAsync on the PME grid error");
}

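/* Frees and re-creates the fractional shift and gridline index lookup
 * tables, laying out the X, Y and Z tables back-to-back in single device
 * buffers (optionally accessed through textures).
 */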
void pme_gpu_realloc_and_copy_fract_shifts(pme_gpu_t *pmeGPU)
{
    pme_gpu_free_fract_shifts(pmeGPU);

    auto *kernelParamsPtr = pmeGPU->kernelParams.get();

    const int nx                  = kernelParamsPtr->grid.realGridSize[XX];
    const int ny                  = kernelParamsPtr->grid.realGridSize[YY];
    const int nz                  = kernelParamsPtr->grid.realGridSize[ZZ];
    const int cellCount           = c_pmeNeighborUnitcellCount;
    const int gridDataOffset[DIM] = {0, cellCount * nx, cellCount * (nx + ny)};

    memcpy(kernelParamsPtr->grid.tablesOffsets, &gridDataOffset, sizeof(gridDataOffset));

    const int newFractShiftsSize = cellCount * (nx + ny + nz);

    initParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
                         kernelParamsPtr->fractShiftsTableTexture,
                         &pme_gpu_get_fract_shifts_texref(),
                         pmeGPU->common->fsh.data(),
                         newFractShiftsSize,
                         pmeGPU->deviceInfo);

    initParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
                         kernelParamsPtr->gridlineIndicesTableTexture,
                         &pme_gpu_get_gridline_texref(),
                         pmeGPU->common->nn.data(),
                         newFractShiftsSize,
                         pmeGPU->deviceInfo);
}

void pme_gpu_free_fract_shifts(const pme_gpu_t *pmeGPU)
{
    auto *kernelParamsPtr = pmeGPU->kernelParams.get();
    destroyParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
                            kernelParamsPtr->fractShiftsTableTexture,
                            &pme_gpu_get_fract_shifts_texref(),
                            pmeGPU->deviceInfo);
    destroyParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
                            kernelParamsPtr->gridlineIndicesTableTexture,
                            &pme_gpu_get_gridline_texref(),
                            pmeGPU->deviceInfo);
}

void pme_gpu_sync_output_energy_virial(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncEnerVirD2H);
    CU_RET_ERR(stat, "Error while waiting for PME solve output");

    for (int j = 0; j < c_virialAndEnergyCount; j++)
    {
        GMX_ASSERT(std::isfinite(pmeGPU->staging.h_virialAndEnergy[j]), "PME GPU produces incorrect energy/virial.");
    }
}

void pme_gpu_copy_input_gather_grid(const pme_gpu_t *pmeGpu, float *h_grid)
{
    const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
    cu_copy_H2D_async(pmeGpu->kernelParams->grid.d_realGrid, h_grid, gridSize, pmeGpu->archSpecific->pmeStream);
}

void pme_gpu_copy_output_spread_grid(const pme_gpu_t *pmeGpu, float *h_grid)
{
    const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
    cu_copy_D2H_async(h_grid, pmeGpu->kernelParams->grid.d_realGrid, gridSize, pmeGpu->archSpecific->pmeStream);
    cudaError_t stat = cudaEventRecord(pmeGpu->archSpecific->syncSpreadGridD2H, pmeGpu->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME spread grid sync event record failure");
}

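/* Copies the spread output (spline values, spline derivatives, and gridline
 * indices) from the device to the host staging buffers, and records an event
 * so the host can later wait for the transfers.
 */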
void pme_gpu_copy_output_spread_atom_data(const pme_gpu_t *pmeGpu)
{
    const int    alignment       = pme_gpu_get_atoms_per_warp(pmeGpu);
    const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const size_t splinesSize     = DIM * nAtomsPadded * pmeGpu->common->pme_order * sizeof(float);
    auto        *kernelParamsPtr = pmeGpu->kernelParams.get();
    cu_copy_D2H_async(pmeGpu->staging.h_dtheta, kernelParamsPtr->atoms.d_dtheta, splinesSize, pmeGpu->archSpecific->pmeStream);
    cu_copy_D2H_async(pmeGpu->staging.h_theta, kernelParamsPtr->atoms.d_theta, splinesSize, pmeGpu->archSpecific->pmeStream);
    cu_copy_D2H_async(pmeGpu->staging.h_gridlineIndices, kernelParamsPtr->atoms.d_gridlineIndices,
                      kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->archSpecific->pmeStream);
    cudaError_t stat = cudaEventRecord(pmeGpu->archSpecific->syncSplineAtomDataD2H, pmeGpu->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME spread atom data sync event record failure");
}

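/* Uploads host staging spline data to the device for a gather that reuses
 * spread results computed elsewhere. When padding is enabled, the device
 * buffers are cleared first so the padded regions are well defined.
 */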
void pme_gpu_copy_input_gather_atom_data(const pme_gpu_t *pmeGpu)
{
    const int    alignment       = pme_gpu_get_atoms_per_warp(pmeGpu);
    const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const size_t splinesSize     = DIM * nAtomsPadded * pmeGpu->common->pme_order * sizeof(float);
    auto        *kernelParamsPtr = pmeGpu->kernelParams.get();
    if (c_usePadding)
    {
        const size_t gridlineIndicesSizePerAtom = DIM * sizeof(int);
        const size_t splineDataSizePerAtom      = pmeGpu->common->pme_order * DIM * sizeof(float);
        // TODO: could clear only the padding and not the whole thing, but this is a test-exclusive code anyway
        CU_RET_ERR(cudaMemsetAsync(kernelParamsPtr->atoms.d_gridlineIndices, 0, pmeGpu->nAtomsAlloc * gridlineIndicesSizePerAtom, pmeGpu->archSpecific->pmeStream),
                   "PME failed to clear the gridline indices");
        CU_RET_ERR(cudaMemsetAsync(kernelParamsPtr->atoms.d_dtheta, 0, pmeGpu->nAtomsAlloc * splineDataSizePerAtom, pmeGpu->archSpecific->pmeStream),
                   "PME failed to clear the spline derivatives");
        CU_RET_ERR(cudaMemsetAsync(kernelParamsPtr->atoms.d_theta, 0, pmeGpu->nAtomsAlloc * splineDataSizePerAtom, pmeGpu->archSpecific->pmeStream),
                   "PME failed to clear the spline values");
    }
    cu_copy_H2D_async(kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, splinesSize, pmeGpu->archSpecific->pmeStream);
    cu_copy_H2D_async(kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, splinesSize, pmeGpu->archSpecific->pmeStream);
    cu_copy_H2D_async(kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices,
                      kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->archSpecific->pmeStream);
}

void pme_gpu_sync_spread_grid(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncSpreadGridD2H);
    CU_RET_ERR(stat, "Error while waiting for the PME GPU spread grid to be copied to the host");
}

void pme_gpu_sync_spline_atom_data(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncSplineAtomDataD2H);
    CU_RET_ERR(stat, "Error while waiting for the PME GPU atom data to be copied to the host");
}

void pme_gpu_sync_solve_grid(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncSolveGridD2H);
    CU_RET_ERR(stat, "Error while waiting for the PME GPU solve grid to be copied to the host");
    // TODO: should this check for pme_gpu_performs_solve(pmeGPU)?
}

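/* Allocates the GPU-specific structures and creates the (prioritized, where
 * supported) PME CUDA stream; also selects out-of-place FFT and decides
 * whether GPU timing events are enabled.
 */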
void pme_gpu_init_internal(pme_gpu_t *pmeGPU)
{
    /* Allocate the target-specific structures */
    pmeGPU->archSpecific.reset(new pme_gpu_specific_t());
    pmeGPU->kernelParams.reset(new pme_gpu_kernel_params_t());

    pmeGPU->archSpecific->performOutOfPlaceFFT = true;
    /* This should give better performance, according to the cuFFT documentation.
     * The performance seems to be the same though.
     * TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7).
     */

    pmeGPU->archSpecific->useTiming = (getenv("GMX_DISABLE_CUDA_TIMING") == nullptr) &&
                                      (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
    /* TODO: multiple CUDA streams on same GPU cause nonsense cudaEvent_t timings.
     * This should probably also check for gpuId exclusivity?
     */

    /* Creating a PME CUDA stream */
    cudaError_t stat;
    int         highest_priority, lowest_priority;
    stat = cudaDeviceGetStreamPriorityRange(&lowest_priority, &highest_priority);
    CU_RET_ERR(stat, "PME cudaDeviceGetStreamPriorityRange failed");
    stat = cudaStreamCreateWithPriority(&pmeGPU->archSpecific->pmeStream,
                                        cudaStreamDefault, //cudaStreamNonBlocking,
                                        highest_priority);
    CU_RET_ERR(stat, "cudaStreamCreateWithPriority on the PME stream failed");
}

void pme_gpu_destroy_specific(const pme_gpu_t *pmeGPU)
{
    /* Destroy the CUDA stream */
    cudaError_t stat = cudaStreamDestroy(pmeGPU->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME cudaStreamDestroy error");
}

void pme_gpu_init_sync_events(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat;
    const auto  eventFlags = cudaEventDisableTiming;
    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncEnerVirD2H, eventFlags);
    CU_RET_ERR(stat, "cudaEventCreate on syncEnerVirD2H failed");
    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncForcesD2H, eventFlags);
    CU_RET_ERR(stat, "cudaEventCreate on syncForcesD2H failed");
    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSpreadGridD2H, eventFlags);
    CU_RET_ERR(stat, "cudaEventCreate on syncSpreadGridD2H failed");
    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSplineAtomDataD2H, eventFlags);
    CU_RET_ERR(stat, "cudaEventCreate on syncSplineAtomDataD2H failed");
    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSolveGridD2H, eventFlags);
    CU_RET_ERR(stat, "cudaEventCreate on syncSolveGridD2H failed");
}

void pme_gpu_destroy_sync_events(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat;
    stat = cudaEventDestroy(pmeGPU->archSpecific->syncEnerVirD2H);
    CU_RET_ERR(stat, "cudaEventDestroy failed on syncEnerVirD2H");
    stat = cudaEventDestroy(pmeGPU->archSpecific->syncForcesD2H);
    CU_RET_ERR(stat, "cudaEventDestroy failed on syncForcesD2H");
    stat = cudaEventDestroy(pmeGPU->archSpecific->syncSpreadGridD2H);
    CU_RET_ERR(stat, "cudaEventDestroy failed on syncSpreadGridD2H");
    stat = cudaEventDestroy(pmeGPU->archSpecific->syncSplineAtomDataD2H);
    CU_RET_ERR(stat, "cudaEventDestroy failed on syncSplineAtomDataD2H");
    stat = cudaEventDestroy(pmeGPU->archSpecific->syncSolveGridD2H);
    CU_RET_ERR(stat, "cudaEventDestroy failed on syncSolveGridD2H");
}

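/* (Re-)creates the GPU 3D-FFT setups, one per grid, but only when this rank
 * actually performs the FFT on the GPU.
 */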
void pme_gpu_reinit_3dfft(const pme_gpu_t *pmeGPU)
{
    if (pme_gpu_performs_FFT(pmeGPU))
    {
        pmeGPU->archSpecific->fftSetup.resize(0);
        for (int i = 0; i < pmeGPU->common->ngrids; i++)
        {
            pmeGPU->archSpecific->fftSetup.push_back(std::unique_ptr<GpuParallel3dFft>(new GpuParallel3dFft(pmeGPU)));
        }
    }
}

void pme_gpu_destroy_3dfft(const pme_gpu_t *pmeGPU)
{
    pmeGPU->archSpecific->fftSetup.resize(0);
}