src/gromacs/ewald/pme-gpu-internal.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *
  38  * \brief This file contains internal function definitions for performing the PME calculations on GPU.
  39  * These are not meant to be exposed outside of the PME GPU code.
  40  * As of now, their bodies are still in the common pme-gpu.cpp files.
  41  *
  42  * \author Aleksei Iupinov <a.yupinov@gmail.com>
  43  * \ingroup module_ewald
  44  */
  45
  46 #ifndef GMX_EWALD_PME_GPU_INTERNAL_H
  47 #define GMX_EWALD_PME_GPU_INTERNAL_H
  48
  49 #include "gromacs/gpu_utils/gpu_macros.h"      // for the CUDA_FUNC_ macros
  50
  51 #include "pme-gpu-types.h"                     // for the inline functions accessing pme_gpu_t members
  52
  53 struct gmx_hw_info_t;
  54 struct gmx_gpu_opt_t;
  55 struct gmx_pme_t;                              // only used in pme_gpu_reinit
  56 struct t_commrec;
  57 struct gmx_wallclock_gpu_pme_t;
  58 struct pme_atomcomm_t;
  59
  60 namespace gmx
  61 {
  62 class MDLogger;
  63 }
  64
  65 //! Type of spline data
  66 enum class PmeSplineDataType
  67 {
  68     Values,      // theta
  69     Derivatives, // dtheta
  70 };               //TODO move this into new and shiny pme.h (pme-types.h?)
  71
  72 /* Some general constants for PME GPU behaviour follow. */
  73
  74 /*! \brief \libinternal
  75  * false: The atom data GPU buffers are sized precisely according to the number of atoms.
  76  *        (Except GPU spline data layout which is regardless intertwined for 2 atoms per warp).
  77  *        The atom index checks in the spread/gather code potentially hinder the performance.
  78  * true:  The atom data GPU buffers are padded with zeroes so that the possible number of atoms
  79  *        fitting in is divisible by PME_ATOM_DATA_ALIGNMENT.
  80  *        The atom index checks are not performed. There should be a performance win, but how big is it, remains to be seen.
  81  *        Additional cudaMemsetAsync calls are done occasionally (only charges/coordinates; spline data is always recalculated now).
  82  * \todo Estimate performance differences
  83  */
  84 const bool c_usePadding = true;
  85
  86 /*! \brief \libinternal
  87  * false: Atoms with zero charges are processed by PME. Could introduce some overhead.
  88  * true:  Atoms with zero charges are not processed by PME. Adds branching to the spread/gather.
  89  *        Could be good for performance in specific systems with lots of neutral atoms.
  90  * \todo Estimate performance differences.
  91  */
  92 const bool c_skipNeutralAtoms = false;
  93
  94 /*! \brief \libinternal
  95  * Number of PME solve output floating point numbers.
  96  * 6 for symmetric virial matrix + 1 for reciprocal energy.
  97  */
  98 const int c_virialAndEnergyCount = 7;
  99
 100 /* A block of CUDA-only functions that live in pme.cu */
 101
 102 /*! \libinternal \brief
 103  * Returns the number of atoms per chunk in the atom charges/coordinates data layout.
 104  * Depends on CUDA-specific block sizes, needed for the atom data padding.
 105  *
 106  * \param[in] pmeGPU            The PME GPU structure.
 107  * \returns   Number of atoms in a single GPU atom data chunk.
 108  */
 109 CUDA_FUNC_QUALIFIER int pme_gpu_get_atom_data_alignment(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM_WITH_RETURN(1)
 110
 111 /*! \libinternal \brief
 112  * Returns the number of atoms per chunk in the atom spline theta/dtheta data layout.
 113  *
 114  * \param[in] pmeGPU            The PME GPU structure.
 115  * \returns   Number of atoms in a single GPU atom spline data chunk.
 116  */
 117 CUDA_FUNC_QUALIFIER int pme_gpu_get_atoms_per_warp(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM_WITH_RETURN(1)
 118
 119 /*! \libinternal \brief
 120  * Synchronizes the current step, waiting for the GPU kernels/transfers to finish.
 121  *
 122  * \param[in] pmeGPU            The PME GPU structure.
 123  */
 124 CUDA_FUNC_QUALIFIER void pme_gpu_synchronize(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 125
 126 /*! \libinternal \brief
 127  * Allocates the fixed size energy and virial buffer both on GPU and CPU.
 128  *
 129  * \param[in] pmeGPU            The PME GPU structure.
 130  */
 131 CUDA_FUNC_QUALIFIER void pme_gpu_alloc_energy_virial(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 132
 133 /*! \libinternal \brief
 134  * Frees the energy and virial memory both on GPU and CPU.
 135  *
 136  * \param[in] pmeGPU            The PME GPU structure.
 137  */
 138 CUDA_FUNC_QUALIFIER void pme_gpu_free_energy_virial(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 139
 140 /*! \libinternal \brief
 141  * Clears the energy and virial memory on GPU with 0.
 142  * Should be called at the end of the energy/virial calculation step.
 143  *
 144  * \param[in] pmeGPU            The PME GPU structure.
 145  */
 146 CUDA_FUNC_QUALIFIER void pme_gpu_clear_energy_virial(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 147
 148 /*! \libinternal \brief
 149  * Reallocates and copies the pre-computed B-spline values to the GPU.
 150  *
 151  * \param[in] pmeGPU             The PME GPU structure.
 152  */
 153 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_bspline_values(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 154
 155 /*! \libinternal \brief
 156  * Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers).
 157  *
 158  * \param[in] pmeGPU             The PME GPU structure.
 159  */
 160 CUDA_FUNC_QUALIFIER void pme_gpu_free_bspline_values(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 161
 162 /*! \libinternal \brief
 163  * Reallocates the GPU buffer for the PME forces.
 164  *
 165  * \param[in] pmeGPU             The PME GPU structure.
 166  */
 167 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 168
 169 /*! \libinternal \brief
 170  * Frees the GPU buffer for the PME forces.
 171  *
 172  * \param[in] pmeGPU             The PME GPU structure.
 173  */
 174 CUDA_FUNC_QUALIFIER void pme_gpu_free_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 175
 176 /*! \libinternal \brief
 177  * Copies the forces from the CPU buffer to the GPU (to reduce them with the PME GPU gathered forces).
 178  * To be called e.g. after the bonded calculations.
 179  *
 180  * \param[in] pmeGPU             The PME GPU structure.
 181  * \param[in] h_forces           The input forces rvec buffer.
 182  */
 183 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
 184                                                    const float     *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM
 185
 186 /*! \libinternal \brief
 187  * Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage.
 188  *
 189  * \param[in] pmeGPU             The PME GPU structure.
 190  * \param[out] h_forces          The output forces rvec buffer.
 191  */
 192 CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
 193                                                     float           *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM
 194
 195 /*! \libinternal \brief
 196  * Waits for the PME GPU output forces copying to the CPU buffer to finish.
 197  *
 198  * \param[in] pmeGPU            The PME GPU structure.
 199  */
 200 CUDA_FUNC_QUALIFIER void pme_gpu_sync_output_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 201
 202 /*! \libinternal \brief
 203  * Reallocates the input coordinates buffer on the GPU (and clears the padded part if needed).
 204  *
 205  * \param[in] pmeGPU            The PME GPU structure.
 206  *
 207  * Needs to be called on every DD step/in the beginning.
 208  */
 209 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_coordinates(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 210
 211 /*! \libinternal \brief
 212  * Copies the input coordinates from the CPU buffer onto the GPU.
 213  *
 214  * \param[in] pmeGPU            The PME GPU structure.
 215  * \param[in] h_coordinates     Input coordinates (XYZ rvec array).
 216  *
 217  * Needs to be called every MD step. The coordinates are then used in the spline calculation.
 218  */
 219 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_coordinates(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
 220                                                         const rvec      *CUDA_FUNC_ARGUMENT(h_coordinates)) CUDA_FUNC_TERM
 221
 222 /*! \libinternal \brief
 223  * Frees the coordinates on the GPU.
 224  *
 225  * \param[in] pmeGPU            The PME GPU structure.
 226  */
 227 CUDA_FUNC_QUALIFIER void pme_gpu_free_coordinates(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 228
 229 /*! \libinternal \brief
 230  * Reallocates the buffer on the GPU and copies the charges/coefficients from the CPU buffer.
 231  * Clears the padded part if needed.
 232  *
 233  * \param[in] pmeGPU            The PME GPU structure.
 234  * \param[in] h_coefficients    The input atom charges/coefficients.
 235  *
 236  * Does not need to be done every MD step, only whenever the local charges change.
 237  * (So, in the beginning of the run, or on DD step).
 238  */
 239 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_input_coefficients(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
 240                                                                      const float     *CUDA_FUNC_ARGUMENT(h_coefficients)) CUDA_FUNC_TERM
 241
 242 /*! \libinternal \brief
 243  * Frees the charges/coefficients on the GPU.
 244  *
 245  * \param[in] pmeGPU             The PME GPU structure.
 246  */
 247 CUDA_FUNC_QUALIFIER void pme_gpu_free_coefficients(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 248
 249 /*! \libinternal \brief
 250  * Reallocates the buffers on the GPU and the host for the atoms spline data.
 251  *
 252  * \param[in] pmeGPU            The PME GPU structure.
 253  */
 254 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_spline_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 255
 256 /*! \libinternal \brief
 257  * Frees the buffers on the GPU for the atoms spline data.
 258  *
 259  * \param[in] pmeGPU            The PME GPU structure.
 260  */
 261 CUDA_FUNC_QUALIFIER void pme_gpu_free_spline_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 262
 263 /*! \libinternal \brief
 264  * Reallocates the buffers on the GPU and the host for the particle gridline indices.
 265  *
 266  * \param[in] pmeGPU            The PME GPU structure.
 267  */
 268 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grid_indices(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 269
 270 /*! \libinternal \brief
 271  * Frees the buffer on the GPU for the particle gridline indices.
 272  *
 273  * \param[in] pmeGPU            The PME GPU structure.
 274  */
 275 CUDA_FUNC_QUALIFIER void pme_gpu_free_grid_indices(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 276
 277 /*! \libinternal \brief
 278  * Reallocates the real space grid and the complex reciprocal grid (if needed) on the GPU.
 279  *
 280  * \param[in] pmeGPU            The PME GPU structure.
 281  */
 282 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grids(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 283
 284 /*! \libinternal \brief
 285  * Frees the real space grid and the complex reciprocal grid (if needed) on the GPU.
 286  *
 287  * \param[in] pmeGPU            The PME GPU structure.
 288  */
 289 CUDA_FUNC_QUALIFIER void pme_gpu_free_grids(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 290
 291 /*! \libinternal \brief
 292  * Clears the real space grid on the GPU.
 293  * Should be called at the end of each MD step.
 294  *
 295  * \param[in] pmeGPU            The PME GPU structure.
 296  */
 297 CUDA_FUNC_QUALIFIER void pme_gpu_clear_grids(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 298
 299 /*! \libinternal \brief
 300  * Reallocates and copies the pre-computed fractional coordinates' shifts to the GPU.
 301  *
 302  * \param[in] pmeGPU            The PME GPU structure.
 303  */
 304 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_fract_shifts(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 305
 306 /*! \libinternal \brief
 307  * Frees the pre-computed fractional coordinates' shifts on the GPU.
 308  *
 309  * \param[in] pmeGPU            The PME GPU structure.
 310  */
 311 CUDA_FUNC_QUALIFIER void pme_gpu_free_fract_shifts(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 312
 313 /*! \libinternal \brief
 314  * Waits for the output virial/energy copying to the intermediate CPU buffer to finish.
 315  *
 316  * \param[in] pmeGPU  The PME GPU structure.
 317  */
 318 CUDA_FUNC_QUALIFIER void pme_gpu_sync_output_energy_virial(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 319
 320 /*! \libinternal \brief
 321  * Copies the input real-space grid from the host to the GPU.
 322  *
 323  * \param[in] pmeGPU   The PME GPU structure.
 324  * \param[in] h_grid   The host-side grid buffer.
 325  */
 326 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_gather_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
 327                                                         float           *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM
 328
 329 /*! \libinternal \brief
 330  * Copies the output real-space grid from the GPU to the host.
 331  *
 332  * \param[in] pmeGPU   The PME GPU structure.
 333  * \param[out] h_grid  The host-side grid buffer.
 334  */
 335 CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
 336                                                          float           *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM
 337
 338 /*! \libinternal \brief
 339  * Copies the spread output spline data and gridline indices from the GPU to the host.
 340  *
 341  * \param[in] pmeGPU   The PME GPU structure.
 342  */
 343 CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_atom_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 344
 345 /*! \libinternal \brief
 346  * Copies the gather input spline data and gridline indices from the host to the GPU.
 347  *
 348  * \param[in] pmeGPU   The PME GPU structure.
 349  */
 350 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_gather_atom_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 351
 352 /*! \libinternal \brief
 353  * Waits for the grid copying to the host-side buffer after spreading to finish.
 354  *
 355  * \param[in] pmeGPU  The PME GPU structure.
 356  */
 357 CUDA_FUNC_QUALIFIER void pme_gpu_sync_spread_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 358
 359 /*! \libinternal \brief
 360  * Waits for the atom data copying to the intermediate host-side buffer after spline computation to finish.
 361  *
 362  * \param[in] pmeGPU  The PME GPU structure.
 363  */
 364 CUDA_FUNC_QUALIFIER void pme_gpu_sync_spline_atom_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 365
 366 /*! \libinternal \brief
 367  * Waits for the grid copying to the host-side buffer after solving to finish.
 368  *
 369  * \param[in] pmeGPU  The PME GPU structure.
 370  */
 371 CUDA_FUNC_QUALIFIER void pme_gpu_sync_solve_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 372
 373 /*! \libinternal \brief
 374  * Does the one-time GPU-framework specific PME initialization.
 375  * For CUDA, the PME stream is created with the highest priority.
 376  *
 377  * \param[in] pmeGPU  The PME GPU structure.
 378  */
 379 CUDA_FUNC_QUALIFIER void pme_gpu_init_internal(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 380
 381 /*! \libinternal \brief
 382  * Destroys the PME GPU-framework specific data.
 383  * Should be called last in the PME GPU destructor.
 384  *
 385  * \param[in] pmeGPU  The PME GPU structure.
 386  */
 387 CUDA_FUNC_QUALIFIER void pme_gpu_destroy_specific(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 388
 389 /*! \libinternal \brief
 390  * Initializes the PME GPU synchronization events.
 391  *
 392  * \param[in] pmeGPU  The PME GPU structure.
 393  */
 394 CUDA_FUNC_QUALIFIER void pme_gpu_init_sync_events(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 395
 396 /*! \libinternal \brief
 397  * Destroys the PME GPU synchronization events.
 398  *
 399  * \param[in] pmeGPU  The PME GPU structure.
 400  */
 401 CUDA_FUNC_QUALIFIER void pme_gpu_destroy_sync_events(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 402
 403 /*! \libinternal \brief
 404  * Initializes the CUDA FFT structures.
 405  *
 406  * \param[in] pmeGPU  The PME GPU structure.
 407  */
 408 CUDA_FUNC_QUALIFIER void pme_gpu_reinit_3dfft(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 409
 410 /*! \libinternal \brief
 411  * Destroys the CUDA FFT structures.
 412  *
 413  * \param[in] pmeGPU  The PME GPU structure.
 414  */
 415 CUDA_FUNC_QUALIFIER void pme_gpu_destroy_3dfft(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 416
 417 /* Several CUDA event-based timing functions that live in pme-timings.cu */
 418
 419 /*! \libinternal \brief
 420  * Finalizes all the PME GPU stage timings for the current step. Should be called at the end of every step.
 421  *
 422  * \param[in] pmeGPU         The PME GPU structure.
 423  */
 424 CUDA_FUNC_QUALIFIER void pme_gpu_update_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 425
 426 /*! \brief
 427  * Resets the PME GPU timings. To be called at the reset step.
 428  *
 429  * \param[in] pmeGPU         The PME GPU structure.
 430  */
 431 CUDA_FUNC_QUALIFIER void pme_gpu_reset_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 432
 433 /*! \libinternal \brief
 434  * Copies the PME GPU timings to the gmx_wallclock_gpu_t structure (for log output). To be called at the run end.
 435  *
 436  * \param[in] pmeGPU         The PME GPU structure.
 437  * \param[in] timings        The gmx_wallclock_gpu_pme_t structure.
 438  */
 439 CUDA_FUNC_QUALIFIER void pme_gpu_get_timings(const pme_gpu_t         *CUDA_FUNC_ARGUMENT(pmeGPU),
 440                                              gmx_wallclock_gpu_pme_t *CUDA_FUNC_ARGUMENT(timings)) CUDA_FUNC_TERM
 441
 442 /* The PME stages themselves */
 443
 444 /*! \libinternal \brief
 445  * A GPU spline computation and charge spreading function.
 446  *
 447  * \param[in]  pmeGpu          The PME GPU structure.
 448  * \param[in]  gridIndex       Index of the PME grid - unused, assumed to be 0.
 449  * \param[out] h_grid          The host-side grid buffer (used only if the result of the spread is expected on the host,
 450  *                             e.g. testing or host-side FFT)
 451  * \param[in]  computeSplines  Should the computation of spline parameters and gridline indices be performed.
 452  * \param[in]  spreadCharges   Should the charges/coefficients be spread on the grid.
 453  */
 454 CUDA_FUNC_QUALIFIER void pme_gpu_spread(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGpu),
 455                                         int              CUDA_FUNC_ARGUMENT(gridIndex),
 456                                         real            *CUDA_FUNC_ARGUMENT(h_grid),
 457                                         bool             CUDA_FUNC_ARGUMENT(computeSplines),
 458                                         bool             CUDA_FUNC_ARGUMENT(spreadCharges)) CUDA_FUNC_TERM
 459
 460 /*! \libinternal \brief
 461  * A GPU force gathering function.
 462  *
 463  * \param[in]     pmeGpu           The PME GPU structure.
 464  * \param[in,out] h_forces         The host buffer with input and output forces.
 465  * \param[in]     overwriteForces  True: h_forces are output-only.
 466  *                                 False: h_forces are copied to the device and are reduced with the results of the force gathering.
 467  *                                 TODO: determine efficiency/balance of host/device-side reductions.
 468  * \param[in]     h_grid           The host-side grid buffer (used only in testing moe)
 469  */
 470 CUDA_FUNC_QUALIFIER void pme_gpu_gather(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGpu),
 471                                         float           *CUDA_FUNC_ARGUMENT(h_forces),
 472                                         bool             CUDA_FUNC_ARGUMENT(overwriteForces),
 473                                         const float     *CUDA_FUNC_ARGUMENT(h_grid)
 474                                         ) CUDA_FUNC_TERM
 475
 476
 477 /* The inlined convenience PME GPU status getters */
 478
 479 /*! \libinternal \brief
 480  * Tells if PME runs on multiple GPUs with the decomposition.
 481  *
 482  * \param[in] pmeGPU         The PME GPU structure.
 483  * \returns                  True if PME runs on multiple GPUs, false otherwise.
 484  */
 485 gmx_inline bool pme_gpu_uses_dd(const pme_gpu_t *pmeGPU)
 486 {
 487     return !pmeGPU->settings.useDecomposition;
 488 }
 489
 490 /*! \libinternal \brief
 491  * Tells if PME performs the gathering stage on GPU.
 492  *
 493  * \param[in] pmeGPU         The PME GPU structure.
 494  * \returns                  True if the gathering is performed on GPU, false otherwise.
 495  */
 496 gmx_inline bool pme_gpu_performs_gather(const pme_gpu_t *pmeGPU)
 497 {
 498     return pmeGPU->settings.performGPUGather;
 499 }
 500
 501 /*! \libinternal \brief
 502  * Tells if PME performs the FFT stages on GPU.
 503  *
 504  * \param[in] pmeGPU         The PME GPU structure.
 505  * \returns                  True if FFT is performed on GPU, false otherwise.
 506  */
 507 gmx_inline bool pme_gpu_performs_FFT(const pme_gpu_t *pmeGPU)
 508 {
 509     return pmeGPU->settings.performGPUFFT;
 510 }
 511
 512 /*! \libinternal \brief
 513  * Tells if PME performs the grid (un-)wrapping on GPU.
 514  *
 515  * \param[in] pmeGPU         The PME GPU structure.
 516  * \returns                  True if (un-)wrapping is performed on GPU, false otherwise.
 517  */
 518 gmx_inline bool pme_gpu_performs_wrapping(const pme_gpu_t *pmeGPU)
 519 {
 520     return pmeGPU->settings.useDecomposition;
 521 }
 522
 523 /*! \libinternal \brief
 524  * Tells if PME performs the grid solving on GPU.
 525  *
 526  * \param[in] pmeGPU         The PME GPU structure.
 527  * \returns                  True if solving is performed on GPU, false otherwise.
 528  */
 529 gmx_inline bool pme_gpu_performs_solve(const pme_gpu_t *pmeGPU)
 530 {
 531     return pmeGPU->settings.performGPUSolve;
 532 }
 533
 534 /*! \libinternal \brief
 535  * Enables or disables the testing mode.
 536  * Testing mode only implies copying all the outputs, even the intermediate ones, to the host.
 537  *
 538  * \param[in] pmeGPU             The PME GPU structure.
 539  * \param[in] testing            Should the testing mode be enabled, or disabled.
 540  */
 541 gmx_inline void pme_gpu_set_testing(pme_gpu_t *pmeGPU, bool testing)
 542 {
 543     pmeGPU->settings.copyAllOutputs = testing;
 544 }
 545
 546 /*! \libinternal \brief
 547  * Tells if PME is in the testing mode.
 548  *
 549  * \param[in] pmeGPU             The PME GPU structure.
 550  * \returns                      true if testing mode is enabled, false otherwise.
 551  */
 552 gmx_inline bool pme_gpu_is_testing(const pme_gpu_t *pmeGPU)
 553 {
 554     return pmeGPU->settings.copyAllOutputs;
 555 }
 556
 557 /* A block of C++ functions that live in pme-gpu-internal.cpp */
 558
 559 /*! \libinternal \brief
 560  * Returns the output virial and energy of the PME solving.
 561  * Should be called after pme_gpu_finish_step.
 562  *
 563  * \param[in] pmeGPU             The PME GPU structure.
 564  * \param[out] energy            The output energy.
 565  * \param[out] virial            The output virial matrix.
 566  */
 567 void pme_gpu_get_energy_virial(const pme_gpu_t *pmeGPU, real *energy, matrix virial);
 568
 569 /*! \libinternal \brief
 570  * Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_start_step().
 571  *
 572  * \param[in] pmeGPU         The PME GPU structure.
 573  * \param[in] box            The unit cell box.
 574  */
 575 void pme_gpu_update_input_box(pme_gpu_t *pmeGPU, const matrix box);
 576
 577 /*! \libinternal \brief
 578  * Starts the PME GPU step (copies coordinates onto GPU, possibly sets the unit cell parameters).
 579  *
 580  * \param[in] pmeGPU           The PME GPU structure.
 581  * \param[in] needToUpdateBox  Tells if the stored unit cell parameters should be updated from \p box.
 582  * \param[in] box              The unit cell box which does not necessarily change every step (only with pressure coupling enabled).
 583  *                             Would only be used if \p needToUpdateBox is true.
 584  * \param[in] h_coordinates    Input coordinates (XYZ rvec array).
 585  */
 586 void pme_gpu_start_step(pme_gpu_t *pmeGPU, bool needToUpdateBox, const matrix box, const rvec *h_coordinates);
 587
 588 /*! \libinternal \brief
 589  * Finishes the PME GPU step, waiting for the output forces and/or energy/virial to be copied to the host.
 590  *
 591  * \param[in] pmeGPU         The PME GPU structure.
 592  * \param[in] bCalcForces    The left-over flag from the CPU code which tells the function to copy the forces to the CPU side. Should be passed to the launch call instead. FIXME
 593  * \param[in] bCalcEnerVir   The left-over flag from the CPU code which tells the function to copy the energy/virial to the CPU side. Should be passed to the launch call instead.
 594  */
 595 void pme_gpu_finish_step(const pme_gpu_t *pmeGPU,  const bool       bCalcForces,
 596                          const bool       bCalcEnerVir);
 597
 598 //! A binary enum for spline data layout transformation
 599 enum class PmeLayoutTransform
 600 {
 601     GpuToHost,
 602     HostToGpu
 603 };
 604
 605 /*! \libinternal \brief
 606  * Rearranges the atom spline data between the GPU and host layouts.
 607  * Only used for test purposes so far, likely to be horribly slow.
 608  *
 609  * \param[in]  pmeGPU     The PME GPU structure.
 610  * \param[out] atc        he PME CPU atom data structure (with a single-threaded layout).
 611  * \param[in]  type       The spline data type (values or derivatives).
 612  * \param[in]  dimIndex   Dimension index.
 613  * \param[in]  transform  Layout transform type
 614  */
 615 void pme_gpu_transform_spline_atom_data(const pme_gpu_t *pmeGPU, const pme_atomcomm_t *atc,
 616                                         PmeSplineDataType type, int dimIndex, PmeLayoutTransform transform);
 617
 618 /*! \libinternal \brief
 619  * Get the normal/padded grid dimensions of the real-space PME grid on GPU. Only used in tests.
 620  *
 621  * \param[in] pmeGPU             The PME GPU structure.
 622  * \param[out] gridSize          Pointer to the grid dimensions to fill in.
 623  * \param[out] paddedGridSize    Pointer to the padded grid dimensions to fill in.
 624  */
 625 void pme_gpu_get_real_grid_sizes(const pme_gpu_t *pmeGPU, gmx::IVec *gridSize, gmx::IVec *paddedGridSize);
 626
 627 /*! \libinternal \brief
 628  * (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
 629  *
 630  * \param[in,out] pme       The PME structure.
 631  * \param[in,out] gpuInfo   The GPU information structure.
 632  * \param[in]     mdlog     The logger.
 633  * \param[in]     cr        The communication structure.
 634  * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
 635  */
 636 void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, const t_commrec *cr);
 637
 638 /*! \libinternal \brief
 639  * Destroys the PME GPU data at the end of the run.
 640  *
 641  * \param[in] pmeGPU     The PME GPU structure.
 642  */
 643 void pme_gpu_destroy(pme_gpu_t *pmeGPU);
 644
 645 /*! \libinternal \brief
 646  * Reallocates the local atoms data (charges, coordinates, etc.). Copies the charges to the GPU.
 647  *
 648  * \param[in] pmeGPU    The PME GPU structure.
 649  * \param[in] nAtoms    The number of particles.
 650  * \param[in] charges   The pointer to the host-side array of particle charges.
 651  *
 652  * This is a function that should only be called in the beginning of the run and on domain decomposition.
 653  * Should be called before the pme_gpu_set_io_ranges.
 654  */
 655 void pme_gpu_reinit_atoms(pme_gpu_t        *pmeGPU,
 656                           const int         nAtoms,
 657                           const real       *charges);
 658
 659 #endif