PME force gathering - CUDA kernel + unit tests
[gromacs.git] / src / gromacs / ewald / pme-gpu-internal.h
blobfeaf6bc64f9eb4a169aef80c4c907a4382bf8661
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2016,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 /*! \internal \file
38 * \brief This file contains internal function definitions for performing the PME calculations on GPU.
39 * These are not meant to be exposed outside of the PME GPU code.
40 * As of now, their bodies are still in the common pme-gpu.cpp files.
42 * \author Aleksei Iupinov <a.yupinov@gmail.com>
43 * \ingroup module_ewald
46 #ifndef GMX_EWALD_PME_GPU_INTERNAL_H
47 #define GMX_EWALD_PME_GPU_INTERNAL_H
49 #include "gromacs/gpu_utils/gpu_macros.h" // for the CUDA_FUNC_ macros
51 #include "pme-gpu-types.h" // for the inline functions accessing pme_gpu_t members
53 struct gmx_hw_info_t;
54 struct gmx_gpu_opt_t;
55 struct gmx_pme_t; // only used in pme_gpu_reinit
56 struct t_commrec;
57 struct gmx_wallclock_gpu_pme_t;
58 struct pme_atomcomm_t;
60 namespace gmx
62 class MDLogger;
65 //! Type of spline data
66 enum class PmeSplineDataType
68 Values, // theta
69 Derivatives, // dtheta
70 }; //TODO move this into new and shiny pme.h (pme-types.h?)
72 /* Some general constants for PME GPU behaviour follow. */
74 /*! \brief \libinternal
75 * false: The atom data GPU buffers are sized precisely according to the number of atoms.
76 * (Except GPU spline data layout which is regardless intertwined for 2 atoms per warp).
77 * The atom index checks in the spread/gather code potentially hinder the performance.
78 * true: The atom data GPU buffers are padded with zeroes so that the possible number of atoms
79 * fitting in is divisible by PME_ATOM_DATA_ALIGNMENT.
80 * The atom index checks are not performed. There should be a performance win, but how big is it, remains to be seen.
81 * Additional cudaMemsetAsync calls are done occasionally (only charges/coordinates; spline data is always recalculated now).
82 * \todo Estimate performance differences
84 const bool c_usePadding = true;
86 /*! \brief \libinternal
87 * false: Atoms with zero charges are processed by PME. Could introduce some overhead.
88 * true: Atoms with zero charges are not processed by PME. Adds branching to the spread/gather.
89 * Could be good for performance in specific systems with lots of neutral atoms.
90 * \todo Estimate performance differences.
92 const bool c_skipNeutralAtoms = false;
94 /*! \brief \libinternal
95 * Number of PME solve output floating point numbers.
96 * 6 for symmetric virial matrix + 1 for reciprocal energy.
98 const int c_virialAndEnergyCount = 7;
100 /* A block of CUDA-only functions that live in pme.cu */
102 /*! \libinternal \brief
103 * Returns the number of atoms per chunk in the atom charges/coordinates data layout.
104 * Depends on CUDA-specific block sizes, needed for the atom data padding.
106 * \param[in] pmeGPU The PME GPU structure.
107 * \returns Number of atoms in a single GPU atom data chunk.
109 CUDA_FUNC_QUALIFIER int pme_gpu_get_atom_data_alignment(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM_WITH_RETURN(1)
111 /*! \libinternal \brief
112 * Returns the number of atoms per chunk in the atom spline theta/dtheta data layout.
114 * \param[in] pmeGPU The PME GPU structure.
115 * \returns Number of atoms in a single GPU atom spline data chunk.
117 CUDA_FUNC_QUALIFIER int pme_gpu_get_atoms_per_warp(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM_WITH_RETURN(1)
119 /*! \libinternal \brief
120 * Synchronizes the current step, waiting for the GPU kernels/transfers to finish.
122 * \param[in] pmeGPU The PME GPU structure.
124 CUDA_FUNC_QUALIFIER void pme_gpu_synchronize(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
126 /*! \libinternal \brief
127 * Allocates the fixed size energy and virial buffer both on GPU and CPU.
129 * \param[in] pmeGPU The PME GPU structure.
131 CUDA_FUNC_QUALIFIER void pme_gpu_alloc_energy_virial(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
133 /*! \libinternal \brief
134 * Frees the energy and virial memory both on GPU and CPU.
136 * \param[in] pmeGPU The PME GPU structure.
138 CUDA_FUNC_QUALIFIER void pme_gpu_free_energy_virial(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
140 /*! \libinternal \brief
141 * Clears the energy and virial memory on GPU with 0.
142 * Should be called at the end of the energy/virial calculation step.
144 * \param[in] pmeGPU The PME GPU structure.
146 CUDA_FUNC_QUALIFIER void pme_gpu_clear_energy_virial(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
148 /*! \libinternal \brief
149 * Reallocates and copies the pre-computed B-spline values to the GPU.
151 * \param[in] pmeGPU The PME GPU structure.
153 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_bspline_values(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
155 /*! \libinternal \brief
156 * Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers).
158 * \param[in] pmeGPU The PME GPU structure.
160 CUDA_FUNC_QUALIFIER void pme_gpu_free_bspline_values(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
162 /*! \libinternal \brief
163 * Reallocates the GPU buffer for the PME forces.
165 * \param[in] pmeGPU The PME GPU structure.
167 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
169 /*! \libinternal \brief
170 * Frees the GPU buffer for the PME forces.
172 * \param[in] pmeGPU The PME GPU structure.
174 CUDA_FUNC_QUALIFIER void pme_gpu_free_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
176 /*! \libinternal \brief
177 * Copies the forces from the CPU buffer to the GPU (to reduce them with the PME GPU gathered forces).
178 * To be called e.g. after the bonded calculations.
180 * \param[in] pmeGPU The PME GPU structure.
181 * \param[in] h_forces The input forces rvec buffer.
183 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
184 const float *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM
186 /*! \libinternal \brief
187 * Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage.
189 * \param[in] pmeGPU The PME GPU structure.
190 * \param[out] h_forces The output forces rvec buffer.
192 CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
193 float *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM
195 /*! \libinternal \brief
196 * Waits for the PME GPU output forces copying to the CPU buffer to finish.
198 * \param[in] pmeGPU The PME GPU structure.
200 CUDA_FUNC_QUALIFIER void pme_gpu_sync_output_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
202 /*! \libinternal \brief
203 * Reallocates the input coordinates buffer on the GPU (and clears the padded part if needed).
205 * \param[in] pmeGPU The PME GPU structure.
207 * Needs to be called on every DD step/in the beginning.
209 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_coordinates(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
211 /*! \libinternal \brief
212 * Copies the input coordinates from the CPU buffer onto the GPU.
214 * \param[in] pmeGPU The PME GPU structure.
215 * \param[in] h_coordinates Input coordinates (XYZ rvec array).
217 * Needs to be called every MD step. The coordinates are then used in the spline calculation.
219 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_coordinates(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
220 const rvec *CUDA_FUNC_ARGUMENT(h_coordinates)) CUDA_FUNC_TERM
222 /*! \libinternal \brief
223 * Frees the coordinates on the GPU.
225 * \param[in] pmeGPU The PME GPU structure.
227 CUDA_FUNC_QUALIFIER void pme_gpu_free_coordinates(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
229 /*! \libinternal \brief
230 * Reallocates the buffer on the GPU and copies the charges/coefficients from the CPU buffer.
231 * Clears the padded part if needed.
233 * \param[in] pmeGPU The PME GPU structure.
234 * \param[in] h_coefficients The input atom charges/coefficients.
236 * Does not need to be done every MD step, only whenever the local charges change.
237 * (So, in the beginning of the run, or on DD step).
239 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_input_coefficients(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
240 const float *CUDA_FUNC_ARGUMENT(h_coefficients)) CUDA_FUNC_TERM
242 /*! \libinternal \brief
243 * Frees the charges/coefficients on the GPU.
245 * \param[in] pmeGPU The PME GPU structure.
247 CUDA_FUNC_QUALIFIER void pme_gpu_free_coefficients(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
249 /*! \libinternal \brief
250 * Reallocates the buffers on the GPU and the host for the atoms spline data.
252 * \param[in] pmeGPU The PME GPU structure.
254 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_spline_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
256 /*! \libinternal \brief
257 * Frees the buffers on the GPU for the atoms spline data.
259 * \param[in] pmeGPU The PME GPU structure.
261 CUDA_FUNC_QUALIFIER void pme_gpu_free_spline_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
263 /*! \libinternal \brief
264 * Reallocates the buffers on the GPU and the host for the particle gridline indices.
266 * \param[in] pmeGPU The PME GPU structure.
268 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grid_indices(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
270 /*! \libinternal \brief
271 * Frees the buffer on the GPU for the particle gridline indices.
273 * \param[in] pmeGPU The PME GPU structure.
275 CUDA_FUNC_QUALIFIER void pme_gpu_free_grid_indices(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
277 /*! \libinternal \brief
278 * Reallocates the real space grid and the complex reciprocal grid (if needed) on the GPU.
280 * \param[in] pmeGPU The PME GPU structure.
282 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grids(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
284 /*! \libinternal \brief
285 * Frees the real space grid and the complex reciprocal grid (if needed) on the GPU.
287 * \param[in] pmeGPU The PME GPU structure.
289 CUDA_FUNC_QUALIFIER void pme_gpu_free_grids(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
291 /*! \libinternal \brief
292 * Clears the real space grid on the GPU.
293 * Should be called at the end of each MD step.
295 * \param[in] pmeGPU The PME GPU structure.
297 CUDA_FUNC_QUALIFIER void pme_gpu_clear_grids(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
299 /*! \libinternal \brief
300 * Reallocates and copies the pre-computed fractional coordinates' shifts to the GPU.
302 * \param[in] pmeGPU The PME GPU structure.
304 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_fract_shifts(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
306 /*! \libinternal \brief
307 * Frees the pre-computed fractional coordinates' shifts on the GPU.
309 * \param[in] pmeGPU The PME GPU structure.
311 CUDA_FUNC_QUALIFIER void pme_gpu_free_fract_shifts(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
313 /*! \libinternal \brief
314 * Waits for the output virial/energy copying to the intermediate CPU buffer to finish.
316 * \param[in] pmeGPU The PME GPU structure.
318 CUDA_FUNC_QUALIFIER void pme_gpu_sync_output_energy_virial(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
320 /*! \libinternal \brief
321 * Copies the input real-space grid from the host to the GPU.
323 * \param[in] pmeGPU The PME GPU structure.
324 * \param[in] h_grid The host-side grid buffer.
326 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_gather_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
327 float *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM
329 /*! \libinternal \brief
330 * Copies the output real-space grid from the GPU to the host.
332 * \param[in] pmeGPU The PME GPU structure.
333 * \param[out] h_grid The host-side grid buffer.
335 CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
336 float *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM
338 /*! \libinternal \brief
339 * Copies the spread output spline data and gridline indices from the GPU to the host.
341 * \param[in] pmeGPU The PME GPU structure.
343 CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_atom_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
345 /*! \libinternal \brief
346 * Copies the gather input spline data and gridline indices from the host to the GPU.
348 * \param[in] pmeGPU The PME GPU structure.
350 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_gather_atom_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
352 /*! \libinternal \brief
353 * Waits for the grid copying to the host-side buffer after spreading to finish.
355 * \param[in] pmeGPU The PME GPU structure.
357 CUDA_FUNC_QUALIFIER void pme_gpu_sync_spread_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
359 /*! \libinternal \brief
360 * Waits for the atom data copying to the intermediate host-side buffer after spline computation to finish.
362 * \param[in] pmeGPU The PME GPU structure.
364 CUDA_FUNC_QUALIFIER void pme_gpu_sync_spline_atom_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
366 /*! \libinternal \brief
367 * Waits for the grid copying to the host-side buffer after solving to finish.
369 * \param[in] pmeGPU The PME GPU structure.
371 CUDA_FUNC_QUALIFIER void pme_gpu_sync_solve_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
373 /*! \libinternal \brief
374 * Does the one-time GPU-framework specific PME initialization.
375 * For CUDA, the PME stream is created with the highest priority.
377 * \param[in] pmeGPU The PME GPU structure.
379 CUDA_FUNC_QUALIFIER void pme_gpu_init_internal(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
381 /*! \libinternal \brief
382 * Destroys the PME GPU-framework specific data.
383 * Should be called last in the PME GPU destructor.
385 * \param[in] pmeGPU The PME GPU structure.
387 CUDA_FUNC_QUALIFIER void pme_gpu_destroy_specific(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
389 /*! \libinternal \brief
390 * Initializes the PME GPU synchronization events.
392 * \param[in] pmeGPU The PME GPU structure.
394 CUDA_FUNC_QUALIFIER void pme_gpu_init_sync_events(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
396 /*! \libinternal \brief
397 * Destroys the PME GPU synchronization events.
399 * \param[in] pmeGPU The PME GPU structure.
401 CUDA_FUNC_QUALIFIER void pme_gpu_destroy_sync_events(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
403 /*! \libinternal \brief
404 * Initializes the CUDA FFT structures.
406 * \param[in] pmeGPU The PME GPU structure.
408 CUDA_FUNC_QUALIFIER void pme_gpu_reinit_3dfft(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
410 /*! \libinternal \brief
411 * Destroys the CUDA FFT structures.
413 * \param[in] pmeGPU The PME GPU structure.
415 CUDA_FUNC_QUALIFIER void pme_gpu_destroy_3dfft(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
417 /* Several CUDA event-based timing functions that live in pme-timings.cu */
419 /*! \libinternal \brief
420 * Finalizes all the PME GPU stage timings for the current step. Should be called at the end of every step.
422 * \param[in] pmeGPU The PME GPU structure.
424 CUDA_FUNC_QUALIFIER void pme_gpu_update_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
426 /*! \brief
427 * Resets the PME GPU timings. To be called at the reset step.
429 * \param[in] pmeGPU The PME GPU structure.
431 CUDA_FUNC_QUALIFIER void pme_gpu_reset_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
433 /*! \libinternal \brief
434 * Copies the PME GPU timings to the gmx_wallclock_gpu_t structure (for log output). To be called at the run end.
436 * \param[in] pmeGPU The PME GPU structure.
437 * \param[in] timings The gmx_wallclock_gpu_pme_t structure.
439 CUDA_FUNC_QUALIFIER void pme_gpu_get_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
440 gmx_wallclock_gpu_pme_t *CUDA_FUNC_ARGUMENT(timings)) CUDA_FUNC_TERM
442 /* The PME stages themselves */
444 /*! \libinternal \brief
445 * A GPU spline computation and charge spreading function.
447 * \param[in] pmeGpu The PME GPU structure.
448 * \param[in] gridIndex Index of the PME grid - unused, assumed to be 0.
449 * \param[out] h_grid The host-side grid buffer (used only if the result of the spread is expected on the host,
450 * e.g. testing or host-side FFT)
451 * \param[in] computeSplines Should the computation of spline parameters and gridline indices be performed.
452 * \param[in] spreadCharges Should the charges/coefficients be spread on the grid.
454 CUDA_FUNC_QUALIFIER void pme_gpu_spread(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGpu),
455 int CUDA_FUNC_ARGUMENT(gridIndex),
456 real *CUDA_FUNC_ARGUMENT(h_grid),
457 bool CUDA_FUNC_ARGUMENT(computeSplines),
458 bool CUDA_FUNC_ARGUMENT(spreadCharges)) CUDA_FUNC_TERM
460 /*! \libinternal \brief
461 * A GPU force gathering function.
463 * \param[in] pmeGpu The PME GPU structure.
464 * \param[in,out] h_forces The host buffer with input and output forces.
465 * \param[in] overwriteForces True: h_forces are output-only.
466 * False: h_forces are copied to the device and are reduced with the results of the force gathering.
467 * TODO: determine efficiency/balance of host/device-side reductions.
468 * \param[in] h_grid The host-side grid buffer (used only in testing moe)
470 CUDA_FUNC_QUALIFIER void pme_gpu_gather(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGpu),
471 float *CUDA_FUNC_ARGUMENT(h_forces),
472 bool CUDA_FUNC_ARGUMENT(overwriteForces),
473 const float *CUDA_FUNC_ARGUMENT(h_grid)
474 ) CUDA_FUNC_TERM
477 /* The inlined convenience PME GPU status getters */
479 /*! \libinternal \brief
480 * Tells if PME runs on multiple GPUs with the decomposition.
482 * \param[in] pmeGPU The PME GPU structure.
483 * \returns True if PME runs on multiple GPUs, false otherwise.
485 gmx_inline bool pme_gpu_uses_dd(const pme_gpu_t *pmeGPU)
487 return !pmeGPU->settings.useDecomposition;
490 /*! \libinternal \brief
491 * Tells if PME performs the gathering stage on GPU.
493 * \param[in] pmeGPU The PME GPU structure.
494 * \returns True if the gathering is performed on GPU, false otherwise.
496 gmx_inline bool pme_gpu_performs_gather(const pme_gpu_t *pmeGPU)
498 return pmeGPU->settings.performGPUGather;
501 /*! \libinternal \brief
502 * Tells if PME performs the FFT stages on GPU.
504 * \param[in] pmeGPU The PME GPU structure.
505 * \returns True if FFT is performed on GPU, false otherwise.
507 gmx_inline bool pme_gpu_performs_FFT(const pme_gpu_t *pmeGPU)
509 return pmeGPU->settings.performGPUFFT;
512 /*! \libinternal \brief
513 * Tells if PME performs the grid (un-)wrapping on GPU.
515 * \param[in] pmeGPU The PME GPU structure.
516 * \returns True if (un-)wrapping is performed on GPU, false otherwise.
518 gmx_inline bool pme_gpu_performs_wrapping(const pme_gpu_t *pmeGPU)
520 return pmeGPU->settings.useDecomposition;
523 /*! \libinternal \brief
524 * Tells if PME performs the grid solving on GPU.
526 * \param[in] pmeGPU The PME GPU structure.
527 * \returns True if solving is performed on GPU, false otherwise.
529 gmx_inline bool pme_gpu_performs_solve(const pme_gpu_t *pmeGPU)
531 return pmeGPU->settings.performGPUSolve;
534 /*! \libinternal \brief
535 * Enables or disables the testing mode.
536 * Testing mode only implies copying all the outputs, even the intermediate ones, to the host.
538 * \param[in] pmeGPU The PME GPU structure.
539 * \param[in] testing Should the testing mode be enabled, or disabled.
541 gmx_inline void pme_gpu_set_testing(pme_gpu_t *pmeGPU, bool testing)
543 pmeGPU->settings.copyAllOutputs = testing;
546 /*! \libinternal \brief
547 * Tells if PME is in the testing mode.
549 * \param[in] pmeGPU The PME GPU structure.
550 * \returns true if testing mode is enabled, false otherwise.
552 gmx_inline bool pme_gpu_is_testing(const pme_gpu_t *pmeGPU)
554 return pmeGPU->settings.copyAllOutputs;
557 /* A block of C++ functions that live in pme-gpu-internal.cpp */
559 /*! \libinternal \brief
560 * Returns the output virial and energy of the PME solving.
561 * Should be called after pme_gpu_finish_step.
563 * \param[in] pmeGPU The PME GPU structure.
564 * \param[out] energy The output energy.
565 * \param[out] virial The output virial matrix.
567 void pme_gpu_get_energy_virial(const pme_gpu_t *pmeGPU, real *energy, matrix virial);
569 /*! \libinternal \brief
570 * Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_start_step().
572 * \param[in] pmeGPU The PME GPU structure.
573 * \param[in] box The unit cell box.
575 void pme_gpu_update_input_box(pme_gpu_t *pmeGPU, const matrix box);
577 /*! \libinternal \brief
578 * Starts the PME GPU step (copies coordinates onto GPU, possibly sets the unit cell parameters).
580 * \param[in] pmeGPU The PME GPU structure.
581 * \param[in] needToUpdateBox Tells if the stored unit cell parameters should be updated from \p box.
582 * \param[in] box The unit cell box which does not necessarily change every step (only with pressure coupling enabled).
583 * Would only be used if \p needToUpdateBox is true.
584 * \param[in] h_coordinates Input coordinates (XYZ rvec array).
586 void pme_gpu_start_step(pme_gpu_t *pmeGPU, bool needToUpdateBox, const matrix box, const rvec *h_coordinates);
588 /*! \libinternal \brief
589 * Finishes the PME GPU step, waiting for the output forces and/or energy/virial to be copied to the host.
591 * \param[in] pmeGPU The PME GPU structure.
592 * \param[in] bCalcForces The left-over flag from the CPU code which tells the function to copy the forces to the CPU side. Should be passed to the launch call instead. FIXME
593 * \param[in] bCalcEnerVir The left-over flag from the CPU code which tells the function to copy the energy/virial to the CPU side. Should be passed to the launch call instead.
595 void pme_gpu_finish_step(const pme_gpu_t *pmeGPU, const bool bCalcForces,
596 const bool bCalcEnerVir);
598 //! A binary enum for spline data layout transformation
599 enum class PmeLayoutTransform
601 GpuToHost,
602 HostToGpu
605 /*! \libinternal \brief
606 * Rearranges the atom spline data between the GPU and host layouts.
607 * Only used for test purposes so far, likely to be horribly slow.
609 * \param[in] pmeGPU The PME GPU structure.
610 * \param[out] atc he PME CPU atom data structure (with a single-threaded layout).
611 * \param[in] type The spline data type (values or derivatives).
612 * \param[in] dimIndex Dimension index.
613 * \param[in] transform Layout transform type
615 void pme_gpu_transform_spline_atom_data(const pme_gpu_t *pmeGPU, const pme_atomcomm_t *atc,
616 PmeSplineDataType type, int dimIndex, PmeLayoutTransform transform);
618 /*! \libinternal \brief
619 * Get the normal/padded grid dimensions of the real-space PME grid on GPU. Only used in tests.
621 * \param[in] pmeGPU The PME GPU structure.
622 * \param[out] gridSize Pointer to the grid dimensions to fill in.
623 * \param[out] paddedGridSize Pointer to the padded grid dimensions to fill in.
625 void pme_gpu_get_real_grid_sizes(const pme_gpu_t *pmeGPU, gmx::IVec *gridSize, gmx::IVec *paddedGridSize);
627 /*! \libinternal \brief
628 * (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
630 * \param[in,out] pme The PME structure.
631 * \param[in,out] gpuInfo The GPU information structure.
632 * \param[in] mdlog The logger.
633 * \param[in] cr The communication structure.
634 * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
636 void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, const t_commrec *cr);
638 /*! \libinternal \brief
639 * Destroys the PME GPU data at the end of the run.
641 * \param[in] pmeGPU The PME GPU structure.
643 void pme_gpu_destroy(pme_gpu_t *pmeGPU);
645 /*! \libinternal \brief
646 * Reallocates the local atoms data (charges, coordinates, etc.). Copies the charges to the GPU.
648 * \param[in] pmeGPU The PME GPU structure.
649 * \param[in] nAtoms The number of particles.
650 * \param[in] charges The pointer to the host-side array of particle charges.
652 * This is a function that should only be called in the beginning of the run and on domain decomposition.
653 * Should be called before the pme_gpu_set_io_ranges.
655 void pme_gpu_reinit_atoms(pme_gpu_t *pmeGPU,
656 const int nAtoms,
657 const real *charges);
659 #endif