From a453e5fa4c268d714ef5d31d5bb2c19423f3a102 Mon Sep 17 00:00:00 2001
From: Mark Abraham <mark.j.abraham@gmail.com>
Date: Wed, 15 Jan 2020 18:30:23 +0100
Subject: [PATCH] Regularize nbnxm module doxygen

This module will be easier to work with if all the files are
documented and all the comments precede the things that they comment.

Had to add several \internal flags to various files to ensure
things that should be internal are that way.

Change-Id: Ia08a936c23276cbea97c2e8ff47db403d3fd9185
---
 src/gromacs/nbnxm/atomdata.h                       |  51 ++--
 src/gromacs/nbnxm/boundingboxes.h                  |   7 +-
 src/gromacs/nbnxm/constants.h                      |  21 +-
 src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h          | 141 +++++++----
 src/gromacs/nbnxm/gpu_types_common.h               |  85 ++++---
 src/gromacs/nbnxm/grid.h                           |  43 ++--
 .../nbnxm/kernels_reference/kernel_gpu_ref.h       |  14 +-
 src/gromacs/nbnxm/kernels_reference/kernel_ref.h   |  22 +-
 src/gromacs/nbnxm/nbnxm_geometry.h                 |  23 +-
 src/gromacs/nbnxm/nbnxm_simd.h                     |  16 +-
 src/gromacs/nbnxm/opencl/nbnxm_ocl_consts.h        |  10 +-
 .../nbnxm/opencl/nbnxm_ocl_kernel_utils.clh        |  95 ++++---
 src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h         | 274 +++++++++++++--------
 src/gromacs/nbnxm/pairlist.h                       | 147 +++++++----
 src/gromacs/nbnxm/pairlist_simd_2xmm.h             |  15 +-
 src/gromacs/nbnxm/pairlist_simd_4xm.h              |  15 +-
 src/gromacs/nbnxm/pairlistparams.h                 |  29 ++-
 src/gromacs/nbnxm/pairlistsets.h                   |   2 +
 src/gromacs/nbnxm/pairlistwork.h                   |  48 ++--
 src/gromacs/nbnxm/pairsearch.h                     |  57 +++--
 20 files changed, 734 insertions(+), 381 deletions(-)
diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h
index 68c25e5933..d41408d953 100644
--- a/src/gromacs/nbnxm/atomdata.h
+++ b/src/gromacs/nbnxm/atomdata.h
@@ -107,10 +107,11 @@ static inline int atom_to_x_index(int a)
     return DIM * (a & ~(packSize - 1)) + (a & (packSize - 1));
 }
 
-// Struct that holds force and energy output buffers
+/*! \internal
+ * \brief Struct that holds force and energy output buffers */
 struct nbnxn_atomdata_output_t
 {
-    /* Constructor
+    /*! \brief Constructor
      *
      * \param[in] kernelType              Type of non-bonded kernel
      * \param[in] numEnergyGroups         The number of energy groups
@@ -119,15 +120,21 @@ struct nbnxn_atomdata_output_t
      */
     nbnxn_atomdata_output_t(Nbnxm::KernelType  kernelType,
                             int                numEnergyGroups,
-                            int                simdEnergyBUfferStride,
+                            int                simdEnergyBufferStride,
                             gmx::PinningPolicy pinningPolicy);
 
-    gmx::HostVector<real> f;      // f, size natoms*fstride
-    gmx::HostVector<real> fshift; // Shift force array, size SHIFTS*DIM
-    gmx::HostVector<real> Vvdw;   // Temporary Van der Waals group energy storage
-    gmx::HostVector<real> Vc;     // Temporary Coulomb group energy storage
-    AlignedVector<real>   VSvdw;  // Temporary SIMD Van der Waals group energy storage
-    AlignedVector<real>   VSc;    // Temporary SIMD Coulomb group energy storage
+    //! f, size natoms*fstride
+    gmx::HostVector<real> f;
+    //! Shift force array, size SHIFTS*DIM
+    gmx::HostVector<real> fshift;
+    //! Temporary Van der Waals group energy storage
+    gmx::HostVector<real> Vvdw;
+    //! Temporary Coulomb group energy storage
+    gmx::HostVector<real> Vc;
+    //! Temporary SIMD Van der Waals group energy storage
+    AlignedVector<real> VSvdw;
+    //! Temporary SIMD Coulomb group energy storage
+    AlignedVector<real> VSc;
 };
 
 /*! \brief Block size in atoms for the non-bonded thread force-buffer reduction.
@@ -177,35 +184,37 @@ enum
  */
 struct nbnxn_atomdata_t
 { //NOLINT(clang-analyzer-optin.performance.Padding)
+    /*! \internal
+     * \brief The actual atom data parameter values */
     struct Params
     {
-        /* Constructor
+        /*! \brief Constructor
          *
          * \param[in] pinningPolicy  Sets the pinning policy for all data that might be transfered to a GPU
          */
         Params(gmx::PinningPolicy pinningPolicy);
 
-        // The number of different atom types
+        //! The number of different atom types
         int numTypes;
-        // Lennard-Jone 6*C6 and 12*C12 parameters, size numTypes*2*2
+        //! Lennard-Jones 6*C6 and 12*C12 parameters, size numTypes*2*2
         gmx::HostVector<real> nbfp;
-        // Combination rule, see enum defined above
+        //! Combination rule, see enum defined above
         int comb_rule;
-        // LJ parameters per atom type, size numTypes*2
+        //! LJ parameters per atom type, size numTypes*2
         gmx::HostVector<real> nbfp_comb;
-        // As nbfp, but with a stride for the present SIMD architecture
+        //! As nbfp, but with a stride for the present SIMD architecture
         AlignedVector<real> nbfp_aligned;
-        // Atom types per atom
+        //! Atom types per atom
         gmx::HostVector<int> type;
-        // LJ parameters per atom for fast SIMD loading
+        //! LJ parameters per atom for fast SIMD loading
         gmx::HostVector<real> lj_comb;
-        // Charges per atom, not set with format nbatXYZQ
+        //! Charges per atom, not set with format nbatXYZQ
         gmx::HostVector<real> q;
-        // The number of energy groups
+        //! The number of energy groups
         int nenergrp;
-        // 2log(nenergrp)
+        //! 2log(nenergrp)
         int neg_2log;
-        // The energy groups, one int entry per cluster, only set when needed
+        //! The energy groups, one int entry per cluster, only set when needed
         gmx::HostVector<int> energrp;
     };
 
diff --git a/src/gromacs/nbnxm/boundingboxes.h b/src/gromacs/nbnxm/boundingboxes.h
index 945fbc5961..986dde307c 100644
--- a/src/gromacs/nbnxm/boundingboxes.h
+++ b/src/gromacs/nbnxm/boundingboxes.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -33,7 +33,8 @@
  * the research papers on the package. Check out http://www.gromacs.org.
  */
 
-/*! \internal \file
+/*! \file
+ * \internal
  *
  * \brief Declares constants and helper functions used when handling
  * bounding boxes for clusters of particles.
@@ -57,7 +58,7 @@ static constexpr int c_numBoundingBoxBounds1D = 2;
 
 #ifndef DOXYGEN
 
-/* Bounding box calculations are (currently) always in single precision, so
+/*! \brief Bounding box calculations are (currently) always in single precision, so
  * we only need to check for single precision support here.
  * This uses less (cache-)memory and SIMD is faster, at least on x86.
  */
diff --git a/src/gromacs/nbnxm/constants.h b/src/gromacs/nbnxm/constants.h
index ecb628abb7..41a3e15f70 100644
--- a/src/gromacs/nbnxm/constants.h
+++ b/src/gromacs/nbnxm/constants.h
@@ -34,12 +34,23 @@
  * the research papers on the package. Check out http://www.gromacs.org.
  */
 
+/*! \internal \file
+ *
+ * \brief
+ * Declares constants for the module
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \ingroup module_nbnxm
+ */
+
 #ifndef GMX_NBNXN_CONSTANTS_H
 #define GMX_NBNXN_CONSTANTS_H
 
-// Lower limit for square interaction distances in nonbonded kernels.
-// For smaller values we will overflow when calculating r^-1 or r^-12, but
-// to keep it simple we always apply the limit from the tougher r^-12 condition.
+/*! \brief Lower limit for square interaction distances in nonbonded kernels.
+ *
+ * For smaller values we will overflow when calculating r^-1 or r^-12, but
+ * to keep it simple we always apply the limit from the tougher r^-12 condition.
+ */
 #if GMX_DOUBLE
 // Some double precision SIMD architectures use single precision in the first
 // step, so although the double precision criterion would allow smaller rsq,
@@ -52,10 +63,10 @@
 #endif
 
 
-/* The number of clusters in a super-cluster, used for GPU */
+//! The number of clusters in a super-cluster, used for GPU
 #define c_nbnxnGpuNumClusterPerSupercluster 8
 
-/* With GPU kernels we group cluster pairs in 4 to optimize memory usage
+/*! \brief With GPU kernels we group cluster pairs in 4 to optimize memory usage
  * of integers containing 32 bits.
  */
 #define c_nbnxnGpuJgroupSize (32 / c_nbnxnGpuNumClusterPerSupercluster)
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
index 797a0b4bc4..911536167b 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
@@ -140,9 +140,12 @@ typedef struct nb_staging  nb_staging_t;
  */
 struct nb_staging
 {
-    float*  e_lj;   /**< LJ energy            */
-    float*  e_el;   /**< electrostatic energy */
-    float3* fshift; /**< shift forces         */
+    //! LJ energy
+    float* e_lj;
+    //! electrostatic energy
+    float* e_el;
+    //! shift forces
+    float3* fshift;
 };
 
 /** \internal
@@ -150,24 +153,37 @@ struct nb_staging
  */
 struct cu_atomdata
 {
-    int natoms;       /**< number of atoms                              */
-    int natoms_local; /**< number of local atoms                        */
-    int nalloc;       /**< allocation size for the atom data (xq, f)    */
-
-    float4* xq; /**< atom coordinates + charges, size natoms      */
-    float3* f;  /**< force output array, size natoms              */
-
-    float* e_lj; /**< LJ energy output, size 1                     */
-    float* e_el; /**< Electrostatics energy input, size 1          */
-
-    float3* fshift; /**< shift forces                                 */
-
-    int     ntypes;     /**< number of atom types                         */
-    int*    atom_types; /**< atom type indices, size natoms               */
-    float2* lj_comb;    /**< sqrt(c6),sqrt(c12) size natoms               */
-
-    float3* shift_vec;         /**< shifts                                       */
-    bool    bShiftVecUploaded; /**< true if the shift vector has been uploaded   */
+    //! number of atoms
+    int natoms;
+    //! number of local atoms
+    int natoms_local;
+    //! allocation size for the atom data (xq, f)
+    int nalloc;
+
+    //! atom coordinates + charges, size natoms
+    float4* xq;
+    //! force output array, size natoms
+    float3* f;
+
+    //! LJ energy output, size 1
+    float* e_lj;
+    //! Electrostatics energy input, size 1
+    float* e_el;
+
+    //! shift forces
+    float3* fshift;
+
+    //! number of atom types
+    int ntypes;
+    //! atom type indices, size natoms
+    int* atom_types;
+    //! sqrt(c6),sqrt(c12) size natoms
+    float2* lj_comb;
+
+    //! shifts
+    float3* shift_vec;
+    //! true if the shift vector has been uploaded
+    bool bShiftVecUploaded;
 };
 
 /** \internal
@@ -176,39 +192,64 @@ struct cu_atomdata
 struct cu_nbparam
 {
 
-    int eeltype; /**< type of electrostatics, takes values from #eelCu */
-    int vdwtype; /**< type of VdW impl., takes values from #evdwCu     */
-
-    float epsfac;      /**< charge multiplication factor                      */
-    float c_rf;        /**< Reaction-field/plain cutoff electrostatics const. */
-    float two_k_rf;    /**< Reaction-field electrostatics constant            */
-    float ewald_beta;  /**< Ewald/PME parameter                               */
-    float sh_ewald;    /**< Ewald/PME correction term substracted from the direct-space potential */
-    float sh_lj_ewald; /**< LJ-Ewald/PME correction term added to the correction potential        */
-    float ewaldcoeff_lj; /**< LJ-Ewald/PME coefficient                          */
-
-    float rcoulomb_sq; /**< Coulomb cut-off squared                           */
-
-    float rvdw_sq;           /**< VdW cut-off squared                               */
-    float rvdw_switch;       /**< VdW switched cut-off                              */
-    float rlistOuter_sq;     /**< Full, outer pair-list cut-off squared             */
-    float rlistInner_sq;     /**< Inner, dynamic pruned pair-list cut-off squared   */
-    bool  useDynamicPruning; /**< True if we use dynamic pair-list pruning          */
-
-    shift_consts_t  dispersion_shift; /**< VdW shift dispersion constants           */
-    shift_consts_t  repulsion_shift;  /**< VdW shift repulsion constants            */
-    switch_consts_t vdw_switch;       /**< VdW switch constants                     */
+    //! type of electrostatics, takes values from #eelCu
+    int eeltype;
+    //! type of VdW impl., takes values from #evdwCu
+    int vdwtype;
+
+    //! charge multiplication factor
+    float epsfac;
+    //! Reaction-field/plain cutoff electrostatics const.
+    float c_rf;
+    //! Reaction-field electrostatics constant
+    float two_k_rf;
+    //! Ewald/PME parameter
+    float ewald_beta;
+    //! Ewald/PME correction term subtracted from the direct-space potential
+    float sh_ewald;
+    //! LJ-Ewald/PME correction term added to the correction potential
+    float sh_lj_ewald;
+    //! LJ-Ewald/PME coefficient
+    float ewaldcoeff_lj;
+
+    //! Coulomb cut-off squared
+    float rcoulomb_sq;
+
+    //! VdW cut-off squared
+    float rvdw_sq;
+    //! VdW switched cut-off
+    float rvdw_switch;
+    //! Full, outer pair-list cut-off squared
+    float rlistOuter_sq;
+    //! Inner, dynamic pruned pair-list cut-off squared
+    float rlistInner_sq;
+    //! True if we use dynamic pair-list pruning
+    bool useDynamicPruning;
+
+    //! VdW shift dispersion constants
+    shift_consts_t dispersion_shift;
+    //! VdW shift repulsion constants
+    shift_consts_t repulsion_shift;
+    //! VdW switch constants
+    switch_consts_t vdw_switch;
 
     /* LJ non-bonded parameters - accessed through texture memory */
-    float*              nbfp; /**< nonbonded parameter table with C6/C12 pairs per atom type-pair, 2*ntype^2 elements */
-    cudaTextureObject_t nbfp_texobj; /**< texture object bound to nbfp */
-    float*              nbfp_comb; /**< nonbonded parameter table per atom type, 2*ntype elements */
-    cudaTextureObject_t nbfp_comb_texobj; /**< texture object bound to nbfp_texobj */
+    //! nonbonded parameter table with C6/C12 pairs per atom type-pair, 2*ntype^2 elements
+    float* nbfp;
+    //! texture object bound to nbfp
+    cudaTextureObject_t nbfp_texobj;
+    //! nonbonded parameter table per atom type, 2*ntype elements
+    float* nbfp_comb;
+    //! texture object bound to nbfp_comb
+    cudaTextureObject_t nbfp_comb_texobj;
 
     /* Ewald Coulomb force table data - accessed through texture memory */
-    float               coulomb_tab_scale;  /**< table scale/spacing                        */
-    float*              coulomb_tab;        /**< pointer to the table in the device memory  */
-    cudaTextureObject_t coulomb_tab_texobj; /**< texture object bound to coulomb_tab        */
+    //! table scale/spacing
+    float coulomb_tab_scale;
+    //! pointer to the table in the device memory
+    float* coulomb_tab;
+    //! texture object bound to coulomb_tab
+    cudaTextureObject_t coulomb_tab_texobj;
 };
 
 /** \internal
diff --git a/src/gromacs/nbnxm/gpu_types_common.h b/src/gromacs/nbnxm/gpu_types_common.h
index 0ec0c6e965..28dbe70459 100644
--- a/src/gromacs/nbnxm/gpu_types_common.h
+++ b/src/gromacs/nbnxm/gpu_types_common.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -76,8 +76,10 @@ struct gpu_timers_t
      */
     struct XFTransfers
     {
-        GpuRegionTimer nb_h2d; /**< timer for x/q H2D transfers (l/nl, every step) */
-        GpuRegionTimer nb_d2h; /**< timer for f D2H transfer (l/nl, every step) */
+        //! timer for x/q H2D transfers (l/nl, every step)
+        GpuRegionTimer nb_h2d;
+        //! timer for f D2H transfer (l/nl, every step)
+        GpuRegionTimer nb_d2h;
     };
 
     /*! \internal
@@ -85,14 +87,20 @@ struct gpu_timers_t
      */
     struct Interaction
     {
-        GpuRegionTimer pl_h2d;       /**< timer for pair-list H2D transfers (l/nl, every PS step) */
-        bool didPairlistH2D = false; /**< true when a pair-list transfer has been done at this step */
-        GpuRegionTimer nb_k;         /**< timer for non-bonded kernels (l/nl, every step)         */
-        GpuRegionTimer prune_k; /**< timer for the 1st pass list pruning kernel (l/nl, every PS step) */
-        bool didPrune = false; /**< true when we timed pruning and the timings need to be accounted for */
-        GpuRegionTimer rollingPrune_k; /**< timer for rolling pruning kernels (l/nl, frequency depends on chunk size)  */
-        bool           didRollingPrune =
-                false; /**< true when we timed rolling pruning (at the previous step) and the timings need to be accounted for */
+        //! timer for pair-list H2D transfers (l/nl, every PS step)
+        GpuRegionTimer pl_h2d;
+        //! true when a pair-list transfer has been done at this step
+        bool didPairlistH2D = false;
+        //! timer for non-bonded kernels (l/nl, every step)
+        GpuRegionTimer nb_k;
+        //! timer for the 1st pass list pruning kernel (l/nl, every PS step)
+        GpuRegionTimer prune_k;
+        //! true when we timed pruning and the timings need to be accounted for
+        bool didPrune = false;
+        //! timer for rolling pruning kernels (l/nl, frequency depends on chunk size)
+        GpuRegionTimer rollingPrune_k;
+        //! true when we timed rolling pruning (at the previous step) and the timings need to be accounted for
+        bool didRollingPrune = false;
     };
 
     //! timer for atom data transfer (every PS step)
@@ -103,29 +111,46 @@ struct gpu_timers_t
     gmx::EnumerationArray<InteractionLocality, Nbnxm::gpu_timers_t::Interaction> interaction;
 };
 
+/*! \internal
+ * \brief GPU pair list structure */
 struct gpu_plist
 {
-    int na_c; /**< number of atoms per cluster                  */
-
-    int                       nsci;       /**< size of sci, # of i clusters in the list     */
-    int                       sci_nalloc; /**< allocation size of sci                       */
-    DeviceBuffer<nbnxn_sci_t> sci;        /**< list of i-cluster ("super-clusters")         */
-
-    int                       ncj4;          /**< total # of 4*j clusters                      */
-    int                       cj4_nalloc;    /**< allocation size of cj4                       */
-    DeviceBuffer<nbnxn_cj4_t> cj4;           /**< 4*j cluster list, contains j cluster number
-                                                and index into the i cluster list            */
-    int                        nimask;       /**< # of 4*j clusters * # of warps               */
-    int                        imask_nalloc; /**< allocation size of imask                     */
-    DeviceBuffer<unsigned int> imask;        /**< imask for 2 warps for each 4*j cluster group */
-    DeviceBuffer<nbnxn_excl_t> excl;         /**< atom interaction bits                        */
-    int                        nexcl;        /**< count for excl                               */
-    int                        excl_nalloc;  /**< allocation size of excl                      */
+    //! number of atoms per cluster
+    int na_c;
+
+    //! size of sci, # of i clusters in the list
+    int nsci;
+    //! allocation size of sci
+    int sci_nalloc;
+    //! list of i-cluster ("super-clusters")
+    DeviceBuffer<nbnxn_sci_t> sci;
+
+    //! total # of 4*j clusters
+    int ncj4;
+    //! allocation size of cj4
+    int cj4_nalloc;
+    //! 4*j cluster list, contains j cluster number and index into the i cluster list
+    DeviceBuffer<nbnxn_cj4_t> cj4;
+    //! # of 4*j clusters * # of warps
+    int nimask;
+    //! allocation size of imask
+    int imask_nalloc;
+    //! imask for 2 warps for each 4*j cluster group
+    DeviceBuffer<unsigned int> imask;
+    //! atom interaction bits
+    DeviceBuffer<nbnxn_excl_t> excl;
+    //! count for excl
+    int nexcl;
+    //! allocation size of excl
+    int excl_nalloc;
 
     /* parameter+variables for normal and rolling pruning */
-    bool haveFreshList; /**< true after search, indictes that initial pruning with outer prunning is needed */
-    int  rollingPruningNumParts; /**< the number of parts/steps over which one cyle of roling pruning takes places */
-    int  rollingPruningPart; /**< the next part to which the roling pruning needs to be applied */
+    //! true after search, indicates that initial pruning with outer pruning is needed
+    bool haveFreshList;
+    //! the number of parts/steps over which one cycle of rolling pruning takes place
+    int rollingPruningNumParts;
+    //! the next part to which the rolling pruning needs to be applied
+    int rollingPruningPart;
 };
 
 } // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/grid.h b/src/gromacs/nbnxm/grid.h
index a8326a3bec..b55f83d222 100644
--- a/src/gromacs/nbnxm/grid.h
+++ b/src/gromacs/nbnxm/grid.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -125,14 +125,20 @@ struct BoundingBox
         //! Returns a pointer for SIMD storing of a Corner object
         float* ptr() { return &x; }
 
-        float x;       //!< x coordinate
-        float y;       //!< y coordinate
-        float z;       //!< z coordinate
-        float padding; //!< padding, unused, but should be set to avoid operations on unitialized data
+        //! x coordinate
+        float x;
+        //! y coordinate
+        float y;
+        //! z coordinate
+        float z;
+        //! padding, unused, but should be set to avoid operations on uninitialized data
+        float padding;
     };
 
-    Corner lower; //!< lower, along x and y and z, corner
-    Corner upper; //!< upper, along x and y and z, corner
+    //! lower, along x and y and z, corner
+    Corner lower;
+    //! upper, along x and y and z, corner
+    Corner upper;
 };
 
 /*! \internal
@@ -140,8 +146,10 @@ struct BoundingBox
  */
 struct BoundingBox1D
 {
-    float lower; //!< lower bound
-    float upper; //!< upper bound
+    //! lower bound
+    float lower;
+    //! upper bound
+    float upper;
 };
 
 } // namespace Nbnxm
@@ -182,14 +190,19 @@ public:
         //! Constructs the cluster/cell geometry given the type of pairlist
         Geometry(PairlistType pairlistType);
 
-        bool isSimple;             //!< Is this grid simple (CPU) or hierarchical (GPU)
-        int  numAtomsICluster;     //!< Number of atoms per cluster
-        int  numAtomsJCluster;     //!< Number of atoms for list j-clusters
-        int  numAtomsPerCell;      //!< Number of atoms per cell
-        int  numAtomsICluster2Log; //!< 2log of na_c
+        //! Is this grid simple (CPU) or hierarchical (GPU)
+        bool isSimple;
+        //! Number of atoms per cluster
+        int numAtomsICluster;
+        //! Number of atoms for list j-clusters
+        int numAtomsJCluster;
+        //! Number of atoms per cell
+        int numAtomsPerCell;
+        //! 2log of na_c
+        int numAtomsICluster2Log;
     };
 
-    // The physical dimensions of a grid
+    //! The physical dimensions of a grid \internal
     struct Dimensions
     {
         //! The lower corner of the (local) grid
diff --git a/src/gromacs/nbnxm/kernels_reference/kernel_gpu_ref.h b/src/gromacs/nbnxm/kernels_reference/kernel_gpu_ref.h
index a861307d2c..e9fbac5f87 100644
--- a/src/gromacs/nbnxm/kernels_reference/kernel_gpu_ref.h
+++ b/src/gromacs/nbnxm/kernels_reference/kernel_gpu_ref.h
@@ -33,9 +33,17 @@
  * To help us fund GROMACS development, we humbly ask that you cite
  * the research papers on the package. Check out http://www.gromacs.org.
  */
+/*! \internal \file
+ *
+ * \brief
+ * Declares GPU reference kernel
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \ingroup module_nbnxm
+ */
 
-#ifndef _nbnxn_kernel_gpu_ref_h
-#define _nbnxn_kernel_gpu_ref_h
+#ifndef GMX_NBNXM_KERNELS_REFERENCE_KERNEL_GPU_REF_H
+#define GMX_NBNXM_KERNELS_REFERENCE_KERNEL_GPU_REF_H
 
 #include "gromacs/math/vectypes.h"
 #include "gromacs/mdtypes/forcerec.h"
@@ -50,7 +58,7 @@ namespace gmx
 class StepWorkload;
 }
 
-/* Reference (slow) kernel for nb n vs n GPU type pair lists */
+//! Reference (slow) kernel for nb n vs n GPU type pair lists
 void nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu*    nbl,
                           const nbnxn_atomdata_t*    nbat,
                           const interaction_const_t* iconst,
diff --git a/src/gromacs/nbnxm/kernels_reference/kernel_ref.h b/src/gromacs/nbnxm/kernels_reference/kernel_ref.h
index 6d92c582f4..e98d2deba6 100644
--- a/src/gromacs/nbnxm/kernels_reference/kernel_ref.h
+++ b/src/gromacs/nbnxm/kernels_reference/kernel_ref.h
@@ -33,11 +33,21 @@
  * To help us fund GROMACS development, we humbly ask that you cite
  * the research papers on the package. Check out http://www.gromacs.org.
  */
+/*! \internal \file
+ *
+ * \brief
+ * Declares CPU reference kernels
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \ingroup module_nbnxm
+ */
+#ifndef GMX_NBNXM_KERNELS_REFERENCE_KERNEL_REF_H
+#define GMX_NBNXM_KERNELS_REFERENCE_KERNEL_REF_H
 
 #include "gromacs/nbnxm/kernel_common.h"
 
-/* Declare all the different kernel functions.
- */
+//! All the different CPU reference kernel functions.
+//! \{
 nbk_func_noener nbnxn_kernel_ElecRF_VdwLJ_F_ref;
 nbk_func_noener nbnxn_kernel_ElecRF_VdwLJFsw_F_ref;
 nbk_func_noener nbnxn_kernel_ElecRF_VdwLJPsw_F_ref;
@@ -85,15 +95,18 @@ nbk_func_ener nbnxn_kernel_ElecQSTabTwinCut_VdwLJFsw_VgrpF_ref;
 nbk_func_ener nbnxn_kernel_ElecQSTabTwinCut_VdwLJPsw_VgrpF_ref;
 nbk_func_ener nbnxn_kernel_ElecQSTabTwinCut_VdwLJEwCombGeom_VgrpF_ref;
 nbk_func_ener nbnxn_kernel_ElecQSTabTwinCut_VdwLJEwCombLB_VgrpF_ref;
+//! \}
 
 #ifdef INCLUDE_KERNELFUNCTION_TABLES
 
-/* Declare and define the kernel function pointer lookup tables.
+/*! \brief Declare and define the kernel function pointer lookup tables.
+ *
  * The minor index of the array goes over both the LJ combination rules,
  * which is only supported by plain cut-off, and the LJ switch/PME functions.
  * For the C reference kernels, unlike the SIMD kernels, there is not much
  * advantage in using combination rules, so we (re-)use the same kernel.
  */
+//! \{
 static p_nbk_func_noener nbnxn_kernel_noener_ref[coulktNR][vdwktNR_ref] = {
     { nbnxn_kernel_ElecRF_VdwLJ_F_ref, nbnxn_kernel_ElecRF_VdwLJ_F_ref, nbnxn_kernel_ElecRF_VdwLJ_F_ref,
       nbnxn_kernel_ElecRF_VdwLJFsw_F_ref, nbnxn_kernel_ElecRF_VdwLJPsw_F_ref,
@@ -147,5 +160,8 @@ static p_nbk_func_ener nbnxn_kernel_energrp_ref[coulktNR][vdwktNR_ref] = {
       nbnxn_kernel_ElecQSTabTwinCut_VdwLJPsw_VgrpF_ref, nbnxn_kernel_ElecQSTabTwinCut_VdwLJEwCombGeom_VgrpF_ref,
       nbnxn_kernel_ElecQSTabTwinCut_VdwLJEwCombLB_VgrpF_ref }
 };
+//! \}
 
 #endif /* INCLUDE_KERNELFUNCTION_TABLES */
+
+#endif
diff --git a/src/gromacs/nbnxm/nbnxm_geometry.h b/src/gromacs/nbnxm/nbnxm_geometry.h
index 0a438075e2..debc49b17e 100644
--- a/src/gromacs/nbnxm/nbnxm_geometry.h
+++ b/src/gromacs/nbnxm/nbnxm_geometry.h
@@ -33,7 +33,14 @@
  * To help us fund GROMACS development, we humbly ask that you cite
  * the research papers on the package. Check out http://www.gromacs.org.
  */
-
+/*! \internal \file
+ *
+ * \brief
+ * Declares the geometry-related functionality
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \ingroup module_nbnxm
+ */
 #ifndef GMX_NBNXM_NBNXM_GEOMETRY_H
 #define GMX_NBNXM_NBNXM_GEOMETRY_H
 
@@ -45,7 +52,8 @@
 #include "pairlist.h"
 
 
-/* Returns the base-2 log of n.
+/*! \brief Returns the base-2 log of n.
+ *
  * Generates a fatal error when n is not an integer power of 2.
  */
 static inline int get_2log(int n)
@@ -68,13 +76,13 @@ static inline int get_2log(int n)
 namespace Nbnxm
 {
 
-/* The nbnxn i-cluster size in atoms for each nbnxn kernel type */
+/*! \brief The nbnxn i-cluster size in atoms for each nbnxn kernel type */
 static constexpr gmx::EnumerationArray<KernelType, int> IClusterSizePerKernelType = {
     { 0, c_nbnxnCpuIClusterSize, c_nbnxnCpuIClusterSize, c_nbnxnCpuIClusterSize,
       c_nbnxnGpuClusterSize, c_nbnxnGpuClusterSize }
 };
 
-/* The nbnxn j-cluster size in atoms for each nbnxn kernel type */
+/*! \brief The nbnxn j-cluster size in atoms for each nbnxn kernel type */
 static constexpr gmx::EnumerationArray<KernelType, int> JClusterSizePerKernelType = {
     { 0, c_nbnxnCpuIClusterSize,
 #if GMX_SIMD
@@ -85,13 +93,14 @@ static constexpr gmx::EnumerationArray<KernelType, int> JClusterSizePerKernelTyp
       c_nbnxnGpuClusterSize, c_nbnxnGpuClusterSize }
 };
 
-/* Returns whether the pair-list corresponding to nb_kernel_type is simple */
+/*! \brief Returns whether the pair-list corresponding to \p kernelType is simple */
 static inline bool kernelTypeUsesSimplePairlist(const KernelType kernelType)
 {
     return (kernelType == KernelType::Cpu4x4_PlainC || kernelType == KernelType::Cpu4xN_Simd_4xN
             || kernelType == KernelType::Cpu4xN_Simd_2xNN);
 }
 
+//! Returns whether a SIMD kernel is in use
 static inline bool kernelTypeIsSimd(const KernelType kernelType)
 {
     return (kernelType == KernelType::Cpu4xN_Simd_4xN || kernelType == KernelType::Cpu4xN_Simd_2xNN);
@@ -99,7 +108,7 @@ static inline bool kernelTypeIsSimd(const KernelType kernelType)
 
 } // namespace Nbnxm
 
-/* Returns the effective list radius of the pair-list
+/*! \brief Returns the effective list radius of the pair-list
  *
  * Due to the cluster size the effective pair-list is longer than
  * that of a simple atom pair-list. This function gives the extra distance.
@@ -110,7 +119,7 @@ static inline bool kernelTypeIsSimd(const KernelType kernelType)
  */
 real nbnxn_get_rlist_effective_inc(int jClusterSize, real atomDensity);
 
-/* Returns the effective list radius of the pair-list
+/*! \brief Returns the effective list radius of the pair-list
  *
  * Due to the cluster size the effective pair-list is longer than
  * that of a simple atom pair-list. This function gives the extra distance.
diff --git a/src/gromacs/nbnxm/nbnxm_simd.h b/src/gromacs/nbnxm/nbnxm_simd.h
index 79c92f59fc..a81c52fdd2 100644
--- a/src/gromacs/nbnxm/nbnxm_simd.h
+++ b/src/gromacs/nbnxm/nbnxm_simd.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2014,2015,2019, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -32,7 +32,15 @@
  * To help us fund GROMACS development, we humbly ask that you cite
  * the research papers on the package. Check out http://www.gromacs.org.
  */
-
+/*! \internal \file
+ *
+ * \brief
+ * Defines constants used to know which Nbnxm kernel flavours (4xn or 2xnn)
+ * can be supported by the SIMD layer in use.
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \ingroup module_nbnxm
+ */
 #ifndef GMX_NBNXM_NBNXM_SIMD_H
 #define GMX_NBNXM_NBNXM_SIMD_H
 
@@ -41,7 +49,7 @@
 #include "gromacs/utility/real.h"
 
 #if GMX_SIMD
-/* The nbnxn SIMD 4xN and 2x(N+N) kernels can be added independently.
+/*! \brief The nbnxn SIMD 4xN and 2x(N+N) kernels can be added independently.
  * Currently the 2xNN SIMD kernels only make sense with:
  *  8-way SIMD: 4x4 setup, works with AVX-256 in single precision
  * 16-way SIMD: 4x8 setup, works with Intel MIC in single precision
@@ -59,4 +67,4 @@
 
 #endif // GMX_SIMD
 
-#endif /* _nbnxn_simd_h */
+#endif
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_consts.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_consts.h
index 2d3d29110c..87872c6552 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_consts.h
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_consts.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -32,6 +32,14 @@
  * To help us fund GROMACS development, we humbly ask that you cite
  * the research papers on the package. Check out http://www.gromacs.org.
  */
+/*! \internal \file
+ *
+ * \brief
+ * Declares constants for OpenCL code
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \ingroup module_nbnxm
+ */
 #ifndef NBNXN_OPENCL_CONSTS_H
 #define NBNXN_OPENCL_CONSTS_H
 
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh
index 4ebecb7584..05f66de557 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh
@@ -33,6 +33,15 @@
  * To help us fund GROMACS development, we humbly ask that you cite
  * the research papers on the package. Check out http://www.gromacs.org.
  */
+/*! \internal \file
+ *  \brief
+ *  Utility constants and function declarations for the OpenCL non-bonded kernels.
+ *  This header should be included once at the top level, just before the
+ *  kernels are included (has to be preceded by nbnxm_ocl_types.h).
+ *
+ *  \author Szilárd Páll <pall.szilard@gmail.com>
+ *  \ingroup module_nbnxm
+ */
 
 #define GMX_DOUBLE 0
 
@@ -144,50 +153,76 @@ typedef struct
 typedef struct cl_nbparam_params
 {
 
-    int eeltype; /**< type of electrostatics, takes values from #eelCu */
-    int vdwtype; /**< type of VdW impl., takes values from #evdwCu     */
-
-    float epsfac;      /**< charge multiplication factor                      */
-    float c_rf;        /**< Reaction-field/plain cutoff electrostatics const. */
-    float two_k_rf;    /**< Reaction-field electrostatics constant            */
-    float ewald_beta;  /**< Ewald/PME parameter                               */
-    float sh_ewald;    /**< Ewald/PME correction term substracted from the direct-space potential */
-    float sh_lj_ewald; /**< LJ-Ewald/PME correction term added to the correction potential        */
-    float ewaldcoeff_lj; /**< LJ-Ewald/PME coefficient                          */
-
-    float rcoulomb_sq; /**< Coulomb cut-off squared                           */
-
-    float rvdw_sq;       /**< VdW cut-off squared                               */
-    float rvdw_switch;   /**< VdW switched cut-off                              */
-    float rlistOuter_sq; /**< Full, outer pair-list cut-off squared             */
-    float rlistInner_sq; /**< Inner, dynamic pruned pair-list cut-off squared  XXX: this is only needed in the pruning kernels, but for now we also pass it to the nonbondeds */
-
-    shift_consts_t  dispersion_shift; /**< VdW shift dispersion constants           */
-    shift_consts_t  repulsion_shift;  /**< VdW shift repulsion constants            */
-    switch_consts_t vdw_switch;       /**< VdW switch constants                     */
+    //! type of electrostatics, takes values from #eelCu
+    int eeltype;
+    //! type of VdW impl., takes values from #evdwCu
+    int vdwtype;
+
+    //! charge multiplication factor
+    float epsfac;
+    //! Reaction-field/plain cutoff electrostatics const.
+    float c_rf;
+    //! Reaction-field electrostatics constant
+    float two_k_rf;
+    //! Ewald/PME parameter
+    float ewald_beta;
+    //! Ewald/PME correction term subtracted from the direct-space potential
+    float sh_ewald;
+    //! LJ-Ewald/PME correction term added to the correction potential
+    float sh_lj_ewald;
+    //! LJ-Ewald/PME coefficient
+    float ewaldcoeff_lj;
+
+    //! Coulomb cut-off squared
+    float rcoulomb_sq;
+
+    //! VdW cut-off squared
+    float rvdw_sq;
+    //! VdW switched cut-off
+    float rvdw_switch;
+    //! Full, outer pair-list cut-off squared
+    float rlistOuter_sq;
+    //! Inner, dynamic pruned pair-list cut-off squared  XXX: this is only needed in the pruning kernels, but for now we also pass it to the nonbondeds
+    float rlistInner_sq;
+
+    //! VdW shift dispersion constants
+    shift_consts_t dispersion_shift;
+    //! VdW shift repulsion constants
+    shift_consts_t repulsion_shift;
+    //! VdW switch constants
+    switch_consts_t vdw_switch;
 
     /* Ewald Coulomb force table data - accessed through texture memory */
-    float coulomb_tab_scale; /**< table scale/spacing                        */
+    //! table scale/spacing
+    float coulomb_tab_scale;
 } cl_nbparam_params_t;
 
 typedef struct
 {
-    int sci;           /* i-super-cluster       */
-    int shift;         /* Shift vector index plus possible flags */
-    int cj4_ind_start; /* Start index into cj4  */
-    int cj4_ind_end;   /* End index into cj4    */
+    //! i-super-cluster
+    int sci;
+    //! Shift vector index plus possible flags
+    int shift;
+    //! Start index into cj4
+    int cj4_ind_start;
+    //! End index into cj4
+    int cj4_ind_end;
 } nbnxn_sci_t;
 
 typedef struct
 {
-    unsigned int imask;    /* The i-cluster interactions mask for 1 warp  */
-    int          excl_ind; /* Index into the exclusion array for 1 warp   */
+    //! The i-cluster interactions mask for 1 warp
+    unsigned int imask;
+    //! Index into the exclusion array for 1 warp
+    int excl_ind;
 } nbnxn_im_ei_t;
 
 typedef struct
 {
-    int           cj[4];   /* The 4 j-clusters                            */
-    nbnxn_im_ei_t imei[2]; /* The i-cluster mask data       for 2 warps   */
+    //! The 4 j-clusters
+    int cj[4];
+    //! The i-cluster mask data for 2 warps
+    nbnxn_im_ei_t imei[2];
 } nbnxn_cj4_t;
 
 
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
index 303968ea96..c6f5636658 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
@@ -153,9 +153,12 @@ enum ePruneKind
  */
 typedef struct cl_nb_staging
 {
-    float* e_lj;        /**< LJ energy                       */
-    float* e_el;        /**< electrostatic energy            */
-    float (*fshift)[3]; /**< float3 buffer with shift forces */
+    //! LJ energy
+    float* e_lj;
+    //! electrostatic energy
+    float* e_el;
+    //! float3 buffer with shift forces
+    float (*fshift)[3];
 } cl_nb_staging_t;
 
 /*! \internal
@@ -163,29 +166,45 @@ typedef struct cl_nb_staging
  */
 typedef struct cl_atomdata
 {
-    int natoms;       /**< number of atoms                              */
-    int natoms_local; /**< number of local atoms                        */
-    int nalloc;       /**< allocation size for the atom data (xq, f)    */
-
-    cl_mem xq; /**< float4 buffer with atom coordinates + charges, size natoms */
-
-    cl_mem f;           /**< float3 buffer with force output array, size natoms         */
-    size_t f_elem_size; /**< Size in bytes for one element of f buffer      */
-
-    cl_mem e_lj; /**< LJ energy output, size 1                       */
-    cl_mem e_el; /**< Electrostatics energy input, size 1            */
-
-    cl_mem fshift;           /**< float3 buffer with shift forces                */
-    size_t fshift_elem_size; /**< Size in bytes for one element of fshift buffer */
-
-    int    ntypes;     /**< number of atom types                           */
-    cl_mem atom_types; /**< int buffer with atom type indices, size natoms */
-    cl_mem lj_comb;    /**< float2 buffer with sqrt(c6),sqrt(c12), size natoms */
-
-    cl_mem shift_vec;           /**< float3 buffer with shifts values               */
-    size_t shift_vec_elem_size; /**< Size in bytes for one element of shift_vec buffer */
-
-    cl_bool bShiftVecUploaded; /**< true if the shift vector has been uploaded  */
+    //! number of atoms
+    int natoms;
+    //! number of local atoms
+    int natoms_local;
+    //! allocation size for the atom data (xq, f)
+    int nalloc;
+
+    //! float4 buffer with atom coordinates + charges, size natoms
+    cl_mem xq;
+
+    //! float3 buffer with force output array, size natoms
+    cl_mem f;
+    //! Size in bytes for one element of f buffer
+    size_t f_elem_size;
+
+    //! LJ energy output, size 1
+    cl_mem e_lj;
+    //! Electrostatics energy input, size 1
+    cl_mem e_el;
+
+    //! float3 buffer with shift forces
+    cl_mem fshift;
+    //! Size in bytes for one element of fshift buffer
+    size_t fshift_elem_size;
+
+    //! number of atom types
+    int ntypes;
+    //! int buffer with atom type indices, size natoms
+    cl_mem atom_types;
+    //! float2 buffer with sqrt(c6),sqrt(c12), size natoms
+    cl_mem lj_comb;
+
+    //! float3 buffer with shifts values
+    cl_mem shift_vec;
+    //! Size in bytes for one element of shift_vec buffer
+    size_t shift_vec_elem_size;
+
+    //! true if the shift vector has been uploaded
+    cl_bool bShiftVecUploaded;
 } cl_atomdata_t;
 
 /*! \internal
@@ -194,36 +213,58 @@ typedef struct cl_atomdata
 typedef struct cl_nbparam
 {
 
-    int eeltype; /**< type of electrostatics, takes values from #eelOcl */
-    int vdwtype; /**< type of VdW impl., takes values from #evdwOcl     */
-
-    float epsfac;      /**< charge multiplication factor                      */
-    float c_rf;        /**< Reaction-field/plain cutoff electrostatics const. */
-    float two_k_rf;    /**< Reaction-field electrostatics constant            */
-    float ewald_beta;  /**< Ewald/PME parameter                               */
-    float sh_ewald;    /**< Ewald/PME correction term substracted from the direct-space potential */
-    float sh_lj_ewald; /**< LJ-Ewald/PME correction term added to the correction potential        */
-    float ewaldcoeff_lj; /**< LJ-Ewald/PME coefficient                          */
-
-    float rcoulomb_sq; /**< Coulomb cut-off squared                           */
-
-    float rvdw_sq;           /**< VdW cut-off squared                               */
-    float rvdw_switch;       /**< VdW switched cut-off                              */
-    float rlistOuter_sq;     /**< Full, outer pair-list cut-off squared             */
-    float rlistInner_sq;     /**< Inner, dynamic pruned pair-list cut-off squared   */
-    bool  useDynamicPruning; /**< True if we use dynamic pair-list pruning          */
-
-    shift_consts_t  dispersion_shift; /**< VdW shift dispersion constants           */
-    shift_consts_t  repulsion_shift;  /**< VdW shift repulsion constants            */
-    switch_consts_t vdw_switch;       /**< VdW switch constants                     */
+    //! type of electrostatics, takes values from #eelOcl
+    int eeltype;
+    //! type of VdW impl., takes values from #evdwOcl
+    int vdwtype;
+
+    //! charge multiplication factor
+    float epsfac;
+    //! Reaction-field/plain cutoff electrostatics const.
+    float c_rf;
+    //! Reaction-field electrostatics constant
+    float two_k_rf;
+    //! Ewald/PME parameter
+    float ewald_beta;
+    //! Ewald/PME correction term subtracted from the direct-space potential
+    float sh_ewald;
+    //! LJ-Ewald/PME correction term added to the correction potential
+    float sh_lj_ewald;
+    //! LJ-Ewald/PME coefficient
+    float ewaldcoeff_lj;
+
+    //! Coulomb cut-off squared
+    float rcoulomb_sq;
+
+    //! VdW cut-off squared
+    float rvdw_sq;
+    //! VdW switched cut-off
+    float rvdw_switch;
+    //! Full, outer pair-list cut-off squared
+    float rlistOuter_sq;
+    //! Inner, dynamic pruned pair-list cut-off squared
+    float rlistInner_sq;
+    //! True if we use dynamic pair-list pruning
+    bool useDynamicPruning;
+
+    //! VdW shift dispersion constants
+    shift_consts_t dispersion_shift;
+    //! VdW shift repulsion constants
+    shift_consts_t repulsion_shift;
+    //! VdW switch constants
+    switch_consts_t vdw_switch;
 
     /* LJ non-bonded parameters - accessed through texture memory */
-    cl_mem nbfp_climg2d; /**< nonbonded parameter table with C6/C12 pairs per atom type-pair, 2*ntype^2 elements */
-    cl_mem nbfp_comb_climg2d; /**< nonbonded parameter table per atom type, 2*ntype elements */
+    //! nonbonded parameter table with C6/C12 pairs per atom type-pair, 2*ntype^2 elements
+    cl_mem nbfp_climg2d;
+    //! nonbonded parameter table per atom type, 2*ntype elements
+    cl_mem nbfp_comb_climg2d;
 
     /* Ewald Coulomb force table data - accessed through texture memory */
-    float  coulomb_tab_scale;   /**< table scale/spacing                        */
-    cl_mem coulomb_tab_climg2d; /**< pointer to the table in the device memory  */
+    //! table scale/spacing
+    float coulomb_tab_scale;
+    //! pointer to the table in the device memory
+    cl_mem coulomb_tab_climg2d;
 } cl_nbparam_t;
 
 /*! \internal
@@ -234,30 +275,48 @@ typedef struct cl_nbparam
 typedef struct cl_nbparam_params
 {
 
-    int eeltype; /**< type of electrostatics, takes values from #eelCu */
-    int vdwtype; /**< type of VdW impl., takes values from #evdwCu     */
-
-    float epsfac;      /**< charge multiplication factor                      */
-    float c_rf;        /**< Reaction-field/plain cutoff electrostatics const. */
-    float two_k_rf;    /**< Reaction-field electrostatics constant            */
-    float ewald_beta;  /**< Ewald/PME parameter                               */
-    float sh_ewald;    /**< Ewald/PME correction term substracted from the direct-space potential */
-    float sh_lj_ewald; /**< LJ-Ewald/PME correction term added to the correction potential        */
-    float ewaldcoeff_lj; /**< LJ-Ewald/PME coefficient                          */
-
-    float rcoulomb_sq; /**< Coulomb cut-off squared                           */
-
-    float rvdw_sq;       /**< VdW cut-off squared                               */
-    float rvdw_switch;   /**< VdW switched cut-off                              */
-    float rlistOuter_sq; /**< Full, outer pair-list cut-off squared             */
-    float rlistInner_sq; /**< Inner, dynamic pruned pair-list cut-off squared   */
-
-    shift_consts_t  dispersion_shift; /**< VdW shift dispersion constants           */
-    shift_consts_t  repulsion_shift;  /**< VdW shift repulsion constants            */
-    switch_consts_t vdw_switch;       /**< VdW switch constants                     */
+    //! type of electrostatics, takes values from #eelOcl
+    int eeltype;
+    //! type of VdW impl., takes values from #evdwOcl
+    int vdwtype;
+
+    //! charge multiplication factor
+    float epsfac;
+    //! Reaction-field/plain cutoff electrostatics const.
+    float c_rf;
+    //! Reaction-field electrostatics constant
+    float two_k_rf;
+    //! Ewald/PME parameter
+    float ewald_beta;
+    //! Ewald/PME correction term subtracted from the direct-space potential
+    float sh_ewald;
+    //! LJ-Ewald/PME correction term added to the correction potential
+    float sh_lj_ewald;
+    //! LJ-Ewald/PME coefficient
+    float ewaldcoeff_lj;
+
+    //! Coulomb cut-off squared
+    float rcoulomb_sq;
+
+    //! VdW cut-off squared
+    float rvdw_sq;
+    //! VdW switched cut-off
+    float rvdw_switch;
+    //! Full, outer pair-list cut-off squared
+    float rlistOuter_sq;
+    //! Inner, dynamic pruned pair-list cut-off squared
+    float rlistInner_sq;
+
+    //! VdW shift dispersion constants
+    shift_consts_t dispersion_shift;
+    //! VdW shift repulsion constants
+    shift_consts_t repulsion_shift;
+    //! VdW switch constants
+    switch_consts_t vdw_switch;
 
     /* Ewald Coulomb force table data - accessed through texture memory */
-    float coulomb_tab_scale; /**< table scale/spacing                        */
+    //! table scale/spacing
+    float coulomb_tab_scale;
 } cl_nbparam_params_t;
 
 
@@ -276,8 +335,10 @@ typedef struct Nbnxm::gpu_timers_t cl_timers_t;
  */
 struct gmx_nbnxm_gpu_t
 {
-    const gmx_device_info_t*          dev_info;    /**< OpenCL device information    */
-    struct gmx_device_runtime_data_t* dev_rundata; /**< OpenCL runtime data (context, kernels) */
+    //! OpenCL device information
+    const gmx_device_info_t* dev_info;
+    //! OpenCL runtime data (context, kernels)
+    struct gmx_device_runtime_data_t* dev_rundata;
 
     /**< Pointers to non-bonded kernel functions
      * organized similar with nb_kfunc_xxx arrays in nbnxn_ocl.cpp */
@@ -287,9 +348,11 @@ struct gmx_nbnxm_gpu_t
     cl_kernel kernel_noener_prune_ptr[eelOclNR][evdwOclNR];
     cl_kernel kernel_ener_prune_ptr[eelOclNR][evdwOclNR];
     ///@}
-    cl_kernel kernel_pruneonly[ePruneNR]; /**< prune kernels, ePruneKind defined the kernel kinds */
+    //! prune kernels, ePruneKind defines the kernel kinds
+    cl_kernel kernel_pruneonly[ePruneNR];
 
-    bool bPrefetchLjParam; /**< true if prefetching fg i-atom LJ parameters should be used in the kernels */
+    //! true if prefetching fg i-atom LJ parameters should be used in the kernels
+    bool bPrefetchLjParam;
 
     /**< auxiliary kernels implementing memset-like functions */
     ///@{
@@ -299,23 +362,35 @@ struct gmx_nbnxm_gpu_t
     cl_kernel kernel_zero_e_fshift;
     ///@}
 
-    cl_bool bUseTwoStreams; /**< true if doing both local/non-local NB work on GPU          */
-    cl_bool bNonLocalStreamActive; /**< true indicates that the nonlocal_done event was enqueued */
-
-    cl_atomdata_t* atdat;   /**< atom data                                                  */
-    cl_nbparam_t*  nbparam; /**< parameters required for the non-bonded calc.               */
-    gmx::EnumerationArray<Nbnxm::InteractionLocality, cl_plist_t*> plist; /**< pair-list data structures (local and non-local)            */
-    cl_nb_staging_t nbst; /**< staging area where fshift/energies get downloaded          */
-
-    gmx::EnumerationArray<Nbnxm::InteractionLocality, cl_command_queue> stream; /**< local and non-local GPU queues                             */
-
-    /** events used for synchronization */
-    cl_event nonlocal_done;               /**< event triggered when the non-local non-bonded kernel
-                                             is done (and the local transfer can proceed) */
-    cl_event misc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in
-                                             the local stream that need to precede the
-                                             non-local force calculations are done
-                                             (e.g. f buffer 0-ing, local x/q H2D) */
+    //! true if doing both local/non-local NB work on GPU
+    cl_bool bUseTwoStreams;
+    //! true indicates that the nonlocal_done event was enqueued
+    cl_bool bNonLocalStreamActive;
+
+    //! atom data
+    cl_atomdata_t* atdat;
+    //! parameters required for the non-bonded calc.
+    cl_nbparam_t* nbparam;
+    //! pair-list data structures (local and non-local)
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, cl_plist_t*> plist;
+    //! staging area where fshift/energies get downloaded
+    cl_nb_staging_t nbst;
+
+    //! local and non-local GPU queues
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, cl_command_queue> stream;
+
+    /*! \brief Events used for synchronization */
+    /*! \{ */
+    /*! \brief Event triggered when the non-local non-bonded
+     * kernel is done (and the local transfer can proceed) */
+    cl_event nonlocal_done;
+    /*! \brief Event triggered when the tasks issued in the local
+     * stream that need to precede the non-local force or buffer
+     * operation calculations are done (e.g. f buffer 0-ing, local
+     * x/q H2D, buffer op initialization in local stream that is
+     * required also by nonlocal stream) */
+    cl_event misc_ops_and_local_H2D_done;
+    /*! \} */
 
     //! True if there has been local/nonlocal GPU work, either bonded or nonbonded, scheduled
     //  to be executed in the current domain. As long as bonded work is not split up into
@@ -323,9 +398,12 @@ struct gmx_nbnxm_gpu_t
     gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> haveWork;
 
 
-    cl_bool      bDoTime; /**< True if event-based timing is enabled.                     */
-    cl_timers_t* timers;  /**< OpenCL event-based timers.                                 */
-    struct gmx_wallclock_gpu_nbnxn_t* timings; /**< Timing data. TODO: deprecate this and query timers for accumulated data instead */
+    //! True if event-based timing is enabled.
+    cl_bool bDoTime;
+    //! OpenCL event-based timers.
+    cl_timers_t* timers;
+    //! Timing data. TODO: deprecate this and query timers for accumulated data instead
+    struct gmx_wallclock_gpu_nbnxn_t* timings;
 };
 
 #endif /* NBNXN_OPENCL_TYPES_H */
diff --git a/src/gromacs/nbnxm/pairlist.h b/src/gromacs/nbnxm/pairlist.h
index 777a42a5a0..3f7c62705f 100644
--- a/src/gromacs/nbnxm/pairlist.h
+++ b/src/gromacs/nbnxm/pairlist.h
@@ -57,25 +57,29 @@ struct NbnxnPairlistCpuWork;
 struct NbnxnPairlistGpuWork;
 
 
-/* Convenience type for vector with aligned memory */
+//! Convenience type for vector with aligned memory
 template<typename T>
 using AlignedVector = std::vector<T, gmx::AlignedAllocator<T>>;
 
-/* Convenience type for vector that avoids initialization at resize() */
+//! Convenience type for vector that avoids initialization at resize()
 template<typename T>
 using FastVector = std::vector<T, gmx::DefaultInitializationAllocator<T>>;
 
-/* A buffer data structure of 64 bytes
+/*! \brief Cache-line protection buffer
+ *
+ * A buffer data structure of 64 bytes
  * to be placed at the beginning and end of structs
  * to avoid cache invalidation of the real contents
  * of the struct by writes to neighboring memory.
  */
 typedef struct
 {
+    //! Unused field used to create space to protect cache lines that are in use
     int dummy[16];
 } gmx_cache_protect_t;
 
-/* This is the actual cluster-pair list j-entry.
+/*! \brief This is the actual cluster-pair list j-entry.
+ *
  * cj is the j-cluster.
  * The interaction bits in excl are indexed i-major, j-minor.
  * The cj entries are sorted such that ones with exclusions come first.
@@ -84,11 +88,15 @@ typedef struct
  */
 struct nbnxn_cj_t
 {
-    int          cj;   /* The j-cluster                    */
-    unsigned int excl; /* The exclusion (interaction) bits */
+    //! The j-cluster
+    int cj;
+    //! The exclusion (interaction) bits
+    unsigned int excl;
 };
 
-/* In nbnxn_ci_t the integer shift contains the shift in the lower 7 bits.
+/*! \brief Constants for interpreting interaction flags
+ *
+ * In nbnxn_ci_t the integer shift contains the shift in the lower 7 bits.
  * The upper bits contain information for non-bonded kernel optimization.
  * Simply calculating LJ and Coulomb for all pairs in a cluster pair is fine.
  * But three flags can be used to skip interactions, currently only for subc=0
@@ -96,66 +104,88 @@ struct nbnxn_cj_t
  * shift & NBNXN_CI_HALF_LJ(subc)    => we can skip LJ for the second half of i
  * !(shift & NBNXN_CI_DO_COUL(subc)) => we can skip Coulomb for all pairs
  */
+//! \{
 #define NBNXN_CI_SHIFT 127
 #define NBNXN_CI_DO_LJ(subc) (1 << (7 + 3 * (subc)))
 #define NBNXN_CI_HALF_LJ(subc) (1 << (8 + 3 * (subc)))
 #define NBNXN_CI_DO_COUL(subc) (1 << (9 + 3 * (subc)))
+//! \}
 
-/* Cluster-pair Interaction masks
+/*! \brief Cluster-pair Interaction masks
+ *
  * Bit i*j-cluster-size + j tells if atom i and j interact.
  */
+//! \{
 // TODO: Rename according to convention when moving into Nbnxn namespace
-/* All interaction mask is the same for all kernels */
+//! The all-interaction mask is the same for all kernels
 constexpr unsigned int NBNXN_INTERACTION_MASK_ALL = 0xffffffffU;
-/* 4x4 kernel diagonal mask */
+//! 4x4 kernel diagonal mask
 constexpr unsigned int NBNXN_INTERACTION_MASK_DIAG = 0x08ceU;
-/* 4x2 kernel diagonal masks */
+//! 4x2 kernel diagonal masks
+//! \{
 constexpr unsigned int NBNXN_INTERACTION_MASK_DIAG_J2_0 = 0x0002U;
 constexpr unsigned int NBNXN_INTERACTION_MASK_DIAG_J2_1 = 0x002fU;
-/* 4x8 kernel diagonal masks */
+//! \}
+//! 4x8 kernel diagonal masks
+//! \{
 constexpr unsigned int NBNXN_INTERACTION_MASK_DIAG_J8_0 = 0xf0f8fcfeU;
 constexpr unsigned int NBNXN_INTERACTION_MASK_DIAG_J8_1 = 0x0080c0e0U;
+//! \}
+//! \}
 
-/* Simple pair-list i-unit */
+/*! \internal
+ * \brief Simple pair-list i-unit
+ */
 struct nbnxn_ci_t
 {
-    int ci;           /* i-cluster             */
-    int shift;        /* Shift vector index plus possible flags, see above */
-    int cj_ind_start; /* Start index into cj   */
-    int cj_ind_end;   /* End index into cj     */
+    //! i-cluster
+    int ci;
+    //! Shift vector index plus possible flags, see above
+    int shift;
+    //! Start index into cj
+    int cj_ind_start;
+    //! End index into cj
+    int cj_ind_end;
 };
 
-/* Grouped pair-list i-unit */
+//! Grouped pair-list i-unit
 typedef struct
 {
-    /* Returns the number of j-cluster groups in this entry */
+    //! Returns the number of j-cluster groups in this entry
     int numJClusterGroups() const { return cj4_ind_end - cj4_ind_start; }
 
-    int sci;           /* i-super-cluster       */
-    int shift;         /* Shift vector index plus possible flags */
-    int cj4_ind_start; /* Start index into cj4  */
-    int cj4_ind_end;   /* End index into cj4    */
+    //! i-super-cluster
+    int sci;
+    //! Shift vector index plus possible flags
+    int shift;
+    //! Start index into cj4
+    int cj4_ind_start;
+    //! End index into cj4
+    int cj4_ind_end;
 } nbnxn_sci_t;
 
-/* Interaction data for a j-group for one warp */
+//! Interaction data for a j-group for one warp
 struct nbnxn_im_ei_t
 {
-    // The i-cluster interactions mask for 1 warp
+    //! The i-cluster interactions mask for 1 warp
     unsigned int imask = 0U;
-    // Index into the exclusion array for 1 warp, default index 0 which means no exclusions
+    //! Index into the exclusion array for 1 warp, default index 0 which means no exclusions
     int excl_ind = 0;
 };
 
+//! Four-way j-cluster lists
 typedef struct
 {
-    int           cj[c_nbnxnGpuJgroupSize];         /* The 4 j-clusters */
-    nbnxn_im_ei_t imei[c_nbnxnGpuClusterpairSplit]; /* The i-cluster mask data       for 2 warps */
+    //! The 4 j-clusters
+    int cj[c_nbnxnGpuJgroupSize];
+    //! The i-cluster mask data for 2 warps
+    nbnxn_im_ei_t imei[c_nbnxnGpuClusterpairSplit];
 } nbnxn_cj4_t;
 
-/* Struct for storing the atom-pair interaction bits for a cluster pair in a GPU pairlist */
+//! Struct for storing the atom-pair interaction bits for a cluster pair in a GPU pairlist
 struct nbnxn_excl_t
 {
-    /* Constructor, sets no exclusions, so all atom pairs interacting */
+    //! Constructor, sets no exclusions, so all atom pairs interacting
     nbnxn_excl_t()
     {
         for (unsigned int& pairEntry : pair)
@@ -164,32 +194,43 @@ struct nbnxn_excl_t
         }
     }
 
-    /* Topology exclusion interaction bits per warp */
+    //! Topology exclusion interaction bits per warp
     unsigned int pair[c_nbnxnGpuExclSize];
 };
 
-/* Cluster pairlist type for use on CPUs */
+//! Cluster pairlist type for use on CPUs
 struct NbnxnPairlistCpu
 {
     NbnxnPairlistCpu();
 
+    //! Cache protection
     gmx_cache_protect_t cp0;
 
-    int                    na_ci;   /* The number of atoms per i-cluster        */
-    int                    na_cj;   /* The number of atoms per j-cluster        */
-    real                   rlist;   /* The radius for constructing the list     */
-    FastVector<nbnxn_ci_t> ci;      /* The i-cluster list                       */
-    FastVector<nbnxn_ci_t> ciOuter; /* The outer, unpruned i-cluster list       */
-
-    FastVector<nbnxn_cj_t> cj;      /* The j-cluster list, size ncj             */
-    FastVector<nbnxn_cj_t> cjOuter; /* The outer, unpruned j-cluster list       */
-    int                    ncjInUse; /* The number of j-clusters that are used by ci entries in this list, will be <= cj.size() */
-
-    int nci_tot; /* The total number of i clusters           */
+    //! The number of atoms per i-cluster
+    int na_ci;
+    //! The number of atoms per j-cluster
+    int na_cj;
+    //! The radius for constructing the list
+    real rlist;
+    //! The i-cluster list
+    FastVector<nbnxn_ci_t> ci;
+    //! The outer, unpruned i-cluster list
+    FastVector<nbnxn_ci_t> ciOuter;
+
+    //! The j-cluster list, size ncj
+    FastVector<nbnxn_cj_t> cj;
+    //! The outer, unpruned j-cluster list
+    FastVector<nbnxn_cj_t> cjOuter;
+    //! The number of j-clusters that are used by ci entries in this list, will be <= cj.size()
+    int ncjInUse;
+
+    //! The total number of i clusters
+    int nci_tot;
 
-    /* Working data storage for list construction */
+    //! Working data storage for list construction
     std::unique_ptr<NbnxnPairlistCpuWork> work;
 
+    //! Cache protection
     gmx_cache_protect_t cp1;
 };
 
@@ -201,18 +242,23 @@ struct NbnxnPairlistCpu
  */
 struct NbnxnPairlistGpu
 {
-    /* Constructor
+    /*! \brief Constructor
      *
      * \param[in] pinningPolicy  Sets the pinning policy for all buffers used on the GPU
      */
     NbnxnPairlistGpu(gmx::PinningPolicy pinningPolicy);
 
+    //! Cache protection
     gmx_cache_protect_t cp0;
 
-    int  na_ci; /* The number of atoms per i-cluster        */
-    int  na_cj; /* The number of atoms per j-cluster        */
-    int  na_sc; /* The number of atoms per super cluster    */
-    real rlist; /* The radius for constructing the list     */
+    //! The number of atoms per i-cluster
+    int na_ci;
+    //! The number of atoms per j-cluster
+    int na_cj;
+    //! The number of atoms per super cluster
+    int na_sc;
+    //! The radius for constructing the list
+    real rlist;
     // The i-super-cluster list, indexes into cj4;
     gmx::HostVector<nbnxn_sci_t> sci;
     // The list of 4*j-cluster groups
@@ -222,9 +268,10 @@ struct NbnxnPairlistGpu
     // The total number of i-clusters
     int nci_tot;
 
-    /* Working data storage for list construction */
+    //! Working data storage for list construction
     std::unique_ptr<NbnxnPairlistGpuWork> work;
 
+    //! Cache protection
     gmx_cache_protect_t cp1;
 };
 
diff --git a/src/gromacs/nbnxm/pairlist_simd_2xmm.h b/src/gromacs/nbnxm/pairlist_simd_2xmm.h
index 0625ec9bb2..c7f72b4d9d 100644
--- a/src/gromacs/nbnxm/pairlist_simd_2xmm.h
+++ b/src/gromacs/nbnxm/pairlist_simd_2xmm.h
@@ -33,13 +33,22 @@
  * To help us fund GROMACS development, we humbly ask that you cite
  * the research papers on the package. Check out http://www.gromacs.org.
  */
+/*! \internal \file
+ *
+ * \brief
+ * Declares inline-friendly code for making 2xNN pairlists
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \ingroup module_nbnxm
+ */
+
 
-/* Stride of the packed x coordinate array */
+//! Stride of the packed x coordinate array
 static constexpr int c_xStride2xNN = (GMX_SIMD_REAL_WIDTH >= 2 * c_nbnxnCpuIClusterSize)
                                              ? GMX_SIMD_REAL_WIDTH / 2
                                              : c_nbnxnCpuIClusterSize;
 
-/* Copies PBC shifted i-cell packed atom coordinates to working array */
+//! Copies PBC shifted i-cell packed atom coordinates to working array
 static inline void icell_set_x_simd_2xnn(int  ci,
                                          real shx,
                                          real shy,
@@ -67,7 +76,7 @@ static inline void icell_set_x_simd_2xnn(int  ci,
           loadU1DualHsimd(x + ia + 2 * c_xStride2xNN + 2) + SimdReal(shz));
 }
 
-/* SIMD code for checking and adding cluster-pairs to the list using coordinates in packed format.
+/*! \brief SIMD code for checking and adding cluster-pairs to the list using coordinates in packed format.
  *
  * Checks bouding box distances and possibly atom pair distances.
  * This is an accelerated version of make_cluster_list_simple.
diff --git a/src/gromacs/nbnxm/pairlist_simd_4xm.h b/src/gromacs/nbnxm/pairlist_simd_4xm.h
index 9c1cb9a3cc..ce8cf3f28f 100644
--- a/src/gromacs/nbnxm/pairlist_simd_4xm.h
+++ b/src/gromacs/nbnxm/pairlist_simd_4xm.h
@@ -34,11 +34,20 @@
  * the research papers on the package. Check out http://www.gromacs.org.
  */
 
-/* Stride of the packed x coordinate array */
+/*! \internal \file
+ *
+ * \brief
+ * Declares inline-friendly code for making 4xN pairlists
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \ingroup module_nbnxm
+ */
+
+//! Stride of the packed x coordinate array
 static constexpr int c_xStride4xN =
         (GMX_SIMD_REAL_WIDTH > c_nbnxnCpuIClusterSize ? GMX_SIMD_REAL_WIDTH : c_nbnxnCpuIClusterSize);
 
-/* Copies PBC shifted i-cell packed atom coordinates to working array */
+//! Copies PBC shifted i-cell packed atom coordinates to working array
 static inline void icell_set_x_simd_4xn(int  ci,
                                         real shx,
                                         real shy,
@@ -66,7 +75,7 @@ static inline void icell_set_x_simd_4xn(int  ci,
     store(x_ci_simd + 11 * GMX_SIMD_REAL_WIDTH, SimdReal(x[ia + 2 * c_xStride4xN + 3] + shz));
 }
 
-/* SIMD code for checking and adding cluster-pairs to the list using coordinates in packed format.
+/*! \brief SIMD code for checking and adding cluster-pairs to the list using coordinates in packed format.
  *
  * Checks bouding box distances and possibly atom pair distances.
  * This is an accelerated version of make_cluster_list_simple.
diff --git a/src/gromacs/nbnxm/pairlistparams.h b/src/gromacs/nbnxm/pairlistparams.h
index 01755808f5..063f6dbf59 100644
--- a/src/gromacs/nbnxm/pairlistparams.h
+++ b/src/gromacs/nbnxm/pairlistparams.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -118,15 +118,24 @@ struct PairlistParams
      */
     PairlistParams(Nbnxm::KernelType kernelType, bool haveFep, real rlist, bool haveMultipleDomains);
 
-    PairlistType pairlistType;           //!< The type of cluster-pair list
-    bool         haveFep;                //!< Tells whether we have perturbed interactions
-    real         rlistOuter;             //!< Cut-off of the larger, outer pair-list
-    real         rlistInner;             //!< Cut-off of the smaller, inner pair-list
-    bool         haveMultipleDomains;    //!< True when using DD with multiple domains
-    bool         useDynamicPruning;      //!< Are we using dynamic pair-list pruning
-    int          nstlistPrune;           //!< Pair-list dynamic pruning interval
-    int          numRollingPruningParts; //!< The number parts to divide the pair-list into for rolling pruning, a value of 1 gives no rolling pruning
-    int          lifetime;               //!< Lifetime in steps of the pair-list
+    //! The type of cluster-pair list
+    PairlistType pairlistType;
+    //! Tells whether we have perturbed interactions
+    bool haveFep;
+    //! Cut-off of the larger, outer pair-list
+    real rlistOuter;
+    //! Cut-off of the smaller, inner pair-list
+    real rlistInner;
+    //! True when using DD with multiple domains
+    bool haveMultipleDomains;
+    //! Are we using dynamic pair-list pruning
+    bool useDynamicPruning;
+    //! Pair-list dynamic pruning interval
+    int nstlistPrune;
+    //! The number of parts to divide the pair-list into for rolling pruning, a value of 1 gives no rolling pruning
+    int numRollingPruningParts;
+    //! Lifetime in steps of the pair-list
+    int lifetime;
 };
 
 #endif
diff --git a/src/gromacs/nbnxm/pairlistsets.h b/src/gromacs/nbnxm/pairlistsets.h
index 9692e11169..d3ca0bdb8e 100644
--- a/src/gromacs/nbnxm/pairlistsets.h
+++ b/src/gromacs/nbnxm/pairlistsets.h
@@ -66,9 +66,11 @@ template<typename>
 class ListOfLists;
 }
 
+//! Contains sets of pairlists \internal
 class PairlistSets
 {
 public:
+    //! Constructor
     PairlistSets(const PairlistParams& pairlistParams,
                  bool                  haveMultipleDomains,
                  int                   minimumIlistCountForGpuBalancing);
diff --git a/src/gromacs/nbnxm/pairlistwork.h b/src/gromacs/nbnxm/pairlistwork.h
index 3f6aec3516..3fce22eab7 100644
--- a/src/gromacs/nbnxm/pairlistwork.h
+++ b/src/gromacs/nbnxm/pairlistwork.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -54,10 +54,10 @@
 #include "grid.h"
 #include "pairlist.h"
 
-/* Working data for the actual i-supercell during pair search */
+//! Working data for the actual i-supercell during pair search \internal
 struct NbnxnPairlistCpuWork
 {
-    // Struct for storing coordinats and bounding box for an i-entry during search
+    //! Struct for storing coordinates and bounding box for an i-entry during search \internal
     struct IClusterData
     {
         IClusterData() :
@@ -67,30 +67,30 @@ struct NbnxnPairlistCpuWork
         {
         }
 
-        // The bounding boxes, pbc shifted, for each cluster
+        //! The bounding boxes, pbc shifted, for each cluster
         AlignedVector<Nbnxm::BoundingBox> bb;
-        // The coordinates, pbc shifted, for each atom
+        //! The coordinates, pbc shifted, for each atom
         std::vector<real> x;
-        // Aligned list for storing 4*DIM*GMX_SIMD_REAL_WIDTH reals
+        //! Aligned list for storing 4*DIM*GMX_SIMD_REAL_WIDTH reals
         AlignedVector<real> xSimd;
     };
 
-    // Protect data from cache pollution between threads
+    //! Protect data from cache pollution between threads
     gmx_cache_protect_t cp0;
 
-    // Work data for generating an IEntry in the pairlist
+    //! Work data for generating an IEntry in the pairlist
     IClusterData iClusterData;
-    // The current cj_ind index for the current list
+    //! The current cj_ind index for the current list
     int cj_ind;
-    // Temporary j-cluster list, used for sorting on exclusions
+    //! Temporary j-cluster list, used for sorting on exclusions
     std::vector<nbnxn_cj_t> cj;
 
-    // Nr. of cluster pairs without Coulomb for flop counting
+    //! Nr. of cluster pairs without Coulomb for flop counting
     int ncj_noq;
-    // Nr. of cluster pairs with 1/2 LJ for flop count
+    //! Nr. of cluster pairs with 1/2 LJ for flop count
     int ncj_hlj;
 
-    // Protect data from cache pollution between threads
+    //! Protect data from cache pollution between threads
     gmx_cache_protect_t cp1;
 };
 
@@ -109,13 +109,13 @@ struct NbnxnPairlistGpuWork
         {
         }
 
-        // The bounding boxes, pbc shifted, for each cluster
+        //! The bounding boxes, pbc shifted, for each cluster
         AlignedVector<Nbnxm::BoundingBox> bb;
-        // As bb, but in packed xxxx format
+        //! As bb, but in packed xxxx format
         AlignedVector<float> bbPacked;
-        // The coordinates, pbc shifted, for each atom
+        //! The coordinates, pbc shifted, for each atom
         AlignedVector<real> x;
-        // Aligned coordinate list used for 4*DIM*GMX_SIMD_REAL_WIDTH floats
+        //! Aligned coordinate list used for 4*DIM*GMX_SIMD_REAL_WIDTH floats
         AlignedVector<real> xSimd;
     };
 
@@ -125,23 +125,23 @@ struct NbnxnPairlistGpuWork
     {
     }
 
-    // Protect data from cache pollution between threads
+    //! Protect data from cache pollution between threads
     gmx_cache_protect_t cp0;
 
-    // Work data for generating an i-entry in the pairlist
+    //! Work data for generating an i-entry in the pairlist
     ISuperClusterData iSuperClusterData;
-    // The current j-cluster index for the current list
+    //! The current j-cluster index for the current list
     int cj_ind;
-    // Bounding box distance work array
+    //! Bounding box distance work array
     AlignedVector<float> distanceBuffer;
 
-    // Buffer for sorting list entries
+    //! Buffer for sorting list entries
     std::vector<int> sortBuffer;
 
-    // Second sci array, for sorting
+    //! Second sci array, for sorting
     gmx::HostVector<nbnxn_sci_t> sci_sort;
 
-    // Protect data from cache pollution between threads
+    //! Protect data from cache pollution between threads
     gmx_cache_protect_t cp1;
 };
 
diff --git a/src/gromacs/nbnxm/pairsearch.h b/src/gromacs/nbnxm/pairsearch.h
index 357e78a19a..66d1ea39c9 100644
--- a/src/gromacs/nbnxm/pairsearch.h
+++ b/src/gromacs/nbnxm/pairsearch.h
@@ -74,20 +74,22 @@ template<class T>
 using AlignedVector = std::vector<T, gmx::AlignedAllocator<T>>;
 
 
-/* Local cycle count struct for profiling */
+//! Local cycle count struct for profiling \internal
 class nbnxn_cycle_t
 {
 public:
+    //! Start counting cycles
     void start() { start_ = gmx_cycles_read(); }
-
+    //! Stop counting cycles
     void stop()
     {
         cycles_ += gmx_cycles_read() - start_;
         count_++;
     }
-
+    //! Return the number of periods of cycle counting
     int count() const { return count_; }
 
+    //! Return the average number of million cycles per counting period
     double averageMCycles() const
     {
         if (count_ > 0)
@@ -101,9 +103,12 @@ public:
     }
 
 private:
-    int          count_  = 0;
+    //! Number of counting periods
+    int count_ = 0;
+    //! Total cycles in all counting periods
     gmx_cycles_t cycles_ = 0;
-    gmx_cycles_t start_  = 0;
+    //! Cycle count at the most recent start
+    gmx_cycles_t start_ = 0;
 };
 
 //! Local cycle count enum for profiling different parts of search
@@ -139,30 +144,37 @@ struct SearchCycleCounting
 
 // TODO: Move nbnxn_search_work_t definition to its own file
 
-/* Thread-local work struct, contains working data for Grid */
+//! Thread-local work struct, contains working data for Grid \internal
 struct PairsearchWork
 {
     PairsearchWork();
 
     ~PairsearchWork();
 
-    gmx_cache_protect_t cp0; /* Buffer to avoid cache polution */
+    //! Buffer to avoid cache pollution
+    gmx_cache_protect_t cp0;
 
-    std::vector<int> sortBuffer; /* Temporary buffer for sorting atoms within a grid column */
+    //! Temporary buffer for sorting atoms within a grid column
+    std::vector<int> sortBuffer;
 
-    nbnxn_buffer_flags_t buffer_flags; /* Flags for force buffer access */
+    //! Flags for force buffer access
+    nbnxn_buffer_flags_t buffer_flags;
 
-    int ndistc; /* Number of distance checks for flop counting */
+    //! Number of distance checks for flop counting
+    int ndistc;
 
 
-    std::unique_ptr<t_nblist> nbl_fep; /* Temporary FEP list for load balancing */
+    //! Temporary FEP list for load balancing
+    std::unique_ptr<t_nblist> nbl_fep;
 
-    nbnxn_cycle_t cycleCounter; /* Counter for thread-local cycles */
+    //! Counter for thread-local cycles
+    nbnxn_cycle_t cycleCounter;
 
-    gmx_cache_protect_t cp1; /* Buffer to avoid cache polution */
+    //! Buffer to avoid cache pollution
+    gmx_cache_protect_t cp1;
 };
 
-/* Main pair-search struct, contains the grid(s), not the pair-list(s) */
+//! Main pair-search struct, contains the grid(s), not the pair-list(s) \internal
 class PairSearch
 {
 public:
@@ -188,13 +200,16 @@ public:
         cycleCounting_.stop(enbsCCgrid);
     }
 
-    /* \brief Constructor
+    /*! \brief Constructor
      *
-     * \param[in] pbcType         The periodic boundary conditions
-     * \param[in] numDDCells      The number of domain decomposition cells per dimension, without DD nullptr should be passed
-     * \param[in] zones           The domain decomposition zone setup, without DD nullptr should be passed
-     * \param[in] haveFep         Tells whether non-bonded interactions are perturbed
-     * \param[in] maxNumThreads   The maximum number of threads used in the search
+     * \param[in] pbcType                  The periodic boundary conditions
+     * \param[in] doTestParticleInsertion  Whether test-particle insertion is active
+     * \param[in] numDDCells               The number of domain decomposition cells per dimension, without DD nullptr should be passed
+     * \param[in] zones                    The domain decomposition zone setup, without DD nullptr should be passed
+     * \param[in] pairlistType             The type of the pair list
+     * \param[in] haveFep                  Tells whether non-bonded interactions are perturbed
+     * \param[in] maxNumThreads            The maximum number of threads used in the search
+     * \param[in] pinningPolicy            Sets the pinning policy for all buffers used on the GPU
      */
     PairSearch(PbcType                   pbcType,
                bool                      doTestParticleInsertion,
@@ -202,7 +217,7 @@ public:
                const gmx_domdec_zones_t* zones,
                PairlistType              pairlistType,
                bool                      haveFep,
-               int                       maxNumthreads,
+               int                       maxNumThreads,
                gmx::PinningPolicy        pinningPolicy);
 
     //! Sets the order of the local atoms to the order grid atom ordering
-- 
2.11.4.GIT