2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2014,2015,2016,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 #ifndef CUDA_ARCH_UTILS_CUH_
36 #define CUDA_ARCH_UTILS_CUH_
41 * \brief CUDA arch dependent definitions.
43 * \author Szilard Pall <pall.szilard@gmail.com>
46 /* GMX_PTX_ARCH is set to the virtual arch (PTX) version targeted by
47 * the current compiler pass or zero for the host pass and it is
48 * intended to be used instead of __CUDA_ARCH__.
51 #define GMX_PTX_ARCH 0
53 #define GMX_PTX_ARCH __CUDA_ARCH__
56 /* Until CC 5.2 and likely for the near future all NVIDIA architectures
57 have a warp size of 32, but this could change later. If it does, the
58 following constants should depend on the value of GMX_PTX_ARCH.
60 static const int warp_size = 32;
61 static const int warp_size_log2 = 5;
62 /*! \brief Bitmask corresponding to all threads active in a warp.
63 * NOTE that here too we assume 32-wide warps.
65 static const unsigned int c_fullWarpMask = 0xffffffff;
67 /* Below are backward-compatibility wrappers for CUDA 9 warp-wide intrinsics. */
69 /*! \brief Compatibility wrapper around the CUDA __syncwarp() instrinsic. */
70 static __forceinline__ __device__
71 void gmx_syncwarp(const unsigned int activeMask = c_fullWarpMask)
73 #if GMX_CUDA_VERSION < 9000
74 /* no sync needed on pre-Volta. */
75 GMX_UNUSED_VALUE(activeMask);
77 __syncwarp(activeMask);
81 /*! \brief Compatibility wrapper around the CUDA __ballot()/__ballot_sync() instrinsic. */
82 static __forceinline__ __device__
83 unsigned int gmx_ballot_sync(const unsigned int activeMask,
86 #if GMX_CUDA_VERSION < 9000
87 GMX_UNUSED_VALUE(activeMask);
88 return __ballot(pred);
90 return __ballot_sync(activeMask, pred);
94 /*! \brief Compatibility wrapper around the CUDA __any()/__any_sync() instrinsic. */
95 static __forceinline__ __device__
96 int gmx_any_sync(const unsigned int activeMask,
99 #if GMX_CUDA_VERSION < 9000
100 GMX_UNUSED_VALUE(activeMask);
103 return __any_sync(activeMask, pred);
107 /*! \brief Compatibility wrapper around the CUDA __shfl_up()/__shfl_up_sync() instrinsic. */
108 template <typename T>
109 static __forceinline__ __device__
110 T gmx_shfl_up_sync(const unsigned int activeMask,
114 #if GMX_CUDA_VERSION < 9000
115 GMX_UNUSED_VALUE(activeMask);
116 return __shfl_up(var, offset);
118 return __shfl_up_sync(activeMask, var, offset);
122 /*! \brief Compatibility wrapper around the CUDA __shfl_down()/__shfl_down_sync() instrinsic. */
123 template <typename T>
124 static __forceinline__ __device__
125 T gmx_shfl_down_sync(const unsigned int activeMask,
129 #if GMX_CUDA_VERSION < 9000
130 GMX_UNUSED_VALUE(activeMask);
131 return __shfl_down(var, offset);
133 return __shfl_down_sync(activeMask, var, offset);
137 #endif /* CUDA_ARCH_UTILS_CUH_ */