src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "vectype_ops.clh"
  37
  38 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  39 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  40
  41 #define WARP_SIZE  32
  42
  43 #undef KERNEL_UTILS_INLINE
  44 #ifdef KERNEL_UTILS_INLINE
  45 #define __INLINE__ inline
  46 #else
  47 #define __INLINE__
  48 #endif
  49
  50 /* 1.0 / sqrt(M_PI) */
  51 #define M_FLOAT_1_SQRTPI 0.564189583547756f
  52
  53 //-------------------
  54
  55 #ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
  56 #define NBNXN_OPENCL_KERNEL_UTILS_CLH
  57
  58 __constant sampler_t generic_sampler     = CLK_NORMALIZED_COORDS_FALSE  /* Natural coords */
  59                                             | CLK_ADDRESS_NONE          /* No clamp/repeat*/
  60                                             | CLK_FILTER_NEAREST ;      /* No interpolation */
  61
  62 #define __device__
  63
  64 #define WARP_SIZE_LOG2  (5)
  65 #define CL_SIZE_LOG2    (3)  /* change this together with CL_SIZE !*/
  66 #define CL_SIZE_SQ      (CL_SIZE * CL_SIZE)
  67 #define FBUF_STRIDE     (CL_SIZE_SQ)
  68
  69 #define ONE_SIXTH_F     0.16666667f
  70 #define ONE_TWELVETH_F  0.08333333f
  71
  72
  73 // Data structures shared between OpenCL device code and OpenCL host code
  74 // TODO: review, improve
  75 // Replaced real by float for now, to avoid including any other header
  76 typedef struct {
  77     /*real*/float c2;
  78     /*real*/float c3;
  79     /*real*/float cpot;
  80 } shift_consts_t;
  81
  82 /* Used with potential switching:
  83  * rsw        = max(r - r_switch, 0)
  84  * sw         = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
  85  * dsw        = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
  86  * force      = force*dsw - potential*sw
  87  * potential *= sw
  88  */
  89 typedef struct {
  90     /*real*/float c3;
  91     /*real*/float c4;
  92     /*real*/float c5;
  93 } switch_consts_t;
  94
  95 // Data structure shared between the OpenCL device code and OpenCL host code
  96 // Must not contain OpenCL objects (buffers)
  97 typedef struct cl_nbparam_params
  98 {
  99
 100     int             eeltype;          /**< type of electrostatics, takes values from #eelCu */
 101     int             vdwtype;          /**< type of VdW impl., takes values from #evdwCu     */
 102
 103     float           epsfac;           /**< charge multiplication factor                      */
 104     float           c_rf;             /**< Reaction-field/plain cutoff electrostatics const. */
 105     float           two_k_rf;         /**< Reaction-field electrostatics constant            */
 106     float           ewald_beta;       /**< Ewald/PME parameter                               */
 107     float           sh_ewald;         /**< Ewald/PME correction term substracted from the direct-space potential */
 108     float           sh_lj_ewald;      /**< LJ-Ewald/PME correction term added to the correction potential        */
 109     float           ewaldcoeff_lj;    /**< LJ-Ewald/PME coefficient                          */
 110
 111     float           rcoulomb_sq;      /**< Coulomb cut-off squared                           */
 112
 113     float           rvdw_sq;          /**< VdW cut-off squared                               */
 114     float           rvdw_switch;      /**< VdW switched cut-off                              */
 115     float           rlistOuter_sq;    /**< Full, outer pair-list cut-off squared             */
 116     float           rlistInner_sq;    /**< Inner, dynamic pruned pair-list cut-off squared  XXX: this is only needed in the pruning kernels, but for now we also pass it to the nonbondeds */
 117
 118     shift_consts_t  dispersion_shift; /**< VdW shift dispersion constants           */
 119     shift_consts_t  repulsion_shift;  /**< VdW shift repulsion constants            */
 120     switch_consts_t vdw_switch;       /**< VdW switch constants                     */
 121
 122     /* Ewald Coulomb force table data - accessed through texture memory */
 123     float                  coulomb_tab_scale;  /**< table scale/spacing                        */
 124 }cl_nbparam_params_t;
 125
 126 typedef struct {
 127     int sci;            /* i-super-cluster       */
 128     int shift;          /* Shift vector index plus possible flags */
 129     int cj4_ind_start;  /* Start index into cj4  */
 130     int cj4_ind_end;    /* End index into cj4    */
 131 } nbnxn_sci_t;
 132
 133 typedef struct {
 134     unsigned int imask;    /* The i-cluster interactions mask for 1 warp  */
 135     int          excl_ind; /* Index into the exclusion array for 1 warp   */
 136 } nbnxn_im_ei_t;
 137
 138 typedef struct {
 139     int           cj[4];   /* The 4 j-clusters                            */
 140     nbnxn_im_ei_t imei[2]; /* The i-cluster mask data       for 2 warps   */
 141 } nbnxn_cj4_t;
 142
 143
 144 typedef struct {
 145     unsigned int pair[32]; /* Topology exclusion interaction bits for one warp,
 146                             * each unsigned has bitS for 4*8 i clusters
 147                             */
 148 } nbnxn_excl_t;
 149
 150 /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
 151 __constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
 152
 153 /*! Convert LJ sigma,epsilon parameters to C6,C12. */
 154 __INLINE__ __device__
 155 void convert_sigma_epsilon_to_c6_c12(const float  sigma,
 156                                      const float  epsilon,
 157                                      float       *c6,
 158                                      float       *c12)
 159 {
 160     float sigma2, sigma6;
 161
 162     sigma2 = sigma * sigma;
 163     sigma6 = sigma2 *sigma2 * sigma2;
 164     *c6    = epsilon * sigma6;
 165     *c12   = *c6 * sigma6;
 166 }
 167
 168
 169 /*! Apply force switch,  force + energy version. */
 170  __INLINE__ __device__
 171 void calculate_force_switch_F(cl_nbparam_params_t *nbparam,
 172                               float                c6,
 173                               float                c12,
 174                               float                inv_r,
 175                               float                r2,
 176                               float               *F_invr)
 177 {
 178     float r, r_switch;
 179
 180     /* force switch constants */
 181     float disp_shift_V2 = nbparam->dispersion_shift.c2;
 182     float disp_shift_V3 = nbparam->dispersion_shift.c3;
 183     float repu_shift_V2 = nbparam->repulsion_shift.c2;
 184     float repu_shift_V3 = nbparam->repulsion_shift.c3;
 185
 186     r         = r2 * inv_r;
 187     r_switch  = r - nbparam->rvdw_switch;
 188     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 189
 190     *F_invr  +=
 191         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 192         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 193 }
 194
 195 /*! Apply force switch, force-only version. */
 196 __INLINE__ __device__
 197 void calculate_force_switch_F_E(cl_nbparam_params_t *nbparam,
 198                                 float                c6,
 199                                 float                c12,
 200                                 float                inv_r,
 201                                 float                r2,
 202                                 float               *F_invr,
 203                                 float               *E_lj)
 204 {
 205     float r, r_switch;
 206
 207     /* force switch constants */
 208     float disp_shift_V2 = nbparam->dispersion_shift.c2;
 209     float disp_shift_V3 = nbparam->dispersion_shift.c3;
 210     float repu_shift_V2 = nbparam->repulsion_shift.c2;
 211     float repu_shift_V3 = nbparam->repulsion_shift.c3;
 212
 213     float disp_shift_F2 = nbparam->dispersion_shift.c2/3;
 214     float disp_shift_F3 = nbparam->dispersion_shift.c3/4;
 215     float repu_shift_F2 = nbparam->repulsion_shift.c2/3;
 216     float repu_shift_F3 = nbparam->repulsion_shift.c3/4;
 217
 218     r         = r2 * inv_r;
 219     r_switch  = r - nbparam->rvdw_switch;
 220     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 221
 222     *F_invr  +=
 223         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 224         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 225     *E_lj    +=
 226         c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
 227         c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
 228 }
 229
 230 /*! Apply potential switch, force-only version. */
 231 __INLINE__ __device__
 232 void calculate_potential_switch_F(cl_nbparam_params_t *nbparam,
 233                                   float                inv_r,
 234                                   float                r2,
 235                                   float               *F_invr,
 236                                   float               *E_lj)
 237 {
 238     float r, r_switch;
 239     float sw, dsw;
 240
 241     /* potential switch constants */
 242     float switch_V3 = nbparam->vdw_switch.c3;
 243     float switch_V4 = nbparam->vdw_switch.c4;
 244     float switch_V5 = nbparam->vdw_switch.c5;
 245     float switch_F2 = nbparam->vdw_switch.c3;
 246     float switch_F3 = nbparam->vdw_switch.c4;
 247     float switch_F4 = nbparam->vdw_switch.c5;
 248
 249     r        = r2 * inv_r;
 250     r_switch = r - nbparam->rvdw_switch;
 251
 252     /* Unlike in the F+E kernel, conditional is faster here */
 253     if (r_switch > 0.0f)
 254     {
 255         sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 256         dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 257
 258         *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 259     }
 260 }
 261
 262 /*! Apply potential switch, force + energy version. */
 263 __INLINE__ __device__
 264 void calculate_potential_switch_F_E(cl_nbparam_params_t *nbparam,
 265                                     float                inv_r,
 266                                     float                r2,
 267                                     float               *F_invr,
 268                                     float               *E_lj)
 269 {
 270     float r, r_switch;
 271     float sw, dsw;
 272
 273     /* potential switch constants */
 274     float switch_V3 = nbparam->vdw_switch.c3;
 275     float switch_V4 = nbparam->vdw_switch.c4;
 276     float switch_V5 = nbparam->vdw_switch.c5;
 277     float switch_F2 = nbparam->vdw_switch.c3;
 278     float switch_F3 = nbparam->vdw_switch.c4;
 279     float switch_F4 = nbparam->vdw_switch.c5;
 280
 281     r        = r2 * inv_r;
 282     r_switch = r - nbparam->rvdw_switch;
 283     r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
 284
 285     /* Unlike in the F-only kernel, masking is faster here */
 286     sw       = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 287     dsw      = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 288
 289     *F_invr  = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 290     *E_lj   *= sw;
 291 }
 292
 293 /*! Calculate LJ-PME grid force contribution with
 294  *  geometric combination rule.
 295  */
 296 __INLINE__ __device__
 297 void calculate_lj_ewald_comb_geom_F(__constant float * nbfp_comb_climg2d,
 298                                     int                typei,
 299                                     int                typej,
 300                                     float              r2,
 301                                     float              inv_r2,
 302                                     float              lje_coeff2,
 303                                     float              lje_coeff6_6,
 304                                     float             *F_invr)
 305 {
 306     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 307
 308     c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
 309
 310     /* Recalculate inv_r6 without exclusion mask */
 311     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 312     cr2       = lje_coeff2*r2;
 313     expmcr2   = exp(-cr2);
 314     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 315
 316     /* Subtract the grid force from the total LJ force */
 317     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 318 }
 319
 320 /*! Calculate LJ-PME grid force + energy contribution with
 321  *  geometric combination rule.
 322  */
 323 __INLINE__ __device__
 324 void calculate_lj_ewald_comb_geom_F_E(__constant float    *nbfp_comb_climg2d,
 325                                       cl_nbparam_params_t *nbparam,
 326                                       int                  typei,
 327                                       int                  typej,
 328                                       float                r2,
 329                                       float                inv_r2,
 330                                       float                lje_coeff2,
 331                                       float                lje_coeff6_6,
 332                                       float                int_bit,
 333                                       float               *F_invr,
 334                                       float               *E_lj)
 335 {
 336     float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
 337
 338     c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
 339
 340     /* Recalculate inv_r6 without exclusion mask */
 341     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 342     cr2       = lje_coeff2*r2;
 343     expmcr2   = exp(-cr2);
 344     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 345
 346     /* Subtract the grid force from the total LJ force */
 347     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 348
 349     /* Shift should be applied only to real LJ pairs */
 350     sh_mask   = nbparam->sh_lj_ewald*int_bit;
 351     *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 352 }
 353
 354 /*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
 355  *  Lorentz-Berthelot combination rule.
 356  *  We use a single F+E kernel with conditional because the performance impact
 357  *  of this is pretty small and LB on the CPU is anyway very slow.
 358  */
 359 __INLINE__ __device__
 360 void calculate_lj_ewald_comb_LB_F_E(__constant float    *nbfp_comb_climg2d,
 361                                     cl_nbparam_params_t *nbparam,
 362                                     int                  typei,
 363                                     int                  typej,
 364                                     float                r2,
 365                                     float                inv_r2,
 366                                     float                lje_coeff2,
 367                                     float                lje_coeff6_6,
 368                                     float                int_bit,
 369                                     bool                 with_E_lj,
 370                                     float               *F_invr,
 371                                     float               *E_lj)
 372 {
 373     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 374     float sigma, sigma2, epsilon;
 375
 376     /* sigma and epsilon are scaled to give 6*C6 */
 377     sigma      = nbfp_comb_climg2d[2*typei] + nbfp_comb_climg2d[2*typej];
 378
 379     epsilon    = nbfp_comb_climg2d[2*typei+1]*nbfp_comb_climg2d[2*typej+1];
 380
 381     sigma2  = sigma*sigma;
 382     c6grid  = epsilon*sigma2*sigma2*sigma2;
 383
 384     /* Recalculate inv_r6 without exclusion mask */
 385     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 386     cr2       = lje_coeff2*r2;
 387     expmcr2   = exp(-cr2);
 388     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 389
 390     /* Subtract the grid force from the total LJ force */
 391     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 392
 393     if (with_E_lj==true)
 394     {
 395         float sh_mask;
 396
 397         /* Shift should be applied only to real LJ pairs */
 398         sh_mask   = nbparam->sh_lj_ewald*int_bit;
 399         *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 400     }
 401 }
 402
 403 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
 404  *  Original idea: from the OpenMM project
 405  */
 406 __INLINE__ __device__ float
 407 interpolate_coulomb_force_r(__constant float *coulomb_tab_climg2d,
 408                             float             r,
 409                             float             scale)
 410 {
 411     float   normalized = scale * r;
 412     int     index      = (int) normalized;
 413     float   fract2     = normalized - index;
 414     float   fract1     = 1.0f - fract2;
 415
 416     return fract1*coulomb_tab_climg2d[index] +
 417            fract2*coulomb_tab_climg2d[index + 1];
 418 }
 419
 420 /*! Calculate analytical Ewald correction term. */
 421 __INLINE__ __device__
 422 float pmecorrF(float z2)
 423 {
 424     const float FN6 = -1.7357322914161492954e-8f;
 425     const float FN5 = 1.4703624142580877519e-6f;
 426     const float FN4 = -0.000053401640219807709149f;
 427     const float FN3 = 0.0010054721316683106153f;
 428     const float FN2 = -0.019278317264888380590f;
 429     const float FN1 = 0.069670166153766424023f;
 430     const float FN0 = -0.75225204789749321333f;
 431
 432     const float FD4 = 0.0011193462567257629232f;
 433     const float FD3 = 0.014866955030185295499f;
 434     const float FD2 = 0.11583842382862377919f;
 435     const float FD1 = 0.50736591960530292870f;
 436     const float FD0 = 1.0f;
 437
 438     float       z4;
 439     float       polyFN0, polyFN1, polyFD0, polyFD1;
 440
 441     z4          = z2*z2;
 442
 443     polyFD0     = FD4*z4 + FD2;
 444     polyFD1     = FD3*z4 + FD1;
 445     polyFD0     = polyFD0*z4 + FD0;
 446     polyFD0     = polyFD1*z2 + polyFD0;
 447
 448     polyFD0     = 1.0f/polyFD0;
 449
 450     polyFN0     = FN6*z4 + FN4;
 451     polyFN1     = FN5*z4 + FN3;
 452     polyFN0     = polyFN0*z4 + FN2;
 453     polyFN1     = polyFN1*z4 + FN1;
 454     polyFN0     = polyFN0*z4 + FN0;
 455     polyFN0     = polyFN1*z2 + polyFN0;
 456
 457     return polyFN0*polyFD0;
 458 }
 459
 460 /*! Final j-force reduction; this generic implementation works with
 461  *  arbitrary array sizes.
 462  */
 463 /* AMD OpenCL compiler error "Undeclared function index 1024" if __INLINE__d */
 464 __INLINE__ __device__
 465 void reduce_force_j_generic(__local float *f_buf, __global float *fout,
 466                             int tidxi, int tidxj, int aidx)
 467 {
 468     /* Split the reduction between the first 3 column threads
 469        Threads with column id 0 will do the reduction for (float3).x components
 470        Threads with column id 1 will do the reduction for (float3).y components
 471        Threads with column id 2 will do the reduction for (float3).z components.
 472        The reduction is performed for each line tidxj of f_buf. */
 473     if (tidxi < 3)
 474     {
 475         float f = 0.0f;
 476         for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
 477         {
 478             f += f_buf[FBUF_STRIDE * tidxi + j];
 479         }
 480
 481         atomicAdd_g_f(&fout[3 * aidx + tidxi], f);
 482     }
 483 }
 484
 485 /*! Final i-force reduction; this generic implementation works with
 486  *  arbitrary array sizes.
 487  */
 488 __INLINE__ __device__
 489 void reduce_force_i_generic(__local float *f_buf, __global float *fout,
 490                             float *fshift_buf, bool bCalcFshift,
 491                             int tidxi, int tidxj, int aidx)
 492 {
 493     /* Split the reduction between the first 3 line threads
 494        Threads with line id 0 will do the reduction for (float3).x components
 495        Threads with line id 1 will do the reduction for (float3).y components
 496        Threads with line id 2 will do the reduction for (float3).z components. */
 497     if (tidxj < 3)
 498     {
 499         float f = 0.0f;
 500         for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
 501         {
 502             f += f_buf[tidxj * FBUF_STRIDE + j];
 503         }
 504
 505         atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
 506
 507         if (bCalcFshift)
 508         {
 509             (*fshift_buf) += f;
 510         }
 511     }
 512 }
 513
 514 /*! Final i-force reduction; this implementation works only with power of two
 515  *  array sizes.
 516  */
 517 __INLINE__ __device__
 518 void reduce_force_i_pow2(volatile __local float *f_buf, __global float *fout,
 519                          float *fshift_buf, bool bCalcFshift,
 520                          int tidxi, int tidxj, int aidx)
 521 {
 522     int     i, j;
 523     /* Reduce the initial CL_SIZE values for each i atom to half
 524      * every step by using CL_SIZE * i threads.
 525      * Can't just use i as loop variable because than nvcc refuses to unroll.
 526      */
 527     i = CL_SIZE/2;
 528     for (j = CL_SIZE_LOG2 - 1; j > 0; j--)
 529     {
 530         if (tidxj < i)
 531         {
 532
 533             f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 534             f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 535             f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 536         }
 537         i >>= 1;
 538     }
 539
 540     /* i == 1, last reduction step, writing to global mem */
 541     /* Split the reduction between the first 3 line threads
 542        Threads with line id 0 will do the reduction for (float3).x components
 543        Threads with line id 1 will do the reduction for (float3).y components
 544        Threads with line id 2 will do the reduction for (float3).z components. */
 545     if (tidxj < 3)
 546     {
 547         float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
 548
 549         atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
 550
 551         if (bCalcFshift)
 552         {
 553             (*fshift_buf) += f;
 554         }
 555     }
 556 }
 557
 558 /*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 559  *  on whether the size of the array to be reduced is power of two or not.
 560  */
 561 __INLINE__ __device__
 562 void reduce_force_i(__local float *f_buf, __global float *f,
 563                     float *fshift_buf, bool bCalcFshift,
 564                     int tidxi, int tidxj, int ai)
 565 {
 566     if ((CL_SIZE & (CL_SIZE - 1)))
 567     {
 568         reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 569     }
 570     else
 571     {
 572         reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 573     }
 574 }
 575
 576 /*! Energy reduction; this implementation works only with power of two
 577  *  array sizes.
 578  */
 579 __INLINE__ __device__
 580 void reduce_energy_pow2(volatile __local float  *buf,
 581                         volatile __global float *e_lj,
 582                         volatile __global float *e_el,
 583                         unsigned int             tidx)
 584 {
 585     int     i, j;
 586     float   e1, e2;
 587
 588     i = WARP_SIZE/2;
 589
 590     /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 591     for (j = WARP_SIZE_LOG2 - 1; j > 0; j--)
 592     {
 593         if (tidx < i)
 594         {
 595             buf[              tidx] += buf[              tidx + i];
 596             buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
 597         }
 598         i >>= 1;
 599     }
 600
 601     /* last reduction step, writing to global mem */
 602     if (tidx == 0)
 603     {
 604         e1 = buf[              tidx] + buf[              tidx + i];
 605         e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
 606
 607         atomicAdd_g_f(e_lj, e1);
 608         atomicAdd_g_f(e_el, e2);
 609     }
 610 }
 611
 612 /*! Writes in debug_buffer the input value.
 613  *  Each thread has its own unique location in debug_buffer.
 614  *  Works for 2D global configurations.
 615  */
 616 void print_to_debug_buffer_f(__global float* debug_buffer, float value)
 617 {
 618     if (debug_buffer)
 619         debug_buffer[get_global_id(1) * get_global_size(0) + get_global_id(0)] = value;
 620 }
 621
 622 #endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */