/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 * This source code is part of GROMACS.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 *
 * As a special exception, you may use this file as part of a free software
 * library without restriction. Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
#include "gmx_x86_simd_macros.h"

#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])

#define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
#define UNROLLJ    GMX_X86_SIMD_WIDTH_HERE
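/* The kernel processes blocks of UNROLLI i-atoms (the i-cluster size) against
 * UNROLLJ j-atoms (the SIMD register width), i.e. UNROLLI*UNROLLJ atom pairs
 * per inner-loop iteration.
 */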
#if defined GMX_MM128_HERE || defined GMX_DOUBLE
#if defined GMX_MM256_HERE && !defined GMX_DOUBLE

/* SSE single precision 4x4 kernel */
#define SUM_SIMD(x) SUM_SIMD4(x)

/* SSE double precision 4x2 kernel */
#define SUM_SIMD(x) (x[0]+x[1])

/* AVX single precision 4x8 kernel */
#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])

/* AVX double precision 4x4 kernel */
#define SUM_SIMD(x) SUM_SIMD4(x)
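/* SUM_SIMD reduces an array holding one partial energy sum per SIMD lane to a
 * scalar; its width matches the register width of the kernel variant above.
 */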
#define SIMD_MASK_ALL 0xffffffff

#include "nbnxn_kernel_x86_simd_utils.h"

/* All functionality defines are set here, except for:
 * CALC_ENERGIES, ENERGY_GROUPS, which are defined before.
 * CHECK_EXCLS, which is set just before including the inner loop contents.
 * The combination rule defines, LJ_COMB_GEOM or LJ_COMB_LB, are currently
 * set before calling the kernel function. We might want to move that
 * to inside the n-loop and have a different combination rule for different
 * ci's, as using no combination rule gives a 50% performance hit for LJ.
 */
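/* A minimal sketch of how a kernel source file might instantiate this header;
 * the exact set of functionality defines (for instance the Coulomb-type define
 * that selects the rf versus tab name variants below) is chosen by the
 * including file and is an assumption here, not fixed by this header:
 *
 *   #define GMX_MM128_HERE
 *   #define CALC_ENERGIES
 *   #include "nbnxn_kernel_x86_simd_outer.h"
 */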
/* We always calculate shift forces, because it's cheap anyhow */
#define CALC_SHIFTFORCES

/* Assumes all LJ parameters are identical */
/* #define FIX_LJ_C */

#define NBK_FUNC_NAME_C_LJC(b,s,c,ljc,e) b##_##s##_##c##_comb_##ljc##_##e

#if defined LJ_COMB_GEOM
#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,geom,e)
#if defined LJ_COMB_LB
#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,lb,e)
#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,none,e)

#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,rf,e)
#ifndef VDW_CUTOFF_CHECK
#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab,e)
#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab_twin,e)

#ifdef GMX_MM128_HERE
#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd128,e)
#ifdef GMX_MM256_HERE
#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd256,e)

#ifndef CALC_ENERGIES
NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,noener)
#ifndef ENERGY_GROUPS
NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,ener)
NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
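/* The token-pasting macros above build the kernel function name from the SIMD
 * flavor, the electrostatics variant (rf, tab or tab_twin), the LJ combination
 * rule (geom, lb or none) and the energy variant (noener, ener or energrp),
 * e.g. nbnxn_kernel_x86_simd128_rf_comb_geom_ener.
 */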
#undef NBK_FUNC_NAME_C
#undef NBK_FUNC_NAME_C_LJC
(const nbnxn_pairlist_t     *nbl,
 const nbnxn_atomdata_t     *nbat,
 const interaction_const_t  *ic,
#ifdef CALC_SHIFTFORCES

    const nbnxn_ci_t   *nbln;
    const nbnxn_cj_t   *l_cj;
    const real         *shiftvec;
    const real         *nbfp0,*nbfp1,*nbfp2=NULL,*nbfp3=NULL;
    gmx_bool            half_LJ,do_coul;
    int                 sci,scix,sciy,sciz,sci2;
    int                 cjind0,cjind1,cjind;
    int                 egps_ishift,egps_imask;
    int                 egps_jshift,egps_jmask,egps_jstride;
    real               *vvdwtp[UNROLLI];
    gmx_mm_pr  ix_SSE0,iy_SSE0,iz_SSE0;
    gmx_mm_pr  ix_SSE1,iy_SSE1,iz_SSE1;
    gmx_mm_pr  ix_SSE2,iy_SSE2,iz_SSE2;
    gmx_mm_pr  ix_SSE3,iy_SSE3,iz_SSE3;
    gmx_mm_pr  fix_SSE0,fiy_SSE0,fiz_SSE0;
    gmx_mm_pr  fix_SSE1,fiy_SSE1,fiz_SSE1;
    gmx_mm_pr  fix_SSE2,fiy_SSE2,fiz_SSE2;
    gmx_mm_pr  fix_SSE3,fiy_SSE3,fiz_SSE3;

    __m128     fix_SSE,fiy_SSE,fiz_SSE;
    __m256d    fix_SSE,fiy_SSE,fiz_SSE;

    __m128d    fix0_SSE,fiy0_SSE,fiz0_SSE;
    __m128d    fix2_SSE,fiy2_SSE,fiz2_SSE;
#ifndef GMX_MM256_HERE

    __m128i    mask0 = _mm_set_epi32( 0x0008, 0x0004, 0x0002, 0x0001 );
    __m128i    mask1 = _mm_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010 );
    __m128i    mask2 = _mm_set_epi32( 0x0800, 0x0400, 0x0200, 0x0100 );
    __m128i    mask3 = _mm_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000 );

    /* For double precision we need to set two 32-bit ints for one double */
    __m128i    mask0 = _mm_set_epi32( 0x0002, 0x0002, 0x0001, 0x0001 );
    __m128i    mask1 = _mm_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004 );
    __m128i    mask2 = _mm_set_epi32( 0x0020, 0x0020, 0x0010, 0x0010 );
    __m128i    mask3 = _mm_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040 );

    /* AVX: use floating-point masks, as there are no 256-bit integer instructions */
    gmx_mm_pr  mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
    gmx_mm_pr  mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 ));

    /* There is no 256-bit int to double conversion, so we use float here */
    __m256     mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004, 0x0002, 0x0002, 0x0001, 0x0001 ));
    __m256     mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040, 0x0020, 0x0020, 0x0010, 0x0010 ));
    __m256     mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0800, 0x0800, 0x0400, 0x0400, 0x0200, 0x0200, 0x0100, 0x0100 ));
    __m256     mask3 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x8000, 0x4000, 0x4000, 0x2000, 0x2000, 0x1000, 0x1000 ));
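    /* mask0..mask3 select, for each SIMD lane, the exclusion/interaction bit of
     * i-atom 0..3 with the corresponding j-atom; the inner loop ANDs them with
     * the nbl->cj[].excl bitfield (SIMD_MASK_ALL means all pairs interact) to
     * build the per-pair interaction masks.
     */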
#ifndef GMX_MM256_HERE

    __m128     diag_SSE0 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
    __m128     diag_SSE1 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    __m128     diag_SSE2 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
    __m128     diag_SSE3 = gmx_mm_castsi128_pr( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));

    __m128d    diag0_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    __m128d    diag0_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    __m128d    diag0_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    __m128d    diag0_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    __m128d    diag1_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
    __m128d    diag1_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
    __m128d    diag1_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    __m128d    diag1_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));

#else /* GMX_MM256_HERE */

    gmx_mm_pr  diag0_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
    gmx_mm_pr  diag0_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag0_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag0_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag1_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag1_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag1_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag1_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));

    gmx_mm_pr  diag_SSE0 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag_SSE1 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag_SSE2 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag_SSE3 = _mm256_castsi256_pd( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
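    /* The diag masks zero out pairs with j <= i when a j-cluster coincides with
     * the i-cluster, so that each pair in the diagonal cluster pair is counted
     * only once and self-interactions are skipped; the diag0/diag1 pairs cover
     * the two j-halves when UNROLLJ differs from UNROLLI.
     */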
#ifndef GMX_MM256_HERE
    __m128i    zeroi_SSE = _mm_setzero_si128();
#ifdef GMX_X86_SSE4_1
    gmx_mm_pr  zero_SSE = gmx_set1_pr(0);

    gmx_mm_pr  one_SSE  = gmx_set1_pr(1.0);
    gmx_mm_pr  iq_SSE0  = gmx_setzero_pr();
    gmx_mm_pr  iq_SSE1  = gmx_setzero_pr();
    gmx_mm_pr  iq_SSE2  = gmx_setzero_pr();
    gmx_mm_pr  iq_SSE3  = gmx_setzero_pr();
    gmx_mm_pr  hrc_3_SSE,moh_rc_SSE;
    /* Coulomb table variables */
    gmx_mm_pr   invtsp_SSE;
    const real *tab_coul_F;
    const real *tab_coul_V;
#ifdef GMX_MM256_HERE
    int         ti0_array[2*UNROLLJ-1],*ti0;
    int         ti1_array[2*UNROLLJ-1],*ti1;
    int         ti2_array[2*UNROLLJ-1],*ti2;
    int         ti3_array[2*UNROLLJ-1],*ti3;

    gmx_mm_pr   mhalfsp_SSE;
    gmx_mm_pr   sh_ewald_SSE;
    gmx_mm_pr  hsig_i_SSE0,seps_i_SSE0;
    gmx_mm_pr  hsig_i_SSE1,seps_i_SSE1;
    gmx_mm_pr  hsig_i_SSE2,seps_i_SSE2;
    gmx_mm_pr  hsig_i_SSE3,seps_i_SSE3;

    real       pvdw_array[2*UNROLLI*UNROLLJ+3];
    real      *pvdw_c6,*pvdw_c12;
    gmx_mm_pr  c6_SSE0,c12_SSE0;
    gmx_mm_pr  c6_SSE1,c12_SSE1;
    gmx_mm_pr  c6_SSE2,c12_SSE2;
    gmx_mm_pr  c6_SSE3,c12_SSE3;

    gmx_mm_pr  c6s_SSE0,c12s_SSE0;
    gmx_mm_pr  c6s_SSE1,c12s_SSE1;
    gmx_mm_pr  c6s_SSE2 = gmx_setzero_pr(),c12s_SSE2 = gmx_setzero_pr();
    gmx_mm_pr  c6s_SSE3 = gmx_setzero_pr(),c12s_SSE3 = gmx_setzero_pr();
#endif /* LJ_COMB_LB */
    gmx_mm_pr  vctotSSE,VvdwtotSSE;
    gmx_mm_pr  sixthSSE,twelvethSSE;

    gmx_mm_pr  avoid_sing_SSE;
#ifdef VDW_CUTOFF_CHECK
    gmx_mm_pr  rcvdw2_SSE;

    gmx_mm_pr  sh_invrc6_SSE,sh_invrc12_SSE;

    /* cppcheck-suppress unassignedVariable */
    real       tmpsum_array[15],*tmpsum;
#ifdef CALC_SHIFTFORCES
    /* cppcheck-suppress unassignedVariable */
    real       shf_array[15],*shf;
#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
    /* No combination rule used */
    nbfp_ptr    = nbat->nbfp_s4;
#define NBFP_STRIDE  4
    nbfp_ptr    = nbat->nbfp;
#define NBFP_STRIDE  2
    nbfp_stride = NBFP_STRIDE;
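    /* Without a combination rule the per-pair C6/C12 parameters are looked up
     * in the non-bonded force-parameter table: nbfp_s4 stores each type pair
     * padded to a stride of 4 reals, the plain nbfp table uses a stride of 2
     * (c6,c12); nbfp_stride is used below to index these tables per i-atom type.
     */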
#ifdef GMX_MM256_HERE
    /* Generate aligned table pointers */
    ti0 = (int *)(((size_t)(ti0_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
    ti1 = (int *)(((size_t)(ti1_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
    ti2 = (int *)(((size_t)(ti2_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
    ti3 = (int *)(((size_t)(ti3_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
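    /* ti0..ti3 hold the per-lane Coulomb table indices used by the table lookup
     * in the inner loop; the pointers are rounded up inside the oversized
     * ti*_array buffers so that each starts on an UNROLLJ*sizeof(real)-byte
     * boundary, as required for aligned SIMD access.
     */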
    invtsp_SSE   = gmx_set1_pr(ic->tabq_scale);
    mhalfsp_SSE  = gmx_set1_pr(-0.5/ic->tabq_scale);
    sh_ewald_SSE = gmx_set1_pr(ic->sh_ewald);

    tab_coul_F = ic->tabq_coul_FDV0;
    tab_coul_F = ic->tabq_coul_F;
    tab_coul_V = ic->tabq_coul_V;

    shiftvec = shift_vec[0];

    avoid_sing_SSE = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
    /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
    rc2_SSE    = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
#ifdef VDW_CUTOFF_CHECK
    rcvdw2_SSE = gmx_set1_pr(ic->rvdw*ic->rvdw);

    sixthSSE    = gmx_set1_pr(1.0/6.0);
    twelvethSSE = gmx_set1_pr(1.0/12.0);

    sh_invrc6_SSE  = gmx_set1_pr(ic->sh_invrc6);
    sh_invrc12_SSE = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);

    mrc_3_SSE  = gmx_set1_pr(-2*ic->k_rf);
    hrc_3_SSE  = gmx_set1_pr(ic->k_rf);
    moh_rc_SSE = gmx_set1_pr(-ic->c_rf);
    tmpsum = (real *)(((size_t)(tmpsum_array+7)) & (~((size_t)31)));
#ifdef CALC_SHIFTFORCES
    shf = (real *)(((size_t)(shf_array+7)) & (~((size_t)31)));
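    /* tmpsum and shf point at 32-byte aligned storage inside the oversized
     * stack arrays, so that complete SIMD registers can be stored there when
     * the energy and shift-force accumulators are reduced to scalars below.
     */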
    pvdw_c6  = (real *)(((size_t)(pvdw_array+3)) & (~((size_t)15)));
    pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;

    for(jp=0; jp<UNROLLJ; jp++)
    {
        pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];

        pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
    }
    c6_SSE0  = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
    c6_SSE1  = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
    c6_SSE2  = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
    c6_SSE3  = gmx_load_pr(pvdw_c6 +3*UNROLLJ);

    c12_SSE0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
    c12_SSE1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
    c12_SSE2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
    c12_SSE3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
#endif /* FIX_LJ_C */
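    /* With FIX_LJ_C every i-j pair uses the C6/C12 of atom type 0, replicated
     * across all UNROLLI*UNROLLJ entries and loaded into the c6/c12 registers
     * once, outside the pair-list loop.
     */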
    egps_ishift  = nbat->neg_2log;
    egps_imask   = (1<<egps_ishift) - 1;
    egps_jshift  = 2*nbat->neg_2log;
    egps_jmask   = (1<<egps_jshift) - 1;
    egps_jstride = (UNROLLJ>>1)*UNROLLJ;
    /* The major division of the energy buffers is over i-particle energy groups;
     * determine the stride of one i-group block.
     */
    Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
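    /* The energy-group indices of the UNROLLI i-atoms are packed into a single
     * integer in nbat->energrp, neg_2log bits per atom; egps_ishift and
     * egps_imask extract them, and Vstride_i is the distance between
     * consecutive i-group blocks in the Vvdw/Vc buffers addressed through
     * vvdwtp[] and vctp[].
     */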
    for(n=0; n<nbl->nci; n++)

        ish    = (nbln->shift & NBNXN_CI_SHIFT);

        cjind0 = nbln->cj_ind_start;
        cjind1 = nbln->cj_ind_end;
        /* Currently this only works for super-cells equal to sub-cells */
        ci_sh  = (ish == CENTRAL ? ci : -1);

        shX_SSE = gmx_load1_pr(shiftvec+ish3);
        shY_SSE = gmx_load1_pr(shiftvec+ish3+1);
        shZ_SSE = gmx_load1_pr(shiftvec+ish3+2);

        sci   = (ci>>1)*STRIDE;
        scix  = sci*DIM + (ci & 1)*(STRIDE>>1);
        sci2  = sci*2   + (ci & 1)*(STRIDE>>1);
        sci  += (ci & 1)*(STRIDE>>1);

        half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
        do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));

        egps_i = nbat->energrp[ci];

        for(ia=0; ia<4; ia++)
        {
            egp_ia     = (egps_i >> (ia*egps_ishift)) & egps_imask;
            vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
            vctp[ia]   = Vc   + egp_ia*Vstride_i;
        }
#if defined CALC_ENERGIES
        if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))

            Vc_sub_self = 0.5*ic->c_rf;
            Vc_sub_self = 0.5*tab_coul_F[2];
            Vc_sub_self = 0.5*tab_coul_V[0];

            for(ia=0; ia<UNROLLI; ia++)
            {
                vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
                    -= facel*qi*qi*Vc_sub_self;
            }
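        /* When the first j-cluster in the list is the i-cluster itself, the
         * inner loop evaluates the i==i "self" pairs; their constant Coulomb
         * contribution (0.5*c_rf for reaction-field, half the tabulated value
         * at r=0 for Ewald tables) is subtracted again here, once per i-atom,
         * scaled by facel*qi*qi.
         */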
        /* Load i atom data */
        sciy = scix + STRIDE;
        sciz = sciy + STRIDE;
        ix_SSE0 = gmx_add_pr(gmx_load1_pr(x+scix)  ,shX_SSE);
        ix_SSE1 = gmx_add_pr(gmx_load1_pr(x+scix+1),shX_SSE);
        ix_SSE2 = gmx_add_pr(gmx_load1_pr(x+scix+2),shX_SSE);
        ix_SSE3 = gmx_add_pr(gmx_load1_pr(x+scix+3),shX_SSE);
        iy_SSE0 = gmx_add_pr(gmx_load1_pr(x+sciy)  ,shY_SSE);
        iy_SSE1 = gmx_add_pr(gmx_load1_pr(x+sciy+1),shY_SSE);
        iy_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciy+2),shY_SSE);
        iy_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciy+3),shY_SSE);
        iz_SSE0 = gmx_add_pr(gmx_load1_pr(x+sciz)  ,shZ_SSE);
        iz_SSE1 = gmx_add_pr(gmx_load1_pr(x+sciz+1),shZ_SSE);
        iz_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciz+2),shZ_SSE);
        iz_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciz+3),shZ_SSE);

        /* With half_LJ we currently always calculate Coulomb interactions */
        if (do_coul || half_LJ)
        {
            iq_SSE0 = gmx_set1_pr(facel*q[sci]);
            iq_SSE1 = gmx_set1_pr(facel*q[sci+1]);
            iq_SSE2 = gmx_set1_pr(facel*q[sci+2]);
            iq_SSE3 = gmx_set1_pr(facel*q[sci+3]);
        }
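        /* The i-atom charges are pre-multiplied by the Coulomb prefactor facel,
         * so the inner loop can combine them with the j-atom charges directly,
         * without an extra multiplication per pair.
         */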
            hsig_i_SSE0 = gmx_load1_pr(ljc+sci2+0);
            hsig_i_SSE1 = gmx_load1_pr(ljc+sci2+1);
            hsig_i_SSE2 = gmx_load1_pr(ljc+sci2+2);
            hsig_i_SSE3 = gmx_load1_pr(ljc+sci2+3);
            seps_i_SSE0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
            seps_i_SSE1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
            seps_i_SSE2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
            seps_i_SSE3 = gmx_load1_pr(ljc+sci2+STRIDE+3);

            c6s_SSE0 = gmx_load1_pr(ljc+sci2+0);
            c6s_SSE1 = gmx_load1_pr(ljc+sci2+1);
            c6s_SSE2 = gmx_load1_pr(ljc+sci2+2);
            c6s_SSE3 = gmx_load1_pr(ljc+sci2+3);
            c12s_SSE0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
            c12s_SSE1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
            c12s_SSE2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
            c12s_SSE3 = gmx_load1_pr(ljc+sci2+STRIDE+3);

            nbfp0 = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
            nbfp1 = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
            nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
            nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
        /* Zero the potential energy for this list */
        VvdwtotSSE = gmx_setzero_pr();
        vctotSSE   = gmx_setzero_pr();

        /* Clear i atom forces */
        fix_SSE0 = gmx_setzero_pr();
        fix_SSE1 = gmx_setzero_pr();
        fix_SSE2 = gmx_setzero_pr();
        fix_SSE3 = gmx_setzero_pr();
        fiy_SSE0 = gmx_setzero_pr();
        fiy_SSE1 = gmx_setzero_pr();
        fiy_SSE2 = gmx_setzero_pr();
        fiy_SSE3 = gmx_setzero_pr();
        fiz_SSE0 = gmx_setzero_pr();
        fiz_SSE1 = gmx_setzero_pr();
        fiz_SSE2 = gmx_setzero_pr();
        fiz_SSE3 = gmx_setzero_pr();

        /* Currently all kernels use (at least half) LJ */
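        /* The j-clusters of each list entry are ordered so that those with
         * exclusions come first: the while loops below process them with
         * exclusion masking enabled (cj[].excl != SIMD_MASK_ALL), and the
         * plain for loops then handle the remaining, fully interacting
         * j-clusters without that overhead.  The pattern is repeated for the
         * different LJ/Coulomb variants of the inner loop.
         */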
            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
                cjind++;
            }
            for(; (cjind<cjind1); cjind++)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
            }

            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
                cjind++;
            }
            for(; (cjind<cjind1); cjind++)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
            }

            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
                cjind++;
            }
            for(; (cjind<cjind1); cjind++)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
            }
        ninner += cjind1 - cjind0;

        /* Add accumulated i-forces to the force array */
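        /* GMX_MM_TRANSPOSE_SUM4_PR transposes the four per-i-atom accumulators
         * and sums over the SIMD lanes, leaving the total x (or y, z) force on
         * i-atoms 0..3 in a single 4-wide register, which is then added to f[].
         * The 128-bit double-precision case instead reduces pairs of
         * accumulators with GMX_MM_TRANSPOSE_SUM2_PD and stores two halves.
         */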
#define gmx_load_ps4  _mm_load_ps
#define gmx_store_ps4 _mm_store_ps
#define gmx_add_ps4   _mm_add_ps
#define gmx_load_ps4  _mm256_load_pd
#define gmx_store_ps4 _mm256_store_pd
#define gmx_add_ps4   _mm256_add_pd

        GMX_MM_TRANSPOSE_SUM4_PR(fix_SSE0,fix_SSE1,fix_SSE2,fix_SSE3,fix_SSE);
        gmx_store_ps4(f+scix, gmx_add_ps4(fix_SSE, gmx_load_ps4(f+scix)));

        GMX_MM_TRANSPOSE_SUM4_PR(fiy_SSE0,fiy_SSE1,fiy_SSE2,fiy_SSE3,fiy_SSE);
        gmx_store_ps4(f+sciy, gmx_add_ps4(fiy_SSE, gmx_load_ps4(f+sciy)));

        GMX_MM_TRANSPOSE_SUM4_PR(fiz_SSE0,fiz_SSE1,fiz_SSE2,fiz_SSE3,fiz_SSE);
        gmx_store_ps4(f+sciz, gmx_add_ps4(fiz_SSE, gmx_load_ps4(f+sciz)));

#ifdef CALC_SHIFTFORCES
        gmx_store_ps4(shf,fix_SSE);
        fshift[ish3+0] += SUM_SIMD4(shf);
        gmx_store_ps4(shf,fiy_SSE);
        fshift[ish3+1] += SUM_SIMD4(shf);
        gmx_store_ps4(shf,fiz_SSE);
        fshift[ish3+2] += SUM_SIMD4(shf);
        GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE0,fix_SSE1,fix0_SSE);
        _mm_store_pd(f+scix, _mm_add_pd(fix0_SSE, _mm_load_pd(f+scix)));
        GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE2,fix_SSE3,fix2_SSE);
        _mm_store_pd(f+scix+2, _mm_add_pd(fix2_SSE, _mm_load_pd(f+scix+2)));

        GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE0,fiy_SSE1,fiy0_SSE);
        _mm_store_pd(f+sciy, _mm_add_pd(fiy0_SSE, _mm_load_pd(f+sciy)));
        GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE2,fiy_SSE3,fiy2_SSE);
        _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_SSE, _mm_load_pd(f+sciy+2)));

        GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE0,fiz_SSE1,fiz0_SSE);
        _mm_store_pd(f+sciz, _mm_add_pd(fiz0_SSE, _mm_load_pd(f+sciz)));
        GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE2,fiz_SSE3,fiz2_SSE);
        _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_SSE, _mm_load_pd(f+sciz+2)));

#ifdef CALC_SHIFTFORCES
        _mm_store_pd(shf,_mm_add_pd(fix0_SSE,fix2_SSE));
        fshift[ish3+0] += shf[0] + shf[1];
        _mm_store_pd(shf,_mm_add_pd(fiy0_SSE,fiy2_SSE));
        fshift[ish3+1] += shf[0] + shf[1];
        _mm_store_pd(shf,_mm_add_pd(fiz0_SSE,fiz2_SSE));
        fshift[ish3+2] += shf[0] + shf[1];
        gmx_store_pr(tmpsum,vctotSSE);
        *Vc += SUM_SIMD(tmpsum);

        gmx_store_pr(tmpsum,VvdwtotSSE);
        *Vvdw += SUM_SIMD(tmpsum);
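        /* The per-list energy accumulators are stored to the aligned tmpsum
         * buffer and reduced across SIMD lanes with SUM_SIMD before being
         * added to the output Coulomb and Van der Waals energies.
         */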
        /* Outer loop uses 6 flops/iteration */

    printf("atom pairs %d\n",npair);

#undef CALC_SHIFTFORCES