[gromacs/rigid-bodies.git] / src / mdlib / genborn_sse2_double.c
blob: 4be1567e82411bcff97e8cd6f74f0cc2da9ebbf8
commit: Fixed include statements such that double precision version of genborn.c
1 #ifdef HAVE_CONFIG_H
2 #include <config.h>
3 #endif
5 #include <math.h>
6 #include <string.h>
8 #include "typedefs.h"
9 #include "smalloc.h"
10 #include "genborn.h"
11 #include "vec.h"
12 #include "grompp.h"
13 #include "pdbio.h"
14 #include "names.h"
15 #include "physics.h"
16 #include "domdec.h"
17 #include "partdec.h"
18 #include "network.h"
19 #include "gmx_fatal.h"
20 #include "mtop_util.h"
23 #ifdef GMX_LIB_MPI
24 #include <mpi.h>
25 #endif
26 #ifdef GMX_THREADS
27 #include "tmpi.h"
28 #endif
30 /* Only compile this file if SSE2 intrinsics are available */
31 #if ( (defined(GMX_IA32_SSE2) || defined(GMX_X86_64_SSE2) || defined(GMX_SSE2)) && defined(GMX_DOUBLE) )
32 #include <xmmintrin.h>
33 #include <emmintrin.h>
35 #include "genborn_sse2_double.h"
37 #if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
38 #define gmx_castsi128_pd(a) _mm_castsi128_pd(a)
39 #define gmx_castpd_si128(a) _mm_castpd_si128(a)
40 #elif defined(__GNUC__)
41 #define gmx_castsi128_pd(a) ((__m128d)(a))
42 #define gmx_castpd_si128(a) ((__m128i)(a))
43 #else
44 static __m128d gmx_castsi128_pd(__m128i a) { return *(__m128d *) &a; }
45 static __m128i gmx_castpd_si128(__m128d a) { return *(__m128i *) &a; }
46 #endif
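/* The gmx_castsi128_pd()/gmx_castpd_si128() helpers above only reinterpret the
 * 128-bit register between integer and floating-point views; no value
 * conversion takes place. MSVC/icc expose this as _mm_castsi128_pd() and
 * _mm_castpd_si128(), gcc accepts a direct vector-type cast, and the last
 * fallback simply re-reads the bits through a pointer of the other type.
 */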
49 static inline void
50 sincos_sse2double(__m128d x, __m128d *sinval, __m128d *cosval)
52 const __m128d two_over_pi = {2.0/M_PI,2.0/M_PI};
53 const __m128d half = {0.5,0.5};
54 const __m128d one = {1.0,1.0};
55 const __m128i izero = _mm_set1_epi32(0);
56 const __m128i ione = _mm_set1_epi32(1);
57 const __m128i itwo = _mm_set1_epi32(2);
58 const __m128i ithree = _mm_set1_epi32(3);
59 const __m128d sincosd_kc1 = {(13176794.0 / 8388608.0),(13176794.0 / 8388608.0)};
60 const __m128d sincosd_kc2 = {7.5497899548918821691639751442098584e-8,7.5497899548918821691639751442098584e-8};
61 const __m128d sincosd_cc0 = {0.00000000206374484196,0.00000000206374484196};
62 const __m128d sincosd_cc1 = {-0.00000027555365134677,-0.00000027555365134677};
63 const __m128d sincosd_cc2 = {0.00002480157946764225,0.00002480157946764225};
64 const __m128d sincosd_cc3 = {-0.00138888888730525966,-0.00138888888730525966};
65 const __m128d sincosd_cc4 = {0.04166666666651986722,0.04166666666651986722};
66 const __m128d sincosd_cc5 = {-0.49999999999999547304,-0.49999999999999547304};
67 const __m128d sincosd_sc0 = {0.00000000015893606014,0.00000000015893606014};
68 const __m128d sincosd_sc1 = {-0.00000002505069049138,-0.00000002505069049138};
69 const __m128d sincosd_sc2 = {0.00000275573131527032,0.00000275573131527032};
70 const __m128d sincosd_sc3 = {-0.00019841269827816117,-0.00019841269827816117};
71 const __m128d sincosd_sc4 = {0.00833333333331908278,0.00833333333331908278};
72 const __m128d sincosd_sc5 = {-0.16666666666666612594,-0.16666666666666612594};
74 __m128d signbit = gmx_castsi128_pd(_mm_set1_epi64x(0x8000000000000000ULL));
75 __m128d tiny = gmx_castsi128_pd(_mm_set1_epi64x(0x3e40000000000000ULL));
77 __m128d xl,xl2,xl3,qd,absxl,p1,cx,sx,ts,tc,tsn,tcn;
78 __m128i q;
79 __m128i offsetSin,offsetCos;
80 __m128d sinMask,cosMask,isTiny;
81 __m128d ct0,ct1,ct2,ct3,ct4,ct5,ct6,st1,st2,st3,st4,st6;
83 /* Rescale the angle to the range 0..4, and find which quadrant it is in */
84 xl = _mm_mul_pd(x,two_over_pi);
86 /* q=integer part of xl, rounded _away_ from 0.0 */
87 /* Add 0.5 away from 0.0 */
88 xl = _mm_add_pd(xl,_mm_or_pd(_mm_and_pd(xl,signbit),half));
90 q = _mm_cvttpd_epi32(xl);
91 qd = _mm_cvtepi32_pd(q);
92 q = _mm_shuffle_epi32(q,_MM_SHUFFLE(1,1,0,0));
94 /* Compute offset based on quadrant the arg falls in */
95 offsetSin = _mm_and_si128(q,ithree);
96 offsetCos = _mm_add_epi32(offsetSin,ione);
98 /* Remainder in range [-pi/4..pi/4] */
99 p1 = _mm_mul_pd(qd,sincosd_kc1);
100 xl = _mm_mul_pd(qd,sincosd_kc2);
101 p1 = _mm_sub_pd(x,p1);
102 xl = _mm_sub_pd(p1,xl);
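/* Illustrative scalar form of the reduction above (not used by the code):
 *
 *   q  = nearest integer to x*2/pi, ties rounded away from zero;
 *   xl = (x - q*kc1) - q*kc2;   remainder, roughly in [-pi/4, pi/4]
 *
 * sincosd_kc1 + sincosd_kc2 together approximate pi/2; splitting the constant
 * this way (Cody-Waite style) keeps q*kc1 essentially free of rounding error,
 * so the remainder stays accurate even for arguments several periods from zero.
 */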
104 absxl = _mm_andnot_pd(signbit,xl);
105 isTiny = _mm_cmpgt_pd(tiny,absxl);
107 xl2 = _mm_mul_pd(xl,xl);
108 xl3 = _mm_mul_pd(xl2,xl);
110 ct0 = _mm_mul_pd(xl2,xl2);
111 ct1 = _mm_mul_pd(sincosd_cc0,xl2);
112 ct2 = _mm_mul_pd(sincosd_cc2,xl2);
113 ct3 = _mm_mul_pd(sincosd_cc4,xl2);
114 st1 = _mm_mul_pd(sincosd_sc0,xl2);
115 st2 = _mm_mul_pd(sincosd_sc2,xl2);
116 st3 = _mm_mul_pd(sincosd_sc4,xl2);
117 ct1 = _mm_add_pd(ct1,sincosd_cc1);
118 ct2 = _mm_add_pd(ct2,sincosd_cc3);
119 ct3 = _mm_add_pd(ct3,sincosd_cc5);
120 st1 = _mm_add_pd(st1,sincosd_sc1);
121 st2 = _mm_add_pd(st2,sincosd_sc3);
122 st3 = _mm_add_pd(st3,sincosd_sc5);
124 ct4 = _mm_mul_pd(ct2,ct0);
125 ct4 = _mm_add_pd(ct4,ct3);
127 st4 = _mm_mul_pd(st2,ct0);
128 st4 = _mm_add_pd(st4,st3);
129 ct5 = _mm_mul_pd(ct0,ct0);
131 ct6 = _mm_mul_pd(ct5,ct1);
132 ct6 = _mm_add_pd(ct6,ct4);
134 st6 = _mm_mul_pd(ct5,st1);
135 st6 = _mm_add_pd(st6,st4);
137 cx = _mm_mul_pd(ct6,xl2);
138 cx = _mm_add_pd(cx,one);
140 sx = _mm_mul_pd(st6,xl3);
141 sx = _mm_add_pd(sx,xl);
143 /* Small angle approximation, sin(tiny)=tiny, cos(tiny)=1.0 */
144 sx = _mm_or_pd( _mm_and_pd(isTiny,xl) , _mm_andnot_pd(isTiny,sx) );
145 cx = _mm_or_pd( _mm_and_pd(isTiny,one) , _mm_andnot_pd(isTiny,cx) );
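/* The _mm_or_pd(_mm_and_pd(mask,a), _mm_andnot_pd(mask,b)) pattern used here,
 * and throughout this file, is a branchless per-element select: each lane gets
 * a where the comparison mask is all ones and b where it is zero, i.e. the
 * SSE2 equivalent of (mask ? a : b) for each double.
 */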
147 sinMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetSin,ione), izero));
148 cosMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetCos,ione), izero));
150 ts = _mm_or_pd( _mm_and_pd(sinMask,sx) , _mm_andnot_pd(sinMask,cx) );
151 tc = _mm_or_pd( _mm_and_pd(cosMask,sx) , _mm_andnot_pd(cosMask,cx) );
153 /* Flip the sign of the result when (offset mod 4) = 1 or 2 */
154 sinMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetSin,itwo), izero));
155 tsn = _mm_xor_pd(signbit,ts);
156 ts = _mm_or_pd( _mm_and_pd(sinMask,ts) , _mm_andnot_pd(sinMask,tsn) );
158 cosMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetCos,itwo), izero));
159 tcn = _mm_xor_pd(signbit,tc);
160 tc = _mm_or_pd( _mm_and_pd(cosMask,tc) , _mm_andnot_pd(cosMask,tcn) );
162 *sinval = ts;
163 *cosval = tc;
165 return;
168 __m128d log_pd(__m128d x)
170 const __m128i exp_mask = _mm_set_epi32(0x7FF00000,0,0x7FF00000,0);
171 const __m128i exp_bias = _mm_set_epi32(0,1023,0,1023);
173 const __m128d const_loge = _mm_set1_pd(0.69314718055994529);
174 const __m128d const_one = _mm_set1_pd(1.0);
175 const __m128d const_two = _mm_set1_pd(2.0);
177 /* Almost full single precision accuracy (~20 bits worst case) */
178 const __m128d P0 = _mm_set1_pd(6.108179944792157749153050);
179 const __m128d P1 = _mm_set1_pd(52.43691313715523327631139);
180 const __m128d P2 = _mm_set1_pd(71.53664010795613671168440);
181 const __m128d P3 = _mm_set1_pd(18.92097516931559547548485);
182 const __m128d P4 = _mm_set1_pd(0.3504714784635941984522153);
183 const __m128d P5 = _mm_set1_pd(-0.007105890734229368515879);
184 const __m128d Q1 = _mm_set1_pd(17.73314231909420567454406);
185 const __m128d Q2 = _mm_set1_pd(48.82373085428713023213363);
186 const __m128d Q3 = _mm_set1_pd(31.65945943354513166309101);
187 const __m128d Q4 = _mm_set1_pd(4.302477477108162270199051);
189 __m128d xmm0,xmm1,xmm2,xmm3, xmm4;
190 __m128i xmmi,xmmj;
192 xmmi = gmx_castpd_si128(x);
193 xmm1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(_mm_sub_epi64(_mm_srli_epi64(_mm_and_si128(xmmi, exp_mask), 52), exp_bias),_MM_SHUFFLE(3,1,2,0)));
194 xmm0 = _mm_or_pd(gmx_castsi128_pd(_mm_andnot_si128(exp_mask, xmmi)), const_one);
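/* At this point xmm1 holds the unbiased exponent e of each input as a double,
 * and xmm0 holds the mantissa m rescaled into [1,2) (exponent field forced to
 * that of 1.0). The rational approximation below, multiplied by (m - 1),
 * evaluates log2(m), and since
 *
 *   log(x) = log(2^e * m) = (e + log2(m)) * log(2)
 *
 * the sum is finally scaled by const_loge = log(2).
 */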
196 xmm2 = _mm_mul_pd(P5,xmm0);
197 xmm2 = _mm_add_pd(xmm2,P4);
198 xmm2 = _mm_mul_pd(xmm2,xmm0);
199 xmm2 = _mm_add_pd(xmm2,P3);
200 xmm2 = _mm_mul_pd(xmm2,xmm0);
201 xmm2 = _mm_add_pd(xmm2,P2);
202 xmm2 = _mm_mul_pd(xmm2,xmm0);
203 xmm2 = _mm_add_pd(xmm2,P1);
204 xmm2 = _mm_mul_pd(xmm2,xmm0);
205 xmm2 = _mm_add_pd(xmm2,P0);
207 xmm3 = _mm_mul_pd(Q4,xmm0);
208 xmm3 = _mm_add_pd(xmm3,Q3);
209 xmm3 = _mm_mul_pd(xmm3,xmm0);
210 xmm3 = _mm_add_pd(xmm3,Q2);
211 xmm3 = _mm_mul_pd(xmm3,xmm0);
212 xmm3 = _mm_add_pd(xmm3,Q1);
213 xmm3 = _mm_mul_pd(xmm3,xmm0);
214 xmm3 = _mm_add_pd(xmm3,const_one);
216 /* xmm4=1.0/xmm3 */
217 xmm4 = _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(xmm3)));
218 xmm4 = _mm_mul_pd(xmm4,_mm_sub_pd(const_two,_mm_mul_pd(xmm3,xmm4)));
219 xmm4 = _mm_mul_pd(xmm4,_mm_sub_pd(const_two,_mm_mul_pd(xmm3,xmm4)));
220 xmm2 = _mm_mul_pd(xmm2,xmm4);
222 xmm0 = _mm_sub_pd(xmm0, const_one);
223 xmm0 = _mm_mul_pd(xmm0,xmm2);
225 xmm0 = _mm_add_pd(xmm0,xmm1);
227 return _mm_mul_pd(xmm0, const_loge);
230 /* This exp() routine provides accuracy of about 1e-9 to 1e-11.
231 * The polynomial minimax coefficients are actually accurate to about 1e-14,
232 * but we lose some accuracy in the polynomial evaluation. */
234 __m128d exp_pd(__m128d x)
236 const __m128d lim1 = _mm_set1_pd(1025.0); /* 1025.00000e+0d */
237 const __m128d lim2 = _mm_set1_pd(-1022.99999999); /* -1022.99999e+0f */
239 const __m128i base = _mm_set_epi32(0,0,1023,1023);
240 const __m128d half = _mm_set1_pd(0.5);
241 const __m128d log2e = _mm_set1_pd(1.4426950408889634);
243 const __m128d exp_P0 = _mm_set1_pd(1.00000000000001276211229749);
244 const __m128d exp_P1 = _mm_set1_pd(6.931471805598709708635169E-1);
245 const __m128d exp_P2 = _mm_set1_pd(2.402265069564965287455972E-1);
246 const __m128d exp_P3 = _mm_set1_pd(5.550410866868561155599683E-2);
247 const __m128d exp_P4 = _mm_set1_pd(9.618129192067246128919915E-3);
248 const __m128d exp_P5 = _mm_set1_pd(1.333355761760444302342084E-3);
249 const __m128d exp_P6 = _mm_set1_pd(1.540343494807179111289781E-4);
250 const __m128d exp_P7 = _mm_set1_pd(1.525298483865349629325421E-5);
251 const __m128d exp_P8 = _mm_set1_pd(1.325940560934510417818578E-6);
252 const __m128d exp_P9 = _mm_set1_pd(1.015033670529892589443421E-7);
254 __m128d xmm0,xmm1;
255 __m128i xmmi;
257 xmm0 = _mm_mul_pd(x,log2e);
258 xmm0 = _mm_min_pd(xmm0,lim1);
259 xmm0 = _mm_max_pd(xmm0,lim2);
260 xmm1 = _mm_sub_pd(xmm0,half);
262 xmmi = _mm_cvtpd_epi32(xmm1);
263 xmm1 = _mm_cvtepi32_pd(xmmi);
265 xmmi = _mm_add_epi32(xmmi,base);
266 xmmi = _mm_shuffle_epi32(xmmi,_MM_SHUFFLE(3,1,2,0));
267 xmmi = _mm_slli_epi64(xmmi,52);
269 xmm0 = _mm_sub_pd(xmm0,xmm1);
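/* Sketch of the strategy: with y = x*log2(e) split into an integer part n
 * (essentially floor(y), obtained via the convert of y-0.5) and a fraction
 * f = y - n in [0,1), we have exp(x) = 2^y = 2^n * 2^f. The biased exponent
 * n+1023 was just shifted into bits 52-62 of xmmi, so gmx_castsi128_pd(xmmi)
 * is exactly 2^n; the polynomial below approximates 2^f, and the final
 * multiply combines the two factors.
 */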
271 xmm1 = _mm_mul_pd(exp_P9,xmm0);
272 xmm1 = _mm_add_pd(xmm1,exp_P8);
273 xmm1 = _mm_mul_pd(xmm1,xmm0);
274 xmm1 = _mm_add_pd(xmm1,exp_P7);
275 xmm1 = _mm_mul_pd(xmm1,xmm0);
276 xmm1 = _mm_add_pd(xmm1,exp_P6);
277 xmm1 = _mm_mul_pd(xmm1,xmm0);
278 xmm1 = _mm_add_pd(xmm1,exp_P5);
279 xmm1 = _mm_mul_pd(xmm1,xmm0);
280 xmm1 = _mm_add_pd(xmm1,exp_P4);
281 xmm1 = _mm_mul_pd(xmm1,xmm0);
282 xmm1 = _mm_add_pd(xmm1,exp_P3);
283 xmm1 = _mm_mul_pd(xmm1,xmm0);
284 xmm1 = _mm_add_pd(xmm1,exp_P2);
285 xmm1 = _mm_mul_pd(xmm1,xmm0);
286 xmm1 = _mm_add_pd(xmm1,exp_P1);
287 xmm1 = _mm_mul_pd(xmm1,xmm0);
288 xmm1 = _mm_add_pd(xmm1,exp_P0);
289 xmm1 = _mm_mul_pd(xmm1,gmx_castsi128_pd(xmmi));
291 return xmm1;
294 static inline __m128d
295 my_invrsq_pd(__m128d x)
297 const __m128d three = {3.0, 3.0};
298 const __m128d half = {0.5, 0.5};
300 __m128 t = _mm_rsqrt_ps(_mm_cvtpd_ps(x)); /* Convert to single precision and do _mm_rsqrt_ps() */
301 __m128d t1 = _mm_cvtps_pd(t); /* Convert back to double precision */
303 /* First Newton-Raphson step, accuracy is now 24 bits */
304 __m128d t2 = _mm_mul_pd(half,_mm_mul_pd(t1,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t1,t1)))));
306 /* Return second Newton-Raphson step, accuracy 48 bits */
307 return _mm_mul_pd(half,_mm_mul_pd(t2,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t2,t2)))));
310 static inline __m128d
311 my_inv_pd(__m128d x)
313 const __m128d two = {2.0, 2.0};
315 __m128 t = _mm_rcp_ps(_mm_cvtpd_ps(x));
316 __m128d t1 = _mm_cvtps_pd(t);
317 __m128d t2 = _mm_mul_pd(t1,_mm_sub_pd(two,_mm_mul_pd(t1,x)));
319 return _mm_mul_pd(t2,_mm_sub_pd(two,_mm_mul_pd(t2,x)));
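/* Scalar form of the Newton-Raphson refinements used by the two helpers above
 * (each step roughly doubles the number of correct bits in the estimate y,
 * starting from the ~12-bit _mm_rsqrt_ps()/_mm_rcp_ps() results converted
 * back to double):
 *
 *   1/sqrt(x):   y_new = 0.5*y*(3.0 - x*y*y);
 *   1/x:         y_new = y*(2.0 - x*y);
 */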
324 calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,int natoms, gmx_localtop_t *top,
325 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md)
327 int i,k,n,ai,ai3,aj1,aj2,aj13,aj23;
328 int at0,at1,nj0,nj1,offset,taj1,taj2;
329 int shift;
331 double factor,gpi_ai,gpi_tmp,gpi2;
332 double shX,shY,shZ;
334 __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
335 __m128d t1,t2,t3,rsq11,rinv,rinv2,rinv4,rinv6;
336 __m128d ratio,gpi,rai,raj,vaj,rvdw,mask_cmp;
337 __m128d ccf,dccf,theta,cosq,term,sinq,res,prod;
338 __m128d xmm1,xmm2,xmm3,xmm4,xmm7,vai,prod_ai,icf4,icf6;
340 const __m128d half = {0.5, 0.5};
341 const __m128d three = {3.0, 3.0};
342 const __m128d one = {1.0, 1.0};
343 const __m128d two = {2.0, 2.0};
344 const __m128d zero = {0.0, 0.0};
345 const __m128d four = {4.0, 4.0};
347 const __m128d p5inv = {STILL_P5INV, STILL_P5INV};
348 const __m128d pip5 = {STILL_PIP5, STILL_PIP5};
349 const __m128d p4 = {STILL_P4, STILL_P4};
351 factor = 0.5 * ONE_4PI_EPS0;
352 n = 0;
354 /* Keep the compiler happy */
355 raj = _mm_setzero_pd();
356 vaj = _mm_setzero_pd();
357 jx = _mm_setzero_pd();
358 jy = _mm_setzero_pd();
359 jz = _mm_setzero_pd();
360 xmm2 = _mm_setzero_pd();
361 xmm7 = _mm_setzero_pd();
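/* Outline of this kernel: for every pair (ai,aj) in the neighborlist the loop
 * below adds the Still polarization term p4*vsolv(aj)*ccf(r)/r^4 to atom ai's
 * accumulator (and the symmetric term with vsolv(ai) to atom aj's entry in
 * gpol_still_work[]), and stores the corresponding r^-6 chain-rule factors in
 * fr->dadx for later use in calc_gb_chainrule_sse2_double(). The final loop
 * converts the sums into Born radii as bRad[i] = factor/|gpol[i] + sum_i|.
 */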
363 for(i=0;i<born->nr;i++)
365 born->gpol_still_work[i]=0;
368 for(i=0;i<nl->nri;i++)
370 ai = nl->iinr[i];
371 ai3 = ai * 3;
373 nj0 = nl->jindex[i];
374 nj1 = nl->jindex[i+1];
376 /* Load shifts for this list */
377 shift = nl->shift[i];
378 shX = fr->shift_vec[shift][0];
379 shY = fr->shift_vec[shift][1];
380 shZ = fr->shift_vec[shift][2];
382 /* Splat the shifts */
383 sX = _mm_load1_pd(&shX);
384 sY = _mm_load1_pd(&shY);
385 sZ = _mm_load1_pd(&shZ);
387 offset = (nj1-nj0)%2;
389 /* Polarization energy for atom ai */
390 gpi = _mm_setzero_pd();
392 /* Load particle ai coordinates and add shifts */
393 ix = _mm_load1_pd(x+ai3);
394 iy = _mm_load1_pd(x+ai3+1);
395 iz = _mm_load1_pd(x+ai3+2);
397 ix = _mm_add_pd(sX,ix);
398 iy = _mm_add_pd(sY,iy);
399 iz = _mm_add_pd(sZ,iz);
401 /* Load particle ai gb_radius */
402 rai = _mm_set1_pd(top->atomtypes.gb_radius[md->typeA[ai]]);
403 vai = _mm_set1_pd(born->vsolv[ai]);
404 prod_ai = _mm_mul_pd(p4,vai);
406 for(k=nj0;k<nj1-offset;k+=2)
408 aj1 = nl->jjnr[k];
409 aj2 = nl->jjnr[k+1];
411 aj13 = aj1 * 3;
412 aj23 = aj2 * 3;
414 taj1 = md->typeA[aj1];
415 taj2 = md->typeA[aj2];
417 /* Load particle aj1-2 coordinates and compute ai->aj distances */
418 xmm1 = _mm_loadu_pd(x+aj13);
419 xmm2 = _mm_loadu_pd(x+aj23);
420 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
421 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
423 jz = _mm_loadl_pd(jz,x+aj13+2);
424 jz = _mm_loadh_pd(jz,x+aj23+2);
426 dx = _mm_sub_pd(ix,jx);
427 dy = _mm_sub_pd(iy,jy);
428 dz = _mm_sub_pd(iz,jz);
430 rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
431 rinv = my_invrsq_pd(rsq11);
433 rinv2 = _mm_mul_pd(rinv,rinv);
434 rinv4 = _mm_mul_pd(rinv2,rinv2);
435 rinv6 = _mm_mul_pd(rinv4,rinv2);
437 vaj = _mm_loadl_pd(vaj,born->vsolv+aj1);
438 vaj = _mm_loadh_pd(vaj,born->vsolv+aj2);
440 raj = _mm_loadl_pd(raj, top->atomtypes.gb_radius+taj1);
441 raj = _mm_loadh_pd(raj, top->atomtypes.gb_radius+taj2);
443 rvdw = _mm_add_pd(rai,raj);
444 rvdw = _mm_mul_pd(rvdw,rvdw);
445 ratio = _mm_div_pd(rsq11,rvdw);
447 mask_cmp = _mm_cmpgt_pd(ratio,p5inv);
449 switch(_mm_movemask_pd(mask_cmp))
451 case 0x3: /* _mm_movemask_pd() returns a 2-bit mask, so 0x3 means both elements are set */
452 ccf = one;
453 dccf = zero;
454 break;
455 default:
456 theta = _mm_mul_pd(ratio,pip5);
457 sincos_sse2double(theta,&sinq,&cosq);
459 term = _mm_sub_pd(one,cosq);
460 term = _mm_mul_pd(half,term);
461 ccf = _mm_mul_pd(term,term);
462 dccf = _mm_mul_pd(two,term);
463 dccf = _mm_mul_pd(dccf,sinq);
464 dccf = _mm_mul_pd(dccf,pip5);
465 dccf = _mm_mul_pd(dccf,ratio);
467 ccf = _mm_or_pd(_mm_and_pd(mask_cmp,one) ,_mm_andnot_pd(mask_cmp,ccf));
468 dccf = _mm_or_pd(_mm_and_pd(mask_cmp,zero) ,_mm_andnot_pd(mask_cmp,dccf));
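/* ccf is the Still close-contact function: 1 when the atoms are well
 * separated (ratio > STILL_P5INV) and otherwise the smooth switch
 * ccf = [0.5*(1 - cos(ratio*STILL_PIP5))]^2, with dccf carrying the factor
 * needed for its derivative in the chain-rule term below. The masked blends
 * above select the correct branch per element, so the switch() is only a
 * shortcut for the case where both elements are beyond the switching region.
 */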
471 prod = _mm_mul_pd(p4,vaj);
472 icf4 = _mm_mul_pd(ccf,rinv4);
473 xmm2 = _mm_mul_pd(icf4,prod);
474 xmm3 = _mm_mul_pd(icf4,prod_ai);
475 gpi = _mm_add_pd(gpi,xmm2);
477 /* Load, add and store atom aj gpol energy */
478 xmm7 = _mm_loadl_pd(xmm7,born->gpol_still_work+aj1);
479 xmm7 = _mm_loadh_pd(xmm7,born->gpol_still_work+aj2);
481 xmm3 = _mm_add_pd(xmm7,xmm3);
483 _mm_storel_pd(born->gpol_still_work+aj1,xmm3);
484 _mm_storeh_pd(born->gpol_still_work+aj2,xmm3);
486 /* Chain rule terms */
487 ccf = _mm_mul_pd(four,ccf);
488 xmm3 = _mm_sub_pd(ccf,dccf);
489 icf6 = _mm_mul_pd(xmm3,rinv6);
490 xmm1 = _mm_mul_pd(icf6,prod);
491 xmm2 = _mm_mul_pd(icf6,prod_ai);
493 /* As with single precision, we need to shuffle things around to change the
494 * order of the interactions from (ai->aj1, ai->aj2) to (ai->aj1, aj1->ai,
495 * ai->aj2, aj2->ai), so that two store operations suffice instead of four. */
497 xmm3 = _mm_unpacklo_pd(xmm1,xmm2);
498 xmm4 = _mm_unpackhi_pd(xmm1,xmm2);
500 _mm_storeu_pd(fr->dadx+n,xmm3);
501 n = n + 2;
502 _mm_storeu_pd(fr->dadx+n,xmm4);
503 n = n + 2;
506 /* Deal with odd elements */
507 if(offset!=0)
509 aj1 = nl->jjnr[k];
510 aj13 = aj1 * 3;
511 taj1 = md->typeA[aj1];
513 jx = _mm_load_sd(x+aj13);
514 jy = _mm_load_sd(x+aj13+1);
515 jz = _mm_load_sd(x+aj13+2);
517 raj = _mm_load_sd(top->atomtypes.gb_radius+taj1);
518 vaj = _mm_load_sd(born->vsolv+aj1);
520 dx = _mm_sub_sd(ix,jx);
521 dy = _mm_sub_pd(iy,jy);
522 dz = _mm_sub_pd(iz,jz);
524 rsq11 = _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx,dx) , _mm_mul_sd(dy,dy) ) , _mm_mul_sd(dz,dz) );
525 rinv = my_invrsq_pd(rsq11);
527 rinv2 = _mm_mul_sd(rinv,rinv);
528 rinv4 = _mm_mul_sd(rinv2,rinv2);
529 rinv6 = _mm_mul_sd(rinv4,rinv2);
531 rvdw = _mm_add_sd(rai,raj);
532 rvdw = _mm_mul_sd(rvdw,rvdw);
533 ratio = _mm_div_sd(rsq11,rvdw);
535 mask_cmp = _mm_cmpgt_sd(ratio,p5inv);
537 switch(_mm_movemask_pd(mask_cmp))
539 case 0x1: case 0x3: /* only the low element is used in this remainder loop */
540 ccf = one;
541 dccf = zero;
542 break;
543 default:
544 theta = _mm_mul_sd(ratio,pip5);
545 sincos_sse2double(theta,&sinq,&cosq);
547 term = _mm_sub_sd(one,cosq);
548 term = _mm_mul_sd(half,term);
549 ccf = _mm_mul_sd(term,term);
550 dccf = _mm_mul_sd(two,term);
551 dccf = _mm_mul_sd(dccf,sinq);
552 dccf = _mm_mul_sd(dccf,pip5);
553 dccf = _mm_mul_sd(dccf,ratio);
555 ccf = _mm_or_pd(_mm_and_pd(mask_cmp,one) ,_mm_andnot_pd(mask_cmp,ccf));
556 dccf = _mm_or_pd(_mm_and_pd(mask_cmp,zero) ,_mm_andnot_pd(mask_cmp,dccf));
559 prod = _mm_mul_sd(p4,vaj);
560 icf4 = _mm_mul_sd(ccf,rinv4);
561 xmm2 = _mm_mul_sd(icf4,prod);
562 xmm3 = _mm_mul_sd(icf4,prod_ai);
563 gpi = _mm_add_sd(gpi,xmm2);
565 /* Load, add and store atom aj gpol energy */
566 xmm7 = _mm_load_sd(born->gpol_still_work+aj1);
567 xmm3 = _mm_add_sd(xmm7,xmm3);
568 _mm_store_sd(born->gpol_still_work+aj1,xmm3);
570 /* Chain rule terms */
571 ccf = _mm_mul_sd(four,ccf);
572 xmm3 = _mm_sub_sd(ccf,dccf);
573 icf6 = _mm_mul_sd(xmm3,rinv6);
574 xmm1 = _mm_mul_sd(icf6,prod);
575 xmm2 = _mm_mul_sd(icf6,prod_ai);
577 /* Here we only have ai->aj1 and aj1->ai, so we can store directly */
578 _mm_storel_pd(fr->dadx+n,xmm1);
579 n = n + 1;
580 _mm_storel_pd(fr->dadx+n,xmm2);
581 n = n + 1;
582 } /* End offset */
584 /* Do end processing ... */
585 xmm2 = _mm_unpacklo_pd(xmm2,gpi);
586 gpi = _mm_add_pd(gpi,xmm2);
587 gpi = _mm_shuffle_pd(gpi,gpi,_MM_SHUFFLE2(1,1));
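/* Horizontal sum: after the unpack/add/shuffle above both lanes of gpi hold
 * the sum of its two partial accumulators (the low lane of the unpack is a
 * don't-care that the _MM_SHUFFLE2(1,1) discards), so the scalar _sd ops
 * below can add the total to atom ai's entry. The same idiom is used in the
 * HCT/OBC kernels further down.
 */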
589 /* Load, add and store atom ai polarisation energy */
590 xmm2 = _mm_load_sd(born->gpol_still_work+ai);
591 gpi = _mm_add_sd(gpi,xmm2);
592 _mm_store_sd(born->gpol_still_work+ai,gpi);
595 /* Parallel summations */
596 if(PARTDECOMP(cr))
598 gmx_sum(natoms,born->gpol_still_work, cr);
600 else if(DOMAINDECOMP(cr))
602 dd_atom_sum_real(cr->dd, born->gpol_still_work);
605 /* Compute the radii */
606 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
608 if(born->use[i] != 0)
610 gpi_ai = born->gpol[i] + born->gpol_still_work[i];
611 gpi2 = gpi_ai*gpi_ai;
613 born->bRad[i]=factor*gmx_invsqrt(gpi2);
614 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
618 /* Extra (local) communication required for DD */
619 if(DOMAINDECOMP(cr))
621 dd_atom_spread_real(cr->dd, born->bRad);
622 dd_atom_spread_real(cr->dd, fr->invsqrta);
625 return 0;
628 int
629 calc_gb_rad_hct_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
630 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md)
632 int i,k,n,ai,ai3,aj1,aj2,aj13,aj23,nj0,nj1,at0,at1,offset;
633 int p1, p2, shift;
634 double rr,sum,sum_tmp,min_rad,rad,doff;
635 double shX,shY,shZ;
637 __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
638 __m128d t1,t2,t3,rsq11,rinv,r,rai;
639 __m128d rai_inv,sk,sk2,lij,dlij,duij;
640 __m128d uij,lij2,uij2,lij3,uij3,diff2;
641 __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
642 __m128d mask_cmp,mask_cmp2,mask_cmp3;
643 __m128d xmm1,xmm2,xmm3,xmm4,xmm7,xmm8,xmm9,doffset;
644 __m128d sum_ai,chrule,chrule_ai,tmp_ai;
645 __m128d sk_ai, sk2_ai,raj,raj_inv;
647 const __m128d neg = {-1.0, -1.0};
648 const __m128d zero = {0.0, 0.0};
649 const __m128d eigth = {0.125, 0.125};
650 const __m128d qrtr = {0.25, 0.25};
651 const __m128d half = {0.5, 0.5};
652 const __m128d one = {1.0, 1.0};
653 const __m128d two = {2.0, 2.0};
654 const __m128d three = {3.0, 3.0};
656 /* Keep the compiler happy */
657 tmp_ai = _mm_setzero_pd();
658 tmp = _mm_setzero_pd();
659 xmm7 = _mm_setzero_pd();
660 xmm8 = _mm_setzero_pd();
661 raj_inv = _mm_setzero_pd();
662 raj = _mm_setzero_pd();
663 sk = _mm_setzero_pd();
664 jx = _mm_setzero_pd();
665 jy = _mm_setzero_pd();
666 jz = _mm_setzero_pd();
668 /* Set the dielectric offset */
669 doff = born->gb_doffset;
670 doffset = _mm_load1_pd(&doff);
671 n = 0;
673 for(i=0;i<born->nr;i++)
675 born->gpol_hct_work[i] = 0;
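/* The pair loop below evaluates the HCT pairwise descreening integral in both
 * directions of every pair and accumulates it in gpol_hct_work[], while the
 * t1/t2/t3 blocks build its derivative with respect to r for fr->dadx. An
 * illustrative scalar sketch of the per-pair term follows (not compiled; the
 * function and variable names are local to this sketch): rai is the
 * dielectric-offset radius of the descreened atom, rai_inv its inverse, r the
 * interatomic distance and sk the neighbour's scaled (born->param) radius.
 */
#if 0
static double hct_pair_term(double rai, double rai_inv, double r, double sk)
{
    double lij, uij, diff2, term;

    if (rai >= r + sk)
    {
        return 0.0;                         /* no descreening of this atom */
    }
    lij   = (rai > r - sk) ? rai_inv : 1.0/(r - sk);
    uij   = 1.0/(r + sk);
    diff2 = uij*uij - lij*lij;

    term  = lij - uij + 0.25*r*diff2 + (0.5/r)*log(uij/lij)
            - 0.25*(sk*sk/r)*diff2;

    if (rai < sk - r)
    {
        term += 2.0*(rai_inv - lij);        /* atom completely engulfed */
    }
    return 0.5*term;
}
#endif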
678 for(i=0;i<nl->nri;i++)
680 ai = nl->iinr[i];
681 ai3 = ai * 3;
683 nj0 = nl->jindex[i];
684 nj1 = nl->jindex[i+1];
686 /* Load shifts for this list */
687 shift = nl->shift[i];
688 shX = fr->shift_vec[shift][0];
689 shY = fr->shift_vec[shift][1];
690 shZ = fr->shift_vec[shift][2];
692 /* Splat the shifts */
693 sX = _mm_load1_pd(&shX);
694 sY = _mm_load1_pd(&shY);
695 sZ = _mm_load1_pd(&shZ);
697 offset = (nj1-nj0)%2;
699 /* Load rai */
700 rr = top->atomtypes.gb_radius[md->typeA[ai]]-doff;
701 rai = _mm_load1_pd(&rr);
702 rr = 1.0/rr;
703 rai_inv = _mm_load1_pd(&rr);
705 /* Zero out sums for polarisation energies */
706 sum_ai = _mm_setzero_pd();
708 /* Load ai coordinates and add shifts */
709 ix = _mm_load1_pd(x+ai3);
710 iy = _mm_load1_pd(x+ai3+1);
711 iz = _mm_load1_pd(x+ai3+2);
713 ix = _mm_add_pd(sX,ix);
714 iy = _mm_add_pd(sY,iy);
715 iz = _mm_add_pd(sZ,iz);
717 sk_ai = _mm_load1_pd(born->param+ai);
718 sk2_ai = _mm_mul_pd(sk_ai,sk_ai);
720 for(k=nj0;k<nj1-offset;k+=2)
722 aj1 = nl->jjnr[k];
723 aj2 = nl->jjnr[k+1];
725 aj13 = aj1 * 3;
726 aj23 = aj2 * 3;
728 /* Load particle aj1-2 coordinates */
729 xmm1 = _mm_loadu_pd(x+aj13);
730 xmm2 = _mm_loadu_pd(x+aj23);
731 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
732 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
734 jz = _mm_loadl_pd(jz, x+aj13+2);
735 jz = _mm_loadh_pd(jz, x+aj23+2);
737 dx = _mm_sub_pd(ix,jx);
738 dy = _mm_sub_pd(iy,jy);
739 dz = _mm_sub_pd(iz,jz);
741 rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
742 rinv = my_invrsq_pd(rsq11);
743 r = _mm_mul_pd(rinv,rsq11);
745 sk = _mm_loadl_pd(sk,born->param+aj1);
746 sk = _mm_loadh_pd(sk,born->param+aj2);
748 /* Load aj1,aj2 raj */
749 p1 = md->typeA[aj1];
750 p2 = md->typeA[aj2];
752 raj = _mm_loadl_pd(raj,top->atomtypes.gb_radius+p1);
753 raj = _mm_loadh_pd(raj,top->atomtypes.gb_radius+p2);
754 raj = _mm_sub_pd(raj,doffset);
756 /* Compute 1.0/raj */
757 raj_inv = my_inv_pd(raj);
759 /* INTERACTION aj->ai STARTS HERE */
760 /* conditional mask for rai<dr+sk */
761 xmm1 = _mm_add_pd(r,sk);
762 mask_cmp = _mm_cmplt_pd(rai,xmm1);
764 /* conditional for rai>dr-sk, ends with mask_cmp2 */
765 xmm2 = _mm_sub_pd(r,sk);
766 xmm3 = my_inv_pd(xmm2);
767 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
769 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
770 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
772 uij = my_inv_pd(xmm1);
773 lij2 = _mm_mul_pd(lij,lij);
774 lij3 = _mm_mul_pd(lij2,lij);
775 uij2 = _mm_mul_pd(uij,uij);
776 uij3 = _mm_mul_pd(uij2,uij);
778 diff2 = _mm_sub_pd(uij2,lij2);
780 lij_inv = my_invrsq_pd(lij2);
781 sk2 = _mm_mul_pd(sk,sk);
782 sk2_inv = _mm_mul_pd(sk2,rinv);
783 prod = _mm_mul_pd(qrtr,sk2_inv);
785 log_term = _mm_mul_pd(uij,lij_inv);
786 log_term = log_pd(log_term);
788 xmm1 = _mm_sub_pd(lij,uij);
789 xmm2 = _mm_mul_pd(qrtr,r);
790 xmm2 = _mm_mul_pd(xmm2,diff2);
791 xmm1 = _mm_add_pd(xmm1,xmm2);
792 xmm2 = _mm_mul_pd(half,rinv);
793 xmm2 = _mm_mul_pd(xmm2,log_term);
794 xmm1 = _mm_add_pd(xmm1,xmm2);
795 xmm9 = _mm_mul_pd(neg,diff2);
796 xmm2 = _mm_mul_pd(xmm9,prod);
797 tmp_ai = _mm_add_pd(xmm1,xmm2);
799 /* conditional for rai<sk-dr */
800 xmm3 = _mm_sub_pd(sk,r);
801 mask_cmp3 = _mm_cmplt_pd(rai,xmm3); /* rai<sk-dr */
803 xmm4 = _mm_sub_pd(rai_inv,lij);
804 xmm4 = _mm_mul_pd(two,xmm4);
805 xmm4 = _mm_add_pd(tmp_ai,xmm4);
807 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
809 /* tmp_ai now contains two partial values, not all of which are to be used. */
810 /* Which ones are used is governed by the mask_cmp mask. */
811 tmp_ai = _mm_mul_pd(half,tmp_ai);
812 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
813 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
815 xmm2 = _mm_mul_pd(half,lij2);
816 xmm3 = _mm_mul_pd(prod,lij3);
817 xmm2 = _mm_add_pd(xmm2,xmm3);
818 xmm3 = _mm_mul_pd(lij,rinv);
819 xmm4 = _mm_mul_pd(lij3,r);
820 xmm3 = _mm_add_pd(xmm3,xmm4);
821 xmm3 = _mm_mul_pd(qrtr,xmm3);
822 t1 = _mm_sub_pd(xmm2,xmm3);
824 xmm2 = _mm_mul_pd(half,uij2);
825 xmm2 = _mm_mul_pd(neg,xmm2);
826 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
827 xmm3 = _mm_mul_pd(xmm3,uij3);
828 xmm2 = _mm_sub_pd(xmm2,xmm3);
829 xmm3 = _mm_mul_pd(uij,rinv);
830 xmm4 = _mm_mul_pd(uij3,r);
831 xmm3 = _mm_add_pd(xmm3,xmm4);
832 xmm3 = _mm_mul_pd(qrtr,xmm3);
833 t2 = _mm_add_pd(xmm2,xmm3);
835 xmm2 = _mm_mul_pd(sk2_inv,rinv);
836 xmm2 = _mm_add_pd(one,xmm2);
837 xmm2 = _mm_mul_pd(eigth,xmm2);
838 xmm2 = _mm_mul_pd(xmm2,xmm9);
839 xmm3 = _mm_mul_pd(log_term, rinv);
840 xmm3 = _mm_mul_pd(xmm3,rinv);
841 xmm3 = _mm_mul_pd(qrtr,xmm3);
842 t3 = _mm_add_pd(xmm2,xmm3);
844 /* chain rule terms */
845 xmm2 = _mm_mul_pd(dlij,t1);
846 xmm2 = _mm_add_pd(xmm2,t2);
847 xmm2 = _mm_add_pd(xmm2,t3);
849 /* temporary storage of chain rule terms, since we have to compute
850 the reciprocal terms also before storing them */
851 chrule = _mm_mul_pd(xmm2,rinv);
853 /* INTERACTION ai->aj starts here */
854 /* Conditional mask for raj<dr+sk_ai */
855 xmm1 = _mm_add_pd(r,sk_ai);
856 mask_cmp = _mm_cmplt_pd(raj,xmm1);
858 /* Conditional for raj>dr-sk_ai, ends with mask_cmp2 */
859 xmm2 = _mm_sub_pd(r,sk_ai);
860 xmm3 = my_inv_pd(xmm2);
861 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
863 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
864 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
866 uij = my_inv_pd(xmm1);
867 lij2 = _mm_mul_pd(lij,lij);
868 lij3 = _mm_mul_pd(lij2,lij);
869 uij2 = _mm_mul_pd(uij,uij);
870 uij3 = _mm_mul_pd(uij2,uij);
872 diff2 = _mm_sub_pd(uij2,lij2);
874 lij_inv = my_invrsq_pd(lij2);
876 sk2 = sk2_ai;
877 sk2_inv = _mm_mul_pd(sk2,rinv);
878 prod = _mm_mul_pd(qrtr,sk2_inv);
880 log_term = _mm_mul_pd(uij,lij_inv);
881 log_term = log_pd(log_term);
883 xmm1 = _mm_sub_pd(lij,uij);
884 xmm2 = _mm_mul_pd(qrtr,r);
885 xmm2 = _mm_mul_pd(xmm2,diff2);
886 xmm1 = _mm_add_pd(xmm1,xmm2);
887 xmm2 = _mm_mul_pd(half,rinv);
888 xmm2 = _mm_mul_pd(xmm2,log_term);
889 xmm1 = _mm_add_pd(xmm1,xmm2);
890 xmm9 = _mm_mul_pd(neg,diff2);
891 xmm2 = _mm_mul_pd(xmm9,prod);
892 tmp = _mm_add_pd(xmm1,xmm2);
894 /* conditional for raj<sk_ai-dr */
895 xmm3 = _mm_sub_pd(sk_ai,r);
896 mask_cmp3 = _mm_cmplt_pd(raj,xmm3); /* raj<sk_ai-dr */
898 xmm4 = _mm_sub_pd(raj_inv,lij);
899 xmm4 = _mm_mul_pd(two,xmm4);
900 xmm4 = _mm_add_pd(tmp,xmm4);
902 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
904 /* tmp now contains two partial values, not all of which are to be used. */
905 /* Which ones are used is governed by the mask_cmp mask. */
906 tmp = _mm_mul_pd(half,tmp);
907 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
909 /* Load, add and store ai->aj pol energy */
910 xmm7 = _mm_loadl_pd(xmm7,born->gpol_hct_work+aj1);
911 xmm7 = _mm_loadh_pd(xmm7,born->gpol_hct_work+aj2);
913 xmm7 = _mm_add_pd(xmm7,tmp);
915 _mm_storel_pd(born->gpol_hct_work+aj1,xmm7);
916 _mm_storeh_pd(born->gpol_hct_work+aj2,xmm7);
918 /* Start chain rule terms */
919 xmm2 = _mm_mul_pd(half,lij2);
920 xmm3 = _mm_mul_pd(prod,lij3);
921 xmm2 = _mm_add_pd(xmm2,xmm3);
922 xmm3 = _mm_mul_pd(lij,rinv);
923 xmm4 = _mm_mul_pd(lij3,r);
924 xmm3 = _mm_add_pd(xmm3,xmm4);
925 xmm3 = _mm_mul_pd(qrtr,xmm3);
926 t1 = _mm_sub_pd(xmm2,xmm3);
928 xmm2 = _mm_mul_pd(half,uij2);
929 xmm2 = _mm_mul_pd(neg,xmm2);
930 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
931 xmm3 = _mm_mul_pd(xmm3,uij3);
932 xmm2 = _mm_sub_pd(xmm2,xmm3);
933 xmm3 = _mm_mul_pd(uij,rinv);
934 xmm4 = _mm_mul_pd(uij3,r);
935 xmm3 = _mm_add_pd(xmm3,xmm4);
936 xmm3 = _mm_mul_pd(qrtr,xmm3);
937 t2 = _mm_add_pd(xmm2,xmm3);
939 xmm2 = _mm_mul_pd(sk2_inv,rinv);
940 xmm2 = _mm_add_pd(one,xmm2);
941 xmm2 = _mm_mul_pd(eigth,xmm2);
942 xmm2 = _mm_mul_pd(xmm2,xmm9);
943 xmm3 = _mm_mul_pd(log_term, rinv);
944 xmm3 = _mm_mul_pd(xmm3,rinv);
945 xmm3 = _mm_mul_pd(qrtr,xmm3);
946 t3 = _mm_add_pd(xmm2,xmm3);
948 /* chain rule terms */
949 xmm2 = _mm_mul_pd(dlij,t1);
950 xmm2 = _mm_add_pd(xmm2,t2);
951 xmm2 = _mm_add_pd(xmm2,t3);
952 chrule_ai = _mm_mul_pd(xmm2,rinv);
954 /* Store chain rule terms,
955 * with the same unpacking rationale as in the Still kernel above. */
957 xmm3 = _mm_unpacklo_pd(chrule, chrule_ai);
958 xmm4 = _mm_unpackhi_pd(chrule, chrule_ai);
960 _mm_storeu_pd(fr->dadx+n, xmm3);
961 n = n + 2;
962 _mm_storeu_pd(fr->dadx+n, xmm4);
963 n = n + 2;
966 if(offset!=0)
968 aj1 = nl->jjnr[k];
969 aj13 = aj1 * 3;
970 p1 = md->typeA[aj1];
972 jx = _mm_load_sd(x+aj13);
973 jy = _mm_load_sd(x+aj13+1);
974 jz = _mm_load_sd(x+aj13+2);
976 sk = _mm_load_sd(born->param+aj1);
978 dx = _mm_sub_sd(ix,jx);
979 dy = _mm_sub_pd(iy,jy);
980 dz = _mm_sub_pd(iz,jz);
982 rsq11 = _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx,dx) , _mm_mul_sd(dy,dy) ) , _mm_mul_sd(dz,dz) );
983 rinv = my_invrsq_pd(rsq11);
984 r = _mm_mul_sd(rinv,rsq11);
986 /* Load raj */
987 raj = _mm_load_sd(top->atomtypes.gb_radius+p1);
988 raj = _mm_sub_sd(raj,doffset);
989 raj_inv = my_inv_pd(raj);
991 /* OFFSET INTERACTION aj->ai STARTS HERE */
992 /* conditional mask for rai<dr+sk */
993 xmm1 = _mm_add_sd(r,sk);
994 mask_cmp = _mm_cmplt_sd(rai,xmm1);
996 /* conditional for rai>dr-sk, ends with mask_cmp2 */
997 xmm2 = _mm_sub_sd(r,sk);
998 xmm3 = my_inv_pd(xmm2);
999 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
1001 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1002 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1004 uij = my_inv_pd(xmm1);
1005 lij2 = _mm_mul_sd(lij,lij);
1006 lij3 = _mm_mul_sd(lij2,lij);
1007 uij2 = _mm_mul_sd(uij,uij);
1008 uij3 = _mm_mul_sd(uij2,uij);
1010 diff2 = _mm_sub_sd(uij2,lij2);
1012 lij_inv = my_invrsq_pd(lij2);
1013 sk2 = _mm_mul_sd(sk,sk);
1014 sk2_inv = _mm_mul_sd(sk2,rinv);
1015 prod = _mm_mul_sd(qrtr,sk2_inv);
1017 log_term = _mm_mul_pd(uij,lij_inv);
1018 log_term = log_pd(log_term);
1020 xmm1 = _mm_sub_sd(lij,uij);
1021 xmm2 = _mm_mul_sd(qrtr,r);
1022 xmm2 = _mm_mul_sd(xmm2,diff2);
1023 xmm1 = _mm_add_sd(xmm1,xmm2);
1024 xmm2 = _mm_mul_sd(half,rinv);
1025 xmm2 = _mm_mul_sd(xmm2,log_term);
1026 xmm1 = _mm_add_sd(xmm1,xmm2);
1027 xmm9 = _mm_mul_sd(neg,diff2);
1028 xmm2 = _mm_mul_sd(xmm9,prod);
1030 tmp_ai = _mm_add_sd(xmm1,xmm2);
1032 /* conditional for rai<sk-dr */
1033 xmm3 = _mm_sub_sd(sk,r);
1034 mask_cmp3 = _mm_cmplt_sd(rai,xmm3); /* rai<sk-dr */
1036 xmm4 = _mm_sub_sd(rai_inv,lij);
1037 xmm4 = _mm_mul_sd(two,xmm4);
1038 xmm4 = _mm_add_sd(tmp_ai,xmm4);
1040 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
1042 /* tmp_ai now contains two partial values, not all of which are to be used. */
1043 /* Which ones are used is governed by the mask_cmp mask. */
1044 tmp_ai = _mm_mul_pd(half,tmp_ai);
1045 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1046 sum_ai = _mm_add_sd(sum_ai,tmp_ai);
1048 xmm2 = _mm_mul_sd(half,lij2);
1049 xmm3 = _mm_mul_sd(prod,lij3);
1050 xmm2 = _mm_add_sd(xmm2,xmm3);
1051 xmm3 = _mm_mul_sd(lij,rinv);
1052 xmm4 = _mm_mul_sd(lij3,r);
1053 xmm3 = _mm_add_sd(xmm3,xmm4);
1054 xmm3 = _mm_mul_sd(qrtr,xmm3);
1055 t1 = _mm_sub_sd(xmm2,xmm3);
1057 xmm2 = _mm_mul_sd(half,uij2);
1058 xmm2 = _mm_mul_sd(neg,xmm2);
1059 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1060 xmm3 = _mm_mul_sd(xmm3,uij3);
1061 xmm2 = _mm_sub_sd(xmm2,xmm3);
1062 xmm3 = _mm_mul_sd(uij,rinv);
1063 xmm4 = _mm_mul_sd(uij3,r);
1064 xmm3 = _mm_add_sd(xmm3,xmm4);
1065 xmm3 = _mm_mul_sd(qrtr,xmm3);
1066 t2 = _mm_add_sd(xmm2,xmm3);
1068 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1069 xmm2 = _mm_add_sd(one,xmm2);
1070 xmm2 = _mm_mul_sd(eigth,xmm2);
1071 xmm2 = _mm_mul_sd(xmm2,xmm9);
1072 xmm3 = _mm_mul_sd(log_term, rinv);
1073 xmm3 = _mm_mul_sd(xmm3,rinv);
1074 xmm3 = _mm_mul_sd(qrtr,xmm3);
1075 t3 = _mm_add_sd(xmm2,xmm3);
1077 /* chain rule terms */
1078 xmm2 = _mm_mul_sd(dlij,t1);
1079 xmm2 = _mm_add_sd(xmm2,t2);
1080 xmm2 = _mm_add_sd(xmm2,t3);
1082 /* temporary storage of chain rule terms, since we have to compute
1083 the reciprocal terms also before storing them */
1084 chrule = _mm_mul_sd(xmm2,rinv);
1086 /* OFFSET INTERACTION ai->aj starts here */
1087 /* conditional mask for raj<dr+sk_ai */
1088 xmm1 = _mm_add_sd(r,sk_ai);
1089 mask_cmp = _mm_cmplt_sd(raj,xmm1);
1091 /* conditional for raj>dr-sk_ai, ends with mask_cmp2 */
1092 xmm2 = _mm_sub_sd(r,sk_ai);
1093 xmm3 = my_inv_pd(xmm2);
1094 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
1096 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1097 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1099 uij = my_inv_pd(xmm1);
1100 lij2 = _mm_mul_sd(lij,lij);
1101 lij3 = _mm_mul_sd(lij2,lij);
1102 uij2 = _mm_mul_sd(uij,uij);
1103 uij3 = _mm_mul_sd(uij2,uij);
1105 diff2 = _mm_sub_sd(uij2,lij2);
1107 lij_inv = my_invrsq_pd(lij2);
1109 sk2 = sk2_ai;
1110 sk2_inv = _mm_mul_sd(sk2,rinv);
1111 prod = _mm_mul_sd(qrtr,sk2_inv);
1113 log_term = _mm_mul_pd(uij,lij_inv);
1114 log_term = log_pd(log_term);
1116 xmm1 = _mm_sub_sd(lij,uij);
1117 xmm2 = _mm_mul_sd(qrtr,r);
1118 xmm2 = _mm_mul_sd(xmm2,diff2);
1119 xmm1 = _mm_add_sd(xmm1,xmm2);
1120 xmm2 = _mm_mul_sd(half,rinv);
1121 xmm2 = _mm_mul_sd(xmm2,log_term);
1122 xmm1 = _mm_add_sd(xmm1,xmm2);
1123 xmm9 = _mm_mul_sd(neg,diff2);
1124 xmm2 = _mm_mul_sd(xmm9,prod);
1125 tmp = _mm_add_sd(xmm1,xmm2);
1127 /* conditional for raj<sk_ai-dr */
1128 xmm3 = _mm_sub_sd(sk_ai,r);
1129 mask_cmp3 = _mm_cmplt_sd(raj,xmm3); /* raj<sk_ai-dr */
1131 xmm4 = _mm_sub_sd(raj_inv,lij);
1132 xmm4 = _mm_mul_sd(two,xmm4);
1133 xmm4 = _mm_add_sd(tmp,xmm4);
1135 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
1137 /* tmp now contains two partial values, not all of which are to be used. */
1138 /* Which ones are used is governed by the mask_cmp mask. */
1139 tmp = _mm_mul_pd(half,tmp);
1140 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1142 /* Load, add and store gpol energy */
1143 xmm7 = _mm_load_sd(born->gpol_hct_work+aj1);
1144 xmm7 = _mm_add_sd(xmm7,tmp);
1145 _mm_store_sd(born->gpol_hct_work+aj1,xmm7);
1147 /* Start chain rule terms, t1 */
1148 xmm2 = _mm_mul_sd(half,lij2);
1149 xmm3 = _mm_mul_sd(prod,lij3);
1150 xmm2 = _mm_add_sd(xmm2,xmm3);
1151 xmm3 = _mm_mul_sd(lij,rinv);
1152 xmm4 = _mm_mul_sd(lij3,r);
1153 xmm3 = _mm_add_sd(xmm3,xmm4);
1154 xmm3 = _mm_mul_sd(qrtr,xmm3);
1155 t1 = _mm_sub_sd(xmm2,xmm3);
1157 xmm2 = _mm_mul_sd(half,uij2);
1158 xmm2 = _mm_mul_sd(neg,xmm2);
1159 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1160 xmm3 = _mm_mul_sd(xmm3,uij3);
1161 xmm2 = _mm_sub_sd(xmm2,xmm3);
1162 xmm3 = _mm_mul_sd(uij,rinv);
1163 xmm4 = _mm_mul_sd(uij3,r);
1164 xmm3 = _mm_add_sd(xmm3,xmm4);
1165 xmm3 = _mm_mul_sd(qrtr,xmm3);
1166 t2 = _mm_add_sd(xmm2,xmm3);
1168 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1169 xmm2 = _mm_add_sd(one,xmm2);
1170 xmm2 = _mm_mul_sd(eigth,xmm2);
1171 xmm2 = _mm_mul_sd(xmm2,xmm9);
1172 xmm3 = _mm_mul_sd(log_term, rinv);
1173 xmm3 = _mm_mul_sd(xmm3,rinv);
1174 xmm3 = _mm_mul_sd(qrtr,xmm3);
1175 t3 = _mm_add_sd(xmm2,xmm3);
1177 /* chain rule terms */
1178 xmm2 = _mm_mul_sd(dlij,t1);
1179 xmm2 = _mm_add_sd(xmm2,t2);
1180 xmm2 = _mm_add_sd(xmm2,t3);
1181 chrule_ai = _mm_mul_sd(xmm2,rinv);
1183 _mm_store_sd(fr->dadx+n, chrule);
1184 n = n + 1;
1185 _mm_store_sd(fr->dadx+n, chrule_ai);
1186 n = n + 1;
1189 /* Do end processing ... */
1190 tmp_ai = _mm_unpacklo_pd(tmp_ai,sum_ai);
1191 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
1192 sum_ai = _mm_shuffle_pd(sum_ai,sum_ai,_MM_SHUFFLE2(1,1));
1194 /* Load, add and store atom ai polarisation energy */
1195 xmm2 = _mm_load_sd(born->gpol_hct_work+ai);
1196 sum_ai = _mm_add_sd(sum_ai,xmm2);
1197 _mm_store_sd(born->gpol_hct_work+ai,sum_ai);
1200 /* Parallel summations */
1201 if(PARTDECOMP(cr))
1203 gmx_sum(natoms, born->gpol_hct_work, cr);
1205 else if(DOMAINDECOMP(cr))
1207 dd_atom_sum_real(cr->dd, born->gpol_hct_work);
1210 /* Compute the radii */
1211 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1213 if(born->use[i] != 0)
1215 rr = top->atomtypes.gb_radius[md->typeA[i]]-doff;
1216 sum = 1.0/rr - born->gpol_hct_work[i];
1217 min_rad = rr + doff;
1218 rad = 1.0/sum;
1220 born->bRad[i] = rad > min_rad ? rad : min_rad;
1221 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1225 /* Extra (local) communication required for DD */
1226 if(DOMAINDECOMP(cr))
1228 dd_atom_spread_real(cr->dd, born->bRad);
1229 dd_atom_spread_real(cr->dd, fr->invsqrta);
1232 return 0;
1235 int
1236 calc_gb_rad_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
1237 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md)
1239 int i,k,n,ai,ai3,aj1,aj2,aj13,aj23,nj0,nj1,at0,at1,offset;
1240 int p1,p2,p3,p4,shift;
1241 double rr,sum,sum_tmp,sum2,sum3,min_rad,rad,doff;
1242 double tsum,tchain,rr_inv,rr_inv2,gbr;
1243 double shX,shY,shZ;
1245 __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
1246 __m128d t1,t2,t3,rsq11,rinv,r,rai;
1247 __m128d rai_inv,sk,sk2,lij,dlij,duij;
1248 __m128d uij,lij2,uij2,lij3,uij3,diff2;
1249 __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
1250 __m128d mask_cmp,mask_cmp2,mask_cmp3,doffset,raj,raj_inv;
1251 __m128d xmm1,xmm2,xmm3,xmm4,xmm7,xmm8,xmm9;
1252 __m128d sum_ai,chrule,chrule_ai,tmp_ai,sk_ai,sk2_ai;
1254 const __m128d neg = {-1.0, -1.0};
1255 const __m128d zero = {0.0, 0.0};
1256 const __m128d eigth = {0.125, 0.125};
1257 const __m128d qrtr = {0.25, 0.25};
1258 const __m128d half = {0.5, 0.5};
1259 const __m128d one = {1.0, 1.0};
1260 const __m128d two = {2.0, 2.0};
1261 const __m128d three = {3.0, 3.0};
1263 /* Keep the compiler happy */
1264 tmp_ai = _mm_setzero_pd();
1265 tmp = _mm_setzero_pd();
1266 raj_inv = _mm_setzero_pd();
1267 raj = _mm_setzero_pd();
1268 sk = _mm_setzero_pd();
1269 jx = _mm_setzero_pd();
1270 jy = _mm_setzero_pd();
1271 jz = _mm_setzero_pd();
1272 xmm7 = _mm_setzero_pd();
1273 xmm8 = _mm_setzero_pd();
1275 /* Set the dielectric offset */
1276 doff = born->gb_doffset;
1277 doffset = _mm_load1_pd(&doff);
1279 n = 0;
1281 for(i=0;i<born->nr;i++)
1283 born->gpol_hct_work[i] = 0;
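/* The pair loop below is the same descreening calculation as in
 * calc_gb_rad_hct_sse2_double() above (identical sums into gpol_hct_work[]
 * and identical dadx layout); only the final conversion of the sums into
 * Born radii differs, using the OBC tanh rescaling after the parallel
 * summation.
 */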
1286 for(i=0;i<nl->nri;i++)
1288 ai = nl->iinr[i];
1289 ai3 = ai * 3;
1291 nj0 = nl->jindex[i];
1292 nj1 = nl->jindex[i+1];
1294 /* Load shifts for this list */
1295 shift = nl->shift[i];
1296 shX = fr->shift_vec[shift][0];
1297 shY = fr->shift_vec[shift][1];
1298 shZ = fr->shift_vec[shift][2];
1300 /* Splat the shifts */
1301 sX = _mm_load1_pd(&shX);
1302 sY = _mm_load1_pd(&shY);
1303 sZ = _mm_load1_pd(&shZ);
1305 offset = (nj1-nj0)%2;
1307 /* Load rai */
1308 rr = top->atomtypes.gb_radius[md->typeA[ai]]-doff;
1309 rai = _mm_load1_pd(&rr);
1310 rr = 1.0/rr;
1311 rai_inv = _mm_load1_pd(&rr);
1313 /* Load ai coordinates and add shifts */
1314 ix = _mm_load1_pd(x+ai3);
1315 iy = _mm_load1_pd(x+ai3+1);
1316 iz = _mm_load1_pd(x+ai3+2);
1318 ix = _mm_add_pd(sX,ix);
1319 iy = _mm_add_pd(sY,iy);
1320 iz = _mm_add_pd(sZ,iz);
1322 /* Zero out sums for polarisation energies */
1323 sum_ai = _mm_setzero_pd();
1325 sk_ai = _mm_load1_pd(born->param+ai);
1326 sk2_ai = _mm_mul_pd(sk_ai,sk_ai);
1328 for(k=nj0;k<nj1-offset;k+=2)
1330 aj1 = nl->jjnr[k];
1331 aj2 = nl->jjnr[k+1];
1333 aj13 = aj1 * 3;
1334 aj23 = aj2 * 3;
1336 /* Load particle aj1-2 coordinates */
1337 xmm1 = _mm_loadu_pd(x+aj13);
1338 xmm2 = _mm_loadu_pd(x+aj23);
1339 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
1340 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
1342 jz = _mm_loadl_pd(jz, x+aj13+2);
1343 jz = _mm_loadh_pd(jz, x+aj23+2);
1345 dx = _mm_sub_pd(ix,jx);
1346 dy = _mm_sub_pd(iy,jy);
1347 dz = _mm_sub_pd(iz,jz);
1349 rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
1350 rinv = my_invrsq_pd(rsq11);
1351 r = _mm_mul_pd(rinv,rsq11);
1353 /* Load atom aj1,aj2 raj */
1354 p1 = md->typeA[aj1];
1355 p2 = md->typeA[aj2];
1357 raj = _mm_loadl_pd(raj,top->atomtypes.gb_radius+p1);
1358 raj = _mm_loadh_pd(raj,top->atomtypes.gb_radius+p2);
1359 raj = _mm_sub_pd(raj,doffset);
1361 /* Compute 1.0/raj */
1362 raj_inv = my_inv_pd(raj);
1364 sk = _mm_loadl_pd(sk,born->param+aj1);
1365 sk = _mm_loadh_pd(sk,born->param+aj2);
1367 /* INTERACTION aj->ai STARTS HERE */
1368 /* conditional mask for rai<dr+sk */
1369 xmm1 = _mm_add_pd(r,sk);
1370 mask_cmp = _mm_cmplt_pd(rai,xmm1);
1372 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1373 xmm2 = _mm_sub_pd(r,sk);
1374 xmm3 = my_inv_pd(xmm2);
1375 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
1377 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1378 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1380 uij = my_inv_pd(xmm1);
1381 lij2 = _mm_mul_pd(lij,lij);
1382 lij3 = _mm_mul_pd(lij2,lij);
1383 uij2 = _mm_mul_pd(uij,uij);
1384 uij3 = _mm_mul_pd(uij2,uij);
1386 diff2 = _mm_sub_pd(uij2,lij2);
1388 lij_inv = my_invrsq_pd(lij2);
1389 sk2 = _mm_mul_pd(sk,sk);
1390 sk2_inv = _mm_mul_pd(sk2,rinv);
1391 prod = _mm_mul_pd(qrtr,sk2_inv);
1393 log_term = _mm_mul_pd(uij,lij_inv);
1394 log_term = log_pd(log_term);
1396 xmm1 = _mm_sub_pd(lij,uij);
1397 xmm2 = _mm_mul_pd(qrtr,r);
1398 xmm2 = _mm_mul_pd(xmm2,diff2);
1399 xmm1 = _mm_add_pd(xmm1,xmm2);
1400 xmm2 = _mm_mul_pd(half,rinv);
1401 xmm2 = _mm_mul_pd(xmm2,log_term);
1402 xmm1 = _mm_add_pd(xmm1,xmm2);
1403 xmm9 = _mm_mul_pd(neg,diff2);
1404 xmm2 = _mm_mul_pd(xmm9,prod);
1405 tmp_ai = _mm_add_pd(xmm1,xmm2);
1407 /* conditional for rai<sk-dr */
1408 xmm3 = _mm_sub_pd(sk,r);
1409 mask_cmp3 = _mm_cmplt_pd(rai,xmm3); /* rai<sk-dr */
1411 xmm4 = _mm_sub_pd(rai_inv,lij);
1412 xmm4 = _mm_mul_pd(two,xmm4);
1413 xmm4 = _mm_add_pd(tmp_ai,xmm4);
1415 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
1417 /* tmp_ai now contains two partial values, not all of which are to be used. */
1418 /* Which ones are used is governed by the mask_cmp mask. */
1419 tmp_ai = _mm_mul_pd(half,tmp_ai);
1420 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1421 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
1423 /* Start the dadx chain rule terms */
1424 xmm2 = _mm_mul_pd(half,lij2);
1425 xmm3 = _mm_mul_pd(prod,lij3);
1426 xmm2 = _mm_add_pd(xmm2,xmm3);
1427 xmm3 = _mm_mul_pd(lij,rinv);
1428 xmm4 = _mm_mul_pd(lij3,r);
1429 xmm3 = _mm_add_pd(xmm3,xmm4);
1430 xmm3 = _mm_mul_pd(qrtr,xmm3);
1431 t1 = _mm_sub_pd(xmm2,xmm3);
1433 xmm2 = _mm_mul_pd(half,uij2);
1434 xmm2 = _mm_mul_pd(neg,xmm2);
1435 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
1436 xmm3 = _mm_mul_pd(xmm3,uij3);
1437 xmm2 = _mm_sub_pd(xmm2,xmm3);
1438 xmm3 = _mm_mul_pd(uij,rinv);
1439 xmm4 = _mm_mul_pd(uij3,r);
1440 xmm3 = _mm_add_pd(xmm3,xmm4);
1441 xmm3 = _mm_mul_pd(qrtr,xmm3);
1442 t2 = _mm_add_pd(xmm2,xmm3);
1444 xmm2 = _mm_mul_pd(sk2_inv,rinv);
1445 xmm2 = _mm_add_pd(one,xmm2);
1446 xmm2 = _mm_mul_pd(eigth,xmm2);
1447 xmm2 = _mm_mul_pd(xmm2,xmm9);
1448 xmm3 = _mm_mul_pd(log_term, rinv);
1449 xmm3 = _mm_mul_pd(xmm3,rinv);
1450 xmm3 = _mm_mul_pd(qrtr,xmm3);
1451 t3 = _mm_add_pd(xmm2,xmm3);
1453 /* chain rule terms */
1454 xmm2 = _mm_mul_pd(dlij,t1);
1455 xmm2 = _mm_add_pd(xmm2,t2);
1456 xmm2 = _mm_add_pd(xmm2,t3);
1458 /* temporary storage of chain rule terms, since we have to compute
1459 the reciprocal terms also before storing them */
1460 chrule = _mm_mul_pd(xmm2,rinv);
1462 /* INTERACTION ai->aj STARTS HERE */
1463 /* conditional mask for raj<dr+sk_ai */
1464 xmm1 = _mm_add_pd(r,sk_ai);
1465 mask_cmp = _mm_cmplt_pd(raj,xmm1);
1467 /* conditional for raj>dr-sk_ai, ends with mask_cmp2 */
1468 xmm2 = _mm_sub_pd(r,sk_ai);
1469 xmm3 = my_inv_pd(xmm2);
1470 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
1472 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1473 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1475 uij = my_inv_pd(xmm1);
1476 lij2 = _mm_mul_pd(lij,lij);
1477 lij3 = _mm_mul_pd(lij2,lij);
1478 uij2 = _mm_mul_pd(uij,uij);
1479 uij3 = _mm_mul_pd(uij2,uij);
1481 diff2 = _mm_sub_pd(uij2,lij2);
1483 lij_inv = my_invrsq_pd(lij2);
1485 sk2 = sk2_ai;
1486 sk2_inv = _mm_mul_pd(sk2,rinv);
1487 prod = _mm_mul_pd(qrtr,sk2_inv);
1489 log_term = _mm_mul_pd(uij,lij_inv);
1490 log_term = log_pd(log_term);
1492 xmm1 = _mm_sub_pd(lij,uij);
1493 xmm2 = _mm_mul_pd(qrtr,r);
1494 xmm2 = _mm_mul_pd(xmm2,diff2);
1495 xmm1 = _mm_add_pd(xmm1,xmm2);
1496 xmm2 = _mm_mul_pd(half,rinv);
1497 xmm2 = _mm_mul_pd(xmm2,log_term);
1498 xmm1 = _mm_add_pd(xmm1,xmm2);
1499 xmm9 = _mm_mul_pd(neg,diff2);
1500 xmm2 = _mm_mul_pd(xmm9,prod);
1501 tmp = _mm_add_pd(xmm1,xmm2);
1503 /* conditional for raj<sk_ai-dr */
1504 xmm3 = _mm_sub_pd(sk_ai,r);
1505 mask_cmp3 = _mm_cmplt_pd(raj,xmm3); /* raj<sk_ai-dr */
1507 xmm4 = _mm_sub_pd(raj_inv,lij);
1508 xmm4 = _mm_mul_pd(two,xmm4);
1509 xmm4 = _mm_add_pd(tmp,xmm4);
1511 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
1513 /* tmp now contains two partial values, not all of which are to be used. */
1514 /* Which ones are used is governed by the mask_cmp mask. */
1515 tmp = _mm_mul_pd(half,tmp);
1516 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1518 /* Load, add and store ai->aj pol energy */
1519 xmm7 = _mm_loadl_pd(xmm7,born->gpol_hct_work+aj1);
1520 xmm7 = _mm_loadh_pd(xmm7,born->gpol_hct_work+aj2);
1522 xmm7 = _mm_add_pd(xmm7,tmp);
1524 _mm_storel_pd(born->gpol_hct_work+aj1,xmm7);
1525 _mm_storeh_pd(born->gpol_hct_work+aj2,xmm7);
1527 /* Start dadx chain rule terms */
1528 xmm2 = _mm_mul_pd(half,lij2);
1529 xmm3 = _mm_mul_pd(prod,lij3);
1530 xmm2 = _mm_add_pd(xmm2,xmm3);
1531 xmm3 = _mm_mul_pd(lij,rinv);
1532 xmm4 = _mm_mul_pd(lij3,r);
1533 xmm3 = _mm_add_pd(xmm3,xmm4);
1534 xmm3 = _mm_mul_pd(qrtr,xmm3);
1535 t1 = _mm_sub_pd(xmm2,xmm3);
1537 xmm2 = _mm_mul_pd(half,uij2);
1538 xmm2 = _mm_mul_pd(neg,xmm2);
1539 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
1540 xmm3 = _mm_mul_pd(xmm3,uij3);
1541 xmm2 = _mm_sub_pd(xmm2,xmm3);
1542 xmm3 = _mm_mul_pd(uij,rinv);
1543 xmm4 = _mm_mul_pd(uij3,r);
1544 xmm3 = _mm_add_pd(xmm3,xmm4);
1545 xmm3 = _mm_mul_pd(qrtr,xmm3);
1546 t2 = _mm_add_pd(xmm2,xmm3);
1548 xmm2 = _mm_mul_pd(sk2_inv,rinv);
1549 xmm2 = _mm_add_pd(one,xmm2);
1550 xmm2 = _mm_mul_pd(eigth,xmm2);
1551 xmm2 = _mm_mul_pd(xmm2,xmm9);
1552 xmm3 = _mm_mul_pd(log_term, rinv);
1553 xmm3 = _mm_mul_pd(xmm3,rinv);
1554 xmm3 = _mm_mul_pd(qrtr,xmm3);
1555 t3 = _mm_add_pd(xmm2,xmm3);
1557 /* chain rule terms */
1558 xmm2 = _mm_mul_pd(dlij,t1);
1559 xmm2 = _mm_add_pd(xmm2,t2);
1560 xmm2 = _mm_add_pd(xmm2,t3);
1561 chrule_ai = _mm_mul_pd(xmm2,rinv);
1563 /* Store chain rule terms,
1564 * with the same unpacking rationale as in the Still kernel above. */
1566 xmm3 = _mm_unpacklo_pd(chrule, chrule_ai);
1567 xmm4 = _mm_unpackhi_pd(chrule, chrule_ai);
1569 _mm_storeu_pd(fr->dadx+n, xmm3);
1570 n = n + 2;
1571 _mm_storeu_pd(fr->dadx+n, xmm4);
1572 n = n + 2;
1575 if(offset!=0)
1577 aj1 = nl->jjnr[k];
1578 aj13 = aj1 * 3;
1579 p1 = md->typeA[aj1];
1581 jx = _mm_load_sd(x+aj13);
1582 jy = _mm_load_sd(x+aj13+1);
1583 jz = _mm_load_sd(x+aj13+2);
1585 sk = _mm_load_sd(born->param+aj1);
1587 /* Load raj */
1588 raj = _mm_load_sd(top->atomtypes.gb_radius+p1);
1589 raj = _mm_sub_sd(raj,doffset);
1590 raj_inv = my_inv_pd(raj);
1592 dx = _mm_sub_sd(ix,jx);
1593 dy = _mm_sub_pd(iy,jy);
1594 dz = _mm_sub_pd(iz,jz);
1596 rsq11 = _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx,dx) , _mm_mul_sd(dy,dy) ) , _mm_mul_sd(dz,dz) );
1597 rinv = my_invrsq_pd(rsq11);
1598 r = _mm_mul_sd(rinv,rsq11);
1600 /* OFFSET INTERACTION aj->ai STARTS HERE */
1601 /* conditional mask for rai<dr+sk */
1602 xmm1 = _mm_add_sd(r,sk);
1603 mask_cmp = _mm_cmplt_sd(rai,xmm1);
1605 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1606 xmm2 = _mm_sub_sd(r,sk);
1607 xmm3 = my_inv_pd(xmm2);
1608 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
1610 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1611 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1613 uij = my_inv_pd(xmm1);
1614 lij2 = _mm_mul_sd(lij,lij);
1615 lij3 = _mm_mul_sd(lij2,lij);
1616 uij2 = _mm_mul_sd(uij,uij);
1617 uij3 = _mm_mul_sd(uij2,uij);
1619 diff2 = _mm_sub_sd(uij2,lij2);
1621 lij_inv = my_invrsq_pd(lij2);
1622 sk2 = _mm_mul_sd(sk,sk);
1623 sk2_inv = _mm_mul_sd(sk2,rinv);
1624 prod = _mm_mul_sd(qrtr,sk2_inv);
1626 log_term = _mm_mul_pd(uij,lij_inv);
1627 log_term = log_pd(log_term);
1629 xmm1 = _mm_sub_sd(lij,uij);
1630 xmm2 = _mm_mul_sd(qrtr,r);
1631 xmm2 = _mm_mul_sd(xmm2,diff2);
1632 xmm1 = _mm_add_sd(xmm1,xmm2);
1633 xmm2 = _mm_mul_sd(half,rinv);
1634 xmm2 = _mm_mul_sd(xmm2,log_term);
1635 xmm1 = _mm_add_sd(xmm1,xmm2);
1636 xmm9 = _mm_mul_sd(neg,diff2);
1637 xmm2 = _mm_mul_sd(xmm9,prod);
1639 tmp_ai = _mm_add_sd(xmm1,xmm2);
1641 /* conditional for rai<sk-dr */
1642 xmm3 = _mm_sub_sd(sk,r);
1643 mask_cmp3 = _mm_cmplt_sd(rai,xmm3); /* rai<sk-dr */
1645 xmm4 = _mm_sub_sd(rai_inv,lij);
1646 xmm4 = _mm_mul_sd(two,xmm4);
1647 xmm4 = _mm_add_sd(tmp_ai,xmm4);
1649 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
1651 /* tmp_ai now contains two partial values, not all of which are to be used. */
1652 /* Which ones are used is governed by the mask_cmp mask. */
1653 tmp_ai = _mm_mul_pd(half,tmp_ai);
1654 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1655 sum_ai = _mm_add_sd(sum_ai,tmp_ai);
1657 xmm2 = _mm_mul_sd(half,lij2);
1658 xmm3 = _mm_mul_sd(prod,lij3);
1659 xmm2 = _mm_add_sd(xmm2,xmm3);
1660 xmm3 = _mm_mul_sd(lij,rinv);
1661 xmm4 = _mm_mul_sd(lij3,r);
1662 xmm3 = _mm_add_sd(xmm3,xmm4);
1663 xmm3 = _mm_mul_sd(qrtr,xmm3);
1664 t1 = _mm_sub_sd(xmm2,xmm3);
1666 xmm2 = _mm_mul_sd(half,uij2);
1667 xmm2 = _mm_mul_sd(neg,xmm2);
1668 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1669 xmm3 = _mm_mul_sd(xmm3,uij3);
1670 xmm2 = _mm_sub_sd(xmm2,xmm3);
1671 xmm3 = _mm_mul_sd(uij,rinv);
1672 xmm4 = _mm_mul_sd(uij3,r);
1673 xmm3 = _mm_add_sd(xmm3,xmm4);
1674 xmm3 = _mm_mul_sd(qrtr,xmm3);
1675 t2 = _mm_add_sd(xmm2,xmm3);
1677 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1678 xmm2 = _mm_add_sd(one,xmm2);
1679 xmm2 = _mm_mul_sd(eigth,xmm2);
1680 xmm2 = _mm_mul_sd(xmm2,xmm9);
1681 xmm3 = _mm_mul_sd(log_term, rinv);
1682 xmm3 = _mm_mul_sd(xmm3,rinv);
1683 xmm3 = _mm_mul_sd(qrtr,xmm3);
1684 t3 = _mm_add_sd(xmm2,xmm3);
1686 /* chain rule terms */
1687 xmm2 = _mm_mul_sd(dlij,t1);
1688 xmm2 = _mm_add_sd(xmm2,t2);
1689 xmm2 = _mm_add_sd(xmm2,t3);
1690 chrule = _mm_mul_sd(xmm2,rinv);
1692 /* OFFSET INTERACTION ai->aj STARTS HERE */
1693 /* conditional mask for raj<dr+sk_ai */
1694 xmm1 = _mm_add_sd(r,sk_ai);
1695 mask_cmp = _mm_cmplt_sd(raj,xmm1);
1697 /* conditional for raj>dr-sk_ai, ends with mask_cmp2 */
1698 xmm2 = _mm_sub_sd(r,sk_ai);
1699 xmm3 = my_inv_pd(xmm2);
1700 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
1702 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1703 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1705 uij = my_inv_pd(xmm1);
1706 lij2 = _mm_mul_sd(lij,lij);
1707 lij3 = _mm_mul_sd(lij2,lij);
1708 uij2 = _mm_mul_sd(uij,uij);
1709 uij3 = _mm_mul_sd(uij2,uij);
1711 diff2 = _mm_sub_sd(uij2,lij2);
1713 lij_inv = my_invrsq_pd(lij2);
1715 sk2 = sk2_ai;
1716 sk2_inv = _mm_mul_sd(sk2,rinv);
1717 prod = _mm_mul_sd(qrtr,sk2_inv);
1719 log_term = _mm_mul_pd(uij,lij_inv);
1720 log_term = log_pd(log_term);
1722 xmm1 = _mm_sub_sd(lij,uij);
1723 xmm2 = _mm_mul_sd(qrtr,r);
1724 xmm2 = _mm_mul_sd(xmm2,diff2);
1725 xmm1 = _mm_add_sd(xmm1,xmm2);
1726 xmm2 = _mm_mul_sd(half,rinv);
1727 xmm2 = _mm_mul_sd(xmm2,log_term);
1728 xmm1 = _mm_add_sd(xmm1,xmm2);
1729 xmm9 = _mm_mul_sd(neg,diff2);
1730 xmm2 = _mm_mul_sd(xmm9,prod);
1731 tmp = _mm_add_sd(xmm1,xmm2);
1733 /* conditional for raj<sk_ai-dr */
1734 xmm3 = _mm_sub_sd(sk_ai,r);
1735 mask_cmp3 = _mm_cmplt_sd(raj,xmm3); /* raj<sk_ai-dr */
1737 xmm4 = _mm_sub_sd(raj_inv,lij);
1738 xmm4 = _mm_mul_sd(two,xmm4);
1739 xmm4 = _mm_add_sd(tmp,xmm4);
1741 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
1743 /* tmp now contains two partial values, not all of which are to be used. */
1744 /* Which ones are used is governed by the mask_cmp mask. */
1745 tmp = _mm_mul_pd(half,tmp);
1746 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1748 /* Load, add and store gpol energy */
1749 xmm7 = _mm_load_sd(born->gpol_hct_work+aj1);
1750 xmm7 = _mm_add_sd(xmm7,tmp);
1751 _mm_store_sd(born->gpol_hct_work+aj1,xmm7);
1753 /* Start chain rule terms, t1 */
1754 xmm2 = _mm_mul_sd(half,lij2);
1755 xmm3 = _mm_mul_sd(prod,lij3);
1756 xmm2 = _mm_add_sd(xmm2,xmm3);
1757 xmm3 = _mm_mul_sd(lij,rinv);
1758 xmm4 = _mm_mul_sd(lij3,r);
1759 xmm3 = _mm_add_sd(xmm3,xmm4);
1760 xmm3 = _mm_mul_sd(qrtr,xmm3);
1761 t1 = _mm_sub_sd(xmm2,xmm3);
1763 xmm2 = _mm_mul_sd(half,uij2);
1764 xmm2 = _mm_mul_sd(neg,xmm2);
1765 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1766 xmm3 = _mm_mul_sd(xmm3,uij3);
1767 xmm2 = _mm_sub_sd(xmm2,xmm3);
1768 xmm3 = _mm_mul_sd(uij,rinv);
1769 xmm4 = _mm_mul_sd(uij3,r);
1770 xmm3 = _mm_add_sd(xmm3,xmm4);
1771 xmm3 = _mm_mul_sd(qrtr,xmm3);
1772 t2 = _mm_add_sd(xmm2,xmm3);
1774 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1775 xmm2 = _mm_add_sd(one,xmm2);
1776 xmm2 = _mm_mul_sd(eigth,xmm2);
1777 xmm2 = _mm_mul_sd(xmm2,xmm9);
1778 xmm3 = _mm_mul_sd(log_term, rinv);
1779 xmm3 = _mm_mul_sd(xmm3,rinv);
1780 xmm3 = _mm_mul_sd(qrtr,xmm3);
1781 t3 = _mm_add_sd(xmm2,xmm3);
1783 /* chain rule terms */
1784 xmm2 = _mm_mul_sd(dlij,t1);
1785 xmm2 = _mm_add_sd(xmm2,t2);
1786 xmm2 = _mm_add_sd(xmm2,t3);
1787 chrule_ai = _mm_mul_sd(xmm2,rinv);
1789 _mm_store_sd(fr->dadx+n, chrule);
1790 n = n + 1;
1791 _mm_store_sd(fr->dadx+n, chrule_ai);
1792 n = n + 1;
1795 /* Do end processing ... */
1796 tmp_ai = _mm_unpacklo_pd(tmp_ai,sum_ai);
1797 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
1798 sum_ai = _mm_shuffle_pd(sum_ai,sum_ai,_MM_SHUFFLE2(1,1));
1800 /* Load, add and store atom ai polarisation energy */
1801 xmm2 = _mm_load_sd(born->gpol_hct_work+ai);
1802 sum_ai = _mm_add_sd(sum_ai,xmm2);
1803 _mm_store_sd(born->gpol_hct_work+ai,sum_ai);
1806 /* Parallel summations */
1807 if(PARTDECOMP(cr))
1809 gmx_sum(natoms, born->gpol_hct_work, cr);
1811 else if(DOMAINDECOMP(cr))
1813 dd_atom_sum_real(cr->dd, born->gpol_hct_work);
1816 /* Compute the radii */
1817 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1819 if(born->use[i] != 0)
1821 rr = top->atomtypes.gb_radius[md->typeA[i]];
1822 rr_inv2 = 1.0/rr;
1823 rr = rr-doff;
1824 rr_inv = 1.0/rr;
1825 sum = rr * born->gpol_hct_work[i];
1826 sum2 = sum * sum;
1827 sum3 = sum2 * sum;
1829 tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1830 born->bRad[i] = rr_inv - tsum*rr_inv2;
1831 born->bRad[i] = 1.0 / born->bRad[i];
1833 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1835 tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1836 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
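/* In scalar form the loop above implements the OBC rescaling (illustrative
 * sketch below is not compiled; names are local to the sketch):
 *
 *   psi  = (rho - doffset) * I          with I = gpol_hct_work[i]
 *   1/R  = 1/(rho - doffset) - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/rho
 *
 * and drobc[i] stores the corresponding chain-rule factor used later when the
 * forces are assembled.
 */
#if 0
static double obc_radius(double rho, double doffset, double I,
                         double alpha, double beta, double gamma)
{
    double rho_t = rho - doffset;
    double psi   = rho_t*I;
    double tsum  = tanh(alpha*psi - beta*psi*psi + gamma*psi*psi*psi);

    return 1.0/(1.0/rho_t - tsum/rho);
}
#endif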
1840 /* Extra (local) communication required for DD */
1841 if(DOMAINDECOMP(cr))
1843 dd_atom_spread_real(cr->dd, born->bRad);
1844 dd_atom_spread_real(cr->dd, fr->invsqrta);
1845 dd_atom_spread_real(cr->dd, born->drobc);
1848 return 0;
1854 calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda, double *xd, double *f,
1855 double *fshift, double *shift_vec, int gb_algorithm, gmx_genborn_t *born)
1857 int i,k,n,ai,aj,ai3,aj1,aj2,aj13,aj23,aj4,nj0,nj1,offset;
1858 int shift;
1859 double rbi,shX,shY,shZ;
1860 double *rb;
1862 __m128d ix,iy,iz,jx,jy,jz,fix,fiy,fiz,sX,sY,sZ;
1863 __m128d dx,dy,dz,t1,t2,t3,dva,dvaj,dax,dax_ai,fgb,fgb_ai;
1864 __m128d xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
1866 t1 = t2 = t3 = _mm_setzero_pd();
1867 rb = born->work;
1869 /* Loop to get the proper form for the Born radius term */
1870 if(gb_algorithm==egbSTILL) {
1871 for(i=0;i<natoms;i++)
1873 rbi = born->bRad[i];
1874 rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1878 else if(gb_algorithm==egbHCT) {
1879 for(i=0;i<natoms;i++)
1881 rbi = born->bRad[i];
1882 rb[i] = rbi * rbi * dvda[i];
1886 else if(gb_algorithm==egbOBC) {
1887 for(i=0;i<natoms;i++)
1889 rbi = born->bRad[i];
1890 rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
1894 n = 0;
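/* rb[i] converts dvda[i], the derivative of the energy with respect to the
 * Born radius of atom i, into a derivative with respect to the per-atom sum
 * accumulated by the radius kernels, so it can be combined directly with the
 * dadx factors stored there. For each pair the loop below then forms the
 * scalar prefactor
 *     fgb = rb[ai]*dadx(ai due to aj) + rb[aj]*dadx(aj due to ai)
 * multiplies it by the distance vector, adds the result to the force on ai
 * (and the i-list shift force) and subtracts it from the force on aj.
 */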
1896 for(i=0;i<nl->nri;i++)
1898 ai = nl->iinr[i];
1899 ai3 = ai * 3;
1901 nj0 = nl->jindex[i];
1902 nj1 = nl->jindex[i+1];
1904 /* Load shifts for this list */
1905 shift = 3*nl->shift[i];
1906 shX = shift_vec[shift+0];
1907 shY = shift_vec[shift+1];
1908 shZ = shift_vec[shift+2];
1910 /* Splat the shifts */
1911 sX = _mm_load1_pd(&shX);
1912 sY = _mm_load1_pd(&shY);
1913 sZ = _mm_load1_pd(&shZ);
1915 offset = (nj1-nj0)%2;
1917 /* Load particle ai coordinates and add shifts */
1918 ix = _mm_load1_pd(xd+ai3);
1919 iy = _mm_load1_pd(xd+ai3+1);
1920 iz = _mm_load1_pd(xd+ai3+2);
1922 ix = _mm_add_pd(sX,ix);
1923 iy = _mm_add_pd(sY,iy);
1924 iz = _mm_add_pd(sZ,iz);
1926 /* Load particle ai dvda */
1927 dva = _mm_load1_pd(rb+ai);
1929 fix = _mm_setzero_pd();
1930 fiy = _mm_setzero_pd();
1931 fiz = _mm_setzero_pd();
1933 for(k=nj0;k<nj1-offset;k+=2)
1935 aj1 = nl->jjnr[k];
1936 aj2 = nl->jjnr[k+1];
1938 /* Load dvda_j */
1939 dvaj = _mm_set_pd(rb[aj2],rb[aj1]);
1941 aj13 = aj1 * 3;
1942 aj23 = aj2 * 3;
1944 /* Load particle aj1-2 coordinates */
1945 xmm1 = _mm_loadu_pd(xd+aj13);
1946 xmm2 = _mm_loadu_pd(xd+aj23);
1948 xmm5 = _mm_load_sd(xd+aj13+2);
1949 xmm6 = _mm_load_sd(xd+aj23+2);
1951 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
1952 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
1953 jz = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));
1955 /* load derivative of born radii w.r.t. coordinates */
1956 xmm7 = _mm_loadu_pd(dadx+n);
1957 n = n + 2;
1958 xmm8 = _mm_loadu_pd(dadx+n);
1959 n = n + 2;
1961 /* Shuffle to get the ai and aj components right */
1962 dax = _mm_shuffle_pd(xmm7,xmm8,_MM_SHUFFLE2(2,0));
1963 dax_ai = _mm_shuffle_pd(xmm7,xmm8,_MM_SHUFFLE2(3,1));
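/* The radius kernels stored the derivatives of each pair interleaved as
 * (ai due to aj1, aj1 due to ai, ai due to aj2, aj2 due to ai); the two
 * shuffles above de-interleave them, so dax holds the terms multiplying
 * dva = rb[ai] and dax_ai the terms multiplying dvaj = rb[aj1]/rb[aj2].
 */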
1965 /* distances */
1966 dx = _mm_sub_pd(ix,jx);
1967 dy = _mm_sub_pd(iy,jy);
1968 dz = _mm_sub_pd(iz,jz);
1970 /* scalar force */
1971 fgb = _mm_mul_pd(dva,dax);
1972 fgb_ai = _mm_mul_pd(dvaj,dax_ai);
1973 fgb = _mm_add_pd(fgb,fgb_ai);
1975 /* partial forces */
1976 t1 = _mm_mul_pd(fgb,dx);
1977 t2 = _mm_mul_pd(fgb,dy);
1978 t3 = _mm_mul_pd(fgb,dz);
1980 /* update the i force */
1981 fix = _mm_add_pd(fix,t1);
1982 fiy = _mm_add_pd(fiy,t2);
1983 fiz = _mm_add_pd(fiz,t3);
1985 /* accumulate forces from memory */
1986 xmm1 = _mm_loadu_pd(f+aj13); /* fx1 fy1 */
1987 xmm2 = _mm_loadu_pd(f+aj23); /* fx2 fy2 */
1989 xmm5 = _mm_load1_pd(f+aj13+2); /* fz1 fz1 */
1990 xmm6 = _mm_load1_pd(f+aj23+2); /* fz2 fz2 */
1992 /* transpose */
1993 xmm7 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
1994 xmm5 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
1995 xmm6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
1997 /* subtract partial forces */
1998 xmm5 = _mm_sub_pd(xmm5, t1);
1999 xmm6 = _mm_sub_pd(xmm6, t2);
2000 xmm7 = _mm_sub_pd(xmm7, t3);
2002 /* transpose back fx and fy */
2003 xmm1 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));
2004 xmm2 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1));
2006 /* store the force, first fx's and fy's */
2007 _mm_storeu_pd(f+aj13,xmm1);
2008 _mm_storeu_pd(f+aj23,xmm2);
2010 /* then fz */
2011 _mm_storel_pd(f+aj13+2,xmm7);
2012 _mm_storeh_pd(f+aj23+2,xmm7);
2015 if(offset!=0)
2017 aj1 = nl->jjnr[k];
2019 dvaj = _mm_load_sd(rb+aj1);
2021 aj13 = aj1 * 3;
2023 jx = _mm_load_sd(xd+aj13);
2024 jy = _mm_load_sd(xd+aj13+1);
2025 jz = _mm_load_sd(xd+aj13+2);
2027 dax = _mm_load_sd(dadx+n);
2028 n = n + 1;
2029 dax_ai = _mm_load_sd(dadx+n);
2030 n = n + 1;
2032 dx = _mm_sub_sd(ix,jx);
2033 dy = _mm_sub_sd(iy,jy);
2034 dz = _mm_sub_sd(iz,jz);
2036 fgb = _mm_mul_sd(dva,dax);
2037 fgb_ai = _mm_mul_sd(dvaj,dax_ai);
2038 fgb = _mm_add_pd(fgb,fgb_ai);
2040 t1 = _mm_mul_sd(fgb,dx);
2041 t2 = _mm_mul_sd(fgb,dy);
2042 t3 = _mm_mul_sd(fgb,dz);
2044 fix = _mm_add_sd(fix,t1);
2045 fiy = _mm_add_sd(fiy,t2);
2046 fiz = _mm_add_sd(fiz,t3);
2048 /* accumulate forces from memory */
2049 xmm5 = _mm_load_sd(f+aj13);
2050 xmm6 = _mm_load_sd(f+aj13+1);
2051 xmm7 = _mm_load_sd(f+aj13+2);
2053 /* subtract partial forces */
2054 xmm5 = _mm_sub_sd(xmm5, t1);
2055 xmm6 = _mm_sub_sd(xmm6, t2);
2056 xmm7 = _mm_sub_sd(xmm7, t3);
2058 /* store the force */
2059 _mm_store_sd(f+aj13,xmm5);
2060 _mm_store_sd(f+aj13+1,xmm6);
2061 _mm_store_sd(f+aj13+2,xmm7);
2064 /* fix/fiy/fiz now contain two partial terms that should all be
2065 * added to the i particle forces */
2067 t1 = _mm_unpacklo_pd(t1,fix);
2068 t2 = _mm_unpacklo_pd(t2,fiy);
2069 t3 = _mm_unpacklo_pd(t3,fiz);
2071 fix = _mm_add_pd(fix,t1);
2072 fiy = _mm_add_pd(fiy,t2);
2073 fiz = _mm_add_pd(fiz,t3);
2075 fix = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
2076 fiy = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
2077 fiz = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));
2079 /* load, add and store i forces */
2080 xmm1 = _mm_load_sd(f+ai3);
2081 xmm2 = _mm_load_sd(f+ai3+1);
2082 xmm3 = _mm_load_sd(f+ai3+2);
2084 fix = _mm_add_sd(fix,xmm1);
2085 fiy = _mm_add_sd(fiy,xmm2);
2086 fiz = _mm_add_sd(fiz,xmm3);
2088 _mm_store_sd(f+ai3,fix);
2089 _mm_store_sd(f+ai3+1,fiy);
2090 _mm_store_sd(f+ai3+2,fiz);
2092 /* load, add and store i shift forces */
2093 xmm1 = _mm_load_sd(fshift+shift);
2094 xmm2 = _mm_load_sd(fshift+shift+1);
2095 xmm3 = _mm_load_sd(fshift+shift+2);
2097 fix = _mm_add_sd(fix,xmm1);
2098 fiy = _mm_add_sd(fiy,xmm2);
2099 fiz = _mm_add_sd(fiz,xmm3);
2101 _mm_store_sd(fshift+shift,fix);
2102 _mm_store_sd(fshift+shift+1,fiy);
2103 _mm_store_sd(fshift+shift+2,fiz);
2107 return 0;
2110 #else
2111 /* keep compiler happy */
2112 int genborn_sse_dummy;
2114 #endif /* SSE2 intrinsics available */