Temporarily disabled double precision SSE2 generalized born (waiting for transcendentals)
src/mdlib/genborn_sse2_double.c
1 #ifdef HAVE_CONFIG_H
2 #include <config.h>
3 #endif
5 #include <math.h>
6 #include <string.h>
8 #include "typedefs.h"
9 #include "smalloc.h"
10 #include "genborn.h"
11 #include "vec.h"
12 #include "grompp.h"
13 #include "pdbio.h"
14 #include "names.h"
15 #include "physics.h"
16 #include "domdec.h"
17 #include "partdec.h"
18 #include "network.h"
19 #include "gmx_fatal.h"
20 #include "mtop_util.h"
21 #include "genborn.h"
23 #ifdef GMX_LIB_MPI
24 #include <mpi.h>
25 #endif
26 #ifdef GMX_THREADS
27 #include "tmpi.h"
28 #endif
30 /* Double precision SSE2 genborn disabled while waiting for transcendental implementations... */
31 #if 0
33 /* Only compile this file if SSE2 intrinsics are available */
34 /* #if ( (defined(GMX_IA32_SSE2) || defined(GMX_X86_64_SSE2) || defined(GMX_SSE2)) && defined(GMX_DOUBLE) ) */
35 #include <xmmintrin.h>
36 #include <emmintrin.h>
38 #include "genborn_sse2_double.h"
40 #if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
41 #define gmx_castsi128_pd(a) _mm_castsi128_pd(a)
42 #define gmx_castpd_si128(a) _mm_castpd_si128(a)
43 #elif defined(__GNUC__)
44 #define gmx_castsi128_pd(a) ((__m128d)(a))
45 #define gmx_castpd_si128(a) ((__m128i)(a))
46 #else
47 static __m128d gmx_castsi128_pd(__m128i a) { return *(__m128d *) &a; }
48 static __m128i gmx_castpd_si128(__m128d a) { return *(__m128i *) &a; }
49 #endif
52 static inline void
53 sincos_sse2double(__m128d x, __m128d *sinval, __m128d *cosval)
55 const __m128d two_over_pi = {2.0/M_PI,2.0/M_PI};
56 const __m128d half = {0.5,0.5};
57 const __m128d one = {1.0,1.0};
58 const __m128i izero = _mm_set1_epi32(0);
59 const __m128i ione = _mm_set1_epi32(1);
60 const __m128i itwo = _mm_set1_epi32(2);
61 const __m128i ithree = _mm_set1_epi32(3);
62 const __m128d sincosd_kc1 = {(13176794.0 / 8388608.0),(13176794.0 / 8388608.0)};
63 const __m128d sincosd_kc2 = {7.5497899548918821691639751442098584e-8,7.5497899548918821691639751442098584e-8};
64 const __m128d sincosd_cc0 = {0.00000000206374484196,0.00000000206374484196};
65 const __m128d sincosd_cc1 = {-0.00000027555365134677,-0.00000027555365134677};
66 const __m128d sincosd_cc2 = {0.00002480157946764225,0.00002480157946764225};
67 const __m128d sincosd_cc3 = {-0.00138888888730525966,-0.00138888888730525966};
68 const __m128d sincosd_cc4 = {0.04166666666651986722,0.04166666666651986722};
69 const __m128d sincosd_cc5 = {-0.49999999999999547304,-0.49999999999999547304};
70 const __m128d sincosd_sc0 = {0.00000000015893606014,0.00000000015893606014};
71 const __m128d sincosd_sc1 = {-0.00000002505069049138,-0.00000002505069049138};
72 const __m128d sincosd_sc2 = {0.00000275573131527032,0.00000275573131527032};
73 const __m128d sincosd_sc3 = {-0.00019841269827816117,-0.00019841269827816117};
74 const __m128d sincosd_sc4 = {0.00833333333331908278,0.00833333333331908278};
75 const __m128d sincosd_sc5 = {-0.16666666666666612594,-0.16666666666666612594};
77 __m128d signbit = gmx_castsi128_pd(_mm_set1_epi64x(0x8000000000000000ULL));
78 __m128d tiny = gmx_castsi128_pd(_mm_set1_epi64x(0x3e40000000000000ULL));
80 __m128d xl,xl2,xl3,qd,absxl,p1,cx,sx,ts,tc,tsn,tcn;
81 __m128i q;
82 __m128i offsetSin,offsetCos;
83 __m128d sinMask,cosMask,isTiny;
84 __m128d ct0,ct1,ct2,ct3,ct4,ct5,ct6,st1,st2,st3,st4,st6;
86 /* Rescale the angle to the range 0..4, and find which quadrant it is in */
87 xl = _mm_mul_pd(x,two_over_pi);
89 /* q=integer part of xl, rounded _away_ from 0.0 */
90 /* Add 0.5 away from 0.0 */
91 xl = _mm_add_pd(xl,_mm_or_pd(_mm_and_pd(xl,signbit),half));
93 q = _mm_cvttpd_epi32(xl);
94 qd = _mm_cvtepi32_pd(q);
95 q = _mm_shuffle_epi32(q,_MM_SHUFFLE(1,1,0,0));
97 /* Compute offset based on quadrant the arg falls in */
98 offsetSin = _mm_and_si128(q,ithree);
99 offsetCos = _mm_add_epi32(offsetSin,ione);
101 /* Remainder in range [-pi/4..pi/4] */
102 p1 = _mm_mul_pd(qd,sincosd_kc1);
103 xl = _mm_mul_pd(qd,sincosd_kc2);
104 p1 = _mm_sub_pd(x,p1);
105 xl = _mm_sub_pd(p1,xl);
107 absxl = _mm_andnot_pd(signbit,xl);
108 isTiny = _mm_cmpgt_pd(tiny,absxl);
110 xl2 = _mm_mul_pd(xl,xl);
111 xl3 = _mm_mul_pd(xl2,xl);
113 ct0 = _mm_mul_pd(xl2,xl2);
114 ct1 = _mm_mul_pd(sincosd_cc0,xl2);
115 ct2 = _mm_mul_pd(sincosd_cc2,xl2);
116 ct3 = _mm_mul_pd(sincosd_cc4,xl2);
117 st1 = _mm_mul_pd(sincosd_sc0,xl2);
118 st2 = _mm_mul_pd(sincosd_sc2,xl2);
119 st3 = _mm_mul_pd(sincosd_sc4,xl2);
120 ct1 = _mm_add_pd(ct1,sincosd_cc1);
121 ct2 = _mm_add_pd(ct2,sincosd_cc3);
122 ct3 = _mm_add_pd(ct3,sincosd_cc5);
123 st1 = _mm_add_pd(st1,sincosd_sc1);
124 st2 = _mm_add_pd(st2,sincosd_sc3);
125 st3 = _mm_add_pd(st3,sincosd_sc5);
127 ct4 = _mm_mul_pd(ct2,ct0);
128 ct4 = _mm_add_pd(ct4,ct3);
130 st4 = _mm_mul_pd(st2,ct0);
131 st4 = _mm_add_pd(st4,st3);
132 ct5 = _mm_mul_pd(ct0,ct0);
134 ct6 = _mm_mul_pd(ct5,ct1);
135 ct6 = _mm_add_pd(ct6,ct4);
137 st6 = _mm_mul_pd(ct5,st1);
138 st6 = _mm_add_pd(st6,st4);
140 cx = _mm_mul_pd(ct6,xl2);
141 cx = _mm_add_pd(cx,one);
143 sx = _mm_mul_pd(st6,xl3);
144 sx = _mm_add_pd(sx,xl);
146 /* Small angle approximation, sin(tiny)=tiny, cos(tiny)=1.0 */
147 sx = _mm_or_pd( _mm_and_pd(isTiny,xl) , _mm_andnot_pd(isTiny,sx) );
148 cx = _mm_or_pd( _mm_and_pd(isTiny,one) , _mm_andnot_pd(isTiny,cx) );
150 sinMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetSin,ione), izero));
151 cosMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetCos,ione), izero));
153 ts = _mm_or_pd( _mm_and_pd(sinMask,sx) , _mm_andnot_pd(sinMask,cx) );
154 tc = _mm_or_pd( _mm_and_pd(cosMask,sx) , _mm_andnot_pd(cosMask,cx) );
156 	/* Flip the sign of the result when bit 1 of the offset is set, i.e. (offset mod 4) = 2 or 3 */
157 sinMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetSin,itwo), izero));
158 tsn = _mm_xor_pd(signbit,ts);
159 ts = _mm_or_pd( _mm_and_pd(sinMask,ts) , _mm_andnot_pd(sinMask,tsn) );
161 cosMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetCos,itwo), izero));
162 tcn = _mm_xor_pd(signbit,tc);
163 tc = _mm_or_pd( _mm_and_pd(cosMask,tc) , _mm_andnot_pd(cosMask,tcn) );
165 *sinval = ts;
166 *cosval = tc;
168 return;
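/*
 * Illustrative scalar sketch of the range reduction used by sincos_sse2double()
 * above (this helper is hypothetical and not part of the original kernels):
 * the angle is scaled by 2/pi, rounded to the nearest quadrant, and the
 * remainder is formed with the split constant kc1+kc2 ~= pi/2 so the
 * subtraction loses little precision.  The library sin()/cos() calls stand in
 * for the minimax polynomials evaluated in the SSE2 code.
 */
static void sincos_scalar_sketch(double x, double *sinval, double *cosval)
{
    const double kc1 = 13176794.0 / 8388608.0;                    /* high part of pi/2 */
    const double kc2 = 7.5497899548918821691639751442098584e-8;   /* low part of pi/2  */
    double xl = x * (2.0 / M_PI);
    int    q  = (int) (xl + (xl >= 0.0 ? 0.5 : -0.5));             /* nearest quadrant  */
    double r  = (x - q * kc1) - q * kc2;                           /* remainder, |r| <= pi/4 */
    double s  = sin(r);
    double c  = cos(r);

    switch (q & 3)
    {
        case 0:  *sinval =  s; *cosval =  c; break;
        case 1:  *sinval =  c; *cosval = -s; break;
        case 2:  *sinval = -s; *cosval = -c; break;
        default: *sinval = -c; *cosval =  s; break;
    }
}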
171 __m128d log_pd(__m128d x)
173 const __m128i exp_mask = _mm_set_epi32(0x7FF00000,0,0x7FF00000,0);
174 const __m128i exp_bias = _mm_set_epi32(0,1023,0,1023);
176 const __m128d const_loge = _mm_set1_pd(0.69314718055994529);
177 const __m128d const_one = _mm_set1_pd(1.0);
178 const __m128d const_two = _mm_set1_pd(2.0);
180 /* Almost full single precision accuracy (~20 bits worst case) */
181 const __m128d P0 = _mm_set1_pd(6.108179944792157749153050);
182 const __m128d P1 = _mm_set1_pd(52.43691313715523327631139);
183 const __m128d P2 = _mm_set1_pd(71.53664010795613671168440);
184 const __m128d P3 = _mm_set1_pd(18.92097516931559547548485);
185 const __m128d P4 = _mm_set1_pd(0.3504714784635941984522153);
186 const __m128d P5 = _mm_set1_pd(-0.007105890734229368515879);
187 const __m128d Q1 = _mm_set1_pd(17.73314231909420567454406);
188 const __m128d Q2 = _mm_set1_pd(48.82373085428713023213363);
189 const __m128d Q3 = _mm_set1_pd(31.65945943354513166309101);
190 const __m128d Q4 = _mm_set1_pd(4.302477477108162270199051);
192 __m128d xmm0,xmm1,xmm2,xmm3, xmm4;
193 __m128i xmmi,xmmj;
195 xmmi = gmx_castpd_si128(x);
196 xmm1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(_mm_sub_epi64(_mm_srli_epi64(_mm_and_si128(xmmi, exp_mask), 52), exp_bias),_MM_SHUFFLE(3,1,2,0)));
197 xmm0 = _mm_or_pd(gmx_castsi128_pd(_mm_andnot_si128(exp_mask, xmmi)), const_one);
199 xmm2 = _mm_mul_pd(P5,xmm0);
200 xmm2 = _mm_add_pd(xmm2,P4);
201 xmm2 = _mm_mul_pd(xmm2,xmm0);
202 xmm2 = _mm_add_pd(xmm2,P3);
203 xmm2 = _mm_mul_pd(xmm2,xmm0);
204 xmm2 = _mm_add_pd(xmm2,P2);
205 xmm2 = _mm_mul_pd(xmm2,xmm0);
206 xmm2 = _mm_add_pd(xmm2,P1);
207 xmm2 = _mm_mul_pd(xmm2,xmm0);
208 xmm2 = _mm_add_pd(xmm2,P0);
210 xmm3 = _mm_mul_pd(Q4,xmm0);
211 xmm3 = _mm_add_pd(xmm3,Q3);
212 xmm3 = _mm_mul_pd(xmm3,xmm0);
213 xmm3 = _mm_add_pd(xmm3,Q2);
214 xmm3 = _mm_mul_pd(xmm3,xmm0);
215 xmm3 = _mm_add_pd(xmm3,Q1);
216 xmm3 = _mm_mul_pd(xmm3,xmm0);
217 xmm3 = _mm_add_pd(xmm3,const_one);
219 /* xmm4=1.0/xmm3 */
220 xmm4 = _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(xmm3)));
221 xmm4 = _mm_mul_pd(xmm4,_mm_sub_pd(const_two,_mm_mul_pd(xmm3,xmm4)));
222 xmm4 = _mm_mul_pd(xmm4,_mm_sub_pd(const_two,_mm_mul_pd(xmm3,xmm4)));
223 xmm2 = _mm_mul_pd(xmm2,xmm4);
225 xmm0 = _mm_sub_pd(xmm0, const_one);
226 xmm0 = _mm_mul_pd(xmm0,xmm2);
228 xmm0 = _mm_add_pd(xmm0,xmm1);
230 return _mm_mul_pd(xmm0, const_loge);
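/*
 * Illustrative scalar sketch of the decomposition used by log_pd() above
 * (hypothetical helper, not part of the original kernels): split x into a
 * mantissa m in [1,2) and an exponent e, approximate log2(m) (a rational
 * minimax fit P/Q in the SSE2 code, frexp()/log2() here) and scale by ln(2).
 */
static double log_scalar_sketch(double x)
{
    int    e;
    double m = 2.0 * frexp(x, &e);  /* x = m * 2^(e-1), with m in [1,2) */
    return ((double) (e - 1) + log2(m)) * 0.69314718055994529; /* * ln(2) */
}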
233 /* This exp() routine provides accuracy of about 1e-9 to 1e-11.
234  * The polynomial minimax coefficients are actually accurate to about 1e-14,
235  * but we lose some accuracy in the polynomial evaluation. */
237 __m128d exp_pd(__m128d x)
239 const __m128d lim1 = _mm_set1_pd(1025.0); /* 1025.00000e+0d */
240 const __m128d lim2 = _mm_set1_pd(-1022.99999999); /* -1022.99999e+0f */
242 const __m128i base = _mm_set_epi32(0,0,1023,1023);
243 const __m128d half = _mm_set1_pd(0.5);
244 const __m128d log2e = _mm_set1_pd(1.4426950408889634);
246 const __m128d exp_P0 = _mm_set1_pd(1.00000000000001276211229749);
247 const __m128d exp_P1 = _mm_set1_pd(6.931471805598709708635169E-1);
248 const __m128d exp_P2 = _mm_set1_pd(2.402265069564965287455972E-1);
249 const __m128d exp_P3 = _mm_set1_pd(5.550410866868561155599683E-2);
250 const __m128d exp_P4 = _mm_set1_pd(9.618129192067246128919915E-3);
251 const __m128d exp_P5 = _mm_set1_pd(1.333355761760444302342084E-3);
252 const __m128d exp_P6 = _mm_set1_pd(1.540343494807179111289781E-4);
253 const __m128d exp_P7 = _mm_set1_pd(1.525298483865349629325421E-5);
254 const __m128d exp_P8 = _mm_set1_pd(1.325940560934510417818578E-6);
255 const __m128d exp_P9 = _mm_set1_pd(1.015033670529892589443421E-7);
257 __m128d xmm0,xmm1;
258 __m128i xmmi;
260 xmm0 = _mm_mul_pd(x,log2e);
261 xmm0 = _mm_min_pd(xmm0,lim1);
262 xmm0 = _mm_max_pd(xmm0,lim2);
263 xmm1 = _mm_sub_pd(xmm0,half);
265 xmmi = _mm_cvtpd_epi32(xmm1);
266 xmm1 = _mm_cvtepi32_pd(xmmi);
268 xmmi = _mm_add_epi32(xmmi,base);
269 xmmi = _mm_shuffle_epi32(xmmi,_MM_SHUFFLE(3,1,2,0));
270 xmmi = _mm_slli_epi64(xmmi,52);
272 xmm0 = _mm_sub_pd(xmm0,xmm1);
274 xmm1 = _mm_mul_pd(exp_P9,xmm0);
275 xmm1 = _mm_add_pd(xmm1,exp_P8);
276 xmm1 = _mm_mul_pd(xmm1,xmm0);
277 xmm1 = _mm_add_pd(xmm1,exp_P7);
278 xmm1 = _mm_mul_pd(xmm1,xmm0);
279 xmm1 = _mm_add_pd(xmm1,exp_P6);
280 xmm1 = _mm_mul_pd(xmm1,xmm0);
281 xmm1 = _mm_add_pd(xmm1,exp_P5);
282 xmm1 = _mm_mul_pd(xmm1,xmm0);
283 xmm1 = _mm_add_pd(xmm1,exp_P4);
284 xmm1 = _mm_mul_pd(xmm1,xmm0);
285 xmm1 = _mm_add_pd(xmm1,exp_P3);
286 xmm1 = _mm_mul_pd(xmm1,xmm0);
287 xmm1 = _mm_add_pd(xmm1,exp_P2);
288 xmm1 = _mm_mul_pd(xmm1,xmm0);
289 xmm1 = _mm_add_pd(xmm1,exp_P1);
290 xmm1 = _mm_mul_pd(xmm1,xmm0);
291 xmm1 = _mm_add_pd(xmm1,exp_P0);
292 xmm1 = _mm_mul_pd(xmm1,gmx_castsi128_pd(xmmi));
294 return xmm1;
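/*
 * Illustrative scalar sketch of exp_pd() above (hypothetical helper, not part
 * of the original kernels): exp(x) = 2^(x*log2(e)) is split into an integer
 * part n, handled exactly through the exponent field, and a fractional part f,
 * handled by a degree-9 polynomial in the SSE2 code (exp2() stands in for it).
 */
static double exp_scalar_sketch(double x)
{
    double y = x * 1.4426950408889634;  /* x * log2(e)                    */
    double n = floor(y);                /* exp_pd() rounds y-0.5 instead  */
    double f = y - n;                   /* fraction in [0,1)              */
    return ldexp(exp2(f), (int) n);     /* 2^f scaled by 2^n              */
}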
297 static inline __m128d
298 my_invrsq_pd(__m128d x)
300 const __m128d three = {3.0, 3.0};
301 const __m128d half = {0.5, 0.5};
303 __m128 t = _mm_rsqrt_ps(_mm_cvtpd_ps(x)); /* Convert to single precision and do _mm_rsqrt_ps() */
304 __m128d t1 = _mm_cvtps_pd(t); /* Convert back to double precision */
306 	/* First Newton-Raphson step, accuracy is now 24 bits */
307 __m128d t2 = _mm_mul_pd(half,_mm_mul_pd(t1,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t1,t1)))));
309 	/* Return second Newton-Raphson step, accuracy 48 bits */
310 return _mm_mul_pd(half,_mm_mul_pd(t2,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t2,t2)))));
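/*
 * Scalar form of the Newton-Raphson refinement above (hypothetical helper,
 * shown only to document the math): starting from the ~12-bit _mm_rsqrt_ps()
 * estimate y0 for 1/sqrt(x), each step y <- 0.5*y*(3 - x*y*y) roughly doubles
 * the number of correct bits, so two steps reach ~48 bits.
 */
static double invsqrt_nr_sketch(double x, double y0)
{
    double y1 = 0.5 * y0 * (3.0 - x * y0 * y0); /* ~24 bits */
    return     0.5 * y1 * (3.0 - x * y1 * y1);  /* ~48 bits */
}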
313 static inline __m128d
314 my_inv_pd(__m128d x)
316 const __m128d two = {2.0, 2.0};
318 __m128 t = _mm_rcp_ps(_mm_cvtpd_ps(x));
319 __m128d t1 = _mm_cvtps_pd(t);
320 __m128d t2 = _mm_mul_pd(t1,_mm_sub_pd(two,_mm_mul_pd(t1,x)));
322 return _mm_mul_pd(t2,_mm_sub_pd(two,_mm_mul_pd(t2,x)));
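/*
 * Matching scalar form for the reciprocal above (hypothetical helper):
 * from the _mm_rcp_ps() estimate y0 for 1/x, each step y <- y*(2 - x*y)
 * doubles the accuracy.
 */
static double inv_nr_sketch(double x, double y0)
{
    double y1 = y0 * (2.0 - x * y0);
    return     y1 * (2.0 - x * y1);
}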
327 int calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,int natoms, gmx_localtop_t *top,
328 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md)
330 int i,k,n,ai,ai3,aj1,aj2,aj13,aj23;
331 int at0,at1,nj0,nj1,offset,taj1,taj2;
332 int shift;
334 double factor,gpi_ai,gpi_tmp,gpi2;
335 double shX,shY,shZ;
337 __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
338 __m128d t1,t2,t3,rsq11,rinv,rinv2,rinv4,rinv6;
339 __m128d ratio,gpi,rai,raj,vaj,rvdw,mask_cmp;
340 __m128d ccf,dccf,theta,cosq,term,sinq,res,prod;
341 __m128d xmm1,xmm2,xmm3,xmm4,xmm7,vai,prod_ai,icf4,icf6;
343 const __m128d half = {0.5, 0.5};
344 const __m128d three = {3.0, 3.0};
345 const __m128d one = {1.0, 1.0};
346 const __m128d two = {2.0, 2.0};
347 const __m128d zero = {0.0, 0.0};
348 const __m128d four = {4.0, 4.0};
350 const __m128d p5inv = {STILL_P5INV, STILL_P5INV};
351 const __m128d pip5 = {STILL_PIP5, STILL_PIP5};
352 const __m128d p4 = {STILL_P4, STILL_P4};
354 factor = 0.5 * ONE_4PI_EPS0;
355 n = 0;
357 /* Keep the compiler happy */
358 raj = _mm_setzero_pd();
359 vaj = _mm_setzero_pd();
360 jx = _mm_setzero_pd();
361 jy = _mm_setzero_pd();
362 jz = _mm_setzero_pd();
363 xmm2 = _mm_setzero_pd();
364 xmm7 = _mm_setzero_pd();
366 for(i=0;i<born->nr;i++)
368 born->gpol_still_work[i]=0;
371 for(i=0;i<nl->nri;i++)
373 ai = nl->iinr[i];
374 ai3 = ai * 3;
376 nj0 = nl->jindex[i];
377 nj1 = nl->jindex[i+1];
379 /* Load shifts for this list */
380 shift = nl->shift[i];
381 shX = fr->shift_vec[shift][0];
382 shY = fr->shift_vec[shift][1];
383 shZ = fr->shift_vec[shift][2];
385 /* Splat the shifts */
386 sX = _mm_load1_pd(&shX);
387 sY = _mm_load1_pd(&shY);
388 sZ = _mm_load1_pd(&shZ);
390 offset = (nj1-nj0)%2;
392 /* Polarization energy for atom ai */
393 gpi = _mm_setzero_pd();
395 /* Load particle ai coordinates and add shifts */
396 ix = _mm_load1_pd(x+ai3);
397 iy = _mm_load1_pd(x+ai3+1);
398 iz = _mm_load1_pd(x+ai3+2);
400 ix = _mm_add_pd(sX,ix);
401 iy = _mm_add_pd(sY,iy);
402 iz = _mm_add_pd(sZ,iz);
404 /* Load particle ai gb_radius */
405 rai = _mm_set1_pd(top->atomtypes.gb_radius[md->typeA[ai]]);
406 vai = _mm_set1_pd(born->vsolv[ai]);
407 prod_ai = _mm_mul_pd(p4,vai);
409 for(k=nj0;k<nj1-offset;k+=2)
411 aj1 = nl->jjnr[k];
412 aj2 = nl->jjnr[k+1];
414 aj13 = aj1 * 3;
415 aj23 = aj2 * 3;
417 taj1 = md->typeA[aj1];
418 taj2 = md->typeA[aj2];
420 /* Load particle aj1-2 coordinates and compute ai->aj distances */
421 xmm1 = _mm_loadu_pd(x+aj13);
422 xmm2 = _mm_loadu_pd(x+aj23);
423 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
424 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
426 jz = _mm_loadl_pd(jz,x+aj13+2);
427 jz = _mm_loadh_pd(jz,x+aj23+2);
429 dx = _mm_sub_pd(ix,jx);
430 dy = _mm_sub_pd(iy,jy);
431 dz = _mm_sub_pd(iz,jz);
433 rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
434 rinv = my_invrsq_pd(rsq11);
436 rinv2 = _mm_mul_pd(rinv,rinv);
437 rinv4 = _mm_mul_pd(rinv2,rinv2);
438 rinv6 = _mm_mul_pd(rinv4,rinv2);
440 vaj = _mm_loadl_pd(vaj,born->vsolv+aj1);
441 vaj = _mm_loadh_pd(vaj,born->vsolv+aj2);
443 raj = _mm_loadl_pd(raj, top->atomtypes.gb_radius+taj1);
444 raj = _mm_loadh_pd(raj, top->atomtypes.gb_radius+taj2);
446 rvdw = _mm_add_pd(rai,raj);
447 rvdw = _mm_mul_pd(rvdw,rvdw);
448 ratio = _mm_div_pd(rsq11,rvdw);
450 mask_cmp = _mm_cmpgt_pd(ratio,p5inv);
452 switch(_mm_movemask_pd(mask_cmp))
454 			case 0x3: /* both elements beyond the cutoff; _mm_movemask_pd() only sets bits 0-1 */
455 ccf = one;
456 dccf = zero;
457 break;
458 default:
459 theta = _mm_mul_pd(ratio,pip5);
460 sincos_sse2double(theta,&sinq,&cosq);
462 term = _mm_sub_pd(one,cosq);
463 term = _mm_mul_pd(half,term);
464 ccf = _mm_mul_pd(term,term);
465 dccf = _mm_mul_pd(two,term);
466 dccf = _mm_mul_pd(dccf,sinq);
467 dccf = _mm_mul_pd(dccf,pip5);
468 dccf = _mm_mul_pd(dccf,ratio);
470 ccf = _mm_or_pd(_mm_and_pd(mask_cmp,one) ,_mm_andnot_pd(mask_cmp,ccf));
471 dccf = _mm_or_pd(_mm_and_pd(mask_cmp,zero) ,_mm_andnot_pd(mask_cmp,dccf));
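			/* ccf/dccf now hold the Still close-contact scaling: inside the cutoff        */
			/* (ratio <= 1/P5) the switching function is ccf = ((1 - cos(ratio*PIP5))/2)^2, */
			/* with dccf the corresponding derivative factor used in the chain rule;        */
			/* outside the cutoff ccf = 1 and dccf = 0 (selected via mask_cmp above).       */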
474 prod = _mm_mul_pd(p4,vaj);
475 icf4 = _mm_mul_pd(ccf,rinv4);
476 xmm2 = _mm_mul_pd(icf4,prod);
477 xmm3 = _mm_mul_pd(icf4,prod_ai);
478 gpi = _mm_add_pd(gpi,xmm2);
480 			/* Load, add and store atom aj gpol energy */
481 xmm7 = _mm_loadl_pd(xmm7,born->gpol_still_work+aj1);
482 xmm7 = _mm_loadh_pd(xmm7,born->gpol_still_work+aj2);
484 xmm3 = _mm_add_pd(xmm7,xmm3);
486 _mm_storel_pd(born->gpol_still_work+aj1,xmm3);
487 _mm_storeh_pd(born->gpol_still_work+aj2,xmm3);
489 /* Chain rule terms */
490 ccf = _mm_mul_pd(four,ccf);
491 xmm3 = _mm_sub_pd(ccf,dccf);
492 icf6 = _mm_mul_pd(xmm3,rinv6);
493 xmm1 = _mm_mul_pd(icf6,prod);
494 xmm2 = _mm_mul_pd(icf6,prod_ai);
496 			/* As in single precision, we shuffle the results to change the order of
497 			 * the interactions from ai->aj1, ai->aj2 to ai->aj1, aj1->ai, ai->aj2, aj2->ai,
498 			 * so that 2 stores suffice instead of 4. */
500 xmm3 = _mm_unpacklo_pd(xmm1,xmm2);
501 xmm4 = _mm_unpackhi_pd(xmm1,xmm2);
503 _mm_storeu_pd(fr->dadx+n,xmm3);
504 n = n + 2;
505 _mm_storeu_pd(fr->dadx+n,xmm4);
506 n = n + 2;
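			/* Resulting dadx layout for this j-pair (read back in the same interleaved   */
			/* order in calc_gb_chainrule_sse2_double() below):                           */
			/*   dadx[n-4] : term paired with ai (for aj1)   dadx[n-3] : term for aj1     */
			/*   dadx[n-2] : term paired with ai (for aj2)   dadx[n-1] : term for aj2     */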
509 /* Deal with odd elements */
510 if(offset!=0)
512 aj1 = nl->jjnr[k];
513 aj13 = aj1 * 3;
514 taj1 = md->typeA[aj1];
516 jx = _mm_load_sd(x+aj13);
517 jy = _mm_load_sd(x+aj13+1);
518 jz = _mm_load_sd(x+aj13+2);
520 raj = _mm_load_sd(top->atomtypes.gb_radius+taj1);
521 vaj = _mm_load_sd(born->vsolv+aj1);
523 dx = _mm_sub_sd(ix,jx);
524 dy = _mm_sub_pd(iy,jy);
525 dz = _mm_sub_pd(iz,jz);
527 rsq11 = _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx,dx) , _mm_mul_sd(dy,dy) ) , _mm_mul_sd(dz,dz) );
528 rinv = my_invrsq_pd(rsq11);
530 rinv2 = _mm_mul_sd(rinv,rinv);
531 rinv4 = _mm_mul_sd(rinv2,rinv2);
532 rinv6 = _mm_mul_sd(rinv4,rinv2);
534 rvdw = _mm_add_sd(rai,raj);
535 rvdw = _mm_mul_sd(rvdw,rvdw);
536 ratio = _mm_div_sd(rsq11,rvdw);
538 mask_cmp = _mm_cmpgt_sd(ratio,p5inv);
540 switch(_mm_movemask_pd(mask_cmp))
542 				case 0x3: /* _mm_movemask_pd() only sets bits 0-1 */
543 ccf = one;
544 dccf = zero;
545 break;
546 default:
547 theta = _mm_mul_sd(ratio,pip5);
548 sincos_sse2double(theta,&sinq,&cosq);
550 term = _mm_sub_sd(one,cosq);
551 term = _mm_mul_sd(half,term);
552 ccf = _mm_mul_sd(term,term);
553 dccf = _mm_mul_sd(two,term);
554 dccf = _mm_mul_sd(dccf,sinq);
555 dccf = _mm_mul_sd(dccf,pip5);
556 dccf = _mm_mul_sd(dccf,ratio);
558 ccf = _mm_or_pd(_mm_and_pd(mask_cmp,one) ,_mm_andnot_pd(mask_cmp,ccf));
559 dccf = _mm_or_pd(_mm_and_pd(mask_cmp,zero) ,_mm_andnot_pd(mask_cmp,dccf));
562 prod = _mm_mul_sd(p4,vaj);
563 icf4 = _mm_mul_sd(ccf,rinv4);
564 xmm2 = _mm_mul_sd(icf4,prod);
565 xmm3 = _mm_mul_sd(icf4,prod_ai);
566 gpi = _mm_add_sd(gpi,xmm2);
568 			/* Load, add and store atom aj gpol energy */
569 xmm7 = _mm_load_sd(born->gpol_still_work+aj1);
570 xmm3 = _mm_add_sd(xmm7,xmm3);
571 _mm_store_sd(born->gpol_still_work+aj1,xmm3);
573 /* Chain rule terms */
574 ccf = _mm_mul_sd(four,ccf);
575 xmm3 = _mm_sub_sd(ccf,dccf);
576 icf6 = _mm_mul_sd(xmm3,rinv6);
577 xmm1 = _mm_mul_sd(icf6,prod);
578 xmm2 = _mm_mul_sd(icf6,prod_ai);
580 /* Here we only have ai->aj1 and aj1->ai, so we can store directly */
581 _mm_storel_pd(fr->dadx+n,xmm1);
582 n = n + 1;
583 _mm_storel_pd(fr->dadx+n,xmm2);
584 n = n + 1;
585 } /* End offset */
587 /* Do end processing ... */
588 xmm2 = _mm_unpacklo_pd(xmm2,gpi);
589 gpi = _mm_add_pd(gpi,xmm2);
590 gpi = _mm_shuffle_pd(gpi,gpi,_MM_SHUFFLE2(1,1));
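		/* The unpack/add/shuffle above is a horizontal add of the two lanes of gpi;   */
		/* after the shuffle the total sits in both elements, ready for the scalar     */
		/* load/add/store below.                                                       */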
592 /* Load, add and store atom ai polarisation energy */
593 xmm2 = _mm_load_sd(born->gpol_still_work+ai);
594 gpi = _mm_add_sd(gpi,xmm2);
595 _mm_store_sd(born->gpol_still_work+ai,gpi);
598 /* Parallel summations */
599 if(PARTDECOMP(cr))
601 gmx_sum(natoms,born->gpol_still_work, cr);
603 else if(DOMAINDECOMP(cr))
605 dd_atom_sum_real(cr->dd, born->gpol_still_work);
608 /* Compute the radii */
609 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
611 if(born->use[i] != 0)
613 gpi_ai = born->gpol[i] + born->gpol_still_work[i];
614 gpi2 = gpi_ai*gpi_ai;
616 born->bRad[i]=factor*gmx_invsqrt(gpi2);
617 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
621 	/* Extra (local) communication required for DD */
622 if(DOMAINDECOMP(cr))
624 dd_atom_spread_real(cr->dd, born->bRad);
625 dd_atom_spread_real(cr->dd, fr->invsqrta);
628 return 0;
631 int
632 calc_gb_rad_hct_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
633 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md)
635 int i,k,n,ai,ai3,aj1,aj2,aj13,aj23,nj0,nj1,at0,at1,offset;
636 int p1, p2, shift;
637 double rr,sum,sum_tmp,min_rad,rad,doff;
638 double shX,shY,shZ;
640 __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
641 __m128d t1,t2,t3,rsq11,rinv,r,rai;
642 __m128d rai_inv,sk,sk2,lij,dlij,duij;
643 __m128d uij,lij2,uij2,lij3,uij3,diff2;
644 __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
645 __m128d mask_cmp,mask_cmp2,mask_cmp3;
646 __m128d xmm1,xmm2,xmm3,xmm4,xmm7,xmm8,xmm9,doffset;
647 __m128d sum_ai,chrule,chrule_ai,tmp_ai;
648 __m128d sk_ai, sk2_ai,raj,raj_inv;
650 const __m128d neg = {-1.0, -1.0};
651 const __m128d zero = {0.0, 0.0};
652 const __m128d eigth = {0.125, 0.125};
653 const __m128d qrtr = {0.25, 0.25};
654 const __m128d half = {0.5, 0.5};
655 const __m128d one = {1.0, 1.0};
656 const __m128d two = {2.0, 2.0};
657 const __m128d three = {3.0, 3.0};
659 /* Keep the compiler happy */
660 tmp_ai = _mm_setzero_pd();
661 tmp = _mm_setzero_pd();
662 xmm7 = _mm_setzero_pd();
663 xmm8 = _mm_setzero_pd();
664 raj_inv = _mm_setzero_pd();
665 raj = _mm_setzero_pd();
666 sk = _mm_setzero_pd();
667 jx = _mm_setzero_pd();
668 jy = _mm_setzero_pd();
669 jz = _mm_setzero_pd();
671 /* Set the dielectric offset */
672 doff = born->gb_doffset;
673 doffset = _mm_load1_pd(&doff);
674 n = 0;
676 for(i=0;i<born->nr;i++)
678 born->gpol_hct_work[i] = 0;
681 for(i=0;i<nl->nri;i++)
683 ai = nl->iinr[i];
684 ai3 = ai * 3;
686 nj0 = nl->jindex[i];
687 nj1 = nl->jindex[i+1];
689 /* Load shifts for this list */
690 shift = nl->shift[i];
691 shX = fr->shift_vec[shift][0];
692 shY = fr->shift_vec[shift][1];
693 shZ = fr->shift_vec[shift][2];
695 /* Splat the shifts */
696 sX = _mm_load1_pd(&shX);
697 sY = _mm_load1_pd(&shY);
698 sZ = _mm_load1_pd(&shZ);
700 offset = (nj1-nj0)%2;
702 /* Load rai */
703 rr = top->atomtypes.gb_radius[md->typeA[ai]]-doff;
704 rai = _mm_load1_pd(&rr);
705 rr = 1.0/rr;
706 rai_inv = _mm_load1_pd(&rr);
708 /* Zero out sums for polarisation energies */
709 sum_ai = _mm_setzero_pd();
711 /* Load ai coordinates and add shifts */
712 ix = _mm_load1_pd(x+ai3);
713 iy = _mm_load1_pd(x+ai3+1);
714 iz = _mm_load1_pd(x+ai3+2);
716 ix = _mm_add_pd(sX,ix);
717 iy = _mm_add_pd(sY,iy);
718 iz = _mm_add_pd(sZ,iz);
720 sk_ai = _mm_load1_pd(born->param+ai);
721 sk2_ai = _mm_mul_pd(sk_ai,sk_ai);
723 for(k=nj0;k<nj1-offset;k+=2)
725 aj1 = nl->jjnr[k];
726 aj2 = nl->jjnr[k+1];
728 aj13 = aj1 * 3;
729 aj23 = aj2 * 3;
731 /* Load particle aj1-2 coordinates */
732 xmm1 = _mm_loadu_pd(x+aj13);
733 xmm2 = _mm_loadu_pd(x+aj23);
734 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
735 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
737 jz = _mm_loadl_pd(jz, x+aj13+2);
738 jz = _mm_loadh_pd(jz, x+aj23+2);
740 dx = _mm_sub_pd(ix,jx);
741 dy = _mm_sub_pd(iy,jy);
742 dz = _mm_sub_pd(iz,jz);
744 rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
745 rinv = my_invrsq_pd(rsq11);
746 r = _mm_mul_pd(rinv,rsq11);
748 sk = _mm_loadl_pd(sk,born->param+aj1);
749 sk = _mm_loadh_pd(sk,born->param+aj2);
751 /* Load aj1,aj2 raj */
752 p1 = md->typeA[aj1];
753 p2 = md->typeA[aj2];
755 raj = _mm_loadl_pd(raj,top->atomtypes.gb_radius+p1);
756 raj = _mm_loadh_pd(raj,top->atomtypes.gb_radius+p2);
757 raj = _mm_sub_pd(raj,doffset);
759 /* Compute 1.0/raj */
760 raj_inv = my_inv_pd(raj);
762 			/* INTERACTION aj->ai STARTS HERE */
763 /* conditional mask for rai<dr+sk */
764 xmm1 = _mm_add_pd(r,sk);
765 mask_cmp = _mm_cmplt_pd(rai,xmm1);
767 /* conditional for rai>dr-sk, ends with mask_cmp2 */
768 xmm2 = _mm_sub_pd(r,sk);
769 xmm3 = my_inv_pd(xmm2);
770 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
772 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
773 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
775 uij = my_inv_pd(xmm1);
776 lij2 = _mm_mul_pd(lij,lij);
777 lij3 = _mm_mul_pd(lij2,lij);
778 uij2 = _mm_mul_pd(uij,uij);
779 uij3 = _mm_mul_pd(uij2,uij);
781 diff2 = _mm_sub_pd(uij2,lij2);
783 lij_inv = my_invrsq_pd(lij2);
784 sk2 = _mm_mul_pd(sk,sk);
785 sk2_inv = _mm_mul_pd(sk2,rinv);
786 prod = _mm_mul_pd(qrtr,sk2_inv);
788 log_term = _mm_mul_pd(uij,lij_inv);
789 log_term = log_pd(log_term);
791 xmm1 = _mm_sub_pd(lij,uij);
792 xmm2 = _mm_mul_pd(qrtr,r);
793 xmm2 = _mm_mul_pd(xmm2,diff2);
794 xmm1 = _mm_add_pd(xmm1,xmm2);
795 xmm2 = _mm_mul_pd(half,rinv);
796 xmm2 = _mm_mul_pd(xmm2,log_term);
797 xmm1 = _mm_add_pd(xmm1,xmm2);
798 xmm9 = _mm_mul_pd(neg,diff2);
799 xmm2 = _mm_mul_pd(xmm9,prod);
800 tmp_ai = _mm_add_pd(xmm1,xmm2);
802 			/* conditional for rai<sk-dr */
803 xmm3 = _mm_sub_pd(sk,r);
804 mask_cmp3 = _mm_cmplt_pd(rai,xmm3); /* rai<sk-dr */
806 xmm4 = _mm_sub_pd(rai_inv,lij);
807 xmm4 = _mm_mul_pd(two,xmm4);
808 xmm4 = _mm_add_pd(tmp_ai,xmm4);
810 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
812 			/* tmp_ai now contains two partial values; which of them are actually used */
813 			/* is governed by the mask_cmp mask. */
814 tmp_ai = _mm_mul_pd(half,tmp_ai);
815 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
816 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
818 xmm2 = _mm_mul_pd(half,lij2);
819 xmm3 = _mm_mul_pd(prod,lij3);
820 xmm2 = _mm_add_pd(xmm2,xmm3);
821 xmm3 = _mm_mul_pd(lij,rinv);
822 xmm4 = _mm_mul_pd(lij3,r);
823 xmm3 = _mm_add_pd(xmm3,xmm4);
824 xmm3 = _mm_mul_pd(qrtr,xmm3);
825 t1 = _mm_sub_pd(xmm2,xmm3);
827 xmm2 = _mm_mul_pd(half,uij2);
828 xmm2 = _mm_mul_pd(neg,xmm2);
829 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
830 xmm3 = _mm_mul_pd(xmm3,uij3);
831 xmm2 = _mm_sub_pd(xmm2,xmm3);
832 xmm3 = _mm_mul_pd(uij,rinv);
833 xmm4 = _mm_mul_pd(uij3,r);
834 xmm3 = _mm_add_pd(xmm3,xmm4);
835 xmm3 = _mm_mul_pd(qrtr,xmm3);
836 t2 = _mm_add_pd(xmm2,xmm3);
838 xmm2 = _mm_mul_pd(sk2_inv,rinv);
839 xmm2 = _mm_add_pd(one,xmm2);
840 xmm2 = _mm_mul_pd(eigth,xmm2);
841 xmm2 = _mm_mul_pd(xmm2,xmm9);
842 xmm3 = _mm_mul_pd(log_term, rinv);
843 xmm3 = _mm_mul_pd(xmm3,rinv);
844 xmm3 = _mm_mul_pd(qrtr,xmm3);
845 t3 = _mm_add_pd(xmm2,xmm3);
847 /* chain rule terms */
848 xmm2 = _mm_mul_pd(dlij,t1);
849 xmm2 = _mm_add_pd(xmm2,t2);
850 xmm2 = _mm_add_pd(xmm2,t3);
852 			/* Temporary storage of the chain-rule term; we still have to compute
853 			   the reciprocal (ai->aj) term before storing both */
854 chrule = _mm_mul_pd(xmm2,rinv);
856 /* INTERACTION ai->aj starts here */
857 /* Conditional mask for raj<dr+sk_ai */
858 xmm1 = _mm_add_pd(r,sk_ai);
859 mask_cmp = _mm_cmplt_pd(raj,xmm1);
861 			/* Conditional for raj>dr-sk_ai, ends with mask_cmp2 */
862 xmm2 = _mm_sub_pd(r,sk_ai);
863 xmm3 = my_inv_pd(xmm2);
864 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
866 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
867 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
869 uij = my_inv_pd(xmm1);
870 lij2 = _mm_mul_pd(lij,lij);
871 lij3 = _mm_mul_pd(lij2,lij);
872 uij2 = _mm_mul_pd(uij,uij);
873 uij3 = _mm_mul_pd(uij2,uij);
875 diff2 = _mm_sub_pd(uij2,lij2);
877 lij_inv = my_invrsq_pd(lij2);
879 sk2 = sk2_ai;
880 sk2_inv = _mm_mul_pd(sk2,rinv);
881 prod = _mm_mul_pd(qrtr,sk2_inv);
883 log_term = _mm_mul_pd(uij,lij_inv);
884 log_term = log_pd(log_term);
886 xmm1 = _mm_sub_pd(lij,uij);
887 xmm2 = _mm_mul_pd(qrtr,r);
888 xmm2 = _mm_mul_pd(xmm2,diff2);
889 xmm1 = _mm_add_pd(xmm1,xmm2);
890 xmm2 = _mm_mul_pd(half,rinv);
891 xmm2 = _mm_mul_pd(xmm2,log_term);
892 xmm1 = _mm_add_pd(xmm1,xmm2);
893 xmm9 = _mm_mul_pd(neg,diff2);
894 xmm2 = _mm_mul_pd(xmm9,prod);
895 tmp = _mm_add_pd(xmm1,xmm2);
897 			/* conditional for raj<sk_ai-dr */
898 			xmm3      = _mm_sub_pd(sk_ai,r);
899 			mask_cmp3 = _mm_cmplt_pd(raj,xmm3); /* raj<sk_ai-dr */
901 xmm4 = _mm_sub_pd(raj_inv,lij);
902 xmm4 = _mm_mul_pd(two,xmm4);
903 xmm4 = _mm_add_pd(tmp,xmm4);
905 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
907 			/* tmp now contains two partial values; which of them are actually used */
908 			/* is governed by the mask_cmp mask. */
909 tmp = _mm_mul_pd(half,tmp);
910 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
912 /* Load, add and store ai->aj pol energy */
913 xmm7 = _mm_loadl_pd(xmm7,born->gpol_hct_work+aj1);
914 xmm7 = _mm_loadh_pd(xmm7,born->gpol_hct_work+aj2);
916 xmm7 = _mm_add_pd(xmm7,tmp);
918 _mm_storel_pd(born->gpol_hct_work+aj1,xmm7);
919 _mm_storeh_pd(born->gpol_hct_work+aj2,xmm7);
921 /* Start chain rule terms */
922 xmm2 = _mm_mul_pd(half,lij2);
923 xmm3 = _mm_mul_pd(prod,lij3);
924 xmm2 = _mm_add_pd(xmm2,xmm3);
925 xmm3 = _mm_mul_pd(lij,rinv);
926 xmm4 = _mm_mul_pd(lij3,r);
927 xmm3 = _mm_add_pd(xmm3,xmm4);
928 xmm3 = _mm_mul_pd(qrtr,xmm3);
929 t1 = _mm_sub_pd(xmm2,xmm3);
931 xmm2 = _mm_mul_pd(half,uij2);
932 xmm2 = _mm_mul_pd(neg,xmm2);
933 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
934 xmm3 = _mm_mul_pd(xmm3,uij3);
935 xmm2 = _mm_sub_pd(xmm2,xmm3);
936 xmm3 = _mm_mul_pd(uij,rinv);
937 xmm4 = _mm_mul_pd(uij3,r);
938 xmm3 = _mm_add_pd(xmm3,xmm4);
939 xmm3 = _mm_mul_pd(qrtr,xmm3);
940 t2 = _mm_add_pd(xmm2,xmm3);
942 xmm2 = _mm_mul_pd(sk2_inv,rinv);
943 xmm2 = _mm_add_pd(one,xmm2);
944 xmm2 = _mm_mul_pd(eigth,xmm2);
945 xmm2 = _mm_mul_pd(xmm2,xmm9);
946 xmm3 = _mm_mul_pd(log_term, rinv);
947 xmm3 = _mm_mul_pd(xmm3,rinv);
948 xmm3 = _mm_mul_pd(qrtr,xmm3);
949 t3 = _mm_add_pd(xmm2,xmm3);
951 /* chain rule terms */
952 xmm2 = _mm_mul_pd(dlij,t1);
953 xmm2 = _mm_add_pd(xmm2,t2);
954 xmm2 = _mm_add_pd(xmm2,t3);
955 chrule_ai = _mm_mul_pd(xmm2,rinv);
957 			/* Store chain rule terms;
958 			 * same unpacking rationale as with Still above. */
960 xmm3 = _mm_unpacklo_pd(chrule, chrule_ai);
961 xmm4 = _mm_unpackhi_pd(chrule, chrule_ai);
963 _mm_storeu_pd(fr->dadx+n, xmm3);
964 n = n + 2;
965 _mm_storeu_pd(fr->dadx+n, xmm4);
966 n = n + 2;
969 if(offset!=0)
971 aj1 = nl->jjnr[k];
972 aj13 = aj1 * 3;
973 p1 = md->typeA[aj1];
975 jx = _mm_load_sd(x+aj13);
976 jy = _mm_load_sd(x+aj13+1);
977 jz = _mm_load_sd(x+aj13+2);
979 sk = _mm_load_sd(born->param+aj1);
981 dx = _mm_sub_sd(ix,jx);
982 dy = _mm_sub_pd(iy,jy);
983 dz = _mm_sub_pd(iz,jz);
985 rsq11 = _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx,dx) , _mm_mul_sd(dy,dy) ) , _mm_mul_sd(dz,dz) );
986 rinv = my_invrsq_pd(rsq11);
987 r = _mm_mul_sd(rinv,rsq11);
989 /* Load raj */
990 raj = _mm_load_sd(top->atomtypes.gb_radius+p1);
991 raj = _mm_sub_sd(raj,doffset);
992 raj_inv = my_inv_pd(raj);
994 			/* OFFSET INTERACTION aj->ai STARTS HERE */
995 /* conditional mask for rai<dr+sk */
996 xmm1 = _mm_add_sd(r,sk);
997 mask_cmp = _mm_cmplt_sd(rai,xmm1);
999 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1000 xmm2 = _mm_sub_sd(r,sk);
1001 xmm3 = my_inv_pd(xmm2);
1002 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
1004 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1005 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1007 uij = my_inv_pd(xmm1);
1008 lij2 = _mm_mul_sd(lij,lij);
1009 lij3 = _mm_mul_sd(lij2,lij);
1010 uij2 = _mm_mul_sd(uij,uij);
1011 uij3 = _mm_mul_sd(uij2,uij);
1013 diff2 = _mm_sub_sd(uij2,lij2);
1015 lij_inv = my_invrsq_pd(lij2);
1016 sk2 = _mm_mul_sd(sk,sk);
1017 sk2_inv = _mm_mul_sd(sk2,rinv);
1018 prod = _mm_mul_sd(qrtr,sk2_inv);
1020 log_term = _mm_mul_pd(uij,lij_inv);
1021 log_term = log_pd(log_term);
1023 xmm1 = _mm_sub_sd(lij,uij);
1024 xmm2 = _mm_mul_sd(qrtr,r);
1025 xmm2 = _mm_mul_sd(xmm2,diff2);
1026 xmm1 = _mm_add_sd(xmm1,xmm2);
1027 xmm2 = _mm_mul_sd(half,rinv);
1028 xmm2 = _mm_mul_sd(xmm2,log_term);
1029 xmm1 = _mm_add_sd(xmm1,xmm2);
1030 xmm9 = _mm_mul_sd(neg,diff2);
1031 xmm2 = _mm_mul_sd(xmm9,prod);
1033 tmp_ai = _mm_add_sd(xmm1,xmm2);
1035 			/* conditional for rai<sk-dr */
1036 xmm3 = _mm_sub_sd(sk,r);
1037 mask_cmp3 = _mm_cmplt_sd(rai,xmm3); /* rai<sk-dr */
1039 xmm4 = _mm_sub_sd(rai_inv,lij);
1040 xmm4 = _mm_mul_sd(two,xmm4);
1041 xmm4 = _mm_add_sd(tmp_ai,xmm4);
1043 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
1045 			/* tmp_ai now contains two partial values; which of them are actually used */
1046 			/* is governed by the mask_cmp mask. */
1047 tmp_ai = _mm_mul_pd(half,tmp_ai);
1048 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1049 sum_ai = _mm_add_sd(sum_ai,tmp_ai);
1051 xmm2 = _mm_mul_sd(half,lij2);
1052 xmm3 = _mm_mul_sd(prod,lij3);
1053 xmm2 = _mm_add_sd(xmm2,xmm3);
1054 xmm3 = _mm_mul_sd(lij,rinv);
1055 xmm4 = _mm_mul_sd(lij3,r);
1056 xmm3 = _mm_add_sd(xmm3,xmm4);
1057 xmm3 = _mm_mul_sd(qrtr,xmm3);
1058 t1 = _mm_sub_sd(xmm2,xmm3);
1060 xmm2 = _mm_mul_sd(half,uij2);
1061 xmm2 = _mm_mul_sd(neg,xmm2);
1062 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1063 xmm3 = _mm_mul_sd(xmm3,uij3);
1064 xmm2 = _mm_sub_sd(xmm2,xmm3);
1065 xmm3 = _mm_mul_sd(uij,rinv);
1066 xmm4 = _mm_mul_sd(uij3,r);
1067 xmm3 = _mm_add_sd(xmm3,xmm4);
1068 xmm3 = _mm_mul_sd(qrtr,xmm3);
1069 t2 = _mm_add_sd(xmm2,xmm3);
1071 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1072 xmm2 = _mm_add_sd(one,xmm2);
1073 xmm2 = _mm_mul_sd(eigth,xmm2);
1074 xmm2 = _mm_mul_sd(xmm2,xmm9);
1075 xmm3 = _mm_mul_sd(log_term, rinv);
1076 xmm3 = _mm_mul_sd(xmm3,rinv);
1077 xmm3 = _mm_mul_sd(qrtr,xmm3);
1078 t3 = _mm_add_sd(xmm2,xmm3);
1080 /* chain rule terms */
1081 xmm2 = _mm_mul_sd(dlij,t1);
1082 xmm2 = _mm_add_sd(xmm2,t2);
1083 xmm2 = _mm_add_sd(xmm2,t3);
1085 			/* Temporary storage of the chain-rule term; we still have to compute
1086 			   the reciprocal (ai->aj) term before storing both */
1087 chrule = _mm_mul_sd(xmm2,rinv);
1089 /* OFFSET INTERACTION ai->aj starts here */
1090 			/* conditional mask for raj<dr+sk_ai */
1091 xmm1 = _mm_add_sd(r,sk_ai);
1092 mask_cmp = _mm_cmplt_sd(raj,xmm1);
1094 			/* conditional for raj>dr-sk_ai, ends with mask_cmp2 */
1095 xmm2 = _mm_sub_sd(r,sk_ai);
1096 xmm3 = my_inv_pd(xmm2);
1097 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
1099 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1100 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1102 uij = my_inv_pd(xmm1);
1103 lij2 = _mm_mul_sd(lij,lij);
1104 lij3 = _mm_mul_sd(lij2,lij);
1105 uij2 = _mm_mul_sd(uij,uij);
1106 uij3 = _mm_mul_sd(uij2,uij);
1108 diff2 = _mm_sub_sd(uij2,lij2);
1110 lij_inv = my_invrsq_pd(lij2);
1112 sk2 = sk2_ai;
1113 sk2_inv = _mm_mul_sd(sk2,rinv);
1114 prod = _mm_mul_sd(qrtr,sk2_inv);
1116 log_term = _mm_mul_pd(uij,lij_inv);
1117 log_term = log_pd(log_term);
1119 xmm1 = _mm_sub_sd(lij,uij);
1120 xmm2 = _mm_mul_sd(qrtr,r);
1121 xmm2 = _mm_mul_sd(xmm2,diff2);
1122 xmm1 = _mm_add_sd(xmm1,xmm2);
1123 xmm2 = _mm_mul_sd(half,rinv);
1124 xmm2 = _mm_mul_sd(xmm2,log_term);
1125 xmm1 = _mm_add_sd(xmm1,xmm2);
1126 xmm9 = _mm_mul_sd(neg,diff2);
1127 xmm2 = _mm_mul_sd(xmm9,prod);
1128 tmp = _mm_add_sd(xmm1,xmm2);
1130 			/* conditional for raj<sk_ai-dr */
1131 			xmm3      = _mm_sub_sd(sk_ai,r);
1132 			mask_cmp3 = _mm_cmplt_sd(raj,xmm3); /* raj<sk_ai-dr */
1134 xmm4 = _mm_sub_sd(raj_inv,lij);
1135 xmm4 = _mm_mul_sd(two,xmm4);
1136 xmm4 = _mm_add_sd(tmp,xmm4);
1138 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
1140 			/* tmp now contains two partial values; which of them are actually used */
1141 			/* is governed by the mask_cmp mask. */
1142 tmp = _mm_mul_pd(half,tmp);
1143 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1145 /* Load, add and store gpol energy */
1146 xmm7 = _mm_load_sd(born->gpol_hct_work+aj1);
1147 xmm7 = _mm_add_sd(xmm7,tmp);
1148 _mm_store_sd(born->gpol_hct_work+aj1,xmm7);
1150 /* Start chain rule terms, t1 */
1151 xmm2 = _mm_mul_sd(half,lij2);
1152 xmm3 = _mm_mul_sd(prod,lij3);
1153 xmm2 = _mm_add_sd(xmm2,xmm3);
1154 xmm3 = _mm_mul_sd(lij,rinv);
1155 xmm4 = _mm_mul_sd(lij3,r);
1156 xmm3 = _mm_add_sd(xmm3,xmm4);
1157 xmm3 = _mm_mul_sd(qrtr,xmm3);
1158 t1 = _mm_sub_sd(xmm2,xmm3);
1160 xmm2 = _mm_mul_sd(half,uij2);
1161 xmm2 = _mm_mul_sd(neg,xmm2);
1162 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1163 xmm3 = _mm_mul_sd(xmm3,uij3);
1164 xmm2 = _mm_sub_sd(xmm2,xmm3);
1165 xmm3 = _mm_mul_sd(uij,rinv);
1166 xmm4 = _mm_mul_sd(uij3,r);
1167 xmm3 = _mm_add_sd(xmm3,xmm4);
1168 xmm3 = _mm_mul_sd(qrtr,xmm3);
1169 t2 = _mm_add_sd(xmm2,xmm3);
1171 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1172 xmm2 = _mm_add_sd(one,xmm2);
1173 xmm2 = _mm_mul_sd(eigth,xmm2);
1174 xmm2 = _mm_mul_sd(xmm2,xmm9);
1175 xmm3 = _mm_mul_sd(log_term, rinv);
1176 xmm3 = _mm_mul_sd(xmm3,rinv);
1177 xmm3 = _mm_mul_sd(qrtr,xmm3);
1178 t3 = _mm_add_sd(xmm2,xmm3);
1180 /* chain rule terms */
1181 xmm2 = _mm_mul_sd(dlij,t1);
1182 xmm2 = _mm_add_sd(xmm2,t2);
1183 xmm2 = _mm_add_sd(xmm2,t3);
1184 chrule_ai = _mm_mul_sd(xmm2,rinv);
1186 _mm_store_sd(fr->dadx+n, chrule);
1187 n = n + 1;
1188 _mm_store_sd(fr->dadx+n, chrule_ai);
1189 n = n + 1;
1192 /* Do end processing ... */
1193 tmp_ai = _mm_unpacklo_pd(tmp_ai,sum_ai);
1194 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
1195 sum_ai = _mm_shuffle_pd(sum_ai,sum_ai,_MM_SHUFFLE2(1,1));
1197 /* Load, add and store atom ai polarisation energy */
1198 xmm2 = _mm_load_sd(born->gpol_hct_work+ai);
1199 sum_ai = _mm_add_sd(sum_ai,xmm2);
1200 _mm_store_sd(born->gpol_hct_work+ai,sum_ai);
1203 /* Parallel summations */
1204 if(PARTDECOMP(cr))
1206 gmx_sum(natoms, born->gpol_hct_work, cr);
1208 else if(DOMAINDECOMP(cr))
1210 dd_atom_sum_real(cr->dd, born->gpol_hct_work);
1213 /* Compute the radii */
1214 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1216 if(born->use[i] != 0)
1218 rr = top->atomtypes.gb_radius[md->typeA[i]]-doff;
1219 sum = 1.0/rr - born->gpol_hct_work[i];
1220 min_rad = rr + doff;
1221 rad = 1.0/sum;
1223 born->bRad[i] = rad > min_rad ? rad : min_rad;
1224 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1228 /* Extra (local) communication required for DD */
1229 if(DOMAINDECOMP(cr))
1231 dd_atom_spread_real(cr->dd, born->bRad);
1232 dd_atom_spread_real(cr->dd, fr->invsqrta);
1235 return 0;
1238 int
1239 calc_gb_rad_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
1240 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md)
1242 int i,k,n,ai,ai3,aj1,aj2,aj13,aj23,nj0,nj1,at0,at1,offset;
1243 int p1,p2,p3,p4,shift;
1244 double rr,sum,sum_tmp,sum2,sum3,min_rad,rad,doff;
1245 double tsum,tchain,rr_inv,rr_inv2,gbr;
1246 double shX,shY,shZ;
1248 __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
1249 __m128d t1,t2,t3,rsq11,rinv,r,rai;
1250 __m128d rai_inv,sk,sk2,lij,dlij,duij;
1251 __m128d uij,lij2,uij2,lij3,uij3,diff2;
1252 __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
1253 __m128d mask_cmp,mask_cmp2,mask_cmp3,doffset,raj,raj_inv;
1254 __m128d xmm1,xmm2,xmm3,xmm4,xmm7,xmm8,xmm9;
1255 __m128d sum_ai,chrule,chrule_ai,tmp_ai,sk_ai,sk2_ai;
1257 const __m128d neg = {-1.0, -1.0};
1258 const __m128d zero = {0.0, 0.0};
1259 const __m128d eigth = {0.125, 0.125};
1260 const __m128d qrtr = {0.25, 0.25};
1261 const __m128d half = {0.5, 0.5};
1262 const __m128d one = {1.0, 1.0};
1263 const __m128d two = {2.0, 2.0};
1264 const __m128d three = {3.0, 3.0};
1266 /* Keep the compiler happy */
1267 tmp_ai = _mm_setzero_pd();
1268 tmp = _mm_setzero_pd();
1269 raj_inv = _mm_setzero_pd();
1270 raj = _mm_setzero_pd();
1271 sk = _mm_setzero_pd();
1272 jx = _mm_setzero_pd();
1273 jy = _mm_setzero_pd();
1274 jz = _mm_setzero_pd();
1275 xmm7 = _mm_setzero_pd();
1276 xmm8 = _mm_setzero_pd();
1278 /* Set the dielectric offset */
1279 doff = born->gb_doffset;
1280 doffset = _mm_load1_pd(&doff);
1282 n = 0;
1284 for(i=0;i<born->nr;i++)
1286 born->gpol_hct_work[i] = 0;
1289 for(i=0;i<nl->nri;i++)
1291 ai = nl->iinr[i];
1292 ai3 = ai * 3;
1294 nj0 = nl->jindex[i];
1295 nj1 = nl->jindex[i+1];
1297 /* Load shifts for this list */
1298 shift = nl->shift[i];
1299 shX = fr->shift_vec[shift][0];
1300 shY = fr->shift_vec[shift][1];
1301 shZ = fr->shift_vec[shift][2];
1303 /* Splat the shifts */
1304 sX = _mm_load1_pd(&shX);
1305 sY = _mm_load1_pd(&shY);
1306 sZ = _mm_load1_pd(&shZ);
1308 offset = (nj1-nj0)%2;
1310 /* Load rai */
1311 rr = top->atomtypes.gb_radius[md->typeA[ai]]-doff;
1312 rai = _mm_load1_pd(&rr);
1313 rr = 1.0/rr;
1314 rai_inv = _mm_load1_pd(&rr);
1316 /* Load ai coordinates and add shifts */
1317 ix = _mm_load1_pd(x+ai3);
1318 iy = _mm_load1_pd(x+ai3+1);
1319 iz = _mm_load1_pd(x+ai3+2);
1321 ix = _mm_add_pd(sX,ix);
1322 iy = _mm_add_pd(sY,iy);
1323 iz = _mm_add_pd(sZ,iz);
1325 /* Zero out sums for polarisation energies */
1326 sum_ai = _mm_setzero_pd();
1328 sk_ai = _mm_load1_pd(born->param+ai);
1329 sk2_ai = _mm_mul_pd(sk_ai,sk_ai);
1331 for(k=nj0;k<nj1-offset;k+=2)
1333 aj1 = nl->jjnr[k];
1334 aj2 = nl->jjnr[k+1];
1336 aj13 = aj1 * 3;
1337 aj23 = aj2 * 3;
1339 /* Load particle aj1-2 coordinates */
1340 xmm1 = _mm_loadu_pd(x+aj13);
1341 xmm2 = _mm_loadu_pd(x+aj23);
1342 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
1343 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
1345 jz = _mm_loadl_pd(jz, x+aj13+2);
1346 jz = _mm_loadh_pd(jz, x+aj23+2);
1348 dx = _mm_sub_pd(ix,jx);
1349 dy = _mm_sub_pd(iy,jy);
1350 dz = _mm_sub_pd(iz,jz);
1352 rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
1353 rinv = my_invrsq_pd(rsq11);
1354 r = _mm_mul_pd(rinv,rsq11);
1356 /* Load atom aj1,aj2 raj */
1357 p1 = md->typeA[aj1];
1358 p2 = md->typeA[aj2];
1360 raj = _mm_loadl_pd(raj,top->atomtypes.gb_radius+p1);
1361 raj = _mm_loadh_pd(raj,top->atomtypes.gb_radius+p2);
1362 raj = _mm_sub_pd(raj,doffset);
1364 /* Compute 1.0/raj */
1365 raj_inv = my_inv_pd(raj);
1367 sk = _mm_loadl_pd(sk,born->param+aj1);
1368 sk = _mm_loadh_pd(sk,born->param+aj2);
1370 /* INTERACTION aj->ai STARTS HERE */
1371 /* conditional mask for rai<dr+sk */
1372 xmm1 = _mm_add_pd(r,sk);
1373 mask_cmp = _mm_cmplt_pd(rai,xmm1);
1375 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1376 xmm2 = _mm_sub_pd(r,sk);
1377 xmm3 = my_inv_pd(xmm2);
1378 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
1380 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1381 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1383 uij = my_inv_pd(xmm1);
1384 lij2 = _mm_mul_pd(lij,lij);
1385 lij3 = _mm_mul_pd(lij2,lij);
1386 uij2 = _mm_mul_pd(uij,uij);
1387 uij3 = _mm_mul_pd(uij2,uij);
1389 diff2 = _mm_sub_pd(uij2,lij2);
1391 lij_inv = my_invrsq_pd(lij2);
1392 sk2 = _mm_mul_pd(sk,sk);
1393 sk2_inv = _mm_mul_pd(sk2,rinv);
1394 prod = _mm_mul_pd(qrtr,sk2_inv);
1396 log_term = _mm_mul_pd(uij,lij_inv);
1397 log_term = log_pd(log_term);
1399 xmm1 = _mm_sub_pd(lij,uij);
1400 xmm2 = _mm_mul_pd(qrtr,r);
1401 xmm2 = _mm_mul_pd(xmm2,diff2);
1402 xmm1 = _mm_add_pd(xmm1,xmm2);
1403 xmm2 = _mm_mul_pd(half,rinv);
1404 xmm2 = _mm_mul_pd(xmm2,log_term);
1405 xmm1 = _mm_add_pd(xmm1,xmm2);
1406 xmm9 = _mm_mul_pd(neg,diff2);
1407 xmm2 = _mm_mul_pd(xmm9,prod);
1408 tmp_ai = _mm_add_pd(xmm1,xmm2);
1410 			/* conditional for rai<sk-dr */
1411 xmm3 = _mm_sub_pd(sk,r);
1412 mask_cmp3 = _mm_cmplt_pd(rai,xmm3); /* rai<sk-dr */
1414 xmm4 = _mm_sub_pd(rai_inv,lij);
1415 xmm4 = _mm_mul_pd(two,xmm4);
1416 xmm4 = _mm_add_pd(tmp_ai,xmm4);
1418 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
1420 			/* tmp_ai now contains two partial values; which of them are actually used */
1421 			/* is governed by the mask_cmp mask. */
1422 tmp_ai = _mm_mul_pd(half,tmp_ai);
1423 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1424 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
1426 /* Start the dadx chain rule terms */
1427 xmm2 = _mm_mul_pd(half,lij2);
1428 xmm3 = _mm_mul_pd(prod,lij3);
1429 xmm2 = _mm_add_pd(xmm2,xmm3);
1430 xmm3 = _mm_mul_pd(lij,rinv);
1431 xmm4 = _mm_mul_pd(lij3,r);
1432 xmm3 = _mm_add_pd(xmm3,xmm4);
1433 xmm3 = _mm_mul_pd(qrtr,xmm3);
1434 t1 = _mm_sub_pd(xmm2,xmm3);
1436 xmm2 = _mm_mul_pd(half,uij2);
1437 xmm2 = _mm_mul_pd(neg,xmm2);
1438 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
1439 xmm3 = _mm_mul_pd(xmm3,uij3);
1440 xmm2 = _mm_sub_pd(xmm2,xmm3);
1441 xmm3 = _mm_mul_pd(uij,rinv);
1442 xmm4 = _mm_mul_pd(uij3,r);
1443 xmm3 = _mm_add_pd(xmm3,xmm4);
1444 xmm3 = _mm_mul_pd(qrtr,xmm3);
1445 t2 = _mm_add_pd(xmm2,xmm3);
1447 xmm2 = _mm_mul_pd(sk2_inv,rinv);
1448 xmm2 = _mm_add_pd(one,xmm2);
1449 xmm2 = _mm_mul_pd(eigth,xmm2);
1450 xmm2 = _mm_mul_pd(xmm2,xmm9);
1451 xmm3 = _mm_mul_pd(log_term, rinv);
1452 xmm3 = _mm_mul_pd(xmm3,rinv);
1453 xmm3 = _mm_mul_pd(qrtr,xmm3);
1454 t3 = _mm_add_pd(xmm2,xmm3);
1456 /* chain rule terms */
1457 xmm2 = _mm_mul_pd(dlij,t1);
1458 xmm2 = _mm_add_pd(xmm2,t2);
1459 xmm2 = _mm_add_pd(xmm2,t3);
1461 			/* Temporary storage of the chain-rule term; we still have to compute
1462 			   the reciprocal (ai->aj) term before storing both */
1463 chrule = _mm_mul_pd(xmm2,rinv);
1465 /* INTERACTION ai->aj STARTS HERE */
1466 /* conditional mask for raj<dr+sk_ai */
1467 xmm1 = _mm_add_pd(r,sk_ai);
1468 mask_cmp = _mm_cmplt_pd(raj,xmm1);
1470 			/* conditional for raj>dr-sk_ai, ends with mask_cmp2 */
1471 xmm2 = _mm_sub_pd(r,sk_ai);
1472 xmm3 = my_inv_pd(xmm2);
1473 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
1475 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1476 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1478 uij = my_inv_pd(xmm1);
1479 lij2 = _mm_mul_pd(lij,lij);
1480 lij3 = _mm_mul_pd(lij2,lij);
1481 uij2 = _mm_mul_pd(uij,uij);
1482 uij3 = _mm_mul_pd(uij2,uij);
1484 diff2 = _mm_sub_pd(uij2,lij2);
1486 lij_inv = my_invrsq_pd(lij2);
1488 sk2 = sk2_ai;
1489 sk2_inv = _mm_mul_pd(sk2,rinv);
1490 prod = _mm_mul_pd(qrtr,sk2_inv);
1492 log_term = _mm_mul_pd(uij,lij_inv);
1493 log_term = log_pd(log_term);
1495 xmm1 = _mm_sub_pd(lij,uij);
1496 xmm2 = _mm_mul_pd(qrtr,r);
1497 xmm2 = _mm_mul_pd(xmm2,diff2);
1498 xmm1 = _mm_add_pd(xmm1,xmm2);
1499 xmm2 = _mm_mul_pd(half,rinv);
1500 xmm2 = _mm_mul_pd(xmm2,log_term);
1501 xmm1 = _mm_add_pd(xmm1,xmm2);
1502 xmm9 = _mm_mul_pd(neg,diff2);
1503 xmm2 = _mm_mul_pd(xmm9,prod);
1504 tmp = _mm_add_pd(xmm1,xmm2);
1506 			/* conditional for raj<sk_ai-dr */
1507 			xmm3      = _mm_sub_pd(sk_ai,r);
1508 			mask_cmp3 = _mm_cmplt_pd(raj,xmm3); /* raj<sk_ai-dr */
1510 xmm4 = _mm_sub_pd(raj_inv,lij);
1511 xmm4 = _mm_mul_pd(two,xmm4);
1512 xmm4 = _mm_add_pd(tmp,xmm4);
1514 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
1516 			/* tmp now contains two partial values; which of them are actually used */
1517 			/* is governed by the mask_cmp mask. */
1518 tmp = _mm_mul_pd(half,tmp);
1519 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1521 /* Load, add and store ai->aj pol energy */
1522 xmm7 = _mm_loadl_pd(xmm7,born->gpol_hct_work+aj1);
1523 xmm7 = _mm_loadh_pd(xmm7,born->gpol_hct_work+aj2);
1525 xmm7 = _mm_add_pd(xmm7,tmp);
1527 _mm_storel_pd(born->gpol_hct_work+aj1,xmm7);
1528 _mm_storeh_pd(born->gpol_hct_work+aj2,xmm7);
1530 /* Start dadx chain rule terms */
1531 xmm2 = _mm_mul_pd(half,lij2);
1532 xmm3 = _mm_mul_pd(prod,lij3);
1533 xmm2 = _mm_add_pd(xmm2,xmm3);
1534 xmm3 = _mm_mul_pd(lij,rinv);
1535 xmm4 = _mm_mul_pd(lij3,r);
1536 xmm3 = _mm_add_pd(xmm3,xmm4);
1537 xmm3 = _mm_mul_pd(qrtr,xmm3);
1538 t1 = _mm_sub_pd(xmm2,xmm3);
1540 xmm2 = _mm_mul_pd(half,uij2);
1541 xmm2 = _mm_mul_pd(neg,xmm2);
1542 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
1543 xmm3 = _mm_mul_pd(xmm3,uij3);
1544 xmm2 = _mm_sub_pd(xmm2,xmm3);
1545 xmm3 = _mm_mul_pd(uij,rinv);
1546 xmm4 = _mm_mul_pd(uij3,r);
1547 xmm3 = _mm_add_pd(xmm3,xmm4);
1548 xmm3 = _mm_mul_pd(qrtr,xmm3);
1549 t2 = _mm_add_pd(xmm2,xmm3);
1551 xmm2 = _mm_mul_pd(sk2_inv,rinv);
1552 xmm2 = _mm_add_pd(one,xmm2);
1553 xmm2 = _mm_mul_pd(eigth,xmm2);
1554 xmm2 = _mm_mul_pd(xmm2,xmm9);
1555 xmm3 = _mm_mul_pd(log_term, rinv);
1556 xmm3 = _mm_mul_pd(xmm3,rinv);
1557 xmm3 = _mm_mul_pd(qrtr,xmm3);
1558 t3 = _mm_add_pd(xmm2,xmm3);
1560 /* chain rule terms */
1561 xmm2 = _mm_mul_pd(dlij,t1);
1562 xmm2 = _mm_add_pd(xmm2,t2);
1563 xmm2 = _mm_add_pd(xmm2,t3);
1564 chrule_ai = _mm_mul_pd(xmm2,rinv);
1566 			/* Store chain rule terms;
1567 			 * same unpacking rationale as with Still above. */
1569 xmm3 = _mm_unpacklo_pd(chrule, chrule_ai);
1570 xmm4 = _mm_unpackhi_pd(chrule, chrule_ai);
1572 _mm_storeu_pd(fr->dadx+n, xmm3);
1573 n = n + 2;
1574 _mm_storeu_pd(fr->dadx+n, xmm4);
1575 n = n + 2;
1578 if(offset!=0)
1580 aj1 = nl->jjnr[k];
1581 aj13 = aj1 * 3;
1582 p1 = md->typeA[aj1];
1584 jx = _mm_load_sd(x+aj13);
1585 jy = _mm_load_sd(x+aj13+1);
1586 jz = _mm_load_sd(x+aj13+2);
1588 sk = _mm_load_sd(born->param+aj1);
1590 /* Load raj */
1591 raj = _mm_load_sd(top->atomtypes.gb_radius+p1);
1592 raj = _mm_sub_sd(raj,doffset);
1593 raj_inv = my_inv_pd(raj);
1595 dx = _mm_sub_sd(ix,jx);
1596 dy = _mm_sub_pd(iy,jy);
1597 dz = _mm_sub_pd(iz,jz);
1599 rsq11 = _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx,dx) , _mm_mul_sd(dy,dy) ) , _mm_mul_sd(dz,dz) );
1600 rinv = my_invrsq_pd(rsq11);
1601 r = _mm_mul_sd(rinv,rsq11);
1603 /* OFFSET INTERACTION aj->ai STARTS HERE */
1604 /* conditional mask for rai<dr+sk */
1605 xmm1 = _mm_add_sd(r,sk);
1606 mask_cmp = _mm_cmplt_sd(rai,xmm1);
1608 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1609 xmm2 = _mm_sub_sd(r,sk);
1610 xmm3 = my_inv_pd(xmm2);
1611 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
1613 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1614 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1616 uij = my_inv_pd(xmm1);
1617 lij2 = _mm_mul_sd(lij,lij);
1618 lij3 = _mm_mul_sd(lij2,lij);
1619 uij2 = _mm_mul_sd(uij,uij);
1620 uij3 = _mm_mul_sd(uij2,uij);
1622 diff2 = _mm_sub_sd(uij2,lij2);
1624 lij_inv = my_invrsq_pd(lij2);
1625 sk2 = _mm_mul_sd(sk,sk);
1626 sk2_inv = _mm_mul_sd(sk2,rinv);
1627 prod = _mm_mul_sd(qrtr,sk2_inv);
1629 log_term = _mm_mul_pd(uij,lij_inv);
1630 log_term = log_pd(log_term);
1632 xmm1 = _mm_sub_sd(lij,uij);
1633 xmm2 = _mm_mul_sd(qrtr,r);
1634 xmm2 = _mm_mul_sd(xmm2,diff2);
1635 xmm1 = _mm_add_sd(xmm1,xmm2);
1636 xmm2 = _mm_mul_sd(half,rinv);
1637 xmm2 = _mm_mul_sd(xmm2,log_term);
1638 xmm1 = _mm_add_sd(xmm1,xmm2);
1639 xmm9 = _mm_mul_sd(neg,diff2);
1640 xmm2 = _mm_mul_sd(xmm9,prod);
1642 tmp_ai = _mm_add_sd(xmm1,xmm2);
1644 			/* conditional for rai<sk-dr */
1645 xmm3 = _mm_sub_sd(sk,r);
1646 mask_cmp3 = _mm_cmplt_sd(rai,xmm3); /* rai<sk-dr */
1648 xmm4 = _mm_sub_sd(rai_inv,lij);
1649 xmm4 = _mm_mul_sd(two,xmm4);
1650 xmm4 = _mm_add_sd(tmp_ai,xmm4);
1652 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
1654 			/* tmp_ai now contains two partial values; which of them are actually used */
1655 			/* is governed by the mask_cmp mask. */
1656 tmp_ai = _mm_mul_pd(half,tmp_ai);
1657 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1658 sum_ai = _mm_add_sd(sum_ai,tmp_ai);
1660 xmm2 = _mm_mul_sd(half,lij2);
1661 xmm3 = _mm_mul_sd(prod,lij3);
1662 xmm2 = _mm_add_sd(xmm2,xmm3);
1663 xmm3 = _mm_mul_sd(lij,rinv);
1664 xmm4 = _mm_mul_sd(lij3,r);
1665 xmm3 = _mm_add_sd(xmm3,xmm4);
1666 xmm3 = _mm_mul_sd(qrtr,xmm3);
1667 t1 = _mm_sub_sd(xmm2,xmm3);
1669 xmm2 = _mm_mul_sd(half,uij2);
1670 xmm2 = _mm_mul_sd(neg,xmm2);
1671 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1672 xmm3 = _mm_mul_sd(xmm3,uij3);
1673 xmm2 = _mm_sub_sd(xmm2,xmm3);
1674 xmm3 = _mm_mul_sd(uij,rinv);
1675 xmm4 = _mm_mul_sd(uij3,r);
1676 xmm3 = _mm_add_sd(xmm3,xmm4);
1677 xmm3 = _mm_mul_sd(qrtr,xmm3);
1678 t2 = _mm_add_sd(xmm2,xmm3);
1680 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1681 xmm2 = _mm_add_sd(one,xmm2);
1682 xmm2 = _mm_mul_sd(eigth,xmm2);
1683 xmm2 = _mm_mul_sd(xmm2,xmm9);
1684 xmm3 = _mm_mul_sd(log_term, rinv);
1685 xmm3 = _mm_mul_sd(xmm3,rinv);
1686 xmm3 = _mm_mul_sd(qrtr,xmm3);
1687 t3 = _mm_add_sd(xmm2,xmm3);
1689 /* chain rule terms */
1690 xmm2 = _mm_mul_sd(dlij,t1);
1691 xmm2 = _mm_add_sd(xmm2,t2);
1692 xmm2 = _mm_add_sd(xmm2,t3);
1693 chrule = _mm_mul_sd(xmm2,rinv);
1695 /* OFFSET INTERACTION ai->aj STARTS HERE */
1696 /* conditional mask for raj<dr+sk_ai */
1697 xmm1 = _mm_add_sd(r,sk_ai);
1698 mask_cmp = _mm_cmplt_sd(raj,xmm1);
1700 			/* conditional for raj>dr-sk_ai, ends with mask_cmp2 */
1701 xmm2 = _mm_sub_sd(r,sk_ai);
1702 xmm3 = my_inv_pd(xmm2);
1703 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
1705 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1706 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1708 uij = my_inv_pd(xmm1);
1709 lij2 = _mm_mul_sd(lij,lij);
1710 lij3 = _mm_mul_sd(lij2,lij);
1711 uij2 = _mm_mul_sd(uij,uij);
1712 uij3 = _mm_mul_sd(uij2,uij);
1714 diff2 = _mm_sub_sd(uij2,lij2);
1716 lij_inv = my_invrsq_pd(lij2);
1718 sk2 = sk2_ai;
1719 sk2_inv = _mm_mul_sd(sk2,rinv);
1720 prod = _mm_mul_sd(qrtr,sk2_inv);
1722 log_term = _mm_mul_pd(uij,lij_inv);
1723 log_term = log_pd(log_term);
1725 xmm1 = _mm_sub_sd(lij,uij);
1726 xmm2 = _mm_mul_sd(qrtr,r);
1727 xmm2 = _mm_mul_sd(xmm2,diff2);
1728 xmm1 = _mm_add_sd(xmm1,xmm2);
1729 xmm2 = _mm_mul_sd(half,rinv);
1730 xmm2 = _mm_mul_sd(xmm2,log_term);
1731 xmm1 = _mm_add_sd(xmm1,xmm2);
1732 xmm9 = _mm_mul_sd(neg,diff2);
1733 xmm2 = _mm_mul_sd(xmm9,prod);
1734 tmp = _mm_add_sd(xmm1,xmm2);
1736 			/* conditional for raj<sk_ai-dr */
1737 			xmm3      = _mm_sub_sd(sk_ai,r);
1738 			mask_cmp3 = _mm_cmplt_sd(raj,xmm3); /* raj<sk_ai-dr */
1740 xmm4 = _mm_sub_sd(raj_inv,lij);
1741 xmm4 = _mm_mul_sd(two,xmm4);
1742 xmm4 = _mm_add_sd(tmp,xmm4);
1744 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
1746 			/* tmp now contains two partial values; which of them are actually used */
1747 			/* is governed by the mask_cmp mask. */
1748 tmp = _mm_mul_pd(half,tmp);
1749 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1751 /* Load, add and store gpol energy */
1752 xmm7 = _mm_load_sd(born->gpol_hct_work+aj1);
1753 xmm7 = _mm_add_sd(xmm7,tmp);
1754 _mm_store_sd(born->gpol_hct_work+aj1,xmm7);
1756 /* Start chain rule terms, t1 */
1757 xmm2 = _mm_mul_sd(half,lij2);
1758 xmm3 = _mm_mul_sd(prod,lij3);
1759 xmm2 = _mm_add_sd(xmm2,xmm3);
1760 xmm3 = _mm_mul_sd(lij,rinv);
1761 xmm4 = _mm_mul_sd(lij3,r);
1762 xmm3 = _mm_add_sd(xmm3,xmm4);
1763 xmm3 = _mm_mul_sd(qrtr,xmm3);
1764 t1 = _mm_sub_sd(xmm2,xmm3);
1766 xmm2 = _mm_mul_sd(half,uij2);
1767 xmm2 = _mm_mul_sd(neg,xmm2);
1768 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1769 xmm3 = _mm_mul_sd(xmm3,uij3);
1770 xmm2 = _mm_sub_sd(xmm2,xmm3);
1771 xmm3 = _mm_mul_sd(uij,rinv);
1772 xmm4 = _mm_mul_sd(uij3,r);
1773 xmm3 = _mm_add_sd(xmm3,xmm4);
1774 xmm3 = _mm_mul_sd(qrtr,xmm3);
1775 t2 = _mm_add_sd(xmm2,xmm3);
1777 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1778 xmm2 = _mm_add_sd(one,xmm2);
1779 xmm2 = _mm_mul_sd(eigth,xmm2);
1780 xmm2 = _mm_mul_sd(xmm2,xmm9);
1781 xmm3 = _mm_mul_sd(log_term, rinv);
1782 xmm3 = _mm_mul_sd(xmm3,rinv);
1783 xmm3 = _mm_mul_sd(qrtr,xmm3);
1784 t3 = _mm_add_sd(xmm2,xmm3);
1786 /* chain rule terms */
1787 xmm2 = _mm_mul_sd(dlij,t1);
1788 xmm2 = _mm_add_sd(xmm2,t2);
1789 xmm2 = _mm_add_sd(xmm2,t3);
1790 chrule_ai = _mm_mul_sd(xmm2,rinv);
1792 _mm_store_sd(fr->dadx+n, chrule);
1793 n = n + 1;
1794 _mm_store_sd(fr->dadx+n, chrule_ai);
1795 n = n + 1;
1798 /* Do end processing ... */
1799 tmp_ai = _mm_unpacklo_pd(tmp_ai,sum_ai);
1800 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
1801 sum_ai = _mm_shuffle_pd(sum_ai,sum_ai,_MM_SHUFFLE2(1,1));
1803 /* Load, add and store atom ai polarisation energy */
1804 xmm2 = _mm_load_sd(born->gpol_hct_work+ai);
1805 sum_ai = _mm_add_sd(sum_ai,xmm2);
1806 _mm_store_sd(born->gpol_hct_work+ai,sum_ai);
1809 /* Parallel summations */
1810 if(PARTDECOMP(cr))
1812 gmx_sum(natoms, born->gpol_hct_work, cr);
1814 else if(DOMAINDECOMP(cr))
1816 dd_atom_sum_real(cr->dd, born->gpol_hct_work);
1819 /* Compute the radii */
1820 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1822 if(born->use[i] != 0)
1824 rr = top->atomtypes.gb_radius[md->typeA[i]];
1825 rr_inv2 = 1.0/rr;
1826 rr = rr-doff;
1827 rr_inv = 1.0/rr;
1828 sum = rr * born->gpol_hct_work[i];
1829 sum2 = sum * sum;
1830 sum3 = sum2 * sum;
1832 tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1833 born->bRad[i] = rr_inv - tsum*rr_inv2;
1834 born->bRad[i] = 1.0 / born->bRad[i];
1836 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1838 tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1839 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
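			/* The loop above evaluates the OBC expression                               */
			/*   1/R_i = 1/(rho_i - doffset)                                             */
			/*           - tanh(alpha*Psi - beta*Psi^2 + gamma*Psi^3) / rho_i,           */
			/* with Psi = (rho_i - doffset) * gpol_hct_work[i]; drobc stores the         */
			/* derivative of the tanh term w.r.t. the accumulated sum (divided by        */
			/* rho_i) for use in calc_gb_chainrule_sse2_double() below.                  */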
1843 /* Extra (local) communication required for DD */
1844 if(DOMAINDECOMP(cr))
1846 dd_atom_spread_real(cr->dd, born->bRad);
1847 dd_atom_spread_real(cr->dd, fr->invsqrta);
1848 dd_atom_spread_real(cr->dd, born->drobc);
1851 return 0;
1857 int calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda, double *xd, double *f,
1858 double *fshift, double *shift_vec, int gb_algorithm, gmx_genborn_t *born)
1860 int i,k,n,ai,aj,ai3,aj1,aj2,aj13,aj23,aj4,nj0,nj1,offset;
1861 int shift;
1862 double rbi,shX,shY,shZ;
1863 double *rb;
1865 __m128d ix,iy,iz,jx,jy,jz,fix,fiy,fiz,sX,sY,sZ;
1866 __m128d dx,dy,dz,t1,t2,t3,dva,dvaj,dax,dax_ai,fgb,fgb_ai;
1867 __m128d xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
1869 t1 = t2 = t3 = _mm_setzero_pd();
1870 rb = born->work;
1872 /* Loop to get the proper form for the Born radius term */
1873 if(gb_algorithm==egbSTILL) {
1874 for(i=0;i<natoms;i++)
1876 rbi = born->bRad[i];
1877 rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1881 else if(gb_algorithm==egbHCT) {
1882 for(i=0;i<natoms;i++)
1884 rbi = born->bRad[i];
1885 rb[i] = rbi * rbi * dvda[i];
1889 else if(gb_algorithm==egbOBC) {
1890 for(i=0;i<natoms;i++)
1892 rbi = born->bRad[i];
1893 rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
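	/* The rb[] prefactors above fold the algorithm-specific derivative of the   */
	/* Born radius w.r.t. its accumulated sum into dvda, so the pair loop below  */
	/* only needs the stored dadx terms: loosely, for each pair                  */
	/*   f_pair = (rb[ai]*dadx_ai_term + rb[aj]*dadx_aj_term) * dx.              */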
1897 n = 0;
1899 for(i=0;i<nl->nri;i++)
1901 ai = nl->iinr[i];
1902 ai3 = ai * 3;
1904 nj0 = nl->jindex[i];
1905 nj1 = nl->jindex[i+1];
1907 /* Load shifts for this list */
1908 shift = 3*nl->shift[i];
1909 shX = shift_vec[shift+0];
1910 shY = shift_vec[shift+1];
1911 shZ = shift_vec[shift+2];
1913 /* Splat the shifts */
1914 sX = _mm_load1_pd(&shX);
1915 sY = _mm_load1_pd(&shY);
1916 sZ = _mm_load1_pd(&shZ);
1918 offset = (nj1-nj0)%2;
1920 /* Load particle ai coordinates and add shifts */
1921 ix = _mm_load1_pd(xd+ai3);
1922 iy = _mm_load1_pd(xd+ai3+1);
1923 iz = _mm_load1_pd(xd+ai3+2);
1925 ix = _mm_add_pd(sX,ix);
1926 iy = _mm_add_pd(sY,iy);
1927 iz = _mm_add_pd(sZ,iz);
1929 /* Load particle ai dvda */
1930 dva = _mm_load1_pd(rb+ai);
1932 fix = _mm_setzero_pd();
1933 fiy = _mm_setzero_pd();
1934 fiz = _mm_setzero_pd();
1936 for(k=nj0;k<nj1-offset;k+=2)
1938 aj1 = nl->jjnr[k];
1939 aj2 = nl->jjnr[k+1];
1941 /* Load dvda_j */
1942 dvaj = _mm_set_pd(rb[aj2],rb[aj1]);
1944 aj13 = aj1 * 3;
1945 aj23 = aj2 * 3;
1947 /* Load particle aj1-2 coordinates */
1948 xmm1 = _mm_loadu_pd(xd+aj13);
1949 xmm2 = _mm_loadu_pd(xd+aj23);
1951 xmm5 = _mm_load_sd(xd+aj13+2);
1952 xmm6 = _mm_load_sd(xd+aj23+2);
1954 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
1955 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
1956 jz = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));
1958 /* load derivative of born radii w.r.t. coordinates */
1959 xmm7 = _mm_loadu_pd(dadx+n);
1960 n = n + 2;
1961 xmm8 = _mm_loadu_pd(dadx+n);
1962 n = n + 2;
1964 /* Shuffle to get the ai and aj components right */
1965 			dax    = _mm_shuffle_pd(xmm7,xmm8,_MM_SHUFFLE2(0,0)); /* even dadx entries, paired with rb[ai] */
1966 			dax_ai = _mm_shuffle_pd(xmm7,xmm8,_MM_SHUFFLE2(1,1)); /* odd dadx entries, paired with rb[aj]  */
1968 /* distances */
1969 dx = _mm_sub_pd(ix,jx);
1970 dy = _mm_sub_pd(iy,jy);
1971 dz = _mm_sub_pd(iz,jz);
1973 /* scalar force */
1974 fgb = _mm_mul_pd(dva,dax);
1975 fgb_ai = _mm_mul_pd(dvaj,dax_ai);
1976 fgb = _mm_add_pd(fgb,fgb_ai);
1978 /* partial forces */
1979 t1 = _mm_mul_pd(fgb,dx);
1980 t2 = _mm_mul_pd(fgb,dy);
1981 t3 = _mm_mul_pd(fgb,dz);
1983 /* update the i force */
1984 fix = _mm_add_pd(fix,t1);
1985 fiy = _mm_add_pd(fiy,t2);
1986 fiz = _mm_add_pd(fiz,t3);
1988 /* accumulate forces from memory */
1989 xmm1 = _mm_loadu_pd(f+aj13); /* fx1 fx2 */
1990 xmm2 = _mm_loadu_pd(f+aj23); /* fy1 fy2 */
1992 xmm5 = _mm_load1_pd(f+aj13+2); /* fz1 fz1 */
1993 xmm6 = _mm_load1_pd(f+aj23+2); /* fz2 fz2 */
1995 /* transpose */
1996 xmm7 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
1997 xmm5 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
1998 xmm6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
2000 /* subtract partial forces */
2001 xmm5 = _mm_sub_pd(xmm5, t1);
2002 xmm6 = _mm_sub_pd(xmm6, t2);
2003 xmm7 = _mm_sub_pd(xmm7, t3);
2005 /* transpose back fx and fy */
2006 xmm1 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));
2007 xmm2 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1));
2009 /* store the force, first fx's and fy's */
2010 _mm_storeu_pd(f+aj13,xmm1);
2011 _mm_storeu_pd(f+aj23,xmm2);
2013 /* then fz */
2014 _mm_storel_pd(f+aj13+2,xmm7);
2015 _mm_storeh_pd(f+aj23+2,xmm7);
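			/* Note that the same pair force added to fix/fiy/fiz for atom ai above is   */
			/* subtracted from atoms aj1/aj2 here, i.e. the dR/dx contributions obey     */
			/* Newton's third law.                                                       */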
2018 if(offset!=0)
2020 aj1 = nl->jjnr[k];
2022 dvaj = _mm_load_sd(rb+aj1);
2024 aj13 = aj1 * 3;
2026 jx = _mm_load_sd(xd+aj13);
2027 jy = _mm_load_sd(xd+aj13+1);
2028 jz = _mm_load_sd(xd+aj13+2);
2030 dax = _mm_load_sd(dadx+n);
2031 n = n + 1;
2032 dax_ai = _mm_load_sd(dadx+n);
2033 n = n + 1;
2035 dx = _mm_sub_sd(ix,jx);
2036 dy = _mm_sub_sd(iy,jy);
2037 dz = _mm_sub_sd(iz,jz);
2039 fgb = _mm_mul_sd(dva,dax);
2040 fgb_ai = _mm_mul_sd(dvaj,dax_ai);
2041 fgb = _mm_add_pd(fgb,fgb_ai);
2043 t1 = _mm_mul_sd(fgb,dx);
2044 t2 = _mm_mul_sd(fgb,dy);
2045 t3 = _mm_mul_sd(fgb,dz);
2047 fix = _mm_add_sd(fix,t1);
2048 fiy = _mm_add_sd(fiy,t2);
2049 fiz = _mm_add_sd(fiz,t3);
2051 /* accumulate forces from memory */
2052 xmm5 = _mm_load_sd(f+aj13);
2053 xmm6 = _mm_load_sd(f+aj13+1);
2054 xmm7 = _mm_load_sd(f+aj13+2);
2056 /* subtract partial forces */
2057 xmm5 = _mm_sub_sd(xmm5, t1);
2058 xmm6 = _mm_sub_sd(xmm6, t2);
2059 xmm7 = _mm_sub_sd(xmm7, t3);
2061 /* store the force */
2062 _mm_store_sd(f+aj13,xmm5);
2063 _mm_store_sd(f+aj13+1,xmm6);
2064 _mm_store_sd(f+aj13+2,xmm7);
2067 		/* fix/fiy/fiz now contain two partial terms that should all be
2068 		 * added to the i particle forces. */
2070 t1 = _mm_unpacklo_pd(t1,fix);
2071 t2 = _mm_unpacklo_pd(t2,fiy);
2072 t3 = _mm_unpacklo_pd(t3,fiz);
2074 fix = _mm_add_pd(fix,t1);
2075 fiy = _mm_add_pd(fiy,t2);
2076 fiz = _mm_add_pd(fiz,t3);
2078 fix = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
2079 fiy = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
2080 fiz = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));
2082 /* load, add and store i forces */
2083 xmm1 = _mm_load_sd(f+ai3);
2084 xmm2 = _mm_load_sd(f+ai3+1);
2085 xmm3 = _mm_load_sd(f+ai3+2);
2087 fix = _mm_add_sd(fix,xmm1);
2088 fiy = _mm_add_sd(fiy,xmm2);
2089 fiz = _mm_add_sd(fiz,xmm3);
2091 _mm_store_sd(f+ai3,fix);
2092 _mm_store_sd(f+ai3+1,fiy);
2093 _mm_store_sd(f+ai3+2,fiz);
2095 /* load, add and store i shift forces */
2096 xmm1 = _mm_load_sd(fshift+shift);
2097 xmm2 = _mm_load_sd(fshift+shift+1);
2098 xmm3 = _mm_load_sd(fshift+shift+2);
2100 fix = _mm_add_sd(fix,xmm1);
2101 fiy = _mm_add_sd(fiy,xmm2);
2102 fiz = _mm_add_sd(fiz,xmm3);
2104 _mm_store_sd(fshift+shift,fix);
2105 _mm_store_sd(fshift+shift+1,fiy);
2106 _mm_store_sd(fshift+shift+2,fiz);
2110 return 0;
2113 #else
2114 /* keep compiler happy */
2115 int genborn_sse_dummy;
2117 #endif /* SSE2 intrinsics available */