#include "gmx_fatal.h"
#include "mtop_util.h"

/* Double precision SSE2 genborn disabled while waiting for transcendental implementations... */

/* Only compile this file if SSE2 intrinsics are available */
/* #if ( (defined(GMX_IA32_SSE2) || defined(GMX_X86_64_SSE2) || defined(GMX_SSE2)) && defined(GMX_DOUBLE) ) */
#include <xmmintrin.h>
#include <emmintrin.h>

#include "genborn_sse2_double.h"

#if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
#define gmx_castsi128_pd(a) _mm_castsi128_pd(a)
#define gmx_castpd_si128(a) _mm_castpd_si128(a)
#elif defined(__GNUC__)
#define gmx_castsi128_pd(a) ((__m128d)(a))
#define gmx_castpd_si128(a) ((__m128i)(a))
#else
static __m128d gmx_castsi128_pd(__m128i a) { return *(__m128d *) &a; }
static __m128i gmx_castpd_si128(__m128d a) { return *(__m128i *) &a; }
#endif
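/*
 * Illustrative sketch only (the helper name is hypothetical, not part of the original file):
 * the cast helpers above are used to reinterpret integer bit patterns as doubles, e.g. to
 * build an all-lanes sign-bit mask for branchless abs()/sign flips, which is how the
 * sincos routine below uses them.
 */
static __m128d
example_abs_pd(__m128d x)
{
    /* 0x8000000000000000 is the IEEE-754 double sign bit; clearing it yields |x| per lane */
    __m128d signbit = gmx_castsi128_pd(_mm_set1_epi64x(0x8000000000000000ULL));
    return _mm_andnot_pd(signbit, x);
}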
static void
sincos_sse2double(__m128d x, __m128d *sinval, __m128d *cosval)
{
    const __m128d two_over_pi = {2.0/M_PI, 2.0/M_PI};
    const __m128d half        = {0.5, 0.5};
    const __m128d one         = {1.0, 1.0};
    const __m128i izero       = _mm_set1_epi32(0);
    const __m128i ione        = _mm_set1_epi32(1);
    const __m128i itwo        = _mm_set1_epi32(2);
    const __m128i ithree      = _mm_set1_epi32(3);
    const __m128d sincosd_kc1 = {(13176794.0 / 8388608.0), (13176794.0 / 8388608.0)};
    const __m128d sincosd_kc2 = {7.5497899548918821691639751442098584e-8, 7.5497899548918821691639751442098584e-8};
    const __m128d sincosd_cc0 = {0.00000000206374484196, 0.00000000206374484196};
    const __m128d sincosd_cc1 = {-0.00000027555365134677, -0.00000027555365134677};
    const __m128d sincosd_cc2 = {0.00002480157946764225, 0.00002480157946764225};
    const __m128d sincosd_cc3 = {-0.00138888888730525966, -0.00138888888730525966};
    const __m128d sincosd_cc4 = {0.04166666666651986722, 0.04166666666651986722};
    const __m128d sincosd_cc5 = {-0.49999999999999547304, -0.49999999999999547304};
    const __m128d sincosd_sc0 = {0.00000000015893606014, 0.00000000015893606014};
    const __m128d sincosd_sc1 = {-0.00000002505069049138, -0.00000002505069049138};
    const __m128d sincosd_sc2 = {0.00000275573131527032, 0.00000275573131527032};
    const __m128d sincosd_sc3 = {-0.00019841269827816117, -0.00019841269827816117};
    const __m128d sincosd_sc4 = {0.00833333333331908278, 0.00833333333331908278};
    const __m128d sincosd_sc5 = {-0.16666666666666612594, -0.16666666666666612594};

    __m128d signbit           = gmx_castsi128_pd(_mm_set1_epi64x(0x8000000000000000ULL));
    __m128d tiny              = gmx_castsi128_pd(_mm_set1_epi64x(0x3e40000000000000ULL));

    __m128d xl, xl2, xl3, qd, absxl, p1, cx, sx, ts, tc, tsn, tcn;
    __m128i q;
    __m128i offsetSin, offsetCos;
    __m128d sinMask, cosMask, isTiny;
    __m128d ct0, ct1, ct2, ct3, ct4, ct5, ct6, st1, st2, st3, st4, st6;

    /* Rescale the angle to the range 0..4, and find which quadrant it is in */
    xl        = _mm_mul_pd(x, two_over_pi);

    /* q=integer part of xl, rounded _away_ from 0.0 */
    /* Add 0.5 away from 0.0 */
    xl        = _mm_add_pd(xl, _mm_or_pd(_mm_and_pd(xl, signbit), half));

    q         = _mm_cvttpd_epi32(xl);
    qd        = _mm_cvtepi32_pd(q);
    q         = _mm_shuffle_epi32(q, _MM_SHUFFLE(1, 1, 0, 0));

    /* Compute offset based on quadrant the arg falls in */
    offsetSin = _mm_and_si128(q, ithree);
    offsetCos = _mm_add_epi32(offsetSin, ione);

    /* Remainder in range [-pi/4..pi/4] */
    p1        = _mm_mul_pd(qd, sincosd_kc1);
    xl        = _mm_mul_pd(qd, sincosd_kc2);
    p1        = _mm_sub_pd(x, p1);
    xl        = _mm_sub_pd(p1, xl);

    absxl     = _mm_andnot_pd(signbit, xl);
    isTiny    = _mm_cmpgt_pd(tiny, absxl);

    xl2       = _mm_mul_pd(xl, xl);
    xl3       = _mm_mul_pd(xl2, xl);

    ct0       = _mm_mul_pd(xl2, xl2);
    ct1       = _mm_mul_pd(sincosd_cc0, xl2);
    ct2       = _mm_mul_pd(sincosd_cc2, xl2);
    ct3       = _mm_mul_pd(sincosd_cc4, xl2);
    st1       = _mm_mul_pd(sincosd_sc0, xl2);
    st2       = _mm_mul_pd(sincosd_sc2, xl2);
    st3       = _mm_mul_pd(sincosd_sc4, xl2);
    ct1       = _mm_add_pd(ct1, sincosd_cc1);
    ct2       = _mm_add_pd(ct2, sincosd_cc3);
    ct3       = _mm_add_pd(ct3, sincosd_cc5);
    st1       = _mm_add_pd(st1, sincosd_sc1);
    st2       = _mm_add_pd(st2, sincosd_sc3);
    st3       = _mm_add_pd(st3, sincosd_sc5);

    ct4       = _mm_mul_pd(ct2, ct0);
    ct4       = _mm_add_pd(ct4, ct3);

    st4       = _mm_mul_pd(st2, ct0);
    st4       = _mm_add_pd(st4, st3);
    ct5       = _mm_mul_pd(ct0, ct0);

    ct6       = _mm_mul_pd(ct5, ct1);
    ct6       = _mm_add_pd(ct6, ct4);

    st6       = _mm_mul_pd(ct5, st1);
    st6       = _mm_add_pd(st6, st4);

    cx        = _mm_mul_pd(ct6, xl2);
    cx        = _mm_add_pd(cx, one);

    sx        = _mm_mul_pd(st6, xl3);
    sx        = _mm_add_pd(sx, xl);

    /* Small angle approximation, sin(tiny)=tiny, cos(tiny)=1.0 */
    sx        = _mm_or_pd(_mm_and_pd(isTiny, xl),  _mm_andnot_pd(isTiny, sx));
    cx        = _mm_or_pd(_mm_and_pd(isTiny, one), _mm_andnot_pd(isTiny, cx));

    sinMask   = gmx_castsi128_pd(_mm_cmpeq_epi32(_mm_and_si128(offsetSin, ione), izero));
    cosMask   = gmx_castsi128_pd(_mm_cmpeq_epi32(_mm_and_si128(offsetCos, ione), izero));

    ts        = _mm_or_pd(_mm_and_pd(sinMask, sx), _mm_andnot_pd(sinMask, cx));
    tc        = _mm_or_pd(_mm_and_pd(cosMask, sx), _mm_andnot_pd(cosMask, cx));

    /* Flip the sign of the result when (offset mod 4) = 1 or 2 */
    sinMask   = gmx_castsi128_pd(_mm_cmpeq_epi32(_mm_and_si128(offsetSin, itwo), izero));
    tsn       = _mm_xor_pd(signbit, ts);
    ts        = _mm_or_pd(_mm_and_pd(sinMask, ts), _mm_andnot_pd(sinMask, tsn));

    cosMask   = gmx_castsi128_pd(_mm_cmpeq_epi32(_mm_and_si128(offsetCos, itwo), izero));
    tcn       = _mm_xor_pd(signbit, tc);
    tc        = _mm_or_pd(_mm_and_pd(cosMask, tc), _mm_andnot_pd(cosMask, tcn));

    *sinval   = ts;
    *cosval   = tc;
}
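/*
 * Readability sketch only (not called by the kernels; assumes <math.h> is available in
 * this translation unit): the SIMD routine above reduces the argument with a two-part
 * pi/2 constant (sincosd_kc1 + sincosd_kc2) and then selects/flips sin and cos according
 * to the quadrant. A scalar equivalent of that scheme:
 */
static void
example_sincos_scalar(double x, double *sinval, double *cosval)
{
    const double kc1 = 13176794.0 / 8388608.0;                   /* high part of pi/2 */
    const double kc2 = 7.5497899548918821691639751442098584e-8;  /* low part of pi/2  */
    double       xl  = x * (2.0/M_PI);
    int          q   = (int) (xl + (xl >= 0.0 ? 0.5 : -0.5));    /* nearest quadrant  */
    double       r   = (x - q*kc1) - q*kc2;                      /* in [-pi/4, pi/4]  */
    double       s   = sin(r);
    double       c   = cos(r);

    switch (q & 3) /* pick and sign-flip according to the quadrant, as the masks do above */
    {
        case 0:  *sinval =  s; *cosval =  c; break;
        case 1:  *sinval =  c; *cosval = -s; break;
        case 2:  *sinval = -s; *cosval = -c; break;
        default: *sinval = -c; *cosval =  s; break;
    }
}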
__m128d
log_pd(__m128d x)
{
    const __m128i exp_mask   = _mm_set_epi32(0x7FF00000, 0, 0x7FF00000, 0);
    const __m128i exp_bias   = _mm_set_epi32(0, 1023, 0, 1023);

    const __m128d const_loge = _mm_set1_pd(0.69314718055994529);
    const __m128d const_one  = _mm_set1_pd(1.0);
    const __m128d const_two  = _mm_set1_pd(2.0);

    /* Almost full single precision accuracy (~20 bits worst case) */
    const __m128d P0 = _mm_set1_pd(6.108179944792157749153050);
    const __m128d P1 = _mm_set1_pd(52.43691313715523327631139);
    const __m128d P2 = _mm_set1_pd(71.53664010795613671168440);
    const __m128d P3 = _mm_set1_pd(18.92097516931559547548485);
    const __m128d P4 = _mm_set1_pd(0.3504714784635941984522153);
    const __m128d P5 = _mm_set1_pd(-0.007105890734229368515879);
    const __m128d Q1 = _mm_set1_pd(17.73314231909420567454406);
    const __m128d Q2 = _mm_set1_pd(48.82373085428713023213363);
    const __m128d Q3 = _mm_set1_pd(31.65945943354513166309101);
    const __m128d Q4 = _mm_set1_pd(4.302477477108162270199051);

    __m128d xmm0, xmm1, xmm2, xmm3, xmm4;
    __m128i xmmi;

    /* Extract the (unbiased) exponent into xmm1 and the mantissa, rescaled to [1,2), into xmm0 */
    xmmi = gmx_castpd_si128(x);
    xmm1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(_mm_sub_epi64(_mm_srli_epi64(_mm_and_si128(xmmi, exp_mask), 52), exp_bias), _MM_SHUFFLE(3, 1, 2, 0)));
    xmm0 = _mm_or_pd(gmx_castsi128_pd(_mm_andnot_si128(exp_mask, xmmi)), const_one);

    /* Numerator polynomial P(m) in the mantissa m */
    xmm2 = _mm_mul_pd(P5, xmm0);
    xmm2 = _mm_add_pd(xmm2, P4);
    xmm2 = _mm_mul_pd(xmm2, xmm0);
    xmm2 = _mm_add_pd(xmm2, P3);
    xmm2 = _mm_mul_pd(xmm2, xmm0);
    xmm2 = _mm_add_pd(xmm2, P2);
    xmm2 = _mm_mul_pd(xmm2, xmm0);
    xmm2 = _mm_add_pd(xmm2, P1);
    xmm2 = _mm_mul_pd(xmm2, xmm0);
    xmm2 = _mm_add_pd(xmm2, P0);

    /* Denominator polynomial Q(m) */
    xmm3 = _mm_mul_pd(Q4, xmm0);
    xmm3 = _mm_add_pd(xmm3, Q3);
    xmm3 = _mm_mul_pd(xmm3, xmm0);
    xmm3 = _mm_add_pd(xmm3, Q2);
    xmm3 = _mm_mul_pd(xmm3, xmm0);
    xmm3 = _mm_add_pd(xmm3, Q1);
    xmm3 = _mm_mul_pd(xmm3, xmm0);
    xmm3 = _mm_add_pd(xmm3, const_one);

    /* 1.0/Q(m) from the single-precision reciprocal, refined with two Newton-Raphson steps */
    xmm4 = _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(xmm3)));
    xmm4 = _mm_mul_pd(xmm4, _mm_sub_pd(const_two, _mm_mul_pd(xmm3, xmm4)));
    xmm4 = _mm_mul_pd(xmm4, _mm_sub_pd(const_two, _mm_mul_pd(xmm3, xmm4)));
    xmm2 = _mm_mul_pd(xmm2, xmm4);

    xmm0 = _mm_sub_pd(xmm0, const_one);
    xmm0 = _mm_mul_pd(xmm0, xmm2);

    xmm0 = _mm_add_pd(xmm0, xmm1);

    return _mm_mul_pd(xmm0, const_loge);
}
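/*
 * Readability sketch only (not called by the kernels): log_pd() above splits x into
 * exponent and mantissa straight from the IEEE-754 bit pattern and evaluates a rational
 * approximation P(m)/Q(m) in the mantissa. A scalar equivalent using frexp():
 */
static double
example_log_scalar(double x)
{
    int    e;
    double m = frexp(x, &e);   /* x = m * 2^e, with m in [0.5, 1) */

    /* rescale the mantissa to [1, 2) to match the SIMD code above */
    m *= 2.0;
    e -= 1;

    /* log(x) = log(m) + e*log(2); the SIMD code replaces log(m) by (m-1)*P(m)/Q(m) */
    return log(m) + e * 0.69314718055994529;
}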
/* This exp() routine provides accuracy of 10E-9 to 10E-11.
 * The polynomial minimax coefficients are actually accurate to 10E-14,
 * but we lose some accuracy in the polynomial evaluation.
 */
__m128d
exp_pd(__m128d x)
{
    const __m128d lim1   = _mm_set1_pd(1025.0);         /* 1025.00000e+0d */
    const __m128d lim2   = _mm_set1_pd(-1022.99999999); /* -1022.99999e+0f */

    const __m128i base   = _mm_set_epi32(0, 0, 1023, 1023);
    const __m128d half   = _mm_set1_pd(0.5);
    const __m128d log2e  = _mm_set1_pd(1.4426950408889634);

    const __m128d exp_P0 = _mm_set1_pd(1.00000000000001276211229749);
    const __m128d exp_P1 = _mm_set1_pd(6.931471805598709708635169E-1);
    const __m128d exp_P2 = _mm_set1_pd(2.402265069564965287455972E-1);
    const __m128d exp_P3 = _mm_set1_pd(5.550410866868561155599683E-2);
    const __m128d exp_P4 = _mm_set1_pd(9.618129192067246128919915E-3);
    const __m128d exp_P5 = _mm_set1_pd(1.333355761760444302342084E-3);
    const __m128d exp_P6 = _mm_set1_pd(1.540343494807179111289781E-4);
    const __m128d exp_P7 = _mm_set1_pd(1.525298483865349629325421E-5);
    const __m128d exp_P8 = _mm_set1_pd(1.325940560934510417818578E-6);
    const __m128d exp_P9 = _mm_set1_pd(1.015033670529892589443421E-7);

    __m128d xmm0, xmm1;
    __m128i xmmi;

    /* Rescale to base 2 and clamp the argument to the representable exponent range */
    xmm0 = _mm_mul_pd(x, log2e);
    xmm0 = _mm_min_pd(xmm0, lim1);
    xmm0 = _mm_max_pd(xmm0, lim2);
    xmm1 = _mm_sub_pd(xmm0, half);

    /* Integer part of the exponent */
    xmmi = _mm_cvtpd_epi32(xmm1);
    xmm1 = _mm_cvtepi32_pd(xmmi);

    /* Build 2^i by moving the biased exponent into the IEEE-754 exponent field */
    xmmi = _mm_add_epi32(xmmi, base);
    xmmi = _mm_shuffle_epi32(xmmi, _MM_SHUFFLE(3, 1, 2, 0));
    xmmi = _mm_slli_epi64(xmmi, 52);

    /* Fractional part of the exponent */
    xmm0 = _mm_sub_pd(xmm0, xmm1);

    /* Polynomial for 2^f, evaluated with Horner's scheme */
    xmm1 = _mm_mul_pd(exp_P9, xmm0);
    xmm1 = _mm_add_pd(xmm1, exp_P8);
    xmm1 = _mm_mul_pd(xmm1, xmm0);
    xmm1 = _mm_add_pd(xmm1, exp_P7);
    xmm1 = _mm_mul_pd(xmm1, xmm0);
    xmm1 = _mm_add_pd(xmm1, exp_P6);
    xmm1 = _mm_mul_pd(xmm1, xmm0);
    xmm1 = _mm_add_pd(xmm1, exp_P5);
    xmm1 = _mm_mul_pd(xmm1, xmm0);
    xmm1 = _mm_add_pd(xmm1, exp_P4);
    xmm1 = _mm_mul_pd(xmm1, xmm0);
    xmm1 = _mm_add_pd(xmm1, exp_P3);
    xmm1 = _mm_mul_pd(xmm1, xmm0);
    xmm1 = _mm_add_pd(xmm1, exp_P2);
    xmm1 = _mm_mul_pd(xmm1, xmm0);
    xmm1 = _mm_add_pd(xmm1, exp_P1);
    xmm1 = _mm_mul_pd(xmm1, xmm0);
    xmm1 = _mm_add_pd(xmm1, exp_P0);
    xmm1 = _mm_mul_pd(xmm1, gmx_castsi128_pd(xmmi));

    return xmm1;
}
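/*
 * Readability sketch only (not called by the kernels): exp_pd() above computes
 * exp(x) = 2^(x*log2(e)) = 2^i * 2^f, building 2^i by writing i+1023 into the exponent
 * field and approximating 2^f with a polynomial. A scalar equivalent:
 */
static double
example_exp_scalar(double x)
{
    double y = x * 1.4426950408889634;   /* x * log2(e) */
    int    i = (int) floor(y);           /* integer part of the exponent */
    double f = y - i;                    /* fractional part in [0, 1) */

    /* pow(2.0, f) stands in for the minimax polynomial used by the SIMD code */
    return ldexp(pow(2.0, f), i);
}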
static inline __m128d
my_invrsq_pd(__m128d x)
{
    const __m128d three = {3.0, 3.0};
    const __m128d half  = {0.5, 0.5};

    __m128  t  = _mm_rsqrt_ps(_mm_cvtpd_ps(x)); /* Convert to single precision and do _mm_rsqrt_ps() */
    __m128d t1 = _mm_cvtps_pd(t);               /* Convert back to double precision */

    /* First Newton-Raphson step, accuracy is now 24 bits */
    __m128d t2 = _mm_mul_pd(half, _mm_mul_pd(t1, _mm_sub_pd(three, _mm_mul_pd(x, _mm_mul_pd(t1, t1)))));

    /* Return second Newton-Raphson step, accuracy 48 bits */
    return _mm_mul_pd(half, _mm_mul_pd(t2, _mm_sub_pd(three, _mm_mul_pd(x, _mm_mul_pd(t2, t2)))));
}

/* Approximate 1.0/x from the single-precision reciprocal, refined with two Newton-Raphson steps */
static inline __m128d
my_inv_pd(__m128d x)
{
    const __m128d two = {2.0, 2.0};

    __m128  t  = _mm_rcp_ps(_mm_cvtpd_ps(x));
    __m128d t1 = _mm_cvtps_pd(t);
    __m128d t2 = _mm_mul_pd(t1, _mm_sub_pd(two, _mm_mul_pd(t1, x)));

    return _mm_mul_pd(t2, _mm_sub_pd(two, _mm_mul_pd(t2, x)));
}
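/*
 * Readability sketch only (not called by the kernels): both helpers above start from the
 * ~12-bit single-precision hardware estimate (_mm_rsqrt_ps/_mm_rcp_ps) and refine it with
 * Newton-Raphson iterations, roughly doubling the number of correct bits per step.
 */
static double
example_nr_invsqrt(double x, double estimate)
{
    /* two Newton-Raphson steps for y ~ 1/sqrt(x): y' = 0.5*y*(3 - x*y*y) */
    double y = estimate;
    y = 0.5 * y * (3.0 - x*y*y);
    y = 0.5 * y * (3.0 - x*y*y);
    return y;
}

static double
example_nr_inv(double x, double estimate)
{
    /* two Newton-Raphson steps for y ~ 1/x: y' = y*(2 - x*y) */
    double y = estimate;
    y = y * (2.0 - x*y);
    y = y * (2.0 - x*y);
    return y;
}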
int
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
                              const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md)
{
    int i,k,n,ai,ai3,aj1,aj2,aj13,aj23;
    int at0,at1,nj0,nj1,offset,taj1,taj2;
    int shift;
    double factor,gpi_ai,gpi_tmp,gpi2;
    double shX,shY,shZ;

    __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
    __m128d t1,t2,t3,rsq11,rinv,rinv2,rinv4,rinv6;
    __m128d ratio,gpi,rai,raj,vaj,rvdw,mask_cmp;
    __m128d ccf,dccf,theta,cosq,term,sinq,res,prod;
    __m128d xmm1,xmm2,xmm3,xmm4,xmm7,vai,prod_ai,icf4,icf6;

    const __m128d half  = {0.5, 0.5};
    const __m128d three = {3.0, 3.0};
    const __m128d one   = {1.0, 1.0};
    const __m128d two   = {2.0, 2.0};
    const __m128d zero  = {0.0, 0.0};
    const __m128d four  = {4.0, 4.0};

    const __m128d p5inv = {STILL_P5INV, STILL_P5INV};
    const __m128d pip5  = {STILL_PIP5, STILL_PIP5};
    const __m128d p4    = {STILL_P4, STILL_P4};

    factor = 0.5 * ONE_4PI_EPS0;
    n      = 0;

    /* Keep the compiler happy */
    raj  = _mm_setzero_pd();
    vaj  = _mm_setzero_pd();
    jx   = _mm_setzero_pd();
    jy   = _mm_setzero_pd();
    jz   = _mm_setzero_pd();
    xmm2 = _mm_setzero_pd();
    xmm7 = _mm_setzero_pd();

    for(i=0;i<born->nr;i++)
    {
        born->gpol_still_work[i]=0;
    }

    for(i=0;i<nl->nri;i++)
    {
        ai  = nl->iinr[i];
        ai3 = ai * 3;

        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        /* Load shifts for this list */
        shift = nl->shift[i];
        shX   = fr->shift_vec[shift][0];
        shY   = fr->shift_vec[shift][1];
        shZ   = fr->shift_vec[shift][2];

        /* Splat the shifts */
        sX = _mm_load1_pd(&shX);
        sY = _mm_load1_pd(&shY);
        sZ = _mm_load1_pd(&shZ);

        offset = (nj1-nj0)%2;

        /* Polarization energy for atom ai */
        gpi = _mm_setzero_pd();

        /* Load particle ai coordinates and add shifts */
        ix = _mm_load1_pd(x+ai3);
        iy = _mm_load1_pd(x+ai3+1);
        iz = _mm_load1_pd(x+ai3+2);

        ix = _mm_add_pd(sX,ix);
        iy = _mm_add_pd(sY,iy);
        iz = _mm_add_pd(sZ,iz);

        /* Load particle ai gb_radius */
        rai     = _mm_set1_pd(top->atomtypes.gb_radius[md->typeA[ai]]);
        vai     = _mm_set1_pd(born->vsolv[ai]);
        prod_ai = _mm_mul_pd(p4,vai);

        for(k=nj0;k<nj1-offset;k+=2)
        {
            aj1 = nl->jjnr[k];
            aj2 = nl->jjnr[k+1];

            aj13 = aj1 * 3;
            aj23 = aj2 * 3;

            taj1 = md->typeA[aj1];
            taj2 = md->typeA[aj2];

            /* Load particle aj1-2 coordinates and compute ai->aj distances */
            xmm1 = _mm_loadu_pd(x+aj13);
            xmm2 = _mm_loadu_pd(x+aj23);
            jx   = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
            jy   = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));

            jz   = _mm_loadl_pd(jz,x+aj13+2);
            jz   = _mm_loadh_pd(jz,x+aj23+2);

            dx   = _mm_sub_pd(ix,jx);
            dy   = _mm_sub_pd(iy,jy);
            dz   = _mm_sub_pd(iz,jz);

            rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
            rinv  = my_invrsq_pd(rsq11);

            rinv2 = _mm_mul_pd(rinv,rinv);
            rinv4 = _mm_mul_pd(rinv2,rinv2);
            rinv6 = _mm_mul_pd(rinv4,rinv2);

            vaj   = _mm_loadl_pd(vaj,born->vsolv+aj1);
            vaj   = _mm_loadh_pd(vaj,born->vsolv+aj2);

            raj   = _mm_loadl_pd(raj, top->atomtypes.gb_radius+taj1);
            raj   = _mm_loadh_pd(raj, top->atomtypes.gb_radius+taj2);

            rvdw  = _mm_add_pd(rai,raj);
            rvdw  = _mm_mul_pd(rvdw,rvdw);
            ratio = _mm_div_pd(rsq11,rvdw);

            mask_cmp = _mm_cmpgt_pd(ratio,p5inv);

            switch(_mm_movemask_pd(mask_cmp))
            {
                case 0x3:
                    /* Both ratios are larger than STILL_P5INV: ccf=1, dccf=0 */
                    ccf  = one;
                    dccf = zero;
                    break;
                default:
                    theta = _mm_mul_pd(ratio,pip5);
                    sincos_sse2double(theta,&sinq,&cosq);

                    term = _mm_sub_pd(one,cosq);
                    term = _mm_mul_pd(half,term);
                    ccf  = _mm_mul_pd(term,term);
                    dccf = _mm_mul_pd(two,term);
                    dccf = _mm_mul_pd(dccf,sinq);
                    dccf = _mm_mul_pd(dccf,pip5);
                    dccf = _mm_mul_pd(dccf,ratio);

                    ccf  = _mm_or_pd(_mm_and_pd(mask_cmp,one)  ,_mm_andnot_pd(mask_cmp,ccf));
                    dccf = _mm_or_pd(_mm_and_pd(mask_cmp,zero) ,_mm_andnot_pd(mask_cmp,dccf));
                    break;
            }

            prod = _mm_mul_pd(p4,vaj);
            icf4 = _mm_mul_pd(ccf,rinv4);
            xmm2 = _mm_mul_pd(icf4,prod);
            xmm3 = _mm_mul_pd(icf4,prod_ai);
            gpi  = _mm_add_pd(gpi,xmm2);

            /* Load, add and store atom aj gpol energy */
            xmm7 = _mm_loadl_pd(xmm7,born->gpol_still_work+aj1);
            xmm7 = _mm_loadh_pd(xmm7,born->gpol_still_work+aj2);

            xmm3 = _mm_add_pd(xmm7,xmm3);

            _mm_storel_pd(born->gpol_still_work+aj1,xmm3);
            _mm_storeh_pd(born->gpol_still_work+aj2,xmm3);

            /* Chain rule terms */
            ccf  = _mm_mul_pd(four,ccf);
            xmm3 = _mm_sub_pd(ccf,dccf);
            icf6 = _mm_mul_pd(xmm3,rinv6);
            xmm1 = _mm_mul_pd(icf6,prod);
            xmm2 = _mm_mul_pd(icf6,prod_ai);

            /* As with single precision, we need to shift stuff around, to change the order of
             * the interactions from ai->aj1, ai->aj2 to ai->aj1, aj1->ai, ai->aj2, aj2->ai etc,
             * to do 2 instead of 4 store operations
             */
            xmm3 = _mm_unpacklo_pd(xmm1,xmm2);
            xmm4 = _mm_unpackhi_pd(xmm1,xmm2);

            _mm_storeu_pd(fr->dadx+n,xmm3);
            n    = n + 2;
            _mm_storeu_pd(fr->dadx+n,xmm4);
            n    = n + 2;
        }

        /* Deal with odd elements */
        if(offset!=0)
        {
            aj1  = nl->jjnr[k];
            aj13 = aj1 * 3;
            taj1 = md->typeA[aj1];

            jx  = _mm_load_sd(x+aj13);
            jy  = _mm_load_sd(x+aj13+1);
            jz  = _mm_load_sd(x+aj13+2);

            raj = _mm_load_sd(top->atomtypes.gb_radius+taj1);
            vaj = _mm_load_sd(born->vsolv+aj1);

            dx  = _mm_sub_sd(ix,jx);
            dy  = _mm_sub_pd(iy,jy);
            dz  = _mm_sub_pd(iz,jz);

            rsq11 = _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx,dx) , _mm_mul_sd(dy,dy) ) , _mm_mul_sd(dz,dz) );
            rinv  = my_invrsq_pd(rsq11);

            rinv2 = _mm_mul_sd(rinv,rinv);
            rinv4 = _mm_mul_sd(rinv2,rinv2);
            rinv6 = _mm_mul_sd(rinv4,rinv2);

            rvdw  = _mm_add_sd(rai,raj);
            rvdw  = _mm_mul_sd(rvdw,rvdw);
            ratio = _mm_div_sd(rsq11,rvdw);

            mask_cmp = _mm_cmpgt_sd(ratio,p5inv);

            switch(_mm_movemask_pd(mask_cmp))
            {
                case 0x3:
                    /* Both ratios are larger than STILL_P5INV: ccf=1, dccf=0 */
                    ccf  = one;
                    dccf = zero;
                    break;
                default:
                    theta = _mm_mul_sd(ratio,pip5);
                    sincos_sse2double(theta,&sinq,&cosq);

                    term = _mm_sub_sd(one,cosq);
                    term = _mm_mul_sd(half,term);
                    ccf  = _mm_mul_sd(term,term);
                    dccf = _mm_mul_sd(two,term);
                    dccf = _mm_mul_sd(dccf,sinq);
                    dccf = _mm_mul_sd(dccf,pip5);
                    dccf = _mm_mul_sd(dccf,ratio);

                    ccf  = _mm_or_pd(_mm_and_pd(mask_cmp,one)  ,_mm_andnot_pd(mask_cmp,ccf));
                    dccf = _mm_or_pd(_mm_and_pd(mask_cmp,zero) ,_mm_andnot_pd(mask_cmp,dccf));
                    break;
            }

            prod = _mm_mul_sd(p4,vaj);
            icf4 = _mm_mul_sd(ccf,rinv4);
            xmm2 = _mm_mul_sd(icf4,prod);
            xmm3 = _mm_mul_sd(icf4,prod_ai);
            gpi  = _mm_add_sd(gpi,xmm2);

            /* Load, add and store atom aj gpol energy */
            xmm7 = _mm_load_sd(born->gpol_still_work+aj1);
            xmm3 = _mm_add_sd(xmm7,xmm3);
            _mm_store_sd(born->gpol_still_work+aj1,xmm3);

            /* Chain rule terms */
            ccf  = _mm_mul_sd(four,ccf);
            xmm3 = _mm_sub_sd(ccf,dccf);
            icf6 = _mm_mul_sd(xmm3,rinv6);
            xmm1 = _mm_mul_sd(icf6,prod);
            xmm2 = _mm_mul_sd(icf6,prod_ai);

            /* Here we only have ai->aj1 and aj1->ai, so we can store directly */
            _mm_storel_pd(fr->dadx+n,xmm1);
            n    = n + 1;
            _mm_storel_pd(fr->dadx+n,xmm2);
            n    = n + 1;
        }

        /* Do end processing ... */
        xmm2 = _mm_unpacklo_pd(xmm2,gpi);
        gpi  = _mm_add_pd(gpi,xmm2);
        gpi  = _mm_shuffle_pd(gpi,gpi,_MM_SHUFFLE2(1,1));

        /* Load, add and store atom ai polarisation energy */
        xmm2 = _mm_load_sd(born->gpol_still_work+ai);
        gpi  = _mm_add_sd(gpi,xmm2);
        _mm_store_sd(born->gpol_still_work+ai,gpi);
    }

    /* Parallel summations */
    if(PARTDECOMP(cr))
    {
        gmx_sum(natoms,born->gpol_still_work, cr);
    }
    else if(DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, born->gpol_still_work);
    }

    /* Compute the radii */
    for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
    {
        if(born->use[i] != 0)
        {
            gpi_ai = born->gpol[i] + born->gpol_still_work[i];
            gpi2   = gpi_ai*gpi_ai;

            born->bRad[i]   = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if(DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}
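/*
 * Illustrative sketches only (hypothetical helpers, not used by the kernels): the Still
 * routine above, and the HCT/OBC routines below, express every conditional branchlessly
 * with the SSE2 compare/and/andnot/or idiom, and store the ai->aj and aj->ai chain-rule
 * derivatives interleaved so each atom pair needs two vector stores instead of four.
 */
static __m128d
example_blend_pd(__m128d mask, __m128d a, __m128d b)
{
    /* "mask ? a : b" per lane; mask lanes are all-ones where the condition holds */
    return _mm_or_pd(_mm_and_pd(mask, a), _mm_andnot_pd(mask, b));
}

static void
example_store_interleaved(double *dadx, __m128d d_ij, __m128d d_ji)
{
    /* lane 0 belongs to the pair with aj1, lane 1 to the pair with aj2 */
    _mm_storeu_pd(dadx,     _mm_unpacklo_pd(d_ij, d_ji));  /* ai->aj1, aj1->ai */
    _mm_storeu_pd(dadx + 2, _mm_unpackhi_pd(d_ij, d_ji));  /* ai->aj2, aj2->ai */
}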
int
calc_gb_rad_hct_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
                            const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md)
{
    int i,k,n,ai,ai3,aj1,aj2,aj13,aj23,nj0,nj1,at0,at1,offset;
    int p1,p2,shift;
    double rr,sum,sum_tmp,min_rad,rad,doff;
    double shX,shY,shZ;

    __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
    __m128d t1,t2,t3,rsq11,rinv,r,rai;
    __m128d rai_inv,sk,sk2,lij,dlij,duij;
    __m128d uij,lij2,uij2,lij3,uij3,diff2;
    __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
    __m128d mask_cmp,mask_cmp2,mask_cmp3;
    __m128d xmm1,xmm2,xmm3,xmm4,xmm7,xmm8,xmm9,doffset;
    __m128d sum_ai,chrule,chrule_ai,tmp_ai;
    __m128d sk_ai, sk2_ai,raj,raj_inv;

    const __m128d neg   = {-1.0, -1.0};
    const __m128d zero  = {0.0, 0.0};
    const __m128d eigth = {0.125, 0.125};
    const __m128d qrtr  = {0.25, 0.25};
    const __m128d half  = {0.5, 0.5};
    const __m128d one   = {1.0, 1.0};
    const __m128d two   = {2.0, 2.0};
    const __m128d three = {3.0, 3.0};

    /* Keep the compiler happy */
    tmp_ai  = _mm_setzero_pd();
    tmp     = _mm_setzero_pd();
    xmm7    = _mm_setzero_pd();
    xmm8    = _mm_setzero_pd();
    raj_inv = _mm_setzero_pd();
    raj     = _mm_setzero_pd();
    sk      = _mm_setzero_pd();
    jx      = _mm_setzero_pd();
    jy      = _mm_setzero_pd();
    jz      = _mm_setzero_pd();

    /* Set the dielectric offset */
    doff    = born->gb_doffset;
    doffset = _mm_load1_pd(&doff);

    n = 0;

    for(i=0;i<born->nr;i++)
    {
        born->gpol_hct_work[i] = 0;
    }

    for(i=0;i<nl->nri;i++)
    {
        ai  = nl->iinr[i];
        ai3 = ai * 3;

        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        /* Load shifts for this list */
        shift = nl->shift[i];
        shX   = fr->shift_vec[shift][0];
        shY   = fr->shift_vec[shift][1];
        shZ   = fr->shift_vec[shift][2];

        /* Splat the shifts */
        sX = _mm_load1_pd(&shX);
        sY = _mm_load1_pd(&shY);
        sZ = _mm_load1_pd(&shZ);

        offset = (nj1-nj0)%2;

        /* Load the offset radius of atom ai and its inverse */
        rr      = top->atomtypes.gb_radius[md->typeA[ai]]-doff;
        rai     = _mm_load1_pd(&rr);
        rr      = 1.0/rr;
        rai_inv = _mm_load1_pd(&rr);

        /* Zero out sums for polarisation energies */
        sum_ai = _mm_setzero_pd();

        /* Load ai coordinates and add shifts */
        ix = _mm_load1_pd(x+ai3);
        iy = _mm_load1_pd(x+ai3+1);
        iz = _mm_load1_pd(x+ai3+2);

        ix = _mm_add_pd(sX,ix);
        iy = _mm_add_pd(sY,iy);
        iz = _mm_add_pd(sZ,iz);

        sk_ai  = _mm_load1_pd(born->param+ai);
        sk2_ai = _mm_mul_pd(sk_ai,sk_ai);
723 for(k
=nj0
;k
<nj1
-offset
;k
+=2)
731 /* Load particle aj1-2 coordinates */
732 xmm1
= _mm_loadu_pd(x
+aj13
);
733 xmm2
= _mm_loadu_pd(x
+aj23
);
734 jx
= _mm_shuffle_pd(xmm1
,xmm2
,_MM_SHUFFLE2(0,0));
735 jy
= _mm_shuffle_pd(xmm1
,xmm2
,_MM_SHUFFLE2(1,1));
737 jz
= _mm_loadl_pd(jz
, x
+aj13
+2);
738 jz
= _mm_loadh_pd(jz
, x
+aj23
+2);
740 dx
= _mm_sub_pd(ix
,jx
);
741 dy
= _mm_sub_pd(iy
,jy
);
742 dz
= _mm_sub_pd(iz
,jz
);
744 rsq11
= _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx
,dx
) , _mm_mul_pd(dy
,dy
) ) , _mm_mul_pd(dz
,dz
) );
745 rinv
= my_invrsq_pd(rsq11
);
746 r
= _mm_mul_pd(rinv
,rsq11
);
748 sk
= _mm_loadl_pd(sk
,born
->param
+aj1
);
749 sk
= _mm_loadh_pd(sk
,born
->param
+aj2
);
751 /* Load aj1,aj2 raj */
755 raj
= _mm_loadl_pd(raj
,top
->atomtypes
.gb_radius
+p1
);
756 raj
= _mm_loadh_pd(raj
,top
->atomtypes
.gb_radius
+p2
);
757 raj
= _mm_sub_pd(raj
,doffset
);
759 /* Compute 1.0/raj */
760 raj_inv
= my_inv_pd(raj
);
762 /* INTERACTION aj->ai STARS HERE */
763 /* conditional mask for rai<dr+sk */
764 xmm1
= _mm_add_pd(r
,sk
);
765 mask_cmp
= _mm_cmplt_pd(rai
,xmm1
);
767 /* conditional for rai>dr-sk, ends with mask_cmp2 */
768 xmm2
= _mm_sub_pd(r
,sk
);
769 xmm3
= my_inv_pd(xmm2
);
770 mask_cmp2
= _mm_cmpgt_pd(rai
,xmm2
);
772 lij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,rai_inv
) ,_mm_andnot_pd(mask_cmp2
,xmm3
)); /*conditional as a mask*/
773 dlij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,zero
) ,_mm_andnot_pd(mask_cmp2
,one
));
775 uij
= my_inv_pd(xmm1
);
776 lij2
= _mm_mul_pd(lij
,lij
);
777 lij3
= _mm_mul_pd(lij2
,lij
);
778 uij2
= _mm_mul_pd(uij
,uij
);
779 uij3
= _mm_mul_pd(uij2
,uij
);
781 diff2
= _mm_sub_pd(uij2
,lij2
);
783 lij_inv
= my_invrsq_pd(lij2
);
784 sk2
= _mm_mul_pd(sk
,sk
);
785 sk2_inv
= _mm_mul_pd(sk2
,rinv
);
786 prod
= _mm_mul_pd(qrtr
,sk2_inv
);
788 log_term
= _mm_mul_pd(uij
,lij_inv
);
789 log_term
= log_pd(log_term
);
791 xmm1
= _mm_sub_pd(lij
,uij
);
792 xmm2
= _mm_mul_pd(qrtr
,r
);
793 xmm2
= _mm_mul_pd(xmm2
,diff2
);
794 xmm1
= _mm_add_pd(xmm1
,xmm2
);
795 xmm2
= _mm_mul_pd(half
,rinv
);
796 xmm2
= _mm_mul_pd(xmm2
,log_term
);
797 xmm1
= _mm_add_pd(xmm1
,xmm2
);
798 xmm9
= _mm_mul_pd(neg
,diff2
);
799 xmm2
= _mm_mul_pd(xmm9
,prod
);
800 tmp_ai
= _mm_add_pd(xmm1
,xmm2
);
802 /* contitional for rai<sk-dr */
803 xmm3
= _mm_sub_pd(sk
,r
);
804 mask_cmp3
= _mm_cmplt_pd(rai
,xmm3
); /* rai<sk-dr */
806 xmm4
= _mm_sub_pd(rai_inv
,lij
);
807 xmm4
= _mm_mul_pd(two
,xmm4
);
808 xmm4
= _mm_add_pd(tmp_ai
,xmm4
);
810 tmp_ai
= _mm_or_pd(_mm_and_pd(mask_cmp3
,xmm4
) ,_mm_andnot_pd(mask_cmp3
,tmp_ai
)); /*conditional as a mask*/
812 /* the tmp will now contain two partial values, that not all are to be used. Which */
813 /* ones are governed by the mask_cmp mask. */
814 tmp_ai
= _mm_mul_pd(half
,tmp_ai
);
815 tmp_ai
= _mm_or_pd(_mm_and_pd(mask_cmp
,tmp_ai
) ,_mm_andnot_pd(mask_cmp
,zero
)); /*conditional as a mask*/
816 sum_ai
= _mm_add_pd(sum_ai
,tmp_ai
);
818 xmm2
= _mm_mul_pd(half
,lij2
);
819 xmm3
= _mm_mul_pd(prod
,lij3
);
820 xmm2
= _mm_add_pd(xmm2
,xmm3
);
821 xmm3
= _mm_mul_pd(lij
,rinv
);
822 xmm4
= _mm_mul_pd(lij3
,r
);
823 xmm3
= _mm_add_pd(xmm3
,xmm4
);
824 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
825 t1
= _mm_sub_pd(xmm2
,xmm3
);
827 xmm2
= _mm_mul_pd(half
,uij2
);
828 xmm2
= _mm_mul_pd(neg
,xmm2
);
829 xmm3
= _mm_mul_pd(qrtr
,sk2_inv
);
830 xmm3
= _mm_mul_pd(xmm3
,uij3
);
831 xmm2
= _mm_sub_pd(xmm2
,xmm3
);
832 xmm3
= _mm_mul_pd(uij
,rinv
);
833 xmm4
= _mm_mul_pd(uij3
,r
);
834 xmm3
= _mm_add_pd(xmm3
,xmm4
);
835 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
836 t2
= _mm_add_pd(xmm2
,xmm3
);
838 xmm2
= _mm_mul_pd(sk2_inv
,rinv
);
839 xmm2
= _mm_add_pd(one
,xmm2
);
840 xmm2
= _mm_mul_pd(eigth
,xmm2
);
841 xmm2
= _mm_mul_pd(xmm2
,xmm9
);
842 xmm3
= _mm_mul_pd(log_term
, rinv
);
843 xmm3
= _mm_mul_pd(xmm3
,rinv
);
844 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
845 t3
= _mm_add_pd(xmm2
,xmm3
);
847 /* chain rule terms */
848 xmm2
= _mm_mul_pd(dlij
,t1
);
849 xmm2
= _mm_add_pd(xmm2
,t2
);
850 xmm2
= _mm_add_pd(xmm2
,t3
);
852 /* temporary storage of chain rule terms, since we have to compute
853 the reciprocal terms also before storing them */
854 chrule
= _mm_mul_pd(xmm2
,rinv
);
856 /* INTERACTION ai->aj starts here */
857 /* Conditional mask for raj<dr+sk_ai */
858 xmm1
= _mm_add_pd(r
,sk_ai
);
859 mask_cmp
= _mm_cmplt_pd(raj
,xmm1
);
861 /* Conditional for rai>dr-sk, ends with mask_cmp2 */
862 xmm2
= _mm_sub_pd(r
,sk_ai
);
863 xmm3
= my_inv_pd(xmm2
);
864 mask_cmp2
= _mm_cmpgt_pd(raj
,xmm2
);
866 lij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,raj_inv
) ,_mm_andnot_pd(mask_cmp2
,xmm3
)); /*conditional as a mask*/
867 dlij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,zero
) ,_mm_andnot_pd(mask_cmp2
,one
));
869 uij
= my_inv_pd(xmm1
);
870 lij2
= _mm_mul_pd(lij
,lij
);
871 lij3
= _mm_mul_pd(lij2
,lij
);
872 uij2
= _mm_mul_pd(uij
,uij
);
873 uij3
= _mm_mul_pd(uij2
,uij
);
875 diff2
= _mm_sub_pd(uij2
,lij2
);
877 lij_inv
= my_invrsq_pd(lij2
);
880 sk2_inv
= _mm_mul_pd(sk2
,rinv
);
881 prod
= _mm_mul_pd(qrtr
,sk2_inv
);
883 log_term
= _mm_mul_pd(uij
,lij_inv
);
884 log_term
= log_pd(log_term
);
886 xmm1
= _mm_sub_pd(lij
,uij
);
887 xmm2
= _mm_mul_pd(qrtr
,r
);
888 xmm2
= _mm_mul_pd(xmm2
,diff2
);
889 xmm1
= _mm_add_pd(xmm1
,xmm2
);
890 xmm2
= _mm_mul_pd(half
,rinv
);
891 xmm2
= _mm_mul_pd(xmm2
,log_term
);
892 xmm1
= _mm_add_pd(xmm1
,xmm2
);
893 xmm9
= _mm_mul_pd(neg
,diff2
);
894 xmm2
= _mm_mul_pd(xmm9
,prod
);
895 tmp
= _mm_add_pd(xmm1
,xmm2
);
897 /* contitional for rai<sk_ai-dr */
898 xmm3
= _mm_sub_pd(sk_ai
,r
);
899 mask_cmp3
= _mm_cmplt_pd(raj
,xmm3
); /* rai<sk-dr */
901 xmm4
= _mm_sub_pd(raj_inv
,lij
);
902 xmm4
= _mm_mul_pd(two
,xmm4
);
903 xmm4
= _mm_add_pd(tmp
,xmm4
);
905 tmp
= _mm_or_pd(_mm_and_pd(mask_cmp3
,xmm4
) ,_mm_andnot_pd(mask_cmp3
,tmp
)); /*conditional as a mask*/
907 /* the tmp will now contain two partial values, that not all are to be used. Which */
908 /* ones are governed by the mask_cmp mask. */
909 tmp
= _mm_mul_pd(half
,tmp
);
910 tmp
= _mm_or_pd(_mm_and_pd(mask_cmp
,tmp
) ,_mm_andnot_pd(mask_cmp
,zero
)); /*conditional as a mask*/
912 /* Load, add and store ai->aj pol energy */
913 xmm7
= _mm_loadl_pd(xmm7
,born
->gpol_hct_work
+aj1
);
914 xmm7
= _mm_loadh_pd(xmm7
,born
->gpol_hct_work
+aj2
);
916 xmm7
= _mm_add_pd(xmm7
,tmp
);
918 _mm_storel_pd(born
->gpol_hct_work
+aj1
,xmm7
);
919 _mm_storeh_pd(born
->gpol_hct_work
+aj2
,xmm7
);
921 /* Start chain rule terms */
922 xmm2
= _mm_mul_pd(half
,lij2
);
923 xmm3
= _mm_mul_pd(prod
,lij3
);
924 xmm2
= _mm_add_pd(xmm2
,xmm3
);
925 xmm3
= _mm_mul_pd(lij
,rinv
);
926 xmm4
= _mm_mul_pd(lij3
,r
);
927 xmm3
= _mm_add_pd(xmm3
,xmm4
);
928 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
929 t1
= _mm_sub_pd(xmm2
,xmm3
);
931 xmm2
= _mm_mul_pd(half
,uij2
);
932 xmm2
= _mm_mul_pd(neg
,xmm2
);
933 xmm3
= _mm_mul_pd(qrtr
,sk2_inv
);
934 xmm3
= _mm_mul_pd(xmm3
,uij3
);
935 xmm2
= _mm_sub_pd(xmm2
,xmm3
);
936 xmm3
= _mm_mul_pd(uij
,rinv
);
937 xmm4
= _mm_mul_pd(uij3
,r
);
938 xmm3
= _mm_add_pd(xmm3
,xmm4
);
939 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
940 t2
= _mm_add_pd(xmm2
,xmm3
);
942 xmm2
= _mm_mul_pd(sk2_inv
,rinv
);
943 xmm2
= _mm_add_pd(one
,xmm2
);
944 xmm2
= _mm_mul_pd(eigth
,xmm2
);
945 xmm2
= _mm_mul_pd(xmm2
,xmm9
);
946 xmm3
= _mm_mul_pd(log_term
, rinv
);
947 xmm3
= _mm_mul_pd(xmm3
,rinv
);
948 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
949 t3
= _mm_add_pd(xmm2
,xmm3
);
951 /* chain rule terms */
952 xmm2
= _mm_mul_pd(dlij
,t1
);
953 xmm2
= _mm_add_pd(xmm2
,t2
);
954 xmm2
= _mm_add_pd(xmm2
,t3
);
955 chrule_ai
= _mm_mul_pd(xmm2
,rinv
);
957 /* Store chain rule terms
958 * same unpacking rationale as with Still above
960 xmm3
= _mm_unpacklo_pd(chrule
, chrule_ai
);
961 xmm4
= _mm_unpackhi_pd(chrule
, chrule_ai
);
963 _mm_storeu_pd(fr
->dadx
+n
, xmm3
);
965 _mm_storeu_pd(fr
->dadx
+n
, xmm4
);
975 jx
= _mm_load_sd(x
+aj13
);
976 jy
= _mm_load_sd(x
+aj13
+1);
977 jz
= _mm_load_sd(x
+aj13
+2);
979 sk
= _mm_load_sd(born
->param
+aj1
);
981 dx
= _mm_sub_sd(ix
,jx
);
982 dy
= _mm_sub_pd(iy
,jy
);
983 dz
= _mm_sub_pd(iz
,jz
);
985 rsq11
= _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx
,dx
) , _mm_mul_sd(dy
,dy
) ) , _mm_mul_sd(dz
,dz
) );
986 rinv
= my_invrsq_pd(rsq11
);
987 r
= _mm_mul_sd(rinv
,rsq11
);
990 raj
= _mm_load_sd(top
->atomtypes
.gb_radius
+p1
);
991 raj
= _mm_sub_sd(raj
,doffset
);
992 raj_inv
= my_inv_pd(raj
);
994 /* OFFSET INTERATIONS aj->ai STARTS HERE */
995 /* conditional mask for rai<dr+sk */
996 xmm1
= _mm_add_sd(r
,sk
);
997 mask_cmp
= _mm_cmplt_sd(rai
,xmm1
);
999 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1000 xmm2
= _mm_sub_sd(r
,sk
);
1001 xmm3
= my_inv_pd(xmm2
);
1002 mask_cmp2
= _mm_cmpgt_pd(rai
,xmm2
);
1004 lij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,rai_inv
) ,_mm_andnot_pd(mask_cmp2
,xmm3
)); /*conditional as a mask*/
1005 dlij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,zero
) ,_mm_andnot_pd(mask_cmp2
,one
));
1007 uij
= my_inv_pd(xmm1
);
1008 lij2
= _mm_mul_sd(lij
,lij
);
1009 lij3
= _mm_mul_sd(lij2
,lij
);
1010 uij2
= _mm_mul_sd(uij
,uij
);
1011 uij3
= _mm_mul_sd(uij2
,uij
);
1013 diff2
= _mm_sub_sd(uij2
,lij2
);
1015 lij_inv
= my_invrsq_pd(lij2
);
1016 sk2
= _mm_mul_sd(sk
,sk
);
1017 sk2_inv
= _mm_mul_sd(sk2
,rinv
);
1018 prod
= _mm_mul_sd(qrtr
,sk2_inv
);
1020 log_term
= _mm_mul_pd(uij
,lij_inv
);
1021 log_term
= log_pd(log_term
);
1023 xmm1
= _mm_sub_sd(lij
,uij
);
1024 xmm2
= _mm_mul_sd(qrtr
,r
);
1025 xmm2
= _mm_mul_sd(xmm2
,diff2
);
1026 xmm1
= _mm_add_sd(xmm1
,xmm2
);
1027 xmm2
= _mm_mul_sd(half
,rinv
);
1028 xmm2
= _mm_mul_sd(xmm2
,log_term
);
1029 xmm1
= _mm_add_sd(xmm1
,xmm2
);
1030 xmm9
= _mm_mul_sd(neg
,diff2
);
1031 xmm2
= _mm_mul_sd(xmm9
,prod
);
1033 tmp_ai
= _mm_add_sd(xmm1
,xmm2
);
1035 /* contitional for rai<sk-dr */
1036 xmm3
= _mm_sub_sd(sk
,r
);
1037 mask_cmp3
= _mm_cmplt_sd(rai
,xmm3
); /* rai<sk-dr */
1039 xmm4
= _mm_sub_sd(rai_inv
,lij
);
1040 xmm4
= _mm_mul_sd(two
,xmm4
);
1041 xmm4
= _mm_add_sd(tmp_ai
,xmm4
);
1043 tmp_ai
= _mm_or_pd(_mm_and_pd(mask_cmp3
,xmm4
) ,_mm_andnot_pd(mask_cmp3
,tmp_ai
)); /*conditional as a mask*/
1045 /* the tmp will now contain two partial values, that not all are to be used. Which */
1046 /* ones are governed by the mask_cmp mask. */
1047 tmp_ai
= _mm_mul_pd(half
,tmp_ai
);
1048 tmp_ai
= _mm_or_pd(_mm_and_pd(mask_cmp
,tmp_ai
) ,_mm_andnot_pd(mask_cmp
,zero
)); /*conditional as a mask*/
1049 sum_ai
= _mm_add_sd(sum_ai
,tmp_ai
);
1051 xmm2
= _mm_mul_sd(half
,lij2
);
1052 xmm3
= _mm_mul_sd(prod
,lij3
);
1053 xmm2
= _mm_add_sd(xmm2
,xmm3
);
1054 xmm3
= _mm_mul_sd(lij
,rinv
);
1055 xmm4
= _mm_mul_sd(lij3
,r
);
1056 xmm3
= _mm_add_sd(xmm3
,xmm4
);
1057 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1058 t1
= _mm_sub_sd(xmm2
,xmm3
);
1060 xmm2
= _mm_mul_sd(half
,uij2
);
1061 xmm2
= _mm_mul_sd(neg
,xmm2
);
1062 xmm3
= _mm_mul_sd(qrtr
,sk2_inv
);
1063 xmm3
= _mm_mul_sd(xmm3
,uij3
);
1064 xmm2
= _mm_sub_sd(xmm2
,xmm3
);
1065 xmm3
= _mm_mul_sd(uij
,rinv
);
1066 xmm4
= _mm_mul_sd(uij3
,r
);
1067 xmm3
= _mm_add_sd(xmm3
,xmm4
);
1068 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1069 t2
= _mm_add_sd(xmm2
,xmm3
);
1071 xmm2
= _mm_mul_sd(sk2_inv
,rinv
);
1072 xmm2
= _mm_add_sd(one
,xmm2
);
1073 xmm2
= _mm_mul_sd(eigth
,xmm2
);
1074 xmm2
= _mm_mul_sd(xmm2
,xmm9
);
1075 xmm3
= _mm_mul_sd(log_term
, rinv
);
1076 xmm3
= _mm_mul_sd(xmm3
,rinv
);
1077 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1078 t3
= _mm_add_sd(xmm2
,xmm3
);
1080 /* chain rule terms */
1081 xmm2
= _mm_mul_sd(dlij
,t1
);
1082 xmm2
= _mm_add_sd(xmm2
,t2
);
1083 xmm2
= _mm_add_sd(xmm2
,t3
);
1085 /* temporary storage of chain rule terms, since we have to compute
1086 the reciprocal terms also before storing them */
1087 chrule
= _mm_mul_sd(xmm2
,rinv
);
1089 /* OFFSET INTERACTION ai->aj starts here */
1090 /* conditional mask for raj<dr+sk */
1091 xmm1
= _mm_add_sd(r
,sk_ai
);
1092 mask_cmp
= _mm_cmplt_sd(raj
,xmm1
);
1094 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1095 xmm2
= _mm_sub_sd(r
,sk_ai
);
1096 xmm3
= my_inv_pd(xmm2
);
1097 mask_cmp2
= _mm_cmpgt_pd(raj
,xmm2
);
1099 lij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,raj_inv
) ,_mm_andnot_pd(mask_cmp2
,xmm3
)); /*conditional as a mask*/
1100 dlij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,zero
) ,_mm_andnot_pd(mask_cmp2
,one
));
1102 uij
= my_inv_pd(xmm1
);
1103 lij2
= _mm_mul_sd(lij
,lij
);
1104 lij3
= _mm_mul_sd(lij2
,lij
);
1105 uij2
= _mm_mul_sd(uij
,uij
);
1106 uij3
= _mm_mul_sd(uij2
,uij
);
1108 diff2
= _mm_sub_sd(uij2
,lij2
);
1110 lij_inv
= my_invrsq_pd(lij2
);
1113 sk2_inv
= _mm_mul_sd(sk2
,rinv
);
1114 prod
= _mm_mul_sd(qrtr
,sk2_inv
);
1116 log_term
= _mm_mul_pd(uij
,lij_inv
);
1117 log_term
= log_pd(log_term
);
1119 xmm1
= _mm_sub_sd(lij
,uij
);
1120 xmm2
= _mm_mul_sd(qrtr
,r
);
1121 xmm2
= _mm_mul_sd(xmm2
,diff2
);
1122 xmm1
= _mm_add_sd(xmm1
,xmm2
);
1123 xmm2
= _mm_mul_sd(half
,rinv
);
1124 xmm2
= _mm_mul_sd(xmm2
,log_term
);
1125 xmm1
= _mm_add_sd(xmm1
,xmm2
);
1126 xmm9
= _mm_mul_sd(neg
,diff2
);
1127 xmm2
= _mm_mul_sd(xmm9
,prod
);
1128 tmp
= _mm_add_sd(xmm1
,xmm2
);
1130 /* contitional for rai<sk-dr */
1131 xmm3
= _mm_sub_sd(sk_ai
,r
);
1132 mask_cmp3
= _mm_cmplt_sd(raj
,xmm3
); /* rai<sk-dr */
1134 xmm4
= _mm_sub_sd(raj_inv
,lij
);
1135 xmm4
= _mm_mul_sd(two
,xmm4
);
1136 xmm4
= _mm_add_sd(tmp
,xmm4
);
1138 tmp
= _mm_or_pd(_mm_and_pd(mask_cmp3
,xmm4
) ,_mm_andnot_pd(mask_cmp3
,tmp
)); /*conditional as a mask*/
1140 /* the tmp will now contain two partial values, that not all are to be used. Which */
1141 /* ones are governed by the mask_cmp mask. */
1142 tmp
= _mm_mul_pd(half
,tmp
);
1143 tmp
= _mm_or_pd(_mm_and_pd(mask_cmp
,tmp
) ,_mm_andnot_pd(mask_cmp
,zero
)); /*conditional as a mask*/
1145 /* Load, add and store gpol energy */
1146 xmm7
= _mm_load_sd(born
->gpol_hct_work
+aj1
);
1147 xmm7
= _mm_add_sd(xmm7
,tmp
);
1148 _mm_store_sd(born
->gpol_hct_work
+aj1
,xmm7
);
1150 /* Start chain rule terms, t1 */
1151 xmm2
= _mm_mul_sd(half
,lij2
);
1152 xmm3
= _mm_mul_sd(prod
,lij3
);
1153 xmm2
= _mm_add_sd(xmm2
,xmm3
);
1154 xmm3
= _mm_mul_sd(lij
,rinv
);
1155 xmm4
= _mm_mul_sd(lij3
,r
);
1156 xmm3
= _mm_add_sd(xmm3
,xmm4
);
1157 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1158 t1
= _mm_sub_sd(xmm2
,xmm3
);
1160 xmm2
= _mm_mul_sd(half
,uij2
);
1161 xmm2
= _mm_mul_sd(neg
,xmm2
);
1162 xmm3
= _mm_mul_sd(qrtr
,sk2_inv
);
1163 xmm3
= _mm_mul_sd(xmm3
,uij3
);
1164 xmm2
= _mm_sub_sd(xmm2
,xmm3
);
1165 xmm3
= _mm_mul_sd(uij
,rinv
);
1166 xmm4
= _mm_mul_sd(uij3
,r
);
1167 xmm3
= _mm_add_sd(xmm3
,xmm4
);
1168 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1169 t2
= _mm_add_sd(xmm2
,xmm3
);
1171 xmm2
= _mm_mul_sd(sk2_inv
,rinv
);
1172 xmm2
= _mm_add_sd(one
,xmm2
);
1173 xmm2
= _mm_mul_sd(eigth
,xmm2
);
1174 xmm2
= _mm_mul_sd(xmm2
,xmm9
);
1175 xmm3
= _mm_mul_sd(log_term
, rinv
);
1176 xmm3
= _mm_mul_sd(xmm3
,rinv
);
1177 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1178 t3
= _mm_add_sd(xmm2
,xmm3
);
1180 /* chain rule terms */
1181 xmm2
= _mm_mul_sd(dlij
,t1
);
1182 xmm2
= _mm_add_sd(xmm2
,t2
);
1183 xmm2
= _mm_add_sd(xmm2
,t3
);
1184 chrule_ai
= _mm_mul_sd(xmm2
,rinv
);
1186 _mm_store_sd(fr
->dadx
+n
, chrule
);
1188 _mm_store_sd(fr
->dadx
+n
, chrule_ai
);
        /* Do end processing ... */
        tmp_ai = _mm_unpacklo_pd(tmp_ai,sum_ai);
        sum_ai = _mm_add_pd(sum_ai,tmp_ai);
        sum_ai = _mm_shuffle_pd(sum_ai,sum_ai,_MM_SHUFFLE2(1,1));

        /* Load, add and store atom ai polarisation energy */
        xmm2   = _mm_load_sd(born->gpol_hct_work+ai);
        sum_ai = _mm_add_sd(sum_ai,xmm2);
        _mm_store_sd(born->gpol_hct_work+ai,sum_ai);
    }

    /* Parallel summations */
    if(PARTDECOMP(cr))
    {
        gmx_sum(natoms, born->gpol_hct_work, cr);
    }
    else if(DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, born->gpol_hct_work);
    }

    /* Compute the radii */
    for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
    {
        if(born->use[i] != 0)
        {
            rr      = top->atomtypes.gb_radius[md->typeA[i]]-doff;
            sum     = 1.0/rr - born->gpol_hct_work[i];
            min_rad = rr + doff;
            rad     = 1.0/sum;

            born->bRad[i]   = rad > min_rad ? rad : min_rad;
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if(DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}
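/*
 * Readability sketch only (not called by the kernels; assumes <math.h>): the per-pair
 * HCT/OBC descreening term accumulated into gpol_hct_work above, written as scalar math.
 * r is the interatomic distance, rai the (offset) radius of atom i and sk the scaled
 * radius of the descreening atom j; lij and uij are the inverse integration limits.
 */
static double
example_hct_pair_term(double r, double rai, double sk)
{
    double lij, uij, diff2, term;

    if (rai >= r + sk)
    {
        return 0.0;                         /* atom j does not descreen atom i */
    }

    lij   = (rai > r - sk) ? 1.0/rai : 1.0/(r - sk);
    uij   = 1.0/(r + sk);
    diff2 = uij*uij - lij*lij;

    term  = lij - uij
          + 0.25*r*diff2
          + (0.5/r)*log(uij/lij)
          - 0.25*(sk*sk/r)*diff2;

    if (rai < sk - r)
    {
        term += 2.0*(1.0/rai - lij);        /* atom i lies completely inside atom j */
    }

    return 0.5*term;
}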
int
calc_gb_rad_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
                            const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md)
{
    int i,k,n,ai,ai3,aj1,aj2,aj13,aj23,nj0,nj1,at0,at1,offset;
    int p1,p2,p3,p4,shift;
    double rr,sum,sum_tmp,sum2,sum3,min_rad,rad,doff;
    double tsum,tchain,rr_inv,rr_inv2,gbr;
    double shX,shY,shZ;

    __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
    __m128d t1,t2,t3,rsq11,rinv,r,rai;
    __m128d rai_inv,sk,sk2,lij,dlij,duij;
    __m128d uij,lij2,uij2,lij3,uij3,diff2;
    __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
    __m128d mask_cmp,mask_cmp2,mask_cmp3,doffset,raj,raj_inv;
    __m128d xmm1,xmm2,xmm3,xmm4,xmm7,xmm8,xmm9;
    __m128d sum_ai,chrule,chrule_ai,tmp_ai,sk_ai,sk2_ai;

    const __m128d neg   = {-1.0, -1.0};
    const __m128d zero  = {0.0, 0.0};
    const __m128d eigth = {0.125, 0.125};
    const __m128d qrtr  = {0.25, 0.25};
    const __m128d half  = {0.5, 0.5};
    const __m128d one   = {1.0, 1.0};
    const __m128d two   = {2.0, 2.0};
    const __m128d three = {3.0, 3.0};

    /* Keep the compiler happy */
    tmp_ai  = _mm_setzero_pd();
    tmp     = _mm_setzero_pd();
    raj_inv = _mm_setzero_pd();
    raj     = _mm_setzero_pd();
    sk      = _mm_setzero_pd();
    jx      = _mm_setzero_pd();
    jy      = _mm_setzero_pd();
    jz      = _mm_setzero_pd();
    xmm7    = _mm_setzero_pd();
    xmm8    = _mm_setzero_pd();

    /* Set the dielectric offset */
    doff    = born->gb_doffset;
    doffset = _mm_load1_pd(&doff);

    n = 0;

    for(i=0;i<born->nr;i++)
    {
        born->gpol_hct_work[i] = 0;
    }

    for(i=0;i<nl->nri;i++)
    {
        ai  = nl->iinr[i];
        ai3 = ai * 3;

        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        /* Load shifts for this list */
        shift = nl->shift[i];
        shX   = fr->shift_vec[shift][0];
        shY   = fr->shift_vec[shift][1];
        shZ   = fr->shift_vec[shift][2];

        /* Splat the shifts */
        sX = _mm_load1_pd(&shX);
        sY = _mm_load1_pd(&shY);
        sZ = _mm_load1_pd(&shZ);

        offset = (nj1-nj0)%2;

        /* Load the offset radius of atom ai and its inverse */
        rr      = top->atomtypes.gb_radius[md->typeA[ai]]-doff;
        rai     = _mm_load1_pd(&rr);
        rr      = 1.0/rr;
        rai_inv = _mm_load1_pd(&rr);

        /* Load ai coordinates and add shifts */
        ix = _mm_load1_pd(x+ai3);
        iy = _mm_load1_pd(x+ai3+1);
        iz = _mm_load1_pd(x+ai3+2);

        ix = _mm_add_pd(sX,ix);
        iy = _mm_add_pd(sY,iy);
        iz = _mm_add_pd(sZ,iz);

        /* Zero out sums for polarisation energies */
        sum_ai = _mm_setzero_pd();

        sk_ai  = _mm_load1_pd(born->param+ai);
        sk2_ai = _mm_mul_pd(sk_ai,sk_ai);
1331 for(k
=nj0
;k
<nj1
-offset
;k
+=2)
1334 aj2
= nl
->jjnr
[k
+1];
1339 /* Load particle aj1-2 coordinates */
1340 xmm1
= _mm_loadu_pd(x
+aj13
);
1341 xmm2
= _mm_loadu_pd(x
+aj23
);
1342 jx
= _mm_shuffle_pd(xmm1
,xmm2
,_MM_SHUFFLE2(0,0));
1343 jy
= _mm_shuffle_pd(xmm1
,xmm2
,_MM_SHUFFLE2(1,1));
1345 jz
= _mm_loadl_pd(jz
, x
+aj13
+2);
1346 jz
= _mm_loadh_pd(jz
, x
+aj23
+2);
1348 dx
= _mm_sub_pd(ix
,jx
);
1349 dy
= _mm_sub_pd(iy
,jy
);
1350 dz
= _mm_sub_pd(iz
,jz
);
1352 rsq11
= _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx
,dx
) , _mm_mul_pd(dy
,dy
) ) , _mm_mul_pd(dz
,dz
) );
1353 rinv
= my_invrsq_pd(rsq11
);
1354 r
= _mm_mul_pd(rinv
,rsq11
);
1356 /* Load atom aj1,aj2 raj */
1357 p1
= md
->typeA
[aj1
];
1358 p2
= md
->typeA
[aj2
];
1360 raj
= _mm_loadl_pd(raj
,top
->atomtypes
.gb_radius
+p1
);
1361 raj
= _mm_loadh_pd(raj
,top
->atomtypes
.gb_radius
+p2
);
1362 raj
= _mm_sub_pd(raj
,doffset
);
1364 /* Compute 1.0/raj */
1365 raj_inv
= my_inv_pd(raj
);
1367 sk
= _mm_loadl_pd(sk
,born
->param
+aj1
);
1368 sk
= _mm_loadh_pd(sk
,born
->param
+aj2
);
1370 /* INTERACTION aj->ai STARTS HERE */
1371 /* conditional mask for rai<dr+sk */
1372 xmm1
= _mm_add_pd(r
,sk
);
1373 mask_cmp
= _mm_cmplt_pd(rai
,xmm1
);
1375 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1376 xmm2
= _mm_sub_pd(r
,sk
);
1377 xmm3
= my_inv_pd(xmm2
);
1378 mask_cmp2
= _mm_cmpgt_pd(rai
,xmm2
);
1380 lij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,rai_inv
) ,_mm_andnot_pd(mask_cmp2
,xmm3
)); /*conditional as a mask*/
1381 dlij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,zero
) ,_mm_andnot_pd(mask_cmp2
,one
));
1383 uij
= my_inv_pd(xmm1
);
1384 lij2
= _mm_mul_pd(lij
,lij
);
1385 lij3
= _mm_mul_pd(lij2
,lij
);
1386 uij2
= _mm_mul_pd(uij
,uij
);
1387 uij3
= _mm_mul_pd(uij2
,uij
);
1389 diff2
= _mm_sub_pd(uij2
,lij2
);
1391 lij_inv
= my_invrsq_pd(lij2
);
1392 sk2
= _mm_mul_pd(sk
,sk
);
1393 sk2_inv
= _mm_mul_pd(sk2
,rinv
);
1394 prod
= _mm_mul_pd(qrtr
,sk2_inv
);
1396 log_term
= _mm_mul_pd(uij
,lij_inv
);
1397 log_term
= log_pd(log_term
);
1399 xmm1
= _mm_sub_pd(lij
,uij
);
1400 xmm2
= _mm_mul_pd(qrtr
,r
);
1401 xmm2
= _mm_mul_pd(xmm2
,diff2
);
1402 xmm1
= _mm_add_pd(xmm1
,xmm2
);
1403 xmm2
= _mm_mul_pd(half
,rinv
);
1404 xmm2
= _mm_mul_pd(xmm2
,log_term
);
1405 xmm1
= _mm_add_pd(xmm1
,xmm2
);
1406 xmm9
= _mm_mul_pd(neg
,diff2
);
1407 xmm2
= _mm_mul_pd(xmm9
,prod
);
1408 tmp_ai
= _mm_add_pd(xmm1
,xmm2
);
1410 /* contitional for rai<sk-dr */
1411 xmm3
= _mm_sub_pd(sk
,r
);
1412 mask_cmp3
= _mm_cmplt_pd(rai
,xmm3
); /* rai<sk-dr */
1414 xmm4
= _mm_sub_pd(rai_inv
,lij
);
1415 xmm4
= _mm_mul_pd(two
,xmm4
);
1416 xmm4
= _mm_add_pd(tmp_ai
,xmm4
);
1418 tmp_ai
= _mm_or_pd(_mm_and_pd(mask_cmp3
,xmm4
) ,_mm_andnot_pd(mask_cmp3
,tmp_ai
)); /*conditional as a mask*/
1420 /* the tmp will now contain two partial values, that not all are to be used. Which */
1421 /* ones are governed by the mask_cmp mask. */
1422 tmp_ai
= _mm_mul_pd(half
,tmp_ai
);
1423 tmp_ai
= _mm_or_pd(_mm_and_pd(mask_cmp
,tmp_ai
) ,_mm_andnot_pd(mask_cmp
,zero
)); /*conditional as a mask*/
1424 sum_ai
= _mm_add_pd(sum_ai
,tmp_ai
);
1426 /* Start the dadx chain rule terms */
1427 xmm2
= _mm_mul_pd(half
,lij2
);
1428 xmm3
= _mm_mul_pd(prod
,lij3
);
1429 xmm2
= _mm_add_pd(xmm2
,xmm3
);
1430 xmm3
= _mm_mul_pd(lij
,rinv
);
1431 xmm4
= _mm_mul_pd(lij3
,r
);
1432 xmm3
= _mm_add_pd(xmm3
,xmm4
);
1433 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
1434 t1
= _mm_sub_pd(xmm2
,xmm3
);
1436 xmm2
= _mm_mul_pd(half
,uij2
);
1437 xmm2
= _mm_mul_pd(neg
,xmm2
);
1438 xmm3
= _mm_mul_pd(qrtr
,sk2_inv
);
1439 xmm3
= _mm_mul_pd(xmm3
,uij3
);
1440 xmm2
= _mm_sub_pd(xmm2
,xmm3
);
1441 xmm3
= _mm_mul_pd(uij
,rinv
);
1442 xmm4
= _mm_mul_pd(uij3
,r
);
1443 xmm3
= _mm_add_pd(xmm3
,xmm4
);
1444 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
1445 t2
= _mm_add_pd(xmm2
,xmm3
);
1447 xmm2
= _mm_mul_pd(sk2_inv
,rinv
);
1448 xmm2
= _mm_add_pd(one
,xmm2
);
1449 xmm2
= _mm_mul_pd(eigth
,xmm2
);
1450 xmm2
= _mm_mul_pd(xmm2
,xmm9
);
1451 xmm3
= _mm_mul_pd(log_term
, rinv
);
1452 xmm3
= _mm_mul_pd(xmm3
,rinv
);
1453 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
1454 t3
= _mm_add_pd(xmm2
,xmm3
);
1456 /* chain rule terms */
1457 xmm2
= _mm_mul_pd(dlij
,t1
);
1458 xmm2
= _mm_add_pd(xmm2
,t2
);
1459 xmm2
= _mm_add_pd(xmm2
,t3
);
1461 /* temporary storage of chain rule terms, since we have to compute
1462 the reciprocal terms also before storing them */
1463 chrule
= _mm_mul_pd(xmm2
,rinv
);
1465 /* INTERACTION ai->aj STARTS HERE */
1466 /* conditional mask for raj<dr+sk_ai */
1467 xmm1
= _mm_add_pd(r
,sk_ai
);
1468 mask_cmp
= _mm_cmplt_pd(raj
,xmm1
);
1470 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1471 xmm2
= _mm_sub_pd(r
,sk_ai
);
1472 xmm3
= my_inv_pd(xmm2
);
1473 mask_cmp2
= _mm_cmpgt_pd(raj
,xmm2
);
1475 lij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,raj_inv
) ,_mm_andnot_pd(mask_cmp2
,xmm3
)); /*conditional as a mask*/
1476 dlij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,zero
) ,_mm_andnot_pd(mask_cmp2
,one
));
1478 uij
= my_inv_pd(xmm1
);
1479 lij2
= _mm_mul_pd(lij
,lij
);
1480 lij3
= _mm_mul_pd(lij2
,lij
);
1481 uij2
= _mm_mul_pd(uij
,uij
);
1482 uij3
= _mm_mul_pd(uij2
,uij
);
1484 diff2
= _mm_sub_pd(uij2
,lij2
);
1486 lij_inv
= my_invrsq_pd(lij2
);
1489 sk2_inv
= _mm_mul_pd(sk2
,rinv
);
1490 prod
= _mm_mul_pd(qrtr
,sk2_inv
);
1492 log_term
= _mm_mul_pd(uij
,lij_inv
);
1493 log_term
= log_pd(log_term
);
1495 xmm1
= _mm_sub_pd(lij
,uij
);
1496 xmm2
= _mm_mul_pd(qrtr
,r
);
1497 xmm2
= _mm_mul_pd(xmm2
,diff2
);
1498 xmm1
= _mm_add_pd(xmm1
,xmm2
);
1499 xmm2
= _mm_mul_pd(half
,rinv
);
1500 xmm2
= _mm_mul_pd(xmm2
,log_term
);
1501 xmm1
= _mm_add_pd(xmm1
,xmm2
);
1502 xmm9
= _mm_mul_pd(neg
,diff2
);
1503 xmm2
= _mm_mul_pd(xmm9
,prod
);
1504 tmp
= _mm_add_pd(xmm1
,xmm2
);
1506 /* contitional for raj<sk_ai-dr */
1507 xmm3
= _mm_sub_pd(sk_ai
,r
);
1508 mask_cmp3
= _mm_cmplt_pd(raj
,xmm3
); /* rai<sk-dr */
1510 xmm4
= _mm_sub_pd(raj_inv
,lij
);
1511 xmm4
= _mm_mul_pd(two
,xmm4
);
1512 xmm4
= _mm_add_pd(tmp
,xmm4
);
1514 tmp
= _mm_or_pd(_mm_and_pd(mask_cmp3
,xmm4
) ,_mm_andnot_pd(mask_cmp3
,tmp
)); /*conditional as a mask*/
1516 /* the tmp will now contain two partial values, that not all are to be used. Which */
1517 /* ones are governed by the mask_cmp mask. */
1518 tmp
= _mm_mul_pd(half
,tmp
);
1519 tmp
= _mm_or_pd(_mm_and_pd(mask_cmp
,tmp
) ,_mm_andnot_pd(mask_cmp
,zero
)); /*conditional as a mask*/
1521 /* Load, add and store ai->aj pol energy */
1522 xmm7
= _mm_loadl_pd(xmm7
,born
->gpol_hct_work
+aj1
);
1523 xmm7
= _mm_loadh_pd(xmm7
,born
->gpol_hct_work
+aj2
);
1525 xmm7
= _mm_add_pd(xmm7
,tmp
);
1527 _mm_storel_pd(born
->gpol_hct_work
+aj1
,xmm7
);
1528 _mm_storeh_pd(born
->gpol_hct_work
+aj2
,xmm7
);
1530 /* Start dadx chain rule terms */
1531 xmm2
= _mm_mul_pd(half
,lij2
);
1532 xmm3
= _mm_mul_pd(prod
,lij3
);
1533 xmm2
= _mm_add_pd(xmm2
,xmm3
);
1534 xmm3
= _mm_mul_pd(lij
,rinv
);
1535 xmm4
= _mm_mul_pd(lij3
,r
);
1536 xmm3
= _mm_add_pd(xmm3
,xmm4
);
1537 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
1538 t1
= _mm_sub_pd(xmm2
,xmm3
);
1540 xmm2
= _mm_mul_pd(half
,uij2
);
1541 xmm2
= _mm_mul_pd(neg
,xmm2
);
1542 xmm3
= _mm_mul_pd(qrtr
,sk2_inv
);
1543 xmm3
= _mm_mul_pd(xmm3
,uij3
);
1544 xmm2
= _mm_sub_pd(xmm2
,xmm3
);
1545 xmm3
= _mm_mul_pd(uij
,rinv
);
1546 xmm4
= _mm_mul_pd(uij3
,r
);
1547 xmm3
= _mm_add_pd(xmm3
,xmm4
);
1548 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
1549 t2
= _mm_add_pd(xmm2
,xmm3
);
1551 xmm2
= _mm_mul_pd(sk2_inv
,rinv
);
1552 xmm2
= _mm_add_pd(one
,xmm2
);
1553 xmm2
= _mm_mul_pd(eigth
,xmm2
);
1554 xmm2
= _mm_mul_pd(xmm2
,xmm9
);
1555 xmm3
= _mm_mul_pd(log_term
, rinv
);
1556 xmm3
= _mm_mul_pd(xmm3
,rinv
);
1557 xmm3
= _mm_mul_pd(qrtr
,xmm3
);
1558 t3
= _mm_add_pd(xmm2
,xmm3
);
1560 /* chain rule terms */
1561 xmm2
= _mm_mul_pd(dlij
,t1
);
1562 xmm2
= _mm_add_pd(xmm2
,t2
);
1563 xmm2
= _mm_add_pd(xmm2
,t3
);
1564 chrule_ai
= _mm_mul_pd(xmm2
,rinv
);
1566 /* Store chain rule terms
1567 * same unpacking rationale as with Still above
1569 xmm3
= _mm_unpacklo_pd(chrule
, chrule_ai
);
1570 xmm4
= _mm_unpackhi_pd(chrule
, chrule_ai
);
1572 _mm_storeu_pd(fr
->dadx
+n
, xmm3
);
1574 _mm_storeu_pd(fr
->dadx
+n
, xmm4
);
1582 p1
= md
->typeA
[aj1
];
1584 jx
= _mm_load_sd(x
+aj13
);
1585 jy
= _mm_load_sd(x
+aj13
+1);
1586 jz
= _mm_load_sd(x
+aj13
+2);
1588 sk
= _mm_load_sd(born
->param
+aj1
);
1591 raj
= _mm_load_sd(top
->atomtypes
.gb_radius
+p1
);
1592 raj
= _mm_sub_sd(raj
,doffset
);
1593 raj_inv
= my_inv_pd(raj
);
1595 dx
= _mm_sub_sd(ix
,jx
);
1596 dy
= _mm_sub_pd(iy
,jy
);
1597 dz
= _mm_sub_pd(iz
,jz
);
1599 rsq11
= _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx
,dx
) , _mm_mul_sd(dy
,dy
) ) , _mm_mul_sd(dz
,dz
) );
1600 rinv
= my_invrsq_pd(rsq11
);
1601 r
= _mm_mul_sd(rinv
,rsq11
);
1603 /* OFFSET INTERACTION aj->ai STARTS HERE */
1604 /* conditional mask for rai<dr+sk */
1605 xmm1
= _mm_add_sd(r
,sk
);
1606 mask_cmp
= _mm_cmplt_sd(rai
,xmm1
);
1608 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1609 xmm2
= _mm_sub_sd(r
,sk
);
1610 xmm3
= my_inv_pd(xmm2
);
1611 mask_cmp2
= _mm_cmpgt_pd(rai
,xmm2
);
1613 lij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,rai_inv
) ,_mm_andnot_pd(mask_cmp2
,xmm3
)); /*conditional as a mask*/
1614 dlij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,zero
) ,_mm_andnot_pd(mask_cmp2
,one
));
1616 uij
= my_inv_pd(xmm1
);
1617 lij2
= _mm_mul_sd(lij
,lij
);
1618 lij3
= _mm_mul_sd(lij2
,lij
);
1619 uij2
= _mm_mul_sd(uij
,uij
);
1620 uij3
= _mm_mul_sd(uij2
,uij
);
1622 diff2
= _mm_sub_sd(uij2
,lij2
);
1624 lij_inv
= my_invrsq_pd(lij2
);
1625 sk2
= _mm_mul_sd(sk
,sk
);
1626 sk2_inv
= _mm_mul_sd(sk2
,rinv
);
1627 prod
= _mm_mul_sd(qrtr
,sk2_inv
);
1629 log_term
= _mm_mul_pd(uij
,lij_inv
);
1630 log_term
= log_pd(log_term
);
1632 xmm1
= _mm_sub_sd(lij
,uij
);
1633 xmm2
= _mm_mul_sd(qrtr
,r
);
1634 xmm2
= _mm_mul_sd(xmm2
,diff2
);
1635 xmm1
= _mm_add_sd(xmm1
,xmm2
);
1636 xmm2
= _mm_mul_sd(half
,rinv
);
1637 xmm2
= _mm_mul_sd(xmm2
,log_term
);
1638 xmm1
= _mm_add_sd(xmm1
,xmm2
);
1639 xmm9
= _mm_mul_sd(neg
,diff2
);
1640 xmm2
= _mm_mul_sd(xmm9
,prod
);
1642 tmp_ai
= _mm_add_sd(xmm1
,xmm2
);
1644 /* contitional for rai<sk-dr */
1645 xmm3
= _mm_sub_sd(sk
,r
);
1646 mask_cmp3
= _mm_cmplt_sd(rai
,xmm3
); /* rai<sk-dr */
1648 xmm4
= _mm_sub_sd(rai_inv
,lij
);
1649 xmm4
= _mm_mul_sd(two
,xmm4
);
1650 xmm4
= _mm_add_sd(tmp_ai
,xmm4
);
1652 tmp_ai
= _mm_or_pd(_mm_and_pd(mask_cmp3
,xmm4
) ,_mm_andnot_pd(mask_cmp3
,tmp_ai
)); /*conditional as a mask*/
1654 /* the tmp will now contain two partial values, that not all are to be used. Which */
1655 /* ones are governed by the mask_cmp mask. */
1656 tmp_ai
= _mm_mul_pd(half
,tmp_ai
);
1657 tmp_ai
= _mm_or_pd(_mm_and_pd(mask_cmp
,tmp_ai
) ,_mm_andnot_pd(mask_cmp
,zero
)); /*conditional as a mask*/
1658 sum_ai
= _mm_add_sd(sum_ai
,tmp_ai
);
1660 xmm2
= _mm_mul_sd(half
,lij2
);
1661 xmm3
= _mm_mul_sd(prod
,lij3
);
1662 xmm2
= _mm_add_sd(xmm2
,xmm3
);
1663 xmm3
= _mm_mul_sd(lij
,rinv
);
1664 xmm4
= _mm_mul_sd(lij3
,r
);
1665 xmm3
= _mm_add_sd(xmm3
,xmm4
);
1666 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1667 t1
= _mm_sub_sd(xmm2
,xmm3
);
1669 xmm2
= _mm_mul_sd(half
,uij2
);
1670 xmm2
= _mm_mul_sd(neg
,xmm2
);
1671 xmm3
= _mm_mul_sd(qrtr
,sk2_inv
);
1672 xmm3
= _mm_mul_sd(xmm3
,uij3
);
1673 xmm2
= _mm_sub_sd(xmm2
,xmm3
);
1674 xmm3
= _mm_mul_sd(uij
,rinv
);
1675 xmm4
= _mm_mul_sd(uij3
,r
);
1676 xmm3
= _mm_add_sd(xmm3
,xmm4
);
1677 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1678 t2
= _mm_add_sd(xmm2
,xmm3
);
1680 xmm2
= _mm_mul_sd(sk2_inv
,rinv
);
1681 xmm2
= _mm_add_sd(one
,xmm2
);
1682 xmm2
= _mm_mul_sd(eigth
,xmm2
);
1683 xmm2
= _mm_mul_sd(xmm2
,xmm9
);
1684 xmm3
= _mm_mul_sd(log_term
, rinv
);
1685 xmm3
= _mm_mul_sd(xmm3
,rinv
);
1686 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1687 t3
= _mm_add_sd(xmm2
,xmm3
);
1689 /* chain rule terms */
1690 xmm2
= _mm_mul_sd(dlij
,t1
);
1691 xmm2
= _mm_add_sd(xmm2
,t2
);
1692 xmm2
= _mm_add_sd(xmm2
,t3
);
1693 chrule
= _mm_mul_sd(xmm2
,rinv
);
1695 /* OFFSET INTERACTION ai->aj STARTS HERE */
1696 /* conditional mask for raj<dr+sk_ai */
1697 xmm1
= _mm_add_sd(r
,sk_ai
);
1698 mask_cmp
= _mm_cmplt_sd(raj
,xmm1
);
1700 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1701 xmm2
= _mm_sub_sd(r
,sk_ai
);
1702 xmm3
= my_inv_pd(xmm2
);
1703 mask_cmp2
= _mm_cmpgt_pd(raj
,xmm2
);
1705 lij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,raj_inv
) ,_mm_andnot_pd(mask_cmp2
,xmm3
)); /*conditional as a mask*/
1706 dlij
= _mm_or_pd(_mm_and_pd(mask_cmp2
,zero
) ,_mm_andnot_pd(mask_cmp2
,one
));
1708 uij
= my_inv_pd(xmm1
);
1709 lij2
= _mm_mul_sd(lij
,lij
);
1710 lij3
= _mm_mul_sd(lij2
,lij
);
1711 uij2
= _mm_mul_sd(uij
,uij
);
1712 uij3
= _mm_mul_sd(uij2
,uij
);
1714 diff2
= _mm_sub_sd(uij2
,lij2
);
1716 lij_inv
= my_invrsq_pd(lij2
);
1719 sk2_inv
= _mm_mul_sd(sk2
,rinv
);
1720 prod
= _mm_mul_sd(qrtr
,sk2_inv
);
1722 log_term
= _mm_mul_pd(uij
,lij_inv
);
1723 log_term
= log_pd(log_term
);
1725 xmm1
= _mm_sub_sd(lij
,uij
);
1726 xmm2
= _mm_mul_sd(qrtr
,r
);
1727 xmm2
= _mm_mul_sd(xmm2
,diff2
);
1728 xmm1
= _mm_add_sd(xmm1
,xmm2
);
1729 xmm2
= _mm_mul_sd(half
,rinv
);
1730 xmm2
= _mm_mul_sd(xmm2
,log_term
);
1731 xmm1
= _mm_add_sd(xmm1
,xmm2
);
1732 xmm9
= _mm_mul_sd(neg
,diff2
);
1733 xmm2
= _mm_mul_sd(xmm9
,prod
);
1734 tmp
= _mm_add_sd(xmm1
,xmm2
);
1736 /* contitional for raj<sk_ai-dr */
1737 xmm3
= _mm_sub_sd(sk_ai
,r
);
1738 mask_cmp3
= _mm_cmplt_sd(raj
,xmm3
); /* rai<sk-dr */
1740 xmm4
= _mm_sub_sd(raj_inv
,lij
);
1741 xmm4
= _mm_mul_sd(two
,xmm4
);
1742 xmm4
= _mm_add_sd(tmp
,xmm4
);
1744 tmp
= _mm_or_pd(_mm_and_pd(mask_cmp3
,xmm4
) ,_mm_andnot_pd(mask_cmp3
,tmp
)); /*conditional as a mask*/
1746 /* the tmp will now contain two partial values, that not all are to be used. Which */
1747 /* ones are governed by the mask_cmp mask. */
1748 tmp
= _mm_mul_pd(half
,tmp
);
1749 tmp
= _mm_or_pd(_mm_and_pd(mask_cmp
,tmp
) ,_mm_andnot_pd(mask_cmp
,zero
)); /*conditional as a mask*/
1751 /* Load, add and store gpol energy */
1752 xmm7
= _mm_load_sd(born
->gpol_hct_work
+aj1
);
1753 xmm7
= _mm_add_sd(xmm7
,tmp
);
1754 _mm_store_sd(born
->gpol_hct_work
+aj1
,xmm7
);
1756 /* Start chain rule terms, t1 */
1757 xmm2
= _mm_mul_sd(half
,lij2
);
1758 xmm3
= _mm_mul_sd(prod
,lij3
);
1759 xmm2
= _mm_add_sd(xmm2
,xmm3
);
1760 xmm3
= _mm_mul_sd(lij
,rinv
);
1761 xmm4
= _mm_mul_sd(lij3
,r
);
1762 xmm3
= _mm_add_sd(xmm3
,xmm4
);
1763 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1764 t1
= _mm_sub_sd(xmm2
,xmm3
);
1766 xmm2
= _mm_mul_sd(half
,uij2
);
1767 xmm2
= _mm_mul_sd(neg
,xmm2
);
1768 xmm3
= _mm_mul_sd(qrtr
,sk2_inv
);
1769 xmm3
= _mm_mul_sd(xmm3
,uij3
);
1770 xmm2
= _mm_sub_sd(xmm2
,xmm3
);
1771 xmm3
= _mm_mul_sd(uij
,rinv
);
1772 xmm4
= _mm_mul_sd(uij3
,r
);
1773 xmm3
= _mm_add_sd(xmm3
,xmm4
);
1774 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1775 t2
= _mm_add_sd(xmm2
,xmm3
);
1777 xmm2
= _mm_mul_sd(sk2_inv
,rinv
);
1778 xmm2
= _mm_add_sd(one
,xmm2
);
1779 xmm2
= _mm_mul_sd(eigth
,xmm2
);
1780 xmm2
= _mm_mul_sd(xmm2
,xmm9
);
1781 xmm3
= _mm_mul_sd(log_term
, rinv
);
1782 xmm3
= _mm_mul_sd(xmm3
,rinv
);
1783 xmm3
= _mm_mul_sd(qrtr
,xmm3
);
1784 t3
= _mm_add_sd(xmm2
,xmm3
);
1786 /* chain rule terms */
1787 xmm2
= _mm_mul_sd(dlij
,t1
);
1788 xmm2
= _mm_add_sd(xmm2
,t2
);
1789 xmm2
= _mm_add_sd(xmm2
,t3
);
1790 chrule_ai
= _mm_mul_sd(xmm2
,rinv
);
1792 _mm_store_sd(fr
->dadx
+n
, chrule
);
1794 _mm_store_sd(fr
->dadx
+n
, chrule_ai
);
        /* Do end processing ... */
        tmp_ai = _mm_unpacklo_pd(tmp_ai,sum_ai);
        sum_ai = _mm_add_pd(sum_ai,tmp_ai);
        sum_ai = _mm_shuffle_pd(sum_ai,sum_ai,_MM_SHUFFLE2(1,1));

        /* Load, add and store atom ai polarisation energy */
        xmm2   = _mm_load_sd(born->gpol_hct_work+ai);
        sum_ai = _mm_add_sd(sum_ai,xmm2);
        _mm_store_sd(born->gpol_hct_work+ai,sum_ai);
    }

    /* Parallel summations */
    if(PARTDECOMP(cr))
    {
        gmx_sum(natoms, born->gpol_hct_work, cr);
    }
    else if(DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, born->gpol_hct_work);
    }

    /* Compute the radii */
    for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
    {
        if(born->use[i] != 0)
        {
            rr      = top->atomtypes.gb_radius[md->typeA[i]];
            rr_inv2 = 1.0/rr;
            rr      = rr-doff;
            rr_inv  = 1.0/rr;
            sum     = rr * born->gpol_hct_work[i];
            sum2    = sum  * sum;
            sum3    = sum2 * sum;

            tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
            born->bRad[i] = rr_inv - tsum*rr_inv2;
            born->bRad[i] = 1.0 / born->bRad[i];

            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

            tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
            born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
        }
    }

    /* Extra (local) communication required for DD */
    if(DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
        dd_atom_spread_real(cr->dd, born->drobc);
    }

    return 0;
}
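/*
 * Readability sketch only (not called by the kernels; assumes <math.h>): the OBC end
 * processing above converts the accumulated integral Psi = (rho - doffset)*gpol_hct_work[i]
 * into a Born radius with the tanh rescaling. rho is the unshifted gb_radius.
 */
static double
example_obc_radius(double rho, double doffset, double psi,
                   double alpha, double beta, double gamma)
{
    double tsum = tanh(alpha*psi - beta*psi*psi + gamma*psi*psi*psi);

    /* 1/b_i = 1/(rho - doffset) - tanh(alpha*Psi - beta*Psi^2 + gamma*Psi^3)/rho */
    return 1.0/(1.0/(rho - doffset) - tsum/rho);
}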
calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda, double *xd, double *f,
                              double *fshift, double *shift_vec, int gb_algorithm, gmx_genborn_t *born)
{
    int i,k,n,ai,aj,ai3,aj1,aj2,aj13,aj23,aj4,nj0,nj1,offset;
    int shift;
    double rbi,shX,shY,shZ;

    __m128d ix,iy,iz,jx,jy,jz,fix,fiy,fiz,sX,sY,sZ;
    __m128d dx,dy,dz,t1,t2,t3,dva,dvaj,dax,dax_ai,fgb,fgb_ai;
    __m128d xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;

    t1 = t2 = t3 = _mm_setzero_pd();
    n  = 0;

    /* Loop to get the proper form for the Born radius term */
    if(gb_algorithm==egbSTILL) {
        for(i=0;i<natoms;i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if(gb_algorithm==egbHCT) {
        for(i=0;i<natoms;i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if(gb_algorithm==egbOBC) {
        for(i=0;i<natoms;i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }

    for(i=0;i<nl->nri;i++)
    {
        ai  = nl->iinr[i];
        ai3 = ai * 3;

        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        /* Load shifts for this list */
        shift = 3*nl->shift[i];
        shX   = shift_vec[shift+0];
        shY   = shift_vec[shift+1];
        shZ   = shift_vec[shift+2];

        /* Splat the shifts */
        sX = _mm_load1_pd(&shX);
        sY = _mm_load1_pd(&shY);
        sZ = _mm_load1_pd(&shZ);

        offset = (nj1-nj0)%2;

        /* Load particle ai coordinates and add shifts */
        ix = _mm_load1_pd(xd+ai3);
        iy = _mm_load1_pd(xd+ai3+1);
        iz = _mm_load1_pd(xd+ai3+2);

        ix = _mm_add_pd(sX,ix);
        iy = _mm_add_pd(sY,iy);
        iz = _mm_add_pd(sZ,iz);

        /* Load particle ai dvda */
        dva = _mm_load1_pd(rb+ai);

        fix = _mm_setzero_pd();
        fiy = _mm_setzero_pd();
        fiz = _mm_setzero_pd();

        for(k=nj0;k<nj1-offset;k+=2)
        {
            aj1 = nl->jjnr[k];
            aj2 = nl->jjnr[k+1];

            dvaj = _mm_set_pd(rb[aj2],rb[aj1]);

            aj13 = aj1 * 3;
            aj23 = aj2 * 3;

            /* Load particle aj1-2 coordinates */
            xmm1 = _mm_loadu_pd(xd+aj13);
            xmm2 = _mm_loadu_pd(xd+aj23);

            xmm5 = _mm_load_sd(xd+aj13+2);
            xmm6 = _mm_load_sd(xd+aj23+2);

            jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
            jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
            jz = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));

            /* load derivative of born radii w.r.t. coordinates */
            xmm7 = _mm_loadu_pd(dadx+n);
            n    = n + 2;
            xmm8 = _mm_loadu_pd(dadx+n);
            n    = n + 2;

            /* Shuffle to get the ai and aj components right */
            dax    = _mm_shuffle_pd(xmm7,xmm8,_MM_SHUFFLE2(2,0));
            dax_ai = _mm_shuffle_pd(xmm7,xmm8,_MM_SHUFFLE2(3,1));

            dx = _mm_sub_pd(ix,jx);
            dy = _mm_sub_pd(iy,jy);
            dz = _mm_sub_pd(iz,jz);

            fgb    = _mm_mul_pd(dva,dax);
            fgb_ai = _mm_mul_pd(dvaj,dax_ai);
            fgb    = _mm_add_pd(fgb,fgb_ai);

            /* partial forces */
            t1 = _mm_mul_pd(fgb,dx);
            t2 = _mm_mul_pd(fgb,dy);
            t3 = _mm_mul_pd(fgb,dz);

            /* update the i force */
            fix = _mm_add_pd(fix,t1);
            fiy = _mm_add_pd(fiy,t2);
            fiz = _mm_add_pd(fiz,t3);

            /* accumulate forces from memory */
            xmm1 = _mm_loadu_pd(f+aj13); /* fx1 fy1 */
            xmm2 = _mm_loadu_pd(f+aj23); /* fx2 fy2 */

            xmm5 = _mm_load1_pd(f+aj13+2); /* fz1 fz1 */
            xmm6 = _mm_load1_pd(f+aj23+2); /* fz2 fz2 */

            xmm7 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
            xmm5 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
            xmm6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */

            /* subtract partial forces */
            xmm5 = _mm_sub_pd(xmm5, t1);
            xmm6 = _mm_sub_pd(xmm6, t2);
            xmm7 = _mm_sub_pd(xmm7, t3);

            /* transpose back fx and fy */
            xmm1 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));
            xmm2 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1));

            /* store the force, first fx's and fy's */
            _mm_storeu_pd(f+aj13,xmm1);
            _mm_storeu_pd(f+aj23,xmm2);

            /* then the fz's */
            _mm_storel_pd(f+aj13+2,xmm7);
            _mm_storeh_pd(f+aj23+2,xmm7);
        }

        /* Deal with odd elements */
        if(offset!=0)
        {
            aj1  = nl->jjnr[k];
            aj13 = aj1 * 3;

            dvaj = _mm_load_sd(rb+aj1);

            jx = _mm_load_sd(xd+aj13);
            jy = _mm_load_sd(xd+aj13+1);
            jz = _mm_load_sd(xd+aj13+2);

            dax    = _mm_load_sd(dadx+n);
            n      = n + 1;
            dax_ai = _mm_load_sd(dadx+n);
            n      = n + 1;

            dx = _mm_sub_sd(ix,jx);
            dy = _mm_sub_sd(iy,jy);
            dz = _mm_sub_sd(iz,jz);

            fgb    = _mm_mul_sd(dva,dax);
            fgb_ai = _mm_mul_sd(dvaj,dax_ai);
            fgb    = _mm_add_pd(fgb,fgb_ai);

            t1 = _mm_mul_sd(fgb,dx);
            t2 = _mm_mul_sd(fgb,dy);
            t3 = _mm_mul_sd(fgb,dz);

            fix = _mm_add_sd(fix,t1);
            fiy = _mm_add_sd(fiy,t2);
            fiz = _mm_add_sd(fiz,t3);

            /* accumulate forces from memory */
            xmm5 = _mm_load_sd(f+aj13);
            xmm6 = _mm_load_sd(f+aj13+1);
            xmm7 = _mm_load_sd(f+aj13+2);

            /* subtract partial forces */
            xmm5 = _mm_sub_sd(xmm5, t1);
            xmm6 = _mm_sub_sd(xmm6, t2);
            xmm7 = _mm_sub_sd(xmm7, t3);

            /* store the force */
            _mm_store_sd(f+aj13,xmm5);
            _mm_store_sd(f+aj13+1,xmm6);
            _mm_store_sd(f+aj13+2,xmm7);
        }

        /* fix/fiy/fiz now contain two partial terms, that all should be
         * added to the i particle forces
         */
        t1 = _mm_unpacklo_pd(t1,fix);
        t2 = _mm_unpacklo_pd(t2,fiy);
        t3 = _mm_unpacklo_pd(t3,fiz);

        fix = _mm_add_pd(fix,t1);
        fiy = _mm_add_pd(fiy,t2);
        fiz = _mm_add_pd(fiz,t3);

        fix = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
        fiy = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
        fiz = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));

        /* load, add and store i forces */
        xmm1 = _mm_load_sd(f+ai3);
        xmm2 = _mm_load_sd(f+ai3+1);
        xmm3 = _mm_load_sd(f+ai3+2);

        fix = _mm_add_sd(fix,xmm1);
        fiy = _mm_add_sd(fiy,xmm2);
        fiz = _mm_add_sd(fiz,xmm3);

        _mm_store_sd(f+ai3,fix);
        _mm_store_sd(f+ai3+1,fiy);
        _mm_store_sd(f+ai3+2,fiz);

        /* load, add and store i shift forces */
        xmm1 = _mm_load_sd(fshift+shift);
        xmm2 = _mm_load_sd(fshift+shift+1);
        xmm3 = _mm_load_sd(fshift+shift+2);

        fix = _mm_add_sd(fix,xmm1);
        fiy = _mm_add_sd(fiy,xmm2);
        fiz = _mm_add_sd(fiz,xmm3);

        _mm_store_sd(fshift+shift,fix);
        _mm_store_sd(fshift+shift+1,fiy);
        _mm_store_sd(fshift+shift+2,fiz);
    }
}
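/*
 * Readability sketch only (not called above): per pair, the chain-rule loop turns the
 * stored Born-radius derivatives into forces proportional to fgb*(x_i - x_j), where
 * fgb = rb_i*dadx(i->j) + rb_j*dadx(j->i); the contribution is added to atom i and
 * subtracted from atom j, matching the fix/fiy/fiz updates above.
 */
static void
example_chainrule_pair(const double xi[3], const double xj[3],
                       double rb_i, double rb_j,
                       double dadx_ij, double dadx_ji,
                       double fi[3], double fj[3])
{
    int    d;
    double fgb = rb_i*dadx_ij + rb_j*dadx_ji;

    for (d = 0; d < 3; d++)
    {
        double t = fgb*(xi[d] - xj[d]);
        fi[d] += t;
        fj[d] -= t;
    }
}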
/* keep compiler happy */
int genborn_sse_dummy;

#endif /* SSE2 intrinsics available */