2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_double
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_double
45 (t_nblist
* gmx_restrict nlist
,
46 rvec
* gmx_restrict xx
,
47 rvec
* gmx_restrict ff
,
48 t_forcerec
* gmx_restrict fr
,
49 t_mdatoms
* gmx_restrict mdatoms
,
50 nb_kernel_data_t
* gmx_restrict kernel_data
,
51 t_nrnb
* gmx_restrict nrnb
)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
59 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
61 int j_coord_offsetA
,j_coord_offsetB
;
62 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
64 real
*shiftvec
,*fshift
,*x
,*f
;
65 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
67 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
69 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
71 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
72 int vdwjidx0A
,vdwjidx0B
;
73 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
74 int vdwjidx1A
,vdwjidx1B
;
75 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
76 int vdwjidx2A
,vdwjidx2B
;
77 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
78 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
79 __m128d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
80 __m128d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
81 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
82 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
83 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
84 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
85 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
86 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
87 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
90 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
93 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
94 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
96 __m128i ifour
= _mm_set1_epi32(4);
97 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
99 __m128d dummy_mask
,cutoff_mask
;
100 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
101 __m128d one
= _mm_set1_pd(1.0);
102 __m128d two
= _mm_set1_pd(2.0);
108 jindex
= nlist
->jindex
;
110 shiftidx
= nlist
->shift
;
112 shiftvec
= fr
->shift_vec
[0];
113 fshift
= fr
->fshift
[0];
114 facel
= _mm_set1_pd(fr
->epsfac
);
115 charge
= mdatoms
->chargeA
;
116 krf
= _mm_set1_pd(fr
->ic
->k_rf
);
117 krf2
= _mm_set1_pd(fr
->ic
->k_rf
*2.0);
118 crf
= _mm_set1_pd(fr
->ic
->c_rf
);
119 nvdwtype
= fr
->ntype
;
121 vdwtype
= mdatoms
->typeA
;
123 vftab
= kernel_data
->table_vdw
->data
;
124 vftabscale
= _mm_set1_pd(kernel_data
->table_vdw
->scale
);
126 /* Setup water-specific parameters */
127 inr
= nlist
->iinr
[0];
128 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
129 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
130 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
131 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
133 jq0
= _mm_set1_pd(charge
[inr
+0]);
134 jq1
= _mm_set1_pd(charge
[inr
+1]);
135 jq2
= _mm_set1_pd(charge
[inr
+2]);
136 vdwjidx0A
= 2*vdwtype
[inr
+0];
137 qq00
= _mm_mul_pd(iq0
,jq0
);
138 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
139 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
140 qq01
= _mm_mul_pd(iq0
,jq1
);
141 qq02
= _mm_mul_pd(iq0
,jq2
);
142 qq10
= _mm_mul_pd(iq1
,jq0
);
143 qq11
= _mm_mul_pd(iq1
,jq1
);
144 qq12
= _mm_mul_pd(iq1
,jq2
);
145 qq20
= _mm_mul_pd(iq2
,jq0
);
146 qq21
= _mm_mul_pd(iq2
,jq1
);
147 qq22
= _mm_mul_pd(iq2
,jq2
);
149 /* Avoid stupid compiler warnings */
157 /* Start outer loop over neighborlists */
158 for(iidx
=0; iidx
<nri
; iidx
++)
160 /* Load shift vector for this list */
161 i_shift_offset
= DIM
*shiftidx
[iidx
];
163 /* Load limits for loop over neighbors */
164 j_index_start
= jindex
[iidx
];
165 j_index_end
= jindex
[iidx
+1];
167 /* Get outer coordinate index */
169 i_coord_offset
= DIM
*inr
;
171 /* Load i particle coords and add shift vector */
172 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
173 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
175 fix0
= _mm_setzero_pd();
176 fiy0
= _mm_setzero_pd();
177 fiz0
= _mm_setzero_pd();
178 fix1
= _mm_setzero_pd();
179 fiy1
= _mm_setzero_pd();
180 fiz1
= _mm_setzero_pd();
181 fix2
= _mm_setzero_pd();
182 fiy2
= _mm_setzero_pd();
183 fiz2
= _mm_setzero_pd();
185 /* Reset potential sums */
186 velecsum
= _mm_setzero_pd();
187 vvdwsum
= _mm_setzero_pd();
189 /* Start inner kernel loop */
190 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
193 /* Get j neighbor index, and coordinate index */
196 j_coord_offsetA
= DIM
*jnrA
;
197 j_coord_offsetB
= DIM
*jnrB
;
199 /* load j atom coordinates */
200 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
201 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
203 /* Calculate displacement vector */
204 dx00
= _mm_sub_pd(ix0
,jx0
);
205 dy00
= _mm_sub_pd(iy0
,jy0
);
206 dz00
= _mm_sub_pd(iz0
,jz0
);
207 dx01
= _mm_sub_pd(ix0
,jx1
);
208 dy01
= _mm_sub_pd(iy0
,jy1
);
209 dz01
= _mm_sub_pd(iz0
,jz1
);
210 dx02
= _mm_sub_pd(ix0
,jx2
);
211 dy02
= _mm_sub_pd(iy0
,jy2
);
212 dz02
= _mm_sub_pd(iz0
,jz2
);
213 dx10
= _mm_sub_pd(ix1
,jx0
);
214 dy10
= _mm_sub_pd(iy1
,jy0
);
215 dz10
= _mm_sub_pd(iz1
,jz0
);
216 dx11
= _mm_sub_pd(ix1
,jx1
);
217 dy11
= _mm_sub_pd(iy1
,jy1
);
218 dz11
= _mm_sub_pd(iz1
,jz1
);
219 dx12
= _mm_sub_pd(ix1
,jx2
);
220 dy12
= _mm_sub_pd(iy1
,jy2
);
221 dz12
= _mm_sub_pd(iz1
,jz2
);
222 dx20
= _mm_sub_pd(ix2
,jx0
);
223 dy20
= _mm_sub_pd(iy2
,jy0
);
224 dz20
= _mm_sub_pd(iz2
,jz0
);
225 dx21
= _mm_sub_pd(ix2
,jx1
);
226 dy21
= _mm_sub_pd(iy2
,jy1
);
227 dz21
= _mm_sub_pd(iz2
,jz1
);
228 dx22
= _mm_sub_pd(ix2
,jx2
);
229 dy22
= _mm_sub_pd(iy2
,jy2
);
230 dz22
= _mm_sub_pd(iz2
,jz2
);
232 /* Calculate squared distance and things based on it */
233 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
234 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
235 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
236 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
237 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
238 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
239 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
240 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
241 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
243 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
244 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
245 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
246 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
247 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
248 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
249 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
250 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
251 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
253 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
254 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
255 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
256 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
257 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
258 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
259 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
260 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
261 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
263 fjx0
= _mm_setzero_pd();
264 fjy0
= _mm_setzero_pd();
265 fjz0
= _mm_setzero_pd();
266 fjx1
= _mm_setzero_pd();
267 fjy1
= _mm_setzero_pd();
268 fjz1
= _mm_setzero_pd();
269 fjx2
= _mm_setzero_pd();
270 fjy2
= _mm_setzero_pd();
271 fjz2
= _mm_setzero_pd();
273 /**************************
274 * CALCULATE INTERACTIONS *
275 **************************/
277 r00
= _mm_mul_pd(rsq00
,rinv00
);
279 /* Calculate table index by multiplying r with table scale and truncate to integer */
280 rt
= _mm_mul_pd(r00
,vftabscale
);
281 vfitab
= _mm_cvttpd_epi32(rt
);
282 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
283 vfitab
= _mm_slli_epi32(vfitab
,3);
285 /* REACTION-FIELD ELECTROSTATICS */
286 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_add_pd(rinv00
,_mm_mul_pd(krf
,rsq00
)),crf
));
287 felec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_mul_pd(rinv00
,rinvsq00
),krf2
));
289 /* CUBIC SPLINE TABLE DISPERSION */
290 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
291 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
292 GMX_MM_TRANSPOSE2_PD(Y
,F
);
293 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
294 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
295 GMX_MM_TRANSPOSE2_PD(G
,H
);
296 Heps
= _mm_mul_pd(vfeps
,H
);
297 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
298 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
299 vvdw6
= _mm_mul_pd(c6_00
,VV
);
300 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
301 fvdw6
= _mm_mul_pd(c6_00
,FF
);
303 /* CUBIC SPLINE TABLE REPULSION */
304 vfitab
= _mm_add_epi32(vfitab
,ifour
);
305 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
306 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
307 GMX_MM_TRANSPOSE2_PD(Y
,F
);
308 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
309 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
310 GMX_MM_TRANSPOSE2_PD(G
,H
);
311 Heps
= _mm_mul_pd(vfeps
,H
);
312 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
313 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
314 vvdw12
= _mm_mul_pd(c12_00
,VV
);
315 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
316 fvdw12
= _mm_mul_pd(c12_00
,FF
);
317 vvdw
= _mm_add_pd(vvdw12
,vvdw6
);
318 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
320 /* Update potential sum for this i atom from the interaction with this j atom. */
321 velecsum
= _mm_add_pd(velecsum
,velec
);
322 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
324 fscal
= _mm_add_pd(felec
,fvdw
);
326 /* Calculate temporary vectorial force */
327 tx
= _mm_mul_pd(fscal
,dx00
);
328 ty
= _mm_mul_pd(fscal
,dy00
);
329 tz
= _mm_mul_pd(fscal
,dz00
);
331 /* Update vectorial force */
332 fix0
= _mm_add_pd(fix0
,tx
);
333 fiy0
= _mm_add_pd(fiy0
,ty
);
334 fiz0
= _mm_add_pd(fiz0
,tz
);
336 fjx0
= _mm_add_pd(fjx0
,tx
);
337 fjy0
= _mm_add_pd(fjy0
,ty
);
338 fjz0
= _mm_add_pd(fjz0
,tz
);
340 /**************************
341 * CALCULATE INTERACTIONS *
342 **************************/
344 /* REACTION-FIELD ELECTROSTATICS */
345 velec
= _mm_mul_pd(qq01
,_mm_sub_pd(_mm_add_pd(rinv01
,_mm_mul_pd(krf
,rsq01
)),crf
));
346 felec
= _mm_mul_pd(qq01
,_mm_sub_pd(_mm_mul_pd(rinv01
,rinvsq01
),krf2
));
348 /* Update potential sum for this i atom from the interaction with this j atom. */
349 velecsum
= _mm_add_pd(velecsum
,velec
);
353 /* Calculate temporary vectorial force */
354 tx
= _mm_mul_pd(fscal
,dx01
);
355 ty
= _mm_mul_pd(fscal
,dy01
);
356 tz
= _mm_mul_pd(fscal
,dz01
);
358 /* Update vectorial force */
359 fix0
= _mm_add_pd(fix0
,tx
);
360 fiy0
= _mm_add_pd(fiy0
,ty
);
361 fiz0
= _mm_add_pd(fiz0
,tz
);
363 fjx1
= _mm_add_pd(fjx1
,tx
);
364 fjy1
= _mm_add_pd(fjy1
,ty
);
365 fjz1
= _mm_add_pd(fjz1
,tz
);
367 /**************************
368 * CALCULATE INTERACTIONS *
369 **************************/
371 /* REACTION-FIELD ELECTROSTATICS */
372 velec
= _mm_mul_pd(qq02
,_mm_sub_pd(_mm_add_pd(rinv02
,_mm_mul_pd(krf
,rsq02
)),crf
));
373 felec
= _mm_mul_pd(qq02
,_mm_sub_pd(_mm_mul_pd(rinv02
,rinvsq02
),krf2
));
375 /* Update potential sum for this i atom from the interaction with this j atom. */
376 velecsum
= _mm_add_pd(velecsum
,velec
);
380 /* Calculate temporary vectorial force */
381 tx
= _mm_mul_pd(fscal
,dx02
);
382 ty
= _mm_mul_pd(fscal
,dy02
);
383 tz
= _mm_mul_pd(fscal
,dz02
);
385 /* Update vectorial force */
386 fix0
= _mm_add_pd(fix0
,tx
);
387 fiy0
= _mm_add_pd(fiy0
,ty
);
388 fiz0
= _mm_add_pd(fiz0
,tz
);
390 fjx2
= _mm_add_pd(fjx2
,tx
);
391 fjy2
= _mm_add_pd(fjy2
,ty
);
392 fjz2
= _mm_add_pd(fjz2
,tz
);
394 /**************************
395 * CALCULATE INTERACTIONS *
396 **************************/
398 /* REACTION-FIELD ELECTROSTATICS */
399 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_add_pd(rinv10
,_mm_mul_pd(krf
,rsq10
)),crf
));
400 felec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_mul_pd(rinv10
,rinvsq10
),krf2
));
402 /* Update potential sum for this i atom from the interaction with this j atom. */
403 velecsum
= _mm_add_pd(velecsum
,velec
);
407 /* Calculate temporary vectorial force */
408 tx
= _mm_mul_pd(fscal
,dx10
);
409 ty
= _mm_mul_pd(fscal
,dy10
);
410 tz
= _mm_mul_pd(fscal
,dz10
);
412 /* Update vectorial force */
413 fix1
= _mm_add_pd(fix1
,tx
);
414 fiy1
= _mm_add_pd(fiy1
,ty
);
415 fiz1
= _mm_add_pd(fiz1
,tz
);
417 fjx0
= _mm_add_pd(fjx0
,tx
);
418 fjy0
= _mm_add_pd(fjy0
,ty
);
419 fjz0
= _mm_add_pd(fjz0
,tz
);
421 /**************************
422 * CALCULATE INTERACTIONS *
423 **************************/
425 /* REACTION-FIELD ELECTROSTATICS */
426 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_add_pd(rinv11
,_mm_mul_pd(krf
,rsq11
)),crf
));
427 felec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_mul_pd(rinv11
,rinvsq11
),krf2
));
429 /* Update potential sum for this i atom from the interaction with this j atom. */
430 velecsum
= _mm_add_pd(velecsum
,velec
);
434 /* Calculate temporary vectorial force */
435 tx
= _mm_mul_pd(fscal
,dx11
);
436 ty
= _mm_mul_pd(fscal
,dy11
);
437 tz
= _mm_mul_pd(fscal
,dz11
);
439 /* Update vectorial force */
440 fix1
= _mm_add_pd(fix1
,tx
);
441 fiy1
= _mm_add_pd(fiy1
,ty
);
442 fiz1
= _mm_add_pd(fiz1
,tz
);
444 fjx1
= _mm_add_pd(fjx1
,tx
);
445 fjy1
= _mm_add_pd(fjy1
,ty
);
446 fjz1
= _mm_add_pd(fjz1
,tz
);
448 /**************************
449 * CALCULATE INTERACTIONS *
450 **************************/
452 /* REACTION-FIELD ELECTROSTATICS */
453 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_add_pd(rinv12
,_mm_mul_pd(krf
,rsq12
)),crf
));
454 felec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_mul_pd(rinv12
,rinvsq12
),krf2
));
456 /* Update potential sum for this i atom from the interaction with this j atom. */
457 velecsum
= _mm_add_pd(velecsum
,velec
);
461 /* Calculate temporary vectorial force */
462 tx
= _mm_mul_pd(fscal
,dx12
);
463 ty
= _mm_mul_pd(fscal
,dy12
);
464 tz
= _mm_mul_pd(fscal
,dz12
);
466 /* Update vectorial force */
467 fix1
= _mm_add_pd(fix1
,tx
);
468 fiy1
= _mm_add_pd(fiy1
,ty
);
469 fiz1
= _mm_add_pd(fiz1
,tz
);
471 fjx2
= _mm_add_pd(fjx2
,tx
);
472 fjy2
= _mm_add_pd(fjy2
,ty
);
473 fjz2
= _mm_add_pd(fjz2
,tz
);
475 /**************************
476 * CALCULATE INTERACTIONS *
477 **************************/
479 /* REACTION-FIELD ELECTROSTATICS */
480 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_add_pd(rinv20
,_mm_mul_pd(krf
,rsq20
)),crf
));
481 felec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_mul_pd(rinv20
,rinvsq20
),krf2
));
483 /* Update potential sum for this i atom from the interaction with this j atom. */
484 velecsum
= _mm_add_pd(velecsum
,velec
);
488 /* Calculate temporary vectorial force */
489 tx
= _mm_mul_pd(fscal
,dx20
);
490 ty
= _mm_mul_pd(fscal
,dy20
);
491 tz
= _mm_mul_pd(fscal
,dz20
);
493 /* Update vectorial force */
494 fix2
= _mm_add_pd(fix2
,tx
);
495 fiy2
= _mm_add_pd(fiy2
,ty
);
496 fiz2
= _mm_add_pd(fiz2
,tz
);
498 fjx0
= _mm_add_pd(fjx0
,tx
);
499 fjy0
= _mm_add_pd(fjy0
,ty
);
500 fjz0
= _mm_add_pd(fjz0
,tz
);
502 /**************************
503 * CALCULATE INTERACTIONS *
504 **************************/
506 /* REACTION-FIELD ELECTROSTATICS */
507 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_add_pd(rinv21
,_mm_mul_pd(krf
,rsq21
)),crf
));
508 felec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_mul_pd(rinv21
,rinvsq21
),krf2
));
510 /* Update potential sum for this i atom from the interaction with this j atom. */
511 velecsum
= _mm_add_pd(velecsum
,velec
);
515 /* Calculate temporary vectorial force */
516 tx
= _mm_mul_pd(fscal
,dx21
);
517 ty
= _mm_mul_pd(fscal
,dy21
);
518 tz
= _mm_mul_pd(fscal
,dz21
);
520 /* Update vectorial force */
521 fix2
= _mm_add_pd(fix2
,tx
);
522 fiy2
= _mm_add_pd(fiy2
,ty
);
523 fiz2
= _mm_add_pd(fiz2
,tz
);
525 fjx1
= _mm_add_pd(fjx1
,tx
);
526 fjy1
= _mm_add_pd(fjy1
,ty
);
527 fjz1
= _mm_add_pd(fjz1
,tz
);
529 /**************************
530 * CALCULATE INTERACTIONS *
531 **************************/
533 /* REACTION-FIELD ELECTROSTATICS */
534 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_add_pd(rinv22
,_mm_mul_pd(krf
,rsq22
)),crf
));
535 felec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_mul_pd(rinv22
,rinvsq22
),krf2
));
537 /* Update potential sum for this i atom from the interaction with this j atom. */
538 velecsum
= _mm_add_pd(velecsum
,velec
);
542 /* Calculate temporary vectorial force */
543 tx
= _mm_mul_pd(fscal
,dx22
);
544 ty
= _mm_mul_pd(fscal
,dy22
);
545 tz
= _mm_mul_pd(fscal
,dz22
);
547 /* Update vectorial force */
548 fix2
= _mm_add_pd(fix2
,tx
);
549 fiy2
= _mm_add_pd(fiy2
,ty
);
550 fiz2
= _mm_add_pd(fiz2
,tz
);
552 fjx2
= _mm_add_pd(fjx2
,tx
);
553 fjy2
= _mm_add_pd(fjy2
,ty
);
554 fjz2
= _mm_add_pd(fjz2
,tz
);
556 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
558 /* Inner loop uses 323 flops */
565 j_coord_offsetA
= DIM
*jnrA
;
567 /* load j atom coordinates */
568 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
569 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
571 /* Calculate displacement vector */
572 dx00
= _mm_sub_pd(ix0
,jx0
);
573 dy00
= _mm_sub_pd(iy0
,jy0
);
574 dz00
= _mm_sub_pd(iz0
,jz0
);
575 dx01
= _mm_sub_pd(ix0
,jx1
);
576 dy01
= _mm_sub_pd(iy0
,jy1
);
577 dz01
= _mm_sub_pd(iz0
,jz1
);
578 dx02
= _mm_sub_pd(ix0
,jx2
);
579 dy02
= _mm_sub_pd(iy0
,jy2
);
580 dz02
= _mm_sub_pd(iz0
,jz2
);
581 dx10
= _mm_sub_pd(ix1
,jx0
);
582 dy10
= _mm_sub_pd(iy1
,jy0
);
583 dz10
= _mm_sub_pd(iz1
,jz0
);
584 dx11
= _mm_sub_pd(ix1
,jx1
);
585 dy11
= _mm_sub_pd(iy1
,jy1
);
586 dz11
= _mm_sub_pd(iz1
,jz1
);
587 dx12
= _mm_sub_pd(ix1
,jx2
);
588 dy12
= _mm_sub_pd(iy1
,jy2
);
589 dz12
= _mm_sub_pd(iz1
,jz2
);
590 dx20
= _mm_sub_pd(ix2
,jx0
);
591 dy20
= _mm_sub_pd(iy2
,jy0
);
592 dz20
= _mm_sub_pd(iz2
,jz0
);
593 dx21
= _mm_sub_pd(ix2
,jx1
);
594 dy21
= _mm_sub_pd(iy2
,jy1
);
595 dz21
= _mm_sub_pd(iz2
,jz1
);
596 dx22
= _mm_sub_pd(ix2
,jx2
);
597 dy22
= _mm_sub_pd(iy2
,jy2
);
598 dz22
= _mm_sub_pd(iz2
,jz2
);
600 /* Calculate squared distance and things based on it */
601 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
602 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
603 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
604 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
605 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
606 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
607 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
608 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
609 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
611 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
612 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
613 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
614 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
615 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
616 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
617 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
618 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
619 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
621 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
622 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
623 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
624 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
625 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
626 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
627 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
628 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
629 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
631 fjx0
= _mm_setzero_pd();
632 fjy0
= _mm_setzero_pd();
633 fjz0
= _mm_setzero_pd();
634 fjx1
= _mm_setzero_pd();
635 fjy1
= _mm_setzero_pd();
636 fjz1
= _mm_setzero_pd();
637 fjx2
= _mm_setzero_pd();
638 fjy2
= _mm_setzero_pd();
639 fjz2
= _mm_setzero_pd();
641 /**************************
642 * CALCULATE INTERACTIONS *
643 **************************/
645 r00
= _mm_mul_pd(rsq00
,rinv00
);
647 /* Calculate table index by multiplying r with table scale and truncate to integer */
648 rt
= _mm_mul_pd(r00
,vftabscale
);
649 vfitab
= _mm_cvttpd_epi32(rt
);
650 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
651 vfitab
= _mm_slli_epi32(vfitab
,3);
653 /* REACTION-FIELD ELECTROSTATICS */
654 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_add_pd(rinv00
,_mm_mul_pd(krf
,rsq00
)),crf
));
655 felec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_mul_pd(rinv00
,rinvsq00
),krf2
));
657 /* CUBIC SPLINE TABLE DISPERSION */
658 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
659 F
= _mm_setzero_pd();
660 GMX_MM_TRANSPOSE2_PD(Y
,F
);
661 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
662 H
= _mm_setzero_pd();
663 GMX_MM_TRANSPOSE2_PD(G
,H
);
664 Heps
= _mm_mul_pd(vfeps
,H
);
665 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
666 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
667 vvdw6
= _mm_mul_pd(c6_00
,VV
);
668 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
669 fvdw6
= _mm_mul_pd(c6_00
,FF
);
671 /* CUBIC SPLINE TABLE REPULSION */
672 vfitab
= _mm_add_epi32(vfitab
,ifour
);
673 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
674 F
= _mm_setzero_pd();
675 GMX_MM_TRANSPOSE2_PD(Y
,F
);
676 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
677 H
= _mm_setzero_pd();
678 GMX_MM_TRANSPOSE2_PD(G
,H
);
679 Heps
= _mm_mul_pd(vfeps
,H
);
680 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
681 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
682 vvdw12
= _mm_mul_pd(c12_00
,VV
);
683 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
684 fvdw12
= _mm_mul_pd(c12_00
,FF
);
685 vvdw
= _mm_add_pd(vvdw12
,vvdw6
);
686 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
688 /* Update potential sum for this i atom from the interaction with this j atom. */
689 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
690 velecsum
= _mm_add_pd(velecsum
,velec
);
691 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
692 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
694 fscal
= _mm_add_pd(felec
,fvdw
);
696 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
698 /* Calculate temporary vectorial force */
699 tx
= _mm_mul_pd(fscal
,dx00
);
700 ty
= _mm_mul_pd(fscal
,dy00
);
701 tz
= _mm_mul_pd(fscal
,dz00
);
703 /* Update vectorial force */
704 fix0
= _mm_add_pd(fix0
,tx
);
705 fiy0
= _mm_add_pd(fiy0
,ty
);
706 fiz0
= _mm_add_pd(fiz0
,tz
);
708 fjx0
= _mm_add_pd(fjx0
,tx
);
709 fjy0
= _mm_add_pd(fjy0
,ty
);
710 fjz0
= _mm_add_pd(fjz0
,tz
);
712 /**************************
713 * CALCULATE INTERACTIONS *
714 **************************/
716 /* REACTION-FIELD ELECTROSTATICS */
717 velec
= _mm_mul_pd(qq01
,_mm_sub_pd(_mm_add_pd(rinv01
,_mm_mul_pd(krf
,rsq01
)),crf
));
718 felec
= _mm_mul_pd(qq01
,_mm_sub_pd(_mm_mul_pd(rinv01
,rinvsq01
),krf2
));
720 /* Update potential sum for this i atom from the interaction with this j atom. */
721 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
722 velecsum
= _mm_add_pd(velecsum
,velec
);
726 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
728 /* Calculate temporary vectorial force */
729 tx
= _mm_mul_pd(fscal
,dx01
);
730 ty
= _mm_mul_pd(fscal
,dy01
);
731 tz
= _mm_mul_pd(fscal
,dz01
);
733 /* Update vectorial force */
734 fix0
= _mm_add_pd(fix0
,tx
);
735 fiy0
= _mm_add_pd(fiy0
,ty
);
736 fiz0
= _mm_add_pd(fiz0
,tz
);
738 fjx1
= _mm_add_pd(fjx1
,tx
);
739 fjy1
= _mm_add_pd(fjy1
,ty
);
740 fjz1
= _mm_add_pd(fjz1
,tz
);
742 /**************************
743 * CALCULATE INTERACTIONS *
744 **************************/
746 /* REACTION-FIELD ELECTROSTATICS */
747 velec
= _mm_mul_pd(qq02
,_mm_sub_pd(_mm_add_pd(rinv02
,_mm_mul_pd(krf
,rsq02
)),crf
));
748 felec
= _mm_mul_pd(qq02
,_mm_sub_pd(_mm_mul_pd(rinv02
,rinvsq02
),krf2
));
750 /* Update potential sum for this i atom from the interaction with this j atom. */
751 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
752 velecsum
= _mm_add_pd(velecsum
,velec
);
756 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
758 /* Calculate temporary vectorial force */
759 tx
= _mm_mul_pd(fscal
,dx02
);
760 ty
= _mm_mul_pd(fscal
,dy02
);
761 tz
= _mm_mul_pd(fscal
,dz02
);
763 /* Update vectorial force */
764 fix0
= _mm_add_pd(fix0
,tx
);
765 fiy0
= _mm_add_pd(fiy0
,ty
);
766 fiz0
= _mm_add_pd(fiz0
,tz
);
768 fjx2
= _mm_add_pd(fjx2
,tx
);
769 fjy2
= _mm_add_pd(fjy2
,ty
);
770 fjz2
= _mm_add_pd(fjz2
,tz
);
772 /**************************
773 * CALCULATE INTERACTIONS *
774 **************************/
776 /* REACTION-FIELD ELECTROSTATICS */
777 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_add_pd(rinv10
,_mm_mul_pd(krf
,rsq10
)),crf
));
778 felec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_mul_pd(rinv10
,rinvsq10
),krf2
));
780 /* Update potential sum for this i atom from the interaction with this j atom. */
781 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
782 velecsum
= _mm_add_pd(velecsum
,velec
);
786 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
788 /* Calculate temporary vectorial force */
789 tx
= _mm_mul_pd(fscal
,dx10
);
790 ty
= _mm_mul_pd(fscal
,dy10
);
791 tz
= _mm_mul_pd(fscal
,dz10
);
793 /* Update vectorial force */
794 fix1
= _mm_add_pd(fix1
,tx
);
795 fiy1
= _mm_add_pd(fiy1
,ty
);
796 fiz1
= _mm_add_pd(fiz1
,tz
);
798 fjx0
= _mm_add_pd(fjx0
,tx
);
799 fjy0
= _mm_add_pd(fjy0
,ty
);
800 fjz0
= _mm_add_pd(fjz0
,tz
);
802 /**************************
803 * CALCULATE INTERACTIONS *
804 **************************/
806 /* REACTION-FIELD ELECTROSTATICS */
807 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_add_pd(rinv11
,_mm_mul_pd(krf
,rsq11
)),crf
));
808 felec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_mul_pd(rinv11
,rinvsq11
),krf2
));
810 /* Update potential sum for this i atom from the interaction with this j atom. */
811 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
812 velecsum
= _mm_add_pd(velecsum
,velec
);
816 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
818 /* Calculate temporary vectorial force */
819 tx
= _mm_mul_pd(fscal
,dx11
);
820 ty
= _mm_mul_pd(fscal
,dy11
);
821 tz
= _mm_mul_pd(fscal
,dz11
);
823 /* Update vectorial force */
824 fix1
= _mm_add_pd(fix1
,tx
);
825 fiy1
= _mm_add_pd(fiy1
,ty
);
826 fiz1
= _mm_add_pd(fiz1
,tz
);
828 fjx1
= _mm_add_pd(fjx1
,tx
);
829 fjy1
= _mm_add_pd(fjy1
,ty
);
830 fjz1
= _mm_add_pd(fjz1
,tz
);
832 /**************************
833 * CALCULATE INTERACTIONS *
834 **************************/
836 /* REACTION-FIELD ELECTROSTATICS */
837 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_add_pd(rinv12
,_mm_mul_pd(krf
,rsq12
)),crf
));
838 felec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_mul_pd(rinv12
,rinvsq12
),krf2
));
840 /* Update potential sum for this i atom from the interaction with this j atom. */
841 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
842 velecsum
= _mm_add_pd(velecsum
,velec
);
846 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
848 /* Calculate temporary vectorial force */
849 tx
= _mm_mul_pd(fscal
,dx12
);
850 ty
= _mm_mul_pd(fscal
,dy12
);
851 tz
= _mm_mul_pd(fscal
,dz12
);
853 /* Update vectorial force */
854 fix1
= _mm_add_pd(fix1
,tx
);
855 fiy1
= _mm_add_pd(fiy1
,ty
);
856 fiz1
= _mm_add_pd(fiz1
,tz
);
858 fjx2
= _mm_add_pd(fjx2
,tx
);
859 fjy2
= _mm_add_pd(fjy2
,ty
);
860 fjz2
= _mm_add_pd(fjz2
,tz
);
862 /**************************
863 * CALCULATE INTERACTIONS *
864 **************************/
866 /* REACTION-FIELD ELECTROSTATICS */
867 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_add_pd(rinv20
,_mm_mul_pd(krf
,rsq20
)),crf
));
868 felec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_mul_pd(rinv20
,rinvsq20
),krf2
));
870 /* Update potential sum for this i atom from the interaction with this j atom. */
871 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
872 velecsum
= _mm_add_pd(velecsum
,velec
);
876 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
878 /* Calculate temporary vectorial force */
879 tx
= _mm_mul_pd(fscal
,dx20
);
880 ty
= _mm_mul_pd(fscal
,dy20
);
881 tz
= _mm_mul_pd(fscal
,dz20
);
883 /* Update vectorial force */
884 fix2
= _mm_add_pd(fix2
,tx
);
885 fiy2
= _mm_add_pd(fiy2
,ty
);
886 fiz2
= _mm_add_pd(fiz2
,tz
);
888 fjx0
= _mm_add_pd(fjx0
,tx
);
889 fjy0
= _mm_add_pd(fjy0
,ty
);
890 fjz0
= _mm_add_pd(fjz0
,tz
);
892 /**************************
893 * CALCULATE INTERACTIONS *
894 **************************/
896 /* REACTION-FIELD ELECTROSTATICS */
897 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_add_pd(rinv21
,_mm_mul_pd(krf
,rsq21
)),crf
));
898 felec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_mul_pd(rinv21
,rinvsq21
),krf2
));
900 /* Update potential sum for this i atom from the interaction with this j atom. */
901 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
902 velecsum
= _mm_add_pd(velecsum
,velec
);
906 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
908 /* Calculate temporary vectorial force */
909 tx
= _mm_mul_pd(fscal
,dx21
);
910 ty
= _mm_mul_pd(fscal
,dy21
);
911 tz
= _mm_mul_pd(fscal
,dz21
);
913 /* Update vectorial force */
914 fix2
= _mm_add_pd(fix2
,tx
);
915 fiy2
= _mm_add_pd(fiy2
,ty
);
916 fiz2
= _mm_add_pd(fiz2
,tz
);
918 fjx1
= _mm_add_pd(fjx1
,tx
);
919 fjy1
= _mm_add_pd(fjy1
,ty
);
920 fjz1
= _mm_add_pd(fjz1
,tz
);
922 /**************************
923 * CALCULATE INTERACTIONS *
924 **************************/
926 /* REACTION-FIELD ELECTROSTATICS */
927 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_add_pd(rinv22
,_mm_mul_pd(krf
,rsq22
)),crf
));
928 felec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_mul_pd(rinv22
,rinvsq22
),krf2
));
930 /* Update potential sum for this i atom from the interaction with this j atom. */
931 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
932 velecsum
= _mm_add_pd(velecsum
,velec
);
936 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
938 /* Calculate temporary vectorial force */
939 tx
= _mm_mul_pd(fscal
,dx22
);
940 ty
= _mm_mul_pd(fscal
,dy22
);
941 tz
= _mm_mul_pd(fscal
,dz22
);
943 /* Update vectorial force */
944 fix2
= _mm_add_pd(fix2
,tx
);
945 fiy2
= _mm_add_pd(fiy2
,ty
);
946 fiz2
= _mm_add_pd(fiz2
,tz
);
948 fjx2
= _mm_add_pd(fjx2
,tx
);
949 fjy2
= _mm_add_pd(fjy2
,ty
);
950 fjz2
= _mm_add_pd(fjz2
,tz
);
952 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
954 /* Inner loop uses 323 flops */
957 /* End of innermost loop */
959 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
960 f
+i_coord_offset
,fshift
+i_shift_offset
);
963 /* Update potential energies */
964 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
965 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
967 /* Increment number of inner iterations */
968 inneriter
+= j_index_end
- j_index_start
;
970 /* Outer loop uses 20 flops */
973 /* Increment number of outer iterations */
976 /* Update outer/inner flops */
978 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_VF
,outeriter
*20 + inneriter
*323);
981 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_double
982 * Electrostatics interaction: ReactionField
983 * VdW interaction: CubicSplineTable
984 * Geometry: Water3-Water3
985 * Calculate force/pot: Force
988 nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_double
989 (t_nblist
* gmx_restrict nlist
,
990 rvec
* gmx_restrict xx
,
991 rvec
* gmx_restrict ff
,
992 t_forcerec
* gmx_restrict fr
,
993 t_mdatoms
* gmx_restrict mdatoms
,
994 nb_kernel_data_t
* gmx_restrict kernel_data
,
995 t_nrnb
* gmx_restrict nrnb
)
997 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
998 * just 0 for non-waters.
999 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1000 * jnr indices corresponding to data put in the four positions in the SIMD register.
1002 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1003 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1005 int j_coord_offsetA
,j_coord_offsetB
;
1006 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1007 real rcutoff_scalar
;
1008 real
*shiftvec
,*fshift
,*x
,*f
;
1009 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1011 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1013 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1015 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1016 int vdwjidx0A
,vdwjidx0B
;
1017 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1018 int vdwjidx1A
,vdwjidx1B
;
1019 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1020 int vdwjidx2A
,vdwjidx2B
;
1021 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1022 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1023 __m128d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
1024 __m128d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
1025 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
1026 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1027 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1028 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
1029 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1030 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1031 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1034 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1037 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
1038 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
1040 __m128i ifour
= _mm_set1_epi32(4);
1041 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
1043 __m128d dummy_mask
,cutoff_mask
;
1044 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1045 __m128d one
= _mm_set1_pd(1.0);
1046 __m128d two
= _mm_set1_pd(2.0);
1052 jindex
= nlist
->jindex
;
1054 shiftidx
= nlist
->shift
;
1056 shiftvec
= fr
->shift_vec
[0];
1057 fshift
= fr
->fshift
[0];
1058 facel
= _mm_set1_pd(fr
->epsfac
);
1059 charge
= mdatoms
->chargeA
;
1060 krf
= _mm_set1_pd(fr
->ic
->k_rf
);
1061 krf2
= _mm_set1_pd(fr
->ic
->k_rf
*2.0);
1062 crf
= _mm_set1_pd(fr
->ic
->c_rf
);
1063 nvdwtype
= fr
->ntype
;
1064 vdwparam
= fr
->nbfp
;
1065 vdwtype
= mdatoms
->typeA
;
1067 vftab
= kernel_data
->table_vdw
->data
;
1068 vftabscale
= _mm_set1_pd(kernel_data
->table_vdw
->scale
);
1070 /* Setup water-specific parameters */
1071 inr
= nlist
->iinr
[0];
1072 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
1073 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
1074 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
1075 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1077 jq0
= _mm_set1_pd(charge
[inr
+0]);
1078 jq1
= _mm_set1_pd(charge
[inr
+1]);
1079 jq2
= _mm_set1_pd(charge
[inr
+2]);
1080 vdwjidx0A
= 2*vdwtype
[inr
+0];
1081 qq00
= _mm_mul_pd(iq0
,jq0
);
1082 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1083 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1084 qq01
= _mm_mul_pd(iq0
,jq1
);
1085 qq02
= _mm_mul_pd(iq0
,jq2
);
1086 qq10
= _mm_mul_pd(iq1
,jq0
);
1087 qq11
= _mm_mul_pd(iq1
,jq1
);
1088 qq12
= _mm_mul_pd(iq1
,jq2
);
1089 qq20
= _mm_mul_pd(iq2
,jq0
);
1090 qq21
= _mm_mul_pd(iq2
,jq1
);
1091 qq22
= _mm_mul_pd(iq2
,jq2
);
1093 /* Avoid stupid compiler warnings */
1095 j_coord_offsetA
= 0;
1096 j_coord_offsetB
= 0;
1101 /* Start outer loop over neighborlists */
1102 for(iidx
=0; iidx
<nri
; iidx
++)
1104 /* Load shift vector for this list */
1105 i_shift_offset
= DIM
*shiftidx
[iidx
];
1107 /* Load limits for loop over neighbors */
1108 j_index_start
= jindex
[iidx
];
1109 j_index_end
= jindex
[iidx
+1];
1111 /* Get outer coordinate index */
1113 i_coord_offset
= DIM
*inr
;
1115 /* Load i particle coords and add shift vector */
1116 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1117 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
1119 fix0
= _mm_setzero_pd();
1120 fiy0
= _mm_setzero_pd();
1121 fiz0
= _mm_setzero_pd();
1122 fix1
= _mm_setzero_pd();
1123 fiy1
= _mm_setzero_pd();
1124 fiz1
= _mm_setzero_pd();
1125 fix2
= _mm_setzero_pd();
1126 fiy2
= _mm_setzero_pd();
1127 fiz2
= _mm_setzero_pd();
1129 /* Start inner kernel loop */
1130 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1133 /* Get j neighbor index, and coordinate index */
1135 jnrB
= jjnr
[jidx
+1];
1136 j_coord_offsetA
= DIM
*jnrA
;
1137 j_coord_offsetB
= DIM
*jnrB
;
1139 /* load j atom coordinates */
1140 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1141 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1143 /* Calculate displacement vector */
1144 dx00
= _mm_sub_pd(ix0
,jx0
);
1145 dy00
= _mm_sub_pd(iy0
,jy0
);
1146 dz00
= _mm_sub_pd(iz0
,jz0
);
1147 dx01
= _mm_sub_pd(ix0
,jx1
);
1148 dy01
= _mm_sub_pd(iy0
,jy1
);
1149 dz01
= _mm_sub_pd(iz0
,jz1
);
1150 dx02
= _mm_sub_pd(ix0
,jx2
);
1151 dy02
= _mm_sub_pd(iy0
,jy2
);
1152 dz02
= _mm_sub_pd(iz0
,jz2
);
1153 dx10
= _mm_sub_pd(ix1
,jx0
);
1154 dy10
= _mm_sub_pd(iy1
,jy0
);
1155 dz10
= _mm_sub_pd(iz1
,jz0
);
1156 dx11
= _mm_sub_pd(ix1
,jx1
);
1157 dy11
= _mm_sub_pd(iy1
,jy1
);
1158 dz11
= _mm_sub_pd(iz1
,jz1
);
1159 dx12
= _mm_sub_pd(ix1
,jx2
);
1160 dy12
= _mm_sub_pd(iy1
,jy2
);
1161 dz12
= _mm_sub_pd(iz1
,jz2
);
1162 dx20
= _mm_sub_pd(ix2
,jx0
);
1163 dy20
= _mm_sub_pd(iy2
,jy0
);
1164 dz20
= _mm_sub_pd(iz2
,jz0
);
1165 dx21
= _mm_sub_pd(ix2
,jx1
);
1166 dy21
= _mm_sub_pd(iy2
,jy1
);
1167 dz21
= _mm_sub_pd(iz2
,jz1
);
1168 dx22
= _mm_sub_pd(ix2
,jx2
);
1169 dy22
= _mm_sub_pd(iy2
,jy2
);
1170 dz22
= _mm_sub_pd(iz2
,jz2
);
1172 /* Calculate squared distance and things based on it */
1173 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1174 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
1175 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
1176 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1177 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1178 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1179 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1180 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1181 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1183 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1184 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
1185 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
1186 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
1187 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1188 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1189 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
1190 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1191 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1193 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1194 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
1195 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
1196 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
1197 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1198 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1199 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
1200 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1201 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1203 fjx0
= _mm_setzero_pd();
1204 fjy0
= _mm_setzero_pd();
1205 fjz0
= _mm_setzero_pd();
1206 fjx1
= _mm_setzero_pd();
1207 fjy1
= _mm_setzero_pd();
1208 fjz1
= _mm_setzero_pd();
1209 fjx2
= _mm_setzero_pd();
1210 fjy2
= _mm_setzero_pd();
1211 fjz2
= _mm_setzero_pd();
1213 /**************************
1214 * CALCULATE INTERACTIONS *
1215 **************************/
1217 r00
= _mm_mul_pd(rsq00
,rinv00
);
1219 /* Calculate table index by multiplying r with table scale and truncate to integer */
1220 rt
= _mm_mul_pd(r00
,vftabscale
);
1221 vfitab
= _mm_cvttpd_epi32(rt
);
1222 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1223 vfitab
= _mm_slli_epi32(vfitab
,3);
1225 /* REACTION-FIELD ELECTROSTATICS */
1226 felec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_mul_pd(rinv00
,rinvsq00
),krf2
));
1228 /* CUBIC SPLINE TABLE DISPERSION */
1229 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1230 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1231 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1232 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1233 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1234 GMX_MM_TRANSPOSE2_PD(G
,H
);
1235 Heps
= _mm_mul_pd(vfeps
,H
);
1236 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1237 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1238 fvdw6
= _mm_mul_pd(c6_00
,FF
);
1240 /* CUBIC SPLINE TABLE REPULSION */
1241 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1242 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1243 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1244 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1245 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1246 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1247 GMX_MM_TRANSPOSE2_PD(G
,H
);
1248 Heps
= _mm_mul_pd(vfeps
,H
);
1249 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1250 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1251 fvdw12
= _mm_mul_pd(c12_00
,FF
);
1252 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
1254 fscal
= _mm_add_pd(felec
,fvdw
);
1256 /* Calculate temporary vectorial force */
1257 tx
= _mm_mul_pd(fscal
,dx00
);
1258 ty
= _mm_mul_pd(fscal
,dy00
);
1259 tz
= _mm_mul_pd(fscal
,dz00
);
1261 /* Update vectorial force */
1262 fix0
= _mm_add_pd(fix0
,tx
);
1263 fiy0
= _mm_add_pd(fiy0
,ty
);
1264 fiz0
= _mm_add_pd(fiz0
,tz
);
1266 fjx0
= _mm_add_pd(fjx0
,tx
);
1267 fjy0
= _mm_add_pd(fjy0
,ty
);
1268 fjz0
= _mm_add_pd(fjz0
,tz
);
1270 /**************************
1271 * CALCULATE INTERACTIONS *
1272 **************************/
1274 /* REACTION-FIELD ELECTROSTATICS */
1275 felec
= _mm_mul_pd(qq01
,_mm_sub_pd(_mm_mul_pd(rinv01
,rinvsq01
),krf2
));
1279 /* Calculate temporary vectorial force */
1280 tx
= _mm_mul_pd(fscal
,dx01
);
1281 ty
= _mm_mul_pd(fscal
,dy01
);
1282 tz
= _mm_mul_pd(fscal
,dz01
);
1284 /* Update vectorial force */
1285 fix0
= _mm_add_pd(fix0
,tx
);
1286 fiy0
= _mm_add_pd(fiy0
,ty
);
1287 fiz0
= _mm_add_pd(fiz0
,tz
);
1289 fjx1
= _mm_add_pd(fjx1
,tx
);
1290 fjy1
= _mm_add_pd(fjy1
,ty
);
1291 fjz1
= _mm_add_pd(fjz1
,tz
);
1293 /**************************
1294 * CALCULATE INTERACTIONS *
1295 **************************/
1297 /* REACTION-FIELD ELECTROSTATICS */
1298 felec
= _mm_mul_pd(qq02
,_mm_sub_pd(_mm_mul_pd(rinv02
,rinvsq02
),krf2
));
1302 /* Calculate temporary vectorial force */
1303 tx
= _mm_mul_pd(fscal
,dx02
);
1304 ty
= _mm_mul_pd(fscal
,dy02
);
1305 tz
= _mm_mul_pd(fscal
,dz02
);
1307 /* Update vectorial force */
1308 fix0
= _mm_add_pd(fix0
,tx
);
1309 fiy0
= _mm_add_pd(fiy0
,ty
);
1310 fiz0
= _mm_add_pd(fiz0
,tz
);
1312 fjx2
= _mm_add_pd(fjx2
,tx
);
1313 fjy2
= _mm_add_pd(fjy2
,ty
);
1314 fjz2
= _mm_add_pd(fjz2
,tz
);
1316 /**************************
1317 * CALCULATE INTERACTIONS *
1318 **************************/
1320 /* REACTION-FIELD ELECTROSTATICS */
1321 felec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_mul_pd(rinv10
,rinvsq10
),krf2
));
1325 /* Calculate temporary vectorial force */
1326 tx
= _mm_mul_pd(fscal
,dx10
);
1327 ty
= _mm_mul_pd(fscal
,dy10
);
1328 tz
= _mm_mul_pd(fscal
,dz10
);
1330 /* Update vectorial force */
1331 fix1
= _mm_add_pd(fix1
,tx
);
1332 fiy1
= _mm_add_pd(fiy1
,ty
);
1333 fiz1
= _mm_add_pd(fiz1
,tz
);
1335 fjx0
= _mm_add_pd(fjx0
,tx
);
1336 fjy0
= _mm_add_pd(fjy0
,ty
);
1337 fjz0
= _mm_add_pd(fjz0
,tz
);
1339 /**************************
1340 * CALCULATE INTERACTIONS *
1341 **************************/
1343 /* REACTION-FIELD ELECTROSTATICS */
1344 felec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_mul_pd(rinv11
,rinvsq11
),krf2
));
1348 /* Calculate temporary vectorial force */
1349 tx
= _mm_mul_pd(fscal
,dx11
);
1350 ty
= _mm_mul_pd(fscal
,dy11
);
1351 tz
= _mm_mul_pd(fscal
,dz11
);
1353 /* Update vectorial force */
1354 fix1
= _mm_add_pd(fix1
,tx
);
1355 fiy1
= _mm_add_pd(fiy1
,ty
);
1356 fiz1
= _mm_add_pd(fiz1
,tz
);
1358 fjx1
= _mm_add_pd(fjx1
,tx
);
1359 fjy1
= _mm_add_pd(fjy1
,ty
);
1360 fjz1
= _mm_add_pd(fjz1
,tz
);
1362 /**************************
1363 * CALCULATE INTERACTIONS *
1364 **************************/
1366 /* REACTION-FIELD ELECTROSTATICS */
1367 felec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_mul_pd(rinv12
,rinvsq12
),krf2
));
1371 /* Calculate temporary vectorial force */
1372 tx
= _mm_mul_pd(fscal
,dx12
);
1373 ty
= _mm_mul_pd(fscal
,dy12
);
1374 tz
= _mm_mul_pd(fscal
,dz12
);
1376 /* Update vectorial force */
1377 fix1
= _mm_add_pd(fix1
,tx
);
1378 fiy1
= _mm_add_pd(fiy1
,ty
);
1379 fiz1
= _mm_add_pd(fiz1
,tz
);
1381 fjx2
= _mm_add_pd(fjx2
,tx
);
1382 fjy2
= _mm_add_pd(fjy2
,ty
);
1383 fjz2
= _mm_add_pd(fjz2
,tz
);
1385 /**************************
1386 * CALCULATE INTERACTIONS *
1387 **************************/
1389 /* REACTION-FIELD ELECTROSTATICS */
1390 felec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_mul_pd(rinv20
,rinvsq20
),krf2
));
1394 /* Calculate temporary vectorial force */
1395 tx
= _mm_mul_pd(fscal
,dx20
);
1396 ty
= _mm_mul_pd(fscal
,dy20
);
1397 tz
= _mm_mul_pd(fscal
,dz20
);
1399 /* Update vectorial force */
1400 fix2
= _mm_add_pd(fix2
,tx
);
1401 fiy2
= _mm_add_pd(fiy2
,ty
);
1402 fiz2
= _mm_add_pd(fiz2
,tz
);
1404 fjx0
= _mm_add_pd(fjx0
,tx
);
1405 fjy0
= _mm_add_pd(fjy0
,ty
);
1406 fjz0
= _mm_add_pd(fjz0
,tz
);
1408 /**************************
1409 * CALCULATE INTERACTIONS *
1410 **************************/
1412 /* REACTION-FIELD ELECTROSTATICS */
1413 felec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_mul_pd(rinv21
,rinvsq21
),krf2
));
1417 /* Calculate temporary vectorial force */
1418 tx
= _mm_mul_pd(fscal
,dx21
);
1419 ty
= _mm_mul_pd(fscal
,dy21
);
1420 tz
= _mm_mul_pd(fscal
,dz21
);
1422 /* Update vectorial force */
1423 fix2
= _mm_add_pd(fix2
,tx
);
1424 fiy2
= _mm_add_pd(fiy2
,ty
);
1425 fiz2
= _mm_add_pd(fiz2
,tz
);
1427 fjx1
= _mm_add_pd(fjx1
,tx
);
1428 fjy1
= _mm_add_pd(fjy1
,ty
);
1429 fjz1
= _mm_add_pd(fjz1
,tz
);
1431 /**************************
1432 * CALCULATE INTERACTIONS *
1433 **************************/
1435 /* REACTION-FIELD ELECTROSTATICS */
1436 felec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_mul_pd(rinv22
,rinvsq22
),krf2
));
1440 /* Calculate temporary vectorial force */
1441 tx
= _mm_mul_pd(fscal
,dx22
);
1442 ty
= _mm_mul_pd(fscal
,dy22
);
1443 tz
= _mm_mul_pd(fscal
,dz22
);
1445 /* Update vectorial force */
1446 fix2
= _mm_add_pd(fix2
,tx
);
1447 fiy2
= _mm_add_pd(fiy2
,ty
);
1448 fiz2
= _mm_add_pd(fiz2
,tz
);
1450 fjx2
= _mm_add_pd(fjx2
,tx
);
1451 fjy2
= _mm_add_pd(fjy2
,ty
);
1452 fjz2
= _mm_add_pd(fjz2
,tz
);
1454 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1456 /* Inner loop uses 270 flops */
1459 if(jidx
<j_index_end
)
1463 j_coord_offsetA
= DIM
*jnrA
;
1465 /* load j atom coordinates */
1466 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
1467 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1469 /* Calculate displacement vector */
1470 dx00
= _mm_sub_pd(ix0
,jx0
);
1471 dy00
= _mm_sub_pd(iy0
,jy0
);
1472 dz00
= _mm_sub_pd(iz0
,jz0
);
1473 dx01
= _mm_sub_pd(ix0
,jx1
);
1474 dy01
= _mm_sub_pd(iy0
,jy1
);
1475 dz01
= _mm_sub_pd(iz0
,jz1
);
1476 dx02
= _mm_sub_pd(ix0
,jx2
);
1477 dy02
= _mm_sub_pd(iy0
,jy2
);
1478 dz02
= _mm_sub_pd(iz0
,jz2
);
1479 dx10
= _mm_sub_pd(ix1
,jx0
);
1480 dy10
= _mm_sub_pd(iy1
,jy0
);
1481 dz10
= _mm_sub_pd(iz1
,jz0
);
1482 dx11
= _mm_sub_pd(ix1
,jx1
);
1483 dy11
= _mm_sub_pd(iy1
,jy1
);
1484 dz11
= _mm_sub_pd(iz1
,jz1
);
1485 dx12
= _mm_sub_pd(ix1
,jx2
);
1486 dy12
= _mm_sub_pd(iy1
,jy2
);
1487 dz12
= _mm_sub_pd(iz1
,jz2
);
1488 dx20
= _mm_sub_pd(ix2
,jx0
);
1489 dy20
= _mm_sub_pd(iy2
,jy0
);
1490 dz20
= _mm_sub_pd(iz2
,jz0
);
1491 dx21
= _mm_sub_pd(ix2
,jx1
);
1492 dy21
= _mm_sub_pd(iy2
,jy1
);
1493 dz21
= _mm_sub_pd(iz2
,jz1
);
1494 dx22
= _mm_sub_pd(ix2
,jx2
);
1495 dy22
= _mm_sub_pd(iy2
,jy2
);
1496 dz22
= _mm_sub_pd(iz2
,jz2
);
1498 /* Calculate squared distance and things based on it */
1499 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1500 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
1501 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
1502 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1503 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1504 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1505 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1506 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1507 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1509 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1510 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
1511 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
1512 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
1513 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1514 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1515 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
1516 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1517 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1519 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1520 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
1521 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
1522 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
1523 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1524 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1525 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
1526 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1527 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1529 fjx0
= _mm_setzero_pd();
1530 fjy0
= _mm_setzero_pd();
1531 fjz0
= _mm_setzero_pd();
1532 fjx1
= _mm_setzero_pd();
1533 fjy1
= _mm_setzero_pd();
1534 fjz1
= _mm_setzero_pd();
1535 fjx2
= _mm_setzero_pd();
1536 fjy2
= _mm_setzero_pd();
1537 fjz2
= _mm_setzero_pd();
1539 /**************************
1540 * CALCULATE INTERACTIONS *
1541 **************************/
1543 r00
= _mm_mul_pd(rsq00
,rinv00
);
1545 /* Calculate table index by multiplying r with table scale and truncate to integer */
1546 rt
= _mm_mul_pd(r00
,vftabscale
);
1547 vfitab
= _mm_cvttpd_epi32(rt
);
1548 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1549 vfitab
= _mm_slli_epi32(vfitab
,3);
1551 /* REACTION-FIELD ELECTROSTATICS */
1552 felec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_mul_pd(rinv00
,rinvsq00
),krf2
));
1554 /* CUBIC SPLINE TABLE DISPERSION */
1555 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1556 F
= _mm_setzero_pd();
1557 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1558 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1559 H
= _mm_setzero_pd();
1560 GMX_MM_TRANSPOSE2_PD(G
,H
);
1561 Heps
= _mm_mul_pd(vfeps
,H
);
1562 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1563 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1564 fvdw6
= _mm_mul_pd(c6_00
,FF
);
1566 /* CUBIC SPLINE TABLE REPULSION */
1567 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1568 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1569 F
= _mm_setzero_pd();
1570 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1571 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1572 H
= _mm_setzero_pd();
1573 GMX_MM_TRANSPOSE2_PD(G
,H
);
1574 Heps
= _mm_mul_pd(vfeps
,H
);
1575 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1576 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1577 fvdw12
= _mm_mul_pd(c12_00
,FF
);
1578 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
1580 fscal
= _mm_add_pd(felec
,fvdw
);
1582 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1584 /* Calculate temporary vectorial force */
1585 tx
= _mm_mul_pd(fscal
,dx00
);
1586 ty
= _mm_mul_pd(fscal
,dy00
);
1587 tz
= _mm_mul_pd(fscal
,dz00
);
1589 /* Update vectorial force */
1590 fix0
= _mm_add_pd(fix0
,tx
);
1591 fiy0
= _mm_add_pd(fiy0
,ty
);
1592 fiz0
= _mm_add_pd(fiz0
,tz
);
1594 fjx0
= _mm_add_pd(fjx0
,tx
);
1595 fjy0
= _mm_add_pd(fjy0
,ty
);
1596 fjz0
= _mm_add_pd(fjz0
,tz
);
1598 /**************************
1599 * CALCULATE INTERACTIONS *
1600 **************************/
1602 /* REACTION-FIELD ELECTROSTATICS */
1603 felec
= _mm_mul_pd(qq01
,_mm_sub_pd(_mm_mul_pd(rinv01
,rinvsq01
),krf2
));
1607 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1609 /* Calculate temporary vectorial force */
1610 tx
= _mm_mul_pd(fscal
,dx01
);
1611 ty
= _mm_mul_pd(fscal
,dy01
);
1612 tz
= _mm_mul_pd(fscal
,dz01
);
1614 /* Update vectorial force */
1615 fix0
= _mm_add_pd(fix0
,tx
);
1616 fiy0
= _mm_add_pd(fiy0
,ty
);
1617 fiz0
= _mm_add_pd(fiz0
,tz
);
1619 fjx1
= _mm_add_pd(fjx1
,tx
);
1620 fjy1
= _mm_add_pd(fjy1
,ty
);
1621 fjz1
= _mm_add_pd(fjz1
,tz
);
1623 /**************************
1624 * CALCULATE INTERACTIONS *
1625 **************************/
1627 /* REACTION-FIELD ELECTROSTATICS */
1628 felec
= _mm_mul_pd(qq02
,_mm_sub_pd(_mm_mul_pd(rinv02
,rinvsq02
),krf2
));
1632 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1634 /* Calculate temporary vectorial force */
1635 tx
= _mm_mul_pd(fscal
,dx02
);
1636 ty
= _mm_mul_pd(fscal
,dy02
);
1637 tz
= _mm_mul_pd(fscal
,dz02
);
1639 /* Update vectorial force */
1640 fix0
= _mm_add_pd(fix0
,tx
);
1641 fiy0
= _mm_add_pd(fiy0
,ty
);
1642 fiz0
= _mm_add_pd(fiz0
,tz
);
1644 fjx2
= _mm_add_pd(fjx2
,tx
);
1645 fjy2
= _mm_add_pd(fjy2
,ty
);
1646 fjz2
= _mm_add_pd(fjz2
,tz
);
1648 /**************************
1649 * CALCULATE INTERACTIONS *
1650 **************************/
1652 /* REACTION-FIELD ELECTROSTATICS */
1653 felec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_mul_pd(rinv10
,rinvsq10
),krf2
));
1657 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1659 /* Calculate temporary vectorial force */
1660 tx
= _mm_mul_pd(fscal
,dx10
);
1661 ty
= _mm_mul_pd(fscal
,dy10
);
1662 tz
= _mm_mul_pd(fscal
,dz10
);
1664 /* Update vectorial force */
1665 fix1
= _mm_add_pd(fix1
,tx
);
1666 fiy1
= _mm_add_pd(fiy1
,ty
);
1667 fiz1
= _mm_add_pd(fiz1
,tz
);
1669 fjx0
= _mm_add_pd(fjx0
,tx
);
1670 fjy0
= _mm_add_pd(fjy0
,ty
);
1671 fjz0
= _mm_add_pd(fjz0
,tz
);
1673 /**************************
1674 * CALCULATE INTERACTIONS *
1675 **************************/
1677 /* REACTION-FIELD ELECTROSTATICS */
1678 felec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_mul_pd(rinv11
,rinvsq11
),krf2
));
1682 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1684 /* Calculate temporary vectorial force */
1685 tx
= _mm_mul_pd(fscal
,dx11
);
1686 ty
= _mm_mul_pd(fscal
,dy11
);
1687 tz
= _mm_mul_pd(fscal
,dz11
);
1689 /* Update vectorial force */
1690 fix1
= _mm_add_pd(fix1
,tx
);
1691 fiy1
= _mm_add_pd(fiy1
,ty
);
1692 fiz1
= _mm_add_pd(fiz1
,tz
);
1694 fjx1
= _mm_add_pd(fjx1
,tx
);
1695 fjy1
= _mm_add_pd(fjy1
,ty
);
1696 fjz1
= _mm_add_pd(fjz1
,tz
);
1698 /**************************
1699 * CALCULATE INTERACTIONS *
1700 **************************/
1702 /* REACTION-FIELD ELECTROSTATICS */
1703 felec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_mul_pd(rinv12
,rinvsq12
),krf2
));
1707 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1709 /* Calculate temporary vectorial force */
1710 tx
= _mm_mul_pd(fscal
,dx12
);
1711 ty
= _mm_mul_pd(fscal
,dy12
);
1712 tz
= _mm_mul_pd(fscal
,dz12
);
1714 /* Update vectorial force */
1715 fix1
= _mm_add_pd(fix1
,tx
);
1716 fiy1
= _mm_add_pd(fiy1
,ty
);
1717 fiz1
= _mm_add_pd(fiz1
,tz
);
1719 fjx2
= _mm_add_pd(fjx2
,tx
);
1720 fjy2
= _mm_add_pd(fjy2
,ty
);
1721 fjz2
= _mm_add_pd(fjz2
,tz
);
1723 /**************************
1724 * CALCULATE INTERACTIONS *
1725 **************************/
1727 /* REACTION-FIELD ELECTROSTATICS */
1728 felec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_mul_pd(rinv20
,rinvsq20
),krf2
));
1732 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1734 /* Calculate temporary vectorial force */
1735 tx
= _mm_mul_pd(fscal
,dx20
);
1736 ty
= _mm_mul_pd(fscal
,dy20
);
1737 tz
= _mm_mul_pd(fscal
,dz20
);
1739 /* Update vectorial force */
1740 fix2
= _mm_add_pd(fix2
,tx
);
1741 fiy2
= _mm_add_pd(fiy2
,ty
);
1742 fiz2
= _mm_add_pd(fiz2
,tz
);
1744 fjx0
= _mm_add_pd(fjx0
,tx
);
1745 fjy0
= _mm_add_pd(fjy0
,ty
);
1746 fjz0
= _mm_add_pd(fjz0
,tz
);
1748 /**************************
1749 * CALCULATE INTERACTIONS *
1750 **************************/
1752 /* REACTION-FIELD ELECTROSTATICS */
1753 felec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_mul_pd(rinv21
,rinvsq21
),krf2
));
1757 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1759 /* Calculate temporary vectorial force */
1760 tx
= _mm_mul_pd(fscal
,dx21
);
1761 ty
= _mm_mul_pd(fscal
,dy21
);
1762 tz
= _mm_mul_pd(fscal
,dz21
);
1764 /* Update vectorial force */
1765 fix2
= _mm_add_pd(fix2
,tx
);
1766 fiy2
= _mm_add_pd(fiy2
,ty
);
1767 fiz2
= _mm_add_pd(fiz2
,tz
);
1769 fjx1
= _mm_add_pd(fjx1
,tx
);
1770 fjy1
= _mm_add_pd(fjy1
,ty
);
1771 fjz1
= _mm_add_pd(fjz1
,tz
);
1773 /**************************
1774 * CALCULATE INTERACTIONS *
1775 **************************/
1777 /* REACTION-FIELD ELECTROSTATICS */
1778 felec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_mul_pd(rinv22
,rinvsq22
),krf2
));
1782 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1784 /* Calculate temporary vectorial force */
1785 tx
= _mm_mul_pd(fscal
,dx22
);
1786 ty
= _mm_mul_pd(fscal
,dy22
);
1787 tz
= _mm_mul_pd(fscal
,dz22
);
1789 /* Update vectorial force */
1790 fix2
= _mm_add_pd(fix2
,tx
);
1791 fiy2
= _mm_add_pd(fiy2
,ty
);
1792 fiz2
= _mm_add_pd(fiz2
,tz
);
1794 fjx2
= _mm_add_pd(fjx2
,tx
);
1795 fjy2
= _mm_add_pd(fjy2
,ty
);
1796 fjz2
= _mm_add_pd(fjz2
,tz
);
1798 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1800 /* Inner loop uses 270 flops */
1803 /* End of innermost loop */
1805 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1806 f
+i_coord_offset
,fshift
+i_shift_offset
);
1808 /* Increment number of inner iterations */
1809 inneriter
+= j_index_end
- j_index_start
;
1811 /* Outer loop uses 18 flops */
1814 /* Increment number of outer iterations */
1817 /* Update outer/inner flops */
1819 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_F
,outeriter
*18 + inneriter
*270);