2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_double
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: None
40 * Geometry: Water4-Particle
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_double
45 (t_nblist
* gmx_restrict nlist
,
46 rvec
* gmx_restrict xx
,
47 rvec
* gmx_restrict ff
,
48 t_forcerec
* gmx_restrict fr
,
49 t_mdatoms
* gmx_restrict mdatoms
,
50 nb_kernel_data_t
* gmx_restrict kernel_data
,
51 t_nrnb
* gmx_restrict nrnb
)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
59 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
61 int j_coord_offsetA
,j_coord_offsetB
;
62 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
64 real
*shiftvec
,*fshift
,*x
,*f
;
65 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
67 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
69 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
71 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
72 int vdwjidx0A
,vdwjidx0B
;
73 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
74 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
75 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
76 __m128d dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
77 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
80 __m128i ifour
= _mm_set1_epi32(4);
81 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
83 __m128d dummy_mask
,cutoff_mask
;
84 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
85 __m128d one
= _mm_set1_pd(1.0);
86 __m128d two
= _mm_set1_pd(2.0);
92 jindex
= nlist
->jindex
;
94 shiftidx
= nlist
->shift
;
96 shiftvec
= fr
->shift_vec
[0];
97 fshift
= fr
->fshift
[0];
98 facel
= _mm_set1_pd(fr
->epsfac
);
99 charge
= mdatoms
->chargeA
;
101 vftab
= kernel_data
->table_elec
->data
;
102 vftabscale
= _mm_set1_pd(kernel_data
->table_elec
->scale
);
104 /* Setup water-specific parameters */
105 inr
= nlist
->iinr
[0];
106 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
107 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
108 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
110 /* Avoid stupid compiler warnings */
118 /* Start outer loop over neighborlists */
119 for(iidx
=0; iidx
<nri
; iidx
++)
121 /* Load shift vector for this list */
122 i_shift_offset
= DIM
*shiftidx
[iidx
];
124 /* Load limits for loop over neighbors */
125 j_index_start
= jindex
[iidx
];
126 j_index_end
= jindex
[iidx
+1];
128 /* Get outer coordinate index */
130 i_coord_offset
= DIM
*inr
;
132 /* Load i particle coords and add shift vector */
133 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
+DIM
,
134 &ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
136 fix1
= _mm_setzero_pd();
137 fiy1
= _mm_setzero_pd();
138 fiz1
= _mm_setzero_pd();
139 fix2
= _mm_setzero_pd();
140 fiy2
= _mm_setzero_pd();
141 fiz2
= _mm_setzero_pd();
142 fix3
= _mm_setzero_pd();
143 fiy3
= _mm_setzero_pd();
144 fiz3
= _mm_setzero_pd();
146 /* Reset potential sums */
147 velecsum
= _mm_setzero_pd();
149 /* Start inner kernel loop */
150 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
153 /* Get j neighbor index, and coordinate index */
156 j_coord_offsetA
= DIM
*jnrA
;
157 j_coord_offsetB
= DIM
*jnrB
;
159 /* load j atom coordinates */
160 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
163 /* Calculate displacement vector */
164 dx10
= _mm_sub_pd(ix1
,jx0
);
165 dy10
= _mm_sub_pd(iy1
,jy0
);
166 dz10
= _mm_sub_pd(iz1
,jz0
);
167 dx20
= _mm_sub_pd(ix2
,jx0
);
168 dy20
= _mm_sub_pd(iy2
,jy0
);
169 dz20
= _mm_sub_pd(iz2
,jz0
);
170 dx30
= _mm_sub_pd(ix3
,jx0
);
171 dy30
= _mm_sub_pd(iy3
,jy0
);
172 dz30
= _mm_sub_pd(iz3
,jz0
);
174 /* Calculate squared distance and things based on it */
175 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
176 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
177 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
179 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
180 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
181 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
183 /* Load parameters for j particles */
184 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
186 fjx0
= _mm_setzero_pd();
187 fjy0
= _mm_setzero_pd();
188 fjz0
= _mm_setzero_pd();
190 /**************************
191 * CALCULATE INTERACTIONS *
192 **************************/
194 r10
= _mm_mul_pd(rsq10
,rinv10
);
196 /* Compute parameters for interactions between i and j atoms */
197 qq10
= _mm_mul_pd(iq1
,jq0
);
199 /* Calculate table index by multiplying r with table scale and truncate to integer */
200 rt
= _mm_mul_pd(r10
,vftabscale
);
201 vfitab
= _mm_cvttpd_epi32(rt
);
202 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
203 vfitab
= _mm_slli_epi32(vfitab
,2);
205 /* CUBIC SPLINE TABLE ELECTROSTATICS */
206 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
207 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
208 GMX_MM_TRANSPOSE2_PD(Y
,F
);
209 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
210 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
211 GMX_MM_TRANSPOSE2_PD(G
,H
);
212 Heps
= _mm_mul_pd(vfeps
,H
);
213 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
214 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
215 velec
= _mm_mul_pd(qq10
,VV
);
216 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
217 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
219 /* Update potential sum for this i atom from the interaction with this j atom. */
220 velecsum
= _mm_add_pd(velecsum
,velec
);
224 /* Calculate temporary vectorial force */
225 tx
= _mm_mul_pd(fscal
,dx10
);
226 ty
= _mm_mul_pd(fscal
,dy10
);
227 tz
= _mm_mul_pd(fscal
,dz10
);
229 /* Update vectorial force */
230 fix1
= _mm_add_pd(fix1
,tx
);
231 fiy1
= _mm_add_pd(fiy1
,ty
);
232 fiz1
= _mm_add_pd(fiz1
,tz
);
234 fjx0
= _mm_add_pd(fjx0
,tx
);
235 fjy0
= _mm_add_pd(fjy0
,ty
);
236 fjz0
= _mm_add_pd(fjz0
,tz
);
238 /**************************
239 * CALCULATE INTERACTIONS *
240 **************************/
242 r20
= _mm_mul_pd(rsq20
,rinv20
);
244 /* Compute parameters for interactions between i and j atoms */
245 qq20
= _mm_mul_pd(iq2
,jq0
);
247 /* Calculate table index by multiplying r with table scale and truncate to integer */
248 rt
= _mm_mul_pd(r20
,vftabscale
);
249 vfitab
= _mm_cvttpd_epi32(rt
);
250 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
251 vfitab
= _mm_slli_epi32(vfitab
,2);
253 /* CUBIC SPLINE TABLE ELECTROSTATICS */
254 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
255 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
256 GMX_MM_TRANSPOSE2_PD(Y
,F
);
257 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
258 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
259 GMX_MM_TRANSPOSE2_PD(G
,H
);
260 Heps
= _mm_mul_pd(vfeps
,H
);
261 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
262 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
263 velec
= _mm_mul_pd(qq20
,VV
);
264 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
265 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
267 /* Update potential sum for this i atom from the interaction with this j atom. */
268 velecsum
= _mm_add_pd(velecsum
,velec
);
272 /* Calculate temporary vectorial force */
273 tx
= _mm_mul_pd(fscal
,dx20
);
274 ty
= _mm_mul_pd(fscal
,dy20
);
275 tz
= _mm_mul_pd(fscal
,dz20
);
277 /* Update vectorial force */
278 fix2
= _mm_add_pd(fix2
,tx
);
279 fiy2
= _mm_add_pd(fiy2
,ty
);
280 fiz2
= _mm_add_pd(fiz2
,tz
);
282 fjx0
= _mm_add_pd(fjx0
,tx
);
283 fjy0
= _mm_add_pd(fjy0
,ty
);
284 fjz0
= _mm_add_pd(fjz0
,tz
);
286 /**************************
287 * CALCULATE INTERACTIONS *
288 **************************/
290 r30
= _mm_mul_pd(rsq30
,rinv30
);
292 /* Compute parameters for interactions between i and j atoms */
293 qq30
= _mm_mul_pd(iq3
,jq0
);
295 /* Calculate table index by multiplying r with table scale and truncate to integer */
296 rt
= _mm_mul_pd(r30
,vftabscale
);
297 vfitab
= _mm_cvttpd_epi32(rt
);
298 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
299 vfitab
= _mm_slli_epi32(vfitab
,2);
301 /* CUBIC SPLINE TABLE ELECTROSTATICS */
302 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
303 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
304 GMX_MM_TRANSPOSE2_PD(Y
,F
);
305 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
306 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
307 GMX_MM_TRANSPOSE2_PD(G
,H
);
308 Heps
= _mm_mul_pd(vfeps
,H
);
309 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
310 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
311 velec
= _mm_mul_pd(qq30
,VV
);
312 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
313 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq30
,FF
),_mm_mul_pd(vftabscale
,rinv30
)));
315 /* Update potential sum for this i atom from the interaction with this j atom. */
316 velecsum
= _mm_add_pd(velecsum
,velec
);
320 /* Calculate temporary vectorial force */
321 tx
= _mm_mul_pd(fscal
,dx30
);
322 ty
= _mm_mul_pd(fscal
,dy30
);
323 tz
= _mm_mul_pd(fscal
,dz30
);
325 /* Update vectorial force */
326 fix3
= _mm_add_pd(fix3
,tx
);
327 fiy3
= _mm_add_pd(fiy3
,ty
);
328 fiz3
= _mm_add_pd(fiz3
,tz
);
330 fjx0
= _mm_add_pd(fjx0
,tx
);
331 fjy0
= _mm_add_pd(fjy0
,ty
);
332 fjz0
= _mm_add_pd(fjz0
,tz
);
334 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
336 /* Inner loop uses 132 flops */
343 j_coord_offsetA
= DIM
*jnrA
;
345 /* load j atom coordinates */
346 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
349 /* Calculate displacement vector */
350 dx10
= _mm_sub_pd(ix1
,jx0
);
351 dy10
= _mm_sub_pd(iy1
,jy0
);
352 dz10
= _mm_sub_pd(iz1
,jz0
);
353 dx20
= _mm_sub_pd(ix2
,jx0
);
354 dy20
= _mm_sub_pd(iy2
,jy0
);
355 dz20
= _mm_sub_pd(iz2
,jz0
);
356 dx30
= _mm_sub_pd(ix3
,jx0
);
357 dy30
= _mm_sub_pd(iy3
,jy0
);
358 dz30
= _mm_sub_pd(iz3
,jz0
);
360 /* Calculate squared distance and things based on it */
361 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
362 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
363 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
365 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
366 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
367 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
369 /* Load parameters for j particles */
370 jq0
= _mm_load_sd(charge
+jnrA
+0);
372 fjx0
= _mm_setzero_pd();
373 fjy0
= _mm_setzero_pd();
374 fjz0
= _mm_setzero_pd();
376 /**************************
377 * CALCULATE INTERACTIONS *
378 **************************/
380 r10
= _mm_mul_pd(rsq10
,rinv10
);
382 /* Compute parameters for interactions between i and j atoms */
383 qq10
= _mm_mul_pd(iq1
,jq0
);
385 /* Calculate table index by multiplying r with table scale and truncate to integer */
386 rt
= _mm_mul_pd(r10
,vftabscale
);
387 vfitab
= _mm_cvttpd_epi32(rt
);
388 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
389 vfitab
= _mm_slli_epi32(vfitab
,2);
391 /* CUBIC SPLINE TABLE ELECTROSTATICS */
392 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
393 F
= _mm_setzero_pd();
394 GMX_MM_TRANSPOSE2_PD(Y
,F
);
395 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
396 H
= _mm_setzero_pd();
397 GMX_MM_TRANSPOSE2_PD(G
,H
);
398 Heps
= _mm_mul_pd(vfeps
,H
);
399 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
400 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
401 velec
= _mm_mul_pd(qq10
,VV
);
402 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
403 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
405 /* Update potential sum for this i atom from the interaction with this j atom. */
406 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
407 velecsum
= _mm_add_pd(velecsum
,velec
);
411 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
413 /* Calculate temporary vectorial force */
414 tx
= _mm_mul_pd(fscal
,dx10
);
415 ty
= _mm_mul_pd(fscal
,dy10
);
416 tz
= _mm_mul_pd(fscal
,dz10
);
418 /* Update vectorial force */
419 fix1
= _mm_add_pd(fix1
,tx
);
420 fiy1
= _mm_add_pd(fiy1
,ty
);
421 fiz1
= _mm_add_pd(fiz1
,tz
);
423 fjx0
= _mm_add_pd(fjx0
,tx
);
424 fjy0
= _mm_add_pd(fjy0
,ty
);
425 fjz0
= _mm_add_pd(fjz0
,tz
);
427 /**************************
428 * CALCULATE INTERACTIONS *
429 **************************/
431 r20
= _mm_mul_pd(rsq20
,rinv20
);
433 /* Compute parameters for interactions between i and j atoms */
434 qq20
= _mm_mul_pd(iq2
,jq0
);
436 /* Calculate table index by multiplying r with table scale and truncate to integer */
437 rt
= _mm_mul_pd(r20
,vftabscale
);
438 vfitab
= _mm_cvttpd_epi32(rt
);
439 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
440 vfitab
= _mm_slli_epi32(vfitab
,2);
442 /* CUBIC SPLINE TABLE ELECTROSTATICS */
443 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
444 F
= _mm_setzero_pd();
445 GMX_MM_TRANSPOSE2_PD(Y
,F
);
446 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
447 H
= _mm_setzero_pd();
448 GMX_MM_TRANSPOSE2_PD(G
,H
);
449 Heps
= _mm_mul_pd(vfeps
,H
);
450 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
451 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
452 velec
= _mm_mul_pd(qq20
,VV
);
453 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
454 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
456 /* Update potential sum for this i atom from the interaction with this j atom. */
457 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
458 velecsum
= _mm_add_pd(velecsum
,velec
);
462 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
464 /* Calculate temporary vectorial force */
465 tx
= _mm_mul_pd(fscal
,dx20
);
466 ty
= _mm_mul_pd(fscal
,dy20
);
467 tz
= _mm_mul_pd(fscal
,dz20
);
469 /* Update vectorial force */
470 fix2
= _mm_add_pd(fix2
,tx
);
471 fiy2
= _mm_add_pd(fiy2
,ty
);
472 fiz2
= _mm_add_pd(fiz2
,tz
);
474 fjx0
= _mm_add_pd(fjx0
,tx
);
475 fjy0
= _mm_add_pd(fjy0
,ty
);
476 fjz0
= _mm_add_pd(fjz0
,tz
);
478 /**************************
479 * CALCULATE INTERACTIONS *
480 **************************/
482 r30
= _mm_mul_pd(rsq30
,rinv30
);
484 /* Compute parameters for interactions between i and j atoms */
485 qq30
= _mm_mul_pd(iq3
,jq0
);
487 /* Calculate table index by multiplying r with table scale and truncate to integer */
488 rt
= _mm_mul_pd(r30
,vftabscale
);
489 vfitab
= _mm_cvttpd_epi32(rt
);
490 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
491 vfitab
= _mm_slli_epi32(vfitab
,2);
493 /* CUBIC SPLINE TABLE ELECTROSTATICS */
494 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
495 F
= _mm_setzero_pd();
496 GMX_MM_TRANSPOSE2_PD(Y
,F
);
497 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
498 H
= _mm_setzero_pd();
499 GMX_MM_TRANSPOSE2_PD(G
,H
);
500 Heps
= _mm_mul_pd(vfeps
,H
);
501 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
502 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
503 velec
= _mm_mul_pd(qq30
,VV
);
504 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
505 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq30
,FF
),_mm_mul_pd(vftabscale
,rinv30
)));
507 /* Update potential sum for this i atom from the interaction with this j atom. */
508 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
509 velecsum
= _mm_add_pd(velecsum
,velec
);
513 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
515 /* Calculate temporary vectorial force */
516 tx
= _mm_mul_pd(fscal
,dx30
);
517 ty
= _mm_mul_pd(fscal
,dy30
);
518 tz
= _mm_mul_pd(fscal
,dz30
);
520 /* Update vectorial force */
521 fix3
= _mm_add_pd(fix3
,tx
);
522 fiy3
= _mm_add_pd(fiy3
,ty
);
523 fiz3
= _mm_add_pd(fiz3
,tz
);
525 fjx0
= _mm_add_pd(fjx0
,tx
);
526 fjy0
= _mm_add_pd(fjy0
,ty
);
527 fjz0
= _mm_add_pd(fjz0
,tz
);
529 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
531 /* Inner loop uses 132 flops */
534 /* End of innermost loop */
536 gmx_mm_update_iforce_3atom_swizzle_pd(fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
537 f
+i_coord_offset
+DIM
,fshift
+i_shift_offset
);
540 /* Update potential energies */
541 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
543 /* Increment number of inner iterations */
544 inneriter
+= j_index_end
- j_index_start
;
546 /* Outer loop uses 19 flops */
549 /* Increment number of outer iterations */
552 /* Update outer/inner flops */
554 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W4_VF
,outeriter
*19 + inneriter
*132);
557 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_double
558 * Electrostatics interaction: CubicSplineTable
559 * VdW interaction: None
560 * Geometry: Water4-Particle
561 * Calculate force/pot: Force
564 nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_double
565 (t_nblist
* gmx_restrict nlist
,
566 rvec
* gmx_restrict xx
,
567 rvec
* gmx_restrict ff
,
568 t_forcerec
* gmx_restrict fr
,
569 t_mdatoms
* gmx_restrict mdatoms
,
570 nb_kernel_data_t
* gmx_restrict kernel_data
,
571 t_nrnb
* gmx_restrict nrnb
)
573 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
574 * just 0 for non-waters.
575 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
576 * jnr indices corresponding to data put in the four positions in the SIMD register.
578 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
579 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
581 int j_coord_offsetA
,j_coord_offsetB
;
582 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
584 real
*shiftvec
,*fshift
,*x
,*f
;
585 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
587 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
589 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
591 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
592 int vdwjidx0A
,vdwjidx0B
;
593 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
594 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
595 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
596 __m128d dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
597 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
600 __m128i ifour
= _mm_set1_epi32(4);
601 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
603 __m128d dummy_mask
,cutoff_mask
;
604 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
605 __m128d one
= _mm_set1_pd(1.0);
606 __m128d two
= _mm_set1_pd(2.0);
612 jindex
= nlist
->jindex
;
614 shiftidx
= nlist
->shift
;
616 shiftvec
= fr
->shift_vec
[0];
617 fshift
= fr
->fshift
[0];
618 facel
= _mm_set1_pd(fr
->epsfac
);
619 charge
= mdatoms
->chargeA
;
621 vftab
= kernel_data
->table_elec
->data
;
622 vftabscale
= _mm_set1_pd(kernel_data
->table_elec
->scale
);
624 /* Setup water-specific parameters */
625 inr
= nlist
->iinr
[0];
626 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
627 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
628 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
630 /* Avoid stupid compiler warnings */
638 /* Start outer loop over neighborlists */
639 for(iidx
=0; iidx
<nri
; iidx
++)
641 /* Load shift vector for this list */
642 i_shift_offset
= DIM
*shiftidx
[iidx
];
644 /* Load limits for loop over neighbors */
645 j_index_start
= jindex
[iidx
];
646 j_index_end
= jindex
[iidx
+1];
648 /* Get outer coordinate index */
650 i_coord_offset
= DIM
*inr
;
652 /* Load i particle coords and add shift vector */
653 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
+DIM
,
654 &ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
656 fix1
= _mm_setzero_pd();
657 fiy1
= _mm_setzero_pd();
658 fiz1
= _mm_setzero_pd();
659 fix2
= _mm_setzero_pd();
660 fiy2
= _mm_setzero_pd();
661 fiz2
= _mm_setzero_pd();
662 fix3
= _mm_setzero_pd();
663 fiy3
= _mm_setzero_pd();
664 fiz3
= _mm_setzero_pd();
666 /* Start inner kernel loop */
667 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
670 /* Get j neighbor index, and coordinate index */
673 j_coord_offsetA
= DIM
*jnrA
;
674 j_coord_offsetB
= DIM
*jnrB
;
676 /* load j atom coordinates */
677 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
680 /* Calculate displacement vector */
681 dx10
= _mm_sub_pd(ix1
,jx0
);
682 dy10
= _mm_sub_pd(iy1
,jy0
);
683 dz10
= _mm_sub_pd(iz1
,jz0
);
684 dx20
= _mm_sub_pd(ix2
,jx0
);
685 dy20
= _mm_sub_pd(iy2
,jy0
);
686 dz20
= _mm_sub_pd(iz2
,jz0
);
687 dx30
= _mm_sub_pd(ix3
,jx0
);
688 dy30
= _mm_sub_pd(iy3
,jy0
);
689 dz30
= _mm_sub_pd(iz3
,jz0
);
691 /* Calculate squared distance and things based on it */
692 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
693 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
694 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
696 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
697 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
698 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
700 /* Load parameters for j particles */
701 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
703 fjx0
= _mm_setzero_pd();
704 fjy0
= _mm_setzero_pd();
705 fjz0
= _mm_setzero_pd();
707 /**************************
708 * CALCULATE INTERACTIONS *
709 **************************/
711 r10
= _mm_mul_pd(rsq10
,rinv10
);
713 /* Compute parameters for interactions between i and j atoms */
714 qq10
= _mm_mul_pd(iq1
,jq0
);
716 /* Calculate table index by multiplying r with table scale and truncate to integer */
717 rt
= _mm_mul_pd(r10
,vftabscale
);
718 vfitab
= _mm_cvttpd_epi32(rt
);
719 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
720 vfitab
= _mm_slli_epi32(vfitab
,2);
722 /* CUBIC SPLINE TABLE ELECTROSTATICS */
723 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
724 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
725 GMX_MM_TRANSPOSE2_PD(Y
,F
);
726 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
727 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
728 GMX_MM_TRANSPOSE2_PD(G
,H
);
729 Heps
= _mm_mul_pd(vfeps
,H
);
730 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
731 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
732 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
736 /* Calculate temporary vectorial force */
737 tx
= _mm_mul_pd(fscal
,dx10
);
738 ty
= _mm_mul_pd(fscal
,dy10
);
739 tz
= _mm_mul_pd(fscal
,dz10
);
741 /* Update vectorial force */
742 fix1
= _mm_add_pd(fix1
,tx
);
743 fiy1
= _mm_add_pd(fiy1
,ty
);
744 fiz1
= _mm_add_pd(fiz1
,tz
);
746 fjx0
= _mm_add_pd(fjx0
,tx
);
747 fjy0
= _mm_add_pd(fjy0
,ty
);
748 fjz0
= _mm_add_pd(fjz0
,tz
);
750 /**************************
751 * CALCULATE INTERACTIONS *
752 **************************/
754 r20
= _mm_mul_pd(rsq20
,rinv20
);
756 /* Compute parameters for interactions between i and j atoms */
757 qq20
= _mm_mul_pd(iq2
,jq0
);
759 /* Calculate table index by multiplying r with table scale and truncate to integer */
760 rt
= _mm_mul_pd(r20
,vftabscale
);
761 vfitab
= _mm_cvttpd_epi32(rt
);
762 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
763 vfitab
= _mm_slli_epi32(vfitab
,2);
765 /* CUBIC SPLINE TABLE ELECTROSTATICS */
766 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
767 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
768 GMX_MM_TRANSPOSE2_PD(Y
,F
);
769 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
770 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
771 GMX_MM_TRANSPOSE2_PD(G
,H
);
772 Heps
= _mm_mul_pd(vfeps
,H
);
773 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
774 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
775 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
779 /* Calculate temporary vectorial force */
780 tx
= _mm_mul_pd(fscal
,dx20
);
781 ty
= _mm_mul_pd(fscal
,dy20
);
782 tz
= _mm_mul_pd(fscal
,dz20
);
784 /* Update vectorial force */
785 fix2
= _mm_add_pd(fix2
,tx
);
786 fiy2
= _mm_add_pd(fiy2
,ty
);
787 fiz2
= _mm_add_pd(fiz2
,tz
);
789 fjx0
= _mm_add_pd(fjx0
,tx
);
790 fjy0
= _mm_add_pd(fjy0
,ty
);
791 fjz0
= _mm_add_pd(fjz0
,tz
);
793 /**************************
794 * CALCULATE INTERACTIONS *
795 **************************/
797 r30
= _mm_mul_pd(rsq30
,rinv30
);
799 /* Compute parameters for interactions between i and j atoms */
800 qq30
= _mm_mul_pd(iq3
,jq0
);
802 /* Calculate table index by multiplying r with table scale and truncate to integer */
803 rt
= _mm_mul_pd(r30
,vftabscale
);
804 vfitab
= _mm_cvttpd_epi32(rt
);
805 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
806 vfitab
= _mm_slli_epi32(vfitab
,2);
808 /* CUBIC SPLINE TABLE ELECTROSTATICS */
809 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
810 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
811 GMX_MM_TRANSPOSE2_PD(Y
,F
);
812 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
813 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
814 GMX_MM_TRANSPOSE2_PD(G
,H
);
815 Heps
= _mm_mul_pd(vfeps
,H
);
816 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
817 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
818 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq30
,FF
),_mm_mul_pd(vftabscale
,rinv30
)));
822 /* Calculate temporary vectorial force */
823 tx
= _mm_mul_pd(fscal
,dx30
);
824 ty
= _mm_mul_pd(fscal
,dy30
);
825 tz
= _mm_mul_pd(fscal
,dz30
);
827 /* Update vectorial force */
828 fix3
= _mm_add_pd(fix3
,tx
);
829 fiy3
= _mm_add_pd(fiy3
,ty
);
830 fiz3
= _mm_add_pd(fiz3
,tz
);
832 fjx0
= _mm_add_pd(fjx0
,tx
);
833 fjy0
= _mm_add_pd(fjy0
,ty
);
834 fjz0
= _mm_add_pd(fjz0
,tz
);
836 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
838 /* Inner loop uses 120 flops */
845 j_coord_offsetA
= DIM
*jnrA
;
847 /* load j atom coordinates */
848 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
851 /* Calculate displacement vector */
852 dx10
= _mm_sub_pd(ix1
,jx0
);
853 dy10
= _mm_sub_pd(iy1
,jy0
);
854 dz10
= _mm_sub_pd(iz1
,jz0
);
855 dx20
= _mm_sub_pd(ix2
,jx0
);
856 dy20
= _mm_sub_pd(iy2
,jy0
);
857 dz20
= _mm_sub_pd(iz2
,jz0
);
858 dx30
= _mm_sub_pd(ix3
,jx0
);
859 dy30
= _mm_sub_pd(iy3
,jy0
);
860 dz30
= _mm_sub_pd(iz3
,jz0
);
862 /* Calculate squared distance and things based on it */
863 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
864 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
865 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
867 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
868 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
869 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
871 /* Load parameters for j particles */
872 jq0
= _mm_load_sd(charge
+jnrA
+0);
874 fjx0
= _mm_setzero_pd();
875 fjy0
= _mm_setzero_pd();
876 fjz0
= _mm_setzero_pd();
878 /**************************
879 * CALCULATE INTERACTIONS *
880 **************************/
882 r10
= _mm_mul_pd(rsq10
,rinv10
);
884 /* Compute parameters for interactions between i and j atoms */
885 qq10
= _mm_mul_pd(iq1
,jq0
);
887 /* Calculate table index by multiplying r with table scale and truncate to integer */
888 rt
= _mm_mul_pd(r10
,vftabscale
);
889 vfitab
= _mm_cvttpd_epi32(rt
);
890 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
891 vfitab
= _mm_slli_epi32(vfitab
,2);
893 /* CUBIC SPLINE TABLE ELECTROSTATICS */
894 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
895 F
= _mm_setzero_pd();
896 GMX_MM_TRANSPOSE2_PD(Y
,F
);
897 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
898 H
= _mm_setzero_pd();
899 GMX_MM_TRANSPOSE2_PD(G
,H
);
900 Heps
= _mm_mul_pd(vfeps
,H
);
901 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
902 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
903 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
907 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
909 /* Calculate temporary vectorial force */
910 tx
= _mm_mul_pd(fscal
,dx10
);
911 ty
= _mm_mul_pd(fscal
,dy10
);
912 tz
= _mm_mul_pd(fscal
,dz10
);
914 /* Update vectorial force */
915 fix1
= _mm_add_pd(fix1
,tx
);
916 fiy1
= _mm_add_pd(fiy1
,ty
);
917 fiz1
= _mm_add_pd(fiz1
,tz
);
919 fjx0
= _mm_add_pd(fjx0
,tx
);
920 fjy0
= _mm_add_pd(fjy0
,ty
);
921 fjz0
= _mm_add_pd(fjz0
,tz
);
923 /**************************
924 * CALCULATE INTERACTIONS *
925 **************************/
927 r20
= _mm_mul_pd(rsq20
,rinv20
);
929 /* Compute parameters for interactions between i and j atoms */
930 qq20
= _mm_mul_pd(iq2
,jq0
);
932 /* Calculate table index by multiplying r with table scale and truncate to integer */
933 rt
= _mm_mul_pd(r20
,vftabscale
);
934 vfitab
= _mm_cvttpd_epi32(rt
);
935 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
936 vfitab
= _mm_slli_epi32(vfitab
,2);
938 /* CUBIC SPLINE TABLE ELECTROSTATICS */
939 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
940 F
= _mm_setzero_pd();
941 GMX_MM_TRANSPOSE2_PD(Y
,F
);
942 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
943 H
= _mm_setzero_pd();
944 GMX_MM_TRANSPOSE2_PD(G
,H
);
945 Heps
= _mm_mul_pd(vfeps
,H
);
946 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
947 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
948 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
952 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
954 /* Calculate temporary vectorial force */
955 tx
= _mm_mul_pd(fscal
,dx20
);
956 ty
= _mm_mul_pd(fscal
,dy20
);
957 tz
= _mm_mul_pd(fscal
,dz20
);
959 /* Update vectorial force */
960 fix2
= _mm_add_pd(fix2
,tx
);
961 fiy2
= _mm_add_pd(fiy2
,ty
);
962 fiz2
= _mm_add_pd(fiz2
,tz
);
964 fjx0
= _mm_add_pd(fjx0
,tx
);
965 fjy0
= _mm_add_pd(fjy0
,ty
);
966 fjz0
= _mm_add_pd(fjz0
,tz
);
968 /**************************
969 * CALCULATE INTERACTIONS *
970 **************************/
972 r30
= _mm_mul_pd(rsq30
,rinv30
);
974 /* Compute parameters for interactions between i and j atoms */
975 qq30
= _mm_mul_pd(iq3
,jq0
);
977 /* Calculate table index by multiplying r with table scale and truncate to integer */
978 rt
= _mm_mul_pd(r30
,vftabscale
);
979 vfitab
= _mm_cvttpd_epi32(rt
);
980 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
981 vfitab
= _mm_slli_epi32(vfitab
,2);
983 /* CUBIC SPLINE TABLE ELECTROSTATICS */
984 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
985 F
= _mm_setzero_pd();
986 GMX_MM_TRANSPOSE2_PD(Y
,F
);
987 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
988 H
= _mm_setzero_pd();
989 GMX_MM_TRANSPOSE2_PD(G
,H
);
990 Heps
= _mm_mul_pd(vfeps
,H
);
991 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
992 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
993 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq30
,FF
),_mm_mul_pd(vftabscale
,rinv30
)));
997 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
999 /* Calculate temporary vectorial force */
1000 tx
= _mm_mul_pd(fscal
,dx30
);
1001 ty
= _mm_mul_pd(fscal
,dy30
);
1002 tz
= _mm_mul_pd(fscal
,dz30
);
1004 /* Update vectorial force */
1005 fix3
= _mm_add_pd(fix3
,tx
);
1006 fiy3
= _mm_add_pd(fiy3
,ty
);
1007 fiz3
= _mm_add_pd(fiz3
,tz
);
1009 fjx0
= _mm_add_pd(fjx0
,tx
);
1010 fjy0
= _mm_add_pd(fjy0
,ty
);
1011 fjz0
= _mm_add_pd(fjz0
,tz
);
1013 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
1015 /* Inner loop uses 120 flops */
1018 /* End of innermost loop */
1020 gmx_mm_update_iforce_3atom_swizzle_pd(fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1021 f
+i_coord_offset
+DIM
,fshift
+i_shift_offset
);
1023 /* Increment number of inner iterations */
1024 inneriter
+= j_index_end
- j_index_start
;
1026 /* Outer loop uses 18 flops */
1029 /* Increment number of outer iterations */
1032 /* Update outer/inner flops */
1034 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W4_F
,outeriter
*18 + inneriter
*120);