/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*
 * Note: this file was generated by the GROMACS sse2_double kernel generator.
 */
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_double.h"
49 #include "kernelutil_x86_sse2_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_double
53 * Electrostatics interaction: CubicSplineTable
54 * VdW interaction: None
55 * Geometry: Water4-Particle
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_double
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
76 int j_coord_offsetA
,j_coord_offsetB
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
82 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
84 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
86 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
87 int vdwjidx0A
,vdwjidx0B
;
88 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
89 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
90 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
91 __m128d dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
92 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
95 __m128i ifour
= _mm_set1_epi32(4);
96 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
98 __m128d dummy_mask
,cutoff_mask
;
99 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
100 __m128d one
= _mm_set1_pd(1.0);
101 __m128d two
= _mm_set1_pd(2.0);
107 jindex
= nlist
->jindex
;
109 shiftidx
= nlist
->shift
;
111 shiftvec
= fr
->shift_vec
[0];
112 fshift
= fr
->fshift
[0];
113 facel
= _mm_set1_pd(fr
->epsfac
);
114 charge
= mdatoms
->chargeA
;
116 vftab
= kernel_data
->table_elec
->data
;
117 vftabscale
= _mm_set1_pd(kernel_data
->table_elec
->scale
);
119 /* Setup water-specific parameters */
120 inr
= nlist
->iinr
[0];
121 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
122 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
123 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
125 /* Avoid stupid compiler warnings */
133 /* Start outer loop over neighborlists */
134 for(iidx
=0; iidx
<nri
; iidx
++)
136 /* Load shift vector for this list */
137 i_shift_offset
= DIM
*shiftidx
[iidx
];
139 /* Load limits for loop over neighbors */
140 j_index_start
= jindex
[iidx
];
141 j_index_end
= jindex
[iidx
+1];
143 /* Get outer coordinate index */
145 i_coord_offset
= DIM
*inr
;
147 /* Load i particle coords and add shift vector */
148 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
+DIM
,
149 &ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
151 fix1
= _mm_setzero_pd();
152 fiy1
= _mm_setzero_pd();
153 fiz1
= _mm_setzero_pd();
154 fix2
= _mm_setzero_pd();
155 fiy2
= _mm_setzero_pd();
156 fiz2
= _mm_setzero_pd();
157 fix3
= _mm_setzero_pd();
158 fiy3
= _mm_setzero_pd();
159 fiz3
= _mm_setzero_pd();
161 /* Reset potential sums */
162 velecsum
= _mm_setzero_pd();
164 /* Start inner kernel loop */
165 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
168 /* Get j neighbor index, and coordinate index */
171 j_coord_offsetA
= DIM
*jnrA
;
172 j_coord_offsetB
= DIM
*jnrB
;
174 /* load j atom coordinates */
175 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
178 /* Calculate displacement vector */
179 dx10
= _mm_sub_pd(ix1
,jx0
);
180 dy10
= _mm_sub_pd(iy1
,jy0
);
181 dz10
= _mm_sub_pd(iz1
,jz0
);
182 dx20
= _mm_sub_pd(ix2
,jx0
);
183 dy20
= _mm_sub_pd(iy2
,jy0
);
184 dz20
= _mm_sub_pd(iz2
,jz0
);
185 dx30
= _mm_sub_pd(ix3
,jx0
);
186 dy30
= _mm_sub_pd(iy3
,jy0
);
187 dz30
= _mm_sub_pd(iz3
,jz0
);
189 /* Calculate squared distance and things based on it */
190 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
191 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
192 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
194 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
195 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
196 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
198 /* Load parameters for j particles */
199 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
201 fjx0
= _mm_setzero_pd();
202 fjy0
= _mm_setzero_pd();
203 fjz0
= _mm_setzero_pd();
205 /**************************
206 * CALCULATE INTERACTIONS *
207 **************************/
209 r10
= _mm_mul_pd(rsq10
,rinv10
);
211 /* Compute parameters for interactions between i and j atoms */
212 qq10
= _mm_mul_pd(iq1
,jq0
);
214 /* Calculate table index by multiplying r with table scale and truncate to integer */
215 rt
= _mm_mul_pd(r10
,vftabscale
);
216 vfitab
= _mm_cvttpd_epi32(rt
);
217 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
218 vfitab
= _mm_slli_epi32(vfitab
,2);
220 /* CUBIC SPLINE TABLE ELECTROSTATICS */
221 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
222 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
223 GMX_MM_TRANSPOSE2_PD(Y
,F
);
224 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
225 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
226 GMX_MM_TRANSPOSE2_PD(G
,H
);
227 Heps
= _mm_mul_pd(vfeps
,H
);
228 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
229 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
230 velec
= _mm_mul_pd(qq10
,VV
);
231 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
232 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
234 /* Update potential sum for this i atom from the interaction with this j atom. */
235 velecsum
= _mm_add_pd(velecsum
,velec
);
239 /* Calculate temporary vectorial force */
240 tx
= _mm_mul_pd(fscal
,dx10
);
241 ty
= _mm_mul_pd(fscal
,dy10
);
242 tz
= _mm_mul_pd(fscal
,dz10
);
244 /* Update vectorial force */
245 fix1
= _mm_add_pd(fix1
,tx
);
246 fiy1
= _mm_add_pd(fiy1
,ty
);
247 fiz1
= _mm_add_pd(fiz1
,tz
);
249 fjx0
= _mm_add_pd(fjx0
,tx
);
250 fjy0
= _mm_add_pd(fjy0
,ty
);
251 fjz0
= _mm_add_pd(fjz0
,tz
);
253 /**************************
254 * CALCULATE INTERACTIONS *
255 **************************/
257 r20
= _mm_mul_pd(rsq20
,rinv20
);
259 /* Compute parameters for interactions between i and j atoms */
260 qq20
= _mm_mul_pd(iq2
,jq0
);
262 /* Calculate table index by multiplying r with table scale and truncate to integer */
263 rt
= _mm_mul_pd(r20
,vftabscale
);
264 vfitab
= _mm_cvttpd_epi32(rt
);
265 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
266 vfitab
= _mm_slli_epi32(vfitab
,2);
268 /* CUBIC SPLINE TABLE ELECTROSTATICS */
269 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
270 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
271 GMX_MM_TRANSPOSE2_PD(Y
,F
);
272 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
273 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
274 GMX_MM_TRANSPOSE2_PD(G
,H
);
275 Heps
= _mm_mul_pd(vfeps
,H
);
276 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
277 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
278 velec
= _mm_mul_pd(qq20
,VV
);
279 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
280 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
282 /* Update potential sum for this i atom from the interaction with this j atom. */
283 velecsum
= _mm_add_pd(velecsum
,velec
);
287 /* Calculate temporary vectorial force */
288 tx
= _mm_mul_pd(fscal
,dx20
);
289 ty
= _mm_mul_pd(fscal
,dy20
);
290 tz
= _mm_mul_pd(fscal
,dz20
);
292 /* Update vectorial force */
293 fix2
= _mm_add_pd(fix2
,tx
);
294 fiy2
= _mm_add_pd(fiy2
,ty
);
295 fiz2
= _mm_add_pd(fiz2
,tz
);
297 fjx0
= _mm_add_pd(fjx0
,tx
);
298 fjy0
= _mm_add_pd(fjy0
,ty
);
299 fjz0
= _mm_add_pd(fjz0
,tz
);
301 /**************************
302 * CALCULATE INTERACTIONS *
303 **************************/
305 r30
= _mm_mul_pd(rsq30
,rinv30
);
307 /* Compute parameters for interactions between i and j atoms */
308 qq30
= _mm_mul_pd(iq3
,jq0
);
310 /* Calculate table index by multiplying r with table scale and truncate to integer */
311 rt
= _mm_mul_pd(r30
,vftabscale
);
312 vfitab
= _mm_cvttpd_epi32(rt
);
313 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
314 vfitab
= _mm_slli_epi32(vfitab
,2);
316 /* CUBIC SPLINE TABLE ELECTROSTATICS */
317 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
318 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
319 GMX_MM_TRANSPOSE2_PD(Y
,F
);
320 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
321 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
322 GMX_MM_TRANSPOSE2_PD(G
,H
);
323 Heps
= _mm_mul_pd(vfeps
,H
);
324 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
325 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
326 velec
= _mm_mul_pd(qq30
,VV
);
327 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
328 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq30
,FF
),_mm_mul_pd(vftabscale
,rinv30
)));
330 /* Update potential sum for this i atom from the interaction with this j atom. */
331 velecsum
= _mm_add_pd(velecsum
,velec
);
335 /* Calculate temporary vectorial force */
336 tx
= _mm_mul_pd(fscal
,dx30
);
337 ty
= _mm_mul_pd(fscal
,dy30
);
338 tz
= _mm_mul_pd(fscal
,dz30
);
340 /* Update vectorial force */
341 fix3
= _mm_add_pd(fix3
,tx
);
342 fiy3
= _mm_add_pd(fiy3
,ty
);
343 fiz3
= _mm_add_pd(fiz3
,tz
);
345 fjx0
= _mm_add_pd(fjx0
,tx
);
346 fjy0
= _mm_add_pd(fjy0
,ty
);
347 fjz0
= _mm_add_pd(fjz0
,tz
);
349 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
351 /* Inner loop uses 132 flops */
358 j_coord_offsetA
= DIM
*jnrA
;
360 /* load j atom coordinates */
361 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
364 /* Calculate displacement vector */
365 dx10
= _mm_sub_pd(ix1
,jx0
);
366 dy10
= _mm_sub_pd(iy1
,jy0
);
367 dz10
= _mm_sub_pd(iz1
,jz0
);
368 dx20
= _mm_sub_pd(ix2
,jx0
);
369 dy20
= _mm_sub_pd(iy2
,jy0
);
370 dz20
= _mm_sub_pd(iz2
,jz0
);
371 dx30
= _mm_sub_pd(ix3
,jx0
);
372 dy30
= _mm_sub_pd(iy3
,jy0
);
373 dz30
= _mm_sub_pd(iz3
,jz0
);
375 /* Calculate squared distance and things based on it */
376 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
377 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
378 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
380 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
381 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
382 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
384 /* Load parameters for j particles */
385 jq0
= _mm_load_sd(charge
+jnrA
+0);
387 fjx0
= _mm_setzero_pd();
388 fjy0
= _mm_setzero_pd();
389 fjz0
= _mm_setzero_pd();
391 /**************************
392 * CALCULATE INTERACTIONS *
393 **************************/
395 r10
= _mm_mul_pd(rsq10
,rinv10
);
397 /* Compute parameters for interactions between i and j atoms */
398 qq10
= _mm_mul_pd(iq1
,jq0
);
400 /* Calculate table index by multiplying r with table scale and truncate to integer */
401 rt
= _mm_mul_pd(r10
,vftabscale
);
402 vfitab
= _mm_cvttpd_epi32(rt
);
403 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
404 vfitab
= _mm_slli_epi32(vfitab
,2);
406 /* CUBIC SPLINE TABLE ELECTROSTATICS */
407 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
408 F
= _mm_setzero_pd();
409 GMX_MM_TRANSPOSE2_PD(Y
,F
);
410 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
411 H
= _mm_setzero_pd();
412 GMX_MM_TRANSPOSE2_PD(G
,H
);
413 Heps
= _mm_mul_pd(vfeps
,H
);
414 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
415 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
416 velec
= _mm_mul_pd(qq10
,VV
);
417 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
418 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
420 /* Update potential sum for this i atom from the interaction with this j atom. */
421 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
422 velecsum
= _mm_add_pd(velecsum
,velec
);
426 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
428 /* Calculate temporary vectorial force */
429 tx
= _mm_mul_pd(fscal
,dx10
);
430 ty
= _mm_mul_pd(fscal
,dy10
);
431 tz
= _mm_mul_pd(fscal
,dz10
);
433 /* Update vectorial force */
434 fix1
= _mm_add_pd(fix1
,tx
);
435 fiy1
= _mm_add_pd(fiy1
,ty
);
436 fiz1
= _mm_add_pd(fiz1
,tz
);
438 fjx0
= _mm_add_pd(fjx0
,tx
);
439 fjy0
= _mm_add_pd(fjy0
,ty
);
440 fjz0
= _mm_add_pd(fjz0
,tz
);
442 /**************************
443 * CALCULATE INTERACTIONS *
444 **************************/
446 r20
= _mm_mul_pd(rsq20
,rinv20
);
448 /* Compute parameters for interactions between i and j atoms */
449 qq20
= _mm_mul_pd(iq2
,jq0
);
451 /* Calculate table index by multiplying r with table scale and truncate to integer */
452 rt
= _mm_mul_pd(r20
,vftabscale
);
453 vfitab
= _mm_cvttpd_epi32(rt
);
454 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
455 vfitab
= _mm_slli_epi32(vfitab
,2);
457 /* CUBIC SPLINE TABLE ELECTROSTATICS */
458 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
459 F
= _mm_setzero_pd();
460 GMX_MM_TRANSPOSE2_PD(Y
,F
);
461 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
462 H
= _mm_setzero_pd();
463 GMX_MM_TRANSPOSE2_PD(G
,H
);
464 Heps
= _mm_mul_pd(vfeps
,H
);
465 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
466 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
467 velec
= _mm_mul_pd(qq20
,VV
);
468 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
469 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
471 /* Update potential sum for this i atom from the interaction with this j atom. */
472 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
473 velecsum
= _mm_add_pd(velecsum
,velec
);
477 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
479 /* Calculate temporary vectorial force */
480 tx
= _mm_mul_pd(fscal
,dx20
);
481 ty
= _mm_mul_pd(fscal
,dy20
);
482 tz
= _mm_mul_pd(fscal
,dz20
);
484 /* Update vectorial force */
485 fix2
= _mm_add_pd(fix2
,tx
);
486 fiy2
= _mm_add_pd(fiy2
,ty
);
487 fiz2
= _mm_add_pd(fiz2
,tz
);
489 fjx0
= _mm_add_pd(fjx0
,tx
);
490 fjy0
= _mm_add_pd(fjy0
,ty
);
491 fjz0
= _mm_add_pd(fjz0
,tz
);
493 /**************************
494 * CALCULATE INTERACTIONS *
495 **************************/
497 r30
= _mm_mul_pd(rsq30
,rinv30
);
499 /* Compute parameters for interactions between i and j atoms */
500 qq30
= _mm_mul_pd(iq3
,jq0
);
502 /* Calculate table index by multiplying r with table scale and truncate to integer */
503 rt
= _mm_mul_pd(r30
,vftabscale
);
504 vfitab
= _mm_cvttpd_epi32(rt
);
505 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
506 vfitab
= _mm_slli_epi32(vfitab
,2);
508 /* CUBIC SPLINE TABLE ELECTROSTATICS */
509 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
510 F
= _mm_setzero_pd();
511 GMX_MM_TRANSPOSE2_PD(Y
,F
);
512 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
513 H
= _mm_setzero_pd();
514 GMX_MM_TRANSPOSE2_PD(G
,H
);
515 Heps
= _mm_mul_pd(vfeps
,H
);
516 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
517 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
518 velec
= _mm_mul_pd(qq30
,VV
);
519 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
520 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq30
,FF
),_mm_mul_pd(vftabscale
,rinv30
)));
522 /* Update potential sum for this i atom from the interaction with this j atom. */
523 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
524 velecsum
= _mm_add_pd(velecsum
,velec
);
528 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
530 /* Calculate temporary vectorial force */
531 tx
= _mm_mul_pd(fscal
,dx30
);
532 ty
= _mm_mul_pd(fscal
,dy30
);
533 tz
= _mm_mul_pd(fscal
,dz30
);
535 /* Update vectorial force */
536 fix3
= _mm_add_pd(fix3
,tx
);
537 fiy3
= _mm_add_pd(fiy3
,ty
);
538 fiz3
= _mm_add_pd(fiz3
,tz
);
540 fjx0
= _mm_add_pd(fjx0
,tx
);
541 fjy0
= _mm_add_pd(fjy0
,ty
);
542 fjz0
= _mm_add_pd(fjz0
,tz
);
544 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
546 /* Inner loop uses 132 flops */
549 /* End of innermost loop */
551 gmx_mm_update_iforce_3atom_swizzle_pd(fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
552 f
+i_coord_offset
+DIM
,fshift
+i_shift_offset
);
555 /* Update potential energies */
556 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
558 /* Increment number of inner iterations */
559 inneriter
+= j_index_end
- j_index_start
;
561 /* Outer loop uses 19 flops */
564 /* Increment number of outer iterations */
567 /* Update outer/inner flops */
569 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W4_VF
,outeriter
*19 + inneriter
*132);
572 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_double
573 * Electrostatics interaction: CubicSplineTable
574 * VdW interaction: None
575 * Geometry: Water4-Particle
576 * Calculate force/pot: Force
579 nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_double
580 (t_nblist
* gmx_restrict nlist
,
581 rvec
* gmx_restrict xx
,
582 rvec
* gmx_restrict ff
,
583 t_forcerec
* gmx_restrict fr
,
584 t_mdatoms
* gmx_restrict mdatoms
,
585 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
586 t_nrnb
* gmx_restrict nrnb
)
588 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
589 * just 0 for non-waters.
590 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
591 * jnr indices corresponding to data put in the four positions in the SIMD register.
593 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
594 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
596 int j_coord_offsetA
,j_coord_offsetB
;
597 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
599 real
*shiftvec
,*fshift
,*x
,*f
;
600 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
602 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
604 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
606 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
607 int vdwjidx0A
,vdwjidx0B
;
608 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
609 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
610 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
611 __m128d dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
612 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
615 __m128i ifour
= _mm_set1_epi32(4);
616 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
618 __m128d dummy_mask
,cutoff_mask
;
619 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
620 __m128d one
= _mm_set1_pd(1.0);
621 __m128d two
= _mm_set1_pd(2.0);
627 jindex
= nlist
->jindex
;
629 shiftidx
= nlist
->shift
;
631 shiftvec
= fr
->shift_vec
[0];
632 fshift
= fr
->fshift
[0];
633 facel
= _mm_set1_pd(fr
->epsfac
);
634 charge
= mdatoms
->chargeA
;
636 vftab
= kernel_data
->table_elec
->data
;
637 vftabscale
= _mm_set1_pd(kernel_data
->table_elec
->scale
);
639 /* Setup water-specific parameters */
640 inr
= nlist
->iinr
[0];
641 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
642 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
643 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
645 /* Avoid stupid compiler warnings */
653 /* Start outer loop over neighborlists */
654 for(iidx
=0; iidx
<nri
; iidx
++)
656 /* Load shift vector for this list */
657 i_shift_offset
= DIM
*shiftidx
[iidx
];
659 /* Load limits for loop over neighbors */
660 j_index_start
= jindex
[iidx
];
661 j_index_end
= jindex
[iidx
+1];
663 /* Get outer coordinate index */
665 i_coord_offset
= DIM
*inr
;
667 /* Load i particle coords and add shift vector */
668 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
+DIM
,
669 &ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
671 fix1
= _mm_setzero_pd();
672 fiy1
= _mm_setzero_pd();
673 fiz1
= _mm_setzero_pd();
674 fix2
= _mm_setzero_pd();
675 fiy2
= _mm_setzero_pd();
676 fiz2
= _mm_setzero_pd();
677 fix3
= _mm_setzero_pd();
678 fiy3
= _mm_setzero_pd();
679 fiz3
= _mm_setzero_pd();
681 /* Start inner kernel loop */
682 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
685 /* Get j neighbor index, and coordinate index */
688 j_coord_offsetA
= DIM
*jnrA
;
689 j_coord_offsetB
= DIM
*jnrB
;
691 /* load j atom coordinates */
692 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
695 /* Calculate displacement vector */
696 dx10
= _mm_sub_pd(ix1
,jx0
);
697 dy10
= _mm_sub_pd(iy1
,jy0
);
698 dz10
= _mm_sub_pd(iz1
,jz0
);
699 dx20
= _mm_sub_pd(ix2
,jx0
);
700 dy20
= _mm_sub_pd(iy2
,jy0
);
701 dz20
= _mm_sub_pd(iz2
,jz0
);
702 dx30
= _mm_sub_pd(ix3
,jx0
);
703 dy30
= _mm_sub_pd(iy3
,jy0
);
704 dz30
= _mm_sub_pd(iz3
,jz0
);
706 /* Calculate squared distance and things based on it */
707 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
708 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
709 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
711 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
712 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
713 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
715 /* Load parameters for j particles */
716 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
718 fjx0
= _mm_setzero_pd();
719 fjy0
= _mm_setzero_pd();
720 fjz0
= _mm_setzero_pd();
722 /**************************
723 * CALCULATE INTERACTIONS *
724 **************************/
726 r10
= _mm_mul_pd(rsq10
,rinv10
);
728 /* Compute parameters for interactions between i and j atoms */
729 qq10
= _mm_mul_pd(iq1
,jq0
);
731 /* Calculate table index by multiplying r with table scale and truncate to integer */
732 rt
= _mm_mul_pd(r10
,vftabscale
);
733 vfitab
= _mm_cvttpd_epi32(rt
);
734 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
735 vfitab
= _mm_slli_epi32(vfitab
,2);
737 /* CUBIC SPLINE TABLE ELECTROSTATICS */
738 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
739 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
740 GMX_MM_TRANSPOSE2_PD(Y
,F
);
741 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
742 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
743 GMX_MM_TRANSPOSE2_PD(G
,H
);
744 Heps
= _mm_mul_pd(vfeps
,H
);
745 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
746 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
747 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
751 /* Calculate temporary vectorial force */
752 tx
= _mm_mul_pd(fscal
,dx10
);
753 ty
= _mm_mul_pd(fscal
,dy10
);
754 tz
= _mm_mul_pd(fscal
,dz10
);
756 /* Update vectorial force */
757 fix1
= _mm_add_pd(fix1
,tx
);
758 fiy1
= _mm_add_pd(fiy1
,ty
);
759 fiz1
= _mm_add_pd(fiz1
,tz
);
761 fjx0
= _mm_add_pd(fjx0
,tx
);
762 fjy0
= _mm_add_pd(fjy0
,ty
);
763 fjz0
= _mm_add_pd(fjz0
,tz
);
765 /**************************
766 * CALCULATE INTERACTIONS *
767 **************************/
769 r20
= _mm_mul_pd(rsq20
,rinv20
);
771 /* Compute parameters for interactions between i and j atoms */
772 qq20
= _mm_mul_pd(iq2
,jq0
);
774 /* Calculate table index by multiplying r with table scale and truncate to integer */
775 rt
= _mm_mul_pd(r20
,vftabscale
);
776 vfitab
= _mm_cvttpd_epi32(rt
);
777 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
778 vfitab
= _mm_slli_epi32(vfitab
,2);
780 /* CUBIC SPLINE TABLE ELECTROSTATICS */
781 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
782 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
783 GMX_MM_TRANSPOSE2_PD(Y
,F
);
784 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
785 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
786 GMX_MM_TRANSPOSE2_PD(G
,H
);
787 Heps
= _mm_mul_pd(vfeps
,H
);
788 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
789 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
790 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
794 /* Calculate temporary vectorial force */
795 tx
= _mm_mul_pd(fscal
,dx20
);
796 ty
= _mm_mul_pd(fscal
,dy20
);
797 tz
= _mm_mul_pd(fscal
,dz20
);
799 /* Update vectorial force */
800 fix2
= _mm_add_pd(fix2
,tx
);
801 fiy2
= _mm_add_pd(fiy2
,ty
);
802 fiz2
= _mm_add_pd(fiz2
,tz
);
804 fjx0
= _mm_add_pd(fjx0
,tx
);
805 fjy0
= _mm_add_pd(fjy0
,ty
);
806 fjz0
= _mm_add_pd(fjz0
,tz
);
808 /**************************
809 * CALCULATE INTERACTIONS *
810 **************************/
812 r30
= _mm_mul_pd(rsq30
,rinv30
);
814 /* Compute parameters for interactions between i and j atoms */
815 qq30
= _mm_mul_pd(iq3
,jq0
);
817 /* Calculate table index by multiplying r with table scale and truncate to integer */
818 rt
= _mm_mul_pd(r30
,vftabscale
);
819 vfitab
= _mm_cvttpd_epi32(rt
);
820 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
821 vfitab
= _mm_slli_epi32(vfitab
,2);
823 /* CUBIC SPLINE TABLE ELECTROSTATICS */
824 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
825 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
826 GMX_MM_TRANSPOSE2_PD(Y
,F
);
827 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
828 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
829 GMX_MM_TRANSPOSE2_PD(G
,H
);
830 Heps
= _mm_mul_pd(vfeps
,H
);
831 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
832 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
833 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq30
,FF
),_mm_mul_pd(vftabscale
,rinv30
)));
837 /* Calculate temporary vectorial force */
838 tx
= _mm_mul_pd(fscal
,dx30
);
839 ty
= _mm_mul_pd(fscal
,dy30
);
840 tz
= _mm_mul_pd(fscal
,dz30
);
842 /* Update vectorial force */
843 fix3
= _mm_add_pd(fix3
,tx
);
844 fiy3
= _mm_add_pd(fiy3
,ty
);
845 fiz3
= _mm_add_pd(fiz3
,tz
);
847 fjx0
= _mm_add_pd(fjx0
,tx
);
848 fjy0
= _mm_add_pd(fjy0
,ty
);
849 fjz0
= _mm_add_pd(fjz0
,tz
);
851 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
853 /* Inner loop uses 120 flops */
860 j_coord_offsetA
= DIM
*jnrA
;
862 /* load j atom coordinates */
863 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
866 /* Calculate displacement vector */
867 dx10
= _mm_sub_pd(ix1
,jx0
);
868 dy10
= _mm_sub_pd(iy1
,jy0
);
869 dz10
= _mm_sub_pd(iz1
,jz0
);
870 dx20
= _mm_sub_pd(ix2
,jx0
);
871 dy20
= _mm_sub_pd(iy2
,jy0
);
872 dz20
= _mm_sub_pd(iz2
,jz0
);
873 dx30
= _mm_sub_pd(ix3
,jx0
);
874 dy30
= _mm_sub_pd(iy3
,jy0
);
875 dz30
= _mm_sub_pd(iz3
,jz0
);
877 /* Calculate squared distance and things based on it */
878 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
879 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
880 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
882 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
883 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
884 rinv30
= gmx_mm_invsqrt_pd(rsq30
);
886 /* Load parameters for j particles */
887 jq0
= _mm_load_sd(charge
+jnrA
+0);
889 fjx0
= _mm_setzero_pd();
890 fjy0
= _mm_setzero_pd();
891 fjz0
= _mm_setzero_pd();
893 /**************************
894 * CALCULATE INTERACTIONS *
895 **************************/
897 r10
= _mm_mul_pd(rsq10
,rinv10
);
899 /* Compute parameters for interactions between i and j atoms */
900 qq10
= _mm_mul_pd(iq1
,jq0
);
902 /* Calculate table index by multiplying r with table scale and truncate to integer */
903 rt
= _mm_mul_pd(r10
,vftabscale
);
904 vfitab
= _mm_cvttpd_epi32(rt
);
905 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
906 vfitab
= _mm_slli_epi32(vfitab
,2);
908 /* CUBIC SPLINE TABLE ELECTROSTATICS */
909 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
910 F
= _mm_setzero_pd();
911 GMX_MM_TRANSPOSE2_PD(Y
,F
);
912 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
913 H
= _mm_setzero_pd();
914 GMX_MM_TRANSPOSE2_PD(G
,H
);
915 Heps
= _mm_mul_pd(vfeps
,H
);
916 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
917 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
918 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
922 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
924 /* Calculate temporary vectorial force */
925 tx
= _mm_mul_pd(fscal
,dx10
);
926 ty
= _mm_mul_pd(fscal
,dy10
);
927 tz
= _mm_mul_pd(fscal
,dz10
);
929 /* Update vectorial force */
930 fix1
= _mm_add_pd(fix1
,tx
);
931 fiy1
= _mm_add_pd(fiy1
,ty
);
932 fiz1
= _mm_add_pd(fiz1
,tz
);
934 fjx0
= _mm_add_pd(fjx0
,tx
);
935 fjy0
= _mm_add_pd(fjy0
,ty
);
936 fjz0
= _mm_add_pd(fjz0
,tz
);
938 /**************************
939 * CALCULATE INTERACTIONS *
940 **************************/
942 r20
= _mm_mul_pd(rsq20
,rinv20
);
944 /* Compute parameters for interactions between i and j atoms */
945 qq20
= _mm_mul_pd(iq2
,jq0
);
947 /* Calculate table index by multiplying r with table scale and truncate to integer */
948 rt
= _mm_mul_pd(r20
,vftabscale
);
949 vfitab
= _mm_cvttpd_epi32(rt
);
950 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
951 vfitab
= _mm_slli_epi32(vfitab
,2);
953 /* CUBIC SPLINE TABLE ELECTROSTATICS */
954 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
955 F
= _mm_setzero_pd();
956 GMX_MM_TRANSPOSE2_PD(Y
,F
);
957 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
958 H
= _mm_setzero_pd();
959 GMX_MM_TRANSPOSE2_PD(G
,H
);
960 Heps
= _mm_mul_pd(vfeps
,H
);
961 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
962 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
963 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
967 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
969 /* Calculate temporary vectorial force */
970 tx
= _mm_mul_pd(fscal
,dx20
);
971 ty
= _mm_mul_pd(fscal
,dy20
);
972 tz
= _mm_mul_pd(fscal
,dz20
);
974 /* Update vectorial force */
975 fix2
= _mm_add_pd(fix2
,tx
);
976 fiy2
= _mm_add_pd(fiy2
,ty
);
977 fiz2
= _mm_add_pd(fiz2
,tz
);
979 fjx0
= _mm_add_pd(fjx0
,tx
);
980 fjy0
= _mm_add_pd(fjy0
,ty
);
981 fjz0
= _mm_add_pd(fjz0
,tz
);
983 /**************************
984 * CALCULATE INTERACTIONS *
985 **************************/
987 r30
= _mm_mul_pd(rsq30
,rinv30
);
989 /* Compute parameters for interactions between i and j atoms */
990 qq30
= _mm_mul_pd(iq3
,jq0
);
992 /* Calculate table index by multiplying r with table scale and truncate to integer */
993 rt
= _mm_mul_pd(r30
,vftabscale
);
994 vfitab
= _mm_cvttpd_epi32(rt
);
995 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
996 vfitab
= _mm_slli_epi32(vfitab
,2);
998 /* CUBIC SPLINE TABLE ELECTROSTATICS */
999 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1000 F
= _mm_setzero_pd();
1001 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1002 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1003 H
= _mm_setzero_pd();
1004 GMX_MM_TRANSPOSE2_PD(G
,H
);
1005 Heps
= _mm_mul_pd(vfeps
,H
);
1006 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1007 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1008 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq30
,FF
),_mm_mul_pd(vftabscale
,rinv30
)));
1012 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1014 /* Calculate temporary vectorial force */
1015 tx
= _mm_mul_pd(fscal
,dx30
);
1016 ty
= _mm_mul_pd(fscal
,dy30
);
1017 tz
= _mm_mul_pd(fscal
,dz30
);
1019 /* Update vectorial force */
1020 fix3
= _mm_add_pd(fix3
,tx
);
1021 fiy3
= _mm_add_pd(fiy3
,ty
);
1022 fiz3
= _mm_add_pd(fiz3
,tz
);
1024 fjx0
= _mm_add_pd(fjx0
,tx
);
1025 fjy0
= _mm_add_pd(fjy0
,ty
);
1026 fjz0
= _mm_add_pd(fjz0
,tz
);
1028 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
1030 /* Inner loop uses 120 flops */
1033 /* End of innermost loop */
1035 gmx_mm_update_iforce_3atom_swizzle_pd(fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1036 f
+i_coord_offset
+DIM
,fshift
+i_shift_offset
);
1038 /* Increment number of inner iterations */
1039 inneriter
+= j_index_end
- j_index_start
;
1041 /* Outer loop uses 18 flops */
1044 /* Increment number of outer iterations */
1047 /* Update outer/inner flops */
1049 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W4_F
,outeriter
*18 + inneriter
*120);