/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*
 * Note: this file was generated by the GROMACS sse2_double kernel generator.
 */
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_double.h"
49 #include "kernelutil_x86_sse2_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_double
53 * Electrostatics interaction: CubicSplineTable
54 * VdW interaction: CubicSplineTable
55 * Geometry: Water3-Particle
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_double
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
76 int j_coord_offsetA
,j_coord_offsetB
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
82 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
84 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
86 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
87 int vdwjidx0A
,vdwjidx0B
;
88 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
89 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
90 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
91 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
92 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
95 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
98 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
99 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
101 __m128i ifour
= _mm_set1_epi32(4);
102 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
104 __m128d dummy_mask
,cutoff_mask
;
105 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
106 __m128d one
= _mm_set1_pd(1.0);
107 __m128d two
= _mm_set1_pd(2.0);
113 jindex
= nlist
->jindex
;
115 shiftidx
= nlist
->shift
;
117 shiftvec
= fr
->shift_vec
[0];
118 fshift
= fr
->fshift
[0];
119 facel
= _mm_set1_pd(fr
->epsfac
);
120 charge
= mdatoms
->chargeA
;
121 nvdwtype
= fr
->ntype
;
123 vdwtype
= mdatoms
->typeA
;
125 vftab
= kernel_data
->table_elec_vdw
->data
;
126 vftabscale
= _mm_set1_pd(kernel_data
->table_elec_vdw
->scale
);
128 /* Setup water-specific parameters */
129 inr
= nlist
->iinr
[0];
130 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
131 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
132 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
133 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
135 /* Avoid stupid compiler warnings */
143 /* Start outer loop over neighborlists */
144 for(iidx
=0; iidx
<nri
; iidx
++)
146 /* Load shift vector for this list */
147 i_shift_offset
= DIM
*shiftidx
[iidx
];
149 /* Load limits for loop over neighbors */
150 j_index_start
= jindex
[iidx
];
151 j_index_end
= jindex
[iidx
+1];
153 /* Get outer coordinate index */
155 i_coord_offset
= DIM
*inr
;
157 /* Load i particle coords and add shift vector */
158 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
159 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
161 fix0
= _mm_setzero_pd();
162 fiy0
= _mm_setzero_pd();
163 fiz0
= _mm_setzero_pd();
164 fix1
= _mm_setzero_pd();
165 fiy1
= _mm_setzero_pd();
166 fiz1
= _mm_setzero_pd();
167 fix2
= _mm_setzero_pd();
168 fiy2
= _mm_setzero_pd();
169 fiz2
= _mm_setzero_pd();
171 /* Reset potential sums */
172 velecsum
= _mm_setzero_pd();
173 vvdwsum
= _mm_setzero_pd();
175 /* Start inner kernel loop */
176 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
179 /* Get j neighbor index, and coordinate index */
182 j_coord_offsetA
= DIM
*jnrA
;
183 j_coord_offsetB
= DIM
*jnrB
;
185 /* load j atom coordinates */
186 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
189 /* Calculate displacement vector */
190 dx00
= _mm_sub_pd(ix0
,jx0
);
191 dy00
= _mm_sub_pd(iy0
,jy0
);
192 dz00
= _mm_sub_pd(iz0
,jz0
);
193 dx10
= _mm_sub_pd(ix1
,jx0
);
194 dy10
= _mm_sub_pd(iy1
,jy0
);
195 dz10
= _mm_sub_pd(iz1
,jz0
);
196 dx20
= _mm_sub_pd(ix2
,jx0
);
197 dy20
= _mm_sub_pd(iy2
,jy0
);
198 dz20
= _mm_sub_pd(iz2
,jz0
);
200 /* Calculate squared distance and things based on it */
201 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
202 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
203 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
205 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
206 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
207 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
209 /* Load parameters for j particles */
210 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
211 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
212 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
214 fjx0
= _mm_setzero_pd();
215 fjy0
= _mm_setzero_pd();
216 fjz0
= _mm_setzero_pd();
218 /**************************
219 * CALCULATE INTERACTIONS *
220 **************************/
222 r00
= _mm_mul_pd(rsq00
,rinv00
);
224 /* Compute parameters for interactions between i and j atoms */
225 qq00
= _mm_mul_pd(iq0
,jq0
);
226 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
227 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
229 /* Calculate table index by multiplying r with table scale and truncate to integer */
230 rt
= _mm_mul_pd(r00
,vftabscale
);
231 vfitab
= _mm_cvttpd_epi32(rt
);
232 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
233 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
235 /* CUBIC SPLINE TABLE ELECTROSTATICS */
236 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
237 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
238 GMX_MM_TRANSPOSE2_PD(Y
,F
);
239 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
240 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
241 GMX_MM_TRANSPOSE2_PD(G
,H
);
242 Heps
= _mm_mul_pd(vfeps
,H
);
243 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
244 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
245 velec
= _mm_mul_pd(qq00
,VV
);
246 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
247 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
249 /* CUBIC SPLINE TABLE DISPERSION */
250 vfitab
= _mm_add_epi32(vfitab
,ifour
);
251 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
252 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
253 GMX_MM_TRANSPOSE2_PD(Y
,F
);
254 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
255 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
256 GMX_MM_TRANSPOSE2_PD(G
,H
);
257 Heps
= _mm_mul_pd(vfeps
,H
);
258 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
259 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
260 vvdw6
= _mm_mul_pd(c6_00
,VV
);
261 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
262 fvdw6
= _mm_mul_pd(c6_00
,FF
);
264 /* CUBIC SPLINE TABLE REPULSION */
265 vfitab
= _mm_add_epi32(vfitab
,ifour
);
266 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
267 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
268 GMX_MM_TRANSPOSE2_PD(Y
,F
);
269 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
270 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
271 GMX_MM_TRANSPOSE2_PD(G
,H
);
272 Heps
= _mm_mul_pd(vfeps
,H
);
273 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
274 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
275 vvdw12
= _mm_mul_pd(c12_00
,VV
);
276 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
277 fvdw12
= _mm_mul_pd(c12_00
,FF
);
278 vvdw
= _mm_add_pd(vvdw12
,vvdw6
);
279 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
281 /* Update potential sum for this i atom from the interaction with this j atom. */
282 velecsum
= _mm_add_pd(velecsum
,velec
);
283 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
285 fscal
= _mm_add_pd(felec
,fvdw
);
287 /* Calculate temporary vectorial force */
288 tx
= _mm_mul_pd(fscal
,dx00
);
289 ty
= _mm_mul_pd(fscal
,dy00
);
290 tz
= _mm_mul_pd(fscal
,dz00
);
292 /* Update vectorial force */
293 fix0
= _mm_add_pd(fix0
,tx
);
294 fiy0
= _mm_add_pd(fiy0
,ty
);
295 fiz0
= _mm_add_pd(fiz0
,tz
);
297 fjx0
= _mm_add_pd(fjx0
,tx
);
298 fjy0
= _mm_add_pd(fjy0
,ty
);
299 fjz0
= _mm_add_pd(fjz0
,tz
);
301 /**************************
302 * CALCULATE INTERACTIONS *
303 **************************/
305 r10
= _mm_mul_pd(rsq10
,rinv10
);
307 /* Compute parameters for interactions between i and j atoms */
308 qq10
= _mm_mul_pd(iq1
,jq0
);
310 /* Calculate table index by multiplying r with table scale and truncate to integer */
311 rt
= _mm_mul_pd(r10
,vftabscale
);
312 vfitab
= _mm_cvttpd_epi32(rt
);
313 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
314 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
316 /* CUBIC SPLINE TABLE ELECTROSTATICS */
317 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
318 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
319 GMX_MM_TRANSPOSE2_PD(Y
,F
);
320 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
321 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
322 GMX_MM_TRANSPOSE2_PD(G
,H
);
323 Heps
= _mm_mul_pd(vfeps
,H
);
324 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
325 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
326 velec
= _mm_mul_pd(qq10
,VV
);
327 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
328 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
330 /* Update potential sum for this i atom from the interaction with this j atom. */
331 velecsum
= _mm_add_pd(velecsum
,velec
);
335 /* Calculate temporary vectorial force */
336 tx
= _mm_mul_pd(fscal
,dx10
);
337 ty
= _mm_mul_pd(fscal
,dy10
);
338 tz
= _mm_mul_pd(fscal
,dz10
);
340 /* Update vectorial force */
341 fix1
= _mm_add_pd(fix1
,tx
);
342 fiy1
= _mm_add_pd(fiy1
,ty
);
343 fiz1
= _mm_add_pd(fiz1
,tz
);
345 fjx0
= _mm_add_pd(fjx0
,tx
);
346 fjy0
= _mm_add_pd(fjy0
,ty
);
347 fjz0
= _mm_add_pd(fjz0
,tz
);
349 /**************************
350 * CALCULATE INTERACTIONS *
351 **************************/
353 r20
= _mm_mul_pd(rsq20
,rinv20
);
355 /* Compute parameters for interactions between i and j atoms */
356 qq20
= _mm_mul_pd(iq2
,jq0
);
358 /* Calculate table index by multiplying r with table scale and truncate to integer */
359 rt
= _mm_mul_pd(r20
,vftabscale
);
360 vfitab
= _mm_cvttpd_epi32(rt
);
361 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
362 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
364 /* CUBIC SPLINE TABLE ELECTROSTATICS */
365 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
366 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
367 GMX_MM_TRANSPOSE2_PD(Y
,F
);
368 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
369 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
370 GMX_MM_TRANSPOSE2_PD(G
,H
);
371 Heps
= _mm_mul_pd(vfeps
,H
);
372 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
373 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
374 velec
= _mm_mul_pd(qq20
,VV
);
375 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
376 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
378 /* Update potential sum for this i atom from the interaction with this j atom. */
379 velecsum
= _mm_add_pd(velecsum
,velec
);
383 /* Calculate temporary vectorial force */
384 tx
= _mm_mul_pd(fscal
,dx20
);
385 ty
= _mm_mul_pd(fscal
,dy20
);
386 tz
= _mm_mul_pd(fscal
,dz20
);
388 /* Update vectorial force */
389 fix2
= _mm_add_pd(fix2
,tx
);
390 fiy2
= _mm_add_pd(fiy2
,ty
);
391 fiz2
= _mm_add_pd(fiz2
,tz
);
393 fjx0
= _mm_add_pd(fjx0
,tx
);
394 fjy0
= _mm_add_pd(fjy0
,ty
);
395 fjz0
= _mm_add_pd(fjz0
,tz
);
397 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
399 /* Inner loop uses 162 flops */
406 j_coord_offsetA
= DIM
*jnrA
;
408 /* load j atom coordinates */
409 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
412 /* Calculate displacement vector */
413 dx00
= _mm_sub_pd(ix0
,jx0
);
414 dy00
= _mm_sub_pd(iy0
,jy0
);
415 dz00
= _mm_sub_pd(iz0
,jz0
);
416 dx10
= _mm_sub_pd(ix1
,jx0
);
417 dy10
= _mm_sub_pd(iy1
,jy0
);
418 dz10
= _mm_sub_pd(iz1
,jz0
);
419 dx20
= _mm_sub_pd(ix2
,jx0
);
420 dy20
= _mm_sub_pd(iy2
,jy0
);
421 dz20
= _mm_sub_pd(iz2
,jz0
);
423 /* Calculate squared distance and things based on it */
424 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
425 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
426 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
428 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
429 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
430 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
432 /* Load parameters for j particles */
433 jq0
= _mm_load_sd(charge
+jnrA
+0);
434 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
436 fjx0
= _mm_setzero_pd();
437 fjy0
= _mm_setzero_pd();
438 fjz0
= _mm_setzero_pd();
440 /**************************
441 * CALCULATE INTERACTIONS *
442 **************************/
444 r00
= _mm_mul_pd(rsq00
,rinv00
);
446 /* Compute parameters for interactions between i and j atoms */
447 qq00
= _mm_mul_pd(iq0
,jq0
);
448 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
450 /* Calculate table index by multiplying r with table scale and truncate to integer */
451 rt
= _mm_mul_pd(r00
,vftabscale
);
452 vfitab
= _mm_cvttpd_epi32(rt
);
453 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
454 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
456 /* CUBIC SPLINE TABLE ELECTROSTATICS */
457 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
458 F
= _mm_setzero_pd();
459 GMX_MM_TRANSPOSE2_PD(Y
,F
);
460 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
461 H
= _mm_setzero_pd();
462 GMX_MM_TRANSPOSE2_PD(G
,H
);
463 Heps
= _mm_mul_pd(vfeps
,H
);
464 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
465 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
466 velec
= _mm_mul_pd(qq00
,VV
);
467 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
468 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
470 /* CUBIC SPLINE TABLE DISPERSION */
471 vfitab
= _mm_add_epi32(vfitab
,ifour
);
472 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
473 F
= _mm_setzero_pd();
474 GMX_MM_TRANSPOSE2_PD(Y
,F
);
475 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
476 H
= _mm_setzero_pd();
477 GMX_MM_TRANSPOSE2_PD(G
,H
);
478 Heps
= _mm_mul_pd(vfeps
,H
);
479 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
480 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
481 vvdw6
= _mm_mul_pd(c6_00
,VV
);
482 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
483 fvdw6
= _mm_mul_pd(c6_00
,FF
);
485 /* CUBIC SPLINE TABLE REPULSION */
486 vfitab
= _mm_add_epi32(vfitab
,ifour
);
487 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
488 F
= _mm_setzero_pd();
489 GMX_MM_TRANSPOSE2_PD(Y
,F
);
490 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
491 H
= _mm_setzero_pd();
492 GMX_MM_TRANSPOSE2_PD(G
,H
);
493 Heps
= _mm_mul_pd(vfeps
,H
);
494 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
495 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
496 vvdw12
= _mm_mul_pd(c12_00
,VV
);
497 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
498 fvdw12
= _mm_mul_pd(c12_00
,FF
);
499 vvdw
= _mm_add_pd(vvdw12
,vvdw6
);
500 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
502 /* Update potential sum for this i atom from the interaction with this j atom. */
503 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
504 velecsum
= _mm_add_pd(velecsum
,velec
);
505 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
506 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
508 fscal
= _mm_add_pd(felec
,fvdw
);
510 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
512 /* Calculate temporary vectorial force */
513 tx
= _mm_mul_pd(fscal
,dx00
);
514 ty
= _mm_mul_pd(fscal
,dy00
);
515 tz
= _mm_mul_pd(fscal
,dz00
);
517 /* Update vectorial force */
518 fix0
= _mm_add_pd(fix0
,tx
);
519 fiy0
= _mm_add_pd(fiy0
,ty
);
520 fiz0
= _mm_add_pd(fiz0
,tz
);
522 fjx0
= _mm_add_pd(fjx0
,tx
);
523 fjy0
= _mm_add_pd(fjy0
,ty
);
524 fjz0
= _mm_add_pd(fjz0
,tz
);
526 /**************************
527 * CALCULATE INTERACTIONS *
528 **************************/
530 r10
= _mm_mul_pd(rsq10
,rinv10
);
532 /* Compute parameters for interactions between i and j atoms */
533 qq10
= _mm_mul_pd(iq1
,jq0
);
535 /* Calculate table index by multiplying r with table scale and truncate to integer */
536 rt
= _mm_mul_pd(r10
,vftabscale
);
537 vfitab
= _mm_cvttpd_epi32(rt
);
538 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
539 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
541 /* CUBIC SPLINE TABLE ELECTROSTATICS */
542 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
543 F
= _mm_setzero_pd();
544 GMX_MM_TRANSPOSE2_PD(Y
,F
);
545 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
546 H
= _mm_setzero_pd();
547 GMX_MM_TRANSPOSE2_PD(G
,H
);
548 Heps
= _mm_mul_pd(vfeps
,H
);
549 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
550 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
551 velec
= _mm_mul_pd(qq10
,VV
);
552 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
553 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
555 /* Update potential sum for this i atom from the interaction with this j atom. */
556 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
557 velecsum
= _mm_add_pd(velecsum
,velec
);
561 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
563 /* Calculate temporary vectorial force */
564 tx
= _mm_mul_pd(fscal
,dx10
);
565 ty
= _mm_mul_pd(fscal
,dy10
);
566 tz
= _mm_mul_pd(fscal
,dz10
);
568 /* Update vectorial force */
569 fix1
= _mm_add_pd(fix1
,tx
);
570 fiy1
= _mm_add_pd(fiy1
,ty
);
571 fiz1
= _mm_add_pd(fiz1
,tz
);
573 fjx0
= _mm_add_pd(fjx0
,tx
);
574 fjy0
= _mm_add_pd(fjy0
,ty
);
575 fjz0
= _mm_add_pd(fjz0
,tz
);
577 /**************************
578 * CALCULATE INTERACTIONS *
579 **************************/
581 r20
= _mm_mul_pd(rsq20
,rinv20
);
583 /* Compute parameters for interactions between i and j atoms */
584 qq20
= _mm_mul_pd(iq2
,jq0
);
586 /* Calculate table index by multiplying r with table scale and truncate to integer */
587 rt
= _mm_mul_pd(r20
,vftabscale
);
588 vfitab
= _mm_cvttpd_epi32(rt
);
589 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
590 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
592 /* CUBIC SPLINE TABLE ELECTROSTATICS */
593 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
594 F
= _mm_setzero_pd();
595 GMX_MM_TRANSPOSE2_PD(Y
,F
);
596 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
597 H
= _mm_setzero_pd();
598 GMX_MM_TRANSPOSE2_PD(G
,H
);
599 Heps
= _mm_mul_pd(vfeps
,H
);
600 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
601 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
602 velec
= _mm_mul_pd(qq20
,VV
);
603 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
604 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
606 /* Update potential sum for this i atom from the interaction with this j atom. */
607 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
608 velecsum
= _mm_add_pd(velecsum
,velec
);
612 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
614 /* Calculate temporary vectorial force */
615 tx
= _mm_mul_pd(fscal
,dx20
);
616 ty
= _mm_mul_pd(fscal
,dy20
);
617 tz
= _mm_mul_pd(fscal
,dz20
);
619 /* Update vectorial force */
620 fix2
= _mm_add_pd(fix2
,tx
);
621 fiy2
= _mm_add_pd(fiy2
,ty
);
622 fiz2
= _mm_add_pd(fiz2
,tz
);
624 fjx0
= _mm_add_pd(fjx0
,tx
);
625 fjy0
= _mm_add_pd(fjy0
,ty
);
626 fjz0
= _mm_add_pd(fjz0
,tz
);
628 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
630 /* Inner loop uses 162 flops */
633 /* End of innermost loop */
635 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
636 f
+i_coord_offset
,fshift
+i_shift_offset
);
639 /* Update potential energies */
640 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
641 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
643 /* Increment number of inner iterations */
644 inneriter
+= j_index_end
- j_index_start
;
646 /* Outer loop uses 20 flops */
649 /* Increment number of outer iterations */
652 /* Update outer/inner flops */
654 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3_VF
,outeriter
*20 + inneriter
*162);
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_double
 * Electrostatics interaction: CubicSplineTable
 * VdW interaction:            CubicSplineTable
 * Geometry:                   Water3-Particle
 * Calculate force/pot:        Force
 */
664 nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_double
665 (t_nblist
* gmx_restrict nlist
,
666 rvec
* gmx_restrict xx
,
667 rvec
* gmx_restrict ff
,
668 t_forcerec
* gmx_restrict fr
,
669 t_mdatoms
* gmx_restrict mdatoms
,
670 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
671 t_nrnb
* gmx_restrict nrnb
)
673 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
674 * just 0 for non-waters.
675 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
676 * jnr indices corresponding to data put in the four positions in the SIMD register.
678 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
679 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
681 int j_coord_offsetA
,j_coord_offsetB
;
682 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
684 real
*shiftvec
,*fshift
,*x
,*f
;
685 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
687 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
689 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
691 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
692 int vdwjidx0A
,vdwjidx0B
;
693 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
694 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
695 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
696 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
697 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
700 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
703 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
704 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
706 __m128i ifour
= _mm_set1_epi32(4);
707 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
709 __m128d dummy_mask
,cutoff_mask
;
710 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
711 __m128d one
= _mm_set1_pd(1.0);
712 __m128d two
= _mm_set1_pd(2.0);
718 jindex
= nlist
->jindex
;
720 shiftidx
= nlist
->shift
;
722 shiftvec
= fr
->shift_vec
[0];
723 fshift
= fr
->fshift
[0];
724 facel
= _mm_set1_pd(fr
->epsfac
);
725 charge
= mdatoms
->chargeA
;
726 nvdwtype
= fr
->ntype
;
728 vdwtype
= mdatoms
->typeA
;
730 vftab
= kernel_data
->table_elec_vdw
->data
;
731 vftabscale
= _mm_set1_pd(kernel_data
->table_elec_vdw
->scale
);
733 /* Setup water-specific parameters */
734 inr
= nlist
->iinr
[0];
735 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
736 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
737 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
738 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
740 /* Avoid stupid compiler warnings */
748 /* Start outer loop over neighborlists */
749 for(iidx
=0; iidx
<nri
; iidx
++)
751 /* Load shift vector for this list */
752 i_shift_offset
= DIM
*shiftidx
[iidx
];
754 /* Load limits for loop over neighbors */
755 j_index_start
= jindex
[iidx
];
756 j_index_end
= jindex
[iidx
+1];
758 /* Get outer coordinate index */
760 i_coord_offset
= DIM
*inr
;
762 /* Load i particle coords and add shift vector */
763 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
764 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
766 fix0
= _mm_setzero_pd();
767 fiy0
= _mm_setzero_pd();
768 fiz0
= _mm_setzero_pd();
769 fix1
= _mm_setzero_pd();
770 fiy1
= _mm_setzero_pd();
771 fiz1
= _mm_setzero_pd();
772 fix2
= _mm_setzero_pd();
773 fiy2
= _mm_setzero_pd();
774 fiz2
= _mm_setzero_pd();
776 /* Start inner kernel loop */
777 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
780 /* Get j neighbor index, and coordinate index */
783 j_coord_offsetA
= DIM
*jnrA
;
784 j_coord_offsetB
= DIM
*jnrB
;
786 /* load j atom coordinates */
787 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
790 /* Calculate displacement vector */
791 dx00
= _mm_sub_pd(ix0
,jx0
);
792 dy00
= _mm_sub_pd(iy0
,jy0
);
793 dz00
= _mm_sub_pd(iz0
,jz0
);
794 dx10
= _mm_sub_pd(ix1
,jx0
);
795 dy10
= _mm_sub_pd(iy1
,jy0
);
796 dz10
= _mm_sub_pd(iz1
,jz0
);
797 dx20
= _mm_sub_pd(ix2
,jx0
);
798 dy20
= _mm_sub_pd(iy2
,jy0
);
799 dz20
= _mm_sub_pd(iz2
,jz0
);
801 /* Calculate squared distance and things based on it */
802 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
803 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
804 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
806 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
807 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
808 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
810 /* Load parameters for j particles */
811 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
812 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
813 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
815 fjx0
= _mm_setzero_pd();
816 fjy0
= _mm_setzero_pd();
817 fjz0
= _mm_setzero_pd();
819 /**************************
820 * CALCULATE INTERACTIONS *
821 **************************/
823 r00
= _mm_mul_pd(rsq00
,rinv00
);
825 /* Compute parameters for interactions between i and j atoms */
826 qq00
= _mm_mul_pd(iq0
,jq0
);
827 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
828 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
830 /* Calculate table index by multiplying r with table scale and truncate to integer */
831 rt
= _mm_mul_pd(r00
,vftabscale
);
832 vfitab
= _mm_cvttpd_epi32(rt
);
833 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
834 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
836 /* CUBIC SPLINE TABLE ELECTROSTATICS */
837 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
838 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
839 GMX_MM_TRANSPOSE2_PD(Y
,F
);
840 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
841 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
842 GMX_MM_TRANSPOSE2_PD(G
,H
);
843 Heps
= _mm_mul_pd(vfeps
,H
);
844 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
845 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
846 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
848 /* CUBIC SPLINE TABLE DISPERSION */
849 vfitab
= _mm_add_epi32(vfitab
,ifour
);
850 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
851 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
852 GMX_MM_TRANSPOSE2_PD(Y
,F
);
853 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
854 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
855 GMX_MM_TRANSPOSE2_PD(G
,H
);
856 Heps
= _mm_mul_pd(vfeps
,H
);
857 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
858 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
859 fvdw6
= _mm_mul_pd(c6_00
,FF
);
861 /* CUBIC SPLINE TABLE REPULSION */
862 vfitab
= _mm_add_epi32(vfitab
,ifour
);
863 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
864 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
865 GMX_MM_TRANSPOSE2_PD(Y
,F
);
866 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
867 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
868 GMX_MM_TRANSPOSE2_PD(G
,H
);
869 Heps
= _mm_mul_pd(vfeps
,H
);
870 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
871 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
872 fvdw12
= _mm_mul_pd(c12_00
,FF
);
873 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
875 fscal
= _mm_add_pd(felec
,fvdw
);
877 /* Calculate temporary vectorial force */
878 tx
= _mm_mul_pd(fscal
,dx00
);
879 ty
= _mm_mul_pd(fscal
,dy00
);
880 tz
= _mm_mul_pd(fscal
,dz00
);
882 /* Update vectorial force */
883 fix0
= _mm_add_pd(fix0
,tx
);
884 fiy0
= _mm_add_pd(fiy0
,ty
);
885 fiz0
= _mm_add_pd(fiz0
,tz
);
887 fjx0
= _mm_add_pd(fjx0
,tx
);
888 fjy0
= _mm_add_pd(fjy0
,ty
);
889 fjz0
= _mm_add_pd(fjz0
,tz
);
891 /**************************
892 * CALCULATE INTERACTIONS *
893 **************************/
895 r10
= _mm_mul_pd(rsq10
,rinv10
);
897 /* Compute parameters for interactions between i and j atoms */
898 qq10
= _mm_mul_pd(iq1
,jq0
);
900 /* Calculate table index by multiplying r with table scale and truncate to integer */
901 rt
= _mm_mul_pd(r10
,vftabscale
);
902 vfitab
= _mm_cvttpd_epi32(rt
);
903 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
904 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
906 /* CUBIC SPLINE TABLE ELECTROSTATICS */
907 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
908 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
909 GMX_MM_TRANSPOSE2_PD(Y
,F
);
910 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
911 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
912 GMX_MM_TRANSPOSE2_PD(G
,H
);
913 Heps
= _mm_mul_pd(vfeps
,H
);
914 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
915 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
916 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
920 /* Calculate temporary vectorial force */
921 tx
= _mm_mul_pd(fscal
,dx10
);
922 ty
= _mm_mul_pd(fscal
,dy10
);
923 tz
= _mm_mul_pd(fscal
,dz10
);
925 /* Update vectorial force */
926 fix1
= _mm_add_pd(fix1
,tx
);
927 fiy1
= _mm_add_pd(fiy1
,ty
);
928 fiz1
= _mm_add_pd(fiz1
,tz
);
930 fjx0
= _mm_add_pd(fjx0
,tx
);
931 fjy0
= _mm_add_pd(fjy0
,ty
);
932 fjz0
= _mm_add_pd(fjz0
,tz
);
934 /**************************
935 * CALCULATE INTERACTIONS *
936 **************************/
938 r20
= _mm_mul_pd(rsq20
,rinv20
);
940 /* Compute parameters for interactions between i and j atoms */
941 qq20
= _mm_mul_pd(iq2
,jq0
);
943 /* Calculate table index by multiplying r with table scale and truncate to integer */
944 rt
= _mm_mul_pd(r20
,vftabscale
);
945 vfitab
= _mm_cvttpd_epi32(rt
);
946 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
947 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
949 /* CUBIC SPLINE TABLE ELECTROSTATICS */
950 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
951 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
952 GMX_MM_TRANSPOSE2_PD(Y
,F
);
953 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
954 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
955 GMX_MM_TRANSPOSE2_PD(G
,H
);
956 Heps
= _mm_mul_pd(vfeps
,H
);
957 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
958 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
959 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
963 /* Calculate temporary vectorial force */
964 tx
= _mm_mul_pd(fscal
,dx20
);
965 ty
= _mm_mul_pd(fscal
,dy20
);
966 tz
= _mm_mul_pd(fscal
,dz20
);
968 /* Update vectorial force */
969 fix2
= _mm_add_pd(fix2
,tx
);
970 fiy2
= _mm_add_pd(fiy2
,ty
);
971 fiz2
= _mm_add_pd(fiz2
,tz
);
973 fjx0
= _mm_add_pd(fjx0
,tx
);
974 fjy0
= _mm_add_pd(fjy0
,ty
);
975 fjz0
= _mm_add_pd(fjz0
,tz
);
977 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
979 /* Inner loop uses 142 flops */
986 j_coord_offsetA
= DIM
*jnrA
;
988 /* load j atom coordinates */
989 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
992 /* Calculate displacement vector */
993 dx00
= _mm_sub_pd(ix0
,jx0
);
994 dy00
= _mm_sub_pd(iy0
,jy0
);
995 dz00
= _mm_sub_pd(iz0
,jz0
);
996 dx10
= _mm_sub_pd(ix1
,jx0
);
997 dy10
= _mm_sub_pd(iy1
,jy0
);
998 dz10
= _mm_sub_pd(iz1
,jz0
);
999 dx20
= _mm_sub_pd(ix2
,jx0
);
1000 dy20
= _mm_sub_pd(iy2
,jy0
);
1001 dz20
= _mm_sub_pd(iz2
,jz0
);
1003 /* Calculate squared distance and things based on it */
1004 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1005 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1006 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1008 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1009 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
1010 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
1012 /* Load parameters for j particles */
1013 jq0
= _mm_load_sd(charge
+jnrA
+0);
1014 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
1016 fjx0
= _mm_setzero_pd();
1017 fjy0
= _mm_setzero_pd();
1018 fjz0
= _mm_setzero_pd();
1020 /**************************
1021 * CALCULATE INTERACTIONS *
1022 **************************/
1024 r00
= _mm_mul_pd(rsq00
,rinv00
);
1026 /* Compute parameters for interactions between i and j atoms */
1027 qq00
= _mm_mul_pd(iq0
,jq0
);
1028 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
1030 /* Calculate table index by multiplying r with table scale and truncate to integer */
1031 rt
= _mm_mul_pd(r00
,vftabscale
);
1032 vfitab
= _mm_cvttpd_epi32(rt
);
1033 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1034 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1036 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1037 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1038 F
= _mm_setzero_pd();
1039 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1040 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1041 H
= _mm_setzero_pd();
1042 GMX_MM_TRANSPOSE2_PD(G
,H
);
1043 Heps
= _mm_mul_pd(vfeps
,H
);
1044 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1045 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1046 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
1048 /* CUBIC SPLINE TABLE DISPERSION */
1049 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1050 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1051 F
= _mm_setzero_pd();
1052 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1053 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1054 H
= _mm_setzero_pd();
1055 GMX_MM_TRANSPOSE2_PD(G
,H
);
1056 Heps
= _mm_mul_pd(vfeps
,H
);
1057 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1058 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1059 fvdw6
= _mm_mul_pd(c6_00
,FF
);
1061 /* CUBIC SPLINE TABLE REPULSION */
1062 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1063 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1064 F
= _mm_setzero_pd();
1065 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1066 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1067 H
= _mm_setzero_pd();
1068 GMX_MM_TRANSPOSE2_PD(G
,H
);
1069 Heps
= _mm_mul_pd(vfeps
,H
);
1070 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1071 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1072 fvdw12
= _mm_mul_pd(c12_00
,FF
);
1073 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
1075 fscal
= _mm_add_pd(felec
,fvdw
);
1077 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1079 /* Calculate temporary vectorial force */
1080 tx
= _mm_mul_pd(fscal
,dx00
);
1081 ty
= _mm_mul_pd(fscal
,dy00
);
1082 tz
= _mm_mul_pd(fscal
,dz00
);
1084 /* Update vectorial force */
1085 fix0
= _mm_add_pd(fix0
,tx
);
1086 fiy0
= _mm_add_pd(fiy0
,ty
);
1087 fiz0
= _mm_add_pd(fiz0
,tz
);
1089 fjx0
= _mm_add_pd(fjx0
,tx
);
1090 fjy0
= _mm_add_pd(fjy0
,ty
);
1091 fjz0
= _mm_add_pd(fjz0
,tz
);
1093 /**************************
1094 * CALCULATE INTERACTIONS *
1095 **************************/
1097 r10
= _mm_mul_pd(rsq10
,rinv10
);
1099 /* Compute parameters for interactions between i and j atoms */
1100 qq10
= _mm_mul_pd(iq1
,jq0
);
1102 /* Calculate table index by multiplying r with table scale and truncate to integer */
1103 rt
= _mm_mul_pd(r10
,vftabscale
);
1104 vfitab
= _mm_cvttpd_epi32(rt
);
1105 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1106 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1108 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1109 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1110 F
= _mm_setzero_pd();
1111 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1112 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1113 H
= _mm_setzero_pd();
1114 GMX_MM_TRANSPOSE2_PD(G
,H
);
1115 Heps
= _mm_mul_pd(vfeps
,H
);
1116 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1117 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1118 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
1122 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1124 /* Calculate temporary vectorial force */
1125 tx
= _mm_mul_pd(fscal
,dx10
);
1126 ty
= _mm_mul_pd(fscal
,dy10
);
1127 tz
= _mm_mul_pd(fscal
,dz10
);
1129 /* Update vectorial force */
1130 fix1
= _mm_add_pd(fix1
,tx
);
1131 fiy1
= _mm_add_pd(fiy1
,ty
);
1132 fiz1
= _mm_add_pd(fiz1
,tz
);
1134 fjx0
= _mm_add_pd(fjx0
,tx
);
1135 fjy0
= _mm_add_pd(fjy0
,ty
);
1136 fjz0
= _mm_add_pd(fjz0
,tz
);
1138 /**************************
1139 * CALCULATE INTERACTIONS *
1140 **************************/
1142 r20
= _mm_mul_pd(rsq20
,rinv20
);
1144 /* Compute parameters for interactions between i and j atoms */
1145 qq20
= _mm_mul_pd(iq2
,jq0
);
1147 /* Calculate table index by multiplying r with table scale and truncate to integer */
1148 rt
= _mm_mul_pd(r20
,vftabscale
);
1149 vfitab
= _mm_cvttpd_epi32(rt
);
1150 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1151 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1153 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1154 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1155 F
= _mm_setzero_pd();
1156 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1157 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1158 H
= _mm_setzero_pd();
1159 GMX_MM_TRANSPOSE2_PD(G
,H
);
1160 Heps
= _mm_mul_pd(vfeps
,H
);
1161 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1162 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1163 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
1167 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1169 /* Calculate temporary vectorial force */
1170 tx
= _mm_mul_pd(fscal
,dx20
);
1171 ty
= _mm_mul_pd(fscal
,dy20
);
1172 tz
= _mm_mul_pd(fscal
,dz20
);
1174 /* Update vectorial force */
1175 fix2
= _mm_add_pd(fix2
,tx
);
1176 fiy2
= _mm_add_pd(fiy2
,ty
);
1177 fiz2
= _mm_add_pd(fiz2
,tz
);
1179 fjx0
= _mm_add_pd(fjx0
,tx
);
1180 fjy0
= _mm_add_pd(fjy0
,ty
);
1181 fjz0
= _mm_add_pd(fjz0
,tz
);
1183 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
1185 /* Inner loop uses 142 flops */
1188 /* End of innermost loop */
1190 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1191 f
+i_coord_offset
,fshift
+i_shift_offset
);
1193 /* Increment number of inner iterations */
1194 inneriter
+= j_index_end
- j_index_start
;
1196 /* Outer loop uses 18 flops */
1199 /* Increment number of outer iterations */
1202 /* Update outer/inner flops */
1204 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3_F
,outeriter
*18 + inneriter
*142);