2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_double.h"
49 #include "kernelutil_x86_sse2_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_double
53 * Electrostatics interaction: GeneralizedBorn
54 * VdW interaction: None
55 * Geometry: Particle-Particle
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_double
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
76 int j_coord_offsetA
,j_coord_offsetB
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
82 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
83 int vdwjidx0A
,vdwjidx0B
;
84 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
85 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
86 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
89 __m128d vgb
,fgb
,vgbsum
,dvdasum
,gbscale
,gbtabscale
,isaprod
,gbqqfactor
,gbinvepsdiff
,dvdaj
,gbeps
,dvdatmp
;
90 __m128d minushalf
= _mm_set1_pd(-0.5);
91 real
*invsqrta
,*dvda
,*gbtab
;
93 __m128i ifour
= _mm_set1_epi32(4);
94 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
96 __m128d dummy_mask
,cutoff_mask
;
97 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
98 __m128d one
= _mm_set1_pd(1.0);
99 __m128d two
= _mm_set1_pd(2.0);
105 jindex
= nlist
->jindex
;
107 shiftidx
= nlist
->shift
;
109 shiftvec
= fr
->shift_vec
[0];
110 fshift
= fr
->fshift
[0];
111 facel
= _mm_set1_pd(fr
->epsfac
);
112 charge
= mdatoms
->chargeA
;
114 invsqrta
= fr
->invsqrta
;
116 gbtabscale
= _mm_set1_pd(fr
->gbtab
.scale
);
117 gbtab
= fr
->gbtab
.data
;
118 gbinvepsdiff
= _mm_set1_pd((1.0/fr
->epsilon_r
) - (1.0/fr
->gb_epsilon_solvent
));
120 /* Avoid stupid compiler warnings */
128 /* Start outer loop over neighborlists */
129 for(iidx
=0; iidx
<nri
; iidx
++)
131 /* Load shift vector for this list */
132 i_shift_offset
= DIM
*shiftidx
[iidx
];
134 /* Load limits for loop over neighbors */
135 j_index_start
= jindex
[iidx
];
136 j_index_end
= jindex
[iidx
+1];
138 /* Get outer coordinate index */
140 i_coord_offset
= DIM
*inr
;
142 /* Load i particle coords and add shift vector */
143 gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,&ix0
,&iy0
,&iz0
);
145 fix0
= _mm_setzero_pd();
146 fiy0
= _mm_setzero_pd();
147 fiz0
= _mm_setzero_pd();
149 /* Load parameters for i particles */
150 iq0
= _mm_mul_pd(facel
,_mm_load1_pd(charge
+inr
+0));
151 isai0
= _mm_load1_pd(invsqrta
+inr
+0);
153 /* Reset potential sums */
154 velecsum
= _mm_setzero_pd();
155 vgbsum
= _mm_setzero_pd();
156 dvdasum
= _mm_setzero_pd();
158 /* Start inner kernel loop */
159 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
162 /* Get j neighbor index, and coordinate index */
165 j_coord_offsetA
= DIM
*jnrA
;
166 j_coord_offsetB
= DIM
*jnrB
;
168 /* load j atom coordinates */
169 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
172 /* Calculate displacement vector */
173 dx00
= _mm_sub_pd(ix0
,jx0
);
174 dy00
= _mm_sub_pd(iy0
,jy0
);
175 dz00
= _mm_sub_pd(iz0
,jz0
);
177 /* Calculate squared distance and things based on it */
178 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
180 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
182 /* Load parameters for j particles */
183 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
184 isaj0
= gmx_mm_load_2real_swizzle_pd(invsqrta
+jnrA
+0,invsqrta
+jnrB
+0);
186 /**************************
187 * CALCULATE INTERACTIONS *
188 **************************/
190 r00
= _mm_mul_pd(rsq00
,rinv00
);
192 /* Compute parameters for interactions between i and j atoms */
193 qq00
= _mm_mul_pd(iq0
,jq0
);
195 /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
196 isaprod
= _mm_mul_pd(isai0
,isaj0
);
197 gbqqfactor
= _mm_xor_pd(signbit
,_mm_mul_pd(qq00
,_mm_mul_pd(isaprod
,gbinvepsdiff
)));
198 gbscale
= _mm_mul_pd(isaprod
,gbtabscale
);
200 /* Calculate generalized born table index - this is a separate table from the normal one,
201 * but we use the same procedure by multiplying r with scale and truncating to integer.
203 rt
= _mm_mul_pd(r00
,gbscale
);
204 gbitab
= _mm_cvttpd_epi32(rt
);
205 gbeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(gbitab
));
206 gbitab
= _mm_slli_epi32(gbitab
,2);
208 Y
= _mm_load_pd( gbtab
+ gmx_mm_extract_epi32(gbitab
,0) );
209 F
= _mm_load_pd( gbtab
+ gmx_mm_extract_epi32(gbitab
,1) );
210 GMX_MM_TRANSPOSE2_PD(Y
,F
);
211 G
= _mm_load_pd( gbtab
+ gmx_mm_extract_epi32(gbitab
,0) +2);
212 H
= _mm_load_pd( gbtab
+ gmx_mm_extract_epi32(gbitab
,1) +2);
213 GMX_MM_TRANSPOSE2_PD(G
,H
);
214 Heps
= _mm_mul_pd(gbeps
,H
);
215 Fp
= _mm_add_pd(F
,_mm_mul_pd(gbeps
,_mm_add_pd(G
,Heps
)));
216 VV
= _mm_add_pd(Y
,_mm_mul_pd(gbeps
,Fp
));
217 vgb
= _mm_mul_pd(gbqqfactor
,VV
);
219 FF
= _mm_add_pd(Fp
,_mm_mul_pd(gbeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
220 fgb
= _mm_mul_pd(gbqqfactor
,_mm_mul_pd(FF
,gbscale
));
221 dvdatmp
= _mm_mul_pd(minushalf
,_mm_add_pd(vgb
,_mm_mul_pd(fgb
,r00
)));
222 dvdasum
= _mm_add_pd(dvdasum
,dvdatmp
);
223 gmx_mm_increment_2real_swizzle_pd(dvda
+jnrA
,dvda
+jnrB
,_mm_mul_pd(dvdatmp
,_mm_mul_pd(isaj0
,isaj0
)));
224 velec
= _mm_mul_pd(qq00
,rinv00
);
225 felec
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec
,rinv00
),fgb
),rinv00
);
227 /* Update potential sum for this i atom from the interaction with this j atom. */
228 velecsum
= _mm_add_pd(velecsum
,velec
);
229 vgbsum
= _mm_add_pd(vgbsum
,vgb
);
233 /* Calculate temporary vectorial force */
234 tx
= _mm_mul_pd(fscal
,dx00
);
235 ty
= _mm_mul_pd(fscal
,dy00
);
236 tz
= _mm_mul_pd(fscal
,dz00
);
238 /* Update vectorial force */
239 fix0
= _mm_add_pd(fix0
,tx
);
240 fiy0
= _mm_add_pd(fiy0
,ty
);
241 fiz0
= _mm_add_pd(fiz0
,tz
);
243 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,tx
,ty
,tz
);
245 /* Inner loop uses 58 flops */
252 j_coord_offsetA
= DIM
*jnrA
;
254 /* load j atom coordinates */
255 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
258 /* Calculate displacement vector */
259 dx00
= _mm_sub_pd(ix0
,jx0
);
260 dy00
= _mm_sub_pd(iy0
,jy0
);
261 dz00
= _mm_sub_pd(iz0
,jz0
);
263 /* Calculate squared distance and things based on it */
264 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
266 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
268 /* Load parameters for j particles */
269 jq0
= _mm_load_sd(charge
+jnrA
+0);
270 isaj0
= _mm_load_sd(invsqrta
+jnrA
+0);
272 /**************************
273 * CALCULATE INTERACTIONS *
274 **************************/
276 r00
= _mm_mul_pd(rsq00
,rinv00
);
278 /* Compute parameters for interactions between i and j atoms */
279 qq00
= _mm_mul_pd(iq0
,jq0
);
281 /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
282 isaprod
= _mm_mul_pd(isai0
,isaj0
);
283 gbqqfactor
= _mm_xor_pd(signbit
,_mm_mul_pd(qq00
,_mm_mul_pd(isaprod
,gbinvepsdiff
)));
284 gbscale
= _mm_mul_pd(isaprod
,gbtabscale
);
286 /* Calculate generalized born table index - this is a separate table from the normal one,
287 * but we use the same procedure by multiplying r with scale and truncating to integer.
289 rt
= _mm_mul_pd(r00
,gbscale
);
290 gbitab
= _mm_cvttpd_epi32(rt
);
291 gbeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(gbitab
));
292 gbitab
= _mm_slli_epi32(gbitab
,2);
294 Y
= _mm_load_pd( gbtab
+ gmx_mm_extract_epi32(gbitab
,0) );
295 F
= _mm_setzero_pd();
296 GMX_MM_TRANSPOSE2_PD(Y
,F
);
297 G
= _mm_load_pd( gbtab
+ gmx_mm_extract_epi32(gbitab
,0) +2);
298 H
= _mm_setzero_pd();
299 GMX_MM_TRANSPOSE2_PD(G
,H
);
300 Heps
= _mm_mul_pd(gbeps
,H
);
301 Fp
= _mm_add_pd(F
,_mm_mul_pd(gbeps
,_mm_add_pd(G
,Heps
)));
302 VV
= _mm_add_pd(Y
,_mm_mul_pd(gbeps
,Fp
));
303 vgb
= _mm_mul_pd(gbqqfactor
,VV
);
305 FF
= _mm_add_pd(Fp
,_mm_mul_pd(gbeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
306 fgb
= _mm_mul_pd(gbqqfactor
,_mm_mul_pd(FF
,gbscale
));
307 dvdatmp
= _mm_mul_pd(minushalf
,_mm_add_pd(vgb
,_mm_mul_pd(fgb
,r00
)));
308 dvdatmp
= _mm_unpacklo_pd(dvdatmp
,_mm_setzero_pd());
309 dvdasum
= _mm_add_pd(dvdasum
,dvdatmp
);
310 gmx_mm_increment_1real_pd(dvda
+jnrA
,_mm_mul_pd(dvdatmp
,_mm_mul_pd(isaj0
,isaj0
)));
311 velec
= _mm_mul_pd(qq00
,rinv00
);
312 felec
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec
,rinv00
),fgb
),rinv00
);
314 /* Update potential sum for this i atom from the interaction with this j atom. */
315 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
316 velecsum
= _mm_add_pd(velecsum
,velec
);
317 vgb
= _mm_unpacklo_pd(vgb
,_mm_setzero_pd());
318 vgbsum
= _mm_add_pd(vgbsum
,vgb
);
322 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
324 /* Calculate temporary vectorial force */
325 tx
= _mm_mul_pd(fscal
,dx00
);
326 ty
= _mm_mul_pd(fscal
,dy00
);
327 tz
= _mm_mul_pd(fscal
,dz00
);
329 /* Update vectorial force */
330 fix0
= _mm_add_pd(fix0
,tx
);
331 fiy0
= _mm_add_pd(fiy0
,ty
);
332 fiz0
= _mm_add_pd(fiz0
,tz
);
334 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,tx
,ty
,tz
);
336 /* Inner loop uses 58 flops */
339 /* End of innermost loop */
341 gmx_mm_update_iforce_1atom_swizzle_pd(fix0
,fiy0
,fiz0
,
342 f
+i_coord_offset
,fshift
+i_shift_offset
);
345 /* Update potential energies */
346 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
347 gmx_mm_update_1pot_pd(vgbsum
,kernel_data
->energygrp_polarization
+ggid
);
348 dvdasum
= _mm_mul_pd(dvdasum
, _mm_mul_pd(isai0
,isai0
));
349 gmx_mm_update_1pot_pd(dvdasum
,dvda
+inr
);
351 /* Increment number of inner iterations */
352 inneriter
+= j_index_end
- j_index_start
;
354 /* Outer loop uses 9 flops */
357 /* Increment number of outer iterations */
360 /* Update outer/inner flops */
362 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VF
,outeriter
*9 + inneriter
*58);
365 * Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_double
366 * Electrostatics interaction: GeneralizedBorn
367 * VdW interaction: None
368 * Geometry: Particle-Particle
369 * Calculate force/pot: Force
372 nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_double
373 (t_nblist
* gmx_restrict nlist
,
374 rvec
* gmx_restrict xx
,
375 rvec
* gmx_restrict ff
,
376 t_forcerec
* gmx_restrict fr
,
377 t_mdatoms
* gmx_restrict mdatoms
,
378 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
379 t_nrnb
* gmx_restrict nrnb
)
381 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
382 * just 0 for non-waters.
383 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
384 * jnr indices corresponding to data put in the four positions in the SIMD register.
386 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
387 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
389 int j_coord_offsetA
,j_coord_offsetB
;
390 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
392 real
*shiftvec
,*fshift
,*x
,*f
;
393 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
395 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
396 int vdwjidx0A
,vdwjidx0B
;
397 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
398 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
399 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
402 __m128d vgb
,fgb
,vgbsum
,dvdasum
,gbscale
,gbtabscale
,isaprod
,gbqqfactor
,gbinvepsdiff
,dvdaj
,gbeps
,dvdatmp
;
403 __m128d minushalf
= _mm_set1_pd(-0.5);
404 real
*invsqrta
,*dvda
,*gbtab
;
406 __m128i ifour
= _mm_set1_epi32(4);
407 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
409 __m128d dummy_mask
,cutoff_mask
;
410 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
411 __m128d one
= _mm_set1_pd(1.0);
412 __m128d two
= _mm_set1_pd(2.0);
418 jindex
= nlist
->jindex
;
420 shiftidx
= nlist
->shift
;
422 shiftvec
= fr
->shift_vec
[0];
423 fshift
= fr
->fshift
[0];
424 facel
= _mm_set1_pd(fr
->epsfac
);
425 charge
= mdatoms
->chargeA
;
427 invsqrta
= fr
->invsqrta
;
429 gbtabscale
= _mm_set1_pd(fr
->gbtab
.scale
);
430 gbtab
= fr
->gbtab
.data
;
431 gbinvepsdiff
= _mm_set1_pd((1.0/fr
->epsilon_r
) - (1.0/fr
->gb_epsilon_solvent
));
433 /* Avoid stupid compiler warnings */
441 /* Start outer loop over neighborlists */
442 for(iidx
=0; iidx
<nri
; iidx
++)
444 /* Load shift vector for this list */
445 i_shift_offset
= DIM
*shiftidx
[iidx
];
447 /* Load limits for loop over neighbors */
448 j_index_start
= jindex
[iidx
];
449 j_index_end
= jindex
[iidx
+1];
451 /* Get outer coordinate index */
453 i_coord_offset
= DIM
*inr
;
455 /* Load i particle coords and add shift vector */
456 gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,&ix0
,&iy0
,&iz0
);
458 fix0
= _mm_setzero_pd();
459 fiy0
= _mm_setzero_pd();
460 fiz0
= _mm_setzero_pd();
462 /* Load parameters for i particles */
463 iq0
= _mm_mul_pd(facel
,_mm_load1_pd(charge
+inr
+0));
464 isai0
= _mm_load1_pd(invsqrta
+inr
+0);
466 dvdasum
= _mm_setzero_pd();
468 /* Start inner kernel loop */
469 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
472 /* Get j neighbor index, and coordinate index */
475 j_coord_offsetA
= DIM
*jnrA
;
476 j_coord_offsetB
= DIM
*jnrB
;
478 /* load j atom coordinates */
479 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
482 /* Calculate displacement vector */
483 dx00
= _mm_sub_pd(ix0
,jx0
);
484 dy00
= _mm_sub_pd(iy0
,jy0
);
485 dz00
= _mm_sub_pd(iz0
,jz0
);
487 /* Calculate squared distance and things based on it */
488 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
490 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
492 /* Load parameters for j particles */
493 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
494 isaj0
= gmx_mm_load_2real_swizzle_pd(invsqrta
+jnrA
+0,invsqrta
+jnrB
+0);
496 /**************************
497 * CALCULATE INTERACTIONS *
498 **************************/
500 r00
= _mm_mul_pd(rsq00
,rinv00
);
502 /* Compute parameters for interactions between i and j atoms */
503 qq00
= _mm_mul_pd(iq0
,jq0
);
505 /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
506 isaprod
= _mm_mul_pd(isai0
,isaj0
);
507 gbqqfactor
= _mm_xor_pd(signbit
,_mm_mul_pd(qq00
,_mm_mul_pd(isaprod
,gbinvepsdiff
)));
508 gbscale
= _mm_mul_pd(isaprod
,gbtabscale
);
510 /* Calculate generalized born table index - this is a separate table from the normal one,
511 * but we use the same procedure by multiplying r with scale and truncating to integer.
513 rt
= _mm_mul_pd(r00
,gbscale
);
514 gbitab
= _mm_cvttpd_epi32(rt
);
515 gbeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(gbitab
));
516 gbitab
= _mm_slli_epi32(gbitab
,2);
518 Y
= _mm_load_pd( gbtab
+ gmx_mm_extract_epi32(gbitab
,0) );
519 F
= _mm_load_pd( gbtab
+ gmx_mm_extract_epi32(gbitab
,1) );
520 GMX_MM_TRANSPOSE2_PD(Y
,F
);
521 G
= _mm_load_pd( gbtab
+ gmx_mm_extract_epi32(gbitab
,0) +2);
522 H
= _mm_load_pd( gbtab
+ gmx_mm_extract_epi32(gbitab
,1) +2);
523 GMX_MM_TRANSPOSE2_PD(G
,H
);
524 Heps
= _mm_mul_pd(gbeps
,H
);
525 Fp
= _mm_add_pd(F
,_mm_mul_pd(gbeps
,_mm_add_pd(G
,Heps
)));
526 VV
= _mm_add_pd(Y
,_mm_mul_pd(gbeps
,Fp
));
527 vgb
= _mm_mul_pd(gbqqfactor
,VV
);
529 FF
= _mm_add_pd(Fp
,_mm_mul_pd(gbeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
530 fgb
= _mm_mul_pd(gbqqfactor
,_mm_mul_pd(FF
,gbscale
));
531 dvdatmp
= _mm_mul_pd(minushalf
,_mm_add_pd(vgb
,_mm_mul_pd(fgb
,r00
)));
532 dvdasum
= _mm_add_pd(dvdasum
,dvdatmp
);
533 gmx_mm_increment_2real_swizzle_pd(dvda
+jnrA
,dvda
+jnrB
,_mm_mul_pd(dvdatmp
,_mm_mul_pd(isaj0
,isaj0
)));
534 velec
= _mm_mul_pd(qq00
,rinv00
);
535 felec
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec
,rinv00
),fgb
),rinv00
);
539 /* Calculate temporary vectorial force */
540 tx
= _mm_mul_pd(fscal
,dx00
);
541 ty
= _mm_mul_pd(fscal
,dy00
);
542 tz
= _mm_mul_pd(fscal
,dz00
);
544 /* Update vectorial force */
545 fix0
= _mm_add_pd(fix0
,tx
);
546 fiy0
= _mm_add_pd(fiy0
,ty
);
547 fiz0
= _mm_add_pd(fiz0
,tz
);
549 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,tx
,ty
,tz
);
551 /* Inner loop uses 56 flops */
558 j_coord_offsetA
= DIM
*jnrA
;
560 /* load j atom coordinates */
561 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
564 /* Calculate displacement vector */
565 dx00
= _mm_sub_pd(ix0
,jx0
);
566 dy00
= _mm_sub_pd(iy0
,jy0
);
567 dz00
= _mm_sub_pd(iz0
,jz0
);
569 /* Calculate squared distance and things based on it */
570 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
572 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
574 /* Load parameters for j particles */
575 jq0
= _mm_load_sd(charge
+jnrA
+0);
576 isaj0
= _mm_load_sd(invsqrta
+jnrA
+0);
578 /**************************
579 * CALCULATE INTERACTIONS *
580 **************************/
582 r00
= _mm_mul_pd(rsq00
,rinv00
);
584 /* Compute parameters for interactions between i and j atoms */
585 qq00
= _mm_mul_pd(iq0
,jq0
);
587 /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
588 isaprod
= _mm_mul_pd(isai0
,isaj0
);
589 gbqqfactor
= _mm_xor_pd(signbit
,_mm_mul_pd(qq00
,_mm_mul_pd(isaprod
,gbinvepsdiff
)));
590 gbscale
= _mm_mul_pd(isaprod
,gbtabscale
);
592 /* Calculate generalized born table index - this is a separate table from the normal one,
593 * but we use the same procedure by multiplying r with scale and truncating to integer.
595 rt
= _mm_mul_pd(r00
,gbscale
);
596 gbitab
= _mm_cvttpd_epi32(rt
);
597 gbeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(gbitab
));
598 gbitab
= _mm_slli_epi32(gbitab
,2);
600 Y
= _mm_load_pd( gbtab
+ gmx_mm_extract_epi32(gbitab
,0) );
601 F
= _mm_setzero_pd();
602 GMX_MM_TRANSPOSE2_PD(Y
,F
);
603 G
= _mm_load_pd( gbtab
+ gmx_mm_extract_epi32(gbitab
,0) +2);
604 H
= _mm_setzero_pd();
605 GMX_MM_TRANSPOSE2_PD(G
,H
);
606 Heps
= _mm_mul_pd(gbeps
,H
);
607 Fp
= _mm_add_pd(F
,_mm_mul_pd(gbeps
,_mm_add_pd(G
,Heps
)));
608 VV
= _mm_add_pd(Y
,_mm_mul_pd(gbeps
,Fp
));
609 vgb
= _mm_mul_pd(gbqqfactor
,VV
);
611 FF
= _mm_add_pd(Fp
,_mm_mul_pd(gbeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
612 fgb
= _mm_mul_pd(gbqqfactor
,_mm_mul_pd(FF
,gbscale
));
613 dvdatmp
= _mm_mul_pd(minushalf
,_mm_add_pd(vgb
,_mm_mul_pd(fgb
,r00
)));
614 dvdatmp
= _mm_unpacklo_pd(dvdatmp
,_mm_setzero_pd());
615 dvdasum
= _mm_add_pd(dvdasum
,dvdatmp
);
616 gmx_mm_increment_1real_pd(dvda
+jnrA
,_mm_mul_pd(dvdatmp
,_mm_mul_pd(isaj0
,isaj0
)));
617 velec
= _mm_mul_pd(qq00
,rinv00
);
618 felec
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(velec
,rinv00
),fgb
),rinv00
);
622 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
624 /* Calculate temporary vectorial force */
625 tx
= _mm_mul_pd(fscal
,dx00
);
626 ty
= _mm_mul_pd(fscal
,dy00
);
627 tz
= _mm_mul_pd(fscal
,dz00
);
629 /* Update vectorial force */
630 fix0
= _mm_add_pd(fix0
,tx
);
631 fiy0
= _mm_add_pd(fiy0
,ty
);
632 fiz0
= _mm_add_pd(fiz0
,tz
);
634 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,tx
,ty
,tz
);
636 /* Inner loop uses 56 flops */
639 /* End of innermost loop */
641 gmx_mm_update_iforce_1atom_swizzle_pd(fix0
,fiy0
,fiz0
,
642 f
+i_coord_offset
,fshift
+i_shift_offset
);
644 dvdasum
= _mm_mul_pd(dvdasum
, _mm_mul_pd(isai0
,isai0
));
645 gmx_mm_update_1pot_pd(dvdasum
,dvda
+inr
);
647 /* Increment number of inner iterations */
648 inneriter
+= j_index_end
- j_index_start
;
650 /* Outer loop uses 7 flops */
653 /* Increment number of outer iterations */
656 /* Update outer/inner flops */
658 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_F
,outeriter
*7 + inneriter
*56);