/*
 * This source code is part of
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 *
 * As a special exception, you may use this file as part of a free software
 * library without restriction. Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
#include "types/simple.h"

#include "genborn_allvsall.h"

#if ( (defined(GMX_IA32_SSE2) || defined(GMX_X86_64_SSE2) || defined(GMX_SSE2)) && defined(GMX_DOUBLE) )

#include <gmx_sse2_double.h>
    int **              prologue_mask_gb;
}
gmx_allvsallgb2_data_t;
calc_maxoffset(int i,int natoms)

    if ((natoms % 2) == 1)
        /* Odd number of atoms, easy */

    else if ((natoms % 4) == 0)
        /* Multiple of four is hard */

            maxoffset = natoms/2-1;

            maxoffset = natoms/2-1;

            maxoffset = natoms/2-1;
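/* NOTE: judging from the way this routine is used below ("Include interactions
 * i+1 <= j <= i+maxoffset"), calc_maxoffset() appears to return how many of the
 * following atoms a given i atom is responsible for in the all-vs-all triangle -
 * roughly natoms/2, adjusted by the parity of natoms and i so that each pair is
 * visited exactly once. E.g. for natoms = 9 every atom would get maxoffset = 4,
 * giving 9*4 = 36 = 9*8/2 pairs in total.
 */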
setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t *   aadata,

    int ni0,ni1,nj0,nj1,nj;
    int firstinteraction;

    /* This routine can appear to be a bit complex, but it is mostly book-keeping.
     * To enable the fast all-vs-all kernel we need to be able to stream through all
     * coordinates whether or not they should interact.
     * To avoid looping over the exclusions, we create a simple mask that is 1 if the
     * interaction should be present, otherwise 0. Since exclusions typically only occur
     * when i & j are close, we create a jindex array with four elements per i atom: the
     * starting point, the point up to which we need to check exclusions, the start of the
     * epilogue part, and the end point.
     * This way we only have to allocate a short exclusion mask per i atom.
     */
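/* NOTE: an illustration of the layout implied by the code below - jindex_gb holds four
 * entries per i atom, delimiting the three j ranges the kernels later stream over:
 *
 *   jindex_gb[4*i+0] .. jindex_gb[4*i+1]   "prologue"  (masked by prologue_mask_gb[i])
 *   jindex_gb[4*i+1] .. jindex_gb[4*i+2]   "main part" (no per-pair mask needed)
 *   jindex_gb[4*i+2] .. jindex_gb[4*i+3]   "epilogue"  (masked by epilogue_mask[i])
 *
 * so only the two short masked windows need per-pair exclusion/maxoffset masks.
 */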
    ni0 = (start/UNROLLI)*UNROLLI;
    ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;

    /* Set the interaction mask to only enable the i atoms we want to include */
    snew(pi,2*(natoms+UNROLLI+2*SIMD_WIDTH));
    aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
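/* NOTE: the "(((size_t) pi + 16) & (~((size_t) 15)))" idiom used for every allocation in
 * this file bumps the pointer returned by snew() to a 16-byte-aligned address, as required
 * by the aligned SSE2 loads/stores (_mm_load_pd/_mm_store_pd) in the kernels; the extra
 * SIMD_WIDTH/UNROLLI padding in the allocation sizes leaves room for this shift.
 */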
    for(i=0;i<natoms+UNROLLI;i++)
    {
        aadata->imask[2*i]   = (i>=start && i<end) ? 0xFFFFFFFF : 0;
        aadata->imask[2*i+1] = (i>=start && i<end) ? 0xFFFFFFFF : 0;
    }
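/* NOTE: each atom gets two identical 32-bit words in imask (2*i and 2*i+1). Read together
 * through _mm_load1_pd((double *)...) they form one 64-bit lane that is either all ones or
 * all zeros, so the kernels can switch interactions on/off with a single _mm_and_pd on
 * double-precision registers instead of branching per pair.
 */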
    /* Allocate memory for our modified jindex array */
    snew(aadata->jindex_gb,4*(natoms+UNROLLI));
    for(i=0;i<4*(natoms+UNROLLI);i++)
    {
        aadata->jindex_gb[i] = 0;
    }
    /* Create the exclusion masks for the prologue part */
    snew(aadata->prologue_mask_gb,natoms+UNROLLI); /* list of pointers */

    /* First zero everything to avoid uninitialized data */
    for(i=0;i<natoms+UNROLLI;i++)
    {
        aadata->prologue_mask_gb[i] = NULL;
    }
    /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
        max_excl_offset = -1;

        /* First find maxoffset for the next UNROLLI atoms (or fewer if we are close to end) */
        imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;

        /* Which atom is the first we (might) interact with? */
        imin = natoms; /* Guaranteed to be overwritten by one of 'firstinteraction' */
        for(i=ibase;i<imax;i++)
            /* Before exclusions, which atom is the first we (might) interact with? */
            firstinteraction = i+1;
            max_offset       = calc_maxoffset(i,natoms);

            for(j=0;j<ilist[F_GB12].nr;j+=3)
                a1 = ilist[F_GB12].iatoms[j+1];
                a2 = ilist[F_GB12].iatoms[j+2];
                if(k==firstinteraction)

            for(j=0;j<ilist[F_GB13].nr;j+=3)
                a1 = ilist[F_GB13].iatoms[j+1];
                a2 = ilist[F_GB13].iatoms[j+2];
                if(k==firstinteraction)

            for(j=0;j<ilist[F_GB14].nr;j+=3)
                a1 = ilist[F_GB14].iatoms[j+1];
                a2 = ilist[F_GB14].iatoms[j+2];
                if(k==firstinteraction)

            imin = (firstinteraction < imin) ? firstinteraction : imin;

        /* round down to j unrolling factor */
        imin = (imin/UNROLLJ)*UNROLLJ;
        for(i=ibase;i<imax;i++)
            max_offset = calc_maxoffset(i,natoms);

            for(j=0;j<ilist[F_GB12].nr;j+=3)
                a1 = ilist[F_GB12].iatoms[j+1];
                a2 = ilist[F_GB12].iatoms[j+2];
                if( k+natoms <= max_offset )
                max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;

            for(j=0;j<ilist[F_GB13].nr;j+=3)
                a1 = ilist[F_GB13].iatoms[j+1];
                a2 = ilist[F_GB13].iatoms[j+2];
                if( k+natoms <= max_offset )
                max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;

            for(j=0;j<ilist[F_GB14].nr;j+=3)
                a1 = ilist[F_GB14].iatoms[j+1];
                a2 = ilist[F_GB14].iatoms[j+2];
                if( k+natoms <= max_offset )
                max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
        /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */
        /* round up to j unrolling factor */
        max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;

        /* Set all the prologue mask lengths to this value (even for i>end) */
        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            aadata->jindex_gb[4*i]   = imin;
            aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
        }
    /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
        for(i=ibase;i<ibase+UNROLLI;i++)
            nj   = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
            imin = aadata->jindex_gb[4*i];

            /* Allocate aligned memory */
            snew(pi,2*(nj+2*SIMD_WIDTH));
            aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i,natoms);

            /* Include interactions i+1 <= j <= i+maxoffset */
                if( (j>i) && (j<=i+max_offset) )
                {
                    aadata->prologue_mask_gb[i][2*k]   = 0xFFFFFFFF;
                    aadata->prologue_mask_gb[i][2*k+1] = 0xFFFFFFFF;
                }
                else
                {
                    aadata->prologue_mask_gb[i][2*k]   = 0;
                    aadata->prologue_mask_gb[i][2*k+1] = 0;
                }
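            /* NOTE: at this point prologue_mask_gb[i] only encodes the triangle condition
             * i < j <= i+maxoffset (all ones = interact, all zeros = skip); the bonded
             * 1-2/1-3/1-4 exclusions are punched out of the mask in the loops below. As
             * with imask, every entry is duplicated (2*k and 2*k+1) so that a pair of
             * 32-bit words can be read as one 64-bit double-precision SSE2 mask.
             */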
            /* Clear out the explicit exclusions */
            for(j=0;j<ilist[F_GB12].nr;j+=3)
                a1 = ilist[F_GB12].iatoms[j+1];
                a2 = ilist[F_GB12].iatoms[j+2];
                if( k+natoms <= max_offset )
                aadata->prologue_mask_gb[i][2*k]   = 0;
                aadata->prologue_mask_gb[i][2*k+1] = 0;

            for(j=0;j<ilist[F_GB13].nr;j+=3)
                a1 = ilist[F_GB13].iatoms[j+1];
                a2 = ilist[F_GB13].iatoms[j+2];
                if( k+natoms <= max_offset )
                aadata->prologue_mask_gb[i][2*k]   = 0;
                aadata->prologue_mask_gb[i][2*k+1] = 0;

            for(j=0;j<ilist[F_GB14].nr;j+=3)
                a1 = ilist[F_GB14].iatoms[j+1];
                a2 = ilist[F_GB14].iatoms[j+2];
                if( k+natoms <= max_offset )
                aadata->prologue_mask_gb[i][2*k]   = 0;
                aadata->prologue_mask_gb[i][2*k+1] = 0;
    /* Construct the epilogue mask - this just contains the check for maxoffset */
    snew(aadata->epilogue_mask,natoms+UNROLLI);

    /* First zero everything to avoid uninitialized data */
    for(i=0;i<natoms+UNROLLI;i++)
    {
        aadata->jindex_gb[4*i+2] = aadata->jindex_gb[4*i+1];
        aadata->jindex_gb[4*i+3] = aadata->jindex_gb[4*i+1];
        aadata->epilogue_mask[i] = NULL;
    }
    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
        /* Find the lowest index for which we need to use the epilogue */
        max_offset = calc_maxoffset(imin,natoms);
        imin       = imin + 1 + max_offset;

        /* Find largest index for which we need to use the epilogue */
        imax       = ibase + UNROLLI-1;
        imax       = (imax < end) ? imax : end;

        max_offset = calc_maxoffset(imax,natoms);
        imax       = imax + 1 + max_offset + UNROLLJ - 1;

        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            /* Start of epilogue - round down to j tile limit */
            aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
            /* Make sure we don't overlap - for small systems everything is done in the prologue */
            aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
            /* Round upwards to j tile limit */
            aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
            /* Make sure we don't have a negative range for the epilogue */
            aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
        }
    /* And fill it with data... */
    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
        for(i=ibase;i<ibase+UNROLLI;i++)
            nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];

            /* Allocate aligned memory */
            snew(pi,2*(nj+2*SIMD_WIDTH));
            aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i,natoms);

                j = aadata->jindex_gb[4*i+2] + k;
                aadata->epilogue_mask[i][2*k]   = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
                aadata->epilogue_mask[i][2*k+1] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
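/* NOTE: after this routine the aadata structure should contain everything the SSE2 kernels
 * below stream over: imask (is this i atom locally included?), jindex_gb (the per-atom
 * prologue/main/epilogue j ranges) and the prologue_mask_gb/epilogue_mask arrays used in
 * the two short masked ranges.
 */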
genborn_allvsall_setup(gmx_allvsallgb2_data_t **  p_aadata,
                       gmx_localtop_t         *   top,
                       gmx_genborn_t          *   born,
                       double                     radius_offset,

    gmx_allvsallgb2_data_t *aadata;

    natoms = mdatoms->nr;

    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->x_align  = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->y_align  = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->z_align  = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->fx_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->fy_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->fz_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->gb_radius = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->workparam = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->work      = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    for(i=0;i<mdatoms->nr;i++)
    {
        aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
        if(gb_algorithm==egbSTILL)
        {
            aadata->workparam[i] = born->vsolv[i];
        }
        else if(gb_algorithm==egbOBC)
        {
            aadata->workparam[i] = born->param[i];
        }
        aadata->work[i] = 0.0;
    }
    for(i=0;i<mdatoms->nr;i++)
    {
        aadata->gb_radius[natoms+i] = aadata->gb_radius[i];
        aadata->workparam[natoms+i] = aadata->workparam[i];
        aadata->work[natoms+i]      = aadata->work[i];
    }

    for(i=0;i<2*natoms+SIMD_WIDTH;i++)
    {
        aadata->x_align[i]  = 0.0;
        aadata->y_align[i]  = 0.0;
        aadata->z_align[i]  = 0.0;
        aadata->fx_align[i] = 0.0;
        aadata->fy_align[i] = 0.0;
        aadata->fz_align[i] = 0.0;
    }

    setup_gb_exclusions_and_indices(aadata,top->idef.il,mdatoms->start,mdatoms->start+mdatoms->homenr,mdatoms->nr,
                                    bInclude12,bInclude13,bInclude14);
/*
 * This routine apparently hits a compiler bug Visual Studio has had 'forever'.
 * It is present both in VS2005 and VS2008, and the only way around it is to
 * decrease optimization. We do that with a pragma, and only for MSVC, so it
 * will not hurt any of the well-behaving and supported compilers out there.
 * MS: Fix your compiler, it sucks like a black hole!
 */
#pragma optimize("t",off)
genborn_allvsall_calc_still_radii_sse2_double(t_forcerec *     fr,
                                              gmx_genborn_t *  born,
                                              gmx_localtop_t * top,

    gmx_allvsallgb2_data_t *aadata;
    double        irsq,idr4,idr6;
    double        raj,rvdw,ratio;
    double        vaj,ccf,dccf,theta,cosq;
    double        term,prod,icf4,icf6,gpi2,factor,sinq;
    __m128d       ix_SSE0,iy_SSE0,iz_SSE0;
    __m128d       ix_SSE1,iy_SSE1,iz_SSE1;
    __m128d       gpi_SSE0,rai_SSE0,prod_ai_SSE0;
    __m128d       gpi_SSE1,rai_SSE1,prod_ai_SSE1;
    __m128d       imask_SSE0,jmask_SSE0;
    __m128d       imask_SSE1,jmask_SSE1;
    __m128d       jx_SSE,jy_SSE,jz_SSE;
    __m128d       dx_SSE0,dy_SSE0,dz_SSE0;
    __m128d       dx_SSE1,dy_SSE1,dz_SSE1;
    __m128d       rsq_SSE0,rinv_SSE0,irsq_SSE0,idr4_SSE0,idr6_SSE0;
    __m128d       rsq_SSE1,rinv_SSE1,irsq_SSE1,idr4_SSE1,idr6_SSE1;
    __m128d       raj_SSE,vaj_SSE,prod_SSE;
    __m128d       rvdw_SSE0,ratio_SSE0;
    __m128d       rvdw_SSE1,ratio_SSE1;
    __m128d       theta_SSE0,sinq_SSE0,cosq_SSE0,term_SSE0;
    __m128d       theta_SSE1,sinq_SSE1,cosq_SSE1,term_SSE1;
    __m128d       ccf_SSE0,dccf_SSE0;
    __m128d       ccf_SSE1,dccf_SSE1;
    __m128d       icf4_SSE0,icf6_SSE0;
    __m128d       icf4_SSE1,icf6_SSE1;
    __m128d       half_SSE,one_SSE,two_SSE,four_SSE;
    __m128d       still_p4_SSE,still_p5inv_SSE,still_pip5_SSE;
    natoms = mdatoms->nr;
    ni0    = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
    ni1    = mdatoms->start+mdatoms->homenr;

    aadata = *((gmx_allvsallgb2_data_t **)paadata);

    genborn_allvsall_setup(&aadata,top,born,mdatoms,0.0,
                           egbSTILL,FALSE,FALSE,TRUE);
    *((gmx_allvsallgb2_data_t **)paadata) = aadata;

    x_align   = aadata->x_align;
    y_align   = aadata->y_align;
    z_align   = aadata->z_align;

    gb_radius = aadata->gb_radius;
    vsolv     = aadata->workparam;

    jindex    = aadata->jindex_gb;

    still_p4_SSE    = _mm_set1_pd(STILL_P4);
    still_p5inv_SSE = _mm_set1_pd(STILL_P5INV);
    still_pip5_SSE  = _mm_set1_pd(STILL_PIP5);
    half_SSE        = _mm_set1_pd(0.5);
    one_SSE         = _mm_set1_pd(1.0);
    two_SSE         = _mm_set1_pd(2.0);
    four_SSE        = _mm_set1_pd(4.0);
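    /* NOTE: a scalar sketch of what the vectorized loops below appear to compute per i,j
     * pair (names as in the code; STILL_P4/P5INV/PIP5 are the usual Still parameters):
     *
     *   ratio = r^2/(rai+raj)^2, capped at STILL_P5INV;   theta = ratio*STILL_PIP5;
     *   term  = 0.5*(1.0 - cos(theta));  ccf = term*term;  dccf = 2.0*term*sin(theta)*theta;
     *   gpi  += STILL_P4*vsolv[j]*ccf/r^4        (and the symmetric term goes into work[j])
     *
     * with the chain-rule factors STILL_P4*vsolv*(4*ccf - dccf)/r^6 streamed out via dadx.
     */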
    /* This will be summed, so it has to extend to natoms + buffer */
    for(i=0;i<natoms+1+natoms/2;i++)

    for(i=ni0;i<ni1+1+natoms/2;i++)
        y_align[i] = x[3*k+1];
        z_align[i] = x[3*k+2];
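    /* NOTE: the loops above (and the analogous copies in the OBC kernel further down)
     * appear to replicate roughly the first half of the coordinates after index natoms,
     * so that an i atom near the end of the list can run its j loop past natoms without
     * any wrap-around (modulo) arithmetic inside the SIMD inner loops.
     */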
    for(i=ni0; i<ni1; i+=UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */
        /* Load i atom data */
        ix_SSE0      = _mm_load1_pd(x_align+i);
        iy_SSE0      = _mm_load1_pd(y_align+i);
        iz_SSE0      = _mm_load1_pd(z_align+i);
        ix_SSE1      = _mm_load1_pd(x_align+i+1);
        iy_SSE1      = _mm_load1_pd(y_align+i+1);
        iz_SSE1      = _mm_load1_pd(z_align+i+1);

        gpi_SSE0     = _mm_setzero_pd();
        gpi_SSE1     = _mm_setzero_pd();

        rai_SSE0     = _mm_load1_pd(gb_radius+i);
        rai_SSE1     = _mm_load1_pd(gb_radius+i+1);

        prod_ai_SSE0 = _mm_set1_pd(STILL_P4*vsolv[i]);
        prod_ai_SSE1 = _mm_set1_pd(STILL_P4*vsolv[i+1]);

        /* Load limits for loop over neighbors */
        pmask0       = aadata->prologue_mask_gb[i];
        pmask1       = aadata->prologue_mask_gb[i+1];
        emask0       = aadata->epilogue_mask[i];
        emask1       = aadata->epilogue_mask[i+1];

        imask_SSE0   = _mm_load1_pd((double *)(aadata->imask+2*i));
        imask_SSE1   = _mm_load1_pd((double *)(aadata->imask+2*i+2));
        /* Prologue part, including exclusion mask */
        for(j=nj0; j<nj1; j+=UNROLLJ)
        {
            jmask_SSE0 = _mm_load_pd((double *)pmask0);
            jmask_SSE1 = _mm_load_pd((double *)pmask1);

            /* load j atom coordinates */
            jx_SSE     = _mm_load_pd(x_align+j);
            jy_SSE     = _mm_load_pd(y_align+j);
            jz_SSE     = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0    = _mm_sub_pd(ix_SSE0,jx_SSE);
            dy_SSE0    = _mm_sub_pd(iy_SSE0,jy_SSE);
            dz_SSE0    = _mm_sub_pd(iz_SSE0,jz_SSE);
            dx_SSE1    = _mm_sub_pd(ix_SSE1,jx_SSE);
            dy_SSE1    = _mm_sub_pd(iy_SSE1,jy_SSE);
            dz_SSE1    = _mm_sub_pd(iz_SSE1,jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0   = gmx_mm_calc_rsq_pd(dx_SSE0,dy_SSE0,dz_SSE0);
            rsq_SSE1   = gmx_mm_calc_rsq_pd(dx_SSE1,dy_SSE1,dz_SSE1);

            jmask_SSE0 = _mm_and_pd(jmask_SSE0,imask_SSE0);
            jmask_SSE1 = _mm_and_pd(jmask_SSE1,imask_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0  = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1  = gmx_mm_invsqrt_pd(rsq_SSE1);

            rinv_SSE0  = _mm_and_pd(rinv_SSE0,jmask_SSE0);
            rinv_SSE1  = _mm_and_pd(rinv_SSE1,jmask_SSE1);

            irsq_SSE0  = _mm_mul_pd(rinv_SSE0,rinv_SSE0);
            irsq_SSE1  = _mm_mul_pd(rinv_SSE1,rinv_SSE1);
            idr4_SSE0  = _mm_mul_pd(irsq_SSE0,irsq_SSE0);
            idr4_SSE1  = _mm_mul_pd(irsq_SSE1,irsq_SSE1);
            idr6_SSE0  = _mm_mul_pd(idr4_SSE0,irsq_SSE0);
            idr6_SSE1  = _mm_mul_pd(idr4_SSE1,irsq_SSE1);

            raj_SSE    = _mm_load_pd(gb_radius+j);
            vaj_SSE    = _mm_load_pd(vsolv+j);

            rvdw_SSE0  = _mm_add_pd(rai_SSE0,raj_SSE);
            rvdw_SSE1  = _mm_add_pd(rai_SSE1,raj_SSE);

            ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0,rvdw_SSE0)));
            ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1,rvdw_SSE1)));

            ratio_SSE0 = _mm_min_pd(ratio_SSE0,still_p5inv_SSE);
            ratio_SSE1 = _mm_min_pd(ratio_SSE1,still_p5inv_SSE);
            theta_SSE0 = _mm_mul_pd(ratio_SSE0,still_pip5_SSE);
            theta_SSE1 = _mm_mul_pd(ratio_SSE1,still_pip5_SSE);
            gmx_mm_sincos_pd(theta_SSE0,&sinq_SSE0,&cosq_SSE0);
            gmx_mm_sincos_pd(theta_SSE1,&sinq_SSE1,&cosq_SSE1);
            term_SSE0  = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE0));
            term_SSE1  = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE1));
            ccf_SSE0   = _mm_mul_pd(term_SSE0,term_SSE0);
            ccf_SSE1   = _mm_mul_pd(term_SSE1,term_SSE1);
            dccf_SSE0  = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE0),
                                    _mm_mul_pd(sinq_SSE0,theta_SSE0));
            dccf_SSE1  = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE1),
                                    _mm_mul_pd(sinq_SSE1,theta_SSE1));

            prod_SSE   = _mm_mul_pd(still_p4_SSE,vaj_SSE);
            icf4_SSE0  = _mm_mul_pd(ccf_SSE0,idr4_SSE0);
            icf4_SSE1  = _mm_mul_pd(ccf_SSE1,idr4_SSE1);
            icf6_SSE0  = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
            icf6_SSE1  = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0,icf4_SSE0),
                                                       _mm_mul_pd(prod_ai_SSE1,icf4_SSE1))));

            gpi_SSE0   = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE,icf4_SSE0));
            gpi_SSE1   = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE,icf4_SSE1));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE0));
            _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE1));

            _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE0,icf6_SSE0));
            _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE1,icf6_SSE1));
        }
        /* Main part, no exclusions */
        for(j=nj1; j<nj2; j+=UNROLLJ)
        {
            /* load j atom coordinates */
            jx_SSE     = _mm_load_pd(x_align+j);
            jy_SSE     = _mm_load_pd(y_align+j);
            jz_SSE     = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0    = _mm_sub_pd(ix_SSE0,jx_SSE);
            dy_SSE0    = _mm_sub_pd(iy_SSE0,jy_SSE);
            dz_SSE0    = _mm_sub_pd(iz_SSE0,jz_SSE);
            dx_SSE1    = _mm_sub_pd(ix_SSE1,jx_SSE);
            dy_SSE1    = _mm_sub_pd(iy_SSE1,jy_SSE);
            dz_SSE1    = _mm_sub_pd(iz_SSE1,jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0   = gmx_mm_calc_rsq_pd(dx_SSE0,dy_SSE0,dz_SSE0);
            rsq_SSE1   = gmx_mm_calc_rsq_pd(dx_SSE1,dy_SSE1,dz_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0  = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1  = gmx_mm_invsqrt_pd(rsq_SSE1);

            rinv_SSE0  = _mm_and_pd(rinv_SSE0,imask_SSE0);
            rinv_SSE1  = _mm_and_pd(rinv_SSE1,imask_SSE1);

            irsq_SSE0  = _mm_mul_pd(rinv_SSE0,rinv_SSE0);
            irsq_SSE1  = _mm_mul_pd(rinv_SSE1,rinv_SSE1);
            idr4_SSE0  = _mm_mul_pd(irsq_SSE0,irsq_SSE0);
            idr4_SSE1  = _mm_mul_pd(irsq_SSE1,irsq_SSE1);
            idr6_SSE0  = _mm_mul_pd(idr4_SSE0,irsq_SSE0);
            idr6_SSE1  = _mm_mul_pd(idr4_SSE1,irsq_SSE1);

            raj_SSE    = _mm_load_pd(gb_radius+j);

            rvdw_SSE0  = _mm_add_pd(rai_SSE0,raj_SSE);
            rvdw_SSE1  = _mm_add_pd(rai_SSE1,raj_SSE);
            vaj_SSE    = _mm_load_pd(vsolv+j);

            ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0,rvdw_SSE0)));
            ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1,rvdw_SSE1)));

            ratio_SSE0 = _mm_min_pd(ratio_SSE0,still_p5inv_SSE);
            ratio_SSE1 = _mm_min_pd(ratio_SSE1,still_p5inv_SSE);
            theta_SSE0 = _mm_mul_pd(ratio_SSE0,still_pip5_SSE);
            theta_SSE1 = _mm_mul_pd(ratio_SSE1,still_pip5_SSE);
            gmx_mm_sincos_pd(theta_SSE0,&sinq_SSE0,&cosq_SSE0);
            gmx_mm_sincos_pd(theta_SSE1,&sinq_SSE1,&cosq_SSE1);
            term_SSE0  = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE0));
            term_SSE1  = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE1));
            ccf_SSE0   = _mm_mul_pd(term_SSE0,term_SSE0);
            ccf_SSE1   = _mm_mul_pd(term_SSE1,term_SSE1);
            dccf_SSE0  = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE0),
                                    _mm_mul_pd(sinq_SSE0,theta_SSE0));
            dccf_SSE1  = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE1),
                                    _mm_mul_pd(sinq_SSE1,theta_SSE1));

            prod_SSE   = _mm_mul_pd(still_p4_SSE,vaj_SSE);
            icf4_SSE0  = _mm_mul_pd(ccf_SSE0,idr4_SSE0);
            icf4_SSE1  = _mm_mul_pd(ccf_SSE1,idr4_SSE1);
            icf6_SSE0  = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
            icf6_SSE1  = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0,icf4_SSE0),
                                                       _mm_mul_pd(prod_ai_SSE1,icf4_SSE1))));

            gpi_SSE0   = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE,icf4_SSE0));
            gpi_SSE1   = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE,icf4_SSE1));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE0));
            _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE1));

            _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE0,icf6_SSE0));
            _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE1,icf6_SSE1));
        }
        /* Epilogue part, including exclusion mask */
        for(j=nj2; j<nj3; j+=UNROLLJ)
        {
            jmask_SSE0 = _mm_load_pd((double *)emask0);
            jmask_SSE1 = _mm_load_pd((double *)emask1);
            emask0    += 2*UNROLLJ;
            emask1    += 2*UNROLLJ;

            /* load j atom coordinates */
            jx_SSE     = _mm_load_pd(x_align+j);
            jy_SSE     = _mm_load_pd(y_align+j);
            jz_SSE     = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0    = _mm_sub_pd(ix_SSE0,jx_SSE);
            dy_SSE0    = _mm_sub_pd(iy_SSE0,jy_SSE);
            dz_SSE0    = _mm_sub_pd(iz_SSE0,jz_SSE);
            dx_SSE1    = _mm_sub_pd(ix_SSE1,jx_SSE);
            dy_SSE1    = _mm_sub_pd(iy_SSE1,jy_SSE);
            dz_SSE1    = _mm_sub_pd(iz_SSE1,jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0   = gmx_mm_calc_rsq_pd(dx_SSE0,dy_SSE0,dz_SSE0);
            rsq_SSE1   = gmx_mm_calc_rsq_pd(dx_SSE1,dy_SSE1,dz_SSE1);

            jmask_SSE0 = _mm_and_pd(jmask_SSE0,imask_SSE0);
            jmask_SSE1 = _mm_and_pd(jmask_SSE1,imask_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0  = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1  = gmx_mm_invsqrt_pd(rsq_SSE1);

            rinv_SSE0  = _mm_and_pd(rinv_SSE0,jmask_SSE0);
            rinv_SSE1  = _mm_and_pd(rinv_SSE1,jmask_SSE1);

            irsq_SSE0  = _mm_mul_pd(rinv_SSE0,rinv_SSE0);
            irsq_SSE1  = _mm_mul_pd(rinv_SSE1,rinv_SSE1);
            idr4_SSE0  = _mm_mul_pd(irsq_SSE0,irsq_SSE0);
            idr4_SSE1  = _mm_mul_pd(irsq_SSE1,irsq_SSE1);
            idr6_SSE0  = _mm_mul_pd(idr4_SSE0,irsq_SSE0);
            idr6_SSE1  = _mm_mul_pd(idr4_SSE1,irsq_SSE1);

            raj_SSE    = _mm_load_pd(gb_radius+j);
            vaj_SSE    = _mm_load_pd(vsolv+j);

            rvdw_SSE0  = _mm_add_pd(rai_SSE0,raj_SSE);
            rvdw_SSE1  = _mm_add_pd(rai_SSE1,raj_SSE);

            ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0,rvdw_SSE0)));
            ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1,rvdw_SSE1)));

            ratio_SSE0 = _mm_min_pd(ratio_SSE0,still_p5inv_SSE);
            ratio_SSE1 = _mm_min_pd(ratio_SSE1,still_p5inv_SSE);
            theta_SSE0 = _mm_mul_pd(ratio_SSE0,still_pip5_SSE);
            theta_SSE1 = _mm_mul_pd(ratio_SSE1,still_pip5_SSE);
            gmx_mm_sincos_pd(theta_SSE0,&sinq_SSE0,&cosq_SSE0);
            gmx_mm_sincos_pd(theta_SSE1,&sinq_SSE1,&cosq_SSE1);
            term_SSE0  = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE0));
            term_SSE1  = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE1));
            ccf_SSE0   = _mm_mul_pd(term_SSE0,term_SSE0);
            ccf_SSE1   = _mm_mul_pd(term_SSE1,term_SSE1);
            dccf_SSE0  = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE0),
                                    _mm_mul_pd(sinq_SSE0,theta_SSE0));
            dccf_SSE1  = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE1),
                                    _mm_mul_pd(sinq_SSE1,theta_SSE1));

            prod_SSE   = _mm_mul_pd(still_p4_SSE,vaj_SSE);
            icf4_SSE0  = _mm_mul_pd(ccf_SSE0,idr4_SSE0);
            icf4_SSE1  = _mm_mul_pd(ccf_SSE1,idr4_SSE1);
            icf6_SSE0  = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
            icf6_SSE1  = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0,icf4_SSE0),
                                                       _mm_mul_pd(prod_ai_SSE1,icf4_SSE1))));

            gpi_SSE0   = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE,icf4_SSE0));
            gpi_SSE1   = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE,icf4_SSE1));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE0));
            _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE1));

            _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE0,icf6_SSE0));
            _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE1,icf6_SSE1));
        }
        GMX_MM_TRANSPOSE2_PD(gpi_SSE0,gpi_SSE1);
        gpi_SSE0 = _mm_add_pd(gpi_SSE0,gpi_SSE1);
        _mm_store_pd(work+i, _mm_add_pd(gpi_SSE0, _mm_load_pd(work+i)));
    }

    /* In case we have written anything beyond natoms, move it back.
     * Never mind that we leave stuff above natoms; that will not
     * be accessed later in the routine.
     * In principle this should be a move rather than sum, but this
     * way we don't have to worry about even/odd offsets...
     */
    for(i=natoms;i<ni1+1+natoms/2;i++)
    {
        work[i-natoms] += work[i];
    }

    /* Parallel summations */
    gmx_sum(natoms,work,cr);

    factor = 0.5 * ONE_4PI_EPS0;
    /* Calculate the radii - should we do all atoms, or just our local ones? */
    for(i=0;i<natoms;i++)
    {
        if(born->use[i] != 0)
        {
            gpi  = born->gpol[i]+work[i];
            gpi2 = gpi*gpi;
            born->bRad[i]   = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        }
    }
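    /* NOTE: with gpi = gpol[i] plus the pairwise terms accumulated in work[], the loop
     * above effectively sets bRad[i] = 0.5*ONE_4PI_EPS0/|gpi| (i.e. factor/sqrt(gpi^2))
     * and caches 1/sqrt(bRad[i]) in fr->invsqrta[] for the GB energy/force kernels.
     */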
/* Reinstate MSVC optimization */
#pragma optimize("",on)
genborn_allvsall_calc_hct_obc_radii_sse2_double(t_forcerec *     fr,
                                                t_mdatoms *      mdatoms,
                                                gmx_genborn_t *  born,
                                                gmx_localtop_t * top,

    gmx_allvsallgb2_data_t *aadata;
    int           nj0,nj1,nj2,nj3;
    double        rai,rai_inv,rai_inv2,sum_ai,sum_ai2,sum_ai3,tsum,tchain;
    __m128d       ix_SSE0,iy_SSE0,iz_SSE0;
    __m128d       ix_SSE1,iy_SSE1,iz_SSE1;
    __m128d       gpi_SSE0,rai_SSE0,prod_ai_SSE0;
    __m128d       gpi_SSE1,rai_SSE1,prod_ai_SSE1;
    __m128d       imask_SSE0,jmask_SSE0;
    __m128d       imask_SSE1,jmask_SSE1;
    __m128d       jx_SSE,jy_SSE,jz_SSE;
    __m128d       dx_SSE0,dy_SSE0,dz_SSE0;
    __m128d       dx_SSE1,dy_SSE1,dz_SSE1;
    __m128d       rsq_SSE0,rinv_SSE0,irsq_SSE0,idr4_SSE0,idr6_SSE0;
    __m128d       rsq_SSE1,rinv_SSE1,irsq_SSE1,idr4_SSE1,idr6_SSE1;
    __m128d       raj_SSE,raj_inv_SSE,sk_aj_SSE,sk2_aj_SSE;
    __m128d       ccf_SSE0,dccf_SSE0,prod_SSE0;
    __m128d       ccf_SSE1,dccf_SSE1,prod_SSE1;
    __m128d       icf4_SSE0,icf6_SSE0;
    __m128d       icf4_SSE1,icf6_SSE1;
    __m128d       oneeighth_SSE,onefourth_SSE,half_SSE,one_SSE,two_SSE,four_SSE;
    __m128d       still_p4_SSE,still_p5inv_SSE,still_pip5_SSE;
    __m128d       rai_inv_SSE0;
    __m128d       rai_inv_SSE1;
    __m128d       sk_ai_SSE0,sk2_ai_SSE0,sum_ai_SSE0;
    __m128d       sk_ai_SSE1,sk2_ai_SSE1,sum_ai_SSE1;
    __m128d       lij_inv_SSE0,sk2_rinv_SSE0;
    __m128d       lij_inv_SSE1,sk2_rinv_SSE1;
    __m128d       t1_SSE0,t2_SSE0,t3_SSE0,t4_SSE0;
    __m128d       t1_SSE1,t2_SSE1,t3_SSE1,t4_SSE1;
    __m128d       obc_mask1_SSE0,obc_mask2_SSE0,obc_mask3_SSE0;
    __m128d       obc_mask1_SSE1,obc_mask2_SSE1,obc_mask3_SSE1;
    __m128d       uij_SSE0,uij2_SSE0,uij3_SSE0;
    __m128d       uij_SSE1,uij2_SSE1,uij3_SSE1;
    __m128d       lij_SSE0,lij2_SSE0,lij3_SSE0;
    __m128d       lij_SSE1,lij2_SSE1,lij3_SSE1;
    __m128d       dlij_SSE0,diff2_SSE0,logterm_SSE0;
    __m128d       dlij_SSE1,diff2_SSE1,logterm_SSE1;
    __m128d       doffset_SSE,tmpSSE;
    natoms = mdatoms->nr;
    ni0    = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
    ni1    = mdatoms->start+mdatoms->homenr;

    aadata = *((gmx_allvsallgb2_data_t **)paadata);

    genborn_allvsall_setup(&aadata,top,born,mdatoms,born->gb_doffset,
                           egbOBC,TRUE,TRUE,TRUE);
    *((gmx_allvsallgb2_data_t **)paadata) = aadata;

    x_align   = aadata->x_align;
    y_align   = aadata->y_align;
    z_align   = aadata->z_align;

    gb_radius = aadata->gb_radius;
    work      = aadata->work;
    jindex    = aadata->jindex_gb;

    obc_param = aadata->workparam;

    oneeighth_SSE = _mm_set1_pd(0.125);
    onefourth_SSE = _mm_set1_pd(0.25);
    half_SSE      = _mm_set1_pd(0.5);
    one_SSE       = _mm_set1_pd(1.0);
    two_SSE       = _mm_set1_pd(2.0);
    four_SSE      = _mm_set1_pd(4.0);
    doffset_SSE   = _mm_set1_pd(born->gb_doffset);

    for(i=0;i<natoms;i++)
    {
        x_align[i] = x[3*i];
        y_align[i] = x[3*i+1];
        z_align[i] = x[3*i+2];
    }

    for(i=0;i<natoms/2+1;i++)
    {
        x_align[natoms+i] = x_align[i];
        y_align[natoms+i] = y_align[i];
        z_align[natoms+i] = z_align[i];
    }

    for(i=0;i<natoms+natoms/2+1;i++)
    for(i=ni0; i<ni1; i+= UNROLLI)
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0      = _mm_load1_pd(x_align+i);
        iy_SSE0      = _mm_load1_pd(y_align+i);
        iz_SSE0      = _mm_load1_pd(z_align+i);
        ix_SSE1      = _mm_load1_pd(x_align+i+1);
        iy_SSE1      = _mm_load1_pd(y_align+i+1);
        iz_SSE1      = _mm_load1_pd(z_align+i+1);

        rai_SSE0     = _mm_load1_pd(gb_radius+i);
        rai_SSE1     = _mm_load1_pd(gb_radius+i+1);
        rai_inv_SSE0 = gmx_mm_inv_pd(rai_SSE0);
        rai_inv_SSE1 = gmx_mm_inv_pd(rai_SSE1);

        sk_ai_SSE0   = _mm_load1_pd(obc_param+i);
        sk_ai_SSE1   = _mm_load1_pd(obc_param+i+1);
        sk2_ai_SSE0  = _mm_mul_pd(sk_ai_SSE0,sk_ai_SSE0);
        sk2_ai_SSE1  = _mm_mul_pd(sk_ai_SSE1,sk_ai_SSE1);

        sum_ai_SSE0  = _mm_setzero_pd();
        sum_ai_SSE1  = _mm_setzero_pd();

        /* Load limits for loop over neighbors */
        nj1          = jindex[4*i+1];
        nj2          = jindex[4*i+2];
        nj3          = jindex[4*i+3];

        pmask0       = aadata->prologue_mask_gb[i];
        pmask1       = aadata->prologue_mask_gb[i+1];
        emask0       = aadata->epilogue_mask[i];
        emask1       = aadata->epilogue_mask[i+1];

        imask_SSE0   = _mm_load1_pd((double *)(aadata->imask+2*i));
        imask_SSE1   = _mm_load1_pd((double *)(aadata->imask+2*i+2));
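        /* NOTE: the prologue/main/epilogue j loops that follow mirror the Still kernel
         * above, but per pair they accumulate the HCT/OBC descreening integral in both
         * directions: the aj->ai contribution into sum_ai_SSE0/1 and the symmetric ai->aj
         * contribution into work[j], with the chain-rule derivatives streamed out through
         * dadx. The obc_mask1/2/3 comparisons select the cases rai < dr+sk, rai < dr-sk
         * and rai < sk-dr of that integral.
         */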
1359 /* Prologue part, including exclusion mask */
1360 for(j
=nj0
; j
<nj1
; j
+=UNROLLJ
)
1362 jmask_SSE0
= _mm_load_pd((double *)pmask0
);
1363 jmask_SSE1
= _mm_load_pd((double *)pmask1
);
1364 pmask0
+= 2*UNROLLJ
;
1365 pmask1
+= 2*UNROLLJ
;
1367 /* load j atom coordinates */
1368 jx_SSE
= _mm_load_pd(x_align
+j
);
1369 jy_SSE
= _mm_load_pd(y_align
+j
);
1370 jz_SSE
= _mm_load_pd(z_align
+j
);
1372 /* Calculate distance */
1373 dx_SSE0
= _mm_sub_pd(ix_SSE0
,jx_SSE
);
1374 dy_SSE0
= _mm_sub_pd(iy_SSE0
,jy_SSE
);
1375 dz_SSE0
= _mm_sub_pd(iz_SSE0
,jz_SSE
);
1376 dx_SSE1
= _mm_sub_pd(ix_SSE1
,jx_SSE
);
1377 dy_SSE1
= _mm_sub_pd(iy_SSE1
,jy_SSE
);
1378 dz_SSE1
= _mm_sub_pd(iz_SSE1
,jz_SSE
);
1380 /* rsq = dx*dx+dy*dy+dz*dz */
1381 rsq_SSE0
= gmx_mm_calc_rsq_pd(dx_SSE0
,dy_SSE0
,dz_SSE0
);
1382 rsq_SSE1
= gmx_mm_calc_rsq_pd(dx_SSE1
,dy_SSE1
,dz_SSE1
);
1385 jmask_SSE0
= _mm_and_pd(jmask_SSE0
,imask_SSE0
);
1386 jmask_SSE1
= _mm_and_pd(jmask_SSE1
,imask_SSE1
);
1388 /* Calculate 1/r and 1/r2 */
1389 rinv_SSE0
= gmx_mm_invsqrt_pd(rsq_SSE0
);
1390 rinv_SSE1
= gmx_mm_invsqrt_pd(rsq_SSE1
);
1393 rinv_SSE0
= _mm_and_pd(rinv_SSE0
,jmask_SSE0
);
1394 rinv_SSE1
= _mm_and_pd(rinv_SSE1
,jmask_SSE1
);
1396 dr_SSE0
= _mm_mul_pd(rsq_SSE0
,rinv_SSE0
);
1397 dr_SSE1
= _mm_mul_pd(rsq_SSE1
,rinv_SSE1
);
1399 sk_aj_SSE
= _mm_load_pd(obc_param
+j
);
1400 raj_SSE
= _mm_load_pd(gb_radius
+j
);
1401 raj_inv_SSE
= gmx_mm_inv_pd(raj_SSE
);
1403 /* Evaluate influence of atom aj -> ai */
1404 t1_SSE0
= _mm_add_pd(dr_SSE0
,sk_aj_SSE
);
1405 t1_SSE1
= _mm_add_pd(dr_SSE1
,sk_aj_SSE
);
1406 t2_SSE0
= _mm_sub_pd(dr_SSE0
,sk_aj_SSE
);
1407 t2_SSE1
= _mm_sub_pd(dr_SSE1
,sk_aj_SSE
);
1408 t3_SSE0
= _mm_sub_pd(sk_aj_SSE
,dr_SSE0
);
1409 t3_SSE1
= _mm_sub_pd(sk_aj_SSE
,dr_SSE1
);
1411 obc_mask1_SSE0
= _mm_cmplt_pd(rai_SSE0
, t1_SSE0
);
1412 obc_mask1_SSE1
= _mm_cmplt_pd(rai_SSE1
, t1_SSE1
);
1413 obc_mask2_SSE0
= _mm_cmplt_pd(rai_SSE0
, t2_SSE0
);
1414 obc_mask2_SSE1
= _mm_cmplt_pd(rai_SSE1
, t2_SSE1
);
1415 obc_mask3_SSE0
= _mm_cmplt_pd(rai_SSE0
, t3_SSE0
);
1416 obc_mask3_SSE1
= _mm_cmplt_pd(rai_SSE1
, t3_SSE1
);
1417 obc_mask1_SSE0
= _mm_and_pd(obc_mask1_SSE0
,jmask_SSE0
);
1418 obc_mask1_SSE1
= _mm_and_pd(obc_mask1_SSE1
,jmask_SSE1
);
1420 uij_SSE0
= gmx_mm_inv_pd(t1_SSE0
);
1421 uij_SSE1
= gmx_mm_inv_pd(t1_SSE1
);
1422 lij_SSE0
= _mm_or_pd( _mm_and_pd(obc_mask2_SSE0
,gmx_mm_inv_pd(t2_SSE0
)),
1423 _mm_andnot_pd(obc_mask2_SSE0
,rai_inv_SSE0
));
1424 lij_SSE1
= _mm_or_pd( _mm_and_pd(obc_mask2_SSE1
,gmx_mm_inv_pd(t2_SSE1
)),
1425 _mm_andnot_pd(obc_mask2_SSE1
,rai_inv_SSE1
));
1426 dlij_SSE0
= _mm_and_pd(one_SSE
,obc_mask2_SSE0
);
1427 dlij_SSE1
= _mm_and_pd(one_SSE
,obc_mask2_SSE1
);
1429 uij2_SSE0
= _mm_mul_pd(uij_SSE0
, uij_SSE0
);
1430 uij2_SSE1
= _mm_mul_pd(uij_SSE1
, uij_SSE1
);
1431 uij3_SSE0
= _mm_mul_pd(uij2_SSE0
,uij_SSE0
);
1432 uij3_SSE1
= _mm_mul_pd(uij2_SSE1
,uij_SSE1
);
1433 lij2_SSE0
= _mm_mul_pd(lij_SSE0
, lij_SSE0
);
1434 lij2_SSE1
= _mm_mul_pd(lij_SSE1
, lij_SSE1
);
1435 lij3_SSE0
= _mm_mul_pd(lij2_SSE0
,lij_SSE0
);
1436 lij3_SSE1
= _mm_mul_pd(lij2_SSE1
,lij_SSE1
);
1438 diff2_SSE0
= _mm_sub_pd(uij2_SSE0
,lij2_SSE0
);
1439 diff2_SSE1
= _mm_sub_pd(uij2_SSE1
,lij2_SSE1
);
1440 lij_inv_SSE0
= gmx_mm_invsqrt_pd(lij2_SSE0
);
1441 lij_inv_SSE1
= gmx_mm_invsqrt_pd(lij2_SSE1
);
1442 sk2_aj_SSE
= _mm_mul_pd(sk_aj_SSE
,sk_aj_SSE
);
1443 sk2_rinv_SSE0
= _mm_mul_pd(sk2_aj_SSE
,rinv_SSE0
);
1444 sk2_rinv_SSE1
= _mm_mul_pd(sk2_aj_SSE
,rinv_SSE1
);
1445 prod_SSE0
= _mm_mul_pd(onefourth_SSE
,sk2_rinv_SSE0
);
1446 prod_SSE1
= _mm_mul_pd(onefourth_SSE
,sk2_rinv_SSE1
);
1448 logterm_SSE0
= gmx_mm_log_pd(_mm_mul_pd(uij_SSE0
,lij_inv_SSE0
));
1449 logterm_SSE1
= gmx_mm_log_pd(_mm_mul_pd(uij_SSE1
,lij_inv_SSE1
));
1451 t1_SSE0
= _mm_sub_pd(lij_SSE0
,uij_SSE0
);
1452 t1_SSE1
= _mm_sub_pd(lij_SSE1
,uij_SSE1
);
1453 t2_SSE0
= _mm_mul_pd(diff2_SSE0
,
1454 _mm_sub_pd(_mm_mul_pd(onefourth_SSE
,dr_SSE0
),
1456 t2_SSE1
= _mm_mul_pd(diff2_SSE1
,
1457 _mm_sub_pd(_mm_mul_pd(onefourth_SSE
,dr_SSE1
),
1460 t3_SSE0
= _mm_mul_pd(half_SSE
,_mm_mul_pd(rinv_SSE0
,logterm_SSE0
));
1461 t3_SSE1
= _mm_mul_pd(half_SSE
,_mm_mul_pd(rinv_SSE1
,logterm_SSE1
));
1462 t1_SSE0
= _mm_add_pd(t1_SSE0
,_mm_add_pd(t2_SSE0
,t3_SSE0
));
1463 t1_SSE1
= _mm_add_pd(t1_SSE1
,_mm_add_pd(t2_SSE1
,t3_SSE1
));
1464 t4_SSE0
= _mm_mul_pd(two_SSE
,_mm_sub_pd(rai_inv_SSE0
,lij_SSE0
));
1465 t4_SSE1
= _mm_mul_pd(two_SSE
,_mm_sub_pd(rai_inv_SSE1
,lij_SSE1
));
1466 t4_SSE0
= _mm_and_pd(t4_SSE0
,obc_mask3_SSE0
);
1467 t4_SSE1
= _mm_and_pd(t4_SSE1
,obc_mask3_SSE1
);
1468 t1_SSE0
= _mm_mul_pd(half_SSE
,_mm_add_pd(t1_SSE0
,t4_SSE0
));
1469 t1_SSE1
= _mm_mul_pd(half_SSE
,_mm_add_pd(t1_SSE1
,t4_SSE1
));
1471 sum_ai_SSE0
= _mm_add_pd(sum_ai_SSE0
,_mm_and_pd(t1_SSE0
,obc_mask1_SSE0
));
1472 sum_ai_SSE1
= _mm_add_pd(sum_ai_SSE1
,_mm_and_pd(t1_SSE1
,obc_mask1_SSE1
));
1474 t1_SSE0
= _mm_add_pd(_mm_mul_pd(half_SSE
,lij2_SSE0
),
1475 _mm_mul_pd(prod_SSE0
,lij3_SSE0
));
1476 t1_SSE1
= _mm_add_pd(_mm_mul_pd(half_SSE
,lij2_SSE1
),
1477 _mm_mul_pd(prod_SSE1
,lij3_SSE1
));
1478 t1_SSE0
= _mm_sub_pd(t1_SSE0
,
1479 _mm_mul_pd(onefourth_SSE
,
1480 _mm_add_pd(_mm_mul_pd(lij_SSE0
,rinv_SSE0
),
1481 _mm_mul_pd(lij3_SSE0
,dr_SSE0
))));
1482 t1_SSE1
= _mm_sub_pd(t1_SSE1
,
1483 _mm_mul_pd(onefourth_SSE
,
1484 _mm_add_pd(_mm_mul_pd(lij_SSE1
,rinv_SSE1
),
1485 _mm_mul_pd(lij3_SSE1
,dr_SSE1
))));
1487 t2_SSE0
= _mm_mul_pd(onefourth_SSE
,
1488 _mm_add_pd(_mm_mul_pd(uij_SSE0
,rinv_SSE0
),
1489 _mm_mul_pd(uij3_SSE0
,dr_SSE0
)));
1490 t2_SSE1
= _mm_mul_pd(onefourth_SSE
,
1491 _mm_add_pd(_mm_mul_pd(uij_SSE1
,rinv_SSE1
),
1492 _mm_mul_pd(uij3_SSE1
,dr_SSE1
)));
1493 t2_SSE0
= _mm_sub_pd(t2_SSE0
,
1494 _mm_add_pd(_mm_mul_pd(half_SSE
,uij2_SSE0
),
1495 _mm_mul_pd(prod_SSE0
,uij3_SSE0
)));
1496 t2_SSE1
= _mm_sub_pd(t2_SSE1
,
1497 _mm_add_pd(_mm_mul_pd(half_SSE
,uij2_SSE1
),
1498 _mm_mul_pd(prod_SSE1
,uij3_SSE1
)));
1499 t3_SSE0
= _mm_mul_pd(_mm_mul_pd(onefourth_SSE
,logterm_SSE0
),
1500 _mm_mul_pd(rinv_SSE0
,rinv_SSE0
));
1501 t3_SSE1
= _mm_mul_pd(_mm_mul_pd(onefourth_SSE
,logterm_SSE1
),
1502 _mm_mul_pd(rinv_SSE1
,rinv_SSE1
));
1503 t3_SSE0
= _mm_sub_pd(t3_SSE0
,
1504 _mm_mul_pd(_mm_mul_pd(diff2_SSE0
,oneeighth_SSE
),
1506 _mm_mul_pd(sk2_rinv_SSE0
,rinv_SSE0
))));
1507 t3_SSE1
= _mm_sub_pd(t3_SSE1
,
1508 _mm_mul_pd(_mm_mul_pd(diff2_SSE1
,oneeighth_SSE
),
1510 _mm_mul_pd(sk2_rinv_SSE1
,rinv_SSE1
))));
1512 t1_SSE0
= _mm_mul_pd(rinv_SSE0
,
1513 _mm_add_pd(_mm_mul_pd(dlij_SSE0
,t1_SSE0
),
1514 _mm_add_pd(t2_SSE0
,t3_SSE0
)));
1515 t1_SSE1
= _mm_mul_pd(rinv_SSE1
,
1516 _mm_add_pd(_mm_mul_pd(dlij_SSE1
,t1_SSE1
),
1517 _mm_add_pd(t2_SSE1
,t3_SSE1
)));
1519 _mm_store_pd(dadx
,_mm_and_pd(t1_SSE0
,obc_mask1_SSE0
));
1521 _mm_store_pd(dadx
,_mm_and_pd(t1_SSE1
,obc_mask1_SSE1
));
1524 /* Evaluate influence of atom ai -> aj */
1525 t1_SSE0
= _mm_add_pd(dr_SSE0
,sk_ai_SSE0
);
1526 t1_SSE1
= _mm_add_pd(dr_SSE1
,sk_ai_SSE1
);
1527 t2_SSE0
= _mm_sub_pd(dr_SSE0
,sk_ai_SSE0
);
1528 t2_SSE1
= _mm_sub_pd(dr_SSE1
,sk_ai_SSE1
);
1529 t3_SSE0
= _mm_sub_pd(sk_ai_SSE0
,dr_SSE0
);
1530 t3_SSE1
= _mm_sub_pd(sk_ai_SSE1
,dr_SSE1
);
1532 obc_mask1_SSE0
= _mm_cmplt_pd(raj_SSE
, t1_SSE0
);
1533 obc_mask1_SSE1
= _mm_cmplt_pd(raj_SSE
, t1_SSE1
);
1534 obc_mask2_SSE0
= _mm_cmplt_pd(raj_SSE
, t2_SSE0
);
1535 obc_mask2_SSE1
= _mm_cmplt_pd(raj_SSE
, t2_SSE1
);
1536 obc_mask3_SSE0
= _mm_cmplt_pd(raj_SSE
, t3_SSE0
);
1537 obc_mask3_SSE1
= _mm_cmplt_pd(raj_SSE
, t3_SSE1
);
1538 obc_mask1_SSE0
= _mm_and_pd(obc_mask1_SSE0
,jmask_SSE0
);
1539 obc_mask1_SSE1
= _mm_and_pd(obc_mask1_SSE1
,jmask_SSE1
);
1541 uij_SSE0
= gmx_mm_inv_pd(t1_SSE0
);
1542 uij_SSE1
= gmx_mm_inv_pd(t1_SSE1
);
1543 lij_SSE0
= _mm_or_pd( _mm_and_pd(obc_mask2_SSE0
,gmx_mm_inv_pd(t2_SSE0
)),
1544 _mm_andnot_pd(obc_mask2_SSE0
,raj_inv_SSE
));
1545 lij_SSE1
= _mm_or_pd( _mm_and_pd(obc_mask2_SSE1
,gmx_mm_inv_pd(t2_SSE1
)),
1546 _mm_andnot_pd(obc_mask2_SSE1
,raj_inv_SSE
));
1547 dlij_SSE0
= _mm_and_pd(one_SSE
,obc_mask2_SSE0
);
1548 dlij_SSE1
= _mm_and_pd(one_SSE
,obc_mask2_SSE1
);
1550 uij2_SSE0
= _mm_mul_pd(uij_SSE0
, uij_SSE0
);
1551 uij2_SSE1
= _mm_mul_pd(uij_SSE1
, uij_SSE1
);
1552 uij3_SSE0
= _mm_mul_pd(uij2_SSE0
,uij_SSE0
);
1553 uij3_SSE1
= _mm_mul_pd(uij2_SSE1
,uij_SSE1
);
1554 lij2_SSE0
= _mm_mul_pd(lij_SSE0
, lij_SSE0
);
1555 lij2_SSE1
= _mm_mul_pd(lij_SSE1
, lij_SSE1
);
1556 lij3_SSE0
= _mm_mul_pd(lij2_SSE0
,lij_SSE0
);
1557 lij3_SSE1
= _mm_mul_pd(lij2_SSE1
,lij_SSE1
);
1559 diff2_SSE0
= _mm_sub_pd(uij2_SSE0
,lij2_SSE0
);
1560 diff2_SSE1
= _mm_sub_pd(uij2_SSE1
,lij2_SSE1
);
1561 lij_inv_SSE0
= gmx_mm_invsqrt_pd(lij2_SSE0
);
1562 lij_inv_SSE1
= gmx_mm_invsqrt_pd(lij2_SSE1
);
1563 sk2_rinv_SSE0
= _mm_mul_pd(sk2_ai_SSE0
,rinv_SSE0
);
1564 sk2_rinv_SSE1
= _mm_mul_pd(sk2_ai_SSE1
,rinv_SSE1
);
1565 prod_SSE0
= _mm_mul_pd(onefourth_SSE
,sk2_rinv_SSE0
);
1566 prod_SSE1
= _mm_mul_pd(onefourth_SSE
,sk2_rinv_SSE1
);
1568 logterm_SSE0
= gmx_mm_log_pd(_mm_mul_pd(uij_SSE0
,lij_inv_SSE0
));
1569 logterm_SSE1
= gmx_mm_log_pd(_mm_mul_pd(uij_SSE1
,lij_inv_SSE1
));
1570 t1_SSE0
= _mm_sub_pd(lij_SSE0
,uij_SSE0
);
1571 t1_SSE1
= _mm_sub_pd(lij_SSE1
,uij_SSE1
);
1572 t2_SSE0
= _mm_mul_pd(diff2_SSE0
,
1573 _mm_sub_pd(_mm_mul_pd(onefourth_SSE
,dr_SSE0
),
1575 t2_SSE1
= _mm_mul_pd(diff2_SSE1
,
1576 _mm_sub_pd(_mm_mul_pd(onefourth_SSE
,dr_SSE1
),
1578 t3_SSE0
= _mm_mul_pd(half_SSE
,_mm_mul_pd(rinv_SSE0
,logterm_SSE0
));
1579 t3_SSE1
= _mm_mul_pd(half_SSE
,_mm_mul_pd(rinv_SSE1
,logterm_SSE1
));
1580 t1_SSE0
= _mm_add_pd(t1_SSE0
,_mm_add_pd(t2_SSE0
,t3_SSE0
));
1581 t1_SSE1
= _mm_add_pd(t1_SSE1
,_mm_add_pd(t2_SSE1
,t3_SSE1
));
1582 t4_SSE0
= _mm_mul_pd(two_SSE
,_mm_sub_pd(raj_inv_SSE
,lij_SSE0
));
1583 t4_SSE1
= _mm_mul_pd(two_SSE
,_mm_sub_pd(raj_inv_SSE
,lij_SSE1
));
1584 t4_SSE0
= _mm_and_pd(t4_SSE0
,obc_mask3_SSE0
);
1585 t4_SSE1
= _mm_and_pd(t4_SSE1
,obc_mask3_SSE1
);
1586 t1_SSE0
= _mm_mul_pd(half_SSE
,_mm_add_pd(t1_SSE0
,t4_SSE0
));
1587 t1_SSE1
= _mm_mul_pd(half_SSE
,_mm_add_pd(t1_SSE1
,t4_SSE1
));
1589 _mm_store_pd(work
+j
, _mm_add_pd(_mm_load_pd(work
+j
),
1590 _mm_add_pd(_mm_and_pd(t1_SSE0
,obc_mask1_SSE0
),
1591 _mm_and_pd(t1_SSE1
,obc_mask1_SSE1
))));
1593 t1_SSE0
= _mm_add_pd(_mm_mul_pd(half_SSE
,lij2_SSE0
),
1594 _mm_mul_pd(prod_SSE0
,lij3_SSE0
));
1595 t1_SSE1
= _mm_add_pd(_mm_mul_pd(half_SSE
,lij2_SSE1
),
1596 _mm_mul_pd(prod_SSE1
,lij3_SSE1
));
1597 t1_SSE0
= _mm_sub_pd(t1_SSE0
,
1598 _mm_mul_pd(onefourth_SSE
,
1599 _mm_add_pd(_mm_mul_pd(lij_SSE0
,rinv_SSE0
),
1600 _mm_mul_pd(lij3_SSE0
,dr_SSE0
))));
1601 t1_SSE1
= _mm_sub_pd(t1_SSE1
,
1602 _mm_mul_pd(onefourth_SSE
,
1603 _mm_add_pd(_mm_mul_pd(lij_SSE1
,rinv_SSE1
),
1604 _mm_mul_pd(lij3_SSE1
,dr_SSE1
))));
1605 t2_SSE0
= _mm_mul_pd(onefourth_SSE
,
1606 _mm_add_pd(_mm_mul_pd(uij_SSE0
,rinv_SSE0
),
1607 _mm_mul_pd(uij3_SSE0
,dr_SSE0
)));
1608 t2_SSE1
= _mm_mul_pd(onefourth_SSE
,
1609 _mm_add_pd(_mm_mul_pd(uij_SSE1
,rinv_SSE1
),
1610 _mm_mul_pd(uij3_SSE1
,dr_SSE1
)));
1611 t2_SSE0
= _mm_sub_pd(t2_SSE0
,
1612 _mm_add_pd(_mm_mul_pd(half_SSE
,uij2_SSE0
),
1613 _mm_mul_pd(prod_SSE0
,uij3_SSE0
)));
1614 t2_SSE1
= _mm_sub_pd(t2_SSE1
,
1615 _mm_add_pd(_mm_mul_pd(half_SSE
,uij2_SSE1
),
1616 _mm_mul_pd(prod_SSE1
,uij3_SSE1
)));
1618 t3_SSE0
= _mm_mul_pd(_mm_mul_pd(onefourth_SSE
,logterm_SSE0
),
1619 _mm_mul_pd(rinv_SSE0
,rinv_SSE0
));
1620 t3_SSE1
= _mm_mul_pd(_mm_mul_pd(onefourth_SSE
,logterm_SSE1
),
1621 _mm_mul_pd(rinv_SSE1
,rinv_SSE1
));
1623 t3_SSE0
= _mm_sub_pd(t3_SSE0
,
1624 _mm_mul_pd(_mm_mul_pd(diff2_SSE0
,oneeighth_SSE
),
1626 _mm_mul_pd(sk2_rinv_SSE0
,rinv_SSE0
))));
1627 t3_SSE1
= _mm_sub_pd(t3_SSE1
,
1628 _mm_mul_pd(_mm_mul_pd(diff2_SSE1
,oneeighth_SSE
),
1630 _mm_mul_pd(sk2_rinv_SSE1
,rinv_SSE1
))));
1633 t1_SSE0
= _mm_mul_pd(rinv_SSE0
,
1634 _mm_add_pd(_mm_mul_pd(dlij_SSE0
,t1_SSE0
),
1635 _mm_add_pd(t2_SSE0
,t3_SSE0
)));
1636 t1_SSE1
= _mm_mul_pd(rinv_SSE1
,
1637 _mm_add_pd(_mm_mul_pd(dlij_SSE1
,t1_SSE1
),
1638 _mm_add_pd(t2_SSE1
,t3_SSE1
)));
1640 _mm_store_pd(dadx
,_mm_and_pd(t1_SSE0
,obc_mask1_SSE0
));
1642 _mm_store_pd(dadx
,_mm_and_pd(t1_SSE1
,obc_mask1_SSE1
));
1646 /* Main part, no exclusions */
1647 for(j
=nj1
; j
<nj2
; j
+=UNROLLJ
)
1649 /* load j atom coordinates */
1650 jx_SSE
= _mm_load_pd(x_align
+j
);
1651 jy_SSE
= _mm_load_pd(y_align
+j
);
1652 jz_SSE
= _mm_load_pd(z_align
+j
);
1654 /* Calculate distance */
1655 dx_SSE0
= _mm_sub_pd(ix_SSE0
,jx_SSE
);
1656 dy_SSE0
= _mm_sub_pd(iy_SSE0
,jy_SSE
);
1657 dz_SSE0
= _mm_sub_pd(iz_SSE0
,jz_SSE
);
1658 dx_SSE1
= _mm_sub_pd(ix_SSE1
,jx_SSE
);
1659 dy_SSE1
= _mm_sub_pd(iy_SSE1
,jy_SSE
);
1660 dz_SSE1
= _mm_sub_pd(iz_SSE1
,jz_SSE
);
1662 /* rsq = dx*dx+dy*dy+dz*dz */
1663 rsq_SSE0
= gmx_mm_calc_rsq_pd(dx_SSE0
,dy_SSE0
,dz_SSE0
);
1664 rsq_SSE1
= gmx_mm_calc_rsq_pd(dx_SSE1
,dy_SSE1
,dz_SSE1
);
1666 /* Calculate 1/r and 1/r2 */
1667 rinv_SSE0
= gmx_mm_invsqrt_pd(rsq_SSE0
);
1668 rinv_SSE1
= gmx_mm_invsqrt_pd(rsq_SSE1
);
1671 rinv_SSE0
= _mm_and_pd(rinv_SSE0
,imask_SSE0
);
1672 rinv_SSE1
= _mm_and_pd(rinv_SSE1
,imask_SSE1
);
1674 dr_SSE0
= _mm_mul_pd(rsq_SSE0
,rinv_SSE0
);
1675 dr_SSE1
= _mm_mul_pd(rsq_SSE1
,rinv_SSE1
);
1677 sk_aj_SSE
= _mm_load_pd(obc_param
+j
);
1678 raj_SSE
= _mm_load_pd(gb_radius
+j
);
1680 raj_inv_SSE
= gmx_mm_inv_pd(raj_SSE
);
1682 /* Evaluate influence of atom aj -> ai */
1683 t1_SSE0
= _mm_add_pd(dr_SSE0
,sk_aj_SSE
);
1684 t1_SSE1
= _mm_add_pd(dr_SSE1
,sk_aj_SSE
);
1685 t2_SSE0
= _mm_sub_pd(dr_SSE0
,sk_aj_SSE
);
1686 t2_SSE1
= _mm_sub_pd(dr_SSE1
,sk_aj_SSE
);
1687 t3_SSE0
= _mm_sub_pd(sk_aj_SSE
,dr_SSE0
);
1688 t3_SSE1
= _mm_sub_pd(sk_aj_SSE
,dr_SSE1
);
1690 obc_mask1_SSE0
= _mm_cmplt_pd(rai_SSE0
, t1_SSE0
);
1691 obc_mask1_SSE1
= _mm_cmplt_pd(rai_SSE1
, t1_SSE1
);
1692 obc_mask2_SSE0
= _mm_cmplt_pd(rai_SSE0
, t2_SSE0
);
1693 obc_mask2_SSE1
= _mm_cmplt_pd(rai_SSE1
, t2_SSE1
);
1694 obc_mask3_SSE0
= _mm_cmplt_pd(rai_SSE0
, t3_SSE0
);
1695 obc_mask3_SSE1
= _mm_cmplt_pd(rai_SSE1
, t3_SSE1
);
1696 obc_mask1_SSE0
= _mm_and_pd(obc_mask1_SSE0
,imask_SSE0
);
1697 obc_mask1_SSE1
= _mm_and_pd(obc_mask1_SSE1
,imask_SSE1
);
1699 uij_SSE0
= gmx_mm_inv_pd(t1_SSE0
);
1700 uij_SSE1
= gmx_mm_inv_pd(t1_SSE1
);
1701 lij_SSE0
= _mm_or_pd( _mm_and_pd(obc_mask2_SSE0
,gmx_mm_inv_pd(t2_SSE0
)),
1702 _mm_andnot_pd(obc_mask2_SSE0
,rai_inv_SSE0
));
1703 lij_SSE1
= _mm_or_pd( _mm_and_pd(obc_mask2_SSE1
,gmx_mm_inv_pd(t2_SSE1
)),
1704 _mm_andnot_pd(obc_mask2_SSE1
,rai_inv_SSE1
));
1705 dlij_SSE0
= _mm_and_pd(one_SSE
,obc_mask2_SSE0
);
1706 dlij_SSE1
= _mm_and_pd(one_SSE
,obc_mask2_SSE1
);
1708 uij2_SSE0
= _mm_mul_pd(uij_SSE0
, uij_SSE0
);
1709 uij2_SSE1
= _mm_mul_pd(uij_SSE1
, uij_SSE1
);
1710 uij3_SSE0
= _mm_mul_pd(uij2_SSE0
,uij_SSE0
);
1711 uij3_SSE1
= _mm_mul_pd(uij2_SSE1
,uij_SSE1
);
1712 lij2_SSE0
= _mm_mul_pd(lij_SSE0
, lij_SSE0
);
1713 lij2_SSE1
= _mm_mul_pd(lij_SSE1
, lij_SSE1
);
1714 lij3_SSE0
= _mm_mul_pd(lij2_SSE0
,lij_SSE0
);
1715 lij3_SSE1
= _mm_mul_pd(lij2_SSE1
,lij_SSE1
);
1717 diff2_SSE0
= _mm_sub_pd(uij2_SSE0
,lij2_SSE0
);
1718 diff2_SSE1
= _mm_sub_pd(uij2_SSE1
,lij2_SSE1
);
1719 lij_inv_SSE0
= gmx_mm_invsqrt_pd(lij2_SSE0
);
1720 lij_inv_SSE1
= gmx_mm_invsqrt_pd(lij2_SSE1
);
1721 sk2_aj_SSE
= _mm_mul_pd(sk_aj_SSE
,sk_aj_SSE
);
1722 sk2_rinv_SSE0
= _mm_mul_pd(sk2_aj_SSE
,rinv_SSE0
);
1723 sk2_rinv_SSE1
= _mm_mul_pd(sk2_aj_SSE
,rinv_SSE1
);
1724 prod_SSE0
= _mm_mul_pd(onefourth_SSE
,sk2_rinv_SSE0
);
1725 prod_SSE1
= _mm_mul_pd(onefourth_SSE
,sk2_rinv_SSE1
);
1727 logterm_SSE0
= gmx_mm_log_pd(_mm_mul_pd(uij_SSE0
,lij_inv_SSE0
));
1728 logterm_SSE1
= gmx_mm_log_pd(_mm_mul_pd(uij_SSE1
,lij_inv_SSE1
));
1730 t1_SSE0
= _mm_sub_pd(lij_SSE0
,uij_SSE0
);
1731 t1_SSE1
= _mm_sub_pd(lij_SSE1
,uij_SSE1
);
1732 t2_SSE0
= _mm_mul_pd(diff2_SSE0
,
1733 _mm_sub_pd(_mm_mul_pd(onefourth_SSE
,dr_SSE0
),
1735 t2_SSE1
= _mm_mul_pd(diff2_SSE1
,
1736 _mm_sub_pd(_mm_mul_pd(onefourth_SSE
,dr_SSE1
),
1739 t3_SSE0
= _mm_mul_pd(half_SSE
,_mm_mul_pd(rinv_SSE0
,logterm_SSE0
));
1740 t3_SSE1
= _mm_mul_pd(half_SSE
,_mm_mul_pd(rinv_SSE1
,logterm_SSE1
));
1741 t1_SSE0
= _mm_add_pd(t1_SSE0
,_mm_add_pd(t2_SSE0
,t3_SSE0
));
1742 t1_SSE1
= _mm_add_pd(t1_SSE1
,_mm_add_pd(t2_SSE1
,t3_SSE1
));
1743 t4_SSE0
= _mm_mul_pd(two_SSE
,_mm_sub_pd(rai_inv_SSE0
,lij_SSE0
));
1744 t4_SSE1
= _mm_mul_pd(two_SSE
,_mm_sub_pd(rai_inv_SSE1
,lij_SSE1
));
1745 t4_SSE0
= _mm_and_pd(t4_SSE0
,obc_mask3_SSE0
);
1746 t4_SSE1
= _mm_and_pd(t4_SSE1
,obc_mask3_SSE1
);
1747 t1_SSE0
= _mm_mul_pd(half_SSE
,_mm_add_pd(t1_SSE0
,t4_SSE0
));
1748 t1_SSE1
= _mm_mul_pd(half_SSE
,_mm_add_pd(t1_SSE1
,t4_SSE1
));
1750 sum_ai_SSE0
= _mm_add_pd(sum_ai_SSE0
,_mm_and_pd(t1_SSE0
,obc_mask1_SSE0
));
1751 sum_ai_SSE1
= _mm_add_pd(sum_ai_SSE1
,_mm_and_pd(t1_SSE1
,obc_mask1_SSE1
));
1753 t1_SSE0
= _mm_add_pd(_mm_mul_pd(half_SSE
,lij2_SSE0
),
1754 _mm_mul_pd(prod_SSE0
,lij3_SSE0
));
1755 t1_SSE1
= _mm_add_pd(_mm_mul_pd(half_SSE
,lij2_SSE1
),
1756 _mm_mul_pd(prod_SSE1
,lij3_SSE1
));
1758 t1_SSE0
= _mm_sub_pd(t1_SSE0
,
1759 _mm_mul_pd(onefourth_SSE
,
1760 _mm_add_pd(_mm_mul_pd(lij_SSE0
,rinv_SSE0
),
1761 _mm_mul_pd(lij3_SSE0
,dr_SSE0
))));
1762 t1_SSE1
= _mm_sub_pd(t1_SSE1
,
1763 _mm_mul_pd(onefourth_SSE
,
1764 _mm_add_pd(_mm_mul_pd(lij_SSE1
,rinv_SSE1
),
1765 _mm_mul_pd(lij3_SSE1
,dr_SSE1
))));
1767 t2_SSE0
= _mm_mul_pd(onefourth_SSE
,
1768 _mm_add_pd(_mm_mul_pd(uij_SSE0
,rinv_SSE0
),
1769 _mm_mul_pd(uij3_SSE0
,dr_SSE0
)));
1770 t2_SSE1
= _mm_mul_pd(onefourth_SSE
,
1771 _mm_add_pd(_mm_mul_pd(uij_SSE1
,rinv_SSE1
),
1772 _mm_mul_pd(uij3_SSE1
,dr_SSE1
)));
1773 t2_SSE0
= _mm_sub_pd(t2_SSE0
,
1774 _mm_add_pd(_mm_mul_pd(half_SSE
,uij2_SSE0
),
1775 _mm_mul_pd(prod_SSE0
,uij3_SSE0
)));
1776 t2_SSE1
= _mm_sub_pd(t2_SSE1
,
1777 _mm_add_pd(_mm_mul_pd(half_SSE
,uij2_SSE1
),
1778 _mm_mul_pd(prod_SSE1
,uij3_SSE1
)));
1779 t3_SSE0
= _mm_mul_pd(_mm_mul_pd(onefourth_SSE
,logterm_SSE0
),
1780 _mm_mul_pd(rinv_SSE0
,rinv_SSE0
));
1781 t3_SSE1
= _mm_mul_pd(_mm_mul_pd(onefourth_SSE
,logterm_SSE1
),
1782 _mm_mul_pd(rinv_SSE1
,rinv_SSE1
));
1783 t3_SSE0
= _mm_sub_pd(t3_SSE0
,
1784 _mm_mul_pd(_mm_mul_pd(diff2_SSE0
,oneeighth_SSE
),
1786 _mm_mul_pd(sk2_rinv_SSE0
,rinv_SSE0
))));
1787 t3_SSE1
= _mm_sub_pd(t3_SSE1
,
1788 _mm_mul_pd(_mm_mul_pd(diff2_SSE1
,oneeighth_SSE
),
1790 _mm_mul_pd(sk2_rinv_SSE1
,rinv_SSE1
))));
1792 t1_SSE0
= _mm_mul_pd(rinv_SSE0
,
1793 _mm_add_pd(_mm_mul_pd(dlij_SSE0
,t1_SSE0
),
1794 _mm_add_pd(t2_SSE0
,t3_SSE0
)));
1795 t1_SSE1
= _mm_mul_pd(rinv_SSE1
,
1796 _mm_add_pd(_mm_mul_pd(dlij_SSE1
,t1_SSE1
),
1797 _mm_add_pd(t2_SSE1
,t3_SSE1
)));
1799 _mm_store_pd(dadx
,_mm_and_pd(t1_SSE0
,obc_mask1_SSE0
));
1801 _mm_store_pd(dadx
,_mm_and_pd(t1_SSE1
,obc_mask1_SSE1
));
1804 /* Evaluate influence of atom ai -> aj */
1805 t1_SSE0
= _mm_add_pd(dr_SSE0
,sk_ai_SSE0
);
1806 t1_SSE1
= _mm_add_pd(dr_SSE1
,sk_ai_SSE1
);
1807 t2_SSE0
= _mm_sub_pd(dr_SSE0
,sk_ai_SSE0
);
1808 t2_SSE1
= _mm_sub_pd(dr_SSE1
,sk_ai_SSE1
);
1809 t3_SSE0
= _mm_sub_pd(sk_ai_SSE0
,dr_SSE0
);
1810 t3_SSE1
= _mm_sub_pd(sk_ai_SSE1
,dr_SSE1
);
1812 obc_mask1_SSE0
= _mm_cmplt_pd(raj_SSE
, t1_SSE0
);
1813 obc_mask1_SSE1
= _mm_cmplt_pd(raj_SSE
, t1_SSE1
);
1814 obc_mask2_SSE0
= _mm_cmplt_pd(raj_SSE
, t2_SSE0
);
1815 obc_mask2_SSE1
= _mm_cmplt_pd(raj_SSE
, t2_SSE1
);
1816 obc_mask3_SSE0
= _mm_cmplt_pd(raj_SSE
, t3_SSE0
);
1817 obc_mask3_SSE1
= _mm_cmplt_pd(raj_SSE
, t3_SSE1
);
1818 obc_mask1_SSE0
= _mm_and_pd(obc_mask1_SSE0
,imask_SSE0
);
1819 obc_mask1_SSE1
= _mm_and_pd(obc_mask1_SSE1
,imask_SSE1
);
1821 uij_SSE0
= gmx_mm_inv_pd(t1_SSE0
);
1822 uij_SSE1
= gmx_mm_inv_pd(t1_SSE1
);
1823 lij_SSE0
= _mm_or_pd( _mm_and_pd(obc_mask2_SSE0
,gmx_mm_inv_pd(t2_SSE0
)),
1824 _mm_andnot_pd(obc_mask2_SSE0
,raj_inv_SSE
));
1825 lij_SSE1
= _mm_or_pd( _mm_and_pd(obc_mask2_SSE1
,gmx_mm_inv_pd(t2_SSE1
)),
1826 _mm_andnot_pd(obc_mask2_SSE1
,raj_inv_SSE
));
1827 dlij_SSE0
= _mm_and_pd(one_SSE
,obc_mask2_SSE0
);
1828 dlij_SSE1
= _mm_and_pd(one_SSE
,obc_mask2_SSE1
);
1830 uij2_SSE0
= _mm_mul_pd(uij_SSE0
, uij_SSE0
);
1831 uij2_SSE1
= _mm_mul_pd(uij_SSE1
, uij_SSE1
);
1832 uij3_SSE0
= _mm_mul_pd(uij2_SSE0
,uij_SSE0
);
1833 uij3_SSE1
= _mm_mul_pd(uij2_SSE1
,uij_SSE1
);
1834 lij2_SSE0
= _mm_mul_pd(lij_SSE0
, lij_SSE0
);
1835 lij2_SSE1
= _mm_mul_pd(lij_SSE1
, lij_SSE1
);
1836 lij3_SSE0
= _mm_mul_pd(lij2_SSE0
,lij_SSE0
);
1837 lij3_SSE1
= _mm_mul_pd(lij2_SSE1
,lij_SSE1
);
1839 diff2_SSE0
= _mm_sub_pd(uij2_SSE0
,lij2_SSE0
);
1840 diff2_SSE1
= _mm_sub_pd(uij2_SSE1
,lij2_SSE1
);
1841 lij_inv_SSE0
= gmx_mm_invsqrt_pd(lij2_SSE0
);
1842 lij_inv_SSE1
= gmx_mm_invsqrt_pd(lij2_SSE1
);
1843 sk2_rinv_SSE0
= _mm_mul_pd(sk2_ai_SSE0
,rinv_SSE0
);
1844 sk2_rinv_SSE1
= _mm_mul_pd(sk2_ai_SSE1
,rinv_SSE1
);
1845 prod_SSE0
= _mm_mul_pd(onefourth_SSE
,sk2_rinv_SSE0
);
1846 prod_SSE1
= _mm_mul_pd(onefourth_SSE
,sk2_rinv_SSE1
);
1848 logterm_SSE0
= gmx_mm_log_pd(_mm_mul_pd(uij_SSE0
,lij_inv_SSE0
));
1849 logterm_SSE1
= gmx_mm_log_pd(_mm_mul_pd(uij_SSE1
,lij_inv_SSE1
));
1850 t1_SSE0
= _mm_sub_pd(lij_SSE0
,uij_SSE0
);
1851 t1_SSE1
= _mm_sub_pd(lij_SSE1
,uij_SSE1
);
1852 t2_SSE0
= _mm_mul_pd(diff2_SSE0
,
1853 _mm_sub_pd(_mm_mul_pd(onefourth_SSE
,dr_SSE0
),
1855 t2_SSE1
= _mm_mul_pd(diff2_SSE1
,
1856 _mm_sub_pd(_mm_mul_pd(onefourth_SSE
,dr_SSE1
),
1858 t3_SSE0
= _mm_mul_pd(half_SSE
,_mm_mul_pd(rinv_SSE0
,logterm_SSE0
));
1859 t3_SSE1
= _mm_mul_pd(half_SSE
,_mm_mul_pd(rinv_SSE1
,logterm_SSE1
));
1860 t1_SSE0
= _mm_add_pd(t1_SSE0
,_mm_add_pd(t2_SSE0
,t3_SSE0
));
1861 t1_SSE1
= _mm_add_pd(t1_SSE1
,_mm_add_pd(t2_SSE1
,t3_SSE1
));
1862 t4_SSE0
= _mm_mul_pd(two_SSE
,_mm_sub_pd(raj_inv_SSE
,lij_SSE0
));
1863 t4_SSE1
= _mm_mul_pd(two_SSE
,_mm_sub_pd(raj_inv_SSE
,lij_SSE1
));
1864 t4_SSE0
= _mm_and_pd(t4_SSE0
,obc_mask3_SSE0
);
1865 t4_SSE1
= _mm_and_pd(t4_SSE1
,obc_mask3_SSE1
);
1866 t1_SSE0
= _mm_mul_pd(half_SSE
,_mm_add_pd(t1_SSE0
,t4_SSE0
));
1867 t1_SSE1
= _mm_mul_pd(half_SSE
,_mm_add_pd(t1_SSE1
,t4_SSE1
));
1869 _mm_store_pd(work
+j
, _mm_add_pd(_mm_load_pd(work
+j
),
1870 _mm_add_pd(_mm_and_pd(t1_SSE0
,obc_mask1_SSE0
),
1871 _mm_and_pd(t1_SSE1
,obc_mask1_SSE1
))));
1873 t1_SSE0
= _mm_add_pd(_mm_mul_pd(half_SSE
,lij2_SSE0
),
1874 _mm_mul_pd(prod_SSE0
,lij3_SSE0
));
1875 t1_SSE1
= _mm_add_pd(_mm_mul_pd(half_SSE
,lij2_SSE1
),
1876 _mm_mul_pd(prod_SSE1
,lij3_SSE1
));
1877 t1_SSE0
= _mm_sub_pd(t1_SSE0
,
1878 _mm_mul_pd(onefourth_SSE
,
1879 _mm_add_pd(_mm_mul_pd(lij_SSE0
,rinv_SSE0
),
1880 _mm_mul_pd(lij3_SSE0
,dr_SSE0
))));
1881 t1_SSE1
= _mm_sub_pd(t1_SSE1
,
1882 _mm_mul_pd(onefourth_SSE
,
1883 _mm_add_pd(_mm_mul_pd(lij_SSE1
,rinv_SSE1
),
1884 _mm_mul_pd(lij3_SSE1
,dr_SSE1
))));
1885 t2_SSE0
= _mm_mul_pd(onefourth_SSE
,
1886 _mm_add_pd(_mm_mul_pd(uij_SSE0
,rinv_SSE0
),
1887 _mm_mul_pd(uij3_SSE0
,dr_SSE0
)));
1888 t2_SSE1
= _mm_mul_pd(onefourth_SSE
,
1889 _mm_add_pd(_mm_mul_pd(uij_SSE1
,rinv_SSE1
),
1890 _mm_mul_pd(uij3_SSE1
,dr_SSE1
)));
1891 t2_SSE0
= _mm_sub_pd(t2_SSE0
,
1892 _mm_add_pd(_mm_mul_pd(half_SSE
,uij2_SSE0
),
1893 _mm_mul_pd(prod_SSE0
,uij3_SSE0
)));
1894 t2_SSE1
= _mm_sub_pd(t2_SSE1
,
1895 _mm_add_pd(_mm_mul_pd(half_SSE
,uij2_SSE1
),
1896 _mm_mul_pd(prod_SSE1
,uij3_SSE1
)));
1898 t3_SSE0
= _mm_mul_pd(_mm_mul_pd(onefourth_SSE
,logterm_SSE0
),
1899 _mm_mul_pd(rinv_SSE0
,rinv_SSE0
));
1900 t3_SSE1
= _mm_mul_pd(_mm_mul_pd(onefourth_SSE
,logterm_SSE1
),
1901 _mm_mul_pd(rinv_SSE1
,rinv_SSE1
));
1903 t3_SSE0
= _mm_sub_pd(t3_SSE0
,
1904 _mm_mul_pd(_mm_mul_pd(diff2_SSE0
,oneeighth_SSE
),
1906 _mm_mul_pd(sk2_rinv_SSE0
,rinv_SSE0
))));
1907 t3_SSE1
= _mm_sub_pd(t3_SSE1
,
1908 _mm_mul_pd(_mm_mul_pd(diff2_SSE1
,oneeighth_SSE
),
1910 _mm_mul_pd(sk2_rinv_SSE1
,rinv_SSE1
))));
1912 t1_SSE0
= _mm_mul_pd(rinv_SSE0
,
1913 _mm_add_pd(_mm_mul_pd(dlij_SSE0
,t1_SSE0
),
1914 _mm_add_pd(t2_SSE0
,t3_SSE0
)));
1915 t1_SSE1
= _mm_mul_pd(rinv_SSE1
,
1916 _mm_add_pd(_mm_mul_pd(dlij_SSE1
,t1_SSE1
),
1917 _mm_add_pd(t2_SSE1
,t3_SSE1
)));
1919 _mm_store_pd(dadx
,_mm_and_pd(t1_SSE0
,obc_mask1_SSE0
));
1921 _mm_store_pd(dadx
,_mm_and_pd(t1_SSE1
,obc_mask1_SSE1
));
1925 /* Epilogue part, including exclusion mask */
1926 for(j
=nj2
; j
<nj3
; j
+=UNROLLJ
)
1928 jmask_SSE0
= _mm_load_pd((double *)emask0
);
1929 jmask_SSE1
= _mm_load_pd((double *)emask1
);
1930 emask0
+= 2*UNROLLJ
;
1931 emask1
+= 2*UNROLLJ
;
            /* load j atom coordinates */
            jx_SSE            = _mm_load_pd(x_align+j);
            jy_SSE            = _mm_load_pd(y_align+j);
            jz_SSE            = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0           = _mm_sub_pd(ix_SSE0,jx_SSE);
            dy_SSE0           = _mm_sub_pd(iy_SSE0,jy_SSE);
            dz_SSE0           = _mm_sub_pd(iz_SSE0,jz_SSE);
            dx_SSE1           = _mm_sub_pd(ix_SSE1,jx_SSE);
            dy_SSE1           = _mm_sub_pd(iy_SSE1,jy_SSE);
            dz_SSE1           = _mm_sub_pd(iz_SSE1,jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0          = gmx_mm_calc_rsq_pd(dx_SSE0,dy_SSE0,dz_SSE0);
            rsq_SSE1          = gmx_mm_calc_rsq_pd(dx_SSE1,dy_SSE1,dz_SSE1);

            jmask_SSE0        = _mm_and_pd(jmask_SSE0,imask_SSE0);
            jmask_SSE1        = _mm_and_pd(jmask_SSE1,imask_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0         = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1         = gmx_mm_invsqrt_pd(rsq_SSE1);

            rinv_SSE0         = _mm_and_pd(rinv_SSE0,jmask_SSE0);
            rinv_SSE1         = _mm_and_pd(rinv_SSE1,jmask_SSE1);

            dr_SSE0           = _mm_mul_pd(rsq_SSE0,rinv_SSE0);
            dr_SSE1           = _mm_mul_pd(rsq_SSE1,rinv_SSE1);

            sk_aj_SSE         = _mm_load_pd(obc_param+j);
            raj_SSE           = _mm_load_pd(gb_radius+j);

            raj_inv_SSE       = gmx_mm_inv_pd(raj_SSE);
            /* Evaluate influence of atom aj -> ai */
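            /*
             * The block below essentially evaluates the HCT/OBC descreening term
             * for the pair; with L = lij, U = uij, r = dr and S = sk_aj it is roughly
             *   0.5*[ (L - U) + (U^2 - L^2)*(r/4 - S^2/(4r)) + log(U/L)/(2r)
             *         + 2*(1/rai - L)  (only when S - r > rai) ]
             * where U = 1/(r+S) and L is 1/(r-S) or 1/rai depending on whether
             * rai < r - S. The obc_mask* comparisons select these cases per lane,
             * and the whole contribution is zeroed unless rai < r + S (obc_mask1).
             */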
            t1_SSE0            = _mm_add_pd(dr_SSE0,sk_aj_SSE);
            t1_SSE1            = _mm_add_pd(dr_SSE1,sk_aj_SSE);
            t2_SSE0            = _mm_sub_pd(dr_SSE0,sk_aj_SSE);
            t2_SSE1            = _mm_sub_pd(dr_SSE1,sk_aj_SSE);
            t3_SSE0            = _mm_sub_pd(sk_aj_SSE,dr_SSE0);
            t3_SSE1            = _mm_sub_pd(sk_aj_SSE,dr_SSE1);

            obc_mask1_SSE0     = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
            obc_mask1_SSE1     = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
            obc_mask2_SSE0     = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
            obc_mask2_SSE1     = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
            obc_mask3_SSE0     = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
            obc_mask3_SSE1     = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
            obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0,jmask_SSE0);
            obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1,jmask_SSE1);

            uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
            uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
            lij_SSE0           = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0,gmx_mm_inv_pd(t2_SSE0)),
                                            _mm_andnot_pd(obc_mask2_SSE0,rai_inv_SSE0));
            lij_SSE1           = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1,gmx_mm_inv_pd(t2_SSE1)),
                                            _mm_andnot_pd(obc_mask2_SSE1,rai_inv_SSE1));

            dlij_SSE0          = _mm_and_pd(one_SSE,obc_mask2_SSE0);
            dlij_SSE1          = _mm_and_pd(one_SSE,obc_mask2_SSE1);

            uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
            uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
            uij3_SSE0          = _mm_mul_pd(uij2_SSE0,uij_SSE0);
            uij3_SSE1          = _mm_mul_pd(uij2_SSE1,uij_SSE1);
            lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
            lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
            lij3_SSE0          = _mm_mul_pd(lij2_SSE0,lij_SSE0);
            lij3_SSE1          = _mm_mul_pd(lij2_SSE1,lij_SSE1);

            diff2_SSE0         = _mm_sub_pd(uij2_SSE0,lij2_SSE0);
            diff2_SSE1         = _mm_sub_pd(uij2_SSE1,lij2_SSE1);
            lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
            lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
            sk2_aj_SSE         = _mm_mul_pd(sk_aj_SSE,sk_aj_SSE);
            sk2_rinv_SSE0      = _mm_mul_pd(sk2_aj_SSE,rinv_SSE0);
            sk2_rinv_SSE1      = _mm_mul_pd(sk2_aj_SSE,rinv_SSE1);
            prod_SSE0          = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE0);
            prod_SSE1          = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE1);

            logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0,lij_inv_SSE0));
            logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1,lij_inv_SSE1));

            t1_SSE0            = _mm_sub_pd(lij_SSE0,uij_SSE0);
            t1_SSE1            = _mm_sub_pd(lij_SSE1,uij_SSE1);
            t2_SSE0            = _mm_mul_pd(diff2_SSE0,
                                            _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE0),
                                                       prod_SSE0));
            t2_SSE1            = _mm_mul_pd(diff2_SSE1,
                                            _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE1),
                                                       prod_SSE1));
            t3_SSE0            = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE0,logterm_SSE0));
            t3_SSE1            = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE1,logterm_SSE1));
            t1_SSE0            = _mm_add_pd(t1_SSE0,_mm_add_pd(t2_SSE0,t3_SSE0));
            t1_SSE1            = _mm_add_pd(t1_SSE1,_mm_add_pd(t2_SSE1,t3_SSE1));
            t4_SSE0            = _mm_mul_pd(two_SSE,_mm_sub_pd(rai_inv_SSE0,lij_SSE0));
            t4_SSE1            = _mm_mul_pd(two_SSE,_mm_sub_pd(rai_inv_SSE1,lij_SSE1));
            t4_SSE0            = _mm_and_pd(t4_SSE0,obc_mask3_SSE0);
            t4_SSE1            = _mm_and_pd(t4_SSE1,obc_mask3_SSE1);
            t1_SSE0            = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE0,t4_SSE0));
            t1_SSE1            = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE1,t4_SSE1));

            sum_ai_SSE0        = _mm_add_pd(sum_ai_SSE0,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
            sum_ai_SSE1        = _mm_add_pd(sum_ai_SSE1,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));

            t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE0),
                                            _mm_mul_pd(prod_SSE0,lij3_SSE0));
            t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE1),
                                            _mm_mul_pd(prod_SSE1,lij3_SSE1));
            t1_SSE0            = _mm_sub_pd(t1_SSE0,
                                            _mm_mul_pd(onefourth_SSE,
                                                       _mm_add_pd(_mm_mul_pd(lij_SSE0,rinv_SSE0),
                                                                  _mm_mul_pd(lij3_SSE0,dr_SSE0))));
            t1_SSE1            = _mm_sub_pd(t1_SSE1,
                                            _mm_mul_pd(onefourth_SSE,
                                                       _mm_add_pd(_mm_mul_pd(lij_SSE1,rinv_SSE1),
                                                                  _mm_mul_pd(lij3_SSE1,dr_SSE1))));

            t2_SSE0            = _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(uij_SSE0,rinv_SSE0),
                                                       _mm_mul_pd(uij3_SSE0,dr_SSE0)));
            t2_SSE1            = _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(uij_SSE1,rinv_SSE1),
                                                       _mm_mul_pd(uij3_SSE1,dr_SSE1)));
            t2_SSE0            = _mm_sub_pd(t2_SSE0,
                                            _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE0),
                                                       _mm_mul_pd(prod_SSE0,uij3_SSE0)));
            t2_SSE1            = _mm_sub_pd(t2_SSE1,
                                            _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE1),
                                                       _mm_mul_pd(prod_SSE1,uij3_SSE1)));
            t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE0),
                                            _mm_mul_pd(rinv_SSE0,rinv_SSE0));
            t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE1),
                                            _mm_mul_pd(rinv_SSE1,rinv_SSE1));
            t3_SSE0            = _mm_sub_pd(t3_SSE0,
                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE0,oneeighth_SSE),
                                                       _mm_add_pd(one_SSE,
                                                                  _mm_mul_pd(sk2_rinv_SSE0,rinv_SSE0))));
            t3_SSE1            = _mm_sub_pd(t3_SSE1,
                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE1,oneeighth_SSE),
                                                       _mm_add_pd(one_SSE,
                                                                  _mm_mul_pd(sk2_rinv_SSE1,rinv_SSE1))));
            t1_SSE0            = _mm_mul_pd(rinv_SSE0,
                                            _mm_add_pd(_mm_mul_pd(dlij_SSE0,t1_SSE0),
                                                       _mm_add_pd(t2_SSE0,t3_SSE0)));
            t1_SSE1            = _mm_mul_pd(rinv_SSE1,
                                            _mm_add_pd(_mm_mul_pd(dlij_SSE1,t1_SSE1),
                                                       _mm_add_pd(t2_SSE1,t3_SSE1)));

            _mm_store_pd(dadx,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
            dadx += 2;
            _mm_store_pd(dadx,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
            dadx += 2;
            /* Evaluate influence of atom ai -> aj */
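            /*
             * Same descreening expression as above, but with the roles of i and j
             * swapped: sk_ai/raj take the place of sk_aj/rai, and the result is
             * accumulated into work[j] instead of sum_ai, so each pair is visited
             * once while both atoms receive their contribution.
             */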
            t1_SSE0            = _mm_add_pd(dr_SSE0,sk_ai_SSE0);
            t1_SSE1            = _mm_add_pd(dr_SSE1,sk_ai_SSE1);
            t2_SSE0            = _mm_sub_pd(dr_SSE0,sk_ai_SSE0);
            t2_SSE1            = _mm_sub_pd(dr_SSE1,sk_ai_SSE1);
            t3_SSE0            = _mm_sub_pd(sk_ai_SSE0,dr_SSE0);
            t3_SSE1            = _mm_sub_pd(sk_ai_SSE1,dr_SSE1);

            obc_mask1_SSE0     = _mm_cmplt_pd(raj_SSE, t1_SSE0);
            obc_mask1_SSE1     = _mm_cmplt_pd(raj_SSE, t1_SSE1);
            obc_mask2_SSE0     = _mm_cmplt_pd(raj_SSE, t2_SSE0);
            obc_mask2_SSE1     = _mm_cmplt_pd(raj_SSE, t2_SSE1);
            obc_mask3_SSE0     = _mm_cmplt_pd(raj_SSE, t3_SSE0);
            obc_mask3_SSE1     = _mm_cmplt_pd(raj_SSE, t3_SSE1);
            obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0,jmask_SSE0);
            obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1,jmask_SSE1);

            uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
            uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
            lij_SSE0           = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0,gmx_mm_inv_pd(t2_SSE0)),
                                            _mm_andnot_pd(obc_mask2_SSE0,raj_inv_SSE));
            lij_SSE1           = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1,gmx_mm_inv_pd(t2_SSE1)),
                                            _mm_andnot_pd(obc_mask2_SSE1,raj_inv_SSE));

            dlij_SSE0          = _mm_and_pd(one_SSE,obc_mask2_SSE0);
            dlij_SSE1          = _mm_and_pd(one_SSE,obc_mask2_SSE1);

            uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
            uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
            uij3_SSE0          = _mm_mul_pd(uij2_SSE0,uij_SSE0);
            uij3_SSE1          = _mm_mul_pd(uij2_SSE1,uij_SSE1);
            lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
            lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
            lij3_SSE0          = _mm_mul_pd(lij2_SSE0,lij_SSE0);
            lij3_SSE1          = _mm_mul_pd(lij2_SSE1,lij_SSE1);

            diff2_SSE0         = _mm_sub_pd(uij2_SSE0,lij2_SSE0);
            diff2_SSE1         = _mm_sub_pd(uij2_SSE1,lij2_SSE1);
            lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
            lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
            sk2_rinv_SSE0      = _mm_mul_pd(sk2_ai_SSE0,rinv_SSE0);
            sk2_rinv_SSE1      = _mm_mul_pd(sk2_ai_SSE1,rinv_SSE1);
            prod_SSE0          = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE0);
            prod_SSE1          = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE1);

            logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0,lij_inv_SSE0));
            logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1,lij_inv_SSE1));
            t1_SSE0            = _mm_sub_pd(lij_SSE0,uij_SSE0);
            t1_SSE1            = _mm_sub_pd(lij_SSE1,uij_SSE1);
            t2_SSE0            = _mm_mul_pd(diff2_SSE0,
                                            _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE0),
                                                       prod_SSE0));
            t2_SSE1            = _mm_mul_pd(diff2_SSE1,
                                            _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE1),
                                                       prod_SSE1));
            t3_SSE0            = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE0,logterm_SSE0));
            t3_SSE1            = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE1,logterm_SSE1));
            t1_SSE0            = _mm_add_pd(t1_SSE0,_mm_add_pd(t2_SSE0,t3_SSE0));
            t1_SSE1            = _mm_add_pd(t1_SSE1,_mm_add_pd(t2_SSE1,t3_SSE1));
            t4_SSE0            = _mm_mul_pd(two_SSE,_mm_sub_pd(raj_inv_SSE,lij_SSE0));
            t4_SSE1            = _mm_mul_pd(two_SSE,_mm_sub_pd(raj_inv_SSE,lij_SSE1));
            t4_SSE0            = _mm_and_pd(t4_SSE0,obc_mask3_SSE0);
            t4_SSE1            = _mm_and_pd(t4_SSE1,obc_mask3_SSE1);
            t1_SSE0            = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE0,t4_SSE0));
            t1_SSE1            = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE1,t4_SSE1));

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_and_pd(t1_SSE0,obc_mask1_SSE0),
                                                       _mm_and_pd(t1_SSE1,obc_mask1_SSE1))));

            t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE0),
                                            _mm_mul_pd(prod_SSE0,lij3_SSE0));
            t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE1),
                                            _mm_mul_pd(prod_SSE1,lij3_SSE1));

            t1_SSE0            = _mm_sub_pd(t1_SSE0,
                                            _mm_mul_pd(onefourth_SSE,
                                                       _mm_add_pd(_mm_mul_pd(lij_SSE0,rinv_SSE0),
                                                                  _mm_mul_pd(lij3_SSE0,dr_SSE0))));
            t1_SSE1            = _mm_sub_pd(t1_SSE1,
                                            _mm_mul_pd(onefourth_SSE,
                                                       _mm_add_pd(_mm_mul_pd(lij_SSE1,rinv_SSE1),
                                                                  _mm_mul_pd(lij3_SSE1,dr_SSE1))));
            t2_SSE0            = _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(uij_SSE0,rinv_SSE0),
                                                       _mm_mul_pd(uij3_SSE0,dr_SSE0)));
            t2_SSE1            = _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(uij_SSE1,rinv_SSE1),
                                                       _mm_mul_pd(uij3_SSE1,dr_SSE1)));
            t2_SSE0            = _mm_sub_pd(t2_SSE0,
                                            _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE0),
                                                       _mm_mul_pd(prod_SSE0,uij3_SSE0)));
            t2_SSE1            = _mm_sub_pd(t2_SSE1,
                                            _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE1),
                                                       _mm_mul_pd(prod_SSE1,uij3_SSE1)));

            t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE0),
                                            _mm_mul_pd(rinv_SSE0,rinv_SSE0));
            t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE1),
                                            _mm_mul_pd(rinv_SSE1,rinv_SSE1));
            t3_SSE0            = _mm_sub_pd(t3_SSE0,
                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE0,oneeighth_SSE),
                                                       _mm_add_pd(one_SSE,
                                                                  _mm_mul_pd(sk2_rinv_SSE0,rinv_SSE0))));
            t3_SSE1            = _mm_sub_pd(t3_SSE1,
                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE1,oneeighth_SSE),
                                                       _mm_add_pd(one_SSE,
                                                                  _mm_mul_pd(sk2_rinv_SSE1,rinv_SSE1))));
            t1_SSE0            = _mm_mul_pd(rinv_SSE0,
                                            _mm_add_pd(_mm_mul_pd(dlij_SSE0,t1_SSE0),
                                                       _mm_add_pd(t2_SSE0,t3_SSE0)));
            t1_SSE1            = _mm_mul_pd(rinv_SSE1,
                                            _mm_add_pd(_mm_mul_pd(dlij_SSE1,t1_SSE1),
                                                       _mm_add_pd(t2_SSE1,t3_SSE1)));

            _mm_store_pd(dadx,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
            dadx += 2;
            _mm_store_pd(dadx,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
            dadx += 2;
        }
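
        /*
         * The transpose+add below reduces the two per-lane accumulators to a
         * single horizontal sum per i atom, so work[i] and work[i+1] each
         * receive the full descreening sum for their own atom.
         */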
        GMX_MM_TRANSPOSE2_PD(sum_ai_SSE0,sum_ai_SSE1);
        sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0,sum_ai_SSE1);
        _mm_store_pd(work+i, _mm_add_pd(sum_ai_SSE0, _mm_load_pd(work+i)));
    }
    for(i=0;i<natoms/2+1;i++)
    {
        work[i] += work[natoms+i];
    }
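
    /*
     * In the all-vs-all scheme the j index wraps around, so contributions for
     * "virtual" atoms j >= natoms were accumulated in the upper half of the
     * work array; the loop above folds them back onto the real atoms.
     */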
    /* Parallel summations */
    if(PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    if(gb_algorithm==egbHCT)
    {
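        /*
         * HCT: the effective Born radius follows directly from the descreening
         * sum, roughly 1/R_i = 1/(rho_i - doffset) - work[i], with R_i clamped
         * from below by the unmodified vdW radius rho_i.
         */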
        for(i=0;i<natoms;i++)
        {
            if(born->use[i] != 0)
            {
                rai     = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset;
                sum_ai  = 1.0/rai - work[i];
                min_rad = rai + born->gb_doffset;
                rad     = 1.0/sum_ai;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }
    }
    else
    {
        /* Calculate the radii */
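        /*
         * OBC (Onufriev-Bashford-Case) rescaling: with psi = (rho_i - doffset)*work[i],
         * the inverse Born radius is essentially
         *   1/R_i = 1/(rho_i - doffset) - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/rho_i
         * and drobc[i] stores the corresponding derivative factor that the
         * chain-rule kernel below needs.
         */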
        for(i=0;i<natoms;i++)
        {
            if(born->use[i] != 0)
            {
                rai      = top->atomtypes.gb_radius[mdatoms->typeA[i]];
                rai_inv2 = 1.0/rai;
                rai      = rai-born->gb_doffset;
                rai_inv  = 1.0/rai;

                sum_ai   = rai * work[i];
                sum_ai2  = sum_ai  * sum_ai;
                sum_ai3  = sum_ai2 * sum_ai;

                tsum          = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
                born->bRad[i] = rai_inv - tsum*rai_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
            }
        }
    }
genborn_allvsall_calc_chainrule_sse2_double(t_forcerec *           fr,
                                            t_mdatoms *            mdatoms,
                                            gmx_genborn_t *        born,
                                            double *               x,
                                            double *               f,
                                            int                    gb_algorithm,
                                            void *                 paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    int                     nj0,nj1,nj2,nj3;
    double                  rbai,rbaj,fgb,fgb_ai,rbi;
    __m128d                 jmask_SSE0,jmask_SSE1;
    __m128d                 ix_SSE0,iy_SSE0,iz_SSE0;
    __m128d                 ix_SSE1,iy_SSE1,iz_SSE1;
    __m128d                 fix_SSE0,fiy_SSE0,fiz_SSE0;
    __m128d                 fix_SSE1,fiy_SSE1,fiz_SSE1;
    __m128d                 rbai_SSE0,rbai_SSE1;
    __m128d                 imask_SSE0,imask_SSE1;
    __m128d                 jx_SSE,jy_SSE,jz_SSE,rbaj_SSE;
    __m128d                 dx_SSE0,dy_SSE0,dz_SSE0;
    __m128d                 dx_SSE1,dy_SSE1,dz_SSE1;
    __m128d                 fgb_SSE0,fgb_ai_SSE0;
    __m128d                 fgb_SSE1,fgb_ai_SSE1;
    __m128d                 tx_SSE0,ty_SSE0,tz_SSE0;
    __m128d                 tx_SSE1,ty_SSE1,tz_SSE1;
    __m128d                 t1,t2,tmpSSE;

    natoms              = mdatoms->nr;
    ni0                 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
    ni1                 = mdatoms->start+mdatoms->homenr;

    aadata = (gmx_allvsallgb2_data_t *)paadata;

    x_align  = aadata->x_align;
    y_align  = aadata->y_align;
    z_align  = aadata->z_align;
    fx_align = aadata->fx_align;
    fy_align = aadata->fy_align;
    fz_align = aadata->fz_align;

    jindex   = aadata->jindex_gb;
    /* Loop to get the proper form for the Born radius term */
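    /*
     * rb[i] is the per-atom prefactor that turns the stored dadx derivatives
     * into forces: essentially dvda[i] scaled by the algorithm-specific
     * dR_i/d(descreening sum) factor (Still, HCT or OBC conventions), so the
     * pair force below reduces to (rb[i]*dadx_ij + rb[j]*dadx_ji) times the
     * distance vector.
     */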
    if(gb_algorithm==egbSTILL)
    {
        for(i=0;i<natoms;i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if(gb_algorithm==egbHCT)
    {
        for(i=0;i<natoms;i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * fr->dvda[i];
        }
    }
    else if(gb_algorithm==egbOBC)
    {
        for(idx=0;idx<natoms;idx++)
        {
            rbi     = born->bRad[idx];
            rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
        }
    }
    for(i=0;i<2*natoms;i++)
    {
        fx_align[i] = 0;
        fy_align[i] = 0;
        fz_align[i] = 0;
    }

    for(i=0;i<natoms;i++)
    {
        rb[natoms+i] = rb[i];
    }
    for(i=ni0; i<ni1; i+=UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0          = _mm_load1_pd(x_align+i);
        iy_SSE0          = _mm_load1_pd(y_align+i);
        iz_SSE0          = _mm_load1_pd(z_align+i);
        ix_SSE1          = _mm_load1_pd(x_align+i+1);
        iy_SSE1          = _mm_load1_pd(y_align+i+1);
        iz_SSE1          = _mm_load1_pd(z_align+i+1);

        fix_SSE0         = _mm_setzero_pd();
        fiy_SSE0         = _mm_setzero_pd();
        fiz_SSE0         = _mm_setzero_pd();
        fix_SSE1         = _mm_setzero_pd();
        fiy_SSE1         = _mm_setzero_pd();
        fiz_SSE1         = _mm_setzero_pd();

        rbai_SSE0        = _mm_load1_pd(rb+i);
        rbai_SSE1        = _mm_load1_pd(rb+i+1);

        /* Load limits for loop over neighbors */
        nj0              = jindex[4*i];
        nj3              = jindex[4*i+3];

        /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */
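        /*
         * dadx is read back strictly sequentially here, in the same order the
         * radius kernel stored it (two doubles per j atom for each i lane), so
         * excluded or masked-out pairs contribute nothing simply because their
         * stored derivatives are zero.
         */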
        for(j=nj0; j<nj3; j+=UNROLLJ)
        {
            /* load j atom coordinates */
            jx_SSE           = _mm_load_pd(x_align+j);
            jy_SSE           = _mm_load_pd(y_align+j);
            jz_SSE           = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0          = _mm_sub_pd(ix_SSE0,jx_SSE);
            dy_SSE0          = _mm_sub_pd(iy_SSE0,jy_SSE);
            dz_SSE0          = _mm_sub_pd(iz_SSE0,jz_SSE);
            dx_SSE1          = _mm_sub_pd(ix_SSE1,jx_SSE);
            dy_SSE1          = _mm_sub_pd(iy_SSE1,jy_SSE);
            dz_SSE1          = _mm_sub_pd(iz_SSE1,jz_SSE);

            rbaj_SSE         = _mm_load_pd(rb+j);

            fgb_SSE0         = _mm_mul_pd(rbai_SSE0,_mm_load_pd(dadx));
            dadx += 2;
            fgb_SSE1         = _mm_mul_pd(rbai_SSE1,_mm_load_pd(dadx));
            dadx += 2;

            fgb_ai_SSE0      = _mm_mul_pd(rbaj_SSE,_mm_load_pd(dadx));
            dadx += 2;
            fgb_ai_SSE1      = _mm_mul_pd(rbaj_SSE,_mm_load_pd(dadx));
            dadx += 2;
            /* Total force between ai and aj is the sum of ai->aj and aj->ai */
            fgb_SSE0         = _mm_add_pd(fgb_SSE0,fgb_ai_SSE0);
            fgb_SSE1         = _mm_add_pd(fgb_SSE1,fgb_ai_SSE1);

            /* Calculate temporary vectorial force */
            tx_SSE0          = _mm_mul_pd(fgb_SSE0,dx_SSE0);
            ty_SSE0          = _mm_mul_pd(fgb_SSE0,dy_SSE0);
            tz_SSE0          = _mm_mul_pd(fgb_SSE0,dz_SSE0);
            tx_SSE1          = _mm_mul_pd(fgb_SSE1,dx_SSE1);
            ty_SSE1          = _mm_mul_pd(fgb_SSE1,dy_SSE1);
            tz_SSE1          = _mm_mul_pd(fgb_SSE1,dz_SSE1);

            /* Increment i atom force */
            fix_SSE0         = _mm_add_pd(fix_SSE0,tx_SSE0);
            fiy_SSE0         = _mm_add_pd(fiy_SSE0,ty_SSE0);
            fiz_SSE0         = _mm_add_pd(fiz_SSE0,tz_SSE0);
            fix_SSE1         = _mm_add_pd(fix_SSE1,tx_SSE1);
            fiy_SSE1         = _mm_add_pd(fiy_SSE1,ty_SSE1);
            fiz_SSE1         = _mm_add_pd(fiz_SSE1,tz_SSE1);
            /* Decrement j atom force */
            _mm_store_pd(fx_align+j,
                         _mm_sub_pd( _mm_load_pd(fx_align+j) , _mm_add_pd(tx_SSE0,tx_SSE1) ));
            _mm_store_pd(fy_align+j,
                         _mm_sub_pd( _mm_load_pd(fy_align+j) , _mm_add_pd(ty_SSE0,ty_SSE1) ));
            _mm_store_pd(fz_align+j,
                         _mm_sub_pd( _mm_load_pd(fz_align+j) , _mm_add_pd(tz_SSE0,tz_SSE1) ));
        }
        /* Add i forces to mem */
        GMX_MM_TRANSPOSE2_PD(fix_SSE0,fix_SSE1);
        fix_SSE0 = _mm_add_pd(fix_SSE0,fix_SSE1);
        _mm_store_pd(fx_align+i, _mm_add_pd(fix_SSE0, _mm_load_pd(fx_align+i)));

        GMX_MM_TRANSPOSE2_PD(fiy_SSE0,fiy_SSE1);
        fiy_SSE0 = _mm_add_pd(fiy_SSE0,fiy_SSE1);
        _mm_store_pd(fy_align+i, _mm_add_pd(fiy_SSE0, _mm_load_pd(fy_align+i)));

        GMX_MM_TRANSPOSE2_PD(fiz_SSE0,fiz_SSE1);
        fiz_SSE0 = _mm_add_pd(fiz_SSE0,fiz_SSE1);
        _mm_store_pd(fz_align+i, _mm_add_pd(fiz_SSE0, _mm_load_pd(fz_align+i)));
    }
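
    /*
     * As with the Born sums, forces on wrapped j atoms (j >= natoms) were
     * accumulated in the upper half of the aligned arrays, so both halves are
     * added into the real force array below.
     */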
    for(i=0;i<natoms;i++)
    {
        f[3*i]   += fx_align[i] + fx_align[natoms+i];
        f[3*i+1] += fy_align[i] + fy_align[natoms+i];
        f[3*i+2] += fz_align[i] + fz_align[natoms+i];
    }
#else
/* dummy variable when not using SSE */
int genborn_allvsall_sse2_double_dummy;

#endif