/* src/mdlib/genborn_allvsall_sse2_double.c */
/*
 * This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 * As a special exception, you may use this file as part of a free software
 * library without restriction. Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <math.h>
#include "types/simple.h"

#include "vec.h"
#include "smalloc.h"

#include "partdec.h"
#include "network.h"
#include "physics.h"
#include "genborn.h"
#include "genborn_allvsall.h"

#if ( (defined(GMX_IA32_SSE2) || defined(GMX_X86_64_SSE2) || defined(GMX_SSE2)) && defined(GMX_DOUBLE) )

#include <gmx_sse2_double.h>


#define SIMD_WIDTH 2
#define UNROLLI    2
#define UNROLLJ    2
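
/* Note: with SSE2 in double precision each __m128d register holds two doubles,
 * so both the SIMD width and the i/j loop unrolling factors are 2 here (the
 * corresponding single-precision SSE kernels use 4).
 */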

typedef struct
{
    int *     jindex_gb;
    int **    prologue_mask_gb;
    int **    epilogue_mask;
    int *     imask;
    double *  gb_radius;
    double *  workparam;
    double *  work;
    double *  x_align;
    double *  y_align;
    double *  z_align;
    double *  fx_align;
    double *  fy_align;
    double *  fz_align;
}
gmx_allvsallgb2_data_t;

static int
calc_maxoffset(int i,int natoms)
{
    int maxoffset;

    if ((natoms % 2) == 1)
    {
        /* Odd number of atoms, easy */
        maxoffset = natoms/2;
    }
    else if ((natoms % 4) == 0)
    {
        /* Multiple of four is hard */
        if (i < natoms/2)
        {
            if ((i % 2) == 0)
            {
                maxoffset=natoms/2;
            }
            else
            {
                maxoffset=natoms/2-1;
            }
        }
        else
        {
            if ((i % 2) == 1)
            {
                maxoffset=natoms/2;
            }
            else
            {
                maxoffset=natoms/2-1;
            }
        }
    }
    else
    {
        /* natoms/2 = odd */
        if ((i % 2) == 0)
        {
            maxoffset=natoms/2;
        }
        else
        {
            maxoffset=natoms/2-1;
        }
    }

    return maxoffset;
}
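
/* Illustration of the half-shell scheme above: every i atom interacts with
 * roughly the natoms/2 atoms that follow it (with wrap-around), and the
 * even/odd adjustments make the pair count come out exactly right.
 * For example, with natoms=8 the offsets are 4,3,4,3,3,4,3,4 for i=0..7,
 * which sums to 28 = 8*7/2 unique pairs; with natoms=9 every atom gets an
 * offset of 4, giving 36 = 9*8/2 pairs.
 */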

static void
setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t *   aadata,
                                t_ilist *                  ilist,
                                int                        start,
                                int                        end,
                                int                        natoms,
                                gmx_bool                   bInclude12,
                                gmx_bool                   bInclude13,
                                gmx_bool                   bInclude14)
{
    int i,j,k,tp;
    int a1,a2;
    int ni0,ni1,nj0,nj1,nj;
    int imin,imax,iexcl;
    int max_offset;
    int max_excl_offset;
    int firstinteraction;
    int ibase;
    int *pi;

    /* This routine can appear to be a bit complex, but it is mostly book-keeping.
     * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
     * whether they should interact or not.
     *
     * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
     * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
     * we create a jindex array with three elements per i atom: the starting point, the point to
     * which we need to check exclusions, and the end point.
     * This way we only have to allocate a short exclusion mask per i atom.
     */
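    /* Concretely, for each i atom the four entries jindex_gb[4*i .. 4*i+3] mark
     * (in order) the start of the prologue (masked) j-range, the start of the
     * main (unmasked) j-range, the start of the epilogue (masked) j-range, and
     * the end of the epilogue, so the kernels below can run three simple loops.
     */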

    ni0 = (start/UNROLLI)*UNROLLI;
    ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;

    /* Set the interaction mask to only enable the i atoms we want to include */
    snew(pi,2*(natoms+UNROLLI+2*SIMD_WIDTH));
    aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
    for(i=0;i<natoms+UNROLLI;i++)
    {
        aadata->imask[2*i]   = (i>=start && i<end) ? 0xFFFFFFFF : 0;
        aadata->imask[2*i+1] = (i>=start && i<end) ? 0xFFFFFFFF : 0;
    }

    /* Allocate memory for our modified jindex array */
    snew(aadata->jindex_gb,4*(natoms+UNROLLI));
    for(i=0;i<4*(natoms+UNROLLI);i++)
    {
        aadata->jindex_gb[i] = 0;
    }

    /* Create the exclusion masks for the prologue part */
    snew(aadata->prologue_mask_gb,natoms+UNROLLI); /* list of pointers */

    /* First zero everything to avoid uninitialized data */
    for(i=0;i<natoms+UNROLLI;i++)
    {
        aadata->prologue_mask_gb[i] = NULL;
    }
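
    /* The (ptr + 16) & ~15 idiom used throughout this file over-allocates with
     * snew() and then rounds the pointer up to the next 16-byte boundary, since
     * the aligned SSE loads/stores (_mm_load_pd/_mm_store_pd) require 16-byte
     * aligned addresses.
     */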
194 /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
195 for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
197 max_excl_offset = -1;
199 /* First find maxoffset for the next 4 atoms (or fewer if we are close to end) */
200 imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;
202 /* Which atom is the first we (might) interact with? */
203 imin = natoms; /* Guaranteed to be overwritten by one of 'firstinteraction' */
204 for(i=ibase;i<imax;i++)
206 /* Before exclusions, which atom is the first we (might) interact with? */
207 firstinteraction = i+1;
208 max_offset = calc_maxoffset(i,natoms);
210 if(!bInclude12)
212 for(j=0;j<ilist[F_GB12].nr;j+=3)
214 a1 = ilist[F_GB12].iatoms[j+1];
215 a2 = ilist[F_GB12].iatoms[j+2];
217 if(a1==i)
219 k = a2;
221 else if(a2==i)
223 k = a1;
225 else
227 continue;
230 if(k==firstinteraction)
232 firstinteraction++;
236 if(!bInclude13)
238 for(j=0;j<ilist[F_GB13].nr;j+=3)
240 a1 = ilist[F_GB13].iatoms[j+1];
241 a2 = ilist[F_GB13].iatoms[j+2];
243 if(a1==i)
245 k = a2;
247 else if(a2==i)
249 k = a1;
251 else
253 continue;
256 if(k==firstinteraction)
258 firstinteraction++;
262 if(!bInclude14)
264 for(j=0;j<ilist[F_GB14].nr;j+=3)
266 a1 = ilist[F_GB14].iatoms[j+1];
267 a2 = ilist[F_GB14].iatoms[j+2];
268 if(a1==i)
270 k = a2;
272 else if(a2==i)
274 k = a1;
276 else
278 continue;
281 if(k==firstinteraction)
283 firstinteraction++;
287 imin = (firstinteraction < imin) ? firstinteraction : imin;
289 /* round down to j unrolling factor */
290 imin = (imin/UNROLLJ)*UNROLLJ;
292 for(i=ibase;i<imax;i++)
294 max_offset = calc_maxoffset(i,natoms);
296 if(!bInclude12)
298 for(j=0;j<ilist[F_GB12].nr;j+=3)
300 a1 = ilist[F_GB12].iatoms[j+1];
301 a2 = ilist[F_GB12].iatoms[j+2];
303 if(a1==i)
305 k = a2;
307 else if(a2==i)
309 k = a1;
311 else
313 continue;
316 if(k<imin)
318 k += natoms;
321 if(k>i+max_offset)
323 continue;
326 k = k - imin;
328 if( k+natoms <= max_offset )
330 k+=natoms;
332 max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
335 if(!bInclude13)
337 for(j=0;j<ilist[F_GB13].nr;j+=3)
339 a1 = ilist[F_GB13].iatoms[j+1];
340 a2 = ilist[F_GB13].iatoms[j+2];
342 if(a1==i)
344 k = a2;
346 else if(a2==i)
348 k = a1;
350 else
352 continue;
355 if(k<imin)
357 k += natoms;
360 if(k>i+max_offset)
362 continue;
365 k = k - imin;
367 if( k+natoms <= max_offset )
369 k+=natoms;
371 max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
374 if(!bInclude14)
376 for(j=0;j<ilist[F_GB14].nr;j+=3)
378 a1 = ilist[F_GB14].iatoms[j+1];
379 a2 = ilist[F_GB14].iatoms[j+2];
381 if(a1==i)
383 k = a2;
385 else if(a2==i)
387 k = a1;
389 else
391 continue;
394 if(k<imin)
396 k += natoms;
399 if(k>i+max_offset)
401 continue;
404 k = k - imin;
406 if( k+natoms <= max_offset )
408 k+=natoms;
410 max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
415 /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */
416 max_excl_offset++;
417 /* round up to j unrolling factor */
418 max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;
        /* Set the length of all the prologue masks to this value (even for i>end) */
421 for(i=ibase;i<ibase+UNROLLI;i++)
423 aadata->jindex_gb[4*i] = imin;
424 aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
428 /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
429 for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
431 for(i=ibase;i<ibase+UNROLLI;i++)
433 nj = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
434 imin = aadata->jindex_gb[4*i];
436 /* Allocate aligned memory */
437 snew(pi,2*(nj+2*SIMD_WIDTH));
438 aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
440 max_offset = calc_maxoffset(i,natoms);
442 /* Include interactions i+1 <= j < i+maxoffset */
443 for(k=0;k<nj;k++)
445 j = imin + k;
447 if( (j>i) && (j<=i+max_offset) )
449 aadata->prologue_mask_gb[i][2*k] = 0xFFFFFFFF;
450 aadata->prologue_mask_gb[i][2*k+1] = 0xFFFFFFFF;
452 else
454 aadata->prologue_mask_gb[i][2*k] = 0;
455 aadata->prologue_mask_gb[i][2*k+1] = 0;
459 /* Clear out the explicit exclusions */
460 if(i<end)
462 if(!bInclude12)
464 for(j=0;j<ilist[F_GB12].nr;j+=3)
466 a1 = ilist[F_GB12].iatoms[j+1];
467 a2 = ilist[F_GB12].iatoms[j+2];
469 if(a1==i)
471 k = a2;
473 else if(a2==i)
475 k = a1;
477 else
479 continue;
482 if(k>i+max_offset)
484 continue;
486 k = k-i;
488 if( k+natoms <= max_offset )
490 k+=natoms;
493 k = k+i-imin;
494 if(k>=0)
496 aadata->prologue_mask_gb[i][2*k] = 0;
497 aadata->prologue_mask_gb[i][2*k+1] = 0;
501 if(!bInclude13)
503 for(j=0;j<ilist[F_GB13].nr;j+=3)
505 a1 = ilist[F_GB13].iatoms[j+1];
506 a2 = ilist[F_GB13].iatoms[j+2];
508 if(a1==i)
510 k = a2;
512 else if(a2==i)
514 k = a1;
516 else
518 continue;
521 if(k>i+max_offset)
523 continue;
525 k = k-i;
527 if( k+natoms <= max_offset )
529 k+=natoms;
532 k = k+i-imin;
533 if(k>=0)
535 aadata->prologue_mask_gb[i][2*k] = 0;
536 aadata->prologue_mask_gb[i][2*k+1] = 0;
540 if(!bInclude14)
542 for(j=0;j<ilist[F_GB14].nr;j+=3)
544 a1 = ilist[F_GB14].iatoms[j+1];
545 a2 = ilist[F_GB14].iatoms[j+2];
547 if(a1==i)
549 k = a2;
551 else if(a2==i)
553 k = a1;
555 else
557 continue;
560 if(k>i+max_offset)
562 continue;
564 k = k-i;
566 if( k+natoms <= max_offset )
568 k+=natoms;
571 k = k+i-imin;
572 if(k>=0)
574 aadata->prologue_mask_gb[i][2*k] = 0;
575 aadata->prologue_mask_gb[i][2*k+1] = 0;
583 /* Construct the epilogue mask - this just contains the check for maxoffset */
584 snew(aadata->epilogue_mask,natoms+UNROLLI);
586 /* First zero everything to avoid uninitialized data */
587 for(i=0;i<natoms+UNROLLI;i++)
589 aadata->jindex_gb[4*i+2] = aadata->jindex_gb[4*i+1];
590 aadata->jindex_gb[4*i+3] = aadata->jindex_gb[4*i+1];
591 aadata->epilogue_mask[i] = NULL;
594 for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
596 /* Find the lowest index for which we need to use the epilogue */
597 imin = ibase;
598 max_offset = calc_maxoffset(imin,natoms);
600 imin = imin + 1 + max_offset;
602 /* Find largest index for which we need to use the epilogue */
603 imax = ibase + UNROLLI-1;
604 imax = (imax < end) ? imax : end;
606 max_offset = calc_maxoffset(imax,natoms);
607 imax = imax + 1 + max_offset + UNROLLJ - 1;
609 for(i=ibase;i<ibase+UNROLLI;i++)
611 /* Start of epilogue - round down to j tile limit */
612 aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
            /* Make sure we don't overlap - for small systems everything is done in the prologue */
614 aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
615 /* Round upwards to j tile limit */
616 aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
            /* Make sure we don't have a negative range for the epilogue */
618 aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
622 /* And fill it with data... */
624 for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
626 for(i=ibase;i<ibase+UNROLLI;i++)
629 nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];
631 /* Allocate aligned memory */
632 snew(pi,2*(nj+2*SIMD_WIDTH));
633 aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
635 max_offset = calc_maxoffset(i,natoms);
637 for(k=0;k<nj;k++)
639 j = aadata->jindex_gb[4*i+2] + k;
640 aadata->epilogue_mask[i][2*k] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
641 aadata->epilogue_mask[i][2*k+1] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
648 static void
649 genborn_allvsall_setup(gmx_allvsallgb2_data_t ** p_aadata,
650 gmx_localtop_t * top,
651 gmx_genborn_t * born,
652 t_mdatoms * mdatoms,
653 double radius_offset,
654 int gb_algorithm,
655 gmx_bool bInclude12,
656 gmx_bool bInclude13,
657 gmx_bool bInclude14)
659 int i,j,idx;
660 int natoms;
661 gmx_allvsallgb2_data_t *aadata;
662 double *p;
664 natoms = mdatoms->nr;
666 snew(aadata,1);
667 *p_aadata = aadata;
669 snew(p,2*natoms+2*SIMD_WIDTH);
670 aadata->x_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
671 snew(p,2*natoms+2*SIMD_WIDTH);
672 aadata->y_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
673 snew(p,2*natoms+2*SIMD_WIDTH);
674 aadata->z_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
675 snew(p,2*natoms+2*SIMD_WIDTH);
676 aadata->fx_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
677 snew(p,2*natoms+2*SIMD_WIDTH);
678 aadata->fy_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
679 snew(p,2*natoms+2*SIMD_WIDTH);
680 aadata->fz_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
682 snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
683 aadata->gb_radius = (double *) (((size_t) p + 16) & (~((size_t) 15)));
685 snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
686 aadata->workparam = (double *) (((size_t) p + 16) & (~((size_t) 15)));
688 snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
689 aadata->work = (double *) (((size_t) p + 16) & (~((size_t) 15)));
691 for(i=0;i<mdatoms->nr;i++)
693 aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
694 if(gb_algorithm==egbSTILL)
696 aadata->workparam[i] = born->vsolv[i];
698 else if(gb_algorithm==egbOBC)
700 aadata->workparam[i] = born->param[i];
702 aadata->work[i] = 0.0;
704 for(i=0;i<mdatoms->nr;i++)
706 aadata->gb_radius[natoms+i] = aadata->gb_radius[i];
707 aadata->workparam[natoms+i] = aadata->workparam[i];
708 aadata->work[natoms+i] = aadata->work[i];
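    /* The radius/parameter/work arrays are duplicated into [natoms, 2*natoms)
     * above so that the kernels can address j indices past natoms (from the
     * wrap-around in the half-shell pair scheme) without a modulo operation in
     * the inner loops.
     */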
711 for(i=0;i<2*natoms+SIMD_WIDTH;i++)
713 aadata->x_align[i] = 0.0;
714 aadata->y_align[i] = 0.0;
715 aadata->z_align[i] = 0.0;
716 aadata->fx_align[i] = 0.0;
717 aadata->fy_align[i] = 0.0;
718 aadata->fz_align[i] = 0.0;
721 setup_gb_exclusions_and_indices(aadata,top->idef.il,mdatoms->start,mdatoms->start+mdatoms->homenr,mdatoms->nr,
722 bInclude12,bInclude13,bInclude14);
/*
 * This routine apparently hits a compiler bug Visual Studio has had 'forever'.
 * It is present both in VS2005 and VS2008, and the only way around it is to
 * decrease optimization. We do that with a pragma, and only for MSVC, so it
 * will not hurt any of the well-behaving and supported compilers out there.
 * MS: Fix your compiler, it sucks like a black hole!
 */
#ifdef _MSC_VER
#pragma optimize("t",off)
#endif
738 genborn_allvsall_calc_still_radii_sse2_double(t_forcerec * fr,
739 t_mdatoms * mdatoms,
740 gmx_genborn_t * born,
741 gmx_localtop_t * top,
742 double * x,
743 t_commrec * cr,
744 void * paadata)
746 gmx_allvsallgb2_data_t *aadata;
747 int natoms;
748 int ni0,ni1;
749 int nj0,nj1,nj2,nj3;
750 int i,j,k,n;
751 int * mask;
752 int * pmask0;
753 int * pmask1;
754 int * emask0;
755 int * emask1;
756 double ix,iy,iz;
757 double jx,jy,jz;
758 double dx,dy,dz;
759 double rsq,rinv;
760 double gpi,rai,vai;
761 double prod_ai;
762 double irsq,idr4,idr6;
763 double raj,rvdw,ratio;
764 double vaj,ccf,dccf,theta,cosq;
765 double term,prod,icf4,icf6,gpi2,factor,sinq;
766 double * gb_radius;
767 double * vsolv;
768 double * work;
769 double tmpsum[2];
770 double * x_align;
771 double * y_align;
772 double * z_align;
773 int * jindex;
774 double * dadx;
776 __m128d ix_SSE0,iy_SSE0,iz_SSE0;
777 __m128d ix_SSE1,iy_SSE1,iz_SSE1;
778 __m128d gpi_SSE0,rai_SSE0,prod_ai_SSE0;
779 __m128d gpi_SSE1,rai_SSE1,prod_ai_SSE1;
780 __m128d imask_SSE0,jmask_SSE0;
781 __m128d imask_SSE1,jmask_SSE1;
782 __m128d jx_SSE,jy_SSE,jz_SSE;
783 __m128d dx_SSE0,dy_SSE0,dz_SSE0;
784 __m128d dx_SSE1,dy_SSE1,dz_SSE1;
785 __m128d rsq_SSE0,rinv_SSE0,irsq_SSE0,idr4_SSE0,idr6_SSE0;
786 __m128d rsq_SSE1,rinv_SSE1,irsq_SSE1,idr4_SSE1,idr6_SSE1;
787 __m128d raj_SSE,vaj_SSE,prod_SSE;
788 __m128d rvdw_SSE0,ratio_SSE0;
789 __m128d rvdw_SSE1,ratio_SSE1;
790 __m128d theta_SSE0,sinq_SSE0,cosq_SSE0,term_SSE0;
791 __m128d theta_SSE1,sinq_SSE1,cosq_SSE1,term_SSE1;
792 __m128d ccf_SSE0,dccf_SSE0;
793 __m128d ccf_SSE1,dccf_SSE1;
794 __m128d icf4_SSE0,icf6_SSE0;
795 __m128d icf4_SSE1,icf6_SSE1;
796 __m128d half_SSE,one_SSE,two_SSE,four_SSE;
797 __m128d still_p4_SSE,still_p5inv_SSE,still_pip5_SSE;
799 natoms = mdatoms->nr;
800 ni0 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
801 ni1 = mdatoms->start+mdatoms->homenr;
803 n = 0;
805 aadata = *((gmx_allvsallgb2_data_t **)paadata);
808 if(aadata==NULL)
810 genborn_allvsall_setup(&aadata,top,born,mdatoms,0.0,
811 egbSTILL,FALSE,FALSE,TRUE);
812 *((gmx_allvsallgb2_data_t **)paadata) = aadata;
815 x_align = aadata->x_align;
816 y_align = aadata->y_align;
817 z_align = aadata->z_align;
819 gb_radius = aadata->gb_radius;
820 vsolv = aadata->workparam;
821 work = aadata->work;
822 jindex = aadata->jindex_gb;
823 dadx = fr->dadx;
825 still_p4_SSE = _mm_set1_pd(STILL_P4);
826 still_p5inv_SSE = _mm_set1_pd(STILL_P5INV);
827 still_pip5_SSE = _mm_set1_pd(STILL_PIP5);
828 half_SSE = _mm_set1_pd(0.5);
829 one_SSE = _mm_set1_pd(1.0);
830 two_SSE = _mm_set1_pd(2.0);
831 four_SSE = _mm_set1_pd(4.0);
    /* This will be summed, so it has to extend to natoms + buffer */
    for(i=0;i<natoms+1+natoms/2;i++)
    {
        work[i] = 0;
    }

    for(i=ni0;i<ni1+1+natoms/2;i++)
    {
        k = i%natoms;
        x_align[i] = x[3*k];
        y_align[i] = x[3*k+1];
        z_align[i] = x[3*k+2];
        work[i]    = 0;
    }
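
    /* The copy above wraps the coordinates around (k = i%natoms) for the same
     * reason as the duplicated radius arrays: j indices in the half-shell
     * scheme can run up to roughly natoms + natoms/2, and those slots must
     * hold valid, aligned coordinate data.
     */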
848 for(i=ni0; i<ni1; i+=UNROLLI)
850 /* We assume shifts are NOT used for all-vs-all interactions */
851 /* Load i atom data */
852 ix_SSE0 = _mm_load1_pd(x_align+i);
853 iy_SSE0 = _mm_load1_pd(y_align+i);
854 iz_SSE0 = _mm_load1_pd(z_align+i);
855 ix_SSE1 = _mm_load1_pd(x_align+i+1);
856 iy_SSE1 = _mm_load1_pd(y_align+i+1);
857 iz_SSE1 = _mm_load1_pd(z_align+i+1);
859 gpi_SSE0 = _mm_setzero_pd();
860 gpi_SSE1 = _mm_setzero_pd();
862 rai_SSE0 = _mm_load1_pd(gb_radius+i);
863 rai_SSE1 = _mm_load1_pd(gb_radius+i+1);
865 prod_ai_SSE0 = _mm_set1_pd(STILL_P4*vsolv[i]);
866 prod_ai_SSE1 = _mm_set1_pd(STILL_P4*vsolv[i+1]);
868 /* Load limits for loop over neighbors */
869 nj0 = jindex[4*i];
870 nj1 = jindex[4*i+1];
871 nj2 = jindex[4*i+2];
872 nj3 = jindex[4*i+3];
874 pmask0 = aadata->prologue_mask_gb[i];
875 pmask1 = aadata->prologue_mask_gb[i+1];
876 emask0 = aadata->epilogue_mask[i];
877 emask1 = aadata->epilogue_mask[i+1];
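        /* Each mask entry is written as two consecutive 32-bit ints (2*i and
         * 2*i+1), so the 8 bytes loaded through a double pointer below form an
         * all-ones or all-zeros 64-bit pattern that _mm_load1_pd broadcasts to
         * both lanes; it can then be combined with the double-precision data
         * via bitwise _mm_and_pd.
         */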
879 imask_SSE0 = _mm_load1_pd((double *)(aadata->imask+2*i));
880 imask_SSE1 = _mm_load1_pd((double *)(aadata->imask+2*i+2));
882 /* Prologue part, including exclusion mask */
883 for(j=nj0; j<nj1; j+=UNROLLJ)
885 jmask_SSE0 = _mm_load_pd((double *)pmask0);
886 jmask_SSE1 = _mm_load_pd((double *)pmask1);
887 pmask0 += 2*UNROLLJ;
888 pmask1 += 2*UNROLLJ;
890 /* load j atom coordinates */
891 jx_SSE = _mm_load_pd(x_align+j);
892 jy_SSE = _mm_load_pd(y_align+j);
893 jz_SSE = _mm_load_pd(z_align+j);
895 /* Calculate distance */
896 dx_SSE0 = _mm_sub_pd(ix_SSE0,jx_SSE);
897 dy_SSE0 = _mm_sub_pd(iy_SSE0,jy_SSE);
898 dz_SSE0 = _mm_sub_pd(iz_SSE0,jz_SSE);
899 dx_SSE1 = _mm_sub_pd(ix_SSE1,jx_SSE);
900 dy_SSE1 = _mm_sub_pd(iy_SSE1,jy_SSE);
901 dz_SSE1 = _mm_sub_pd(iz_SSE1,jz_SSE);
903 /* rsq = dx*dx+dy*dy+dz*dz */
904 rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0,dy_SSE0,dz_SSE0);
905 rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1,dy_SSE1,dz_SSE1);
907 /* Combine masks */
908 jmask_SSE0 = _mm_and_pd(jmask_SSE0,imask_SSE0);
909 jmask_SSE1 = _mm_and_pd(jmask_SSE1,imask_SSE1);
911 /* Calculate 1/r and 1/r2 */
912 rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
913 rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
915 /* Apply mask */
916 rinv_SSE0 = _mm_and_pd(rinv_SSE0,jmask_SSE0);
917 rinv_SSE1 = _mm_and_pd(rinv_SSE1,jmask_SSE1);
919 irsq_SSE0 = _mm_mul_pd(rinv_SSE0,rinv_SSE0);
920 irsq_SSE1 = _mm_mul_pd(rinv_SSE1,rinv_SSE1);
921 idr4_SSE0 = _mm_mul_pd(irsq_SSE0,irsq_SSE0);
922 idr4_SSE1 = _mm_mul_pd(irsq_SSE1,irsq_SSE1);
923 idr6_SSE0 = _mm_mul_pd(idr4_SSE0,irsq_SSE0);
924 idr6_SSE1 = _mm_mul_pd(idr4_SSE1,irsq_SSE1);
926 raj_SSE = _mm_load_pd(gb_radius+j);
927 vaj_SSE = _mm_load_pd(vsolv+j);
929 rvdw_SSE0 = _mm_add_pd(rai_SSE0,raj_SSE);
930 rvdw_SSE1 = _mm_add_pd(rai_SSE1,raj_SSE);
932 ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0,rvdw_SSE0)));
933 ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1,rvdw_SSE1)));
935 ratio_SSE0 = _mm_min_pd(ratio_SSE0,still_p5inv_SSE);
936 ratio_SSE1 = _mm_min_pd(ratio_SSE1,still_p5inv_SSE);
937 theta_SSE0 = _mm_mul_pd(ratio_SSE0,still_pip5_SSE);
938 theta_SSE1 = _mm_mul_pd(ratio_SSE1,still_pip5_SSE);
939 gmx_mm_sincos_pd(theta_SSE0,&sinq_SSE0,&cosq_SSE0);
940 gmx_mm_sincos_pd(theta_SSE1,&sinq_SSE1,&cosq_SSE1);
941 term_SSE0 = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE0));
942 term_SSE1 = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE1));
943 ccf_SSE0 = _mm_mul_pd(term_SSE0,term_SSE0);
944 ccf_SSE1 = _mm_mul_pd(term_SSE1,term_SSE1);
945 dccf_SSE0 = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE0),
946 _mm_mul_pd(sinq_SSE0,theta_SSE0));
947 dccf_SSE1 = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE1),
948 _mm_mul_pd(sinq_SSE1,theta_SSE1));
950 prod_SSE = _mm_mul_pd(still_p4_SSE,vaj_SSE);
951 icf4_SSE0 = _mm_mul_pd(ccf_SSE0,idr4_SSE0);
952 icf4_SSE1 = _mm_mul_pd(ccf_SSE1,idr4_SSE1);
953 icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
954 icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);
956 _mm_store_pd(work+j , _mm_add_pd(_mm_load_pd(work+j),
957 _mm_add_pd(_mm_mul_pd(prod_ai_SSE0,icf4_SSE0),
958 _mm_mul_pd(prod_ai_SSE1,icf4_SSE1))));
961 gpi_SSE0 = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE,icf4_SSE0));
962 gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE,icf4_SSE1));
964 /* Save ai->aj and aj->ai chain rule terms */
965 _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE0));
966 dadx+=2;
967 _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE1));
968 dadx+=2;
970 _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE0,icf6_SSE0));
971 dadx+=2;
972 _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE1,icf6_SSE1));
973 dadx+=2;
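            /* dadx layout: for every j pair, eight doubles are stored above in
             * fixed order - the ai->aj chain-rule terms (prod*icf6) for the two
             * i atoms, followed by the aj->ai terms (prod_ai*icf6) - and the
             * corresponding chain-rule force routine is expected to read them
             * back in the same order.
             */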
976 /* Main part, no exclusions */
977 for(j=nj1; j<nj2; j+=UNROLLJ)
980 /* load j atom coordinates */
981 jx_SSE = _mm_load_pd(x_align+j);
982 jy_SSE = _mm_load_pd(y_align+j);
983 jz_SSE = _mm_load_pd(z_align+j);
985 /* Calculate distance */
986 dx_SSE0 = _mm_sub_pd(ix_SSE0,jx_SSE);
987 dy_SSE0 = _mm_sub_pd(iy_SSE0,jy_SSE);
988 dz_SSE0 = _mm_sub_pd(iz_SSE0,jz_SSE);
989 dx_SSE1 = _mm_sub_pd(ix_SSE1,jx_SSE);
990 dy_SSE1 = _mm_sub_pd(iy_SSE1,jy_SSE);
991 dz_SSE1 = _mm_sub_pd(iz_SSE1,jz_SSE);
993 /* rsq = dx*dx+dy*dy+dz*dz */
994 rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0,dy_SSE0,dz_SSE0);
995 rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1,dy_SSE1,dz_SSE1);
997 /* Calculate 1/r and 1/r2 */
998 rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
999 rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
1001 /* Apply mask */
1002 rinv_SSE0 = _mm_and_pd(rinv_SSE0,imask_SSE0);
1003 rinv_SSE1 = _mm_and_pd(rinv_SSE1,imask_SSE1);
1005 irsq_SSE0 = _mm_mul_pd(rinv_SSE0,rinv_SSE0);
1006 irsq_SSE1 = _mm_mul_pd(rinv_SSE1,rinv_SSE1);
1007 idr4_SSE0 = _mm_mul_pd(irsq_SSE0,irsq_SSE0);
1008 idr4_SSE1 = _mm_mul_pd(irsq_SSE1,irsq_SSE1);
1009 idr6_SSE0 = _mm_mul_pd(idr4_SSE0,irsq_SSE0);
1010 idr6_SSE1 = _mm_mul_pd(idr4_SSE1,irsq_SSE1);
1012 raj_SSE = _mm_load_pd(gb_radius+j);
1014 rvdw_SSE0 = _mm_add_pd(rai_SSE0,raj_SSE);
1015 rvdw_SSE1 = _mm_add_pd(rai_SSE1,raj_SSE);
1016 vaj_SSE = _mm_load_pd(vsolv+j);
1018 ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0,rvdw_SSE0)));
1019 ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1,rvdw_SSE1)));
1021 ratio_SSE0 = _mm_min_pd(ratio_SSE0,still_p5inv_SSE);
1022 ratio_SSE1 = _mm_min_pd(ratio_SSE1,still_p5inv_SSE);
1023 theta_SSE0 = _mm_mul_pd(ratio_SSE0,still_pip5_SSE);
1024 theta_SSE1 = _mm_mul_pd(ratio_SSE1,still_pip5_SSE);
1025 gmx_mm_sincos_pd(theta_SSE0,&sinq_SSE0,&cosq_SSE0);
1026 gmx_mm_sincos_pd(theta_SSE1,&sinq_SSE1,&cosq_SSE1);
1027 term_SSE0 = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE0));
1028 term_SSE1 = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE1));
1029 ccf_SSE0 = _mm_mul_pd(term_SSE0,term_SSE0);
1030 ccf_SSE1 = _mm_mul_pd(term_SSE1,term_SSE1);
1031 dccf_SSE0 = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE0),
1032 _mm_mul_pd(sinq_SSE0,theta_SSE0));
1033 dccf_SSE1 = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE1),
1034 _mm_mul_pd(sinq_SSE1,theta_SSE1));
1036 prod_SSE = _mm_mul_pd(still_p4_SSE,vaj_SSE );
1037 icf4_SSE0 = _mm_mul_pd(ccf_SSE0,idr4_SSE0);
1038 icf4_SSE1 = _mm_mul_pd(ccf_SSE1,idr4_SSE1);
1039 icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
1040 icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);
1042 _mm_store_pd(work+j , _mm_add_pd(_mm_load_pd(work+j),
1043 _mm_add_pd(_mm_mul_pd(prod_ai_SSE0,icf4_SSE0),
1044 _mm_mul_pd(prod_ai_SSE1,icf4_SSE1))));
1046 gpi_SSE0 = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE,icf4_SSE0));
1047 gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE,icf4_SSE1));
1049 /* Save ai->aj and aj->ai chain rule terms */
1050 _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE0));
1051 dadx+=2;
1052 _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE1));
1053 dadx+=2;
1055 _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE0,icf6_SSE0));
1056 dadx+=2;
1057 _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE1,icf6_SSE1));
1058 dadx+=2;
1060 /* Epilogue part, including exclusion mask */
1061 for(j=nj2; j<nj3; j+=UNROLLJ)
1063 jmask_SSE0 = _mm_load_pd((double *)emask0);
1064 jmask_SSE1 = _mm_load_pd((double *)emask1);
1065 emask0 += 2*UNROLLJ;
1066 emask1 += 2*UNROLLJ;
1068 /* load j atom coordinates */
1069 jx_SSE = _mm_load_pd(x_align+j);
1070 jy_SSE = _mm_load_pd(y_align+j);
1071 jz_SSE = _mm_load_pd(z_align+j);
1073 /* Calculate distance */
1074 dx_SSE0 = _mm_sub_pd(ix_SSE0,jx_SSE);
1075 dy_SSE0 = _mm_sub_pd(iy_SSE0,jy_SSE);
1076 dz_SSE0 = _mm_sub_pd(iz_SSE0,jz_SSE);
1077 dx_SSE1 = _mm_sub_pd(ix_SSE1,jx_SSE);
1078 dy_SSE1 = _mm_sub_pd(iy_SSE1,jy_SSE);
1079 dz_SSE1 = _mm_sub_pd(iz_SSE1,jz_SSE);
1081 /* rsq = dx*dx+dy*dy+dz*dz */
1082 rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0,dy_SSE0,dz_SSE0);
1083 rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1,dy_SSE1,dz_SSE1);
1085 /* Combine masks */
1086 jmask_SSE0 = _mm_and_pd(jmask_SSE0,imask_SSE0);
1087 jmask_SSE1 = _mm_and_pd(jmask_SSE1,imask_SSE1);
1089 /* Calculate 1/r and 1/r2 */
1090 rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
1091 rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
1093 /* Apply mask */
1094 rinv_SSE0 = _mm_and_pd(rinv_SSE0,jmask_SSE0);
1095 rinv_SSE1 = _mm_and_pd(rinv_SSE1,jmask_SSE1);
1097 irsq_SSE0 = _mm_mul_pd(rinv_SSE0,rinv_SSE0);
1098 irsq_SSE1 = _mm_mul_pd(rinv_SSE1,rinv_SSE1);
1099 idr4_SSE0 = _mm_mul_pd(irsq_SSE0,irsq_SSE0);
1100 idr4_SSE1 = _mm_mul_pd(irsq_SSE1,irsq_SSE1);
1101 idr6_SSE0 = _mm_mul_pd(idr4_SSE0,irsq_SSE0);
1102 idr6_SSE1 = _mm_mul_pd(idr4_SSE1,irsq_SSE1);
1104 raj_SSE = _mm_load_pd(gb_radius+j);
1105 vaj_SSE = _mm_load_pd(vsolv+j);
1107 rvdw_SSE0 = _mm_add_pd(rai_SSE0,raj_SSE);
1108 rvdw_SSE1 = _mm_add_pd(rai_SSE1,raj_SSE);
1110 ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0,rvdw_SSE0)));
1111 ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1,rvdw_SSE1)));
1113 ratio_SSE0 = _mm_min_pd(ratio_SSE0,still_p5inv_SSE);
1114 ratio_SSE1 = _mm_min_pd(ratio_SSE1,still_p5inv_SSE);
1115 theta_SSE0 = _mm_mul_pd(ratio_SSE0,still_pip5_SSE);
1116 theta_SSE1 = _mm_mul_pd(ratio_SSE1,still_pip5_SSE);
1117 gmx_mm_sincos_pd(theta_SSE0,&sinq_SSE0,&cosq_SSE0);
1118 gmx_mm_sincos_pd(theta_SSE1,&sinq_SSE1,&cosq_SSE1);
1119 term_SSE0 = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE0));
1120 term_SSE1 = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE1));
1121 ccf_SSE0 = _mm_mul_pd(term_SSE0,term_SSE0);
1122 ccf_SSE1 = _mm_mul_pd(term_SSE1,term_SSE1);
1123 dccf_SSE0 = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE0),
1124 _mm_mul_pd(sinq_SSE0,theta_SSE0));
1125 dccf_SSE1 = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE1),
1126 _mm_mul_pd(sinq_SSE1,theta_SSE1));
1128 prod_SSE = _mm_mul_pd(still_p4_SSE,vaj_SSE);
1129 icf4_SSE0 = _mm_mul_pd(ccf_SSE0,idr4_SSE0);
1130 icf4_SSE1 = _mm_mul_pd(ccf_SSE1,idr4_SSE1);
1131 icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
1132 icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);
1134 _mm_store_pd(work+j , _mm_add_pd(_mm_load_pd(work+j),
1135 _mm_add_pd(_mm_mul_pd(prod_ai_SSE0,icf4_SSE0),
1136 _mm_mul_pd(prod_ai_SSE1,icf4_SSE1))));
1138 gpi_SSE0 = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE,icf4_SSE0));
1139 gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE,icf4_SSE1));
1141 /* Save ai->aj and aj->ai chain rule terms */
1142 _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE0));
1143 dadx+=2;
1144 _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE1));
1145 dadx+=2;
1147 _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE0,icf6_SSE0));
1148 dadx+=2;
1149 _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE1,icf6_SSE1));
1150 dadx+=2;
1152 GMX_MM_TRANSPOSE2_PD(gpi_SSE0,gpi_SSE1);
1153 gpi_SSE0 = _mm_add_pd(gpi_SSE0,gpi_SSE1);
1154 _mm_store_pd(work+i, _mm_add_pd(gpi_SSE0, _mm_load_pd(work+i)));
    /* In case we have written anything beyond natoms, move it back.
     * Never mind that we leave stuff above natoms; that will not
     * be accessed later in the routine.
     * In principle this should be a move rather than a sum, but this
     * way we don't have to worry about even/odd offsets...
     */
    for(i=natoms;i<ni1+1+natoms/2;i++)
    {
        work[i-natoms] += work[i];
    }
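
    /* Below, the effective Still Born radius is assembled from the
     * polarization sum: gpi = gpol[i] + work[i], and
     * bRad[i] = 0.5*ONE_4PI_EPS0/|gpi|, with invsqrta[i] = 1/sqrt(bRad[i])
     * cached for the generalized-Born energy/force kernels.
     */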
    /* Parallel summations */
    if(PARTDECOMP(cr))
    {
        gmx_sum(natoms,work,cr);
    }

    factor = 0.5 * ONE_4PI_EPS0;
    /* Calculate the radii - should we do all atoms, or just our local ones? */
    for(i=0;i<natoms;i++)
    {
        if(born->use[i] != 0)
        {
            gpi             = born->gpol[i]+work[i];
            gpi2            = gpi * gpi;
            born->bRad[i]   = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        }
    }

    return 0;
}

/* Reinstate MSVC optimization */
#ifdef _MSC_VER
#pragma optimize("",on)
#endif
1196 genborn_allvsall_calc_hct_obc_radii_sse2_double(t_forcerec * fr,
1197 t_mdatoms * mdatoms,
1198 gmx_genborn_t * born,
1199 int gb_algorithm,
1200 gmx_localtop_t * top,
1201 double * x,
1202 t_commrec * cr,
1203 void * paadata)
1205 gmx_allvsallgb2_data_t *aadata;
1206 int natoms;
1207 int ni0,ni1;
1208 int nj0,nj1,nj2,nj3;
1209 int i,j,k,n;
1210 int * mask;
1211 int * pmask0;
1212 int * pmask1;
1213 int * emask0;
1214 int * emask1;
1215 double * gb_radius;
1216 double * vsolv;
1217 double * work;
1218 double tmpsum[2];
1219 double * x_align;
1220 double * y_align;
1221 double * z_align;
1222 int * jindex;
1223 double * dadx;
1224 double * obc_param;
1225 double rad,min_rad;
1226 double rai,rai_inv,rai_inv2,sum_ai,sum_ai2,sum_ai3,tsum,tchain;
1228 __m128d ix_SSE0,iy_SSE0,iz_SSE0;
1229 __m128d ix_SSE1,iy_SSE1,iz_SSE1;
1230 __m128d gpi_SSE0,rai_SSE0,prod_ai_SSE0;
1231 __m128d gpi_SSE1,rai_SSE1,prod_ai_SSE1;
1232 __m128d imask_SSE0,jmask_SSE0;
1233 __m128d imask_SSE1,jmask_SSE1;
1234 __m128d jx_SSE,jy_SSE,jz_SSE;
1235 __m128d dx_SSE0,dy_SSE0,dz_SSE0;
1236 __m128d dx_SSE1,dy_SSE1,dz_SSE1;
1237 __m128d rsq_SSE0,rinv_SSE0,irsq_SSE0,idr4_SSE0,idr6_SSE0;
1238 __m128d rsq_SSE1,rinv_SSE1,irsq_SSE1,idr4_SSE1,idr6_SSE1;
1239 __m128d raj_SSE,raj_inv_SSE,sk_aj_SSE,sk2_aj_SSE;
1240 __m128d ccf_SSE0,dccf_SSE0,prod_SSE0;
1241 __m128d ccf_SSE1,dccf_SSE1,prod_SSE1;
1242 __m128d icf4_SSE0,icf6_SSE0;
1243 __m128d icf4_SSE1,icf6_SSE1;
1244 __m128d oneeighth_SSE,onefourth_SSE,half_SSE,one_SSE,two_SSE,four_SSE;
1245 __m128d still_p4_SSE,still_p5inv_SSE,still_pip5_SSE;
1246 __m128d rai_inv_SSE0;
1247 __m128d rai_inv_SSE1;
1248 __m128d sk_ai_SSE0,sk2_ai_SSE0,sum_ai_SSE0;
1249 __m128d sk_ai_SSE1,sk2_ai_SSE1,sum_ai_SSE1;
1250 __m128d lij_inv_SSE0,sk2_rinv_SSE0;
1251 __m128d lij_inv_SSE1,sk2_rinv_SSE1;
1252 __m128d dr_SSE0;
1253 __m128d dr_SSE1;
1254 __m128d t1_SSE0,t2_SSE0,t3_SSE0,t4_SSE0;
1255 __m128d t1_SSE1,t2_SSE1,t3_SSE1,t4_SSE1;
1256 __m128d obc_mask1_SSE0,obc_mask2_SSE0,obc_mask3_SSE0;
1257 __m128d obc_mask1_SSE1,obc_mask2_SSE1,obc_mask3_SSE1;
1258 __m128d uij_SSE0,uij2_SSE0,uij3_SSE0;
1259 __m128d uij_SSE1,uij2_SSE1,uij3_SSE1;
1260 __m128d lij_SSE0,lij2_SSE0,lij3_SSE0;
1261 __m128d lij_SSE1,lij2_SSE1,lij3_SSE1;
1262 __m128d dlij_SSE0,diff2_SSE0,logterm_SSE0;
1263 __m128d dlij_SSE1,diff2_SSE1,logterm_SSE1;
1264 __m128d doffset_SSE,tmpSSE;
1266 natoms = mdatoms->nr;
1267 ni0 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
1268 ni1 = mdatoms->start+mdatoms->homenr;
1270 n = 0;
1272 aadata = *((gmx_allvsallgb2_data_t **)paadata);
1275 if(aadata==NULL)
1277 genborn_allvsall_setup(&aadata,top,born,mdatoms,born->gb_doffset,
1278 egbOBC,TRUE,TRUE,TRUE);
1279 *((gmx_allvsallgb2_data_t **)paadata) = aadata;
1282 x_align = aadata->x_align;
1283 y_align = aadata->y_align;
1284 z_align = aadata->z_align;
1286 gb_radius = aadata->gb_radius;
1287 work = aadata->work;
1288 jindex = aadata->jindex_gb;
1289 dadx = fr->dadx;
1290 obc_param = aadata->workparam;
1292 oneeighth_SSE = _mm_set1_pd(0.125);
1293 onefourth_SSE = _mm_set1_pd(0.25);
1294 half_SSE = _mm_set1_pd(0.5);
1295 one_SSE = _mm_set1_pd(1.0);
1296 two_SSE = _mm_set1_pd(2.0);
1297 four_SSE = _mm_set1_pd(4.0);
1298 doffset_SSE = _mm_set1_pd(born->gb_doffset);
1300 for(i=0;i<natoms;i++)
1302 x_align[i] = x[3*i];
1303 y_align[i] = x[3*i+1];
1304 z_align[i] = x[3*i+2];
1307 /* Copy again */
1308 for(i=0;i<natoms/2+1;i++)
1310 x_align[natoms+i] = x_align[i];
1311 y_align[natoms+i] = y_align[i];
1312 z_align[natoms+i] = z_align[i];
1315 for(i=0;i<natoms+natoms/2+1;i++)
1317 work[i] = 0;
1320 for(i=ni0; i<ni1; i+= UNROLLI)
1322 /* We assume shifts are NOT used for all-vs-all interactions */
1324 /* Load i atom data */
1325 ix_SSE0 = _mm_load1_pd(x_align+i);
1326 iy_SSE0 = _mm_load1_pd(y_align+i);
1327 iz_SSE0 = _mm_load1_pd(z_align+i);
1328 ix_SSE1 = _mm_load1_pd(x_align+i+1);
1329 iy_SSE1 = _mm_load1_pd(y_align+i+1);
1330 iz_SSE1 = _mm_load1_pd(z_align+i+1);
1332 rai_SSE0 = _mm_load1_pd(gb_radius+i);
1333 rai_SSE1 = _mm_load1_pd(gb_radius+i+1);
1334 rai_inv_SSE0 = gmx_mm_inv_pd(rai_SSE0);
1335 rai_inv_SSE1 = gmx_mm_inv_pd(rai_SSE1);
1337 sk_ai_SSE0 = _mm_load1_pd(obc_param+i);
1338 sk_ai_SSE1 = _mm_load1_pd(obc_param+i+1);
1339 sk2_ai_SSE0 = _mm_mul_pd(sk_ai_SSE0,sk_ai_SSE0);
1340 sk2_ai_SSE1 = _mm_mul_pd(sk_ai_SSE1,sk_ai_SSE1);
1342 sum_ai_SSE0 = _mm_setzero_pd();
1343 sum_ai_SSE1 = _mm_setzero_pd();
1345 /* Load limits for loop over neighbors */
1346 nj0 = jindex[4*i];
1347 nj1 = jindex[4*i+1];
1348 nj2 = jindex[4*i+2];
1349 nj3 = jindex[4*i+3];
1351 pmask0 = aadata->prologue_mask_gb[i];
1352 pmask1 = aadata->prologue_mask_gb[i+1];
1353 emask0 = aadata->epilogue_mask[i];
1354 emask1 = aadata->epilogue_mask[i+1];
1356 imask_SSE0 = _mm_load1_pd((double *)(aadata->imask+2*i));
1357 imask_SSE1 = _mm_load1_pd((double *)(aadata->imask+2*i+2));
1359 /* Prologue part, including exclusion mask */
1360 for(j=nj0; j<nj1; j+=UNROLLJ)
1362 jmask_SSE0 = _mm_load_pd((double *)pmask0);
1363 jmask_SSE1 = _mm_load_pd((double *)pmask1);
1364 pmask0 += 2*UNROLLJ;
1365 pmask1 += 2*UNROLLJ;
1367 /* load j atom coordinates */
1368 jx_SSE = _mm_load_pd(x_align+j);
1369 jy_SSE = _mm_load_pd(y_align+j);
1370 jz_SSE = _mm_load_pd(z_align+j);
1372 /* Calculate distance */
1373 dx_SSE0 = _mm_sub_pd(ix_SSE0,jx_SSE);
1374 dy_SSE0 = _mm_sub_pd(iy_SSE0,jy_SSE);
1375 dz_SSE0 = _mm_sub_pd(iz_SSE0,jz_SSE);
1376 dx_SSE1 = _mm_sub_pd(ix_SSE1,jx_SSE);
1377 dy_SSE1 = _mm_sub_pd(iy_SSE1,jy_SSE);
1378 dz_SSE1 = _mm_sub_pd(iz_SSE1,jz_SSE);
1380 /* rsq = dx*dx+dy*dy+dz*dz */
1381 rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0,dy_SSE0,dz_SSE0);
1382 rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1,dy_SSE1,dz_SSE1);
1384 /* Combine masks */
1385 jmask_SSE0 = _mm_and_pd(jmask_SSE0,imask_SSE0);
1386 jmask_SSE1 = _mm_and_pd(jmask_SSE1,imask_SSE1);
1388 /* Calculate 1/r and 1/r2 */
1389 rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
1390 rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
1392 /* Apply mask */
1393 rinv_SSE0 = _mm_and_pd(rinv_SSE0,jmask_SSE0);
1394 rinv_SSE1 = _mm_and_pd(rinv_SSE1,jmask_SSE1);
1396 dr_SSE0 = _mm_mul_pd(rsq_SSE0,rinv_SSE0);
1397 dr_SSE1 = _mm_mul_pd(rsq_SSE1,rinv_SSE1);
1399 sk_aj_SSE = _mm_load_pd(obc_param+j);
1400 raj_SSE = _mm_load_pd(gb_radius+j);
1401 raj_inv_SSE = gmx_mm_inv_pd(raj_SSE);
1403 /* Evaluate influence of atom aj -> ai */
1404 t1_SSE0 = _mm_add_pd(dr_SSE0,sk_aj_SSE);
1405 t1_SSE1 = _mm_add_pd(dr_SSE1,sk_aj_SSE);
1406 t2_SSE0 = _mm_sub_pd(dr_SSE0,sk_aj_SSE);
1407 t2_SSE1 = _mm_sub_pd(dr_SSE1,sk_aj_SSE);
1408 t3_SSE0 = _mm_sub_pd(sk_aj_SSE,dr_SSE0);
1409 t3_SSE1 = _mm_sub_pd(sk_aj_SSE,dr_SSE1);
1411 obc_mask1_SSE0 = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
1412 obc_mask1_SSE1 = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
1413 obc_mask2_SSE0 = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
1414 obc_mask2_SSE1 = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
1415 obc_mask3_SSE0 = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
1416 obc_mask3_SSE1 = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
1417 obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0,jmask_SSE0);
1418 obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1,jmask_SSE1);
1420 uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
1421 uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
1422 lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0,gmx_mm_inv_pd(t2_SSE0)),
1423 _mm_andnot_pd(obc_mask2_SSE0,rai_inv_SSE0));
1424 lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1,gmx_mm_inv_pd(t2_SSE1)),
1425 _mm_andnot_pd(obc_mask2_SSE1,rai_inv_SSE1));
1426 dlij_SSE0 = _mm_and_pd(one_SSE,obc_mask2_SSE0);
1427 dlij_SSE1 = _mm_and_pd(one_SSE,obc_mask2_SSE1);
1429 uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
1430 uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
1431 uij3_SSE0 = _mm_mul_pd(uij2_SSE0,uij_SSE0);
1432 uij3_SSE1 = _mm_mul_pd(uij2_SSE1,uij_SSE1);
1433 lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
1434 lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
1435 lij3_SSE0 = _mm_mul_pd(lij2_SSE0,lij_SSE0);
1436 lij3_SSE1 = _mm_mul_pd(lij2_SSE1,lij_SSE1);
1438 diff2_SSE0 = _mm_sub_pd(uij2_SSE0,lij2_SSE0);
1439 diff2_SSE1 = _mm_sub_pd(uij2_SSE1,lij2_SSE1);
1440 lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
1441 lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
1442 sk2_aj_SSE = _mm_mul_pd(sk_aj_SSE,sk_aj_SSE);
1443 sk2_rinv_SSE0 = _mm_mul_pd(sk2_aj_SSE,rinv_SSE0);
1444 sk2_rinv_SSE1 = _mm_mul_pd(sk2_aj_SSE,rinv_SSE1);
1445 prod_SSE0 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE0);
1446 prod_SSE1 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE1);
1448 logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0,lij_inv_SSE0));
1449 logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1,lij_inv_SSE1));
1451 t1_SSE0 = _mm_sub_pd(lij_SSE0,uij_SSE0);
1452 t1_SSE1 = _mm_sub_pd(lij_SSE1,uij_SSE1);
1453 t2_SSE0 = _mm_mul_pd(diff2_SSE0,
1454 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE0),
1455 prod_SSE0));
1456 t2_SSE1 = _mm_mul_pd(diff2_SSE1,
1457 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE1),
1458 prod_SSE1));
1460 t3_SSE0 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE0,logterm_SSE0));
1461 t3_SSE1 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE1,logterm_SSE1));
1462 t1_SSE0 = _mm_add_pd(t1_SSE0,_mm_add_pd(t2_SSE0,t3_SSE0));
1463 t1_SSE1 = _mm_add_pd(t1_SSE1,_mm_add_pd(t2_SSE1,t3_SSE1));
1464 t4_SSE0 = _mm_mul_pd(two_SSE,_mm_sub_pd(rai_inv_SSE0,lij_SSE0));
1465 t4_SSE1 = _mm_mul_pd(two_SSE,_mm_sub_pd(rai_inv_SSE1,lij_SSE1));
1466 t4_SSE0 = _mm_and_pd(t4_SSE0,obc_mask3_SSE0);
1467 t4_SSE1 = _mm_and_pd(t4_SSE1,obc_mask3_SSE1);
1468 t1_SSE0 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE0,t4_SSE0));
1469 t1_SSE1 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE1,t4_SSE1));
1471 sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
1472 sum_ai_SSE1 = _mm_add_pd(sum_ai_SSE1,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
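            /* t1 above is the HCT/OBC pairwise descreening contribution of
             * atom j to the Born sum of atom i; it is accumulated only when
             * rai < dr + sk_aj (obc_mask1), i.e. when the scaled sphere of
             * atom j reaches outside the radius of atom i.
             */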
1474 t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE0),
1475 _mm_mul_pd(prod_SSE0,lij3_SSE0));
1476 t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE1),
1477 _mm_mul_pd(prod_SSE1,lij3_SSE1));
1478 t1_SSE0 = _mm_sub_pd(t1_SSE0,
1479 _mm_mul_pd(onefourth_SSE,
1480 _mm_add_pd(_mm_mul_pd(lij_SSE0,rinv_SSE0),
1481 _mm_mul_pd(lij3_SSE0,dr_SSE0))));
1482 t1_SSE1 = _mm_sub_pd(t1_SSE1,
1483 _mm_mul_pd(onefourth_SSE,
1484 _mm_add_pd(_mm_mul_pd(lij_SSE1,rinv_SSE1),
1485 _mm_mul_pd(lij3_SSE1,dr_SSE1))));
1487 t2_SSE0 = _mm_mul_pd(onefourth_SSE,
1488 _mm_add_pd(_mm_mul_pd(uij_SSE0,rinv_SSE0),
1489 _mm_mul_pd(uij3_SSE0,dr_SSE0)));
1490 t2_SSE1 = _mm_mul_pd(onefourth_SSE,
1491 _mm_add_pd(_mm_mul_pd(uij_SSE1,rinv_SSE1),
1492 _mm_mul_pd(uij3_SSE1,dr_SSE1)));
1493 t2_SSE0 = _mm_sub_pd(t2_SSE0,
1494 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE0),
1495 _mm_mul_pd(prod_SSE0,uij3_SSE0)));
1496 t2_SSE1 = _mm_sub_pd(t2_SSE1,
1497 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE1),
1498 _mm_mul_pd(prod_SSE1,uij3_SSE1)));
1499 t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE0),
1500 _mm_mul_pd(rinv_SSE0,rinv_SSE0));
1501 t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE1),
1502 _mm_mul_pd(rinv_SSE1,rinv_SSE1));
1503 t3_SSE0 = _mm_sub_pd(t3_SSE0,
1504 _mm_mul_pd(_mm_mul_pd(diff2_SSE0,oneeighth_SSE),
1505 _mm_add_pd(one_SSE,
1506 _mm_mul_pd(sk2_rinv_SSE0,rinv_SSE0))));
1507 t3_SSE1 = _mm_sub_pd(t3_SSE1,
1508 _mm_mul_pd(_mm_mul_pd(diff2_SSE1,oneeighth_SSE),
1509 _mm_add_pd(one_SSE,
1510 _mm_mul_pd(sk2_rinv_SSE1,rinv_SSE1))));
1512 t1_SSE0 = _mm_mul_pd(rinv_SSE0,
1513 _mm_add_pd(_mm_mul_pd(dlij_SSE0,t1_SSE0),
1514 _mm_add_pd(t2_SSE0,t3_SSE0)));
1515 t1_SSE1 = _mm_mul_pd(rinv_SSE1,
1516 _mm_add_pd(_mm_mul_pd(dlij_SSE1,t1_SSE1),
1517 _mm_add_pd(t2_SSE1,t3_SSE1)));
1519 _mm_store_pd(dadx,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
1520 dadx += 2;
1521 _mm_store_pd(dadx,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
1522 dadx += 2;
1524 /* Evaluate influence of atom ai -> aj */
1525 t1_SSE0 = _mm_add_pd(dr_SSE0,sk_ai_SSE0);
1526 t1_SSE1 = _mm_add_pd(dr_SSE1,sk_ai_SSE1);
1527 t2_SSE0 = _mm_sub_pd(dr_SSE0,sk_ai_SSE0);
1528 t2_SSE1 = _mm_sub_pd(dr_SSE1,sk_ai_SSE1);
1529 t3_SSE0 = _mm_sub_pd(sk_ai_SSE0,dr_SSE0);
1530 t3_SSE1 = _mm_sub_pd(sk_ai_SSE1,dr_SSE1);
1532 obc_mask1_SSE0 = _mm_cmplt_pd(raj_SSE, t1_SSE0);
1533 obc_mask1_SSE1 = _mm_cmplt_pd(raj_SSE, t1_SSE1);
1534 obc_mask2_SSE0 = _mm_cmplt_pd(raj_SSE, t2_SSE0);
1535 obc_mask2_SSE1 = _mm_cmplt_pd(raj_SSE, t2_SSE1);
1536 obc_mask3_SSE0 = _mm_cmplt_pd(raj_SSE, t3_SSE0);
1537 obc_mask3_SSE1 = _mm_cmplt_pd(raj_SSE, t3_SSE1);
1538 obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0,jmask_SSE0);
1539 obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1,jmask_SSE1);
1541 uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
1542 uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
1543 lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0,gmx_mm_inv_pd(t2_SSE0)),
1544 _mm_andnot_pd(obc_mask2_SSE0,raj_inv_SSE));
1545 lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1,gmx_mm_inv_pd(t2_SSE1)),
1546 _mm_andnot_pd(obc_mask2_SSE1,raj_inv_SSE));
1547 dlij_SSE0 = _mm_and_pd(one_SSE,obc_mask2_SSE0);
1548 dlij_SSE1 = _mm_and_pd(one_SSE,obc_mask2_SSE1);
1550 uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
1551 uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
1552 uij3_SSE0 = _mm_mul_pd(uij2_SSE0,uij_SSE0);
1553 uij3_SSE1 = _mm_mul_pd(uij2_SSE1,uij_SSE1);
1554 lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
1555 lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
1556 lij3_SSE0 = _mm_mul_pd(lij2_SSE0,lij_SSE0);
1557 lij3_SSE1 = _mm_mul_pd(lij2_SSE1,lij_SSE1);
1559 diff2_SSE0 = _mm_sub_pd(uij2_SSE0,lij2_SSE0);
1560 diff2_SSE1 = _mm_sub_pd(uij2_SSE1,lij2_SSE1);
1561 lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
1562 lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
1563 sk2_rinv_SSE0 = _mm_mul_pd(sk2_ai_SSE0,rinv_SSE0);
1564 sk2_rinv_SSE1 = _mm_mul_pd(sk2_ai_SSE1,rinv_SSE1);
1565 prod_SSE0 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE0);
1566 prod_SSE1 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE1);
1568 logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0,lij_inv_SSE0));
1569 logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1,lij_inv_SSE1));
1570 t1_SSE0 = _mm_sub_pd(lij_SSE0,uij_SSE0);
1571 t1_SSE1 = _mm_sub_pd(lij_SSE1,uij_SSE1);
1572 t2_SSE0 = _mm_mul_pd(diff2_SSE0,
1573 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE0),
1574 prod_SSE0));
1575 t2_SSE1 = _mm_mul_pd(diff2_SSE1,
1576 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE1),
1577 prod_SSE1));
1578 t3_SSE0 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE0,logterm_SSE0));
1579 t3_SSE1 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE1,logterm_SSE1));
1580 t1_SSE0 = _mm_add_pd(t1_SSE0,_mm_add_pd(t2_SSE0,t3_SSE0));
1581 t1_SSE1 = _mm_add_pd(t1_SSE1,_mm_add_pd(t2_SSE1,t3_SSE1));
1582 t4_SSE0 = _mm_mul_pd(two_SSE,_mm_sub_pd(raj_inv_SSE,lij_SSE0));
1583 t4_SSE1 = _mm_mul_pd(two_SSE,_mm_sub_pd(raj_inv_SSE,lij_SSE1));
1584 t4_SSE0 = _mm_and_pd(t4_SSE0,obc_mask3_SSE0);
1585 t4_SSE1 = _mm_and_pd(t4_SSE1,obc_mask3_SSE1);
1586 t1_SSE0 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE0,t4_SSE0));
1587 t1_SSE1 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE1,t4_SSE1));
1589 _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
1590 _mm_add_pd(_mm_and_pd(t1_SSE0,obc_mask1_SSE0),
1591 _mm_and_pd(t1_SSE1,obc_mask1_SSE1))));
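            /* The store above adds the symmetric ai->aj descreening term into
             * the shared work array; these per-j contributions are expected to
             * be folded back and combined with the per-i sums after the loops,
             * as in the Still kernel above.
             */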
1593 t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE0),
1594 _mm_mul_pd(prod_SSE0,lij3_SSE0));
1595 t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE1),
1596 _mm_mul_pd(prod_SSE1,lij3_SSE1));
1597 t1_SSE0 = _mm_sub_pd(t1_SSE0,
1598 _mm_mul_pd(onefourth_SSE,
1599 _mm_add_pd(_mm_mul_pd(lij_SSE0,rinv_SSE0),
1600 _mm_mul_pd(lij3_SSE0,dr_SSE0))));
1601 t1_SSE1 = _mm_sub_pd(t1_SSE1,
1602 _mm_mul_pd(onefourth_SSE,
1603 _mm_add_pd(_mm_mul_pd(lij_SSE1,rinv_SSE1),
1604 _mm_mul_pd(lij3_SSE1,dr_SSE1))));
1605 t2_SSE0 = _mm_mul_pd(onefourth_SSE,
1606 _mm_add_pd(_mm_mul_pd(uij_SSE0,rinv_SSE0),
1607 _mm_mul_pd(uij3_SSE0,dr_SSE0)));
1608 t2_SSE1 = _mm_mul_pd(onefourth_SSE,
1609 _mm_add_pd(_mm_mul_pd(uij_SSE1,rinv_SSE1),
1610 _mm_mul_pd(uij3_SSE1,dr_SSE1)));
1611 t2_SSE0 = _mm_sub_pd(t2_SSE0,
1612 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE0),
1613 _mm_mul_pd(prod_SSE0,uij3_SSE0)));
1614 t2_SSE1 = _mm_sub_pd(t2_SSE1,
1615 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE1),
1616 _mm_mul_pd(prod_SSE1,uij3_SSE1)));
1618 t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE0),
1619 _mm_mul_pd(rinv_SSE0,rinv_SSE0));
1620 t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE1),
1621 _mm_mul_pd(rinv_SSE1,rinv_SSE1));
1623 t3_SSE0 = _mm_sub_pd(t3_SSE0,
1624 _mm_mul_pd(_mm_mul_pd(diff2_SSE0,oneeighth_SSE),
1625 _mm_add_pd(one_SSE,
1626 _mm_mul_pd(sk2_rinv_SSE0,rinv_SSE0))));
1627 t3_SSE1 = _mm_sub_pd(t3_SSE1,
1628 _mm_mul_pd(_mm_mul_pd(diff2_SSE1,oneeighth_SSE),
1629 _mm_add_pd(one_SSE,
1630 _mm_mul_pd(sk2_rinv_SSE1,rinv_SSE1))));
1633 t1_SSE0 = _mm_mul_pd(rinv_SSE0,
1634 _mm_add_pd(_mm_mul_pd(dlij_SSE0,t1_SSE0),
1635 _mm_add_pd(t2_SSE0,t3_SSE0)));
1636 t1_SSE1 = _mm_mul_pd(rinv_SSE1,
1637 _mm_add_pd(_mm_mul_pd(dlij_SSE1,t1_SSE1),
1638 _mm_add_pd(t2_SSE1,t3_SSE1)));
1640 _mm_store_pd(dadx,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
1641 dadx += 2;
1642 _mm_store_pd(dadx,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
1643 dadx += 2;
1646 /* Main part, no exclusions */
1647 for(j=nj1; j<nj2; j+=UNROLLJ)
1649 /* load j atom coordinates */
1650 jx_SSE = _mm_load_pd(x_align+j);
1651 jy_SSE = _mm_load_pd(y_align+j);
1652 jz_SSE = _mm_load_pd(z_align+j);
1654 /* Calculate distance */
1655 dx_SSE0 = _mm_sub_pd(ix_SSE0,jx_SSE);
1656 dy_SSE0 = _mm_sub_pd(iy_SSE0,jy_SSE);
1657 dz_SSE0 = _mm_sub_pd(iz_SSE0,jz_SSE);
1658 dx_SSE1 = _mm_sub_pd(ix_SSE1,jx_SSE);
1659 dy_SSE1 = _mm_sub_pd(iy_SSE1,jy_SSE);
1660 dz_SSE1 = _mm_sub_pd(iz_SSE1,jz_SSE);
1662 /* rsq = dx*dx+dy*dy+dz*dz */
1663 rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0,dy_SSE0,dz_SSE0);
1664 rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1,dy_SSE1,dz_SSE1);
1666 /* Calculate 1/r and 1/r2 */
1667 rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
1668 rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
1670 /* Apply mask */
1671 rinv_SSE0 = _mm_and_pd(rinv_SSE0,imask_SSE0);
1672 rinv_SSE1 = _mm_and_pd(rinv_SSE1,imask_SSE1);
1674 dr_SSE0 = _mm_mul_pd(rsq_SSE0,rinv_SSE0);
1675 dr_SSE1 = _mm_mul_pd(rsq_SSE1,rinv_SSE1);
1677 sk_aj_SSE = _mm_load_pd(obc_param+j);
1678 raj_SSE = _mm_load_pd(gb_radius+j);
1680 raj_inv_SSE = gmx_mm_inv_pd(raj_SSE);
1682 /* Evaluate influence of atom aj -> ai */
1683 t1_SSE0 = _mm_add_pd(dr_SSE0,sk_aj_SSE);
1684 t1_SSE1 = _mm_add_pd(dr_SSE1,sk_aj_SSE);
1685 t2_SSE0 = _mm_sub_pd(dr_SSE0,sk_aj_SSE);
1686 t2_SSE1 = _mm_sub_pd(dr_SSE1,sk_aj_SSE);
1687 t3_SSE0 = _mm_sub_pd(sk_aj_SSE,dr_SSE0);
1688 t3_SSE1 = _mm_sub_pd(sk_aj_SSE,dr_SSE1);
1690 obc_mask1_SSE0 = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
1691 obc_mask1_SSE1 = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
1692 obc_mask2_SSE0 = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
1693 obc_mask2_SSE1 = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
1694 obc_mask3_SSE0 = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
1695 obc_mask3_SSE1 = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
1696 obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0,imask_SSE0);
1697 obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1,imask_SSE1);
1699 uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
1700 uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
1701 lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0,gmx_mm_inv_pd(t2_SSE0)),
1702 _mm_andnot_pd(obc_mask2_SSE0,rai_inv_SSE0));
1703 lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1,gmx_mm_inv_pd(t2_SSE1)),
1704 _mm_andnot_pd(obc_mask2_SSE1,rai_inv_SSE1));
1705 dlij_SSE0 = _mm_and_pd(one_SSE,obc_mask2_SSE0);
1706 dlij_SSE1 = _mm_and_pd(one_SSE,obc_mask2_SSE1);
1708 uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
1709 uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
1710 uij3_SSE0 = _mm_mul_pd(uij2_SSE0,uij_SSE0);
1711 uij3_SSE1 = _mm_mul_pd(uij2_SSE1,uij_SSE1);
1712 lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
1713 lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
1714 lij3_SSE0 = _mm_mul_pd(lij2_SSE0,lij_SSE0);
1715 lij3_SSE1 = _mm_mul_pd(lij2_SSE1,lij_SSE1);
1717 diff2_SSE0 = _mm_sub_pd(uij2_SSE0,lij2_SSE0);
1718 diff2_SSE1 = _mm_sub_pd(uij2_SSE1,lij2_SSE1);
1719 lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
1720 lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
1721 sk2_aj_SSE = _mm_mul_pd(sk_aj_SSE,sk_aj_SSE);
1722 sk2_rinv_SSE0 = _mm_mul_pd(sk2_aj_SSE,rinv_SSE0);
1723 sk2_rinv_SSE1 = _mm_mul_pd(sk2_aj_SSE,rinv_SSE1);
1724 prod_SSE0 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE0);
1725 prod_SSE1 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE1);
1727 logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0,lij_inv_SSE0));
1728 logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1,lij_inv_SSE1));
1730 t1_SSE0 = _mm_sub_pd(lij_SSE0,uij_SSE0);
1731 t1_SSE1 = _mm_sub_pd(lij_SSE1,uij_SSE1);
1732 t2_SSE0 = _mm_mul_pd(diff2_SSE0,
1733 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE0),
1734 prod_SSE0));
1735 t2_SSE1 = _mm_mul_pd(diff2_SSE1,
1736 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE1),
1737 prod_SSE1));
1739 t3_SSE0 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE0,logterm_SSE0));
1740 t3_SSE1 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE1,logterm_SSE1));
1741 t1_SSE0 = _mm_add_pd(t1_SSE0,_mm_add_pd(t2_SSE0,t3_SSE0));
1742 t1_SSE1 = _mm_add_pd(t1_SSE1,_mm_add_pd(t2_SSE1,t3_SSE1));
1743 t4_SSE0 = _mm_mul_pd(two_SSE,_mm_sub_pd(rai_inv_SSE0,lij_SSE0));
1744 t4_SSE1 = _mm_mul_pd(two_SSE,_mm_sub_pd(rai_inv_SSE1,lij_SSE1));
1745 t4_SSE0 = _mm_and_pd(t4_SSE0,obc_mask3_SSE0);
1746 t4_SSE1 = _mm_and_pd(t4_SSE1,obc_mask3_SSE1);
1747 t1_SSE0 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE0,t4_SSE0));
1748 t1_SSE1 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE1,t4_SSE1));
1750 sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
1751 sum_ai_SSE1 = _mm_add_pd(sum_ai_SSE1,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
1753 t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE0),
1754 _mm_mul_pd(prod_SSE0,lij3_SSE0));
1755 t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE1),
1756 _mm_mul_pd(prod_SSE1,lij3_SSE1));
1758 t1_SSE0 = _mm_sub_pd(t1_SSE0,
1759 _mm_mul_pd(onefourth_SSE,
1760 _mm_add_pd(_mm_mul_pd(lij_SSE0,rinv_SSE0),
1761 _mm_mul_pd(lij3_SSE0,dr_SSE0))));
1762 t1_SSE1 = _mm_sub_pd(t1_SSE1,
1763 _mm_mul_pd(onefourth_SSE,
1764 _mm_add_pd(_mm_mul_pd(lij_SSE1,rinv_SSE1),
1765 _mm_mul_pd(lij3_SSE1,dr_SSE1))));
1767 t2_SSE0 = _mm_mul_pd(onefourth_SSE,
1768 _mm_add_pd(_mm_mul_pd(uij_SSE0,rinv_SSE0),
1769 _mm_mul_pd(uij3_SSE0,dr_SSE0)));
1770 t2_SSE1 = _mm_mul_pd(onefourth_SSE,
1771 _mm_add_pd(_mm_mul_pd(uij_SSE1,rinv_SSE1),
1772 _mm_mul_pd(uij3_SSE1,dr_SSE1)));
1773 t2_SSE0 = _mm_sub_pd(t2_SSE0,
1774 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE0),
1775 _mm_mul_pd(prod_SSE0,uij3_SSE0)));
1776 t2_SSE1 = _mm_sub_pd(t2_SSE1,
1777 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE1),
1778 _mm_mul_pd(prod_SSE1,uij3_SSE1)));
1779 t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE0),
1780 _mm_mul_pd(rinv_SSE0,rinv_SSE0));
1781 t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE1),
1782 _mm_mul_pd(rinv_SSE1,rinv_SSE1));
1783 t3_SSE0 = _mm_sub_pd(t3_SSE0,
1784 _mm_mul_pd(_mm_mul_pd(diff2_SSE0,oneeighth_SSE),
1785 _mm_add_pd(one_SSE,
1786 _mm_mul_pd(sk2_rinv_SSE0,rinv_SSE0))));
1787 t3_SSE1 = _mm_sub_pd(t3_SSE1,
1788 _mm_mul_pd(_mm_mul_pd(diff2_SSE1,oneeighth_SSE),
1789 _mm_add_pd(one_SSE,
1790 _mm_mul_pd(sk2_rinv_SSE1,rinv_SSE1))));
1792 t1_SSE0 = _mm_mul_pd(rinv_SSE0,
1793 _mm_add_pd(_mm_mul_pd(dlij_SSE0,t1_SSE0),
1794 _mm_add_pd(t2_SSE0,t3_SSE0)));
1795 t1_SSE1 = _mm_mul_pd(rinv_SSE1,
1796 _mm_add_pd(_mm_mul_pd(dlij_SSE1,t1_SSE1),
1797 _mm_add_pd(t2_SSE1,t3_SSE1)));
1799 _mm_store_pd(dadx,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
1800 dadx += 2;
1801 _mm_store_pd(dadx,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
1802 dadx += 2;
1804 /* Evaluate influence of atom ai -> aj */
1805 t1_SSE0 = _mm_add_pd(dr_SSE0,sk_ai_SSE0);
1806 t1_SSE1 = _mm_add_pd(dr_SSE1,sk_ai_SSE1);
1807 t2_SSE0 = _mm_sub_pd(dr_SSE0,sk_ai_SSE0);
1808 t2_SSE1 = _mm_sub_pd(dr_SSE1,sk_ai_SSE1);
1809 t3_SSE0 = _mm_sub_pd(sk_ai_SSE0,dr_SSE0);
1810 t3_SSE1 = _mm_sub_pd(sk_ai_SSE1,dr_SSE1);
1812 obc_mask1_SSE0 = _mm_cmplt_pd(raj_SSE, t1_SSE0);
1813 obc_mask1_SSE1 = _mm_cmplt_pd(raj_SSE, t1_SSE1);
1814 obc_mask2_SSE0 = _mm_cmplt_pd(raj_SSE, t2_SSE0);
1815 obc_mask2_SSE1 = _mm_cmplt_pd(raj_SSE, t2_SSE1);
1816 obc_mask3_SSE0 = _mm_cmplt_pd(raj_SSE, t3_SSE0);
1817 obc_mask3_SSE1 = _mm_cmplt_pd(raj_SSE, t3_SSE1);
1818 obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0,imask_SSE0);
1819 obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1,imask_SSE1);
1821 uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
1822 uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
1823 lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0,gmx_mm_inv_pd(t2_SSE0)),
1824 _mm_andnot_pd(obc_mask2_SSE0,raj_inv_SSE));
1825 lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1,gmx_mm_inv_pd(t2_SSE1)),
1826 _mm_andnot_pd(obc_mask2_SSE1,raj_inv_SSE));
1827 dlij_SSE0 = _mm_and_pd(one_SSE,obc_mask2_SSE0);
1828 dlij_SSE1 = _mm_and_pd(one_SSE,obc_mask2_SSE1);
1830 uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
1831 uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
1832 uij3_SSE0 = _mm_mul_pd(uij2_SSE0,uij_SSE0);
1833 uij3_SSE1 = _mm_mul_pd(uij2_SSE1,uij_SSE1);
1834 lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
1835 lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
1836 lij3_SSE0 = _mm_mul_pd(lij2_SSE0,lij_SSE0);
1837 lij3_SSE1 = _mm_mul_pd(lij2_SSE1,lij_SSE1);
1839 diff2_SSE0 = _mm_sub_pd(uij2_SSE0,lij2_SSE0);
1840 diff2_SSE1 = _mm_sub_pd(uij2_SSE1,lij2_SSE1);
1841 lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
1842 lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
1843 sk2_rinv_SSE0 = _mm_mul_pd(sk2_ai_SSE0,rinv_SSE0);
1844 sk2_rinv_SSE1 = _mm_mul_pd(sk2_ai_SSE1,rinv_SSE1);
1845 prod_SSE0 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE0);
1846 prod_SSE1 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE1);
1848 logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0,lij_inv_SSE0));
1849 logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1,lij_inv_SSE1));
1850 t1_SSE0 = _mm_sub_pd(lij_SSE0,uij_SSE0);
1851 t1_SSE1 = _mm_sub_pd(lij_SSE1,uij_SSE1);
1852 t2_SSE0 = _mm_mul_pd(diff2_SSE0,
1853 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE0),
1854 prod_SSE0));
1855 t2_SSE1 = _mm_mul_pd(diff2_SSE1,
1856 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE1),
1857 prod_SSE1));
1858 t3_SSE0 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE0,logterm_SSE0));
1859 t3_SSE1 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE1,logterm_SSE1));
1860 t1_SSE0 = _mm_add_pd(t1_SSE0,_mm_add_pd(t2_SSE0,t3_SSE0));
1861 t1_SSE1 = _mm_add_pd(t1_SSE1,_mm_add_pd(t2_SSE1,t3_SSE1));
1862 t4_SSE0 = _mm_mul_pd(two_SSE,_mm_sub_pd(raj_inv_SSE,lij_SSE0));
1863 t4_SSE1 = _mm_mul_pd(two_SSE,_mm_sub_pd(raj_inv_SSE,lij_SSE1));
1864 t4_SSE0 = _mm_and_pd(t4_SSE0,obc_mask3_SSE0);
1865 t4_SSE1 = _mm_and_pd(t4_SSE1,obc_mask3_SSE1);
1866 t1_SSE0 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE0,t4_SSE0));
1867 t1_SSE1 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE1,t4_SSE1));
1869 _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
1870 _mm_add_pd(_mm_and_pd(t1_SSE0,obc_mask1_SSE0),
1871 _mm_and_pd(t1_SSE1,obc_mask1_SSE1))));
1873 t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE0),
1874 _mm_mul_pd(prod_SSE0,lij3_SSE0));
1875 t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE1),
1876 _mm_mul_pd(prod_SSE1,lij3_SSE1));
1877 t1_SSE0 = _mm_sub_pd(t1_SSE0,
1878 _mm_mul_pd(onefourth_SSE,
1879 _mm_add_pd(_mm_mul_pd(lij_SSE0,rinv_SSE0),
1880 _mm_mul_pd(lij3_SSE0,dr_SSE0))));
1881 t1_SSE1 = _mm_sub_pd(t1_SSE1,
1882 _mm_mul_pd(onefourth_SSE,
1883 _mm_add_pd(_mm_mul_pd(lij_SSE1,rinv_SSE1),
1884 _mm_mul_pd(lij3_SSE1,dr_SSE1))));
1885 t2_SSE0 = _mm_mul_pd(onefourth_SSE,
1886 _mm_add_pd(_mm_mul_pd(uij_SSE0,rinv_SSE0),
1887 _mm_mul_pd(uij3_SSE0,dr_SSE0)));
1888 t2_SSE1 = _mm_mul_pd(onefourth_SSE,
1889 _mm_add_pd(_mm_mul_pd(uij_SSE1,rinv_SSE1),
1890 _mm_mul_pd(uij3_SSE1,dr_SSE1)));
1891 t2_SSE0 = _mm_sub_pd(t2_SSE0,
1892 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE0),
1893 _mm_mul_pd(prod_SSE0,uij3_SSE0)));
1894 t2_SSE1 = _mm_sub_pd(t2_SSE1,
1895 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE1),
1896 _mm_mul_pd(prod_SSE1,uij3_SSE1)));
1898 t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE0),
1899 _mm_mul_pd(rinv_SSE0,rinv_SSE0));
1900 t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE1),
1901 _mm_mul_pd(rinv_SSE1,rinv_SSE1));
1903 t3_SSE0 = _mm_sub_pd(t3_SSE0,
1904 _mm_mul_pd(_mm_mul_pd(diff2_SSE0,oneeighth_SSE),
1905 _mm_add_pd(one_SSE,
1906 _mm_mul_pd(sk2_rinv_SSE0,rinv_SSE0))));
1907 t3_SSE1 = _mm_sub_pd(t3_SSE1,
1908 _mm_mul_pd(_mm_mul_pd(diff2_SSE1,oneeighth_SSE),
1909 _mm_add_pd(one_SSE,
1910 _mm_mul_pd(sk2_rinv_SSE1,rinv_SSE1))));
1912 t1_SSE0 = _mm_mul_pd(rinv_SSE0,
1913 _mm_add_pd(_mm_mul_pd(dlij_SSE0,t1_SSE0),
1914 _mm_add_pd(t2_SSE0,t3_SSE0)));
1915 t1_SSE1 = _mm_mul_pd(rinv_SSE1,
1916 _mm_add_pd(_mm_mul_pd(dlij_SSE1,t1_SSE1),
1917 _mm_add_pd(t2_SSE1,t3_SSE1)));
1919 _mm_store_pd(dadx,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
1920 dadx += 2;
1921 _mm_store_pd(dadx,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
1922 dadx += 2;
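/* Storage layout note: for every i-pair/j-pair iteration the kernel appends four
 * 2-double entries to dadx: first d(sum_ai)/dr for the two i atoms (aj->ai
 * direction), then d(sum_aj)/dr for the two i atoms (ai->aj direction). The
 * chain-rule kernel further down streams dadx back in exactly this order.
 */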
1925 /* Epilogue part, including exclusion mask */
1926 for(j=nj2; j<nj3; j+=UNROLLJ)
1928 jmask_SSE0 = _mm_load_pd((double *)emask0);
1929 jmask_SSE1 = _mm_load_pd((double *)emask1);
1930 emask0 += 2*UNROLLJ;
1931 emask1 += 2*UNROLLJ;
1933 /* load j atom coordinates */
1934 jx_SSE = _mm_load_pd(x_align+j);
1935 jy_SSE = _mm_load_pd(y_align+j);
1936 jz_SSE = _mm_load_pd(z_align+j);
1938 /* Calculate distance */
1939 dx_SSE0 = _mm_sub_pd(ix_SSE0,jx_SSE);
1940 dy_SSE0 = _mm_sub_pd(iy_SSE0,jy_SSE);
1941 dz_SSE0 = _mm_sub_pd(iz_SSE0,jz_SSE);
1942 dx_SSE1 = _mm_sub_pd(ix_SSE1,jx_SSE);
1943 dy_SSE1 = _mm_sub_pd(iy_SSE1,jy_SSE);
1944 dz_SSE1 = _mm_sub_pd(iz_SSE1,jz_SSE);
1946 /* rsq = dx*dx+dy*dy+dz*dz */
1947 rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0,dy_SSE0,dz_SSE0);
1948 rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1,dy_SSE1,dz_SSE1);
1950 /* Combine masks */
1951 jmask_SSE0 = _mm_and_pd(jmask_SSE0,imask_SSE0);
1952 jmask_SSE1 = _mm_and_pd(jmask_SSE1,imask_SSE1);
1954 /* Calculate 1/r */
1955 rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
1956 rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
1958 /* Apply mask */
1959 rinv_SSE0 = _mm_and_pd(rinv_SSE0,jmask_SSE0);
1960 rinv_SSE1 = _mm_and_pd(rinv_SSE1,jmask_SSE1);
1962 dr_SSE0 = _mm_mul_pd(rsq_SSE0,rinv_SSE0);
1963 dr_SSE1 = _mm_mul_pd(rsq_SSE1,rinv_SSE1);
1965 sk_aj_SSE = _mm_load_pd(obc_param+j);
1966 raj_SSE = _mm_load_pd(gb_radius+j);
1968 raj_inv_SSE = gmx_mm_inv_pd(raj_SSE);
1970 /* Evaluate influence of atom aj -> ai */
1971 t1_SSE0 = _mm_add_pd(dr_SSE0,sk_aj_SSE);
1972 t1_SSE1 = _mm_add_pd(dr_SSE1,sk_aj_SSE);
1973 t2_SSE0 = _mm_sub_pd(dr_SSE0,sk_aj_SSE);
1974 t2_SSE1 = _mm_sub_pd(dr_SSE1,sk_aj_SSE);
1975 t3_SSE0 = _mm_sub_pd(sk_aj_SSE,dr_SSE0);
1976 t3_SSE1 = _mm_sub_pd(sk_aj_SSE,dr_SSE1);
1978 obc_mask1_SSE0 = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
1979 obc_mask1_SSE1 = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
1980 obc_mask2_SSE0 = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
1981 obc_mask2_SSE1 = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
1982 obc_mask3_SSE0 = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
1983 obc_mask3_SSE1 = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
1984 obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0,jmask_SSE0);
1985 obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1,jmask_SSE1);
1987 uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
1988 uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
1989 lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0,gmx_mm_inv_pd(t2_SSE0)),
1990 _mm_andnot_pd(obc_mask2_SSE0,rai_inv_SSE0));
1991 lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1,gmx_mm_inv_pd(t2_SSE1)),
1992 _mm_andnot_pd(obc_mask2_SSE1,rai_inv_SSE1));
1994 dlij_SSE0 = _mm_and_pd(one_SSE,obc_mask2_SSE0);
1995 dlij_SSE1 = _mm_and_pd(one_SSE,obc_mask2_SSE1);
1997 uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
1998 uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
1999 uij3_SSE0 = _mm_mul_pd(uij2_SSE0,uij_SSE0);
2000 uij3_SSE1 = _mm_mul_pd(uij2_SSE1,uij_SSE1);
2001 lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
2002 lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
2003 lij3_SSE0 = _mm_mul_pd(lij2_SSE0,lij_SSE0);
2004 lij3_SSE1 = _mm_mul_pd(lij2_SSE1,lij_SSE1);
2006 diff2_SSE0 = _mm_sub_pd(uij2_SSE0,lij2_SSE0);
2007 diff2_SSE1 = _mm_sub_pd(uij2_SSE1,lij2_SSE1);
2008 lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
2009 lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
2010 sk2_aj_SSE = _mm_mul_pd(sk_aj_SSE,sk_aj_SSE);
2011 sk2_rinv_SSE0 = _mm_mul_pd(sk2_aj_SSE,rinv_SSE0);
2012 sk2_rinv_SSE1 = _mm_mul_pd(sk2_aj_SSE,rinv_SSE1);
2013 prod_SSE0 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE0);
2014 prod_SSE1 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE1);
2016 logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0,lij_inv_SSE0));
2017 logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1,lij_inv_SSE1));
2019 t1_SSE0 = _mm_sub_pd(lij_SSE0,uij_SSE0);
2020 t1_SSE1 = _mm_sub_pd(lij_SSE1,uij_SSE1);
2021 t2_SSE0 = _mm_mul_pd(diff2_SSE0,
2022 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE0),
2023 prod_SSE0));
2024 t2_SSE1 = _mm_mul_pd(diff2_SSE1,
2025 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE1),
2026 prod_SSE1));
2028 t3_SSE0 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE0,logterm_SSE0));
2029 t3_SSE1 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE1,logterm_SSE1));
2030 t1_SSE0 = _mm_add_pd(t1_SSE0,_mm_add_pd(t2_SSE0,t3_SSE0));
2031 t1_SSE1 = _mm_add_pd(t1_SSE1,_mm_add_pd(t2_SSE1,t3_SSE1));
2032 t4_SSE0 = _mm_mul_pd(two_SSE,_mm_sub_pd(rai_inv_SSE0,lij_SSE0));
2033 t4_SSE1 = _mm_mul_pd(two_SSE,_mm_sub_pd(rai_inv_SSE1,lij_SSE1));
2034 t4_SSE0 = _mm_and_pd(t4_SSE0,obc_mask3_SSE0);
2035 t4_SSE1 = _mm_and_pd(t4_SSE1,obc_mask3_SSE1);
2036 t1_SSE0 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE0,t4_SSE0));
2037 t1_SSE1 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE1,t4_SSE1));
2039 sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
2040 sum_ai_SSE1 = _mm_add_pd(sum_ai_SSE1,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
2042 t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE0),
2043 _mm_mul_pd(prod_SSE0,lij3_SSE0));
2044 t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE1),
2045 _mm_mul_pd(prod_SSE1,lij3_SSE1));
2046 t1_SSE0 = _mm_sub_pd(t1_SSE0,
2047 _mm_mul_pd(onefourth_SSE,
2048 _mm_add_pd(_mm_mul_pd(lij_SSE0,rinv_SSE0),
2049 _mm_mul_pd(lij3_SSE0,dr_SSE0))));
2050 t1_SSE1 = _mm_sub_pd(t1_SSE1,
2051 _mm_mul_pd(onefourth_SSE,
2052 _mm_add_pd(_mm_mul_pd(lij_SSE1,rinv_SSE1),
2053 _mm_mul_pd(lij3_SSE1,dr_SSE1))));
2055 t2_SSE0 = _mm_mul_pd(onefourth_SSE,
2056 _mm_add_pd(_mm_mul_pd(uij_SSE0,rinv_SSE0),
2057 _mm_mul_pd(uij3_SSE0,dr_SSE0)));
2058 t2_SSE1 = _mm_mul_pd(onefourth_SSE,
2059 _mm_add_pd(_mm_mul_pd(uij_SSE1,rinv_SSE1),
2060 _mm_mul_pd(uij3_SSE1,dr_SSE1)));
2061 t2_SSE0 = _mm_sub_pd(t2_SSE0,
2062 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE0),
2063 _mm_mul_pd(prod_SSE0,uij3_SSE0)));
2064 t2_SSE1 = _mm_sub_pd(t2_SSE1,
2065 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE1),
2066 _mm_mul_pd(prod_SSE1,uij3_SSE1)));
2067 t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE0),
2068 _mm_mul_pd(rinv_SSE0,rinv_SSE0));
2069 t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE1),
2070 _mm_mul_pd(rinv_SSE1,rinv_SSE1));
2071 t3_SSE0 = _mm_sub_pd(t3_SSE0,
2072 _mm_mul_pd(_mm_mul_pd(diff2_SSE0,oneeighth_SSE),
2073 _mm_add_pd(one_SSE,
2074 _mm_mul_pd(sk2_rinv_SSE0,rinv_SSE0))));
2075 t3_SSE1 = _mm_sub_pd(t3_SSE1,
2076 _mm_mul_pd(_mm_mul_pd(diff2_SSE1,oneeighth_SSE),
2077 _mm_add_pd(one_SSE,
2078 _mm_mul_pd(sk2_rinv_SSE1,rinv_SSE1))));
2080 t1_SSE0 = _mm_mul_pd(rinv_SSE0,
2081 _mm_add_pd(_mm_mul_pd(dlij_SSE0,t1_SSE0),
2082 _mm_add_pd(t2_SSE0,t3_SSE0)));
2083 t1_SSE1 = _mm_mul_pd(rinv_SSE1,
2084 _mm_add_pd(_mm_mul_pd(dlij_SSE1,t1_SSE1),
2085 _mm_add_pd(t2_SSE1,t3_SSE1)));
2087 _mm_store_pd(dadx,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
2088 dadx += 2;
2089 _mm_store_pd(dadx,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
2090 dadx += 2;
2092 /* Evaluate influence of atom ai -> aj */
2093 t1_SSE0 = _mm_add_pd(dr_SSE0,sk_ai_SSE0);
2094 t1_SSE1 = _mm_add_pd(dr_SSE1,sk_ai_SSE1);
2095 t2_SSE0 = _mm_sub_pd(dr_SSE0,sk_ai_SSE0);
2096 t2_SSE1 = _mm_sub_pd(dr_SSE1,sk_ai_SSE1);
2097 t3_SSE0 = _mm_sub_pd(sk_ai_SSE0,dr_SSE0);
2098 t3_SSE1 = _mm_sub_pd(sk_ai_SSE1,dr_SSE1);
2100 obc_mask1_SSE0 = _mm_cmplt_pd(raj_SSE, t1_SSE0);
2101 obc_mask1_SSE1 = _mm_cmplt_pd(raj_SSE, t1_SSE1);
2102 obc_mask2_SSE0 = _mm_cmplt_pd(raj_SSE, t2_SSE0);
2103 obc_mask2_SSE1 = _mm_cmplt_pd(raj_SSE, t2_SSE1);
2104 obc_mask3_SSE0 = _mm_cmplt_pd(raj_SSE, t3_SSE0);
2105 obc_mask3_SSE1 = _mm_cmplt_pd(raj_SSE, t3_SSE1);
2106 obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0,jmask_SSE0);
2107 obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1,jmask_SSE1);
2109 uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
2110 uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
2111 lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0,gmx_mm_inv_pd(t2_SSE0)),
2112 _mm_andnot_pd(obc_mask2_SSE0,raj_inv_SSE));
2113 lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1,gmx_mm_inv_pd(t2_SSE1)),
2114 _mm_andnot_pd(obc_mask2_SSE1,raj_inv_SSE));
2116 dlij_SSE0 = _mm_and_pd(one_SSE,obc_mask2_SSE0);
2117 dlij_SSE1 = _mm_and_pd(one_SSE,obc_mask2_SSE1);
2119 uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
2120 uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
2121 uij3_SSE0 = _mm_mul_pd(uij2_SSE0,uij_SSE0);
2122 uij3_SSE1 = _mm_mul_pd(uij2_SSE1,uij_SSE1);
2123 lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
2124 lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
2125 lij3_SSE0 = _mm_mul_pd(lij2_SSE0,lij_SSE0);
2126 lij3_SSE1 = _mm_mul_pd(lij2_SSE1,lij_SSE1);
2128 diff2_SSE0 = _mm_sub_pd(uij2_SSE0,lij2_SSE0);
2129 diff2_SSE1 = _mm_sub_pd(uij2_SSE1,lij2_SSE1);
2130 lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
2131 lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
2132 sk2_rinv_SSE0 = _mm_mul_pd(sk2_ai_SSE0,rinv_SSE0);
2133 sk2_rinv_SSE1 = _mm_mul_pd(sk2_ai_SSE1,rinv_SSE1);
2134 prod_SSE0 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE0);
2135 prod_SSE1 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE1);
2137 logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0,lij_inv_SSE0));
2138 logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1,lij_inv_SSE1));
2139 t1_SSE0 = _mm_sub_pd(lij_SSE0,uij_SSE0);
2140 t1_SSE1 = _mm_sub_pd(lij_SSE1,uij_SSE1);
2141 t2_SSE0 = _mm_mul_pd(diff2_SSE0,
2142 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE0),
2143 prod_SSE0));
2144 t2_SSE1 = _mm_mul_pd(diff2_SSE1,
2145 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE1),
2146 prod_SSE1));
2147 t3_SSE0 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE0,logterm_SSE0));
2148 t3_SSE1 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE1,logterm_SSE1));
2149 t1_SSE0 = _mm_add_pd(t1_SSE0,_mm_add_pd(t2_SSE0,t3_SSE0));
2150 t1_SSE1 = _mm_add_pd(t1_SSE1,_mm_add_pd(t2_SSE1,t3_SSE1));
2151 t4_SSE0 = _mm_mul_pd(two_SSE,_mm_sub_pd(raj_inv_SSE,lij_SSE0));
2152 t4_SSE1 = _mm_mul_pd(two_SSE,_mm_sub_pd(raj_inv_SSE,lij_SSE1));
2153 t4_SSE0 = _mm_and_pd(t4_SSE0,obc_mask3_SSE0);
2154 t4_SSE1 = _mm_and_pd(t4_SSE1,obc_mask3_SSE1);
2155 t1_SSE0 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE0,t4_SSE0));
2156 t1_SSE1 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE1,t4_SSE1));
2158 _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
2159 _mm_add_pd(_mm_and_pd(t1_SSE0,obc_mask1_SSE0),
2160 _mm_and_pd(t1_SSE1,obc_mask1_SSE1))));
2162 t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE0),
2163 _mm_mul_pd(prod_SSE0,lij3_SSE0));
2164 t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE1),
2165 _mm_mul_pd(prod_SSE1,lij3_SSE1));
2167 t1_SSE0 = _mm_sub_pd(t1_SSE0,
2168 _mm_mul_pd(onefourth_SSE,
2169 _mm_add_pd(_mm_mul_pd(lij_SSE0,rinv_SSE0),
2170 _mm_mul_pd(lij3_SSE0,dr_SSE0))));
2171 t1_SSE1 = _mm_sub_pd(t1_SSE1,
2172 _mm_mul_pd(onefourth_SSE,
2173 _mm_add_pd(_mm_mul_pd(lij_SSE1,rinv_SSE1),
2174 _mm_mul_pd(lij3_SSE1,dr_SSE1))));
2175 t2_SSE0 = _mm_mul_pd(onefourth_SSE,
2176 _mm_add_pd(_mm_mul_pd(uij_SSE0,rinv_SSE0),
2177 _mm_mul_pd(uij3_SSE0,dr_SSE0)));
2178 t2_SSE1 = _mm_mul_pd(onefourth_SSE,
2179 _mm_add_pd(_mm_mul_pd(uij_SSE1,rinv_SSE1),
2180 _mm_mul_pd(uij3_SSE1,dr_SSE1)));
2181 t2_SSE0 = _mm_sub_pd(t2_SSE0,
2182 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE0),
2183 _mm_mul_pd(prod_SSE0,uij3_SSE0)));
2184 t2_SSE1 = _mm_sub_pd(t2_SSE1,
2185 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE1),
2186 _mm_mul_pd(prod_SSE1,uij3_SSE1)));
2188 t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE0),
2189 _mm_mul_pd(rinv_SSE0,rinv_SSE0));
2190 t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE1),
2191 _mm_mul_pd(rinv_SSE1,rinv_SSE1));
2193 t3_SSE0 = _mm_sub_pd(t3_SSE0,
2194 _mm_mul_pd(_mm_mul_pd(diff2_SSE0,oneeighth_SSE),
2195 _mm_add_pd(one_SSE,
2196 _mm_mul_pd(sk2_rinv_SSE0,rinv_SSE0))));
2197 t3_SSE1 = _mm_sub_pd(t3_SSE1,
2198 _mm_mul_pd(_mm_mul_pd(diff2_SSE1,oneeighth_SSE),
2199 _mm_add_pd(one_SSE,
2200 _mm_mul_pd(sk2_rinv_SSE1,rinv_SSE1))));
2202 t1_SSE0 = _mm_mul_pd(rinv_SSE0,
2203 _mm_add_pd(_mm_mul_pd(dlij_SSE0,t1_SSE0),
2204 _mm_add_pd(t2_SSE0,t3_SSE0)));
2205 t1_SSE1 = _mm_mul_pd(rinv_SSE1,
2206 _mm_add_pd(_mm_mul_pd(dlij_SSE1,t1_SSE1),
2207 _mm_add_pd(t2_SSE1,t3_SSE1)));
2209 _mm_store_pd(dadx,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
2210 dadx += 2;
2211 _mm_store_pd(dadx,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
2212 dadx += 2;
2214 GMX_MM_TRANSPOSE2_PD(sum_ai_SSE0,sum_ai_SSE1);
2215 sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0,sum_ai_SSE1);
2216 _mm_store_pd(work+i, _mm_add_pd(sum_ai_SSE0, _mm_load_pd(work+i)));
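/* At this point work[] holds the accumulated descreening sums: entries in
 * [0,natoms) were stored for the i atoms directly, while contributions to the
 * duplicated upper copy of the j atoms landed in [natoms,2*natoms). The two
 * halves are folded together right below.
 */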
2220 for(i=0;i<natoms/2+1;i++)
2222 work[i] += work[natoms+i];
2225 /* Parallel summations */
2227 if(PARTDECOMP(cr))
2229 gmx_sum(natoms,work, cr);
2232 if(gb_algorithm==egbHCT)
2234 /* HCT */
2235 for(i=0;i<natoms;i++)
2237 if(born->use[i] != 0)
2239 rai = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset;
2240 sum_ai = 1.0/rai - work[i];
2241 min_rad = rai + born->gb_doffset;
2242 rad = 1.0/sum_ai;
2244 born->bRad[i] = rad > min_rad ? rad : min_rad;
2245 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
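/* HCT effective Born radius in scalar form:
 *   1/b_i = 1/(rho_i - gb_doffset) - sum_i
 * clamped from below at rho_i (min_rad) to keep the radius physical;
 * invsqrta caches 1/sqrt(b_i) for the GB interaction kernels.
 */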
2250 else
2252 /* OBC */
2254 /* Calculate the radii */
2255 for(i=0;i<natoms;i++)
2258 if(born->use[i] != 0)
2260 rai = top->atomtypes.gb_radius[mdatoms->typeA[i]];
2261 rai_inv2 = 1.0/rai;
2262 rai = rai-born->gb_doffset;
2263 rai_inv = 1.0/rai;
2264 sum_ai = rai * work[i];
2265 sum_ai2 = sum_ai * sum_ai;
2266 sum_ai3 = sum_ai2 * sum_ai;
2268 tsum = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
2269 born->bRad[i] = rai_inv - tsum*rai_inv2;
2270 born->bRad[i] = 1.0 / born->bRad[i];
2272 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
2274 tchain = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
2275 born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
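/* OBC(II) effective Born radius in scalar form, with psi = rho~_i * sum_i and
 * rho~_i = rho_i - gb_doffset:
 *   1/b_i = 1/rho~_i - tanh(alpha*psi - beta*psi^2 + gamma*psi^3) / rho_i
 * drobc[i] holds the tanh chain-rule factor
 *   (1 - tanh^2) * rho~_i*(alpha - 2*beta*psi + 3*gamma*psi^2) / rho_i,
 * i.e. d(b_i)/d(sum_i) divided by b_i^2, which the chain-rule kernel below
 * folds into rb[].
 */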
2280 return 0;
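/* The unrolled SSE2 blocks above all evaluate the same HCT/OBC pair descreening
 * integral. As documentation only, here is a minimal scalar sketch of that term
 * for one (distance, screening parameter, radius) triple, using the kernel's
 * variable names. This helper is a hypothetical reference, not called anywhere
 * and not part of the GROMACS API; the authoritative scalar routines live in
 * genborn.c.
 */
static double
gb_pair_descreen_ref(double dr, double sk, double rai)
{
    double uij,lij,diff2,term;

    if (rai >= dr+sk)
    {
        /* Atom lies outside the descreening sphere; no contribution
         * (the kernels handle this with obc_mask1 instead of a branch).
         */
        return 0.0;
    }

    uij   = 1.0/(dr+sk);
    /* Lower bound: 1/(dr-sk) when fully outside, otherwise fall back to 1/rai */
    lij   = (rai < dr-sk) ? 1.0/(dr-sk) : 1.0/rai;
    diff2 = uij*uij - lij*lij;

    term  = lij - uij
          + 0.25*diff2*(dr - sk*sk/dr)
          + 0.5*log(uij/lij)/dr;

    if (rai < sk-dr)
    {
        /* Complete-overlap correction, cf. the obc_mask3 branch in the kernels */
        term += 2.0*(1.0/rai - lij);
    }

    return 0.5*term;
}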
2290 int
2291 genborn_allvsall_calc_chainrule_sse2_double(t_forcerec * fr,
2292 t_mdatoms * mdatoms,
2293 gmx_genborn_t * born,
2294 double * x,
2295 double * f,
2296 int gb_algorithm,
2297 void * paadata)
2299 gmx_allvsallgb2_data_t *aadata;
2300 int natoms;
2301 int ni0,ni1;
2302 int nj0,nj1,nj2,nj3;
2303 int i,j,k,n;
2304 int idx;
2305 int * mask;
2306 int * pmask0;
2307 int * emask0;
2308 int * jindex;
2310 double ix,iy,iz;
2311 double fix,fiy,fiz;
2312 double jx,jy,jz;
2313 double dx,dy,dz;
2314 double tx,ty,tz;
2315 double rbai,rbaj,fgb,fgb_ai,rbi;
2316 double * rb;
2317 double * dadx;
2318 double * x_align;
2319 double * y_align;
2320 double * z_align;
2321 double * fx_align;
2322 double * fy_align;
2323 double * fz_align;
2324 double tmpsum[2];
2326 __m128d jmask_SSE0,jmask_SSE1;
2327 __m128d ix_SSE0,iy_SSE0,iz_SSE0;
2328 __m128d ix_SSE1,iy_SSE1,iz_SSE1;
2329 __m128d fix_SSE0,fiy_SSE0,fiz_SSE0;
2330 __m128d fix_SSE1,fiy_SSE1,fiz_SSE1;
2331 __m128d rbai_SSE0,rbai_SSE1;
2332 __m128d imask_SSE0,imask_SSE1;
2333 __m128d jx_SSE,jy_SSE,jz_SSE,rbaj_SSE;
2334 __m128d dx_SSE0,dy_SSE0,dz_SSE0;
2335 __m128d dx_SSE1,dy_SSE1,dz_SSE1;
2336 __m128d fgb_SSE0,fgb_ai_SSE0;
2337 __m128d fgb_SSE1,fgb_ai_SSE1;
2338 __m128d tx_SSE0,ty_SSE0,tz_SSE0;
2339 __m128d tx_SSE1,ty_SSE1,tz_SSE1;
2340 __m128d t1,t2,tmpSSE;
2342 natoms = mdatoms->nr;
2343 ni0 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
2344 ni1 = mdatoms->start+mdatoms->homenr;
2346 aadata = (gmx_allvsallgb2_data_t *)paadata;
2348 x_align = aadata->x_align;
2349 y_align = aadata->y_align;
2350 z_align = aadata->z_align;
2351 fx_align = aadata->fx_align;
2352 fy_align = aadata->fy_align;
2353 fz_align = aadata->fz_align;
2355 jindex = aadata->jindex_gb;
2356 dadx = fr->dadx;
2358 n = 0;
2359 rb = aadata->work;
2361 /* Build the algorithm-specific Born-radius chain-rule term rb[i] from dvda */
2362 if(gb_algorithm==egbSTILL)
2364 for(i=0;i<natoms;i++)
2366 rbi = born->bRad[i];
2367 rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
2370 else if(gb_algorithm==egbHCT)
2372 for(i=0;i<natoms;i++)
2374 rbi = born->bRad[i];
2375 rb[i] = rbi * rbi * fr->dvda[i];
2378 else if(gb_algorithm==egbOBC)
2380 for(idx=0;idx<natoms;idx++)
2382 rbi = born->bRad[idx];
2383 rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
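/* rb[i] combines dvda[i] (dE/db_i accumulated by the GB interaction kernels)
 * with an algorithm-specific conversion factor: 2*b_i^2/ONE_4PI_EPS0 for Still,
 * b_i^2 for HCT and b_i^2*drobc[i] for OBC. Multiplying rb by the d(sum)/dr
 * values stored in dadx then gives the force prefactor fgb used below.
 */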
2387 for(i=0;i<2*natoms;i++)
2389 fx_align[i] = 0;
2390 fy_align[i] = 0;
2391 fz_align[i] = 0;
2395 for(i=0;i<natoms;i++)
2397 rb[i+natoms]=rb[i];
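/* The aligned coordinate/force buffers and rb are laid out over 2*natoms entries
 * so the unrolled j loop can run past natoms without wrapping indices; the two
 * halves are folded back together at the end of this routine.
 */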
2400 for(i=ni0; i<ni1; i+=UNROLLI)
2402 /* We assume shifts are NOT used for all-vs-all interactions */
2404 /* Load i atom data */
2405 ix_SSE0 = _mm_load1_pd(x_align+i);
2406 iy_SSE0 = _mm_load1_pd(y_align+i);
2407 iz_SSE0 = _mm_load1_pd(z_align+i);
2408 ix_SSE1 = _mm_load1_pd(x_align+i+1);
2409 iy_SSE1 = _mm_load1_pd(y_align+i+1);
2410 iz_SSE1 = _mm_load1_pd(z_align+i+1);
2412 fix_SSE0 = _mm_setzero_pd();
2413 fiy_SSE0 = _mm_setzero_pd();
2414 fiz_SSE0 = _mm_setzero_pd();
2415 fix_SSE1 = _mm_setzero_pd();
2416 fiy_SSE1 = _mm_setzero_pd();
2417 fiz_SSE1 = _mm_setzero_pd();
2419 rbai_SSE0 = _mm_load1_pd(rb+i);
2420 rbai_SSE1 = _mm_load1_pd(rb+i+1);
2422 /* Load limits for loop over neighbors */
2423 nj0 = jindex[4*i];
2424 nj3 = jindex[4*i+3];
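/* jindex_gb stores four boundaries per i pair: entries [4*i]..[4*i+3] delimit
 * the prologue (exclusion-masked), main and epilogue j ranges set up by
 * setup_gb_exclusions_and_indices(). Only nj0 and nj3 are needed here, since
 * masked-out pairs already have zero dadx entries.
 */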
2426 /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */
2427 for(j=nj0; j<nj3; j+=UNROLLJ)
2429 /* load j atom coordinates */
2430 jx_SSE = _mm_load_pd(x_align+j);
2431 jy_SSE = _mm_load_pd(y_align+j);
2432 jz_SSE = _mm_load_pd(z_align+j);
2434 /* Calculate distance */
2435 dx_SSE0 = _mm_sub_pd(ix_SSE0,jx_SSE);
2436 dy_SSE0 = _mm_sub_pd(iy_SSE0,jy_SSE);
2437 dz_SSE0 = _mm_sub_pd(iz_SSE0,jz_SSE);
2438 dx_SSE1 = _mm_sub_pd(ix_SSE1,jx_SSE);
2439 dy_SSE1 = _mm_sub_pd(iy_SSE1,jy_SSE);
2440 dz_SSE1 = _mm_sub_pd(iz_SSE1,jz_SSE);
2442 rbaj_SSE = _mm_load_pd(rb+j);
2444 fgb_SSE0 = _mm_mul_pd(rbai_SSE0,_mm_load_pd(dadx));
2445 dadx += 2;
2446 fgb_SSE1 = _mm_mul_pd(rbai_SSE1,_mm_load_pd(dadx));
2447 dadx += 2;
2449 fgb_ai_SSE0 = _mm_mul_pd(rbaj_SSE,_mm_load_pd(dadx));
2450 dadx += 2;
2451 fgb_ai_SSE1 = _mm_mul_pd(rbaj_SSE,_mm_load_pd(dadx));
2452 dadx += 2;
2454 /* Total force between ai and aj is the sum of ai->aj and aj->ai */
2455 fgb_SSE0 = _mm_add_pd(fgb_SSE0,fgb_ai_SSE0);
2456 fgb_SSE1 = _mm_add_pd(fgb_SSE1,fgb_ai_SSE1);
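/* Scalar equivalent of the force prefactor for one pair:
 *   fgb = rb_i * d(sum_i)/dr + rb_j * d(sum_j)/dr
 * which multiplies the distance vector (dx,dy,dz) below; no extra 1/r is needed
 * because the dadx values stored by the radii kernel already include it.
 */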
2458 /* Calculate temporary vectorial force */
2459 tx_SSE0 = _mm_mul_pd(fgb_SSE0,dx_SSE0);
2460 ty_SSE0 = _mm_mul_pd(fgb_SSE0,dy_SSE0);
2461 tz_SSE0 = _mm_mul_pd(fgb_SSE0,dz_SSE0);
2462 tx_SSE1 = _mm_mul_pd(fgb_SSE1,dx_SSE1);
2463 ty_SSE1 = _mm_mul_pd(fgb_SSE1,dy_SSE1);
2464 tz_SSE1 = _mm_mul_pd(fgb_SSE1,dz_SSE1);
2466 /* Increment i atom force */
2467 fix_SSE0 = _mm_add_pd(fix_SSE0,tx_SSE0);
2468 fiy_SSE0 = _mm_add_pd(fiy_SSE0,ty_SSE0);
2469 fiz_SSE0 = _mm_add_pd(fiz_SSE0,tz_SSE0);
2470 fix_SSE1 = _mm_add_pd(fix_SSE1,tx_SSE1);
2471 fiy_SSE1 = _mm_add_pd(fiy_SSE1,ty_SSE1);
2472 fiz_SSE1 = _mm_add_pd(fiz_SSE1,tz_SSE1);
2474 /* Decrement j atom force */
2475 _mm_store_pd(fx_align+j,
2476 _mm_sub_pd( _mm_load_pd(fx_align+j) , _mm_add_pd(tx_SSE0,tx_SSE1) ));
2477 _mm_store_pd(fy_align+j,
2478 _mm_sub_pd( _mm_load_pd(fy_align+j) , _mm_add_pd(ty_SSE0,ty_SSE1) ));
2479 _mm_store_pd(fz_align+j,
2480 _mm_sub_pd( _mm_load_pd(fz_align+j) , _mm_add_pd(tz_SSE0,tz_SSE1) ));
2483 /* Add i forces to mem */
2484 GMX_MM_TRANSPOSE2_PD(fix_SSE0,fix_SSE1);
2485 fix_SSE0 = _mm_add_pd(fix_SSE0,fix_SSE1);
2486 _mm_store_pd(fx_align+i, _mm_add_pd(fix_SSE0, _mm_load_pd(fx_align+i)));
2488 GMX_MM_TRANSPOSE2_PD(fiy_SSE0,fiy_SSE1);
2489 fiy_SSE0 = _mm_add_pd(fiy_SSE0,fiy_SSE1);
2490 _mm_store_pd(fy_align+i, _mm_add_pd(fiy_SSE0, _mm_load_pd(fy_align+i)));
2492 GMX_MM_TRANSPOSE2_PD(fiz_SSE0,fiz_SSE1);
2493 fiz_SSE0 = _mm_add_pd(fiz_SSE0,fiz_SSE1);
2494 _mm_store_pd(fz_align+i, _mm_add_pd(fiz_SSE0, _mm_load_pd(fz_align+i)));
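/* Fold the duplicated upper halves of the aligned force buffers back into the
 * force array f[] below.
 */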
2497 for(i=0;i<natoms;i++)
2499 f[3*i] += fx_align[i] + fx_align[natoms+i];
2500 f[3*i+1] += fy_align[i] + fy_align[natoms+i];
2501 f[3*i+2] += fz_align[i] + fz_align[natoms+i];
2504 return 0;
2507 #else
2508 /* Dummy variable to avoid an empty translation unit when the SSE2 double-precision kernels are not compiled */
2509 int genborn_allvsall_sse2_double_dummy;
2512 #endif