/* src/mdlib/genborn_allvsall_sse2_single.c */
/*
 * This source code is part of
 *
 *                 G R O M A C S
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 * As a special exception, you may use this file as part of a free software
 * library without restriction. Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <math.h>
#include "types/simple.h"

#include "vec.h"
#include "smalloc.h"

#include "partdec.h"
#include "network.h"
#include "physics.h"
#include "genborn.h"
#include "genborn_allvsall.h"

#if ( (defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_SSE2)) && !defined(GMX_DOUBLE) )

#include <gmx_sse2_single.h>

#define SIMD_WIDTH 4
#define UNROLLI    4
#define UNROLLJ    4
typedef struct
{
    int *   jindex_gb;
    int **  prologue_mask_gb;
    int **  epilogue_mask;
    int *   imask;
    real *  gb_radius;
    real *  workparam;
    real *  work;
    real *  x_align;
    real *  y_align;
    real *  z_align;
    real *  fx_align;
    real *  fy_align;
    real *  fz_align;
}
gmx_allvsallgb2_data_t;
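/*
 * Field overview (editorial note, inferred from the code below): jindex_gb
 * holds four j-range boundaries per i atom (prologue start, prologue end /
 * main start, main end / epilogue start, epilogue end); prologue_mask_gb
 * and epilogue_mask hold the per-i exclusion and wrap-around masks for the
 * two masked loop parts; imask enables only the locally owned i atoms; the
 * *_align arrays are 16-byte-aligned, padded copies of coordinates, forces,
 * and per-atom parameters so the kernels can use aligned SSE loads/stores.
 */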
static int
calc_maxoffset(int i,int natoms)
{
    int maxoffset;

    if ((natoms % 2) == 1)
    {
        /* Odd number of atoms, easy */
        maxoffset = natoms/2;
    }
    else if ((natoms % 4) == 0)
    {
        /* Multiple of four is hard */
        if (i < natoms/2)
        {
            if ((i % 2) == 0)
            {
                maxoffset=natoms/2;
            }
            else
            {
                maxoffset=natoms/2-1;
            }
        }
        else
        {
            if ((i % 2) == 1)
            {
                maxoffset=natoms/2;
            }
            else
            {
                maxoffset=natoms/2-1;
            }
        }
    }
    else
    {
        /* natoms/2 = odd */
        if ((i % 2) == 0)
        {
            maxoffset=natoms/2;
        }
        else
        {
            maxoffset=natoms/2-1;
        }
    }

    return maxoffset;
}
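/* Editorial sketch (not part of the original file): a brute-force check of
 * the half-shell scheme above. Every pair (i,j) should be visited exactly
 * once when atom i interacts "forward" with the maxoffset atoms that follow
 * it, wrapping around the end of the list. The helper name is hypothetical.
 */
static int
check_maxoffset_coverage(int natoms)
{
    int i,j,k,nerr;
    int *count;

    snew(count,natoms*natoms);  /* snew() zeroes the allocation */
    for(i=0;i<natoms;i++)
    {
        for(k=1;k<=calc_maxoffset(i,natoms);k++)
        {
            j = (i+k) % natoms;
            /* Record the pair with the smaller index first */
            count[(i<j) ? i*natoms+j : j*natoms+i]++;
        }
    }
    nerr = 0;
    for(i=0;i<natoms;i++)
    {
        for(j=i+1;j<natoms;j++)
        {
            nerr += (count[i*natoms+j] != 1);
        }
    }
    sfree(count);
    return nerr;   /* 0 means each pair is covered exactly once */
}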
static void
setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t *   aadata,
                                t_ilist *                  ilist,
                                int                        start,
                                int                        end,
                                int                        natoms,
                                gmx_bool                   bInclude12,
                                gmx_bool                   bInclude13,
                                gmx_bool                   bInclude14)
{
    int i,j,k,tp;
    int a1,a2;
    int ni0,ni1,nj0,nj1,nj;
    int imin,imax,iexcl;
    int max_offset;
    int max_excl_offset;
    int firstinteraction;
    int ibase;
    int *pi;

    /* This routine can appear to be a bit complex, but it is mostly book-keeping.
     * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
     * whether they should interact or not.
     *
     * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
     * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
     * we create a jindex array with four elements per i atom: the starting point, the point up to
     * which we need to check exclusions (the prologue), the start of the epilogue, and the end point.
     * This way we only have to allocate a short exclusion mask per i atom.
     */
    ni0 = (start/UNROLLI)*UNROLLI;
    ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;

    /* Set the interaction mask to only enable the i atoms we want to include */
    snew(pi,natoms+UNROLLI+2*SIMD_WIDTH);
    aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
    for(i=0;i<natoms+UNROLLI;i++)
    {
        aadata->imask[i] = (i>=start && i<end) ? 0xFFFFFFFF : 0;
    }

    /* Allocate memory for our modified jindex array */
    snew(aadata->jindex_gb,4*(natoms+UNROLLI));
    for(i=0;i<4*(natoms+UNROLLI);i++)
    {
        aadata->jindex_gb[i] = 0;
    }

    /* Create the exclusion masks for the prologue part */
    snew(aadata->prologue_mask_gb,natoms+UNROLLI); /* list of pointers */

    /* First zero everything to avoid uninitialized data */
    for(i=0;i<natoms+UNROLLI;i++)
    {
        aadata->prologue_mask_gb[i] = NULL;
    }

    /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {
        max_excl_offset = -1;

        /* First find maxoffset for the next 4 atoms (or fewer if we are close to end) */
        imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;

        /* Which atom is the first we (might) interact with? */
        imin = natoms; /* Guaranteed to be overwritten by one of 'firstinteraction' */
        for(i=ibase;i<imax;i++)
        {
            /* Before exclusions, which atom is the first we (might) interact with? */
            firstinteraction = i+1;
            max_offset = calc_maxoffset(i,natoms);

            if(!bInclude12)
            {
                for(j=0;j<ilist[F_GB12].nr;j+=3)
                {
                    a1 = ilist[F_GB12].iatoms[j+1];
                    a2 = ilist[F_GB12].iatoms[j+2];

                    if(a1==i)
                    {
                        k = a2;
                    }
                    else if(a2==i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }
                    if(k==firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            if(!bInclude13)
            {
                for(j=0;j<ilist[F_GB13].nr;j+=3)
                {
                    a1 = ilist[F_GB13].iatoms[j+1];
                    a2 = ilist[F_GB13].iatoms[j+2];

                    if(a1==i)
                    {
                        k = a2;
                    }
                    else if(a2==i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }
                    if(k==firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            if(!bInclude14)
            {
                for(j=0;j<ilist[F_GB14].nr;j+=3)
                {
                    a1 = ilist[F_GB14].iatoms[j+1];
                    a2 = ilist[F_GB14].iatoms[j+2];
                    if(a1==i)
                    {
                        k = a2;
                    }
                    else if(a2==i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }
                    if(k==firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            imin = (firstinteraction < imin) ? firstinteraction : imin;
        }
        /* round down to j unrolling factor */
        imin = (imin/UNROLLJ)*UNROLLJ;
        for(i=ibase;i<imax;i++)
        {
            max_offset = calc_maxoffset(i,natoms);

            if(!bInclude12)
            {
                for(j=0;j<ilist[F_GB12].nr;j+=3)
                {
                    a1 = ilist[F_GB12].iatoms[j+1];
                    a2 = ilist[F_GB12].iatoms[j+2];

                    if(a1==i)
                    {
                        k = a2;
                    }
                    else if(a2==i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }
                    if(k<imin)
                    {
                        k += natoms;
                    }
                    if(k>i+max_offset)
                    {
                        continue;
                    }
                    k = k - imin;

                    if( k+natoms <= max_offset )
                    {
                        k+=natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
            if(!bInclude13)
            {
                for(j=0;j<ilist[F_GB13].nr;j+=3)
                {
                    a1 = ilist[F_GB13].iatoms[j+1];
                    a2 = ilist[F_GB13].iatoms[j+2];

                    if(a1==i)
                    {
                        k = a2;
                    }
                    else if(a2==i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }
                    if(k<imin)
                    {
                        k += natoms;
                    }
                    if(k>i+max_offset)
                    {
                        continue;
                    }
                    k = k - imin;

                    if( k+natoms <= max_offset )
                    {
                        k+=natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
            if(!bInclude14)
            {
                for(j=0;j<ilist[F_GB14].nr;j+=3)
                {
                    a1 = ilist[F_GB14].iatoms[j+1];
                    a2 = ilist[F_GB14].iatoms[j+2];

                    if(a1==i)
                    {
                        k = a2;
                    }
                    else if(a2==i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }
                    if(k<imin)
                    {
                        k += natoms;
                    }
                    if(k>i+max_offset)
                    {
                        continue;
                    }
                    k = k - imin;

                    if( k+natoms <= max_offset )
                    {
                        k+=natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
        }

        /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */
        max_excl_offset++;
        /* round up to j unrolling factor */
        max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;

        /* Set all the prologue masks length to this value (even for i>end) */
        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            aadata->jindex_gb[4*i]   = imin;
            aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
        }
    }
    /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {
        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            nj   = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
            imin = aadata->jindex_gb[4*i];

            /* Allocate aligned memory */
            snew(pi,nj+2*SIMD_WIDTH);
            aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i,natoms);

            /* Include interactions i+1 <= j < i+maxoffset */
            for(k=0;k<nj;k++)
            {
                j = imin + k;

                if( (j>i) && (j<=i+max_offset) )
                {
                    aadata->prologue_mask_gb[i][k] = 0xFFFFFFFF;
                }
                else
                {
                    aadata->prologue_mask_gb[i][k] = 0;
                }
            }

            /* Clear out the explicit exclusions */
            if(i<end)
            {
                if(!bInclude12)
                {
                    for(j=0;j<ilist[F_GB12].nr;j+=3)
                    {
                        a1 = ilist[F_GB12].iatoms[j+1];
                        a2 = ilist[F_GB12].iatoms[j+2];

                        if(a1==i)
                        {
                            k = a2;
                        }
                        else if(a2==i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }
                        if(k>i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;

                        if( k+natoms <= max_offset )
                        {
                            k+=natoms;
                        }

                        k = k+i-imin;
                        if(k>=0)
                        {
                            aadata->prologue_mask_gb[i][k] = 0;
                        }
                    }
                }
                if(!bInclude13)
                {
                    for(j=0;j<ilist[F_GB13].nr;j+=3)
                    {
                        a1 = ilist[F_GB13].iatoms[j+1];
                        a2 = ilist[F_GB13].iatoms[j+2];

                        if(a1==i)
                        {
                            k = a2;
                        }
                        else if(a2==i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }
                        if(k>i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;

                        if( k+natoms <= max_offset )
                        {
                            k+=natoms;
                        }

                        k = k+i-imin;
                        if(k>=0)
                        {
                            aadata->prologue_mask_gb[i][k] = 0;
                        }
                    }
                }
                if(!bInclude14)
                {
                    for(j=0;j<ilist[F_GB14].nr;j+=3)
                    {
                        a1 = ilist[F_GB14].iatoms[j+1];
                        a2 = ilist[F_GB14].iatoms[j+2];

                        if(a1==i)
                        {
                            k = a2;
                        }
                        else if(a2==i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }
                        if(k>i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;

                        if( k+natoms <= max_offset )
                        {
                            k+=natoms;
                        }

                        k = k+i-imin;
                        if(k>=0)
                        {
                            aadata->prologue_mask_gb[i][k] = 0;
                        }
                    }
                }
            }
        }
    }
    /* Construct the epilogue mask - this just contains the check for maxoffset */
    snew(aadata->epilogue_mask,natoms+UNROLLI);

    /* First zero everything to avoid uninitialized data */
    for(i=0;i<natoms+UNROLLI;i++)
    {
        aadata->jindex_gb[4*i+2] = aadata->jindex_gb[4*i+1];
        aadata->jindex_gb[4*i+3] = aadata->jindex_gb[4*i+1];
        aadata->epilogue_mask[i] = NULL;
    }

    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {
        /* Find the lowest index for which we need to use the epilogue */
        imin = ibase;
        max_offset = calc_maxoffset(imin,natoms);

        imin = imin + 1 + max_offset;

        /* Find the largest index for which we need to use the epilogue */
        imax = ibase + UNROLLI-1;
        imax = (imax < end) ? imax : end;

        max_offset = calc_maxoffset(imax,natoms);
        imax = imax + 1 + max_offset + UNROLLJ - 1;

        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            /* Start of epilogue - round down to j tile limit */
            aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
            /* Make sure we don't overlap - for small systems everything is done in the prologue */
            aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
            /* Round upwards to j tile limit */
            aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
            /* Make sure we don't have a negative range for the epilogue */
            aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
        }
    }
    /* And fill it with data... */

    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {
        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];

            /* Allocate aligned memory */
            snew(pi,nj+2*SIMD_WIDTH);
            aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i,natoms);

            for(k=0;k<nj;k++)
            {
                j = aadata->jindex_gb[4*i+2] + k;
                aadata->epilogue_mask[i][k] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
            }
        }
    }
}
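/* Editorial sketch (not part of the original file): how a kernel consumes the
 * four jindex_gb entries per i atom that the routine above has just set up.
 * Only the prologue and epilogue need per-j masks; the main part streams
 * through with no masking at all, which is what makes the kernel fast.
 */
static void
sketch_jindex_traversal(const gmx_allvsallgb2_data_t *aadata, int i)
{
    int j;
    int nj0 = aadata->jindex_gb[4*i];     /* prologue start            */
    int nj1 = aadata->jindex_gb[4*i+1];   /* prologue end / main start */
    int nj2 = aadata->jindex_gb[4*i+2];   /* main end / epilogue start */
    int nj3 = aadata->jindex_gb[4*i+3];   /* epilogue end              */

    for(j=nj0;j<nj1;j++)
    {
        /* prologue: apply prologue_mask_gb[i][j-nj0] (exclusions)     */
    }
    for(j=nj1;j<nj2;j++)
    {
        /* main part: every interaction enabled, no mask needed        */
    }
    for(j=nj2;j<nj3;j++)
    {
        /* epilogue: apply epilogue_mask[i][j-nj2] (maxoffset check)   */
    }
}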
static void
genborn_allvsall_setup(gmx_allvsallgb2_data_t **  p_aadata,
                       gmx_localtop_t *           top,
                       gmx_genborn_t *            born,
                       t_mdatoms *                mdatoms,
                       real                       radius_offset,
                       int                        gb_algorithm,
                       gmx_bool                   bInclude12,
                       gmx_bool                   bInclude13,
                       gmx_bool                   bInclude14)
{
    int i,j,idx;
    int natoms;
    gmx_allvsallgb2_data_t *aadata;
    real *p;

    natoms = mdatoms->nr;

    snew(aadata,1);
    *p_aadata = aadata;

    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->x_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->y_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->z_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->fx_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->fy_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->fz_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->gb_radius = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->workparam = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->work = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    for(i=0;i<mdatoms->nr;i++)
    {
        aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
        if(gb_algorithm==egbSTILL)
        {
            aadata->workparam[i] = born->vsolv[i];
        }
        else if(gb_algorithm==egbOBC)
        {
            aadata->workparam[i] = born->param[i];
        }
        aadata->work[i] = 0.0;
    }
    for(i=0;i<mdatoms->nr;i++)
    {
        aadata->gb_radius[natoms+i] = aadata->gb_radius[i];
        aadata->workparam[natoms+i] = aadata->workparam[i];
        aadata->work[natoms+i]      = aadata->work[i];
    }

    for(i=0;i<2*natoms+SIMD_WIDTH;i++)
    {
        aadata->x_align[i]  = 0.0;
        aadata->y_align[i]  = 0.0;
        aadata->z_align[i]  = 0.0;
        aadata->fx_align[i] = 0.0;
        aadata->fy_align[i] = 0.0;
        aadata->fz_align[i] = 0.0;
    }

    setup_gb_exclusions_and_indices(aadata,top->idef.il,mdatoms->start,mdatoms->start+mdatoms->homenr,mdatoms->nr,
                                    bInclude12,bInclude13,bInclude14);
}
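/* Editorial note on the allocation idiom used above: snew() makes no
 * alignment promise, so each array is over-allocated and the pointer is
 * rounded up to a 16-byte boundary, as _mm_load_ps/_mm_store_ps require.
 * A minimal sketch of the same idiom (hypothetical helper name):
 */
static real *
sketch_snew_aligned16(int n)
{
    real *p;

    snew(p,n+2*SIMD_WIDTH);   /* extra elements leave room for rounding up */
    return (real *) (((size_t) p + 16) & (~((size_t) 15)));
    /* As in the setup code above, the raw pointer is not stored, so the
     * block can never be passed to sfree(); tolerable for one-time setup. */
}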
int
genborn_allvsall_calc_still_radii_sse2_single(t_forcerec *     fr,
                                              t_mdatoms *      mdatoms,
                                              gmx_genborn_t *  born,
                                              gmx_localtop_t * top,
                                              real *           x,
                                              t_commrec *      cr,
                                              void *           paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    int    natoms;
    int    ni0,ni1;
    int    nj0,nj1,nj2,nj3;
    int    i,j,k,n;
    int *  mask;
    int *  pmask0;
    int *  pmask1;
    int *  pmask2;
    int *  pmask3;
    int *  emask0;
    int *  emask1;
    int *  emask2;
    int *  emask3;
    real   ix,iy,iz;
    real   jx,jy,jz;
    real   dx,dy,dz;
    real   rsq,rinv;
    real   gpi,rai,vai;
    real   prod_ai;
    real   irsq,idr4,idr6;
    real   raj,rvdw,ratio;
    real   vaj,ccf,dccf,theta,cosq;
    real   term,prod,icf4,icf6,gpi2,factor,sinq;
    real * gb_radius;
    real * vsolv;
    real * work;
    real   tmpsum[4];
    real * x_align;
    real * y_align;
    real * z_align;
    int *  jindex;
    real * dadx;
    __m128 ix_SSE0,iy_SSE0,iz_SSE0;
    __m128 ix_SSE1,iy_SSE1,iz_SSE1;
    __m128 ix_SSE2,iy_SSE2,iz_SSE2;
    __m128 ix_SSE3,iy_SSE3,iz_SSE3;
    __m128 gpi_SSE0,rai_SSE0,prod_ai_SSE0;
    __m128 gpi_SSE1,rai_SSE1,prod_ai_SSE1;
    __m128 gpi_SSE2,rai_SSE2,prod_ai_SSE2;
    __m128 gpi_SSE3,rai_SSE3,prod_ai_SSE3;
    __m128 imask_SSE0,jmask_SSE0;
    __m128 imask_SSE1,jmask_SSE1;
    __m128 imask_SSE2,jmask_SSE2;
    __m128 imask_SSE3,jmask_SSE3;
    __m128 jx_SSE,jy_SSE,jz_SSE;
    __m128 dx_SSE0,dy_SSE0,dz_SSE0;
    __m128 dx_SSE1,dy_SSE1,dz_SSE1;
    __m128 dx_SSE2,dy_SSE2,dz_SSE2;
    __m128 dx_SSE3,dy_SSE3,dz_SSE3;
    __m128 rsq_SSE0,rinv_SSE0,irsq_SSE0,idr4_SSE0,idr6_SSE0;
    __m128 rsq_SSE1,rinv_SSE1,irsq_SSE1,idr4_SSE1,idr6_SSE1;
    __m128 rsq_SSE2,rinv_SSE2,irsq_SSE2,idr4_SSE2,idr6_SSE2;
    __m128 rsq_SSE3,rinv_SSE3,irsq_SSE3,idr4_SSE3,idr6_SSE3;
    __m128 raj_SSE,vaj_SSE,prod_SSE;
    __m128 rvdw_SSE0,ratio_SSE0;
    __m128 rvdw_SSE1,ratio_SSE1;
    __m128 rvdw_SSE2,ratio_SSE2;
    __m128 rvdw_SSE3,ratio_SSE3;
    __m128 theta_SSE0,sinq_SSE0,cosq_SSE0,term_SSE0;
    __m128 theta_SSE1,sinq_SSE1,cosq_SSE1,term_SSE1;
    __m128 theta_SSE2,sinq_SSE2,cosq_SSE2,term_SSE2;
    __m128 theta_SSE3,sinq_SSE3,cosq_SSE3,term_SSE3;
    __m128 ccf_SSE0,dccf_SSE0;
    __m128 ccf_SSE1,dccf_SSE1;
    __m128 ccf_SSE2,dccf_SSE2;
    __m128 ccf_SSE3,dccf_SSE3;
    __m128 icf4_SSE0,icf6_SSE0;
    __m128 icf4_SSE1,icf6_SSE1;
    __m128 icf4_SSE2,icf6_SSE2;
    __m128 icf4_SSE3,icf6_SSE3;
    __m128 half_SSE,one_SSE,two_SSE,four_SSE;
    __m128 still_p4_SSE,still_p5inv_SSE,still_pip5_SSE;
    natoms = mdatoms->nr;
    ni0    = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
    ni1    = mdatoms->start+mdatoms->homenr;

    n = 0;

    aadata = *((gmx_allvsallgb2_data_t **)paadata);

    if(aadata==NULL)
    {
        genborn_allvsall_setup(&aadata,top,born,mdatoms,0.0,
                               egbSTILL,FALSE,FALSE,TRUE);
        *((gmx_allvsallgb2_data_t **)paadata) = aadata;
    }

    x_align = aadata->x_align;
    y_align = aadata->y_align;
    z_align = aadata->z_align;

    gb_radius = aadata->gb_radius;
    vsolv     = aadata->workparam;
    work      = aadata->work;
    jindex    = aadata->jindex_gb;
    dadx      = fr->dadx;

    still_p4_SSE    = _mm_set1_ps(STILL_P4);
    still_p5inv_SSE = _mm_set1_ps(STILL_P5INV);
    still_pip5_SSE  = _mm_set1_ps(STILL_PIP5);
    half_SSE        = _mm_set1_ps(0.5);
    one_SSE         = _mm_set1_ps(1.0);
    two_SSE         = _mm_set1_ps(2.0);
    four_SSE        = _mm_set1_ps(4.0);

    /* This will be summed, so it has to extend to natoms + buffer */
    for(i=0;i<natoms+1+natoms/2;i++)
    {
        work[i] = 0;
    }

    for(i=ni0;i<ni1+1+natoms/2;i++)
    {
        k = i%natoms;
        x_align[i] = x[3*k];
        y_align[i] = x[3*k+1];
        z_align[i] = x[3*k+2];
        work[i]    = 0;
    }
    for(i=ni0; i<ni1; i+=UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0 = _mm_load1_ps(x_align+i);
        iy_SSE0 = _mm_load1_ps(y_align+i);
        iz_SSE0 = _mm_load1_ps(z_align+i);
        ix_SSE1 = _mm_load1_ps(x_align+i+1);
        iy_SSE1 = _mm_load1_ps(y_align+i+1);
        iz_SSE1 = _mm_load1_ps(z_align+i+1);
        ix_SSE2 = _mm_load1_ps(x_align+i+2);
        iy_SSE2 = _mm_load1_ps(y_align+i+2);
        iz_SSE2 = _mm_load1_ps(z_align+i+2);
        ix_SSE3 = _mm_load1_ps(x_align+i+3);
        iy_SSE3 = _mm_load1_ps(y_align+i+3);
        iz_SSE3 = _mm_load1_ps(z_align+i+3);

        gpi_SSE0 = _mm_setzero_ps();
        gpi_SSE1 = _mm_setzero_ps();
        gpi_SSE2 = _mm_setzero_ps();
        gpi_SSE3 = _mm_setzero_ps();

        rai_SSE0 = _mm_load1_ps(gb_radius+i);
        rai_SSE1 = _mm_load1_ps(gb_radius+i+1);
        rai_SSE2 = _mm_load1_ps(gb_radius+i+2);
        rai_SSE3 = _mm_load1_ps(gb_radius+i+3);

        prod_ai_SSE0 = _mm_set1_ps(STILL_P4*vsolv[i]);
        prod_ai_SSE1 = _mm_set1_ps(STILL_P4*vsolv[i+1]);
        prod_ai_SSE2 = _mm_set1_ps(STILL_P4*vsolv[i+2]);
        prod_ai_SSE3 = _mm_set1_ps(STILL_P4*vsolv[i+3]);

        /* Load limits for loop over neighbors */
        nj0 = jindex[4*i];
        nj1 = jindex[4*i+1];
        nj2 = jindex[4*i+2];
        nj3 = jindex[4*i+3];

        pmask0 = aadata->prologue_mask_gb[i];
        pmask1 = aadata->prologue_mask_gb[i+1];
        pmask2 = aadata->prologue_mask_gb[i+2];
        pmask3 = aadata->prologue_mask_gb[i+3];
        emask0 = aadata->epilogue_mask[i];
        emask1 = aadata->epilogue_mask[i+1];
        emask2 = aadata->epilogue_mask[i+2];
        emask3 = aadata->epilogue_mask[i+3];

        imask_SSE0 = _mm_load1_ps((real *)(aadata->imask+i));
        imask_SSE1 = _mm_load1_ps((real *)(aadata->imask+i+1));
        imask_SSE2 = _mm_load1_ps((real *)(aadata->imask+i+2));
        imask_SSE3 = _mm_load1_ps((real *)(aadata->imask+i+3));
        /* Prologue part, including exclusion mask */
        for(j=nj0; j<nj1; j+=UNROLLJ)
        {
            jmask_SSE0 = _mm_load_ps((real *)pmask0);
            jmask_SSE1 = _mm_load_ps((real *)pmask1);
            jmask_SSE2 = _mm_load_ps((real *)pmask2);
            jmask_SSE3 = _mm_load_ps((real *)pmask3);
            pmask0 += UNROLLJ;
            pmask1 += UNROLLJ;
            pmask2 += UNROLLJ;
            pmask3 += UNROLLJ;

            /* load j atom coordinates */
            jx_SSE = _mm_load_ps(x_align+j);
            jy_SSE = _mm_load_ps(y_align+j);
            jz_SSE = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_ps(ix_SSE0,jx_SSE);
            dy_SSE0 = _mm_sub_ps(iy_SSE0,jy_SSE);
            dz_SSE0 = _mm_sub_ps(iz_SSE0,jz_SSE);
            dx_SSE1 = _mm_sub_ps(ix_SSE1,jx_SSE);
            dy_SSE1 = _mm_sub_ps(iy_SSE1,jy_SSE);
            dz_SSE1 = _mm_sub_ps(iz_SSE1,jz_SSE);
            dx_SSE2 = _mm_sub_ps(ix_SSE2,jx_SSE);
            dy_SSE2 = _mm_sub_ps(iy_SSE2,jy_SSE);
            dz_SSE2 = _mm_sub_ps(iz_SSE2,jz_SSE);
            dx_SSE3 = _mm_sub_ps(ix_SSE3,jx_SSE);
            dy_SSE3 = _mm_sub_ps(iy_SSE3,jy_SSE);
            dz_SSE3 = _mm_sub_ps(iz_SSE3,jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
            rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
            rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);

            /* Combine masks */
            jmask_SSE0 = _mm_and_ps(jmask_SSE0,imask_SSE0);
            jmask_SSE1 = _mm_and_ps(jmask_SSE1,imask_SSE1);
            jmask_SSE2 = _mm_and_ps(jmask_SSE2,imask_SSE2);
            jmask_SSE3 = _mm_and_ps(jmask_SSE3,imask_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);

            /* Apply mask */
            rinv_SSE0 = _mm_and_ps(rinv_SSE0,jmask_SSE0);
            rinv_SSE1 = _mm_and_ps(rinv_SSE1,jmask_SSE1);
            rinv_SSE2 = _mm_and_ps(rinv_SSE2,jmask_SSE2);
            rinv_SSE3 = _mm_and_ps(rinv_SSE3,jmask_SSE3);

            irsq_SSE0 = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
            irsq_SSE1 = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
            irsq_SSE2 = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
            irsq_SSE3 = _mm_mul_ps(rinv_SSE3,rinv_SSE3);
            idr4_SSE0 = _mm_mul_ps(irsq_SSE0,irsq_SSE0);
            idr4_SSE1 = _mm_mul_ps(irsq_SSE1,irsq_SSE1);
            idr4_SSE2 = _mm_mul_ps(irsq_SSE2,irsq_SSE2);
            idr4_SSE3 = _mm_mul_ps(irsq_SSE3,irsq_SSE3);
            idr6_SSE0 = _mm_mul_ps(idr4_SSE0,irsq_SSE0);
            idr6_SSE1 = _mm_mul_ps(idr4_SSE1,irsq_SSE1);
            idr6_SSE2 = _mm_mul_ps(idr4_SSE2,irsq_SSE2);
            idr6_SSE3 = _mm_mul_ps(idr4_SSE3,irsq_SSE3);

            raj_SSE = _mm_load_ps(gb_radius+j);
            vaj_SSE = _mm_load_ps(vsolv+j);

            rvdw_SSE0 = _mm_add_ps(rai_SSE0,raj_SSE);
            rvdw_SSE1 = _mm_add_ps(rai_SSE1,raj_SSE);
            rvdw_SSE2 = _mm_add_ps(rai_SSE2,raj_SSE);
            rvdw_SSE3 = _mm_add_ps(rai_SSE3,raj_SSE);

            ratio_SSE0 = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0,rvdw_SSE0)));
            ratio_SSE1 = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1,rvdw_SSE1)));
            ratio_SSE2 = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2,rvdw_SSE2)));
            ratio_SSE3 = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3,rvdw_SSE3)));

            ratio_SSE0 = _mm_min_ps(ratio_SSE0,still_p5inv_SSE);
            ratio_SSE1 = _mm_min_ps(ratio_SSE1,still_p5inv_SSE);
            ratio_SSE2 = _mm_min_ps(ratio_SSE2,still_p5inv_SSE);
            ratio_SSE3 = _mm_min_ps(ratio_SSE3,still_p5inv_SSE);
            theta_SSE0 = _mm_mul_ps(ratio_SSE0,still_pip5_SSE);
            theta_SSE1 = _mm_mul_ps(ratio_SSE1,still_pip5_SSE);
            theta_SSE2 = _mm_mul_ps(ratio_SSE2,still_pip5_SSE);
            theta_SSE3 = _mm_mul_ps(ratio_SSE3,still_pip5_SSE);
            gmx_mm_sincos_ps(theta_SSE0,&sinq_SSE0,&cosq_SSE0);
            gmx_mm_sincos_ps(theta_SSE1,&sinq_SSE1,&cosq_SSE1);
            gmx_mm_sincos_ps(theta_SSE2,&sinq_SSE2,&cosq_SSE2);
            gmx_mm_sincos_ps(theta_SSE3,&sinq_SSE3,&cosq_SSE3);
            term_SSE0 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE0));
            term_SSE1 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE1));
            term_SSE2 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE2));
            term_SSE3 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE3));
            ccf_SSE0  = _mm_mul_ps(term_SSE0,term_SSE0);
            ccf_SSE1  = _mm_mul_ps(term_SSE1,term_SSE1);
            ccf_SSE2  = _mm_mul_ps(term_SSE2,term_SSE2);
            ccf_SSE3  = _mm_mul_ps(term_SSE3,term_SSE3);
            dccf_SSE0 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE0),
                                   _mm_mul_ps(sinq_SSE0,theta_SSE0));
            dccf_SSE1 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE1),
                                   _mm_mul_ps(sinq_SSE1,theta_SSE1));
            dccf_SSE2 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE2),
                                   _mm_mul_ps(sinq_SSE2,theta_SSE2));
            dccf_SSE3 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE3),
                                   _mm_mul_ps(sinq_SSE3,theta_SSE3));

            prod_SSE  = _mm_mul_ps(still_p4_SSE,vaj_SSE);
            icf4_SSE0 = _mm_mul_ps(ccf_SSE0,idr4_SSE0);
            icf4_SSE1 = _mm_mul_ps(ccf_SSE1,idr4_SSE1);
            icf4_SSE2 = _mm_mul_ps(ccf_SSE2,idr4_SSE2);
            icf4_SSE3 = _mm_mul_ps(ccf_SSE3,idr4_SSE3);
            icf6_SSE0 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
            icf6_SSE1 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);
            icf6_SSE2 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE2),dccf_SSE2), idr6_SSE2);
            icf6_SSE3 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE3),dccf_SSE3), idr6_SSE3);

            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
                                            gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0,icf4_SSE0),
                                                           _mm_mul_ps(prod_ai_SSE1,icf4_SSE1),
                                                           _mm_mul_ps(prod_ai_SSE2,icf4_SSE2),
                                                           _mm_mul_ps(prod_ai_SSE3,icf4_SSE3))));

            gpi_SSE0 = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE,icf4_SSE0));
            gpi_SSE1 = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE,icf4_SSE1));
            gpi_SSE2 = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE,icf4_SSE2));
            gpi_SSE3 = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE,icf4_SSE3));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE0));
            dadx+=4;
            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE1));
            dadx+=4;
            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE2));
            dadx+=4;
            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE3));
            dadx+=4;

            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE0,icf6_SSE0));
            dadx+=4;
            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE1,icf6_SSE1));
            dadx+=4;
            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE2,icf6_SSE2));
            dadx+=4;
            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE3,icf6_SSE3));
            dadx+=4;
        }
        /* Main part, no exclusions */
        for(j=nj1; j<nj2; j+=UNROLLJ)
        {
            /* load j atom coordinates */
            jx_SSE = _mm_load_ps(x_align+j);
            jy_SSE = _mm_load_ps(y_align+j);
            jz_SSE = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_ps(ix_SSE0,jx_SSE);
            dy_SSE0 = _mm_sub_ps(iy_SSE0,jy_SSE);
            dz_SSE0 = _mm_sub_ps(iz_SSE0,jz_SSE);
            dx_SSE1 = _mm_sub_ps(ix_SSE1,jx_SSE);
            dy_SSE1 = _mm_sub_ps(iy_SSE1,jy_SSE);
            dz_SSE1 = _mm_sub_ps(iz_SSE1,jz_SSE);
            dx_SSE2 = _mm_sub_ps(ix_SSE2,jx_SSE);
            dy_SSE2 = _mm_sub_ps(iy_SSE2,jy_SSE);
            dz_SSE2 = _mm_sub_ps(iz_SSE2,jz_SSE);
            dx_SSE3 = _mm_sub_ps(ix_SSE3,jx_SSE);
            dy_SSE3 = _mm_sub_ps(iy_SSE3,jy_SSE);
            dz_SSE3 = _mm_sub_ps(iz_SSE3,jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
            rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
            rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);

            /* Apply mask */
            rinv_SSE0 = _mm_and_ps(rinv_SSE0,imask_SSE0);
            rinv_SSE1 = _mm_and_ps(rinv_SSE1,imask_SSE1);
            rinv_SSE2 = _mm_and_ps(rinv_SSE2,imask_SSE2);
            rinv_SSE3 = _mm_and_ps(rinv_SSE3,imask_SSE3);

            irsq_SSE0 = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
            irsq_SSE1 = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
            irsq_SSE2 = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
            irsq_SSE3 = _mm_mul_ps(rinv_SSE3,rinv_SSE3);
            idr4_SSE0 = _mm_mul_ps(irsq_SSE0,irsq_SSE0);
            idr4_SSE1 = _mm_mul_ps(irsq_SSE1,irsq_SSE1);
            idr4_SSE2 = _mm_mul_ps(irsq_SSE2,irsq_SSE2);
            idr4_SSE3 = _mm_mul_ps(irsq_SSE3,irsq_SSE3);
            idr6_SSE0 = _mm_mul_ps(idr4_SSE0,irsq_SSE0);
            idr6_SSE1 = _mm_mul_ps(idr4_SSE1,irsq_SSE1);
            idr6_SSE2 = _mm_mul_ps(idr4_SSE2,irsq_SSE2);
            idr6_SSE3 = _mm_mul_ps(idr4_SSE3,irsq_SSE3);

            raj_SSE = _mm_load_ps(gb_radius+j);

            rvdw_SSE0 = _mm_add_ps(rai_SSE0,raj_SSE);
            rvdw_SSE1 = _mm_add_ps(rai_SSE1,raj_SSE);
            rvdw_SSE2 = _mm_add_ps(rai_SSE2,raj_SSE);
            rvdw_SSE3 = _mm_add_ps(rai_SSE3,raj_SSE);
            vaj_SSE   = _mm_load_ps(vsolv+j);

            ratio_SSE0 = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0,rvdw_SSE0)));
            ratio_SSE1 = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1,rvdw_SSE1)));
            ratio_SSE2 = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2,rvdw_SSE2)));
            ratio_SSE3 = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3,rvdw_SSE3)));

            ratio_SSE0 = _mm_min_ps(ratio_SSE0,still_p5inv_SSE);
            ratio_SSE1 = _mm_min_ps(ratio_SSE1,still_p5inv_SSE);
            ratio_SSE2 = _mm_min_ps(ratio_SSE2,still_p5inv_SSE);
            ratio_SSE3 = _mm_min_ps(ratio_SSE3,still_p5inv_SSE);
            theta_SSE0 = _mm_mul_ps(ratio_SSE0,still_pip5_SSE);
            theta_SSE1 = _mm_mul_ps(ratio_SSE1,still_pip5_SSE);
            theta_SSE2 = _mm_mul_ps(ratio_SSE2,still_pip5_SSE);
            theta_SSE3 = _mm_mul_ps(ratio_SSE3,still_pip5_SSE);
            gmx_mm_sincos_ps(theta_SSE0,&sinq_SSE0,&cosq_SSE0);
            gmx_mm_sincos_ps(theta_SSE1,&sinq_SSE1,&cosq_SSE1);
            gmx_mm_sincos_ps(theta_SSE2,&sinq_SSE2,&cosq_SSE2);
            gmx_mm_sincos_ps(theta_SSE3,&sinq_SSE3,&cosq_SSE3);
            term_SSE0 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE0));
            term_SSE1 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE1));
            term_SSE2 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE2));
            term_SSE3 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE3));
            ccf_SSE0  = _mm_mul_ps(term_SSE0,term_SSE0);
            ccf_SSE1  = _mm_mul_ps(term_SSE1,term_SSE1);
            ccf_SSE2  = _mm_mul_ps(term_SSE2,term_SSE2);
            ccf_SSE3  = _mm_mul_ps(term_SSE3,term_SSE3);
            dccf_SSE0 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE0),
                                   _mm_mul_ps(sinq_SSE0,theta_SSE0));
            dccf_SSE1 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE1),
                                   _mm_mul_ps(sinq_SSE1,theta_SSE1));
            dccf_SSE2 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE2),
                                   _mm_mul_ps(sinq_SSE2,theta_SSE2));
            dccf_SSE3 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE3),
                                   _mm_mul_ps(sinq_SSE3,theta_SSE3));

            prod_SSE  = _mm_mul_ps(still_p4_SSE,vaj_SSE);
            icf4_SSE0 = _mm_mul_ps(ccf_SSE0,idr4_SSE0);
            icf4_SSE1 = _mm_mul_ps(ccf_SSE1,idr4_SSE1);
            icf4_SSE2 = _mm_mul_ps(ccf_SSE2,idr4_SSE2);
            icf4_SSE3 = _mm_mul_ps(ccf_SSE3,idr4_SSE3);
            icf6_SSE0 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
            icf6_SSE1 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);
            icf6_SSE2 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE2),dccf_SSE2), idr6_SSE2);
            icf6_SSE3 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE3),dccf_SSE3), idr6_SSE3);

            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
                                            gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0,icf4_SSE0),
                                                           _mm_mul_ps(prod_ai_SSE1,icf4_SSE1),
                                                           _mm_mul_ps(prod_ai_SSE2,icf4_SSE2),
                                                           _mm_mul_ps(prod_ai_SSE3,icf4_SSE3))));

            gpi_SSE0 = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE,icf4_SSE0));
            gpi_SSE1 = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE,icf4_SSE1));
            gpi_SSE2 = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE,icf4_SSE2));
            gpi_SSE3 = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE,icf4_SSE3));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE0));
            dadx+=4;
            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE1));
            dadx+=4;
            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE2));
            dadx+=4;
            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE3));
            dadx+=4;

            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE0,icf6_SSE0));
            dadx+=4;
            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE1,icf6_SSE1));
            dadx+=4;
            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE2,icf6_SSE2));
            dadx+=4;
            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE3,icf6_SSE3));
            dadx+=4;
        }
        /* Epilogue part, including exclusion mask */
        for(j=nj2; j<nj3; j+=UNROLLJ)
        {
            jmask_SSE0 = _mm_load_ps((real *)emask0);
            jmask_SSE1 = _mm_load_ps((real *)emask1);
            jmask_SSE2 = _mm_load_ps((real *)emask2);
            jmask_SSE3 = _mm_load_ps((real *)emask3);
            emask0 += UNROLLJ;
            emask1 += UNROLLJ;
            emask2 += UNROLLJ;
            emask3 += UNROLLJ;

            /* load j atom coordinates */
            jx_SSE = _mm_load_ps(x_align+j);
            jy_SSE = _mm_load_ps(y_align+j);
            jz_SSE = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0 = _mm_sub_ps(ix_SSE0,jx_SSE);
            dy_SSE0 = _mm_sub_ps(iy_SSE0,jy_SSE);
            dz_SSE0 = _mm_sub_ps(iz_SSE0,jz_SSE);
            dx_SSE1 = _mm_sub_ps(ix_SSE1,jx_SSE);
            dy_SSE1 = _mm_sub_ps(iy_SSE1,jy_SSE);
            dz_SSE1 = _mm_sub_ps(iz_SSE1,jz_SSE);
            dx_SSE2 = _mm_sub_ps(ix_SSE2,jx_SSE);
            dy_SSE2 = _mm_sub_ps(iy_SSE2,jy_SSE);
            dz_SSE2 = _mm_sub_ps(iz_SSE2,jz_SSE);
            dx_SSE3 = _mm_sub_ps(ix_SSE3,jx_SSE);
            dy_SSE3 = _mm_sub_ps(iy_SSE3,jy_SSE);
            dz_SSE3 = _mm_sub_ps(iz_SSE3,jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
            rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
            rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
            rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);

            /* Combine masks */
            jmask_SSE0 = _mm_and_ps(jmask_SSE0,imask_SSE0);
            jmask_SSE1 = _mm_and_ps(jmask_SSE1,imask_SSE1);
            jmask_SSE2 = _mm_and_ps(jmask_SSE2,imask_SSE2);
            jmask_SSE3 = _mm_and_ps(jmask_SSE3,imask_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);

            /* Apply mask */
            rinv_SSE0 = _mm_and_ps(rinv_SSE0,jmask_SSE0);
            rinv_SSE1 = _mm_and_ps(rinv_SSE1,jmask_SSE1);
            rinv_SSE2 = _mm_and_ps(rinv_SSE2,jmask_SSE2);
            rinv_SSE3 = _mm_and_ps(rinv_SSE3,jmask_SSE3);

            irsq_SSE0 = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
            irsq_SSE1 = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
            irsq_SSE2 = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
            irsq_SSE3 = _mm_mul_ps(rinv_SSE3,rinv_SSE3);
            idr4_SSE0 = _mm_mul_ps(irsq_SSE0,irsq_SSE0);
            idr4_SSE1 = _mm_mul_ps(irsq_SSE1,irsq_SSE1);
            idr4_SSE2 = _mm_mul_ps(irsq_SSE2,irsq_SSE2);
            idr4_SSE3 = _mm_mul_ps(irsq_SSE3,irsq_SSE3);
            idr6_SSE0 = _mm_mul_ps(idr4_SSE0,irsq_SSE0);
            idr6_SSE1 = _mm_mul_ps(idr4_SSE1,irsq_SSE1);
            idr6_SSE2 = _mm_mul_ps(idr4_SSE2,irsq_SSE2);
            idr6_SSE3 = _mm_mul_ps(idr4_SSE3,irsq_SSE3);

            raj_SSE = _mm_load_ps(gb_radius+j);
            vaj_SSE = _mm_load_ps(vsolv+j);

            rvdw_SSE0 = _mm_add_ps(rai_SSE0,raj_SSE);
            rvdw_SSE1 = _mm_add_ps(rai_SSE1,raj_SSE);
            rvdw_SSE2 = _mm_add_ps(rai_SSE2,raj_SSE);
            rvdw_SSE3 = _mm_add_ps(rai_SSE3,raj_SSE);

            ratio_SSE0 = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0,rvdw_SSE0)));
            ratio_SSE1 = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1,rvdw_SSE1)));
            ratio_SSE2 = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2,rvdw_SSE2)));
            ratio_SSE3 = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3,rvdw_SSE3)));

            ratio_SSE0 = _mm_min_ps(ratio_SSE0,still_p5inv_SSE);
            ratio_SSE1 = _mm_min_ps(ratio_SSE1,still_p5inv_SSE);
            ratio_SSE2 = _mm_min_ps(ratio_SSE2,still_p5inv_SSE);
            ratio_SSE3 = _mm_min_ps(ratio_SSE3,still_p5inv_SSE);
            theta_SSE0 = _mm_mul_ps(ratio_SSE0,still_pip5_SSE);
            theta_SSE1 = _mm_mul_ps(ratio_SSE1,still_pip5_SSE);
            theta_SSE2 = _mm_mul_ps(ratio_SSE2,still_pip5_SSE);
            theta_SSE3 = _mm_mul_ps(ratio_SSE3,still_pip5_SSE);
            gmx_mm_sincos_ps(theta_SSE0,&sinq_SSE0,&cosq_SSE0);
            gmx_mm_sincos_ps(theta_SSE1,&sinq_SSE1,&cosq_SSE1);
            gmx_mm_sincos_ps(theta_SSE2,&sinq_SSE2,&cosq_SSE2);
            gmx_mm_sincos_ps(theta_SSE3,&sinq_SSE3,&cosq_SSE3);
            term_SSE0 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE0));
            term_SSE1 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE1));
            term_SSE2 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE2));
            term_SSE3 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE3));
            ccf_SSE0  = _mm_mul_ps(term_SSE0,term_SSE0);
            ccf_SSE1  = _mm_mul_ps(term_SSE1,term_SSE1);
            ccf_SSE2  = _mm_mul_ps(term_SSE2,term_SSE2);
            ccf_SSE3  = _mm_mul_ps(term_SSE3,term_SSE3);
            dccf_SSE0 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE0),
                                   _mm_mul_ps(sinq_SSE0,theta_SSE0));
            dccf_SSE1 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE1),
                                   _mm_mul_ps(sinq_SSE1,theta_SSE1));
            dccf_SSE2 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE2),
                                   _mm_mul_ps(sinq_SSE2,theta_SSE2));
            dccf_SSE3 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE3),
                                   _mm_mul_ps(sinq_SSE3,theta_SSE3));

            prod_SSE  = _mm_mul_ps(still_p4_SSE,vaj_SSE);
            icf4_SSE0 = _mm_mul_ps(ccf_SSE0,idr4_SSE0);
            icf4_SSE1 = _mm_mul_ps(ccf_SSE1,idr4_SSE1);
            icf4_SSE2 = _mm_mul_ps(ccf_SSE2,idr4_SSE2);
            icf4_SSE3 = _mm_mul_ps(ccf_SSE3,idr4_SSE3);
            icf6_SSE0 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
            icf6_SSE1 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);
            icf6_SSE2 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE2),dccf_SSE2), idr6_SSE2);
            icf6_SSE3 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE3),dccf_SSE3), idr6_SSE3);

            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
                                            gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0,icf4_SSE0),
                                                           _mm_mul_ps(prod_ai_SSE1,icf4_SSE1),
                                                           _mm_mul_ps(prod_ai_SSE2,icf4_SSE2),
                                                           _mm_mul_ps(prod_ai_SSE3,icf4_SSE3))));

            gpi_SSE0 = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE,icf4_SSE0));
            gpi_SSE1 = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE,icf4_SSE1));
            gpi_SSE2 = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE,icf4_SSE2));
            gpi_SSE3 = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE,icf4_SSE3));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE0));
            dadx+=4;
            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE1));
            dadx+=4;
            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE2));
            dadx+=4;
            _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE3));
            dadx+=4;

            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE0,icf6_SSE0));
            dadx+=4;
            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE1,icf6_SSE1));
            dadx+=4;
            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE2,icf6_SSE2));
            dadx+=4;
            _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE3,icf6_SSE3));
            dadx+=4;
        }

        _MM_TRANSPOSE4_PS(gpi_SSE0,gpi_SSE1,gpi_SSE2,gpi_SSE3);
        gpi_SSE0 = _mm_add_ps(gpi_SSE0,gpi_SSE1);
        gpi_SSE2 = _mm_add_ps(gpi_SSE2,gpi_SSE3);
        gpi_SSE0 = _mm_add_ps(gpi_SSE0,gpi_SSE2);
        _mm_store_ps(work+i, _mm_add_ps(gpi_SSE0, _mm_load_ps(work+i)));
    }
    /* In case we have written anything beyond natoms, move it back.
     * Never mind that we leave stuff above natoms; that will not
     * be accessed later in the routine.
     * In principle this should be a move rather than a sum, but this
     * way we don't have to worry about even/odd offsets...
     */
    for(i=natoms;i<ni1+1+natoms/2;i++)
    {
        work[i-natoms] += work[i];
    }

    /* Parallel summations */
    if(PARTDECOMP(cr))
    {
        gmx_sum(natoms,work,cr);
    }

    factor = 0.5 * ONE_4PI_EPS0;
    /* Calculate the radii - should we do all atoms, or just our local ones? */
    for(i=0;i<natoms;i++)
    {
        if(born->use[i] != 0)
        {
            gpi  = born->gpol[i]+work[i];
            gpi2 = gpi * gpi;
            born->bRad[i]   = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        }
    }

    return 0;
}
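/* Editorial scalar reference (illustrative only, not the original code path):
 * the Still pairwise descreening term that the SSE loops above evaluate for
 * each pair of atoms. The derivative (chain-rule) terms stored in dadx are
 * omitted for brevity. Capping ratio at STILL_P5INV makes theta equal pi in
 * the far regime, so ccf smoothly reaches 1 there, matching _mm_min_ps above.
 */
static void
sketch_still_pair(real rsq, real rai, real raj, real vai, real vaj,
                  real *gpi, real *gpj)
{
    real rinv  = gmx_invsqrt(rsq);
    real irsq  = rinv*rinv;
    real idr4  = irsq*irsq;
    real rvdw  = rai + raj;
    real ratio = rsq/(rvdw*rvdw);
    real theta,ccf;

    /* Close-contact function: scales down the contribution smoothly
     * when the atoms are closer than the sum of their vdW radii. */
    if(ratio > STILL_P5INV)
    {
        ratio = STILL_P5INV;
    }
    theta = ratio*STILL_PIP5;
    ccf   = 0.5*(1.0 - cos(theta));
    ccf   = ccf*ccf;

    *gpi += STILL_P4*vaj*ccf*idr4;   /* influence of aj on ai */
    *gpj += STILL_P4*vai*ccf*idr4;   /* influence of ai on aj */
}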
int
genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec *     fr,
                                                t_mdatoms *      mdatoms,
                                                gmx_genborn_t *  born,
                                                int              gb_algorithm,
                                                gmx_localtop_t * top,
                                                real *           x,
                                                t_commrec *      cr,
                                                void *           paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    int    natoms;
    int    ni0,ni1;
    int    nj0,nj1,nj2,nj3;
    int    i,j,k,n;
    int *  mask;
    int *  pmask0;
    int *  pmask1;
    int *  pmask2;
    int *  pmask3;
    int *  emask0;
    int *  emask1;
    int *  emask2;
    int *  emask3;
    real * gb_radius;
    real * vsolv;
    real * work;
    real   tmpsum[4];
    real * x_align;
    real * y_align;
    real * z_align;
    int *  jindex;
    real * dadx;
    real * obc_param;
    real   rad,min_rad;
    real   rai,rai_inv,rai_inv2,sum_ai,sum_ai2,sum_ai3,tsum,tchain;
    __m128 ix_SSE0,iy_SSE0,iz_SSE0;
    __m128 ix_SSE1,iy_SSE1,iz_SSE1;
    __m128 ix_SSE2,iy_SSE2,iz_SSE2;
    __m128 ix_SSE3,iy_SSE3,iz_SSE3;
    __m128 gpi_SSE0,rai_SSE0,prod_ai_SSE0;
    __m128 gpi_SSE1,rai_SSE1,prod_ai_SSE1;
    __m128 gpi_SSE2,rai_SSE2,prod_ai_SSE2;
    __m128 gpi_SSE3,rai_SSE3,prod_ai_SSE3;
    __m128 imask_SSE0,jmask_SSE0;
    __m128 imask_SSE1,jmask_SSE1;
    __m128 imask_SSE2,jmask_SSE2;
    __m128 imask_SSE3,jmask_SSE3;
    __m128 jx_SSE,jy_SSE,jz_SSE;
    __m128 dx_SSE0,dy_SSE0,dz_SSE0;
    __m128 dx_SSE1,dy_SSE1,dz_SSE1;
    __m128 dx_SSE2,dy_SSE2,dz_SSE2;
    __m128 dx_SSE3,dy_SSE3,dz_SSE3;
    __m128 rsq_SSE0,rinv_SSE0,irsq_SSE0,idr4_SSE0,idr6_SSE0;
    __m128 rsq_SSE1,rinv_SSE1,irsq_SSE1,idr4_SSE1,idr6_SSE1;
    __m128 rsq_SSE2,rinv_SSE2,irsq_SSE2,idr4_SSE2,idr6_SSE2;
    __m128 rsq_SSE3,rinv_SSE3,irsq_SSE3,idr4_SSE3,idr6_SSE3;
    __m128 raj_SSE,raj_inv_SSE,sk_aj_SSE,sk2_aj_SSE;
    __m128 ccf_SSE0,dccf_SSE0,prod_SSE0;
    __m128 ccf_SSE1,dccf_SSE1,prod_SSE1;
    __m128 ccf_SSE2,dccf_SSE2,prod_SSE2;
    __m128 ccf_SSE3,dccf_SSE3,prod_SSE3;
    __m128 icf4_SSE0,icf6_SSE0;
    __m128 icf4_SSE1,icf6_SSE1;
    __m128 icf4_SSE2,icf6_SSE2;
    __m128 icf4_SSE3,icf6_SSE3;
    __m128 oneeighth_SSE,onefourth_SSE,half_SSE,one_SSE,two_SSE,four_SSE;
    __m128 still_p4_SSE,still_p5inv_SSE,still_pip5_SSE;
    __m128 rai_inv_SSE0;
    __m128 rai_inv_SSE1;
    __m128 rai_inv_SSE2;
    __m128 rai_inv_SSE3;
    __m128 sk_ai_SSE0,sk2_ai_SSE0,sum_ai_SSE0;
    __m128 sk_ai_SSE1,sk2_ai_SSE1,sum_ai_SSE1;
    __m128 sk_ai_SSE2,sk2_ai_SSE2,sum_ai_SSE2;
    __m128 sk_ai_SSE3,sk2_ai_SSE3,sum_ai_SSE3;
    __m128 lij_inv_SSE0,sk2_rinv_SSE0;
    __m128 lij_inv_SSE1,sk2_rinv_SSE1;
    __m128 lij_inv_SSE2,sk2_rinv_SSE2;
    __m128 lij_inv_SSE3,sk2_rinv_SSE3;
    __m128 dr_SSE0;
    __m128 dr_SSE1;
    __m128 dr_SSE2;
    __m128 dr_SSE3;
    __m128 t1_SSE0,t2_SSE0,t3_SSE0,t4_SSE0;
    __m128 t1_SSE1,t2_SSE1,t3_SSE1,t4_SSE1;
    __m128 t1_SSE2,t2_SSE2,t3_SSE2,t4_SSE2;
    __m128 t1_SSE3,t2_SSE3,t3_SSE3,t4_SSE3;
    __m128 obc_mask1_SSE0,obc_mask2_SSE0,obc_mask3_SSE0;
    __m128 obc_mask1_SSE1,obc_mask2_SSE1,obc_mask3_SSE1;
    __m128 obc_mask1_SSE2,obc_mask2_SSE2,obc_mask3_SSE2;
    __m128 obc_mask1_SSE3,obc_mask2_SSE3,obc_mask3_SSE3;
    __m128 uij_SSE0,uij2_SSE0,uij3_SSE0;
    __m128 uij_SSE1,uij2_SSE1,uij3_SSE1;
    __m128 uij_SSE2,uij2_SSE2,uij3_SSE2;
    __m128 uij_SSE3,uij2_SSE3,uij3_SSE3;
    __m128 lij_SSE0,lij2_SSE0,lij3_SSE0;
    __m128 lij_SSE1,lij2_SSE1,lij3_SSE1;
    __m128 lij_SSE2,lij2_SSE2,lij3_SSE2;
    __m128 lij_SSE3,lij2_SSE3,lij3_SSE3;
    __m128 dlij_SSE0,diff2_SSE0,logterm_SSE0;
    __m128 dlij_SSE1,diff2_SSE1,logterm_SSE1;
    __m128 dlij_SSE2,diff2_SSE2,logterm_SSE2;
    __m128 dlij_SSE3,diff2_SSE3,logterm_SSE3;
    __m128 doffset_SSE;
    natoms = mdatoms->nr;
    ni0    = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
    ni1    = mdatoms->start+mdatoms->homenr;

    n = 0;

    aadata = *((gmx_allvsallgb2_data_t **)paadata);

    if(aadata==NULL)
    {
        genborn_allvsall_setup(&aadata,top,born,mdatoms,born->gb_doffset,
                               egbOBC,TRUE,TRUE,TRUE);
        *((gmx_allvsallgb2_data_t **)paadata) = aadata;
    }

    x_align = aadata->x_align;
    y_align = aadata->y_align;
    z_align = aadata->z_align;

    gb_radius = aadata->gb_radius;
    work      = aadata->work;
    jindex    = aadata->jindex_gb;
    dadx      = fr->dadx;
    obc_param = aadata->workparam;

    oneeighth_SSE = _mm_set1_ps(0.125);
    onefourth_SSE = _mm_set1_ps(0.25);
    half_SSE      = _mm_set1_ps(0.5);
    one_SSE       = _mm_set1_ps(1.0);
    two_SSE       = _mm_set1_ps(2.0);
    four_SSE      = _mm_set1_ps(4.0);
    doffset_SSE   = _mm_set1_ps(born->gb_doffset);

    for(i=0;i<natoms;i++)
    {
        x_align[i] = x[3*i];
        y_align[i] = x[3*i+1];
        z_align[i] = x[3*i+2];
    }

    /* Copy again */
    for(i=0;i<natoms/2+1;i++)
    {
        x_align[natoms+i] = x_align[i];
        y_align[natoms+i] = y_align[i];
        z_align[natoms+i] = z_align[i];
    }

    for(i=0;i<natoms+natoms/2+1;i++)
    {
        work[i] = 0;
    }
    for(i=ni0; i<ni1; i+= UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0 = _mm_load1_ps(x_align+i);
        iy_SSE0 = _mm_load1_ps(y_align+i);
        iz_SSE0 = _mm_load1_ps(z_align+i);
        ix_SSE1 = _mm_load1_ps(x_align+i+1);
        iy_SSE1 = _mm_load1_ps(y_align+i+1);
        iz_SSE1 = _mm_load1_ps(z_align+i+1);
        ix_SSE2 = _mm_load1_ps(x_align+i+2);
        iy_SSE2 = _mm_load1_ps(y_align+i+2);
        iz_SSE2 = _mm_load1_ps(z_align+i+2);
        ix_SSE3 = _mm_load1_ps(x_align+i+3);
        iy_SSE3 = _mm_load1_ps(y_align+i+3);
        iz_SSE3 = _mm_load1_ps(z_align+i+3);

        rai_SSE0 = _mm_load1_ps(gb_radius+i);
        rai_SSE1 = _mm_load1_ps(gb_radius+i+1);
        rai_SSE2 = _mm_load1_ps(gb_radius+i+2);
        rai_SSE3 = _mm_load1_ps(gb_radius+i+3);
        rai_inv_SSE0 = gmx_mm_inv_ps(rai_SSE0);
        rai_inv_SSE1 = gmx_mm_inv_ps(rai_SSE1);
        rai_inv_SSE2 = gmx_mm_inv_ps(rai_SSE2);
        rai_inv_SSE3 = gmx_mm_inv_ps(rai_SSE3);

        sk_ai_SSE0 = _mm_load1_ps(obc_param+i);
        sk_ai_SSE1 = _mm_load1_ps(obc_param+i+1);
        sk_ai_SSE2 = _mm_load1_ps(obc_param+i+2);
        sk_ai_SSE3 = _mm_load1_ps(obc_param+i+3);
        sk2_ai_SSE0 = _mm_mul_ps(sk_ai_SSE0,sk_ai_SSE0);
        sk2_ai_SSE1 = _mm_mul_ps(sk_ai_SSE1,sk_ai_SSE1);
        sk2_ai_SSE2 = _mm_mul_ps(sk_ai_SSE2,sk_ai_SSE2);
        sk2_ai_SSE3 = _mm_mul_ps(sk_ai_SSE3,sk_ai_SSE3);

        sum_ai_SSE0 = _mm_setzero_ps();
        sum_ai_SSE1 = _mm_setzero_ps();
        sum_ai_SSE2 = _mm_setzero_ps();
        sum_ai_SSE3 = _mm_setzero_ps();

        /* Load limits for loop over neighbors */
        nj0 = jindex[4*i];
        nj1 = jindex[4*i+1];
        nj2 = jindex[4*i+2];
        nj3 = jindex[4*i+3];

        pmask0 = aadata->prologue_mask_gb[i];
        pmask1 = aadata->prologue_mask_gb[i+1];
        pmask2 = aadata->prologue_mask_gb[i+2];
        pmask3 = aadata->prologue_mask_gb[i+3];
        emask0 = aadata->epilogue_mask[i];
        emask1 = aadata->epilogue_mask[i+1];
        emask2 = aadata->epilogue_mask[i+2];
        emask3 = aadata->epilogue_mask[i+3];

        imask_SSE0 = _mm_load1_ps((real *)(aadata->imask+i));
        imask_SSE1 = _mm_load1_ps((real *)(aadata->imask+i+1));
        imask_SSE2 = _mm_load1_ps((real *)(aadata->imask+i+2));
        imask_SSE3 = _mm_load1_ps((real *)(aadata->imask+i+3));
1605 /* Prologue part, including exclusion mask */
1606 for(j=nj0; j<nj1; j+=UNROLLJ)
1608 jmask_SSE0 = _mm_load_ps((real *)pmask0);
1609 jmask_SSE1 = _mm_load_ps((real *)pmask1);
1610 jmask_SSE2 = _mm_load_ps((real *)pmask2);
1611 jmask_SSE3 = _mm_load_ps((real *)pmask3);
1612 pmask0 += UNROLLJ;
1613 pmask1 += UNROLLJ;
1614 pmask2 += UNROLLJ;
1615 pmask3 += UNROLLJ;
1617 /* load j atom coordinates */
1618 jx_SSE = _mm_load_ps(x_align+j);
1619 jy_SSE = _mm_load_ps(y_align+j);
1620 jz_SSE = _mm_load_ps(z_align+j);
1622 /* Calculate distance */
1623 dx_SSE0 = _mm_sub_ps(ix_SSE0,jx_SSE);
1624 dy_SSE0 = _mm_sub_ps(iy_SSE0,jy_SSE);
1625 dz_SSE0 = _mm_sub_ps(iz_SSE0,jz_SSE);
1626 dx_SSE1 = _mm_sub_ps(ix_SSE1,jx_SSE);
1627 dy_SSE1 = _mm_sub_ps(iy_SSE1,jy_SSE);
1628 dz_SSE1 = _mm_sub_ps(iz_SSE1,jz_SSE);
1629 dx_SSE2 = _mm_sub_ps(ix_SSE2,jx_SSE);
1630 dy_SSE2 = _mm_sub_ps(iy_SSE2,jy_SSE);
1631 dz_SSE2 = _mm_sub_ps(iz_SSE2,jz_SSE);
1632 dx_SSE3 = _mm_sub_ps(ix_SSE3,jx_SSE);
1633 dy_SSE3 = _mm_sub_ps(iy_SSE3,jy_SSE);
1634 dz_SSE3 = _mm_sub_ps(iz_SSE3,jz_SSE);
1636 /* rsq = dx*dx+dy*dy+dz*dz */
1637 rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
1638 rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
1639 rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
1640 rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
1642 /* Combine masks */
1643 jmask_SSE0 = _mm_and_ps(jmask_SSE0,imask_SSE0);
1644 jmask_SSE1 = _mm_and_ps(jmask_SSE1,imask_SSE1);
1645 jmask_SSE2 = _mm_and_ps(jmask_SSE2,imask_SSE2);
1646 jmask_SSE3 = _mm_and_ps(jmask_SSE3,imask_SSE3);
1648 /* Calculate 1/r and 1/r2 */
1649 rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
1650 rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
1651 rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
1652 rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);
1654 /* Apply mask */
1655 rinv_SSE0 = _mm_and_ps(rinv_SSE0,jmask_SSE0);
1656 rinv_SSE1 = _mm_and_ps(rinv_SSE1,jmask_SSE1);
1657 rinv_SSE2 = _mm_and_ps(rinv_SSE2,jmask_SSE2);
1658 rinv_SSE3 = _mm_and_ps(rinv_SSE3,jmask_SSE3);
1660 dr_SSE0 = _mm_mul_ps(rsq_SSE0,rinv_SSE0);
1661 dr_SSE1 = _mm_mul_ps(rsq_SSE1,rinv_SSE1);
1662 dr_SSE2 = _mm_mul_ps(rsq_SSE2,rinv_SSE2);
1663 dr_SSE3 = _mm_mul_ps(rsq_SSE3,rinv_SSE3);
1665 sk_aj_SSE = _mm_load_ps(obc_param+j);
1666 raj_SSE = _mm_load_ps(gb_radius+j);
1667 raj_inv_SSE = gmx_mm_inv_ps(raj_SSE);
1669 /* Evaluate influence of atom aj -> ai */
1670 t1_SSE0 = _mm_add_ps(dr_SSE0,sk_aj_SSE);
1671 t1_SSE1 = _mm_add_ps(dr_SSE1,sk_aj_SSE);
1672 t1_SSE2 = _mm_add_ps(dr_SSE2,sk_aj_SSE);
1673 t1_SSE3 = _mm_add_ps(dr_SSE3,sk_aj_SSE);
1674 t2_SSE0 = _mm_sub_ps(dr_SSE0,sk_aj_SSE);
1675 t2_SSE1 = _mm_sub_ps(dr_SSE1,sk_aj_SSE);
1676 t2_SSE2 = _mm_sub_ps(dr_SSE2,sk_aj_SSE);
1677 t2_SSE3 = _mm_sub_ps(dr_SSE3,sk_aj_SSE);
1678 t3_SSE0 = _mm_sub_ps(sk_aj_SSE,dr_SSE0);
1679 t3_SSE1 = _mm_sub_ps(sk_aj_SSE,dr_SSE1);
1680 t3_SSE2 = _mm_sub_ps(sk_aj_SSE,dr_SSE2);
1681 t3_SSE3 = _mm_sub_ps(sk_aj_SSE,dr_SSE3);
1683 obc_mask1_SSE0 = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
1684 obc_mask1_SSE1 = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
1685 obc_mask1_SSE2 = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
1686 obc_mask1_SSE3 = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
1687 obc_mask2_SSE0 = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
1688 obc_mask2_SSE1 = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
1689 obc_mask2_SSE2 = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
1690 obc_mask2_SSE3 = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
1691 obc_mask3_SSE0 = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
1692 obc_mask3_SSE1 = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
1693 obc_mask3_SSE2 = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
1694 obc_mask3_SSE3 = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
1695 obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0,jmask_SSE0);
1696 obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1,jmask_SSE1);
1697 obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2,jmask_SSE2);
1698 obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3,jmask_SSE3);
1700 uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
1701 uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
1702 uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
1703 uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
1704 lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
1705 _mm_andnot_ps(obc_mask2_SSE0,rai_inv_SSE0));
1706 lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
1707 _mm_andnot_ps(obc_mask2_SSE1,rai_inv_SSE1));
1708 lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
1709 _mm_andnot_ps(obc_mask2_SSE2,rai_inv_SSE2));
1710 lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
1711 _mm_andnot_ps(obc_mask2_SSE3,rai_inv_SSE3));
1712 dlij_SSE0 = _mm_and_ps(one_SSE,obc_mask2_SSE0);
1713 dlij_SSE1 = _mm_and_ps(one_SSE,obc_mask2_SSE1);
1714 dlij_SSE2 = _mm_and_ps(one_SSE,obc_mask2_SSE2);
1715 dlij_SSE3 = _mm_and_ps(one_SSE,obc_mask2_SSE3);
1717 uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
1718 uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
1719 uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
1720 uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
1721 uij3_SSE0 = _mm_mul_ps(uij2_SSE0,uij_SSE0);
1722 uij3_SSE1 = _mm_mul_ps(uij2_SSE1,uij_SSE1);
1723 uij3_SSE2 = _mm_mul_ps(uij2_SSE2,uij_SSE2);
1724 uij3_SSE3 = _mm_mul_ps(uij2_SSE3,uij_SSE3);
1725 lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
1726 lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
1727 lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
1728 lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
1729 lij3_SSE0 = _mm_mul_ps(lij2_SSE0,lij_SSE0);
1730 lij3_SSE1 = _mm_mul_ps(lij2_SSE1,lij_SSE1);
1731 lij3_SSE2 = _mm_mul_ps(lij2_SSE2,lij_SSE2);
1732 lij3_SSE3 = _mm_mul_ps(lij2_SSE3,lij_SSE3);
1734 diff2_SSE0 = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
1735 diff2_SSE1 = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
1736 diff2_SSE2 = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
1737 diff2_SSE3 = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
1738 lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
1739 lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
1740 lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
1741 lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
1742 sk2_aj_SSE = _mm_mul_ps(sk_aj_SSE,sk_aj_SSE);
1743 sk2_rinv_SSE0 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE0);
1744 sk2_rinv_SSE1 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE1);
1745 sk2_rinv_SSE2 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE2);
1746 sk2_rinv_SSE3 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE3);
1747 prod_SSE0 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
1748 prod_SSE1 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
1749 prod_SSE2 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
1750 prod_SSE3 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
1752 logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
1753 logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
1754 logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
1755 logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
1757 t1_SSE0 = _mm_sub_ps(lij_SSE0,uij_SSE0);
1758 t1_SSE1 = _mm_sub_ps(lij_SSE1,uij_SSE1);
1759 t1_SSE2 = _mm_sub_ps(lij_SSE2,uij_SSE2);
1760 t1_SSE3 = _mm_sub_ps(lij_SSE3,uij_SSE3);
1761 t2_SSE0 = _mm_mul_ps(diff2_SSE0,
1762 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
1763 prod_SSE0));
1764 t2_SSE1 = _mm_mul_ps(diff2_SSE1,
1765 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
1766 prod_SSE1));
1767 t2_SSE2 = _mm_mul_ps(diff2_SSE2,
1768 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
1769 prod_SSE2));
1770 t2_SSE3 = _mm_mul_ps(diff2_SSE3,
1771 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
1772 prod_SSE3));
1774 t3_SSE0 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
1775 t3_SSE1 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
1776 t3_SSE2 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
1777 t3_SSE3 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
1778 t1_SSE0 = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
1779 t1_SSE1 = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
1780 t1_SSE2 = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
1781 t1_SSE3 = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
1782 t4_SSE0 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE0,lij_SSE0));
1783 t4_SSE1 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE1,lij_SSE1));
1784 t4_SSE2 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE2,lij_SSE2));
1785 t4_SSE3 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE3,lij_SSE3));
1786 t4_SSE0 = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
1787 t4_SSE1 = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
1788 t4_SSE2 = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
1789 t4_SSE3 = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
1790 t1_SSE0 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
1791 t1_SSE1 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
1792 t1_SSE2 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
1793 t1_SSE3 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
1795 sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
1796 sum_ai_SSE1 = _mm_add_ps(sum_ai_SSE1,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
1797 sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
1798 sum_ai_SSE3 = _mm_add_ps(sum_ai_SSE3,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
1800 t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
1801 _mm_mul_ps(prod_SSE0,lij3_SSE0));
1802 t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
1803 _mm_mul_ps(prod_SSE1,lij3_SSE1));
1804 t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
1805 _mm_mul_ps(prod_SSE2,lij3_SSE2));
1806 t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
1807 _mm_mul_ps(prod_SSE3,lij3_SSE3));
1808 t1_SSE0 = _mm_sub_ps(t1_SSE0,
1809 _mm_mul_ps(onefourth_SSE,
1810 _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
1811 _mm_mul_ps(lij3_SSE0,dr_SSE0))));
1812 t1_SSE1 = _mm_sub_ps(t1_SSE1,
1813 _mm_mul_ps(onefourth_SSE,
1814 _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
1815 _mm_mul_ps(lij3_SSE1,dr_SSE1))));
1816 t1_SSE2 = _mm_sub_ps(t1_SSE2,
1817 _mm_mul_ps(onefourth_SSE,
1818 _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
1819 _mm_mul_ps(lij3_SSE2,dr_SSE2))));
1820 t1_SSE3 = _mm_sub_ps(t1_SSE3,
1821 _mm_mul_ps(onefourth_SSE,
1822 _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
1823 _mm_mul_ps(lij3_SSE3,dr_SSE3))));
1825 t2_SSE0 = _mm_mul_ps(onefourth_SSE,
1826 _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
1827 _mm_mul_ps(uij3_SSE0,dr_SSE0)));
1828 t2_SSE1 = _mm_mul_ps(onefourth_SSE,
1829 _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
1830 _mm_mul_ps(uij3_SSE1,dr_SSE1)));
1831 t2_SSE2 = _mm_mul_ps(onefourth_SSE,
1832 _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
1833 _mm_mul_ps(uij3_SSE2,dr_SSE2)));
1834 t2_SSE3 = _mm_mul_ps(onefourth_SSE,
1835 _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
1836 _mm_mul_ps(uij3_SSE3,dr_SSE3)));
1837 t2_SSE0 = _mm_sub_ps(t2_SSE0,
1838 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
1839 _mm_mul_ps(prod_SSE0,uij3_SSE0)));
1840 t2_SSE1 = _mm_sub_ps(t2_SSE1,
1841 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
1842 _mm_mul_ps(prod_SSE1,uij3_SSE1)));
1843 t2_SSE2 = _mm_sub_ps(t2_SSE2,
1844 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
1845 _mm_mul_ps(prod_SSE2,uij3_SSE2)));
1846 t2_SSE3 = _mm_sub_ps(t2_SSE3,
1847 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
1848 _mm_mul_ps(prod_SSE3,uij3_SSE3)));
1849 t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
1850 _mm_mul_ps(rinv_SSE0,rinv_SSE0));
1851 t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
1852 _mm_mul_ps(rinv_SSE1,rinv_SSE1));
1853 t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
1854 _mm_mul_ps(rinv_SSE2,rinv_SSE2));
1855 t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
1856 _mm_mul_ps(rinv_SSE3,rinv_SSE3));
1857 t3_SSE0 = _mm_sub_ps(t3_SSE0,
1858 _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
1859 _mm_add_ps(one_SSE,
1860 _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
1861 t3_SSE1 = _mm_sub_ps(t3_SSE1,
1862 _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
1863 _mm_add_ps(one_SSE,
1864 _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
1865 t3_SSE2 = _mm_sub_ps(t3_SSE2,
1866 _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
1867 _mm_add_ps(one_SSE,
1868 _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
1869 t3_SSE3 = _mm_sub_ps(t3_SSE3,
1870 _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
1871 _mm_add_ps(one_SSE,
1872 _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
1874 t1_SSE0 = _mm_mul_ps(rinv_SSE0,
1875 _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
1876 _mm_add_ps(t2_SSE0,t3_SSE0)));
1877 t1_SSE1 = _mm_mul_ps(rinv_SSE1,
1878 _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
1879 _mm_add_ps(t2_SSE1,t3_SSE1)));
1880 t1_SSE2 = _mm_mul_ps(rinv_SSE2,
1881 _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
1882 _mm_add_ps(t2_SSE2,t3_SSE2)));
1883 t1_SSE3 = _mm_mul_ps(rinv_SSE3,
1884 _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
1885 _mm_add_ps(t2_SSE3,t3_SSE3)));
1887 _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
1888 dadx += 4;
1889 _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
1890 dadx += 4;
1891 _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
1892 dadx += 4;
1893 _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
1894 dadx += 4;
1896 /* Evaluate influence of atom ai -> aj */
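/* Same j atoms, roles reversed: the scale factor of atom i (sk_ai) and
 * the radius of atom j (raj) take the places of sk_aj and rai above,
 * and the result is accumulated into work[j] instead of the i-side
 * accumulators. */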
1897 t1_SSE0 = _mm_add_ps(dr_SSE0,sk_ai_SSE0);
1898 t1_SSE1 = _mm_add_ps(dr_SSE1,sk_ai_SSE1);
1899 t1_SSE2 = _mm_add_ps(dr_SSE2,sk_ai_SSE2);
1900 t1_SSE3 = _mm_add_ps(dr_SSE3,sk_ai_SSE3);
1901 t2_SSE0 = _mm_sub_ps(dr_SSE0,sk_ai_SSE0);
1902 t2_SSE1 = _mm_sub_ps(dr_SSE1,sk_ai_SSE1);
1903 t2_SSE2 = _mm_sub_ps(dr_SSE2,sk_ai_SSE2);
1904 t2_SSE3 = _mm_sub_ps(dr_SSE3,sk_ai_SSE3);
1905 t3_SSE0 = _mm_sub_ps(sk_ai_SSE0,dr_SSE0);
1906 t3_SSE1 = _mm_sub_ps(sk_ai_SSE1,dr_SSE1);
1907 t3_SSE2 = _mm_sub_ps(sk_ai_SSE2,dr_SSE2);
1908 t3_SSE3 = _mm_sub_ps(sk_ai_SSE3,dr_SSE3);
1910 obc_mask1_SSE0 = _mm_cmplt_ps(raj_SSE, t1_SSE0);
1911 obc_mask1_SSE1 = _mm_cmplt_ps(raj_SSE, t1_SSE1);
1912 obc_mask1_SSE2 = _mm_cmplt_ps(raj_SSE, t1_SSE2);
1913 obc_mask1_SSE3 = _mm_cmplt_ps(raj_SSE, t1_SSE3);
1914 obc_mask2_SSE0 = _mm_cmplt_ps(raj_SSE, t2_SSE0);
1915 obc_mask2_SSE1 = _mm_cmplt_ps(raj_SSE, t2_SSE1);
1916 obc_mask2_SSE2 = _mm_cmplt_ps(raj_SSE, t2_SSE2);
1917 obc_mask2_SSE3 = _mm_cmplt_ps(raj_SSE, t2_SSE3);
1918 obc_mask3_SSE0 = _mm_cmplt_ps(raj_SSE, t3_SSE0);
1919 obc_mask3_SSE1 = _mm_cmplt_ps(raj_SSE, t3_SSE1);
1920 obc_mask3_SSE2 = _mm_cmplt_ps(raj_SSE, t3_SSE2);
1921 obc_mask3_SSE3 = _mm_cmplt_ps(raj_SSE, t3_SSE3);
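/* Roughly, three geometric cases for the descreening sphere of radius
 * sk_ai around atom i, all evaluated branch-free as comparison masks:
 *   obc_mask1: raj < r+sk_ai  - the pair interacts at all
 *   obc_mask2: raj < r-sk_ai  - atom j lies entirely outside the sphere
 *   obc_mask3: raj < sk_ai-r  - atom j is completely engulfed by it
 */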
1922 obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0,jmask_SSE0);
1923 obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1,jmask_SSE1);
1924 obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2,jmask_SSE2);
1925 obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3,jmask_SSE3);
1927 uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
1928 uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
1929 uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
1930 uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
1931 lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
1932 _mm_andnot_ps(obc_mask2_SSE0,raj_inv_SSE));
1933 lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
1934 _mm_andnot_ps(obc_mask2_SSE1,raj_inv_SSE));
1935 lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
1936 _mm_andnot_ps(obc_mask2_SSE2,raj_inv_SSE));
1937 lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
1938 _mm_andnot_ps(obc_mask2_SSE3,raj_inv_SSE));
1939 dlij_SSE0 = _mm_and_ps(one_SSE,obc_mask2_SSE0);
1940 dlij_SSE1 = _mm_and_ps(one_SSE,obc_mask2_SSE1);
1941 dlij_SSE2 = _mm_and_ps(one_SSE,obc_mask2_SSE2);
1942 dlij_SSE3 = _mm_and_ps(one_SSE,obc_mask2_SSE3);
1944 uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
1945 uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
1946 uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
1947 uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
1948 uij3_SSE0 = _mm_mul_ps(uij2_SSE0,uij_SSE0);
1949 uij3_SSE1 = _mm_mul_ps(uij2_SSE1,uij_SSE1);
1950 uij3_SSE2 = _mm_mul_ps(uij2_SSE2,uij_SSE2);
1951 uij3_SSE3 = _mm_mul_ps(uij2_SSE3,uij_SSE3);
1952 lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
1953 lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
1954 lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
1955 lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
1956 lij3_SSE0 = _mm_mul_ps(lij2_SSE0,lij_SSE0);
1957 lij3_SSE1 = _mm_mul_ps(lij2_SSE1,lij_SSE1);
1958 lij3_SSE2 = _mm_mul_ps(lij2_SSE2,lij_SSE2);
1959 lij3_SSE3 = _mm_mul_ps(lij2_SSE3,lij_SSE3);
1961 diff2_SSE0 = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
1962 diff2_SSE1 = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
1963 diff2_SSE2 = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
1964 diff2_SSE3 = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
1965 lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
1966 lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
1967 lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
1968 lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
1969 sk2_rinv_SSE0 = _mm_mul_ps(sk2_ai_SSE0,rinv_SSE0);
1970 sk2_rinv_SSE1 = _mm_mul_ps(sk2_ai_SSE1,rinv_SSE1);
1971 sk2_rinv_SSE2 = _mm_mul_ps(sk2_ai_SSE2,rinv_SSE2);
1972 sk2_rinv_SSE3 = _mm_mul_ps(sk2_ai_SSE3,rinv_SSE3);
1973 prod_SSE0 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
1974 prod_SSE1 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
1975 prod_SSE2 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
1976 prod_SSE3 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
1978 logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
1979 logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
1980 logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
1981 logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
1982 t1_SSE0 = _mm_sub_ps(lij_SSE0,uij_SSE0);
1983 t1_SSE1 = _mm_sub_ps(lij_SSE1,uij_SSE1);
1984 t1_SSE2 = _mm_sub_ps(lij_SSE2,uij_SSE2);
1985 t1_SSE3 = _mm_sub_ps(lij_SSE3,uij_SSE3);
1986 t2_SSE0 = _mm_mul_ps(diff2_SSE0,
1987 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
1988 prod_SSE0));
1989 t2_SSE1 = _mm_mul_ps(diff2_SSE1,
1990 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
1991 prod_SSE1));
1992 t2_SSE2 = _mm_mul_ps(diff2_SSE2,
1993 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
1994 prod_SSE2));
1995 t2_SSE3 = _mm_mul_ps(diff2_SSE3,
1996 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
1997 prod_SSE3));
1998 t3_SSE0 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
1999 t3_SSE1 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
2000 t3_SSE2 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
2001 t3_SSE3 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
2002 t1_SSE0 = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
2003 t1_SSE1 = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
2004 t1_SSE2 = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
2005 t1_SSE3 = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
2006 t4_SSE0 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE0));
2007 t4_SSE1 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE1));
2008 t4_SSE2 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE2));
2009 t4_SSE3 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE3));
2010 t4_SSE0 = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
2011 t4_SSE1 = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
2012 t4_SSE2 = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
2013 t4_SSE3 = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
2014 t1_SSE0 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
2015 t1_SSE1 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
2016 t1_SSE2 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
2017 t1_SSE3 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
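/* Add the contributions from the four i rows to atom j's Born sum;
 * gmx_mm_sum4_ps performs the 4-way horizontal reduction so one
 * aligned load/store per j quadruplet suffices. */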
2019 _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
2020 gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0,obc_mask1_SSE0),
2021 _mm_and_ps(t1_SSE1,obc_mask1_SSE1),
2022 _mm_and_ps(t1_SSE2,obc_mask1_SSE2),
2023 _mm_and_ps(t1_SSE3,obc_mask1_SSE3))));
2025 t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
2026 _mm_mul_ps(prod_SSE0,lij3_SSE0));
2027 t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
2028 _mm_mul_ps(prod_SSE1,lij3_SSE1));
2029 t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
2030 _mm_mul_ps(prod_SSE2,lij3_SSE2));
2031 t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
2032 _mm_mul_ps(prod_SSE3,lij3_SSE3));
2033 t1_SSE0 = _mm_sub_ps(t1_SSE0,
2034 _mm_mul_ps(onefourth_SSE,
2035 _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
2036 _mm_mul_ps(lij3_SSE0,dr_SSE0))));
2037 t1_SSE1 = _mm_sub_ps(t1_SSE1,
2038 _mm_mul_ps(onefourth_SSE,
2039 _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
2040 _mm_mul_ps(lij3_SSE1,dr_SSE1))));
2041 t1_SSE2 = _mm_sub_ps(t1_SSE2,
2042 _mm_mul_ps(onefourth_SSE,
2043 _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
2044 _mm_mul_ps(lij3_SSE2,dr_SSE2))));
2045 t1_SSE3 = _mm_sub_ps(t1_SSE3,
2046 _mm_mul_ps(onefourth_SSE,
2047 _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
2048 _mm_mul_ps(lij3_SSE3,dr_SSE3))));
2049 t2_SSE0 = _mm_mul_ps(onefourth_SSE,
2050 _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
2051 _mm_mul_ps(uij3_SSE0,dr_SSE0)));
2052 t2_SSE1 = _mm_mul_ps(onefourth_SSE,
2053 _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
2054 _mm_mul_ps(uij3_SSE1,dr_SSE1)));
2055 t2_SSE2 = _mm_mul_ps(onefourth_SSE,
2056 _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
2057 _mm_mul_ps(uij3_SSE2,dr_SSE2)));
2058 t2_SSE3 = _mm_mul_ps(onefourth_SSE,
2059 _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
2060 _mm_mul_ps(uij3_SSE3,dr_SSE3)));
2061 t2_SSE0 = _mm_sub_ps(t2_SSE0,
2062 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
2063 _mm_mul_ps(prod_SSE0,uij3_SSE0)));
2064 t2_SSE1 = _mm_sub_ps(t2_SSE1,
2065 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
2066 _mm_mul_ps(prod_SSE1,uij3_SSE1)));
2067 t2_SSE2 = _mm_sub_ps(t2_SSE2,
2068 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
2069 _mm_mul_ps(prod_SSE2,uij3_SSE2)));
2070 t2_SSE3 = _mm_sub_ps(t2_SSE3,
2071 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
2072 _mm_mul_ps(prod_SSE3,uij3_SSE3)));
2074 t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
2075 _mm_mul_ps(rinv_SSE0,rinv_SSE0));
2076 t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
2077 _mm_mul_ps(rinv_SSE1,rinv_SSE1));
2078 t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
2079 _mm_mul_ps(rinv_SSE2,rinv_SSE2));
2080 t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
2081 _mm_mul_ps(rinv_SSE3,rinv_SSE3));
2083 t3_SSE0 = _mm_sub_ps(t3_SSE0,
2084 _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
2085 _mm_add_ps(one_SSE,
2086 _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
2087 t3_SSE1 = _mm_sub_ps(t3_SSE1,
2088 _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
2089 _mm_add_ps(one_SSE,
2090 _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
2091 t3_SSE2 = _mm_sub_ps(t3_SSE2,
2092 _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
2093 _mm_add_ps(one_SSE,
2094 _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
2095 t3_SSE3 = _mm_sub_ps(t3_SSE3,
2096 _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
2097 _mm_add_ps(one_SSE,
2098 _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
2101 t1_SSE0 = _mm_mul_ps(rinv_SSE0,
2102 _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
2103 _mm_add_ps(t2_SSE0,t3_SSE0)));
2104 t1_SSE1 = _mm_mul_ps(rinv_SSE1,
2105 _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
2106 _mm_add_ps(t2_SSE1,t3_SSE1)));
2107 t1_SSE2 = _mm_mul_ps(rinv_SSE2,
2108 _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
2109 _mm_add_ps(t2_SSE2,t3_SSE2)));
2110 t1_SSE3 = _mm_mul_ps(rinv_SSE3,
2111 _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
2112 _mm_add_ps(t2_SSE3,t3_SSE3)));
2114 _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
2115 dadx += 4;
2116 _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
2117 dadx += 4;
2118 _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
2119 dadx += 4;
2120 _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2121 dadx += 4;
2122 }
2125 /* Main part, no exclusions */
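/* In the j-range [nj1,nj2) no exclusions can occur, so only the
 * constant all-vs-all mask imask is applied and no per-j exclusion
 * masks have to be loaded - this is typically where most of the
 * time is spent. */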
2126 for(j=nj1; j<nj2; j+=UNROLLJ)
2127 {
2128 /* load j atom coordinates */
2129 jx_SSE = _mm_load_ps(x_align+j);
2130 jy_SSE = _mm_load_ps(y_align+j);
2131 jz_SSE = _mm_load_ps(z_align+j);
2133 /* Calculate distance */
2134 dx_SSE0 = _mm_sub_ps(ix_SSE0,jx_SSE);
2135 dy_SSE0 = _mm_sub_ps(iy_SSE0,jy_SSE);
2136 dz_SSE0 = _mm_sub_ps(iz_SSE0,jz_SSE);
2137 dx_SSE1 = _mm_sub_ps(ix_SSE1,jx_SSE);
2138 dy_SSE1 = _mm_sub_ps(iy_SSE1,jy_SSE);
2139 dz_SSE1 = _mm_sub_ps(iz_SSE1,jz_SSE);
2140 dx_SSE2 = _mm_sub_ps(ix_SSE2,jx_SSE);
2141 dy_SSE2 = _mm_sub_ps(iy_SSE2,jy_SSE);
2142 dz_SSE2 = _mm_sub_ps(iz_SSE2,jz_SSE);
2143 dx_SSE3 = _mm_sub_ps(ix_SSE3,jx_SSE);
2144 dy_SSE3 = _mm_sub_ps(iy_SSE3,jy_SSE);
2145 dz_SSE3 = _mm_sub_ps(iz_SSE3,jz_SSE);
2147 /* rsq = dx*dx+dy*dy+dz*dz */
2148 rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
2149 rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
2150 rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
2151 rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
2153 /* Calculate 1/r */
2154 rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
2155 rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
2156 rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
2157 rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);
2159 /* Apply mask */
2160 rinv_SSE0 = _mm_and_ps(rinv_SSE0,imask_SSE0);
2161 rinv_SSE1 = _mm_and_ps(rinv_SSE1,imask_SSE1);
2162 rinv_SSE2 = _mm_and_ps(rinv_SSE2,imask_SSE2);
2163 rinv_SSE3 = _mm_and_ps(rinv_SSE3,imask_SSE3);
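/* Zeroing rinv in lanes killed by imask presumably keeps r=0
 * self-interactions from producing Inf/NaN; the masked lanes are in
 * any case removed when obc_mask1 is AND:ed with imask before
 * anything is accumulated or stored. */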
2165 dr_SSE0 = _mm_mul_ps(rsq_SSE0,rinv_SSE0);
2166 dr_SSE1 = _mm_mul_ps(rsq_SSE1,rinv_SSE1);
2167 dr_SSE2 = _mm_mul_ps(rsq_SSE2,rinv_SSE2);
2168 dr_SSE3 = _mm_mul_ps(rsq_SSE3,rinv_SSE3);
2170 sk_aj_SSE = _mm_load_ps(obc_param+j);
2171 raj_SSE = _mm_load_ps(gb_radius+j);
2173 raj_inv_SSE = gmx_mm_inv_ps(raj_SSE);
2175 /* Evaluate influence of atom aj -> ai */
2176 t1_SSE0 = _mm_add_ps(dr_SSE0,sk_aj_SSE);
2177 t1_SSE1 = _mm_add_ps(dr_SSE1,sk_aj_SSE);
2178 t1_SSE2 = _mm_add_ps(dr_SSE2,sk_aj_SSE);
2179 t1_SSE3 = _mm_add_ps(dr_SSE3,sk_aj_SSE);
2180 t2_SSE0 = _mm_sub_ps(dr_SSE0,sk_aj_SSE);
2181 t2_SSE1 = _mm_sub_ps(dr_SSE1,sk_aj_SSE);
2182 t2_SSE2 = _mm_sub_ps(dr_SSE2,sk_aj_SSE);
2183 t2_SSE3 = _mm_sub_ps(dr_SSE3,sk_aj_SSE);
2184 t3_SSE0 = _mm_sub_ps(sk_aj_SSE,dr_SSE0);
2185 t3_SSE1 = _mm_sub_ps(sk_aj_SSE,dr_SSE1);
2186 t3_SSE2 = _mm_sub_ps(sk_aj_SSE,dr_SSE2);
2187 t3_SSE3 = _mm_sub_ps(sk_aj_SSE,dr_SSE3);
2189 obc_mask1_SSE0 = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
2190 obc_mask1_SSE1 = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
2191 obc_mask1_SSE2 = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
2192 obc_mask1_SSE3 = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
2193 obc_mask2_SSE0 = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
2194 obc_mask2_SSE1 = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
2195 obc_mask2_SSE2 = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
2196 obc_mask2_SSE3 = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
2197 obc_mask3_SSE0 = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
2198 obc_mask3_SSE1 = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
2199 obc_mask3_SSE2 = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
2200 obc_mask3_SSE3 = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
2201 obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0,imask_SSE0);
2202 obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1,imask_SSE1);
2203 obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2,imask_SSE2);
2204 obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3,imask_SSE3);
2206 uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
2207 uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
2208 uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
2209 uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
2210 lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
2211 _mm_andnot_ps(obc_mask2_SSE0,rai_inv_SSE0));
2212 lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
2213 _mm_andnot_ps(obc_mask2_SSE1,rai_inv_SSE1));
2214 lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
2215 _mm_andnot_ps(obc_mask2_SSE2,rai_inv_SSE2));
2216 lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
2217 _mm_andnot_ps(obc_mask2_SSE3,rai_inv_SSE3));
2218 dlij_SSE0 = _mm_and_ps(one_SSE,obc_mask2_SSE0);
2219 dlij_SSE1 = _mm_and_ps(one_SSE,obc_mask2_SSE1);
2220 dlij_SSE2 = _mm_and_ps(one_SSE,obc_mask2_SSE2);
2221 dlij_SSE3 = _mm_and_ps(one_SSE,obc_mask2_SSE3);
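/* Branch-free select for the lower limit: lij = 1/(r-sk_aj) where
 * atom i lies entirely outside atom j's descreening sphere
 * (obc_mask2 set), otherwise 1/rai. dlij is the matching 1.0/0.0
 * switch that disables the dL/dr part of the derivative when the
 * lower limit is pinned at 1/rai. */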
2223 uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
2224 uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
2225 uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
2226 uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
2227 uij3_SSE0 = _mm_mul_ps(uij2_SSE0,uij_SSE0);
2228 uij3_SSE1 = _mm_mul_ps(uij2_SSE1,uij_SSE1);
2229 uij3_SSE2 = _mm_mul_ps(uij2_SSE2,uij_SSE2);
2230 uij3_SSE3 = _mm_mul_ps(uij2_SSE3,uij_SSE3);
2231 lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
2232 lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
2233 lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
2234 lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
2235 lij3_SSE0 = _mm_mul_ps(lij2_SSE0,lij_SSE0);
2236 lij3_SSE1 = _mm_mul_ps(lij2_SSE1,lij_SSE1);
2237 lij3_SSE2 = _mm_mul_ps(lij2_SSE2,lij_SSE2);
2238 lij3_SSE3 = _mm_mul_ps(lij2_SSE3,lij_SSE3);
2240 diff2_SSE0 = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
2241 diff2_SSE1 = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
2242 diff2_SSE2 = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
2243 diff2_SSE3 = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
2244 lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
2245 lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
2246 lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
2247 lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
2248 sk2_aj_SSE = _mm_mul_ps(sk_aj_SSE,sk_aj_SSE);
2249 sk2_rinv_SSE0 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE0);
2250 sk2_rinv_SSE1 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE1);
2251 sk2_rinv_SSE2 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE2);
2252 sk2_rinv_SSE3 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE3);
2253 prod_SSE0 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
2254 prod_SSE1 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
2255 prod_SSE2 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
2256 prod_SSE3 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
2258 logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
2259 logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
2260 logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
2261 logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
2263 t1_SSE0 = _mm_sub_ps(lij_SSE0,uij_SSE0);
2264 t1_SSE1 = _mm_sub_ps(lij_SSE1,uij_SSE1);
2265 t1_SSE2 = _mm_sub_ps(lij_SSE2,uij_SSE2);
2266 t1_SSE3 = _mm_sub_ps(lij_SSE3,uij_SSE3);
2267 t2_SSE0 = _mm_mul_ps(diff2_SSE0,
2268 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
2269 prod_SSE0));
2270 t2_SSE1 = _mm_mul_ps(diff2_SSE1,
2271 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
2272 prod_SSE1));
2273 t2_SSE2 = _mm_mul_ps(diff2_SSE2,
2274 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
2275 prod_SSE2));
2276 t2_SSE3 = _mm_mul_ps(diff2_SSE3,
2277 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
2278 prod_SSE3));
2280 t3_SSE0 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
2281 t3_SSE1 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
2282 t3_SSE2 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
2283 t3_SSE3 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
2284 t1_SSE0 = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
2285 t1_SSE1 = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
2286 t1_SSE2 = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
2287 t1_SSE3 = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
2288 t4_SSE0 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE0,lij_SSE0));
2289 t4_SSE1 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE1,lij_SSE1));
2290 t4_SSE2 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE2,lij_SSE2));
2291 t4_SSE3 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE3,lij_SSE3));
2292 t4_SSE0 = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
2293 t4_SSE1 = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
2294 t4_SSE2 = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
2295 t4_SSE3 = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
2296 t1_SSE0 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
2297 t1_SSE1 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
2298 t1_SSE2 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
2299 t1_SSE3 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
2301 sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
2302 sum_ai_SSE1 = _mm_add_ps(sum_ai_SSE1,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
2303 sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
2304 sum_ai_SSE3 = _mm_add_ps(sum_ai_SSE3,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2306 t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
2307 _mm_mul_ps(prod_SSE0,lij3_SSE0));
2308 t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
2309 _mm_mul_ps(prod_SSE1,lij3_SSE1));
2310 t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
2311 _mm_mul_ps(prod_SSE2,lij3_SSE2));
2312 t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
2313 _mm_mul_ps(prod_SSE3,lij3_SSE3));
2314 t1_SSE0 = _mm_sub_ps(t1_SSE0,
2315 _mm_mul_ps(onefourth_SSE,
2316 _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
2317 _mm_mul_ps(lij3_SSE0,dr_SSE0))));
2318 t1_SSE1 = _mm_sub_ps(t1_SSE1,
2319 _mm_mul_ps(onefourth_SSE,
2320 _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
2321 _mm_mul_ps(lij3_SSE1,dr_SSE1))));
2322 t1_SSE2 = _mm_sub_ps(t1_SSE2,
2323 _mm_mul_ps(onefourth_SSE,
2324 _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
2325 _mm_mul_ps(lij3_SSE2,dr_SSE2))));
2326 t1_SSE3 = _mm_sub_ps(t1_SSE3,
2327 _mm_mul_ps(onefourth_SSE,
2328 _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
2329 _mm_mul_ps(lij3_SSE3,dr_SSE3))));
2331 t2_SSE0 = _mm_mul_ps(onefourth_SSE,
2332 _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
2333 _mm_mul_ps(uij3_SSE0,dr_SSE0)));
2334 t2_SSE1 = _mm_mul_ps(onefourth_SSE,
2335 _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
2336 _mm_mul_ps(uij3_SSE1,dr_SSE1)));
2337 t2_SSE2 = _mm_mul_ps(onefourth_SSE,
2338 _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
2339 _mm_mul_ps(uij3_SSE2,dr_SSE2)));
2340 t2_SSE3 = _mm_mul_ps(onefourth_SSE,
2341 _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
2342 _mm_mul_ps(uij3_SSE3,dr_SSE3)));
2343 t2_SSE0 = _mm_sub_ps(t2_SSE0,
2344 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
2345 _mm_mul_ps(prod_SSE0,uij3_SSE0)));
2346 t2_SSE1 = _mm_sub_ps(t2_SSE1,
2347 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
2348 _mm_mul_ps(prod_SSE1,uij3_SSE1)));
2349 t2_SSE2 = _mm_sub_ps(t2_SSE2,
2350 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
2351 _mm_mul_ps(prod_SSE2,uij3_SSE2)));
2352 t2_SSE3 = _mm_sub_ps(t2_SSE3,
2353 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
2354 _mm_mul_ps(prod_SSE3,uij3_SSE3)));
2355 t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
2356 _mm_mul_ps(rinv_SSE0,rinv_SSE0));
2357 t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
2358 _mm_mul_ps(rinv_SSE1,rinv_SSE1));
2359 t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
2360 _mm_mul_ps(rinv_SSE2,rinv_SSE2));
2361 t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
2362 _mm_mul_ps(rinv_SSE3,rinv_SSE3));
2363 t3_SSE0 = _mm_sub_ps(t3_SSE0,
2364 _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
2365 _mm_add_ps(one_SSE,
2366 _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
2367 t3_SSE1 = _mm_sub_ps(t3_SSE1,
2368 _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
2369 _mm_add_ps(one_SSE,
2370 _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
2371 t3_SSE2 = _mm_sub_ps(t3_SSE2,
2372 _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
2373 _mm_add_ps(one_SSE,
2374 _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
2375 t3_SSE3 = _mm_sub_ps(t3_SSE3,
2376 _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
2377 _mm_add_ps(one_SSE,
2378 _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
2380 t1_SSE0 = _mm_mul_ps(rinv_SSE0,
2381 _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
2382 _mm_add_ps(t2_SSE0,t3_SSE0)));
2383 t1_SSE1 = _mm_mul_ps(rinv_SSE1,
2384 _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
2385 _mm_add_ps(t2_SSE1,t3_SSE1)));
2386 t1_SSE2 = _mm_mul_ps(rinv_SSE2,
2387 _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
2388 _mm_add_ps(t2_SSE2,t3_SSE2)));
2389 t1_SSE3 = _mm_mul_ps(rinv_SSE3,
2390 _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
2391 _mm_add_ps(t2_SSE3,t3_SSE3)));
2393 _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
2394 dadx += 4;
2395 _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
2396 dadx += 4;
2397 _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
2398 dadx += 4;
2399 _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2400 dadx += 4;
2402 /* Evaluate influence of atom ai -> aj */
2403 t1_SSE0 = _mm_add_ps(dr_SSE0,sk_ai_SSE0);
2404 t1_SSE1 = _mm_add_ps(dr_SSE1,sk_ai_SSE1);
2405 t1_SSE2 = _mm_add_ps(dr_SSE2,sk_ai_SSE2);
2406 t1_SSE3 = _mm_add_ps(dr_SSE3,sk_ai_SSE3);
2407 t2_SSE0 = _mm_sub_ps(dr_SSE0,sk_ai_SSE0);
2408 t2_SSE1 = _mm_sub_ps(dr_SSE1,sk_ai_SSE1);
2409 t2_SSE2 = _mm_sub_ps(dr_SSE2,sk_ai_SSE2);
2410 t2_SSE3 = _mm_sub_ps(dr_SSE3,sk_ai_SSE3);
2411 t3_SSE0 = _mm_sub_ps(sk_ai_SSE0,dr_SSE0);
2412 t3_SSE1 = _mm_sub_ps(sk_ai_SSE1,dr_SSE1);
2413 t3_SSE2 = _mm_sub_ps(sk_ai_SSE2,dr_SSE2);
2414 t3_SSE3 = _mm_sub_ps(sk_ai_SSE3,dr_SSE3);
2416 obc_mask1_SSE0 = _mm_cmplt_ps(raj_SSE, t1_SSE0);
2417 obc_mask1_SSE1 = _mm_cmplt_ps(raj_SSE, t1_SSE1);
2418 obc_mask1_SSE2 = _mm_cmplt_ps(raj_SSE, t1_SSE2);
2419 obc_mask1_SSE3 = _mm_cmplt_ps(raj_SSE, t1_SSE3);
2420 obc_mask2_SSE0 = _mm_cmplt_ps(raj_SSE, t2_SSE0);
2421 obc_mask2_SSE1 = _mm_cmplt_ps(raj_SSE, t2_SSE1);
2422 obc_mask2_SSE2 = _mm_cmplt_ps(raj_SSE, t2_SSE2);
2423 obc_mask2_SSE3 = _mm_cmplt_ps(raj_SSE, t2_SSE3);
2424 obc_mask3_SSE0 = _mm_cmplt_ps(raj_SSE, t3_SSE0);
2425 obc_mask3_SSE1 = _mm_cmplt_ps(raj_SSE, t3_SSE1);
2426 obc_mask3_SSE2 = _mm_cmplt_ps(raj_SSE, t3_SSE2);
2427 obc_mask3_SSE3 = _mm_cmplt_ps(raj_SSE, t3_SSE3);
2428 obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0,imask_SSE0);
2429 obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1,imask_SSE1);
2430 obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2,imask_SSE2);
2431 obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3,imask_SSE3);
2433 uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
2434 uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
2435 uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
2436 uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
2437 lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
2438 _mm_andnot_ps(obc_mask2_SSE0,raj_inv_SSE));
2439 lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
2440 _mm_andnot_ps(obc_mask2_SSE1,raj_inv_SSE));
2441 lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
2442 _mm_andnot_ps(obc_mask2_SSE2,raj_inv_SSE));
2443 lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
2444 _mm_andnot_ps(obc_mask2_SSE3,raj_inv_SSE));
2445 dlij_SSE0 = _mm_and_ps(one_SSE,obc_mask2_SSE0);
2446 dlij_SSE1 = _mm_and_ps(one_SSE,obc_mask2_SSE1);
2447 dlij_SSE2 = _mm_and_ps(one_SSE,obc_mask2_SSE2);
2448 dlij_SSE3 = _mm_and_ps(one_SSE,obc_mask2_SSE3);
2450 uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
2451 uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
2452 uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
2453 uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
2454 uij3_SSE0 = _mm_mul_ps(uij2_SSE0,uij_SSE0);
2455 uij3_SSE1 = _mm_mul_ps(uij2_SSE1,uij_SSE1);
2456 uij3_SSE2 = _mm_mul_ps(uij2_SSE2,uij_SSE2);
2457 uij3_SSE3 = _mm_mul_ps(uij2_SSE3,uij_SSE3);
2458 lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
2459 lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
2460 lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
2461 lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
2462 lij3_SSE0 = _mm_mul_ps(lij2_SSE0,lij_SSE0);
2463 lij3_SSE1 = _mm_mul_ps(lij2_SSE1,lij_SSE1);
2464 lij3_SSE2 = _mm_mul_ps(lij2_SSE2,lij_SSE2);
2465 lij3_SSE3 = _mm_mul_ps(lij2_SSE3,lij_SSE3);
2467 diff2_SSE0 = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
2468 diff2_SSE1 = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
2469 diff2_SSE2 = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
2470 diff2_SSE3 = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
2471 lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
2472 lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
2473 lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
2474 lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
2475 sk2_rinv_SSE0 = _mm_mul_ps(sk2_ai_SSE0,rinv_SSE0);
2476 sk2_rinv_SSE1 = _mm_mul_ps(sk2_ai_SSE1,rinv_SSE1);
2477 sk2_rinv_SSE2 = _mm_mul_ps(sk2_ai_SSE2,rinv_SSE2);
2478 sk2_rinv_SSE3 = _mm_mul_ps(sk2_ai_SSE3,rinv_SSE3);
2479 prod_SSE0 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
2480 prod_SSE1 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
2481 prod_SSE2 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
2482 prod_SSE3 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
2484 logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
2485 logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
2486 logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
2487 logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
2488 t1_SSE0 = _mm_sub_ps(lij_SSE0,uij_SSE0);
2489 t1_SSE1 = _mm_sub_ps(lij_SSE1,uij_SSE1);
2490 t1_SSE2 = _mm_sub_ps(lij_SSE2,uij_SSE2);
2491 t1_SSE3 = _mm_sub_ps(lij_SSE3,uij_SSE3);
2492 t2_SSE0 = _mm_mul_ps(diff2_SSE0,
2493 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
2494 prod_SSE0));
2495 t2_SSE1 = _mm_mul_ps(diff2_SSE1,
2496 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
2497 prod_SSE1));
2498 t2_SSE2 = _mm_mul_ps(diff2_SSE2,
2499 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
2500 prod_SSE2));
2501 t2_SSE3 = _mm_mul_ps(diff2_SSE3,
2502 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
2503 prod_SSE3));
2504 t3_SSE0 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
2505 t3_SSE1 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
2506 t3_SSE2 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
2507 t3_SSE3 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
2508 t1_SSE0 = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
2509 t1_SSE1 = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
2510 t1_SSE2 = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
2511 t1_SSE3 = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
2512 t4_SSE0 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE0));
2513 t4_SSE1 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE1));
2514 t4_SSE2 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE2));
2515 t4_SSE3 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE3));
2516 t4_SSE0 = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
2517 t4_SSE1 = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
2518 t4_SSE2 = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
2519 t4_SSE3 = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
2520 t1_SSE0 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
2521 t1_SSE1 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
2522 t1_SSE2 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
2523 t1_SSE3 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
2525 _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
2526 gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0,obc_mask1_SSE0),
2527 _mm_and_ps(t1_SSE1,obc_mask1_SSE1),
2528 _mm_and_ps(t1_SSE2,obc_mask1_SSE2),
2529 _mm_and_ps(t1_SSE3,obc_mask1_SSE3))));
2531 t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
2532 _mm_mul_ps(prod_SSE0,lij3_SSE0));
2533 t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
2534 _mm_mul_ps(prod_SSE1,lij3_SSE1));
2535 t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
2536 _mm_mul_ps(prod_SSE2,lij3_SSE2));
2537 t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
2538 _mm_mul_ps(prod_SSE3,lij3_SSE3));
2539 t1_SSE0 = _mm_sub_ps(t1_SSE0,
2540 _mm_mul_ps(onefourth_SSE,
2541 _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
2542 _mm_mul_ps(lij3_SSE0,dr_SSE0))));
2543 t1_SSE1 = _mm_sub_ps(t1_SSE1,
2544 _mm_mul_ps(onefourth_SSE,
2545 _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
2546 _mm_mul_ps(lij3_SSE1,dr_SSE1))));
2547 t1_SSE2 = _mm_sub_ps(t1_SSE2,
2548 _mm_mul_ps(onefourth_SSE,
2549 _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
2550 _mm_mul_ps(lij3_SSE2,dr_SSE2))));
2551 t1_SSE3 = _mm_sub_ps(t1_SSE3,
2552 _mm_mul_ps(onefourth_SSE,
2553 _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
2554 _mm_mul_ps(lij3_SSE3,dr_SSE3))));
2555 t2_SSE0 = _mm_mul_ps(onefourth_SSE,
2556 _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
2557 _mm_mul_ps(uij3_SSE0,dr_SSE0)));
2558 t2_SSE1 = _mm_mul_ps(onefourth_SSE,
2559 _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
2560 _mm_mul_ps(uij3_SSE1,dr_SSE1)));
2561 t2_SSE2 = _mm_mul_ps(onefourth_SSE,
2562 _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
2563 _mm_mul_ps(uij3_SSE2,dr_SSE2)));
2564 t2_SSE3 = _mm_mul_ps(onefourth_SSE,
2565 _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
2566 _mm_mul_ps(uij3_SSE3,dr_SSE3)));
2567 t2_SSE0 = _mm_sub_ps(t2_SSE0,
2568 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
2569 _mm_mul_ps(prod_SSE0,uij3_SSE0)));
2570 t2_SSE1 = _mm_sub_ps(t2_SSE1,
2571 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
2572 _mm_mul_ps(prod_SSE1,uij3_SSE1)));
2573 t2_SSE2 = _mm_sub_ps(t2_SSE2,
2574 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
2575 _mm_mul_ps(prod_SSE2,uij3_SSE2)));
2576 t2_SSE3 = _mm_sub_ps(t2_SSE3,
2577 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
2578 _mm_mul_ps(prod_SSE3,uij3_SSE3)));
2580 t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
2581 _mm_mul_ps(rinv_SSE0,rinv_SSE0));
2582 t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
2583 _mm_mul_ps(rinv_SSE1,rinv_SSE1));
2584 t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
2585 _mm_mul_ps(rinv_SSE2,rinv_SSE2));
2586 t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
2587 _mm_mul_ps(rinv_SSE3,rinv_SSE3));
2589 t3_SSE0 = _mm_sub_ps(t3_SSE0,
2590 _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
2591 _mm_add_ps(one_SSE,
2592 _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
2593 t3_SSE1 = _mm_sub_ps(t3_SSE1,
2594 _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
2595 _mm_add_ps(one_SSE,
2596 _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
2597 t3_SSE2 = _mm_sub_ps(t3_SSE2,
2598 _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
2599 _mm_add_ps(one_SSE,
2600 _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
2601 t3_SSE3 = _mm_sub_ps(t3_SSE3,
2602 _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
2603 _mm_add_ps(one_SSE,
2604 _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
2606 t1_SSE0 = _mm_mul_ps(rinv_SSE0,
2607 _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
2608 _mm_add_ps(t2_SSE0,t3_SSE0)));
2609 t1_SSE1 = _mm_mul_ps(rinv_SSE1,
2610 _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
2611 _mm_add_ps(t2_SSE1,t3_SSE1)));
2612 t1_SSE2 = _mm_mul_ps(rinv_SSE2,
2613 _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
2614 _mm_add_ps(t2_SSE2,t3_SSE2)));
2615 t1_SSE3 = _mm_mul_ps(rinv_SSE3,
2616 _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
2617 _mm_add_ps(t2_SSE3,t3_SSE3)));
2619 _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
2620 dadx += 4;
2621 _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
2622 dadx += 4;
2623 _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
2624 dadx += 4;
2625 _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2626 dadx += 4;
2627 }
2629 /* Epilogue part, including exclusion mask */
2630 for(j=nj2; j<nj3; j+=UNROLLJ)
2631 {
2632 jmask_SSE0 = _mm_load_ps((real *)emask0);
2633 jmask_SSE1 = _mm_load_ps((real *)emask1);
2634 jmask_SSE2 = _mm_load_ps((real *)emask2);
2635 jmask_SSE3 = _mm_load_ps((real *)emask3);
2636 emask0 += UNROLLJ;
2637 emask1 += UNROLLJ;
2638 emask2 += UNROLLJ;
2639 emask3 += UNROLLJ;
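/* The epilogue range carries the per-pair exclusion masks built in
 * setup_gb_exclusions_and_indices(); they are AND:ed with imask below
 * so excluded pairs drop out of both the i- and j-side sums. */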
2641 /* load j atom coordinates */
2642 jx_SSE = _mm_load_ps(x_align+j);
2643 jy_SSE = _mm_load_ps(y_align+j);
2644 jz_SSE = _mm_load_ps(z_align+j);
2646 /* Calculate distance */
2647 dx_SSE0 = _mm_sub_ps(ix_SSE0,jx_SSE);
2648 dy_SSE0 = _mm_sub_ps(iy_SSE0,jy_SSE);
2649 dz_SSE0 = _mm_sub_ps(iz_SSE0,jz_SSE);
2650 dx_SSE1 = _mm_sub_ps(ix_SSE1,jx_SSE);
2651 dy_SSE1 = _mm_sub_ps(iy_SSE1,jy_SSE);
2652 dz_SSE1 = _mm_sub_ps(iz_SSE1,jz_SSE);
2653 dx_SSE2 = _mm_sub_ps(ix_SSE2,jx_SSE);
2654 dy_SSE2 = _mm_sub_ps(iy_SSE2,jy_SSE);
2655 dz_SSE2 = _mm_sub_ps(iz_SSE2,jz_SSE);
2656 dx_SSE3 = _mm_sub_ps(ix_SSE3,jx_SSE);
2657 dy_SSE3 = _mm_sub_ps(iy_SSE3,jy_SSE);
2658 dz_SSE3 = _mm_sub_ps(iz_SSE3,jz_SSE);
2660 /* rsq = dx*dx+dy*dy+dz*dz */
2661 rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
2662 rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
2663 rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
2664 rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
2666 /* Combine masks */
2667 jmask_SSE0 = _mm_and_ps(jmask_SSE0,imask_SSE0);
2668 jmask_SSE1 = _mm_and_ps(jmask_SSE1,imask_SSE1);
2669 jmask_SSE2 = _mm_and_ps(jmask_SSE2,imask_SSE2);
2670 jmask_SSE3 = _mm_and_ps(jmask_SSE3,imask_SSE3);
2672 /* Calculate 1/r */
2673 rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
2674 rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
2675 rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
2676 rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);
2678 /* Apply mask */
2679 rinv_SSE0 = _mm_and_ps(rinv_SSE0,jmask_SSE0);
2680 rinv_SSE1 = _mm_and_ps(rinv_SSE1,jmask_SSE1);
2681 rinv_SSE2 = _mm_and_ps(rinv_SSE2,jmask_SSE2);
2682 rinv_SSE3 = _mm_and_ps(rinv_SSE3,jmask_SSE3);
2684 dr_SSE0 = _mm_mul_ps(rsq_SSE0,rinv_SSE0);
2685 dr_SSE1 = _mm_mul_ps(rsq_SSE1,rinv_SSE1);
2686 dr_SSE2 = _mm_mul_ps(rsq_SSE2,rinv_SSE2);
2687 dr_SSE3 = _mm_mul_ps(rsq_SSE3,rinv_SSE3);
2689 sk_aj_SSE = _mm_load_ps(obc_param+j);
2690 raj_SSE = _mm_load_ps(gb_radius+j);
2692 raj_inv_SSE = gmx_mm_inv_ps(raj_SSE);
2694 /* Evaluate influence of atom aj -> ai */
2695 t1_SSE0 = _mm_add_ps(dr_SSE0,sk_aj_SSE);
2696 t1_SSE1 = _mm_add_ps(dr_SSE1,sk_aj_SSE);
2697 t1_SSE2 = _mm_add_ps(dr_SSE2,sk_aj_SSE);
2698 t1_SSE3 = _mm_add_ps(dr_SSE3,sk_aj_SSE);
2699 t2_SSE0 = _mm_sub_ps(dr_SSE0,sk_aj_SSE);
2700 t2_SSE1 = _mm_sub_ps(dr_SSE1,sk_aj_SSE);
2701 t2_SSE2 = _mm_sub_ps(dr_SSE2,sk_aj_SSE);
2702 t2_SSE3 = _mm_sub_ps(dr_SSE3,sk_aj_SSE);
2703 t3_SSE0 = _mm_sub_ps(sk_aj_SSE,dr_SSE0);
2704 t3_SSE1 = _mm_sub_ps(sk_aj_SSE,dr_SSE1);
2705 t3_SSE2 = _mm_sub_ps(sk_aj_SSE,dr_SSE2);
2706 t3_SSE3 = _mm_sub_ps(sk_aj_SSE,dr_SSE3);
2708 obc_mask1_SSE0 = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
2709 obc_mask1_SSE1 = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
2710 obc_mask1_SSE2 = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
2711 obc_mask1_SSE3 = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
2712 obc_mask2_SSE0 = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
2713 obc_mask2_SSE1 = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
2714 obc_mask2_SSE2 = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
2715 obc_mask2_SSE3 = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
2716 obc_mask3_SSE0 = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
2717 obc_mask3_SSE1 = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
2718 obc_mask3_SSE2 = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
2719 obc_mask3_SSE3 = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
2720 obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0,jmask_SSE0);
2721 obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1,jmask_SSE1);
2722 obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2,jmask_SSE2);
2723 obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3,jmask_SSE3);
2725 uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
2726 uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
2727 uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
2728 uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
2729 lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
2730 _mm_andnot_ps(obc_mask2_SSE0,rai_inv_SSE0));
2731 lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
2732 _mm_andnot_ps(obc_mask2_SSE1,rai_inv_SSE1));
2733 lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
2734 _mm_andnot_ps(obc_mask2_SSE2,rai_inv_SSE2));
2735 lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
2736 _mm_andnot_ps(obc_mask2_SSE3,rai_inv_SSE3));
2737 dlij_SSE0 = _mm_and_ps(one_SSE,obc_mask2_SSE0);
2738 dlij_SSE1 = _mm_and_ps(one_SSE,obc_mask2_SSE1);
2739 dlij_SSE2 = _mm_and_ps(one_SSE,obc_mask2_SSE2);
2740 dlij_SSE3 = _mm_and_ps(one_SSE,obc_mask2_SSE3);
2742 uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
2743 uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
2744 uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
2745 uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
2746 uij3_SSE0 = _mm_mul_ps(uij2_SSE0,uij_SSE0);
2747 uij3_SSE1 = _mm_mul_ps(uij2_SSE1,uij_SSE1);
2748 uij3_SSE2 = _mm_mul_ps(uij2_SSE2,uij_SSE2);
2749 uij3_SSE3 = _mm_mul_ps(uij2_SSE3,uij_SSE3);
2750 lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
2751 lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
2752 lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
2753 lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
2754 lij3_SSE0 = _mm_mul_ps(lij2_SSE0,lij_SSE0);
2755 lij3_SSE1 = _mm_mul_ps(lij2_SSE1,lij_SSE1);
2756 lij3_SSE2 = _mm_mul_ps(lij2_SSE2,lij_SSE2);
2757 lij3_SSE3 = _mm_mul_ps(lij2_SSE3,lij_SSE3);
2759 diff2_SSE0 = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
2760 diff2_SSE1 = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
2761 diff2_SSE2 = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
2762 diff2_SSE3 = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
2763 lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
2764 lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
2765 lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
2766 lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
2767 sk2_aj_SSE = _mm_mul_ps(sk_aj_SSE,sk_aj_SSE);
2768 sk2_rinv_SSE0 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE0);
2769 sk2_rinv_SSE1 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE1);
2770 sk2_rinv_SSE2 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE2);
2771 sk2_rinv_SSE3 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE3);
2772 prod_SSE0 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
2773 prod_SSE1 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
2774 prod_SSE2 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
2775 prod_SSE3 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
2777 logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
2778 logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
2779 logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
2780 logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
2782 t1_SSE0 = _mm_sub_ps(lij_SSE0,uij_SSE0);
2783 t1_SSE1 = _mm_sub_ps(lij_SSE1,uij_SSE1);
2784 t1_SSE2 = _mm_sub_ps(lij_SSE2,uij_SSE2);
2785 t1_SSE3 = _mm_sub_ps(lij_SSE3,uij_SSE3);
2786 t2_SSE0 = _mm_mul_ps(diff2_SSE0,
2787 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
2788 prod_SSE0));
2789 t2_SSE1 = _mm_mul_ps(diff2_SSE1,
2790 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
2791 prod_SSE1));
2792 t2_SSE2 = _mm_mul_ps(diff2_SSE2,
2793 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
2794 prod_SSE2));
2795 t2_SSE3 = _mm_mul_ps(diff2_SSE3,
2796 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
2797 prod_SSE3));
2799 t3_SSE0 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
2800 t3_SSE1 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
2801 t3_SSE2 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
2802 t3_SSE3 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
2803 t1_SSE0 = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
2804 t1_SSE1 = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
2805 t1_SSE2 = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
2806 t1_SSE3 = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
2807 t4_SSE0 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE0,lij_SSE0));
2808 t4_SSE1 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE1,lij_SSE1));
2809 t4_SSE2 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE2,lij_SSE2));
2810 t4_SSE3 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE3,lij_SSE3));
2811 t4_SSE0 = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
2812 t4_SSE1 = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
2813 t4_SSE2 = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
2814 t4_SSE3 = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
2815 t1_SSE0 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
2816 t1_SSE1 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
2817 t1_SSE2 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
2818 t1_SSE3 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
2820 sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
2821 sum_ai_SSE1 = _mm_add_ps(sum_ai_SSE1,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
2822 sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
2823 sum_ai_SSE3 = _mm_add_ps(sum_ai_SSE3,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2825 t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
2826 _mm_mul_ps(prod_SSE0,lij3_SSE0));
2827 t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
2828 _mm_mul_ps(prod_SSE1,lij3_SSE1));
2829 t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
2830 _mm_mul_ps(prod_SSE2,lij3_SSE2));
2831 t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
2832 _mm_mul_ps(prod_SSE3,lij3_SSE3));
2833 t1_SSE0 = _mm_sub_ps(t1_SSE0,
2834 _mm_mul_ps(onefourth_SSE,
2835 _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
2836 _mm_mul_ps(lij3_SSE0,dr_SSE0))));
2837 t1_SSE1 = _mm_sub_ps(t1_SSE1,
2838 _mm_mul_ps(onefourth_SSE,
2839 _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
2840 _mm_mul_ps(lij3_SSE1,dr_SSE1))));
2841 t1_SSE2 = _mm_sub_ps(t1_SSE2,
2842 _mm_mul_ps(onefourth_SSE,
2843 _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
2844 _mm_mul_ps(lij3_SSE2,dr_SSE2))));
2845 t1_SSE3 = _mm_sub_ps(t1_SSE3,
2846 _mm_mul_ps(onefourth_SSE,
2847 _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
2848 _mm_mul_ps(lij3_SSE3,dr_SSE3))));
2850 t2_SSE0 = _mm_mul_ps(onefourth_SSE,
2851 _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
2852 _mm_mul_ps(uij3_SSE0,dr_SSE0)));
2853 t2_SSE1 = _mm_mul_ps(onefourth_SSE,
2854 _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
2855 _mm_mul_ps(uij3_SSE1,dr_SSE1)));
2856 t2_SSE2 = _mm_mul_ps(onefourth_SSE,
2857 _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
2858 _mm_mul_ps(uij3_SSE2,dr_SSE2)));
2859 t2_SSE3 = _mm_mul_ps(onefourth_SSE,
2860 _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
2861 _mm_mul_ps(uij3_SSE3,dr_SSE3)));
2862 t2_SSE0 = _mm_sub_ps(t2_SSE0,
2863 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
2864 _mm_mul_ps(prod_SSE0,uij3_SSE0)));
2865 t2_SSE1 = _mm_sub_ps(t2_SSE1,
2866 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
2867 _mm_mul_ps(prod_SSE1,uij3_SSE1)));
2868 t2_SSE2 = _mm_sub_ps(t2_SSE2,
2869 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
2870 _mm_mul_ps(prod_SSE2,uij3_SSE2)));
2871 t2_SSE3 = _mm_sub_ps(t2_SSE3,
2872 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
2873 _mm_mul_ps(prod_SSE3,uij3_SSE3)));
2874 t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
2875 _mm_mul_ps(rinv_SSE0,rinv_SSE0));
2876 t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
2877 _mm_mul_ps(rinv_SSE1,rinv_SSE1));
2878 t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
2879 _mm_mul_ps(rinv_SSE2,rinv_SSE2));
2880 t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
2881 _mm_mul_ps(rinv_SSE3,rinv_SSE3));
2882 t3_SSE0 = _mm_sub_ps(t3_SSE0,
2883 _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
2884 _mm_add_ps(one_SSE,
2885 _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
2886 t3_SSE1 = _mm_sub_ps(t3_SSE1,
2887 _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
2888 _mm_add_ps(one_SSE,
2889 _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
2890 t3_SSE2 = _mm_sub_ps(t3_SSE2,
2891 _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
2892 _mm_add_ps(one_SSE,
2893 _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
2894 t3_SSE3 = _mm_sub_ps(t3_SSE3,
2895 _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
2896 _mm_add_ps(one_SSE,
2897 _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
2899 t1_SSE0 = _mm_mul_ps(rinv_SSE0,
2900 _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
2901 _mm_add_ps(t2_SSE0,t3_SSE0)));
2902 t1_SSE1 = _mm_mul_ps(rinv_SSE1,
2903 _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
2904 _mm_add_ps(t2_SSE1,t3_SSE1)));
2905 t1_SSE2 = _mm_mul_ps(rinv_SSE2,
2906 _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
2907 _mm_add_ps(t2_SSE2,t3_SSE2)));
2908 t1_SSE3 = _mm_mul_ps(rinv_SSE3,
2909 _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
2910 _mm_add_ps(t2_SSE3,t3_SSE3)));
2912 _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
2913 dadx += 4;
2914 _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
2915 dadx += 4;
2916 _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
2917 dadx += 4;
2918 _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2919 dadx += 4;
2921 /* Evaluate influence of atom ai -> aj */
2922 t1_SSE0 = _mm_add_ps(dr_SSE0,sk_ai_SSE0);
2923 t1_SSE1 = _mm_add_ps(dr_SSE1,sk_ai_SSE1);
2924 t1_SSE2 = _mm_add_ps(dr_SSE2,sk_ai_SSE2);
2925 t1_SSE3 = _mm_add_ps(dr_SSE3,sk_ai_SSE3);
2926 t2_SSE0 = _mm_sub_ps(dr_SSE0,sk_ai_SSE0);
2927 t2_SSE1 = _mm_sub_ps(dr_SSE1,sk_ai_SSE1);
2928 t2_SSE2 = _mm_sub_ps(dr_SSE2,sk_ai_SSE2);
2929 t2_SSE3 = _mm_sub_ps(dr_SSE3,sk_ai_SSE3);
2930 t3_SSE0 = _mm_sub_ps(sk_ai_SSE0,dr_SSE0);
2931 t3_SSE1 = _mm_sub_ps(sk_ai_SSE1,dr_SSE1);
2932 t3_SSE2 = _mm_sub_ps(sk_ai_SSE2,dr_SSE2);
2933 t3_SSE3 = _mm_sub_ps(sk_ai_SSE3,dr_SSE3);
2935 obc_mask1_SSE0 = _mm_cmplt_ps(raj_SSE, t1_SSE0);
2936 obc_mask1_SSE1 = _mm_cmplt_ps(raj_SSE, t1_SSE1);
2937 obc_mask1_SSE2 = _mm_cmplt_ps(raj_SSE, t1_SSE2);
2938 obc_mask1_SSE3 = _mm_cmplt_ps(raj_SSE, t1_SSE3);
2939 obc_mask2_SSE0 = _mm_cmplt_ps(raj_SSE, t2_SSE0);
2940 obc_mask2_SSE1 = _mm_cmplt_ps(raj_SSE, t2_SSE1);
2941 obc_mask2_SSE2 = _mm_cmplt_ps(raj_SSE, t2_SSE2);
2942 obc_mask2_SSE3 = _mm_cmplt_ps(raj_SSE, t2_SSE3);
2943 obc_mask3_SSE0 = _mm_cmplt_ps(raj_SSE, t3_SSE0);
2944 obc_mask3_SSE1 = _mm_cmplt_ps(raj_SSE, t3_SSE1);
2945 obc_mask3_SSE2 = _mm_cmplt_ps(raj_SSE, t3_SSE2);
2946 obc_mask3_SSE3 = _mm_cmplt_ps(raj_SSE, t3_SSE3);
2947 obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0,jmask_SSE0);
2948 obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1,jmask_SSE1);
2949 obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2,jmask_SSE2);
2950 obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3,jmask_SSE3);
2952 uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
2953 uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
2954 uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
2955 uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
2956 lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
2957 _mm_andnot_ps(obc_mask2_SSE0,raj_inv_SSE));
2958 lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
2959 _mm_andnot_ps(obc_mask2_SSE1,raj_inv_SSE));
2960 lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
2961 _mm_andnot_ps(obc_mask2_SSE2,raj_inv_SSE));
2962 lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
2963 _mm_andnot_ps(obc_mask2_SSE3,raj_inv_SSE));
2964 dlij_SSE0 = _mm_and_ps(one_SSE,obc_mask2_SSE0);
2965 dlij_SSE1 = _mm_and_ps(one_SSE,obc_mask2_SSE1);
2966 dlij_SSE2 = _mm_and_ps(one_SSE,obc_mask2_SSE2);
2967 dlij_SSE3 = _mm_and_ps(one_SSE,obc_mask2_SSE3);
2969 uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
2970 uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
2971 uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
2972 uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
2973 uij3_SSE0 = _mm_mul_ps(uij2_SSE0,uij_SSE0);
2974 uij3_SSE1 = _mm_mul_ps(uij2_SSE1,uij_SSE1);
2975 uij3_SSE2 = _mm_mul_ps(uij2_SSE2,uij_SSE2);
2976 uij3_SSE3 = _mm_mul_ps(uij2_SSE3,uij_SSE3);
2977 lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
2978 lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
2979 lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
2980 lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
2981 lij3_SSE0 = _mm_mul_ps(lij2_SSE0,lij_SSE0);
2982 lij3_SSE1 = _mm_mul_ps(lij2_SSE1,lij_SSE1);
2983 lij3_SSE2 = _mm_mul_ps(lij2_SSE2,lij_SSE2);
2984 lij3_SSE3 = _mm_mul_ps(lij2_SSE3,lij_SSE3);
2986 diff2_SSE0 = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
2987 diff2_SSE1 = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
2988 diff2_SSE2 = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
2989 diff2_SSE3 = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
2990 lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
2991 lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
2992 lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
2993 lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
2994 sk2_rinv_SSE0 = _mm_mul_ps(sk2_ai_SSE0,rinv_SSE0);
2995 sk2_rinv_SSE1 = _mm_mul_ps(sk2_ai_SSE1,rinv_SSE1);
2996 sk2_rinv_SSE2 = _mm_mul_ps(sk2_ai_SSE2,rinv_SSE2);
2997 sk2_rinv_SSE3 = _mm_mul_ps(sk2_ai_SSE3,rinv_SSE3);
2998 prod_SSE0 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
2999 prod_SSE1 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
3000 prod_SSE2 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
3001 prod_SSE3 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
3003 logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
3004 logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
3005 logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
3006 logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
3007 t1_SSE0 = _mm_sub_ps(lij_SSE0,uij_SSE0);
3008 t1_SSE1 = _mm_sub_ps(lij_SSE1,uij_SSE1);
3009 t1_SSE2 = _mm_sub_ps(lij_SSE2,uij_SSE2);
3010 t1_SSE3 = _mm_sub_ps(lij_SSE3,uij_SSE3);
3011 t2_SSE0 = _mm_mul_ps(diff2_SSE0,
3012 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
3013 prod_SSE0));
3014 t2_SSE1 = _mm_mul_ps(diff2_SSE1,
3015 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
3016 prod_SSE1));
3017 t2_SSE2 = _mm_mul_ps(diff2_SSE2,
3018 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
3019 prod_SSE2));
3020 t2_SSE3 = _mm_mul_ps(diff2_SSE3,
3021 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
3022 prod_SSE3));
3023 t3_SSE0 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
3024 t3_SSE1 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
3025 t3_SSE2 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
3026 t3_SSE3 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
3027 t1_SSE0 = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
3028 t1_SSE1 = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
3029 t1_SSE2 = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
3030 t1_SSE3 = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
3031 t4_SSE0 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE0));
3032 t4_SSE1 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE1));
3033 t4_SSE2 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE2));
3034 t4_SSE3 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE3));
3035 t4_SSE0 = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
3036 t4_SSE1 = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
3037 t4_SSE2 = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
3038 t4_SSE3 = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
3039 t1_SSE0 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
3040 t1_SSE1 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
3041 t1_SSE2 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
3042 t1_SSE3 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
3044 _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
3045 gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0,obc_mask1_SSE0),
3046 _mm_and_ps(t1_SSE1,obc_mask1_SSE1),
3047 _mm_and_ps(t1_SSE2,obc_mask1_SSE2),
3048 _mm_and_ps(t1_SSE3,obc_mask1_SSE3))));
            t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
                                 _mm_mul_ps(prod_SSE0,lij3_SSE0));
            t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
                                 _mm_mul_ps(prod_SSE1,lij3_SSE1));
            t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
                                 _mm_mul_ps(prod_SSE2,lij3_SSE2));
            t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
                                 _mm_mul_ps(prod_SSE3,lij3_SSE3));
            t1_SSE0 = _mm_sub_ps(t1_SSE0,
                                 _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
                                                       _mm_mul_ps(lij3_SSE0,dr_SSE0))));
            t1_SSE1 = _mm_sub_ps(t1_SSE1,
                                 _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
                                                       _mm_mul_ps(lij3_SSE1,dr_SSE1))));
            t1_SSE2 = _mm_sub_ps(t1_SSE2,
                                 _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
                                                       _mm_mul_ps(lij3_SSE2,dr_SSE2))));
            t1_SSE3 = _mm_sub_ps(t1_SSE3,
                                 _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
                                                       _mm_mul_ps(lij3_SSE3,dr_SSE3))));
            t2_SSE0 = _mm_mul_ps(onefourth_SSE,
                                 _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
                                            _mm_mul_ps(uij3_SSE0,dr_SSE0)));
            t2_SSE1 = _mm_mul_ps(onefourth_SSE,
                                 _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
                                            _mm_mul_ps(uij3_SSE1,dr_SSE1)));
            t2_SSE2 = _mm_mul_ps(onefourth_SSE,
                                 _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
                                            _mm_mul_ps(uij3_SSE2,dr_SSE2)));
            t2_SSE3 = _mm_mul_ps(onefourth_SSE,
                                 _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
                                            _mm_mul_ps(uij3_SSE3,dr_SSE3)));
            t2_SSE0 = _mm_sub_ps(t2_SSE0,
                                 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
                                            _mm_mul_ps(prod_SSE0,uij3_SSE0)));
            t2_SSE1 = _mm_sub_ps(t2_SSE1,
                                 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
                                            _mm_mul_ps(prod_SSE1,uij3_SSE1)));
            t2_SSE2 = _mm_sub_ps(t2_SSE2,
                                 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
                                            _mm_mul_ps(prod_SSE2,uij3_SSE2)));
            t2_SSE3 = _mm_sub_ps(t2_SSE3,
                                 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
                                            _mm_mul_ps(prod_SSE3,uij3_SSE3)));
            t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
                                 _mm_mul_ps(rinv_SSE0,rinv_SSE0));
            t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
                                 _mm_mul_ps(rinv_SSE1,rinv_SSE1));
            t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
                                 _mm_mul_ps(rinv_SSE2,rinv_SSE2));
            t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
                                 _mm_mul_ps(rinv_SSE3,rinv_SSE3));

            t3_SSE0 = _mm_sub_ps(t3_SSE0,
                                 _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
                                            _mm_add_ps(one_SSE,
                                                       _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
            t3_SSE1 = _mm_sub_ps(t3_SSE1,
                                 _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
                                            _mm_add_ps(one_SSE,
                                                       _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
            t3_SSE2 = _mm_sub_ps(t3_SSE2,
                                 _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
                                            _mm_add_ps(one_SSE,
                                                       _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
            t3_SSE3 = _mm_sub_ps(t3_SSE3,
                                 _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
                                            _mm_add_ps(one_SSE,
                                                       _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
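            /* Assemble d(integral)/dr = rinv*(dlij*t1 + t2 + t3). dlij zeroes
             * the lij-dependent part when the lower integration limit is
             * pinned at 1/rai and therefore does not vary with r.
             */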
            t1_SSE0 = _mm_mul_ps(rinv_SSE0,
                                 _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
                                            _mm_add_ps(t2_SSE0,t3_SSE0)));
            t1_SSE1 = _mm_mul_ps(rinv_SSE1,
                                 _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
                                            _mm_add_ps(t2_SSE1,t3_SSE1)));
            t1_SSE2 = _mm_mul_ps(rinv_SSE2,
                                 _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
                                            _mm_add_ps(t2_SSE2,t3_SSE2)));
            t1_SSE3 = _mm_mul_ps(rinv_SSE3,
                                 _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
                                            _mm_add_ps(t2_SSE3,t3_SSE3)));

            _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
            dadx += 4;
            _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
            dadx += 4;
            _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
            dadx += 4;
            _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
            dadx += 4;
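            /* dadx is written strictly sequentially; the chain-rule kernel
             * below streams it back in the same order, so excluded or
             * out-of-range pairs simply carry a zero derivative there instead
             * of needing masks.
             */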
        }

        _MM_TRANSPOSE4_PS(sum_ai_SSE0,sum_ai_SSE1,sum_ai_SSE2,sum_ai_SSE3);
        sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0,sum_ai_SSE1);
        sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2,sum_ai_SSE3);
        sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0,sum_ai_SSE2);
        _mm_store_ps(work+i, _mm_add_ps(sum_ai_SSE0, _mm_load_ps(work+i)));
    }
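    /* The work array holds two images of the system (length 2*natoms) so the
     * wrapped j>=natoms stores need no modulo arithmetic. The half-shell
     * offset never exceeds natoms/2, so only the first natoms/2+1 wrapped
     * entries can be non-zero; fold them back onto the first image.
     */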
    for(i=0;i<natoms/2+1;i++)
    {
        work[i] += work[natoms+i];
    }

    /* Parallel summations */
    if(PARTDECOMP(cr))
    {
        gmx_sum(natoms,work,cr);
    }
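    /* HCT effective radii: with the descreening sum I_i accumulated in
     * work[i], the scalar form implemented below is
     *
     *     1/R_i = 1/(rho_i - doffset) - I_i
     *
     * clamped from below at the input radius rho_i (= min_rad).
     */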
    if(gb_algorithm==egbHCT)
    {
        /* HCT */
        for(i=0;i<natoms;i++)
        {
            if(born->use[i] != 0)
            {
                rai     = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset;
                sum_ai  = 1.0/rai - work[i];
                min_rad = rai + born->gb_doffset;
                rad     = 1.0/sum_ai;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }
    }
    else
    {
        /* OBC */

        /* Calculate the radii */
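        /* With psi_i = (rho_i - doffset)*I_i, the OBC expression computed
         * below is
         *
         *     1/R_i = 1/(rho_i - doffset)
         *             - tanh(alpha*psi_i - beta*psi_i^2 + gamma*psi_i^3)/rho_i
         *
         * drobc[i] keeps the chain factor needed by the force kernel:
         * (1 - tanh^2) * (alpha - 2*beta*psi_i + 3*gamma*psi_i^2)
         *             * (rho_i - doffset)/rho_i.
         */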
        for(i=0;i<natoms;i++)
        {
            if(born->use[i] != 0)
            {
                rai      = top->atomtypes.gb_radius[mdatoms->typeA[i]];
                rai_inv2 = 1.0/rai;
                rai      = rai-born->gb_doffset;
                rai_inv  = 1.0/rai;
                sum_ai   = rai * work[i];
                sum_ai2  = sum_ai  * sum_ai;
                sum_ai3  = sum_ai2 * sum_ai;

                tsum          = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
                born->bRad[i] = rai_inv - tsum*rai_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
            }
        }
    }

    return 0;
}
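
/* Chain-rule part: combine the stored d(Born radius)/dr stream (fr->dadx)
 * with dV/d(Born radius) (fr->dvda) to obtain the GB contribution to the
 * forces, using the same all-vs-all half-shell layout as the radii kernel.
 */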
int
genborn_allvsall_calc_chainrule_sse2_single(t_forcerec *     fr,
                                            t_mdatoms *      mdatoms,
                                            gmx_genborn_t *  born,
                                            real *           x,
                                            real *           f,
                                            int              gb_algorithm,
                                            void *           paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    int        natoms;
    int        ni0,ni1;
    int        nj0,nj3;
    int        i,j,idx;
    int *      jindex;
    real       rbi;
    real *     rb;
    real *     dadx;
    real *     x_align;
    real *     y_align;
    real *     z_align;
    real *     fx_align;
    real *     fy_align;
    real *     fz_align;

    __m128     ix_SSE0,iy_SSE0,iz_SSE0;
    __m128     ix_SSE1,iy_SSE1,iz_SSE1;
    __m128     ix_SSE2,iy_SSE2,iz_SSE2;
    __m128     ix_SSE3,iy_SSE3,iz_SSE3;
    __m128     fix_SSE0,fiy_SSE0,fiz_SSE0;
    __m128     fix_SSE1,fiy_SSE1,fiz_SSE1;
    __m128     fix_SSE2,fiy_SSE2,fiz_SSE2;
    __m128     fix_SSE3,fiy_SSE3,fiz_SSE3;
    __m128     rbai_SSE0,rbai_SSE1,rbai_SSE2,rbai_SSE3;
    __m128     jx_SSE,jy_SSE,jz_SSE,rbaj_SSE;
    __m128     dx_SSE0,dy_SSE0,dz_SSE0;
    __m128     dx_SSE1,dy_SSE1,dz_SSE1;
    __m128     dx_SSE2,dy_SSE2,dz_SSE2;
    __m128     dx_SSE3,dy_SSE3,dz_SSE3;
    __m128     fgb_SSE0,fgb_ai_SSE0;
    __m128     fgb_SSE1,fgb_ai_SSE1;
    __m128     fgb_SSE2,fgb_ai_SSE2;
    __m128     fgb_SSE3,fgb_ai_SSE3;
    __m128     tx_SSE0,ty_SSE0,tz_SSE0;
    __m128     tx_SSE1,ty_SSE1,tz_SSE1;
    __m128     tx_SSE2,ty_SSE2,tz_SSE2;
    __m128     tx_SSE3,ty_SSE3,tz_SSE3;
    natoms = mdatoms->nr;
    ni0    = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
    ni1    = mdatoms->start+mdatoms->homenr;

    aadata = (gmx_allvsallgb2_data_t *)paadata;

    x_align  = aadata->x_align;
    y_align  = aadata->y_align;
    z_align  = aadata->z_align;
    fx_align = aadata->fx_align;
    fy_align = aadata->fy_align;
    fz_align = aadata->fz_align;

    jindex = aadata->jindex_gb;
    dadx   = fr->dadx;
    rb     = aadata->work;
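    /* rb[i] converts dV/d(Born radius), accumulated in fr->dvda, into the
     * factor multiplying the stored dadx derivatives. The Still variant
     * carries an extra factor 2/ONE_4PI_EPS0 from its radius definition,
     * and OBC additionally needs the tanh chain factor drobc computed
     * together with the radii.
     */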
    /* Loop to get the proper form for the Born radius term */
    if(gb_algorithm==egbSTILL)
    {
        for(i=0;i<natoms;i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if(gb_algorithm==egbHCT)
    {
        for(i=0;i<natoms;i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * fr->dvda[i];
        }
    }
    else if(gb_algorithm==egbOBC)
    {
        for(idx=0;idx<natoms;idx++)
        {
            rbi     = born->bRad[idx];
            rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
        }
    }
    for(i=0;i<2*natoms;i++)
    {
        fx_align[i] = 0;
        fy_align[i] = 0;
        fz_align[i] = 0;
    }

    for(i=0;i<natoms;i++)
    {
        rb[i+natoms]=rb[i];
    }
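    /* Coordinates, forces and rb live in arrays of length 2*natoms: the
     * second half mirrors the first, so the wrapped half-shell j indices
     * (up to natoms+natoms/2) can be used without modulo arithmetic.
     */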
    for(i=ni0; i<ni1; i+=UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0 = _mm_load1_ps(x_align+i);
        iy_SSE0 = _mm_load1_ps(y_align+i);
        iz_SSE0 = _mm_load1_ps(z_align+i);
        ix_SSE1 = _mm_load1_ps(x_align+i+1);
        iy_SSE1 = _mm_load1_ps(y_align+i+1);
        iz_SSE1 = _mm_load1_ps(z_align+i+1);
        ix_SSE2 = _mm_load1_ps(x_align+i+2);
        iy_SSE2 = _mm_load1_ps(y_align+i+2);
        iz_SSE2 = _mm_load1_ps(z_align+i+2);
        ix_SSE3 = _mm_load1_ps(x_align+i+3);
        iy_SSE3 = _mm_load1_ps(y_align+i+3);
        iz_SSE3 = _mm_load1_ps(z_align+i+3);

        fix_SSE0 = _mm_setzero_ps();
        fiy_SSE0 = _mm_setzero_ps();
        fiz_SSE0 = _mm_setzero_ps();
        fix_SSE1 = _mm_setzero_ps();
        fiy_SSE1 = _mm_setzero_ps();
        fiz_SSE1 = _mm_setzero_ps();
        fix_SSE2 = _mm_setzero_ps();
        fiy_SSE2 = _mm_setzero_ps();
        fiz_SSE2 = _mm_setzero_ps();
        fix_SSE3 = _mm_setzero_ps();
        fiy_SSE3 = _mm_setzero_ps();
        fiz_SSE3 = _mm_setzero_ps();

        rbai_SSE0 = _mm_load1_ps(rb+i);
        rbai_SSE1 = _mm_load1_ps(rb+i+1);
        rbai_SSE2 = _mm_load1_ps(rb+i+2);
        rbai_SSE3 = _mm_load1_ps(rb+i+3);

        /* Load limits for loop over neighbors */
        nj0 = jindex[4*i];
        nj3 = jindex[4*i+3];
        /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */
        for(j=nj0; j<nj3; j+=UNROLLJ)
        {
            /* load j atom coordinates */
            jx_SSE = _mm_load_ps(x_align+j);
            jy_SSE = _mm_load_ps(y_align+j);
            jz_SSE = _mm_load_ps(z_align+j);
            /* Calculate distance */
            dx_SSE0 = _mm_sub_ps(ix_SSE0,jx_SSE);
            dy_SSE0 = _mm_sub_ps(iy_SSE0,jy_SSE);
            dz_SSE0 = _mm_sub_ps(iz_SSE0,jz_SSE);
            dx_SSE1 = _mm_sub_ps(ix_SSE1,jx_SSE);
            dy_SSE1 = _mm_sub_ps(iy_SSE1,jy_SSE);
            dz_SSE1 = _mm_sub_ps(iz_SSE1,jz_SSE);
            dx_SSE2 = _mm_sub_ps(ix_SSE2,jx_SSE);
            dy_SSE2 = _mm_sub_ps(iy_SSE2,jy_SSE);
            dz_SSE2 = _mm_sub_ps(iz_SSE2,jz_SSE);
            dx_SSE3 = _mm_sub_ps(ix_SSE3,jx_SSE);
            dy_SSE3 = _mm_sub_ps(iy_SSE3,jy_SSE);
            dz_SSE3 = _mm_sub_ps(iz_SSE3,jz_SSE);

            rbaj_SSE = _mm_load_ps(rb+j);
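            /* dadx is consumed in the same order the radii kernel stored it:
             * the first four rows couple to rb[i..i+3], the next four rows
             * to rb[j..j+3].
             */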
            fgb_SSE0 = _mm_mul_ps(rbai_SSE0,_mm_load_ps(dadx));
            dadx += 4;
            fgb_SSE1 = _mm_mul_ps(rbai_SSE1,_mm_load_ps(dadx));
            dadx += 4;
            fgb_SSE2 = _mm_mul_ps(rbai_SSE2,_mm_load_ps(dadx));
            dadx += 4;
            fgb_SSE3 = _mm_mul_ps(rbai_SSE3,_mm_load_ps(dadx));
            dadx += 4;

            fgb_ai_SSE0 = _mm_mul_ps(rbaj_SSE,_mm_load_ps(dadx));
            dadx += 4;
            fgb_ai_SSE1 = _mm_mul_ps(rbaj_SSE,_mm_load_ps(dadx));
            dadx += 4;
            fgb_ai_SSE2 = _mm_mul_ps(rbaj_SSE,_mm_load_ps(dadx));
            dadx += 4;
            fgb_ai_SSE3 = _mm_mul_ps(rbaj_SSE,_mm_load_ps(dadx));
            dadx += 4;
            /* Total force between ai and aj is the sum of ai->aj and aj->ai */
            fgb_SSE0 = _mm_add_ps(fgb_SSE0,fgb_ai_SSE0);
            fgb_SSE1 = _mm_add_ps(fgb_SSE1,fgb_ai_SSE1);
            fgb_SSE2 = _mm_add_ps(fgb_SSE2,fgb_ai_SSE2);
            fgb_SSE3 = _mm_add_ps(fgb_SSE3,fgb_ai_SSE3);
            /* Calculate temporary vectorial force */
            tx_SSE0 = _mm_mul_ps(fgb_SSE0,dx_SSE0);
            ty_SSE0 = _mm_mul_ps(fgb_SSE0,dy_SSE0);
            tz_SSE0 = _mm_mul_ps(fgb_SSE0,dz_SSE0);
            tx_SSE1 = _mm_mul_ps(fgb_SSE1,dx_SSE1);
            ty_SSE1 = _mm_mul_ps(fgb_SSE1,dy_SSE1);
            tz_SSE1 = _mm_mul_ps(fgb_SSE1,dz_SSE1);
            tx_SSE2 = _mm_mul_ps(fgb_SSE2,dx_SSE2);
            ty_SSE2 = _mm_mul_ps(fgb_SSE2,dy_SSE2);
            tz_SSE2 = _mm_mul_ps(fgb_SSE2,dz_SSE2);
            tx_SSE3 = _mm_mul_ps(fgb_SSE3,dx_SSE3);
            ty_SSE3 = _mm_mul_ps(fgb_SSE3,dy_SSE3);
            tz_SSE3 = _mm_mul_ps(fgb_SSE3,dz_SSE3);
            /* Increment i atom force */
            fix_SSE0 = _mm_add_ps(fix_SSE0,tx_SSE0);
            fiy_SSE0 = _mm_add_ps(fiy_SSE0,ty_SSE0);
            fiz_SSE0 = _mm_add_ps(fiz_SSE0,tz_SSE0);
            fix_SSE1 = _mm_add_ps(fix_SSE1,tx_SSE1);
            fiy_SSE1 = _mm_add_ps(fiy_SSE1,ty_SSE1);
            fiz_SSE1 = _mm_add_ps(fiz_SSE1,tz_SSE1);
            fix_SSE2 = _mm_add_ps(fix_SSE2,tx_SSE2);
            fiy_SSE2 = _mm_add_ps(fiy_SSE2,ty_SSE2);
            fiz_SSE2 = _mm_add_ps(fiz_SSE2,tz_SSE2);
            fix_SSE3 = _mm_add_ps(fix_SSE3,tx_SSE3);
            fiy_SSE3 = _mm_add_ps(fiy_SSE3,ty_SSE3);
            fiz_SSE3 = _mm_add_ps(fiz_SSE3,tz_SSE3);
            /* Decrement j atom force */
            _mm_store_ps(fx_align+j,
                         _mm_sub_ps( _mm_load_ps(fx_align+j), gmx_mm_sum4_ps(tx_SSE0,tx_SSE1,tx_SSE2,tx_SSE3) ));
            _mm_store_ps(fy_align+j,
                         _mm_sub_ps( _mm_load_ps(fy_align+j), gmx_mm_sum4_ps(ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3) ));
            _mm_store_ps(fz_align+j,
                         _mm_sub_ps( _mm_load_ps(fz_align+j), gmx_mm_sum4_ps(tz_SSE0,tz_SSE1,tz_SSE2,tz_SSE3) ));
        }
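        /* _MM_TRANSPOSE4_PS turns the four per-i accumulators into four
         * vectors whose lane-wise sum yields the x (resp. y, z) force on
         * each of the four i atoms in one register, ready for a single
         * aligned store.
         */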
        /* Add i forces to mem and shifted force list */
        _MM_TRANSPOSE4_PS(fix_SSE0,fix_SSE1,fix_SSE2,fix_SSE3);
        fix_SSE0 = _mm_add_ps(fix_SSE0,fix_SSE1);
        fix_SSE2 = _mm_add_ps(fix_SSE2,fix_SSE3);
        fix_SSE0 = _mm_add_ps(fix_SSE0,fix_SSE2);
        _mm_store_ps(fx_align+i, _mm_add_ps(fix_SSE0, _mm_load_ps(fx_align+i)));

        _MM_TRANSPOSE4_PS(fiy_SSE0,fiy_SSE1,fiy_SSE2,fiy_SSE3);
        fiy_SSE0 = _mm_add_ps(fiy_SSE0,fiy_SSE1);
        fiy_SSE2 = _mm_add_ps(fiy_SSE2,fiy_SSE3);
        fiy_SSE0 = _mm_add_ps(fiy_SSE0,fiy_SSE2);
        _mm_store_ps(fy_align+i, _mm_add_ps(fiy_SSE0, _mm_load_ps(fy_align+i)));

        _MM_TRANSPOSE4_PS(fiz_SSE0,fiz_SSE1,fiz_SSE2,fiz_SSE3);
        fiz_SSE0 = _mm_add_ps(fiz_SSE0,fiz_SSE1);
        fiz_SSE2 = _mm_add_ps(fiz_SSE2,fiz_SSE3);
        fiz_SSE0 = _mm_add_ps(fiz_SSE0,fiz_SSE2);
        _mm_store_ps(fz_align+i, _mm_add_ps(fiz_SSE0, _mm_load_ps(fz_align+i)));
    }
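    /* Fold the two images of the aligned force arrays back into the real
     * force array: entry i and its wrapped copy at natoms+i both belong to
     * atom i.
     */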
    for(i=0;i<natoms;i++)
    {
        f[3*i]   += fx_align[i] + fx_align[natoms+i];
        f[3*i+1] += fy_align[i] + fy_align[natoms+i];
        f[3*i+2] += fz_align[i] + fz_align[natoms+i];
    }

    return 0;
}
#else
/* dummy variable when not using SSE */
int genborn_allvsall_sse2_single_dummy;

#endif