Fixed synchronization counter update for pwr6 kernels
[gromacs.git] / src / gmxlib / nonbonded / nb_kernel_power6 / pwr6kernel420.F
blob33f751ca6952d071c213019a85b74cbcb0c7a0a4
2 C                This source code is part of
4 C                 G   R   O   M   A   C   S
6 C Copyright (c) 1991-2000, University of Groningen, The Netherlands.
7 C Copyright (c) 2001-2009, The GROMACS Development Team
9 C Gromacs is a library for molecular simulation and trajectory analysis,
10 C written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
11 C a full list of developers and information, check out http://www.gromacs.org
13 C This program is free software; you can redistribute it and/or modify it under
14 C the terms of the GNU Lesser General Public License as published by the Free
15 C Software Foundation; either version 2 of the License, or (at your option) any
16 C later version.
17 C As a special exception, you may use this file as part of a free software
18 C library without restriction.  Specifically, if other files instantiate
19 C templates or use macros or inline functions from this file, or you compile
20 C this file and link it with other files to produce an executable, this
21 C file does not by itself cause the resulting executable to be covered by
22 C the GNU Lesser General Public License.
24 C In plain-speak: do not worry about classes/macros/templates either - only
25 C changes to the library have to be LGPL, not an application linking with it.
27 C To help fund GROMACS development, we humbly ask that you cite
28 C the papers people have written on it - you can find them on the website!
31 #ifdef HAVE_CONFIG_H
32 #  include<config.h>
33 #endif
35 #ifdef GMX_DOUBLE
36 #  define gmxreal real*8
37 #else
38 #  define gmxreal real*4
39 #endif
44 C Gromacs nonbonded kernel pwr6kernel420
45 C Coulomb interaction:     Generalized-Born
46 C VdW interaction:         Buckingham
47 C water optimization:      No
48 C Calculate forces:        yes
50       subroutine pwr6kernel420(
51      &                          nri,
52      &                          iinr,
53      &                          jindex,
54      &                          jjnr,
55      &                          shift,
56      &                          shiftvec,
57      &                          fshift,
58      &                          gid,
59      &                          pos,
60      &                          faction,
61      &                          charge,
62      &                          facel,
63      &                          krf,
64      &                          crf,
65      &                          Vc,
66      &                          type,
67      &                          ntype,
68      &                          vdwparam,
69      &                          Vvdw,
70      &                          tabscale,
71      &                          VFtab,
72      &                          invsqrta,
73      &                          dvda,
74      &                          gbtabscale,
75      &                          GBtab,
76      &                          nthreads,
77      &                          count,
78      &                          mtx,
79      &                          outeriter,
80      &                          inneriter,
81      &                          work)
82       implicit      none
83       integer*4     nri,iinr(*),jindex(*),jjnr(*),shift(*)
84       gmxreal       shiftvec(*),fshift(*),pos(*),faction(*)
85       integer*4     gid(*),type(*),ntype
86       gmxreal       charge(*),facel,krf,crf,Vc(*),vdwparam(*)
87       gmxreal       Vvdw(*),tabscale,VFtab(*)
88       gmxreal       invsqrta(*),dvda(*),gbtabscale,GBtab(*)
89       integer*4     nthreads,count,mtx,outeriter,inneriter
90       gmxreal       work(*)
92       integer*4     n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid
93       integer*4     nn0,nn1,nouter,ninner
94       gmxreal       shX,shY,shZ
95       gmxreal       fscal,tx,ty,tz
96       gmxreal       rinvsq
97       gmxreal       iq
98       gmxreal       qq,vcoul,vctot
99       integer*4     nti
100       integer*4     tj
101       gmxreal       rinvsix
102       gmxreal       Vvdw6,Vvdwtot
103       gmxreal       r,rt,eps,eps2
104       integer*4     n0,nnn
105       gmxreal       Y,F,Geps,Heps2,Fp,VV
106       gmxreal       FF
107       gmxreal       fijC
108       gmxreal       isai,isaj,isaprod,gbscale,vgb
109       gmxreal       dvdasum,dvdatmp,dvdaj,fgb
110       gmxreal       Vvdwexp,br
111       gmxreal       ix1,iy1,iz1,fix1,fiy1,fiz1
112       gmxreal       jx1,jy1,jz1
113       gmxreal       dx11,dy11,dz11,rsq11,rinv11
114       gmxreal       c6,cexp1,cexp2
117 C    Reset outer and inner iteration counters
118       nouter           = 0               
119       ninner           = 0               
121 C    Loop over thread workunits
122    10 call pwr6kernelsync(mtx,count,nri,nthreads,nn0,nn1)
123         if(nn1.gt.nri) nn1=nri
125 C      Start outer loop over neighborlists
126         
127         do n=nn0+1,nn1
129 C        Load shift vector for this list
130           is3              = 3*shift(n)+1    
131           shX              = shiftvec(is3)   
132           shY              = shiftvec(is3+1) 
133           shZ              = shiftvec(is3+2) 
135 C        Load limits for loop over neighbors
136           nj0              = jindex(n)+1     
137           nj1              = jindex(n+1)     
139 C        Get outer coordinate index
140           ii               = iinr(n)+1       
141           ii3              = 3*ii-2          
143 C        Load i atom data, add shift vector
144           ix1              = shX + pos(ii3+0)
145           iy1              = shY + pos(ii3+1)
146           iz1              = shZ + pos(ii3+2)
148 C        Load parameters for i atom
149           iq               = facel*charge(ii)
150           isai             = invsqrta(ii)    
151           nti              = 3*ntype*type(ii)
153 C        Zero the potential energy for this list
154           vctot            = 0               
155           Vvdwtot          = 0               
156           dvdasum          = 0               
158 C        Clear i atom forces
159           fix1             = 0               
160           fiy1             = 0               
161           fiz1             = 0               
162           
163           do k=nj0,nj1
165 C          Get j neighbor index, and coordinate index
166             jnr              = jjnr(k)+1       
167             j3               = 3*jnr-2         
169 C          load j atom coordinates
170             jx1              = pos(j3+0)       
171             jy1              = pos(j3+1)       
172             jz1              = pos(j3+2)       
174 C          Calculate distance
175             dx11             = ix1 - jx1       
176             dy11             = iy1 - jy1       
177             dz11             = iz1 - jz1       
178             rsq11            = dx11*dx11+dy11*dy11+dz11*dz11
180 C          Calculate 1/r and 1/r2
182 C          PowerPC intrinsics 1/sqrt lookup table
183 #ifndef GMX_BLUEGENE
184             rinv11           = frsqrtes(rsq11) 
185 #else
186             rinv11           = frsqrte(dble(rsq11)) 
187 #endif
188             rinv11           = (0.5*rinv11*(3.0-((rsq11*rinv11)
189      &  *rinv11)))
190 #ifdef GMX_DOUBLE
191             rinv11           = (0.5*rinv11*(3.0-((rsq11*rinv11)
192      &  *rinv11)))
193 #endif
195 C          Load parameters for j atom
196             isaj             = invsqrta(jnr)   
197             isaprod          = isai*isaj       
198             qq               = iq*charge(jnr)  
199             vcoul            = qq*rinv11       
200             fscal            = vcoul*rinv11    
201             qq               = isaprod*(-qq)   
202             gbscale          = isaprod*gbtabscale
203             tj               = nti+3*type(jnr)+1
204             c6               = vdwparam(tj)    
205             cexp1            = vdwparam(tj+1)  
206             cexp2            = vdwparam(tj+2)  
207             rinvsq           = rinv11*rinv11   
209 C          Tabulated Generalized-Born interaction
210             dvdaj            = dvda(jnr)       
211             r                = rsq11*rinv11    
213 C          Calculate table index
214             rt               = r*gbscale       
215             n0               = rt              
216             eps              = rt-n0           
217             eps2             = eps*eps         
218             nnn              = 4*n0+1          
219             Y                = GBtab(nnn)      
220             F                = GBtab(nnn+1)    
221             Geps             = eps*GBtab(nnn+2)
222             Heps2            = eps2*GBtab(nnn+3)
223             Fp               = F+Geps+Heps2    
224             VV               = Y+eps*Fp        
225             FF               = Fp+Geps+2.0*Heps2
226             vgb              = qq*VV           
227             fijC             = qq*FF*gbscale   
228             dvdatmp          = -0.5*(vgb+fijC*r)
229             dvdasum          = dvdasum + dvdatmp
230             dvda(jnr)        = dvdaj+dvdatmp*isaj*isaj
231             vctot            = vctot + vcoul   
233 C          Buckingham interaction
234             rinvsix          = rinvsq*rinvsq*rinvsq
235             Vvdw6            = c6*rinvsix      
236             br               = cexp2*rsq11*rinv11
237             Vvdwexp          = cexp1*exp(-br)  
238             Vvdwtot          = Vvdwtot+Vvdwexp-Vvdw6
239             fscal            = (br*Vvdwexp-6.0*Vvdw6)*rinvsq
240      &  -(fijC-fscal)*rinv11
242 C          Calculate temporary vectorial force
243             tx               = fscal*dx11      
244             ty               = fscal*dy11      
245             tz               = fscal*dz11      
247 C          Increment i atom force
248             fix1             = fix1 + tx       
249             fiy1             = fiy1 + ty       
250             fiz1             = fiz1 + tz       
252 C          Decrement j atom force
253             faction(j3+0)    = faction(j3+0) - tx
254             faction(j3+1)    = faction(j3+1) - ty
255             faction(j3+2)    = faction(j3+2) - tz
257 C          Inner loop uses 89 flops/iteration
258           end do
259           
261 C        Add i forces to mem and shifted force list
262           faction(ii3+0)   = faction(ii3+0) + fix1
263           faction(ii3+1)   = faction(ii3+1) + fiy1
264           faction(ii3+2)   = faction(ii3+2) + fiz1
265           fshift(is3)      = fshift(is3)+fix1
266           fshift(is3+1)    = fshift(is3+1)+fiy1
267           fshift(is3+2)    = fshift(is3+2)+fiz1
269 C        Add potential energies to the group for this list
270           ggid             = gid(n)+1        
271           Vc(ggid)         = Vc(ggid) + vctot
272           Vvdw(ggid)       = Vvdw(ggid) + Vvdwtot
273           dvda(ii)         = dvda(ii) + dvdasum*isai*isai
275 C        Increment number of inner iterations
276           ninner           = ninner + nj1 - nj0
278 C        Outer loop uses 13 flops/iteration
279         end do
280         
282 C      Increment number of outer iterations
283         nouter           = nouter + nn1 - nn0
284       if(nn1.lt.nri) goto 10
286 C    Write outer/inner iteration count to pointers
287       outeriter        = nouter          
288       inneriter        = ninner          
289       return
290       end
298 C Gromacs nonbonded kernel pwr6kernel420nf
299 C Coulomb interaction:     Generalized-Born
300 C VdW interaction:         Buckingham
301 C water optimization:      No
302 C Calculate forces:        no
304       subroutine pwr6kernel420nf(
305      &                          nri,
306      &                          iinr,
307      &                          jindex,
308      &                          jjnr,
309      &                          shift,
310      &                          shiftvec,
311      &                          fshift,
312      &                          gid,
313      &                          pos,
314      &                          faction,
315      &                          charge,
316      &                          facel,
317      &                          krf,
318      &                          crf,
319      &                          Vc,
320      &                          type,
321      &                          ntype,
322      &                          vdwparam,
323      &                          Vvdw,
324      &                          tabscale,
325      &                          VFtab,
326      &                          invsqrta,
327      &                          dvda,
328      &                          gbtabscale,
329      &                          GBtab,
330      &                          nthreads,
331      &                          count,
332      &                          mtx,
333      &                          outeriter,
334      &                          inneriter,
335      &                          work)
336       implicit      none
337       integer*4     nri,iinr(*),jindex(*),jjnr(*),shift(*)
338       gmxreal       shiftvec(*),fshift(*),pos(*),faction(*)
339       integer*4     gid(*),type(*),ntype
340       gmxreal       charge(*),facel,krf,crf,Vc(*),vdwparam(*)
341       gmxreal       Vvdw(*),tabscale,VFtab(*)
342       gmxreal       invsqrta(*),dvda(*),gbtabscale,GBtab(*)
343       integer*4     nthreads,count,mtx,outeriter,inneriter
344       gmxreal       work(*)
346       integer*4     n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid
347       integer*4     nn0,nn1,nouter,ninner
348       gmxreal       shX,shY,shZ
349       gmxreal       rinvsq
350       gmxreal       iq
351       gmxreal       qq,vcoul,vctot
352       integer*4     nti
353       integer*4     tj
354       gmxreal       rinvsix
355       gmxreal       Vvdw6,Vvdwtot
356       gmxreal       r,rt,eps,eps2
357       integer*4     n0,nnn
358       gmxreal       Y,F,Geps,Heps2,Fp,VV
359       gmxreal       isai,isaj,isaprod,gbscale,vgb
360       gmxreal       Vvdwexp,br
361       gmxreal       ix1,iy1,iz1
362       gmxreal       jx1,jy1,jz1
363       gmxreal       dx11,dy11,dz11,rsq11,rinv11
364       gmxreal       c6,cexp1,cexp2
367 C    Reset outer and inner iteration counters
368       nouter           = 0               
369       ninner           = 0               
371 C    Loop over thread workunits
372    10 call pwr6kernelsync(mtx,count,nri,nthreads,nn0,nn1)
373         if(nn1.gt.nri) nn1=nri
375 C      Start outer loop over neighborlists
376         
377         do n=nn0+1,nn1
379 C        Load shift vector for this list
380           is3              = 3*shift(n)+1    
381           shX              = shiftvec(is3)   
382           shY              = shiftvec(is3+1) 
383           shZ              = shiftvec(is3+2) 
385 C        Load limits for loop over neighbors
386           nj0              = jindex(n)+1     
387           nj1              = jindex(n+1)     
389 C        Get outer coordinate index
390           ii               = iinr(n)+1       
391           ii3              = 3*ii-2          
393 C        Load i atom data, add shift vector
394           ix1              = shX + pos(ii3+0)
395           iy1              = shY + pos(ii3+1)
396           iz1              = shZ + pos(ii3+2)
398 C        Load parameters for i atom
399           iq               = facel*charge(ii)
400           isai             = invsqrta(ii)    
401           nti              = 3*ntype*type(ii)
403 C        Zero the potential energy for this list
404           vctot            = 0               
405           Vvdwtot          = 0               
407 C        Clear i atom forces
408           
409           do k=nj0,nj1
411 C          Get j neighbor index, and coordinate index
412             jnr              = jjnr(k)+1       
413             j3               = 3*jnr-2         
415 C          load j atom coordinates
416             jx1              = pos(j3+0)       
417             jy1              = pos(j3+1)       
418             jz1              = pos(j3+2)       
420 C          Calculate distance
421             dx11             = ix1 - jx1       
422             dy11             = iy1 - jy1       
423             dz11             = iz1 - jz1       
424             rsq11            = dx11*dx11+dy11*dy11+dz11*dz11
426 C          Calculate 1/r and 1/r2
428 C          PowerPC intrinsics 1/sqrt lookup table
429 #ifndef GMX_BLUEGENE
430             rinv11           = frsqrtes(rsq11) 
431 #else
432             rinv11           = frsqrte(dble(rsq11)) 
433 #endif
434             rinv11           = (0.5*rinv11*(3.0-((rsq11*rinv11)
435      &  *rinv11)))
436 #ifdef GMX_DOUBLE
437             rinv11           = (0.5*rinv11*(3.0-((rsq11*rinv11)
438      &  *rinv11)))
439 #endif
441 C          Load parameters for j atom
442             isaj             = invsqrta(jnr)   
443             isaprod          = isai*isaj       
444             qq               = iq*charge(jnr)  
445             vcoul            = qq*rinv11       
446             qq               = isaprod*(-qq)   
447             gbscale          = isaprod*gbtabscale
448             tj               = nti+3*type(jnr)+1
449             c6               = vdwparam(tj)    
450             cexp1            = vdwparam(tj+1)  
451             cexp2            = vdwparam(tj+2)  
452             rinvsq           = rinv11*rinv11   
454 C          Tabulated Generalized-Born interaction
455             r                = rsq11*rinv11    
457 C          Calculate table index
458             rt               = r*gbscale       
459             n0               = rt              
460             eps              = rt-n0           
461             eps2             = eps*eps         
462             nnn              = 4*n0+1          
463             Y                = GBtab(nnn)      
464             F                = GBtab(nnn+1)    
465             Geps             = eps*GBtab(nnn+2)
466             Heps2            = eps2*GBtab(nnn+3)
467             Fp               = F+Geps+Heps2    
468             VV               = Y+eps*Fp        
469             vgb              = qq*VV           
470             vctot            = vctot + vcoul   
472 C          Buckingham interaction
473             rinvsix          = rinvsq*rinvsq*rinvsq
474             Vvdw6            = c6*rinvsix      
475             br               = cexp2*rsq11*rinv11
476             Vvdwexp          = cexp1*exp(-br)  
477             Vvdwtot          = Vvdwtot+Vvdwexp-Vvdw6
479 C          Inner loop uses 65 flops/iteration
480           end do
481           
483 C        Add i forces to mem and shifted force list
485 C        Add potential energies to the group for this list
486           ggid             = gid(n)+1        
487           Vc(ggid)         = Vc(ggid) + vctot
488           Vvdw(ggid)       = Vvdw(ggid) + Vvdwtot
490 C        Increment number of inner iterations
491           ninner           = ninner + nj1 - nj0
493 C        Outer loop uses 6 flops/iteration
494         end do
495         
497 C      Increment number of outer iterations
498         nouter           = nouter + nn1 - nn0
499       if(nn1.lt.nri) goto 10
501 C    Write outer/inner iteration count to pointers
502       outeriter        = nouter          
503       inneriter        = ninner          
504       return
505       end