src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel410_ia32_sse.s

   1 ##
   2 ##
   3 ## Gromacs 4.0                         Copyright (c) 1991-2003
   4 ## David van der Spoel, Erik Lindahl
   5 ##
   6 ## This program is free software; you can redistribute it and/or
   7 ## modify it under the terms of the GNU General Public License
   8 ## as published by the Free Software Foundation; either version 2
   9 ## of the License, or (at your option) any later version.
  10 ##
  11 ## To help us fund GROMACS development, we humbly ask that you cite
  12 ## the research papers on the package. Check out http://www.gromacs.org
  13 ##
  14 ## And Hey:
  15 ## Gnomes, ROck Monsters And Chili Sauce
  16 ##
  17
  18
  19
  20 .globl nb_kernel410_ia32_sse     ## the entry point is exported with and without a leading underscore
  21 .globl _nb_kernel410_ia32_sse
  22 nb_kernel410_ia32_sse:
  23 _nb_kernel410_ia32_sse:
  24 .set nb410_p_nri, 8               ## argument offsets, used as nb410_xxx(%ebp); p_nri is dereferenced in the prologue
  25 .set nb410_iinr, 12               ## iinr[n]: index ii of the i atom of outer list n
  26 .set nb410_jindex, 16             ## jindex[n]..jindex[n+1]: range of list n inside jjnr[]
  27 .set nb410_jjnr, 20               ## j atom indices
  28 .set nb410_shift, 24              ## shift[n]: shift vector index of list n
  29 .set nb410_shiftvec, 28           ## three floats per shift index
  30 .set nb410_fshift, 32
  31 .set nb410_gid, 36
  32 .set nb410_pos, 40                ## three floats per atom
  33 .set nb410_faction, 44            ## three floats per atom, updated in place
  34 .set nb410_charge, 48             ## one float per atom
  35 .set nb410_p_facel, 52            ## dereferenced in the prologue; scales the i charge
  36 .set nb410_argkrf, 56
  37 .set nb410_argcrf, 60
  38 .set nb410_Vc, 64
  39 .set nb410_type, 68               ## one int per atom
  40 .set nb410_p_ntype, 72            ## dereferenced in the prologue
  41 .set nb410_vdwparam, 76           ## (c6,c12) float pairs, indexed by 2*(ntype*type[i]+type[j])
  42 .set nb410_Vvdw, 80
  43 .set nb410_p_tabscale, 84
  44 .set nb410_VFtab, 88
  45 .set nb410_invsqrta, 92           ## one float per atom
  46 .set nb410_dvda, 96               ## one float per atom, accumulated in place
  47 .set nb410_p_gbtabscale, 100      ## dereferenced in the prologue
  48 .set nb410_GBtab, 104             ## four floats per table point
  49 .set nb410_p_nthreads, 108
  50 .set nb410_count, 112             ## pointer to the work counter advanced with lock cmpxchgl
  51 .set nb410_mtx, 116
  52 .set nb410_outeriter, 120
  53 .set nb410_inneriter, 124
  54 .set nb410_work, 128
  55         ## stack offsets for local variables, used as nb410_xxx(%esp)
  56         ## the prologue rounds %esp down to a 16-byte boundary, so the 16-byte slots below are valid movaps operands
  57 .set nb410_ix, 0
  58 .set nb410_iy, 16
  59 .set nb410_iz, 32
  60 .set nb410_iq, 48
  61 .set nb410_dx, 64
  62 .set nb410_dy, 80
  63 .set nb410_dz, 96
  64 .set nb410_two, 112
  65 .set nb410_six, 128
  66 .set nb410_twelve, 144
  67 .set nb410_gbtsc, 160
  68 .set nb410_qq, 176
  69 .set nb410_c6, 192
  70 .set nb410_c12, 208
  71 .set nb410_fscal, 224
  72 .set nb410_vctot, 240
  73 .set nb410_Vvdwtot, 256
  74 .set nb410_fix, 272
  75 .set nb410_fiy, 288
  76 .set nb410_fiz, 304
  77 .set nb410_half, 320
  78 .set nb410_three, 336
  79 .set nb410_r, 352
  80 .set nb410_isai, 368
  81 .set nb410_isaprod, 384
  82 .set nb410_dvdasum, 400
  83 .set nb410_gbscale, 416
  84 .set nb410_is3, 432               ## 4-byte slots from here on
  85 .set nb410_ii3, 436
  86 .set nb410_ii, 440
  87 .set nb410_ntia, 444
  88 .set nb410_innerjjnr, 448
  89 .set nb410_innerk, 452
  90 .set nb410_n, 456
  91 .set nb410_nn1, 460
  92 .set nb410_jnra, 464
  93 .set nb410_jnrb, 468
  94 .set nb410_jnrc, 472
  95 .set nb410_jnrd, 476
  96 .set nb410_nri, 480
  97 .set nb410_facel, 484
  98 .set nb410_ntype, 488
  99 .set nb410_nouter, 492
 100 .set nb410_ninner, 496
 101 .set nb410_salign, 500            ## number of bytes the prologue subtracted to align %esp
 102         pushl %ebp              ## set up the frame: arguments are read as nb410_xxx(%ebp)
 103         movl %esp,%ebp
 104         pushl %eax              ## save the integer registers used by the kernel
 105         pushl %ebx
 106         pushl %ecx
 107         pushl %edx
 108         pushl %esi
 109         pushl %edi
 110         subl $504,%esp          ## local stack space (offsets 0..503, last slot is nb410_salign)
 111         movl %esp,%eax
 112         andl $0xf,%eax          ## eax = esp mod 16
 113         subl %eax,%esp          ## round esp down to a 16-byte boundary for movaps
 114         movl %eax,nb410_salign(%esp)    ## remember the adjustment
 115
 116         emms                    ## empty the MMX state; MMX registers are used as scratch in the inner loops
 117
 118         ## Copy the scalars that are passed by reference into local stack slots
 119         movl nb410_p_nri(%ebp),%ecx
 120         movl nb410_p_facel(%ebp),%esi
 121         movl nb410_p_ntype(%ebp),%edi
 122         movl (%ecx),%ecx
 123         movl (%esi),%esi        ## facel is copied as raw 32 bits and later used as a float by mulss
 124         movl (%edi),%edi
 125         movl %ecx,nb410_nri(%esp)
 126         movl %esi,nb410_facel(%esp)
 127         movl %edi,nb410_ntype(%esp)
 128
 129         ## zero iteration counters
 130         movl $0,%eax
 131         movl %eax,nb410_nouter(%esp)
 132         movl %eax,nb410_ninner(%esp)
 133
 134         ## broadcast *p_gbtabscale to all four lanes of the gbtsc slot
 135         movl nb410_p_gbtabscale(%ebp),%eax
 136         movss (%eax),%xmm5
 137         shufps $0,%xmm5,%xmm5
 138         movaps %xmm5,nb410_gbtsc(%esp)
 139
 140         ## create constant floating-point factors on stack
 141         movl $0x3f000000,%eax   ## constant 0.5 in IEEE (hex)
 142         movl %eax,nb410_half(%esp)
 143         movss nb410_half(%esp),%xmm1
 144         shufps $0,%xmm1,%xmm1  ## splat to all elements
 145         movaps %xmm1,%xmm2
 146         addps  %xmm2,%xmm2      ## constant 1.0
 147         movaps %xmm2,%xmm3
 148         addps  %xmm2,%xmm2      ## constant 2.0
 149         addps  %xmm2,%xmm3      ## constant 3.0
 150         movaps %xmm3,%xmm4
 151         addps  %xmm4,%xmm4      ## 6.0
 152         movaps %xmm4,%xmm5
 153         addps  %xmm5,%xmm5      ## constant 12.0
 154         movaps %xmm1,nb410_half(%esp)
 155         movaps %xmm2,nb410_two(%esp)
 156         movaps %xmm3,nb410_three(%esp)
 157         movaps %xmm4,nb410_six(%esp)
 158         movaps %xmm5,nb410_twelve(%esp)
 159
 160 _nb_kernel410_ia32_sse.nb410_threadloop:
 161         movl  nb410_count(%ebp),%esi            ## pointer to sync counter
 162         movl  (%esi),%eax
 163 _nb_kernel410_ia32_sse.nb410_spinlock:
 164         movl  %eax,%ebx                         ## ebx=*count=nn0
 165         addl  $1,%ebx                          ## ebx=nn1=nn0+1
 166         lock
 167         cmpxchgl %ebx,(%esi)                    ## write nn1 to *counter,
 168                                                 ## if it hasn't changed (ZF set),
 169                                                 ## or reread *counter to eax (ZF clear).
 170         pause                                   ## spin-wait hint (better P4 performance); flags from cmpxchgl stay intact
 171         jnz _nb_kernel410_ia32_sse.nb410_spinlock
 172
 173         ## if(nn1>nri) nn1=nri
 174         movl nb410_nri(%esp),%ecx
 175         movl %ecx,%edx
 176         subl %ebx,%ecx                          ## ecx = nri-nn1, sets the flags for cmovlel
 177         cmovlel %edx,%ebx                       ## if(nn1>nri) nn1=nri
 178         ## Cleared the spinlock if we got here.
 179         ## eax contains nn0, ebx contains nn1.
 180         movl %eax,nb410_n(%esp)
 181         movl %ebx,nb410_nn1(%esp)
 182         subl %eax,%ebx                          ## calc number of outer lists
 183         movl %eax,%esi                          ## copy n to esi (movl leaves the flags from subl intact)
 184         jg  _nb_kernel410_ia32_sse.nb410_outerstart     ## nn1-nn0 > 0: there is work to do
 185         jmp _nb_kernel410_ia32_sse.nb410_end
 186
 187 _nb_kernel410_ia32_sse.nb410_outerstart:
 188         ## ebx contains number of outer iterations; add it to the running nouter total
 189         addl nb410_nouter(%esp),%ebx
 190         movl %ebx,nb410_nouter(%esp)
 191
 192 _nb_kernel410_ia32_sse.nb410_outer:     ## per-i-atom setup; esi = n, the index of the current outer list
 193         movl  nb410_shift(%ebp),%eax        ## eax = pointer into shift[]
 194         movl  (%eax,%esi,4),%ebx        ## ebx=shift[n]
 195
 196         leal  (%ebx,%ebx,2),%ebx    ## ebx=3*is
 197         movl  %ebx,nb410_is3(%esp)      ## store is3
 198
 199         movl  nb410_shiftvec(%ebp),%eax     ## eax = base of shiftvec[]
 200
 201         movss (%eax,%ebx,4),%xmm0       ## xmm0-xmm2 = shift vector x,y,z
 202         movss 4(%eax,%ebx,4),%xmm1
 203         movss 8(%eax,%ebx,4),%xmm2
 204
 205         movl  nb410_iinr(%ebp),%ecx         ## ecx = pointer into iinr[]
 206         movl  (%ecx,%esi,4),%ebx            ## ebx =ii
 207         movl  %ebx,nb410_ii(%esp)
 208
 209         movl  nb410_charge(%ebp),%edx
 210         movss (%edx,%ebx,4),%xmm3
 211         mulss nb410_facel(%esp),%xmm3       ## iq = facel*charge[ii]
 212         shufps $0,%xmm3,%xmm3
 213
 214         movl  nb410_invsqrta(%ebp),%edx         ## load invsqrta[ii]
 215         movss (%edx,%ebx,4),%xmm4
 216         shufps $0,%xmm4,%xmm4
 217
 218         movl  nb410_type(%ebp),%edx
 219         movl  (%edx,%ebx,4),%edx
 220         imull nb410_ntype(%esp),%edx
 221         shll  %edx                      ## ntia = 2*ntype*type[ii]
 222         movl  %edx,nb410_ntia(%esp)
 223
 224         leal  (%ebx,%ebx,2),%ebx        ## ebx = 3*ii=ii3
 225         movl  nb410_pos(%ebp),%eax      ## eax = base of pos[]
 226
 227         addss (%eax,%ebx,4),%xmm0       ## i coordinates = pos[ii3..ii3+2] + shift vector
 228         addss 4(%eax,%ebx,4),%xmm1
 229         addss 8(%eax,%ebx,4),%xmm2
 230
 231         movaps %xmm3,nb410_iq(%esp)
 232         movaps %xmm4,nb410_isai(%esp)
 233
 234         shufps $0,%xmm0,%xmm0           ## broadcast ix, iy, iz to all four lanes
 235         shufps $0,%xmm1,%xmm1
 236         shufps $0,%xmm2,%xmm2
 237
 238         movaps %xmm0,nb410_ix(%esp)
 239         movaps %xmm1,nb410_iy(%esp)
 240         movaps %xmm2,nb410_iz(%esp)
 241
 242         movl  %ebx,nb410_ii3(%esp)
 243
 244         ## clear the vctot, Vvdwtot and dvdasum accumulators and the i forces
 245         xorps %xmm4,%xmm4
 246         movaps %xmm4,nb410_vctot(%esp)
 247         movaps %xmm4,nb410_Vvdwtot(%esp)
 248         movaps %xmm4,nb410_dvdasum(%esp)
 249         movaps %xmm4,nb410_fix(%esp)
 250         movaps %xmm4,nb410_fiy(%esp)
 251         movaps %xmm4,nb410_fiz(%esp)
 252
 253         movl  nb410_jindex(%ebp),%eax
 254         movl  (%eax,%esi,4),%ecx             ## jindex[n]
 255         movl  4(%eax,%esi,4),%edx            ## jindex[n+1]
 256         subl  %ecx,%edx              ## number of innerloop atoms
 257
 258         movl  nb410_pos(%ebp),%esi
 259         movl  nb410_faction(%ebp),%edi
 260         movl  nb410_jjnr(%ebp),%eax
 261         shll  $2,%ecx
 262         addl  %ecx,%eax
 263         movl  %eax,nb410_innerjjnr(%esp)       ## pointer to jjnr[nj0]
 264         movl  %edx,%ecx
 265         subl  $4,%edx
 266         addl  nb410_ninner(%esp),%ecx          ## ninner += number of innerloop atoms
 267         movl  %ecx,nb410_ninner(%esp)
 268         addl  $0,%edx                          ## re-derive the flags from edx for the jge below
 269         movl  %edx,nb410_innerk(%esp)      ## innerk = number of innerloop atoms - 4
 270         jge   _nb_kernel410_ia32_sse.nb410_unroll_loop         ## at least four j atoms
 271         jmp   _nb_kernel410_ia32_sse.nb410_finish_inner
 272 _nb_kernel410_ia32_sse.nb410_unroll_loop:
 273         ## inner loop body, unrolled to handle four j atoms (a,b,c,d) per pass
 274         movl  nb410_innerjjnr(%esp),%edx       ## pointer to jjnr[k]
 275         movl  (%edx),%eax
 276         movl  4(%edx),%ebx
 277         movl  8(%edx),%ecx
 278         movl  12(%edx),%edx           ## eax-edx=jnr1-4
 279         addl $16,nb410_innerjjnr(%esp)             ## advance pointer (unrolled 4)
 280
 281         ## load isaj = invsqrta[jnr] for the four j atoms
 282         movl nb410_invsqrta(%ebp),%esi
 283         movss (%esi,%eax,4),%xmm3
 284         movss (%esi,%ecx,4),%xmm4
 285         movss (%esi,%ebx,4),%xmm6
 286         movss (%esi,%edx,4),%xmm7
 287         movaps nb410_isai(%esp),%xmm2
 288         shufps $0,%xmm6,%xmm3
 289         shufps $0,%xmm7,%xmm4
 290         shufps $136,%xmm4,%xmm3 ## all isaj in xmm3, lane order a,b,c,d
 291         mulps  %xmm3,%xmm2      ## xmm2 = isai*isaj
 292
 293         movaps %xmm2,nb410_isaprod(%esp)
 294         movaps %xmm2,%xmm1
 295         mulps nb410_gbtsc(%esp),%xmm1
 296         movaps %xmm1,nb410_gbscale(%esp)        ## gbscale = isaprod*gbtsc
 297
 298         movl nb410_charge(%ebp),%esi     ## base of charge[]
 299
 300         movss (%esi,%eax,4),%xmm3
 301         movss (%esi,%ecx,4),%xmm4
 302         movss (%esi,%ebx,4),%xmm6
 303         movss (%esi,%edx,4),%xmm7
 304
 305         mulps nb410_iq(%esp),%xmm2      ## xmm2 = iq*isaprod
 306         shufps $0,%xmm6,%xmm3
 307         shufps $0,%xmm7,%xmm4
 308         shufps $136,%xmm4,%xmm3 ## all charges in xmm3, lane order a,b,c,d
 309         mulps  %xmm2,%xmm3
 310         movaps %xmm3,nb410_qq(%esp)     ## qq = charge[j]*iq*isaprod
 311
 312         movd %eax,%mm0          ## park jnr in mm0-mm3 while eax-edx are reused for the type lookup
 313         movd %ebx,%mm1
 314         movd %ecx,%mm2
 315         movd %edx,%mm3
 316
 317         movl nb410_type(%ebp),%esi
 318         movl (%esi,%eax,4),%eax
 319         movl (%esi,%ebx,4),%ebx
 320         movl (%esi,%ecx,4),%ecx
 321         movl (%esi,%edx,4),%edx
 322         movl nb410_vdwparam(%ebp),%esi
 323         shll %eax
 324         shll %ebx
 325         shll %ecx
 326         shll %edx
 327         movl nb410_ntia(%esp),%edi
 328         addl %edi,%eax          ## parameter index = 2*type[j] + ntia
 329         addl %edi,%ebx
 330         addl %edi,%ecx
 331         addl %edi,%edx
 332
 333         movlps (%esi,%eax,4),%xmm6      ## each 8-byte load fetches one (c6,c12) pair
 334         movlps (%esi,%ecx,4),%xmm7
 335         movhps (%esi,%ebx,4),%xmm6
 336         movhps (%esi,%edx,4),%xmm7
 337
 338         movaps %xmm6,%xmm4
 339         shufps $136,%xmm7,%xmm4 ## even elements: the four c6 values
 340         shufps $221,%xmm7,%xmm6 ## odd elements: the four c12 values
 341
 342         movd  %mm0,%eax         ## restore jnr
 343         movd  %mm1,%ebx
 344         movd  %mm2,%ecx
 345         movd  %mm3,%edx
 346
 347         movaps %xmm4,nb410_c6(%esp)
 348         movaps %xmm6,nb410_c12(%esp)
 349
 350         movl nb410_pos(%ebp),%esi        ## base of pos[]
 351
 352         movl %eax,nb410_jnra(%esp)
 353         movl %ebx,nb410_jnrb(%esp)
 354         movl %ecx,nb410_jnrc(%esp)
 355         movl %edx,nb410_jnrd(%esp)
 356
 357         leal  (%eax,%eax,2),%eax     ## replace jnr with j3
 358         leal  (%ebx,%ebx,2),%ebx
 359
 360         leal  (%ecx,%ecx,2),%ecx     ## replace jnr with j3
 361         leal  (%edx,%edx,2),%edx
 362
 363         ## move four coordinates to xmm0-xmm2
 364
 365         movlps (%esi,%eax,4),%xmm4
 366         movlps (%esi,%ecx,4),%xmm5
 367         movss 8(%esi,%eax,4),%xmm2
 368         movss 8(%esi,%ecx,4),%xmm6
 369
 370         movhps (%esi,%ebx,4),%xmm4
 371         movhps (%esi,%edx,4),%xmm5
 372
 373         movss 8(%esi,%ebx,4),%xmm0
 374         movss 8(%esi,%edx,4),%xmm1
 375
 376         shufps $0,%xmm0,%xmm2
 377         shufps $0,%xmm1,%xmm6
 378
 379         movaps %xmm4,%xmm0
 380         movaps %xmm4,%xmm1
 381
 382         shufps $136,%xmm6,%xmm2 ## xmm2 = jz for a,b,c,d
 383
 384         shufps $136,%xmm5,%xmm0 ## xmm0 = jx for a,b,c,d
 385         shufps $221,%xmm5,%xmm1 ## xmm1 = jy for a,b,c,d
 386
 387         ## move ix-iz to xmm4-xmm6
 388         movaps nb410_ix(%esp),%xmm4
 389         movaps nb410_iy(%esp),%xmm5
 390         movaps nb410_iz(%esp),%xmm6
 391
 392         ## calc dr
 393         subps %xmm0,%xmm4
 394         subps %xmm1,%xmm5
 395         subps %xmm2,%xmm6
 396
 397         ## store dr
 398         movaps %xmm4,nb410_dx(%esp)
 399         movaps %xmm5,nb410_dy(%esp)
 400         movaps %xmm6,nb410_dz(%esp)
 401         ## square it
 402         mulps %xmm4,%xmm4
 403         mulps %xmm5,%xmm5
 404         mulps %xmm6,%xmm6
 405         addps %xmm5,%xmm4
 406         addps %xmm6,%xmm4
 407         ## rsq in xmm4
 408         ## rinv = 0.5*lu*(3.0-rsq*lu*lu), where lu is the rsqrtps estimate
 409         rsqrtps %xmm4,%xmm5
 410         ## lookup seed in xmm5
 411         movaps %xmm5,%xmm2
 412         mulps %xmm5,%xmm5
 413         movaps nb410_three(%esp),%xmm1
 414         mulps %xmm4,%xmm5       ## rsq*lu*lu
 415         movaps nb410_half(%esp),%xmm0
 416         subps %xmm5,%xmm1       ## 3.0-rsq*lu*lu
 417         mulps %xmm2,%xmm1
 418         mulps %xmm1,%xmm0       ## xmm0=rinv
 419         mulps %xmm0,%xmm4       ## xmm4=r
 420         movaps %xmm4,nb410_r(%esp)
 421         mulps nb410_gbscale(%esp),%xmm4 ## xmm4 = r*gbscale, the table position
 422
 423         movhlps %xmm4,%xmm5
 424         cvttps2pi %xmm4,%mm6
 425         cvttps2pi %xmm5,%mm7    ## mm6/mm7 contain lu indices
 426         cvtpi2ps %mm6,%xmm6
 427         cvtpi2ps %mm7,%xmm5
 428         movlhps %xmm5,%xmm6
 429         subps %xmm6,%xmm4       ## fractional part of the table position
 430         movaps %xmm4,%xmm1      ## xmm1=eps
 431         movaps %xmm1,%xmm2
 432         mulps  %xmm2,%xmm2      ## xmm2=eps2
 433         pslld $2,%mm6           ## index*4: four floats per table point
 434         pslld $2,%mm7
 435
 436         movd %eax,%mm0          ## park j3 in mm0-mm3; eax-edx become table indices
 437         movd %ebx,%mm1
 438         movd %ecx,%mm2
 439         movd %edx,%mm3
 440
 441         movl nb410_GBtab(%ebp),%esi
 442         movd %mm6,%eax
 443         psrlq $32,%mm6
 444         movd %mm7,%ecx
 445         psrlq $32,%mm7
 446         movd %mm6,%ebx
 447         movd %mm7,%edx
 448
 449         ## load the four table rows from GBtab, one row per j atom
 450         movaps (%esi,%eax,4),%xmm4
 451         movaps (%esi,%ebx,4),%xmm5
 452         movaps (%esi,%ecx,4),%xmm6
 453         movaps (%esi,%edx,4),%xmm7
 454         ## transpose, using xmm3 for scratch
 455         movaps %xmm6,%xmm3
 456         shufps $0xEE,%xmm7,%xmm3
 457         shufps $0x44,%xmm7,%xmm6
 458         movaps %xmm4,%xmm7
 459         shufps $0xEE,%xmm5,%xmm7
 460         shufps $0x44,%xmm5,%xmm4
 461         movaps %xmm4,%xmm5
 462         shufps $0xDD,%xmm6,%xmm5
 463         shufps $0x88,%xmm6,%xmm4
 464         movaps %xmm7,%xmm6
 465         shufps $0x88,%xmm3,%xmm6
 466         shufps $0xDD,%xmm3,%xmm7
 467         ## table ready: xmm4-xmm7 each hold one row element (elements 0-3) for all four j atoms
 468         mulps  %xmm1,%xmm6      ## xmm6=Geps
 469         mulps  %xmm2,%xmm7      ## xmm7=Heps2
 470
 471         addps  %xmm6,%xmm5
 472         addps  %xmm7,%xmm5      ## xmm5=Fp
 473         mulps  nb410_two(%esp),%xmm7    ## two*Heps2
 474         movaps nb410_qq(%esp),%xmm3
 475         addps  %xmm6,%xmm7
 476         addps  %xmm5,%xmm7 ## xmm7=FF
 477         mulps  %xmm1,%xmm5 ## xmm5=eps*Fp
 478         addps  %xmm4,%xmm5 ## xmm5=VV
 479         mulps  %xmm3,%xmm5 ## vcoul=qq*VV
 480         mulps  %xmm7,%xmm3 ## fijC=FF*qq
 481         ## get jnr from stack
 482         movl nb410_jnra(%esp),%eax
 483         movl nb410_jnrb(%esp),%ebx
 484         movl nb410_jnrc(%esp),%ecx
 485         movl nb410_jnrd(%esp),%edx
 486
 487         movl nb410_dvda(%ebp),%esi
 488
 489         ## Calculate dVda
 490         xorps %xmm7,%xmm7
 491         mulps nb410_gbscale(%esp),%xmm3 ## fijC is scaled by gbscale from here on
 492         movaps %xmm3,%xmm6
 493         mulps  nb410_r(%esp),%xmm6
 494         addps  %xmm5,%xmm6
 495         addps  nb410_vctot(%esp),%xmm5
 496         movaps %xmm5,nb410_vctot(%esp)  ## vctot += vcoul
 497
 498         ## xmm6=(vcoul+fijC*r)
 499         subps  %xmm6,%xmm7      ## xmm7 = -(vcoul+fijC*r)
 500         movaps %xmm7,%xmm6
 501
 502         ## update dvdasum
 503         addps  nb410_dvdasum(%esp),%xmm7
 504         movaps %xmm7,nb410_dvdasum(%esp)
 505
 506         ## update j atoms dvdaj
 507         movhlps %xmm6,%xmm7
 508         movaps  %xmm6,%xmm5
 509         movaps  %xmm7,%xmm4
 510         shufps $0x1,%xmm5,%xmm5
 511         shufps $0x1,%xmm4,%xmm4
 512         ## xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4
 513         addss  (%esi,%eax,4),%xmm6
 514         addss  (%esi,%ebx,4),%xmm5
 515         addss  (%esi,%ecx,4),%xmm7
 516         addss  (%esi,%edx,4),%xmm4
 517         movss  %xmm6,(%esi,%eax,4)
 518         movss  %xmm5,(%esi,%ebx,4)
 519         movss  %xmm7,(%esi,%ecx,4)
 520         movss  %xmm4,(%esi,%edx,4)
 521
 522         ## L-J
 523         movaps %xmm0,%xmm4
 524         mulps  %xmm0,%xmm4      ## xmm4=rinvsq
 525
 526         movaps %xmm4,%xmm6
 527         mulps  %xmm4,%xmm6
 528
 529         mulps  %xmm4,%xmm6      ## xmm6=rinvsix
 530         movaps %xmm6,%xmm4
 531         mulps  %xmm4,%xmm4      ## xmm4=rinvtwelve
 532         mulps  nb410_c6(%esp),%xmm6
 533         mulps  nb410_c12(%esp),%xmm4
 534         movaps nb410_Vvdwtot(%esp),%xmm7
 535         addps  %xmm4,%xmm7
 536         mulps  nb410_twelve(%esp),%xmm4
 537         subps  %xmm6,%xmm7      ## Vvdwtot += c12*rinvtwelve - c6*rinvsix
 538         mulps  nb410_six(%esp),%xmm6
 539         movaps %xmm7,nb410_Vvdwtot(%esp)
 540         subps  %xmm6,%xmm4
 541         mulps  %xmm0,%xmm4
 542         subps  %xmm3,%xmm4
 543         mulps  %xmm0,%xmm4      ## xmm4 = ((12*c12*rinvtwelve - 6*c6*rinvsix)*rinv - fijC)*rinv
 544
 545         movaps nb410_dx(%esp),%xmm0
 546         movaps nb410_dy(%esp),%xmm1
 547         movaps nb410_dz(%esp),%xmm2
 548
 549         movd %mm0,%eax          ## restore j3 for indexing faction[]
 550         movd %mm1,%ebx
 551         movd %mm2,%ecx
 552         movd %mm3,%edx
 553
 554         movl   nb410_faction(%ebp),%edi
 555         mulps  %xmm4,%xmm0
 556         mulps  %xmm4,%xmm1
 557         mulps  %xmm4,%xmm2
 558         ## xmm0-xmm2 contains tx-tz (partial force)
 559         ## now update f_i
 560         movaps nb410_fix(%esp),%xmm3
 561         movaps nb410_fiy(%esp),%xmm4
 562         movaps nb410_fiz(%esp),%xmm5
 563         addps  %xmm0,%xmm3
 564         addps  %xmm1,%xmm4
 565         addps  %xmm2,%xmm5
 566         movaps %xmm3,nb410_fix(%esp)
 567         movaps %xmm4,nb410_fiy(%esp)
 568         movaps %xmm5,nb410_fiz(%esp)
 569         ## the fj's - start by accumulating x & y forces from memory
 570         movlps (%edi,%eax,4),%xmm4
 571         movlps (%edi,%ecx,4),%xmm6
 572         movhps (%edi,%ebx,4),%xmm4
 573         movhps (%edi,%edx,4),%xmm6
 574
 575         movaps %xmm4,%xmm3
 576         shufps $136,%xmm6,%xmm3 ## even elements: fjx
 577         shufps $221,%xmm6,%xmm4 ## odd elements: fjy
 578
 579         ## now xmm3 contains fjx and xmm4 fjy (fjz is handled separately below)
 580         subps  %xmm0,%xmm3
 581         subps  %xmm1,%xmm4
 582
 583         ## unpack them back so we can store them - first x & y in xmm3/xmm4
 584
 585         movaps %xmm3,%xmm6
 586         unpcklps %xmm4,%xmm6
 587         unpckhps %xmm4,%xmm3
 588         ## xmm6(l)=x & y for j1, (h) for j2
 589         ## xmm3(l)=x & y for j3, (h) for j4
 590         movlps %xmm6,(%edi,%eax,4)
 591         movlps %xmm3,(%edi,%ecx,4)
 592
 593         movhps %xmm6,(%edi,%ebx,4)
 594         movhps %xmm3,(%edi,%edx,4)
 595
 596         ## and the z forces
 597         movss  8(%edi,%eax,4),%xmm4
 598         movss  8(%edi,%ebx,4),%xmm5
 599         movss  8(%edi,%ecx,4),%xmm6
 600         movss  8(%edi,%edx,4),%xmm7
 601         subss  %xmm2,%xmm4
 602         shufps $229,%xmm2,%xmm2 ## bring element 1 of tz into element 0
 603         subss  %xmm2,%xmm5
 604         shufps $234,%xmm2,%xmm2 ## bring element 2 of tz into element 0
 605         subss  %xmm2,%xmm6
 606         shufps $255,%xmm2,%xmm2 ## bring element 3 of tz into element 0
 607         subss  %xmm2,%xmm7
 608         movss  %xmm4,8(%edi,%eax,4)
 609         movss  %xmm5,8(%edi,%ebx,4)
 610         movss  %xmm6,8(%edi,%ecx,4)
 611         movss  %xmm7,8(%edi,%edx,4)
 612
 613         ## should we do one more iteration? innerk stays >= 0 while at least four j atoms remain
 614         subl $4,nb410_innerk(%esp)
 615         jl    _nb_kernel410_ia32_sse.nb410_finish_inner
 616         jmp   _nb_kernel410_ia32_sse.nb410_unroll_loop
 617 _nb_kernel410_ia32_sse.nb410_finish_inner:
 618         ## innerk is (remaining j atoms - 4) on entry; add 4 back and test bit 1 to see if a pair remains
 619         addl $4,nb410_innerk(%esp)
 620         movl  nb410_innerk(%esp),%edx
 621         andl  $2,%edx
 622         jnz   _nb_kernel410_ia32_sse.nb410_dopair
 623         jmp   _nb_kernel410_ia32_sse.nb410_checksingle
 624 _nb_kernel410_ia32_sse.nb410_dopair:    ## handle two leftover j atoms in elements 0 and 1; elements 2 and 3 must contribute zero
 625         movl  nb410_innerjjnr(%esp),%ecx
 626         movl  (%ecx),%eax
 627         movl  4(%ecx),%ebx
 628         addl $8,nb410_innerjjnr(%esp)
 629
 630         xorps %xmm2,%xmm2
 631         movaps %xmm2,%xmm6
 632
 633         ## load isaj
 634         movl nb410_invsqrta(%ebp),%esi
 635         movss (%esi,%eax,4),%xmm2
 636         movss (%esi,%ebx,4),%xmm3
 637         unpcklps %xmm3,%xmm2    ## isaj in xmm2(0,1)
 638         mulps  nb410_isai(%esp),%xmm2
 639         movaps %xmm2,nb410_isaprod(%esp)
 640         movaps %xmm2,%xmm1
 641         mulps nb410_gbtsc(%esp),%xmm1
 642         movaps %xmm1,nb410_gbscale(%esp)        ## gbscale = isaprod*gbtsc
 643
 644         movl nb410_charge(%ebp),%esi     ## base of charge[]
 645         movss (%esi,%eax,4),%xmm3
 646         movss (%esi,%ebx,4),%xmm6
 647         unpcklps %xmm6,%xmm3 ## xmm3(0,1) has the charges, elements 2 and 3 are zero
 648
 649         mulps  nb410_iq(%esp),%xmm2
 650         mulps  %xmm2,%xmm3
 651         movaps %xmm3,nb410_qq(%esp)     ## qq = charge[j]*iq*isaprod
 652
 653         movl nb410_type(%ebp),%esi
 654         movl  %eax,%ecx
 655         movl  %ebx,%edx
 656         movl (%esi,%ecx,4),%ecx
 657         movl (%esi,%edx,4),%edx
 658         movl nb410_vdwparam(%ebp),%esi
 659         shll %ecx
 660         shll %edx
 661         movl nb410_ntia(%esp),%edi
 662         addl %edi,%ecx          ## parameter index = 2*type[j] + ntia
 663         addl %edi,%edx
 664         movlps (%esi,%ecx,4),%xmm6
 665         movhps (%esi,%edx,4),%xmm6
 666         movl nb410_pos(%ebp),%edi
 667         xorps   %xmm7,%xmm7     ## FIX: xmm7 must be zero before the movlhps below clear elements 2 and 3 of c6/c12
 668         movaps %xmm6,%xmm4
 669         shufps $8,%xmm4,%xmm4 ## elements 0,1 = the two c6 values
 670         shufps $13,%xmm6,%xmm6 ## elements 0,1 = the two c12 values
 671         movlhps %xmm7,%xmm4     ## zero elements 2 and 3 of c6
 672         movlhps %xmm7,%xmm6     ## zero elements 2 and 3 of c12
 673
 674         movaps %xmm4,nb410_c6(%esp)
 675         movaps %xmm6,nb410_c12(%esp)
 676
 677         movd  %eax,%mm0         ## keep jnr in mm0/mm1; eax/ebx become j3 below
 678         movd  %ebx,%mm1
 679
 680         leal  (%eax,%eax,2),%eax
 681         leal  (%ebx,%ebx,2),%ebx
 682         ## move coordinates to xmm0-xmm2
 683         movlps (%edi,%eax,4),%xmm1
 684         movss 8(%edi,%eax,4),%xmm2
 685         movhps (%edi,%ebx,4),%xmm1
 686         movss 8(%edi,%ebx,4),%xmm0
 687
 688         movlhps %xmm7,%xmm3     ## xmm3 is reloaded from the qq slot before its next use
 689
 690         shufps $0,%xmm0,%xmm2
 691
 692         movaps %xmm1,%xmm0
 693
 694         shufps $136,%xmm2,%xmm2 ## xmm2 = (z1,z2,z1,z2)
 695
 696         shufps $136,%xmm0,%xmm0 ## xmm0 = (x1,x2,x1,x2)
 697         shufps $221,%xmm1,%xmm1 ## xmm1 = (y1,y2,y1,y2)
 698
 699         movl   nb410_faction(%ebp),%edi
 700         ## move ix-iz to xmm4-xmm6
 701         xorps   %xmm7,%xmm7
 702
 703         movaps nb410_ix(%esp),%xmm4
 704         movaps nb410_iy(%esp),%xmm5
 705         movaps nb410_iz(%esp),%xmm6
 706
 707         ## calc dr
 708         subps %xmm0,%xmm4
 709         subps %xmm1,%xmm5
 710         subps %xmm2,%xmm6
 711
 712         ## store dr
 713         movaps %xmm4,nb410_dx(%esp)
 714         movaps %xmm5,nb410_dy(%esp)
 715         movaps %xmm6,nb410_dz(%esp)
 716         ## square it
 717         mulps %xmm4,%xmm4
 718         mulps %xmm5,%xmm5
 719         mulps %xmm6,%xmm6
 720         addps %xmm5,%xmm4
 721         addps %xmm6,%xmm4
 722         ## rsq in xmm4
 723         ## rinv = 0.5*lu*(3.0-rsq*lu*lu), where lu is the rsqrtps estimate
 724         rsqrtps %xmm4,%xmm5
 725         ## lookup seed in xmm5
 726         movaps %xmm5,%xmm2
 727         mulps %xmm5,%xmm5
 728         movaps nb410_three(%esp),%xmm1
 729         mulps %xmm4,%xmm5       ## rsq*lu*lu
 730         movaps nb410_half(%esp),%xmm0
 731         subps %xmm5,%xmm1       ## 3.0-rsq*lu*lu
 732         mulps %xmm2,%xmm1
 733         mulps %xmm1,%xmm0       ## xmm0=rinv
 734         mulps %xmm0,%xmm4       ## xmm4=r
 735         movaps %xmm4,nb410_r(%esp)
 736         mulps nb410_gbscale(%esp),%xmm4 ## xmm4 = r*gbscale, the table position
 737
 738         cvttps2pi %xmm4,%mm6    ## mm6 contain lu indices
 739         cvtpi2ps %mm6,%xmm6
 740         subps %xmm6,%xmm4
 741         movaps %xmm4,%xmm1      ## xmm1=eps
 742         movaps %xmm1,%xmm2
 743         mulps  %xmm2,%xmm2      ## xmm2=eps2
 744
 745         pslld $2,%mm6           ## index*4: four floats per table point
 746
 747         movl nb410_GBtab(%ebp),%esi
 748         movd %mm6,%ecx
 749         psrlq $32,%mm6
 750         movd %mm6,%edx
 751
 752         ## load the two table rows from GBtab
 753         movaps (%esi,%ecx,4),%xmm4
 754         movaps (%esi,%edx,4),%xmm7
 755         ## transpose the two rows (only elements 0 and 1 of the results are meaningful)
 756         movaps %xmm4,%xmm6
 757         unpcklps %xmm7,%xmm4    ## Y1 Y2 F1 F2
 758         unpckhps %xmm7,%xmm6    ## G1 G2 H1 H2
 759         movhlps  %xmm4,%xmm5    ## F1 F2
 760         movhlps  %xmm6,%xmm7    ## H1 H2
 761         ## coulomb table ready, in xmm4-xmm7
 762
 763         mulps  %xmm1,%xmm6      ## xmm6=Geps
 764         mulps  %xmm2,%xmm7      ## xmm7=Heps2
 765         addps  %xmm6,%xmm5
 766         addps  %xmm7,%xmm5      ## xmm5=Fp
 767         mulps  nb410_two(%esp),%xmm7    ## two*Heps2
 768         movaps nb410_qq(%esp),%xmm3
 769         addps  %xmm6,%xmm7
 770         addps  %xmm5,%xmm7 ## xmm7=FF
 771         mulps  %xmm1,%xmm5 ## xmm5=eps*Fp
 772         addps  %xmm4,%xmm5 ## xmm5=VV
 773         mulps  %xmm3,%xmm5 ## vcoul=qq*VV
 774         mulps  %xmm7,%xmm3 ## fijC=FF*qq
 775         ## get jnr from regs
 776         movd %mm0,%ecx
 777         movd %mm1,%edx
 778
 779         movl nb410_dvda(%ebp),%esi
 780         ## Calculate dVda
 781         xorps %xmm7,%xmm7
 782         mulps nb410_gbscale(%esp),%xmm3 ## fijC is scaled by gbscale from here on
 783         movaps %xmm3,%xmm6
 784         mulps  nb410_r(%esp),%xmm6
 785         addps  %xmm5,%xmm6
 786         addps  nb410_vctot(%esp),%xmm5
 787         movaps %xmm5,nb410_vctot(%esp)  ## vctot += vcoul
 788
 789         ## xmm6=(vcoul+fijC*r)
 790         subps  %xmm6,%xmm7      ## xmm7 = -(vcoul+fijC*r)
 791         movaps %xmm7,%xmm6
 792
 793         ## update dvdasum
 794         addps  nb410_dvdasum(%esp),%xmm7
 795         movaps %xmm7,nb410_dvdasum(%esp)
 796
 797         ## update j atoms dvdaj
 798         movaps %xmm6,%xmm7
 799         shufps $0x1,%xmm7,%xmm7
 800         addss  (%esi,%ecx,4),%xmm6
 801         addss  (%esi,%edx,4),%xmm7
 802         movss  %xmm6,(%esi,%ecx,4)
 803         movss  %xmm7,(%esi,%edx,4)
 804
 805         ## L-J
 806         movaps %xmm0,%xmm4
 807         mulps  %xmm0,%xmm4      ## xmm4=rinvsq
 808
 809         ## at this point xmm3 contains fijC (already scaled by gbscale);
 810         ## vctot was updated above, so only the Lennard-Jones part
 811         ## and the force remain to be computed
 812
 813         movaps %xmm4,%xmm6
 814         mulps  %xmm4,%xmm6
 815
 816         mulps  %xmm4,%xmm6      ## xmm6=rinvsix
 817         movaps %xmm6,%xmm4
 818         mulps  %xmm4,%xmm4      ## xmm4=rinvtwelve
 819         mulps  nb410_c6(%esp),%xmm6
 820         mulps  nb410_c12(%esp),%xmm4
 821         movaps nb410_Vvdwtot(%esp),%xmm7
 822         addps  %xmm4,%xmm7
 823         mulps  nb410_twelve(%esp),%xmm4
 824         subps  %xmm6,%xmm7      ## Vvdwtot += c12*rinvtwelve - c6*rinvsix
 825         mulps  nb410_six(%esp),%xmm6
 826         movaps %xmm7,nb410_Vvdwtot(%esp)
 827         subps  %xmm6,%xmm4
 828         mulps  %xmm0,%xmm4
 829         subps  %xmm3,%xmm4
 830         mulps  %xmm0,%xmm4      ## xmm4 = ((12*c12*rinvtwelve - 6*c6*rinvsix)*rinv - fijC)*rinv
 831
 832         movaps nb410_dx(%esp),%xmm0
 833         movaps nb410_dy(%esp),%xmm1
 834         movaps nb410_dz(%esp),%xmm2
 835
 836         mulps  %xmm4,%xmm0
 837         mulps  %xmm4,%xmm1
 838         mulps  %xmm4,%xmm2
 839         ## xmm0-xmm2 contains tx-tz (partial force)
 840         ## now update f_i
 841         movaps nb410_fix(%esp),%xmm3
 842         movaps nb410_fiy(%esp),%xmm4
 843         movaps nb410_fiz(%esp),%xmm5
 844         addps  %xmm0,%xmm3
 845         addps  %xmm1,%xmm4
 846         addps  %xmm2,%xmm5
 847         movaps %xmm3,nb410_fix(%esp)
 848         movaps %xmm4,nb410_fiy(%esp)
 849         movaps %xmm5,nb410_fiz(%esp)
 850         ## update the fj's (first j atom; eax = its j3)
 851         movss   (%edi,%eax,4),%xmm3
 852         movss   4(%edi,%eax,4),%xmm4
 853         movss   8(%edi,%eax,4),%xmm5
 854         subss   %xmm0,%xmm3
 855         subss   %xmm1,%xmm4
 856         subss   %xmm2,%xmm5
 857         movss   %xmm3,(%edi,%eax,4)
 858         movss   %xmm4,4(%edi,%eax,4)
 859         movss   %xmm5,8(%edi,%eax,4)
 860
 861         shufps $225,%xmm0,%xmm0 ## swap elements 0 and 1: second j atom's tx into element 0
 862         shufps $225,%xmm1,%xmm1 ## same for ty
 863         shufps $225,%xmm2,%xmm2 ## same for tz
 864
 865         movss   (%edi,%ebx,4),%xmm3
 866         movss   4(%edi,%ebx,4),%xmm4
 867         movss   8(%edi,%ebx,4),%xmm5
 868         subss   %xmm0,%xmm3
 869         subss   %xmm1,%xmm4
 870         subss   %xmm2,%xmm5
 871         movss   %xmm3,(%edi,%ebx,4)
 872         movss   %xmm4,4(%edi,%ebx,4)
 873         movss   %xmm5,8(%edi,%ebx,4)
 874
 875 _nb_kernel410_ia32_sse.nb410_checksingle:
 876         movl  nb410_innerk(%esp),%edx
 877         andl  $1,%edx           ## bit 0 of innerk set: one last j atom remains
 878         jnz    _nb_kernel410_ia32_sse.nb410_dosingle
 879         jmp    _nb_kernel410_ia32_sse.nb410_updateouterdata
 880 _nb_kernel410_ia32_sse.nb410_dosingle:          ## one j entry, computed with scalar (lane 0) SSE instructions
 881         movl nb410_charge(%ebp),%esi
 882         movl nb410_invsqrta(%ebp),%edx
 883         movl nb410_pos(%ebp),%edi
 884         movl  nb410_innerjjnr(%esp),%ecx
 885         movl  (%ecx),%eax               ## eax = jnr, read through the current jjnr pointer
 886         xorps  %xmm2,%xmm2
 887         movaps %xmm2,%xmm6
 888         movss (%edx,%eax,4),%xmm2       ## isaj = invsqrta[jnr]
 889         mulss nb410_isai(%esp),%xmm2
 890         movss %xmm2,nb410_isaprod(%esp) ## isaprod = isai*isaj
 891         movss %xmm2,%xmm1
 892         mulss nb410_gbtsc(%esp),%xmm1
 893         movss %xmm1,nb410_gbscale(%esp) ## gbscale = isaprod*gbtsc
 894
 895         mulss  nb410_iq(%esp),%xmm2
 896         movss (%esi,%eax,4),%xmm6       ## xmm6(0) has the charge
 897         mulss  %xmm2,%xmm6
 898         movss %xmm6,nb410_qq(%esp)      ## qq = iq*isaprod*charge[jnr]
 899
 900         movl nb410_type(%ebp),%esi
 901         movl %eax,%ecx
 902         movl (%esi,%ecx,4),%ecx         ## ecx = type[jnr]
 903         movl nb410_vdwparam(%ebp),%esi
 904         shll %ecx                       ## two floats per parameter pair
 905         addl nb410_ntia(%esp),%ecx      ## ecx = ntia + 2*type[jnr]
 906         movlps (%esi,%ecx,4),%xmm6      ## lanes 0,1 = the two consecutive vdwparam floats
 907         movaps %xmm6,%xmm4
 908         shufps $252,%xmm4,%xmm4 ## constant 11111100: lane 0 kept, lanes 1-3 = old lane 3
 909         shufps $253,%xmm6,%xmm6 ## constant 11111101: lane 0 = old lane 1, lanes 1-3 = old lane 3
 910
 911         movaps %xmm4,nb410_c6(%esp)
 912         movaps %xmm6,nb410_c12(%esp)
 913
 914         movd  %eax,%mm0                 ## park jnr in mm0; read back below for the dvda[] update
 915         leal  (%eax,%eax,2),%eax        ## eax = 3*jnr (index of the x coordinate)
 916
 917         ## move coordinates to xmm0-xmm2
 918         movss (%edi,%eax,4),%xmm0
 919         movss 4(%edi,%eax,4),%xmm1
 920         movss 8(%edi,%eax,4),%xmm2
 921
 922         movaps nb410_ix(%esp),%xmm4
 923         movaps nb410_iy(%esp),%xmm5
 924         movaps nb410_iz(%esp),%xmm6
 925
 926         ## calc dr
 927         subss %xmm0,%xmm4
 928         subss %xmm1,%xmm5
 929         subss %xmm2,%xmm6
 930
 931         ## store dr
 932         movss %xmm4,nb410_dx(%esp)
 933         movss %xmm5,nb410_dy(%esp)
 934         movss %xmm6,nb410_dz(%esp)
 935         ## square it
 936         mulss %xmm4,%xmm4
 937         mulss %xmm5,%xmm5
 938         mulss %xmm6,%xmm6
 939         addss %xmm5,%xmm4
 940         addss %xmm6,%xmm4
 941         ## rsq in xmm4
 942
 943         rsqrtss %xmm4,%xmm5
 944         ## lookup seed in xmm5; refined below as rinv = half*lu*(three-rsq*lu*lu)
 945         movaps %xmm5,%xmm2
 946         mulss %xmm5,%xmm5
 947         movss nb410_three(%esp),%xmm1
 948         mulss %xmm4,%xmm5       ## rsq*lu*lu
 949         movss nb410_half(%esp),%xmm0
 950         subss %xmm5,%xmm1       ## three-rsq*lu*lu
 951         mulss %xmm2,%xmm1
 952         mulss %xmm1,%xmm0       ## xmm0=rinv
 953
 954         mulss %xmm0,%xmm4       ## xmm4=r
 955         movss %xmm4,nb410_r(%esp)
 956         mulss nb410_gbscale(%esp),%xmm4 ## xmm4 = r*gbscale (table coordinate)
 957
 958         cvttss2si %xmm4,%ebx    ## ebx = truncated table index
 959         cvtsi2ss %ebx,%xmm6
 960         subss %xmm6,%xmm4       ## fractional part
 961         movaps %xmm4,%xmm1      ## xmm1=eps
 962         movaps %xmm1,%xmm2
 963         mulss  %xmm2,%xmm2      ## xmm2=eps2
 964
 965         shll $2,%ebx            ## four floats per table point
 966         movl nb410_GBtab(%ebp),%esi
 967
 968         movaps (%esi,%ebx,4),%xmm4      ## the four table floats for this point
 969         movhlps %xmm4,%xmm6             ## xmm6(0) = third float
 970         movaps %xmm4,%xmm5
 971         movaps %xmm6,%xmm7
 972         shufps $1,%xmm5,%xmm5           ## xmm5(0) = second float
 973         shufps $1,%xmm7,%xmm7           ## xmm7(0) = fourth float
 974         ## table ready in xmm4-xmm7
 975
 976         mulss  %xmm1,%xmm6      ## xmm6=Geps
 977         mulss  %xmm2,%xmm7      ## xmm7=Heps2
 978         addss  %xmm6,%xmm5
 979         addss  %xmm7,%xmm5      ## xmm5=Fp
 980         mulss  nb410_two(%esp),%xmm7    ## two*Heps2
 981         movss nb410_qq(%esp),%xmm3
 982         addss  %xmm6,%xmm7
 983         addss  %xmm5,%xmm7 ## xmm7=FF
 984         mulss  %xmm1,%xmm5 ## xmm5=eps*Fp
 985         addss  %xmm4,%xmm5 ## xmm5=VV
 986         mulss  %xmm3,%xmm5 ## vcoul=qq*VV
 987         mulss  %xmm7,%xmm3 ## fijC=FF*qq
 988
 989         movd %mm0,%ebx          ## ebx = jnr again (saved in mm0 above)
 990         movl nb410_dvda(%ebp),%esi
 991
 992         ## Calculate dVda
 993         xorps %xmm7,%xmm7
 994         mulss nb410_gbscale(%esp),%xmm3 ## fijC is scaled by gbscale from here on
 995         movaps %xmm3,%xmm6
 996         mulss  nb410_r(%esp),%xmm6
 997         addss  %xmm5,%xmm6
 998         addss  nb410_vctot(%esp),%xmm5
 999         movss %xmm5,nb410_vctot(%esp)   ## vctot(0) += vcoul
1000
1001         ## xmm6=(vcoul+fijC*r), with fijC already multiplied by gbscale
1002         subps  %xmm6,%xmm7      ## xmm7 = 0 - xmm6
1003         movaps %xmm7,%xmm6
1004
1005         ## update dvdasum
1006         addps  nb410_dvdasum(%esp),%xmm7
1007         movaps %xmm7,nb410_dvdasum(%esp)
1008
1009         ## update j atoms dvdaj
1010         addss  (%esi,%ebx,4),%xmm6
1011         movss  %xmm6,(%esi,%ebx,4)
1012
1013         ## L-J
1014         movaps %xmm0,%xmm4
1015         mulss  %xmm0,%xmm4      ## xmm4=rinvsq
1016
1017         movaps %xmm4,%xmm6
1018         mulss  %xmm4,%xmm6
1019
1020         mulss  %xmm4,%xmm6      ## xmm6=rinvsix
1021         movaps %xmm6,%xmm4
1022         mulss  %xmm4,%xmm4      ## xmm4=rinvtwelve
1023         mulss  nb410_c6(%esp),%xmm6     ## xmm6 = c6*rinvsix
1024         mulss  nb410_c12(%esp),%xmm4    ## xmm4 = c12*rinvtwelve
1025         movss nb410_Vvdwtot(%esp),%xmm7
1026         addss  %xmm4,%xmm7
1027         mulss  nb410_twelve(%esp),%xmm4
1028         subss  %xmm6,%xmm7
1029         mulss  nb410_six(%esp),%xmm6
1030         movss %xmm7,nb410_Vvdwtot(%esp) ## Vvdwtot(0) += c12*rinvtwelve - c6*rinvsix
1031         subss  %xmm6,%xmm4
1032         mulss  %xmm0,%xmm4
1033         subss  %xmm3,%xmm4
1034         mulss  %xmm0,%xmm4      ## xmm4 = ((twelve*c12*rinv12 - six*c6*rinv6)*rinv - fijC)*rinv
1035
1036         movss nb410_dx(%esp),%xmm0
1037         movss nb410_dy(%esp),%xmm1
1038         movss nb410_dz(%esp),%xmm2
1039
1040         movl   nb410_faction(%ebp),%edi
1041         mulss  %xmm4,%xmm0
1042         mulss  %xmm4,%xmm1
1043         mulss  %xmm4,%xmm2
1044         ## xmm0-xmm2 contains tx-tz (partial force)
1045         ## now update f_i
1046         movss nb410_fix(%esp),%xmm3
1047         movss nb410_fiy(%esp),%xmm4
1048         movss nb410_fiz(%esp),%xmm5
1049         addss  %xmm0,%xmm3
1050         addss  %xmm1,%xmm4
1051         addss  %xmm2,%xmm5
1052         movss %xmm3,nb410_fix(%esp)
1053         movss %xmm4,nb410_fiy(%esp)
1054         movss %xmm5,nb410_fiz(%esp)
1055         ## update fj: subtract tx-tz from faction[3*jnr .. 3*jnr+2]
1056
1057         movss   (%edi,%eax,4),%xmm3
1058         movss   4(%edi,%eax,4),%xmm4
1059         movss   8(%edi,%eax,4),%xmm5
1060         subss   %xmm0,%xmm3
1061         subss   %xmm1,%xmm4
1062         subss   %xmm2,%xmm5
1063         movss   %xmm3,(%edi,%eax,4)
1064         movss   %xmm4,4(%edi,%eax,4)
1065         movss   %xmm5,8(%edi,%eax,4)
1066 _nb_kernel410_ia32_sse.nb410_updateouterdata:   ## reduce the 4-lane accumulators and add them to the output arrays
1067         movl  nb410_ii3(%esp),%ecx
1068         movl  nb410_faction(%ebp),%edi
1069         movl  nb410_fshift(%ebp),%esi
1070         movl  nb410_is3(%esp),%edx
1071
1072         ## accumulate i forces in xmm0, xmm1, xmm2
1073         movaps nb410_fix(%esp),%xmm0
1074         movaps nb410_fiy(%esp),%xmm1
1075         movaps nb410_fiz(%esp),%xmm2
1076
1077         movhlps %xmm0,%xmm3
1078         movhlps %xmm1,%xmm4
1079         movhlps %xmm2,%xmm5
1080         addps  %xmm3,%xmm0
1081         addps  %xmm4,%xmm1
1082         addps  %xmm5,%xmm2 ## lanes 0,1 of xmm0-xmm2 now hold the two partial sums
1083
1084         movaps %xmm0,%xmm3
1085         movaps %xmm1,%xmm4
1086         movaps %xmm2,%xmm5
1087
1088         shufps $1,%xmm3,%xmm3   ## bring lane 1 down to lane 0
1089         shufps $1,%xmm4,%xmm4
1090         shufps $1,%xmm5,%xmm5
1091         addss  %xmm3,%xmm0
1092         addss  %xmm4,%xmm1
1093         addss  %xmm5,%xmm2      ## xmm0-xmm2 has single force in pos0
1094
1095         ## increment i force: faction[ii3 .. ii3+2] += (fix,fiy,fiz)
1096         movss  (%edi,%ecx,4),%xmm3
1097         movss  4(%edi,%ecx,4),%xmm4
1098         movss  8(%edi,%ecx,4),%xmm5
1099         addss  %xmm0,%xmm3
1100         addss  %xmm1,%xmm4
1101         addss  %xmm2,%xmm5
1102         movss  %xmm3,(%edi,%ecx,4)
1103         movss  %xmm4,4(%edi,%ecx,4)
1104         movss  %xmm5,8(%edi,%ecx,4)
1105
1106         ## increment fshift force: fshift[is3 .. is3+2] += (fix,fiy,fiz)
1107         movss  (%esi,%edx,4),%xmm3
1108         movss  4(%esi,%edx,4),%xmm4
1109         movss  8(%esi,%edx,4),%xmm5
1110         addss  %xmm0,%xmm3
1111         addss  %xmm1,%xmm4
1112         addss  %xmm2,%xmm5
1113         movss  %xmm3,(%esi,%edx,4)
1114         movss  %xmm4,4(%esi,%edx,4)
1115         movss  %xmm5,8(%esi,%edx,4)
1116
1117         ## get n from stack
1118         movl nb410_n(%esp),%esi
1119         ## get group index for i particle
1120         movl  nb410_gid(%ebp),%edx              ## base of gid[]
1121         movl  (%edx,%esi,4),%edx                ## ggid=gid[n]
1122
1123         ## accumulate total potential energy and update it
1124         movaps nb410_vctot(%esp),%xmm7
1125         ## accumulate
1126         movhlps %xmm7,%xmm6
1127         addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now
1128         movaps %xmm7,%xmm6
1129         shufps $1,%xmm6,%xmm6
1130         addss  %xmm6,%xmm7      ## lane 0 = sum of all four vctot lanes
1131
1132         ## add earlier value from mem
1133         movl  nb410_Vc(%ebp),%eax
1134         addss (%eax,%edx,4),%xmm7
1135         ## move back to mem: Vc[ggid] += vctot
1136         movss %xmm7,(%eax,%edx,4)
1137
1138         ## accumulate total lj energy and update it
1139         movaps nb410_Vvdwtot(%esp),%xmm7
1140         ## accumulate
1141         movhlps %xmm7,%xmm6
1142         addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now
1143         movaps %xmm7,%xmm6
1144         shufps $1,%xmm6,%xmm6
1145         addss  %xmm6,%xmm7      ## lane 0 = sum of all four Vvdwtot lanes
1146
1147         ## add earlier value from mem
1148         movl  nb410_Vvdw(%ebp),%eax
1149         addss (%eax,%edx,4),%xmm7
1150         ## move back to mem: Vvdw[ggid] += Vvdwtot
1151         movss %xmm7,(%eax,%edx,4)
1152
1153         ## accumulate dVda and update it
1154         movaps nb410_dvdasum(%esp),%xmm7
1155         ## accumulate
1156         movhlps %xmm7,%xmm6
1157         addps  %xmm6,%xmm7      ## pos 0-1 in xmm7 have the sum now
1158         movaps %xmm7,%xmm6
1159         shufps $1,%xmm6,%xmm6
1160         addss  %xmm6,%xmm7      ## lane 0 = sum of all four dvdasum lanes
1161
1162         movl nb410_ii(%esp),%edx
1163         movl nb410_dvda(%ebp),%eax
1164         addss (%eax,%edx,4),%xmm7
1165         movss %xmm7,(%eax,%edx,4)       ## dvda[ii] += dvdasum
1166
1167         ## finish if last
1168         movl nb410_nn1(%esp),%ecx
1169         ## esi already loaded with n
1170         incl %esi
1171         subl %esi,%ecx                  ## ecx = nn1 - (n+1); zero when this chunk is done
1172         jz _nb_kernel410_ia32_sse.nb410_outerend
1173
1174         ## not last, iterate outer loop once more!
1175         movl %esi,nb410_n(%esp)
1176         jmp _nb_kernel410_ia32_sse.nb410_outer
1177 _nb_kernel410_ia32_sse.nb410_outerend:  ## current chunk finished: fetch another one or leave
1178         ## check if more outer neighborlists remain
1179         movl  nb410_nri(%esp),%ecx
1180         ## esi already loaded with n above (the incremented value when entered from nb410_updateouterdata)
1181         subl  %esi,%ecx                 ## ecx = nri - n
1182         jz _nb_kernel410_ia32_sse.nb410_end
1183         ## non-zero, do one more workunit
1184         jmp   _nb_kernel410_ia32_sse.nb410_threadloop
1185 _nb_kernel410_ia32_sse.nb410_end:       ## report iteration counts, tear down the frame and return
1186         emms                            ## MMX registers were used for scratch; leave the x87/MMX state clean
1187
1188         movl nb410_nouter(%esp),%eax
1189         movl nb410_ninner(%esp),%ebx
1190         movl nb410_outeriter(%ebp),%ecx
1191         movl nb410_inneriter(%ebp),%edx
1192         movl %eax,(%ecx)                ## *outeriter = nouter
1193         movl %ebx,(%edx)                ## *inneriter = ninner
1194
1195         movl nb410_salign(%esp),%eax
1196         addl %eax,%esp                  ## undo the adjustment saved in salign
1197         addl $504,%esp                  ## NOTE(review): must equal the local-space subl in this routine's prologue (not visible here)
1198         popl %edi
1199         popl %esi
1200         popl %edx
1201         popl %ecx
1202         popl %ebx
1203         popl %eax
1204         leave
1205         ret
1206
1207
1208
1209 .globl nb_kernel410nf_ia32_sse          ## exported both with and without a leading underscore
1210 .globl _nb_kernel410nf_ia32_sse
1211 nb_kernel410nf_ia32_sse:
1212 _nb_kernel410nf_ia32_sse:
1213 .set nb410nf_p_nri, 8                   ## argument offsets relative to %ebp (frame set up by pushl/movl below)
1214 .set nb410nf_iinr, 12
1215 .set nb410nf_jindex, 16
1216 .set nb410nf_jjnr, 20
1217 .set nb410nf_shift, 24
1218 .set nb410nf_shiftvec, 28
1219 .set nb410nf_fshift, 32
1220 .set nb410nf_gid, 36
1221 .set nb410nf_pos, 40
1222 .set nb410nf_faction, 44
1223 .set nb410nf_charge, 48
1224 .set nb410nf_p_facel, 52
1225 .set nb410nf_argkrf, 56
1226 .set nb410nf_argcrf, 60
1227 .set nb410nf_Vc, 64
1228 .set nb410nf_type, 68
1229 .set nb410nf_p_ntype, 72
1230 .set nb410nf_vdwparam, 76
1231 .set nb410nf_Vvdw, 80
1232 .set nb410nf_p_tabscale, 84
1233 .set nb410nf_VFtab, 88
1234 .set nb410nf_invsqrta, 92
1235 .set nb410nf_dvda, 96
1236 .set nb410nf_p_gbtabscale, 100
1237 .set nb410nf_GBtab, 104
1238 .set nb410nf_p_nthreads, 108
1239 .set nb410nf_count, 112
1240 .set nb410nf_mtx, 116
1241 .set nb410nf_outeriter, 120
1242 .set nb410nf_inneriter, 124
1243 .set nb410nf_work, 128
1244         ## stack offsets for local variables (relative to the aligned %esp)
1245         ## bottom of stack is 16-byte aligned for sse use (see the andl/subl below)
1246 .set nb410nf_ix, 0
1247 .set nb410nf_iy, 16
1248 .set nb410nf_iz, 32
1249 .set nb410nf_iq, 48
1250 .set nb410nf_gbtsc, 64
1251 .set nb410nf_qq, 80
1252 .set nb410nf_c6, 96
1253 .set nb410nf_c12, 112
1254 .set nb410nf_vctot, 128
1255 .set nb410nf_Vvdwtot, 144
1256 .set nb410nf_half, 160
1257 .set nb410nf_three, 176
1258 .set nb410nf_isai, 192
1259 .set nb410nf_isaprod, 208
1260 .set nb410nf_gbscale, 224
1261 .set nb410nf_is3, 240                   ## 4-byte integer slots from here on
1262 .set nb410nf_ii3, 244
1263 .set nb410nf_ntia, 248
1264 .set nb410nf_innerjjnr, 252
1265 .set nb410nf_innerk, 256
1266 .set nb410nf_n, 260
1267 .set nb410nf_nn1, 264
1268 .set nb410nf_nri, 268
1269 .set nb410nf_facel, 272
1270 .set nb410nf_ntype, 276
1271 .set nb410nf_nouter, 280
1272 .set nb410nf_ninner, 284
1273 .set nb410nf_salign, 288
1274         pushl %ebp
1275         movl %esp,%ebp
1276         pushl %eax
1277         pushl %ebx
1278         pushl %ecx
1279         pushl %edx
1280         pushl %esi
1281         pushl %edi
1282         subl $292,%esp          ## local stack space (last slot, salign, is at 288)
1283         movl %esp,%eax
1284         andl $0xf,%eax          ## eax = esp mod 16
1285         subl %eax,%esp          ## round esp down to a 16-byte boundary (the locals are accessed with movaps)
1286         movl %eax,nb410nf_salign(%esp)  ## remember the adjustment
1287
1288         emms
1289
1290         ## Move args passed by reference to stack
1291         movl nb410nf_p_nri(%ebp),%ecx
1292         movl nb410nf_p_facel(%ebp),%esi
1293         movl nb410nf_p_ntype(%ebp),%edi
1294         movl (%ecx),%ecx
1295         movl (%esi),%esi        ## 32-bit copy of *p_facel; the local is later used as a float operand
1296         movl (%edi),%edi
1297         movl %ecx,nb410nf_nri(%esp)
1298         movl %esi,nb410nf_facel(%esp)
1299         movl %edi,nb410nf_ntype(%esp)
1300
1301         ## zero iteration counters
1302         movl $0,%eax
1303         movl %eax,nb410nf_nouter(%esp)
1304         movl %eax,nb410nf_ninner(%esp)
1305
1306         ## gbtsc = *p_gbtabscale, replicated to all four lanes
1307         movl nb410nf_p_gbtabscale(%ebp),%eax
1308         movss (%eax),%xmm5
1309         shufps $0,%xmm5,%xmm5
1310         movaps %xmm5,nb410nf_gbtsc(%esp)
1311
1312         ## create constant floating-point factors on stack
1313         movl $0x3f000000,%eax   ## constant 0.5 in IEEE (hex)
1314         movl %eax,nb410nf_half(%esp)
1315         movss nb410nf_half(%esp),%xmm1
1316         shufps $0,%xmm1,%xmm1  ## splat to all elements
1317         movaps %xmm1,%xmm2
1318         addps  %xmm2,%xmm2      ## constant 1.0
1319         movaps %xmm2,%xmm3
1320         addps  %xmm2,%xmm2      ## constant 2.0
1321         addps  %xmm2,%xmm3      ## constant 3.0
1322         movaps %xmm1,nb410nf_half(%esp)
1323         movaps %xmm3,nb410nf_three(%esp)
1324
1325 _nb_kernel410nf_ia32_sse.nb410nf_threadloop:    ## claim the next outer-list index by atomically bumping *count
1326         movl  nb410nf_count(%ebp),%esi            ## pointer to sync counter
1327         movl  (%esi),%eax
1328 _nb_kernel410nf_ia32_sse.nb410nf_spinlock:
1329         movl  %eax,%ebx                         ## ebx=*count=nn0
1330         addl  $1,%ebx                          ## ebx=nn1=nn0+1
1331         lock
1332         cmpxchgl %ebx,(%esi)                    ## write nn1 to *counter,
1333                                                 ## if it hasnt changed.
1334                                                 ## or reread *counter to eax.
1335         pause                                   ## -> better p4 performance (leaves the flags from cmpxchgl intact)
1336         jnz _nb_kernel410nf_ia32_sse.nb410nf_spinlock   ## ZF clear: counter changed under us, retry with the fresh eax
1337
1338         ## if(nn1>nri) nn1=nri
1339         movl nb410nf_nri(%esp),%ecx
1340         movl %ecx,%edx
1341         subl %ebx,%ecx                          ## ecx = nri - nn1
1342         cmovlel %edx,%ebx                       ## if(nn1>=nri) nn1=nri
1343         ## Cleared the spinlock if we got here.
1344         ## eax contains nn0, ebx contains nn1.
1345         movl %eax,nb410nf_n(%esp)
1346         movl %ebx,nb410nf_nn1(%esp)
1347         subl %eax,%ebx                          ## calc number of outer lists
1348         movl %eax,%esi                          ## copy n to esi (movl leaves the flags from subl intact)
1349         jg  _nb_kernel410nf_ia32_sse.nb410nf_outerstart ## nn1-nn0 > 0: there is work
1350         jmp _nb_kernel410nf_ia32_sse.nb410nf_end
1351
1352 _nb_kernel410nf_ia32_sse.nb410nf_outerstart:    ## book-keeping before the outer loop: nouter += nn1-nn0
1353         ## ebx contains number of outer iterations
1354         addl nb410nf_nouter(%esp),%ebx
1355         movl %ebx,nb410nf_nouter(%esp)
1356
1357 _nb_kernel410nf_ia32_sse.nb410nf_outer:         ## per-i-atom setup; expects esi = n (outer list index)
1358         movl  nb410nf_shift(%ebp),%eax        ## eax = pointer into shift[]
1359         movl  (%eax,%esi,4),%ebx        ## ebx=shift[n]
1360
1361         leal  (%ebx,%ebx,2),%ebx    ## ebx=3*is
1362         movl  %ebx,nb410nf_is3(%esp)            ## store is3
1363
1364         movl  nb410nf_shiftvec(%ebp),%eax     ## eax = base of shiftvec[]
1365
1366         movss (%eax,%ebx,4),%xmm0       ## xmm0-xmm2 = shift vector x,y,z
1367         movss 4(%eax,%ebx,4),%xmm1
1368         movss 8(%eax,%ebx,4),%xmm2
1369
1370         movl  nb410nf_iinr(%ebp),%ecx         ## ecx = pointer into iinr[]
1371         movl  (%ecx,%esi,4),%ebx            ## ebx =ii
1372
1373         movl  nb410nf_charge(%ebp),%edx
1374         movss (%edx,%ebx,4),%xmm3
1375         mulss nb410nf_facel(%esp),%xmm3 ## iq = facel*charge[ii]
1376         shufps $0,%xmm3,%xmm3           ## splat to all four lanes
1377
1378         movl  nb410nf_invsqrta(%ebp),%edx       ## load invsqrta[ii]
1379         movss (%edx,%ebx,4),%xmm4
1380         shufps $0,%xmm4,%xmm4
1381
1382         movl  nb410nf_type(%ebp),%edx
1383         movl  (%edx,%ebx,4),%edx
1384         imull nb410nf_ntype(%esp),%edx
1385         shll  %edx
1386         movl  %edx,nb410nf_ntia(%esp)   ## ntia = 2*ntype*type[ii]
1387
1388         leal  (%ebx,%ebx,2),%ebx        ## ebx = 3*ii=ii3
1389         movl  nb410nf_pos(%ebp),%eax      ## eax = base of pos[]
1390
1391         addss (%eax,%ebx,4),%xmm0       ## i position = pos[ii3..] + shift vector
1392         addss 4(%eax,%ebx,4),%xmm1
1393         addss 8(%eax,%ebx,4),%xmm2
1394
1395         movaps %xmm3,nb410nf_iq(%esp)
1396         movaps %xmm4,nb410nf_isai(%esp)
1397
1398         shufps $0,%xmm0,%xmm0           ## splat ix, iy, iz
1399         shufps $0,%xmm1,%xmm1
1400         shufps $0,%xmm2,%xmm2
1401
1402         movaps %xmm0,nb410nf_ix(%esp)
1403         movaps %xmm1,nb410nf_iy(%esp)
1404         movaps %xmm2,nb410nf_iz(%esp)
1405
1406         movl  %ebx,nb410nf_ii3(%esp)
1407
1408         ## clear vctot and Vvdwtot
1409         xorps %xmm4,%xmm4
1410         movaps %xmm4,nb410nf_vctot(%esp)
1411         movaps %xmm4,nb410nf_Vvdwtot(%esp)
1412
1413         movl  nb410nf_jindex(%ebp),%eax
1414         movl  (%eax,%esi,4),%ecx             ## jindex[n]
1415         movl  4(%eax,%esi,4),%edx            ## jindex[n+1]
1416         subl  %ecx,%edx              ## number of innerloop atoms
1417
1418         movl  nb410nf_pos(%ebp),%esi
1419         movl  nb410nf_faction(%ebp),%edi
1420         movl  nb410nf_jjnr(%ebp),%eax
1421         shll  $2,%ecx                ## byte offset of jjnr[jindex[n]]
1422         addl  %ecx,%eax
1423         movl  %eax,nb410nf_innerjjnr(%esp)       ## pointer to jjnr[nj0]
1424         movl  %edx,%ecx
1425         subl  $4,%edx                ## edx = count - 4
1426         addl  nb410nf_ninner(%esp),%ecx
1427         movl  %ecx,nb410nf_ninner(%esp)  ## ninner += count
1428         addl  $0,%edx                ## set the flags from edx again (the addl to ecx above overwrote them)
1429         movl  %edx,nb410nf_innerk(%esp)      ## innerk = number of innerloop atoms - 4
1430         jge   _nb_kernel410nf_ia32_sse.nb410nf_unroll_loop      ## at least four j atoms: unrolled loop
1431         jmp   _nb_kernel410nf_ia32_sse.nb410nf_finish_inner
1432 _nb_kernel410nf_ia32_sse.nb410nf_unroll_loop:   ## four j atoms per pass; only vctot and Vvdwtot are accumulated
1433         ## quad-unroll innerloop here
1434         movl  nb410nf_innerjjnr(%esp),%edx       ## pointer to jjnr[k]
1435         movl  (%edx),%eax
1436         movl  4(%edx),%ebx
1437         movl  8(%edx),%ecx
1438         movl  12(%edx),%edx           ## eax-edx=jnr1-4
1439         addl $16,nb410nf_innerjjnr(%esp)             ## advance pointer (unrolled 4)
1440
1441         ## load isa2
1442         movl nb410nf_invsqrta(%ebp),%esi
1443         movss (%esi,%eax,4),%xmm3
1444         movss (%esi,%ecx,4),%xmm4
1445         movss (%esi,%ebx,4),%xmm6
1446         movss (%esi,%edx,4),%xmm7
1447         movaps nb410nf_isai(%esp),%xmm2
1448         shufps $0,%xmm6,%xmm3
1449         shufps $0,%xmm7,%xmm4
1450         shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# invsqrta[jnr1-4] in xmm3
1451         mulps  %xmm3,%xmm2
1452
1453         movaps %xmm2,nb410nf_isaprod(%esp)      ## isaprod = isai*isaj
1454         movaps %xmm2,%xmm1
1455         mulps nb410nf_gbtsc(%esp),%xmm1
1456         movaps %xmm1,nb410nf_gbscale(%esp)      ## gbscale = isaprod*gbtsc
1457
1458         movl nb410nf_charge(%ebp),%esi     ## base of charge[]
1459
1460         movss (%esi,%eax,4),%xmm3
1461         movss (%esi,%ecx,4),%xmm4
1462         movss (%esi,%ebx,4),%xmm6
1463         movss (%esi,%edx,4),%xmm7
1464
1465         mulps nb410nf_iq(%esp),%xmm2
1466         shufps $0,%xmm6,%xmm3
1467         shufps $0,%xmm7,%xmm4
1468         shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all charges in xmm3
1469         mulps  %xmm2,%xmm3
1470         movaps %xmm3,nb410nf_qq(%esp)           ## qq = iq*isaprod*charge[j]
1471
1472         movd %eax,%mm0                          ## park jnr1-4 in mm0-mm3 while eax-edx hold type indices
1473         movd %ebx,%mm1
1474         movd %ecx,%mm2
1475         movd %edx,%mm3
1476
1477         movl nb410nf_type(%ebp),%esi
1478         movl (%esi,%eax,4),%eax
1479         movl (%esi,%ebx,4),%ebx
1480         movl (%esi,%ecx,4),%ecx
1481         movl (%esi,%edx,4),%edx
1482         movl nb410nf_vdwparam(%ebp),%esi
1483         shll %eax
1484         shll %ebx
1485         shll %ecx
1486         shll %edx
1487         movl nb410nf_ntia(%esp),%edi
1488         addl %edi,%eax                          ## eax-edx = ntia + 2*type[j]
1489         addl %edi,%ebx
1490         addl %edi,%ecx
1491         addl %edi,%edx
1492
1493         movlps (%esi,%eax,4),%xmm6
1494         movlps (%esi,%ecx,4),%xmm7
1495         movhps (%esi,%ebx,4),%xmm6
1496         movhps (%esi,%edx,4),%xmm7
1497
1498         movaps %xmm6,%xmm4
1499         shufps $136,%xmm7,%xmm4 ## constant 10001000: first float of each pair (stored as c6)
1500         shufps $221,%xmm7,%xmm6 ## constant 11011101: second float of each pair (stored as c12)
1501
1502         movd  %mm0,%eax                         ## eax-edx = jnr1-4 again
1503         movd  %mm1,%ebx
1504         movd  %mm2,%ecx
1505         movd  %mm3,%edx
1506
1507         movaps %xmm4,nb410nf_c6(%esp)
1508         movaps %xmm6,nb410nf_c12(%esp)
1509
1510         movl nb410nf_pos(%ebp),%esi        ## base of pos[]
1511
1512         leal  (%eax,%eax,2),%eax     ## replace jnr with j3
1513         leal  (%ebx,%ebx,2),%ebx
1514
1515         leal  (%ecx,%ecx,2),%ecx     ## replace jnr with j3
1516         leal  (%edx,%edx,2),%edx
1517
1518         ## move four coordinates to xmm0-xmm2
1519
1520         movlps (%esi,%eax,4),%xmm4      ## x,y pairs of j1/j2 in xmm4, j3/j4 in xmm5
1521         movlps (%esi,%ecx,4),%xmm5
1522         movss 8(%esi,%eax,4),%xmm2
1523         movss 8(%esi,%ecx,4),%xmm6
1524
1525         movhps (%esi,%ebx,4),%xmm4
1526         movhps (%esi,%edx,4),%xmm5
1527
1528         movss 8(%esi,%ebx,4),%xmm0
1529         movss 8(%esi,%edx,4),%xmm1
1530
1531         shufps $0,%xmm0,%xmm2
1532         shufps $0,%xmm1,%xmm6
1533
1534         movaps %xmm4,%xmm0
1535         movaps %xmm4,%xmm1
1536
1537         shufps $136,%xmm6,%xmm2 ## constant 10001000: xmm2 = z of j1-j4
1538
1539         shufps $136,%xmm5,%xmm0 ## constant 10001000: xmm0 = x of j1-j4
1540         shufps $221,%xmm5,%xmm1 ## constant 11011101: xmm1 = y of j1-j4
1541
1542         ## move ix-iz to xmm4-xmm6
1543         movaps nb410nf_ix(%esp),%xmm4
1544         movaps nb410nf_iy(%esp),%xmm5
1545         movaps nb410nf_iz(%esp),%xmm6
1546
1547         ## calc dr
1548         subps %xmm0,%xmm4
1549         subps %xmm1,%xmm5
1550         subps %xmm2,%xmm6
1551
1552         ## square it
1553         mulps %xmm4,%xmm4
1554         mulps %xmm5,%xmm5
1555         mulps %xmm6,%xmm6
1556         addps %xmm5,%xmm4
1557         addps %xmm6,%xmm4
1558         ## rsq in xmm4
1559
1560         rsqrtps %xmm4,%xmm5
1561         ## lookup seed in xmm5; refined below as rinv = half*lu*(three-rsq*lu*lu)
1562         movaps %xmm5,%xmm2
1563         mulps %xmm5,%xmm5
1564         movaps nb410nf_three(%esp),%xmm1
1565         mulps %xmm4,%xmm5       ## rsq*lu*lu
1566         movaps nb410nf_half(%esp),%xmm0
1567         subps %xmm5,%xmm1       ## three-rsq*lu*lu
1568         mulps %xmm2,%xmm1
1569         mulps %xmm1,%xmm0       ## xmm0=rinv
1570         mulps %xmm0,%xmm4       ## xmm4=r
1571         mulps nb410nf_gbscale(%esp),%xmm4       ## xmm4 = r*gbscale (table coordinate)
1572
1573         movhlps %xmm4,%xmm5
1574         cvttps2pi %xmm4,%mm6
1575         cvttps2pi %xmm5,%mm7    ## mm6/mm7 contain lu indices
1576         cvtpi2ps %mm6,%xmm6
1577         cvtpi2ps %mm7,%xmm5
1578         movlhps %xmm5,%xmm6     ## xmm6 = the four truncated indices as floats
1579         subps %xmm6,%xmm4
1580         movaps %xmm4,%xmm1      ## xmm1=eps
1581         movaps %xmm1,%xmm2
1582         mulps  %xmm2,%xmm2      ## xmm2=eps2
1583         pslld $2,%mm6           ## four floats per table point
1584         pslld $2,%mm7
1585
1586         movd %eax,%mm0          ## j3 values parked in mm0-mm3; nothing later in this loop body reads them back
1587         movd %ebx,%mm1
1588         movd %ecx,%mm2
1589         movd %edx,%mm3
1590
1591         movl nb410nf_GBtab(%ebp),%esi
1592         movd %mm6,%eax          ## eax-edx = table offsets for j1-j4
1593         psrlq $32,%mm6
1594         movd %mm7,%ecx
1595         psrlq $32,%mm7
1596         movd %mm6,%ebx
1597         movd %mm7,%edx
1598
1599         ## load coulomb table
1600         movaps (%esi,%eax,4),%xmm4
1601         movaps (%esi,%ebx,4),%xmm5
1602         movaps (%esi,%ecx,4),%xmm6
1603         movaps (%esi,%edx,4),%xmm7
1604         ## transpose, using xmm3 for scratch
1605         movaps %xmm6,%xmm3
1606         shufps $0xEE,%xmm7,%xmm3
1607         shufps $0x44,%xmm7,%xmm6
1608         movaps %xmm4,%xmm7
1609         shufps $0xEE,%xmm5,%xmm7
1610         shufps $0x44,%xmm5,%xmm4
1611         movaps %xmm4,%xmm5
1612         shufps $0xDD,%xmm6,%xmm5
1613         shufps $0x88,%xmm6,%xmm4
1614         movaps %xmm7,%xmm6
1615         shufps $0x88,%xmm3,%xmm6
1616         shufps $0xDD,%xmm3,%xmm7
1617         ## coulomb table ready, in xmm4-xmm7 (first..fourth float of each point, one j atom per lane)
1618         mulps  %xmm1,%xmm6      ## xmm6=Geps
1619         mulps  %xmm2,%xmm7      ## xmm7=Heps2
1620
1621         addps  %xmm6,%xmm5
1622         addps  %xmm7,%xmm5      ## xmm5=Fp
1623         movaps nb410nf_qq(%esp),%xmm3
1624         mulps  %xmm1,%xmm5 ## xmm5=eps*Fp
1625         addps  %xmm4,%xmm5 ## xmm5=VV
1626         mulps  %xmm3,%xmm5 ## vcoul=qq*VV
1627         ## update vctot
1628         addps  nb410nf_vctot(%esp),%xmm5
1629         movaps %xmm5,nb410nf_vctot(%esp)
1630
1631         ## L-J
1632         movaps %xmm0,%xmm4
1633         mulps  %xmm0,%xmm4      ## xmm4=rinvsq
1634
1635         movaps %xmm4,%xmm6
1636         mulps  %xmm4,%xmm6
1637
1638         mulps  %xmm4,%xmm6      ## xmm6=rinvsix
1639         movaps %xmm6,%xmm4
1640         mulps  %xmm4,%xmm4      ## xmm4=rinvtwelve
1641         mulps  nb410nf_c6(%esp),%xmm6
1642         mulps  nb410nf_c12(%esp),%xmm4
1643         movaps nb410nf_Vvdwtot(%esp),%xmm7
1644         addps  %xmm4,%xmm7
1645         subps  %xmm6,%xmm7
1646         movaps %xmm7,nb410nf_Vvdwtot(%esp)      ## Vvdwtot += c12*rinvtwelve - c6*rinvsix
1647
1648         ## should we do one more iteration?
1649         subl $4,nb410nf_innerk(%esp)
1650         jl    _nb_kernel410nf_ia32_sse.nb410nf_finish_inner     ## fewer than four left
1651         jmp   _nb_kernel410nf_ia32_sse.nb410nf_unroll_loop
1652 _nb_kernel410nf_ia32_sse.nb410nf_finish_inner:  ## remainder handling after (or instead of) the unrolled loop
1653         ## check if at least two particles remain
1654         addl $4,nb410nf_innerk(%esp)    ## undo the bias of 4: innerk = number of j atoms still to do
1655         movl  nb410nf_innerk(%esp),%edx
1656         andl  $2,%edx                   ## bit 1 set: a pair remains
1657         jnz   _nb_kernel410nf_ia32_sse.nb410nf_dopair
1658         jmp   _nb_kernel410nf_ia32_sse.nb410nf_checksingle
1659 _nb_kernel410nf_ia32_sse.nb410nf_dopair:        ## two j atoms in lanes 0,1; lanes 2,3 must contribute nothing
1660         movl  nb410nf_innerjjnr(%esp),%ecx
1661         movl  (%ecx),%eax
1662         movl  4(%ecx),%ebx              ## eax,ebx = jnr1,jnr2
1663         addl $8,nb410nf_innerjjnr(%esp) ## advance pointer past the pair
1664
1665         xorps %xmm2,%xmm2
1666         movaps %xmm2,%xmm6
1667
1668         ## load isa2
1669         movl nb410nf_invsqrta(%ebp),%esi
1670         movss (%esi,%eax,4),%xmm2
1671         movss (%esi,%ebx,4),%xmm3
1672         unpcklps %xmm3,%xmm2    ## isa2 in xmm2(0,1), lanes 2,3 are zero
1673         mulps  nb410nf_isai(%esp),%xmm2
1674         movaps %xmm2,nb410nf_isaprod(%esp)
1675         movaps %xmm2,%xmm1
1676         mulps nb410nf_gbtsc(%esp),%xmm1
1677         movaps %xmm1,nb410nf_gbscale(%esp)
1678
1679         movl nb410nf_charge(%ebp),%esi     ## base of charge[]
1680         movss (%esi,%eax,4),%xmm3
1681         movss (%esi,%ebx,4),%xmm6
1682         unpcklps %xmm6,%xmm3    ## xmm3(0,1) has the charges, lanes 2,3 are zero
1683
1684         mulps  nb410nf_iq(%esp),%xmm2
1685         mulps  %xmm2,%xmm3
1686         movaps %xmm3,nb410nf_qq(%esp)   ## qq = iq*isaprod*charge[j]
1687
1688         movl nb410nf_type(%ebp),%esi
1689         movl  %eax,%ecx
1690         movl  %ebx,%edx
1691         movl (%esi,%ecx,4),%ecx
1692         movl (%esi,%edx,4),%edx
1693         movl nb410nf_vdwparam(%ebp),%esi
1694         shll %ecx
1695         shll %edx
1696         movl nb410nf_ntia(%esp),%edi
1697         addl %edi,%ecx                  ## ecx,edx = ntia + 2*type[j]
1698         addl %edi,%edx
1699         movlps (%esi,%ecx,4),%xmm6
1700         movhps (%esi,%edx,4),%xmm6      ## xmm6 = (c6_1, c12_1, c6_2, c12_2)
1701         movl nb410nf_pos(%ebp),%edi
1702         xorps  %xmm7,%xmm7      ## FIX: xmm7 must be zero for the movlhps below. It was only cleared further down, so on entry from the unrolled loop it still held Vvdwtot and lanes 2,3 of c6/c12 became garbage that multiplies the (valid, duplicated) distances in those lanes
1703         movaps %xmm6,%xmm4
1704         shufps $8,%xmm4,%xmm4 ## constant 00001000: lanes 0,1 = c6_1, c6_2
1705         shufps $13,%xmm6,%xmm6 ## constant 00001101: lanes 0,1 = c12_1, c12_2
1706         movlhps %xmm7,%xmm4     ## clear lanes 2,3 of c6
1707         movlhps %xmm7,%xmm6     ## clear lanes 2,3 of c12
1708
1709         movaps %xmm4,nb410nf_c6(%esp)
1710         movaps %xmm6,nb410nf_c12(%esp)
1711
1712         leal  (%eax,%eax,2),%eax        ## replace jnr with j3
1713         leal  (%ebx,%ebx,2),%ebx
1714         ## move coordinates to xmm0-xmm2
1715         movlps (%edi,%eax,4),%xmm1
1716         movss 8(%edi,%eax,4),%xmm2
1717         movhps (%edi,%ebx,4),%xmm1      ## xmm1 = (x1, y1, x2, y2)
1718         movss 8(%edi,%ebx,4),%xmm0
1719
1720         movlhps %xmm7,%xmm3     ## no lasting effect: xmm3 is reloaded from qq before its next use
1721
1722         shufps $0,%xmm0,%xmm2
1723
1724         movaps %xmm1,%xmm0
1725
1726         shufps $136,%xmm2,%xmm2 ## constant 10001000: xmm2 = (z1, z2, z1, z2)
1727
1728         shufps $136,%xmm0,%xmm0 ## constant 10001000: xmm0 = (x1, x2, x1, x2)
1729         shufps $221,%xmm1,%xmm1 ## constant 11011101: xmm1 = (y1, y2, y1, y2)
1730
1731         movl   nb410nf_faction(%ebp),%edi
1732         ## move ix-iz to xmm4-xmm6
1733         xorps   %xmm7,%xmm7     ## kept from the original; xmm7 is already zero here
1734
1735         movaps nb410nf_ix(%esp),%xmm4
1736         movaps nb410nf_iy(%esp),%xmm5
1737         movaps nb410nf_iz(%esp),%xmm6
1738
1739         ## calc dr
1740         subps %xmm0,%xmm4
1741         subps %xmm1,%xmm5
1742         subps %xmm2,%xmm6
1743
1744         ## square it
1745         mulps %xmm4,%xmm4
1746         mulps %xmm5,%xmm5
1747         mulps %xmm6,%xmm6
1748         addps %xmm5,%xmm4
1749         addps %xmm6,%xmm4
1750         ## rsq in xmm4
1751
1752         rsqrtps %xmm4,%xmm5
1753         ## lookup seed in xmm5; refined below as rinv = half*lu*(three-rsq*lu*lu)
1754         movaps %xmm5,%xmm2
1755         mulps %xmm5,%xmm5
1756         movaps nb410nf_three(%esp),%xmm1
1757         mulps %xmm4,%xmm5       ## rsq*lu*lu
1758         movaps nb410nf_half(%esp),%xmm0
1759         subps %xmm5,%xmm1       ## three-rsq*lu*lu
1760         mulps %xmm2,%xmm1
1761         mulps %xmm1,%xmm0       ## xmm0=rinv
1762         mulps %xmm0,%xmm4       ## xmm4=r
1763         mulps nb410nf_gbscale(%esp),%xmm4       ## xmm4 = r*gbscale (table coordinate)
1764
1765         cvttps2pi %xmm4,%mm6    ## mm6 contain lu indices
1766         cvtpi2ps %mm6,%xmm6
1767         subps %xmm6,%xmm4
1768         movaps %xmm4,%xmm1      ## xmm1=eps
1769         movaps %xmm1,%xmm2
1770         mulps  %xmm2,%xmm2      ## xmm2=eps2
1771
1772         pslld $2,%mm6           ## four floats per table point
1773
1774         movl nb410nf_GBtab(%ebp),%esi
1775         movd %mm6,%ecx
1776         psrlq $32,%mm6
1777         movd %mm6,%edx          ## ecx,edx = table offsets for j1,j2
1778
1779         ## load coulomb table
1780         movaps (%esi,%ecx,4),%xmm4
1781         movaps (%esi,%edx,4),%xmm7
1782         ## transpose
1783         movaps %xmm4,%xmm6
1784         unpcklps %xmm7,%xmm4    ## Y1 Y2 F1 F2
1785         unpckhps %xmm7,%xmm6    ## G1 G2 H1 H2
1786         movhlps  %xmm4,%xmm5    ## F1 F2
1787         movhlps  %xmm6,%xmm7    ## H1 H2
1788         ## coulomb table ready, in lanes 0,1 of xmm4-xmm7
1789
1790         mulps  %xmm1,%xmm6      ## xmm6=Geps
1791         mulps  %xmm2,%xmm7      ## xmm7=Heps2
1792         addps  %xmm6,%xmm5
1793         addps  %xmm7,%xmm5      ## xmm5=Fp
1794         movaps nb410nf_qq(%esp),%xmm3
1795         mulps  %xmm1,%xmm5 ## xmm5=eps*Fp
1796         addps  %xmm4,%xmm5 ## xmm5=VV
1797         mulps  %xmm3,%xmm5 ## vcoul=qq*VV
1798
1799         addps  nb410nf_vctot(%esp),%xmm5        ## update vctot
1800         movaps %xmm5,nb410nf_vctot(%esp)
1801
1802         ## L-J
1803         movaps %xmm0,%xmm4
1804         mulps  %xmm0,%xmm4      ## xmm4=rinvsq
1805
1806         ## vctot has already been updated above; only the
1807         ## Lennard-Jones energy remains. All four lanes are added to
1808         ## Vvdwtot, which is why lanes 2,3 of c6/c12 have to be zero.
1809
1810         movaps %xmm4,%xmm6
1811         mulps  %xmm4,%xmm6
1812
1813         mulps  %xmm4,%xmm6      ## xmm6=rinvsix
1814         movaps %xmm6,%xmm4
1815         mulps  %xmm4,%xmm4      ## xmm4=rinvtwelve
1816         mulps  nb410nf_c6(%esp),%xmm6
1817         mulps  nb410nf_c12(%esp),%xmm4
1818         movaps nb410nf_Vvdwtot(%esp),%xmm7
1819         addps  %xmm4,%xmm7
1820         subps  %xmm6,%xmm7
1821         movaps %xmm7,nb410nf_Vvdwtot(%esp)      ## Vvdwtot += c12*rinvtwelve - c6*rinvsix
1822
1823 _nb_kernel410nf_ia32_sse.nb410nf_checksingle:   ## is one last j atom left over?
1824         movl  nb410nf_innerk(%esp),%edx
1825         andl  $1,%edx                           ## keep only bit 0 of innerk (also sets ZF for the branch)
1826         jnz    _nb_kernel410nf_ia32_sse.nb410nf_dosingle        ## bit 0 set: handle one j atom in scalar code
1827         jmp    _nb_kernel410nf_ia32_sse.nb410nf_updateouterdata ## bit 0 clear: go to the per-i-atom write-back
1828 _nb_kernel410nf_ia32_sse.nb410nf_dosingle:       ## scalar (lane 0 only) energy evaluation for a single j atom; adds to vctot and Vvdwtot
1829         movl nb410nf_charge(%ebp),%esi  ## esi = charge[] base
1830         movl nb410nf_invsqrta(%ebp),%edx        ## edx = invsqrta[] base
1831         movl nb410nf_pos(%ebp),%edi     ## edi = pos[] base
1832         movl  nb410nf_innerjjnr(%esp),%ecx
1833         movl  (%ecx),%eax       ## eax = j atom index read through innerjjnr
1834         xorps  %xmm2,%xmm2
1835         movaps %xmm2,%xmm6      ## xmm2 = xmm6 = 0 in all lanes
1836         movss (%edx,%eax,4),%xmm2       ## isa2 = invsqrta[j]
1837         mulss nb410nf_isai(%esp),%xmm2
1838         movss %xmm2,nb410nf_isaprod(%esp)       ## isaprod = isai*isa2
1839         movss %xmm2,%xmm1
1840         mulss nb410nf_gbtsc(%esp),%xmm1
1841         movss %xmm1,nb410nf_gbscale(%esp)       ## gbscale = isaprod*gbtsc
1842
1843         mulss  nb410nf_iq(%esp),%xmm2   ## xmm2 = iq*isaprod
1844         movss (%esi,%eax,4),%xmm6       ## xmm6(0) has the charge
1845         mulss  %xmm2,%xmm6
1846         movss %xmm6,nb410nf_qq(%esp)    ## qq = iq*isaprod*charge[j]
1847
1848         movl nb410nf_type(%ebp),%esi
1849         movl %eax,%ecx
1850         movl (%esi,%ecx,4),%ecx ## ecx = type[j]
1851         movl nb410nf_vdwparam(%ebp),%esi
1852         shll %ecx               ## ecx = 2*type[j]
1853         addl nb410nf_ntia(%esp),%ecx    ## ecx = ntia + 2*type[j]
1854         movlps (%esi,%ecx,4),%xmm6      ## two consecutive floats from vdwparam[] into lanes 0-1
1855         movaps %xmm6,%xmm4
1856         shufps $252,%xmm4,%xmm4 ## constant 11111100: lane 0 keeps element 0
1857         shufps $253,%xmm6,%xmm6 ## constant 11111101: lane 0 takes element 1
1858
1859         movaps %xmm4,nb410nf_c6(%esp)   ## c6 slot, value in lane 0
1860         movaps %xmm6,nb410nf_c12(%esp)  ## c12 slot, value in lane 0
1861
1862         leal  (%eax,%eax,2),%eax        ## eax = 3*j, index of the x coordinate in pos[]
1863
1864         ## move coordinates to xmm0-xmm2
1865         movss (%edi,%eax,4),%xmm0
1866         movss 4(%edi,%eax,4),%xmm1
1867         movss 8(%edi,%eax,4),%xmm2
1868
1869         movaps nb410nf_ix(%esp),%xmm4
1870         movaps nb410nf_iy(%esp),%xmm5
1871         movaps nb410nf_iz(%esp),%xmm6
1872
1873         ## calc dr
1874         subss %xmm0,%xmm4
1875         subss %xmm1,%xmm5
1876         subss %xmm2,%xmm6
1877
1878         ## square it
1879         mulss %xmm4,%xmm4
1880         mulss %xmm5,%xmm5
1881         mulss %xmm6,%xmm6
1882         addss %xmm5,%xmm4
1883         addss %xmm6,%xmm4
1884         ## rsq in xmm4
1885
1886         rsqrtss %xmm4,%xmm5
1887         ## lookup seed in xmm5; refined below as rinv = half*lu*(three - rsq*lu*lu)
1888         movaps %xmm5,%xmm2
1889         mulss %xmm5,%xmm5
1890         movss nb410nf_three(%esp),%xmm1
1891         mulss %xmm4,%xmm5       ## rsq*lu*lu
1892         movss nb410nf_half(%esp),%xmm0
1893         subss %xmm5,%xmm1       ## three-rsq*lu*lu
1894         mulss %xmm2,%xmm1
1895         mulss %xmm1,%xmm0       ## xmm0=rinv
1896
1897         mulss %xmm0,%xmm4       ## xmm4=r
1898         mulss nb410nf_gbscale(%esp),%xmm4       ## xmm4 = r*gbscale
1899
1900         cvttss2si %xmm4,%ebx    ## ebx = truncated table index
1901         cvtsi2ss %ebx,%xmm6
1902         subss %xmm6,%xmm4       ## fractional part
1903         movaps %xmm4,%xmm1      ## xmm1=eps
1904         movaps %xmm1,%xmm2
1905         mulss  %xmm2,%xmm2      ## xmm2=eps2
1906
1907         shll $2,%ebx            ## four floats per table point
1908         movl nb410nf_GBtab(%ebp),%esi
1909
1910         movaps (%esi,%ebx,4),%xmm4      ## the four table values of this point; lane 0 is used as Y
1911         movhlps %xmm4,%xmm6     ## xmm6 lanes 0-1 = table values 2-3
1912         movaps %xmm4,%xmm5
1913         movaps %xmm6,%xmm7
1914         shufps $1,%xmm5,%xmm5   ## lane 0 = table value 1 (F)
1915         shufps $1,%xmm7,%xmm7   ## lane 0 = table value 3 (H)
1916         ## table ready in xmm4-xmm7
1917
1918         mulss  %xmm1,%xmm6      ## xmm6=Geps
1919         mulss  %xmm2,%xmm7      ## xmm7=Heps2
1920         addss  %xmm6,%xmm5
1921         addss  %xmm7,%xmm5      ## xmm5=Fp
1922         movss nb410nf_qq(%esp),%xmm3
1923         mulss  %xmm1,%xmm5 ## xmm5=eps*Fp
1924         addss  %xmm4,%xmm5 ## xmm5=VV
1925         mulss  %xmm3,%xmm5 ## vcoul=qq*VV
1926         addss  nb410nf_vctot(%esp),%xmm5
1927         movss %xmm5,nb410nf_vctot(%esp) ## only lane 0 of vctot is updated
1928
1929         ## L-J
1930         movaps %xmm0,%xmm4
1931         mulss  %xmm0,%xmm4      ## xmm4=rinvsq
1932
1933         movaps %xmm4,%xmm6
1934         mulss  %xmm4,%xmm6
1935
1936         mulss  %xmm4,%xmm6      ## xmm6=rinvsix
1937         movaps %xmm6,%xmm4
1938         mulss  %xmm4,%xmm4      ## xmm4=rinvtwelve
1939         mulss  nb410nf_c6(%esp),%xmm6   ## xmm6 = c6*rinvsix
1940         mulss  nb410nf_c12(%esp),%xmm4  ## xmm4 = c12*rinvtwelve
1941         movss nb410nf_Vvdwtot(%esp),%xmm7
1942         addss  %xmm4,%xmm7      ## scalar add: only lane 0 is loaded above and stored below
1943         subss  %xmm6,%xmm7      ## lane 0 = Vvdwtot + c12*rinvtwelve - c6*rinvsix
1944         movss %xmm7,nb410nf_Vvdwtot(%esp)
1945
1946 _nb_kernel410nf_ia32_sse.nb410nf_updateouterdata:        ## sum the four lanes of vctot and Vvdwtot and add them to Vc[ggid] and Vvdw[ggid]
1947         ## get n from stack
1948         movl nb410nf_n(%esp),%esi
1949         ## get group index for i particle
1950         movl  nb410nf_gid(%ebp),%edx            ## base of gid[]
1951         movl  (%edx,%esi,4),%edx                ## ggid=gid[n]
1952
1953         ## accumulate total potential energy and update it
1954         movaps nb410nf_vctot(%esp),%xmm7
1955         ## horizontal sum of the four lanes
1956         movhlps %xmm7,%xmm6     ## xmm6 lanes 0-1 = xmm7 lanes 2-3
1957         addps  %xmm6,%xmm7      ## xmm7 lanes 0-1 = pairwise sums (0+2, 1+3)
1958         movaps %xmm7,%xmm6
1959         shufps $1,%xmm6,%xmm6   ## bring lane 1 down to lane 0
1960         addss  %xmm6,%xmm7      ## xmm7 lane 0 = sum of all four lanes
1961
1962         ## add earlier value from mem
1963         movl  nb410nf_Vc(%ebp),%eax
1964         addss (%eax,%edx,4),%xmm7
1965         ## move back to mem
1966         movss %xmm7,(%eax,%edx,4)       ## Vc[ggid] += summed vctot
1967
1968         ## accumulate total lj energy and update it
1969         movaps nb410nf_Vvdwtot(%esp),%xmm7
1970         ## horizontal sum of the four lanes, same sequence as for vctot
1971         movhlps %xmm7,%xmm6
1972         addps  %xmm6,%xmm7      ## xmm7 lanes 0-1 = pairwise sums (0+2, 1+3)
1973         movaps %xmm7,%xmm6
1974         shufps $1,%xmm6,%xmm6
1975         addss  %xmm6,%xmm7      ## xmm7 lane 0 = sum of all four lanes
1976
1977         ## add earlier value from mem
1978         movl  nb410nf_Vvdw(%ebp),%eax
1979         addss (%eax,%edx,4),%xmm7
1980         ## move back to mem
1981         movss %xmm7,(%eax,%edx,4)       ## Vvdw[ggid] += summed Vvdwtot
1982
1983         ## finish if last
1984         movl nb410nf_nn1(%esp),%ecx
1985         ## esi already loaded with n
1986         incl %esi               ## esi = n+1
1987         subl %esi,%ecx          ## ecx = nn1-(n+1); zero when the next index equals nn1
1988         jz _nb_kernel410nf_ia32_sse.nb410nf_outerend
1989
1990         ## not last, iterate outer loop once more!
1991         movl %esi,nb410nf_n(%esp)       ## store the incremented n for the next pass
1992         jmp _nb_kernel410nf_ia32_sse.nb410nf_outer
1993 _nb_kernel410nf_ia32_sse.nb410nf_outerend:
1994         ## decide whether another workunit has to be processed
1995         movl  nb410nf_nri(%esp),%ecx
1996         ## esi still holds the incremented n from the block above
1997         subl  %esi,%ecx         ## ecx = nri - n
1998         jnz   _nb_kernel410nf_ia32_sse.nb410nf_threadloop
1999         ## zero: n has reached nri, leave the kernel
2000         jmp   _nb_kernel410nf_ia32_sse.nb410nf_end
2001 _nb_kernel410nf_ia32_sse.nb410nf_end:    ## write the iteration counters through the argument pointers, unwind the stack and return
2002         emms                    ## NOTE(review): no MMX instructions appear in the visible part of this kernel - confirm whether emms is still needed
2003
2004         movl nb410nf_nouter(%esp),%eax  ## local counter (presumably outer iterations done)
2005         movl nb410nf_ninner(%esp),%ebx  ## local counter (presumably inner iterations done)
2006         movl nb410nf_outeriter(%ebp),%ecx       ## pointer taken from the ebp-relative slot
2007         movl nb410nf_inneriter(%ebp),%edx       ## pointer taken from the ebp-relative slot
2008         movl %eax,(%ecx)        ## *outeriter = nouter
2009         movl %ebx,(%edx)        ## *inneriter = ninner
2010
2011         movl nb410nf_salign(%esp),%eax
2012         addl %eax,%esp          ## add back the value saved in salign (presumably the stack alignment adjustment - verify against the prologue)
2013         addl $292,%esp          ## release the local frame; presumably mirrors a matching subl in the prologue - verify
2014         popl %edi               ## restore registers; presumably pushed in reverse order on entry
2015         popl %esi
2016         popl %edx
2017         popl %ecx
2018         popl %ebx
2019         popl %eax
2020         leave
2021         ret
2022