##
##
## Gromacs 4.0                         Copyright (c) 1991-2003
## David van der Spoel, Erik Lindahl
##
## This program is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License
## as published by the Free Software Foundation; either version 2
## of the License, or (at your option) any later version.
##
## To help us fund GROMACS development, we humbly ask that you cite
## the research papers on the package. Check out http://www.gromacs.org
##
## And Hey:
## Gnomes, ROck Monsters And Chili Sauce
.globl nb_kernel310_x86_64_sse
.globl _nb_kernel310_x86_64_sse
nb_kernel310_x86_64_sse:
_nb_kernel310_x86_64_sse:
## Room for return address and rbp (16 bytes)
.set nb310_fshift, 16
.set nb310_gid, 24
.set nb310_pos, 32
.set nb310_faction, 40
.set nb310_charge, 48
.set nb310_p_facel, 56
.set nb310_argkrf, 64
.set nb310_argcrf, 72
.set nb310_Vc, 80
.set nb310_type, 88
.set nb310_p_ntype, 96
.set nb310_vdwparam, 104
.set nb310_Vvdw, 112
.set nb310_p_tabscale, 120
.set nb310_VFtab, 128
.set nb310_invsqrta, 136
.set nb310_dvda, 144
.set nb310_p_gbtabscale, 152
.set nb310_GBtab, 160
.set nb310_p_nthreads, 168
.set nb310_count, 176
.set nb310_mtx, 184
.set nb310_outeriter, 192
.set nb310_inneriter, 200
.set nb310_work, 208
## stack offsets for local variables
## bottom of stack is cache-aligned for sse use
.set nb310_ix, 0
.set nb310_iy, 16
.set nb310_iz, 32
.set nb310_iq, 48
.set nb310_dx, 64
.set nb310_dy, 80
.set nb310_dz, 96
.set nb310_two, 112
.set nb310_six, 128
.set nb310_twelve, 144
.set nb310_tsc, 160
.set nb310_qq, 176
.set nb310_c6, 192
.set nb310_c12, 208
.set nb310_fscal, 224
.set nb310_vctot, 240
.set nb310_Vvdwtot, 256
.set nb310_fix, 272
.set nb310_fiy, 288
.set nb310_fiz, 304
.set nb310_half, 320
.set nb310_three, 336
.set nb310_nri, 352
.set nb310_iinr, 360
.set nb310_jindex, 368
.set nb310_jjnr, 376
.set nb310_shift, 384
.set nb310_shiftvec, 392
.set nb310_facel, 400
.set nb310_innerjjnr, 408
.set nb310_is3, 416
.set nb310_ii3, 420
.set nb310_ntia, 424
.set nb310_innerk, 428
.set nb310_n, 432
.set nb310_nn1, 436
.set nb310_ntype, 440
.set nb310_nouter, 444
.set nb310_ninner, 448

        push %rbp
        movq %rsp,%rbp
        push %rbx

        emms

        push %r12
        push %r13
        push %r14
        push %r15

        subq $472,%rsp          ## local variable stack space (n*16+8)

        ## zero 32-bit iteration counters
        movl $0,%eax
        movl %eax,nb310_nouter(%rsp)
        movl %eax,nb310_ninner(%rsp)

        movl (%rdi),%edi
        movl %edi,nb310_nri(%rsp)
        movq %rsi,nb310_iinr(%rsp)
        movq %rdx,nb310_jindex(%rsp)
        movq %rcx,nb310_jjnr(%rsp)
        movq %r8,nb310_shift(%rsp)
        movq %r9,nb310_shiftvec(%rsp)
        movq nb310_p_ntype(%rbp),%rdi
        movl (%rdi),%edi
        movl %edi,nb310_ntype(%rsp)
        movq nb310_p_facel(%rbp),%rsi
        movss (%rsi),%xmm0
        movss %xmm0,nb310_facel(%rsp)

        movq nb310_p_tabscale(%rbp),%rax
        movss (%rax),%xmm3
        shufps $0,%xmm3,%xmm3
        movaps %xmm3,nb310_tsc(%rsp)

        ## create constant floating-point factors on stack
        movl $0x3f000000,%eax   ## half in IEEE (hex)
        movl %eax,nb310_half(%rsp)
        movss nb310_half(%rsp),%xmm1
        shufps $0,%xmm1,%xmm1   ## splat to all elements
        movaps %xmm1,%xmm2
        addps %xmm2,%xmm2       ## one
        movaps %xmm2,%xmm3
        addps %xmm2,%xmm2       ## two
        addps %xmm2,%xmm3       ## three
        movaps %xmm3,%xmm4
        addps %xmm4,%xmm4       ## six
        movaps %xmm4,%xmm5
        addps %xmm5,%xmm5       ## twelve
        movaps %xmm1,nb310_half(%rsp)
        movaps %xmm2,nb310_two(%rsp)
        movaps %xmm3,nb310_three(%rsp)
        movaps %xmm4,nb310_six(%rsp)
        movaps %xmm5,nb310_twelve(%rsp)
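
## For reference, a minimal C-intrinsics sketch of the constant setup above
## (illustrative only; the variable names are not from the generated source):
##
##     #include <xmmintrin.h>
##     __m128 half   = _mm_set1_ps(0.5f);        /* splat 0x3f000000 */
##     __m128 one    = _mm_add_ps(half, half);
##     __m128 two    = _mm_add_ps(one, one);
##     __m128 three  = _mm_add_ps(one, two);
##     __m128 six    = _mm_add_ps(three, three);
##     __m128 twelve = _mm_add_ps(six, six);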
_nb_kernel310_x86_64_sse.nb310_threadloop:
        movq nb310_count(%rbp),%rsi     ## pointer to sync counter
        movl (%rsi),%eax
_nb_kernel310_x86_64_sse.nb310_spinlock:
        movl %eax,%ebx                  ## ebx=*count=nn0
        addl $1,%ebx                    ## ebx=nn1=nn0+1
        lock
        cmpxchgl %ebx,(%rsi)            ## write nn1 to *counter,
                                        ## if it hasn't changed,
                                        ## or reread *counter to eax.
        pause                           ## -> better p4 performance
        jnz _nb_kernel310_x86_64_sse.nb310_spinlock

        ## if(nn1>nri) nn1=nri
        movl nb310_nri(%rsp),%ecx
        movl %ecx,%edx
        subl %ebx,%ecx
        cmovlel %edx,%ebx               ## if(nn1>nri) nn1=nri
        ## Cleared the spinlock if we got here.
        ## eax contains nn0, ebx contains nn1.
        movl %eax,nb310_n(%rsp)
        movl %ebx,nb310_nn1(%rsp)
        subl %eax,%ebx                  ## calc number of outer lists
        movl %eax,%esi                  ## copy n to esi
        jg _nb_kernel310_x86_64_sse.nb310_outerstart
        jmp _nb_kernel310_x86_64_sse.nb310_end
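
## A rough C equivalent of the work-stealing synchronization above, using the
## GCC builtin (a sketch; the counter type and names are assumptions):
##
##     int nn0, nn1;
##     do {
##         nn0 = *count;                /* read the shared counter      */
##         nn1 = nn0 + 1;
##     } while (__sync_val_compare_and_swap(count, nn0, nn1) != nn0);
##     if (nn1 > nri) nn1 = nri;        /* clamp to the number of lists */
##     /* this thread now owns outer-loop indices [nn0, nn1)            */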
_nb_kernel310_x86_64_sse.nb310_outerstart:
        ## ebx contains number of outer iterations
        addl nb310_nouter(%rsp),%ebx
        movl %ebx,nb310_nouter(%rsp)

_nb_kernel310_x86_64_sse.nb310_outer:
        movq nb310_shift(%rsp),%rax     ## rax = pointer into shift[]
        movl (%rax,%rsi,4),%ebx         ## ebx=shift[n]

        lea (%rbx,%rbx,2),%rbx          ## rbx=3*is
        movl %ebx,nb310_is3(%rsp)       ## store is3

        movq nb310_shiftvec(%rsp),%rax  ## rax = base of shiftvec[]

        movss (%rax,%rbx,4),%xmm0
        movss 4(%rax,%rbx,4),%xmm1
        movss 8(%rax,%rbx,4),%xmm2

        movq nb310_iinr(%rsp),%rcx      ## rcx = pointer into iinr[]
        movl (%rcx,%rsi,4),%ebx         ## ebx =ii

        movq nb310_charge(%rbp),%rdx
        movss (%rdx,%rbx,4),%xmm3
        mulss nb310_facel(%rsp),%xmm3
        shufps $0,%xmm3,%xmm3

        movq nb310_type(%rbp),%rdx
        movl (%rdx,%rbx,4),%edx
        imull nb310_ntype(%rsp),%edx
        shll %edx
        movl %edx,nb310_ntia(%rsp)
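
## In C terms, the i-particle setup above amounts to (sketch; names are
## illustrative, not from the generated source):
##
##     float iq   = facel * charge[ii];     /* prescaled i charge       */
##     int   ntia = 2 * ntype * type[ii];   /* row offset into vdwparam */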
        lea (%rbx,%rbx,2),%rbx          ## rbx = 3*ii=ii3
        movq nb310_pos(%rbp),%rax       ## rax = base of pos[]

        addss (%rax,%rbx,4),%xmm0
        addss 4(%rax,%rbx,4),%xmm1
        addss 8(%rax,%rbx,4),%xmm2

        movaps %xmm3,nb310_iq(%rsp)

        shufps $0,%xmm0,%xmm0
        shufps $0,%xmm1,%xmm1
        shufps $0,%xmm2,%xmm2

        movaps %xmm0,nb310_ix(%rsp)
        movaps %xmm1,nb310_iy(%rsp)
        movaps %xmm2,nb310_iz(%rsp)

        movl %ebx,nb310_ii3(%rsp)

        ## clear vctot and i forces
        xorps %xmm15,%xmm15
        movaps %xmm15,nb310_vctot(%rsp)
        movaps %xmm15,nb310_Vvdwtot(%rsp)
        movaps %xmm15,%xmm14
        movaps %xmm15,%xmm13

        movq nb310_jindex(%rsp),%rax
        movl (%rax,%rsi,4),%ecx         ## jindex[n]
        movl 4(%rax,%rsi,4),%edx        ## jindex[n+1]
        subl %ecx,%edx                  ## number of innerloop atoms

        movq nb310_pos(%rbp),%rsi
        movq nb310_faction(%rbp),%rdi
        movq nb310_jjnr(%rsp),%rax
        shll $2,%ecx
        addq %rcx,%rax
        movq %rax,nb310_innerjjnr(%rsp) ## pointer to jjnr[nj0]
        movl %edx,%ecx
        subl $4,%edx
        addl nb310_ninner(%rsp),%ecx
        movl %ecx,nb310_ninner(%rsp)
        addl $0,%edx
        movl %edx,nb310_innerk(%rsp)    ## number of innerloop atoms
        jge _nb_kernel310_x86_64_sse.nb310_unroll_loop
        jmp _nb_kernel310_x86_64_sse.nb310_finish_inner
_nb_kernel310_x86_64_sse.nb310_unroll_loop:
        ## quad-unrolled innerloop here
        movq nb310_innerjjnr(%rsp),%rdx ## pointer to jjnr[k]
        movl (%rdx),%r8d
        movl 4(%rdx),%r9d
        movl 8(%rdx),%r10d
        movl 12(%rdx),%r11d             ## r8d-r11d=jnr1-4
        addq $16,nb310_innerjjnr(%rsp)  ## advance pointer (unrolled 4)

        lea (%r8,%r8,2),%rax            ## replace jnr with j3
        lea (%r9,%r9,2),%rbx

        lea (%r10,%r10,2),%rcx          ## replace jnr with j3
        lea (%r11,%r11,2),%rdx

        ## load coordinates
        movq nb310_pos(%rbp),%rdi

        movlps (%rdi,%rax,4),%xmm1      ## x1 y1 - -
        movlps (%rdi,%rbx,4),%xmm2      ## x2 y2 - -
        movlps (%rdi,%rcx,4),%xmm3      ## x3 y3 - -
        movlps (%rdi,%rdx,4),%xmm4      ## x4 y4 - -

        movss 8(%rdi,%rax,4),%xmm5      ## z1 - - -
        movss 8(%rdi,%rbx,4),%xmm6      ## z2 - - -
        movss 8(%rdi,%rcx,4),%xmm7      ## z3 - - -
        movss 8(%rdi,%rdx,4),%xmm8      ## z4 - - -

        unpcklps %xmm3,%xmm1            ## x1 x3 y1 y3
        unpcklps %xmm4,%xmm2            ## x2 x4 y2 y4
        unpcklps %xmm7,%xmm5            ## z1 z3 - -
        unpcklps %xmm8,%xmm6            ## z2 z4 - -
        movq nb310_charge(%rbp),%rsi

        movaps %xmm1,%xmm3

        unpcklps %xmm2,%xmm1            ## x1 x2 x3 x4
        unpckhps %xmm2,%xmm3            ## y1 y2 y3 y4
        unpcklps %xmm6,%xmm5            ## z1 z2 z3 z4
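
## The loads and unpacks above gather four j atoms from array-of-structures
## memory into structure-of-arrays registers; a scalar C sketch of the same
## gather (j3[k] = 3*jnr[k]; names illustrative):
##
##     for (int k = 0; k < 4; k++) {
##         jx[k] = pos[j3[k]];
##         jy[k] = pos[j3[k] + 1];
##         jz[k] = pos[j3[k] + 2];
##     }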
        ## calc dr
        subps nb310_ix(%rsp),%xmm1
        subps nb310_iy(%rsp),%xmm3
        subps nb310_iz(%rsp),%xmm5

        ## store dr in xmm9-xmm11
        movaps %xmm1,%xmm9
        movaps %xmm3,%xmm10
        movaps %xmm5,%xmm11

        movss (%rsi,%r8,4),%xmm0
        movss (%rsi,%r10,4),%xmm2
        movss (%rsi,%r9,4),%xmm6
        movss (%rsi,%r11,4),%xmm8

        ## square it
        mulps %xmm1,%xmm1
        mulps %xmm3,%xmm3
        mulps %xmm5,%xmm5
        addps %xmm1,%xmm3
        addps %xmm5,%xmm3
        ## rsq in xmm3
        movq nb310_type(%rbp),%rsi

        unpcklps %xmm2,%xmm0
        unpcklps %xmm8,%xmm6

        unpcklps %xmm6,%xmm0

        ## calculate rinv=1/sqrt(rsq)
        rsqrtps %xmm3,%xmm5
        movaps %xmm5,%xmm2
        mulps %xmm5,%xmm5
        movaps nb310_three(%rsp),%xmm1
        mulps %xmm3,%xmm5       ## rsq*lu*lu
        subps %xmm5,%xmm1       ## 3.0-rsq*lu*lu
        mulps %xmm2,%xmm1
        mulps nb310_half(%rsp),%xmm1
        ## xmm1=rinv
        ## xmm3=rsq
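
## The block above is one Newton-Raphson step on the rsqrtps seed, refining
## the ~12-bit estimate to ~24-bit 1/sqrt(rsq). A C-intrinsics sketch:
##
##     #include <xmmintrin.h>
##     __m128 lu   = _mm_rsqrt_ps(rsq);                 /* ~12-bit seed */
##     __m128 rinv = _mm_mul_ps(_mm_mul_ps(half, lu),
##                              _mm_sub_ps(three,
##                                  _mm_mul_ps(rsq, _mm_mul_ps(lu, lu))));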
        mulps nb310_iq(%rsp),%xmm0

        ## vdw types
        movl (%rsi,%r8,4),%r8d
        movl (%rsi,%r9,4),%r9d
        movl (%rsi,%r10,4),%r10d
        movl (%rsi,%r11,4),%r11d

        mulps %xmm1,%xmm3               ## r
        mulps nb310_tsc(%rsp),%xmm3     ## rtab
        movaps %xmm0,nb310_qq(%rsp)

        ## truncate and convert to integers
        cvttps2dq %xmm3,%xmm2

        shll %r8d
        shll %r9d
        shll %r10d
        shll %r11d

        ## convert back to float
        cvtdq2ps %xmm2,%xmm0

        movl nb310_ntia(%rsp),%edi
        addl %edi,%r8d
        addl %edi,%r9d
        addl %edi,%r10d
        addl %edi,%r11d

        ## multiply by 4
        pslld $2,%xmm2

        ## move to integer registers
        movhlps %xmm2,%xmm7
        movd %xmm2,%r12d
        movd %xmm7,%r14d
        pshufd $1,%xmm2,%xmm2
        pshufd $1,%xmm7,%xmm7
        movd %xmm2,%r13d
        movd %xmm7,%r15d

        ## calculate eps
        subps %xmm0,%xmm3
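
## The table-coordinate math above, in C terms (sketch): r*tabscale is split
## into an integer bin and a fractional part, with four floats per table point:
##
##     float rt  = r * tabscale;
##     int   n   = (int)rt;     /* cvttps2dq truncates toward zero */
##     float eps = rt - n;      /* 0 <= eps < 1                    */
##     int   ofs = 4 * n;       /* each point stores Y,F,G,H       */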
        movq nb310_vdwparam(%rbp),%rsi
        movlps (%rsi,%r8,4),%xmm7
        movlps (%rsi,%r10,4),%xmm8
        movhps (%rsi,%r9,4),%xmm7
        movhps (%rsi,%r11,4),%xmm8

        movaps %xmm7,%xmm12
        shufps $136,%xmm8,%xmm12        ## 10001000
        shufps $221,%xmm8,%xmm7         ## 11011101

        movaps %xmm12,nb310_c6(%rsp)
        movaps %xmm7,nb310_c12(%rsp)
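
## The vdw-parameter fetch above, written as C (sketch; names illustrative):
##
##     int   tj  = ntia + 2 * type[jnr];   /* pair index into vdwparam */
##     float c6  = vdwparam[tj];
##     float c12 = vdwparam[tj + 1];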
        movq nb310_VFtab(%rbp),%rsi
        ## load table data
        movlps (%rsi,%r12,4),%xmm5
        movlps (%rsi,%r14,4),%xmm7
        movhps (%rsi,%r13,4),%xmm5
        movhps (%rsi,%r15,4),%xmm7

        movaps %xmm5,%xmm4
        shufps $136,%xmm7,%xmm4         ## 10001000
        shufps $221,%xmm7,%xmm5         ## 11011101

        movaps %xmm1,%xmm0      ## rinv
        mulps %xmm0,%xmm0       ## rinvsq
        movaps %xmm0,%xmm2      ## rinvsq
        mulps %xmm2,%xmm2       ## rinv4
        mulps %xmm0,%xmm2       ## rinv6
        movaps %xmm2,%xmm12
        mulps %xmm12,%xmm12     ## rinv12

        movlps 8(%rsi,%r12,4),%xmm7
        movlps 8(%rsi,%r14,4),%xmm8
        movhps 8(%rsi,%r13,4),%xmm7
        movhps 8(%rsi,%r15,4),%xmm8

        movaps %xmm7,%xmm6

        mulps nb310_c6(%rsp),%xmm2      ## vvdw6=c6*rinv6
        mulps nb310_c12(%rsp),%xmm12    ## vvdw12=c12*rinv12

        movaps %xmm12,%xmm0
        subps %xmm2,%xmm12      ## Vvdw=Vvdw12-Vvdw6

        ## add potential to vvdwtot
        addps nb310_Vvdwtot(%rsp),%xmm12
        movaps %xmm12,nb310_Vvdwtot(%rsp)

        shufps $136,%xmm8,%xmm6         ## 10001000
        shufps $221,%xmm8,%xmm7         ## 11011101
        ## table data ready in xmm4-xmm7

        mulps %xmm3,%xmm7       ## Heps
        mulps %xmm3,%xmm6       ## Geps
        mulps %xmm3,%xmm7       ## Heps2

        addps %xmm6,%xmm5       ## F+Geps
        addps %xmm7,%xmm5       ## F+Geps+Heps2 = Fp
        addps %xmm7,%xmm7       ## 2*Heps2
        addps %xmm6,%xmm7       ## 2*Heps2+Geps
        addps %xmm5,%xmm7       ## FF = Fp + 2*Heps2 + Geps
        mulps %xmm3,%xmm5       ## eps*Fp
        addps %xmm4,%xmm5       ## VV
        mulps nb310_qq(%rsp),%xmm5      ## VV*qq=vcoul
        mulps nb310_qq(%rsp),%xmm7      ## FF*qq=fijC
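
## The cubic-spline evaluation above, written out in C (sketch): with table
## entries Y,F,G,H at offset ofs and fractional coordinate eps,
##
##     float Geps  = G * eps;
##     float Heps2 = H * eps * eps;
##     float Fp    = F + Geps + Heps2;
##     float VV    = Y + eps * Fp;              /* potential */
##     float FF    = Fp + Geps + 2.0f * Heps2;  /* dV/d(eps) */
##     float vcoul = qq * VV;
##     float fijC  = qq * FF;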
        ## LJ forces
        mulps nb310_six(%rsp),%xmm2
        mulps nb310_twelve(%rsp),%xmm0
        subps %xmm2,%xmm0
        mulps %xmm1,%xmm0       ## (12*vnb12-6*vnb6)*rinv

        ## add potential to vctot
        addps nb310_vctot(%rsp),%xmm5
        movaps %xmm5,nb310_vctot(%rsp)

        mulps nb310_tsc(%rsp),%xmm7
        subps %xmm7,%xmm0

        mulps %xmm1,%xmm0       ## fscal
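
## Combining the tabulated Coulomb force with the analytical LJ force, the
## scalar force factor computed above is, in C terms (sketch):
##
##     float fscal = ((12.0f * Vvdw12 - 6.0f * Vvdw6) * rinv
##                    - tabscale * fijC) * rinv;
##     /* then fjx += fscal*dx, fjy += fscal*dy, fjz += fscal*dz */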
        ## calculate scalar force by multiplying dx/dy/dz with fscal
        mulps %xmm0,%xmm9
        mulps %xmm0,%xmm10
        mulps %xmm0,%xmm11

        movq nb310_faction(%rbp),%rsi
        ## the fj's - start by accumulating x & y forces from memory
        movlps (%rsi,%rax,4),%xmm0      ## x1 y1 - -
        movlps (%rsi,%rcx,4),%xmm1      ## x3 y3 - -
        movhps (%rsi,%rbx,4),%xmm0      ## x1 y1 x2 y2
        movhps (%rsi,%rdx,4),%xmm1      ## x3 y3 x4 y4

        ## xmm0-xmm2 contains tx-tz (partial force)
        ## accumulate i forces
        addps %xmm9,%xmm13
        addps %xmm10,%xmm14
        addps %xmm11,%xmm15

        movaps %xmm9,%xmm8
        unpcklps %xmm10,%xmm9   ## x1 y1 x2 y2
        unpckhps %xmm10,%xmm8   ## x3 y3 x4 y4

        ## update fjx and fjy
        addps %xmm9,%xmm0
        addps %xmm8,%xmm1

        movlps %xmm0,(%rsi,%rax,4)
        movlps %xmm1,(%rsi,%rcx,4)
        movhps %xmm0,(%rsi,%rbx,4)
        movhps %xmm1,(%rsi,%rdx,4)

        ## xmm11: fjz1 fjz2 fjz3 fjz4
        pshufd $1,%xmm11,%xmm10         ## fjz2 - - -
        movhlps %xmm11,%xmm9            ## fjz3 - - -
        pshufd $3,%xmm11,%xmm8          ## fjz4 - - -

        addss 8(%rsi,%rax,4),%xmm11
        addss 8(%rsi,%rbx,4),%xmm10
        addss 8(%rsi,%rcx,4),%xmm9
        addss 8(%rsi,%rdx,4),%xmm8
        movss %xmm11,8(%rsi,%rax,4)
        movss %xmm10,8(%rsi,%rbx,4)
        movss %xmm9,8(%rsi,%rcx,4)
        movss %xmm8,8(%rsi,%rdx,4)

        ## should we do one more iteration?
        subl $4,nb310_innerk(%rsp)
        jl _nb_kernel310_x86_64_sse.nb310_finish_inner
        jmp _nb_kernel310_x86_64_sse.nb310_unroll_loop
_nb_kernel310_x86_64_sse.nb310_finish_inner:
        ## check if at least two particles remain
        addl $4,nb310_innerk(%rsp)
        movl nb310_innerk(%rsp),%edx
        andl $2,%edx
        jnz _nb_kernel310_x86_64_sse.nb310_dopair
        jmp _nb_kernel310_x86_64_sse.nb310_checksingle
_nb_kernel310_x86_64_sse.nb310_dopair:
        ## twice-unrolled innerloop here
        movq nb310_innerjjnr(%rsp),%rdx ## pointer to jjnr[k]
        movl (%rdx),%eax
        movl 4(%rdx),%ebx
        addq $8,nb310_innerjjnr(%rsp)   ## advance pointer (unrolled 2)

        movq nb310_charge(%rbp),%rsi
        movss (%rsi,%rax,4),%xmm0
        movss (%rsi,%rbx,4),%xmm2

        unpcklps %xmm2,%xmm0    ## jqa jqb
        mulps nb310_iq(%rsp),%xmm0
        movaps %xmm0,nb310_qq(%rsp)

        movq nb310_type(%rbp),%rsi
        ## vdw parameters
        movl (%rsi,%rax,4),%r12d
        movl (%rsi,%rbx,4),%r13d
        shll %r12d
        shll %r13d
        movl nb310_ntia(%rsp),%edi
        addl %edi,%r12d
        addl %edi,%r13d

        movq nb310_vdwparam(%rbp),%rsi
        movlps (%rsi,%r12,4),%xmm3
        movhps (%rsi,%r13,4),%xmm3

        xorps %xmm7,%xmm7
        movaps %xmm3,%xmm0
        shufps $136,%xmm7,%xmm0         ## 10001000
        shufps $221,%xmm7,%xmm3         ## 11011101

        movaps %xmm0,nb310_c6(%rsp)
        movaps %xmm3,nb310_c12(%rsp)

        lea (%rax,%rax,2),%rax          ## replace jnr with j3
        lea (%rbx,%rbx,2),%rbx

        ## load coordinates
        movq nb310_pos(%rbp),%rdi

        movlps (%rdi,%rax,4),%xmm4      ## x1 y1 - -
        movlps (%rdi,%rbx,4),%xmm5      ## x2 y2 - -

        movss 8(%rdi,%rax,4),%xmm6      ## z1 - - -
        movss 8(%rdi,%rbx,4),%xmm7      ## z2 - - -

        unpcklps %xmm5,%xmm4            ## x1 x2 y1 y2
        movhlps %xmm4,%xmm5             ## y1 y2 - -
        unpcklps %xmm7,%xmm6            ## z1 z2 - -

        ## calc dr
        subps nb310_ix(%rsp),%xmm4
        subps nb310_iy(%rsp),%xmm5
        subps nb310_iz(%rsp),%xmm6

        ## store dr in xmm9-xmm11
        movaps %xmm4,%xmm9
        movaps %xmm5,%xmm10
        movaps %xmm6,%xmm11

        ## square it
        mulps %xmm4,%xmm4
        mulps %xmm5,%xmm5
        mulps %xmm6,%xmm6
        addps %xmm5,%xmm4
        addps %xmm6,%xmm4
        ## rsq in xmm4

        ## calculate rinv=1/sqrt(rsq)
        rsqrtps %xmm4,%xmm5
        movaps %xmm5,%xmm2
        mulps %xmm5,%xmm5
        movaps nb310_three(%rsp),%xmm1
        mulps %xmm4,%xmm5       ## rsq*lu*lu
        subps %xmm5,%xmm1       ## 3.0-rsq*lu*lu
        mulps %xmm2,%xmm1
        mulps nb310_half(%rsp),%xmm1
        ## xmm1=rinv
        movaps %xmm4,%xmm3
        ## xmm3=rsq

        mulps %xmm1,%xmm3               ## r
        mulps nb310_tsc(%rsp),%xmm3     ## rtab

        ## truncate and convert to integers
        cvttps2dq %xmm3,%xmm2

        ## convert back to float
        cvtdq2ps %xmm2,%xmm0

        ## multiply by 4
        pslld $2,%xmm2

        ## move to integer registers
        movd %xmm2,%r12d
        pshufd $1,%xmm2,%xmm2
        movd %xmm2,%r13d

        ## calculate eps
        subps %xmm0,%xmm3

        movq nb310_VFtab(%rbp),%rsi
        ## load table data
        movlps (%rsi,%r12,4),%xmm4
        movlps (%rsi,%r13,4),%xmm5
        unpcklps %xmm5,%xmm4
        movhlps %xmm4,%xmm5

        movaps %xmm1,%xmm0      ## rinv
        mulps %xmm0,%xmm0       ## rinvsq
        movaps %xmm0,%xmm2      ## rinvsq
        mulps %xmm2,%xmm2       ## rinv4
        mulps %xmm0,%xmm2       ## rinv6
        movaps %xmm2,%xmm12
        mulps %xmm12,%xmm12     ## rinv12

        movlps 8(%rsi,%r12,4),%xmm6
        movlps 8(%rsi,%r13,4),%xmm7
        unpcklps %xmm7,%xmm6
        movhlps %xmm6,%xmm7
        ## table data ready in xmm4-xmm7

        mulps nb310_c6(%rsp),%xmm2      ## vvdw6=c6*rinv6
        mulps nb310_c12(%rsp),%xmm12    ## vvdw12=c12*rinv12

        movaps %xmm12,%xmm0
        subps %xmm2,%xmm12      ## Vvdw=Vvdw12-Vvdw6

        ## add potential to vvdwtot
        addps nb310_Vvdwtot(%rsp),%xmm12
        movlps %xmm12,nb310_Vvdwtot(%rsp)

        mulps %xmm3,%xmm7       ## Heps
        mulps %xmm3,%xmm6       ## Geps
        mulps %xmm3,%xmm7       ## Heps2

        addps %xmm6,%xmm5       ## F+Geps
        addps %xmm7,%xmm5       ## F+Geps+Heps2 = Fp
        addps %xmm7,%xmm7       ## 2*Heps2
        addps %xmm6,%xmm7       ## 2*Heps2+Geps
        addps %xmm5,%xmm7       ## FF = Fp + 2*Heps2 + Geps
        mulps %xmm3,%xmm5       ## eps*Fp
        addps %xmm4,%xmm5       ## VV
        mulps nb310_qq(%rsp),%xmm5      ## VV*qq=vcoul
        mulps nb310_qq(%rsp),%xmm7      ## FF*qq=fijC

        ## LJ forces
        mulps nb310_six(%rsp),%xmm2
        mulps nb310_twelve(%rsp),%xmm0
        subps %xmm2,%xmm0
        mulps %xmm1,%xmm0       ## (12*vnb12-6*vnb6)*rinv

        ## add potential to vctot
        addps nb310_vctot(%rsp),%xmm5
        movlps %xmm5,nb310_vctot(%rsp)

        xorps %xmm8,%xmm8

        mulps nb310_tsc(%rsp),%xmm7
        subps %xmm7,%xmm0

        mulps %xmm1,%xmm0       ## fscal

        ## calculate scalar force by multiplying dx/dy/dz with fscal
        mulps %xmm0,%xmm9
        mulps %xmm0,%xmm10
        mulps %xmm0,%xmm11

        movlhps %xmm8,%xmm9
        movlhps %xmm8,%xmm10
        movlhps %xmm8,%xmm11

        ## accumulate i forces
        addps %xmm9,%xmm13
        addps %xmm10,%xmm14
        addps %xmm11,%xmm15

        movq nb310_faction(%rbp),%rsi
        ## the fj's - start by accumulating x & y forces from memory
        movlps (%rsi,%rax,4),%xmm0      ## x1 y1 - -
        movhps (%rsi,%rbx,4),%xmm0      ## x1 y1 x2 y2

        unpcklps %xmm10,%xmm9   ## x1 y1 x2 y2
        addps %xmm9,%xmm0

        movlps %xmm0,(%rsi,%rax,4)
        movhps %xmm0,(%rsi,%rbx,4)

        ## z forces
        pshufd $1,%xmm11,%xmm8
        addss 8(%rsi,%rax,4),%xmm11
        addss 8(%rsi,%rbx,4),%xmm8
        movss %xmm11,8(%rsi,%rax,4)
        movss %xmm8,8(%rsi,%rbx,4)
_nb_kernel310_x86_64_sse.nb310_checksingle:
        movl nb310_innerk(%rsp),%edx
        andl $1,%edx
        jnz _nb_kernel310_x86_64_sse.nb310_dosingle
        jmp _nb_kernel310_x86_64_sse.nb310_updateouterdata

_nb_kernel310_x86_64_sse.nb310_dosingle:
        movq nb310_innerjjnr(%rsp),%rcx
        movl (%rcx),%eax

        movq nb310_charge(%rbp),%rsi
        movss (%rsi,%rax,4),%xmm0

        mulss nb310_iq(%rsp),%xmm0
        movaps %xmm0,nb310_qq(%rsp)

        movq nb310_type(%rbp),%rsi
        ## vdw parameters
        movl (%rsi,%rax,4),%r12d
        shll %r12d
        movl nb310_ntia(%rsp),%edi
        addl %edi,%r12d

        movq nb310_vdwparam(%rbp),%rsi
        movss (%rsi,%r12,4),%xmm0
        movss 4(%rsi,%r12,4),%xmm3

        movaps %xmm0,nb310_c6(%rsp)
        movaps %xmm3,nb310_c12(%rsp)

        lea (%rax,%rax,2),%rax          ## replace jnr with j3

        movq nb310_pos(%rbp),%rdi
        movss (%rdi,%rax,4),%xmm4       ## x1 - - -
        movss 4(%rdi,%rax,4),%xmm5      ## y1 - - -
        movss 8(%rdi,%rax,4),%xmm6      ## z1 - - -
        ## calc dr
        subss nb310_ix(%rsp),%xmm4
        subss nb310_iy(%rsp),%xmm5
        subss nb310_iz(%rsp),%xmm6

        ## store dr in xmm9-xmm11
        movaps %xmm4,%xmm9
        movaps %xmm5,%xmm10
        movaps %xmm6,%xmm11

        ## square it
        mulss %xmm4,%xmm4
        mulss %xmm5,%xmm5
        mulss %xmm6,%xmm6
        addss %xmm5,%xmm4
        addss %xmm6,%xmm4
        ## rsq in xmm4

        ## calculate rinv=1/sqrt(rsq)
        rsqrtss %xmm4,%xmm5
        movaps %xmm5,%xmm2
        mulss %xmm5,%xmm5
        movaps nb310_three(%rsp),%xmm1
        mulss %xmm4,%xmm5       ## rsq*lu*lu
        subss %xmm5,%xmm1       ## 3.0-rsq*lu*lu
        mulss %xmm2,%xmm1
        mulss nb310_half(%rsp),%xmm1
        ## xmm1=rinv
        movaps %xmm4,%xmm3
        ## xmm3=rsq

        mulss %xmm1,%xmm3               ## r
        mulss nb310_tsc(%rsp),%xmm3     ## rtab

        ## truncate and convert to integers
        cvttss2si %xmm3,%r12d

        ## convert back to float
        cvtsi2ss %r12d,%xmm0

        ## multiply by 4
        shll $2,%r12d

        ## calculate eps
        subss %xmm0,%xmm3

        movq nb310_VFtab(%rbp),%rsi

        movaps %xmm1,%xmm0      ## rinv
        mulss %xmm0,%xmm0       ## rinvsq
        movaps %xmm0,%xmm2      ## rinvsq
        mulss %xmm2,%xmm2       ## rinv4
        mulss %xmm0,%xmm2       ## rinv6
        movaps %xmm2,%xmm12
        mulss %xmm12,%xmm12     ## rinv12

        ## load table data
        movss (%rsi,%r12,4),%xmm4
        movss 4(%rsi,%r12,4),%xmm5
        movss 8(%rsi,%r12,4),%xmm6
        movss 12(%rsi,%r12,4),%xmm7
        ## table data ready in xmm4-xmm7

        mulss nb310_c6(%rsp),%xmm2      ## vvdw6=c6*rinv6
        mulss nb310_c12(%rsp),%xmm12    ## vvdw12=c12*rinv12

        movaps %xmm12,%xmm0
        subss %xmm2,%xmm12      ## Vvdw=Vvdw12-Vvdw6

        ## add potential to vvdwtot
        addss nb310_Vvdwtot(%rsp),%xmm12
        movss %xmm12,nb310_Vvdwtot(%rsp)

        mulss %xmm3,%xmm7       ## Heps
        mulss %xmm3,%xmm6       ## Geps
        mulss %xmm3,%xmm7       ## Heps2

        addss %xmm6,%xmm5       ## F+Geps
        addss %xmm7,%xmm5       ## F+Geps+Heps2 = Fp
        addss %xmm7,%xmm7       ## 2*Heps2
        addss %xmm6,%xmm7       ## 2*Heps2+Geps
        addss %xmm5,%xmm7       ## FF = Fp + 2*Heps2 + Geps
        mulss %xmm3,%xmm5       ## eps*Fp
        addss %xmm4,%xmm5       ## VV
        mulss nb310_qq(%rsp),%xmm5      ## VV*qq=vcoul
        mulss nb310_qq(%rsp),%xmm7      ## FF*qq=fijC

        ## LJ forces
        mulss nb310_six(%rsp),%xmm2
        mulss nb310_twelve(%rsp),%xmm0
        subss %xmm2,%xmm0
        mulss %xmm1,%xmm0       ## (12*vnb12-6*vnb6)*rinv

        ## add potential to vctot
        addss nb310_vctot(%rsp),%xmm5
        movss %xmm5,nb310_vctot(%rsp)

        mulss nb310_tsc(%rsp),%xmm7
        subss %xmm7,%xmm0

        mulss %xmm1,%xmm0       ## fscal

        ## calculate scalar force by multiplying dx/dy/dz with fscal
        mulss %xmm0,%xmm9
        mulss %xmm0,%xmm10
        mulss %xmm0,%xmm11

        ## accumulate i forces
        addss %xmm9,%xmm13
        addss %xmm10,%xmm14
        addss %xmm11,%xmm15

        movq nb310_faction(%rbp),%rsi
        ## add to j forces
        addss (%rsi,%rax,4),%xmm9
        addss 4(%rsi,%rax,4),%xmm10
        addss 8(%rsi,%rax,4),%xmm11
        movss %xmm9,(%rsi,%rax,4)
        movss %xmm10,4(%rsi,%rax,4)
        movss %xmm11,8(%rsi,%rax,4)

_nb_kernel310_x86_64_sse.nb310_updateouterdata:
        movl nb310_ii3(%rsp),%ecx
        movq nb310_faction(%rbp),%rdi
        movq nb310_fshift(%rbp),%rsi
        movl nb310_is3(%rsp),%edx

        ## accumulate i forces in xmm13, xmm14, xmm15
        movhlps %xmm13,%xmm0
        movhlps %xmm14,%xmm1
        movhlps %xmm15,%xmm2
        addps %xmm13,%xmm0
        addps %xmm14,%xmm1
        addps %xmm15,%xmm2
        movaps %xmm0,%xmm3
        movaps %xmm1,%xmm4
        movaps %xmm2,%xmm5
        shufps $1,%xmm3,%xmm3
        shufps $1,%xmm4,%xmm4
        shufps $1,%xmm5,%xmm5
        addss %xmm3,%xmm0
        addss %xmm4,%xmm1
        addss %xmm5,%xmm2       ## xmm0-xmm2 has single force in pos0
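
## The shuffles above horizontally sum the four lanes of each accumulator.
## A C sketch of the reduction and the i-force/fshift update (names are
## illustrative; dr was computed as j-i, hence the subtraction for atom i):
##
##     float fix = fx[0] + fx[1] + fx[2] + fx[3];  /* same for fiy, fiz */
##     faction[ii3] -= fix;
##     fshift[is3]  -= fix;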
        ## increment i force
        movss (%rdi,%rcx,4),%xmm3
        movss 4(%rdi,%rcx,4),%xmm4
        movss 8(%rdi,%rcx,4),%xmm5
        subss %xmm0,%xmm3
        subss %xmm1,%xmm4
        subss %xmm2,%xmm5
        movss %xmm3,(%rdi,%rcx,4)
        movss %xmm4,4(%rdi,%rcx,4)
        movss %xmm5,8(%rdi,%rcx,4)

        ## increment fshift force
        movss (%rsi,%rdx,4),%xmm3
        movss 4(%rsi,%rdx,4),%xmm4
        movss 8(%rsi,%rdx,4),%xmm5
        subss %xmm0,%xmm3
        subss %xmm1,%xmm4
        subss %xmm2,%xmm5
        movss %xmm3,(%rsi,%rdx,4)
        movss %xmm4,4(%rsi,%rdx,4)
        movss %xmm5,8(%rsi,%rdx,4)

        ## get n from stack
        movl nb310_n(%rsp),%esi
        ## get group index for i particle
        movq nb310_gid(%rbp),%rdx       ## base of gid[]
        movl (%rdx,%rsi,4),%edx         ## ggid=gid[n]

        ## accumulate total potential energy and update it
        movaps nb310_vctot(%rsp),%xmm7
        ## accumulate
        movhlps %xmm7,%xmm6
        addps %xmm6,%xmm7       ## pos 0-1 in xmm7 have the sum now
        movaps %xmm7,%xmm6
        shufps $1,%xmm6,%xmm6
        addss %xmm6,%xmm7

        ## add earlier value from mem
        movq nb310_Vc(%rbp),%rax
        addss (%rax,%rdx,4),%xmm7
        ## move back to mem
        movss %xmm7,(%rax,%rdx,4)

        ## accumulate total lj energy and update it
        movaps nb310_Vvdwtot(%rsp),%xmm7
        ## accumulate
        movhlps %xmm7,%xmm6
        addps %xmm6,%xmm7       ## pos 0-1 in xmm7 have the sum now
        movaps %xmm7,%xmm6
        shufps $1,%xmm6,%xmm6
        addss %xmm6,%xmm7

        ## add earlier value from mem
        movq nb310_Vvdw(%rbp),%rax
        addss (%rax,%rdx,4),%xmm7
        ## move back to mem
        movss %xmm7,(%rax,%rdx,4)

        ## finish if last
        movl nb310_nn1(%rsp),%ecx
        ## esi already loaded with n
        incl %esi
        subl %esi,%ecx
        jz _nb_kernel310_x86_64_sse.nb310_outerend

        ## not last, iterate outer loop once more!
        movl %esi,nb310_n(%rsp)
        jmp _nb_kernel310_x86_64_sse.nb310_outer
_nb_kernel310_x86_64_sse.nb310_outerend:
        ## check if more outer neighborlists remain
        movl nb310_nri(%rsp),%ecx
        ## esi already loaded with n above
        subl %esi,%ecx
        jz _nb_kernel310_x86_64_sse.nb310_end
        ## non-zero, do one more workunit
        jmp _nb_kernel310_x86_64_sse.nb310_threadloop
_nb_kernel310_x86_64_sse.nb310_end:

        movl nb310_nouter(%rsp),%eax
        movl nb310_ninner(%rsp),%ebx
        movq nb310_outeriter(%rbp),%rcx
        movq nb310_inneriter(%rbp),%rdx
        movl %eax,(%rcx)
        movl %ebx,(%rdx)

        addq $472,%rsp
        emms

        pop %r15
        pop %r14
        pop %r13
        pop %r12

        pop %rbx
        pop %rbp
        ret
.globl nb_kernel310nf_x86_64_sse
.globl _nb_kernel310nf_x86_64_sse
nb_kernel310nf_x86_64_sse:
_nb_kernel310nf_x86_64_sse:
## Room for return address and rbp (16 bytes)
.set nb310nf_fshift, 16
.set nb310nf_gid, 24
.set nb310nf_pos, 32
.set nb310nf_faction, 40
.set nb310nf_charge, 48
.set nb310nf_p_facel, 56
.set nb310nf_argkrf, 64
.set nb310nf_argcrf, 72
.set nb310nf_Vc, 80
.set nb310nf_type, 88
.set nb310nf_p_ntype, 96
.set nb310nf_vdwparam, 104
.set nb310nf_Vvdw, 112
.set nb310nf_p_tabscale, 120
.set nb310nf_VFtab, 128
.set nb310nf_invsqrta, 136
.set nb310nf_dvda, 144
.set nb310nf_p_gbtabscale, 152
.set nb310nf_GBtab, 160
.set nb310nf_p_nthreads, 168
.set nb310nf_count, 176
.set nb310nf_mtx, 184
.set nb310nf_outeriter, 192
.set nb310nf_inneriter, 200
.set nb310nf_work, 208
## stack offsets for local variables
## bottom of stack is cache-aligned for sse use
.set nb310nf_ix, 0
.set nb310nf_iy, 16
.set nb310nf_iz, 32
.set nb310nf_iq, 48
.set nb310nf_tsc, 64
.set nb310nf_qq, 80
.set nb310nf_c6, 96
.set nb310nf_c12, 112
.set nb310nf_vctot, 128
.set nb310nf_Vvdwtot, 144
.set nb310nf_half, 160
.set nb310nf_three, 176
.set nb310nf_nri, 192
.set nb310nf_iinr, 200
.set nb310nf_jindex, 208
.set nb310nf_jjnr, 216
.set nb310nf_shift, 224
.set nb310nf_shiftvec, 232
.set nb310nf_facel, 240
.set nb310nf_innerjjnr, 248
.set nb310nf_is3, 256
.set nb310nf_ii3, 260
.set nb310nf_ntia, 264
.set nb310nf_innerk, 268
.set nb310nf_n, 272
.set nb310nf_nn1, 276
.set nb310nf_ntype, 280
.set nb310nf_nouter, 284
.set nb310nf_ninner, 288

        push %rbp
        movq %rsp,%rbp
        push %rbx

        emms

        push %r12
        push %r13
        push %r14
        push %r15

        subq $312,%rsp          ## local variable stack space (n*16+8)

        ## zero 32-bit iteration counters
        movl $0,%eax
        movl %eax,nb310nf_nouter(%rsp)
        movl %eax,nb310nf_ninner(%rsp)

        movl (%rdi),%edi
        movl %edi,nb310nf_nri(%rsp)
        movq %rsi,nb310nf_iinr(%rsp)
        movq %rdx,nb310nf_jindex(%rsp)
        movq %rcx,nb310nf_jjnr(%rsp)
        movq %r8,nb310nf_shift(%rsp)
        movq %r9,nb310nf_shiftvec(%rsp)
        movq nb310nf_p_ntype(%rbp),%rdi
        movl (%rdi),%edi
        movl %edi,nb310nf_ntype(%rsp)
        movq nb310nf_p_facel(%rbp),%rsi
        movss (%rsi),%xmm0
        movss %xmm0,nb310nf_facel(%rsp)

        movq nb310nf_p_tabscale(%rbp),%rax
        movss (%rax),%xmm3
        shufps $0,%xmm3,%xmm3
        movaps %xmm3,nb310nf_tsc(%rsp)

        ## create constant floating-point factors on stack
        movl $0x3f000000,%eax   ## half in IEEE (hex)
        movl %eax,nb310nf_half(%rsp)
        movss nb310nf_half(%rsp),%xmm1
        shufps $0,%xmm1,%xmm1   ## splat to all elements
        movaps %xmm1,%xmm2
        addps %xmm2,%xmm2       ## one
        movaps %xmm2,%xmm3
        addps %xmm2,%xmm2       ## two
        addps %xmm2,%xmm3       ## three
        movaps %xmm1,nb310nf_half(%rsp)
        movaps %xmm3,nb310nf_three(%rsp)
_nb_kernel310nf_x86_64_sse.nb310nf_threadloop:
        movq nb310nf_count(%rbp),%rsi   ## pointer to sync counter
        movl (%rsi),%eax
_nb_kernel310nf_x86_64_sse.nb310nf_spinlock:
        movl %eax,%ebx                  ## ebx=*count=nn0
        addl $1,%ebx                    ## ebx=nn1=nn0+1
        lock
        cmpxchgl %ebx,(%rsi)            ## write nn1 to *counter,
                                        ## if it hasn't changed,
                                        ## or reread *counter to eax.
        pause                           ## -> better p4 performance
        jnz _nb_kernel310nf_x86_64_sse.nb310nf_spinlock

        ## if(nn1>nri) nn1=nri
        movl nb310nf_nri(%rsp),%ecx
        movl %ecx,%edx
        subl %ebx,%ecx
        cmovlel %edx,%ebx               ## if(nn1>nri) nn1=nri
        ## Cleared the spinlock if we got here.
        ## eax contains nn0, ebx contains nn1.
        movl %eax,nb310nf_n(%rsp)
        movl %ebx,nb310nf_nn1(%rsp)
        subl %eax,%ebx                  ## calc number of outer lists
        movl %eax,%esi                  ## copy n to esi
        jg _nb_kernel310nf_x86_64_sse.nb310nf_outerstart
        jmp _nb_kernel310nf_x86_64_sse.nb310nf_end
_nb_kernel310nf_x86_64_sse.nb310nf_outerstart:
        ## ebx contains number of outer iterations
        addl nb310nf_nouter(%rsp),%ebx
        movl %ebx,nb310nf_nouter(%rsp)

_nb_kernel310nf_x86_64_sse.nb310nf_outer:
        movq nb310nf_shift(%rsp),%rax   ## rax = pointer into shift[]
        movl (%rax,%rsi,4),%ebx         ## ebx=shift[n]

        lea (%rbx,%rbx,2),%rbx          ## rbx=3*is
        movl %ebx,nb310nf_is3(%rsp)     ## store is3

        movq nb310nf_shiftvec(%rsp),%rax        ## rax = base of shiftvec[]

        movss (%rax,%rbx,4),%xmm0
        movss 4(%rax,%rbx,4),%xmm1
        movss 8(%rax,%rbx,4),%xmm2

        movq nb310nf_iinr(%rsp),%rcx    ## rcx = pointer into iinr[]
        movl (%rcx,%rsi,4),%ebx         ## ebx =ii

        movq nb310nf_charge(%rbp),%rdx
        movss (%rdx,%rbx,4),%xmm3
        mulss nb310nf_facel(%rsp),%xmm3
        shufps $0,%xmm3,%xmm3

        movq nb310nf_type(%rbp),%rdx
        movl (%rdx,%rbx,4),%edx
        imull nb310nf_ntype(%rsp),%edx
        shll %edx
        movl %edx,nb310nf_ntia(%rsp)

        lea (%rbx,%rbx,2),%rbx          ## rbx = 3*ii=ii3
        movq nb310nf_pos(%rbp),%rax     ## rax = base of pos[]

        addss (%rax,%rbx,4),%xmm0
        addss 4(%rax,%rbx,4),%xmm1
        addss 8(%rax,%rbx,4),%xmm2

        movaps %xmm3,nb310nf_iq(%rsp)

        shufps $0,%xmm0,%xmm0
        shufps $0,%xmm1,%xmm1
        shufps $0,%xmm2,%xmm2

        movaps %xmm0,nb310nf_ix(%rsp)
        movaps %xmm1,nb310nf_iy(%rsp)
        movaps %xmm2,nb310nf_iz(%rsp)

        movl %ebx,nb310nf_ii3(%rsp)

        ## clear vctot and Vvdwtot
        xorps %xmm4,%xmm4
        movaps %xmm4,nb310nf_vctot(%rsp)
        movaps %xmm4,nb310nf_Vvdwtot(%rsp)

        movq nb310nf_jindex(%rsp),%rax
        movl (%rax,%rsi,4),%ecx         ## jindex[n]
        movl 4(%rax,%rsi,4),%edx        ## jindex[n+1]
        subl %ecx,%edx                  ## number of innerloop atoms

        movq nb310nf_pos(%rbp),%rsi
        movq nb310nf_jjnr(%rsp),%rax
        shll $2,%ecx
        addq %rcx,%rax
        movq %rax,nb310nf_innerjjnr(%rsp)       ## pointer to jjnr[nj0]
        movl %edx,%ecx
        subl $4,%edx
        addl nb310nf_ninner(%rsp),%ecx
        movl %ecx,nb310nf_ninner(%rsp)
        addl $0,%edx
        movl %edx,nb310nf_innerk(%rsp)  ## number of innerloop atoms
        jge _nb_kernel310nf_x86_64_sse.nb310nf_unroll_loop
        jmp _nb_kernel310nf_x86_64_sse.nb310nf_finish_inner
_nb_kernel310nf_x86_64_sse.nb310nf_unroll_loop:
        ## quad-unrolled innerloop here
        movq nb310nf_innerjjnr(%rsp),%rdx       ## pointer to jjnr[k]
        movl (%rdx),%eax
        movl 4(%rdx),%ebx
        movl 8(%rdx),%ecx
        movl 12(%rdx),%edx              ## eax-edx=jnr1-4
        addq $16,nb310nf_innerjjnr(%rsp)        ## advance pointer (unrolled 4)

        movq nb310nf_charge(%rbp),%rsi  ## base of charge[]

        movss (%rsi,%rax,4),%xmm3
        movss (%rsi,%rcx,4),%xmm4
        movss (%rsi,%rbx,4),%xmm6
        movss (%rsi,%rdx,4),%xmm7

        movaps nb310nf_iq(%rsp),%xmm2
        shufps $0,%xmm6,%xmm3
        shufps $0,%xmm7,%xmm4
        shufps $136,%xmm4,%xmm3         ## 10001000 ## all charges in xmm3
        movd %eax,%mm0                  ## use mmx registers as temp storage
        movd %ebx,%mm1
        mulps %xmm2,%xmm3
        movd %ecx,%mm2
        movd %edx,%mm3

        movaps %xmm3,nb310nf_qq(%rsp)

        movq nb310nf_type(%rbp),%rsi
        movl (%rsi,%rax,4),%eax
        movl (%rsi,%rbx,4),%ebx
        movl (%rsi,%rcx,4),%ecx
        movl (%rsi,%rdx,4),%edx
        movq nb310nf_vdwparam(%rbp),%rsi
        shll %eax
        shll %ebx
        shll %ecx
        shll %edx
        movl nb310nf_ntia(%rsp),%edi
        addl %edi,%eax
        addl %edi,%ebx
        addl %edi,%ecx
        addl %edi,%edx

        movlps (%rsi,%rax,4),%xmm6
        movlps (%rsi,%rcx,4),%xmm7
        movhps (%rsi,%rbx,4),%xmm6
        movhps (%rsi,%rdx,4),%xmm7

        movaps %xmm6,%xmm4
        shufps $136,%xmm7,%xmm4         ## 10001000
        shufps $221,%xmm7,%xmm6         ## 11011101

        movd %mm0,%eax
        movd %mm1,%ebx
        movd %mm2,%ecx
        movd %mm3,%edx

        movaps %xmm4,nb310nf_c6(%rsp)
        movaps %xmm6,nb310nf_c12(%rsp)

        movq nb310nf_pos(%rbp),%rsi     ## base of pos[]

        lea (%rax,%rax,2),%rax          ## replace jnr with j3
        lea (%rbx,%rbx,2),%rbx

        lea (%rcx,%rcx,2),%rcx          ## replace jnr with j3
        lea (%rdx,%rdx,2),%rdx

        ## move four coordinates to xmm0-xmm2

        movlps (%rsi,%rax,4),%xmm4
        movlps (%rsi,%rcx,4),%xmm5
        movss 8(%rsi,%rax,4),%xmm2
        movss 8(%rsi,%rcx,4),%xmm6

        movhps (%rsi,%rbx,4),%xmm4
        movhps (%rsi,%rdx,4),%xmm5

        movss 8(%rsi,%rbx,4),%xmm0
        movss 8(%rsi,%rdx,4),%xmm1

        shufps $0,%xmm0,%xmm2
        shufps $0,%xmm1,%xmm6

        movaps %xmm4,%xmm0
        movaps %xmm4,%xmm1

        shufps $136,%xmm6,%xmm2         ## 10001000

        shufps $136,%xmm5,%xmm0         ## 10001000
        shufps $221,%xmm5,%xmm1         ## 11011101

        ## move ix-iz to xmm4-xmm6
        movaps nb310nf_ix(%rsp),%xmm4
        movaps nb310nf_iy(%rsp),%xmm5
        movaps nb310nf_iz(%rsp),%xmm6

        ## calc dr
        subps %xmm0,%xmm4
        subps %xmm1,%xmm5
        subps %xmm2,%xmm6

        ## square it
        mulps %xmm4,%xmm4
        mulps %xmm5,%xmm5
        mulps %xmm6,%xmm6
        addps %xmm5,%xmm4
        addps %xmm6,%xmm4
        ## rsq in xmm4

        rsqrtps %xmm4,%xmm5
        ## lookup seed in xmm5
        movaps %xmm5,%xmm2
        mulps %xmm5,%xmm5
        movaps nb310nf_three(%rsp),%xmm1
        mulps %xmm4,%xmm5       ## rsq*lu*lu
        movaps nb310nf_half(%rsp),%xmm0
        subps %xmm5,%xmm1       ## 3.0-rsq*lu*lu
        mulps %xmm2,%xmm1
        mulps %xmm1,%xmm0       ## xmm0=rinv
        mulps %xmm0,%xmm4       ## xmm4=r
        mulps nb310nf_tsc(%rsp),%xmm4

        movhlps %xmm4,%xmm5
        cvttps2pi %xmm4,%mm6
        cvttps2pi %xmm5,%mm7    ## mm6/mm7 contain lu indices
        cvtpi2ps %mm6,%xmm6
        cvtpi2ps %mm7,%xmm5
        movlhps %xmm5,%xmm6
        subps %xmm6,%xmm4
        movaps %xmm4,%xmm1      ## xmm1=eps
        movaps %xmm1,%xmm2
        mulps %xmm2,%xmm2       ## xmm2=eps2
        pslld $2,%mm6
        pslld $2,%mm7

        movd %eax,%mm0
        movd %ebx,%mm1
        movd %ecx,%mm2
        movd %edx,%mm3

        movq nb310nf_VFtab(%rbp),%rsi
        movd %mm6,%eax
        psrlq $32,%mm6
        movd %mm7,%ecx
        psrlq $32,%mm7
        movd %mm6,%ebx
        movd %mm7,%edx

        movlps (%rsi,%rax,4),%xmm5
        movlps (%rsi,%rcx,4),%xmm7
        movhps (%rsi,%rbx,4),%xmm5
        movhps (%rsi,%rdx,4),%xmm7      ## got half coulomb table

        movaps %xmm5,%xmm4
        shufps $136,%xmm7,%xmm4         ## 10001000
        shufps $221,%xmm7,%xmm5         ## 11011101

        movlps 8(%rsi,%rax,4),%xmm7
        movlps 8(%rsi,%rcx,4),%xmm3
        movhps 8(%rsi,%rbx,4),%xmm7
        movhps 8(%rsi,%rdx,4),%xmm3     ## other half of coulomb table
        movaps %xmm7,%xmm6
        shufps $136,%xmm3,%xmm6         ## 10001000
        shufps $221,%xmm3,%xmm7         ## 11011101
        ## coulomb table ready, in xmm4-xmm7

        mulps %xmm1,%xmm6       ## xmm6=Geps
        mulps %xmm2,%xmm7       ## xmm7=Heps2
        addps %xmm6,%xmm5
        addps %xmm7,%xmm5       ## xmm5=Fp
        movaps nb310nf_qq(%rsp),%xmm3
        mulps %xmm1,%xmm5       ## xmm5=eps*Fp
        addps %xmm4,%xmm5       ## xmm5=VV
        mulps %xmm3,%xmm5       ## vcoul=qq*VV
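
## The nf kernel needs only the energy, so the spline evaluation stops at VV;
## in C terms (sketch):
##
##     float Fp    = F + G * eps + H * eps * eps;
##     float vcoul = qq * (Y + eps * Fp);   /* no FF/force term here */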
        ## L-J
        movaps %xmm0,%xmm4
        mulps %xmm0,%xmm4       ## xmm4=rinvsq

        ## at this point xmm5 contains vcoul
        ## increment vcoul - then we can get rid of xmm5
        ## update vctot
        addps nb310nf_vctot(%rsp),%xmm5
        movaps %xmm4,%xmm6
        mulps %xmm4,%xmm6
        movaps %xmm5,nb310nf_vctot(%rsp)

        mulps %xmm4,%xmm6       ## xmm6=rinvsix
        movaps %xmm6,%xmm4
        mulps %xmm4,%xmm4       ## xmm4=rinvtwelve
        mulps nb310nf_c6(%rsp),%xmm6
        mulps nb310nf_c12(%rsp),%xmm4
        movaps nb310nf_Vvdwtot(%rsp),%xmm7
        addps %xmm4,%xmm7
        subps %xmm6,%xmm7
        movaps %xmm7,nb310nf_Vvdwtot(%rsp)
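
## Likewise the LJ part reduces to the plain 12-6 energy, as C (sketch):
##
##     float rinvsix = rinvsq * rinvsq * rinvsq;
##     Vvdwtot += c12 * rinvsix * rinvsix - c6 * rinvsix;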
        ## should we do one more iteration?
        subl $4,nb310nf_innerk(%rsp)
        jl _nb_kernel310nf_x86_64_sse.nb310nf_finish_inner
        jmp _nb_kernel310nf_x86_64_sse.nb310nf_unroll_loop
_nb_kernel310nf_x86_64_sse.nb310nf_finish_inner:
        ## check if at least two particles remain
        addl $4,nb310nf_innerk(%rsp)
        movl nb310nf_innerk(%rsp),%edx
        andl $2,%edx
        jnz _nb_kernel310nf_x86_64_sse.nb310nf_dopair
        jmp _nb_kernel310nf_x86_64_sse.nb310nf_checksingle
_nb_kernel310nf_x86_64_sse.nb310nf_dopair:
        movq nb310nf_charge(%rbp),%rsi
        movq nb310nf_innerjjnr(%rsp),%rcx
        movl (%rcx),%eax
        movl 4(%rcx),%ebx
        addq $8,nb310nf_innerjjnr(%rsp)
        xorps %xmm7,%xmm7
        movss (%rsi,%rax,4),%xmm3
        movss (%rsi,%rbx,4),%xmm6
        shufps $0,%xmm6,%xmm3
        shufps $8,%xmm3,%xmm3           ## 00001000 ## xmm3(0,1) has the charges

        mulps nb310nf_iq(%rsp),%xmm3
        movlhps %xmm7,%xmm3
        movaps %xmm3,nb310nf_qq(%rsp)

        movq nb310nf_type(%rbp),%rsi
        movl %eax,%ecx
        movl %ebx,%edx
        movl (%rsi,%rcx,4),%ecx
        movl (%rsi,%rdx,4),%edx
        movq nb310nf_vdwparam(%rbp),%rsi
        shll %ecx
        shll %edx
        movl nb310nf_ntia(%rsp),%edi
        addl %edi,%ecx
        addl %edi,%edx
        movlps (%rsi,%rcx,4),%xmm6
        movhps (%rsi,%rdx,4),%xmm6
        movq nb310nf_pos(%rbp),%rdi

        movaps %xmm6,%xmm4
        shufps $8,%xmm4,%xmm4           ## 00001000
        shufps $13,%xmm6,%xmm6          ## 00001101
        movlhps %xmm7,%xmm4
        movlhps %xmm7,%xmm6

        movaps %xmm4,nb310nf_c6(%rsp)
        movaps %xmm6,nb310nf_c12(%rsp)

        lea (%rax,%rax,2),%rax
        lea (%rbx,%rbx,2),%rbx
        ## move coordinates to xmm0-xmm2
        movlps (%rdi,%rax,4),%xmm1
        movss 8(%rdi,%rax,4),%xmm2
        movhps (%rdi,%rbx,4),%xmm1
        movss 8(%rdi,%rbx,4),%xmm0

        movlhps %xmm7,%xmm3

        shufps $0,%xmm0,%xmm2

        movaps %xmm1,%xmm0

        shufps $136,%xmm2,%xmm2         ## 10001000

        shufps $136,%xmm0,%xmm0         ## 10001000
        shufps $221,%xmm1,%xmm1         ## 11011101

        ## move ix-iz to xmm4-xmm6
        xorps %xmm7,%xmm7

        movaps nb310nf_ix(%rsp),%xmm4
        movaps nb310nf_iy(%rsp),%xmm5
        movaps nb310nf_iz(%rsp),%xmm6

        ## calc dr
        subps %xmm0,%xmm4
        subps %xmm1,%xmm5
        subps %xmm2,%xmm6

        ## square it
        mulps %xmm4,%xmm4
        mulps %xmm5,%xmm5
        mulps %xmm6,%xmm6
        addps %xmm5,%xmm4
        addps %xmm6,%xmm4
        ## rsq in xmm4

        rsqrtps %xmm4,%xmm5
        ## lookup seed in xmm5
        movaps %xmm5,%xmm2
        mulps %xmm5,%xmm5
        movaps nb310nf_three(%rsp),%xmm1
        mulps %xmm4,%xmm5       ## rsq*lu*lu
        movaps nb310nf_half(%rsp),%xmm0
        subps %xmm5,%xmm1       ## 3.0-rsq*lu*lu
        mulps %xmm2,%xmm1
        mulps %xmm1,%xmm0       ## xmm0=rinv
        mulps %xmm0,%xmm4       ## xmm4=r
        mulps nb310nf_tsc(%rsp),%xmm4

        cvttps2pi %xmm4,%mm6    ## mm6 contain lu indices
        cvtpi2ps %mm6,%xmm6
        subps %xmm6,%xmm4
        movaps %xmm4,%xmm1      ## xmm1=eps
        movaps %xmm1,%xmm2
        mulps %xmm2,%xmm2       ## xmm2=eps2

        pslld $2,%mm6

        movq nb310nf_VFtab(%rbp),%rsi
        movd %mm6,%ecx
        psrlq $32,%mm6
        movd %mm6,%edx

        movlps (%rsi,%rcx,4),%xmm5
        movhps (%rsi,%rdx,4),%xmm5      ## got half coulomb table
        movaps %xmm5,%xmm4
        shufps $136,%xmm4,%xmm4         ## 10001000
        shufps $221,%xmm7,%xmm5         ## 11011101

        movlps 8(%rsi,%rcx,4),%xmm7
        movhps 8(%rsi,%rdx,4),%xmm7
        movaps %xmm7,%xmm6
        shufps $136,%xmm6,%xmm6         ## 10001000
        shufps $221,%xmm7,%xmm7         ## 11011101
        ## table ready in xmm4-xmm7

        mulps %xmm1,%xmm6       ## xmm6=Geps
        mulps %xmm2,%xmm7       ## xmm7=Heps2
        addps %xmm6,%xmm5
        addps %xmm7,%xmm5       ## xmm5=Fp
        movaps nb310nf_qq(%rsp),%xmm3
        mulps %xmm1,%xmm5       ## xmm5=eps*Fp
        addps %xmm4,%xmm5       ## xmm5=VV
        mulps %xmm3,%xmm5       ## vcoul=qq*VV
        ## L-J
        movaps %xmm0,%xmm4
        mulps %xmm0,%xmm4       ## xmm4=rinvsq

        ## at this point xmm5 contains vcoul
        ## increment vcoul - then we can get rid of xmm5
        ## update vctot
        addps nb310nf_vctot(%rsp),%xmm5

        movaps %xmm4,%xmm6
        mulps %xmm4,%xmm6

        movaps %xmm5,nb310nf_vctot(%rsp)

        mulps %xmm4,%xmm6       ## xmm6=rinvsix
        movaps %xmm6,%xmm4
        mulps %xmm4,%xmm4       ## xmm4=rinvtwelve
        mulps nb310nf_c6(%rsp),%xmm6
        mulps nb310nf_c12(%rsp),%xmm4
        movaps nb310nf_Vvdwtot(%rsp),%xmm7
        addps %xmm4,%xmm7
        subps %xmm6,%xmm7
        movaps %xmm7,nb310nf_Vvdwtot(%rsp)

_nb_kernel310nf_x86_64_sse.nb310nf_checksingle:
        movl nb310nf_innerk(%rsp),%edx
        andl $1,%edx
        jnz _nb_kernel310nf_x86_64_sse.nb310nf_dosingle
        jmp _nb_kernel310nf_x86_64_sse.nb310nf_updateouterdata

_nb_kernel310nf_x86_64_sse.nb310nf_dosingle:
        movq nb310nf_charge(%rbp),%rsi
        movq nb310nf_pos(%rbp),%rdi
        movq nb310nf_innerjjnr(%rsp),%rcx
        movl (%rcx),%eax
        xorps %xmm6,%xmm6
        movss (%rsi,%rax,4),%xmm6       ## xmm6(0) has the charge
        mulps nb310nf_iq(%rsp),%xmm6
        movaps %xmm6,nb310nf_qq(%rsp)

        movq nb310nf_type(%rbp),%rsi
        movl %eax,%ecx
        movl (%rsi,%rcx,4),%ecx
        movq nb310nf_vdwparam(%rbp),%rsi
        shll %ecx
        addl nb310nf_ntia(%rsp),%ecx
        movlps (%rsi,%rcx,4),%xmm6
        movaps %xmm6,%xmm4
        shufps $252,%xmm4,%xmm4         ## 11111100
        shufps $253,%xmm6,%xmm6         ## 11111101

        movaps %xmm4,nb310nf_c6(%rsp)
        movaps %xmm6,nb310nf_c12(%rsp)

        lea (%rax,%rax,2),%rax

        ## move coordinates to xmm0-xmm2
        movss (%rdi,%rax,4),%xmm0
        movss 4(%rdi,%rax,4),%xmm1
        movss 8(%rdi,%rax,4),%xmm2

        movaps nb310nf_ix(%rsp),%xmm4
        movaps nb310nf_iy(%rsp),%xmm5
        movaps nb310nf_iz(%rsp),%xmm6

        ## calc dr
        subps %xmm0,%xmm4
        subps %xmm1,%xmm5
        subps %xmm2,%xmm6

        ## square it
        mulps %xmm4,%xmm4
        mulps %xmm5,%xmm5
        mulps %xmm6,%xmm6
        addps %xmm5,%xmm4
        addps %xmm6,%xmm4
        ## rsq in xmm4

        rsqrtps %xmm4,%xmm5
        ## lookup seed in xmm5
        movaps %xmm5,%xmm2
        mulps %xmm5,%xmm5
        movaps nb310nf_three(%rsp),%xmm1
        mulps %xmm4,%xmm5       ## rsq*lu*lu
        movaps nb310nf_half(%rsp),%xmm0
        subps %xmm5,%xmm1       ## 3.0-rsq*lu*lu
        mulps %xmm2,%xmm1
        mulps %xmm1,%xmm0       ## xmm0=rinv

        mulps %xmm0,%xmm4       ## xmm4=r
        mulps nb310nf_tsc(%rsp),%xmm4

        cvttps2pi %xmm4,%mm6    ## mm6 contain lu indices
        cvtpi2ps %mm6,%xmm6
        subps %xmm6,%xmm4
        movaps %xmm4,%xmm1      ## xmm1=eps
        movaps %xmm1,%xmm2
        mulps %xmm2,%xmm2       ## xmm2=eps2

        pslld $2,%mm6

        movq nb310nf_VFtab(%rbp),%rsi
        movd %mm6,%ebx

        movlps (%rsi,%rbx,4),%xmm4
        movlps 8(%rsi,%rbx,4),%xmm6
        movaps %xmm4,%xmm5
        movaps %xmm6,%xmm7
        shufps $1,%xmm5,%xmm5
        shufps $1,%xmm7,%xmm7
        ## table ready in xmm4-xmm7

        mulps %xmm1,%xmm6       ## xmm6=Geps
        mulps %xmm2,%xmm7       ## xmm7=Heps2
        addps %xmm6,%xmm5
        addps %xmm7,%xmm5       ## xmm5=Fp
        movaps nb310nf_qq(%rsp),%xmm3
        mulps %xmm1,%xmm5       ## xmm5=eps*Fp
        addps %xmm4,%xmm5       ## xmm5=VV
        mulps %xmm3,%xmm5       ## vcoul=qq*VV
        ## L-J
        movaps %xmm0,%xmm4
        mulps %xmm0,%xmm4       ## xmm4=rinvsq

        ## at this point xmm5 contains vcoul
        ## increment vcoul - then we can get rid of xmm5
        ## update vctot
        addss nb310nf_vctot(%rsp),%xmm5

        movaps %xmm4,%xmm6
        mulps %xmm4,%xmm6

        movss %xmm5,nb310nf_vctot(%rsp)

        mulps %xmm4,%xmm6       ## xmm6=rinvsix
        movaps %xmm6,%xmm4
        mulps %xmm4,%xmm4       ## xmm4=rinvtwelve
        mulps nb310nf_c6(%rsp),%xmm6
        mulps nb310nf_c12(%rsp),%xmm4
        movss nb310nf_Vvdwtot(%rsp),%xmm7
        addps %xmm4,%xmm7
        subps %xmm6,%xmm7
        movss %xmm7,nb310nf_Vvdwtot(%rsp)

_nb_kernel310nf_x86_64_sse.nb310nf_updateouterdata:
        ## get n from stack
        movl nb310nf_n(%rsp),%esi
        ## get group index for i particle
        movq nb310nf_gid(%rbp),%rdx     ## base of gid[]
        movl (%rdx,%rsi,4),%edx         ## ggid=gid[n]

        ## accumulate total potential energy and update it
        movaps nb310nf_vctot(%rsp),%xmm7
        ## accumulate
        movhlps %xmm7,%xmm6
        addps %xmm6,%xmm7       ## pos 0-1 in xmm7 have the sum now
        movaps %xmm7,%xmm6
        shufps $1,%xmm6,%xmm6
        addss %xmm6,%xmm7

        ## add earlier value from mem
        movq nb310nf_Vc(%rbp),%rax
        addss (%rax,%rdx,4),%xmm7
        ## move back to mem
        movss %xmm7,(%rax,%rdx,4)

        ## accumulate total lj energy and update it
        movaps nb310nf_Vvdwtot(%rsp),%xmm7
        ## accumulate
        movhlps %xmm7,%xmm6
        addps %xmm6,%xmm7       ## pos 0-1 in xmm7 have the sum now
        movaps %xmm7,%xmm6
        shufps $1,%xmm6,%xmm6
        addss %xmm6,%xmm7

        ## add earlier value from mem
        movq nb310nf_Vvdw(%rbp),%rax
        addss (%rax,%rdx,4),%xmm7
        ## move back to mem
        movss %xmm7,(%rax,%rdx,4)

        ## finish if last
        movl nb310nf_nn1(%rsp),%ecx
        ## esi already loaded with n
        incl %esi
        subl %esi,%ecx
        jz _nb_kernel310nf_x86_64_sse.nb310nf_outerend

        ## not last, iterate outer loop once more!
        movl %esi,nb310nf_n(%rsp)
        jmp _nb_kernel310nf_x86_64_sse.nb310nf_outer
_nb_kernel310nf_x86_64_sse.nb310nf_outerend:
        ## check if more outer neighborlists remain
        movl nb310nf_nri(%rsp),%ecx
        ## esi already loaded with n above
        subl %esi,%ecx
        jz _nb_kernel310nf_x86_64_sse.nb310nf_end
        ## non-zero, do one more workunit
        jmp _nb_kernel310nf_x86_64_sse.nb310nf_threadloop
_nb_kernel310nf_x86_64_sse.nb310nf_end:

        movl nb310nf_nouter(%rsp),%eax
        movl nb310nf_ninner(%rsp),%ebx
        movq nb310nf_outeriter(%rbp),%rcx
        movq nb310nf_inneriter(%rbp),%rdx
        movl %eax,(%rcx)
        movl %ebx,(%rdx)

        addq $312,%rsp
        emms

        pop %r15
        pop %r14
        pop %r13
        pop %r12

        pop %rbx
        pop %rbp
        ret