## src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel410_x86_64_sse.s
1 ##
2 ##
3 ## Gromacs 4.0 Copyright (c) 1991-2003
4 ## David van der Spoel, Erik Lindahl
5 ##
6 ## This program is free software; you can redistribute it and/or
7 ## modify it under the terms of the GNU General Public License
8 ## as published by the Free Software Foundation; either version 2
9 ## of the License, or (at your option) any later version.
11 ## To help us fund GROMACS development, we humbly ask that you cite
12 ## the research papers on the package. Check out http://www.gromacs.org
13 ##
14 ## And Hey:
15 ## Gnomes, ROck Monsters And Chili Sauce
23 .globl nb_kernel410_x86_64_sse
24 .globl _nb_kernel410_x86_64_sse
25 nb_kernel410_x86_64_sse:
26 _nb_kernel410_x86_64_sse:
27 ## Room for return address and rbp (16 bytes)
28 .set nb410_fshift, 16
29 .set nb410_gid, 24
30 .set nb410_pos, 32
31 .set nb410_faction, 40
32 .set nb410_charge, 48
33 .set nb410_p_facel, 56
34 .set nb410_argkrf, 64
35 .set nb410_argcrf, 72
36 .set nb410_Vc, 80
37 .set nb410_type, 88
38 .set nb410_p_ntype, 96
39 .set nb410_vdwparam, 104
40 .set nb410_Vvdw, 112
41 .set nb410_p_tabscale, 120
42 .set nb410_VFtab, 128
43 .set nb410_invsqrta, 136
44 .set nb410_dvda, 144
45 .set nb410_p_gbtabscale, 152
46 .set nb410_GBtab, 160
47 .set nb410_p_nthreads, 168
48 .set nb410_count, 176
49 .set nb410_mtx, 184
50 .set nb410_outeriter, 192
51 .set nb410_inneriter, 200
52 .set nb410_work, 208
53 ## stack offsets for local variables
54 ## bottom of stack is 16-byte aligned for SSE use
55 .set nb410_ix, 0
56 .set nb410_iy, 16
57 .set nb410_iz, 32
58 .set nb410_iq, 48
59 .set nb410_dx, 64
60 .set nb410_dy, 80
61 .set nb410_dz, 96
62 .set nb410_two, 112
63 .set nb410_six, 128
64 .set nb410_twelve, 144
65 .set nb410_gbtsc, 160
66 .set nb410_qq, 176
67 .set nb410_c6, 192
68 .set nb410_c12, 208
69 .set nb410_fscal, 224
70 .set nb410_vctot, 240
71 .set nb410_Vvdwtot, 256
72 .set nb410_fix, 272
73 .set nb410_fiy, 288
74 .set nb410_fiz, 304
75 .set nb410_half, 320
76 .set nb410_three, 336
77 .set nb410_r, 352
78 .set nb410_isai, 368
79 .set nb410_isaprod, 384
80 .set nb410_dvdasum, 400
81 .set nb410_gbscale, 416
82 .set nb410_nri, 432
83 .set nb410_iinr, 440
84 .set nb410_jindex, 448
85 .set nb410_jjnr, 456
86 .set nb410_shift, 464
87 .set nb410_shiftvec, 472
88 .set nb410_facel, 480
89 .set nb410_innerjjnr, 488
90 .set nb410_is3, 496
91 .set nb410_ii3, 500
92 .set nb410_ii, 504
93 .set nb410_ntia, 508
94 .set nb410_innerk, 512
95 .set nb410_n, 516
96 .set nb410_nn1, 520
97 .set nb410_ntype, 524
98 .set nb410_nouter, 528
99 .set nb410_ninner, 532
100 .set nb410_jnra, 536
101 .set nb410_jnrb, 540
102 .set nb410_jnrc, 544
103 .set nb410_jnrd, 548
105 push %rbp
106 movq %rsp,%rbp
107 push %rbx
110 emms
112 push %r12
113 push %r13
114 push %r14
115 push %r15
117 subq $568,%rsp ## local variable stack space (n*16+8)
119 ## zero 32-bit iteration counters
120 movl $0,%eax
121 movl %eax,nb410_nouter(%rsp)
122 movl %eax,nb410_ninner(%rsp)
124 movl (%rdi),%edi
125 movl %edi,nb410_nri(%rsp)
126 movq %rsi,nb410_iinr(%rsp)
127 movq %rdx,nb410_jindex(%rsp)
128 movq %rcx,nb410_jjnr(%rsp)
129 movq %r8,nb410_shift(%rsp)
130 movq %r9,nb410_shiftvec(%rsp)
131 movq nb410_p_ntype(%rbp),%rdi
132 movl (%rdi),%edi
133 movl %edi,nb410_ntype(%rsp)
134 movq nb410_p_facel(%rbp),%rsi
135 movss (%rsi),%xmm0
136 movss %xmm0,nb410_facel(%rsp)
138 movq nb410_p_gbtabscale(%rbp),%rbx
139 movss (%rbx),%xmm4
140 shufps $0,%xmm4,%xmm4
141 movaps %xmm4,nb410_gbtsc(%rsp)
144 ## create constant floating-point factors on stack
145 movl $0x3f000000,%eax ## 0.5 in IEEE-754 single precision (hex)
146 movl %eax,nb410_half(%rsp)
147 movss nb410_half(%rsp),%xmm1
148 shufps $0,%xmm1,%xmm1 ## splat to all elements
149 movaps %xmm1,%xmm2
150 addps %xmm2,%xmm2 ## one
151 movaps %xmm2,%xmm3
152 addps %xmm2,%xmm2 ## two
153 addps %xmm2,%xmm3 ## three
154 movaps %xmm3,%xmm4
155 addps %xmm4,%xmm4 ## six
156 movaps %xmm4,%xmm5
157 addps %xmm5,%xmm5 ## twelve
158 movaps %xmm1,nb410_half(%rsp)
159 movaps %xmm2,nb410_two(%rsp)
160 movaps %xmm3,nb410_three(%rsp)
161 movaps %xmm4,nb410_six(%rsp)
162 movaps %xmm5,nb410_twelve(%rsp)
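## the factors two/three/six/twelve above are built from 0.5 by repeated additions,
## each splatted across all four SSE lanes before being stored on the stack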
164 _nb_kernel410_x86_64_sse.nb410_threadloop:
165 movq nb410_count(%rbp),%rsi ## pointer to sync counter
166 movl (%rsi),%eax
167 _nb_kernel410_x86_64_sse.nb410_spinlock:
168 movl %eax,%ebx ## ebx=*count=nn0
169 addl $1,%ebx ## ebx=nn1=nn0+1
170 lock
171 cmpxchgl %ebx,(%rsi) ## write nn1 to *counter,
172 ## if it hasn't changed.
173 ## or reread *counter to eax.
174 pause ## -> better p4 performance
175 jnz _nb_kernel410_x86_64_sse.nb410_spinlock
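## work distribution: lock cmpxchg atomically advances the shared counter by one;
## if another thread changed it first, eax is reloaded with the current value and
## the claim is retried, so each thread takes one outer-list index (nn0) at a time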
177 ## if(nn1>nri) nn1=nri
178 movl nb410_nri(%rsp),%ecx
179 movl %ecx,%edx
180 subl %ebx,%ecx
181 cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri
182 ## Cleared the spinlock if we got here.
183 ## eax contains nn0, ebx contains nn1.
184 movl %eax,nb410_n(%rsp)
185 movl %ebx,nb410_nn1(%rsp)
186 subl %eax,%ebx ## calc number of outer lists
187 movl %eax,%esi ## copy n to esi
188 jg _nb_kernel410_x86_64_sse.nb410_outerstart
189 jmp _nb_kernel410_x86_64_sse.nb410_end
191 _nb_kernel410_x86_64_sse.nb410_outerstart:
192 ## ebx contains number of outer iterations
193 addl nb410_nouter(%rsp),%ebx
194 movl %ebx,nb410_nouter(%rsp)
196 _nb_kernel410_x86_64_sse.nb410_outer:
197 movq nb410_shift(%rsp),%rax ## rax = pointer into shift[]
198 movl (%rax,%rsi,4),%ebx ## ebx=shift[n]
200 lea (%rbx,%rbx,2),%rbx ## rbx=3*is
201 movl %ebx,nb410_is3(%rsp) ## store is3
203 movq nb410_shiftvec(%rsp),%rax ## rax = base of shiftvec[]
205 movss (%rax,%rbx,4),%xmm0
206 movss 4(%rax,%rbx,4),%xmm1
207 movss 8(%rax,%rbx,4),%xmm2
209 movq nb410_iinr(%rsp),%rcx ## rcx = pointer into iinr[]
210 movl (%rcx,%rsi,4),%ebx ## ebx =ii
211 movl %ebx,nb410_ii(%rsp)
213 movq nb410_charge(%rbp),%rdx
214 movss (%rdx,%rbx,4),%xmm3
215 mulss nb410_facel(%rsp),%xmm3
216 shufps $0,%xmm3,%xmm3
218 movq nb410_invsqrta(%rbp),%rdx ## load invsqrta[ii]
219 movss (%rdx,%rbx,4),%xmm4
220 shufps $0,%xmm4,%xmm4
222 movq nb410_type(%rbp),%rdx
223 movl (%rdx,%rbx,4),%edx
224 imull nb410_ntype(%rsp),%edx
225 shll %edx
226 movl %edx,nb410_ntia(%rsp)
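## ntia = 2*ntype*type[ii]: row offset into vdwparam[], which holds a c6,c12 pair
## per (itype,jtype) combination; 2*type[j] is added per j atom in the inner loop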
228 lea (%rbx,%rbx,2),%rbx ## rbx = 3*ii=ii3
229 movq nb410_pos(%rbp),%rax ## rax = base of pos[]
231 addss (%rax,%rbx,4),%xmm0
232 addss 4(%rax,%rbx,4),%xmm1
233 addss 8(%rax,%rbx,4),%xmm2
235 movaps %xmm3,nb410_iq(%rsp)
236 movaps %xmm4,nb410_isai(%rsp)
238 shufps $0,%xmm0,%xmm0
239 shufps $0,%xmm1,%xmm1
240 shufps $0,%xmm2,%xmm2
242 movaps %xmm0,nb410_ix(%rsp)
243 movaps %xmm1,nb410_iy(%rsp)
244 movaps %xmm2,nb410_iz(%rsp)
246 movl %ebx,nb410_ii3(%rsp)
248 ## clear vctot and i forces
249 xorps %xmm13,%xmm13
250 movaps %xmm13,%xmm12
251 movaps %xmm13,nb410_Vvdwtot(%rsp)
252 movaps %xmm13,nb410_dvdasum(%rsp)
253 movaps %xmm13,%xmm14
254 movaps %xmm13,%xmm15
256 movq nb410_jindex(%rsp),%rax
257 movl (%rax,%rsi,4),%ecx ## jindex[n]
258 movl 4(%rax,%rsi,4),%edx ## jindex[n+1]
259 subl %ecx,%edx ## number of innerloop atoms
261 movq nb410_pos(%rbp),%rsi
262 movq nb410_faction(%rbp),%rdi
263 movq nb410_jjnr(%rsp),%rax
264 shll $2,%ecx
265 addq %rcx,%rax
266 movq %rax,nb410_innerjjnr(%rsp) ## pointer to jjnr[nj0]
267 movl %edx,%ecx
268 subl $4,%edx
269 addl nb410_ninner(%rsp),%ecx
270 movl %ecx,nb410_ninner(%rsp)
271 addl $0,%edx
272 movl %edx,nb410_innerk(%rsp) ## number of innerloop atoms
273 jge _nb_kernel410_x86_64_sse.nb410_unroll_loop
274 jmp _nb_kernel410_x86_64_sse.nb410_finish_inner
275 _nb_kernel410_x86_64_sse.nb410_unroll_loop:
276 ## quad-unroll innerloop here
277 movq nb410_innerjjnr(%rsp),%rdx ## pointer to jjnr[k]
278 movl (%rdx),%eax
279 movl 4(%rdx),%ebx
280 movl 8(%rdx),%ecx
281 movl 12(%rdx),%edx ## eax-edx=jnr1-4
283 addq $16,nb410_innerjjnr(%rsp) ## advance pointer (unrolled 4)
285 ## load isaj
286 movq nb410_invsqrta(%rbp),%rsi
287 movss (%rsi,%rax,4),%xmm3
288 movss (%rsi,%rcx,4),%xmm4
289 movss (%rsi,%rbx,4),%xmm6
290 movss (%rsi,%rdx,4),%xmm7
291 movaps nb410_isai(%rsp),%xmm2
292 shufps $0,%xmm6,%xmm3
293 shufps $0,%xmm7,%xmm4
294 shufps $136,%xmm4,%xmm3 ## 10001000 ;# all isaj in xmm3
295 mulps %xmm3,%xmm2
297 movaps %xmm2,nb410_isaprod(%rsp)
298 movaps %xmm2,%xmm1
299 mulps nb410_gbtsc(%rsp),%xmm1
300 movaps %xmm1,nb410_gbscale(%rsp)
302 movq nb410_charge(%rbp),%rsi ## base of charge[]
304 movss (%rsi,%rax,4),%xmm3
305 movss (%rsi,%rcx,4),%xmm4
306 movss (%rsi,%rbx,4),%xmm6
307 movss (%rsi,%rdx,4),%xmm7
309 mulps nb410_iq(%rsp),%xmm2
310 shufps $0,%xmm6,%xmm3
311 shufps $0,%xmm7,%xmm4
312 shufps $136,%xmm4,%xmm3 ## 10001000 ;# all charges in xmm3
313 mulps %xmm2,%xmm3
314 movaps %xmm3,nb410_qq(%rsp)
316 ## vdw parameters
317 movq nb410_type(%rbp),%rsi
318 movl (%rsi,%rax,4),%r12d
319 movl (%rsi,%rbx,4),%r13d
320 movl (%rsi,%rcx,4),%r14d
321 movl (%rsi,%rdx,4),%r15d
322 shll %r12d
323 shll %r13d
324 shll %r14d
325 shll %r15d
326 movl nb410_ntia(%rsp),%edi
327 addl %edi,%r12d
328 addl %edi,%r13d
329 addl %edi,%r14d
330 addl %edi,%r15d
332 movq nb410_vdwparam(%rbp),%rsi
333 movlps (%rsi,%r12,4),%xmm3
334 movlps (%rsi,%r14,4),%xmm7
335 movhps (%rsi,%r13,4),%xmm3
336 movhps (%rsi,%r15,4),%xmm7
338 movaps %xmm3,%xmm0
339 shufps $136,%xmm7,%xmm0 ## 10001000
340 shufps $221,%xmm7,%xmm3 ## 11011101
342 movaps %xmm0,nb410_c6(%rsp)
343 movaps %xmm3,nb410_c12(%rsp)
345 movq nb410_pos(%rbp),%rsi ## base of pos[]
347 lea (%rax,%rax,2),%r8 ## j3 = 3*jnr
348 lea (%rbx,%rbx,2),%r9
349 lea (%rcx,%rcx,2),%r10
350 lea (%rdx,%rdx,2),%r11
352 ## move four coordinates to xmm0-xmm2
353 movlps (%rsi,%r8,4),%xmm4
354 movlps (%rsi,%r10,4),%xmm5
355 movss 8(%rsi,%r8,4),%xmm2
356 movss 8(%rsi,%r10,4),%xmm6
358 movhps (%rsi,%r9,4),%xmm4
359 movhps (%rsi,%r11,4),%xmm5
361 movss 8(%rsi,%r9,4),%xmm0
362 movss 8(%rsi,%r11,4),%xmm1
364 shufps $0,%xmm0,%xmm2
365 shufps $0,%xmm1,%xmm6
367 movaps %xmm4,%xmm0
368 movaps %xmm4,%xmm1
370 shufps $136,%xmm6,%xmm2 ## 10001000
372 shufps $136,%xmm5,%xmm0 ## 10001000
373 shufps $221,%xmm5,%xmm1 ## 11011101
375 ## calc dr
376 subps nb410_ix(%rsp),%xmm0
377 subps nb410_iy(%rsp),%xmm1
378 subps nb410_iz(%rsp),%xmm2
380 ## store dr
381 movaps %xmm0,nb410_dx(%rsp)
382 movaps %xmm1,nb410_dy(%rsp)
383 movaps %xmm2,nb410_dz(%rsp)
385 ## square it
386 mulps %xmm0,%xmm0
387 mulps %xmm1,%xmm1
388 mulps %xmm2,%xmm2
389 addps %xmm1,%xmm0
390 addps %xmm2,%xmm0
391 movaps %xmm0,%xmm4
392 ## rsq in xmm4
394 rsqrtps %xmm4,%xmm5
395 ## lookup seed in xmm5
396 movaps %xmm5,%xmm2
397 mulps %xmm5,%xmm5
398 movaps nb410_three(%rsp),%xmm1
399 mulps %xmm4,%xmm5 ## rsq*lu*lu
400 movaps nb410_half(%rsp),%xmm0
401 subps %xmm5,%xmm1 ## 3.0-rsq*lu*lu
402 mulps %xmm2,%xmm1
403 mulps %xmm1,%xmm0 ## xmm0=rinv
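## one Newton-Raphson refinement of the rsqrtps seed lu: rinv = 0.5*lu*(3.0 - rsq*lu*lu)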
404 mulps %xmm0,%xmm4 ## xmm4=r
405 movaps %xmm4,nb410_r(%rsp)
406 mulps nb410_gbscale(%rsp),%xmm4
408 ## truncate and convert to integers
409 cvttps2dq %xmm4,%xmm5
411 ## convert back to float
412 cvtdq2ps %xmm5,%xmm6
414 ## multiply by 4
415 pslld $2,%xmm5
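## each GB table point stores 4 floats (Y,F,G,H), so the integer index is
## multiplied by 4 to address the start of the entry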
417 ## move to integer registers
418 movhlps %xmm5,%xmm7
419 movd %xmm5,%r12d
420 movd %xmm7,%r14d
421 pshufd $1,%xmm5,%xmm5
422 pshufd $1,%xmm7,%xmm7
423 movd %xmm5,%r13d
424 movd %xmm7,%r15d
426 ## calculate eps
427 subps %xmm6,%xmm4
428 movaps %xmm4,%xmm1 ##eps
430 movq nb410_GBtab(%rbp),%rsi
432 movaps %xmm0,%xmm9 ## rinv
433 mulps %xmm9,%xmm9 ## rinvsq
434 movaps %xmm9,%xmm10 ## rinvsq
435 mulps %xmm10,%xmm10 ## rinv4
436 mulps %xmm9,%xmm10 ## rinv6
437 movaps %xmm10,%xmm11
438 mulps %xmm11,%xmm11 ## rinv12
440 ## load table data
441 movlps (%rsi,%r12,4),%xmm5
442 movlps (%rsi,%r14,4),%xmm7
443 movhps (%rsi,%r13,4),%xmm5
444 movhps (%rsi,%r15,4),%xmm7
446 movaps %xmm5,%xmm4
447 shufps $136,%xmm7,%xmm4 ## 10001000
448 shufps $221,%xmm7,%xmm5 ## 11011101
450 mulps nb410_c6(%rsp),%xmm10 ## vvdw6=c6*rinv6
451 mulps nb410_c12(%rsp),%xmm11 ## vvdw12=c12*rinv12
453 movaps %xmm11,%xmm9
454 subps %xmm10,%xmm11 ## Vvdw=Vvdw12-Vvdw6
456 ## add potential to vvdwtot
457 addps nb410_Vvdwtot(%rsp),%xmm11
458 movaps %xmm11,nb410_Vvdwtot(%rsp)
460 movlps 8(%rsi,%r12,4),%xmm7
461 movlps 8(%rsi,%r14,4),%xmm8
462 movhps 8(%rsi,%r13,4),%xmm7
463 movhps 8(%rsi,%r15,4),%xmm8
465 movaps %xmm7,%xmm6
467 shufps $136,%xmm8,%xmm6 ## 10001000
468 shufps $221,%xmm8,%xmm7 ## 11011101
469 ## table data ready in xmm4-xmm7
471 mulps %xmm1,%xmm7 ## Heps
472 mulps %xmm1,%xmm6 ## xmm6=Geps
473 mulps %xmm1,%xmm7 ## Heps2
474 addps %xmm6,%xmm5
475 addps %xmm7,%xmm5 ## xmm5=Fp
476 addps %xmm7,%xmm7 ## two*Heps2
477 movaps nb410_qq(%rsp),%xmm3
478 addps %xmm6,%xmm7
479 addps %xmm5,%xmm7 ## xmm7=FF
480 mulps %xmm1,%xmm5 ## xmm5=eps*Fp
481 addps %xmm4,%xmm5 ## xmm5=VV
482 mulps %xmm3,%xmm5 ## vcoul=qq*VV
483 mulps %xmm7,%xmm3 ## fijC=FF*qq
484 ## at this point xmm5 contains vcoul and xmm3 fijC
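## table interpolation (cubic spline): with eps the table fraction,
##   Fp = F + G*eps + H*eps^2,  VV = Y + eps*Fp,  FF = F + 2*G*eps + 3*H*eps^2
## giving the potential vcoul = qq*VV and the table force derivative fijC = qq*FF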
486 ## LJ forces
487 mulps nb410_six(%rsp),%xmm10
488 mulps nb410_twelve(%rsp),%xmm9
489 subps %xmm10,%xmm9
490 mulps %xmm0,%xmm9 ## (12*vnb12-6*vnb6)*rinv
492 movq nb410_dvda(%rbp),%rsi
494 ## Calculate dVda
495 xorps %xmm7,%xmm7
496 mulps nb410_gbscale(%rsp),%xmm3
497 movaps %xmm3,%xmm6
498 mulps nb410_r(%rsp),%xmm6
499 addps %xmm5,%xmm6
501 ## increment vctot (sum in xmm12)
502 addps %xmm5,%xmm12
504 ## xmm6=(vcoul+fijC*r)
505 subps %xmm6,%xmm7
506 movaps %xmm7,%xmm6
508 ## update dvdasum
509 addps nb410_dvdasum(%rsp),%xmm7
510 movaps %xmm7,nb410_dvdasum(%rsp)
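## Generalized Born: each pair contributes -(vcoul + fijC*r) to dvda; it is
## accumulated for the i atom in dvdasum here and scattered to dvda[j] below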
512 ## update j atoms dvdaj
513 movhlps %xmm6,%xmm7
514 movaps %xmm6,%xmm5
515 movaps %xmm7,%xmm4
516 shufps $0x1,%xmm5,%xmm5
517 shufps $0x1,%xmm4,%xmm4
519 ## xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4
520 addss (%rsi,%rax,4),%xmm6
521 addss (%rsi,%rbx,4),%xmm5
522 addss (%rsi,%rcx,4),%xmm7
523 addss (%rsi,%rdx,4),%xmm4
524 movss %xmm6,(%rsi,%rax,4)
525 movss %xmm5,(%rsi,%rbx,4)
526 movss %xmm7,(%rsi,%rcx,4)
527 movss %xmm4,(%rsi,%rdx,4)
529 subps %xmm3,%xmm9
530 mulps %xmm0,%xmm9 ## fscal
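## total scalar force: fscal = ((12*Vvdw12 - 6*Vvdw6)*rinv - fijC)*rinv,
## multiplied by dx/dy/dz below to give the Cartesian force components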
532 movaps %xmm9,%xmm10
533 movaps %xmm9,%xmm11
535 mulps nb410_dx(%rsp),%xmm9
536 mulps nb410_dy(%rsp),%xmm10
537 mulps nb410_dz(%rsp),%xmm11
539 ## accumulate i forces
540 addps %xmm9,%xmm13
541 addps %xmm10,%xmm14
542 addps %xmm11,%xmm15
544 movq nb410_faction(%rbp),%rsi
545 ## the fj's - start by accumulating x & y forces from memory
546 movlps (%rsi,%r8,4),%xmm0 ## x1 y1 - -
547 movlps (%rsi,%r10,4),%xmm1 ## x3 y3 - -
548 movhps (%rsi,%r9,4),%xmm0 ## x1 y1 x2 y2
549 movhps (%rsi,%r11,4),%xmm1 ## x3 y3 x4 y4
551 movaps %xmm9,%xmm8
552 unpcklps %xmm10,%xmm9 ## x1 y1 x2 y2
553 unpckhps %xmm10,%xmm8 ## x3 y3 x4 y4
555 ## update fjx and fjy
556 addps %xmm9,%xmm0
557 addps %xmm8,%xmm1
559 movlps %xmm0,(%rsi,%r8,4)
560 movlps %xmm1,(%rsi,%r10,4)
561 movhps %xmm0,(%rsi,%r9,4)
562 movhps %xmm1,(%rsi,%r11,4)
564 ## xmm11: fjz1 fjz2 fjz3 fjz4
565 pshufd $1,%xmm11,%xmm10 ## fjz2 - - -
566 movhlps %xmm11,%xmm9 ## fjz3 - - -
567 pshufd $3,%xmm11,%xmm8 ## fjz4 - - -
569 addss 8(%rsi,%r8,4),%xmm11
570 addss 8(%rsi,%r9,4),%xmm10
571 addss 8(%rsi,%r10,4),%xmm9
572 addss 8(%rsi,%r11,4),%xmm8
573 movss %xmm11,8(%rsi,%r8,4)
574 movss %xmm10,8(%rsi,%r9,4)
575 movss %xmm9,8(%rsi,%r10,4)
576 movss %xmm8,8(%rsi,%r11,4)
578 ## should we do one more iteration?
579 subl $4,nb410_innerk(%rsp)
580 jl _nb_kernel410_x86_64_sse.nb410_finish_inner
581 jmp _nb_kernel410_x86_64_sse.nb410_unroll_loop
582 _nb_kernel410_x86_64_sse.nb410_finish_inner:
583 ## check if at least two particles remain
584 addl $4,nb410_innerk(%rsp)
585 movl nb410_innerk(%rsp),%edx
586 andl $2,%edx
587 jnz _nb_kernel410_x86_64_sse.nb410_dopair
588 jmp _nb_kernel410_x86_64_sse.nb410_checksingle
589 _nb_kernel410_x86_64_sse.nb410_dopair:
590 movq nb410_innerjjnr(%rsp),%rcx
592 movl (%rcx),%eax
593 movl 4(%rcx),%ebx
594 addq $8,nb410_innerjjnr(%rsp)
596 ## load isaj
597 movq nb410_invsqrta(%rbp),%rsi
598 movss (%rsi,%rax,4),%xmm2
599 movss (%rsi,%rbx,4),%xmm6
600 unpcklps %xmm6,%xmm2
602 mulps nb410_isai(%rsp),%xmm2
604 movaps %xmm2,nb410_isaprod(%rsp)
605 movaps %xmm2,%xmm1
606 mulps nb410_gbtsc(%rsp),%xmm1
607 movaps %xmm1,nb410_gbscale(%rsp)
609 mulps nb410_iq(%rsp),%xmm2
610 movq nb410_charge(%rbp),%rsi ## base of charge[]
611 movss (%rsi,%rax,4),%xmm3
612 movss (%rsi,%rbx,4),%xmm6
613 unpcklps %xmm6,%xmm3
616 mulps %xmm2,%xmm3
617 movaps %xmm3,nb410_qq(%rsp)
619 ## vdw parameters
620 movq nb410_type(%rbp),%rsi
621 movl (%rsi,%rax,4),%r12d
622 movl (%rsi,%rbx,4),%r13d
623 shll %r12d
624 shll %r13d
625 movl nb410_ntia(%rsp),%edi
626 addl %edi,%r12d
627 addl %edi,%r13d
629 movq nb410_vdwparam(%rbp),%rsi
630 movlps (%rsi,%r12,4),%xmm3
631 movhps (%rsi,%r13,4),%xmm3
633 xorps %xmm7,%xmm7
634 movaps %xmm3,%xmm0
635 shufps $136,%xmm7,%xmm0 ## 10001000
636 shufps $221,%xmm7,%xmm3 ## 11011101
638 movaps %xmm0,nb410_c6(%rsp)
639 movaps %xmm3,nb410_c12(%rsp)
641 movq nb410_pos(%rbp),%rsi ## base of pos[]
643 lea (%rax,%rax,2),%r8 ## j3
644 lea (%rbx,%rbx,2),%r9
646 ## move two j coordinates to xmm4-xmm6
647 movlps (%rsi,%r8,4),%xmm4 ## x1 y1 - -
648 movlps (%rsi,%r9,4),%xmm5 ## x2 y2 - -
650 movss 8(%rsi,%r8,4),%xmm6 ## z1 - - -
651 movss 8(%rsi,%r9,4),%xmm7 ## z2 - - -
653 unpcklps %xmm5,%xmm4 ## x1 x2 y1 y2
654 movhlps %xmm4,%xmm5 ## y1 y2 - -
655 unpcklps %xmm7,%xmm6 ## z1 z2 - -
657 ## calc dr
658 subps nb410_ix(%rsp),%xmm4
659 subps nb410_iy(%rsp),%xmm5
660 subps nb410_iz(%rsp),%xmm6
662 ## store dr
663 movaps %xmm4,nb410_dx(%rsp)
664 movaps %xmm5,nb410_dy(%rsp)
665 movaps %xmm6,nb410_dz(%rsp)
667 ## square it
668 mulps %xmm4,%xmm4
669 mulps %xmm5,%xmm5
670 mulps %xmm6,%xmm6
671 addps %xmm5,%xmm4
672 addps %xmm6,%xmm4
673 ## rsq in xmm4
675 rsqrtps %xmm4,%xmm5
676 ## lookup seed in xmm5
677 movaps %xmm5,%xmm2
678 mulps %xmm5,%xmm5
679 movaps nb410_three(%rsp),%xmm1
680 mulps %xmm4,%xmm5 ## rsq*lu*lu
681 movaps nb410_half(%rsp),%xmm0
682 subps %xmm5,%xmm1 ## 3.0-rsq*lu*lu
683 mulps %xmm2,%xmm1
684 mulps %xmm1,%xmm0 ## xmm0=rinv
685 mulps %xmm0,%xmm4 ## xmm4=r
686 movaps %xmm4,nb410_r(%rsp)
687 mulps nb410_gbscale(%rsp),%xmm4
689 ## truncate and convert to integers
690 cvttps2dq %xmm4,%xmm5
692 ## convert back to float
693 cvtdq2ps %xmm5,%xmm6
695 ## multiply by 4
696 pslld $2,%xmm5
698 ## move to integer registers
699 movd %xmm5,%r12d
700 pshufd $1,%xmm5,%xmm5
701 movd %xmm5,%r13d
703 ## calculate eps
704 subps %xmm6,%xmm4
705 movaps %xmm4,%xmm1 ##eps
707 movq nb410_GBtab(%rbp),%rsi
709 movaps %xmm0,%xmm9 ## rinv
710 mulps %xmm9,%xmm9 ## rinvsq
711 movaps %xmm9,%xmm10 ## rinvsq
712 mulps %xmm10,%xmm10 ## rinv4
713 mulps %xmm9,%xmm10 ## rinv6
714 movaps %xmm10,%xmm11
715 mulps %xmm11,%xmm11 ## rinv12
717 ## load table data
718 movlps (%rsi,%r12,4),%xmm4 ## Y1 F1
719 movlps (%rsi,%r13,4),%xmm5 ## Y2 F2
720 unpcklps %xmm5,%xmm4 ## Y1 Y2 F1 F2
721 movhlps %xmm4,%xmm5 ## F1 F2
723 mulps nb410_c6(%rsp),%xmm10 ## vvdw6=c6*rinv6
724 mulps nb410_c12(%rsp),%xmm11 ## vvdw12=c12*rinv12
726 movaps %xmm11,%xmm9
727 subps %xmm10,%xmm11 ## Vvdw=Vvdw12-Vvdw6
729 ## add potential to vvdwtot
730 addps nb410_Vvdwtot(%rsp),%xmm11
731 movlps %xmm11,nb410_Vvdwtot(%rsp)
733 movlps 8(%rsi,%r12,4),%xmm6 ## G1 H1
734 movlps 8(%rsi,%r13,4),%xmm7 ## G2 H2
735 unpcklps %xmm7,%xmm6 ## G1 G2
736 movhlps %xmm6,%xmm7 ## H1 H2
737 ## table data ready in xmm4-xmm7
739 mulps %xmm1,%xmm7 ## Heps
740 mulps %xmm1,%xmm6 ## xmm6=Geps
741 mulps %xmm1,%xmm7 ## Heps2
742 addps %xmm6,%xmm5
743 addps %xmm7,%xmm5 ## xmm5=Fp
744 addps %xmm7,%xmm7 ## two*Heps2
745 movaps nb410_qq(%rsp),%xmm3
747 addps %xmm6,%xmm7
748 addps %xmm5,%xmm7 ## xmm7=FF
749 mulps %xmm1,%xmm5 ## xmm5=eps*Fp
750 addps %xmm4,%xmm5 ## xmm5=VV
751 mulps %xmm3,%xmm5 ## vcoul=qq*VV
752 mulps %xmm7,%xmm3 ## fijC=FF*qq
753 ## at this point xmm5 contains vcoul and xmm3 fijC
755 ## LJ forces
756 mulps nb410_six(%rsp),%xmm10
757 mulps nb410_twelve(%rsp),%xmm9
758 subps %xmm10,%xmm9
759 mulps %xmm0,%xmm9 ## (12*vnb12-6*vnb6)*rinv
761 ## zero upper part of vcoul
762 xorps %xmm2,%xmm2
763 movlhps %xmm2,%xmm5
765 movq nb410_dvda(%rbp),%rsi
767 ## Calculate dVda
768 xorps %xmm7,%xmm7
769 mulps nb410_gbscale(%rsp),%xmm3
770 movaps %xmm3,%xmm6
771 mulps nb410_r(%rsp),%xmm6
772 addps %xmm5,%xmm6
774 xorps %xmm4,%xmm4
775 ## increment vctot (sum in xmm12)
776 addps %xmm5,%xmm12
778 ## xmm6=(vcoul+fijC*r)
779 subps %xmm6,%xmm7
780 movaps %xmm7,%xmm6
782 ## zero upper half of dvda
783 movlhps %xmm4,%xmm7
785 ## update dvdasum
786 addps nb410_dvdasum(%rsp),%xmm7
787 movaps %xmm7,nb410_dvdasum(%rsp)
789 ## update j atoms dvdaj
790 movaps %xmm6,%xmm5
791 shufps $0x1,%xmm5,%xmm5
793 ## xmm6=dvdaj1 xmm5=dvdaj2
794 addss (%rsi,%rax,4),%xmm6
795 addss (%rsi,%rbx,4),%xmm5
796 movss %xmm6,(%rsi,%rax,4)
797 movss %xmm5,(%rsi,%rbx,4)
799 xorps %xmm7,%xmm7
801 subps %xmm3,%xmm9
802 mulps %xmm0,%xmm9 ## fscal
804 movaps %xmm9,%xmm10
805 movaps %xmm9,%xmm11
807 mulps nb410_dx(%rsp),%xmm9
808 mulps nb410_dy(%rsp),%xmm10
809 mulps nb410_dz(%rsp),%xmm11
811 movlhps %xmm7,%xmm9
812 movlhps %xmm7,%xmm10
813 movlhps %xmm7,%xmm11
815 ## accumulate i forces
816 addps %xmm9,%xmm13
817 addps %xmm10,%xmm14
818 addps %xmm11,%xmm15
820 movq nb410_faction(%rbp),%rsi
821 ## the fj's - start by accumulating x & y forces from memory
822 movlps (%rsi,%r8,4),%xmm0 ## x1 y1 - -
823 movhps (%rsi,%r9,4),%xmm0 ## x1 y1 x2 y2
825 unpcklps %xmm10,%xmm9 ## x1 y1 x2 y2
826 addps %xmm9,%xmm0
828 movlps %xmm0,(%rsi,%r8,4)
829 movhps %xmm0,(%rsi,%r9,4)
831 ## z forces
832 pshufd $1,%xmm11,%xmm8
833 addss 8(%rsi,%r8,4),%xmm11
834 addss 8(%rsi,%r9,4),%xmm8
835 movss %xmm11,8(%rsi,%r8,4)
836 movss %xmm8,8(%rsi,%r9,4)
838 _nb_kernel410_x86_64_sse.nb410_checksingle:
839 movl nb410_innerk(%rsp),%edx
840 andl $1,%edx
841 jnz _nb_kernel410_x86_64_sse.nb410_dosingle
842 jmp _nb_kernel410_x86_64_sse.nb410_updateouterdata
843 _nb_kernel410_x86_64_sse.nb410_dosingle:
844 movq nb410_charge(%rbp),%rsi
845 movq nb410_invsqrta(%rbp),%rdx
846 movq nb410_pos(%rbp),%rdi
847 movq nb410_innerjjnr(%rsp),%rcx
848 movl (%rcx),%eax
850 ## load isaj
851 movq nb410_invsqrta(%rbp),%rsi
852 movss (%rsi,%rax,4),%xmm3
853 movaps nb410_isai(%rsp),%xmm2
854 mulss %xmm3,%xmm2
856 movss %xmm2,nb410_isaprod(%rsp)
857 movaps %xmm2,%xmm1
858 mulss nb410_gbtsc(%rsp),%xmm1
859 movss %xmm1,nb410_gbscale(%rsp)
861 mulss nb410_iq(%rsp),%xmm2
862 movq nb410_charge(%rbp),%rsi ## base of charge[]
864 movss (%rsi,%rax,4),%xmm3
865 mulss %xmm2,%xmm3
866 movss %xmm3,nb410_qq(%rsp)
868 ## vdw parameters
869 movq nb410_type(%rbp),%rsi
870 movl (%rsi,%rax,4),%r12d
871 shll %r12d
872 movl nb410_ntia(%rsp),%edi
873 addl %edi,%r12d
875 movq nb410_vdwparam(%rbp),%rsi
876 movss (%rsi,%r12,4),%xmm0
877 movss 4(%rsi,%r12,4),%xmm3
878 movaps %xmm0,nb410_c6(%rsp)
879 movaps %xmm3,nb410_c12(%rsp)
881 movq nb410_pos(%rbp),%rsi ## base of pos[]
883 lea (%rax,%rax,2),%r8 ## j3 = 3*jnr
885 ## move j coordinates to xmm4-xmm6
886 movss (%rsi,%r8,4),%xmm4
887 movss 4(%rsi,%r8,4),%xmm5
888 movss 8(%rsi,%r8,4),%xmm6
890 ## calc dr
891 subss nb410_ix(%rsp),%xmm4
892 subss nb410_iy(%rsp),%xmm5
893 subss nb410_iz(%rsp),%xmm6
895 ## store dr
896 movaps %xmm4,nb410_dx(%rsp)
897 movaps %xmm5,nb410_dy(%rsp)
898 movaps %xmm6,nb410_dz(%rsp)
900 ## square it
901 mulss %xmm4,%xmm4
902 mulss %xmm5,%xmm5
903 mulss %xmm6,%xmm6
904 addss %xmm5,%xmm4
905 addss %xmm6,%xmm4
906 ## rsq in xmm4
908 rsqrtss %xmm4,%xmm5
909 ## lookup seed in xmm5
910 movaps %xmm5,%xmm2
911 mulss %xmm5,%xmm5
912 movaps nb410_three(%rsp),%xmm1
913 mulss %xmm4,%xmm5 ## rsq*lu*lu
914 movaps nb410_half(%rsp),%xmm0
915 subss %xmm5,%xmm1 ## 3.0-rsq*lu*lu
916 mulss %xmm2,%xmm1
917 mulss %xmm1,%xmm0 ## xmm0=rinv
918 mulss %xmm0,%xmm4 ## xmm4=r
919 movaps %xmm4,nb410_r(%rsp)
920 mulss nb410_gbscale(%rsp),%xmm4
922 ## truncate and convert to integers
923 cvttss2si %xmm4,%r12d
925 ## convert back to float
926 cvtsi2ss %r12d,%xmm6
928 ## multiply by 4
929 shll $2,%r12d
931 ## calculate eps
932 subss %xmm6,%xmm4
933 movaps %xmm4,%xmm1 ##eps
935 movq nb410_GBtab(%rbp),%rsi
937 movaps %xmm0,%xmm9 ## rinv
938 mulss %xmm9,%xmm9 ## rinvsq
939 movaps %xmm9,%xmm10 ## rinvsq
940 mulss %xmm10,%xmm10 ## rinv4
941 mulss %xmm9,%xmm10 ## rinv6
942 movaps %xmm10,%xmm11
943 mulss %xmm11,%xmm11 ## rinv12
945 ## load table data
946 movss (%rsi,%r12,4),%xmm4
947 movss 4(%rsi,%r12,4),%xmm5
948 movss 8(%rsi,%r12,4),%xmm6
949 movss 12(%rsi,%r12,4),%xmm7
950 ## table data ready in xmm4-xmm7
952 mulss nb410_c6(%rsp),%xmm10 ## vvdw6=c6*rinv6
953 mulss nb410_c12(%rsp),%xmm11 ## vvdw12=c12*rinv12
955 movaps %xmm11,%xmm9
956 subss %xmm10,%xmm11 ## Vvdw=Vvdw12-Vvdw6
958 ## add potential to vvdwtot
959 addss nb410_Vvdwtot(%rsp),%xmm11
960 movss %xmm11,nb410_Vvdwtot(%rsp)
962 mulss %xmm1,%xmm7 ## Heps
963 mulss %xmm1,%xmm6 ## xmm6=Geps
964 mulss %xmm1,%xmm7 ## Heps2
965 addss %xmm6,%xmm5
966 addss %xmm7,%xmm5 ## xmm5=Fp
967 addss %xmm7,%xmm7 ## two*Heps2
968 movss nb410_qq(%rsp),%xmm3
969 addss %xmm6,%xmm7
970 addss %xmm5,%xmm7 ## xmm7=FF
971 mulss %xmm1,%xmm5 ## xmm5=eps*Fp
972 addss %xmm4,%xmm5 ## xmm5=VV
973 mulss %xmm3,%xmm5 ## vcoul=qq*VV
974 mulss %xmm7,%xmm3 ## fijC=FF*qq
975 ## at this point xmm5 contains vcoul and xmm3 fijC
977 ## LJ forces
978 mulss nb410_six(%rsp),%xmm10
979 mulss nb410_twelve(%rsp),%xmm9
980 subss %xmm10,%xmm9
981 mulss %xmm0,%xmm9 ## (12*vnb12-6*vnb6)*rinv
983 movq nb410_dvda(%rbp),%rsi
985 ## Calculate dVda
986 xorps %xmm7,%xmm7
987 mulss nb410_gbscale(%rsp),%xmm3
988 movaps %xmm3,%xmm6
989 mulss nb410_r(%rsp),%xmm6
990 addss %xmm5,%xmm6
992 ## increment vctot (sum in xmm12)
993 addss %xmm5,%xmm12
995 ## xmm6=(vcoul+fijC*r)
996 subss %xmm6,%xmm7
997 movaps %xmm7,%xmm6
999 ## update dvdasum
1000 addss nb410_dvdasum(%rsp),%xmm7
1001 movss %xmm7,nb410_dvdasum(%rsp)
1003 ## update j atoms dvdaj
1004 addss (%rsi,%rax,4),%xmm6
1005 movss %xmm6,(%rsi,%rax,4)
1007 subss %xmm3,%xmm9
1008 mulss %xmm0,%xmm9 ## fscal
1010 movaps %xmm9,%xmm10
1011 movaps %xmm9,%xmm11
1013 mulss nb410_dx(%rsp),%xmm9
1014 mulss nb410_dy(%rsp),%xmm10
1015 mulss nb410_dz(%rsp),%xmm11
1017 ## accumulate i forces
1018 addss %xmm9,%xmm13
1019 addss %xmm10,%xmm14
1020 addss %xmm11,%xmm15
1022 movq nb410_faction(%rbp),%rsi
1023 ## add to j forces
1024 addss (%rsi,%r8,4),%xmm9
1025 addss 4(%rsi,%r8,4),%xmm10
1026 addss 8(%rsi,%r8,4),%xmm11
1027 movss %xmm9,(%rsi,%r8,4)
1028 movss %xmm10,4(%rsi,%r8,4)
1029 movss %xmm11,8(%rsi,%r8,4)
1031 _nb_kernel410_x86_64_sse.nb410_updateouterdata:
1032 movl nb410_ii3(%rsp),%ecx
1033 movq nb410_faction(%rbp),%rdi
1034 movq nb410_fshift(%rbp),%rsi
1035 movl nb410_is3(%rsp),%edx
1037 ## accumulate i forces in xmm13, xmm14, xmm15
1038 movhlps %xmm13,%xmm0
1039 movhlps %xmm14,%xmm1
1040 movhlps %xmm15,%xmm2
1041 addps %xmm13,%xmm0
1042 addps %xmm14,%xmm1
1043 addps %xmm15,%xmm2
1044 movaps %xmm0,%xmm3
1045 movaps %xmm1,%xmm4
1046 movaps %xmm2,%xmm5
1047 shufps $1,%xmm3,%xmm3
1048 shufps $1,%xmm4,%xmm4
1049 shufps $1,%xmm5,%xmm5
1050 addss %xmm3,%xmm0
1051 addss %xmm4,%xmm1
1052 addss %xmm5,%xmm2 ## xmm0-xmm2 has single force in pos0
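## horizontal reduction: movhlps folds the upper two SSE lanes onto the lower two,
## and shufps $1 folds lane 1 onto lane 0, leaving the summed i force components
## as scalars in xmm0-xmm2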
1055 ## increment i force
1056 movss (%rdi,%rcx,4),%xmm3
1057 movss 4(%rdi,%rcx,4),%xmm4
1058 movss 8(%rdi,%rcx,4),%xmm5
1059 subss %xmm0,%xmm3
1060 subss %xmm1,%xmm4
1061 subss %xmm2,%xmm5
1062 movss %xmm3,(%rdi,%rcx,4)
1063 movss %xmm4,4(%rdi,%rcx,4)
1064 movss %xmm5,8(%rdi,%rcx,4)
1066 ## increment fshift force
1067 movss (%rsi,%rdx,4),%xmm3
1068 movss 4(%rsi,%rdx,4),%xmm4
1069 movss 8(%rsi,%rdx,4),%xmm5
1070 subss %xmm0,%xmm3
1071 subss %xmm1,%xmm4
1072 subss %xmm2,%xmm5
1073 movss %xmm3,(%rsi,%rdx,4)
1074 movss %xmm4,4(%rsi,%rdx,4)
1075 movss %xmm5,8(%rsi,%rdx,4)
1077 ## get n from stack
1078 movl nb410_n(%rsp),%esi
1079 ## get group index for i particle
1080 movq nb410_gid(%rbp),%rdx ## base of gid[]
1081 movl (%rdx,%rsi,4),%edx ## ggid=gid[n]
1083 ## accumulate total potential energy and update it
1084 ## accumulate
1085 movhlps %xmm12,%xmm6
1086 addps %xmm6,%xmm12 ## pos 0-1 in xmm12 have the sum now
1087 movaps %xmm12,%xmm6
1088 shufps $1,%xmm6,%xmm6
1089 addss %xmm6,%xmm12
1091 ## add earlier value from mem
1092 movq nb410_Vc(%rbp),%rax
1093 addss (%rax,%rdx,4),%xmm12
1094 ## move back to mem
1095 movss %xmm12,(%rax,%rdx,4)
1097 ## accumulate total lj energy and update it
1098 movaps nb410_Vvdwtot(%rsp),%xmm7
1099 ## accumulate
1100 movhlps %xmm7,%xmm6
1101 addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now
1102 movaps %xmm7,%xmm6
1103 shufps $1,%xmm6,%xmm6
1104 addss %xmm6,%xmm7
1106 ## add earlier value from mem
1107 movq nb410_Vvdw(%rbp),%rax
1108 addss (%rax,%rdx,4),%xmm7
1109 ## move back to mem
1110 movss %xmm7,(%rax,%rdx,4)
1112 ## accumulate dVda and update it
1113 movaps nb410_dvdasum(%rsp),%xmm7
1114 ## accumulate
1115 movhlps %xmm7,%xmm6
1116 addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now
1117 movaps %xmm7,%xmm6
1118 shufps $1,%xmm6,%xmm6
1119 addss %xmm6,%xmm7
1121 movl nb410_ii(%rsp),%edx
1122 movq nb410_dvda(%rbp),%rax
1123 addss (%rax,%rdx,4),%xmm7
1124 movss %xmm7,(%rax,%rdx,4)
1126 ## finish if last
1127 movl nb410_nn1(%rsp),%ecx
1128 ## esi already loaded with n
1129 incl %esi
1130 subl %esi,%ecx
1131 jz _nb_kernel410_x86_64_sse.nb410_outerend
1133 ## not last, iterate outer loop once more!
1134 movl %esi,nb410_n(%rsp)
1135 jmp _nb_kernel410_x86_64_sse.nb410_outer
1136 _nb_kernel410_x86_64_sse.nb410_outerend:
1137 ## check if more outer neighborlists remain
1138 movl nb410_nri(%rsp),%ecx
1139 ## esi already loaded with n above
1140 subl %esi,%ecx
1141 jz _nb_kernel410_x86_64_sse.nb410_end
1142 ## non-zero, do one more workunit
1143 jmp _nb_kernel410_x86_64_sse.nb410_threadloop
1144 _nb_kernel410_x86_64_sse.nb410_end:
1146 movl nb410_nouter(%rsp),%eax
1147 movl nb410_ninner(%rsp),%ebx
1148 movq nb410_outeriter(%rbp),%rcx
1149 movq nb410_inneriter(%rbp),%rdx
1150 movl %eax,(%rcx)
1151 movl %ebx,(%rdx)
1153 addq $568,%rsp
1154 emms
1157 pop %r15
1158 pop %r14
1159 pop %r13
1160 pop %r12
1162 pop %rbx
1163 pop %rbp
1168 .globl nb_kernel410nf_x86_64_sse
1169 .globl _nb_kernel410nf_x86_64_sse
1170 nb_kernel410nf_x86_64_sse:
1171 _nb_kernel410nf_x86_64_sse:
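## nb410nf is the energy-only ('no force') variant of kernel 410: it accumulates
## the Coulomb and LJ energies but does not update forces, fshift or dvda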
1172 ## Room for return address and rbp (16 bytes)
1173 .set nb410nf_fshift, 16
1174 .set nb410nf_gid, 24
1175 .set nb410nf_pos, 32
1176 .set nb410nf_faction, 40
1177 .set nb410nf_charge, 48
1178 .set nb410nf_p_facel, 56
1179 .set nb410nf_argkrf, 64
1180 .set nb410nf_argcrf, 72
1181 .set nb410nf_Vc, 80
1182 .set nb410nf_type, 88
1183 .set nb410nf_p_ntype, 96
1184 .set nb410nf_vdwparam, 104
1185 .set nb410nf_Vvdw, 112
1186 .set nb410nf_p_tabscale, 120
1187 .set nb410nf_VFtab, 128
1188 .set nb410nf_invsqrta, 136
1189 .set nb410nf_dvda, 144
1190 .set nb410nf_p_gbtabscale, 152
1191 .set nb410nf_GBtab, 160
1192 .set nb410nf_p_nthreads, 168
1193 .set nb410nf_count, 176
1194 .set nb410nf_mtx, 184
1195 .set nb410nf_outeriter, 192
1196 .set nb410nf_inneriter, 200
1197 .set nb410nf_work, 208
1198 ## stack offsets for local variables
1199 ## bottom of stack is 16-byte aligned for SSE use
1200 .set nb410nf_ix, 0
1201 .set nb410nf_iy, 16
1202 .set nb410nf_iz, 32
1203 .set nb410nf_iq, 48
1204 .set nb410nf_gbtsc, 64
1205 .set nb410nf_qq, 80
1206 .set nb410nf_c6, 96
1207 .set nb410nf_c12, 112
1208 .set nb410nf_vctot, 128
1209 .set nb410nf_Vvdwtot, 144
1210 .set nb410nf_half, 160
1211 .set nb410nf_three, 176
1212 .set nb410nf_isai, 192
1213 .set nb410nf_isaprod, 208
1214 .set nb410nf_gbscale, 224
1215 .set nb410nf_nri, 240
1216 .set nb410nf_iinr, 248
1217 .set nb410nf_jindex, 256
1218 .set nb410nf_jjnr, 264
1219 .set nb410nf_shift, 272
1220 .set nb410nf_shiftvec, 280
1221 .set nb410nf_facel, 288
1222 .set nb410nf_innerjjnr, 296
1223 .set nb410nf_is3, 304
1224 .set nb410nf_ii3, 308
1225 .set nb410nf_ntia, 312
1226 .set nb410nf_innerk, 316
1227 .set nb410nf_n, 320
1228 .set nb410nf_nn1, 324
1229 .set nb410nf_ntype, 328
1230 .set nb410nf_nouter, 332
1231 .set nb410nf_ninner, 336
1233 push %rbp
1234 movq %rsp,%rbp
1235 push %rbx
1238 emms
1240 push %r12
1241 push %r13
1242 push %r14
1243 push %r15
1245 subq $360,%rsp ## local variable stack space (n*16+8)
1247 ## zero 32-bit iteration counters
1248 movl $0,%eax
1249 movl %eax,nb410nf_nouter(%rsp)
1250 movl %eax,nb410nf_ninner(%rsp)
1252 movl (%rdi),%edi
1253 movl %edi,nb410nf_nri(%rsp)
1254 movq %rsi,nb410nf_iinr(%rsp)
1255 movq %rdx,nb410nf_jindex(%rsp)
1256 movq %rcx,nb410nf_jjnr(%rsp)
1257 movq %r8,nb410nf_shift(%rsp)
1258 movq %r9,nb410nf_shiftvec(%rsp)
1259 movq nb410nf_p_ntype(%rbp),%rdi
1260 movl (%rdi),%edi
1261 movl %edi,nb410nf_ntype(%rsp)
1262 movq nb410nf_p_facel(%rbp),%rsi
1263 movss (%rsi),%xmm0
1264 movss %xmm0,nb410nf_facel(%rsp)
1266 movq nb410nf_p_gbtabscale(%rbp),%rbx
1267 movss (%rbx),%xmm4
1268 shufps $0,%xmm4,%xmm4
1269 movaps %xmm4,nb410nf_gbtsc(%rsp)
1272 ## create constant floating-point factors on stack
1273 movl $0x3f000000,%eax ## 0.5 in IEEE-754 single precision (hex)
1274 movl %eax,nb410nf_half(%rsp)
1275 movss nb410nf_half(%rsp),%xmm1
1276 shufps $0,%xmm1,%xmm1 ## splat to all elements
1277 movaps %xmm1,%xmm2
1278 addps %xmm2,%xmm2 ## one
1279 movaps %xmm2,%xmm3
1280 addps %xmm2,%xmm2 ## two
1281 addps %xmm2,%xmm3 ## three
1282 movaps %xmm1,nb410nf_half(%rsp)
1283 movaps %xmm3,nb410nf_three(%rsp)
1285 _nb_kernel410nf_x86_64_sse.nb410nf_threadloop:
1286 movq nb410nf_count(%rbp),%rsi ## pointer to sync counter
1287 movl (%rsi),%eax
1288 _nb_kernel410nf_x86_64_sse.nb410nf_spinlock:
1289 movl %eax,%ebx ## ebx=*count=nn0
1290 addl $1,%ebx ## ebx=nn1=nn0+1
1291 lock
1292 cmpxchgl %ebx,(%rsi) ## write nn1 to *counter,
1293 ## if it hasn't changed.
1294 ## or reread *counter to eax.
1295 pause ## -> better p4 performance
1296 jnz _nb_kernel410nf_x86_64_sse.nb410nf_spinlock
1298 ## if(nn1>nri) nn1=nri
1299 movl nb410nf_nri(%rsp),%ecx
1300 movl %ecx,%edx
1301 subl %ebx,%ecx
1302 cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri
1303 ## Cleared the spinlock if we got here.
1304 ## eax contains nn0, ebx contains nn1.
1305 movl %eax,nb410nf_n(%rsp)
1306 movl %ebx,nb410nf_nn1(%rsp)
1307 subl %eax,%ebx ## calc number of outer lists
1308 movl %eax,%esi ## copy n to esi
1309 jg _nb_kernel410nf_x86_64_sse.nb410nf_outerstart
1310 jmp _nb_kernel410nf_x86_64_sse.nb410nf_end
1312 _nb_kernel410nf_x86_64_sse.nb410nf_outerstart:
1313 ## ebx contains number of outer iterations
1314 addl nb410nf_nouter(%rsp),%ebx
1315 movl %ebx,nb410nf_nouter(%rsp)
1317 _nb_kernel410nf_x86_64_sse.nb410nf_outer:
1318 movq nb410nf_shift(%rsp),%rax ## rax = pointer into shift[]
1319 movl (%rax,%rsi,4),%ebx ## ebx=shift[n]
1321 lea (%rbx,%rbx,2),%rbx ## rbx=3*is
1322 movl %ebx,nb410nf_is3(%rsp) ## store is3
1324 movq nb410nf_shiftvec(%rsp),%rax ## rax = base of shiftvec[]
1326 movss (%rax,%rbx,4),%xmm0
1327 movss 4(%rax,%rbx,4),%xmm1
1328 movss 8(%rax,%rbx,4),%xmm2
1330 movq nb410nf_iinr(%rsp),%rcx ## rcx = pointer into iinr[]
1331 movl (%rcx,%rsi,4),%ebx ## ebx =ii
1333 movq nb410nf_charge(%rbp),%rdx
1334 movss (%rdx,%rbx,4),%xmm3
1335 mulss nb410nf_facel(%rsp),%xmm3
1336 shufps $0,%xmm3,%xmm3
1338 movq nb410nf_invsqrta(%rbp),%rdx ## load invsqrta[ii]
1339 movss (%rdx,%rbx,4),%xmm4
1340 shufps $0,%xmm4,%xmm4
1342 movq nb410nf_type(%rbp),%rdx
1343 movl (%rdx,%rbx,4),%edx
1344 imull nb410nf_ntype(%rsp),%edx
1345 shll %edx
1346 movl %edx,nb410nf_ntia(%rsp)
1348 lea (%rbx,%rbx,2),%rbx ## rbx = 3*ii=ii3
1349 movq nb410nf_pos(%rbp),%rax ## rax = base of pos[]
1351 addss (%rax,%rbx,4),%xmm0
1352 addss 4(%rax,%rbx,4),%xmm1
1353 addss 8(%rax,%rbx,4),%xmm2
1355 movaps %xmm3,nb410nf_iq(%rsp)
1356 movaps %xmm4,nb410nf_isai(%rsp)
1358 shufps $0,%xmm0,%xmm0
1359 shufps $0,%xmm1,%xmm1
1360 shufps $0,%xmm2,%xmm2
1362 movaps %xmm0,nb410nf_ix(%rsp)
1363 movaps %xmm1,nb410nf_iy(%rsp)
1364 movaps %xmm2,nb410nf_iz(%rsp)
1366 movl %ebx,nb410nf_ii3(%rsp)
1368 ## clear vctot
1369 xorps %xmm4,%xmm4
1370 movaps %xmm4,nb410nf_vctot(%rsp)
1371 movaps %xmm4,nb410nf_Vvdwtot(%rsp)
1373 movq nb410nf_jindex(%rsp),%rax
1374 movl (%rax,%rsi,4),%ecx ## jindex[n]
1375 movl 4(%rax,%rsi,4),%edx ## jindex[n+1]
1376 subl %ecx,%edx ## number of innerloop atoms
1378 movq nb410nf_pos(%rbp),%rsi
1379 movq nb410nf_faction(%rbp),%rdi
1380 movq nb410nf_jjnr(%rsp),%rax
1381 shll $2,%ecx
1382 addq %rcx,%rax
1383 movq %rax,nb410nf_innerjjnr(%rsp) ## pointer to jjnr[nj0]
1384 movl %edx,%ecx
1385 subl $4,%edx
1386 addl nb410nf_ninner(%rsp),%ecx
1387 movl %ecx,nb410nf_ninner(%rsp)
1388 addl $0,%edx
1389 movl %edx,nb410nf_innerk(%rsp) ## number of innerloop atoms
1390 jge _nb_kernel410nf_x86_64_sse.nb410nf_unroll_loop
1391 jmp _nb_kernel410nf_x86_64_sse.nb410nf_finish_inner
1392 _nb_kernel410nf_x86_64_sse.nb410nf_unroll_loop:
1393 ## quad-unroll innerloop here
1394 movq nb410nf_innerjjnr(%rsp),%rdx ## pointer to jjnr[k]
1395 movl (%rdx),%eax
1396 movl 4(%rdx),%ebx
1397 movl 8(%rdx),%ecx
1398 movl 12(%rdx),%edx ## eax-edx=jnr1-4
1399 addq $16,nb410nf_innerjjnr(%rsp) ## advance pointer (unrolled 4)
1401 ## load isa2
1402 movq nb410nf_invsqrta(%rbp),%rsi
1403 movss (%rsi,%rax,4),%xmm3
1404 movss (%rsi,%rcx,4),%xmm4
1405 movss (%rsi,%rbx,4),%xmm6
1406 movss (%rsi,%rdx,4),%xmm7
1407 movaps nb410nf_isai(%rsp),%xmm2
1408 shufps $0,%xmm6,%xmm3
1409 shufps $0,%xmm7,%xmm4
1410 shufps $136,%xmm4,%xmm3 ## 10001000 ;# all isa2 in xmm3
1411 mulps %xmm3,%xmm2
1413 movaps %xmm2,nb410nf_isaprod(%rsp)
1414 movaps %xmm2,%xmm1
1415 mulps nb410nf_gbtsc(%rsp),%xmm1
1416 movaps %xmm1,nb410nf_gbscale(%rsp)
1418 movq nb410nf_charge(%rbp),%rsi ## base of charge[]
1420 movss (%rsi,%rax,4),%xmm3
1421 movss (%rsi,%rcx,4),%xmm4
1422 movss (%rsi,%rbx,4),%xmm6
1423 movss (%rsi,%rdx,4),%xmm7
1425 mulps nb410nf_iq(%rsp),%xmm2
1426 shufps $0,%xmm6,%xmm3
1427 shufps $0,%xmm7,%xmm4
1428 shufps $136,%xmm4,%xmm3 ## 10001000 ;# all charges in xmm3
1429 mulps %xmm2,%xmm3
1430 movaps %xmm3,nb410nf_qq(%rsp)
1432 movd %eax,%mm0
1433 movd %ebx,%mm1
1434 movd %ecx,%mm2
1435 movd %edx,%mm3
1437 movq nb410nf_type(%rbp),%rsi
1438 movl (%rsi,%rax,4),%eax
1439 movl (%rsi,%rbx,4),%ebx
1440 movl (%rsi,%rcx,4),%ecx
1441 movl (%rsi,%rdx,4),%edx
1442 movq nb410nf_vdwparam(%rbp),%rsi
1443 shll %eax
1444 shll %ebx
1445 shll %ecx
1446 shll %edx
1447 movl nb410nf_ntia(%rsp),%edi
1448 addl %edi,%eax
1449 addl %edi,%ebx
1450 addl %edi,%ecx
1451 addl %edi,%edx
1453 movlps (%rsi,%rax,4),%xmm6
1454 movlps (%rsi,%rcx,4),%xmm7
1455 movhps (%rsi,%rbx,4),%xmm6
1456 movhps (%rsi,%rdx,4),%xmm7
1458 movaps %xmm6,%xmm4
1459 shufps $136,%xmm7,%xmm4 ## 10001000
1460 shufps $221,%xmm7,%xmm6 ## 11011101
1462 movd %mm0,%eax
1463 movd %mm1,%ebx
1464 movd %mm2,%ecx
1465 movd %mm3,%edx
1467 movaps %xmm4,nb410nf_c6(%rsp)
1468 movaps %xmm6,nb410nf_c12(%rsp)
1470 movq nb410nf_pos(%rbp),%rsi ## base of pos[]
1472 lea (%rax,%rax,2),%rax ## replace jnr with j3
1473 lea (%rbx,%rbx,2),%rbx
1475 lea (%rcx,%rcx,2),%rcx ## replace jnr with j3
1476 lea (%rdx,%rdx,2),%rdx
1478 ## move four coordinates to xmm0-xmm2
1480 movlps (%rsi,%rax,4),%xmm4
1481 movlps (%rsi,%rcx,4),%xmm5
1482 movss 8(%rsi,%rax,4),%xmm2
1483 movss 8(%rsi,%rcx,4),%xmm6
1485 movhps (%rsi,%rbx,4),%xmm4
1486 movhps (%rsi,%rdx,4),%xmm5
1488 movss 8(%rsi,%rbx,4),%xmm0
1489 movss 8(%rsi,%rdx,4),%xmm1
1491 shufps $0,%xmm0,%xmm2
1492 shufps $0,%xmm1,%xmm6
1494 movaps %xmm4,%xmm0
1495 movaps %xmm4,%xmm1
1497 shufps $136,%xmm6,%xmm2 ## 10001000
1499 shufps $136,%xmm5,%xmm0 ## 10001000
1500 shufps $221,%xmm5,%xmm1 ## 11011101
1502 ## move ix-iz to xmm4-xmm6
1503 movaps nb410nf_ix(%rsp),%xmm4
1504 movaps nb410nf_iy(%rsp),%xmm5
1505 movaps nb410nf_iz(%rsp),%xmm6
1507 ## calc dr
1508 subps %xmm0,%xmm4
1509 subps %xmm1,%xmm5
1510 subps %xmm2,%xmm6
1512 ## square it
1513 mulps %xmm4,%xmm4
1514 mulps %xmm5,%xmm5
1515 mulps %xmm6,%xmm6
1516 addps %xmm5,%xmm4
1517 addps %xmm6,%xmm4
1518 ## rsq in xmm4
1520 rsqrtps %xmm4,%xmm5
1521 ## lookup seed in xmm5
1522 movaps %xmm5,%xmm2
1523 mulps %xmm5,%xmm5
1524 movaps nb410nf_three(%rsp),%xmm1
1525 mulps %xmm4,%xmm5 ## rsq*lu*lu
1526 movaps nb410nf_half(%rsp),%xmm0
1527 subps %xmm5,%xmm1 ## 3.0-rsq*lu*lu
1528 mulps %xmm2,%xmm1
1529 mulps %xmm1,%xmm0 ## xmm0=rinv
1530 mulps %xmm0,%xmm4 ## xmm4=r
1531 mulps nb410nf_gbscale(%rsp),%xmm4
1533 movhlps %xmm4,%xmm5
1534 cvttps2pi %xmm4,%mm6
1535 cvttps2pi %xmm5,%mm7 ## mm6/mm7 contain lu indices
1536 cvtpi2ps %mm6,%xmm6
1537 cvtpi2ps %mm7,%xmm5
1538 movlhps %xmm5,%xmm6
1539 subps %xmm6,%xmm4
1540 movaps %xmm4,%xmm1 ## xmm1=eps
1541 movaps %xmm1,%xmm2
1542 mulps %xmm2,%xmm2 ## xmm2=eps2
1543 pslld $2,%mm6
1544 pslld $2,%mm7
1546 movd %eax,%mm0
1547 movd %ebx,%mm1
1548 movd %ecx,%mm2
1549 movd %edx,%mm3
1551 movq nb410nf_GBtab(%rbp),%rsi
1552 movd %mm6,%eax
1553 psrlq $32,%mm6
1554 movd %mm7,%ecx
1555 psrlq $32,%mm7
1556 movd %mm6,%ebx
1557 movd %mm7,%edx
1559 ## load coulomb table
1560 movaps (%rsi,%rax,4),%xmm4
1561 movaps (%rsi,%rbx,4),%xmm5
1562 movaps (%rsi,%rcx,4),%xmm6
1563 movaps (%rsi,%rdx,4),%xmm7
1564 ## transpose, using xmm3 for scratch
1565 movaps %xmm6,%xmm3
1566 shufps $0xEE,%xmm7,%xmm3
1567 shufps $0x44,%xmm7,%xmm6
1568 movaps %xmm4,%xmm7
1569 shufps $0xEE,%xmm5,%xmm7
1570 shufps $0x44,%xmm5,%xmm4
1571 movaps %xmm4,%xmm5
1572 shufps $0xDD,%xmm6,%xmm5
1573 shufps $0x88,%xmm6,%xmm4
1574 movaps %xmm7,%xmm6
1575 shufps $0x88,%xmm3,%xmm6
1576 shufps $0xDD,%xmm3,%xmm7
1577 ## coulomb table ready, in xmm4-xmm7
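## the four 16-byte table entries were transposed with shufps so that
## xmm4 = Y1..Y4, xmm5 = F1..F4, xmm6 = G1..G4, xmm7 = H1..H4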
1578 mulps %xmm1,%xmm6 ## xmm6=Geps
1579 mulps %xmm2,%xmm7 ## xmm7=Heps2
1581 addps %xmm6,%xmm5
1582 addps %xmm7,%xmm5 ## xmm5=Fp
1583 movaps nb410nf_qq(%rsp),%xmm3
1584 mulps %xmm1,%xmm5 ## xmm5=eps*Fp
1585 addps %xmm4,%xmm5 ## xmm5=VV
1586 mulps %xmm3,%xmm5 ## vcoul=qq*VV
1587 ## update vctot
1588 addps nb410nf_vctot(%rsp),%xmm5
1589 movaps %xmm5,nb410nf_vctot(%rsp)
1591 ## L-J
1592 movaps %xmm0,%xmm4
1593 mulps %xmm0,%xmm4 ## xmm4=rinvsq
1595 movaps %xmm4,%xmm6
1596 mulps %xmm4,%xmm6
1598 mulps %xmm4,%xmm6 ## xmm6=rinvsix
1599 movaps %xmm6,%xmm4
1600 mulps %xmm4,%xmm4 ## xmm4=rinvtwelve
1601 mulps nb410nf_c6(%rsp),%xmm6
1602 mulps nb410nf_c12(%rsp),%xmm4
1603 movaps nb410nf_Vvdwtot(%rsp),%xmm7
1604 addps %xmm4,%xmm7
1605 subps %xmm6,%xmm7
1606 movaps %xmm7,nb410nf_Vvdwtot(%rsp)
1608 ## should we do one more iteration?
1609 subl $4,nb410nf_innerk(%rsp)
1610 jl _nb_kernel410nf_x86_64_sse.nb410nf_finish_inner
1611 jmp _nb_kernel410nf_x86_64_sse.nb410nf_unroll_loop
1612 _nb_kernel410nf_x86_64_sse.nb410nf_finish_inner:
1613 ## check if at least two particles remain
1614 addl $4,nb410nf_innerk(%rsp)
1615 movl nb410nf_innerk(%rsp),%edx
1616 andl $2,%edx
1617 jnz _nb_kernel410nf_x86_64_sse.nb410nf_dopair
1618 jmp _nb_kernel410nf_x86_64_sse.nb410nf_checksingle
1619 _nb_kernel410nf_x86_64_sse.nb410nf_dopair:
1620 movq nb410nf_innerjjnr(%rsp),%rcx
1621 movl (%rcx),%eax
1622 movl 4(%rcx),%ebx
1623 addq $8,nb410nf_innerjjnr(%rsp)
1625 xorps %xmm2,%xmm2
1626 movaps %xmm2,%xmm6
1628 ## load isa2
1629 movq nb410nf_invsqrta(%rbp),%rsi
1630 movss (%rsi,%rax,4),%xmm2
1631 movss (%rsi,%rbx,4),%xmm3
1632 unpcklps %xmm3,%xmm2 ## isa2 in xmm2(0,1)
1633 mulps nb410nf_isai(%rsp),%xmm2
1634 movaps %xmm2,nb410nf_isaprod(%rsp)
1635 movaps %xmm2,%xmm1
1636 mulps nb410nf_gbtsc(%rsp),%xmm1
1637 movaps %xmm1,nb410nf_gbscale(%rsp)
1639 movq nb410nf_charge(%rbp),%rsi ## base of charge[]
1640 movss (%rsi,%rax,4),%xmm3
1641 movss (%rsi,%rbx,4),%xmm6
1642 unpcklps %xmm6,%xmm3 ## 00001000 ;# xmm3(0,1) has the charges
1644 mulps nb410nf_iq(%rsp),%xmm2
1645 mulps %xmm2,%xmm3
1646 movaps %xmm3,nb410nf_qq(%rsp)
1648 movq nb410nf_type(%rbp),%rsi
1649 movl %eax,%ecx
1650 movl %ebx,%edx
1651 movl (%rsi,%rcx,4),%ecx
1652 movl (%rsi,%rdx,4),%edx
1653 movq nb410nf_vdwparam(%rbp),%rsi
1654 shll %ecx
1655 shll %edx
1656 movl nb410nf_ntia(%rsp),%edi
1657 addl %edi,%ecx
1658 addl %edi,%edx
1659 movlps (%rsi,%rcx,4),%xmm6
1660 movhps (%rsi,%rdx,4),%xmm6
1661 movq nb410nf_pos(%rbp),%rdi
1663 movaps %xmm6,%xmm4
1664 shufps $8,%xmm4,%xmm4 ## 00001000
1665 shufps $13,%xmm6,%xmm6 ## 00001101
1666 movlhps %xmm7,%xmm4
1667 movlhps %xmm7,%xmm6
1669 movaps %xmm4,nb410nf_c6(%rsp)
1670 movaps %xmm6,nb410nf_c12(%rsp)
1672 lea (%rax,%rax,2),%rax
1673 lea (%rbx,%rbx,2),%rbx
1674 ## move coordinates to xmm0-xmm2
1675 movlps (%rdi,%rax,4),%xmm1
1676 movss 8(%rdi,%rax,4),%xmm2
1677 movhps (%rdi,%rbx,4),%xmm1
1678 movss 8(%rdi,%rbx,4),%xmm0
1680 movlhps %xmm7,%xmm3
1682 shufps $0,%xmm0,%xmm2
1684 movaps %xmm1,%xmm0
1686 shufps $136,%xmm2,%xmm2 ## 10001000
1688 shufps $136,%xmm0,%xmm0 ## 10001000
1689 shufps $221,%xmm1,%xmm1 ## 11011101
1691 movq nb410nf_faction(%rbp),%rdi
1692 ## move ix-iz to xmm4-xmm6
1693 xorps %xmm7,%xmm7
1695 movaps nb410nf_ix(%rsp),%xmm4
1696 movaps nb410nf_iy(%rsp),%xmm5
1697 movaps nb410nf_iz(%rsp),%xmm6
1699 ## calc dr
1700 subps %xmm0,%xmm4
1701 subps %xmm1,%xmm5
1702 subps %xmm2,%xmm6
1704 ## square it
1705 mulps %xmm4,%xmm4
1706 mulps %xmm5,%xmm5
1707 mulps %xmm6,%xmm6
1708 addps %xmm5,%xmm4
1709 addps %xmm6,%xmm4
1710 ## rsq in xmm4
1712 rsqrtps %xmm4,%xmm5
1713 ## lookup seed in xmm5
1714 movaps %xmm5,%xmm2
1715 mulps %xmm5,%xmm5
1716 movaps nb410nf_three(%rsp),%xmm1
1717 mulps %xmm4,%xmm5 ## rsq*lu*lu
1718 movaps nb410nf_half(%rsp),%xmm0
1719 subps %xmm5,%xmm1 ## 3.0-rsq*lu*lu
1720 mulps %xmm2,%xmm1
1721 mulps %xmm1,%xmm0 ## xmm0=rinv
1722 mulps %xmm0,%xmm4 ## xmm4=r
1723 mulps nb410nf_gbscale(%rsp),%xmm4
1725 cvttps2pi %xmm4,%mm6 ## mm6 contain lu indices
1726 cvtpi2ps %mm6,%xmm6
1727 subps %xmm6,%xmm4
1728 movaps %xmm4,%xmm1 ## xmm1=eps
1729 movaps %xmm1,%xmm2
1730 mulps %xmm2,%xmm2 ## xmm2=eps2
1732 pslld $2,%mm6
1734 movq nb410nf_GBtab(%rbp),%rsi
1735 movd %mm6,%ecx
1736 psrlq $32,%mm6
1737 movd %mm6,%edx
1739 ## load coulomb table
1740 movaps (%rsi,%rcx,4),%xmm4
1741 movaps (%rsi,%rdx,4),%xmm7
1742 ## transpose, using xmm3 for scratch
1743 movaps %xmm4,%xmm6
1744 unpcklps %xmm7,%xmm4 ## Y1 Y2 F1 F2
1745 unpckhps %xmm7,%xmm6 ## G1 G2 H1 H2
1746 movhlps %xmm4,%xmm5 ## F1 F2
1747 movhlps %xmm6,%xmm7 ## H1 H2
1748 ## coulomb table ready, in xmm4-xmm7
1750 mulps %xmm1,%xmm6 ## xmm6=Geps
1751 mulps %xmm2,%xmm7 ## xmm7=Heps2
1752 addps %xmm6,%xmm5
1753 addps %xmm7,%xmm5 ## xmm5=Fp
1754 movaps nb410nf_qq(%rsp),%xmm3
1755 mulps %xmm1,%xmm5 ## xmm5=eps*Fp
1756 addps %xmm4,%xmm5 ## xmm5=VV
1757 mulps %xmm3,%xmm5 ## vcoul=qq*VV
1759 addps nb410nf_vctot(%rsp),%xmm5
1760 movaps %xmm5,nb410nf_vctot(%rsp)
1762 ## L-J
1763 movaps %xmm0,%xmm4
1764 mulps %xmm0,%xmm4 ## xmm4=rinvsq
1766 ## vcoul has already been added to vctot above; only the LJ energy remains
1770 movaps %xmm4,%xmm6
1771 mulps %xmm4,%xmm6
1773 mulps %xmm4,%xmm6 ## xmm6=rinvsix
1774 movaps %xmm6,%xmm4
1775 mulps %xmm4,%xmm4 ## xmm4=rinvtwelve
1776 mulps nb410nf_c6(%rsp),%xmm6
1777 mulps nb410nf_c12(%rsp),%xmm4
1778 movaps nb410nf_Vvdwtot(%rsp),%xmm7
1779 addps %xmm4,%xmm7
1780 subps %xmm6,%xmm7
1781 movaps %xmm7,nb410nf_Vvdwtot(%rsp)
1783 _nb_kernel410nf_x86_64_sse.nb410nf_checksingle:
1784 movl nb410nf_innerk(%rsp),%edx
1785 andl $1,%edx
1786 jnz _nb_kernel410nf_x86_64_sse.nb410nf_dosingle
1787 jmp _nb_kernel410nf_x86_64_sse.nb410nf_updateouterdata
1788 _nb_kernel410nf_x86_64_sse.nb410nf_dosingle:
1789 movq nb410nf_charge(%rbp),%rsi
1790 movq nb410nf_invsqrta(%rbp),%rdx
1791 movq nb410nf_pos(%rbp),%rdi
1792 movq nb410nf_innerjjnr(%rsp),%rcx
1793 movl (%rcx),%eax
1794 xorps %xmm2,%xmm2
1795 movaps %xmm2,%xmm6
1796 movss (%rdx,%rax,4),%xmm2 ## isa2
1797 mulss nb410nf_isai(%rsp),%xmm2
1798 movss %xmm2,nb410nf_isaprod(%rsp)
1799 movss %xmm2,%xmm1
1800 mulss nb410nf_gbtsc(%rsp),%xmm1
1801 movss %xmm1,nb410nf_gbscale(%rsp)
1803 mulss nb410nf_iq(%rsp),%xmm2
1804 movss (%rsi,%rax,4),%xmm6 ## xmm6(0) has the charge
1805 mulss %xmm2,%xmm6
1806 movss %xmm6,nb410nf_qq(%rsp)
1808 movq nb410nf_type(%rbp),%rsi
1809 movl %eax,%ecx
1810 movl (%rsi,%rcx,4),%ecx
1811 movq nb410nf_vdwparam(%rbp),%rsi
1812 shll %ecx
1813 addl nb410nf_ntia(%rsp),%ecx
1814 movlps (%rsi,%rcx,4),%xmm6
1815 movaps %xmm6,%xmm4
1816 shufps $252,%xmm4,%xmm4 ## 11111100
1817 shufps $253,%xmm6,%xmm6 ## 11111101
1819 movaps %xmm4,nb410nf_c6(%rsp)
1820 movaps %xmm6,nb410nf_c12(%rsp)
1822 lea (%rax,%rax,2),%rax
1824 ## move coordinates to xmm0-xmm2
1825 movss (%rdi,%rax,4),%xmm0
1826 movss 4(%rdi,%rax,4),%xmm1
1827 movss 8(%rdi,%rax,4),%xmm2
1829 movaps nb410nf_ix(%rsp),%xmm4
1830 movaps nb410nf_iy(%rsp),%xmm5
1831 movaps nb410nf_iz(%rsp),%xmm6
1833 ## calc dr
1834 subss %xmm0,%xmm4
1835 subss %xmm1,%xmm5
1836 subss %xmm2,%xmm6
1838 ## square it
1839 mulss %xmm4,%xmm4
1840 mulss %xmm5,%xmm5
1841 mulss %xmm6,%xmm6
1842 addss %xmm5,%xmm4
1843 addss %xmm6,%xmm4
1844 ## rsq in xmm4
1846 rsqrtss %xmm4,%xmm5
1847 ## lookup seed in xmm5
1848 movaps %xmm5,%xmm2
1849 mulss %xmm5,%xmm5
1850 movss nb410nf_three(%rsp),%xmm1
1851 mulss %xmm4,%xmm5 ## rsq*lu*lu
1852 movss nb410nf_half(%rsp),%xmm0
1853 subss %xmm5,%xmm1 ## 3.0-rsq*lu*lu
1854 mulss %xmm2,%xmm1
1855 mulss %xmm1,%xmm0 ## xmm0=rinv
1857 mulss %xmm0,%xmm4 ## xmm4=r
1858 mulss nb410nf_gbscale(%rsp),%xmm4
1860 cvttss2si %xmm4,%ebx ## ebx contains lu index
1861 cvtsi2ss %ebx,%xmm6
1862 subss %xmm6,%xmm4
1863 movaps %xmm4,%xmm1 ## xmm1=eps
1864 movaps %xmm1,%xmm2
1865 mulss %xmm2,%xmm2 ## xmm2=eps2
1867 shll $2,%ebx
1868 movq nb410nf_GBtab(%rbp),%rsi
1870 movaps (%rsi,%rbx,4),%xmm4
1871 movhlps %xmm4,%xmm6
1872 movaps %xmm4,%xmm5
1873 movaps %xmm6,%xmm7
1874 shufps $1,%xmm5,%xmm5
1875 shufps $1,%xmm7,%xmm7
1876 ## table ready in xmm4-xmm7
1878 mulss %xmm1,%xmm6 ## xmm6=Geps
1879 mulss %xmm2,%xmm7 ## xmm7=Heps2
1880 addss %xmm6,%xmm5
1881 addss %xmm7,%xmm5 ## xmm5=Fp
1882 movss nb410nf_qq(%rsp),%xmm3
1883 mulss %xmm1,%xmm5 ## xmm5=eps*Fp
1884 addss %xmm4,%xmm5 ## xmm5=VV
1885 mulss %xmm3,%xmm5 ## vcoul=qq*VV
1886 addss nb410nf_vctot(%rsp),%xmm5
1887 movss %xmm5,nb410nf_vctot(%rsp)
1889 ## L-J
1890 movaps %xmm0,%xmm4
1891 mulss %xmm0,%xmm4 ## xmm4=rinvsq
1893 movaps %xmm4,%xmm6
1894 mulss %xmm4,%xmm6
1896 mulss %xmm4,%xmm6 ## xmm6=rinvsix
1897 movaps %xmm6,%xmm4
1898 mulss %xmm4,%xmm4 ## xmm4=rinvtwelve
1899 mulss nb410nf_c6(%rsp),%xmm6
1900 mulss nb410nf_c12(%rsp),%xmm4
1901 movss nb410nf_Vvdwtot(%rsp),%xmm7
1902 addps %xmm4,%xmm7
1903 subps %xmm6,%xmm7
1904 movss %xmm7,nb410nf_Vvdwtot(%rsp)
1906 _nb_kernel410nf_x86_64_sse.nb410nf_updateouterdata:
1907 ## get n from stack
1908 movl nb410nf_n(%rsp),%esi
1909 ## get group index for i particle
1910 movq nb410nf_gid(%rbp),%rdx ## base of gid[]
1911 movl (%rdx,%rsi,4),%edx ## ggid=gid[n]
1913 ## accumulate total potential energy and update it
1914 movaps nb410nf_vctot(%rsp),%xmm7
1915 ## accumulate
1916 movhlps %xmm7,%xmm6
1917 addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now
1918 movaps %xmm7,%xmm6
1919 shufps $1,%xmm6,%xmm6
1920 addss %xmm6,%xmm7
1922 ## add earlier value from mem
1923 movq nb410nf_Vc(%rbp),%rax
1924 addss (%rax,%rdx,4),%xmm7
1925 ## move back to mem
1926 movss %xmm7,(%rax,%rdx,4)
1928 ## accumulate total lj energy and update it
1929 movaps nb410nf_Vvdwtot(%rsp),%xmm7
1930 ## accumulate
1931 movhlps %xmm7,%xmm6
1932 addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now
1933 movaps %xmm7,%xmm6
1934 shufps $1,%xmm6,%xmm6
1935 addss %xmm6,%xmm7
1937 ## add earlier value from mem
1938 movq nb410nf_Vvdw(%rbp),%rax
1939 addss (%rax,%rdx,4),%xmm7
1940 ## move back to mem
1941 movss %xmm7,(%rax,%rdx,4)
1943 ## finish if last
1944 movl nb410nf_nn1(%rsp),%ecx
1945 ## esi already loaded with n
1946 incl %esi
1947 subl %esi,%ecx
1948 jz _nb_kernel410nf_x86_64_sse.nb410nf_outerend
1950 ## not last, iterate outer loop once more!
1951 movl %esi,nb410nf_n(%rsp)
1952 jmp _nb_kernel410nf_x86_64_sse.nb410nf_outer
1953 _nb_kernel410nf_x86_64_sse.nb410nf_outerend:
1954 ## check if more outer neighborlists remain
1955 movl nb410nf_nri(%rsp),%ecx
1956 ## esi already loaded with n above
1957 subl %esi,%ecx
1958 jz _nb_kernel410nf_x86_64_sse.nb410nf_end
1959 ## non-zero, do one more workunit
1960 jmp _nb_kernel410nf_x86_64_sse.nb410nf_threadloop
1961 _nb_kernel410nf_x86_64_sse.nb410nf_end:
1963 movl nb410nf_nouter(%rsp),%eax
1964 movl nb410nf_ninner(%rsp),%ebx
1965 movq nb410nf_outeriter(%rbp),%rcx
1966 movq nb410nf_inneriter(%rbp),%rdx
1967 movl %eax,(%rcx)
1968 movl %ebx,(%rdx)
1970 addq $360,%rsp
1971 emms
1974 pop %r15
1975 pop %r14
1976 pop %r13
1977 pop %r12
1979 pop %rbx
1980 pop %rbp