##
##
## Gromacs 4.0                    Copyright (c) 1991-2003
## David van der Spoel, Erik Lindahl
##
## This program is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License
## as published by the Free Software Foundation; either version 2
## of the License, or (at your option) any later version.
##
## To help us fund GROMACS development, we humbly ask that you cite
## the research papers on the package. Check out http://www.gromacs.org
##
## And Hey:
## Gnomes, ROck Monsters And Chili Sauce

## nb010 - forces are calculated
.globl nb_kernel010_x86_64_sse
.globl _nb_kernel010_x86_64_sse
nb_kernel010_x86_64_sse:
_nb_kernel010_x86_64_sse:
## Room for return address and rbp (16 bytes)
.set nb010_fshift, 16
.set nb010_gid, 24
.set nb010_pos, 32
.set nb010_faction, 40
.set nb010_charge, 48
.set nb010_p_facel, 56
.set nb010_argkrf, 64
.set nb010_argcrf, 72
.set nb010_Vc, 80
.set nb010_type, 88
.set nb010_p_ntype, 96
.set nb010_vdwparam, 104
.set nb010_Vvdw, 112
.set nb010_p_tabscale, 120
.set nb010_VFtab, 128
.set nb010_invsqrta, 136
.set nb010_dvda, 144
.set nb010_p_gbtabscale, 152
.set nb010_GBtab, 160
.set nb010_p_nthreads, 168
.set nb010_count, 176
.set nb010_mtx, 184
.set nb010_outeriter, 192
.set nb010_inneriter, 200
.set nb010_work, 208
## The mutex (last arg) is not used in assembly.
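
## Argument note (System V AMD64 ABI): the first six integer args arrive in
## rdi, rsi, rdx, rcx, r8 and r9; the rest are on the caller's stack. After
## "push %rbp; movq %rsp,%rbp" the saved rbp and return address occupy 16
## bytes, so the 7th argument (fshift) sits at 16(%rbp), the 8th (gid) at
## 24(%rbp), and so on -- which is exactly what the offsets above encode.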
## stack offsets for local variables
## bottom of stack is cache-aligned for sse use
.set nb010_ix, 0
.set nb010_iy, 16
.set nb010_iz, 32
.set nb010_dx, 48
.set nb010_dy, 64
.set nb010_dz, 80
.set nb010_two, 96
.set nb010_c6, 112
.set nb010_c12, 128
.set nb010_six, 144
.set nb010_twelve, 160
.set nb010_Vvdwtot, 176
.set nb010_fix, 192
.set nb010_fiy, 208
.set nb010_fiz, 224
.set nb010_half, 240
.set nb010_three, 256
.set nb010_nri, 272
.set nb010_iinr, 280
.set nb010_jindex, 288
.set nb010_jjnr, 296
.set nb010_shift, 304
.set nb010_shiftvec, 312
.set nb010_facel, 320
.set nb010_innerjjnr, 328
.set nb010_is3, 336
.set nb010_ii3, 340
.set nb010_ntia, 344
.set nb010_innerk, 348
.set nb010_n, 352
.set nb010_nn1, 356
.set nb010_ntype, 360
.set nb010_nouter, 364
.set nb010_ninner, 368

        push %rbp
        movq %rsp,%rbp
        push %rbx

        push %r12
        push %r13
        push %r14
        push %r15

        emms
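
## emms empties the x87/MMX state; the nf kernel further down parks integer
## temporaries in mm0-mm3, so these kernels bracket their work with emms to
## keep the FPU tag word clean.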
        subq $392,%rsp          ## local variable stack space (n*16+8)
## zero 32-bit iteration counters
        movl $0,%eax
        movl %eax,nb010_nouter(%rsp)
        movl %eax,nb010_ninner(%rsp)

        movl (%rdi),%edi
        movl %edi,nb010_nri(%rsp)
        movq %rsi,nb010_iinr(%rsp)
        movq %rdx,nb010_jindex(%rsp)
        movq %rcx,nb010_jjnr(%rsp)
        movq %r8,nb010_shift(%rsp)
        movq %r9,nb010_shiftvec(%rsp)
        movq nb010_p_ntype(%rbp),%rdi
        movl (%rdi),%edi
        movl %edi,nb010_ntype(%rsp)

## create constant floating-point factors on stack
        movl $0x40000000,%eax   ## 2.0 in IEEE (hex)
        movl %eax,nb010_two(%rsp)
        movss nb010_two(%rsp),%xmm1
        shufps $0,%xmm1,%xmm1   ## splat to all elements
        movaps %xmm1,%xmm2
        addps %xmm1,%xmm2       ## 4.0
        addps %xmm1,%xmm2       ## 6.0
        movaps %xmm2,%xmm3
        addps %xmm3,%xmm3       ## 12.0
        movaps %xmm1,nb010_two(%rsp)
        movaps %xmm2,nb010_six(%rsp)
        movaps %xmm3,nb010_twelve(%rsp)
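
## Constant-construction note: 0x40000000 is the IEEE-754 single-precision
## bit pattern of 2.0f. Writing it as an integer and splatting with
## shufps $0 gives {2,2,2,2}; 6.0 and 12.0 are then built with packed adds,
## avoiding any loads from a .data section.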

_nb_kernel010_x86_64_sse.nb010_threadloop:
        movq nb010_count(%rbp),%rsi        ## pointer to sync counter
        movl (%rsi),%eax
_nb_kernel010_x86_64_sse.nb010_spinlock:
        movl %eax,%ebx                     ## ebx=*count=nn0
        addl $1,%ebx                       ## ebx=nn1=nn0+1
        lock
        cmpxchgl %ebx,(%rsi)               ## write nn1 to *counter,
                                           ## if it hasn't changed,
                                           ## or reread *counter to eax.
        pause                              ## -> better p4 performance
        jnz _nb_kernel010_x86_64_sse.nb010_spinlock
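
## The loop above is an atomic fetch-and-increment; in C it is roughly
##   do { nn0 = *count; } while (!CAS(count, nn0, nn0+1));
## (CAS standing in for lock cmpxchg), so each thread claims one outer
## list index at a time.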

## if(nn1>nri) nn1=nri
        movl nb010_nri(%rsp),%ecx
        movl %ecx,%edx
        subl %ebx,%ecx
        cmovlel %edx,%ebx                  ## if(nn1>nri) nn1=nri
## Cleared the spinlock if we got here.
## eax contains nn0, ebx contains nn1.
        movl %eax,nb010_n(%rsp)
        movl %ebx,nb010_nn1(%rsp)
        subl %eax,%ebx                     ## calc number of outer lists
        movl %eax,%esi                     ## copy n to esi
        jg _nb_kernel010_x86_64_sse.nb010_outerstart
        jmp _nb_kernel010_x86_64_sse.nb010_end

_nb_kernel010_x86_64_sse.nb010_outerstart:
## ebx contains number of outer iterations
        addl nb010_nouter(%rsp),%ebx
        movl %ebx,nb010_nouter(%rsp)

_nb_kernel010_x86_64_sse.nb010_outer:
        movq nb010_shift(%rsp),%rax        ## rax = base of shift[]
        movl (%rax,%rsi,4),%ebx            ## ebx=shift[n]

        lea (%rbx,%rbx,2),%rbx             ## rbx=3*is
        movl %ebx,nb010_is3(%rsp)          ## store is3

        movq nb010_shiftvec(%rsp),%rax     ## rax = base of shiftvec[]

        movss (%rax,%rbx,4),%xmm10
        movss 4(%rax,%rbx,4),%xmm11
        movss 8(%rax,%rbx,4),%xmm12

        movq nb010_iinr(%rsp),%rcx         ## rcx = base of iinr[]
        movl (%rcx,%rsi,4),%ebx            ## ebx = ii

        movq nb010_type(%rbp),%rdx
        movl (%rdx,%rbx,4),%edx
        imull nb010_ntype(%rsp),%edx
        shll %edx
        movl %edx,nb010_ntia(%rsp)

        lea (%rbx,%rbx,2),%rbx             ## rbx = 3*ii=ii3
        movq nb010_pos(%rbp),%rax          ## rax = base of pos[]

        addss (%rax,%rbx,4),%xmm10
        addss 4(%rax,%rbx,4),%xmm11
        addss 8(%rax,%rbx,4),%xmm12

        shufps $0,%xmm10,%xmm10
        shufps $0,%xmm11,%xmm11
        shufps $0,%xmm12,%xmm12

        movaps %xmm10,nb010_ix(%rsp)
        movaps %xmm11,nb010_iy(%rsp)
        movaps %xmm12,nb010_iz(%rsp)

        movl %ebx,nb010_ii3(%rsp)

## clear vvdwtot (xmm12) and i forces (xmm13-xmm15)
        xorps %xmm12,%xmm12
        movaps %xmm12,%xmm13
        movaps %xmm12,%xmm14
        movaps %xmm12,%xmm15

        movq nb010_jindex(%rsp),%rax
        movl (%rax,%rsi,4),%ecx            ## jindex[n]
        movl 4(%rax,%rsi,4),%edx           ## jindex[n+1]
        subl %ecx,%edx                     ## number of innerloop atoms

        movq nb010_jjnr(%rsp),%rax
        shll $2,%ecx
        addq %rcx,%rax
        movq %rax,nb010_innerjjnr(%rsp)    ## pointer to jjnr[nj0]
        movl %edx,%ecx
        subl $4,%edx
        addl nb010_ninner(%rsp),%ecx
        movl %ecx,nb010_ninner(%rsp)
        addl $0,%edx
        movl %edx,nb010_innerk(%rsp)       ## number of innerloop atoms

        jge _nb_kernel010_x86_64_sse.nb010_unroll_loop
        jmp _nb_kernel010_x86_64_sse.nb010_finish_inner
_nb_kernel010_x86_64_sse.nb010_unroll_loop:
## quad-unrolled innerloop here
        movq nb010_innerjjnr(%rsp),%rdx    ## pointer to jjnr[k]
        movl (%rdx),%eax
        movl 4(%rdx),%ebx
        movl 8(%rdx),%ecx
        movl 12(%rdx),%edx                 ## eax-edx=jnr1-4

        addq $16,nb010_innerjjnr(%rsp)     ## advance pointer (unrolled 4)

        lea (%rax,%rax,2),%r8              ## replace jnr with j3
        lea (%rbx,%rbx,2),%r9
        lea (%rcx,%rcx,2),%r10
        lea (%rdx,%rdx,2),%r11

        movq nb010_pos(%rbp),%rdi
## load coordinates
        movlps (%rdi,%r8,4),%xmm1          ## x1 y1 - -
        movlps (%rdi,%r10,4),%xmm2         ## x3 y3 - -
        movhps (%rdi,%r9,4),%xmm1          ## x1 y1 x2 y2
        movhps (%rdi,%r11,4),%xmm2         ## x3 y3 x4 y4

        movss 8(%rdi,%r8,4),%xmm5          ## z1 - - -
        movss 8(%rdi,%r10,4),%xmm6         ## z3 - - -
        movss 8(%rdi,%r9,4),%xmm7          ## z2 - - -
        movss 8(%rdi,%r11,4),%xmm8         ## z4 - - -
        movlhps %xmm7,%xmm5                ## z1 - z2 -
        movlhps %xmm8,%xmm6                ## z3 - z4 -

        movq nb010_type(%rbp),%rsi

        movaps %xmm1,%xmm4
        unpcklps %xmm2,%xmm1    ## x1 x3 y1 y3
        unpckhps %xmm2,%xmm4    ## x2 x4 y2 y4
        movaps %xmm1,%xmm2
        unpcklps %xmm4,%xmm1    ## x1 x2 x3 x4
        unpckhps %xmm4,%xmm2    ## y1 y2 y3 y4
        shufps $136,%xmm6,%xmm5 ## 10001000 => z1 z2 z3 z4
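
## The unpck/shuf sequence above is the usual 4x4 AoS->SoA transpose:
## xmm1 = {x1 x2 x3 x4}, xmm2 = {y1 y2 y3 y4}, xmm5 = {z1 z2 z3 z4},
## so each SSE op below handles the same component of all four j atoms.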

## load vdw types
        movl (%rsi,%rax,4),%r12d
        movl (%rsi,%rbx,4),%r13d
        movl (%rsi,%rcx,4),%r14d
        movl (%rsi,%rdx,4),%r15d

## calc dr
        subps nb010_ix(%rsp),%xmm1
        subps nb010_iy(%rsp),%xmm2
        subps nb010_iz(%rsp),%xmm5

## store dr in xmm9-xmm11
        movaps %xmm1,%xmm9
        movaps %xmm2,%xmm10
        movaps %xmm5,%xmm11

## type *= 2
        shll %r12d
        shll %r13d
        shll %r14d
        shll %r15d

## square it
        mulps %xmm1,%xmm1
        mulps %xmm2,%xmm2
        mulps %xmm5,%xmm5
        addps %xmm2,%xmm1
        addps %xmm5,%xmm1
## rsq in xmm1

## 2*type + ntia = index into vdwparam
        movl nb010_ntia(%rsp),%edi
        addl %edi,%r12d
        addl %edi,%r13d
        addl %edi,%r14d
        addl %edi,%r15d

        movq nb010_vdwparam(%rbp),%rsi
## the c6/c12 pairs are gathered into xmm7/xmm8 below,
## then shuffled so that xmm5=c6 and xmm7=c12

        rcpps %xmm1,%xmm5
## 1/x lookup seed in xmm5
        movaps nb010_two(%rsp),%xmm6
        mulps %xmm5,%xmm1
## load c6/c12
        movlps (%rsi,%r12,4),%xmm7
        movlps (%rsi,%r14,4),%xmm8

        subps %xmm1,%xmm6
        mulps %xmm5,%xmm6       ## xmm6=rinvsq
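
## Newton-Raphson step for the reciprocal, interleaved with the c6/c12
## loads: with the ~12-bit seed s = rcpps(rsq), computing s*(2 - rsq*s)
## roughly doubles the accuracy of 1/rsq. That is the mulps/subps/mulps
## sequence above, using the 2.0 constant stored earlier.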

        movaps %xmm6,%xmm4      ## rinvsq

        movhps (%rsi,%r13,4),%xmm7
        movhps (%rsi,%r15,4),%xmm8

        movaps %xmm6,%xmm1
        mulps %xmm6,%xmm1       ## rinv4
        mulps %xmm6,%xmm1       ## rinv6
        movaps %xmm1,%xmm2
        mulps %xmm2,%xmm2       ## xmm2=rinv12

## shuffle c6/c12
        movaps %xmm7,%xmm5
        shufps $136,%xmm8,%xmm5 ## 10001000
        shufps $221,%xmm8,%xmm7 ## 11011101

        movq nb010_faction(%rbp),%rsi

        mulps %xmm5,%xmm1       ## c6*rinv6
        mulps %xmm7,%xmm2       ## c12*rinv12
        movaps %xmm2,%xmm5
        subps %xmm1,%xmm5       ## Vvdw=Vvdw12-Vvdw6
        mulps nb010_six(%rsp),%xmm1
        mulps nb010_twelve(%rsp),%xmm2
        subps %xmm1,%xmm2
        mulps %xmm2,%xmm4       ## xmm4=total fscal
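
## LJ recap (what the block above computes per pair):
##   Vvdw  = c12*rinv12 - c6*rinv6
##   fscal = (12*c12*rinv12 - 6*c6*rinv6)*rinvsq
## so fscal*dx etc. below gives the force components along rj-ri.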

## the fj's - start by accumulating x & y forces from memory
        movlps (%rsi,%r8,4),%xmm0          ## x1 y1 - -
        movlps (%rsi,%r10,4),%xmm1         ## x3 y3 - -
        movhps (%rsi,%r9,4),%xmm0          ## x1 y1 x2 y2
        movhps (%rsi,%r11,4),%xmm1         ## x3 y3 x4 y4

## add potential to Vvdwtot (sum in xmm12)
        addps %xmm5,%xmm12

## calculate scalar force by multiplying dx/dy/dz with fscal
        mulps %xmm4,%xmm9
        mulps %xmm4,%xmm10
        mulps %xmm4,%xmm11

## xmm9-xmm11 contain tx-tz (partial force)
## accumulate i forces
        addps %xmm9,%xmm13
        addps %xmm10,%xmm14
        addps %xmm11,%xmm15

## permute local forces
        movaps %xmm9,%xmm8
        unpcklps %xmm10,%xmm9   ## x1 y1 x2 y2
        unpckhps %xmm10,%xmm8   ## x3 y3 x4 y4

## xmm11: fjz1 fjz2 fjz3 fjz4
        pshufd $1,%xmm11,%xmm5  ## fjz2 - - -
        movhlps %xmm11,%xmm4    ## fjz3 - - -
        pshufd $3,%xmm11,%xmm3  ## fjz4 - - -

## update fjx and fjy
        addps %xmm9,%xmm0
        addps %xmm8,%xmm1

        movlps %xmm0,(%rsi,%r8,4)
        movlps %xmm1,(%rsi,%r10,4)
        movhps %xmm0,(%rsi,%r9,4)
        movhps %xmm1,(%rsi,%r11,4)

        addss 8(%rsi,%r8,4),%xmm11
        addss 8(%rsi,%r9,4),%xmm5
        addss 8(%rsi,%r10,4),%xmm4
        addss 8(%rsi,%r11,4),%xmm3
        movss %xmm11,8(%rsi,%r8,4)
        movss %xmm5,8(%rsi,%r9,4)
        movss %xmm4,8(%rsi,%r10,4)
        movss %xmm3,8(%rsi,%r11,4)

## should we do one more iteration?
        subl $4,nb010_innerk(%rsp)
        jl _nb_kernel010_x86_64_sse.nb010_finish_inner
        jmp _nb_kernel010_x86_64_sse.nb010_unroll_loop
_nb_kernel010_x86_64_sse.nb010_finish_inner:
## check if at least two particles remain
        addl $4,nb010_innerk(%rsp)
        movl nb010_innerk(%rsp),%edx
        andl $2,%edx
        jnz _nb_kernel010_x86_64_sse.nb010_dopair
        jmp _nb_kernel010_x86_64_sse.nb010_checksingle
_nb_kernel010_x86_64_sse.nb010_dopair:
## twice-unrolled innerloop here
        movq nb010_innerjjnr(%rsp),%rdx    ## pointer to jjnr[k]
        movl (%rdx),%eax
        movl 4(%rdx),%ebx

        addq $8,nb010_innerjjnr(%rsp)      ## advance pointer (unrolled 2)

        movq nb010_type(%rbp),%rsi
        movl (%rsi,%rax,4),%r12d
        movl (%rsi,%rbx,4),%r13d
        shll %r12d
        shll %r13d
        movl nb010_ntia(%rsp),%edi
        addl %edi,%r12d
        addl %edi,%r13d

        movq nb010_vdwparam(%rbp),%rsi
        movlps (%rsi,%r12,4),%xmm3
        movhps (%rsi,%r13,4),%xmm3

        xorps %xmm7,%xmm7
        movaps %xmm3,%xmm0
        shufps $136,%xmm7,%xmm0 ## 10001000
        shufps $221,%xmm7,%xmm3 ## 11011101

## xmm0=c6
## xmm3=c12
        lea (%rax,%rax,2),%rax  ## replace jnr with j3
        lea (%rbx,%rbx,2),%rbx

        movq nb010_pos(%rbp),%rdi
## load coordinates
        movlps (%rdi,%rax,4),%xmm1 ## x1 y1 - -
        movlps (%rdi,%rbx,4),%xmm4 ## x2 y2 - -

        movss 8(%rdi,%rax,4),%xmm5 ## z1 - - -
        movss 8(%rdi,%rbx,4),%xmm7 ## z2 - - -

        unpcklps %xmm4,%xmm1    ## x1 x2 y1 y2
        movhlps %xmm1,%xmm2     ## y1 y2 - -
        unpcklps %xmm7,%xmm5    ## z1 z2 - -

## calc dr
        subps nb010_ix(%rsp),%xmm1
        subps nb010_iy(%rsp),%xmm2
        subps nb010_iz(%rsp),%xmm5

## store dr in xmm9-xmm11
        movaps %xmm1,%xmm9
        movaps %xmm2,%xmm10
        movaps %xmm5,%xmm11

## square it
        mulps %xmm1,%xmm1
        mulps %xmm2,%xmm2
        mulps %xmm5,%xmm5
        addps %xmm2,%xmm1
        addps %xmm5,%xmm1
## rsq in xmm1

        rcpps %xmm1,%xmm5
## 1/x lookup seed in xmm5
        movaps nb010_two(%rsp),%xmm6
        mulps %xmm5,%xmm1
        subps %xmm1,%xmm6
        mulps %xmm5,%xmm6       ## xmm6=rinvsq

        movaps %xmm6,%xmm4      ## rinvsq

        movaps %xmm6,%xmm1
        mulps %xmm6,%xmm1       ## rinv4
        mulps %xmm6,%xmm1       ## rinv6
        movaps %xmm1,%xmm2
        mulps %xmm2,%xmm2       ## xmm2=rinv12

        mulps %xmm0,%xmm1
        mulps %xmm3,%xmm2
        movaps %xmm2,%xmm5
        subps %xmm1,%xmm5       ## Vvdw=Vvdw12-Vvdw6
        mulps nb010_six(%rsp),%xmm1
        mulps nb010_twelve(%rsp),%xmm2
        subps %xmm1,%xmm2
        mulps %xmm2,%xmm4       ## xmm4=total fscal

        xorps %xmm7,%xmm7
        movlhps %xmm7,%xmm5

## add potential to Vvdwtot (sum in xmm12)
        addps %xmm5,%xmm12

## calculate scalar force by multiplying dx/dy/dz with fscal
        mulps %xmm4,%xmm9
        mulps %xmm4,%xmm10
        mulps %xmm4,%xmm11

        movlhps %xmm7,%xmm9
        movlhps %xmm7,%xmm10
        movlhps %xmm7,%xmm11
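
## Only two j atoms are valid here, so movlhps with the zeroed xmm7 clears
## the upper two lanes of the potential and of tx-tz before they enter the
## packed accumulators, keeping garbage lanes out of the sums.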

## xmm9-xmm11 contain tx-tz (partial force)
## accumulate i forces
        addps %xmm9,%xmm13
        addps %xmm10,%xmm14
        addps %xmm11,%xmm15

        movq nb010_faction(%rbp),%rsi
## the fj's - start by accumulating x & y forces from memory
        movlps (%rsi,%rax,4),%xmm0 ## x1 y1 - -
        movhps (%rsi,%rbx,4),%xmm0 ## x1 y1 x2 y2

        unpcklps %xmm10,%xmm9   ## x1 y1 x2 y2
        addps %xmm9,%xmm0

        movlps %xmm0,(%rsi,%rax,4)
        movhps %xmm0,(%rsi,%rbx,4)

## z forces
        pshufd $1,%xmm11,%xmm8
        addss 8(%rsi,%rax,4),%xmm11
        addss 8(%rsi,%rbx,4),%xmm8
        movss %xmm11,8(%rsi,%rax,4)
        movss %xmm8,8(%rsi,%rbx,4)

_nb_kernel010_x86_64_sse.nb010_checksingle:
        movl nb010_innerk(%rsp),%edx
        andl $1,%edx
        jnz _nb_kernel010_x86_64_sse.nb010_dosingle
        jmp _nb_kernel010_x86_64_sse.nb010_updateouterdata

_nb_kernel010_x86_64_sse.nb010_dosingle:
        movq nb010_innerjjnr(%rsp),%rcx
        movl (%rcx),%eax

        movq nb010_type(%rbp),%rsi
        movl (%rsi,%rax,4),%r12d
        shll %r12d
        movl nb010_ntia(%rsp),%edi
        addl %edi,%r12d

        movq nb010_vdwparam(%rbp),%rsi
        movss (%rsi,%r12,4),%xmm0
        movss 4(%rsi,%r12,4),%xmm3

## xmm0=c6
## xmm3=c12
        lea (%rax,%rax,2),%rax  ## replace jnr with j3

        movq nb010_pos(%rbp),%rdi
## load coordinates
        movss (%rdi,%rax,4),%xmm1
        movss 4(%rdi,%rax,4),%xmm2
        movss 8(%rdi,%rax,4),%xmm5

## calc dr
        subss nb010_ix(%rsp),%xmm1
        subss nb010_iy(%rsp),%xmm2
        subss nb010_iz(%rsp),%xmm5

## store dr in xmm9-xmm11
        movaps %xmm1,%xmm9
        movaps %xmm2,%xmm10
        movaps %xmm5,%xmm11

## square it
        mulss %xmm1,%xmm1
        mulss %xmm2,%xmm2
        mulss %xmm5,%xmm5
        addss %xmm2,%xmm1
        addss %xmm5,%xmm1
## rsq in xmm1

        rcpss %xmm1,%xmm5
## 1/x lookup seed in xmm5
        movaps nb010_two(%rsp),%xmm6
        mulss %xmm5,%xmm1
        subss %xmm1,%xmm6
        mulss %xmm5,%xmm6       ## xmm6=rinvsq

        movaps %xmm6,%xmm4      ## rinvsq

        movaps %xmm6,%xmm1
        mulss %xmm6,%xmm1       ## rinv4
        mulss %xmm6,%xmm1       ## rinv6
        movaps %xmm1,%xmm2
        mulss %xmm2,%xmm2       ## xmm2=rinv12

        mulss %xmm0,%xmm1
        mulss %xmm3,%xmm2
        movaps %xmm2,%xmm5
        subss %xmm1,%xmm5       ## Vvdw=Vvdw12-Vvdw6
        mulss nb010_six(%rsp),%xmm1
        mulss nb010_twelve(%rsp),%xmm2
        subss %xmm1,%xmm2
        mulss %xmm2,%xmm4       ## xmm4=total fscal

## add potential to Vvdwtot (sum in xmm12)
        addss %xmm5,%xmm12

## calculate scalar force by multiplying dx/dy/dz with fscal
        mulss %xmm4,%xmm9
        mulss %xmm4,%xmm10
        mulss %xmm4,%xmm11

## xmm9-xmm11 contain tx-tz (partial force)
## accumulate i forces
        addss %xmm9,%xmm13
        addss %xmm10,%xmm14
        addss %xmm11,%xmm15

        movq nb010_faction(%rbp),%rsi
## add to j forces
        addss (%rsi,%rax,4),%xmm9
        addss 4(%rsi,%rax,4),%xmm10
        addss 8(%rsi,%rax,4),%xmm11
        movss %xmm9,(%rsi,%rax,4)
        movss %xmm10,4(%rsi,%rax,4)
        movss %xmm11,8(%rsi,%rax,4)

_nb_kernel010_x86_64_sse.nb010_updateouterdata:
        movl nb010_ii3(%rsp),%ecx
        movq nb010_faction(%rbp),%rdi
        movq nb010_fshift(%rbp),%rsi
        movl nb010_is3(%rsp),%edx

## accumulate i forces in xmm13, xmm14, xmm15
        movhlps %xmm13,%xmm0
        movhlps %xmm14,%xmm1
        movhlps %xmm15,%xmm2
        addps %xmm13,%xmm0
        addps %xmm14,%xmm1
        addps %xmm15,%xmm2
        movaps %xmm0,%xmm3
        movaps %xmm1,%xmm4
        movaps %xmm2,%xmm5
        shufps $1,%xmm3,%xmm3
        shufps $1,%xmm4,%xmm4
        shufps $1,%xmm5,%xmm5
        addss %xmm3,%xmm0
        addss %xmm4,%xmm1
        addss %xmm5,%xmm2       ## xmm0-xmm2 has single force in pos0
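
## Horizontal-reduction pattern: movhlps folds lanes 2-3 onto lanes 0-1,
## shufps $1 then exposes lane 1, so two adds collapse four packed floats
## into one scalar in lane 0 of each register.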

## update i force (dr was computed as rj-ri, so the sum is subtracted)
        movss (%rdi,%rcx,4),%xmm3
        movss 4(%rdi,%rcx,4),%xmm4
        movss 8(%rdi,%rcx,4),%xmm5
        subss %xmm0,%xmm3
        subss %xmm1,%xmm4
        subss %xmm2,%xmm5
        movss %xmm3,(%rdi,%rcx,4)
        movss %xmm4,4(%rdi,%rcx,4)
        movss %xmm5,8(%rdi,%rcx,4)

## update fshift force
        movss (%rsi,%rdx,4),%xmm3
        movss 4(%rsi,%rdx,4),%xmm4
        movss 8(%rsi,%rdx,4),%xmm5
        subss %xmm0,%xmm3
        subss %xmm1,%xmm4
        subss %xmm2,%xmm5
        movss %xmm3,(%rsi,%rdx,4)
        movss %xmm4,4(%rsi,%rdx,4)
        movss %xmm5,8(%rsi,%rdx,4)

## get n from stack
        movl nb010_n(%rsp),%esi
## get group index for i particle
        movq nb010_gid(%rbp),%rdx          ## base of gid[]
        movl (%rdx,%rsi,4),%edx            ## ggid=gid[n]

## accumulate total potential energy and update it
        movhlps %xmm12,%xmm6
        addps %xmm6,%xmm12      ## pos 0-1 in xmm12 have the sum now
        movaps %xmm12,%xmm6
        shufps $1,%xmm6,%xmm6
        addss %xmm6,%xmm12

## add earlier value from mem
        movq nb010_Vvdw(%rbp),%rax
        addss (%rax,%rdx,4),%xmm12
## move back to mem
        movss %xmm12,(%rax,%rdx,4)

## finish if last
        movl nb010_nn1(%rsp),%ecx
## esi already loaded with n
        incl %esi
        subl %esi,%ecx
        jz _nb_kernel010_x86_64_sse.nb010_outerend

## not last, iterate outer loop once more!
        movl %esi,nb010_n(%rsp)
        jmp _nb_kernel010_x86_64_sse.nb010_outer
_nb_kernel010_x86_64_sse.nb010_outerend:
## check if more outer neighborlists remain
        movl nb010_nri(%rsp),%ecx
## esi already loaded with n above
        subl %esi,%ecx
        jz _nb_kernel010_x86_64_sse.nb010_end
## non-zero, do one more workunit
        jmp _nb_kernel010_x86_64_sse.nb010_threadloop
_nb_kernel010_x86_64_sse.nb010_end:
        emms

        movl nb010_nouter(%rsp),%eax
        movl nb010_ninner(%rsp),%ebx
        movq nb010_outeriter(%rbp),%rcx
        movq nb010_inneriter(%rbp),%rdx
        movl %eax,(%rcx)
        movl %ebx,(%rdx)

        addq $392,%rsp

        pop %r15
        pop %r14
        pop %r13
        pop %r12

        pop %rbx
        pop %rbp
        ret


## nb010nf - forces are not calculated
.globl nb_kernel010nf_x86_64_sse
.globl _nb_kernel010nf_x86_64_sse
nb_kernel010nf_x86_64_sse:
_nb_kernel010nf_x86_64_sse:
## Room for return address and rbp (16 bytes)
.set nb010nf_fshift, 16
.set nb010nf_gid, 24
.set nb010nf_pos, 32
.set nb010nf_faction, 40
.set nb010nf_charge, 48
.set nb010nf_p_facel, 56
.set nb010nf_argkrf, 64
.set nb010nf_argcrf, 72
.set nb010nf_Vc, 80
.set nb010nf_type, 88
.set nb010nf_p_ntype, 96
.set nb010nf_vdwparam, 104
.set nb010nf_Vvdw, 112
.set nb010nf_p_tabscale, 120
.set nb010nf_VFtab, 128
.set nb010nf_invsqrta, 136
.set nb010nf_dvda, 144
.set nb010nf_p_gbtabscale, 152
.set nb010nf_GBtab, 160
.set nb010nf_p_nthreads, 168
.set nb010nf_count, 176
.set nb010nf_mtx, 184
.set nb010nf_outeriter, 192
.set nb010nf_inneriter, 200
.set nb010nf_work, 208
## The mutex (last arg) is not used in assembly.
## stack offsets for local variables
## bottom of stack is cache-aligned for sse use
.set nb010nf_ix, 0
.set nb010nf_iy, 16
.set nb010nf_iz, 32
.set nb010nf_two, 48
.set nb010nf_c6, 64
.set nb010nf_c12, 80
.set nb010nf_Vvdwtot, 96
.set nb010nf_half, 112
.set nb010nf_three, 128
.set nb010nf_nri, 144
.set nb010nf_iinr, 152
.set nb010nf_jindex, 160
.set nb010nf_jjnr, 168
.set nb010nf_shift, 176
.set nb010nf_shiftvec, 184
.set nb010nf_innerjjnr, 192
.set nb010nf_facel, 200
.set nb010nf_ntia, 208
.set nb010nf_innerk, 216
.set nb010nf_is3, 220
.set nb010nf_ii3, 224
.set nb010nf_n, 228
.set nb010nf_nn1, 232
.set nb010nf_ntype, 236
.set nb010nf_nouter, 240
.set nb010nf_ninner, 244

        push %rbp
        movq %rsp,%rbp
        push %rbx

        subq $264,%rsp          ## local variable stack space (n*16+8)
        emms

## zero 32-bit iteration counters
        movl $0,%eax
        movl %eax,nb010nf_nouter(%rsp)
        movl %eax,nb010nf_ninner(%rsp)

        movl (%rdi),%edi
        movl %edi,nb010nf_nri(%rsp)
        movq %rsi,nb010nf_iinr(%rsp)
        movq %rdx,nb010nf_jindex(%rsp)
        movq %rcx,nb010nf_jjnr(%rsp)
        movq %r8,nb010nf_shift(%rsp)
        movq %r9,nb010nf_shiftvec(%rsp)
        movq nb010nf_p_ntype(%rbp),%rdi
        movl (%rdi),%edi
        movl %edi,nb010nf_ntype(%rsp)

## create constant floating-point factors on stack
        movl $0x40000000,%eax   ## 2.0 in IEEE (hex)
        movl %eax,nb010nf_two(%rsp)
        movss nb010nf_two(%rsp),%xmm1
        shufps $0,%xmm1,%xmm1   ## splat to all elements
        movaps %xmm1,nb010nf_two(%rsp)

_nb_kernel010nf_x86_64_sse.nb010nf_threadloop:
        movq nb010nf_count(%rbp),%rsi      ## pointer to sync counter
        movl (%rsi),%eax
_nb_kernel010nf_x86_64_sse.nb010nf_spinlock:
        movl %eax,%ebx                     ## ebx=*count=nn0
        addl $1,%ebx                       ## ebx=nn1=nn0+1
        lock
        cmpxchgl %ebx,(%rsi)               ## write nn1 to *counter,
                                           ## if it hasn't changed,
                                           ## or reread *counter to eax.
        pause                              ## -> better p4 performance
        jnz _nb_kernel010nf_x86_64_sse.nb010nf_spinlock

## if(nn1>nri) nn1=nri
        movl nb010nf_nri(%rsp),%ecx
        movl %ecx,%edx
        subl %ebx,%ecx
        cmovlel %edx,%ebx                  ## if(nn1>nri) nn1=nri
## Cleared the spinlock if we got here.
## eax contains nn0, ebx contains nn1.
        movl %eax,nb010nf_n(%rsp)
        movl %ebx,nb010nf_nn1(%rsp)
        subl %eax,%ebx                     ## calc number of outer lists
        movl %eax,%esi                     ## copy n to esi
        jg _nb_kernel010nf_x86_64_sse.nb010nf_outerstart
        jmp _nb_kernel010nf_x86_64_sse.nb010nf_end

_nb_kernel010nf_x86_64_sse.nb010nf_outerstart:
## ebx contains number of outer iterations
        addl nb010nf_nouter(%rsp),%ebx
        movl %ebx,nb010nf_nouter(%rsp)

_nb_kernel010nf_x86_64_sse.nb010nf_outer:
        movq nb010nf_shift(%rsp),%rax      ## rax = base of shift[]
        movl (%rax,%rsi,4),%ebx            ## ebx=shift[n]

        lea (%rbx,%rbx,2),%rbx             ## rbx=3*is
        movl %ebx,nb010nf_is3(%rsp)        ## store is3

        movq nb010nf_shiftvec(%rsp),%rax   ## rax = base of shiftvec[]

        movss (%rax,%rbx,4),%xmm0
        movss 4(%rax,%rbx,4),%xmm1
        movss 8(%rax,%rbx,4),%xmm2

        movq nb010nf_iinr(%rsp),%rcx       ## rcx = base of iinr[]
        movl (%rcx,%rsi,4),%ebx            ## ebx = ii

        movq nb010nf_type(%rbp),%rdx
        movl (%rdx,%rbx,4),%edx
        imull nb010nf_ntype(%rsp),%edx
        shll %edx
        movl %edx,nb010nf_ntia(%rsp)

        lea (%rbx,%rbx,2),%rbx             ## rbx = 3*ii=ii3
        movq nb010nf_pos(%rbp),%rax        ## rax = base of pos[]

        addss (%rax,%rbx,4),%xmm0
        addss 4(%rax,%rbx,4),%xmm1
        addss 8(%rax,%rbx,4),%xmm2

        shufps $0,%xmm0,%xmm0
        shufps $0,%xmm1,%xmm1
        shufps $0,%xmm2,%xmm2

        movaps %xmm0,nb010nf_ix(%rsp)
        movaps %xmm1,nb010nf_iy(%rsp)
        movaps %xmm2,nb010nf_iz(%rsp)

        movl %ebx,nb010nf_ii3(%rsp)

## clear Vvdwtot (no forces in this kernel)
        xorps %xmm4,%xmm4
        movaps %xmm4,nb010nf_Vvdwtot(%rsp)

        movq nb010nf_jindex(%rsp),%rax
        movl (%rax,%rsi,4),%ecx            ## jindex[n]
        movl 4(%rax,%rsi,4),%edx           ## jindex[n+1]
        subl %ecx,%edx                     ## number of innerloop atoms

        movq nb010nf_pos(%rbp),%rsi
        movq nb010nf_jjnr(%rsp),%rax
        shll $2,%ecx
        addq %rcx,%rax
        movq %rax,nb010nf_innerjjnr(%rsp)  ## pointer to jjnr[nj0]
        movl %edx,%ecx
        subl $4,%edx
        addl nb010nf_ninner(%rsp),%ecx
        movl %ecx,nb010nf_ninner(%rsp)
        addl $0,%edx
        movl %edx,nb010nf_innerk(%rsp)     ## number of innerloop atoms

        jge _nb_kernel010nf_x86_64_sse.nb010nf_unroll_loop
        jmp _nb_kernel010nf_x86_64_sse.nb010nf_finish_inner
_nb_kernel010nf_x86_64_sse.nb010nf_unroll_loop:
## quad-unrolled innerloop here
        movq nb010nf_innerjjnr(%rsp),%rdx  ## pointer to jjnr[k]
        movl (%rdx),%eax
        movl 4(%rdx),%ebx
        movl 8(%rdx),%ecx
        movl 12(%rdx),%edx                 ## eax-edx=jnr1-4
## advance pointer (unrolled 4)
        addq $16,nb010nf_innerjjnr(%rsp)

        movd %eax,%mm0                     ## use mmx registers as temp storage
        movd %ebx,%mm1
        movd %ecx,%mm2
        movd %edx,%mm3
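
## The four j indices are parked in mm0-mm3 while eax-edx are reused for
## the type/vdwparam lookups; this nf kernel only saves rbx, so r12-r15
## are not free scratch here (unlike the force kernel above).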

        movq nb010nf_type(%rbp),%rsi
        movl (%rsi,%rax,4),%eax
        movl (%rsi,%rbx,4),%ebx
        movl (%rsi,%rcx,4),%ecx
        movl (%rsi,%rdx,4),%edx
        movq nb010nf_vdwparam(%rbp),%rsi
        shll %eax
        shll %ebx
        shll %ecx
        shll %edx
        movl nb010nf_ntia(%rsp),%edi
        addl %edi,%eax
        addl %edi,%ebx
        addl %edi,%ecx
        addl %edi,%edx

        movlps (%rsi,%rax,4),%xmm6
        movlps (%rsi,%rcx,4),%xmm7
        movhps (%rsi,%rbx,4),%xmm6
        movhps (%rsi,%rdx,4),%xmm7

        movaps %xmm6,%xmm4
        shufps $136,%xmm7,%xmm4 ## 10001000
        shufps $221,%xmm7,%xmm6 ## 11011101

        movd %mm0,%eax
        movd %mm1,%ebx
        movd %mm2,%ecx
        movd %mm3,%edx

        movaps %xmm4,nb010nf_c6(%rsp)
        movaps %xmm6,nb010nf_c12(%rsp)

        movq nb010nf_pos(%rbp),%rsi        ## base of pos[]

        lea (%rax,%rax,2),%rax             ## replace jnr with j3
        lea (%rbx,%rbx,2),%rbx

        mulps %xmm2,%xmm3
        lea (%rcx,%rcx,2),%rcx             ## replace jnr with j3
        lea (%rdx,%rdx,2),%rdx

## move four coordinates to xmm0-xmm2
        movlps (%rsi,%rax,4),%xmm4
        movlps (%rsi,%rcx,4),%xmm5
        movss 8(%rsi,%rax,4),%xmm2
        movss 8(%rsi,%rcx,4),%xmm6

        movhps (%rsi,%rbx,4),%xmm4
        movhps (%rsi,%rdx,4),%xmm5

        movss 8(%rsi,%rbx,4),%xmm0
        movss 8(%rsi,%rdx,4),%xmm1

        shufps $0,%xmm0,%xmm2
        shufps $0,%xmm1,%xmm6

        movaps %xmm4,%xmm0
        movaps %xmm4,%xmm1

        shufps $136,%xmm6,%xmm2 ## 10001000

        shufps $136,%xmm5,%xmm0 ## 10001000
        shufps $221,%xmm5,%xmm1 ## 11011101

## move ix-iz to xmm4-xmm6
        movaps nb010nf_ix(%rsp),%xmm4
        movaps nb010nf_iy(%rsp),%xmm5
        movaps nb010nf_iz(%rsp),%xmm6

## calc dr
        subps %xmm0,%xmm4
        subps %xmm1,%xmm5
        subps %xmm2,%xmm6

## square it
        mulps %xmm4,%xmm4
        mulps %xmm5,%xmm5
        mulps %xmm6,%xmm6
        addps %xmm5,%xmm4
        addps %xmm6,%xmm4
## rsq in xmm4

        rcpps %xmm4,%xmm5
## 1/x lookup seed in xmm5
        movaps nb010nf_two(%rsp),%xmm0
        mulps %xmm5,%xmm4
        subps %xmm4,%xmm0
        mulps %xmm5,%xmm0       ## xmm0=rinvsq
        movaps %xmm0,%xmm4

        movaps %xmm0,%xmm1
        mulps %xmm0,%xmm1
        mulps %xmm0,%xmm1       ## xmm1=rinvsix
        movaps %xmm1,%xmm2
        mulps %xmm2,%xmm2       ## xmm2=rinvtwelve

        mulps nb010nf_c6(%rsp),%xmm1
        mulps nb010nf_c12(%rsp),%xmm2
        movaps %xmm2,%xmm5
        subps %xmm1,%xmm5       ## Vvdw=Vvdw12-Vvdw6
        addps nb010nf_Vvdwtot(%rsp),%xmm5
        movaps %xmm5,nb010nf_Vvdwtot(%rsp)

## should we do one more iteration?
        subl $4,nb010nf_innerk(%rsp)
        jl _nb_kernel010nf_x86_64_sse.nb010nf_finish_inner
        jmp _nb_kernel010nf_x86_64_sse.nb010nf_unroll_loop
_nb_kernel010nf_x86_64_sse.nb010nf_finish_inner:
## check if at least two particles remain
        addl $4,nb010nf_innerk(%rsp)
        movl nb010nf_innerk(%rsp),%edx
        andl $2,%edx
        jnz _nb_kernel010nf_x86_64_sse.nb010nf_dopair
        jmp _nb_kernel010nf_x86_64_sse.nb010nf_checksingle
_nb_kernel010nf_x86_64_sse.nb010nf_dopair:
        movq nb010nf_innerjjnr(%rsp),%rcx

        movl (%rcx),%eax
        movl 4(%rcx),%ebx
        addq $8,nb010nf_innerjjnr(%rsp)

        movq nb010nf_type(%rbp),%rsi
        movl %eax,%ecx
        movl %ebx,%edx
        movl (%rsi,%rcx,4),%ecx
        movl (%rsi,%rdx,4),%edx
        movq nb010nf_vdwparam(%rbp),%rsi
        shll %ecx
        shll %edx
        movl nb010nf_ntia(%rsp),%edi
        addl %edi,%ecx
        addl %edi,%edx
        movlps (%rsi,%rcx,4),%xmm6
        movhps (%rsi,%rdx,4),%xmm6
        movq nb010nf_pos(%rbp),%rdi
        xorps %xmm7,%xmm7
        movaps %xmm6,%xmm4
        shufps $8,%xmm4,%xmm4   ## 00001000
        shufps $13,%xmm6,%xmm6  ## 00001101
        movlhps %xmm7,%xmm4
        movlhps %xmm7,%xmm6

        movaps %xmm4,nb010nf_c6(%rsp)
        movaps %xmm6,nb010nf_c12(%rsp)

        lea (%rax,%rax,2),%rax
        lea (%rbx,%rbx,2),%rbx
## move coordinates to xmm0-xmm2
        movlps (%rdi,%rax,4),%xmm1
        movss 8(%rdi,%rax,4),%xmm2
        movhps (%rdi,%rbx,4),%xmm1
        movss 8(%rdi,%rbx,4),%xmm0

        movlhps %xmm7,%xmm3

        shufps $0,%xmm0,%xmm2

        movaps %xmm1,%xmm0

        shufps $136,%xmm2,%xmm2 ## 10001000

        shufps $136,%xmm0,%xmm0 ## 10001000
        shufps $221,%xmm1,%xmm1 ## 11011101

## move nb010nf_ix-iz to xmm4-xmm6
        xorps %xmm7,%xmm7

        movaps nb010nf_ix(%rsp),%xmm4
        movaps nb010nf_iy(%rsp),%xmm5
        movaps nb010nf_iz(%rsp),%xmm6

## calc dr
        subps %xmm0,%xmm4
        subps %xmm1,%xmm5
        subps %xmm2,%xmm6

## square it
        mulps %xmm4,%xmm4
        mulps %xmm5,%xmm5
        mulps %xmm6,%xmm6
        addps %xmm5,%xmm4
        addps %xmm6,%xmm4
## rsq in xmm4

        rcpps %xmm4,%xmm5
## 1/x lookup seed in xmm5
        movaps nb010nf_two(%rsp),%xmm0
        mulps %xmm5,%xmm4
        subps %xmm4,%xmm0
        mulps %xmm5,%xmm0       ## xmm0=rinvsq
        movaps %xmm0,%xmm4

        movaps %xmm0,%xmm1
        mulps %xmm0,%xmm1
        mulps %xmm0,%xmm1       ## xmm1=rinvsix
        movaps %xmm1,%xmm2
        mulps %xmm2,%xmm2       ## xmm2=rinvtwelve

        mulps nb010nf_c6(%rsp),%xmm1
        mulps nb010nf_c12(%rsp),%xmm2
        movaps %xmm2,%xmm5
        subps %xmm1,%xmm5       ## Vvdw=Vvdw12-Vvdw6
        addps nb010nf_Vvdwtot(%rsp),%xmm5
        movaps %xmm5,nb010nf_Vvdwtot(%rsp)

_nb_kernel010nf_x86_64_sse.nb010nf_checksingle:
        movl nb010nf_innerk(%rsp),%edx
        andl $1,%edx
        jnz _nb_kernel010nf_x86_64_sse.nb010nf_dosingle
        jmp _nb_kernel010nf_x86_64_sse.nb010nf_updateouterdata
_nb_kernel010nf_x86_64_sse.nb010nf_dosingle:
        movq nb010nf_pos(%rbp),%rdi
        movq nb010nf_innerjjnr(%rsp),%rcx
        movl (%rcx),%eax

        movq nb010nf_type(%rbp),%rsi
        movl %eax,%ecx
        movl (%rsi,%rcx,4),%ecx
        movq nb010nf_vdwparam(%rbp),%rsi
        shll %ecx
        addl nb010nf_ntia(%rsp),%ecx
        xorps %xmm6,%xmm6
        movlps (%rsi,%rcx,4),%xmm6
        movaps %xmm6,%xmm4
        shufps $252,%xmm4,%xmm4 ## 11111100
        shufps $253,%xmm6,%xmm6 ## 11111101

        movaps %xmm4,nb010nf_c6(%rsp)
        movaps %xmm6,nb010nf_c12(%rsp)

        lea (%rax,%rax,2),%rax

## move coordinates to xmm0-xmm2
        movss (%rdi,%rax,4),%xmm0
        movss 4(%rdi,%rax,4),%xmm1
        movss 8(%rdi,%rax,4),%xmm2

        xorps %xmm7,%xmm7

        movaps nb010nf_ix(%rsp),%xmm4
        movaps nb010nf_iy(%rsp),%xmm5
        movaps nb010nf_iz(%rsp),%xmm6

## calc dr
        subps %xmm0,%xmm4
        subps %xmm1,%xmm5
        subps %xmm2,%xmm6

## square it
        mulps %xmm4,%xmm4
        mulps %xmm5,%xmm5
        mulps %xmm6,%xmm6
        addps %xmm5,%xmm4
        addps %xmm6,%xmm4
## rsq in xmm4

        rcpps %xmm4,%xmm5
## 1/x lookup seed in xmm5
        movaps nb010nf_two(%rsp),%xmm0
        mulps %xmm5,%xmm4
        subps %xmm4,%xmm0
        mulps %xmm5,%xmm0       ## xmm0=rinvsq
        movaps %xmm0,%xmm4

        movaps %xmm0,%xmm1
        mulps %xmm0,%xmm1
        mulps %xmm0,%xmm1       ## xmm1=rinvsix
        movaps %xmm1,%xmm2
        mulps %xmm2,%xmm2       ## xmm2=rinvtwelve

        mulps nb010nf_c6(%rsp),%xmm1
        mulps nb010nf_c12(%rsp),%xmm2
        movaps %xmm2,%xmm5
        subps %xmm1,%xmm5       ## Vvdw=Vvdw12-Vvdw6
        addss nb010nf_Vvdwtot(%rsp),%xmm5
        movss %xmm5,nb010nf_Vvdwtot(%rsp)

_nb_kernel010nf_x86_64_sse.nb010nf_updateouterdata:
## get n from stack
        movl nb010nf_n(%rsp),%esi
## get group index for i particle
        movq nb010nf_gid(%rbp),%rdx        ## base of gid[]
        movl (%rdx,%rsi,4),%edx            ## ggid=gid[n]

## accumulate total lj energy and update it
        movaps nb010nf_Vvdwtot(%rsp),%xmm7
## accumulate
        movhlps %xmm7,%xmm6
        addps %xmm6,%xmm7       ## pos 0-1 in xmm7 have the sum now
        movaps %xmm7,%xmm6
        shufps $1,%xmm6,%xmm6
        addss %xmm6,%xmm7

## add earlier value from mem
        movq nb010nf_Vvdw(%rbp),%rax
        addss (%rax,%rdx,4),%xmm7
## move back to mem
        movss %xmm7,(%rax,%rdx,4)

## finish if last
        movl nb010nf_nn1(%rsp),%ecx
## esi already loaded with n
        incl %esi
        subl %esi,%ecx
        jz _nb_kernel010nf_x86_64_sse.nb010nf_outerend

## not last, iterate outer loop once more!
        movl %esi,nb010nf_n(%rsp)
        jmp _nb_kernel010nf_x86_64_sse.nb010nf_outer
_nb_kernel010nf_x86_64_sse.nb010nf_outerend:
## check if more outer neighborlists remain
        movl nb010nf_nri(%rsp),%ecx
## esi already loaded with n above
        subl %esi,%ecx
        jz _nb_kernel010nf_x86_64_sse.nb010nf_end
## non-zero, do one more workunit
        jmp _nb_kernel010nf_x86_64_sse.nb010nf_threadloop
_nb_kernel010nf_x86_64_sse.nb010nf_end:

        movl nb010nf_nouter(%rsp),%eax
        movl nb010nf_ninner(%rsp),%ebx
        movq nb010nf_outeriter(%rbp),%rcx
        movq nb010nf_inneriter(%rbp),%rdx
        movl %eax,(%rcx)
        movl %ebx,(%rdx)

        addq $264,%rsp
        emms

        pop %rbx
        pop %rbp
        ret