src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel130_x86_64_sse_intel_syntax.s

   1 ;#
   2 ;#
   3 ;# Gromacs 4.0                         Copyright (c) 1991-2003
   4 ;# David van der Spoel, Erik Lindahl
   5 ;#
   6 ;# This program is free software; you can redistribute it and/or
   7 ;# modify it under the terms of the GNU General Public License
   8 ;# as published by the Free Software Foundation; either version 2
   9 ;# of the License, or (at your option) any later version.
  10 ;#
  11 ;# To help us fund GROMACS development, we humbly ask that you cite
  12 ;# the research papers on the package. Check out http://www.gromacs.org
  13 ;#
  14 ;# And Hey:
  15 ;# Gnomes, ROck Monsters And Chili Sauce
  16 ;#
  17
  18 ;# These files require GNU binutils 2.10 or later, since we
  19 ;# use intel syntax for portability, or a recent version
  20 ;# of NASM that understands Extended 3DNow and SSE2 instructions.
  21 ;# (NASM is normally only used with MS Visual C++).
  22 ;# Since NASM and gnu as disagree on some definitions and use
  23 ;# completely different preprocessing options I have to introduce a
  24 ;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
  25 ;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
  26 ;# reason why all comments need both symbols...
  27 ;# The source is written for GNU as, with intel syntax. When you use
  28 ;# NASM we redefine a couple of things. The false if-statement around
  29 ;# the following code is seen by GNU as, but NASM doesn't see it, so
  30 ;# the code inside is read by NASM but not gcc.
  31
  32 ; .if 0    # block below only read by NASM
  33 %define .section        section
  34 %define .long           dd
  35 %define .align          align
  36 %define .globl          global
  37 ;# NASM only wants 'dword', not 'dword ptr'.
  38 %define ptr
  39 %macro .equiv                  2
  40    %1 equ %2
  41 %endmacro
  42 ; .endif                   # End of NASM-specific block
  43 ; .intel_syntax noprefix   # Line only read by gnu as
  44
  45 .section .text
  46
  47
  48
  49 .globl nb_kernel130_x86_64_sse
  50 .globl _nb_kernel130_x86_64_sse
  51 nb_kernel130_x86_64_sse:
  52 _nb_kernel130_x86_64_sse:
  53 ;#      Room for return address and rbp (16 bytes)
  54 .equiv          nb130_fshift,           16
  55 .equiv          nb130_gid,              24
  56 .equiv          nb130_pos,              32
  57 .equiv          nb130_faction,          40
  58 .equiv          nb130_charge,           48
  59 .equiv          nb130_p_facel,          56
  60 .equiv          nb130_argkrf,           64
  61 .equiv          nb130_argcrf,           72
  62 .equiv          nb130_Vc,               80
  63 .equiv          nb130_type,             88
  64 .equiv          nb130_p_ntype,          96
  65 .equiv          nb130_vdwparam,         104
  66 .equiv          nb130_Vvdw,             112
  67 .equiv          nb130_p_tabscale,       120
  68 .equiv          nb130_VFtab,            128
  69 .equiv          nb130_invsqrta,         136
  70 .equiv          nb130_dvda,             144
  71 .equiv          nb130_p_gbtabscale,     152
  72 .equiv          nb130_GBtab,            160
  73 .equiv          nb130_p_nthreads,       168
  74 .equiv          nb130_count,            176
  75 .equiv          nb130_mtx,              184
  76 .equiv          nb130_outeriter,        192
  77 .equiv          nb130_inneriter,        200
  78 .equiv          nb130_work,             208
  79         ;# stack offsets for local variables
  80         ;# bottom of stack is cache-aligned for sse use
  81 .equiv          nb130_ix,               0
  82 .equiv          nb130_iy,               16
  83 .equiv          nb130_iz,               32
  84 .equiv          nb130_iq,               48
  85 .equiv          nb130_dx,               64
  86 .equiv          nb130_dy,               80
  87 .equiv          nb130_dz,               96
  88 .equiv          nb130_c6,               112
  89 .equiv          nb130_c12,              128
  90 .equiv          nb130_tsc,              144
  91 .equiv          nb130_qq,               160
  92 .equiv          nb130_vctot,            176
  93 .equiv          nb130_Vvdwtot,          192
  94 .equiv          nb130_fix,              208
  95 .equiv          nb130_fiy,              224
  96 .equiv          nb130_fiz,              240
  97 .equiv          nb130_half,             256
  98 .equiv          nb130_three,            272
  99 .equiv          nb130_two,              288
 100 .equiv          nb130_nri,              336
 101 .equiv          nb130_iinr,             344
 102 .equiv          nb130_jindex,           352
 103 .equiv          nb130_jjnr,             360
 104 .equiv          nb130_shift,            368
 105 .equiv          nb130_shiftvec,         376
 106 .equiv          nb130_facel,            384
 107 .equiv          nb130_innerjjnr,        392
 108 .equiv          nb130_is3,              400
 109 .equiv          nb130_ii3,              404
 110 .equiv          nb130_ntia,             408
 111 .equiv          nb130_innerk,           412
 112 .equiv          nb130_n,                416
 113 .equiv          nb130_nn1,              420
 114 .equiv          nb130_ntype,            424
 115 .equiv          nb130_nouter,           428
 116 .equiv          nb130_ninner,           432
 117
 118         push rbp
 119         mov  rbp, rsp
 120
 121     ;# Push integer registers on stack
 122         push rbx
 123     push rsi
 124     push rdi
 125     push r12
 126     push r13
 127     push r14
 128     push r15
 129
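 130     ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes), plus 8 bytes of padding so the movaps saves below are 16-byte aligned (assuming the usual rsp%16==8 at entry)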
 131     sub rsp, 168
 132
 133     ;# Save xmm registers to stack
 134     movaps [rsp      ], xmm6
 135     movaps [rsp + 16 ], xmm7
 136     movaps [rsp + 32 ], xmm8
 137     movaps [rsp + 48 ], xmm9
 138     movaps [rsp + 64 ], xmm10
 139     movaps [rsp + 80 ], xmm11
 140     movaps [rsp + 96 ], xmm12
 141     movaps [rsp + 112], xmm13
 142     movaps [rsp + 128], xmm14
 143     movaps [rsp + 144], xmm15
 144
 145         emms
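 146         sub rsp, 432            ;# local variable stack space (27*16 bytes). NOTE(review): nb130_ninner (offset 432) falls just past this area, onto the xmm6 save slot - confirm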
 147 ; .if 0    # block below only read by NASM - special calling convention on win64
 148 %ifidn __OUTPUT_FORMAT__, win64
 149     ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
 150     add rbp, 48
 151     ;# Adjust stack pointer for different alignment
 152     ;# Move around arguments to fit AMD64 convention below
 153     ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
 154     ;# win64 passes args in: rcx,rdx,r8,r9         + stack
 155     mov rdi, rcx
 156     mov rsi, rdx
 157     mov rdx, r8
 158     mov rcx, r9
 159     mov r8,  [rbp]
 160     mov r9,  [rbp + 8]
 161 %endif
 162 ; .endif   # end NASM- and win64-specific block
 163
 164         ;# zero 32-bit iteration counters
 165         mov eax, 0
 166         mov [rsp + nb130_nouter], eax
 167         mov [rsp + nb130_ninner], eax
 168
 169         mov edi, [rdi]
 170         mov [rsp + nb130_nri], edi
 171         mov [rsp + nb130_iinr], rsi
 172         mov [rsp + nb130_jindex], rdx
 173         mov [rsp + nb130_jjnr], rcx
 174         mov [rsp + nb130_shift], r8
 175         mov [rsp + nb130_shiftvec], r9
 176         mov rdi, [rbp + nb130_p_ntype]
 177         mov edi, [rdi]
 178         mov [rsp + nb130_ntype], edi
 179         mov rsi, [rbp + nb130_p_facel]
 180         movss xmm0, [rsi]
 181         movss [rsp + nb130_facel], xmm0
 182
 183         mov rax, [rbp + nb130_p_tabscale]
 184         movss xmm3, [rax]
 185         shufps xmm3, xmm3, 0
 186         movaps [rsp + nb130_tsc], xmm3
 187
 188         ;# create constant floating-point factors on stack
 189         mov eax, 0x3f000000     ;# half in IEEE (hex)
 190         mov [rsp + nb130_half], eax
 191         movss xmm1, [rsp + nb130_half]
 192         shufps xmm1, xmm1, 0    ;# splat to all elements
 193         movaps xmm2, xmm1
 194         addps  xmm2, xmm2       ;# one
 195         movaps xmm3, xmm2
 196         addps  xmm2, xmm2       ;# two
 197         addps  xmm3, xmm2       ;# three
 198         movaps [rsp + nb130_half],  xmm1
 199         movaps [rsp + nb130_two],  xmm2
 200         movaps [rsp + nb130_three],  xmm3
 201
 202 .nb130_threadloop:
 203         mov   rsi, [rbp + nb130_count]          ;# pointer to sync counter
 204         mov   eax, [rsi]
 205 .nb130_spinlock:
 206         mov   ebx, eax                          ;# ebx=*count=nn0
 207         add   ebx, 1                           ;# ebx=nn1=nn0+1
 208         lock
 209         cmpxchg [rsi], ebx                      ;# write nn1 to *counter,
 210                                                 ;# if it hasnt changed.
 211                                                 ;# or reread *counter to eax.
 212         pause                                   ;# -> better p4 performance
 213         jnz .nb130_spinlock
 214
 215         ;# if(nn1>nri) nn1=nri
 216         mov ecx, [rsp + nb130_nri]
 217         mov edx, ecx
 218         sub ecx, ebx
 219         cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
 220         ;# Cleared the spinlock if we got here.
 221         ;# eax contains nn0, ebx contains nn1.
 222         mov [rsp + nb130_n], eax
 223         mov [rsp + nb130_nn1], ebx
 224         sub ebx, eax                            ;# calc number of outer lists
 225         mov esi, eax                            ;# copy n to esi
 226         jg  .nb130_outerstart
 227         jmp .nb130_end
 228
 229 .nb130_outerstart:
 230         ;# ebx contains number of outer iterations
 231         add ebx, [rsp + nb130_nouter]
 232         mov [rsp + nb130_nouter], ebx
 233
 234 .nb130_outer:
 235         mov   rax, [rsp + nb130_shift]      ;# eax = pointer into shift[]
 236         mov   ebx, [rax + rsi*4]                ;# ebx=shift[n]
 237
 238         lea   rbx, [rbx + rbx*2]    ;# rbx=3*is
 239         mov   [rsp + nb130_is3],ebx     ;# store is3
 240
 241         mov   rax, [rsp + nb130_shiftvec]   ;# eax = base of shiftvec[]
 242
 243         movss xmm0, [rax + rbx*4]
 244         movss xmm1, [rax + rbx*4 + 4]
 245         movss xmm2, [rax + rbx*4 + 8]
 246
 247         mov   rcx, [rsp + nb130_iinr]       ;# ecx = pointer into iinr[]
 248         mov   ebx, [rcx + rsi*4]            ;# ebx =ii
 249
 250         mov   rdx, [rbp + nb130_charge]
 251         movss xmm3, [rdx + rbx*4]
 252         mulss xmm3, [rsp + nb130_facel]
 253         shufps xmm3, xmm3, 0
 254
 255         mov   rdx, [rbp + nb130_type]
 256         mov   edx, [rdx + rbx*4]
 257         imul  edx, [rsp + nb130_ntype]
 258         shl   edx, 1
 259         mov   [rsp + nb130_ntia], edx
 260
 261         lea   rbx, [rbx + rbx*2]        ;# rbx = 3*ii=ii3
 262         mov   rax, [rbp + nb130_pos]    ;# eax = base of pos[]
 263
 264         addss xmm0, [rax + rbx*4]
 265         addss xmm1, [rax + rbx*4 + 4]
 266         addss xmm2, [rax + rbx*4 + 8]
 267
 268         movaps [rsp + nb130_iq], xmm3
 269
 270         shufps xmm0, xmm0, 0
 271         shufps xmm1, xmm1, 0
 272         shufps xmm2, xmm2, 0
 273
 274         movaps [rsp + nb130_ix], xmm0
 275         movaps [rsp + nb130_iy], xmm1
 276         movaps [rsp + nb130_iz], xmm2
 277
 278         mov   [rsp + nb130_ii3], ebx
 279
 280         ;# clear vctot and i forces
 281         xorps xmm4, xmm4
 282         movaps [rsp + nb130_vctot], xmm4
 283         movaps [rsp + nb130_Vvdwtot], xmm4
 284         movaps [rsp + nb130_fix], xmm4
 285         movaps [rsp + nb130_fiy], xmm4
 286         movaps [rsp + nb130_fiz], xmm4
 287
 288         mov   rax, [rsp + nb130_jindex]
 289         mov   ecx, [rax + rsi*4]             ;# jindex[n]
 290         mov   edx, [rax + rsi*4 + 4]         ;# jindex[n+1]
 291         sub   edx, ecx               ;# number of innerloop atoms
 292
 293         mov   rax, [rsp + nb130_jjnr]
 294         shl   ecx, 2
 295         add   rax, rcx
 296         mov   [rsp + nb130_innerjjnr], rax     ;# pointer to jjnr[nj0]
 297         mov   ecx, edx
 298         sub   edx,  4
 299         add   ecx, [rsp + nb130_ninner]
 300         mov   [rsp + nb130_ninner], ecx
 301         add   edx, 0
 302         mov   [rsp + nb130_innerk], edx    ;# number of innerloop atoms
 303         jge   .nb130_unroll_loop
 304         jmp   .nb130_finish_inner
 305 .nb130_unroll_loop:
 306         ;# quad-unroll innerloop here
 307         mov   rdx, [rsp + nb130_innerjjnr]     ;# pointer to jjnr[k]
 308         mov   eax, [rdx]
 309         mov   ebx, [rdx + 4]
 310         mov   ecx, [rdx + 8]
 311         mov   edx, [rdx + 12]         ;# eax-edx=jnr1-4
 312         add qword ptr [rsp + nb130_innerjjnr],  16 ;# advance pointer (unrolled 4)
 313
 314         mov rsi, [rbp + nb130_charge]
 315         movss xmm0, [rsi + rax*4]
 316         movss xmm1, [rsi + rcx*4]
 317         movss xmm2, [rsi + rbx*4]
 318         movss xmm3, [rsi + rdx*4]
 319
 320     unpcklps xmm0, xmm1  ;# jqa jqc - -
 321     unpcklps xmm2, xmm3  ;# jqb jqd - -
 322     unpcklps xmm0, xmm2  ;# jqa jqb jqc jqd
 323         mulps xmm0, [rsp + nb130_iq]
 324     movaps [rsp + nb130_qq], xmm0
 325
 326     ;# vdw parameters
 327         mov rsi, [rbp + nb130_type]
 328         mov r12d, [rsi + rax*4]
 329         mov r13d, [rsi + rbx*4]
 330         mov r14d, [rsi + rcx*4]
 331         mov r15d, [rsi + rdx*4]
 332         shl r12d, 1
 333         shl r13d, 1
 334         shl r14d, 1
 335         shl r15d, 1
 336     mov edi, [rsp + nb130_ntia]
 337         add r12d, edi
 338         add r13d, edi
 339         add r14d, edi
 340         add r15d, edi
 341
 342         mov rsi, [rbp + nb130_vdwparam]
 343         movlps xmm3, [rsi + r12*4]
 344         movlps xmm7, [rsi + r14*4]
 345         movhps xmm3, [rsi + r13*4]
 346         movhps xmm7, [rsi + r15*4]
 347
 348         movaps xmm0, xmm3
 349         shufps xmm0, xmm7, 136  ;# 10001000
 350         shufps xmm3, xmm7, 221  ;# 11011101
 351
 352     movaps [rsp + nb130_c6], xmm0
 353     movaps [rsp + nb130_c12], xmm3
 354
 355         lea   rax, [rax + rax*2]     ;# replace jnr with j3
 356         lea   rbx, [rbx + rbx*2]
 357         lea   rcx, [rcx + rcx*2]
 358         lea   rdx, [rdx + rdx*2]
 359
 360         mov rdi, [rbp + nb130_pos]
 361         ;# load coordinates
 362         movlps xmm1, [rdi + rax*4]      ;# x1 y1 - -
 363         movlps xmm2, [rdi + rcx*4]      ;# x3 y3 - -
 364         movhps xmm1, [rdi + rbx*4]      ;# x2 y2 - -
 365         movhps xmm2, [rdi + rdx*4]      ;# x4 y4 - -
 366
 367         movss xmm5, [rdi + rax*4 + 8]   ;# z1 - - -
 368         movss xmm6, [rdi + rcx*4 + 8]   ;# z3 - - -
 369         movss xmm7, [rdi + rbx*4 + 8]   ;# z2 - - -
 370         movss xmm8, [rdi + rdx*4 + 8]   ;# z4 - - -
 371     movlhps xmm5, xmm7 ;# jzOa  -  jzOb  -
 372     movlhps xmm6, xmm8 ;# jzOc  -  jzOd -
 373
 374     movaps xmm4, xmm1
 375     unpcklps xmm1, xmm2  ;# jxa jxc jya jyc
 376     unpckhps xmm4, xmm2  ;# jxb jxd jyb jyd
 377     movaps xmm2, xmm1
 378     unpcklps xmm1, xmm4 ;# x
 379     unpckhps xmm2, xmm4 ;# y
 380     shufps   xmm5, xmm6,  136  ;# 10001000 => jza jzb jzc jzd
 381
 382         ;# calc dr
 383         subps xmm1, [rsp + nb130_ix]
 384         subps xmm2, [rsp + nb130_iy]
 385         subps xmm5, [rsp + nb130_iz]
 386
 387         ;# store dr
 388     movaps [rsp + nb130_dx], xmm1
 389     movaps [rsp + nb130_dy], xmm2
 390     movaps [rsp + nb130_dz], xmm5
 391
 392         ;# square it
 393         mulps xmm1,xmm1
 394         mulps xmm2,xmm2
 395         mulps xmm5,xmm5
 396         addps xmm1, xmm2
 397         addps xmm1, xmm5
 398
 399         ;# rsq in xmm1
 400
 401     ;# calculate rinv=1/sqrt(rsq)
 402         rsqrtps xmm5, xmm1
 403         movaps xmm2, xmm5
 404         mulps xmm5, xmm5
 405         movaps xmm4, [rsp + nb130_three]
 406         mulps xmm5, xmm1        ;# rsq*lu*lu
 407     subps xmm4, xmm5    ;# 3.0-rsq*lu*lu
 408         mulps xmm4, xmm2
 409         mulps xmm4, [rsp + nb130_half]
 410         movaps xmm2, xmm4
 411         mulps  xmm1, xmm4
 412     ;# xmm2=rinv
 413     ;# xmm1=r
 414
 415     mulps xmm1, [rsp + nb130_tsc] ;# rtab
 416
 417     ;# truncate and convert to integers
 418     cvttps2dq xmm5, xmm1
 419
 420     ;# convert back to float
 421     cvtdq2ps  xmm4, xmm5
 422
 423     ;# multiply by 8
 424     pslld   xmm5, 3
 425
 426     ;# calculate eps
 427     subps     xmm1, xmm4
 428
 429     ;# move to integer registers
 430     movhlps xmm6, xmm5
 431     movd    r8d, xmm5
 432     movd    r10d, xmm6
 433     pshufd  xmm5, xmm5, 1
 434     pshufd  xmm6, xmm6, 1
 435     movd    r9d, xmm5
 436     movd    r11d, xmm6
 437
 438     ;# xmm1=eps
 439     ;# xmm2=rinv
 440
 441         mov rsi, [rbp + nb130_VFtab]
 442     ;# calculate LJ table
 443     movlps xmm5, [rsi + r8*4]
 444         movlps xmm9, [rsi + r8*4 + 16]
 445
 446         movlps xmm7,  [rsi + r10*4]
 447         movlps xmm11, [rsi + r10*4 + 16]
 448
 449     movaps  xmm0, xmm2             ;# rinv
 450     mulps   xmm2, [rsp + nb130_qq] ;# vcoul=rinv*qq
 451     movaps  xmm3, xmm2             ;# copy of vcoul (to calc fscal)
 452     mulps   xmm3, xmm0             ;# vcoul*rinv
 453
 454         movhps xmm5, [rsi + r9*4]
 455         movhps xmm9, [rsi + r9*4 + 16]
 456
 457     addps   xmm2, [rsp + nb130_vctot]
 458     movaps  [rsp + nb130_vctot], xmm2
 459
 460         movhps xmm7,  [rsi + r11*4]
 461         movhps xmm11, [rsi + r11*4 + 16]
 462
 463     movaps xmm4, xmm5
 464     movaps xmm8, xmm9
 465         shufps xmm4, xmm7, 136  ;# 10001000
 466         shufps xmm8, xmm11, 136  ;# 10001000
 467         shufps xmm5, xmm7, 221  ;# 11011101
 468         shufps xmm9, xmm11, 221  ;# 11011101
 469
 470         movlps xmm7,  [rsi + r8*4 + 8]
 471         movlps xmm11, [rsi + r8*4 + 24]
 472
 473         movlps xmm13, [rsi + r10*4 + 8]
 474         movlps xmm14, [rsi + r10*4 + 24]
 475
 476         movhps xmm7,  [rsi + r9*4 + 8]
 477         movhps xmm11, [rsi + r9*4 + 24]
 478
 479         movhps xmm13, [rsi + r11*4 + 8]
 480         movhps xmm14, [rsi + r11*4 + 24]
 481
 482     movaps xmm6, xmm7
 483     movaps xmm10, xmm11
 484
 485         shufps xmm6, xmm13, 136  ;# 10001000
 486         shufps xmm10, xmm14, 136  ;# 10001000
 487         shufps xmm7, xmm13, 221  ;# 11011101
 488         shufps xmm11, xmm14, 221  ;# 11011101
 489     ;# dispersion table in xmm4-xmm7, repulsion table in xmm8-xmm11
 490
 491     mulps  xmm7, xmm1    ;# Heps
 492     mulps  xmm11, xmm1
 493     mulps  xmm6, xmm1   ;# Geps
 494     mulps  xmm10, xmm1
 495     mulps  xmm7, xmm1   ;# Heps2
 496     mulps  xmm11, xmm1
 497     addps  xmm5, xmm6  ;# F+Geps
 498     addps  xmm9, xmm10
 499     addps  xmm5, xmm7   ;# F+Geps+Heps2 = Fp
 500     addps  xmm9, xmm11
 501     addps  xmm7, xmm7    ;# 2*Heps2
 502     addps  xmm11, xmm11
 503     addps  xmm7, xmm6   ;# 2*Heps2+Geps
 504     addps  xmm11, xmm10
 505
 506     addps  xmm7, xmm5  ;# FF = Fp + 2*Heps2 + Geps
 507     addps  xmm11, xmm9
 508     mulps  xmm5, xmm1  ;# eps*Fp
 509     mulps  xmm9, xmm1
 510     movaps xmm12, [rsp + nb130_c6]
 511     movaps xmm13, [rsp + nb130_c12]
 512     addps  xmm5, xmm4 ;# VV
 513     addps  xmm9, xmm8
 514
 515     mulps  xmm5, xmm12  ;# VV*c6 = vnb6
 516     mulps  xmm9, xmm13  ;# VV*c12 = vnb12
 517     addps  xmm5, xmm9
 518     addps  xmm5, [rsp + nb130_Vvdwtot]
 519     movaps [rsp + nb130_Vvdwtot], xmm5
 520
 521     mulps  xmm7, xmm12   ;# FF*c6 = fnb6
 522     mulps  xmm11, xmm13   ;# FF*c12  = fnb12
 523     addps  xmm7, xmm11
 524
 525     mulps  xmm7, [rsp + nb130_tsc]
 526     subps  xmm3, xmm7
 527     mulps  xmm3, xmm0   ;# fscal
 528
 529     movaps xmm9, xmm3
 530     movaps xmm10, xmm3
 531     movaps xmm11, xmm3
 532
 533     movaps xmm12, [rsp + nb130_fix]
 534     movaps xmm13, [rsp + nb130_fiy]
 535     movaps xmm14, [rsp + nb130_fiz]
 536
 537     mulps  xmm9,  [rsp + nb130_dx]
 538     mulps  xmm10, [rsp + nb130_dy]
 539     mulps  xmm11, [rsp + nb130_dz]
 540
 541     ;# accumulate i forces
 542     addps xmm12, xmm9
 543     addps xmm13, xmm10
 544     addps xmm14, xmm11
 545     movaps [rsp + nb130_fix], xmm12
 546     movaps [rsp + nb130_fiy], xmm13
 547     movaps [rsp + nb130_fiz], xmm14
 548
 549         mov rsi, [rbp + nb130_faction]
 550         ;# the fj's - start by accumulating x & y forces from memory
 551         movlps xmm0, [rsi + rax*4] ;# x1 y1 - -
 552         movlps xmm1, [rsi + rcx*4] ;# x3 y3 - -
 553         movhps xmm0, [rsi + rbx*4] ;# x1 y1 x2 y2
 554         movhps xmm1, [rsi + rdx*4] ;# x3 y3 x4 y4
 555
 556     movaps xmm8, xmm9
 557     unpcklps xmm9, xmm10 ;# x1 y1 x2 y2
 558     unpckhps xmm8, xmm10 ;# x3 y3 x4 y4
 559
 560     ;# update fjx and fjy
 561         addps  xmm0, xmm9
 562         addps  xmm1, xmm8
 563
 564         movlps [rsi + rax*4], xmm0
 565         movlps [rsi + rcx*4], xmm1
 566         movhps [rsi + rbx*4], xmm0
 567         movhps [rsi + rdx*4], xmm1
 568
 569     ;# xmm11: fjz1 fjz2 fjz3 fjz4
 570     pshufd  xmm10, xmm11, 1  ;# fjz2 - - -
 571     movhlps xmm9,  xmm11     ;# fjz3 - - -
 572     pshufd  xmm8,  xmm11, 3  ;# fjz4 - - -
 573
 574         addss  xmm11, [rsi + rax*4 + 8]
 575         addss  xmm10, [rsi + rbx*4 + 8]
 576         addss  xmm9,  [rsi + rcx*4 + 8]
 577         addss  xmm8,  [rsi + rdx*4 + 8]
 578         movss  [rsi + rax*4 + 8], xmm11
 579         movss  [rsi + rbx*4 + 8], xmm10
 580         movss  [rsi + rcx*4 + 8], xmm9
 581         movss  [rsi + rdx*4 + 8], xmm8
 582
 583         ;# should we do one more iteration?
 584         sub dword ptr [rsp + nb130_innerk],  4
 585         jl    .nb130_finish_inner
 586         jmp   .nb130_unroll_loop
 587 .nb130_finish_inner:
 588         ;# check if at least two particles remain
 589         add dword ptr [rsp + nb130_innerk],  4
 590         mov   edx, [rsp + nb130_innerk]
 591         and   edx, 2
 592         jnz   .nb130_dopair
 593         jmp   .nb130_checksingle
 594 .nb130_dopair:
 595     mov   rcx, [rsp + nb130_innerjjnr]
 596
 597         mov   eax, [rcx]
 598         mov   ebx, [rcx + 4]
 599         add qword ptr [rsp + nb130_innerjjnr],  8
 600
 601         mov rsi, [rbp + nb130_charge]
 602         movss xmm0, [rsi + rax*4]
 603         movss xmm2, [rsi + rbx*4]
 604
 605     unpcklps xmm0, xmm2  ;# jqa jqb
 606         mulps xmm0, [rsp + nb130_iq]
 607     movaps [rsp + nb130_qq], xmm0
 608
 609         mov rsi, [rbp + nb130_type]
 610     ;# vdw parameters
 611         mov r12d, [rsi + rax*4]
 612         mov r13d, [rsi + rbx*4]
 613         shl r12d, 1
 614         shl r13d, 1
 615     mov edi, [rsp + nb130_ntia]
 616         add r12d, edi
 617         add r13d, edi
 618
 619         mov rsi, [rbp + nb130_vdwparam]
 620         movlps xmm3, [rsi + r12*4]
 621         movhps xmm3, [rsi + r13*4]
 622
 623     xorps  xmm7, xmm7
 624         movaps xmm0, xmm3
 625         shufps xmm0, xmm7, 136  ;# 10001000
 626         shufps xmm3, xmm7, 221  ;# 11011101
 627
 628     movaps [rsp + nb130_c6], xmm0
 629     movaps [rsp + nb130_c12], xmm3
 630
 631         lea   rax, [rax + rax*2]     ;# replace jnr with j3
 632         lea   rbx, [rbx + rbx*2]
 633
 634         ;# load coordinates
 635         mov rdi, [rbp + nb130_pos]
 636
 637         movlps xmm1, [rdi + rax*4]      ;# x1 y1 - -
 638         movlps xmm2, [rdi + rbx*4]      ;# x2 y2 - -
 639
 640         movss xmm5, [rdi + rax*4 + 8]   ;# z1 - - -
 641         movss xmm6, [rdi + rbx*4 + 8]   ;# z2 - - -
 642
 643     unpcklps xmm1, xmm2 ;# x1 x2 y1 y2
 644     movhlps  xmm2, xmm1 ;# y1 y2 -  -
 645     unpcklps xmm5, xmm6 ;# z1 z2 -  -
 646
 647         ;# calc dr
 648         subps xmm1, [rsp + nb130_ix]
 649         subps xmm2, [rsp + nb130_iy]
 650         subps xmm5, [rsp + nb130_iz]
 651
 652         ;# store dr
 653     movaps [rsp + nb130_dx], xmm1
 654     movaps [rsp + nb130_dy], xmm2
 655     movaps [rsp + nb130_dz], xmm5
 656
 657         ;# square it
 658         mulps xmm1,xmm1
 659         mulps xmm2,xmm2
 660         mulps xmm5,xmm5
 661         addps xmm1, xmm2
 662         addps xmm1, xmm5
 663
 664         ;# rsq in xmm1
 665
 666     ;# calculate rinv=1/sqrt(rsq)
 667         rsqrtps xmm5, xmm1
 668         movaps xmm2, xmm5
 669         mulps xmm5, xmm5
 670         movaps xmm4, [rsp + nb130_three]
 671         mulps xmm5, xmm1        ;# rsq*lu*lu
 672     subps xmm4, xmm5    ;# 3.0-rsq*lu*lu
 673         mulps xmm4, xmm2
 674         mulps xmm4, [rsp + nb130_half]
 675         movaps xmm2, xmm4
 676         mulps  xmm1, xmm4
 677     ;# xmm2=rinv
 678     ;# xmm1=r
 679
 680     mulps xmm1, [rsp + nb130_tsc] ;# rtab
 681
 682     ;# truncate and convert to integers
 683     cvttps2dq xmm5, xmm1
 684
 685     ;# convert back to float
 686     cvtdq2ps  xmm4, xmm5
 687
 688     ;# multiply by 8
 689     pslld   xmm5, 3
 690
 691     ;# calculate eps
 692     subps     xmm1, xmm4
 693
 694     ;# move to integer registers
 695     movd    r8d, xmm5
 696     pshufd  xmm5, xmm5, 1
 697     movd    r9d, xmm5
 698
 699     ;# xmm1=eps
 700     ;# xmm2=rinv
 701
 702         mov rsi, [rbp + nb130_VFtab]
 703     ;# calculate LJ table
 704     movlps xmm4, [rsi + r8*4]
 705         movlps xmm5, [rsi + r9*4]
 706
 707     unpcklps xmm4, xmm5
 708     movhlps  xmm5, xmm4
 709
 710     movlps xmm6, [rsi + r8*4 + 8]
 711         movlps xmm7, [rsi + r9*4 + 8]
 712
 713     movaps  xmm0, xmm2             ;# rinv
 714     mulps   xmm2, [rsp + nb130_qq] ;# vcoul=rinv*qq
 715     movaps  xmm3, xmm2             ;# copy of vcoul (to calc fscal)
 716     mulps   xmm3, xmm0             ;# vcoul*rinv
 717
 718     unpcklps xmm6, xmm7
 719     movhlps  xmm7, xmm6
 720
 721     movlps xmm8, [rsi + r8*4 + 16]
 722         movlps xmm9, [rsi + r9*4 + 16]
 723
 724     unpcklps xmm8, xmm9
 725     movhlps  xmm9, xmm8
 726
 727     addps   xmm2, [rsp + nb130_vctot]
 728     movlps  [rsp + nb130_vctot], xmm2
 729
 730     movlps xmm10, [rsi + r8*4 + 24]
 731         movlps xmm11, [rsi + r9*4 + 24]
 732
 733     unpcklps xmm10, xmm11
 734     movhlps  xmm11, xmm10
 735     ;# dispersion table in xmm4-xmm7, repulsion table in xmm8-xmm11
 736
 737     mulps  xmm7, xmm1    ;# Heps
 738     mulps  xmm11, xmm1
 739     mulps  xmm6, xmm1   ;# Geps
 740     mulps  xmm10, xmm1
 741     mulps  xmm7, xmm1   ;# Heps2
 742     mulps  xmm11, xmm1
 743     addps  xmm5, xmm6  ;# F+Geps
 744     addps  xmm9, xmm10
 745     addps  xmm5, xmm7   ;# F+Geps+Heps2 = Fp
 746     addps  xmm9, xmm11
 747     addps  xmm7, xmm7    ;# 2*Heps2
 748     addps  xmm11, xmm11
 749     addps  xmm7, xmm6   ;# 2*Heps2+Geps
 750     addps  xmm11, xmm10
 751
 752     addps  xmm7, xmm5  ;# FF = Fp + 2*Heps2 + Geps
 753     addps  xmm11, xmm9
 754     mulps  xmm5, xmm1  ;# eps*Fp
 755     mulps  xmm9, xmm1
 756     movaps xmm12, [rsp + nb130_c6]
 757     movaps xmm13, [rsp + nb130_c12]
 758     addps  xmm5, xmm4 ;# VV
 759     addps  xmm9, xmm8
 760
 761     mulps  xmm5, xmm12  ;# VV*c6 = vnb6
 762     mulps  xmm9, xmm13  ;# VV*c12 = vnb12
 763     addps  xmm5, xmm9
 764     addps  xmm5, [rsp + nb130_Vvdwtot]
 765     movlps [rsp + nb130_Vvdwtot], xmm5
 766
 767     mulps  xmm7, xmm12   ;# FF*c6 = fnb6
 768     mulps  xmm11, xmm13   ;# FF*c12  = fnb12
 769     addps  xmm7, xmm11
 770
 771     mulps  xmm7, [rsp + nb130_tsc]
 772     subps  xmm3, xmm7
 773     mulps  xmm3, xmm0   ;# fscal
 774
 775     movaps xmm9, xmm3
 776     movaps xmm10, xmm3
 777     movaps xmm11, xmm3
 778
 779     xorps  xmm8, xmm8
 780
 781     movaps xmm12, [rsp + nb130_fix]
 782     movaps xmm13, [rsp + nb130_fiy]
 783     movaps xmm14, [rsp + nb130_fiz]
 784
 785     mulps  xmm9,  [rsp + nb130_dx]
 786     mulps  xmm10, [rsp + nb130_dy]
 787     mulps  xmm11, [rsp + nb130_dz]
 788
 789     movlhps xmm9, xmm8
 790     movlhps xmm10, xmm8
 791     movlhps xmm11, xmm8
 792
 793     ;# accumulate i forces
 794     addps xmm12, xmm9
 795     addps xmm13, xmm10
 796     addps xmm14, xmm11
 797     movaps [rsp + nb130_fix], xmm12
 798     movaps [rsp + nb130_fiy], xmm13
 799     movaps [rsp + nb130_fiz], xmm14
 800
 801         mov rsi, [rbp + nb130_faction]
 802         ;# the fj's - start by accumulating x & y forces from memory
 803         movlps xmm0, [rsi + rax*4] ;# x1 y1 - -
 804         movhps xmm0, [rsi + rbx*4] ;# x1 y1 x2 y2
 805
 806     unpcklps xmm9, xmm10  ;# x1 y1 x2 y2
 807     addps    xmm0, xmm9
 808
 809         movlps [rsi + rax*4], xmm0
 810         movhps [rsi + rbx*4], xmm0
 811
 812     ;# z forces
 813     pshufd xmm8, xmm11, 1
 814     addss  xmm11, [rsi + rax*4 + 8]
 815     addss  xmm8,  [rsi + rbx*4 + 8]
 816     movss  [rsi + rax*4 + 8], xmm11
 817     movss  [rsi + rbx*4 + 8], xmm8
 818
 819 .nb130_checksingle:
 820         mov   edx, [rsp + nb130_innerk]
 821         and   edx, 1
 822         jnz    .nb130_dosingle
 823         jmp    .nb130_updateouterdata
 824 .nb130_dosingle:
 825         mov rdi, [rbp + nb130_pos]
 826         mov   rcx, [rsp + nb130_innerjjnr]
 827         mov   eax, [rcx]
 828
 829         mov rsi, [rbp + nb130_charge]
 830         movss xmm0, [rsi + rax*4]
 831
 832         mulss xmm0, [rsp + nb130_iq]
 833     movaps [rsp + nb130_qq], xmm0
 834
 835         mov rsi, [rbp + nb130_type]
 836     ;# vdw parameters
 837         mov r12d, [rsi + rax*4]
 838         shl r12d, 1
 839     mov edi, [rsp + nb130_ntia]
 840         add r12d, edi
 841
 842         mov rsi, [rbp + nb130_vdwparam]
 843         movss xmm0, [rsi + r12*4]
 844         movss xmm3, [rsi + r12*4 + 4]
 845
 846     movaps [rsp + nb130_c6], xmm0
 847     movaps [rsp + nb130_c12], xmm3
 848
 849         lea   rax, [rax + rax*2]     ;# replace jnr with j3
 850
 851         mov rdi, [rbp + nb130_pos]
 852         ;# load coordinates
 853         movss xmm1, [rdi + rax*4]
 854         movss xmm2, [rdi + rax*4 + 4]
 855         movss xmm5, [rdi + rax*4 + 8]
 856
 857         ;# calc dr
 858         subss xmm1, [rsp + nb130_ix]
 859         subss xmm2, [rsp + nb130_iy]
 860         subss xmm5, [rsp + nb130_iz]
 861
 862         ;# store dr
 863     movaps [rsp + nb130_dx], xmm1
 864     movaps [rsp + nb130_dy], xmm2
 865     movaps [rsp + nb130_dz], xmm5
 866
 867         ;# square it
 868         mulss xmm1,xmm1
 869         mulss xmm2,xmm2
 870         mulss xmm5,xmm5
 871         addss xmm1, xmm2
 872         addss xmm1, xmm5
 873
 874         ;# rsq in xmm1
 875
 876     ;# calculate rinv=1/sqrt(rsq)
 877         rsqrtss xmm5, xmm1
 878         movaps xmm2, xmm5
 879         mulss xmm5, xmm5
 880         movaps xmm4, [rsp + nb130_three]
 881         mulss xmm5, xmm1        ;# rsq*lu*lu
 882     subss xmm4, xmm5    ;# 3.0-rsq*lu*lu
 883         mulss xmm4, xmm2
 884         mulss xmm4, [rsp + nb130_half]
 885         movaps xmm2, xmm4
 886         mulss  xmm1, xmm4
 887     ;# xmm2=rinv
 888     ;# xmm1=r
 889
 890     mulss xmm1, [rsp + nb130_tsc] ;# rtab
 891
 892     ;# truncate and convert to integers
 893     cvttss2si r8d, xmm1
 894
 895     ;# convert back to float
 896     cvtsi2ss  xmm4, r8d
 897
 898     ;# multiply by 8
 899     shl      r8d, 3
 900
 901     ;# calculate eps
 902     subss     xmm1, xmm4
 903
 904     ;# xmm1=eps
 905     ;# xmm2=rinv
 906
 907         mov rsi, [rbp + nb130_VFtab]
 908     ;# calculate LJ table
 909     movss xmm4, [rsi + r8*4]
 910         movss xmm5, [rsi + r8*4 + 4]
 911     movss xmm6, [rsi + r8*4 + 8]
 912         movss xmm7, [rsi + r8*4 + 12]
 913     movss xmm8, [rsi + r8*4 + 16]
 914         movss xmm9, [rsi + r8*4 + 20]
 915     movss xmm10, [rsi + r8*4 + 24]
 916         movss xmm11, [rsi + r8*4 + 28]
 917     ;# dispersion table in xmm4-xmm7, repulsion table in xmm8-xmm11
 918
 919     ;# coulomb interaction
 920     movaps  xmm0, xmm2             ;# rinv
 921     mulss   xmm2, [rsp + nb130_qq] ;# vcoul=rinv*qq
 922     movaps  xmm3, xmm2             ;# copy of vcoul (to calc fscal)
 923     mulss   xmm3, xmm0             ;# vcoul*rinv
 924
 925     addss   xmm2, [rsp + nb130_vctot]
 926     movss   [rsp + nb130_vctot], xmm2
 927
 928     ;# calculate table interaction
 929     mulss  xmm7, xmm1    ;# Heps
 930     mulss  xmm11, xmm1
 931     mulss  xmm6, xmm1   ;# Geps
 932     mulss  xmm10, xmm1
 933     mulss  xmm7, xmm1   ;# Heps2
 934     mulss  xmm11, xmm1
 935     addss  xmm5, xmm6  ;# F+Geps
 936     addss  xmm9, xmm10
 937     addss  xmm5, xmm7   ;# F+Geps+Heps2 = Fp
 938     addss  xmm9, xmm11
 939     addss  xmm7, xmm7    ;# 2*Heps2
 940     addss  xmm11, xmm11
 941     addss  xmm7, xmm6   ;# 2*Heps2+Geps
 942     addss  xmm11, xmm10
 943
 944     addss  xmm7, xmm5  ;# FF = Fp + 2*Heps2 + Geps
 945     addss  xmm11, xmm9
 946     mulss  xmm5, xmm1  ;# eps*Fp
 947     mulss  xmm9, xmm1
 948     movaps xmm12, [rsp + nb130_c6]
 949     movaps xmm13, [rsp + nb130_c12]
 950     addss  xmm5, xmm4 ;# VV
 951     addss  xmm9, xmm8
 952
 953     mulss  xmm5, xmm12  ;# VV*c6 = vnb6
 954     mulss  xmm9, xmm13  ;# VV*c12 = vnb12
 955     addss  xmm5, xmm9
 956     addss  xmm5, [rsp + nb130_Vvdwtot]
 957     movss  [rsp + nb130_Vvdwtot], xmm5
 958
 959     mulss  xmm7, xmm12   ;# FF*c6 = fnb6
 960     mulss  xmm11, xmm13   ;# FF*c12  = fnb12
 961     addss  xmm7, xmm11
 962
 963     mulss  xmm7, [rsp + nb130_tsc]
 964     subss  xmm3, xmm7
 965     mulss  xmm3, xmm0   ;# fscal
 966
 967     movaps xmm9, xmm3
 968     movaps xmm10, xmm3
 969     movaps xmm11, xmm3
 970
 971     movaps xmm12, [rsp + nb130_fix]
 972     movaps xmm13, [rsp + nb130_fiy]
 973     movaps xmm14, [rsp + nb130_fiz]
 974
 975     mulss  xmm9,  [rsp + nb130_dx]
 976     mulss  xmm10, [rsp + nb130_dy]
 977     mulss  xmm11, [rsp + nb130_dz]
 978
 979     ;# accumulate i forces
 980     addss xmm12, xmm9
 981     addss xmm13, xmm10
 982     addss xmm14, xmm11
 983     movaps [rsp + nb130_fix], xmm12
 984     movaps [rsp + nb130_fiy], xmm13
 985     movaps [rsp + nb130_fiz], xmm14
 986
 987         mov rsi, [rbp + nb130_faction]
 988     ;# add to j forces
 989     addss  xmm9,  [rsi + rax*4]
 990     addss  xmm10, [rsi + rax*4 + 4]
 991     addss  xmm11, [rsi + rax*4 + 8]
 992     movss  [rsi + rax*4],     xmm9
 993     movss  [rsi + rax*4 + 4], xmm10
 994     movss  [rsi + rax*4 + 8], xmm11
 995
 996 .nb130_updateouterdata:        ;# write back the per-i-atom force and energy accumulators, then advance n
 997         mov   ecx, [rsp + nb130_ii3]        ;# ecx = ii3, index of the i atom in faction[]
 998         mov   rdi, [rbp + nb130_faction]    ;# rdi = base of faction[]
 999         mov   rsi, [rbp + nb130_fshift]     ;# rsi = base of fshift[]
1000         mov   edx, [rsp + nb130_is3]        ;# edx = is3, index into fshift[]
1001
1002         ;# horizontally sum the four lanes of the i force accumulators into xmm0, xmm1, xmm2
1003         movaps xmm0, [rsp + nb130_fix]
1004         movaps xmm1, [rsp + nb130_fiy]
1005         movaps xmm2, [rsp + nb130_fiz]
1006
1007         movhlps xmm3, xmm0
1008         movhlps xmm4, xmm1
1009         movhlps xmm5, xmm2
1010         addps  xmm0, xmm3
1011         addps  xmm1, xmm4
1012         addps  xmm2, xmm5 ;# lanes 0-1 of xmm0-xmm2 now hold the partial sums (lane0+lane2, lane1+lane3)
1013
1014         movaps xmm3, xmm0
1015         movaps xmm4, xmm1
1016         movaps xmm5, xmm2
1017
1018         shufps xmm3, xmm3, 1
1019         shufps xmm4, xmm4, 1
1020         shufps xmm5, xmm5, 1
1021         addss  xmm0, xmm3
1022         addss  xmm1, xmm4
1023         addss  xmm2, xmm5       ;# xmm0-xmm2 has single force in pos0
1024
1025         ;# update i force: faction[ii3 .. ii3+2] -= summed force (note subss, not addss)
1026         movss  xmm3, [rdi + rcx*4]
1027         movss  xmm4, [rdi + rcx*4 + 4]
1028         movss  xmm5, [rdi + rcx*4 + 8]
1029         subss  xmm3, xmm0
1030         subss  xmm4, xmm1
1031         subss  xmm5, xmm2
1032         movss  [rdi + rcx*4],     xmm3
1033         movss  [rdi + rcx*4 + 4], xmm4
1034         movss  [rdi + rcx*4 + 8], xmm5
1035
1036         ;# update shift force: fshift[is3 .. is3+2] -= the same summed force
1037         movss  xmm3, [rsi + rdx*4]
1038         movss  xmm4, [rsi + rdx*4 + 4]
1039         movss  xmm5, [rsi + rdx*4 + 8]
1040         subss  xmm3, xmm0
1041         subss  xmm4, xmm1
1042         subss  xmm5, xmm2
1043         movss  [rsi + rdx*4],     xmm3
1044         movss  [rsi + rdx*4 + 4], xmm4
1045         movss  [rsi + rdx*4 + 8], xmm5
1046
1047         ;# get n from stack
1048         mov esi, [rsp + nb130_n]
1049         ;# get group index for i particle
1050         mov   rdx, [rbp + nb130_gid]            ;# base of gid[]
1051         mov   edx, [rdx + rsi*4]                ;# ggid=gid[n]
1052
1053         ;# sum the four lanes of vctot and add the result to Vc[ggid]
1054         movaps xmm7, [rsp + nb130_vctot]
1055         ;# accumulate
1056         movhlps xmm6, xmm7
1057         addps  xmm7, xmm6       ;# pos 0-1 in xmm7 have the sum now
1058         movaps xmm6, xmm7
1059         shufps xmm6, xmm6, 1
1060         addss  xmm7, xmm6
1061
1062         ;# add earlier value from mem
1063         mov   rax, [rbp + nb130_Vc]
1064         addss xmm7, [rax + rdx*4]
1065         ;# move back to mem
1066         movss [rax + rdx*4], xmm7
1067
1068         ;# sum the four lanes of Vvdwtot and add the result to Vvdw[ggid]
1069         movaps xmm7, [rsp + nb130_Vvdwtot]
1070         ;# accumulate
1071         movhlps xmm6, xmm7
1072         addps  xmm7, xmm6       ;# pos 0-1 in xmm7 have the sum now
1073         movaps xmm6, xmm7
1074         shufps xmm6, xmm6, 1
1075         addss  xmm7, xmm6
1076
1077         ;# add earlier value from mem
1078         mov   rax, [rbp + nb130_Vvdw]
1079         addss xmm7, [rax + rdx*4]
1080         ;# move back to mem
1081         movss [rax + rdx*4], xmm7
1082
1083         ;# finish if last
1084         mov ecx, [rsp + nb130_nn1]
1085         ;# esi already loaded with n
1086         inc esi
1087         sub ecx, esi            ;# ecx = nn1 - (n+1)
1088         jz .nb130_outerend      ;# zero: this work unit is finished
1089
1090         ;# not last, iterate outer loop once more!
1091         mov [rsp + nb130_n], esi
1092         jmp .nb130_outer
1093 .nb130_outerend:
1094         ;# check if more outer neighborlists remain
1095         mov   ecx, [rsp + nb130_nri]
1096         ;# esi already loaded with n above
1097         sub   ecx, esi          ;# ecx = nri - n
1098         jz .nb130_end           ;# zero: every outer list has been processed
1099         ;# non-zero, do one more workunit
1100         jmp   .nb130_threadloop
1101 .nb130_end:        ;# report the iteration counters to the caller, then unwind the stack frame and return
1102         mov eax, [rsp + nb130_nouter]
1103         mov ebx, [rsp + nb130_ninner]
1104         mov rcx, [rbp + nb130_outeriter]
1105         mov rdx, [rbp + nb130_inneriter]
1106         mov [rcx], eax          ;# *outeriter = nouter
1107         mov [rdx], ebx          ;# *inneriter = ninner
1108
1109         add rsp, 432            ;# presumably releases the local variable area reserved in the prologue (not visible here) - confirm
1110         emms
1111
1112     ;# Restore xmm6-xmm15 from their save area on the stack
1113     movaps xmm6,  [rsp      ]
1114     movaps xmm7,  [rsp + 16 ]
1115     movaps xmm8,  [rsp + 32 ]
1116     movaps xmm9,  [rsp + 48 ]
1117     movaps xmm10, [rsp + 64 ]
1118     movaps xmm11, [rsp + 80 ]
1119     movaps xmm12, [rsp + 96 ]
1120     movaps xmm13, [rsp + 112]
1121     movaps xmm14, [rsp + 128]
1122     movaps xmm15, [rsp + 144]
1123
1124     ;# Release the xmm6-xmm15 save area (10*16 = 160 bytes plus 8 bytes of padding)
1125     add rsp, 168
1126
1127     pop r15
1128     pop r14
1129     pop r13
1130     pop r12
1131     pop rdi
1132     pop rsi
1133     pop rbx
1134
1135         pop     rbp
1136         ret
1137
1138
1139
1140
1141
1142
1143 .globl nb_kernel130nf_x86_64_sse
1144 .globl _nb_kernel130nf_x86_64_sse
1145 nb_kernel130nf_x86_64_sse:
1146 _nb_kernel130nf_x86_64_sse:
1147 ;#      Stack-passed arguments, addressed via rbp; offsets start at 16 to skip the saved rbp and the return address
1148 .equiv          nb130nf_fshift,         16
1149 .equiv          nb130nf_gid,            24
1150 .equiv          nb130nf_pos,            32
1151 .equiv          nb130nf_faction,        40
1152 .equiv          nb130nf_charge,         48
1153 .equiv          nb130nf_p_facel,        56
1154 .equiv          nb130nf_argkrf,         64
1155 .equiv          nb130nf_argcrf,         72
1156 .equiv          nb130nf_Vc,             80
1157 .equiv          nb130nf_type,           88
1158 .equiv          nb130nf_p_ntype,        96
1159 .equiv          nb130nf_vdwparam,       104
1160 .equiv          nb130nf_Vvdw,           112
1161 .equiv          nb130nf_p_tabscale,     120
1162 .equiv          nb130nf_VFtab,          128
1163 .equiv          nb130nf_invsqrta,       136
1164 .equiv          nb130nf_dvda,           144
1165 .equiv          nb130nf_p_gbtabscale,   152
1166 .equiv          nb130nf_GBtab,          160
1167 .equiv          nb130nf_p_nthreads,     168
1168 .equiv          nb130nf_count,          176
1169 .equiv          nb130nf_mtx,            184
1170 .equiv          nb130nf_outeriter,      192
1171 .equiv          nb130nf_inneriter,      200
1172 .equiv          nb130nf_work,           208
1173         ;# stack offsets for local variables (relative to rsp after the prologue)
1174         ;# rsp is expected to be 16-byte aligned there so that movaps works on the 16-byte slots
1175 .equiv          nb130nf_ix,             0
1176 .equiv          nb130nf_iy,             16
1177 .equiv          nb130nf_iz,             32
1178 .equiv          nb130nf_iq,             48
1179 .equiv          nb130nf_c6,             64
1180 .equiv          nb130nf_c12,            80
1181 .equiv          nb130nf_vctot,          96
1182 .equiv          nb130nf_Vvdwtot,        112
1183 .equiv          nb130nf_half,           128
1184 .equiv          nb130nf_three,          144
1185 .equiv          nb130nf_krf,            160
1186 .equiv          nb130nf_crf,            176
1187 .equiv          nb130nf_tsc,            192
1188 .equiv          nb130nf_nri,            208
1189 .equiv          nb130nf_iinr,           216
1190 .equiv          nb130nf_jindex,         224
1191 .equiv          nb130nf_jjnr,           232
1192 .equiv          nb130nf_shift,          240
1193 .equiv          nb130nf_shiftvec,       248
1194 .equiv          nb130nf_facel,          256
1195 .equiv          nb130nf_innerjjnr,      264
1196 .equiv          nb130nf_is3,            272
1197 .equiv          nb130nf_ii3,            280
1198 .equiv          nb130nf_ntia,           284
1199 .equiv          nb130nf_innerk,         288
1200 .equiv          nb130nf_n,              292
1201 .equiv          nb130nf_nn1,            296
1202 .equiv          nb130nf_ntype,          300
1203 .equiv          nb130nf_nouter,         304
1204 .equiv          nb130nf_ninner,         308
1205
1206         push rbp
1207         mov  rbp, rsp
1208
1209     ;# Push integer registers on stack (popped again in reverse order before returning)
1210         push rbx
1211     push rsi
1212     push rdi
1213     push r12
1214     push r13
1215     push r14
1216     push r15
1217
1218     ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
1219     sub rsp, 168            ;# 160 + 8 bytes padding: with the return address and the 8 pushes above (72 bytes) rsp becomes 16-byte aligned, given an aligned stack at the call
1220
1221     ;# Save xmm registers to stack
1222     movaps [rsp      ], xmm6
1223     movaps [rsp + 16 ], xmm7
1224     movaps [rsp + 32 ], xmm8
1225     movaps [rsp + 48 ], xmm9
1226     movaps [rsp + 64 ], xmm10
1227     movaps [rsp + 80 ], xmm11
1228     movaps [rsp + 96 ], xmm12
1229     movaps [rsp + 112], xmm13
1230     movaps [rsp + 128], xmm14
1231     movaps [rsp + 144], xmm15
1232
1233         emms
1234         sub rsp, 320            ;# local variable stack space (20*16, keeps rsp 16-byte aligned; highest offset used is nb130nf_ninner = 308)
1235 ; .if 0    # block below only read by NASM - special calling convention on win64
1236 %ifidn __OUTPUT_FORMAT__, win64
1237     ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
1238     add rbp, 48
1239     ;# Adjust stack pointer for different alignment
1240     ;# Move around arguments to fit AMD64 convention below
1241     ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
1242     ;# win64 passes args in: rcx,rdx,r8,r9         + stack
1243     mov rdi, rcx
1244     mov rsi, rdx
1245     mov rdx, r8
1246     mov rcx, r9
1247     mov r8,  [rbp]
1248     mov r9,  [rbp + 8]
1249 %endif
1250 ; .endif   # end NASM- and win64-specific block
1251
1252         ;# zero 32-bit iteration counters
1253         mov eax, 0
1254         mov [rsp + nb130nf_nouter], eax
1255         mov [rsp + nb130nf_ninner], eax
1256
1257         mov edi, [rdi]                  ;# first argument points to nri; load the value
1258         mov [rsp + nb130nf_nri], edi
1259         mov [rsp + nb130nf_iinr], rsi
1260         mov [rsp + nb130nf_jindex], rdx
1261         mov [rsp + nb130nf_jjnr], rcx
1262         mov [rsp + nb130nf_shift], r8
1263         mov [rsp + nb130nf_shiftvec], r9
1264         mov rdi, [rbp + nb130nf_p_ntype]
1265         mov edi, [rdi]                  ;# ntype = *p_ntype
1266         mov [rsp + nb130nf_ntype], edi
1267         mov rsi, [rbp + nb130nf_p_facel]
1268         movss xmm0, [rsi]               ;# facel = *p_facel (kept as a scalar)
1269         movss [rsp + nb130nf_facel], xmm0
1270
1271         mov rax, [rbp + nb130nf_p_tabscale]
1272         movss xmm3, [rax]
1273         shufps xmm3, xmm3, 0            ;# splat tabscale to all four lanes
1274         movaps [rsp + nb130nf_tsc], xmm3
1275
1276         ;# create constant floating-point factors on stack
1277         mov eax, 0x3f000000     ;# half in IEEE (hex)
1278         mov [rsp + nb130nf_half], eax
1279         movss xmm1, [rsp + nb130nf_half]
1280         shufps xmm1, xmm1, 0    ;# splat to all elements
1281         movaps xmm2, xmm1
1282         addps  xmm2, xmm2       ;# one
1283         movaps xmm3, xmm2
1284         addps  xmm2, xmm2       ;# two
1285         addps  xmm3, xmm2       ;# three
1286         movaps [rsp + nb130nf_half],  xmm1
1287         movaps [rsp + nb130nf_three],  xmm3
1288
1289
1290 .nb130nf_threadloop:       ;# claim the next outer index n from the shared counter with a compare-and-swap
1291         mov   rsi, [rbp + nb130nf_count]          ;# pointer to sync counter
1292         mov   eax, [rsi]
1293 .nb130nf_spinlock:
1294         mov   ebx, eax                          ;# ebx=*count=nn0
1295         add   ebx, 1                           ;# ebx=nn1=nn0+1
1296         lock
1297         cmpxchg [rsi], ebx                      ;# write nn1 to *counter,
1298                                                 ;# if it hasnt changed.
1299                                                 ;# or reread *counter to eax.
1300         pause                                   ;# -> better p4 performance
1301         jnz .nb130nf_spinlock                   ;# ZF clear: *counter changed under us, retry with the value now in eax
1302
1303         ;# if(nn1>nri) nn1=nri
1304         mov ecx, [rsp + nb130nf_nri]
1305         mov edx, ecx
1306         sub ecx, ebx
1307         cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
1308         ;# Cleared the spinlock if we got here.
1309         ;# eax contains nn0, ebx contains nn1.
1310         mov [rsp + nb130nf_n], eax
1311         mov [rsp + nb130nf_nn1], ebx
1312         sub ebx, eax                            ;# calc number of outer lists
1313         mov esi, eax                            ;# copy n to esi
1314         jg  .nb130nf_outerstart                 ;# flags still from the sub above (mov leaves them intact): nn1 > nn0
1315         jmp .nb130nf_end
1316
1317 .nb130nf_outerstart:
1318         ;# ebx contains number of outer iterations
1319         add ebx, [rsp + nb130nf_nouter]
1320         mov [rsp + nb130nf_nouter], ebx
1321
1322 .nb130nf_outer:            ;# per-i-atom setup; esi holds the outer index n on entry
1323         mov   rax, [rsp + nb130nf_shift]      ;# rax = pointer into shift[]
1324         mov   ebx, [rax + rsi*4]                ;# ebx=shift[n]
1325
1326         lea   rbx, [rbx + rbx*2]    ;# rbx=3*is
1327         mov   [rsp + nb130nf_is3],ebx           ;# store is3
1328
1329         mov   rax, [rsp + nb130nf_shiftvec]   ;# rax = base of shiftvec[]
1330
1331         movss xmm0, [rax + rbx*4]
1332         movss xmm1, [rax + rbx*4 + 4]
1333         movss xmm2, [rax + rbx*4 + 8]
1334
1335         mov   rcx, [rsp + nb130nf_iinr]       ;# rcx = pointer into iinr[]
1336         mov   ebx, [rcx + rsi*4]            ;# ebx =ii
1337
1338         mov   rdx, [rbp + nb130nf_charge]
1339         movss xmm3, [rdx + rbx*4]
1340         mulss xmm3, [rsp + nb130nf_facel]
1341         shufps xmm3, xmm3, 0            ;# iq = facel*charge[ii] in all four lanes
1342
1343         mov   rdx, [rbp + nb130nf_type]
1344         mov   edx, [rdx + rbx*4]
1345         imul  edx, [rsp + nb130nf_ntype]
1346         shl   edx, 1
1347         mov   [rsp + nb130nf_ntia], edx         ;# ntia = 2*ntype*type[ii]
1348
1349         lea   rbx, [rbx + rbx*2]        ;# rbx = 3*ii=ii3
1350         mov   rax, [rbp + nb130nf_pos]    ;# rax = base of pos[]
1351
1352         addss xmm0, [rax + rbx*4]
1353         addss xmm1, [rax + rbx*4 + 4]
1354         addss xmm2, [rax + rbx*4 + 8]
1355
1356         movaps [rsp + nb130nf_iq], xmm3
1357
1358         shufps xmm0, xmm0, 0
1359         shufps xmm1, xmm1, 0
1360         shufps xmm2, xmm2, 0
1361
1362         movaps [rsp + nb130nf_ix], xmm0
1363         movaps [rsp + nb130nf_iy], xmm1
1364         movaps [rsp + nb130nf_iz], xmm2
1365
1366         mov   [rsp + nb130nf_ii3], ebx
1367
1368         ;# clear vctot and Vvdwtot (this kernel keeps no force accumulators)
1369         xorps xmm4, xmm4
1370         movaps [rsp + nb130nf_vctot], xmm4
1371         movaps [rsp + nb130nf_Vvdwtot], xmm4
1372
1373         mov   rax, [rsp + nb130nf_jindex]
1374         mov   ecx, [rax + rsi*4]             ;# jindex[n]
1375         mov   edx, [rax + rsi*4 + 4]         ;# jindex[n+1]
1376         sub   edx, ecx               ;# number of innerloop atoms
1377
1378         mov   rsi, [rbp + nb130nf_pos]
1379         mov   rax, [rsp + nb130nf_jjnr]
1380         shl   ecx, 2
1381         add   rax, rcx
1382         mov   [rsp + nb130nf_innerjjnr], rax     ;# pointer to jjnr[nj0]
1383         mov   ecx, edx
1384         sub   edx,  4
1385         add   ecx, [rsp + nb130nf_ninner]
1386         mov   [rsp + nb130nf_ninner], ecx
1387         add   edx, 0                 ;# no-op add: sets the flags from edx again (the add above overwrote them) for the jge below
1388         mov   [rsp + nb130nf_innerk], edx    ;# innerk = number of innerloop atoms - 4
1389         jge   .nb130nf_unroll_loop
1390         jmp   .nb130nf_finish_inner
1391 .nb130nf_unroll_loop:
1392         ;# quad-unroll innerloop here: four j atoms per iteration, energies only
1393         mov   rdx, [rsp + nb130nf_innerjjnr]     ;# pointer to jjnr[k]
1394         mov   eax, [rdx]
1395         mov   ebx, [rdx + 4]
1396         mov   ecx, [rdx + 8]
1397         mov   edx, [rdx + 12]         ;# eax-edx=jnr1-4
1398         add qword ptr [rsp + nb130nf_innerjjnr],  16 ;# advance pointer (unrolled 4)
1399
1400         mov rsi, [rbp + nb130nf_charge]    ;# base of charge[]
1401
1402         movss xmm3, [rsi + rax*4]
1403         movss xmm4, [rsi + rcx*4]
1404         movss xmm6, [rsi + rbx*4]
1405         movss xmm7, [rsi + rdx*4]
1406
1407         movaps xmm2, [rsp + nb130nf_iq]
1408         shufps xmm3, xmm6, 0
1409         shufps xmm4, xmm7, 0
1410         shufps xmm3, xmm4, 136  ;# constant 10001000 ;# all charges in xmm3
1411         movd  mm0, eax          ;# use mmx registers as temp storage
1412         movd  mm1, ebx
1413         movd  mm2, ecx
1414         movd  mm3, edx
1415
1416         mov rsi, [rbp + nb130nf_type]
1417         mov eax, [rsi + rax*4]
1418         mov ebx, [rsi + rbx*4]
1419         mov ecx, [rsi + rcx*4]
1420         mov edx, [rsi + rdx*4]
1421         mov rsi, [rbp + nb130nf_vdwparam]
1422         shl eax, 1              ;# 2*type[jnr]: two floats (c6,c12) are loaded per entry below
1423         shl ebx, 1
1424         shl ecx, 1
1425         shl edx, 1
1426         mov edi, [rsp + nb130nf_ntia]
1427         add eax, edi
1428         add ebx, edi
1429         add ecx, edi
1430         add edx, edi
1431
1432         movlps xmm6, [rsi + rax*4]
1433         movlps xmm7, [rsi + rcx*4]
1434         movhps xmm6, [rsi + rbx*4]
1435         movhps xmm7, [rsi + rdx*4]
1436
1437         movaps xmm4, xmm6
1438         shufps xmm4, xmm7, 136  ;# constant 10001000 -> even elements (stored as c6 below)
1439         shufps xmm6, xmm7, 221  ;# constant 11011101 -> odd elements (stored as c12 below)
1440
1441         movd  eax, mm0          ;# get the four jnr values back from the mmx registers
1442         movd  ebx, mm1
1443         movd  ecx, mm2
1444         movd  edx, mm3
1445
1446         movaps [rsp + nb130nf_c6], xmm4
1447         movaps [rsp + nb130nf_c12], xmm6
1448
1449         mov rsi, [rbp + nb130nf_pos]       ;# base of pos[]
1450
1451         lea   rax, [rax + rax*2]     ;# replace jnr with j3
1452         lea   rbx, [rbx + rbx*2]
1453
1454         mulps xmm3, xmm2             ;# qq = iq * jq for the four pairs
1455         lea   rcx, [rcx + rcx*2]     ;# replace jnr with j3
1456         lea   rdx, [rdx + rdx*2]
1457
1458         ;# move four coordinates to xmm0-xmm2
1459
1460         movlps xmm4, [rsi + rax*4]
1461         movlps xmm5, [rsi + rcx*4]
1462         movss xmm2, [rsi + rax*4 + 8]
1463         movss xmm6, [rsi + rcx*4 + 8]
1464
1465         movhps xmm4, [rsi + rbx*4]
1466         movhps xmm5, [rsi + rdx*4]
1467
1468         movss xmm0, [rsi + rbx*4 + 8]
1469         movss xmm1, [rsi + rdx*4 + 8]
1470
1471         shufps xmm2, xmm0, 0
1472         shufps xmm6, xmm1, 0
1473
1474         movaps xmm0, xmm4
1475         movaps xmm1, xmm4
1476
1477         shufps xmm2, xmm6, 136  ;# constant 10001000
1478
1479         shufps xmm0, xmm5, 136  ;# constant 10001000
1480         shufps xmm1, xmm5, 221  ;# constant 11011101
1481
1482         ;# move ix-iz to xmm4-xmm6
1483         movaps xmm4, [rsp + nb130nf_ix]
1484         movaps xmm5, [rsp + nb130nf_iy]
1485         movaps xmm6, [rsp + nb130nf_iz]
1486
1487         ;# calc dr
1488         subps xmm4, xmm0
1489         subps xmm5, xmm1
1490         subps xmm6, xmm2
1491
1492         ;# square it
1493         mulps xmm4,xmm4
1494         mulps xmm5,xmm5
1495         mulps xmm6,xmm6
1496         addps xmm4, xmm5
1497         addps xmm4, xmm6
1498         ;# rsq in xmm4
1499
1500         rsqrtps xmm5, xmm4
1501         ;# lookup seed in xmm5; refined below as rinv = 0.5*lu*(3.0 - rsq*lu*lu)
1502         movaps xmm2, xmm5
1503         mulps xmm5, xmm5
1504         movaps xmm1, [rsp + nb130nf_three]
1505         mulps xmm5, xmm4        ;# rsq*lu*lu
1506         movaps xmm0, [rsp + nb130nf_half]
1507         subps xmm1, xmm5        ;# 3.0-rsq*lu*lu
1508         mulps xmm1, xmm2
1509         mulps xmm0, xmm1        ;# xmm0=rinv
1510         movaps xmm1, xmm0
1511         mulps xmm3, xmm0        ;# vcoul = qq*rinv
1512         addps  xmm3, [rsp + nb130nf_vctot]
1513         movaps [rsp + nb130nf_vctot], xmm3
1514
1515         ;# LJ table
1516         mulps  xmm4, xmm1  ;# r
1517         mulps  xmm4, [rsp + nb130nf_tsc] ;# rtab
1518
1519         movaps xmm0, xmm1 ;# copy of rinv
1520         movhlps xmm5, xmm4
1521         cvttps2pi mm6, xmm4
1522         cvttps2pi mm7, xmm5     ;# mm6/mm7 contain lu indices
1523         cvtpi2ps xmm6, mm6
1524         cvtpi2ps xmm5, mm7
1525         movlhps xmm6, xmm5
1526         subps xmm4, xmm6
1527         movaps xmm1, xmm4       ;# xmm1=eps
1528         movaps xmm2, xmm1
1529         mulps  xmm2, xmm2       ;# xmm2=eps2
1530         pslld mm6, 3            ;# index*8: eight floats per table point (4 dispersion + 4 repulsion, see loads below)
1531         pslld mm7, 3
1532
1533         mov  rsi, [rbp + nb130nf_VFtab]
1534         movd eax, mm6
1535         psrlq mm6, 32
1536         movd ecx, mm7
1537         psrlq mm7, 32
1538         movd ebx, mm6
1539         movd edx, mm7
1540
1541         ;# dispersion
1542         movlps xmm5, [rsi + rax*4]
1543         movlps xmm7, [rsi + rcx*4]
1544         movhps xmm5, [rsi + rbx*4]
1545         movhps xmm7, [rsi + rdx*4] ;# got half dispersion table
1546         movaps xmm4, xmm5
1547         shufps xmm4, xmm7, 136  ;# constant 10001000
1548         shufps xmm5, xmm7, 221  ;# constant 11011101
1549
1550         movlps xmm7, [rsi + rax*4 + 8]
1551         movlps xmm3, [rsi + rcx*4 + 8]
1552         movhps xmm7, [rsi + rbx*4 + 8]
1553         movhps xmm3, [rsi + rdx*4 + 8] ;# other half of dispersion table
1554         movaps xmm6, xmm7
1555         shufps xmm6, xmm3, 136  ;# constant 10001000
1556         shufps xmm7, xmm3, 221  ;# constant 11011101
1557         ;# dispersion table ready, in xmm4-xmm7
1558
1559         mulps  xmm6, xmm1       ;# xmm6=Geps
1560         mulps  xmm7, xmm2       ;# xmm7=Heps2
1561         addps  xmm5, xmm6
1562         addps  xmm5, xmm7       ;# xmm5=Fp
1563         mulps  xmm5, xmm1 ;# xmm5=eps*Fp
1564         addps  xmm5, xmm4 ;# xmm5=VV
1565
1566         movaps xmm4, [rsp + nb130nf_c6]
1567         mulps  xmm5, xmm4        ;# Vvdw6
1568
1569         ;# Update Vvdwtot directly
1570         addps  xmm5, [rsp + nb130nf_Vvdwtot]
1571         movaps [rsp + nb130nf_Vvdwtot], xmm5
1572
1573         ;# repulsion
1574         movlps xmm5, [rsi + rax*4 + 16]
1575         movlps xmm7, [rsi + rcx*4 + 16]
1576         movhps xmm5, [rsi + rbx*4 + 16]
1577         movhps xmm7, [rsi + rdx*4 + 16] ;# got half repulsion table
1578         movaps xmm4, xmm5
1579         shufps xmm4, xmm7, 136  ;# constant 10001000
1580         shufps xmm5, xmm7, 221  ;# constant 11011101
1581
1582         movlps xmm7, [rsi + rax*4 + 24]
1583         movlps xmm3, [rsi + rcx*4 + 24]
1584         movhps xmm7, [rsi + rbx*4 + 24]
1585         movhps xmm3, [rsi + rdx*4 + 24] ;# other half of repulsion table
1586         movaps xmm6, xmm7
1587         shufps xmm6, xmm3, 136  ;# constant 10001000
1588         shufps xmm7, xmm3, 221  ;# constant 11011101
1589         ;# repulsion table ready, in xmm4-xmm7
1590         mulps  xmm6, xmm1       ;# xmm6=Geps
1591         mulps  xmm7, xmm2       ;# xmm7=Heps2
1592         addps  xmm5, xmm6
1593         addps  xmm5, xmm7       ;# xmm5=Fp
1594         mulps  xmm5, xmm1 ;# xmm5=eps*Fp
1595         addps  xmm5, xmm4 ;# xmm5=VV
1596
1597         movaps xmm4, [rsp + nb130nf_c12]
1598         mulps  xmm5, xmm4 ;# Vvdw12
1599
1600         addps  xmm5, [rsp + nb130nf_Vvdwtot]
1601         movaps [rsp + nb130nf_Vvdwtot], xmm5
1602
1603         ;# should we do one more iteration?
1604         sub dword ptr [rsp + nb130nf_innerk],  4
1605         jl    .nb130nf_finish_inner     ;# negative: fewer than four j atoms remain
1606         jmp   .nb130nf_unroll_loop
1607 .nb130nf_finish_inner:
1608         ;# check if at least two particles remain
1609         add dword ptr [rsp + nb130nf_innerk],  4        ;# undo the -4 bias: innerk = number of j atoms left over
1610         mov   edx, [rsp + nb130nf_innerk]
1611         and   edx, 2                    ;# bit 1 set: handle a pair
1612         jnz   .nb130nf_dopair
1613         jmp   .nb130nf_checksingle
1614 .nb130nf_dopair:           ;# two leftover j atoms, processed in the low two lanes
1615         mov rsi, [rbp + nb130nf_charge]
1616
1617         mov   rcx, [rsp + nb130nf_innerjjnr]
1618
1619         mov   eax, [rcx]
1620         mov   ebx, [rcx + 4]
1621         add qword ptr [rsp + nb130nf_innerjjnr],  8     ;# advance pointer past the two j indices
1622
1623         xorps xmm3, xmm3
1624         movss xmm3, [rsi + rax*4]
1625         movss xmm6, [rsi + rbx*4]
1626         shufps xmm3, xmm6, 12 ;# constant 00001100
1627         shufps xmm3, xmm3, 88 ;# constant 01011000 ;# xmm3(0,1) has the charges
1628
1629         mov rsi, [rbp + nb130nf_type]
1630         mov   ecx, eax
1631         mov   edx, ebx
1632         mov ecx, [rsi + rcx*4]
1633         mov edx, [rsi + rdx*4]
1634         mov rsi, [rbp + nb130nf_vdwparam]
1635         shl ecx, 1
1636         shl edx, 1
1637         mov edi, [rsp + nb130nf_ntia]
1638         add ecx, edi
1639         add edx, edi
1640         movlps xmm6, [rsi + rcx*4]
1641         movhps xmm6, [rsi + rdx*4]
1642         mov rdi, [rbp + nb130nf_pos]
1643         xorps  xmm7,xmm7
1644         movaps xmm4, xmm6
1645         shufps xmm4, xmm4, 8 ;# constant 00001000
1646         shufps xmm6, xmm6, 13 ;# constant 00001101
1647         movlhps xmm4, xmm7      ;# zero the upper two lanes of c6
1648         movlhps xmm6, xmm7      ;# zero the upper two lanes of c12
1649
1650         movaps [rsp + nb130nf_c6], xmm4
1651         movaps [rsp + nb130nf_c12], xmm6
1652
1653         lea   rax, [rax + rax*2]        ;# replace jnr with j3
1654         lea   rbx, [rbx + rbx*2]
1655         ;# move coordinates to xmm0-xmm2
1656         movlps xmm1, [rdi + rax*4]
1657         movss xmm2, [rdi + rax*4 + 8]
1658         movhps xmm1, [rdi + rbx*4]
1659         movss xmm0, [rdi + rbx*4 + 8]
1660
1661         mulps  xmm3, [rsp + nb130nf_iq]
1662
1663         movlhps xmm3, xmm7      ;# zero the upper two lanes of qq
1664
1665         shufps xmm2, xmm0, 0
1666
1667         movaps xmm0, xmm1
1668
1669         shufps xmm2, xmm2, 136  ;# constant 10001000
1670
1671         shufps xmm0, xmm0, 136  ;# constant 10001000
1672         shufps xmm1, xmm1, 221  ;# constant 11011101
1673
1674         ;# move ix-iz to xmm4-xmm6
1675         xorps   xmm7, xmm7
1676
1677         movaps xmm4, [rsp + nb130nf_ix]
1678         movaps xmm5, [rsp + nb130nf_iy]
1679         movaps xmm6, [rsp + nb130nf_iz]
1680
1681         ;# calc dr
1682         subps xmm4, xmm0
1683         subps xmm5, xmm1
1684         subps xmm6, xmm2
1685
1686         ;# square it
1687         mulps xmm4,xmm4
1688         mulps xmm5,xmm5
1689         mulps xmm6,xmm6
1690         addps xmm4, xmm5
1691         addps xmm4, xmm6
1692         ;# rsq in xmm4
1693
1694         rsqrtps xmm5, xmm4
1695         ;# lookup seed in xmm5; refined below as rinv = 0.5*lu*(3.0 - rsq*lu*lu)
1696         movaps xmm2, xmm5
1697         mulps xmm5, xmm5
1698         movaps xmm1, [rsp + nb130nf_three]
1699         mulps xmm5, xmm4        ;# rsq*lu*lu
1700         movaps xmm0, [rsp + nb130nf_half]
1701         subps xmm1, xmm5        ;# 3.0-rsq*lu*lu
1702         mulps xmm1, xmm2
1703         mulps xmm0, xmm1        ;# xmm0=rinv
1704         movaps xmm1, xmm0
1705         mulps xmm3, xmm0        ;# vcoul = qq*rinv
1706         addps  xmm3, [rsp + nb130nf_vctot]
1707         movaps [rsp + nb130nf_vctot], xmm3
1708
1709         ;# LJ table
1710         mulps  xmm4, xmm1  ;# r
1711         mulps  xmm4, [rsp + nb130nf_tsc] ;# rtab
1712
1713         movaps xmm0, xmm1 ;# copy of rinv
1714         cvttps2pi mm6, xmm4
1715         cvtpi2ps xmm6, mm6
1716         subps xmm4, xmm6
1717         movaps xmm1, xmm4       ;# xmm1=eps
1718         movaps xmm2, xmm1
1719         mulps  xmm2, xmm2       ;# xmm2=eps2
1720         pslld mm6, 3            ;# index*8: eight floats per table point (4 dispersion + 4 repulsion, see loads below)
1721
1722         mov  rsi, [rbp + nb130nf_VFtab]
1723         movd eax, mm6
1724         psrlq mm6, 32
1725         movd ebx, mm6
1726
1727         ;# dispersion
1728         movlps xmm5, [rsi + rax*4]
1729         movhps xmm5, [rsi + rbx*4]
1730         movaps xmm4, xmm5
1731         shufps xmm4, xmm7, 136  ;# constant 10001000
1732         shufps xmm5, xmm7, 221  ;# constant 11011101
1733
1734         movlps xmm7, [rsi + rax*4 + 8]
1735         movhps xmm7, [rsi + rbx*4 + 8]
1736         movaps xmm6, xmm7
1737         shufps xmm6, xmm3, 136  ;# constant 10001000
1738         shufps xmm7, xmm3, 221  ;# constant 11011101
1739         ;# dispersion table ready, in xmm4-xmm7
1740
1741         mulps  xmm6, xmm1       ;# xmm6=Geps
1742         mulps  xmm7, xmm2       ;# xmm7=Heps2
1743         addps  xmm5, xmm6
1744         addps  xmm5, xmm7       ;# xmm5=Fp
1745         mulps  xmm5, xmm1 ;# xmm5=eps*Fp
1746         addps  xmm5, xmm4 ;# xmm5=VV
1747
1748         movaps xmm4, [rsp + nb130nf_c6]
1749         mulps  xmm5, xmm4        ;# Vvdw6
1750
1751         ;# Update Vvdwtot directly
1752         addps  xmm5, [rsp + nb130nf_Vvdwtot]
1753         movaps [rsp + nb130nf_Vvdwtot], xmm5
1754
1755         ;# repulsion
1756         movlps xmm5, [rsi + rax*4 + 16]
1757         movhps xmm5, [rsi + rbx*4 + 16]
1758         movaps xmm4, xmm5
1759         shufps xmm4, xmm7, 136  ;# constant 10001000
1760         shufps xmm5, xmm7, 221  ;# constant 11011101
1761
1762         movlps xmm7, [rsi + rax*4 + 24]
1763         movhps xmm7, [rsi + rbx*4 + 24]
1764         movaps xmm6, xmm7
1765         shufps xmm6, xmm3, 136  ;# constant 10001000
1766         shufps xmm7, xmm3, 221  ;# constant 11011101
1767         ;# repulsion table ready, in xmm4-xmm7
1768         mulps  xmm6, xmm1       ;# xmm6=Geps
1769         mulps  xmm7, xmm2       ;# xmm7=Heps2
1770         addps  xmm5, xmm6
1771         addps  xmm5, xmm7       ;# xmm5=Fp
1772         mulps  xmm5, xmm1 ;# xmm5=eps*Fp
1773         addps  xmm5, xmm4 ;# xmm5=VV
1774
1775         movaps xmm4, [rsp + nb130nf_c12]
1776         mulps  xmm5, xmm4 ;# Vvdw12
1777
1778         addps  xmm5, [rsp + nb130nf_Vvdwtot]
1779         movaps [rsp + nb130nf_Vvdwtot], xmm5
1780
1781 .nb130nf_checksingle:
1782         mov   edx, [rsp + nb130nf_innerk]
1783         and   edx, 1                    ;# bit 0 set: one last j atom remains
1784         jnz    .nb130nf_dosingle
1785         jmp    .nb130nf_updateouterdata
1786 .nb130nf_dosingle:         ;# scalar path: one j particle, results live in element 0 of each xmm register
1787         mov rsi, [rbp + nb130nf_charge]        ;# rsi = base of the charge array
1788         mov rdi, [rbp + nb130nf_pos]           ;# rdi = base of the position array
1789         mov   rcx, [rsp + nb130nf_innerjjnr]   ;# rcx = pointer to the j index to read
1790         xorps xmm3, xmm3
1791         mov   eax, [rcx]       ;# eax = j particle index (jnr)
1792         movss xmm3, [rsi + rax*4]       ;# xmm3(0) = charge[jnr]
1793         ;# fetch the two vdW parameters at vdwparam[2*type[jnr] + ntia]
1794         mov rsi, [rbp + nb130nf_type]
1795         mov ecx, eax
1796         mov ecx, [rsi + rcx*4]         ;# ecx = type[jnr]
1797         mov rsi, [rbp + nb130nf_vdwparam]
1798         shl ecx, 1             ;# two parameters per type entry
1799         add ecx, [rsp + nb130nf_ntia]
1800         xorps  xmm6, xmm6      ;# upper half must be zero for the shuffles below
1801         movlps xmm6, [rsi + rcx*4]     ;# xmm6 = (param0, param1, 0, 0)
1802         movaps xmm4, xmm6
1803         shufps xmm4, xmm4, 252  ;# constant 11111100 -> xmm4 = (param0, 0, 0, 0)
1804         shufps xmm6, xmm6, 253  ;# constant 11111101 -> xmm6 = (param1, 0, 0, 0)
1805
1806         movaps [rsp + nb130nf_c6], xmm4        ;# param0 is stored as c6
1807         movaps [rsp + nb130nf_c12], xmm6       ;# param1 is stored as c12
1808
1809         lea   rax, [rax + rax*2]       ;# rax = 3*jnr, index of the first coordinate of particle j
1810
1811         ;# load the three consecutive j coordinates into element 0 of xmm0-xmm2
1812         movss xmm0, [rdi + rax*4]
1813         movss xmm1, [rdi + rax*4 + 4]
1814         movss xmm2, [rdi + rax*4 + 8]
1815
1816         mulps  xmm3, [rsp + nb130nf_iq]        ;# xmm3(0) = iq * charge[jnr]
1817
1818         xorps   xmm7, xmm7
1819
1820         movaps xmm4, [rsp + nb130nf_ix]
1821         movaps xmm5, [rsp + nb130nf_iy]
1822         movaps xmm6, [rsp + nb130nf_iz]
1823
1824         ;# calc dr = i coordinate - j coordinate, per component
1825         subps xmm4, xmm0
1826         subps xmm5, xmm1
1827         subps xmm6, xmm2
1828
1829         ;# square the components and sum them
1830         mulps xmm4,xmm4
1831         mulps xmm5,xmm5
1832         mulps xmm6,xmm6
1833         addps xmm4, xmm5
1834         addps xmm4, xmm6
1835         ;# rsq = dx*dx + dy*dy + dz*dz in xmm4
1836         ;# rinv = half * lu * (three - rsq*lu*lu): one refinement step on the rsqrtss estimate lu
1837         rsqrtss xmm5, xmm4
1838         ;# approximate 1/sqrt(rsq) (lu) in xmm5
1839         movss xmm2, xmm5        ;# xmm2 = lu
1840         mulss xmm5, xmm5        ;# lu*lu
1841         movss xmm1, [rsp + nb130nf_three]
1842         mulss xmm5, xmm4        ;# rsq*lu*lu
1843         movss xmm0, [rsp + nb130nf_half]
1844         subss xmm1, xmm5        ;# three - rsq*lu*lu
1845         mulss xmm1, xmm2        ;# lu*(three - rsq*lu*lu)
1846         mulss xmm0, xmm1        ;# xmm0=rinv
1847         movaps xmm1, xmm0       ;# xmm1 = copy of rinv
1848         mulss xmm3, xmm0        ;# xmm3(0) = iq*jq*rinv
1849         addss  xmm3, [rsp + nb130nf_vctot]     ;# add element 0 of the running vctot
1850         movss [rsp + nb130nf_vctot], xmm3      ;# movss rewrites only element 0 of the slot
1851
1852         ;# LJ table
1853         mulss  xmm4, xmm1  ;# r = rsq*rinv
1854         mulss  xmm4, [rsp + nb130nf_tsc] ;# rtab = r*tsc
1855
1856         movaps xmm0, xmm1 ;# copy of rinv (NOTE(review): xmm0 already equals xmm1 at this point)
1857         cvttps2pi mm6, xmm4     ;# mm6 = rtab truncated to integer (table point n0)
1858         cvtpi2ps xmm6, mm6      ;# n0 converted back to float
1859         subss xmm4, xmm6        ;# fractional part of rtab
1860         movss xmm1, xmm4        ;# xmm1=eps
1861         movss xmm2, xmm1
1862         mulss  xmm2, xmm2       ;# xmm2=eps2
1863         pslld mm6, 3            ;# n0*8: the addressing below uses 8 floats per table point
1864
1865         movd mm0, eax           ;# copy eax (3*jnr) to mm0; NOTE(review): not read back in this section
1866
1867         mov  rsi, [rbp + nb130nf_VFtab]
1868         movd eax, mm6           ;# eax = 8*n0
1869
1870         ;# dispersion: floats 0-3 of the table point, referred to here as Y, F, G, H
1871         movlps xmm5, [rsi + rax*4]     ;# xmm5 low half = (Y, F)
1872         movaps xmm4, xmm5
1873         shufps xmm4, xmm7, 136  ;# constant 10001000 -> xmm4(0) = Y
1874         shufps xmm5, xmm7, 221  ;# constant 11011101 -> xmm5(0) = F
1875
1876         movlps xmm7, [rsi + rax*4 + 8] ;# xmm7 low half = (G, H)
1877         movaps xmm6, xmm7
1878         shufps xmm6, xmm3, 136  ;# constant 10001000 -> xmm6(0) = G
1879         shufps xmm7, xmm3, 221  ;# constant 11011101 -> xmm7(0) = H
1880         ;# dispersion table ready, in xmm4-xmm7 (only element 0 is used by the scalar ops below)
1881
1882         mulss  xmm6, xmm1       ;# xmm6=Geps
1883         mulss  xmm7, xmm2       ;# xmm7=Heps2
1884         addss  xmm5, xmm6
1885         addss  xmm5, xmm7       ;# xmm5=Fp = F + Geps + Heps2
1886         mulss  xmm5, xmm1 ;# xmm5=eps*Fp
1887         addss  xmm5, xmm4 ;# xmm5=VV = Y + eps*Fp
1888
1889         movss  xmm4, [rsp + nb130nf_c6]
1890         mulss  xmm5, xmm4        ;# Vvdw6 = c6*VV
1891
1892         ;# Update Vvdwtot directly (element 0 only)
1893         addss  xmm5, [rsp + nb130nf_Vvdwtot]
1894         movss [rsp + nb130nf_Vvdwtot], xmm5
1895
1896         ;# repulsion: floats 4-7 of the same table point, handled exactly like the dispersion part
1897         movlps xmm5, [rsi + rax*4 + 16]
1898         movaps xmm4, xmm5
1899         shufps xmm4, xmm7, 136  ;# constant 10001000 -> xmm4(0) = float 4
1900         shufps xmm5, xmm7, 221  ;# constant 11011101 -> xmm5(0) = float 5
1901
1902         movlps xmm7, [rsi + rax*4 + 24]
1903         movaps xmm6, xmm7
1904         shufps xmm6, xmm3, 136  ;# constant 10001000 -> xmm6(0) = float 6
1905         shufps xmm7, xmm3, 221  ;# constant 11011101 -> xmm7(0) = float 7
1906         ;# table ready, in xmm4-xmm7
1907         mulss  xmm6, xmm1       ;# xmm6=Geps
1908         mulss  xmm7, xmm2       ;# xmm7=Heps2
1909         addss  xmm5, xmm6
1910         addss  xmm5, xmm7       ;# xmm5=Fp
1911         mulss  xmm5, xmm1 ;# xmm5=eps*Fp
1912         addss  xmm5, xmm4 ;# xmm5=VV
1913
1914         movss  xmm4, [rsp + nb130nf_c12]
1915         mulss  xmm5, xmm4 ;# Vvdw12 = c12*VV
1916
1917         addss  xmm5, [rsp + nb130nf_Vvdwtot]
1918         movss [rsp + nb130nf_Vvdwtot], xmm5
1919         ;# execution continues with the label that follows
1920
1921 .nb130nf_updateouterdata:  ;# reduce the two energy accumulators and add them to Vc[ggid] and Vvdw[ggid]
1922
1923         ;# get n from stack
1924         mov esi, [rsp + nb130nf_n]
1925         ;# get group index for i particle
1926         mov   rdx, [rbp + nb130nf_gid]          ;# base of gid[]
1927         mov   edx, [rdx + rsi*4]                ;# ggid=gid[n]
1928
1929         ;# accumulate total potential energy and update it
1930         movaps xmm7, [rsp + nb130nf_vctot]
1931         ;# horizontal sum of the four elements of xmm7
1932         movhlps xmm6, xmm7      ;# low half of xmm6 = elements 2-3 of xmm7
1933         addps  xmm7, xmm6       ;# pos 0-1 in xmm7 have the sum now
1934         movaps xmm6, xmm7
1935         shufps xmm6, xmm6, 1    ;# move element 1 into element 0 of xmm6
1936         addss  xmm7, xmm6       ;# xmm7(0) = sum of all four elements
1937
1938         ;# add earlier value from mem
1939         mov   rax, [rbp + nb130nf_Vc]
1940         addss xmm7, [rax + rdx*4]      ;# xmm7(0) += Vc[ggid]
1941         ;# move back to mem
1942         movss [rax + rdx*4], xmm7
1943
1944         ;# accumulate total lj energy and update it
1945         movaps xmm7, [rsp + nb130nf_Vvdwtot]
1946         ;# horizontal sum, same sequence as for vctot
1947         movhlps xmm6, xmm7
1948         addps  xmm7, xmm6       ;# pos 0-1 in xmm7 have the sum now
1949         movaps xmm6, xmm7
1950         shufps xmm6, xmm6, 1
1951         addss  xmm7, xmm6       ;# xmm7(0) = sum of all four elements
1952
1953         ;# add earlier value from mem
1954         mov   rax, [rbp + nb130nf_Vvdw]
1955         addss xmm7, [rax + rdx*4]      ;# xmm7(0) += Vvdw[ggid]
1956         ;# move back to mem
1957         movss [rax + rdx*4], xmm7
1958
1959         ;# finish if last
1960         mov ecx, [rsp + nb130nf_nn1]
1961         ;# esi already loaded with n
1962         inc esi                 ;# esi = n+1
1963         sub ecx, esi            ;# ecx = nn1 - (n+1); zero means this was the last n of the range
1964         jz .nb130nf_outerend
1965
1966         ;# not last, iterate outer loop once more!
1967         mov [rsp + nb130nf_n], esi     ;# store the incremented n for the next pass
1968         jmp .nb130nf_outer
1969 .nb130nf_outerend:         ;# current range finished: stop if n has reached nri, otherwise take another work unit
1970         ;# check if more outer neighborlists remain
1971         mov   ecx, [rsp + nb130nf_nri]
1972         ;# esi holds the incremented n (n+1) when this label is reached through the jz above
1973         sub   ecx, esi          ;# ecx = nri - esi
1974         jz .nb130nf_end
1975         ;# non-zero, do one more workunit
1976         jmp   .nb130nf_threadloop
1977 .nb130nf_end:              ;# write back the iteration counters, unwind the stack frame and return
1978         ;# store nouter and ninner through the outeriter and inneriter pointer arguments
1979         mov eax, [rsp + nb130nf_nouter]
1980         mov ebx, [rsp + nb130nf_ninner]
1981         mov rcx, [rbp + nb130nf_outeriter]
1982         mov rdx, [rbp + nb130nf_inneriter]
1983         mov [rcx], eax          ;# *outeriter = nouter
1984         mov [rdx], ebx          ;# *inneriter = ninner
1985
1986         add rsp, 320            ;# release the 320-byte area addressed through the rsp + nb130nf_* slots
1987         emms                    ;# leave MMX state (mm registers were used above) before returning
1988
1989     ;# Restore xmm6-xmm15 from the stack (movaps needs rsp 16-byte aligned here)
1990     movaps xmm6,  [rsp      ]
1991     movaps xmm7,  [rsp + 16 ]
1992     movaps xmm8,  [rsp + 32 ]
1993     movaps xmm9,  [rsp + 48 ]
1994     movaps xmm10, [rsp + 64 ]
1995     movaps xmm11, [rsp + 80 ]
1996     movaps xmm12, [rsp + 96 ]
1997     movaps xmm13, [rsp + 112]
1998     movaps xmm14, [rsp + 128]
1999     movaps xmm15, [rsp + 144]
2000
2001     ;# Release the xmm save area: 160 bytes of registers plus 8 more (presumably alignment padding; prologue not visible here)
2002     add rsp, 168
2003     ;# restore the saved integer registers; presumably the reverse of the prologue push order
2004     pop r15
2005     pop r14
2006     pop r13
2007     pop r12
2008     pop rdi
2009     pop rsi
2010     pop rbx
2011
2012         pop     rbp
2013         ret