src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel133_x86_64_sse_intel_syntax.s

   1 ;#
   2 ;#
   3 ;# Gromacs 4.0                         Copyright (c) 1991-2003
   4 ;# David van der Spoel, Erik Lindahl
   5 ;#
   6 ;# This program is free software; you can redistribute it and/or
   7 ;# modify it under the terms of the GNU General Public License
   8 ;# as published by the Free Software Foundation; either version 2
   9 ;# of the License, or (at your option) any later version.
  10 ;#
  11 ;# To help us fund GROMACS development, we humbly ask that you cite
  12 ;# the research papers on the package. Check out http://www.gromacs.org
  13 ;#
  14 ;# And Hey:
  15 ;# Gnomes, ROck Monsters And Chili Sauce
  16 ;#
  17
  18 ;# These files require GNU binutils 2.10 or later, since we
  19 ;# use intel syntax for portability, or a recent version
  20 ;# of NASM that understands Extended 3DNow and SSE2 instructions.
  21 ;# (NASM is normally only used with MS Visual C++).
  22 ;# Since NASM and gnu as disagree on some definitions and use
  23 ;# completely different preprocessing options I have to introduce a
  24 ;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
  25 ;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
  26 ;# reason why all comments need both symbols...
  27 ;# The source is written for GNU as, with intel syntax. When you use
  28 ;# NASM we redefine a couple of things. The false if-statement around
  29 ;# the following code is seen by GNU as, but NASM doesn't see it, so
  30 ;# the code inside is read by NASM but skipped by GNU as.
  31
  32 ; .if 0    # block below only read by NASM
  33 %define .section        section
  34 %define .long           dd
  35 %define .align          align
  36 %define .globl          global
  37 ;# NASM only wants 'dword', not 'dword ptr', so 'ptr' is defined as empty.
  38 %define ptr
     ;# Map the two-argument '.equiv name, value' lines below onto NASM's 'equ'.
  39 %macro .equiv                  2
  40    %1 equ %2
  41 %endmacro
  42 ; .endif                   # End of NASM-specific block
  43 ; .intel_syntax noprefix   # Line only read by gnu as
  44
  45 .section .text
  46
  47
  48 .globl nb_kernel133_x86_64_sse
  49 .globl _nb_kernel133_x86_64_sse
  50 nb_kernel133_x86_64_sse:
  51 _nb_kernel133_x86_64_sse:
  52 ;#      Room for return address and rbp (16 bytes)
  53 .equiv          nb133_fshift,           16
  54 .equiv          nb133_gid,              24
  55 .equiv          nb133_pos,              32
  56 .equiv          nb133_faction,          40
  57 .equiv          nb133_charge,           48
  58 .equiv          nb133_p_facel,          56
  59 .equiv          nb133_argkrf,           64
  60 .equiv          nb133_argcrf,           72
  61 .equiv          nb133_Vc,               80
  62 .equiv          nb133_type,             88
  63 .equiv          nb133_p_ntype,          96
  64 .equiv          nb133_vdwparam,         104
  65 .equiv          nb133_Vvdw,             112
  66 .equiv          nb133_p_tabscale,       120
  67 .equiv          nb133_VFtab,            128
  68 .equiv          nb133_invsqrta,         136
  69 .equiv          nb133_dvda,             144
  70 .equiv          nb133_p_gbtabscale,     152
  71 .equiv          nb133_GBtab,            160
  72 .equiv          nb133_p_nthreads,       168
  73 .equiv          nb133_count,            176
  74 .equiv          nb133_mtx,              184
  75 .equiv          nb133_outeriter,        192
  76 .equiv          nb133_inneriter,        200
  77 .equiv          nb133_work,             208
  78         ;# stack offsets for local variables
  79         ;# bottom of stack is cache-aligned for sse use
  80 .equiv          nb133_ixO,              0
  81 .equiv          nb133_iyO,              16
  82 .equiv          nb133_izO,              32
  83 .equiv          nb133_ixH1,             48
  84 .equiv          nb133_iyH1,             64
  85 .equiv          nb133_izH1,             80
  86 .equiv          nb133_ixH2,             96
  87 .equiv          nb133_iyH2,             112
  88 .equiv          nb133_izH2,             128
  89 .equiv          nb133_ixM,              144
  90 .equiv          nb133_iyM,              160
  91 .equiv          nb133_izM,              176
  92 .equiv          nb133_iqM,              192
  93 .equiv          nb133_iqH,              208
  94 .equiv          nb133_dxO,              224
  95 .equiv          nb133_dyO,              240
  96 .equiv          nb133_dzO,              256
  97 .equiv          nb133_dxH1,             272
  98 .equiv          nb133_dyH1,             288
  99 .equiv          nb133_dzH1,             304
 100 .equiv          nb133_dxH2,             320
 101 .equiv          nb133_dyH2,             336
 102 .equiv          nb133_dzH2,             352
 103 .equiv          nb133_dxM,              368
 104 .equiv          nb133_dyM,              384
 105 .equiv          nb133_dzM,              400
 106 .equiv          nb133_qqM,              416
 107 .equiv          nb133_qqH,              432
 108 .equiv          nb133_rinvH1,           448
 109 .equiv          nb133_rinvH2,           464
 110 .equiv          nb133_rinvM,            480
 111 .equiv          nb133_two,              496
 112 .equiv          nb133_c6,               512
 113 .equiv          nb133_c12,              528
 114 .equiv          nb133_tsc,              544
 115 .equiv          nb133_fstmp,            560
 116 .equiv          nb133_krf,              576
 117 .equiv          nb133_crf,              592
 118 .equiv          nb133_krsqH1,           608
 119 .equiv          nb133_krsqH2,           624
 120 .equiv          nb133_krsqM,            640
 121 .equiv          nb133_vctot,            656
 122 .equiv          nb133_Vvdwtot,          672
 123 .equiv          nb133_fixO,             688
 124 .equiv          nb133_fiyO,             704
 125 .equiv          nb133_fizO,             720
 126 .equiv          nb133_fixH1,            736
 127 .equiv          nb133_fiyH1,            752
 128 .equiv          nb133_fizH1,            768
 129 .equiv          nb133_fixH2,            784
 130 .equiv          nb133_fiyH2,            800
 131 .equiv          nb133_fizH2,            816
 132 .equiv          nb133_fixM,             832
 133 .equiv          nb133_fiyM,             848
 134 .equiv          nb133_fizM,             864
 135 .equiv          nb133_fjx,              880
 136 .equiv          nb133_fjy,              896
 137 .equiv          nb133_fjz,              912
 138 .equiv          nb133_half,             928
 139 .equiv          nb133_three,            944
 140 .equiv          nb133_rsqOO,            960
 141 .equiv          nb133_facel,            976
 142 .equiv          nb133_iinr,             984
 143 .equiv          nb133_jindex,           992
 144 .equiv          nb133_jjnr,             1000
 145 .equiv          nb133_shift,            1008
 146 .equiv          nb133_shiftvec,         1016
 147 .equiv          nb133_innerjjnr,        1024
 148 .equiv          nb133_is3,              1032
 149 .equiv          nb133_ii3,              1036
 150 .equiv          nb133_nri,              1040
 151 .equiv          nb133_ntia,             1044
 152 .equiv          nb133_innerk,           1048
 153 .equiv          nb133_n,                1052
 154 .equiv          nb133_nn1,              1056
 155 .equiv          nb133_nouter,           1060
 156 .equiv          nb133_ninner,           1064
 157
 158         push rbp
 159         mov  rbp, rsp
 160
 161     ;# Push integer registers on stack
 162         push rbx
 163     push rsi
 164     push rdi
 165     push r12
 166     push r13
 167     push r14
 168     push r15
 169
 170     ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes; the extra 8 bytes presumably restore 16-byte rsp alignment for movaps)
 171     sub rsp, 168
 172
 173     ;# Save xmm registers to stack
 174     movaps [rsp      ], xmm6
 175     movaps [rsp + 16 ], xmm7
 176     movaps [rsp + 32 ], xmm8
 177     movaps [rsp + 48 ], xmm9
 178     movaps [rsp + 64 ], xmm10
 179     movaps [rsp + 80 ], xmm11
 180     movaps [rsp + 96 ], xmm12
 181     movaps [rsp + 112], xmm13
 182     movaps [rsp + 128], xmm14
 183     movaps [rsp + 144], xmm15
 184
 185         emms
 186         sub rsp, 1072           ;# local variable stack space (n*16+8)
 187 ; .if 0    # block below only read by NASM - special calling convention on win64
 188 %ifidn __OUTPUT_FORMAT__, win64
 189     ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
 190     add rbp, 48
 191     ;# Adjust stack pointer for different alignment
 192     ;# Move around arguments to fit AMD64 convention below
 193     ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
 194     ;# win64 passes args in: rcx,rdx,r8,r9         + stack
 195     mov rdi, rcx
 196     mov rsi, rdx
 197     mov rdx, r8
 198     mov rcx, r9
 199     mov r8,  [rbp]
 200     mov r9,  [rbp + 8]
 201 %endif
 202 ; .endif   # end NASM- and win64-specific block
 203
 204         ;# zero 32-bit iteration counters
 205         mov eax, 0
 206         mov [rsp + nb133_nouter], eax
 207         mov [rsp + nb133_ninner], eax
 208
 209         mov edi, [rdi]
 210         mov [rsp + nb133_nri], edi
 211         mov [rsp + nb133_iinr], rsi
 212         mov [rsp + nb133_jindex], rdx
 213         mov [rsp + nb133_jjnr], rcx
 214         mov [rsp + nb133_shift], r8
 215         mov [rsp + nb133_shiftvec], r9
 216         mov rsi, [rbp + nb133_p_facel]
 217         movss xmm0, [rsi]
 218         movss [rsp + nb133_facel], xmm0
 219
 220         ;# create constant floating-point factors on stack
 221         mov eax, 0x3f000000     ;# half in IEEE (hex)
 222         mov [rsp + nb133_half], eax
 223         movss xmm1, [rsp + nb133_half]
 224         shufps xmm1, xmm1, 0    ;# splat to all elements
 225         movaps xmm2, xmm1
 226         addps  xmm2, xmm2       ;# one
 227         movaps xmm3, xmm2
 228         addps  xmm2, xmm2       ;# two
 229         addps  xmm3, xmm2       ;# three
 230         movaps [rsp + nb133_half],  xmm1
 231         movaps [rsp + nb133_two],  xmm2
 232         movaps [rsp + nb133_three],  xmm3
 233
 234         mov rax, [rbp + nb133_p_tabscale]
 235         movss xmm3, [rax]
 236         shufps xmm3, xmm3, 0
 237         movaps [rsp + nb133_tsc], xmm3
 238
 239         ;# assume we have at least one i particle - start directly
 240         mov   rcx, [rsp + nb133_iinr]       ;# rcx = pointer into iinr[]
 241         mov   ebx, [rcx]            ;# ebx =ii
 242
 243         mov   rdx, [rbp + nb133_charge]
 244         movss xmm4, [rdx + rbx*4 + 4]
 245         movss xmm3, [rdx + rbx*4 + 12]
 246         mov rsi, [rbp + nb133_p_facel]
 247         movss xmm0, [rsi]
 248         movss xmm5, [rsp + nb133_facel]
 249         mulss  xmm3, xmm5
 250         mulss  xmm4, xmm5
 251
 252         shufps xmm3, xmm3, 0
 253         shufps xmm4, xmm4, 0
 254         movaps [rsp + nb133_iqM], xmm3
 255         movaps [rsp + nb133_iqH], xmm4
 256
 257         mov   rdx, [rbp + nb133_type]
 258         mov   ecx, [rdx + rbx*4]
 259         shl   ecx, 1
 260         mov rdi, [rbp + nb133_p_ntype]
 261         imul  ecx, [rdi]      ;# rcx = ntia = 2*ntype*type[ii0]
 262         mov   [rsp + nb133_ntia], ecx
 263 .nb133_threadloop:
 264         mov   rsi, [rbp + nb133_count]          ;# pointer to sync counter
 265         mov   eax, [rsi]
 266 .nb133_spinlock:
 267         mov   ebx, eax                          ;# ebx=*count=nn0
 268         add   ebx, 1                           ;# ebx=nn1=nn0+10
 269         lock
 270         cmpxchg [rsi], ebx                      ;# write nn1 to *counter,
 271                                                 ;# if it hasnt changed.
 272                                                 ;# or reread *counter to eax.
 273         pause                                   ;# -> better p4 performance
 274         jnz .nb133_spinlock
 275
 276         ;# if(nn1>nri) nn1=nri
 277         mov ecx, [rsp + nb133_nri]
 278         mov edx, ecx
 279         sub ecx, ebx
 280         cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
 281         ;# Cleared the spinlock if we got here.
 282         ;# eax contains nn0, ebx contains nn1.
 283         mov [rsp + nb133_n], eax
 284         mov [rsp + nb133_nn1], ebx
 285         sub ebx, eax                            ;# calc number of outer lists
 286         mov esi, eax                            ;# copy n to esi
 287         jg  .nb133_outerstart
 288         jmp .nb133_end
 289
 290 .nb133_outerstart:
 291         ;# ebx contains number of outer iterations
 292         add ebx, [rsp + nb133_nouter]
 293         mov [rsp + nb133_nouter], ebx
 294
 295 .nb133_outer:
 296         mov   rax, [rsp + nb133_shift]      ;# eax = pointer into shift[]
 297         mov   ebx, [rax + rsi*4]                ;# ebx=shift[n]
 298
 299         lea   rbx, [rbx + rbx*2]        ;# rbx=3*is
 300         mov   [rsp + nb133_is3],ebx     ;# store is3
 301
 302         mov   rax, [rsp + nb133_shiftvec]   ;# eax = base of shiftvec[]
 303
 304         movss xmm0, [rax + rbx*4]
 305         movss xmm1, [rax + rbx*4 + 4]
 306         movss xmm2, [rax + rbx*4 + 8]
 307
 308         mov   rcx, [rsp + nb133_iinr]           ;# ecx = pointer into iinr[]
 309         mov   ebx, [rcx + rsi*4]                ;# ebx =ii
 310
 311         movaps xmm3, xmm0
 312         movaps xmm4, xmm1
 313         movaps xmm5, xmm2
 314         movaps xmm6, xmm0
 315         movaps xmm7, xmm1
 316
 317         lea   rbx, [rbx + rbx*2]        ;# rbx = 3*ii=ii3
 318         mov   rax, [rbp + nb133_pos]    ;# eax = base of pos[]
 319         mov   [rsp + nb133_ii3], ebx
 320
 321         addss xmm3, [rax + rbx*4]       ;# ox
 322         addss xmm4, [rax + rbx*4 + 4]  ;# oy
 323         addss xmm5, [rax + rbx*4 + 8]  ;# oz
 324         addss xmm6, [rax + rbx*4 + 12] ;# h1x
 325         addss xmm7, [rax + rbx*4 + 16] ;# h1y
 326         shufps xmm3, xmm3, 0
 327         shufps xmm4, xmm4, 0
 328         shufps xmm5, xmm5, 0
 329         shufps xmm6, xmm6, 0
 330         shufps xmm7, xmm7, 0
 331         movaps [rsp + nb133_ixO], xmm3
 332         movaps [rsp + nb133_iyO], xmm4
 333         movaps [rsp + nb133_izO], xmm5
 334         movaps [rsp + nb133_ixH1], xmm6
 335         movaps [rsp + nb133_iyH1], xmm7
 336
 337         movss xmm6, xmm2
 338         movss xmm3, xmm0
 339         movss xmm4, xmm1
 340         movss xmm5, xmm2
 341         addss xmm6, [rax + rbx*4 + 20] ;# h1z
 342         addss xmm0, [rax + rbx*4 + 24] ;# h2x
 343         addss xmm1, [rax + rbx*4 + 28] ;# h2y
 344         addss xmm2, [rax + rbx*4 + 32] ;# h2z
 345         addss xmm3, [rax + rbx*4 + 36] ;# mx
 346         addss xmm4, [rax + rbx*4 + 40] ;# my
 347         addss xmm5, [rax + rbx*4 + 44] ;# mz
 348
 349         shufps xmm6, xmm6, 0
 350         shufps xmm0, xmm0, 0
 351         shufps xmm1, xmm1, 0
 352         shufps xmm2, xmm2, 0
 353         shufps xmm3, xmm3, 0
 354         shufps xmm4, xmm4, 0
 355         shufps xmm5, xmm5, 0
 356         movaps [rsp + nb133_izH1], xmm6
 357         movaps [rsp + nb133_ixH2], xmm0
 358         movaps [rsp + nb133_iyH2], xmm1
 359         movaps [rsp + nb133_izH2], xmm2
 360         movaps [rsp + nb133_ixM], xmm3
 361         movaps [rsp + nb133_iyM], xmm4
 362         movaps [rsp + nb133_izM], xmm5
 363
 364         ;# clear vctot, Vvdwtot and i forces
 365         xorps xmm4, xmm4
 366         movaps [rsp + nb133_vctot], xmm4
 367         movaps [rsp + nb133_Vvdwtot], xmm4
 368         movaps [rsp + nb133_fixO], xmm4
 369         movaps [rsp + nb133_fiyO], xmm4
 370         movaps [rsp + nb133_fizO], xmm4
 371         movaps [rsp + nb133_fixH1], xmm4
 372         movaps [rsp + nb133_fiyH1], xmm4
 373         movaps [rsp + nb133_fizH1], xmm4
 374         movaps [rsp + nb133_fixH2], xmm4
 375         movaps [rsp + nb133_fiyH2], xmm4
 376         movaps [rsp + nb133_fizH2], xmm4
 377         movaps [rsp + nb133_fixM], xmm4
 378         movaps [rsp + nb133_fiyM], xmm4
 379         movaps [rsp + nb133_fizM], xmm4
 380
 381         mov   rax, [rsp + nb133_jindex]
 382         mov   ecx, [rax + rsi*4]                ;# jindex[n]
 383         mov   edx, [rax + rsi*4 + 4]            ;# jindex[n+1]
 384         sub   edx, ecx                  ;# number of innerloop atoms
 385
 386         mov   rsi, [rbp + nb133_pos]
 387         mov   rdi, [rbp + nb133_faction]
 388         mov   rax, [rsp + nb133_jjnr]
 389         shl   ecx, 2
 390         add   rax, rcx
 391         mov   [rsp + nb133_innerjjnr], rax      ;# pointer to jjnr[nj0]
 392         mov   ecx, edx
 393         sub   edx,  4
 394         add   ecx, [rsp + nb133_ninner]
 395         mov   [rsp + nb133_ninner], ecx
 396         add   edx, 0
 397         mov   [rsp + nb133_innerk], edx ;# number of innerloop atoms
 398         jge   .nb133_unroll_loop
 399         jmp   .nb133_odd_inner
 400 .nb133_unroll_loop:
 401         ;# quad-unroll innerloop here
 402         mov   rdx, [rsp + nb133_innerjjnr]      ;# pointer to jjnr[k]
 403         mov   eax, [rdx]
 404         mov   ebx, [rdx + 4]
 405         mov   ecx, [rdx + 8]
 406         mov   edx, [rdx + 12]           ;# eax-edx=jnr1-4
 407
 408         add qword ptr [rsp + nb133_innerjjnr],  16 ;# advance pointer (unrolled 4)
 409
 410         mov rsi, [rbp + nb133_charge]   ;# base of charge[]
 411
 412         movss xmm3, [rsi + rax*4]
 413         movss xmm4, [rsi + rcx*4]
 414         movss xmm6, [rsi + rbx*4]
 415         movss xmm7, [rsi + rdx*4]
 416
 417         shufps xmm3, xmm6, 0
 418         shufps xmm4, xmm7, 0
 419         shufps xmm3, xmm4, 136  ;# constant 10001000 ;# all charges in xmm3
 420         movaps xmm4, xmm3               ;# and in xmm4
 421         mulps  xmm3, [rsp + nb133_iqM]
 422         mulps  xmm4, [rsp + nb133_iqH]
 423
 424         movaps  [rsp + nb133_qqM], xmm3
 425         movaps  [rsp + nb133_qqH], xmm4
 426
 427         mov rsi, [rbp + nb133_type]
 428         mov r8d, [rsi + rax*4]
 429         mov r9d, [rsi + rbx*4]
 430         mov r10d, [rsi + rcx*4]
 431         mov r11d, [rsi + rdx*4]
 432         mov rsi, [rbp + nb133_vdwparam]
 433         shl r8d, 1
 434         shl r9d, 1
 435         shl r10d, 1
 436         shl r11d, 1
 437         mov edi, [rsp + nb133_ntia]
 438         add r8d, edi
 439         add r9d, edi
 440         add r10d, edi
 441         add r11d, edi
 442
 443         movlps xmm6, [rsi + r8*4]
 444         movlps xmm7, [rsi + r10*4]
 445         movhps xmm6, [rsi + r9*4]
 446         movhps xmm7, [rsi + r11*4]
 447
 448         movaps xmm4, xmm6
 449         shufps xmm4, xmm7, 136  ;# constant 10001000
 450         shufps xmm6, xmm7, 221  ;# constant 11011101
 451
 452         movaps [rsp + nb133_c6], xmm4
 453         movaps [rsp + nb133_c12], xmm6
 454
 455         mov rsi, [rbp + nb133_pos]      ;# base of pos[]
 456
 457         lea   rax, [rax + rax*2]        ;# replace jnr with j3
 458         lea   rbx, [rbx + rbx*2]
 459         lea   rcx, [rcx + rcx*2]        ;# replace jnr with j3
 460         lea   rdx, [rdx + rdx*2]
 461
 462         ;# move four coordinates to xmm0-xmm2
 463         movlps xmm4, [rsi + rax*4]
 464         movlps xmm5, [rsi + rcx*4]
 465         movss xmm2, [rsi + rax*4 + 8]
 466         movss xmm6, [rsi + rcx*4 + 8]
 467
 468         movhps xmm4, [rsi + rbx*4]
 469         movhps xmm5, [rsi + rdx*4]
 470
 471         movss xmm0, [rsi + rbx*4 + 8]
 472         movss xmm1, [rsi + rdx*4 + 8]
 473
 474         shufps xmm2, xmm0, 0
 475         shufps xmm6, xmm1, 0
 476
 477         movaps xmm0, xmm4
 478         movaps xmm1, xmm4
 479
 480         shufps xmm2, xmm6, 136  ;# constant 10001000
 481         shufps xmm0, xmm5, 136  ;# constant 10001000
 482         shufps xmm1, xmm5, 221  ;# constant 11011101
 483
 484     ;# xmm0 = jx
 485     ;# xmm1 = jy
 486     ;# xmm2 = jz
 487
 488     ;# O interaction
 489     ;# copy to xmm3-xmm5
 490     movaps xmm3, xmm0
 491     movaps xmm4, xmm1
 492     movaps xmm5, xmm2
 493
 494     subps xmm3, [rsp + nb133_ixO]
 495     subps xmm4, [rsp + nb133_iyO]
 496     subps xmm5, [rsp + nb133_izO]
 497
 498     movaps [rsp + nb133_dxO], xmm3
 499     movaps [rsp + nb133_dyO], xmm4
 500     movaps [rsp + nb133_dzO], xmm5
 501
 502         mulps  xmm3, xmm3
 503         mulps  xmm4, xmm4
 504         mulps  xmm5, xmm5
 505
 506         addps  xmm3, xmm4
 507         addps  xmm3, xmm5
 508     ;# xmm3=rsq
 509
 510     ;# calculate rinv=1/sqrt(rsq)
 511         rsqrtps xmm5, xmm3
 512         movaps xmm15, xmm5
 513         mulps xmm5, xmm5
 514         movaps xmm4, [rsp + nb133_three]
 515         mulps xmm5, xmm3        ;# rsq*lu*lu
 516     subps xmm4, xmm5    ;# 30-rsq*lu*lu
 517         mulps xmm4, xmm15
 518         mulps xmm4, [rsp + nb133_half]
 519         movaps xmm15, xmm4
 520         mulps  xmm3, xmm4
 521     ;# xmm15=rinv
 522     ;# xmm3=r
 523
 524     mulps xmm3, [rsp + nb133_tsc] ;# rtab
 525
 526     ;# truncate and convert to integers
 527     cvttps2dq xmm5, xmm3
 528
 529     ;# convert back to float
 530     cvtdq2ps  xmm4, xmm5
 531
 532     ;# multiply by 8
 533     pslld   xmm5, 3
 534
 535     ;# calculate eps
 536     subps     xmm3, xmm4    ;# xmm3=eps
 537
 538     ;# move to integer registers
 539     movhlps xmm6, xmm5
 540     movd    r8d, xmm5
 541     movd    r10d, xmm6
 542     pshufd  xmm5, xmm5, 1
 543     pshufd  xmm6, xmm6, 1
 544     movd    r9d, xmm5
 545     movd    r11d, xmm6
 546     ;# xmm3=eps
 547     ;# xmm15=rinv
 548
 549         mov rsi, [rbp + nb133_VFtab]
 550     ;# calculate LJ table
 551     movlps xmm5, [rsi + r8*4]
 552         movlps xmm9, [rsi + r8*4 + 16]
 553
 554         movlps xmm7,  [rsi + r10*4]
 555         movlps xmm11, [rsi + r10*4 + 16]
 556
 557         movhps xmm5, [rsi + r9*4]
 558         movhps xmm9, [rsi + r9*4 + 16]
 559
 560         movhps xmm7,  [rsi + r11*4]
 561         movhps xmm11, [rsi + r11*4 + 16]
 562
 563     movaps xmm4, xmm5
 564     movaps xmm8, xmm9
 565         shufps xmm4, xmm7, 136  ;# 10001000
 566         shufps xmm8, xmm11, 136  ;# 10001000
 567         shufps xmm5, xmm7, 221  ;# 11011101
 568         shufps xmm9, xmm11, 221  ;# 11011101
 569
 570         movlps xmm7,  [rsi + r8*4 + 8]
 571         movlps xmm11, [rsi + r8*4 + 24]
 572
 573         movlps xmm13, [rsi + r10*4 + 8]
 574         movlps xmm14, [rsi + r10*4 + 24]
 575
 576         movhps xmm7,  [rsi + r9*4 + 8]
 577         movhps xmm11, [rsi + r9*4 + 24]
 578
 579         movhps xmm13, [rsi + r11*4 + 8]
 580         movhps xmm14, [rsi + r11*4 + 24]
 581
 582     movaps xmm6, xmm7
 583     movaps xmm10, xmm11
 584
 585         shufps xmm6, xmm13, 136  ;# 10001000
 586         shufps xmm10, xmm14, 136  ;# 10001000
 587         shufps xmm7, xmm13, 221  ;# 11011101
 588         shufps xmm11, xmm14, 221  ;# 11011101
 589     ;# dispersion table in xmm4-xmm7, repulsion table in xmm8-xmm11
 590
 591     mulps  xmm7, xmm3    ;# Heps
 592     mulps  xmm11, xmm3
 593     mulps  xmm6, xmm3   ;# Geps
 594     mulps  xmm10, xmm3
 595     mulps  xmm7, xmm3   ;# Heps2
 596     mulps  xmm11, xmm3
 597     addps  xmm5, xmm6  ;# F+Geps
 598     addps  xmm9, xmm10
 599     addps  xmm5, xmm7   ;# F+Geps+Heps2 = Fp
 600     addps  xmm9, xmm11
 601     addps  xmm7, xmm7    ;# 2*Heps2
 602     addps  xmm11, xmm11
 603     addps  xmm7, xmm6   ;# 2*Heps2+Geps
 604     addps  xmm11, xmm10
 605
 606     addps  xmm7, xmm5  ;# FF = Fp + 2*Heps2 + Geps
 607     addps  xmm11, xmm9
 608     mulps  xmm5, xmm3  ;# eps*Fp
 609     mulps  xmm9, xmm3
 610     movaps xmm12, [rsp + nb133_c6]
 611     movaps xmm13, [rsp + nb133_c12]
 612     addps  xmm5, xmm4 ;# VV
 613     addps  xmm9, xmm8
 614
 615     mulps  xmm5, xmm12  ;# VV*c6 = vnb6
 616     mulps  xmm9, xmm13  ;# VV*c12 = vnb12
 617     addps  xmm5, xmm9
 618     addps  xmm5, [rsp + nb133_Vvdwtot]
 619     movaps [rsp + nb133_Vvdwtot], xmm5
 620
 621     mulps  xmm7, xmm12   ;# FF*c6 = fnb6
 622     mulps  xmm11, xmm13   ;# FF*c12  = fnb12
 623     addps  xmm7, xmm11
 624
 625     mulps  xmm7, [rsp + nb133_tsc]
 626     mulps  xmm7, xmm15   ;# -fscal
 627     xorps  xmm9, xmm9
 628
 629     subps  xmm9, xmm7     ;# fscal
 630     movaps xmm10, xmm9
 631     movaps xmm11, xmm9
 632
 633     mulps  xmm9,  [rsp + nb133_dxO] ;# fx/fy/fz
 634     mulps  xmm10, [rsp + nb133_dyO]
 635     mulps  xmm11, [rsp + nb133_dzO]
 636
 637     ;# save j force temporarily
 638     movaps [rsp + nb133_fjx], xmm9
 639     movaps [rsp + nb133_fjy], xmm10
 640     movaps [rsp + nb133_fjz], xmm11
 641
 642     ;# increment i O force
 643     addps xmm9, [rsp + nb133_fixO]
 644     addps xmm10, [rsp + nb133_fiyO]
 645     addps xmm11, [rsp + nb133_fizO]
 646     movaps [rsp + nb133_fixO], xmm9
 647     movaps [rsp + nb133_fiyO], xmm10
 648     movaps [rsp + nb133_fizO], xmm11
 649     ;# finished O LJ interaction.
 650
 651     ;# do H1, H2, and M interactions in parallel.
 652     ;# xmm0-xmm2 still contain j coordinates.
 653     movaps xmm3, xmm0
 654     movaps xmm4, xmm1
 655     movaps xmm5, xmm2
 656     movaps xmm6, xmm0
 657     movaps xmm7, xmm1
 658     movaps xmm8, xmm2
 659
 660     subps xmm0, [rsp + nb133_ixH1]
 661     subps xmm1, [rsp + nb133_iyH1]
 662     subps xmm2, [rsp + nb133_izH1]
 663     subps xmm3, [rsp + nb133_ixH2]
 664     subps xmm4, [rsp + nb133_iyH2]
 665     subps xmm5, [rsp + nb133_izH2]
 666     subps xmm6, [rsp + nb133_ixM]
 667     subps xmm7, [rsp + nb133_iyM]
 668     subps xmm8, [rsp + nb133_izM]
 669
 670         movaps [rsp + nb133_dxH1], xmm0
 671         movaps [rsp + nb133_dyH1], xmm1
 672         movaps [rsp + nb133_dzH1], xmm2
 673         mulps  xmm0, xmm0
 674         mulps  xmm1, xmm1
 675         mulps  xmm2, xmm2
 676         movaps [rsp + nb133_dxH2], xmm3
 677         movaps [rsp + nb133_dyH2], xmm4
 678         movaps [rsp + nb133_dzH2], xmm5
 679         mulps  xmm3, xmm3
 680         mulps  xmm4, xmm4
 681         mulps  xmm5, xmm5
 682         movaps [rsp + nb133_dxM], xmm6
 683         movaps [rsp + nb133_dyM], xmm7
 684         movaps [rsp + nb133_dzM], xmm8
 685         mulps  xmm6, xmm6
 686         mulps  xmm7, xmm7
 687         mulps  xmm8, xmm8
 688         addps  xmm0, xmm1
 689         addps  xmm0, xmm2
 690         addps  xmm3, xmm4
 691         addps  xmm3, xmm5
 692     addps  xmm6, xmm7
 693     addps  xmm6, xmm8
 694
 695         ;# start doing invsqrt for j atoms
 696         rsqrtps xmm1, xmm0
 697         rsqrtps xmm4, xmm3
 698     rsqrtps xmm7, xmm6
 699
 700         movaps  xmm2, xmm1
 701         movaps  xmm5, xmm4
 702     movaps  xmm8, xmm7
 703
 704         mulps   xmm1, xmm1 ;# lu*lu
 705         mulps   xmm4, xmm4 ;# lu*lu
 706     mulps   xmm7, xmm7 ;# lu*lu
 707
 708         movaps  xmm9, [rsp + nb133_three]
 709         movaps  xmm10, xmm9
 710     movaps  xmm11, xmm9
 711
 712         mulps   xmm1, xmm0 ;# rsq*lu*lu
 713         mulps   xmm4, xmm3 ;# rsq*lu*lu
 714     mulps   xmm7, xmm6 ;# rsq*lu*lu
 715
 716         subps   xmm9, xmm1
 717         subps   xmm10, xmm4
 718     subps   xmm11, xmm7 ;# 3-rsq*lu*lu
 719
 720         mulps   xmm9, xmm2
 721         mulps   xmm10, xmm5
 722     mulps   xmm11, xmm8 ;# lu*(3-rsq*lu*lu)
 723
 724         movaps  xmm0, [rsp + nb133_half]
 725         mulps   xmm9, xmm0  ;# rinvH1
 726         mulps   xmm10, xmm0 ;# rinvH2
 727     mulps   xmm11, xmm0 ;# rinvM
 728
 729         ;# interactions
 730     movaps xmm0, xmm9    ;# rinv
 731     movaps xmm1, xmm10
 732     movaps xmm2, xmm11
 733     mulps  xmm9, xmm9    ;# rinvsq
 734     mulps  xmm10, xmm10
 735     mulps  xmm11, xmm11
 736     mulps  xmm0, [rsp + nb133_qqH]
 737     mulps  xmm1, [rsp + nb133_qqH]
 738     mulps  xmm2, [rsp + nb133_qqM]
 739     mulps  xmm9, xmm0
 740     mulps  xmm10, xmm1
 741     mulps  xmm11, xmm2
 742
 743     addps xmm0, [rsp + nb133_vctot]
 744     addps xmm1, xmm2
 745     addps xmm0, xmm1
 746     movaps [rsp + nb133_vctot], xmm0
 747
 748         ;# move j forces to local temp variables
 749         mov rdi, [rbp + nb133_faction]
 750     movlps xmm0, [rdi + rax*4] ;# jxa jya  -   -
 751     movlps xmm1, [rdi + rcx*4] ;# jxc jyc  -   -
 752     movhps xmm0, [rdi + rbx*4] ;# jxa jya jxb jyb
 753     movhps xmm1, [rdi + rdx*4] ;# jxc jyc jxd jyd
 754
 755     movss  xmm2, [rdi + rax*4 + 8] ;# jza  -  -  -
 756     movss  xmm3, [rdi + rcx*4 + 8] ;# jzc  -  -  -
 757     movss  xmm5, [rdi + rbx*4 + 8] ;# jzb  - - -
 758     movss  xmm6, [rdi + rdx*4 + 8] ;# jzd - - -
 759     movlhps xmm2, xmm5
 760     movlhps xmm3, xmm6
 761
 762     shufps xmm2, xmm3,  136  ;# 10001000 => jza jzb jzc jzd
 763
 764     ;# xmm0: jxa jya jxb jyb
 765     ;# xmm1: jxc jyc jxd jyd
 766     ;# xmm2: jza jzb jzc jzd
 767
 768     movaps xmm7, xmm9
 769     movaps xmm8, xmm9
 770     movaps xmm13, xmm11
 771     movaps xmm14, xmm11
 772     movaps xmm15, xmm11
 773     movaps xmm11, xmm10
 774     movaps xmm12, xmm10
 775
 776         mulps xmm7, [rsp + nb133_dxH1]
 777         mulps xmm8, [rsp + nb133_dyH1]
 778         mulps xmm9, [rsp + nb133_dzH1]
 779         mulps xmm10, [rsp + nb133_dxH2]
 780         mulps xmm11, [rsp + nb133_dyH2]
 781         mulps xmm12, [rsp + nb133_dzH2]
 782         mulps xmm13, [rsp + nb133_dxM]
 783         mulps xmm14, [rsp + nb133_dyM]
 784         mulps xmm15, [rsp + nb133_dzM]
 785
 786     ;# fetch forces from O interaction
 787     movaps xmm3, [rsp + nb133_fjx]
 788     movaps xmm4, [rsp + nb133_fjy]
 789     addps  xmm2, [rsp + nb133_fjz]
 790
 791     addps xmm3, xmm7
 792     addps xmm4, xmm8
 793     addps xmm2, xmm9
 794     addps xmm7, [rsp + nb133_fixH1]
 795     addps xmm8, [rsp + nb133_fiyH1]
 796     addps xmm9, [rsp + nb133_fizH1]
 797
 798     addps xmm3, xmm10
 799     addps xmm4, xmm11
 800     addps xmm2, xmm12
 801     addps xmm10, [rsp + nb133_fixH2]
 802     addps xmm11, [rsp + nb133_fiyH2]
 803     addps xmm12, [rsp + nb133_fizH2]
 804
 805     addps xmm3, xmm13
 806     addps xmm4, xmm14
 807     addps xmm2, xmm15
 808     addps xmm13, [rsp + nb133_fixM]
 809     addps xmm14, [rsp + nb133_fiyM]
 810     addps xmm15, [rsp + nb133_fizM]
 811
 812     movaps [rsp + nb133_fixH1], xmm7
 813     movaps [rsp + nb133_fiyH1], xmm8
 814     movaps [rsp + nb133_fizH1], xmm9
 815     movaps [rsp + nb133_fixH2], xmm10
 816     movaps [rsp + nb133_fiyH2], xmm11
 817     movaps [rsp + nb133_fizH2], xmm12
 818     movaps [rsp + nb133_fixM], xmm13
 819     movaps [rsp + nb133_fiyM], xmm14
 820     movaps [rsp + nb133_fizM], xmm15
 821
 822     ;# xmm3 = fjx , xmm4 = fjy  , xmm2=fjz, already updated.
 823     movaps xmm5, xmm3
 824     unpcklps xmm3, xmm4   ;# fjx1 fjy1 fjx2 fjy2
 825     unpckhps xmm5, xmm4   ;# fjx3 fjy3 fjx4 fjy4
 826
 827     addps xmm0, xmm3
 828     addps xmm1, xmm5
 829     movhlps  xmm3, xmm2 ;# fjzc fjzd
 830
 831     movlps [rdi + rax*4], xmm0
 832     movhps [rdi + rbx*4], xmm0
 833     movlps [rdi + rcx*4], xmm1
 834     movhps [rdi + rdx*4], xmm1
 835     movss  [rdi + rax*4 + 8], xmm2
 836     movss  [rdi + rcx*4 + 8], xmm3
 837     shufps xmm2, xmm2, 1
 838     shufps xmm3, xmm3, 1
 839     movss  [rdi + rbx*4 + 8], xmm2
 840     movss  [rdi + rdx*4 + 8], xmm3
 841
 842         ;# should we do one more iteration?
 843         sub dword ptr [rsp + nb133_innerk],  4
 844         jl    .nb133_odd_inner
 845         jmp   .nb133_unroll_loop
 846 .nb133_odd_inner:
 847         add dword ptr [rsp + nb133_innerk],  4
 848         jnz   .nb133_odd_loop
 849         jmp   .nb133_updateouterdata
 850 .nb133_odd_loop:        ;# remainder loop: one j atom per pass against the four i sites, lanes = [O H1 H2 M]
 851         mov   rdx, [rsp + nb133_innerjjnr]      ;# pointer to jjnr[k]
 852         mov   eax, [rdx]                        ;# eax = jnr
 853         add qword ptr [rsp + nb133_innerjjnr],  4       ;# advance the pointer by one 32-bit index
 854
 855         xorps xmm4, xmm4        ;# clear reg.
 856         movss xmm4, [rsp + nb133_iqM]
 857         mov rsi, [rbp + nb133_charge]
 858         movhps xmm4, [rsp + nb133_iqH]  ;# [qM  0  qH  qH]
 859         shufps xmm4, xmm4, 41   ;# [0 qH qH qM] - lane 0 (O) gets zero charge
 860
 861         movss xmm3, [rsi + rax*4]       ;# charge in xmm3
 862         shufps xmm3, xmm3, 0            ;# broadcast the j charge to all lanes
 863         mulps xmm3, xmm4                ;# per-site charge product
 864         movaps [rsp + nb133_qqM], xmm3  ;# qq for all four sites, kept in the qqM slot
 865
 866         xorps xmm6, xmm6
 867         mov rsi, [rbp + nb133_type]
 868         mov ebx, [rsi + rax*4]          ;# ebx = type[jnr]
 869         mov rsi, [rbp + nb133_vdwparam]
 870         shl ebx, 1
 871         add ebx, [rsp + nb133_ntia]     ;# ebx = 2*type[jnr] + ntia
 872         movlps xmm6, [rsi + rbx*4]      ;# load the two adjacent vdw parameters for this pair
 873         movaps xmm7, xmm6
 874         shufps xmm6, xmm6, 252  ;# constant 11111100 - first parameter in lane 0, zeros elsewhere
 875         shufps xmm7, xmm7, 253  ;# constant 11111101 - second parameter in lane 0, zeros elsewhere
 876         movaps [rsp + nb133_c6], xmm6
 877         movaps [rsp + nb133_c12], xmm7
 878
 879         mov rsi, [rbp + nb133_pos]
 880         lea rax, [rax + rax*2]          ;# rax = 3*jnr (j3)
 881
 882         movss xmm0, [rsp + nb133_ixO]
 883         movss xmm1, [rsp + nb133_iyO]
 884         movss xmm2, [rsp + nb133_izO]
 885         movss xmm3, [rsp + nb133_ixH1]
 886         movss xmm4, [rsp + nb133_iyH1]
 887         movss xmm5, [rsp + nb133_izH1]
 888         unpcklps xmm0, [rsp + nb133_ixH2]       ;# ixO ixH2 - -
 889         unpcklps xmm1, [rsp + nb133_iyH2]       ;# iyO iyH2 - -
 890         unpcklps xmm2, [rsp + nb133_izH2]       ;# izO izH2 - -
 891         unpcklps xmm3, [rsp + nb133_ixM]        ;# ixH1 ixM - -
 892         unpcklps xmm4, [rsp + nb133_iyM]        ;# iyH1 iyM - -
 893         unpcklps xmm5, [rsp + nb133_izM]        ;# izH1 izM - -
 894         unpcklps xmm0, xmm3     ;# ixO ixH1 ixH2 ixM
 895         unpcklps xmm1, xmm4     ;# same for y
 896         unpcklps xmm2, xmm5     ;# same for z
 897
 898         ;# move j coords to xmm3-xmm5 (broadcast to all lanes)
 899         movss xmm3, [rsi + rax*4]
 900         movss xmm4, [rsi + rax*4 + 4]
 901         movss xmm5, [rsi + rax*4 + 8]
 902         shufps xmm3, xmm3, 0
 903         shufps xmm4, xmm4, 0
 904         shufps xmm5, xmm5, 0
 905
 906         subps xmm3, xmm0        ;# dx = jx - ix for each site
 907         subps xmm4, xmm1        ;# dy
 908         subps xmm5, xmm2        ;# dz
 909
 910         ;# use O distances for storage
 911         movaps [rsp + nb133_dxO], xmm3
 912         movaps [rsp + nb133_dyO], xmm4
 913         movaps [rsp + nb133_dzO], xmm5
 914
 915         mulps  xmm3, xmm3
 916         mulps  xmm4, xmm4
 917         mulps  xmm5, xmm5
 918
 919         addps  xmm4, xmm3
 920         addps  xmm4, xmm5
 921         ;# rsq in xmm4
 922
 923         rsqrtps xmm5, xmm4
 924         ;# lookup seed in xmm5
 925         movaps xmm2, xmm5
 926         mulps xmm5, xmm5
 927         movaps xmm1, [rsp + nb133_three]
 928         mulps xmm5, xmm4        ;# rsq*lu*lu
 929         movaps xmm0, [rsp + nb133_half]
 930         subps xmm1, xmm5        ;# three - rsq*lu*lu
 931         mulps xmm1, xmm2
 932         mulps xmm0, xmm1        ;# xmm0=rinv (seed refined by one iteration), xmm4=rsq
 933
 934         ;# LJ table interaction
 935         mulps xmm4, xmm0        ;# r = rsq*rinv
 936         mulps  xmm4, [rsp + nb133_tsc] ;# rtab
 937
 938         cvttps2pi mm6, xmm4     ;# truncate rtab to integer table index
 939         cvtpi2ps xmm6, mm6
 940         subss  xmm4, xmm6       ;# fractional part, lane 0 only (the LJ part is scalar)
 941         movss xmm1, xmm4        ;# xmm1=eps
 942         movss xmm2, xmm1
 943         mulss  xmm2, xmm2       ;# xmm2=eps2
 944         pslld mm6, 3            ;# index*8: eight floats per table point are addressed below
 945
 946         movd mm0, eax           ;# park j3 in mm0 while eax serves as table offset
 947
 948         mov  rsi, [rbp + nb133_VFtab]
 949         movd eax, mm6
 950
 951         ;# dispersion
 952         movlps xmm5, [rsi + rax*4]
 953         movaps xmm4, xmm5
 954         shufps xmm4, xmm7, 136  ;# constant 10001000
 955         shufps xmm5, xmm7, 221  ;# constant 11011101
 956
 957         movlps xmm7, [rsi + rax*4 + 8]
 958         movaps xmm6, xmm7
 959         shufps xmm6, xmm3, 136  ;# constant 10001000
 960         shufps xmm7, xmm3, 221  ;# constant 11011101
 961         ;# dispersion table ready, in xmm4-xmm7 (lane 0 of each; upper lanes are don't-care)
 962
 963         mulss  xmm6, xmm1       ;# xmm6=Geps
 964         mulss  xmm7, xmm2       ;# xmm7=Heps2
 965         addss  xmm5, xmm6
 966         addss  xmm5, xmm7       ;# xmm5=Fp
 967         mulss  xmm7, [rsp + nb133_two]  ;# two*Heps2
 968         addss  xmm7, xmm6
 969         addss  xmm7, xmm5 ;# xmm7=FF
 970         mulss  xmm5, xmm1 ;# xmm5=eps*Fp
 971         addss  xmm5, xmm4 ;# xmm5=VV
 972
 973         movss xmm4, [rsp + nb133_c6]
 974         mulss  xmm7, xmm4        ;# fijD
 975         mulss  xmm5, xmm4        ;# Vvdw6
 976         mulss  xmm7, [rsp + nb133_tsc]
 977         ;# put scalar force on stack Update Vvdwtot directly
 978         addss  xmm5, [rsp + nb133_Vvdwtot]
 979         movss [rsp + nb133_fstmp], xmm7
 980         movss [rsp + nb133_Vvdwtot], xmm5
 981
 982         ;# repulsion
 983         movlps xmm5, [rsi + rax*4 + 16]
 984         movaps xmm4, xmm5
 985         shufps xmm4, xmm7, 136  ;# constant 10001000
 986         shufps xmm5, xmm7, 221  ;# constant 11011101
 987
 988         movlps xmm7, [rsi + rax*4 + 24]
 989         movaps xmm6, xmm7
 990         shufps xmm6, xmm3, 136  ;# constant 10001000
 991         shufps xmm7, xmm3, 221  ;# constant 11011101
 992         ;# table ready, in xmm4-xmm7
 993         mulss  xmm6, xmm1       ;# xmm6=Geps
 994         mulss  xmm7, xmm2       ;# xmm7=Heps2
 995         addss  xmm5, xmm6
 996         addss  xmm5, xmm7       ;# xmm5=Fp
 997         mulss  xmm7, [rsp + nb133_two]  ;# two*Heps2
 998         addss  xmm7, xmm6
 999         addss  xmm7, xmm5 ;# xmm7=FF
1000         mulss  xmm5, xmm1 ;# xmm5=eps*Fp
1001         addss  xmm5, xmm4 ;# xmm5=VV
1002
1003         movss xmm4, [rsp + nb133_c12]
1004         mulss  xmm7, xmm4 ;# fijR
1005         mulss  xmm5, xmm4 ;# Vvdw12
1006         mulss  xmm7, [rsp + nb133_tsc]
1007         addss  xmm7, [rsp + nb133_fstmp]
1008         movss [rsp + nb133_fstmp], xmm7 ;# fstmp = (fijD + fijR)*tsc
1009         addss  xmm5, [rsp + nb133_Vvdwtot]
1010         movss [rsp + nb133_Vvdwtot], xmm5
1011
1012         movd eax, mm0           ;# restore j3
1013
1014         movaps xmm4, xmm0
1015         mulps  xmm4, [rsp + nb133_qqM]  ;# rinv*qq for every site
1016         movaps xmm2, xmm4               ;# keep a copy for vctot
1017         mulps  xmm4, xmm0
1018         subss  xmm4, [rsp + nb133_fstmp]        ;# table force term applied to lane 0 only
1019         mulps  xmm4, xmm0               ;# xmm4 = scalar force factor per site
1020
1021         addps  xmm2, [rsp + nb133_vctot]
1022         movaps [rsp + nb133_vctot], xmm2
1023
1024         movaps xmm0, [rsp + nb133_dxO]
1025         movaps xmm1, [rsp + nb133_dyO]
1026         movaps xmm2, [rsp + nb133_dzO]
1027
1028         mulps  xmm0, xmm4
1029         mulps  xmm1, xmm4
1030         mulps  xmm2, xmm4 ;# xmm0-xmm2 now contains tx-tz (partial force)
1031
1032         movss  xmm3, [rsp + nb133_fixO]
1033         movss  xmm4, [rsp + nb133_fiyO]
1034         movss  xmm5, [rsp + nb133_fizO]
1035         addss  xmm3, xmm0
1036         addss  xmm4, xmm1
1037         addss  xmm5, xmm2
1038         movss  [rsp + nb133_fixO], xmm3
1039         movss  [rsp + nb133_fiyO], xmm4
1040         movss  [rsp + nb133_fizO], xmm5 ;# updated the O force now do the H's
1041
1042         movaps xmm3, xmm0
1043         movaps xmm4, xmm1
1044         movaps xmm5, xmm2
1045         shufps xmm3, xmm3, 0x39 ;# shift right (rotate lanes: lane 1 moves to lane 0)
1046         shufps xmm4, xmm4, 0x39
1047         shufps xmm5, xmm5, 0x39
1048         addss  xmm3, [rsp + nb133_fixH1]
1049         addss  xmm4, [rsp + nb133_fiyH1]
1050         addss  xmm5, [rsp + nb133_fizH1]
1051         movss  [rsp + nb133_fixH1], xmm3
1052         movss  [rsp + nb133_fiyH1], xmm4
1053         movss  [rsp + nb133_fizH1], xmm5        ;# updated the H1 force
1054
1055         shufps xmm3, xmm3, 0x39 ;# rotate again: the H2 term is now in lane 0
1056         shufps xmm4, xmm4, 0x39
1057         shufps xmm5, xmm5, 0x39
1058         addss  xmm3, [rsp + nb133_fixH2]
1059         addss  xmm4, [rsp + nb133_fiyH2]
1060         addss  xmm5, [rsp + nb133_fizH2]
1061         movss  [rsp + nb133_fixH2], xmm3
1062         movss  [rsp + nb133_fiyH2], xmm4
1063         movss  [rsp + nb133_fizH2], xmm5        ;# updated the H2 force
1064
1065         mov rdi, [rbp + nb133_faction]
1066         shufps xmm3, xmm3, 0x39 ;# rotate again: the M term is now in lane 0
1067         shufps xmm4, xmm4, 0x39
1068         shufps xmm5, xmm5, 0x39
1069         addss  xmm3, [rsp + nb133_fixM]
1070         addss  xmm4, [rsp + nb133_fiyM]
1071         addss  xmm5, [rsp + nb133_fizM]
1072         movss  [rsp + nb133_fixM], xmm3
1073         movss  [rsp + nb133_fiyM], xmm4
1074         movss  [rsp + nb133_fizM], xmm5 ;# updated the M force
1075
1076         ;# the fj's - move in from mem start by acc. tx/ty/tz in xmm0, xmm1
1077         movlps xmm6, [rdi + rax*4]
1078         movss  xmm7, [rdi + rax*4 + 8]
1079
1080         movhlps xmm3, xmm0      ;# fold the upper two lanes onto the lower two
1081         movhlps xmm4, xmm1
1082         movhlps xmm5, xmm2
1083         addps   xmm3, xmm0
1084         addps   xmm4, xmm1
1085         addps   xmm5, xmm2
1086         movaps  xmm0, xmm3
1087         movaps  xmm1, xmm4
1088         movaps  xmm2, xmm5
1089
1090         shufps xmm3, xmm3, 0x39 ;# shift right
1091         shufps xmm4, xmm4, 0x39
1092         shufps xmm5, xmm5, 0x39
1093         addss  xmm0, xmm3       ;# lane 0 = sum over all four sites
1094         addss  xmm1, xmm4
1095         addss  xmm2, xmm5
1096         unpcklps xmm0, xmm1     ;# x,y sum in xmm0, z sum in xmm2
1097
1098         addps    xmm6, xmm0     ;# fj += summed partial force
1099         addss    xmm7, xmm2
1100
1101         movlps [rdi + rax*4],     xmm6
1102         movss  [rdi + rax*4 + 8], xmm7
1103
1104         dec dword ptr [rsp + nb133_innerk]      ;# one leftover j atom done
1105         jz    .nb133_updateouterdata
1106         jmp   .nb133_odd_loop
1107 .nb133_updateouterdata: ;# reduce the per-site accumulators; update i forces, fshift and the two energy sums
1108         mov   ecx, [rsp + nb133_ii3]
1109         mov   rdi, [rbp + nb133_faction]
1110         mov   rsi, [rbp + nb133_fshift]
1111         mov   edx, [rsp + nb133_is3]
1112
1113         ;# accumulate  Oi forces in xmm0, xmm1, xmm2
1114         movaps xmm0, [rsp + nb133_fixO]
1115         movaps xmm1, [rsp + nb133_fiyO]
1116         movaps xmm2, [rsp + nb133_fizO]
1117
1118         movhlps xmm3, xmm0
1119         movhlps xmm4, xmm1
1120         movhlps xmm5, xmm2
1121         addps  xmm0, xmm3
1122         addps  xmm1, xmm4
1123         addps  xmm2, xmm5 ;# sum is in 1/2 in xmm0-xmm2
1124
1125         movaps xmm3, xmm0
1126         movaps xmm4, xmm1
1127         movaps xmm5, xmm2
1128
1129         shufps xmm3, xmm3, 1
1130         shufps xmm4, xmm4, 1
1131         shufps xmm5, xmm5, 1
1132         addss  xmm0, xmm3
1133         addss  xmm1, xmm4
1134         addss  xmm2, xmm5       ;# xmm0-xmm2 has single force in pos0
1135
1136         ;# update i force: the accumulated sum is subtracted from faction[ii3 + 0..2]
1137         movss  xmm3, [rdi + rcx*4]
1138         movss  xmm4, [rdi + rcx*4 + 4]
1139         movss  xmm5, [rdi + rcx*4 + 8]
1140         subss  xmm3, xmm0
1141         subss  xmm4, xmm1
1142         subss  xmm5, xmm2
1143         movss  [rdi + rcx*4],     xmm3
1144         movss  [rdi + rcx*4 + 4], xmm4
1145         movss  [rdi + rcx*4 + 8], xmm5
1146
1147         ;# accumulate force in xmm6/xmm7 for fshift
1148         movaps xmm6, xmm0
1149         movss xmm7, xmm2
1150         movlhps xmm6, xmm1
1151         shufps  xmm6, xmm6, 8 ;# constant 00001000 - packs x into lane 0 and y into lane 1
1152
1153         ;# accumulate H1i forces in xmm0, xmm1, xmm2
1154         movaps xmm0, [rsp + nb133_fixH1]
1155         movaps xmm1, [rsp + nb133_fiyH1]
1156         movaps xmm2, [rsp + nb133_fizH1]
1157
1158         movhlps xmm3, xmm0
1159         movhlps xmm4, xmm1
1160         movhlps xmm5, xmm2
1161         addps  xmm0, xmm3
1162         addps  xmm1, xmm4
1163         addps  xmm2, xmm5 ;# sum is in 1/2 in xmm0-xmm2
1164
1165         movaps xmm3, xmm0
1166         movaps xmm4, xmm1
1167         movaps xmm5, xmm2
1168
1169         shufps xmm3, xmm3, 1
1170         shufps xmm4, xmm4, 1
1171         shufps xmm5, xmm5, 1
1172         addss  xmm0, xmm3
1173         addss  xmm1, xmm4
1174         addss  xmm2, xmm5       ;# xmm0-xmm2 has single force in pos0
1175
1176         ;# update i force: the accumulated sum is subtracted from faction[ii3 + 3..5]
1177         movss  xmm3, [rdi + rcx*4 + 12]
1178         movss  xmm4, [rdi + rcx*4 + 16]
1179         movss  xmm5, [rdi + rcx*4 + 20]
1180         subss  xmm3, xmm0
1181         subss  xmm4, xmm1
1182         subss  xmm5, xmm2
1183         movss  [rdi + rcx*4 + 12], xmm3
1184         movss  [rdi + rcx*4 + 16], xmm4
1185         movss  [rdi + rcx*4 + 20], xmm5
1186
1187         ;# accumulate force in xmm6/xmm7 for fshift
1188         addss xmm7, xmm2
1189         movlhps xmm0, xmm1
1190         shufps  xmm0, xmm0, 8 ;# constant 00001000
1191         addps   xmm6, xmm0
1192
1193         ;# accumulate H2i forces in xmm0, xmm1, xmm2
1194         movaps xmm0, [rsp + nb133_fixH2]
1195         movaps xmm1, [rsp + nb133_fiyH2]
1196         movaps xmm2, [rsp + nb133_fizH2]
1197
1198         movhlps xmm3, xmm0
1199         movhlps xmm4, xmm1
1200         movhlps xmm5, xmm2
1201         addps  xmm0, xmm3
1202         addps  xmm1, xmm4
1203         addps  xmm2, xmm5 ;# sum is in 1/2 in xmm0-xmm2
1204
1205         movaps xmm3, xmm0
1206         movaps xmm4, xmm1
1207         movaps xmm5, xmm2
1208
1209         shufps xmm3, xmm3, 1
1210         shufps xmm4, xmm4, 1
1211         shufps xmm5, xmm5, 1
1212         addss  xmm0, xmm3
1213         addss  xmm1, xmm4
1214         addss  xmm2, xmm5       ;# xmm0-xmm2 has single force in pos0
1215
1216         ;# update i force: the accumulated sum is subtracted from faction[ii3 + 6..8]
1217         movss  xmm3, [rdi + rcx*4 + 24]
1218         movss  xmm4, [rdi + rcx*4 + 28]
1219         movss  xmm5, [rdi + rcx*4 + 32]
1220         subss  xmm3, xmm0
1221         subss  xmm4, xmm1
1222         subss  xmm5, xmm2
1223         movss  [rdi + rcx*4 + 24], xmm3
1224         movss  [rdi + rcx*4 + 28], xmm4
1225         movss  [rdi + rcx*4 + 32], xmm5
1226
1227         ;# accumulate force in xmm6/xmm7 for fshift
1228         addss xmm7, xmm2
1229         movlhps xmm0, xmm1
1230         shufps  xmm0, xmm0, 8 ;# constant 00001000
1231         addps   xmm6, xmm0
1232
1233         ;# accumulate Mi forces in xmm0, xmm1, xmm2
1234         movaps xmm0, [rsp + nb133_fixM]
1235         movaps xmm1, [rsp + nb133_fiyM]
1236         movaps xmm2, [rsp + nb133_fizM]
1237
1238         movhlps xmm3, xmm0
1239         movhlps xmm4, xmm1
1240         movhlps xmm5, xmm2
1241         addps  xmm0, xmm3
1242         addps  xmm1, xmm4
1243         addps  xmm2, xmm5 ;# sum is in 1/2 in xmm0-xmm2
1244
1245         movaps xmm3, xmm0
1246         movaps xmm4, xmm1
1247         movaps xmm5, xmm2
1248
1249         shufps xmm3, xmm3, 1
1250         shufps xmm4, xmm4, 1
1251         shufps xmm5, xmm5, 1
1252         addss  xmm0, xmm3
1253         addss  xmm1, xmm4
1254         addss  xmm2, xmm5       ;# xmm0-xmm2 has single force in pos0
1255
1256         ;# update i force: the accumulated sum is subtracted from faction[ii3 + 9..11]
1257         movss  xmm3, [rdi + rcx*4 + 36]
1258         movss  xmm4, [rdi + rcx*4 + 40]
1259         movss  xmm5, [rdi + rcx*4 + 44]
1260         subss  xmm3, xmm0
1261         subss  xmm4, xmm1
1262         subss  xmm5, xmm2
1263         movss  [rdi + rcx*4 + 36], xmm3
1264         movss  [rdi + rcx*4 + 40], xmm4
1265         movss  [rdi + rcx*4 + 44], xmm5
1266
1267         ;# accumulate force in xmm6/xmm7 for fshift
1268         addss xmm7, xmm2
1269         movlhps xmm0, xmm1
1270         shufps  xmm0, xmm0, 8 ;# constant 00001000
1271         addps   xmm6, xmm0
1272
1273         ;# update fshift force: the sum over all four sites is subtracted from fshift[is3 + 0..2]
1274         movlps  xmm3, [rsi + rdx*4]
1275         movss  xmm4, [rsi + rdx*4 + 8]
1276         subps  xmm3, xmm6
1277         subss  xmm4, xmm7
1278         movlps  [rsi + rdx*4],    xmm3
1279         movss  [rsi + rdx*4 + 8], xmm4
1280
1281         ;# get n from stack
1282         mov esi, [rsp + nb133_n]
1283         ;# get group index for i particle
1284         mov   rdx, [rbp + nb133_gid]            ;# base of gid[]
1285         mov   edx, [rdx + rsi*4]                ;# ggid=gid[n]
1286
1287         ;# accumulate total potential energy and update it
1288         movaps xmm7, [rsp + nb133_vctot]
1289         ;# accumulate
1290         movhlps xmm6, xmm7
1291         addps  xmm7, xmm6       ;# pos 0-1 in xmm7 have the sum now
1292         movaps xmm6, xmm7
1293         shufps xmm6, xmm6, 1
1294         addss  xmm7, xmm6       ;# lane 0 = sum of all four lanes of vctot
1295
1296         ;# add earlier value from mem
1297         mov   rax, [rbp + nb133_Vc]
1298         addss xmm7, [rax + rdx*4]
1299         ;# move back to mem
1300         movss [rax + rdx*4], xmm7       ;# Vc[ggid] += vctot
1301
1302         ;# accumulate total lj energy and update it
1303         movaps xmm7, [rsp + nb133_Vvdwtot]
1304         ;# accumulate
1305         movhlps xmm6, xmm7
1306         addps  xmm7, xmm6       ;# pos 0-1 in xmm7 have the sum now
1307         movaps xmm6, xmm7
1308         shufps xmm6, xmm6, 1
1309         addss  xmm7, xmm6       ;# lane 0 = sum of all four lanes of Vvdwtot
1310
1311         ;# add earlier value from mem
1312         mov   rax, [rbp + nb133_Vvdw]
1313         addss xmm7, [rax + rdx*4]
1314         ;# move back to mem
1315         movss [rax + rdx*4], xmm7       ;# Vvdw[ggid] += Vvdwtot
1316
1317         ;# finish if last
1318         mov ecx, [rsp + nb133_nn1]
1319         ;# esi already loaded with n
1320         inc esi
1321         sub ecx, esi            ;# nn1 - (n+1); zero means this work unit is finished
1322         jz .nb133_outerend
1323
1324         ;# not last, iterate outer loop once more!
1325         mov [rsp + nb133_n], esi
1326         jmp .nb133_outer
1327 .nb133_outerend:        ;# current work unit finished: stop, or go claim another one
1328         ;# check if more outer neighborlists remain
1329         mov   ecx, [rsp + nb133_nri]
1330         ;# esi already loaded with n above
1331         sub   ecx, esi          ;# nri - n; zero means every outer list has been handled
1332         jz .nb133_end
1333         ;# non-zero, do one more workunit
1334         jmp   .nb133_threadloop
1335 .nb133_end:             ;# epilogue: report iteration counts, free locals, restore saved registers, return
1336         mov eax, [rsp + nb133_nouter]
1337         mov ebx, [rsp + nb133_ninner]
1338         mov rcx, [rbp + nb133_outeriter]
1339         mov rdx, [rbp + nb133_inneriter]
1340         mov [rcx], eax          ;# *outeriter = nouter
1341         mov [rdx], ebx          ;# *inneriter = ninner
1342
1343         add rsp, 1072           ;# release the local variable area (presumably mirrors the prologue allocation)
1344         emms                    ;# leave MMX state so x87 code can run after the mm0-mm6 use above
1345
1346     ;# Restore xmm registers from stack
1347     movaps xmm6,  [rsp      ]
1348     movaps xmm7,  [rsp + 16 ]
1349     movaps xmm8,  [rsp + 32 ]
1350     movaps xmm9,  [rsp + 48 ]
1351     movaps xmm10, [rsp + 64 ]
1352     movaps xmm11, [rsp + 80 ]
1353     movaps xmm12, [rsp + 96 ]
1354     movaps xmm13, [rsp + 112]
1355     movaps xmm14, [rsp + 128]
1356     movaps xmm15, [rsp + 144]
1357
1358     ;# Reset pointers after restoring xmm6-15
1359     add rsp, 168
1360
1361     pop r15
1362     pop r14
1363     pop r13
1364     pop r12
1365     pop rdi
1366     pop rsi
1367     pop rbx
1368
1369         pop     rbp
1370         ret
1371
1372
1373
1374
1375
1376
1377 .globl nb_kernel133nf_x86_64_sse
1378 .globl _nb_kernel133nf_x86_64_sse
1379 nb_kernel133nf_x86_64_sse:
1380 _nb_kernel133nf_x86_64_sse:
1381 ;#      Stack-passed arguments, addressed off rbp. Room for return address and rbp (16 bytes)
1382 .equiv          nb133nf_fshift,         16
1383 .equiv          nb133nf_gid,            24
1384 .equiv          nb133nf_pos,            32
1385 .equiv          nb133nf_faction,        40
1386 .equiv          nb133nf_charge,         48
1387 .equiv          nb133nf_p_facel,        56
1388 .equiv          nb133nf_argkrf,         64
1389 .equiv          nb133nf_argcrf,         72
1390 .equiv          nb133nf_Vc,             80
1391 .equiv          nb133nf_type,           88
1392 .equiv          nb133nf_p_ntype,        96
1393 .equiv          nb133nf_vdwparam,       104
1394 .equiv          nb133nf_Vvdw,           112
1395 .equiv          nb133nf_p_tabscale,     120
1396 .equiv          nb133nf_VFtab,          128
1397 .equiv          nb133nf_invsqrta,       136
1398 .equiv          nb133nf_dvda,           144
1399 .equiv          nb133nf_p_gbtabscale,   152
1400 .equiv          nb133nf_GBtab,          160
1401 .equiv          nb133nf_p_nthreads,     168
1402 .equiv          nb133nf_count,          176
1403 .equiv          nb133nf_mtx,            184
1404 .equiv          nb133nf_outeriter,      192
1405 .equiv          nb133nf_inneriter,      200
1406 .equiv          nb133nf_work,           208
1407         ;# stack offsets for local variables (relative to rsp after the prologue)
1408         ;# bottom of stack is cache-aligned for sse use
1409 .equiv          nb133nf_ixO,            0
1410 .equiv          nb133nf_iyO,            16
1411 .equiv          nb133nf_izO,            32
1412 .equiv          nb133nf_ixH1,           48
1413 .equiv          nb133nf_iyH1,           64
1414 .equiv          nb133nf_izH1,           80
1415 .equiv          nb133nf_ixH2,           96
1416 .equiv          nb133nf_iyH2,           112
1417 .equiv          nb133nf_izH2,           128
1418 .equiv          nb133nf_ixM,            144
1419 .equiv          nb133nf_iyM,            160
1420 .equiv          nb133nf_izM,            176
1421 .equiv          nb133nf_iqM,            192
1422 .equiv          nb133nf_iqH,            208
1423 .equiv          nb133nf_qqM,            224
1424 .equiv          nb133nf_qqH,            240
1425 .equiv          nb133nf_rinvH1,         256
1426 .equiv          nb133nf_rinvH2,         272
1427 .equiv          nb133nf_rinvM,          288
1428 .equiv          nb133nf_tsc,            304
1429 .equiv          nb133nf_c6,             320
1430 .equiv          nb133nf_c12,            336
1431 .equiv          nb133nf_krf,            352
1432 .equiv          nb133nf_crf,            368
1433 .equiv          nb133nf_krsqH1,         384
1434 .equiv          nb133nf_krsqH2,         400
1435 .equiv          nb133nf_krsqM,          416
1436 .equiv          nb133nf_vctot,          432
1437 .equiv          nb133nf_Vvdwtot,        448
1438 .equiv          nb133nf_half,           464
1439 .equiv          nb133nf_three,          480
1440 .equiv          nb133nf_nri,            496
1441 .equiv          nb133nf_iinr,           504
1442 .equiv          nb133nf_jindex,         512
1443 .equiv          nb133nf_jjnr,           520
1444 .equiv          nb133nf_shift,          528
1445 .equiv          nb133nf_shiftvec,       536
1446 .equiv          nb133nf_facel,          544
1447 .equiv          nb133nf_innerjjnr,      552
1448 .equiv          nb133nf_is3,            560
1449 .equiv          nb133nf_ii3,            564
1450 .equiv          nb133nf_ntia,           568
1451 .equiv          nb133nf_innerk,         572
1452 .equiv          nb133nf_n,              576
1453 .equiv          nb133nf_nn1,            580
1454 .equiv          nb133nf_nouter,         584
1455 .equiv          nb133nf_ninner,         588
1456
1457         push rbp
1458         mov  rbp, rsp           ;# rbp is the base for the stack-passed arguments above
1459
1460     ;# Push integer registers on stack
1461         push rbx
1462     push rsi
1463     push rdi
1464     push r12
1465     push r13
1466     push r14
1467     push r15
1468
1469     ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes); the extra 8 bytes are padding
1470     sub rsp, 168
1471
1472     ;# Save xmm registers to stack
1473     movaps [rsp      ], xmm6
1474     movaps [rsp + 16 ], xmm7
1475     movaps [rsp + 32 ], xmm8
1476     movaps [rsp + 48 ], xmm9
1477     movaps [rsp + 64 ], xmm10
1478     movaps [rsp + 80 ], xmm11
1479     movaps [rsp + 96 ], xmm12
1480     movaps [rsp + 112], xmm13
1481     movaps [rsp + 128], xmm14
1482     movaps [rsp + 144], xmm15
1483
1484         emms
1485         sub rsp, 592            ;# local variable stack space (37*16 bytes; the offsets above end at 588+4)
1486 ; .if 0    # block below only read by NASM - special calling convention on win64
1487 %ifidn __OUTPUT_FORMAT__, win64
1488     ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
1489     add rbp, 48
1490     ;# Adjust stack pointer for different alignment
1491     ;# Move around arguments to fit AMD64 convention below
1492     ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
1493     ;# win64 passes args in: rcx,rdx,r8,r9         + stack
1494     mov rdi, rcx
1495     mov rsi, rdx
1496     mov rdx, r8
1497     mov rcx, r9
1498     mov r8,  [rbp]
1499     mov r9,  [rbp + 8]
1500 %endif
1501 ; .endif   # end NASM- and win64-specific block
1502
1503         ;# zero 32-bit iteration counters
1504         mov eax, 0
1505         mov [rsp + nb133nf_nouter], eax
1506         mov [rsp + nb133nf_ninner], eax
1507
1508         mov edi, [rdi]                  ;# first argument is a pointer; load the value it points to
1509         mov [rsp + nb133nf_nri], edi
1510         mov [rsp + nb133nf_iinr], rsi   ;# second register argument
1511         mov [rsp + nb133nf_jindex], rdx ;# third
1512         mov [rsp + nb133nf_jjnr], rcx   ;# fourth
1513         mov [rsp + nb133nf_shift], r8   ;# fifth
1514         mov [rsp + nb133nf_shiftvec], r9        ;# sixth
1515         mov rsi, [rbp + nb133nf_p_facel]
1516         movss xmm0, [rsi]
1517         movss [rsp + nb133nf_facel], xmm0
1518
1519         mov rax, [rbp + nb133nf_p_tabscale]
1520         movss xmm3, [rax]
1521         shufps xmm3, xmm3, 0            ;# broadcast tabscale to all four lanes
1522         movaps [rsp + nb133nf_tsc], xmm3
1523
1524         ;# create constant floating-point factors on stack
1525         mov eax, 0x3f000000     ;# half in IEEE (hex)
1526         mov [rsp + nb133nf_half], eax
1527         movss xmm1, [rsp + nb133nf_half]
1528         shufps xmm1, xmm1, 0    ;# splat to all elements
1529         movaps xmm2, xmm1
1530         addps  xmm2, xmm2       ;# one
1531         movaps xmm3, xmm2
1532         addps  xmm2, xmm2       ;# two
1533         addps  xmm3, xmm2       ;# three
1534         movaps [rsp + nb133nf_half],  xmm1
1535         movaps [rsp + nb133nf_three],  xmm3
1536
1537         ;# assume we have at least one i particle - start directly
1538         mov   rcx, [rsp + nb133nf_iinr]       ;# rcx = pointer into iinr[]
1539         mov   ebx, [rcx]            ;# ebx =ii
1540
1541         mov   rdx, [rbp + nb133nf_charge]
1542         movss xmm4, [rdx + rbx*4 + 4]   ;# charge[ii+1], becomes iqH below
1543         movss xmm3, [rdx + rbx*4 + 12]  ;# charge[ii+3], becomes iqM below
1544         mov rsi, [rbp + nb133nf_p_facel]        ;# NOTE(review): this reload looks redundant; facel is read from the stack copy below
1545         movss xmm0, [rsi]               ;# NOTE(review): xmm0 is not read again in this setup code - confirm before removing
1546         movss xmm5, [rsp + nb133nf_facel]
1547         mulss  xmm3, xmm5
1548         mulss  xmm4, xmm5
1549
1550         shufps xmm3, xmm3, 0
1551         shufps xmm4, xmm4, 0
1552         movaps [rsp + nb133nf_iqM], xmm3        ;# facel*charge[ii+3] in all lanes
1553         movaps [rsp + nb133nf_iqH], xmm4        ;# facel*charge[ii+1] in all lanes
1554
1555         mov   rdx, [rbp + nb133nf_type]
1556         mov   ecx, [rdx + rbx*4]
1557         shl   ecx, 1
1558         mov rdi, [rbp + nb133nf_p_ntype]
1559         imul  ecx, [rdi]      ;# rcx = ntia = 2*ntype*type[ii0]
1560         mov   [rsp + nb133nf_ntia], ecx
1561
1562 .nb133nf_threadloop:    ;# claim the next outer-list index by atomically bumping the shared counter
1563         mov   rsi, [rbp + nb133nf_count]          ;# pointer to sync counter
1564         mov   eax, [rsi]
1565 .nb133nf_spinlock:
1566         mov   ebx, eax                          ;# ebx=*count=nn0
1567         add   ebx, 1                           ;# ebx=nn1=nn0+1
1568         lock
1569         cmpxchg [rsi], ebx                      ;# write nn1 to *counter,
1570                                                 ;# if it hasnt changed.
1571                                                 ;# or reread *counter to eax.
1572         pause                                   ;# -> better p4 performance
1573         jnz .nb133nf_spinlock                   ;# flags are still those of cmpxchg: retry if the exchange failed
1574
1575         ;# if(nn1>nri) nn1=nri
1576         mov ecx, [rsp + nb133nf_nri]
1577         mov edx, ecx
1578         sub ecx, ebx                            ;# nri - nn1, sets the flags for cmovle
1579         cmovle ebx, edx                         ;# if(nn1>nri) nn1=nri
1580         ;# Cleared the spinlock if we got here.
1581         ;# eax contains nn0, ebx contains nn1.
1582         mov [rsp + nb133nf_n], eax
1583         mov [rsp + nb133nf_nn1], ebx
1584         sub ebx, eax                            ;# calc number of outer lists
1585         mov esi, eax                            ;# copy n to esi (mov leaves the flags of the sub intact)
1586         jg  .nb133nf_outerstart                 ;# at least one outer list in this work unit
1587         jmp .nb133nf_end
1588
1589 .nb133nf_outerstart:    ;# add this work unit's outer-list count to the running total
1590         ;# ebx contains number of outer iterations
1591         add ebx, [rsp + nb133nf_nouter]
1592         mov [rsp + nb133nf_nouter], ebx
1593
1594 .nb133nf_outer:         ;# per-i setup for list n (esi): shifted O/H1/H2/M coordinates, zeroed energies, inner loop bounds
1595         mov   rax, [rsp + nb133nf_shift]      ;# rax = pointer into shift[]
1596         mov   ebx, [rax + rsi*4]                ;# ebx=shift[n]
1597
1598         lea   rbx, [rbx + rbx*2]        ;# rbx=3*is
1599         mov   [rsp + nb133nf_is3],ebx           ;# store is3
1600
1601         mov   rax, [rsp + nb133nf_shiftvec]   ;# rax = base of shiftvec[]
1602
1603         movss xmm0, [rax + rbx*4]       ;# shift x
1604         movss xmm1, [rax + rbx*4 + 4]   ;# shift y
1605         movss xmm2, [rax + rbx*4 + 8]   ;# shift z
1606
1607         mov   rcx, [rsp + nb133nf_iinr]         ;# rcx = pointer into iinr[]
1608         mov   ebx, [rcx + rsi*4]                ;# ebx =ii
1609
1610         movaps xmm3, xmm0
1611         movaps xmm4, xmm1
1612         movaps xmm5, xmm2
1613         movaps xmm6, xmm0
1614         movaps xmm7, xmm1
1615
1616         lea   rbx, [rbx + rbx*2]        ;# rbx = 3*ii=ii3
1617         mov   rax, [rbp + nb133nf_pos]  ;# rax = base of pos[]
1618         mov   [rsp + nb133nf_ii3], ebx
1619
1620         addss xmm3, [rax + rbx*4]       ;# ox
1621         addss xmm4, [rax + rbx*4 + 4]  ;# oy
1622         addss xmm5, [rax + rbx*4 + 8]  ;# oz
1623         addss xmm6, [rax + rbx*4 + 12] ;# h1x
1624         addss xmm7, [rax + rbx*4 + 16] ;# h1y
1625         shufps xmm3, xmm3, 0            ;# broadcast each shifted coordinate to all four lanes
1626         shufps xmm4, xmm4, 0
1627         shufps xmm5, xmm5, 0
1628         shufps xmm6, xmm6, 0
1629         shufps xmm7, xmm7, 0
1630         movaps [rsp + nb133nf_ixO], xmm3
1631         movaps [rsp + nb133nf_iyO], xmm4
1632         movaps [rsp + nb133nf_izO], xmm5
1633         movaps [rsp + nb133nf_ixH1], xmm6
1634         movaps [rsp + nb133nf_iyH1], xmm7
1635
1636         movss xmm6, xmm2                ;# reseed lane 0 with the shift components for the remaining coordinates
1637         movss xmm3, xmm0
1638         movss xmm4, xmm1
1639         movss xmm5, xmm2
1640         addss xmm6, [rax + rbx*4 + 20] ;# h1z
1641         addss xmm0, [rax + rbx*4 + 24] ;# h2x
1642         addss xmm1, [rax + rbx*4 + 28] ;# h2y
1643         addss xmm2, [rax + rbx*4 + 32] ;# h2z
1644         addss xmm3, [rax + rbx*4 + 36] ;# mx
1645         addss xmm4, [rax + rbx*4 + 40] ;# my
1646         addss xmm5, [rax + rbx*4 + 44] ;# mz
1647
1648         shufps xmm6, xmm6, 0
1649         shufps xmm0, xmm0, 0
1650         shufps xmm1, xmm1, 0
1651         shufps xmm2, xmm2, 0
1652         shufps xmm3, xmm3, 0
1653         shufps xmm4, xmm4, 0
1654         shufps xmm5, xmm5, 0
1655         movaps [rsp + nb133nf_izH1], xmm6
1656         movaps [rsp + nb133nf_ixH2], xmm0
1657         movaps [rsp + nb133nf_iyH2], xmm1
1658         movaps [rsp + nb133nf_izH2], xmm2
1659         movaps [rsp + nb133nf_ixM], xmm3
1660         movaps [rsp + nb133nf_iyM], xmm4
1661         movaps [rsp + nb133nf_izM], xmm5
1662
1663         ;# clear vctot and Vvdwtot
1664         xorps xmm4, xmm4
1665         movaps [rsp + nb133nf_vctot], xmm4
1666         movaps [rsp + nb133nf_Vvdwtot], xmm4
1667
1668         mov   rax, [rsp + nb133nf_jindex]
1669         mov   ecx, [rax + rsi*4]                ;# jindex[n]
1670         mov   edx, [rax + rsi*4 + 4]            ;# jindex[n+1]
1671         sub   edx, ecx                  ;# number of innerloop atoms
1672
1673         mov   rsi, [rbp + nb133nf_pos]
1674         mov   rax, [rsp + nb133nf_jjnr]
1675         shl   ecx, 2                    ;# byte offset of jjnr[nj0] (4 bytes per index)
1676         add   rax, rcx
1677         mov   [rsp + nb133nf_innerjjnr], rax    ;# pointer to jjnr[nj0]
1678         mov   ecx, edx
1679         sub   edx,  4                   ;# bias the count by -4 for the unrolled loop test
1680         add   ecx, [rsp + nb133nf_ninner]
1681         mov   [rsp + nb133nf_ninner], ecx       ;# ninner += number of innerloop atoms
1682         add   edx, 0                    ;# refresh the flags from edx for the jge below
1683         mov   [rsp + nb133nf_innerk], edx       ;# number of innerloop atoms, biased by -4
1684         jge   .nb133nf_unroll_loop      ;# four or more j atoms: take the unrolled loop
1685         jmp   .nb133nf_odd_inner
1686 .nb133nf_unroll_loop:
1687         ;# quad-unrolled inner loop: four j atoms per pass, energies only (vctot/Vvdwtot)
1688         mov   rdx, [rsp + nb133nf_innerjjnr]    ;# pointer to jjnr[k]
1689         mov   eax, [rdx]
1690         mov   ebx, [rdx + 4]
1691         mov   ecx, [rdx + 8]
1692         mov   edx, [rdx + 12]           ;# eax-edx=jnr1-4
1693
1694         add qword ptr [rsp + nb133nf_innerjjnr],  16 ;# advance pointer (unrolled 4)
1695
1696         mov rsi, [rbp + nb133nf_charge] ;# base of charge[]
1697
1698         movss xmm3, [rsi + rax*4]       ;# charge[jnr1]
1699         movss xmm4, [rsi + rcx*4]       ;# charge[jnr3]
1700         movss xmm6, [rsi + rbx*4]       ;# charge[jnr2]
1701         movss xmm7, [rsi + rdx*4]       ;# charge[jnr4]
1702
1703         shufps xmm3, xmm6, 0
1704         shufps xmm4, xmm7, 0
1705         shufps xmm3, xmm4, 136  ;# constant 10001000 ;# all charges in xmm3
1706         movaps xmm4, xmm3               ;# and in xmm4
1707         mulps  xmm3, [rsp + nb133nf_iqM]        ;# qqM = iqM*jq
1708         mulps  xmm4, [rsp + nb133nf_iqH]        ;# qqH = iqH*jq
1709
1710         movd  mm0, eax          ;# stash jnr1-4 in mmx registers; eax-edx are reused for the type lookup
1711         movd  mm1, ebx
1712         movd  mm2, ecx
1713         movd  mm3, edx
1714
1715         movaps  [rsp + nb133nf_qqM], xmm3
1716         movaps  [rsp + nb133nf_qqH], xmm4
1717
1718         mov rsi, [rbp + nb133nf_type]
1719         mov eax, [rsi + rax*4]  ;# type[jnr1..4] into eax-edx
1720         mov ebx, [rsi + rbx*4]
1721         mov ecx, [rsi + rcx*4]
1722         mov edx, [rsi + rdx*4]
1723         mov rsi, [rbp + nb133nf_vdwparam]
1724         shl eax, 1      ;# 2*type
1725         shl ebx, 1
1726         shl ecx, 1
1727         shl edx, 1
1728         mov edi, [rsp + nb133nf_ntia]
1729         add eax, edi    ;# ntia + 2*type = index of this pair's two floats in vdwparam
1730         add ebx, edi
1731         add ecx, edi
1732         add edx, edi
1733
1734         movlps xmm6, [rsi + rax*4]
1735         movlps xmm7, [rsi + rcx*4]
1736         movhps xmm6, [rsi + rbx*4]
1737         movhps xmm7, [rsi + rdx*4]
1738
1739         movaps xmm4, xmm6
1740         shufps xmm4, xmm7, 136  ;# constant 10001000 ;# first float of each pair (stored as c6)
1741         shufps xmm6, xmm7, 221  ;# constant 11011101 ;# second float of each pair (stored as c12)
1742
1743         movd  eax, mm0          ;# restore jnr1-4 into eax-edx
1744         movd  ebx, mm1
1745         movd  ecx, mm2
1746         movd  edx, mm3
1747
1748         movaps [rsp + nb133nf_c6], xmm4
1749         movaps [rsp + nb133nf_c12], xmm6
1750
1751         mov rsi, [rbp + nb133nf_pos]    ;# base of pos[]
1752
1753         lea   rax, [rax + rax*2]        ;# replace jnr with j3
1754         lea   rbx, [rbx + rbx*2]
1755         lea   rcx, [rcx + rcx*2]        ;# replace jnr with j3
1756         lea   rdx, [rdx + rdx*2]
1757
1758         ;# move four coordinates to xmm0-xmm2
1759         movlps xmm4, [rsi + rax*4]
1760         movlps xmm5, [rsi + rcx*4]
1761         movss xmm2, [rsi + rax*4 + 8]
1762         movss xmm6, [rsi + rcx*4 + 8]
1763
1764         movhps xmm4, [rsi + rbx*4]
1765         movhps xmm5, [rsi + rdx*4]
1766
1767         movss xmm0, [rsi + rbx*4 + 8]
1768         movss xmm1, [rsi + rdx*4 + 8]
1769
1770         shufps xmm2, xmm0, 0
1771         shufps xmm6, xmm1, 0
1772
1773         movaps xmm0, xmm4
1774         movaps xmm1, xmm4
1775
1776         shufps xmm2, xmm6, 136  ;# constant 10001000 ;# jz1-4
1777
1778         shufps xmm0, xmm5, 136  ;# constant 10001000 ;# jx1-4
1779         shufps xmm1, xmm5, 221  ;# constant 11011101 ;# jy1-4
1780
1781         ;# move ixO-izO to xmm4-xmm6
1782         movaps xmm4, [rsp + nb133nf_ixO]
1783         movaps xmm5, [rsp + nb133nf_iyO]
1784         movaps xmm6, [rsp + nb133nf_izO]
1785
1786         ;# calc dr
1787         subps xmm4, xmm0
1788         subps xmm5, xmm1
1789         subps xmm6, xmm2
1790
1791         ;# square it
1792         mulps xmm4,xmm4
1793         mulps xmm5,xmm5
1794         mulps xmm6,xmm6
1795         addps xmm4, xmm5
1796         addps xmm4, xmm6
1797         movaps xmm7, xmm4
1798         ;# rsqO in xmm7
1799
1800         ;# move ixH1-izH1 to xmm4-xmm6
1801         movaps xmm4, [rsp + nb133nf_ixH1]
1802         movaps xmm5, [rsp + nb133nf_iyH1]
1803         movaps xmm6, [rsp + nb133nf_izH1]
1804
1805         ;# calc dr
1806         subps xmm4, xmm0
1807         subps xmm5, xmm1
1808         subps xmm6, xmm2
1809
1810         ;# square it
1811         mulps xmm4,xmm4
1812         mulps xmm5,xmm5
1813         mulps xmm6,xmm6
1814         addps xmm6, xmm5
1815         addps xmm6, xmm4
1816         ;# rsqH1 in xmm6
1817
1818         ;# move ixH2-izH2 to xmm3-xmm5
1819         movaps xmm3, [rsp + nb133nf_ixH2]
1820         movaps xmm4, [rsp + nb133nf_iyH2]
1821         movaps xmm5, [rsp + nb133nf_izH2]
1822
1823         ;# calc dr
1824         subps xmm3, xmm0
1825         subps xmm4, xmm1
1826         subps xmm5, xmm2
1827
1828         ;# square it
1829         mulps xmm3,xmm3
1830         mulps xmm4,xmm4
1831         mulps xmm5,xmm5
1832         addps xmm5, xmm4
1833         addps xmm5, xmm3
1834
1835         ;# move ixM-izM to xmm2-xmm4 (the j coordinates in xmm0-xmm2 are consumed here)
1836         movaps xmm3, [rsp + nb133nf_iyM]
1837         movaps xmm4, [rsp + nb133nf_izM]
1838         subps  xmm3, xmm1
1839         subps  xmm4, xmm2
1840         movaps xmm2, [rsp + nb133nf_ixM]
1841         subps  xmm2, xmm0
1842
1843         ;# square it
1844         mulps xmm2,xmm2
1845         mulps xmm3,xmm3
1846         mulps xmm4,xmm4
1847         addps xmm4, xmm3
1848         addps xmm4, xmm2
1849         ;# rsqM in xmm4, rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7
1850
1851         ;# rsqH1 - seed in xmm2; refined below as rinv = half*lu*(three - rsq*lu*lu)
1852         rsqrtps xmm2, xmm6
1853         movaps  xmm3, xmm2
1854         mulps   xmm2, xmm2
1855         movaps  xmm0, [rsp + nb133nf_three]
1856         mulps   xmm2, xmm6      ;# rsq*lu*lu
1857         subps   xmm0, xmm2      ;# 3-rsq*lu*lu
1858         mulps   xmm0, xmm3      ;# lu*(3-rsq*lu*lu)
1859         mulps   xmm0, [rsp + nb133nf_half]
1860         movaps  [rsp + nb133nf_rinvH1], xmm0    ;# rinvH1
1861
1862         ;# rsqH2 - seed to xmm2
1863         rsqrtps xmm2, xmm5
1864         movaps  xmm3, xmm2
1865         mulps   xmm2, xmm2
1866         movaps  xmm0, [rsp + nb133nf_three]
1867         mulps   xmm2, xmm5      ;# rsq*lu*lu
1868         subps   xmm0, xmm2      ;# 3-rsq*lu*lu
1869         mulps   xmm0, xmm3      ;# lu*(3-rsq*lu*lu)
1870         mulps   xmm0, [rsp + nb133nf_half]
1871         movaps  [rsp + nb133nf_rinvH2], xmm0    ;# rinvH2
1872
1873         ;# rsqM - seed to xmm2
1874         rsqrtps xmm2, xmm4
1875         movaps  xmm3, xmm2
1876         mulps   xmm2, xmm2
1877         movaps  xmm0, [rsp + nb133nf_three]
1878         mulps   xmm2, xmm4      ;# rsq*lu*lu
1879         subps   xmm0, xmm2      ;# 3-rsq*lu*lu
1880         mulps   xmm0, xmm3      ;# lu*(3-rsq*lu*lu)
1881         mulps   xmm0, [rsp + nb133nf_half]
1882         movaps  [rsp + nb133nf_rinvM], xmm0     ;# rinvM
1883
1884         ;# Do the O LJ-only interaction directly.
1885         ;# rsqO is in xmm7
1886         rsqrtps xmm2, xmm7
1887         movaps  xmm3, xmm2
1888         mulps   xmm2, xmm2
1889         movaps  xmm4, [rsp + nb133nf_three]
1890         mulps   xmm2, xmm7      ;# rsq*lu*lu
1891         subps   xmm4, xmm2      ;# 3-rsq*lu*lu
1892         mulps   xmm4, xmm3      ;# lu*(3-rsq*lu*lu)
1893         mulps   xmm4, [rsp + nb133nf_half]
1894         movaps  xmm0, xmm4
1895         ;# xmm0=rinvO
1896
1897         mulps xmm7, xmm0        ;# rsqO*rinvO
1898         mulps xmm7, [rsp + nb133nf_tsc] ;# rtab
1899
1900         movhlps xmm5, xmm7
1901         cvttps2pi mm6, xmm7
1902         cvttps2pi mm7, xmm5     ;# mm6/mm7 contain lu indices
1903         cvtpi2ps xmm6, mm6
1904         cvtpi2ps xmm5, mm7
1905         movlhps xmm6, xmm5      ;# truncated indices back as floats
1906         subps  xmm7, xmm6
1907         movaps xmm1, xmm7       ;# xmm1=eps
1908         movaps xmm2, xmm1
1909         mulps  xmm2, xmm2       ;# xmm2=eps2
1910         pslld mm6, 3    ;# index*8: each table point is read below as 8 floats (byte offsets 0-31)
1911         pslld mm7, 3
1912
1913         mov  rsi, [rbp + nb133nf_VFtab]
1914         movd eax, mm6   ;# eax,ebx,ecx,edx = table offsets for j atoms 1,2,3,4
1915         psrlq mm6, 32
1916         movd ecx, mm7
1917         psrlq mm7, 32
1918         movd ebx, mm6
1919         movd edx, mm7
1920
1921         ;# dispersion
1922         movlps xmm5, [rsi + rax*4]
1923         movlps xmm7, [rsi + rcx*4]
1924         movhps xmm5, [rsi + rbx*4]
1925         movhps xmm7, [rsi + rdx*4] ;# got half dispersion table
1926         movaps xmm4, xmm5
1927         shufps xmm4, xmm7, 136  ;# constant 10001000
1928         shufps xmm5, xmm7, 221  ;# constant 11011101
1929
1930         movlps xmm7, [rsi + rax*4 + 8]
1931         movlps xmm3, [rsi + rcx*4 + 8]
1932         movhps xmm7, [rsi + rbx*4 + 8]
1933         movhps xmm3, [rsi + rdx*4 + 8] ;# other half of dispersion table
1934         movaps xmm6, xmm7
1935         shufps xmm6, xmm3, 136  ;# constant 10001000
1936         shufps xmm7, xmm3, 221  ;# constant 11011101
1937         ;# dispersion table ready, in xmm4-xmm7 (table floats 0..3 of each point, one j atom per lane)
1938
1939         mulps  xmm6, xmm1       ;# xmm6=Geps
1940         mulps  xmm7, xmm2       ;# xmm7=Heps2
1941         addps  xmm5, xmm6
1942         addps  xmm5, xmm7       ;# xmm5=Fp
1943         mulps  xmm5, xmm1 ;# xmm5=eps*Fp
1944         addps  xmm5, xmm4 ;# xmm5=VV
1945
1946         movaps xmm4, [rsp + nb133nf_c6]
1947         mulps  xmm5, xmm4        ;# Vvdw6
1948
1949         addps  xmm5, [rsp + nb133nf_Vvdwtot]
1950         movaps [rsp + nb133nf_Vvdwtot], xmm5
1951
1952         ;# repulsion
1953         movlps xmm5, [rsi + rax*4 + 16]
1954         movlps xmm7, [rsi + rcx*4 + 16]
1955         movhps xmm5, [rsi + rbx*4 + 16]
1956         movhps xmm7, [rsi + rdx*4 + 16] ;# got half repulsion table
1957         movaps xmm4, xmm5
1958         shufps xmm4, xmm7, 136  ;# constant 10001000
1959         shufps xmm5, xmm7, 221  ;# constant 11011101
1960
1961         movlps xmm7, [rsi + rax*4 + 24]
1962         movlps xmm3, [rsi + rcx*4 + 24]
1963         movhps xmm7, [rsi + rbx*4 + 24]
1964         movhps xmm3, [rsi + rdx*4 + 24] ;# other half of repulsion table
1965         movaps xmm6, xmm7
1966         shufps xmm6, xmm3, 136  ;# constant 10001000
1967         shufps xmm7, xmm3, 221  ;# constant 11011101
1968         ;# table ready, in xmm4-xmm7
1969         mulps  xmm6, xmm1       ;# xmm6=Geps
1970         mulps  xmm7, xmm2       ;# xmm7=Heps2
1971         addps  xmm5, xmm6
1972         addps  xmm5, xmm7       ;# xmm5=Fp
1973         mulps  xmm5, xmm1 ;# xmm5=eps*Fp
1974         addps  xmm5, xmm4 ;# xmm5=VV
1975
1976         movaps xmm4, [rsp + nb133nf_c12]
1977         mulps  xmm5, xmm4 ;# Vvdw12
1978
1979         addps  xmm5, [rsp + nb133nf_Vvdwtot]
1980         movaps [rsp + nb133nf_Vvdwtot], xmm5
1981
1982         ;# Do H1-H2-M interactions: vcoul = (rinvH1+rinvH2)*qqH + rinvM*qqM
1983         movaps  xmm7, [rsp + nb133nf_rinvH1]
1984         addps   xmm7, [rsp + nb133nf_rinvH2]
1985         movaps  xmm6, [rsp + nb133nf_rinvM]
1986
1987         mulps   xmm7, [rsp + nb133nf_qqH]
1988         mulps   xmm6, [rsp + nb133nf_qqM]
1989         addps   xmm7, xmm6
1990
1991         addps  xmm7, [rsp + nb133nf_vctot]
1992         movaps [rsp + nb133nf_vctot], xmm7
1993
1994         ;# should we do one more iteration?
1995         sub dword ptr [rsp + nb133nf_innerk],  4
1996         jl    .nb133nf_odd_inner
1997         jmp   .nb133nf_unroll_loop
1998 .nb133nf_odd_inner:     ;# entered once innerk has gone negative: decide whether leftover j atoms remain
1999         add dword ptr [rsp + nb133nf_innerk],  4 ;# undo the -4 bias so innerk counts the remaining j atoms
2000         jnz   .nb133nf_odd_loop         ;# non-zero: handle them one at a time
2001         jmp   .nb133nf_updateouterdata
2002 .nb133nf_odd_loop:      ;# remainder loop: one j atom per pass, the four i sites (O,H1,H2,M) packed in one register
2003         mov   rdx, [rsp + nb133nf_innerjjnr]    ;# pointer to jjnr[k]
2004         mov   eax, [rdx]
2005         add qword ptr [rsp + nb133nf_innerjjnr],  4 ;# advance pointer by one index
2006
2007         xorps xmm4, xmm4        ;# clear reg.
2008         movss xmm4, [rsp + nb133nf_iqM]
2009         mov rsi, [rbp + nb133nf_charge]
2010         movhps xmm4, [rsp + nb133nf_iqH]  ;# [qM  0  qH  qH]
2011         shufps xmm4, xmm4, 41   ;# [0 qH qH qM]
2012
2013         movss xmm3, [rsi + rax*4]       ;# charge in xmm3
2014         shufps xmm3, xmm3, 0
2015         mulps xmm3, xmm4
2016         movaps [rsp + nb133nf_qqM], xmm3        ;# all four qq products kept in the qqM slot (lane order O,H1,H2,M)
2017
2018         xorps xmm6, xmm6
2019         mov rsi, [rbp + nb133nf_type]
2020         mov ebx, [rsi + rax*4]
2021         mov rsi, [rbp + nb133nf_vdwparam]
2022         shl ebx, 1
2023         add ebx, [rsp + nb133nf_ntia]
2024         movlps xmm6, [rsi + rbx*4]
2025         movaps xmm7, xmm6
2026         shufps xmm6, xmm6, 252  ;# constant 11111100 ;# [c6 0 0 0]
2027         shufps xmm7, xmm7, 253  ;# constant 11111101 ;# [c12 0 0 0]
2028         movaps [rsp + nb133nf_c6], xmm6
2029         movaps [rsp + nb133nf_c12], xmm7
2030
2031         mov rsi, [rbp + nb133nf_pos]
2032         lea rax, [rax + rax*2]  ;# replace jnr with j3
2033
2034         movss xmm3, [rsp + nb133nf_ixO]
2035         movss xmm4, [rsp + nb133nf_iyO]
2036         movss xmm5, [rsp + nb133nf_izO]
2037         movss xmm0, [rsp + nb133nf_ixH1]
2038         movss xmm1, [rsp + nb133nf_iyH1]
2039         movss xmm2, [rsp + nb133nf_izH1]
2040         unpcklps xmm3, [rsp + nb133nf_ixH2]     ;# ixO ixH2 - -
2041         unpcklps xmm4, [rsp + nb133nf_iyH2]     ;# iyO iyH2 - -
2042         unpcklps xmm5, [rsp + nb133nf_izH2]     ;# izO izH2 - -
2043         unpcklps xmm0, [rsp + nb133nf_ixM]      ;# ixH1 ixM - -
2044         unpcklps xmm1, [rsp + nb133nf_iyM]      ;# iyH1 iyM - -
2045         unpcklps xmm2, [rsp + nb133nf_izM]      ;# izH1 izM - -
2046         unpcklps xmm3, xmm0     ;# ixO ixH1 ixH2 ixM
2047         unpcklps xmm4, xmm1     ;# same for y
2048         unpcklps xmm5, xmm2     ;# same for z
2049
2050         ;# move j coords to xmm0-xmm2
2051         movss xmm0, [rsi + rax*4]
2052         movss xmm1, [rsi + rax*4 + 4]
2053         movss xmm2, [rsi + rax*4 + 8]
2054         shufps xmm0, xmm0, 0
2055         shufps xmm1, xmm1, 0
2056         shufps xmm2, xmm2, 0
2057
2058         subps xmm3, xmm0
2059         subps xmm4, xmm1
2060         subps xmm5, xmm2
2061
2062         mulps  xmm3, xmm3
2063         mulps  xmm4, xmm4
2064         mulps  xmm5, xmm5
2065
2066         addps  xmm4, xmm3
2067         addps  xmm4, xmm5
2068         ;# rsq in xmm4
2069
2070         rsqrtps xmm5, xmm4
2071         ;# lookup seed in xmm5
2072         movaps xmm2, xmm5
2073         mulps xmm5, xmm5
2074         movaps xmm1, [rsp + nb133nf_three]
2075         mulps xmm5, xmm4        ;# rsq*lu*lu
2076         movaps xmm0, [rsp + nb133nf_half]
2077         subps xmm1, xmm5        ;# 3-rsq*lu*lu
2078         mulps xmm1, xmm2
2079         mulps xmm0, xmm1        ;# xmm0=rinv, xmm4=rsq
2080
2081         ;# LJ table interaction
2082         mulps xmm4, xmm0
2083         mulps  xmm4, [rsp + nb133nf_tsc] ;# rtab
2084
2085         cvttps2pi mm6, xmm4     ;# truncate the low two lanes; only lane 0 (O) is used by the scalar code below
2086         cvtpi2ps xmm6, mm6
2087         subss  xmm4, xmm6
2088         movss xmm1, xmm4        ;# xmm1=eps
2089         movss xmm2, xmm1
2090         mulss  xmm2, xmm2       ;# xmm2=eps2
2091         pslld mm6, 3    ;# index*8: each table point is read below as 8 floats (byte offsets 0-31)
2092
2093         mov  rsi, [rbp + nb133nf_VFtab]
2094         movd eax, mm6
2095
2096         ;# dispersion
2097         movlps xmm5, [rsi + rax*4]
2098         movaps xmm4, xmm5
2099         shufps xmm4, xmm7, 136  ;# constant 10001000 ;# lane 0 = first table float; upper lanes are don't-care
2100         shufps xmm5, xmm7, 221  ;# constant 11011101 ;# lane 0 = second table float
2101
2102         movlps xmm7, [rsi + rax*4 + 8]
2103         movaps xmm6, xmm7
2104         shufps xmm6, xmm3, 136  ;# constant 10001000
2105         shufps xmm7, xmm3, 221  ;# constant 11011101
2106         ;# dispersion table ready, in xmm4-xmm7 (lane 0 only)
2107
2108         mulss  xmm6, xmm1       ;# xmm6=Geps
2109         mulss  xmm7, xmm2       ;# xmm7=Heps2
2110         addss  xmm5, xmm6
2111         addss  xmm5, xmm7       ;# xmm5=Fp
2112         mulss  xmm5, xmm1 ;# xmm5=eps*Fp
2113         addss  xmm5, xmm4 ;# xmm5=VV
2114
2115         movss xmm4, [rsp + nb133nf_c6]
2116         mulss  xmm5, xmm4        ;# Vvdw6
2117
2118         ;# Update Vvdwtot directly (lane 0 only)
2119         addss  xmm5, [rsp + nb133nf_Vvdwtot]
2120         movss [rsp + nb133nf_Vvdwtot], xmm5
2121
2122         ;# repulsion
2123         movlps xmm5, [rsi + rax*4 + 16]
2124         movaps xmm4, xmm5
2125         shufps xmm4, xmm7, 136  ;# constant 10001000
2126         shufps xmm5, xmm7, 221  ;# constant 11011101
2127
2128         movlps xmm7, [rsi + rax*4 + 24]
2129         movaps xmm6, xmm7
2130         shufps xmm6, xmm3, 136  ;# constant 10001000
2131         shufps xmm7, xmm3, 221  ;# constant 11011101
2132         ;# table ready, in xmm4-xmm7 (lane 0 only)
2133         mulss  xmm6, xmm1       ;# xmm6=Geps
2134         mulss  xmm7, xmm2       ;# xmm7=Heps2
2135         addss  xmm5, xmm6
2136         addss  xmm5, xmm7       ;# xmm5=Fp
2137         mulss  xmm5, xmm1 ;# xmm5=eps*Fp
2138         addss  xmm5, xmm4 ;# xmm5=VV
2139
2140         movss xmm4, [rsp + nb133nf_c12]
2141         mulss  xmm5, xmm4 ;# Vvdw12
2142
2143         addss  xmm5, [rsp + nb133nf_Vvdwtot]
2144         movss [rsp + nb133nf_Vvdwtot], xmm5
2145
2146         mulps  xmm0, [rsp + nb133nf_qqM]        ;# xmm0=vcoul
2147
2148         addps  xmm0, [rsp + nb133nf_vctot]
2149         movaps [rsp + nb133nf_vctot], xmm0
2150
2151         dec dword ptr [rsp + nb133nf_innerk]    ;# one fewer leftover j atom
2152         jz    .nb133nf_updateouterdata
2153         jmp   .nb133nf_odd_loop
2154 .nb133nf_updateouterdata:       ;# reduce vctot/Vvdwtot to scalars and add them to Vc[ggid]/Vvdw[ggid]
2155         ;# get n from stack
2156         mov esi, [rsp + nb133nf_n]
2157         ;# get group index for i particle
2158         mov   rdx, [rbp + nb133nf_gid]          ;# base of gid[]
2159         mov   edx, [rdx + rsi*4]                ;# ggid=gid[n]
2160
2161         ;# accumulate total potential energy and update it
2162         movaps xmm7, [rsp + nb133nf_vctot]
2163         ;# accumulate
2164         movhlps xmm6, xmm7
2165         addps  xmm7, xmm6       ;# pos 0-1 in xmm7 have the sum now
2166         movaps xmm6, xmm7
2167         shufps xmm6, xmm6, 1    ;# bring element 1 down to position 0
2168         addss  xmm7, xmm6       ;# element 0 now holds the sum of all four lanes
2169
2170         ;# add earlier value from mem
2171         mov   rax, [rbp + nb133nf_Vc]
2172         addss xmm7, [rax + rdx*4]
2173         ;# move back to mem
2174         movss [rax + rdx*4], xmm7
2175
2176         ;# accumulate total lj energy and update it
2177         movaps xmm7, [rsp + nb133nf_Vvdwtot]
2178         ;# accumulate
2179         movhlps xmm6, xmm7
2180         addps  xmm7, xmm6       ;# pos 0-1 in xmm7 have the sum now
2181         movaps xmm6, xmm7
2182         shufps xmm6, xmm6, 1    ;# bring element 1 down to position 0
2183         addss  xmm7, xmm6       ;# element 0 now holds the sum of all four lanes
2184
2185         ;# add earlier value from mem
2186         mov   rax, [rbp + nb133nf_Vvdw]
2187         addss xmm7, [rax + rdx*4]
2188         ;# move back to mem
2189         movss [rax + rdx*4], xmm7
2190
2191         ;# finish if last
2192         mov ecx, [rsp + nb133nf_nn1]
2193         ;# esi already loaded with n
2194         inc esi
2195         sub ecx, esi            ;# nn1 - (n+1)
2196         jz .nb133nf_outerend    ;# zero: n+1 reached nn1, leave the outer loop
2197
2198         ;# not last, iterate outer loop once more!
2199         mov [rsp + nb133nf_n], esi
2200         jmp .nb133nf_outer
2201 .nb133nf_outerend:
2202         ;# check if more outer neighborlists remain
2203         mov   ecx, [rsp + nb133nf_nri]
2204         ;# esi already loaded with n above
2205         sub   ecx, esi          ;# nri - n
2206         jz .nb133nf_end         ;# zero: n reached nri, nothing left to do
2207         ;# non-zero, do one more workunit
2208         jmp   .nb133nf_threadloop
2209 .nb133nf_end:           ;# report iteration counts, tear down the stack frame and return
2210
2211         mov eax, [rsp + nb133nf_nouter]
2212         mov ebx, [rsp + nb133nf_ninner]
2213         mov rcx, [rbp + nb133nf_outeriter]
2214         mov rdx, [rbp + nb133nf_inneriter]
2215         mov [rcx], eax          ;# *outeriter = nouter
2216         mov [rdx], ebx          ;# *inneriter = ninner
2217
2218         add rsp, 592            ;# drop the 592-byte local area (presumably reserved in the prologue, not visible here)
2219         emms                    ;# leave MMX state so x87 code can run after the mm0-mm7 use above
2220
2221     ;# Restore xmm6-xmm15 from the stack
2222     movaps xmm6,  [rsp      ]
2223     movaps xmm7,  [rsp + 16 ]
2224     movaps xmm8,  [rsp + 32 ]
2225     movaps xmm9,  [rsp + 48 ]
2226     movaps xmm10, [rsp + 64 ]
2227     movaps xmm11, [rsp + 80 ]
2228     movaps xmm12, [rsp + 96 ]
2229     movaps xmm13, [rsp + 112]
2230     movaps xmm14, [rsp + 128]
2231     movaps xmm15, [rsp + 144]
2232
2233     ;# Reset pointers after restoring xmm6-15
2234     add rsp, 168
2235
2236     pop r15     ;# pop saved integer registers (presumably pushed in the prologue, not visible here)
2237     pop r14
2238     pop r13
2239     pop r12
2240     pop rdi
2241     pop rsi
2242     pop rbx
2243
2244         pop     rbp
2245         ret