3 ;
# Gromacs 4.0 Copyright (c) 1991-2003
4 ;
# David van der Spoel, Erik Lindahl
6 ;
# This program is free software; you can redistribute it and/or
7 ;
# modify it under the terms of the GNU General Public License
8 ;
# as published by the Free Software Foundation; either version 2
9 ;
# of the License, or (at your option) any later version.
11 ;
# To help us fund GROMACS development, we humbly ask that you cite
12 ;
# the research papers on the package. Check out http://www.gromacs.org
15 ;
# Gnomes, ROck Monsters And Chili Sauce
18 ;
# These files require GNU binutils 2.10 or later, since we
19 ;
# use intel syntax for portability, or a recent version
20 ;
# of NASM that understands Extended 3DNow and SSE2 instructions.
21 ;
# (NASM is normally only used with MS Visual C++).
22 ;
# Since NASM and gnu as disagree on some definitions and use
23 ;
# completely different preprocessing options I have to introduce a
24 ;
# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
25 ;
# Gnu as treats ';' as a line break, i.e. ignores it. This is the
26 ;
# reason why all comments need both symbols...
27 ;
# The source is written for GNU as, with intel syntax. When you use
28 ;
# NASM we redefine a couple of things. The false if-statement around
29 ;
# the following code is seen by GNU as, but NASM doesn't see it, so
30 ;
# the code inside is read by NASM but not gcc.
32 ;
.if 0 # block below only read by NASM
33 %define
.section section
37 ;
# NASM only wants 'dword', not 'dword ptr'.
42 ;
.endif # End of NASM-specific block
43 ;
.intel_syntax noprefix # Line only read by gnu as
# Entry point nb_kernel133_x86_64_sse, exported with and without a leading
# underscore; both labels below mark the same address.
# The .equiv lines name byte offsets: the first table is added to rbp
# (arguments), the second table is added to rsp (local variables).
# NOTE(review): this listing has gaps - offsets referenced later (e.g. nb133_pos,
# nb133_type, nb133_gid, nb133_Vc, nb133_ixO, nb133_iqM, nb133_c6, nb133_n) are
# not defined in the lines shown here; confirm against the complete file.
48 .globl nb_kernel133_x86_64_sse
49 .globl _nb_kernel133_x86_64_sse
50 nb_kernel133_x86_64_sse
:
51 _nb_kernel133_x86_64_sse
:
52 ;
# Room for return address and rbp (16 bytes)
53 .equiv nb133_fshift, 16
56 .equiv nb133_faction, 40
57 .equiv nb133_charge, 48
58 .equiv nb133_p_facel, 56
59 .equiv nb133_argkrf, 64
60 .equiv nb133_argcrf, 72
63 .equiv nb133_p_ntype, 96
64 .equiv nb133_vdwparam, 104
65 .equiv nb133_Vvdw, 112
66 .equiv nb133_p_tabscale, 120
67 .equiv nb133_VFtab, 128
68 .equiv nb133_invsqrta, 136
69 .equiv nb133_dvda, 144
70 .equiv nb133_p_gbtabscale, 152
71 .equiv nb133_GBtab, 160
72 .equiv nb133_p_nthreads, 168
73 .equiv nb133_count, 176
75 .equiv nb133_outeriter, 192
76 .equiv nb133_inneriter, 200
77 .equiv nb133_work, 208
78 ;
# stack offsets for local variables
79 ;
# bottom of stack is cache-aligned for sse use
87 .equiv nb133_iyH2, 112
88 .equiv nb133_izH2, 128
97 .equiv nb133_dxH1, 272
98 .equiv nb133_dyH1, 288
99 .equiv nb133_dzH1, 304
100 .equiv nb133_dxH2, 320
101 .equiv nb133_dyH2, 336
102 .equiv nb133_dzH2, 352
103 .equiv nb133_dxM, 368
104 .equiv nb133_dyM, 384
105 .equiv nb133_dzM, 400
106 .equiv nb133_qqM, 416
107 .equiv nb133_qqH, 432
108 .equiv nb133_rinvH1, 448
109 .equiv nb133_rinvH2, 464
110 .equiv nb133_rinvM, 480
111 .equiv nb133_two, 496
113 .equiv nb133_c12, 528
114 .equiv nb133_tsc, 544
115 .equiv nb133_fstmp, 560
116 .equiv nb133_krf, 576
117 .equiv nb133_crf, 592
118 .equiv nb133_krsqH1, 608
119 .equiv nb133_krsqH2, 624
120 .equiv nb133_krsqM, 640
121 .equiv nb133_vctot, 656
122 .equiv nb133_Vvdwtot, 672
123 .equiv nb133_fixO, 688
124 .equiv nb133_fiyO, 704
125 .equiv nb133_fizO, 720
126 .equiv nb133_fixH1, 736
127 .equiv nb133_fiyH1, 752
128 .equiv nb133_fizH1, 768
129 .equiv nb133_fixH2, 784
130 .equiv nb133_fiyH2, 800
131 .equiv nb133_fizH2, 816
132 .equiv nb133_fixM, 832
133 .equiv nb133_fiyM, 848
134 .equiv nb133_fizM, 864
135 .equiv nb133_fjx, 880
136 .equiv nb133_fjy, 896
137 .equiv nb133_fjz, 912
138 .equiv nb133_half, 928
139 .equiv nb133_three, 944
140 .equiv nb133_rsqOO, 960
141 .equiv nb133_facel, 976
142 .equiv nb133_iinr, 984
143 .equiv nb133_jindex, 992
144 .equiv nb133_jjnr, 1000
145 .equiv nb133_shift, 1008
146 .equiv nb133_shiftvec, 1016
147 .equiv nb133_innerjjnr, 1024
148 .equiv nb133_is3, 1032
149 .equiv nb133_ii3, 1036
150 .equiv nb133_nri, 1040
151 .equiv nb133_ntia, 1044
152 .equiv nb133_innerk, 1048
154 .equiv nb133_nn1, 1056
155 .equiv nb133_nouter, 1060
156 .equiv nb133_ninner, 1064
161 ;
# Prologue and one-time setup, as far as this listing shows: store xmm7-xmm15
# at [rsp+16]..[rsp+144], reserve 1072 bytes of locals, store eax into the
# nouter/ninner counters, spill edi/rsi/rdx/rcx/r8/r9 into the nri/iinr/
# jindex/jjnr/shift/shiftvec slots, and write the facel, half, two, three and
# tsc values to the stack.
# NOTE(review): the embedded line numbers skip values, so instructions of the
# original file are missing here (e.g. whatever sets eax, xmm0, xmm2 and xmm3
# before they are stored); confirm details against the complete file.
# Push integer registers on stack
170 ;
# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
173 ;
# Save xmm registers to stack
175 movaps
[rsp
+ 16 ], xmm7
176 movaps
[rsp
+ 32 ], xmm8
177 movaps
[rsp
+ 48 ], xmm9
178 movaps
[rsp
+ 64 ], xmm10
179 movaps
[rsp
+ 80 ], xmm11
180 movaps
[rsp
+ 96 ], xmm12
181 movaps
[rsp
+ 112], xmm13
182 movaps
[rsp
+ 128], xmm14
183 movaps
[rsp
+ 144], xmm15
186 sub rsp
, 1072 ;
# local variable stack space (n*16+8)
187 ;
.if 0 # block below only read by NASM - special calling convention on win64
188 %ifidn __OUTPUT_FORMAT__
, win64
189 ;
# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
191 ;
# Adjust stack pointer for different alignment
192 ;
# Move around arguments to fit AMD64 convention below
193 ;
# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
194 ;
# win64 passes args in: rcx,rdx,r8,r9 + stack
202 ;
.endif # end NASM- and win64-specific block
204 ;
# zero 32-bit iteration counters
206 mov
[rsp
+ nb133_nouter
], eax
207 mov
[rsp
+ nb133_ninner
], eax
210 mov
[rsp
+ nb133_nri
], edi
211 mov
[rsp
+ nb133_iinr
], rsi
212 mov
[rsp
+ nb133_jindex
], rdx
213 mov
[rsp
+ nb133_jjnr
], rcx
214 mov
[rsp
+ nb133_shift
], r8
215 mov
[rsp
+ nb133_shiftvec
], r9
216 mov rsi
, [rbp
+ nb133_p_facel
]
218 movss
[rsp
+ nb133_facel
], xmm0
220 ;
# create constant floating-point factors on stack
221 mov eax
, 0x3f000000 ;
# half in IEEE (hex)
222 mov
[rsp
+ nb133_half
], eax
223 movss xmm1
, [rsp
+ nb133_half
]
224 shufps xmm1
, xmm1
, 0 ;
# splat to all elements
226 addps xmm2
, xmm2 ;
# one
228 addps xmm2
, xmm2 ;
# two
229 addps xmm3
, xmm2 ;
# three
230 movaps
[rsp
+ nb133_half
], xmm1
231 movaps
[rsp
+ nb133_two
], xmm2
232 movaps
[rsp
+ nb133_three
], xmm3
234 mov rax
, [rbp
+ nb133_p_tabscale
]
237 movaps
[rsp
+ nb133_tsc
], xmm3
239 ;
# Per-kernel i-molecule constants: read ii = iinr[0], load the floats at
# charge[ii+1] (into xmm4) and charge[ii+3] (into xmm3), store xmm3/xmm4 in the
# iqM/iqH slots, and derive ntia from type[ii] and *p_ntype.
# NOTE(review): instructions between the loads and the stores are missing
# from this listing; confirm against the complete file.
# assume we have at least one i particle - start directly
240 mov rcx
, [rsp
+ nb133_iinr
] ;
# rcx = pointer into iinr[]
241 mov ebx
, [rcx
] ;
# ebx =ii
243 mov rdx
, [rbp
+ nb133_charge
]
244 movss xmm4
, [rdx
+ rbx
*4 + 4]
245 movss xmm3
, [rdx
+ rbx
*4 + 12]
246 mov rsi
, [rbp
+ nb133_p_facel
]
248 movss xmm5
, [rsp
+ nb133_facel
]
254 movaps
[rsp
+ nb133_iqM
], xmm3
255 movaps
[rsp
+ nb133_iqH
], xmm4
257 mov rdx
, [rbp
+ nb133_type
]
258 mov ecx
, [rdx
+ rbx
*4]
260 mov rdi
, [rbp
+ nb133_p_ntype
]
261 imul ecx
, [rdi
] ;
# rcx = ntia = 2*ntype*type[ii0]
# NOTE(review): only the multiply by *p_ntype is visible here; the factor 2 is
# presumably applied by an instruction missing from this listing.
262 mov
[rsp
+ nb133_ntia
], ecx
# Work-claim step: compare-exchange on the counter rsi points to (taken from
# the count argument) to advance it from nn0 (eax) to nn1 (ebx = nn0 + 1),
# then clamp nn1 to nri and record n, nn1 and the running outer-iteration
# count in their stack slots.
# NOTE(review): the loop label and some instructions of this step are missing
# from this listing; confirm against the complete file.
264 mov rsi
, [rbp
+ nb133_count
] ;
# pointer to sync counter
267 mov ebx
, eax ;
# ebx=*count=nn0
268 add ebx
, 1 ;
# ebx=nn1=nn0+1
270 cmpxchg
[rsi
], ebx ;
# write nn1 to *counter,
271 ;
# if it hasn't changed.
272 ;
# or reread *counter to eax.
273 pause ;
# -> better p4 performance
276 ;
# if(nn1>nri) nn1=nri
277 mov ecx
, [rsp
+ nb133_nri
]
280 cmovle ebx
, edx ;
# if(nn1>nri) nn1=nri
281 ;
# Cleared the spinlock if we got here.
282 ;
# eax contains nn0, ebx contains nn1.
283 mov
[rsp
+ nb133_n
], eax
284 mov
[rsp
+ nb133_nn1
], ebx
285 sub ebx
, eax ;
# calc number of outer lists
286 mov esi
, eax ;
# copy n to esi
291 ;
# ebx contains number of outer iterations
292 add ebx
, [rsp
+ nb133_nouter
]
293 mov
[rsp
+ nb133_nouter
], ebx
# Outer-loop head for list entry n (in esi): fetch shift[n] and the matching
# shift vector, read ii = iinr[n], form ii3 = 3*ii, add the twelve floats at
# pos[ii3 .. ii3+11] (O, H1, H2, M x/y/z) into xmm registers and store them in
# the ixO..izM slots, initialise the energy/force accumulators from xmm4, and
# compute the j-list length jindex[n+1]-jindex[n].
# NOTE(review): register-copy/splat instructions between these steps are
# missing from this listing; confirm against the complete file.
296 mov rax
, [rsp
+ nb133_shift
] ;
# rax = pointer into shift[]
297 mov ebx
, [rax
+ rsi
*4] ;
# ebx=shift[n]
299 lea rbx
, [rbx
+ rbx
*2] ;
# rbx=3*is
300 mov
[rsp
+ nb133_is3
],ebx ;
# store is3
302 mov rax
, [rsp
+ nb133_shiftvec
] ;
# rax = base of shiftvec[]
304 movss xmm0
, [rax
+ rbx
*4]
305 movss xmm1
, [rax
+ rbx
*4 + 4]
306 movss xmm2
, [rax
+ rbx
*4 + 8]
308 mov rcx
, [rsp
+ nb133_iinr
] ;
# rcx = pointer into iinr[]
309 mov ebx
, [rcx
+ rsi
*4] ;
# ebx =ii
317 lea rbx
, [rbx
+ rbx
*2] ;
# rbx = 3*ii=ii3
318 mov rax
, [rbp
+ nb133_pos
] ;
# rax = base of pos[]
319 mov
[rsp
+ nb133_ii3
], ebx
321 addss xmm3
, [rax
+ rbx
*4] ;
# ox
322 addss xmm4
, [rax
+ rbx
*4 + 4] ;
# oy
323 addss xmm5
, [rax
+ rbx
*4 + 8] ;
# oz
324 addss xmm6
, [rax
+ rbx
*4 + 12] ;
# h1x
325 addss xmm7
, [rax
+ rbx
*4 + 16] ;
# h1y
331 movaps
[rsp
+ nb133_ixO
], xmm3
332 movaps
[rsp
+ nb133_iyO
], xmm4
333 movaps
[rsp
+ nb133_izO
], xmm5
334 movaps
[rsp
+ nb133_ixH1
], xmm6
335 movaps
[rsp
+ nb133_iyH1
], xmm7
341 addss xmm6
, [rax
+ rbx
*4 + 20] ;
# h1z
342 addss xmm0
, [rax
+ rbx
*4 + 24] ;
# h2x
343 addss xmm1
, [rax
+ rbx
*4 + 28] ;
# h2y
344 addss xmm2
, [rax
+ rbx
*4 + 32] ;
# h2z
345 addss xmm3
, [rax
+ rbx
*4 + 36] ;
# mx
346 addss xmm4
, [rax
+ rbx
*4 + 40] ;
# my
347 addss xmm5
, [rax
+ rbx
*4 + 44] ;
# mz
356 movaps
[rsp
+ nb133_izH1
], xmm6
357 movaps
[rsp
+ nb133_ixH2
], xmm0
358 movaps
[rsp
+ nb133_iyH2
], xmm1
359 movaps
[rsp
+ nb133_izH2
], xmm2
360 movaps
[rsp
+ nb133_ixM
], xmm3
361 movaps
[rsp
+ nb133_iyM
], xmm4
362 movaps
[rsp
+ nb133_izM
], xmm5
364 ;
# clear vctot and i forces
# NOTE(review): the instruction that zeroes xmm4 is not visible in this listing.
366 movaps
[rsp
+ nb133_vctot
], xmm4
367 movaps
[rsp
+ nb133_Vvdwtot
], xmm4
368 movaps
[rsp
+ nb133_fixO
], xmm4
369 movaps
[rsp
+ nb133_fiyO
], xmm4
370 movaps
[rsp
+ nb133_fizO
], xmm4
371 movaps
[rsp
+ nb133_fixH1
], xmm4
372 movaps
[rsp
+ nb133_fiyH1
], xmm4
373 movaps
[rsp
+ nb133_fizH1
], xmm4
374 movaps
[rsp
+ nb133_fixH2
], xmm4
375 movaps
[rsp
+ nb133_fiyH2
], xmm4
376 movaps
[rsp
+ nb133_fizH2
], xmm4
377 movaps
[rsp
+ nb133_fixM
], xmm4
378 movaps
[rsp
+ nb133_fiyM
], xmm4
379 movaps
[rsp
+ nb133_fizM
], xmm4
381 mov rax
, [rsp
+ nb133_jindex
]
382 mov ecx
, [rax
+ rsi
*4] ;
# jindex[n]
383 mov edx
, [rax
+ rsi
*4 + 4] ;
# jindex[n+1]
384 sub edx
, ecx ;
# number of innerloop atoms
386 mov rsi
, [rbp
+ nb133_pos
]
387 mov rdi
, [rbp
+ nb133_faction
]
388 mov rax
, [rsp
+ nb133_jjnr
]
391 mov
[rsp
+ nb133_innerjjnr
], rax ;
# pointer to jjnr[nj0]
394 add ecx
, [rsp
+ nb133_ninner
]
395 mov
[rsp
+ nb133_ninner
], ecx
397 mov
[rsp
+ nb133_innerk
], edx ;
# number of innerloop atoms
398 jge
.nb133_unroll_loop
401 ;
# Four j atoms per pass: read j indices from innerjjnr and advance that pointer
# by 16 bytes, gather the j charges (multiplied by iqM/iqH into qqM/qqH), the
# j types, the vdw parameters (stored as c6/c12) and the j coordinates, then
# evaluate the tabulated dispersion/repulsion terms for the O site and add the
# resulting force to the fixO/fiyO/fizO accumulators.
# NOTE(review): many instructions of the original loop body are missing from
# this listing; confirm against the complete file.
# quad-unroll innerloop here
402 mov rdx
, [rsp
+ nb133_innerjjnr
] ;
# pointer to jjnr[k]
406 mov edx
, [rdx
+ 12] ;
# eax-edx=jnr1-4
408 add qword ptr
[rsp
+ nb133_innerjjnr
], 16 ;
# advance pointer (unrolled 4)
410 mov rsi
, [rbp
+ nb133_charge
] ;
# base of charge[]
412 movss xmm3
, [rsi
+ rax
*4]
413 movss xmm4
, [rsi
+ rcx
*4]
414 movss xmm6
, [rsi
+ rbx
*4]
415 movss xmm7
, [rsi
+ rdx
*4]
419 shufps xmm3
, xmm4
, 136 ;
# constant 10001000 ;# all charges in xmm3
420 movaps xmm4
, xmm3 ;
# and in xmm4
421 mulps xmm3
, [rsp
+ nb133_iqM
]
422 mulps xmm4
, [rsp
+ nb133_iqH
]
424 movaps
[rsp
+ nb133_qqM
], xmm3
425 movaps
[rsp
+ nb133_qqH
], xmm4
427 mov rsi
, [rbp
+ nb133_type
]
428 mov r8d
, [rsi
+ rax
*4]
429 mov r9d
, [rsi
+ rbx
*4]
430 mov r10d
, [rsi
+ rcx
*4]
431 mov r11d
, [rsi
+ rdx
*4]
432 mov rsi
, [rbp
+ nb133_vdwparam
]
437 mov edi
, [rsp
+ nb133_ntia
]
443 movlps xmm6
, [rsi
+ r8*4]
444 movlps xmm7
, [rsi
+ r10*4]
445 movhps xmm6
, [rsi
+ r9*4]
446 movhps xmm7
, [rsi
+ r11*4]
449 shufps xmm4
, xmm7
, 136 ;
# constant 10001000
450 shufps xmm6
, xmm7
, 221 ;
# constant 11011101
452 movaps
[rsp
+ nb133_c6
], xmm4
453 movaps
[rsp
+ nb133_c12
], xmm6
455 mov rsi
, [rbp
+ nb133_pos
] ;
# base of pos[]
457 lea rax
, [rax
+ rax
*2] ;
# replace jnr with j3
458 lea rbx
, [rbx
+ rbx
*2]
459 lea rcx
, [rcx
+ rcx
*2] ;
# replace jnr with j3
460 lea rdx
, [rdx
+ rdx
*2]
462 ;
# move four coordinates to xmm0-xmm2
463 movlps xmm4
, [rsi
+ rax
*4]
464 movlps xmm5
, [rsi
+ rcx
*4]
465 movss xmm2
, [rsi
+ rax
*4 + 8]
466 movss xmm6
, [rsi
+ rcx
*4 + 8]
468 movhps xmm4
, [rsi
+ rbx
*4]
469 movhps xmm5
, [rsi
+ rdx
*4]
471 movss xmm0
, [rsi
+ rbx
*4 + 8]
472 movss xmm1
, [rsi
+ rdx
*4 + 8]
480 shufps xmm2
, xmm6
, 136 ;
# constant 10001000
481 shufps xmm0
, xmm5
, 136 ;
# constant 10001000
482 shufps xmm1
, xmm5
, 221 ;
# constant 11011101
494 subps xmm3
, [rsp
+ nb133_ixO
]
495 subps xmm4
, [rsp
+ nb133_iyO
]
496 subps xmm5
, [rsp
+ nb133_izO
]
498 movaps
[rsp
+ nb133_dxO
], xmm3
499 movaps
[rsp
+ nb133_dyO
], xmm4
500 movaps
[rsp
+ nb133_dzO
], xmm5
510 ;
# calculate rinv=1/sqrt(rsq)
514 movaps xmm4
, [rsp
+ nb133_three
]
515 mulps xmm5
, xmm3 ;
# rsq*lu*lu
516 subps xmm4
, xmm5 ;
# 3-rsq*lu*lu
518 mulps xmm4
, [rsp
+ nb133_half
]
524 mulps xmm3
, [rsp
+ nb133_tsc
] ;
# rtab
526 ;
# truncate and convert to integers
529 ;
# convert back to float
536 subps xmm3
, xmm4 ;
# xmm3=eps
538 ;
# move to integer registers
549 mov rsi
, [rbp
+ nb133_VFtab
]
550 ;
# calculate LJ table
551 movlps xmm5
, [rsi
+ r8*4]
552 movlps xmm9
, [rsi
+ r8*4 + 16]
554 movlps xmm7
, [rsi
+ r10*4]
555 movlps xmm11
, [rsi
+ r10*4 + 16]
557 movhps xmm5
, [rsi
+ r9*4]
558 movhps xmm9
, [rsi
+ r9*4 + 16]
560 movhps xmm7
, [rsi
+ r11*4]
561 movhps xmm11
, [rsi
+ r11*4 + 16]
565 shufps xmm4
, xmm7
, 136 ;
# 10001000
566 shufps xmm8
, xmm11
, 136 ;
# 10001000
567 shufps xmm5
, xmm7
, 221 ;
# 11011101
568 shufps xmm9
, xmm11
, 221 ;
# 11011101
570 movlps xmm7
, [rsi
+ r8*4 + 8]
571 movlps xmm11
, [rsi
+ r8*4 + 24]
573 movlps xmm13
, [rsi
+ r10*4 + 8]
574 movlps xmm14
, [rsi
+ r10*4 + 24]
576 movhps xmm7
, [rsi
+ r9*4 + 8]
577 movhps xmm11
, [rsi
+ r9*4 + 24]
579 movhps xmm13
, [rsi
+ r11*4 + 8]
580 movhps xmm14
, [rsi
+ r11*4 + 24]
585 shufps xmm6
, xmm13
, 136 ;
# 10001000
586 shufps xmm10
, xmm14
, 136 ;
# 10001000
587 shufps xmm7
, xmm13
, 221 ;
# 11011101
588 shufps xmm11
, xmm14
, 221 ;
# 11011101
589 ;
# dispersion table in xmm4-xmm7, repulsion table in xmm8-xmm11
591 mulps xmm7
, xmm3 ;
# Heps
593 mulps xmm6
, xmm3 ;
# Geps
595 mulps xmm7
, xmm3 ;
# Heps2
597 addps xmm5
, xmm6 ;
# F+Geps
599 addps xmm5
, xmm7 ;
# F+Geps+Heps2 = Fp
601 addps xmm7
, xmm7 ;
# 2*Heps2
603 addps xmm7
, xmm6 ;
# 2*Heps2+Geps
606 addps xmm7
, xmm5 ;
# FF = Fp + 2*Heps2 + Geps
608 mulps xmm5
, xmm3 ;
# eps*Fp
610 movaps xmm12
, [rsp
+ nb133_c6
]
611 movaps xmm13
, [rsp
+ nb133_c12
]
612 addps xmm5
, xmm4 ;
# VV
615 mulps xmm5
, xmm12 ;
# VV*c6 = vnb6
616 mulps xmm9
, xmm13 ;
# VV*c12 = vnb12
618 addps xmm5
, [rsp
+ nb133_Vvdwtot
]
619 movaps
[rsp
+ nb133_Vvdwtot
], xmm5
621 mulps xmm7
, xmm12 ;
# FF*c6 = fnb6
622 mulps xmm11
, xmm13 ;
# FF*c12 = fnb12
625 mulps xmm7
, [rsp
+ nb133_tsc
]
626 mulps xmm7
, xmm15 ;
# -fscal
629 subps xmm9
, xmm7 ;
# fscal
633 mulps xmm9
, [rsp
+ nb133_dxO
] ;
# fx/fy/fz
634 mulps xmm10
, [rsp
+ nb133_dyO
]
635 mulps xmm11
, [rsp
+ nb133_dzO
]
637 ;
# save j force temporarily
638 movaps
[rsp
+ nb133_fjx
], xmm9
639 movaps
[rsp
+ nb133_fjy
], xmm10
640 movaps
[rsp
+ nb133_fjz
], xmm11
642 ;
# increment i O force
643 addps xmm9
, [rsp
+ nb133_fixO
]
644 addps xmm10
, [rsp
+ nb133_fiyO
]
645 addps xmm11
, [rsp
+ nb133_fizO
]
646 movaps
[rsp
+ nb133_fixO
], xmm9
647 movaps
[rsp
+ nb133_fiyO
], xmm10
648 movaps
[rsp
+ nb133_fizO
], xmm11
649 ;
# finished O LJ interaction.
651 ;
# Second half of the four-j pass: form the j-minus-i displacement vectors for
# the H1, H2 and M sites, run the inverse-square-root refinement for all three,
# scale by qqH/qqH/qqM, accumulate vctot, update the fixH1..fizM accumulators
# and write the combined j forces back to faction.
# NOTE(review): many instructions of the original loop body are missing from
# this listing; confirm against the complete file.
# do H1, H2, and M interactions in parallel.
652 ;
# xmm0-xmm2 still contain j coordinates.
660 subps xmm0
, [rsp
+ nb133_ixH1
]
661 subps xmm1
, [rsp
+ nb133_iyH1
]
662 subps xmm2
, [rsp
+ nb133_izH1
]
663 subps xmm3
, [rsp
+ nb133_ixH2
]
664 subps xmm4
, [rsp
+ nb133_iyH2
]
665 subps xmm5
, [rsp
+ nb133_izH2
]
666 subps xmm6
, [rsp
+ nb133_ixM
]
667 subps xmm7
, [rsp
+ nb133_iyM
]
668 subps xmm8
, [rsp
+ nb133_izM
]
670 movaps
[rsp
+ nb133_dxH1
], xmm0
671 movaps
[rsp
+ nb133_dyH1
], xmm1
672 movaps
[rsp
+ nb133_dzH1
], xmm2
676 movaps
[rsp
+ nb133_dxH2
], xmm3
677 movaps
[rsp
+ nb133_dyH2
], xmm4
678 movaps
[rsp
+ nb133_dzH2
], xmm5
682 movaps
[rsp
+ nb133_dxM
], xmm6
683 movaps
[rsp
+ nb133_dyM
], xmm7
684 movaps
[rsp
+ nb133_dzM
], xmm8
695 ;
# start doing invsqrt for j atoms
704 mulps xmm1
, xmm1 ;
# lu*lu
705 mulps xmm4
, xmm4 ;
# lu*lu
706 mulps xmm7
, xmm7 ;
# lu*lu
708 movaps xmm9
, [rsp
+ nb133_three
]
712 mulps xmm1
, xmm0 ;
# rsq*lu*lu
713 mulps xmm4
, xmm3 ;
# rsq*lu*lu
714 mulps xmm7
, xmm6 ;
# rsq*lu*lu
718 subps xmm11
, xmm7 ;
# 3-rsq*lu*lu
722 mulps xmm11
, xmm8 ;
# lu*(3-rsq*lu*lu)
724 movaps xmm0
, [rsp
+ nb133_half
]
725 mulps xmm9
, xmm0 ;
# rinvH1
726 mulps xmm10
, xmm0 ;
# rinvH2
727 mulps xmm11
, xmm0 ;
# rinvM
730 movaps xmm0
, xmm9 ;
# rinv
733 mulps xmm9
, xmm9 ;
# rinvsq
736 mulps xmm0
, [rsp
+ nb133_qqH
]
737 mulps xmm1
, [rsp
+ nb133_qqH
]
738 mulps xmm2
, [rsp
+ nb133_qqM
]
743 addps xmm0
, [rsp
+ nb133_vctot
]
746 movaps
[rsp
+ nb133_vctot
], xmm0
748 ;
# move j forces to local temp variables
749 mov rdi
, [rbp
+ nb133_faction
]
750 movlps xmm0
, [rdi
+ rax
*4] ;
# jxa jya - -
751 movlps xmm1
, [rdi
+ rcx
*4] ;
# jxc jyc - -
752 movhps xmm0
, [rdi
+ rbx
*4] ;
# jxa jya jxb jyb
753 movhps xmm1
, [rdi
+ rdx
*4] ;
# jxc jyc jxd jyd
755 movss xmm2
, [rdi
+ rax
*4 + 8] ;
# jza - - -
756 movss xmm3
, [rdi
+ rcx
*4 + 8] ;
# jzc - - -
757 movss xmm5
, [rdi
+ rbx
*4 + 8] ;
# jzb - - -
758 movss xmm6
, [rdi
+ rdx
*4 + 8] ;
# jzd - - -
762 shufps xmm2
, xmm3
, 136 ;
# 10001000 => jza jzb jzc jzd
764 ;
# xmm0: jxa jya jxb jyb
765 ;
# xmm1: jxc jyc jxd jyd
766 ;
# xmm2: jza jzb jzc jzd
776 mulps xmm7
, [rsp
+ nb133_dxH1
]
777 mulps xmm8
, [rsp
+ nb133_dyH1
]
778 mulps xmm9
, [rsp
+ nb133_dzH1
]
779 mulps xmm10
, [rsp
+ nb133_dxH2
]
780 mulps xmm11
, [rsp
+ nb133_dyH2
]
781 mulps xmm12
, [rsp
+ nb133_dzH2
]
782 mulps xmm13
, [rsp
+ nb133_dxM
]
783 mulps xmm14
, [rsp
+ nb133_dyM
]
784 mulps xmm15
, [rsp
+ nb133_dzM
]
786 ;
# fetch forces from O interaction
787 movaps xmm3
, [rsp
+ nb133_fjx
]
788 movaps xmm4
, [rsp
+ nb133_fjy
]
789 addps xmm2
, [rsp
+ nb133_fjz
]
794 addps xmm7
, [rsp
+ nb133_fixH1
]
795 addps xmm8
, [rsp
+ nb133_fiyH1
]
796 addps xmm9
, [rsp
+ nb133_fizH1
]
801 addps xmm10
, [rsp
+ nb133_fixH2
]
802 addps xmm11
, [rsp
+ nb133_fiyH2
]
803 addps xmm12
, [rsp
+ nb133_fizH2
]
808 addps xmm13
, [rsp
+ nb133_fixM
]
809 addps xmm14
, [rsp
+ nb133_fiyM
]
810 addps xmm15
, [rsp
+ nb133_fizM
]
812 movaps
[rsp
+ nb133_fixH1
], xmm7
813 movaps
[rsp
+ nb133_fiyH1
], xmm8
814 movaps
[rsp
+ nb133_fizH1
], xmm9
815 movaps
[rsp
+ nb133_fixH2
], xmm10
816 movaps
[rsp
+ nb133_fiyH2
], xmm11
817 movaps
[rsp
+ nb133_fizH2
], xmm12
818 movaps
[rsp
+ nb133_fixM
], xmm13
819 movaps
[rsp
+ nb133_fiyM
], xmm14
820 movaps
[rsp
+ nb133_fizM
], xmm15
822 ;
# xmm3 = fjx , xmm4 = fjy , xmm2=fjz, already updated.
824 unpcklps xmm3
, xmm4 ;
# fjx1 fjy1 fjx2 fjy2
825 unpckhps xmm5
, xmm4 ;
# fjx3 fjy3 fjx4 fjy4
829 movhlps xmm3
, xmm2 ;
# fjzc fjzd
831 movlps
[rdi
+ rax
*4], xmm0
832 movhps
[rdi
+ rbx
*4], xmm0
833 movlps
[rdi
+ rcx
*4], xmm1
834 movhps
[rdi
+ rdx
*4], xmm1
835 movss
[rdi
+ rax
*4 + 8], xmm2
836 movss
[rdi
+ rcx
*4 + 8], xmm3
839 movss
[rdi
+ rbx
*4 + 8], xmm2
840 movss
[rdi
+ rdx
*4 + 8], xmm3
842 ;
# should we do one more iteration?
# NOTE(review): the conditional branches and labels of the original loop
# control are not visible here; only the innerk adjustments and two jmp
# instructions are shown.
843 sub dword ptr
[rsp
+ nb133_innerk
], 4
845 jmp
.nb133_unroll_loop
847 add dword ptr
[rsp
+ nb133_innerk
], 4
849 jmp
.nb133_updateouterdata
# Remainder loop: one j atom per pass (innerjjnr advances by 4 bytes and innerk
# is decremented by 1 at the end). The four i sites are packed into single
# registers as [O H1 H2 M] so that one j atom is handled against all of them;
# the LJ table part uses scalar (ss) instructions only.
# NOTE(review): many instructions and the loop label are missing from this
# listing; confirm against the complete file.
851 mov rdx
, [rsp
+ nb133_innerjjnr
] ;
# pointer to jjnr[k]
853 add qword ptr
[rsp
+ nb133_innerjjnr
], 4
855 xorps xmm4
, xmm4 ;
# clear reg.
856 movss xmm4
, [rsp
+ nb133_iqM
]
857 mov rsi
, [rbp
+ nb133_charge
]
858 movhps xmm4
, [rsp
+ nb133_iqH
] ;
# [qM 0 qH qH]
859 shufps xmm4
, xmm4
, 41 ;
# [0 qH qH qM]
861 movss xmm3
, [rsi
+ rax
*4] ;
# charge in xmm3
864 movaps
[rsp
+ nb133_qqM
], xmm3 ;
# use dummy qq for storage
867 mov rsi
, [rbp
+ nb133_type
]
868 mov ebx
, [rsi
+ rax
*4]
869 mov rsi
, [rbp
+ nb133_vdwparam
]
871 add ebx
, [rsp
+ nb133_ntia
]
872 movlps xmm6
, [rsi
+ rbx
*4]
874 shufps xmm6
, xmm6
, 252 ;
# constant 11111100
875 shufps xmm7
, xmm7
, 253 ;
# constant 11111101
876 movaps
[rsp
+ nb133_c6
], xmm6
877 movaps
[rsp
+ nb133_c12
], xmm7
879 mov rsi
, [rbp
+ nb133_pos
]
880 lea rax
, [rax
+ rax
*2]
882 movss xmm0
, [rsp
+ nb133_ixO
]
883 movss xmm1
, [rsp
+ nb133_iyO
]
884 movss xmm2
, [rsp
+ nb133_izO
]
885 movss xmm3
, [rsp
+ nb133_ixH1
]
886 movss xmm4
, [rsp
+ nb133_iyH1
]
887 movss xmm5
, [rsp
+ nb133_izH1
]
888 unpcklps xmm0
, [rsp
+ nb133_ixH2
] ;
# ixO ixH2 - -
889 unpcklps xmm1
, [rsp
+ nb133_iyH2
] ;
# iyO iyH2 - -
890 unpcklps xmm2
, [rsp
+ nb133_izH2
] ;
# izO izH2 - -
891 unpcklps xmm3
, [rsp
+ nb133_ixM
] ;
# ixH1 ixM - -
892 unpcklps xmm4
, [rsp
+ nb133_iyM
] ;
# iyH1 iyM - -
893 unpcklps xmm5
, [rsp
+ nb133_izM
] ;
# izH1 izM - -
894 unpcklps xmm0
, xmm3 ;
# ixO ixH1 ixH2 ixM
895 unpcklps xmm1
, xmm4 ;
# same for y
896 unpcklps xmm2
, xmm5 ;
# same for z
898 ;
# move j coords to xmm0-xmm2
899 movss xmm3
, [rsi
+ rax
*4]
900 movss xmm4
, [rsi
+ rax
*4 + 4]
901 movss xmm5
, [rsi
+ rax
*4 + 8]
910 ;
# use O distances for storage
911 movaps
[rsp
+ nb133_dxO
], xmm3
912 movaps
[rsp
+ nb133_dyO
], xmm4
913 movaps
[rsp
+ nb133_dzO
], xmm5
924 ;
# lookup seed in xmm5
927 movaps xmm1
, [rsp
+ nb133_three
]
928 mulps xmm5
, xmm4 ;
# rsq*lu*lu
929 movaps xmm0
, [rsp
+ nb133_half
]
930 subps xmm1
, xmm5 ;
# 3-rsq*lu*lu
932 mulps xmm0
, xmm1 ;
# xmm0=rinv, xmm4=rsq
934 ;
# LJ table interaction
936 mulps xmm4
, [rsp
+ nb133_tsc
] ;
# rtab
941 movss xmm1
, xmm4 ;
# xmm1=eps
943 mulss xmm2
, xmm2 ;
# xmm2=eps2
948 mov rsi
, [rbp
+ nb133_VFtab
]
952 movlps xmm5
, [rsi
+ rax
*4]
954 shufps xmm4
, xmm7
, 136 ;
# constant 10001000
955 shufps xmm5
, xmm7
, 221 ;
# constant 11011101
957 movlps xmm7
, [rsi
+ rax
*4 + 8]
959 shufps xmm6
, xmm3
, 136 ;
# constant 10001000
960 shufps xmm7
, xmm3
, 221 ;
# constant 11011101
961 ;
# dispersion table ready, in xmm4-xmm7
963 mulss xmm6
, xmm1 ;
# xmm6=Geps
964 mulss xmm7
, xmm2 ;
# xmm7=Heps2
966 addss xmm5
, xmm7 ;
# xmm5=Fp
967 mulss xmm7
, [rsp
+ nb133_two
] ;
# two*Heps2
969 addss xmm7
, xmm5 ;
# xmm7=FF
970 mulss xmm5
, xmm1 ;
# xmm5=eps*Fp
971 addss xmm5
, xmm4 ;
# xmm5=VV
973 movss xmm4
, [rsp
+ nb133_c6
]
974 mulss xmm7
, xmm4 ;
# fijD
975 mulss xmm5
, xmm4 ;
# Vvdw6
976 mulss xmm7
, [rsp
+ nb133_tsc
]
977 ;
# put scalar force on stack Update Vvdwtot directly
978 addss xmm5
, [rsp
+ nb133_Vvdwtot
]
979 movss
[rsp
+ nb133_fstmp
], xmm7
980 movss
[rsp
+ nb133_Vvdwtot
], xmm5
983 movlps xmm5
, [rsi
+ rax
*4 + 16]
985 shufps xmm4
, xmm7
, 136 ;
# constant 10001000
986 shufps xmm5
, xmm7
, 221 ;
# constant 11011101
988 movlps xmm7
, [rsi
+ rax
*4 + 24]
990 shufps xmm6
, xmm3
, 136 ;
# constant 10001000
991 shufps xmm7
, xmm3
, 221 ;
# constant 11011101
992 ;
# table ready, in xmm4-xmm7
993 mulss xmm6
, xmm1 ;
# xmm6=Geps
994 mulss xmm7
, xmm2 ;
# xmm7=Heps2
996 addss xmm5
, xmm7 ;
# xmm5=Fp
997 mulss xmm7
, [rsp
+ nb133_two
] ;
# two*Heps2
999 addss xmm7
, xmm5 ;
# xmm7=FF
1000 mulss xmm5
, xmm1 ;
# xmm5=eps*Fp
1001 addss xmm5
, xmm4 ;
# xmm5=VV
1003 movss xmm4
, [rsp
+ nb133_c12
]
1004 mulss xmm7
, xmm4 ;
# fijR
1005 mulss xmm5
, xmm4 ;
# Vvdw12
1006 mulss xmm7
, [rsp
+ nb133_tsc
]
1007 addss xmm7
, [rsp
+ nb133_fstmp
]
1008 movss
[rsp
+ nb133_fstmp
], xmm7
1009 addss xmm5
, [rsp
+ nb133_Vvdwtot
]
1010 movss
[rsp
+ nb133_Vvdwtot
], xmm5
1015 mulps xmm4
, [rsp
+ nb133_qqM
]
1018 subss xmm4
, [rsp
+ nb133_fstmp
]
1021 addps xmm2
, [rsp
+ nb133_vctot
]
1022 movaps
[rsp
+ nb133_vctot
], xmm2
1024 movaps xmm0
, [rsp
+ nb133_dxO
]
1025 movaps xmm1
, [rsp
+ nb133_dyO
]
1026 movaps xmm2
, [rsp
+ nb133_dzO
]
1030 mulps xmm2
, xmm4 ;
# xmm0-xmm2 now contains tx-tz (partial force)
1032 movss xmm3
, [rsp
+ nb133_fixO
]
1033 movss xmm4
, [rsp
+ nb133_fiyO
]
1034 movss xmm5
, [rsp
+ nb133_fizO
]
1038 movss
[rsp
+ nb133_fixO
], xmm3
1039 movss
[rsp
+ nb133_fiyO
], xmm4
1040 movss
[rsp
+ nb133_fizO
], xmm5 ;
# updated the O force now do the H's
1045 shufps xmm3
, xmm3
, 0x39 ;
# shift right
1046 shufps xmm4
, xmm4
, 0x39
1047 shufps xmm5
, xmm5
, 0x39
1048 addss xmm3
, [rsp
+ nb133_fixH1
]
1049 addss xmm4
, [rsp
+ nb133_fiyH1
]
1050 addss xmm5
, [rsp
+ nb133_fizH1
]
1051 movss
[rsp
+ nb133_fixH1
], xmm3
1052 movss
[rsp
+ nb133_fiyH1
], xmm4
1053 movss
[rsp
+ nb133_fizH1
], xmm5 ;
# updated the H1 force
1055 shufps xmm3
, xmm3
, 0x39
1056 shufps xmm4
, xmm4
, 0x39
1057 shufps xmm5
, xmm5
, 0x39
1058 addss xmm3
, [rsp
+ nb133_fixH2
]
1059 addss xmm4
, [rsp
+ nb133_fiyH2
]
1060 addss xmm5
, [rsp
+ nb133_fizH2
]
1061 movss
[rsp
+ nb133_fixH2
], xmm3
1062 movss
[rsp
+ nb133_fiyH2
], xmm4
1063 movss
[rsp
+ nb133_fizH2
], xmm5 ;
# updated the H2 force
1065 mov rdi
, [rbp
+ nb133_faction
]
1066 shufps xmm3
, xmm3
, 0x39
1067 shufps xmm4
, xmm4
, 0x39
1068 shufps xmm5
, xmm5
, 0x39
1069 addss xmm3
, [rsp
+ nb133_fixM
]
1070 addss xmm4
, [rsp
+ nb133_fiyM
]
1071 addss xmm5
, [rsp
+ nb133_fizM
]
1072 movss
[rsp
+ nb133_fixM
], xmm3
1073 movss
[rsp
+ nb133_fiyM
], xmm4
1074 movss
[rsp
+ nb133_fizM
], xmm5 ;
# updated the M force
1076 ;
# the fj's - move in from mem start by acc. tx/ty/tz in xmm0, xmm1
1077 movlps xmm6
, [rdi
+ rax
*4]
1078 movss xmm7
, [rdi
+ rax
*4 + 8]
1090 shufps xmm3
, xmm3
, 0x39 ;
# shift right
1091 shufps xmm4
, xmm4
, 0x39
1092 shufps xmm5
, xmm5
, 0x39
1096 unpcklps xmm0
, xmm1 ;
# x,y sum in xmm0, z sum in xmm2
1101 movlps
[rdi
+ rax
*4], xmm6
1102 movss
[rdi
+ rax
*4 + 8], xmm7
1104 dec dword ptr
[rsp
+ nb133_innerk
]
1105 jz
.nb133_updateouterdata
1107 .nb133_updateouterdata:
# Write-back for the current i molecule: for each of the O, H1, H2 and M sites
# the four-wide force accumulators are reduced and the three floats at
# faction[ii3 + 0..2], [+3..5], [+6..8], [+9..11] are loaded and stored back;
# fshift[is3] is updated the same way; the vctot and Vvdwtot sums are added
# into Vc[gid[n]] and Vvdw[gid[n]]. After that come loop control and the
# function exit, which reloads xmm7-xmm15 from the stack.
# NOTE(review): the add instructions, branches, labels and the final
# stack-unwind/ret of the original file are missing from this listing;
# confirm against the complete file.
1108 mov ecx
, [rsp
+ nb133_ii3
]
1109 mov rdi
, [rbp
+ nb133_faction
]
1110 mov rsi
, [rbp
+ nb133_fshift
]
1111 mov edx
, [rsp
+ nb133_is3
]
1113 ;
# accumulate Oi forces in xmm0, xmm1, xmm2
1114 movaps xmm0
, [rsp
+ nb133_fixO
]
1115 movaps xmm1
, [rsp
+ nb133_fiyO
]
1116 movaps xmm2
, [rsp
+ nb133_fizO
]
1123 addps xmm2
, xmm5 ;
# sum is in 1/2 in xmm0-xmm2
1129 shufps xmm3
, xmm3
, 1
1130 shufps xmm4
, xmm4
, 1
1131 shufps xmm5
, xmm5
, 1
1134 addss xmm2
, xmm5 ;
# xmm0-xmm2 has single force in pos0
1136 ;
# increment i force
1137 movss xmm3
, [rdi
+ rcx
*4]
1138 movss xmm4
, [rdi
+ rcx
*4 + 4]
1139 movss xmm5
, [rdi
+ rcx
*4 + 8]
1143 movss
[rdi
+ rcx
*4], xmm3
1144 movss
[rdi
+ rcx
*4 + 4], xmm4
1145 movss
[rdi
+ rcx
*4 + 8], xmm5
1147 ;
# accumulate force in xmm6/xmm7 for fshift
1151 shufps xmm6
, xmm6
, 8 ;
# constant 00001000
1153 ;
# accumulate H1i forces in xmm0, xmm1, xmm2
1154 movaps xmm0
, [rsp
+ nb133_fixH1
]
1155 movaps xmm1
, [rsp
+ nb133_fiyH1
]
1156 movaps xmm2
, [rsp
+ nb133_fizH1
]
1163 addps xmm2
, xmm5 ;
# sum is in 1/2 in xmm0-xmm2
1169 shufps xmm3
, xmm3
, 1
1170 shufps xmm4
, xmm4
, 1
1171 shufps xmm5
, xmm5
, 1
1174 addss xmm2
, xmm5 ;
# xmm0-xmm2 has single force in pos0
1176 ;
# increment i force
1177 movss xmm3
, [rdi
+ rcx
*4 + 12]
1178 movss xmm4
, [rdi
+ rcx
*4 + 16]
1179 movss xmm5
, [rdi
+ rcx
*4 + 20]
1183 movss
[rdi
+ rcx
*4 + 12], xmm3
1184 movss
[rdi
+ rcx
*4 + 16], xmm4
1185 movss
[rdi
+ rcx
*4 + 20], xmm5
1187 ;
# accumulate force in xmm6/xmm7 for fshift
1190 shufps xmm0
, xmm0
, 8 ;
# constant 00001000
1193 ;
# accumulate H2i forces in xmm0, xmm1, xmm2
1194 movaps xmm0
, [rsp
+ nb133_fixH2
]
1195 movaps xmm1
, [rsp
+ nb133_fiyH2
]
1196 movaps xmm2
, [rsp
+ nb133_fizH2
]
1203 addps xmm2
, xmm5 ;
# sum is in 1/2 in xmm0-xmm2
1209 shufps xmm3
, xmm3
, 1
1210 shufps xmm4
, xmm4
, 1
1211 shufps xmm5
, xmm5
, 1
1214 addss xmm2
, xmm5 ;
# xmm0-xmm2 has single force in pos0
1216 ;
# increment i force
1217 movss xmm3
, [rdi
+ rcx
*4 + 24]
1218 movss xmm4
, [rdi
+ rcx
*4 + 28]
1219 movss xmm5
, [rdi
+ rcx
*4 + 32]
1223 movss
[rdi
+ rcx
*4 + 24], xmm3
1224 movss
[rdi
+ rcx
*4 + 28], xmm4
1225 movss
[rdi
+ rcx
*4 + 32], xmm5
1227 ;
# accumulate force in xmm6/xmm7 for fshift
1230 shufps xmm0
, xmm0
, 8 ;
# constant 00001000
1233 ;
# accumulate Mi forces in xmm0, xmm1, xmm2
1234 movaps xmm0
, [rsp
+ nb133_fixM
]
1235 movaps xmm1
, [rsp
+ nb133_fiyM
]
1236 movaps xmm2
, [rsp
+ nb133_fizM
]
1243 addps xmm2
, xmm5 ;
# sum is in 1/2 in xmm0-xmm2
1249 shufps xmm3
, xmm3
, 1
1250 shufps xmm4
, xmm4
, 1
1251 shufps xmm5
, xmm5
, 1
1254 addss xmm2
, xmm5 ;
# xmm0-xmm2 has single force in pos0
1256 ;
# increment i force
1257 movss xmm3
, [rdi
+ rcx
*4 + 36]
1258 movss xmm4
, [rdi
+ rcx
*4 + 40]
1259 movss xmm5
, [rdi
+ rcx
*4 + 44]
1263 movss
[rdi
+ rcx
*4 + 36], xmm3
1264 movss
[rdi
+ rcx
*4 + 40], xmm4
1265 movss
[rdi
+ rcx
*4 + 44], xmm5
1267 ;
# accumulate force in xmm6/xmm7 for fshift
1270 shufps xmm0
, xmm0
, 8 ;
# constant 00001000
1273 ;
# increment fshift force
1274 movlps xmm3
, [rsi
+ rdx
*4]
1275 movss xmm4
, [rsi
+ rdx
*4 + 8]
1278 movlps
[rsi
+ rdx
*4], xmm3
1279 movss
[rsi
+ rdx
*4 + 8], xmm4
1282 mov esi
, [rsp
+ nb133_n
]
1283 ;
# get group index for i particle
1284 mov rdx
, [rbp
+ nb133_gid
] ;
# base of gid[]
1285 mov edx
, [rdx
+ rsi
*4] ;
# ggid=gid[n]
1287 ;
# accumulate total potential energy and update it
1288 movaps xmm7
, [rsp
+ nb133_vctot
]
1291 addps xmm7
, xmm6 ;
# pos 0-1 in xmm7 have the sum now
1293 shufps xmm6
, xmm6
, 1
1296 ;
# add earlier value from mem
1297 mov rax
, [rbp
+ nb133_Vc
]
1298 addss xmm7
, [rax
+ rdx
*4]
1300 movss
[rax
+ rdx
*4], xmm7
1302 ;
# accumulate total lj energy and update it
1303 movaps xmm7
, [rsp
+ nb133_Vvdwtot
]
1306 addps xmm7
, xmm6 ;
# pos 0-1 in xmm7 have the sum now
1308 shufps xmm6
, xmm6
, 1
1311 ;
# add earlier value from mem
1312 mov rax
, [rbp
+ nb133_Vvdw
]
1313 addss xmm7
, [rax
+ rdx
*4]
1315 movss
[rax
+ rdx
*4], xmm7
1318 mov ecx
, [rsp
+ nb133_nn1
]
1319 ;
# esi already loaded with n
1324 ;
# not last, iterate outer loop once more!
1325 mov
[rsp
+ nb133_n
], esi
1328 ;
# check if more outer neighborlists remain
1329 mov ecx
, [rsp
+ nb133_nri
]
1330 ;
# esi already loaded with n above
1333 ;
# non-zero, do one more workunit
1334 jmp
.nb133_threadloop
1336 mov eax
, [rsp
+ nb133_nouter
]
1337 mov ebx
, [rsp
+ nb133_ninner
]
1338 mov rcx
, [rbp
+ nb133_outeriter
]
1339 mov rdx
, [rbp
+ nb133_inneriter
]
1346 ;
# Restore xmm registers from stack
1348 movaps xmm7
, [rsp
+ 16 ]
1349 movaps xmm8
, [rsp
+ 32 ]
1350 movaps xmm9
, [rsp
+ 48 ]
1351 movaps xmm10
, [rsp
+ 64 ]
1352 movaps xmm11
, [rsp
+ 80 ]
1353 movaps xmm12
, [rsp
+ 96 ]
1354 movaps xmm13
, [rsp
+ 112]
1355 movaps xmm14
, [rsp
+ 128]
1356 movaps xmm15
, [rsp
+ 144]
1358 ;
# Reset pointers after restoring xmm6-15
1377 .globl nb_kernel133nf_x86_64_sse
1378 .globl _nb_kernel133nf_x86_64_sse
1379 nb_kernel133nf_x86_64_sse
:
1380 _nb_kernel133nf_x86_64_sse
:
1381 ;
# Room for return address and rbp (16 bytes)
1382 .equiv nb133nf_fshift, 16
1383 .equiv nb133nf_gid, 24
1384 .equiv nb133nf_pos, 32
1385 .equiv nb133nf_faction, 40
1386 .equiv nb133nf_charge, 48
1387 .equiv nb133nf_p_facel, 56
1388 .equiv nb133nf_argkrf, 64
1389 .equiv nb133nf_argcrf, 72
1390 .equiv nb133nf_Vc, 80
1391 .equiv nb133nf_type, 88
1392 .equiv nb133nf_p_ntype, 96
1393 .equiv nb133nf_vdwparam, 104
1394 .equiv nb133nf_Vvdw, 112
1395 .equiv nb133nf_p_tabscale, 120
1396 .equiv nb133nf_VFtab, 128
1397 .equiv nb133nf_invsqrta, 136
1398 .equiv nb133nf_dvda, 144
1399 .equiv nb133nf_p_gbtabscale, 152
1400 .equiv nb133nf_GBtab, 160
1401 .equiv nb133nf_p_nthreads, 168
1402 .equiv nb133nf_count, 176
1403 .equiv nb133nf_mtx, 184
1404 .equiv nb133nf_outeriter, 192
1405 .equiv nb133nf_inneriter, 200
1406 .equiv nb133nf_work, 208
1407 ;
# stack offsets for local variables
1408 ;
# bottom of stack is cache-aligned for sse use
1409 .equiv nb133nf_ixO, 0
1410 .equiv nb133nf_iyO, 16
1411 .equiv nb133nf_izO, 32
1412 .equiv nb133nf_ixH1, 48
1413 .equiv nb133nf_iyH1, 64
1414 .equiv nb133nf_izH1, 80
1415 .equiv nb133nf_ixH2, 96
1416 .equiv nb133nf_iyH2, 112
1417 .equiv nb133nf_izH2, 128
1418 .equiv nb133nf_ixM, 144
1419 .equiv nb133nf_iyM, 160
1420 .equiv nb133nf_izM, 176
1421 .equiv nb133nf_iqM, 192
1422 .equiv nb133nf_iqH, 208
1423 .equiv nb133nf_qqM, 224
1424 .equiv nb133nf_qqH, 240
1425 .equiv nb133nf_rinvH1, 256
1426 .equiv nb133nf_rinvH2, 272
1427 .equiv nb133nf_rinvM, 288
1428 .equiv nb133nf_tsc, 304
1429 .equiv nb133nf_c6, 320
1430 .equiv nb133nf_c12, 336
1431 .equiv nb133nf_krf, 352
1432 .equiv nb133nf_crf, 368
1433 .equiv nb133nf_krsqH1, 384
1434 .equiv nb133nf_krsqH2, 400
1435 .equiv nb133nf_krsqM, 416
1436 .equiv nb133nf_vctot, 432
1437 .equiv nb133nf_Vvdwtot, 448
1438 .equiv nb133nf_half, 464
1439 .equiv nb133nf_three, 480
1440 .equiv nb133nf_nri, 496
1441 .equiv nb133nf_iinr, 504
1442 .equiv nb133nf_jindex, 512
1443 .equiv nb133nf_jjnr, 520
1444 .equiv nb133nf_shift, 528
1445 .equiv nb133nf_shiftvec, 536
1446 .equiv nb133nf_facel, 544
1447 .equiv nb133nf_innerjjnr, 552
1448 .equiv nb133nf_is3, 560
1449 .equiv nb133nf_ii3, 564
1450 .equiv nb133nf_ntia, 568
1451 .equiv nb133nf_innerk, 572
1452 .equiv nb133nf_n, 576
1453 .equiv nb133nf_nn1, 580
1454 .equiv nb133nf_nouter, 584
1455 .equiv nb133nf_ninner, 588
1460 ;
# Push integer registers on stack
1469 ;
# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
1472 ;
# Save xmm registers to stack
1474 movaps
[rsp
+ 16 ], xmm7
1475 movaps
[rsp
+ 32 ], xmm8
1476 movaps
[rsp
+ 48 ], xmm9
1477 movaps
[rsp
+ 64 ], xmm10
1478 movaps
[rsp
+ 80 ], xmm11
1479 movaps
[rsp
+ 96 ], xmm12
1480 movaps
[rsp
+ 112], xmm13
1481 movaps
[rsp
+ 128], xmm14
1482 movaps
[rsp
+ 144], xmm15
1485 sub rsp
, 592 ;
# local variable stack space (n*16+8)
1486 ;
.if 0 # block below only read by NASM - special calling convention on win64
1487 %ifidn __OUTPUT_FORMAT__
, win64
1488 ;
# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
1490 ;
# Adjust stack pointer for different alignment
1491 ;
# Move around arguments to fit AMD64 convention below
1492 ;
# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
1493 ;
# win64 passes args in: rcx,rdx,r8,r9 + stack
1501 ;
.endif # end NASM- and win64-specific block
1503 ;
# zero 32-bit iteration counters
1505 mov
[rsp
+ nb133nf_nouter
], eax
1506 mov
[rsp
+ nb133nf_ninner
], eax
1509 mov
[rsp
+ nb133nf_nri
], edi
1510 mov
[rsp
+ nb133nf_iinr
], rsi
1511 mov
[rsp
+ nb133nf_jindex
], rdx
1512 mov
[rsp
+ nb133nf_jjnr
], rcx
1513 mov
[rsp
+ nb133nf_shift
], r8
1514 mov
[rsp
+ nb133nf_shiftvec
], r9
1515 mov rsi
, [rbp
+ nb133nf_p_facel
]
1517 movss
[rsp
+ nb133nf_facel
], xmm0
1519 mov rax
, [rbp
+ nb133nf_p_tabscale
]
1521 shufps xmm3
, xmm3
, 0
1522 movaps
[rsp
+ nb133nf_tsc
], xmm3
1524 ;
# create constant floating-point factors on stack
1525 mov eax
, 0x3f000000 ;
# half in IEEE (hex)
1526 mov
[rsp
+ nb133nf_half
], eax
1527 movss xmm1
, [rsp
+ nb133nf_half
]
1528 shufps xmm1
, xmm1
, 0 ;
# splat to all elements
1530 addps xmm2
, xmm2 ;
# one
1532 addps xmm2
, xmm2 ;
# two
1533 addps xmm3
, xmm2 ;
# three
1534 movaps
[rsp
+ nb133nf_half
], xmm1
1535 movaps
[rsp
+ nb133nf_three
], xmm3
1537 ;
# assume we have at least one i particle - start directly
1538 mov rcx
, [rsp
+ nb133nf_iinr
] ;
# rcx = pointer into iinr[]
1539 mov ebx
, [rcx
] ;
# ebx =ii
1541 mov rdx
, [rbp
+ nb133nf_charge
]
1542 movss xmm4
, [rdx
+ rbx
*4 + 4]
1543 movss xmm3
, [rdx
+ rbx
*4 + 12]
1544 mov rsi
, [rbp
+ nb133nf_p_facel
]
1546 movss xmm5
, [rsp
+ nb133nf_facel
]
1550 shufps xmm3
, xmm3
, 0
1551 shufps xmm4
, xmm4
, 0
1552 movaps
[rsp
+ nb133nf_iqM
], xmm3
1553 movaps
[rsp
+ nb133nf_iqH
], xmm4
1555 mov rdx
, [rbp
+ nb133nf_type
]
1556 mov ecx
, [rdx
+ rbx
*4]
1558 mov rdi
, [rbp
+ nb133nf_p_ntype
]
1559 imul ecx
, [rdi
] ;
# rcx = ntia = 2*ntype*type[ii0]
1560 mov
[rsp
+ nb133nf_ntia
], ecx
1562 .nb133nf_threadloop:
1563 mov rsi
, [rbp
+ nb133nf_count
] ;
# pointer to sync counter
1566 mov ebx
, eax ;
# ebx=*count=nn0
1567 add ebx
, 1 ;
# ebx=nn1=nn0+1
1569 cmpxchg
[rsi
], ebx ;
# write nn1 to *counter,
1570 ;
# if it hasnt changed.
1571 ;
# or reread *counter to eax.
1572 pause ;
# -> better p4 performance
1573 jnz
.nb133nf_spinlock
1575 ;
# if(nn1>nri) nn1=nri
1576 mov ecx
, [rsp
+ nb133nf_nri
]
1579 cmovle ebx
, edx ;
# if(nn1>nri) nn1=nri
1580 ;
# Cleared the spinlock if we got here.
1581 ;
# eax contains nn0, ebx contains nn1.
1582 mov
[rsp
+ nb133nf_n
], eax
1583 mov
[rsp
+ nb133nf_nn1
], ebx
1584 sub ebx
, eax ;
# calc number of outer lists
1585 mov esi
, eax ;
# copy n to esi
1586 jg
.nb133nf_outerstart
1589 .nb133nf_outerstart:
1590 ;
# ebx contains number of outer iterations
1591 add ebx
, [rsp
+ nb133nf_nouter
]
1592 mov
[rsp
+ nb133nf_nouter
], ebx
1595 mov rax
, [rsp
+ nb133nf_shift
] ;
# rax = pointer into shift[]
1596 mov ebx
, [rax
+ rsi
*4] ;
# ebx=shift[n]
1598 lea rbx
, [rbx
+ rbx
*2] ;
# rbx=3*is
1599 mov
[rsp
+ nb133nf_is3
],ebx ;
# store is3
1601 mov rax
, [rsp
+ nb133nf_shiftvec
] ;
# rax = base of shiftvec[]
1603 movss xmm0
, [rax
+ rbx
*4]
1604 movss xmm1
, [rax
+ rbx
*4 + 4]
1605 movss xmm2
, [rax
+ rbx
*4 + 8]
1607 mov rcx
, [rsp
+ nb133nf_iinr
] ;
# rcx = pointer into iinr[]
1608 mov ebx
, [rcx
+ rsi
*4] ;
# ebx =ii
1616 lea rbx
, [rbx
+ rbx
*2] ;
# rbx = 3*ii=ii3
1617 mov rax
, [rbp
+ nb133nf_pos
] ;
# rax = base of pos[]
1618 mov
[rsp
+ nb133nf_ii3
], ebx
1620 addss xmm3
, [rax
+ rbx
*4] ;
# ox
1621 addss xmm4
, [rax
+ rbx
*4 + 4] ;
# oy
1622 addss xmm5
, [rax
+ rbx
*4 + 8] ;
# oz
1623 addss xmm6
, [rax
+ rbx
*4 + 12] ;
# h1x
1624 addss xmm7
, [rax
+ rbx
*4 + 16] ;
# h1y
1625 shufps xmm3
, xmm3
, 0
1626 shufps xmm4
, xmm4
, 0
1627 shufps xmm5
, xmm5
, 0
1628 shufps xmm6
, xmm6
, 0
1629 shufps xmm7
, xmm7
, 0
1630 movaps
[rsp
+ nb133nf_ixO
], xmm3
1631 movaps
[rsp
+ nb133nf_iyO
], xmm4
1632 movaps
[rsp
+ nb133nf_izO
], xmm5
1633 movaps
[rsp
+ nb133nf_ixH1
], xmm6
1634 movaps
[rsp
+ nb133nf_iyH1
], xmm7
1640 addss xmm6
, [rax
+ rbx
*4 + 20] ;
# h1z
1641 addss xmm0
, [rax
+ rbx
*4 + 24] ;
# h2x
1642 addss xmm1
, [rax
+ rbx
*4 + 28] ;
# h2y
1643 addss xmm2
, [rax
+ rbx
*4 + 32] ;
# h2z
1644 addss xmm3
, [rax
+ rbx
*4 + 36] ;
# mx
1645 addss xmm4
, [rax
+ rbx
*4 + 40] ;
# my
1646 addss xmm5
, [rax
+ rbx
*4 + 44] ;
# mz
1648 shufps xmm6
, xmm6
, 0
1649 shufps xmm0
, xmm0
, 0
1650 shufps xmm1
, xmm1
, 0
1651 shufps xmm2
, xmm2
, 0
1652 shufps xmm3
, xmm3
, 0
1653 shufps xmm4
, xmm4
, 0
1654 shufps xmm5
, xmm5
, 0
1655 movaps
[rsp
+ nb133nf_izH1
], xmm6
1656 movaps
[rsp
+ nb133nf_ixH2
], xmm0
1657 movaps
[rsp
+ nb133nf_iyH2
], xmm1
1658 movaps
[rsp
+ nb133nf_izH2
], xmm2
1659 movaps
[rsp
+ nb133nf_ixM
], xmm3
1660 movaps
[rsp
+ nb133nf_iyM
], xmm4
1661 movaps
[rsp
+ nb133nf_izM
], xmm5
1665 movaps
[rsp
+ nb133nf_vctot
], xmm4
1666 movaps
[rsp
+ nb133nf_Vvdwtot
], xmm4
1668 mov rax
, [rsp
+ nb133nf_jindex
]
1669 mov ecx
, [rax
+ rsi
*4] ;
# jindex[n]
1670 mov edx
, [rax
+ rsi
*4 + 4] ;
# jindex[n+1]
1671 sub edx
, ecx ;
# number of innerloop atoms
1673 mov rsi
, [rbp
+ nb133nf_pos
]
1674 mov rax
, [rsp
+ nb133nf_jjnr
]
1677 mov
[rsp
+ nb133nf_innerjjnr
], rax ;
# pointer to jjnr[nj0]
1680 add ecx
, [rsp
+ nb133nf_ninner
]
1681 mov
[rsp
+ nb133nf_ninner
], ecx
1683 mov
[rsp
+ nb133nf_innerk
], edx ;
# number of innerloop atoms
1684 jge
.nb133nf_unroll_loop
1685 jmp
.nb133nf_odd_inner
1686 .nb133nf_unroll_loop:
1687 ;
# quad-unroll innerloop here
1688 mov rdx
, [rsp
+ nb133nf_innerjjnr
] ;
# pointer to jjnr[k]
1692 mov edx
, [rdx
+ 12] ;
# eax-edx=jnr1-4
1694 add qword ptr
[rsp
+ nb133nf_innerjjnr
], 16 ;
# advance pointer (unrolled 4)
1696 mov rsi
, [rbp
+ nb133nf_charge
] ;
# base of charge[]
1698 movss xmm3
, [rsi
+ rax
*4]
1699 movss xmm4
, [rsi
+ rcx
*4]
1700 movss xmm6
, [rsi
+ rbx
*4]
1701 movss xmm7
, [rsi
+ rdx
*4]
1703 shufps xmm3
, xmm6
, 0
1704 shufps xmm4
, xmm7
, 0
1705 shufps xmm3
, xmm4
, 136 ;
# constant 10001000 ;# all charges in xmm3
1706 movaps xmm4
, xmm3 ;
# and in xmm4
1707 mulps xmm3
, [rsp
+ nb133nf_iqM
]
1708 mulps xmm4
, [rsp
+ nb133nf_iqH
]
1710 movd mm0
, eax ;
# use mmx registers as temp storage
1715 movaps
[rsp
+ nb133nf_qqM
], xmm3
1716 movaps
[rsp
+ nb133nf_qqH
], xmm4
1718 mov rsi
, [rbp
+ nb133nf_type
]
1719 mov eax
, [rsi
+ rax
*4]
1720 mov ebx
, [rsi
+ rbx
*4]
1721 mov ecx
, [rsi
+ rcx
*4]
1722 mov edx
, [rsi
+ rdx
*4]
1723 mov rsi
, [rbp
+ nb133nf_vdwparam
]
1728 mov edi
, [rsp
+ nb133nf_ntia
]
1734 movlps xmm6
, [rsi
+ rax
*4]
1735 movlps xmm7
, [rsi
+ rcx
*4]
1736 movhps xmm6
, [rsi
+ rbx
*4]
1737 movhps xmm7
, [rsi
+ rdx
*4]
1740 shufps xmm4
, xmm7
, 136 ;
# constant 10001000
1741 shufps xmm6
, xmm7
, 221 ;
# constant 11011101
1748 movaps
[rsp
+ nb133nf_c6
], xmm4
1749 movaps
[rsp
+ nb133nf_c12
], xmm6
1751 mov rsi
, [rbp
+ nb133nf_pos
] ;
# base of pos[]
1753 lea rax
, [rax
+ rax
*2] ;
# replace jnr with j3
1754 lea rbx
, [rbx
+ rbx
*2]
1755 lea rcx
, [rcx
+ rcx
*2] ;
# replace jnr with j3
1756 lea rdx
, [rdx
+ rdx
*2]
1758 ;
# move four coordinates to xmm0-xmm2
1759 movlps xmm4
, [rsi
+ rax
*4]
1760 movlps xmm5
, [rsi
+ rcx
*4]
1761 movss xmm2
, [rsi
+ rax
*4 + 8]
1762 movss xmm6
, [rsi
+ rcx
*4 + 8]
1764 movhps xmm4
, [rsi
+ rbx
*4]
1765 movhps xmm5
, [rsi
+ rdx
*4]
1767 movss xmm0
, [rsi
+ rbx
*4 + 8]
1768 movss xmm1
, [rsi
+ rdx
*4 + 8]
1770 shufps xmm2
, xmm0
, 0
1771 shufps xmm6
, xmm1
, 0
1776 shufps xmm2
, xmm6
, 136 ;
# constant 10001000
1778 shufps xmm0
, xmm5
, 136 ;
# constant 10001000
1779 shufps xmm1
, xmm5
, 221 ;
# constant 11011101
1781 ;
# move ixO-izO to xmm4-xmm6
1782 movaps xmm4
, [rsp
+ nb133nf_ixO
]
1783 movaps xmm5
, [rsp
+ nb133nf_iyO
]
1784 movaps xmm6
, [rsp
+ nb133nf_izO
]
1800 ;
# move ixH1-izH1 to xmm4-xmm6
1801 movaps xmm4
, [rsp
+ nb133nf_ixH1
]
1802 movaps xmm5
, [rsp
+ nb133nf_iyH1
]
1803 movaps xmm6
, [rsp
+ nb133nf_izH1
]
1818 ;
# move ixH2-izH2 to xmm3-xmm5
1819 movaps xmm3
, [rsp
+ nb133nf_ixH2
]
1820 movaps xmm4
, [rsp
+ nb133nf_iyH2
]
1821 movaps xmm5
, [rsp
+ nb133nf_izH2
]
1835 ;
# move ixM-izM to xmm2-xmm4
1836 movaps xmm3
, [rsp
+ nb133nf_iyM
]
1837 movaps xmm4
, [rsp
+ nb133nf_izM
]
1840 movaps xmm2
, [rsp
+ nb133nf_ixM
]
1849 ;
# rsqM in xmm4, rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7
1851 ;
# rsqH1 - seed in xmm2
1855 movaps xmm0
, [rsp
+ nb133nf_three
]
1856 mulps xmm2
, xmm6 ;
# rsq*lu*lu
1857 subps xmm0
, xmm2 ;
# 3-rsq*lu*lu
1858 mulps xmm0
, xmm3 ;
# lu*(3-rsq*lu*lu)
1859 mulps xmm0
, [rsp
+ nb133nf_half
]
1860 movaps
[rsp
+ nb133nf_rinvH1
], xmm0 ;
# rinvH1
1862 ;
# rsqH2 - seed to xmm2
1866 movaps xmm0
, [rsp
+ nb133nf_three
]
1867 mulps xmm2
, xmm5 ;
# rsq*lu*lu
1868 subps xmm0
, xmm2 ;
# 3-rsq*lu*lu
1869 mulps xmm0
, xmm3 ;
# lu*(3-rsq*lu*lu)
1870 mulps xmm0
, [rsp
+ nb133nf_half
]
1871 movaps
[rsp
+ nb133nf_rinvH2
], xmm0 ;
# rinvH2
1873 ;
# rsqM - seed to xmm2
1877 movaps xmm0
, [rsp
+ nb133nf_three
]
1878 mulps xmm2
, xmm4 ;
# rsq*lu*lu
1879 subps xmm0
, xmm2 ;
# 3-rsq*lu*lu
1880 mulps xmm0
, xmm3 ;
# lu*(3-rsq*lu*lu)
1881 mulps xmm0
, [rsp
+ nb133nf_half
]
1882 movaps
[rsp
+ nb133nf_rinvM
], xmm0
1884 ;
# Do the O LJ-only interaction directly.
1889 movaps xmm4
, [rsp
+ nb133nf_three
]
1890 mulps xmm2
, xmm7 ;
# rsq*lu*lu
1891 subps xmm4
, xmm2 ;
# 3-rsq*lu*lu
1892 mulps xmm4
, xmm3 ;
# lu*(3-rsq*lu*lu)
1893 mulps xmm4
, [rsp
+ nb133nf_half
]
1898 mulps xmm7
, [rsp
+ nb133nf_tsc
] ;
# rtab
1902 cvttps2pi mm7
, xmm5 ;
# mm6/mm7 contain lu indices
1907 movaps xmm1
, xmm7 ;
# xmm1=eps
1909 mulps xmm2
, xmm2 ;
# xmm2=eps2
1913 mov rsi
, [rbp
+ nb133nf_VFtab
]
1922 movlps xmm5
, [rsi
+ rax
*4]
1923 movlps xmm7
, [rsi
+ rcx
*4]
1924 movhps xmm5
, [rsi
+ rbx
*4]
1925 movhps xmm7
, [rsi
+ rdx
*4] ;
# got half dispersion table
1927 shufps xmm4
, xmm7
, 136 ;
# constant 10001000
1928 shufps xmm5
, xmm7
, 221 ;
# constant 11011101
1930 movlps xmm7
, [rsi
+ rax
*4 + 8]
1931 movlps xmm3
, [rsi
+ rcx
*4 + 8]
1932 movhps xmm7
, [rsi
+ rbx
*4 + 8]
1933 movhps xmm3
, [rsi
+ rdx
*4 + 8] ;
# other half of dispersion table
1935 shufps xmm6
, xmm3
, 136 ;
# constant 10001000
1936 shufps xmm7
, xmm3
, 221 ;
# constant 11011101
1937 ;
# dispersion table ready, in xmm4-xmm7
1939 mulps xmm6
, xmm1 ;
# xmm6=Geps
1940 mulps xmm7
, xmm2 ;
# xmm7=Heps2
1942 addps xmm5
, xmm7 ;
# xmm5=Fp
1943 mulps xmm5
, xmm1 ;
# xmm5=eps*Fp
1944 addps xmm5
, xmm4 ;
# xmm5=VV
1946 movaps xmm4
, [rsp
+ nb133nf_c6
]
1947 mulps xmm5
, xmm4 ;
# Vvdw6
1949 addps xmm5
, [rsp
+ nb133nf_Vvdwtot
]
1950 movaps
[rsp
+ nb133nf_Vvdwtot
], xmm5
1953 movlps xmm5
, [rsi
+ rax
*4 + 16]
1954 movlps xmm7
, [rsi
+ rcx
*4 + 16]
1955 movhps xmm5
, [rsi
+ rbx
*4 + 16]
1956 movhps xmm7
, [rsi
+ rdx
*4 + 16] ;
# got half repulsion table
1958 shufps xmm4
, xmm7
, 136 ;
# constant 10001000
1959 shufps xmm5
, xmm7
, 221 ;
# constant 11011101
1961 movlps xmm7
, [rsi
+ rax
*4 + 24]
1962 movlps xmm3
, [rsi
+ rcx
*4 + 24]
1963 movhps xmm7
, [rsi
+ rbx
*4 + 24]
1964 movhps xmm3
, [rsi
+ rdx
*4 + 24] ;
# other half of repulsion table
1966 shufps xmm6
, xmm3
, 136 ;
# constant 10001000
1967 shufps xmm7
, xmm3
, 221 ;
# constant 11011101
1968 ;
# table ready, in xmm4-xmm7
1969 mulps xmm6
, xmm1 ;
# xmm6=Geps
1970 mulps xmm7
, xmm2 ;
# xmm7=Heps2
1972 addps xmm5
, xmm7 ;
# xmm5=Fp
1973 mulps xmm5
, xmm1 ;
# xmm5=eps*Fp
1974 addps xmm5
, xmm4 ;
# xmm5=VV
1976 movaps xmm4
, [rsp
+ nb133nf_c12
]
1977 mulps xmm5
, xmm4 ;
# Vvdw12
1979 addps xmm5
, [rsp
+ nb133nf_Vvdwtot
]
1980 movaps
[rsp
+ nb133nf_Vvdwtot
], xmm5
1982 ;
# Do H1-H2-M interactions
1983 movaps xmm7
, [rsp
+ nb133nf_rinvH1
]
1984 addps xmm7
, [rsp
+ nb133nf_rinvH2
]
1985 movaps xmm6
, [rsp
+ nb133nf_rinvM
]
1987 mulps xmm7
, [rsp
+ nb133nf_qqH
]
1988 mulps xmm6
, [rsp
+ nb133nf_qqM
]
1991 addps xmm7
, [rsp
+ nb133nf_vctot
]
1992 movaps
[rsp
+ nb133nf_vctot
], xmm7
1994 ;
# should we do one more iteration?
1995 sub dword ptr
[rsp
+ nb133nf_innerk
], 4
1996 jl
.nb133nf_odd_inner
1997 jmp
.nb133nf_unroll_loop
1999 add dword ptr
[rsp
+ nb133nf_innerk
], 4
2000 jnz
.nb133nf_odd_loop
2001 jmp
.nb133nf_updateouterdata
2003 mov rdx
, [rsp
+ nb133nf_innerjjnr
] ;
# pointer to jjnr[k]
2005 add qword ptr
[rsp
+ nb133nf_innerjjnr
], 4
2007 xorps xmm4
, xmm4 ;
# clear reg.
2008 movss xmm4
, [rsp
+ nb133nf_iqM
]
2009 mov rsi
, [rbp
+ nb133nf_charge
]
2010 movhps xmm4
, [rsp
+ nb133nf_iqH
] ;
# [qM 0 qH qH]
2011 shufps xmm4
, xmm4
, 41 ;
# [0 qH qH qM]
2013 movss xmm3
, [rsi
+ rax
*4] ;
# charge in xmm3
2014 shufps xmm3
, xmm3
, 0
2016 movaps
[rsp
+ nb133nf_qqM
], xmm3 ;
# use dummy qq for storage
2019 mov rsi
, [rbp
+ nb133nf_type
]
2020 mov ebx
, [rsi
+ rax
*4]
2021 mov rsi
, [rbp
+ nb133nf_vdwparam
]
2023 add ebx
, [rsp
+ nb133nf_ntia
]
2024 movlps xmm6
, [rsi
+ rbx
*4]
2026 shufps xmm6
, xmm6
, 252 ;
# constant 11111100
2027 shufps xmm7
, xmm7
, 253 ;
# constant 11111101
2028 movaps
[rsp
+ nb133nf_c6
], xmm6
2029 movaps
[rsp
+ nb133nf_c12
], xmm7
2031 mov rsi
, [rbp
+ nb133nf_pos
]
2032 lea rax
, [rax
+ rax
*2]
2034 movss xmm3
, [rsp
+ nb133nf_ixO
]
2035 movss xmm4
, [rsp
+ nb133nf_iyO
]
2036 movss xmm5
, [rsp
+ nb133nf_izO
]
2037 movss xmm0
, [rsp
+ nb133nf_ixH1
]
2038 movss xmm1
, [rsp
+ nb133nf_iyH1
]
2039 movss xmm2
, [rsp
+ nb133nf_izH1
]
2040 unpcklps xmm3
, [rsp
+ nb133nf_ixH2
] ;
# ixO ixH2 - -
2041 unpcklps xmm4
, [rsp
+ nb133nf_iyH2
] ;
# iyO iyH2 - -
2042 unpcklps xmm5
, [rsp
+ nb133nf_izH2
] ;
# izO izH2 - -
2043 unpcklps xmm0
, [rsp
+ nb133nf_ixM
] ;
# ixH1 ixM - -
2044 unpcklps xmm1
, [rsp
+ nb133nf_iyM
] ;
# iyH1 iyM - -
2045 unpcklps xmm2
, [rsp
+ nb133nf_izM
] ;
# izH1 izM - -
2046 unpcklps xmm3
, xmm0 ;
# ixO ixH1 ixH2 ixM
2047 unpcklps xmm4
, xmm1 ;
# same for y
2048 unpcklps xmm5
, xmm2 ;
# same for z
2050 ;
# move j coords to xmm0-xmm2
2051 movss xmm0
, [rsi
+ rax
*4]
2052 movss xmm1
, [rsi
+ rax
*4 + 4]
2053 movss xmm2
, [rsi
+ rax
*4 + 8]
2054 shufps xmm0
, xmm0
, 0
2055 shufps xmm1
, xmm1
, 0
2056 shufps xmm2
, xmm2
, 0
2071 ;
# lookup seed in xmm5
2074 movaps xmm1
, [rsp
+ nb133nf_three
]
2075 mulps xmm5
, xmm4 ;
# rsq*lu*lu
2076 movaps xmm0
, [rsp
+ nb133nf_half
]
2077 subps xmm1
, xmm5 ;
# 3-rsq*lu*lu
2079 mulps xmm0
, xmm1 ;
# xmm0=rinv, xmm4=rsq
2081 ;
# LJ table interaction
2083 mulps xmm4
, [rsp
+ nb133nf_tsc
] ;
# rtab
2088 movss xmm1
, xmm4 ;
# xmm1=eps
2090 mulss xmm2
, xmm2 ;
# xmm2=eps2
2093 mov rsi
, [rbp
+ nb133nf_VFtab
]
2097 movlps xmm5
, [rsi
+ rax
*4]
2099 shufps xmm4
, xmm7
, 136 ;
# constant 10001000
2100 shufps xmm5
, xmm7
, 221 ;
# constant 11011101
2102 movlps xmm7
, [rsi
+ rax
*4 + 8]
2104 shufps xmm6
, xmm3
, 136 ;
# constant 10001000
2105 shufps xmm7
, xmm3
, 221 ;
# constant 11011101
2106 ;
# dispersion table ready, in xmm4-xmm7
2108 mulss xmm6
, xmm1 ;
# xmm6=Geps
2109 mulss xmm7
, xmm2 ;
# xmm7=Heps2
2111 addss xmm5
, xmm7 ;
# xmm5=Fp
2112 mulss xmm5
, xmm1 ;
# xmm5=eps*Fp
2113 addss xmm5
, xmm4 ;
# xmm5=VV
2115 movss xmm4
, [rsp
+ nb133nf_c6
]
2116 mulss xmm5
, xmm4 ;
# Vvdw6
2118 ;
# Update Vvdwtot directly
2119 addss xmm5
, [rsp
+ nb133nf_Vvdwtot
]
2120 movss
[rsp
+ nb133nf_Vvdwtot
], xmm5
2123 movlps xmm5
, [rsi
+ rax
*4 + 16]
2125 shufps xmm4
, xmm7
, 136 ;
# constant 10001000
2126 shufps xmm5
, xmm7
, 221 ;
# constant 11011101
2128 movlps xmm7
, [rsi
+ rax
*4 + 24]
2130 shufps xmm6
, xmm3
, 136 ;
# constant 10001000
2131 shufps xmm7
, xmm3
, 221 ;
# constant 11011101
2132 ;
# table ready, in xmm4-xmm7
2133 mulss xmm6
, xmm1 ;
# xmm6=Geps
2134 mulss xmm7
, xmm2 ;
# xmm7=Heps2
2136 addss xmm5
, xmm7 ;
# xmm5=Fp
2137 mulss xmm5
, xmm1 ;
# xmm5=eps*Fp
2138 addss xmm5
, xmm4 ;
# xmm5=VV
2140 movss xmm4
, [rsp
+ nb133nf_c12
]
2141 mulss xmm5
, xmm4 ;
# Vvdw12
2143 addss xmm5
, [rsp
+ nb133nf_Vvdwtot
]
2144 movss
[rsp
+ nb133nf_Vvdwtot
], xmm5
2146 mulps xmm0
, [rsp
+ nb133nf_qqM
] ;
# xmm0=vcoul
2148 addps xmm0
, [rsp
+ nb133nf_vctot
]
2149 movaps
[rsp
+ nb133nf_vctot
], xmm0
2151 dec dword ptr
[rsp
+ nb133nf_innerk
]
2152 jz
.nb133nf_updateouterdata
2153 jmp
.nb133nf_odd_loop
2154 .nb133nf_updateouterdata:
2156 mov esi
, [rsp
+ nb133nf_n
]
2157 ;
# get group index for i particle
2158 mov rdx
, [rbp
+ nb133nf_gid
] ;
# base of gid[]
2159 mov edx
, [rdx
+ rsi
*4] ;
# ggid=gid[n]
2161 ;
# accumulate total potential energy and update it
2162 movaps xmm7
, [rsp
+ nb133nf_vctot
]
2165 addps xmm7
, xmm6 ;
# pos 0-1 in xmm7 have the sum now
2167 shufps xmm6
, xmm6
, 1
2170 ;
# add earlier value from mem
2171 mov rax
, [rbp
+ nb133nf_Vc
]
2172 addss xmm7
, [rax
+ rdx
*4]
2174 movss
[rax
+ rdx
*4], xmm7
2176 ;
# accumulate total lj energy and update it
2177 movaps xmm7
, [rsp
+ nb133nf_Vvdwtot
]
2180 addps xmm7
, xmm6 ;
# pos 0-1 in xmm7 have the sum now
2182 shufps xmm6
, xmm6
, 1
2185 ;
# add earlier value from mem
2186 mov rax
, [rbp
+ nb133nf_Vvdw
]
2187 addss xmm7
, [rax
+ rdx
*4]
2189 movss
[rax
+ rdx
*4], xmm7
2192 mov ecx
, [rsp
+ nb133nf_nn1
]
2193 ;
# esi already loaded with n
2196 jz
.nb133nf_outerend
2198 ;
# not last, iterate outer loop once more!
2199 mov
[rsp
+ nb133nf_n
], esi
2202 ;
# check if more outer neighborlists remain
2203 mov ecx
, [rsp
+ nb133nf_nri
]
2204 ;
# esi already loaded with n above
2207 ;
# non-zero, do one more workunit
2208 jmp
.nb133nf_threadloop
2211 mov eax
, [rsp
+ nb133nf_nouter
]
2212 mov ebx
, [rsp
+ nb133nf_ninner
]
2213 mov rcx
, [rbp
+ nb133nf_outeriter
]
2214 mov rdx
, [rbp
+ nb133nf_inneriter
]
2221 ;
# Restore xmm registers from stack
2223 movaps xmm7
, [rsp
+ 16 ]
2224 movaps xmm8
, [rsp
+ 32 ]
2225 movaps xmm9
, [rsp
+ 48 ]
2226 movaps xmm10
, [rsp
+ 64 ]
2227 movaps xmm11
, [rsp
+ 80 ]
2228 movaps xmm12
, [rsp
+ 96 ]
2229 movaps xmm13
, [rsp
+ 112]
2230 movaps xmm14
, [rsp
+ 128]
2231 movaps xmm15
, [rsp
+ 144]
2233 ;
# Reset pointers after restoring xmm6-15