;# Gromacs 4.0                  Copyright (c) 1991-2003
;# David van der Spoel, Erik Lindahl
;#
;# This program is free software; you can redistribute it and/or
;# modify it under the terms of the GNU General Public License
;# as published by the Free Software Foundation; either version 2
;# of the License, or (at your option) any later version.
;#
;# To help us fund GROMACS development, we humbly ask that you cite
;# the research papers on the package. Check out http://www.gromacs.org
;#
;# Gnomes, ROck Monsters And Chili Sauce
;#
;# These files require GNU binutils 2.10 or later, since we
;# use intel syntax for portability, or a recent version
;# of NASM that understands Extended 3DNow and SSE2 instructions.
;# (NASM is normally only used with MS Visual C++).
;# Since NASM and gnu as disagree on some definitions and use
;# completely different preprocessing options I have to introduce a
;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
;# reason why all comments need both symbols...
;# The source is written for GNU as, with intel syntax. When you use
;# NASM we redefine a couple of things. The false if-statement around
;# the following code is seen by GNU as, but NASM doesn't see it, so
;# the code inside is read by NASM but not gcc.
.if 0    # block below only read by NASM
%define .section section
;# NASM only wants 'dword', not 'dword ptr'.
;# NOTE(review): several %define compatibility lines from the original
;# NASM block are missing in this copy (orig. lines 33-35 and 37-40) -
;# verify against the upstream source before assembling with NASM.
.endif   # End of NASM-specific block

.intel_syntax noprefix   # Line only read by gnu as
;# nb_kernel400: Generalized-Born Coulomb kernel, double precision / SSE2.
;# The .equiv values below are byte offsets of the C argument block
;# relative to rbp (return address + saved rbp occupy the first 16 bytes).
.globl nb_kernel400_x86_64_sse2
.globl _nb_kernel400_x86_64_sse2
nb_kernel400_x86_64_sse2:
_nb_kernel400_x86_64_sse2:
;# Room for return address and rbp (16 bytes)
.equiv          nb400_fshift,           16
.equiv          nb400_gid,              24      ;# restored: offset pinned by the identical nb400nf argument table below
.equiv          nb400_pos,              32      ;# restored: referenced by the kernel body; pinned by the nb400nf table
.equiv          nb400_faction,          40
.equiv          nb400_charge,           48
.equiv          nb400_p_facel,          56
.equiv          nb400_argkrf,           64
.equiv          nb400_argcrf,           72
.equiv          nb400_Vc,               80      ;# restored: referenced by the kernel body; pinned by the nb400nf table
.equiv          nb400_type,             88      ;# restored: pinned by the nb400nf table
.equiv          nb400_p_ntype,          96
.equiv          nb400_vdwparam,         104
.equiv          nb400_Vvdw,             112
.equiv          nb400_p_tabscale,       120
.equiv          nb400_VFtab,            128
.equiv          nb400_invsqrta,         136
.equiv          nb400_dvda,             144
.equiv          nb400_p_gbtabscale,     152
.equiv          nb400_GBtab,            160
.equiv          nb400_p_nthreads,      168
.equiv          nb400_count,            176
.equiv          nb400_mtx,              184     ;# restored: pinned by the nb400nf table
.equiv          nb400_outeriter,        192
.equiv          nb400_inneriter,        200
.equiv          nb400_work,             208
;# stack offsets for local variables
;# bottom of stack is cache-aligned for sse2 use
;# NOTE(review): several .equiv lines are missing in this copy (orig.
;# lines 79-86, 88-89, 91-93, 110, 112).  Entries marked "restored" are
;# referenced by the surviving kernel body; their slots are chosen from
;# offsets left free by the surviving entries (each 16-byte SSE slot and
;# 4-byte int slot is otherwise unused, so any non-colliding assignment
;# is functionally equivalent) - verify against the upstream source.
.equiv          nb400_ix,               0       ;# restored (16-byte slot)
.equiv          nb400_iy,               16      ;# restored (16-byte slot)
.equiv          nb400_iz,               32      ;# restored (16-byte slot)
.equiv          nb400_iq,               48      ;# restored (16-byte slot)
.equiv          nb400_two,              64      ;# restored (16-byte slot)
.equiv          nb400_gbtsc,            128
.equiv          nb400_qq,               144     ;# restored (slot between gbtsc and vctot)
.equiv          nb400_r,                160     ;# restored (slot between gbtsc and vctot)
.equiv          nb400_vctot,            176
.equiv          nb400_half,             240
.equiv          nb400_three,            256
.equiv          nb400_isai,             272
.equiv          nb400_isaprod,          288
.equiv          nb400_dvdasum,          304
.equiv          nb400_gbscale,          320
.equiv          nb400_nri,              336
.equiv          nb400_iinr,             344
.equiv          nb400_jindex,           352
.equiv          nb400_jjnr,             360
.equiv          nb400_shift,            368
.equiv          nb400_shiftvec,         376
.equiv          nb400_facel,            384
.equiv          nb400_innerjjnr,        392
.equiv          nb400_is3,              400
.equiv          nb400_ii3,              404
.equiv          nb400_ii,               408     ;# restored (free 4-byte slot)
.equiv          nb400_innerk,           412
.equiv          nb400_n,                416     ;# restored (free 4-byte slot)
.equiv          nb400_nn1,              420
.equiv          nb400_nouter,           424
.equiv          nb400_ninner,           428
;# --- prologue: reserve locals and spill the C arguments into the frame ---
;# NOTE(review): the register-saving prologue (orig. lines 116-127) is
;# missing in this copy; only the stack reservation onwards survives.
        sub rsp, 440            ;# local variable stack space (n*16+8)

        ;# zero 32-bit iteration counters
        ;# (the instruction zeroing eax, orig. line 131, is missing here)
        mov [rsp + nb400_nouter], eax
        mov [rsp + nb400_ninner], eax

        ;# spill integer/pointer arguments into the local frame
        mov [rsp + nb400_nri], edi
        mov [rsp + nb400_iinr], rsi
        mov [rsp + nb400_jindex], rdx
        mov [rsp + nb400_jjnr], rcx
        mov [rsp + nb400_shift], r8
        mov [rsp + nb400_shiftvec], r9
        mov rsi, [rbp + nb400_p_facel]
        ;# (load of *p_facel into xmm0, orig. line 143, is missing here)
        movsd [rsp + nb400_facel], xmm0

        mov rbx, [rbp + nb400_p_gbtabscale]
        ;# (load/broadcast of the GB table scale into xmm4, orig. lines
        ;# 147-148, is missing here)
        movapd [rsp + nb400_gbtsc], xmm4
;# create constant floating-point factors on stack
        mov eax, 0x00000000     ;# lower half of double half IEEE (hex)
        ;# (load of the upper half into ebx, orig. line 153, is missing here)
        mov [rsp + nb400_half], eax
        mov [rsp + nb400_half+4], ebx
        movsd xmm1, [rsp + nb400_half]
        shufpd xmm1, xmm1, 0    ;# splat to all elements
        ;# (copy of xmm1 into xmm3, orig. line 158, is missing here)
        addpd xmm3, xmm3        ;# one
        ;# (copy of xmm3 into xmm2, orig. line 160, is missing here)
        addpd xmm2, xmm2        ;# two
        addpd xmm3, xmm2        ;# three
        movapd [rsp + nb400_half], xmm1
        movapd [rsp + nb400_two], xmm2
        movapd [rsp + nb400_three], xmm3
;# --- thread work-unit claim: atomically grab [nn0,nn1) from *count ---
;# NOTE(review): the ".nb400_threadloop:"/".nb400_spinlock:" labels and
;# several spinlock lines (orig. 166-167, 169-170, 173, 178-179, 182-183,
;# 191-194) are missing in this copy.
        mov rsi, [rbp + nb400_count]    ;# pointer to sync counter
        mov ebx, eax                    ;# ebx=*count=nn0
        add ebx, 1                      ;# ebx=nn1=nn0+10
        cmpxchg [rsi], ebx              ;# write nn1 to *counter,
                                        ;# if it hasnt changed.
                                        ;# or reread *counter to eax.
                                        ;# (fixed: was "[esi]" - a 32-bit
                                        ;# address would truncate the
                                        ;# counter pointer held in rsi)
        pause                           ;# -> better p4 performance

        ;# if(nn1>nri) nn1=nri
        mov ecx, [rsp + nb400_nri]
        cmovle ebx, edx                 ;# if(nn1>nri) nn1=nri
        ;# Cleared the spinlock if we got here.
        ;# eax contains nn0, ebx contains nn1.
        mov [rsp + nb400_n], eax
        mov [rsp + nb400_nn1], ebx
        sub ebx, eax                    ;# calc number of outer lists
        mov esi, eax                    ;# copy n to esi

        ;# ebx contains number of outer iterations
        add ebx, [rsp + nb400_nouter]
        mov [rsp + nb400_nouter], ebx
;# --- outer loop body: load shift vector and i-atom data for list n (esi) ---
;# NOTE(review): the ".nb400_outer:" label and several lines (orig. 198-199,
;# 202, 205, 207, 211, 215, 219-220, 223-224, 227, 231, 234-238, 242) are
;# missing in this copy; the surviving instructions are kept in order.
        mov rax, [rsp + nb400_shift]    ;# rax = pointer into shift[]
        mov ebx, [rax + rsi*4]          ;# rbx=shift[n]

        lea rbx, [rbx + rbx*2]          ;# rbx=3*is
        mov [rsp + nb400_is3], ebx      ;# store is3

        mov rax, [rsp + nb400_shiftvec] ;# rax = base of shiftvec[]

        movsd xmm0, [rax + rbx*8]
        movsd xmm1, [rax + rbx*8 + 8]
        movsd xmm2, [rax + rbx*8 + 16]

        mov rcx, [rsp + nb400_iinr]     ;# rcx = pointer into iinr[]
        mov ebx, [rcx + rsi*4]          ;# ebx =ii
        mov [rsp + nb400_ii], ebx

        mov rdx, [rbp + nb400_charge]
        movsd xmm3, [rdx + rbx*8]
        mulsd xmm3, [rsp + nb400_facel] ;# iq = facel*charge[ii]

        mov rdx, [rbp + nb400_invsqrta] ;# load invsqrta[ii]
        movsd xmm4, [rdx + rbx*8]

        lea rbx, [rbx + rbx*2]          ;# rbx = 3*ii=ii3
        mov rax, [rbp + nb400_pos]      ;# rax = base of pos[]

        ;# i coordinates = shift vector + pos[ii3]
        addsd xmm0, [rax + rbx*8]
        addsd xmm1, [rax + rbx*8 + 8]
        addsd xmm2, [rax + rbx*8 + 16]

        movapd [rsp + nb400_iq], xmm3
        movapd [rsp + nb400_isai], xmm4

        ;# (shufpd broadcasts of xmm0-xmm2 to both lanes, orig. 234-238,
        ;# are missing here)
        movapd [rsp + nb400_ix], xmm0
        movapd [rsp + nb400_iy], xmm1
        movapd [rsp + nb400_iz], xmm2

        mov [rsp + nb400_ii3], ebx
;# --- set up the inner (j) loop for outer list n ---
;# NOTE(review): the xmm clears for vctot/dvdasum/i-forces (orig. 246-252)
;# and the innerk comparison feeding jge (orig. 268) are missing here.
        ;# clear vctot and i forces
        mov rax, [rsp + nb400_jindex]
        mov ecx, [rax + rsi*4]          ;# jindex[n]
        mov edx, [rax + rsi*4 + 4]      ;# jindex[n+1]
        sub edx, ecx                    ;# number of innerloop atoms

        mov rsi, [rbp + nb400_pos]
        mov rdi, [rbp + nb400_faction]
        mov rax, [rsp + nb400_jjnr]

        mov [rsp + nb400_innerjjnr], rax  ;# pointer to jjnr[nj0]

        add ecx, [rsp + nb400_ninner]
        mov [rsp + nb400_ninner], ecx

        mov [rsp + nb400_innerk], edx   ;# number of innerloop atoms
        jge .nb400_unroll_loop
        jmp .nb400_checksingle
;# --- twice-unrolled inner loop: j atoms j1=r12, j2=r13 (j3 in r8/r9) ---
;# NOTE(review): the ".nb400_unroll_loop:" label and a number of lines are
;# missing in this copy (orig. 275-276 j-index loads, 296-310 rsq/rsqrt
;# seed, 332/344 Newton-Raphson multiplies, 347/350 qq assembly, 358-366
;# table-index conversion, 371/374/377 register copies, 385/389 Fp/FF adds,
;# 397-414 dvda accumulation, 430-449 force-scaling updates).  The
;# surviving instructions are kept in their original order below.
        ;# twice unrolled innerloop here
        mov rdx, [rsp + nb400_innerjjnr]  ;# pointer to jjnr[k]
        ;# (loads of the two j indices into r12/r13, orig. 275-276, missing)
        add qword ptr [rsp + nb400_innerjjnr], 8  ;# advance pointer (unrolled 2)

        mov rsi, [rbp + nb400_pos]      ;# base of pos[]

        lea r8, [r12 + r12*2]           ;# j3
        lea r9, [r13 + r13*2]

        ;# move two coordinates to xmm4-xmm6
        movlpd xmm4, [rsi + r8*8]
        movlpd xmm5, [rsi + r8*8 + 8]
        movlpd xmm6, [rsi + r8*8 + 16]
        movhpd xmm4, [rsi + r9*8]
        movhpd xmm5, [rsi + r9*8 + 8]
        movhpd xmm6, [rsi + r9*8 + 16]

        ;# dr = rj - ri
        subpd xmm4, [rsp + nb400_ix]
        subpd xmm5, [rsp + nb400_iy]
        subpd xmm6, [rsp + nb400_iz]
        ;# (rsq accumulation and rsqrtps seed, orig. 296-310, missing)

        mov rsi, [rbp + nb400_invsqrta]
        movlpd xmm3, [rsi + r12*8]

        cvtps2pd xmm2, xmm5             ;# lu in low xmm2

        movhpd xmm3, [rsi + r13*8]
        mulpd xmm3, [rsp + nb400_isai]  ;# isaprod = isai*isaj
        movapd [rsp + nb400_isaprod], xmm3

        mulpd xmm3, [rsp + nb400_gbtsc] ;# gbscale = isaprod*gbtabscale
        movapd [rsp + nb400_gbscale], xmm3

        ;# lookup seed in xmm2  (Newton-Raphson iteration 1 for 1/sqrt(rsq))
        movapd xmm5, xmm2               ;# copy of lu
        mulpd xmm2, xmm2                ;# lu*lu
        movapd xmm1, [rsp + nb400_three]
        mulpd xmm2, xmm4                ;# rsq*lu*lu
        movapd xmm0, [rsp + nb400_half]
        subpd xmm1, xmm2                ;# 30-rsq*lu*lu
        ;# (multiply by old seed, orig. 332, missing)
        mulpd xmm1, xmm0                ;# xmm0=iter1 of rinv (new lu)

        mov rsi, [rbp + nb400_charge]   ;# base of charge[]
        movlpd xmm3, [rsi + r12*8]

        ;# Newton-Raphson iteration 2
        movapd xmm5, xmm1               ;# copy of lu
        mulpd xmm1, xmm1                ;# lu*lu
        movapd xmm2, [rsp + nb400_three]
        mulpd xmm1, xmm4                ;# rsq*lu*lu
        movapd xmm0, [rsp + nb400_half]
        subpd xmm2, xmm1                ;# 30-rsq*lu*lu
        ;# (multiply by old seed, orig. 344, missing)
        mulpd xmm0, xmm2                ;# xmm0=iter2 of rinv (new lu)
        mulpd xmm4, xmm0                ;# xmm4=r

        mulpd xmm6, [rsp + nb400_iq]
        movhpd xmm3, [rsi + r13*8]
        ;# (final qq multiply, orig. 350, missing)
        movapd [rsp + nb400_qq], xmm3

        movapd [rsp + nb400_r], xmm4
        mulpd xmm4, [rsp + nb400_gbscale]  ;# table coordinate = r*gbscale

        cvttpd2pi mm6, xmm4             ;# mm6 = lu idx
        ;# (fractional-part computation, orig. 358-359, missing)
        movapd xmm1, xmm4               ;# xmm1=eps
        pslld mm6, 2                    ;# idx *= 4

        mov rsi, [rbp + nb400_GBtab]
        ;# (extraction of the first index into r10d, orig. 365-366, missing)
        movd r11d, mm6                  ;# indices in r10/r11

        ;# cubic-spline table lookup: Y,F,G,H for both j atoms
        movapd xmm4, [rsi + r10*8]      ;# Y1 F1
        movapd xmm3, [rsi + r11*8]      ;# Y2 F2
        unpcklpd xmm4, xmm3             ;# Y1 Y2
        unpckhpd xmm5, xmm3             ;# F1 F2
        movapd xmm6, [rsi + r10*8 + 16] ;# G1 H1
        movapd xmm3, [rsi + r11*8 + 16] ;# G2 H2
        unpcklpd xmm6, xmm3             ;# G1 G2
        unpckhpd xmm7, xmm3             ;# H1 H2
        ;# coulomb table ready, in xmm4-xmm7

        mulpd xmm7, xmm1                ;# xmm7=Heps
        mulpd xmm6, xmm1                ;# xmm6=Geps
        mulpd xmm7, xmm1                ;# xmm7=Heps2
        addpd xmm5, xmm7                ;# xmm5=Fp
        addpd xmm7, xmm7                ;# two*Heps2
        movapd xmm3, [rsp + nb400_qq]
        addpd xmm7, xmm5                ;# xmm7=FF
        mulpd xmm5, xmm1                ;# xmm5=eps*Fp
        addpd xmm5, xmm4                ;# xmm5=VV
        mulpd xmm5, xmm3                ;# vcoul=qq*VV
        mulpd xmm3, xmm7                ;# fijC=FF*qq

        mov rsi, [rbp + nb400_dvda]

        mulpd xmm3, [rsp + nb400_gbscale]
        mulpd xmm6, [rsp + nb400_r]
        ;# xmm6=(vcoul+fijC*r)

        ;# update j atoms dvdaj
        addsd xmm6, [rsi + r12*8]
        addsd xmm7, [rsi + r13*8]
        movsd [rsi + r12*8], xmm6
        movsd [rsi + r13*8], xmm7

        ;# the fj's - start by accumulating forces from memory
        mov rdi, [rbp + nb400_faction]
        movlpd xmm5, [rdi + r8*8]
        movlpd xmm6, [rdi + r8*8 + 8]
        movlpd xmm7, [rdi + r8*8 + 16]
        movhpd xmm5, [rdi + r9*8]
        movhpd xmm6, [rdi + r9*8 + 8]
        movhpd xmm7, [rdi + r9*8 + 16]

        mov rdi, [rbp + nb400_faction]

        ;# (the fscal application to xmm5-xmm7, orig. 437-449, is missing)
        movlpd [rdi + r8*8], xmm5
        movlpd [rdi + r8*8 + 8], xmm6
        movlpd [rdi + r8*8 + 16], xmm7
        movhpd [rdi + r9*8], xmm5
        movhpd [rdi + r9*8 + 8], xmm6
        movhpd [rdi + r9*8 + 16], xmm7

        ;# should we do one more iteration?
        sub dword ptr [rsp + nb400_innerk], 2
        jl .nb400_checksingle
        jmp .nb400_unroll_loop
;# --- remainder path: handle a possible final single j atom (index rax) ---
;# NOTE(review): the ".nb400_checksingle:"/".nb400_dosingle:" labels, the
;# odd-count test/branch (orig. 461, 463-464, 466), the j-index load
;# (orig. 471-473), and numerous computation lines (orig. 478, 485,
;# 503-518 rsq/seed, 528/537 Newton-Raphson multiplies, 545-548 index
;# fraction, 554-558 table unpacks, 562/566 Fp/FF adds, 578-591 dvda
;# accumulation, 595-610 force scaling) are missing in this copy.
        mov edx, [rsp + nb400_innerk]
        jmp .nb400_updateouterdata

        mov rsi, [rbp + nb400_charge]
        mov rdx, [rbp + nb400_invsqrta]
        mov rdi, [rbp + nb400_pos]
        mov rcx, [rsp + nb400_innerjjnr]

        mov rsi, [rbp + nb400_invsqrta]
        movsd xmm2, [rsi + rax*8]
        mulsd xmm2, [rsp + nb400_isai]  ;# isaprod
        movapd [rsp + nb400_isaprod], xmm2
        mulsd xmm1, [rsp + nb400_gbtsc]
        movapd [rsp + nb400_gbscale], xmm1

        mulsd xmm2, [rsp + nb400_iq]
        mov rsi, [rbp + nb400_charge]   ;# base of charge[]
        movsd xmm3, [rsi + rax*8]
        movapd [rsp + nb400_qq], xmm3

        mov rsi, [rbp + nb400_pos]      ;# base of pos[]

        lea r8, [rax + rax*2]           ;# j3

        ;# move coordinate to xmm4-xmm6
        movsd xmm4, [rsi + r8*8]
        movsd xmm5, [rsi + r8*8 + 8]
        movsd xmm6, [rsi + r8*8 + 16]

        mov rdi, [rbp + nb400_faction]

        ;# dr = rj - ri
        subsd xmm4, [rsp + nb400_ix]
        subsd xmm5, [rsp + nb400_iy]
        subsd xmm6, [rsp + nb400_iz]

        cvtss2sd xmm2, xmm5             ;# lu in low xmm2

        ;# lookup seed in xmm2  (Newton-Raphson iteration 1)
        movapd xmm5, xmm2               ;# copy of lu
        mulsd xmm2, xmm2                ;# lu*lu
        movapd xmm1, [rsp + nb400_three]
        mulsd xmm2, xmm4                ;# rsq*lu*lu
        movapd xmm0, [rsp + nb400_half]
        subsd xmm1, xmm2                ;# 30-rsq*lu*lu
        mulsd xmm1, xmm0                ;# xmm0=iter1 of rinv (new lu)

        ;# Newton-Raphson iteration 2
        movapd xmm5, xmm1               ;# copy of lu
        mulsd xmm1, xmm1                ;# lu*lu
        movapd xmm2, [rsp + nb400_three]
        mulsd xmm1, xmm4                ;# rsq*lu*lu
        movapd xmm0, [rsp + nb400_half]
        subsd xmm2, xmm1                ;# 30-rsq*lu*lu
        mulsd xmm0, xmm2                ;# xmm0=iter2 of rinv (new lu)
        mulsd xmm4, xmm0                ;# xmm4=r

        movapd [rsp + nb400_r], xmm4
        mulsd xmm4, [rsp + nb400_gbscale]

        cvttsd2si r10d, xmm4            ;# mm6 = lu idx
        movapd xmm1, xmm4               ;# xmm1=eps
        shl r10d, 2                     ;# idx *= 4

        mov rsi, [rbp + nb400_GBtab]

        movapd xmm4, [rsi + r10*8]      ;# Y1 F1
        movapd xmm6, [rsi + r10*8 + 16] ;# G1 H1
        ;# coulomb table ready, in xmm4-xmm7

        mulsd xmm7, xmm1                ;# xmm7=Heps
        mulsd xmm6, xmm1                ;# xmm6=Geps
        mulsd xmm7, xmm1                ;# xmm7=Heps2
        addsd xmm5, xmm7                ;# xmm5=Fp
        addsd xmm7, xmm7                ;# two*Heps2
        movapd xmm3, [rsp + nb400_qq]
        addsd xmm7, xmm5                ;# xmm7=FF
        mulsd xmm5, xmm1                ;# xmm5=eps*Fp
        addsd xmm5, xmm4                ;# xmm5=VV
        mulsd xmm5, xmm3                ;# vcoul=qq*VV
        mulsd xmm3, xmm7                ;# fijC=FF*qq

        mov rsi, [rbp + nb400_dvda]

        mulsd xmm3, [rsp + nb400_gbscale]
        mulsd xmm6, [rsp + nb400_r]
        ;# xmm6=(vcoul+fijC*r)

        ;# update j atoms dvdaj
        addsd xmm6, [rsi + rax*8]
        movsd [rsi + rax*8], xmm6

        mov rdi, [rbp + nb400_faction]

        ;# the fj's - start by accumulating forces from memory
        mov rdi, [rbp + nb400_faction]
        addsd xmm9, [rdi + r8*8]
        addsd xmm10, [rdi + r8*8 + 8]
        addsd xmm11, [rdi + r8*8 + 16]
        movsd [rdi + r8*8], xmm9
        movsd [rdi + r8*8 + 8], xmm10
        movsd [rdi + r8*8 + 16], xmm11
.nb400_updateouterdata:
;# --- write back i forces, shift force, energy and dVda for this list ---
;# NOTE(review): several accumulation lines (orig. 627-631, 633-634,
;# 638-640, 649-651, 655-656, 663, 673, 684-687, 690-691, 695-696, 699)
;# and the whole function epilogue (orig. 704-724, incl. the counter
;# stores, stack restore and ret) are missing in this copy.
        mov ecx, [rsp + nb400_ii3]
        mov rdi, [rbp + nb400_faction]
        mov rsi, [rbp + nb400_fshift]
        mov edx, [rsp + nb400_is3]

        ;# accumulate i forces in xmm13, xmm14, xmm15
        addsd xmm15, xmm5               ;# sum is in low xmm13-xmm15

        ;# increment i force
        movsd xmm3, [rdi + rcx*8]
        movsd xmm4, [rdi + rcx*8 + 8]
        movsd xmm5, [rdi + rcx*8 + 16]
        ;# (subtraction of the accumulated sums, orig. 638-640, missing)
        movsd [rdi + rcx*8], xmm3
        movsd [rdi + rcx*8 + 8], xmm4
        movsd [rdi + rcx*8 + 16], xmm5

        ;# increment fshift force
        movsd xmm3, [rsi + rdx*8]
        movsd xmm4, [rsi + rdx*8 + 8]
        movsd xmm5, [rsi + rdx*8 + 16]
        ;# (subtraction of the accumulated sums, orig. 649-651, missing)
        movsd [rsi + rdx*8], xmm3
        movsd [rsi + rdx*8 + 8], xmm4
        movsd [rsi + rdx*8 + 16], xmm5

        mov esi, [rsp + nb400_n]
        ;# get group index for i particle
        mov rdx, [rbp + nb400_gid]      ;# base of gid[]
        mov edx, [rdx + rsi*4]          ;# ggid=gid[n]

        ;# accumulate total coulomb energy and update it
        addsd xmm12, xmm6               ;# low xmm12 have the sum now

        ;# add earlier value from mem
        mov rax, [rbp + nb400_Vc]
        addsd xmm12, [rax + rdx*8]
        movsd [rax + rdx*8], xmm12

        ;# accumulate dVda and update it
        addsd xmm8, xmm6                ;# low xmm8 has the sum now

        mov edx, [rsp + nb400_ii]
        mov rax, [rbp + nb400_dvda]
        addsd xmm8, [rax + rdx*8]
        movsd [rax + rdx*8], xmm8

        ;# finish if last
        mov ecx, [rsp + nb400_nn1]
        ;# esi already loaded with n

        ;# not last, iterate outer loop once more!
        mov [rsp + nb400_n], esi

        ;# check if more outer neighborlists remain
        mov ecx, [rsp + nb400_nri]
        ;# esi already loaded with n above

        ;# non-zero, do one more workunit
        jmp .nb400_threadloop

        mov eax, [rsp + nb400_nouter]
        mov ebx, [rsp + nb400_ninner]
        mov rcx, [rbp + nb400_outeriter]
        mov rdx, [rbp + nb400_inneriter]
726 .globl nb_kernel400nf_x86_64_sse2
727 .globl _nb_kernel400nf_x86_64_sse2
728 nb_kernel400nf_x86_64_sse2
:
729 _nb_kernel400nf_x86_64_sse2
:
730 .equiv nb400nf_fshift, 16
731 .equiv nb400nf_gid, 24
732 .equiv nb400nf_pos, 32
733 .equiv nb400nf_faction, 40
734 .equiv nb400nf_charge, 48
735 .equiv nb400nf_p_facel, 56
736 .equiv nb400nf_argkrf, 64
737 .equiv nb400nf_argcrf, 72
738 .equiv nb400nf_Vc, 80
739 .equiv nb400nf_type, 88
740 .equiv nb400nf_p_ntype, 96
741 .equiv nb400nf_vdwparam, 104
742 .equiv nb400nf_Vvdw, 112
743 .equiv nb400nf_p_tabscale, 120
744 .equiv nb400nf_VFtab, 128
745 .equiv nb400nf_invsqrta, 136
746 .equiv nb400nf_dvda, 144
747 .equiv nb400nf_p_gbtabscale, 152
748 .equiv nb400nf_GBtab, 160
749 .equiv nb400nf_p_nthreads, 168
750 .equiv nb400nf_count, 176
751 .equiv nb400nf_mtx, 184
752 .equiv nb400nf_outeriter, 192
753 .equiv nb400nf_inneriter, 200
754 .equiv nb400nf_work, 208
755 ;
# stack offsets for local variables
756 ;
# bottom of stack is cache-aligned for sse2 use
758 .equiv nb400nf_iy, 16
759 .equiv nb400nf_iz, 32
760 .equiv nb400nf_iq, 48
761 .equiv nb400nf_gbtsc, 64
762 .equiv nb400nf_qq, 80
763 .equiv nb400nf_vctot, 96
764 .equiv nb400nf_half, 112
765 .equiv nb400nf_three, 128
766 .equiv nb400nf_isai, 144
767 .equiv nb400nf_isaprod, 160
768 .equiv nb400nf_gbscale, 176
769 .equiv nb400nf_nri, 192
770 .equiv nb400nf_iinr, 200
771 .equiv nb400nf_jindex, 208
772 .equiv nb400nf_jjnr, 216
773 .equiv nb400nf_shift, 224
774 .equiv nb400nf_shiftvec, 232
775 .equiv nb400nf_facel, 240
776 .equiv nb400nf_innerjjnr, 248
777 .equiv nb400nf_is3, 256
778 .equiv nb400nf_ii3, 260
779 .equiv nb400nf_innerk, 264
780 .equiv nb400nf_n, 268
781 .equiv nb400nf_nn1, 272
782 .equiv nb400nf_nouter, 276
783 .equiv nb400nf_ninner, 280
796 sub rsp
, 296 ;
# local variable stack space (n*16+8)
798 ;
# zero 32-bit iteration counters
800 mov
[rsp
+ nb400nf_nouter
], eax
801 mov
[rsp
+ nb400nf_ninner
], eax
804 mov
[rsp
+ nb400nf_nri
], edi
805 mov
[rsp
+ nb400nf_iinr
], rsi
806 mov
[rsp
+ nb400nf_jindex
], rdx
807 mov
[rsp
+ nb400nf_jjnr
], rcx
808 mov
[rsp
+ nb400nf_shift
], r8
809 mov
[rsp
+ nb400nf_shiftvec
], r9
810 mov rsi
, [rbp
+ nb400nf_p_facel
]
812 movsd
[rsp
+ nb400nf_facel
], xmm0
814 mov rbx
, [rbp
+ nb400nf_p_gbtabscale
]
817 movapd
[rsp
+ nb400nf_gbtsc
], xmm4
819 ;
# create constant floating-point factors on stack
820 mov eax
, 0x00000000 ;
# lower half of double half IEEE (hex)
822 mov
[rsp
+ nb400nf_half
], eax
823 mov
[rsp
+ nb400nf_half+
4], ebx
824 movsd xmm1
, [rsp
+ nb400nf_half
]
825 shufpd xmm1
, xmm1
, 0 ;
# splat to all elements
827 addpd xmm3
, xmm3 ;
# one
829 addpd xmm2
, xmm2 ;
# two
830 addpd xmm3
, xmm2 ;
# three
831 movapd
[rsp
+ nb400nf_half
], xmm1
832 movapd
[rsp
+ nb400nf_three
], xmm3
835 mov rsi
, [rbp
+ nb400nf_count
] ;
# pointer to sync counter
838 mov ebx
, eax ;
# ebx=*count=nn0
839 add ebx
, 1 ;
# ebx=nn1=nn0+10
841 cmpxchg
[esi
], ebx ;
# write nn1 to *counter,
842 ;
# if it hasnt changed.
843 ;
# or reread *counter to eax.
844 pause ;
# -> better p4 performance
845 jnz
.nb400nf_spinlock
847 ;
# if(nn1>nri) nn1=nri
848 mov ecx
, [rsp
+ nb400nf_nri
]
851 cmovle ebx
, edx ;
# if(nn1>nri) nn1=nri
852 ;
# Cleared the spinlock if we got here.
853 ;
# eax contains nn0, ebx contains nn1.
854 mov
[rsp
+ nb400nf_n
], eax
855 mov
[rsp
+ nb400nf_nn1
], ebx
856 sub ebx
, eax ;
# calc number of outer lists
857 mov esi
, eax ;
# copy n to esi
858 jg
.nb400nf_outerstart
862 ;
# ebx contains number of outer iterations
863 add ebx
, [rsp
+ nb400nf_nouter
]
864 mov
[rsp
+ nb400nf_nouter
], ebx
867 mov rax
, [rsp
+ nb400nf_shift
] ;
# rax = pointer into shift[]
868 mov ebx
, [rax+rsi
*4] ;
# rbx=shift[n]
870 lea rbx
, [rbx
+ rbx
*2] ;
# rbx=3*is
871 mov
[rsp
+ nb400nf_is3
],ebx ;
# store is3
873 mov rax
, [rsp
+ nb400nf_shiftvec
] ;
# rax = base of shiftvec[]
875 movsd xmm0
, [rax
+ rbx
*8]
876 movsd xmm1
, [rax
+ rbx
*8 + 8]
877 movsd xmm2
, [rax
+ rbx
*8 + 16]
879 mov rcx
, [rsp
+ nb400nf_iinr
] ;
# rcx = pointer into iinr[]
880 mov ebx
, [rcx+rsi
*4] ;
# ebx =ii
882 mov rdx
, [rbp
+ nb400nf_charge
]
883 movsd xmm3
, [rdx
+ rbx
*8]
884 mulsd xmm3
, [rsp
+ nb400nf_facel
]
887 mov rdx
, [rbp
+ nb400nf_invsqrta
] ;
# load invsqrta[ii]
888 movsd xmm4
, [rdx
+ rbx
*8]
891 lea rbx
, [rbx
+ rbx
*2] ;
# rbx = 3*ii=ii3
892 mov rax
, [rbp
+ nb400nf_pos
] ;
# rax = base of pos[]
894 addsd xmm0
, [rax
+ rbx
*8]
895 addsd xmm1
, [rax
+ rbx
*8 + 8]
896 addsd xmm2
, [rax
+ rbx
*8 + 16]
898 movapd
[rsp
+ nb400nf_iq
], xmm3
899 movapd
[rsp
+ nb400nf_isai
], xmm4
905 movapd
[rsp
+ nb400nf_ix
], xmm0
906 movapd
[rsp
+ nb400nf_iy
], xmm1
907 movapd
[rsp
+ nb400nf_iz
], xmm2
909 mov
[rsp
+ nb400nf_ii3
], ebx
913 movapd
[rsp
+ nb400nf_vctot
], xmm4
915 mov rax
, [rsp
+ nb400nf_jindex
]
916 mov ecx
, [rax
+ rsi
*4] ;
# jindex[n]
917 mov edx
, [rax
+ rsi
*4 + 4] ;
# jindex[n+1]
918 sub edx
, ecx ;
# number of innerloop atoms
920 mov rsi
, [rbp
+ nb400nf_pos
]
921 mov rdi
, [rbp
+ nb400nf_faction
]
922 mov rax
, [rsp
+ nb400nf_jjnr
]
925 mov
[rsp
+ nb400nf_innerjjnr
], rax ;
# pointer to jjnr[nj0]
928 add ecx
, [rsp
+ nb400nf_ninner
]
929 mov
[rsp
+ nb400nf_ninner
], ecx
931 mov
[rsp
+ nb400nf_innerk
], edx ;
# number of innerloop atoms
932 jge
.nb400nf_unroll_loop
933 jmp
.nb400nf_checksingle
934 .nb400nf_unroll_loop:
935 ;
# twice unrolled innerloop here
936 mov rdx
, [rsp
+ nb400nf_innerjjnr
] ;
# pointer to jjnr[k]
939 add qword ptr
[rsp
+ nb400nf_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
942 mov rsi
, [rbp
+ nb400nf_invsqrta
]
943 movlpd xmm2
, [rsi
+ rax
*8]
944 movhpd xmm2
, [rsi
+ rbx
*8]
945 mulpd xmm2
, [rsp
+ nb400nf_isai
]
946 movapd
[rsp
+ nb400nf_isaprod
], xmm2
948 mulpd xmm1
, [rsp
+ nb400nf_gbtsc
]
949 movapd
[rsp
+ nb400nf_gbscale
], xmm1
951 mov rsi
, [rbp
+ nb400nf_charge
] ;
# base of charge[]
952 movlpd xmm3
, [rsi
+ rax
*8]
953 movhpd xmm3
, [rsi
+ rbx
*8]
955 mulpd xmm2
, [rsp
+ nb400nf_iq
]
957 movapd
[rsp
+ nb400nf_qq
], xmm3
959 mov rsi
, [rbp
+ nb400nf_pos
] ;
# base of pos[]
961 lea rax
, [rax
+ rax
*2] ;
# replace jnr with j3
962 lea rbx
, [rbx
+ rbx
*2]
964 ;
# move two coordinates to xmm0-xmm2
965 movlpd xmm0
, [rsi
+ rax
*8]
966 movlpd xmm1
, [rsi
+ rax
*8 + 8]
967 movlpd xmm2
, [rsi
+ rax
*8 + 16]
968 movhpd xmm0
, [rsi
+ rbx
*8]
969 movhpd xmm1
, [rsi
+ rbx
*8 + 8]
970 movhpd xmm2
, [rsi
+ rbx
*8 + 16]
972 mov rdi
, [rbp
+ nb400nf_faction
]
974 ;
# move nb400nf_ix-iz to xmm4-xmm6
975 movapd xmm4
, [rsp
+ nb400nf_ix
]
976 movapd xmm5
, [rsp
+ nb400nf_iy
]
977 movapd xmm6
, [rsp
+ nb400nf_iz
]
994 cvtps2pd xmm2
, xmm5 ;
# lu in low xmm2
996 ;
# lookup seed in xmm2
997 movapd xmm5
, xmm2 ;
# copy of lu
998 mulpd xmm2
, xmm2 ;
# lu*lu
999 movapd xmm1
, [rsp
+ nb400nf_three
]
1000 mulpd xmm2
, xmm4 ;
# rsq*lu*lu
1001 movapd xmm0
, [rsp
+ nb400nf_half
]
1002 subpd xmm1
, xmm2 ;
# 30-rsq*lu*lu
1004 mulpd xmm1
, xmm0 ;
# xmm0=iter1 of rinv (new lu)
1006 movapd xmm5
, xmm1 ;
# copy of lu
1007 mulpd xmm1
, xmm1 ;
# lu*lu
1008 movapd xmm2
, [rsp
+ nb400nf_three
]
1009 mulpd xmm1
, xmm4 ;
# rsq*lu*lu
1010 movapd xmm0
, [rsp
+ nb400nf_half
]
1011 subpd xmm2
, xmm1 ;
# 30-rsq*lu*lu
1013 mulpd xmm0
, xmm2 ;
# xmm0=iter2 of rinv (new lu)
1014 mulpd xmm4
, xmm0 ;
# xmm4=r
1015 mulpd xmm4
, [rsp
+ nb400nf_gbscale
]
1017 cvttpd2pi mm6
, xmm4 ;
# mm6 = lu idx
1020 movapd xmm1
, xmm4 ;
# xmm1=eps
1022 mulpd xmm2
, xmm2 ;
# xmm2=eps2
1024 pslld mm6
, 2 ;
# idx *= 4
1029 mov rsi
, [rbp
+ nb400nf_GBtab
]
1032 movd ebx
, mm6 ;
# indices in eax/ebx
1034 movapd xmm4
, [rsi
+ rax
*8] ;
# Y1 F1
1035 movapd xmm3
, [rsi
+ rbx
*8] ;
# Y2 F2
1037 unpcklpd xmm4
, xmm3 ;
# Y1 Y2
1038 unpckhpd xmm5
, xmm3 ;
# F1 F2
1040 movapd xmm6
, [rsi
+ rax
*8 + 16] ;
# G1 H1
1041 movapd xmm3
, [rsi
+ rbx
*8 + 16] ;
# G2 H2
1043 unpcklpd xmm6
, xmm3 ;
# G1 G2
1044 unpckhpd xmm7
, xmm3 ;
# H1 H2
1045 ;
# coulomb table ready, in xmm4-xmm7
1046 mulpd xmm6
, xmm1 ;
# xmm6=Geps
1047 mulpd xmm7
, xmm2 ;
# xmm7=Heps2
1049 addpd xmm5
, xmm7 ;
# xmm5=Fp
1050 movapd xmm3
, [rsp
+ nb400nf_qq
]
1051 mulpd xmm5
, xmm1 ;
# xmm5=eps*Fp
1052 addpd xmm5
, xmm4 ;
# xmm5=VV
1053 mulpd xmm5
, xmm3 ;
# vcoul=qq*VV
1054 addpd xmm5
, [rsp
+ nb400nf_vctot
]
1055 movapd
[rsp
+ nb400nf_vctot
], xmm5
1057 ;
# should we do one more iteration?
1058 sub dword ptr
[rsp
+ nb400nf_innerk
], 2
1059 jl
.nb400nf_checksingle
1060 jmp
.nb400nf_unroll_loop
1061 .nb400nf_checksingle:
1062 mov edx
, [rsp
+ nb400nf_innerk
]
1064 jnz
.nb400nf_dosingle
1065 jmp
.nb400nf_updateouterdata
1067 mov rsi
, [rbp
+ nb400nf_charge
]
1068 mov rdx
, [rbp
+ nb400nf_invsqrta
]
1069 mov rdi
, [rbp
+ nb400nf_pos
]
1070 mov rcx
, [rsp
+ nb400nf_innerjjnr
]
1074 movsd xmm7
, [rdx
+ rax
*8]
1075 movlpd xmm6
, [rsi
+ rax
*8] ;
# xmm6(0) has the charge
1076 mulsd xmm7
, [rsp
+ nb400nf_isai
]
1077 movapd
[rsp
+ nb400nf_isaprod
], xmm7
1079 mulpd xmm1
, [rsp
+ nb400nf_gbtsc
]
1080 movapd
[rsp
+ nb400nf_gbscale
], xmm1
1082 mulsd xmm7
, [rsp
+ nb400nf_iq
]
1084 movapd
[rsp
+ nb400nf_qq
], xmm6
1086 lea rax
, [rax
+ rax
*2]
1088 ;
# move coordinates to xmm0-xmm2
1089 movlpd xmm0
, [rdi
+ rax
*8]
1090 movlpd xmm1
, [rdi
+ rax
*8 + 8]
1091 movlpd xmm2
, [rdi
+ rax
*8 + 16]
1093 ;
# move nb400nf_ix-iz to xmm4-xmm6
1094 movapd xmm4
, [rsp
+ nb400nf_ix
]
1095 movapd xmm5
, [rsp
+ nb400nf_iy
]
1096 movapd xmm6
, [rsp
+ nb400nf_iz
]
1113 cvtss2sd xmm2
, xmm5 ;
# lu in low xmm2
1115 ;
# lookup seed in xmm2
1116 movapd xmm5
, xmm2 ;
# copy of lu
1117 mulsd xmm2
, xmm2 ;
# lu*lu
1118 movapd xmm1
, [rsp
+ nb400nf_three
]
1119 mulsd xmm2
, xmm4 ;
# rsq*lu*lu
1120 movapd xmm0
, [rsp
+ nb400nf_half
]
1121 subsd xmm1
, xmm2 ;
# 30-rsq*lu*lu
1123 mulsd xmm1
, xmm0 ;
# xmm0=iter1 of rinv (new lu)
1125 movapd xmm5
, xmm1 ;
# copy of lu
1126 mulsd xmm1
, xmm1 ;
# lu*lu
1127 movapd xmm2
, [rsp
+ nb400nf_three
]
1128 mulsd xmm1
, xmm4 ;
# rsq*lu*lu
1129 movapd xmm0
, [rsp
+ nb400nf_half
]
1130 subsd xmm2
, xmm1 ;
# 30-rsq*lu*lu
1132 mulsd xmm0
, xmm2 ;
# xmm0=iter2 of rinv (new lu)
1134 mulsd xmm4
, xmm0 ;
# xmm4=r
1135 mulsd xmm4
, [rsp
+ nb400nf_gbscale
]
1139 cvttsd2si eax
, xmm4 ;
# mm6 = lu idx
1142 movapd xmm1
, xmm4 ;
# xmm1=eps
1144 mulsd xmm2
, xmm2 ;
# xmm2=eps2
1146 shl eax
, 2 ;
# idx *= 4
1148 mov rsi
, [rbp
+ nb400nf_GBtab
]
1151 movapd xmm4
, [rsi
+ rax
*8] ;
# Y1 F1
1154 unpcklpd xmm4
, xmm3 ;
# Y1
1155 unpckhpd xmm5
, xmm3 ;
# F1
1157 movapd xmm6
, [rsi
+ rax
*8 + 16] ;
# G1 H1
1160 unpcklpd xmm6
, xmm3 ;
# G1
1161 unpckhpd xmm7
, xmm3 ;
# H1
1162 ;
# table ready in xmm4-xmm7
1164 mulsd xmm6
, xmm1 ;
# xmm6=Geps
1165 mulsd xmm7
, xmm2 ;
# xmm7=Heps2
1167 addsd xmm5
, xmm7 ;
# xmm5=Fp
1168 movapd xmm3
, [rsp
+ nb400nf_qq
]
1169 mulsd xmm5
, xmm1 ;
# xmm5=eps*Fp
1170 addsd xmm5
, xmm4 ;
# xmm5=VV
1171 mulsd xmm5
, xmm3 ;
# vcoul=qq*VV
1172 addsd xmm5
, [rsp
+ nb400nf_vctot
]
1173 movsd
[rsp
+ nb400nf_vctot
], xmm5
1175 .nb400nf_updateouterdata:
1177 mov esi
, [rsp
+ nb400nf_n
]
1178 ;
# get group index for i particle
1179 mov rdx
, [rbp
+ nb400nf_gid
] ;
# base of gid[]
1180 mov edx
, [rdx
+ rsi
*4] ;
# ggid=gid[n]
1182 ;
# accumulate total potential energy and update it
1183 movapd xmm7
, [rsp
+ nb400nf_vctot
]
1186 addsd xmm7
, xmm6 ;
# low xmm7 has the sum now
1188 ;
# add earlier value from mem
1189 mov rax
, [rbp
+ nb400nf_Vc
]
1190 addsd xmm7
, [rax
+ rdx
*8]
1192 movsd
[rax
+ rdx
*8], xmm7
1195 mov ecx
, [rsp
+ nb400nf_nn1
]
1196 ;
# esi already loaded with n
1199 jz
.nb400nf_outerend
1201 ;
# not last, iterate outer loop once more!
1202 mov
[rsp
+ nb400nf_n
], esi
1205 ;
# check if more outer neighborlists remain
1206 mov ecx
, [rsp
+ nb400nf_nri
]
1207 ;
# esi already loaded with n above
1210 ;
# non-zero, do one more workunit
1211 jmp
.nb400nf_threadloop
1214 mov eax
, [rsp
+ nb400nf_nouter
]
1215 mov ebx
, [rsp
+ nb400nf_ninner
]
1216 mov rcx
, [rbp
+ nb400nf_outeriter
]
1217 mov rdx
, [rbp
+ nb400nf_inneriter
]