3 ;
# Gromacs 4.0 Copyright (c) 1991-2003
4 ;
# David van der Spoel, Erik Lindahl
6 ;
# This program is free software; you can redistribute it and/or
7 ;
# modify it under the terms of the GNU General Public License
8 ;
# as published by the Free Software Foundation; either version 2
9 ;
# of the License, or (at your option) any later version.
11 ;
# To help us fund GROMACS development, we humbly ask that you cite
12 ;
# the research papers on the package. Check out http://www.gromacs.org
15 ;
# Gnomes, ROck Monsters And Chili Sauce
18 ;
# These files require GNU binutils 2.10 or later, since we
19 ;
# use intel syntax for portability, or a recent version
20 ;
# of NASM that understands Extended 3DNow and SSE2 instructions.
21 ;
# (NASM is normally only used with MS Visual C++).
22 ;
# Since NASM and gnu as disagree on some definitions and use
23 ;
# completely different preprocessing options I have to introduce a
24 ;
# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
25 ;
# Gnu as treats ';' as a line break, i.e. ignores it. This is the
26 ;
# reason why all comments need both symbols...
27 ;
# The source is written for GNU as, with intel syntax. When you use
28 ;
# NASM we redefine a couple of things. The false if-statement around
29 ;
# the following code is seen by GNU as, but NASM doesn't see it, so
30 ;
# the code inside is read by NASM but not by GNU as.
32 ;
.if 0 # block below only read by NASM
33 %define
.section section
37 ;
# NASM only wants 'dword', not 'dword ptr'.
42 ;
.endif # End of NASM-specific block
43 ;
.intel_syntax noprefix # Line only read by gnu as
;# Kernel entry point. The symbol is exported both with and without a
;# leading underscore; both labels mark the same address.
49 .globl nb_kernel130_x86_64_sse
50 .globl _nb_kernel130_x86_64_sse
51 nb_kernel130_x86_64_sse
:
52 _nb_kernel130_x86_64_sse
:
53 ;
# Room for return address and rbp (16 bytes)
;# The offsets in this first group are added to rbp by the code below
;# (e.g. [rbp + nb130_charge]). All values are multiples of 8.
54 .equiv nb130_fshift, 16
57 .equiv nb130_faction, 40
58 .equiv nb130_charge, 48
59 .equiv nb130_p_facel, 56
60 .equiv nb130_argkrf, 64
61 .equiv nb130_argcrf, 72
64 .equiv nb130_p_ntype, 96
65 .equiv nb130_vdwparam, 104
66 .equiv nb130_Vvdw, 112
67 .equiv nb130_p_tabscale, 120
68 .equiv nb130_VFtab, 128
69 .equiv nb130_invsqrta, 136
70 .equiv nb130_dvda, 144
71 .equiv nb130_p_gbtabscale, 152
72 .equiv nb130_GBtab, 160
73 .equiv nb130_p_nthreads, 168
74 .equiv nb130_count, 176
76 .equiv nb130_outeriter, 192
77 .equiv nb130_inneriter, 200
78 .equiv nb130_work, 208
79 ;
# stack offsets for local variables
80 ;
# bottom of stack is cache-aligned for sse use
;# The offsets in this second group are added to rsp, not rbp. Some values
;# coincide numerically with the rbp-relative group above (e.g. 176, 192)
;# but they are used with a different base register.
;# Entries up to nb130_three are multiples of 16 and are accessed with movaps.
92 .equiv nb130_vctot, 176
93 .equiv nb130_Vvdwtot, 192
97 .equiv nb130_half, 256
98 .equiv nb130_three, 272
;# nb130_nri .. nb130_innerjjnr are spaced 8 bytes apart.
100 .equiv nb130_nri, 336
101 .equiv nb130_iinr, 344
102 .equiv nb130_jindex, 352
103 .equiv nb130_jjnr, 360
104 .equiv nb130_shift, 368
105 .equiv nb130_shiftvec, 376
106 .equiv nb130_facel, 384
107 .equiv nb130_innerjjnr, 392
;# From here on the entries are 32-bit values written with dword stores.
108 .equiv nb130_is3, 400
109 .equiv nb130_ii3, 404
110 .equiv nb130_ntia, 408
111 .equiv nb130_innerk, 412
113 .equiv nb130_nn1, 420
114 .equiv nb130_ntype, 424
115 .equiv nb130_nouter, 428
;# NOTE(review): nb130_ninner (432) equals the amount subtracted from rsp
;# for local variables (sub rsp, 432), so the dword stored at
;# [rsp + nb130_ninner] starts exactly at the end of that area.
;# TODO confirm it cannot overwrite the saved xmm register slot above it.
116 .equiv nb130_ninner, 432
121 ;
# Push integer registers on stack
130 ;
# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
133 ;
# Save xmm registers to stack
;# One 16-byte slot per register: xmm7 at rsp+16 up to xmm15 at rsp+144.
135 movaps
[rsp
+ 16 ], xmm7
136 movaps
[rsp
+ 32 ], xmm8
137 movaps
[rsp
+ 48 ], xmm9
138 movaps
[rsp
+ 64 ], xmm10
139 movaps
[rsp
+ 80 ], xmm11
140 movaps
[rsp
+ 96 ], xmm12
141 movaps
[rsp
+ 112], xmm13
142 movaps
[rsp
+ 128], xmm14
143 movaps
[rsp
+ 144], xmm15
;# Allocate the area addressed by the rsp-relative nb130_* offsets.
;# NOTE(review): 432 is a multiple of 16 (27*16), not of the form n*16+8
;# given in the comment below; the movaps accesses to rsp-relative locals
;# require 16-byte aligned addresses. TODO confirm which is intended.
146 sub rsp
, 432 ;
# local variable stack space (n*16+8)
147 ;
.if 0 # block below only read by NASM - special calling convention on win64
148 %ifidn __OUTPUT_FORMAT__
, win64
149 ;
# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
151 ;
# Adjust stack pointer for different alignment
152 ;
# Move around arguments to fit AMD64 convention below
153 ;
# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
154 ;
# win64 passes args in: rcx,rdx,r8,r9 + stack
162 ;
.endif # end NASM- and win64-specific block
164 ;
# zero 32-bit iteration counters
166 mov
[rsp
+ nb130_nouter
], eax
167 mov
[rsp
+ nb130_ninner
], eax
170 mov
[rsp
+ nb130_nri
], edi
171 mov
[rsp
+ nb130_iinr
], rsi
172 mov
[rsp
+ nb130_jindex
], rdx
173 mov
[rsp
+ nb130_jjnr
], rcx
174 mov
[rsp
+ nb130_shift
], r8
175 mov
[rsp
+ nb130_shiftvec
], r9
176 mov rdi
, [rbp
+ nb130_p_ntype
]
178 mov
[rsp
+ nb130_ntype
], edi
179 mov rsi
, [rbp
+ nb130_p_facel
]
181 movss
[rsp
+ nb130_facel
], xmm0
183 mov rax
, [rbp
+ nb130_p_tabscale
]
186 movaps
[rsp
+ nb130_tsc
], xmm3
188 ;
# create constant floating-point factors on stack
189 mov eax
, 0x3f000000 ;
# half in IEEE (hex)
190 mov
[rsp
+ nb130_half
], eax
191 movss xmm1
, [rsp
+ nb130_half
]
192 shufps xmm1
, xmm1
, 0 ;
# splat to all elements
194 addps xmm2
, xmm2 ;
# one
196 addps xmm2
, xmm2 ;
# two
197 addps xmm3
, xmm2 ;
# three
198 movaps
[rsp
+ nb130_half
], xmm1
199 movaps
[rsp
+ nb130_two
], xmm2
200 movaps
[rsp
+ nb130_three
], xmm3
;# Claim the next work unit from the shared counter: nn0 is the current
;# counter value (in eax), nn1 = nn0+1 is written back with cmpxchg.
203 mov rsi
, [rbp
+ nb130_count
] ;
# pointer to sync counter
206 mov ebx
, eax ;
# ebx=*count=nn0
207 add ebx
, 1 ;
# ebx=nn1=nn0+1
;# cmpxchg compares eax with [rsi]: on a match it stores ebx to [rsi],
;# otherwise it loads the current [rsi] into eax.
209 cmpxchg
[rsi
], ebx ;
# write nn1 to *counter,
210 ;
# if it hasn't changed.
211 ;
# or reread *counter to eax.
212 pause ;
# -> better p4 performance
215 ;
# if(nn1>nri) nn1=nri
216 mov ecx
, [rsp
+ nb130_nri
]
219 cmovle ebx
, edx ;
# if(nn1>nri) nn1=nri
220 ;
# Cleared the spinlock if we got here.
221 ;
# eax contains nn0, ebx contains nn1.
222 mov
[rsp
+ nb130_n
], eax
223 mov
[rsp
+ nb130_nn1
], ebx
224 sub ebx
, eax ;
# calc number of outer lists
225 mov esi
, eax ;
# copy n to esi
230 ;
# ebx contains number of outer iterations
231 add ebx
, [rsp
+ nb130_nouter
]
232 mov
[rsp
+ nb130_nouter
], ebx
235 mov rax
, [rsp
+ nb130_shift
] ;
# eax = pointer into shift[]
236 mov ebx
, [rax
+ rsi
*4] ;
# ebx=shift[n]
238 lea rbx
, [rbx
+ rbx
*2] ;
# rbx=3*is
239 mov
[rsp
+ nb130_is3
],ebx ;
# store is3
241 mov rax
, [rsp
+ nb130_shiftvec
] ;
# eax = base of shiftvec[]
243 movss xmm0
, [rax
+ rbx
*4]
244 movss xmm1
, [rax
+ rbx
*4 + 4]
245 movss xmm2
, [rax
+ rbx
*4 + 8]
247 mov rcx
, [rsp
+ nb130_iinr
] ;
# ecx = pointer into iinr[]
248 mov ebx
, [rcx
+ rsi
*4] ;
# ebx =ii
250 mov rdx
, [rbp
+ nb130_charge
]
251 movss xmm3
, [rdx
+ rbx
*4]
252 mulss xmm3
, [rsp
+ nb130_facel
]
255 mov rdx
, [rbp
+ nb130_type
]
256 mov edx
, [rdx
+ rbx
*4]
257 imul edx
, [rsp
+ nb130_ntype
]
259 mov
[rsp
+ nb130_ntia
], edx
261 lea rbx
, [rbx
+ rbx
*2] ;
# rbx = 3*ii=ii3
262 mov rax
, [rbp
+ nb130_pos
] ;
# eax = base of pos[]
264 addss xmm0
, [rax
+ rbx
*4]
265 addss xmm1
, [rax
+ rbx
*4 + 4]
266 addss xmm2
, [rax
+ rbx
*4 + 8]
268 movaps
[rsp
+ nb130_iq
], xmm3
274 movaps
[rsp
+ nb130_ix
], xmm0
275 movaps
[rsp
+ nb130_iy
], xmm1
276 movaps
[rsp
+ nb130_iz
], xmm2
278 mov
[rsp
+ nb130_ii3
], ebx
280 ;
# clear vctot and i forces
282 movaps
[rsp
+ nb130_vctot
], xmm4
283 movaps
[rsp
+ nb130_Vvdwtot
], xmm4
284 movaps
[rsp
+ nb130_fix
], xmm4
285 movaps
[rsp
+ nb130_fiy
], xmm4
286 movaps
[rsp
+ nb130_fiz
], xmm4
288 mov rax
, [rsp
+ nb130_jindex
]
289 mov ecx
, [rax
+ rsi
*4] ;
# jindex[n]
290 mov edx
, [rax
+ rsi
*4 + 4] ;
# jindex[n+1]
291 sub edx
, ecx ;
# number of innerloop atoms
293 mov rax
, [rsp
+ nb130_jjnr
]
296 mov
[rsp
+ nb130_innerjjnr
], rax ;
# pointer to jjnr[nj0]
299 add ecx
, [rsp
+ nb130_ninner
]
300 mov
[rsp
+ nb130_ninner
], ecx
302 mov
[rsp
+ nb130_innerk
], edx ;
# number of innerloop atoms
303 jge
.nb130_unroll_loop
304 jmp
.nb130_finish_inner
306 ;
# quad-unroll innerloop here
307 mov rdx
, [rsp
+ nb130_innerjjnr
] ;
# pointer to jjnr[k]
311 mov edx
, [rdx
+ 12] ;
# eax-edx=jnr1-4
312 add qword ptr
[rsp
+ nb130_innerjjnr
], 16 ;
# advance pointer (unrolled 4)
314 mov rsi
, [rbp
+ nb130_charge
]
315 movss xmm0
, [rsi
+ rax
*4]
316 movss xmm1
, [rsi
+ rcx
*4]
317 movss xmm2
, [rsi
+ rbx
*4]
318 movss xmm3
, [rsi
+ rdx
*4]
320 unpcklps xmm0
, xmm1 ;
# jqa jqc - -
321 unpcklps xmm2
, xmm3 ;
# jqb jqd - -
322 unpcklps xmm0
, xmm2 ;
# jqa jqb jqc jqd
323 mulps xmm0
, [rsp
+ nb130_iq
]
324 movaps
[rsp
+ nb130_qq
], xmm0
327 mov rsi
, [rbp
+ nb130_type
]
328 mov r12d
, [rsi
+ rax
*4]
329 mov r13d
, [rsi
+ rbx
*4]
330 mov r14d
, [rsi
+ rcx
*4]
331 mov r15d
, [rsi
+ rdx
*4]
336 mov edi
, [rsp
+ nb130_ntia
]
342 mov rsi
, [rbp
+ nb130_vdwparam
]
343 movlps xmm3
, [rsi
+ r12*4]
344 movlps xmm7
, [rsi
+ r14*4]
345 movhps xmm3
, [rsi
+ r13*4]
346 movhps xmm7
, [rsi
+ r15*4]
349 shufps xmm0
, xmm7
, 136 ;
# 10001000
350 shufps xmm3
, xmm7
, 221 ;
# 11011101
352 movaps
[rsp
+ nb130_c6
], xmm0
353 movaps
[rsp
+ nb130_c12
], xmm3
;# Gather the coordinates of the four j atoms and form dx,dy,dz.
;# Index registers: rax = atom 1 (a), rbx = atom 2 (b),
;# rcx = atom 3 (c), rdx = atom 4 (d).
355 lea rax
, [rax
+ rax
*2] ;
# replace jnr with j3
356 lea rbx
, [rbx
+ rbx
*2]
357 lea rcx
, [rcx
+ rcx
*2]
358 lea rdx
, [rdx
+ rdx
*2]
360 mov rdi
, [rbp
+ nb130_pos
]
362 movlps xmm1
, [rdi
+ rax
*4] ;
# x1 y1 - -
363 movlps xmm2
, [rdi
+ rcx
*4] ;
# x3 y3 - -
364 movhps xmm1
, [rdi
+ rbx
*4] ;
# x1 y1 x2 y2
365 movhps xmm2
, [rdi
+ rdx
*4] ;
# x3 y3 x4 y4
367 movss xmm5
, [rdi
+ rax
*4 + 8] ;
# z1 - - -
368 movss xmm6
, [rdi
+ rcx
*4 + 8] ;
# z3 - - -
369 movss xmm7
, [rdi
+ rbx
*4 + 8] ;
# z2 - - -
370 movss xmm8
, [rdi
+ rdx
*4 + 8] ;
# z4 - - -
371 movlhps xmm5
, xmm7 ;
# jza - jzb -
372 movlhps xmm6
, xmm8 ;
# jzc - jzd -
375 unpcklps xmm1
, xmm2 ;
# jxa jxc jya jyc
376 unpckhps xmm4
, xmm2 ;
# jxb jxd jyb jyd
378 unpcklps xmm1
, xmm4 ;
# x
379 unpckhps xmm2
, xmm4 ;
# y
;# shufps imm 136 picks elements 0 and 2 of each source register.
380 shufps xmm5
, xmm6
, 136 ;
# 10001000 => jza jzb jzc jzd
;# Subtract the i-atom coordinates and keep the differences in dx,dy,dz.
383 subps xmm1
, [rsp
+ nb130_ix
]
384 subps xmm2
, [rsp
+ nb130_iy
]
385 subps xmm5
, [rsp
+ nb130_iz
]
388 movaps
[rsp
+ nb130_dx
], xmm1
389 movaps
[rsp
+ nb130_dy
], xmm2
390 movaps
[rsp
+ nb130_dz
], xmm5
401 ;
# calculate rinv=1/sqrt(rsq)
;# The constants come from the nb130_three and nb130_half stack slots.
405 movaps xmm4
, [rsp
+ nb130_three
]
406 mulps xmm5
, xmm1 ;
# rsq*lu*lu
407 subps xmm4
, xmm5 ;
# 3.0-rsq*lu*lu
409 mulps xmm4
, [rsp
+ nb130_half
]
;# Scale by the table scale factor to get the table coordinate.
415 mulps xmm1
, [rsp
+ nb130_tsc
] ;
# rtab
417 ;
# truncate and convert to integers
420 ;
# convert back to float
429 ;
# move to integer registers
441 mov rsi
, [rbp
+ nb130_VFtab
]
442 ;
# calculate LJ table
443 movlps xmm5
, [rsi
+ r8*4]
444 movlps xmm9
, [rsi
+ r8*4 + 16]
446 movlps xmm7
, [rsi
+ r10*4]
447 movlps xmm11
, [rsi
+ r10*4 + 16]
449 movaps xmm0
, xmm2 ;
# rinv
450 mulps xmm2
, [rsp
+ nb130_qq
] ;
# vcoul=rinv*qq
451 movaps xmm3
, xmm2 ;
# copy of vcoul (to calc fscal)
452 mulps xmm3
, xmm0 ;
# vcoul*rinv
454 movhps xmm5
, [rsi
+ r9*4]
455 movhps xmm9
, [rsi
+ r9*4 + 16]
457 addps xmm2
, [rsp
+ nb130_vctot
]
458 movaps
[rsp
+ nb130_vctot
], xmm2
460 movhps xmm7
, [rsi
+ r11*4]
461 movhps xmm11
, [rsi
+ r11*4 + 16]
465 shufps xmm4
, xmm7
, 136 ;
# 10001000
466 shufps xmm8
, xmm11
, 136 ;
# 10001000
467 shufps xmm5
, xmm7
, 221 ;
# 11011101
468 shufps xmm9
, xmm11
, 221 ;
# 11011101
470 movlps xmm7
, [rsi
+ r8*4 + 8]
471 movlps xmm11
, [rsi
+ r8*4 + 24]
473 movlps xmm13
, [rsi
+ r10*4 + 8]
474 movlps xmm14
, [rsi
+ r10*4 + 24]
476 movhps xmm7
, [rsi
+ r9*4 + 8]
477 movhps xmm11
, [rsi
+ r9*4 + 24]
479 movhps xmm13
, [rsi
+ r11*4 + 8]
480 movhps xmm14
, [rsi
+ r11*4 + 24]
485 shufps xmm6
, xmm13
, 136 ;
# 10001000
486 shufps xmm10
, xmm14
, 136 ;
# 10001000
487 shufps xmm7
, xmm13
, 221 ;
# 11011101
488 shufps xmm11
, xmm14
, 221 ;
# 11011101
489 ;
# dispersion table in xmm4-xmm7, repulsion table in xmm8-xmm11
491 mulps xmm7
, xmm1 ;
# Heps
493 mulps xmm6
, xmm1 ;
# Geps
495 mulps xmm7
, xmm1 ;
# Heps2
497 addps xmm5
, xmm6 ;
# F+Geps
499 addps xmm5
, xmm7 ;
# F+Geps+Heps2 = Fp
501 addps xmm7
, xmm7 ;
# 2*Heps2
503 addps xmm7
, xmm6 ;
# 2*Heps2+Geps
506 addps xmm7
, xmm5 ;
# FF = Fp + 2*Heps2 + Geps
508 mulps xmm5
, xmm1 ;
# eps*Fp
510 movaps xmm12
, [rsp
+ nb130_c6
]
511 movaps xmm13
, [rsp
+ nb130_c12
]
512 addps xmm5
, xmm4 ;
# VV
515 mulps xmm5
, xmm12 ;
# VV*c6 = vnb6
516 mulps xmm9
, xmm13 ;
# VV*c12 = vnb12
518 addps xmm5
, [rsp
+ nb130_Vvdwtot
]
519 movaps
[rsp
+ nb130_Vvdwtot
], xmm5
521 mulps xmm7
, xmm12 ;
# FF*c6 = fnb6
522 mulps xmm11
, xmm13 ;
# FF*c12 = fnb12
525 mulps xmm7
, [rsp
+ nb130_tsc
]
527 mulps xmm3
, xmm0 ;
# fscal
533 movaps xmm12
, [rsp
+ nb130_fix
]
534 movaps xmm13
, [rsp
+ nb130_fiy
]
535 movaps xmm14
, [rsp
+ nb130_fiz
]
537 mulps xmm9
, [rsp
+ nb130_dx
]
538 mulps xmm10
, [rsp
+ nb130_dy
]
539 mulps xmm11
, [rsp
+ nb130_dz
]
541 ;
# accumulate i forces
545 movaps
[rsp
+ nb130_fix
], xmm12
546 movaps
[rsp
+ nb130_fiy
], xmm13
547 movaps
[rsp
+ nb130_fiz
], xmm14
549 mov rsi
, [rbp
+ nb130_faction
]
550 ;
# the fj's - start by accumulating x & y forces from memory
551 movlps xmm0
, [rsi
+ rax
*4] ;
# x1 y1 - -
552 movlps xmm1
, [rsi
+ rcx
*4] ;
# x3 y3 - -
553 movhps xmm0
, [rsi
+ rbx
*4] ;
# x1 y1 x2 y2
554 movhps xmm1
, [rsi
+ rdx
*4] ;
# x3 y3 x4 y4
557 unpcklps xmm9
, xmm10 ;
# x1 y1 x2 y2
558 unpckhps xmm8
, xmm10 ;
# x3 y3 x4 y4
560 ;
# update fjx and fjy
564 movlps
[rsi
+ rax
*4], xmm0
565 movlps
[rsi
+ rcx
*4], xmm1
566 movhps
[rsi
+ rbx
*4], xmm0
567 movhps
[rsi
+ rdx
*4], xmm1
569 ;
# xmm11: fjz1 fjz2 fjz3 fjz4
570 pshufd xmm10
, xmm11
, 1 ;
# fjz2 - - -
571 movhlps xmm9
, xmm11 ;
# fjz3 - - -
572 pshufd xmm8
, xmm11
, 3 ;
# fjz4 - - -
574 addss xmm11
, [rsi
+ rax
*4 + 8]
575 addss xmm10
, [rsi
+ rbx
*4 + 8]
576 addss xmm9
, [rsi
+ rcx
*4 + 8]
577 addss xmm8
, [rsi
+ rdx
*4 + 8]
578 movss
[rsi
+ rax
*4 + 8], xmm11
579 movss
[rsi
+ rbx
*4 + 8], xmm10
580 movss
[rsi
+ rcx
*4 + 8], xmm9
581 movss
[rsi
+ rdx
*4 + 8], xmm8
583 ;
# should we do one more iteration?
584 sub dword ptr
[rsp
+ nb130_innerk
], 4
585 jl
.nb130_finish_inner
586 jmp
.nb130_unroll_loop
588 ;
# check if at least two particles remain
589 add dword ptr
[rsp
+ nb130_innerk
], 4
590 mov edx
, [rsp
+ nb130_innerk
]
593 jmp
.nb130_checksingle
595 mov rcx
, [rsp
+ nb130_innerjjnr
]
599 add qword ptr
[rsp
+ nb130_innerjjnr
], 8
601 mov rsi
, [rbp
+ nb130_charge
]
602 movss xmm0
, [rsi
+ rax
*4]
603 movss xmm2
, [rsi
+ rbx
*4]
605 unpcklps xmm0
, xmm2 ;
# jqa jqb
606 mulps xmm0
, [rsp
+ nb130_iq
]
607 movaps
[rsp
+ nb130_qq
], xmm0
609 mov rsi
, [rbp
+ nb130_type
]
611 mov r12d
, [rsi
+ rax
*4]
612 mov r13d
, [rsi
+ rbx
*4]
615 mov edi
, [rsp
+ nb130_ntia
]
619 mov rsi
, [rbp
+ nb130_vdwparam
]
620 movlps xmm3
, [rsi
+ r12*4]
621 movhps xmm3
, [rsi
+ r13*4]
625 shufps xmm0
, xmm7
, 136 ;
# 10001000
626 shufps xmm3
, xmm7
, 221 ;
# 11011101
628 movaps
[rsp
+ nb130_c6
], xmm0
629 movaps
[rsp
+ nb130_c12
], xmm3
631 lea rax
, [rax
+ rax
*2] ;
# replace jnr with j3
632 lea rbx
, [rbx
+ rbx
*2]
635 mov rdi
, [rbp
+ nb130_pos
]
637 movlps xmm1
, [rdi
+ rax
*4] ;
# x1 y1 - -
638 movlps xmm2
, [rdi
+ rbx
*4] ;
# x2 y2 - -
640 movss xmm5
, [rdi
+ rax
*4 + 8] ;
# z1 - - -
641 movss xmm6
, [rdi
+ rbx
*4 + 8] ;
# z2 - - -
643 unpcklps xmm1
, xmm2 ;
# x1 x2 y1 y2
644 movhlps xmm2
, xmm1 ;
# y1 y2 - -
645 unpcklps xmm5
, xmm6 ;
# z1 z2 - -
648 subps xmm1
, [rsp
+ nb130_ix
]
649 subps xmm2
, [rsp
+ nb130_iy
]
650 subps xmm5
, [rsp
+ nb130_iz
]
653 movaps
[rsp
+ nb130_dx
], xmm1
654 movaps
[rsp
+ nb130_dy
], xmm2
655 movaps
[rsp
+ nb130_dz
], xmm5
666 ;
# calculate rinv=1/sqrt(rsq)
;# The constants come from the nb130_three and nb130_half stack slots.
670 movaps xmm4
, [rsp
+ nb130_three
]
671 mulps xmm5
, xmm1 ;
# rsq*lu*lu
672 subps xmm4
, xmm5 ;
# 3.0-rsq*lu*lu
674 mulps xmm4
, [rsp
+ nb130_half
]
;# Scale by the table scale factor to get the table coordinate.
680 mulps xmm1
, [rsp
+ nb130_tsc
] ;
# rtab
682 ;
# truncate and convert to integers
685 ;
# convert back to float
694 ;
# move to integer registers
702 mov rsi
, [rbp
+ nb130_VFtab
]
703 ;
# calculate LJ table
704 movlps xmm4
, [rsi
+ r8*4]
705 movlps xmm5
, [rsi
+ r9*4]
710 movlps xmm6
, [rsi
+ r8*4 + 8]
711 movlps xmm7
, [rsi
+ r9*4 + 8]
713 movaps xmm0
, xmm2 ;
# rinv
714 mulps xmm2
, [rsp
+ nb130_qq
] ;
# vcoul=rinv*qq
715 movaps xmm3
, xmm2 ;
# copy of vcoul (to calc fscal)
716 mulps xmm3
, xmm0 ;
# vcoul*rinv
721 movlps xmm8
, [rsi
+ r8*4 + 16]
722 movlps xmm9
, [rsi
+ r9*4 + 16]
727 addps xmm2
, [rsp
+ nb130_vctot
]
728 movlps
[rsp
+ nb130_vctot
], xmm2
730 movlps xmm10
, [rsi
+ r8*4 + 24]
731 movlps xmm11
, [rsi
+ r9*4 + 24]
733 unpcklps xmm10
, xmm11
735 ;
# dispersion table in xmm4-xmm7, repulsion table in xmm8-xmm11
737 mulps xmm7
, xmm1 ;
# Heps
739 mulps xmm6
, xmm1 ;
# Geps
741 mulps xmm7
, xmm1 ;
# Heps2
743 addps xmm5
, xmm6 ;
# F+Geps
745 addps xmm5
, xmm7 ;
# F+Geps+Heps2 = Fp
747 addps xmm7
, xmm7 ;
# 2*Heps2
749 addps xmm7
, xmm6 ;
# 2*Heps2+Geps
752 addps xmm7
, xmm5 ;
# FF = Fp + 2*Heps2 + Geps
754 mulps xmm5
, xmm1 ;
# eps*Fp
756 movaps xmm12
, [rsp
+ nb130_c6
]
757 movaps xmm13
, [rsp
+ nb130_c12
]
758 addps xmm5
, xmm4 ;
# VV
761 mulps xmm5
, xmm12 ;
# VV*c6 = vnb6
762 mulps xmm9
, xmm13 ;
# VV*c12 = vnb12
764 addps xmm5
, [rsp
+ nb130_Vvdwtot
]
765 movlps
[rsp
+ nb130_Vvdwtot
], xmm5
767 mulps xmm7
, xmm12 ;
# FF*c6 = fnb6
768 mulps xmm11
, xmm13 ;
# FF*c12 = fnb12
771 mulps xmm7
, [rsp
+ nb130_tsc
]
773 mulps xmm3
, xmm0 ;
# fscal
781 movaps xmm12
, [rsp
+ nb130_fix
]
782 movaps xmm13
, [rsp
+ nb130_fiy
]
783 movaps xmm14
, [rsp
+ nb130_fiz
]
785 mulps xmm9
, [rsp
+ nb130_dx
]
786 mulps xmm10
, [rsp
+ nb130_dy
]
787 mulps xmm11
, [rsp
+ nb130_dz
]
793 ;
# accumulate i forces
797 movaps
[rsp
+ nb130_fix
], xmm12
798 movaps
[rsp
+ nb130_fiy
], xmm13
799 movaps
[rsp
+ nb130_fiz
], xmm14
801 mov rsi
, [rbp
+ nb130_faction
]
802 ;
# the fj's - start by accumulating x & y forces from memory
803 movlps xmm0
, [rsi
+ rax
*4] ;
# x1 y1 - -
804 movhps xmm0
, [rsi
+ rbx
*4] ;
# x1 y1 x2 y2
806 unpcklps xmm9
, xmm10 ;
# x1 y1 x2 y2
809 movlps
[rsi
+ rax
*4], xmm0
810 movhps
[rsi
+ rbx
*4], xmm0
813 pshufd xmm8
, xmm11
, 1
814 addss xmm11
, [rsi
+ rax
*4 + 8]
815 addss xmm8
, [rsi
+ rbx
*4 + 8]
816 movss
[rsi
+ rax
*4 + 8], xmm11
817 movss
[rsi
+ rbx
*4 + 8], xmm8
820 mov edx
, [rsp
+ nb130_innerk
]
823 jmp
.nb130_updateouterdata
825 mov rdi
, [rbp
+ nb130_pos
]
826 mov rcx
, [rsp
+ nb130_innerjjnr
]
829 mov rsi
, [rbp
+ nb130_charge
]
830 movss xmm0
, [rsi
+ rax
*4]
832 mulss xmm0
, [rsp
+ nb130_iq
]
833 movaps
[rsp
+ nb130_qq
], xmm0
835 mov rsi
, [rbp
+ nb130_type
]
837 mov r12d
, [rsi
+ rax
*4]
839 mov edi
, [rsp
+ nb130_ntia
]
842 mov rsi
, [rbp
+ nb130_vdwparam
]
843 movss xmm0
, [rsi
+ r12*4]
844 movss xmm3
, [rsi
+ r12*4 + 4]
846 movaps
[rsp
+ nb130_c6
], xmm0
847 movaps
[rsp
+ nb130_c12
], xmm3
849 lea rax
, [rax
+ rax
*2] ;
# replace jnr with j3
851 mov rdi
, [rbp
+ nb130_pos
]
853 movss xmm1
, [rdi
+ rax
*4]
854 movss xmm2
, [rdi
+ rax
*4 + 4]
855 movss xmm5
, [rdi
+ rax
*4 + 8]
858 subss xmm1
, [rsp
+ nb130_ix
]
859 subss xmm2
, [rsp
+ nb130_iy
]
860 subss xmm5
, [rsp
+ nb130_iz
]
863 movaps
[rsp
+ nb130_dx
], xmm1
864 movaps
[rsp
+ nb130_dy
], xmm2
865 movaps
[rsp
+ nb130_dz
], xmm5
876 ;
# calculate rinv=1/sqrt(rsq)
;# Scalar (ss) form: only the lowest element of each register is updated.
880 movaps xmm4
, [rsp
+ nb130_three
]
881 mulss xmm5
, xmm1 ;
# rsq*lu*lu
882 subss xmm4
, xmm5 ;
# 3.0-rsq*lu*lu
884 mulss xmm4
, [rsp
+ nb130_half
]
;# Scale by the table scale factor to get the table coordinate.
890 mulss xmm1
, [rsp
+ nb130_tsc
] ;
# rtab
892 ;
# truncate and convert to integers
895 ;
# convert back to float
907 mov rsi
, [rbp
+ nb130_VFtab
]
908 ;
# calculate LJ table
909 movss xmm4
, [rsi
+ r8*4]
910 movss xmm5
, [rsi
+ r8*4 + 4]
911 movss xmm6
, [rsi
+ r8*4 + 8]
912 movss xmm7
, [rsi
+ r8*4 + 12]
913 movss xmm8
, [rsi
+ r8*4 + 16]
914 movss xmm9
, [rsi
+ r8*4 + 20]
915 movss xmm10
, [rsi
+ r8*4 + 24]
916 movss xmm11
, [rsi
+ r8*4 + 28]
917 ;
# dispersion table in xmm4-xmm7, repulsion table in xmm8-xmm11
919 ;
# coulomb interaction
920 movaps xmm0
, xmm2 ;
# rinv
921 mulss xmm2
, [rsp
+ nb130_qq
] ;
# vcoul=rinv*qq
922 movaps xmm3
, xmm2 ;
# copy of vcoul (to calc fscal)
923 mulss xmm3
, xmm0 ;
# vcoul*rinv
925 addss xmm2
, [rsp
+ nb130_vctot
]
926 movss
[rsp
+ nb130_vctot
], xmm2
928 ;
# calculate table interaction
929 mulss xmm7
, xmm1 ;
# Heps
931 mulss xmm6
, xmm1 ;
# Geps
933 mulss xmm7
, xmm1 ;
# Heps2
935 addss xmm5
, xmm6 ;
# F+Geps
937 addss xmm5
, xmm7 ;
# F+Geps+Heps2 = Fp
939 addss xmm7
, xmm7 ;
# 2*Heps2
941 addss xmm7
, xmm6 ;
# 2*Heps2+Geps
944 addss xmm7
, xmm5 ;
# FF = Fp + 2*Heps2 + Geps
946 mulss xmm5
, xmm1 ;
# eps*Fp
948 movaps xmm12
, [rsp
+ nb130_c6
]
949 movaps xmm13
, [rsp
+ nb130_c12
]
950 addss xmm5
, xmm4 ;
# VV
953 mulss xmm5
, xmm12 ;
# VV*c6 = vnb6
954 mulss xmm9
, xmm13 ;
# VV*c12 = vnb12
956 addss xmm5
, [rsp
+ nb130_Vvdwtot
]
957 movss
[rsp
+ nb130_Vvdwtot
], xmm5
959 mulss xmm7
, xmm12 ;
# FF*c6 = fnb6
960 mulss xmm11
, xmm13 ;
# FF*c12 = fnb12
963 mulss xmm7
, [rsp
+ nb130_tsc
]
965 mulss xmm3
, xmm0 ;
# fscal
971 movaps xmm12
, [rsp
+ nb130_fix
]
972 movaps xmm13
, [rsp
+ nb130_fiy
]
973 movaps xmm14
, [rsp
+ nb130_fiz
]
975 mulss xmm9
, [rsp
+ nb130_dx
]
976 mulss xmm10
, [rsp
+ nb130_dy
]
977 mulss xmm11
, [rsp
+ nb130_dz
]
979 ;
# accumulate i forces
983 movaps
[rsp
+ nb130_fix
], xmm12
984 movaps
[rsp
+ nb130_fiy
], xmm13
985 movaps
[rsp
+ nb130_fiz
], xmm14
987 mov rsi
, [rbp
+ nb130_faction
]
989 addss xmm9
, [rsi
+ rax
*4]
990 addss xmm10
, [rsi
+ rax
*4 + 4]
991 addss xmm11
, [rsi
+ rax
*4 + 8]
992 movss
[rsi
+ rax
*4], xmm9
993 movss
[rsi
+ rax
*4 + 4], xmm10
994 movss
[rsi
+ rax
*4 + 8], xmm11
996 .nb130_updateouterdata:
997 mov ecx
, [rsp
+ nb130_ii3
]
998 mov rdi
, [rbp
+ nb130_faction
]
999 mov rsi
, [rbp
+ nb130_fshift
]
1000 mov edx
, [rsp
+ nb130_is3
]
1002 ;
# accumulate i forces in xmm0, xmm1, xmm2
1003 movaps xmm0
, [rsp
+ nb130_fix
]
1004 movaps xmm1
, [rsp
+ nb130_fiy
]
1005 movaps xmm2
, [rsp
+ nb130_fiz
]
1012 addps xmm2
, xmm5 ;
# sum is in 1/2 in xmm0-xmm2
1018 shufps xmm3
, xmm3
, 1
1019 shufps xmm4
, xmm4
, 1
1020 shufps xmm5
, xmm5
, 1
1023 addss xmm2
, xmm5 ;
# xmm0-xmm2 has single force in pos0
1025 ;
# increment i force
1026 movss xmm3
, [rdi
+ rcx
*4]
1027 movss xmm4
, [rdi
+ rcx
*4 + 4]
1028 movss xmm5
, [rdi
+ rcx
*4 + 8]
1032 movss
[rdi
+ rcx
*4], xmm3
1033 movss
[rdi
+ rcx
*4 + 4], xmm4
1034 movss
[rdi
+ rcx
*4 + 8], xmm5
1036 ;
# increment fshift force
1037 movss xmm3
, [rsi
+ rdx
*4]
1038 movss xmm4
, [rsi
+ rdx
*4 + 4]
1039 movss xmm5
, [rsi
+ rdx
*4 + 8]
1043 movss
[rsi
+ rdx
*4], xmm3
1044 movss
[rsi
+ rdx
*4 + 4], xmm4
1045 movss
[rsi
+ rdx
*4 + 8], xmm5
1048 mov esi
, [rsp
+ nb130_n
]
1049 ;
# get group index for i particle
1050 mov rdx
, [rbp
+ nb130_gid
] ;
# base of gid[]
1051 mov edx
, [rdx
+ rsi
*4] ;
# ggid=gid[n]
1053 ;
# accumulate total potential energy and update it
1054 movaps xmm7
, [rsp
+ nb130_vctot
]
1057 addps xmm7
, xmm6 ;
# pos 0-1 in xmm7 have the sum now
1059 shufps xmm6
, xmm6
, 1
1062 ;
# add earlier value from mem
1063 mov rax
, [rbp
+ nb130_Vc
]
1064 addss xmm7
, [rax
+ rdx
*4]
1066 movss
[rax
+ rdx
*4], xmm7
1068 ;
# accumulate total lj energy and update it
1069 movaps xmm7
, [rsp
+ nb130_Vvdwtot
]
1072 addps xmm7
, xmm6 ;
# pos 0-1 in xmm7 have the sum now
1074 shufps xmm6
, xmm6
, 1
1077 ;
# add earlier value from mem
1078 mov rax
, [rbp
+ nb130_Vvdw
]
1079 addss xmm7
, [rax
+ rdx
*4]
1081 movss
[rax
+ rdx
*4], xmm7
1084 mov ecx
, [rsp
+ nb130_nn1
]
1085 ;
# esi already loaded with n
1090 ;
# not last, iterate outer loop once more!
1091 mov
[rsp
+ nb130_n
], esi
1094 ;
# check if more outer neighborlists remain
1095 mov ecx
, [rsp
+ nb130_nri
]
1096 ;
# esi already loaded with n above
1099 ;
# non-zero, do one more workunit
1100 jmp
.nb130_threadloop
1102 mov eax
, [rsp
+ nb130_nouter
]
1103 mov ebx
, [rsp
+ nb130_ninner
]
1104 mov rcx
, [rbp
+ nb130_outeriter
]
1105 mov rdx
, [rbp
+ nb130_inneriter
]
1112 ;
# Restore xmm registers from stack
;# Reloads xmm7..xmm15 from the same rsp+16 .. rsp+144 slots that the
;# prologue stored them to.
1114 movaps xmm7
, [rsp
+ 16 ]
1115 movaps xmm8
, [rsp
+ 32 ]
1116 movaps xmm9
, [rsp
+ 48 ]
1117 movaps xmm10
, [rsp
+ 64 ]
1118 movaps xmm11
, [rsp
+ 80 ]
1119 movaps xmm12
, [rsp
+ 96 ]
1120 movaps xmm13
, [rsp
+ 112]
1121 movaps xmm14
, [rsp
+ 128]
1122 movaps xmm15
, [rsp
+ 144]
1124 ;
# Reset pointers after restoring xmm6-15
1143 .globl nb_kernel130nf_x86_64_sse
1144 .globl _nb_kernel130nf_x86_64_sse
1145 nb_kernel130nf_x86_64_sse
:
1146 _nb_kernel130nf_x86_64_sse
:
1147 ;
# Room for return address and rbp (16 bytes)
1148 .equiv nb130nf_fshift, 16
1149 .equiv nb130nf_gid, 24
1150 .equiv nb130nf_pos, 32
1151 .equiv nb130nf_faction, 40
1152 .equiv nb130nf_charge, 48
1153 .equiv nb130nf_p_facel, 56
1154 .equiv nb130nf_argkrf, 64
1155 .equiv nb130nf_argcrf, 72
1156 .equiv nb130nf_Vc, 80
1157 .equiv nb130nf_type, 88
1158 .equiv nb130nf_p_ntype, 96
1159 .equiv nb130nf_vdwparam, 104
1160 .equiv nb130nf_Vvdw, 112
1161 .equiv nb130nf_p_tabscale, 120
1162 .equiv nb130nf_VFtab, 128
1163 .equiv nb130nf_invsqrta, 136
1164 .equiv nb130nf_dvda, 144
1165 .equiv nb130nf_p_gbtabscale, 152
1166 .equiv nb130nf_GBtab, 160
1167 .equiv nb130nf_p_nthreads, 168
1168 .equiv nb130nf_count, 176
1169 .equiv nb130nf_mtx, 184
1170 .equiv nb130nf_outeriter, 192
1171 .equiv nb130nf_inneriter, 200
1172 .equiv nb130nf_work, 208
1173 ;
# stack offsets for local variables
1174 ;
# bottom of stack is cache-aligned for sse use
1175 .equiv nb130nf_ix, 0
1176 .equiv nb130nf_iy, 16
1177 .equiv nb130nf_iz, 32
1178 .equiv nb130nf_iq, 48
1179 .equiv nb130nf_c6, 64
1180 .equiv nb130nf_c12, 80
1181 .equiv nb130nf_vctot, 96
1182 .equiv nb130nf_Vvdwtot, 112
1183 .equiv nb130nf_half, 128
1184 .equiv nb130nf_three, 144
1185 .equiv nb130nf_krf, 160
1186 .equiv nb130nf_crf, 176
1187 .equiv nb130nf_tsc, 192
1188 .equiv nb130nf_nri, 208
1189 .equiv nb130nf_iinr, 216
1190 .equiv nb130nf_jindex, 224
1191 .equiv nb130nf_jjnr, 232
1192 .equiv nb130nf_shift, 240
1193 .equiv nb130nf_shiftvec, 248
1194 .equiv nb130nf_facel, 256
1195 .equiv nb130nf_innerjjnr, 264
1196 .equiv nb130nf_is3, 272
1197 .equiv nb130nf_ii3, 280
1198 .equiv nb130nf_ntia, 284
1199 .equiv nb130nf_innerk, 288
1200 .equiv nb130nf_n, 292
1201 .equiv nb130nf_nn1, 296
1202 .equiv nb130nf_ntype, 300
1203 .equiv nb130nf_nouter, 304
1204 .equiv nb130nf_ninner, 308
1209 ;
# Push integer registers on stack
1218 ;
# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
1221 ;
# Save xmm registers to stack
1223 movaps
[rsp
+ 16 ], xmm7
1224 movaps
[rsp
+ 32 ], xmm8
1225 movaps
[rsp
+ 48 ], xmm9
1226 movaps
[rsp
+ 64 ], xmm10
1227 movaps
[rsp
+ 80 ], xmm11
1228 movaps
[rsp
+ 96 ], xmm12
1229 movaps
[rsp
+ 112], xmm13
1230 movaps
[rsp
+ 128], xmm14
1231 movaps
[rsp
+ 144], xmm15
1234 sub rsp
, 320 ;
# local variable stack space (n*16+8)
1235 ;
.if 0 # block below only read by NASM - special calling convention on win64
1236 %ifidn __OUTPUT_FORMAT__
, win64
1237 ;
# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
1239 ;
# Adjust stack pointer for different alignment
1240 ;
# Move around arguments to fit AMD64 convention below
1241 ;
# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
1242 ;
# win64 passes args in: rcx,rdx,r8,r9 + stack
1250 ;
.endif # end NASM- and win64-specific block
1252 ;
# zero 32-bit iteration counters
1254 mov
[rsp
+ nb130nf_nouter
], eax
1255 mov
[rsp
+ nb130nf_ninner
], eax
1258 mov
[rsp
+ nb130nf_nri
], edi
1259 mov
[rsp
+ nb130nf_iinr
], rsi
1260 mov
[rsp
+ nb130nf_jindex
], rdx
1261 mov
[rsp
+ nb130nf_jjnr
], rcx
1262 mov
[rsp
+ nb130nf_shift
], r8
1263 mov
[rsp
+ nb130nf_shiftvec
], r9
1264 mov rdi
, [rbp
+ nb130nf_p_ntype
]
1266 mov
[rsp
+ nb130nf_ntype
], edi
1267 mov rsi
, [rbp
+ nb130nf_p_facel
]
1269 movss
[rsp
+ nb130nf_facel
], xmm0
1271 mov rax
, [rbp
+ nb130nf_p_tabscale
]
1273 shufps xmm3
, xmm3
, 0
1274 movaps
[rsp
+ nb130nf_tsc
], xmm3
1276 ;
# create constant floating-point factors on stack
1277 mov eax
, 0x3f000000 ;
# half in IEEE (hex)
1278 mov
[rsp
+ nb130nf_half
], eax
1279 movss xmm1
, [rsp
+ nb130nf_half
]
1280 shufps xmm1
, xmm1
, 0 ;
# splat to all elements
1282 addps xmm2
, xmm2 ;
# one
1284 addps xmm2
, xmm2 ;
# two
1285 addps xmm3
, xmm2 ;
# three
1286 movaps
[rsp
+ nb130nf_half
], xmm1
1287 movaps
[rsp
+ nb130nf_three
], xmm3
1290 .nb130nf_threadloop:
1291 mov rsi
, [rbp
+ nb130nf_count
] ;
# pointer to sync counter
1294 mov ebx
, eax ;
# ebx=*count=nn0
1295 add ebx
, 1 ;
# ebx=nn1=nn0+10
1297 cmpxchg
[rsi
], ebx ;
# write nn1 to *counter,
1298 ;
# if it hasnt changed.
1299 ;
# or reread *counter to eax.
1300 pause ;
# -> better p4 performance
1301 jnz
.nb130nf_spinlock
1303 ;
# if(nn1>nri) nn1=nri
1304 mov ecx
, [rsp
+ nb130nf_nri
]
1307 cmovle ebx
, edx ;
# if(nn1>nri) nn1=nri
1308 ;
# Cleared the spinlock if we got here.
1309 ;
# eax contains nn0, ebx contains nn1.
1310 mov
[rsp
+ nb130nf_n
], eax
1311 mov
[rsp
+ nb130nf_nn1
], ebx
1312 sub ebx
, eax ;
# calc number of outer lists
1313 mov esi
, eax ;
# copy n to esi
1314 jg
.nb130nf_outerstart
1317 .nb130nf_outerstart:
1318 ;
# ebx contains number of outer iterations
1319 add ebx
, [rsp
+ nb130nf_nouter
]
1320 mov
[rsp
+ nb130nf_nouter
], ebx
1323 mov rax
, [rsp
+ nb130nf_shift
] ;
# eax = pointer into shift[]
1324 mov ebx
, [rax
+ rsi
*4] ;
# ebx=shift[n]
1326 lea rbx
, [rbx
+ rbx
*2] ;
# rbx=3*is
1327 mov
[rsp
+ nb130nf_is3
],ebx ;
# store is3
1329 mov rax
, [rsp
+ nb130nf_shiftvec
] ;
# eax = base of shiftvec[]
1331 movss xmm0
, [rax
+ rbx
*4]
1332 movss xmm1
, [rax
+ rbx
*4 + 4]
1333 movss xmm2
, [rax
+ rbx
*4 + 8]
1335 mov rcx
, [rsp
+ nb130nf_iinr
] ;
# ecx = pointer into iinr[]
1336 mov ebx
, [rcx
+ rsi
*4] ;
# ebx =ii
1338 mov rdx
, [rbp
+ nb130nf_charge
]
1339 movss xmm3
, [rdx
+ rbx
*4]
1340 mulss xmm3
, [rsp
+ nb130nf_facel
]
1341 shufps xmm3
, xmm3
, 0
1343 mov rdx
, [rbp
+ nb130nf_type
]
1344 mov edx
, [rdx
+ rbx
*4]
1345 imul edx
, [rsp
+ nb130nf_ntype
]
1347 mov
[rsp
+ nb130nf_ntia
], edx
1349 lea rbx
, [rbx
+ rbx
*2] ;
# rbx = 3*ii=ii3
1350 mov rax
, [rbp
+ nb130nf_pos
] ;
# eax = base of pos[]
1352 addss xmm0
, [rax
+ rbx
*4]
1353 addss xmm1
, [rax
+ rbx
*4 + 4]
1354 addss xmm2
, [rax
+ rbx
*4 + 8]
1356 movaps
[rsp
+ nb130nf_iq
], xmm3
1358 shufps xmm0
, xmm0
, 0
1359 shufps xmm1
, xmm1
, 0
1360 shufps xmm2
, xmm2
, 0
1362 movaps
[rsp
+ nb130nf_ix
], xmm0
1363 movaps
[rsp
+ nb130nf_iy
], xmm1
1364 movaps
[rsp
+ nb130nf_iz
], xmm2
1366 mov
[rsp
+ nb130nf_ii3
], ebx
1368 ;
# clear vctot and i forces
1370 movaps
[rsp
+ nb130nf_vctot
], xmm4
1371 movaps
[rsp
+ nb130nf_Vvdwtot
], xmm4
1373 mov rax
, [rsp
+ nb130nf_jindex
]
1374 mov ecx
, [rax
+ rsi
*4] ;
# jindex[n]
1375 mov edx
, [rax
+ rsi
*4 + 4] ;
# jindex[n+1]
1376 sub edx
, ecx ;
# number of innerloop atoms
1378 mov rsi
, [rbp
+ nb130nf_pos
]
1379 mov rax
, [rsp
+ nb130nf_jjnr
]
1382 mov
[rsp
+ nb130nf_innerjjnr
], rax ;
# pointer to jjnr[nj0]
1385 add ecx
, [rsp
+ nb130nf_ninner
]
1386 mov
[rsp
+ nb130nf_ninner
], ecx
1388 mov
[rsp
+ nb130nf_innerk
], edx ;
# number of innerloop atoms
1389 jge
.nb130nf_unroll_loop
1390 jmp
.nb130nf_finish_inner
1391 .nb130nf_unroll_loop:
1392 ;
# quad-unroll innerloop here
1393 mov rdx
, [rsp
+ nb130nf_innerjjnr
] ;
# pointer to jjnr[k]
1397 mov edx
, [rdx
+ 12] ;
# eax-edx=jnr1-4
1398 add qword ptr
[rsp
+ nb130nf_innerjjnr
], 16 ;
# advance pointer (unrolled 4)
1400 mov rsi
, [rbp
+ nb130nf_charge
] ;
# base of charge[]
1402 movss xmm3
, [rsi
+ rax
*4]
1403 movss xmm4
, [rsi
+ rcx
*4]
1404 movss xmm6
, [rsi
+ rbx
*4]
1405 movss xmm7
, [rsi
+ rdx
*4]
1407 movaps xmm2
, [rsp
+ nb130nf_iq
]
1408 shufps xmm3
, xmm6
, 0
1409 shufps xmm4
, xmm7
, 0
1410 shufps xmm3
, xmm4
, 136 ;
# constant 10001000 ;# all charges in xmm3
1411 movd mm0
, eax ;
# use mmx registers as temp storage
1416 mov rsi
, [rbp
+ nb130nf_type
]
1417 mov eax
, [rsi
+ rax
*4]
1418 mov ebx
, [rsi
+ rbx
*4]
1419 mov ecx
, [rsi
+ rcx
*4]
1420 mov edx
, [rsi
+ rdx
*4]
1421 mov rsi
, [rbp
+ nb130nf_vdwparam
]
1426 mov edi
, [rsp
+ nb130nf_ntia
]
1432 movlps xmm6
, [rsi
+ rax
*4]
1433 movlps xmm7
, [rsi
+ rcx
*4]
1434 movhps xmm6
, [rsi
+ rbx
*4]
1435 movhps xmm7
, [rsi
+ rdx
*4]
1438 shufps xmm4
, xmm7
, 136 ;
# constant 10001000
1439 shufps xmm6
, xmm7
, 221 ;
# constant 11011101
1446 movaps
[rsp
+ nb130nf_c6
], xmm4
1447 movaps
[rsp
+ nb130nf_c12
], xmm6
1449 mov rsi
, [rbp
+ nb130nf_pos
] ;
# base of pos[]
1451 lea rax
, [rax
+ rax
*2] ;
# replace jnr with j3
1452 lea rbx
, [rbx
+ rbx
*2]
1455 lea rcx
, [rcx
+ rcx
*2] ;
# replace jnr with j3
1456 lea rdx
, [rdx
+ rdx
*2]
1458 ;
# move four coordinates to xmm0-xmm2
1460 movlps xmm4
, [rsi
+ rax
*4]
1461 movlps xmm5
, [rsi
+ rcx
*4]
1462 movss xmm2
, [rsi
+ rax
*4 + 8]
1463 movss xmm6
, [rsi
+ rcx
*4 + 8]
1465 movhps xmm4
, [rsi
+ rbx
*4]
1466 movhps xmm5
, [rsi
+ rdx
*4]
1468 movss xmm0
, [rsi
+ rbx
*4 + 8]
1469 movss xmm1
, [rsi
+ rdx
*4 + 8]
1471 shufps xmm2
, xmm0
, 0
1472 shufps xmm6
, xmm1
, 0
1477 shufps xmm2
, xmm6
, 136 ;
# constant 10001000
1479 shufps xmm0
, xmm5
, 136 ;
# constant 10001000
1480 shufps xmm1
, xmm5
, 221 ;
# constant 11011101
1482 ;
# move ix-iz to xmm4-xmm6
1483 movaps xmm4
, [rsp
+ nb130nf_ix
]
1484 movaps xmm5
, [rsp
+ nb130nf_iy
]
1485 movaps xmm6
, [rsp
+ nb130nf_iz
]
1501 ;
# lookup seed in xmm5
1504 movaps xmm1
, [rsp
+ nb130nf_three
]
1505 mulps xmm5
, xmm4 ;
# rsq*lu*lu
1506 movaps xmm0
, [rsp
+ nb130nf_half
]
1507 subps xmm1
, xmm5 ;
# three-rsq*lu*lu
1509 mulps xmm0
, xmm1 ;
# xmm0=rinv
1512 addps xmm3
, [rsp
+ nb130nf_vctot
]
1513 movaps
[rsp
+ nb130nf_vctot
], xmm3
1516 mulps xmm4
, xmm1 ;
# r
1517 mulps xmm4
, [rsp
+ nb130nf_tsc
] ;
# rtab
1519 movaps xmm0
, xmm1 ;
# copy of rinv
1522 cvttps2pi mm7
, xmm5 ;
# mm6/mm7 contain lu indices
1527 movaps xmm1
, xmm4 ;
# xmm1=eps
1529 mulps xmm2
, xmm2 ;
# xmm2=eps2
1533 mov rsi
, [rbp
+ nb130nf_VFtab
]
1542 movlps xmm5
, [rsi
+ rax
*4]
1543 movlps xmm7
, [rsi
+ rcx
*4]
1544 movhps xmm5
, [rsi
+ rbx
*4]
1545 movhps xmm7
, [rsi
+ rdx
*4] ;
# got half dispersion table
1547 shufps xmm4
, xmm7
, 136 ;
# constant 10001000
1548 shufps xmm5
, xmm7
, 221 ;
# constant 11011101
1550 movlps xmm7
, [rsi
+ rax
*4 + 8]
1551 movlps xmm3
, [rsi
+ rcx
*4 + 8]
1552 movhps xmm7
, [rsi
+ rbx
*4 + 8]
1553 movhps xmm3
, [rsi
+ rdx
*4 + 8] ;
# other half of dispersion table
1555 shufps xmm6
, xmm3
, 136 ;
# constant 10001000
1556 shufps xmm7
, xmm3
, 221 ;
# constant 11011101
1557 ;
# dispersion table ready, in xmm4-xmm7
1559 mulps xmm6
, xmm1 ;
# xmm6=Geps
1560 mulps xmm7
, xmm2 ;
# xmm7=Heps2
1562 addps xmm5
, xmm7 ;
# xmm5=Fp
1563 mulps xmm5
, xmm1 ;
# xmm5=eps*Fp
1564 addps xmm5
, xmm4 ;
# xmm5=VV
1566 movaps xmm4
, [rsp
+ nb130nf_c6
]
1567 mulps xmm5
, xmm4 ;
# Vvdw6
1569 ;
# Update Vvdwtot directly
1570 addps xmm5
, [rsp
+ nb130nf_Vvdwtot
]
1571 movaps
[rsp
+ nb130nf_Vvdwtot
], xmm5
1574 movlps xmm5
, [rsi
+ rax
*4 + 16]
1575 movlps xmm7
, [rsi
+ rcx
*4 + 16]
1576 movhps xmm5
, [rsi
+ rbx
*4 + 16]
1577 movhps xmm7
, [rsi
+ rdx
*4 + 16] ;
# got half repulsion table
1579 shufps xmm4
, xmm7
, 136 ;
# constant 10001000
1580 shufps xmm5
, xmm7
, 221 ;
# constant 11011101
1582 movlps xmm7
, [rsi
+ rax
*4 + 24]
1583 movlps xmm3
, [rsi
+ rcx
*4 + 24]
1584 movhps xmm7
, [rsi
+ rbx
*4 + 24]
1585 movhps xmm3
, [rsi
+ rdx
*4 + 24] ;
# other half of repulsion table
1587 shufps xmm6
, xmm3
, 136 ;
# constant 10001000
1588 shufps xmm7
, xmm3
, 221 ;
# constant 11011101
1589 ;
# table ready, in xmm4-xmm7
1590 mulps xmm6
, xmm1 ;
# xmm6=Geps
1591 mulps xmm7
, xmm2 ;
# xmm7=Heps2
1593 addps xmm5
, xmm7 ;
# xmm5=Fp
1594 mulps xmm5
, xmm1 ;
# xmm5=eps*Fp
1595 addps xmm5
, xmm4 ;
# xmm5=VV
1597 movaps xmm4
, [rsp
+ nb130nf_c12
]
1598 mulps xmm5
, xmm4 ;
# Vvdw12
1600 addps xmm5
, [rsp
+ nb130nf_Vvdwtot
]
1601 movaps
[rsp
+ nb130nf_Vvdwtot
], xmm5
1603 ;
# should we do one more iteration?
1604 sub dword ptr
[rsp
+ nb130nf_innerk
], 4
1605 jl
.nb130nf_finish_inner
1606 jmp
.nb130nf_unroll_loop
1607 .nb130nf_finish_inner:
1608 ;
# check if at least two particles remain
1609 add dword ptr
[rsp
+ nb130nf_innerk
], 4
1610 mov edx
, [rsp
+ nb130nf_innerk
]
1613 jmp
.nb130nf_checksingle
1615 mov rsi
, [rbp
+ nb130nf_charge
]
1617 mov rcx
, [rsp
+ nb130nf_innerjjnr
]
1621 add qword ptr
[rsp
+ nb130nf_innerjjnr
], 8
1624 movss xmm3
, [rsi
+ rax
*4]
1625 movss xmm6
, [rsi
+ rbx
*4]
1626 shufps xmm3
, xmm6
, 12 ;
# constant 00001100
1627 shufps xmm3
, xmm3
, 88 ;
# constant 01011000 ;# xmm3(0,1) has the charges
1629 mov rsi
, [rbp
+ nb130nf_type
]
1632 mov ecx
, [rsi
+ rcx
*4]
1633 mov edx
, [rsi
+ rdx
*4]
1634 mov rsi
, [rbp
+ nb130nf_vdwparam
]
1637 mov edi
, [rsp
+ nb130nf_ntia
]
1640 movlps xmm6
, [rsi
+ rcx
*4]
1641 movhps xmm6
, [rsi
+ rdx
*4]
1642 mov rdi
, [rbp
+ nb130nf_pos
]
1645 shufps xmm4
, xmm4
, 8 ;
# constant 00001000
1646 shufps xmm6
, xmm6
, 13 ;
# constant 00001101
1650 movaps
[rsp
+ nb130nf_c6
], xmm4
1651 movaps
[rsp
+ nb130nf_c12
], xmm6
1653 lea rax
, [rax
+ rax
*2]
1654 lea rbx
, [rbx
+ rbx
*2]
1655 ;
# move coordinates to xmm0-xmm2
1656 movlps xmm1
, [rdi
+ rax
*4]
1657 movss xmm2
, [rdi
+ rax
*4 + 8]
1658 movhps xmm1
, [rdi
+ rbx
*4]
1659 movss xmm0
, [rdi
+ rbx
*4 + 8]
1661 mulps xmm3
, [rsp
+ nb130nf_iq
]
1665 shufps xmm2
, xmm0
, 0
1669 shufps xmm2
, xmm2
, 136 ;
# constant 10001000
1671 shufps xmm0
, xmm0
, 136 ;
# constant 10001000
1672 shufps xmm1
, xmm1
, 221 ;
# constant 11011101
1674 ;
# move ix-iz to xmm4-xmm6
1677 movaps xmm4
, [rsp
+ nb130nf_ix
]
1678 movaps xmm5
, [rsp
+ nb130nf_iy
]
1679 movaps xmm6
, [rsp
+ nb130nf_iz
]
1695 ;
# lookup seed in xmm5
1698 movaps xmm1
, [rsp
+ nb130nf_three
]
1699 mulps xmm5
, xmm4 ;
# rsq*lu*lu
1700 movaps xmm0
, [rsp
+ nb130nf_half
]
1701 subps xmm1
, xmm5 ;
# three-rsq*lu*lu
1703 mulps xmm0
, xmm1 ;
# xmm0=rinv
1706 addps xmm3
, [rsp
+ nb130nf_vctot
]
1707 movaps
[rsp
+ nb130nf_vctot
], xmm3
1710 mulps xmm4
, xmm1 ;
# r
1711 mulps xmm4
, [rsp
+ nb130nf_tsc
] ;
# rtab
1713 movaps xmm0
, xmm1 ;
# copy of rinv
1717 movaps xmm1
, xmm4 ;
# xmm1=eps
1719 mulps xmm2
, xmm2 ;
# xmm2=eps2
1722 mov rsi
, [rbp
+ nb130nf_VFtab
]
1728 movlps xmm5
, [rsi
+ rax
*4]
1729 movhps xmm5
, [rsi
+ rbx
*4]
1731 shufps xmm4
, xmm7
, 136 ;
# constant 10001000
1732 shufps xmm5
, xmm7
, 221 ;
# constant 11011101
1734 movlps xmm7
, [rsi
+ rax
*4 + 8]
1735 movhps xmm7
, [rsi
+ rbx
*4 + 8]
1737 shufps xmm6
, xmm3
, 136 ;
# constant 10001000
1738 shufps xmm7
, xmm3
, 221 ;
# constant 11011101
1739 ;
# dispersion table ready, in xmm4-xmm7
1741 mulps xmm6
, xmm1 ;
# xmm6=Geps
1742 mulps xmm7
, xmm2 ;
# xmm7=Heps2
1744 addps xmm5
, xmm7 ;
# xmm5=Fp
1745 mulps xmm5
, xmm1 ;
# xmm5=eps*Fp
1746 addps xmm5
, xmm4 ;
# xmm5=VV
1748 movaps xmm4
, [rsp
+ nb130nf_c6
]
1749 mulps xmm5
, xmm4 ;
# Vvdw6
1751 ;
# Update Vvdwtot directly
1752 addps xmm5
, [rsp
+ nb130nf_Vvdwtot
]
1753 movaps
[rsp
+ nb130nf_Vvdwtot
], xmm5
1756 movlps xmm5
, [rsi
+ rax
*4 + 16]
1757 movhps xmm5
, [rsi
+ rbx
*4 + 16]
1759 shufps xmm4
, xmm7
, 136 ;
# constant 10001000
1760 shufps xmm5
, xmm7
, 221 ;
# constant 11011101
1762 movlps xmm7
, [rsi
+ rax
*4 + 24]
1763 movhps xmm7
, [rsi
+ rbx
*4 + 24]
1765 shufps xmm6
, xmm3
, 136 ;
# constant 10001000
1766 shufps xmm7
, xmm3
, 221 ;
# constant 11011101
1767 ;
# table ready, in xmm4-xmm7
1768 mulps xmm6
, xmm1 ;
# xmm6=Geps
1769 mulps xmm7
, xmm2 ;
# xmm7=Heps2
1771 addps xmm5
, xmm7 ;
# xmm5=Fp
1772 mulps xmm5
, xmm1 ;
# xmm5=eps*Fp
1773 addps xmm5
, xmm4 ;
# xmm5=VV
1775 movaps xmm4
, [rsp
+ nb130nf_c12
]
1776 mulps xmm5
, xmm4 ;
# Vvdw12
1778 addps xmm5
, [rsp
+ nb130nf_Vvdwtot
]
1779 movaps
[rsp
+ nb130nf_Vvdwtot
], xmm5
1781 .nb130nf_checksingle:
1782 mov edx
, [rsp
+ nb130nf_innerk
]
1784 jnz
.nb130nf_dosingle
1785 jmp
.nb130nf_updateouterdata
1787 mov rsi
, [rbp
+ nb130nf_charge
]
1788 mov rdi
, [rbp
+ nb130nf_pos
]
1789 mov rcx
, [rsp
+ nb130nf_innerjjnr
]
1792 movss xmm3
, [rsi
+ rax
*4] ;
# xmm3(0) has the charge
1794 mov rsi
, [rbp
+ nb130nf_type
]
1796 mov ecx
, [rsi
+ rcx
*4]
1797 mov rsi
, [rbp
+ nb130nf_vdwparam
]
1799 add ecx
, [rsp
+ nb130nf_ntia
]
1801 movlps xmm6
, [rsi
+ rcx
*4]
1803 shufps xmm4
, xmm4
, 252 ;
# constant 11111100
1804 shufps xmm6
, xmm6
, 253 ;
# constant 11111101
1806 movaps
[rsp
+ nb130nf_c6
], xmm4
1807 movaps
[rsp
+ nb130nf_c12
], xmm6
1809 lea rax
, [rax
+ rax
*2]
1811 ;
# move coordinates to xmm0-xmm2
1812 movss xmm0
, [rdi
+ rax
*4]
1813 movss xmm1
, [rdi
+ rax
*4 + 4]
1814 movss xmm2
, [rdi
+ rax
*4 + 8]
1816 mulps xmm3
, [rsp
+ nb130nf_iq
]
1820 movaps xmm4
, [rsp
+ nb130nf_ix
]
1821 movaps xmm5
, [rsp
+ nb130nf_iy
]
1822 movaps xmm6
, [rsp
+ nb130nf_iz
]
1838 ;
# lookup seed in xmm5
1841 movss xmm1
, [rsp
+ nb130nf_three
]
1842 mulss xmm5
, xmm4 ;
# rsq*lu*lu
1843 movss xmm0
, [rsp
+ nb130nf_half
]
1844 subss xmm1
, xmm5 ;
# three-rsq*lu*lu
1846 mulss xmm0
, xmm1 ;
# xmm0=rinv
1849 addss xmm3
, [rsp
+ nb130nf_vctot
]
1850 movss
[rsp
+ nb130nf_vctot
], xmm3
1853 mulss xmm4
, xmm1 ;
# r
1854 mulss xmm4
, [rsp
+ nb130nf_tsc
] ;
# rtab
1856 movaps xmm0
, xmm1 ;
# copy of rinv
1860 movss xmm1
, xmm4 ;
# xmm1=eps
1862 mulss xmm2
, xmm2 ;
# xmm2=eps2
1867 mov rsi
, [rbp
+ nb130nf_VFtab
]
1871 movlps xmm5
, [rsi
+ rax
*4]
1873 shufps xmm4
, xmm7
, 136 ;
# constant 10001000
1874 shufps xmm5
, xmm7
, 221 ;
# constant 11011101
1876 movlps xmm7
, [rsi
+ rax
*4 + 8]
1878 shufps xmm6
, xmm3
, 136 ;
# constant 10001000
1879 shufps xmm7
, xmm3
, 221 ;
# constant 11011101
1880 ;
# dispersion table ready, in xmm4-xmm7
1882 mulss xmm6
, xmm1 ;
# xmm6=Geps
1883 mulss xmm7
, xmm2 ;
# xmm7=Heps2
1885 addss xmm5
, xmm7 ;
# xmm5=Fp
1886 mulss xmm5
, xmm1 ;
# xmm5=eps*Fp
1887 addss xmm5
, xmm4 ;
# xmm5=VV
1889 movss xmm4
, [rsp
+ nb130nf_c6
]
1890 mulss xmm5
, xmm4 ;
# Vvdw6
1892 ;
# Update Vvdwtot directly
1893 addss xmm5
, [rsp
+ nb130nf_Vvdwtot
]
1894 movss
[rsp
+ nb130nf_Vvdwtot
], xmm5
1897 movlps xmm5
, [rsi
+ rax
*4 + 16]
1899 shufps xmm4
, xmm7
, 136 ;
# constant 10001000
1900 shufps xmm5
, xmm7
, 221 ;
# constant 11011101
1902 movlps xmm7
, [rsi
+ rax
*4 + 24]
1904 shufps xmm6
, xmm3
, 136 ;
# constant 10001000
1905 shufps xmm7
, xmm3
, 221 ;
# constant 11011101
1906 ;
# table ready, in xmm4-xmm7
1907 mulss xmm6
, xmm1 ;
# xmm6=Geps
1908 mulss xmm7
, xmm2 ;
# xmm7=Heps2
1910 addss xmm5
, xmm7 ;
# xmm5=Fp
1911 mulss xmm5
, xmm1 ;
# xmm5=eps*Fp
1912 addss xmm5
, xmm4 ;
# xmm5=VV
1914 movss xmm4
, [rsp
+ nb130nf_c12
]
1915 mulss xmm5
, xmm4 ;
# Vvdw12
1917 addss xmm5
, [rsp
+ nb130nf_Vvdwtot
]
1918 movss
[rsp
+ nb130nf_Vvdwtot
], xmm5
1921 .nb130nf_updateouterdata:
1924 mov esi
, [rsp
+ nb130nf_n
]
1925 ;
# get group index for i particle
1926 mov rdx
, [rbp
+ nb130nf_gid
] ;
# base of gid[]
1927 mov edx
, [rdx
+ rsi
*4] ;
# ggid=gid[n]
1929 ;
# accumulate total potential energy and update it
1930 movaps xmm7
, [rsp
+ nb130nf_vctot
]
1933 addps xmm7
, xmm6 ;
# pos 0-1 in xmm7 have the sum now
1935 shufps xmm6
, xmm6
, 1
1938 ;
# add earlier value from mem
1939 mov rax
, [rbp
+ nb130nf_Vc
]
1940 addss xmm7
, [rax
+ rdx
*4]
1942 movss
[rax
+ rdx
*4], xmm7
1944 ;
# accumulate total lj energy and update it
1945 movaps xmm7
, [rsp
+ nb130nf_Vvdwtot
]
1948 addps xmm7
, xmm6 ;
# pos 0-1 in xmm7 have the sum now
1950 shufps xmm6
, xmm6
, 1
1953 ;
# add earlier value from mem
1954 mov rax
, [rbp
+ nb130nf_Vvdw
]
1955 addss xmm7
, [rax
+ rdx
*4]
1957 movss
[rax
+ rdx
*4], xmm7
1960 mov ecx
, [rsp
+ nb130nf_nn1
]
1961 ;
# esi already loaded with n
1964 jz
.nb130nf_outerend
1966 ;
# not last, iterate outer loop once more!
1967 mov
[rsp
+ nb130nf_n
], esi
1970 ;
# check if more outer neighborlists remain
1971 mov ecx
, [rsp
+ nb130nf_nri
]
1972 ;
# esi already loaded with n above
1975 ;
# non-zero, do one more workunit
1976 jmp
.nb130nf_threadloop
1979 mov eax
, [rsp
+ nb130nf_nouter
]
1980 mov ebx
, [rsp
+ nb130nf_ninner
]
1981 mov rcx
, [rbp
+ nb130nf_outeriter
]
1982 mov rdx
, [rbp
+ nb130nf_inneriter
]
1989 ;
# Restore xmm registers from stack
1991 movaps xmm7
, [rsp
+ 16 ]
1992 movaps xmm8
, [rsp
+ 32 ]
1993 movaps xmm9
, [rsp
+ 48 ]
1994 movaps xmm10
, [rsp
+ 64 ]
1995 movaps xmm11
, [rsp
+ 80 ]
1996 movaps xmm12
, [rsp
+ 96 ]
1997 movaps xmm13
, [rsp
+ 112]
1998 movaps xmm14
, [rsp
+ 128]
1999 movaps xmm15
, [rsp
+ 144]
2001 ;
# Reset pointers after restoring xmm6-15