3 ;
# Gromacs 4.0 Copyright (c) 1991-2003
4 ;
# David van der Spoel, Erik Lindahl
6 ;
# This program is free software; you can redistribute it and/or
7 ;
# modify it under the terms of the GNU General Public License
8 ;
# as published by the Free Software Foundation; either version 2
9 ;
# of the License, or (at your option) any later version.
11 ;
# To help us fund GROMACS development, we humbly ask that you cite
12 ;
# the research papers on the package. Check out http://www.gromacs.org
15 ;
# Gnomes, ROck Monsters And Chili Sauce
18 ;
# These files require GNU binutils 2.10 or later, since we
19 ;
# use intel syntax for portability, or a recent version
20 ;
# of NASM that understands Extended 3DNow and SSE2 instructions.
21 ;
# (NASM is normally only used with MS Visual C++).
22 ;
# Since NASM and gnu as disagree on some definitions and use
23 ;
# completely different preprocessing options I have to introduce a
24 ;
# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
25 ;
# Gnu as treats ';' as a line break, i.e. ignores it. This is the
26 ;
# reason why all comments need both symbols...
27 ;
# The source is written for GNU as, with intel syntax. When you use
28 ;
# NASM we redefine a couple of things. The false if-statement around
29 ;
# the following code is seen by GNU as, but NASM doesn't see it, so
30 ;
# the code inside is read by NASM but not gcc.
32 ;
.if 0 # block below only read by NASM
;# NASM-only: alias the gas spelling '.section' to NASM's 'section' keyword.
33 %define
.section section
37 ;
# NASM only wants 'dword', not 'dword ptr'.
42 ;
.endif # End of NASM-specific block
43 ;
.intel_syntax noprefix # Line only read by gnu as
;# Export the kernel entry point under two names, with and without a
;# leading underscore; both labels that follow mark the same address.
;# NOTE(review): presumably the underscore variant serves object formats
;# whose C symbols carry that prefix - confirm against the build setup.
49 .globl nb_kernel430_x86_64_sse
50 .globl _nb_kernel430_x86_64_sse
51 nb_kernel430_x86_64_sse
:
52 _nb_kernel430_x86_64_sse
:
53 ;
# Room for return address and rbp (16 bytes)
54 .equiv nb430_fshift, 16
57 .equiv nb430_faction, 40
58 .equiv nb430_charge, 48
59 .equiv nb430_p_facel, 56
60 .equiv nb430_argkrf, 64
61 .equiv nb430_argcrf, 72
64 .equiv nb430_p_ntype, 96
65 .equiv nb430_vdwparam, 104
66 .equiv nb430_Vvdw, 112
67 .equiv nb430_p_tabscale, 120
68 .equiv nb430_VFtab, 128
69 .equiv nb430_invsqrta, 136
70 .equiv nb430_dvda, 144
71 .equiv nb430_p_gbtabscale, 152
72 .equiv nb430_GBtab, 160
73 .equiv nb430_p_nthreads, 168
74 .equiv nb430_count, 176
76 .equiv nb430_outeriter, 192
77 .equiv nb430_inneriter, 200
78 .equiv nb430_work, 208
79 ;
# stack offsets for local variables
80 ;
# bottom of stack is cache-aligned for sse use
89 .equiv nb430_gbtsc, 128
94 .equiv nb430_epsgb, 208
95 .equiv nb430_vctot, 224
96 .equiv nb430_Vvdwtot, 240
100 .equiv nb430_half, 304
101 .equiv nb430_three, 320
103 .equiv nb430_isai, 352
104 .equiv nb430_isaprod, 368
105 .equiv nb430_dvdasum, 384
106 .equiv nb430_gbscale, 400
107 .equiv nb430_rinv, 416
108 .equiv nb430_nri, 432
109 .equiv nb430_iinr, 440
110 .equiv nb430_jindex, 448
111 .equiv nb430_jjnr, 456
112 .equiv nb430_shift, 464
113 .equiv nb430_shiftvec, 472
114 .equiv nb430_facel, 480
115 .equiv nb430_innerjjnr, 488
117 .equiv nb430_is3, 500
118 .equiv nb430_ii3, 504
119 .equiv nb430_ntia, 508
120 .equiv nb430_innerk, 512
122 .equiv nb430_nn1, 520
123 .equiv nb430_ntype, 524
124 .equiv nb430_nouter, 528
125 .equiv nb430_ninner, 532
139 sub rsp
, 552 ;
# local variable stack space (n*16+8)
141 ;
# zero 32-bit iteration counters
143 mov
[rsp
+ nb430_nouter
], eax
144 mov
[rsp
+ nb430_ninner
], eax
149 mov
[rsp
+ nb430_nri
], edi
150 mov
[rsp
+ nb430_iinr
], rsi
151 mov
[rsp
+ nb430_jindex
], rdx
152 mov
[rsp
+ nb430_jjnr
], rcx
153 mov
[rsp
+ nb430_shift
], r8
154 mov
[rsp
+ nb430_shiftvec
], r9
155 mov rdi
, [rbp
+ nb430_p_ntype
]
157 mov
[rsp
+ nb430_ntype
], edi
158 mov rsi
, [rbp
+ nb430_p_facel
]
160 movss
[rsp
+ nb430_facel
], xmm0
162 mov rax
, [rbp
+ nb430_p_tabscale
]
165 movaps
[rsp
+ nb430_tsc
], xmm3
167 mov rbx
, [rbp
+ nb430_p_gbtabscale
]
170 movaps
[rsp
+ nb430_gbtsc
], xmm4
173 ;
# create constant floating-point factors on stack
174 mov eax
, 0x3f000000 ;
# half in IEEE (hex)
175 mov
[rsp
+ nb430_half
], eax
176 movss xmm1
, [rsp
+ nb430_half
]
177 shufps xmm1
, xmm1
, 0 ;
# splat to all elements
179 addps xmm2
, xmm2 ;
# one
181 addps xmm2
, xmm2 ;
# two
182 addps xmm3
, xmm2 ;
# three
183 movaps
[rsp
+ nb430_half
], xmm1
184 movaps
[rsp
+ nb430_three
], xmm3
187 mov rsi
, [rbp
+ nb430_count
] ;
# pointer to sync counter
190 mov ebx
, eax ;
# ebx=*count=nn0
191 add ebx
, 1 ;
# ebx=nn1=nn0+1
193 cmpxchg
[esi
], ebx ;
# write nn1 to *counter,
194 ;
# if it has not changed.
195 ;
# or reread *counter to eax.
196 pause ;
# -> better p4 performance
199 ;
# if(nn1>nri) nn1=nri
200 mov ecx
, [rsp
+ nb430_nri
]
203 cmovle ebx
, edx ;
# if(nn1>nri) nn1=nri
204 ;
# Cleared the spinlock if we got here.
205 ;
# eax contains nn0, ebx contains nn1.
206 mov
[rsp
+ nb430_n
], eax
207 mov
[rsp
+ nb430_nn1
], ebx
208 sub ebx
, eax ;
# calc number of outer lists
209 mov esi
, eax ;
# copy n to esi
214 ;
# ebx contains number of outer iterations
215 add ebx
, [rsp
+ nb430_nouter
]
216 mov
[rsp
+ nb430_nouter
], ebx
219 mov rax
, [rsp
+ nb430_shift
] ;
# rax = pointer into shift[]
220 mov ebx
, [rax
+ rsi
*4] ;
# ebx=shift[n]
222 lea rbx
, [rbx
+ rbx
*2] ;
# rbx=3*is
223 mov
[rsp
+ nb430_is3
],ebx ;
# store is3
225 mov rax
, [rsp
+ nb430_shiftvec
] ;
# rax = base of shiftvec[]
227 movss xmm0
, [rax
+ rbx
*4]
228 movss xmm1
, [rax
+ rbx
*4 + 4]
229 movss xmm2
, [rax
+ rbx
*4 + 8]
231 mov rcx
, [rsp
+ nb430_iinr
] ;
# rcx = pointer into iinr[]
232 mov ebx
, [rcx
+ rsi
*4] ;
# ebx =ii
233 mov
[rsp
+ nb430_ii
], ebx
235 mov rdx
, [rbp
+ nb430_charge
]
236 movss xmm3
, [rdx
+ rbx
*4]
237 mulss xmm3
, [rsp
+ nb430_facel
]
240 mov rdx
, [rbp
+ nb430_invsqrta
] ;
# load invsqrta[ii]
241 movss xmm4
, [rdx
+ rbx
*4]
244 mov rdx
, [rbp
+ nb430_type
]
245 mov edx
, [rdx
+ rbx
*4]
246 imul edx
, [rsp
+ nb430_ntype
]
248 mov
[rsp
+ nb430_ntia
], edx
250 lea rbx
, [rbx
+ rbx
*2] ;
# rbx = 3*ii=ii3
251 mov rax
, [rbp
+ nb430_pos
] ;
# rax = base of pos[]
253 addss xmm0
, [rax
+ rbx
*4]
254 addss xmm1
, [rax
+ rbx
*4 + 4]
255 addss xmm2
, [rax
+ rbx
*4 + 8]
257 movaps
[rsp
+ nb430_iq
], xmm3
258 movaps
[rsp
+ nb430_isai
], xmm4
264 movaps
[rsp
+ nb430_ix
], xmm0
265 movaps
[rsp
+ nb430_iy
], xmm1
266 movaps
[rsp
+ nb430_iz
], xmm2
268 mov
[rsp
+ nb430_ii3
], ebx
270 ;
# clear vctot and i forces
272 movaps
[rsp
+ nb430_vctot
], xmm4
273 movaps
[rsp
+ nb430_Vvdwtot
], xmm4
274 movaps
[rsp
+ nb430_dvdasum
], xmm4
275 movaps
[rsp
+ nb430_fix
], xmm4
276 movaps
[rsp
+ nb430_fiy
], xmm4
277 movaps
[rsp
+ nb430_fiz
], xmm4
279 mov rax
, [rsp
+ nb430_jindex
]
280 mov ecx
, [rax
+ rsi
*4] ;
# jindex[n]
281 mov edx
, [rax
+ rsi
*4 + 4] ;
# jindex[n+1]
282 sub edx
, ecx ;
# number of innerloop atoms
284 mov rsi
, [rbp
+ nb430_pos
]
285 mov rdi
, [rbp
+ nb430_faction
]
286 mov rax
, [rsp
+ nb430_jjnr
]
289 mov
[rsp
+ nb430_innerjjnr
], rax ;
# pointer to jjnr[nj0]
292 add ecx
, [rsp
+ nb430_ninner
]
293 mov
[rsp
+ nb430_ninner
], ecx
295 mov
[rsp
+ nb430_innerk
], edx ;
# number of innerloop atoms
297 jge
.nb430_unroll_loop
298 jmp
.nb430_finish_inner
300 ;
# quad-unroll innerloop here
301 mov rdx
, [rsp
+ nb430_innerjjnr
] ;
# pointer to jjnr[k]
305 mov edx
, [rdx
+ 12] ;
# eax-edx=jnr1-4
307 add qword ptr
[rsp
+ nb430_innerjjnr
], 16 ;
# advance pointer (unrolled 4)
310 mov rsi
, [rbp
+ nb430_invsqrta
]
311 movss xmm3
, [rsi
+ rax
*4]
312 movss xmm4
, [rsi
+ rcx
*4]
313 movss xmm6
, [rsi
+ rbx
*4]
314 movss xmm7
, [rsi
+ rdx
*4]
315 movaps xmm2
, [rsp
+ nb430_isai
]
318 shufps xmm3
, xmm4
, 136 ;
# 10001000 ;# all isaj in xmm3
321 movaps
[rsp
+ nb430_isaprod
], xmm2
323 mulps xmm1
, [rsp
+ nb430_gbtsc
]
324 movaps
[rsp
+ nb430_gbscale
], xmm1
326 mov rsi
, [rbp
+ nb430_charge
] ;
# base of charge[]
328 movss xmm3
, [rsi
+ rax
*4]
329 movss xmm4
, [rsi
+ rcx
*4]
330 movss xmm6
, [rsi
+ rbx
*4]
331 movss xmm7
, [rsi
+ rdx
*4]
333 mulps xmm2
, [rsp
+ nb430_iq
]
336 shufps xmm3
, xmm4
, 136 ;
# 10001000 ;# all charges in xmm3
338 movaps
[rsp
+ nb430_qq
], xmm3
341 mov rsi
, [rbp
+ nb430_type
]
342 mov r12d
, [rsi
+ rax
*4]
343 mov r13d
, [rsi
+ rbx
*4]
344 mov r14d
, [rsi
+ rcx
*4]
345 mov r15d
, [rsi
+ rdx
*4]
350 mov edi
, [rsp
+ nb430_ntia
]
356 mov rsi
, [rbp
+ nb430_vdwparam
]
357 movlps xmm3
, [rsi
+ r12*4]
358 movlps xmm7
, [rsi
+ r14*4]
359 movhps xmm3
, [rsi
+ r13*4]
360 movhps xmm7
, [rsi
+ r15*4]
363 shufps xmm0
, xmm7
, 136 ;
# 10001000
364 shufps xmm3
, xmm7
, 221 ;
# 11011101
366 movaps
[rsp
+ nb430_c6
], xmm0
367 movaps
[rsp
+ nb430_c12
], xmm3
369 mov rsi
, [rbp
+ nb430_pos
] ;
# base of pos[]
371 lea
r8, [rax
+ rax
*2] ;
# jnr
372 lea
r9, [rbx
+ rbx
*2]
373 lea
r10, [rcx
+ rcx
*2]
374 lea
r11, [rdx
+ rdx
*2]
376 ;
# move four coordinates to xmm0-xmm2
377 movlps xmm4
, [rsi
+ r8*4]
378 movlps xmm5
, [rsi
+ r10*4]
379 movss xmm2
, [rsi
+ r8*4 + 8]
380 movss xmm6
, [rsi
+ r10*4 + 8]
382 movhps xmm4
, [rsi
+ r9*4]
383 movhps xmm5
, [rsi
+ r11*4]
385 movss xmm0
, [rsi
+ r9*4 + 8]
386 movss xmm1
, [rsi
+ r11*4 + 8]
394 shufps xmm2
, xmm6
, 136 ;
# 10001000
396 shufps xmm0
, xmm5
, 136 ;
# 10001000
397 shufps xmm1
, xmm5
, 221 ;
# 11011101
400 subps xmm0
, [rsp
+ nb430_ix
]
401 subps xmm1
, [rsp
+ nb430_iy
]
402 subps xmm2
, [rsp
+ nb430_iz
]
405 movaps
[rsp
+ nb430_dx
], xmm0
406 movaps
[rsp
+ nb430_dy
], xmm1
407 movaps
[rsp
+ nb430_dz
], xmm2
409 movd mm0
, r8 ;
# store j3
424 ;
# lookup seed in xmm5
427 movaps xmm1
, [rsp
+ nb430_three
]
428 mulps xmm5
, xmm4 ;
# rsq*lu*lu
429 movaps xmm0
, [rsp
+ nb430_half
]
430 subps xmm1
, xmm5 ;
# 3.0-rsq*lu*lu
432 mulps xmm0
, xmm1 ;
# xmm0=rinv
433 mulps xmm4
, xmm0 ;
# xmm4=r
434 movaps
[rsp
+ nb430_r
], xmm4
435 movaps
[rsp
+ nb430_rinv
], xmm0
437 movaps xmm8
, xmm4 ;
# r
438 mulps xmm4
, [rsp
+ nb430_gbscale
] ;
# rgbtab
439 mulps xmm8
, [rsp
+ nb430_tsc
] ;
# rtab
441 ;
# truncate and convert to integers
442 cvttps2dq xmm5
, xmm4 ;
# gb
443 cvttps2dq xmm9
, xmm8 ;
# lj
445 ;
# convert back to float
446 cvtdq2ps xmm6
, xmm5 ;
# gb
447 cvtdq2ps xmm10
, xmm9 ;
# lj
449 ;
# multiply by 4 and 8, respectively
453 ;
# move to integer registers
454 movhlps xmm7
, xmm5 ;
# gb
455 movhlps xmm11
, xmm9 ;
# lj
457 movd r12d
, xmm9 ;
# lj
458 movd r10d
, xmm7 ;
# gb
459 movd r14d
, xmm11 ;
# lj
460 pshufd xmm5
, xmm5
, 1 ;
# gb
461 pshufd xmm9
, xmm9
, 1 ;
# lj
462 pshufd xmm7
, xmm7
, 1 ;
# gb
463 pshufd xmm11
, xmm11
, 1 ;
# lj
465 movd r13d
, xmm9 ;
# lj
466 movd r11d
, xmm7 ;
# gb
467 movd r15d
, xmm11 ;
# lj
468 ;
# GB indices: r8-r11 LJ indices: r12-r15
471 subps xmm4
, xmm6 ;
# gb
472 subps xmm8
, xmm10 ;
# lj
473 movaps
[rsp
+ nb430_epsgb
], xmm4 ;
# gb eps
474 movaps
[rsp
+ nb430_eps
], xmm8 ;
# lj eps
476 mov rsi
, [rbp
+ nb430_GBtab
]
477 mov rdi
, [rbp
+ nb430_VFtab
]
479 ;
# load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11
480 movlps xmm1
, [rsi
+ r8*4] ;
# Y1c F1c
481 movlps xmm5
, [rdi
+ r12*4] ;
# Y1d F1d
482 movlps xmm9
, [rdi
+ r12*4 + 16] ;
# Y1r F1r
484 movlps xmm3
, [rsi
+ r10*4] ;
# Y3c F3c
485 movlps xmm7
, [rdi
+ r14*4] ;
# Y3d F3d
486 movlps xmm11
, [rdi
+ r14*4 + 16] ;
# Y3r F3r
488 movhps xmm1
, [rsi
+ r9*4] ;
# Y1c F1c Y2c F2c
489 movhps xmm5
, [rdi
+ r13*4] ;
# Y1d F1d Y2d F2d
490 movhps xmm9
, [rdi
+ r13*4 + 16] ;
# Y1r F1r Y2r F2r
492 movhps xmm3
, [rsi
+ r11*4] ;
# Y3c F3c Y4c F4c
493 movhps xmm7
, [rdi
+ r15*4] ;
# Y3d F3d Y4d F4d
494 movhps xmm11
, [rdi
+ r15*4 + 16] ;
# Y3r F3r Y4r F4r
499 shufps xmm0
, xmm3
, 136 ;
# 10001000 => Y1c Y2c Y3c Y4c
500 shufps xmm4
, xmm7
, 136 ;
# 10001000 => Y1d Y2d Y3d Y4d
501 shufps xmm8
, xmm11
, 136 ;
# 10001000 => Y1r Y2r Y3r Y4r
502 shufps xmm1
, xmm3
, 221 ;
# 11011101 => F1c F2c F3c F4c
503 shufps xmm5
, xmm7
, 221 ;
# 11011101 => F1d F2d F3d F4d
504 shufps xmm9
, xmm11
, 221 ;
# 11011101 => F1r F2r F3r F4r
506 movlps xmm3
, [rsi
+ r8*4 + 8] ;
# G1c H1c
507 movlps xmm7
, [rdi
+ r12*4 + 8] ;
# G1d H1d
508 movlps xmm11
, [rdi
+ r12*4 + 24] ;
# G1r H1r
510 movlps xmm12
, [rsi
+ r10*4 + 8] ;
# G3c H3c
511 movlps xmm13
, [rdi
+ r14*4 + 8] ;
# G3d H3d
512 movlps xmm14
, [rdi
+ r14*4 + 24] ;
# G3r H3r
514 movhps xmm3
, [rsi
+ r9*4 + 8] ;
# G1c H1c G2c H2c
515 movhps xmm7
, [rdi
+ r13*4 + 8] ;
# G1d H1d G2d H2d
516 movhps xmm11
, [rdi
+ r13*4 + 24] ;
# G1r H1r G2r H2r
518 movhps xmm12
, [rsi
+ r11*4 + 8] ;
# G3c H3c G4c H4c
519 movhps xmm13
, [rdi
+ r15*4 + 8] ;
# G3d H3d G4d H4d
520 movhps xmm14
, [rdi
+ r15*4 + 24] ;
# G3r H3r G4r H4r
525 shufps xmm2
, xmm12
, 136 ;
# 10001000 => G1c G2c G3c G4c
526 shufps xmm6
, xmm13
, 136 ;
# 10001000 => G1d G2d G3d G4d
527 shufps xmm10
, xmm14
, 136 ;
# 10001000 => G1r G2r G3r G4r
528 shufps xmm3
, xmm12
, 221 ;
# 11011101 => H1c H2c H3c H4c
529 shufps xmm7
, xmm13
, 221 ;
# 11011101 => H1d H2d H3d H4d
530 shufps xmm11
, xmm14
, 221 ;
# 11011101 => H1r H2r H3r H4r
531 ;
# table data ready. Coul in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11
533 movaps xmm12
, [rsp
+ nb430_epsgb
]
534 movaps xmm13
, [rsp
+ nb430_eps
]
536 mulps xmm3
, xmm12 ;
# Heps
539 mulps xmm2
, xmm12 ;
# Geps
542 mulps xmm3
, xmm12 ;
# Heps2
546 addps xmm1
, xmm2 ;
# F+Geps
549 addps xmm1
, xmm3 ;
# F+Geps+Heps2 = Fp
552 addps xmm3
, xmm3 ;
# 2*Heps2
555 addps xmm3
, xmm2 ;
# 2*Heps2+Geps
558 addps xmm3
, xmm1 ;
# FF = Fp + 2*Heps2 + Geps
561 mulps xmm1
, xmm12 ;
# eps*Fp
564 addps xmm1
, xmm0 ;
# VV
567 mulps xmm1
, [rsp
+ nb430_qq
] ;
# VV*qq = vcoul
568 mulps xmm5
, [rsp
+ nb430_c6
] ;
# vnb6
569 mulps xmm9
, [rsp
+ nb430_c12
] ;
# vnb12
570 mulps xmm3
, [rsp
+ nb430_qq
] ;
# FF*qq = fij
571 mulps xmm7
, [rsp
+ nb430_c6
] ;
# fijD
572 mulps xmm11
, [rsp
+ nb430_c12
] ;
#fijR
574 addps xmm11
, xmm7 ;
# fijD+fijR
575 mulps xmm11
, [rsp
+ nb430_tsc
] ;
# (fijD+fijR)*tabscale
577 ;
# accumulate Vvdwtot
578 addps xmm5
, [rsp
+ nb430_Vvdwtot
]
580 movaps
[rsp
+ nb430_Vvdwtot
], xmm5
582 mov rsi
, [rbp
+ nb430_dvda
]
585 mulps xmm3
, [rsp
+ nb430_gbscale
] ;
# fijC=qq*FF*gbscale
587 mulps xmm6
, [rsp
+ nb430_r
]
588 addps xmm6
, xmm1 ;
# vcoul+fijC*r
590 addps xmm3
, xmm11 ;
# fijC+fijD+fijR
593 addps xmm1
, [rsp
+ nb430_vctot
]
594 movaps
[rsp
+ nb430_vctot
], xmm1
596 ;
# xmm6=(vcoul+fijC*r)
602 addps xmm7
, [rsp
+ nb430_dvdasum
]
603 movaps
[rsp
+ nb430_dvdasum
], xmm7
605 ;
# update j atoms dvdaj
609 shufps xmm5
, xmm5
, 0x1
610 shufps xmm4
, xmm4
, 0x1
612 ;
# xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4
613 addss xmm6
, [rsi
+ rax
*4]
614 addss xmm5
, [rsi
+ rbx
*4]
615 addss xmm7
, [rsi
+ rcx
*4]
616 addss xmm4
, [rsi
+ rdx
*4]
617 movss
[rsi
+ rax
*4], xmm6
618 movss
[rsi
+ rbx
*4], xmm5
619 movss
[rsi
+ rcx
*4], xmm7
620 movss
[rsi
+ rdx
*4], xmm4
623 mulps xmm3
, [rsp
+ nb430_rinv
]
626 movd
r8, mm0 ;
# fetch j3
635 mulps xmm9
, [rsp
+ nb430_dx
]
636 mulps xmm10
, [rsp
+ nb430_dy
]
637 mulps xmm11
, [rsp
+ nb430_dz
]
639 ;
# accumulate i forces
640 movaps xmm12
, [rsp
+ nb430_fix
]
641 movaps xmm13
, [rsp
+ nb430_fiy
]
642 movaps xmm14
, [rsp
+ nb430_fiz
]
646 movaps
[rsp
+ nb430_fix
], xmm12
647 movaps
[rsp
+ nb430_fiy
], xmm13
648 movaps
[rsp
+ nb430_fiz
], xmm14
650 mov rsi
, [rbp
+ nb430_faction
]
651 ;
# the fj's - start by accumulating x & y forces from memory
652 movlps xmm0
, [rsi
+ r8*4] ;
# x1 y1 - -
653 movlps xmm1
, [rsi
+ r10*4] ;
# x3 y3 - -
654 movhps xmm0
, [rsi
+ r9*4] ;
# x1 y1 x2 y2
655 movhps xmm1
, [rsi
+ r11*4] ;
# x3 y3 x4 y4
658 unpcklps xmm9
, xmm10 ;
# x1 y1 x2 y2
659 unpckhps xmm8
, xmm10 ;
# x3 y3 x4 y4
661 ;
# update fjx and fjy
665 movlps
[rsi
+ r8*4], xmm0
666 movlps
[rsi
+ r10*4], xmm1
667 movhps
[rsi
+ r9*4], xmm0
668 movhps
[rsi
+ r11*4], xmm1
670 ;
# xmm11: fjz1 fjz2 fjz3 fjz4
671 pshufd xmm10
, xmm11
, 1 ;
# fjz2 - - -
672 movhlps xmm9
, xmm11 ;
# fjz3 - - -
673 pshufd xmm8
, xmm11
, 3 ;
# fjz4 - - -
675 addss xmm11
, [rsi
+ r8*4 + 8]
676 addss xmm10
, [rsi
+ r9*4 + 8]
677 addss xmm9
, [rsi
+ r10*4 + 8]
678 addss xmm8
, [rsi
+ r11*4 + 8]
679 movss
[rsi
+ r8*4 + 8], xmm11
680 movss
[rsi
+ r9*4 + 8], xmm10
681 movss
[rsi
+ r10*4 + 8], xmm9
682 movss
[rsi
+ r11*4 + 8], xmm8
684 ;
# should we do one more iteration?
685 sub dword ptr
[rsp
+ nb430_innerk
], 4
686 jl
.nb430_finish_inner
687 jmp
.nb430_unroll_loop
689 ;
# check if at least two particles remain
690 add dword ptr
[rsp
+ nb430_innerk
], 4
691 mov edx
, [rsp
+ nb430_innerk
]
694 jmp
.nb430_checksingle
696 mov rcx
, [rsp
+ nb430_innerjjnr
]
700 add qword ptr
[rsp
+ nb430_innerjjnr
], 8
703 mov rsi
, [rbp
+ nb430_invsqrta
]
704 movss xmm3
, [rsi
+ rax
*4]
705 movss xmm6
, [rsi
+ rbx
*4]
706 movaps xmm2
, [rsp
+ nb430_isai
]
709 movaps
[rsp
+ nb430_isaprod
], xmm2
712 mulps xmm1
, [rsp
+ nb430_gbtsc
]
713 movaps
[rsp
+ nb430_gbscale
], xmm1
715 mov rsi
, [rbp
+ nb430_charge
] ;
# base of charge[]
717 movss xmm3
, [rsi
+ rax
*4]
718 movss xmm6
, [rsi
+ rbx
*4]
720 mulps xmm2
, [rsp
+ nb430_iq
]
722 movaps
[rsp
+ nb430_qq
], xmm3
725 mov rsi
, [rbp
+ nb430_type
]
726 mov r12d
, [rsi
+ rax
*4]
727 mov r13d
, [rsi
+ rbx
*4]
730 mov edi
, [rsp
+ nb430_ntia
]
734 mov rsi
, [rbp
+ nb430_vdwparam
]
735 movlps xmm3
, [rsi
+ r12*4]
736 movhps xmm3
, [rsi
+ r13*4]
740 shufps xmm0
, xmm7
, 136 ;
# 10001000
741 shufps xmm3
, xmm7
, 221 ;
# 11011101
743 movaps
[rsp
+ nb430_c6
], xmm0
744 movaps
[rsp
+ nb430_c12
], xmm3
746 mov rsi
, [rbp
+ nb430_pos
] ;
# base of pos[]
748 lea
r8, [rax
+ rax
*2] ;
# j3
749 lea
r9, [rbx
+ rbx
*2]
751 ;
# move two coordinates to xmm0-xmm2
752 movlps xmm0
, [rsi
+ r8*4] ;
# x1 y1 - -
753 movlps xmm1
, [rsi
+ r9*4] ;
# x2 y2 - -
755 movss xmm2
, [rsi
+ r8*4 + 8] ;
# z1 - - -
756 movss xmm7
, [rsi
+ r9*4 + 8] ;
# z2 - - -
758 unpcklps xmm0
, xmm1 ;
# x1 x2 y1 y2
759 movhlps xmm1
, xmm0 ;
# y1 y2 - -
760 unpcklps xmm2
, xmm7 ;
# z1 z2 - -
763 subps xmm0
, [rsp
+ nb430_ix
]
764 subps xmm1
, [rsp
+ nb430_iy
]
765 subps xmm2
, [rsp
+ nb430_iz
]
768 movaps
[rsp
+ nb430_dx
], xmm0
769 movaps
[rsp
+ nb430_dy
], xmm1
770 movaps
[rsp
+ nb430_dz
], xmm2
782 ;
# lookup seed in xmm5
785 movaps xmm1
, [rsp
+ nb430_three
]
786 mulps xmm5
, xmm4 ;
# rsq*lu*lu
787 movaps xmm0
, [rsp
+ nb430_half
]
788 subps xmm1
, xmm5 ;
# 3.0-rsq*lu*lu
790 mulps xmm0
, xmm1 ;
# xmm0=rinv
791 mulps xmm4
, xmm0 ;
# xmm4=r
792 movaps
[rsp
+ nb430_r
], xmm4
793 movaps
[rsp
+ nb430_rinv
], xmm0
795 movaps xmm8
, xmm4 ;
# r
796 mulps xmm4
, [rsp
+ nb430_gbscale
] ;
# rgbtab
797 mulps xmm8
, [rsp
+ nb430_tsc
] ;
# rtab
799 ;
# truncate and convert to integers
800 cvttps2dq xmm5
, xmm4 ;
# gb
801 cvttps2dq xmm9
, xmm8 ;
# lj
803 ;
# convert back to float
804 cvtdq2ps xmm6
, xmm5 ;
# gb
805 cvtdq2ps xmm10
, xmm9 ;
# lj
807 ;
# multiply by 4 and 8, respectively
811 ;
# move to integer registers
812 movd r12d
, xmm5 ;
# gb
813 movd r14d
, xmm9 ;
# lj
814 pshufd xmm5
, xmm5
, 1 ;
# gb
815 pshufd xmm9
, xmm9
, 1 ;
# lj
816 movd r13d
, xmm5 ;
# gb
817 movd r15d
, xmm9 ;
# lj
818 ;
# GB indices: r12-r13 LJ indices: r14-r15
821 subps xmm4
, xmm6 ;
# gb
822 subps xmm8
, xmm10 ;
# lj
823 movaps
[rsp
+ nb430_epsgb
], xmm4 ;
# gb eps
824 movaps
[rsp
+ nb430_eps
], xmm8 ;
# lj eps
826 mov rsi
, [rbp
+ nb430_GBtab
]
827 mov rdi
, [rbp
+ nb430_VFtab
]
829 ;
# load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11
830 movlps xmm0
, [rsi
+ r12*4] ;
# Y1c F1c
831 movlps xmm1
, [rsi
+ r13*4] ;
# Y2c F2c
832 movlps xmm4
, [rdi
+ r14*4] ;
# Y1d F1d
833 movlps xmm5
, [rdi
+ r15*4] ;
# Y2d F2d
834 movlps xmm8
, [rdi
+ r14*4 + 16] ;
# Y1r F1r
835 movlps xmm9
, [rdi
+ r15*4 + 16] ;
# Y2r F2r
843 movlps xmm2
, [rsi
+ r12*4 + 8] ;
# G1c H1c
844 movlps xmm3
, [rsi
+ r13*4 + 8] ;
# G2c H2c
845 movlps xmm6
, [rdi
+ r14*4 + 8] ;
# G1d H1d
846 movlps xmm7
, [rdi
+ r15*4 + 8] ;
# G2d H2d
847 movlps xmm10
, [rdi
+ r14*4 + 24] ;
# G1r H1r
848 movlps xmm11
, [rdi
+ r15*4 + 24] ;
# G2r H2r
853 unpcklps xmm10
, xmm11
855 ;
# table data ready. Coul in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11
857 movaps xmm12
, [rsp
+ nb430_epsgb
]
858 movaps xmm13
, [rsp
+ nb430_eps
]
860 mulps xmm3
, xmm12 ;
# Heps
863 mulps xmm2
, xmm12 ;
# Geps
866 mulps xmm3
, xmm12 ;
# Heps2
870 addps xmm1
, xmm2 ;
# F+Geps
873 addps xmm1
, xmm3 ;
# F+Geps+Heps2 = Fp
876 addps xmm3
, xmm3 ;
# 2*Heps2
879 addps xmm3
, xmm2 ;
# 2*Heps2+Geps
882 addps xmm3
, xmm1 ;
# FF = Fp + 2*Heps2 + Geps
885 mulps xmm1
, xmm12 ;
# eps*Fp
888 addps xmm1
, xmm0 ;
# VV
891 mulps xmm1
, [rsp
+ nb430_qq
] ;
# VV*qq = vcoul
892 mulps xmm5
, [rsp
+ nb430_c6
] ;
# vnb6
893 mulps xmm9
, [rsp
+ nb430_c12
] ;
# vnb12
894 mulps xmm3
, [rsp
+ nb430_qq
] ;
# FF*qq = fij
895 mulps xmm7
, [rsp
+ nb430_c6
] ;
# fijD
896 mulps xmm11
, [rsp
+ nb430_c12
] ;
#fijR
898 addps xmm11
, xmm7 ;
# fijD+fijR
899 mulps xmm11
, [rsp
+ nb430_tsc
] ;
# (fijD+fijR)*tabscale
901 ;
# accumulate Vvdwtot
902 addps xmm5
, [rsp
+ nb430_Vvdwtot
]
904 movlps
[rsp
+ nb430_Vvdwtot
], xmm5
906 mov rsi
, [rbp
+ nb430_dvda
]
909 mulps xmm3
, [rsp
+ nb430_gbscale
] ;
# fijC=qq*FF*gbscale
911 mulps xmm6
, [rsp
+ nb430_r
]
912 addps xmm6
, xmm1 ;
# vcoul+fijC*r
914 addps xmm3
, xmm11 ;
# fijC+fijD+fijR
917 addps xmm1
, [rsp
+ nb430_vctot
]
918 movlps
[rsp
+ nb430_vctot
], xmm1
920 ;
# xmm6=(vcoul+fijC*r)
926 addps xmm7
, [rsp
+ nb430_dvdasum
]
927 movlps
[rsp
+ nb430_dvdasum
], xmm7
929 ;
# update j atoms dvdaj
931 shufps xmm5
, xmm5
, 0x1
933 ;
# xmm6=dvdaj1 xmm5=dvdaj2
934 addss xmm6
, [rsi
+ rax
*4]
935 addss xmm5
, [rsi
+ rbx
*4]
936 movss
[rsi
+ rax
*4], xmm6
937 movss
[rsi
+ rbx
*4], xmm5
940 mulps xmm3
, [rsp
+ nb430_rinv
]
947 mulps xmm9
, [rsp
+ nb430_dx
]
948 mulps xmm10
, [rsp
+ nb430_dy
]
949 mulps xmm11
, [rsp
+ nb430_dz
]
952 ;
# accumulate i forces
953 movaps xmm12
, [rsp
+ nb430_fix
]
954 movaps xmm13
, [rsp
+ nb430_fiy
]
955 movaps xmm14
, [rsp
+ nb430_fiz
]
959 movlps
[rsp
+ nb430_fix
], xmm12
960 movlps
[rsp
+ nb430_fiy
], xmm13
961 movlps
[rsp
+ nb430_fiz
], xmm14
963 mov rsi
, [rbp
+ nb430_faction
]
964 ;
# the fj's - start by accumulating x & y forces from memory
965 movlps xmm0
, [rsi
+ r8*4] ;
# x1 y1 - -
966 movhps xmm0
, [rsi
+ r9*4] ;
# x1 y1 x2 y2
968 unpcklps xmm9
, xmm10 ;
# x1 y1 x2 y2
971 movlps
[rsi
+ r8*4], xmm0
972 movhps
[rsi
+ r9*4], xmm0
975 pshufd xmm8
, xmm11
, 1
976 addss xmm11
, [rsi
+ r8*4 + 8]
977 addss xmm8
, [rsi
+ r9*4 + 8]
978 movss
[rsi
+ r8*4 + 8], xmm11
979 movss
[rsi
+ r9*4 + 8], xmm8
982 mov edx
, [rsp
+ nb430_innerk
]
985 jmp
.nb430_updateouterdata
987 mov rsi
, [rbp
+ nb430_charge
]
988 mov rdx
, [rbp
+ nb430_invsqrta
]
989 mov rdi
, [rbp
+ nb430_pos
]
990 mov rcx
, [rsp
+ nb430_innerjjnr
]
994 mov rsi
, [rbp
+ nb430_invsqrta
]
995 movss xmm3
, [rsi
+ rax
*4]
996 movaps xmm2
, [rsp
+ nb430_isai
]
998 movaps
[rsp
+ nb430_isaprod
], xmm2
1001 mulss xmm1
, [rsp
+ nb430_gbtsc
]
1002 movaps
[rsp
+ nb430_gbscale
], xmm1
1004 mov rsi
, [rbp
+ nb430_charge
] ;
# base of charge[]
1006 movss xmm3
, [rsi
+ rax
*4]
1007 mulss xmm2
, [rsp
+ nb430_iq
]
1009 movaps
[rsp
+ nb430_qq
], xmm3
1012 mov rsi
, [rbp
+ nb430_type
]
1013 mov r12d
, [rsi
+ rax
*4]
1015 mov edi
, [rsp
+ nb430_ntia
]
1018 mov rsi
, [rbp
+ nb430_vdwparam
]
1019 movss xmm0
, [rsi
+ r12*4]
1020 movss xmm3
, [rsi
+ r12*4 + 4]
1021 movaps
[rsp
+ nb430_c6
], xmm0
1022 movaps
[rsp
+ nb430_c12
], xmm3
1024 mov rsi
, [rbp
+ nb430_pos
] ;
# base of pos[]
1026 lea
r8, [rax
+ rax
*2] ;
# j3
1028 ;
# move the single j atom's coordinates to xmm0-xmm2
1029 movss xmm0
, [rsi
+ r8*4]
1030 movss xmm1
, [rsi
+ r8*4 + 4]
1031 movss xmm2
, [rsi
+ r8*4 + 8]
1034 subss xmm0
, [rsp
+ nb430_ix
]
1035 subss xmm1
, [rsp
+ nb430_iy
]
1036 subss xmm2
, [rsp
+ nb430_iz
]
1039 movaps
[rsp
+ nb430_dx
], xmm0
1040 movaps
[rsp
+ nb430_dy
], xmm1
1041 movaps
[rsp
+ nb430_dz
], xmm2
1053 ;
# lookup seed in xmm5
1056 movaps xmm1
, [rsp
+ nb430_three
]
1057 mulss xmm5
, xmm4 ;
# rsq*lu*lu
1058 movaps xmm0
, [rsp
+ nb430_half
]
1059 subss xmm1
, xmm5 ;
# 3.0-rsq*lu*lu
1061 mulss xmm0
, xmm1 ;
# xmm0=rinv
1062 mulss xmm4
, xmm0 ;
# xmm4=r
1063 movaps
[rsp
+ nb430_r
], xmm4
1064 movaps
[rsp
+ nb430_rinv
], xmm0
1066 movaps xmm8
, xmm4 ;
# r
1067 mulss xmm4
, [rsp
+ nb430_gbscale
] ;
# rgbtab
1068 mulss xmm8
, [rsp
+ nb430_tsc
] ;
# rtab
1070 ;
# truncate and convert to integers
1071 cvttss2si r12d
, xmm4 ;
# gb
1072 cvttss2si r14d
, xmm8 ;
# lj
1074 ;
# convert back to float
1075 cvtsi2ss xmm6
, r12d ;
# gb
1076 cvtsi2ss xmm10
, r14d ;
# lj
1078 ;
# multiply by 4 and 8, respectively
1082 ;
# GB index: r12 LJ indices: r14
1085 subss xmm4
, xmm6 ;
# gb
1086 subss xmm8
, xmm10 ;
# lj
1087 movaps
[rsp
+ nb430_epsgb
], xmm4 ;
# gb eps
1088 movaps
[rsp
+ nb430_eps
], xmm8 ;
# lj eps
1090 mov rsi
, [rbp
+ nb430_GBtab
]
1091 mov rdi
, [rbp
+ nb430_VFtab
]
1093 ;
# load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11
1094 movss xmm0
, [rsi
+ r12*4]
1095 movss xmm1
, [rsi
+ r12*4 + 4]
1096 movss xmm2
, [rsi
+ r12*4 + 8]
1097 movss xmm3
, [rsi
+ r12*4 + 12]
1098 movss xmm4
, [rdi
+ r14*4]
1099 movss xmm5
, [rdi
+ r14*4 + 4]
1100 movss xmm6
, [rdi
+ r14*4 + 8]
1101 movss xmm7
, [rdi
+ r14*4 + 12]
1102 movss xmm8
, [rdi
+ r14*4 + 16]
1103 movss xmm9
, [rdi
+ r14*4 + 20]
1104 movss xmm10
, [rdi
+ r14*4 + 24]
1105 movss xmm11
, [rdi
+ r14*4 + 28]
1106 ;
# table data ready. Coul in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11
1108 movaps xmm12
, [rsp
+ nb430_epsgb
]
1109 movaps xmm13
, [rsp
+ nb430_eps
]
1111 mulss xmm3
, xmm12 ;
# Heps
1114 mulss xmm2
, xmm12 ;
# Geps
1117 mulss xmm3
, xmm12 ;
# Heps2
1121 addss xmm1
, xmm2 ;
# F+Geps
1124 addss xmm1
, xmm3 ;
# F+Geps+Heps2 = Fp
1127 addss xmm3
, xmm3 ;
# 2*Heps2
1130 addss xmm3
, xmm2 ;
# 2*Heps2+Geps
1133 addss xmm3
, xmm1 ;
# FF = Fp + 2*Heps2 + Geps
1136 mulss xmm1
, xmm12 ;
# eps*Fp
1139 addss xmm1
, xmm0 ;
# VV
1142 mulss xmm1
, [rsp
+ nb430_qq
] ;
# VV*qq = vcoul
1143 mulss xmm5
, [rsp
+ nb430_c6
] ;
# vnb6
1144 mulss xmm9
, [rsp
+ nb430_c12
] ;
# vnb12
1145 mulss xmm3
, [rsp
+ nb430_qq
] ;
# FF*qq = fij
1146 mulss xmm7
, [rsp
+ nb430_c6
] ;
# fijD
1147 mulss xmm11
, [rsp
+ nb430_c12
] ;
#fijR
1149 addss xmm11
, xmm7 ;
# fijD+fijR
1150 mulss xmm11
, [rsp
+ nb430_tsc
] ;
# (fijD+fijR)*tabscale
1152 ;
# accumulate Vvdwtot
1153 addss xmm5
, [rsp
+ nb430_Vvdwtot
]
1155 movss
[rsp
+ nb430_Vvdwtot
], xmm5
1157 mov rsi
, [rbp
+ nb430_dvda
]
1160 mulss xmm3
, [rsp
+ nb430_gbscale
] ;
# fijC=qq*FF*gbscale
1162 mulss xmm6
, [rsp
+ nb430_r
]
1163 addss xmm6
, xmm1 ;
# vcoul+fijC*r
1165 addss xmm3
, xmm11 ;
# fijC+fijD+fijR
1168 addss xmm1
, [rsp
+ nb430_vctot
]
1169 movss
[rsp
+ nb430_vctot
], xmm1
1171 ;
# xmm6=(vcoul+fijC*r)
1177 addss xmm7
, [rsp
+ nb430_dvdasum
]
1178 movss
[rsp
+ nb430_dvdasum
], xmm7
1180 ;
# update j atoms dvdaj
1183 addss xmm6
, [rsi
+ rax
*4]
1184 movss
[rsi
+ rax
*4], xmm6
1187 mulss xmm3
, [rsp
+ nb430_rinv
]
1194 mulss xmm9
, [rsp
+ nb430_dx
]
1195 mulss xmm10
, [rsp
+ nb430_dy
]
1196 mulss xmm11
, [rsp
+ nb430_dz
]
1198 ;
# accumulate i forces
1199 movaps xmm12
, [rsp
+ nb430_fix
]
1200 movaps xmm13
, [rsp
+ nb430_fiy
]
1201 movaps xmm14
, [rsp
+ nb430_fiz
]
1205 movss
[rsp
+ nb430_fix
], xmm12
1206 movss
[rsp
+ nb430_fiy
], xmm13
1207 movss
[rsp
+ nb430_fiz
], xmm14
1209 mov rsi
, [rbp
+ nb430_faction
]
1211 addss xmm9
, [rsi
+ r8*4]
1212 addss xmm10
, [rsi
+ r8*4 + 4]
1213 addss xmm11
, [rsi
+ r8*4 + 8]
1214 movss
[rsi
+ r8*4], xmm9
1215 movss
[rsi
+ r8*4 + 4], xmm10
1216 movss
[rsi
+ r8*4 + 8], xmm11
1218 .nb430_updateouterdata:
1219 mov ecx
, [rsp
+ nb430_ii3
]
1220 mov rdi
, [rbp
+ nb430_faction
]
1221 mov rsi
, [rbp
+ nb430_fshift
]
1222 mov edx
, [rsp
+ nb430_is3
]
1224 ;
# accumulate i forces in xmm0, xmm1, xmm2
1225 movaps xmm0
, [rsp
+ nb430_fix
]
1226 movaps xmm1
, [rsp
+ nb430_fiy
]
1227 movaps xmm2
, [rsp
+ nb430_fiz
]
1234 addps xmm2
, xmm5 ;
# sum is in 1/2 in xmm0-xmm2
1240 shufps xmm3
, xmm3
, 1
1241 shufps xmm4
, xmm4
, 1
1242 shufps xmm5
, xmm5
, 1
1245 addss xmm2
, xmm5 ;
# xmm0-xmm2 has single force in pos0
1247 ;
# increment i force
1248 movss xmm3
, [rdi
+ rcx
*4]
1249 movss xmm4
, [rdi
+ rcx
*4 + 4]
1250 movss xmm5
, [rdi
+ rcx
*4 + 8]
1254 movss
[rdi
+ rcx
*4], xmm3
1255 movss
[rdi
+ rcx
*4 + 4], xmm4
1256 movss
[rdi
+ rcx
*4 + 8], xmm5
1258 ;
# increment fshift force
1259 movss xmm3
, [rsi
+ rdx
*4]
1260 movss xmm4
, [rsi
+ rdx
*4 + 4]
1261 movss xmm5
, [rsi
+ rdx
*4 + 8]
1265 movss
[rsi
+ rdx
*4], xmm3
1266 movss
[rsi
+ rdx
*4 + 4], xmm4
1267 movss
[rsi
+ rdx
*4 + 8], xmm5
1270 mov esi
, [rsp
+ nb430_n
]
1271 ;
# get group index for i particle
1272 mov rdx
, [rbp
+ nb430_gid
] ;
# base of gid[]
1273 mov edx
, [rdx
+ rsi
*4] ;
# ggid=gid[n]
1275 ;
# accumulate total potential energy and update it
1276 movaps xmm7
, [rsp
+ nb430_vctot
]
1279 addps xmm7
, xmm6 ;
# pos 0-1 in xmm7 have the sum now
1281 shufps xmm6
, xmm6
, 1
1284 ;
# add earlier value from mem
1285 mov rax
, [rbp
+ nb430_Vc
]
1286 addss xmm7
, [rax
+ rdx
*4]
1288 movss
[rax
+ rdx
*4], xmm7
1290 ;
# accumulate total lj energy and update it
1291 movaps xmm7
, [rsp
+ nb430_Vvdwtot
]
1294 addps xmm7
, xmm6 ;
# pos 0-1 in xmm7 have the sum now
1296 shufps xmm6
, xmm6
, 1
1299 ;
# add earlier value from mem
1300 mov rax
, [rbp
+ nb430_Vvdw
]
1301 addss xmm7
, [rax
+ rdx
*4]
1303 movss
[rax
+ rdx
*4], xmm7
1305 ;
# accumulate dVda and update it
1306 movaps xmm7
, [rsp
+ nb430_dvdasum
]
1309 addps xmm7
, xmm6 ;
# pos 0-1 in xmm7 have the sum now
1311 shufps xmm6
, xmm6
, 1
1314 mov edx
, [rsp
+ nb430_ii
]
1315 mov rax
, [rbp
+ nb430_dvda
]
1316 addss xmm7
, [rax
+ rdx
*4]
1317 movss
[rax
+ rdx
*4], xmm7
1320 mov ecx
, [rsp
+ nb430_nn1
]
1321 ;
# esi already loaded with n
1326 ;
# not last, iterate outer loop once more!
1327 mov
[rsp
+ nb430_n
], esi
1330 ;
# check if more outer neighborlists remain
1331 mov ecx
, [rsp
+ nb430_nri
]
1332 ;
# esi already loaded with n above
1335 ;
# non-zero, do one more workunit
1336 jmp
.nb430_threadloop
1338 mov eax
, [rsp
+ nb430_nouter
]
1339 mov ebx
, [rsp
+ nb430_ninner
]
1340 mov rcx
, [rbp
+ nb430_outeriter
]
1341 mov rdx
, [rbp
+ nb430_inneriter
]
1362 .globl nb_kernel430nf_x86_64_sse
1363 .globl _nb_kernel430nf_x86_64_sse
1364 nb_kernel430nf_x86_64_sse
:
1365 _nb_kernel430nf_x86_64_sse
:
1366 ;
# Room for return address and rbp (16 bytes)
1367 .equiv nb430nf_fshift, 16
1368 .equiv nb430nf_gid, 24
1369 .equiv nb430nf_pos, 32
1370 .equiv nb430nf_faction, 40
1371 .equiv nb430nf_charge, 48
1372 .equiv nb430nf_p_facel, 56
1373 .equiv nb430nf_argkrf, 64
1374 .equiv nb430nf_argcrf, 72
1375 .equiv nb430nf_Vc, 80
1376 .equiv nb430nf_type, 88
1377 .equiv nb430nf_p_ntype, 96
1378 .equiv nb430nf_vdwparam, 104
1379 .equiv nb430nf_Vvdw, 112
1380 .equiv nb430nf_p_tabscale, 120
1381 .equiv nb430nf_VFtab, 128
1382 .equiv nb430nf_invsqrta, 136
1383 .equiv nb430nf_dvda, 144
1384 .equiv nb430nf_p_gbtabscale, 152
1385 .equiv nb430nf_GBtab, 160
1386 .equiv nb430nf_p_nthreads, 168
1387 .equiv nb430nf_count, 176
1388 .equiv nb430nf_mtx, 184
1389 .equiv nb430nf_outeriter, 192
1390 .equiv nb430nf_inneriter, 200
1391 .equiv nb430nf_work, 208
1392 ;
# stack offsets for local variables
1393 ;
# bottom of stack is cache-aligned for sse use
1394 .equiv nb430nf_ix, 0
1395 .equiv nb430nf_iy, 16
1396 .equiv nb430nf_iz, 32
1397 .equiv nb430nf_iq, 48
1398 .equiv nb430nf_gbtsc, 64
1399 .equiv nb430nf_tsc, 80
1400 .equiv nb430nf_qq, 96
1401 .equiv nb430nf_c6, 112
1402 .equiv nb430nf_c12, 128
1403 .equiv nb430nf_vctot, 144
1404 .equiv nb430nf_Vvdwtot, 160
1405 .equiv nb430nf_half, 176
1406 .equiv nb430nf_three, 192
1407 .equiv nb430nf_isai, 208
1408 .equiv nb430nf_isaprod, 224
1409 .equiv nb430nf_gbscale, 240
1410 .equiv nb430nf_r, 256
1411 .equiv nb430nf_nri, 272
1412 .equiv nb430nf_iinr, 280
1413 .equiv nb430nf_jindex, 288
1414 .equiv nb430nf_jjnr, 296
1415 .equiv nb430nf_shift, 304
1416 .equiv nb430nf_shiftvec, 312
1417 .equiv nb430nf_facel, 320
1418 .equiv nb430nf_innerjjnr, 328
1419 .equiv nb430nf_is3, 336
1420 .equiv nb430nf_ii3, 340
1421 .equiv nb430nf_ntia, 344
1422 .equiv nb430nf_innerk, 348
1423 .equiv nb430nf_n, 352
1424 .equiv nb430nf_nn1, 356
1425 .equiv nb430nf_ntype, 360
1426 .equiv nb430nf_nouter, 364
1427 .equiv nb430nf_ninner, 368
1441 sub rsp
, 392 ;
# local variable stack space (n*16+8)
1443 ;
# zero 32-bit iteration counters
1445 mov
[rsp
+ nb430nf_nouter
], eax
1446 mov
[rsp
+ nb430nf_ninner
], eax
1449 mov
[rsp
+ nb430nf_nri
], edi
1450 mov
[rsp
+ nb430nf_iinr
], rsi
1451 mov
[rsp
+ nb430nf_jindex
], rdx
1452 mov
[rsp
+ nb430nf_jjnr
], rcx
1453 mov
[rsp
+ nb430nf_shift
], r8
1454 mov
[rsp
+ nb430nf_shiftvec
], r9
1455 mov rdi
, [rbp
+ nb430nf_p_ntype
]
1457 mov
[rsp
+ nb430nf_ntype
], edi
1458 mov rsi
, [rbp
+ nb430nf_p_facel
]
1460 movss
[rsp
+ nb430nf_facel
], xmm0
1462 mov rax
, [rbp
+ nb430nf_p_tabscale
]
1464 shufps xmm3
, xmm3
, 0
1465 movaps
[rsp
+ nb430nf_tsc
], xmm3
1467 mov rbx
, [rbp
+ nb430nf_p_gbtabscale
]
1469 shufps xmm4
, xmm4
, 0
1470 movaps
[rsp
+ nb430nf_gbtsc
], xmm4
1472 ;
# create constant floating-point factors on stack
1473 mov eax
, 0x3f000000 ;
# half in IEEE (hex)
1474 mov
[rsp
+ nb430nf_half
], eax
1475 movss xmm1
, [rsp
+ nb430nf_half
]
1476 shufps xmm1
, xmm1
, 0 ;
# splat to all elements
1478 addps xmm2
, xmm2 ;
# one
1480 addps xmm2
, xmm2 ;
# two
1481 addps xmm3
, xmm2 ;
# three
1482 movaps
[rsp
+ nb430nf_half
], xmm1
1483 movaps
[rsp
+ nb430nf_three
], xmm3
1485 .nb430nf_threadloop:
1486 mov rsi
, [rbp
+ nb430nf_count
] ;
# pointer to sync counter
1489 mov ebx
, eax ;
# ebx=*count=nn0
1490 add ebx
, 1 ;
# ebx=nn1=nn0+1
1492 cmpxchg
[esi
], ebx ;
# write nn1 to *counter,
1493 ;
# if it has not changed.
1494 ;
# or reread *counter to eax.
1495 pause ;
# -> better p4 performance
1496 jnz
.nb430nf_spinlock
1498 ;
# if(nn1>nri) nn1=nri
1499 mov ecx
, [rsp
+ nb430nf_nri
]
1502 cmovle ebx
, edx ;
# if(nn1>nri) nn1=nri
1503 ;
# Cleared the spinlock if we got here.
1504 ;
# eax contains nn0, ebx contains nn1.
1505 mov
[rsp
+ nb430nf_n
], eax
1506 mov
[rsp
+ nb430nf_nn1
], ebx
1507 sub ebx
, eax ;
# calc number of outer lists
1508 mov esi
, eax ;
# copy n to esi
1509 jg
.nb430nf_outerstart
1512 .nb430nf_outerstart:
1513 ;
# ebx contains number of outer iterations
1514 add ebx
, [rsp
+ nb430nf_nouter
]
1515 mov
[rsp
+ nb430nf_nouter
], ebx
1518 mov rax
, [rsp
+ nb430nf_shift
] ;
# rax = pointer into shift[]
1519 mov ebx
, [rax
+ rsi
*4] ;
# ebx=shift[n]
1521 lea rbx
, [rbx
+ rbx
*2] ;
# rbx=3*is
1522 mov
[rsp
+ nb430nf_is3
],ebx ;
# store is3
1524 mov rax
, [rsp
+ nb430nf_shiftvec
] ;
# rax = base of shiftvec[]
1526 movss xmm0
, [rax
+ rbx
*4]
1527 movss xmm1
, [rax
+ rbx
*4 + 4]
1528 movss xmm2
, [rax
+ rbx
*4 + 8]
1530 mov rcx
, [rsp
+ nb430nf_iinr
] ;
# rcx = pointer into iinr[]
1531 mov ebx
, [rcx
+ rsi
*4] ;
# ebx =ii
1533 mov rdx
, [rbp
+ nb430nf_charge
]
1534 movss xmm3
, [rdx
+ rbx
*4]
1535 mulss xmm3
, [rsp
+ nb430nf_facel
]
1536 shufps xmm3
, xmm3
, 0
1538 mov rdx
, [rbp
+ nb430nf_invsqrta
] ;
# load invsqrta[ii]
1539 movss xmm4
, [rdx
+ rbx
*4]
1540 shufps xmm4
, xmm4
, 0
1542 mov rdx
, [rbp
+ nb430nf_type
]
1543 mov edx
, [rdx
+ rbx
*4]
1544 imul edx
, [rsp
+ nb430nf_ntype
]
1546 mov
[rsp
+ nb430nf_ntia
], edx
1548 lea rbx
, [rbx
+ rbx
*2] ;
# rbx = 3*ii=ii3
1549 mov rax
, [rbp
+ nb430nf_pos
] ;
# rax = base of pos[]
1551 addss xmm0
, [rax
+ rbx
*4]
1552 addss xmm1
, [rax
+ rbx
*4 + 4]
1553 addss xmm2
, [rax
+ rbx
*4 + 8]
1555 movaps
[rsp
+ nb430nf_iq
], xmm3
1556 movaps
[rsp
+ nb430nf_isai
], xmm4
1558 shufps xmm0
, xmm0
, 0
1559 shufps xmm1
, xmm1
, 0
1560 shufps xmm2
, xmm2
, 0
1562 movaps
[rsp
+ nb430nf_ix
], xmm0
1563 movaps
[rsp
+ nb430nf_iy
], xmm1
1564 movaps
[rsp
+ nb430nf_iz
], xmm2
1566 mov
[rsp
+ nb430nf_ii3
], ebx
1570 movaps
[rsp
+ nb430nf_vctot
], xmm4
1571 movaps
[rsp
+ nb430nf_Vvdwtot
], xmm4
1573 mov rax
, [rsp
+ nb430nf_jindex
]
1574 mov ecx
, [rax
+ rsi
*4] ;
# jindex[n]
1575 mov edx
, [rax
+ rsi
*4 + 4] ;
# jindex[n+1]
1576 sub edx
, ecx ;
# number of innerloop atoms
1578 mov rsi
, [rbp
+ nb430nf_pos
]
1579 mov rdi
, [rbp
+ nb430nf_faction
]
1580 mov rax
, [rsp
+ nb430nf_jjnr
]
1583 mov
[rsp
+ nb430nf_innerjjnr
], rax ;
# pointer to jjnr[nj0]
1586 add ecx
, [rsp
+ nb430nf_ninner
]
1587 mov
[rsp
+ nb430nf_ninner
], ecx
1589 mov
[rsp
+ nb430nf_innerk
], edx ;
# number of innerloop atoms
1590 jge
.nb430nf_unroll_loop
1591 jmp
.nb430nf_finish_inner
1592 .nb430nf_unroll_loop:
1593 ;
# quad-unroll innerloop here
1594 mov rdx
, [rsp
+ nb430nf_innerjjnr
] ;
# pointer to jjnr[k]
1598 mov edx
, [rdx
+ 12] ;
# eax-edx=jnr1-4
1599 add qword ptr
[rsp
+ nb430nf_innerjjnr
], 16 ;
# advance pointer (unrolled 4)
1602 mov rsi
, [rbp
+ nb430nf_invsqrta
]
1603 movss xmm3
, [rsi
+ rax
*4]
1604 movss xmm4
, [rsi
+ rcx
*4]
1605 movss xmm6
, [rsi
+ rbx
*4]
1606 movss xmm7
, [rsi
+ rdx
*4]
1607 movaps xmm2
, [rsp
+ nb430nf_isai
]
1608 shufps xmm3
, xmm6
, 0
1609 shufps xmm4
, xmm7
, 0
1610 shufps xmm3
, xmm4
, 136 ;
# 10001000 ;# all invsqrta[j] values in xmm3
1613 movaps
[rsp
+ nb430nf_isaprod
], xmm2
1615 mulps xmm1
, [rsp
+ nb430nf_gbtsc
]
1616 movaps
[rsp
+ nb430nf_gbscale
], xmm1
1618 mov rsi
, [rbp
+ nb430nf_charge
] ;
# base of charge[]
1620 movss xmm3
, [rsi
+ rax
*4]
1621 movss xmm4
, [rsi
+ rcx
*4]
1622 movss xmm6
, [rsi
+ rbx
*4]
1623 movss xmm7
, [rsi
+ rdx
*4]
1625 mulps xmm2
, [rsp
+ nb430nf_iq
]
1626 shufps xmm3
, xmm6
, 0
1627 shufps xmm4
, xmm7
, 0
1628 shufps xmm3
, xmm4
, 136 ;
# 10001000 ;# all charges in xmm3
1630 movaps
[rsp
+ nb430nf_qq
], xmm3
1632 movd mm0
, eax ;
# use mmx registers as temp storage
1637 mov rsi
, [rbp
+ nb430nf_type
]
1638 mov eax
, [rsi
+ rax
*4]
1639 mov ebx
, [rsi
+ rbx
*4]
1640 mov ecx
, [rsi
+ rcx
*4]
1641 mov edx
, [rsi
+ rdx
*4]
1642 mov rsi
, [rbp
+ nb430nf_vdwparam
]
1647 mov edi
, [rsp
+ nb430nf_ntia
]
1653 movlps xmm6
, [rsi
+ rax
*4]
1654 movlps xmm7
, [rsi
+ rcx
*4]
1655 movhps xmm6
, [rsi
+ rbx
*4]
1656 movhps xmm7
, [rsi
+ rdx
*4]
1659 shufps xmm4
, xmm7
, 136 ;
# 10001000
1660 shufps xmm6
, xmm7
, 221 ;
# 11011101
1667 movaps
[rsp
+ nb430nf_c6
], xmm4
1668 movaps
[rsp
+ nb430nf_c12
], xmm6
1670 mov rsi
, [rbp
+ nb430nf_pos
] ;
# base of pos[]
1672 lea rax
, [rax
+ rax
*2] ;
# replace jnr with j3
1673 lea rbx
, [rbx
+ rbx
*2]
1675 lea rcx
, [rcx
+ rcx
*2] ;
# replace jnr with j3
1676 lea rdx
, [rdx
+ rdx
*2]
1678 ;
# move four coordinates to xmm0-xmm2
1680 movlps xmm4
, [rsi
+ rax
*4]
1681 movlps xmm5
, [rsi
+ rcx
*4]
1682 movss xmm2
, [rsi
+ rax
*4 + 8]
1683 movss xmm6
, [rsi
+ rcx
*4 + 8]
1685 movhps xmm4
, [rsi
+ rbx
*4]
1686 movhps xmm5
, [rsi
+ rdx
*4]
1688 movss xmm0
, [rsi
+ rbx
*4 + 8]
1689 movss xmm1
, [rsi
+ rdx
*4 + 8]
1691 shufps xmm2
, xmm0
, 0
1692 shufps xmm6
, xmm1
, 0
1697 shufps xmm2
, xmm6
, 136 ;
# 10001000
1699 shufps xmm0
, xmm5
, 136 ;
# 10001000
1700 shufps xmm1
, xmm5
, 221 ;
# 11011101
1702 ;
# move ix-iz to xmm4-xmm6
1703 movaps xmm4
, [rsp
+ nb430nf_ix
]
1704 movaps xmm5
, [rsp
+ nb430nf_iy
]
1705 movaps xmm6
, [rsp
+ nb430nf_iz
]
1721 ;
# lookup seed in xmm5
1724 movaps xmm1
, [rsp
+ nb430nf_three
]
1725 mulps xmm5
, xmm4 ;
# rsq*lu*lu
1726 movaps xmm0
, [rsp
+ nb430nf_half
]
1727 subps xmm1
, xmm5 ;
# 3.0-rsq*lu*lu
1729 mulps xmm0
, xmm1 ;
# xmm0=rinv
1730 mulps xmm4
, xmm0 ;
# xmm4=r
1731 movaps
[rsp
+ nb430nf_r
], xmm4
1732 mulps xmm4
, [rsp
+ nb430nf_gbscale
]
1736 cvttps2pi mm7
, xmm5 ;
# mm6/mm7 contain lu indices
1741 movaps xmm1
, xmm4 ;
# xmm1=eps
1743 mulps xmm2
, xmm2 ;
# xmm2=eps2
1752 mov rsi
, [rbp
+ nb430nf_GBtab
]
1760 ;
# load coulomb table
1761 movaps xmm4
, [rsi
+ rax
*4]
1762 movaps xmm5
, [rsi
+ rbx
*4]
1763 movaps xmm6
, [rsi
+ rcx
*4]
1764 movaps xmm7
, [rsi
+ rdx
*4]
1765 ;
# transpose, using xmm3 for scratch
1767 shufps xmm3
, xmm7
, 0xEE
1768 shufps xmm6
, xmm7
, 0x44
1770 shufps xmm7
, xmm5
, 0xEE
1771 shufps xmm4
, xmm5
, 0x44
1773 shufps xmm5
, xmm6
, 0xDD
1774 shufps xmm4
, xmm6
, 0x88
1776 shufps xmm6
, xmm3
, 0x88
1777 shufps xmm7
, xmm3
, 0xDD
1778 ;
# coulomb table ready, in xmm4-xmm7
1780 mulps xmm6
, xmm1 ;
# xmm6=Geps
1781 mulps xmm7
, xmm2 ;
# xmm7=Heps2
1783 addps xmm5
, xmm7 ;
# xmm5=Fp
1784 movaps xmm3
, [rsp
+ nb430nf_qq
]
1785 mulps xmm5
, xmm1 ;
# xmm5=eps*Fp
1786 addps xmm5
, xmm4 ;
# xmm5=VV
1787 mulps xmm5
, xmm3 ;
# vcoul=qq*VV
1788 addps xmm5
, [rsp
+ nb430nf_vctot
]
1789 movaps
[rsp
+ nb430nf_vctot
], xmm5
1792 movaps xmm4
, [rsp
+ nb430nf_r
]
1793 mulps xmm4
, [rsp
+ nb430nf_tsc
]
1797 cvttps2pi mm7
, xmm5 ;
# mm6/mm7 contain lu indices
1802 movaps xmm1
, xmm4 ;
# xmm1=eps
1804 mulps xmm2
, xmm2 ;
# xmm2=eps2
1808 mov rsi
, [rbp
+ nb430nf_VFtab
]
1817 movaps xmm4
, [rsi
+ rax
*4]
1818 movaps xmm5
, [rsi
+ rbx
*4]
1819 movaps xmm6
, [rsi
+ rcx
*4]
1820 movaps xmm7
, [rsi
+ rdx
*4]
1821 ;
# transpose, using xmm3 for scratch
1823 shufps xmm3
, xmm7
, 0xEE
1824 shufps xmm6
, xmm7
, 0x44
1826 shufps xmm7
, xmm5
, 0xEE
1827 shufps xmm4
, xmm5
, 0x44
1829 shufps xmm5
, xmm6
, 0xDD
1830 shufps xmm4
, xmm6
, 0x88
1832 shufps xmm6
, xmm3
, 0x88
1833 shufps xmm7
, xmm3
, 0xDD
1834 ;
# dispersion table ready, in xmm4-xmm7
1835 mulps xmm6
, xmm1 ;
# xmm6=Geps
1836 mulps xmm7
, xmm2 ;
# xmm7=Heps2
1838 addps xmm5
, xmm7 ;
# xmm5=Fp
1839 mulps xmm5
, xmm1 ;
# xmm5=eps*Fp
1840 addps xmm5
, xmm4 ;
# xmm5=VV
1841 mulps xmm5
, [rsp
+ nb430nf_c6
] ;
# Vvdw6
1842 addps xmm5
, [rsp
+ nb430nf_Vvdwtot
]
1843 movaps
[rsp
+ nb430nf_Vvdwtot
], xmm5
1846 movaps xmm4
, [rsi
+ rax
*4 + 16]
1847 movaps xmm5
, [rsi
+ rbx
*4 + 16]
1848 movaps xmm6
, [rsi
+ rcx
*4 + 16]
1849 movaps xmm7
, [rsi
+ rdx
*4 + 16]
1850 ;
# transpose, using xmm3 for scratch
1852 shufps xmm3
, xmm7
, 0xEE
1853 shufps xmm6
, xmm7
, 0x44
1855 shufps xmm7
, xmm5
, 0xEE
1856 shufps xmm4
, xmm5
, 0x44
1858 shufps xmm5
, xmm6
, 0xDD
1859 shufps xmm4
, xmm6
, 0x88
1861 shufps xmm6
, xmm3
, 0x88
1862 shufps xmm7
, xmm3
, 0xDD
1863 ;
# table ready, in xmm4-xmm7
1864 mulps xmm6
, xmm1 ;
# xmm6=Geps
1865 mulps xmm7
, xmm2 ;
# xmm7=Heps2
1867 addps xmm5
, xmm7 ;
# xmm5=Fp
1868 mulps xmm5
, xmm1 ;
# xmm5=eps*Fp
1869 addps xmm5
, xmm4 ;
# xmm5=VV
1871 mulps xmm5
, [rsp
+ nb430nf_c12
] ;
# Vvdw12
1872 addps xmm5
, [rsp
+ nb430nf_Vvdwtot
]
1873 movaps
[rsp
+ nb430nf_Vvdwtot
], xmm5
1875 ;
# should we do one more iteration?
1876 sub dword ptr
[rsp
+ nb430nf_innerk
], 4
1877 jl
.nb430nf_finish_inner
1878 jmp
.nb430nf_unroll_loop
1879 .nb430nf_finish_inner:
1880 ;
# check if at least two particles remain
1881 add dword ptr
[rsp
+ nb430nf_innerk
], 4
1882 mov edx
, [rsp
+ nb430nf_innerk
]
1885 jmp
.nb430nf_checksingle
1888 mov rcx
, [rsp
+ nb430nf_innerjjnr
]
1892 add qword ptr
[rsp
+ nb430nf_innerjjnr
], 8
1898 mov rsi
, [rbp
+ nb430nf_invsqrta
]
1899 movss xmm2
, [rsi
+ rax
*4]
1900 movss xmm3
, [rsi
+ rbx
*4]
1901 unpcklps xmm2
, xmm3 ;
# isa2 in xmm2(0,1)
1902 mulps xmm2
, [rsp
+ nb430nf_isai
]
1903 movaps
[rsp
+ nb430nf_isaprod
], xmm2
1905 mulps xmm1
, [rsp
+ nb430nf_gbtsc
]
1906 movaps
[rsp
+ nb430nf_gbscale
], xmm1
1908 mov rsi
, [rbp
+ nb430nf_charge
] ;
# base of charge[]
1909 movss xmm3
, [rsi
+ rax
*4]
1910 movss xmm6
, [rsi
+ rbx
*4]
1911 unpcklps xmm3
, xmm6 ;
# 00001000 ;# xmm3(0,1) has the charges
1913 mulps xmm2
, [rsp
+ nb430nf_iq
]
1915 movaps
[rsp
+ nb430nf_qq
], xmm3
1917 mov rsi
, [rbp
+ nb430nf_type
]
1920 mov ecx
, [rsi
+ rcx
*4]
1921 mov edx
, [rsi
+ rdx
*4]
1922 mov rsi
, [rbp
+ nb430nf_vdwparam
]
1925 mov edi
, [rsp
+ nb430nf_ntia
]
1928 movlps xmm6
, [rsi
+ rcx
*4]
1929 movhps xmm6
, [rsi
+ rdx
*4]
1930 mov rdi
, [rbp
+ nb430nf_pos
]
1933 shufps xmm4
, xmm4
, 8 ;
# 00001000
1934 shufps xmm6
, xmm6
, 13 ;
# 00001101
1938 movaps
[rsp
+ nb430nf_c6
], xmm4
1939 movaps
[rsp
+ nb430nf_c12
], xmm6
1941 lea rax
, [rax
+ rax
*2]
1942 lea rbx
, [rbx
+ rbx
*2]
1943 ;
# move coordinates to xmm0-xmm2
1944 movlps xmm1
, [rdi
+ rax
*4]
1945 movss xmm2
, [rdi
+ rax
*4 + 8]
1946 movhps xmm1
, [rdi
+ rbx
*4]
1947 movss xmm0
, [rdi
+ rbx
*4 + 8]
1951 shufps xmm2
, xmm0
, 0
1955 shufps xmm2
, xmm2
, 136 ;
# 10001000
1957 shufps xmm0
, xmm0
, 136 ;
# 10001000
1958 shufps xmm1
, xmm1
, 221 ;
# 11011101
1960 mov rdi
, [rbp
+ nb430nf_faction
]
1961 ;
# move ix-iz to xmm4-xmm6
1964 movaps xmm4
, [rsp
+ nb430nf_ix
]
1965 movaps xmm5
, [rsp
+ nb430nf_iy
]
1966 movaps xmm6
, [rsp
+ nb430nf_iz
]
1982 ;
# lookup seed in xmm5
1985 movaps xmm1
, [rsp
+ nb430nf_three
]
1986 mulps xmm5
, xmm4 ;
# rsq*lu*lu
1987 movaps xmm0
, [rsp
+ nb430nf_half
]
1988 subps xmm1
, xmm5 ;
# 3.0-rsq*lu*lu
1990 mulps xmm0
, xmm1 ;
# xmm0=rinv
1991 mulps xmm4
, xmm0 ;
# xmm4=r
1992 movaps
[rsp
+ nb430nf_r
], xmm4
1993 mulps xmm4
, [rsp
+ nb430nf_gbscale
]
1995 cvttps2pi mm6
, xmm4 ;
# mm6 contain lu indices
1998 movaps xmm1
, xmm4 ;
# xmm1=eps
2000 mulps xmm2
, xmm2 ;
# xmm2=eps2
2004 mov rsi
, [rbp
+ nb430nf_GBtab
]
2009 ;
# load coulomb table
2010 movaps xmm4
, [rsi
+ rcx
*4]
2011 movaps xmm7
, [rsi
+ rdx
*4]
2012 ;
# transpose, using xmm3 for scratch
2014 unpcklps xmm4
, xmm7 ;
# Y1 Y2 F1 F2
2015 unpckhps xmm6
, xmm7 ;
# G1 G2 H1 H2
2016 movhlps xmm5
, xmm4 ;
# F1 F2
2017 movhlps xmm7
, xmm6 ;
# H1 H2
2018 ;
# coulomb table ready, in xmm4-xmm7
2020 mulps xmm6
, xmm1 ;
# xmm6=Geps
2021 mulps xmm7
, xmm2 ;
# xmm7=Heps2
2023 addps xmm5
, xmm7 ;
# xmm5=Fp
2024 movaps xmm3
, [rsp
+ nb430nf_qq
]
2025 mulps xmm5
, xmm1 ;
# xmm5=eps*Fp
2026 addps xmm5
, xmm4 ;
# xmm5=VV
2027 mulps xmm5
, xmm3 ;
# vcoul=qq*VV
2028 addps xmm5
, [rsp
+ nb430nf_vctot
]
2029 movaps
[rsp
+ nb430nf_vctot
], xmm5
2031 movaps xmm4
, [rsp
+ nb430nf_r
]
2032 mulps xmm4
, [rsp
+ nb430nf_tsc
]
2037 movaps xmm1
, xmm4 ;
# xmm1=eps
2039 mulps xmm2
, xmm2 ;
# xmm2=eps2
2042 mov rsi
, [rbp
+ nb430nf_VFtab
]
2048 movaps xmm4
, [rsi
+ rcx
*4]
2049 movaps xmm7
, [rsi
+ rdx
*4]
2050 ;
# transpose, using xmm3 for scratch
2052 unpcklps xmm4
, xmm7 ;
# Y1 Y2 F1 F2
2053 unpckhps xmm6
, xmm7 ;
# G1 G2 H1 H2
2054 movhlps xmm5
, xmm4 ;
# F1 F2
2055 movhlps xmm7
, xmm6 ;
# H1 H2
2056 ;
# dispersion table ready, in xmm4-xmm7
2057 mulps xmm6
, xmm1 ;
# xmm6=Geps
2058 mulps xmm7
, xmm2 ;
# xmm7=Heps2
2060 addps xmm5
, xmm7 ;
# xmm5=Fp
2061 mulps xmm5
, xmm1 ;
# xmm5=eps*Fp
2062 addps xmm5
, xmm4 ;
# xmm5=VV
2064 mulps xmm5
, [rsp
+ nb430nf_c6
] ;
# Vvdw6
2065 addps xmm5
, [rsp
+ nb430nf_Vvdwtot
]
2066 movaps
[rsp
+ nb430nf_Vvdwtot
], xmm5
2069 movaps xmm4
, [rsi
+ rcx
*4 + 16]
2070 movaps xmm7
, [rsi
+ rdx
*4 + 16]
2071 ;
# transpose, using xmm3 for scratch
2073 unpcklps xmm4
, xmm7 ;
# Y1 Y2 F1 F2
2074 unpckhps xmm6
, xmm7 ;
# G1 G2 H1 H2
2075 movhlps xmm5
, xmm4 ;
# F1 F2
2076 movhlps xmm7
, xmm6 ;
# H1 H2
2077 ;
# table ready, in xmm4-xmm7
2078 mulps xmm6
, xmm1 ;
# xmm6=Geps
2079 mulps xmm7
, xmm2 ;
# xmm7=Heps2
2081 addps xmm5
, xmm7 ;
# xmm5=Fp
2082 mulps xmm5
, xmm1 ;
# xmm5=eps*Fp
2083 addps xmm5
, xmm4 ;
# xmm5=VV
2085 mulps xmm5
, [rsp
+ nb430nf_c12
] ;
# Vvdw12
2087 addps xmm5
, [rsp
+ nb430nf_Vvdwtot
]
2088 movaps
[rsp
+ nb430nf_Vvdwtot
], xmm5
2089 .nb430nf_checksingle:
2090 mov edx
, [rsp
+ nb430nf_innerk
]
2092 jnz
.nb430nf_dosingle
2093 jmp
.nb430nf_updateouterdata
2095 mov rsi
, [rbp
+ nb430nf_charge
]
2096 mov rdx
, [rbp
+ nb430nf_invsqrta
]
2097 mov rdi
, [rbp
+ nb430nf_pos
]
2098 mov rcx
, [rsp
+ nb430nf_innerjjnr
]
2102 movss xmm2
, [rdx
+ rax
*4] ;
# isa2
2103 mulss xmm2
, [rsp
+ nb430nf_isai
]
2104 movss
[rsp
+ nb430nf_isaprod
], xmm2
2106 mulss xmm1
, [rsp
+ nb430nf_gbtsc
]
2107 movss
[rsp
+ nb430nf_gbscale
], xmm1
2109 mulss xmm2
, [rsp
+ nb430nf_iq
]
2110 movss xmm6
, [rsi
+ rax
*4] ;
# xmm6(0) has the charge
2112 movss
[rsp
+ nb430nf_qq
], xmm6
2114 mov rsi
, [rbp
+ nb430nf_type
]
2116 mov ecx
, [rsi
+ rcx
*4]
2117 mov rsi
, [rbp
+ nb430nf_vdwparam
]
2119 add ecx
, [rsp
+ nb430nf_ntia
]
2120 movlps xmm6
, [rsi
+ rcx
*4]
2122 shufps xmm4
, xmm4
, 252 ;
# 11111100
2123 shufps xmm6
, xmm6
, 253 ;
# 11111101
2125 movss
[rsp
+ nb430nf_c6
], xmm4
2126 movss
[rsp
+ nb430nf_c12
], xmm6
2128 lea rax
, [rax
+ rax
*2]
2130 ;
# move coordinates to xmm0-xmm2
2131 movss xmm0
, [rdi
+ rax
*4]
2132 movss xmm1
, [rdi
+ rax
*4 + 4]
2133 movss xmm2
, [rdi
+ rax
*4 + 8]
2135 movss xmm4
, [rsp
+ nb430nf_ix
]
2136 movss xmm5
, [rsp
+ nb430nf_iy
]
2137 movss xmm6
, [rsp
+ nb430nf_iz
]
2153 ;
# lookup seed in xmm5
2156 movss xmm1
, [rsp
+ nb430nf_three
]
2157 mulss xmm5
, xmm4 ;
# rsq*lu*lu
2158 movss xmm0
, [rsp
+ nb430nf_half
]
2159 subss xmm1
, xmm5 ;
# 3.0-rsq*lu*lu
2161 mulss xmm0
, xmm1 ;
# xmm0=rinv
2163 mulss xmm4
, xmm0 ;
# xmm4=r
2164 movaps
[rsp
+ nb430nf_r
], xmm4
2165 mulss xmm4
, [rsp
+ nb430nf_gbscale
]
2167 cvttss2si ebx
, xmm4 ;
# ebx contains the lu index
2170 movaps xmm1
, xmm4 ;
# xmm1=eps
2172 mulss xmm2
, xmm2 ;
# xmm2=eps2
2176 mov rsi
, [rbp
+ nb430nf_GBtab
]
2178 movaps xmm4
, [rsi
+ rbx
*4]
2182 shufps xmm5
, xmm5
, 1
2183 shufps xmm7
, xmm7
, 1
2184 ;
# table ready in xmm4-xmm7
2186 mulss xmm6
, xmm1 ;
# xmm6=Geps
2187 mulss xmm7
, xmm2 ;
# xmm7=Heps2
2189 addss xmm5
, xmm7 ;
# xmm5=Fp
2190 movss xmm3
, [rsp
+ nb430nf_qq
]
2191 mulss xmm5
, xmm1 ;
# xmm5=eps*Fp
2192 addss xmm5
, xmm4 ;
# xmm5=VV
2193 mulss xmm5
, xmm3 ;
# vcoul=qq*VV
2194 addss xmm5
, [rsp
+ nb430nf_vctot
]
2195 movss
[rsp
+ nb430nf_vctot
], xmm5
2197 movss xmm4
, [rsp
+ nb430nf_r
]
2198 mulps xmm4
, [rsp
+ nb430nf_tsc
]
2203 movss xmm1
, xmm4 ;
# xmm1=eps
2205 mulss xmm2
, xmm2 ;
# xmm2=eps2
2208 mov rsi
, [rbp
+ nb430nf_VFtab
]
2211 movaps xmm4
, [rsi
+ rbx
*4]
2215 shufps xmm5
, xmm5
, 1
2216 shufps xmm7
, xmm7
, 1
2217 ;
# table ready in xmm4-xmm7
2219 mulss xmm6
, xmm1 ;
# xmm6=Geps
2220 mulss xmm7
, xmm2 ;
# xmm7=Heps2
2222 addss xmm5
, xmm7 ;
# xmm5=Fp
2223 mulss xmm5
, xmm1 ;
# xmm5=eps*Fp
2224 addss xmm5
, xmm4 ;
# xmm5=VV
2225 mulss xmm5
, [rsp
+ nb430nf_c6
] ;
# Vvdw6
2226 addss xmm5
, [rsp
+ nb430nf_Vvdwtot
]
2227 movss
[rsp
+ nb430nf_Vvdwtot
], xmm5
2230 movaps xmm4
, [rsi
+ rbx
*4 + 16]
2234 shufps xmm5
, xmm5
, 1
2235 shufps xmm7
, xmm7
, 1
2236 ;
# table ready in xmm4-xmm7
2238 mulss xmm6
, xmm1 ;
# xmm6=Geps
2239 mulss xmm7
, xmm2 ;
# xmm7=Heps2
2241 addss xmm5
, xmm7 ;
# xmm5=Fp
2242 mulss xmm5
, xmm1 ;
# xmm5=eps*Fp
2243 addss xmm5
, xmm4 ;
# xmm5=VV
2245 mulss xmm5
, [rsp
+ nb430nf_c12
] ;
# Vvdw12
2247 addss xmm5
, [rsp
+ nb430nf_Vvdwtot
]
2248 movss
[rsp
+ nb430nf_Vvdwtot
], xmm5
2250 .nb430nf_updateouterdata:
2252 mov esi
, [rsp
+ nb430nf_n
]
2253 ;
# get group index for i particle
2254 mov rdx
, [rbp
+ nb430nf_gid
] ;
# base of gid[]
2255 mov edx
, [rdx
+ rsi
*4] ;
# ggid=gid[n]
2257 ;
# accumulate total potential energy and update it
2258 movaps xmm7
, [rsp
+ nb430nf_vctot
]
2261 addps xmm7
, xmm6 ;
# pos 0-1 in xmm7 have the sum now
2263 shufps xmm6
, xmm6
, 1
2266 ;
# add earlier value from mem
2267 mov rax
, [rbp
+ nb430nf_Vc
]
2268 addss xmm7
, [rax
+ rdx
*4]
2270 movss
[rax
+ rdx
*4], xmm7
2272 ;
# accumulate total lj energy and update it
2273 movaps xmm7
, [rsp
+ nb430nf_Vvdwtot
]
2276 addps xmm7
, xmm6 ;
# pos 0-1 in xmm7 have the sum now
2278 shufps xmm6
, xmm6
, 1
2281 ;
# add earlier value from mem
2282 mov rax
, [rbp
+ nb430nf_Vvdw
]
2283 addss xmm7
, [rax
+ rdx
*4]
2285 movss
[rax
+ rdx
*4], xmm7
2288 mov ecx
, [rsp
+ nb430nf_nn1
]
2289 ;
# esi already loaded with n
2292 jz
.nb430nf_outerend
2294 ;
# not last, iterate outer loop once more!
2295 mov
[rsp
+ nb430nf_n
], esi
2298 ;
# check if more outer neighborlists remain
2299 mov ecx
, [rsp
+ nb430nf_nri
]
2300 ;
# esi already loaded with n above
2303 ;
# non-zero, do one more workunit
2304 jmp
.nb430nf_threadloop
2307 mov eax
, [rsp
+ nb430nf_nouter
]
2308 mov ebx
, [rsp
+ nb430nf_ninner
]
2309 mov rcx
, [rbp
+ nb430nf_outeriter
]
2310 mov rdx
, [rbp
+ nb430nf_inneriter
]