3 ## Gromacs 4.0 Copyright (c) 1991-2003
4 ## David van der Spoel, Erik Lindahl
6 ## This program is free software; you can redistribute it and/or
7 ## modify it under the terms of the GNU General Public License
8 ## as published by the Free Software Foundation; either version 2
9 ## of the License, or (at your option) any later version.
11 ## To help us fund GROMACS development, we humbly ask that you cite
12 ## the research papers on the package. Check out http://www.gromacs.org
15 ## Gnomes, ROck Monsters And Chili Sauce
21 ## nb010 - forces are calculated
22 .globl nb_kernel010_x86_64_sse
23 .globl _nb_kernel010_x86_64_sse
24 nb_kernel010_x86_64_sse
:
25 _nb_kernel010_x86_64_sse
:
26 ## Room for return address and rbp (16 bytes)
30 .set nb010_faction, 40
32 .set nb010_p_facel, 56
37 .set nb010_p_ntype, 96
38 .set nb010_vdwparam, 104
40 .set nb010_p_tabscale, 120
42 .set nb010_invsqrta, 136
44 .set nb010_p_gbtabscale, 152
46 .set nb010_p_nthreads, 168
49 .set nb010_outeriter, 192
50 .set nb010_inneriter, 200
52 ## The mutex (last arg) is not used in assembly.
53 ## stack offsets for local variables
54 ## bottom of stack is cache-aligned for sse use
65 .set nb010_twelve, 160
66 .set nb010_Vvdwtot, 176
74 .set nb010_jindex, 288
77 .set nb010_shiftvec, 312
79 .set nb010_innerjjnr, 328
83 .set nb010_innerk, 348
87 .set nb010_nouter, 364
88 .set nb010_ninner, 368
101 subq $
392,%rsp
# # local variable stack space (n*16+8)
102 ## zero 32-bit iteration counters
104 movl
%eax
,nb010_nouter
(%rsp
)
105 movl
%eax
,nb010_ninner
(%rsp
)
108 movl
%edi
,nb010_nri
(%rsp
)
109 movq
%rsi
,nb010_iinr
(%rsp
)
110 movq
%rdx
,nb010_jindex
(%rsp
)
111 movq
%rcx
,nb010_jjnr
(%rsp
)
112 movq
%r8,nb010_shift
(%rsp
)
113 movq
%r9,nb010_shiftvec
(%rsp
)
114 movq nb010_p_ntype
(%rbp
),%rdi
116 movl
%edi
,nb010_ntype
(%rsp
)
118 ## create constant floating-point factors on stack
119 movl $
0x40000000,%eax
## 2.0 in IEEE (hex)
120 movl
%eax
,nb010_two
(%rsp
)
121 movss nb010_two
(%rsp
),%xmm1
122 shufps $
0,%xmm1
,%xmm1
## splat to all elements
124 addps
%xmm1
,%xmm2
## 4.0
125 addps
%xmm1
,%xmm2
## 6.0
127 addps
%xmm3
,%xmm3
## 12.0
128 movaps
%xmm1
,nb010_two
(%rsp
)
129 movaps
%xmm2
,nb010_six
(%rsp
)
130 movaps
%xmm3
,nb010_twelve
(%rsp
)
## Work-unit claim loop. rsi is loaded with the address of the shared
## counter (nb010_count); the spinlock below tries to replace the counter
## value nn0 (in eax) by nn1 = nn0+1 with cmpxchg and retries until it succeeds.
## NOTE(review): the initial load of *count into eax, any lock prefix, and the
## instructions that set edx and the flags for cmovle are not visible in this
## excerpt - confirm against the complete file.
133 _nb_kernel010_x86_64_sse.nb010_threadloop
:
134 movq nb010_count
(%rbp
),%rsi
## pointer to sync counter
136 _nb_kernel010_x86_64_sse.nb010_spinlock
:
137 movl
%eax
,%ebx
## ebx=*count=nn0
138 addl $
1,%ebx
## ebx=nn1=nn0+1
140 cmpxchgl
%ebx
,(%rsi
) ## write nn1 to *counter,
141 ## if it hasnt changed.
142 ## or reread *counter to eax.
143 pause
## -> better p4 performance
## pause leaves the flags alone, so jnz still tests ZF from cmpxchg:
## ZF clear means the counter differed from eax, so try again.
144 jnz _nb_kernel010_x86_64_sse.nb010_spinlock
146 ## if(nn1>nri) nn1=nri
147 movl nb010_nri
(%rsp
),%ecx
## NOTE(review): edx presumably holds a copy of nri at this point - verify.
150 cmovlel
%edx
,%ebx
## if(nn1>nri) nn1=nri
151 ## Cleared the spinlock if we got here.
152 ## eax contains nn0, ebx contains nn1.
153 movl
%eax
,nb010_n
(%rsp
)
154 movl
%ebx
,nb010_nn1
(%rsp
)
155 subl
%eax
,%ebx
## calc number of outer lists
156 movl
%eax
,%esi
## copy n to esi
## movl does not change flags: jg tests the result of the subl above and is
## taken only when nn1-nn0 > 0; otherwise control goes to nb010_end.
157 jg _nb_kernel010_x86_64_sse.nb010_outerstart
158 jmp _nb_kernel010_x86_64_sse.nb010_end
160 _nb_kernel010_x86_64_sse.nb010_outerstart
:
161 ## ebx contains number of outer iterations
## nouter += ebx (running total kept in the nb010_nouter stack slot)
162 addl nb010_nouter
(%rsp
),%ebx
163 movl
%ebx
,nb010_nouter
(%rsp
)
## Outer-loop body setup for list entry n (index in esi).
## Visible steps: look up the shift index and the i atom, form the i
## coordinates as shiftvec[is3] + pos[ii3], broadcast them to all four lanes
## and store them in nb010_ix/iy/iz, then read jindex[n] and jindex[n+1] to
## size the inner loop.
## NOTE(review): several original lines of this section are not visible in
## this excerpt (e.g. the register clears announced by the "clear vvdwtot"
## comment and the arithmetic that feeds the final jge) - confirm against the
## complete file before relying on the comments added here.
165 _nb_kernel010_x86_64_sse.nb010_outer
:
166 movq nb010_shift
(%rsp
),%rax
## rax = base of shift[]
167 movl
(%rax
,%rsi
,4),%ebx
## ebx=shift[n]
169 lea
(%rbx
,%rbx
,2),%rbx
## rbx=3*is
170 movl
%ebx
,nb010_is3
(%rsp
) ## store is3
172 movq nb010_shiftvec
(%rsp
),%rax
## rax = base of shiftvec[]
## xmm10/xmm11/xmm12 = the three floats at shiftvec[is3 .. is3+2]
174 movss
(%rax
,%rbx
,4),%xmm10
175 movss
4(%rax
,%rbx
,4),%xmm11
176 movss
8(%rax
,%rbx
,4),%xmm12
178 movq nb010_iinr
(%rsp
),%rcx
## rcx = base of iinr[]
179 movl
(%rcx
,%rsi
,4),%ebx
## ebx =ii
181 movq nb010_type
(%rbp
),%rdx
## edx = type[ii], then multiplied by ntype
182 movl
(%rdx
,%rbx
,4),%edx
183 imull nb010_ntype
(%rsp
),%edx
185 movl
%edx
,nb010_ntia
(%rsp
)
187 lea
(%rbx
,%rbx
,2),%rbx
## rbx = 3*ii=ii3
188 movq nb010_pos
(%rbp
),%rax
## rax = base of pos[]
## add the i atom position to the shift vector (scalar, lane 0 only)
190 addss
(%rax
,%rbx
,4),%xmm10
191 addss
4(%rax
,%rbx
,4),%xmm11
192 addss
8(%rax
,%rbx
,4),%xmm12
## shufps $0 copies lane 0 into all four lanes
194 shufps $
0,%xmm10
,%xmm10
195 shufps $
0,%xmm11
,%xmm11
196 shufps $
0,%xmm12
,%xmm12
198 movaps
%xmm10
,nb010_ix
(%rsp
)
199 movaps
%xmm11
,nb010_iy
(%rsp
)
200 movaps
%xmm12
,nb010_iz
(%rsp
)
202 movl
%ebx
,nb010_ii3
(%rsp
)
204 ## clear vvdwtot (xmm12) and i forces (xmm13-xmm15)
210 movq nb010_jindex
(%rsp
),%rax
211 movl
(%rax
,%rsi
,4),%ecx
## jindex[n]
212 movl
4(%rax
,%rsi
,4),%edx
## jindex[n+1]
213 subl
%ecx
,%edx
## number of innerloop atoms
215 movq nb010_jjnr
(%rsp
),%rax
218 movq
%rax
,nb010_innerjjnr
(%rsp
) ## pointer to jjnr[nj0]
## ecx is added to the running nb010_ninner total and written back
221 addl nb010_ninner
(%rsp
),%ecx
222 movl
%ecx
,nb010_ninner
(%rsp
)
224 movl
%edx
,nb010_innerk
(%rsp
) ## number of innerloop atoms
## NOTE(review): the instruction that sets the flags tested by jge is not
## visible here (movl does not modify flags) - verify.
226 jge _nb_kernel010_x86_64_sse.nb010_unroll_loop
227 jmp _nb_kernel010_x86_64_sse.nb010_finish_inner
## Quad-unrolled inner loop: each pass handles four j atoms whose indices
## come from the list at nb010_innerjjnr, and moves that pointer on by 16
## bytes. Lane order in the packed registers is atom 1, 2, 3, 4 with index
## registers r8 (atom 1), r9 (atom 2), r10 (atom 3), r11 (atom 4).
## NOTE(review): this excerpt does not show every instruction of the loop
## (e.g. the loads of jnr1-jnr3 and most of the r^2 / force arithmetic), so
## the comments added below only describe what is visible.
228 _nb_kernel010_x86_64_sse.nb010_unroll_loop
:
229 ## quad-unrolled innerloop here
230 movq nb010_innerjjnr
(%rsp
),%rdx
## pointer to jjnr[k]
234 movl
12(%rdx
),%edx
## eax-edx=jnr1-4
236 addq $
16,nb010_innerjjnr
(%rsp
) ## advance pointer (unrolled 4)
## r8-r11 = 3*jnr1 .. 3*jnr4; rax-rdx keep the raw atom indices
238 lea
(%rax
,%rax
,2),%r8 ## replace jnr with j3
239 lea
(%rbx
,%rbx
,2),%r9
240 lea
(%rcx
,%rcx
,2),%r10
241 lea
(%rdx
,%rdx
,2),%r11
243 movq nb010_pos
(%rbp
),%rdi
245 movlps
(%rdi
,%r8,4),%xmm1
## x1 y1 - -
246 movlps
(%rdi
,%r10,4),%xmm2
## x3 y3 - -
247 movhps
(%rdi
,%r9,4),%xmm1
## x1 y1 x2 y2
248 movhps
(%rdi
,%r11,4),%xmm2
## x3 y3 x4 y4
250 movss
8(%rdi
,%r8,4),%xmm5
## z1 - - -
251 movss
8(%rdi
,%r10,4),%xmm6
## z3 - - -
252 movss
8(%rdi
,%r9,4),%xmm7
## z2 - - -
253 movss
8(%rdi
,%r11,4),%xmm8
## z4 - - -
254 movlhps
%xmm7
,%xmm5
## z1 - z2 -
255 movlhps
%xmm8
,%xmm6
## z3 - z4 -
257 movq nb010_type
(%rbp
),%rsi
260 unpcklps
%xmm2
,%xmm1
## jxa jxc jya jyc
261 unpckhps
%xmm2
,%xmm4
## jxb jxd jyb jyd
263 unpcklps
%xmm4
,%xmm1
## x
264 unpckhps
%xmm4
,%xmm2
## y
265 shufps $
136,%xmm6
,%xmm5
## 10001000 => z1 z2 z3 z4
## r12d-r15d = type[jnr1..4] (rsi holds the nb010_type pointer loaded above)
268 movl
(%rsi
,%rax
,4),%r12d
269 movl
(%rsi
,%rbx
,4),%r13d
270 movl
(%rsi
,%rcx
,4),%r14d
271 movl
(%rsi
,%rdx
,4),%r15d
## xmm1/xmm2/xmm5 -= nb010_ix/iy/iz
274 subps nb010_ix
(%rsp
),%xmm1
275 subps nb010_iy
(%rsp
),%xmm2
276 subps nb010_iz
(%rsp
),%xmm5
278 ## store dr in xmm9-xmm11
298 movl nb010_ntia
(%rsp
),%edi
304 movq nb010_vdwparam
(%rbp
),%rsi
309 ## 1/x lookup seed in xmm5
310 movaps nb010_two
(%rsp
),%xmm6
## two floats per atom are fetched from vdwparam: atoms 1 and 3 into the low
## halves here, atoms 2 and 4 into the high halves a few lines further down
313 movlps
(%rsi
,%r12,4),%xmm7
314 movlps
(%rsi
,%r14,4),%xmm8
317 mulps
%xmm5
,%xmm6
## xmm6=rinvsq
319 movaps
%xmm6
,%xmm4
## rinvsq
321 movhps
(%rsi
,%r13,4),%xmm7
322 movhps
(%rsi
,%r15,4),%xmm8
325 mulps
%xmm6
,%xmm1
## rinv4
326 mulps
%xmm6
,%xmm1
## rinv6
328 mulps
%xmm2
,%xmm2
## xmm2=rinv12
332 shufps $
136,%xmm8
,%xmm5
## 10001000
333 shufps $
221,%xmm8
,%xmm7
## 11011101
## rsi is re-pointed from vdwparam to the force array for the updates below
335 movq nb010_faction
(%rbp
),%rsi
337 mulps
%xmm5
,%xmm1
## c6*rinv6
338 mulps
%xmm7
,%xmm2
## c12*rinv12
340 subps
%xmm1
,%xmm5
## Vvdw=Vvdw12-Vvdw6
341 mulps nb010_six
(%rsp
),%xmm1
342 mulps nb010_twelve
(%rsp
),%xmm2
344 mulps
%xmm2
,%xmm4
## xmm4=total fscal
346 ## the fj's - start by accumulating x & y forces from memory
347 movlps
(%rsi
,%r8,4),%xmm0
## x1 y1 - -
348 movlps
(%rsi
,%r10,4),%xmm1
## x3 y3 - -
349 movhps
(%rsi
,%r9,4),%xmm0
## x1 y1 x2 y2
350 movhps
(%rsi
,%r11,4),%xmm1
## x3 y3 x4 y4
352 ## add potential to Vvdwtot (sum in xmm12)
355 ## calculate scalar force by multiplying dx/dy/dz with fscal
360 ## xmm0-xmm2 contains tx-tz (partial force)
361 ## accumulate i forces
366 ## permute local forces
368 unpcklps
%xmm10
,%xmm9
## x1 y1 x2 y2
369 unpckhps
%xmm10
,%xmm8
## x3 y3 x4 y4
371 ## xmm11: fjz1 fjz2 fjz3 fjz4
372 pshufd $
1,%xmm11
,%xmm5
## fjz2 - - -
373 movhlps
%xmm11
,%xmm4
## fjz3 - - -
374 pshufd $
3,%xmm11
,%xmm3
## fjz4 - - -
376 ## update fjx and fjy
380 movlps
%xmm0
,(%rsi
,%r8,4)
381 movlps
%xmm1
,(%rsi
,%r10,4)
382 movhps
%xmm0
,(%rsi
,%r9,4)
383 movhps
%xmm1
,(%rsi
,%r11,4)
## z components are updated one atom at a time: scalar add from memory,
## then scalar store back (xmm11/xmm5/xmm4/xmm3 pair with r8/r9/r10/r11)
385 addss
8(%rsi
,%r8,4),%xmm11
386 addss
8(%rsi
,%r9,4),%xmm5
387 addss
8(%rsi
,%r10,4),%xmm4
388 addss
8(%rsi
,%r11,4),%xmm3
389 movss
%xmm11
,8(%rsi
,%r8,4)
390 movss
%xmm5
,8(%rsi
,%r9,4)
391 movss
%xmm4
,8(%rsi
,%r10,4)
392 movss
%xmm3
,8(%rsi
,%r11,4)
394 ## should we do one more iteration?
## innerk -= 4; jl leaves the loop once the counter has gone negative
395 subl $
4,nb010_innerk
(%rsp
)
396 jl _nb_kernel010_x86_64_sse.nb010_finish_inner
397 jmp _nb_kernel010_x86_64_sse.nb010_unroll_loop
398 _nb_kernel010_x86_64_sse.nb010_finish_inner
:
399 ## check if at least two particles remain
400 addl $
4,nb010_innerk
(%rsp
)
401 movl nb010_innerk
(%rsp
),%edx
403 jnz _nb_kernel010_x86_64_sse.nb010_dopair
404 jmp _nb_kernel010_x86_64_sse.nb010_checksingle
405 _nb_kernel010_x86_64_sse.nb010_dopair
:
406 ## twice-unrolled innerloop here
407 movq nb010_innerjjnr
(%rsp
),%rdx
## pointer to jjnr[k]
411 addq $
8,nb010_innerjjnr
(%rsp
) ## advance pointer (unrolled 2)
413 movq nb010_type
(%rbp
),%rsi
414 movl
(%rsi
,%rax
,4),%r12d
415 movl
(%rsi
,%rbx
,4),%r13d
418 movl nb010_ntia
(%rsp
),%edi
422 movq nb010_vdwparam
(%rbp
),%rsi
423 movlps
(%rsi
,%r12,4),%xmm3
424 movhps
(%rsi
,%r13,4),%xmm3
428 shufps $
136,%xmm7
,%xmm0
## 10001000
429 shufps $
221,%xmm7
,%xmm3
## 11011101
433 lea
(%rax
,%rax
,2),%rax
## replace jnr with j3
434 lea
(%rbx
,%rbx
,2),%rbx
436 movq nb010_pos
(%rbp
),%rdi
438 movlps
(%rdi
,%rax
,4),%xmm1
## x1 y1 - -
439 movlps
(%rdi
,%rbx
,4),%xmm4
## x2 y2 - -
441 movss
8(%rdi
,%rax
,4),%xmm5
## z1 - - -
442 movss
8(%rdi
,%rbx
,4),%xmm7
## z2 - - -
444 unpcklps
%xmm4
,%xmm1
## x1 x2 y1 y2
445 movhlps
%xmm1
,%xmm2
## y1 y2 - -
446 unpcklps
%xmm7
,%xmm5
## z1 z2 - -
449 subps nb010_ix
(%rsp
),%xmm1
450 subps nb010_iy
(%rsp
),%xmm2
451 subps nb010_iz
(%rsp
),%xmm5
453 ## store dr in xmm9-xmm11
467 ## 1/x lookup seed in xmm5
468 movaps nb010_two
(%rsp
),%xmm6
471 mulps
%xmm5
,%xmm6
## xmm6=rinvsq
473 movaps
%xmm6
,%xmm4
## rinvsq
476 mulps
%xmm6
,%xmm1
## rinv4
477 mulps
%xmm6
,%xmm1
## rinv6
479 mulps
%xmm2
,%xmm2
## xmm2=rinv12
484 subps
%xmm1
,%xmm5
## Vvdw=Vvdw12-Vvdw6
485 mulps nb010_six
(%rsp
),%xmm1
486 mulps nb010_twelve
(%rsp
),%xmm2
488 mulps
%xmm2
,%xmm4
## xmm4=total fscal
493 ## add potential to Vvdwtot (sum in xmm12)
496 ## calculate scalar force by multiplying dx/dy/dz with fscal
505 ## xmm0-xmm2 contains tx-tz (partial force)
506 ## accumulate i forces
511 movq nb010_faction
(%rbp
),%rsi
512 ## the fj's - start by accumulating x & y forces from memory
513 movlps
(%rsi
,%rax
,4),%xmm0
## x1 y1 - -
514 movhps
(%rsi
,%rbx
,4),%xmm0
## x1 y1 x2 y2
516 unpcklps
%xmm10
,%xmm9
## x1 y1 x2 y2
519 movlps
%xmm0
,(%rsi
,%rax
,4)
520 movhps
%xmm0
,(%rsi
,%rbx
,4)
523 pshufd $
1,%xmm11
,%xmm8
524 addss
8(%rsi
,%rax
,4),%xmm11
525 addss
8(%rsi
,%rbx
,4),%xmm8
526 movss
%xmm11
,8(%rsi
,%rax
,4)
527 movss
%xmm8
,8(%rsi
,%rbx
,4)
529 _nb_kernel010_x86_64_sse.nb010_checksingle
:
530 movl nb010_innerk
(%rsp
),%edx
532 jnz _nb_kernel010_x86_64_sse.nb010_dosingle
533 jmp _nb_kernel010_x86_64_sse.nb010_updateouterdata
535 _nb_kernel010_x86_64_sse.nb010_dosingle
:
536 movq nb010_innerjjnr
(%rsp
),%rcx
539 movq nb010_type
(%rbp
),%rsi
540 movl
(%rsi
,%rax
,4),%r12d
542 movl nb010_ntia
(%rsp
),%edi
545 movq nb010_vdwparam
(%rbp
),%rsi
546 movss
(%rsi
,%r12,4),%xmm0
547 movss
4(%rsi
,%r12,4),%xmm3
552 lea
(%rax
,%rax
,2),%rax
## replace jnr with j3
554 movq nb010_pos
(%rbp
),%rdi
556 movss
(%rdi
,%rax
,4),%xmm1
557 movss
4(%rdi
,%rax
,4),%xmm2
558 movss
8(%rdi
,%rax
,4),%xmm5
561 subss nb010_ix
(%rsp
),%xmm1
562 subss nb010_iy
(%rsp
),%xmm2
563 subss nb010_iz
(%rsp
),%xmm5
565 ## store dr in xmm9-xmm11
580 ## 1/x lookup seed in xmm5
581 movaps nb010_two
(%rsp
),%xmm6
584 mulss
%xmm5
,%xmm6
## xmm6=rinvsq
586 movaps
%xmm6
,%xmm4
## rinvsq
589 mulss
%xmm6
,%xmm1
## rinv4
590 mulss
%xmm6
,%xmm1
## rinv6
592 mulss
%xmm2
,%xmm2
## xmm2=rinv12
597 subss
%xmm1
,%xmm5
## Vvdw=Vvdw12-Vvdw6
598 mulss nb010_six
(%rsp
),%xmm1
599 mulss nb010_twelve
(%rsp
),%xmm2
601 mulss
%xmm2
,%xmm4
## xmm4=total fscal
603 ## add potential to Vvdwtot (sum in xmm12)
606 ## calculate scalar force by multiplying dx/dy/dz with fscal
611 ## xmm0-xmm2 contains tx-tz (partial force)
612 ## accumulate i forces
617 movq nb010_faction
(%rbp
),%rsi
619 addss
(%rsi
,%rax
,4),%xmm9
620 addss
4(%rsi
,%rax
,4),%xmm10
621 addss
8(%rsi
,%rax
,4),%xmm11
622 movss
%xmm9
,(%rsi
,%rax
,4)
623 movss
%xmm10
,4(%rsi
,%rax
,4)
624 movss
%xmm11
,8(%rsi
,%rax
,4)
626 _nb_kernel010_x86_64_sse.nb010_updateouterdata
:
627 movl nb010_ii3
(%rsp
),%ecx
628 movq nb010_faction
(%rbp
),%rdi
629 movq nb010_fshift
(%rbp
),%rsi
630 movl nb010_is3
(%rsp
),%edx
632 ## accumulate i forces in xmm13, xmm14, xmm15
642 shufps $
1,%xmm3
,%xmm3
643 shufps $
1,%xmm4
,%xmm4
644 shufps $
1,%xmm5
,%xmm5
647 addss
%xmm5
,%xmm2
## xmm0-xmm2 has single force in pos0
650 movss
(%rdi
,%rcx
,4),%xmm3
651 movss
4(%rdi
,%rcx
,4),%xmm4
652 movss
8(%rdi
,%rcx
,4),%xmm5
656 movss
%xmm3
,(%rdi
,%rcx
,4)
657 movss
%xmm4
,4(%rdi
,%rcx
,4)
658 movss
%xmm5
,8(%rdi
,%rcx
,4)
660 ## increment fshift force
661 movss
(%rsi
,%rdx
,4),%xmm3
662 movss
4(%rsi
,%rdx
,4),%xmm4
663 movss
8(%rsi
,%rdx
,4),%xmm5
667 movss
%xmm3
,(%rsi
,%rdx
,4)
668 movss
%xmm4
,4(%rsi
,%rdx
,4)
669 movss
%xmm5
,8(%rsi
,%rdx
,4)
672 movl nb010_n
(%rsp
),%esi
673 ## get group index for i particle
674 movq nb010_gid
(%rbp
),%rdx
## base of gid[]
675 movl
(%rdx
,%rsi
,4),%edx
## ggid=gid[n]
677 ## accumulate total potential energy and update it
680 addps
%xmm6
,%xmm12
## pos 0-1 in xmm12 have the sum now
682 shufps $
1,%xmm6
,%xmm6
685 ## add earlier value from mem
686 movq nb010_Vvdw
(%rbp
),%rax
687 addss
(%rax
,%rdx
,4),%xmm12
689 movss
%xmm12
,(%rax
,%rdx
,4)
692 movl nb010_nn1
(%rsp
),%ecx
693 ## esi already loaded with n
696 jz _nb_kernel010_x86_64_sse.nb010_outerend
698 ## not last, iterate outer loop once more!
699 movl
%esi
,nb010_n
(%rsp
)
700 jmp _nb_kernel010_x86_64_sse.nb010_outer
701 _nb_kernel010_x86_64_sse.nb010_outerend
:
702 ## check if more outer neighborlists remain
703 movl nb010_nri
(%rsp
),%ecx
704 ## esi already loaded with n above
706 jz _nb_kernel010_x86_64_sse.nb010_end
707 ## non-zero, do one more workunit
708 jmp _nb_kernel010_x86_64_sse.nb010_threadloop
709 _nb_kernel010_x86_64_sse.nb010_end
:
713 movl nb010_nouter
(%rsp
),%eax
714 movl nb010_ninner
(%rsp
),%ebx
715 movq nb010_outeriter
(%rbp
),%rcx
716 movq nb010_inneriter
(%rbp
),%rdx
738 .globl nb_kernel010nf_x86_64_sse
739 .globl _nb_kernel010nf_x86_64_sse
740 nb_kernel010nf_x86_64_sse
:
741 _nb_kernel010nf_x86_64_sse
:
742 ## Room for return address and rbp (16 bytes)
743 .set nb010nf_fshift, 16
746 .set nb010nf_faction, 40
747 .set nb010nf_charge, 48
748 .set nb010nf_p_facel, 56
749 .set nb010nf_argkrf, 64
750 .set nb010nf_argcrf, 72
752 .set nb010nf_type, 88
753 .set nb010nf_p_ntype, 96
754 .set nb010nf_vdwparam, 104
755 .set nb010nf_Vvdw, 112
756 .set nb010nf_p_tabscale, 120
757 .set nb010nf_VFtab, 128
758 .set nb010nf_invsqrta, 136
759 .set nb010nf_dvda, 144
760 .set nb010nf_p_gbtabscale, 152
761 .set nb010nf_GBtab, 160
762 .set nb010nf_p_nthreads, 168
763 .set nb010nf_count, 176
764 .set nb010nf_mtx, 184
765 .set nb010nf_outeriter, 192
766 .set nb010nf_inneriter, 200
767 .set nb010nf_work, 208
768 ## The mutex (last arg) is not used in assembly.
769 ## stack offsets for local variables
770 ## bottom of stack is cache-aligned for sse use
777 .set nb010nf_Vvdwtot, 96
778 .set nb010nf_half, 112
779 .set nb010nf_three, 128
780 .set nb010nf_nri, 144
781 .set nb010nf_iinr, 152
782 .set nb010nf_jindex, 160
783 .set nb010nf_jjnr, 168
784 .set nb010nf_shift, 176
785 .set nb010nf_shiftvec, 184
786 .set nb010nf_innerjjnr, 192
787 .set nb010nf_facel, 200
788 .set nb010nf_ntia, 208
789 .set nb010nf_innerk, 216
790 .set nb010nf_is3, 220
791 .set nb010nf_ii3, 224
793 .set nb010nf_nn1, 232
794 .set nb010nf_ntype, 236
795 .set nb010nf_nouter, 240
796 .set nb010nf_ninner, 244
802 subq $
264,%rsp
# # local variable stack space (n*16+8)
805 ## zero 32-bit iteration counters
807 movl
%eax
,nb010nf_nouter
(%rsp
)
808 movl
%eax
,nb010nf_ninner
(%rsp
)
812 movl
%edi
,nb010nf_nri
(%rsp
)
813 movq
%rsi
,nb010nf_iinr
(%rsp
)
814 movq
%rdx
,nb010nf_jindex
(%rsp
)
815 movq
%rcx
,nb010nf_jjnr
(%rsp
)
816 movq
%r8,nb010nf_shift
(%rsp
)
817 movq
%r9,nb010nf_shiftvec
(%rsp
)
818 movq nb010nf_p_ntype
(%rbp
),%rdi
820 movl
%edi
,nb010nf_ntype
(%rsp
)
822 ## create constant floating-point factors on stack
823 movl $
0x40000000,%eax
## 2.0 in IEEE (hex)
824 movl
%eax
,nb010nf_two
(%rsp
)
825 movss nb010nf_two
(%rsp
),%xmm1
826 shufps $
0,%xmm1
,%xmm1
## splat to all elements
827 movaps
%xmm1
,nb010nf_two
(%rsp
)
## Work-unit claim loop of the no-force kernel. rsi is loaded with the
## address of the shared counter (nb010nf_count); the spinlock below tries to
## replace the counter value nn0 (in eax) by nn1 = nn0+1 with cmpxchg and
## retries until it succeeds.
## NOTE(review): the initial load of *count into eax, any lock prefix, and the
## instructions that set edx and the flags for cmovle are not visible in this
## excerpt - confirm against the complete file.
829 _nb_kernel010nf_x86_64_sse.nb010nf_threadloop
:
830 movq nb010nf_count
(%rbp
),%rsi
## pointer to sync counter
832 _nb_kernel010nf_x86_64_sse.nb010nf_spinlock
:
833 movl
%eax
,%ebx
## ebx=*count=nn0
834 addl $
1,%ebx
## ebx=nn1=nn0+1
836 cmpxchgl
%ebx
,(%rsi
) ## write nn1 to *counter,
837 ## if it hasnt changed.
838 ## or reread *counter to eax.
839 pause
## -> better p4 performance
## pause leaves the flags alone, so jnz still tests ZF from cmpxchg:
## ZF clear means the counter differed from eax, so try again.
840 jnz _nb_kernel010nf_x86_64_sse.nb010nf_spinlock
842 ## if(nn1>nri) nn1=nri
843 movl nb010nf_nri
(%rsp
),%ecx
## NOTE(review): edx presumably holds a copy of nri at this point - verify.
846 cmovlel
%edx
,%ebx
## if(nn1>nri) nn1=nri
847 ## Cleared the spinlock if we got here.
848 ## eax contains nn0, ebx contains nn1.
849 movl
%eax
,nb010nf_n
(%rsp
)
850 movl
%ebx
,nb010nf_nn1
(%rsp
)
851 subl
%eax
,%ebx
## calc number of outer lists
852 movl
%eax
,%esi
## copy n to esi
## movl does not change flags: jg tests the result of the subl above and is
## taken only when nn1-nn0 > 0; otherwise control goes to nb010nf_end.
853 jg _nb_kernel010nf_x86_64_sse.nb010nf_outerstart
854 jmp _nb_kernel010nf_x86_64_sse.nb010nf_end
856 _nb_kernel010nf_x86_64_sse.nb010nf_outerstart
:
857 ## ebx contains number of outer iterations
## nouter += ebx (running total kept in the nb010nf_nouter stack slot)
858 addl nb010nf_nouter
(%rsp
),%ebx
859 movl
%ebx
,nb010nf_nouter
(%rsp
)
861 _nb_kernel010nf_x86_64_sse.nb010nf_outer
:
862 movq nb010nf_shift
(%rsp
),%rax
## rax = base of shift[]
863 movl
(%rax
,%rsi
,4),%ebx
## ebx=shift[n]
865 lea
(%rbx
,%rbx
,2),%rbx
## rbx=3*is
866 movl
%ebx
,nb010nf_is3
(%rsp
) ## store is3
868 movq nb010nf_shiftvec
(%rsp
),%rax
## rax = base of shiftvec[]
870 movss
(%rax
,%rbx
,4),%xmm0
871 movss
4(%rax
,%rbx
,4),%xmm1
872 movss
8(%rax
,%rbx
,4),%xmm2
874 movq nb010nf_iinr
(%rsp
),%rcx
## rcx = base of iinr[]
875 movl
(%rcx
,%rsi
,4),%ebx
## ebx =ii
877 movq nb010nf_type
(%rbp
),%rdx
878 movl
(%rdx
,%rbx
,4),%edx
879 imull nb010nf_ntype
(%rsp
),%edx
881 movl
%edx
,nb010nf_ntia
(%rsp
)
883 lea
(%rbx
,%rbx
,2),%rbx
## rbx = 3*ii=ii3
884 movq nb010nf_pos
(%rbp
),%rax
## rax = base of pos[]
886 addss
(%rax
,%rbx
,4),%xmm0
887 addss
4(%rax
,%rbx
,4),%xmm1
888 addss
8(%rax
,%rbx
,4),%xmm2
890 shufps $
0,%xmm0
,%xmm0
891 shufps $
0,%xmm1
,%xmm1
892 shufps $
0,%xmm2
,%xmm2
894 movaps
%xmm0
,nb010nf_ix
(%rsp
)
895 movaps
%xmm1
,nb010nf_iy
(%rsp
)
896 movaps
%xmm2
,nb010nf_iz
(%rsp
)
898 movl
%ebx
,nb010nf_ii3
(%rsp
)
900 ## clear Vvdwtot and i forces
902 movaps
%xmm4
,nb010nf_Vvdwtot
(%rsp
)
904 movq nb010nf_jindex
(%rsp
),%rax
905 movl
(%rax
,%rsi
,4),%ecx
## jindex[n]
906 movl
4(%rax
,%rsi
,4),%edx
## jindex[n+1]
907 subl
%ecx
,%edx
## number of innerloop atoms
909 movq nb010nf_pos
(%rbp
),%rsi
910 movq nb010nf_jjnr
(%rsp
),%rax
913 movq
%rax
,nb010nf_innerjjnr
(%rsp
) ## pointer to jjnr[nj0]
916 addl nb010nf_ninner
(%rsp
),%ecx
917 movl
%ecx
,nb010nf_ninner
(%rsp
)
919 movl
%edx
,nb010nf_innerk
(%rsp
) ## number of innerloop atoms
921 jge _nb_kernel010nf_x86_64_sse.nb010nf_unroll_loop
922 jmp _nb_kernel010nf_x86_64_sse.nb010nf_finish_inner
923 _nb_kernel010nf_x86_64_sse.nb010nf_unroll_loop
:
924 ## quad-unroll innerloop here
925 movq nb010nf_innerjjnr
(%rsp
),%rdx
## pointer to jjnr[k]
929 movl
12(%rdx
),%edx
## eax-edx=jnr1-4
930 ## advance pointer (unrolled 4)
931 addq $
16,nb010nf_innerjjnr
(%rsp
)
933 movd
%eax
,%mm0
## use mmx registers as temp storage
938 movq nb010nf_type
(%rbp
),%rsi
939 movl
(%rsi
,%rax
,4),%eax
940 movl
(%rsi
,%rbx
,4),%ebx
941 movl
(%rsi
,%rcx
,4),%ecx
942 movl
(%rsi
,%rdx
,4),%edx
943 movq nb010nf_vdwparam
(%rbp
),%rsi
948 movl nb010nf_ntia
(%rsp
),%edi
954 movlps
(%rsi
,%rax
,4),%xmm6
955 movlps
(%rsi
,%rcx
,4),%xmm7
956 movhps
(%rsi
,%rbx
,4),%xmm6
957 movhps
(%rsi
,%rdx
,4),%xmm7
960 shufps $
136,%xmm7
,%xmm4
## 10001000
961 shufps $
221,%xmm7
,%xmm6
## 11011101
968 movaps
%xmm4
,nb010nf_c6
(%rsp
)
969 movaps
%xmm6
,nb010nf_c12
(%rsp
)
971 movq nb010nf_pos
(%rbp
),%rsi
## base of pos[]
973 lea
(%rax
,%rax
,2),%rax
## replace jnr with j3
974 lea
(%rbx
,%rbx
,2),%rbx
977 lea
(%rcx
,%rcx
,2),%rcx
## replace jnr with j3
978 lea
(%rdx
,%rdx
,2),%rdx
980 ## move four coordinates to xmm0-xmm2
982 movlps
(%rsi
,%rax
,4),%xmm4
983 movlps
(%rsi
,%rcx
,4),%xmm5
984 movss
8(%rsi
,%rax
,4),%xmm2
985 movss
8(%rsi
,%rcx
,4),%xmm6
987 movhps
(%rsi
,%rbx
,4),%xmm4
988 movhps
(%rsi
,%rdx
,4),%xmm5
990 movss
8(%rsi
,%rbx
,4),%xmm0
991 movss
8(%rsi
,%rdx
,4),%xmm1
993 shufps $
0,%xmm0
,%xmm2
994 shufps $
0,%xmm1
,%xmm6
999 shufps $
136,%xmm6
,%xmm2
## 10001000
1001 shufps $
136,%xmm5
,%xmm0
## 10001000
1002 shufps $
221,%xmm5
,%xmm1
## 11011101
1004 ## move ix-iz to xmm4-xmm6
1005 movaps nb010nf_ix
(%rsp
),%xmm4
1006 movaps nb010nf_iy
(%rsp
),%xmm5
1007 movaps nb010nf_iz
(%rsp
),%xmm6
1023 ## 1/x lookup seed in xmm5
1024 movaps nb010nf_two
(%rsp
),%xmm0
1027 mulps
%xmm5
,%xmm0
## xmm0=rinvsq
1032 mulps
%xmm0
,%xmm1
## xmm1=rinvsix
1034 mulps
%xmm2
,%xmm2
## xmm2=rinvtwelve
1036 mulps nb010nf_c6
(%rsp
),%xmm1
1037 mulps nb010nf_c12
(%rsp
),%xmm2
1039 subps
%xmm1
,%xmm5
## Vvdw=Vvdw12-Vvdw6
1040 addps nb010nf_Vvdwtot
(%rsp
),%xmm5
1041 movaps
%xmm5
,nb010nf_Vvdwtot
(%rsp
)
1043 ## should we do one more iteration?
1044 subl $
4,nb010nf_innerk
(%rsp
)
1045 jl _nb_kernel010nf_x86_64_sse.nb010nf_finish_inner
1046 jmp _nb_kernel010nf_x86_64_sse.nb010nf_unroll_loop
1047 _nb_kernel010nf_x86_64_sse.nb010nf_finish_inner
:
1048 ## check if at least two particles remain
1049 addl $
4,nb010nf_innerk
(%rsp
)
1050 movl nb010nf_innerk
(%rsp
),%edx
1052 jnz _nb_kernel010nf_x86_64_sse.nb010nf_dopair
1053 jmp _nb_kernel010nf_x86_64_sse.nb010nf_checksingle
1054 _nb_kernel010nf_x86_64_sse.nb010nf_dopair
:
1055 movq nb010nf_innerjjnr
(%rsp
),%rcx
1059 addq $
8,nb010nf_innerjjnr
(%rsp
)
1061 movq nb010nf_type
(%rbp
),%rsi
1064 movl
(%rsi
,%rcx
,4),%ecx
1065 movl
(%rsi
,%rdx
,4),%edx
1066 movq nb010nf_vdwparam
(%rbp
),%rsi
1069 movl nb010nf_ntia
(%rsp
),%edi
1072 movlps
(%rsi
,%rcx
,4),%xmm6
1073 movhps
(%rsi
,%rdx
,4),%xmm6
1074 movq nb010nf_pos
(%rbp
),%rdi
1077 shufps $
8,%xmm4
,%xmm4
## 00001000
1078 shufps $
13,%xmm6
,%xmm6
## 00001101
1082 movaps
%xmm4
,nb010nf_c6
(%rsp
)
1083 movaps
%xmm6
,nb010nf_c12
(%rsp
)
1085 lea
(%rax
,%rax
,2),%rax
1086 lea
(%rbx
,%rbx
,2),%rbx
1087 ## move coordinates to xmm0-xmm2
1088 movlps
(%rdi
,%rax
,4),%xmm1
1089 movss
8(%rdi
,%rax
,4),%xmm2
1090 movhps
(%rdi
,%rbx
,4),%xmm1
1091 movss
8(%rdi
,%rbx
,4),%xmm0
1095 shufps $
0,%xmm0
,%xmm2
1099 shufps $
136,%xmm2
,%xmm2
## 10001000
1101 shufps $
136,%xmm0
,%xmm0
## 10001000
1102 shufps $
221,%xmm1
,%xmm1
## 11011101
1104 ## move nb010nf_ix-iz to xmm4-xmm6
1107 movaps nb010nf_ix
(%rsp
),%xmm4
1108 movaps nb010nf_iy
(%rsp
),%xmm5
1109 movaps nb010nf_iz
(%rsp
),%xmm6
1126 ## 1/x lookup seed in xmm5
1127 movaps nb010nf_two
(%rsp
),%xmm0
1130 mulps
%xmm5
,%xmm0
## xmm0=rinvsq
1135 mulps
%xmm0
,%xmm1
## xmm1=rinvsix
1137 mulps
%xmm2
,%xmm2
## xmm2=rinvtwelve
1139 mulps nb010nf_c6
(%rsp
),%xmm1
1140 mulps nb010nf_c12
(%rsp
),%xmm2
1142 subps
%xmm1
,%xmm5
## Vvdw=Vvdw12-Vvdw6
1143 addps nb010nf_Vvdwtot
(%rsp
),%xmm5
1144 movaps
%xmm5
,nb010nf_Vvdwtot
(%rsp
)
1146 _nb_kernel010nf_x86_64_sse.nb010nf_checksingle
:
1147 movl nb010nf_innerk
(%rsp
),%edx
1149 jnz _nb_kernel010nf_x86_64_sse.nb010nf_dosingle
1150 jmp _nb_kernel010nf_x86_64_sse.nb010nf_updateouterdata
1151 _nb_kernel010nf_x86_64_sse.nb010nf_dosingle
:
1152 movq nb010nf_pos
(%rbp
),%rdi
1153 movq nb010nf_innerjjnr
(%rsp
),%rcx
1156 movq nb010nf_type
(%rbp
),%rsi
1158 movl
(%rsi
,%rcx
,4),%ecx
1159 movq nb010nf_vdwparam
(%rbp
),%rsi
1161 addl nb010nf_ntia
(%rsp
),%ecx
1163 movlps
(%rsi
,%rcx
,4),%xmm6
1165 shufps $
252,%xmm4
,%xmm4
## 11111100
1166 shufps $
253,%xmm6
,%xmm6
## 11111101
1168 movaps
%xmm4
,nb010nf_c6
(%rsp
)
1169 movaps
%xmm6
,nb010nf_c12
(%rsp
)
1171 lea
(%rax
,%rax
,2),%rax
1173 ## move coordinates to xmm0-xmm2
1174 movss
(%rdi
,%rax
,4),%xmm0
1175 movss
4(%rdi
,%rax
,4),%xmm1
1176 movss
8(%rdi
,%rax
,4),%xmm2
1180 movaps nb010nf_ix
(%rsp
),%xmm4
1181 movaps nb010nf_iy
(%rsp
),%xmm5
1182 movaps nb010nf_iz
(%rsp
),%xmm6
1198 ## 1/x lookup seed in xmm5
1199 movaps nb010nf_two
(%rsp
),%xmm0
1202 mulps
%xmm5
,%xmm0
## xmm0=rinvsq
1207 mulps
%xmm0
,%xmm1
## xmm1=rinvsix
1209 mulps
%xmm2
,%xmm2
## xmm2=rinvtwelve
1211 mulps nb010nf_c6
(%rsp
),%xmm1
1212 mulps nb010nf_c12
(%rsp
),%xmm2
1214 subps
%xmm1
,%xmm5
## Vvdw=Vvdw12-Vvdw6
1215 addss nb010nf_Vvdwtot
(%rsp
),%xmm5
1216 movss
%xmm5
,nb010nf_Vvdwtot
(%rsp
)
1218 _nb_kernel010nf_x86_64_sse.nb010nf_updateouterdata
:
1220 movl nb010nf_n
(%rsp
),%esi
1221 ## get group index for i particle
1222 movq nb010nf_gid
(%rbp
),%rdx
## base of gid[]
1223 movl
(%rdx
,%rsi
,4),%edx
## ggid=gid[n]
1225 ## accumulate total lj energy and update it
1226 movaps nb010nf_Vvdwtot
(%rsp
),%xmm7
1229 addps
%xmm6
,%xmm7
## pos 0-1 in xmm7 have the sum now
1231 shufps $
1,%xmm6
,%xmm6
1234 ## add earlier value from mem
1235 movq nb010nf_Vvdw
(%rbp
),%rax
1236 addss
(%rax
,%rdx
,4),%xmm7
1238 movss
%xmm7
,(%rax
,%rdx
,4)
1241 movl nb010nf_nn1
(%rsp
),%ecx
1242 ## esi already loaded with n
1245 jz _nb_kernel010nf_x86_64_sse.nb010nf_outerend
1247 ## not last, iterate outer loop once more!
1248 movl
%esi
,nb010nf_n
(%rsp
)
1249 jmp _nb_kernel010nf_x86_64_sse.nb010nf_outer
1250 _nb_kernel010nf_x86_64_sse.nb010nf_outerend
:
1251 ## check if more outer neighborlists remain
1252 movl nb010nf_nri
(%rsp
),%ecx
1253 ## esi already loaded with n above
1255 jz _nb_kernel010nf_x86_64_sse.nb010nf_end
1256 ## non-zero, do one more workunit
1257 jmp _nb_kernel010nf_x86_64_sse.nb010nf_threadloop
1258 _nb_kernel010nf_x86_64_sse.nb010nf_end
:
1260 movl nb010nf_nouter
(%rsp
),%eax
1261 movl nb010nf_ninner
(%rsp
),%ebx
1262 movq nb010nf_outeriter
(%rbp
),%rcx
1263 movq nb010nf_inneriter
(%rbp
),%rdx