3 ## Gromacs 4.0 Copyright (c) 1991-2003
4 ## David van der Spoel, Erik Lindahl
6 ## This program is free software; you can redistribute it and/or
7 ## modify it under the terms of the GNU General Public License
8 ## as published by the Free Software Foundation; either version 2
9 ## of the License, or (at your option) any later version.
11 ## To help us fund GROMACS development, we humbly ask that you cite
12 ## the research papers on the package. Check out http://www.gromacs.org
15 ## Gnomes, ROck Monsters And Chili Sauce
23 .globl nb_kernel310_x86_64_sse
24 .globl _nb_kernel310_x86_64_sse
25 nb_kernel310_x86_64_sse
:
26 _nb_kernel310_x86_64_sse
:
27 ## Room for return address and rbp (16 bytes)
31 .set nb310_faction, 40
33 .set nb310_p_facel, 56
38 .set nb310_p_ntype, 96
39 .set nb310_vdwparam, 104
41 .set nb310_p_tabscale, 120
43 .set nb310_invsqrta, 136
45 .set nb310_p_gbtabscale, 152
47 .set nb310_p_nthreads, 168
50 .set nb310_outeriter, 192
51 .set nb310_inneriter, 200
53 ## stack offsets for local variables
54 ## bottom of stack is cache-aligned for sse use
64 .set nb310_twelve, 144
71 .set nb310_Vvdwtot, 256
79 .set nb310_jindex, 368
82 .set nb310_shiftvec, 392
84 .set nb310_innerjjnr, 408
88 .set nb310_innerk, 428
92 .set nb310_nouter, 444
93 .set nb310_ninner, 448
108 subq $
472,%rsp
## local variable stack space (n*16+8)
110 ## zero 32-bit iteration counters
112 movl
%eax
,nb310_nouter
(%rsp
)
113 movl
%eax
,nb310_ninner
(%rsp
)
116 movl
%edi
,nb310_nri
(%rsp
)
117 movq
%rsi
,nb310_iinr
(%rsp
)
118 movq
%rdx
,nb310_jindex
(%rsp
)
119 movq
%rcx
,nb310_jjnr
(%rsp
)
120 movq
%r8,nb310_shift
(%rsp
)
121 movq
%r9,nb310_shiftvec
(%rsp
)
122 movq nb310_p_ntype
(%rbp
),%rdi
124 movl
%edi
,nb310_ntype
(%rsp
)
125 movq nb310_p_facel
(%rbp
),%rsi
127 movss
%xmm0
,nb310_facel
(%rsp
)
130 movq nb310_p_tabscale
(%rbp
),%rax
132 shufps $
0,%xmm3
,%xmm3
133 movaps
%xmm3
,nb310_tsc
(%rsp
)
136 ## create constant floating-point factors on stack
137 movl $
0x3f000000,%eax
## half in IEEE (hex)
138 movl
%eax
,nb310_half
(%rsp
)
139 movss nb310_half
(%rsp
),%xmm1
140 shufps $
0,%xmm1
,%xmm1
## splat to all elements
142 addps
%xmm2
,%xmm2
## one
144 addps
%xmm2
,%xmm2
## two
145 addps
%xmm2
,%xmm3
## three
147 addps
%xmm4
,%xmm4
## six
149 addps
%xmm5
,%xmm5
## twelve
150 movaps
%xmm1
,nb310_half
(%rsp
)
151 movaps
%xmm2
,nb310_two
(%rsp
)
152 movaps
%xmm3
,nb310_three
(%rsp
)
153 movaps
%xmm4
,nb310_six
(%rsp
)
154 movaps
%xmm5
,nb310_twelve
(%rsp
)
156 _nb_kernel310_x86_64_sse.nb310_threadloop
:
157 movq nb310_count
(%rbp
),%rsi
## pointer to sync counter
159 _nb_kernel310_x86_64_sse.nb310_spinlock
:
160 movl
%eax
,%ebx
## ebx=*count=nn0
161 addl $
1,%ebx
## ebx=nn1=nn0+1
163 cmpxchgl
%ebx
,(%rsi
) ## write nn1 to *counter,
164 ## if it hasn't changed.
165 ## or reread *counter to eax.
166 pause
## -> better p4 performance
167 jnz _nb_kernel310_x86_64_sse.nb310_spinlock
169 ## if(nn1>nri) nn1=nri
170 movl nb310_nri
(%rsp
),%ecx
173 cmovlel
%edx
,%ebx
## if(nn1>nri) nn1=nri
174 ## Cleared the spinlock if we got here.
175 ## eax contains nn0, ebx contains nn1.
176 movl
%eax
,nb310_n
(%rsp
)
177 movl
%ebx
,nb310_nn1
(%rsp
)
178 subl
%eax
,%ebx
## calc number of outer lists
179 movl
%eax
,%esi
## copy n to esi
180 jg _nb_kernel310_x86_64_sse.nb310_outerstart
181 jmp _nb_kernel310_x86_64_sse.nb310_end
183 _nb_kernel310_x86_64_sse.nb310_outerstart
:
184 ## ebx contains number of outer iterations
185 addl nb310_nouter
(%rsp
),%ebx
186 movl
%ebx
,nb310_nouter
(%rsp
)
188 _nb_kernel310_x86_64_sse.nb310_outer
:
189 movq nb310_shift
(%rsp
),%rax
## rax = pointer into shift[]
190 movl
(%rax
,%rsi
,4),%ebx
## ebx=shift[n]
192 lea
(%rbx
,%rbx
,2),%rbx
## rbx=3*is
193 movl
%ebx
,nb310_is3
(%rsp
) ## store is3
195 movq nb310_shiftvec
(%rsp
),%rax
## rax = base of shiftvec[]
197 movss
(%rax
,%rbx
,4),%xmm0
198 movss
4(%rax
,%rbx
,4),%xmm1
199 movss
8(%rax
,%rbx
,4),%xmm2
201 movq nb310_iinr
(%rsp
),%rcx
## rcx = pointer into iinr[]
202 movl
(%rcx
,%rsi
,4),%ebx
## ebx =ii
204 movq nb310_charge
(%rbp
),%rdx
205 movss
(%rdx
,%rbx
,4),%xmm3
206 mulss nb310_facel
(%rsp
),%xmm3
207 shufps $
0,%xmm3
,%xmm3
209 movq nb310_type
(%rbp
),%rdx
210 movl
(%rdx
,%rbx
,4),%edx
211 imull nb310_ntype
(%rsp
),%edx
213 movl
%edx
,nb310_ntia
(%rsp
)
215 lea
(%rbx
,%rbx
,2),%rbx
## rbx = 3*ii=ii3
216 movq nb310_pos
(%rbp
),%rax
## rax = base of pos[]
218 addss
(%rax
,%rbx
,4),%xmm0
219 addss
4(%rax
,%rbx
,4),%xmm1
220 addss
8(%rax
,%rbx
,4),%xmm2
222 movaps
%xmm3
,nb310_iq
(%rsp
)
224 shufps $
0,%xmm0
,%xmm0
225 shufps $
0,%xmm1
,%xmm1
226 shufps $
0,%xmm2
,%xmm2
228 movaps
%xmm0
,nb310_ix
(%rsp
)
229 movaps
%xmm1
,nb310_iy
(%rsp
)
230 movaps
%xmm2
,nb310_iz
(%rsp
)
232 movl
%ebx
,nb310_ii3
(%rsp
)
234 ## clear vctot and i forces
236 movaps
%xmm15
,nb310_vctot
(%rsp
)
237 movaps
%xmm15
,nb310_Vvdwtot
(%rsp
)
241 movq nb310_jindex
(%rsp
),%rax
242 movl
(%rax
,%rsi
,4),%ecx
## jindex[n]
243 movl
4(%rax
,%rsi
,4),%edx
## jindex[n+1]
244 subl
%ecx
,%edx
## number of innerloop atoms
246 movq nb310_pos
(%rbp
),%rsi
247 movq nb310_faction
(%rbp
),%rdi
248 movq nb310_jjnr
(%rsp
),%rax
251 movq
%rax
,nb310_innerjjnr
(%rsp
) ## pointer to jjnr[nj0]
254 addl nb310_ninner
(%rsp
),%ecx
255 movl
%ecx
,nb310_ninner
(%rsp
)
257 movl
%edx
,nb310_innerk
(%rsp
) ## number of innerloop atoms
258 jge _nb_kernel310_x86_64_sse.nb310_unroll_loop
259 jmp _nb_kernel310_x86_64_sse.nb310_finish_inner
260 _nb_kernel310_x86_64_sse.nb310_unroll_loop
:
261 ## quad-unrolled innerloop here
262 movq nb310_innerjjnr
(%rsp
),%rdx
## pointer to jjnr[k]
266 movl
12(%rdx
),%r11d
## eax-edx=jnr1-4
268 addq $
16,nb310_innerjjnr
(%rsp
) ## advance pointer (unrolled 4)
271 lea
(%r8,%r8,2),%rax
## replace jnr with j3
274 lea
(%r10,%r10,2),%rcx
## replace jnr with j3
275 lea
(%r11,%r11,2),%rdx
278 movq nb310_pos
(%rbp
),%rdi
280 movlps
(%rdi
,%rax
,4),%xmm1
## x1 y1 - -
281 movlps
(%rdi
,%rbx
,4),%xmm2
## x2 y2 - -
282 movlps
(%rdi
,%rcx
,4),%xmm3
## x3 y3 - -
283 movlps
(%rdi
,%rdx
,4),%xmm4
## x4 y4 - -
285 movss
8(%rdi
,%rax
,4),%xmm5
## z1 - - -
286 movss
8(%rdi
,%rbx
,4),%xmm6
## z2 - - -
287 movss
8(%rdi
,%rcx
,4),%xmm7
## z3 - - -
288 movss
8(%rdi
,%rdx
,4),%xmm8
## z4 - - -
290 unpcklps
%xmm3
,%xmm1
## x1 x3 y1 y3
291 unpcklps
%xmm4
,%xmm2
## x2 x4 y2 y4
292 unpcklps
%xmm7
,%xmm5
## z1 z3 - -
293 unpcklps
%xmm8
,%xmm6
## z2 z4 - -
294 movq nb310_charge
(%rbp
),%rsi
298 unpcklps
%xmm2
,%xmm1
## x1 x2 x3 x4
299 unpckhps
%xmm2
,%xmm3
## y1 y2 y3 y4
300 unpcklps
%xmm6
,%xmm5
## z1 z2 z3 z4
303 subps nb310_ix
(%rsp
),%xmm1
304 subps nb310_iy
(%rsp
),%xmm3
305 subps nb310_iz
(%rsp
),%xmm5
307 ## store dr in xmm9-xmm11
312 movss
(%rsi
,%r8,4),%xmm0
313 movss
(%rsi
,%r10,4),%xmm2
314 movss
(%rsi
,%r9,4),%xmm6
315 movss
(%rsi
,%r11,4),%xmm8
324 movq nb310_type
(%rbp
),%rsi
332 ## calculate rinv=1/sqrt(rsq)
336 movaps nb310_three
(%rsp
),%xmm1
337 mulps
%xmm3
,%xmm5
## rsq*lu*lu
338 subps
%xmm5
,%xmm1
## 3.0-rsq*lu*lu
340 mulps nb310_half
(%rsp
),%xmm1
343 mulps nb310_iq
(%rsp
),%xmm0
346 movl
(%rsi
,%r8,4),%r8d
347 movl
(%rsi
,%r9,4),%r9d
348 movl
(%rsi
,%r10,4),%r10d
349 movl
(%rsi
,%r11,4),%r11d
351 mulps
%xmm1
,%xmm3
## r
352 mulps nb310_tsc
(%rsp
),%xmm3
## rtab
353 movaps
%xmm0
,nb310_qq
(%rsp
)
355 ## truncate and convert to integers
356 cvttps2dq
%xmm3
,%xmm2
363 ## convert back to float
366 movl nb310_ntia
(%rsp
),%edi
375 ## move to integer registers
379 pshufd $
1,%xmm2
,%xmm2
380 pshufd $
1,%xmm7
,%xmm7
388 movq nb310_vdwparam
(%rbp
),%rsi
389 movlps
(%rsi
,%r8,4),%xmm7
390 movlps
(%rsi
,%r10,4),%xmm8
391 movhps
(%rsi
,%r9,4),%xmm7
392 movhps
(%rsi
,%r11,4),%xmm8
395 shufps $
136,%xmm8
,%xmm12
## 10001000
396 shufps $
221,%xmm8
,%xmm7
## 11011101
398 movaps
%xmm12
,nb310_c6
(%rsp
)
399 movaps
%xmm7
,nb310_c12
(%rsp
)
401 movq nb310_VFtab
(%rbp
),%rsi
403 movlps
(%rsi
,%r12,4),%xmm5
404 movlps
(%rsi
,%r14,4),%xmm7
405 movhps
(%rsi
,%r13,4),%xmm5
406 movhps
(%rsi
,%r15,4),%xmm7
409 shufps $
136,%xmm7
,%xmm4
## 10001000
410 shufps $
221,%xmm7
,%xmm5
## 11011101
412 movaps
%xmm1
,%xmm0
## rinv
413 mulps
%xmm0
,%xmm0
## rinvsq
414 movaps
%xmm0
,%xmm2
## rinvsq
415 mulps
%xmm2
,%xmm2
## rinv4
416 mulps
%xmm0
,%xmm2
## rinv6
418 mulps
%xmm12
,%xmm12
## rinv12
420 movlps
8(%rsi
,%r12,4),%xmm7
421 movlps
8(%rsi
,%r14,4),%xmm8
422 movhps
8(%rsi
,%r13,4),%xmm7
423 movhps
8(%rsi
,%r15,4),%xmm8
427 mulps nb310_c6
(%rsp
),%xmm2
## vvdw6=c6*rinv6
428 mulps nb310_c12
(%rsp
),%xmm12
## vvdw12=c12*rinv12
431 subps
%xmm2
,%xmm12
## Vvdw=Vvdw12-Vvdw6
433 ## add potential to vvdwtot
434 addps nb310_Vvdwtot
(%rsp
),%xmm12
435 movaps
%xmm12
,nb310_Vvdwtot
(%rsp
)
437 shufps $
136,%xmm8
,%xmm6
## 10001000
438 shufps $
221,%xmm8
,%xmm7
## 11011101
439 ## table data ready in xmm4-xmm7
441 mulps
%xmm3
,%xmm7
## Heps
442 mulps
%xmm3
,%xmm6
## Geps
443 mulps
%xmm3
,%xmm7
## Heps2
445 addps
%xmm6
,%xmm5
## F+Geps
446 addps
%xmm7
,%xmm5
## F+Geps+Heps2 = Fp
447 addps
%xmm7
,%xmm7
## 2*Heps2
448 addps
%xmm6
,%xmm7
## 2*Heps2+Geps
449 addps
%xmm5
,%xmm7
## FF = Fp + 2*Heps2 + Geps
450 mulps
%xmm3
,%xmm5
## eps*Fp
451 addps
%xmm4
,%xmm5
## VV
452 mulps nb310_qq
(%rsp
),%xmm5
## VV*qq=vcoul
453 mulps nb310_qq
(%rsp
),%xmm7
## FF*qq=fijC
456 mulps nb310_six
(%rsp
),%xmm2
457 mulps nb310_twelve
(%rsp
),%xmm0
459 mulps
%xmm1
,%xmm0
## (12*vnb12-6*vnb6)*rinv
461 ## add potential to vctot
462 addps nb310_vctot
(%rsp
),%xmm5
463 movaps
%xmm5
,nb310_vctot
(%rsp
)
465 mulps nb310_tsc
(%rsp
),%xmm7
468 mulps
%xmm1
,%xmm0
## fscal
470 ## calculate scalar force by multiplying dx/dy/dz with fscal
475 movq nb310_faction
(%rbp
),%rsi
476 ## the fj's - start by accumulating x & y forces from memory
477 movlps
(%rsi
,%rax
,4),%xmm0
## x1 y1 - -
478 movlps
(%rsi
,%rcx
,4),%xmm1
## x3 y3 - -
479 movhps
(%rsi
,%rbx
,4),%xmm0
## x1 y1 x2 y2
480 movhps
(%rsi
,%rdx
,4),%xmm1
## x3 y3 x4 y4
482 ## xmm0-xmm2 contains tx-tz (partial force)
483 ## accumulate i forces
489 unpcklps
%xmm10
,%xmm9
## x1 y1 x2 y2
490 unpckhps
%xmm10
,%xmm8
## x3 y3 x4 y4
492 ## update fjx and fjy
496 movlps
%xmm0
,(%rsi
,%rax
,4)
497 movlps
%xmm1
,(%rsi
,%rcx
,4)
498 movhps
%xmm0
,(%rsi
,%rbx
,4)
499 movhps
%xmm1
,(%rsi
,%rdx
,4)
501 ## xmm11: fjz1 fjz2 fjz3 fjz4
502 pshufd $
1,%xmm11
,%xmm10
## fjz2 - - -
503 movhlps
%xmm11
,%xmm9
## fjz3 - - -
504 pshufd $
3,%xmm11
,%xmm8
## fjz4 - - -
506 addss
8(%rsi
,%rax
,4),%xmm11
507 addss
8(%rsi
,%rbx
,4),%xmm10
508 addss
8(%rsi
,%rcx
,4),%xmm9
509 addss
8(%rsi
,%rdx
,4),%xmm8
510 movss
%xmm11
,8(%rsi
,%rax
,4)
511 movss
%xmm10
,8(%rsi
,%rbx
,4)
512 movss
%xmm9
,8(%rsi
,%rcx
,4)
513 movss
%xmm8
,8(%rsi
,%rdx
,4)
515 ## should we do one more iteration?
516 subl $
4,nb310_innerk
(%rsp
)
517 jl _nb_kernel310_x86_64_sse.nb310_finish_inner
518 jmp _nb_kernel310_x86_64_sse.nb310_unroll_loop
519 _nb_kernel310_x86_64_sse.nb310_finish_inner
:
520 ## check if at least two particles remain
521 addl $
4,nb310_innerk
(%rsp
)
522 movl nb310_innerk
(%rsp
),%edx
524 jnz _nb_kernel310_x86_64_sse.nb310_dopair
525 jmp _nb_kernel310_x86_64_sse.nb310_checksingle
526 _nb_kernel310_x86_64_sse.nb310_dopair
:
527 ## twice-unrolled innerloop here
528 movq nb310_innerjjnr
(%rsp
),%rdx
## pointer to jjnr[k]
532 addq $
8,nb310_innerjjnr
(%rsp
) ## advance pointer (unrolled 2)
534 movq nb310_charge
(%rbp
),%rsi
535 movss
(%rsi
,%rax
,4),%xmm0
536 movss
(%rsi
,%rbx
,4),%xmm2
538 unpcklps
%xmm2
,%xmm0
## jqa jqb
539 mulps nb310_iq
(%rsp
),%xmm0
540 movaps
%xmm0
,nb310_qq
(%rsp
)
542 movq nb310_type
(%rbp
),%rsi
544 movl
(%rsi
,%rax
,4),%r12d
545 movl
(%rsi
,%rbx
,4),%r13d
548 movl nb310_ntia
(%rsp
),%edi
552 movq nb310_vdwparam
(%rbp
),%rsi
553 movlps
(%rsi
,%r12,4),%xmm3
554 movhps
(%rsi
,%r13,4),%xmm3
558 shufps $
136,%xmm7
,%xmm0
## 10001000
559 shufps $
221,%xmm7
,%xmm3
## 11011101
561 movaps
%xmm0
,nb310_c6
(%rsp
)
562 movaps
%xmm3
,nb310_c12
(%rsp
)
564 lea
(%rax
,%rax
,2),%rax
## replace jnr with j3
565 lea
(%rbx
,%rbx
,2),%rbx
568 movq nb310_pos
(%rbp
),%rdi
570 movlps
(%rdi
,%rax
,4),%xmm4
## x1 y1 - -
571 movlps
(%rdi
,%rbx
,4),%xmm5
## x2 y2 - -
573 movss
8(%rdi
,%rax
,4),%xmm6
## z1 - - -
574 movss
8(%rdi
,%rbx
,4),%xmm7
## z2 - - -
576 unpcklps
%xmm5
,%xmm4
## x1 x2 y1 y2
577 movhlps
%xmm4
,%xmm5
## y1 y2 - -
578 unpcklps
%xmm7
,%xmm6
## z1 z2 - -
581 subps nb310_ix
(%rsp
),%xmm4
582 subps nb310_iy
(%rsp
),%xmm5
583 subps nb310_iz
(%rsp
),%xmm6
585 ## store dr in xmm9-xmm11
598 ## calculate rinv=1/sqrt(rsq)
602 movaps nb310_three
(%rsp
),%xmm1
603 mulps
%xmm4
,%xmm5
## rsq*lu*lu
604 subps
%xmm5
,%xmm1
## 3.0-rsq*lu*lu
606 mulps nb310_half
(%rsp
),%xmm1
611 mulps
%xmm1
,%xmm3
## r
612 mulps nb310_tsc
(%rsp
),%xmm3
## rtab
614 ## truncate and convert to integers
615 cvttps2dq
%xmm3
,%xmm2
617 ## convert back to float
623 ## move to integer registers
625 pshufd $
1,%xmm2
,%xmm2
631 movq nb310_VFtab
(%rbp
),%rsi
633 movlps
(%rsi
,%r12,4),%xmm4
634 movlps
(%rsi
,%r13,4),%xmm5
638 movaps
%xmm1
,%xmm0
## rinv
639 mulps
%xmm0
,%xmm0
## rinvsq
640 movaps
%xmm0
,%xmm2
## rinvsq
641 mulps
%xmm2
,%xmm2
## rinv4
642 mulps
%xmm0
,%xmm2
## rinv6
644 mulps
%xmm12
,%xmm12
## rinv12
646 movlps
8(%rsi
,%r12,4),%xmm6
647 movlps
8(%rsi
,%r13,4),%xmm7
650 ## table data ready in xmm4-xmm7
652 mulps nb310_c6
(%rsp
),%xmm2
## vvdw6=c6*rinv6
653 mulps nb310_c12
(%rsp
),%xmm12
## vvdw12=c12*rinv12
656 subps
%xmm2
,%xmm12
## Vvdw=Vvdw12-Vvdw6
658 ## add potential to vvdwtot
659 addps nb310_Vvdwtot
(%rsp
),%xmm12
660 movlps
%xmm12
,nb310_Vvdwtot
(%rsp
)
662 mulps
%xmm3
,%xmm7
## Heps
663 mulps
%xmm3
,%xmm6
## Geps
664 mulps
%xmm3
,%xmm7
## Heps2
666 addps
%xmm6
,%xmm5
## F+Geps
667 addps
%xmm7
,%xmm5
## F+Geps+Heps2 = Fp
668 addps
%xmm7
,%xmm7
## 2*Heps2
669 addps
%xmm6
,%xmm7
## 2*Heps2+Geps
670 addps
%xmm5
,%xmm7
## FF = Fp + 2*Heps2 + Geps
671 mulps
%xmm3
,%xmm5
## eps*Fp
672 addps
%xmm4
,%xmm5
## VV
673 mulps nb310_qq
(%rsp
),%xmm5
## VV*qq=vcoul
674 mulps nb310_qq
(%rsp
),%xmm7
## FF*qq=fijC
677 mulps nb310_six
(%rsp
),%xmm2
678 mulps nb310_twelve
(%rsp
),%xmm0
680 mulps
%xmm1
,%xmm0
## (12*vnb12-6*vnb6)*rinv
682 ## add potential to vctot
683 addps nb310_vctot
(%rsp
),%xmm5
684 movlps
%xmm5
,nb310_vctot
(%rsp
)
688 mulps nb310_tsc
(%rsp
),%xmm7
691 mulps
%xmm1
,%xmm0
## fscal
693 ## calculate scalar force by multiplying dx/dy/dz with fscal
702 ## accumulate i forces
707 movq nb310_faction
(%rbp
),%rsi
708 ## the fj's - start by accumulating x & y forces from memory
709 movlps
(%rsi
,%rax
,4),%xmm0
## x1 y1 - -
710 movhps
(%rsi
,%rbx
,4),%xmm0
## x1 y1 x2 y2
712 unpcklps
%xmm10
,%xmm9
## x1 y1 x2 y2
715 movlps
%xmm0
,(%rsi
,%rax
,4)
716 movhps
%xmm0
,(%rsi
,%rbx
,4)
719 pshufd $
1,%xmm11
,%xmm8
720 addss
8(%rsi
,%rax
,4),%xmm11
721 addss
8(%rsi
,%rbx
,4),%xmm8
722 movss
%xmm11
,8(%rsi
,%rax
,4)
723 movss
%xmm8
,8(%rsi
,%rbx
,4)
725 _nb_kernel310_x86_64_sse.nb310_checksingle
:
726 movl nb310_innerk
(%rsp
),%edx
728 jnz _nb_kernel310_x86_64_sse.nb310_dosingle
729 jmp _nb_kernel310_x86_64_sse.nb310_updateouterdata
731 _nb_kernel310_x86_64_sse.nb310_dosingle
:
732 movq nb310_innerjjnr
(%rsp
),%rcx
735 movq nb310_charge
(%rbp
),%rsi
736 movss
(%rsi
,%rax
,4),%xmm0
738 mulss nb310_iq
(%rsp
),%xmm0
739 movaps
%xmm0
,nb310_qq
(%rsp
)
741 movq nb310_type
(%rbp
),%rsi
743 movl
(%rsi
,%rax
,4),%r12d
745 movl nb310_ntia
(%rsp
),%edi
748 movq nb310_vdwparam
(%rbp
),%rsi
749 movss
(%rsi
,%r12,4),%xmm0
750 movss
4(%rsi
,%r12,4),%xmm3
752 movaps
%xmm0
,nb310_c6
(%rsp
)
753 movaps
%xmm3
,nb310_c12
(%rsp
)
755 lea
(%rax
,%rax
,2),%rax
## replace jnr with j3
757 movq nb310_pos
(%rbp
),%rdi
758 movss
(%rdi
,%rax
,4),%xmm4
## x1 - - -
759 movss
4(%rdi
,%rax
,4),%xmm5
## y1 - - -
760 movss
8(%rdi
,%rax
,4),%xmm6
## z1 - - -
763 subss nb310_ix
(%rsp
),%xmm4
764 subss nb310_iy
(%rsp
),%xmm5
765 subss nb310_iz
(%rsp
),%xmm6
767 ## store dr in xmm9-xmm11
780 ## calculate rinv=1/sqrt(rsq)
784 movaps nb310_three
(%rsp
),%xmm1
785 mulss
%xmm4
,%xmm5
## rsq*lu*lu
786 subss
%xmm5
,%xmm1
## 3.0-rsq*lu*lu
788 mulss nb310_half
(%rsp
),%xmm1
793 mulss
%xmm1
,%xmm3
## r
794 mulss nb310_tsc
(%rsp
),%xmm3
## rtab
796 ## truncate and convert to integers
797 cvttss2si
%xmm3
,%r12d
799 ## convert back to float
808 movq nb310_VFtab
(%rbp
),%rsi
810 movaps
%xmm1
,%xmm0
## rinv
811 mulss
%xmm0
,%xmm0
## rinvsq
812 movaps
%xmm0
,%xmm2
## rinvsq
813 mulss
%xmm2
,%xmm2
## rinv4
814 mulss
%xmm0
,%xmm2
## rinv6
816 mulss
%xmm12
,%xmm12
## rinv12
819 movss
(%rsi
,%r12,4),%xmm4
820 movss
4(%rsi
,%r12,4),%xmm5
821 movss
8(%rsi
,%r12,4),%xmm6
822 movss
12(%rsi
,%r12,4),%xmm7
823 ## table data ready in xmm4-xmm7
825 mulss nb310_c6
(%rsp
),%xmm2
## vvdw6=c6*rinv6
826 mulss nb310_c12
(%rsp
),%xmm12
## vvdw12=c12*rinv12
829 subss
%xmm2
,%xmm12
## Vvdw=Vvdw12-Vvdw6
831 ## add potential to vvdwtot
832 addss nb310_Vvdwtot
(%rsp
),%xmm12
833 movss
%xmm12
,nb310_Vvdwtot
(%rsp
)
835 mulss
%xmm3
,%xmm7
## Heps
836 mulss
%xmm3
,%xmm6
## Geps
837 mulss
%xmm3
,%xmm7
## Heps2
839 addss
%xmm6
,%xmm5
## F+Geps
840 addss
%xmm7
,%xmm5
## F+Geps+Heps2 = Fp
841 addss
%xmm7
,%xmm7
## 2*Heps2
842 addss
%xmm6
,%xmm7
## 2*Heps2+Geps
843 addss
%xmm5
,%xmm7
## FF = Fp + 2*Heps2 + Geps
844 mulss
%xmm3
,%xmm5
## eps*Fp
845 addss
%xmm4
,%xmm5
## VV
846 mulss nb310_qq
(%rsp
),%xmm5
## VV*qq=vcoul
847 mulss nb310_qq
(%rsp
),%xmm7
## FF*qq=fijC
850 mulss nb310_six
(%rsp
),%xmm2
851 mulss nb310_twelve
(%rsp
),%xmm0
853 mulss
%xmm1
,%xmm0
## (12*vnb12-6*vnb6)*rinv
855 ## add potential to vctot
856 addss nb310_vctot
(%rsp
),%xmm5
857 movss
%xmm5
,nb310_vctot
(%rsp
)
859 mulss nb310_tsc
(%rsp
),%xmm7
862 mulss
%xmm1
,%xmm0
## fscal
864 ## calculate scalar force by multiplying dx/dy/dz with fscal
869 ## accumulate i forces
874 movq nb310_faction
(%rbp
),%rsi
876 addss
(%rsi
,%rax
,4),%xmm9
877 addss
4(%rsi
,%rax
,4),%xmm10
878 addss
8(%rsi
,%rax
,4),%xmm11
879 movss
%xmm9
,(%rsi
,%rax
,4)
880 movss
%xmm10
,4(%rsi
,%rax
,4)
881 movss
%xmm11
,8(%rsi
,%rax
,4)
883 _nb_kernel310_x86_64_sse.nb310_updateouterdata
:
884 movl nb310_ii3
(%rsp
),%ecx
885 movq nb310_faction
(%rbp
),%rdi
886 movq nb310_fshift
(%rbp
),%rsi
887 movl nb310_is3
(%rsp
),%edx
889 ## accumulate i forces in xmm13, xmm14, xmm15
899 shufps $
1,%xmm3
,%xmm3
900 shufps $
1,%xmm4
,%xmm4
901 shufps $
1,%xmm5
,%xmm5
904 addss
%xmm5
,%xmm2
## xmm0-xmm2 has single force in pos0
907 movss
(%rdi
,%rcx
,4),%xmm3
908 movss
4(%rdi
,%rcx
,4),%xmm4
909 movss
8(%rdi
,%rcx
,4),%xmm5
913 movss
%xmm3
,(%rdi
,%rcx
,4)
914 movss
%xmm4
,4(%rdi
,%rcx
,4)
915 movss
%xmm5
,8(%rdi
,%rcx
,4)
917 ## increment fshift force
918 movss
(%rsi
,%rdx
,4),%xmm3
919 movss
4(%rsi
,%rdx
,4),%xmm4
920 movss
8(%rsi
,%rdx
,4),%xmm5
924 movss
%xmm3
,(%rsi
,%rdx
,4)
925 movss
%xmm4
,4(%rsi
,%rdx
,4)
926 movss
%xmm5
,8(%rsi
,%rdx
,4)
929 movl nb310_n
(%rsp
),%esi
930 ## get group index for i particle
931 movq nb310_gid
(%rbp
),%rdx
## base of gid[]
932 movl
(%rdx
,%rsi
,4),%edx
## ggid=gid[n]
934 ## accumulate total potential energy and update it
935 movaps nb310_vctot
(%rsp
),%xmm7
938 addps
%xmm6
,%xmm7
## pos 0-1 in xmm7 have the sum now
940 shufps $
1,%xmm6
,%xmm6
943 ## add earlier value from mem
944 movq nb310_Vc
(%rbp
),%rax
945 addss
(%rax
,%rdx
,4),%xmm7
947 movss
%xmm7
,(%rax
,%rdx
,4)
949 ## accumulate total lj energy and update it
950 movaps nb310_Vvdwtot
(%rsp
),%xmm7
953 addps
%xmm6
,%xmm7
## pos 0-1 in xmm7 have the sum now
955 shufps $
1,%xmm6
,%xmm6
958 ## add earlier value from mem
959 movq nb310_Vvdw
(%rbp
),%rax
960 addss
(%rax
,%rdx
,4),%xmm7
962 movss
%xmm7
,(%rax
,%rdx
,4)
965 movl nb310_nn1
(%rsp
),%ecx
966 ## esi already loaded with n
969 jz _nb_kernel310_x86_64_sse.nb310_outerend
971 ## not last, iterate outer loop once more!
972 movl
%esi
,nb310_n
(%rsp
)
973 jmp _nb_kernel310_x86_64_sse.nb310_outer
974 _nb_kernel310_x86_64_sse.nb310_outerend
:
975 ## check if more outer neighborlists remain
976 movl nb310_nri
(%rsp
),%ecx
977 ## esi already loaded with n above
979 jz _nb_kernel310_x86_64_sse.nb310_end
980 ## non-zero, do one more workunit
981 jmp _nb_kernel310_x86_64_sse.nb310_threadloop
982 _nb_kernel310_x86_64_sse.nb310_end
:
984 movl nb310_nouter
(%rsp
),%eax
985 movl nb310_ninner
(%rsp
),%ebx
986 movq nb310_outeriter
(%rbp
),%rcx
987 movq nb310_inneriter
(%rbp
),%rdx
1009 .globl nb_kernel310nf_x86_64_sse
1010 .globl _nb_kernel310nf_x86_64_sse
1011 nb_kernel310nf_x86_64_sse
:
1012 _nb_kernel310nf_x86_64_sse
:
1013 ## Room for return address and rbp (16 bytes)
1014 .set nb310nf_fshift, 16
1015 .set nb310nf_gid, 24
1016 .set nb310nf_pos, 32
1017 .set nb310nf_faction, 40
1018 .set nb310nf_charge, 48
1019 .set nb310nf_p_facel, 56
1020 .set nb310nf_argkrf, 64
1021 .set nb310nf_argcrf, 72
1023 .set nb310nf_type, 88
1024 .set nb310nf_p_ntype, 96
1025 .set nb310nf_vdwparam, 104
1026 .set nb310nf_Vvdw, 112
1027 .set nb310nf_p_tabscale, 120
1028 .set nb310nf_VFtab, 128
1029 .set nb310nf_invsqrta, 136
1030 .set nb310nf_dvda, 144
1031 .set nb310nf_p_gbtabscale, 152
1032 .set nb310nf_GBtab, 160
1033 .set nb310nf_p_nthreads, 168
1034 .set nb310nf_count, 176
1035 .set nb310nf_mtx, 184
1036 .set nb310nf_outeriter, 192
1037 .set nb310nf_inneriter, 200
1038 .set nb310nf_work, 208
1039 ## stack offsets for local variables
1040 ## bottom of stack is cache-aligned for sse use
1045 .set nb310nf_tsc, 64
1048 .set nb310nf_c12, 112
1049 .set nb310nf_vctot, 128
1050 .set nb310nf_Vvdwtot, 144
1051 .set nb310nf_half, 160
1052 .set nb310nf_three, 176
1053 .set nb310nf_nri, 192
1054 .set nb310nf_iinr, 200
1055 .set nb310nf_jindex, 208
1056 .set nb310nf_jjnr, 216
1057 .set nb310nf_shift, 224
1058 .set nb310nf_shiftvec, 232
1059 .set nb310nf_facel, 240
1060 .set nb310nf_innerjjnr, 248
1061 .set nb310nf_is3, 256
1062 .set nb310nf_ii3, 260
1063 .set nb310nf_ntia, 264
1064 .set nb310nf_innerk, 268
1066 .set nb310nf_nn1, 276
1067 .set nb310nf_ntype, 280
1068 .set nb310nf_nouter, 284
1069 .set nb310nf_ninner, 288
1083 subq $
312,%rsp
## local variable stack space (n*16+8)
1085 ## zero 32-bit iteration counters
1087 movl
%eax
,nb310nf_nouter
(%rsp
)
1088 movl
%eax
,nb310nf_ninner
(%rsp
)
1091 movl
%edi
,nb310nf_nri
(%rsp
)
1092 movq
%rsi
,nb310nf_iinr
(%rsp
)
1093 movq
%rdx
,nb310nf_jindex
(%rsp
)
1094 movq
%rcx
,nb310nf_jjnr
(%rsp
)
1095 movq
%r8,nb310nf_shift
(%rsp
)
1096 movq
%r9,nb310nf_shiftvec
(%rsp
)
1097 movq nb310nf_p_ntype
(%rbp
),%rdi
1099 movl
%edi
,nb310nf_ntype
(%rsp
)
1100 movq nb310nf_p_facel
(%rbp
),%rsi
1102 movss
%xmm0
,nb310nf_facel
(%rsp
)
1104 movq nb310nf_p_tabscale
(%rbp
),%rax
1106 shufps $
0,%xmm3
,%xmm3
1107 movaps
%xmm3
,nb310nf_tsc
(%rsp
)
1109 ## create constant floating-point factors on stack
1110 movl $
0x3f000000,%eax
## half in IEEE (hex)
1111 movl
%eax
,nb310nf_half
(%rsp
)
1112 movss nb310nf_half
(%rsp
),%xmm1
1113 shufps $
0,%xmm1
,%xmm1
## splat to all elements
1115 addps
%xmm2
,%xmm2
## one
1117 addps
%xmm2
,%xmm2
## two
1118 addps
%xmm2
,%xmm3
## three
1119 movaps
%xmm1
,nb310nf_half
(%rsp
)
1120 movaps
%xmm3
,nb310nf_three
(%rsp
)
1122 _nb_kernel310nf_x86_64_sse.nb310nf_threadloop
:
1123 movq nb310nf_count
(%rbp
),%rsi
## pointer to sync counter
1125 _nb_kernel310nf_x86_64_sse.nb310nf_spinlock
:
1126 movl
%eax
,%ebx
## ebx=*count=nn0
1127 addl $
1,%ebx
## ebx=nn1=nn0+1
1129 cmpxchgl
%ebx
,(%rsi
) ## write nn1 to *counter,
1130 ## if it hasn't changed.
1131 ## or reread *counter to eax.
1132 pause
## -> better p4 performance
1133 jnz _nb_kernel310nf_x86_64_sse.nb310nf_spinlock
1135 ## if(nn1>nri) nn1=nri
1136 movl nb310nf_nri
(%rsp
),%ecx
1139 cmovlel
%edx
,%ebx
## if(nn1>nri) nn1=nri
1140 ## Cleared the spinlock if we got here.
1141 ## eax contains nn0, ebx contains nn1.
1142 movl
%eax
,nb310nf_n
(%rsp
)
1143 movl
%ebx
,nb310nf_nn1
(%rsp
)
1144 subl
%eax
,%ebx
## calc number of outer lists
1145 movl
%eax
,%esi
## copy n to esi
1146 jg _nb_kernel310nf_x86_64_sse.nb310nf_outerstart
1147 jmp _nb_kernel310nf_x86_64_sse.nb310nf_end
1149 _nb_kernel310nf_x86_64_sse.nb310nf_outerstart
:
1150 ## ebx contains number of outer iterations
1151 addl nb310nf_nouter
(%rsp
),%ebx
1152 movl
%ebx
,nb310nf_nouter
(%rsp
)
1154 _nb_kernel310nf_x86_64_sse.nb310nf_outer
:
1155 movq nb310nf_shift
(%rsp
),%rax
## rax = pointer into shift[]
1156 movl
(%rax
,%rsi
,4),%ebx
## ebx=shift[n]
1158 lea
(%rbx
,%rbx
,2),%rbx
## rbx=3*is
1159 movl
%ebx
,nb310nf_is3
(%rsp
) ## store is3
1161 movq nb310nf_shiftvec
(%rsp
),%rax
## rax = base of shiftvec[]
1163 movss
(%rax
,%rbx
,4),%xmm0
1164 movss
4(%rax
,%rbx
,4),%xmm1
1165 movss
8(%rax
,%rbx
,4),%xmm2
1167 movq nb310nf_iinr
(%rsp
),%rcx
## rcx = pointer into iinr[]
1168 movl
(%rcx
,%rsi
,4),%ebx
## ebx =ii
1170 movq nb310nf_charge
(%rbp
),%rdx
1171 movss
(%rdx
,%rbx
,4),%xmm3
1172 mulss nb310nf_facel
(%rsp
),%xmm3
1173 shufps $
0,%xmm3
,%xmm3
1175 movq nb310nf_type
(%rbp
),%rdx
1176 movl
(%rdx
,%rbx
,4),%edx
1177 imull nb310nf_ntype
(%rsp
),%edx
1179 movl
%edx
,nb310nf_ntia
(%rsp
)
1181 lea
(%rbx
,%rbx
,2),%rbx
## rbx = 3*ii=ii3
1182 movq nb310nf_pos
(%rbp
),%rax
## rax = base of pos[]
1184 addss
(%rax
,%rbx
,4),%xmm0
1185 addss
4(%rax
,%rbx
,4),%xmm1
1186 addss
8(%rax
,%rbx
,4),%xmm2
1188 movaps
%xmm3
,nb310nf_iq
(%rsp
)
1190 shufps $
0,%xmm0
,%xmm0
1191 shufps $
0,%xmm1
,%xmm1
1192 shufps $
0,%xmm2
,%xmm2
1194 movaps
%xmm0
,nb310nf_ix
(%rsp
)
1195 movaps
%xmm1
,nb310nf_iy
(%rsp
)
1196 movaps
%xmm2
,nb310nf_iz
(%rsp
)
1198 movl
%ebx
,nb310nf_ii3
(%rsp
)
1200 ## clear vctot and i forces
1202 movaps
%xmm4
,nb310nf_vctot
(%rsp
)
1203 movaps
%xmm4
,nb310nf_Vvdwtot
(%rsp
)
1205 movq nb310nf_jindex
(%rsp
),%rax
1206 movq
(%rax
,%rsi
,4),%rcx
## jindex[n]
1207 movl
4(%rax
,%rsi
,4),%edx
## jindex[n+1]
1208 subl
%ecx
,%edx
## number of innerloop atoms
1210 movq nb310nf_pos
(%rbp
),%rsi
1211 movq nb310nf_jjnr
(%rsp
),%rax
1214 movq
%rax
,nb310nf_innerjjnr
(%rsp
) ## pointer to jjnr[nj0]
1217 addl nb310nf_ninner
(%rsp
),%ecx
1218 movl
%ecx
,nb310nf_ninner
(%rsp
)
1220 movl
%edx
,nb310nf_innerk
(%rsp
) ## number of innerloop atoms
1221 jge _nb_kernel310nf_x86_64_sse.nb310nf_unroll_loop
1222 jmp _nb_kernel310nf_x86_64_sse.nb310nf_finish_inner
1223 _nb_kernel310nf_x86_64_sse.nb310nf_unroll_loop
:
1224 ## quad-unroll innerloop here
1225 movq nb310nf_innerjjnr
(%rsp
),%rdx
## pointer to jjnr[k]
1229 movl
12(%rdx
),%edx
## eax-edx=jnr1-4
1230 addq $
16,nb310nf_innerjjnr
(%rsp
) ## advance pointer (unrolled 4)
1232 movq nb310nf_charge
(%rbp
),%rsi
## base of charge[]
1234 movss
(%rsi
,%rax
,4),%xmm3
1235 movss
(%rsi
,%rcx
,4),%xmm4
1236 movss
(%rsi
,%rbx
,4),%xmm6
1237 movss
(%rsi
,%rdx
,4),%xmm7
1239 movaps nb310nf_iq
(%rsp
),%xmm2
1240 shufps $
0,%xmm6
,%xmm3
1241 shufps $
0,%xmm7
,%xmm4
1242 shufps $
136,%xmm4
,%xmm3
## 10001000 ;# all charges in xmm3
1243 movd
%eax
,%mm0
## use mmx registers as temp storage
1249 movaps
%xmm3
,nb310nf_qq
(%rsp
)
1251 movq nb310nf_type
(%rbp
),%rsi
1252 movl
(%rsi
,%rax
,4),%eax
1253 movl
(%rsi
,%rbx
,4),%ebx
1254 movl
(%rsi
,%rcx
,4),%ecx
1255 movl
(%rsi
,%rdx
,4),%edx
1256 movq nb310nf_vdwparam
(%rbp
),%rsi
1261 movl nb310nf_ntia
(%rsp
),%edi
1267 movlps
(%rsi
,%rax
,4),%xmm6
1268 movlps
(%rsi
,%rcx
,4),%xmm7
1269 movhps
(%rsi
,%rbx
,4),%xmm6
1270 movhps
(%rsi
,%rdx
,4),%xmm7
1273 shufps $
136,%xmm7
,%xmm4
## 10001000
1274 shufps $
221,%xmm7
,%xmm6
## 11011101
1281 movaps
%xmm4
,nb310nf_c6
(%rsp
)
1282 movaps
%xmm6
,nb310nf_c12
(%rsp
)
1284 movq nb310nf_pos
(%rbp
),%rsi
## base of pos[]
1286 lea
(%rax
,%rax
,2),%rax
## replace jnr with j3
1287 lea
(%rbx
,%rbx
,2),%rbx
1289 lea
(%rcx
,%rcx
,2),%rcx
## replace jnr with j3
1290 lea
(%rdx
,%rdx
,2),%rdx
1292 ## move four coordinates to xmm0-xmm2
1294 movlps
(%rsi
,%rax
,4),%xmm4
1295 movlps
(%rsi
,%rcx
,4),%xmm5
1296 movss
8(%rsi
,%rax
,4),%xmm2
1297 movss
8(%rsi
,%rcx
,4),%xmm6
1299 movhps
(%rsi
,%rbx
,4),%xmm4
1300 movhps
(%rsi
,%rdx
,4),%xmm5
1302 movss
8(%rsi
,%rbx
,4),%xmm0
1303 movss
8(%rsi
,%rdx
,4),%xmm1
1305 shufps $
0,%xmm0
,%xmm2
1306 shufps $
0,%xmm1
,%xmm6
1311 shufps $
136,%xmm6
,%xmm2
## 10001000
1313 shufps $
136,%xmm5
,%xmm0
## 10001000
1314 shufps $
221,%xmm5
,%xmm1
## 11011101
1316 ## move ix-iz to xmm4-xmm6
1317 movaps nb310nf_ix
(%rsp
),%xmm4
1318 movaps nb310nf_iy
(%rsp
),%xmm5
1319 movaps nb310nf_iz
(%rsp
),%xmm6
1335 ## lookup seed in xmm5
1338 movaps nb310nf_three
(%rsp
),%xmm1
1339 mulps
%xmm4
,%xmm5
## rsq*lu*lu
1340 movaps nb310nf_half
(%rsp
),%xmm0
1341 subps
%xmm5
,%xmm1
## 3.0-rsq*lu*lu
1343 mulps
%xmm1
,%xmm0
## xmm0=rinv
1344 mulps
%xmm0
,%xmm4
## xmm4=r
1345 mulps nb310nf_tsc
(%rsp
),%xmm4
1348 cvttps2pi
%xmm4
,%mm6
1349 cvttps2pi
%xmm5
,%mm7
## mm6/mm7 contain lu indices
1354 movaps
%xmm4
,%xmm1
## xmm1=eps
1356 mulps
%xmm2
,%xmm2
## xmm2=eps2
1365 movq nb310nf_VFtab
(%rbp
),%rsi
1373 movlps
(%rsi
,%rax
,4),%xmm5
1374 movlps
(%rsi
,%rcx
,4),%xmm7
1375 movhps
(%rsi
,%rbx
,4),%xmm5
1376 movhps
(%rsi
,%rdx
,4),%xmm7
## got half coulomb table
1379 shufps $
136,%xmm7
,%xmm4
## 10001000
1380 shufps $
221,%xmm7
,%xmm5
## 11011101
1382 movlps
8(%rsi
,%rax
,4),%xmm7
1383 movlps
8(%rsi
,%rcx
,4),%xmm3
1384 movhps
8(%rsi
,%rbx
,4),%xmm7
1385 movhps
8(%rsi
,%rdx
,4),%xmm3
## other half of coulomb table
1387 shufps $
136,%xmm3
,%xmm6
## 10001000
1388 shufps $
221,%xmm3
,%xmm7
## 11011101
1389 ## coulomb table ready, in xmm4-xmm7
1391 mulps
%xmm1
,%xmm6
## xmm6=Geps
1392 mulps
%xmm2
,%xmm7
## xmm7=Heps2
1394 addps
%xmm7
,%xmm5
## xmm5=Fp
1395 movaps nb310nf_qq
(%rsp
),%xmm3
1396 mulps
%xmm1
,%xmm5
## xmm5=eps*Fp
1397 addps
%xmm4
,%xmm5
## xmm5=VV
1398 mulps
%xmm3
,%xmm5
## vcoul=qq*VV
1401 mulps
%xmm0
,%xmm4
## xmm4=rinvsq
1403 ## at this point mm5 contains vcoul
1404 ## increment vcoul - then we can get rid of mm5
1406 addps nb310nf_vctot
(%rsp
),%xmm5
1409 movaps
%xmm5
,nb310nf_vctot
(%rsp
)
1411 mulps
%xmm4
,%xmm6
## xmm6=rinvsix
1413 mulps
%xmm4
,%xmm4
## xmm4=rinvtwelve
1414 mulps nb310nf_c6
(%rsp
),%xmm6
1415 mulps nb310nf_c12
(%rsp
),%xmm4
1416 movaps nb310nf_Vvdwtot
(%rsp
),%xmm7
1419 movaps
%xmm7
,nb310nf_Vvdwtot
(%rsp
)
1422 ## should we do one more iteration?
1423 subl $
4,nb310nf_innerk
(%rsp
)
1424 jl _nb_kernel310nf_x86_64_sse.nb310nf_finish_inner
1425 jmp _nb_kernel310nf_x86_64_sse.nb310nf_unroll_loop
1426 _nb_kernel310nf_x86_64_sse.nb310nf_finish_inner
:
1427 ## check if at least two particles remain
1428 addl $
4,nb310nf_innerk
(%rsp
)
1429 movl nb310nf_innerk
(%rsp
),%edx
1431 jnz _nb_kernel310nf_x86_64_sse.nb310nf_dopair
1432 jmp _nb_kernel310nf_x86_64_sse.nb310nf_checksingle
1433 _nb_kernel310nf_x86_64_sse.nb310nf_dopair
:
1434 movq nb310nf_charge
(%rbp
),%rsi
1435 movq nb310nf_innerjjnr
(%rsp
),%rcx
1438 addq $
8,nb310nf_innerjjnr
(%rsp
)
1440 movss
(%rsi
,%rax
,4),%xmm3
1441 movss
(%rsi
,%rbx
,4),%xmm6
1442 shufps $
0,%xmm6
,%xmm3
1443 shufps $
8,%xmm3
,%xmm3
## 00001000 ;# xmm3(0,1) has the charges
1445 mulps nb310nf_iq
(%rsp
),%xmm3
1447 movaps
%xmm3
,nb310nf_qq
(%rsp
)
1449 movq nb310nf_type
(%rbp
),%rsi
1452 movl
(%rsi
,%rcx
,4),%ecx
1453 movl
(%rsi
,%rdx
,4),%edx
1454 movq nb310nf_vdwparam
(%rbp
),%rsi
1457 movl nb310nf_ntia
(%rsp
),%edi
1460 movlps
(%rsi
,%rcx
,4),%xmm6
1461 movhps
(%rsi
,%rdx
,4),%xmm6
1462 movq nb310nf_pos
(%rbp
),%rdi
1465 shufps $
8,%xmm4
,%xmm4
## 00001000
1466 shufps $
13,%xmm6
,%xmm6
## 00001101
1470 movaps
%xmm4
,nb310nf_c6
(%rsp
)
1471 movaps
%xmm6
,nb310nf_c12
(%rsp
)
1473 lea
(%rax
,%rax
,2),%rax
1474 lea
(%rbx
,%rbx
,2),%rbx
1475 ## move coordinates to xmm0-xmm2
1476 movlps
(%rdi
,%rax
,4),%xmm1
1477 movss
8(%rdi
,%rax
,4),%xmm2
1478 movhps
(%rdi
,%rbx
,4),%xmm1
1479 movss
8(%rdi
,%rbx
,4),%xmm0
1483 shufps $
0,%xmm0
,%xmm2
1487 shufps $
136,%xmm2
,%xmm2
## 10001000
1489 shufps $
136,%xmm0
,%xmm0
## 10001000
1490 shufps $
221,%xmm1
,%xmm1
## 11011101
1492 ## move ix-iz to xmm4-xmm6
1495 movaps nb310nf_ix
(%rsp
),%xmm4
1496 movaps nb310nf_iy
(%rsp
),%xmm5
1497 movaps nb310nf_iz
(%rsp
),%xmm6
1513 ## lookup seed in xmm5
1516 movaps nb310nf_three
(%rsp
),%xmm1
1517 mulps
%xmm4
,%xmm5
## rsq*lu*lu
1518 movaps nb310nf_half
(%rsp
),%xmm0
1519 subps
%xmm5
,%xmm1
## 3.0-rsq*lu*lu
1521 mulps
%xmm1
,%xmm0
## xmm0=rinv
1522 mulps
%xmm0
,%xmm4
## xmm4=r
1523 mulps nb310nf_tsc
(%rsp
),%xmm4
1525 cvttps2pi
%xmm4
,%mm6
## mm6 contain lu indices
1528 movaps
%xmm4
,%xmm1
## xmm1=eps
1530 mulps
%xmm2
,%xmm2
## xmm2=eps2
1534 movq nb310nf_VFtab
(%rbp
),%rsi
1539 movlps
(%rsi
,%rcx
,4),%xmm5
1540 movhps
(%rsi
,%rdx
,4),%xmm5
## got half coulomb table
1542 shufps $
136,%xmm4
,%xmm4
## 10001000
1543 shufps $
221,%xmm7
,%xmm5
## 11011101
1545 movlps
8(%rsi
,%rcx
,4),%xmm7
1546 movhps
8(%rsi
,%rdx
,4),%xmm7
1548 shufps $
136,%xmm6
,%xmm6
## 10001000
1549 shufps $
221,%xmm7
,%xmm7
## 11011101
1550 ## table ready in xmm4-xmm7
1552 mulps
%xmm1
,%xmm6
## xmm6=Geps
1553 mulps
%xmm2
,%xmm7
## xmm7=Heps2
1555 addps
%xmm7
,%xmm5
## xmm5=Fp
1556 movaps nb310nf_qq
(%rsp
),%xmm3
1557 mulps
%xmm1
,%xmm5
## xmm5=eps*Fp
1558 addps
%xmm4
,%xmm5
## xmm5=VV
1559 mulps
%xmm3
,%xmm5
## vcoul=qq*VV
1562 mulps
%xmm0
,%xmm4
## xmm4=rinvsq
1564 ## at this point mm5 contains vcoul
1565 ## increment vcoul - then we can get rid of mm5
1567 addps nb310nf_vctot
(%rsp
),%xmm5
1572 movaps
%xmm5
,nb310nf_vctot
(%rsp
)
1574 mulps
%xmm4
,%xmm6
## xmm6=rinvsix
1576 mulps
%xmm4
,%xmm4
## xmm4=rinvtwelve
1577 mulps nb310nf_c6
(%rsp
),%xmm6
1578 mulps nb310nf_c12
(%rsp
),%xmm4
1579 movaps nb310nf_Vvdwtot
(%rsp
),%xmm7
1582 movaps
%xmm7
,nb310nf_Vvdwtot
(%rsp
)
## ---------------------------------------------------------------
## checksingle: decide whether one final j-particle remains.
## NOTE(review): original line 1586 is missing (numbering jumps
## 1585 -> 1587); the jnz depends on flags that movl does not set,
## so the dropped line presumably tested the low bit of %edx —
## confirm against the pristine generated source.
## ---------------------------------------------------------------
1584 _nb_kernel310nf_x86_64_sse.nb310nf_checksingle
:
1585 movl nb310nf_innerk
(%rsp
),%edx
1587 jnz _nb_kernel310nf_x86_64_sse.nb310nf_dosingle
1588 jmp _nb_kernel310nf_x86_64_sse.nb310nf_updateouterdata
## ---------------------------------------------------------------
## dosingle: process the one remaining j-particle. Same algorithm
## as dopair but scalar in lane 0: charge*iq -> qq, c6/c12 from the
## vdw parameter table, coordinates gathered, rinv via rsqrt + one
## Newton-Raphson step, tabulated Coulomb energy plus LJ 6-12,
## accumulated with addss/movss (single-lane). Energy-only kernel:
## no forces.
## NOTE(review): this extraction drops many original lines
## (numbering gaps e.g. 1593-1594, 1598, 1600, 1603, 1606, 1609,
## 1623-1637, 1639-1640, 1645, 1647, 1650-1653, 1655, 1657-1662,
## 1665-1666, 1679-1680, 1685-1690, 1699-1700), including the j
## index load into %rax, the distance/rsq computation, and the
## LJ accumulate. Registers such as %xmm4 at 1607 are used without
## a visible initialization. Restore from pristine source before use.
## ---------------------------------------------------------------
1589 _nb_kernel310nf_x86_64_sse.nb310nf_dosingle
:
1590 movq nb310nf_charge
(%rbp
),%rsi
1591 movq nb310nf_pos
(%rbp
),%rdi
1592 movq nb310nf_innerjjnr
(%rsp
),%rcx
1595 movss
(%rsi
,%rax
,4),%xmm6
## xmm6(0) has the charge
1596 mulps nb310nf_iq
(%rsp
),%xmm6
1597 movaps
%xmm6
,nb310nf_qq
(%rsp
)
## atom type -> row offset into vdwparam (ntia = i-type * 2*ntype)
1599 movq nb310nf_type
(%rbp
),%rsi
1601 movl
(%rsi
,%rcx
,4),%ecx
1602 movq nb310nf_vdwparam
(%rbp
),%rsi
1604 addl nb310nf_ntia
(%rsp
),%ecx
1605 movlps
(%rsi
,%rcx
,4),%xmm6
## isolate c6 and c12 from the loaded (c6,c12) pair
1607 shufps $
252,%xmm4
,%xmm4
## 11111100
1608 shufps $
253,%xmm6
,%xmm6
## 11111101
1610 movaps
%xmm4
,nb310nf_c6
(%rsp
)
1611 movaps
%xmm6
,nb310nf_c12
(%rsp
)
## coordinate offset: index*3 floats
1613 lea
(%rax
,%rax
,2),%rax
1615 ## move coordinates to xmm0-xmm2
1616 movss
(%rdi
,%rax
,4),%xmm0
1617 movss
4(%rdi
,%rax
,4),%xmm1
1618 movss
8(%rdi
,%rax
,4),%xmm2
1620 movaps nb310nf_ix
(%rsp
),%xmm4
1621 movaps nb310nf_iy
(%rsp
),%xmm5
1622 movaps nb310nf_iz
(%rsp
),%xmm6
## Newton-Raphson refinement: rinv = 0.5*lu*(3 - rsq*lu*lu)
1638 ## lookup seed in xmm5
1641 movaps nb310nf_three
(%rsp
),%xmm1
1642 mulps
%xmm4
,%xmm5
## rsq*lu*lu
1643 movaps nb310nf_half
(%rsp
),%xmm0
1644 subps
%xmm5
,%xmm1
## 3.0-rsq*lu*lu
1646 mulps
%xmm1
,%xmm0
## xmm0=rinv
1648 mulps
%xmm0
,%xmm4
## xmm4=r
1649 mulps nb310nf_tsc
(%rsp
),%xmm4
1651 cvttps2pi
%xmm4
,%mm6
## mm6 contains lu indices
1654 movaps
%xmm4
,%xmm1
## xmm1=eps
1656 mulps
%xmm2
,%xmm2
## xmm2=eps2
## table row load for the single atom
## NOTE(review): the dopair path indexes VFtab with %rcx/%rdx, but
## here %rbx is used while the visible index math above used %rax —
## likely an artifact of the missing lines 1657-1662; verify the
## table index register against the pristine source.
1660 movq nb310nf_VFtab
(%rbp
),%rsi
1663 movlps
(%rsi
,%rbx
,4),%xmm4
1664 movlps
8(%rsi
,%rbx
,4),%xmm6
1667 shufps $
1,%xmm5
,%xmm5
1668 shufps $
1,%xmm7
,%xmm7
1669 ## table ready in xmm4-xmm7
## spline evaluation: VV = Y + eps*(F + G*eps + H*eps^2)
1671 mulps
%xmm1
,%xmm6
## xmm6=Geps
1672 mulps
%xmm2
,%xmm7
## xmm7=Heps2
1674 addps
%xmm7
,%xmm5
## xmm5=Fp
1675 movaps nb310nf_qq
(%rsp
),%xmm3
1676 mulps
%xmm1
,%xmm5
## xmm5=eps*Fp
1677 addps
%xmm4
,%xmm5
## xmm5=VV
1678 mulps
%xmm3
,%xmm5
## vcoul=qq*VV
1681 mulps
%xmm0
,%xmm4
## xmm4=rinvsq
1683 ## at this point xmm5 contains vcoul
1684 ## increment vcoul - then we can get rid of xmm5
## scalar accumulate: only lane 0 is valid here
1686 addss nb310nf_vctot
(%rsp
),%xmm5
1691 movss
%xmm5
,nb310nf_vctot
(%rsp
)
## LJ part: rinv^6 and rinv^12 scaled by c6/c12
1693 mulps
%xmm4
,%xmm6
## xmm6=rinvsix
1695 mulps
%xmm4
,%xmm4
## xmm4=rinvtwelve
1696 mulps nb310nf_c6
(%rsp
),%xmm6
1697 mulps nb310nf_c12
(%rsp
),%xmm4
1698 movss nb310nf_Vvdwtot
(%rsp
),%xmm7
## NOTE(review): the combine of xmm4/xmm6 into xmm7 (original lines
## 1699-1700) is missing from this extraction.
1701 movss
%xmm7
,nb310nf_Vvdwtot
(%rsp
)
## ---------------------------------------------------------------
## updateouterdata: horizontally sum the 4-lane vctot / Vvdwtot
## accumulators and add the results into the per-group energy
## arrays Vc[ggid] and Vvdw[ggid], then either continue with the
## next outer-loop iteration or fall through to outerend.
## NOTE(review): the horizontal-add steps are partially missing
## from this extraction (numbering gaps 1712-1713, 1715, 1717-1718,
## 1727-1728, 1730, 1732-1733, 1739-1740, 1743-1744, 1746), so
## %xmm6 is used below without a visible initialization and the
## jz at 1745 relies on flags set by a dropped compare/sub —
## confirm against the pristine generated source.
## ---------------------------------------------------------------
1703 _nb_kernel310nf_x86_64_sse.nb310nf_updateouterdata
:
1705 movl nb310nf_n
(%rsp
),%esi
1706 ## get group index for i particle
1707 movq nb310nf_gid
(%rbp
),%rdx
## base of gid[]
1708 movl
(%rdx
,%rsi
,4),%edx
## ggid=gid[n]
1710 ## accumulate total potential energy and update it
1711 movaps nb310nf_vctot
(%rsp
),%xmm7
1714 addps
%xmm6
,%xmm7
## pos 0-1 in xmm7 have the sum now
1716 shufps $
1,%xmm6
,%xmm6
1719 ## add earlier value from mem
1720 movq nb310nf_Vc
(%rbp
),%rax
1721 addss
(%rax
,%rdx
,4),%xmm7
## read-modify-write Vc[ggid] += summed coulomb energy
1723 movss
%xmm7
,(%rax
,%rdx
,4)
1725 ## accumulate total lj energy and update it
1726 movaps nb310nf_Vvdwtot
(%rsp
),%xmm7
1729 addps
%xmm6
,%xmm7
## pos 0-1 in xmm7 have the sum now
1731 shufps $
1,%xmm6
,%xmm6
1734 ## add earlier value from mem
1735 movq nb310nf_Vvdw
(%rbp
),%rax
1736 addss
(%rax
,%rdx
,4),%xmm7
## read-modify-write Vvdw[ggid] += summed LJ energy
1738 movss
%xmm7
,(%rax
,%rdx
,4)
## outer-loop bookkeeping: compare n against the workunit end nn1
1741 movl nb310nf_nn1
(%rsp
),%ecx
1742 ## esi already loaded with n
1745 jz _nb_kernel310nf_x86_64_sse.nb310nf_outerend
1747 ## not last, iterate outer loop once more!
1748 movl
%esi
,nb310nf_n
(%rsp
)
1749 jmp _nb_kernel310nf_x86_64_sse.nb310nf_outer
## ---------------------------------------------------------------
## outerend: the current workunit is exhausted; check whether more
## outer neighborlists remain (n vs nri) and either grab another
## workunit via threadloop or finish.
## NOTE(review): original line 1754 (the compare/sub that sets the
## flags for jz) is missing from this extraction — confirm against
## the pristine generated source.
## ---------------------------------------------------------------
1750 _nb_kernel310nf_x86_64_sse.nb310nf_outerend
:
1751 ## check if more outer neighborlists remain
1752 movl nb310nf_nri
(%rsp
),%ecx
1753 ## esi already loaded with n above
1755 jz _nb_kernel310nf_x86_64_sse.nb310nf_end
1756 ## non-zero, do one more workunit
1757 jmp _nb_kernel310nf_x86_64_sse.nb310nf_threadloop
## ---------------------------------------------------------------
## end: epilogue bookkeeping. Loads the iteration counters kept on
## the stack and the caller-supplied output pointers; the stores to
## *outeriter / *inneriter and the stack/register restoration + ret
## lie beyond the end of this extracted chunk.
## ---------------------------------------------------------------
1758 _nb_kernel310nf_x86_64_sse.nb310nf_end
:
1760 movl nb310nf_nouter
(%rsp
),%eax
1761 movl nb310nf_ninner
(%rsp
),%ebx
1762 movq nb310nf_outeriter
(%rbp
),%rcx
1763 movq nb310nf_inneriter
(%rbp
),%rdx