3 ## Gromacs 4.0 Copyright (c) 1991-2003
4 ## David van der Spoel, Erik Lindahl
6 ## This program is free software; you can redistribute it and/or
7 ## modify it under the terms of the GNU General Public License
8 ## as published by the Free Software Foundation; either version 2
9 ## of the License, or (at your option) any later version.
11 ## To help us fund GROMACS development, we humbly ask that you cite
12 ## the research papers on the package. Check out http://www.gromacs.org
15 ## Gnomes, ROck Monsters And Chili Sauce
23 .globl nb_kernel410_x86_64_sse
24 .globl _nb_kernel410_x86_64_sse
25 nb_kernel410_x86_64_sse
:
26 _nb_kernel410_x86_64_sse
:
## Entry point of the nb410 kernel, exported both with and without a
## leading underscore.
## NOTE(review): the number that starts each statement is not contiguous,
## so some statements (including several nb410_* offsets used below)
## appear to be absent from this listing - confirm against the full file.
27 ## Room for return address and rbp (16 bytes)
## The offsets in this first group are used with %rbp where referenced below.
31 .set nb410_faction, 40
33 .set nb410_p_facel, 56
38 .set nb410_p_ntype, 96
39 .set nb410_vdwparam, 104
41 .set nb410_p_tabscale, 120
43 .set nb410_invsqrta, 136
45 .set nb410_p_gbtabscale, 152
47 .set nb410_p_nthreads, 168
50 .set nb410_outeriter, 192
51 .set nb410_inneriter, 200
53 ## stack offsets for local variables
54 ## bottom of stack is cache-aligned for sse use
## The offsets in this second group are used with %rsp where referenced below.
64 .set nb410_twelve, 144
71 .set nb410_Vvdwtot, 256
79 .set nb410_isaprod, 384
80 .set nb410_dvdasum, 400
81 .set nb410_gbscale, 416
84 .set nb410_jindex, 448
87 .set nb410_shiftvec, 472
89 .set nb410_innerjjnr, 488
94 .set nb410_innerk, 512
98 .set nb410_nouter, 528
99 .set nb410_ninner, 532
117 subq $
568,%rsp
## local variable stack space (n*16+8)
119 ## zero 32-bit iteration counters
121 movl
%eax
,nb410_nouter
(%rsp
)
122 movl
%eax
,nb410_ninner
(%rsp
)
## save the inputs held in edi, rsi, rdx, rcx, r8 and r9 in local slots
## (presumably the six register arguments of the SysV AMD64 ABI - confirm)
125 movl
%edi
,nb410_nri
(%rsp
)
126 movq
%rsi
,nb410_iinr
(%rsp
)
127 movq
%rdx
,nb410_jindex
(%rsp
)
128 movq
%rcx
,nb410_jjnr
(%rsp
)
129 movq
%r8,nb410_shift
(%rsp
)
130 movq
%r9,nb410_shiftvec
(%rsp
)
## fetch the p_ntype, p_facel and p_gbtabscale entries from the %rbp area
## and keep local copies of ntype, facel and the splatted gbtsc vector
131 movq nb410_p_ntype
(%rbp
),%rdi
133 movl
%edi
,nb410_ntype
(%rsp
)
134 movq nb410_p_facel
(%rbp
),%rsi
136 movss
%xmm0
,nb410_facel
(%rsp
)
138 movq nb410_p_gbtabscale
(%rbp
),%rbx
140 shufps $
0,%xmm4
,%xmm4
141 movaps
%xmm4
,nb410_gbtsc
(%rsp
)
144 ## create constant floating-point factors on stack
145 movl $
0x3f000000,%eax
## half in IEEE (hex)
146 movl
%eax
,nb410_half
(%rsp
)
147 movss nb410_half
(%rsp
),%xmm1
148 shufps $
0,%xmm1
,%xmm1
## splat to all elements
150 addps
%xmm2
,%xmm2
## one
152 addps
%xmm2
,%xmm2
## two
153 addps
%xmm2
,%xmm3
## three
155 addps
%xmm4
,%xmm4
## six
157 addps
%xmm5
,%xmm5
## twelve
## write the vector constants to their local slots
158 movaps
%xmm1
,nb410_half
(%rsp
)
159 movaps
%xmm2
,nb410_two
(%rsp
)
160 movaps
%xmm3
,nb410_three
(%rsp
)
161 movaps
%xmm4
,nb410_six
(%rsp
)
162 movaps
%xmm5
,nb410_twelve
(%rsp
)
164 _nb_kernel410_x86_64_sse.nb410_threadloop
:
## Claim the next work unit: advance the counter at *count with cmpxchgl,
## retrying until the exchange succeeds, then clamp the upper bound.
## NOTE(review): no `lock` prefix is visible before cmpxchgl in this
## listing; without it the update is not atomic across CPUs - confirm
## against the full file.
165 movq nb410_count
(%rbp
),%rsi
## pointer to sync counter
167 _nb_kernel410_x86_64_sse.nb410_spinlock
:
168 movl
%eax
,%ebx
## ebx=*count=nn0
169 addl $
1,%ebx
## ebx=nn1=nn0+1
171 cmpxchgl
%ebx
,(%rsi
) ## write nn1 to *counter,
172 ## if it hasn't changed.
173 ## or reread *counter to eax.
174 pause
## -> better p4 performance
175 jnz _nb_kernel410_x86_64_sse.nb410_spinlock
177 ## if(nn1>nri) nn1=nri
178 movl nb410_nri
(%rsp
),%ecx
181 cmovlel
%edx
,%ebx
## if(nn1>nri) nn1=nri
182 ## Cleared the spinlock if we got here.
183 ## eax contains nn0, ebx contains nn1.
184 movl
%eax
,nb410_n
(%rsp
)
185 movl
%ebx
,nb410_nn1
(%rsp
)
186 subl
%eax
,%ebx
## calc number of outer lists
187 movl
%eax
,%esi
## copy n to esi
## movl leaves the flags alone, so jg tests the result of the subl above
188 jg _nb_kernel410_x86_64_sse.nb410_outerstart
189 jmp _nb_kernel410_x86_64_sse.nb410_end
191 _nb_kernel410_x86_64_sse.nb410_outerstart
:
## Add this work unit's outer-list count to the running nouter total.
192 ## ebx contains number of outer iterations
193 addl nb410_nouter
(%rsp
),%ebx
194 movl
%ebx
,nb410_nouter
(%rsp
)
196 _nb_kernel410_x86_64_sse.nb410_outer
:
## Per-i-particle setup for list entry n (index in rsi): read shift[n] and
## the matching shift vector, iinr[n], then charge, invsqrta, type and
## position of particle ii; store splatted copies in local slots and work
## out the inner-loop length from jindex[n] and jindex[n+1].
197 movq nb410_shift
(%rsp
),%rax
## rax = pointer into shift[]
198 movl
(%rax
,%rsi
,4),%ebx
## ebx=shift[n]
200 lea
(%rbx
,%rbx
,2),%rbx
## rbx=3*is
201 movl
%ebx
,nb410_is3
(%rsp
) ## store is3
203 movq nb410_shiftvec
(%rsp
),%rax
## rax = base of shiftvec[]
## xmm0-xmm2 = the three floats at shiftvec[is3]
205 movss
(%rax
,%rbx
,4),%xmm0
206 movss
4(%rax
,%rbx
,4),%xmm1
207 movss
8(%rax
,%rbx
,4),%xmm2
209 movq nb410_iinr
(%rsp
),%rcx
## rcx = pointer into iinr[]
210 movl
(%rcx
,%rsi
,4),%ebx
## ebx =ii
211 movl
%ebx
,nb410_ii
(%rsp
)
## xmm3 = charge[ii]*facel, splatted to all four elements
213 movq nb410_charge
(%rbp
),%rdx
214 movss
(%rdx
,%rbx
,4),%xmm3
215 mulss nb410_facel
(%rsp
),%xmm3
216 shufps $
0,%xmm3
,%xmm3
218 movq nb410_invsqrta
(%rbp
),%rdx
## load invsqrta[ii]
219 movss
(%rdx
,%rbx
,4),%xmm4
220 shufps $
0,%xmm4
,%xmm4
## ntia = type[ii] * ntype
222 movq nb410_type
(%rbp
),%rdx
223 movl
(%rdx
,%rbx
,4),%edx
224 imull nb410_ntype
(%rsp
),%edx
226 movl
%edx
,nb410_ntia
(%rsp
)
228 lea
(%rbx
,%rbx
,2),%rbx
## rbx = 3*ii=ii3
229 movq nb410_pos
(%rbp
),%rax
## rax = base of pos[]
## add the i position to the shift vector components in xmm0-xmm2
231 addss
(%rax
,%rbx
,4),%xmm0
232 addss
4(%rax
,%rbx
,4),%xmm1
233 addss
8(%rax
,%rbx
,4),%xmm2
235 movaps
%xmm3
,nb410_iq
(%rsp
)
236 movaps
%xmm4
,nb410_isai
(%rsp
)
## splat the shifted i coordinates and store them as ix, iy, iz
238 shufps $
0,%xmm0
,%xmm0
239 shufps $
0,%xmm1
,%xmm1
240 shufps $
0,%xmm2
,%xmm2
242 movaps
%xmm0
,nb410_ix
(%rsp
)
243 movaps
%xmm1
,nb410_iy
(%rsp
)
244 movaps
%xmm2
,nb410_iz
(%rsp
)
246 movl
%ebx
,nb410_ii3
(%rsp
)
248 ## clear vctot and i forces
251 movaps
%xmm13
,nb410_Vvdwtot
(%rsp
)
252 movaps
%xmm13
,nb410_dvdasum
(%rsp
)
256 movq nb410_jindex
(%rsp
),%rax
257 movl
(%rax
,%rsi
,4),%ecx
## jindex[n]
258 movl
4(%rax
,%rsi
,4),%edx
## jindex[n+1]
259 subl
%ecx
,%edx
## number of innerloop atoms
261 movq nb410_pos
(%rbp
),%rsi
262 movq nb410_faction
(%rbp
),%rdi
263 movq nb410_jjnr
(%rsp
),%rax
266 movq
%rax
,nb410_innerjjnr
(%rsp
) ## pointer to jjnr[nj0]
## accumulate into the running ninner counter
269 addl nb410_ninner
(%rsp
),%ecx
270 movl
%ecx
,nb410_ninner
(%rsp
)
272 movl
%edx
,nb410_innerk
(%rsp
) ## number of innerloop atoms
273 jge _nb_kernel410_x86_64_sse.nb410_unroll_loop
274 jmp _nb_kernel410_x86_64_sse.nb410_finish_inner
275 _nb_kernel410_x86_64_sse.nb410_unroll_loop
:
## Inner loop handling four j particles per pass: the jjnr pointer is
## advanced by 16 bytes and innerk is reduced by 4 on every iteration.
276 ## quad-unroll innerloop here
277 movq nb410_innerjjnr
(%rsp
),%rdx
## pointer to jjnr[k]
281 movl
12(%rdx
),%edx
## eax-edx=jnr1-4
283 addq $
16,nb410_innerjjnr
(%rsp
) ## advance pointer (unrolled 4)
## gather invsqrta[] for the four j indices
286 movq nb410_invsqrta
(%rbp
),%rsi
287 movss
(%rsi
,%rax
,4),%xmm3
288 movss
(%rsi
,%rcx
,4),%xmm4
289 movss
(%rsi
,%rbx
,4),%xmm6
290 movss
(%rsi
,%rdx
,4),%xmm7
291 movaps nb410_isai
(%rsp
),%xmm2
292 shufps $
0,%xmm6
,%xmm3
293 shufps $
0,%xmm7
,%xmm4
294 shufps $
136,%xmm4
,%xmm3
## 10001000 ;# all isaj in xmm3
297 movaps
%xmm2
,nb410_isaprod
(%rsp
)
299 mulps nb410_gbtsc
(%rsp
),%xmm1
300 movaps
%xmm1
,nb410_gbscale
(%rsp
)
302 movq nb410_charge
(%rbp
),%rsi
## base of charge[]
304 movss
(%rsi
,%rax
,4),%xmm3
305 movss
(%rsi
,%rcx
,4),%xmm4
306 movss
(%rsi
,%rbx
,4),%xmm6
307 movss
(%rsi
,%rdx
,4),%xmm7
309 mulps nb410_iq
(%rsp
),%xmm2
310 shufps $
0,%xmm6
,%xmm3
311 shufps $
0,%xmm7
,%xmm4
312 shufps $
136,%xmm4
,%xmm3
## 10001000 ;# all charges in xmm3
314 movaps
%xmm3
,nb410_qq
(%rsp
)
## read type[] for the four j indices into r12d-r15d
317 movq nb410_type
(%rbp
),%rsi
318 movl
(%rsi
,%rax
,4),%r12d
319 movl
(%rsi
,%rbx
,4),%r13d
320 movl
(%rsi
,%rcx
,4),%r14d
321 movl
(%rsi
,%rdx
,4),%r15d
326 movl nb410_ntia
(%rsp
),%edi
## load pairs of floats from vdwparam[] at the r12-r15 indices and
## separate them into the c6 and c12 vectors
332 movq nb410_vdwparam
(%rbp
),%rsi
333 movlps
(%rsi
,%r12,4),%xmm3
334 movlps
(%rsi
,%r14,4),%xmm7
335 movhps
(%rsi
,%r13,4),%xmm3
336 movhps
(%rsi
,%r15,4),%xmm7
339 shufps $
136,%xmm7
,%xmm0
## 10001000
340 shufps $
221,%xmm7
,%xmm3
## 11011101
342 movaps
%xmm0
,nb410_c6
(%rsp
)
343 movaps
%xmm3
,nb410_c12
(%rsp
)
345 movq nb410_pos
(%rbp
),%rsi
## base of pos[]
## r8-r11 = 3*jnr for the four j particles
347 lea
(%rax
,%rax
,2),%r8 ## jnr
348 lea
(%rbx
,%rbx
,2),%r9
349 lea
(%rcx
,%rcx
,2),%r10
350 lea
(%rdx
,%rdx
,2),%r11
352 ## move four coordinates to xmm0-xmm2
353 movlps
(%rsi
,%r8,4),%xmm4
354 movlps
(%rsi
,%r10,4),%xmm5
355 movss
8(%rsi
,%r8,4),%xmm2
356 movss
8(%rsi
,%r10,4),%xmm6
358 movhps
(%rsi
,%r9,4),%xmm4
359 movhps
(%rsi
,%r11,4),%xmm5
361 movss
8(%rsi
,%r9,4),%xmm0
362 movss
8(%rsi
,%r11,4),%xmm1
364 shufps $
0,%xmm0
,%xmm2
365 shufps $
0,%xmm1
,%xmm6
370 shufps $
136,%xmm6
,%xmm2
## 10001000
372 shufps $
136,%xmm5
,%xmm0
## 10001000
373 shufps $
221,%xmm5
,%xmm1
## 11011101
## subtract the i coordinates and keep the differences as dx, dy, dz
376 subps nb410_ix
(%rsp
),%xmm0
377 subps nb410_iy
(%rsp
),%xmm1
378 subps nb410_iz
(%rsp
),%xmm2
381 movaps
%xmm0
,nb410_dx
(%rsp
)
382 movaps
%xmm1
,nb410_dy
(%rsp
)
383 movaps
%xmm2
,nb410_dz
(%rsp
)
395 ## lookup seed in xmm5
398 movaps nb410_three
(%rsp
),%xmm1
399 mulps
%xmm4
,%xmm5
## rsq*lu*lu
400 movaps nb410_half
(%rsp
),%xmm0
401 subps
%xmm5
,%xmm1
## three-rsq*lu*lu
403 mulps
%xmm1
,%xmm0
## xmm0=rinv
404 mulps
%xmm0
,%xmm4
## xmm4=r
405 movaps
%xmm4
,nb410_r
(%rsp
)
406 mulps nb410_gbscale
(%rsp
),%xmm4
408 ## truncate and convert to integers
409 cvttps2dq
%xmm4
,%xmm5
411 ## convert back to float
417 ## move to integer registers
421 pshufd $
1,%xmm5
,%xmm5
422 pshufd $
1,%xmm7
,%xmm7
428 movaps
%xmm4
,%xmm1
##eps
430 movq nb410_GBtab
(%rbp
),%rsi
432 movaps
%xmm0
,%xmm9
## rinv
433 mulps
%xmm9
,%xmm9
## rinvsq
434 movaps
%xmm9
,%xmm10
## rinvsq
435 mulps
%xmm10
,%xmm10
## rinv4
436 mulps
%xmm9
,%xmm10
## rinv6
438 mulps
%xmm11
,%xmm11
## rinv12
## load the first two floats of each GBtab entry (indices in r12-r15)
441 movlps
(%rsi
,%r12,4),%xmm5
442 movlps
(%rsi
,%r14,4),%xmm7
443 movhps
(%rsi
,%r13,4),%xmm5
444 movhps
(%rsi
,%r15,4),%xmm7
447 shufps $
136,%xmm7
,%xmm4
## 10001000
448 shufps $
221,%xmm7
,%xmm5
## 11011101
450 mulps nb410_c6
(%rsp
),%xmm10
## vvdw6=c6*rinv6
451 mulps nb410_c12
(%rsp
),%xmm11
## vvdw12=c12*rinv12
454 subps
%xmm10
,%xmm11
## Vvdw=Vvdw12-Vvdw6
456 ## add potential to vvdwtot
457 addps nb410_Vvdwtot
(%rsp
),%xmm11
458 movaps
%xmm11
,nb410_Vvdwtot
(%rsp
)
## load the next two floats (byte offset 8) of each GBtab entry
460 movlps
8(%rsi
,%r12,4),%xmm7
461 movlps
8(%rsi
,%r14,4),%xmm8
462 movhps
8(%rsi
,%r13,4),%xmm7
463 movhps
8(%rsi
,%r15,4),%xmm8
467 shufps $
136,%xmm8
,%xmm6
## 10001000
468 shufps $
221,%xmm8
,%xmm7
## 11011101
469 ## table data ready in xmm4-xmm7
471 mulps
%xmm1
,%xmm7
## Heps
472 mulps
%xmm1
,%xmm6
## xmm6=Geps
473 mulps
%xmm1
,%xmm7
## Heps2
475 addps
%xmm7
,%xmm5
## xmm5=Fp
476 addps
%xmm7
,%xmm7
## two*Heps2
477 movaps nb410_qq
(%rsp
),%xmm3
479 addps
%xmm5
,%xmm7
## xmm7=FF
480 mulps
%xmm1
,%xmm5
## xmm5=eps*Fp
481 addps
%xmm4
,%xmm5
## xmm5=VV
482 mulps
%xmm3
,%xmm5
## vcoul=qq*VV
483 mulps
%xmm7
,%xmm3
## fijC=FF*qq
484 ## at this point xmm5 contains vcoul and xmm3 fijC
487 mulps nb410_six
(%rsp
),%xmm10
488 mulps nb410_twelve
(%rsp
),%xmm9
490 mulps
%xmm0
,%xmm9
## (12*vnb12-6*vnb6)*rinv
492 movq nb410_dvda
(%rbp
),%rsi
496 mulps nb410_gbscale
(%rsp
),%xmm3
498 mulps nb410_r
(%rsp
),%xmm6
501 ## increment vctot (sum in xmm12)
504 ## xmm6=(vcoul+fijC*r)
509 addps nb410_dvdasum
(%rsp
),%xmm7
510 movaps
%xmm7
,nb410_dvdasum
(%rsp
)
512 ## update j atoms dvdaj
516 shufps $
0x1,%xmm5
,%xmm5
517 shufps $
0x1,%xmm4
,%xmm4
519 ## xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4
520 addss
(%rsi
,%rax
,4),%xmm6
521 addss
(%rsi
,%rbx
,4),%xmm5
522 addss
(%rsi
,%rcx
,4),%xmm7
523 addss
(%rsi
,%rdx
,4),%xmm4
524 movss
%xmm6
,(%rsi
,%rax
,4)
525 movss
%xmm5
,(%rsi
,%rbx
,4)
526 movss
%xmm7
,(%rsi
,%rcx
,4)
527 movss
%xmm4
,(%rsi
,%rdx
,4)
530 mulps
%xmm0
,%xmm9
## fscal
## scale by the stored dx, dy, dz to form the force components
535 mulps nb410_dx
(%rsp
),%xmm9
536 mulps nb410_dy
(%rsp
),%xmm10
537 mulps nb410_dz
(%rsp
),%xmm11
539 ## accumulate i forces
544 movq nb410_faction
(%rbp
),%rsi
545 ## the fj's - start by accumulating x & y forces from memory
546 movlps
(%rsi
,%r8,4),%xmm0
## x1 y1 - -
547 movlps
(%rsi
,%r10,4),%xmm1
## x3 y3 - -
548 movhps
(%rsi
,%r9,4),%xmm0
## x1 y1 x2 y2
549 movhps
(%rsi
,%r11,4),%xmm1
## x3 y3 x4 y4
552 unpcklps
%xmm10
,%xmm9
## x1 y1 x2 y2
553 unpckhps
%xmm10
,%xmm8
## x3 y3 x4 y4
555 ## update fjx and fjy
559 movlps
%xmm0
,(%rsi
,%r8,4)
560 movlps
%xmm1
,(%rsi
,%r10,4)
561 movhps
%xmm0
,(%rsi
,%r9,4)
562 movhps
%xmm1
,(%rsi
,%r11,4)
564 ## xmm11: fjz1 fjz2 fjz3 fjz4
565 pshufd $
1,%xmm11
,%xmm10
## fjz2 - - -
566 movhlps
%xmm11
,%xmm9
## fjz3 - - -
567 pshufd $
3,%xmm11
,%xmm8
## fjz4 - - -
## add each z component to faction[] (byte offset 8) and store it back
569 addss
8(%rsi
,%r8,4),%xmm11
570 addss
8(%rsi
,%r9,4),%xmm10
571 addss
8(%rsi
,%r10,4),%xmm9
572 addss
8(%rsi
,%r11,4),%xmm8
573 movss
%xmm11
,8(%rsi
,%r8,4)
574 movss
%xmm10
,8(%rsi
,%r9,4)
575 movss
%xmm9
,8(%rsi
,%r10,4)
576 movss
%xmm8
,8(%rsi
,%r11,4)
578 ## should we do one more iteration?
579 subl $
4,nb410_innerk
(%rsp
)
580 jl _nb_kernel410_x86_64_sse.nb410_finish_inner
581 jmp _nb_kernel410_x86_64_sse.nb410_unroll_loop
582 _nb_kernel410_x86_64_sse.nb410_finish_inner
:
## Add 4 back to innerk (the unrolled loop subtracts 4 before branching
## here), then choose between the pair path and the single-particle check.
## NOTE(review): movl does not modify flags and no test instruction is
## visible before the jnz in this listing - confirm against the full file.
583 ## check if at least two particles remain
584 addl $
4,nb410_innerk
(%rsp
)
585 movl nb410_innerk
(%rsp
),%edx
587 jnz _nb_kernel410_x86_64_sse.nb410_dopair
588 jmp _nb_kernel410_x86_64_sse.nb410_checksingle
589 _nb_kernel410_x86_64_sse.nb410_dopair
:
## Two-particle tail of the inner loop: same sequence as the unrolled
## loop but for two j indices; the jjnr pointer is advanced by 8 bytes.
590 movq nb410_innerjjnr
(%rsp
),%rcx
594 addq $
8,nb410_innerjjnr
(%rsp
)
597 movq nb410_invsqrta
(%rbp
),%rsi
598 movss
(%rsi
,%rax
,4),%xmm2
599 movss
(%rsi
,%rbx
,4),%xmm6
602 mulps nb410_isai
(%rsp
),%xmm2
604 movaps
%xmm2
,nb410_isaprod
(%rsp
)
606 mulps nb410_gbtsc
(%rsp
),%xmm1
607 movaps
%xmm1
,nb410_gbscale
(%rsp
)
609 mulps nb410_iq
(%rsp
),%xmm2
610 movq nb410_charge
(%rbp
),%rsi
## base of charge[]
611 movss
(%rsi
,%rax
,4),%xmm3
612 movss
(%rsi
,%rbx
,4),%xmm6
617 movaps
%xmm3
,nb410_qq
(%rsp
)
## read type[] for the two j indices into r12d and r13d
620 movq nb410_type
(%rbp
),%rsi
621 movl
(%rsi
,%rax
,4),%r12d
622 movl
(%rsi
,%rbx
,4),%r13d
625 movl nb410_ntia
(%rsp
),%edi
629 movq nb410_vdwparam
(%rbp
),%rsi
630 movlps
(%rsi
,%r12,4),%xmm3
631 movhps
(%rsi
,%r13,4),%xmm3
635 shufps $
136,%xmm7
,%xmm0
## 10001000
636 shufps $
221,%xmm7
,%xmm3
## 11011101
638 movaps
%xmm0
,nb410_c6
(%rsp
)
639 movaps
%xmm3
,nb410_c12
(%rsp
)
641 movq nb410_pos
(%rbp
),%rsi
## base of pos[]
643 lea
(%rax
,%rax
,2),%r8 ## j3
644 lea
(%rbx
,%rbx
,2),%r9
646 ## load x/y/z of the two j particles into xmm4-xmm7
647 movlps
(%rsi
,%r8,4),%xmm4
## x1 y1 - -
648 movlps
(%rsi
,%r9,4),%xmm5
## x2 y2 - -
650 movss
8(%rsi
,%r8,4),%xmm6
## z1 - - -
651 movss
8(%rsi
,%r9,4),%xmm7
## z2 - - -
653 unpcklps
%xmm5
,%xmm4
## x1 x2 y1 y2
654 movhlps
%xmm4
,%xmm5
## y1 y2 - -
655 unpcklps
%xmm7
,%xmm6
## z1 z2 - -
## subtract the i coordinates and keep the differences as dx, dy, dz
658 subps nb410_ix
(%rsp
),%xmm4
659 subps nb410_iy
(%rsp
),%xmm5
660 subps nb410_iz
(%rsp
),%xmm6
663 movaps
%xmm4
,nb410_dx
(%rsp
)
664 movaps
%xmm5
,nb410_dy
(%rsp
)
665 movaps
%xmm6
,nb410_dz
(%rsp
)
676 ## lookup seed in xmm5
679 movaps nb410_three
(%rsp
),%xmm1
680 mulps
%xmm4
,%xmm5
## rsq*lu*lu
681 movaps nb410_half
(%rsp
),%xmm0
682 subps
%xmm5
,%xmm1
## three-rsq*lu*lu
684 mulps
%xmm1
,%xmm0
## xmm0=rinv
685 mulps
%xmm0
,%xmm4
## xmm4=r
686 movaps
%xmm4
,nb410_r
(%rsp
)
687 mulps nb410_gbscale
(%rsp
),%xmm4
689 ## truncate and convert to integers
690 cvttps2dq
%xmm4
,%xmm5
692 ## convert back to float
698 ## move to integer registers
700 pshufd $
1,%xmm5
,%xmm5
705 movaps
%xmm4
,%xmm1
##eps
707 movq nb410_GBtab
(%rbp
),%rsi
709 movaps
%xmm0
,%xmm9
## rinv
710 mulps
%xmm9
,%xmm9
## rinvsq
711 movaps
%xmm9
,%xmm10
## rinvsq
712 mulps
%xmm10
,%xmm10
## rinv4
713 mulps
%xmm9
,%xmm10
## rinv6
715 mulps
%xmm11
,%xmm11
## rinv12
718 movlps
(%rsi
,%r12,4),%xmm4
## Y1 F1
719 movlps
(%rsi
,%r13,4),%xmm5
## Y2 F2
720 unpcklps
%xmm5
,%xmm4
## Y1 Y2 F1 F2
721 movhlps
%xmm4
,%xmm5
## F1 F2
723 mulps nb410_c6
(%rsp
),%xmm10
## vvdw6=c6*rinv6
724 mulps nb410_c12
(%rsp
),%xmm11
## vvdw12=c12*rinv12
727 subps
%xmm10
,%xmm11
## Vvdw=Vvdw12-Vvdw6
729 ## add potential to vvdwtot
730 addps nb410_Vvdwtot
(%rsp
),%xmm11
## movlps writes only the low two elements back to Vvdwtot
731 movlps
%xmm11
,nb410_Vvdwtot
(%rsp
)
733 movlps
8(%rsi
,%r12,4),%xmm6
## G1 H1
734 movlps
8(%rsi
,%r13,4),%xmm7
## G2 H2
735 unpcklps
%xmm7
,%xmm6
## G1 G2
736 movhlps
%xmm6
,%xmm7
## H1 H2
737 ## table data ready in xmm4-xmm7
739 mulps
%xmm1
,%xmm7
## Heps
740 mulps
%xmm1
,%xmm6
## xmm6=Geps
741 mulps
%xmm1
,%xmm7
## Heps2
743 addps
%xmm7
,%xmm5
## xmm5=Fp
744 addps
%xmm7
,%xmm7
## two*Heps2
745 movaps nb410_qq
(%rsp
),%xmm3
748 addps
%xmm5
,%xmm7
## xmm7=FF
749 mulps
%xmm1
,%xmm5
## xmm5=eps*Fp
750 addps
%xmm4
,%xmm5
## xmm5=VV
751 mulps
%xmm3
,%xmm5
## vcoul=qq*VV
752 mulps
%xmm7
,%xmm3
## fijC=FF*qq
753 ## at this point xmm5 contains vcoul and xmm3 fijC
756 mulps nb410_six
(%rsp
),%xmm10
757 mulps nb410_twelve
(%rsp
),%xmm9
759 mulps
%xmm0
,%xmm9
## (12*vnb12-6*vnb6)*rinv
761 ## zero upper part of vcoul
765 movq nb410_dvda
(%rbp
),%rsi
769 mulps nb410_gbscale
(%rsp
),%xmm3
771 mulps nb410_r
(%rsp
),%xmm6
775 ## increment vctot (sum in xmm12)
778 ## xmm6=(vcoul+fijC*r)
782 ## zero upper half of dvda
786 addps nb410_dvdasum
(%rsp
),%xmm7
787 movaps
%xmm7
,nb410_dvdasum
(%rsp
)
789 ## update j atoms dvdaj
791 shufps $
0x1,%xmm5
,%xmm5
793 ## xmm6=dvdaj1 xmm5=dvdaj2
794 addss
(%rsi
,%rax
,4),%xmm6
795 addss
(%rsi
,%rbx
,4),%xmm5
796 movss
%xmm6
,(%rsi
,%rax
,4)
797 movss
%xmm5
,(%rsi
,%rbx
,4)
802 mulps
%xmm0
,%xmm9
## fscal
807 mulps nb410_dx
(%rsp
),%xmm9
808 mulps nb410_dy
(%rsp
),%xmm10
809 mulps nb410_dz
(%rsp
),%xmm11
815 ## accumulate i forces
820 movq nb410_faction
(%rbp
),%rsi
821 ## the fj's - start by accumulating x & y forces from memory
822 movlps
(%rsi
,%r8,4),%xmm0
## x1 y1 - -
823 movhps
(%rsi
,%r9,4),%xmm0
## x1 y1 x2 y2
825 unpcklps
%xmm10
,%xmm9
## x1 y1 x2 y2
828 movlps
%xmm0
,(%rsi
,%r8,4)
829 movhps
%xmm0
,(%rsi
,%r9,4)
## z components: element 0 of xmm11 for j1, element 1 (via pshufd) for j2
832 pshufd $
1,%xmm11
,%xmm8
833 addss
8(%rsi
,%r8,4),%xmm11
834 addss
8(%rsi
,%r9,4),%xmm8
835 movss
%xmm11
,8(%rsi
,%r8,4)
836 movss
%xmm8
,8(%rsi
,%r9,4)
838 _nb_kernel410_x86_64_sse.nb410_checksingle
:
## Reload innerk and branch to the single-particle path or skip to the
## outer-data update.
## NOTE(review): movl does not modify flags and no test instruction is
## visible before the jnz in this listing - confirm against the full file.
839 movl nb410_innerk
(%rsp
),%edx
841 jnz _nb_kernel410_x86_64_sse.nb410_dosingle
842 jmp _nb_kernel410_x86_64_sse.nb410_updateouterdata
843 _nb_kernel410_x86_64_sse.nb410_dosingle
:
## Single-particle tail of the inner loop: the same sequence as the
## vector paths, carried out mostly with scalar (ss) instructions.
844 movq nb410_charge
(%rbp
),%rsi
845 movq nb410_invsqrta
(%rbp
),%rdx
846 movq nb410_pos
(%rbp
),%rdi
847 movq nb410_innerjjnr
(%rsp
),%rcx
851 movq nb410_invsqrta
(%rbp
),%rsi
852 movss
(%rsi
,%rax
,4),%xmm3
853 movaps nb410_isai
(%rsp
),%xmm2
856 movss
%xmm2
,nb410_isaprod
(%rsp
)
858 mulss nb410_gbtsc
(%rsp
),%xmm1
859 movss
%xmm1
,nb410_gbscale
(%rsp
)
861 mulss nb410_iq
(%rsp
),%xmm2
862 movq nb410_charge
(%rbp
),%rsi
## base of charge[]
864 movss
(%rsi
,%rax
,4),%xmm3
866 movss
%xmm3
,nb410_qq
(%rsp
)
869 movq nb410_type
(%rbp
),%rsi
870 movl
(%rsi
,%rax
,4),%r12d
872 movl nb410_ntia
(%rsp
),%edi
## two consecutive floats from vdwparam[] become c6 and c12
875 movq nb410_vdwparam
(%rbp
),%rsi
876 movss
(%rsi
,%r12,4),%xmm0
877 movss
4(%rsi
,%r12,4),%xmm3
878 movaps
%xmm0
,nb410_c6
(%rsp
)
879 movaps
%xmm3
,nb410_c12
(%rsp
)
881 movq nb410_pos
(%rbp
),%rsi
## base of pos[]
883 lea
(%rax
,%rax
,2),%r8 ## jnr
885 ## load x, y, z of the single j particle into xmm4-xmm6
886 movss
(%rsi
,%r8,4),%xmm4
887 movss
4(%rsi
,%r8,4),%xmm5
888 movss
8(%rsi
,%r8,4),%xmm6
891 subss nb410_ix
(%rsp
),%xmm4
892 subss nb410_iy
(%rsp
),%xmm5
893 subss nb410_iz
(%rsp
),%xmm6
896 movaps
%xmm4
,nb410_dx
(%rsp
)
897 movaps
%xmm5
,nb410_dy
(%rsp
)
898 movaps
%xmm6
,nb410_dz
(%rsp
)
909 ## lookup seed in xmm5
912 movaps nb410_three
(%rsp
),%xmm1
913 mulss
%xmm4
,%xmm5
## rsq*lu*lu
914 movaps nb410_half
(%rsp
),%xmm0
915 subss
%xmm5
,%xmm1
## three-rsq*lu*lu
917 mulss
%xmm1
,%xmm0
## xmm0=rinv
918 mulss
%xmm0
,%xmm4
## xmm4=r
919 movaps
%xmm4
,nb410_r
(%rsp
)
920 mulss nb410_gbscale
(%rsp
),%xmm4
922 ## truncate and convert to integers
923 cvttss2si
%xmm4
,%r12d
925 ## convert back to float
933 movaps
%xmm4
,%xmm1
##eps
935 movq nb410_GBtab
(%rbp
),%rsi
937 movaps
%xmm0
,%xmm9
## rinv
938 mulss
%xmm9
,%xmm9
## rinvsq
939 movaps
%xmm9
,%xmm10
## rinvsq
940 mulss
%xmm10
,%xmm10
## rinv4
941 mulss
%xmm9
,%xmm10
## rinv6
943 mulss
%xmm11
,%xmm11
## rinv12
## four consecutive floats of the GBtab entry at index r12
946 movss
(%rsi
,%r12,4),%xmm4
947 movss
4(%rsi
,%r12,4),%xmm5
948 movss
8(%rsi
,%r12,4),%xmm6
949 movss
12(%rsi
,%r12,4),%xmm7
950 ## table data ready in xmm4-xmm7
952 mulss nb410_c6
(%rsp
),%xmm10
## vvdw6=c6*rinv6
953 mulss nb410_c12
(%rsp
),%xmm11
## vvdw12=c12*rinv12
956 subss
%xmm10
,%xmm11
## Vvdw=Vvdw12-Vvdw6
958 ## add potential to vvdwtot
959 addss nb410_Vvdwtot
(%rsp
),%xmm11
960 movss
%xmm11
,nb410_Vvdwtot
(%rsp
)
962 mulss
%xmm1
,%xmm7
## Heps
963 mulss
%xmm1
,%xmm6
## xmm6=Geps
964 mulss
%xmm1
,%xmm7
## Heps2
966 addss
%xmm7
,%xmm5
## xmm5=Fp
967 addss
%xmm7
,%xmm7
## two*Heps2
968 movss nb410_qq
(%rsp
),%xmm3
970 addss
%xmm5
,%xmm7
## xmm7=FF
971 mulss
%xmm1
,%xmm5
## xmm5=eps*Fp
972 addss
%xmm4
,%xmm5
## xmm5=VV
973 mulss
%xmm3
,%xmm5
## vcoul=qq*VV
974 mulss
%xmm7
,%xmm3
## fijC=FF*qq
975 ## at this point xmm5 contains vcoul and xmm3 fijC
978 mulss nb410_six
(%rsp
),%xmm10
979 mulss nb410_twelve
(%rsp
),%xmm9
981 mulss
%xmm0
,%xmm9
## (12*vnb12-6*vnb6)*rinv
983 movq nb410_dvda
(%rbp
),%rsi
987 mulss nb410_gbscale
(%rsp
),%xmm3
989 mulss nb410_r
(%rsp
),%xmm6
992 ## increment vctot (sum in xmm12)
995 ## xmm6=(vcoul+fijC*r)
1000 addss nb410_dvdasum
(%rsp
),%xmm7
1001 movss
%xmm7
,nb410_dvdasum
(%rsp
)
1003 ## update j atoms dvdaj
1004 addss
(%rsi
,%rax
,4),%xmm6
1005 movss
%xmm6
,(%rsi
,%rax
,4)
1008 mulss
%xmm0
,%xmm9
## fscal
1013 mulss nb410_dx
(%rsp
),%xmm9
1014 mulss nb410_dy
(%rsp
),%xmm10
1015 mulss nb410_dz
(%rsp
),%xmm11
1017 ## accumulate i forces
1022 movq nb410_faction
(%rbp
),%rsi
## add the three components to faction[] at index r8 and store them back
1024 addss
(%rsi
,%r8,4),%xmm9
1025 addss
4(%rsi
,%r8,4),%xmm10
1026 addss
8(%rsi
,%r8,4),%xmm11
1027 movss
%xmm9
,(%rsi
,%r8,4)
1028 movss
%xmm10
,4(%rsi
,%r8,4)
1029 movss
%xmm11
,8(%rsi
,%r8,4)
1031 _nb_kernel410_x86_64_sse.nb410_updateouterdata
:
## Write back the per-i results: three floats to faction[] at index ii3
## and to fshift[] at index is3, then scalar updates of Vc[gid[n]],
## Vvdw[gid[n]] and dvda[ii]; finally decide whether to run the outer
## loop again.
1032 movl nb410_ii3
(%rsp
),%ecx
1033 movq nb410_faction
(%rbp
),%rdi
1034 movq nb410_fshift
(%rbp
),%rsi
1035 movl nb410_is3
(%rsp
),%edx
1037 ## accumulate i forces in xmm13, xmm14, xmm15
1038 movhlps
%xmm13
,%xmm0
1039 movhlps
%xmm14
,%xmm1
1040 movhlps
%xmm15
,%xmm2
1047 shufps $
1,%xmm3
,%xmm3
1048 shufps $
1,%xmm4
,%xmm4
1049 shufps $
1,%xmm5
,%xmm5
1052 addss
%xmm5
,%xmm2
## xmm0-xmm2 has single force in pos0
1055 ## increment i force
1056 movss
(%rdi
,%rcx
,4),%xmm3
1057 movss
4(%rdi
,%rcx
,4),%xmm4
1058 movss
8(%rdi
,%rcx
,4),%xmm5
1062 movss
%xmm3
,(%rdi
,%rcx
,4)
1063 movss
%xmm4
,4(%rdi
,%rcx
,4)
1064 movss
%xmm5
,8(%rdi
,%rcx
,4)
1066 ## increment fshift force
1067 movss
(%rsi
,%rdx
,4),%xmm3
1068 movss
4(%rsi
,%rdx
,4),%xmm4
1069 movss
8(%rsi
,%rdx
,4),%xmm5
1073 movss
%xmm3
,(%rsi
,%rdx
,4)
1074 movss
%xmm4
,4(%rsi
,%rdx
,4)
1075 movss
%xmm5
,8(%rsi
,%rdx
,4)
1078 movl nb410_n
(%rsp
),%esi
1079 ## get group index for i particle
1080 movq nb410_gid
(%rbp
),%rdx
## base of gid[]
1081 movl
(%rdx
,%rsi
,4),%edx
## ggid=gid[n]
1083 ## accumulate total potential energy and update it
1085 movhlps
%xmm12
,%xmm6
1086 addps
%xmm6
,%xmm12
## pos 0-1 in xmm12 have the sum now
1088 shufps $
1,%xmm6
,%xmm6
1091 ## add earlier value from mem
1092 movq nb410_Vc
(%rbp
),%rax
1093 addss
(%rax
,%rdx
,4),%xmm12
1095 movss
%xmm12
,(%rax
,%rdx
,4)
1097 ## accumulate total lj energy and update it
1098 movaps nb410_Vvdwtot
(%rsp
),%xmm7
1101 addps
%xmm6
,%xmm7
## pos 0-1 in xmm7 have the sum now
1103 shufps $
1,%xmm6
,%xmm6
1106 ## add earlier value from mem
1107 movq nb410_Vvdw
(%rbp
),%rax
1108 addss
(%rax
,%rdx
,4),%xmm7
1110 movss
%xmm7
,(%rax
,%rdx
,4)
1112 ## accumulate dVda and update it
1113 movaps nb410_dvdasum
(%rsp
),%xmm7
1116 addps
%xmm6
,%xmm7
## pos 0-1 in xmm7 have the sum now
1118 shufps $
1,%xmm6
,%xmm6
## dvda[] is indexed by ii here, not by the group index
1121 movl nb410_ii
(%rsp
),%edx
1122 movq nb410_dvda
(%rbp
),%rax
1123 addss
(%rax
,%rdx
,4),%xmm7
1124 movss
%xmm7
,(%rax
,%rdx
,4)
1127 movl nb410_nn1
(%rsp
),%ecx
1128 ## esi already loaded with n
1131 jz _nb_kernel410_x86_64_sse.nb410_outerend
1133 ## not last, iterate outer loop once more!
1134 movl
%esi
,nb410_n
(%rsp
)
1135 jmp _nb_kernel410_x86_64_sse.nb410_outer
1136 _nb_kernel410_x86_64_sse.nb410_outerend
:
## End of one work unit: either finish or go back to threadloop for more.
## NOTE(review): the instruction that sets the flags for the jz is not
## visible in this listing - confirm against the full file.
1137 ## check if more outer neighborlists remain
1138 movl nb410_nri
(%rsp
),%ecx
1139 ## esi already loaded with n above
1141 jz _nb_kernel410_x86_64_sse.nb410_end
1142 ## non-zero, do one more workunit
1143 jmp _nb_kernel410_x86_64_sse.nb410_threadloop
1144 _nb_kernel410_x86_64_sse.nb410_end
:
## Load the nouter/ninner totals and the outeriter/inneriter entries from
## the %rbp area.
## NOTE(review): no stores through those pointers, stack restore or return
## are visible in this listing - confirm against the full file.
1146 movl nb410_nouter
(%rsp
),%eax
1147 movl nb410_ninner
(%rsp
),%ebx
1148 movq nb410_outeriter
(%rbp
),%rcx
1149 movq nb410_inneriter
(%rbp
),%rdx
1168 .globl nb_kernel410nf_x86_64_sse
1169 .globl _nb_kernel410nf_x86_64_sse
1170 nb_kernel410nf_x86_64_sse
:
1171 _nb_kernel410nf_x86_64_sse
:
## Entry point of the nb410nf kernel, exported both with and without a
## leading underscore.
## NOTE(review): the number that starts each statement is not contiguous,
## so some statements (including several nb410nf_* offsets used below)
## appear to be absent from this listing - confirm against the full file.
1172 ## Room for return address and rbp (16 bytes)
## The offsets in this first group are used with %rbp where referenced below.
1173 .set nb410nf_fshift, 16
1174 .set nb410nf_gid, 24
1175 .set nb410nf_pos, 32
1176 .set nb410nf_faction, 40
1177 .set nb410nf_charge, 48
1178 .set nb410nf_p_facel, 56
1179 .set nb410nf_argkrf, 64
1180 .set nb410nf_argcrf, 72
1182 .set nb410nf_type, 88
1183 .set nb410nf_p_ntype, 96
1184 .set nb410nf_vdwparam, 104
1185 .set nb410nf_Vvdw, 112
1186 .set nb410nf_p_tabscale, 120
1187 .set nb410nf_VFtab, 128
1188 .set nb410nf_invsqrta, 136
1189 .set nb410nf_dvda, 144
1190 .set nb410nf_p_gbtabscale, 152
1191 .set nb410nf_GBtab, 160
1192 .set nb410nf_p_nthreads, 168
1193 .set nb410nf_count, 176
1194 .set nb410nf_mtx, 184
1195 .set nb410nf_outeriter, 192
1196 .set nb410nf_inneriter, 200
1197 .set nb410nf_work, 208
1198 ## stack offsets for local variables
1199 ## bottom of stack is cache-aligned for sse use
## The offsets in this second group are used with %rsp where referenced below.
1204 .set nb410nf_gbtsc, 64
1207 .set nb410nf_c12, 112
1208 .set nb410nf_vctot, 128
1209 .set nb410nf_Vvdwtot, 144
1210 .set nb410nf_half, 160
1211 .set nb410nf_three, 176
1212 .set nb410nf_isai, 192
1213 .set nb410nf_isaprod, 208
1214 .set nb410nf_gbscale, 224
1215 .set nb410nf_nri, 240
1216 .set nb410nf_iinr, 248
1217 .set nb410nf_jindex, 256
1218 .set nb410nf_jjnr, 264
1219 .set nb410nf_shift, 272
1220 .set nb410nf_shiftvec, 280
1221 .set nb410nf_facel, 288
1222 .set nb410nf_innerjjnr, 296
1223 .set nb410nf_is3, 304
1224 .set nb410nf_ii3, 308
1225 .set nb410nf_ntia, 312
1226 .set nb410nf_innerk, 316
1228 .set nb410nf_nn1, 324
1229 .set nb410nf_ntype, 328
1230 .set nb410nf_nouter, 332
1231 .set nb410nf_ninner, 336
1245 subq $
360,%rsp
## local variable stack space (n*16+8)
1247 ## zero 32-bit iteration counters
1249 movl
%eax
,nb410nf_nouter
(%rsp
)
1250 movl
%eax
,nb410nf_ninner
(%rsp
)
## save the inputs held in edi, rsi, rdx, rcx, r8 and r9 in local slots
## (presumably the six register arguments of the SysV AMD64 ABI - confirm)
1253 movl
%edi
,nb410nf_nri
(%rsp
)
1254 movq
%rsi
,nb410nf_iinr
(%rsp
)
1255 movq
%rdx
,nb410nf_jindex
(%rsp
)
1256 movq
%rcx
,nb410nf_jjnr
(%rsp
)
1257 movq
%r8,nb410nf_shift
(%rsp
)
1258 movq
%r9,nb410nf_shiftvec
(%rsp
)
## fetch the p_ntype, p_facel and p_gbtabscale entries from the %rbp area
## and keep local copies of ntype, facel and the splatted gbtsc vector
1259 movq nb410nf_p_ntype
(%rbp
),%rdi
1261 movl
%edi
,nb410nf_ntype
(%rsp
)
1262 movq nb410nf_p_facel
(%rbp
),%rsi
1264 movss
%xmm0
,nb410nf_facel
(%rsp
)
1266 movq nb410nf_p_gbtabscale
(%rbp
),%rbx
1268 shufps $
0,%xmm4
,%xmm4
1269 movaps
%xmm4
,nb410nf_gbtsc
(%rsp
)
1272 ## create constant floating-point factors on stack
1273 movl $
0x3f000000,%eax
## half in IEEE (hex)
1274 movl
%eax
,nb410nf_half
(%rsp
)
1275 movss nb410nf_half
(%rsp
),%xmm1
1276 shufps $
0,%xmm1
,%xmm1
## splat to all elements
1278 addps
%xmm2
,%xmm2
## one
1280 addps
%xmm2
,%xmm2
## two
1281 addps
%xmm2
,%xmm3
## three
## write the vector constants to their local slots
1282 movaps
%xmm1
,nb410nf_half
(%rsp
)
1283 movaps
%xmm3
,nb410nf_three
(%rsp
)
1285 _nb_kernel410nf_x86_64_sse.nb410nf_threadloop
:
## Claim the next work unit: advance the counter at *count with cmpxchgl,
## retrying until the exchange succeeds, then clamp the upper bound.
## NOTE(review): no `lock` prefix is visible before cmpxchgl in this
## listing; without it the update is not atomic across CPUs - confirm
## against the full file.
1286 movq nb410nf_count
(%rbp
),%rsi
## pointer to sync counter
1288 _nb_kernel410nf_x86_64_sse.nb410nf_spinlock
:
1289 movl
%eax
,%ebx
## ebx=*count=nn0
1290 addl $
1,%ebx
## ebx=nn1=nn0+1
1292 cmpxchgl
%ebx
,(%rsi
) ## write nn1 to *counter,
1293 ## if it hasn't changed.
1294 ## or reread *counter to eax.
1295 pause
## -> better p4 performance
1296 jnz _nb_kernel410nf_x86_64_sse.nb410nf_spinlock
1298 ## if(nn1>nri) nn1=nri
1299 movl nb410nf_nri
(%rsp
),%ecx
1302 cmovlel
%edx
,%ebx
## if(nn1>nri) nn1=nri
1303 ## Cleared the spinlock if we got here.
1304 ## eax contains nn0, ebx contains nn1.
1305 movl
%eax
,nb410nf_n
(%rsp
)
1306 movl
%ebx
,nb410nf_nn1
(%rsp
)
1307 subl
%eax
,%ebx
## calc number of outer lists
1308 movl
%eax
,%esi
## copy n to esi
## movl leaves the flags alone, so jg tests the result of the subl above
1309 jg _nb_kernel410nf_x86_64_sse.nb410nf_outerstart
1310 jmp _nb_kernel410nf_x86_64_sse.nb410nf_end
1312 _nb_kernel410nf_x86_64_sse.nb410nf_outerstart
:
## Add this work unit's outer-list count to the running nouter total.
1313 ## ebx contains number of outer iterations
1314 addl nb410nf_nouter
(%rsp
),%ebx
1315 movl
%ebx
,nb410nf_nouter
(%rsp
)
1317 _nb_kernel410nf_x86_64_sse.nb410nf_outer
:
## Per-i-particle setup for list entry n (index in rsi): read shift[n] and
## the matching shift vector, iinr[n], then charge, invsqrta, type and
## position of particle ii; store splatted copies in local slots and work
## out the inner-loop length from jindex[n] and jindex[n+1].
1318 movq nb410nf_shift
(%rsp
),%rax
## rax = pointer into shift[]
1319 movl
(%rax
,%rsi
,4),%ebx
## ebx=shift[n]
1321 lea
(%rbx
,%rbx
,2),%rbx
## rbx=3*is
1322 movl
%ebx
,nb410nf_is3
(%rsp
) ## store is3
1324 movq nb410nf_shiftvec
(%rsp
),%rax
## rax = base of shiftvec[]
## xmm0-xmm2 = the three floats at shiftvec[is3]
1326 movss
(%rax
,%rbx
,4),%xmm0
1327 movss
4(%rax
,%rbx
,4),%xmm1
1328 movss
8(%rax
,%rbx
,4),%xmm2
1330 movq nb410nf_iinr
(%rsp
),%rcx
## rcx = pointer into iinr[]
1331 movl
(%rcx
,%rsi
,4),%ebx
## ebx =ii
## xmm3 = charge[ii]*facel, splatted to all four elements
1333 movq nb410nf_charge
(%rbp
),%rdx
1334 movss
(%rdx
,%rbx
,4),%xmm3
1335 mulss nb410nf_facel
(%rsp
),%xmm3
1336 shufps $
0,%xmm3
,%xmm3
1338 movq nb410nf_invsqrta
(%rbp
),%rdx
## load invsqrta[ii]
1339 movss
(%rdx
,%rbx
,4),%xmm4
1340 shufps $
0,%xmm4
,%xmm4
## ntia = type[ii] * ntype
1342 movq nb410nf_type
(%rbp
),%rdx
1343 movl
(%rdx
,%rbx
,4),%edx
1344 imull nb410nf_ntype
(%rsp
),%edx
1346 movl
%edx
,nb410nf_ntia
(%rsp
)
1348 lea
(%rbx
,%rbx
,2),%rbx
## rbx = 3*ii=ii3
1349 movq nb410nf_pos
(%rbp
),%rax
## rax = base of pos[]
## add the i position to the shift vector components in xmm0-xmm2
1351 addss
(%rax
,%rbx
,4),%xmm0
1352 addss
4(%rax
,%rbx
,4),%xmm1
1353 addss
8(%rax
,%rbx
,4),%xmm2
1355 movaps
%xmm3
,nb410nf_iq
(%rsp
)
1356 movaps
%xmm4
,nb410nf_isai
(%rsp
)
## splat the shifted i coordinates and store them as ix, iy, iz
1358 shufps $
0,%xmm0
,%xmm0
1359 shufps $
0,%xmm1
,%xmm1
1360 shufps $
0,%xmm2
,%xmm2
1362 movaps
%xmm0
,nb410nf_ix
(%rsp
)
1363 movaps
%xmm1
,nb410nf_iy
(%rsp
)
1364 movaps
%xmm2
,nb410nf_iz
(%rsp
)
1366 movl
%ebx
,nb410nf_ii3
(%rsp
)
## initialise the vctot and Vvdwtot accumulators from xmm4
1370 movaps
%xmm4
,nb410nf_vctot
(%rsp
)
1371 movaps
%xmm4
,nb410nf_Vvdwtot
(%rsp
)
1373 movq nb410nf_jindex
(%rsp
),%rax
1374 movl
(%rax
,%rsi
,4),%ecx
## jindex[n]
1375 movl
4(%rax
,%rsi
,4),%edx
## jindex[n+1]
1376 subl
%ecx
,%edx
## number of innerloop atoms
1378 movq nb410nf_pos
(%rbp
),%rsi
1379 movq nb410nf_faction
(%rbp
),%rdi
1380 movq nb410nf_jjnr
(%rsp
),%rax
1383 movq
%rax
,nb410nf_innerjjnr
(%rsp
) ## pointer to jjnr[nj0]
## accumulate into the running ninner counter
1386 addl nb410nf_ninner
(%rsp
),%ecx
1387 movl
%ecx
,nb410nf_ninner
(%rsp
)
1389 movl
%edx
,nb410nf_innerk
(%rsp
) ## number of innerloop atoms
1390 jge _nb_kernel410nf_x86_64_sse.nb410nf_unroll_loop
1391 jmp _nb_kernel410nf_x86_64_sse.nb410nf_finish_inner
1392 _nb_kernel410nf_x86_64_sse.nb410nf_unroll_loop
:
1393 ## quad-unroll innerloop here
1394 movq nb410nf_innerjjnr
(%rsp
),%rdx
## pointer to jjnr[k]
1398 movl
12(%rdx
),%edx
## eax-edx=jnr1-4
1399 addq $
16,nb410nf_innerjjnr
(%rsp
) ## advance pointer (unrolled 4)
1402 movq nb410nf_invsqrta
(%rbp
),%rsi
1403 movss
(%rsi
,%rax
,4),%xmm3
1404 movss
(%rsi
,%rcx
,4),%xmm4
1405 movss
(%rsi
,%rbx
,4),%xmm6
1406 movss
(%rsi
,%rdx
,4),%xmm7
1407 movaps nb410nf_isai
(%rsp
),%xmm2
1408 shufps $
0,%xmm6
,%xmm3
1409 shufps $
0,%xmm7
,%xmm4
1410 shufps $
136,%xmm4
,%xmm3
## 10001000 ;# all charges in xmm3
1413 movaps
%xmm2
,nb410nf_isaprod
(%rsp
)
1415 mulps nb410nf_gbtsc
(%rsp
),%xmm1
1416 movaps
%xmm1
,nb410nf_gbscale
(%rsp
)
1418 movq nb410nf_charge
(%rbp
),%rsi
## base of charge[]
1420 movss
(%rsi
,%rax
,4),%xmm3
1421 movss
(%rsi
,%rcx
,4),%xmm4
1422 movss
(%rsi
,%rbx
,4),%xmm6
1423 movss
(%rsi
,%rdx
,4),%xmm7
1425 mulps nb410nf_iq
(%rsp
),%xmm2
1426 shufps $
0,%xmm6
,%xmm3
1427 shufps $
0,%xmm7
,%xmm4
1428 shufps $
136,%xmm4
,%xmm3
## 10001000 ;# all charges in xmm3
1430 movaps
%xmm3
,nb410nf_qq
(%rsp
)
1437 movq nb410nf_type
(%rbp
),%rsi
1438 movl
(%rsi
,%rax
,4),%eax
1439 movl
(%rsi
,%rbx
,4),%ebx
1440 movl
(%rsi
,%rcx
,4),%ecx
1441 movl
(%rsi
,%rdx
,4),%edx
1442 movq nb410nf_vdwparam
(%rbp
),%rsi
1447 movl nb410nf_ntia
(%rsp
),%edi
1453 movlps
(%rsi
,%rax
,4),%xmm6
1454 movlps
(%rsi
,%rcx
,4),%xmm7
1455 movhps
(%rsi
,%rbx
,4),%xmm6
1456 movhps
(%rsi
,%rdx
,4),%xmm7
1459 shufps $
136,%xmm7
,%xmm4
## 10001000
1460 shufps $
221,%xmm7
,%xmm6
## 11011101
1467 movaps
%xmm4
,nb410nf_c6
(%rsp
)
1468 movaps
%xmm6
,nb410nf_c12
(%rsp
)
1470 movq nb410nf_pos
(%rbp
),%rsi
## base of pos[]
1472 lea
(%rax
,%rax
,2),%rax
## replace jnr with j3
1473 lea
(%rbx
,%rbx
,2),%rbx
1475 lea
(%rcx
,%rcx
,2),%rcx
## replace jnr with j3
1476 lea
(%rdx
,%rdx
,2),%rdx
1478 ## move four coordinates to xmm0-xmm2
1480 movlps
(%rsi
,%rax
,4),%xmm4
1481 movlps
(%rsi
,%rcx
,4),%xmm5
1482 movss
8(%rsi
,%rax
,4),%xmm2
1483 movss
8(%rsi
,%rcx
,4),%xmm6
1485 movhps
(%rsi
,%rbx
,4),%xmm4
1486 movhps
(%rsi
,%rdx
,4),%xmm5
1488 movss
8(%rsi
,%rbx
,4),%xmm0
1489 movss
8(%rsi
,%rdx
,4),%xmm1
1491 shufps $
0,%xmm0
,%xmm2
1492 shufps $
0,%xmm1
,%xmm6
1497 shufps $
136,%xmm6
,%xmm2
## 10001000
1499 shufps $
136,%xmm5
,%xmm0
## 10001000
1500 shufps $
221,%xmm5
,%xmm1
## 11011101
1502 ## move ix-iz to xmm4-xmm6
1503 movaps nb410nf_ix
(%rsp
),%xmm4
1504 movaps nb410nf_iy
(%rsp
),%xmm5
1505 movaps nb410nf_iz
(%rsp
),%xmm6
1521 ## lookup seed in xmm5
1524 movaps nb410nf_three
(%rsp
),%xmm1
1525 mulps
%xmm4
,%xmm5
## rsq*lu*lu
1526 movaps nb410nf_half
(%rsp
),%xmm0
1527 subps
%xmm5
,%xmm1
## three-rsq*lu*lu (xmm1 was loaded from nb410nf_three)
1529 mulps
%xmm1
,%xmm0
## xmm0=rinv
1530 mulps
%xmm0
,%xmm4
## xmm4=r
1531 mulps nb410nf_gbscale
(%rsp
),%xmm4
1534 cvttps2pi
%xmm4
,%mm6
1535 cvttps2pi
%xmm5
,%mm7
## mm6/mm7 contain lu indices
1540 movaps
%xmm4
,%xmm1
## xmm1=eps
1542 mulps
%xmm2
,%xmm2
## xmm2=eps2
1551 movq nb410nf_GBtab
(%rbp
),%rsi
1559 ## load coulomb table
1560 movaps
(%rsi
,%rax
,4),%xmm4
1561 movaps
(%rsi
,%rbx
,4),%xmm5
1562 movaps
(%rsi
,%rcx
,4),%xmm6
1563 movaps
(%rsi
,%rdx
,4),%xmm7
1564 ## transpose, using xmm3 for scratch
1566 shufps $
0xEE,%xmm7
,%xmm3
1567 shufps $
0x44,%xmm7
,%xmm6
1569 shufps $
0xEE,%xmm5
,%xmm7
1570 shufps $
0x44,%xmm5
,%xmm4
1572 shufps $
0xDD,%xmm6
,%xmm5
1573 shufps $
0x88,%xmm6
,%xmm4
1575 shufps $
0x88,%xmm3
,%xmm6
1576 shufps $
0xDD,%xmm3
,%xmm7
1577 ## coulomb table ready, in xmm4-xmm7
1578 mulps
%xmm1
,%xmm6
## xmm6=Geps
1579 mulps
%xmm2
,%xmm7
## xmm7=Heps2
1582 addps
%xmm7
,%xmm5
## xmm5=Fp
1583 movaps nb410nf_qq
(%rsp
),%xmm3
1584 mulps
%xmm1
,%xmm5
## xmm5=eps*Fp
1585 addps
%xmm4
,%xmm5
## xmm5=VV
1586 mulps
%xmm3
,%xmm5
## vcoul=qq*VV
1588 addps nb410nf_vctot
(%rsp
),%xmm5
1589 movaps
%xmm5
,nb410nf_vctot
(%rsp
)
1593 mulps
%xmm0
,%xmm4
## xmm4=rinvsq
1598 mulps
%xmm4
,%xmm6
## xmm6=rinvsix
1600 mulps
%xmm4
,%xmm4
## xmm4=rinvtwelve
1601 mulps nb410nf_c6
(%rsp
),%xmm6
1602 mulps nb410nf_c12
(%rsp
),%xmm4
1603 movaps nb410nf_Vvdwtot
(%rsp
),%xmm7
1606 movaps
%xmm7
,nb410nf_Vvdwtot
(%rsp
)
1608 ## should we do one more iteration?
1609 subl $
4,nb410nf_innerk
(%rsp
)
1610 jl _nb_kernel410nf_x86_64_sse.nb410nf_finish_inner
1611 jmp _nb_kernel410nf_x86_64_sse.nb410nf_unroll_loop
## finish_inner: entered through the `jl` that follows `subl $4,nb410nf_innerk`
## at the bottom of the unrolled loop, i.e. once that counter has gone negative.
## Adds the 4 back, reloads the counter into edx and dispatches to the
## two-particle path (dopair) or to the single-particle check (checksingle).
1612 _nb_kernel410nf_x86_64_sse.nb410nf_finish_inner
:
1613 ## check if at least two particles remain
## undo the subtraction done by the unrolled loop
1614 addl $
4,nb410nf_innerk
(%rsp
)
1615 movl nb410nf_innerk
(%rsp
),%edx
## NOTE(review): movl does not set flags, so as listed the jnz below would see
## the flags left by the addl above. The embedded numbering skips 1616, so the
## instruction that tests edx is presumably missing from this listing - confirm.
1617 jnz _nb_kernel410nf_x86_64_sse.nb410nf_dopair
1618 jmp _nb_kernel410nf_x86_64_sse.nb410nf_checksingle
## dopair: inner-loop tail that processes two j entries in the low two SSE lanes.
## Visible work: gather invsqrta[], charge[], type[] and vdwparam[] data for the
## pair, load their coordinates from pos[], evaluate the GB table polynomial and
## accumulate into nb410nf_vctot, then form the rinvsix/rinvtwelve terms for
## nb410nf_Vvdwtot. Execution falls through into checksingle afterwards.
## NOTE(review): the embedded line numbers in this listing are not contiguous,
## so some instructions are not shown; comments describe only what is visible.
1619 _nb_kernel410nf_x86_64_sse.nb410nf_dopair
:
## rcx = current position in the inner neighbour index list
1620 movq nb410nf_innerjjnr
(%rsp
),%rcx
## advance the stored pointer by 8 bytes (presumably two 32-bit indices - confirm)
1623 addq $
8,nb410nf_innerjjnr
(%rsp
)
## NOTE(review): rax/rbx are used as element indices below, but the loads that
## set them are not visible in this listing.
1629 movq nb410nf_invsqrta
(%rbp
),%rsi
1630 movss
(%rsi
,%rax
,4),%xmm2
1631 movss
(%rsi
,%rbx
,4),%xmm3
1632 unpcklps
%xmm3
,%xmm2
## both invsqrta[] values now interleaved in xmm2(0,1)
## xmm2 = isai * isaj, kept as isaprod
1633 mulps nb410nf_isai
(%rsp
),%xmm2
1634 movaps
%xmm2
,nb410nf_isaprod
(%rsp
)
## NOTE(review): xmm1 is scaled by gbtsc here, but the instruction that
## initialises xmm1 (numbering skips 1635) is not visible.
1636 mulps nb410nf_gbtsc
(%rsp
),%xmm1
1637 movaps
%xmm1
,nb410nf_gbscale
(%rsp
)
1639 movq nb410nf_charge
(%rbp
),%rsi
## base of charge[]
1640 movss
(%rsi
,%rax
,4),%xmm3
1641 movss
(%rsi
,%rbx
,4),%xmm6
1642 unpcklps
%xmm6
,%xmm3
## xmm3(0,1) has the charges
## xmm2 = isaprod * iq
## NOTE(review): the step combining xmm2 with the charges in xmm3 (numbering
## skips 1645) is not visible; as listed, xmm3 is stored to qq unchanged.
1644 mulps nb410nf_iq
(%rsp
),%xmm2
1646 movaps
%xmm3
,nb410nf_qq
(%rsp
)
## replace the indices in ecx/edx by the corresponding type[] entries
1648 movq nb410nf_type
(%rbp
),%rsi
1651 movl
(%rsi
,%rcx
,4),%ecx
1652 movl
(%rsi
,%rdx
,4),%edx
1653 movq nb410nf_vdwparam
(%rbp
),%rsi
1656 movl nb410nf_ntia
(%rsp
),%edi
## two floats per entry: first entry into the low half of xmm6, second into the high half
1659 movlps
(%rsi
,%rcx
,4),%xmm6
1660 movhps
(%rsi
,%rdx
,4),%xmm6
## rdi = base of pos[]
1661 movq nb410nf_pos
(%rbp
),%rdi
## pick elements 0 and 2 (stored as c6) and elements 1 and 3 (stored as c12)
## NOTE(review): xmm4 is presumably a copy of xmm6 made at the skipped line 1663 - confirm.
1664 shufps $
8,%xmm4
,%xmm4
## 00001000
1665 shufps $
13,%xmm6
,%xmm6
## 00001101
1669 movaps
%xmm4
,nb410nf_c6
(%rsp
)
1670 movaps
%xmm6
,nb410nf_c12
(%rsp
)
## rax = 3*rax, rbx = 3*rbx: element index -> offset of the x coordinate in pos[]
1672 lea
(%rax
,%rax
,2),%rax
1673 lea
(%rbx
,%rbx
,2),%rbx
1674 ## move coordinates to xmm0-xmm2
## x,y of the first entry into the low half of xmm1, its z into xmm2
1675 movlps
(%rdi
,%rax
,4),%xmm1
1676 movss
8(%rdi
,%rax
,4),%xmm2
## x,y of the second entry into the high half of xmm1, its z into xmm0
1677 movhps
(%rdi
,%rbx
,4),%xmm1
1678 movss
8(%rdi
,%rbx
,4),%xmm0
1682 shufps $
0,%xmm0
,%xmm2
1686 shufps $
136,%xmm2
,%xmm2
## 10001000
1688 shufps $
136,%xmm0
,%xmm0
## 10001000
1689 shufps $
221,%xmm1
,%xmm1
## 11011101
## rdi = faction base; no use of rdi is visible in the rest of this block
1691 movq nb410nf_faction
(%rbp
),%rdi
1692 ## move ix-iz to xmm4-xmm6
1695 movaps nb410nf_ix
(%rsp
),%xmm4
1696 movaps nb410nf_iy
(%rsp
),%xmm5
1697 movaps nb410nf_iz
(%rsp
),%xmm6
## NOTE(review): numbering skips 1698-1712; the distance/rsq computation that
## the comments below rely on is not visible in this listing.
1713 ## lookup seed in xmm5
1716 movaps nb410nf_three
(%rsp
),%xmm1
1717 mulps
%xmm4
,%xmm5
## rsq*lu*lu
1718 movaps nb410nf_half
(%rsp
),%xmm0
1719 subps
%xmm5
,%xmm1
## three-rsq*lu*lu (xmm1 was loaded from nb410nf_three)
1721 mulps
%xmm1
,%xmm0
## xmm0=rinv
1722 mulps
%xmm0
,%xmm4
## xmm4=r
## scale r by gbscale to get the table coordinate
1723 mulps nb410nf_gbscale
(%rsp
),%xmm4
## truncating conversion of the two low lanes to integers
1725 cvttps2pi
%xmm4
,%mm6
## mm6 contain lu indices
1728 movaps
%xmm4
,%xmm1
## xmm1=eps
1730 mulps
%xmm2
,%xmm2
## xmm2=eps2
1734 movq nb410nf_GBtab
(%rbp
),%rsi
## NOTE(review): rcx/rdx index the table below; the instructions moving the
## indices out of mm6 are not visible in this listing.
1739 ## load coulomb table
1740 movaps
(%rsi
,%rcx
,4),%xmm4
1741 movaps
(%rsi
,%rdx
,4),%xmm7
1742 ## transpose, using xmm3 for scratch
1744 unpcklps
%xmm7
,%xmm4
## Y1 Y2 F1 F2
1745 unpckhps
%xmm7
,%xmm6
## G1 G2 H1 H2
1746 movhlps
%xmm4
,%xmm5
## F1 F2
1747 movhlps
%xmm6
,%xmm7
## H1 H2
1748 ## coulomb table ready, in xmm4-xmm7
1750 mulps
%xmm1
,%xmm6
## xmm6=Geps
1751 mulps
%xmm2
,%xmm7
## xmm7=Heps2
1753 addps
%xmm7
,%xmm5
## xmm5=Fp
1754 movaps nb410nf_qq
(%rsp
),%xmm3
1755 mulps
%xmm1
,%xmm5
## xmm5=eps*Fp
1756 addps
%xmm4
,%xmm5
## xmm5=VV
1757 mulps
%xmm3
,%xmm5
## vcoul=qq*VV
## accumulate vcoul into the running nb410nf_vctot
1759 addps nb410nf_vctot
(%rsp
),%xmm5
1760 movaps
%xmm5
,nb410nf_vctot
(%rsp
)
1764 mulps
%xmm0
,%xmm4
## xmm4=rinvsq
1766 ## NOTE(review): the original remark here mentioned mm5 (vcoul) and mm3 (fijC);
1767 ## in the visible code vcoul (xmm5) was already added to nb410nf_vctot above
1773 mulps
%xmm4
,%xmm6
## xmm6=rinvsix
1775 mulps
%xmm4
,%xmm4
## xmm4=rinvtwelve
1776 mulps nb410nf_c6
(%rsp
),%xmm6
1777 mulps nb410nf_c12
(%rsp
),%xmm4
## NOTE(review): Vvdwtot is loaded and stored back; the adds combining it with
## xmm4/xmm6 (numbering skips 1779-1780) are not visible in this listing.
1778 movaps nb410nf_Vvdwtot
(%rsp
),%xmm7
1781 movaps
%xmm7
,nb410nf_Vvdwtot
(%rsp
)
## checksingle: reloads the inner counter into edx and branches to dosingle
## when one more particle has to be processed, otherwise to updateouterdata.
1783 _nb_kernel410nf_x86_64_sse.nb410nf_checksingle
:
1784 movl nb410nf_innerk
(%rsp
),%edx
## NOTE(review): movl does not set flags and the embedded numbering skips 1785,
## so the instruction that produces the flags for this jnz is presumably
## missing from the listing - confirm.
1786 jnz _nb_kernel410nf_x86_64_sse.nb410nf_dosingle
1787 jmp _nb_kernel410nf_x86_64_sse.nb410nf_updateouterdata
## dosingle: scalar (lane 0 only) version of the inner-loop body for one
## remaining j entry. Visible work: load invsqrta[], charge[], type[] and
## vdwparam[] data for the entry, load its coordinates from pos[], evaluate the
## GB table polynomial with scalar ops and accumulate into nb410nf_vctot, then
## form the rinvsix/rinvtwelve terms. Falls through into updateouterdata.
## NOTE(review): the embedded line numbers in this listing are not contiguous,
## so some instructions are not shown; comments describe only what is visible.
1788 _nb_kernel410nf_x86_64_sse.nb410nf_dosingle
:
## rsi = charge[], rdx = invsqrta[], rdi = pos[], rcx = inner index list pointer
1789 movq nb410nf_charge
(%rbp
),%rsi
1790 movq nb410nf_invsqrta
(%rbp
),%rdx
1791 movq nb410nf_pos
(%rbp
),%rdi
1792 movq nb410nf_innerjjnr
(%rsp
),%rcx
## NOTE(review): rax is used as the element index below, but the load that
## sets it (numbering skips 1793-1795) is not visible.
1796 movss
(%rdx
,%rax
,4),%xmm2
## isa2
## xmm2 = isai * isa2, kept as isaprod
1797 mulss nb410nf_isai
(%rsp
),%xmm2
1798 movss
%xmm2
,nb410nf_isaprod
(%rsp
)
## NOTE(review): the instruction initialising xmm1 (skipped line 1799) is not visible
1800 mulss nb410nf_gbtsc
(%rsp
),%xmm1
1801 movss
%xmm1
,nb410nf_gbscale
(%rsp
)
1803 mulss nb410nf_iq
(%rsp
),%xmm2
1804 movss
(%rsi
,%rax
,4),%xmm6
## xmm6(0) has the charge
## NOTE(review): the step combining xmm6 with xmm2 (skipped line 1805) is not
## visible; as listed, the raw charge is stored to qq.
1806 movss
%xmm6
,nb410nf_qq
(%rsp
)
1808 movq nb410nf_type
(%rbp
),%rsi
## ecx = type[rcx]
1810 movl
(%rsi
,%rcx
,4),%ecx
1811 movq nb410nf_vdwparam
(%rbp
),%rsi
## offset the parameter index by ntia
1813 addl nb410nf_ntia
(%rsp
),%ecx
## load two consecutive floats of vdwparam[] into the low half of xmm6
1814 movlps
(%rsi
,%rcx
,4),%xmm6
## keep element 0 in lane 0 (stored as c6) and element 1 in lane 0 (stored as c12)
## NOTE(review): xmm4 is presumably a copy of xmm6 made at the skipped line 1815 - confirm.
1816 shufps $
252,%xmm4
,%xmm4
## 11111100
1817 shufps $
253,%xmm6
,%xmm6
## 11111101
1819 movaps
%xmm4
,nb410nf_c6
(%rsp
)
1820 movaps
%xmm6
,nb410nf_c12
(%rsp
)
## rax = 3*rax: element index -> offset of the x coordinate in pos[]
1822 lea
(%rax
,%rax
,2),%rax
1824 ## move coordinates to xmm0-xmm2
## x -> xmm0, y -> xmm1, z -> xmm2 (lane 0 of each)
1825 movss
(%rdi
,%rax
,4),%xmm0
1826 movss
4(%rdi
,%rax
,4),%xmm1
1827 movss
8(%rdi
,%rax
,4),%xmm2
## i-particle coordinates into xmm4-xmm6
1829 movaps nb410nf_ix
(%rsp
),%xmm4
1830 movaps nb410nf_iy
(%rsp
),%xmm5
1831 movaps nb410nf_iz
(%rsp
),%xmm6
## NOTE(review): numbering skips 1832-1846; the distance/rsq computation that
## the comments below rely on is not visible in this listing.
1847 ## lookup seed in xmm5
1850 movss nb410nf_three
(%rsp
),%xmm1
1851 mulss
%xmm4
,%xmm5
## rsq*lu*lu
1852 movss nb410nf_half
(%rsp
),%xmm0
1853 subss
%xmm5
,%xmm1
## three-rsq*lu*lu (xmm1 was loaded from nb410nf_three)
1855 mulss
%xmm1
,%xmm0
## xmm0=rinv
1857 mulss
%xmm0
,%xmm4
## xmm4=r
## scale r by gbscale to get the table coordinate
1858 mulss nb410nf_gbscale
(%rsp
),%xmm4
1860 cvttss2si
%xmm4
,%ebx
## ebx = truncated table index
1863 movaps
%xmm4
,%xmm1
## xmm1=eps
1865 mulss
%xmm2
,%xmm2
## xmm2=eps2
1868 movq nb410nf_GBtab
(%rbp
),%rsi
## load four consecutive table floats starting at index rbx
1870 movaps
(%rsi
,%rbx
,4),%xmm4
## NOTE(review): the copies that fill xmm5-xmm7 from xmm4 (numbering skips
## 1871-1873) are not visible; only the two lane shuffles remain.
1874 shufps $
1,%xmm5
,%xmm5
1875 shufps $
1,%xmm7
,%xmm7
1876 ## table ready in xmm4-xmm7
1878 mulss
%xmm1
,%xmm6
## xmm6=Geps
1879 mulss
%xmm2
,%xmm7
## xmm7=Heps2
1881 addss
%xmm7
,%xmm5
## xmm5=Fp
1882 movss nb410nf_qq
(%rsp
),%xmm3
1883 mulss
%xmm1
,%xmm5
## xmm5=eps*Fp
1884 addss
%xmm4
,%xmm5
## xmm5=VV
1885 mulss
%xmm3
,%xmm5
## vcoul=qq*VV
## accumulate vcoul into lane 0 of nb410nf_vctot
1886 addss nb410nf_vctot
(%rsp
),%xmm5
1887 movss
%xmm5
,nb410nf_vctot
(%rsp
)
1891 mulss
%xmm0
,%xmm4
## xmm4=rinvsq
1896 mulss
%xmm4
,%xmm6
## xmm6=rinvsix
1898 mulss
%xmm4
,%xmm4
## xmm4=rinvtwelve
1899 mulss nb410nf_c6
(%rsp
),%xmm6
1900 mulss nb410nf_c12
(%rsp
),%xmm4
## NOTE(review): Vvdwtot is loaded and stored back; the adds combining it with
## xmm4/xmm6 (numbering skips 1902-1903) are not visible in this listing.
1901 movss nb410nf_Vvdwtot
(%rsp
),%xmm7
1904 movss
%xmm7
,nb410nf_Vvdwtot
(%rsp
)
## updateouterdata: end of one outer iteration. Looks up the group index
## gid[n], adds the accumulated nb410nf_vctot and nb410nf_Vvdwtot sums to
## Vc[ggid] and Vvdw[ggid] in memory, then either jumps to outerend or stores
## esi back to nb410nf_n and restarts the outer loop.
## NOTE(review): the embedded line numbers in this listing are not contiguous,
## so some instructions are not shown; comments describe only what is visible.
1906 _nb_kernel410nf_x86_64_sse.nb410nf_updateouterdata
:
## esi = n
1908 movl nb410nf_n
(%rsp
),%esi
1909 ## get group index for i particle
1910 movq nb410nf_gid
(%rbp
),%rdx
## base of gid[]
1911 movl
(%rdx
,%rsi
,4),%edx
## ggid=gid[n]
1913 ## accumulate total potential energy and update it
1914 movaps nb410nf_vctot
(%rsp
),%xmm7
## NOTE(review): the instruction that fills xmm6 from xmm7 before this add
## (numbering skips 1915-1916) is not visible.
1917 addps
%xmm6
,%xmm7
## pos 0-1 in xmm7 have the sum now
1919 shufps $
1,%xmm6
,%xmm6
1922 ## add earlier value from mem
1923 movq nb410nf_Vc
(%rbp
),%rax
## xmm7(0) += Vc[ggid], then write the total back
1924 addss
(%rax
,%rdx
,4),%xmm7
1926 movss
%xmm7
,(%rax
,%rdx
,4)
1928 ## accumulate total lj energy and update it
1929 movaps nb410nf_Vvdwtot
(%rsp
),%xmm7
1932 addps
%xmm6
,%xmm7
## pos 0-1 in xmm7 have the sum now
1934 shufps $
1,%xmm6
,%xmm6
1937 ## add earlier value from mem
1938 movq nb410nf_Vvdw
(%rbp
),%rax
## xmm7(0) += Vvdw[ggid], then write the total back
1939 addss
(%rax
,%rdx
,4),%xmm7
1941 movss
%xmm7
,(%rax
,%rdx
,4)
## ecx = nn1
1944 movl nb410nf_nn1
(%rsp
),%ecx
1945 ## esi already loaded with n
## NOTE(review): the instruction producing the zero flag tested here
## (numbering skips 1946-1947) is not visible in this listing.
1948 jz _nb_kernel410nf_x86_64_sse.nb410nf_outerend
1950 ## not last, iterate outer loop once more!
1951 movl
%esi
,nb410nf_n
(%rsp
)
1952 jmp _nb_kernel410nf_x86_64_sse.nb410nf_outer
## outerend: reached when the current batch of outer iterations is finished.
## Loads nri into ecx and either leaves through nb410nf_end or jumps back to
## nb410nf_threadloop for another work unit.
1953 _nb_kernel410nf_x86_64_sse.nb410nf_outerend
:
1954 ## check if more outer neighborlists remain
1955 movl nb410nf_nri
(%rsp
),%ecx
1956 ## esi already loaded with n above
## NOTE(review): the instruction producing the zero flag tested here
## (numbering skips 1957) is not visible in this listing.
1958 jz _nb_kernel410nf_x86_64_sse.nb410nf_end
1959 ## non-zero, do one more workunit
1960 jmp _nb_kernel410nf_x86_64_sse.nb410nf_threadloop
1961 _nb_kernel410nf_x86_64_sse.nb410nf_end
:
1963 movl nb410nf_nouter
(%rsp
),%eax
1964 movl nb410nf_ninner
(%rsp
),%ebx
1965 movq nb410nf_outeriter
(%rbp
),%rcx
1966 movq nb410nf_inneriter
(%rbp
),%rdx