3 ## Gromacs 4.0 Copyright (c) 1991-2003
4 ## David van der Spoel, Erik Lindahl
6 ## This program is free software; you can redistribute it and/or
7 ## modify it under the terms of the GNU General Public License
8 ## as published by the Free Software Foundation; either version 2
9 ## of the License, or (at your option) any later version.
11 ## To help us fund GROMACS development, we humbly ask that you cite
12 ## the research papers on the package. Check out http://www.gromacs.org
15 ## Gnomes, ROck Monsters And Chili Sauce
## ---------------------------------------------------------------------
## nb_kernel410_ia32_sse -- kernel entry point, exported both with and
## without a leading underscore.
## Arguments are addressed as nb410_<name>(%ebp) and local variables as
## nb410_<name>(%esp), using the .set offsets below.
## NOTE(review): the embedded line numbering has gaps, so some .set
## definitions and setup instructions are not visible in this excerpt;
## confirm the details against the complete file.
## ---------------------------------------------------------------------
20 .globl nb_kernel410_ia32_sse
21 .globl _nb_kernel410_ia32_sse
22 nb_kernel410_ia32_sse
:
23 _nb_kernel410_ia32_sse
:
## offsets used with %ebp (the incoming arguments)
29 .set nb410_shiftvec, 28
33 .set nb410_faction, 44
35 .set nb410_p_facel, 52
40 .set nb410_p_ntype, 72
41 .set nb410_vdwparam, 76
43 .set nb410_p_tabscale, 84
45 .set nb410_invsqrta, 92
47 .set nb410_p_gbtabscale, 100
49 .set nb410_p_nthreads, 108
52 .set nb410_outeriter, 120
53 .set nb410_inneriter, 124
55 ## stack offsets for local variables
56 ## bottom of stack is cache-aligned for sse use
66 .set nb410_twelve, 144
73 .set nb410_Vvdwtot, 256
81 .set nb410_isaprod, 384
82 .set nb410_dvdasum, 400
83 .set nb410_gbscale, 416
88 .set nb410_innerjjnr, 448
89 .set nb410_innerk, 452
99 .set nb410_nouter, 492
100 .set nb410_ninner, 496
101 .set nb410_salign, 500
110 subl $
504,%esp
## local stack space
## keep eax in the salign slot; it is read back at nb410_end
114 movl
%eax
,nb410_salign
(%esp
)
118 ## Move args passed by reference to stack
## NOTE(review): only the pointer loads and the stores are visible here;
## the dereferences in between (embedded numbering skips 122-124) are
## missing from this excerpt.
119 movl nb410_p_nri
(%ebp
),%ecx
120 movl nb410_p_facel
(%ebp
),%esi
121 movl nb410_p_ntype
(%ebp
),%edi
125 movl
%ecx
,nb410_nri
(%esp
)
126 movl
%esi
,nb410_facel
(%esp
)
127 movl
%edi
,nb410_ntype
(%esp
)
129 ## zero iteration counters
## NOTE(review): the instruction that clears eax (embedded line 130) is
## not visible in this excerpt.
131 movl
%eax
,nb410_nouter
(%esp
)
132 movl
%eax
,nb410_ninner
(%esp
)
## eax = the p_gbtabscale argument
135 movl nb410_p_gbtabscale
(%ebp
),%eax
## broadcast element 0 of xmm5 to all four lanes and keep it as gbtsc
## NOTE(review): the load into xmm5 (embedded line 136) is not visible.
137 shufps $
0,%xmm5
,%xmm5
138 movaps
%xmm5
,nb410_gbtsc
(%esp
)
140 ## create constant floating-point factors on stack
141 movl $
0x3f000000,%eax
## constant 0.5 in IEEE (hex)
142 movl
%eax
,nb410_half
(%esp
)
143 movss nb410_half
(%esp
),%xmm1
144 shufps $
0,%xmm1
,%xmm1
## splat to all elements
## NOTE(review): the register copies between the additions below are
## missing from this excerpt (numbering skips 145, 147, 150, 152), so
## the per-line constants cannot be checked from the visible code alone.
146 addps
%xmm2
,%xmm2
## constant 1.0
148 addps
%xmm2
,%xmm2
## constant 2.0
149 addps
%xmm2
,%xmm3
## constant 3.0
151 addps
%xmm4
,%xmm4
## 6.0
153 addps
%xmm5
,%xmm5
## constant 12.0
## store the splatted constants in their stack slots
154 movaps
%xmm1
,nb410_half
(%esp
)
155 movaps
%xmm2
,nb410_two
(%esp
)
156 movaps
%xmm3
,nb410_three
(%esp
)
157 movaps
%xmm4
,nb410_six
(%esp
)
158 movaps
%xmm5
,nb410_twelve
(%esp
)
## ---------------------------------------------------------------------
## Work-claim loop.  esi = the count argument (pointer to the sync
## counter).  The spinlock label is re-entered while cmpxchgl leaves ZF
## clear, i.e. while its compare against *count fails; pause does not
## modify the flags, so the jnz still sees the cmpxchgl result.
## NOTE(review): no initial load of *count into eax and no lock prefix
## are visible in this excerpt (embedded numbering skips 162 and 166) --
## confirm against the complete file.
## ---------------------------------------------------------------------
160 _nb_kernel410_ia32_sse.nb410_threadloop
:
161 movl nb410_count
(%ebp
),%esi
## pointer to sync counter
163 _nb_kernel410_ia32_sse.nb410_spinlock
:
164 movl
%eax
,%ebx
## ebx=*count=nn0
165 addl $
1,%ebx
## ebx=nn1=nn0+1 (the immediate added above is 1, not 10)
167 cmpxchgl
%ebx
,(%esi
) ## write nn1 to *counter,
168 ## if it hasn't changed.
169 ## or reread *counter to eax.
170 pause
## -> better p4 performance
171 jnz _nb_kernel410_ia32_sse.nb410_spinlock
173 ## if(nn1>nri) nn1=nri
174 movl nb410_nri
(%esp
),%ecx
## NOTE(review): the setup of edx and the compare feeding the cmovlel
## below are not visible here (embedded numbering skips 175-176).
177 cmovlel
%edx
,%ebx
## if(nn1>nri) nn1=nri
178 ## Cleared the spinlock if we got here.
179 ## eax contains nn0, ebx contains nn1.
180 movl
%eax
,nb410_n
(%esp
)
181 movl
%ebx
,nb410_nn1
(%esp
)
182 subl
%eax
,%ebx
## calc number of outer lists
183 movl
%eax
,%esi
## copy n to esi
## the flags are still those of the subl above (movl leaves them alone):
## go on to outerstart only when nn1-nn0 > 0, otherwise leave via end
184 jg _nb_kernel410_ia32_sse.nb410_outerstart
185 jmp _nb_kernel410_ia32_sse.nb410_end
## Add the number of outer iterations claimed for this work unit (ebx)
## to the running total kept in the nouter stack slot.
187 _nb_kernel410_ia32_sse.nb410_outerstart
:
188 ## ebx contains number of outer iterations
## ebx = nouter + ebx
189 addl nb410_nouter
(%esp
),%ebx
## nouter = ebx
190 movl
%ebx
,nb410_nouter
(%esp
)
192 _nb_kernel410_ia32_sse.nb410_outer
:
193 movl nb410_shift
(%ebp
),%eax
## eax = pointer into shift[]
194 movl
(%eax
,%esi
,4),%ebx
## ebx=shift[n]
196 leal
(%ebx
,%ebx
,2),%ebx
## ebx=3*is
197 movl
%ebx
,nb410_is3
(%esp
) ## store is3
199 movl nb410_shiftvec
(%ebp
),%eax
## eax = base of shiftvec[]
201 movss
(%eax
,%ebx
,4),%xmm0
202 movss
4(%eax
,%ebx
,4),%xmm1
203 movss
8(%eax
,%ebx
,4),%xmm2
205 movl nb410_iinr
(%ebp
),%ecx
## ecx = pointer into iinr[]
206 movl
(%ecx
,%esi
,4),%ebx
## ebx =ii
207 movl
%ebx
,nb410_ii
(%esp
)
209 movl nb410_charge
(%ebp
),%edx
210 movss
(%edx
,%ebx
,4),%xmm3
211 mulss nb410_facel
(%esp
),%xmm3
212 shufps $
0,%xmm3
,%xmm3
214 movl nb410_invsqrta
(%ebp
),%edx
## load invsqrta[ii]
215 movss
(%edx
,%ebx
,4),%xmm4
216 shufps $
0,%xmm4
,%xmm4
218 movl nb410_type
(%ebp
),%edx
219 movl
(%edx
,%ebx
,4),%edx
220 imull nb410_ntype
(%esp
),%edx
222 movl
%edx
,nb410_ntia
(%esp
)
224 leal
(%ebx
,%ebx
,2),%ebx
## ebx = 3*ii=ii3
225 movl nb410_pos
(%ebp
),%eax
## eax = base of pos[]
227 addss
(%eax
,%ebx
,4),%xmm0
228 addss
4(%eax
,%ebx
,4),%xmm1
229 addss
8(%eax
,%ebx
,4),%xmm2
231 movaps
%xmm3
,nb410_iq
(%esp
)
232 movaps
%xmm4
,nb410_isai
(%esp
)
234 shufps $
0,%xmm0
,%xmm0
235 shufps $
0,%xmm1
,%xmm1
236 shufps $
0,%xmm2
,%xmm2
238 movaps
%xmm0
,nb410_ix
(%esp
)
239 movaps
%xmm1
,nb410_iy
(%esp
)
240 movaps
%xmm2
,nb410_iz
(%esp
)
242 movl
%ebx
,nb410_ii3
(%esp
)
244 ## clear vctot and i forces
246 movaps
%xmm4
,nb410_vctot
(%esp
)
247 movaps
%xmm4
,nb410_Vvdwtot
(%esp
)
248 movaps
%xmm4
,nb410_dvdasum
(%esp
)
249 movaps
%xmm4
,nb410_fix
(%esp
)
250 movaps
%xmm4
,nb410_fiy
(%esp
)
251 movaps
%xmm4
,nb410_fiz
(%esp
)
253 movl nb410_jindex
(%ebp
),%eax
254 movl
(%eax
,%esi
,4),%ecx
## jindex[n]
255 movl
4(%eax
,%esi
,4),%edx
## jindex[n+1]
256 subl
%ecx
,%edx
## number of innerloop atoms
258 movl nb410_pos
(%ebp
),%esi
259 movl nb410_faction
(%ebp
),%edi
260 movl nb410_jjnr
(%ebp
),%eax
263 movl
%eax
,nb410_innerjjnr
(%esp
) ## pointer to jjnr[nj0]
266 addl nb410_ninner
(%esp
),%ecx
267 movl
%ecx
,nb410_ninner
(%esp
)
269 movl
%edx
,nb410_innerk
(%esp
) ## number of innerloop atoms
270 jge _nb_kernel410_ia32_sse.nb410_unroll_loop
271 jmp _nb_kernel410_ia32_sse.nb410_finish_inner
272 _nb_kernel410_ia32_sse.nb410_unroll_loop
:
273 ## quad-unroll innerloop here
274 movl nb410_innerjjnr
(%esp
),%edx
## pointer to jjnr[k]
278 movl
12(%edx
),%edx
## eax-edx=jnr1-4
279 addl $
16,nb410_innerjjnr
(%esp
) ## advance pointer (unrolled 4)
282 movl nb410_invsqrta
(%ebp
),%esi
283 movss
(%esi
,%eax
,4),%xmm3
284 movss
(%esi
,%ecx
,4),%xmm4
285 movss
(%esi
,%ebx
,4),%xmm6
286 movss
(%esi
,%edx
,4),%xmm7
287 movaps nb410_isai
(%esp
),%xmm2
288 shufps $
0,%xmm6
,%xmm3
289 shufps $
0,%xmm7
,%xmm4
290 shufps $
136,%xmm4
,%xmm3
## constant 10001000 ;# all isaj in xmm3
293 movaps
%xmm2
,nb410_isaprod
(%esp
)
295 mulps nb410_gbtsc
(%esp
),%xmm1
296 movaps
%xmm1
,nb410_gbscale
(%esp
)
298 movl nb410_charge
(%ebp
),%esi
## base of charge[]
300 movss
(%esi
,%eax
,4),%xmm3
301 movss
(%esi
,%ecx
,4),%xmm4
302 movss
(%esi
,%ebx
,4),%xmm6
303 movss
(%esi
,%edx
,4),%xmm7
305 mulps nb410_iq
(%esp
),%xmm2
306 shufps $
0,%xmm6
,%xmm3
307 shufps $
0,%xmm7
,%xmm4
308 shufps $
136,%xmm4
,%xmm3
## constant 10001000 ;# all charges in xmm3
310 movaps
%xmm3
,nb410_qq
(%esp
)
317 movl nb410_type
(%ebp
),%esi
318 movl
(%esi
,%eax
,4),%eax
319 movl
(%esi
,%ebx
,4),%ebx
320 movl
(%esi
,%ecx
,4),%ecx
321 movl
(%esi
,%edx
,4),%edx
322 movl nb410_vdwparam
(%ebp
),%esi
327 movl nb410_ntia
(%esp
),%edi
333 movlps
(%esi
,%eax
,4),%xmm6
334 movlps
(%esi
,%ecx
,4),%xmm7
335 movhps
(%esi
,%ebx
,4),%xmm6
336 movhps
(%esi
,%edx
,4),%xmm7
339 shufps $
136,%xmm7
,%xmm4
## constant 10001000
340 shufps $
221,%xmm7
,%xmm6
## constant 11011101
347 movaps
%xmm4
,nb410_c6
(%esp
)
348 movaps
%xmm6
,nb410_c12
(%esp
)
350 movl nb410_pos
(%ebp
),%esi
## base of pos[]
352 movl
%eax
,nb410_jnra
(%esp
)
353 movl
%ebx
,nb410_jnrb
(%esp
)
354 movl
%ecx
,nb410_jnrc
(%esp
)
355 movl
%edx
,nb410_jnrd
(%esp
)
357 leal
(%eax
,%eax
,2),%eax
## replace jnr with j3
358 leal
(%ebx
,%ebx
,2),%ebx
360 leal
(%ecx
,%ecx
,2),%ecx
## replace jnr with j3
361 leal
(%edx
,%edx
,2),%edx
363 ## move four coordinates to xmm0-xmm2
365 movlps
(%esi
,%eax
,4),%xmm4
366 movlps
(%esi
,%ecx
,4),%xmm5
367 movss
8(%esi
,%eax
,4),%xmm2
368 movss
8(%esi
,%ecx
,4),%xmm6
370 movhps
(%esi
,%ebx
,4),%xmm4
371 movhps
(%esi
,%edx
,4),%xmm5
373 movss
8(%esi
,%ebx
,4),%xmm0
374 movss
8(%esi
,%edx
,4),%xmm1
376 shufps $
0,%xmm0
,%xmm2
377 shufps $
0,%xmm1
,%xmm6
382 shufps $
136,%xmm6
,%xmm2
## constant 10001000
384 shufps $
136,%xmm5
,%xmm0
## constant 10001000
385 shufps $
221,%xmm5
,%xmm1
## constant 11011101
387 ## move ix-iz to xmm4-xmm6
388 movaps nb410_ix
(%esp
),%xmm4
389 movaps nb410_iy
(%esp
),%xmm5
390 movaps nb410_iz
(%esp
),%xmm6
398 movaps
%xmm4
,nb410_dx
(%esp
)
399 movaps
%xmm5
,nb410_dy
(%esp
)
400 movaps
%xmm6
,nb410_dz
(%esp
)
410 ## lookup seed in xmm5
413 movaps nb410_three
(%esp
),%xmm1
414 mulps
%xmm4
,%xmm5
## rsq*lu*lu
415 movaps nb410_half
(%esp
),%xmm0
416 subps
%xmm5
,%xmm1
## 3.0-rsq*lu*lu
418 mulps
%xmm1
,%xmm0
## xmm0=rinv
419 mulps
%xmm0
,%xmm4
## xmm4=r
420 movaps
%xmm4
,nb410_r
(%esp
)
421 mulps nb410_gbscale
(%esp
),%xmm4
425 cvttps2pi
%xmm5
,%mm7
## mm6/mm7 contain lu indices
430 movaps
%xmm4
,%xmm1
## xmm1=eps
432 mulps
%xmm2
,%xmm2
## xmm2=eps2
441 movl nb410_GBtab
(%ebp
),%esi
449 ## load coulomb table
450 movaps
(%esi
,%eax
,4),%xmm4
451 movaps
(%esi
,%ebx
,4),%xmm5
452 movaps
(%esi
,%ecx
,4),%xmm6
453 movaps
(%esi
,%edx
,4),%xmm7
454 ## transpose, using xmm3 for scratch
456 shufps $
0xEE,%xmm7
,%xmm3
457 shufps $
0x44,%xmm7
,%xmm6
459 shufps $
0xEE,%xmm5
,%xmm7
460 shufps $
0x44,%xmm5
,%xmm4
462 shufps $
0xDD,%xmm6
,%xmm5
463 shufps $
0x88,%xmm6
,%xmm4
465 shufps $
0x88,%xmm3
,%xmm6
466 shufps $
0xDD,%xmm3
,%xmm7
467 ## coulomb table ready, in xmm4-xmm7
468 mulps
%xmm1
,%xmm6
## xmm6=Geps
469 mulps
%xmm2
,%xmm7
## xmm7=Heps2
472 addps
%xmm7
,%xmm5
## xmm5=Fp
473 mulps nb410_two
(%esp
),%xmm7
## two*Heps2
474 movaps nb410_qq
(%esp
),%xmm3
476 addps
%xmm5
,%xmm7
## xmm7=FF
477 mulps
%xmm1
,%xmm5
## xmm5=eps*Fp
478 addps
%xmm4
,%xmm5
## xmm5=VV
479 mulps
%xmm3
,%xmm5
## vcoul=qq*VV
480 mulps
%xmm7
,%xmm3
## fijC=FF*qq
481 ## get jnr from stack
482 movl nb410_jnra
(%esp
),%eax
483 movl nb410_jnrb
(%esp
),%ebx
484 movl nb410_jnrc
(%esp
),%ecx
485 movl nb410_jnrd
(%esp
),%edx
487 movl nb410_dvda
(%ebp
),%esi
491 mulps nb410_gbscale
(%esp
),%xmm3
493 mulps nb410_r
(%esp
),%xmm6
495 addps nb410_vctot
(%esp
),%xmm5
496 movaps
%xmm5
,nb410_vctot
(%esp
)
498 ## xmm6=(vcoul+fijC*r)
503 addps nb410_dvdasum
(%esp
),%xmm7
504 movaps
%xmm7
,nb410_dvdasum
(%esp
)
506 ## update j atoms dvdaj
510 shufps $
0x1,%xmm5
,%xmm5
511 shufps $
0x1,%xmm4
,%xmm4
512 ## xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4
513 addss
(%esi
,%eax
,4),%xmm6
514 addss
(%esi
,%ebx
,4),%xmm5
515 addss
(%esi
,%ecx
,4),%xmm7
516 addss
(%esi
,%edx
,4),%xmm4
517 movss
%xmm6
,(%esi
,%eax
,4)
518 movss
%xmm5
,(%esi
,%ebx
,4)
519 movss
%xmm7
,(%esi
,%ecx
,4)
520 movss
%xmm4
,(%esi
,%edx
,4)
524 mulps
%xmm0
,%xmm4
## xmm4=rinvsq
529 mulps
%xmm4
,%xmm6
## xmm6=rinvsix
531 mulps
%xmm4
,%xmm4
## xmm4=rinvtwelve
532 mulps nb410_c6
(%esp
),%xmm6
533 mulps nb410_c12
(%esp
),%xmm4
534 movaps nb410_Vvdwtot
(%esp
),%xmm7
536 mulps nb410_twelve
(%esp
),%xmm4
538 mulps nb410_six
(%esp
),%xmm6
539 movaps
%xmm7
,nb410_Vvdwtot
(%esp
)
545 movaps nb410_dx
(%esp
),%xmm0
546 movaps nb410_dy
(%esp
),%xmm1
547 movaps nb410_dz
(%esp
),%xmm2
554 movl nb410_faction
(%ebp
),%edi
558 ## xmm0-xmm2 contains tx-tz (partial force)
560 movaps nb410_fix
(%esp
),%xmm3
561 movaps nb410_fiy
(%esp
),%xmm4
562 movaps nb410_fiz
(%esp
),%xmm5
566 movaps
%xmm3
,nb410_fix
(%esp
)
567 movaps
%xmm4
,nb410_fiy
(%esp
)
568 movaps
%xmm5
,nb410_fiz
(%esp
)
569 ## the fj's - start by accumulating x & y forces from memory
570 movlps
(%edi
,%eax
,4),%xmm4
571 movlps
(%edi
,%ecx
,4),%xmm6
572 movhps
(%edi
,%ebx
,4),%xmm4
573 movhps
(%edi
,%edx
,4),%xmm6
576 shufps $
136,%xmm6
,%xmm3
## constant 10001000
577 shufps $
221,%xmm6
,%xmm4
## constant 11011101
579 ## now xmm3-xmm5 contains fjx, fjy, fjz
583 ## unpack them back so we can store them - first x & y in xmm3/xmm4
588 ## xmm6(l)=x & y for j1, (h) for j2
589 ## xmm3(l)=x & y for j3, (h) for j4
590 movlps
%xmm6
,(%edi
,%eax
,4)
591 movlps
%xmm3
,(%edi
,%ecx
,4)
593 movhps
%xmm6
,(%edi
,%ebx
,4)
594 movhps
%xmm3
,(%edi
,%edx
,4)
597 movss
8(%edi
,%eax
,4),%xmm4
598 movss
8(%edi
,%ebx
,4),%xmm5
599 movss
8(%edi
,%ecx
,4),%xmm6
600 movss
8(%edi
,%edx
,4),%xmm7
602 shufps $
229,%xmm2
,%xmm2
## constant 11100101
604 shufps $
234,%xmm2
,%xmm2
## constant 11101010
606 shufps $
255,%xmm2
,%xmm2
## constant 11111111
608 movss
%xmm4
,8(%edi
,%eax
,4)
609 movss
%xmm5
,8(%edi
,%ebx
,4)
610 movss
%xmm6
,8(%edi
,%ecx
,4)
611 movss
%xmm7
,8(%edi
,%edx
,4)
613 ## should we do one more iteration?
614 subl $
4,nb410_innerk
(%esp
)
615 jl _nb_kernel410_ia32_sse.nb410_finish_inner
616 jmp _nb_kernel410_ia32_sse.nb410_unroll_loop
## Reached when the 4-way unrolled loop is done (or was never entered):
## dispatch the remaining j particles to the pair code and/or the
## single-particle code.
617 _nb_kernel410_ia32_sse.nb410_finish_inner
:
618 ## check if at least two particles remain
## add back the 4 that the unrolled-loop exit test subtracted from innerk
619 addl $
4,nb410_innerk
(%esp
)
## edx = innerk
620 movl nb410_innerk
(%esp
),%edx
## NOTE(review): the instruction that sets the flags for the jnz below is
## not visible in this excerpt (embedded numbering skips 621);
## presumably a bit test on edx -- confirm against the complete file.
622 jnz _nb_kernel410_ia32_sse.nb410_dopair
623 jmp _nb_kernel410_ia32_sse.nb410_checksingle
624 _nb_kernel410_ia32_sse.nb410_dopair
:
625 movl nb410_innerjjnr
(%esp
),%ecx
628 addl $
8,nb410_innerjjnr
(%esp
)
634 movl nb410_invsqrta
(%ebp
),%esi
635 movss
(%esi
,%eax
,4),%xmm2
636 movss
(%esi
,%ebx
,4),%xmm3
637 unpcklps
%xmm3
,%xmm2
## isaj in xmm2(0,1)
638 mulps nb410_isai
(%esp
),%xmm2
639 movaps
%xmm2
,nb410_isaprod
(%esp
)
641 mulps nb410_gbtsc
(%esp
),%xmm1
642 movaps
%xmm1
,nb410_gbscale
(%esp
)
644 movl nb410_charge
(%ebp
),%esi
## base of charge[]
645 movss
(%esi
,%eax
,4),%xmm3
646 movss
(%esi
,%ebx
,4),%xmm6
647 unpcklps
%xmm6
,%xmm3
## xmm3(0,1) has the charges
649 mulps nb410_iq
(%esp
),%xmm2
651 movaps
%xmm3
,nb410_qq
(%esp
)
653 movl nb410_type
(%ebp
),%esi
656 movl
(%esi
,%ecx
,4),%ecx
657 movl
(%esi
,%edx
,4),%edx
658 movl nb410_vdwparam
(%ebp
),%esi
661 movl nb410_ntia
(%esp
),%edi
664 movlps
(%esi
,%ecx
,4),%xmm6
665 movhps
(%esi
,%edx
,4),%xmm6
666 movl nb410_pos
(%ebp
),%edi
669 shufps $
8,%xmm4
,%xmm4
## constant 00001000
670 shufps $
13,%xmm6
,%xmm6
## constant 00001101
674 movaps
%xmm4
,nb410_c6
(%esp
)
675 movaps
%xmm6
,nb410_c12
(%esp
)
680 leal
(%eax
,%eax
,2),%eax
681 leal
(%ebx
,%ebx
,2),%ebx
682 ## move coordinates to xmm0-xmm2
683 movlps
(%edi
,%eax
,4),%xmm1
684 movss
8(%edi
,%eax
,4),%xmm2
685 movhps
(%edi
,%ebx
,4),%xmm1
686 movss
8(%edi
,%ebx
,4),%xmm0
690 shufps $
0,%xmm0
,%xmm2
694 shufps $
136,%xmm2
,%xmm2
## constant 10001000
696 shufps $
136,%xmm0
,%xmm0
## constant 10001000
697 shufps $
221,%xmm1
,%xmm1
## constant 11011101
699 movl nb410_faction
(%ebp
),%edi
700 ## move ix-iz to xmm4-xmm6
703 movaps nb410_ix
(%esp
),%xmm4
704 movaps nb410_iy
(%esp
),%xmm5
705 movaps nb410_iz
(%esp
),%xmm6
713 movaps
%xmm4
,nb410_dx
(%esp
)
714 movaps
%xmm5
,nb410_dy
(%esp
)
715 movaps
%xmm6
,nb410_dz
(%esp
)
725 ## lookup seed in xmm5
728 movaps nb410_three
(%esp
),%xmm1
729 mulps
%xmm4
,%xmm5
## rsq*lu*lu
730 movaps nb410_half
(%esp
),%xmm0
731 subps
%xmm5
,%xmm1
## 3.0-rsq*lu*lu
733 mulps
%xmm1
,%xmm0
## xmm0=rinv
734 mulps
%xmm0
,%xmm4
## xmm4=r
735 movaps
%xmm4
,nb410_r
(%esp
)
736 mulps nb410_gbscale
(%esp
),%xmm4
738 cvttps2pi
%xmm4
,%mm6
## mm6 contain lu indices
741 movaps
%xmm4
,%xmm1
## xmm1=eps
743 mulps
%xmm2
,%xmm2
## xmm2=eps2
747 movl nb410_GBtab
(%ebp
),%esi
752 ## load coulomb table
753 movaps
(%esi
,%ecx
,4),%xmm4
754 movaps
(%esi
,%edx
,4),%xmm7
755 ## transpose, using xmm3 for scratch
757 unpcklps
%xmm7
,%xmm4
## Y1 Y2 F1 F2
758 unpckhps
%xmm7
,%xmm6
## G1 G2 H1 H2
759 movhlps
%xmm4
,%xmm5
## F1 F2
760 movhlps
%xmm6
,%xmm7
## H1 H2
761 ## coulomb table ready, in xmm4-xmm7
763 mulps
%xmm1
,%xmm6
## xmm6=Geps
764 mulps
%xmm2
,%xmm7
## xmm7=Heps2
766 addps
%xmm7
,%xmm5
## xmm5=Fp
767 mulps nb410_two
(%esp
),%xmm7
## two*Heps2
768 movaps nb410_qq
(%esp
),%xmm3
770 addps
%xmm5
,%xmm7
## xmm7=FF
771 mulps
%xmm1
,%xmm5
## xmm5=eps*Fp
772 addps
%xmm4
,%xmm5
## xmm5=VV
773 mulps
%xmm3
,%xmm5
## vcoul=qq*VV
774 mulps
%xmm7
,%xmm3
## fijC=FF*qq
779 movl nb410_dvda
(%ebp
),%esi
782 mulps nb410_gbscale
(%esp
),%xmm3
784 mulps nb410_r
(%esp
),%xmm6
786 addps nb410_vctot
(%esp
),%xmm5
787 movaps
%xmm5
,nb410_vctot
(%esp
)
789 ## xmm6=(vcoul+fijC*r)
794 addps nb410_dvdasum
(%esp
),%xmm7
795 movaps
%xmm7
,nb410_dvdasum
(%esp
)
797 ## update j atoms dvdaj
799 shufps $
0x1,%xmm7
,%xmm7
800 addss
(%esi
,%ecx
,4),%xmm6
801 addss
(%esi
,%edx
,4),%xmm7
802 movss
%xmm6
,(%esi
,%ecx
,4)
803 movss
%xmm7
,(%esi
,%edx
,4)
807 mulps
%xmm0
,%xmm4
## xmm4=rinvsq
809 ## at this point mm5 contains vcoul and mm3 fijC
810 ## increment vcoul - then we can get rid of mm5
816 mulps
%xmm4
,%xmm6
## xmm6=rinvsix
818 mulps
%xmm4
,%xmm4
## xmm4=rinvtwelve
819 mulps nb410_c6
(%esp
),%xmm6
820 mulps nb410_c12
(%esp
),%xmm4
821 movaps nb410_Vvdwtot
(%esp
),%xmm7
823 mulps nb410_twelve
(%esp
),%xmm4
825 mulps nb410_six
(%esp
),%xmm6
826 movaps
%xmm7
,nb410_Vvdwtot
(%esp
)
832 movaps nb410_dx
(%esp
),%xmm0
833 movaps nb410_dy
(%esp
),%xmm1
834 movaps nb410_dz
(%esp
),%xmm2
839 ## xmm0-xmm2 contains tx-tz (partial force)
841 movaps nb410_fix
(%esp
),%xmm3
842 movaps nb410_fiy
(%esp
),%xmm4
843 movaps nb410_fiz
(%esp
),%xmm5
847 movaps
%xmm3
,nb410_fix
(%esp
)
848 movaps
%xmm4
,nb410_fiy
(%esp
)
849 movaps
%xmm5
,nb410_fiz
(%esp
)
851 movss
(%edi
,%eax
,4),%xmm3
852 movss
4(%edi
,%eax
,4),%xmm4
853 movss
8(%edi
,%eax
,4),%xmm5
857 movss
%xmm3
,(%edi
,%eax
,4)
858 movss
%xmm4
,4(%edi
,%eax
,4)
859 movss
%xmm5
,8(%edi
,%eax
,4)
861 shufps $
225,%xmm0
,%xmm0
## constant 11100001
862 shufps $
225,%xmm1
,%xmm1
## constant 11100001
863 shufps $
225,%xmm2
,%xmm2
## constant 11100001
865 movss
(%edi
,%ebx
,4),%xmm3
866 movss
4(%edi
,%ebx
,4),%xmm4
867 movss
8(%edi
,%ebx
,4),%xmm5
871 movss
%xmm3
,(%edi
,%ebx
,4)
872 movss
%xmm4
,4(%edi
,%ebx
,4)
873 movss
%xmm5
,8(%edi
,%ebx
,4)
## Decide whether one last j particle is left: branch to dosingle if so,
## otherwise go straight to updateouterdata.
875 _nb_kernel410_ia32_sse.nb410_checksingle
:
## edx = innerk
876 movl nb410_innerk
(%esp
),%edx
## NOTE(review): the instruction that sets the flags for the jnz below is
## not visible in this excerpt (embedded numbering skips 877);
## presumably a test of the low bit of edx -- confirm against the
## complete file.
878 jnz _nb_kernel410_ia32_sse.nb410_dosingle
879 jmp _nb_kernel410_ia32_sse.nb410_updateouterdata
880 _nb_kernel410_ia32_sse.nb410_dosingle
:
881 movl nb410_charge
(%ebp
),%esi
882 movl nb410_invsqrta
(%ebp
),%edx
883 movl nb410_pos
(%ebp
),%edi
884 movl nb410_innerjjnr
(%esp
),%ecx
888 movss
(%edx
,%eax
,4),%xmm2
## isaj
889 mulss nb410_isai
(%esp
),%xmm2
890 movss
%xmm2
,nb410_isaprod
(%esp
)
892 mulss nb410_gbtsc
(%esp
),%xmm1
893 movss
%xmm1
,nb410_gbscale
(%esp
)
895 mulss nb410_iq
(%esp
),%xmm2
896 movss
(%esi
,%eax
,4),%xmm6
## xmm6(0) has the charge
898 movss
%xmm6
,nb410_qq
(%esp
)
900 movl nb410_type
(%ebp
),%esi
902 movl
(%esi
,%ecx
,4),%ecx
903 movl nb410_vdwparam
(%ebp
),%esi
905 addl nb410_ntia
(%esp
),%ecx
906 movlps
(%esi
,%ecx
,4),%xmm6
908 shufps $
252,%xmm4
,%xmm4
## constant 11111100
909 shufps $
253,%xmm6
,%xmm6
## constant 11111101
911 movaps
%xmm4
,nb410_c6
(%esp
)
912 movaps
%xmm6
,nb410_c12
(%esp
)
915 leal
(%eax
,%eax
,2),%eax
917 ## move coordinates to xmm0-xmm2
918 movss
(%edi
,%eax
,4),%xmm0
919 movss
4(%edi
,%eax
,4),%xmm1
920 movss
8(%edi
,%eax
,4),%xmm2
922 movaps nb410_ix
(%esp
),%xmm4
923 movaps nb410_iy
(%esp
),%xmm5
924 movaps nb410_iz
(%esp
),%xmm6
932 movss
%xmm4
,nb410_dx
(%esp
)
933 movss
%xmm5
,nb410_dy
(%esp
)
934 movss
%xmm6
,nb410_dz
(%esp
)
944 ## lookup seed in xmm5
947 movss nb410_three
(%esp
),%xmm1
948 mulss
%xmm4
,%xmm5
## rsq*lu*lu
949 movss nb410_half
(%esp
),%xmm0
950 subss
%xmm5
,%xmm1
## 3.0-rsq*lu*lu
952 mulss
%xmm1
,%xmm0
## xmm0=rinv
954 mulss
%xmm0
,%xmm4
## xmm4=r
955 movss
%xmm4
,nb410_r
(%esp
)
956 mulss nb410_gbscale
(%esp
),%xmm4
958 cvttss2si
%xmm4
,%ebx
## ebx contains the lu index
961 movaps
%xmm4
,%xmm1
## xmm1=eps
963 mulss
%xmm2
,%xmm2
## xmm2=eps2
966 movl nb410_GBtab
(%ebp
),%esi
968 movaps
(%esi
,%ebx
,4),%xmm4
972 shufps $
1,%xmm5
,%xmm5
973 shufps $
1,%xmm7
,%xmm7
974 ## table ready in xmm4-xmm7
976 mulss
%xmm1
,%xmm6
## xmm6=Geps
977 mulss
%xmm2
,%xmm7
## xmm7=Heps2
979 addss
%xmm7
,%xmm5
## xmm5=Fp
980 mulss nb410_two
(%esp
),%xmm7
## two*Heps2
981 movss nb410_qq
(%esp
),%xmm3
983 addss
%xmm5
,%xmm7
## xmm7=FF
984 mulss
%xmm1
,%xmm5
## xmm5=eps*Fp
985 addss
%xmm4
,%xmm5
## xmm5=VV
986 mulss
%xmm3
,%xmm5
## vcoul=qq*VV
987 mulss
%xmm7
,%xmm3
## fijC=FF*qq
990 movl nb410_dvda
(%ebp
),%esi
994 mulss nb410_gbscale
(%esp
),%xmm3
996 mulss nb410_r
(%esp
),%xmm6
998 addss nb410_vctot
(%esp
),%xmm5
999 movss
%xmm5
,nb410_vctot
(%esp
)
1001 ## xmm6=(vcoul+fijC*r)
1006 addps nb410_dvdasum
(%esp
),%xmm7
1007 movaps
%xmm7
,nb410_dvdasum
(%esp
)
1009 ## update j atoms dvdaj
1010 addss
(%esi
,%ebx
,4),%xmm6
1011 movss
%xmm6
,(%esi
,%ebx
,4)
1015 mulss
%xmm0
,%xmm4
## xmm4=rinvsq
1020 mulss
%xmm4
,%xmm6
## xmm6=rinvsix
1022 mulss
%xmm4
,%xmm4
## xmm4=rinvtwelve
1023 mulss nb410_c6
(%esp
),%xmm6
1024 mulss nb410_c12
(%esp
),%xmm4
1025 movss nb410_Vvdwtot
(%esp
),%xmm7
1027 mulss nb410_twelve
(%esp
),%xmm4
1029 mulss nb410_six
(%esp
),%xmm6
1030 movss
%xmm7
,nb410_Vvdwtot
(%esp
)
1036 movss nb410_dx
(%esp
),%xmm0
1037 movss nb410_dy
(%esp
),%xmm1
1038 movss nb410_dz
(%esp
),%xmm2
1040 movl nb410_faction
(%ebp
),%edi
1044 ## xmm0-xmm2 contains tx-tz (partial force)
1046 movss nb410_fix
(%esp
),%xmm3
1047 movss nb410_fiy
(%esp
),%xmm4
1048 movss nb410_fiz
(%esp
),%xmm5
1052 movss
%xmm3
,nb410_fix
(%esp
)
1053 movss
%xmm4
,nb410_fiy
(%esp
)
1054 movss
%xmm5
,nb410_fiz
(%esp
)
1057 movss
(%edi
,%eax
,4),%xmm3
1058 movss
4(%edi
,%eax
,4),%xmm4
1059 movss
8(%edi
,%eax
,4),%xmm5
1063 movss
%xmm3
,(%edi
,%eax
,4)
1064 movss
%xmm4
,4(%edi
,%eax
,4)
1065 movss
%xmm5
,8(%edi
,%eax
,4)
1066 _nb_kernel410_ia32_sse.nb410_updateouterdata
:
1067 movl nb410_ii3
(%esp
),%ecx
1068 movl nb410_faction
(%ebp
),%edi
1069 movl nb410_fshift
(%ebp
),%esi
1070 movl nb410_is3
(%esp
),%edx
1072 ## accumulate i forces in xmm0, xmm1, xmm2
1073 movaps nb410_fix
(%esp
),%xmm0
1074 movaps nb410_fiy
(%esp
),%xmm1
1075 movaps nb410_fiz
(%esp
),%xmm2
1082 addps
%xmm5
,%xmm2
## sum is in 1/2 in xmm0-xmm2
1088 shufps $
1,%xmm3
,%xmm3
1089 shufps $
1,%xmm4
,%xmm4
1090 shufps $
1,%xmm5
,%xmm5
1093 addss
%xmm5
,%xmm2
## xmm0-xmm2 has single force in pos0
1095 ## increment i force
1096 movss
(%edi
,%ecx
,4),%xmm3
1097 movss
4(%edi
,%ecx
,4),%xmm4
1098 movss
8(%edi
,%ecx
,4),%xmm5
1102 movss
%xmm3
,(%edi
,%ecx
,4)
1103 movss
%xmm4
,4(%edi
,%ecx
,4)
1104 movss
%xmm5
,8(%edi
,%ecx
,4)
1106 ## increment fshift force
1107 movss
(%esi
,%edx
,4),%xmm3
1108 movss
4(%esi
,%edx
,4),%xmm4
1109 movss
8(%esi
,%edx
,4),%xmm5
1113 movss
%xmm3
,(%esi
,%edx
,4)
1114 movss
%xmm4
,4(%esi
,%edx
,4)
1115 movss
%xmm5
,8(%esi
,%edx
,4)
1118 movl nb410_n
(%esp
),%esi
1119 ## get group index for i particle
1120 movl nb410_gid
(%ebp
),%edx
## base of gid[]
1121 movl
(%edx
,%esi
,4),%edx
## ggid=gid[n]
1123 ## accumulate total potential energy and update it
1124 movaps nb410_vctot
(%esp
),%xmm7
1127 addps
%xmm6
,%xmm7
## pos 0-1 in xmm7 have the sum now
1129 shufps $
1,%xmm6
,%xmm6
1132 ## add earlier value from mem
1133 movl nb410_Vc
(%ebp
),%eax
1134 addss
(%eax
,%edx
,4),%xmm7
1136 movss
%xmm7
,(%eax
,%edx
,4)
1138 ## accumulate total lj energy and update it
1139 movaps nb410_Vvdwtot
(%esp
),%xmm7
1142 addps
%xmm6
,%xmm7
## pos 0-1 in xmm7 have the sum now
1144 shufps $
1,%xmm6
,%xmm6
1147 ## add earlier value from mem
1148 movl nb410_Vvdw
(%ebp
),%eax
1149 addss
(%eax
,%edx
,4),%xmm7
1151 movss
%xmm7
,(%eax
,%edx
,4)
1153 ## accumulate dVda and update it
1154 movaps nb410_dvdasum
(%esp
),%xmm7
1157 addps
%xmm6
,%xmm7
## pos 0-1 in xmm7 have the sum now
1159 shufps $
1,%xmm6
,%xmm6
1162 movl nb410_ii
(%esp
),%edx
1163 movl nb410_dvda
(%ebp
),%eax
1164 addss
(%eax
,%edx
,4),%xmm7
1165 movss
%xmm7
,(%eax
,%edx
,4)
1168 movl nb410_nn1
(%esp
),%ecx
1169 ## esi already loaded with n
1172 jz _nb_kernel410_ia32_sse.nb410_outerend
1174 ## not last, iterate outer loop once more!
1175 movl
%esi
,nb410_n
(%esp
)
1176 jmp _nb_kernel410_ia32_sse.nb410_outer
## End of one work unit: either leave through nb410_end or jump back to
## nb410_threadloop to claim another work unit.
1177 _nb_kernel410_ia32_sse.nb410_outerend
:
1178 ## check if more outer neighborlists remain
## ecx = nri
1179 movl nb410_nri
(%esp
),%ecx
1180 ## esi already loaded with n above
## NOTE(review): the instruction that sets the flags for the jz below is
## not visible in this excerpt (embedded numbering skips 1181);
## presumably it compares ecx (nri) with esi (n) -- confirm against the
## complete file.
1182 jz _nb_kernel410_ia32_sse.nb410_end
1183 ## non-zero, do one more workunit
1184 jmp _nb_kernel410_ia32_sse.nb410_threadloop
## Common exit path: pick up the iteration totals and the outeriter /
## inneriter arguments, then reload the salign slot.
## NOTE(review): the stores through ecx/edx, the stack restore and the
## ret are not visible in this excerpt (gaps in the embedded numbering
## after 1191 and 1195) -- confirm against the complete file.
1185 _nb_kernel410_ia32_sse.nb410_end
:
## eax = nouter total, ebx = ninner total
1188 movl nb410_nouter
(%esp
),%eax
1189 movl nb410_ninner
(%esp
),%ebx
## ecx = outeriter argument, edx = inneriter argument
1190 movl nb410_outeriter
(%ebp
),%ecx
1191 movl nb410_inneriter
(%ebp
),%edx
## eax = the value saved in the salign slot at function entry
1195 movl nb410_salign
(%esp
),%eax
1209 .globl nb_kernel410nf_ia32_sse
1210 .globl _nb_kernel410nf_ia32_sse
1211 nb_kernel410nf_ia32_sse
:
1212 _nb_kernel410nf_ia32_sse
:
1213 .set nb410nf_p_nri, 8
1214 .set nb410nf_iinr, 12
1215 .set nb410nf_jindex, 16
1216 .set nb410nf_jjnr, 20
1217 .set nb410nf_shift, 24
1218 .set nb410nf_shiftvec, 28
1219 .set nb410nf_fshift, 32
1220 .set nb410nf_gid, 36
1221 .set nb410nf_pos, 40
1222 .set nb410nf_faction, 44
1223 .set nb410nf_charge, 48
1224 .set nb410nf_p_facel, 52
1225 .set nb410nf_argkrf, 56
1226 .set nb410nf_argcrf, 60
1228 .set nb410nf_type, 68
1229 .set nb410nf_p_ntype, 72
1230 .set nb410nf_vdwparam, 76
1231 .set nb410nf_Vvdw, 80
1232 .set nb410nf_p_tabscale, 84
1233 .set nb410nf_VFtab, 88
1234 .set nb410nf_invsqrta, 92
1235 .set nb410nf_dvda, 96
1236 .set nb410nf_p_gbtabscale, 100
1237 .set nb410nf_GBtab, 104
1238 .set nb410nf_p_nthreads, 108
1239 .set nb410nf_count, 112
1240 .set nb410nf_mtx, 116
1241 .set nb410nf_outeriter, 120
1242 .set nb410nf_inneriter, 124
1243 .set nb410nf_work, 128
1244 ## stack offsets for local variables
1245 ## bottom of stack is cache-aligned for sse use
1250 .set nb410nf_gbtsc, 64
1253 .set nb410nf_c12, 112
1254 .set nb410nf_vctot, 128
1255 .set nb410nf_Vvdwtot, 144
1256 .set nb410nf_half, 160
1257 .set nb410nf_three, 176
1258 .set nb410nf_isai, 192
1259 .set nb410nf_isaprod, 208
1260 .set nb410nf_gbscale, 224
1261 .set nb410nf_is3, 240
1262 .set nb410nf_ii3, 244
1263 .set nb410nf_ntia, 248
1264 .set nb410nf_innerjjnr, 252
1265 .set nb410nf_innerk, 256
1267 .set nb410nf_nn1, 264
1268 .set nb410nf_nri, 268
1269 .set nb410nf_facel, 272
1270 .set nb410nf_ntype, 276
1271 .set nb410nf_nouter, 280
1272 .set nb410nf_ninner, 284
1273 .set nb410nf_salign, 288
1282 subl $
292,%esp
## local stack space
1286 movl
%eax
,nb410nf_salign
(%esp
)
1290 ## Move args passed by reference to stack
1291 movl nb410nf_p_nri
(%ebp
),%ecx
1292 movl nb410nf_p_facel
(%ebp
),%esi
1293 movl nb410nf_p_ntype
(%ebp
),%edi
1297 movl
%ecx
,nb410nf_nri
(%esp
)
1298 movl
%esi
,nb410nf_facel
(%esp
)
1299 movl
%edi
,nb410nf_ntype
(%esp
)
1301 ## zero iteration counters
1303 movl
%eax
,nb410nf_nouter
(%esp
)
1304 movl
%eax
,nb410nf_ninner
(%esp
)
1307 movl nb410nf_p_gbtabscale
(%ebp
),%eax
1309 shufps $
0,%xmm5
,%xmm5
1310 movaps
%xmm5
,nb410nf_gbtsc
(%esp
)
1312 ## create constant floating-point factors on stack
1313 movl $
0x3f000000,%eax
## constant 0.5 in IEEE (hex)
1314 movl
%eax
,nb410nf_half
(%esp
)
1315 movss nb410nf_half
(%esp
),%xmm1
1316 shufps $
0,%xmm1
,%xmm1
## splat to all elements
1318 addps
%xmm2
,%xmm2
## constant 1.0
1320 addps
%xmm2
,%xmm2
## constant 2.0
1321 addps
%xmm2
,%xmm3
## constant 3.0
1322 movaps
%xmm1
,nb410nf_half
(%esp
)
1323 movaps
%xmm3
,nb410nf_three
(%esp
)
1325 _nb_kernel410nf_ia32_sse.nb410nf_threadloop
:
1326 movl nb410nf_count
(%ebp
),%esi
## pointer to sync counter
1328 _nb_kernel410nf_ia32_sse.nb410nf_spinlock
:
1329 movl
%eax
,%ebx
## ebx=*count=nn0
1330 addl $
1,%ebx
## ebx=nn1=nn0+1
1332 cmpxchgl
%ebx
,(%esi
) ## write nn1 to *counter,
1333 ## if it hasnt changed.
1334 ## or reread *counter to eax.
1335 pause
## -> better p4 performance
1336 jnz _nb_kernel410nf_ia32_sse.nb410nf_spinlock
1338 ## if(nn1>nri) nn1=nri
1339 movl nb410nf_nri
(%esp
),%ecx
1342 cmovlel
%edx
,%ebx
## if(nn1>nri) nn1=nri
1343 ## Cleared the spinlock if we got here.
1344 ## eax contains nn0, ebx contains nn1.
1345 movl
%eax
,nb410nf_n
(%esp
)
1346 movl
%ebx
,nb410nf_nn1
(%esp
)
1347 subl
%eax
,%ebx
## calc number of outer lists
1348 movl
%eax
,%esi
## copy n to esi
1349 jg _nb_kernel410nf_ia32_sse.nb410nf_outerstart
1350 jmp _nb_kernel410nf_ia32_sse.nb410nf_end
1352 _nb_kernel410nf_ia32_sse.nb410nf_outerstart
:
1353 ## ebx contains number of outer iterations
1354 addl nb410nf_nouter
(%esp
),%ebx
1355 movl
%ebx
,nb410nf_nouter
(%esp
)
1357 _nb_kernel410nf_ia32_sse.nb410nf_outer
:
1358 movl nb410nf_shift
(%ebp
),%eax
## eax = pointer into shift[]
1359 movl
(%eax
,%esi
,4),%ebx
## ebx=shift[n]
1361 leal
(%ebx
,%ebx
,2),%ebx
## ebx=3*is
1362 movl
%ebx
,nb410nf_is3
(%esp
) ## store is3
1364 movl nb410nf_shiftvec
(%ebp
),%eax
## eax = base of shiftvec[]
1366 movss
(%eax
,%ebx
,4),%xmm0
1367 movss
4(%eax
,%ebx
,4),%xmm1
1368 movss
8(%eax
,%ebx
,4),%xmm2
1370 movl nb410nf_iinr
(%ebp
),%ecx
## ecx = pointer into iinr[]
1371 movl
(%ecx
,%esi
,4),%ebx
## ebx =ii
1373 movl nb410nf_charge
(%ebp
),%edx
1374 movss
(%edx
,%ebx
,4),%xmm3
1375 mulss nb410nf_facel
(%esp
),%xmm3
1376 shufps $
0,%xmm3
,%xmm3
1378 movl nb410nf_invsqrta
(%ebp
),%edx
## load invsqrta[ii]
1379 movss
(%edx
,%ebx
,4),%xmm4
1380 shufps $
0,%xmm4
,%xmm4
1382 movl nb410nf_type
(%ebp
),%edx
1383 movl
(%edx
,%ebx
,4),%edx
1384 imull nb410nf_ntype
(%esp
),%edx
1386 movl
%edx
,nb410nf_ntia
(%esp
)
1388 leal
(%ebx
,%ebx
,2),%ebx
## ebx = 3*ii=ii3
1389 movl nb410nf_pos
(%ebp
),%eax
## eax = base of pos[]
1391 addss
(%eax
,%ebx
,4),%xmm0
1392 addss
4(%eax
,%ebx
,4),%xmm1
1393 addss
8(%eax
,%ebx
,4),%xmm2
1395 movaps
%xmm3
,nb410nf_iq
(%esp
)
1396 movaps
%xmm4
,nb410nf_isai
(%esp
)
1398 shufps $
0,%xmm0
,%xmm0
1399 shufps $
0,%xmm1
,%xmm1
1400 shufps $
0,%xmm2
,%xmm2
1402 movaps
%xmm0
,nb410nf_ix
(%esp
)
1403 movaps
%xmm1
,nb410nf_iy
(%esp
)
1404 movaps
%xmm2
,nb410nf_iz
(%esp
)
1406 movl
%ebx
,nb410nf_ii3
(%esp
)
1410 movaps
%xmm4
,nb410nf_vctot
(%esp
)
1411 movaps
%xmm4
,nb410nf_Vvdwtot
(%esp
)
1413 movl nb410nf_jindex
(%ebp
),%eax
1414 movl
(%eax
,%esi
,4),%ecx
## jindex[n]
1415 movl
4(%eax
,%esi
,4),%edx
## jindex[n+1]
1416 subl
%ecx
,%edx
## number of innerloop atoms
1418 movl nb410nf_pos
(%ebp
),%esi
1419 movl nb410nf_faction
(%ebp
),%edi
1420 movl nb410nf_jjnr
(%ebp
),%eax
1423 movl
%eax
,nb410nf_innerjjnr
(%esp
) ## pointer to jjnr[nj0]
1426 addl nb410nf_ninner
(%esp
),%ecx
1427 movl
%ecx
,nb410nf_ninner
(%esp
)
1429 movl
%edx
,nb410nf_innerk
(%esp
) ## number of innerloop atoms
1430 jge _nb_kernel410nf_ia32_sse.nb410nf_unroll_loop
1431 jmp _nb_kernel410nf_ia32_sse.nb410nf_finish_inner
1432 _nb_kernel410nf_ia32_sse.nb410nf_unroll_loop
:
1433 ## quad-unroll innerloop here
1434 movl nb410nf_innerjjnr
(%esp
),%edx
## pointer to jjnr[k]
1438 movl
12(%edx
),%edx
## eax-edx=jnr1-4
1439 addl $
16,nb410nf_innerjjnr
(%esp
) ## advance pointer (unrolled 4)
1442 movl nb410nf_invsqrta
(%ebp
),%esi
1443 movss
(%esi
,%eax
,4),%xmm3
1444 movss
(%esi
,%ecx
,4),%xmm4
1445 movss
(%esi
,%ebx
,4),%xmm6
1446 movss
(%esi
,%edx
,4),%xmm7
1447 movaps nb410nf_isai
(%esp
),%xmm2
1448 shufps $
0,%xmm6
,%xmm3
1449 shufps $
0,%xmm7
,%xmm4
1450 shufps $
136,%xmm4
,%xmm3
## constant 10001000 ;# all isaj in xmm3
1453 movaps
%xmm2
,nb410nf_isaprod
(%esp
)
1455 mulps nb410nf_gbtsc
(%esp
),%xmm1
1456 movaps
%xmm1
,nb410nf_gbscale
(%esp
)
1458 movl nb410nf_charge
(%ebp
),%esi
## base of charge[]
1460 movss
(%esi
,%eax
,4),%xmm3
1461 movss
(%esi
,%ecx
,4),%xmm4
1462 movss
(%esi
,%ebx
,4),%xmm6
1463 movss
(%esi
,%edx
,4),%xmm7
1465 mulps nb410nf_iq
(%esp
),%xmm2
1466 shufps $
0,%xmm6
,%xmm3
1467 shufps $
0,%xmm7
,%xmm4
1468 shufps $
136,%xmm4
,%xmm3
## constant 10001000 ;# all charges in xmm3
1470 movaps
%xmm3
,nb410nf_qq
(%esp
)
1477 movl nb410nf_type
(%ebp
),%esi
1478 movl
(%esi
,%eax
,4),%eax
1479 movl
(%esi
,%ebx
,4),%ebx
1480 movl
(%esi
,%ecx
,4),%ecx
1481 movl
(%esi
,%edx
,4),%edx
1482 movl nb410nf_vdwparam
(%ebp
),%esi
1487 movl nb410nf_ntia
(%esp
),%edi
1493 movlps
(%esi
,%eax
,4),%xmm6
1494 movlps
(%esi
,%ecx
,4),%xmm7
1495 movhps
(%esi
,%ebx
,4),%xmm6
1496 movhps
(%esi
,%edx
,4),%xmm7
1499 shufps $
136,%xmm7
,%xmm4
## constant 10001000
1500 shufps $
221,%xmm7
,%xmm6
## constant 11011101
1507 movaps
%xmm4
,nb410nf_c6
(%esp
)
1508 movaps
%xmm6
,nb410nf_c12
(%esp
)
1510 movl nb410nf_pos
(%ebp
),%esi
## base of pos[]
1512 leal
(%eax
,%eax
,2),%eax
## replace jnr with j3
1513 leal
(%ebx
,%ebx
,2),%ebx
1515 leal
(%ecx
,%ecx
,2),%ecx
## replace jnr with j3
1516 leal
(%edx
,%edx
,2),%edx
1518 ## move four coordinates to xmm0-xmm2
1520 movlps
(%esi
,%eax
,4),%xmm4
1521 movlps
(%esi
,%ecx
,4),%xmm5
1522 movss
8(%esi
,%eax
,4),%xmm2
1523 movss
8(%esi
,%ecx
,4),%xmm6
1525 movhps
(%esi
,%ebx
,4),%xmm4
1526 movhps
(%esi
,%edx
,4),%xmm5
1528 movss
8(%esi
,%ebx
,4),%xmm0
1529 movss
8(%esi
,%edx
,4),%xmm1
1531 shufps $
0,%xmm0
,%xmm2
1532 shufps $
0,%xmm1
,%xmm6
1537 shufps $
136,%xmm6
,%xmm2
## constant 10001000
1539 shufps $
136,%xmm5
,%xmm0
## constant 10001000
1540 shufps $
221,%xmm5
,%xmm1
## constant 11011101
1542 ## move ix-iz to xmm4-xmm6
1543 movaps nb410nf_ix
(%esp
),%xmm4
1544 movaps nb410nf_iy
(%esp
),%xmm5
1545 movaps nb410nf_iz
(%esp
),%xmm6
1561 ## lookup seed in xmm5
1564 movaps nb410nf_three
(%esp
),%xmm1
1565 mulps
%xmm4
,%xmm5
## rsq*lu*lu
1566 movaps nb410nf_half
(%esp
),%xmm0
1567 subps
%xmm5
,%xmm1
## constant 3.0-rsq*lu*lu
1569 mulps
%xmm1
,%xmm0
## xmm0=rinv
1570 mulps
%xmm0
,%xmm4
## xmm4=r
1571 mulps nb410nf_gbscale
(%esp
),%xmm4
1574 cvttps2pi
%xmm4
,%mm6
1575 cvttps2pi
%xmm5
,%mm7
## mm6/mm7 contain lu indices
1580 movaps
%xmm4
,%xmm1
## xmm1=eps
1582 mulps
%xmm2
,%xmm2
## xmm2=eps2
1591 movl nb410nf_GBtab
(%ebp
),%esi
1599 ## load coulomb table
1600 movaps
(%esi
,%eax
,4),%xmm4
1601 movaps
(%esi
,%ebx
,4),%xmm5
1602 movaps
(%esi
,%ecx
,4),%xmm6
1603 movaps
(%esi
,%edx
,4),%xmm7
1604 ## transpose, using xmm3 for scratch
1606 shufps $
0xEE,%xmm7
,%xmm3
1607 shufps $
0x44,%xmm7
,%xmm6
1609 shufps $
0xEE,%xmm5
,%xmm7
1610 shufps $
0x44,%xmm5
,%xmm4
1612 shufps $
0xDD,%xmm6
,%xmm5
1613 shufps $
0x88,%xmm6
,%xmm4
1615 shufps $
0x88,%xmm3
,%xmm6
1616 shufps $
0xDD,%xmm3
,%xmm7
1617 ## coulomb table ready, in xmm4-xmm7
1618 mulps
%xmm1
,%xmm6
## xmm6=Geps
1619 mulps
%xmm2
,%xmm7
## xmm7=Heps2
1622 addps
%xmm7
,%xmm5
## xmm5=Fp
1623 movaps nb410nf_qq
(%esp
),%xmm3
1624 mulps
%xmm1
,%xmm5
## xmm5=eps*Fp
1625 addps
%xmm4
,%xmm5
## xmm5=VV
1626 mulps
%xmm3
,%xmm5
## vcoul=qq*VV
1628 addps nb410nf_vctot
(%esp
),%xmm5
1629 movaps
%xmm5
,nb410nf_vctot
(%esp
)
1633 mulps
%xmm0
,%xmm4
## xmm4=rinvsq
1638 mulps
%xmm4
,%xmm6
## xmm6=rinvsix
1640 mulps
%xmm4
,%xmm4
## xmm4=rinvtwelve
1641 mulps nb410nf_c6
(%esp
),%xmm6
1642 mulps nb410nf_c12
(%esp
),%xmm4
1643 movaps nb410nf_Vvdwtot
(%esp
),%xmm7
1646 movaps
%xmm7
,nb410nf_Vvdwtot
(%esp
)
1648 ## should we do one more iteration?
1649 subl $
4,nb410nf_innerk
(%esp
)
1650 jl _nb_kernel410nf_ia32_sse.nb410nf_finish_inner
1651 jmp _nb_kernel410nf_ia32_sse.nb410nf_unroll_loop
1652 _nb_kernel410nf_ia32_sse.nb410nf_finish_inner
:
1653 ## check if at least two particles remain
1654 addl $
4,nb410nf_innerk
(%esp
)
1655 movl nb410nf_innerk
(%esp
),%edx
1657 jnz _nb_kernel410nf_ia32_sse.nb410nf_dopair
1658 jmp _nb_kernel410nf_ia32_sse.nb410nf_checksingle
1659 _nb_kernel410nf_ia32_sse.nb410nf_dopair
:
1660 movl nb410nf_innerjjnr
(%esp
),%ecx
1663 addl $
8,nb410nf_innerjjnr
(%esp
)
1669 movl nb410nf_invsqrta
(%ebp
),%esi
1670 movss
(%esi
,%eax
,4),%xmm2
1671 movss
(%esi
,%ebx
,4),%xmm3
1672 unpcklps
%xmm3
,%xmm2
## isa2 in xmm2(0,1)
1673 mulps nb410nf_isai
(%esp
),%xmm2
1674 movaps
%xmm2
,nb410nf_isaprod
(%esp
)
1676 mulps nb410nf_gbtsc
(%esp
),%xmm1
1677 movaps
%xmm1
,nb410nf_gbscale
(%esp
)
1679 movl nb410nf_charge
(%ebp
),%esi
## base of charge[]
1680 movss
(%esi
,%eax
,4),%xmm3
1681 movss
(%esi
,%ebx
,4),%xmm6
1682 unpcklps
%xmm6
,%xmm3
## xmm3(0,1) has the charges
1684 mulps nb410nf_iq
(%esp
),%xmm2
1686 movaps
%xmm3
,nb410nf_qq
(%esp
)
1688 movl nb410nf_type
(%ebp
),%esi
1691 movl
(%esi
,%ecx
,4),%ecx
1692 movl
(%esi
,%edx
,4),%edx
1693 movl nb410nf_vdwparam
(%ebp
),%esi
1696 movl nb410nf_ntia
(%esp
),%edi
1699 movlps
(%esi
,%ecx
,4),%xmm6
1700 movhps
(%esi
,%edx
,4),%xmm6
1701 movl nb410nf_pos
(%ebp
),%edi
1704 shufps $
8,%xmm4
,%xmm4
## constant 00001000
1705 shufps $
13,%xmm6
,%xmm6
## constant 00001101
1709 movaps
%xmm4
,nb410nf_c6
(%esp
)
1710 movaps
%xmm6
,nb410nf_c12
(%esp
)
1712 leal
(%eax
,%eax
,2),%eax
1713 leal
(%ebx
,%ebx
,2),%ebx
1714 ## move coordinates to xmm0-xmm2
1715 movlps
(%edi
,%eax
,4),%xmm1
1716 movss
8(%edi
,%eax
,4),%xmm2
1717 movhps
(%edi
,%ebx
,4),%xmm1
1718 movss
8(%edi
,%ebx
,4),%xmm0
1722 shufps $
0,%xmm0
,%xmm2
1726 shufps $
136,%xmm2
,%xmm2
## constant 10001000
1728 shufps $
136,%xmm0
,%xmm0
## constant 10001000
1729 shufps $
221,%xmm1
,%xmm1
## constant 11011101
1731 movl nb410nf_faction
(%ebp
),%edi
1732 ## move ix-iz to xmm4-xmm6
1735 movaps nb410nf_ix
(%esp
),%xmm4
1736 movaps nb410nf_iy
(%esp
),%xmm5
1737 movaps nb410nf_iz
(%esp
),%xmm6
1753 ## lookup seed in xmm5
1756 movaps nb410nf_three
(%esp
),%xmm1
1757 mulps
%xmm4
,%xmm5
## rsq*lu*lu
1758 movaps nb410nf_half
(%esp
),%xmm0
1759 subps
%xmm5
,%xmm1
## constant 3.0-rsq*lu*lu
1761 mulps
%xmm1
,%xmm0
## xmm0=rinv
1762 mulps
%xmm0
,%xmm4
## xmm4=r
1763 mulps nb410nf_gbscale
(%esp
),%xmm4
1765 cvttps2pi
%xmm4
,%mm6
## mm6 contain lu indices
1768 movaps
%xmm4
,%xmm1
## xmm1=eps
1770 mulps
%xmm2
,%xmm2
## xmm2=eps2
1774 movl nb410nf_GBtab
(%ebp
),%esi
1779 ## load coulomb table
1780 movaps
(%esi
,%ecx
,4),%xmm4
1781 movaps
(%esi
,%edx
,4),%xmm7
1782 ## transpose, using xmm3 for scratch
1784 unpcklps
%xmm7
,%xmm4
## Y1 Y2 F1 F2
1785 unpckhps
%xmm7
,%xmm6
## G1 G2 H1 H2
1786 movhlps
%xmm4
,%xmm5
## F1 F2
1787 movhlps
%xmm6
,%xmm7
## H1 H2
1788 ## coulomb table ready, in xmm4-xmm7
1790 mulps
%xmm1
,%xmm6
## xmm6=Geps
1791 mulps
%xmm2
,%xmm7
## xmm7=Heps2
1793 addps
%xmm7
,%xmm5
## xmm5=Fp
1794 movaps nb410nf_qq
(%esp
),%xmm3
1795 mulps
%xmm1
,%xmm5
## xmm5=eps*Fp
1796 addps
%xmm4
,%xmm5
## xmm5=VV
1797 mulps
%xmm3
,%xmm5
## vcoul=qq*VV
1799 addps nb410nf_vctot
(%esp
),%xmm5
1800 movaps
%xmm5
,nb410nf_vctot
(%esp
)
1804 mulps
%xmm0
,%xmm4
## xmm4=rinvsq
1806 ## at this point mm5 contains vcoul and mm3 fijC
1807 ## increment vcoul - then we can get rid of mm5
1813 mulps
%xmm4
,%xmm6
## xmm6=rinvsix
1815 mulps
%xmm4
,%xmm4
## xmm4=rinvtwelve
1816 mulps nb410nf_c6
(%esp
),%xmm6
1817 mulps nb410nf_c12
(%esp
),%xmm4
1818 movaps nb410nf_Vvdwtot
(%esp
),%xmm7
1821 movaps
%xmm7
,nb410nf_Vvdwtot
(%esp
)
1823 _nb_kernel410nf_ia32_sse.nb410nf_checksingle
:
1824 movl nb410nf_innerk
(%esp
),%edx
1826 jnz _nb_kernel410nf_ia32_sse.nb410nf_dosingle
1827 jmp _nb_kernel410nf_ia32_sse.nb410nf_updateouterdata
1828 _nb_kernel410nf_ia32_sse.nb410nf_dosingle
:
1829 movl nb410nf_charge
(%ebp
),%esi
1830 movl nb410nf_invsqrta
(%ebp
),%edx
1831 movl nb410nf_pos
(%ebp
),%edi
1832 movl nb410nf_innerjjnr
(%esp
),%ecx
1836 movss
(%edx
,%eax
,4),%xmm2
## isa2
1837 mulss nb410nf_isai
(%esp
),%xmm2
1838 movss
%xmm2
,nb410nf_isaprod
(%esp
)
1840 mulss nb410nf_gbtsc
(%esp
),%xmm1
1841 movss
%xmm1
,nb410nf_gbscale
(%esp
)
1843 mulss nb410nf_iq
(%esp
),%xmm2
1844 movss
(%esi
,%eax
,4),%xmm6
## xmm6(0) has the charge
1846 movss
%xmm6
,nb410nf_qq
(%esp
)
1848 movl nb410nf_type
(%ebp
),%esi
1850 movl
(%esi
,%ecx
,4),%ecx
1851 movl nb410nf_vdwparam
(%ebp
),%esi
1853 addl nb410nf_ntia
(%esp
),%ecx
1854 movlps
(%esi
,%ecx
,4),%xmm6
1856 shufps $
252,%xmm4
,%xmm4
## constant 11111100
1857 shufps $
253,%xmm6
,%xmm6
## constant 11111101
1859 movaps
%xmm4
,nb410nf_c6
(%esp
)
1860 movaps
%xmm6
,nb410nf_c12
(%esp
)
1862 leal
(%eax
,%eax
,2),%eax
1864 ## move coordinates to xmm0-xmm2
1865 movss
(%edi
,%eax
,4),%xmm0
1866 movss
4(%edi
,%eax
,4),%xmm1
1867 movss
8(%edi
,%eax
,4),%xmm2
1869 movaps nb410nf_ix
(%esp
),%xmm4
1870 movaps nb410nf_iy
(%esp
),%xmm5
1871 movaps nb410nf_iz
(%esp
),%xmm6
1887 ## lookup seed in xmm5
1890 movss nb410nf_three
(%esp
),%xmm1
1891 mulss
%xmm4
,%xmm5
## rsq*lu*lu
1892 movss nb410nf_half
(%esp
),%xmm0
1893 subss
%xmm5
,%xmm1
## constant 3.0-rsq*lu*lu
1895 mulss
%xmm1
,%xmm0
## xmm0=rinv
1897 mulss
%xmm0
,%xmm4
## xmm4=r
1898 mulss nb410nf_gbscale
(%esp
),%xmm4
1900 cvttss2si
%xmm4
,%ebx
## ebx contains the lu index
1903 movaps
%xmm4
,%xmm1
## xmm1=eps
1905 mulss
%xmm2
,%xmm2
## xmm2=eps2
1908 movl nb410nf_GBtab
(%ebp
),%esi
1910 movaps
(%esi
,%ebx
,4),%xmm4
1914 shufps $
1,%xmm5
,%xmm5
1915 shufps $
1,%xmm7
,%xmm7
1916 ## table ready in xmm4-xmm7
1918 mulss
%xmm1
,%xmm6
## xmm6=Geps
1919 mulss
%xmm2
,%xmm7
## xmm7=Heps2
1921 addss
%xmm7
,%xmm5
## xmm5=Fp
1922 movss nb410nf_qq
(%esp
),%xmm3
1923 mulss
%xmm1
,%xmm5
## xmm5=eps*Fp
1924 addss
%xmm4
,%xmm5
## xmm5=VV
1925 mulss
%xmm3
,%xmm5
## vcoul=qq*VV
1926 addss nb410nf_vctot
(%esp
),%xmm5
1927 movss
%xmm5
,nb410nf_vctot
(%esp
)
1931 mulss
%xmm0
,%xmm4
## xmm4=rinvsq
1936 mulss
%xmm4
,%xmm6
## xmm6=rinvsix
1938 mulss
%xmm4
,%xmm4
## xmm4=rinvtwelve
1939 mulss nb410nf_c6
(%esp
),%xmm6
1940 mulss nb410nf_c12
(%esp
),%xmm4
1941 movss nb410nf_Vvdwtot
(%esp
),%xmm7
1944 movss
%xmm7
,nb410nf_Vvdwtot
(%esp
)
1946 _nb_kernel410nf_ia32_sse.nb410nf_updateouterdata
:
1948 movl nb410nf_n
(%esp
),%esi
1949 ## get group index for i particle
1950 movl nb410nf_gid
(%ebp
),%edx
## base of gid[]
1951 movl
(%edx
,%esi
,4),%edx
## ggid=gid[n]
1953 ## accumulate total potential energy and update it
1954 movaps nb410nf_vctot
(%esp
),%xmm7
1957 addps
%xmm6
,%xmm7
## pos 0-1 in xmm7 have the sum now
1959 shufps $
1,%xmm6
,%xmm6
1962 ## add earlier value from mem
1963 movl nb410nf_Vc
(%ebp
),%eax
1964 addss
(%eax
,%edx
,4),%xmm7
1966 movss
%xmm7
,(%eax
,%edx
,4)
1968 ## accumulate total lj energy and update it
1969 movaps nb410nf_Vvdwtot
(%esp
),%xmm7
1972 addps
%xmm6
,%xmm7
## pos 0-1 in xmm7 have the sum now
1974 shufps $
1,%xmm6
,%xmm6
1977 ## add earlier value from mem
1978 movl nb410nf_Vvdw
(%ebp
),%eax
1979 addss
(%eax
,%edx
,4),%xmm7
1981 movss
%xmm7
,(%eax
,%edx
,4)
1984 movl nb410nf_nn1
(%esp
),%ecx
1985 ## esi already loaded with n
1988 jz _nb_kernel410nf_ia32_sse.nb410nf_outerend
1990 ## not last, iterate outer loop once more!
1991 movl
%esi
,nb410nf_n
(%esp
)
1992 jmp _nb_kernel410nf_ia32_sse.nb410nf_outer
1993 _nb_kernel410nf_ia32_sse.nb410nf_outerend
:
1994 ## check if more outer neighborlists remain
1995 movl nb410nf_nri
(%esp
),%ecx
1996 ## esi already loaded with n above
1998 jz _nb_kernel410nf_ia32_sse.nb410nf_end
1999 ## non-zero, do one more workunit
2000 jmp _nb_kernel410nf_ia32_sse.nb410nf_threadloop
2001 _nb_kernel410nf_ia32_sse.nb410nf_end
:
2004 movl nb410nf_nouter
(%esp
),%eax
2005 movl nb410nf_ninner
(%esp
),%ebx
2006 movl nb410nf_outeriter
(%ebp
),%ecx
2007 movl nb410nf_inneriter
(%ebp
),%edx
2011 movl nb410nf_salign
(%esp
),%eax