4 ;
# This source code is part of
8 ;
# GROningen MAchine for Chemical Simulations
11 ;
# Copyright (c) 1991-2001, University of Groningen, The Netherlands
12 ;
# This program is free software; you can redistribute it and/or
13 ;
# modify it under the terms of the GNU General Public License
14 ;
# as published by the Free Software Foundation; either version 2
15 ;
# of the License, or (at your option) any later version.
17 ;
# If you want to redistribute modifications, please consider that
18 ;
# scientific software is very special. Version control is crucial -
19 ;
# bugs must be traceable. We will be happy to consider code for
20 ;
# inclusion in the official distribution, but derived work must not
21 ;
# be called official GROMACS. Details are found in the README & COPYING
22 ;
# files - if they are missing, get the official version at www.gromacs.org.
24 ;
# To help us fund GROMACS development, we humbly ask that you cite
25 ;
# the papers on the package - you can find them in the top README file.
27 ;
# For more info, check our website at http://www.gromacs.org
30 ;
# Gnomes, ROck Monsters And Chili Sauce
33 ;
# This file contains a subset of the gromacs innerloops
34 ;
# manually written in assembly to optimize performance
35 ;
# on AMD extended 3DNow-enabled processors like Athlon
36 ;
# and later generations.
37 ;
# Erik Lindahl, 2000-2001, erik@theophys.kth.se
40 ;
# These files require GNU binutils 2.10 or later, since we
41 ;
# use intel syntax for portability, or a recent version
42 ;
# of NASM that understands Extended 3DNow and SSE2 instructions.
43 ;
# (NASM is normally only used with MS Visual C++).
45 ;
# Since NASM and gnu as disagree on some definitions and use
46 ;
# completely different preprocessing options I have to introduce a
47 ;
# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
48 ;
# Gnu as treats ';' as a line break, i.e. ignores it. This is the
49 ;
# reason why all comments need both symbols...
50 ;
# The source is written for GNU as, with intel syntax. When you use
51 ;
# NASM we redefine a couple of things. The false if-statement around
52 ;
# the following code is seen by GNU as (NASM doesn't understand this
53 ;
# if syntax), but NASM doesn't see it, so the code inside is only
54 ;
# read by NASM (NASM doesn't understand .if):
56 ;
.if 0 # block below only read by NASM
57 %define
.section section
61 ;
# NASM wants 'dword' only, not 'dword ptr'.
66 ;
.endif # End of NASM-specific block
68 ;
.intel_syntax noprefix # Line only read by gnu as
85 .globl check3dnow ;# try to issue an Extended 3DNow instruction
96 .globl _vecrecip_3dnow
138 jmp short
.vecrecip_mainloop
153 jmp short
.vecrecip_tailloop
;# vecinvsqrt_3dnow is exported under both the plain and the underscore-prefixed
;# symbol name. Only the loop-control skeleton of the routine is present in
;# this excerpt: the arithmetic, the setup of ecx and the .vecinvsqrt_tail /
;# .vecinvsqrt_end labels are not shown here.
;# NOTE(review): ecx presumably holds the remaining iteration count - confirm
;# against the full routine.
164 .globl vecinvsqrt_3dnow
165 .globl _vecinvsqrt_3dnow
;# jecxz branches when ecx == 0, so the main loop is bypassed in that case
180 jecxz
.vecinvsqrt_tail
182 .vecinvsqrt_mainloop:
;# bottom of the main loop: leave for the tail part when ecx == 0,
;# otherwise jump back and run the main loop again
210 jecxz
.vecinvsqrt_tail
211 jmp short
.vecinvsqrt_mainloop
;# same pattern for the tail loop: skip it completely when ecx == 0
215 jecxz
.vecinvsqrt_end
216 .vecinvsqrt_tailloop:
;# bottom of the tail loop: finish when ecx == 0, otherwise repeat
227 jecxz
.vecinvsqrt_end
228 jmp short
.vecinvsqrt_tailloop
;# ---------------------------------------------------------------------------
;# inl0100_3dnow: symbol export and frame-offset tables.
;#
;# The first table lists displacements that the routine adds to ebp to reach
;# its incoming arguments, e.g. [ebp + i0100_shift]. The second table lists
;# displacements added to esp to reach local scratch storage, e.g.
;# [esp + i0100_ntia]; the routine reserves that area with "sub esp, 108".
;#
;# The routine also uses i0100_gid, i0100_pos, i0100_Vnb and i0100_nri
;# (ebp-relative) and i0100_is3, i0100_ii3, i0100_ix, i0100_iz, i0100_six,
;# i0100_c6, i0100_c12, i0100_fix, i0100_fiz, i0100_dx1, i0100_dz1 and
;# i0100_dx2 (esp-relative); their definitions are not part of this excerpt.
;# ---------------------------------------------------------------------------
240 .globl _inl0100_3dnow
;# ebp-relative offsets of the arguments
244 .equiv i0100_iinr, 12 ;# pointer into iinr[]; read for ii, then advanced by 4
245 .equiv i0100_jindex, 16 ;# pointer into jindex[]; entries n and n+1 are read, then advanced by 4
246 .equiv i0100_jjnr, 20 ;# jjnr[] pointer, loaded before innerjjnr is set up
247 .equiv i0100_shift, 24 ;# pointer into shift[]; read, then advanced by 4
248 .equiv i0100_shiftvec, 28 ;# base of shiftvec[], indexed with 3*is floats
249 .equiv i0100_fshift, 32 ;# base of fshift[], incremented with the i force at 3*is
252 .equiv i0100_faction, 44 ;# base of faction[], kept in edi
253 .equiv i0100_type, 48 ;# base of type[], indexed by particle number
254 .equiv i0100_ntype, 52 ;# multiplied with type[ii] to form ntia
255 .equiv i0100_nbfp, 56 ;# base of nbfp[], source of the c6/c12 pairs
257 ;
# stack offsets for local variables
263 .equiv i0100_vnbtot, 20 ;# 8-byte slot (movq): two packed partial sums, folded with pfacc at the end
267 .equiv i0100_twelve, 52 ;# 8-byte copy of the mm_twelve constant
268 .equiv i0100_ntia, 60 ;# type[ii] * ntype for the current i particle
269 .equiv i0100_innerjjnr, 64 ;# pointer to the current jjnr entry; +8 per unrolled iteration
270 .equiv i0100_innerk, 68 ;# inner-loop counter; reduced by 2 per unrolled pass, bit 0 selects the single-particle pass
278 .equiv i0100_dy2, 100 ;# NOTE(review): presumably the upper half of the movq stored at i0100_dx2 - confirm
279 .equiv i0100_dz2, 104 ;# dz of the second j particle (movd); last slot, 104 + 4 = 108 bytes of locals
288 sub esp
, 108 ;
# local stack space
290 ;
# move data to local stack
292 movq mm1
, [mm_twelve
]
293 movq
[esp
+ i0100_six
], mm0
294 movq
[esp
+ i0100_twelve
], mm1
295 ;
# assume we have at least one i particle - start directly
297 mov eax
, [ebp
+ i0100_shift
] ;
# eax = pointer into shift[]
298 mov ebx
, [eax
] ;
# ebx=shift[n]
299 add dword ptr
[ebp
+ i0100_shift
], 4 ;
# advance pointer one step
301 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
302 mov
[esp
+ i0100_is3
],ebx ;
# store is3
304 mov eax
, [ebp
+ i0100_shiftvec
] ;
# eax = base of shiftvec[]
306 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1.
307 movd mm1
, [eax
+ ebx
*4 + 8]
309 mov ecx
, [ebp
+ i0100_iinr
] ;
# ecx = pointer into iinr[]
310 add dword ptr
[ebp
+ i0100_iinr
], 4 ;
# advance pointer
311 mov ebx
, [ecx
] ;
# ebx =ii
313 mov edx
, [ebp
+ i0100_type
]
314 mov edx
, [edx
+ ebx
*4]
315 imul edx
, [ebp
+ i0100_ntype
]
317 mov
[esp
+ i0100_ntia
], edx
319 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
320 mov eax
, [ebp
+ i0100_pos
] ;
# eax = base of pos[]
322 pfadd mm0
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
323 movd mm3
, [eax
+ ebx
*4 + 8] ;
# can't use a direct memory add for 4 bytes (iz)
324 mov
[esp
+ i0100_ii3
], ebx
326 movq
[esp
+ i0100_ix
], mm0
327 movd
[esp
+ i0100_iz
], mm1
329 ;
# clear total potential and i forces
331 movq
[esp
+ i0100_vnbtot
], mm7
332 movq
[esp
+ i0100_fix
], mm7
333 movd
[esp
+ i0100_fiz
], mm7
335 mov eax
, [ebp
+ i0100_jindex
]
336 mov ecx
, [eax
] ;
# jindex[n]
337 mov edx
, [eax
+ 4] ;
# jindex[n+1]
338 add dword ptr
[ebp
+ i0100_jindex
], 4
339 sub edx
, ecx ;
# number of innerloop atoms
341 mov esi
, [ebp
+ i0100_pos
]
342 mov edi
, [ebp
+ i0100_faction
]
343 mov eax
, [ebp
+ i0100_jjnr
]
346 mov
[esp
+ i0100_innerjjnr
], eax ;
# pointer to jjnr[nj0]
348 mov
[esp
+ i0100_innerk
], edx ;
# number of innerloop atoms
349 jge
.i0100_unroll_loop
350 jmp
.i0100_finish_inner
352 ;
# paired innerloop starts here
353 mov ecx
, [esp
+ i0100_innerjjnr
] ;
# pointer to jjnr[k]
355 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
356 add dword ptr
[esp
+ i0100_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
357 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
359 mov ecx
, [ebp
+ i0100_type
]
360 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
361 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
363 mov esi
, [ebp
+ i0100_nbfp
] ;
# base of nbfp
366 add edx
, [esp
+ i0100_ntia
] ;
# tja = ntia + 2*type
367 add ecx
, [esp
+ i0100_ntia
]
369 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
370 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
372 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
373 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
374 movq
[esp
+ i0100_c6
], mm5
375 movq
[esp
+ i0100_c12
], mm6
377 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
378 lea ebx
, [ebx
+ ebx
*2]
380 mov esi
, [ebp
+ i0100_pos
]
382 movq mm0
, [esp
+ i0100_ix
]
383 movd mm1
, [esp
+ i0100_iz
]
384 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
385 movd mm5
, [esi
+ eax
*4 + 8]
386 pfsubr mm4
,mm0 ;
# dr = ir - jr
388 movq
[esp
+ i0100_dx1
], mm4 ;
# store dr
389 movd
[esp
+ i0100_dz1
], mm5
390 pfmul mm4
,mm4 ;
# square dx,dy,dz
392 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
393 pfacc mm4
, mm5 ;
# first rsq in lower mm4
395 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
396 movd mm7
, [esi
+ ebx
*4 + 8]
398 pfsubr mm6
,mm0 ;
# dr = ir - jr
400 movq
[esp
+ i0100_dx2
], mm6 ;
# store dr
401 movd
[esp
+ i0100_dz2
], mm7
402 pfmul mm6
,mm6 ;
# square dx,dy,dz
404 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
405 pfacc mm6
, mm7 ;
# second rsq in lower mm6
407 pfrcp mm0
, mm4 ;
# lookup reciprocal seed
411 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
412 ;
# amd 3dnow N-R iteration to get full precision.
415 ;
# mm4 now contains invsq,
416 ;
# do potential and fscal
420 pfmul mm4
, mm0 ;
# mm4=rinvsix
422 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
424 pfmul mm5
, [esp
+ i0100_c12
]
425 pfmul mm4
, [esp
+ i0100_c6
]
426 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
429 pfmul mm4
, [esp
+ i0100_six
]
431 pfmul mm5
, [esp
+ i0100_twelve
]
433 pfmul mm0
, mm5 ;
# mm0 is total fscal now
435 prefetchw
[esp
+ i0100_dx1
] ;
# prefetch i forces to cache
437 ;
# spread fscalar to both positions
443 prefetchw
[edi
+ eax
*4] ;
# prefetch the 1st faction to cache
444 movq mm2
, [esp
+ i0100_dx1
] ;
# fetch dr
445 movd mm3
, [esp
+ i0100_dz1
]
448 pfadd mm6
, [esp
+ i0100_vnbtot
] ;
# add the earlier value
449 movq
[esp
+ i0100_vnbtot
], mm6 ;
# store the sum
451 prefetchw
[edi
+ ebx
*4] ;
# prefetch the 2nd faction to cache
452 pfmul mm2
, mm0 ;
# mult by fs
455 movq mm4
, [esp
+ i0100_dx2
] ;
# fetch dr
456 movd mm5
, [esp
+ i0100_dz2
]
457 pfmul mm4
, mm1 ;
# mult by fs
461 movq mm0
, [esp
+ i0100_fix
]
462 movd mm1
, [esp
+ i0100_fiz
]
468 movq
[esp
+ i0100_fix
], mm0
469 movd
[esp
+ i0100_fiz
], mm1
472 movq mm0
, [edi
+ eax
*4]
473 movd mm1
, [edi
+ eax
*4 + 8]
474 movq mm6
, [edi
+ ebx
*4]
475 movd mm7
, [edi
+ ebx
*4 + 8]
482 movq
[edi
+ eax
*4], mm0
483 movd
[edi
+ eax
*4 +8], mm1
484 movq
[edi
+ ebx
*4], mm6
485 movd
[edi
+ ebx
*4 + 8], mm7
487 ;
# should we do one more iteration?
488 sub dword ptr
[esp
+ i0100_innerk
], 2
489 jl
.i0100_finish_inner
490 jmp
.i0100_unroll_loop
492 and dword ptr
[esp
+ i0100_innerk
], 1
493 jnz
.i0100_single_inner
494 jmp
.i0100_updateouterdata
496 ;
# a single j particle iteration here - compare with the unrolled code for comments
497 mov eax
, [esp
+ i0100_innerjjnr
]
498 mov eax
, [eax
] ;
# eax=jnr offset
500 mov esi
, [ebp
+ i0100_nbfp
]
501 mov ecx
, [ebp
+ i0100_type
]
502 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
504 add edx
, [esp
+ i0100_ntia
] ;
# tja = ntia + 2*type
505 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
506 movq
[esp
+ i0100_c6
], mm5
507 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
508 movq
[esp
+ i0100_c12
], mm5
510 mov esi
, [ebp
+ i0100_pos
]
511 lea eax
, [eax
+ eax
*2]
513 movq mm0
, [esp
+ i0100_ix
]
514 movd mm1
, [esp
+ i0100_iz
]
515 movq mm4
, [esi
+ eax
*4]
516 movd mm5
, [esi
+ eax
*4 + 8]
519 movq
[esp
+ i0100_dx1
], mm4
521 movd
[esp
+ i0100_dz1
], mm5
524 pfacc mm4
, mm5 ;
# mm4=rsq
528 pfrcpit2 mm4
,mm0 ;
# mm4=invsq
529 ;
# calculate potentials and scalar force
533 pfmul mm4
, mm0 ;
# mm4=rinvsix
535 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
537 pfmul mm5
, [esp
+ i0100_c12
]
538 pfmul mm4
, [esp
+ i0100_c6
]
539 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
542 pfmul mm4
, [esp
+ i0100_six
]
544 pfmul mm5
, [esp
+ i0100_twelve
]
546 pfmul mm0
, mm5 ;
# mm0 is total fscal now
549 pfadd mm6
, [esp
+ i0100_vnbtot
] ;
# add the earlier value
550 movq
[esp
+ i0100_vnbtot
], mm6 ;
# store the sum
552 ;
# spread fscalar to both positions
554 ;
# calc vectorial force
555 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
556 movq mm2
, [esp
+ i0100_dx1
]
557 movd mm3
, [esp
+ i0100_dz1
]
562 ;
# update i particle force
563 movq mm0
, [esp
+ i0100_fix
]
564 movd mm1
, [esp
+ i0100_fiz
]
567 movq
[esp
+ i0100_fix
], mm0
568 movd
[esp
+ i0100_fiz
], mm1
569 ;
# update j particle force
570 movq mm0
, [edi
+ eax
*4]
571 movd mm1
, [edi
+ eax
*4+ 8]
574 movq
[edi
+ eax
*4], mm0
575 movd
[edi
+ eax
*4 +8], mm1
577 .i0100_updateouterdata:
578 mov ecx
, [esp
+ i0100_ii3
]
580 movq mm6
, [edi
+ ecx
*4] ;
# increment i force
581 movd mm7
, [edi
+ ecx
*4 + 8]
582 pfadd mm6
, [esp
+ i0100_fix
]
583 pfadd mm7
, [esp
+ i0100_fiz
]
584 movq
[edi
+ ecx
*4], mm6
585 movd
[edi
+ ecx
*4 +8], mm7
587 mov ebx
, [ebp
+ i0100_fshift
] ;
# increment fshift force
588 mov edx
, [esp
+ i0100_is3
]
590 movq mm6
, [ebx
+ edx
*4]
591 movd mm7
, [ebx
+ edx
*4 + 8]
592 pfadd mm6
, [esp
+ i0100_fix
]
593 pfadd mm7
, [esp
+ i0100_fiz
]
594 movq
[ebx
+ edx
*4], mm6
595 movd
[ebx
+ edx
*4 + 8], mm7
597 mov edx
, [ebp
+ i0100_gid
] ;
# get group index for this i particle
599 add dword ptr
[ebp
+ i0100_gid
], 4 ;
# advance pointer
601 movq mm7
, [esp
+ i0100_vnbtot
]
602 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
604 mov eax
, [ebp
+ i0100_Vnb
]
605 movd mm6
, [eax
+ edx
*4]
607 movd
[eax
+ edx
*4], mm6 ;
# increment vnb[gid]
610 mov ecx
, [ebp
+ i0100_nri
]
613 ;
# not last, iterate once more!
614 mov
[ebp
+ i0100_nri
], ecx
;# ---------------------------------------------------------------------------
;# inl0110_3dnow: symbol export and frame-offset tables.
;#
;# The first table lists displacements added to ebp to reach the incoming
;# arguments, e.g. [ebp + i0110_nsatoms]. The second table lists displacements
;# added to esp to reach local scratch storage; the routine reserves that area
;# with "sub esp, 144".
;#
;# The routine also uses i0110_gid, i0110_pos, i0110_Vnb and i0110_nri
;# (ebp-relative) and i0110_is3, i0110_ii3, i0110_ix, i0110_iz, i0110_shX,
;# i0110_shZ, i0110_six, i0110_c6, i0110_c12 and i0110_fix (esp-relative);
;# their definitions are not part of this excerpt.
;# ---------------------------------------------------------------------------
634 .globl _inl0110_3dnow
;# ebp-relative offsets of the arguments
638 .equiv i0110_iinr, 12 ;# pointer into iinr[]; read for ii, then advanced by 4
639 .equiv i0110_jindex, 16 ;# pointer into jindex[]; entries n and n+1 are read, then advanced by 4
640 .equiv i0110_jjnr, 20 ;# jjnr[] pointer, loaded before innerjjnr0 is set up
641 .equiv i0110_shift, 24 ;# pointer into shift[]; read, then advanced by 4
642 .equiv i0110_shiftvec, 28 ;# base of shiftvec[], indexed with 3*is floats
643 .equiv i0110_fshift, 32 ;# base of fshift[], incremented with the i force at 3*is
646 .equiv i0110_faction, 44 ;# base of faction[], kept in edi
647 .equiv i0110_type, 48 ;# base of type[], indexed by particle number
648 .equiv i0110_ntype, 52 ;# multiplied with type[ii] to form ntia
649 .equiv i0110_nbfp, 56 ;# base of nbfp[], source of the c6/c12 pairs
651 .equiv i0110_nsatoms, 64 ;# pointer into nsatoms[]; advanced by 12 bytes per outer iteration
652 ;
# stack offsets for local variables
661 .equiv i0110_vnbtot, 32 ;# 8-byte slot (movq): two packed partial sums, folded with pfacc at the end
665 .equiv i0110_twelve, 64 ;# 8-byte copy of the mm_twelve constant
666 .equiv i0110_ntia, 72 ;# type[atom] * ntype for the current i atom
667 .equiv i0110_innerjjnr0, 76 ;# saved start of the j list; copied to innerjjnr before each inner loop
668 .equiv i0110_innerk0, 80 ;# saved j count; copied to innerk before each inner loop
669 .equiv i0110_innerjjnr, 84 ;# pointer to the current jjnr entry; +8 per unrolled iteration
670 .equiv i0110_innerk, 88 ;# inner-loop counter; reduced by 2 per unrolled pass, bit 0 selects the single-particle pass
673 .equiv i0110_fiz, 100 ;# z component of the accumulated i force (movd)
674 .equiv i0110_dx1, 104 ;# dx/dy of the first j particle are stored here with one movq
675 .equiv i0110_dy1, 108 ;# upper half of the 8-byte store at dx1
676 .equiv i0110_dz1, 112 ;# dz of the first j particle (movd)
677 .equiv i0110_dx2, 116 ;# dx/dy of the second j particle are stored here with one movq
678 .equiv i0110_dy2, 120 ;# upper half of the 8-byte store at dx2
679 .equiv i0110_dz2, 124 ;# dz of the second j particle (movd)
680 .equiv i0110_nsvdwc, 128 ;# counter decremented after each pass of the first (vdwc) i-atom loop
681 .equiv i0110_nscoul, 132 ;# added to solnr between the vdwc and vdw loops
682 .equiv i0110_nsvdw, 136 ;# counter decremented after each pass of the second (vdw) i-atom loop
683 .equiv i0110_solnr, 140 ;# current atom index: starts at ii, read and incremented per i atom; last slot, 140 + 4 = 144
692 sub esp
, 144 ;
# local stack space
695 movq mm1
, [mm_twelve
]
696 movq
[esp
+ i0110_six
], mm0
697 movq
[esp
+ i0110_twelve
], mm1
698 ;
# assume we have at least one i particle - start directly
700 mov eax
, [ebp
+ i0110_shift
] ;
# eax = pointer into shift[]
701 mov ebx
, [eax
] ;
# ebx=shift[n]
702 add dword ptr
[ebp
+ i0110_shift
], 4 ;
# advance pointer one step
704 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
705 mov
[esp
+ i0110_is3
],ebx ;
# store is3
707 mov eax
, [ebp
+ i0110_shiftvec
] ;
# eax = base of shiftvec[]
709 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
710 movd mm1
, [eax
+ ebx
*4 + 8]
711 movq
[esp
+ i0110_shX
], mm0
712 movd
[esp
+ i0110_shZ
], mm1
714 mov ecx
, [ebp
+ i0110_iinr
] ;
# ecx = pointer into iinr[]
715 add dword ptr
[ebp
+ i0110_iinr
], 4 ;
# advance pointer
716 mov ebx
, [ecx
] ;
# ebx=ii
718 mov eax
, [ebp
+ i0110_nsatoms
]
719 add dword ptr
[ebp
+ i0110_nsatoms
], 12
726 mov
[esp
+ i0110_nsvdwc
], edx
727 mov
[esp
+ i0110_nscoul
], eax
728 mov
[esp
+ i0110_nsvdw
], ecx
732 movq
[esp
+ i0110_vnbtot
], mm7
733 mov
[esp
+ i0110_solnr
], ebx
735 mov eax
, [ebp
+ i0110_jindex
]
736 mov ecx
, [eax
] ;
# jindex[n]
737 mov edx
, [eax
+ 4] ;
# jindex[n+1]
738 add dword ptr
[ebp
+ i0110_jindex
], 4
739 sub edx
, ecx ;
# number of innerloop atoms
740 mov eax
, [ebp
+ i0110_jjnr
]
743 mov
[esp
+ i0110_innerjjnr0
], eax ;
# pointer to jjnr[nj0]
745 mov
[esp
+ i0110_innerk0
], edx ;
# number of innerloop atoms
746 mov esi
, [ebp
+ i0110_pos
]
747 mov edi
, [ebp
+ i0110_faction
]
749 mov ecx
, [esp
+ i0110_nsvdwc
]
754 mov ebx
, [esp
+ i0110_solnr
]
755 inc dword ptr
[esp
+ i0110_solnr
]
757 mov edx
, [ebp
+ i0110_type
]
758 mov edx
, [edx
+ ebx
*4]
759 imul edx
, [ebp
+ i0110_ntype
]
761 mov
[esp
+ i0110_ntia
], edx
763 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
764 mov eax
, [ebp
+ i0110_pos
] ;
# eax = base of pos[]
765 mov
[esp
+ i0110_ii3
], ebx
767 movq mm0
, [eax
+ ebx
*4]
768 movd mm1
, [eax
+ ebx
*4 + 8]
769 pfadd mm0
, [esp
+ i0110_shX
]
770 pfadd mm1
, [esp
+ i0110_shZ
]
771 movq
[esp
+ i0110_ix
], mm0
772 movd
[esp
+ i0110_iz
], mm1
776 movq
[esp
+ i0110_fix
], mm7
777 movd
[esp
+ i0110_fiz
], mm7
779 mov ecx
, [esp
+ i0110_innerjjnr0
]
780 mov
[esp
+ i0110_innerjjnr
], ecx
781 mov edx
, [esp
+ i0110_innerk0
]
783 mov
[esp
+ i0110_innerk
], edx ;
# number of innerloop atoms
784 jge
.i0110_unroll_vdwc_loop
785 jmp
.i0110_finish_vdwc_inner
786 .i0110_unroll_vdwc_loop:
787 ;
# paired innerloop starts here
788 mov ecx
, [esp
+ i0110_innerjjnr
] ;
# pointer to jjnr[k]
790 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
791 add dword ptr
[esp
+ i0110_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
792 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
794 mov ecx
, [ebp
+ i0110_type
]
795 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
796 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
798 mov esi
, [ebp
+ i0110_nbfp
] ;
# base of nbfp
801 add edx
, [esp
+ i0110_ntia
] ;
# tja = ntia + 2*type
802 add ecx
, [esp
+ i0110_ntia
]
804 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
805 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
807 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
808 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
809 movq
[esp
+ i0110_c6
], mm5
810 movq
[esp
+ i0110_c12
], mm6
812 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
813 lea ebx
, [ebx
+ ebx
*2]
815 mov esi
, [ebp
+ i0110_pos
]
817 movq mm0
, [esp
+ i0110_ix
]
818 movd mm1
, [esp
+ i0110_iz
]
819 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
820 movd mm5
, [esi
+ eax
*4 + 8]
821 pfsubr mm4
,mm0 ;
# dr = ir - jr
823 movq
[esp
+ i0110_dx1
], mm4 ;
# store dr
824 movd
[esp
+ i0110_dz1
], mm5
825 pfmul mm4
,mm4 ;
# square dx,dy,dz
827 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
828 pfacc mm4
, mm5 ;
# first rsq in lower mm4
830 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
831 movd mm7
, [esi
+ ebx
*4 + 8]
833 pfsubr mm6
,mm0 ;
# dr = ir - jr
835 movq
[esp
+ i0110_dx2
], mm6 ;
# store dr
836 movd
[esp
+ i0110_dz2
], mm7
837 pfmul mm6
,mm6 ;
# square dx,dy,dz
839 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
840 pfacc mm6
, mm7 ;
# second rsq in lower mm6
842 pfrcp mm0
, mm4 ;
# lookup reciprocal seed
846 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs
847 ;
# amd 3dnow N-R iteration to get full precision
850 ;
# mm4 now contains invsq,
851 ;
# do potential and fscal
855 pfmul mm4
, mm0 ;
# mm4=rinvsix
857 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
859 pfmul mm5
, [esp
+ i0110_c12
]
860 pfmul mm4
, [esp
+ i0110_c6
]
861 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
864 pfmul mm4
, [esp
+ i0110_six
]
866 pfmul mm5
, [esp
+ i0110_twelve
]
868 pfmul mm0
, mm5 ;
# mm0 is total fscal now
870 prefetchw
[esp
+ i0110_dx1
] ;
# prefetch i forces to cache
872 ;
# spread fscalar to both positions
878 prefetchw
[edi
+ eax
*4] ;
# prefetch the 1st faction to cache
879 movq mm2
, [esp
+ i0110_dx1
] ;
# fetch dr
880 movd mm3
, [esp
+ i0110_dz1
]
883 pfadd mm6
, [esp
+ i0110_vnbtot
] ;
# add the earlier value
884 movq
[esp
+ i0110_vnbtot
], mm6 ;
# store the sum
886 prefetchw
[edi
+ ebx
*4] ;
# prefetch the 2nd faction to cache
887 pfmul mm2
, mm0 ;
# mult by fs
890 movq mm4
, [esp
+ i0110_dx2
] ;
# fetch dr
891 movd mm5
, [esp
+ i0110_dz2
]
892 pfmul mm4
, mm1 ;
# mult by fs
896 movq mm0
, [esp
+ i0110_fix
]
897 movd mm1
, [esp
+ i0110_fiz
]
903 movq
[esp
+ i0110_fix
], mm0
904 movd
[esp
+ i0110_fiz
], mm1
907 movq mm0
, [edi
+ eax
*4]
908 movd mm1
, [edi
+ eax
*4 + 8]
909 movq mm6
, [edi
+ ebx
*4]
910 movd mm7
, [edi
+ ebx
*4 + 8]
917 movq
[edi
+ eax
*4], mm0
918 movd
[edi
+ eax
*4 +8], mm1
919 movq
[edi
+ ebx
*4], mm6
920 movd
[edi
+ ebx
*4 + 8], mm7
922 ;
# should we do one more iteration?
923 sub dword ptr
[esp
+ i0110_innerk
], 2
924 jl
.i0110_finish_vdwc_inner
925 jmp
.i0110_unroll_vdwc_loop
926 .i0110_finish_vdwc_inner:
927 and dword ptr
[esp
+ i0110_innerk
], 1
928 jnz
.i0110_single_vdwc_inner
929 jmp
.i0110_updateouterdata_vdwc
930 .i0110_single_vdwc_inner:
931 ;
# a single j particle iteration here - compare with the unrolled code for comments
932 mov eax
, [esp
+ i0110_innerjjnr
]
933 mov eax
, [eax
] ;
# eax=jnr offset
935 mov esi
, [ebp
+ i0110_nbfp
]
936 mov ecx
, [ebp
+ i0110_type
]
937 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
939 add edx
, [esp
+ i0110_ntia
] ;
# tja = ntia + 2*type
940 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
941 movq
[esp
+ i0110_c6
], mm5
942 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
943 movq
[esp
+ i0110_c12
], mm5
945 mov esi
, [ebp
+ i0110_pos
]
946 lea eax
, [eax
+ eax
*2]
948 movq mm0
, [esp
+ i0110_ix
]
949 movd mm1
, [esp
+ i0110_iz
]
950 movq mm4
, [esi
+ eax
*4]
951 movd mm5
, [esi
+ eax
*4 + 8]
954 movq
[esp
+ i0110_dx1
], mm4
956 movd
[esp
+ i0110_dz1
], mm5
959 pfacc mm4
, mm5 ;
# mm4=rsq
963 pfrcpit2 mm4
,mm0 ;
# mm4=invsq
964 ;
# calculate potentials and scalar force
968 pfmul mm4
, mm0 ;
# mm4=rinvsix
970 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
972 pfmul mm5
, [esp
+ i0110_c12
]
973 pfmul mm4
, [esp
+ i0110_c6
]
974 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
977 pfmul mm4
, [esp
+ i0110_six
]
979 pfmul mm5
, [esp
+ i0110_twelve
]
981 pfmul mm0
, mm5 ;
# mm0 is total fscal now
984 pfadd mm6
, [esp
+ i0110_vnbtot
] ;
# add the earlier value
985 movq
[esp
+ i0110_vnbtot
], mm6 ;
# store the sum
987 ;
# spread fscalar to both positions
989 ;
# calc vectorial force
990 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
991 movq mm2
, [esp
+ i0110_dx1
]
992 movd mm3
, [esp
+ i0110_dz1
]
997 ;
# update i particle force
998 movq mm0
, [esp
+ i0110_fix
]
999 movd mm1
, [esp
+ i0110_fiz
]
1002 movq
[esp
+ i0110_fix
], mm0
1003 movd
[esp
+ i0110_fiz
], mm1
1004 ;
# update j particle force
1005 movq mm0
, [edi
+ eax
*4]
1006 movd mm1
, [edi
+ eax
*4+ 8]
1009 movq
[edi
+ eax
*4], mm0
1010 movd
[edi
+ eax
*4 +8], mm1
1012 .i0110_updateouterdata_vdwc:
1013 mov ecx
, [esp
+ i0110_ii3
]
1015 movq mm6
, [edi
+ ecx
*4] ;
# increment i force
1016 movd mm7
, [edi
+ ecx
*4 + 8]
1017 pfadd mm6
, [esp
+ i0110_fix
]
1018 pfadd mm7
, [esp
+ i0110_fiz
]
1019 movq
[edi
+ ecx
*4], mm6
1020 movd
[edi
+ ecx
*4 +8], mm7
1022 mov ebx
, [ebp
+ i0110_fshift
] ;
# increment fshift force
1023 mov edx
, [esp
+ i0110_is3
]
1025 movq mm6
, [ebx
+ edx
*4]
1026 movd mm7
, [ebx
+ edx
*4 + 8]
1027 pfadd mm6
, [esp
+ i0110_fix
]
1028 pfadd mm7
, [esp
+ i0110_fiz
]
1029 movq
[ebx
+ edx
*4], mm6
1030 movd
[ebx
+ edx
*4 + 8], mm7
1033 dec dword ptr
[esp
+ i0110_nsvdwc
]
1037 mov ebx
, [esp
+ i0110_nscoul
]
1038 add [esp
+ i0110_solnr
], ebx
1040 mov ecx
, [esp
+ i0110_nsvdw
]
1045 mov ebx
, [esp
+ i0110_solnr
]
1046 inc dword ptr
[esp
+ i0110_solnr
]
1048 mov edx
, [ebp
+ i0110_type
]
1049 mov edx
, [edx
+ ebx
*4]
1050 imul edx
, [ebp
+ i0110_ntype
]
1052 mov
[esp
+ i0110_ntia
], edx
1054 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
1055 mov eax
, [ebp
+ i0110_pos
] ;
# eax = base of pos[]
1056 mov
[esp
+ i0110_ii3
], ebx
1058 movq mm0
, [eax
+ ebx
*4]
1059 movd mm1
, [eax
+ ebx
*4 + 8]
1060 pfadd mm0
, [esp
+ i0110_shX
]
1061 pfadd mm1
, [esp
+ i0110_shZ
]
1062 movq
[esp
+ i0110_ix
], mm0
1063 movd
[esp
+ i0110_iz
], mm1
1067 movq
[esp
+ i0110_fix
], mm7
1068 movd
[esp
+ i0110_fiz
], mm7
1070 mov ecx
, [esp
+ i0110_innerjjnr0
]
1071 mov
[esp
+ i0110_innerjjnr
], ecx
1072 mov edx
, [esp
+ i0110_innerk0
]
1074 mov
[esp
+ i0110_innerk
], edx ;
# number of innerloop atoms
1075 jge
.i0110_unroll_vdw_loop
1076 jmp
.i0110_finish_vdw_inner
1077 .i0110_unroll_vdw_loop:
1078 ;
# paired innerloop starts here
1079 mov ecx
, [esp
+ i0110_innerjjnr
] ;
# pointer to jjnr[k]
1081 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
1082 add dword ptr
[esp
+ i0110_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
1083 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
1085 mov ecx
, [ebp
+ i0110_type
]
1086 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
1087 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
1089 mov esi
, [ebp
+ i0110_nbfp
] ;
# base of nbfp
1092 add edx
, [esp
+ i0110_ntia
] ;
# tja = ntia + 2*type
1093 add ecx
, [esp
+ i0110_ntia
]
1095 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
1096 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
1098 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
1099 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
1100 movq
[esp
+ i0110_c6
], mm5
1101 movq
[esp
+ i0110_c12
], mm6
1103 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
1104 lea ebx
, [ebx
+ ebx
*2]
1106 mov esi
, [ebp
+ i0110_pos
]
1108 movq mm0
, [esp
+ i0110_ix
]
1109 movd mm1
, [esp
+ i0110_iz
]
1110 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
1111 movd mm5
, [esi
+ eax
*4 + 8]
1112 pfsubr mm4
,mm0 ;
# dr = ir - jr
1114 movq
[esp
+ i0110_dx1
], mm4 ;
# store dr
1115 movd
[esp
+ i0110_dz1
], mm5
1116 pfmul mm4
,mm4 ;
# square dx,dy,dz
1118 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
1119 pfacc mm4
, mm5 ;
# first rsq in lower mm4
1121 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
1122 movd mm7
, [esi
+ ebx
*4 + 8]
1124 pfsubr mm6
,mm0 ;
# dr = ir - jr
1126 movq
[esp
+ i0110_dx2
], mm6 ;
# store dr
1127 movd
[esp
+ i0110_dz2
], mm7
1128 pfmul mm6
,mm6 ;
# square dx,dy,dz
1130 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
1131 pfacc mm6
, mm7 ;
# second rsq in lower mm6
1133 pfrcp mm0
, mm4 ;
# lookup reciprocal seed
1137 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs
1138 ;
# amd 3dnow N-R iteration to get full precision
1141 ;
# mm4 now contains invsq,
1142 ;
# do potential and fscal
1146 pfmul mm4
, mm0 ;
# mm4=rinvsix
1148 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
1150 pfmul mm5
, [esp
+ i0110_c12
]
1151 pfmul mm4
, [esp
+ i0110_c6
]
1152 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
1155 pfmul mm4
, [esp
+ i0110_six
]
1157 pfmul mm5
, [esp
+ i0110_twelve
]
1159 pfmul mm0
, mm5 ;
# mm0 is total fscal now
1161 prefetchw
[esp
+ i0110_dx1
] ;
# prefetch i forces to cache
1163 ;
# spread fscalar to both positions
1168 ;
# calc vector force
1169 prefetchw
[edi
+ eax
*4] ;
# prefetch the 1st faction to cache
1170 movq mm2
, [esp
+ i0110_dx1
] ;
# fetch dr
1171 movd mm3
, [esp
+ i0110_dz1
]
1174 pfadd mm6
, [esp
+ i0110_vnbtot
] ;
# add the earlier value
1175 movq
[esp
+ i0110_vnbtot
], mm6 ;
# store the sum
1177 prefetchw
[edi
+ ebx
*4] ;
# prefetch the 2nd faction to cache
1178 pfmul mm2
, mm0 ;
# mult by fs
1181 movq mm4
, [esp
+ i0110_dx2
] ;
# fetch dr
1182 movd mm5
, [esp
+ i0110_dz2
]
1183 pfmul mm4
, mm1 ;
# mult by fs
1187 movq mm0
, [esp
+ i0110_fix
]
1188 movd mm1
, [esp
+ i0110_fiz
]
1194 movq
[esp
+ i0110_fix
], mm0
1195 movd
[esp
+ i0110_fiz
], mm1
1198 movq mm0
, [edi
+ eax
*4]
1199 movd mm1
, [edi
+ eax
*4 + 8]
1200 movq mm6
, [edi
+ ebx
*4]
1201 movd mm7
, [edi
+ ebx
*4 + 8]
1208 movq
[edi
+ eax
*4], mm0
1209 movd
[edi
+ eax
*4 +8], mm1
1210 movq
[edi
+ ebx
*4], mm6
1211 movd
[edi
+ ebx
*4 + 8], mm7
1212 ;
# should we do one more iteration?
1213 sub dword ptr
[esp
+ i0110_innerk
], 2
1214 jl
.i0110_finish_vdw_inner
1215 jmp
.i0110_unroll_vdw_loop
1216 .i0110_finish_vdw_inner:
1217 and dword ptr
[esp
+ i0110_innerk
], 1
1218 jnz
.i0110_single_vdw_inner
1219 jmp
.i0110_updateouterdata_vdw
1220 .i0110_single_vdw_inner:
1221 ;
# a single j particle iteration here - compare with the unrolled code for comments
1222 mov eax
, [esp
+ i0110_innerjjnr
]
1223 mov eax
, [eax
] ;
# eax=jnr offset
1225 mov esi
, [ebp
+ i0110_nbfp
]
1226 mov ecx
, [ebp
+ i0110_type
]
1227 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
1229 add edx
, [esp
+ i0110_ntia
] ;
# tja = ntia + 2*type
1230 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
1231 movq
[esp
+ i0110_c6
], mm5
1232 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
1233 movq
[esp
+ i0110_c12
], mm5
1235 mov esi
, [ebp
+ i0110_pos
]
1236 lea eax
, [eax
+ eax
*2]
1238 movq mm0
, [esp
+ i0110_ix
]
1239 movd mm1
, [esp
+ i0110_iz
]
1240 movq mm4
, [esi
+ eax
*4]
1241 movd mm5
, [esi
+ eax
*4 + 8]
1244 movq
[esp
+ i0110_dx1
], mm4
1246 movd
[esp
+ i0110_dz1
], mm5
1249 pfacc mm4
, mm5 ;
# mm4=rsq
1253 pfrcpit2 mm4
,mm0 ;
# mm4=invsq
1254 ;
# calculate potentials and scalar force
1258 pfmul mm4
, mm0 ;
# mm4=rinvsix
1260 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
1262 pfmul mm5
, [esp
+ i0110_c12
]
1263 pfmul mm4
, [esp
+ i0110_c6
]
1264 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
1267 pfmul mm4
, [esp
+ i0110_six
]
1269 pfmul mm5
, [esp
+ i0110_twelve
]
1271 pfmul mm0
, mm5 ;
# mm0 is total fscal now
1274 pfadd mm6
, [esp
+ i0110_vnbtot
] ;
# add the earlier value
1275 movq
[esp
+ i0110_vnbtot
], mm6 ;
# store the sum
1277 ;
# spread fscalar to both positions
1279 ;
# calc vectorial force
1280 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
1281 movq mm2
, [esp
+ i0110_dx1
]
1282 movd mm3
, [esp
+ i0110_dz1
]
1287 ;
# update i particle force
1288 movq mm0
, [esp
+ i0110_fix
]
1289 movd mm1
, [esp
+ i0110_fiz
]
1292 movq
[esp
+ i0110_fix
], mm0
1293 movd
[esp
+ i0110_fiz
], mm1
1294 ;
# update j particle force
1295 movq mm0
, [edi
+ eax
*4]
1296 movd mm1
, [edi
+ eax
*4+ 8]
1299 movq
[edi
+ eax
*4], mm0
1300 movd
[edi
+ eax
*4 +8], mm1
1302 .i0110_updateouterdata_vdw:
1303 mov ecx
, [esp
+ i0110_ii3
]
1305 movq mm6
, [edi
+ ecx
*4] ;
# increment i force
1306 movd mm7
, [edi
+ ecx
*4 + 8]
1307 pfadd mm6
, [esp
+ i0110_fix
]
1308 pfadd mm7
, [esp
+ i0110_fiz
]
1309 movq
[edi
+ ecx
*4], mm6
1310 movd
[edi
+ ecx
*4 +8], mm7
1312 mov ebx
, [ebp
+ i0110_fshift
] ;
# increment fshift force
1313 mov edx
, [esp
+ i0110_is3
]
1315 movq mm6
, [ebx
+ edx
*4]
1316 movd mm7
, [ebx
+ edx
*4 + 8]
1317 pfadd mm6
, [esp
+ i0110_fix
]
1318 pfadd mm7
, [esp
+ i0110_fiz
]
1319 movq
[ebx
+ edx
*4], mm6
1320 movd
[ebx
+ edx
*4 + 8], mm7
1323 dec dword ptr
[esp
+ i0110_nsvdw
]
1328 mov edx
, [ebp
+ i0110_gid
] ;
# get group index for this i particle
1330 add dword ptr
[ebp
+ i0110_gid
], 4 ;
# advance pointer
1332 movq mm7
, [esp
+ i0110_vnbtot
]
1333 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
1335 mov eax
, [ebp
+ i0110_Vnb
]
1336 movd mm6
, [eax
+ edx
*4]
1338 movd
[eax
+ edx
*4], mm6 ;
# increment vnb[gid]
1340 mov ecx
, [ebp
+ i0110_nri
]
1343 ;
# not last, iterate once more!
1344 mov
[ebp
+ i0110_nri
], ecx
;# ---------------------------------------------------------------------------
;# inl0300_3dnow: symbol exports and frame-offset tables.
;#
;# The routine is exported under both the plain and the underscore-prefixed
;# name. The first table lists displacements added to ebp to reach the
;# incoming arguments, e.g. [ebp + i0300_tabscale]. The second table lists
;# displacements added to esp to reach local scratch storage; the routine
;# reserves that area with "sub esp, 116".
;#
;# The routine also uses the esp-relative names i0300_is3, i0300_ii3,
;# i0300_ix, i0300_iz, i0300_c6 and i0300_n1; their definitions are not part
;# of this excerpt.
;# ---------------------------------------------------------------------------
1360 .globl inl0300_3dnow
1361 .globl _inl0300_3dnow
;# ebp-relative offsets of the arguments
1365 .equiv i0300_iinr, 12 ;# pointer into iinr[]; read for ii, then advanced by 4
1366 .equiv i0300_jindex, 16 ;# pointer into jindex[]; entries n and n+1 are read, then advanced by 4
1367 .equiv i0300_jjnr, 20 ;# jjnr[] pointer, loaded before innerjjnr is set up
1368 .equiv i0300_shift, 24 ;# pointer into shift[]; read, then advanced by 4
1369 .equiv i0300_shiftvec, 28 ;# base of shiftvec[], indexed with 3*is floats
1370 .equiv i0300_fshift, 32
1371 .equiv i0300_gid, 36
1372 .equiv i0300_pos, 40 ;# base of pos[], indexed with 3*index floats
1373 .equiv i0300_faction, 44 ;# base of faction[], kept in edi
1374 .equiv i0300_type, 48 ;# base of type[], indexed by particle number
1375 .equiv i0300_ntype, 52 ;# multiplied with type[ii] to form ntia
1376 .equiv i0300_nbfp, 56 ;# base of nbfp[], source of the c6/c12 pairs
1377 .equiv i0300_Vnb, 60
1378 .equiv i0300_tabscale, 64 ;# 4-byte value read with movd and kept in the tsc local
1379 .equiv i0300_VFtab, 68 ;# table pointer, loaded into edx before the table lookups
1380 ;
# stack offsets for local variables
1386 .equiv i0300_vnbtot, 20 ;# 8-byte slot (movq), initialised from mm7 before the inner loop
1388 .equiv i0300_c12, 36 ;# packed c12 of the two j particles (movq)
1389 .equiv i0300_two, 44 ;# 8-byte constant stored from mm0 at entry
1391 .equiv i0300_tsc, 60 ;# 8-byte table scale, multiplied into mm1 to form rt
1392 .equiv i0300_ntia, 68 ;# type[ii] * ntype for the current i particle
1393 .equiv i0300_innerjjnr, 72 ;# pointer to the current jjnr entry; +8 per unrolled iteration
1394 .equiv i0300_innerk, 76 ;# inner-loop counter
1395 .equiv i0300_fix, 80 ;# x/y of the accumulated i force are stored here with one movq
1396 .equiv i0300_fiy, 84 ;# upper half of the 8-byte store at fix
1397 .equiv i0300_fiz, 88 ;# z component of the accumulated i force (movd)
1398 .equiv i0300_dx1, 92 ;# dx/dy of the first j particle are stored here with one movq
1399 .equiv i0300_dy1, 96 ;# upper half of the 8-byte store at dx1
1400 .equiv i0300_dz1, 100 ;# dz of the first j particle (movd)
1401 .equiv i0300_dx2, 104 ;# dx/dy of the second j particle are stored here with one movq
1402 .equiv i0300_dy2, 108 ;# upper half of the 8-byte store at dx2
1403 .equiv i0300_dz2, 112 ;# dz of the second j particle (movd); last slot, 112 + 4 = 116 bytes of locals
1412 sub esp
, 116 ;
# local stack space
1414 ;
# move data to local stack
1416 movd mm3
, [ebp
+ i0300_tabscale
]
1417 movq
[esp
+ i0300_two
], mm0
1419 movq
[esp
+ i0300_tsc
], mm3
1420 ;
# assume we have at least one i particle - start directly
1422 mov eax
, [ebp
+ i0300_shift
] ;
# eax = pointer into shift[]
1423 mov ebx
, [eax
] ;
# ebx=shift[n]
1424 add dword ptr
[ebp
+ i0300_shift
], 4 ;
# advance pointer one step
1426 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
1427 mov
[esp
+ i0300_is3
],ebx ;
# store is3
1429 mov eax
, [ebp
+ i0300_shiftvec
] ;
# eax = base of shiftvec[]
1431 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
1432 movd mm1
, [eax
+ ebx
*4 + 8]
1434 mov ecx
, [ebp
+ i0300_iinr
] ;
# ecx = pointer into iinr[]
1435 add dword ptr
[ebp
+ i0300_iinr
], 4 ;
# advance pointer
1436 mov ebx
, [ecx
] ;
# ebx=ii
1438 mov edx
, [ebp
+ i0300_type
]
1439 mov edx
, [edx
+ ebx
*4]
1440 imul edx
, [ebp
+ i0300_ntype
]
1442 mov
[esp
+ i0300_ntia
], edx
1444 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
1445 mov eax
, [ebp
+ i0300_pos
] ;
# eax = base of pos[]
1447 pfadd mm0
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
1448 movd mm3
, [eax
+ ebx
*4 + 8] ;
# can't use a direct memory add for 4 bytes (iz)
1449 mov
[esp
+ i0300_ii3
], ebx
1451 movq
[esp
+ i0300_ix
], mm0
1452 movd
[esp
+ i0300_iz
], mm1
1454 ;
# clear total potential and i forces
1456 movq
[esp
+ i0300_vnbtot
], mm7
1457 movq
[esp
+ i0300_fix
], mm7
1458 movd
[esp
+ i0300_fiz
], mm7
1460 mov eax
, [ebp
+ i0300_jindex
]
1461 mov ecx
, [eax
] ;
# jindex[n]
1462 mov edx
, [eax
+ 4] ;
# jindex[n+1]
1463 add dword ptr
[ebp
+ i0300_jindex
], 4
1464 sub edx
, ecx ;
# number of innerloop atoms
1466 mov esi
, [ebp
+ i0300_pos
]
1467 mov edi
, [ebp
+ i0300_faction
]
1468 mov eax
, [ebp
+ i0300_jjnr
]
1471 mov
[esp
+ i0300_innerjjnr
], eax ;
# pointer to jjnr[nj0]
1473 mov
[esp
+ i0300_innerk
], edx ;
# number of innerloop atoms
1474 jge
.i0300_unroll_loop
1475 jmp
.i0300_finish_inner
1477 ;
# paired innerloop starts here
1478 mov ecx
, [esp
+ i0300_innerjjnr
] ;
# pointer to jjnr[k]
1480 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
1481 add dword ptr
[esp
+ i0300_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
1482 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
1484 mov ecx
, [ebp
+ i0300_type
]
1485 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
1486 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
1488 mov esi
, [ebp
+ i0300_nbfp
] ;
# base of nbfp
1491 add edx
, [esp
+ i0300_ntia
] ;
# tja = ntia + 2*type
1492 add ecx
, [esp
+ i0300_ntia
]
1494 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
1495 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
1497 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
1498 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
1499 movq
[esp
+ i0300_c6
], mm5
1500 movq
[esp
+ i0300_c12
], mm6
1502 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
1503 lea ebx
, [ebx
+ ebx
*2]
1505 mov esi
, [ebp
+ i0300_pos
]
1507 movq mm0
, [esp
+ i0300_ix
]
1508 movd mm1
, [esp
+ i0300_iz
]
1509 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
1510 movd mm5
, [esi
+ eax
*4 + 8]
1511 pfsubr mm4
,mm0 ;
# dr = ir - jr
1513 movq
[esp
+ i0300_dx1
], mm4 ;
# store dr
1514 movd
[esp
+ i0300_dz1
], mm5
1515 pfmul mm4
,mm4 ;
# square dx,dy,dz
1517 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
1518 pfacc mm4
, mm5 ;
# first rsq in lower mm4
1520 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
1521 movd mm7
, [esi
+ ebx
*4 + 8]
1523 pfsubr mm6
,mm0 ;
# dr = ir - jr
1525 movq
[esp
+ i0300_dx2
], mm6 ;
# store dr
1526 movd
[esp
+ i0300_dz2
], mm7
1527 pfmul mm6
,mm6 ;
# square dx,dy,dz
1529 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
1530 pfacc mm6
, mm7 ;
# second rsq in lower mm6
1532 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
1537 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs
1538 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision
1544 ;
# mm0 is invsqrt, and mm1 r
1545 ;
# do potential and fscal
1546 pfmul mm1
, [esp
+ i0300_tsc
] ;
# mm1=rt
1548 movq
[esp
+ i0300_n1
], mm4
1550 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
1553 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
1555 mov edx
, [ebp
+ i0300_VFtab
]
1557 mov ecx
, [esp
+ i0300_n1
]
1559 ;
# load all the table values we need
1560 movd mm4
, [edx
+ ecx
*4]
1561 movd mm5
, [edx
+ ecx
*4 + 4]
1562 movd mm6
, [edx
+ ecx
*4 + 8]
1563 movd mm7
, [edx
+ ecx
*4 + 12]
1564 mov ecx
, [esp
+ i0300_n1
+ 4]
1566 punpckldq mm4
, [edx
+ ecx
*4]
1567 punpckldq mm5
, [edx
+ ecx
*4 + 4]
1568 punpckldq mm6
, [edx
+ ecx
*4 + 8]
1569 punpckldq mm7
, [edx
+ ecx
*4 + 12]
1570 pfmul mm6
, mm1 ;
# mm6 = Geps
1571 pfmul mm7
, mm2 ;
# mm7 = Heps2
1573 pfadd mm5
, mm7 ;
# mm5 = Fp
1574 pfmul mm7
, [esp
+ i0300_two
] ;
# two*Heps2
1576 pfadd mm7
, mm5 ;
# mm7=FF
1577 pfmul mm5
, mm1 ;
# mm5=eps*Fp
1578 pfadd mm5
, mm4 ;
# mm5= VV
1580 movq mm4
, [esp
+ i0300_c6
]
1581 pfmul mm7
, mm4 ;
# fijD
1582 pfmul mm5
, mm4 ;
# vnb6
1583 movq mm3
, mm7 ;
# add to fscal
1585 ;
# update vnbtot to release mm5!
1586 pfadd mm5
, [esp
+ i0300_vnbtot
] ;
# add the earlier value
1587 movq
[esp
+ i0300_vnbtot
], mm5 ;
# store the sum
1590 mov ecx
, [esp
+ i0300_n1
]
1592 ;
# load all the table values we need
1593 movd mm4
, [edx
+ ecx
*4 + 16]
1594 movd mm5
, [edx
+ ecx
*4 + 20]
1595 movd mm6
, [edx
+ ecx
*4 + 24]
1596 movd mm7
, [edx
+ ecx
*4 + 28]
1597 mov ecx
, [esp
+ i0300_n1
+ 4]
1599 punpckldq mm4
, [edx
+ ecx
*4 + 16]
1600 punpckldq mm5
, [edx
+ ecx
*4 + 20]
1601 punpckldq mm6
, [edx
+ ecx
*4 + 24]
1602 punpckldq mm7
, [edx
+ ecx
*4 + 28]
1604 pfmul mm6
, mm1 ;
# mm6 = Geps
1605 pfmul mm7
, mm2 ;
# mm7 = Heps2
1607 pfadd mm5
, mm7 ;
# mm5 = Fp
1608 pfmul mm7
, [esp
+ i0300_two
] ;
# two*Heps2
1610 pfadd mm7
, mm5 ;
# mm7=FF
1611 pfmul mm5
, mm1 ;
# mm5=eps*Fp
1612 pfadd mm5
, mm4 ;
# mm5= VV
1614 movq mm6
, [esp
+ i0300_c12
]
1615 pfmul mm7
, mm6 ;
# fijR
1616 pfmul mm5
, mm6 ;
# vnb12
1617 pfadd mm3
, mm7 ;
# total fscal fijD+ fijR
1619 ;
# change sign of mm3
1622 pfmul mm1
, [esp
+ i0300_tsc
]
1623 pfmul mm0
, mm1 ;
# mm0 is total fscal now
1625 prefetchw
[esp
+ i0300_dx1
] ;
# prefetch i forces to cache
1627 ;
# spread fscalar to both positions
1632 ;
# calc vector force
1633 prefetchw
[edi
+ eax
*4] ;
# prefetch the 1st faction to cache
1634 movq mm2
, [esp
+ i0300_dx1
] ;
# fetch dr
1635 movd mm3
, [esp
+ i0300_dz1
]
1638 pfadd mm5
, [esp
+ i0300_vnbtot
] ;
# add the earlier value
1639 movq
[esp
+ i0300_vnbtot
], mm5 ;
# store the sum
1641 prefetchw
[edi
+ ebx
*4] ;
# prefetch the 2nd faction to cache
1642 pfmul mm2
, mm0 ;
# mult by fs
1645 movq mm4
, [esp
+ i0300_dx2
] ;
# fetch dr
1646 movd mm5
, [esp
+ i0300_dz2
]
1647 pfmul mm4
, mm1 ;
# mult by fs
1651 movq mm0
, [esp
+ i0300_fix
]
1652 movd mm1
, [esp
+ i0300_fiz
]
1658 movq
[esp
+ i0300_fix
], mm0
1659 movd
[esp
+ i0300_fiz
], mm1
1662 movq mm0
, [edi
+ eax
*4]
1663 movd mm1
, [edi
+ eax
*4 + 8]
1664 movq mm6
, [edi
+ ebx
*4]
1665 movd mm7
, [edi
+ ebx
*4 + 8]
1672 movq
[edi
+ eax
*4], mm0
1673 movd
[edi
+ eax
*4 +8], mm1
1674 movq
[edi
+ ebx
*4], mm6
1675 movd
[edi
+ ebx
*4 + 8], mm7
1677 ;
# should we do one more iteration?
1678 sub dword ptr
[esp
+ i0300_innerk
], 2
1679 jl
.i0300_finish_inner
1680 jmp
.i0300_unroll_loop
1681 .i0300_finish_inner:
1682 and dword ptr
[esp
+ i0300_innerk
], 1
1683 jnz
.i0300_single_inner
1684 jmp
.i0300_updateouterdata
1685 .i0300_single_inner:
1686 ;
# a single j particle iteration here - compare with the unrolled code for comments
1687 mov eax
, [esp
+ i0300_innerjjnr
]
1688 mov eax
, [eax
] ;
# eax=jnr offset
1690 mov esi
, [ebp
+ i0300_nbfp
]
1691 mov ecx
, [ebp
+ i0300_type
]
1692 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
1694 add edx
, [esp
+ i0300_ntia
] ;
# tja = ntia + 2*type
1695 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
1696 movq
[esp
+ i0300_c6
], mm5
1697 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
1698 movq
[esp
+ i0300_c12
], mm5
1700 mov esi
, [ebp
+ i0300_pos
]
1701 lea eax
, [eax
+ eax
*2]
1703 movq mm0
, [esp
+ i0300_ix
]
1704 movd mm1
, [esp
+ i0300_iz
]
1705 movq mm4
, [esi
+ eax
*4]
1706 movd mm5
, [esi
+ eax
*4 + 8]
1709 movq
[esp
+ i0300_dx1
], mm4
1711 movd
[esp
+ i0300_dz1
], mm5
1714 pfacc mm4
, mm5 ;
# mm0=rsq
1720 pfrcpit2 mm0
,mm2 ;
# mm1=invsqrt
1723 ;
# mm0 is invsqrt, and mm1 r
1725 ;
# calculate potentials and scalar force
1726 pfmul mm1
, [esp
+ i0300_tsc
] ;
# mm1=rt
1728 movd
[esp
+ i0300_n1
], mm4
1730 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
1733 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
1735 mov edx
, [ebp
+ i0300_VFtab
]
1736 mov ecx
, [esp
+ i0300_n1
]
1739 ;
# load all the table values we need
1740 movd mm4
, [edx
+ ecx
*4]
1741 movd mm5
, [edx
+ ecx
*4 + 4]
1742 movd mm6
, [edx
+ ecx
*4 + 8]
1743 movd mm7
, [edx
+ ecx
*4 + 12]
1744 pfmul mm6
, mm1 ;
# mm6 = Geps
1745 pfmul mm7
, mm2 ;
# mm7 = Heps2
1747 pfadd mm5
, mm7 ;
# mm5 = Fp
1748 pfmul mm7
, [esp
+ i0300_two
] ;
# two*Heps2
1750 pfadd mm7
, mm5 ;
# mm7=FF
1751 pfmul mm5
, mm1 ;
# mm5=eps*Fp
1752 pfadd mm5
, mm4 ;
# mm5= VV
1754 movq mm4
, [esp
+ i0300_c6
]
1755 pfmul mm7
, mm4 ;
# fijD
1756 pfmul mm5
, mm4 ;
# vnb6
1757 movq mm3
, mm7 ;
# add to fscal
1759 ;
# update vnbtot to release mm5!
1760 pfadd mm5
, [esp
+ i0300_vnbtot
] ;
# add the earlier value
1761 movq
[esp
+ i0300_vnbtot
], mm5 ;
# store the sum
1764 ;
# load all the table values we need
1765 movd mm4
, [edx
+ ecx
*4 + 16]
1766 movd mm5
, [edx
+ ecx
*4 + 20]
1767 movd mm6
, [edx
+ ecx
*4 + 24]
1768 movd mm7
, [edx
+ ecx
*4 + 28]
1770 pfmul mm6
, mm1 ;
# mm6 = Geps
1771 pfmul mm7
, mm2 ;
# mm7 = Heps2
1773 pfadd mm5
, mm7 ;
# mm5 = Fp
1774 pfmul mm7
, [esp
+ i0300_two
] ;
# two*Heps2
1776 pfadd mm7
, mm5 ;
# mm7=FF
1777 pfmul mm5
, mm1 ;
# mm5=eps*Fp
1778 pfadd mm5
, mm4 ;
# mm5= VV
1780 movq mm6
, [esp
+ i0300_c12
]
1781 pfmul mm7
, mm6 ;
# fijR
1782 pfmul mm5
, mm6 ;
# vnb12
1783 pfadd mm3
, mm7 ;
# total fscal fijC+ fijD+ fijR
1785 ;
# change sign of mm3
1788 pfmul mm0
, [esp
+ i0300_tsc
]
1789 pfmul mm0
, mm1 ;
# mm0 is total fscal now
1792 pfadd mm5
, [esp
+ i0300_vnbtot
] ;
# add the earlier value
1793 movq
[esp
+ i0300_vnbtot
], mm5 ;
# store the sum
1795 ;
# spread fscalar to both positions
1797 ;
# calc vectorial force
1798 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
1799 movq mm2
, [esp
+ i0300_dx1
]
1800 movd mm3
, [esp
+ i0300_dz1
]
1805 ;
# update i particle force
1806 movq mm0
, [esp
+ i0300_fix
]
1807 movd mm1
, [esp
+ i0300_fiz
]
1810 movq
[esp
+ i0300_fix
], mm0
1811 movd
[esp
+ i0300_fiz
], mm1
1812 ;
# update j particle force
1813 movq mm0
, [edi
+ eax
*4]
1814 movd mm1
, [edi
+ eax
*4+ 8]
1817 movq
[edi
+ eax
*4], mm0
1818 movd
[edi
+ eax
*4 +8], mm1
1820 .i0300_updateouterdata:
1821 mov ecx
, [esp
+ i0300_ii3
]
1823 movq mm6
, [edi
+ ecx
*4] ;
# increment i force
1824 movd mm7
, [edi
+ ecx
*4 + 8]
1825 pfadd mm6
, [esp
+ i0300_fix
]
1826 pfadd mm7
, [esp
+ i0300_fiz
]
1827 movq
[edi
+ ecx
*4], mm6
1828 movd
[edi
+ ecx
*4 +8], mm7
1830 mov ebx
, [ebp
+ i0300_fshift
] ;
# increment fshift force
1831 mov edx
, [esp
+ i0300_is3
]
1833 movq mm6
, [ebx
+ edx
*4]
1834 movd mm7
, [ebx
+ edx
*4 + 8]
1835 pfadd mm6
, [esp
+ i0300_fix
]
1836 pfadd mm7
, [esp
+ i0300_fiz
]
1837 movq
[ebx
+ edx
*4], mm6
1838 movd
[ebx
+ edx
*4 + 8], mm7
1840 mov edx
, [ebp
+ i0300_gid
] ;
# get group index for this i particle
1842 add dword ptr
[ebp
+ i0300_gid
], 4 ;
# advance pointer
1844 movq mm7
, [esp
+ i0300_vnbtot
]
1845 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
1847 mov eax
, [ebp
+ i0300_Vnb
]
1848 movd mm6
, [eax
+ edx
*4]
1850 movd
[eax
+ edx
*4], mm6 ;
# increment vnb[gid]
1853 mov ecx
, [ebp
+ i0300_nri
]
1856 ;
# not last, iterate once more!
1857 mov
[ebp
+ i0300_nri
], ecx
1874 .globl inl0310_3dnow
1875 .globl _inl0310_3dnow
;# inl0310_3dnow is exported both with and without a leading underscore.
;# The .equiv names below are byte offsets. The first group is used as
;# [ebp + name] (4-byte slots, 12..72) for the routine's arguments.
;# NOTE(review): i0310_nri is also read through [ebp + ...] further down,
;# but its .equiv line is not visible in this excerpt.
1879 .equiv i0310_iinr, 12
1880 .equiv i0310_jindex, 16
1881 .equiv i0310_jjnr, 20
1882 .equiv i0310_shift, 24
1883 .equiv i0310_shiftvec, 28
1884 .equiv i0310_fshift, 32
1885 .equiv i0310_gid, 36
1886 .equiv i0310_pos, 40
1887 .equiv i0310_faction, 44
1888 .equiv i0310_type, 48
1889 .equiv i0310_ntype, 52
1890 .equiv i0310_nbfp, 56
1891 .equiv i0310_Vnb, 60
1892 .equiv i0310_tabscale, 64
1893 .equiv i0310_VFtab, 68
1894 .equiv i0310_nsatoms, 72
1895 ;
# stack offsets for local variables
;# The second group is used as [esp + name]. Slots accessed with movq
;# (vnbtot, c12, two, tsc) are 8 bytes apart from their neighbours; the
;# x/y members of each vector (fix/fiy, dx1/dy1, dx2/dy2) are adjacent so
;# that one movq covers both, while the z member is accessed with movd.
;# NOTE(review): i0310_is3, i0310_ii3, i0310_shX, i0310_ix, i0310_iz,
;# i0310_c6 and i0310_n1 are used by the code but their .equiv lines are
;# not visible in this excerpt.
1899 .equiv i0310_shY, 12
1900 .equiv i0310_shZ, 16
1904 .equiv i0310_vnbtot, 32
1906 .equiv i0310_c12, 48
1907 .equiv i0310_two, 56
1909 .equiv i0310_tsc, 72
1910 .equiv i0310_ntia, 80
1911 .equiv i0310_innerjjnr0, 84
1912 .equiv i0310_innerk0, 88
1913 .equiv i0310_innerjjnr, 92
1914 .equiv i0310_innerk, 96
1915 .equiv i0310_fix, 100
1916 .equiv i0310_fiy, 104
1917 .equiv i0310_fiz, 108
1918 .equiv i0310_dx1, 112
1919 .equiv i0310_dy1, 116
1920 .equiv i0310_dz1, 120
1921 .equiv i0310_dx2, 124
1922 .equiv i0310_dy2, 128
1923 .equiv i0310_dz2, 132
1924 .equiv i0310_nsvdwc, 136
1925 .equiv i0310_nscoul, 140
1926 .equiv i0310_nsvdw, 144
1927 .equiv i0310_solnr, 148
1936 sub esp
, 152 ;
# local stack space
;# 152 bytes covers every [esp + i0310_*] slot; the highest one used here,
;# i0310_solnr, is a dword at offset 148.
;# The tabscale argument goes to the tsc slot and mm0 to the "two" slot;
;# both slots are later used as pfmul memory operands.
;# NOTE(review): movd fills only the low dword of mm3, and mm0 is stored
;# without a visible load - the instructions that prepare both halves of
;# these registers are not visible in this excerpt.
1939 movd mm3
, [ebp
+ i0310_tabscale
]
1940 movq
[esp
+ i0310_two
], mm0
1942 movq
[esp
+ i0310_tsc
], mm3
1944 ;
# assume we have at least one i particle - start directly
;# Fetch the next shift index, advance the shift pointer argument in
;# place, and turn the index into a float offset (index*3, scaled by 4
;# in the addressing mode) into shiftvec[].
1946 mov eax
, [ebp
+ i0310_shift
] ;
# eax = pointer into shift[]
1947 mov ebx
, [eax
] ;
# ebx=shift[n]
1948 add dword ptr
[ebp
+ i0310_shift
], 4 ;
# advance pointer one step
1950 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
1951 mov
[esp
+ i0310_is3
],ebx ;
# store is3
1953 mov eax
, [ebp
+ i0310_shiftvec
] ;
# eax = base of shiftvec[]
1955 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
1956 movd mm1
, [eax
+ ebx
*4 + 8]
;# Keep the shift vector on the stack; it is re-added to every i atom
;# position below.
1957 movq
[esp
+ i0310_shX
], mm0
1958 movd
[esp
+ i0310_shZ
], mm1
1960 mov ecx
, [ebp
+ i0310_iinr
] ;
# ecx = pointer into iinr[]
1961 add dword ptr
[ebp
+ i0310_iinr
], 4 ;
# advance pointer
1962 mov ebx
, [ecx
] ;
# ebx=ii
;# eax = current nsatoms pointer; the argument itself is advanced by
;# 12 bytes (three dwords) per outer iteration.
1964 mov eax
, [ebp
+ i0310_nsatoms
]
1965 add dword ptr
[ebp
+ i0310_nsatoms
], 12
;# NOTE(review): edx/eax/ecx are stored as the three per-molecule atom
;# counts, but the loads that fill them from the nsatoms entry are not
;# visible in this excerpt.
1972 mov
[esp
+ i0310_nsvdwc
], edx
1973 mov
[esp
+ i0310_nscoul
], eax
1974 mov
[esp
+ i0310_nsvdw
], ecx
;# NOTE(review): mm7 initialises the potential accumulator here and the
;# force accumulators below; presumably it was zeroed by an instruction
;# that is not visible in this excerpt - confirm.
1978 movq
[esp
+ i0310_vnbtot
], mm7
;# solnr starts at ii and is bumped once per processed i atom.
1979 mov
[esp
+ i0310_solnr
], ebx
;# Neighbour-list bounds for this outer iteration; the jindex pointer
;# argument is advanced in place.
1981 mov eax
, [ebp
+ i0310_jindex
]
1982 mov ecx
, [eax
] ;
# jindex[n]
1983 mov edx
, [eax
+ 4] ;
# jindex[n+1]
1984 add dword ptr
[ebp
+ i0310_jindex
], 4
1985 sub edx
, ecx ;
# number of innerloop atoms
1986 mov eax
, [ebp
+ i0310_jjnr
]
;# The list start and length are saved in the "...0" slots so that every
;# i atom of this entry can restart from the same j list.
;# NOTE(review): any offsetting of eax by jindex[n] is not visible here.
1989 mov
[esp
+ i0310_innerjjnr0
], eax ;
# pointer to jjnr[nj0]
1991 mov
[esp
+ i0310_innerk0
], edx ;
# number of innerloop atoms
;# esi/edi hold the pos[] and faction[] base pointers for the inner loops
;# (esi is temporarily reused for nbfp inside them).
1992 mov esi
, [ebp
+ i0310_pos
]
1993 mov edi
, [ebp
+ i0310_faction
]
;# NOTE(review): ecx = nsvdwc is loaded but the test/branch that uses it
;# is not visible in this excerpt.
1995 mov ecx
, [esp
+ i0310_nsvdwc
]
;# ebx = index of the i atom handled in this pass; solnr is
;# post-incremented in memory for the next pass.
2000 mov ebx
, [esp
+ i0310_solnr
]
2001 inc dword ptr
[esp
+ i0310_solnr
]
;# ntia = type[ii] * ntype as far as the visible code shows; it is added
;# to each j atom type to index nbfp in the inner loops.
2003 mov edx
, [ebp
+ i0310_type
]
2004 mov edx
, [edx
+ ebx
*4]
2005 imul edx
, [ebp
+ i0310_ntype
]
2007 mov
[esp
+ i0310_ntia
], edx
2009 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
2010 mov eax
, [ebp
+ i0310_pos
] ;
# eax = base of pos[]
2011 mov
[esp
+ i0310_ii3
], ebx
;# Shifted i position: ix/iy in one quadword, iz in a dword.
2013 movq mm0
, [eax
+ ebx
*4]
2014 movd mm1
, [eax
+ ebx
*4 + 8]
2015 pfadd mm0
, [esp
+ i0310_shX
]
2016 pfadd mm1
, [esp
+ i0310_shZ
]
2017 movq
[esp
+ i0310_ix
], mm0
2018 movd
[esp
+ i0310_iz
], mm1
;# Reset the i force accumulators (fix/fiy via movq, fiz via movd).
2022 movq
[esp
+ i0310_fix
], mm7
2023 movd
[esp
+ i0310_fiz
], mm7
;# Rewind the working j-list pointer and counter from the saved copies.
2025 mov ecx
, [esp
+ i0310_innerjjnr0
]
2026 mov
[esp
+ i0310_innerjjnr
], ecx
2027 mov edx
, [esp
+ i0310_innerk0
]
2029 mov
[esp
+ i0310_innerk
], edx ;
# number of innerloop atoms
;# NOTE(review): jge consumes flags, but the preceding mov instructions
;# do not set any; the flag-setting instruction is not visible in this
;# excerpt.
2030 jge
.i0310_unroll_vdwc_loop
2031 jmp
.i0310_finish_vdwc_inner
2032 .i0310_unroll_vdwc_loop:
;# Two j atoms are handled per iteration. Per-pair scalars are packed
;# with pair 1 in the low dword and pair 2 in the high dword of an MMX
;# register. esi is switched between nbfp and pos; edi stays on faction.
2033 ;
# paired innerloop starts here
2034 mov ecx
, [esp
+ i0310_innerjjnr
] ;
# pointer to jjnr[k]
;# NOTE(review): eax is used as the first jnr below, but only the load of
;# ebx from [ecx + 4] is visible in this excerpt.
2036 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
2037 add dword ptr
[esp
+ i0310_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
2038 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
;# Look up both j atom types and offset them by ntia to index nbfp.
2040 mov ecx
, [ebp
+ i0310_type
]
2041 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
2042 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
2044 mov esi
, [ebp
+ i0310_nbfp
] ;
# base of nbfp
2047 add edx
, [esp
+ i0310_ntia
] ;
# tja = ntia + 2*type
2048 add ecx
, [esp
+ i0310_ntia
]
;# Each movq fetches an adjacent (c6, c12) pair; the unpacks regroup them
;# into (c6_1, c6_2) and (c12_1, c12_2).
2050 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
2051 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
;# NOTE(review): punpckhdq needs mm6 to hold the first pair already; the
;# copy into mm6 is not visible in this excerpt.
2053 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
2054 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
2055 movq
[esp
+ i0310_c6
], mm5
2056 movq
[esp
+ i0310_c12
], mm6
2058 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
2059 lea ebx
, [ebx
+ ebx
*2]
2061 mov esi
, [ebp
+ i0310_pos
]
;# Distance vectors: pfsubr computes source - destination, so mm4/mm6
;# become i - j for the x/y components.
;# NOTE(review): the matching subtraction and squaring of the z
;# components (mm5/mm7 against mm1) are not visible in this excerpt.
2063 movq mm0
, [esp
+ i0310_ix
]
2064 movd mm1
, [esp
+ i0310_iz
]
2065 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
2066 movd mm5
, [esi
+ eax
*4 + 8]
2067 pfsubr mm4
,mm0 ;
# dr = ir - jr
2069 movq
[esp
+ i0310_dx1
], mm4 ;
# store dr
2070 movd
[esp
+ i0310_dz1
], mm5
2071 pfmul mm4
,mm4 ;
# square dx,dy,dz
2073 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
2074 pfacc mm4
, mm5 ;
# first rsq in lower mm4
2076 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
2077 movd mm7
, [esi
+ ebx
*4 + 8]
2079 pfsubr mm6
,mm0 ;
# dr = ir - jr
2081 movq
[esp
+ i0310_dx2
], mm6 ;
# store dr
2082 movd
[esp
+ i0310_dz2
], mm7
2083 pfmul mm6
,mm6 ;
# square dx,dy,dz
2085 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
2086 pfacc mm6
, mm7 ;
# second rsq in lower mm6
;# Reciprocal square root of both rsq values.
;# NOTE(review): only the seed lookup for the first pair and the copy to
;# mm2 are visible; the remaining refinement steps are not in this
;# excerpt.
2088 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
2093 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
2094 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision
2100 ;
# mm0 is invsqrt, and mm1 r
2101 ;
# do potential and fscal
;# Table coordinate rt = r * tabscale. The n1 slot receives two dword
;# table indices (read back below as [n1] and [n1 + 4]).
;# NOTE(review): the float-to-int conversion of mm4 before the store, and
;# the copy of eps into mm2 before squaring, are not visible here.
2102 pfmul mm1
, [esp
+ i0310_tsc
] ;
# mm1=rt
2104 movq
[esp
+ i0310_n1
], mm4
2106 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
2109 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
2111 mov edx
, [ebp
+ i0310_VFtab
]
2113 mov ecx
, [esp
+ i0310_n1
]
;# First table group (byte offsets 0..12 from the index): movd loads the
;# pair-1 value into the low dword, punpckldq then merges the pair-2
;# value into the high dword.
2115 ;
# load all the table values we need
2116 movd mm4
, [edx
+ ecx
*4]
2117 movd mm5
, [edx
+ ecx
*4 + 4]
2118 movd mm6
, [edx
+ ecx
*4 + 8]
2119 movd mm7
, [edx
+ ecx
*4 + 12]
2120 mov ecx
, [esp
+ i0310_n1
+ 4]
2122 punpckldq mm4
, [edx
+ ecx
*4]
2123 punpckldq mm5
, [edx
+ ecx
*4 + 4]
2124 punpckldq mm6
, [edx
+ ecx
*4 + 8]
2125 punpckldq mm7
, [edx
+ ecx
*4 + 12]
;# Interpolation from the four table values held in mm4..mm7.
2126 pfmul mm6
, mm1 ;
# mm6 = Geps
2127 pfmul mm7
, mm2 ;
# mm7 = Heps2
2129 pfadd mm5
, mm7 ;
# mm5 = Fp
2130 pfmul mm7
, [esp
+ i0310_two
] ;
# two*Heps2
2132 pfadd mm7
, mm5 ;
# mm7=FF
2133 pfmul mm5
, mm1 ;
# mm5=eps*Fp
2134 pfadd mm5
, mm4 ;
# mm5= VV
;# Scale the first-group result by c6: mm7 -> force term, mm5 -> energy.
2136 movq mm4
, [esp
+ i0310_c6
]
2137 pfmul mm7
, mm4 ;
# fijD
2138 pfmul mm5
, mm4 ;
# vnb6
2139 movq mm3
, mm7 ;
# add to fscal
2141 ;
# update vnbtot to release mm5!
2142 pfadd mm5
, [esp
+ i0310_vnbtot
] ;
# add the earlier value
2143 movq
[esp
+ i0310_vnbtot
], mm5 ;
# store the sum
2146 mov ecx
, [esp
+ i0310_n1
]
;# Second table group (byte offsets 16..28), loaded and interpolated the
;# same way, then scaled by c12.
2148 ;
# load all the table values we need
2149 movd mm4
, [edx
+ ecx
*4 + 16]
2150 movd mm5
, [edx
+ ecx
*4 + 20]
2151 movd mm6
, [edx
+ ecx
*4 + 24]
2152 movd mm7
, [edx
+ ecx
*4 + 28]
2153 mov ecx
, [esp
+ i0310_n1
+ 4]
2155 punpckldq mm4
, [edx
+ ecx
*4 + 16]
2156 punpckldq mm5
, [edx
+ ecx
*4 + 20]
2157 punpckldq mm6
, [edx
+ ecx
*4 + 24]
2158 punpckldq mm7
, [edx
+ ecx
*4 + 28]
2160 pfmul mm6
, mm1 ;
# mm6 = Geps
2161 pfmul mm7
, mm2 ;
# mm7 = Heps2
2163 pfadd mm5
, mm7 ;
# mm5 = Fp
2164 pfmul mm7
, [esp
+ i0310_two
] ;
# two*Heps2
2166 pfadd mm7
, mm5 ;
# mm7=FF
2167 pfmul mm5
, mm1 ;
# mm5=eps*Fp
2168 pfadd mm5
, mm4 ;
# mm5= VV
2170 movq mm6
, [esp
+ i0310_c12
]
2171 pfmul mm7
, mm6 ;
# fijR
2172 pfmul mm5
, mm6 ;
# vnb12
2173 pfadd mm3
, mm7 ;
# total fscal fijD+ fijR
;# NOTE(review): the instructions that negate mm3 and move it into
;# mm0/mm1 are not visible in this excerpt.
2175 ;
# change sign of mm3
2178 pfmul mm1
, [esp
+ i0310_tsc
]
2179 pfmul mm0
, mm1 ;
# mm0 is total fscal now
2181 prefetchw
[esp
+ i0310_dx1
] ;
# prefetch i forces to cache
2183 ;
# spread fscalar to both positions
2188 ;
# calc vector force
2189 prefetchw
[edi
+ eax
*4] ;
# prefetch the 1st faction to cache
2190 movq mm2
, [esp
+ i0310_dx1
] ;
# fetch dr
2191 movd mm3
, [esp
+ i0310_dz1
]
;# The second-group energy (mm5) is accumulated only now, after the
;# force scalar has been formed.
2194 pfadd mm5
, [esp
+ i0310_vnbtot
] ;
# add the earlier value
2195 movq
[esp
+ i0310_vnbtot
], mm5 ;
# store the sum
2197 prefetchw
[edi
+ ebx
*4] ;
# prefetch the 2nd faction to cache
2198 pfmul mm2
, mm0 ;
# mult by fs
2201 movq mm4
, [esp
+ i0310_dx2
] ;
# fetch dr
2202 movd mm5
, [esp
+ i0310_dz2
]
2203 pfmul mm4
, mm1 ;
# mult by fs
;# i force accumulator: loaded and stored back.
;# NOTE(review): the additions between the load and the store are not
;# visible in this excerpt.
2207 movq mm0
, [esp
+ i0310_fix
]
2208 movd mm1
, [esp
+ i0310_fiz
]
2214 movq
[esp
+ i0310_fix
], mm0
2215 movd
[esp
+ i0310_fiz
], mm1
;# j forces for both atoms: read faction[j3], write back.
;# NOTE(review): the subtraction of the pair force between the loads and
;# the stores is not visible in this excerpt.
2218 movq mm0
, [edi
+ eax
*4]
2219 movd mm1
, [edi
+ eax
*4 + 8]
2220 movq mm6
, [edi
+ ebx
*4]
2221 movd mm7
, [edi
+ ebx
*4 + 8]
2228 movq
[edi
+ eax
*4], mm0
2229 movd
[edi
+ eax
*4 +8], mm1
2230 movq
[edi
+ ebx
*4], mm6
2231 movd
[edi
+ ebx
*4 + 8], mm7
;# innerk -= 2 in memory; a negative result leaves the pair loop,
;# otherwise another pair is processed.
2233 ;
# should we do one more iteration?
2234 sub dword ptr
[esp
+ i0310_innerk
], 2
2235 jl
.i0310_finish_vdwc_inner
2236 jmp
.i0310_unroll_vdwc_loop
2237 .i0310_finish_vdwc_inner:
;# Reached when the pair loop is done or skipped. The and overwrites
;# innerk with its lowest bit: if that bit is set, one j atom is left and
;# the single-atom path runs; otherwise go straight to the i atom update.
2238 and dword ptr
[esp
+ i0310_innerk
], 1
2239 jnz
.i0310_single_vdwc_inner
2240 jmp
.i0310_updateouterdata_vdwc
2241 .i0310_single_vdwc_inner:
;# Scalar version of the pair loop for one remaining j atom: only the low
;# dword of each MMX register carries data. Falls through into
;# .i0310_updateouterdata_vdwc at the end.
2242 ;
# a single j particle iteration here - compare with the unrolled code for comments
2243 mov eax
, [esp
+ i0310_innerjjnr
]
2244 mov eax
, [eax
] ;
# eax=jnr offset
2246 mov esi
, [ebp
+ i0310_nbfp
]
2247 mov ecx
, [ebp
+ i0310_type
]
2248 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
2250 add edx
, [esp
+ i0310_ntia
] ;
# tja = ntia + 2*type
;# movd clears the high dword of mm5, so each 8-byte c6/c12 slot is
;# written as (value, 0).
2251 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
2252 movq
[esp
+ i0310_c6
], mm5
2253 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
2254 movq
[esp
+ i0310_c12
], mm5
2256 mov esi
, [ebp
+ i0310_pos
]
2257 lea eax
, [eax
+ eax
*2]
2259 movq mm0
, [esp
+ i0310_ix
]
2260 movd mm1
, [esp
+ i0310_iz
]
2261 movq mm4
, [esi
+ eax
*4]
2262 movd mm5
, [esi
+ eax
*4 + 8]
;# NOTE(review): mm4/mm5 are stored as dx1/dz1, but the subtraction from
;# the i position and the squaring are not visible in this excerpt.
2265 movq
[esp
+ i0310_dx1
], mm4
2267 movd
[esp
+ i0310_dz1
], mm5
;# NOTE(review): the two register comments that follow look stale - the
;# visible pfacc writes mm4 (not mm0) and the visible pfrcpit2 writes mm0
;# (not mm1). The seed and first refinement steps are not visible here.
2270 pfacc mm4
, mm5 ;
# mm0=rsq
2276 pfrcpit2 mm0
,mm2 ;
# mm1=invsqrt
2279 ;
# mm0 is invsqrt, and mm1 r
2281 ;
# calculate potentials and scalar force
2282 pfmul mm1
, [esp
+ i0310_tsc
] ;
# mm1=rt
;# Only one dword table index is stored here (movd), unlike the pair loop.
2284 movd
[esp
+ i0310_n1
], mm4
2286 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
2289 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
2291 mov edx
, [ebp
+ i0310_VFtab
]
2292 mov ecx
, [esp
+ i0310_n1
]
;# First table group (byte offsets 0..12), scaled by c6 below.
2295 ;
# load all the table values we need
2296 movd mm4
, [edx
+ ecx
*4]
2297 movd mm5
, [edx
+ ecx
*4 + 4]
2298 movd mm6
, [edx
+ ecx
*4 + 8]
2299 movd mm7
, [edx
+ ecx
*4 + 12]
2300 pfmul mm6
, mm1 ;
# mm6 = Geps
2301 pfmul mm7
, mm2 ;
# mm7 = Heps2
2303 pfadd mm5
, mm7 ;
# mm5 = Fp
2304 pfmul mm7
, [esp
+ i0310_two
] ;
# two*Heps2
2306 pfadd mm7
, mm5 ;
# mm7=FF
2307 pfmul mm5
, mm1 ;
# mm5=eps*Fp
2308 pfadd mm5
, mm4 ;
# mm5= VV
2310 movq mm4
, [esp
+ i0310_c6
]
2311 pfmul mm7
, mm4 ;
# fijD
2312 pfmul mm5
, mm4 ;
# vnb6
2313 movq mm3
, mm7 ;
# add to fscal
2315 ;
# update vnbtot to release mm5!
2316 pfadd mm5
, [esp
+ i0310_vnbtot
] ;
# add the earlier value
2317 movq
[esp
+ i0310_vnbtot
], mm5 ;
# store the sum
;# Second table group (byte offsets 16..28); ecx still holds the index.
2320 ;
# load all the table values we need
2321 movd mm4
, [edx
+ ecx
*4 + 16]
2322 movd mm5
, [edx
+ ecx
*4 + 20]
2323 movd mm6
, [edx
+ ecx
*4 + 24]
2324 movd mm7
, [edx
+ ecx
*4 + 28]
2326 pfmul mm6
, mm1 ;
# mm6 = Geps
2327 pfmul mm7
, mm2 ;
# mm7 = Heps2
2329 pfadd mm5
, mm7 ;
# mm5 = Fp
2330 pfmul mm7
, [esp
+ i0310_two
] ;
# two*Heps2
2332 pfadd mm7
, mm5 ;
# mm7=FF
2333 pfmul mm5
, mm1 ;
# mm5=eps*Fp
2334 pfadd mm5
, mm4 ;
# mm5= VV
2336 movq mm6
, [esp
+ i0310_c12
]
2337 pfmul mm7
, mm6 ;
# fijR
2338 pfmul mm5
, mm6 ;
# vnb12
;# NOTE(review): the next comment mentions fijC, but in the visible code
;# mm3 only receives fijD (copied from mm7 earlier) plus fijR.
2339 pfadd mm3
, mm7 ;
# total fscal fijC+ fijD+ fijR
2341 ;
# change sign of mm3
2344 pfmul mm0
, [esp
+ i0310_tsc
]
2345 pfmul mm0
, mm1 ;
# mm0 is total fscal now
2348 pfadd mm5
, [esp
+ i0310_vnbtot
] ;
# add the earlier value
2349 movq
[esp
+ i0310_vnbtot
], mm5 ;
# store the sum
2351 ;
# spread fscalar to both positions
2353 ;
# calc vectorial force
2354 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
2355 movq mm2
, [esp
+ i0310_dx1
]
2356 movd mm3
, [esp
+ i0310_dz1
]
;# NOTE(review): in both update sequences below only the load and the
;# store are visible; the arithmetic between them is not in this excerpt.
2361 ;
# update i particle force
2362 movq mm0
, [esp
+ i0310_fix
]
2363 movd mm1
, [esp
+ i0310_fiz
]
2366 movq
[esp
+ i0310_fix
], mm0
2367 movd
[esp
+ i0310_fiz
], mm1
2368 ;
# update j particle force
2369 movq mm0
, [edi
+ eax
*4]
2370 movd mm1
, [edi
+ eax
*4+ 8]
2373 movq
[edi
+ eax
*4], mm0
2374 movd
[edi
+ eax
*4 +8], mm1
2376 .i0310_updateouterdata_vdwc:
;# Flush the accumulated i force (fix/fiy/fiz) into faction[ii3] and into
;# fshift[is3], then set up the next i atom. edi is the faction base.
2377 mov ecx
, [esp
+ i0310_ii3
]
2379 movq mm6
, [edi
+ ecx
*4] ;
# increment i force
2380 movd mm7
, [edi
+ ecx
*4 + 8]
2381 pfadd mm6
, [esp
+ i0310_fix
]
2382 pfadd mm7
, [esp
+ i0310_fiz
]
2383 movq
[edi
+ ecx
*4], mm6
2384 movd
[edi
+ ecx
*4 +8], mm7
2386 mov ebx
, [ebp
+ i0310_fshift
] ;
# increment fshift force
2387 mov edx
, [esp
+ i0310_is3
]
2389 movq mm6
, [ebx
+ edx
*4]
2390 movd mm7
, [ebx
+ edx
*4 + 8]
2391 pfadd mm6
, [esp
+ i0310_fix
]
2392 pfadd mm7
, [esp
+ i0310_fiz
]
2393 movq
[ebx
+ edx
*4], mm6
2394 movd
[ebx
+ edx
*4 + 8], mm7
;# One fewer atom left in the first (vdwc) group.
;# NOTE(review): the branch that consumes this dec's flags is not visible
;# in this excerpt.
2397 dec dword ptr
[esp
+ i0310_nsvdwc
]
;# solnr is advanced by nscoul, so the atom indices counted by nscoul are
;# stepped over without any processing in the visible code.
2401 mov ebx
, [esp
+ i0310_nscoul
]
2402 add [esp
+ i0310_solnr
], ebx
;# NOTE(review): ecx = nsvdw is loaded but the test/branch that uses it
;# is not visible in this excerpt.
2404 mov ecx
, [esp
+ i0310_nsvdw
]
;# Setup for an i atom of the second (vdw) group: same sequence as for
;# the first group - take solnr, post-increment it, compute ntia, the
;# shifted position and clear the force accumulators.
2409 mov ebx
, [esp
+ i0310_solnr
]
2410 inc dword ptr
[esp
+ i0310_solnr
]
2412 mov edx
, [ebp
+ i0310_type
]
2413 mov edx
, [edx
+ ebx
*4]
2414 imul edx
, [ebp
+ i0310_ntype
]
2416 mov
[esp
+ i0310_ntia
], edx
2418 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
2419 mov eax
, [ebp
+ i0310_pos
] ;
# eax = base of pos[]
2420 mov
[esp
+ i0310_ii3
], ebx
2422 movq mm0
, [eax
+ ebx
*4]
2423 movd mm1
, [eax
+ ebx
*4 + 8]
2424 pfadd mm0
, [esp
+ i0310_shX
]
2425 pfadd mm1
, [esp
+ i0310_shZ
]
2426 movq
[esp
+ i0310_ix
], mm0
2427 movd
[esp
+ i0310_iz
], mm1
;# NOTE(review): mm7 was last loaded with an fshift z value above;
;# presumably an instruction not visible here clears it before it is
;# used to reset the accumulators - confirm.
2431 movq
[esp
+ i0310_fix
], mm7
2432 movd
[esp
+ i0310_fiz
], mm7
;# Rewind the j list from the saved start pointer and length.
2434 mov ecx
, [esp
+ i0310_innerjjnr0
]
2435 mov
[esp
+ i0310_innerjjnr
], ecx
2436 mov edx
, [esp
+ i0310_innerk0
]
2438 mov
[esp
+ i0310_innerk
], edx ;
# number of innerloop atoms
;# NOTE(review): jge consumes flags that none of the visible mov
;# instructions set; the flag-setting instruction is not in this excerpt.
2439 jge
.i0310_unroll_vdw_loop
2440 jmp
.i0310_finish_vdw_inner
2441 .i0310_unroll_vdw_loop:
;# Pair loop of the second (vdw) pass. The visible instruction sequence
;# matches .i0310_unroll_vdwc_loop; only the branch targets differ.
;# Pair 1 lives in the low dword and pair 2 in the high dword of each
;# packed MMX value.
2442 ;
# paired innerloop starts here
2443 mov ecx
, [esp
+ i0310_innerjjnr
] ;
# pointer to jjnr[k]
;# NOTE(review): eax is used as the first jnr below, but only the load of
;# ebx from [ecx + 4] is visible in this excerpt.
2445 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
2446 add dword ptr
[esp
+ i0310_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
2447 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
;# Look up both j atom types and offset them by ntia to index nbfp.
2449 mov ecx
, [ebp
+ i0310_type
]
2450 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
2451 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
2453 mov esi
, [ebp
+ i0310_nbfp
] ;
# base of nbfp
2456 add edx
, [esp
+ i0310_ntia
] ;
# tja = ntia + 2*type
2457 add ecx
, [esp
+ i0310_ntia
]
;# Each movq fetches an adjacent (c6, c12) pair; the unpacks regroup them
;# into (c6_1, c6_2) and (c12_1, c12_2).
2459 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
2460 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
;# NOTE(review): punpckhdq needs mm6 to hold the first pair already; the
;# copy into mm6 is not visible in this excerpt.
2462 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
2463 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
2464 movq
[esp
+ i0310_c6
], mm5
2465 movq
[esp
+ i0310_c12
], mm6
2467 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
2468 lea ebx
, [ebx
+ ebx
*2]
2470 mov esi
, [ebp
+ i0310_pos
]
;# Distance vectors: pfsubr computes source - destination, so mm4/mm6
;# become i - j for the x/y components.
;# NOTE(review): the matching subtraction and squaring of the z
;# components are not visible in this excerpt.
2472 movq mm0
, [esp
+ i0310_ix
]
2473 movd mm1
, [esp
+ i0310_iz
]
2474 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
2475 movd mm5
, [esi
+ eax
*4 + 8]
2476 pfsubr mm4
,mm0 ;
# dr = ir - jr
2478 movq
[esp
+ i0310_dx1
], mm4 ;
# store dr
2479 movd
[esp
+ i0310_dz1
], mm5
2480 pfmul mm4
,mm4 ;
# square dx,dy,dz
2482 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
2483 pfacc mm4
, mm5 ;
# first rsq in lower mm4
2485 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
2486 movd mm7
, [esi
+ ebx
*4 + 8]
2488 pfsubr mm6
,mm0 ;
# dr = ir - jr
2490 movq
[esp
+ i0310_dx2
], mm6 ;
# store dr
2491 movd
[esp
+ i0310_dz2
], mm7
2492 pfmul mm6
,mm6 ;
# square dx,dy,dz
2494 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
2495 pfacc mm6
, mm7 ;
# second rsq in lower mm6
;# Reciprocal square root of both rsq values.
;# NOTE(review): only the seed lookup for the first pair and the copy to
;# mm2 are visible; the remaining refinement steps are not in this
;# excerpt.
2497 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
2502 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs
2503 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision
2509 ;
# mm0 is invsqrt, and mm1 r
2510 ;
# do potential and fscal
;# Table coordinate rt = r * tabscale. The n1 slot receives two dword
;# table indices (read back below as [n1] and [n1 + 4]).
;# NOTE(review): the float-to-int conversion of mm4 before the store, and
;# the copy of eps into mm2 before squaring, are not visible here.
2511 pfmul mm1
, [esp
+ i0310_tsc
] ;
# mm1=rt
2513 movq
[esp
+ i0310_n1
], mm4
2515 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
2518 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
2520 mov edx
, [ebp
+ i0310_VFtab
]
2522 mov ecx
, [esp
+ i0310_n1
]
;# First table group (byte offsets 0..12 from the index): movd loads the
;# pair-1 value into the low dword, punpckldq then merges the pair-2
;# value into the high dword.
2524 ;
# load all the table values we need
2525 movd mm4
, [edx
+ ecx
*4]
2526 movd mm5
, [edx
+ ecx
*4 + 4]
2527 movd mm6
, [edx
+ ecx
*4 + 8]
2528 movd mm7
, [edx
+ ecx
*4 + 12]
2529 mov ecx
, [esp
+ i0310_n1
+ 4]
2531 punpckldq mm4
, [edx
+ ecx
*4]
2532 punpckldq mm5
, [edx
+ ecx
*4 + 4]
2533 punpckldq mm6
, [edx
+ ecx
*4 + 8]
2534 punpckldq mm7
, [edx
+ ecx
*4 + 12]
;# Interpolation from the four table values held in mm4..mm7.
2535 pfmul mm6
, mm1 ;
# mm6 = Geps
2536 pfmul mm7
, mm2 ;
# mm7 = Heps2
2538 pfadd mm5
, mm7 ;
# mm5 = Fp
2539 pfmul mm7
, [esp
+ i0310_two
] ;
# two*Heps2
2541 pfadd mm7
, mm5 ;
# mm7=FF
2542 pfmul mm5
, mm1 ;
# mm5=eps*Fp
2543 pfadd mm5
, mm4 ;
# mm5= VV
;# Scale the first-group result by c6: mm7 -> force term, mm5 -> energy.
2545 movq mm4
, [esp
+ i0310_c6
]
2546 pfmul mm7
, mm4 ;
# fijD
2547 pfmul mm5
, mm4 ;
# vnb6
2548 movq mm3
, mm7 ;
# add to fscal
2550 ;
# update vnbtot to release mm5!
2551 pfadd mm5
, [esp
+ i0310_vnbtot
] ;
# add the earlier value
2552 movq
[esp
+ i0310_vnbtot
], mm5 ;
# store the sum
2555 mov ecx
, [esp
+ i0310_n1
]
;# Second table group (byte offsets 16..28), loaded and interpolated the
;# same way, then scaled by c12.
2557 ;
# load all the table values we need
2558 movd mm4
, [edx
+ ecx
*4 + 16]
2559 movd mm5
, [edx
+ ecx
*4 + 20]
2560 movd mm6
, [edx
+ ecx
*4 + 24]
2561 movd mm7
, [edx
+ ecx
*4 + 28]
2562 mov ecx
, [esp
+ i0310_n1
+ 4]
2564 punpckldq mm4
, [edx
+ ecx
*4 + 16]
2565 punpckldq mm5
, [edx
+ ecx
*4 + 20]
2566 punpckldq mm6
, [edx
+ ecx
*4 + 24]
2567 punpckldq mm7
, [edx
+ ecx
*4 + 28]
2569 pfmul mm6
, mm1 ;
# mm6 = Geps
2570 pfmul mm7
, mm2 ;
# mm7 = Heps2
2572 pfadd mm5
, mm7 ;
# mm5 = Fp
2573 pfmul mm7
, [esp
+ i0310_two
] ;
# two*Heps2
2575 pfadd mm7
, mm5 ;
# mm7=FF
2576 pfmul mm5
, mm1 ;
# mm5=eps*Fp
2577 pfadd mm5
, mm4 ;
# mm5= VV
2579 movq mm6
, [esp
+ i0310_c12
]
2580 pfmul mm7
, mm6 ;
# fijR
2581 pfmul mm5
, mm6 ;
# vnb12
2582 pfadd mm3
, mm7 ;
# total fscal fijD+ fijR
;# NOTE(review): the instructions that negate mm3 and move it into
;# mm0/mm1 are not visible in this excerpt.
2584 ;
# change sign of mm3
2587 pfmul mm1
, [esp
+ i0310_tsc
]
2588 pfmul mm0
, mm1 ;
# mm0 is total fscal now
2590 prefetchw
[esp
+ i0310_dx1
] ;
# prefetch i forces to cache
2592 ;
# spread fscalar to both positions
2597 ;
# calc vector force
2598 prefetchw
[edi
+ eax
*4] ;
# prefetch the 1st faction to cache
2599 movq mm2
, [esp
+ i0310_dx1
] ;
# fetch dr
2600 movd mm3
, [esp
+ i0310_dz1
]
;# The second-group energy (mm5) is accumulated only now, after the
;# force scalar has been formed.
2603 pfadd mm5
, [esp
+ i0310_vnbtot
] ;
# add the earlier value
2604 movq
[esp
+ i0310_vnbtot
], mm5 ;
# store the sum
2606 prefetchw
[edi
+ ebx
*4] ;
# prefetch the 2nd faction to cache
2607 pfmul mm2
, mm0 ;
# mult by fs
2610 movq mm4
, [esp
+ i0310_dx2
] ;
# fetch dr
2611 movd mm5
, [esp
+ i0310_dz2
]
2612 pfmul mm4
, mm1 ;
# mult by fs
;# i force accumulator: loaded and stored back.
;# NOTE(review): the additions between the load and the store are not
;# visible in this excerpt.
2616 movq mm0
, [esp
+ i0310_fix
]
2617 movd mm1
, [esp
+ i0310_fiz
]
2623 movq
[esp
+ i0310_fix
], mm0
2624 movd
[esp
+ i0310_fiz
], mm1
;# j forces for both atoms: read faction[j3], write back.
;# NOTE(review): the subtraction of the pair force between the loads and
;# the stores is not visible in this excerpt.
2627 movq mm0
, [edi
+ eax
*4]
2628 movd mm1
, [edi
+ eax
*4 + 8]
2629 movq mm6
, [edi
+ ebx
*4]
2630 movd mm7
, [edi
+ ebx
*4 + 8]
2637 movq
[edi
+ eax
*4], mm0
2638 movd
[edi
+ eax
*4 +8], mm1
2639 movq
[edi
+ ebx
*4], mm6
2640 movd
[edi
+ ebx
*4 + 8], mm7
;# innerk -= 2 in memory; a negative result leaves the pair loop,
;# otherwise another pair is processed.
2642 ;
# should we do one more iteration?
2643 sub dword ptr
[esp
+ i0310_innerk
], 2
2644 jl
.i0310_finish_vdw_inner
2645 jmp
.i0310_unroll_vdw_loop
2646 .i0310_finish_vdw_inner:
;# Reached when the pair loop is done or skipped. The and overwrites
;# innerk with its lowest bit: if that bit is set, one j atom is left and
;# the single-atom path runs; otherwise go straight to the i atom update.
2647 and dword ptr
[esp
+ i0310_innerk
], 1
2648 jnz
.i0310_single_vdw_inner
2649 jmp
.i0310_updateouterdata_vdw
2650 .i0310_single_vdw_inner:
;# Scalar version of the pair loop for one remaining j atom: only the low
;# dword of each MMX register carries data. Falls through into
;# .i0310_updateouterdata_vdw at the end.
2651 ;
# a single j particle iteration here - compare with the unrolled code for comments
2652 mov eax
, [esp
+ i0310_innerjjnr
]
2653 mov eax
, [eax
] ;
# eax=jnr offset
2655 mov esi
, [ebp
+ i0310_nbfp
]
2656 mov ecx
, [ebp
+ i0310_type
]
2657 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
2659 add edx
, [esp
+ i0310_ntia
] ;
# tja = ntia + 2*type
;# movd clears the high dword of mm5, so each 8-byte c6/c12 slot is
;# written as (value, 0).
2660 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
2661 movq
[esp
+ i0310_c6
], mm5
2662 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
2663 movq
[esp
+ i0310_c12
], mm5
2665 mov esi
, [ebp
+ i0310_pos
]
2666 lea eax
, [eax
+ eax
*2]
2668 movq mm0
, [esp
+ i0310_ix
]
2669 movd mm1
, [esp
+ i0310_iz
]
2670 movq mm4
, [esi
+ eax
*4]
2671 movd mm5
, [esi
+ eax
*4 + 8]
;# NOTE(review): mm4/mm5 are stored as dx1/dz1, but the subtraction from
;# the i position and the squaring are not visible in this excerpt.
2674 movq
[esp
+ i0310_dx1
], mm4
2676 movd
[esp
+ i0310_dz1
], mm5
;# NOTE(review): the two register comments that follow look stale - the
;# visible pfacc writes mm4 (not mm0) and the visible pfrcpit2 writes mm0
;# (not mm1). The seed and first refinement steps are not visible here.
2679 pfacc mm4
, mm5 ;
# mm0=rsq
2685 pfrcpit2 mm0
,mm2 ;
# mm1=invsqrt
2688 ;
# mm0 is invsqrt, and mm1 r
2690 ;
# calculate potentials and scalar force
2691 pfmul mm1
, [esp
+ i0310_tsc
] ;
# mm1=rt
;# Only one dword table index is stored here (movd), unlike the pair loop.
2693 movd
[esp
+ i0310_n1
], mm4
2695 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
2698 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
2700 mov edx
, [ebp
+ i0310_VFtab
]
2701 mov ecx
, [esp
+ i0310_n1
]
;# First table group (byte offsets 0..12), scaled by c6 below.
2704 ;
# load all the table values we need
2705 movd mm4
, [edx
+ ecx
*4]
2706 movd mm5
, [edx
+ ecx
*4 + 4]
2707 movd mm6
, [edx
+ ecx
*4 + 8]
2708 movd mm7
, [edx
+ ecx
*4 + 12]
2709 pfmul mm6
, mm1 ;
# mm6 = Geps
2710 pfmul mm7
, mm2 ;
# mm7 = Heps2
2712 pfadd mm5
, mm7 ;
# mm5 = Fp
2713 pfmul mm7
, [esp
+ i0310_two
] ;
# two*Heps2
2715 pfadd mm7
, mm5 ;
# mm7=FF
2716 pfmul mm5
, mm1 ;
# mm5=eps*Fp
2717 pfadd mm5
, mm4 ;
# mm5= VV
2719 movq mm4
, [esp
+ i0310_c6
]
2720 pfmul mm7
, mm4 ;
# fijD
2721 pfmul mm5
, mm4 ;
# vnb6
2722 movq mm3
, mm7 ;
# add to fscal
2724 ;
# update vnbtot to release mm5!
2725 pfadd mm5
, [esp
+ i0310_vnbtot
] ;
# add the earlier value
2726 movq
[esp
+ i0310_vnbtot
], mm5 ;
# store the sum
;# Second table group (byte offsets 16..28); ecx still holds the index.
2729 ;
# load all the table values we need
2730 movd mm4
, [edx
+ ecx
*4 + 16]
2731 movd mm5
, [edx
+ ecx
*4 + 20]
2732 movd mm6
, [edx
+ ecx
*4 + 24]
2733 movd mm7
, [edx
+ ecx
*4 + 28]
2735 pfmul mm6
, mm1 ;
# mm6 = Geps
2736 pfmul mm7
, mm2 ;
# mm7 = Heps2
2738 pfadd mm5
, mm7 ;
# mm5 = Fp
2739 pfmul mm7
, [esp
+ i0310_two
] ;
# two*Heps2
2741 pfadd mm7
, mm5 ;
# mm7=FF
2742 pfmul mm5
, mm1 ;
# mm5=eps*Fp
2743 pfadd mm5
, mm4 ;
# mm5= VV
2745 movq mm6
, [esp
+ i0310_c12
]
2746 pfmul mm7
, mm6 ;
# fijR
2747 pfmul mm5
, mm6 ;
# vnb12
;# NOTE(review): the next comment mentions fijC, but in the visible code
;# mm3 only receives fijD (copied from mm7 earlier) plus fijR.
2748 pfadd mm3
, mm7 ;
# total fscal fijC+ fijD+ fijR
2750 ;
# change sign of mm3
2753 pfmul mm0
, [esp
+ i0310_tsc
]
2754 pfmul mm0
, mm1 ;
# mm0 is total fscal now
2757 pfadd mm5
, [esp
+ i0310_vnbtot
] ;
# add the earlier value
2758 movq
[esp
+ i0310_vnbtot
], mm5 ;
# store the sum
2760 ;
# spread fscalar to both positions
2762 ;
# calc vectorial force
2763 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
2764 movq mm2
, [esp
+ i0310_dx1
]
2765 movd mm3
, [esp
+ i0310_dz1
]
;# NOTE(review): in both update sequences below only the load and the
;# store are visible; the arithmetic between them is not in this excerpt.
2770 ;
# update i particle force
2771 movq mm0
, [esp
+ i0310_fix
]
2772 movd mm1
, [esp
+ i0310_fiz
]
2775 movq
[esp
+ i0310_fix
], mm0
2776 movd
[esp
+ i0310_fiz
], mm1
2777 ;
# update j particle force
2778 movq mm0
, [edi
+ eax
*4]
2779 movd mm1
, [edi
+ eax
*4+ 8]
2782 movq
[edi
+ eax
*4], mm0
2783 movd
[edi
+ eax
*4 +8], mm1
2785 .i0310_updateouterdata_vdw:
2786 mov ecx
, [esp
+ i0310_ii3
]
2788 movq mm6
, [edi
+ ecx
*4] ;
# increment i force
2789 movd mm7
, [edi
+ ecx
*4 + 8]
2790 pfadd mm6
, [esp
+ i0310_fix
]
2791 pfadd mm7
, [esp
+ i0310_fiz
]
2792 movq
[edi
+ ecx
*4], mm6
2793 movd
[edi
+ ecx
*4 +8], mm7
2795 mov ebx
, [ebp
+ i0310_fshift
] ;
# increment fshift force
2796 mov edx
, [esp
+ i0310_is3
]
2798 movq mm6
, [ebx
+ edx
*4]
2799 movd mm7
, [ebx
+ edx
*4 + 8]
2800 pfadd mm6
, [esp
+ i0310_fix
]
2801 pfadd mm7
, [esp
+ i0310_fiz
]
2802 movq
[ebx
+ edx
*4], mm6
2803 movd
[ebx
+ edx
*4 + 8], mm7
2806 dec dword ptr
[esp
+ i0310_nsvdw
]
2811 mov edx
, [ebp
+ i0310_gid
] ;
# get group index for this i particle
2813 add dword ptr
[ebp
+ i0310_gid
], 4 ;
# advance pointer
2815 movq mm7
, [esp
+ i0310_vnbtot
]
2816 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
2818 mov eax
, [ebp
+ i0310_Vnb
]
2819 movd mm6
, [eax
+ edx
*4]
2821 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
2823 mov ecx
, [ebp
+ i0310_nri
]
2826 ;
# not last, iterate once more!
2827 mov
[ebp
+ i0310_nri
], ecx
;# inl1000_3dnow: inner loop that works from charge[] and facel only (Coulomb).
;# The symbol is exported with and without a leading underscore.
2842 .globl inl1000_3dnow
2843 .globl _inl1000_3dnow
;# argument offsets - every use in this routine is relative to ebp
2847 .equiv i1000_iinr, 12
2848 .equiv i1000_jindex, 16
2849 .equiv i1000_jjnr, 20
2850 .equiv i1000_shift, 24
2851 .equiv i1000_shiftvec, 28
2852 .equiv i1000_fshift, 32
2853 .equiv i1000_gid, 36
2854 .equiv i1000_pos, 40
2855 .equiv i1000_faction, 44
2856 .equiv i1000_charge, 48
2857 .equiv i1000_facel, 52
2859 ;
# stack offsets for local variables
;# (every use in this routine is relative to esp)
;# x and y slots are adjacent so one movq moves both - z is handled with movd
2866 .equiv i1000_vctot, 28
2867 .equiv i1000_innerjjnr, 36
2868 .equiv i1000_innerk, 40
2869 .equiv i1000_fix, 44
2870 .equiv i1000_fiy, 48
2871 .equiv i1000_fiz, 52
2872 .equiv i1000_dx1, 56
2873 .equiv i1000_dy1, 60
2874 .equiv i1000_dz1, 64
2875 .equiv i1000_dx2, 68
2876 .equiv i1000_dy2, 72
2877 .equiv i1000_dz2, 76
2886 sub esp
, 80 ;
# 80 bytes local stack space
2888 ;
# assume we have at least one i particle - start directly
2890 mov eax
, [ebp
+ i1000_shift
] ;
# eax = pointer into shift[]
2891 mov ebx
, [eax
] ;
# ebx=shift[n]
2892 add dword ptr
[ebp
+ i1000_shift
], 4 ;
# advance pointer one step
2894 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
2895 mov
[esp
+ i1000_is3
],ebx ;
# store is3
2897 mov eax
, [ebp
+ i1000_shiftvec
] ;
# eax = base of shiftvec[]
2899 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
2900 movd mm1
, [eax
+ ebx
*4 + 8]
2902 mov ecx
, [ebp
+ i1000_iinr
] ;
# ecx = pointer into iinr[]
2903 add dword ptr
[ebp
+ i1000_iinr
], 4 ;
# advance pointer
2904 mov ebx
, [ecx
] ;
# ebx=ii
;# pfmul takes facel directly from its argument slot - only the low half of
;# the product is kept, because the punpckldq below copies low over high
2906 mov edx
, [ebp
+ i1000_charge
]
2907 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
2908 pfmul mm2
, [ebp
+ i1000_facel
]
2909 punpckldq mm2
,mm2 ;
# spread to both halves
2910 movq
[esp
+ i1000_iq
], mm2 ;
# iq =facel*charge[ii]
2912 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
2913 mov eax
, [ebp
+ i1000_pos
] ;
# eax = base of pos[]
2915 pfadd mm0
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
2916 movd mm3
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
2917 mov
[esp
+ i1000_ii3
], ebx
2919 movq
[esp
+ i1000_ix
], mm0
2920 movd
[esp
+ i1000_iz
], mm1
2922 ;
# clear vctot and i forces
;# the three stores below need mm7 to hold zero
2924 movq
[esp
+ i1000_vctot
], mm7
2925 movq
[esp
+ i1000_fix
], mm7
2926 movd
[esp
+ i1000_fiz
], mm7
2928 mov eax
, [ebp
+ i1000_jindex
]
2929 mov ecx
, [eax
] ;
# jindex[n]
2930 mov edx
, [eax
+ 4] ;
# jindex[n+1]
2931 add dword ptr
[ebp
+ i1000_jindex
], 4
2932 sub edx
, ecx ;
# number of innerloop atoms
;# esi = base of pos[], edi = base of faction[] (both are used by the inner loops)
2934 mov esi
, [ebp
+ i1000_pos
]
2935 mov edi
, [ebp
+ i1000_faction
]
2936 mov eax
, [ebp
+ i1000_jjnr
]
2939 mov
[esp
+ i1000_innerjjnr
], eax ;
# pointer to jjnr[nj0]
2941 mov
[esp
+ i1000_innerk
], edx ;
# number of innerloop atoms
;# mov does not touch the flags, so jge acts on the last arithmetic done on edx
2942 jge
.i1000_unroll_loop
2943 jmp
.i1000_finish_inner
;# Unrolled inner loop of inl1000: two j particles per pass.
;# The first j index is kept in eax and the second in ebx - packed registers
;# carry particle 1 in the low half and particle 2 in the high half.
2945 ;
# paired innerloop starts here
2946 mov ecx
, [esp
+ i1000_innerjjnr
] ;
# pointer to jjnr[k]
2948 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
2949 add dword ptr
[esp
+ i1000_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
2950 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
2952 mov ecx
, [ebp
+ i1000_charge
] ;
# base of charge[]
2953 movq mm5
, [esp
+ i1000_iq
]
2954 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
2955 movd mm7
, [ecx
+ ebx
*4] ;
# charge[jnr2]
2956 punpckldq mm3
,mm7 ;
# move charge 2 to high part of mm3
2957 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
2959 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
2960 lea ebx
, [ebx
+ ebx
*2]
2962 movq mm0
, [esp
+ i1000_ix
]
2963 movd mm1
, [esp
+ i1000_iz
]
2964 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
2965 movd mm5
, [esi
+ eax
*4 + 8]
;# pfsubr subtracts the destination from the source, which gives i minus j
2966 pfsubr mm4
,mm0 ;
# dr = ir - jr
2968 movq
[esp
+ i1000_dx1
], mm4 ;
# store dr
2969 movd
[esp
+ i1000_dz1
], mm5
2970 pfmul mm4
,mm4 ;
# square dx,dy,dz
2972 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
2973 pfacc mm4
, mm5 ;
# first rsq in lower mm4
2975 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
2976 movd mm7
, [esi
+ ebx
*4 + 8]
2978 pfsubr mm6
,mm0 ;
# dr = ir - jr
2980 movq
[esp
+ i1000_dx2
], mm6 ;
# store dr
2981 movd
[esp
+ i1000_dz2
], mm7
2982 pfmul mm6
,mm6 ;
# square dx,dy,dz
2984 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
2985 pfacc mm6
, mm7 ;
# second rsq in lower mm6
2987 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
2991 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs
2992 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision
2998 ;
# mm0 now contains invsq, and mm1 invsqrt
2999 ;
# do potential and fscal
;# note: the operand of the prefetchw below is the dx1 stack slot
3001 prefetchw
[esp
+ i1000_dx1
] ;
# prefetch i forces to cache
3003 pfmul mm3
,mm1 ;
# 3 has both vcoul
3004 pfmul mm0
,mm3 ;
# 0 has both fscal
3008 pfadd mm3
, [esp
+ i1000_vctot
] ;
# add the earlier value
3009 movq
[esp
+ i1000_vctot
], mm3 ;
# store the sum
3010 ;
# spread fscalar to both positions
3014 ;
# calc vector force
3015 prefetchw
[edi
+ eax
*4] ;
# prefetch the 1st faction to cache
3016 movq mm2
, [esp
+ i1000_dx1
] ;
# fetch dr
3017 movd mm3
, [esp
+ i1000_dz1
]
3018 prefetchw
[edi
+ ebx
*4] ;
# prefetch the 2nd faction to cache
3019 pfmul mm2
, mm0 ;
# mult by fs
3022 movq mm4
, [esp
+ i1000_dx2
] ;
# fetch dr
3023 movd mm5
, [esp
+ i1000_dz2
]
3024 pfmul mm4
, mm1 ;
# mult by fs
;# i force accumulators on the stack are loaded and stored back
;# (fix and fiy as one quadword, fiz as a dword)
3028 movq mm0
, [esp
+ i1000_fix
]
3029 movd mm1
, [esp
+ i1000_fiz
]
3035 movq
[esp
+ i1000_fix
], mm0
3036 movd
[esp
+ i1000_fiz
], mm1
;# faction[] entries of both j particles are loaded and stored back
3039 movq mm0
, [edi
+ eax
*4]
3040 movd mm1
, [edi
+ eax
*4 + 8]
3041 movq mm6
, [edi
+ ebx
*4]
3042 movd mm7
, [edi
+ ebx
*4 + 8]
3049 movq
[edi
+ eax
*4], mm0
3050 movd
[edi
+ eax
*4 +8], mm1
3051 movq
[edi
+ ebx
*4], mm6
3052 movd
[edi
+ ebx
*4 + 8], mm7
3054 ;
# should we do one more iteration?
;# two j particles were consumed - leave the loop once innerk drops below zero
3055 sub dword ptr
[esp
+ i1000_innerk
], 2
3056 jl
.i1000_finish_inner
3057 jmp
.i1000_unroll_loop
;# Tail test after the unrolled loop: innerk is reduced in memory to its
;# lowest bit. If that bit is set the single-particle code runs, otherwise
;# control goes straight to the outer-loop update.
3058 .i1000_finish_inner:
3059 and dword ptr
[esp
+ i1000_innerk
], 1
3060 jnz
.i1000_single_inner
3061 jmp
.i1000_updateouterdata
;# Single-particle version of the inl1000 loop body, entered from
;# i1000_finish_inner when bit 0 of innerk is set.
;# x and y travel together in one quadword (movq), z in a dword (movd).
3062 .i1000_single_inner:
3063 ;
# a single j particle iteration here - compare with the unrolled code for comments
3064 mov eax
, [esp
+ i1000_innerjjnr
]
3065 mov eax
, [eax
] ;
# eax=jnr offset
;# movd takes only the low 32 bits of iq (iq was stored with both halves equal)
3067 mov ecx
, [ebp
+ i1000_charge
]
3068 movd mm6
, [esp
+ i1000_iq
]
3069 movd mm7
, [ecx
+ eax
*4]
3070 pfmul mm6
, mm7 ;
# mm6=qq
;# eax = 3*jnr from here on - it indexes pos[] and faction[] with scale 4
3072 lea eax
, [eax
+ eax
*2]
3074 movq mm0
, [esp
+ i1000_ix
]
3075 movd mm1
, [esp
+ i1000_iz
]
3076 movq mm2
, [esi
+ eax
*4]
3077 movd mm3
, [esi
+ eax
*4 + 8]
3080 movq
[esp
+ i1000_dx1
], mm0
3082 movd
[esp
+ i1000_dz1
], mm1
3085 pfacc mm0
, mm1 ;
# mm0=rsq
3091 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
3093 pfmul mm4
, mm4 ;
# mm4=invsq
3094 ;
# calculate potential and scalar force
3095 pfmul mm6
, mm1 ;
# mm6=vcoul
3096 pfmul mm4
, mm6 ;
# mm4=fscalar
;# vctot is a quadword - all 64 bits are added and stored back
3098 pfadd mm6
, [esp
+ i1000_vctot
]
3099 movq
[esp
+ i1000_vctot
], mm6
3100 ;
# spread fscalar to both positions
3102 ;
# calc vectorial force
3103 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
3104 movq mm0
, [esp
+ i1000_dx1
]
3105 movd mm1
, [esp
+ i1000_dz1
]
3108 ;
# update i particle force
3109 movq mm2
, [esp
+ i1000_fix
]
3110 movd mm3
, [esp
+ i1000_fiz
]
3113 movq
[esp
+ i1000_fix
], mm2
3114 movd
[esp
+ i1000_fiz
], mm3
3115 ;
# update j particle force
3116 movq mm2
, [edi
+ eax
*4]
3117 movd mm3
, [edi
+ eax
*4+ 8]
3120 movq
[edi
+ eax
*4], mm2
3121 movd
[edi
+ eax
*4 +8], mm3
;# Outer-loop epilogue of inl1000 for one i particle: the force accumulated
;# in fix/fiy/fiz is added to the entry at index ii3 (via edi) and to
;# fshift at index is3, then vctot, gid and nri are handled.
3123 .i1000_updateouterdata:
3124 mov ecx
, [esp
+ i1000_ii3
]
3126 movq mm6
, [edi
+ ecx
*4] ;
# increment i force
3127 movd mm7
, [edi
+ ecx
*4 + 8]
3128 pfadd mm6
, [esp
+ i1000_fix
]
;# pfadd reads a whole quadword starting at fiz, but only the low dword of
;# the result is written back by the movd below
3129 pfadd mm7
, [esp
+ i1000_fiz
]
3130 movq
[edi
+ ecx
*4], mm6
3131 movd
[edi
+ ecx
*4 +8], mm7
3133 mov ebx
, [ebp
+ i1000_fshift
] ;
# increment fshift force
3134 mov edx
, [esp
+ i1000_is3
]
3136 movq mm6
, [ebx
+ edx
*4]
3137 movd mm7
, [ebx
+ edx
*4 + 8]
3138 pfadd mm6
, [esp
+ i1000_fix
]
3139 pfadd mm7
, [esp
+ i1000_fiz
]
3140 movq
[ebx
+ edx
*4], mm6
3141 movd
[ebx
+ edx
*4 + 8], mm7
3143 mov edx
, [ebp
+ i1000_gid
] ;
# get group index for this i particle
3145 add dword ptr
[ebp
+ i1000_gid
], 4 ;
# advance pointer
3147 movq mm7
, [esp
+ i1000_vctot
]
3148 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
3150 mov eax
, [ebp
+ i1000_Vc
]
3151 movd mm6
, [eax
+ edx
*4]
3153 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
;# nri lives in the argument block - it is loaded into ecx and stored back below
3155 mov ecx
, [ebp
+ i1000_nri
]
3158 ;
# not last, iterate once more!
3159 mov
[ebp
+ i1000_nri
], ecx
;# inl1010_3dnow: charge-only (Coulomb) loop. Besides the arguments shared
;# with the other loops it takes an nsatoms pointer, and it keeps an atom
;# index (solnr) and a counter (nscoul) on the stack.
;# The symbol is exported with and without a leading underscore.
3174 .globl inl1010_3dnow
3175 .globl _inl1010_3dnow
;# argument offsets - every use in this routine is relative to ebp
3179 .equiv i1010_iinr, 12
3180 .equiv i1010_jindex, 16
3181 .equiv i1010_jjnr, 20
3182 .equiv i1010_shift, 24
3183 .equiv i1010_shiftvec, 28
3184 .equiv i1010_fshift, 32
3185 .equiv i1010_gid, 36
3186 .equiv i1010_pos, 40
3187 .equiv i1010_faction, 44
3188 .equiv i1010_charge, 48
3189 .equiv i1010_facel, 52
3191 .equiv i1010_nsatoms, 60
3192 ;
# stack offsets for local variables
;# (every use in this routine is relative to esp)
;# innerjjnr0/innerk0 keep the start values that innerjjnr/innerk are reloaded from
3196 .equiv i1010_shY, 12
3197 .equiv i1010_shZ, 16
3202 .equiv i1010_vctot, 40
3203 .equiv i1010_innerjjnr0, 48
3204 .equiv i1010_innerk0, 52
3205 .equiv i1010_innerjjnr, 56
3206 .equiv i1010_innerk, 60
3207 .equiv i1010_fix, 64
3208 .equiv i1010_fiy, 68
3209 .equiv i1010_fiz, 72
3210 .equiv i1010_dx1, 76
3211 .equiv i1010_dy1, 80
3212 .equiv i1010_dz1, 84
3213 .equiv i1010_dx2, 88
3214 .equiv i1010_dy2, 92
3215 .equiv i1010_dz2, 96
3216 .equiv i1010_nscoul, 100
3217 .equiv i1010_solnr, 104
3226 sub esp
, 108 ;
# local stack space
3228 ;
# assume we have at least one i particle - start directly
;# the nsatoms pointer is advanced by 8 bytes here and by 12 more bytes below
;# NOTE(review): presumably this selects the third int of 12-byte entries - confirm
3229 add dword ptr
[ebp
+ i1010_nsatoms
], 8
3232 mov eax
, [ebp
+ i1010_shift
] ;
# eax = pointer into shift[]
3233 mov ebx
, [eax
] ;
# ebx=shift[n]
3234 add dword ptr
[ebp
+ i1010_shift
], 4 ;
# advance pointer one step
3236 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
3237 mov
[esp
+ i1010_is3
],ebx ;
# store is3
3239 mov eax
, [ebp
+ i1010_shiftvec
] ;
# eax = base of shiftvec[]
3241 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
3242 movd mm1
, [eax
+ ebx
*4 + 8]
;# the shift vector is parked on the stack and added to the coordinates further down
3243 movq
[esp
+ i1010_shX
], mm0
3244 movd
[esp
+ i1010_shZ
], mm1
3246 mov ecx
, [ebp
+ i1010_iinr
] ;
# ecx = pointer into iinr[]
3247 add dword ptr
[ebp
+ i1010_iinr
], 4 ;
# advance pointer
3248 mov ebx
, [ecx
] ;
# ebx=ii
3250 mov eax
, [ebp
+ i1010_nsatoms
]
3252 add dword ptr
[ebp
+ i1010_nsatoms
], 12
3253 mov
[esp
+ i1010_nscoul
], ecx
;# NOTE(review): mm7 is expected to be zero for this store and for the
;# fix/fiz stores further down - confirm
3257 movq
[esp
+ i1010_vctot
], mm7
3258 mov
[esp
+ i1010_solnr
], ebx
3260 mov eax
, [ebp
+ i1010_jindex
] ;
# current pointer to jindex list
3261 mov ecx
, [eax
] ;
# jindex[n]
3262 mov edx
, [eax
+ 4] ;
# jindex[n+1]
3263 add dword ptr
[ebp
+ i1010_jindex
], 4 ;
# advance pointer
3264 sub edx
, ecx ;
# number of innerloop atoms
3265 mov eax
, [ebp
+ i1010_jjnr
]
3267 add eax
, ecx ;
# pointer to index of the first j atom
3268 mov
[esp
+ i1010_innerjjnr0
], eax ;
# save pointer to jjnr[nj0]
3270 mov
[esp
+ i1010_innerk0
], edx ;
# number of innerloop atoms
;# esi = base of pos[], edi = base of faction[] (both are used by the inner loops)
3271 mov esi
, [ebp
+ i1010_pos
]
3272 mov edi
, [ebp
+ i1010_faction
]
3274 mov ecx
, [esp
+ i1010_nscoul
]
;# solnr supplies the atom index in ebx and is incremented in memory right after
3279 mov ebx
, [esp
+ i1010_solnr
]
3280 inc dword ptr
[esp
+ i1010_solnr
]
3281 mov edx
, [ebp
+ i1010_charge
]
3282 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
3283 pfmul mm2
, [ebp
+ i1010_facel
]
3284 punpckldq mm2
,mm2 ;
# spread to both halves
3285 movq
[esp
+ i1010_iq
], mm2 ;
# iq =facel*charge[ii]
3287 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
3288 mov eax
, [ebp
+ i1010_pos
] ;
# eax = base pointer of pos[]
3289 mov
[esp
+ i1010_ii3
], ebx ;
# store ii3
3291 movq mm0
, [eax
+ ebx
*4] ;
# load x and y coords to mm0
3292 movd mm1
, [eax
+ ebx
*4 + 8] ;
# load z coord to mm1
3293 pfadd mm0
, [esp
+ i1010_shX
] ;
# add shift vector
;# pfadd reads a whole quadword starting at shZ, but only the low dword of
;# the result is stored by the movd to iz below
3294 pfadd mm1
, [esp
+ i1010_shZ
]
3295 movq
[esp
+ i1010_ix
], mm0 ;
# store shifted coords
3296 movd
[esp
+ i1010_iz
], mm1
3300 movq
[esp
+ i1010_fix
], mm7
3301 movd
[esp
+ i1010_fiz
], mm7
;# restart the j list from the saved start values innerjjnr0 and innerk0
3303 mov ecx
, [esp
+ i1010_innerjjnr0
]
3304 mov
[esp
+ i1010_innerjjnr
], ecx
3305 mov edx
, [esp
+ i1010_innerk0
]
3307 mov
[esp
+ i1010_innerk
], edx ;
# number of innerloop atoms
;# mov does not touch the flags, so jge acts on the last arithmetic done on edx
3308 jge
.i1010_unroll_coul_loop
3309 jmp
.i1010_finish_coul_inner
;# Unrolled inner loop of inl1010: two j particles per pass.
;# The first j index is kept in eax and the second in ebx - packed registers
;# carry particle 1 in the low half and particle 2 in the high half.
3310 .i1010_unroll_coul_loop:
3311 ;
# paired innerloop starts here
3312 mov ecx
, [esp
+ i1010_innerjjnr
] ;
# pointer to jjnr[k]
3314 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
3315 add dword ptr
[esp
+ i1010_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
3316 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
3318 mov ecx
, [ebp
+ i1010_charge
] ;
# base of charge[]
3319 movq mm5
, [esp
+ i1010_iq
]
3320 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
3321 movd mm7
, [ecx
+ ebx
*4] ;
# charge[jnr2]
3322 punpckldq mm3
,mm7 ;
# move charge 2 to high part of mm3
3323 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
3325 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
3326 lea ebx
, [ebx
+ ebx
*2]
3328 movq mm0
, [esp
+ i1010_ix
]
3329 movd mm1
, [esp
+ i1010_iz
]
3330 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
3331 movd mm5
, [esi
+ eax
*4 + 8]
;# pfsubr subtracts the destination from the source, which gives i minus j
3332 pfsubr mm4
,mm0 ;
# dr = ir - jr
3334 movq
[esp
+ i1010_dx1
], mm4 ;
# store dr
3335 movd
[esp
+ i1010_dz1
], mm5
3336 pfmul mm4
,mm4 ;
# square dx,dy,dz
3338 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
3339 pfacc mm4
, mm5 ;
# first rsq in lower mm4
3341 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
3342 movd mm7
, [esi
+ ebx
*4 + 8]
3344 pfsubr mm6
,mm0 ;
# dr = ir - jr
3346 movq
[esp
+ i1010_dx2
], mm6 ;
# store dr
3347 movd
[esp
+ i1010_dz2
], mm7
3348 pfmul mm6
,mm6 ;
# square dx,dy,dz
3350 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
3351 pfacc mm6
, mm7 ;
# second rsq in lower mm6
3353 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
3357 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs
3358 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision
3364 ;
# mm0 now contains invsq, and mm1 invsqrt
3365 ;
# do potential and fscal
;# note: the operand of the prefetchw below is the dx1 stack slot
3366 prefetchw
[esp
+ i1010_dx1
] ;
# prefetch i forces to cache
3368 pfmul mm3
,mm1 ;
# 3 has both vcoul
3369 pfmul mm0
,mm3 ;
# 0 has both fscal
3373 pfadd mm3
, [esp
+ i1010_vctot
] ;
# add the earlier value
3374 movq
[esp
+ i1010_vctot
], mm3 ;
# store the sum
3375 ;
# spread fscalar to both positions
3379 ;
# calc vector force
3380 prefetchw
[edi
+ eax
*4] ;
# prefetch the 1st faction to cache
3381 movq mm2
, [esp
+ i1010_dx1
] ;
# fetch dr
3382 movd mm3
, [esp
+ i1010_dz1
]
3383 prefetchw
[edi
+ ebx
*4] ;
# prefetch the 2nd faction to cache
3384 pfmul mm2
, mm0 ;
# mult by fs
3387 movq mm4
, [esp
+ i1010_dx2
] ;
# fetch dr
3388 movd mm5
, [esp
+ i1010_dz2
]
3389 pfmul mm4
, mm1 ;
# mult by fs
;# i force accumulators on the stack are loaded and stored back
;# (fix and fiy as one quadword, fiz as a dword)
3393 movq mm0
, [esp
+ i1010_fix
]
3394 movd mm1
, [esp
+ i1010_fiz
]
3400 movq
[esp
+ i1010_fix
], mm0
3401 movd
[esp
+ i1010_fiz
], mm1
;# faction[] entries of both j particles are loaded and stored back
3404 movq mm0
, [edi
+ eax
*4]
3405 movd mm1
, [edi
+ eax
*4 + 8]
3406 movq mm6
, [edi
+ ebx
*4]
3407 movd mm7
, [edi
+ ebx
*4 + 8]
3414 movq
[edi
+ eax
*4], mm0
3415 movd
[edi
+ eax
*4 +8], mm1
3416 movq
[edi
+ ebx
*4], mm6
3417 movd
[edi
+ ebx
*4 + 8], mm7
3419 ;
# should we do one more iteration?
;# two j particles were consumed - leave the loop once innerk drops below zero
3420 sub dword ptr
[esp
+ i1010_innerk
], 2
3421 jl
.i1010_finish_coul_inner
3422 jmp
.i1010_unroll_coul_loop
;# Tail test after the unrolled loop: innerk is reduced in memory to its
;# lowest bit. If that bit is set the single-particle code runs, otherwise
;# control goes straight to the outer-loop update.
3423 .i1010_finish_coul_inner:
3424 and dword ptr
[esp
+ i1010_innerk
], 1
3425 jnz
.i1010_single_coul_inner
3426 jmp
.i1010_updateouterdata_coul
;# Single-particle version of the inl1010 loop body, entered from
;# i1010_finish_coul_inner when bit 0 of innerk is set.
;# x and y travel together in one quadword (movq), z in a dword (movd).
3427 .i1010_single_coul_inner:
3428 ;
# a single j particle iteration here - compare with the unrolled code for comments
3429 mov eax
, [esp
+ i1010_innerjjnr
]
3430 mov eax
, [eax
] ;
# eax=jnr offset
;# movd takes only the low 32 bits of iq (iq was stored with both halves equal)
3432 mov ecx
, [ebp
+ i1010_charge
]
3433 movd mm6
, [esp
+ i1010_iq
]
3434 movd mm7
, [ecx
+ eax
*4]
3435 pfmul mm6
, mm7 ;
# mm6=qq
;# eax = 3*jnr from here on - it indexes pos[] and faction[] with scale 4
3437 lea eax
, [eax
+ eax
*2]
3439 movq mm0
, [esp
+ i1010_ix
]
3440 movd mm1
, [esp
+ i1010_iz
]
3441 movq mm2
, [esi
+ eax
*4]
3442 movd mm3
, [esi
+ eax
*4 + 8]
3445 movq
[esp
+ i1010_dx1
], mm0
3447 movd
[esp
+ i1010_dz1
], mm1
3450 pfacc mm0
, mm1 ;
# mm0=rsq
3456 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
3458 pfmul mm4
, mm4 ;
# mm4=invsq
3459 ;
# calculate potential and scalar force
3460 pfmul mm6
, mm1 ;
# mm6=vcoul
3461 pfmul mm4
, mm6 ;
# mm4=fscalar
;# vctot is a quadword - all 64 bits are added and stored back
3463 pfadd mm6
, [esp
+ i1010_vctot
]
3464 movq
[esp
+ i1010_vctot
], mm6
3465 ;
# spread fscalar to both positions
3467 ;
# calc vectorial force
3468 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
3469 movq mm0
, [esp
+ i1010_dx1
]
3470 movd mm1
, [esp
+ i1010_dz1
]
3473 ;
# update i particle force
3474 movq mm2
, [esp
+ i1010_fix
]
3475 movd mm3
, [esp
+ i1010_fiz
]
3478 movq
[esp
+ i1010_fix
], mm2
3479 movd
[esp
+ i1010_fiz
], mm3
3480 ;
# update j particle force
3481 movq mm2
, [edi
+ eax
*4]
3482 movd mm3
, [edi
+ eax
*4+ 8]
3485 movq
[edi
+ eax
*4], mm2
3486 movd
[edi
+ eax
*4 +8], mm3
;# Epilogue of inl1010 for one atom: the force accumulated in fix/fiy/fiz is
;# added to the entry at index ii3 (via edi) and to fshift at index is3, the
;# nscoul counter is decremented, then vctot, gid and nri are handled.
3488 .i1010_updateouterdata_coul:
3489 mov ecx
, [esp
+ i1010_ii3
]
3491 movq mm6
, [edi
+ ecx
*4] ;
# increment i force
3492 movd mm7
, [edi
+ ecx
*4 + 8]
3493 pfadd mm6
, [esp
+ i1010_fix
]
;# pfadd reads a whole quadword starting at fiz, but only the low dword of
;# the result is written back by the movd below
3494 pfadd mm7
, [esp
+ i1010_fiz
]
3495 movq
[edi
+ ecx
*4], mm6
3496 movd
[edi
+ ecx
*4 +8], mm7
3498 mov ebx
, [ebp
+ i1010_fshift
] ;
# increment fshift force
3499 mov edx
, [esp
+ i1010_is3
]
3501 movq mm6
, [ebx
+ edx
*4]
3502 movd mm7
, [ebx
+ edx
*4 + 8]
3503 pfadd mm6
, [esp
+ i1010_fix
]
3504 pfadd mm7
, [esp
+ i1010_fiz
]
3505 movq
[ebx
+ edx
*4], mm6
3506 movd
[ebx
+ edx
*4 + 8], mm7
;# one more atom handled - nscoul is counted down in its stack slot
3509 dec dword ptr
[esp
+ i1010_nscoul
]
3513 mov edx
, [ebp
+ i1010_gid
] ;
# get group index for this i particle
3515 add dword ptr
[ebp
+ i1010_gid
], 4 ;
# advance pointer
3517 movq mm7
, [esp
+ i1010_vctot
]
3518 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
3520 mov eax
, [ebp
+ i1010_Vc
]
3521 movd mm6
, [eax
+ edx
*4]
3523 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
;# nri lives in the argument block - it is loaded into ecx and stored back below
3525 mov ecx
, [ebp
+ i1010_nri
]
3528 ;
# not last, iterate once more!
3529 mov
[ebp
+ i1010_nri
], ecx
;# inl1020_3dnow: charge-only (Coulomb) loop whose i side consists of an O
;# site followed by two H sites. The two H sites are processed together,
;# H1 in the low half and H2 in the high half of each quadword.
;# The symbol is exported with and without a leading underscore.
3545 .globl inl1020_3dnow
3546 .globl _inl1020_3dnow
;# argument offsets - every use in this routine is relative to ebp
3550 .equiv i1020_iinr, 12
3551 .equiv i1020_jindex, 16
3552 .equiv i1020_jjnr, 20
3553 .equiv i1020_shift, 24
3554 .equiv i1020_shiftvec, 28
3555 .equiv i1020_fshift, 32
3556 .equiv i1020_gid, 36
3557 .equiv i1020_pos, 40
3558 .equiv i1020_faction, 44
3559 .equiv i1020_charge, 48
3560 .equiv i1020_facel, 52
3562 ;
# stack offsets for local variables
;# (every use in this routine is relative to esp)
;# O slots are dwords spaced 4 bytes apart - H slots are quadwords spaced 8 bytes apart
3566 .equiv i1020_iyO, 12
3567 .equiv i1020_izO, 16
3568 .equiv i1020_ixH, 20
3569 .equiv i1020_iyH, 28
3570 .equiv i1020_izH, 36
3571 .equiv i1020_iqO, 44
3572 .equiv i1020_iqH, 52
3573 .equiv i1020_vctot, 60
3574 .equiv i1020_innerjjnr, 68
3575 .equiv i1020_innerk, 72
3576 .equiv i1020_fixO, 76
3577 .equiv i1020_fiyO, 80
3578 .equiv i1020_fizO, 84
3579 .equiv i1020_fixH, 88
3580 .equiv i1020_fiyH, 96
3581 .equiv i1020_fizH, 104
3582 .equiv i1020_dxO, 112
3583 .equiv i1020_dyO, 116
3584 .equiv i1020_dzO, 120
3585 .equiv i1020_dxH, 124
3586 .equiv i1020_dyH, 132
3587 .equiv i1020_dzH, 140
3596 sub esp
, 148 ;
# local stack space
3598 ;
# assume we have at least one i particle - start directly
;# first read of iinr: the pointer is not advanced here - that happens at
;# the second read further down
3600 mov ecx
, [ebp
+ i1020_iinr
] ;
# ecx = pointer into iinr[]
3601 mov ebx
, [ecx
] ;
# ebx=ii
;# charge[ii] feeds iqO and charge[ii+1] feeds iqH
3603 mov edx
, [ebp
+ i1020_charge
]
3604 movd mm1
, [ebp
+ i1020_facel
]
3605 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii0]
3607 movq
[esp
+ i1020_iqO
], mm2 ;
# iqO = facel*charge[ii]
3609 movd mm2
, [edx
+ ebx
*4 + 4] ;
# mm2=charge[ii0+1]
3611 punpckldq mm2
,mm2 ;
# spread to both halves
3612 movq
[esp
+ i1020_iqH
], mm2 ;
# iqH = facel*charge[ii0+1]
3614 mov eax
, [ebp
+ i1020_shift
] ;
# eax = pointer into shift[]
3615 mov ebx
, [eax
] ;
# ebx=shift[n]
3616 add dword ptr
[ebp
+ i1020_shift
], 4 ;
# advance pointer one step
3618 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
3619 mov
[esp
+ i1020_is3
],ebx ;
# store is3
3621 mov eax
, [ebp
+ i1020_shiftvec
] ;
# eax = base of shiftvec[]
3623 movq mm5
, [eax
+ ebx
*4] ;
# move shX/shY to mm5 and shZ to mm6
3624 movd mm6
, [eax
+ ebx
*4 + 8]
3628 punpckldq mm0
,mm0 ;
# also expand shX,Y,Z in mm0--mm2
3632 mov ecx
, [ebp
+ i1020_iinr
] ;
# ecx = pointer into iinr[]
3633 add dword ptr
[ebp
+ i1020_iinr
], 4 ;
# advance pointer
3634 mov ebx
, [ecx
] ;
# ebx=ii
3636 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
3637 mov eax
, [ebp
+ i1020_pos
] ;
# eax = base of pos[]
3639 pfadd mm5
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
3640 movd mm7
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
3641 mov
[esp
+ i1020_ii3
], ebx ;
# (use mm7 as temp storage for iz)
;# the movq to izO (offset 16) also writes the 4 bytes at offset 20, which is
;# the start of ixH - ixH is stored afterwards, so that spill is overwritten
3643 movq
[esp
+ i1020_ixO
], mm5
3644 movq
[esp
+ i1020_izO
], mm6
;# byte offsets 12 to 20 past the O position go to the low halves and
;# offsets 24 to 32 to the high halves
3646 movd mm3
, [eax
+ ebx
*4 + 12]
3647 movd mm4
, [eax
+ ebx
*4 + 16]
3648 movd mm5
, [eax
+ ebx
*4 + 20]
3649 punpckldq mm3
, [eax
+ ebx
*4 + 24]
3650 punpckldq mm4
, [eax
+ ebx
*4 + 28]
3651 punpckldq mm5
, [eax
+ ebx
*4 + 32] ;
# coords of H1 in low mm3-mm5, H2 in high
3656 movq
[esp
+ i1020_ixH
], mm0
3657 movq
[esp
+ i1020_iyH
], mm1
3658 movq
[esp
+ i1020_izH
], mm2
3660 ;
# clear vctot and i forces
;# mm7 has to be zero for the clears below (it held iz earlier in this block)
3662 movq
[esp
+ i1020_vctot
], mm7
3663 movq
[esp
+ i1020_fixO
], mm7
3664 movd
[esp
+ i1020_fizO
], mm7
3665 movq
[esp
+ i1020_fixH
], mm7
3666 movq
[esp
+ i1020_fiyH
], mm7
3667 movq
[esp
+ i1020_fizH
], mm7
3669 mov eax
, [ebp
+ i1020_jindex
]
3670 mov ecx
, [eax
] ;
# jindex[n]
3671 mov edx
, [eax
+ 4] ;
# jindex[n+1]
3672 add dword ptr
[ebp
+ i1020_jindex
], 4
3673 sub edx
, ecx ;
# number of innerloop atoms
3674 mov
[esp
+ i1020_innerk
], edx ;
# number of innerloop atoms
;# esi = base of pos[], edi = base of faction[] (both are used by the inner loop)
3676 mov esi
, [ebp
+ i1020_pos
]
3677 mov edi
, [ebp
+ i1020_faction
]
3678 mov eax
, [ebp
+ i1020_jjnr
]
3681 mov
[esp
+ i1020_innerjjnr
], eax ;
# pointer to jjnr[nj0]
;# Inner loop body of inl1020: one j particle per pass. The O interaction is
;# computed on single values, the two H interactions together on packed
;# quadwords (H1 low, H2 high).
3683 ;
# a single j particle iteration here - compare with the unrolled code for comments
3684 mov eax
, [esp
+ i1020_innerjjnr
]
3685 mov eax
, [eax
] ;
# eax=jnr offset
3686 add dword ptr
[esp
+ i1020_innerjjnr
], 4 ;
# advance pointer
;# NOTE(review): this loop never loads the jjnr pointer into ecx (eax is used
;# for that), so the address prefetched below looks stale - confirm
3687 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
3689 mov ecx
, [ebp
+ i1020_charge
]
3690 movd mm7
, [ecx
+ eax
*4]
3693 pfmul mm6
, [esp
+ i1020_iqO
]
3694 pfmul mm7
, [esp
+ i1020_iqH
] ;
# mm6=qqO, mm7=qqH
;# eax = 3*jnr from here on - it indexes pos[] and faction[] with scale 4
3696 lea eax
, [eax
+ eax
*2]
3698 movq mm0
, [esi
+ eax
*4]
3699 movd mm1
, [esi
+ eax
*4 + 8]
3700 ;
# copy & expand to mm2-mm4 for the H interactions
;# pfsubr subtracts the destination from the source, which gives i minus j
3708 pfsubr mm0
, [esp
+ i1020_ixO
]
3709 pfsubr mm1
, [esp
+ i1020_izO
]
3711 movq
[esp
+ i1020_dxO
], mm0
3713 movd
[esp
+ i1020_dzO
], mm1
3716 pfadd mm0
, mm1 ;
# mm0=rsqO
3720 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
3721 pfsubr mm2
, [esp
+ i1020_ixH
]
3722 pfsubr mm3
, [esp
+ i1020_iyH
]
3723 pfsubr mm4
, [esp
+ i1020_izH
] ;
# mm2-mm4 is dxH-dzH
3725 movq
[esp
+ i1020_dxH
], mm2
3726 movq
[esp
+ i1020_dyH
], mm3
3727 movq
[esp
+ i1020_dzH
], mm4
3733 pfadd mm3
,mm4 ;
# mm3=rsqH
3740 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
3742 pfmul mm4
, mm4 ;
# mm4=invsq
3743 ;
# calculate potential and scalar force
3744 pfmul mm6
, mm1 ;
# mm6=vcoul
3745 pfmul mm4
, mm6 ;
# mm4=fscalar
3751 punpckldq mm5
,mm2 ;
# seeds are in mm5 now, and rsq in mm3
3756 pfrcpit2 mm5
,mm2 ;
# mm5=invsqrt
3758 pfmul mm3
,mm3 ;
# mm3=invsq
3759 pfmul mm7
, mm5 ;
# mm7=vcoul
3760 pfmul mm3
, mm7 ;
# mm3=fscal for the two H's
;# vctot is a quadword - all 64 bits are added and stored back
3764 pfadd mm7
, [esp
+ i1020_vctot
]
3765 movq
[esp
+ i1020_vctot
], mm7
3767 ;
# spread oxygen fscalar to both positions
3769 ;
# calc vectorial force for O
3770 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
3771 movq mm0
, [esp
+ i1020_dxO
]
3772 movd mm1
, [esp
+ i1020_dzO
]
3776 ;
# calc vectorial force for H's
3777 movq mm5
, [esp
+ i1020_dxH
]
3778 movq mm6
, [esp
+ i1020_dyH
]
3779 movq mm7
, [esp
+ i1020_dzH
]
3784 ;
# update iO particle force
3785 movq mm2
, [esp
+ i1020_fixO
]
3786 movd mm3
, [esp
+ i1020_fizO
]
3789 movq
[esp
+ i1020_fixO
], mm2
3790 movd
[esp
+ i1020_fizO
], mm3
;# H force accumulators: one quadword per coordinate (x, y, z), loaded and stored back
3793 movq mm2
, [esp
+ i1020_fixH
]
3794 movq mm3
, [esp
+ i1020_fiyH
]
3795 movq mm4
, [esp
+ i1020_fizH
]
3799 movq
[esp
+ i1020_fixH
], mm2
3800 movq
[esp
+ i1020_fiyH
], mm3
3801 movq
[esp
+ i1020_fizH
], mm4
3803 ;
# pack j forces from H in the same form as the oxygen force
3804 pfacc mm5
, mm6 ;
# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
3805 pfacc mm7
, mm7 ;
# mm7(l)=fjz(H1+ h2)
3807 pfadd mm0
, mm5 ;
# add up total force on j particle
3810 ;
# update j particle force
3811 movq mm2
, [edi
+ eax
*4]
3812 movd mm3
, [edi
+ eax
*4 + 8]
3815 movq
[edi
+ eax
*4], mm2
3816 movd
[edi
+ eax
*4 +8], mm3
;# one j particle done - innerk counts down and the loop ends when it reaches zero
3819 dec dword ptr
[esp
+ i1020_innerk
]
3820 jz
.i1020_updateouterdata
3821 jmp
.i1020_inner_loop
;# Outer-loop epilogue of inl1020: the O force is added to the entry at index
;# ii3 (via edi), the H1 entry sits 12 bytes and the H2 entry 24 bytes after
;# it. fshift at index is3, vctot, gid and nri are handled afterwards.
3822 .i1020_updateouterdata:
3823 mov ecx
, [esp
+ i1020_ii3
]
3825 movq mm6
, [edi
+ ecx
*4] ;
# increment iO force
3826 movd mm7
, [edi
+ ecx
*4 + 8]
3827 pfadd mm6
, [esp
+ i1020_fixO
]
;# pfadd reads a whole quadword starting at fizO, but only the low dword of
;# the result is written back by the movd below
3828 pfadd mm7
, [esp
+ i1020_fizO
]
3829 movq
[edi
+ ecx
*4], mm6
3830 movd
[edi
+ ecx
*4 +8], mm7
;# the H accumulators are stored per coordinate (H1 low, H2 high) - the
;# unpack steps below regroup them per atom as x/y pairs
3832 movq mm0
, [esp
+ i1020_fixH
]
3833 movq mm3
, [esp
+ i1020_fiyH
]
3834 movq mm1
, [esp
+ i1020_fizH
]
3836 punpckldq mm0
, mm3 ;
# mm0(l)=fxH1, mm0(h)=fyH1
3837 punpckhdq mm2
, mm3 ;
# mm2(l)=fxH2, mm2(h)=fyH2
3843 movq mm6
, [edi
+ ecx
*4 + 12] ;
# increment iH1 force
3844 movd mm7
, [edi
+ ecx
*4 + 20]
3847 movq
[edi
+ ecx
*4 + 12], mm6
3848 movd
[edi
+ ecx
*4 + 20], mm7
3850 movq mm6
, [edi
+ ecx
*4 + 24] ;
# increment iH2 force
3851 movd mm7
, [edi
+ ecx
*4 + 32]
3854 movq
[edi
+ ecx
*4 + 24], mm6
3855 movd
[edi
+ ecx
*4 + 32], mm7
3858 mov ebx
, [ebp
+ i1020_fshift
] ;
# increment fshift force
3859 mov edx
, [esp
+ i1020_is3
]
3861 movq mm6
, [ebx
+ edx
*4]
3862 movd mm7
, [ebx
+ edx
*4 + 8]
3863 pfadd mm6
, [esp
+ i1020_fixO
]
3864 pfadd mm7
, [esp
+ i1020_fizO
]
3869 movq
[ebx
+ edx
*4], mm6
3870 movd
[ebx
+ edx
*4 + 8], mm7
3872 mov edx
, [ebp
+ i1020_gid
] ;
# get group index for this i particle
3874 add dword ptr
[ebp
+ i1020_gid
], 4 ;
# advance pointer
3876 movq mm7
, [esp
+ i1020_vctot
]
3877 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
3879 mov eax
, [ebp
+ i1020_Vc
]
3880 movd mm6
, [eax
+ edx
*4]
3882 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
;# nri is counted down directly in its argument slot
3885 dec dword ptr
[ebp
+ i1020_nri
]
3887 ;
# not last, iterate once more!
3902 .globl inl1030_3dnow
3903 .globl _inl1030_3dnow
3907 .equiv i1030_iinr, 12
3908 .equiv i1030_jindex, 16
3909 .equiv i1030_jjnr, 20
3910 .equiv i1030_shift, 24
3911 .equiv i1030_shiftvec, 28
3912 .equiv i1030_fshift, 32
3913 .equiv i1030_gid, 36
3914 .equiv i1030_pos, 40
3915 .equiv i1030_faction, 44
3916 .equiv i1030_charge, 48
3917 .equiv i1030_facel, 52
3919 ;
# stack offsets for local variables
3923 .equiv i1030_iyO, 12
3924 .equiv i1030_izO, 16
3925 .equiv i1030_ixH, 20
3926 .equiv i1030_iyH, 28
3927 .equiv i1030_izH, 36
3928 .equiv i1030_qqOO, 44
3929 .equiv i1030_qqOH, 52
3930 .equiv i1030_qqHH, 60
3931 .equiv i1030_vctot, 68
3932 .equiv i1030_innerjjnr, 76
3933 .equiv i1030_innerk, 80
3934 .equiv i1030_fixO, 84
3935 .equiv i1030_fiyO, 88
3936 .equiv i1030_fizO, 92
3937 .equiv i1030_fixH, 96
3938 .equiv i1030_fiyH, 104
3939 .equiv i1030_fizH, 112
3940 .equiv i1030_dxO, 120
3941 .equiv i1030_dyO, 124
3942 .equiv i1030_dzO, 128
3943 .equiv i1030_dxH, 132
3944 .equiv i1030_dyH, 140
3945 .equiv i1030_dzH, 148
3954 sub esp
, 156 ;
# local stack space
3956 ;
# assume we have at least one i particle - start directly
3958 mov ecx
, [ebp
+ i1030_iinr
] ;
# ecx = pointer into iinr[]
3959 mov ebx
, [ecx
] ;
# ebx=ii
3961 mov edx
, [ebp
+ i1030_charge
]
3962 movd mm1
, [ebp
+ i1030_facel
] ;
# mm1=facel
3963 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii0] (O)
3964 movd mm3
, [edx
+ ebx
*4 + 4] ;
# mm2=charge[ii0+1] (H)
3970 pfmul mm4
, mm2 ;
# mm4=qqOO*facel
3971 pfmul mm5
, mm3 ;
# mm5=qqOH*facel
3972 pfmul mm6
, mm3 ;
# mm6=qqHH*facel
3973 punpckldq mm5
,mm5 ;
# spread to both halves
3974 punpckldq mm6
,mm6 ;
# spread to both halves
3975 movq
[esp
+ i1030_qqOO
], mm4
3976 movq
[esp
+ i1030_qqOH
], mm5
3977 movq
[esp
+ i1030_qqHH
], mm6
3979 mov eax
, [ebp
+ i1030_shift
] ;
# eax = pointer into shift[]
3980 mov ebx
, [eax
] ;
# ebx=shift[n]
3981 add dword ptr
[ebp
+ i1030_shift
], 4 ;
# advance pointer one step
3983 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
3984 mov
[esp
+ i1030_is3
],ebx ;
# store is3
3986 mov eax
, [ebp
+ i1030_shiftvec
] ;
# eax = base of shiftvec[]
3988 movq mm5
, [eax
+ ebx
*4] ;
# move shX/shY to mm5 and shZ to mm6
3989 movd mm6
, [eax
+ ebx
*4 + 8]
3993 punpckldq mm0
,mm0 ;
# also expand shX,Y,Z in mm0--mm2
3997 mov ecx
, [ebp
+ i1030_iinr
] ;
# ecx = pointer into iinr[]
3998 add dword ptr
[ebp
+ i1030_iinr
], 4 ;
# advance pointer
3999 mov ebx
, [ecx
] ;
# ebx=ii
4001 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
4002 mov eax
, [ebp
+ i1030_pos
] ;
# eax = base of pos[]
4004 pfadd mm5
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
4005 movd mm7
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
4006 mov
[esp
+ i1030_ii3
], ebx ;
# (use mm7 as temp storage for iz)
4008 movq
[esp
+ i1030_ixO
], mm5
4009 movq
[esp
+ i1030_izO
], mm6
4011 movd mm3
, [eax
+ ebx
*4 + 12]
4012 movd mm4
, [eax
+ ebx
*4 + 16]
4013 movd mm5
, [eax
+ ebx
*4 + 20]
4014 punpckldq mm3
, [eax
+ ebx
*4 + 24]
4015 punpckldq mm4
, [eax
+ ebx
*4 + 28]
4016 punpckldq mm5
, [eax
+ ebx
*4 + 32] ;
# coords of H1 in low mm3-mm5, H2 in high
4021 movq
[esp
+ i1030_ixH
], mm0
4022 movq
[esp
+ i1030_iyH
], mm1
4023 movq
[esp
+ i1030_izH
], mm2
4025 ;
# clear vctot and i forces
4027 movq
[esp
+ i1030_vctot
], mm7
4028 movq
[esp
+ i1030_fixO
], mm7
4029 movq
[esp
+ i1030_fizO
], mm7
4030 movq
[esp
+ i1030_fixH
], mm7
4031 movq
[esp
+ i1030_fiyH
], mm7
4032 movq
[esp
+ i1030_fizH
], mm7
4034 mov eax
, [ebp
+ i1030_jindex
]
4035 mov ecx
, [eax
] ;
# jindex[n]
4036 mov edx
, [eax
+ 4] ;
# jindex[n+1]
4037 add dword ptr
[ebp
+ i1030_jindex
], 4
4038 sub edx
, ecx ;
# number of innerloop atoms
4039 mov
[esp
+ i1030_innerk
], edx ;
# number of innerloop atoms
4041 mov esi
, [ebp
+ i1030_pos
]
4042 mov edi
, [ebp
+ i1030_faction
]
4043 mov eax
, [ebp
+ i1030_jjnr
]
4046 mov
[esp
+ i1030_innerjjnr
], eax ;
# pointer to jjnr[nj0]
4048 ;
# a single j particle iteration here - compare with the unrolled code for comments
4049 mov eax
, [esp
+ i1030_innerjjnr
]
4050 mov eax
, [eax
] ;
# eax=jnr offset
4051 add dword ptr
[esp
+ i1030_innerjjnr
], 4 ;
# advance pointer
4053 movd mm6
, [esp
+ i1030_qqOO
]
4054 movq mm7
, [esp
+ i1030_qqOH
]
4056 lea eax
, [eax
+ eax
*2]
4057 movq mm0
, [esi
+ eax
*4]
4058 movd mm1
, [esi
+ eax
*4 + 8]
4059 ;
# copy & expand to mm2-mm4 for the H interactions
4067 pfsubr mm0
, [esp
+ i1030_ixO
]
4068 pfsubr mm1
, [esp
+ i1030_izO
]
4070 movq
[esp
+ i1030_dxO
], mm0
4072 movd
[esp
+ i1030_dzO
], mm1
4075 pfadd mm0
, mm1 ;
# mm0=rsqO
4079 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
4080 pfsubr mm2
, [esp
+ i1030_ixH
]
4081 pfsubr mm3
, [esp
+ i1030_iyH
]
4082 pfsubr mm4
, [esp
+ i1030_izH
] ;
# mm2-mm4 is dxH-dzH
4084 movq
[esp
+ i1030_dxH
], mm2
4085 movq
[esp
+ i1030_dyH
], mm3
4086 movq
[esp
+ i1030_dzH
], mm4
4092 pfadd mm3
,mm4 ;
# mm3=rsqH
4099 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
4101 pfmul mm4
, mm4 ;
# mm4=invsq
4102 ;
# calculate potential and scalar force
4103 pfmul mm6
, mm1 ;
# mm6=vcoul
4104 pfmul mm4
, mm6 ;
# mm4=fscalar
4110 punpckldq mm5
,mm2 ;
# seeds are in mm5 now, and rsq in mm3
4115 pfrcpit2 mm5
,mm2 ;
# mm5=invsqrt
4117 pfmul mm3
,mm3 ;
# mm3=invsq
4118 pfmul mm7
, mm5 ;
# mm7=vcoul
4119 pfmul mm3
, mm7 ;
# mm3=fscal for the two H's
4123 pfadd mm7
, [esp
+ i1030_vctot
]
4124 movq
[esp
+ i1030_vctot
], mm7
4126 ;
# spread oxygen fscalar to both positions
4128 ;
# calc vectorial force for O
4129 movq mm0
, [esp
+ i1030_dxO
]
4130 movd mm1
, [esp
+ i1030_dzO
]
4134 ;
# calc vectorial force for H's
4135 movq mm5
, [esp
+ i1030_dxH
]
4136 movq mm6
, [esp
+ i1030_dyH
]
4137 movq mm7
, [esp
+ i1030_dzH
]
4142 ;
# update iO particle force
4143 movq mm2
, [esp
+ i1030_fixO
]
4144 movd mm3
, [esp
+ i1030_fizO
]
4147 movq
[esp
+ i1030_fixO
], mm2
4148 movd
[esp
+ i1030_fizO
], mm3
4151 movq mm2
, [esp
+ i1030_fixH
]
4152 movq mm3
, [esp
+ i1030_fiyH
]
4153 movq mm4
, [esp
+ i1030_fizH
]
4157 movq
[esp
+ i1030_fixH
], mm2
4158 movq
[esp
+ i1030_fiyH
], mm3
4159 movq
[esp
+ i1030_fizH
], mm4
4161 ;
# pack j forces from H in the same form as the oxygen force
4162 pfacc mm5
, mm6 ;
# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
4163 pfacc mm7
, mm7 ;
# mm7(l)=fjz(H1+ h2)
4165 pfadd mm0
, mm5 ;
# add up total force on j particle
4168 ;
# update j particle force
4169 movq mm2
, [edi
+ eax
*4]
4170 movd mm3
, [edi
+ eax
*4 + 8]
4173 movq
[edi
+ eax
*4], mm2
4174 movd
[edi
+ eax
*4 +8], mm3
4176 ;
# interactions with j H1
4177 movq mm0
, [esi
+ eax
*4 + 12]
4178 movd mm1
, [esi
+ eax
*4 + 20]
4179 ;
# copy & expand to mm2-mm4 for the H interactions
4187 movd mm6
, [esp
+ i1030_qqOH
]
4188 movq mm7
, [esp
+ i1030_qqHH
]
4190 pfsubr mm0
, [esp
+ i1030_ixO
]
4191 pfsubr mm1
, [esp
+ i1030_izO
]
4193 movq
[esp
+ i1030_dxO
], mm0
4195 movd
[esp
+ i1030_dzO
], mm1
4198 pfadd mm0
, mm1 ;
# mm0=rsqO
4202 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
4203 pfsubr mm2
, [esp
+ i1030_ixH
]
4204 pfsubr mm3
, [esp
+ i1030_iyH
]
4205 pfsubr mm4
, [esp
+ i1030_izH
] ;
# mm2-mm4 is dxH-dzH
4207 movq
[esp
+ i1030_dxH
], mm2
4208 movq
[esp
+ i1030_dyH
], mm3
4209 movq
[esp
+ i1030_dzH
], mm4
4215 pfadd mm3
,mm4 ;
# mm3=rsqH
4222 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
4224 pfmul mm4
, mm4 ;
# mm4=invsq
4225 ;
# calculate potential and scalar force
4226 pfmul mm6
, mm1 ;
# mm6=vcoul
4227 pfmul mm4
, mm6 ;
# mm4=fscalar
4233 punpckldq mm5
,mm2 ;
# seeds are in mm5 now, and rsq in mm3
4238 pfrcpit2 mm5
,mm2 ;
# mm5=invsqrt
4240 pfmul mm3
,mm3 ;
# mm3=invsq
4241 pfmul mm7
, mm5 ;
# mm7=vcoul
4242 pfmul mm3
, mm7 ;
# mm3=fscal for the two H's
4246 pfadd mm7
, [esp
+ i1030_vctot
]
4247 movq
[esp
+ i1030_vctot
], mm7
4249 ;
# spread oxygen fscalar to both positions
4251 ;
# calc vectorial force for O
4252 movq mm0
, [esp
+ i1030_dxO
]
4253 movd mm1
, [esp
+ i1030_dzO
]
4257 ;
# calc vectorial force for H's
4258 movq mm5
, [esp
+ i1030_dxH
]
4259 movq mm6
, [esp
+ i1030_dyH
]
4260 movq mm7
, [esp
+ i1030_dzH
]
4265 ;
# update iO particle force
4266 movq mm2
, [esp
+ i1030_fixO
]
4267 movd mm3
, [esp
+ i1030_fizO
]
4270 movq
[esp
+ i1030_fixO
], mm2
4271 movd
[esp
+ i1030_fizO
], mm3
4274 movq mm2
, [esp
+ i1030_fixH
]
4275 movq mm3
, [esp
+ i1030_fiyH
]
4276 movq mm4
, [esp
+ i1030_fizH
]
4280 movq
[esp
+ i1030_fixH
], mm2
4281 movq
[esp
+ i1030_fiyH
], mm3
4282 movq
[esp
+ i1030_fizH
], mm4
4284 ;
# pack j forces from H in the same form as the oxygen force
4285 pfacc mm5
, mm6 ;
# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
4286 pfacc mm7
, mm7 ;
# mm7(l)=fjz(H1+ h2)
4288 pfadd mm0
, mm5 ;
# add up total force on j particle
4291 ;
# update j particle force
4292 movq mm2
, [edi
+ eax
*4 + 12]
4293 movd mm3
, [edi
+ eax
*4 + 20]
4296 movq
[edi
+ eax
*4 + 12], mm2
4297 movd
[edi
+ eax
*4 + 20], mm3
4299 ;
# interactions with j H2
4300 movq mm0
, [esi
+ eax
*4 + 24]
4301 movd mm1
, [esi
+ eax
*4 + 32]
4302 ;
# copy & expand to mm2-mm4 for the H interactions
4310 movd mm6
, [esp
+ i1030_qqOH
]
4311 movq mm7
, [esp
+ i1030_qqHH
]
4313 pfsubr mm0
, [esp
+ i1030_ixO
]
4314 pfsubr mm1
, [esp
+ i1030_izO
]
4316 movq
[esp
+ i1030_dxO
], mm0
4318 movd
[esp
+ i1030_dzO
], mm1
4321 pfadd mm0
, mm1 ;
# mm0=rsqO
4325 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
4326 pfsubr mm2
, [esp
+ i1030_ixH
]
4327 pfsubr mm3
, [esp
+ i1030_iyH
]
4328 pfsubr mm4
, [esp
+ i1030_izH
] ;
# mm2-mm4 is dxH-dzH
4330 movq
[esp
+ i1030_dxH
], mm2
4331 movq
[esp
+ i1030_dyH
], mm3
4332 movq
[esp
+ i1030_dzH
], mm4
4338 pfadd mm3
,mm4 ;
# mm3=rsqH
4345 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
4347 pfmul mm4
, mm4 ;
# mm4=invsq
4348 ;
# calculate potential and scalar force
4349 pfmul mm6
, mm1 ;
# mm6=vcoul
4350 pfmul mm4
, mm6 ;
# mm4=fscalar
4356 punpckldq mm5
,mm2 ;
# seeds are in mm5 now, and rsq in mm3
4361 pfrcpit2 mm5
,mm2 ;
# mm5=invsqrt
4363 pfmul mm3
,mm3 ;
# mm3=invsq
4364 pfmul mm7
, mm5 ;
# mm7=vcoul
4365 pfmul mm3
, mm7 ;
# mm3=fscal for the two H's
4369 pfadd mm7
, [esp
+ i1030_vctot
]
4370 movq
[esp
+ i1030_vctot
], mm7
4372 ;
# spread oxygen fscalar to both positions
4374 ;
# calc vectorial force for O
4375 movq mm0
, [esp
+ i1030_dxO
]
4376 movd mm1
, [esp
+ i1030_dzO
]
4380 ;
# calc vectorial force for H's
4381 movq mm5
, [esp
+ i1030_dxH
]
4382 movq mm6
, [esp
+ i1030_dyH
]
4383 movq mm7
, [esp
+ i1030_dzH
]
4388 ;
# update iO particle force
4389 movq mm2
, [esp
+ i1030_fixO
]
4390 movd mm3
, [esp
+ i1030_fizO
]
4393 movq
[esp
+ i1030_fixO
], mm2
4394 movd
[esp
+ i1030_fizO
], mm3
4397 movq mm2
, [esp
+ i1030_fixH
]
4398 movq mm3
, [esp
+ i1030_fiyH
]
4399 movq mm4
, [esp
+ i1030_fizH
]
4403 movq
[esp
+ i1030_fixH
], mm2
4404 movq
[esp
+ i1030_fiyH
], mm3
4405 movq
[esp
+ i1030_fizH
], mm4
4407 ;
# pack j forces from H in the same form as the oxygen force
4408 pfacc mm5
, mm6 ;
# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
4409 pfacc mm7
, mm7 ;
# mm7(l)=fjz(H1+ h2)
4411 pfadd mm0
, mm5 ;
# add up total force on j particle
4414 ;
# update j particle force
4415 movq mm2
, [edi
+ eax
*4 + 24]
4416 movd mm3
, [edi
+ eax
*4 + 32]
4419 movq
[edi
+ eax
*4 + 24], mm2
4420 movd
[edi
+ eax
*4 + 32], mm3
4423 dec dword ptr
[esp
+ i1030_innerk
]
4424 jz
.i1030_updateouterdata
4425 jmp
.i1030_inner_loop
4426 .i1030_updateouterdata:
4427 mov ecx
, [esp
+ i1030_ii3
]
4429 movq mm6
, [edi
+ ecx
*4] ;
# increment iO force
4430 movd mm7
, [edi
+ ecx
*4 + 8]
4431 pfadd mm6
, [esp
+ i1030_fixO
]
4432 pfadd mm7
, [esp
+ i1030_fizO
]
4433 movq
[edi
+ ecx
*4], mm6
4434 movd
[edi
+ ecx
*4 +8], mm7
4436 movq mm0
, [esp
+ i1030_fixH
]
4437 movq mm3
, [esp
+ i1030_fiyH
]
4438 movq mm1
, [esp
+ i1030_fizH
]
4440 punpckldq mm0
, mm3 ;
# mm0(l)=fxH1, mm0(h)=fyH1
4441 punpckhdq mm2
, mm3 ;
# mm2(l)=fxH2, mm2(h)=fyH2
4447 movq mm6
, [edi
+ ecx
*4 + 12] ;
# increment iH1 force
4448 movd mm7
, [edi
+ ecx
*4 + 20]
4451 movq
[edi
+ ecx
*4 + 12], mm6
4452 movd
[edi
+ ecx
*4 + 20], mm7
4454 movq mm6
, [edi
+ ecx
*4 + 24] ;
# increment iH2 force
4455 movd mm7
, [edi
+ ecx
*4 + 32]
4458 movq
[edi
+ ecx
*4 + 24], mm6
4459 movd
[edi
+ ecx
*4 + 32], mm7
4462 mov ebx
, [ebp
+ i1030_fshift
] ;
# increment fshift force
4463 mov edx
, [esp
+ i1030_is3
]
4465 movq mm6
, [ebx
+ edx
*4]
4466 movd mm7
, [ebx
+ edx
*4 + 8]
4467 pfadd mm6
, [esp
+ i1030_fixO
]
4468 pfadd mm7
, [esp
+ i1030_fizO
]
4473 movq
[ebx
+ edx
*4], mm6
4474 movd
[ebx
+ edx
*4 + 8], mm7
4476 mov edx
, [ebp
+ i1030_gid
] ;
# get group index for this i particle
4478 add dword ptr
[ebp
+ i1030_gid
], 4 ;
# advance pointer
4480 movq mm7
, [esp
+ i1030_vctot
]
4481 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
4483 mov eax
, [ebp
+ i1030_Vc
]
4484 movd mm6
, [eax
+ edx
*4]
4486 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
4488 dec dword ptr
[ebp
+ i1030_nri
]
4490 ;
# not last, iterate once more!
;# inl1100_3dnow: symbol exports and frame-offset table.
;# The routine is exported under both the plain and the underscore-prefixed name.
;# NOTE(review): the code of this routine also uses i1100_nri, i1100_Vc,
;# i1100_is3, i1100_ii3, i1100_ix, i1100_iz, i1100_iq and i1100_c6, whose
;# definitions are not visible in this excerpt - confirm they exist in the full file.
4505 .globl inl1100_3dnow
4506 .globl _inl1100_3dnow
;# argument offsets - the code addresses these relative to ebp
4510 .equiv i1100_iinr, 12
4511 .equiv i1100_jindex, 16
4512 .equiv i1100_jjnr, 20
4513 .equiv i1100_shift, 24
4514 .equiv i1100_shiftvec, 28
4515 .equiv i1100_fshift, 32
4516 .equiv i1100_gid, 36
4517 .equiv i1100_pos, 40
4518 .equiv i1100_faction, 44
4519 .equiv i1100_charge, 48
4520 .equiv i1100_facel, 52
4522 .equiv i1100_type, 60
4523 .equiv i1100_ntype, 64
4524 .equiv i1100_nbfp, 68
4525 .equiv i1100_Vnb, 72
4526 ;
# stack offsets for local variables
;# the code addresses these relative to esp; vctot..twelve are spaced 8 bytes
;# apart and accessed with movq, the remaining slots are spaced 4 bytes apart
4533 .equiv i1100_vctot, 28
4534 .equiv i1100_vnbtot, 36
4536 .equiv i1100_c12, 52
4537 .equiv i1100_six, 60
4538 .equiv i1100_twelve, 68
4539 .equiv i1100_ntia, 76
4540 .equiv i1100_innerjjnr, 80
4541 .equiv i1100_innerk, 84
4542 .equiv i1100_fix, 88
4543 .equiv i1100_fiy, 92
4544 .equiv i1100_fiz, 96
4545 .equiv i1100_dx1, 100
4546 .equiv i1100_dy1, 104
4547 .equiv i1100_dz1, 108
4548 .equiv i1100_dx2, 112
4549 .equiv i1100_dy2, 116
4550 .equiv i1100_dz2, 120
;# inl1100_3dnow: reserve the local frame and set up one i particle -
;# shift vector, scaled charge, type row, position, cleared accumulators and
;# the neighbour-list range - then branch into the inner loop.
;# Only this routine's own i1100_ offsets are used here, so the slots written
;# in this section are the same ones read back later in the routine.
;# NOTE(review): some instructions of this section are not visible in this
;# excerpt (see the individual notes); confirm against the full file.
4560 sub esp
, 124 ;
# local stack space
4562 ;
# move data to local stack
;# NOTE(review): mm0 is stored as 'six' below, but its load is not visible here
4564 movq mm1
, [mm_twelve
]
4565 movq
[esp
+ i1100_six
], mm0
4566 movq
[esp
+ i1100_twelve
], mm1
4567 ;
# assume we have at least one i particle - start directly
4569 mov eax
, [ebp
+ i1100_shift
] ;
# eax = pointer into shift[]
4570 mov ebx
, [eax
] ;
# ebx=shift[n]
;# advance the same argument slot that was just read (i1100_shift)
4571 add dword ptr
[ebp
+ i1100_shift
], 4 ;
# advance pointer one step
4573 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
;# is3 is read back through i1100_is3 when fshift is updated, so it is
;# stored under that same symbol
4574 mov
[esp
+ i1100_is3
],ebx ;
# store is3
4576 mov eax
, [ebp
+ i1100_shiftvec
] ;
# eax = base of shiftvec[]
4578 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
4579 movd mm1
, [eax
+ ebx
*4 + 8]
4581 mov ecx
, [ebp
+ i1100_iinr
] ;
# ecx = pointer into iinr[]
4582 add dword ptr
[ebp
+ i1100_iinr
], 4 ;
# advance pointer
4583 mov ebx
, [ecx
] ;
# ebx=ii
4585 mov edx
, [ebp
+ i1100_charge
]
4586 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
;# pfmul reads 8 bytes at the facel argument; the upper product is discarded
;# by the punpckldq that follows
4587 pfmul mm2
, [ebp
+ i1100_facel
]
4588 punpckldq mm2
,mm2 ;
# spread to both halves
4589 movq
[esp
+ i1100_iq
], mm2 ;
# iq =facel*charge[ii]
;# edx = type[ii] * ntype, kept in the ntia slot
4591 mov edx
, [ebp
+ i1100_type
]
4592 mov edx
, [edx
+ ebx
*4]
4593 imul edx
, [ebp
+ i1100_ntype
]
4595 mov
[esp
+ i1100_ntia
], edx
4597 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
4598 mov eax
, [ebp
+ i1100_pos
] ;
# eax = base of pos[]
4600 pfadd mm0
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
4601 movd mm3
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
4602 mov
[esp
+ i1100_ii3
], ebx
4604 movq
[esp
+ i1100_ix
], mm0
;# NOTE(review): the add of mm3 (posZ) into mm1 is not visible in this excerpt
4605 movd
[esp
+ i1100_iz
], mm1
4607 ;
# clear total potential and i forces
;# NOTE(review): mm7 is presumably zero here; the zeroing instruction is not visible
4609 movq
[esp
+ i1100_vctot
], mm7
4610 movq
[esp
+ i1100_vnbtot
], mm7
4611 movq
[esp
+ i1100_fix
], mm7
4612 movd
[esp
+ i1100_fiz
], mm7
4614 mov eax
, [ebp
+ i1100_jindex
]
4615 mov ecx
, [eax
] ;
# jindex[n]
4616 mov edx
, [eax
+ 4] ;
# jindex[n+1]
4617 add dword ptr
[ebp
+ i1100_jindex
], 4
4618 sub edx
, ecx ;
# number of innerloop atoms
;# esi/edi hold the pos[] and faction[] base pointers for the inner loop
4620 mov esi
, [ebp
+ i1100_pos
]
4621 mov edi
, [ebp
+ i1100_faction
]
4622 mov eax
, [ebp
+ i1100_jjnr
]
4625 mov
[esp
+ i1100_innerjjnr
], eax ;
# pointer to jjnr[nj0]
4627 mov
[esp
+ i1100_innerk
], edx ;
# number of innerloop atoms
;# mov leaves the flags untouched, so jge tests the result of the last
;# arithmetic instruction executed before the stores above
4628 jge
.i1100_unroll_loop
4629 jmp
.i1100_finish_inner
;# Unrolled inner-loop body of inl1100_3dnow: two j particles are handled per
;# pass, the first in the low and the second in the high half of the MMX registers.
;# NOTE(review): the loop label and several instructions (for example the load
;# of the first j index into eax) are not visible in this excerpt - confirm
;# against the full file before relying on the register comments.
4631 ;
# paired innerloop starts here
4632 mov ecx
, [esp
+ i1100_innerjjnr
] ;
# pointer to jjnr[k]
4634 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
4635 add dword ptr
[esp
+ i1100_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
4636 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
4638 mov ecx
, [ebp
+ i1100_charge
] ;
# base of charge[]
4639 movq mm5
, [esp
+ i1100_iq
]
4640 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
4641 punpckldq mm3
, [ecx
+ ebx
*4] ;
# move charge 2 to high part of mm3
4642 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
;# fetch the type of each j particle and index the c6/c12 table with it
4644 mov ecx
, [ebp
+ i1100_type
]
4645 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
4646 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
4648 mov esi
, [ebp
+ i1100_nbfp
] ;
# base of nbfp
4651 add edx
, [esp
+ i1100_ntia
] ;
# tja = ntia + 2*type
4652 add ecx
, [esp
+ i1100_ntia
]
4654 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
4655 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
;# regroup: both c6 values into one register, both c12 values into the other
;# NOTE(review): mm6 must hold a copy of the 1st pair here; that copy is not visible
4657 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
4658 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
4659 movq
[esp
+ i1100_c6
], mm5
4660 movq
[esp
+ i1100_c12
], mm6
4662 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
4663 lea ebx
, [ebx
+ ebx
*2]
;# esi was used for nbfp above; point it at pos[] for the coordinate loads
4665 mov esi
, [ebp
+ i1100_pos
]
4667 movq mm0
, [esp
+ i1100_ix
]
4668 movd mm1
, [esp
+ i1100_iz
]
4669 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
4670 movd mm5
, [esi
+ eax
*4 + 8]
4671 pfsubr mm4
,mm0 ;
# dr = ir - jr
4673 movq
[esp
+ i1100_dx1
], mm4 ;
# store dr
4674 movd
[esp
+ i1100_dz1
], mm5
4675 pfmul mm4
,mm4 ;
# square dx,dy,dz
4677 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
4678 pfacc mm4
, mm5 ;
# first rsq in lower mm4
4680 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
4681 movd mm7
, [esi
+ ebx
*4 + 8]
4683 pfsubr mm6
,mm0 ;
# dr = ir - jr
4685 movq
[esp
+ i1100_dx2
], mm6 ;
# store dr
4686 movd
[esp
+ i1100_dz2
], mm7
4687 pfmul mm6
,mm6 ;
# square dx,dy,dz
4689 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
4690 pfacc mm6
, mm7 ;
# second rsq in lower mm6
4692 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
4696 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs
4697 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision
4703 ;
# mm0 now contains invsq, and mm1 invsqrt
4704 ;
# do potential and fscal
4707 pfmul mm4
, mm0 ;
# mm4=rinvsix
4709 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
4711 pfmul mm3
, mm1 ;
# mm3 has vcoul for both interactions
4712 movq mm7
, mm3 ;
# use mm7 for sum to make fscal
4714 pfmul mm5
, [esp
+ i1100_c12
]
4715 pfmul mm4
, [esp
+ i1100_c6
]
4716 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
4719 pfmul mm4
, [esp
+ i1100_six
]
4721 pfmul mm5
, [esp
+ i1100_twelve
]
4724 pfmul mm0
, mm7 ;
# mm0 is total fscal now
;# NOTE(review): the prefetched address is the dx1 slot, not the fix slot
;# that the comment below suggests - confirm intent
4726 prefetchw
[esp
+ i1100_dx1
] ;
# prefetch i forces to cache
4729 pfadd mm3
, [esp
+ i1100_vctot
] ;
# add the earlier value
4730 movq
[esp
+ i1100_vctot
], mm3 ;
# store the sum
4732 ;
# spread fscalar to both positions
4737 ;
# calc vector force
4738 prefetchw
[edi
+ eax
*4] ;
# prefetch the 1st faction to cache
4739 movq mm2
, [esp
+ i1100_dx1
] ;
# fetch dr
4740 movd mm3
, [esp
+ i1100_dz1
]
4743 pfadd mm6
, [esp
+ i1100_vnbtot
] ;
# add the earlier value
4744 movq
[esp
+ i1100_vnbtot
], mm6 ;
# store the sum
4746 prefetchw
[edi
+ ebx
*4] ;
# prefetch the 2nd faction to cache
4747 pfmul mm2
, mm0 ;
# mult by fs
4750 movq mm4
, [esp
+ i1100_dx2
] ;
# fetch dr
4751 movd mm5
, [esp
+ i1100_dz2
]
4752 pfmul mm4
, mm1 ;
# mult by fs
;# load the running i force, then write it back
;# NOTE(review): the accumulating adds between load and store are not visible here
4756 movq mm0
, [esp
+ i1100_fix
]
4757 movd mm1
, [esp
+ i1100_fiz
]
4763 movq
[esp
+ i1100_fix
], mm0
4764 movd
[esp
+ i1100_fiz
], mm1
;# read-modify-write the force entries of both j particles (edi + 4*j3)
4767 movq mm0
, [edi
+ eax
*4]
4768 movd mm1
, [edi
+ eax
*4 + 8]
4769 movq mm6
, [edi
+ ebx
*4]
4770 movd mm7
, [edi
+ ebx
*4 + 8]
4777 movq
[edi
+ eax
*4], mm0
4778 movd
[edi
+ eax
*4 +8], mm1
4779 movq
[edi
+ ebx
*4], mm6
4780 movd
[edi
+ ebx
*4 + 8], mm7
4782 ;
# should we do one more iteration?
;# innerk -= 2; leave for the tail code once the counter goes negative
4783 sub dword ptr
[esp
+ i1100_innerk
], 2
4784 jl
.i1100_finish_inner
4785 jmp
.i1100_unroll_loop
;# Tail of the inl1100_3dnow inner loop: the low bit of innerk decides whether
;# one unpaired j particle is processed before the outer-loop update.
;# NOTE(review): several instructions of the single-particle path are not
;# visible in this excerpt; the register comments follow the unrolled code.
4786 .i1100_finish_inner:
4787 and dword ptr
[esp
+ i1100_innerk
], 1
4788 jnz
.i1100_single_inner
4789 jmp
.i1100_updateouterdata
4790 .i1100_single_inner:
4791 ;
# a single j particle iteration here - compare with the unrolled code for comments
4792 mov eax
, [esp
+ i1100_innerjjnr
]
4793 mov eax
, [eax
] ;
# eax=jnr offset
4795 mov ecx
, [ebp
+ i1100_charge
]
;# movd loads only the low dword, so the product is formed in the low half only
4796 movd mm5
, [esp
+ i1100_iq
]
4797 movd mm3
, [ecx
+ eax
*4]
4798 pfmul mm3
, mm5 ;
# mm3=qq
4800 mov esi
, [ebp
+ i1100_nbfp
]
4801 mov ecx
, [ebp
+ i1100_type
]
4802 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
4804 add edx
, [esp
+ i1100_ntia
] ;
# tja = ntia + 2*type
;# c6 and c12 are each loaded with movd (upper half cleared) and stored as a
;# full quadword into their slots
4805 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
4806 movq
[esp
+ i1100_c6
], mm5
4807 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
4808 movq
[esp
+ i1100_c12
], mm5
4811 mov esi
, [ebp
+ i1100_pos
]
;# eax = 3*jnr, used as the index into pos[] and faction[]
4812 lea eax
, [eax
+ eax
*2]
4814 movq mm0
, [esp
+ i1100_ix
]
4815 movd mm1
, [esp
+ i1100_iz
]
4816 movq mm4
, [esi
+ eax
*4]
4817 movd mm5
, [esi
+ eax
*4 + 8]
;# NOTE(review): the subtraction that turns the j coordinates into dr is not
;# visible before these stores
4820 movq
[esp
+ i1100_dx1
], mm4
4822 movd
[esp
+ i1100_dz1
], mm5
4825 pfacc mm4
, mm5 ;
# mm0=rsq
4831 pfrcpit2 mm0
,mm2 ;
# mm1=invsqrt
4833 pfmul mm0
, mm0 ;
# mm0=invsq
4834 ;
# calculate potentials and scalar force
4837 pfmul mm4
, mm0 ;
# mm4=rinvsix
4839 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
4841 pfmul mm3
, mm1 ;
# mm3 has vcoul for both interactions
4842 movq mm7
, mm3 ;
# use mm7 for sum to make fscal
4844 pfmul mm5
, [esp
+ i1100_c12
]
4845 pfmul mm4
, [esp
+ i1100_c6
]
4846 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
4849 pfmul mm4
, [esp
+ i1100_six
]
4851 pfmul mm5
, [esp
+ i1100_twelve
]
4854 pfmul mm0
, mm7 ;
# mm0 is total fscal now
4857 pfadd mm3
, [esp
+ i1100_vctot
]
4858 movq
[esp
+ i1100_vctot
], mm3
4861 pfadd mm6
, [esp
+ i1100_vnbtot
] ;
# add the earlier value
4862 movq
[esp
+ i1100_vnbtot
], mm6 ;
# store the sum
4864 ;
# spread fscalar to both positions
4866 ;
# calc vectorial force
4867 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
4868 movq mm2
, [esp
+ i1100_dx1
]
4869 movd mm3
, [esp
+ i1100_dz1
]
4875 ;
# update i particle force
4876 movq mm0
, [esp
+ i1100_fix
]
4877 movd mm1
, [esp
+ i1100_fiz
]
4880 movq
[esp
+ i1100_fix
], mm0
4881 movd
[esp
+ i1100_fiz
], mm1
4882 ;
# update j particle force
4883 movq mm0
, [edi
+ eax
*4]
4884 movd mm1
, [edi
+ eax
*4+ 8]
4887 movq
[edi
+ eax
*4], mm0
4888 movd
[edi
+ eax
*4 +8], mm1
;# Per-i-particle epilogue of inl1100_3dnow: add the accumulated i force to the
;# force array addressed through edi and to fshift[], update the Vc and Vnb
;# entries, and write back the remaining-i counter.
;# NOTE(review): some instructions (the gid dereference, the adds into
;# Vc/Vnb, the counter decrement and the loop branch) are not visible in this
;# excerpt - confirm against the full file.
4890 .i1100_updateouterdata:
4891 mov ecx
, [esp
+ i1100_ii3
]
4893 movq mm6
, [edi
+ ecx
*4] ;
# increment i force
4894 movd mm7
, [edi
+ ecx
*4 + 8]
4895 pfadd mm6
, [esp
+ i1100_fix
]
;# pfadd reads 8 bytes starting at fiz; only the low dword is written back by movd
4896 pfadd mm7
, [esp
+ i1100_fiz
]
4897 movq
[edi
+ ecx
*4], mm6
4898 movd
[edi
+ ecx
*4 +8], mm7
4900 mov ebx
, [ebp
+ i1100_fshift
] ;
# increment fshift force
;# edx = value kept in the is3 slot, used as the index into fshift[]
4901 mov edx
, [esp
+ i1100_is3
]
4903 movq mm6
, [ebx
+ edx
*4]
4904 movd mm7
, [ebx
+ edx
*4 + 8]
4905 pfadd mm6
, [esp
+ i1100_fix
]
4906 pfadd mm7
, [esp
+ i1100_fiz
]
4907 movq
[ebx
+ edx
*4], mm6
4908 movd
[ebx
+ edx
*4 + 8], mm7
4910 mov edx
, [ebp
+ i1100_gid
] ;
# get group index for this i particle
4912 add dword ptr
[ebp
+ i1100_gid
], 4 ;
# advance pointer
4914 movq mm7
, [esp
+ i1100_vctot
]
4915 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
4917 mov eax
, [ebp
+ i1100_Vc
]
4918 movd mm6
, [eax
+ edx
*4]
4920 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
4922 movq mm7
, [esp
+ i1100_vnbtot
]
4923 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
4925 mov eax
, [ebp
+ i1100_Vnb
]
4926 movd mm6
, [eax
+ edx
*4]
4928 movd
[eax
+ edx
*4], mm6 ;
# increment vnb[gid]
;# reload the nri argument slot; it is written back at the end of this section
4931 mov ecx
, [ebp
+ i1100_nri
]
4934 ;
# not last, iterate once more!
4935 mov
[ebp
+ i1100_nri
], ecx
4953 .globl inl1110_3dnow
4954 .globl _inl1110_3dnow
4958 .equiv i1110_iinr, 12
4959 .equiv i1110_jindex, 16
4960 .equiv i1110_jjnr, 20
4961 .equiv i1110_shift, 24
4962 .equiv i1110_shiftvec, 28
4963 .equiv i1110_fshift, 32
4964 .equiv i1110_gid, 36
4965 .equiv i1110_pos, 40
4966 .equiv i1110_faction, 44
4967 .equiv i1110_charge, 48
4968 .equiv i1110_facel, 52
4970 .equiv i1110_type, 60
4971 .equiv i1110_ntype, 64
4972 .equiv i1110_nbfp, 68
4973 .equiv i1110_Vnb, 72
4974 .equiv i1110_nsatoms, 76
4975 ;
# stack offsets for local variables
4979 .equiv i1110_shY, 12
4980 .equiv i1110_shZ, 16
4985 .equiv i1110_vctot, 40
4986 .equiv i1110_vnbtot, 48
4988 .equiv i1110_c12, 64
4989 .equiv i1110_six, 72
4990 .equiv i1110_twelve, 80
4991 .equiv i1110_ntia, 88
4992 .equiv i1110_innerjjnr0, 92
4993 .equiv i1110_innerk0, 96
4994 .equiv i1110_innerjjnr, 100
4995 .equiv i1110_innerk, 104
4996 .equiv i1110_fix, 108
4997 .equiv i1110_fiy, 112
4998 .equiv i1110_fiz, 116
4999 .equiv i1110_dx1, 120
5000 .equiv i1110_dy1, 124
5001 .equiv i1110_dz1, 128
5002 .equiv i1110_dx2, 132
5003 .equiv i1110_dy2, 136
5004 .equiv i1110_dz2, 140
5005 .equiv i1110_nsvdwc, 144
5006 .equiv i1110_nscoul, 148
5007 .equiv i1110_nsvdw, 152
5008 .equiv i1110_solnr, 156
5017 sub esp
, 160 ;
# local stack space
5020 movq mm1
, [mm_twelve
]
5021 movq
[esp
+ i1110_six
], mm0
5022 movq
[esp
+ i1110_twelve
], mm1
5023 ;
# assume we have at least one i particle - start directly
5025 mov eax
, [ebp
+ i1110_shift
] ;
# eax = pointer into shift[]
5026 mov ebx
, [eax
] ;
# ebx=shift[n]
5027 add dword ptr
[ebp
+ i1110_shift
], 4 ;
# advance pointer one step
5029 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
5030 mov
[esp
+ i1110_is3
],ebx ;
# store is3
5032 mov eax
, [ebp
+ i1110_shiftvec
] ;
# eax = base of shiftvec[]
5034 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
5035 movd mm1
, [eax
+ ebx
*4 + 8]
5036 movq
[esp
+ i1110_shX
], mm0
5037 movd
[esp
+ i1110_shZ
], mm1
5039 mov ecx
, [ebp
+ i1110_iinr
] ;
# ecx = pointer into iinr[]
5040 add dword ptr
[ebp
+ i1110_iinr
], 4 ;
# advance pointer
5041 mov ebx
, [ecx
] ;
# ebx=ii
5043 mov eax
, [ebp
+ i1110_nsatoms
]
5044 add dword ptr
[ebp
+ i1110_nsatoms
], 12
5051 mov
[esp
+ i1110_nsvdwc
], edx
5052 mov
[esp
+ i1110_nscoul
], eax
5053 mov
[esp
+ i1110_nsvdw
], ecx
5057 movq
[esp
+ i1110_vctot
], mm7
5058 movq
[esp
+ i1110_vnbtot
], mm7
5059 mov
[esp
+ i1110_solnr
], ebx
5061 mov eax
, [ebp
+ i1110_jindex
]
5062 mov ecx
, [eax
] ;
# jindex[n]
5063 mov edx
, [eax
+ 4] ;
# jindex[n+1]
5064 add dword ptr
[ebp
+ i1110_jindex
], 4
5065 sub edx
, ecx ;
# number of innerloop atoms
5066 mov eax
, [ebp
+ i1110_jjnr
]
5069 mov
[esp
+ i1110_innerjjnr0
], eax ;
# pointer to jjnr[nj0]
5071 mov
[esp
+ i1110_innerk0
], edx ;
# number of innerloop atoms
5072 mov esi
, [ebp
+ i1110_pos
]
5073 mov edi
, [ebp
+ i1110_faction
]
5075 mov ecx
, [esp
+ i1110_nsvdwc
]
5080 mov ebx
, [esp
+ i1110_solnr
]
5081 inc dword ptr
[esp
+ i1110_solnr
]
5082 mov edx
, [ebp
+ i1110_charge
]
5083 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
5084 pfmul mm2
, [ebp
+ i1110_facel
]
5085 punpckldq mm2
,mm2 ;
# spread to both halves
5086 movq
[esp
+ i1110_iq
], mm2 ;
# iq =facel*charge[ii]
5088 mov edx
, [ebp
+ i1110_type
]
5089 mov edx
, [edx
+ ebx
*4]
5090 imul edx
, [ebp
+ i1110_ntype
]
5092 mov
[esp
+ i1110_ntia
], edx
5094 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
5095 mov eax
, [ebp
+ i1110_pos
] ;
# eax = base of pos[]
5096 mov
[esp
+ i1110_ii3
], ebx
5098 movq mm0
, [eax
+ ebx
*4]
5099 movd mm1
, [eax
+ ebx
*4 + 8]
5100 pfadd mm0
, [esp
+ i1110_shX
]
5101 pfadd mm1
, [esp
+ i1110_shZ
]
5102 movq
[esp
+ i1110_ix
], mm0
5103 movd
[esp
+ i1110_iz
], mm1
5107 movq
[esp
+ i1110_fix
], mm7
5108 movd
[esp
+ i1110_fiz
], mm7
5110 mov ecx
, [esp
+ i1110_innerjjnr0
]
5111 mov
[esp
+ i1110_innerjjnr
], ecx
5112 mov edx
, [esp
+ i1110_innerk0
]
5114 mov
[esp
+ i1110_innerk
], edx ;
# number of innerloop atoms
5115 jge
.i1110_unroll_vdwc_loop
5116 jmp
.i1110_finish_vdwc_inner
5117 .i1110_unroll_vdwc_loop:
5118 ;
# paired innerloop starts here
5119 mov ecx
, [esp
+ i1110_innerjjnr
] ;
# pointer to jjnr[k]
5121 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
5122 add dword ptr
[esp
+ i1110_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
5123 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
5125 mov ecx
, [ebp
+ i1110_charge
] ;
# base of charge[]
5126 movq mm5
, [esp
+ i1110_iq
]
5127 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
5128 punpckldq mm3
, [ecx
+ ebx
*4] ;
# move charge 2 to high part of mm3
5129 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
5131 mov ecx
, [ebp
+ i1110_type
]
5132 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
5133 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
5135 mov esi
, [ebp
+ i1110_nbfp
] ;
# base of nbfp
5138 add edx
, [esp
+ i1110_ntia
] ;
# tja = ntia + 2*type
5139 add ecx
, [esp
+ i1110_ntia
]
5141 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
5142 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
5144 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
5145 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
5146 movq
[esp
+ i1110_c6
], mm5
5147 movq
[esp
+ i1110_c12
], mm6
5149 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
5150 lea ebx
, [ebx
+ ebx
*2]
5152 mov esi
, [ebp
+ i1110_pos
]
5154 movq mm0
, [esp
+ i1110_ix
]
5155 movd mm1
, [esp
+ i1110_iz
]
5156 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
5157 movd mm5
, [esi
+ eax
*4 + 8]
5158 pfsubr mm4
,mm0 ;
# dr = ir - jr
5160 movq
[esp
+ i1110_dx1
], mm4 ;
# store dr
5161 movd
[esp
+ i1110_dz1
], mm5
5162 pfmul mm4
,mm4 ;
# square dx,dy,dz
5164 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
5165 pfacc mm4
, mm5 ;
# first rsq in lower mm4
5167 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
5168 movd mm7
, [esi
+ ebx
*4 + 8]
5170 pfsubr mm6
,mm0 ;
# dr = ir - jr
5172 movq
[esp
+ i1110_dx2
], mm6 ;
# store dr
5173 movd
[esp
+ i1110_dz2
], mm7
5174 pfmul mm6
,mm6 ;
# square dx,dy,dz
5176 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
5177 pfacc mm6
, mm7 ;
# second rsq in lower mm6
5179 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
5183 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs
5184 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision
5190 ;
# mm0 now contains invsq, and mm1 invsqrt
5191 ;
# do potential and fscal
5194 pfmul mm4
, mm0 ;
# mm4=rinvsix
5196 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
5198 pfmul mm3
, mm1 ;
# mm3 has vcoul for both interactions
5199 movq mm7
, mm3 ;
# use mm7 for sum to make fscal
5201 pfmul mm5
, [esp
+ i1110_c12
]
5202 pfmul mm4
, [esp
+ i1110_c6
]
5203 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
5206 pfmul mm4
, [esp
+ i1110_six
]
5208 pfmul mm5
, [esp
+ i1110_twelve
]
5211 pfmul mm0
, mm7 ;
# mm0 is total fscal now
5213 prefetchw
[esp
+ i1110_dx1
] ;
# prefetch i forces to cache
5216 pfadd mm3
, [esp
+ i1110_vctot
] ;
# add the earlier value
5217 movq
[esp
+ i1110_vctot
], mm3 ;
# store the sum
5219 ;
# spread fscalar to both positions
5224 ;
# calc vector force
5225 prefetchw
[edi
+ eax
*4] ;
# prefetch the 1st faction to cache
5226 movq mm2
, [esp
+ i1110_dx1
] ;
# fetch dr
5227 movd mm3
, [esp
+ i1110_dz1
]
5230 pfadd mm6
, [esp
+ i1110_vnbtot
] ;
# add the earlier value
5231 movq
[esp
+ i1110_vnbtot
], mm6 ;
# store the sum
5233 prefetchw
[edi
+ ebx
*4] ;
# prefetch the 2nd faction to cache
5234 pfmul mm2
, mm0 ;
# mult by fs
5237 movq mm4
, [esp
+ i1110_dx2
] ;
# fetch dr
5238 movd mm5
, [esp
+ i1110_dz2
]
5239 pfmul mm4
, mm1 ;
# mult by fs
5243 movq mm0
, [esp
+ i1110_fix
]
5244 movd mm1
, [esp
+ i1110_fiz
]
5250 movq
[esp
+ i1110_fix
], mm0
5251 movd
[esp
+ i1110_fiz
], mm1
5254 movq mm0
, [edi
+ eax
*4]
5255 movd mm1
, [edi
+ eax
*4 + 8]
5256 movq mm6
, [edi
+ ebx
*4]
5257 movd mm7
, [edi
+ ebx
*4 + 8]
5264 movq
[edi
+ eax
*4], mm0
5265 movd
[edi
+ eax
*4 +8], mm1
5266 movq
[edi
+ ebx
*4], mm6
5267 movd
[edi
+ ebx
*4 + 8], mm7
5269 ;
# should we do one more iteration?
5270 sub dword ptr
[esp
+ i1110_innerk
], 2
5271 jl
.i1110_finish_vdwc_inner
5272 jmp
.i1110_unroll_vdwc_loop
5273 .i1110_finish_vdwc_inner:
5274 and dword ptr
[esp
+ i1110_innerk
], 1
5275 jnz
.i1110_single_vdwc_inner
5276 jmp
.i1110_updateouterdata_vdwc
5277 .i1110_single_vdwc_inner:
5278 ;
# a single j particle iteration here - compare with the unrolled code for comments
5279 mov eax
, [esp
+ i1110_innerjjnr
]
5280 mov eax
, [eax
] ;
# eax=jnr offset
5282 mov ecx
, [ebp
+ i1110_charge
]
5283 movd mm5
, [esp
+ i1110_iq
]
5284 movd mm3
, [ecx
+ eax
*4]
5285 pfmul mm3
, mm5 ;
# mm3=qq
5287 mov esi
, [ebp
+ i1110_nbfp
]
5288 mov ecx
, [ebp
+ i1110_type
]
5289 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
5291 add edx
, [esp
+ i1110_ntia
] ;
# tja = ntia + 2*type
5292 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
5293 movq
[esp
+ i1110_c6
], mm5
5294 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
5295 movq
[esp
+ i1110_c12
], mm5
5298 mov esi
, [ebp
+ i1110_pos
]
5299 lea eax
, [eax
+ eax
*2]
5301 movq mm0
, [esp
+ i1110_ix
]
5302 movd mm1
, [esp
+ i1110_iz
]
5303 movq mm4
, [esi
+ eax
*4]
5304 movd mm5
, [esi
+ eax
*4 + 8]
5307 movq
[esp
+ i1110_dx1
], mm4
5309 movd
[esp
+ i1110_dz1
], mm5
5312 pfacc mm4
, mm5 ;
# mm0=rsq
5318 pfrcpit2 mm0
,mm2 ;
# mm1=invsqrt
5320 pfmul mm0
, mm0 ;
# mm0=invsq
5321 ;
# calculate potentials and scalar force
5324 pfmul mm4
, mm0 ;
# mm4=rinvsix
5326 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
5328 pfmul mm3
, mm1 ;
# mm3 has vcoul for both interactions
5329 movq mm7
, mm3 ;
# use mm7 for sum to make fscal
5331 pfmul mm5
, [esp
+ i1110_c12
]
5332 pfmul mm4
, [esp
+ i1110_c6
]
5333 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
5336 pfmul mm4
, [esp
+ i1110_six
]
5338 pfmul mm5
, [esp
+ i1110_twelve
]
5341 pfmul mm0
, mm7 ;
# mm0 is total fscal now
5344 pfadd mm3
, [esp
+ i1110_vctot
]
5345 movq
[esp
+ i1110_vctot
], mm3
5348 pfadd mm6
, [esp
+ i1110_vnbtot
] ;
# add the earlier value
5349 movq
[esp
+ i1110_vnbtot
], mm6 ;
# store the sum
5351 ;
# spread fscalar to both positions
5353 ;
# calc vectorial force
5354 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
5355 movq mm2
, [esp
+ i1110_dx1
]
5356 movd mm3
, [esp
+ i1110_dz1
]
5362 ;
# update i particle force
5363 movq mm0
, [esp
+ i1110_fix
]
5364 movd mm1
, [esp
+ i1110_fiz
]
5367 movq
[esp
+ i1110_fix
], mm0
5368 movd
[esp
+ i1110_fiz
], mm1
5369 ;
# update j particle force
5370 movq mm0
, [edi
+ eax
*4]
5371 movd mm1
, [edi
+ eax
*4+ 8]
5374 movq
[edi
+ eax
*4], mm0
5375 movd
[edi
+ eax
*4 +8], mm1
5377 .i1110_updateouterdata_vdwc:
5378 mov ecx
, [esp
+ i1110_ii3
]
5380 movq mm6
, [edi
+ ecx
*4] ;
# increment i force
5381 movd mm7
, [edi
+ ecx
*4 + 8]
5382 pfadd mm6
, [esp
+ i1110_fix
]
5383 pfadd mm7
, [esp
+ i1110_fiz
]
5384 movq
[edi
+ ecx
*4], mm6
5385 movd
[edi
+ ecx
*4 +8], mm7
5387 mov ebx
, [ebp
+ i1110_fshift
] ;
# increment fshift force
5388 mov edx
, [esp
+ i1110_is3
]
5390 movq mm6
, [ebx
+ edx
*4]
5391 movd mm7
, [ebx
+ edx
*4 + 8]
5392 pfadd mm6
, [esp
+ i1110_fix
]
5393 pfadd mm7
, [esp
+ i1110_fiz
]
5394 movq
[ebx
+ edx
*4], mm6
5395 movd
[ebx
+ edx
*4 + 8], mm7
5398 dec dword ptr
[esp
+ i1110_nsvdwc
]
5402 mov ecx
, [esp
+ i1110_nscoul
]
5407 mov ebx
, [esp
+ i1110_solnr
]
5408 inc dword ptr
[esp
+ i1110_solnr
]
5409 mov edx
, [ebp
+ i1110_charge
]
5410 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
5411 pfmul mm2
, [ebp
+ i1110_facel
]
5412 punpckldq mm2
,mm2 ;
# spread to both halves
5413 movq
[esp
+ i1110_iq
], mm2 ;
# iq =facel*charge[ii]
5415 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
5416 mov eax
, [ebp
+ i1110_pos
] ;
# eax = base of pos[]
5417 mov
[esp
+ i1110_ii3
], ebx
5419 movq mm0
, [eax
+ ebx
*4]
5420 movd mm1
, [eax
+ ebx
*4 + 8]
5421 pfadd mm0
, [esp
+ i1110_shX
]
5422 pfadd mm1
, [esp
+ i1110_shZ
]
5423 movq
[esp
+ i1110_ix
], mm0
5424 movd
[esp
+ i1110_iz
], mm1
5428 movq
[esp
+ i1110_fix
], mm7
5429 movd
[esp
+ i1110_fiz
], mm7
5431 mov ecx
, [esp
+ i1110_innerjjnr0
]
5432 mov
[esp
+ i1110_innerjjnr
], ecx
5433 mov edx
, [esp
+ i1110_innerk0
]
5435 mov
[esp
+ i1110_innerk
], edx ;
# number of innerloop atoms
5436 jge
.i1110_unroll_coul_loop
5437 jmp
.i1110_finish_coul_inner
5438 .i1110_unroll_coul_loop:
5439 ;
# paired innerloop starts here
5440 mov ecx
, [esp
+ i1110_innerjjnr
] ;
# pointer to jjnr[k]
5442 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
5443 add dword ptr
[esp
+ i1110_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
5444 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
5446 mov ecx
, [ebp
+ i1110_charge
] ;
# base of charge[]
5447 movq mm5
, [esp
+ i1110_iq
]
5448 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
5449 movd mm7
, [ecx
+ ebx
*4] ;
# charge[jnr2]
5450 punpckldq mm3
,mm7 ;
# move charge 2 to high part of mm3
5451 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
5453 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
5454 lea ebx
, [ebx
+ ebx
*2]
5456 movq mm0
, [esp
+ i1110_ix
]
5457 movd mm1
, [esp
+ i1110_iz
]
5458 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
5459 movd mm5
, [esi
+ eax
*4 + 8]
5460 pfsubr mm4
,mm0 ;
# dr = ir - jr
5462 movq
[esp
+ i1110_dx1
], mm4 ;
# store dr
5463 movd
[esp
+ i1110_dz1
], mm5
5464 pfmul mm4
,mm4 ;
# square dx,dy,dz
5466 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
5467 pfacc mm4
, mm5 ;
# first rsq in lower mm4
5469 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
5470 movd mm7
, [esi
+ ebx
*4 + 8]
5472 pfsubr mm6
,mm0 ;
# dr = ir - jr
5474 movq
[esp
+ i1110_dx2
], mm6 ;
# store dr
5475 movd
[esp
+ i1110_dz2
], mm7
5476 pfmul mm6
,mm6 ;
# square dx,dy,dz
5478 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
5479 pfacc mm6
, mm7 ;
# second rsq in lower mm6
5481 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
5485 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs
5486 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision
5492 ;
# mm0 now contains invsq, and mm1 invsqrt
5493 ;
# do potential and fscal
5494 prefetchw
[esp
+ i1110_dx1
] ;
# prefetch i forces to cache
5496 pfmul mm3
,mm1 ;
# 3 has both vcoul
5497 pfmul mm0
,mm3 ;
# 0 has both fscal
5501 pfadd mm3
, [esp
+ i1110_vctot
] ;
# add the earlier value
5502 movq
[esp
+ i1110_vctot
], mm3 ;
# store the sum
5503 ;
# spread fscalar to both positions
5507 ;
# calc vector force
5508 prefetchw
[edi
+ eax
*4] ;
# prefetch the 1st faction to cache
5509 movq mm2
, [esp
+ i1110_dx1
] ;
# fetch dr
5510 movd mm3
, [esp
+ i1110_dz1
]
5511 prefetchw
[edi
+ ebx
*4] ;
# prefetch the 2nd faction to cache
5512 pfmul mm2
, mm0 ;
# mult by fs
5515 movq mm4
, [esp
+ i1110_dx2
] ;
# fetch dr
5516 movd mm5
, [esp
+ i1110_dz2
]
5517 pfmul mm4
, mm1 ;
# mult by fs
5521 movq mm0
, [esp
+ i1110_fix
]
5522 movd mm1
, [esp
+ i1110_fiz
]
5528 movq
[esp
+ i1110_fix
], mm0
5529 movd
[esp
+ i1110_fiz
], mm1
5532 movq mm0
, [edi
+ eax
*4]
5533 movd mm1
, [edi
+ eax
*4 + 8]
5534 movq mm6
, [edi
+ ebx
*4]
5535 movd mm7
, [edi
+ ebx
*4 + 8]
5542 movq
[edi
+ eax
*4], mm0
5543 movd
[edi
+ eax
*4 +8], mm1
5544 movq
[edi
+ ebx
*4], mm6
5545 movd
[edi
+ ebx
*4 + 8], mm7
5547 ;
# should we do one more iteration?
5548 sub dword ptr
[esp
+ i1110_innerk
], 2
5549 jl
.i1110_finish_coul_inner
5550 jmp
.i1110_unroll_coul_loop
5551 .i1110_finish_coul_inner:
5552 and dword ptr
[esp
+ i1110_innerk
], 1
5553 jnz
.i1110_single_coul_inner
5554 jmp
.i1110_updateouterdata_coul
5555 .i1110_single_coul_inner:
5556 ;
# a single j particle iteration here - compare with the unrolled code for comments
5557 mov eax
, [esp
+ i1110_innerjjnr
]
5558 mov eax
, [eax
] ;
# eax=jnr offset
5560 mov ecx
, [ebp
+ i1110_charge
]
5561 movd mm6
, [esp
+ i1110_iq
]
5562 movd mm7
, [ecx
+ eax
*4]
5563 pfmul mm6
, mm7 ;
# mm6=qq
5565 lea eax
, [eax
+ eax
*2]
5567 movq mm0
, [esp
+ i1110_ix
]
5568 movd mm1
, [esp
+ i1110_iz
]
5569 movq mm2
, [esi
+ eax
*4]
5570 movd mm3
, [esi
+ eax
*4 + 8]
5573 movq
[esp
+ i1110_dx1
], mm0
5575 movd
[esp
+ i1110_dz1
], mm1
5578 pfacc mm0
, mm1 ;
# mm0=rsq
5584 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
5586 pfmul mm4
, mm4 ;
# mm4=invsq
5587 ;
# calculate potential and scalar force
5588 pfmul mm6
, mm1 ;
# mm6=vcoul
5589 pfmul mm4
, mm6 ;
# mm4=fscalar
5591 pfadd mm6
, [esp
+ i1110_vctot
]
5592 movq
[esp
+ i1110_vctot
], mm6
5593 ;
# spread fscalar to both positions
5595 ;
# calc vectorial force
5596 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
5597 movq mm0
, [esp
+ i1110_dx1
]
5598 movd mm1
, [esp
+ i1110_dz1
]
5601 ;
# update i particle force
5602 movq mm2
, [esp
+ i1110_fix
]
5603 movd mm3
, [esp
+ i1110_fiz
]
5606 movq
[esp
+ i1110_fix
], mm2
5607 movd
[esp
+ i1110_fiz
], mm3
5608 ;
# update j particle force
5609 movq mm2
, [edi
+ eax
*4]
5610 movd mm3
, [edi
+ eax
*4+ 8]
5613 movq
[edi
+ eax
*4], mm2
5614 movd
[edi
+ eax
*4 +8], mm3
5616 .i1110_updateouterdata_coul:
5617 mov ecx
, [esp
+ i1110_ii3
]
5619 movq mm6
, [edi
+ ecx
*4] ;
# increment i force
5620 movd mm7
, [edi
+ ecx
*4 + 8]
5621 pfadd mm6
, [esp
+ i1110_fix
]
5622 pfadd mm7
, [esp
+ i1110_fiz
]
5623 movq
[edi
+ ecx
*4], mm6
5624 movd
[edi
+ ecx
*4 +8], mm7
5626 mov ebx
, [ebp
+ i1110_fshift
] ;
# increment fshift force
5627 mov edx
, [esp
+ i1110_is3
]
5629 movq mm6
, [ebx
+ edx
*4]
5630 movd mm7
, [ebx
+ edx
*4 + 8]
5631 pfadd mm6
, [esp
+ i1110_fix
]
5632 pfadd mm7
, [esp
+ i1110_fiz
]
5633 movq
[ebx
+ edx
*4], mm6
5634 movd
[ebx
+ edx
*4 + 8], mm7
5637 dec dword ptr
[esp
+ i1110_nscoul
]
5641 mov ecx
, [esp
+ i1110_nsvdw
]
5646 mov ebx
, [esp
+ i1110_solnr
]
5647 inc dword ptr
[esp
+ i1110_solnr
]
5649 mov edx
, [ebp
+ i1110_type
]
5650 mov edx
, [edx
+ ebx
*4]
5651 imul edx
, [ebp
+ i1110_ntype
]
5653 mov
[esp
+ i1110_ntia
], edx
5655 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
5656 mov eax
, [ebp
+ i1110_pos
] ;
# eax = base of pos[]
5657 mov
[esp
+ i1110_ii3
], ebx
5659 movq mm0
, [eax
+ ebx
*4]
5660 movd mm1
, [eax
+ ebx
*4 + 8]
5661 pfadd mm0
, [esp
+ i1110_shX
]
5662 pfadd mm1
, [esp
+ i1110_shZ
]
5663 movq
[esp
+ i1110_ix
], mm0
5664 movd
[esp
+ i1110_iz
], mm1
5668 movq
[esp
+ i1110_fix
], mm7
5669 movd
[esp
+ i1110_fiz
], mm7
5671 mov ecx
, [esp
+ i1110_innerjjnr0
]
5672 mov
[esp
+ i1110_innerjjnr
], ecx
5673 mov edx
, [esp
+ i1110_innerk0
]
5675 mov
[esp
+ i1110_innerk
], edx ;
# number of innerloop atoms
5676 jge
.i1110_unroll_vdw_loop
5677 jmp
.i1110_finish_vdw_inner
5678 .i1110_unroll_vdw_loop:
5679 ;
# paired innerloop starts here
5680 mov ecx
, [esp
+ i1110_innerjjnr
] ;
# pointer to jjnr[k]
5682 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
5683 add dword ptr
[esp
+ i1110_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
5684 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
5686 mov ecx
, [ebp
+ i1110_type
]
5687 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
5688 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
5690 mov esi
, [ebp
+ i1110_nbfp
] ;
# base of nbfp
5693 add edx
, [esp
+ i1110_ntia
] ;
# tja = ntia + 2*type
5694 add ecx
, [esp
+ i1110_ntia
]
5696 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
5697 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
5699 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
5700 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
5701 movq
[esp
+ i1110_c6
], mm5
5702 movq
[esp
+ i1110_c12
], mm6
5704 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
5705 lea ebx
, [ebx
+ ebx
*2]
5707 mov esi
, [ebp
+ i1110_pos
]
5709 movq mm0
, [esp
+ i1110_ix
]
5710 movd mm1
, [esp
+ i1110_iz
]
5711 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
5712 movd mm5
, [esi
+ eax
*4 + 8]
5713 pfsubr mm4
,mm0 ;
# dr = ir - jr
5715 movq
[esp
+ i1110_dx1
], mm4 ;
# store dr
5716 movd
[esp
+ i1110_dz1
], mm5
5717 pfmul mm4
,mm4 ;
# square dx,dy,dz
5719 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
5720 pfacc mm4
, mm5 ;
# first rsq in lower mm4
5722 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
5723 movd mm7
, [esi
+ ebx
*4 + 8]
5725 pfsubr mm6
,mm0 ;
# dr = ir - jr
5727 movq
[esp
+ i1110_dx2
], mm6 ;
# store dr
5728 movd
[esp
+ i1110_dz2
], mm7
5729 pfmul mm6
,mm6 ;
# square dx,dy,dz
5731 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
5732 pfacc mm6
, mm7 ;
# second rsq in lower mm6
5734 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
5738 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs
5739 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision
5745 ;
# mm0 now contains invsq, and mm1 invsqrt
5746 ;
# do potential and fscal
5749 pfmul mm4
, mm0 ;
# mm4=rinvsix
5751 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
5753 pfmul mm5
, [esp
+ i1110_c12
]
5754 pfmul mm4
, [esp
+ i1110_c6
]
5755 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
5758 pfmul mm4
, [esp
+ i1110_six
]
5760 pfmul mm5
, [esp
+ i1110_twelve
]
5763 pfmul mm0
, mm7 ;
# mm0 is total fscal now
5765 prefetchw
[esp
+ i1110_dx1
] ;
# prefetch i forces to cache
5767 ;
# spread fscalar to both positions
5772 ;
# calc vector force
5773 prefetchw
[edi
+ eax
*4] ;
# prefetch the 1st faction to cache
5774 movq mm2
, [esp
+ i1110_dx1
] ;
# fetch dr
5775 movd mm3
, [esp
+ i1110_dz1
]
5778 pfadd mm6
, [esp
+ i1110_vnbtot
] ;
# add the earlier value
5779 movq
[esp
+ i1110_vnbtot
], mm6 ;
# store the sum
5781 prefetchw
[edi
+ ebx
*4] ;
# prefetch the 2nd faction to cache
5782 pfmul mm2
, mm0 ;
# mult by fs
5785 movq mm4
, [esp
+ i1110_dx2
] ;
# fetch dr
5786 movd mm5
, [esp
+ i1110_dz2
]
5787 pfmul mm4
, mm1 ;
# mult by fs
5791 movq mm0
, [esp
+ i1110_fix
]
5792 movd mm1
, [esp
+ i1110_fiz
]
5798 movq
[esp
+ i1110_fix
], mm0
5799 movd
[esp
+ i1110_fiz
], mm1
5802 movq mm0
, [edi
+ eax
*4]
5803 movd mm1
, [edi
+ eax
*4 + 8]
5804 movq mm6
, [edi
+ ebx
*4]
5805 movd mm7
, [edi
+ ebx
*4 + 8]
5812 movq
[edi
+ eax
*4], mm0
5813 movd
[edi
+ eax
*4 +8], mm1
5814 movq
[edi
+ ebx
*4], mm6
5815 movd
[edi
+ ebx
*4 + 8], mm7
5817 ;
# should we do one more iteration?
5818 sub dword ptr
[esp
+ i1110_innerk
], 2
5819 jl
.i1110_finish_vdw_inner
5820 jmp
.i1110_unroll_vdw_loop
5821 .i1110_finish_vdw_inner:
5822 and dword ptr
[esp
+ i1110_innerk
], 1
5823 jnz
.i1110_single_vdw_inner
5824 jmp
.i1110_updateouterdata_vdw
5825 .i1110_single_vdw_inner:
5826 ;
# a single j particle iteration here - compare with the unrolled code for comments
5827 mov eax
, [esp
+ i1110_innerjjnr
]
5828 mov eax
, [eax
] ;
# eax=jnr offset
5830 mov esi
, [ebp
+ i1110_nbfp
]
5831 mov ecx
, [ebp
+ i1110_type
]
5832 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
5834 add edx
, [esp
+ i1110_ntia
] ;
# tja = ntia + 2*type
5835 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
5836 movq
[esp
+ i1110_c6
], mm5
5837 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
5838 movq
[esp
+ i1110_c12
], mm5
5841 mov esi
, [ebp
+ i1110_pos
]
5842 lea eax
, [eax
+ eax
*2]
5844 movq mm0
, [esp
+ i1110_ix
]
5845 movd mm1
, [esp
+ i1110_iz
]
5846 movq mm4
, [esi
+ eax
*4]
5847 movd mm5
, [esi
+ eax
*4 + 8]
5850 movq
[esp
+ i1110_dx1
], mm4
5852 movd
[esp
+ i1110_dz1
], mm5
5855 pfacc mm4
, mm5 ;
# mm4=rsq
5861 pfrcpit2 mm0
,mm2 ;
# mm0=invsqrt
5863 pfmul mm0
, mm0 ;
# mm0=invsq
5864 ;
# calculate potentials and scalar force
5867 pfmul mm4
, mm0 ;
# mm4=rinvsix
5869 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
5871 pfmul mm5
, [esp
+ i1110_c12
]
5872 pfmul mm4
, [esp
+ i1110_c6
]
5873 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
5876 pfmul mm4
, [esp
+ i1110_six
]
5878 pfmul mm5
, [esp
+ i1110_twelve
]
5881 pfmul mm0
, mm7 ;
# mm0 is total fscal now
5884 pfadd mm6
, [esp
+ i1110_vnbtot
] ;
# add the earlier value
5885 movq
[esp
+ i1110_vnbtot
], mm6 ;
# store the sum
5887 ;
# spread fscalar to both positions
5889 ;
# calc vectorial force
5890 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
5891 movq mm2
, [esp
+ i1110_dx1
]
5892 movd mm3
, [esp
+ i1110_dz1
]
5898 ;
# update i particle force
5899 movq mm0
, [esp
+ i1110_fix
]
5900 movd mm1
, [esp
+ i1110_fiz
]
5903 movq
[esp
+ i1110_fix
], mm0
5904 movd
[esp
+ i1110_fiz
], mm1
5905 ;
# update j particle force
5906 movq mm0
, [edi
+ eax
*4]
5907 movd mm1
, [edi
+ eax
*4+ 8]
5910 movq
[edi
+ eax
*4], mm0
5911 movd
[edi
+ eax
*4 +8], mm1
5913 .i1110_updateouterdata_vdw:
5914 mov ecx
, [esp
+ i1110_ii3
]
5916 movq mm6
, [edi
+ ecx
*4] ;
# increment i force
5917 movd mm7
, [edi
+ ecx
*4 + 8]
5918 pfadd mm6
, [esp
+ i1110_fix
]
5919 pfadd mm7
, [esp
+ i1110_fiz
]
5920 movq
[edi
+ ecx
*4], mm6
5921 movd
[edi
+ ecx
*4 +8], mm7
5923 mov ebx
, [ebp
+ i1110_fshift
] ;
# increment fshift force
5924 mov edx
, [esp
+ i1110_is3
]
5926 movq mm6
, [ebx
+ edx
*4]
5927 movd mm7
, [ebx
+ edx
*4 + 8]
5928 pfadd mm6
, [esp
+ i1110_fix
]
5929 pfadd mm7
, [esp
+ i1110_fiz
]
5930 movq
[ebx
+ edx
*4], mm6
5931 movd
[ebx
+ edx
*4 + 8], mm7
5934 dec dword ptr
[esp
+ i1110_nsvdw
]
5939 mov edx
, [ebp
+ i1110_gid
] ;
# get group index for this i particle
5941 add dword ptr
[ebp
+ i1110_gid
], 4 ;
# advance pointer
5943 movq mm7
, [esp
+ i1110_vctot
]
5944 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
5946 mov eax
, [ebp
+ i1110_Vc
]
5947 movd mm6
, [eax
+ edx
*4]
5949 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
5951 movq mm7
, [esp
+ i1110_vnbtot
]
5952 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
5954 mov eax
, [ebp
+ i1110_Vnb
]
5955 movd mm6
, [eax
+ edx
*4]
5957 movd
[eax
+ edx
*4], mm6 ;
# increment vnb[gid]
5959 mov ecx
, [ebp
+ i1110_nri
]
5962 ;
# not last, iterate once more!
5963 mov
[ebp
+ i1110_nri
], ecx
;# Exported entry symbols for the inl1120 3DNow routine; the name is published
;# both plain and with a leading underscore.
5979 .globl inl1120_3dnow
5980 .globl _inl1120_3dnow
;# Argument offsets: the routine body adds these to ebp, e.g. [ebp + i1120_iinr].
;# NOTE(review): the body also references i1120_nri and i1120_Vc, whose .equiv
;# lines are not visible in this excerpt - confirm they are defined.
5984 .equiv i1120_iinr, 12
5985 .equiv i1120_jindex, 16
5986 .equiv i1120_jjnr, 20
5987 .equiv i1120_shift, 24
5988 .equiv i1120_shiftvec, 28
5989 .equiv i1120_fshift, 32
5990 .equiv i1120_gid, 36
5991 .equiv i1120_pos, 40
5992 .equiv i1120_faction, 44
5993 .equiv i1120_charge, 48
5994 .equiv i1120_facel, 52
5996 .equiv i1120_type, 60
5997 .equiv i1120_ntype, 64
5998 .equiv i1120_nbfp, 68
5999 .equiv i1120_Vnb, 72
6000 ;
# stack offsets for local variables (the routine body adds these to esp)
;# NOTE(review): i1120_is3, i1120_ii3, i1120_ixO and i1120_c6 are used by the body
;# but their .equiv lines are not visible in this excerpt - confirm they are defined.
;# Slots spaced 8 bytes apart (ixH..vnbtot, c12..ntia, fixH..fizH, dxH..dzH) are
;# accessed with 8-byte movq in the body; dzH at 188 therefore ends at 196, the
;# amount the body reserves with "sub esp, 196".
6004 .equiv i1120_iyO, 12
6005 .equiv i1120_izO, 16
6006 .equiv i1120_ixH, 20
6007 .equiv i1120_iyH, 28
6008 .equiv i1120_izH, 36
6009 .equiv i1120_iqO, 44
6010 .equiv i1120_iqH, 52
6011 .equiv i1120_vctot, 60
6012 .equiv i1120_vnbtot, 68
6014 .equiv i1120_c12, 84
6015 .equiv i1120_six, 92
6016 .equiv i1120_twelve, 100
6017 .equiv i1120_ntia, 108
6018 .equiv i1120_innerjjnr, 116
6019 .equiv i1120_innerk, 120
6020 .equiv i1120_fixO, 124
6021 .equiv i1120_fiyO, 128
6022 .equiv i1120_fizO, 132
6023 .equiv i1120_fixH, 136
6024 .equiv i1120_fiyH, 144
6025 .equiv i1120_fizH, 152
6026 .equiv i1120_dxO, 160
6027 .equiv i1120_dyO, 164
6028 .equiv i1120_dzO, 168
6029 .equiv i1120_dxH, 172
6030 .equiv i1120_dyH, 180
6031 .equiv i1120_dzH, 188
;# Body of inl1120_3dnow as visible here: reserve 196 bytes of locals, load the
;# i-particle charges, type, shift vector and O/H coordinates, zero the potential
;# and force accumulators, then handle one j particle per pass of the inner loop.
;# Arguments are read through ebp, locals through esp; esi = pos, edi = faction.
6040 sub esp
, 196 ;
# local stack space
6042 ;
# assume we have at least one i particle - start directly
6044 mov ecx
, [ebp
+ i1120_iinr
] ;
# ecx = pointer into iinr[]
6045 mov ebx
, [ecx
] ;
# ebx=ii
6047 mov edx
, [ebp
+ i1120_charge
]
6048 movd mm1
, [ebp
+ i1120_facel
]
6049 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii0]
6051 movq
[esp
+ i1120_iqO
], mm2 ;
# iqO = facel*charge[ii]
6053 movd mm2
, [edx
+ ebx
*4 + 4] ;
# mm2=charge[ii0+1]
6055 punpckldq mm2
,mm2 ;
# spread to both halves
6056 movq
[esp
+ i1120_iqH
], mm2 ;
# iqH = facel*charge[ii0+1]
6058 mov edx
, [ebp
+ i1120_type
]
6059 mov ecx
, [edx
+ ebx
*4]
6061 imul ecx
, [ebp
+ i1120_ntype
] ;
# ecx = ntia = 2*ntype*type[ii0]
6062 mov
[esp
+ i1120_ntia
], ecx
;# keep the LJ force prefactors in stack slots so the inner loop can use them as memory operands
6065 movq mm4
, [mm_twelve
]
6066 movq
[esp
+ i1120_six
], mm3
6067 movq
[esp
+ i1120_twelve
], mm4
6069 mov eax
, [ebp
+ i1120_shift
] ;
# eax = pointer into shift[]
6070 mov ebx
, [eax
] ;
# ebx=shift[n]
6071 add dword ptr
[ebp
+ i1120_shift
], 4 ;
# advance pointer one step
6073 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
6074 mov
[esp
+ i1120_is3
],ebx ;
# store is3
6076 mov eax
, [ebp
+ i1120_shiftvec
] ;
# eax = base of shiftvec[]
6078 movq mm5
, [eax
+ ebx
*4] ;
# move shX/shY to mm5 and shZ to mm6.
6079 movd mm6
, [eax
+ ebx
*4 + 8]
6083 punpckldq mm0
,mm0 ;
# also expand shX,Y,Z in mm0--mm2.
6087 mov ecx
, [ebp
+ i1120_iinr
] ;
# ecx = pointer into iinr[]
6088 add dword ptr
[ebp
+ i1120_iinr
], 4 ;
# advance pointer
6089 mov ebx
, [ecx
] ;
# ebx=ii
6091 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
6092 mov eax
, [ebp
+ i1120_pos
] ;
# eax = base of pos[]
6094 pfadd mm5
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
6095 movd mm7
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
6096 mov
[esp
+ i1120_ii3
], ebx ;
# (use mm7 as temp. storage for iz.)
6098 movq
[esp
+ i1120_ixO
], mm5
;# 8-byte store into the izO slot at offset 16: it also covers the first half of ixH (offset 20), which is stored afterwards at 6111
6099 movq
[esp
+ i1120_izO
], mm6
6101 movd mm3
, [eax
+ ebx
*4 + 12]
6102 movd mm4
, [eax
+ ebx
*4 + 16]
6103 movd mm5
, [eax
+ ebx
*4 + 20]
6104 punpckldq mm3
, [eax
+ ebx
*4 + 24]
6105 punpckldq mm4
, [eax
+ ebx
*4 + 28]
6106 punpckldq mm5
, [eax
+ ebx
*4 + 32] ;
# coords of H1 in low mm3-mm5, H2 in high
6111 movq
[esp
+ i1120_ixH
], mm0
6112 movq
[esp
+ i1120_iyH
], mm1
6113 movq
[esp
+ i1120_izH
], mm2
6115 ;
# clear vctot and i forces
6117 movq
[esp
+ i1120_vctot
], mm7
6118 movq
[esp
+ i1120_vnbtot
], mm7
6119 movq
[esp
+ i1120_fixO
], mm7
6120 movd
[esp
+ i1120_fizO
], mm7
6121 movq
[esp
+ i1120_fixH
], mm7
6122 movq
[esp
+ i1120_fiyH
], mm7
6123 movq
[esp
+ i1120_fizH
], mm7
6125 mov eax
, [ebp
+ i1120_jindex
]
6126 mov ecx
, [eax
] ;
# jindex[n]
6127 mov edx
, [eax
+ 4] ;
# jindex[n+1]
6128 add dword ptr
[ebp
+ i1120_jindex
], 4
6129 sub edx
, ecx ;
# number of innerloop atoms
6130 mov
[esp
+ i1120_innerk
], edx ;
# number of innerloop atoms
6132 mov esi
, [ebp
+ i1120_pos
]
6133 mov edi
, [ebp
+ i1120_faction
]
6134 mov eax
, [ebp
+ i1120_jjnr
]
6137 mov
[esp
+ i1120_innerjjnr
], eax ;
# pointer to jjnr[nj0]
6139 ;
# a single j particle iteration here - compare with the unrolled code for comments.
6140 mov eax
, [esp
+ i1120_innerjjnr
]
6141 mov eax
, [eax
] ;
# eax=jnr offset
6142 add dword ptr
[esp
+ i1120_innerjjnr
], 4 ;
# advance pointer
;# NOTE(review): ecx is not loaded with the jjnr pointer before this prefetch; the
;# visible writes to ecx that can reach here are jindex[n] (6126) and the nbfp base
;# (6154) - confirm which address is meant to be prefetched.
6143 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
6145 mov ecx
, [ebp
+ i1120_charge
]
6146 movd mm7
, [ecx
+ eax
*4]
6149 pfmul mm6
, [esp
+ i1120_iqO
]
6150 pfmul mm7
, [esp
+ i1120_iqH
] ;
# mm6=qqO, mm7=qqH
6152 mov ecx
, [ebp
+ i1120_type
]
6153 mov edx
, [ecx
+ eax
*4] ;
# type [jnr]
6154 mov ecx
, [ebp
+ i1120_nbfp
]
6156 add edx
, [esp
+ i1120_ntia
] ;
# tja = ntia + 2*type
6157 movd mm5
, [ecx
+ edx
*4] ;
# mm5 = 1st c6
6158 movq
[esp
+ i1120_c6
], mm5
6159 movd mm5
, [ecx
+ edx
*4 + 4] ;
# mm5 = 1st c12
6160 movq
[esp
+ i1120_c12
], mm5
;# eax = 3*jnr: index of the j particle's x coordinate in pos (esi) and faction (edi)
6162 lea eax
, [eax
+ eax
*2]
6164 movq mm0
, [esi
+ eax
*4]
6165 movd mm1
, [esi
+ eax
*4 + 8]
6166 ;
# copy & expand to mm2-mm4 for the H interactions
6174 pfsubr mm0
, [esp
+ i1120_ixO
]
6175 pfsubr mm1
, [esp
+ i1120_izO
]
6177 movq
[esp
+ i1120_dxO
], mm0
6179 movd
[esp
+ i1120_dzO
], mm1
6182 pfadd mm0
, mm1 ;
# mm0=rsqO
6186 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
6187 pfsubr mm2
, [esp
+ i1120_ixH
]
6188 pfsubr mm3
, [esp
+ i1120_iyH
]
6189 pfsubr mm4
, [esp
+ i1120_izH
] ;
# mm2-mm4 is dxH-dzH
6191 movq
[esp
+ i1120_dxH
], mm2
6192 movq
[esp
+ i1120_dyH
], mm3
6193 movq
[esp
+ i1120_dzH
], mm4
6199 pfadd mm3
,mm4 ;
# mm3=rsqH
6206 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
6208 pfmul mm4
, mm4 ;
# mm4=invsq
6212 pfmul mm0
, mm4 ;
# mm0=rinvsix
6214 pfmul mm2
, mm2 ;
# mm2=rinvtwelve
6216 ;
# calculate potential and scalar force
6217 pfmul mm6
, mm1 ;
# mm6=vcoul
6218 movq mm1
, mm6 ;
# use mm1 for fscal sum
6220 ;
# LJ for the oxygen
6221 pfmul mm0
, [esp
+ i1120_c6
]
6222 pfmul mm2
, [esp
+ i1120_c12
]
6224 ;
# calc nb potential
6229 pfmul mm0
, [esp
+ i1120_six
]
6230 pfmul mm2
, [esp
+ i1120_twelve
]
6232 ;
# increment scalar force
6235 pfmul mm4
, mm1 ;
# total scalar force on oxygen.
6237 ;
# update nb potential
6238 pfadd mm5
, [esp
+ i1120_vnbtot
]
6239 movq
[esp
+ i1120_vnbtot
], mm5
6245 punpckldq mm5
,mm2 ;
# seeds are in mm5 now, and rsq in mm3.
6250 pfrcpit2 mm5
,mm2 ;
# mm5=invsqrt
6252 pfmul mm3
,mm3 ;
# mm3=invsq
6253 pfmul mm7
, mm5 ;
# mm7=vcoul
6254 pfmul mm3
, mm7 ;
# mm3=fscal for the two H's.
6258 pfadd mm7
, [esp
+ i1120_vctot
]
6259 movq
[esp
+ i1120_vctot
], mm7
6261 ;
# spread oxygen fscalar to both positions
6263 ;
# calc vectorial force for O
6264 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
6265 movq mm0
, [esp
+ i1120_dxO
]
6266 movd mm1
, [esp
+ i1120_dzO
]
6270 ;
# calc vectorial force for H's
6271 movq mm5
, [esp
+ i1120_dxH
]
6272 movq mm6
, [esp
+ i1120_dyH
]
6273 movq mm7
, [esp
+ i1120_dzH
]
6278 ;
# update iO particle force
6279 movq mm2
, [esp
+ i1120_fixO
]
6280 movd mm3
, [esp
+ i1120_fizO
]
6283 movq
[esp
+ i1120_fixO
], mm2
6284 movd
[esp
+ i1120_fizO
], mm3
;# same load/store pattern for the H force accumulators (fixH, fiyH, fizH)
6287 movq mm2
, [esp
+ i1120_fixH
]
6288 movq mm3
, [esp
+ i1120_fiyH
]
6289 movq mm4
, [esp
+ i1120_fizH
]
6293 movq
[esp
+ i1120_fixH
], mm2
6294 movq
[esp
+ i1120_fiyH
], mm3
6295 movq
[esp
+ i1120_fizH
], mm4
6297 ;
# pack j forces from H in the same form as the oxygen force.
6298 pfacc mm5
, mm6 ;
# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
6299 pfacc mm7
, mm7 ;
# mm7(l)=fjz(H1+ h2)
6301 pfadd mm0
, mm5 ;
# add up total force on j particle.
6304 ;
# update j particle force
6305 movq mm2
, [edi
+ eax
*4]
6306 movd mm3
, [edi
+ eax
*4 + 8]
6309 movq
[edi
+ eax
*4], mm2
6310 movd
[edi
+ eax
*4 +8], mm3
;# one j particle done: count down innerk, leave the loop when it reaches zero
6313 dec dword ptr
[esp
+ i1120_innerk
]
6314 jz
.i1120_updateouterdata
;# NOTE(review): the .i1120_inner_loop label targeted below is not visible in this excerpt
6315 jmp
.i1120_inner_loop
;# Write-back for the current i entry, reached when innerk hits zero: update the
;# O, H1 and H2 force slots in the array at edi (index ii3), update fshift at is3,
;# update Vc[gid] and Vnb[gid], then decrement the nri counter.
6316 .i1120_updateouterdata:
6317 mov ecx
, [esp
+ i1120_ii3
]
6319 movq mm6
, [edi
+ ecx
*4] ;
# increment iO force
6320 movd mm7
, [edi
+ ecx
*4 + 8]
6321 pfadd mm6
, [esp
+ i1120_fixO
]
6322 pfadd mm7
, [esp
+ i1120_fizO
]
6323 movq
[edi
+ ecx
*4], mm6
6324 movd
[edi
+ ecx
*4 +8], mm7
;# reload the H accumulators; the punpck steps below regroup them per hydrogen (x,y pairs)
6326 movq mm0
, [esp
+ i1120_fixH
]
6327 movq mm3
, [esp
+ i1120_fiyH
]
6328 movq mm1
, [esp
+ i1120_fizH
]
6330 punpckldq mm0
, mm3 ;
# mm0(l)=fxH1, mm0(h)=fyH1
6331 punpckhdq mm2
, mm3 ;
# mm2(l)=fxH2, mm2(h)=fyH2
;# H1 occupies byte offsets +12..+20 and H2 +24..+32 relative to the O entry at [edi + ecx*4]
6337 movq mm6
, [edi
+ ecx
*4 + 12] ;
# increment iH1 force
6338 movd mm7
, [edi
+ ecx
*4 + 20]
6341 movq
[edi
+ ecx
*4 + 12], mm6
6342 movd
[edi
+ ecx
*4 + 20], mm7
6344 movq mm6
, [edi
+ ecx
*4 + 24] ;
# increment iH2 force
6345 movd mm7
, [edi
+ ecx
*4 + 32]
6348 movq
[edi
+ ecx
*4 + 24], mm6
6349 movd
[edi
+ ecx
*4 + 32], mm7
6352 mov ebx
, [ebp
+ i1120_fshift
] ;
# increment fshift force
6353 mov edx
, [esp
+ i1120_is3
]
6355 movq mm6
, [ebx
+ edx
*4]
6356 movd mm7
, [ebx
+ edx
*4 + 8]
6357 pfadd mm6
, [esp
+ i1120_fixO
]
6358 pfadd mm7
, [esp
+ i1120_fizO
]
6363 movq
[ebx
+ edx
*4], mm6
6364 movd
[ebx
+ edx
*4 + 8], mm7
6366 mov edx
, [ebp
+ i1120_gid
] ;
# get group index for this i particle
6368 add dword ptr
[ebp
+ i1120_gid
], 4 ;
# advance pointer
6370 movq mm7
, [esp
+ i1120_vctot
]
6371 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
6373 mov eax
, [ebp
+ i1120_Vc
]
6374 movd mm6
, [eax
+ edx
*4]
6376 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
6378 movq mm7
, [esp
+ i1120_vnbtot
]
6379 pfacc mm7
,mm7 ;
# same for Vnb
6381 mov eax
, [ebp
+ i1120_Vnb
]
6382 movd mm6
, [eax
+ edx
*4]
6384 movd
[eax
+ edx
*4], mm6 ;
# increment vnb[gid]
;# one outer entry finished: count down the nri argument in place
6386 dec dword ptr
[ebp
+ i1120_nri
]
6388 ;
# not last, iterate once more!
6404 .globl inl1130_3dnow
6405 .globl _inl1130_3dnow
6409 .equiv i1130_iinr, 12
6410 .equiv i1130_jindex, 16
6411 .equiv i1130_jjnr, 20
6412 .equiv i1130_shift, 24
6413 .equiv i1130_shiftvec, 28
6414 .equiv i1130_fshift, 32
6415 .equiv i1130_gid, 36
6416 .equiv i1130_pos, 40
6417 .equiv i1130_faction, 44
6418 .equiv i1130_charge, 48
6419 .equiv i1130_facel, 52
6421 .equiv i1130_type, 60
6422 .equiv i1130_ntype, 64
6423 .equiv i1130_nbfp, 68
6424 .equiv i1130_Vnb, 72
6425 ;
# stack offsets for local variables
6429 .equiv i1130_iyO, 12
6430 .equiv i1130_izO, 16
6431 .equiv i1130_ixH, 20
6432 .equiv i1130_iyH, 28
6433 .equiv i1130_izH, 36
6434 .equiv i1130_qqOO, 44
6435 .equiv i1130_qqOH, 52
6436 .equiv i1130_qqHH, 60
6438 .equiv i1130_c12, 76
6439 .equiv i1130_six, 84
6440 .equiv i1130_twelve, 92
6441 .equiv i1130_vctot, 100
6442 .equiv i1130_vnbtot, 108
6443 .equiv i1130_innerjjnr, 116
6444 .equiv i1130_innerk, 120
6445 .equiv i1130_fixO, 124
6446 .equiv i1130_fiyO, 128
6447 .equiv i1130_fizO, 132
6448 .equiv i1130_fixH, 136
6449 .equiv i1130_fiyH, 144
6450 .equiv i1130_fizH, 152
6451 .equiv i1130_dxO, 160
6452 .equiv i1130_dyO, 164
6453 .equiv i1130_dzO, 168
6454 .equiv i1130_dxH, 172
6455 .equiv i1130_dyH, 180
6456 .equiv i1130_dzH, 188
6465 sub esp
, 196 ;
# local stack space
6467 ;
# assume we have at least one i particle - start directly
6469 mov ecx
, [ebp
+ i1130_iinr
] ;
# ecx = pointer into iinr[]
6470 mov ebx
, [ecx
] ;
# ebx=ii
6472 mov edx
, [ebp
+ i1130_charge
]
6473 movd mm1
, [ebp
+ i1130_facel
] ;
# mm1=facel
6474 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii0] (O)
6475 movd mm3
, [edx
+ ebx
*4 + 4] ;
# mm3=charge[ii0+1] (H)
6481 pfmul mm4
, mm2 ;
# mm4=qqOO*facel
6482 pfmul mm5
, mm3 ;
# mm5=qqOH*facel
6483 pfmul mm6
, mm3 ;
# mm6=qqHH*facel
6484 punpckldq mm5
,mm5 ;
# spread to both halves
6485 punpckldq mm6
,mm6 ;
# spread to both halves
6486 movq
[esp
+ i1130_qqOO
], mm4
6487 movq
[esp
+ i1130_qqOH
], mm5
6488 movq
[esp
+ i1130_qqHH
], mm6
6489 mov edx
, [ebp
+ i1130_type
]
6490 mov ecx
, [edx
+ ebx
*4]
6493 imul ecx
, [ebp
+ i1130_ntype
]
6495 mov eax
, [ebp
+ i1130_nbfp
]
6496 movd mm0
, [eax
+ edx
*4]
6497 movd mm1
, [eax
+ edx
*4 + 4]
6498 movq
[esp
+ i1130_c6
], mm0
6499 movq
[esp
+ i1130_c12
], mm1
6501 movq mm3
, [mm_twelve
]
6502 movq
[esp
+ i1130_six
], mm2
6503 movq
[esp
+ i1130_twelve
], mm3
6505 mov eax
, [ebp
+ i1130_shift
] ;
# eax = pointer into shift[]
6506 mov ebx
, [eax
] ;
# ebx=shift[n]
6507 add dword ptr
[ebp
+ i1130_shift
], 4 ;
# advance pointer one step
6509 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
6510 mov
[esp
+ i1130_is3
],ebx ;
# store is3
6512 mov eax
, [ebp
+ i1130_shiftvec
] ;
# eax = base of shiftvec[]
6514 movq mm5
, [eax
+ ebx
*4] ;
# move shX/shY to mm5 and shZ to mm6.
6515 movd mm6
, [eax
+ ebx
*4 + 8]
6519 punpckldq mm0
,mm0 ;
# also expand shX,Y,Z in mm0--mm2.
6523 mov ecx
, [ebp
+ i1130_iinr
] ;
# ecx = pointer into iinr[]
6524 add dword ptr
[ebp
+ i1130_iinr
], 4 ;
# advance pointer
6525 mov ebx
, [ecx
] ;
# ebx=ii
6527 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
6528 mov eax
, [ebp
+ i1130_pos
] ;
# eax = base of pos[]
6530 pfadd mm5
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
6531 movd mm7
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
6532 mov
[esp
+ i1130_ii3
], ebx ;
# (use mm7 as temp. storage for iz.)
6534 movq
[esp
+ i1130_ixO
], mm5
6535 movq
[esp
+ i1130_izO
], mm6
6537 movd mm3
, [eax
+ ebx
*4 + 12]
6538 movd mm4
, [eax
+ ebx
*4 + 16]
6539 movd mm5
, [eax
+ ebx
*4 + 20]
6540 punpckldq mm3
, [eax
+ ebx
*4 + 24]
6541 punpckldq mm4
, [eax
+ ebx
*4 + 28]
6542 punpckldq mm5
, [eax
+ ebx
*4 + 32] ;
# coords of H1 in low mm3-mm5, H2 in high
6547 movq
[esp
+ i1130_ixH
], mm0
6548 movq
[esp
+ i1130_iyH
], mm1
6549 movq
[esp
+ i1130_izH
], mm2
6551 ;
# clear vctot and i forces
6553 movq
[esp
+ i1130_vctot
], mm7
6554 movq
[esp
+ i1130_vnbtot
], mm7
6555 movq
[esp
+ i1130_fixO
], mm7
6556 movq
[esp
+ i1130_fizO
], mm7
6557 movq
[esp
+ i1130_fixH
], mm7
6558 movq
[esp
+ i1130_fiyH
], mm7
6559 movq
[esp
+ i1130_fizH
], mm7
6561 mov eax
, [ebp
+ i1130_jindex
]
6562 mov ecx
, [eax
] ;
# jindex[n]
6563 mov edx
, [eax
+ 4] ;
# jindex[n+1]
6564 add dword ptr
[ebp
+ i1130_jindex
], 4
6565 sub edx
, ecx ;
# number of innerloop atoms
6566 mov
[esp
+ i1130_innerk
], edx ;
# number of innerloop atoms
6568 mov esi
, [ebp
+ i1130_pos
]
6569 mov edi
, [ebp
+ i1130_faction
]
6570 mov eax
, [ebp
+ i1130_jjnr
]
6573 mov
[esp
+ i1130_innerjjnr
], eax ;
# pointer to jjnr[nj0]
6575 ;
# a single j particle iteration here - compare with the unrolled code for comments.
6576 mov eax
, [esp
+ i1130_innerjjnr
]
6577 mov eax
, [eax
] ;
# eax=jnr offset
6578 add dword ptr
[esp
+ i1130_innerjjnr
], 4 ;
# advance pointer
6580 movd mm6
, [esp
+ i1130_qqOO
]
6581 movq mm7
, [esp
+ i1130_qqOH
]
6583 lea eax
, [eax
+ eax
*2]
6584 movq mm0
, [esi
+ eax
*4]
6585 movd mm1
, [esi
+ eax
*4 + 8]
6586 ;
# copy & expand to mm2-mm4 for the H interactions
6594 pfsubr mm0
, [esp
+ i1130_ixO
]
6595 pfsubr mm1
, [esp
+ i1130_izO
]
6597 movq
[esp
+ i1130_dxO
], mm0
6599 movd
[esp
+ i1130_dzO
], mm1
6602 pfadd mm0
, mm1 ;
# mm0=rsqO
6606 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
6607 pfsubr mm2
, [esp
+ i1130_ixH
]
6608 pfsubr mm3
, [esp
+ i1130_iyH
]
6609 pfsubr mm4
, [esp
+ i1130_izH
] ;
# mm2-mm4 is dxH-dzH
6611 movq
[esp
+ i1130_dxH
], mm2
6612 movq
[esp
+ i1130_dyH
], mm3
6613 movq
[esp
+ i1130_dzH
], mm4
6619 pfadd mm3
,mm4 ;
# mm3=rsqH
6626 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
6628 pfmul mm4
, mm4 ;
# mm4=invsq
6635 pfmul mm2
, [esp
+ i1130_c6
]
6636 pfmul mm0
, [esp
+ i1130_c12
]
6638 pfsub mm5
, mm2 ;
# vnb
6640 pfmul mm2
, [esp
+ i1130_six
]
6641 pfmul mm0
, [esp
+ i1130_twelve
]
6645 ;
# calculate potential and scalar force
6646 pfmul mm6
, mm1 ;
# mm6=vcoul
6648 pfmul mm4
, mm0 ;
# mm4=fscalar
6650 ;
# update nb potential
6651 pfadd mm5
, [esp
+ i1130_vnbtot
]
6652 movq
[esp
+ i1130_vnbtot
], mm5
6658 punpckldq mm5
,mm2 ;
# seeds are in mm5 now, and rsq in mm3
6663 pfrcpit2 mm5
,mm2 ;
# mm5=invsqrt
6665 pfmul mm3
,mm3 ;
# mm3=invsq
6666 pfmul mm7
, mm5 ;
# mm7=vcoul
6667 pfmul mm3
, mm7 ;
# mm3=fscal for the two H's.
6671 pfadd mm7
, [esp
+ i1130_vctot
]
6672 movq
[esp
+ i1130_vctot
], mm7
6674 ;
# spread oxygen fscalar to both positions
6676 ;
# calc vectorial force for O
6677 movq mm0
, [esp
+ i1130_dxO
]
6678 movd mm1
, [esp
+ i1130_dzO
]
6682 ;
# calc vectorial force for H's
6683 movq mm5
, [esp
+ i1130_dxH
]
6684 movq mm6
, [esp
+ i1130_dyH
]
6685 movq mm7
, [esp
+ i1130_dzH
]
6690 ;
# update iO particle force
6691 movq mm2
, [esp
+ i1130_fixO
]
6692 movd mm3
, [esp
+ i1130_fizO
]
6695 movq
[esp
+ i1130_fixO
], mm2
6696 movd
[esp
+ i1130_fizO
], mm3
6699 movq mm2
, [esp
+ i1130_fixH
]
6700 movq mm3
, [esp
+ i1130_fiyH
]
6701 movq mm4
, [esp
+ i1130_fizH
]
6705 movq
[esp
+ i1130_fixH
], mm2
6706 movq
[esp
+ i1130_fiyH
], mm3
6707 movq
[esp
+ i1130_fizH
], mm4
6709 ;
# pack j forces from H in the same form as the oxygen force.
6710 pfacc mm5
, mm6 ;
# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
6711 pfacc mm7
, mm7 ;
# mm7(l)=fjz(H1+ h2)
6713 pfadd mm0
, mm5 ;
# add up total force on j particle.
6716 ;
# update j particle force
6717 movq mm2
, [edi
+ eax
*4]
6718 movd mm3
, [edi
+ eax
*4 + 8]
6721 movq
[edi
+ eax
*4], mm2
6722 movd
[edi
+ eax
*4 +8], mm3
6724 ;
# interactions with j H1
6725 movq mm0
, [esi
+ eax
*4 + 12]
6726 movd mm1
, [esi
+ eax
*4 + 20]
6727 ;
# copy & expand to mm2-mm4 for the H interactions
6735 movd mm6
, [esp
+ i1130_qqOH
]
6736 movq mm7
, [esp
+ i1130_qqHH
]
6738 pfsubr mm0
, [esp
+ i1130_ixO
]
6739 pfsubr mm1
, [esp
+ i1130_izO
]
6741 movq
[esp
+ i1130_dxO
], mm0
6743 movd
[esp
+ i1130_dzO
], mm1
6746 pfadd mm0
, mm1 ;
# mm0=rsqO
6750 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
6751 pfsubr mm2
, [esp
+ i1130_ixH
]
6752 pfsubr mm3
, [esp
+ i1130_iyH
]
6753 pfsubr mm4
, [esp
+ i1130_izH
] ;
# mm2-mm4 is dxH-dzH
6755 movq
[esp
+ i1130_dxH
], mm2
6756 movq
[esp
+ i1130_dyH
], mm3
6757 movq
[esp
+ i1130_dzH
], mm4
6763 pfadd mm3
,mm4 ;
# mm3=rsqH
6770 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
6772 pfmul mm4
, mm4 ;
# mm4=invsq
6773 ;
# calculate potential and scalar force
6774 pfmul mm6
, mm1 ;
# mm6=vcoul
6775 pfmul mm4
, mm6 ;
# mm4=fscalar
6781 punpckldq mm5
,mm2 ;
# seeds are in mm5 now, and rsq in mm3
6786 pfrcpit2 mm5
,mm2 ;
# mm5=invsqrt
6788 pfmul mm3
,mm3 ;
# mm3=invsq
6789 pfmul mm7
, mm5 ;
# mm7=vcoul
6790 pfmul mm3
, mm7 ;
# mm3=fscal for the two H's.
6794 pfadd mm7
, [esp
+ i1130_vctot
]
6795 movq
[esp
+ i1130_vctot
], mm7
6797 ;
# spread oxygen fscalar to both positions
6799 ;
# calc vectorial force for O
6800 movq mm0
, [esp
+ i1130_dxO
]
6801 movd mm1
, [esp
+ i1130_dzO
]
6805 ;
# calc vectorial force for H's
6806 movq mm5
, [esp
+ i1130_dxH
]
6807 movq mm6
, [esp
+ i1130_dyH
]
6808 movq mm7
, [esp
+ i1130_dzH
]
6813 ;
# update iO particle force
6814 movq mm2
, [esp
+ i1130_fixO
]
6815 movd mm3
, [esp
+ i1130_fizO
]
6818 movq
[esp
+ i1130_fixO
], mm2
6819 movd
[esp
+ i1130_fizO
], mm3
6822 movq mm2
, [esp
+ i1130_fixH
]
6823 movq mm3
, [esp
+ i1130_fiyH
]
6824 movq mm4
, [esp
+ i1130_fizH
]
6828 movq
[esp
+ i1130_fixH
], mm2
6829 movq
[esp
+ i1130_fiyH
], mm3
6830 movq
[esp
+ i1130_fizH
], mm4
6832 ;
# pack j forces from H in the same form as the oxygen force.
6833 pfacc mm5
, mm6 ;
# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
6834 pfacc mm7
, mm7 ;
# mm7(l)=fjz(H1+ h2)
6836 pfadd mm0
, mm5 ;
# add up total force on j particle.
6839 ;
# update j particle force
6840 movq mm2
, [edi
+ eax
*4 + 12]
6841 movd mm3
, [edi
+ eax
*4 + 20]
6844 movq
[edi
+ eax
*4 + 12], mm2
6845 movd
[edi
+ eax
*4 + 20], mm3
6847 ;
# interactions with j H2
6848 movq mm0
, [esi
+ eax
*4 + 24]
6849 movd mm1
, [esi
+ eax
*4 + 32]
6850 ;
# copy & expand to mm2-mm4 for the H interactions
6858 movd mm6
, [esp
+ i1130_qqOH
]
6859 movq mm7
, [esp
+ i1130_qqHH
]
6861 pfsubr mm0
, [esp
+ i1130_ixO
]
6862 pfsubr mm1
, [esp
+ i1130_izO
]
6864 movq
[esp
+ i1130_dxO
], mm0
6866 movd
[esp
+ i1130_dzO
], mm1
6869 pfadd mm0
, mm1 ;
# mm0=rsqO
6873 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
6874 pfsubr mm2
, [esp
+ i1130_ixH
]
6875 pfsubr mm3
, [esp
+ i1130_iyH
]
6876 pfsubr mm4
, [esp
+ i1130_izH
] ;
# mm2-mm4 is dxH-dzH
6878 movq
[esp
+ i1130_dxH
], mm2
6879 movq
[esp
+ i1130_dyH
], mm3
6880 movq
[esp
+ i1130_dzH
], mm4
6886 pfadd mm3
,mm4 ;
# mm3=rsqH
6893 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
6895 pfmul mm4
, mm4 ;
# mm4=invsq
6896 ;
# calculate potential and scalar force
6897 pfmul mm6
, mm1 ;
# mm6=vcoul
6898 pfmul mm4
, mm6 ;
# mm4=fscalar
6904 punpckldq mm5
,mm2 ;
# seeds are in mm5 now, and rsq in mm3.
6909 pfrcpit2 mm5
,mm2 ;
# mm5=invsqrt
6911 pfmul mm3
,mm3 ;
# mm3=invsq
6912 pfmul mm7
, mm5 ;
# mm7=vcoul
6913 pfmul mm3
, mm7 ;
# mm3=fscal for the two H's.
6917 pfadd mm7
, [esp
+ i1130_vctot
]
6918 movq
[esp
+ i1130_vctot
], mm7
6920 ;
# spread oxygen fscalar to both positions
6922 ;
# calc vectorial force for O
6923 movq mm0
, [esp
+ i1130_dxO
]
6924 movd mm1
, [esp
+ i1130_dzO
]
6928 ;
# calc vectorial force for H's
6929 movq mm5
, [esp
+ i1130_dxH
]
6930 movq mm6
, [esp
+ i1130_dyH
]
6931 movq mm7
, [esp
+ i1130_dzH
]
6936 ;
# update iO particle force
6937 movq mm2
, [esp
+ i1130_fixO
]
6938 movd mm3
, [esp
+ i1130_fizO
]
6941 movq
[esp
+ i1130_fixO
], mm2
6942 movd
[esp
+ i1130_fizO
], mm3
6945 movq mm2
, [esp
+ i1130_fixH
]
6946 movq mm3
, [esp
+ i1130_fiyH
]
6947 movq mm4
, [esp
+ i1130_fizH
]
6951 movq
[esp
+ i1130_fixH
], mm2
6952 movq
[esp
+ i1130_fiyH
], mm3
6953 movq
[esp
+ i1130_fizH
], mm4
6955 ;
# pack j forces from H in the same form as the oxygen force.
6956 pfacc mm5
, mm6 ;
# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
6957 pfacc mm7
, mm7 ;
# mm7(l)=fjz(H1+ h2)
6959 pfadd mm0
, mm5 ;
# add up total force on j particle.
6962 ;
# update j particle force
6963 movq mm2
, [edi
+ eax
*4 + 24]
6964 movd mm3
, [edi
+ eax
*4 + 32]
6967 movq
[edi
+ eax
*4 + 24], mm2
6968 movd
[edi
+ eax
*4 + 32], mm3
6971 dec dword ptr
[esp
+ i1130_innerk
]
6972 jz
.i1130_updateouterdata
6973 jmp
.i1130_inner_loop
6974 .i1130_updateouterdata:
6975 mov ecx
, [esp
+ i1130_ii3
]
6977 movq mm6
, [edi
+ ecx
*4] ;
# increment iO force
6978 movd mm7
, [edi
+ ecx
*4 + 8]
6979 pfadd mm6
, [esp
+ i1130_fixO
]
6980 pfadd mm7
, [esp
+ i1130_fizO
]
6981 movq
[edi
+ ecx
*4], mm6
6982 movd
[edi
+ ecx
*4 +8], mm7
6984 movq mm0
, [esp
+ i1130_fixH
]
6985 movq mm3
, [esp
+ i1130_fiyH
]
6986 movq mm1
, [esp
+ i1130_fizH
]
6988 punpckldq mm0
, mm3 ;
# mm0(l)=fxH1, mm0(h)=fyH1
6989 punpckhdq mm2
, mm3 ;
# mm2(l)=fxH2, mm2(h)=fyH2
6995 movq mm6
, [edi
+ ecx
*4 + 12] ;
# increment iH1 force
6996 movd mm7
, [edi
+ ecx
*4 + 20]
6999 movq
[edi
+ ecx
*4 + 12], mm6
7000 movd
[edi
+ ecx
*4 + 20], mm7
7002 movq mm6
, [edi
+ ecx
*4 + 24] ;
# increment iH2 force
7003 movd mm7
, [edi
+ ecx
*4 + 32]
7006 movq
[edi
+ ecx
*4 + 24], mm6
7007 movd
[edi
+ ecx
*4 + 32], mm7
7010 mov ebx
, [ebp
+ i1130_fshift
] ;
# increment fshift force
7011 mov edx
, [esp
+ i1130_is3
]
7013 movq mm6
, [ebx
+ edx
*4]
7014 movd mm7
, [ebx
+ edx
*4 + 8]
7015 pfadd mm6
, [esp
+ i1130_fixO
]
7016 pfadd mm7
, [esp
+ i1130_fizO
]
7021 movq
[ebx
+ edx
*4], mm6
7022 movd
[ebx
+ edx
*4 + 8], mm7
7024 mov edx
, [ebp
+ i1130_gid
] ;
# get group index for this i particle
7026 add dword ptr
[ebp
+ i1130_gid
], 4 ;
# advance pointer
7028 movq mm7
, [esp
+ i1130_vctot
]
7029 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
7031 mov eax
, [ebp
+ i1130_Vc
]
7032 movd mm6
, [eax
+ edx
*4]
7034 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
7036 movq mm7
, [esp
+ i1130_vnbtot
]
7037 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
7039 mov eax
, [ebp
+ i1130_Vnb
]
7040 movd mm6
, [eax
+ edx
*4]
7042 movd
[eax
+ edx
*4], mm6 ;
# increment vnbtot[gid]
7044 dec dword ptr
[ebp
+ i1130_nri
]
7046 ;
# not last, iterate once more!
;# inl3000_3dnow: symbol export and offset constants.
;# The symbol is exported both with and without a leading underscore.
;# The first .equiv group is used below as displacements from ebp
;# (presumably the C arguments - the frame setup is not visible in this listing).
;# The second group is used as displacements from esp (local variables).
;# NOTE(review): i3000_is3, i3000_ii3, i3000_ix, i3000_iz, i3000_iq, i3000_n1,
;# i3000_Vc and i3000_nri are referenced by the code below, but their .equiv
;# lines do not appear in this listing - confirm against the complete file.
7063 .globl inl3000_3dnow
7064 .globl _inl3000_3dnow
7068 .equiv i3000_iinr, 12
7069 .equiv i3000_jindex, 16
7070 .equiv i3000_jjnr, 20
7071 .equiv i3000_shift, 24
7072 .equiv i3000_shiftvec, 28
7073 .equiv i3000_fshift, 32
7074 .equiv i3000_gid, 36
7075 .equiv i3000_pos, 40
7076 .equiv i3000_faction, 44
7077 .equiv i3000_charge, 48
7078 .equiv i3000_facel, 52
7080 .equiv i3000_tabscale, 60
7081 .equiv i3000_VFtab, 64
7082 ;
# stack offsets for local variables
;# vctot, two and tsc are written with 8-byte movq stores, so each occupies two dwords.
7089 .equiv i3000_vctot, 28
7090 .equiv i3000_two, 36
7092 .equiv i3000_tsc, 52
7093 .equiv i3000_ntia, 60
7094 .equiv i3000_innerjjnr, 64
7095 .equiv i3000_innerk, 68
;# fix/fiy, dx1/dy1 and dx2/dy2 are adjacent dwords: the code accesses each pair
;# with a single movq at the x offset, and the z component with a separate movd.
7096 .equiv i3000_fix, 72
7097 .equiv i3000_fiy, 76
7098 .equiv i3000_fiz, 80
7099 .equiv i3000_dx1, 84
7100 .equiv i3000_dy1, 88
7101 .equiv i3000_dz1, 92
7102 .equiv i3000_dx2, 96
7103 .equiv i3000_dy2, 100
7104 .equiv i3000_dz2, 104
;# inl3000 setup: reserve the local frame, cache constants on the stack, then
;# prepare one i particle: shift vector, scaled charge, shifted position,
;# cleared accumulators and the j-list bounds, before entering the inner loops.
7113 sub esp
, 108 ;
# local stack space
7115 ;
# move data to local stack
7117 movd mm3
, [ebp
+ i3000_tabscale
]
;# NOTE(review): mm0 is stored as the constant "two", but the instruction that
;# loads mm0 is not present in this listing.
7118 movq
[esp
+ i3000_two
], mm0
7120 movq
[esp
+ i3000_tsc
], mm3
7121 ;
# assume we have at least one i particle - start directly
7123 mov eax
, [ebp
+ i3000_shift
] ;
# eax = pointer into shift[]
7124 mov ebx
, [eax
] ;
# ebx=shift[n]
7125 add dword ptr
[ebp
+ i3000_shift
], 4 ;
# advance pointer one step
7127 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
7128 mov
[esp
+ i3000_is3
],ebx ;
# store is3
7130 mov eax
, [ebp
+ i3000_shiftvec
] ;
# eax = base of shiftvec[]
7132 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
7133 movd mm1
, [eax
+ ebx
*4 + 8]
7135 mov ecx
, [ebp
+ i3000_iinr
] ;
# ecx = pointer into iinr[]
7136 add dword ptr
[ebp
+ i3000_iinr
], 4 ;
# advance pointer
7137 mov ebx
, [ecx
] ;
# ebx=ii
7139 mov edx
, [ebp
+ i3000_charge
]
7140 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
7141 pfmul mm2
, [ebp
+ i3000_facel
]
7142 punpckldq mm2
,mm2 ;
# spread to both halves
7143 movq
[esp
+ i3000_iq
], mm2 ;
# iq =facel*charge[ii]
7145 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
7146 mov eax
, [ebp
+ i3000_pos
] ;
# eax = base of pos[]
7148 pfadd mm0
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
7149 movd mm3
, [eax
+ ebx
*4 + 8] ;
# can't use a direct memory add for 4 bytes (iz)
7150 mov
[esp
+ i3000_ii3
], ebx
7152 movq
[esp
+ i3000_ix
], mm0
;# NOTE(review): posZ was loaded into mm3 above, but no instruction adding it
;# to mm1 is visible here; as listed, iz would hold shZ only.
7153 movd
[esp
+ i3000_iz
], mm1
7155 ;
# clear total potential and i forces
;# NOTE(review): mm7 is stored as the cleared value; the instruction that
;# zeroes mm7 is not present in this listing.
7157 movq
[esp
+ i3000_vctot
], mm7
7158 movq
[esp
+ i3000_fix
], mm7
7159 movd
[esp
+ i3000_fiz
], mm7
7161 mov eax
, [ebp
+ i3000_jindex
]
7162 mov ecx
, [eax
] ;
# jindex[n]
7163 mov edx
, [eax
+ 4] ;
# jindex[n+1]
7164 add dword ptr
[ebp
+ i3000_jindex
], 4
7165 sub edx
, ecx ;
# number of innerloop atoms
;# esi = pos[] base and edi = faction[] base for the inner loops
7167 mov esi
, [ebp
+ i3000_pos
]
7168 mov edi
, [ebp
+ i3000_faction
]
7169 mov eax
, [ebp
+ i3000_jjnr
]
7172 mov
[esp
+ i3000_innerjjnr
], eax ;
# pointer to jjnr[nj0]
7174 mov
[esp
+ i3000_innerk
], edx ;
# number of innerloop atoms
;# NOTE(review): mov does not modify flags, and the instruction that sets the
;# flags tested by this jge is not visible in this listing.
7175 jge
.i3000_unroll_loop
7176 jmp
.i3000_finish_inner
;# Unrolled inner loop of inl3000: each pass handles two j particles.
;# The first pair occupies the low half of the MMX registers and the second
;# the high half. It computes the tabulated Coulomb potential and scalar force,
;# accumulates vctot and the i force, and updates both j forces.
;# NOTE(review): the .i3000_unroll_loop: label that the jumps target, and
;# several instructions of the body, are not present in this listing.
7178 ;
# paired innerloop starts here
7179 mov ecx
, [esp
+ i3000_innerjjnr
] ;
# pointer to jjnr[k]
;# NOTE(review): only the second index (ebx) is loaded in the visible lines;
;# the load of eax is not shown.
7181 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
7182 add dword ptr
[esp
+ i3000_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
7183 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
7185 mov ecx
, [ebp
+ i3000_charge
] ;
# base of charge[]
7186 movq mm5
, [esp
+ i3000_iq
]
7187 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
7188 punpckldq mm3
, [ecx
+ ebx
*4] ;
# move charge 2 to high part of mm3
7189 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
7191 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
7192 lea ebx
, [ebx
+ ebx
*2]
7194 mov esi
, [ebp
+ i3000_pos
]
7196 movq mm0
, [esp
+ i3000_ix
]
7197 movd mm1
, [esp
+ i3000_iz
]
7198 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
7199 movd mm5
, [esi
+ eax
*4 + 8]
7200 pfsubr mm4
,mm0 ;
# dr = ir - jr
7202 movq
[esp
+ i3000_dx1
], mm4 ;
# store dr
7203 movd
[esp
+ i3000_dz1
], mm5
7204 pfmul mm4
,mm4 ;
# square dx,dy,dz
7206 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
7207 pfacc mm4
, mm5 ;
# first rsq in lower mm4
7209 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
7210 movd mm7
, [esi
+ ebx
*4 + 8]
7212 pfsubr mm6
,mm0 ;
# dr = ir - jr
7214 movq
[esp
+ i3000_dx2
], mm6 ;
# store dr
7215 movd
[esp
+ i3000_dz2
], mm7
7216 pfmul mm6
,mm6 ;
# square dx,dy,dz
7218 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
7219 pfacc mm6
, mm7 ;
# second rsq in lower mm6
7221 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
7226 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
7227 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision.
7233 ;
# mm0 is invsqrt, and mm1 r.
7234 ;
# do potential and fscal
7235 pfmul mm1
, [esp
+ i3000_tsc
] ;
# mm1=rt
;# The two table indices are spilled to the stack so that they can be
;# reloaded into ecx as integer indices (n1 and n1+4 below).
7237 movq
[esp
+ i3000_n1
], mm4
7239 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
7242 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
7244 mov edx
, [ebp
+ i3000_VFtab
]
7245 mov ecx
, [esp
+ i3000_n1
]
7248 ;
# load all the table values we need
;# Four consecutive dwords at VFtab[index] go to the low halves of mm4-mm7
;# for the first pair.
7249 movd mm4
, [edx
+ ecx
*4]
7250 movd mm5
, [edx
+ ecx
*4 + 4]
7251 movd mm6
, [edx
+ ecx
*4 + 8]
7252 movd mm7
, [edx
+ ecx
*4 + 12]
7253 mov ecx
, [esp
+ i3000_n1
+ 4]
;# The same four entries for the second pair are merged into the high halves.
7255 punpckldq mm4
, [edx
+ ecx
*4]
7256 punpckldq mm5
, [edx
+ ecx
*4 + 4]
7257 punpckldq mm6
, [edx
+ ecx
*4 + 8]
7258 punpckldq mm7
, [edx
+ ecx
*4 + 12]
7260 pfmul mm6
, mm1 ;
# mm6 = Geps
7261 pfmul mm7
, mm2 ;
# mm7 = Heps2
7264 pfadd mm5
, mm7 ;
# mm5 = Fp
7266 pfmul mm7
, [esp
+ i3000_two
] ;
# two*Heps2
7268 pfadd mm7
, mm5 ;
# mm7=FF
7270 pfmul mm5
, mm1 ;
# mm5=eps*Fp
7271 pfadd mm5
, mm4 ;
# mm5= VV
7273 pfmul mm5
, mm3 ;
# vcoul=qq*VV
7274 pfmul mm3
, mm7 ;
# fijC=FF*qq
7276 ;
# at this point mm5 contains vcoul and mm3 fijC.
7277 ;
# increment vcoul - then we can get rid of mm5.
7279 pfadd mm5
, [esp
+ i3000_vctot
] ;
# add the earlier value
7280 movq
[esp
+ i3000_vctot
], mm5 ;
# store the sum
7282 ;
# change sign of mm3
;# NOTE(review): the instructions that actually negate mm3 are not visible here.
7285 pfmul mm1
, [esp
+ i3000_tsc
]
7286 pfmul mm0
, mm1 ;
# mm0 is total fscal now
7288 prefetchw
[esp
+ i3000_dx1
] ;
# prefetch i forces to cache
7290 ;
# spread fscalar to both positions
7295 ;
# calc vector force
7296 prefetchw
[edi
+ eax
*4] ;
# prefetch the 1st faction to cache
7297 movq mm2
, [esp
+ i3000_dx1
] ;
# fetch dr
7298 movd mm3
, [esp
+ i3000_dz1
]
7300 prefetchw
[edi
+ ebx
*4] ;
# prefetch the 2nd faction to cache
7301 pfmul mm2
, mm0 ;
# mult by fs
7304 movq mm4
, [esp
+ i3000_dx2
] ;
# fetch dr
7305 movd mm5
, [esp
+ i3000_dz2
]
7306 pfmul mm4
, mm1 ;
# mult by fs
;# Reload and store the running i force.
;# NOTE(review): the adds between load and store are not visible in this listing.
7310 movq mm0
, [esp
+ i3000_fix
]
7311 movd mm1
, [esp
+ i3000_fiz
]
7317 movq
[esp
+ i3000_fix
], mm0
7318 movd
[esp
+ i3000_fiz
], mm1
;# Load both j forces from faction[] (edi) and write them back.
;# NOTE(review): the intervening updates are likewise not visible.
7321 movq mm0
, [edi
+ eax
*4]
7322 movd mm1
, [edi
+ eax
*4 + 8]
7323 movq mm6
, [edi
+ ebx
*4]
7324 movd mm7
, [edi
+ ebx
*4 + 8]
7331 movq
[edi
+ eax
*4], mm0
7332 movd
[edi
+ eax
*4 +8], mm1
7333 movq
[edi
+ ebx
*4], mm6
7334 movd
[edi
+ ebx
*4 + 8], mm7
7336 ;
# should we do one more iteration?
;# innerk is decremented by two per pass; a negative result (jl) leaves the loop.
7337 sub dword ptr
[esp
+ i3000_innerk
], 2
7338 jl
.i3000_finish_inner
7339 jmp
.i3000_unroll_loop
.i3000_finish_inner:
        ;# Reduce the remaining-count slot to its lowest bit (the slot itself is
        ;# overwritten with that bit). ZF set means no leftover j particle, so go
        ;# straight to the outer-data update; otherwise run one single-particle pass.
        and     dword ptr [esp + i3000_innerk], 1
        jz      .i3000_updateouterdata
        jmp     .i3000_single_inner
;# Single-particle tail of the inl3000 inner loop: same computation as the
;# unrolled loop, but for one j particle using only the low register halves.
;# NOTE(review): several instructions of this body are not present in this
;# listing (see the gaps in the embedded numbering).
7344 .i3000_single_inner:
7345 ;
# a single j particle iteration here - compare with the unrolled code for comments.
7346 mov eax
, [esp
+ i3000_innerjjnr
]
7347 mov eax
, [eax
] ;
# eax=jnr offset
7349 mov ecx
, [ebp
+ i3000_charge
]
7350 movd mm5
, [esp
+ i3000_iq
]
7351 movd mm3
, [ecx
+ eax
*4]
7352 pfmul mm3
, mm5 ;
# mm3=qq
7354 mov esi
, [ebp
+ i3000_pos
]
;# eax = 3*jnr, used as a dword index into pos[] and faction[]
7355 lea eax
, [eax
+ eax
*2]
7357 movq mm0
, [esp
+ i3000_ix
]
7358 movd mm1
, [esp
+ i3000_iz
]
7359 movq mm4
, [esi
+ eax
*4]
7360 movd mm5
, [esi
+ eax
*4 + 8]
7363 movq
[esp
+ i3000_dx1
], mm4
7365 movd
[esp
+ i3000_dz1
], mm5
7368 pfacc mm4
, mm5 ;
# the pfacc destination is mm4, so rsq accumulates there (this note used to say mm0)
7374 pfrcpit2 mm0
,mm2 ;
# mm0=invsqrt (pfrcpit2 writes its first operand)
7377 ;
# mm0 is invsqrt, and mm1 r.
7379 ;
# calculate potentials and scalar force
7380 pfmul mm1
, [esp
+ i3000_tsc
] ;
# mm1=rt
;# Only one table index is needed here, so a 4-byte movd spill is enough.
7382 movd
[esp
+ i3000_n1
], mm4
7384 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
7387 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
7390 mov edx
, [ebp
+ i3000_VFtab
]
7391 mov ecx
, [esp
+ i3000_n1
]
7393 ;
# load all the table values we need
7394 movd mm4
, [edx
+ ecx
*4]
7395 movd mm5
, [edx
+ ecx
*4 + 4]
7396 movd mm6
, [edx
+ ecx
*4 + 8]
7397 movd mm7
, [edx
+ ecx
*4 + 12]
7399 pfmul mm6
, mm1 ;
# mm6 = Geps
7400 pfmul mm7
, mm2 ;
# mm7 = Heps2
7403 pfadd mm5
, mm7 ;
# mm5 = Fp
7405 pfmul mm7
, [esp
+ i3000_two
] ;
# two*Heps2
7407 pfadd mm7
, mm5 ;
# mm7=FF
7409 pfmul mm5
, mm1 ;
# mm5=eps*Fp
7410 pfadd mm5
, mm4 ;
# mm5= VV
7412 pfmul mm5
, mm3 ;
# vcoul=qq*VV
7413 pfmul mm3
, mm7 ;
# fijC=FF*qq
7415 ;
# at this point mm5 contains vcoul and mm3 fijC
7416 ;
# increment vcoul - then we can get rid of mm5
7418 pfadd mm5
, [esp
+ i3000_vctot
] ;
# add the earlier value
7419 movq
[esp
+ i3000_vctot
], mm5 ;
# store the sum
7421 ;
# change sign of mm3
;# NOTE(review): the instructions that negate mm3 are not visible here.
7424 pfmul mm0
, [esp
+ i3000_tsc
]
7425 pfmul mm0
, mm1 ;
# mm0 is total fscal now
7427 ;
# spread fscalar to both positions
7429 ;
# calc vectorial force
7430 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
7431 movq mm2
, [esp
+ i3000_dx1
]
7432 movd mm3
, [esp
+ i3000_dz1
]
7438 ;
# update i particle force
7439 movq mm0
, [esp
+ i3000_fix
]
7440 movd mm1
, [esp
+ i3000_fiz
]
7443 movq
[esp
+ i3000_fix
], mm0
7444 movd
[esp
+ i3000_fiz
], mm1
7445 ;
# update j particle force
7446 movq mm0
, [edi
+ eax
*4]
7447 movd mm1
, [edi
+ eax
*4+ 8]
7450 movq
[edi
+ eax
*4], mm0
7451 movd
[edi
+ eax
*4 +8], mm1
;# Outer-loop bookkeeping for inl3000, run once per i particle after the
;# inner loops: add the accumulated i force to faction[ii3] and fshift[is3],
;# fold vctot into Vc[], and handle the remaining-i counter.
7453 .i3000_updateouterdata:
7454 mov ecx
, [esp
+ i3000_ii3
]
7456 movq mm6
, [edi
+ ecx
*4] ;
# increment i force
7457 movd mm7
, [edi
+ ecx
*4 + 8]
7458 pfadd mm6
, [esp
+ i3000_fix
]
7459 pfadd mm7
, [esp
+ i3000_fiz
]
7460 movq
[edi
+ ecx
*4], mm6
7461 movd
[edi
+ ecx
*4 +8], mm7
7463 mov ebx
, [ebp
+ i3000_fshift
] ;
# increment fshift force
7464 mov edx
, [esp
+ i3000_is3
]
7466 movq mm6
, [ebx
+ edx
*4]
7467 movd mm7
, [ebx
+ edx
*4 + 8]
7468 pfadd mm6
, [esp
+ i3000_fix
]
7469 pfadd mm7
, [esp
+ i3000_fiz
]
7470 movq
[ebx
+ edx
*4], mm6
7471 movd
[ebx
+ edx
*4 + 8], mm7
;# NOTE(review): edx receives the gid pointer itself here; the dereference
;# that would fetch the group index is not visible in this listing.
7473 mov edx
, [ebp
+ i3000_gid
] ;
# get group index for this i particle
7475 add dword ptr
[ebp
+ i3000_gid
], 4 ;
# advance pointer
7477 movq mm7
, [esp
+ i3000_vctot
]
7478 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
7480 mov eax
, [ebp
+ i3000_Vc
]
;# NOTE(review): no add of mm7 into mm6 appears between this load and the
;# store below; as listed, the value would be written back unchanged.
;# Confirm against the complete file.
7481 movd mm6
, [eax
+ edx
*4]
7483 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
;# NOTE(review): the decrement and exit test on the nri counter are not
;# visible between this load and the store that follows.
7486 mov ecx
, [ebp
+ i3000_nri
]
7489 ;
# not last, iterate once more!
7490 mov
[ebp
+ i3000_nri
], ecx
;# inl3010_3dnow: symbol export and offset constants.
;# The symbol is exported both with and without a leading underscore.
;# The first .equiv group is used below as displacements from ebp
;# (presumably the C arguments - the frame setup is not visible in this listing).
;# The second group is used as displacements from esp (local variables).
;# NOTE(review): i3010_is3, i3010_shX, i3010_ii3, i3010_ix, i3010_iz, i3010_iq,
;# i3010_n1, i3010_Vc and i3010_nri are referenced by the code below, but their
;# .equiv lines do not appear in this listing - confirm against the complete file.
7507 .globl inl3010_3dnow
7508 .globl _inl3010_3dnow
7512 .equiv i3010_iinr, 12
7513 .equiv i3010_jindex, 16
7514 .equiv i3010_jjnr, 20
7515 .equiv i3010_shift, 24
7516 .equiv i3010_shiftvec, 28
7517 .equiv i3010_fshift, 32
7518 .equiv i3010_gid, 36
7519 .equiv i3010_pos, 40
7520 .equiv i3010_faction, 44
7521 .equiv i3010_charge, 48
7522 .equiv i3010_facel, 52
7524 .equiv i3010_tabscale, 60
7525 .equiv i3010_VFtab, 64
7526 .equiv i3010_nsatoms, 68
7527 ;
# stack offsets for local variables
7531 .equiv i3010_shY, 12
7532 .equiv i3010_shZ, 16
;# vctot, two and tsc are written with 8-byte movq stores, so each occupies two dwords.
7537 .equiv i3010_vctot, 40
7538 .equiv i3010_two, 48
7540 .equiv i3010_tsc, 64
;# innerjjnr0/innerk0 are written once per neighbour list in the setup code and
;# copied into innerjjnr/innerk before the inner loop runs.
7541 .equiv i3010_innerjjnr0, 72
7542 .equiv i3010_innerk0, 76
7543 .equiv i3010_innerjjnr, 80
7544 .equiv i3010_innerk, 84
;# fix/fiy, dx1/dy1 and dx2/dy2 are adjacent dwords: the code accesses each pair
;# with a single movq at the x offset, and the z component with a separate movd.
7545 .equiv i3010_fix, 88
7546 .equiv i3010_fiy, 92
7547 .equiv i3010_fiz, 96
7548 .equiv i3010_dx1, 100
7549 .equiv i3010_dy1, 104
7550 .equiv i3010_dz1, 108
7551 .equiv i3010_dx2, 112
7552 .equiv i3010_dy2, 116
7553 .equiv i3010_dz2, 120
7554 .equiv i3010_nscoul, 124
7555 .equiv i3010_solnr, 128
;# inl3010 setup: reserve the local frame, cache constants, then for each
;# neighbour list fetch the shift vector, the first atom index and the j-list
;# bounds; then, per atom, load charge and shifted position, clear the force
;# accumulator and enter the Coulomb inner loops.
;# NOTE(review): the outer and per-atom loop labels, and several instructions,
;# are not present in this listing.
7564 sub esp
, 132 ;
# local stack space
7567 add dword ptr
[ebp
+ i3010_nsatoms
], 8
;# NOTE(review): mm2 is stored as the constant "two", but the instruction that
;# loads mm2 is not present in this listing.
7569 movq
[esp
+ i3010_two
], mm2
7570 movd mm3
, [ebp
+ i3010_tabscale
]
7572 movq
[esp
+ i3010_tsc
], mm3
7574 ;
# assume we have at least one i particle - start directly
7576 mov eax
, [ebp
+ i3010_shift
] ;
# eax = pointer into shift[]
7577 mov ebx
, [eax
] ;
# ebx=shift[n]
7578 add dword ptr
[ebp
+ i3010_shift
], 4 ;
# advance pointer one step
7580 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
7581 mov
[esp
+ i3010_is3
],ebx ;
# store is3
7583 mov eax
, [ebp
+ i3010_shiftvec
] ;
# eax = base of shiftvec[]
7585 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
7586 movd mm1
, [eax
+ ebx
*4 + 8]
;# The shift vector is kept on the stack so that it can be re-added for each
;# atom below.
7587 movq
[esp
+ i3010_shX
], mm0
7588 movd
[esp
+ i3010_shZ
], mm1
7590 mov ecx
, [ebp
+ i3010_iinr
] ;
# ecx = pointer into iinr[]
7591 add dword ptr
[ebp
+ i3010_iinr
], 4 ;
# advance pointer
7592 mov ebx
, [ecx
] ;
# ebx=ii
7594 mov eax
, [ebp
+ i3010_nsatoms
]
7596 add dword ptr
[ebp
+ i3010_nsatoms
], 12
;# NOTE(review): the instruction that loads ecx with the count stored here is
;# not visible in this listing.
7597 mov
[esp
+ i3010_nscoul
], ecx
;# NOTE(review): mm7 is stored as the cleared vctot; the instruction that
;# zeroes mm7 is not visible here.
7601 movq
[esp
+ i3010_vctot
], mm7
7602 mov
[esp
+ i3010_solnr
], ebx
7604 mov eax
, [ebp
+ i3010_jindex
]
7605 mov ecx
, [eax
] ;
# jindex[n]
7606 mov edx
, [eax
+ 4] ;
# jindex[n+1]
7607 add dword ptr
[ebp
+ i3010_jindex
], 4
7608 sub edx
, ecx ;
# number of innerloop atoms
7609 mov eax
, [ebp
+ i3010_jjnr
]
7612 mov
[esp
+ i3010_innerjjnr0
], eax ;
# pointer to jjnr[nj0]
7614 mov
[esp
+ i3010_innerk0
], edx ;
# number of innerloop atoms
;# esi = pos[] base and edi = faction[] base for the inner loops
7615 mov esi
, [ebp
+ i3010_pos
]
7616 mov edi
, [ebp
+ i3010_faction
]
7617 mov ecx
, [esp
+ i3010_nscoul
]
;# Per-atom part: take the current atom index from solnr and post-increment it.
7622 mov ebx
, [esp
+ i3010_solnr
]
7623 inc dword ptr
[esp
+ i3010_solnr
]
7624 mov edx
, [ebp
+ i3010_charge
]
7625 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
7626 pfmul mm2
, [ebp
+ i3010_facel
]
7627 punpckldq mm2
,mm2 ;
# spread to both halves
7628 movq
[esp
+ i3010_iq
], mm2 ;
# iq =facel*charge[ii]
7630 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
7631 mov eax
, [ebp
+ i3010_pos
] ;
# eax = base of pos[]
7632 mov
[esp
+ i3010_ii3
], ebx
;# i position = pos[ii3] + shift vector (x/y as a pair, z separately)
7634 movq mm0
, [eax
+ ebx
*4]
7635 movd mm1
, [eax
+ ebx
*4 + 8]
7636 pfadd mm0
, [esp
+ i3010_shX
]
7637 pfadd mm1
, [esp
+ i3010_shZ
]
7638 movq
[esp
+ i3010_ix
], mm0
7639 movd
[esp
+ i3010_iz
], mm1
7643 movq
[esp
+ i3010_fix
], mm7
7644 movd
[esp
+ i3010_fiz
], mm7
;# Restart the j list for this atom from the saved start pointer and count.
7646 mov ecx
, [esp
+ i3010_innerjjnr0
]
7647 mov
[esp
+ i3010_innerjjnr
], ecx
7648 mov edx
, [esp
+ i3010_innerk0
]
7650 mov
[esp
+ i3010_innerk
], edx ;
# number of innerloop atoms
;# NOTE(review): mov does not modify flags, and the instruction that sets the
;# flags tested by this jge is not visible in this listing.
7651 jge
.i3010_unroll_coul_loop
7652 jmp
.i3010_finish_coul_inner
;# Unrolled Coulomb inner loop of inl3010: each pass handles two j particles.
;# The first pair occupies the low half of the MMX registers and the second
;# the high half. It computes the tabulated Coulomb potential and scalar force,
;# accumulates vctot and the i force, and updates both j forces.
;# NOTE(review): several instructions of the body are not present in this listing.
7653 .i3010_unroll_coul_loop:
7654 ;
# paired innerloop starts here
7655 mov ecx
, [esp
+ i3010_innerjjnr
] ;
# pointer to jjnr[k]
;# NOTE(review): only the second index (ebx) is loaded in the visible lines;
;# the load of eax is not shown.
7657 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
7658 add dword ptr
[esp
+ i3010_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
7659 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
7661 mov ecx
, [ebp
+ i3010_charge
] ;
# base of charge[]
7662 movq mm5
, [esp
+ i3010_iq
]
7663 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
7664 punpckldq mm3
, [ecx
+ ebx
*4] ;
# move charge 2 to high part of mm3
7665 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
7667 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
7668 lea ebx
, [ebx
+ ebx
*2]
7670 mov esi
, [ebp
+ i3010_pos
]
7672 movq mm0
, [esp
+ i3010_ix
]
7673 movd mm1
, [esp
+ i3010_iz
]
7674 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
7675 movd mm5
, [esi
+ eax
*4 + 8]
7676 pfsubr mm4
,mm0 ;
# dr = ir - jr
7678 movq
[esp
+ i3010_dx1
], mm4 ;
# store dr
7679 movd
[esp
+ i3010_dz1
], mm5
7680 pfmul mm4
,mm4 ;
# square dx,dy,dz
7682 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
7683 pfacc mm4
, mm5 ;
# first rsq in lower mm4
7685 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
7686 movd mm7
, [esi
+ ebx
*4 + 8]
7688 pfsubr mm6
,mm0 ;
# dr = ir - jr
7690 movq
[esp
+ i3010_dx2
], mm6 ;
# store dr
7691 movd
[esp
+ i3010_dz2
], mm7
7692 pfmul mm6
,mm6 ;
# square dx,dy,dz
7694 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
7695 pfacc mm6
, mm7 ;
# second rsq in lower mm6
7697 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
7702 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
7703 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision.
7709 ;
# mm0 is invsqrt, and mm1 r.
7710 ;
# do potential and fscal
7711 pfmul mm1
, [esp
+ i3010_tsc
] ;
# mm1=rt
;# The two table indices are spilled to the stack so that they can be
;# reloaded into ecx as integer indices (n1 and n1+4 below).
7713 movq
[esp
+ i3010_n1
], mm4
7715 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
7718 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
7720 mov edx
, [ebp
+ i3010_VFtab
]
7721 mov ecx
, [esp
+ i3010_n1
]
7724 ;
# load all the table values we need
;# Four consecutive dwords at VFtab[index] go to the low halves of mm4-mm7
;# for the first pair.
7725 movd mm4
, [edx
+ ecx
*4]
7726 movd mm5
, [edx
+ ecx
*4 + 4]
7727 movd mm6
, [edx
+ ecx
*4 + 8]
7728 movd mm7
, [edx
+ ecx
*4 + 12]
7729 mov ecx
, [esp
+ i3010_n1
+ 4]
;# The same four entries for the second pair are merged into the high halves.
7731 punpckldq mm4
, [edx
+ ecx
*4]
7732 punpckldq mm5
, [edx
+ ecx
*4 + 4]
7733 punpckldq mm6
, [edx
+ ecx
*4 + 8]
7734 punpckldq mm7
, [edx
+ ecx
*4 + 12]
7736 pfmul mm6
, mm1 ;
# mm6 = Geps
7737 pfmul mm7
, mm2 ;
# mm7 = Heps2
7740 pfadd mm5
, mm7 ;
# mm5 = Fp
7742 pfmul mm7
, [esp
+ i3010_two
] ;
# two*Heps2
7744 pfadd mm7
, mm5 ;
# mm7=FF
7746 pfmul mm5
, mm1 ;
# mm5=eps*Fp
7747 pfadd mm5
, mm4 ;
# mm5= VV
7749 pfmul mm5
, mm3 ;
# vcoul=qq*VV
7750 pfmul mm3
, mm7 ;
# fijC=FF*qq
7752 ;
# at this point mm5 contains vcoul and mm3 fijC
7753 ;
# increment vcoul - then we can get rid of mm5
7755 pfadd mm5
, [esp
+ i3010_vctot
] ;
# add the earlier value
7756 movq
[esp
+ i3010_vctot
], mm5 ;
# store the sum
7758 ;
# change sign of mm3
;# NOTE(review): the instructions that actually negate mm3 are not visible here.
7761 pfmul mm1
, [esp
+ i3010_tsc
]
7762 pfmul mm0
, mm1 ;
# mm0 is total fscal now
7764 prefetchw
[esp
+ i3010_dx1
] ;
# prefetch i forces to cache
7766 ;
# spread fscalar to both positions
7771 ;
# calc vector force
7772 prefetchw
[edi
+ eax
*4] ;
# prefetch the 1st faction to cache
7773 movq mm2
, [esp
+ i3010_dx1
] ;
# fetch dr
7774 movd mm3
, [esp
+ i3010_dz1
]
7776 prefetchw
[edi
+ ebx
*4] ;
# prefetch the 2nd faction to cache
7777 pfmul mm2
, mm0 ;
# mult by fs
7780 movq mm4
, [esp
+ i3010_dx2
] ;
# fetch dr
7781 movd mm5
, [esp
+ i3010_dz2
]
7782 pfmul mm4
, mm1 ;
# mult by fs
;# Reload and store the running i force.
;# NOTE(review): the adds between load and store are not visible in this listing.
7786 movq mm0
, [esp
+ i3010_fix
]
7787 movd mm1
, [esp
+ i3010_fiz
]
7793 movq
[esp
+ i3010_fix
], mm0
7794 movd
[esp
+ i3010_fiz
], mm1
;# Load both j forces from faction[] (edi) and write them back.
;# NOTE(review): the intervening updates are likewise not visible.
7797 movq mm0
, [edi
+ eax
*4]
7798 movd mm1
, [edi
+ eax
*4 + 8]
7799 movq mm6
, [edi
+ ebx
*4]
7800 movd mm7
, [edi
+ ebx
*4 + 8]
7807 movq
[edi
+ eax
*4], mm0
7808 movd
[edi
+ eax
*4 +8], mm1
7809 movq
[edi
+ ebx
*4], mm6
7810 movd
[edi
+ ebx
*4 + 8], mm7
7812 ;
# should we do one more iteration?
;# innerk is decremented by two per pass; a negative result (jl) leaves the loop.
7813 sub dword ptr
[esp
+ i3010_innerk
], 2
7814 jl
.i3010_finish_coul_inner
7815 jmp
.i3010_unroll_coul_loop
.i3010_finish_coul_inner:
        ;# Reduce the remaining-count slot to its lowest bit (the slot itself is
        ;# overwritten with that bit). ZF set means no leftover j particle, so go
        ;# straight to the outer-data update; otherwise run one single-particle pass.
        and     dword ptr [esp + i3010_innerk], 1
        jz      .i3010_updateouterdata_coul
        jmp     .i3010_single_coul_inner
;# Single-particle tail of the inl3010 Coulomb inner loop: same computation as
;# the unrolled loop, but for one j particle using only the low register halves.
;# NOTE(review): several instructions of this body are not present in this
;# listing (see the gaps in the embedded numbering).
7820 .i3010_single_coul_inner:
7821 ;
# a single j particle iteration here - compare with the unrolled code for comments.
7822 mov eax
, [esp
+ i3010_innerjjnr
]
7823 mov eax
, [eax
] ;
# eax=jnr offset
7825 mov ecx
, [ebp
+ i3010_charge
]
7826 movd mm5
, [esp
+ i3010_iq
]
7827 movd mm3
, [ecx
+ eax
*4]
7828 pfmul mm3
, mm5 ;
# mm3=qq
7830 mov esi
, [ebp
+ i3010_pos
]
;# eax = 3*jnr, used as a dword index into pos[] and faction[]
7831 lea eax
, [eax
+ eax
*2]
7833 movq mm0
, [esp
+ i3010_ix
]
7834 movd mm1
, [esp
+ i3010_iz
]
7835 movq mm4
, [esi
+ eax
*4]
7836 movd mm5
, [esi
+ eax
*4 + 8]
7839 movq
[esp
+ i3010_dx1
], mm4
7841 movd
[esp
+ i3010_dz1
], mm5
7844 pfacc mm4
, mm5 ;
# the pfacc destination is mm4, so rsq accumulates there (this note used to say mm0)
7850 pfrcpit2 mm0
,mm2 ;
# mm0=invsqrt (pfrcpit2 writes its first operand)
7853 ;
# mm0 is invsqrt, and mm1 r.
7855 ;
# calculate potentials and scalar force
7856 pfmul mm1
, [esp
+ i3010_tsc
] ;
# mm1=rt
;# Only one table index is needed here, so a 4-byte movd spill is enough.
7858 movd
[esp
+ i3010_n1
], mm4
7860 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
7863 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
7866 mov edx
, [ebp
+ i3010_VFtab
]
7867 mov ecx
, [esp
+ i3010_n1
]
7869 ;
# load all the table values we need
7870 movd mm4
, [edx
+ ecx
*4]
7871 movd mm5
, [edx
+ ecx
*4 + 4]
7872 movd mm6
, [edx
+ ecx
*4 + 8]
7873 movd mm7
, [edx
+ ecx
*4 + 12]
7875 pfmul mm6
, mm1 ;
# mm6 = Geps
7876 pfmul mm7
, mm2 ;
# mm7 = Heps2
7879 pfadd mm5
, mm7 ;
# mm5 = Fp
7881 pfmul mm7
, [esp
+ i3010_two
] ;
# two*Heps2
7883 pfadd mm7
, mm5 ;
# mm7=FF
7885 pfmul mm5
, mm1 ;
# mm5=eps*Fp
7886 pfadd mm5
, mm4 ;
# mm5= VV
7888 pfmul mm5
, mm3 ;
# vcoul=qq*VV
7889 pfmul mm3
, mm7 ;
# fijC=FF*qq
7891 ;
# at this point mm5 contains vcoul and mm3 fijC
7892 ;
# increment vcoul - then we can get rid of mm5
7894 pfadd mm5
, [esp
+ i3010_vctot
] ;
# add the earlier value
7895 movq
[esp
+ i3010_vctot
], mm5 ;
# store the sum
7897 ;
# change sign of mm3
;# NOTE(review): the instructions that negate mm3 are not visible here.
7900 pfmul mm0
, [esp
+ i3010_tsc
]
7901 pfmul mm0
, mm1 ;
# mm0 is total fscal now
7903 ;
# spread fscalar to both positions
7905 ;
# calc vectorial force
7906 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
7907 movq mm2
, [esp
+ i3010_dx1
]
7908 movd mm3
, [esp
+ i3010_dz1
]
7914 ;
# update i particle force
7915 movq mm0
, [esp
+ i3010_fix
]
7916 movd mm1
, [esp
+ i3010_fiz
]
7919 movq
[esp
+ i3010_fix
], mm0
7920 movd
[esp
+ i3010_fiz
], mm1
7921 ;
# update j particle force
7922 movq mm0
, [edi
+ eax
*4]
7923 movd mm1
, [edi
+ eax
*4+ 8]
7926 movq
[edi
+ eax
*4], mm0
7927 movd
[edi
+ eax
*4 +8], mm1
;# Outer-loop bookkeeping for inl3010, run after the inner loops for one atom:
;# add the accumulated i force to faction[ii3] and fshift[is3], count down the
;# nscoul counter, fold vctot into Vc[], and handle the remaining-i counter.
7929 .i3010_updateouterdata_coul:
7930 mov ecx
, [esp
+ i3010_ii3
]
7932 movq mm6
, [edi
+ ecx
*4] ;
# increment i force
7933 movd mm7
, [edi
+ ecx
*4 + 8]
7934 pfadd mm6
, [esp
+ i3010_fix
]
7935 pfadd mm7
, [esp
+ i3010_fiz
]
7936 movq
[edi
+ ecx
*4], mm6
7937 movd
[edi
+ ecx
*4 +8], mm7
7939 mov ebx
, [ebp
+ i3010_fshift
] ;
# increment fshift force
7940 mov edx
, [esp
+ i3010_is3
]
7942 movq mm6
, [ebx
+ edx
*4]
7943 movd mm7
, [ebx
+ edx
*4 + 8]
7944 pfadd mm6
, [esp
+ i3010_fix
]
7945 pfadd mm7
, [esp
+ i3010_fiz
]
7946 movq
[ebx
+ edx
*4], mm6
7947 movd
[ebx
+ edx
*4 + 8], mm7
;# NOTE(review): the branch that consumes the flags set by this dec is not
;# visible in this listing.
7950 dec dword ptr
[esp
+ i3010_nscoul
]
;# NOTE(review): edx receives the gid pointer itself here; the dereference
;# that would fetch the group index is not visible in this listing.
7954 mov edx
, [ebp
+ i3010_gid
] ;
# get group index for this i particle
7956 add dword ptr
[ebp
+ i3010_gid
], 4 ;
# advance pointer
7958 movq mm7
, [esp
+ i3010_vctot
]
7959 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
7961 mov eax
, [ebp
+ i3010_Vc
]
;# NOTE(review): no add of mm7 into mm6 appears between this load and the
;# store below; as listed, the value would be written back unchanged.
;# Confirm against the complete file.
7962 movd mm6
, [eax
+ edx
*4]
7964 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
;# NOTE(review): the decrement and exit test on the nri counter are not
;# visible between this load and the store that follows.
7966 mov ecx
, [ebp
+ i3010_nri
]
7969 ;
# not last, iterate once more!
7970 mov
[ebp
+ i3010_nri
], ecx
7987 .globl inl3020_3dnow
7988 .globl _inl3020_3dnow
7992 .equiv i3020_iinr, 12
7993 .equiv i3020_jindex, 16
7994 .equiv i3020_jjnr, 20
7995 .equiv i3020_shift, 24
7996 .equiv i3020_shiftvec, 28
7997 .equiv i3020_fshift, 32
7998 .equiv i3020_gid, 36
7999 .equiv i3020_pos, 40
8000 .equiv i3020_faction, 44
8001 .equiv i3020_charge, 48
8002 .equiv i3020_facel, 52
8004 .equiv i3020_tabscale, 60
8005 .equiv i3020_VFtab, 64
8006 ;
# stack offsets for local variables
8010 .equiv i3020_iyO, 12
8011 .equiv i3020_izO, 16
8012 .equiv i3020_ixH, 20
8013 .equiv i3020_iyH, 28
8014 .equiv i3020_izH, 36
8015 .equiv i3020_iqO, 44
8016 .equiv i3020_iqH, 52
8017 .equiv i3020_qqO, 60
8018 .equiv i3020_qqH, 68
8019 .equiv i3020_vctot, 76
8020 .equiv i3020_two, 84
8022 .equiv i3020_tsc, 100
8023 .equiv i3020_innerjjnr, 108
8024 .equiv i3020_innerk, 112
8025 .equiv i3020_fixO, 116
8026 .equiv i3020_fiyO, 120
8027 .equiv i3020_fizO, 124
8028 .equiv i3020_fixH, 128
8029 .equiv i3020_fiyH, 136
8030 .equiv i3020_fizH, 144
8031 .equiv i3020_dxO, 152
8032 .equiv i3020_dyO, 156
8033 .equiv i3020_dzO, 160
8034 .equiv i3020_dxH, 164
8035 .equiv i3020_dyH, 172
8036 .equiv i3020_dzH, 180
8037 .equiv i3020_tmprsqH, 188
8046 sub esp
, 196 ;
# local stack space
8049 mov ecx
, [ebp
+ i3020_iinr
] ;
# ecx = pointer into iinr[]
8050 mov ebx
, [ecx
] ;
# ebx=ii
8052 mov edx
, [ebp
+ i3020_charge
]
8053 movd mm1
, [ebp
+ i3020_facel
]
8054 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii0]
8056 movq
[esp
+ i3020_iqO
], mm2 ;
# iqO = facel*charge[ii]
8058 movd mm2
, [edx
+ ebx
*4 + 4] ;
# mm2=charge[ii0+1]
8060 punpckldq mm2
,mm2 ;
# spread to both halves
8061 movq
[esp
+ i3020_iqH
], mm2 ;
# iqH = facel*charge[ii0+1]
8064 movd mm4
, [ebp
+ i3020_tabscale
]
8065 punpckldq mm4
,mm4 ;
# spread to both halves
8066 movq
[esp
+ i3020_two
], mm3
8067 movq
[esp
+ i3020_tsc
], mm4
8068 ;
# assume we have at least one i particle - start directly
8070 mov eax
, [ebp
+ i3020_shift
] ;
# eax = pointer into shift[]
8071 mov ebx
, [eax
] ;
# ebx=shift[n]
8072 add dword ptr
[ebp
+ i3020_shift
], 4 ;
# advance pointer one step
8074 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
8075 mov
[esp
+ i3020_is3
],ebx ;
# store is3
8077 mov eax
, [ebp
+ i3020_shiftvec
] ;
# eax = base of shiftvec[]
8079 movq mm5
, [eax
+ ebx
*4] ;
# move shX/shY to mm5 and shZ to mm6.
8080 movd mm6
, [eax
+ ebx
*4 + 8]
8084 punpckldq mm0
,mm0 ;
# also expand shX,Y,Z in mm0--mm2.
8088 mov ecx
, [ebp
+ i3020_iinr
] ;
# ecx = pointer into iinr[]
8089 add dword ptr
[ebp
+ i3020_iinr
], 4 ;
# advance pointer
8090 mov ebx
, [ecx
] ;
# ebx=ii
8092 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
8093 mov eax
, [ebp
+ i3020_pos
] ;
# eax = base of pos[]
8095 pfadd mm5
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
8096 movd mm7
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
8097 mov
[esp
+ i3020_ii3
], ebx ;
# (use mm7 as temp. storage for iz.)
8099 movq
[esp
+ i3020_ixO
], mm5
8100 movq
[esp
+ i3020_izO
], mm6
8102 movd mm3
, [eax
+ ebx
*4 + 12]
8103 movd mm4
, [eax
+ ebx
*4 + 16]
8104 movd mm5
, [eax
+ ebx
*4 + 20]
8105 punpckldq mm3
, [eax
+ ebx
*4 + 24]
8106 punpckldq mm4
, [eax
+ ebx
*4 + 28]
8107 punpckldq mm5
, [eax
+ ebx
*4 + 32] ;
# coords of H1 in low mm3-mm5, H2 in high
8112 movq
[esp
+ i3020_ixH
], mm0
8113 movq
[esp
+ i3020_iyH
], mm1
8114 movq
[esp
+ i3020_izH
], mm2
8116 ;
# clear vctot and i forces
8118 movq
[esp
+ i3020_vctot
], mm7
8119 movq
[esp
+ i3020_fixO
], mm7
8120 movd
[esp
+ i3020_fizO
], mm7
8121 movq
[esp
+ i3020_fixH
], mm7
8122 movq
[esp
+ i3020_fiyH
], mm7
8123 movq
[esp
+ i3020_fizH
], mm7
8125 mov eax
, [ebp
+ i3020_jindex
]
8126 mov ecx
, [eax
] ;
# jindex[n]
8127 mov edx
, [eax
+ 4] ;
# jindex[n+1]
8128 add dword ptr
[ebp
+ i3020_jindex
], 4
8129 sub edx
, ecx ;
# number of innerloop atoms
8130 mov
[esp
+ i3020_innerk
], edx
8132 mov esi
, [ebp
+ i3020_pos
]
8133 mov edi
, [ebp
+ i3020_faction
]
8134 mov eax
, [ebp
+ i3020_jjnr
]
8137 mov
[esp
+ i3020_innerjjnr
], eax ;
# pointer to jjnr[nj0]
8139 ;
# a single j particle iteration
8140 mov eax
, [esp
+ i3020_innerjjnr
]
8141 mov eax
, [eax
] ;
# eax=jnr offset
8142 add dword ptr
[esp
+ i3020_innerjjnr
], 4 ;
# advance pointer
8143 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
8145 mov ecx
, [ebp
+ i3020_charge
]
8146 movd mm7
, [ecx
+ eax
*4]
8149 pfmul mm6
, [esp
+ i3020_iqO
]
8150 pfmul mm7
, [esp
+ i3020_iqH
] ;
# mm6=qqO, mm7=qqH
8151 movd
[esp
+ i3020_qqO
], mm6
8152 movq
[esp
+ i3020_qqH
], mm7
8154 lea eax
, [eax
+ eax
*2]
8156 movq mm0
, [esi
+ eax
*4]
8157 movd mm1
, [esi
+ eax
*4 + 8]
8158 ;
# copy & expand to mm2-mm4 for the H interactions
8166 pfsubr mm0
, [esp
+ i3020_ixO
]
8167 pfsubr mm1
, [esp
+ i3020_izO
]
8169 movq
[esp
+ i3020_dxO
], mm0
8171 movd
[esp
+ i3020_dzO
], mm1
8174 pfadd mm0
, mm1 ;
# mm0=rsqO
8178 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
8179 pfsubr mm2
, [esp
+ i3020_ixH
]
8180 pfsubr mm3
, [esp
+ i3020_iyH
]
8181 pfsubr mm4
, [esp
+ i3020_izH
] ;
# mm2-mm4 is dxH-dzH
8183 movq
[esp
+ i3020_dxH
], mm2
8184 movq
[esp
+ i3020_dyH
], mm3
8185 movq
[esp
+ i3020_dzH
], mm4
8191 pfadd mm3
,mm4 ;
# mm3=rsqH
8192 movq
[esp
+ i3020_tmprsqH
], mm3
8199 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
8201 pfmul mm0
, mm1 ;
# mm0=r
8203 pfmul mm0
, [esp
+ i3020_tsc
]
8205 movd
[esp
+ i3020_n1
], mm4
8207 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
8209 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
8212 mov edx
, [ebp
+ i3020_VFtab
]
8213 mov ecx
, [esp
+ i3020_n1
]
8215 ;
# load all values we need
8216 movd mm4
, [edx
+ ecx
*4]
8217 movd mm5
, [edx
+ ecx
*4 + 4]
8218 movd mm6
, [edx
+ ecx
*4 + 8]
8219 movd mm7
, [edx
+ ecx
*4 + 12]
8221 pfmul mm6
, mm0 ;
# mm6 = Geps
8222 pfmul mm7
, mm2 ;
# mm7 = Heps2
8225 pfadd mm5
, mm7 ;
# mm5 = Fp
8227 pfmul mm7
, [esp
+ i3020_two
] ;
# two*Heps2
8229 pfadd mm7
, mm5 ;
# mm7=FF
8231 pfmul mm5
, mm0 ;
# mm5=eps*Fp
8232 pfadd mm5
, mm4 ;
# mm5= VV
8234 pfmul mm5
, [esp
+ i3020_qqO
] ;
# vcoul=qq*VV
8235 pfmul mm7
, [esp
+ i3020_qqO
] ;
# fijC=qq*FF
8236 ;
# update vctot directly, use mm3 for fscal sum.
8237 pfadd mm5
, [esp
+ i3020_vctot
]
8238 movq
[esp
+ i3020_vctot
], mm5
8241 ;
# change sign of fscal and multiply with rinv
8244 pfmul mm3
, [esp
+ i3020_tsc
]
8245 pfmul mm3
, mm1 ;
# mm3 is total fscal (for the oxygen) now
8247 ;
# Ready with the oxygen - potential is updated, fscal is in mm3.
8248 ;
# now do the two hydrogens.
8250 movq mm0
, [esp
+ i3020_tmprsqH
] ;
# mm0=rsqH
8256 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
8261 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
8263 pfmul mm0
,mm1 ;
# mm0=r
8264 pfmul mm0
, [esp
+ i3020_tsc
]
8266 movq
[esp
+ i3020_n1
], mm4
8268 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
8270 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
8273 mov edx
, [ebp
+ i3020_VFtab
]
8274 mov ecx
, [esp
+ i3020_n1
]
8276 ;
# load all values we need
8277 movd mm4
, [edx
+ ecx
*4]
8278 movd mm5
, [edx
+ ecx
*4 + 4]
8279 movd mm6
, [edx
+ ecx
*4 + 8]
8280 movd mm7
, [edx
+ ecx
*4 + 12]
8281 mov ecx
, [esp
+ i3020_n1
+ 4]
8283 punpckldq mm4
, [edx
+ ecx
*4]
8284 punpckldq mm5
, [edx
+ ecx
*4 + 4]
8285 punpckldq mm6
, [edx
+ ecx
*4 + 8]
8286 punpckldq mm7
, [edx
+ ecx
*4 + 12]
8288 pfmul mm6
, mm0 ;
# mm6 = Geps
8289 pfmul mm7
, mm2 ;
# mm7 = Heps2
8292 pfadd mm5
, mm7 ;
# mm5 = Fp
8294 pfmul mm7
, [esp
+ i3020_two
] ;
# two*Heps2
8296 pfadd mm7
, mm5 ;
# mm7=FF
8298 pfmul mm5
, mm0 ;
# mm5=eps*Fp
8299 pfadd mm5
, mm4 ;
# mm5= VV
8301 pfmul mm5
, [esp
+ i3020_qqH
] ;
# vcoul=qq*VV
8302 pfmul mm7
, [esp
+ i3020_qqH
] ;
# fijC=qq*FF
8305 pfadd mm5
, [esp
+ i3020_vctot
]
8306 movq
[esp
+ i3020_vctot
], mm5
8308 ;
# change sign of fijC and multiply by rinv
8311 pfmul mm4
, [esp
+ i3020_tsc
]
8312 pfmul mm4
, mm1 ;
# mm4 is total fscal (for the hydrogens) now
8314 ;
# spread oxygen fscalar to both positions
8316 ;
# calc vectorial force for O
8317 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
8318 movq mm0
, [esp
+ i3020_dxO
]
8319 movd mm1
, [esp
+ i3020_dzO
]
8323 ;
# calc vectorial force for H's
8324 movq mm5
, [esp
+ i3020_dxH
]
8325 movq mm6
, [esp
+ i3020_dyH
]
8326 movq mm7
, [esp
+ i3020_dzH
]
8331 ;
# update iO particle force
8332 movq mm2
, [esp
+ i3020_fixO
]
8333 movd mm3
, [esp
+ i3020_fizO
]
8336 movq
[esp
+ i3020_fixO
], mm2
8337 movd
[esp
+ i3020_fizO
], mm3
8340 movq mm2
, [esp
+ i3020_fixH
]
8341 movq mm3
, [esp
+ i3020_fiyH
]
8342 movq mm4
, [esp
+ i3020_fizH
]
8346 movq
[esp
+ i3020_fixH
], mm2
8347 movq
[esp
+ i3020_fiyH
], mm3
8348 movq
[esp
+ i3020_fizH
], mm4
8350 ;
# pack j forces from H in the same form as the oxygen force.
8351 pfacc mm5
, mm6 ;
# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
8352 pfacc mm7
, mm7 ;
# mm7(l)=fjz(H1+ h2)
8354 pfadd mm0
, mm5 ;
# add up total force on j particle.
8357 ;
# update j particle force
8358 movq mm2
, [edi
+ eax
*4]
8359 movd mm3
, [edi
+ eax
*4 + 8]
8362 movq
[edi
+ eax
*4], mm2
8363 movd
[edi
+ eax
*4 + 8], mm3
8366 dec dword ptr
[esp
+ i3020_innerk
]
8367 jz
.i3020_updateouterdata
8368 jmp
.i3020_inner_loop
8369 .i3020_updateouterdata:
8370 mov ecx
, [esp
+ i3020_ii3
]
8372 movq mm6
, [edi
+ ecx
*4] ;
# increment iO force
8373 movd mm7
, [edi
+ ecx
*4 + 8]
8374 pfadd mm6
, [esp
+ i3020_fixO
]
8375 pfadd mm7
, [esp
+ i3020_fizO
]
8376 movq
[edi
+ ecx
*4], mm6
8377 movd
[edi
+ ecx
*4 +8], mm7
8379 movq mm0
, [esp
+ i3020_fixH
]
8380 movq mm3
, [esp
+ i3020_fiyH
]
8381 movq mm1
, [esp
+ i3020_fizH
]
8383 punpckldq mm0
, mm3 ;
# mm0(l)=fxH1, mm0(h)=fyH1
8384 punpckhdq mm2
, mm3 ;
# mm2(l)=fxH2, mm2(h)=fyH2
8390 movq mm6
, [edi
+ ecx
*4 + 12] ;
# increment iH1 force
8391 movd mm7
, [edi
+ ecx
*4 + 20]
8394 movq
[edi
+ ecx
*4 + 12], mm6
8395 movd
[edi
+ ecx
*4 + 20], mm7
8397 movq mm6
, [edi
+ ecx
*4 + 24] ;
# increment iH2 force
8398 movd mm7
, [edi
+ ecx
*4 + 32]
8401 movq
[edi
+ ecx
*4 + 24], mm6
8402 movd
[edi
+ ecx
*4 + 32], mm7
8405 mov ebx
, [ebp
+ i3020_fshift
] ;
# increment fshift force
8406 mov edx
, [esp
+ i3020_is3
]
8408 movq mm6
, [ebx
+ edx
*4]
8409 movd mm7
, [ebx
+ edx
*4 + 8]
8410 pfadd mm6
, [esp
+ i3020_fixO
]
8411 pfadd mm7
, [esp
+ i3020_fizO
]
8416 movq
[ebx
+ edx
*4], mm6
8417 movd
[ebx
+ edx
*4 + 8], mm7
8419 mov edx
, [ebp
+ i3020_gid
] ;
# get group index for this i particle
8421 add dword ptr
[ebp
+ i3020_gid
], 4 ;
# advance pointer
8423 movq mm7
, [esp
+ i3020_vctot
]
8424 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
8426 mov eax
, [ebp
+ i3020_Vc
]
8427 movd mm6
, [eax
+ edx
*4]
8429 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
8432 dec dword ptr
[ebp
+ i3020_nri
]
8434 ;
# not last, iterate once more!
; # inl3030_3dnow is exported under both the plain and the underscore-prefixed symbol name.
8450 .globl inl3030_3dnow
8451 .globl _inl3030_3dnow
; # Byte offsets of the routine's arguments; the code uses them relative to ebp.
8455 .equiv i3030_iinr, 12
8456 .equiv i3030_jindex, 16
8457 .equiv i3030_jjnr, 20
8458 .equiv i3030_shift, 24
8459 .equiv i3030_shiftvec, 28
8460 .equiv i3030_fshift, 32
8461 .equiv i3030_gid, 36
8462 .equiv i3030_pos, 40
8463 .equiv i3030_faction, 44
8464 .equiv i3030_charge, 48
8465 .equiv i3030_facel, 52
8467 .equiv i3030_tabscale, 60
8468 .equiv i3030_VFtab, 64
8469 ;
# stack offsets for local variables
; # These are used relative to esp. The hydrogen slots (ixH..izH, fixH..fizH,
; # dxH..dzH, tmprsqH) are 8 bytes wide and accessed with movq: H1 in the low
; # float, H2 in the high float. The oxygen x/y pairs are moved together with
; # one movq at the x offset, the z component separately.
8473 .equiv i3030_iyO, 12
8474 .equiv i3030_izO, 16
8475 .equiv i3030_ixH, 20
8476 .equiv i3030_iyH, 28
8477 .equiv i3030_izH, 36
; # charge products; the setup code stores them already multiplied by facel
8478 .equiv i3030_qqOO, 44
8479 .equiv i3030_qqOH, 52
8480 .equiv i3030_qqHH, 60
8481 .equiv i3030_two, 68
8483 .equiv i3030_tsc, 84
8484 .equiv i3030_vctot, 92
8485 .equiv i3030_innerjjnr, 100
8486 .equiv i3030_innerk, 104
; # force accumulators for the i particle
8487 .equiv i3030_fixO, 108
8488 .equiv i3030_fiyO, 112
8489 .equiv i3030_fizO, 116
8490 .equiv i3030_fixH, 120
8491 .equiv i3030_fiyH, 128
8492 .equiv i3030_fizH, 136
; # distance vectors saved between the potential and the force calculation
8493 .equiv i3030_dxO, 144
8494 .equiv i3030_dyO, 148
8495 .equiv i3030_dzO, 152
8496 .equiv i3030_dxH, 156
8497 .equiv i3030_dyH, 164
8498 .equiv i3030_dzH, 172
8499 .equiv i3030_tmprsqH, 180
; # Setup: reserve the local frame, build the facel-scaled charge products,
; # then prepare the shifted i coordinates, cleared accumulators and the
; # j-list pointers used by the inner loop.
8508 sub esp
, 188 ;
# local stack space
8510 ;
# assume we have at least one i particle - start directly
; # iinr is only read here, not advanced; the same entry is read again
; # (and the pointer advanced) when the i coordinates are fetched below.
8512 mov ecx
, [ebp
+ i3030_iinr
] ;
# ecx = pointer into iinr[]
8513 mov ebx
, [ecx
] ;
# ebx=ii
8515 mov edx
, [ebp
+ i3030_charge
]
8516 movd mm1
, [ebp
+ i3030_facel
] ;
# mm1=facel
8517 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii0] (O)
8518 movd mm3
, [edx
+ ebx
*4 + 4] ;
# mm3=charge[ii0+1] (H)
8524 pfmul mm4
, mm2 ;
# mm4=qqOO*facel
8525 pfmul mm5
, mm3 ;
# mm5=qqOH*facel
8526 pfmul mm6
, mm3 ;
# mm6=qqHH*facel
8527 punpckldq mm5
,mm5 ;
# spread to both halves
8528 punpckldq mm6
,mm6 ;
# spread to both halves
8529 movq
[esp
+ i3030_qqOO
], mm4
8530 movq
[esp
+ i3030_qqOH
], mm5
8531 movq
[esp
+ i3030_qqHH
], mm6
8533 movq
[esp
+ i3030_two
], mm2
; # the tabscale argument is kept in the i3030_tsc stack slot for the inner loop
8534 movd mm3
, [ebp
+ i3030_tabscale
]
8536 movq
[esp
+ i3030_tsc
], mm3
8538 mov eax
, [ebp
+ i3030_shift
] ;
# eax = pointer into shift[]
8539 mov ebx
, [eax
] ;
# ebx=shift[n]
8540 add dword ptr
[ebp
+ i3030_shift
], 4 ;
# advance pointer one step
8542 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
8543 mov
[esp
+ i3030_is3
],ebx ;
# store is3
8545 mov eax
, [ebp
+ i3030_shiftvec
] ;
# eax = base of shiftvec[]
8547 movq mm5
, [eax
+ ebx
*4] ;
# move shX/shY to mm5 and shZ to mm6.
8548 movd mm6
, [eax
+ ebx
*4 + 8]
8552 punpckldq mm0
,mm0 ;
# also expand shX,Y,Z in mm0--mm2.
8556 mov ecx
, [ebp
+ i3030_iinr
] ;
# ecx = pointer into iinr[]
8557 add dword ptr
[ebp
+ i3030_iinr
], 4 ;
# advance pointer
8558 mov ebx
, [ecx
] ;
# ebx=ii
8560 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
8561 mov eax
, [ebp
+ i3030_pos
] ;
# eax = base of pos[]
8563 pfadd mm5
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
8564 movd mm7
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
8565 mov
[esp
+ i3030_ii3
], ebx ;
# (use mm7 as temp. storage for iz.)
8567 movq
[esp
+ i3030_ixO
], mm5
; # 8-byte store at izO (offset 16): it also writes the first 4 bytes of
; # ixH (offset 20), which are rewritten when ixH is stored further down.
8568 movq
[esp
+ i3030_izO
], mm6
8570 movd mm3
, [eax
+ ebx
*4 + 12]
8571 movd mm4
, [eax
+ ebx
*4 + 16]
8572 movd mm5
, [eax
+ ebx
*4 + 20]
8573 punpckldq mm3
, [eax
+ ebx
*4 + 24]
8574 punpckldq mm4
, [eax
+ ebx
*4 + 28]
8575 punpckldq mm5
, [eax
+ ebx
*4 + 32] ;
# coords of H1 in low mm3-mm5, H2 in high
8580 movq
[esp
+ i3030_ixH
], mm0
8581 movq
[esp
+ i3030_iyH
], mm1
8582 movq
[esp
+ i3030_izH
], mm2
8584 ;
# clear vctot and i forces
; # Every store below is a full 8-byte movq: the fixO store also covers fiyO,
; # and the fizO store (offset 116) spills into the low half of fixH
; # (offset 120), which the following store writes again.
8586 movq
[esp
+ i3030_vctot
], mm7
8587 movq
[esp
+ i3030_fixO
], mm7
8588 movq
[esp
+ i3030_fizO
], mm7
8589 movq
[esp
+ i3030_fixH
], mm7
8590 movq
[esp
+ i3030_fiyH
], mm7
8591 movq
[esp
+ i3030_fizH
], mm7
8593 mov eax
, [ebp
+ i3030_jindex
]
8594 mov ecx
, [eax
] ;
# jindex[n]
8595 mov edx
, [eax
+ 4] ;
# jindex[n+1]
8596 add dword ptr
[ebp
+ i3030_jindex
], 4
8597 sub edx
, ecx ;
# number of innerloop atoms
8598 mov
[esp
+ i3030_innerk
], edx ;
# number of innerloop atoms
; # esi = pos base and edi = faction base; both are used as such by the inner loop
8600 mov esi
, [ebp
+ i3030_pos
]
8601 mov edi
, [ebp
+ i3030_faction
]
8602 mov eax
, [ebp
+ i3030_jjnr
]
8605 mov
[esp
+ i3030_innerjjnr
], eax ;
# pointer to jjnr[nj0]
; # Inner loop, part 1: the first atom of the j particle (coordinates at
; # pos[3*jnr], presumably the j oxygen) against the i oxygen (scaled by qqOO)
; # and then against both i hydrogens at once (scaled by qqOH).
8607 ;
# a single j particle iteration here - compare with the unrolled code for comments.
8608 mov eax
, [esp
+ i3030_innerjjnr
]
8609 mov eax
, [eax
] ;
# eax=jnr offset
8610 add dword ptr
[esp
+ i3030_innerjjnr
], 4 ;
# advance pointer
; # eax = 3*jnr, the index of this j atom in pos[] and faction[]
8612 lea eax
, [eax
+ eax
*2]
8614 movq mm0
, [esi
+ eax
*4]
8615 movd mm1
, [esi
+ eax
*4 + 8]
8616 ;
# copy & expand to mm2-mm4 for the H interactions
; # pfsubr computes source - destination, so mm0/mm1 become i - j
8624 pfsubr mm0
, [esp
+ i3030_ixO
]
8625 pfsubr mm1
, [esp
+ i3030_izO
]
8627 movq
[esp
+ i3030_dxO
], mm0
8629 movd
[esp
+ i3030_dzO
], mm1
8632 pfadd mm0
, mm1 ;
# mm0=rsqO
8636 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
8637 pfsubr mm2
, [esp
+ i3030_ixH
]
8638 pfsubr mm3
, [esp
+ i3030_iyH
]
8639 pfsubr mm4
, [esp
+ i3030_izH
] ;
# mm2-mm4 is dxH-dzH
8641 movq
[esp
+ i3030_dxH
], mm2
8642 movq
[esp
+ i3030_dyH
], mm3
8643 movq
[esp
+ i3030_dzH
], mm4
8649 pfadd mm3
,mm4 ;
# mm3=rsqH
; # rsqH is parked on the stack until the oxygen interaction is finished
8650 movq
[esp
+ i3030_tmprsqH
], mm3
8657 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
8658 pfmul mm0
, mm1 ;
# mm0=r
8660 pfmul mm0
, [esp
+ i3030_tsc
]
; # the table index travels through the i3030_n1 stack slot to reach ecx
8662 movd
[esp
+ i3030_n1
], mm4
8664 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
8666 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
8669 mov edx
, [ebp
+ i3030_VFtab
]
8670 mov ecx
, [esp
+ i3030_n1
]
8673 ;
# load all values we need
; # mm4..mm7 = four consecutive floats of the table entry at index ecx; the
; # code below uses them as the constant term, F, G and H of the VV polynomial
8674 movd mm4
, [edx
+ ecx
*4]
8675 movd mm5
, [edx
+ ecx
*4 + 4]
8676 movd mm6
, [edx
+ ecx
*4 + 8]
8677 movd mm7
, [edx
+ ecx
*4 + 12]
8679 pfmul mm6
, mm0 ;
# mm6 = Geps
8680 pfmul mm7
, mm2 ;
# mm7 = Heps2
8683 pfadd mm5
, mm7 ;
# mm5 = Fp
8685 pfmul mm7
, [esp
+ i3030_two
] ;
# two*Heps2
8687 pfadd mm7
, mm5 ;
# mm7=FF
8689 pfmul mm5
, mm0 ;
# mm5=eps*Fp
8690 pfadd mm5
, mm4 ;
# mm5= VV
8692 pfmul mm5
, [esp
+ i3030_qqOO
] ;
# vcoul=qq*VV
8693 pfmul mm7
, [esp
+ i3030_qqOO
] ;
# fijC=qq*FF
8695 ;
# update vctot directly, use mm3 for fscal sum.
8696 pfadd mm5
, [esp
+ i3030_vctot
]
8697 movq
[esp
+ i3030_vctot
], mm5
8700 ;
# change sign of fscal and multiply with rinv
8703 pfmul mm3
, [esp
+ i3030_tsc
]
8704 pfmul mm3
, mm1 ;
# mm3 is total fscal (for the oxygen) now
8706 ;
# Ready with the oxygen - potential is updated, fscal is in mm3.
8707 ;
# time for hydrogens!
8709 movq mm0
, [esp
+ i3030_tmprsqH
]
8715 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
8720 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
8722 pfmul mm0
,mm1 ;
# mm0=r
8723 pfmul mm0
, [esp
+ i3030_tsc
]
; # two table indices this time, one per i hydrogen: low dword at n1,
; # high dword at n1+4
8725 movq
[esp
+ i3030_n1
], mm4
8727 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
8729 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
8732 mov edx
, [ebp
+ i3030_VFtab
]
8733 mov ecx
, [esp
+ i3030_n1
]
8735 ;
# load all values we need
8736 movd mm4
, [edx
+ ecx
*4]
8737 movd mm5
, [edx
+ ecx
*4 + 4]
8738 movd mm6
, [edx
+ ecx
*4 + 8]
8739 movd mm7
, [edx
+ ecx
*4 + 12]
; # table entry for the second index goes into the high halves of mm4..mm7
8740 mov ecx
, [esp
+ i3030_n1
+ 4]
8742 punpckldq mm4
, [edx
+ ecx
*4]
8743 punpckldq mm5
, [edx
+ ecx
*4 + 4]
8744 punpckldq mm6
, [edx
+ ecx
*4 + 8]
8745 punpckldq mm7
, [edx
+ ecx
*4 + 12]
8747 pfmul mm6
, mm0 ;
# mm6 = Geps
8748 pfmul mm7
, mm2 ;
# mm7 = Heps2
8751 pfadd mm5
, mm7 ;
# mm5 = Fp
8753 pfmul mm7
, [esp
+ i3030_two
] ;
# two*Heps2
8755 pfadd mm7
, mm5 ;
# mm7=FF
8757 pfmul mm5
, mm0 ;
# mm5=eps*Fp
8758 pfadd mm5
, mm4 ;
# mm5= VV
8760 pfmul mm5
, [esp
+ i3030_qqOH
] ;
# vcoul=qq*VV
8761 pfmul mm7
, [esp
+ i3030_qqOH
] ;
# fijC=qq*FF
8763 pfadd mm5
, [esp
+ i3030_vctot
]
8764 movq
[esp
+ i3030_vctot
], mm5
8766 ;
# change sign of fijC and multiply by rinv
8769 pfmul mm4
, [esp
+ i3030_tsc
]
8770 pfmul mm4
, mm1 ;
# mm4 is total fscal (for the hydrogens) now
8772 ;
# spread oxygen fscalar to both positions
8774 ;
# calc vectorial force for O
8775 movq mm0
, [esp
+ i3030_dxO
]
8776 movd mm1
, [esp
+ i3030_dzO
]
8780 ;
# calc vectorial force for H's
8781 movq mm5
, [esp
+ i3030_dxH
]
8782 movq mm6
, [esp
+ i3030_dyH
]
8783 movq mm7
, [esp
+ i3030_dzH
]
8788 ;
# update iO particle force
8789 movq mm2
, [esp
+ i3030_fixO
]
8790 movd mm3
, [esp
+ i3030_fizO
]
8793 movq
[esp
+ i3030_fixO
], mm2
8794 movd
[esp
+ i3030_fizO
], mm3
; # same read-modify-write for the two i hydrogens (both halves at once)
8797 movq mm2
, [esp
+ i3030_fixH
]
8798 movq mm3
, [esp
+ i3030_fiyH
]
8799 movq mm4
, [esp
+ i3030_fizH
]
8803 movq
[esp
+ i3030_fixH
], mm2
8804 movq
[esp
+ i3030_fiyH
], mm3
8805 movq
[esp
+ i3030_fizH
], mm4
8807 ;
# pack j forces from H in the same form as the oxygen force.
8808 pfacc mm5
, mm6 ;
# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
8809 pfacc mm7
, mm7 ;
# mm7(l)=fjz(H1+ h2)
8811 pfadd mm0
, mm5 ;
# add up total force on j particle.
8814 ;
# update j particle force
8815 movq mm2
, [edi
+ eax
*4]
8816 movd mm3
, [edi
+ eax
*4 + 8]
8819 movq
[edi
+ eax
*4], mm2
8820 movd
[edi
+ eax
*4 +8], mm3
; # Inner loop, part 2: same sequence as for the first j atom, now for the
; # j atom stored 12 bytes (three floats) further on in pos[]/faction[].
; # The i oxygen term is scaled by qqOH, the two i hydrogen terms by qqHH.
8822 ;
# interactions with j H1
8824 movq mm0
, [esi
+ eax
*4 + 12]
8825 movd mm1
, [esi
+ eax
*4 + 20]
8826 ;
# copy & expand to mm2-mm4 for the H interactions
; # pfsubr computes source - destination, so mm0/mm1 become i - j
8834 pfsubr mm0
, [esp
+ i3030_ixO
]
8835 pfsubr mm1
, [esp
+ i3030_izO
]
8837 movq
[esp
+ i3030_dxO
], mm0
8839 movd
[esp
+ i3030_dzO
], mm1
8842 pfadd mm0
, mm1 ;
# mm0=rsqO
8846 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
8847 pfsubr mm2
, [esp
+ i3030_ixH
]
8848 pfsubr mm3
, [esp
+ i3030_iyH
]
8849 pfsubr mm4
, [esp
+ i3030_izH
] ;
# mm2-mm4 is dxH-dzH
8851 movq
[esp
+ i3030_dxH
], mm2
8852 movq
[esp
+ i3030_dyH
], mm3
8853 movq
[esp
+ i3030_dzH
], mm4
8859 pfadd mm3
,mm4 ;
# mm3=rsqH
8860 movq
[esp
+ i3030_tmprsqH
], mm3
8867 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
8868 pfmul mm0
, mm1 ;
# mm0=r
8870 pfmul mm0
, [esp
+ i3030_tsc
]
; # the table index travels through the i3030_n1 stack slot to reach ecx
8872 movd
[esp
+ i3030_n1
], mm4
8874 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
8876 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
8879 mov edx
, [ebp
+ i3030_VFtab
]
8880 mov ecx
, [esp
+ i3030_n1
]
8883 ;
# load all values we need
8884 movd mm4
, [edx
+ ecx
*4]
8885 movd mm5
, [edx
+ ecx
*4 + 4]
8886 movd mm6
, [edx
+ ecx
*4 + 8]
8887 movd mm7
, [edx
+ ecx
*4 + 12]
8889 pfmul mm6
, mm0 ;
# mm6 = Geps
8890 pfmul mm7
, mm2 ;
# mm7 = Heps2
8893 pfadd mm5
, mm7 ;
# mm5 = Fp
8895 pfmul mm7
, [esp
+ i3030_two
] ;
# two*Heps2
8897 pfadd mm7
, mm5 ;
# mm7=FF
8899 pfmul mm5
, mm0 ;
# mm5=eps*Fp
8900 pfadd mm5
, mm4 ;
# mm5= VV
8902 pfmul mm5
, [esp
+ i3030_qqOH
] ;
# vcoul=qq*VV
8903 pfmul mm7
, [esp
+ i3030_qqOH
] ;
# fijC=qq*FF
8905 ;
# update vctot directly, force is moved to mm3
8906 pfadd mm5
, [esp
+ i3030_vctot
]
8907 movq
[esp
+ i3030_vctot
], mm5
8910 pfmul mm3
, [esp
+ i3030_tsc
]
8911 pfmul mm3
, mm1 ;
# mm3 is total fscal (for the oxygen) now
; # now the two i hydrogens against this j atom, using the saved rsqH
8913 movq mm0
, [esp
+ i3030_tmprsqH
]
8919 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
8924 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
8926 pfmul mm0
,mm1 ;
# mm0=r
8927 pfmul mm0
, [esp
+ i3030_tsc
]
; # two table indices, one per i hydrogen: low dword at n1, high dword at n1+4
8929 movq
[esp
+ i3030_n1
], mm4
8931 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
8933 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
8936 mov edx
, [ebp
+ i3030_VFtab
]
8937 mov ecx
, [esp
+ i3030_n1
]
8939 ;
# load all values we need
8940 movd mm4
, [edx
+ ecx
*4]
8941 movd mm5
, [edx
+ ecx
*4 + 4]
8942 movd mm6
, [edx
+ ecx
*4 + 8]
8943 movd mm7
, [edx
+ ecx
*4 + 12]
; # table entry for the second index goes into the high halves of mm4..mm7
8944 mov ecx
, [esp
+ i3030_n1
+ 4]
8946 punpckldq mm4
, [edx
+ ecx
*4]
8947 punpckldq mm5
, [edx
+ ecx
*4 + 4]
8948 punpckldq mm6
, [edx
+ ecx
*4 + 8]
8949 punpckldq mm7
, [edx
+ ecx
*4 + 12]
8952 pfmul mm6
, mm0 ;
# mm6 = Geps
8953 pfmul mm7
, mm2 ;
# mm7 = Heps2
8956 pfadd mm5
, mm7 ;
# mm5 = Fp
8958 pfmul mm7
, [esp
+ i3030_two
] ;
# two*Heps2
8960 pfadd mm7
, mm5 ;
# mm7=FF
8962 pfmul mm5
, mm0 ;
# mm5=eps*Fp
8963 pfadd mm5
, mm4 ;
# mm5= VV
8965 pfmul mm5
, [esp
+ i3030_qqHH
] ;
# vcoul=qq*VV
8966 pfmul mm7
, [esp
+ i3030_qqHH
] ;
# fijC=qq*FF
8968 pfadd mm5
, [esp
+ i3030_vctot
]
8969 movq
[esp
+ i3030_vctot
], mm5
8971 ;
# change sign of fijC and multiply by rinv
8974 pfmul mm4
, [esp
+ i3030_tsc
]
8975 pfmul mm4
, mm1 ;
# mm4 is total fscal (for the hydrogens) now
8977 ;
# spread oxygen fscalar to both positions
8979 ;
# calc vectorial force for O
8980 movq mm0
, [esp
+ i3030_dxO
]
8981 movd mm1
, [esp
+ i3030_dzO
]
8985 ;
# calc vectorial force for H's
8986 movq mm5
, [esp
+ i3030_dxH
]
8987 movq mm6
, [esp
+ i3030_dyH
]
8988 movq mm7
, [esp
+ i3030_dzH
]
8993 ;
# update iO particle force
8994 movq mm2
, [esp
+ i3030_fixO
]
8995 movd mm3
, [esp
+ i3030_fizO
]
8998 movq
[esp
+ i3030_fixO
], mm2
8999 movd
[esp
+ i3030_fizO
], mm3
; # same read-modify-write for the two i hydrogens (both halves at once)
9002 movq mm2
, [esp
+ i3030_fixH
]
9003 movq mm3
, [esp
+ i3030_fiyH
]
9004 movq mm4
, [esp
+ i3030_fizH
]
9008 movq
[esp
+ i3030_fixH
], mm2
9009 movq
[esp
+ i3030_fiyH
], mm3
9010 movq
[esp
+ i3030_fizH
], mm4
9012 ;
# pack j forces from H in the same form as the oxygen force.
9013 pfacc mm5
, mm6 ;
# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
9014 pfacc mm7
, mm7 ;
# mm7(l)=fjz(H1+ h2)
9016 pfadd mm0
, mm5 ;
# add up total force on j particle.
9019 ;
# update j particle force
; # the j H1 force lives at the same +12/+20 byte offsets as its coordinates
9020 movq mm2
, [edi
+ eax
*4 + 12]
9021 movd mm3
, [edi
+ eax
*4 + 20]
9024 movq
[edi
+ eax
*4 + 12], mm2
9025 movd
[edi
+ eax
*4 + 20], mm3
; # Inner loop, part 3: same sequence once more for the j atom stored
; # 24 bytes (six floats) after the first one. The i oxygen term is scaled by
; # qqOH, the two i hydrogen terms by qqHH. Ends with the loop counter test.
9027 ;
# interactions with j H2
9028 movq mm0
, [esi
+ eax
*4 + 24]
9029 movd mm1
, [esi
+ eax
*4 + 32]
9030 ;
# copy & expand to mm2-mm4 for the H interactions
; # pfsubr computes source - destination, so mm0/mm1 become i - j
9038 pfsubr mm0
, [esp
+ i3030_ixO
]
9039 pfsubr mm1
, [esp
+ i3030_izO
]
9041 movq
[esp
+ i3030_dxO
], mm0
9043 movd
[esp
+ i3030_dzO
], mm1
9046 pfadd mm0
, mm1 ;
# mm0=rsqO
9050 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
9051 pfsubr mm2
, [esp
+ i3030_ixH
]
9052 pfsubr mm3
, [esp
+ i3030_iyH
]
9053 pfsubr mm4
, [esp
+ i3030_izH
] ;
# mm2-mm4 is dxH-dzH
9055 movq
[esp
+ i3030_dxH
], mm2
9056 movq
[esp
+ i3030_dyH
], mm3
9057 movq
[esp
+ i3030_dzH
], mm4
9063 pfadd mm3
,mm4 ;
# mm3=rsqH
9064 movq
[esp
+ i3030_tmprsqH
], mm3
9071 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
9074 pfmul mm0
, [esp
+ i3030_tsc
]
; # the table index travels through the i3030_n1 stack slot to reach ecx
9076 movd
[esp
+ i3030_n1
], mm4
9078 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
9080 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
9083 mov edx
, [ebp
+ i3030_VFtab
]
9084 mov ecx
, [esp
+ i3030_n1
]
9087 ;
# load all values we need
9088 movd mm4
, [edx
+ ecx
*4]
9089 movd mm5
, [edx
+ ecx
*4 + 4]
9090 movd mm6
, [edx
+ ecx
*4 + 8]
9091 movd mm7
, [edx
+ ecx
*4 + 12]
9093 pfmul mm6
, mm0 ;
# mm6 = Geps
9094 pfmul mm7
, mm2 ;
# mm7 = Heps2
9097 pfadd mm5
, mm7 ;
# mm5 = Fp
9099 pfmul mm7
, [esp
+ i3030_two
] ;
# two*Heps2
9101 pfadd mm7
, mm5 ;
# mm7=FF
9103 pfmul mm5
, mm0 ;
# mm5=eps*Fp
9104 pfadd mm5
, mm4 ;
# mm5= VV
9106 pfmul mm5
, [esp
+ i3030_qqOH
] ;
# vcoul=qq*VV
9107 pfmul mm7
, [esp
+ i3030_qqOH
] ;
# fijC=qq*FF
9109 ;
# update vctot directly, use mm3 for fscal sum.
9110 pfadd mm5
, [esp
+ i3030_vctot
]
9111 movq
[esp
+ i3030_vctot
], mm5
9114 pfmul mm3
, [esp
+ i3030_tsc
]
9115 pfmul mm3
, mm1 ;
# mm3 is total fscal (for the oxygen) now
; # now the two i hydrogens against this j atom, using the saved rsqH
9117 movq mm0
, [esp
+ i3030_tmprsqH
]
9123 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
9128 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
9130 pfmul mm0
,mm1 ;
# mm0=r
9131 pfmul mm0
, [esp
+ i3030_tsc
]
; # two table indices, one per i hydrogen: low dword at n1, high dword at n1+4
9133 movq
[esp
+ i3030_n1
], mm4
9135 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
9137 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
9140 mov edx
, [ebp
+ i3030_VFtab
]
9141 mov ecx
, [esp
+ i3030_n1
]
9143 ;
# load all values we need
9144 movd mm4
, [edx
+ ecx
*4]
9145 movd mm5
, [edx
+ ecx
*4 + 4]
9146 movd mm6
, [edx
+ ecx
*4 + 8]
9147 movd mm7
, [edx
+ ecx
*4 + 12]
; # table entry for the second index goes into the high halves of mm4..mm7
9148 mov ecx
, [esp
+ i3030_n1
+ 4]
9150 punpckldq mm4
, [edx
+ ecx
*4]
9151 punpckldq mm5
, [edx
+ ecx
*4 + 4]
9152 punpckldq mm6
, [edx
+ ecx
*4 + 8]
9153 punpckldq mm7
, [edx
+ ecx
*4 + 12]
9156 pfmul mm6
, mm0 ;
# mm6 = Geps
9157 pfmul mm7
, mm2 ;
# mm7 = Heps2
9160 pfadd mm5
, mm7 ;
# mm5 = Fp
9162 pfmul mm7
, [esp
+ i3030_two
] ;
# two*Heps2
9164 pfadd mm7
, mm5 ;
# mm7=FF
9166 pfmul mm5
, mm0 ;
# mm5=eps*Fp
9167 pfadd mm5
, mm4 ;
# mm5= VV
9169 pfmul mm5
, [esp
+ i3030_qqHH
] ;
# vcoul=qq*VV
9170 pfmul mm7
, [esp
+ i3030_qqHH
] ;
# fijC=qq*FF
9172 pfadd mm5
, [esp
+ i3030_vctot
]
9173 movq
[esp
+ i3030_vctot
], mm5
9175 ;
# change sign of fijC and multiply by rinv
9178 pfmul mm4
, [esp
+ i3030_tsc
]
9179 pfmul mm4
, mm1 ;
# mm4 is total fscal (for the hydrogens) now
9181 ;
# spread oxygen fscalar to both positions
9183 ;
# calc vectorial force for O
9184 movq mm0
, [esp
+ i3030_dxO
]
9185 movd mm1
, [esp
+ i3030_dzO
]
9189 ;
# calc vectorial force for H's
9190 movq mm5
, [esp
+ i3030_dxH
]
9191 movq mm6
, [esp
+ i3030_dyH
]
9192 movq mm7
, [esp
+ i3030_dzH
]
9197 ;
# update iO particle force
9198 movq mm2
, [esp
+ i3030_fixO
]
9199 movd mm3
, [esp
+ i3030_fizO
]
9202 movq
[esp
+ i3030_fixO
], mm2
9203 movd
[esp
+ i3030_fizO
], mm3
; # same read-modify-write for the two i hydrogens (both halves at once)
9206 movq mm2
, [esp
+ i3030_fixH
]
9207 movq mm3
, [esp
+ i3030_fiyH
]
9208 movq mm4
, [esp
+ i3030_fizH
]
9212 movq
[esp
+ i3030_fixH
], mm2
9213 movq
[esp
+ i3030_fiyH
], mm3
9214 movq
[esp
+ i3030_fizH
], mm4
9216 ;
# pack j forces from H in the same form as the oxygen force.
9217 pfacc mm5
, mm6 ;
# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
9218 pfacc mm7
, mm7 ;
# mm7(l)=fjz(H1+ h2)
9220 pfadd mm0
, mm5 ;
# add up total force on j particle.
9223 ;
# update j particle force
; # the j H2 force lives at the same +24/+32 byte offsets as its coordinates
9224 movq mm2
, [edi
+ eax
*4 + 24]
9225 movd mm3
, [edi
+ eax
*4 + 32]
9228 movq
[edi
+ eax
*4 + 24], mm2
9229 movd
[edi
+ eax
*4 + 32], mm3
; # one j particle done: count innerk down and leave the loop when it reaches zero
9232 dec dword ptr
[esp
+ i3030_innerk
]
9233 jz
.i3030_updateouterdata
9234 jmp
.i3030_inner_loop
; # Reached when innerk hits zero: write the accumulated i forces, the shift
; # force and the Coulomb potential for this i particle back to memory.
9235 .i3030_updateouterdata:
9236 mov ecx
, [esp
+ i3030_ii3
]
9238 movq mm6
, [edi
+ ecx
*4] ;
# increment iO force
9239 movd mm7
, [edi
+ ecx
*4 + 8]
9240 pfadd mm6
, [esp
+ i3030_fixO
]
; # pfadd reads 8 bytes at fizO; only the low float is written back by the movd below
9241 pfadd mm7
, [esp
+ i3030_fizO
]
9242 movq
[edi
+ ecx
*4], mm6
9243 movd
[edi
+ ecx
*4 +8], mm7
; # the H accumulators hold H1 in the low and H2 in the high float; they are
; # repacked into per-atom x/y pairs before being added to faction
9245 movq mm0
, [esp
+ i3030_fixH
]
9246 movq mm3
, [esp
+ i3030_fiyH
]
9247 movq mm1
, [esp
+ i3030_fizH
]
9249 punpckldq mm0
, mm3 ;
# mm0(l)=fxH1, mm0(h)=fyH1
9250 punpckhdq mm2
, mm3 ;
# mm2(l)=fxH2, mm2(h)=fyH2
9256 movq mm6
, [edi
+ ecx
*4 + 12] ;
# increment iH1 force
9257 movd mm7
, [edi
+ ecx
*4 + 20]
9260 movq
[edi
+ ecx
*4 + 12], mm6
9261 movd
[edi
+ ecx
*4 + 20], mm7
9263 movq mm6
, [edi
+ ecx
*4 + 24] ;
# increment iH2 force
9264 movd mm7
, [edi
+ ecx
*4 + 32]
9267 movq
[edi
+ ecx
*4 + 24], mm6
9268 movd
[edi
+ ecx
*4 + 32], mm7
9271 mov ebx
, [ebp
+ i3030_fshift
] ;
# increment fshift force
9272 mov edx
, [esp
+ i3030_is3
]
9274 movq mm6
, [ebx
+ edx
*4]
9275 movd mm7
, [ebx
+ edx
*4 + 8]
9276 pfadd mm6
, [esp
+ i3030_fixO
]
9277 pfadd mm7
, [esp
+ i3030_fizO
]
9282 movq
[ebx
+ edx
*4], mm6
9283 movd
[ebx
+ edx
*4 + 8], mm7
9285 mov edx
, [ebp
+ i3030_gid
] ;
# get group index for this i particle
9287 add dword ptr
[ebp
+ i3030_gid
], 4 ;
# advance pointer
9289 movq mm7
, [esp
+ i3030_vctot
]
9290 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
9292 mov eax
, [ebp
+ i3030_Vc
]
9293 movd mm6
, [eax
+ edx
*4]
9295 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
; # count down the i3030_nri counter kept in the argument area
9298 dec dword ptr
[ebp
+ i3030_nri
]
9300 ;
# not last, iterate once more!
9317 .globl inl3100_3dnow
9318 .globl _inl3100_3dnow
9322 .equiv i3100_iinr, 12
9323 .equiv i3100_jindex, 16
9324 .equiv i3100_jjnr, 20
9325 .equiv i3100_shift, 24
9326 .equiv i3100_shiftvec, 28
9327 .equiv i3100_fshift, 32
9328 .equiv i3100_gid, 36
9329 .equiv i3100_pos, 40
9330 .equiv i3100_faction, 44
9331 .equiv i3100_charge, 48
9332 .equiv i3100_facel, 52
9334 .equiv i3100_type, 60
9335 .equiv i3100_ntype, 64
9336 .equiv i3100_nbfp, 68
9337 .equiv i3100_Vnb, 72
9338 .equiv i3100_tabscale, 76
9339 .equiv i3100_VFtab, 80
9340 ;
# stack offsets for local variables
9347 .equiv i3100_vctot, 28
9348 .equiv i3100_vnbtot, 36
9350 .equiv i3100_c12, 52
9351 .equiv i3100_six, 60
9352 .equiv i3100_twelve, 68
9353 .equiv i3100_two, 76
9355 .equiv i3100_tsc, 92
9356 .equiv i3100_ntia, 100
9357 .equiv i3100_innerjjnr, 104
9358 .equiv i3100_innerk, 108
9359 .equiv i3100_fix, 112
9360 .equiv i3100_fiy, 116
9361 .equiv i3100_fiz, 120
9362 .equiv i3100_dx1, 124
9363 .equiv i3100_dy1, 128
9364 .equiv i3100_dz1, 132
9365 .equiv i3100_dx2, 136
9366 .equiv i3100_dy2, 140
9367 .equiv i3100_dz2, 144
9376 sub esp
, 148 ;
# local stack space
9378 ;
# move data to local stack
9381 movq mm2
, [mm_twelve
]
9382 movd mm3
, [ebp
+ i3100_tabscale
]
9383 movq
[esp
+ i3100_two
], mm0
9384 movq
[esp
+ i3100_six
], mm1
9385 movq
[esp
+ i3100_twelve
], mm2
9387 movq
[esp
+ i3100_tsc
], mm3
9388 ;
# assume we have at least one i particle - start directly
9390 mov eax
, [ebp
+ i3100_shift
] ;
# eax = pointer into shift[]
9391 mov ebx
, [eax
] ;
# ebx=shift[n]
9392 add dword ptr
[ebp
+ i3100_shift
], 4 ;
# advance pointer one step
9394 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
9395 mov
[esp
+ i3100_is3
],ebx ;
# store is3
9397 mov eax
, [ebp
+ i3100_shiftvec
] ;
# eax = base of shiftvec[]
9399 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
9400 movd mm1
, [eax
+ ebx
*4 + 8]
9402 mov ecx
, [ebp
+ i3100_iinr
] ;
# ecx = pointer into iinr[]
9403 add dword ptr
[ebp
+ i3100_iinr
], 4 ;
# advance pointer
9404 mov ebx
, [ecx
] ;
# ebx=ii
9406 mov edx
, [ebp
+ i3100_charge
]
9407 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
9408 pfmul mm2
, [ebp
+ i3100_facel
]
9409 punpckldq mm2
,mm2 ;
# spread to both halves
9410 movq
[esp
+ i3100_iq
], mm2 ;
# iq =facel*charge[ii]
9412 mov edx
, [ebp
+ i3100_type
]
9413 mov edx
, [edx
+ ebx
*4]
9414 imul edx
, [ebp
+ i3100_ntype
]
9416 mov
[esp
+ i3100_ntia
], edx
9418 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
9419 mov eax
, [ebp
+ i3100_pos
] ;
# eax = base of pos[]
9421 pfadd mm0
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
9422 movd mm3
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
9423 mov
[esp
+ i3100_ii3
], ebx
9425 movq
[esp
+ i3100_ix
], mm0
9426 movd
[esp
+ i3100_iz
], mm1
9428 ;
# clear total potential and i forces
9430 movq
[esp
+ i3100_vctot
], mm7
9431 movq
[esp
+ i3100_vnbtot
], mm7
9432 movq
[esp
+ i3100_fix
], mm7
9433 movd
[esp
+ i3100_fiz
], mm7
9435 mov eax
, [ebp
+ i3100_jindex
]
9436 mov ecx
, [eax
] ;
# jindex[n]
9437 mov edx
, [eax
+ 4] ;
# jindex[n+1]
9438 add dword ptr
[ebp
+ i3100_jindex
], 4
9439 sub edx
, ecx ;
# number of innerloop atoms
9441 mov esi
, [ebp
+ i3100_pos
]
9442 mov edi
, [ebp
+ i3100_faction
]
9443 mov eax
, [ebp
+ i3100_jjnr
]
9446 mov
[esp
+ i3100_innerjjnr
], eax ;
# pointer to jjnr[nj0]
9448 mov
[esp
+ i3100_innerk
], edx ;
# number of innerloop atoms
9449 jge
.i3100_unroll_loop
9450 jmp
.i3100_finish_inner
9452 ;
# paired innerloop starts here
9453 mov ecx
, [esp
+ i3100_innerjjnr
] ;
# pointer to jjnr[k]
9455 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
9456 add dword ptr
[esp
+ i3100_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
9457 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
9459 mov ecx
, [ebp
+ i3100_charge
] ;
# base of charge[]
9460 movq mm5
, [esp
+ i3100_iq
]
9461 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
9462 punpckldq mm3
, [ecx
+ ebx
*4] ;
# move charge 2 to high part of mm3
9463 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
9465 mov ecx
, [ebp
+ i3100_type
]
9466 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
9467 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
9469 mov esi
, [ebp
+ i3100_nbfp
] ;
# base of nbfp
9472 add edx
, [esp
+ i3100_ntia
] ;
# tja = ntia + 2*type
9473 add ecx
, [esp
+ i3100_ntia
]
9475 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
9476 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
9478 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
9479 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
9480 movq
[esp
+ i3100_c6
], mm5
9481 movq
[esp
+ i3100_c12
], mm6
9483 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
9484 lea ebx
, [ebx
+ ebx
*2]
9486 mov esi
, [ebp
+ i3100_pos
]
9488 movq mm0
, [esp
+ i3100_ix
]
9489 movd mm1
, [esp
+ i3100_iz
]
9490 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
9491 movd mm5
, [esi
+ eax
*4 + 8]
9492 pfsubr mm4
,mm0 ;
# dr = ir - jr
9494 movq
[esp
+ i3100_dx1
], mm4 ;
# store dr
9495 movd
[esp
+ i3100_dz1
], mm5
9496 pfmul mm4
,mm4 ;
# square dx,dy,dz
9498 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
9499 pfacc mm4
, mm5 ;
# first rsq in lower mm4
9501 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
9502 movd mm7
, [esi
+ ebx
*4 + 8]
9504 pfsubr mm6
,mm0 ;
# dr = ir - jr
9506 movq
[esp
+ i3100_dx2
], mm6 ;
# store dr
9507 movd
[esp
+ i3100_dz2
], mm7
9508 pfmul mm6
,mm6 ;
# square dx,dy,dz
9510 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
9511 pfacc mm6
, mm7 ;
# second rsq in lower mm6
9513 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
9518 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
9519 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision.
9525 ;
# mm0 is invsqrt, and mm1 r.
9526 ;
# do potential and fscal
9527 pfmul mm1
, [esp
+ i3100_tsc
] ;
# mm1=rt
9529 movq
[esp
+ i3100_n1
], mm4
9531 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
9534 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
9536 mov edx
, [ebp
+ i3100_VFtab
]
9537 mov ecx
, [esp
+ i3100_n1
]
9540 ;
# load all the table values we need
9541 movd mm4
, [edx
+ ecx
*4]
9542 movd mm5
, [edx
+ ecx
*4 + 4]
9543 movd mm6
, [edx
+ ecx
*4 + 8]
9544 movd mm7
, [edx
+ ecx
*4 + 12]
9545 mov ecx
, [esp
+ i3100_n1
+ 4]
9547 punpckldq mm4
, [edx
+ ecx
*4]
9548 punpckldq mm5
, [edx
+ ecx
*4 + 4]
9549 punpckldq mm6
, [edx
+ ecx
*4 + 8]
9550 punpckldq mm7
, [edx
+ ecx
*4 + 12]
9552 pfmul mm6
, mm1 ;
# mm6 = Geps
9553 pfmul mm7
, mm2 ;
# mm7 = Heps2
9556 pfadd mm5
, mm7 ;
# mm5 = Fp
9558 pfmul mm7
, [esp
+ i3100_two
] ;
# two*Heps2
9560 pfadd mm7
, mm5 ;
# mm7=FF
9562 pfmul mm5
, mm1 ;
# mm5=eps*Fp
9563 pfadd mm5
, mm4 ;
# mm5= VV
9565 pfmul mm5
, mm3 ;
# vcoul=qq*VV
9566 pfmul mm3
, mm7 ;
# fijC=FF*qq
9569 pfmul mm1
,mm1 ;
# mm1=invsq
9572 pfmul mm2
,mm1 ;
# mm2=rinvsix
9574 pfmul mm1
,mm1 ;
# mm1=rinvtwelve
9576 pfmul mm3
, [esp
+ i3100_tsc
]
9578 pfmul mm1
, [esp
+ i3100_c12
]
9580 pfmul mm2
, [esp
+ i3100_c6
]
9583 pfsub mm4
, mm2 ;
# mm4 = vnb12-vnb6
9585 pfmul mm2
, [esp
+ i3100_six
]
9586 pfmul mm1
, [esp
+ i3100_twelve
]
9589 pfmul mm1
, mm0 ;
# mm1= (12*vnb12-6*vnb6)*rinv11
9594 pfadd mm5
, [esp
+ i3100_vctot
] ;
# add the earlier value
9595 movq
[esp
+ i3100_vctot
], mm5 ;
# store the sum
9597 pfmul mm0
, mm1 ;
# mm0 is total fscal now
9599 prefetchw
[esp
+ i3100_dx1
] ;
# prefetch i forces to cache
9601 ;
# spread fscalar to both positions
9606 ;
# calc vector force
9607 prefetchw
[edi
+ eax
*4] ;
# prefetch the 1st faction to cache
9608 movq mm2
, [esp
+ i3100_dx1
] ;
# fetch dr
9609 movd mm3
, [esp
+ i3100_dz1
]
9612 pfadd mm4
, [esp
+ i3100_vnbtot
] ;
# add the earlier value
9613 movq
[esp
+ i3100_vnbtot
], mm4 ;
# store the sum
9615 prefetchw
[edi
+ ebx
*4] ;
# prefetch the 2nd faction to cache
9616 pfmul mm2
, mm0 ;
# mult by fs
9619 movq mm4
, [esp
+ i3100_dx2
] ;
# fetch dr
9620 movd mm5
, [esp
+ i3100_dz2
]
9621 pfmul mm4
, mm1 ;
# mult by fs
9625 movq mm0
, [esp
+ i3100_fix
]
9626 movd mm1
, [esp
+ i3100_fiz
]
9632 movq
[esp
+ i3100_fix
], mm0
9633 movd
[esp
+ i3100_fiz
], mm1
9636 movq mm0
, [edi
+ eax
*4]
9637 movd mm1
, [edi
+ eax
*4 + 8]
9638 movq mm6
, [edi
+ ebx
*4]
9639 movd mm7
, [edi
+ ebx
*4 + 8]
9646 movq
[edi
+ eax
*4], mm0
9647 movd
[edi
+ eax
*4 +8], mm1
9648 movq
[edi
+ ebx
*4], mm6
9649 movd
[edi
+ ebx
*4 + 8], mm7
9651 ;
# should we do one more iteration?
9652 sub dword ptr
[esp
+ i3100_innerk
], 2
9653 jl
.i3100_finish_inner
9654 jmp
.i3100_unroll_loop
9655 .i3100_finish_inner:
9656 and dword ptr
[esp
+ i3100_innerk
], 1
9657 jnz
.i3100_single_inner
9658 jmp
.i3100_updateouterdata
9659 .i3100_single_inner:
9660 ;
# a single j particle iteration here - compare with the unrolled code for comments.
9661 mov eax
, [esp
+ i3100_innerjjnr
]
9662 mov eax
, [eax
] ;
# eax=jnr offset
9664 mov ecx
, [ebp
+ i3100_charge
]
9665 movd mm5
, [esp
+ i3100_iq
]
9666 movd mm3
, [ecx
+ eax
*4]
9667 pfmul mm3
, mm5 ;
# mm3=qq
9669 mov esi
, [ebp
+ i3100_nbfp
]
9670 mov ecx
, [ebp
+ i3100_type
]
9671 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
9673 add edx
, [esp
+ i3100_ntia
] ;
# tja = ntia + 2*type
9674 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
9675 movq
[esp
+ i3100_c6
], mm5
9676 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
9677 movq
[esp
+ i3100_c12
], mm5
9680 mov esi
, [ebp
+ i3100_pos
]
9681 lea eax
, [eax
+ eax
*2]
9683 movq mm0
, [esp
+ i3100_ix
]
9684 movd mm1
, [esp
+ i3100_iz
]
9685 movq mm4
, [esi
+ eax
*4]
9686 movd mm5
, [esi
+ eax
*4 + 8]
9689 movq
[esp
+ i3100_dx1
], mm4
9691 movd
[esp
+ i3100_dz1
], mm5
9694 pfacc mm4
, mm5 ;
# mm4=rsq
9700 pfrcpit2 mm0
,mm2 ;
# mm0=invsqrt
9703 ;
# mm0 is invsqrt, and mm1 r.
9704 ;
# calculate potentials and scalar force
9705 pfmul mm1
, [esp
+ i3100_tsc
] ;
# mm1=rt
9707 movd
[esp
+ i3100_n1
], mm4
9709 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
9712 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
9715 mov edx
, [ebp
+ i3100_VFtab
]
9716 mov ecx
, [esp
+ i3100_n1
]
9718 ;
# load all the table values we need
9719 movd mm4
, [edx
+ ecx
*4]
9720 movd mm5
, [edx
+ ecx
*4 + 4]
9721 movd mm6
, [edx
+ ecx
*4 + 8]
9722 movd mm7
, [edx
+ ecx
*4 + 12]
9724 pfmul mm6
, mm1 ;
# mm6 = Geps
9725 pfmul mm7
, mm2 ;
# mm7 = Heps2
9728 pfadd mm5
, mm7 ;
# mm5 = Fp
9730 pfmul mm7
, [esp
+ i3100_two
] ;
# two*Heps2
9732 pfadd mm7
, mm5 ;
# mm7=FF
9734 pfmul mm5
, mm1 ;
# mm5=eps*Fp
9735 pfadd mm5
, mm4 ;
# mm5= VV
9737 pfmul mm5
, mm3 ;
# vcoul=qq*VV
9738 pfmul mm3
, mm7 ;
# fijC=FF*qq
9740 ;
# at this point mm5 contains vcoul and mm3 fijC
9743 pfmul mm1
,mm1 ;
# mm1=invsq
9746 pfmul mm2
,mm1 ;
# mm2=rinvsix
9748 pfmul mm1
,mm1 ;
# mm1=rinvtwelve
9750 pfmul mm3
, [esp
+ i3100_tsc
]
9752 pfmul mm1
, [esp
+ i3100_c12
]
9754 pfmul mm2
, [esp
+ i3100_c6
]
9757 pfsub mm4
, mm2 ;
# mm4 = vnb12-vnb6
9759 pfmul mm2
, [esp
+ i3100_six
]
9760 pfmul mm1
, [esp
+ i3100_twelve
]
9763 pfmul mm1
, mm0 ;
# mm1= (12*vnb12-6*vnb6)*rinv11
9768 pfadd mm5
, [esp
+ i3100_vctot
] ;
# add the earlier value
9769 movq
[esp
+ i3100_vctot
], mm5 ;
# store the sum
9771 pfmul mm0
, mm1 ;
# mm0 is total fscal now
9773 ;
# spread fscalar to both positions
9775 ;
# calc vectorial force
9776 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
9777 movq mm2
, [esp
+ i3100_dx1
]
9778 movd mm3
, [esp
+ i3100_dz1
]
9781 pfadd mm4
, [esp
+ i3100_vnbtot
] ;
# add the earlier value
9782 movq
[esp
+ i3100_vnbtot
], mm4 ;
# store the sum
9787 ;
# update i particle force
9788 movq mm0
, [esp
+ i3100_fix
]
9789 movd mm1
, [esp
+ i3100_fiz
]
9792 movq
[esp
+ i3100_fix
], mm0
9793 movd
[esp
+ i3100_fiz
], mm1
9794 ;
# update j particle force
9795 movq mm0
, [edi
+ eax
*4]
9796 movd mm1
, [edi
+ eax
*4+ 8]
9799 movq
[edi
+ eax
*4], mm0
9800 movd
[edi
+ eax
*4 +8], mm1
9802 .i3100_updateouterdata:
9803 mov ecx
, [esp
+ i3100_ii3
]
9805 movq mm6
, [edi
+ ecx
*4] ;
# increment i force
9806 movd mm7
, [edi
+ ecx
*4 + 8]
9807 pfadd mm6
, [esp
+ i3100_fix
]
9808 pfadd mm7
, [esp
+ i3100_fiz
]
9809 movq
[edi
+ ecx
*4], mm6
9810 movd
[edi
+ ecx
*4 +8], mm7
9812 mov ebx
, [ebp
+ i3100_fshift
] ;
# increment fshift force
9813 mov edx
, [esp
+ i3100_is3
]
9815 movq mm6
, [ebx
+ edx
*4]
9816 movd mm7
, [ebx
+ edx
*4 + 8]
9817 pfadd mm6
, [esp
+ i3100_fix
]
9818 pfadd mm7
, [esp
+ i3100_fiz
]
9819 movq
[ebx
+ edx
*4], mm6
9820 movd
[ebx
+ edx
*4 + 8], mm7
9822 mov edx
, [ebp
+ i3100_gid
] ;
# get group index for this i particle
9824 add dword ptr
[ebp
+ i3100_gid
], 4 ;
# advance pointer
9826 movq mm7
, [esp
+ i3100_vctot
]
9827 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
9829 mov eax
, [ebp
+ i3100_Vc
]
9830 movd mm6
, [eax
+ edx
*4]
9832 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
9834 movq mm7
, [esp
+ i3100_vnbtot
]
9835 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
9837 mov eax
, [ebp
+ i3100_Vnb
]
9838 movd mm6
, [eax
+ edx
*4]
9840 movd
[eax
+ edx
*4], mm6 ;
# increment vnb[gid]
9843 mov ecx
, [ebp
+ i3100_nri
]
9846 ;
# not last, iterate once more!
9847 mov
[ebp
+ i3100_nri
], ecx
9867 .globl inl3110_3dnow
9868 .globl _inl3110_3dnow
9872 .equiv i3110_iinr, 12
9873 .equiv i3110_jindex, 16
9874 .equiv i3110_jjnr, 20
9875 .equiv i3110_shift, 24
9876 .equiv i3110_shiftvec, 28
9877 .equiv i3110_fshift, 32
9878 .equiv i3110_gid, 36
9879 .equiv i3110_pos, 40
9880 .equiv i3110_faction, 44
9881 .equiv i3110_charge, 48
9882 .equiv i3110_facel, 52
9884 .equiv i3110_type, 60
9885 .equiv i3110_ntype, 64
9886 .equiv i3110_nbfp, 68
9887 .equiv i3110_Vnb, 72
9888 .equiv i3110_tabscale, 76
9889 .equiv i3110_VFtab, 80
9890 .equiv i3110_nsatoms, 84
9891 ;
# stack offsets for local variables
9895 .equiv i3110_shY, 12
9896 .equiv i3110_shZ, 16
9901 .equiv i3110_vctot, 40
9902 .equiv i3110_vnbtot, 48
9904 .equiv i3110_c12, 64
9905 .equiv i3110_six, 72
9906 .equiv i3110_twelve, 80
9907 .equiv i3110_two, 88
9909 .equiv i3110_tsc, 104
9910 .equiv i3110_ntia, 112
9911 .equiv i3110_innerjjnr0, 116
9912 .equiv i3110_innerk0, 120
9913 .equiv i3110_innerjjnr, 124
9914 .equiv i3110_innerk, 128
9915 .equiv i3110_fix, 132
9916 .equiv i3110_fiy, 136
9917 .equiv i3110_fiz, 140
9918 .equiv i3110_dx1, 144
9919 .equiv i3110_dy1, 148
9920 .equiv i3110_dz1, 152
9921 .equiv i3110_dx2, 156
9922 .equiv i3110_dy2, 160
9923 .equiv i3110_dz2, 164
9924 .equiv i3110_nsvdwc, 168
9925 .equiv i3110_nscoul, 172
9926 .equiv i3110_nsvdw, 176
9927 .equiv i3110_solnr, 180
9936 sub esp
, 184 ;
# local stack space
9939 movq mm1
, [mm_twelve
]
9940 movq
[esp
+ i3110_six
], mm0
9941 movq
[esp
+ i3110_twelve
], mm1
9943 movd mm3
, [ebp
+ i3110_tabscale
]
9944 movq
[esp
+ i3110_two
], mm2
9946 movq
[esp
+ i3110_tsc
], mm3
9947 ;
# assume we have at least one i particle - start directly
9949 mov eax
, [ebp
+ i3110_shift
] ;
# eax = pointer into shift[]
9950 mov ebx
, [eax
] ;
# ebx=shift[n]
9951 add dword ptr
[ebp
+ i3110_shift
], 4 ;
# advance pointer one step
9953 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
9954 mov
[esp
+ i3110_is3
],ebx ;
# store is3
9956 mov eax
, [ebp
+ i3110_shiftvec
] ;
# eax = base of shiftvec[]
9958 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
9959 movd mm1
, [eax
+ ebx
*4 + 8]
9960 movq
[esp
+ i3110_shX
], mm0
9961 movd
[esp
+ i3110_shZ
], mm1
9963 mov ecx
, [ebp
+ i3110_iinr
] ;
# ecx = pointer into iinr[]
9964 add dword ptr
[ebp
+ i3110_iinr
], 4 ;
# advance pointer
9965 mov ebx
, [ecx
] ;
# ebx=ii
9967 mov eax
, [ebp
+ i3110_nsatoms
]
9968 add dword ptr
[ebp
+ i3110_nsatoms
], 12
9975 mov
[esp
+ i3110_nsvdwc
], edx
9976 mov
[esp
+ i3110_nscoul
], eax
9977 mov
[esp
+ i3110_nsvdw
], ecx
9981 movq
[esp
+ i3110_vctot
], mm7
9982 movq
[esp
+ i3110_vnbtot
], mm7
9983 mov
[esp
+ i3110_solnr
], ebx
9985 mov eax
, [ebp
+ i3110_jindex
]
9986 mov ecx
, [eax
] ;
# jindex[n]
9987 mov edx
, [eax
+ 4] ;
# jindex[n+1]
9988 add dword ptr
[ebp
+ i3110_jindex
], 4
9989 sub edx
, ecx ;
# number of innerloop atoms
9990 mov eax
, [ebp
+ i3110_jjnr
]
9993 mov
[esp
+ i3110_innerjjnr0
], eax ;
# pointer to jjnr[nj0]
9995 mov
[esp
+ i3110_innerk0
], edx ;
# number of innerloop atoms
9996 mov esi
, [ebp
+ i3110_pos
]
9997 mov edi
, [ebp
+ i3110_faction
]
9999 mov ecx
, [esp
+ i3110_nsvdwc
]
10001 jnz
.i3110_mno_vdwc
10002 jmp
.i3110_testcoul
10004 mov ebx
, [esp
+ i3110_solnr
]
10005 inc dword ptr
[esp
+ i3110_solnr
]
10006 mov edx
, [ebp
+ i3110_charge
]
10007 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
10008 pfmul mm2
, [ebp
+ i3110_facel
]
10009 punpckldq mm2
,mm2 ;
# spread to both halves
10010 movq
[esp
+ i3110_iq
], mm2 ;
# iq =facel*charge[ii]
10012 mov edx
, [ebp
+ i3110_type
]
10013 mov edx
, [edx
+ ebx
*4]
10014 imul edx
, [ebp
+ i3110_ntype
]
10016 mov
[esp
+ i3110_ntia
], edx
10018 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
10019 mov eax
, [ebp
+ i3110_pos
] ;
# eax = base of pos[]
10020 mov
[esp
+ i3110_ii3
], ebx
10022 movq mm0
, [eax
+ ebx
*4]
10023 movd mm1
, [eax
+ ebx
*4 + 8]
10024 pfadd mm0
, [esp
+ i3110_shX
]
10025 pfadd mm1
, [esp
+ i3110_shZ
]
10026 movq
[esp
+ i3110_ix
], mm0
10027 movd
[esp
+ i3110_iz
], mm1
10031 movq
[esp
+ i3110_fix
], mm7
10032 movd
[esp
+ i3110_fiz
], mm7
10034 mov ecx
, [esp
+ i3110_innerjjnr0
]
10035 mov
[esp
+ i3110_innerjjnr
], ecx
10036 mov edx
, [esp
+ i3110_innerk0
]
10038 mov
[esp
+ i3110_innerk
], edx ;
# number of innerloop atoms
10039 jge
.i3110_unroll_vdwc_loop
10040 jmp
.i3110_finish_vdwc_inner
10041 .i3110_unroll_vdwc_loop:
10042 ;
# paired innerloop starts here
10043 mov ecx
, [esp
+ i3110_innerjjnr
] ;
# pointer to jjnr[k]
10045 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
10046 add dword ptr
[esp
+ i3110_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
10047 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
10049 mov ecx
, [ebp
+ i3110_charge
] ;
# base of charge[]
10050 movq mm5
, [esp
+ i3110_iq
]
10051 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
10052 punpckldq mm3
, [ecx
+ ebx
*4] ;
# move charge 2 to high part of mm3
10053 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
10055 mov ecx
, [ebp
+ i3110_type
]
10056 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
10057 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
10059 mov esi
, [ebp
+ i3110_nbfp
] ;
# base of nbfp
10062 add edx
, [esp
+ i3110_ntia
] ;
# tja = ntia + 2*type
10063 add ecx
, [esp
+ i3110_ntia
]
10065 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
10066 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
10068 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
10069 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
10070 movq
[esp
+ i3110_c6
], mm5
10071 movq
[esp
+ i3110_c12
], mm6
10073 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
10074 lea ebx
, [ebx
+ ebx
*2]
10076 mov esi
, [ebp
+ i3110_pos
]
10078 movq mm0
, [esp
+ i3110_ix
]
10079 movd mm1
, [esp
+ i3110_iz
]
10080 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
10081 movd mm5
, [esi
+ eax
*4 + 8]
10082 pfsubr mm4
,mm0 ;
# dr = ir - jr
10084 movq
[esp
+ i3110_dx1
], mm4 ;
# store dr
10085 movd
[esp
+ i3110_dz1
], mm5
10086 pfmul mm4
,mm4 ;
# square dx,dy,dz
10088 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
10089 pfacc mm4
, mm5 ;
# first rsq in lower mm4
10091 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
10092 movd mm7
, [esi
+ ebx
*4 + 8]
10094 pfsubr mm6
,mm0 ;
# dr = ir - jr
10096 movq
[esp
+ i3110_dx2
], mm6 ;
# store dr
10097 movd
[esp
+ i3110_dz2
], mm7
10098 pfmul mm6
,mm6 ;
# square dx,dy,dz
10100 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
10101 pfacc mm6
, mm7 ;
# second rsq in lower mm6
10103 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
10108 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
10109 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision.
10115 ;
# mm0 is invsqrt, and mm1 r.
10116 ;
# do potential and fscal
10117 pfmul mm1
, [esp
+ i3110_tsc
] ;
# mm1=rt
10119 movq
[esp
+ i3110_n1
], mm4
10121 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
10124 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
10126 mov edx
, [ebp
+ i3110_VFtab
]
10127 mov ecx
, [esp
+ i3110_n1
]
10130 ;
# load all the table values we need
10131 movd mm4
, [edx
+ ecx
*4]
10132 movd mm5
, [edx
+ ecx
*4 + 4]
10133 movd mm6
, [edx
+ ecx
*4 + 8]
10134 movd mm7
, [edx
+ ecx
*4 + 12]
10135 mov ecx
, [esp
+ i3110_n1
+ 4]
10137 punpckldq mm4
, [edx
+ ecx
*4]
10138 punpckldq mm5
, [edx
+ ecx
*4 + 4]
10139 punpckldq mm6
, [edx
+ ecx
*4 + 8]
10140 punpckldq mm7
, [edx
+ ecx
*4 + 12]
10142 pfmul mm6
, mm1 ;
# mm6 = Geps
10143 pfmul mm7
, mm2 ;
# mm7 = Heps2
10146 pfadd mm5
, mm7 ;
# mm5 = Fp
10148 pfmul mm7
, [esp
+ i3110_two
] ;
# two*Heps2
10150 pfadd mm7
, mm5 ;
# mm7=FF
10152 pfmul mm5
, mm1 ;
# mm5=eps*Fp
10153 pfadd mm5
, mm4 ;
# mm5= VV
10155 pfmul mm5
, mm3 ;
# vcoul=qq*VV
10156 pfmul mm3
, mm7 ;
# fijC=FF*qq
10159 pfmul mm1
,mm1 ;
# mm1=invsq
10162 pfmul mm2
,mm1 ;
# mm2=rinvsix
10164 pfmul mm1
,mm1 ;
# mm1=rinvtwelve
10166 pfmul mm3
, [esp
+ i3110_tsc
]
10168 pfmul mm1
, [esp
+ i3110_c12
]
10170 pfmul mm2
, [esp
+ i3110_c6
]
10173 pfsub mm4
, mm2 ;
# mm4 = vnb12-vnb6
10175 pfmul mm2
, [esp
+ i3110_six
]
10176 pfmul mm1
, [esp
+ i3110_twelve
]
10179 pfmul mm1
, mm0 ;
# mm1= (12*vnb12-6*vnb6)*rinv11
10184 pfadd mm5
, [esp
+ i3110_vctot
] ;
# add the earlier value
10185 movq
[esp
+ i3110_vctot
], mm5 ;
# store the sum
10187 pfmul mm0
, mm1 ;
# mm0 is total fscal now
10189 prefetchw
[esp
+ i3110_dx1
] ;
# prefetch i forces to cache
10191 ;
# spread fscalar to both positions
10196 ;
# calc vector force
10197 prefetchw
[edi
+ eax
*4] ;
# prefetch the 1st faction to cache
10198 movq mm2
, [esp
+ i3110_dx1
] ;
# fetch dr
10199 movd mm3
, [esp
+ i3110_dz1
]
10202 pfadd mm4
, [esp
+ i3110_vnbtot
] ;
# add the earlier value
10203 movq
[esp
+ i3110_vnbtot
], mm4 ;
# store the sum
10205 prefetchw
[edi
+ ebx
*4] ;
# prefetch the 2nd faction to cache
10206 pfmul mm2
, mm0 ;
# mult by fs
10209 movq mm4
, [esp
+ i3110_dx2
] ;
# fetch dr
10210 movd mm5
, [esp
+ i3110_dz2
]
10211 pfmul mm4
, mm1 ;
# mult by fs
10215 movq mm0
, [esp
+ i3110_fix
]
10216 movd mm1
, [esp
+ i3110_fiz
]
10222 movq
[esp
+ i3110_fix
], mm0
10223 movd
[esp
+ i3110_fiz
], mm1
10226 movq mm0
, [edi
+ eax
*4]
10227 movd mm1
, [edi
+ eax
*4 + 8]
10228 movq mm6
, [edi
+ ebx
*4]
10229 movd mm7
, [edi
+ ebx
*4 + 8]
10236 movq
[edi
+ eax
*4], mm0
10237 movd
[edi
+ eax
*4 +8], mm1
10238 movq
[edi
+ ebx
*4], mm6
10239 movd
[edi
+ ebx
*4 + 8], mm7
10241 ;
# should we do one more iteration?
10242 sub dword ptr
[esp
+ i3110_innerk
], 2
10243 jl
.i3110_finish_vdwc_inner
10244 jmp
.i3110_unroll_vdwc_loop
10245 .i3110_finish_vdwc_inner:
10246 and dword ptr
[esp
+ i3110_innerk
], 1
10247 jnz
.i3110_single_vdwc_inner
10248 jmp
.i3110_updateouterdata_vdwc
10249 .i3110_single_vdwc_inner:
10250 ;
# a single j particle iteration here - compare with the unrolled code for comments.
10251 mov eax
, [esp
+ i3110_innerjjnr
]
10252 mov eax
, [eax
] ;
# eax=jnr offset
10254 mov ecx
, [ebp
+ i3110_charge
]
10255 movd mm5
, [esp
+ i3110_iq
]
10256 movd mm3
, [ecx
+ eax
*4]
10257 pfmul mm3
, mm5 ;
# mm3=qq
10259 mov esi
, [ebp
+ i3110_nbfp
]
10260 mov ecx
, [ebp
+ i3110_type
]
10261 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
10263 add edx
, [esp
+ i3110_ntia
] ;
# tja = ntia + 2*type
10264 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
10265 movq
[esp
+ i3110_c6
], mm5
10266 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
10267 movq
[esp
+ i3110_c12
], mm5
10270 mov esi
, [ebp
+ i3110_pos
]
10271 lea eax
, [eax
+ eax
*2]
10273 movq mm0
, [esp
+ i3110_ix
]
10274 movd mm1
, [esp
+ i3110_iz
]
10275 movq mm4
, [esi
+ eax
*4]
10276 movd mm5
, [esi
+ eax
*4 + 8]
10279 movq
[esp
+ i3110_dx1
], mm4
10281 movd
[esp
+ i3110_dz1
], mm5
10284 pfacc mm4
, mm5 ;
# mm4=rsq
10290 pfrcpit2 mm0
,mm2 ;
# mm0=invsqrt
10293 ;
# mm0 is invsqrt, and mm1 r.
10294 ;
# calculate potentials and scalar force
10295 pfmul mm1
, [esp
+ i3110_tsc
] ;
# mm1=rt
10297 movd
[esp
+ i3110_n1
], mm4
10299 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
10302 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
10305 mov edx
, [ebp
+ i3110_VFtab
]
10306 mov ecx
, [esp
+ i3110_n1
]
10308 ;
# load all the table values we need
10309 movd mm4
, [edx
+ ecx
*4]
10310 movd mm5
, [edx
+ ecx
*4 + 4]
10311 movd mm6
, [edx
+ ecx
*4 + 8]
10312 movd mm7
, [edx
+ ecx
*4 + 12]
10314 pfmul mm6
, mm1 ;
# mm6 = Geps
10315 pfmul mm7
, mm2 ;
# mm7 = Heps2
10318 pfadd mm5
, mm7 ;
# mm5 = Fp
10320 pfmul mm7
, [esp
+ i3110_two
] ;
# two*Heps2
10322 pfadd mm7
, mm5 ;
# mm7=FF
10324 pfmul mm5
, mm1 ;
# mm5=eps*Fp
10325 pfadd mm5
, mm4 ;
# mm5= VV
10327 pfmul mm5
, mm3 ;
# vcoul=qq*VV
10328 pfmul mm3
, mm7 ;
# fijC=FF*qq
10331 pfmul mm1
,mm1 ;
# mm1=invsq
10334 pfmul mm2
,mm1 ;
# mm2=rinvsix
10336 pfmul mm1
,mm1 ;
# mm1=rinvtwelve
10338 pfmul mm3
, [esp
+ i3110_tsc
]
10340 pfmul mm1
, [esp
+ i3110_c12
]
10342 pfmul mm2
, [esp
+ i3110_c6
]
10345 pfsub mm4
, mm2 ;
# mm4 = vnb12-vnb6
10347 pfmul mm2
, [esp
+ i3110_six
]
10348 pfmul mm1
, [esp
+ i3110_twelve
]
10351 pfmul mm1
, mm0 ;
# mm1= (12*vnb12-6*vnb6)*rinv11
10356 pfadd mm5
, [esp
+ i3110_vctot
] ;
# add the earlier value
10357 movq
[esp
+ i3110_vctot
], mm5 ;
# store the sum
10359 pfmul mm0
, mm1 ;
# mm0 is total fscal now
10361 ;
# spread fscalar to both positions
10363 ;
# calc vectorial force
10364 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
10365 movq mm2
, [esp
+ i3110_dx1
]
10366 movd mm3
, [esp
+ i3110_dz1
]
10369 pfadd mm4
, [esp
+ i3110_vnbtot
] ;
# add the earlier value
10370 movq
[esp
+ i3110_vnbtot
], mm4 ;
# store the sum
10375 ;
# update i particle force
10376 movq mm0
, [esp
+ i3110_fix
]
10377 movd mm1
, [esp
+ i3110_fiz
]
10380 movq
[esp
+ i3110_fix
], mm0
10381 movd
[esp
+ i3110_fiz
], mm1
10382 ;
# update j particle force
10383 movq mm0
, [edi
+ eax
*4]
10384 movd mm1
, [edi
+ eax
*4+ 8]
10387 movq
[edi
+ eax
*4], mm0
10388 movd
[edi
+ eax
*4 +8], mm1
10390 .i3110_updateouterdata_vdwc:
10391 mov ecx
, [esp
+ i3110_ii3
]
10393 movq mm6
, [edi
+ ecx
*4] ;
# increment i force
10394 movd mm7
, [edi
+ ecx
*4 + 8]
10395 pfadd mm6
, [esp
+ i3110_fix
]
10396 pfadd mm7
, [esp
+ i3110_fiz
]
10397 movq
[edi
+ ecx
*4], mm6
10398 movd
[edi
+ ecx
*4 +8], mm7
10400 mov ebx
, [ebp
+ i3110_fshift
] ;
# increment fshift force
10401 mov edx
, [esp
+ i3110_is3
]
10403 movq mm6
, [ebx
+ edx
*4]
10404 movd mm7
, [ebx
+ edx
*4 + 8]
10405 pfadd mm6
, [esp
+ i3110_fix
]
10406 pfadd mm7
, [esp
+ i3110_fiz
]
10407 movq
[ebx
+ edx
*4], mm6
10408 movd
[ebx
+ edx
*4 + 8], mm7
10410 ;
# loop back to mno
10411 dec dword ptr
[esp
+ i3110_nsvdwc
]
10413 jmp
.i3110_mno_vdwc
10415 mov ecx
, [esp
+ i3110_nscoul
]
10417 jnz
.i3110_mno_coul
10420 mov ebx
, [esp
+ i3110_solnr
]
10421 inc dword ptr
[esp
+ i3110_solnr
]
10422 mov edx
, [ebp
+ i3110_charge
]
10423 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
10424 pfmul mm2
, [ebp
+ i3110_facel
]
10425 punpckldq mm2
,mm2 ;
# spread to both halves
10426 movq
[esp
+ i3110_iq
], mm2 ;
# iq =facel*charge[ii]
10428 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
10429 mov eax
, [ebp
+ i3110_pos
] ;
# eax = base of pos[]
10430 mov
[esp
+ i3110_ii3
], ebx
10432 movq mm0
, [eax
+ ebx
*4]
10433 movd mm1
, [eax
+ ebx
*4 + 8]
10434 pfadd mm0
, [esp
+ i3110_shX
]
10435 pfadd mm1
, [esp
+ i3110_shZ
]
10436 movq
[esp
+ i3110_ix
], mm0
10437 movd
[esp
+ i3110_iz
], mm1
10441 movq
[esp
+ i3110_fix
], mm7
10442 movd
[esp
+ i3110_fiz
], mm7
10444 mov ecx
, [esp
+ i3110_innerjjnr0
]
10445 mov
[esp
+ i3110_innerjjnr
], ecx
10446 mov edx
, [esp
+ i3110_innerk0
]
10448 mov
[esp
+ i3110_innerk
], edx ;
# number of innerloop atoms
10449 jge
.i3110_unroll_coul_loop
10450 jmp
.i3110_finish_coul_inner
10451 .i3110_unroll_coul_loop:
10452 ;
# paired innerloop starts here
10453 mov ecx
, [esp
+ i3110_innerjjnr
] ;
# pointer to jjnr[k]
10455 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
10456 add dword ptr
[esp
+ i3110_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
10457 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
10459 mov ecx
, [ebp
+ i3110_charge
] ;
# base of charge[]
10460 movq mm5
, [esp
+ i3110_iq
]
10461 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
10462 punpckldq mm3
, [ecx
+ ebx
*4] ;
# move charge 2 to high part of mm3
10463 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
10465 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
10466 lea ebx
, [ebx
+ ebx
*2]
10468 mov esi
, [ebp
+ i3110_pos
]
10470 movq mm0
, [esp
+ i3110_ix
]
10471 movd mm1
, [esp
+ i3110_iz
]
10472 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
10473 movd mm5
, [esi
+ eax
*4 + 8]
10474 pfsubr mm4
,mm0 ;
# dr = ir - jr
10476 movq
[esp
+ i3110_dx1
], mm4 ;
# store dr
10477 movd
[esp
+ i3110_dz1
], mm5
10478 pfmul mm4
,mm4 ;
# square dx,dy,dz
10480 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
10481 pfacc mm4
, mm5 ;
# first rsq in lower mm4
10483 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
10484 movd mm7
, [esi
+ ebx
*4 + 8]
10486 pfsubr mm6
,mm0 ;
# dr = ir - jr
10488 movq
[esp
+ i3110_dx2
], mm6 ;
# store dr
10489 movd
[esp
+ i3110_dz2
], mm7
10490 pfmul mm6
,mm6 ;
# square dx,dy,dz
10492 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
10493 pfacc mm6
, mm7 ;
# second rsq in lower mm6
10495 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
10500 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
10501 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision.
10507 ;
# mm0 is invsqrt, and mm1 r.
10508 ;
# do potential and fscal
10509 pfmul mm1
, [esp
+ i3110_tsc
] ;
# mm1=rt
10511 movq
[esp
+ i3110_n1
], mm4
10513 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
10516 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
10518 mov edx
, [ebp
+ i3110_VFtab
]
10519 mov ecx
, [esp
+ i3110_n1
]
10522 ;
# load all the table values we need
10523 movd mm4
, [edx
+ ecx
*4]
10524 movd mm5
, [edx
+ ecx
*4 + 4]
10525 movd mm6
, [edx
+ ecx
*4 + 8]
10526 movd mm7
, [edx
+ ecx
*4 + 12]
10527 mov ecx
, [esp
+ i3110_n1
+ 4]
10529 punpckldq mm4
, [edx
+ ecx
*4]
10530 punpckldq mm5
, [edx
+ ecx
*4 + 4]
10531 punpckldq mm6
, [edx
+ ecx
*4 + 8]
10532 punpckldq mm7
, [edx
+ ecx
*4 + 12]
10534 pfmul mm6
, mm1 ;
# mm6 = Geps
10535 pfmul mm7
, mm2 ;
# mm7 = Heps2
10538 pfadd mm5
, mm7 ;
# mm5 = Fp
10540 pfmul mm7
, [esp
+ i3110_two
] ;
# two*Heps2
10542 pfadd mm7
, mm5 ;
# mm7=FF
10544 pfmul mm5
, mm1 ;
# mm5=eps*Fp
10545 pfadd mm5
, mm4 ;
# mm5= VV
10547 pfmul mm5
, mm3 ;
# vcoul=qq*VV
10548 pfmul mm3
, mm7 ;
# fijC=FF*qq
10550 ;
# at this point mm5 contains vcoul and mm3 fijC
10551 ;
# increment vcoul - then we can get rid of mm5
10553 pfadd mm5
, [esp
+ i3110_vctot
] ;
# add the earlier value
10554 movq
[esp
+ i3110_vctot
], mm5 ;
# store the sum
10556 ;
# change sign of mm3
10559 pfmul mm1
, [esp
+ i3110_tsc
]
10560 pfmul mm0
, mm1 ;
# mm0 is total fscal now
10562 prefetchw
[esp
+ i3110_dx1
] ;
# prefetch i forces to cache
10564 ;
# spread fscalar to both positions
10569 ;
# calc vector force
10570 prefetchw
[edi
+ eax
*4] ;
# prefetch the 1st faction to cache
10571 movq mm2
, [esp
+ i3110_dx1
] ;
# fetch dr
10572 movd mm3
, [esp
+ i3110_dz1
]
10574 prefetchw
[edi
+ ebx
*4] ;
# prefetch the 2nd faction to cache
10575 pfmul mm2
, mm0 ;
# mult by fs
10578 movq mm4
, [esp
+ i3110_dx2
] ;
# fetch dr
10579 movd mm5
, [esp
+ i3110_dz2
]
10580 pfmul mm4
, mm1 ;
# mult by fs
10584 movq mm0
, [esp
+ i3110_fix
]
10585 movd mm1
, [esp
+ i3110_fiz
]
10591 movq
[esp
+ i3110_fix
], mm0
10592 movd
[esp
+ i3110_fiz
], mm1
10595 movq mm0
, [edi
+ eax
*4]
10596 movd mm1
, [edi
+ eax
*4 + 8]
10597 movq mm6
, [edi
+ ebx
*4]
10598 movd mm7
, [edi
+ ebx
*4 + 8]
10605 movq
[edi
+ eax
*4], mm0
10606 movd
[edi
+ eax
*4 +8], mm1
10607 movq
[edi
+ ebx
*4], mm6
10608 movd
[edi
+ ebx
*4 + 8], mm7
10610 ;
# should we do one more iteration?
10611 sub dword ptr
[esp
+ i3110_innerk
], 2
10612 jl
.i3110_finish_coul_inner
10613 jmp
.i3110_unroll_coul_loop
10614 .i3110_finish_coul_inner:
10615 and dword ptr
[esp
+ i3110_innerk
], 1
10616 jnz
.i3110_single_coul_inner
10617 jmp
.i3110_updateouterdata_coul
10618 .i3110_single_coul_inner:
10619 ;
# a single j particle iteration here - compare with the unrolled code for comments.
10620 mov eax
, [esp
+ i3110_innerjjnr
]
10621 mov eax
, [eax
] ;
# eax=jnr offset
10623 mov ecx
, [ebp
+ i3110_charge
]
10624 movd mm5
, [esp
+ i3110_iq
]
10625 movd mm3
, [ecx
+ eax
*4]
10626 pfmul mm3
, mm5 ;
# mm3=qq
10628 mov esi
, [ebp
+ i3110_pos
]
10629 lea eax
, [eax
+ eax
*2]
10631 movq mm0
, [esp
+ i3110_ix
]
10632 movd mm1
, [esp
+ i3110_iz
]
10633 movq mm4
, [esi
+ eax
*4]
10634 movd mm5
, [esi
+ eax
*4 + 8]
10637 movq
[esp
+ i3110_dx1
], mm4
10639 movd
[esp
+ i3110_dz1
], mm5
10642 pfacc mm4
, mm5 ;
# mm4=rsq
10648 pfrcpit2 mm0
,mm2 ;
# mm0=invsqrt
10651 ;
# mm0 is invsqrt, and mm1 r.
10653 ;
# calculate potentials and scalar force
10654 pfmul mm1
, [esp
+ i3110_tsc
] ;
# mm1=rt
10656 movd
[esp
+ i3110_n1
], mm4
10658 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
10661 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
10664 mov edx
, [ebp
+ i3110_VFtab
]
10665 mov ecx
, [esp
+ i3110_n1
]
10667 ;
# load all the table values we need
10668 movd mm4
, [edx
+ ecx
*4]
10669 movd mm5
, [edx
+ ecx
*4 + 4]
10670 movd mm6
, [edx
+ ecx
*4 + 8]
10671 movd mm7
, [edx
+ ecx
*4 + 12]
10673 pfmul mm6
, mm1 ;
# mm6 = Geps
10674 pfmul mm7
, mm2 ;
# mm7 = Heps2
10677 pfadd mm5
, mm7 ;
# mm5 = Fp
10679 pfmul mm7
, [esp
+ i3110_two
] ;
# two*Heps2
10681 pfadd mm7
, mm5 ;
# mm7=FF
10683 pfmul mm5
, mm1 ;
# mm5=eps*Fp
10684 pfadd mm5
, mm4 ;
# mm5= VV
10686 pfmul mm5
, mm3 ;
# vcoul=qq*VV
10687 pfmul mm3
, mm7 ;
# fijC=FF*qq
10689 ;
# at this point mm5 contains vcoul and mm3 fijC
10690 ;
# increment vcoul - then we can get rid of mm5
10692 pfadd mm5
, [esp
+ i3110_vctot
] ;
# add the earlier value
10693 movq
[esp
+ i3110_vctot
], mm5 ;
# store the sum
10695 ;
# change sign of mm3
10698 pfmul mm0
, [esp
+ i3110_tsc
]
10699 pfmul mm0
, mm1 ;
# mm0 is total fscal now
10701 ;
# spread fscalar to both positions
10703 ;
# calc vectorial force
10704 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
10705 movq mm2
, [esp
+ i3110_dx1
]
10706 movd mm3
, [esp
+ i3110_dz1
]
10712 ;
# update i particle force
10713 movq mm0
, [esp
+ i3110_fix
]
10714 movd mm1
, [esp
+ i3110_fiz
]
10717 movq
[esp
+ i3110_fix
], mm0
10718 movd
[esp
+ i3110_fiz
], mm1
10719 ;
# update j particle force
10720 movq mm0
, [edi
+ eax
*4]
10721 movd mm1
, [edi
+ eax
*4+ 8]
10724 movq
[edi
+ eax
*4], mm0
10725 movd
[edi
+ eax
*4 +8], mm1
10727 .i3110_updateouterdata_coul:
10728 mov ecx
, [esp
+ i3110_ii3
]
10730 movq mm6
, [edi
+ ecx
*4] ;
# increment i force
10731 movd mm7
, [edi
+ ecx
*4 + 8]
10732 pfadd mm6
, [esp
+ i3110_fix
]
10733 pfadd mm7
, [esp
+ i3110_fiz
]
10734 movq
[edi
+ ecx
*4], mm6
10735 movd
[edi
+ ecx
*4 +8], mm7
10737 mov ebx
, [ebp
+ i3110_fshift
] ;
# increment fshift force
10738 mov edx
, [esp
+ i3110_is3
]
10740 movq mm6
, [ebx
+ edx
*4]
10741 movd mm7
, [ebx
+ edx
*4 + 8]
10742 pfadd mm6
, [esp
+ i3110_fix
]
10743 pfadd mm7
, [esp
+ i3110_fiz
]
10744 movq
[ebx
+ edx
*4], mm6
10745 movd
[ebx
+ edx
*4 + 8], mm7
10747 ;
# loop back to mno
10748 dec dword ptr
[esp
+ i3110_nscoul
]
10750 jmp
.i3110_mno_coul
10752 mov ecx
, [esp
+ i3110_nsvdw
]
10755 jmp
.i3110_last_mno
10757 mov ebx
, [esp
+ i3110_solnr
]
10758 inc dword ptr
[esp
+ i3110_solnr
]
10760 mov edx
, [ebp
+ i3110_type
]
10761 mov edx
, [edx
+ ebx
*4]
10762 imul edx
, [ebp
+ i3110_ntype
]
10764 mov
[esp
+ i3110_ntia
], edx
10766 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
10767 mov eax
, [ebp
+ i3110_pos
] ;
# eax = base of pos[]
10768 mov
[esp
+ i3110_ii3
], ebx
10770 movq mm0
, [eax
+ ebx
*4]
10771 movd mm1
, [eax
+ ebx
*4 + 8]
10772 pfadd mm0
, [esp
+ i3110_shX
]
10773 pfadd mm1
, [esp
+ i3110_shZ
]
10774 movq
[esp
+ i3110_ix
], mm0
10775 movd
[esp
+ i3110_iz
], mm1
10779 movq
[esp
+ i3110_fix
], mm7
10780 movd
[esp
+ i3110_fiz
], mm7
10782 mov ecx
, [esp
+ i3110_innerjjnr0
]
10783 mov
[esp
+ i3110_innerjjnr
], ecx
10784 mov edx
, [esp
+ i3110_innerk0
]
10786 mov
[esp
+ i3110_innerk
], edx ;
# number of innerloop atoms
10787 jge
.i3110_unroll_vdw_loop
10788 jmp
.i3110_finish_vdw_inner
10789 .i3110_unroll_vdw_loop:
10790 ;
# paired innerloop starts here
10791 mov ecx
, [esp
+ i3110_innerjjnr
] ;
# pointer to jjnr[k]
10793 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
10794 add dword ptr
[esp
+ i3110_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
10795 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
10797 mov ecx
, [ebp
+ i3110_type
]
10798 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
10799 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
10801 mov esi
, [ebp
+ i3110_nbfp
] ;
# base of nbfp
10804 add edx
, [esp
+ i3110_ntia
] ;
# tja = ntia + 2*type
10805 add ecx
, [esp
+ i3110_ntia
]
10807 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
10808 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
10810 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
10811 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
10812 movq
[esp
+ i3110_c6
], mm5
10813 movq
[esp
+ i3110_c12
], mm6
10815 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
10816 lea ebx
, [ebx
+ ebx
*2]
10818 mov esi
, [ebp
+ i3110_pos
]
10820 movq mm0
, [esp
+ i3110_ix
]
10821 movd mm1
, [esp
+ i3110_iz
]
10822 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
10823 movd mm5
, [esi
+ eax
*4 + 8]
10824 pfsubr mm4
,mm0 ;
# dr = ir - jr
10826 movq
[esp
+ i3110_dx1
], mm4 ;
# store dr
10827 movd
[esp
+ i3110_dz1
], mm5
10828 pfmul mm4
,mm4 ;
# square dx,dy,dz
10830 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
10831 pfacc mm4
, mm5 ;
# first rsq in lower mm4
10833 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
10834 movd mm7
, [esi
+ ebx
*4 + 8]
10836 pfsubr mm6
,mm0 ;
# dr = ir - jr
10838 movq
[esp
+ i3110_dx2
], mm6 ;
# store dr
10839 movd
[esp
+ i3110_dz2
], mm7
10840 pfmul mm6
,mm6 ;
# square dx,dy,dz
10842 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
10843 pfacc mm6
, mm7 ;
# second rsq in lower mm6
10845 pfrcp mm0
, mm4 ;
# lookup reciprocal seed
10849 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
10850 ;
# amd 3dnow N-R iteration to get full precision.
10853 ;
# mm4 now contains invsq,
10854 ;
# do potential and fscal
10858 pfmul mm4
, mm0 ;
# mm4=rinvsix
10860 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
10862 pfmul mm5
, [esp
+ i3110_c12
]
10863 pfmul mm4
, [esp
+ i3110_c6
]
10864 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
10867 pfmul mm4
, [esp
+ i3110_six
]
10869 pfmul mm5
, [esp
+ i3110_twelve
]
10871 pfmul mm0
, mm5 ;
# mm0 is total fscal now
10873 prefetchw
[esp
+ i3110_dx1
] ;
# prefetch i forces to cache
10875 ;
# spread fscalar to both positions
10880 ;
# calc vector force
10881 prefetchw
[edi
+ eax
*4] ;
# prefetch the 1st faction to cache
10882 movq mm2
, [esp
+ i3110_dx1
] ;
# fetch dr
10883 movd mm3
, [esp
+ i3110_dz1
]
10886 pfadd mm6
, [esp
+ i3110_vnbtot
] ;
# add the earlier value
10887 movq
[esp
+ i3110_vnbtot
], mm6 ;
# store the sum
10889 prefetchw
[edi
+ ebx
*4] ;
# prefetch the 2nd faction to cache
10890 pfmul mm2
, mm0 ;
# mult by fs
10893 movq mm4
, [esp
+ i3110_dx2
] ;
# fetch dr
10894 movd mm5
, [esp
+ i3110_dz2
]
10895 pfmul mm4
, mm1 ;
# mult by fs
10899 movq mm0
, [esp
+ i3110_fix
]
10900 movd mm1
, [esp
+ i3110_fiz
]
10906 movq
[esp
+ i3110_fix
], mm0
10907 movd
[esp
+ i3110_fiz
], mm1
10910 movq mm0
, [edi
+ eax
*4]
10911 movd mm1
, [edi
+ eax
*4 + 8]
10912 movq mm6
, [edi
+ ebx
*4]
10913 movd mm7
, [edi
+ ebx
*4 + 8]
10920 movq
[edi
+ eax
*4], mm0
10921 movd
[edi
+ eax
*4 +8], mm1
10922 movq
[edi
+ ebx
*4], mm6
10923 movd
[edi
+ ebx
*4 + 8], mm7
10925 ;
# should we do one more iteration?
10926 sub dword ptr
[esp
+ i3110_innerk
], 2
10927 jl
.i3110_finish_vdw_inner
10928 jmp
.i3110_unroll_vdw_loop
10929 .i3110_finish_vdw_inner:
10930 and dword ptr
[esp
+ i3110_innerk
], 1
10931 jnz
.i3110_single_vdw_inner
10932 jmp
.i3110_updateouterdata_vdw
10933 .i3110_single_vdw_inner:
10934 ;
# a single j particle iteration here - compare with the unrolled code for comments.
10935 mov eax
, [esp
+ i3110_innerjjnr
]
10936 mov eax
, [eax
] ;
# eax=jnr offset
10938 mov esi
, [ebp
+ i3110_nbfp
]
10939 mov ecx
, [ebp
+ i3110_type
]
10940 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
10942 add edx
, [esp
+ i3110_ntia
] ;
# tja = ntia + 2*type
10943 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
10944 movq
[esp
+ i3110_c6
], mm5
10945 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
10946 movq
[esp
+ i3110_c12
], mm5
10948 mov esi
, [ebp
+ i3110_pos
]
10949 lea eax
, [eax
+ eax
*2]
10951 movq mm0
, [esp
+ i3110_ix
]
10952 movd mm1
, [esp
+ i3110_iz
]
10953 movq mm4
, [esi
+ eax
*4]
10954 movd mm5
, [esi
+ eax
*4 + 8]
10957 movq
[esp
+ i3110_dx1
], mm4
10959 movd
[esp
+ i3110_dz1
], mm5
10962 pfacc mm4
, mm5 ;
# mm4=rsq
10966 pfrcpit2 mm4
,mm0 ;
# mm4=invsq
10967 ;
# calculate potentials and scalar force
10971 pfmul mm4
, mm0 ;
# mm4=rinvsix
10973 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
10975 pfmul mm5
, [esp
+ i3110_c12
]
10976 pfmul mm4
, [esp
+ i3110_c6
]
10977 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
10980 pfmul mm4
, [esp
+ i3110_six
]
10982 pfmul mm5
, [esp
+ i3110_twelve
]
10984 pfmul mm0
, mm5 ;
# mm0 is total fscal now
10987 pfadd mm6
, [esp
+ i3110_vnbtot
] ;
# add the earlier value
10988 movq
[esp
+ i3110_vnbtot
], mm6 ;
# store the sum
10990 ;
# spread fscalar to both positions
10992 ;
# calc vectorial force
10993 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
10994 movq mm2
, [esp
+ i3110_dx1
]
10995 movd mm3
, [esp
+ i3110_dz1
]
11000 ;
# update i particle force
11001 movq mm0
, [esp
+ i3110_fix
]
11002 movd mm1
, [esp
+ i3110_fiz
]
11005 movq
[esp
+ i3110_fix
], mm0
11006 movd
[esp
+ i3110_fiz
], mm1
11007 ;
# update j particle force
11008 movq mm0
, [edi
+ eax
*4]
11009 movd mm1
, [edi
+ eax
*4+ 8]
11012 movq
[edi
+ eax
*4], mm0
11013 movd
[edi
+ eax
*4 +8], mm1
11015 .i3110_updateouterdata_vdw:
11016 mov ecx
, [esp
+ i3110_ii3
]
11018 movq mm6
, [edi
+ ecx
*4] ;
# increment i force
11019 movd mm7
, [edi
+ ecx
*4 + 8]
11020 pfadd mm6
, [esp
+ i3110_fix
]
11021 pfadd mm7
, [esp
+ i3110_fiz
]
11022 movq
[edi
+ ecx
*4], mm6
11023 movd
[edi
+ ecx
*4 +8], mm7
11025 mov ebx
, [ebp
+ i3110_fshift
] ;
# increment fshift force
11026 mov edx
, [esp
+ i3110_is3
]
11028 movq mm6
, [ebx
+ edx
*4]
11029 movd mm7
, [ebx
+ edx
*4 + 8]
11030 pfadd mm6
, [esp
+ i3110_fix
]
11031 pfadd mm7
, [esp
+ i3110_fiz
]
11032 movq
[ebx
+ edx
*4], mm6
11033 movd
[ebx
+ edx
*4 + 8], mm7
11035 ;
# loop back to mno
11036 dec dword ptr
[esp
+ i3110_nsvdw
]
11041 mov edx
, [ebp
+ i3110_gid
] ;
# get group index for this i particle
11043 add dword ptr
[ebp
+ i3110_gid
], 4 ;
# advance pointer
11045 movq mm7
, [esp
+ i3110_vctot
]
11046 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
11048 mov eax
, [ebp
+ i3110_Vc
]
11049 movd mm6
, [eax
+ edx
*4]
11051 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
11053 movq mm7
, [esp
+ i3110_vnbtot
]
11054 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
11056 mov eax
, [ebp
+ i3110_Vnb
]
11057 movd mm6
, [eax
+ edx
*4]
11059 movd
[eax
+ edx
*4], mm6 ;
# increment vnb[gid]
11061 mov ecx
, [ebp
+ i3110_nri
]
11064 ;
# not last, iterate once more!
11065 mov
[ebp
+ i3110_nri
], ecx
;# inl3120_3dnow is exported under both the plain and the underscore-prefixed symbol name.
11082 .globl inl3120_3dnow
11083 .globl _inl3120_3dnow
;# Argument offsets, used with ebp as the base register by the code of this routine.
;# The slots are consecutive and 4 bytes apart, from nri (+8) to VFtab (+80).
11086 .equiv i3120_nri, 8
11087 .equiv i3120_iinr, 12
11088 .equiv i3120_jindex, 16
11089 .equiv i3120_jjnr, 20
11090 .equiv i3120_shift, 24
11091 .equiv i3120_shiftvec, 28
11092 .equiv i3120_fshift, 32
11093 .equiv i3120_gid, 36
11094 .equiv i3120_pos, 40
11095 .equiv i3120_faction, 44
11096 .equiv i3120_charge, 48
11097 .equiv i3120_facel, 52
11098 .equiv i3120_Vc, 56
11099 .equiv i3120_type, 60
11100 .equiv i3120_ntype, 64
11101 .equiv i3120_nbfp, 68
11102 .equiv i3120_Vnb, 72
11103 .equiv i3120_tabscale, 76
11104 .equiv i3120_VFtab, 80
11105 ;
# stack offsets for local variables
;# These are used with esp as the base register. The final slot, tmprsqH at 236,
;# is written with an 8-byte movq, so the locals end at byte 244, matching the
;# `sub esp, 244` that reserves the local stack space.
11106 .equiv i3120_is3, 0
11107 .equiv i3120_ii3, 4
11108 .equiv i3120_ixO, 8
11109 .equiv i3120_iyO, 12
11110 .equiv i3120_izO, 16
11111 .equiv i3120_ixH, 20
11112 .equiv i3120_iyH, 28
11113 .equiv i3120_izH, 36
11114 .equiv i3120_iqO, 44
11115 .equiv i3120_iqH, 52
11116 .equiv i3120_qqO, 60
11117 .equiv i3120_qqH, 68
11118 .equiv i3120_vctot, 76
11119 .equiv i3120_vnbtot, 84
11120 .equiv i3120_c6, 92
11121 .equiv i3120_c12, 100
11122 .equiv i3120_six, 108
11123 .equiv i3120_twelve, 116
11124 .equiv i3120_two, 124
11125 .equiv i3120_n1, 132
11126 .equiv i3120_tsc, 140
11127 .equiv i3120_ntia, 148
11128 .equiv i3120_innerjjnr, 156
11129 .equiv i3120_innerk, 160
11130 .equiv i3120_fixO, 164
11131 .equiv i3120_fiyO, 168
11132 .equiv i3120_fizO, 172
11133 .equiv i3120_fixH, 176
11134 .equiv i3120_fiyH, 184
11135 .equiv i3120_fizH, 192
11136 .equiv i3120_dxO, 200
11137 .equiv i3120_dyO, 204
11138 .equiv i3120_dzO, 208
11139 .equiv i3120_dxH, 212
11140 .equiv i3120_dyH, 220
11141 .equiv i3120_dzH, 228
11142 .equiv i3120_tmprsqH, 236
;# Setup: reserve the local stack area, then store the i-particle charge values,
;# the ntia type offset and the numeric constants into their stack slots.
11151 sub esp
, 244 ;
# local stack space
11154 mov ecx
, [ebp
+ i3120_iinr
] ;
# ecx = pointer into iinr[]
11155 mov ebx
, [ecx
] ;
# ebx=ii
11157 mov edx
, [ebp
+ i3120_charge
]
;# mm1 = 32-bit value read from the facel argument slot
11158 movd mm1
, [ebp
+ i3120_facel
]
11159 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii0]
11161 movq
[esp
+ i3120_iqO
], mm2 ;
# iqO = facel*charge[ii]
11163 movd mm2
, [edx
+ ebx
*4 + 4] ;
# mm2=charge[ii0+1]
11165 punpckldq mm2
,mm2 ;
# spread to both halves
11166 movq
[esp
+ i3120_iqH
], mm2 ;
# iqH = facel*charge[ii0+1]
11168 mov edx
, [ebp
+ i3120_type
]
;# edx = type[ii]
11169 mov edx
, [edx
+ ebx
*4]
11172 imul ecx
, [ebp
+ i3120_ntype
] ;
# ecx = ntia = 2*ntype*type[ii0]
11173 mov
[esp
+ i3120_ntia
], ecx
;# 64-bit constant loaded from the mm_twelve symbol (defined outside this block)
11177 movq mm5
, [mm_twelve
]
;# movq reads 8 bytes starting at the tabscale slot (so it also picks up the
;# following argument slot); the punpckldq below keeps only the low dword.
11178 movq mm6
, [ebp
+ i3120_tabscale
]
11179 punpckldq mm6
,mm6 ;
# spread to both halves
;# spill the constant registers to their stack slots (two, six, twelve, tsc)
11180 movq
[esp
+ i3120_two
], mm3
11181 movq
[esp
+ i3120_six
], mm4
11182 movq
[esp
+ i3120_twelve
], mm5
11183 movq
[esp
+ i3120_tsc
], mm6
;# Per-i-particle setup: load the shift vector, store the i coordinates,
;# clear the accumulators and work out the number of j particles.
11184 ;
# assume we have at least one i particle - start directly
11186 mov eax
, [ebp
+ i3120_shift
] ;
# eax = pointer into shift[]
11187 mov ebx
, [eax
] ;
# ebx=shift[n]
11188 add dword ptr
[ebp
+ i3120_shift
], 4 ;
# advance pointer one step
11190 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
11191 mov
[esp
+ i3120_is3
],ebx ;
# store is3
11193 mov eax
, [ebp
+ i3120_shiftvec
] ;
# eax = base of shiftvec[]
11195 movq mm5
, [eax
+ ebx
*4] ;
# move shX/shY to mm5 and shZ to mm6.
11196 movd mm6
, [eax
+ ebx
*4 + 8]
11200 punpckldq mm0
,mm0 ;
# also expand shX,Y,Z in mm0--mm2.
11204 mov ecx
, [ebp
+ i3120_iinr
] ;
# ecx = pointer into iinr[]
11205 add dword ptr
[ebp
+ i3120_iinr
], 4 ;
# advance pointer
11206 mov ebx
, [ecx
] ;
# ebx=ii
11208 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
11209 mov eax
, [ebp
+ i3120_pos
] ;
# eax = base of pos[]
11211 pfadd mm5
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
11212 movd mm7
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
11213 mov
[esp
+ i3120_ii3
], ebx ;
# (use mm7 as temp. storage for iz.)
11215 movq
[esp
+ i3120_ixO
], mm5
;# This 8-byte store at izO (offset 16) also covers the first 4 bytes of ixH
;# (offset 20); ixH is written afterwards, so the order of these stores matters.
11216 movq
[esp
+ i3120_izO
], mm6
;# the H coordinates sit at byte offsets 12..20 (H1) and 24..32 (H2) from the O entry
11218 movd mm3
, [eax
+ ebx
*4 + 12]
11219 movd mm4
, [eax
+ ebx
*4 + 16]
11220 movd mm5
, [eax
+ ebx
*4 + 20]
11221 punpckldq mm3
, [eax
+ ebx
*4 + 24]
11222 punpckldq mm4
, [eax
+ ebx
*4 + 28]
11223 punpckldq mm5
, [eax
+ ebx
*4 + 32] ;
# coords of H1 in low mm3-mm5, H2 in high
;# 64-bit stores: each H slot holds a pair of values
11228 movq
[esp
+ i3120_ixH
], mm0
11229 movq
[esp
+ i3120_iyH
], mm1
11230 movq
[esp
+ i3120_izH
], mm2
11232 ;
# clear vctot and i forces
11234 movq
[esp
+ i3120_vctot
], mm7
11235 movq
[esp
+ i3120_vnbtot
], mm7
11236 movq
[esp
+ i3120_fixO
], mm7
11237 movd
[esp
+ i3120_fizO
], mm7
11238 movq
[esp
+ i3120_fixH
], mm7
11239 movq
[esp
+ i3120_fiyH
], mm7
11240 movq
[esp
+ i3120_fizH
], mm7
11242 mov eax
, [ebp
+ i3120_jindex
]
11243 mov ecx
, [eax
] ;
# jindex[n]
11244 mov edx
, [eax
+ 4] ;
# jindex[n+1]
;# step the jindex pointer argument forward by one 4-byte entry
11245 add dword ptr
[ebp
+ i3120_jindex
], 4
11246 sub edx
, ecx ;
# number of innerloop atoms
11247 mov
[esp
+ i3120_innerk
], edx
;# esi = pos and edi = faction: base pointers for the [esi + ...] / [edi + ...]
;# accesses in the inner loop
11249 mov esi
, [ebp
+ i3120_pos
]
11250 mov edi
, [ebp
+ i3120_faction
]
11251 mov eax
, [ebp
+ i3120_jjnr
]
11254 mov
[esp
+ i3120_innerjjnr
], eax ;
# pointer to jjnr[nj0]
;# Inner loop body: interaction of one j particle with the i oxygen and the two
;# i hydrogens. The oxygen path uses the scalar qqO slot; the hydrogen path works
;# on 64-bit pairs.
11256 ;
# a single j particle iteration here - compare with the unrolled code for comments.
11257 mov eax
, [esp
+ i3120_innerjjnr
]
11258 mov eax
, [eax
] ;
# eax=jnr offset
11259 add dword ptr
[esp
+ i3120_innerjjnr
], 4 ;
# advance pointer
11260 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
11262 mov ecx
, [ebp
+ i3120_charge
]
;# mm7 = charge[jnr]
11263 movd mm7
, [ecx
+ eax
*4]
11266 pfmul mm6
, [esp
+ i3120_iqO
]
11267 pfmul mm7
, [esp
+ i3120_iqH
] ;
# mm6=qqO, mm7=qqH
11268 movd
[esp
+ i3120_qqO
], mm6
11269 movq
[esp
+ i3120_qqH
], mm7
11271 mov ecx
, [ebp
+ i3120_type
]
11272 mov edx
, [ecx
+ eax
*4] ;
# type [jnr]
11273 mov ecx
, [ebp
+ i3120_nbfp
]
11275 add edx
, [esp
+ i3120_ntia
] ;
# tja = ntia + 2*type
11276 movd mm5
, [ecx
+ edx
*4] ;
# mm5 = 1st c6
11277 movq
[esp
+ i3120_c6
], mm5
11278 movd mm5
, [ecx
+ edx
*4 + 4] ;
# mm5 = 1st c12
11279 movq
[esp
+ i3120_c12
], mm5
;# eax = 3*jnr, used below as the index for the [esi + eax*4] and [edi + eax*4] accesses
11281 lea eax
, [eax
+ eax
*2]
11283 movq mm0
, [esi
+ eax
*4]
11284 movd mm1
, [esi
+ eax
*4 + 8]
11285 ;
# copy & expand to mm2-mm4 for the H interactions
;# pfsubr is a reversed subtract: the register receives (memory operand - register)
11293 pfsubr mm0
, [esp
+ i3120_ixO
]
11294 pfsubr mm1
, [esp
+ i3120_izO
]
11296 movq
[esp
+ i3120_dxO
], mm0
11298 movd
[esp
+ i3120_dzO
], mm1
11301 pfadd mm0
, mm1 ;
# mm0=rsqO
11305 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
11306 pfsubr mm2
, [esp
+ i3120_ixH
]
11307 pfsubr mm3
, [esp
+ i3120_iyH
]
11308 pfsubr mm4
, [esp
+ i3120_izH
] ;
# mm2-mm4 is dxH-dzH
11310 movq
[esp
+ i3120_dxH
], mm2
11311 movq
[esp
+ i3120_dyH
], mm3
11312 movq
[esp
+ i3120_dzH
], mm4
11318 pfadd mm3
,mm4 ;
# mm3=rsqH
;# park rsqH on the stack; it is reloaded into mm0 for the hydrogen part below
11319 movq
[esp
+ i3120_tmprsqH
], mm3
11326 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
11328 pfmul mm0
, mm1 ;
# mm0=r
11330 pfmul mm0
, [esp
+ i3120_tsc
]
11332 movd
[esp
+ i3120_n1
], mm4
11334 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
11336 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
11339 mov edx
, [ebp
+ i3120_VFtab
]
;# ecx = the 32-bit value stored to n1 above, used as the table index
11340 mov ecx
, [esp
+ i3120_n1
]
11342 ;
# load all values we need
11343 movd mm4
, [edx
+ ecx
*4]
11344 movd mm5
, [edx
+ ecx
*4 + 4]
11345 movd mm6
, [edx
+ ecx
*4 + 8]
11346 movd mm7
, [edx
+ ecx
*4 + 12]
11348 pfmul mm6
, mm0 ;
# mm6 = Geps
11349 pfmul mm7
, mm2 ;
# mm7 = Heps2
11352 pfadd mm5
, mm7 ;
# mm5 = Fp
11354 pfmul mm7
, [esp
+ i3120_two
] ;
# two*Heps2
11356 pfadd mm7
, mm5 ;
# mm7=FF
11358 pfmul mm5
, mm0 ;
# mm5=eps*Fp
11359 pfadd mm5
, mm4 ;
# mm5= VV
11361 pfmul mm5
, [esp
+ i3120_qqO
] ;
# vcoul=qq*VV
11362 pfmul mm7
, [esp
+ i3120_qqO
] ;
# fijC=qq*FF
11363 ;
# update vctot directly, use mm3 for fscal sum.
11364 pfadd mm5
, [esp
+ i3120_vctot
]
11365 movq
[esp
+ i3120_vctot
], mm5
11368 pfmul mm3
, [esp
+ i3120_tsc
]
11370 ;
# nontabulated LJ - mm1 is invsqrt. - keep mm1!
11372 pfmul mm0
, mm0 ;
# mm0 is invsq
11375 pfmul mm2
, mm0 ;
# mm2 = rinvsix
11377 pfmul mm4
, mm4 ;
# mm4=rinvtwelve
11379 pfmul mm4
, [esp
+ i3120_c12
]
11380 pfmul mm2
, [esp
+ i3120_c6
]
11382 pfsub mm5
, mm2 ;
# mm5=vnb12-vnb6
11384 pfmul mm2
, [esp
+ i3120_six
]
11385 pfmul mm4
, [esp
+ i3120_twelve
]
11387 pfmul mm4
, mm1 ;
# mm4=(12*vnb12-6*vnb6)*rinv11
11390 pfmul mm3
, mm1 ;
# mm3 is total fscal (for the oxygen) now
11393 pfadd mm5
, [esp
+ i3120_vnbtot
] ;
# add the earlier value
11394 movq
[esp
+ i3120_vnbtot
], mm5 ;
# store the sum
11396 ;
# Ready with the oxygen - potential is updated, fscal is in mm3.
11397 ;
# now do the two hydrogens.
11398 movq mm0
, [esp
+ i3120_tmprsqH
] ;
# mm0=rsqH
11404 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
11409 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
11411 pfmul mm0
,mm1 ;
# mm0=r
11412 pfmul mm0
, [esp
+ i3120_tsc
]
;# 64-bit store this time: n1 and n1+4 receive one dword each
11414 movq
[esp
+ i3120_n1
], mm4
11416 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
11418 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
11421 mov edx
, [ebp
+ i3120_VFtab
]
11422 mov ecx
, [esp
+ i3120_n1
]
11424 ;
# load all values we need
11425 movd mm4
, [edx
+ ecx
*4]
11426 movd mm5
, [edx
+ ecx
*4 + 4]
11427 movd mm6
, [edx
+ ecx
*4 + 8]
11428 movd mm7
, [edx
+ ecx
*4 + 12]
;# ecx = second index, taken from the high dword of the 64-bit n1 store above
11429 mov ecx
, [esp
+ i3120_n1
+ 4]
;# merge the second set of table values into the high halves of mm4-mm7
11431 punpckldq mm4
, [edx
+ ecx
*4]
11432 punpckldq mm5
, [edx
+ ecx
*4 + 4]
11433 punpckldq mm6
, [edx
+ ecx
*4 + 8]
11434 punpckldq mm7
, [edx
+ ecx
*4 + 12]
11436 pfmul mm6
, mm0 ;
# mm6 = Geps
11437 pfmul mm7
, mm2 ;
# mm7 = Heps2
11440 pfadd mm5
, mm7 ;
# mm5 = Fp
11442 pfmul mm7
, [esp
+ i3120_two
] ;
# two*Heps2
11444 pfadd mm7
, mm5 ;
# mm7=FF
11446 pfmul mm5
, mm0 ;
# mm5=eps*Fp
11447 pfadd mm5
, mm4 ;
# mm5= VV
11449 pfmul mm5
, [esp
+ i3120_qqH
] ;
# vcoul=qq*VV
11450 pfmul mm7
, [esp
+ i3120_qqH
] ;
# fijC=qq*FF
11452 pfadd mm5
, [esp
+ i3120_vctot
]
11453 movq
[esp
+ i3120_vctot
], mm5
11455 ;
# change sign of fijC and multiply by rinv
11458 pfmul mm4
, [esp
+ i3120_tsc
]
11459 pfmul mm4
, mm1 ;
# mm4 is total fscal (for the hydrogens) now
11461 ;
# spread oxygen fscalar to both positions
11463 ;
# calc vectorial force for O
11464 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
11465 movq mm0
, [esp
+ i3120_dxO
]
11466 movd mm1
, [esp
+ i3120_dzO
]
11470 ;
# calc vectorial force for H's
11471 movq mm5
, [esp
+ i3120_dxH
]
11472 movq mm6
, [esp
+ i3120_dyH
]
11473 movq mm7
, [esp
+ i3120_dzH
]
11478 ;
# update iO particle force
11479 movq mm2
, [esp
+ i3120_fixO
]
11480 movd mm3
, [esp
+ i3120_fizO
]
11483 movq
[esp
+ i3120_fixO
], mm2
11484 movd
[esp
+ i3120_fizO
], mm3
11486 ;
# update iH forces
11487 movq mm2
, [esp
+ i3120_fixH
]
11488 movq mm3
, [esp
+ i3120_fiyH
]
11489 movq mm4
, [esp
+ i3120_fizH
]
11493 movq
[esp
+ i3120_fixH
], mm2
11494 movq
[esp
+ i3120_fiyH
], mm3
11495 movq
[esp
+ i3120_fizH
], mm4
11497 ;
# pack j forces from H in the same form as the oxygen force.
11498 pfacc mm5
, mm6 ;
# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
11499 pfacc mm7
, mm7 ;
# mm7(l)=fjz(H1+ h2)
11501 pfadd mm0
, mm5 ;
# add up total force on j particle.
11504 ;
# update j particle force
11505 movq mm2
, [edi
+ eax
*4]
11506 movd mm3
, [edi
+ eax
*4 + 8]
11509 movq
[edi
+ eax
*4], mm2
11510 movd
[edi
+ eax
*4 +8], mm3
11512 ;
# done - one more?
;# innerk is decremented once per j particle; leave the loop when it reaches zero
11513 dec dword ptr
[esp
+ i3120_innerk
]
11514 jz
.i3120_updateouterdata
11515 jmp
.i3120_inner_loop
;# After the inner loop: add the accumulated i-particle forces to the force array
;# and to fshift, then update the Vc and Vnb entries selected by the group index.
11516 .i3120_updateouterdata:
11517 mov ecx
, [esp
+ i3120_ii3
]
11519 movq mm6
, [edi
+ ecx
*4] ;
# increment iO force
11520 movd mm7
, [edi
+ ecx
*4 + 8]
11521 pfadd mm6
, [esp
+ i3120_fixO
]
11522 pfadd mm7
, [esp
+ i3120_fizO
]
11523 movq
[edi
+ ecx
*4], mm6
11524 movd
[edi
+ ecx
*4 +8], mm7
;# reload the packed hydrogen force accumulators (two values per 64-bit slot)
11526 movq mm0
, [esp
+ i3120_fixH
]
11527 movq mm3
, [esp
+ i3120_fiyH
]
11528 movq mm1
, [esp
+ i3120_fizH
]
11530 punpckldq mm0
, mm3 ;
# mm0(l)=fxH1, mm0(h)=fyH1
11531 punpckhdq mm2
, mm3 ;
# mm2(l)=fxH2, mm2(h)=fyH2
11537 movq mm6
, [edi
+ ecx
*4 + 12] ;
# increment iH1 force
11538 movd mm7
, [edi
+ ecx
*4 + 20]
11541 movq
[edi
+ ecx
*4 + 12], mm6
11542 movd
[edi
+ ecx
*4 + 20], mm7
11544 movq mm6
, [edi
+ ecx
*4 + 24] ;
# increment iH2 force
11545 movd mm7
, [edi
+ ecx
*4 + 32]
11548 movq
[edi
+ ecx
*4 + 24], mm6
11549 movd
[edi
+ ecx
*4 + 32], mm7
11552 mov ebx
, [ebp
+ i3120_fshift
] ;
# increment fshift force
11553 mov edx
, [esp
+ i3120_is3
]
11555 movq mm6
, [ebx
+ edx
*4]
11556 movd mm7
, [ebx
+ edx
*4 + 8]
11557 pfadd mm6
, [esp
+ i3120_fixO
]
11558 pfadd mm7
, [esp
+ i3120_fizO
]
11563 movq
[ebx
+ edx
*4], mm6
11564 movd
[ebx
+ edx
*4 + 8], mm7
11566 mov edx
, [ebp
+ i3120_gid
] ;
# get group index for this i particle
11568 add dword ptr
[ebp
+ i3120_gid
], 4 ;
# advance pointer
11570 movq mm7
, [esp
+ i3120_vctot
]
11571 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
11573 mov eax
, [ebp
+ i3120_Vc
]
11574 movd mm6
, [eax
+ edx
*4]
11576 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
11578 movq mm7
, [esp
+ i3120_vnbtot
]
11579 pfacc mm7
,mm7 ;
# same for Vnb
11581 mov eax
, [ebp
+ i3120_Vnb
]
11582 movd mm6
, [eax
+ edx
*4]
11584 movd
[eax
+ edx
*4], mm6 ;
# increment vnb[gid]
;# one i particle finished: decrement the count held in the nri argument slot
11586 dec dword ptr
[ebp
+ i3120_nri
]
11588 ;
# not last, iterate once more!
11606 .globl inl3130_3dnow
11607 .globl _inl3130_3dnow
11610 .equiv i3130_nri, 8
11611 .equiv i3130_iinr, 12
11612 .equiv i3130_jindex, 16
11613 .equiv i3130_jjnr, 20
11614 .equiv i3130_shift, 24
11615 .equiv i3130_shiftvec, 28
11616 .equiv i3130_fshift, 32
11617 .equiv i3130_gid, 36
11618 .equiv i3130_pos, 40
11619 .equiv i3130_faction, 44
11620 .equiv i3130_charge, 48
11621 .equiv i3130_facel, 52
11622 .equiv i3130_Vc, 56
11623 .equiv i3130_type, 60
11624 .equiv i3130_ntype, 64
11625 .equiv i3130_nbfp, 68
11626 .equiv i3130_Vnb, 72
11627 .equiv i3130_tabscale, 76
11628 .equiv i3130_VFtab, 80
11629 ;
# stack offsets for local variables
11630 .equiv i3130_is3, 0
11631 .equiv i3130_ii3, 4
11632 .equiv i3130_ixO, 8
11633 .equiv i3130_iyO, 12
11634 .equiv i3130_izO, 16
11635 .equiv i3130_ixH, 20
11636 .equiv i3130_iyH, 28
11637 .equiv i3130_izH, 36
11638 .equiv i3130_qqOO, 44
11639 .equiv i3130_qqOH, 52
11640 .equiv i3130_qqHH, 60
11641 .equiv i3130_c6, 68
11642 .equiv i3130_c12, 76
11643 .equiv i3130_six, 84
11644 .equiv i3130_twelve, 92
11645 .equiv i3130_two, 100
11646 .equiv i3130_n1, 108
11647 .equiv i3130_tsc, 116
11648 .equiv i3130_vctot, 124
11649 .equiv i3130_vnbtot, 132
11650 .equiv i3130_innerjjnr, 140
11651 .equiv i3130_innerk, 144
11652 .equiv i3130_fixO, 148
11653 .equiv i3130_fiyO, 152
11654 .equiv i3130_fizO, 156
11655 .equiv i3130_fixH, 160
11656 .equiv i3130_fiyH, 168
11657 .equiv i3130_fizH, 176
11658 .equiv i3130_dxO, 184
11659 .equiv i3130_dyO, 188
11660 .equiv i3130_dzO, 192
11661 .equiv i3130_dxH, 200
11662 .equiv i3130_dyH, 208
11663 .equiv i3130_dzH, 216
11664 .equiv i3130_tmprsqH, 224
11673 sub esp
, 232 ;
# local stack space
11675 ;
# assume we have at least one i particle - start directly
11677 mov ecx
, [ebp
+ i3130_iinr
] ;
# ecx = pointer into iinr[]
11678 mov ebx
, [ecx
] ;
# ebx=ii
11680 mov edx
, [ebp
+ i3130_charge
]
11681 movd mm1
, [ebp
+ i3130_facel
] ;
# mm1=facel
11682 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii0] (O)
11683 movd mm3
, [edx
+ ebx
*4 + 4] ;
# mm3=charge[ii0+1] (H)
11689 pfmul mm4
, mm2 ;
# mm4=qqOO*facel
11690 pfmul mm5
, mm3 ;
# mm5=qqOH*facel
11691 pfmul mm6
, mm3 ;
# mm6=qqHH*facel
11692 punpckldq mm5
,mm5 ;
# spread to both halves
11693 punpckldq mm6
,mm6 ;
# spread to both halves
11694 movq
[esp
+ i3130_qqOO
], mm4
11695 movq
[esp
+ i3130_qqOH
], mm5
11696 movq
[esp
+ i3130_qqHH
], mm6
11697 mov edx
, [ebp
+ i3130_type
]
11698 mov ecx
, [edx
+ ebx
*4]
11701 imul ecx
, [ebp
+ i3130_ntype
]
11703 mov eax
, [ebp
+ i3130_nbfp
]
11704 movd mm0
, [eax
+ edx
*4]
11705 movd mm1
, [eax
+ edx
*4 + 4]
11706 movq
[esp
+ i3130_c6
], mm0
11707 movq
[esp
+ i3130_c12
], mm1
11710 movq mm4
, [mm_twelve
]
11711 movq
[esp
+ i3130_two
], mm2
11712 movq
[esp
+ i3130_six
], mm3
11713 movq
[esp
+ i3130_twelve
], mm4
11714 movd mm5
, [ebp
+ i3130_tabscale
]
11716 movq
[esp
+ i3130_tsc
], mm5
11718 mov eax
, [ebp
+ i3130_shift
] ;
# eax = pointer into shift[]
11719 mov ebx
, [eax
] ;
# ebx=shift[n]
11720 add dword ptr
[ebp
+ i3130_shift
], 4 ;
# advance pointer one step
11722 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
11723 mov
[esp
+ i3130_is3
],ebx ;
# store is3
11725 mov eax
, [ebp
+ i3130_shiftvec
] ;
# eax = base of shiftvec[]
11727 movq mm5
, [eax
+ ebx
*4] ;
# move shX/shY to mm5 and shZ to mm6.
11728 movd mm6
, [eax
+ ebx
*4 + 8]
11732 punpckldq mm0
,mm0 ;
# also expand shX,Y,Z in mm0--mm2.
11736 mov ecx
, [ebp
+ i3130_iinr
] ;
# ecx = pointer into iinr[]
11737 add dword ptr
[ebp
+ i3130_iinr
], 4 ;
# advance pointer
11738 mov ebx
, [ecx
] ;
# ebx=ii
11740 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
11741 mov eax
, [ebp
+ i3130_pos
] ;
# eax = base of pos[]
11743 pfadd mm5
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
11744 movd mm7
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
11745 mov
[esp
+ i3130_ii3
], ebx ;
# (use mm7 as temp. storage for iz.)
11747 movq
[esp
+ i3130_ixO
], mm5
11748 movq
[esp
+ i3130_izO
], mm6
11750 movd mm3
, [eax
+ ebx
*4 + 12]
11751 movd mm4
, [eax
+ ebx
*4 + 16]
11752 movd mm5
, [eax
+ ebx
*4 + 20]
11753 punpckldq mm3
, [eax
+ ebx
*4 + 24]
11754 punpckldq mm4
, [eax
+ ebx
*4 + 28]
11755 punpckldq mm5
, [eax
+ ebx
*4 + 32] ;
# coords of H1 in low mm3-mm5, H2 in high
11760 movq
[esp
+ i3130_ixH
], mm0
11761 movq
[esp
+ i3130_iyH
], mm1
11762 movq
[esp
+ i3130_izH
], mm2
11764 ;
# clear vctot and i forces
11766 movq
[esp
+ i3130_vctot
], mm7
11767 movq
[esp
+ i3130_vnbtot
], mm7
11768 movq
[esp
+ i3130_fixO
], mm7
11769 movq
[esp
+ i3130_fizO
], mm7
11770 movq
[esp
+ i3130_fixH
], mm7
11771 movq
[esp
+ i3130_fiyH
], mm7
11772 movq
[esp
+ i3130_fizH
], mm7
11774 mov eax
, [ebp
+ i3130_jindex
]
11775 mov ecx
, [eax
] ;
# jindex[n]
11776 mov edx
, [eax
+ 4] ;
# jindex[n+1]
11777 add dword ptr
[ebp
+ i3130_jindex
], 4
11778 sub edx
, ecx ;
# number of innerloop atoms
11779 mov
[esp
+ i3130_innerk
], edx ;
# number of innerloop atoms
11781 mov esi
, [ebp
+ i3130_pos
]
11782 mov edi
, [ebp
+ i3130_faction
]
11783 mov eax
, [ebp
+ i3130_jjnr
]
11786 mov
[esp
+ i3130_innerjjnr
], eax ;
# pointer to jjnr[nj0]
11788 ;
# a single j particle iteration here - compare with the unrolled code for comments.
11789 mov eax
, [esp
+ i3130_innerjjnr
]
11790 mov eax
, [eax
] ;
# eax=jnr offset
11791 add dword ptr
[esp
+ i3130_innerjjnr
], 4 ;
# advance pointer
11793 lea eax
, [eax
+ eax
*2]
11795 movq mm0
, [esi
+ eax
*4]
11796 movd mm1
, [esi
+ eax
*4 + 8]
11797 ;
# copy & expand to mm2-mm4 for the H interactions
11805 pfsubr mm0
, [esp
+ i3130_ixO
]
11806 pfsubr mm1
, [esp
+ i3130_izO
]
11808 movq
[esp
+ i3130_dxO
], mm0
11810 movd
[esp
+ i3130_dzO
], mm1
11813 pfadd mm0
, mm1 ;
# mm0=rsqO
11817 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
11818 pfsubr mm2
, [esp
+ i3130_ixH
]
11819 pfsubr mm3
, [esp
+ i3130_iyH
]
11820 pfsubr mm4
, [esp
+ i3130_izH
] ;
# mm2-mm4 is dxH-dzH
11822 movq
[esp
+ i3130_dxH
], mm2
11823 movq
[esp
+ i3130_dyH
], mm3
11824 movq
[esp
+ i3130_dzH
], mm4
11830 pfadd mm3
,mm4 ;
# mm3=rsqH
11831 movq
[esp
+ i3130_tmprsqH
], mm3
11838 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
11839 pfmul mm0
, mm1 ;
# mm0=r
11841 pfmul mm0
, [esp
+ i3130_tsc
]
11843 movd
[esp
+ i3130_n1
], mm4
11845 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
11847 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
11850 mov edx
, [ebp
+ i3130_VFtab
]
11851 mov ecx
, [esp
+ i3130_n1
]
11854 ;
# load all values we need
11855 movd mm4
, [edx
+ ecx
*4]
11856 movd mm5
, [edx
+ ecx
*4 + 4]
11857 movd mm6
, [edx
+ ecx
*4 + 8]
11858 movd mm7
, [edx
+ ecx
*4 + 12]
11860 pfmul mm6
, mm0 ;
# mm6 = Geps
11861 pfmul mm7
, mm2 ;
# mm7 = Heps2
11864 pfadd mm5
, mm7 ;
# mm5 = Fp
11866 pfmul mm7
, [esp
+ i3130_two
] ;
# two*Heps2
11868 pfadd mm7
, mm5 ;
# mm7=FF
11870 pfmul mm5
, mm0 ;
# mm5=eps*Fp
11871 pfadd mm5
, mm4 ;
# mm5= VV
11873 pfmul mm5
, [esp
+ i3130_qqOO
] ;
# vcoul=qq*VV
11874 pfmul mm7
, [esp
+ i3130_qqOO
] ;
# fijC=qq*FF
11876 ;
# update vctot directly, use mm3 for fscal sum.
11877 pfadd mm5
, [esp
+ i3130_vctot
]
11878 movq
[esp
+ i3130_vctot
], mm5
11880 pfmul mm3
, [esp
+ i3130_tsc
]
11888 pfmul mm5
,mm5 ;
# mm4=rinvsix, mm5=rinvtwelve
11890 pfmul mm4
, [esp
+ i3130_c6
]
11891 pfmul mm5
, [esp
+ i3130_c12
]
11895 pfmul mm4
, [esp
+ i3130_six
]
11896 pfmul mm5
, [esp
+ i3130_twelve
]
11901 pfmul mm3
, mm1 ;
# mm3 is total fscal (for the oxygen) now
11904 pfadd mm6
, [esp
+ i3130_vnbtot
] ;
# add the earlier value
11905 movq
[esp
+ i3130_vnbtot
], mm6 ;
# store the sum
11907 ;
# Ready with the oxygen - potential is updated, fscal is in mm3.
11908 ;
# time for hydrogens!
11910 movq mm0
, [esp
+ i3130_tmprsqH
]
11916 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
11921 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
11923 pfmul mm0
,mm1 ;
# mm0=r
11924 pfmul mm0
, [esp
+ i3130_tsc
]
11926 movq
[esp
+ i3130_n1
], mm4
11928 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
11930 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
11933 mov edx
, [ebp
+ i3130_VFtab
]
11934 mov ecx
, [esp
+ i3130_n1
]
11936 ;
# load all values we need
11937 movd mm4
, [edx
+ ecx
*4]
11938 movd mm5
, [edx
+ ecx
*4 + 4]
11939 movd mm6
, [edx
+ ecx
*4 + 8]
11940 movd mm7
, [edx
+ ecx
*4 + 12]
11941 mov ecx
, [esp
+ i3130_n1
+ 4]
11943 punpckldq mm4
, [edx
+ ecx
*4]
11944 punpckldq mm5
, [edx
+ ecx
*4 + 4]
11945 punpckldq mm6
, [edx
+ ecx
*4 + 8]
11946 punpckldq mm7
, [edx
+ ecx
*4 + 12]
11948 pfmul mm6
, mm0 ;
# mm6 = Geps
11949 pfmul mm7
, mm2 ;
# mm7 = Heps2
11952 pfadd mm5
, mm7 ;
# mm5 = Fp
11954 pfmul mm7
, [esp
+ i3130_two
] ;
# two*Heps2
11956 pfadd mm7
, mm5 ;
# mm7=FF
11958 pfmul mm5
, mm0 ;
# mm5=eps*Fp
11959 pfadd mm5
, mm4 ;
# mm5= VV
11961 pfmul mm5
, [esp
+ i3130_qqOH
] ;
# vcoul=qq*VV
11962 pfmul mm7
, [esp
+ i3130_qqOH
] ;
# fijC=qq*FF
11964 pfadd mm5
, [esp
+ i3130_vctot
]
11965 movq
[esp
+ i3130_vctot
], mm5
11967 ;
# change sign of fijC and multiply by rinv
11970 pfmul mm4
, [esp
+ i3130_tsc
]
11971 pfmul mm4
, mm1 ;
# mm4 is total fscal (for the hydrogens) now
11973 ;
# spread oxygen fscalar to both positions
11975 ;
# calc vectorial force for O
11976 movq mm0
, [esp
+ i3130_dxO
]
11977 movd mm1
, [esp
+ i3130_dzO
]
11981 ;
# calc vectorial force for H's
11982 movq mm5
, [esp
+ i3130_dxH
]
11983 movq mm6
, [esp
+ i3130_dyH
]
11984 movq mm7
, [esp
+ i3130_dzH
]
11989 ;
# update iO particle force
11990 movq mm2
, [esp
+ i3130_fixO
]
11991 movd mm3
, [esp
+ i3130_fizO
]
11994 movq
[esp
+ i3130_fixO
], mm2
11995 movd
[esp
+ i3130_fizO
], mm3
11997 ;
# update iH forces
11998 movq mm2
, [esp
+ i3130_fixH
]
11999 movq mm3
, [esp
+ i3130_fiyH
]
12000 movq mm4
, [esp
+ i3130_fizH
]
12004 movq
[esp
+ i3130_fixH
], mm2
12005 movq
[esp
+ i3130_fiyH
], mm3
12006 movq
[esp
+ i3130_fizH
], mm4
12008 ;
# pack j forces from H in the same form as the oxygen force.
12009 pfacc mm5
, mm6 ;
# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
12010 pfacc mm7
, mm7 ;
# mm7(l)=fjz(H1+ h2)
12012 pfadd mm0
, mm5 ;
# add up total force on j particle.
12015 ;
# update j particle force
12016 movq mm2
, [edi
+ eax
*4]
12017 movd mm3
, [edi
+ eax
*4 + 8]
12020 movq
[edi
+ eax
*4], mm2
12021 movd
[edi
+ eax
*4 +8], mm3
12023 ;
# interactions with j H1
12025 movq mm0
, [esi
+ eax
*4 + 12]
12026 movd mm1
, [esi
+ eax
*4 + 20]
12027 ;
# copy & expand to mm2-mm4 for the H interactions
12035 pfsubr mm0
, [esp
+ i3130_ixO
]
12036 pfsubr mm1
, [esp
+ i3130_izO
]
12038 movq
[esp
+ i3130_dxO
], mm0
12040 movd
[esp
+ i3130_dzO
], mm1
12043 pfadd mm0
, mm1 ;
# mm0=rsqO
12047 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
12048 pfsubr mm2
, [esp
+ i3130_ixH
]
12049 pfsubr mm3
, [esp
+ i3130_iyH
]
12050 pfsubr mm4
, [esp
+ i3130_izH
] ;
# mm2-mm4 is dxH-dzH
12052 movq
[esp
+ i3130_dxH
], mm2
12053 movq
[esp
+ i3130_dyH
], mm3
12054 movq
[esp
+ i3130_dzH
], mm4
12060 pfadd mm3
,mm4 ;
# mm3=rsqH
12061 movq
[esp
+ i3130_tmprsqH
], mm3
12068 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
12069 pfmul mm0
, mm1 ;
# mm0=r
12071 pfmul mm0
, [esp
+ i3130_tsc
]
12073 movd
[esp
+ i3130_n1
], mm4
12075 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
12077 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
12080 mov edx
, [ebp
+ i3130_VFtab
]
12081 mov ecx
, [esp
+ i3130_n1
]
12084 ;
# load all values we need
12085 movd mm4
, [edx
+ ecx
*4]
12086 movd mm5
, [edx
+ ecx
*4 + 4]
12087 movd mm6
, [edx
+ ecx
*4 + 8]
12088 movd mm7
, [edx
+ ecx
*4 + 12]
12090 pfmul mm6
, mm0 ;
# mm6 = Geps
12091 pfmul mm7
, mm2 ;
# mm7 = Heps2
12094 pfadd mm5
, mm7 ;
# mm5 = Fp
12096 pfmul mm7
, [esp
+ i3130_two
] ;
# two*Heps2
12098 pfadd mm7
, mm5 ;
# mm7=FF
12100 pfmul mm5
, mm0 ;
# mm5=eps*Fp
12101 pfadd mm5
, mm4 ;
# mm5= VV
12103 pfmul mm5
, [esp
+ i3130_qqOH
] ;
# vcoul=qq*VV
12104 pfmul mm7
, [esp
+ i3130_qqOH
] ;
# fijC=qq*FF
12106 ;
# update vctot directly, force is moved to mm3
12107 pfadd mm5
, [esp
+ i3130_vctot
]
12108 movq
[esp
+ i3130_vctot
], mm5
12111 pfmul mm3
, [esp
+ i3130_tsc
]
12112 pfmul mm3
, mm1 ;
# mm3 is total fscal (for the oxygen) now
12114 movq mm0
, [esp
+ i3130_tmprsqH
]
12120 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
12125 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
12127 pfmul mm0
,mm1 ;
# mm0=r
12128 pfmul mm0
, [esp
+ i3130_tsc
]
12130 movq
[esp
+ i3130_n1
], mm4
12132 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
12134 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
12137 mov edx
, [ebp
+ i3130_VFtab
]
12138 mov ecx
, [esp
+ i3130_n1
]
12140 ;
# load all values we need
12141 movd mm4
, [edx
+ ecx
*4]
12142 movd mm5
, [edx
+ ecx
*4 + 4]
12143 movd mm6
, [edx
+ ecx
*4 + 8]
12144 movd mm7
, [edx
+ ecx
*4 + 12]
12145 mov ecx
, [esp
+ i3130_n1
+ 4]
12147 punpckldq mm4
, [edx
+ ecx
*4]
12148 punpckldq mm5
, [edx
+ ecx
*4 + 4]
12149 punpckldq mm6
, [edx
+ ecx
*4 + 8]
12150 punpckldq mm7
, [edx
+ ecx
*4 + 12]
12153 pfmul mm6
, mm0 ;
# mm6 = Geps
12154 pfmul mm7
, mm2 ;
# mm7 = Heps2
12157 pfadd mm5
, mm7 ;
# mm5 = Fp
12159 pfmul mm7
, [esp
+ i3130_two
] ;
# two*Heps2
12161 pfadd mm7
, mm5 ;
# mm7=FF
12163 pfmul mm5
, mm0 ;
# mm5=eps*Fp
12164 pfadd mm5
, mm4 ;
# mm5= VV
12166 pfmul mm5
, [esp
+ i3130_qqHH
] ;
# vcoul=qq*VV
12167 pfmul mm7
, [esp
+ i3130_qqHH
] ;
# fijC=qq*FF
12169 pfadd mm5
, [esp
+ i3130_vctot
]
12170 movq
[esp
+ i3130_vctot
], mm5
12172 ;
# change sign of fijC and multiply by rinv
12175 pfmul mm4
, [esp
+ i3130_tsc
]
12176 pfmul mm4
, mm1 ;
# mm4 is total fscal (for the hydrogens) now
12178 ;
# spread oxygen fscalar to both positions
12180 ;
# calc vectorial force for O
12181 movq mm0
, [esp
+ i3130_dxO
]
12182 movd mm1
, [esp
+ i3130_dzO
]
12186 ;
# calc vectorial force for H's
12187 movq mm5
, [esp
+ i3130_dxH
]
12188 movq mm6
, [esp
+ i3130_dyH
]
12189 movq mm7
, [esp
+ i3130_dzH
]
12194 ;
# update iO particle force
12195 movq mm2
, [esp
+ i3130_fixO
]
12196 movd mm3
, [esp
+ i3130_fizO
]
12199 movq
[esp
+ i3130_fixO
], mm2
12200 movd
[esp
+ i3130_fizO
], mm3
12202 ;
# update iH forces
12203 movq mm2
, [esp
+ i3130_fixH
]
12204 movq mm3
, [esp
+ i3130_fiyH
]
12205 movq mm4
, [esp
+ i3130_fizH
]
12209 movq
[esp
+ i3130_fixH
], mm2
12210 movq
[esp
+ i3130_fiyH
], mm3
12211 movq
[esp
+ i3130_fizH
], mm4
12213 ;
# pack j forces from H in the same form as the oxygen force.
12214 pfacc mm5
, mm6 ;
# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
12215 pfacc mm7
, mm7 ;
# mm7(l)=fjz(H1+ h2)
12217 pfadd mm0
, mm5 ;
# add up total force on j particle.
12220 ;
# update j particle force
12221 movq mm2
, [edi
+ eax
*4 + 12]
12222 movd mm3
, [edi
+ eax
*4 + 20]
12225 movq
[edi
+ eax
*4 + 12], mm2
12226 movd
[edi
+ eax
*4 + 20], mm3
12228 ;
# interactions with j H2
12229 movq mm0
, [esi
+ eax
*4 + 24]
12230 movd mm1
, [esi
+ eax
*4 + 32]
12231 ;
# copy & expand to mm2-mm4 for the H interactions
12239 pfsubr mm0
, [esp
+ i3130_ixO
]
12240 pfsubr mm1
, [esp
+ i3130_izO
]
12242 movq
[esp
+ i3130_dxO
], mm0
12244 movd
[esp
+ i3130_dzO
], mm1
12247 pfadd mm0
, mm1 ;
# mm0=rsqO
12251 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
12252 pfsubr mm2
, [esp
+ i3130_ixH
]
12253 pfsubr mm3
, [esp
+ i3130_iyH
]
12254 pfsubr mm4
, [esp
+ i3130_izH
] ;
# mm2-mm4 is dxH-dzH
12256 movq
[esp
+ i3130_dxH
], mm2
12257 movq
[esp
+ i3130_dyH
], mm3
12258 movq
[esp
+ i3130_dzH
], mm4
12264 pfadd mm3
,mm4 ;
# mm3=rsqH
12265 movq
[esp
+ i3130_tmprsqH
], mm3
12272 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
12275 pfmul mm0
, [esp
+ i3130_tsc
]
12277 movd
[esp
+ i3130_n1
], mm4
12279 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
12281 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
12284 mov edx
, [ebp
+ i3130_VFtab
]
12285 mov ecx
, [esp
+ i3130_n1
]
12288 ;
# load all values we need
12289 movd mm4
, [edx
+ ecx
*4]
12290 movd mm5
, [edx
+ ecx
*4 + 4]
12291 movd mm6
, [edx
+ ecx
*4 + 8]
12292 movd mm7
, [edx
+ ecx
*4 + 12]
12294 pfmul mm6
, mm0 ;
# mm6 = Geps
12295 pfmul mm7
, mm2 ;
# mm7 = Heps2
12298 pfadd mm5
, mm7 ;
# mm5 = Fp
12300 pfmul mm7
, [esp
+ i3130_two
] ;
# two*Heps2
12302 pfadd mm7
, mm5 ;
# mm7=FF
12304 pfmul mm5
, mm0 ;
# mm5=eps*Fp
12305 pfadd mm5
, mm4 ;
# mm5= VV
12307 pfmul mm5
, [esp
+ i3130_qqOH
] ;
# vcoul=qq*VV
12308 pfmul mm7
, [esp
+ i3130_qqOH
] ;
# fijC=qq*FF
12310 ;
# update vctot directly, use mm3 for fscal sum.
12311 pfadd mm5
, [esp
+ i3130_vctot
]
12312 movq
[esp
+ i3130_vctot
], mm5
12315 pfmul mm3
, [esp
+ i3130_tsc
]
12316 pfmul mm3
, mm1 ;
# mm3 is total fscal (for the oxygen) now
12318 movq mm0
, [esp
+ i3130_tmprsqH
]
12324 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
12329 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
12331 pfmul mm0
,mm1 ;
# mm0=r
12332 pfmul mm0
, [esp
+ i3130_tsc
]
12334 movq
[esp
+ i3130_n1
], mm4
12336 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
12338 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
12341 mov edx
, [ebp
+ i3130_VFtab
]
12342 mov ecx
, [esp
+ i3130_n1
]
12344 ;
# load all values we need
12345 movd mm4
, [edx
+ ecx
*4]
12346 movd mm5
, [edx
+ ecx
*4 + 4]
12347 movd mm6
, [edx
+ ecx
*4 + 8]
12348 movd mm7
, [edx
+ ecx
*4 + 12]
12349 mov ecx
, [esp
+ i3130_n1
+ 4]
12351 punpckldq mm4
, [edx
+ ecx
*4]
12352 punpckldq mm5
, [edx
+ ecx
*4 + 4]
12353 punpckldq mm6
, [edx
+ ecx
*4 + 8]
12354 punpckldq mm7
, [edx
+ ecx
*4 + 12]
12357 pfmul mm6
, mm0 ;
# mm6 = Geps
12358 pfmul mm7
, mm2 ;
# mm7 = Heps2
12361 pfadd mm5
, mm7 ;
# mm5 = Fp
12363 pfmul mm7
, [esp
+ i3130_two
] ;
# two*Heps2
12365 pfadd mm7
, mm5 ;
# mm7=FF
12367 pfmul mm5
, mm0 ;
# mm5=eps*Fp
12368 pfadd mm5
, mm4 ;
# mm5= VV
12370 pfmul mm5
, [esp
+ i3130_qqHH
] ;
# vcoul=qq*VV
12371 pfmul mm7
, [esp
+ i3130_qqHH
] ;
# fijC=qq*FF
12373 pfadd mm5
, [esp
+ i3130_vctot
]
12374 movq
[esp
+ i3130_vctot
], mm5
12376 ;
# change sign of fijC and multiply by rinv
12379 pfmul mm4
, [esp
+ i3130_tsc
]
12380 pfmul mm4
, mm1 ;
# mm4 is total fscal (for the hydrogens) now
12382 ;
# spread oxygen fscalar to both positions
12384 ;
# calc vectorial force for O
12385 movq mm0
, [esp
+ i3130_dxO
]
12386 movd mm1
, [esp
+ i3130_dzO
]
12390 ;
# calc vectorial force for H's
12391 movq mm5
, [esp
+ i3130_dxH
]
12392 movq mm6
, [esp
+ i3130_dyH
]
12393 movq mm7
, [esp
+ i3130_dzH
]
12398 ;
# update iO particle force
12399 movq mm2
, [esp
+ i3130_fixO
]
12400 movd mm3
, [esp
+ i3130_fizO
]
12403 movq
[esp
+ i3130_fixO
], mm2
12404 movd
[esp
+ i3130_fizO
], mm3
12406 ;
# update iH forces
12407 movq mm2
, [esp
+ i3130_fixH
]
12408 movq mm3
, [esp
+ i3130_fiyH
]
12409 movq mm4
, [esp
+ i3130_fizH
]
12413 movq
[esp
+ i3130_fixH
], mm2
12414 movq
[esp
+ i3130_fiyH
], mm3
12415 movq
[esp
+ i3130_fizH
], mm4
12417 ;
# pack j forces from H in the same form as the oxygen force.
12418 pfacc mm5
, mm6 ;
# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
12419 pfacc mm7
, mm7 ;
# mm7(l)=fjz(H1+ h2)
12421 pfadd mm0
, mm5 ;
# add up total force on j particle.
12424 ;
# update j particle force
12425 movq mm2
, [edi
+ eax
*4 + 24]
12426 movd mm3
, [edi
+ eax
*4 + 32]
12429 movq
[edi
+ eax
*4 + 24], mm2
12430 movd
[edi
+ eax
*4 + 32], mm3
12432 ;
# done - one more?
12433 dec dword ptr
[esp
+ i3130_innerk
]
12434 jz
.i3130_updateouterdata
12435 jmp
.i3130_inner_loop
12436 .i3130_updateouterdata:
12437 mov ecx
, [esp
+ i3130_ii3
]
12439 movq mm6
, [edi
+ ecx
*4] ;
# increment iO force
12440 movd mm7
, [edi
+ ecx
*4 + 8]
12441 pfadd mm6
, [esp
+ i3130_fixO
]
12442 pfadd mm7
, [esp
+ i3130_fizO
]
12443 movq
[edi
+ ecx
*4], mm6
12444 movd
[edi
+ ecx
*4 +8], mm7
12446 movq mm0
, [esp
+ i3130_fixH
]
12447 movq mm3
, [esp
+ i3130_fiyH
]
12448 movq mm1
, [esp
+ i3130_fizH
]
12450 punpckldq mm0
, mm3 ;
# mm0(l)=fxH1, mm0(h)=fyH1
12451 punpckhdq mm2
, mm3 ;
# mm2(l)=fxH2, mm2(h)=fyH2
12457 movq mm6
, [edi
+ ecx
*4 + 12] ;
# increment iH1 force
12458 movd mm7
, [edi
+ ecx
*4 + 20]
12461 movq
[edi
+ ecx
*4 + 12], mm6
12462 movd
[edi
+ ecx
*4 + 20], mm7
12464 movq mm6
, [edi
+ ecx
*4 + 24] ;
# increment iH2 force
12465 movd mm7
, [edi
+ ecx
*4 + 32]
12468 movq
[edi
+ ecx
*4 + 24], mm6
12469 movd
[edi
+ ecx
*4 + 32], mm7
12472 mov ebx
, [ebp
+ i3130_fshift
] ;
# increment fshift force
12473 mov edx
, [esp
+ i3130_is3
]
12475 movq mm6
, [ebx
+ edx
*4]
12476 movd mm7
, [ebx
+ edx
*4 + 8]
12477 pfadd mm6
, [esp
+ i3130_fixO
]
12478 pfadd mm7
, [esp
+ i3130_fizO
]
12483 movq
[ebx
+ edx
*4], mm6
12484 movd
[ebx
+ edx
*4 + 8], mm7
12486 mov edx
, [ebp
+ i3130_gid
] ;
# get group index for this i particle
12488 add dword ptr
[ebp
+ i3130_gid
], 4 ;
# advance pointer
12490 movq mm7
, [esp
+ i3130_vctot
]
12491 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
12493 mov eax
, [ebp
+ i3130_Vc
]
12494 movd mm6
, [eax
+ edx
*4]
12496 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
12498 movq mm7
, [esp
+ i3130_vnbtot
]
12499 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
12501 mov eax
, [ebp
+ i3130_Vnb
]
12502 movd mm6
, [eax
+ edx
*4]
12504 movd
[eax
+ edx
*4], mm6 ;
# increment vnbtot[gid]
12506 dec dword ptr
[ebp
+ i3130_nri
]
12508 ;
# not last, iterate once more!
;# inl3300_3dnow: symbol exports and frame-offset constants.
;# The routine is exported both with and without a leading underscore.
12523 .globl inl3300_3dnow
12524 .globl _inl3300_3dnow
;# Argument offsets: the code adds these to ebp to read the caller's
;# arguments (for example [ebp + i3300_tabscale]). They start at 8 and
;# advance in 4-byte steps.
12527 .equiv i3300_nri, 8
12528 .equiv i3300_iinr, 12
12529 .equiv i3300_jindex, 16
12530 .equiv i3300_jjnr, 20
12531 .equiv i3300_shift, 24
12532 .equiv i3300_shiftvec, 28
12533 .equiv i3300_fshift, 32
12534 .equiv i3300_gid, 36
12535 .equiv i3300_pos, 40
12536 .equiv i3300_faction, 44
12537 .equiv i3300_charge, 48
12538 .equiv i3300_facel, 52
12539 .equiv i3300_Vc, 56
12540 .equiv i3300_type, 60
12541 .equiv i3300_ntype, 64
12542 .equiv i3300_nbfp, 68
12543 .equiv i3300_Vnb, 72
12544 .equiv i3300_tabscale, 76
12545 .equiv i3300_VFtab, 80
12546 ;
# stack offsets for local variables
;# These are added to esp. The code reserves 132 bytes for them, which
;# matches the last entry (dz2 at 128) plus 4 bytes.
;# iq, vctot, vnbtot, c6, c12, two, n1 and tsc are each 8 bytes apart and
;# are accessed with movq (two packed floats); fix/fiy and dx/dy pairs are
;# also read and written together with a single movq.
;# NOTE(review): i3300_ix is used by the code below, but no .equiv for it is
;# visible in this listing; the gap between ii3 (4) and iy (12) suggests
;# offset 8 - confirm against the full file.
12547 .equiv i3300_is3, 0
12548 .equiv i3300_ii3, 4
12550 .equiv i3300_iy, 12
12551 .equiv i3300_iz, 16
12552 .equiv i3300_iq, 20
12553 .equiv i3300_vctot, 28
12554 .equiv i3300_vnbtot, 36
12555 .equiv i3300_c6, 44
12556 .equiv i3300_c12, 52
12557 .equiv i3300_two, 60
12558 .equiv i3300_n1, 68
12559 .equiv i3300_tsc, 76
12560 .equiv i3300_ntia, 84
12561 .equiv i3300_innerjjnr, 88
12562 .equiv i3300_innerk, 92
12563 .equiv i3300_fix, 96
12564 .equiv i3300_fiy, 100
12565 .equiv i3300_fiz, 104
12566 .equiv i3300_dx1, 108
12567 .equiv i3300_dy1, 112
12568 .equiv i3300_dz1, 116
12569 .equiv i3300_dx2, 120
12570 .equiv i3300_dy2, 124
12571 .equiv i3300_dz2, 128
;# inl3300_3dnow entry and per-i-particle setup.
;# Reserves the local frame, caches tabscale, then for the current i
;# particle loads its shift vector, charge, type and position, clears the
;# accumulators and prepares the j-particle loop pointers and counter.
;# NOTE(review): the embedded statement numbers are not contiguous
;# (e.g. 12585 -> 12587), so some instructions are not visible in this
;# listing. The notes below flag places where a comment depends on one.
12580 sub esp
, 132 ;
# local stack space
12582 ;
# move data to local stack
12584 movd mm3
, [ebp
+ i3300_tabscale
]
;# NOTE(review): the instruction that loads mm0 before it is saved as
;# "two" is not visible here.
12585 movq
[esp
+ i3300_two
], mm0
12587 movq
[esp
+ i3300_tsc
], mm3
12588 ;
# assume we have at least one i particle - start directly
12590 mov eax
, [ebp
+ i3300_shift
] ;
# eax = pointer into shift[]
12591 mov ebx
, [eax
] ;
# ebx=shift[n]
12592 add dword ptr
[ebp
+ i3300_shift
], 4 ;
# advance pointer one step
12594 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
12595 mov
[esp
+ i3300_is3
],ebx ;
# store is3
12597 mov eax
, [ebp
+ i3300_shiftvec
] ;
# eax = base of shiftvec[]
12599 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
12600 movd mm1
, [eax
+ ebx
*4 + 8]
12602 mov ecx
, [ebp
+ i3300_iinr
] ;
# ecx = pointer into iinr[]
12603 add dword ptr
[ebp
+ i3300_iinr
], 4 ;
# advance pointer
12604 mov ebx
, [ecx
] ;
# ebx=ii
12606 mov edx
, [ebp
+ i3300_charge
]
12607 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
12608 pfmul mm2
, [ebp
+ i3300_facel
]
12609 punpckldq mm2
,mm2 ;
# spread to both halves
12610 movq
[esp
+ i3300_iq
], mm2 ;
# iq =facel*charge[ii]
;# ntia: type[ii] multiplied by ntype, as far as this listing shows
;# (statement 12615 is not visible).
12612 mov edx
, [ebp
+ i3300_type
]
12613 mov edx
, [edx
+ ebx
*4]
12614 imul edx
, [ebp
+ i3300_ntype
]
12616 mov
[esp
+ i3300_ntia
], edx
12618 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
12619 mov eax
, [ebp
+ i3300_pos
] ;
# eax = base of pos[]
12621 pfadd mm0
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
12622 movd mm3
, [eax
+ ebx
*4 + 8] ;
# can't use direct memory add for 4 bytes (iz)
12623 mov
[esp
+ i3300_ii3
], ebx
;# The movq below writes ix and iy together (8 bytes).
;# NOTE(review): mm3 holds pos z, but the add of mm3 into mm1 is not
;# visible before mm1 is stored as iz.
12625 movq
[esp
+ i3300_ix
], mm0
12626 movd
[esp
+ i3300_iz
], mm1
12628 ;
# clear total potential and i forces
;# NOTE(review): the instruction that zeroes mm7 is not visible here.
12630 movq
[esp
+ i3300_vctot
], mm7
12631 movq
[esp
+ i3300_vnbtot
], mm7
12632 movq
[esp
+ i3300_fix
], mm7
12633 movd
[esp
+ i3300_fiz
], mm7
12635 mov eax
, [ebp
+ i3300_jindex
]
12636 mov ecx
, [eax
] ;
# jindex[n]
12637 mov edx
, [eax
+ 4] ;
# jindex[n+1]
12638 add dword ptr
[ebp
+ i3300_jindex
], 4
12639 sub edx
, ecx ;
# number of innerloop atoms
;# esi and edi hold the pos[] and faction[] bases for the inner loops.
12641 mov esi
, [ebp
+ i3300_pos
]
12642 mov edi
, [ebp
+ i3300_faction
]
12643 mov eax
, [ebp
+ i3300_jjnr
]
12646 mov
[esp
+ i3300_innerjjnr
], eax ;
# pointer to jjnr[nj0]
12648 mov
[esp
+ i3300_innerk
], edx ;
# number of innerloop atoms
;# Enter the unrolled loop if the flags say "greater or equal", otherwise
;# go straight to the remainder handling.
;# NOTE(review): mov does not change flags and statement 12647 is absent
;# from this listing; presumably it adjusts edx for the 2-way unroll -
;# confirm against the full file.
12649 jge
.i3300_unroll_loop
12650 jmp
.i3300_finish_inner
;# Unrolled inner loop: processes two j particles per pass. Values for the
;# first pair sit in the low half and values for the second pair in the
;# high half of each 64-bit MMX register (see the punpckldq comments).
;# Per pass it builds qq, c6 and c12, computes both distance vectors and
;# their squared lengths, does three tabulated lookups from VFtab (table
;# offsets 0..12, 16..28 and 32..44), accumulates vctot and vnbtot, and
;# updates the i force and both j forces.
;# Register roles visible here: esi = nbfp base, then pos base;
;# edi = faction base; eax/ebx = j indices, later tripled; edx = VFtab base.
;# NOTE(review): embedded statement numbers are not contiguous, so some
;# instructions of the original loop are not visible in this listing.
12651 .i3300_unroll_loop:
12652 ;
# paired innerloop starts here
12653 mov ecx
, [esp
+ i3300_innerjjnr
] ;
# pointer to jjnr[k]
12655 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
12656 add dword ptr
[esp
+ i3300_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
12657 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
12659 mov ecx
, [ebp
+ i3300_charge
] ;
# base of charge[]
12660 movq mm5
, [esp
+ i3300_iq
]
12661 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
12662 punpckldq mm3
, [ecx
+ ebx
*4] ;
# move charge 2 to high part of mm3
12663 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
12665 mov ecx
, [ebp
+ i3300_type
]
12666 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
12667 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
12669 mov esi
, [ebp
+ i3300_nbfp
] ;
# base of nbfp
12672 add edx
, [esp
+ i3300_ntia
] ;
# tja = ntia + 2*type
12673 add ecx
, [esp
+ i3300_ntia
]
12675 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
12676 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
12678 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
12679 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
12680 movq
[esp
+ i3300_c6
], mm5
12681 movq
[esp
+ i3300_c12
], mm6
12683 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
12684 lea ebx
, [ebx
+ ebx
*2]
12686 mov esi
, [ebp
+ i3300_pos
]
12688 movq mm0
, [esp
+ i3300_ix
]
12689 movd mm1
, [esp
+ i3300_iz
]
12690 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
12691 movd mm5
, [esi
+ eax
*4 + 8]
12692 pfsubr mm4
,mm0 ;
# dr = ir - jr
12694 movq
[esp
+ i3300_dx1
], mm4 ;
# store dr
12695 movd
[esp
+ i3300_dz1
], mm5
12696 pfmul mm4
,mm4 ;
# square dx,dy,dz
12698 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
12699 pfacc mm4
, mm5 ;
# first rsq in lower mm4
12701 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
12702 movd mm7
, [esi
+ ebx
*4 + 8]
12704 pfsubr mm6
,mm0 ;
# dr = ir - jr
12706 movq
[esp
+ i3300_dx2
], mm6 ;
# store dr
12707 movd
[esp
+ i3300_dz2
], mm7
12708 pfmul mm6
,mm6 ;
# square dx,dy,dz
12710 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
12711 pfacc mm6
, mm7 ;
# second rsq in lower mm6
12713 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
12718 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
12719 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision.
12725 ;
# mm0 is invsqrt, and mm1 r.
12726 ;
# do potential and fscal
12727 pfmul mm1
, [esp
+ i3300_tsc
] ;
# mm1=rt
12729 movq
[esp
+ i3300_n1
], mm4
12731 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
12734 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
;# First table block (offsets 0..12), scaled by qq below. The two integer
;# indices stored at n1 and n1+4 are read back, tripled, and used with a
;# *4 scale to address VFtab for the first and second pair.
12736 mov edx
, [ebp
+ i3300_VFtab
]
12737 mov ecx
, [esp
+ i3300_n1
]
12738 lea ecx
, [ecx
+ ecx
*2]
12740 ;
# load all the table values we need
12741 movd mm4
, [edx
+ ecx
*4]
12742 movd mm5
, [edx
+ ecx
*4 + 4]
12743 movd mm6
, [edx
+ ecx
*4 + 8]
12744 movd mm7
, [edx
+ ecx
*4 + 12]
12745 mov ecx
, [esp
+ i3300_n1
+ 4]
12746 lea ecx
, [ecx
+ ecx
*2]
12748 punpckldq mm4
, [edx
+ ecx
*4]
12749 punpckldq mm5
, [edx
+ ecx
*4 + 4]
12750 punpckldq mm6
, [edx
+ ecx
*4 + 8]
12751 punpckldq mm7
, [edx
+ ecx
*4 + 12]
12753 pfmul mm6
, mm1 ;
# mm6 = Geps
12754 pfmul mm7
, mm2 ;
# mm7 = Heps2
12757 pfadd mm5
, mm7 ;
# mm5 = Fp
12759 pfmul mm7
, [esp
+ i3300_two
] ;
# two*Heps2
12761 pfadd mm7
, mm5 ;
# mm7=FF
12763 pfmul mm5
, mm1 ;
# mm5=eps*Fp
12764 pfadd mm5
, mm4 ;
# mm5= VV
12766 pfmul mm5
, mm3 ;
# vcoul=qq*VV
12767 pfmul mm3
, mm7 ;
# fijC=FF*qq
12769 ;
# at this point mm5 contains vcoul and mm3 fijC
12770 ;
# increment vcoul - then we can get rid of mm5
12772 pfadd mm5
, [esp
+ i3300_vctot
] ;
# add the earlier value
12773 movq
[esp
+ i3300_vctot
], mm5 ;
# store the sum
12775 ;
# dispersion table
12776 mov ecx
, [esp
+ i3300_n1
]
12777 lea ecx
, [ecx
+ ecx
*2]
12779 ;
# load all the table values we need
12780 movd mm4
, [edx
+ ecx
*4 + 16]
12781 movd mm5
, [edx
+ ecx
*4 + 20]
12782 movd mm6
, [edx
+ ecx
*4 + 24]
12783 movd mm7
, [edx
+ ecx
*4 + 28]
12784 mov ecx
, [esp
+ i3300_n1
+ 4]
12785 lea ecx
, [ecx
+ ecx
*2]
12787 punpckldq mm4
, [edx
+ ecx
*4 + 16]
12788 punpckldq mm5
, [edx
+ ecx
*4 + 20]
12789 punpckldq mm6
, [edx
+ ecx
*4 + 24]
12790 punpckldq mm7
, [edx
+ ecx
*4 + 28]
12791 pfmul mm6
, mm1 ;
# mm6 = Geps
12792 pfmul mm7
, mm2 ;
# mm7 = Heps2
12794 pfadd mm5
, mm7 ;
# mm5 = Fp
12795 pfmul mm7
, [esp
+ i3300_two
] ;
# two*Heps2
12797 pfadd mm7
, mm5 ;
# mm7=FF
12798 pfmul mm5
, mm1 ;
# mm5=eps*Fp
12799 pfadd mm5
, mm4 ;
# mm5= VV
12801 movq mm4
, [esp
+ i3300_c6
]
12802 pfmul mm7
, mm4 ;
# fijD
12803 pfmul mm5
, mm4 ;
# vnb6
12804 pfadd mm3
, mm7 ;
# add to fscal
12806 ;
# update vnbtot to release mm5!
12807 pfadd mm5
, [esp
+ i3300_vnbtot
] ;
# add the earlier value
12808 movq
[esp
+ i3300_vnbtot
], mm5 ;
# store the sum
;# Third table block (offsets 32..44); its result is scaled by c12 below
;# to give fijR and vnb12.
12811 mov ecx
, [esp
+ i3300_n1
]
12812 lea ecx
, [ecx
+ ecx
*2]
12814 ;
# load all the table values we need
12815 movd mm4
, [edx
+ ecx
*4 + 32]
12816 movd mm5
, [edx
+ ecx
*4 + 36]
12817 movd mm6
, [edx
+ ecx
*4 + 40]
12818 movd mm7
, [edx
+ ecx
*4 + 44]
12819 mov ecx
, [esp
+ i3300_n1
+ 4]
12820 lea ecx
, [ecx
+ ecx
*2]
12822 punpckldq mm4
, [edx
+ ecx
*4 + 32]
12823 punpckldq mm5
, [edx
+ ecx
*4 + 36]
12824 punpckldq mm6
, [edx
+ ecx
*4 + 40]
12825 punpckldq mm7
, [edx
+ ecx
*4 + 44]
12827 pfmul mm6
, mm1 ;
# mm6 = Geps
12828 pfmul mm7
, mm2 ;
# mm7 = Heps2
12830 pfadd mm5
, mm7 ;
# mm5 = Fp
12831 pfmul mm7
, [esp
+ i3300_two
] ;
# two*Heps2
12833 pfadd mm7
, mm5 ;
# mm7=FF
12834 pfmul mm5
, mm1 ;
# mm5=eps*Fp
12835 pfadd mm5
, mm4 ;
# mm5= VV
12837 movq mm6
, [esp
+ i3300_c12
]
12838 pfmul mm7
, mm6 ;
# fijR
12839 pfmul mm5
, mm6 ;
# vnb12
12840 pfadd mm3
, mm7 ;
# total fscal fijC+ fijD+ fijR
12842 ;
# change sign of mm3
12845 pfmul mm0
, [esp
+ i3300_tsc
]
12846 pfmul mm0
, mm1 ;
# mm0 is total fscal now
12848 prefetchw
[esp
+ i3300_dx1
] ;
# prefetch i forces to cache
12850 ;
# spread fscalar to both positions
12855 ;
# calc vector force
12856 prefetchw
[edi
+ eax
*4] ;
# prefetch the 1st faction to cache
12857 movq mm2
, [esp
+ i3300_dx1
] ;
# fetch dr
12858 movd mm3
, [esp
+ i3300_dz1
]
12861 pfadd mm5
, [esp
+ i3300_vnbtot
] ;
# add the earlier value
12862 movq
[esp
+ i3300_vnbtot
], mm5 ;
# store the sum
12864 prefetchw
[edi
+ ebx
*4] ;
# prefetch the 2nd faction to cache
12865 pfmul mm2
, mm0 ;
# mult by fs
12868 movq mm4
, [esp
+ i3300_dx2
] ;
# fetch dr
12869 movd mm5
, [esp
+ i3300_dz2
]
12870 pfmul mm4
, mm1 ;
# mult by fs
;# Read-modify-write of the accumulated i force (fix/fiy as one movq, fiz
;# as a movd), then of both j-particle force entries in faction[].
12874 movq mm0
, [esp
+ i3300_fix
]
12875 movd mm1
, [esp
+ i3300_fiz
]
12881 movq
[esp
+ i3300_fix
], mm0
12882 movd
[esp
+ i3300_fiz
], mm1
12885 movq mm0
, [edi
+ eax
*4]
12886 movd mm1
, [edi
+ eax
*4 + 8]
12887 movq mm6
, [edi
+ ebx
*4]
12888 movd mm7
, [edi
+ ebx
*4 + 8]
12895 movq
[edi
+ eax
*4], mm0
12896 movd
[edi
+ eax
*4 +8], mm1
12897 movq
[edi
+ ebx
*4], mm6
12898 movd
[edi
+ ebx
*4 + 8], mm7
12900 ;
# should we do one more iteration?
;# innerk is decremented by 2 per pass; a negative result leaves the loop.
12901 sub dword ptr
[esp
+ i3300_innerk
], 2
12902 jl
.i3300_finish_inner
12903 jmp
.i3300_unroll_loop
;# Remainder dispatch after the unrolled loop: mask innerk down to its
;# lowest bit (the masked value is written back to the stack slot). If the
;# bit is set, run one single-particle iteration; otherwise go directly to
;# the outer-loop bookkeeping.
12904 .i3300_finish_inner:
12905 and dword ptr
[esp
+ i3300_innerk
], 1
12906 jnz
.i3300_single_inner
12907 jmp
.i3300_updateouterdata
;# Single j-particle iteration, reached when one particle is left over
;# after the unrolled loop. It follows the same steps as the unrolled code
;# but uses only the low half of each MMX register: movd loads, and only
;# one table index read from n1.
;# Falls through into .i3300_updateouterdata at the end.
;# NOTE(review): embedded statement numbers are not contiguous, so some
;# instructions of the original sequence are not visible in this listing.
12908 .i3300_single_inner:
12909 ;
# a single j particle iteration here - compare with the unrolled code for comments.
12910 mov eax
, [esp
+ i3300_innerjjnr
]
12911 mov eax
, [eax
] ;
# eax=jnr offset
12913 mov ecx
, [ebp
+ i3300_charge
]
12914 movd mm5
, [esp
+ i3300_iq
]
12915 movd mm3
, [ecx
+ eax
*4]
12916 pfmul mm3
, mm5 ;
# mm3=qq
12918 mov esi
, [ebp
+ i3300_nbfp
]
12919 mov ecx
, [ebp
+ i3300_type
]
12920 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
12922 add edx
, [esp
+ i3300_ntia
] ;
# tja = ntia + 2*type
12923 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
12924 movq
[esp
+ i3300_c6
], mm5
12925 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
12926 movq
[esp
+ i3300_c12
], mm5
12928 mov esi
, [ebp
+ i3300_pos
]
12929 lea eax
, [eax
+ eax
*2]
12931 movq mm0
, [esp
+ i3300_ix
]
12932 movd mm1
, [esp
+ i3300_iz
]
12933 movq mm4
, [esi
+ eax
*4]
12934 movd mm5
, [esi
+ eax
*4 + 8]
12937 movq
[esp
+ i3300_dx1
], mm4
12939 movd
[esp
+ i3300_dz1
], mm5
12942 pfacc mm4
, mm5 ;
# mm4=rsq
12948 pfrcpit2 mm0
,mm2 ;
# mm0=invsqrt
12951 ;
# mm0 is invsqrt, and mm1 r.
12953 ;
# calculate potentials and scalar force
12954 pfmul mm1
, [esp
+ i3300_tsc
] ;
# mm1=rt
12956 movd
[esp
+ i3300_n1
], mm4
12958 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
12961 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
;# First table block (offsets 0..12), scaled by qq below. ecx keeps the
;# tripled index and is reused for the two later table blocks.
12964 mov edx
, [ebp
+ i3300_VFtab
]
12965 mov ecx
, [esp
+ i3300_n1
]
12966 lea ecx
, [ecx
+ ecx
*2]
12968 ;
# load all the table values we need
12969 movd mm4
, [edx
+ ecx
*4]
12970 movd mm5
, [edx
+ ecx
*4 + 4]
12971 movd mm6
, [edx
+ ecx
*4 + 8]
12972 movd mm7
, [edx
+ ecx
*4 + 12]
12974 pfmul mm6
, mm1 ;
# mm6 = Geps
12975 pfmul mm7
, mm2 ;
# mm7 = Heps2
12978 pfadd mm5
, mm7 ;
# mm5 = Fp
12980 pfmul mm7
, [esp
+ i3300_two
] ;
# two*Heps2
12982 pfadd mm7
, mm5 ;
# mm7=FF
12984 pfmul mm5
, mm1 ;
# mm5=eps*Fp
12985 pfadd mm5
, mm4 ;
# mm5= VV
12987 pfmul mm5
, mm3 ;
# vcoul=qq*VV
12988 pfmul mm3
, mm7 ;
# fijC=FF*qq
12990 ;
# at this point mm5 contains vcoul and mm3 fijC
12991 ;
# increment vcoul - then we can get rid of mm5
12993 pfadd mm5
, [esp
+ i3300_vctot
] ;
# add the earlier value
12994 movq
[esp
+ i3300_vctot
], mm5 ;
# store the sum
12996 ;
# dispersion table
12997 ;
# load all the table values we need
12998 movd mm4
, [edx
+ ecx
*4 + 16]
12999 movd mm5
, [edx
+ ecx
*4 + 20]
13000 movd mm6
, [edx
+ ecx
*4 + 24]
13001 movd mm7
, [edx
+ ecx
*4 + 28]
13002 pfmul mm6
, mm1 ;
# mm6 = Geps
13003 pfmul mm7
, mm2 ;
# mm7 = Heps2
13005 pfadd mm5
, mm7 ;
# mm5 = Fp
13006 pfmul mm7
, [esp
+ i3300_two
] ;
# two*Heps2
13008 pfadd mm7
, mm5 ;
# mm7=FF
13009 pfmul mm5
, mm1 ;
# mm5=eps*Fp
13010 pfadd mm5
, mm4 ;
# mm5= VV
13012 movq mm4
, [esp
+ i3300_c6
]
13013 pfmul mm7
, mm4 ;
# fijD
13014 pfmul mm5
, mm4 ;
# vnb6
13015 pfadd mm3
, mm7 ;
# add to fscal
13017 ;
# update vnbtot to release mm5!
13018 pfadd mm5
, [esp
+ i3300_vnbtot
] ;
# add the earlier value
13019 movq
[esp
+ i3300_vnbtot
], mm5 ;
# store the sum
;# Third table block (offsets 32..44); its result is scaled by c12 below
;# to give fijR and vnb12.
13022 ;
# load all the table values we need
13023 movd mm4
, [edx
+ ecx
*4 + 32]
13024 movd mm5
, [edx
+ ecx
*4 + 36]
13025 movd mm6
, [edx
+ ecx
*4 + 40]
13026 movd mm7
, [edx
+ ecx
*4 + 44]
13028 pfmul mm6
, mm1 ;
# mm6 = Geps
13029 pfmul mm7
, mm2 ;
# mm7 = Heps2
13031 pfadd mm5
, mm7 ;
# mm5 = Fp
13032 pfmul mm7
, [esp
+ i3300_two
] ;
# two*Heps2
13034 pfadd mm7
, mm5 ;
# mm7=FF
13035 pfmul mm5
, mm1 ;
# mm5=eps*Fp
13036 pfadd mm5
, mm4 ;
# mm5= VV
13038 movq mm6
, [esp
+ i3300_c12
]
13039 pfmul mm7
, mm6 ;
# fijR
13040 pfmul mm5
, mm6 ;
# vnb12
13041 pfadd mm3
, mm7 ;
# total fscal fijC+ fijD+ fijR
13043 ;
# change sign of mm3
13046 pfmul mm0
, [esp
+ i3300_tsc
]
13047 pfmul mm0
, mm1 ;
# mm0 is total fscal now
13050 pfadd mm5
, [esp
+ i3300_vnbtot
] ;
# add the earlier value
13051 movq
[esp
+ i3300_vnbtot
], mm5 ;
# store the sum
13053 ;
# spread fscalar to both positions
13055 ;
# calc vectorial force
13056 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
13057 movq mm2
, [esp
+ i3300_dx1
]
13058 movd mm3
, [esp
+ i3300_dz1
]
13064 ;
# update i particle force
13065 movq mm0
, [esp
+ i3300_fix
]
13066 movd mm1
, [esp
+ i3300_fiz
]
13069 movq
[esp
+ i3300_fix
], mm0
13070 movd
[esp
+ i3300_fiz
], mm1
13071 ;
# update j particle force
13072 movq mm0
, [edi
+ eax
*4]
13073 movd mm1
, [edi
+ eax
*4+ 8]
13076 movq
[edi
+ eax
*4], mm0
13077 movd
[edi
+ eax
*4 +8], mm1
;# Outer-loop bookkeeping for the current i particle: add the accumulated
;# i force (fix/fiy/fiz) into faction[ii3] and into fshift[is3], fold the
;# two halves of vctot and vnbtot with pfacc, update the Vc and Vnb entries
;# selected through the gid pointer, and reload/store the nri counter.
;# NOTE(review): embedded statement numbers are not contiguous. In
;# particular the adds between each "movd mm6, [eax + edx*4]" load and the
;# matching store, the dereference of the gid pointer, and the decrement
;# and branch on nri are not visible in this listing.
13079 .i3300_updateouterdata:
13080 mov ecx
, [esp
+ i3300_ii3
]
13082 movq mm6
, [edi
+ ecx
*4] ;
# increment i force
13083 movd mm7
, [edi
+ ecx
*4 + 8]
13084 pfadd mm6
, [esp
+ i3300_fix
]
13085 pfadd mm7
, [esp
+ i3300_fiz
]
13086 movq
[edi
+ ecx
*4], mm6
13087 movd
[edi
+ ecx
*4 +8], mm7
13089 mov ebx
, [ebp
+ i3300_fshift
] ;
# increment fshift force
13090 mov edx
, [esp
+ i3300_is3
]
13092 movq mm6
, [ebx
+ edx
*4]
13093 movd mm7
, [ebx
+ edx
*4 + 8]
13094 pfadd mm6
, [esp
+ i3300_fix
]
13095 pfadd mm7
, [esp
+ i3300_fiz
]
13096 movq
[ebx
+ edx
*4], mm6
13097 movd
[ebx
+ edx
*4 + 8], mm7
13099 mov edx
, [ebp
+ i3300_gid
] ;
# get group index for this i particle
13101 add dword ptr
[ebp
+ i3300_gid
], 4 ;
# advance pointer
13103 movq mm7
, [esp
+ i3300_vctot
]
13104 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
13106 mov eax
, [ebp
+ i3300_Vc
]
13107 movd mm6
, [eax
+ edx
*4]
13109 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
13111 movq mm7
, [esp
+ i3300_vnbtot
]
13112 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
13114 mov eax
, [ebp
+ i3300_Vnb
]
13115 movd mm6
, [eax
+ edx
*4]
13117 movd
[eax
+ edx
*4], mm6 ;
# increment vnb[gid]
13120 mov ecx
, [ebp
+ i3300_nri
]
13123 ;
# not last, iterate once more!
13124 mov
[ebp
+ i3300_nri
], ecx
13142 .globl inl3310_3dnow
13143 .globl _inl3310_3dnow
13146 .equiv i3310_nri, 8
13147 .equiv i3310_iinr, 12
13148 .equiv i3310_jindex, 16
13149 .equiv i3310_jjnr, 20
13150 .equiv i3310_shift, 24
13151 .equiv i3310_shiftvec, 28
13152 .equiv i3310_fshift, 32
13153 .equiv i3310_gid, 36
13154 .equiv i3310_pos, 40
13155 .equiv i3310_faction, 44
13156 .equiv i3310_charge, 48
13157 .equiv i3310_facel, 52
13158 .equiv i3310_Vc, 56
13159 .equiv i3310_type, 60
13160 .equiv i3310_ntype, 64
13161 .equiv i3310_nbfp, 68
13162 .equiv i3310_Vnb, 72
13163 .equiv i3310_tabscale, 76
13164 .equiv i3310_VFtab, 80
13165 .equiv i3310_nsatoms, 84
13166 ;
# stack offsets for local variables
13167 .equiv i3310_is3, 0
13168 .equiv i3310_ii3, 4
13169 .equiv i3310_shX, 8
13170 .equiv i3310_shY, 12
13171 .equiv i3310_shZ, 16
13172 .equiv i3310_ix, 20
13173 .equiv i3310_iy, 24
13174 .equiv i3310_iz, 28
13175 .equiv i3310_iq, 32
13176 .equiv i3310_vctot, 40
13177 .equiv i3310_vnbtot, 48
13178 .equiv i3310_c6, 56
13179 .equiv i3310_c12, 64
13180 .equiv i3310_two, 72
13181 .equiv i3310_n1, 80
13182 .equiv i3310_tsc, 88
13183 .equiv i3310_ntia, 96
13184 .equiv i3310_innerjjnr0, 100
13185 .equiv i3310_innerk0, 104
13186 .equiv i3310_innerjjnr, 108
13187 .equiv i3310_innerk, 112
13188 .equiv i3310_fix, 116
13189 .equiv i3310_fiy, 120
13190 .equiv i3310_fiz, 124
13191 .equiv i3310_dx1, 128
13192 .equiv i3310_dy1, 132
13193 .equiv i3310_dz1, 136
13194 .equiv i3310_dx2, 140
13195 .equiv i3310_dy2, 144
13196 .equiv i3310_dz2, 148
13197 .equiv i3310_nsvdwc, 152
13198 .equiv i3310_nscoul, 156
13199 .equiv i3310_nsvdw, 160
13200 .equiv i3310_solnr, 164
13209 sub esp
, 168 ;
# local stack space
13212 movd mm3
, [ebp
+ i3310_tabscale
]
13213 movq
[esp
+ i3310_two
], mm0
13215 movq
[esp
+ i3310_tsc
], mm3
13216 ;
# assume we have at least one i particle - start directly
13218 mov eax
, [ebp
+ i3310_shift
] ;
# eax = pointer into shift[]
13219 mov ebx
, [eax
] ;
# ebx=shift[n]
13220 add dword ptr
[ebp
+ i3310_shift
], 4 ;
# advance pointer one step
13222 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
13223 mov
[esp
+ i3310_is3
],ebx ;
# store is3
13225 mov eax
, [ebp
+ i3310_shiftvec
] ;
# eax = base of shiftvec[]
13227 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
13228 movd mm1
, [eax
+ ebx
*4 + 8]
13229 movq
[esp
+ i3310_shX
], mm0
13230 movd
[esp
+ i3310_shZ
], mm1
13232 mov ecx
, [ebp
+ i3310_iinr
] ;
# ecx = pointer into iinr[]
13233 add dword ptr
[ebp
+ i3310_iinr
], 4 ;
# advance pointer
13234 mov ebx
, [ecx
] ;
# ebx=ii
13236 mov eax
, [ebp
+ i3310_nsatoms
]
13237 add dword ptr
[ebp
+ i3310_nsatoms
], 12
13244 mov
[esp
+ i3310_nsvdwc
], edx
13245 mov
[esp
+ i3310_nscoul
], eax
13246 mov
[esp
+ i3310_nsvdw
], ecx
13250 movq
[esp
+ i3310_vctot
], mm7
13251 movq
[esp
+ i3310_vnbtot
], mm7
13252 mov
[esp
+ i3310_solnr
], ebx
13254 mov eax
, [ebp
+ i3310_jindex
]
13255 mov ecx
, [eax
] ;
# jindex[n]
13256 mov edx
, [eax
+ 4] ;
# jindex[n+1]
13257 add dword ptr
[ebp
+ i3310_jindex
], 4
13258 sub edx
, ecx ;
# number of innerloop atoms
13259 mov eax
, [ebp
+ i3310_jjnr
]
13262 mov
[esp
+ i3310_innerjjnr0
], eax ;
# pointer to jjnr[nj0]
13264 mov
[esp
+ i3310_innerk0
], edx ;
# number of innerloop atoms
13265 mov esi
, [ebp
+ i3310_pos
]
13266 mov edi
, [ebp
+ i3310_faction
]
13268 mov ecx
, [esp
+ i3310_nsvdwc
]
13270 jnz
.i3310_mno_vdwc
13271 jmp
.i3310_testcoul
13273 mov ebx
, [esp
+ i3310_solnr
]
13274 inc dword ptr
[esp
+ i3310_solnr
]
13275 mov edx
, [ebp
+ i3310_charge
]
13276 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
13277 pfmul mm2
, [ebp
+ i3310_facel
]
13278 punpckldq mm2
,mm2 ;
# spread to both halves
13279 movq
[esp
+ i3310_iq
], mm2 ;
# iq =facel*charge[ii]
13281 mov edx
, [ebp
+ i3310_type
]
13282 mov edx
, [edx
+ ebx
*4]
13283 imul edx
, [ebp
+ i3310_ntype
]
13285 mov
[esp
+ i3310_ntia
], edx
13287 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
13288 mov eax
, [ebp
+ i3310_pos
] ;
# eax = base of pos[]
13289 mov
[esp
+ i3310_ii3
], ebx
13291 movq mm0
, [eax
+ ebx
*4]
13292 movd mm1
, [eax
+ ebx
*4 + 8]
13293 pfadd mm0
, [esp
+ i3310_shX
]
13294 pfadd mm1
, [esp
+ i3310_shZ
]
13295 movq
[esp
+ i3310_ix
], mm0
13296 movd
[esp
+ i3310_iz
], mm1
13300 movq
[esp
+ i3310_fix
], mm7
13301 movd
[esp
+ i3310_fiz
], mm7
13303 mov ecx
, [esp
+ i3310_innerjjnr0
]
13304 mov
[esp
+ i3310_innerjjnr
], ecx
13305 mov edx
, [esp
+ i3310_innerk0
]
13307 mov
[esp
+ i3310_innerk
], edx ;
# number of innerloop atoms
13308 jge
.i3310_unroll_vdwc_loop
13309 jmp
.i3310_finish_vdwc_inner
13310 .i3310_unroll_vdwc_loop:
13311 ;
# paired innerloop starts here
13312 mov ecx
, [esp
+ i3310_innerjjnr
] ;
# pointer to jjnr[k]
13314 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
13315 add dword ptr
[esp
+ i3310_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
13316 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
13318 mov ecx
, [ebp
+ i3310_charge
] ;
# base of charge[]
13319 movq mm5
, [esp
+ i3310_iq
]
13320 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
13321 punpckldq mm3
, [ecx
+ ebx
*4] ;
# move charge 2 to high part of mm3
13322 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
13324 mov ecx
, [ebp
+ i3310_type
]
13325 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
13326 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
13328 mov esi
, [ebp
+ i3310_nbfp
] ;
# base of nbfp
13331 add edx
, [esp
+ i3310_ntia
] ;
# tja = ntia + 2*type
13332 add ecx
, [esp
+ i3310_ntia
]
13334 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
13335 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
13337 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
13338 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
13339 movq
[esp
+ i3310_c6
], mm5
13340 movq
[esp
+ i3310_c12
], mm6
13342 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
13343 lea ebx
, [ebx
+ ebx
*2]
13345 mov esi
, [ebp
+ i3310_pos
]
13347 movq mm0
, [esp
+ i3310_ix
]
13348 movd mm1
, [esp
+ i3310_iz
]
13349 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
13350 movd mm5
, [esi
+ eax
*4 + 8]
13351 pfsubr mm4
,mm0 ;
# dr = ir - jr
13353 movq
[esp
+ i3310_dx1
], mm4 ;
# store dr
13354 movd
[esp
+ i3310_dz1
], mm5
13355 pfmul mm4
,mm4 ;
# square dx,dy,dz
13357 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
13358 pfacc mm4
, mm5 ;
# first rsq in lower mm4
13360 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
13361 movd mm7
, [esi
+ ebx
*4 + 8]
13363 pfsubr mm6
,mm0 ;
# dr = ir - jr
13365 movq
[esp
+ i3310_dx2
], mm6 ;
# store dr
13366 movd
[esp
+ i3310_dz2
], mm7
13367 pfmul mm6
,mm6 ;
# square dx,dy,dz
13369 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
13370 pfacc mm6
, mm7 ;
# second rsq in lower mm6
13372 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
13377 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
13378 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision.
13384 ;
# mm0 is invsqrt, and mm1 r.
13385 ;
# do potential and fscal
13386 pfmul mm1
, [esp
+ i3310_tsc
] ;
# mm1=rt
13388 movq
[esp
+ i3310_n1
], mm4
13390 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
13393 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
13395 mov edx
, [ebp
+ i3310_VFtab
]
13396 mov ecx
, [esp
+ i3310_n1
]
13397 lea ecx
, [ecx
+ ecx
*2]
13399 ;
# load all the table values we need
13400 movd mm4
, [edx
+ ecx
*4]
13401 movd mm5
, [edx
+ ecx
*4 + 4]
13402 movd mm6
, [edx
+ ecx
*4 + 8]
13403 movd mm7
, [edx
+ ecx
*4 + 12]
13404 mov ecx
, [esp
+ i3310_n1
+ 4]
13405 lea ecx
, [ecx
+ ecx
*2]
13407 punpckldq mm4
, [edx
+ ecx
*4]
13408 punpckldq mm5
, [edx
+ ecx
*4 + 4]
13409 punpckldq mm6
, [edx
+ ecx
*4 + 8]
13410 punpckldq mm7
, [edx
+ ecx
*4 + 12]
13412 pfmul mm6
, mm1 ;
# mm6 = Geps
13413 pfmul mm7
, mm2 ;
# mm7 = Heps2
13416 pfadd mm5
, mm7 ;
# mm5 = Fp
13418 pfmul mm7
, [esp
+ i3310_two
] ;
# two*Heps2
13420 pfadd mm7
, mm5 ;
# mm7=FF
13422 pfmul mm5
, mm1 ;
# mm5=eps*Fp
13423 pfadd mm5
, mm4 ;
# mm5= VV
13425 pfmul mm5
, mm3 ;
# vcoul=qq*VV
13426 pfmul mm3
, mm7 ;
# fijC=FF*qq
13428 ;
# at this point mm5 contains vcoul and mm3 fijC
13429 ;
# increment vcoul - then we can get rid of mm5
13431 pfadd mm5
, [esp
+ i3310_vctot
] ;
# add the earlier value
13432 movq
[esp
+ i3310_vctot
], mm5 ;
# store the sum
13434 ;
# dispersion table
13435 mov ecx
, [esp
+ i3310_n1
]
13436 lea ecx
, [ecx
+ ecx
*2]
13438 ;
# load all the table values we need
13439 movd mm4
, [edx
+ ecx
*4 + 16]
13440 movd mm5
, [edx
+ ecx
*4 + 20]
13441 movd mm6
, [edx
+ ecx
*4 + 24]
13442 movd mm7
, [edx
+ ecx
*4 + 28]
13443 mov ecx
, [esp
+ i3310_n1
+ 4]
13444 lea ecx
, [ecx
+ ecx
*2]
13446 punpckldq mm4
, [edx
+ ecx
*4 + 16]
13447 punpckldq mm5
, [edx
+ ecx
*4 + 20]
13448 punpckldq mm6
, [edx
+ ecx
*4 + 24]
13449 punpckldq mm7
, [edx
+ ecx
*4 + 28]
13450 pfmul mm6
, mm1 ;
# mm6 = Geps
13451 pfmul mm7
, mm2 ;
# mm7 = Heps2
13453 pfadd mm5
, mm7 ;
# mm5 = Fp
13454 pfmul mm7
, [esp
+ i3310_two
] ;
# two*Heps2
13456 pfadd mm7
, mm5 ;
# mm7=FF
13457 pfmul mm5
, mm1 ;
# mm5=eps*Fp
13458 pfadd mm5
, mm4 ;
# mm5= VV
13460 movq mm4
, [esp
+ i3310_c6
]
13461 pfmul mm7
, mm4 ;
# fijD
13462 pfmul mm5
, mm4 ;
# vnb6
13463 pfadd mm3
, mm7 ;
# add to fscal
13465 ;
# update vnbtot to release mm5!
13466 pfadd mm5
, [esp
+ i3310_vnbtot
] ;
# add the earlier value
13467 movq
[esp
+ i3310_vnbtot
], mm5 ;
# store the sum
13470 mov ecx
, [esp
+ i3310_n1
]
13471 lea ecx
, [ecx
+ ecx
*2]
13473 ;
# load all the table values we need
13474 movd mm4
, [edx
+ ecx
*4 + 32]
13475 movd mm5
, [edx
+ ecx
*4 + 36]
13476 movd mm6
, [edx
+ ecx
*4 + 40]
13477 movd mm7
, [edx
+ ecx
*4 + 44]
13478 mov ecx
, [esp
+ i3310_n1
+ 4]
13479 lea ecx
, [ecx
+ ecx
*2]
13481 punpckldq mm4
, [edx
+ ecx
*4 + 32]
13482 punpckldq mm5
, [edx
+ ecx
*4 + 36]
13483 punpckldq mm6
, [edx
+ ecx
*4 + 40]
13484 punpckldq mm7
, [edx
+ ecx
*4 + 44]
13486 pfmul mm6
, mm1 ;
# mm6 = Geps
13487 pfmul mm7
, mm2 ;
# mm7 = Heps2
13489 pfadd mm5
, mm7 ;
# mm5 = Fp
13490 pfmul mm7
, [esp
+ i3310_two
] ;
# two*Heps2
13492 pfadd mm7
, mm5 ;
# mm7=FF
13493 pfmul mm5
, mm1 ;
# mm5=eps*Fp
13494 pfadd mm5
, mm4 ;
# mm5= VV
13496 movq mm6
, [esp
+ i3310_c12
]
13497 pfmul mm7
, mm6 ;
# fijR
13498 pfmul mm5
, mm6 ;
# vnb12
13499 pfadd mm3
, mm7 ;
# total fscal fijC+ fijD+ fijR
13501 ;
# change sign of mm3
13504 pfmul mm0
, [esp
+ i3310_tsc
]
13505 pfmul mm0
, mm1 ;
# mm0 is total fscal now
13507 prefetchw
[esp
+ i3310_dx1
] ;
# prefetch i forces to cache
13509 ;
# spread fscalar to both positions
13514 ;
# calc vector force
13515 prefetchw
[edi
+ eax
*4] ;
# prefetch the 1st faction to cache
13516 movq mm2
, [esp
+ i3310_dx1
] ;
# fetch dr
13517 movd mm3
, [esp
+ i3310_dz1
]
13520 pfadd mm5
, [esp
+ i3310_vnbtot
] ;
# add the earlier value
13521 movq
[esp
+ i3310_vnbtot
], mm5 ;
# store the sum
13523 prefetchw
[edi
+ ebx
*4] ;
# prefetch the 2nd faction to cache
13524 pfmul mm2
, mm0 ;
# mult by fs
13527 movq mm4
, [esp
+ i3310_dx2
] ;
# fetch dr
13528 movd mm5
, [esp
+ i3310_dz2
]
13529 pfmul mm4
, mm1 ;
# mult by fs
13533 movq mm0
, [esp
+ i3310_fix
]
13534 movd mm1
, [esp
+ i3310_fiz
]
13540 movq
[esp
+ i3310_fix
], mm0
13541 movd
[esp
+ i3310_fiz
], mm1
13544 movq mm0
, [edi
+ eax
*4]
13545 movd mm1
, [edi
+ eax
*4 + 8]
13546 movq mm6
, [edi
+ ebx
*4]
13547 movd mm7
, [edi
+ ebx
*4 + 8]
13554 movq
[edi
+ eax
*4], mm0
13555 movd
[edi
+ eax
*4 +8], mm1
13556 movq
[edi
+ ebx
*4], mm6
13557 movd
[edi
+ ebx
*4 + 8], mm7
13559 ;
# should we do one more iteration?
13560 sub dword ptr
[esp
+ i3310_innerk
], 2
13561 jl
.i3310_finish_vdwc_inner
13562 jmp
.i3310_unroll_vdwc_loop
13563 .i3310_finish_vdwc_inner:
13564 and dword ptr
[esp
+ i3310_innerk
], 1
13565 jnz
.i3310_single_vdwc_inner
13566 jmp
.i3310_updateouterdata_vdwc
13567 .i3310_single_vdwc_inner:
13568 ;
# a single j particle iteration here - compare with the unrolled code for comments.
13569 mov eax
, [esp
+ i3310_innerjjnr
]
13570 mov eax
, [eax
] ;
# eax=jnr offset
13572 mov ecx
, [ebp
+ i3310_charge
]
13573 movd mm5
, [esp
+ i3310_iq
]
13574 movd mm3
, [ecx
+ eax
*4]
13575 pfmul mm3
, mm5 ;
# mm3=qq
13577 mov esi
, [ebp
+ i3310_nbfp
]
13578 mov ecx
, [ebp
+ i3310_type
]
13579 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
13581 add edx
, [esp
+ i3310_ntia
] ;
# tja = ntia + 2*type
13582 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
13583 movq
[esp
+ i3310_c6
], mm5
13584 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
13585 movq
[esp
+ i3310_c12
], mm5
13587 mov esi
, [ebp
+ i3310_pos
]
13588 lea eax
, [eax
+ eax
*2]
13590 movq mm0
, [esp
+ i3310_ix
]
13591 movd mm1
, [esp
+ i3310_iz
]
13592 movq mm4
, [esi
+ eax
*4]
13593 movd mm5
, [esi
+ eax
*4 + 8]
13596 movq
[esp
+ i3310_dx1
], mm4
13598 movd
[esp
+ i3310_dz1
], mm5
13601 pfacc mm4
, mm5 ;
# mm4=rsq
13607 pfrcpit2 mm0
,mm2 ;
# mm0=invsqrt
13610 ;
# mm0 is invsqrt, and mm1 r.
13612 ;
# calculate potentials and scalar force
13613 pfmul mm1
, [esp
+ i3310_tsc
] ;
# mm1=rt
13615 movd
[esp
+ i3310_n1
], mm4
13617 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
13620 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
13623 mov edx
, [ebp
+ i3310_VFtab
]
13624 mov ecx
, [esp
+ i3310_n1
]
13625 lea ecx
, [ecx
+ ecx
*2]
13627 ;
# load all the table values we need
13628 movd mm4
, [edx
+ ecx
*4]
13629 movd mm5
, [edx
+ ecx
*4 + 4]
13630 movd mm6
, [edx
+ ecx
*4 + 8]
13631 movd mm7
, [edx
+ ecx
*4 + 12]
13633 pfmul mm6
, mm1 ;
# mm6 = Geps
13634 pfmul mm7
, mm2 ;
# mm7 = Heps2
13637 pfadd mm5
, mm7 ;
# mm5 = Fp
13639 pfmul mm7
, [esp
+ i3310_two
] ;
# two*Heps2
13641 pfadd mm7
, mm5 ;
# mm7=FF
13643 pfmul mm5
, mm1 ;
# mm5=eps*Fp
13644 pfadd mm5
, mm4 ;
# mm5= VV
13646 pfmul mm5
, mm3 ;
# vcoul=qq*VV
13647 pfmul mm3
, mm7 ;
# fijC=FF*qq
13649 ;
# at this point mm5 contains vcoul and mm3 fijC
13650 ;
# increment vcoul - then we can get rid of mm5
13652 pfadd mm5
, [esp
+ i3310_vctot
] ;
# add the earlier value
13653 movq
[esp
+ i3310_vctot
], mm5 ;
# store the sum
13655 ;
# dispersion table
13656 ;
# load all the table values we need
13657 movd mm4
, [edx
+ ecx
*4 + 16]
13658 movd mm5
, [edx
+ ecx
*4 + 20]
13659 movd mm6
, [edx
+ ecx
*4 + 24]
13660 movd mm7
, [edx
+ ecx
*4 + 28]
13661 pfmul mm6
, mm1 ;
# mm6 = Geps
13662 pfmul mm7
, mm2 ;
# mm7 = Heps2
13664 pfadd mm5
, mm7 ;
# mm5 = Fp
13665 pfmul mm7
, [esp
+ i3310_two
] ;
# two*Heps2
13667 pfadd mm7
, mm5 ;
# mm7=FF
13668 pfmul mm5
, mm1 ;
# mm5=eps*Fp
13669 pfadd mm5
, mm4 ;
# mm5= VV
13671 movq mm4
, [esp
+ i3310_c6
]
13672 pfmul mm7
, mm4 ;
# fijD
13673 pfmul mm5
, mm4 ;
# vnb6
13674 pfadd mm3
, mm7 ;
# add to fscal
13676 ;
# update vnbtot to release mm5!
13677 pfadd mm5
, [esp
+ i3310_vnbtot
] ;
# add the earlier value
13678 movq
[esp
+ i3310_vnbtot
], mm5 ;
# store the sum
13681 ;
# load all the table values we need
13682 movd mm4
, [edx
+ ecx
*4 + 32]
13683 movd mm5
, [edx
+ ecx
*4 + 36]
13684 movd mm6
, [edx
+ ecx
*4 + 40]
13685 movd mm7
, [edx
+ ecx
*4 + 44]
13687 pfmul mm6
, mm1 ;
# mm6 = Geps
13688 pfmul mm7
, mm2 ;
# mm7 = Heps2
13690 pfadd mm5
, mm7 ;
# mm5 = Fp
13691 pfmul mm7
, [esp
+ i3310_two
] ;
# two*Heps2
13693 pfadd mm7
, mm5 ;
# mm7=FF
13694 pfmul mm5
, mm1 ;
# mm5=eps*Fp
13695 pfadd mm5
, mm4 ;
# mm5= VV
13697 movq mm6
, [esp
+ i3310_c12
]
13698 pfmul mm7
, mm6 ;
# fijR
13699 pfmul mm5
, mm6 ;
# vnb12
13700 pfadd mm3
, mm7 ;
# total fscal fijC+ fijD+ fijR
13702 ;
# change sign of mm3
13705 pfmul mm0
, [esp
+ i3310_tsc
]
13706 pfmul mm0
, mm1 ;
# mm0 is total fscal now
13709 pfadd mm5
, [esp
+ i3310_vnbtot
] ;
# add the earlier value
13710 movq
[esp
+ i3310_vnbtot
], mm5 ;
# store the sum
13712 ;
# spread fscalar to both positions
13714 ;
# calc vectorial force
13715 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
13716 movq mm2
, [esp
+ i3310_dx1
]
13717 movd mm3
, [esp
+ i3310_dz1
]
13723 ;
# update i particle force
13724 movq mm0
, [esp
+ i3310_fix
]
13725 movd mm1
, [esp
+ i3310_fiz
]
13728 movq
[esp
+ i3310_fix
], mm0
13729 movd
[esp
+ i3310_fiz
], mm1
13730 ;
# update j particle force
13731 movq mm0
, [edi
+ eax
*4]
13732 movd mm1
, [edi
+ eax
*4+ 8]
13735 movq
[edi
+ eax
*4], mm0
13736 movd
[edi
+ eax
*4 +8], mm1
13738 .i3310_updateouterdata_vdwc:
13739 mov ecx
, [esp
+ i3310_ii3
]
13741 movq mm6
, [edi
+ ecx
*4] ;
# increment i force
13742 movd mm7
, [edi
+ ecx
*4 + 8]
13743 pfadd mm6
, [esp
+ i3310_fix
]
13744 pfadd mm7
, [esp
+ i3310_fiz
]
13745 movq
[edi
+ ecx
*4], mm6
13746 movd
[edi
+ ecx
*4 +8], mm7
13748 mov ebx
, [ebp
+ i3310_fshift
] ;
# increment fshift force
13749 mov edx
, [esp
+ i3310_is3
]
13751 movq mm6
, [ebx
+ edx
*4]
13752 movd mm7
, [ebx
+ edx
*4 + 8]
13753 pfadd mm6
, [esp
+ i3310_fix
]
13754 pfadd mm7
, [esp
+ i3310_fiz
]
13755 movq
[ebx
+ edx
*4], mm6
13756 movd
[ebx
+ edx
*4 + 8], mm7
13758 ;
# loop back to mno
13759 dec dword ptr
[esp
+ i3310_nsvdwc
]
13761 jmp
.i3310_mno_vdwc
13763 mov ecx
, [esp
+ i3310_nscoul
]
13765 jnz
.i3310_mno_coul
13768 mov ebx
, [esp
+ i3310_solnr
]
13769 inc dword ptr
[esp
+ i3310_solnr
]
13770 mov edx
, [ebp
+ i3310_charge
]
13771 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
13772 pfmul mm2
, [ebp
+ i3310_facel
]
13773 punpckldq mm2
,mm2 ;
# spread to both halves
13774 movq
[esp
+ i3310_iq
], mm2 ;
# iq =facel*charge[ii]
13776 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
13777 mov eax
, [ebp
+ i3310_pos
] ;
# eax = base of pos[]
13778 mov
[esp
+ i3310_ii3
], ebx
13780 movq mm0
, [eax
+ ebx
*4]
13781 movd mm1
, [eax
+ ebx
*4 + 8]
13782 pfadd mm0
, [esp
+ i3310_shX
]
13783 pfadd mm1
, [esp
+ i3310_shZ
]
13784 movq
[esp
+ i3310_ix
], mm0
13785 movd
[esp
+ i3310_iz
], mm1
13789 movq
[esp
+ i3310_fix
], mm7
13790 movd
[esp
+ i3310_fiz
], mm7
13792 mov ecx
, [esp
+ i3310_innerjjnr0
]
13793 mov
[esp
+ i3310_innerjjnr
], ecx
13794 mov edx
, [esp
+ i3310_innerk0
]
13796 mov
[esp
+ i3310_innerk
], edx ;
# number of innerloop atoms
13797 jge
.i3310_unroll_coul_loop
13798 jmp
.i3310_finish_coul_inner
13799 .i3310_unroll_coul_loop:
13800 ;
# paired innerloop starts here
13801 mov ecx
, [esp
+ i3310_innerjjnr
] ;
# pointer to jjnr[k]
13803 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
13804 add dword ptr
[esp
+ i3310_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
13805 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
13807 mov ecx
, [ebp
+ i3310_charge
] ;
# base of charge[]
13808 movq mm5
, [esp
+ i3310_iq
]
13809 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
13810 punpckldq mm3
, [ecx
+ ebx
*4] ;
# move charge 2 to high part of mm3
13811 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
13813 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
13814 lea ebx
, [ebx
+ ebx
*2]
13816 mov esi
, [ebp
+ i3310_pos
]
13818 movq mm0
, [esp
+ i3310_ix
]
13819 movd mm1
, [esp
+ i3310_iz
]
13820 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
13821 movd mm5
, [esi
+ eax
*4 + 8]
13822 pfsubr mm4
,mm0 ;
# dr = ir - jr
13824 movq
[esp
+ i3310_dx1
], mm4 ;
# store dr
13825 movd
[esp
+ i3310_dz1
], mm5
13826 pfmul mm4
,mm4 ;
# square dx,dy,dz
13828 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
13829 pfacc mm4
, mm5 ;
# first rsq in lower mm4
13831 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
13832 movd mm7
, [esi
+ ebx
*4 + 8]
13834 pfsubr mm6
,mm0 ;
# dr = ir - jr
13836 movq
[esp
+ i3310_dx2
], mm6 ;
# store dr
13837 movd
[esp
+ i3310_dz2
], mm7
13838 pfmul mm6
,mm6 ;
# square dx,dy,dz
13840 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
13841 pfacc mm6
, mm7 ;
# second rsq in lower mm6
13843 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
13848 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
13849 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision.
13855 ;
# mm0 is invsqrt, and mm1 r.
13856 ;
# do potential and fscal
13857 pfmul mm1
, [esp
+ i3310_tsc
] ;
# mm1=rt
13859 movq
[esp
+ i3310_n1
], mm4
13861 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
13864 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
13866 mov edx
, [ebp
+ i3310_VFtab
]
13867 mov ecx
, [esp
+ i3310_n1
]
13868 lea ecx
, [ecx
+ ecx
*2]
13871 ;
# load all the table values we need
13872 movd mm4
, [edx
+ ecx
*4]
13873 movd mm5
, [edx
+ ecx
*4 + 4]
13874 movd mm6
, [edx
+ ecx
*4 + 8]
13875 movd mm7
, [edx
+ ecx
*4 + 12]
13876 mov ecx
, [esp
+ i3310_n1
+ 4]
13877 lea ecx
, [ecx
+ ecx
*2]
13879 punpckldq mm4
, [edx
+ ecx
*4]
13880 punpckldq mm5
, [edx
+ ecx
*4 + 4]
13881 punpckldq mm6
, [edx
+ ecx
*4 + 8]
13882 punpckldq mm7
, [edx
+ ecx
*4 + 12]
13884 pfmul mm6
, mm1 ;
# mm6 = Geps
13885 pfmul mm7
, mm2 ;
# mm7 = Heps2
13888 pfadd mm5
, mm7 ;
# mm5 = Fp
13890 pfmul mm7
, [esp
+ i3310_two
] ;
# two*Heps2
13892 pfadd mm7
, mm5 ;
# mm7=FF
13894 pfmul mm5
, mm1 ;
# mm5=eps*Fp
13895 pfadd mm5
, mm4 ;
# mm5= VV
13897 pfmul mm5
, mm3 ;
# vcoul=qq*VV
13898 pfmul mm3
, mm7 ;
# fijC=FF*qq
13900 ;
# at this point mm5 contains vcoul and mm3 fijC
13901 ;
# increment vcoul - then we can get rid of mm5
13903 pfadd mm5
, [esp
+ i3310_vctot
] ;
# add the earlier value
13904 movq
[esp
+ i3310_vctot
], mm5 ;
# store the sum
13906 ;
# change sign of mm3
13909 pfmul mm1
, [esp
+ i3310_tsc
]
13910 pfmul mm0
, mm1 ;
# mm0 is total fscal now
13912 prefetchw
[esp
+ i3310_dx1
] ;
# prefetch i forces to cache
13914 ;
# spread fscalar to both positions
13919 ;
# calc vector force
13920 prefetchw
[edi
+ eax
*4] ;
# prefetch the 1st faction to cache
13921 movq mm2
, [esp
+ i3310_dx1
] ;
# fetch dr
13922 movd mm3
, [esp
+ i3310_dz1
]
13924 prefetchw
[edi
+ ebx
*4] ;
# prefetch the 2nd faction to cache
13925 pfmul mm2
, mm0 ;
# mult by fs
13928 movq mm4
, [esp
+ i3310_dx2
] ;
# fetch dr
13929 movd mm5
, [esp
+ i3310_dz2
]
13930 pfmul mm4
, mm1 ;
# mult by fs
13934 movq mm0
, [esp
+ i3310_fix
]
13935 movd mm1
, [esp
+ i3310_fiz
]
13941 movq
[esp
+ i3310_fix
], mm0
13942 movd
[esp
+ i3310_fiz
], mm1
13945 movq mm0
, [edi
+ eax
*4]
13946 movd mm1
, [edi
+ eax
*4 + 8]
13947 movq mm6
, [edi
+ ebx
*4]
13948 movd mm7
, [edi
+ ebx
*4 + 8]
13955 movq
[edi
+ eax
*4], mm0
13956 movd
[edi
+ eax
*4 +8], mm1
13957 movq
[edi
+ ebx
*4], mm6
13958 movd
[edi
+ ebx
*4 + 8], mm7
13960 ;
# should we do one more iteration?
13961 sub dword ptr
[esp
+ i3310_innerk
], 2
13962 jl
.i3310_finish_coul_inner
13963 jmp
.i3310_unroll_coul_loop
13964 .i3310_finish_coul_inner:
13965 and dword ptr
[esp
+ i3310_innerk
], 1
13966 jnz
.i3310_single_coul_inner
13967 jmp
.i3310_updateouterdata_coul
13968 .i3310_single_coul_inner:
13969 ;
# a single j particle iteration here - compare with the unrolled code for comments.
13970 mov eax
, [esp
+ i3310_innerjjnr
]
13971 mov eax
, [eax
] ;
# eax=jnr offset
13973 mov ecx
, [ebp
+ i3310_charge
]
13974 movd mm5
, [esp
+ i3310_iq
]
13975 movd mm3
, [ecx
+ eax
*4]
13976 pfmul mm3
, mm5 ;
# mm3=qq
13978 mov esi
, [ebp
+ i3310_pos
]
13979 lea eax
, [eax
+ eax
*2]
13981 movq mm0
, [esp
+ i3310_ix
]
13982 movd mm1
, [esp
+ i3310_iz
]
13983 movq mm4
, [esi
+ eax
*4]
13984 movd mm5
, [esi
+ eax
*4 + 8]
13987 movq
[esp
+ i3310_dx1
], mm4
13989 movd
[esp
+ i3310_dz1
], mm5
13992 pfacc mm4
, mm5 ;
# mm4=rsq
13998 pfrcpit2 mm0
,mm2 ;
# mm0=invsqrt
14001 ;
# mm0 is invsqrt, and mm1 r.
14003 ;
# calculate potentials and scalar force
14004 pfmul mm1
, [esp
+ i3310_tsc
] ;
# mm1=rt
14006 movd
[esp
+ i3310_n1
], mm4
14008 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
14011 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
14014 mov edx
, [ebp
+ i3310_VFtab
]
14015 mov ecx
, [esp
+ i3310_n1
]
14016 lea ecx
, [ecx
+ ecx
*2]
14018 ;
# load all the table values we need
14019 movd mm4
, [edx
+ ecx
*4]
14020 movd mm5
, [edx
+ ecx
*4 + 4]
14021 movd mm6
, [edx
+ ecx
*4 + 8]
14022 movd mm7
, [edx
+ ecx
*4 + 12]
14024 pfmul mm6
, mm1 ;
# mm6 = Geps
14025 pfmul mm7
, mm2 ;
# mm7 = Heps2
14028 pfadd mm5
, mm7 ;
# mm5 = Fp
14030 pfmul mm7
, [esp
+ i3310_two
] ;
# two*Heps2
14032 pfadd mm7
, mm5 ;
# mm7=FF
14034 pfmul mm5
, mm1 ;
# mm5=eps*Fp
14035 pfadd mm5
, mm4 ;
# mm5= VV
14037 pfmul mm5
, mm3 ;
# vcoul=qq*VV
14038 pfmul mm3
, mm7 ;
# fijC=FF*qq
14040 ;
# at this point mm5 contains vcoul and mm3 fijC
14041 ;
# increment vcoul - then we can get rid of mm5
14043 pfadd mm5
, [esp
+ i3310_vctot
] ;
# add the earlier value
14044 movq
[esp
+ i3310_vctot
], mm5 ;
# store the sum
14046 ;
# change sign of mm3
14049 pfmul mm0
, [esp
+ i3310_tsc
]
14050 pfmul mm0
, mm1 ;
# mm0 is total fscal now
14052 ;
# spread fscalar to both positions
14054 ;
# calc vectorial force
14055 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
14056 movq mm2
, [esp
+ i3310_dx1
]
14057 movd mm3
, [esp
+ i3310_dz1
]
14063 ;
# update i particle force
14064 movq mm0
, [esp
+ i3310_fix
]
14065 movd mm1
, [esp
+ i3310_fiz
]
14068 movq
[esp
+ i3310_fix
], mm0
14069 movd
[esp
+ i3310_fiz
], mm1
14070 ;
# update j particle force
14071 movq mm0
, [edi
+ eax
*4]
14072 movd mm1
, [edi
+ eax
*4+ 8]
14075 movq
[edi
+ eax
*4], mm0
14076 movd
[edi
+ eax
*4 +8], mm1
14078 .i3310_updateouterdata_coul:
14079 mov ecx
, [esp
+ i3310_ii3
]
14081 movq mm6
, [edi
+ ecx
*4] ;
# increment i force
14082 movd mm7
, [edi
+ ecx
*4 + 8]
14083 pfadd mm6
, [esp
+ i3310_fix
]
14084 pfadd mm7
, [esp
+ i3310_fiz
]
14085 movq
[edi
+ ecx
*4], mm6
14086 movd
[edi
+ ecx
*4 +8], mm7
14088 mov ebx
, [ebp
+ i3310_fshift
] ;
# increment fshift force
14089 mov edx
, [esp
+ i3310_is3
]
14091 movq mm6
, [ebx
+ edx
*4]
14092 movd mm7
, [ebx
+ edx
*4 + 8]
14093 pfadd mm6
, [esp
+ i3310_fix
]
14094 pfadd mm7
, [esp
+ i3310_fiz
]
14095 movq
[ebx
+ edx
*4], mm6
14096 movd
[ebx
+ edx
*4 + 8], mm7
14098 ;
# loop back to mno
14099 dec dword ptr
[esp
+ i3310_nscoul
]
14101 jmp
.i3310_mno_coul
14103 mov ecx
, [esp
+ i3310_nsvdw
]
14106 jmp
.i3310_last_mno
14108 mov ebx
, [esp
+ i3310_solnr
]
14109 inc dword ptr
[esp
+ i3310_solnr
]
14111 mov edx
, [ebp
+ i3310_type
]
14112 mov edx
, [edx
+ ebx
*4]
14113 imul edx
, [ebp
+ i3310_ntype
]
14115 mov
[esp
+ i3310_ntia
], edx
14117 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
14118 mov eax
, [ebp
+ i3310_pos
] ;
# eax = base of pos[]
14119 mov
[esp
+ i3310_ii3
], ebx
14121 movq mm0
, [eax
+ ebx
*4]
14122 movd mm1
, [eax
+ ebx
*4 + 8]
14123 pfadd mm0
, [esp
+ i3310_shX
]
14124 pfadd mm1
, [esp
+ i3310_shZ
]
14125 movq
[esp
+ i3310_ix
], mm0
14126 movd
[esp
+ i3310_iz
], mm1
14130 movq
[esp
+ i3310_fix
], mm7
14131 movd
[esp
+ i3310_fiz
], mm7
14133 mov ecx
, [esp
+ i3310_innerjjnr0
]
14134 mov
[esp
+ i3310_innerjjnr
], ecx
14135 mov edx
, [esp
+ i3310_innerk0
]
14137 mov
[esp
+ i3310_innerk
], edx ;
# number of innerloop atoms
14138 jge
.i3310_unroll_vdw_loop
14139 jmp
.i3310_finish_vdw_inner
14140 .i3310_unroll_vdw_loop:
14141 ;
# paired innerloop starts here
14142 mov ecx
, [esp
+ i3310_innerjjnr
] ;
# pointer to jjnr[k]
14144 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
14145 add dword ptr
[esp
+ i3310_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
14146 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
14148 mov ecx
, [ebp
+ i3310_type
]
14149 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
14150 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
14152 mov esi
, [ebp
+ i3310_nbfp
] ;
# base of nbfp
14155 add edx
, [esp
+ i3310_ntia
] ;
# tja = ntia + 2*type
14156 add ecx
, [esp
+ i3310_ntia
]
14158 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
14159 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
14161 punpckldq mm5
, mm7 ;
# mm5 = 1st c6 / 2nd c6
14162 punpckhdq mm6
, mm7 ;
# mm6 = 1st c12 / 2nd c12
14163 movq
[esp
+ i3310_c6
], mm5
14164 movq
[esp
+ i3310_c12
], mm6
14166 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
14167 lea ebx
, [ebx
+ ebx
*2]
14169 mov esi
, [ebp
+ i3310_pos
]
14171 movq mm0
, [esp
+ i3310_ix
]
14172 movd mm1
, [esp
+ i3310_iz
]
14173 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
14174 movd mm5
, [esi
+ eax
*4 + 8]
14175 pfsubr mm4
,mm0 ;
# dr = ir - jr
14177 movq
[esp
+ i3310_dx1
], mm4 ;
# store dr
14178 movd
[esp
+ i3310_dz1
], mm5
14179 pfmul mm4
,mm4 ;
# square dx,dy,dz
14181 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
14182 pfacc mm4
, mm5 ;
# first rsq in lower mm4
14184 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
14185 movd mm7
, [esi
+ ebx
*4 + 8]
14187 pfsubr mm6
, mm0 ;
# dr = ir - jr
14189 movq
[esp
+ i3310_dx2
], mm6 ;
# store dr
14190 movd
[esp
+ i3310_dz2
], mm7
14191 pfmul mm6
, mm6 ;
# square dx,dy,dz
14193 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
14194 pfacc mm6
, mm7 ;
# second rsq in lower mm6
14196 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
14201 punpckldq mm4
, mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
14202 movq mm2
, mm0 ;
# amd 3dnow N-R iteration to get full precision.
14208 ;
# mm0 is invsqrt, and mm1 r.
14209 ;
# do potential and fscal
14210 pfmul mm1
, [esp
+ i3310_tsc
] ;
# mm1=rt
14212 movq
[esp
+ i3310_n1
], mm4
14214 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
14217 pfmul mm2
, mm2 ;
# mm1 is eps, mm2 is eps2
14219 mov edx
, [ebp
+ i3310_VFtab
]
14220 ;
# dispersion table
14221 mov ecx
, [esp
+ i3310_n1
]
14222 lea ecx
, [ecx
+ ecx
*2]
14224 ;
# load all the table values we need
14225 movd mm4
, [edx
+ ecx
*4]
14226 movd mm5
, [edx
+ ecx
*4 + 4]
14227 movd mm6
, [edx
+ ecx
*4 + 8]
14228 movd mm7
, [edx
+ ecx
*4 + 12]
14229 mov ecx
, [esp
+ i3310_n1
+ 4]
14230 lea ecx
, [ecx
+ ecx
*2]
14232 punpckldq mm4
, [edx
+ ecx
*4]
14233 punpckldq mm5
, [edx
+ ecx
*4 + 4]
14234 punpckldq mm6
, [edx
+ ecx
*4 + 8]
14235 punpckldq mm7
, [edx
+ ecx
*4 + 12]
14236 pfmul mm6
, mm1 ;
# mm6 = Geps
14237 pfmul mm7
, mm2 ;
# mm7 = Heps2
14239 pfadd mm5
, mm7 ;
# mm5 = Fp
14240 pfmul mm7
, [esp
+ i3310_two
] ;
# two*Heps2
14242 pfadd mm7
, mm5 ;
# mm7=FF
14243 pfmul mm5
, mm1 ;
# mm5=eps*Fp
14244 pfadd mm5
, mm4 ;
# mm5= VV
14246 movq mm4
, [esp
+ i3310_c6
]
14247 pfmul mm7
, mm4 ;
# fijD
14248 pfmul mm5
, mm4 ;
# vnb6
14249 movq mm3
, mm7 ;
# start fscal with fijD (movq overwrites mm3)
14251 ;
# update vnbtot to release mm5!
14252 pfadd mm5
, [esp
+ i3310_vnbtot
] ;
# add the earlier value
14253 movq
[esp
+ i3310_vnbtot
], mm5 ;
# store the sum
14256 mov ecx
, [esp
+ i3310_n1
]
14257 lea ecx
, [ecx
+ ecx
*2]
14259 ;
# load all the table values we need
14260 movd mm4
, [edx
+ ecx
*4 + 16]
14261 movd mm5
, [edx
+ ecx
*4 + 20]
14262 movd mm6
, [edx
+ ecx
*4 + 24]
14263 movd mm7
, [edx
+ ecx
*4 + 28]
14264 mov ecx
, [esp
+ i3310_n1
+ 4]
14265 lea ecx
, [ecx
+ ecx
*2]
14267 punpckldq mm4
, [edx
+ ecx
*4 + 16]
14268 punpckldq mm5
, [edx
+ ecx
*4 + 20]
14269 punpckldq mm6
, [edx
+ ecx
*4 + 24]
14270 punpckldq mm7
, [edx
+ ecx
*4 + 28]
14272 pfmul mm6
, mm1 ;
# mm6 = Geps
14273 pfmul mm7
, mm2 ;
# mm7 = Heps2
14275 pfadd mm5
, mm7 ;
# mm5 = Fp
14276 pfmul mm7
, [esp
+ i3310_two
] ;
# two*Heps2
14278 pfadd mm7
, mm5 ;
# mm7=FF
14279 pfmul mm5
, mm1 ;
# mm5=eps*Fp
14280 pfadd mm5
, mm4 ;
# mm5= VV
14282 movq mm6
, [esp
+ i3310_c12
]
14283 pfmul mm7
, mm6 ;
# fijR
14284 pfmul mm5
, mm6 ;
# vnb12
14285 pfadd mm3
, mm7 ;
# total fscal fijD+ fijR
14287 ;
# change sign of mm3
14290 pfmul mm1
, [esp
+ i3310_tsc
]
14291 pfmul mm0
, mm1 ;
# mm0 is total fscal now
14293 prefetchw
[esp
+ i3310_dx1
] ;
# prefetch i forces to cache
14295 ;
# spread fscalar to both positions
14300 ;
# calc vector force
14301 prefetchw
[edi
+ eax
*4] ;
# prefetch the 1st faction to cache
14302 movq mm2
, [esp
+ i3310_dx1
] ;
# fetch dr
14303 movd mm3
, [esp
+ i3310_dz1
]
14306 pfadd mm5
, [esp
+ i3310_vnbtot
] ;
# add the earlier value
14307 movq
[esp
+ i3310_vnbtot
], mm5 ;
# store the sum
14309 prefetchw
[edi
+ ebx
*4] ;
# prefetch the 2nd faction to cache
14310 pfmul mm2
, mm0 ;
# mult by fs
14313 movq mm4
, [esp
+ i3310_dx2
] ;
# fetch dr
14314 movd mm5
, [esp
+ i3310_dz2
]
14315 pfmul mm4
, mm1 ;
# mult by fs
14319 movq mm0
, [esp
+ i3310_fix
]
14320 movd mm1
, [esp
+ i3310_fiz
]
14326 movq
[esp
+ i3310_fix
], mm0
14327 movd
[esp
+ i3310_fiz
], mm1
14330 movq mm0
, [edi
+ eax
*4]
14331 movd mm1
, [edi
+ eax
*4 + 8]
14332 movq mm6
, [edi
+ ebx
*4]
14333 movd mm7
, [edi
+ ebx
*4 + 8]
14340 movq
[edi
+ eax
*4], mm0
14341 movd
[edi
+ eax
*4 +8], mm1
14342 movq
[edi
+ ebx
*4], mm6
14343 movd
[edi
+ ebx
*4 + 8], mm7
14345 ;
# should we do one more iteration?
14346 sub dword ptr
[esp
+ i3310_innerk
], 2
14347 jl
.i3310_finish_vdw_inner
14348 jmp
.i3310_unroll_vdw_loop
14349 .i3310_finish_vdw_inner:
14350 and dword ptr
[esp
+ i3310_innerk
], 1
14351 jnz
.i3310_single_vdw_inner
14352 jmp
.i3310_updateouterdata_vdw
14353 .i3310_single_vdw_inner:
14354 ;
# a single j particle iteration here - compare with the unrolled code for comments.
14355 mov eax
, [esp
+ i3310_innerjjnr
]
14356 mov eax
, [eax
] ;
# eax=jnr offset
14358 mov esi
, [ebp
+ i3310_nbfp
]
14359 mov ecx
, [ebp
+ i3310_type
]
14360 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
14362 add edx
, [esp
+ i3310_ntia
] ;
# tja = ntia + 2*type
14363 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
14364 movq
[esp
+ i3310_c6
], mm5
14365 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
14366 movq
[esp
+ i3310_c12
], mm5
14368 mov esi
, [ebp
+ i3310_pos
]
14369 lea eax
, [eax
+ eax
*2]
14371 movq mm0
, [esp
+ i3310_ix
]
14372 movd mm1
, [esp
+ i3310_iz
]
14373 movq mm4
, [esi
+ eax
*4]
14374 movd mm5
, [esi
+ eax
*4 + 8]
14377 movq
[esp
+ i3310_dx1
], mm4
14379 movd
[esp
+ i3310_dz1
], mm5
14382 pfacc mm4
, mm5 ;
# mm4=rsq
14388 pfrcpit2 mm0
,mm2 ;
# mm0=invsqrt
14391 ;
# mm0 is invsqrt, and mm1 r.
14393 ;
# calculate potentials and scalar force
14394 pfmul mm1
, [esp
+ i3310_tsc
] ;
# mm1=rt
14396 movd
[esp
+ i3310_n1
], mm4
14398 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 n0.
14401 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
14403 mov edx
, [ebp
+ i3310_VFtab
]
14404 mov ecx
, [esp
+ i3310_n1
]
14405 lea ecx
, [ecx
+ ecx
*2]
14407 ;
# dispersion table
14408 ;
# load all the table values we need
14410 movd mm4
, [edx
+ ecx
*4]
14411 movd mm5
, [edx
+ ecx
*4 + 4]
14412 movd mm6
, [edx
+ ecx
*4 + 8]
14413 movd mm7
, [edx
+ ecx
*4 + 12]
14414 pfmul mm6
, mm1 ;
# mm6 = Geps
14415 pfmul mm7
, mm2 ;
# mm7 = Heps2
14417 pfadd mm5
, mm7 ;
# mm5 = Fp
14418 pfmul mm7
, [esp
+ i3310_two
] ;
# two*Heps2
14420 pfadd mm7
, mm5 ;
# mm7=FF
14421 pfmul mm5
, mm1 ;
# mm5=eps*Fp
14422 pfadd mm5
, mm4 ;
# mm5= VV
14424 movq mm4
, [esp
+ i3310_c6
]
14425 pfmul mm7
, mm4 ;
# fijD
14426 pfmul mm5
, mm4 ;
# vnb6
14427 movq mm3
, mm7 ;
# start fscal with fijD (movq overwrites mm3)
14429 ;
# update vnbtot to release mm5!
14430 pfadd mm5
, [esp
+ i3310_vnbtot
] ;
# add the earlier value
14431 movq
[esp
+ i3310_vnbtot
], mm5 ;
# store the sum
14434 ;
# load all the table values we need
14436 movd mm4
, [edx
+ ecx
*4 + 16]
14437 movd mm5
, [edx
+ ecx
*4 + 20]
14438 movd mm6
, [edx
+ ecx
*4 + 24]
14439 movd mm7
, [edx
+ ecx
*4 + 28]
14441 pfmul mm6
, mm1 ;
# mm6 = Geps
14442 pfmul mm7
, mm2 ;
# mm7 = Heps2
14444 pfadd mm5
, mm7 ;
# mm5 = Fp
14445 pfmul mm7
, [esp
+ i3310_two
] ;
# two*Heps2
14447 pfadd mm7
, mm5 ;
# mm7=FF
14448 pfmul mm5
, mm1 ;
# mm5=eps*Fp
14449 pfadd mm5
, mm4 ;
# mm5= VV
14451 movq mm6
, [esp
+ i3310_c12
]
14452 pfmul mm7
, mm6 ;
# fijR
14453 pfmul mm5
, mm6 ;
# vnb12
14454 pfadd mm3
, mm7 ;
# total fscal fijD+ fijR
14456 ;
# change sign of mm3
14459 pfmul mm0
, [esp
+ i3310_tsc
]
14460 pfmul mm0
, mm1 ;
# mm0 is total fscal now
14463 pfadd mm5
, [esp
+ i3310_vnbtot
] ;
# add the earlier value
14464 movq
[esp
+ i3310_vnbtot
], mm5 ;
# store the sum
14466 ;
# spread fscalar to both positions
14468 ;
# calc vectorial force
14469 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
14470 movq mm2
, [esp
+ i3310_dx1
]
14471 movd mm3
, [esp
+ i3310_dz1
]
14476 ;
# update i particle force
14477 movq mm0
, [esp
+ i3310_fix
]
14478 movd mm1
, [esp
+ i3310_fiz
]
14481 movq
[esp
+ i3310_fix
], mm0
14482 movd
[esp
+ i3310_fiz
], mm1
14483 ;
# update j particle force
14484 movq mm0
, [edi
+ eax
*4]
14485 movd mm1
, [edi
+ eax
*4+ 8]
14488 movq
[edi
+ eax
*4], mm0
14489 movd
[edi
+ eax
*4 +8], mm1
14491 .i3310_updateouterdata_vdw:
14492 mov ecx
, [esp
+ i3310_ii3
]
14494 movq mm6
, [edi
+ ecx
*4] ;
# increment i force
14495 movd mm7
, [edi
+ ecx
*4 + 8]
14496 pfadd mm6
, [esp
+ i3310_fix
]
14497 pfadd mm7
, [esp
+ i3310_fiz
]
14498 movq
[edi
+ ecx
*4], mm6
14499 movd
[edi
+ ecx
*4 +8], mm7
14501 mov ebx
, [ebp
+ i3310_fshift
] ;
# increment fshift force
14502 mov edx
, [esp
+ i3310_is3
]
14504 movq mm6
, [ebx
+ edx
*4]
14505 movd mm7
, [ebx
+ edx
*4 + 8]
14506 pfadd mm6
, [esp
+ i3310_fix
]
14507 pfadd mm7
, [esp
+ i3310_fiz
]
14508 movq
[ebx
+ edx
*4], mm6
14509 movd
[ebx
+ edx
*4 + 8], mm7
14511 ;
# loop back to mno
14512 dec dword ptr
[esp
+ i3310_nsvdw
]
14517 mov edx
, [ebp
+ i3310_gid
] ;
# get group index for this i particle
14519 add dword ptr
[ebp
+ i3310_gid
], 4 ;
# advance pointer
14521 movq mm7
, [esp
+ i3310_vctot
]
14522 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
14524 mov eax
, [ebp
+ i3310_Vc
]
14525 movd mm6
, [eax
+ edx
*4]
14527 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
14529 movq mm7
, [esp
+ i3310_vnbtot
]
14530 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
14532 mov eax
, [ebp
+ i3310_Vnb
]
14533 movd mm6
, [eax
+ edx
*4]
14535 movd
[eax
+ edx
*4], mm6 ;
# increment Vnb[gid]
14537 mov ecx
, [ebp
+ i3310_nri
]
14540 ;
# not last, iterate once more!
14541 mov
[ebp
+ i3310_nri
], ecx
14556 .globl inl3320_3dnow
14557 .globl _inl3320_3dnow
;# ---------------------------------------------------------------------------
;# Symbolic offsets used by inl3320_3dnow.
;#
;# First group: displacements the routine adds to ebp, e.g.
;# [ebp + i3320_iinr] or [ebp + i3320_charge]. They start at 8 and advance in
;# steps of 4 - presumably the stacked C arguments above the saved ebp and the
;# return address; TODO confirm against the function prologue.
;# ---------------------------------------------------------------------------
14560 .equiv i3320_nri, 8
14561 .equiv i3320_iinr, 12
14562 .equiv i3320_jindex, 16
14563 .equiv i3320_jjnr, 20
14564 .equiv i3320_shift, 24
14565 .equiv i3320_shiftvec, 28
14566 .equiv i3320_fshift, 32
14567 .equiv i3320_gid, 36
14568 .equiv i3320_pos, 40
14569 .equiv i3320_faction, 44
14570 .equiv i3320_charge, 48
14571 .equiv i3320_facel, 52
14572 .equiv i3320_Vc, 56
14573 .equiv i3320_type, 60
14574 .equiv i3320_ntype, 64
14575 .equiv i3320_nbfp, 68
14576 .equiv i3320_Vnb, 72
14577 .equiv i3320_tabscale, 76
14578 .equiv i3320_VFtab, 80
14579 ;
# stack offsets for local variables
;# These are displacements from esp into the scratch area reserved with
;# "sub esp, 228". The last slot (tmprsqH) starts at 220 and is stored with
;# movq (8 bytes), which accounts for the 228-byte total.
;# Slots spaced 4 bytes apart hold one dword. Slots spaced 8 bytes apart leave
;# room for a 64-bit MMX quadword (ixH/iyH/izH, dxH/dyH/dzH and tmprsqH are
;# stored with movq); a few of them, such as ntia and qqO, are only written as
;# a dword in the code visible here.
;# NOTE(review): izO is a 4-byte slot, yet the setup code stores it with movq,
;# so the upper dword of that store lands in ixH. ixH is written afterwards in
;# the visible code, so the overlap looks harmless - confirm the store order
;# is kept if these slots are ever rearranged.
14580 .equiv i3320_is3, 0
14581 .equiv i3320_ii3, 4
14582 .equiv i3320_ixO, 8
14583 .equiv i3320_iyO, 12
14584 .equiv i3320_izO, 16
14585 .equiv i3320_ixH, 20
14586 .equiv i3320_iyH, 28
14587 .equiv i3320_izH, 36
14588 .equiv i3320_iqO, 44
14589 .equiv i3320_iqH, 52
14590 .equiv i3320_qqO, 60
14591 .equiv i3320_qqH, 68
14592 .equiv i3320_vctot, 76
14593 .equiv i3320_vnbtot, 84
14594 .equiv i3320_c6, 92
14595 .equiv i3320_c12, 100
14596 .equiv i3320_two, 108
14597 .equiv i3320_n1, 116
14598 .equiv i3320_tsc, 124
14599 .equiv i3320_ntia, 132
14600 .equiv i3320_innerjjnr, 140
14601 .equiv i3320_innerk, 144
14602 .equiv i3320_fixO, 148
14603 .equiv i3320_fiyO, 152
14604 .equiv i3320_fizO, 156
14605 .equiv i3320_fixH, 160
14606 .equiv i3320_fiyH, 168
14607 .equiv i3320_fizH, 176
14608 .equiv i3320_dxO, 184
14609 .equiv i3320_dyO, 188
14610 .equiv i3320_dzO, 192
14611 .equiv i3320_dxH, 196
14612 .equiv i3320_dyH, 204
14613 .equiv i3320_dzH, 212
14614 .equiv i3320_tmprsqH, 220
14623 sub esp
, 228 ;
# local stack space
14626 mov ecx
, [ebp
+ i3320_iinr
] ;
# ecx = pointer into iinr[]
14627 mov ebx
, [ecx
] ;
# ebx=ii
14629 mov edx
, [ebp
+ i3320_charge
]
14630 movd mm1
, [ebp
+ i3320_facel
]
14631 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii0]
14633 movq
[esp
+ i3320_iqO
], mm2 ;
# iqO = facel*charge[ii]
14635 movd mm2
, [edx
+ ebx
*4 + 4] ;
# mm2=charge[ii0+1]
14637 punpckldq mm2
,mm2 ;
# spread to both halves
14638 movq
[esp
+ i3320_iqH
], mm2 ;
# iqH = facel*charge[ii0+1]
14640 mov edx
, [ebp
+ i3320_type
]
14641 mov ecx
, [edx
+ ebx
*4]
14643 imul ecx
, [ebp
+ i3320_ntype
] ;
# ecx = ntia = 2*ntype*type[ii0]
14644 mov
[esp
+ i3320_ntia
], ecx
14647 movq mm4
, [ebp
+ i3320_tabscale
]
14648 punpckldq mm4
,mm4 ;
# spread to both halves
14649 movq
[esp
+ i3320_two
], mm3
14650 movq
[esp
+ i3320_tsc
], mm4
14651 ;
# assume we have at least one i particle - start directly
14653 mov eax
, [ebp
+ i3320_shift
] ;
# eax = pointer into shift[]
14654 mov ebx
, [eax
] ;
# ebx=shift[n]
14655 add dword ptr
[ebp
+ i3320_shift
], 4 ;
# advance pointer one step
14657 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
14658 mov
[esp
+ i3320_is3
],ebx ;
# store is3
14660 mov eax
, [ebp
+ i3320_shiftvec
] ;
# eax = base of shiftvec[]
14662 movq mm5
, [eax
+ ebx
*4] ;
# move shX/shY to mm5 and shZ to mm6.
14663 movd mm6
, [eax
+ ebx
*4 + 8]
14667 punpckldq mm0
,mm0 ;
# also expand shX,Y,Z in mm0--mm2.
14671 mov ecx
, [ebp
+ i3320_iinr
] ;
# ecx = pointer into iinr[]
14672 add dword ptr
[ebp
+ i3320_iinr
], 4 ;
# advance pointer
14673 mov ebx
, [ecx
] ;
# ebx=ii
14675 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
14676 mov eax
, [ebp
+ i3320_pos
] ;
# eax = base of pos[]
14678 pfadd mm5
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
14679 movd mm7
, [eax
+ ebx
*4 + 8] ;
# can't use direct memory add for 4 bytes (iz)
14680 mov
[esp
+ i3320_ii3
], ebx ;
# (use mm7 as temp. storage for iz.)
14682 movq
[esp
+ i3320_ixO
], mm5
14683 movq
[esp
+ i3320_izO
], mm6
14685 movd mm3
, [eax
+ ebx
*4 + 12]
14686 movd mm4
, [eax
+ ebx
*4 + 16]
14687 movd mm5
, [eax
+ ebx
*4 + 20]
14688 punpckldq mm3
, [eax
+ ebx
*4 + 24]
14689 punpckldq mm4
, [eax
+ ebx
*4 + 28]
14690 punpckldq mm5
, [eax
+ ebx
*4 + 32] ;
# coords of H1 in low mm3-mm5, H2 in high
14695 movq
[esp
+ i3320_ixH
], mm0
14696 movq
[esp
+ i3320_iyH
], mm1
14697 movq
[esp
+ i3320_izH
], mm2
14699 ;
# clear vctot and i forces
14701 movq
[esp
+ i3320_vctot
], mm7
14702 movq
[esp
+ i3320_vnbtot
], mm7
14703 movq
[esp
+ i3320_fixO
], mm7
14704 movd
[esp
+ i3320_fizO
], mm7
14705 movq
[esp
+ i3320_fixH
], mm7
14706 movq
[esp
+ i3320_fiyH
], mm7
14707 movq
[esp
+ i3320_fizH
], mm7
14709 mov eax
, [ebp
+ i3320_jindex
]
14710 mov ecx
, [eax
] ;
# jindex[n]
14711 mov edx
, [eax
+ 4] ;
# jindex[n+1]
14712 add dword ptr
[ebp
+ i3320_jindex
], 4
14713 sub edx
, ecx ;
# number of innerloop atoms
14714 mov
[esp
+ i3320_innerk
], edx
14716 mov esi
, [ebp
+ i3320_pos
]
14717 mov edi
, [ebp
+ i3320_faction
]
14718 mov eax
, [ebp
+ i3320_jjnr
]
14721 mov
[esp
+ i3320_innerjjnr
], eax ;
# pointer to jjnr[nj0]
14723 ;
# a single j particle iteration here - compare with the unrolled code for comments.
14724 mov eax
, [esp
+ i3320_innerjjnr
]
14725 mov eax
, [eax
] ;
# eax=jnr offset
14726 add dword ptr
[esp
+ i3320_innerjjnr
], 4 ;
# advance pointer
14727 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
14729 mov ecx
, [ebp
+ i3320_charge
]
14730 movd mm7
, [ecx
+ eax
*4]
14733 pfmul mm6
, [esp
+ i3320_iqO
]
14734 pfmul mm7
, [esp
+ i3320_iqH
] ;
# mm6=qqO, mm7=qqH
14735 movd
[esp
+ i3320_qqO
], mm6
14736 movq
[esp
+ i3320_qqH
], mm7
14738 mov ecx
, [ebp
+ i3320_type
]
14739 mov edx
, [ecx
+ eax
*4] ;
# type [jnr]
14740 mov ecx
, [ebp
+ i3320_nbfp
]
14742 add edx
, [esp
+ i3320_ntia
] ;
# tja = ntia + 2*type
14743 movd mm5
, [ecx
+ edx
*4] ;
# mm5 = 1st c6
14744 movq
[esp
+ i3320_c6
], mm5
14745 movd mm5
, [ecx
+ edx
*4 + 4] ;
# mm5 = 1st c12
14746 movq
[esp
+ i3320_c12
], mm5
14748 lea eax
, [eax
+ eax
*2]
14750 movq mm0
, [esi
+ eax
*4]
14751 movd mm1
, [esi
+ eax
*4 + 8]
14752 ;
# copy & expand to mm2-mm4 for the H interactions
14760 pfsubr mm0
, [esp
+ i3320_ixO
]
14761 pfsubr mm1
, [esp
+ i3320_izO
]
14763 movq
[esp
+ i3320_dxO
], mm0
14765 movd
[esp
+ i3320_dzO
], mm1
14768 pfadd mm0
, mm1 ;
# mm0=rsqO
14772 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
14773 pfsubr mm2
, [esp
+ i3320_ixH
]
14774 pfsubr mm3
, [esp
+ i3320_iyH
]
14775 pfsubr mm4
, [esp
+ i3320_izH
] ;
# mm2-mm4 is dxH-dzH
14777 movq
[esp
+ i3320_dxH
], mm2
14778 movq
[esp
+ i3320_dyH
], mm3
14779 movq
[esp
+ i3320_dzH
], mm4
14785 pfadd mm3
,mm4 ;
# mm3=rsqH
14786 movq
[esp
+ i3320_tmprsqH
], mm3
14793 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
14795 pfmul mm0
, mm1 ;
# mm0=r
14797 pfmul mm0
, [esp
+ i3320_tsc
]
14799 movd
[esp
+ i3320_n1
], mm4
14801 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
14803 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
14806 mov edx
, [ebp
+ i3320_VFtab
]
14807 mov ecx
, [esp
+ i3320_n1
]
14808 lea ecx
, [ecx
+ ecx
*2]
14810 ;
# load all values we need
14811 movd mm4
, [edx
+ ecx
*4]
14812 movd mm5
, [edx
+ ecx
*4 + 4]
14813 movd mm6
, [edx
+ ecx
*4 + 8]
14814 movd mm7
, [edx
+ ecx
*4 + 12]
14816 pfmul mm6
, mm0 ;
# mm6 = Geps
14817 pfmul mm7
, mm2 ;
# mm7 = Heps2
14820 pfadd mm5
, mm7 ;
# mm5 = Fp
14822 pfmul mm7
, [esp
+ i3320_two
] ;
# two*Heps2
14824 pfadd mm7
, mm5 ;
# mm7=FF
14826 pfmul mm5
, mm0 ;
# mm5=eps*Fp
14827 pfadd mm5
, mm4 ;
# mm5= VV
14829 pfmul mm5
, [esp
+ i3320_qqO
] ;
# vcoul=qq*VV
14830 pfmul mm7
, [esp
+ i3320_qqO
] ;
# fijC=qq*FF
14832 ;
# update vctot directly, use mm3 for fscal sum.
14833 pfadd mm5
, [esp
+ i3320_vctot
]
14834 movq
[esp
+ i3320_vctot
], mm5
14837 ;
# dispersion table
14838 ;
# load all the table values we need
14839 movd mm4
, [edx
+ ecx
*4 + 16]
14840 movd mm5
, [edx
+ ecx
*4 + 20]
14841 movd mm6
, [edx
+ ecx
*4 + 24]
14842 movd mm7
, [edx
+ ecx
*4 + 28]
14843 pfmul mm6
, mm0 ;
# mm6 = Geps
14844 pfmul mm7
, mm2 ;
# mm7 = Heps2
14846 pfadd mm5
, mm7 ;
# mm5 = Fp
14847 pfmul mm7
, [esp
+ i3320_two
] ;
# two*Heps2
14849 pfadd mm7
, mm5 ;
# mm7=FF
14850 pfmul mm5
, mm0 ;
# mm5=eps*Fp
14851 pfadd mm5
, mm4 ;
# mm5= VV
14853 movq mm4
, [esp
+ i3320_c6
]
14854 pfmul mm7
, mm4 ;
# fijD
14855 pfmul mm5
, mm4 ;
# vnb6
14856 pfadd mm3
, mm7 ;
# add to fscal
14858 ;
# update vnbtot to release mm5!
14859 pfadd mm5
, [esp
+ i3320_vnbtot
] ;
# add the earlier value
14860 movq
[esp
+ i3320_vnbtot
], mm5 ;
# store the sum
14863 ;
# load all the table values we need
14864 movd mm4
, [edx
+ ecx
*4 + 32]
14865 movd mm5
, [edx
+ ecx
*4 + 36]
14866 movd mm6
, [edx
+ ecx
*4 + 40]
14867 movd mm7
, [edx
+ ecx
*4 + 44]
14869 pfmul mm6
, mm0 ;
# mm6 = Geps
14870 pfmul mm7
, mm2 ;
# mm7 = Heps2
14872 pfadd mm5
, mm7 ;
# mm5 = Fp
14873 pfmul mm7
, [esp
+ i3320_two
] ;
# two*Heps2
14875 pfadd mm7
, mm5 ;
# mm7=FF
14876 pfmul mm5
, mm0 ;
# mm5=eps*Fp
14877 pfadd mm5
, mm4 ;
# mm5= VV
14879 movq mm6
, [esp
+ i3320_c12
]
14880 pfmul mm7
, mm6 ;
# fijR
14881 pfmul mm5
, mm6 ;
# vnb12
14882 pfadd mm3
, mm7 ;
# total fscal fijC+ fijD+ fijR
14884 ;
# change sign of fscal and multiply with rinv
14887 pfmul mm3
, [esp
+ i3320_tsc
]
14888 pfmul mm3
, mm1 ;
# mm3 is total fscal (for the oxygen) now
14891 pfadd mm5
, [esp
+ i3320_vnbtot
] ;
# add the earlier value
14892 movq
[esp
+ i3320_vnbtot
], mm5 ;
# store the sum
14894 ;
# Ready with the oxygen - potential is updated, fscal is in mm3.
14895 ;
# now do the two hydrogens.
14896 movq mm0
, [esp
+ i3320_tmprsqH
] ;
# mm0=rsqH
14902 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
14907 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
14909 pfmul mm0
,mm1 ;
# mm0=r
14910 pfmul mm0
, [esp
+ i3320_tsc
]
14912 movq
[esp
+ i3320_n1
], mm4
14914 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
14916 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
14919 mov edx
, [ebp
+ i3320_VFtab
]
14920 mov ecx
, [esp
+ i3320_n1
]
14921 lea ecx
, [ecx
+ ecx
*2]
14923 ;
# load all values we need
14924 movd mm4
, [edx
+ ecx
*4]
14925 movd mm5
, [edx
+ ecx
*4 + 4]
14926 movd mm6
, [edx
+ ecx
*4 + 8]
14927 movd mm7
, [edx
+ ecx
*4 + 12]
14928 mov ecx
, [esp
+ i3320_n1
+ 4]
14929 lea ecx
, [ecx
+ ecx
*2]
14931 punpckldq mm4
, [edx
+ ecx
*4]
14932 punpckldq mm5
, [edx
+ ecx
*4 + 4]
14933 punpckldq mm6
, [edx
+ ecx
*4 + 8]
14934 punpckldq mm7
, [edx
+ ecx
*4 + 12]
14937 pfmul mm6
, mm0 ;
# mm6 = Geps
14938 pfmul mm7
, mm2 ;
# mm7 = Heps2
14941 pfadd mm5
, mm7 ;
# mm5 = Fp
14943 pfmul mm7
, [esp
+ i3320_two
] ;
# two*Heps2
14945 pfadd mm7
, mm5 ;
# mm7=FF
14947 pfmul mm5
, mm0 ;
# mm5=eps*Fp
14948 pfadd mm5
, mm4 ;
# mm5= VV
14950 pfmul mm5
, [esp
+ i3320_qqH
] ;
# vcoul=qq*VV
14951 pfmul mm7
, [esp
+ i3320_qqH
] ;
# fijC=qq*FF
14953 pfadd mm5
, [esp
+ i3320_vctot
]
14954 movq
[esp
+ i3320_vctot
], mm5
14956 ;
# change sign of fijC and multiply by rinv
14959 pfmul mm4
, [esp
+ i3320_tsc
]
14960 pfmul mm4
, mm1 ;
# mm4 is total fscal (for the hydrogens) now
14962 ;
# spread oxygen fscalar to both positions
14964 ;
# calc vectorial force for O
14965 prefetchw
[edi
+ eax
*4] ;
# prefetch faction to cache
14966 movq mm0
, [esp
+ i3320_dxO
]
14967 movd mm1
, [esp
+ i3320_dzO
]
14971 ;
# calc vectorial force for H's
14972 movq mm5
, [esp
+ i3320_dxH
]
14973 movq mm6
, [esp
+ i3320_dyH
]
14974 movq mm7
, [esp
+ i3320_dzH
]
14979 ;
# update iO particle force
14980 movq mm2
, [esp
+ i3320_fixO
]
14981 movd mm3
, [esp
+ i3320_fizO
]
14984 movq
[esp
+ i3320_fixO
], mm2
14985 movd
[esp
+ i3320_fizO
], mm3
14987 ;
# update iH forces
14988 movq mm2
, [esp
+ i3320_fixH
]
14989 movq mm3
, [esp
+ i3320_fiyH
]
14990 movq mm4
, [esp
+ i3320_fizH
]
14994 movq
[esp
+ i3320_fixH
], mm2
14995 movq
[esp
+ i3320_fiyH
], mm3
14996 movq
[esp
+ i3320_fizH
], mm4
14998 ;
# pack j forces from H in the same form as the oxygen force.
14999 pfacc mm5
, mm6 ;
# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
15000 pfacc mm7
, mm7 ;
# mm7(l)=fjz(H1+ h2)
15002 pfadd mm0
, mm5 ;
# add up total force on j particle.
15005 ;
# update j particle force
15006 movq mm2
, [edi
+ eax
*4]
15007 movd mm3
, [edi
+ eax
*4 + 8]
15010 movq
[edi
+ eax
*4], mm2
15011 movd
[edi
+ eax
*4 +8], mm3
15013 ;
# done - one more?
15014 dec dword ptr
[esp
+ i3320_innerk
]
15015 jz
.i3320_updateouterdata
15016 jmp
.i3320_inner_loop
15017 .i3320_updateouterdata:
15018 mov ecx
, [esp
+ i3320_ii3
]
15020 movq mm6
, [edi
+ ecx
*4] ;
# increment iO force
15021 movd mm7
, [edi
+ ecx
*4 + 8]
15022 pfadd mm6
, [esp
+ i3320_fixO
]
15023 pfadd mm7
, [esp
+ i3320_fizO
]
15024 movq
[edi
+ ecx
*4], mm6
15025 movd
[edi
+ ecx
*4 +8], mm7
15027 movq mm0
, [esp
+ i3320_fixH
]
15028 movq mm3
, [esp
+ i3320_fiyH
]
15029 movq mm1
, [esp
+ i3320_fizH
]
15031 punpckldq mm0
, mm3 ;
# mm0(l)=fxH1, mm0(h)=fyH1
15032 punpckhdq mm2
, mm3 ;
# mm2(l)=fxH2, mm2(h)=fyH2
15038 movq mm6
, [edi
+ ecx
*4 + 12] ;
# increment iH1 force
15039 movd mm7
, [edi
+ ecx
*4 + 20]
15042 movq
[edi
+ ecx
*4 + 12], mm6
15043 movd
[edi
+ ecx
*4 + 20], mm7
15045 movq mm6
, [edi
+ ecx
*4 + 24] ;
# increment iH2 force
15046 movd mm7
, [edi
+ ecx
*4 + 32]
15049 movq
[edi
+ ecx
*4 + 24], mm6
15050 movd
[edi
+ ecx
*4 + 32], mm7
15053 mov ebx
, [ebp
+ i3320_fshift
] ;
# increment fshift force
15054 mov edx
, [esp
+ i3320_is3
]
15056 movq mm6
, [ebx
+ edx
*4]
15057 movd mm7
, [ebx
+ edx
*4 + 8]
15058 pfadd mm6
, [esp
+ i3320_fixO
]
15059 pfadd mm7
, [esp
+ i3320_fizO
]
15064 movq
[ebx
+ edx
*4], mm6
15065 movd
[ebx
+ edx
*4 + 8], mm7
15067 mov edx
, [ebp
+ i3320_gid
] ;
# get group index for this i particle
15069 add dword ptr
[ebp
+ i3320_gid
], 4 ;
# advance pointer
15071 movq mm7
, [esp
+ i3320_vctot
]
15072 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
15074 mov eax
, [ebp
+ i3320_Vc
]
15075 movd mm6
, [eax
+ edx
*4]
15077 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
15079 movq mm7
, [esp
+ i3320_vnbtot
]
15080 pfacc mm7
,mm7 ;
# same for Vnb
15082 mov eax
, [ebp
+ i3320_Vnb
]
15083 movd mm6
, [eax
+ edx
*4]
15085 movd
[eax
+ edx
*4], mm6 ;
# increment vnb[gid]
15087 dec dword ptr
[ebp
+ i3320_nri
]
15089 ;
# not last, iterate once more!
;# inl3330_3dnow is exported under both the plain and the underscore-prefixed symbol name.
15105 .globl inl3330_3dnow
15106 .globl _inl3330_3dnow
;# Argument offsets: the routine reads each of these as [ebp + offset].
;# Every argument occupies one 4-byte slot, starting at ebp+8.
15109 .equiv i3330_nri, 8
15110 .equiv i3330_iinr, 12
15111 .equiv i3330_jindex, 16
15112 .equiv i3330_jjnr, 20
15113 .equiv i3330_shift, 24
15114 .equiv i3330_shiftvec, 28
15115 .equiv i3330_fshift, 32
15116 .equiv i3330_gid, 36
15117 .equiv i3330_pos, 40
15118 .equiv i3330_faction, 44
15119 .equiv i3330_charge, 48
15120 .equiv i3330_facel, 52
15121 .equiv i3330_Vc, 56
15122 .equiv i3330_type, 60
15123 .equiv i3330_ntype, 64
15124 .equiv i3330_nbfp, 68
15125 .equiv i3330_Vnb, 72
15126 .equiv i3330_tabscale, 76
15127 .equiv i3330_VFtab, 80
15128 ;
# stack offsets for local variables
;# These are addressed as [esp + offset]; the routine reserves 212 bytes for them.
;# Slots spaced 8 bytes apart (e.g. ixH/iyH/izH, fixH/fiyH/fizH, dxH/dyH/dzH) are
;# written with movq and hold two packed 32-bit values; per the comments in the
;# routine the H slots keep H1 in the low half and H2 in the high half.
15129 .equiv i3330_is3, 0
15130 .equiv i3330_ii3, 4
15131 .equiv i3330_ixO, 8
15132 .equiv i3330_iyO, 12
15133 .equiv i3330_izO, 16
15134 .equiv i3330_ixH, 20
15135 .equiv i3330_iyH, 28
15136 .equiv i3330_izH, 36
15137 .equiv i3330_qqOO, 44
15138 .equiv i3330_qqOH, 52
15139 .equiv i3330_qqHH, 60
15140 .equiv i3330_c6, 68
15141 .equiv i3330_c12, 76
15142 .equiv i3330_two, 84
15143 .equiv i3330_n1, 92
15144 .equiv i3330_tsc, 100
15145 .equiv i3330_vctot, 108
15146 .equiv i3330_vnbtot, 116
15147 .equiv i3330_innerjjnr, 124
15148 .equiv i3330_innerk, 128
15149 .equiv i3330_fixO, 132
15150 .equiv i3330_fiyO, 136
15151 .equiv i3330_fizO, 140
15152 .equiv i3330_fixH, 144
15153 .equiv i3330_fiyH, 152
15154 .equiv i3330_fizH, 160
15155 .equiv i3330_dxO, 168
15156 .equiv i3330_dyO, 172
15157 .equiv i3330_dzO, 176
15158 .equiv i3330_dxH, 180
15159 .equiv i3330_dyH, 188
15160 .equiv i3330_dzH, 196
15161 .equiv i3330_tmprsqH, 204
15170 sub esp
, 212 ;
# local stack space
15172 ;
# assume we have at least one i particle - start directly
15174 mov ecx
, [ebp
+ i3330_iinr
] ;
# ecx = pointer into iinr[]
15175 mov ebx
, [ecx
] ;
# ebx=ii
15177 mov edx
, [ebp
+ i3330_charge
]
15178 movd mm1
, [ebp
+ i3330_facel
] ;
# mm1=facel
15179 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii0] (O)
15180 movd mm3
, [edx
+ ebx
*4 + 4] ;
# mm3=charge[ii0+1] (H)
15186 pfmul mm4
, mm2 ;
# mm4=qqOO*facel
15187 pfmul mm5
, mm3 ;
# mm5=qqOH*facel
15188 pfmul mm6
, mm3 ;
# mm6=qqHH*facel
15189 punpckldq mm5
,mm5 ;
# spread to both halves
15190 punpckldq mm6
,mm6 ;
# spread to both halves
15191 movq
[esp
+ i3330_qqOO
], mm4
15192 movq
[esp
+ i3330_qqOH
], mm5
15193 movq
[esp
+ i3330_qqHH
], mm6
15194 mov edx
, [ebp
+ i3330_type
]
15195 mov ecx
, [edx
+ ebx
*4]
15198 imul ecx
, [ebp
+ i3330_ntype
]
15200 mov eax
, [ebp
+ i3330_nbfp
]
15201 movd mm0
, [eax
+ edx
*4]
15202 movd mm1
, [eax
+ edx
*4 + 4]
15203 movq
[esp
+ i3330_c6
], mm0
15204 movq
[esp
+ i3330_c12
], mm1
15206 movq
[esp
+ i3330_two
], mm2
15207 movd mm3
, [ebp
+ i3330_tabscale
]
15209 movq
[esp
+ i3330_tsc
], mm3
15211 mov eax
, [ebp
+ i3330_shift
] ;
# eax = pointer into shift[]
15212 mov ebx
, [eax
] ;
# ebx=shift[n]
15213 add dword ptr
[ebp
+ i3330_shift
], 4 ;
# advance pointer one step
15215 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
15216 mov
[esp
+ i3330_is3
],ebx ;
# store is3
15218 mov eax
, [ebp
+ i3330_shiftvec
] ;
# eax = base of shiftvec[]
15220 movq mm5
, [eax
+ ebx
*4] ;
# move shX/shY to mm5 and shZ to mm6.
15221 movd mm6
, [eax
+ ebx
*4 + 8]
15225 punpckldq mm0
,mm0 ;
# also expand shX,Y,Z in mm0--mm2.
15229 mov ecx
, [ebp
+ i3330_iinr
] ;
# ecx = pointer into iinr[]
15230 add dword ptr
[ebp
+ i3330_iinr
], 4 ;
# advance pointer
15231 mov ebx
, [ecx
] ;
# ebx=ii
15233 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
15234 mov eax
, [ebp
+ i3330_pos
] ;
# eax = base of pos[]
15236 pfadd mm5
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
15237 movd mm7
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
15238 mov
[esp
+ i3330_ii3
], ebx ;
# (use mm7 as temp. storage for iz.)
15240 movq
[esp
+ i3330_ixO
], mm5
15241 movq
[esp
+ i3330_izO
], mm6
15243 movd mm3
, [eax
+ ebx
*4 + 12]
15244 movd mm4
, [eax
+ ebx
*4 + 16]
15245 movd mm5
, [eax
+ ebx
*4 + 20]
15246 punpckldq mm3
, [eax
+ ebx
*4 + 24]
15247 punpckldq mm4
, [eax
+ ebx
*4 + 28]
15248 punpckldq mm5
, [eax
+ ebx
*4 + 32] ;
# coords of H1 in low mm3-mm5, H2 in high
15253 movq
[esp
+ i3330_ixH
], mm0
15254 movq
[esp
+ i3330_iyH
], mm1
15255 movq
[esp
+ i3330_izH
], mm2
15257 ;
# clear vctot and i forces
15259 movq
[esp
+ i3330_vctot
], mm7
15260 movq
[esp
+ i3330_vnbtot
], mm7
15261 movq
[esp
+ i3330_fixO
], mm7
15262 movq
[esp
+ i3330_fizO
], mm7
15263 movq
[esp
+ i3330_fixH
], mm7
15264 movq
[esp
+ i3330_fiyH
], mm7
15265 movq
[esp
+ i3330_fizH
], mm7
15267 mov eax
, [ebp
+ i3330_jindex
]
15268 mov ecx
, [eax
] ;
# jindex[n]
15269 mov edx
, [eax
+ 4] ;
# jindex[n+1]
15270 add dword ptr
[ebp
+ i3330_jindex
], 4
15271 sub edx
, ecx ;
# number of innerloop atoms
15272 mov
[esp
+ i3330_innerk
], edx
15274 mov esi
, [ebp
+ i3330_pos
]
15275 mov edi
, [ebp
+ i3330_faction
]
15276 mov eax
, [ebp
+ i3330_jjnr
]
15279 mov
[esp
+ i3330_innerjjnr
], eax ;
# pointer to jjnr[nj0]
15281 ;
# a single j particle iteration here - compare with the unrolled code for comments.
15282 mov eax
, [esp
+ i3330_innerjjnr
]
15283 mov eax
, [eax
] ;
# eax=jnr offset
15284 add dword ptr
[esp
+ i3330_innerjjnr
], 4 ;
# advance pointer
15286 lea eax
, [eax
+ eax
*2]
15288 movq mm0
, [esi
+ eax
*4]
15289 movd mm1
, [esi
+ eax
*4 + 8]
15290 ;
# copy & expand to mm2-mm4 for the H interactions
15298 pfsubr mm0
, [esp
+ i3330_ixO
]
15299 pfsubr mm1
, [esp
+ i3330_izO
]
15301 movq
[esp
+ i3330_dxO
], mm0
15303 movd
[esp
+ i3330_dzO
], mm1
15306 pfadd mm0
, mm1 ;
# mm0=rsqO
15310 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
15311 pfsubr mm2
, [esp
+ i3330_ixH
]
15312 pfsubr mm3
, [esp
+ i3330_iyH
]
15313 pfsubr mm4
, [esp
+ i3330_izH
] ;
# mm2-mm4 is dxH-dzH
15315 movq
[esp
+ i3330_dxH
], mm2
15316 movq
[esp
+ i3330_dyH
], mm3
15317 movq
[esp
+ i3330_dzH
], mm4
15323 pfadd mm3
,mm4 ;
# mm3=rsqH
15324 movq
[esp
+ i3330_tmprsqH
], mm3
15331 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
15332 pfmul mm0
, mm1 ;
# mm0=r
15334 pfmul mm0
, [esp
+ i3330_tsc
]
15336 movd
[esp
+ i3330_n1
], mm4
15338 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
15340 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
15343 mov edx
, [ebp
+ i3330_VFtab
]
15344 mov ecx
, [esp
+ i3330_n1
]
15345 lea ecx
, [ecx
+ ecx
*2]
15348 ;
# load all values we need
15349 movd mm4
, [edx
+ ecx
*4]
15350 movd mm5
, [edx
+ ecx
*4 + 4]
15351 movd mm6
, [edx
+ ecx
*4 + 8]
15352 movd mm7
, [edx
+ ecx
*4 + 12]
15354 pfmul mm6
, mm0 ;
# mm6 = Geps
15355 pfmul mm7
, mm2 ;
# mm7 = Heps2
15358 pfadd mm5
, mm7 ;
# mm5 = Fp
15360 pfmul mm7
, [esp
+ i3330_two
] ;
# two*Heps2
15362 pfadd mm7
, mm5 ;
# mm7=FF
15364 pfmul mm5
, mm0 ;
# mm5=eps*Fp
15365 pfadd mm5
, mm4 ;
# mm5= VV
15367 pfmul mm5
, [esp
+ i3330_qqOO
] ;
# vcoul=qq*VV
15368 pfmul mm7
, [esp
+ i3330_qqOO
] ;
# fijC=qq*FF
15370 ;
# update vctot directly, use mm3 for fscal sum.
15371 pfadd mm5
, [esp
+ i3330_vctot
]
15372 movq
[esp
+ i3330_vctot
], mm5
15375 ;
# dispersion table
15376 ;
# load all the table values we need
15377 movd mm4
, [edx
+ ecx
*4 + 16]
15378 movd mm5
, [edx
+ ecx
*4 + 20]
15379 movd mm6
, [edx
+ ecx
*4 + 24]
15380 movd mm7
, [edx
+ ecx
*4 + 28]
15381 pfmul mm6
, mm0 ;
# mm6 = Geps
15382 pfmul mm7
, mm2 ;
# mm7 = Heps2
15384 pfadd mm5
, mm7 ;
# mm5 = Fp
15385 pfmul mm7
, [esp
+ i3330_two
] ;
# two*Heps2
15387 pfadd mm7
, mm5 ;
# mm7=FF
15388 pfmul mm5
, mm0 ;
# mm5=eps*Fp
15389 pfadd mm5
, mm4 ;
# mm5= VV
15391 movq mm4
, [esp
+ i3330_c6
]
15392 pfmul mm7
, mm4 ;
# fijD
15393 pfmul mm5
, mm4 ;
# vnb6
15394 pfadd mm3
, mm7 ;
# add to fscal
15396 ;
# update vnbtot to release mm5!
15397 pfadd mm5
, [esp
+ i3330_vnbtot
] ;
# add the earlier value
15398 movq
[esp
+ i3330_vnbtot
], mm5 ;
# store the sum
15401 ;
# load all the table values we need
15402 movd mm4
, [edx
+ ecx
*4 + 32]
15403 movd mm5
, [edx
+ ecx
*4 + 36]
15404 movd mm6
, [edx
+ ecx
*4 + 40]
15405 movd mm7
, [edx
+ ecx
*4 + 44]
15407 pfmul mm6
, mm0 ;
# mm6 = Geps
15408 pfmul mm7
, mm2 ;
# mm7 = Heps2
15410 pfadd mm5
, mm7 ;
# mm5 = Fp
15411 pfmul mm7
, [esp
+ i3330_two
] ;
# two*Heps2
15413 pfadd mm7
, mm5 ;
# mm7=FF
15414 pfmul mm5
, mm0 ;
# mm5=eps*Fp
15415 pfadd mm5
, mm4 ;
# mm5= VV
15417 movq mm6
, [esp
+ i3330_c12
]
15418 pfmul mm7
, mm6 ;
# fijR
15419 pfmul mm5
, mm6 ;
# vnb12
15420 pfadd mm3
, mm7 ;
# total fscal fijC+ fijD+ fijR
15422 ;
# change sign of fscal and multiply with rinv
15425 pfmul mm3
, [esp
+ i3330_tsc
]
15426 pfmul mm3
, mm1 ;
# mm3 is total fscal (for the oxygen) now
15429 pfadd mm5
, [esp
+ i3330_vnbtot
] ;
# add the earlier value
15430 movq
[esp
+ i3330_vnbtot
], mm5 ;
# store the sum
15432 ;
# Ready with the oxygen - potential is updated, fscal is in mm3.
15433 ;
# time for hydrogens!
15436 movq mm0
, [esp
+ i3330_tmprsqH
]
15442 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
15447 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
15449 pfmul mm0
,mm1 ;
# mm0=r
15450 pfmul mm0
, [esp
+ i3330_tsc
]
15452 movq
[esp
+ i3330_n1
], mm4
15454 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
15456 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
15459 mov edx
, [ebp
+ i3330_VFtab
]
15460 mov ecx
, [esp
+ i3330_n1
]
15461 lea ecx
, [ecx
+ ecx
*2]
15463 ;
# load all values we need
15464 movd mm4
, [edx
+ ecx
*4]
15465 movd mm5
, [edx
+ ecx
*4 + 4]
15466 movd mm6
, [edx
+ ecx
*4 + 8]
15467 movd mm7
, [edx
+ ecx
*4 + 12]
15468 mov ecx
, [esp
+ i3330_n1
+ 4]
15469 lea ecx
, [ecx
+ ecx
*2]
15471 punpckldq mm4
, [edx
+ ecx
*4]
15472 punpckldq mm5
, [edx
+ ecx
*4 + 4]
15473 punpckldq mm6
, [edx
+ ecx
*4 + 8]
15474 punpckldq mm7
, [edx
+ ecx
*4 + 12]
15476 pfmul mm6
, mm0 ;
# mm6 = Geps
15477 pfmul mm7
, mm2 ;
# mm7 = Heps2
15480 pfadd mm5
, mm7 ;
# mm5 = Fp
15482 pfmul mm7
, [esp
+ i3330_two
] ;
# two*Heps2
15484 pfadd mm7
, mm5 ;
# mm7=FF
15486 pfmul mm5
, mm0 ;
# mm5=eps*Fp
15487 pfadd mm5
, mm4 ;
# mm5= VV
15489 pfmul mm5
, [esp
+ i3330_qqOH
] ;
# vcoul=qq*VV
15490 pfmul mm7
, [esp
+ i3330_qqOH
] ;
# fijC=qq*FF
15492 pfadd mm5
, [esp
+ i3330_vctot
]
15493 movq
[esp
+ i3330_vctot
], mm5
15495 ;
# change sign of fijC and multiply by rinv
15498 pfmul mm4
, [esp
+ i3330_tsc
]
15499 pfmul mm4
, mm1 ;
# mm4 is total fscal (for the hydrogens) now
15501 ;
# spread oxygen fscalar to both positions
15503 ;
# calc vectorial force for O
15504 movq mm0
, [esp
+ i3330_dxO
]
15505 movd mm1
, [esp
+ i3330_dzO
]
15509 ;
# calc vectorial force for H's
15510 movq mm5
, [esp
+ i3330_dxH
]
15511 movq mm6
, [esp
+ i3330_dyH
]
15512 movq mm7
, [esp
+ i3330_dzH
]
15517 ;
# update iO particle force
15518 movq mm2
, [esp
+ i3330_fixO
]
15519 movd mm3
, [esp
+ i3330_fizO
]
15522 movq
[esp
+ i3330_fixO
], mm2
15523 movd
[esp
+ i3330_fizO
], mm3
15525 ;
# update iH forces
15526 movq mm2
, [esp
+ i3330_fixH
]
15527 movq mm3
, [esp
+ i3330_fiyH
]
15528 movq mm4
, [esp
+ i3330_fizH
]
15532 movq
[esp
+ i3330_fixH
], mm2
15533 movq
[esp
+ i3330_fiyH
], mm3
15534 movq
[esp
+ i3330_fizH
], mm4
15536 ;
# pack j forces from H in the same form as the oxygen force.
15537 pfacc mm5
, mm6 ;
# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
15538 pfacc mm7
, mm7 ;
# mm7(l)=fjz(H1+ h2)
15540 pfadd mm0
, mm5 ;
# add up total force on j particle.
15543 ;
# update j particle force
15544 movq mm2
, [edi
+ eax
*4]
15545 movd mm3
, [edi
+ eax
*4 + 8]
15548 movq
[edi
+ eax
*4], mm2
15549 movd
[edi
+ eax
*4 +8], mm3
15551 ;
# interactions with j H1
15553 movq mm0
, [esi
+ eax
*4 + 12]
15554 movd mm1
, [esi
+ eax
*4 + 20]
15555 ;
# copy & expand to mm2-mm4 for the H interactions
15563 pfsubr mm0
, [esp
+ i3330_ixO
]
15564 pfsubr mm1
, [esp
+ i3330_izO
]
15566 movq
[esp
+ i3330_dxO
], mm0
15568 movd
[esp
+ i3330_dzO
], mm1
15571 pfadd mm0
, mm1 ;
# mm0=rsqO
15575 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
15576 pfsubr mm2
, [esp
+ i3330_ixH
]
15577 pfsubr mm3
, [esp
+ i3330_iyH
]
15578 pfsubr mm4
, [esp
+ i3330_izH
] ;
# mm2-mm4 is dxH-dzH
15580 movq
[esp
+ i3330_dxH
], mm2
15581 movq
[esp
+ i3330_dyH
], mm3
15582 movq
[esp
+ i3330_dzH
], mm4
15588 pfadd mm3
,mm4 ;
# mm3=rsqH
15589 movq
[esp
+ i3330_tmprsqH
], mm3
15596 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
15597 pfmul mm0
, mm1 ;
# mm0=r
15599 pfmul mm0
, [esp
+ i3330_tsc
]
15601 movd
[esp
+ i3330_n1
], mm4
15603 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
15605 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
15608 mov edx
, [ebp
+ i3330_VFtab
]
15609 mov ecx
, [esp
+ i3330_n1
]
15610 lea ecx
, [ecx
+ ecx
*2]
15613 ;
# load all values we need
15614 movd mm4
, [edx
+ ecx
*4]
15615 movd mm5
, [edx
+ ecx
*4 + 4]
15616 movd mm6
, [edx
+ ecx
*4 + 8]
15617 movd mm7
, [edx
+ ecx
*4 + 12]
15619 pfmul mm6
, mm0 ;
# mm6 = Geps
15620 pfmul mm7
, mm2 ;
# mm7 = Heps2
15623 pfadd mm5
, mm7 ;
# mm5 = Fp
15625 pfmul mm7
, [esp
+ i3330_two
] ;
# two*Heps2
15627 pfadd mm7
, mm5 ;
# mm7=FF
15629 pfmul mm5
, mm0 ;
# mm5=eps*Fp
15630 pfadd mm5
, mm4 ;
# mm5= VV
15632 pfmul mm5
, [esp
+ i3330_qqOH
] ;
# vcoul=qq*VV
15633 pfmul mm7
, [esp
+ i3330_qqOH
] ;
# fijC=qq*FF
15635 ;
# update vctot directly, force is moved to mm3.
15636 pfadd mm5
, [esp
+ i3330_vctot
]
15637 movq
[esp
+ i3330_vctot
], mm5
15640 pfmul mm3
, [esp
+ i3330_tsc
]
15641 pfmul mm3
, mm1 ;
# mm3 is total fscal (for the oxygen) now
15643 movq mm0
, [esp
+ i3330_tmprsqH
]
15649 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
15654 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
15656 pfmul mm0
,mm1 ;
# mm0=r
15657 pfmul mm0
, [esp
+ i3330_tsc
]
15659 movq
[esp
+ i3330_n1
], mm4
15661 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
15663 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
15666 mov edx
, [ebp
+ i3330_VFtab
]
15667 mov ecx
, [esp
+ i3330_n1
]
15668 lea ecx
, [ecx
+ ecx
*2]
15670 ;
# load all values we need
15671 movd mm4
, [edx
+ ecx
*4]
15672 movd mm5
, [edx
+ ecx
*4 + 4]
15673 movd mm6
, [edx
+ ecx
*4 + 8]
15674 movd mm7
, [edx
+ ecx
*4 + 12]
15675 mov ecx
, [esp
+ i3330_n1
+ 4]
15676 lea ecx
, [ecx
+ ecx
*2]
15678 punpckldq mm4
, [edx
+ ecx
*4]
15679 punpckldq mm5
, [edx
+ ecx
*4 + 4]
15680 punpckldq mm6
, [edx
+ ecx
*4 + 8]
15681 punpckldq mm7
, [edx
+ ecx
*4 + 12]
15684 pfmul mm6
, mm0 ;
# mm6 = Geps
15685 pfmul mm7
, mm2 ;
# mm7 = Heps2
15688 pfadd mm5
, mm7 ;
# mm5 = Fp
15690 pfmul mm7
, [esp
+ i3330_two
] ;
# two*Heps2
15692 pfadd mm7
, mm5 ;
# mm7=FF
15694 pfmul mm5
, mm0 ;
# mm5=eps*Fp
15695 pfadd mm5
, mm4 ;
# mm5= VV
15697 pfmul mm5
, [esp
+ i3330_qqHH
] ;
# vcoul=qq*VV
15698 pfmul mm7
, [esp
+ i3330_qqHH
] ;
# fijC=qq*FF
15700 pfadd mm5
, [esp
+ i3330_vctot
]
15701 movq
[esp
+ i3330_vctot
], mm5
15703 ;
# change sign of fijC and multiply by rinv
15706 pfmul mm4
, [esp
+ i3330_tsc
]
15707 pfmul mm4
, mm1 ;
# mm4 is total fscal (for the hydrogens) now
15709 ;
# spread oxygen fscalar to both positions
15711 ;
# calc vectorial force for O
15712 movq mm0
, [esp
+ i3330_dxO
]
15713 movd mm1
, [esp
+ i3330_dzO
]
15717 ;
# calc vectorial force for H's
15718 movq mm5
, [esp
+ i3330_dxH
]
15719 movq mm6
, [esp
+ i3330_dyH
]
15720 movq mm7
, [esp
+ i3330_dzH
]
15725 ;
# update iO particle force
15726 movq mm2
, [esp
+ i3330_fixO
]
15727 movd mm3
, [esp
+ i3330_fizO
]
15730 movq
[esp
+ i3330_fixO
], mm2
15731 movd
[esp
+ i3330_fizO
], mm3
15733 ;
# update iH forces
15734 movq mm2
, [esp
+ i3330_fixH
]
15735 movq mm3
, [esp
+ i3330_fiyH
]
15736 movq mm4
, [esp
+ i3330_fizH
]
15740 movq
[esp
+ i3330_fixH
], mm2
15741 movq
[esp
+ i3330_fiyH
], mm3
15742 movq
[esp
+ i3330_fizH
], mm4
15744 ;
# pack j forces from H in the same form as the oxygen force.
15745 pfacc mm5
, mm6 ;
# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
15746 pfacc mm7
, mm7 ;
# mm7(l)=fjz(H1+ h2)
15748 pfadd mm0
, mm5 ;
# add up total force on j particle.
15751 ;
# update j particle force
15752 movq mm2
, [edi
+ eax
*4 + 12]
15753 movd mm3
, [edi
+ eax
*4 + 20]
15756 movq
[edi
+ eax
*4 + 12], mm2
15757 movd
[edi
+ eax
*4 + 20], mm3
15759 ;
# interactions with j H2
15760 movq mm0
, [esi
+ eax
*4 + 24]
15761 movd mm1
, [esi
+ eax
*4 + 32]
15762 ;
# copy & expand to mm2-mm4 for the H interactions
15770 pfsubr mm0
, [esp
+ i3330_ixO
]
15771 pfsubr mm1
, [esp
+ i3330_izO
]
15773 movq
[esp
+ i3330_dxO
], mm0
15775 movd
[esp
+ i3330_dzO
], mm1
15778 pfadd mm0
, mm1 ;
# mm0=rsqO
15782 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
15783 pfsubr mm2
, [esp
+ i3330_ixH
]
15784 pfsubr mm3
, [esp
+ i3330_iyH
]
15785 pfsubr mm4
, [esp
+ i3330_izH
] ;
# mm2-mm4 is dxH-dzH
15787 movq
[esp
+ i3330_dxH
], mm2
15788 movq
[esp
+ i3330_dyH
], mm3
15789 movq
[esp
+ i3330_dzH
], mm4
15795 pfadd mm3
,mm4 ;
# mm3=rsqH
15796 movq
[esp
+ i3330_tmprsqH
], mm3
15803 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
15806 pfmul mm0
, [esp
+ i3330_tsc
]
15808 movd
[esp
+ i3330_n1
], mm4
15810 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
15812 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
15815 mov edx
, [ebp
+ i3330_VFtab
]
15816 mov ecx
, [esp
+ i3330_n1
]
15817 lea ecx
, [ecx
+ ecx
*2]
15820 ;
# load all values we need
15821 movd mm4
, [edx
+ ecx
*4]
15822 movd mm5
, [edx
+ ecx
*4 + 4]
15823 movd mm6
, [edx
+ ecx
*4 + 8]
15824 movd mm7
, [edx
+ ecx
*4 + 12]
15826 pfmul mm6
, mm0 ;
# mm6 = Geps
15827 pfmul mm7
, mm2 ;
# mm7 = Heps2
15830 pfadd mm5
, mm7 ;
# mm5 = Fp
15832 pfmul mm7
, [esp
+ i3330_two
] ;
# two*Heps2
15834 pfadd mm7
, mm5 ;
# mm7=FF
15836 pfmul mm5
, mm0 ;
# mm5=eps*Fp
15837 pfadd mm5
, mm4 ;
# mm5= VV
15839 pfmul mm5
, [esp
+ i3330_qqOH
] ;
# vcoul=qq*VV
15840 pfmul mm7
, [esp
+ i3330_qqOH
] ;
# fijC=qq*FF
15842 ;
# update vctot directly, use mm3 for fscal sum
15843 pfadd mm5
, [esp
+ i3330_vctot
]
15844 movq
[esp
+ i3330_vctot
], mm5
15847 pfmul mm3
, [esp
+ i3330_tsc
]
15848 pfmul mm3
, mm1 ;
# mm3 is total fscal (for the oxygen) now
15850 movq mm0
, [esp
+ i3330_tmprsqH
]
15856 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
15861 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
15863 pfmul mm0
,mm1 ;
# mm0=r
15864 pfmul mm0
, [esp
+ i3330_tsc
]
15866 movq
[esp
+ i3330_n1
], mm4
15868 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
15870 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
15873 mov edx
, [ebp
+ i3330_VFtab
]
15874 mov ecx
, [esp
+ i3330_n1
]
15875 lea ecx
, [ecx
+ ecx
*2]
15877 ;
# load all values we need
15878 movd mm4
, [edx
+ ecx
*4]
15879 movd mm5
, [edx
+ ecx
*4 + 4]
15880 movd mm6
, [edx
+ ecx
*4 + 8]
15881 movd mm7
, [edx
+ ecx
*4 + 12]
15882 mov ecx
, [esp
+ i3330_n1
+ 4];
# ecx = table index stored in the high dword of n1
15883 lea ecx
, [ecx
+ ecx
*2]
15885 punpckldq mm4
, [edx
+ ecx
*4]
15886 punpckldq mm5
, [edx
+ ecx
*4 + 4]
15887 punpckldq mm6
, [edx
+ ecx
*4 + 8]
15888 punpckldq mm7
, [edx
+ ecx
*4 + 12]
15891 pfmul mm6
, mm0 ;
# mm6 = Geps
15892 pfmul mm7
, mm2 ;
# mm7 = Heps2
15895 pfadd mm5
, mm7 ;
# mm5 = Fp
15897 pfmul mm7
, [esp
+ i3330_two
] ;
# two*Heps2
15899 pfadd mm7
, mm5 ;
# mm7=FF
15901 pfmul mm5
, mm0 ;
# mm5=eps*Fp
15902 pfadd mm5
, mm4 ;
# mm5= VV
15904 pfmul mm5
, [esp
+ i3330_qqHH
] ;
# vcoul=qq*VV
15905 pfmul mm7
, [esp
+ i3330_qqHH
] ;
# fijC=qq*FF
15907 pfadd mm5
, [esp
+ i3330_vctot
]
15908 movq
[esp
+ i3330_vctot
], mm5
15910 ;
# change sign of fijC and multiply by rinv
15913 pfmul mm4
, [esp
+ i3330_tsc
]
15914 pfmul mm4
, mm1 ;
# mm4 is total fscal (for the hydrogens) now
15916 ;
# spread oxygen fscalar to both positions
15918 ;
# calc vectorial force for O
15919 movq mm0
, [esp
+ i3330_dxO
]
15920 movd mm1
, [esp
+ i3330_dzO
]
15924 ;
# calc vectorial force for H's
15925 movq mm5
, [esp
+ i3330_dxH
]
15926 movq mm6
, [esp
+ i3330_dyH
]
15927 movq mm7
, [esp
+ i3330_dzH
]
15932 ;
# update iO particle force
15933 movq mm2
, [esp
+ i3330_fixO
]
15934 movd mm3
, [esp
+ i3330_fizO
]
15937 movq
[esp
+ i3330_fixO
], mm2
15938 movd
[esp
+ i3330_fizO
], mm3
15940 ;
# update iH forces
15941 movq mm2
, [esp
+ i3330_fixH
]
15942 movq mm3
, [esp
+ i3330_fiyH
]
15943 movq mm4
, [esp
+ i3330_fizH
]
15947 movq
[esp
+ i3330_fixH
], mm2
15948 movq
[esp
+ i3330_fiyH
], mm3
15949 movq
[esp
+ i3330_fizH
], mm4
15951 ;
# pack j forces from H in the same form as the oxygen force.
15952 pfacc mm5
, mm6 ;
# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
15953 pfacc mm7
, mm7 ;
# mm7(l)=fjz(H1+ h2)
15955 pfadd mm0
, mm5 ;
# add up total force on j particle.
15958 ;
# update j particle force
15959 movq mm2
, [edi
+ eax
*4 + 24]
15960 movd mm3
, [edi
+ eax
*4 + 32]
15963 movq
[edi
+ eax
*4 + 24], mm2
15964 movd
[edi
+ eax
*4 + 32], mm3
15966 ;
# done - one more?
15967 dec dword ptr
[esp
+ i3330_innerk
]
15968 jz
.i3330_updateouterdata
15969 jmp
.i3330_inner_loop
15970 .i3330_updateouterdata:
;# Reached from the inner loop when the innerk counter hits zero.
;# Flushes the per-i accumulators kept on the stack (fixO/fizO, fixH/fiyH/fizH,
;# vctot, vnbtot) to the arrays addressed through edi, fshift, Vc and Vnb.
;# ecx = ii3, the 3*ii index that was saved when this i molecule was set up.
15971 mov ecx
, [esp
+ i3330_ii3
]
15973 movq mm6
, [edi
+ ecx
*4] ;
# increment iO force
15974 movd mm7
, [edi
+ ecx
*4 + 8]
15975 pfadd mm6
, [esp
+ i3330_fixO
]
15976 pfadd mm7
, [esp
+ i3330_fizO
]
15977 movq
[edi
+ ecx
*4], mm6
15978 movd
[edi
+ ecx
*4 +8], mm7
;# fixH/fiyH/fizH each hold a packed pair; the unpack steps below regroup
;# the halves so that each hydrogen's components sit together.
15980 movq mm0
, [esp
+ i3330_fixH
]
15981 movq mm3
, [esp
+ i3330_fiyH
]
15982 movq mm1
, [esp
+ i3330_fizH
]
15984 punpckldq mm0
, mm3 ;
# mm0(l)=fxH1, mm0(h)=fyH1
15985 punpckhdq mm2
, mm3 ;
# mm2(l)=fxH2, mm2(h)=fyH2
15991 movq mm6
, [edi
+ ecx
*4 + 12] ;
# increment iH1 force
15992 movd mm7
, [edi
+ ecx
*4 + 20]
15995 movq
[edi
+ ecx
*4 + 12], mm6
15996 movd
[edi
+ ecx
*4 + 20], mm7
15998 movq mm6
, [edi
+ ecx
*4 + 24] ;
# increment iH2 force
15999 movd mm7
, [edi
+ ecx
*4 + 32]
16002 movq
[edi
+ ecx
*4 + 24], mm6
16003 movd
[edi
+ ecx
*4 + 32], mm7
;# The fshift entry is indexed by is3, the 3*shift index stored at setup.
16006 mov ebx
, [ebp
+ i3330_fshift
] ;
# increment fshift force
16007 mov edx
, [esp
+ i3330_is3
]
16009 movq mm6
, [ebx
+ edx
*4]
16010 movd mm7
, [ebx
+ edx
*4 + 8]
16011 pfadd mm6
, [esp
+ i3330_fixO
]
16012 pfadd mm7
, [esp
+ i3330_fizO
]
16017 movq
[ebx
+ edx
*4], mm6
16018 movd
[ebx
+ edx
*4 + 8], mm7
16020 mov edx
, [ebp
+ i3330_gid
] ;
# get group index for this i particle
16022 add dword ptr
[ebp
+ i3330_gid
], 4 ;
# advance pointer
16024 movq mm7
, [esp
+ i3330_vctot
]
16025 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
16027 mov eax
, [ebp
+ i3330_Vc
]
16028 movd mm6
, [eax
+ edx
*4]
16030 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
16032 movq mm7
, [esp
+ i3330_vnbtot
]
16033 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
16035 mov eax
, [ebp
+ i3330_Vnb
]
16036 movd mm6
, [eax
+ edx
*4]
16038 movd
[eax
+ edx
*4], mm6 ;
# increment vnbtot[gid]
;# One outer iteration is finished: count down the nri argument in place.
16040 dec dword ptr
[ebp
+ i3330_nri
]
16042 ;
# not last, iterate once more!
16061 .globl mcinl0100_3dnow
16062 .globl _mcinl0100_3dnow
16065 .equiv mci0100_nri, 8
16066 .equiv mci0100_iinr, 12
16067 .equiv mci0100_jindex, 16
16068 .equiv mci0100_jjnr, 20
16069 .equiv mci0100_shift, 24
16070 .equiv mci0100_shiftvec, 28
16071 .equiv mci0100_gid, 32
16072 .equiv mci0100_pos, 36
16073 .equiv mci0100_type, 40
16074 .equiv mci0100_ntype, 44
16075 .equiv mci0100_nbfp, 48
16076 .equiv mci0100_Vnb, 52
16077 ;
# stack offsets for local variables
16078 .equiv mci0100_is3, 0
16079 .equiv mci0100_ii3, 4
16080 .equiv mci0100_ix, 8
16081 .equiv mci0100_iy, 12
16082 .equiv mci0100_iz, 16
16083 .equiv mci0100_vnbtot, 20
16084 .equiv mci0100_c6, 28
16085 .equiv mci0100_c12, 36
16086 .equiv mci0100_ntia, 44
16087 .equiv mci0100_innerjjnr, 48
16088 .equiv mci0100_innerk, 52
16097 sub esp
, 56 ;
# local stack space
16099 ;
# assume we have at least one i particle - start directly
16101 mov eax
, [ebp
+ mci0100_shift
] ;
# eax = pointer into shift[]
16102 mov ebx
, [eax
] ;
# ebx=shift[n]
16103 add dword ptr
[ebp
+ mci0100_shift
], 4 ;
# advance pointer one step
16105 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
16106 mov
[esp
+ mci0100_is3
],ebx ;
# store is3
16108 mov eax
, [ebp
+ mci0100_shiftvec
] ;
# eax = base of shiftvec[]
16110 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1.
16111 movd mm1
, [eax
+ ebx
*4 + 8]
16113 mov ecx
, [ebp
+ mci0100_iinr
] ;
# ecx = pointer into iinr[]
16114 add dword ptr
[ebp
+ mci0100_iinr
], 4 ;
# advance pointer
16115 mov ebx
, [ecx
] ;
# ebx =ii
16117 mov edx
, [ebp
+ mci0100_type
]
16118 mov edx
, [edx
+ ebx
*4]
16119 imul edx
, [ebp
+ mci0100_ntype
]
16121 mov
[esp
+ mci0100_ntia
], edx
16123 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
16124 mov eax
, [ebp
+ mci0100_pos
] ;
# eax = base of pos[]
16126 pfadd mm0
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
16127 movd mm3
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
16128 mov
[esp
+ mci0100_ii3
], ebx
16130 movq
[esp
+ mci0100_ix
], mm0
16131 movd
[esp
+ mci0100_iz
], mm1
16133 ;
# clear total potential
16135 movq
[esp
+ mci0100_vnbtot
], mm7
16137 mov eax
, [ebp
+ mci0100_jindex
]
16138 mov ecx
, [eax
] ;
# jindex[n]
16139 mov edx
, [eax
+ 4] ;
# jindex[n+1]
16140 add dword ptr
[ebp
+ mci0100_jindex
], 4
16141 sub edx
, ecx ;
# number of innerloop atoms
16143 mov esi
, [ebp
+ mci0100_pos
]
16144 mov eax
, [ebp
+ mci0100_jjnr
]
16147 mov
[esp
+ mci0100_innerjjnr
], eax ;
# pointer to jjnr[nj0]
16149 mov
[esp
+ mci0100_innerk
], edx ;
# number of innerloop atoms
16150 jge
.mci0100_unroll_loop
16151 jmp
.mci0100_finish_inner
16152 .mci0100_unroll_loop:
16153 ;
# paired innerloop starts here
16154 mov ecx
, [esp
+ mci0100_innerjjnr
] ;
# pointer to jjnr[k]
16156 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
16157 add dword ptr
[esp
+ mci0100_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
16158 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
16160 mov ecx
, [ebp
+ mci0100_type
]
16161 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
16162 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
16164 mov esi
, [ebp
+ mci0100_nbfp
] ;
# base of nbfp
16167 add edx
, [esp
+ mci0100_ntia
] ;
# tja = ntia + 2*type
16168 add ecx
, [esp
+ mci0100_ntia
]
16170 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
16171 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
16173 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
16174 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
16175 movq
[esp
+ mci0100_c6
], mm5
16176 movq
[esp
+ mci0100_c12
], mm6
16178 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
16179 lea ebx
, [ebx
+ ebx
*2]
16181 mov esi
, [ebp
+ mci0100_pos
]
16183 movq mm0
, [esp
+ mci0100_ix
]
16184 movd mm1
, [esp
+ mci0100_iz
]
16185 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
16186 movd mm5
, [esi
+ eax
*4 + 8]
16187 pfsubr mm4
,mm0 ;
# dr = ir - jr
16189 pfmul mm4
,mm4 ;
# square dx,dy,dz
16191 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
16192 pfacc mm4
, mm5 ;
# first rsq in lower mm4
16194 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
16195 movd mm7
, [esi
+ ebx
*4 + 8]
16197 pfsubr mm6
,mm0 ;
# dr = ir - jr
16199 pfmul mm6
,mm6 ;
# square dx,dy,dz
16201 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
16202 pfacc mm6
, mm7 ;
# second rsq in lower mm6
16204 pfrcp mm0
, mm4 ;
# lookup reciprocal seed
16208 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
16209 ;
# amd 3dnow N-R iteration to get full precision.
16212 ;
# mm4 now contains invsq,
16213 ;
# do potential and fscal
16217 pfmul mm4
, mm0 ;
# mm4=rinvsix
16219 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
16221 pfmul mm5
, [esp
+ mci0100_c12
]
16222 pfmul mm4
, [esp
+ mci0100_c6
]
16223 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
16226 pfadd mm6
, [esp
+ mci0100_vnbtot
] ;
# add the earlier value
16227 movq
[esp
+ mci0100_vnbtot
], mm6 ;
# store the sum
16229 ;
# should we do one more iteration?
16230 sub dword ptr
[esp
+ mci0100_innerk
], 2
16231 jl
.mci0100_finish_inner
16232 jmp
.mci0100_unroll_loop
16233 .mci0100_finish_inner:
16234 and dword ptr
[esp
+ mci0100_innerk
], 1
16235 jnz
.mci0100_single_inner
16236 jmp
.mci0100_updateouterdata
16237 .mci0100_single_inner:
16238 ;
# a single j particle iteration here - compare with the unrolled code for comments
16239 mov eax
, [esp
+ mci0100_innerjjnr
]
16240 mov eax
, [eax
] ;
# eax=jnr offset
16242 mov esi
, [ebp
+ mci0100_nbfp
]
16243 mov ecx
, [ebp
+ mci0100_type
]
16244 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
16246 add edx
, [esp
+ mci0100_ntia
] ;
# tja = ntia + 2*type
16247 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
16248 movq
[esp
+ mci0100_c6
], mm5
16249 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
16250 movq
[esp
+ mci0100_c12
], mm5
16252 mov esi
, [ebp
+ mci0100_pos
]
16253 lea eax
, [eax
+ eax
*2]
16255 movq mm0
, [esp
+ mci0100_ix
]
16256 movd mm1
, [esp
+ mci0100_iz
]
16257 movq mm4
, [esi
+ eax
*4]
16258 movd mm5
, [esi
+ eax
*4 + 8]
16264 pfacc mm4
, mm5 ;
# mm4=rsq
16268 pfrcpit2 mm4
,mm0 ;
# mm4=invsq
16269 ;
# calculate potentials and scalar force
16273 pfmul mm4
, mm0 ;
# mm4=rinvsix
16275 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
16277 pfmul mm5
, [esp
+ mci0100_c12
]
16278 pfmul mm4
, [esp
+ mci0100_c6
]
16279 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
16282 pfadd mm6
, [esp
+ mci0100_vnbtot
] ;
# add the earlier value
16283 movq
[esp
+ mci0100_vnbtot
], mm6 ;
# store the sum
16285 .mci0100_updateouterdata:
16286 mov edx
, [ebp
+ mci0100_gid
] ;
# get group index for this i particle
16288 add dword ptr
[ebp
+ mci0100_gid
], 4 ;
# advance pointer
16290 movq mm7
, [esp
+ mci0100_vnbtot
]
16291 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
16293 mov eax
, [ebp
+ mci0100_Vnb
]
16294 movd mm6
, [eax
+ edx
*4]
16296 movd
[eax
+ edx
*4], mm6 ;
# increment vnb[gid]
16299 mov ecx
, [ebp
+ mci0100_nri
]
16302 ;
# not last, iterate once more!
16303 mov
[ebp
+ mci0100_nri
], ecx
;# mcinl0110_3dnow: exported symbols, argument and local-variable offsets, and
;# the setup code that runs before the vdwc inner loop.
;# NOTE(review): the embedded source numbering has gaps in this extract (for
;# example 16360-16367 and 16393-16398), so some instructions are not shown;
;# the comments added here describe only what is visible.
16322 .globl mcinl0110_3dnow
16323 .globl _mcinl0110_3dnow
;# offsets of the arguments, which the code reads as [ebp + offset]
16326 .equiv mci0110_nri, 8
16327 .equiv mci0110_iinr, 12
16328 .equiv mci0110_jindex, 16
16329 .equiv mci0110_jjnr, 20
16330 .equiv mci0110_shift, 24
16331 .equiv mci0110_shiftvec, 28
16332 .equiv mci0110_gid, 32
16333 .equiv mci0110_pos, 36
16334 .equiv mci0110_type, 40
16335 .equiv mci0110_ntype, 44
16336 .equiv mci0110_nbfp, 48
16337 .equiv mci0110_Vnb, 52
16338 .equiv mci0110_nsatoms, 56
16339 ;
# stack offsets for local variables
16340 .equiv mci0110_is3, 0
16341 .equiv mci0110_ii3, 4
16342 .equiv mci0110_shX, 8
16343 .equiv mci0110_shY, 12
16344 .equiv mci0110_shZ, 16
16345 .equiv mci0110_ix, 20
16346 .equiv mci0110_iy, 24
16347 .equiv mci0110_iz, 28
16348 .equiv mci0110_vnbtot, 32
16349 .equiv mci0110_c6, 40
16350 .equiv mci0110_c12, 48
16351 .equiv mci0110_ntia, 56
16352 .equiv mci0110_innerjjnr0, 60
16353 .equiv mci0110_innerk0, 64
16354 .equiv mci0110_innerjjnr, 68
16355 .equiv mci0110_innerk, 72
16356 .equiv mci0110_nsvdwc, 76
16357 .equiv mci0110_nscoul, 80
16358 .equiv mci0110_nsvdw, 84
16359 .equiv mci0110_solnr, 88
;# the locals occupy offsets 0..91 (the last one, solnr, is the dword at 88)
16368 sub esp
, 92 ;
# local stack space
16371 ;
# assume we have at least one i particle - start directly
16373 mov eax
, [ebp
+ mci0110_shift
] ;
# eax = pointer into shift[]
16374 mov ebx
, [eax
] ;
# ebx=shift[n]
16375 add dword ptr
[ebp
+ mci0110_shift
], 4 ;
# advance pointer one step
16377 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
16378 mov
[esp
+ mci0110_is3
],ebx ;
# store is3
16380 mov eax
, [ebp
+ mci0110_shiftvec
] ;
# eax = base of shiftvec[]
16382 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
16383 movd mm1
, [eax
+ ebx
*4 + 8]
;# keep the shift vector in locals; the 8-byte store at shX also fills shY
16384 movq
[esp
+ mci0110_shX
], mm0
16385 movd
[esp
+ mci0110_shZ
], mm1
16387 mov ecx
, [ebp
+ mci0110_iinr
] ;
# ecx = pointer into iinr[]
16388 add dword ptr
[ebp
+ mci0110_iinr
], 4 ;
# advance pointer
16389 mov ebx
, [ecx
] ;
# ebx=ii
;# eax = current nsatoms pointer; the stored pointer then moves on by 12 bytes
16391 mov eax
, [ebp
+ mci0110_nsatoms
]
16392 add dword ptr
[ebp
+ mci0110_nsatoms
], 12
;# save the values held in edx, eax and ecx as nsvdwc, nscoul and nsvdw
;# NOTE(review): the instructions that load these registers are not visible here
16399 mov
[esp
+ mci0110_nsvdwc
], edx
16400 mov
[esp
+ mci0110_nscoul
], eax
16401 mov
[esp
+ mci0110_nsvdw
], ecx
;# initialise vnbtot from mm7 (presumably zeroed just before - not visible here)
16405 movq
[esp
+ mci0110_vnbtot
], mm7
;# solnr = ebx (loaded above as ii, assuming the unshown lines leave ebx alone)
16406 mov
[esp
+ mci0110_solnr
], ebx
16408 mov eax
, [ebp
+ mci0110_jindex
]
16409 mov ecx
, [eax
] ;
# jindex[n]
16410 mov edx
, [eax
+ 4] ;
# jindex[n+1]
16411 add dword ptr
[ebp
+ mci0110_jindex
], 4
16412 sub edx
, ecx ;
# number of innerloop atoms
16413 mov eax
, [ebp
+ mci0110_jjnr
]
;# innerjjnr0 and innerk0 keep the starting pointer and count, so that each
;# atom pass further down can reload innerjjnr and innerk from them
16416 mov
[esp
+ mci0110_innerjjnr0
], eax ;
# pointer to jjnr[nj0]
16418 mov
[esp
+ mci0110_innerk0
], edx ;
# number of innerloop atoms
16419 mov esi
, [ebp
+ mci0110_pos
]
;# ecx = nsvdwc; jnz goes to .mci0110_mno_vdwc, otherwise jump to .mci0110_testvdw
;# NOTE(review): the instruction that sets the flags for jnz is not visible here
16421 mov ecx
, [esp
+ mci0110_nsvdwc
]
16423 jnz
.mci0110_mno_vdwc
16424 jmp
.mci0110_testvdw
;# per-atom setup for the vdwc loop (the label line itself is not visible in
;# this extract): ebx = solnr, then solnr is incremented for the next pass
16426 mov ebx
, [esp
+ mci0110_solnr
]
16427 inc dword ptr
[esp
+ mci0110_solnr
]
;# ntia = type[ebx] * ntype
16429 mov edx
, [ebp
+ mci0110_type
]
16430 mov edx
, [edx
+ ebx
*4]
16431 imul edx
, [ebp
+ mci0110_ntype
]
16433 mov
[esp
+ mci0110_ntia
], edx
16435 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
16436 mov eax
, [ebp
+ mci0110_pos
] ;
# eax = base of pos[]
16437 mov
[esp
+ mci0110_ii3
], ebx
;# i coordinates = pos[ii3..ii3+2] + stored shift vector; the 8-byte store at
;# ix also fills iy
16439 movq mm0
, [eax
+ ebx
*4]
16440 movd mm1
, [eax
+ ebx
*4 + 8]
16441 pfadd mm0
, [esp
+ mci0110_shX
]
16442 pfadd mm1
, [esp
+ mci0110_shZ
]
16443 movq
[esp
+ mci0110_ix
], mm0
16444 movd
[esp
+ mci0110_iz
], mm1
;# restart the j list for this atom from the saved pointer and count
16446 mov ecx
, [esp
+ mci0110_innerjjnr0
]
16447 mov
[esp
+ mci0110_innerjjnr
], ecx
16448 mov edx
, [esp
+ mci0110_innerk0
]
16450 mov
[esp
+ mci0110_innerk
], edx ;
# number of innerloop atoms
;# jge enters the unrolled loop, otherwise go straight to the remainder check
;# NOTE(review): the flag-setting instruction (source line 16449) is not visible
16451 jge
.mci0110_unroll_vdwc_loop
16452 jmp
.mci0110_finish_vdwc_inner
;# Unrolled vdwc inner loop: each pass consumes two jjnr entries (the pointer
;# advances by 8 bytes and innerk drops by 2) and adds to vnbtot.
;# NOTE(review): eax is used below as the first j index, but no instruction
;# loading it is visible in this extract (source line 16456 is absent).
16453 .mci0110_unroll_vdwc_loop:
16454 ;
# paired innerloop starts here
16455 mov ecx
, [esp
+ mci0110_innerjjnr
] ;
# pointer to jjnr[k]
16457 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
16458 add dword ptr
[esp
+ mci0110_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
16459 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
;# look up the type entries for both j indices
16461 mov ecx
, [ebp
+ mci0110_type
]
16462 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
16463 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
16465 mov esi
, [ebp
+ mci0110_nbfp
] ;
# base of nbfp
16468 add edx
, [esp
+ mci0110_ntia
] ;
# tja = ntia + 2*type
16469 add ecx
, [esp
+ mci0110_ntia
]
16471 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
16472 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
16474 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
16475 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
;# park the packed c6 and c12 pairs in locals for the multiplies further down
16476 movq
[esp
+ mci0110_c6
], mm5
16477 movq
[esp
+ mci0110_c12
], mm6
16479 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
16480 lea ebx
, [ebx
+ ebx
*2]
16482 mov esi
, [ebp
+ mci0110_pos
]
;# reload the i coordinates stored at ix (8 bytes, covers iy) and iz
16484 movq mm0
, [esp
+ mci0110_ix
]
16485 movd mm1
, [esp
+ mci0110_iz
]
16486 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
16487 movd mm5
, [esi
+ eax
*4 + 8]
16488 pfsubr mm4
,mm0 ;
# dr = ir - jr
16490 pfmul mm4
,mm4 ;
# square dx,dy,dz
16492 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
16493 pfacc mm4
, mm5 ;
# first rsq in lower mm4
16495 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
16496 movd mm7
, [esi
+ ebx
*4 + 8]
16498 pfsubr mm6
,mm0 ;
# dr = ir - jr
16500 pfmul mm6
,mm6 ;
# square dx,dy,dz
16502 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
16503 pfacc mm6
, mm7 ;
# second rsq in lower mm6
16505 pfrcp mm0
, mm4 ;
# lookup reciprocal seed
16509 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs
16510 ;
# amd 3dnow N-R iteration to get full precision
16513 ;
# mm4 now contains invsq,
16514 ;
# do potential and fscal
16518 pfmul mm4
, mm0 ;
# mm4=rinvsix
16520 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
;# scale by the c12 and c6 pairs saved above
16522 pfmul mm5
, [esp
+ mci0110_c12
]
16523 pfmul mm4
, [esp
+ mci0110_c6
]
16524 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
16527 pfadd mm6
, [esp
+ mci0110_vnbtot
] ;
# add the earlier value
16528 movq
[esp
+ mci0110_vnbtot
], mm6 ;
# store the sum
16530 ;
# should we do one more iteration?
;# innerk -= 2: a negative result leaves the loop, otherwise run another pass
16531 sub dword ptr
[esp
+ mci0110_innerk
], 2
16532 jl
.mci0110_finish_vdwc_inner
16533 jmp
.mci0110_unroll_vdwc_loop
;# Remainder check after the unrolled vdwc loop: innerk is masked in memory
;# down to its lowest bit; if that bit is set, one single-j pass is run,
;# otherwise control goes to the vdwc outer update.
16534 .mci0110_finish_vdwc_inner:
16535 and dword ptr
[esp
+ mci0110_innerk
], 1
16536 jnz
.mci0110_single_vdwc_inner
16537 jmp
.mci0110_updateouterdata_vdwc
;# Single-j pass of the vdwc loop: handles the one jjnr entry left over after
;# the unrolled loop and adds its contribution to vnbtot.
16538 .mci0110_single_vdwc_inner:
16539 ;
# a single j particle iteration here - compare with the unrolled code for comments
;# eax = pointer into jjnr, then eax = the entry it points at
16540 mov eax
, [esp
+ mci0110_innerjjnr
]
16541 mov eax
, [eax
] ;
# eax=jnr offset
16543 mov esi
, [ebp
+ mci0110_nbfp
]
16544 mov ecx
, [ebp
+ mci0110_type
]
16545 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
16547 add edx
, [esp
+ mci0110_ntia
] ;
# tja = ntia + 2*type
16548 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
;# movd zeroed the upper half of mm5, so this 8-byte store writes c6 with a
;# zero upper dword (the same holds for the c12 store below)
16549 movq
[esp
+ mci0110_c6
], mm5
16550 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
16551 movq
[esp
+ mci0110_c12
], mm5
16553 mov esi
, [ebp
+ mci0110_pos
]
;# eax = 3*jnr, used as the dword index of the j coordinates in pos[]
16554 lea eax
, [eax
+ eax
*2]
16556 movq mm0
, [esp
+ mci0110_ix
]
16557 movd mm1
, [esp
+ mci0110_iz
]
16558 movq mm4
, [esi
+ eax
*4]
16559 movd mm5
, [esi
+ eax
*4 + 8]
16565 pfacc mm4
, mm5 ;
# mm4=rsq
16569 pfrcpit2 mm4
,mm0 ;
# mm4=invsq
16570 ;
# calculate potentials and scalar force
16574 pfmul mm4
, mm0 ;
# mm4=rinvsix
16576 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
16578 pfmul mm5
, [esp
+ mci0110_c12
]
16579 pfmul mm4
, [esp
+ mci0110_c6
]
16580 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
16583 pfadd mm6
, [esp
+ mci0110_vnbtot
] ;
# add the earlier value
16584 movq
[esp
+ mci0110_vnbtot
], mm6 ;
# store the sum
;# End of one vdwc atom pass: count nsvdwc down and either start the next
;# vdwc atom or move on to the vdw-only atoms. The code that follows the two
;# jumps belongs to labels whose own lines are not visible in this extract
;# (.mci0110_testvdw and .mci0110_mno_vdw are referenced but not shown).
16586 .mci0110_updateouterdata_vdwc:
16587 ;
# loop back to mno
;# nsvdwc -= 1: zero goes to .mci0110_testvdw, otherwise back to .mci0110_mno_vdwc
16588 dec dword ptr
[esp
+ mci0110_nsvdwc
]
16589 jz
.mci0110_testvdw
16590 jmp
.mci0110_mno_vdwc
;# solnr += nscoul
16592 mov ebx
, [esp
+ mci0110_nscoul
]
16593 add [esp
+ mci0110_solnr
], ebx
;# ecx = nsvdw; jnz goes to .mci0110_mno_vdw, otherwise jump to .mci0110_last_mno
;# NOTE(review): the instruction that sets the flags for jnz is not visible here
16595 mov ecx
, [esp
+ mci0110_nsvdw
]
16597 jnz
.mci0110_mno_vdw
16598 jmp
.mci0110_last_mno
;# per-atom setup for the vdw loop: ebx = solnr, then solnr is incremented
16600 mov ebx
, [esp
+ mci0110_solnr
]
16601 inc dword ptr
[esp
+ mci0110_solnr
]
;# ntia = type[ebx] * ntype
16603 mov edx
, [ebp
+ mci0110_type
]
16604 mov edx
, [edx
+ ebx
*4]
16605 imul edx
, [ebp
+ mci0110_ntype
]
16607 mov
[esp
+ mci0110_ntia
], edx
16609 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
16610 mov eax
, [ebp
+ mci0110_pos
] ;
# eax = base of pos[]
16611 mov
[esp
+ mci0110_ii3
], ebx
;# i coordinates = pos[ii3..ii3+2] + stored shift vector; the 8-byte store at
;# ix also fills iy
16613 movq mm0
, [eax
+ ebx
*4]
16614 movd mm1
, [eax
+ ebx
*4 + 8]
16615 pfadd mm0
, [esp
+ mci0110_shX
]
16616 pfadd mm1
, [esp
+ mci0110_shZ
]
16617 movq
[esp
+ mci0110_ix
], mm0
16618 movd
[esp
+ mci0110_iz
], mm1
;# restart the j list for this atom from the saved pointer and count
16620 mov ecx
, [esp
+ mci0110_innerjjnr0
]
16621 mov
[esp
+ mci0110_innerjjnr
], ecx
16622 mov edx
, [esp
+ mci0110_innerk0
]
16624 mov
[esp
+ mci0110_innerk
], edx ;
# number of innerloop atoms
;# jge enters the unrolled loop, otherwise go straight to the remainder check
;# NOTE(review): the flag-setting instruction (source line 16623) is not visible
16625 jge
.mci0110_unroll_vdw_loop
16626 jmp
.mci0110_finish_vdw_inner
;# Unrolled vdw inner loop: each pass consumes two jjnr entries (the pointer
;# advances by 8 bytes and innerk drops by 2) and adds to vnbtot.
;# NOTE(review): eax is used below as the first j index, but no instruction
;# loading it is visible in this extract (source line 16630 is absent).
16627 .mci0110_unroll_vdw_loop:
16628 ;
# paired innerloop starts here
16629 mov ecx
, [esp
+ mci0110_innerjjnr
] ;
# pointer to jjnr[k]
16631 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
16632 add dword ptr
[esp
+ mci0110_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
16633 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
;# look up the type entries for both j indices
16635 mov ecx
, [ebp
+ mci0110_type
]
16636 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
16637 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
16639 mov esi
, [ebp
+ mci0110_nbfp
] ;
# base of nbfp
16642 add edx
, [esp
+ mci0110_ntia
] ;
# tja = ntia + 2*type
16643 add ecx
, [esp
+ mci0110_ntia
]
16645 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
16646 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
16648 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
16649 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
;# park the packed c6 and c12 pairs in locals for the multiplies further down
16650 movq
[esp
+ mci0110_c6
], mm5
16651 movq
[esp
+ mci0110_c12
], mm6
16653 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
16654 lea ebx
, [ebx
+ ebx
*2]
16656 mov esi
, [ebp
+ mci0110_pos
]
;# reload the i coordinates stored at ix (8 bytes, covers iy) and iz
16658 movq mm0
, [esp
+ mci0110_ix
]
16659 movd mm1
, [esp
+ mci0110_iz
]
16660 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
16661 movd mm5
, [esi
+ eax
*4 + 8]
16662 pfsubr mm4
,mm0 ;
# dr = ir - jr
16664 pfmul mm4
,mm4 ;
# square dx,dy,dz
16666 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
16667 pfacc mm4
, mm5 ;
# first rsq in lower mm4
16669 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
16670 movd mm7
, [esi
+ ebx
*4 + 8]
16672 pfsubr mm6
,mm0 ;
# dr = ir - jr
16674 pfmul mm6
,mm6 ;
# square dx,dy,dz
16676 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
16677 pfacc mm6
, mm7 ;
# second rsq in lower mm6
16679 pfrcp mm0
, mm4 ;
# lookup reciprocal seed
16683 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs
16684 ;
# amd 3dnow N-R iteration to get full precision
16687 ;
# mm4 now contains invsq,
16688 ;
# do potential and fscal
16692 pfmul mm4
, mm0 ;
# mm4=rinvsix
16694 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
;# scale by the c12 and c6 pairs saved above
16696 pfmul mm5
, [esp
+ mci0110_c12
]
16697 pfmul mm4
, [esp
+ mci0110_c6
]
16698 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
16701 pfadd mm6
, [esp
+ mci0110_vnbtot
] ;
# add the earlier value
16702 movq
[esp
+ mci0110_vnbtot
], mm6 ;
# store the sum
16704 ;
# should we do one more iteration?
;# innerk -= 2: a negative result leaves the loop, otherwise run another pass
16705 sub dword ptr
[esp
+ mci0110_innerk
], 2
16706 jl
.mci0110_finish_vdw_inner
16707 jmp
.mci0110_unroll_vdw_loop
;# Remainder check after the unrolled vdw loop: innerk is masked in memory
;# down to its lowest bit; if that bit is set, one single-j pass is run,
;# otherwise control goes to the vdw outer update.
16708 .mci0110_finish_vdw_inner:
16709 and dword ptr
[esp
+ mci0110_innerk
], 1
16710 jnz
.mci0110_single_vdw_inner
16711 jmp
.mci0110_updateouterdata_vdw
;# Single-j pass of the vdw loop: handles the one jjnr entry left over after
;# the unrolled loop and adds its contribution to vnbtot.
16712 .mci0110_single_vdw_inner:
16713 ;
# a single j particle iteration here - compare with the unrolled code for comments
;# eax = pointer into jjnr, then eax = the entry it points at
16714 mov eax
, [esp
+ mci0110_innerjjnr
]
16715 mov eax
, [eax
] ;
# eax=jnr offset
16717 mov esi
, [ebp
+ mci0110_nbfp
]
16718 mov ecx
, [ebp
+ mci0110_type
]
16719 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
16721 add edx
, [esp
+ mci0110_ntia
] ;
# tja = ntia + 2*type
16722 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
;# movd zeroed the upper half of mm5, so this 8-byte store writes c6 with a
;# zero upper dword (the same holds for the c12 store below)
16723 movq
[esp
+ mci0110_c6
], mm5
16724 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
16725 movq
[esp
+ mci0110_c12
], mm5
16727 mov esi
, [ebp
+ mci0110_pos
]
;# eax = 3*jnr, used as the dword index of the j coordinates in pos[]
16728 lea eax
, [eax
+ eax
*2]
16730 movq mm0
, [esp
+ mci0110_ix
]
16731 movd mm1
, [esp
+ mci0110_iz
]
16732 movq mm4
, [esi
+ eax
*4]
16733 movd mm5
, [esi
+ eax
*4 + 8]
16739 pfacc mm4
, mm5 ;
# mm4=rsq
16743 pfrcpit2 mm4
,mm0 ;
# mm4=invsq
16744 ;
# calculate potentials and scalar force
16748 pfmul mm4
, mm0 ;
# mm4=rinvsix
16750 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
16752 pfmul mm5
, [esp
+ mci0110_c12
]
16753 pfmul mm4
, [esp
+ mci0110_c6
]
16754 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
16757 pfadd mm6
, [esp
+ mci0110_vnbtot
] ;
# add the earlier value
16758 movq
[esp
+ mci0110_vnbtot
], mm6 ;
# store the sum
;# End of one vdw atom pass: count nsvdw down and either start the next vdw
;# atom or continue with the per-i update of Vnb and the nri counter. The
;# code after the two jumps belongs to .mci0110_last_mno, whose label line is
;# not visible in this extract.
16760 .mci0110_updateouterdata_vdw:
16761 ;
# loop back to mno
;# nsvdw -= 1: zero goes to .mci0110_last_mno, otherwise back to .mci0110_mno_vdw
16762 dec dword ptr
[esp
+ mci0110_nsvdw
]
16763 jz
.mci0110_last_mno
16764 jmp
.mci0110_mno_vdw
16767 mov edx
, [ebp
+ mci0110_gid
] ;
# get group index for this i particle
16769 add dword ptr
[ebp
+ mci0110_gid
], 4 ;
# advance pointer
16771 movq mm7
, [esp
+ mci0110_vnbtot
]
16772 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
;# eax = base pointer taken from the Vnb argument; the dword at [eax + edx*4]
;# is loaded and stored back
16774 mov eax
, [ebp
+ mci0110_Vnb
]
16775 movd mm6
, [eax
+ edx
*4]
16777 movd
[eax
+ edx
*4], mm6 ;
# increment vnb[gid]
;# ecx = nri argument, written back below
;# NOTE(review): the instructions between the load and the store (source
;# lines 16780-16781) are not visible in this extract
16779 mov ecx
, [ebp
+ mci0110_nri
]
16782 ;
# not last, iterate once more!
16783 mov
[ebp
+ mci0110_nri
], ecx
;# mcinl0300_3dnow: exported symbols, argument and local-variable offsets, and
;# the setup code that runs before the inner loop.
;# NOTE(review): the embedded source numbering has gaps in this extract (for
;# example 16831-16838), so some instructions are not shown; the comments
;# added here describe only what is visible.
16799 .globl mcinl0300_3dnow
16800 .globl _mcinl0300_3dnow
;# offsets of the arguments, which the code reads as [ebp + offset]
16803 .equiv mci0300_nri, 8
16804 .equiv mci0300_iinr, 12
16805 .equiv mci0300_jindex, 16
16806 .equiv mci0300_jjnr, 20
16807 .equiv mci0300_shift, 24
16808 .equiv mci0300_shiftvec, 28
16809 .equiv mci0300_gid, 32
16810 .equiv mci0300_pos, 36
16811 .equiv mci0300_type, 40
16812 .equiv mci0300_ntype, 44
16813 .equiv mci0300_nbfp, 48
16814 .equiv mci0300_Vnb, 52
16815 .equiv mci0300_tabscale, 56
16816 .equiv mci0300_VFtab, 60
16817 ;
# stack offsets for local variables
16818 .equiv mci0300_is3, 0
16819 .equiv mci0300_ii3, 4
16820 .equiv mci0300_ix, 8
16821 .equiv mci0300_iy, 12
16822 .equiv mci0300_iz, 16
16823 .equiv mci0300_vnbtot, 20
16824 .equiv mci0300_c6, 28
16825 .equiv mci0300_c12, 36
16826 .equiv mci0300_n1, 44
16827 .equiv mci0300_tsc, 52
16828 .equiv mci0300_ntia, 60
16829 .equiv mci0300_innerjjnr, 64
16830 .equiv mci0300_innerk, 68
;# the locals occupy offsets 0..71 (the last one, innerk, is the dword at 68)
16839 sub esp
, 72 ;
# local stack space
16841 ;
# move data to local stack
;# the 4-byte tabscale argument goes into the 8-byte local tsc
16842 movd mm3
, [ebp
+ mci0300_tabscale
]
16844 movq
[esp
+ mci0300_tsc
], mm3
16845 ;
# assume we have at least one i particle - start directly
16847 mov eax
, [ebp
+ mci0300_shift
] ;
# eax = pointer into shift[]
16848 mov ebx
, [eax
] ;
# ebx=shift[n]
16849 add dword ptr
[ebp
+ mci0300_shift
], 4 ;
# advance pointer one step
16851 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
16852 mov
[esp
+ mci0300_is3
],ebx ;
# store is3
16854 mov eax
, [ebp
+ mci0300_shiftvec
] ;
# eax = base of shiftvec[]
16856 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
16857 movd mm1
, [eax
+ ebx
*4 + 8]
16859 mov ecx
, [ebp
+ mci0300_iinr
] ;
# ecx = pointer into iinr[]
16860 add dword ptr
[ebp
+ mci0300_iinr
], 4 ;
# advance pointer
16861 mov ebx
, [ecx
] ;
# ebx=ii
;# ntia = type[ii] * ntype
16863 mov edx
, [ebp
+ mci0300_type
]
16864 mov edx
, [edx
+ ebx
*4]
16865 imul edx
, [ebp
+ mci0300_ntype
]
16867 mov
[esp
+ mci0300_ntia
], edx
16869 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
16870 mov eax
, [ebp
+ mci0300_pos
] ;
# eax = base of pos[]
16872 pfadd mm0
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
16873 movd mm3
, [eax
+ ebx
*4 + 8] ;
# cannot use direct memory add for 4 bytes (iz)
16874 mov
[esp
+ mci0300_ii3
], ebx
;# the 8-byte store at ix also fills iy; iz is stored from mm1
;# NOTE(review): the instruction combining mm3 into mm1 is not visible here
16876 movq
[esp
+ mci0300_ix
], mm0
16877 movd
[esp
+ mci0300_iz
], mm1
16879 ;
# clear total potential
16881 movq
[esp
+ mci0300_vnbtot
], mm7
16883 mov eax
, [ebp
+ mci0300_jindex
]
16884 mov ecx
, [eax
] ;
# jindex[n]
16885 mov edx
, [eax
+ 4] ;
# jindex[n+1]
16886 add dword ptr
[ebp
+ mci0300_jindex
], 4
16887 sub edx
, ecx ;
# number of innerloop atoms
16889 mov esi
, [ebp
+ mci0300_pos
]
16890 mov eax
, [ebp
+ mci0300_jjnr
]
16893 mov
[esp
+ mci0300_innerjjnr
], eax ;
# pointer to jjnr[nj0]
16895 mov
[esp
+ mci0300_innerk
], edx ;
# number of innerloop atoms
;# jge enters the unrolled loop, otherwise go straight to the remainder check
;# NOTE(review): the flag-setting instruction (source line 16894) is not visible
16896 jge
.mci0300_unroll_loop
16897 jmp
.mci0300_finish_inner
;# Unrolled table-based inner loop: each pass consumes two jjnr entries (the
;# pointer advances by 8 bytes and innerk drops by 2). Two groups of four
;# table values per j index are read from VFtab, combined with eps and eps2,
;# scaled by c6 and c12 respectively, and added to vnbtot.
;# NOTE(review): eax is used below as the first j index, but no instruction
;# loading it is visible in this extract (source line 16901 is absent).
16898 .mci0300_unroll_loop:
16899 ;
# paired innerloop starts here
16900 mov ecx
, [esp
+ mci0300_innerjjnr
] ;
# pointer to jjnr[k]
16902 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
16903 add dword ptr
[esp
+ mci0300_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
16904 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
;# look up the type entries for both j indices
16906 mov ecx
, [ebp
+ mci0300_type
]
16907 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
16908 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
16910 mov esi
, [ebp
+ mci0300_nbfp
] ;
# base of nbfp
16913 add edx
, [esp
+ mci0300_ntia
] ;
# tja = ntia + 2*type
16914 add ecx
, [esp
+ mci0300_ntia
]
16916 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
16917 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
16919 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
16920 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
;# park the packed c6 and c12 pairs in locals for the multiplies further down
16921 movq
[esp
+ mci0300_c6
], mm5
16922 movq
[esp
+ mci0300_c12
], mm6
16924 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
16925 lea ebx
, [ebx
+ ebx
*2]
16927 mov esi
, [ebp
+ mci0300_pos
]
;# reload the i coordinates stored at ix (8 bytes, covers iy) and iz
16929 movq mm0
, [esp
+ mci0300_ix
]
16930 movd mm1
, [esp
+ mci0300_iz
]
16931 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
16932 movd mm5
, [esi
+ eax
*4 + 8]
16933 pfsubr mm4
,mm0 ;
# dr = ir - jr
16935 pfmul mm4
,mm4 ;
# square dx,dy,dz
16937 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
16938 pfacc mm4
, mm5 ;
# first rsq in lower mm4
16940 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
16941 movd mm7
, [esi
+ ebx
*4 + 8]
16943 pfsubr mm6
,mm0 ;
# dr = ir - jr
16945 pfmul mm6
,mm6 ;
# square dx,dy,dz
16947 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
16948 pfacc mm6
, mm7 ;
# second rsq in lower mm6
16950 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
16955 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs
16956 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision
16962 ;
# mm0 is invsqrt, and mm1 r
16963 ;
# do potential and fscal
16964 pfmul mm1
, [esp
+ mci0300_tsc
] ;
# mm1=rt
;# store mm4 (8 bytes) at n1; its two dwords are read back into ecx below and
;# used as indices into VFtab
16966 movq
[esp
+ mci0300_n1
], mm4
16968 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
16971 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
16973 mov edx
, [ebp
+ mci0300_VFtab
]
16974 ;
# dispersion table
16975 mov ecx
, [esp
+ mci0300_n1
]
16977 ;
# load all the table values we need
;# four consecutive dwords for the first index go into the low halves ...
16978 movd mm4
, [edx
+ ecx
*4]
16979 movd mm5
, [edx
+ ecx
*4 + 4]
16980 movd mm6
, [edx
+ ecx
*4 + 8]
16981 movd mm7
, [edx
+ ecx
*4 + 12]
16982 mov ecx
, [esp
+ mci0300_n1
+ 4]
;# ... and the matching dwords for the second index into the high halves
16984 punpckldq mm4
, [edx
+ ecx
*4]
16985 punpckldq mm5
, [edx
+ ecx
*4 + 4]
16986 punpckldq mm6
, [edx
+ ecx
*4 + 8]
16987 punpckldq mm7
, [edx
+ ecx
*4 + 12]
16988 pfmul mm6
, mm1 ;
# mm6 = Geps
16989 pfmul mm7
, mm2 ;
# mm7 = Heps2
16991 pfadd mm5
, mm7 ;
# mm5 = Fp
16992 pfmul mm5
, mm1 ;
# mm5=eps*Fp
16993 pfadd mm5
, mm4 ;
# mm5= VV
16995 movq mm4
, [esp
+ mci0300_c6
]
16996 pfmul mm5
, mm4 ;
# vnb6
16997 ;
# update vnbtot to release mm5!
16998 pfadd mm5
, [esp
+ mci0300_vnbtot
] ;
# add the earlier value
16999 movq
[esp
+ mci0300_vnbtot
], mm5 ;
# store the sum
;# second group of four table values, 16 bytes further on for each index;
;# this result is scaled by c12 below
17002 mov ecx
, [esp
+ mci0300_n1
]
17004 ;
# load all the table values we need
17005 movd mm4
, [edx
+ ecx
*4 + 16]
17006 movd mm5
, [edx
+ ecx
*4 + 20]
17007 movd mm6
, [edx
+ ecx
*4 + 24]
17008 movd mm7
, [edx
+ ecx
*4 + 28]
17009 mov ecx
, [esp
+ mci0300_n1
+ 4]
17011 punpckldq mm4
, [edx
+ ecx
*4 + 16]
17012 punpckldq mm5
, [edx
+ ecx
*4 + 20]
17013 punpckldq mm6
, [edx
+ ecx
*4 + 24]
17014 punpckldq mm7
, [edx
+ ecx
*4 + 28]
17016 pfmul mm6
, mm1 ;
# mm6 = Geps
17017 pfmul mm7
, mm2 ;
# mm7 = Heps2
17019 pfadd mm5
, mm7 ;
# mm5 = Fp
17020 pfmul mm5
, mm1 ;
# mm5=eps*Fp
17021 pfadd mm5
, mm4 ;
# mm5= VV
17023 movq mm6
, [esp
+ mci0300_c12
]
17024 pfmul mm5
, mm6 ;
# vnb12
17026 pfadd mm5
, [esp
+ mci0300_vnbtot
] ;
# add the earlier value
17027 movq
[esp
+ mci0300_vnbtot
], mm5 ;
# store the sum
17029 ;
# should we do one more iteration?
;# innerk -= 2: a negative result leaves the loop, otherwise run another pass
17030 sub dword ptr
[esp
+ mci0300_innerk
], 2
17031 jl
.mci0300_finish_inner
17032 jmp
.mci0300_unroll_loop
;# Remainder check after the unrolled loop: innerk is masked in memory down
;# to its lowest bit; if that bit is set, one single-j pass is run, otherwise
;# control goes to the outer update.
17033 .mci0300_finish_inner:
17034 and dword ptr
[esp
+ mci0300_innerk
], 1
17035 jnz
.mci0300_single_inner
17036 jmp
.mci0300_updateouterdata
;# Single-j pass of the table-based loop: handles the one jjnr entry left
;# over after the unrolled loop. Only the low halves of the MMX registers
;# carry table data here, since every table value is loaded with movd.
17037 .mci0300_single_inner:
17038 ;
# a single j particle iteration here - compare with the unrolled code for comments
;# eax = pointer into jjnr, then eax = the entry it points at
17039 mov eax
, [esp
+ mci0300_innerjjnr
]
17040 mov eax
, [eax
] ;
# eax=jnr offset
17042 mov esi
, [ebp
+ mci0300_nbfp
]
17043 mov ecx
, [ebp
+ mci0300_type
]
17044 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
17046 add edx
, [esp
+ mci0300_ntia
] ;
# tja = ntia + 2*type
17047 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
;# movd zeroed the upper half of mm5, so this 8-byte store writes c6 with a
;# zero upper dword (the same holds for the c12 store below)
17048 movq
[esp
+ mci0300_c6
], mm5
17049 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
17050 movq
[esp
+ mci0300_c12
], mm5
17052 mov esi
, [ebp
+ mci0300_pos
]
;# eax = 3*jnr, used as the dword index of the j coordinates in pos[]
17053 lea eax
, [eax
+ eax
*2]
17055 movq mm0
, [esp
+ mci0300_ix
]
17056 movd mm1
, [esp
+ mci0300_iz
]
17057 movq mm4
, [esi
+ eax
*4]
17058 movd mm5
, [esi
+ eax
*4 + 8]
17064 pfacc mm4
, mm5 ;
# mm4=rsq
17070 pfrcpit2 mm0
,mm2 ;
# mm0=invsqrt
17073 ;
# mm0 is invsqrt, and mm1 r
17075 ;
# calculate potentials and scalar force
17076 pfmul mm1
, [esp
+ mci0300_tsc
] ;
# mm1=rt
;# store the low dword of mm4 at n1; it is read back into ecx below and used
;# as the index into VFtab
17078 movd
[esp
+ mci0300_n1
], mm4
17080 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
17083 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
17085 mov edx
, [ebp
+ mci0300_VFtab
]
17086 mov ecx
, [esp
+ mci0300_n1
]
17088 ;
# dispersion table
17089 ;
# load all the table values we need
17090 movd mm4
, [edx
+ ecx
*4]
17091 movd mm5
, [edx
+ ecx
*4 + 4]
17092 movd mm6
, [edx
+ ecx
*4 + 8]
17093 movd mm7
, [edx
+ ecx
*4 + 12]
17094 pfmul mm6
, mm1 ;
# mm6 = Geps
17095 pfmul mm7
, mm2 ;
# mm7 = Heps2
17097 pfadd mm5
, mm7 ;
# mm5 = Fp
17098 pfmul mm5
, mm1 ;
# mm5=eps*Fp
17099 pfadd mm5
, mm4 ;
# mm5= VV
17101 movq mm4
, [esp
+ mci0300_c6
]
17102 pfmul mm5
, mm4 ;
# vnb6
17103 ;
# update vnbtot to release mm5!
17104 pfadd mm5
, [esp
+ mci0300_vnbtot
] ;
# add the earlier value
17105 movq
[esp
+ mci0300_vnbtot
], mm5 ;
# store the sum
;# second group of four table values, 16 bytes further on; this result is
;# scaled by c12 below
17108 ;
# load all the table values we need
17109 movd mm4
, [edx
+ ecx
*4 + 16]
17110 movd mm5
, [edx
+ ecx
*4 + 20]
17111 movd mm6
, [edx
+ ecx
*4 + 24]
17112 movd mm7
, [edx
+ ecx
*4 + 28]
17114 pfmul mm6
, mm1 ;
# mm6 = Geps
17115 pfmul mm7
, mm2 ;
# mm7 = Heps2
17117 pfadd mm5
, mm7 ;
# mm5 = Fp
17118 pfmul mm5
, mm1 ;
# mm5=eps*Fp
17119 pfadd mm5
, mm4 ;
# mm5= VV
17121 movq mm6
, [esp
+ mci0300_c12
]
17122 pfmul mm5
, mm6 ;
# vnb12
17124 pfadd mm5
, [esp
+ mci0300_vnbtot
] ;
# add the earlier value
17125 movq
[esp
+ mci0300_vnbtot
], mm5 ;
# store the sum
;# Per-i update after the inner loops: advance the gid pointer, fold the two
;# halves of vnbtot together, touch the Vnb entry and write back nri.
;# NOTE(review): the embedded source numbering has gaps here (17129, 17137,
;# 17142-17143), so some of the instructions are not visible in this extract.
17127 .mci0300_updateouterdata:
17128 mov edx
, [ebp
+ mci0300_gid
] ;
# get group index for this i particle
17130 add dword ptr
[ebp
+ mci0300_gid
], 4 ;
# advance pointer
17132 movq mm7
, [esp
+ mci0300_vnbtot
]
17133 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
;# eax = base pointer taken from the Vnb argument; the dword at [eax + edx*4]
;# is loaded and stored back
17135 mov eax
, [ebp
+ mci0300_Vnb
]
17136 movd mm6
, [eax
+ edx
*4]
17138 movd
[eax
+ edx
*4], mm6 ;
# increment vnb[gid]
;# ecx = nri argument, written back below
17141 mov ecx
, [ebp
+ mci0300_nri
]
17144 ;
# not last, iterate once more!
17145 mov
[ebp
+ mci0300_nri
], ecx
17162 .globl mcinl0310_3dnow
17163 .globl _mcinl0310_3dnow
17166 .equiv mci0310_nri, 8
17167 .equiv mci0310_iinr, 12
17168 .equiv mci0310_jindex, 16
17169 .equiv mci0310_jjnr, 20
17170 .equiv mci0310_shift, 24
17171 .equiv mci0310_shiftvec, 28
17172 .equiv mci0310_gid, 32
17173 .equiv mci0310_pos, 36
17174 .equiv mci0310_type, 40
17175 .equiv mci0310_ntype, 44
17176 .equiv mci0310_nbfp, 48
17177 .equiv mci0310_Vnb, 52
17178 .equiv mci0310_tabscale, 56
17179 .equiv mci0310_VFtab, 60
17180 .equiv mci0310_nsatoms, 64
17181 ;
# stack offsets for local variables
17182 .equiv mci0310_is3, 0
17183 .equiv mci0310_ii3, 4
17184 .equiv mci0310_shX, 8
17185 .equiv mci0310_shY, 12
17186 .equiv mci0310_shZ, 16
17187 .equiv mci0310_ix, 20
17188 .equiv mci0310_iy, 24
17189 .equiv mci0310_iz, 28
17190 .equiv mci0310_vnbtot, 32
17191 .equiv mci0310_c6, 40
17192 .equiv mci0310_c12, 48
17193 .equiv mci0310_n1, 56
17194 .equiv mci0310_tsc, 64
17195 .equiv mci0310_ntia, 72
17196 .equiv mci0310_innerjjnr0, 76
17197 .equiv mci0310_innerk0, 80
17198 .equiv mci0310_innerjjnr, 84
17199 .equiv mci0310_innerk, 88
17200 .equiv mci0310_nsvdwc, 92
17201 .equiv mci0310_nscoul, 96
17202 .equiv mci0310_nsvdw, 100
17203 .equiv mci0310_solnr, 104
17212 sub esp
, 108 ;
# local stack space
17214 movd mm3
, [ebp
+ mci0310_tabscale
]
17216 movq
[esp
+ mci0310_tsc
], mm3
17218 ;
# assume we have at least one i particle - start directly
17220 mov eax
, [ebp
+ mci0310_shift
] ;
# eax = pointer into shift[]
17221 mov ebx
, [eax
] ;
# ebx=shift[n]
17222 add dword ptr
[ebp
+ mci0310_shift
], 4 ;
# advance pointer one step
17224 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
17225 mov
[esp
+ mci0310_is3
],ebx ;
# store is3
17227 mov eax
, [ebp
+ mci0310_shiftvec
] ;
# eax = base of shiftvec[]
17229 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
17230 movd mm1
, [eax
+ ebx
*4 + 8]
17231 movq
[esp
+ mci0310_shX
], mm0
17232 movd
[esp
+ mci0310_shZ
], mm1
17234 mov ecx
, [ebp
+ mci0310_iinr
] ;
# ecx = pointer into iinr[]
17235 add dword ptr
[ebp
+ mci0310_iinr
], 4 ;
# advance pointer
17236 mov ebx
, [ecx
] ;
# ebx=ii
17238 mov eax
, [ebp
+ mci0310_nsatoms
]
17239 add dword ptr
[ebp
+ mci0310_nsatoms
], 12
17246 mov
[esp
+ mci0310_nsvdwc
], edx
17247 mov
[esp
+ mci0310_nscoul
], eax
17248 mov
[esp
+ mci0310_nsvdw
], ecx
17252 movq
[esp
+ mci0310_vnbtot
], mm7
17253 mov
[esp
+ mci0310_solnr
], ebx
17255 mov eax
, [ebp
+ mci0310_jindex
]
17256 mov ecx
, [eax
] ;
# jindex[n]
17257 mov edx
, [eax
+ 4] ;
# jindex[n+1]
17258 add dword ptr
[ebp
+ mci0310_jindex
], 4
17259 sub edx
, ecx ;
# number of innerloop atoms
17260 mov eax
, [ebp
+ mci0310_jjnr
]
17263 mov
[esp
+ mci0310_innerjjnr0
], eax ;
# pointer to jjnr[nj0]
17265 mov
[esp
+ mci0310_innerk0
], edx ;
# number of innerloop atoms
17266 mov esi
, [ebp
+ mci0310_pos
]
17268 mov ecx
, [esp
+ mci0310_nsvdwc
]
17270 jnz
.mci0310_mno_vdwc
17271 jmp
.mci0310_testvdw
17273 mov ebx
, [esp
+ mci0310_solnr
]
17274 inc dword ptr
[esp
+ mci0310_solnr
]
17276 mov edx
, [ebp
+ mci0310_type
]
17277 mov edx
, [edx
+ ebx
*4]
17278 imul edx
, [ebp
+ mci0310_ntype
]
17280 mov
[esp
+ mci0310_ntia
], edx
17282 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
17283 mov eax
, [ebp
+ mci0310_pos
] ;
# eax = base of pos[]
17284 mov
[esp
+ mci0310_ii3
], ebx
17286 movq mm0
, [eax
+ ebx
*4]
17287 movd mm1
, [eax
+ ebx
*4 + 8]
17288 pfadd mm0
, [esp
+ mci0310_shX
]
17289 pfadd mm1
, [esp
+ mci0310_shZ
]
17290 movq
[esp
+ mci0310_ix
], mm0
17291 movd
[esp
+ mci0310_iz
], mm1
17293 mov ecx
, [esp
+ mci0310_innerjjnr0
]
17294 mov
[esp
+ mci0310_innerjjnr
], ecx
17295 mov edx
, [esp
+ mci0310_innerk0
]
17297 mov
[esp
+ mci0310_innerk
], edx ;
# number of innerloop atoms
17298 jge
.mci0310_unroll_vdwc_loop
17299 jmp
.mci0310_finish_vdwc_inner
17300 .mci0310_unroll_vdwc_loop:
17301 ;
# paired innerloop starts here
17302 mov ecx
, [esp
+ mci0310_innerjjnr
] ;
# pointer to jjnr[k]
17304 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
17305 add dword ptr
[esp
+ mci0310_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
17306 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
17308 mov ecx
, [ebp
+ mci0310_type
]
17309 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
17310 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
17312 mov esi
, [ebp
+ mci0310_nbfp
] ;
# base of nbfp
17315 add edx
, [esp
+ mci0310_ntia
] ;
# tja = ntia + 2*type
17316 add ecx
, [esp
+ mci0310_ntia
]
17318 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
17319 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
17321 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
17322 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
17323 movq
[esp
+ mci0310_c6
], mm5
17324 movq
[esp
+ mci0310_c12
], mm6
17326 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
17327 lea ebx
, [ebx
+ ebx
*2]
17329 mov esi
, [ebp
+ mci0310_pos
]
17331 movq mm0
, [esp
+ mci0310_ix
]
17332 movd mm1
, [esp
+ mci0310_iz
]
17333 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
17334 movd mm5
, [esi
+ eax
*4 + 8]
17335 pfsubr mm4
,mm0 ;
# dr = ir - jr
17337 pfmul mm4
,mm4 ;
# square dx,dy,dz
17339 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
17340 pfacc mm4
, mm5 ;
# first rsq in lower mm4
17342 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
17343 movd mm7
, [esi
+ ebx
*4 + 8]
17345 pfsubr mm6
,mm0 ;
# dr = ir - jr
17347 pfmul mm6
,mm6 ;
# square dx,dy,dz
17349 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
17350 pfacc mm6
, mm7 ;
# second rsq in lower mm6
17352 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
17357 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
17358 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision
17364 ;
# mm0 is invsqrt, and mm1 r
17365 ;
# do potential and fscal
17366 pfmul mm1
, [esp
+ mci0310_tsc
] ;
# mm1=rt
17368 movq
[esp
+ mci0310_n1
], mm4
17370 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
17373 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
17375 mov edx
, [ebp
+ mci0310_VFtab
]
17376 ;
# dispersion table
17377 mov ecx
, [esp
+ mci0310_n1
]
17379 ;
# load all the table values we need
17380 movd mm4
, [edx
+ ecx
*4]
17381 movd mm5
, [edx
+ ecx
*4 + 4]
17382 movd mm6
, [edx
+ ecx
*4 + 8]
17383 movd mm7
, [edx
+ ecx
*4 + 12]
17384 mov ecx
, [esp
+ mci0310_n1
+ 4]
17386 punpckldq mm4
, [edx
+ ecx
*4]
17387 punpckldq mm5
, [edx
+ ecx
*4 + 4]
17388 punpckldq mm6
, [edx
+ ecx
*4 + 8]
17389 punpckldq mm7
, [edx
+ ecx
*4 + 12]
17390 pfmul mm6
, mm1 ;
# mm6 = Geps
17391 pfmul mm7
, mm2 ;
# mm7 = Heps2
17393 pfadd mm5
, mm7 ;
# mm5 = Fp
17394 pfmul mm5
, mm1 ;
# mm5=eps*Fp
17395 pfadd mm5
, mm4 ;
# mm5= VV
17397 movq mm4
, [esp
+ mci0310_c6
]
17398 pfmul mm5
, mm4 ;
# vnb6
17399 ;
# update vnbtot to release mm5!
17400 pfadd mm5
, [esp
+ mci0310_vnbtot
] ;
# add the earlier value
17401 movq
[esp
+ mci0310_vnbtot
], mm5 ;
# store the sum
17404 mov ecx
, [esp
+ mci0310_n1
]
17406 ;
# load all the table values we need
17407 movd mm4
, [edx
+ ecx
*4 + 16]
17408 movd mm5
, [edx
+ ecx
*4 + 20]
17409 movd mm6
, [edx
+ ecx
*4 + 24]
17410 movd mm7
, [edx
+ ecx
*4 + 28]
17411 mov ecx
, [esp
+ mci0310_n1
+ 4]
17413 punpckldq mm4
, [edx
+ ecx
*4 + 16]
17414 punpckldq mm5
, [edx
+ ecx
*4 + 20]
17415 punpckldq mm6
, [edx
+ ecx
*4 + 24]
17416 punpckldq mm7
, [edx
+ ecx
*4 + 28]
17418 pfmul mm6
, mm1 ;
# mm6 = Geps
17419 pfmul mm7
, mm2 ;
# mm7 = Heps2
17421 pfadd mm5
, mm7 ;
# mm5 = Fp
17422 pfmul mm5
, mm1 ;
# mm5=eps*Fp
17423 pfadd mm5
, mm4 ;
# mm5= VV
17425 movq mm6
, [esp
+ mci0310_c12
]
17426 pfmul mm5
, mm6 ;
# vnb12
17427 pfadd mm5
, [esp
+ mci0310_vnbtot
] ;
# add the earlier value
17428 movq
[esp
+ mci0310_vnbtot
], mm5 ;
# store the sum
17430 ;
# should we do one more iteration?
17431 sub dword ptr
[esp
+ mci0310_innerk
], 2
17432 jl
.mci0310_finish_vdwc_inner
17433 jmp
.mci0310_unroll_vdwc_loop
17434 .mci0310_finish_vdwc_inner:
17435 and dword ptr
[esp
+ mci0310_innerk
], 1
17436 jnz
.mci0310_single_vdwc_inner
17437 jmp
.mci0310_updateouterdata_vdwc
17438 .mci0310_single_vdwc_inner:
17439 ;
# a single j particle iteration here - compare with the unrolled code for comments
17440 mov eax
, [esp
+ mci0310_innerjjnr
]
17441 mov eax
, [eax
] ;
# eax=jnr offset
17443 mov esi
, [ebp
+ mci0310_nbfp
]
17444 mov ecx
, [ebp
+ mci0310_type
]
17445 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
17447 add edx
, [esp
+ mci0310_ntia
] ;
# tja = ntia + 2*type
17448 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
17449 movq
[esp
+ mci0310_c6
], mm5
17450 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
17451 movq
[esp
+ mci0310_c12
], mm5
17453 mov esi
, [ebp
+ mci0310_pos
]
17454 lea eax
, [eax
+ eax
*2]
17456 movq mm0
, [esp
+ mci0310_ix
]
17457 movd mm1
, [esp
+ mci0310_iz
]
17458 movq mm4
, [esi
+ eax
*4]
17459 movd mm5
, [esi
+ eax
*4 + 8]
17465 pfacc mm4
, mm5 ;
# mm0=rsq
17471 pfrcpit2 mm0
,mm2 ;
# mm1=invsqrt
17474 ;
# mm0 is invsqrt, and mm1 r
17476 ;
# calculate potentials and scalar force
17477 pfmul mm1
, [esp
+ mci0310_tsc
] ;
# mm1=rt
17479 movd
[esp
+ mci0310_n1
], mm4
17481 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
17484 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
17486 mov edx
, [ebp
+ mci0310_VFtab
]
17487 mov ecx
, [esp
+ mci0310_n1
]
17489 ;
# dispersion table
17490 ;
# load all the table values we need
17491 movd mm4
, [edx
+ ecx
*4]
17492 movd mm5
, [edx
+ ecx
*4 + 4]
17493 movd mm6
, [edx
+ ecx
*4 + 8]
17494 movd mm7
, [edx
+ ecx
*4 + 12]
17495 pfmul mm6
, mm1 ;
# mm6 = Geps
17496 pfmul mm7
, mm2 ;
# mm7 = Heps2
17498 pfadd mm5
, mm7 ;
# mm5 = Fp
17499 pfmul mm5
, mm1 ;
# mm5=eps*Fp
17500 pfadd mm5
, mm4 ;
# mm5= VV
17502 movq mm4
, [esp
+ mci0310_c6
]
17503 pfmul mm5
, mm4 ;
# vnb6
17504 ;
# update vnbtot to release mm5!
17505 pfadd mm5
, [esp
+ mci0310_vnbtot
] ;
# add the earlier value
17506 movq
[esp
+ mci0310_vnbtot
], mm5 ;
# store the sum
17509 ;
# load all the table values we need
17510 movd mm4
, [edx
+ ecx
*4 + 16]
17511 movd mm5
, [edx
+ ecx
*4 + 20]
17512 movd mm6
, [edx
+ ecx
*4 + 24]
17513 movd mm7
, [edx
+ ecx
*4 + 28]
17515 pfmul mm6
, mm1 ;
# mm6 = Geps
17516 pfmul mm7
, mm2 ;
# mm7 = Heps2
17518 pfadd mm5
, mm7 ;
# mm5 = Fp
17519 pfmul mm5
, mm1 ;
# mm5=eps*Fp
17520 pfadd mm5
, mm4 ;
# mm5= VV
17522 movq mm6
, [esp
+ mci0310_c12
]
17523 pfmul mm5
, mm6 ;
# vnb12
17525 pfadd mm5
, [esp
+ mci0310_vnbtot
] ;
# add the earlier value
17526 movq
[esp
+ mci0310_vnbtot
], mm5 ;
# store the sum
17528 .mci0310_updateouterdata_vdwc:
17529 ;
# loop back to mno
17530 dec dword ptr
[esp
+ mci0310_nsvdwc
]
17531 jz
.mci0310_testvdw
17532 jmp
.mci0310_mno_vdwc
17534 mov ebx
, [esp
+ mci0310_nscoul
]
17535 add [esp
+ mci0310_solnr
], ebx
17537 mov ecx
, [esp
+ mci0310_nsvdw
]
17539 jnz
.mci0310_mno_vdw
17540 jmp
.mci0310_last_mno
17542 mov ebx
, [esp
+ mci0310_solnr
]
17543 inc dword ptr
[esp
+ mci0310_solnr
]
17545 mov edx
, [ebp
+ mci0310_type
]
17546 mov edx
, [edx
+ ebx
*4]
17547 imul edx
, [ebp
+ mci0310_ntype
]
17549 mov
[esp
+ mci0310_ntia
], edx
17551 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
17552 mov eax
, [ebp
+ mci0310_pos
] ;
# eax = base of pos[]
17553 mov
[esp
+ mci0310_ii3
], ebx
17555 movq mm0
, [eax
+ ebx
*4]
17556 movd mm1
, [eax
+ ebx
*4 + 8]
17557 pfadd mm0
, [esp
+ mci0310_shX
]
17558 pfadd mm1
, [esp
+ mci0310_shZ
]
17559 movq
[esp
+ mci0310_ix
], mm0
17560 movd
[esp
+ mci0310_iz
], mm1
17562 mov ecx
, [esp
+ mci0310_innerjjnr0
]
17563 mov
[esp
+ mci0310_innerjjnr
], ecx
17564 mov edx
, [esp
+ mci0310_innerk0
]
17566 mov
[esp
+ mci0310_innerk
], edx ;
# number of innerloop atoms
17567 jge
.mci0310_unroll_vdw_loop
17568 jmp
.mci0310_finish_vdw_inner
17569 .mci0310_unroll_vdw_loop:
17570 ;
# paired innerloop starts here
17571 mov ecx
, [esp
+ mci0310_innerjjnr
] ;
# pointer to jjnr[k]
17573 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
17574 add dword ptr
[esp
+ mci0310_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
17575 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
17577 mov ecx
, [ebp
+ mci0310_type
]
17578 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
17579 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
17581 mov esi
, [ebp
+ mci0310_nbfp
] ;
# base of nbfp
17584 add edx
, [esp
+ mci0310_ntia
] ;
# tja = ntia + 2*type
17585 add ecx
, [esp
+ mci0310_ntia
]
17587 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
17588 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
17590 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
17591 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
17592 movq
[esp
+ mci0310_c6
], mm5
17593 movq
[esp
+ mci0310_c12
], mm6
17595 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
17596 lea ebx
, [ebx
+ ebx
*2]
17598 mov esi
, [ebp
+ mci0310_pos
]
17600 movq mm0
, [esp
+ mci0310_ix
]
17601 movd mm1
, [esp
+ mci0310_iz
]
17602 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
17603 movd mm5
, [esi
+ eax
*4 + 8]
17604 pfsubr mm4
,mm0 ;
# dr = ir - jr
17606 pfmul mm4
,mm4 ;
# square dx,dy,dz
17608 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
17609 pfacc mm4
, mm5 ;
# first rsq in lower mm4
17611 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
17612 movd mm7
, [esi
+ ebx
*4 + 8]
17614 pfsubr mm6
,mm0 ;
# dr = ir - jr
17616 pfmul mm6
,mm6 ;
# square dx,dy,dz
17618 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
17619 pfacc mm6
, mm7 ;
# second rsq in lower mm6
17621 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
17626 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs
17627 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision
17633 ;
# mm0 is invsqrt, and mm1 r
17634 ;
# do potential and fscal
17635 pfmul mm1
, [esp
+ mci0310_tsc
] ;
# mm1=rt
17637 movq
[esp
+ mci0310_n1
], mm4
17639 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
17642 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
17644 mov edx
, [ebp
+ mci0310_VFtab
]
17645 ;
# dispersion table
17646 mov ecx
, [esp
+ mci0310_n1
]
17648 ;
# load all the table values we need
17649 movd mm4
, [edx
+ ecx
*4]
17650 movd mm5
, [edx
+ ecx
*4 + 4]
17651 movd mm6
, [edx
+ ecx
*4 + 8]
17652 movd mm7
, [edx
+ ecx
*4 + 12]
17653 mov ecx
, [esp
+ mci0310_n1
+ 4]
17655 punpckldq mm4
, [edx
+ ecx
*4]
17656 punpckldq mm5
, [edx
+ ecx
*4 + 4]
17657 punpckldq mm6
, [edx
+ ecx
*4 + 8]
17658 punpckldq mm7
, [edx
+ ecx
*4 + 12]
17659 pfmul mm6
, mm1 ;
# mm6 = Geps
17660 pfmul mm7
, mm2 ;
# mm7 = Heps2
17662 pfadd mm5
, mm7 ;
# mm5 = Fp
17663 pfmul mm5
, mm1 ;
# mm5=eps*Fp
17664 pfadd mm5
, mm4 ;
# mm5= VV
17666 movq mm4
, [esp
+ mci0310_c6
]
17667 pfmul mm5
, mm4 ;
# vnb6
17668 ;
# update vnbtot to release mm5!
17669 pfadd mm5
, [esp
+ mci0310_vnbtot
] ;
# add the earlier value
17670 movq
[esp
+ mci0310_vnbtot
], mm5 ;
# store the sum
17673 mov ecx
, [esp
+ mci0310_n1
]
17675 ;
# load all the table values we need
17676 movd mm4
, [edx
+ ecx
*4 + 16]
17677 movd mm5
, [edx
+ ecx
*4 + 20]
17678 movd mm6
, [edx
+ ecx
*4 + 24]
17679 movd mm7
, [edx
+ ecx
*4 + 28]
17680 mov ecx
, [esp
+ mci0310_n1
+ 4]
17682 punpckldq mm4
, [edx
+ ecx
*4 + 16]
17683 punpckldq mm5
, [edx
+ ecx
*4 + 20]
17684 punpckldq mm6
, [edx
+ ecx
*4 + 24]
17685 punpckldq mm7
, [edx
+ ecx
*4 + 28]
17687 pfmul mm6
, mm1 ;
# mm6 = Geps
17688 pfmul mm7
, mm2 ;
# mm7 = Heps2
17690 pfadd mm5
, mm7 ;
# mm5 = Fp
17691 pfmul mm5
, mm1 ;
# mm5=eps*Fp
17692 pfadd mm5
, mm4 ;
# mm5= VV
17694 movq mm6
, [esp
+ mci0310_c12
]
17695 pfmul mm5
, mm6 ;
# vnb12
17697 pfadd mm5
, [esp
+ mci0310_vnbtot
] ;
# add the earlier value
17698 movq
[esp
+ mci0310_vnbtot
], mm5 ;
# store the sum
17700 ;
# should we do one more iteration?
17701 sub dword ptr
[esp
+ mci0310_innerk
], 2
17702 jl
.mci0310_finish_vdw_inner
17703 jmp
.mci0310_unroll_vdw_loop
17704 .mci0310_finish_vdw_inner:
17705 and dword ptr
[esp
+ mci0310_innerk
], 1
17706 jnz
.mci0310_single_vdw_inner
17707 jmp
.mci0310_updateouterdata_vdw
17708 .mci0310_single_vdw_inner:
17709 ;
# a single j particle iteration here - compare with the unrolled code for comments
17710 mov eax
, [esp
+ mci0310_innerjjnr
]
17711 mov eax
, [eax
] ;
# eax=jnr offset
17713 mov esi
, [ebp
+ mci0310_nbfp
]
17714 mov ecx
, [ebp
+ mci0310_type
]
17715 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
17717 add edx
, [esp
+ mci0310_ntia
] ;
# tja = ntia + 2*type
17718 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
17719 movq
[esp
+ mci0310_c6
], mm5
17720 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
17721 movq
[esp
+ mci0310_c12
], mm5
17723 mov esi
, [ebp
+ mci0310_pos
]
17724 lea eax
, [eax
+ eax
*2]
17726 movq mm0
, [esp
+ mci0310_ix
]
17727 movd mm1
, [esp
+ mci0310_iz
]
17728 movq mm4
, [esi
+ eax
*4]
17729 movd mm5
, [esi
+ eax
*4 + 8]
17735 pfacc mm4
, mm5 ;
# mm0=rsq
17741 pfrcpit2 mm0
,mm2 ;
# mm1=invsqrt
17744 ;
# mm0 is invsqrt, and mm1 r
17746 ;
# calculate potentials and scalar force
17747 pfmul mm1
, [esp
+ mci0310_tsc
] ;
# mm1=rt
17749 movd
[esp
+ mci0310_n1
], mm4
17751 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
17754 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
17756 mov edx
, [ebp
+ mci0310_VFtab
]
17757 mov ecx
, [esp
+ mci0310_n1
]
17759 ;
# dispersion table
17760 ;
# load all the table values we need
17761 movd mm4
, [edx
+ ecx
*4]
17762 movd mm5
, [edx
+ ecx
*4 + 4]
17763 movd mm6
, [edx
+ ecx
*4 + 8]
17764 movd mm7
, [edx
+ ecx
*4 + 12]
17765 pfmul mm6
, mm1 ;
# mm6 = Geps
17766 pfmul mm7
, mm2 ;
# mm7 = Heps2
17768 pfadd mm5
, mm7 ;
# mm5 = Fp
17769 pfmul mm5
, mm1 ;
# mm5=eps*Fp
17770 pfadd mm5
, mm4 ;
# mm5= VV
17772 movq mm4
, [esp
+ mci0310_c6
]
17773 pfmul mm5
, mm4 ;
# vnb6
17774 ;
# update vnbtot to release mm5!
17775 pfadd mm5
, [esp
+ mci0310_vnbtot
] ;
# add the earlier value
17776 movq
[esp
+ mci0310_vnbtot
], mm5 ;
# store the sum
17779 ;
# load all the table values we need
17780 movd mm4
, [edx
+ ecx
*4 + 16]
17781 movd mm5
, [edx
+ ecx
*4 + 20]
17782 movd mm6
, [edx
+ ecx
*4 + 24]
17783 movd mm7
, [edx
+ ecx
*4 + 28]
17785 pfmul mm6
, mm1 ;
# mm6 = Geps
17786 pfmul mm7
, mm2 ;
# mm7 = Heps2
17788 pfadd mm5
, mm7 ;
# mm5 = Fp
17789 pfmul mm5
, mm1 ;
# mm5=eps*Fp
17790 pfadd mm5
, mm4 ;
# mm5= VV
17792 movq mm6
, [esp
+ mci0310_c12
]
17793 pfmul mm5
, mm6 ;
# vnb12
17795 pfadd mm5
, [esp
+ mci0310_vnbtot
] ;
# add the earlier value
17796 movq
[esp
+ mci0310_vnbtot
], mm5 ;
# store the sum
17798 .mci0310_updateouterdata_vdw:
17799 ;
# loop back to mno
17800 dec dword ptr
[esp
+ mci0310_nsvdw
]
17801 jz
.mci0310_last_mno
17802 jmp
.mci0310_mno_vdw
17805 mov edx
, [ebp
+ mci0310_gid
] ;
# get group index for this i particle
17807 add dword ptr
[ebp
+ mci0310_gid
], 4 ;
# advance pointer
17809 movq mm7
, [esp
+ mci0310_vnbtot
]
17810 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
17812 mov eax
, [ebp
+ mci0310_Vnb
]
17813 movd mm6
, [eax
+ edx
*4]
17815 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
17817 mov ecx
, [ebp
+ mci0310_nri
]
17820 ;
# not last, iterate once more!
17821 mov
[ebp
+ mci0310_nri
], ecx
;# ---------------------------------------------------------------------
;# mcinl1000_3dnow (also exported with a leading underscore).
;# Visible code, per i particle: builds iq = facel*charge[ii], stores
;# shX/shY + pos x/y into ix/iy, then walks the jjnr neighbour list two
;# entries at a time (plus one odd leftover), adding qq*mm1 into the vctot
;# stack slot. At the end the two halves of vctot are folded with pfacc
;# next to a load/store of Vc[gid].
;# NOTE(review): the embedded listing numbers skip in several places
;# (e.g. 17861-17868, 17904-17906, 17969-17973, 18040-18053), so the
;# function label, prologue, epilogue and part of the arithmetic are not
;# visible in this excerpt. Comments below describe only what is shown.
;# ---------------------------------------------------------------------
17836 .globl mcinl1000_3dnow
17837 .globl _mcinl1000_3dnow
;# argument offsets; every use below is of the form [ebp + mci1000_<name>]
17840 .equiv mci1000_nri, 8
17841 .equiv mci1000_iinr, 12
17842 .equiv mci1000_jindex, 16
17843 .equiv mci1000_jjnr, 20
17844 .equiv mci1000_shift, 24
17845 .equiv mci1000_shiftvec, 28
17846 .equiv mci1000_gid, 32
17847 .equiv mci1000_pos, 36
17848 .equiv mci1000_charge, 40
17849 .equiv mci1000_facel, 44
17850 .equiv mci1000_Vc, 48
17851 ;
# stack offsets for local variables, used below as [esp + mci1000_<name>]
17852 .equiv mci1000_is3, 0
17853 .equiv mci1000_ii3, 4
17854 .equiv mci1000_ix, 8
17855 .equiv mci1000_iy, 12
17856 .equiv mci1000_iz, 16
17857 .equiv mci1000_iq, 20
17858 .equiv mci1000_vctot, 28
17859 .equiv mci1000_innerjjnr, 36
17860 .equiv mci1000_innerk, 40
17869 sub esp
, 44 ;
# 44 bytes local stack space (mci1000_innerk at offset 40 is the last 4-byte slot)
17871 ;
# assume we have at least one i particle - start directly
17873 mov eax
, [ebp
+ mci1000_shift
] ;
# eax = pointer into shift[]
17874 mov ebx
, [eax
] ;
# ebx=shift[n]
17875 add dword ptr
[ebp
+ mci1000_shift
], 4 ;
# advance pointer one step
17877 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
17878 mov
[esp
+ mci1000_is3
],ebx ;
# store is3
17880 mov eax
, [ebp
+ mci1000_shiftvec
] ;
# eax = base of shiftvec[]
17882 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
17883 movd mm1
, [eax
+ ebx
*4 + 8]
17885 mov ecx
, [ebp
+ mci1000_iinr
] ;
# ecx = pointer into iinr[]
17886 add dword ptr
[ebp
+ mci1000_iinr
], 4 ;
# advance pointer
17887 mov ebx
, [ecx
] ;
# ebx=ii
17889 mov edx
, [ebp
+ mci1000_charge
]
17890 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
17891 pfmul mm2
, [ebp
+ mci1000_facel
]
17892 punpckldq mm2
,mm2 ;
# spread to both halves
17893 movq
[esp
+ mci1000_iq
], mm2 ;
# iq =facel*charge[ii]
17895 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
17896 mov eax
, [ebp
+ mci1000_pos
] ;
# eax = base of pos[]
17898 pfadd mm0
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
17899 movd mm3
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
17900 mov
[esp
+ mci1000_ii3
], ebx
;# NOTE(review): no add of mm3 into mm1 is visible before iz is stored,
;# and listing 17904-17906 (between the iz store and the vctot store) is
;# absent, so how mm7 is initialised is not shown - confirm against the
;# full file.
17902 movq
[esp
+ mci1000_ix
], mm0
17903 movd
[esp
+ mci1000_iz
], mm1
17907 movq
[esp
+ mci1000_vctot
], mm7
17909 mov eax
, [ebp
+ mci1000_jindex
]
17910 mov ecx
, [eax
] ;
# jindex[n]
17911 mov edx
, [eax
+ 4] ;
# jindex[n+1]
17912 add dword ptr
[ebp
+ mci1000_jindex
], 4
17913 sub edx
, ecx ;
# number of innerloop atoms
17915 mov esi
, [ebp
+ mci1000_pos
]
17916 mov eax
, [ebp
+ mci1000_jjnr
]
;# NOTE(review): listing 17917-17918 and 17920 are absent from this
;# excerpt; confirm how eax is offset by jindex[n] and which instruction
;# sets the flags tested by the jge below.
17919 mov
[esp
+ mci1000_innerjjnr
], eax ;
# pointer to jjnr[nj0]
17921 mov
[esp
+ mci1000_innerk
], edx ;
# number of innerloop atoms
17922 jge
.mci1000_unroll_loop
17923 jmp
.mci1000_finish_inner
17924 .mci1000_unroll_loop:
17925 ;
# paired innerloop starts here
17926 mov ecx
, [esp
+ mci1000_innerjjnr
] ;
# pointer to jjnr[k]
;# NOTE(review): listing 17927 is absent; eax is used as the first jnr
;# below (charge[jnr1], j3) - presumably it is loaded from [ecx] there.
17928 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
17929 add dword ptr
[esp
+ mci1000_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
17930 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
17932 mov ecx
, [ebp
+ mci1000_charge
] ;
# base of charge[]
17933 movq mm5
, [esp
+ mci1000_iq
]
17934 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
17935 movd mm7
, [ecx
+ ebx
*4] ;
# charge[jnr2]
17936 punpckldq mm3
,mm7 ;
# move charge 2 to high part of mm3
17937 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
17939 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
17940 lea ebx
, [ebx
+ ebx
*2]
17942 movq mm0
, [esp
+ mci1000_ix
]
17943 movd mm1
, [esp
+ mci1000_iz
]
17944 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
17945 movd mm5
, [esi
+ eax
*4 + 8]
17946 pfsubr mm4
,mm0 ;
# dr = ir - jr
17948 pfmul mm4
,mm4 ;
# square dx,dy,dz
17950 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
17951 pfacc mm4
, mm5 ;
# first rsq in lower mm4
17953 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
17954 movd mm7
, [esi
+ ebx
*4 + 8]
17956 pfsubr mm6
,mm0 ;
# dr = ir - jr
17958 pfmul mm6
,mm6 ;
# square dx,dy,dz
17960 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
17961 pfacc mm6
, mm7 ;
# second rsq in lower mm6
17963 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
17967 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs
17968 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision
;# NOTE(review): the iteration steps themselves (listing 17969-17973,
;# 17975-17976) are absent from this excerpt.
17974 ;
# do potential and fscal
17977 pfmul mm3
,mm1 ;
# 3 has both vcoul
17978 pfadd mm3
, [esp
+ mci1000_vctot
] ;
# add the earlier value
17979 movq
[esp
+ mci1000_vctot
], mm3 ;
# store the sum
17981 ;
# should we do one more iteration?
;# innerk -= 2; a negative result leaves the paired loop
17982 sub dword ptr
[esp
+ mci1000_innerk
], 2
17983 jl
.mci1000_finish_inner
17984 jmp
.mci1000_unroll_loop
17985 .mci1000_finish_inner:
;# bit 0 of innerk set -> one leftover j entry goes through the single pass
17986 and dword ptr
[esp
+ mci1000_innerk
], 1
17987 jnz
.mci1000_single_inner
17988 jmp
.mci1000_updateouterdata
17989 .mci1000_single_inner:
17990 ;
# a single j particle iteration here - compare with the unrolled code for comments
17991 mov eax
, [esp
+ mci1000_innerjjnr
]
17992 mov eax
, [eax
] ;
# eax=jnr offset
17994 mov ecx
, [ebp
+ mci1000_charge
]
17995 movd mm6
, [esp
+ mci1000_iq
]
17996 movd mm7
, [ecx
+ eax
*4]
17997 pfmul mm6
, mm7 ;
# mm6=qq
17999 lea eax
, [eax
+ eax
*2]
18001 movq mm0
, [esp
+ mci1000_ix
]
18002 movd mm1
, [esp
+ mci1000_iz
]
18003 movq mm2
, [esi
+ eax
*4]
18004 movd mm3
, [esi
+ eax
*4 + 8]
;# NOTE(review): listing 18005-18009 and 18011-18015 are absent; the
;# subtract/square and rsqrt steps of the single pass are not visible.
18010 pfacc mm0
, mm1 ;
# mm0=rsq
18016 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
18017 ;
# calculate potential and scalar force
18018 pfmul mm6
, mm1 ;
# mm6=vcoul
18019 pfadd mm6
, [esp
+ mci1000_vctot
]
18020 movq
[esp
+ mci1000_vctot
], mm6
18022 .mci1000_updateouterdata:
18023 mov edx
, [ebp
+ mci1000_gid
] ;
# get group index for this i particle
18025 add dword ptr
[ebp
+ mci1000_gid
], 4 ;
# advance pointer
18027 movq mm7
, [esp
+ mci1000_vctot
]
18028 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
18030 mov eax
, [ebp
+ mci1000_Vc
]
;# NOTE(review): listing 18024 and 18032 are absent. As visible, edx is
;# used as an index although it was loaded with the gid pointer, and mm6
;# is stored back without an add of mm7 - confirm against the full file.
18031 movd mm6
, [eax
+ edx
*4]
18033 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
18035 mov ecx
, [ebp
+ mci1000_nri
]
;# NOTE(review): listing 18036-18037 and 18040-18053 are absent; the
;# decrement/test of nri, the loop-back branch and the function exit are
;# not visible in this excerpt.
18038 ;
# not last, iterate once more!
18039 mov
[ebp
+ mci1000_nri
], ecx
;# ---------------------------------------------------------------------
;# mcinl1010_3dnow (also exported with a leading underscore).
;# Argument layout matches the [ebp + ...] uses below, with nsatoms at 52.
;# Visible code: stores shX/shY/shZ on the stack and saves the neighbour
;# list start/count in innerjjnr0/innerk0. Then, once per nscoul count
;# (decremented at .mci1010_updateouterdata_coul), it reloads
;# iq = facel*charge[solnr] and ix/iy/iz = pos + shift, and reruns the
;# paired + single j loops over that saved list, summing into vctot.
;# NOTE(review): .mci1010_mno_coul and .mci1010_last_mno are jump targets
;# below, but their definitions are not visible; presumably they sit on
;# listing numbers 18146 and 18275, which are absent from this excerpt.
;# The function label, prologue and epilogue are likewise not shown.
;# ---------------------------------------------------------------------
18054 .globl mcinl1010_3dnow
18055 .globl _mcinl1010_3dnow
;# argument offsets; every use below is of the form [ebp + mci1010_<name>]
18058 .equiv mci1010_nri, 8
18059 .equiv mci1010_iinr, 12
18060 .equiv mci1010_jindex, 16
18061 .equiv mci1010_jjnr, 20
18062 .equiv mci1010_shift, 24
18063 .equiv mci1010_shiftvec, 28
18064 .equiv mci1010_gid, 32
18065 .equiv mci1010_pos, 36
18066 .equiv mci1010_charge, 40
18067 .equiv mci1010_facel, 44
18068 .equiv mci1010_Vc, 48
18069 .equiv mci1010_nsatoms, 52
18070 ;
# stack offsets for local variables, used below as [esp + mci1010_<name>]
18071 .equiv mci1010_is3, 0
18072 .equiv mci1010_ii3, 4
18073 .equiv mci1010_shX, 8
18074 .equiv mci1010_shY, 12
18075 .equiv mci1010_shZ, 16
18076 .equiv mci1010_ix, 20
18077 .equiv mci1010_iy, 24
18078 .equiv mci1010_iz, 28
18079 .equiv mci1010_iq, 32
18080 .equiv mci1010_vctot, 40
18081 .equiv mci1010_innerjjnr0, 48
18082 .equiv mci1010_innerk0, 52
18083 .equiv mci1010_innerjjnr, 56
18084 .equiv mci1010_innerk, 60
18085 .equiv mci1010_nscoul, 64
18086 .equiv mci1010_solnr, 68
18095 sub esp
, 72 ;
# local stack space (72 bytes; mci1010_solnr at offset 68 is the last 4-byte slot)
18097 ;
# assume we have at least one i particle - start directly
;# the nsatoms pointer argument is advanced by 8 bytes here and by a
;# further 12 bytes after it is read below
18098 add dword ptr
[ebp
+ mci1010_nsatoms
], 8
18101 mov eax
, [ebp
+ mci1010_shift
] ;
# eax = pointer into shift[]
18102 mov ebx
, [eax
] ;
# ebx=shift[n]
18103 add dword ptr
[ebp
+ mci1010_shift
], 4 ;
# advance pointer one step
18105 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
18106 mov
[esp
+ mci1010_is3
],ebx ;
# store is3
18108 mov eax
, [ebp
+ mci1010_shiftvec
] ;
# eax = base of shiftvec[]
18110 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
18111 movd mm1
, [eax
+ ebx
*4 + 8]
18112 movq
[esp
+ mci1010_shX
], mm0
18113 movd
[esp
+ mci1010_shZ
], mm1
18115 mov ecx
, [ebp
+ mci1010_iinr
] ;
# ecx = pointer into iinr[]
18116 add dword ptr
[ebp
+ mci1010_iinr
], 4 ;
# advance pointer
18117 mov ebx
, [ecx
] ;
# ebx=ii
18119 mov eax
, [ebp
+ mci1010_nsatoms
]
;# NOTE(review): listing 18120 is absent; as visible, ecx still holds the
;# iinr pointer when it is stored to nscoul below - presumably the missing
;# line loads ecx from [eax]. Confirm against the full file.
18121 add dword ptr
[ebp
+ mci1010_nsatoms
], 12
18122 mov
[esp
+ mci1010_nscoul
], ecx
;# NOTE(review): listing 18123-18125 are absent; how mm7 is initialised
;# before being stored to vctot is not shown.
18126 movq
[esp
+ mci1010_vctot
], mm7
18127 mov
[esp
+ mci1010_solnr
], ebx
18129 mov eax
, [ebp
+ mci1010_jindex
]
18130 mov ecx
, [eax
] ;
# jindex[n]
18131 mov edx
, [eax
+ 4] ;
# jindex[n+1]
18132 add dword ptr
[ebp
+ mci1010_jindex
], 4
18133 sub edx
, ecx ;
# number of innerloop atoms
18134 mov eax
, [ebp
+ mci1010_jjnr
]
;# innerjjnr0/innerk0 keep the list start and count so that each pass of
;# the per-atom loop below can copy them into innerjjnr/innerk again
;# NOTE(review): listing 18135-18136 and 18138 are absent; the offset of
;# eax by jindex[n] and any adjustment of edx are not visible.
18137 mov
[esp
+ mci1010_innerjjnr0
], eax ;
# pointer to jjnr[nj0]
18139 mov
[esp
+ mci1010_innerk0
], edx ;
# number of innerloop atoms
18140 mov esi
, [ebp
+ mci1010_pos
]
18142 mov ecx
, [esp
+ mci1010_nscoul
]
;# NOTE(review): listing 18143 is absent; the instruction that sets the
;# flags for the jnz below is not visible.
18144 jnz
.mci1010_mno_coul
18145 jmp
.mci1010_last_mno
;# per-atom setup: ebx = solnr (then solnr is incremented for the next pass)
18147 mov ebx
, [esp
+ mci1010_solnr
]
18148 inc dword ptr
[esp
+ mci1010_solnr
]
18149 mov edx
, [ebp
+ mci1010_charge
]
18150 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
18151 pfmul mm2
, [ebp
+ mci1010_facel
]
18152 punpckldq mm2
,mm2 ;
# spread to both halves
18153 movq
[esp
+ mci1010_iq
], mm2 ;
# iq =facel*charge[ii]
18155 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
18156 mov eax
, [ebp
+ mci1010_pos
] ;
# eax = base of pos[]
18157 mov
[esp
+ mci1010_ii3
], ebx
;# ix/iy = pos x/y + shX/shY, iz = pos z + shZ
18159 movq mm0
, [eax
+ ebx
*4]
18160 movd mm1
, [eax
+ ebx
*4 + 8]
18161 pfadd mm0
, [esp
+ mci1010_shX
]
18162 pfadd mm1
, [esp
+ mci1010_shZ
]
18163 movq
[esp
+ mci1010_ix
], mm0
18164 movd
[esp
+ mci1010_iz
], mm1
;# restart the neighbour list for this atom from the saved start/count
18166 mov ecx
, [esp
+ mci1010_innerjjnr0
]
18167 mov
[esp
+ mci1010_innerjjnr
], ecx
18168 mov edx
, [esp
+ mci1010_innerk0
]
;# NOTE(review): listing 18169 is absent; the instruction that sets the
;# flags for the jge below is not visible.
18170 mov
[esp
+ mci1010_innerk
], edx ;
# number of innerloop atoms
18171 jge
.mci1010_unroll_coul_loop
18172 jmp
.mci1010_finish_coul_inner
18173 .mci1010_unroll_coul_loop:
18174 ;
# paired innerloop starts here
18175 mov ecx
, [esp
+ mci1010_innerjjnr
] ;
# pointer to jjnr[k]
;# NOTE(review): listing 18176 is absent; eax is used as the first jnr
;# below - presumably it is loaded from [ecx] there.
18177 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
18178 add dword ptr
[esp
+ mci1010_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
18179 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
18181 mov ecx
, [ebp
+ mci1010_charge
] ;
# base of charge[]
18182 movq mm5
, [esp
+ mci1010_iq
]
18183 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
18184 movd mm7
, [ecx
+ ebx
*4] ;
# charge[jnr2]
18185 punpckldq mm3
,mm7 ;
# move charge 2 to high part of mm3
18186 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
18188 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
18189 lea ebx
, [ebx
+ ebx
*2]
18191 movq mm0
, [esp
+ mci1010_ix
]
18192 movd mm1
, [esp
+ mci1010_iz
]
18193 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
18194 movd mm5
, [esi
+ eax
*4 + 8]
18195 pfsubr mm4
,mm0 ;
# dr = ir - jr
18197 pfmul mm4
,mm4 ;
# square dx,dy,dz
18199 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
18200 pfacc mm4
, mm5 ;
# first rsq in lower mm4
18202 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
18203 movd mm7
, [esi
+ ebx
*4 + 8]
18205 pfsubr mm6
,mm0 ;
# dr = ir - jr
18207 pfmul mm6
,mm6 ;
# square dx,dy,dz
18209 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
18210 pfacc mm6
, mm7 ;
# second rsq in lower mm6
18212 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
18216 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs
18217 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision
;# NOTE(review): the iteration steps themselves (listing 18218-18224) are
;# absent from this excerpt.
18225 pfmul mm3
,mm1 ;
# 3 has both vcoul
18226 pfadd mm3
, [esp
+ mci1010_vctot
] ;
# add the earlier value
18227 movq
[esp
+ mci1010_vctot
], mm3 ;
# store the sum
18229 ;
# should we do one more iteration?
;# innerk -= 2; a negative result leaves the paired loop
18230 sub dword ptr
[esp
+ mci1010_innerk
], 2
18231 jl
.mci1010_finish_coul_inner
18232 jmp
.mci1010_unroll_coul_loop
18233 .mci1010_finish_coul_inner:
;# bit 0 of innerk set -> one leftover j entry goes through the single pass
18234 and dword ptr
[esp
+ mci1010_innerk
], 1
18235 jnz
.mci1010_single_coul_inner
18236 jmp
.mci1010_updateouterdata_coul
18237 .mci1010_single_coul_inner:
18238 ;
# a single j particle iteration here - compare with the unrolled code for comments
18239 mov eax
, [esp
+ mci1010_innerjjnr
]
18240 mov eax
, [eax
] ;
# eax=jnr offset
18242 mov ecx
, [ebp
+ mci1010_charge
]
18243 movd mm6
, [esp
+ mci1010_iq
]
18244 movd mm7
, [ecx
+ eax
*4]
18245 pfmul mm6
, mm7 ;
# mm6=qq
18247 lea eax
, [eax
+ eax
*2]
18249 movq mm0
, [esp
+ mci1010_ix
]
18250 movd mm1
, [esp
+ mci1010_iz
]
18251 movq mm2
, [esi
+ eax
*4]
18252 movd mm3
, [esi
+ eax
*4 + 8]
;# NOTE(review): listing 18253-18257 and 18259-18263 are absent; the
;# subtract/square and rsqrt steps of the single pass are not visible.
18258 pfacc mm0
, mm1 ;
# mm0=rsq
18264 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
18265 ;
# calculate potential and scalar force
18266 pfmul mm6
, mm1 ;
# mm6=vcoul
18267 pfadd mm6
, [esp
+ mci1010_vctot
]
18268 movq
[esp
+ mci1010_vctot
], mm6
18270 .mci1010_updateouterdata_coul:
18271 ;
# loop back to mno
;# nscoul counts the remaining per-atom passes; zero ends the atom loop
18272 dec dword ptr
[esp
+ mci1010_nscoul
]
18273 jz
.mci1010_last_mno
18274 jmp
.mci1010_mno_coul
18276 mov edx
, [ebp
+ mci1010_gid
] ;
# get group index for this i particle
18278 add dword ptr
[ebp
+ mci1010_gid
], 4 ;
# advance pointer
18280 movq mm7
, [esp
+ mci1010_vctot
]
18281 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
18283 mov eax
, [ebp
+ mci1010_Vc
]
;# NOTE(review): listing 18277 and 18285 are absent. As visible, edx is
;# used as an index although it was loaded with the gid pointer, and mm6
;# is stored back without an add of mm7 - confirm against the full file.
18284 movd mm6
, [eax
+ edx
*4]
18286 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
18288 mov ecx
, [ebp
+ mci1010_nri
]
;# NOTE(review): listing 18289-18290 and 18293-18306 are absent; the
;# decrement/test of nri, the loop-back branch and the function exit are
;# not visible in this excerpt.
18291 ;
# not last, iterate once more!
18292 mov
[ebp
+ mci1010_nri
], ecx
;# ---------------------------------------------------------------------
;# mcinl1020_3dnow (also exported with a leading underscore).
;# Locals keep separate O slots (ixO/iyO/izO, iqO) and H slots
;# (ixH/iyH/izH, iqH); the H coordinate slots are 8 bytes apart
;# (offsets 20/28/36) and are written with movq.
;# The visible inner loop handles one j entry per pass: innerjjnr advances
;# by 4 and innerk is decremented by 1, with results summed into vctot.
;# NOTE(review): the embedded listing numbers skip in many places (e.g.
;# 18336-18343, 18373-18375, 18400-18403, 18443-18449, 18464-18469,
;# 18518-18531), so the function label, prologue, epilogue and much of the
;# arithmetic are not visible. Comments describe only what is shown.
;# ---------------------------------------------------------------------
18307 .globl mcinl1020_3dnow
18308 .globl _mcinl1020_3dnow
;# argument offsets; every use below is of the form [ebp + mci1020_<name>]
18311 .equiv mci1020_nri, 8
18312 .equiv mci1020_iinr, 12
18313 .equiv mci1020_jindex, 16
18314 .equiv mci1020_jjnr, 20
18315 .equiv mci1020_shift, 24
18316 .equiv mci1020_shiftvec, 28
18317 .equiv mci1020_gid, 32
18318 .equiv mci1020_pos, 36
18319 .equiv mci1020_charge, 40
18320 .equiv mci1020_facel, 44
18321 .equiv mci1020_Vc, 48
18322 ;
# stack offsets for local variables, used below as [esp + mci1020_<name>]
18323 .equiv mci1020_is3, 0
18324 .equiv mci1020_ii3, 4
18325 .equiv mci1020_ixO, 8
18326 .equiv mci1020_iyO, 12
18327 .equiv mci1020_izO, 16
18328 .equiv mci1020_ixH, 20
18329 .equiv mci1020_iyH, 28
18330 .equiv mci1020_izH, 36
18331 .equiv mci1020_iqO, 44
18332 .equiv mci1020_iqH, 52
18333 .equiv mci1020_vctot, 60
18334 .equiv mci1020_innerjjnr, 68
18335 .equiv mci1020_innerk, 72
18344 sub esp
, 76 ;
# local stack space (76 bytes; mci1020_innerk at offset 72 is the last 4-byte slot)
18346 ;
# assume we have at least one i particle - start directly
;# iinr is read here without being advanced; it is read again and advanced
;# further down, after the shift vector has been loaded
18348 mov ecx
, [ebp
+ mci1020_iinr
] ;
# ecx = pointer into iinr[]
18349 mov ebx
, [ecx
] ;
# ebx=ii
18351 mov edx
, [ebp
+ mci1020_charge
]
18352 movd mm1
, [ebp
+ mci1020_facel
]
18353 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
;# NOTE(review): listing 18354, 18356 and 18358 are absent; the multiplies
;# by facel (mm1) that the iqO/iqH comments describe are not visible here.
18355 movq
[esp
+ mci1020_iqO
], mm2 ;
# iqO = facel*charge[ii]
18357 movd mm2
, [edx
+ ebx
*4 + 4] ;
# mm2=charge[ii0+1]
18359 punpckldq mm2
,mm2 ;
# spread to both halves
18360 movq
[esp
+ mci1020_iqH
], mm2 ;
# iqH = facel*charge[ii0+1]
18362 mov eax
, [ebp
+ mci1020_shift
] ;
# eax = pointer into shift[]
18363 mov ebx
, [eax
] ;
# ebx=shift[n]
18364 add dword ptr
[ebp
+ mci1020_shift
], 4 ;
# advance pointer one step
18366 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
18367 mov
[esp
+ mci1020_is3
],ebx ;
# store is3
18369 mov eax
, [ebp
+ mci1020_shiftvec
] ;
# eax = base of shiftvec[]
18371 movq mm5
, [eax
+ ebx
*4] ;
# move shX/shY to mm5 and shZ to mm6
18372 movd mm6
, [eax
+ ebx
*4 + 8]
;# NOTE(review): listing 18373-18375 and 18377-18379 are absent; the
;# copies of the shift into mm0-mm2 are only partly visible.
18376 punpckldq mm0
,mm0 ;
# also expand shX,Y,Z in mm0--mm2
18380 mov ecx
, [ebp
+ mci1020_iinr
] ;
# ecx = pointer into iinr[]
18381 add dword ptr
[ebp
+ mci1020_iinr
], 4 ;
# advance pointer
18382 mov ebx
, [ecx
] ;
# ebx=ii
18384 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
18385 mov eax
, [ebp
+ mci1020_pos
] ;
# eax = base of pos[]
18387 pfadd mm5
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
18388 movd mm7
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
18389 mov
[esp
+ mci1020_ii3
], ebx ;
# (use mm7 as temp storage for iz)
;# NOTE(review): listing 18390 is absent; no add of mm7 into mm6 is
;# visible before izO is stored. The izO store is a movq (8 bytes) at
;# offset 16, so it also writes offset 20 (ixH), which is stored
;# afterwards below.
18391 movq
[esp
+ mci1020_ixO
], mm5
18392 movq
[esp
+ mci1020_izO
], mm6
;# the 6 floats at byte offsets 12..32 after the O position are gathered
;# component-wise: low halves from +12/+16/+20, high halves from +24/+28/+32
18394 movd mm3
, [eax
+ ebx
*4 + 12]
18395 movd mm4
, [eax
+ ebx
*4 + 16]
18396 movd mm5
, [eax
+ ebx
*4 + 20]
18397 punpckldq mm3
, [eax
+ ebx
*4 + 24]
18398 punpckldq mm4
, [eax
+ ebx
*4 + 28]
18399 punpckldq mm5
, [eax
+ ebx
*4 + 32] ;
# coords of H1 in low mm3-mm5, H2 in high
;# NOTE(review): listing 18400-18403 are absent; the instructions that
;# combine mm3-mm5 with the shift in mm0-mm2 are not visible.
18404 movq
[esp
+ mci1020_ixH
], mm0
18405 movq
[esp
+ mci1020_iyH
], mm1
18406 movq
[esp
+ mci1020_izH
], mm2
18408 ;
# clear vctot and i forces
;# NOTE(review): listing 18409 is absent; how mm7 is cleared before this
;# store is not visible.
18410 movq
[esp
+ mci1020_vctot
], mm7
18412 mov eax
, [ebp
+ mci1020_jindex
]
18413 mov ecx
, [eax
] ;
# jindex[n]
18414 mov edx
, [eax
+ 4] ;
# jindex[n+1]
18415 add dword ptr
[ebp
+ mci1020_jindex
], 4
18416 sub edx
, ecx ;
# number of innerloop atoms
18417 mov
[esp
+ mci1020_innerk
], edx ;
# number of innerloop atoms
18419 mov esi
, [ebp
+ mci1020_pos
]
18420 mov eax
, [ebp
+ mci1020_jjnr
]
;# NOTE(review): listing 18421-18422 are absent; the offset of eax by
;# jindex[n] is not visible.
18423 mov
[esp
+ mci1020_innerjjnr
], eax ;
# pointer to jjnr[nj0]
18424 .mci1020_inner_loop:
18425 ;
# a single j particle iteration here - compare with the unrolled code for comments
18426 mov eax
, [esp
+ mci1020_innerjjnr
]
18427 mov eax
, [eax
] ;
# eax=jnr offset
18428 add dword ptr
[esp
+ mci1020_innerjjnr
], 4 ;
# advance pointer
;# NOTE(review): in the visible loop body ecx is only written by the
;# charge-base load just below, never from innerjjnr (eax is used for
;# that), so this prefetch address may not be the jjnr data the comment
;# implies - confirm against the full file.
18429 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
18431 mov ecx
, [ebp
+ mci1020_charge
]
18432 movd mm7
, [ecx
+ eax
*4]
;# NOTE(review): listing 18433-18434 are absent; how mm6 receives the j
;# charge before the multiply by iqO is not visible.
18435 pfmul mm6
, [esp
+ mci1020_iqO
]
18436 pfmul mm7
, [esp
+ mci1020_iqH
] ;
# mm6=qqO, mm7=qqH
18438 lea eax
, [eax
+ eax
*2]
18440 movq mm0
, [esi
+ eax
*4]
18441 movd mm1
, [esi
+ eax
*4 + 8]
18442 ;
# copy & expand to mm2-mm4 for the H interactions
;# NOTE(review): the copy/expand instructions (listing 18443-18449) and
;# the squaring steps (18452-18455, 18457-18459) are absent from this
;# excerpt.
18450 pfsubr mm0
, [esp
+ mci1020_ixO
]
18451 pfsubr mm1
, [esp
+ mci1020_izO
]
18456 pfadd mm0
, mm1 ;
# mm0=rsqO
18460 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
18461 pfsubr mm2
, [esp
+ mci1020_ixH
]
18462 pfsubr mm3
, [esp
+ mci1020_iyH
]
18463 pfsubr mm4
, [esp
+ mci1020_izH
] ;
# mm2-mm4 is dxH-dzH
18470 pfadd mm3
,mm4 ;
# mm3=rsqH
;# NOTE(review): the rsqrt seed and first refinement steps (listing
;# 18471-18476, 18486-18489) are absent; only the final pfrcpit2 of each
;# sequence is visible.
18477 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
18478 ;
# calculate potential and scalar force
18479 pfmul mm6
, mm1 ;
# mm6=vcoul
18485 punpckldq mm5
,mm2 ;
# seeds are in mm5 now, and rsq in mm3
18490 pfrcpit2 mm5
,mm2 ;
# mm5=invsqrt
18491 pfmul mm7
, mm5 ;
# mm7=vcoul
;# NOTE(review): listing 18492-18493 are absent; no add of the O term
;# (mm6) into mm7 is visible before vctot is updated.
18494 pfadd mm7
, [esp
+ mci1020_vctot
]
18495 movq
[esp
+ mci1020_vctot
], mm7
18497 ;
# done - one more?
;# innerk counts remaining j entries; zero ends the inner loop
18498 dec dword ptr
[esp
+ mci1020_innerk
]
18499 jz
.mci1020_updateouterdata
18500 jmp
.mci1020_inner_loop
18501 .mci1020_updateouterdata:
18502 mov edx
, [ebp
+ mci1020_gid
] ;
# get group index for this i particle
18504 add dword ptr
[ebp
+ mci1020_gid
], 4 ;
# advance pointer
18506 movq mm7
, [esp
+ mci1020_vctot
]
18507 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
18509 mov eax
, [ebp
+ mci1020_Vc
]
;# NOTE(review): listing 18503 and 18511 are absent. As visible, edx is
;# used as an index although it was loaded with the gid pointer, and mm6
;# is stored back without an add of mm7 - confirm against the full file.
18510 movd mm6
, [eax
+ edx
*4]
18512 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
;# nri is decremented in place in the caller-provided argument slot
;# NOTE(review): listing 18516 and 18518-18531 are absent; the branch that
;# acts on this decrement and the function exit are not visible.
18515 dec dword ptr
[ebp
+ mci1020_nri
]
18517 ;
# not last, iterate once more!
# mcinl1030_3dnow: symbol exported both with and without a leading underscore.
# NOTE(review): presumably a water-water Coulomb energy loop (the locals keep
# O/H charge products qqOO, qqOH, qqHH) - confirm against the complete source.
18532 .globl mcinl1030_3dnow
18533 .globl _mcinl1030_3dnow
# offsets of the routine's arguments; the code reads them as [ebp + name]
18536 .equiv mci1030_nri, 8
18537 .equiv mci1030_iinr, 12
18538 .equiv mci1030_jindex, 16
18539 .equiv mci1030_jjnr, 20
18540 .equiv mci1030_shift, 24
18541 .equiv mci1030_shiftvec, 28
18542 .equiv mci1030_gid, 32
18543 .equiv mci1030_pos, 36
18544 .equiv mci1030_charge, 40
18545 .equiv mci1030_facel, 44
18546 .equiv mci1030_Vc, 48
18547 ;
# stack offsets for local variables; the code reads them as [esp + name]
# (entries from ixH through vctot are spaced 8 bytes apart and are stored with movq)
18548 .equiv mci1030_is3, 0
18549 .equiv mci1030_ii3, 4
18550 .equiv mci1030_ixO, 8
18551 .equiv mci1030_iyO, 12
18552 .equiv mci1030_izO, 16
18553 .equiv mci1030_ixH, 20
18554 .equiv mci1030_iyH, 28
18555 .equiv mci1030_izH, 36
18556 .equiv mci1030_qqOO, 44
18557 .equiv mci1030_qqOH, 52
18558 .equiv mci1030_qqHH, 60
18559 .equiv mci1030_vctot, 68
18560 .equiv mci1030_innerjjnr, 76
18561 .equiv mci1030_innerk, 80
18570 sub esp
, 84 ;
# local stack space: 84 bytes, i.e. up to and including the dword innerk at offset 80
# Setup for mcinl1030: build the charge products, load the shift vector and the
# i-molecule coordinates, and work out the j-list bounds for the inner loop.
# NOTE(review): the embedded source line numbers skip in places, so some
# instructions (and the outer-loop label) are not visible in this listing.
18572 ;
# assume we have at least one i particle - start directly
# (iinr[] is only read here; the pointer is advanced further down)
18574 mov ecx
, [ebp
+ mci1030_iinr
] ;
# ecx = pointer into iinr[]
18575 mov ebx
, [ecx
] ;
# ebx=ii
18577 mov edx
, [ebp
+ mci1030_charge
]
18578 movd mm1
, [ebp
+ mci1030_facel
] ;
# mm1=facel
18579 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii0] (O)
18580 movd mm3
, [edx
+ ebx
*4 + 4] ;
# mm3=charge[ii0+1] (H)
18586 pfmul mm4
, mm2 ;
# mm4=qqOO*facel
18587 pfmul mm5
, mm3 ;
# mm5=qqOH*facel
18588 pfmul mm6
, mm3 ;
# mm6=qqHH*facel
18589 punpckldq mm5
,mm5 ;
# spread to both halves
18590 punpckldq mm6
,mm6 ;
# spread to both halves
18591 movq
[esp
+ mci1030_qqOO
], mm4
18592 movq
[esp
+ mci1030_qqOH
], mm5
18593 movq
[esp
+ mci1030_qqHH
], mm6
18595 mov eax
, [ebp
+ mci1030_shift
] ;
# eax = pointer into shift[]
18596 mov ebx
, [eax
] ;
# ebx=shift[n]
18597 add dword ptr
[ebp
+ mci1030_shift
], 4 ;
# advance pointer one step
18599 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
18600 mov
[esp
+ mci1030_is3
],ebx ;
# store is3
18602 mov eax
, [ebp
+ mci1030_shiftvec
] ;
# eax = base of shiftvec[]
18604 movq mm5
, [eax
+ ebx
*4] ;
# move shX/shY to mm5 and shZ to mm6
18605 movd mm6
, [eax
+ ebx
*4 + 8]
18609 punpckldq mm0
,mm0 ;
# also expand shX,Y,Z in mm0--mm2
18613 mov ecx
, [ebp
+ mci1030_iinr
] ;
# ecx = pointer into iinr[]
18614 add dword ptr
[ebp
+ mci1030_iinr
], 4 ;
# advance pointer
18615 mov ebx
, [ecx
] ;
# ebx=ii
18617 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
18618 mov eax
, [ebp
+ mci1030_pos
] ;
# eax = base of pos[]
18620 pfadd mm5
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
18621 movd mm7
, [eax
+ ebx
*4 + 8] ;
# can't use direct memory add for 4 bytes (iz)
18622 mov
[esp
+ mci1030_ii3
], ebx ;
# (use mm7 as temp storage for iz)
18624 movq
[esp
+ mci1030_ixO
], mm5
18625 movq
[esp
+ mci1030_izO
], mm6
# the six dwords at byte offsets 12..32 from pos + 4*ii3 are gathered pairwise
18627 movd mm3
, [eax
+ ebx
*4 + 12]
18628 movd mm4
, [eax
+ ebx
*4 + 16]
18629 movd mm5
, [eax
+ ebx
*4 + 20]
18630 punpckldq mm3
, [eax
+ ebx
*4 + 24]
18631 punpckldq mm4
, [eax
+ ebx
*4 + 28]
18632 punpckldq mm5
, [eax
+ ebx
*4 + 32] ;
# coords of H1 in low mm3-mm5, H2 in high
18637 movq
[esp
+ mci1030_ixH
], mm0
18638 movq
[esp
+ mci1030_iyH
], mm1
18639 movq
[esp
+ mci1030_izH
], mm2
18641 ;
# clear vctot and i forces
18643 movq
[esp
+ mci1030_vctot
], mm7
# jindex[n+1] - jindex[n] is the length of the j list for this i entry
18645 mov eax
, [ebp
+ mci1030_jindex
]
18646 mov ecx
, [eax
] ;
# jindex[n]
18647 mov edx
, [eax
+ 4] ;
# jindex[n+1]
18648 add dword ptr
[ebp
+ mci1030_jindex
], 4
18649 sub edx
, ecx ;
# number of innerloop atoms
18650 mov
[esp
+ mci1030_innerk
], edx ;
# number of innerloop atoms
18652 mov esi
, [ebp
+ mci1030_pos
]
18653 mov eax
, [ebp
+ mci1030_jjnr
]
18656 mov
[esp
+ mci1030_innerjjnr
], eax ;
# pointer to jjnr[nj0]
# Inner loop of mcinl1030: one jjnr[] entry per pass. Three sections follow,
# reading j coordinates at byte offsets 0, 12 and 24 from pos + 4*(3*jnr);
# each section adds its result into the vctot local. innerk counts the passes.
18657 .mci1030_inner_loop:
18658 ;
# a single j particle iteration here - compare with the unrolled code for comments
18659 mov eax
, [esp
+ mci1030_innerjjnr
]
18660 mov eax
, [eax
] ;
# eax=jnr offset
18661 add dword ptr
[esp
+ mci1030_innerjjnr
], 4 ;
# advance pointer
# first section: j atom at pos[3*jnr], charge products qqOO / qqOH
18663 movd mm6
, [esp
+ mci1030_qqOO
]
18664 movq mm7
, [esp
+ mci1030_qqOH
]
18666 lea eax
, [eax
+ eax
*2]
18667 movq mm0
, [esi
+ eax
*4]
18668 movd mm1
, [esi
+ eax
*4 + 8]
18669 ;
# copy & expand to mm2-mm4 for the H interactions
18677 pfsubr mm0
, [esp
+ mci1030_ixO
]
18678 pfsubr mm1
, [esp
+ mci1030_izO
]
18683 pfadd mm0
, mm1 ;
# mm0=rsqO
18687 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
18688 pfsubr mm2
, [esp
+ mci1030_ixH
]
18689 pfsubr mm3
, [esp
+ mci1030_iyH
]
18690 pfsubr mm4
, [esp
+ mci1030_izH
] ;
# mm2-mm4 is dxH-dzH
18697 pfadd mm3
,mm4 ;
# mm3=rsqH
18704 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
18705 ;
# calculate potential and scalar force
18706 pfmul mm6
, mm1 ;
# mm6=vcoul
18712 punpckldq mm5
,mm2 ;
# seeds are in mm5 now, and rsq in mm3
18717 pfrcpit2 mm5
,mm2 ;
# mm5=invsqrt
18718 pfmul mm7
, mm5 ;
# mm7=vcoul
18721 pfadd mm7
, [esp
+ mci1030_vctot
]
18722 movq
[esp
+ mci1030_vctot
], mm7
18724 ;
# interactions with j H1
18725 movq mm0
, [esi
+ eax
*4 + 12]
18726 movd mm1
, [esi
+ eax
*4 + 20]
18727 ;
# copy & expand to mm2-mm4 for the H interactions
18735 movd mm6
, [esp
+ mci1030_qqOH
]
18736 movq mm7
, [esp
+ mci1030_qqHH
]
18738 pfsubr mm0
, [esp
+ mci1030_ixO
]
18739 pfsubr mm1
, [esp
+ mci1030_izO
]
18744 pfadd mm0
, mm1 ;
# mm0=rsqO
18748 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
18749 pfsubr mm2
, [esp
+ mci1030_ixH
]
18750 pfsubr mm3
, [esp
+ mci1030_iyH
]
18751 pfsubr mm4
, [esp
+ mci1030_izH
] ;
# mm2-mm4 is dxH-dzH
18758 pfadd mm3
,mm4 ;
# mm3=rsqH
18765 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
18766 ;
# calculate potential and scalar force
18767 pfmul mm6
, mm1 ;
# mm6=vcoul
18773 punpckldq mm5
,mm2 ;
# seeds are in mm5 now, and rsq in mm3
18778 pfrcpit2 mm5
,mm2 ;
# mm5=invsqrt
18779 pfmul mm7
, mm5 ;
# mm7=vcoul
18782 pfadd mm7
, [esp
+ mci1030_vctot
]
18783 movq
[esp
+ mci1030_vctot
], mm7
18785 ;
# interactions with j H2
18786 movq mm0
, [esi
+ eax
*4 + 24]
18787 movd mm1
, [esi
+ eax
*4 + 32]
18788 ;
# copy & expand to mm2-mm4 for the H interactions
18796 movd mm6
, [esp
+ mci1030_qqOH
]
18797 movq mm7
, [esp
+ mci1030_qqHH
]
18799 pfsubr mm0
, [esp
+ mci1030_ixO
]
18800 pfsubr mm1
, [esp
+ mci1030_izO
]
18805 pfadd mm0
, mm1 ;
# mm0=rsqO
18809 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
18810 pfsubr mm2
, [esp
+ mci1030_ixH
]
18811 pfsubr mm3
, [esp
+ mci1030_iyH
]
18812 pfsubr mm4
, [esp
+ mci1030_izH
] ;
# mm2-mm4 is dxH-dzH
18819 pfadd mm3
,mm4 ;
# mm3=rsqH
18826 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
18827 ;
# calculate potential and scalar force
18828 pfmul mm6
, mm1 ;
# mm6=vcoul
18834 punpckldq mm5
,mm2 ;
# seeds are in mm5 now, and rsq in mm3
18839 pfrcpit2 mm5
,mm2 ;
# mm5=invsqrt
18840 pfmul mm7
, mm5 ;
# mm7=vcoul
18843 pfadd mm7
, [esp
+ mci1030_vctot
]
18844 movq
[esp
+ mci1030_vctot
], mm7
18846 ;
# done - one more?
# innerk reaching zero leaves the loop; otherwise branch back to the top
18847 dec dword ptr
[esp
+ mci1030_innerk
]
18848 jz
.mci1030_updateouterdata
18849 jmp
.mci1030_inner_loop
# After the inner loop: combine the two halves of vctot, update the Vc[]
# entry selected by gid for this i particle, and count down nri.
18850 .mci1030_updateouterdata:
18851 mov edx
, [ebp
+ mci1030_gid
] ;
# get group index for this i particle
18853 add dword ptr
[ebp
+ mci1030_gid
], 4 ;
# advance pointer
18855 movq mm7
, [esp
+ mci1030_vctot
]
18856 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
18858 mov eax
, [ebp
+ mci1030_Vc
]
18859 movd mm6
, [eax
+ edx
*4]
# NOTE(review): the instruction adding mm7 into mm6 is not visible in this listing
18861 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
# one i particle finished: decrement the nri argument in place
18863 dec dword ptr
[ebp
+ mci1030_nri
]
18865 ;
# not last, iterate once more!
# mcinl1100_3dnow: symbol exported both with and without a leading underscore.
# The routine accumulates two totals, vctot and vnbtot, and writes them to the
# Vc[] and Vnb[] arguments (see the locals and argument offsets below).
18880 .globl mcinl1100_3dnow
18881 .globl _mcinl1100_3dnow
# offsets of the routine's arguments; the code reads them as [ebp + name]
18884 .equiv mci1100_nri, 8
18885 .equiv mci1100_iinr, 12
18886 .equiv mci1100_jindex, 16
18887 .equiv mci1100_jjnr, 20
18888 .equiv mci1100_shift, 24
18889 .equiv mci1100_shiftvec, 28
18890 .equiv mci1100_gid, 32
18891 .equiv mci1100_pos, 36
18892 .equiv mci1100_charge, 40
18893 .equiv mci1100_facel, 44
18894 .equiv mci1100_Vc, 48
18895 .equiv mci1100_type, 52
18896 .equiv mci1100_ntype, 56
18897 .equiv mci1100_nbfp, 60
18898 .equiv mci1100_Vnb, 64
18899 ;
# stack offsets for local variables; the code reads them as [esp + name]
# (iq, vctot, vnbtot, c6 and c12 are 8-byte slots stored with movq)
18900 .equiv mci1100_is3, 0
18901 .equiv mci1100_ii3, 4
18902 .equiv mci1100_ix, 8
18903 .equiv mci1100_iy, 12
18904 .equiv mci1100_iz, 16
18905 .equiv mci1100_iq, 20
18906 .equiv mci1100_vctot, 28
18907 .equiv mci1100_vnbtot, 36
18908 .equiv mci1100_c6, 44
18909 .equiv mci1100_c12, 52
18910 .equiv mci1100_ntia, 60
18911 .equiv mci1100_innerjjnr, 64
18912 .equiv mci1100_innerk, 68
18922 sub esp
, 72 ;
# local stack space: 72 bytes, i.e. up to and including the dword innerk at offset 68
# Per-i setup for mcinl1100: shift vector, i charge (times facel), type row
# offset ntia, shifted i coordinates, cleared totals, and j-list bounds.
# NOTE(review): the embedded source line numbers skip in places, so some
# instructions (and the outer-loop label) are not visible in this listing.
18924 ;
# move data to local stack
18925 ;
# assume we have at least one i particle - start directly
18927 mov eax
, [ebp
+ mci1100_shift
] ;
# eax = pointer into shift[]
18928 mov ebx
, [eax
] ;
# ebx=shift[n]
18929 add dword ptr
[ebp
+ mci1100_shift
], 4 ;
# advance pointer one step
18931 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
18932 mov
[esp
+ mci1100_is3
],ebx ;
# store is3
18934 mov eax
, [ebp
+ mci1100_shiftvec
] ;
# eax = base of shiftvec[]
18936 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
18937 movd mm1
, [eax
+ ebx
*4 + 8]
18939 mov ecx
, [ebp
+ mci1100_iinr
] ;
# ecx = pointer into iinr[]
18940 add dword ptr
[ebp
+ mci1100_iinr
], 4 ;
# advance pointer
18941 mov ebx
, [ecx
] ;
# ebx=ii
18943 mov edx
, [ebp
+ mci1100_charge
]
18944 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
18945 pfmul mm2
, [ebp
+ mci1100_facel
]
18946 punpckldq mm2
,mm2 ;
# spread to both halves
18947 movq
[esp
+ mci1100_iq
], mm2 ;
# iq =facel*charge[ii]
# ntia = type[ii] * ntype (a further scaling step may be among the lines not shown)
18949 mov edx
, [ebp
+ mci1100_type
]
18950 mov edx
, [edx
+ ebx
*4]
18951 imul edx
, [ebp
+ mci1100_ntype
]
18953 mov
[esp
+ mci1100_ntia
], edx
18955 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
18956 mov eax
, [ebp
+ mci1100_pos
] ;
# eax = base of pos[]
18958 pfadd mm0
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
18959 movd mm3
, [eax
+ ebx
*4 + 8] ;
# can't use direct memory add for 4 bytes (iz)
18960 mov
[esp
+ mci1100_ii3
], ebx
18962 movq
[esp
+ mci1100_ix
], mm0
18963 movd
[esp
+ mci1100_iz
], mm1
18965 ;
# clear total potential and i forces
18967 movq
[esp
+ mci1100_vctot
], mm7
18968 movq
[esp
+ mci1100_vnbtot
], mm7
# jindex[n+1] - jindex[n] is the length of the j list for this i entry
18970 mov eax
, [ebp
+ mci1100_jindex
]
18971 mov ecx
, [eax
] ;
# jindex[n]
18972 mov edx
, [eax
+ 4] ;
# jindex[n+1]
18973 add dword ptr
[ebp
+ mci1100_jindex
], 4
18974 sub edx
, ecx ;
# number of innerloop atoms
18976 mov esi
, [ebp
+ mci1100_pos
]
18977 mov eax
, [ebp
+ mci1100_jjnr
]
18980 mov
[esp
+ mci1100_innerjjnr
], eax ;
# pointer to jjnr[nj0]
# NOTE(review): mov sets no flags, so the jge below relies on flags from an
# instruction that is not visible here (embedded line 18981 is absent) - confirm.
18982 mov
[esp
+ mci1100_innerk
], edx ;
# number of innerloop atoms
18983 jge
.mci1100_unroll_loop
18984 jmp
.mci1100_finish_inner
# Unrolled inner loop of mcinl1100: two jjnr[] entries per pass (innerjjnr
# advances by 8 bytes, innerk drops by 2). Results are added into the vctot
# and vnbtot locals, one entry per 32-bit half.
18985 .mci1100_unroll_loop:
18986 ;
# paired innerloop starts here
18987 mov ecx
, [esp
+ mci1100_innerjjnr
] ;
# pointer to jjnr[k]
# NOTE(review): the load of the first jnr into eax is not visible in this listing
18989 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
18990 add dword ptr
[esp
+ mci1100_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
18991 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
18993 mov ecx
, [ebp
+ mci1100_charge
] ;
# base of charge[]
18994 movq mm5
, [esp
+ mci1100_iq
]
18995 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
18996 punpckldq mm3
, [ecx
+ ebx
*4] ;
# move charge 2 to high part of mm3
18997 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
18999 mov ecx
, [ebp
+ mci1100_type
]
19000 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
19001 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
19003 mov esi
, [ebp
+ mci1100_nbfp
] ;
# base of nbfp
19006 add edx
, [esp
+ mci1100_ntia
] ;
# tja = ntia + 2*type
19007 add ecx
, [esp
+ mci1100_ntia
]
19009 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
19010 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
19012 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
19013 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
19014 movq
[esp
+ mci1100_c6
], mm5
19015 movq
[esp
+ mci1100_c12
], mm6
19017 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
19018 lea ebx
, [ebx
+ ebx
*2]
# esi was used for nbfp above; reload it with the pos[] base
19020 mov esi
, [ebp
+ mci1100_pos
]
19022 movq mm0
, [esp
+ mci1100_ix
]
19023 movd mm1
, [esp
+ mci1100_iz
]
19024 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
19025 movd mm5
, [esi
+ eax
*4 + 8]
19026 pfsubr mm4
,mm0 ;
# dr = ir - jr
19028 pfmul mm4
,mm4 ;
# square dx,dy,dz
19030 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
19031 pfacc mm4
, mm5 ;
# first rsq in lower mm4
19033 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
19034 movd mm7
, [esi
+ ebx
*4 + 8]
19036 pfsubr mm6
,mm0 ;
# dr = ir - jr
19038 pfmul mm6
,mm6 ;
# square dx,dy,dz
19040 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
19041 pfacc mm6
, mm7 ;
# second rsq in lower mm6
19043 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
19047 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs
19048 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision
19054 ;
# mm0 now contains invsq, and mm1 invsqrt
19055 ;
# do potential and fscal
19058 pfmul mm4
, mm0 ;
# mm4=rinvsix
19060 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
19062 pfmul mm3
, mm1 ;
# mm3 has vcoul for both interactions
19063 pfmul mm5
, [esp
+ mci1100_c12
]
19064 pfmul mm4
, [esp
+ mci1100_c6
]
19065 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
19068 pfadd mm3
, [esp
+ mci1100_vctot
] ;
# add the earlier value
19069 movq
[esp
+ mci1100_vctot
], mm3 ;
# store the sum
19071 pfadd mm6
, [esp
+ mci1100_vnbtot
] ;
# add the earlier value
19072 movq
[esp
+ mci1100_vnbtot
], mm6 ;
# store the sum
19074 ;
# should we do one more iteration?
# a negative innerk after subtracting 2 ends the paired loop
19075 sub dword ptr
[esp
+ mci1100_innerk
], 2
19076 jl
.mci1100_finish_inner
19077 jmp
.mci1100_unroll_loop
# Remainder handling for mcinl1100: if bit 0 of innerk is set, one jjnr[] entry
# is left and is processed by the single-particle path; otherwise skip to the
# outer-data update.
19078 .mci1100_finish_inner:
19079 and dword ptr
[esp
+ mci1100_innerk
], 1
19080 jnz
.mci1100_single_inner
19081 jmp
.mci1100_updateouterdata
19082 .mci1100_single_inner:
19083 ;
# a single j particle iteration here - compare with the unrolled code for comments
19084 mov eax
, [esp
+ mci1100_innerjjnr
]
19085 mov eax
, [eax
] ;
# eax=jnr offset
19087 mov ecx
, [ebp
+ mci1100_charge
]
19088 movd mm5
, [esp
+ mci1100_iq
]
19089 movd mm3
, [ecx
+ eax
*4]
19090 pfmul mm3
, mm5 ;
# mm3=qq
19092 mov esi
, [ebp
+ mci1100_nbfp
]
19093 mov ecx
, [ebp
+ mci1100_type
]
19094 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
19096 add edx
, [esp
+ mci1100_ntia
] ;
# tja = ntia + 2*type
19097 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
19098 movq
[esp
+ mci1100_c6
], mm5
19099 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
19100 movq
[esp
+ mci1100_c12
], mm5
# esi was used for nbfp above; reload it with the pos[] base
19103 mov esi
, [ebp
+ mci1100_pos
]
19104 lea eax
, [eax
+ eax
*2]
19106 movq mm0
, [esp
+ mci1100_ix
]
19107 movd mm1
, [esp
+ mci1100_iz
]
19108 movq mm4
, [esi
+ eax
*4]
19109 movd mm5
, [esi
+ eax
*4 + 8]
19115 pfacc mm4
, mm5 ;
# mm4=rsq
19121 pfrcpit2 mm0
,mm2 ;
# mm0=invsqrt
19123 pfmul mm0
, mm0 ;
# mm0=invsq
19124 ;
# calculate potentials and scalar force
19127 pfmul mm4
, mm0 ;
# mm4=rinvsix
19129 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
19131 pfmul mm3
, mm1 ;
# mm3 has vcoul (only one j particle in this path)
19132 pfmul mm5
, [esp
+ mci1100_c12
]
19133 pfmul mm4
, [esp
+ mci1100_c6
]
19134 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
19137 pfadd mm3
, [esp
+ mci1100_vctot
]
19138 movq
[esp
+ mci1100_vctot
], mm3
19140 pfadd mm6
, [esp
+ mci1100_vnbtot
] ;
# add the earlier value
19141 movq
[esp
+ mci1100_vnbtot
], mm6 ;
# store the sum
# After the inner loops: combine the two halves of vctot and vnbtot and update
# the Vc[] and Vnb[] entries selected by gid, then handle the nri counter.
19143 .mci1100_updateouterdata:
19144 mov edx
, [ebp
+ mci1100_gid
] ;
# get group index for this i particle
19146 add dword ptr
[ebp
+ mci1100_gid
], 4 ;
# advance pointer
19148 movq mm7
, [esp
+ mci1100_vctot
]
19149 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
19151 mov eax
, [ebp
+ mci1100_Vc
]
19152 movd mm6
, [eax
+ edx
*4]
# NOTE(review): the instruction adding mm7 into mm6 is not visible in this listing
19154 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
19156 movq mm7
, [esp
+ mci1100_vnbtot
]
19157 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
19159 mov eax
, [ebp
+ mci1100_Vnb
]
19160 movd mm6
, [eax
+ edx
*4]
19162 movd
[eax
+ edx
*4], mm6 ;
# increment vnb[gid]
# nri is read into ecx and written back below; the decrement and branch that
# belong between them are not visible in this listing
19165 mov ecx
, [ebp
+ mci1100_nri
]
19168 ;
# not last, iterate once more!
19169 mov
[ebp
+ mci1100_nri
], ecx
# mcinl1110_3dnow: symbol exported both with and without a leading underscore.
# Compared with mcinl1100 it takes an extra nsatoms argument and keeps three
# per-i counters (nsvdwc, nscoul, nsvdw), each driving its own inner loop.
19187 .globl mcinl1110_3dnow
19188 .globl _mcinl1110_3dnow
# offsets of the routine's arguments; the code reads them as [ebp + name]
19191 .equiv mci1110_nri, 8
19192 .equiv mci1110_iinr, 12
19193 .equiv mci1110_jindex, 16
19194 .equiv mci1110_jjnr, 20
19195 .equiv mci1110_shift, 24
19196 .equiv mci1110_shiftvec, 28
19197 .equiv mci1110_gid, 32
19198 .equiv mci1110_pos, 36
19199 .equiv mci1110_charge, 40
19200 .equiv mci1110_facel, 44
19201 .equiv mci1110_Vc, 48
19202 .equiv mci1110_type, 52
19203 .equiv mci1110_ntype, 56
19204 .equiv mci1110_nbfp, 60
19205 .equiv mci1110_Vnb, 64
19206 .equiv mci1110_nsatoms, 68
19207 ;
# stack offsets for local variables; the code reads them as [esp + name]
# (innerjjnr0/innerk0 hold the start values that are copied into
# innerjjnr/innerk before each inner loop)
19208 .equiv mci1110_is3, 0
19209 .equiv mci1110_ii3, 4
19210 .equiv mci1110_shX, 8
19211 .equiv mci1110_shY, 12
19212 .equiv mci1110_shZ, 16
19213 .equiv mci1110_ix, 20
19214 .equiv mci1110_iy, 24
19215 .equiv mci1110_iz, 28
19216 .equiv mci1110_iq, 32
19217 .equiv mci1110_vctot, 40
19218 .equiv mci1110_vnbtot, 48
19219 .equiv mci1110_c6, 56
19220 .equiv mci1110_c12, 64
19221 .equiv mci1110_ntia, 72
19222 .equiv mci1110_innerjjnr0, 76
19223 .equiv mci1110_innerk0, 80
19224 .equiv mci1110_innerjjnr, 84
19225 .equiv mci1110_innerk, 88
19226 .equiv mci1110_nsvdwc, 92
19227 .equiv mci1110_nscoul, 96
19228 .equiv mci1110_nsvdw, 100
19229 .equiv mci1110_solnr, 104
19238 sub esp
, 108 ;
# local stack space: 108 bytes, i.e. up to and including the dword solnr at offset 104
# Per-i setup for mcinl1110, followed by the per-atom setup of the first
# (vdwc) atom loop: shift vector, atom counters, j-list bounds, then charge,
# type offset and shifted coordinates of the atom indexed by solnr.
# NOTE(review): the embedded source line numbers skip in places; several
# instructions and the labels .mci1110_mno_vdwc / outer loop are not visible.
19240 ;
# assume we have at least one i particle - start directly
19242 mov eax
, [ebp
+ mci1110_shift
] ;
# eax = pointer into shift[]
19243 mov ebx
, [eax
] ;
# ebx=shift[n]
19244 add dword ptr
[ebp
+ mci1110_shift
], 4 ;
# advance pointer one step
19246 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
19247 mov
[esp
+ mci1110_is3
],ebx ;
# store is3
19249 mov eax
, [ebp
+ mci1110_shiftvec
] ;
# eax = base of shiftvec[]
19251 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
19252 movd mm1
, [eax
+ ebx
*4 + 8]
19253 movq
[esp
+ mci1110_shX
], mm0
19254 movd
[esp
+ mci1110_shZ
], mm1
19256 mov ecx
, [ebp
+ mci1110_iinr
] ;
# ecx = pointer into iinr[]
19257 add dword ptr
[ebp
+ mci1110_iinr
], 4 ;
# advance pointer
19258 mov ebx
, [ecx
] ;
# ebx=ii
# the nsatoms pointer advances by 12 bytes (three dwords) per i entry
19260 mov eax
, [ebp
+ mci1110_nsatoms
]
19261 add dword ptr
[ebp
+ mci1110_nsatoms
], 12
# NOTE(review): presumably edx/eax/ecx were loaded from the three nsatoms
# entries by instructions not visible here - confirm
19268 mov
[esp
+ mci1110_nsvdwc
], edx
19269 mov
[esp
+ mci1110_nscoul
], eax
19270 mov
[esp
+ mci1110_nsvdw
], ecx
19274 movq
[esp
+ mci1110_vctot
], mm7
19275 movq
[esp
+ mci1110_vnbtot
], mm7
# solnr starts at ii and is incremented once per processed atom
19276 mov
[esp
+ mci1110_solnr
], ebx
# jindex[n+1] - jindex[n] is the length of the j list for this i entry
19278 mov eax
, [ebp
+ mci1110_jindex
]
19279 mov ecx
, [eax
] ;
# jindex[n]
19280 mov edx
, [eax
+ 4] ;
# jindex[n+1]
19281 add dword ptr
[ebp
+ mci1110_jindex
], 4
19282 sub edx
, ecx ;
# number of innerloop atoms
19283 mov eax
, [ebp
+ mci1110_jjnr
]
19286 mov
[esp
+ mci1110_innerjjnr0
], eax ;
# pointer to jjnr[nj0]
19288 mov
[esp
+ mci1110_innerk0
], edx ;
# number of innerloop atoms
19289 mov esi
, [ebp
+ mci1110_pos
]
# NOTE(review): mov sets no flags; the jnz below relies on a compare/test
# that is not visible in this listing - confirm
19291 mov ecx
, [esp
+ mci1110_nsvdwc
]
19293 jnz
.mci1110_mno_vdwc
19294 jmp
.mci1110_testcoul
# per-atom setup for the vdwc loop (its label is not visible in this listing)
19296 mov ebx
, [esp
+ mci1110_solnr
]
19297 inc dword ptr
[esp
+ mci1110_solnr
]
19298 mov edx
, [ebp
+ mci1110_charge
]
19299 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
19300 pfmul mm2
, [ebp
+ mci1110_facel
]
19301 punpckldq mm2
,mm2 ;
# spread to both halves
19302 movq
[esp
+ mci1110_iq
], mm2 ;
# iq =facel*charge[ii]
19304 mov edx
, [ebp
+ mci1110_type
]
19305 mov edx
, [edx
+ ebx
*4]
19306 imul edx
, [ebp
+ mci1110_ntype
]
19308 mov
[esp
+ mci1110_ntia
], edx
19310 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
19311 mov eax
, [ebp
+ mci1110_pos
] ;
# eax = base of pos[]
19312 mov
[esp
+ mci1110_ii3
], ebx
# i coordinates = pos + shift vector saved in shX..shZ
19314 movq mm0
, [eax
+ ebx
*4]
19315 movd mm1
, [eax
+ ebx
*4 + 8]
19316 pfadd mm0
, [esp
+ mci1110_shX
]
19317 pfadd mm1
, [esp
+ mci1110_shZ
]
19318 movq
[esp
+ mci1110_ix
], mm0
19319 movd
[esp
+ mci1110_iz
], mm1
# restart the j list from its saved start for this atom
19321 mov ecx
, [esp
+ mci1110_innerjjnr0
]
19322 mov
[esp
+ mci1110_innerjjnr
], ecx
19323 mov edx
, [esp
+ mci1110_innerk0
]
# NOTE(review): the jge below relies on flags from an instruction not visible
# here (embedded line 19324 is absent) - confirm
19325 mov
[esp
+ mci1110_innerk
], edx ;
# number of innerloop atoms
19326 jge
.mci1110_unroll_vdwc_loop
19327 jmp
.mci1110_finish_vdwc_inner
# Unrolled vdwc inner loop of mcinl1110: two jjnr[] entries per pass
# (innerjjnr advances by 8 bytes, innerk drops by 2). Results are added into
# both the vctot and vnbtot locals.
19328 .mci1110_unroll_vdwc_loop:
19329 ;
# paired innerloop starts here
19330 mov ecx
, [esp
+ mci1110_innerjjnr
] ;
# pointer to jjnr[k]
# NOTE(review): the load of the first jnr into eax is not visible in this listing
19332 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
19333 add dword ptr
[esp
+ mci1110_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
19334 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
19336 mov ecx
, [ebp
+ mci1110_charge
] ;
# base of charge[]
19337 movq mm5
, [esp
+ mci1110_iq
]
19338 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
19339 punpckldq mm3
, [ecx
+ ebx
*4] ;
# move charge 2 to high part of mm3
19340 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
19342 mov ecx
, [ebp
+ mci1110_type
]
19343 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
19344 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
19346 mov esi
, [ebp
+ mci1110_nbfp
] ;
# base of nbfp
19349 add edx
, [esp
+ mci1110_ntia
] ;
# tja = ntia + 2*type
19350 add ecx
, [esp
+ mci1110_ntia
]
19352 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
19353 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
19355 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
19356 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
19357 movq
[esp
+ mci1110_c6
], mm5
19358 movq
[esp
+ mci1110_c12
], mm6
19360 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
19361 lea ebx
, [ebx
+ ebx
*2]
# esi was used for nbfp above; reload it with the pos[] base
19363 mov esi
, [ebp
+ mci1110_pos
]
19365 movq mm0
, [esp
+ mci1110_ix
]
19366 movd mm1
, [esp
+ mci1110_iz
]
19367 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
19368 movd mm5
, [esi
+ eax
*4 + 8]
19369 pfsubr mm4
,mm0 ;
# dr = ir - jr
19371 pfmul mm4
,mm4 ;
# square dx,dy,dz
19373 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
19374 pfacc mm4
, mm5 ;
# first rsq in lower mm4
19376 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
19377 movd mm7
, [esi
+ ebx
*4 + 8]
19379 pfsubr mm6
,mm0 ;
# dr = ir - jr
19381 pfmul mm6
,mm6 ;
# square dx,dy,dz
19383 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
19384 pfacc mm6
, mm7 ;
# second rsq in lower mm6
19386 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
19390 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs
19391 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision
19397 ;
# mm0 now contains invsq, and mm1 invsqrt
19398 ;
# do potential and fscal
19401 pfmul mm4
, mm0 ;
# mm4=rinvsix
19403 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
19405 pfmul mm3
, mm1 ;
# mm3 has vcoul for both interactions
19406 pfmul mm5
, [esp
+ mci1110_c12
]
19407 pfmul mm4
, [esp
+ mci1110_c6
]
19408 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
19411 pfadd mm3
, [esp
+ mci1110_vctot
] ;
# add the earlier value
19412 movq
[esp
+ mci1110_vctot
], mm3 ;
# store the sum
19414 pfadd mm6
, [esp
+ mci1110_vnbtot
] ;
# add the earlier value
19415 movq
[esp
+ mci1110_vnbtot
], mm6 ;
# store the sum
19417 ;
# should we do one more iteration?
# a negative innerk after subtracting 2 ends the paired loop
19418 sub dword ptr
[esp
+ mci1110_innerk
], 2
19419 jl
.mci1110_finish_vdwc_inner
19420 jmp
.mci1110_unroll_vdwc_loop
# Remainder handling for the vdwc loop: if bit 0 of innerk is set, one jjnr[]
# entry is left and is processed by the single-particle path below.
19421 .mci1110_finish_vdwc_inner:
19422 and dword ptr
[esp
+ mci1110_innerk
], 1
19423 jnz
.mci1110_single_vdwc_inner
19424 jmp
.mci1110_updateouterdata_vdwc
19425 .mci1110_single_vdwc_inner:
19426 ;
# a single j particle iteration here - compare with the unrolled code for comments
19427 mov eax
, [esp
+ mci1110_innerjjnr
]
19428 mov eax
, [eax
] ;
# eax=jnr offset
19430 mov ecx
, [ebp
+ mci1110_charge
]
19431 movd mm5
, [esp
+ mci1110_iq
]
19432 movd mm3
, [ecx
+ eax
*4]
19433 pfmul mm3
, mm5 ;
# mm3=qq
19435 mov esi
, [ebp
+ mci1110_nbfp
]
19436 mov ecx
, [ebp
+ mci1110_type
]
19437 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
19439 add edx
, [esp
+ mci1110_ntia
] ;
# tja = ntia + 2*type
19440 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
19441 movq
[esp
+ mci1110_c6
], mm5
19442 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
19443 movq
[esp
+ mci1110_c12
], mm5
# esi was used for nbfp above; reload it with the pos[] base
19446 mov esi
, [ebp
+ mci1110_pos
]
19447 lea eax
, [eax
+ eax
*2]
19449 movq mm0
, [esp
+ mci1110_ix
]
19450 movd mm1
, [esp
+ mci1110_iz
]
19451 movq mm4
, [esi
+ eax
*4]
19452 movd mm5
, [esi
+ eax
*4 + 8]
19458 pfacc mm4
, mm5 ;
# mm4=rsq
19464 pfrcpit2 mm0
,mm2 ;
# mm0=invsqrt
19466 pfmul mm0
, mm0 ;
# mm0=invsq
19467 ;
# calculate potentials and scalar force
19470 pfmul mm4
, mm0 ;
# mm4=rinvsix
19472 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
19474 pfmul mm3
, mm1 ;
# mm3 has vcoul (only one j particle in this path)
19476 pfmul mm5
, [esp
+ mci1110_c12
]
19477 pfmul mm4
, [esp
+ mci1110_c6
]
19478 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
19481 pfadd mm3
, [esp
+ mci1110_vctot
]
19482 movq
[esp
+ mci1110_vctot
], mm3
19484 pfadd mm6
, [esp
+ mci1110_vnbtot
] ;
# add the earlier value
19485 movq
[esp
+ mci1110_vnbtot
], mm6 ;
# store the sum
# End of one vdwc atom: count down nsvdwc and either repeat the vdwc setup or
# move on to the coulomb-only atoms. The per-atom setup of the coul loop
# follows (charge and shifted coordinates only; no type lookup is done here).
# NOTE(review): the labels .mci1110_testcoul and .mci1110_mno_coul are not
# visible in this listing.
19486 .mci1110_updateouterdata_vdwc:
19487 ;
# loop back to mno
19488 dec dword ptr
[esp
+ mci1110_nsvdwc
]
19489 jz
.mci1110_testcoul
19490 jmp
.mci1110_mno_vdwc
# NOTE(review): mov sets no flags; the jnz below relies on a compare/test
# that is not visible in this listing - confirm
19492 mov ecx
, [esp
+ mci1110_nscoul
]
19494 jnz
.mci1110_mno_coul
19495 jmp
.mci1110_testvdw
# per-atom setup for the coul loop: take the next atom index from solnr
19497 mov ebx
, [esp
+ mci1110_solnr
]
19498 inc dword ptr
[esp
+ mci1110_solnr
]
19499 mov edx
, [ebp
+ mci1110_charge
]
19500 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
19501 pfmul mm2
, [ebp
+ mci1110_facel
]
19502 punpckldq mm2
,mm2 ;
# spread to both halves
19503 movq
[esp
+ mci1110_iq
], mm2 ;
# iq =facel*charge[ii]
19505 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
19506 mov eax
, [ebp
+ mci1110_pos
] ;
# eax = base of pos[]
19507 mov
[esp
+ mci1110_ii3
], ebx
# i coordinates = pos + shift vector saved in shX..shZ
19509 movq mm0
, [eax
+ ebx
*4]
19510 movd mm1
, [eax
+ ebx
*4 + 8]
19511 pfadd mm0
, [esp
+ mci1110_shX
]
19512 pfadd mm1
, [esp
+ mci1110_shZ
]
19513 movq
[esp
+ mci1110_ix
], mm0
19514 movd
[esp
+ mci1110_iz
], mm1
# restart the j list from its saved start for this atom
19516 mov ecx
, [esp
+ mci1110_innerjjnr0
]
19517 mov
[esp
+ mci1110_innerjjnr
], ecx
19518 mov edx
, [esp
+ mci1110_innerk0
]
# NOTE(review): the jge below relies on flags from an instruction not visible
# here (embedded line 19519 is absent) - confirm
19520 mov
[esp
+ mci1110_innerk
], edx ;
# number of innerloop atoms
19521 jge
.mci1110_unroll_coul_loop
19522 jmp
.mci1110_finish_coul_inner
# Unrolled coul inner loop of mcinl1110: two jjnr[] entries per pass; only the
# vctot local is updated here (no c6/c12 lookup in this loop).
# esi is not reloaded in this loop; it is used as the pos[] base as left by
# the preceding code.
19523 .mci1110_unroll_coul_loop:
19524 ;
# paired innerloop starts here
19525 mov ecx
, [esp
+ mci1110_innerjjnr
] ;
# pointer to jjnr[k]
# NOTE(review): the load of the first jnr into eax is not visible in this listing
19527 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
19528 add dword ptr
[esp
+ mci1110_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
19529 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
19531 mov ecx
, [ebp
+ mci1110_charge
] ;
# base of charge[]
19532 movq mm5
, [esp
+ mci1110_iq
]
19533 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
19534 movd mm7
, [ecx
+ ebx
*4] ;
# charge[jnr2]
19535 punpckldq mm3
,mm7 ;
# move charge 2 to high part of mm3
19536 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
19538 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
19539 lea ebx
, [ebx
+ ebx
*2]
19541 movq mm0
, [esp
+ mci1110_ix
]
19542 movd mm1
, [esp
+ mci1110_iz
]
19543 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
19544 movd mm5
, [esi
+ eax
*4 + 8]
19545 pfsubr mm4
,mm0 ;
# dr = ir - jr
19547 pfmul mm4
,mm4 ;
# square dx,dy,dz
19549 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
19550 pfacc mm4
, mm5 ;
# first rsq in lower mm4
19552 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
19553 movd mm7
, [esi
+ ebx
*4 + 8]
19555 pfsubr mm6
,mm0 ;
# dr = ir - jr
19557 pfmul mm6
,mm6 ;
# square dx,dy,dz
19559 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
19560 pfacc mm6
, mm7 ;
# second rsq in lower mm6
19562 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
19566 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs
19567 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision
19573 ;
# do potential and fscal
19574 pfmul mm3
,mm1 ;
# 3 has both vcoul
19575 pfadd mm3
, [esp
+ mci1110_vctot
] ;
# add the earlier value
19576 movq
[esp
+ mci1110_vctot
], mm3 ;
# store the sum
19578 ;
# should we do one more iteration?
# a negative innerk after subtracting 2 ends the paired loop
19579 sub dword ptr
[esp
+ mci1110_innerk
], 2
19580 jl
.mci1110_finish_coul_inner
19581 jmp
.mci1110_unroll_coul_loop
# Remainder handling for the coul loop: if bit 0 of innerk is set, one jjnr[]
# entry is left and is processed by the single-particle path below, which
# adds its result into vctot.
19582 .mci1110_finish_coul_inner:
19583 and dword ptr
[esp
+ mci1110_innerk
], 1
19584 jnz
.mci1110_single_coul_inner
19585 jmp
.mci1110_updateouterdata_coul
19586 .mci1110_single_coul_inner:
19587 ;
# a single j particle iteration here - compare with the unrolled code for comments
19588 mov eax
, [esp
+ mci1110_innerjjnr
]
19589 mov eax
, [eax
] ;
# eax=jnr offset
19591 mov ecx
, [ebp
+ mci1110_charge
]
19592 movd mm6
, [esp
+ mci1110_iq
]
19593 movd mm7
, [ecx
+ eax
*4]
19594 pfmul mm6
, mm7 ;
# mm6=qq
19596 lea eax
, [eax
+ eax
*2]
19598 movq mm0
, [esp
+ mci1110_ix
]
19599 movd mm1
, [esp
+ mci1110_iz
]
19600 movq mm2
, [esi
+ eax
*4]
19601 movd mm3
, [esi
+ eax
*4 + 8]
19607 pfacc mm0
, mm1 ;
# mm0=rsq
19613 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
19614 ;
# calculate potential and scalar force
19615 pfmul mm6
, mm1 ;
# mm6=vcoul
19617 pfadd mm6
, [esp
+ mci1110_vctot
]
19618 movq
[esp
+ mci1110_vctot
], mm6
# End of one coul atom: count down nscoul and either repeat the coul setup or
# move on to the vdw-only atoms. The per-atom setup of the vdw loop follows
# (type offset and shifted coordinates only; no charge is loaded here).
# NOTE(review): the labels .mci1110_testvdw and .mci1110_mno_vdw are not
# visible in this listing.
19620 .mci1110_updateouterdata_coul:
19621 ;
# loop back to mno
19622 dec dword ptr
[esp
+ mci1110_nscoul
]
19623 jz
.mci1110_testvdw
19624 jmp
.mci1110_mno_coul
# NOTE(review): mov sets no flags; the jnz below relies on a compare/test
# that is not visible in this listing - confirm
19626 mov ecx
, [esp
+ mci1110_nsvdw
]
19628 jnz
.mci1110_mno_vdw
19629 jmp
.mci1110_last_mno
# per-atom setup for the vdw loop: take the next atom index from solnr
19631 mov ebx
, [esp
+ mci1110_solnr
]
19632 inc dword ptr
[esp
+ mci1110_solnr
]
19634 mov edx
, [ebp
+ mci1110_type
]
19635 mov edx
, [edx
+ ebx
*4]
19636 imul edx
, [ebp
+ mci1110_ntype
]
19638 mov
[esp
+ mci1110_ntia
], edx
19640 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
19641 mov eax
, [ebp
+ mci1110_pos
] ;
# eax = base of pos[]
19642 mov
[esp
+ mci1110_ii3
], ebx
# i coordinates = pos + shift vector saved in shX..shZ
19644 movq mm0
, [eax
+ ebx
*4]
19645 movd mm1
, [eax
+ ebx
*4 + 8]
19646 pfadd mm0
, [esp
+ mci1110_shX
]
19647 pfadd mm1
, [esp
+ mci1110_shZ
]
19648 movq
[esp
+ mci1110_ix
], mm0
19649 movd
[esp
+ mci1110_iz
], mm1
# restart the j list from its saved start for this atom
19651 mov ecx
, [esp
+ mci1110_innerjjnr0
]
19652 mov
[esp
+ mci1110_innerjjnr
], ecx
19653 mov edx
, [esp
+ mci1110_innerk0
]
# NOTE(review): the jge below relies on flags from an instruction not visible
# here (embedded line 19654 is absent) - confirm
19655 mov
[esp
+ mci1110_innerk
], edx ;
# number of innerloop atoms
19656 jge
.mci1110_unroll_vdw_loop
19657 jmp
.mci1110_finish_vdw_inner
# Unrolled vdw inner loop of mcinl1110: two jjnr[] entries per pass; only the
# vnbtot local is updated here (no charge lookup in this loop).
19658 .mci1110_unroll_vdw_loop:
19659 ;
# paired innerloop starts here
19660 mov ecx
, [esp
+ mci1110_innerjjnr
] ;
# pointer to jjnr[k]
# NOTE(review): the load of the first jnr into eax is not visible in this listing
19662 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
19663 add dword ptr
[esp
+ mci1110_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
19664 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
19666 mov ecx
, [ebp
+ mci1110_type
]
19667 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
19668 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
19670 mov esi
, [ebp
+ mci1110_nbfp
] ;
# base of nbfp
19673 add edx
, [esp
+ mci1110_ntia
] ;
# tja = ntia + 2*type
19674 add ecx
, [esp
+ mci1110_ntia
]
19676 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
19677 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
19679 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
19680 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
19681 movq
[esp
+ mci1110_c6
], mm5
19682 movq
[esp
+ mci1110_c12
], mm6
19684 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
19685 lea ebx
, [ebx
+ ebx
*2]
# esi was used for nbfp above; reload it with the pos[] base
19687 mov esi
, [ebp
+ mci1110_pos
]
19689 movq mm0
, [esp
+ mci1110_ix
]
19690 movd mm1
, [esp
+ mci1110_iz
]
19691 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
19692 movd mm5
, [esi
+ eax
*4 + 8]
19693 pfsubr mm4
,mm0 ;
# dr = ir - jr
19695 pfmul mm4
,mm4 ;
# square dx,dy,dz
19697 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
19698 pfacc mm4
, mm5 ;
# first rsq in lower mm4
19700 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
19701 movd mm7
, [esi
+ ebx
*4 + 8]
19703 pfsubr mm6
,mm0 ;
# dr = ir - jr
19705 pfmul mm6
,mm6 ;
# square dx,dy,dz
19707 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
19708 pfacc mm6
, mm7 ;
# second rsq in lower mm6
19710 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
19714 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs
19715 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision
19721 ;
# mm0 now contains invsq, and mm1 invsqrt
19722 ;
# do potential and fscal
19725 pfmul mm4
, mm0 ;
# mm4=rinvsix
19727 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
19729 pfmul mm5
, [esp
+ mci1110_c12
]
19730 pfmul mm4
, [esp
+ mci1110_c6
]
19731 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
19734 pfadd mm6
, [esp
+ mci1110_vnbtot
] ;
# add the earlier value
19735 movq
[esp
+ mci1110_vnbtot
], mm6 ;
# store the sum
19737 ;
# should we do one more iteration?
# a negative innerk after subtracting 2 ends the paired loop
19738 sub dword ptr
[esp
+ mci1110_innerk
], 2
19739 jl
.mci1110_finish_vdw_inner
19740 jmp
.mci1110_unroll_vdw_loop
# Remainder handling for the vdw loop: if bit 0 of innerk is set, one jjnr[]
# entry is left and is processed by the single-particle path below, which
# adds its result into vnbtot.
19741 .mci1110_finish_vdw_inner:
19742 and dword ptr
[esp
+ mci1110_innerk
], 1
19743 jnz
.mci1110_single_vdw_inner
19744 jmp
.mci1110_updateouterdata_vdw
19745 .mci1110_single_vdw_inner:
19746 ;
# a single j particle iteration here - compare with the unrolled code for comments
19747 mov eax
, [esp
+ mci1110_innerjjnr
]
19748 mov eax
, [eax
] ;
# eax=jnr offset
19750 mov esi
, [ebp
+ mci1110_nbfp
]
19751 mov ecx
, [ebp
+ mci1110_type
]
19752 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
19754 add edx
, [esp
+ mci1110_ntia
] ;
# tja = ntia + 2*type
19755 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
19756 movq
[esp
+ mci1110_c6
], mm5
19757 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
19758 movq
[esp
+ mci1110_c12
], mm5
# esi was used for nbfp above; reload it with the pos[] base
19761 mov esi
, [ebp
+ mci1110_pos
]
19762 lea eax
, [eax
+ eax
*2]
19764 movq mm0
, [esp
+ mci1110_ix
]
19765 movd mm1
, [esp
+ mci1110_iz
]
19766 movq mm4
, [esi
+ eax
*4]
19767 movd mm5
, [esi
+ eax
*4 + 8]
19773 pfacc mm4
, mm5 ;
# mm4=rsq
19779 pfrcpit2 mm0
,mm2 ;
# mm0=invsqrt
19781 pfmul mm0
, mm0 ;
# mm0=invsq
19782 ;
# calculate potentials and scalar force
19785 pfmul mm4
, mm0 ;
# mm4=rinvsix
19787 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
19789 pfmul mm5
, [esp
+ mci1110_c12
]
19790 pfmul mm4
, [esp
+ mci1110_c6
]
19791 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
19794 pfadd mm6
, [esp
+ mci1110_vnbtot
] ;
# add the earlier value
19795 movq
[esp
+ mci1110_vnbtot
], mm6 ;
# store the sum
19797 .mci1110_updateouterdata_vdw:
19798 ;
# loop back to mno
19799 dec dword ptr
[esp
+ mci1110_nsvdw
]
19800 jz
.mci1110_last_mno
19801 jmp
.mci1110_mno_vdw
19804 mov edx
, [ebp
+ mci1110_gid
] ;
# get group index for this i particle
19806 add dword ptr
[ebp
+ mci1110_gid
], 4 ;
# advance pointer
19808 movq mm7
, [esp
+ mci1110_vctot
]
19809 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
19811 mov eax
, [ebp
+ mci1110_Vc
]
19812 movd mm6
, [eax
+ edx
*4]
19814 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
19816 movq mm7
, [esp
+ mci1110_vnbtot
]
19817 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
19819 mov eax
, [ebp
+ mci1110_Vnb
]
19820 movd mm6
, [eax
+ edx
*4]
19822 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
19824 mov ecx
, [ebp
+ mci1110_nri
]
19827 ;
# not last, iterate once more!
19828 mov
[ebp
+ mci1110_nri
], ecx
;# mcinl1120_3dnow - the symbol is exported both with and without a leading underscore.
19844 .globl mcinl1120_3dnow
19845 .globl _mcinl1120_3dnow
;# offsets of the call arguments; the code below reads them relative to ebp
19848 .equiv mci1120_nri, 8
19849 .equiv mci1120_iinr, 12
19850 .equiv mci1120_jindex, 16
19851 .equiv mci1120_jjnr, 20
19852 .equiv mci1120_shift, 24
19853 .equiv mci1120_shiftvec, 28
19854 .equiv mci1120_gid, 32
19855 .equiv mci1120_pos, 36
19856 .equiv mci1120_charge, 40
19857 .equiv mci1120_facel, 44
19858 .equiv mci1120_Vc, 48
19859 .equiv mci1120_type, 52
19860 .equiv mci1120_ntype, 56
19861 .equiv mci1120_nbfp, 60
19862 .equiv mci1120_Vnb, 64
19863 ;
# stack offsets for local variables
;# (addressed relative to esp; the slots from ixH through c12 are 8 bytes wide and are written with movq)
19864 .equiv mci1120_is3, 0
19865 .equiv mci1120_ii3, 4
19866 .equiv mci1120_ixO, 8
19867 .equiv mci1120_iyO, 12
19868 .equiv mci1120_izO, 16
19869 .equiv mci1120_ixH, 20
19870 .equiv mci1120_iyH, 28
19871 .equiv mci1120_izH, 36
19872 .equiv mci1120_iqO, 44
19873 .equiv mci1120_iqH, 52
19874 .equiv mci1120_vctot, 60
19875 .equiv mci1120_vnbtot, 68
19876 .equiv mci1120_c6, 76
19877 .equiv mci1120_c12, 84
19878 .equiv mci1120_ntia, 92
19879 .equiv mci1120_innerjjnr, 96
19880 .equiv mci1120_innerk, 100
19889 sub esp
, 104 ;
# local stack space
19891 ;
# assume we have at least one i particle - start directly
19893 mov ecx
, [ebp
+ mci1120_iinr
] ;
# ecx = pointer into iinr[]
19894 mov ebx
, [ecx
] ;
# ebx=ii
19896 mov edx
, [ebp
+ mci1120_charge
]
19897 movd mm1
, [ebp
+ mci1120_facel
]
19898 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii0]
19900 movq
[esp
+ mci1120_iqO
], mm2 ;
# iqO = facel*charge[ii]
19902 movd mm2
, [edx
+ ebx
*4 + 4] ;
# mm2=charge[ii0+1]
19904 punpckldq mm2
,mm2 ;
# spread to both halves
19905 movq
[esp
+ mci1120_iqH
], mm2 ;
# iqH = facel*charge[ii0+1]
19907 mov edx
, [ebp
+ mci1120_type
]
19908 mov ecx
, [edx
+ ebx
*4]
19910 imul ecx
, [ebp
+ mci1120_ntype
] ;
# ecx = ntia = 2*ntype*type[ii0]
19911 mov
[esp
+ mci1120_ntia
], ecx
19914 mov eax
, [ebp
+ mci1120_shift
] ;
# eax = pointer into shift[]
19915 mov ebx
, [eax
] ;
# ebx=shift[n]
19916 add dword ptr
[ebp
+ mci1120_shift
], 4 ;
# advance pointer one step
19918 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
19919 mov
[esp
+ mci1120_is3
],ebx ;
# store is3
19921 mov eax
, [ebp
+ mci1120_shiftvec
] ;
# eax = base of shiftvec[]
19923 movq mm5
, [eax
+ ebx
*4] ;
# move shX/shY to mm5 and shZ to mm6.
19924 movd mm6
, [eax
+ ebx
*4 + 8]
19928 punpckldq mm0
,mm0 ;
# also expand shX,Y,Z in mm0--mm2.
19932 mov ecx
, [ebp
+ mci1120_iinr
] ;
# ecx = pointer into iinr[]
19933 add dword ptr
[ebp
+ mci1120_iinr
], 4 ;
# advance pointer
19934 mov ebx
, [ecx
] ;
# ebx=ii
19936 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
19937 mov eax
, [ebp
+ mci1120_pos
] ;
# eax = base of pos[]
19939 pfadd mm5
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
19940 movd mm7
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
19941 mov
[esp
+ mci1120_ii3
], ebx ;
# (use mm7 as temp. storage for iz.)
19943 movq
[esp
+ mci1120_ixO
], mm5
19944 movq
[esp
+ mci1120_izO
], mm6
19946 movd mm3
, [eax
+ ebx
*4 + 12]
19947 movd mm4
, [eax
+ ebx
*4 + 16]
19948 movd mm5
, [eax
+ ebx
*4 + 20]
19949 punpckldq mm3
, [eax
+ ebx
*4 + 24]
19950 punpckldq mm4
, [eax
+ ebx
*4 + 28]
19951 punpckldq mm5
, [eax
+ ebx
*4 + 32] ;
# coords of H1 in low mm3-mm5, H2 in high
;# ixH is stored after the 8-byte movq to izO above, whose upper half lands in the ixH slot
19956 movq
[esp
+ mci1120_ixH
], mm0
19957 movq
[esp
+ mci1120_iyH
], mm1
19958 movq
[esp
+ mci1120_izH
], mm2
19960 ;
# clear vctot and i forces
19962 movq
[esp
+ mci1120_vctot
], mm7
19963 movq
[esp
+ mci1120_vnbtot
], mm7
;# innerk = jindex[n+1] - jindex[n]; the jindex pointer itself is advanced by one entry
19965 mov eax
, [ebp
+ mci1120_jindex
]
19966 mov ecx
, [eax
] ;
# jindex[n]
19967 mov edx
, [eax
+ 4] ;
# jindex[n+1]
19968 add dword ptr
[ebp
+ mci1120_jindex
], 4
19969 sub edx
, ecx ;
# number of innerloop atoms
19970 mov
[esp
+ mci1120_innerk
], edx ;
# number of innerloop atoms
19972 mov esi
, [ebp
+ mci1120_pos
]
19973 mov eax
, [ebp
+ mci1120_jjnr
]
19976 mov
[esp
+ mci1120_innerjjnr
], eax ;
# pointer to jjnr[nj0]
;# inner loop over j particles: one jjnr entry is consumed per pass (innerjjnr += 4)
;# and innerk is decremented until it reaches zero.
19977 .mci1120_inner_loop:
19978 ;
# a single j particle iteration here - compare with the unrolled code for comments.
19979 mov eax
, [esp
+ mci1120_innerjjnr
]
19980 mov eax
, [eax
] ;
# eax=jnr offset
19981 add dword ptr
[esp
+ mci1120_innerjjnr
], 4 ;
# advance pointer
19982 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
19984 mov ecx
, [ebp
+ mci1120_charge
]
19985 movd mm7
, [ecx
+ eax
*4]
19988 pfmul mm6
, [esp
+ mci1120_iqO
]
19989 pfmul mm7
, [esp
+ mci1120_iqH
] ;
# mm6=qqO, mm7=qqH
19991 mov ecx
, [ebp
+ mci1120_type
]
19992 mov edx
, [ecx
+ eax
*4] ;
# type [jnr]
19993 mov ecx
, [ebp
+ mci1120_nbfp
]
19995 add edx
, [esp
+ mci1120_ntia
] ;
# tja = ntia + 2*type
19996 movd mm5
, [ecx
+ edx
*4] ;
# mm5 = 1st c6
19997 movq
[esp
+ mci1120_c6
], mm5
19998 movd mm5
, [ecx
+ edx
*4 + 4] ;
# mm5 = 1st c12
19999 movq
[esp
+ mci1120_c12
], mm5
20001 lea eax
, [eax
+ eax
*2]
20003 movq mm0
, [esi
+ eax
*4]
20004 movd mm1
, [esi
+ eax
*4 + 8]
20005 ;
# copy & expand to mm2-mm4 for the H interactions
20013 pfsubr mm0
, [esp
+ mci1120_ixO
]
20014 pfsubr mm1
, [esp
+ mci1120_izO
]
20019 pfadd mm0
, mm1 ;
# mm0=rsqO
20023 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
20024 pfsubr mm2
, [esp
+ mci1120_ixH
]
20025 pfsubr mm3
, [esp
+ mci1120_iyH
]
20026 pfsubr mm4
, [esp
+ mci1120_izH
] ;
# mm2-mm4 is dxH-dzH
20033 pfadd mm3
,mm4 ;
# mm3=rsqH
20040 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
20042 pfmul mm4
, mm4 ;
# mm4=invsq
20046 pfmul mm0
, mm4 ;
# mm0=rinvsix
20048 pfmul mm2
, mm2 ;
# mm2=rinvtwelve
20050 ;
# calculate potential and scalar force
20051 pfmul mm6
, mm1 ;
# mm6=vcoul
20052 movq mm1
, mm6 ;
# use mm1 for fscal sum
20054 ;
# LJ for the oxygen
20055 pfmul mm0
, [esp
+ mci1120_c6
]
20056 pfmul mm2
, [esp
+ mci1120_c12
]
20058 ;
# calc nb potential
20060 ;
# update nb potential
20061 pfadd mm2
, [esp
+ mci1120_vnbtot
]
20062 movq
[esp
+ mci1120_vnbtot
], mm2
20068 punpckldq mm5
,mm2 ;
# seeds are in mm5 now, and rsq in mm3.
20073 pfrcpit2 mm5
,mm2 ;
# mm5=invsqrt
20074 pfmul mm7
, mm5 ;
# mm7=vcoul
20077 pfadd mm7
, [esp
+ mci1120_vctot
]
20078 movq
[esp
+ mci1120_vctot
], mm7
20080 ;
# done - one more?
20081 dec dword ptr
[esp
+ mci1120_innerk
]
20082 jz
.mci1120_updateouterdata
20083 jmp
.mci1120_inner_loop
;# per-i-particle epilogue: pfacc folds the two halves of vctot / vnbtot, the Vc and Vnb
;# entries at [eax + edx*4] are read and written back, and the nri counter is decremented.
20084 .mci1120_updateouterdata:
20085 mov edx
, [ebp
+ mci1120_gid
] ;
# get group index for this i particle
20087 add dword ptr
[ebp
+ mci1120_gid
], 4 ;
# advance pointer
20089 movq mm7
, [esp
+ mci1120_vctot
]
20090 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
20092 mov eax
, [ebp
+ mci1120_Vc
]
20093 movd mm6
, [eax
+ edx
*4]
20095 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
20097 movq mm7
, [esp
+ mci1120_vnbtot
]
20098 pfacc mm7
,mm7 ;
# same for Vnb
20100 mov eax
, [ebp
+ mci1120_Vnb
]
20101 movd mm6
, [eax
+ edx
*4]
20103 movd
[eax
+ edx
*4], mm6 ;
# increment vnb[gid]
20105 dec dword ptr
[ebp
+ mci1120_nri
]
20107 ;
# not last, iterate once more!
;# mcinl1130_3dnow - the symbol is exported both with and without a leading underscore.
20123 .globl mcinl1130_3dnow
20124 .globl _mcinl1130_3dnow
;# offsets of the call arguments; the code below reads them relative to ebp
20127 .equiv mci1130_nri, 8
20128 .equiv mci1130_iinr, 12
20129 .equiv mci1130_jindex, 16
20130 .equiv mci1130_jjnr, 20
20131 .equiv mci1130_shift, 24
20132 .equiv mci1130_shiftvec, 28
20133 .equiv mci1130_gid, 32
20134 .equiv mci1130_pos, 36
20135 .equiv mci1130_charge, 40
20136 .equiv mci1130_facel, 44
20137 .equiv mci1130_Vc, 48
20138 .equiv mci1130_type, 52
20139 .equiv mci1130_ntype, 56
20140 .equiv mci1130_nbfp, 60
20141 .equiv mci1130_Vnb, 64
20142 ;
# stack offsets for local variables
;# (addressed relative to esp; the slots from ixH through vnbtot are 8 bytes wide and are written with movq)
20143 .equiv mci1130_is3, 0
20144 .equiv mci1130_ii3, 4
20145 .equiv mci1130_ixO, 8
20146 .equiv mci1130_iyO, 12
20147 .equiv mci1130_izO, 16
20148 .equiv mci1130_ixH, 20
20149 .equiv mci1130_iyH, 28
20150 .equiv mci1130_izH, 36
20151 .equiv mci1130_qqOO, 44
20152 .equiv mci1130_qqOH, 52
20153 .equiv mci1130_qqHH, 60
20154 .equiv mci1130_c6, 68
20155 .equiv mci1130_c12, 76
20156 .equiv mci1130_vctot, 84
20157 .equiv mci1130_vnbtot, 92
20158 .equiv mci1130_innerjjnr, 100
20159 .equiv mci1130_innerk, 104
20168 sub esp
, 108 ;
# local stack space
20170 ;
# assume we have at least one i particle - start directly
20172 mov ecx
, [ebp
+ mci1130_iinr
] ;
# ecx = pointer into iinr[]
20173 mov ebx
, [ecx
] ;
# ebx=ii
20175 mov edx
, [ebp
+ mci1130_charge
]
20176 movd mm1
, [ebp
+ mci1130_facel
] ;
# mm1=facel
20177 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii0] (O)
20178 movd mm3
, [edx
+ ebx
*4 + 4] ;
# mm3=charge[ii0+1] (H)
20184 pfmul mm4
, mm2 ;
# mm4=qqOO*facel
20185 pfmul mm5
, mm3 ;
# mm5=qqOH*facel
20186 pfmul mm6
, mm3 ;
# mm6=qqHH*facel
20187 punpckldq mm5
,mm5 ;
# spread to both halves
20188 punpckldq mm6
,mm6 ;
# spread to both halves
20189 movq
[esp
+ mci1130_qqOO
], mm4
20190 movq
[esp
+ mci1130_qqOH
], mm5
20191 movq
[esp
+ mci1130_qqHH
], mm6
20192 mov edx
, [ebp
+ mci1130_type
]
20193 mov ecx
, [edx
+ ebx
*4]
20196 imul ecx
, [ebp
+ mci1130_ntype
]
;# c6 and c12 are loaded once here and kept in stack slots for the inner loop
20198 mov eax
, [ebp
+ mci1130_nbfp
]
20199 movd mm0
, [eax
+ edx
*4]
20200 movd mm1
, [eax
+ edx
*4 + 4]
20201 movq
[esp
+ mci1130_c6
], mm0
20202 movq
[esp
+ mci1130_c12
], mm1
20205 mov eax
, [ebp
+ mci1130_shift
] ;
# eax = pointer into shift[]
20206 mov ebx
, [eax
] ;
# ebx=shift[n]
20207 add dword ptr
[ebp
+ mci1130_shift
], 4 ;
# advance pointer one step
20209 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
20210 mov
[esp
+ mci1130_is3
],ebx ;
# store is3
20212 mov eax
, [ebp
+ mci1130_shiftvec
] ;
# eax = base of shiftvec[]
20214 movq mm5
, [eax
+ ebx
*4] ;
# move shX/shY to mm5 and shZ to mm6.
20215 movd mm6
, [eax
+ ebx
*4 + 8]
20219 punpckldq mm0
,mm0 ;
# also expand shX,Y,Z in mm0--mm2.
20223 mov ecx
, [ebp
+ mci1130_iinr
] ;
# ecx = pointer into iinr[]
20224 add dword ptr
[ebp
+ mci1130_iinr
], 4 ;
# advance pointer
20225 mov ebx
, [ecx
] ;
# ebx=ii
20227 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
20228 mov eax
, [ebp
+ mci1130_pos
] ;
# eax = base of pos[]
20230 pfadd mm5
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
20231 movd mm7
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
20232 mov
[esp
+ mci1130_ii3
], ebx ;
# (use mm7 as temp. storage for iz.)
20234 movq
[esp
+ mci1130_ixO
], mm5
20235 movq
[esp
+ mci1130_izO
], mm6
20237 movd mm3
, [eax
+ ebx
*4 + 12]
20238 movd mm4
, [eax
+ ebx
*4 + 16]
20239 movd mm5
, [eax
+ ebx
*4 + 20]
20240 punpckldq mm3
, [eax
+ ebx
*4 + 24]
20241 punpckldq mm4
, [eax
+ ebx
*4 + 28]
20242 punpckldq mm5
, [eax
+ ebx
*4 + 32] ;
# coords of H1 in low mm3-mm5, H2 in high
;# ixH is stored after the 8-byte movq to izO above, whose upper half lands in the ixH slot
20247 movq
[esp
+ mci1130_ixH
], mm0
20248 movq
[esp
+ mci1130_iyH
], mm1
20249 movq
[esp
+ mci1130_izH
], mm2
20251 ;
# clear vctot and i forces
20253 movq
[esp
+ mci1130_vctot
], mm7
20254 movq
[esp
+ mci1130_vnbtot
], mm7
;# innerk = jindex[n+1] - jindex[n]; the jindex pointer itself is advanced by one entry
20256 mov eax
, [ebp
+ mci1130_jindex
]
20257 mov ecx
, [eax
] ;
# jindex[n]
20258 mov edx
, [eax
+ 4] ;
# jindex[n+1]
20259 add dword ptr
[ebp
+ mci1130_jindex
], 4
20260 sub edx
, ecx ;
# number of innerloop atoms
20261 mov
[esp
+ mci1130_innerk
], edx ;
# number of innerloop atoms
20263 mov esi
, [ebp
+ mci1130_pos
]
20264 mov eax
, [ebp
+ mci1130_jjnr
]
20267 mov
[esp
+ mci1130_innerjjnr
], eax ;
# pointer to jjnr[nj0]
;# inner loop: one jjnr entry per pass (innerjjnr += 4, innerk counts down to zero).
;# Each pass first uses the j coordinates at [esi + eax*4], then those at byte offsets
;# +12 and +24 (the "j H1" and "j H2" sections below); every section adds into vctot.
20268 .mci1130_inner_loop:
20269 ;
# a single j particle iteration here - compare with the unrolled code for comments.
20270 mov eax
, [esp
+ mci1130_innerjjnr
]
20271 mov eax
, [eax
] ;
# eax=jnr offset
20272 add dword ptr
[esp
+ mci1130_innerjjnr
], 4 ;
# advance pointer
20274 movd mm6
, [esp
+ mci1130_qqOO
]
20275 movq mm7
, [esp
+ mci1130_qqOH
]
20277 lea eax
, [eax
+ eax
*2]
20278 movq mm0
, [esi
+ eax
*4]
20279 movd mm1
, [esi
+ eax
*4 + 8]
20280 ;
# copy & expand to mm2-mm4 for the H interactions
20288 pfsubr mm0
, [esp
+ mci1130_ixO
]
20289 pfsubr mm1
, [esp
+ mci1130_izO
]
20294 pfadd mm0
, mm1 ;
# mm0=rsqO
20298 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
20299 pfsubr mm2
, [esp
+ mci1130_ixH
]
20300 pfsubr mm3
, [esp
+ mci1130_iyH
]
20301 pfsubr mm4
, [esp
+ mci1130_izH
] ;
# mm2-mm4 is dxH-dzH
20308 pfadd mm3
,mm4 ;
# mm3=rsqH
20315 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
20317 pfmul mm4
, mm4 ;
# mm4=invsq
20324 pfmul mm2
, [esp
+ mci1130_c6
]
20325 pfmul mm0
, [esp
+ mci1130_c12
]
20327 pfsub mm5
, mm2 ;
# vnb
20329 ;
# calculate potential and scalar force
20330 pfmul mm6
, mm1 ;
# mm6=vcoul
20331 ;
# update nb potential
20332 pfadd mm5
, [esp
+ mci1130_vnbtot
]
20333 movq
[esp
+ mci1130_vnbtot
], mm5
20339 punpckldq mm5
,mm2 ;
# seeds are in mm5 now, and rsq in mm3
20344 pfrcpit2 mm5
,mm2 ;
# mm5=invsqrt
20345 pfmul mm7
, mm5 ;
# mm7=vcoul
20348 pfadd mm7
, [esp
+ mci1130_vctot
]
20349 movq
[esp
+ mci1130_vctot
], mm7
20351 ;
# interactions with j H1
20352 movq mm0
, [esi
+ eax
*4 + 12]
20353 movd mm1
, [esi
+ eax
*4 + 20]
20354 ;
# copy & expand to mm2-mm4 for the H interactions
20362 movd mm6
, [esp
+ mci1130_qqOH
]
20363 movq mm7
, [esp
+ mci1130_qqHH
]
20365 pfsubr mm0
, [esp
+ mci1130_ixO
]
20366 pfsubr mm1
, [esp
+ mci1130_izO
]
20371 pfadd mm0
, mm1 ;
# mm0=rsqO
20375 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
20376 pfsubr mm2
, [esp
+ mci1130_ixH
]
20377 pfsubr mm3
, [esp
+ mci1130_iyH
]
20378 pfsubr mm4
, [esp
+ mci1130_izH
] ;
# mm2-mm4 is dxH-dzH
20385 pfadd mm3
,mm4 ;
# mm3=rsqH
20392 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
20393 ;
# calculate potential and scalar force
20394 pfmul mm6
, mm1 ;
# mm6=vcoul
20400 punpckldq mm5
,mm2 ;
# seeds are in mm5 now, and rsq in mm3
20405 pfrcpit2 mm5
,mm2 ;
# mm5=invsqrt
20406 pfmul mm7
, mm5 ;
# mm7=vcoul
20409 pfadd mm7
, [esp
+ mci1130_vctot
]
20410 movq
[esp
+ mci1130_vctot
], mm7
20412 ;
# interactions with j H2
20413 movq mm0
, [esi
+ eax
*4 + 24]
20414 movd mm1
, [esi
+ eax
*4 + 32]
20415 ;
# copy & expand to mm2-mm4 for the H interactions
20423 movd mm6
, [esp
+ mci1130_qqOH
]
20424 movq mm7
, [esp
+ mci1130_qqHH
]
20426 pfsubr mm0
, [esp
+ mci1130_ixO
]
20427 pfsubr mm1
, [esp
+ mci1130_izO
]
20432 pfadd mm0
, mm1 ;
# mm0=rsqO
20436 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
20437 pfsubr mm2
, [esp
+ mci1130_ixH
]
20438 pfsubr mm3
, [esp
+ mci1130_iyH
]
20439 pfsubr mm4
, [esp
+ mci1130_izH
] ;
# mm2-mm4 is dxH-dzH
20446 pfadd mm3
,mm4 ;
# mm3=rsqH
20453 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
20454 ;
# calculate potential and scalar force
20455 pfmul mm6
, mm1 ;
# mm6=vcoul
20461 punpckldq mm5
,mm2 ;
# seeds are in mm5 now, and rsq in mm3.
20466 pfrcpit2 mm5
,mm2 ;
# mm5=invsqrt
20467 pfmul mm7
, mm5 ;
# mm7=vcoul
20471 pfadd mm7
, [esp
+ mci1130_vctot
]
20472 movq
[esp
+ mci1130_vctot
], mm7
20474 ;
# done - one more?
20475 dec dword ptr
[esp
+ mci1130_innerk
]
20476 jz
.mci1130_updateouterdata
20477 jmp
.mci1130_inner_loop
;# per-i-particle epilogue: pfacc folds the two halves of vctot / vnbtot, the Vc and Vnb
;# entries at [eax + edx*4] are read and written back, and the nri counter is decremented.
20478 .mci1130_updateouterdata:
20479 mov edx
, [ebp
+ mci1130_gid
] ;
# get group index for this i particle
20481 add dword ptr
[ebp
+ mci1130_gid
], 4 ;
# advance pointer
20483 movq mm7
, [esp
+ mci1130_vctot
]
20484 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
20486 mov eax
, [ebp
+ mci1130_Vc
]
20487 movd mm6
, [eax
+ edx
*4]
20489 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
20491 movq mm7
, [esp
+ mci1130_vnbtot
]
20492 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
20494 mov eax
, [ebp
+ mci1130_Vnb
]
20495 movd mm6
, [eax
+ edx
*4]
20497 movd
[eax
+ edx
*4], mm6 ;
# increment vnb[gid]
20499 dec dword ptr
[ebp
+ mci1130_nri
]
20501 ;
# not last, iterate once more!
;# mcinl3000_3dnow - the symbol is exported both with and without a leading underscore.
20518 .globl mcinl3000_3dnow
20519 .globl _mcinl3000_3dnow
;# offsets of the call arguments; the code below reads them relative to ebp
20522 .equiv mci3000_nri, 8
20523 .equiv mci3000_iinr, 12
20524 .equiv mci3000_jindex, 16
20525 .equiv mci3000_jjnr, 20
20526 .equiv mci3000_shift, 24
20527 .equiv mci3000_shiftvec, 28
20528 .equiv mci3000_gid, 32
20529 .equiv mci3000_pos, 36
20530 .equiv mci3000_charge, 40
20531 .equiv mci3000_facel, 44
20532 .equiv mci3000_Vc, 48
20533 .equiv mci3000_tabscale, 52
20534 .equiv mci3000_VFtab, 56
20535 ;
# stack offsets for local variables
;# (addressed relative to esp; iq, vctot, n1 and tsc are 8 bytes wide)
20536 .equiv mci3000_is3, 0
20537 .equiv mci3000_ii3, 4
20538 .equiv mci3000_ix, 8
20539 .equiv mci3000_iy, 12
20540 .equiv mci3000_iz, 16
20541 .equiv mci3000_iq, 20
20542 .equiv mci3000_vctot, 28
20543 .equiv mci3000_n1, 36
20544 .equiv mci3000_tsc, 44
20545 .equiv mci3000_ntia, 52
20546 .equiv mci3000_innerjjnr, 56
20547 .equiv mci3000_innerk, 60
20556 sub esp
, 64 ;
# local stack space
20558 ;
# move data to local stack
20559 movd mm3
, [ebp
+ mci3000_tabscale
]
20561 movq
[esp
+ mci3000_tsc
], mm3
20562 ;
# assume we have at least one i particle - start directly
20564 mov eax
, [ebp
+ mci3000_shift
] ;
# eax = pointer into shift[]
20565 mov ebx
, [eax
] ;
# ebx=shift[n]
20566 add dword ptr
[ebp
+ mci3000_shift
], 4 ;
# advance pointer one step
20568 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
20569 mov
[esp
+ mci3000_is3
],ebx ;
# store is3
20571 mov eax
, [ebp
+ mci3000_shiftvec
] ;
# eax = base of shiftvec[]
20573 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
20574 movd mm1
, [eax
+ ebx
*4 + 8]
20576 mov ecx
, [ebp
+ mci3000_iinr
] ;
# ecx = pointer into iinr[]
20577 add dword ptr
[ebp
+ mci3000_iinr
], 4 ;
# advance pointer
20578 mov ebx
, [ecx
] ;
# ebx=ii
20580 mov edx
, [ebp
+ mci3000_charge
]
20581 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
20582 pfmul mm2
, [ebp
+ mci3000_facel
]
20583 punpckldq mm2
,mm2 ;
# spread to both halves
20584 movq
[esp
+ mci3000_iq
], mm2 ;
# iq =facel*charge[ii]
20586 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
20587 mov eax
, [ebp
+ mci3000_pos
] ;
# eax = base of pos[]
20589 pfadd mm0
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
20590 movd mm3
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
20591 mov
[esp
+ mci3000_ii3
], ebx
;# the 8-byte store at ix fills both the ix and iy slots; iz is stored separately as 4 bytes
20593 movq
[esp
+ mci3000_ix
], mm0
20594 movd
[esp
+ mci3000_iz
], mm1
20596 ;
# clear total potential and i forces
20598 movq
[esp
+ mci3000_vctot
], mm7
;# edx = jindex[n+1] - jindex[n]; the jindex pointer itself is advanced by one entry
20600 mov eax
, [ebp
+ mci3000_jindex
]
20601 mov ecx
, [eax
] ;
# jindex[n]
20602 mov edx
, [eax
+ 4] ;
# jindex[n+1]
20603 add dword ptr
[ebp
+ mci3000_jindex
], 4
20604 sub edx
, ecx ;
# number of innerloop atoms
20606 mov esi
, [ebp
+ mci3000_pos
]
20607 mov eax
, [ebp
+ mci3000_jjnr
]
20610 mov
[esp
+ mci3000_innerjjnr
], eax ;
# pointer to jjnr[nj0]
20612 mov
[esp
+ mci3000_innerk
], edx ;
# number of innerloop atoms
20613 jge
.mci3000_unroll_loop
20614 jmp
.mci3000_finish_inner
;# unrolled inner loop: two jjnr entries are consumed per pass (innerjjnr += 8, innerk -= 2).
;# The values for the two pairs sit in the low and high halves of the MMX registers.
20615 .mci3000_unroll_loop:
20616 ;
# paired innerloop starts here
20617 mov ecx
, [esp
+ mci3000_innerjjnr
] ;
# pointer to jjnr[k]
20619 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
20620 add dword ptr
[esp
+ mci3000_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
20621 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
20623 mov ecx
, [ebp
+ mci3000_charge
] ;
# base of charge[]
20624 movq mm5
, [esp
+ mci3000_iq
]
20625 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
20626 punpckldq mm3
, [ecx
+ ebx
*4] ;
# move charge 2 to high part of mm3
20627 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
20629 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
20630 lea ebx
, [ebx
+ ebx
*2]
20632 mov esi
, [ebp
+ mci3000_pos
]
20634 movq mm0
, [esp
+ mci3000_ix
]
20635 movd mm1
, [esp
+ mci3000_iz
]
20636 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
20637 movd mm5
, [esi
+ eax
*4 + 8]
20638 pfsubr mm4
,mm0 ;
# dr = ir - jr
20640 pfmul mm4
,mm4 ;
# square dx,dy,dz
20642 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
20643 pfacc mm4
, mm5 ;
# first rsq in lower mm4
20645 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
20646 movd mm7
, [esi
+ ebx
*4 + 8]
20648 pfsubr mm6
,mm0 ;
# dr = ir - jr
20650 pfmul mm6
,mm6 ;
# square dx,dy,dz
20652 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
20653 pfacc mm6
, mm7 ;
# second rsq in lower mm6
20655 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
20660 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
20661 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision.
20667 ;
# mm0 is invsqrt, and mm1 r.
20668 ;
# do potential and fscal
20669 pfmul mm1
, [esp
+ mci3000_tsc
] ;
# mm1=rt
20671 movq
[esp
+ mci3000_n1
], mm4
20673 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
20676 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
;# the two table indices are read back from the n1 slot: low dword first, then n1 + 4
20678 mov edx
, [ebp
+ mci3000_VFtab
]
20679 mov ecx
, [esp
+ mci3000_n1
]
20682 ;
# load all the table values we need
20683 movd mm4
, [edx
+ ecx
*4]
20684 movd mm5
, [edx
+ ecx
*4 + 4]
20685 movd mm6
, [edx
+ ecx
*4 + 8]
20686 movd mm7
, [edx
+ ecx
*4 + 12]
20687 mov ecx
, [esp
+ mci3000_n1
+ 4]
20689 punpckldq mm4
, [edx
+ ecx
*4]
20690 punpckldq mm5
, [edx
+ ecx
*4 + 4]
20691 punpckldq mm6
, [edx
+ ecx
*4 + 8]
20692 punpckldq mm7
, [edx
+ ecx
*4 + 12]
20694 pfmul mm6
, mm1 ;
# mm6 = Geps
20695 pfmul mm7
, mm2 ;
# mm7 = Heps2
20698 pfadd mm5
, mm7 ;
# mm5 = Fp
20700 pfmul mm5
, mm1 ;
# mm5=eps*Fp
20701 pfadd mm5
, mm4 ;
# mm5= VV
20703 pfmul mm5
, mm3 ;
# vcoul=qq*VV
20704 ;
# at this point mm5 contains vcoul
20705 ;
# increment vcoul - then we can get rid of mm5.
20707 pfadd mm5
, [esp
+ mci3000_vctot
] ;
# add the earlier value
20708 movq
[esp
+ mci3000_vctot
], mm5 ;
# store the sum
20710 ;
# should we do one more iteration?
20711 sub dword ptr
[esp
+ mci3000_innerk
], 2
20712 jl
.mci3000_finish_inner
20713 jmp
.mci3000_unroll_loop
;# after the unrolled loop: if bit 0 of innerk is set, one j particle is left for the
;# single-particle code; otherwise go straight to the outer-loop update.
20714 .mci3000_finish_inner:
20715 and dword ptr
[esp
+ mci3000_innerk
], 1
20716 jnz
.mci3000_single_inner
20717 jmp
.mci3000_updateouterdata
;# handles the one remaining j particle; only the low halves of the registers are used
;# (movd loads/stores) and the result is added into vctot.
20718 .mci3000_single_inner:
20719 ;
# a single j particle iteration here - compare with the unrolled code for comments.
20720 mov eax
, [esp
+ mci3000_innerjjnr
]
20721 mov eax
, [eax
] ;
# eax=jnr offset
20723 mov ecx
, [ebp
+ mci3000_charge
]
20724 movd mm5
, [esp
+ mci3000_iq
]
20725 movd mm3
, [ecx
+ eax
*4]
20726 pfmul mm3
, mm5 ;
# mm3=qq
20728 mov esi
, [ebp
+ mci3000_pos
]
20729 lea eax
, [eax
+ eax
*2]
20731 movq mm0
, [esp
+ mci3000_ix
]
20732 movd mm1
, [esp
+ mci3000_iz
]
20733 movq mm4
, [esi
+ eax
*4]
20734 movd mm5
, [esi
+ eax
*4 + 8]
20740 pfacc mm4
, mm5 ;
# mm4=rsq
20746 pfrcpit2 mm0
,mm2 ;
# mm0=invsqrt
20749 ;
# mm0 is invsqrt, and mm1 r.
20751 ;
# calculate potentials and scalar force
20752 pfmul mm1
, [esp
+ mci3000_tsc
] ;
# mm1=rt
20754 movd
[esp
+ mci3000_n1
], mm4
20756 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
20759 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
20762 mov edx
, [ebp
+ mci3000_VFtab
]
20763 mov ecx
, [esp
+ mci3000_n1
]
20765 ;
# load all the table values we need
20766 movd mm4
, [edx
+ ecx
*4]
20767 movd mm5
, [edx
+ ecx
*4 + 4]
20768 movd mm6
, [edx
+ ecx
*4 + 8]
20769 movd mm7
, [edx
+ ecx
*4 + 12]
20771 pfmul mm6
, mm1 ;
# mm6 = Geps
20772 pfmul mm7
, mm2 ;
# mm7 = Heps2
20775 pfadd mm5
, mm7 ;
# mm5 = Fp
20777 pfmul mm5
, mm1 ;
# mm5=eps*Fp
20778 pfadd mm5
, mm4 ;
# mm5= VV
20780 pfmul mm5
, mm3 ;
# vcoul=qq*VV
20782 ;
# at this point mm5 contains vcoul
20783 ;
# increment vcoul - then we can get rid of mm5
20785 pfadd mm5
, [esp
+ mci3000_vctot
] ;
# add the earlier value
20786 movq
[esp
+ mci3000_vctot
], mm5 ;
# store the sum
;# per-i-particle epilogue: pfacc folds the two halves of vctot, the Vc entry at
;# [eax + edx*4] is read and written back, and the nri argument slot is reloaded and stored.
20788 .mci3000_updateouterdata:
20789 mov edx
, [ebp
+ mci3000_gid
] ;
# get group index for this i particle
20791 add dword ptr
[ebp
+ mci3000_gid
], 4 ;
# advance pointer
20793 movq mm7
, [esp
+ mci3000_vctot
]
20794 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
20796 mov eax
, [ebp
+ mci3000_Vc
]
20797 movd mm6
, [eax
+ edx
*4]
20799 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
20802 mov ecx
, [ebp
+ mci3000_nri
]
20805 ;
# not last, iterate once more!
20806 mov
[ebp
+ mci3000_nri
], ecx
20823 .globl mcinl3010_3dnow
20824 .globl _mcinl3010_3dnow
20827 .equiv mci3010_nri, 8
20828 .equiv mci3010_iinr, 12
20829 .equiv mci3010_jindex, 16
20830 .equiv mci3010_jjnr, 20
20831 .equiv mci3010_shift, 24
20832 .equiv mci3010_shiftvec, 28
20833 .equiv mci3010_gid, 32
20834 .equiv mci3010_pos, 36
20835 .equiv mci3010_charge, 40
20836 .equiv mci3010_facel, 44
20837 .equiv mci3010_Vc, 48
20838 .equiv mci3010_tabscale, 52
20839 .equiv mci3010_VFtab, 56
20840 .equiv mci3010_nsatoms, 60
20841 ;
# stack offsets for local variables
20842 .equiv mci3010_is3, 0
20843 .equiv mci3010_ii3, 4
20844 .equiv mci3010_shX, 8
20845 .equiv mci3010_shY, 12
20846 .equiv mci3010_shZ, 16
20847 .equiv mci3010_ix, 20
20848 .equiv mci3010_iy, 24
20849 .equiv mci3010_iz, 28
20850 .equiv mci3010_iq, 32
20851 .equiv mci3010_vctot, 40
20852 .equiv mci3010_n1, 48
20853 .equiv mci3010_tsc, 56
20854 .equiv mci3010_innerjjnr0, 64
20855 .equiv mci3010_innerk0, 68
20856 .equiv mci3010_innerjjnr, 72
20857 .equiv mci3010_innerk, 76
20858 .equiv mci3010_nscoul, 80
20859 .equiv mci3010_solnr, 84
20868 sub esp
, 88 ;
# local stack space
20871 add dword ptr
[ebp
+ mci3010_nsatoms
], 8
20872 movd mm3
, [ebp
+ mci3010_tabscale
]
20874 movq
[esp
+ mci3010_tsc
], mm3
20876 ;
# assume we have at least one i particle - start directly
20878 mov eax
, [ebp
+ mci3010_shift
] ;
# eax = pointer into shift[]
20879 mov ebx
, [eax
] ;
# ebx=shift[n]
20880 add dword ptr
[ebp
+ mci3010_shift
], 4 ;
# advance pointer one step
20882 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
20883 mov
[esp
+ mci3010_is3
],ebx ;
# store is3
20885 mov eax
, [ebp
+ mci3010_shiftvec
] ;
# eax = base of shiftvec[]
20887 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
20888 movd mm1
, [eax
+ ebx
*4 + 8]
20889 movq
[esp
+ mci3010_shX
], mm0
20890 movd
[esp
+ mci3010_shZ
], mm1
20892 mov ecx
, [ebp
+ mci3010_iinr
] ;
# ecx = pointer into iinr[]
20893 add dword ptr
[ebp
+ mci3010_iinr
], 4 ;
# advance pointer
20894 mov ebx
, [ecx
] ;
# ebx=ii
20896 mov eax
, [ebp
+ mci3010_nsatoms
]
20898 add dword ptr
[ebp
+ mci3010_nsatoms
], 12
20899 mov
[esp
+ mci3010_nscoul
], ecx
20903 movq
[esp
+ mci3010_vctot
], mm7
20904 mov
[esp
+ mci3010_solnr
], ebx
20906 mov eax
, [ebp
+ mci3010_jindex
]
20907 mov ecx
, [eax
] ;
# jindex[n]
20908 mov edx
, [eax
+ 4] ;
# jindex[n+1]
20909 add dword ptr
[ebp
+ mci3010_jindex
], 4
20910 sub edx
, ecx ;
# number of innerloop atoms
20911 mov eax
, [ebp
+ mci3010_jjnr
]
20914 mov
[esp
+ mci3010_innerjjnr0
], eax ;
# pointer to jjnr[nj0]
20916 mov
[esp
+ mci3010_innerk0
], edx ;
# number of innerloop atoms
20917 mov esi
, [ebp
+ mci3010_pos
]
20918 mov ecx
, [esp
+ mci3010_nscoul
]
20920 jnz
.mci3010_mno_coul
20921 jmp
.mci3010_last_mno
20923 mov ebx
, [esp
+ mci3010_solnr
]
20924 inc dword ptr
[esp
+ mci3010_solnr
]
20925 mov edx
, [ebp
+ mci3010_charge
]
20926 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
20927 pfmul mm2
, [ebp
+ mci3010_facel
]
20928 punpckldq mm2
,mm2 ;
# spread to both halves
20929 movq
[esp
+ mci3010_iq
], mm2 ;
# iq =facel*charge[ii]
20931 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
20932 mov eax
, [ebp
+ mci3010_pos
] ;
# eax = base of pos[]
20933 mov
[esp
+ mci3010_ii3
], ebx
20935 movq mm0
, [eax
+ ebx
*4]
20936 movd mm1
, [eax
+ ebx
*4 + 8]
20937 pfadd mm0
, [esp
+ mci3010_shX
]
20938 pfadd mm1
, [esp
+ mci3010_shZ
]
20939 movq
[esp
+ mci3010_ix
], mm0
20940 movd
[esp
+ mci3010_iz
], mm1
20942 mov ecx
, [esp
+ mci3010_innerjjnr0
]
20943 mov
[esp
+ mci3010_innerjjnr
], ecx
20944 mov edx
, [esp
+ mci3010_innerk0
]
20946 mov
[esp
+ mci3010_innerk
], edx ;
# number of innerloop atoms
20947 jge
.mci3010_unroll_coul_loop
20948 jmp
.mci3010_finish_coul_inner
20949 .mci3010_unroll_coul_loop:
20950 ;
# paired innerloop starts here
20951 mov ecx
, [esp
+ mci3010_innerjjnr
] ;
# pointer to jjnr[k]
20953 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
20954 add dword ptr
[esp
+ mci3010_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
20955 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
20957 mov ecx
, [ebp
+ mci3010_charge
] ;
# base of charge[]
20958 movq mm5
, [esp
+ mci3010_iq
]
20959 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
20960 punpckldq mm3
, [ecx
+ ebx
*4] ;
# move charge 2 to high part of mm3
20961 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
20963 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
20964 lea ebx
, [ebx
+ ebx
*2]
20966 mov esi
, [ebp
+ mci3010_pos
]
20968 movq mm0
, [esp
+ mci3010_ix
]
20969 movd mm1
, [esp
+ mci3010_iz
]
20970 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
20971 movd mm5
, [esi
+ eax
*4 + 8]
20972 pfsubr mm4
,mm0 ;
# dr = ir - jr
20974 pfmul mm4
,mm4 ;
# square dx,dy,dz
20976 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
20977 pfacc mm4
, mm5 ;
# first rsq in lower mm4
20979 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
20980 movd mm7
, [esi
+ ebx
*4 + 8]
20982 pfsubr mm6
,mm0 ;
# dr = ir - jr
20984 pfmul mm6
,mm6 ;
# square dx,dy,dz
20986 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
20987 pfacc mm6
, mm7 ;
# second rsq in lower mm6
20989 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
20994 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
20995 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision.
21001 ;
# mm0 is invsqrt, and mm1 r.
21002 ;
# do potential and fscal
21003 pfmul mm1
, [esp
+ mci3010_tsc
] ;
# mm1=rt
21005 movq
[esp
+ mci3010_n1
], mm4
21007 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
21010 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
21012 mov edx
, [ebp
+ mci3010_VFtab
]
21013 mov ecx
, [esp
+ mci3010_n1
]
21016 ;
# load all the table values we need
21017 movd mm4
, [edx
+ ecx
*4]
21018 movd mm5
, [edx
+ ecx
*4 + 4]
21019 movd mm6
, [edx
+ ecx
*4 + 8]
21020 movd mm7
, [edx
+ ecx
*4 + 12]
21021 mov ecx
, [esp
+ mci3010_n1
+ 4]
21023 punpckldq mm4
, [edx
+ ecx
*4]
21024 punpckldq mm5
, [edx
+ ecx
*4 + 4]
21025 punpckldq mm6
, [edx
+ ecx
*4 + 8]
21026 punpckldq mm7
, [edx
+ ecx
*4 + 12]
21028 pfmul mm6
, mm1 ;
# mm6 = Geps
21029 pfmul mm7
, mm2 ;
# mm7 = Heps2
21032 pfadd mm5
, mm7 ;
# mm5 = Fp
21034 pfmul mm5
, mm1 ;
# mm5=eps*Fp
21035 pfadd mm5
, mm4 ;
# mm5= VV
21037 pfmul mm5
, mm3 ;
# vcoul=qq*VV
21039 ;
# at this point mm5 contains vcoul
21040 ;
# increment vcoul - then we can get rid of mm5
21042 pfadd mm5
, [esp
+ mci3010_vctot
] ;
# add the earlier value
21043 movq
[esp
+ mci3010_vctot
], mm5 ;
# store the sum
21045 ;
# should we do one more iteration?
21046 sub dword ptr
[esp
+ mci3010_innerk
], 2
21047 jl
.mci3010_finish_coul_inner
21048 jmp
.mci3010_unroll_coul_loop
21049 .mci3010_finish_coul_inner:
21050 and dword ptr
[esp
+ mci3010_innerk
], 1
21051 jnz
.mci3010_single_coul_inner
21052 jmp
.mci3010_updateouterdata_coul
21053 .mci3010_single_coul_inner:
21054 ;
# a single j particle iteration here - compare with the unrolled code for comments.
21055 mov eax
, [esp
+ mci3010_innerjjnr
]
21056 mov eax
, [eax
] ;
# eax=jnr offset
21058 mov ecx
, [ebp
+ mci3010_charge
]
21059 movd mm5
, [esp
+ mci3010_iq
]
21060 movd mm3
, [ecx
+ eax
*4]
21061 pfmul mm3
, mm5 ;
# mm3=qq
21063 mov esi
, [ebp
+ mci3010_pos
]
21064 lea eax
, [eax
+ eax
*2]
21066 movq mm0
, [esp
+ mci3010_ix
]
21067 movd mm1
, [esp
+ mci3010_iz
]
21068 movq mm4
, [esi
+ eax
*4]
21069 movd mm5
, [esi
+ eax
*4 + 8]
21075 pfacc mm4
, mm5 ;
# mm0=rsq
21081 pfrcpit2 mm0
,mm2 ;
# mm1=invsqrt
21084 ;
# mm0 is invsqrt, and mm1 r.
21086 ;
# calculate potentials and scalar force
21087 pfmul mm1
, [esp
+ mci3010_tsc
] ;
# mm1=rt
21089 movd
[esp
+ mci3010_n1
], mm4
21091 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
21094 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
21097 mov edx
, [ebp
+ mci3010_VFtab
]
21098 mov ecx
, [esp
+ mci3010_n1
]
21100 ;
# load all the table values we need
21101 movd mm4
, [edx
+ ecx
*4]
21102 movd mm5
, [edx
+ ecx
*4 + 4]
21103 movd mm6
, [edx
+ ecx
*4 + 8]
21104 movd mm7
, [edx
+ ecx
*4 + 12]
21106 pfmul mm6
, mm1 ;
# mm6 = Geps
21107 pfmul mm7
, mm2 ;
# mm7 = Heps2
21110 pfadd mm5
, mm7 ;
# mm5 = Fp
21112 pfmul mm5
, mm1 ;
# mm5=eps*Fp
21113 pfadd mm5
, mm4 ;
# mm5= VV
21115 pfmul mm5
, mm3 ;
# vcoul=qq*VV
21117 ;
# at this point mm5 contains vcoul
21118 ;
# increment vcoul - then we can get rid of mm5
21120 pfadd mm5
, [esp
+ mci3010_vctot
] ;
# add the earlier value
21121 movq
[esp
+ mci3010_vctot
], mm5 ;
# store the sum
21123 .mci3010_updateouterdata_coul:
21124 ;
# loop back to mno
21125 dec dword ptr
[esp
+ mci3010_nscoul
]
21126 jz
.mci3010_last_mno
21127 jmp
.mci3010_mno_coul
21129 mov edx
, [ebp
+ mci3010_gid
] ;
# get group index for this i particle
21131 add dword ptr
[ebp
+ mci3010_gid
], 4 ;
# advance pointer
21133 movq mm7
, [esp
+ mci3010_vctot
]
21134 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
21136 mov eax
, [ebp
+ mci3010_Vc
]
21137 movd mm6
, [eax
+ edx
*4]
21139 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
21141 mov ecx
, [ebp
+ mci3010_nri
]
21144 ;
# not last, iterate once more!
21145 mov
[ebp
+ mci3010_nri
], ecx
# mcinl3020_3dnow (also exported as _mcinl3020_3dnow)
# Inner loop that accumulates a tabulated (VFtab) Coulomb potential between
# one three-site i molecule (O at pos[ii3], H1 at +12 bytes, H2 at +24 bytes)
# and single j atoms, and adds the total to Vc[gid].
# The argument list below contains no force array and the locals contain no
# force slots; only the potential sum vctot is kept.
21162 .globl mcinl3020_3dnow
21163 .globl _mcinl3020_3dnow
# argument offsets: every use below addresses them as [ebp + name]
21166 .equiv mci3020_nri, 8
21167 .equiv mci3020_iinr, 12
21168 .equiv mci3020_jindex, 16
21169 .equiv mci3020_jjnr, 20
21170 .equiv mci3020_shift, 24
21171 .equiv mci3020_shiftvec, 28
21172 .equiv mci3020_gid, 32
21173 .equiv mci3020_pos, 36
21174 .equiv mci3020_charge, 40
21175 .equiv mci3020_facel, 44
21176 .equiv mci3020_Vc, 48
21177 .equiv mci3020_tabscale, 52
21178 .equiv mci3020_VFtab, 56
21179 ;
# stack offsets for local variables
# (addressed as [esp + name]; the O slots hold one float each, the H slots
#  hold the H1/H2 pair as two packed floats)
21180 .equiv mci3020_is3, 0
21181 .equiv mci3020_ii3, 4
21182 .equiv mci3020_ixO, 8
21183 .equiv mci3020_iyO, 12
21184 .equiv mci3020_izO, 16
21185 .equiv mci3020_ixH, 20
21186 .equiv mci3020_iyH, 28
21187 .equiv mci3020_izH, 36
21188 .equiv mci3020_iqO, 44
21189 .equiv mci3020_iqH, 52
21190 .equiv mci3020_qqO, 60
21191 .equiv mci3020_qqH, 68
21192 .equiv mci3020_vctot, 76
21193 .equiv mci3020_n1, 84
21194 .equiv mci3020_tsc, 92
21195 .equiv mci3020_innerjjnr, 100
21196 .equiv mci3020_innerk, 104
21197 .equiv mci3020_tmprsqH, 108
21206 sub esp
, 116 ;
# local stack space
# 116 bytes = last slot (tmprsqH at 108) plus its 8 bytes
21209 mov ecx
, [ebp
+ mci3020_iinr
] ;
# ecx = pointer into iinr[]
21210 mov ebx
, [ecx
] ;
# ebx=ii
21212 mov edx
, [ebp
+ mci3020_charge
]
21213 movd mm1
, [ebp
+ mci3020_facel
]
21214 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii0]
21216 movq
[esp
+ mci3020_iqO
], mm2 ;
# iqO = facel*charge[ii]
21218 movd mm2
, [edx
+ ebx
*4 + 4] ;
# mm2=charge[ii0+1]
21220 punpckldq mm2
,mm2 ;
# spread to both halves
21221 movq
[esp
+ mci3020_iqH
], mm2 ;
# iqH = facel*charge[ii0+1]
21223 movd mm4
, [ebp
+ mci3020_tabscale
]
21224 punpckldq mm4
,mm4 ;
# spread to both halves
21225 movq
[esp
+ mci3020_tsc
], mm4
21226 ;
# assume we have at least one i particle - start directly
21228 mov eax
, [ebp
+ mci3020_shift
] ;
# eax = pointer into shift[]
21229 mov ebx
, [eax
] ;
# ebx=shift[n]
21230 add dword ptr
[ebp
+ mci3020_shift
], 4 ;
# advance pointer one step
21232 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
21233 mov
[esp
+ mci3020_is3
],ebx ;
# store is3
21235 mov eax
, [ebp
+ mci3020_shiftvec
] ;
# eax = base of shiftvec[]
21237 movq mm5
, [eax
+ ebx
*4] ;
# move shX/shY to mm5 and shZ to mm6.
21238 movd mm6
, [eax
+ ebx
*4 + 8]
21242 punpckldq mm0
,mm0 ;
# also expand shX,Y,Z in mm0--mm2.
21246 mov ecx
, [ebp
+ mci3020_iinr
] ;
# ecx = pointer into iinr[]
21247 add dword ptr
[ebp
+ mci3020_iinr
], 4 ;
# advance pointer
21248 mov ebx
, [ecx
] ;
# ebx=ii
21250 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
21251 mov eax
, [ebp
+ mci3020_pos
] ;
# eax = base of pos[]
21253 pfadd mm5
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
21254 movd mm7
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
21255 mov
[esp
+ mci3020_ii3
], ebx ;
# (use mm7 as temp. storage for iz.)
21257 movq
[esp
+ mci3020_ixO
], mm5
21258 movq
[esp
+ mci3020_izO
], mm6
21260 movd mm3
, [eax
+ ebx
*4 + 12]
21261 movd mm4
, [eax
+ ebx
*4 + 16]
21262 movd mm5
, [eax
+ ebx
*4 + 20]
21263 punpckldq mm3
, [eax
+ ebx
*4 + 24]
21264 punpckldq mm4
, [eax
+ ebx
*4 + 28]
21265 punpckldq mm5
, [eax
+ ebx
*4 + 32] ;
# coords of H1 in low mm3-mm5, H2 in high
21270 movq
[esp
+ mci3020_ixH
], mm0
21271 movq
[esp
+ mci3020_iyH
], mm1
21272 movq
[esp
+ mci3020_izH
], mm2
21274 ;
# clear vctot (this routine keeps no i-force accumulators: none appear among its locals)
21276 movq
[esp
+ mci3020_vctot
], mm7
21278 mov eax
, [ebp
+ mci3020_jindex
]
21279 mov ecx
, [eax
] ;
# jindex[n]
21280 mov edx
, [eax
+ 4] ;
# jindex[n+1]
21281 add dword ptr
[ebp
+ mci3020_jindex
], 4
21282 sub edx
, ecx ;
# number of innerloop atoms
21283 mov
[esp
+ mci3020_innerk
], edx
21285 mov esi
, [ebp
+ mci3020_pos
]
21286 mov eax
, [ebp
+ mci3020_jjnr
]
21289 mov
[esp
+ mci3020_innerjjnr
], eax ;
# pointer to jjnr[nj0]
21290 .mci3020_inner_loop:
21291 ;
# a single j particle iteration
# (first the i oxygen against this j atom, then both i hydrogens as a packed pair)
21292 mov eax
, [esp
+ mci3020_innerjjnr
]
21293 mov eax
, [eax
] ;
# eax=jnr offset
21294 add dword ptr
[esp
+ mci3020_innerjjnr
], 4 ;
# advance pointer
21295 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
# NOTE(review): in this loop only eax is loaded from innerjjnr; ecx is not,
# so this prefetch hints at whatever address ecx last held. A prefetch is
# only a hint, but confirm which address was intended.
21297 mov ecx
, [ebp
+ mci3020_charge
]
21298 movd mm7
, [ecx
+ eax
*4]
21301 pfmul mm6
, [esp
+ mci3020_iqO
]
21302 pfmul mm7
, [esp
+ mci3020_iqH
] ;
# mm6=qqO, mm7=qqH
21303 movd
[esp
+ mci3020_qqO
], mm6
21304 movq
[esp
+ mci3020_qqH
], mm7
21306 lea eax
, [eax
+ eax
*2]
21308 movq mm0
, [esi
+ eax
*4]
21309 movd mm1
, [esi
+ eax
*4 + 8]
21310 ;
# copy & expand to mm2-mm4 for the H interactions
21318 pfsubr mm0
, [esp
+ mci3020_ixO
]
21319 pfsubr mm1
, [esp
+ mci3020_izO
]
21324 pfadd mm0
, mm1 ;
# mm0=rsqO
21328 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
21329 pfsubr mm2
, [esp
+ mci3020_ixH
]
21330 pfsubr mm3
, [esp
+ mci3020_iyH
]
21331 pfsubr mm4
, [esp
+ mci3020_izH
] ;
# mm2-mm4 is dxH-dzH
21338 pfadd mm3
,mm4 ;
# mm3=rsqH
21339 movq
[esp
+ mci3020_tmprsqH
], mm3
# rsqH is parked on the stack and reloaded into mm0 for the hydrogen part below
21346 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
21348 pfmul mm0
, mm1 ;
# mm0=r
21350 pfmul mm0
, [esp
+ mci3020_tsc
]
21352 movd
[esp
+ mci3020_n1
], mm4
21354 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
21356 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
21359 mov edx
, [ebp
+ mci3020_VFtab
]
21360 mov ecx
, [esp
+ mci3020_n1
]
21362 ;
# load all values we need
# four consecutive floats at the table index: +0 -> mm4, +4 -> mm5, +8 -> mm6 (G), +12 -> mm7 (H)
21363 movd mm4
, [edx
+ ecx
*4]
21364 movd mm5
, [edx
+ ecx
*4 + 4]
21365 movd mm6
, [edx
+ ecx
*4 + 8]
21366 movd mm7
, [edx
+ ecx
*4 + 12]
21368 pfmul mm6
, mm0 ;
# mm6 = Geps
21369 pfmul mm7
, mm2 ;
# mm7 = Heps2
21372 pfadd mm5
, mm7 ;
# mm5 = Fp
21374 pfmul mm5
, mm0 ;
# mm5=eps*Fp
21375 pfadd mm5
, mm4 ;
# mm5= VV
21377 pfmul mm5
, [esp
+ mci3020_qqO
] ;
# vcoul=qq*VV
21378 ;
# update vctot directly
21379 pfadd mm5
, [esp
+ mci3020_vctot
]
21380 movq
[esp
+ mci3020_vctot
], mm5
21382 ;
# now do the two hydrogens.
21383 movq mm0
, [esp
+ mci3020_tmprsqH
] ;
# mm0=rsqH
21389 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
21394 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
21396 pfmul mm0
,mm1 ;
# mm0=r
21397 pfmul mm0
, [esp
+ mci3020_tsc
]
21399 movq
[esp
+ mci3020_n1
], mm4
# both halves of mm4 are stored: n1 is the H1 table index, n1+4 the H2 one
21401 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
21403 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
21406 mov edx
, [ebp
+ mci3020_VFtab
]
21407 mov ecx
, [esp
+ mci3020_n1
]
21409 ;
# load all values we need
21410 movd mm4
, [edx
+ ecx
*4]
21411 movd mm5
, [edx
+ ecx
*4 + 4]
21412 movd mm6
, [edx
+ ecx
*4 + 8]
21413 movd mm7
, [edx
+ ecx
*4 + 12]
21414 mov ecx
, [esp
+ mci3020_n1
+ 4]
21416 punpckldq mm4
, [edx
+ ecx
*4]
21417 punpckldq mm5
, [edx
+ ecx
*4 + 4]
21418 punpckldq mm6
, [edx
+ ecx
*4 + 8]
21419 punpckldq mm7
, [edx
+ ecx
*4 + 12]
21421 pfmul mm6
, mm0 ;
# mm6 = Geps
21422 pfmul mm7
, mm2 ;
# mm7 = Heps2
21425 pfadd mm5
, mm7 ;
# mm5 = Fp
21427 pfmul mm5
, mm0 ;
# mm5=eps*Fp
21428 pfadd mm5
, mm4 ;
# mm5= VV
21430 pfmul mm5
, [esp
+ mci3020_qqH
] ;
# vcoul=qq*VV
21433 pfadd mm5
, [esp
+ mci3020_vctot
]
21434 movq
[esp
+ mci3020_vctot
], mm5
21436 ;
# done - one more?
21437 dec dword ptr
[esp
+ mci3020_innerk
]
21438 jz
.mci3020_updateouterdata
21439 jmp
.mci3020_inner_loop
21440 .mci3020_updateouterdata:
21441 mov edx
, [ebp
+ mci3020_gid
] ;
# get group index for this i particle
21443 add dword ptr
[ebp
+ mci3020_gid
], 4 ;
# advance pointer
21445 movq mm7
, [esp
+ mci3020_vctot
]
21446 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
21448 mov eax
, [ebp
+ mci3020_Vc
]
21449 movd mm6
, [eax
+ edx
*4]
21451 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
21454 dec dword ptr
[ebp
+ mci3020_nri
]
21456 ;
# not last, iterate once more!
# mcinl3030_3dnow (also exported as _mcinl3030_3dnow)
# Inner loop that accumulates a tabulated (VFtab) Coulomb potential between
# one three-site i molecule and three-site j molecules (j O at pos[j3],
# j H1 at +12 bytes, j H2 at +24 bytes), and adds the total to Vc[gid].
# The charge products qqOO, qqOH and qqHH are built once, from charge[ii0]
# and charge[ii0+1] of the first i molecule; the inner loop never reads
# charge[] - presumably every molecule carries the same charges (confirm
# against the caller).
# The argument list contains no force array; only vctot is kept.
21472 .globl mcinl3030_3dnow
21473 .globl _mcinl3030_3dnow
# argument offsets: every use below addresses them as [ebp + name]
21476 .equiv mci3030_nri, 8
21477 .equiv mci3030_iinr, 12
21478 .equiv mci3030_jindex, 16
21479 .equiv mci3030_jjnr, 20
21480 .equiv mci3030_shift, 24
21481 .equiv mci3030_shiftvec, 28
21482 .equiv mci3030_gid, 32
21483 .equiv mci3030_pos, 36
21484 .equiv mci3030_charge, 40
21485 .equiv mci3030_facel, 44
21486 .equiv mci3030_Vc, 48
21487 .equiv mci3030_tabscale, 52
21488 .equiv mci3030_VFtab, 56
21489 ;
# stack offsets for local variables
21490 .equiv mci3030_is3, 0
21491 .equiv mci3030_ii3, 4
21492 .equiv mci3030_ixO, 8
21493 .equiv mci3030_iyO, 12
21494 .equiv mci3030_izO, 16
21495 .equiv mci3030_ixH, 20
21496 .equiv mci3030_iyH, 28
21497 .equiv mci3030_izH, 36
21498 .equiv mci3030_qqOO, 44
21499 .equiv mci3030_qqOH, 52
21500 .equiv mci3030_qqHH, 60
21501 .equiv mci3030_n1, 68
21502 .equiv mci3030_tsc, 76
21503 .equiv mci3030_vctot, 84
21504 .equiv mci3030_innerjjnr, 92
21505 .equiv mci3030_innerk, 96
21506 .equiv mci3030_tmprsqH, 100
21515 sub esp
, 108 ;
# local stack space
# 108 bytes = last slot (tmprsqH at 100) plus its 8 bytes
21517 ;
# assume we have at least one i particle - start directly
21519 mov ecx
, [ebp
+ mci3030_iinr
] ;
# ecx = pointer into iinr[]
21520 mov ebx
, [ecx
] ;
# ebx=ii
21522 mov edx
, [ebp
+ mci3030_charge
]
21523 movd mm1
, [ebp
+ mci3030_facel
] ;
# mm1=facel
21524 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii0] (O)
21525 movd mm3
, [edx
+ ebx
*4 + 4] ;
# mm3=charge[ii0+1] (H)
21531 pfmul mm4
, mm2 ;
# mm4=qqOO*facel
21532 pfmul mm5
, mm3 ;
# mm5=qqOH*facel
21533 pfmul mm6
, mm3 ;
# mm6=qqHH*facel
21534 punpckldq mm5
,mm5 ;
# spread to both halves
21535 punpckldq mm6
,mm6 ;
# spread to both halves
21536 movq
[esp
+ mci3030_qqOO
], mm4
21537 movq
[esp
+ mci3030_qqOH
], mm5
21538 movq
[esp
+ mci3030_qqHH
], mm6
21539 movd mm3
, [ebp
+ mci3030_tabscale
]
21541 movq
[esp
+ mci3030_tsc
], mm3
21543 mov eax
, [ebp
+ mci3030_shift
] ;
# eax = pointer into shift[]
21544 mov ebx
, [eax
] ;
# ebx=shift[n]
21545 add dword ptr
[ebp
+ mci3030_shift
], 4 ;
# advance pointer one step
21547 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
21548 mov
[esp
+ mci3030_is3
],ebx ;
# store is3
21550 mov eax
, [ebp
+ mci3030_shiftvec
] ;
# eax = base of shiftvec[]
21552 movq mm5
, [eax
+ ebx
*4] ;
# move shX/shY to mm5 and shZ to mm6.
21553 movd mm6
, [eax
+ ebx
*4 + 8]
21557 punpckldq mm0
,mm0 ;
# also expand shX,Y,Z in mm0--mm2.
21561 mov ecx
, [ebp
+ mci3030_iinr
] ;
# ecx = pointer into iinr[]
21562 add dword ptr
[ebp
+ mci3030_iinr
], 4 ;
# advance pointer
21563 mov ebx
, [ecx
] ;
# ebx=ii
21565 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
21566 mov eax
, [ebp
+ mci3030_pos
] ;
# eax = base of pos[]
21568 pfadd mm5
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
21569 movd mm7
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
21570 mov
[esp
+ mci3030_ii3
], ebx ;
# (use mm7 as temp. storage for iz.)
21572 movq
[esp
+ mci3030_ixO
], mm5
21573 movq
[esp
+ mci3030_izO
], mm6
21575 movd mm3
, [eax
+ ebx
*4 + 12]
21576 movd mm4
, [eax
+ ebx
*4 + 16]
21577 movd mm5
, [eax
+ ebx
*4 + 20]
21578 punpckldq mm3
, [eax
+ ebx
*4 + 24]
21579 punpckldq mm4
, [eax
+ ebx
*4 + 28]
21580 punpckldq mm5
, [eax
+ ebx
*4 + 32] ;
# coords of H1 in low mm3-mm5, H2 in high
21585 movq
[esp
+ mci3030_ixH
], mm0
21586 movq
[esp
+ mci3030_iyH
], mm1
21587 movq
[esp
+ mci3030_izH
], mm2
21589 ;
# clear vctot (this routine keeps no i-force accumulators: none appear among its locals)
21591 movq
[esp
+ mci3030_vctot
], mm7
21593 mov eax
, [ebp
+ mci3030_jindex
]
21594 mov ecx
, [eax
] ;
# jindex[n]
21595 mov edx
, [eax
+ 4] ;
# jindex[n+1]
21596 add dword ptr
[ebp
+ mci3030_jindex
], 4
21597 sub edx
, ecx ;
# number of innerloop atoms
21598 mov
[esp
+ mci3030_innerk
], edx ;
# number of innerloop atoms
21600 mov esi
, [ebp
+ mci3030_pos
]
21601 mov eax
, [ebp
+ mci3030_jjnr
]
21604 mov
[esp
+ mci3030_innerjjnr
], eax ;
# pointer to jjnr[nj0]
21605 .mci3030_inner_loop:
21606 ;
# one j molecule per iteration: j O first, then j H1 (+12 bytes), then j H2 (+24 bytes);
# for each j site the i oxygen is handled first and the two i hydrogens follow as a packed pair.
21607 mov eax
, [esp
+ mci3030_innerjjnr
]
21608 mov eax
, [eax
] ;
# eax=jnr offset
21609 add dword ptr
[esp
+ mci3030_innerjjnr
], 4 ;
# advance pointer
21611 lea eax
, [eax
+ eax
*2]
21613 movq mm0
, [esi
+ eax
*4]
21614 movd mm1
, [esi
+ eax
*4 + 8]
21615 ;
# copy & expand to mm2-mm4 for the H interactions
21623 pfsubr mm0
, [esp
+ mci3030_ixO
]
21624 pfsubr mm1
, [esp
+ mci3030_izO
]
21629 pfadd mm0
, mm1 ;
# mm0=rsqO
21633 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
21634 pfsubr mm2
, [esp
+ mci3030_ixH
]
21635 pfsubr mm3
, [esp
+ mci3030_iyH
]
21636 pfsubr mm4
, [esp
+ mci3030_izH
] ;
# mm2-mm4 is dxH-dzH
21643 pfadd mm3
,mm4 ;
# mm3=rsqH
21644 movq
[esp
+ mci3030_tmprsqH
], mm3
21651 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
21652 pfmul mm0
, mm1 ;
# mm0=r
21654 pfmul mm0
, [esp
+ mci3030_tsc
]
21656 movd
[esp
+ mci3030_n1
], mm4
21658 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
21660 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
21663 mov edx
, [ebp
+ mci3030_VFtab
]
21664 mov ecx
, [esp
+ mci3030_n1
]
21667 ;
# load all values we need
# four consecutive floats at the table index: +0 -> mm4, +4 -> mm5, +8 -> mm6 (G), +12 -> mm7 (H)
21668 movd mm4
, [edx
+ ecx
*4]
21669 movd mm5
, [edx
+ ecx
*4 + 4]
21670 movd mm6
, [edx
+ ecx
*4 + 8]
21671 movd mm7
, [edx
+ ecx
*4 + 12]
21673 pfmul mm6
, mm0 ;
# mm6 = Geps
21674 pfmul mm7
, mm2 ;
# mm7 = Heps2
21677 pfadd mm5
, mm7 ;
# mm5 = Fp
21679 pfmul mm5
, mm0 ;
# mm5=eps*Fp
21680 pfadd mm5
, mm4 ;
# mm5= VV
21682 pfmul mm5
, [esp
+ mci3030_qqOO
] ;
# vcoul=qq*VV
21683 ;
# update vctot directly (only the potential is accumulated here).
21684 pfadd mm5
, [esp
+ mci3030_vctot
]
21685 movq
[esp
+ mci3030_vctot
], mm5
21687 ;
# time for hydrogens!
21689 movq mm0
, [esp
+ mci3030_tmprsqH
]
21695 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
21700 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
21702 pfmul mm0
,mm1 ;
# mm0=r
21703 pfmul mm0
, [esp
+ mci3030_tsc
]
21705 movq
[esp
+ mci3030_n1
], mm4
# both halves of mm4 are stored: n1 is the H1 table index, n1+4 the H2 one
21707 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
21709 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
21712 mov edx
, [ebp
+ mci3030_VFtab
]
21713 mov ecx
, [esp
+ mci3030_n1
]
21715 ;
# load all values we need
21716 movd mm4
, [edx
+ ecx
*4]
21717 movd mm5
, [edx
+ ecx
*4 + 4]
21718 movd mm6
, [edx
+ ecx
*4 + 8]
21719 movd mm7
, [edx
+ ecx
*4 + 12]
21720 mov ecx
, [esp
+ mci3030_n1
+ 4]
21722 punpckldq mm4
, [edx
+ ecx
*4]
21723 punpckldq mm5
, [edx
+ ecx
*4 + 4]
21724 punpckldq mm6
, [edx
+ ecx
*4 + 8]
21725 punpckldq mm7
, [edx
+ ecx
*4 + 12]
21727 pfmul mm6
, mm0 ;
# mm6 = Geps
21728 pfmul mm7
, mm2 ;
# mm7 = Heps2
21731 pfadd mm5
, mm7 ;
# mm5 = Fp
21733 pfmul mm5
, mm0 ;
# mm5=eps*Fp
21734 pfadd mm5
, mm4 ;
# mm5= VV
21736 pfmul mm5
, [esp
+ mci3030_qqOH
] ;
# vcoul=qq*VV
21738 pfadd mm5
, [esp
+ mci3030_vctot
]
21739 movq
[esp
+ mci3030_vctot
], mm5
21741 ;
# interactions with j H1
21743 movq mm0
, [esi
+ eax
*4 + 12]
21744 movd mm1
, [esi
+ eax
*4 + 20]
21745 ;
# copy & expand to mm2-mm4 for the H interactions
21753 pfsubr mm0
, [esp
+ mci3030_ixO
]
21754 pfsubr mm1
, [esp
+ mci3030_izO
]
21759 pfadd mm0
, mm1 ;
# mm0=rsqO (i oxygen to j H1)
21763 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
21764 pfsubr mm2
, [esp
+ mci3030_ixH
]
21765 pfsubr mm3
, [esp
+ mci3030_iyH
]
21766 pfsubr mm4
, [esp
+ mci3030_izH
] ;
# mm2-mm4 is dxH-dzH
21773 pfadd mm3
,mm4 ;
# mm3=rsqH
21774 movq
[esp
+ mci3030_tmprsqH
], mm3
21781 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
21782 pfmul mm0
, mm1 ;
# mm0=r
21784 pfmul mm0
, [esp
+ mci3030_tsc
]
21786 movd
[esp
+ mci3030_n1
], mm4
21788 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
21790 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
21793 mov edx
, [ebp
+ mci3030_VFtab
]
21794 mov ecx
, [esp
+ mci3030_n1
]
21797 ;
# load all values we need
21798 movd mm4
, [edx
+ ecx
*4]
21799 movd mm5
, [edx
+ ecx
*4 + 4]
21800 movd mm6
, [edx
+ ecx
*4 + 8]
21801 movd mm7
, [edx
+ ecx
*4 + 12]
21803 pfmul mm6
, mm0 ;
# mm6 = Geps
21804 pfmul mm7
, mm2 ;
# mm7 = Heps2
21807 pfadd mm5
, mm7 ;
# mm5 = Fp
21809 pfmul mm5
, mm0 ;
# mm5=eps*Fp
21810 pfadd mm5
, mm4 ;
# mm5= VV
21812 pfmul mm5
, [esp
+ mci3030_qqOH
] ;
# vcoul=qq*VV
21814 ;
# update vctot directly (only the potential is accumulated here)
21815 pfadd mm5
, [esp
+ mci3030_vctot
]
21816 movq
[esp
+ mci3030_vctot
], mm5
21818 movq mm0
, [esp
+ mci3030_tmprsqH
]
21824 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
21829 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
21831 pfmul mm0
,mm1 ;
# mm0=r
21832 pfmul mm0
, [esp
+ mci3030_tsc
]
21834 movq
[esp
+ mci3030_n1
], mm4
21836 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
21838 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
21841 mov edx
, [ebp
+ mci3030_VFtab
]
21842 mov ecx
, [esp
+ mci3030_n1
]
21844 ;
# load all values we need
21845 movd mm4
, [edx
+ ecx
*4]
21846 movd mm5
, [edx
+ ecx
*4 + 4]
21847 movd mm6
, [edx
+ ecx
*4 + 8]
21848 movd mm7
, [edx
+ ecx
*4 + 12]
21849 mov ecx
, [esp
+ mci3030_n1
+ 4]
21851 punpckldq mm4
, [edx
+ ecx
*4]
21852 punpckldq mm5
, [edx
+ ecx
*4 + 4]
21853 punpckldq mm6
, [edx
+ ecx
*4 + 8]
21854 punpckldq mm7
, [edx
+ ecx
*4 + 12]
21857 pfmul mm6
, mm0 ;
# mm6 = Geps
21858 pfmul mm7
, mm2 ;
# mm7 = Heps2
21861 pfadd mm5
, mm7 ;
# mm5 = Fp
21863 pfmul mm5
, mm0 ;
# mm5=eps*Fp
21864 pfadd mm5
, mm4 ;
# mm5= VV
21866 pfmul mm5
, [esp
+ mci3030_qqHH
] ;
# vcoul=qq*VV
21868 pfadd mm5
, [esp
+ mci3030_vctot
]
21869 movq
[esp
+ mci3030_vctot
], mm5
21871 ;
# interactions with j H2
21872 movq mm0
, [esi
+ eax
*4 + 24]
21873 movd mm1
, [esi
+ eax
*4 + 32]
21874 ;
# copy & expand to mm2-mm4 for the H interactions
21882 pfsubr mm0
, [esp
+ mci3030_ixO
]
21883 pfsubr mm1
, [esp
+ mci3030_izO
]
21888 pfadd mm0
, mm1 ;
# mm0=rsqO (i oxygen to j H2)
21892 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
21893 pfsubr mm2
, [esp
+ mci3030_ixH
]
21894 pfsubr mm3
, [esp
+ mci3030_iyH
]
21895 pfsubr mm4
, [esp
+ mci3030_izH
] ;
# mm2-mm4 is dxH-dzH
21902 pfadd mm3
,mm4 ;
# mm3=rsqH
21903 movq
[esp
+ mci3030_tmprsqH
], mm3
21910 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
21913 pfmul mm0
, [esp
+ mci3030_tsc
]
21915 movd
[esp
+ mci3030_n1
], mm4
21917 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
21919 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
21922 mov edx
, [ebp
+ mci3030_VFtab
]
21923 mov ecx
, [esp
+ mci3030_n1
]
21926 ;
# load all values we need
21927 movd mm4
, [edx
+ ecx
*4]
21928 movd mm5
, [edx
+ ecx
*4 + 4]
21929 movd mm6
, [edx
+ ecx
*4 + 8]
21930 movd mm7
, [edx
+ ecx
*4 + 12]
21932 pfmul mm6
, mm0 ;
# mm6 = Geps
21933 pfmul mm7
, mm2 ;
# mm7 = Heps2
21936 pfadd mm5
, mm7 ;
# mm5 = Fp
21938 pfmul mm5
, mm0 ;
# mm5=eps*Fp
21939 pfadd mm5
, mm4 ;
# mm5= VV
21941 pfmul mm5
, [esp
+ mci3030_qqOH
] ;
# vcoul=qq*VV
21943 ;
# update vctot directly (only the potential is accumulated here).
21944 pfadd mm5
, [esp
+ mci3030_vctot
]
21945 movq
[esp
+ mci3030_vctot
], mm5
21947 movq mm0
, [esp
+ mci3030_tmprsqH
]
21953 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
21958 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
21960 pfmul mm0
,mm1 ;
# mm0=r
21961 pfmul mm0
, [esp
+ mci3030_tsc
]
21963 movq
[esp
+ mci3030_n1
], mm4
21965 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
21967 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
21970 mov edx
, [ebp
+ mci3030_VFtab
]
21971 mov ecx
, [esp
+ mci3030_n1
]
21973 ;
# load all values we need
21974 movd mm4
, [edx
+ ecx
*4]
21975 movd mm5
, [edx
+ ecx
*4 + 4]
21976 movd mm6
, [edx
+ ecx
*4 + 8]
21977 movd mm7
, [edx
+ ecx
*4 + 12]
21978 mov ecx
, [esp
+ mci3030_n1
+ 4]
21980 punpckldq mm4
, [edx
+ ecx
*4]
21981 punpckldq mm5
, [edx
+ ecx
*4 + 4]
21982 punpckldq mm6
, [edx
+ ecx
*4 + 8]
21983 punpckldq mm7
, [edx
+ ecx
*4 + 12]
21986 pfmul mm6
, mm0 ;
# mm6 = Geps
21987 pfmul mm7
, mm2 ;
# mm7 = Heps2
21990 pfadd mm5
, mm7 ;
# mm5 = Fp
21992 pfmul mm5
, mm0 ;
# mm5=eps*Fp
21993 pfadd mm5
, mm4 ;
# mm5= VV
21995 pfmul mm5
, [esp
+ mci3030_qqHH
] ;
# vcoul=qq*VV
21997 pfadd mm5
, [esp
+ mci3030_vctot
]
21998 movq
[esp
+ mci3030_vctot
], mm5
22000 ;
# done - one more?
22001 dec dword ptr
[esp
+ mci3030_innerk
]
22002 jz
.mci3030_updateouterdata
22003 jmp
.mci3030_inner_loop
22004 .mci3030_updateouterdata:
22005 mov edx
, [ebp
+ mci3030_gid
] ;
# get group index for this i particle
22007 add dword ptr
[ebp
+ mci3030_gid
], 4 ;
# advance pointer
22009 movq mm7
, [esp
+ mci3030_vctot
]
22010 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
22012 mov eax
, [ebp
+ mci3030_Vc
]
22013 movd mm6
, [eax
+ edx
*4]
22015 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
22018 dec dword ptr
[ebp
+ mci3030_nri
]
22020 ;
# not last, iterate once more!
# mcinl3100_3dnow (also exported as _mcinl3100_3dnow)
# Inner loop for single i atoms that accumulates two potentials per i atom:
#   vctot  - tabulated (VFtab) Coulomb, added to Vc[gid]
#   vnbtot - c12*rinvtwelve - c6*rinvsix with c6/c12 read from nbfp,
#            added to Vnb[gid]
# j atoms are processed two at a time in .mci3100_unroll_loop (one per
# register half); a possible odd last atom goes through .mci3100_single_inner.
# The argument list contains no force array; only the two sums are kept.
22037 .globl mcinl3100_3dnow
22038 .globl _mcinl3100_3dnow
# argument offsets: every use below addresses them as [ebp + name]
22041 .equiv mci3100_nri, 8
22042 .equiv mci3100_iinr, 12
22043 .equiv mci3100_jindex, 16
22044 .equiv mci3100_jjnr, 20
22045 .equiv mci3100_shift, 24
22046 .equiv mci3100_shiftvec, 28
22047 .equiv mci3100_gid, 32
22048 .equiv mci3100_pos, 36
22049 .equiv mci3100_charge, 40
22050 .equiv mci3100_facel, 44
22051 .equiv mci3100_Vc, 48
22052 .equiv mci3100_type, 52
22053 .equiv mci3100_ntype, 56
22054 .equiv mci3100_nbfp, 60
22055 .equiv mci3100_Vnb, 64
22056 .equiv mci3100_tabscale, 68
22057 .equiv mci3100_VFtab, 72
22058 ;
# stack offsets for local variables
22059 .equiv mci3100_is3, 0
22060 .equiv mci3100_ii3, 4
22061 .equiv mci3100_ix, 8
22062 .equiv mci3100_iy, 12
22063 .equiv mci3100_iz, 16
22064 .equiv mci3100_iq, 20
22065 .equiv mci3100_vctot, 28
22066 .equiv mci3100_vnbtot, 36
22067 .equiv mci3100_c6, 44
22068 .equiv mci3100_c12, 52
22069 .equiv mci3100_n1, 60
22070 .equiv mci3100_tsc, 68
22071 .equiv mci3100_ntia, 76
22072 .equiv mci3100_innerjjnr, 80
22073 .equiv mci3100_innerk, 84
22082 sub esp
, 88 ;
# local stack space
# 88 bytes = last slot (innerk at 84) plus its 4 bytes
22084 ;
# move data to local stack
22085 movd mm3
, [ebp
+ mci3100_tabscale
]
22087 movq
[esp
+ mci3100_tsc
], mm3
22088 ;
# assume we have at least one i particle - start directly
22090 mov eax
, [ebp
+ mci3100_shift
] ;
# eax = pointer into shift[]
22091 mov ebx
, [eax
] ;
# ebx=shift[n]
22092 add dword ptr
[ebp
+ mci3100_shift
], 4 ;
# advance pointer one step
22094 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
22095 mov
[esp
+ mci3100_is3
],ebx ;
# store is3
22097 mov eax
, [ebp
+ mci3100_shiftvec
] ;
# eax = base of shiftvec[]
22099 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
22100 movd mm1
, [eax
+ ebx
*4 + 8]
22102 mov ecx
, [ebp
+ mci3100_iinr
] ;
# ecx = pointer into iinr[]
22103 add dword ptr
[ebp
+ mci3100_iinr
], 4 ;
# advance pointer
22104 mov ebx
, [ecx
] ;
# ebx=ii
22106 mov edx
, [ebp
+ mci3100_charge
]
22107 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
22108 pfmul mm2
, [ebp
+ mci3100_facel
]
22109 punpckldq mm2
,mm2 ;
# spread to both halves
22110 movq
[esp
+ mci3100_iq
], mm2 ;
# iq =facel*charge[ii]
22112 mov edx
, [ebp
+ mci3100_type
]
22113 mov edx
, [edx
+ ebx
*4]
22114 imul edx
, [ebp
+ mci3100_ntype
]
22116 mov
[esp
+ mci3100_ntia
], edx
22118 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
22119 mov eax
, [ebp
+ mci3100_pos
] ;
# eax = base of pos[]
22121 pfadd mm0
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
22122 movd mm3
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
22123 mov
[esp
+ mci3100_ii3
], ebx
22125 movq
[esp
+ mci3100_ix
], mm0
22126 movd
[esp
+ mci3100_iz
], mm1
22128 ;
# clear both potential sums, vctot and vnbtot (no i-force slots exist among the locals)
22130 movq
[esp
+ mci3100_vctot
], mm7
22131 movq
[esp
+ mci3100_vnbtot
], mm7
22133 mov eax
, [ebp
+ mci3100_jindex
]
22134 mov ecx
, [eax
] ;
# jindex[n]
22135 mov edx
, [eax
+ 4] ;
# jindex[n+1]
22136 add dword ptr
[ebp
+ mci3100_jindex
], 4
22137 sub edx
, ecx ;
# number of innerloop atoms
22139 mov esi
, [ebp
+ mci3100_pos
]
22140 mov eax
, [ebp
+ mci3100_jjnr
]
22143 mov
[esp
+ mci3100_innerjjnr
], eax ;
# pointer to jjnr[nj0]
22145 mov
[esp
+ mci3100_innerk
], edx ;
# number of innerloop atoms
# mov leaves the flags untouched, so the jge below still acts on the result
# of the last arithmetic instruction executed before these stores
22146 jge
.mci3100_unroll_loop
22147 jmp
.mci3100_finish_inner
22148 .mci3100_unroll_loop:
22149 ;
# paired innerloop starts here
22150 mov ecx
, [esp
+ mci3100_innerjjnr
] ;
# pointer to jjnr[k]
22152 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
22153 add dword ptr
[esp
+ mci3100_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
22154 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
22156 mov ecx
, [ebp
+ mci3100_charge
] ;
# base of charge[]
22157 movq mm5
, [esp
+ mci3100_iq
]
22158 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
22159 punpckldq mm3
, [ecx
+ ebx
*4] ;
# move charge 2 to high part of mm3
22160 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
22162 mov ecx
, [ebp
+ mci3100_type
]
22163 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
22164 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
22166 mov esi
, [ebp
+ mci3100_nbfp
] ;
# base of nbfp
22169 add edx
, [esp
+ mci3100_ntia
] ;
# tja = ntia + 2*type
22170 add ecx
, [esp
+ mci3100_ntia
]
22172 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
22173 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
22175 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
22176 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
22177 movq
[esp
+ mci3100_c6
], mm5
22178 movq
[esp
+ mci3100_c12
], mm6
22180 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
22181 lea ebx
, [ebx
+ ebx
*2]
22183 mov esi
, [ebp
+ mci3100_pos
]
22185 movq mm0
, [esp
+ mci3100_ix
]
22186 movd mm1
, [esp
+ mci3100_iz
]
22187 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
22188 movd mm5
, [esi
+ eax
*4 + 8]
22189 pfsubr mm4
,mm0 ;
# dr = ir - jr
22191 pfmul mm4
,mm4 ;
# square dx,dy,dz
22193 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
22194 pfacc mm4
, mm5 ;
# first rsq in lower mm4
22196 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
22197 movd mm7
, [esi
+ ebx
*4 + 8]
22199 pfsubr mm6
,mm0 ;
# dr = ir - jr
22201 pfmul mm6
,mm6 ;
# square dx,dy,dz
22203 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
22204 pfacc mm6
, mm7 ;
# second rsq in lower mm6
22206 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
22210 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
22211 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision.
22217 ;
# mm0 is invsqrt, and mm1 r.
22218 ;
# do potential and fscal
22219 pfmul mm1
, [esp
+ mci3100_tsc
] ;
# mm1=rt
22221 movq
[esp
+ mci3100_n1
], mm4
# both halves of mm4 are stored: n1 is the table index of pair 1, n1+4 of pair 2
22223 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
22226 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
22228 mov edx
, [ebp
+ mci3100_VFtab
]
22229 mov ecx
, [esp
+ mci3100_n1
]
22232 ;
# load all the table values we need
# four consecutive floats at each table index: +0 -> mm4, +4 -> mm5, +8 -> mm6 (G), +12 -> mm7 (H);
# the first index fills the low halves, the second index the high halves
22233 movd mm4
, [edx
+ ecx
*4]
22234 movd mm5
, [edx
+ ecx
*4 + 4]
22235 movd mm6
, [edx
+ ecx
*4 + 8]
22236 movd mm7
, [edx
+ ecx
*4 + 12]
22237 mov ecx
, [esp
+ mci3100_n1
+ 4]
22239 punpckldq mm4
, [edx
+ ecx
*4]
22240 punpckldq mm5
, [edx
+ ecx
*4 + 4]
22241 punpckldq mm6
, [edx
+ ecx
*4 + 8]
22242 punpckldq mm7
, [edx
+ ecx
*4 + 12]
22244 pfmul mm6
, mm1 ;
# mm6 = Geps
22245 pfmul mm7
, mm2 ;
# mm7 = Heps2
22248 pfadd mm5
, mm7 ;
# mm5 = Fp
22250 pfmul mm5
, mm1 ;
# mm5=eps*Fp
22251 pfadd mm5
, mm4 ;
# mm5= VV
22253 pfmul mm5
, mm3 ;
# vcoul=qq*VV
22256 pfmul mm1
,mm1 ;
# mm1=invsq
22259 pfmul mm2
,mm1 ;
# mm2=rinvsix
22261 pfmul mm1
,mm1 ;
# mm1=rinvtwelve
22263 pfmul mm3
, [esp
+ mci3100_tsc
]
22265 pfmul mm1
, [esp
+ mci3100_c12
]
22267 pfmul mm2
, [esp
+ mci3100_c6
]
22270 pfsub mm4
, mm2 ;
# mm4 = vnb12-vnb6
22272 pfadd mm5
, [esp
+ mci3100_vctot
] ;
# add the earlier value
22273 movq
[esp
+ mci3100_vctot
], mm5 ;
# store the sum
22275 pfadd mm4
, [esp
+ mci3100_vnbtot
] ;
# add the earlier value
22276 movq
[esp
+ mci3100_vnbtot
], mm4 ;
# store the sum
22278 ;
# should we do one more iteration?
# (two j atoms were consumed, so the remaining count drops by 2)
22279 sub dword ptr
[esp
+ mci3100_innerk
], 2
22280 jl
.mci3100_finish_inner
22281 jmp
.mci3100_unroll_loop
22282 .mci3100_finish_inner:
# bit 0 of innerk selects the single-particle tail iteration
22283 and dword ptr
[esp
+ mci3100_innerk
], 1
22284 jnz
.mci3100_single_inner
22285 jmp
.mci3100_updateouterdata
22286 .mci3100_single_inner:
22287 ;
# a single j particle iteration here - compare with the unrolled code for comments.
22288 mov eax
, [esp
+ mci3100_innerjjnr
]
22289 mov eax
, [eax
] ;
# eax=jnr offset
22291 mov ecx
, [ebp
+ mci3100_charge
]
22292 movd mm5
, [esp
+ mci3100_iq
]
22293 movd mm3
, [ecx
+ eax
*4]
22294 pfmul mm3
, mm5 ;
# mm3=qq
22296 mov esi
, [ebp
+ mci3100_nbfp
]
22297 mov ecx
, [ebp
+ mci3100_type
]
22298 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
22300 add edx
, [esp
+ mci3100_ntia
] ;
# tja = ntia + 2*type
22301 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
22302 movq
[esp
+ mci3100_c6
], mm5
22303 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
22304 movq
[esp
+ mci3100_c12
], mm5
22307 mov esi
, [ebp
+ mci3100_pos
]
22308 lea eax
, [eax
+ eax
*2]
22310 movq mm0
, [esp
+ mci3100_ix
]
22311 movd mm1
, [esp
+ mci3100_iz
]
22312 movq mm4
, [esi
+ eax
*4]
22313 movd mm5
, [esi
+ eax
*4 + 8]
22319 pfacc mm4
, mm5 ;
# mm4=rsq
22325 pfrcpit2 mm0
,mm2 ;
# mm0=invsqrt
22328 ;
# mm0 is invsqrt, and mm1 r.
22329 ;
# calculate potentials and scalar force
22330 pfmul mm1
, [esp
+ mci3100_tsc
] ;
# mm1=rt
22332 movd
[esp
+ mci3100_n1
], mm4
22334 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
22337 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
22340 mov edx
, [ebp
+ mci3100_VFtab
]
22341 mov ecx
, [esp
+ mci3100_n1
]
22343 ;
# load all the table values we need
22344 movd mm4
, [edx
+ ecx
*4]
22345 movd mm5
, [edx
+ ecx
*4 + 4]
22346 movd mm6
, [edx
+ ecx
*4 + 8]
22347 movd mm7
, [edx
+ ecx
*4 + 12]
22349 pfmul mm6
, mm1 ;
# mm6 = Geps
22350 pfmul mm7
, mm2 ;
# mm7 = Heps2
22353 pfadd mm5
, mm7 ;
# mm5 = Fp
22355 pfmul mm5
, mm1 ;
# mm5=eps*Fp
22356 pfadd mm5
, mm4 ;
# mm5= VV
22358 pfmul mm5
, mm3 ;
# vcoul=qq*VV
22359 ;
# at this point mm5 contains vcoul
22362 pfmul mm1
,mm1 ;
# mm1=invsq
22365 pfmul mm2
,mm1 ;
# mm2=rinvsix
22367 pfmul mm1
,mm1 ;
# mm1=rinvtwelve
22369 pfmul mm3
, [esp
+ mci3100_tsc
]
22371 pfmul mm1
, [esp
+ mci3100_c12
]
22373 pfmul mm2
, [esp
+ mci3100_c6
]
22376 pfsub mm4
, mm2 ;
# mm4 = vnb12-vnb6
22378 pfadd mm5
, [esp
+ mci3100_vctot
] ;
# add the earlier value
22379 movq
[esp
+ mci3100_vctot
], mm5 ;
# store the sum
22381 pfadd mm4
, [esp
+ mci3100_vnbtot
] ;
# add the earlier value
22382 movq
[esp
+ mci3100_vnbtot
], mm4 ;
# store the sum
22384 .mci3100_updateouterdata:
22385 mov edx
, [ebp
+ mci3100_gid
] ;
# get group index for this i particle
22387 add dword ptr
[ebp
+ mci3100_gid
], 4 ;
# advance pointer
22389 movq mm7
, [esp
+ mci3100_vctot
]
22390 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
22392 mov eax
, [ebp
+ mci3100_Vc
]
22393 movd mm6
, [eax
+ edx
*4]
22395 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
22397 movq mm7
, [esp
+ mci3100_vnbtot
]
22398 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
22400 mov eax
, [ebp
+ mci3100_Vnb
]
22401 movd mm6
, [eax
+ edx
*4]
22403 movd
[eax
+ edx
*4], mm6 ;
# increment vnb[gid]
22406 mov ecx
, [ebp
+ mci3100_nri
]
22409 ;
# not last, iterate once more!
22410 mov
[ebp
+ mci3100_nri
], ecx
;# mcinl3110_3dnow is exported under both the plain and the underscore-prefixed symbol name.
;# The mci3110_* constants with values 8..76 are byte offsets that the code adds to ebp
;# (NOTE(review): presumably the caller's arguments - the frame setup is not visible here, confirm).
22427 .globl mcinl3110_3dnow
22428 .globl _mcinl3110_3dnow
22431 .equiv mci3110_nri, 8
22432 .equiv mci3110_iinr, 12
22433 .equiv mci3110_jindex, 16
22434 .equiv mci3110_jjnr, 20
22435 .equiv mci3110_shift, 24
22436 .equiv mci3110_shiftvec, 28
22437 .equiv mci3110_gid, 32
22438 .equiv mci3110_pos, 36
22439 .equiv mci3110_charge, 40
22440 .equiv mci3110_facel, 44
22441 .equiv mci3110_Vc, 48
22442 .equiv mci3110_type, 52
22443 .equiv mci3110_ntype, 56
22444 .equiv mci3110_nbfp, 60
22445 .equiv mci3110_Vnb, 64
22446 .equiv mci3110_tabscale, 68
22447 .equiv mci3110_VFtab, 72
22448 .equiv mci3110_nsatoms, 76
22449 ;
# stack offsets for local variables
;# These offsets are added to esp. The last slot (solnr, 132) is a dword, so the locals end at
;# byte 136, which is the amount the routine subtracts from esp. The slots iq, vctot, vnbtot,
;# c6, c12, two, n1 and tsc are written with movq, i.e. as 8-byte quantities.
22450 .equiv mci3110_is3, 0
22451 .equiv mci3110_ii3, 4
22452 .equiv mci3110_shX, 8
22453 .equiv mci3110_shY, 12
22454 .equiv mci3110_shZ, 16
22455 .equiv mci3110_ix, 20
22456 .equiv mci3110_iy, 24
22457 .equiv mci3110_iz, 28
22458 .equiv mci3110_iq, 32
22459 .equiv mci3110_vctot, 40
22460 .equiv mci3110_vnbtot, 48
22461 .equiv mci3110_c6, 56
22462 .equiv mci3110_c12, 64
22463 .equiv mci3110_two, 72
22464 .equiv mci3110_n1, 80
22465 .equiv mci3110_tsc, 88
22466 .equiv mci3110_ntia, 96
22467 .equiv mci3110_innerjjnr0, 104
22468 .equiv mci3110_innerk0, 108
22469 .equiv mci3110_innerjjnr, 112
22470 .equiv mci3110_innerk, 116
22471 .equiv mci3110_nsvdwc, 120
22472 .equiv mci3110_nscoul, 124
22473 .equiv mci3110_nsvdw, 128
22474 .equiv mci3110_solnr, 132
22483 sub esp
, 136 ;
# local stack space
22486 movd mm3
, [ebp
+ mci3110_tabscale
]
22487 movq
[esp
+ mci3110_two
], mm2
22489 movq
[esp
+ mci3110_tsc
], mm3
22490 ;
# assume we have at least one i particle - start directly
22492 mov eax
, [ebp
+ mci3110_shift
] ;
# eax = pointer into shift[]
22493 mov ebx
, [eax
] ;
# ebx=shift[n]
22494 add dword ptr
[ebp
+ mci3110_shift
], 4 ;
# advance pointer one step
22496 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
22497 mov
[esp
+ mci3110_is3
],ebx ;
# store is3
22499 mov eax
, [ebp
+ mci3110_shiftvec
] ;
# eax = base of shiftvec[]
22501 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
22502 movd mm1
, [eax
+ ebx
*4 + 8]
22503 movq
[esp
+ mci3110_shX
], mm0
22504 movd
[esp
+ mci3110_shZ
], mm1
22506 mov ecx
, [ebp
+ mci3110_iinr
] ;
# ecx = pointer into iinr[]
22507 add dword ptr
[ebp
+ mci3110_iinr
], 4 ;
# advance pointer
22508 mov ebx
, [ecx
] ;
# ebx=ii
22510 mov eax
, [ebp
+ mci3110_nsatoms
]
22511 add dword ptr
[ebp
+ mci3110_nsatoms
], 12
22518 mov
[esp
+ mci3110_nsvdwc
], edx
22519 mov
[esp
+ mci3110_nscoul
], eax
22520 mov
[esp
+ mci3110_nsvdw
], ecx
22524 movq
[esp
+ mci3110_vctot
], mm7
22525 movq
[esp
+ mci3110_vnbtot
], mm7
22526 mov
[esp
+ mci3110_solnr
], ebx
22528 mov eax
, [ebp
+ mci3110_jindex
]
22529 mov ecx
, [eax
] ;
# jindex[n]
22530 mov edx
, [eax
+ 4] ;
# jindex[n+1]
22531 add dword ptr
[ebp
+ mci3110_jindex
], 4
22532 sub edx
, ecx ;
# number of innerloop atoms
22533 mov eax
, [ebp
+ mci3110_jjnr
]
22536 mov
[esp
+ mci3110_innerjjnr0
], eax ;
# pointer to jjnr[nj0]
22538 mov
[esp
+ mci3110_innerk0
], edx ;
# number of innerloop atoms
22539 mov esi
, [ebp
+ mci3110_pos
]
22541 mov ecx
, [esp
+ mci3110_nsvdwc
]
22543 jnz
.mci3110_mno_vdwc
22544 jmp
.mci3110_testcoul
22546 mov ebx
, [esp
+ mci3110_solnr
]
22547 inc dword ptr
[esp
+ mci3110_solnr
]
22548 mov edx
, [ebp
+ mci3110_charge
]
22549 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
22550 pfmul mm2
, [ebp
+ mci3110_facel
]
22551 punpckldq mm2
,mm2 ;
# spread to both halves
22552 movq
[esp
+ mci3110_iq
], mm2 ;
# iq =facel*charge[ii]
22554 mov edx
, [ebp
+ mci3110_type
]
22555 mov edx
, [edx
+ ebx
*4]
22556 imul edx
, [ebp
+ mci3110_ntype
]
22558 mov
[esp
+ mci3110_ntia
], edx
22560 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
22561 mov eax
, [ebp
+ mci3110_pos
] ;
# eax = base of pos[]
22562 mov
[esp
+ mci3110_ii3
], ebx
22564 movq mm0
, [eax
+ ebx
*4]
22565 movd mm1
, [eax
+ ebx
*4 + 8]
22566 pfadd mm0
, [esp
+ mci3110_shX
]
22567 pfadd mm1
, [esp
+ mci3110_shZ
]
22568 movq
[esp
+ mci3110_ix
], mm0
22569 movd
[esp
+ mci3110_iz
], mm1
22571 mov ecx
, [esp
+ mci3110_innerjjnr0
]
22572 mov
[esp
+ mci3110_innerjjnr
], ecx
22573 mov edx
, [esp
+ mci3110_innerk0
]
22575 mov
[esp
+ mci3110_innerk
], edx ;
# number of innerloop atoms
22576 jge
.mci3110_unroll_vdwc_loop
22577 jmp
.mci3110_finish_vdwc_inner
22578 .mci3110_unroll_vdwc_loop:
22579 ;
# paired innerloop starts here
22580 mov ecx
, [esp
+ mci3110_innerjjnr
] ;
# pointer to jjnr[k]
22582 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
22583 add dword ptr
[esp
+ mci3110_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
22584 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
22586 mov ecx
, [ebp
+ mci3110_charge
] ;
# base of charge[]
22587 movq mm5
, [esp
+ mci3110_iq
]
22588 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
22589 punpckldq mm3
, [ecx
+ ebx
*4] ;
# move charge 2 to high part of mm3
22590 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
22592 mov ecx
, [ebp
+ mci3110_type
]
22593 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
22594 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
22596 mov esi
, [ebp
+ mci3110_nbfp
] ;
# base of nbfp
22599 add edx
, [esp
+ mci3110_ntia
] ;
# tja = ntia + 2*type
22600 add ecx
, [esp
+ mci3110_ntia
]
22602 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
22603 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
22605 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
22606 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
22607 movq
[esp
+ mci3110_c6
], mm5
22608 movq
[esp
+ mci3110_c12
], mm6
22610 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
22611 lea ebx
, [ebx
+ ebx
*2]
22613 mov esi
, [ebp
+ mci3110_pos
]
22615 movq mm0
, [esp
+ mci3110_ix
]
22616 movd mm1
, [esp
+ mci3110_iz
]
22617 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
22618 movd mm5
, [esi
+ eax
*4 + 8]
22619 pfsubr mm4
,mm0 ;
# dr = ir - jr
22621 pfmul mm4
,mm4 ;
# square dx,dy,dz
22623 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
22624 pfacc mm4
, mm5 ;
# first rsq in lower mm4
22626 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
22627 movd mm7
, [esi
+ ebx
*4 + 8]
22629 pfsubr mm6
,mm0 ;
# dr = ir - jr
22631 pfmul mm6
,mm6 ;
# square dx,dy,dz
22633 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
22634 pfacc mm6
, mm7 ;
# second rsq in lower mm6
22636 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
22641 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
22642 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision.
22648 ;
# mm0 is invsqrt, and mm1 r.
22649 ;
# do potential and fscal
22650 pfmul mm1
, [esp
+ mci3110_tsc
] ;
# mm1=rt
22652 movq
[esp
+ mci3110_n1
], mm4
22654 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
22657 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
22659 mov edx
, [ebp
+ mci3110_VFtab
]
22660 mov ecx
, [esp
+ mci3110_n1
]
22663 ;
# load all the table values we need
22664 movd mm4
, [edx
+ ecx
*4]
22665 movd mm5
, [edx
+ ecx
*4 + 4]
22666 movd mm6
, [edx
+ ecx
*4 + 8]
22667 movd mm7
, [edx
+ ecx
*4 + 12]
22668 mov ecx
, [esp
+ mci3110_n1
+ 4]
22670 punpckldq mm4
, [edx
+ ecx
*4]
22671 punpckldq mm5
, [edx
+ ecx
*4 + 4]
22672 punpckldq mm6
, [edx
+ ecx
*4 + 8]
22673 punpckldq mm7
, [edx
+ ecx
*4 + 12]
22675 pfmul mm6
, mm1 ;
# mm6 = Geps
22676 pfmul mm7
, mm2 ;
# mm7 = Heps2
22679 pfadd mm5
, mm7 ;
# mm5 = Fp
22681 pfmul mm7
, [esp
+ mci3110_two
] ;
# two*Heps2
22683 pfmul mm5
, mm1 ;
# mm5=eps*Fp
22684 pfadd mm5
, mm4 ;
# mm5= VV
22686 pfmul mm5
, mm3 ;
# vcoul=qq*VV
22689 pfmul mm1
,mm1 ;
# mm1=invsq
22692 pfmul mm2
,mm1 ;
# mm2=rinvsix
22694 pfmul mm1
,mm1 ;
# mm1=rinvtwelve
22696 pfmul mm3
, [esp
+ mci3110_tsc
]
22698 pfmul mm1
, [esp
+ mci3110_c12
]
22700 pfmul mm2
, [esp
+ mci3110_c6
]
22703 pfsub mm4
, mm2 ;
# mm4 = vnb12-vnb6
22705 pfadd mm5
, [esp
+ mci3110_vctot
] ;
# add the earlier value
22706 movq
[esp
+ mci3110_vctot
], mm5 ;
# store the sum
22708 pfadd mm4
, [esp
+ mci3110_vnbtot
] ;
# add the earlier value
22709 movq
[esp
+ mci3110_vnbtot
], mm4 ;
# store the sum
22711 ;
# should we do one more iteration?
22712 sub dword ptr
[esp
+ mci3110_innerk
], 2
22713 jl
.mci3110_finish_vdwc_inner
22714 jmp
.mci3110_unroll_vdwc_loop
;# Exit of the paired vdwc loop: innerk is reduced to its lowest bit. If that bit is set,
;# one more (unpaired) j particle is processed by the single-particle code; otherwise
;# control goes straight to .mci3110_updateouterdata_vdwc.
22715 .mci3110_finish_vdwc_inner:
22716 and dword ptr
[esp
+ mci3110_innerk
], 1
22717 jnz
.mci3110_single_vdwc_inner
22718 jmp
.mci3110_updateouterdata_vdwc
;# Single-particle tail of the vdwc inner loop: processes one j atom and updates the
;# vctot and vnbtot accumulators kept on the stack.
22719 .mci3110_single_vdwc_inner:
22720 ;
# a single j particle iteration here - compare with the unrolled code for comments.
22721 mov eax
, [esp
+ mci3110_innerjjnr
]
22722 mov eax
, [eax
] ;
# eax=jnr offset
;# qq: charge[jnr] times the value stored in iq (only the low dword of iq is loaded here).
22724 mov ecx
, [ebp
+ mci3110_charge
]
22725 movd mm5
, [esp
+ mci3110_iq
]
22726 movd mm3
, [ecx
+ eax
*4]
22727 pfmul mm3
, mm5 ;
# mm3=qq
;# Non-bonded parameters: edx = type[jnr] + ntia is used as dword index into nbfp; the two
;# consecutive dwords found there are saved as c6 and c12. movd clears the upper half of
;# mm5, so each movq below stores the value with a zero upper dword.
22729 mov esi
, [ebp
+ mci3110_nbfp
]
22730 mov ecx
, [ebp
+ mci3110_type
]
22731 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
22733 add edx
, [esp
+ mci3110_ntia
] ;
# tja = ntia + 2*type
22734 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
22735 movq
[esp
+ mci3110_c6
], mm5
22736 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
22737 movq
[esp
+ mci3110_c12
], mm5
;# eax becomes 3*jnr, the dword index of the j coordinates in pos[]. mm0/mm1 receive the
;# stored i coordinates (ix,iy / iz) and mm4/mm5 the j coordinates (x,y / z).
22740 mov esi
, [ebp
+ mci3110_pos
]
22741 lea eax
, [eax
+ eax
*2]
22743 movq mm0
, [esp
+ mci3110_ix
]
22744 movd mm1
, [esp
+ mci3110_iz
]
22745 movq mm4
, [esi
+ eax
*4]
22746 movd mm5
, [esi
+ eax
*4 + 8]
22752 pfacc mm4
, mm5 ;
# mm4=rsq
22758 pfrcpit2 mm0
,mm2 ;
# mm0=invsqrt
22761 ;
# mm0 is invsqrt, and mm1 r.
22762 ;
# calculate potentials and scalar force
22763 pfmul mm1
, [esp
+ mci3110_tsc
] ;
# mm1=rt
22765 movd
[esp
+ mci3110_n1
], mm4
22767 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
22770 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
;# ecx = the dword just stored in n1; it indexes VFtab, from which four consecutive
;# dwords are read into mm4..mm7.
22773 mov edx
, [ebp
+ mci3110_VFtab
]
22774 mov ecx
, [esp
+ mci3110_n1
]
22776 ;
# load all the table values we need
22777 movd mm4
, [edx
+ ecx
*4]
22778 movd mm5
, [edx
+ ecx
*4 + 4]
22779 movd mm6
, [edx
+ ecx
*4 + 8]
22780 movd mm7
, [edx
+ ecx
*4 + 12]
22782 pfmul mm6
, mm1 ;
# mm6 = Geps
22783 pfmul mm7
, mm2 ;
# mm7 = Heps2
22786 pfadd mm5
, mm7 ;
# mm5 = Fp
22788 pfmul mm7
, [esp
+ mci3110_two
] ;
# two*Heps2
22790 pfmul mm5
, mm1 ;
# mm5=eps*Fp
22791 pfadd mm5
, mm4 ;
# mm5= VV
22793 pfmul mm5
, mm3 ;
# vcoul=qq*VV
22796 pfmul mm1
,mm1 ;
# mm1=invsq
22799 pfmul mm2
,mm1 ;
# mm2=rinvsix
22801 pfmul mm1
,mm1 ;
# mm1=rinvtwelve
22803 pfmul mm3
, [esp
+ mci3110_tsc
]
22805 pfmul mm1
, [esp
+ mci3110_c12
]
22807 pfmul mm2
, [esp
+ mci3110_c6
]
22810 pfsub mm4
, mm2 ;
# mm4 = vnb12-vnb6
;# Fold this pair's contributions into the running totals on the stack.
22812 pfadd mm5
, [esp
+ mci3110_vctot
] ;
# add the earlier value
22813 movq
[esp
+ mci3110_vctot
], mm5 ;
# store the sum
22815 pfadd mm4
, [esp
+ mci3110_vnbtot
] ;
# add the earlier value
22816 movq
[esp
+ mci3110_vnbtot
], mm4 ;
# store the sum
;# One atom of the vdwc class is finished: decrement the remaining count nsvdwc. When it
;# reaches zero continue at .mci3110_testcoul, otherwise run another pass from .mci3110_mno_vdwc.
22818 .mci3110_updateouterdata_vdwc:
22819 ;
# loop back to mno
22820 dec dword ptr
[esp
+ mci3110_nsvdwc
]
22821 jz
.mci3110_testcoul
22822 jmp
.mci3110_mno_vdwc
22824 mov ecx
, [esp
+ mci3110_nscoul
]
22826 jnz
.mci3110_mno_coul
22827 jmp
.mci3110_testvdw
22829 mov ebx
, [esp
+ mci3110_solnr
]
22830 inc dword ptr
[esp
+ mci3110_solnr
]
22831 mov edx
, [ebp
+ mci3110_charge
]
22832 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
22833 pfmul mm2
, [ebp
+ mci3110_facel
]
22834 punpckldq mm2
,mm2 ;
# spread to both halves
22835 movq
[esp
+ mci3110_iq
], mm2 ;
# iq =facel*charge[ii]
22837 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
22838 mov eax
, [ebp
+ mci3110_pos
] ;
# eax = base of pos[]
22839 mov
[esp
+ mci3110_ii3
], ebx
22841 movq mm0
, [eax
+ ebx
*4]
22842 movd mm1
, [eax
+ ebx
*4 + 8]
22843 pfadd mm0
, [esp
+ mci3110_shX
]
22844 pfadd mm1
, [esp
+ mci3110_shZ
]
22845 movq
[esp
+ mci3110_ix
], mm0
22846 movd
[esp
+ mci3110_iz
], mm1
22848 mov ecx
, [esp
+ mci3110_innerjjnr0
]
22849 mov
[esp
+ mci3110_innerjjnr
], ecx
22850 mov edx
, [esp
+ mci3110_innerk0
]
22852 mov
[esp
+ mci3110_innerk
], edx ;
# number of innerloop atoms
22853 jge
.mci3110_unroll_coul_loop
22854 jmp
.mci3110_finish_coul_inner
22855 .mci3110_unroll_coul_loop:
22856 ;
# paired innerloop starts here
22857 mov ecx
, [esp
+ mci3110_innerjjnr
] ;
# pointer to jjnr[k]
22859 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
22860 add dword ptr
[esp
+ mci3110_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
22861 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
22863 mov ecx
, [ebp
+ mci3110_charge
] ;
# base of charge[]
22864 movq mm5
, [esp
+ mci3110_iq
]
22865 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
22866 punpckldq mm3
, [ecx
+ ebx
*4] ;
# move charge 2 to high part of mm3
22867 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
22869 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
22870 lea ebx
, [ebx
+ ebx
*2]
22872 mov esi
, [ebp
+ mci3110_pos
]
22874 movq mm0
, [esp
+ mci3110_ix
]
22875 movd mm1
, [esp
+ mci3110_iz
]
22876 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
22877 movd mm5
, [esi
+ eax
*4 + 8]
22878 pfsubr mm4
,mm0 ;
# dr = ir - jr
22880 pfmul mm4
,mm4 ;
# square dx,dy,dz
22882 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
22883 pfacc mm4
, mm5 ;
# first rsq in lower mm4
22885 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
22886 movd mm7
, [esi
+ ebx
*4 + 8]
22888 pfsubr mm6
,mm0 ;
# dr = ir - jr
22890 pfmul mm6
,mm6 ;
# square dx,dy,dz
22892 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
22893 pfacc mm6
, mm7 ;
# second rsq in lower mm6
22895 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
22899 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
22900 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision.
22906 ;
# mm0 is invsqrt, and mm1 r.
22907 ;
# do potential and fscal
22908 pfmul mm1
, [esp
+ mci3110_tsc
] ;
# mm1=rt
22910 movq
[esp
+ mci3110_n1
], mm4
22912 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
22915 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
22917 mov edx
, [ebp
+ mci3110_VFtab
]
22918 mov ecx
, [esp
+ mci3110_n1
]
22921 ;
# load all the table values we need
22922 movd mm4
, [edx
+ ecx
*4]
22923 movd mm5
, [edx
+ ecx
*4 + 4]
22924 movd mm6
, [edx
+ ecx
*4 + 8]
22925 movd mm7
, [edx
+ ecx
*4 + 12]
22926 mov ecx
, [esp
+ mci3110_n1
+ 4]
22928 punpckldq mm4
, [edx
+ ecx
*4]
22929 punpckldq mm5
, [edx
+ ecx
*4 + 4]
22930 punpckldq mm6
, [edx
+ ecx
*4 + 8]
22931 punpckldq mm7
, [edx
+ ecx
*4 + 12]
22933 pfmul mm6
, mm1 ;
# mm6 = Geps
22934 pfmul mm7
, mm2 ;
# mm7 = Heps2
22937 pfadd mm5
, mm7 ;
# mm5 = Fp
22939 pfmul mm7
, [esp
+ mci3110_two
] ;
# two*Heps2
22941 pfmul mm5
, mm1 ;
# mm5=eps*Fp
22942 pfadd mm5
, mm4 ;
# mm5= VV
22944 pfmul mm5
, mm3 ;
# vcoul=qq*VV
22946 ;
# at this point mm5 contains vcoul
22947 ;
# increment vcoul - then we can get rid of mm5
22949 pfadd mm5
, [esp
+ mci3110_vctot
] ;
# add the earlier value
22950 movq
[esp
+ mci3110_vctot
], mm5 ;
# store the sum
22952 ;
# should we do one more iteration?
22953 sub dword ptr
[esp
+ mci3110_innerk
], 2
22954 jl
.mci3110_finish_coul_inner
22955 jmp
.mci3110_unroll_coul_loop
;# Exit of the paired coul loop: innerk is reduced to its lowest bit. If that bit is set,
;# one more (unpaired) j particle is processed by the single-particle code; otherwise
;# control goes straight to .mci3110_updateouterdata_coul.
22956 .mci3110_finish_coul_inner:
22957 and dword ptr
[esp
+ mci3110_innerk
], 1
22958 jnz
.mci3110_single_coul_inner
22959 jmp
.mci3110_updateouterdata_coul
;# Single-particle tail of the coul inner loop: processes one j atom and updates only
;# the vctot accumulator on the stack (no c6/c12 lookup is done in this variant).
22960 .mci3110_single_coul_inner:
22961 ;
# a single j particle iteration here - compare with the unrolled code for comments.
22962 mov eax
, [esp
+ mci3110_innerjjnr
]
22963 mov eax
, [eax
] ;
# eax=jnr offset
;# qq: charge[jnr] times the value stored in iq (only the low dword of iq is loaded here).
22965 mov ecx
, [ebp
+ mci3110_charge
]
22966 movd mm5
, [esp
+ mci3110_iq
]
22967 movd mm3
, [ecx
+ eax
*4]
22968 pfmul mm3
, mm5 ;
# mm3=qq
;# eax becomes 3*jnr, the dword index of the j coordinates in pos[]. mm0/mm1 receive the
;# stored i coordinates (ix,iy / iz) and mm4/mm5 the j coordinates (x,y / z).
22970 mov esi
, [ebp
+ mci3110_pos
]
22971 lea eax
, [eax
+ eax
*2]
22973 movq mm0
, [esp
+ mci3110_ix
]
22974 movd mm1
, [esp
+ mci3110_iz
]
22975 movq mm4
, [esi
+ eax
*4]
22976 movd mm5
, [esi
+ eax
*4 + 8]
22982 pfacc mm4
, mm5 ;
# mm4=rsq
22988 pfrcpit2 mm0
,mm2 ;
# mm0=invsqrt
22991 ;
# mm0 is invsqrt, and mm1 r.
22993 ;
# calculate potentials and scalar force
22994 pfmul mm1
, [esp
+ mci3110_tsc
] ;
# mm1=rt
22996 movd
[esp
+ mci3110_n1
], mm4
22998 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
23001 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
;# ecx = the dword just stored in n1; it indexes VFtab, from which four consecutive
;# dwords are read into mm4..mm7.
23004 mov edx
, [ebp
+ mci3110_VFtab
]
23005 mov ecx
, [esp
+ mci3110_n1
]
23007 ;
# load all the table values we need
23008 movd mm4
, [edx
+ ecx
*4]
23009 movd mm5
, [edx
+ ecx
*4 + 4]
23010 movd mm6
, [edx
+ ecx
*4 + 8]
23011 movd mm7
, [edx
+ ecx
*4 + 12]
23013 pfmul mm6
, mm1 ;
# mm6 = Geps
23014 pfmul mm7
, mm2 ;
# mm7 = Heps2
23017 pfadd mm5
, mm7 ;
# mm5 = Fp
23019 pfmul mm7
, [esp
+ mci3110_two
] ;
# two*Heps2
23021 pfmul mm5
, mm1 ;
# mm5=eps*Fp
23022 pfadd mm5
, mm4 ;
# mm5= VV
23024 pfmul mm5
, mm3 ;
# vcoul=qq*VV
23026 ;
# at this point mm5 contains vcoul
23027 ;
# increment vcoul - then we can get rid of mm5
23029 pfadd mm5
, [esp
+ mci3110_vctot
] ;
# add the earlier value
23030 movq
[esp
+ mci3110_vctot
], mm5 ;
# store the sum
;# One atom of the coul class is finished: decrement the remaining count nscoul. When it
;# reaches zero continue at .mci3110_testvdw, otherwise run another pass from .mci3110_mno_coul.
23032 .mci3110_updateouterdata_coul:
23033 ;
# loop back to mno
23034 dec dword ptr
[esp
+ mci3110_nscoul
]
23035 jz
.mci3110_testvdw
23036 jmp
.mci3110_mno_coul
23038 mov ecx
, [esp
+ mci3110_nsvdw
]
23040 jnz
.mci3110_mno_vdw
23041 jmp
.mci3110_last_mno
23043 mov ebx
, [esp
+ mci3110_solnr
]
23044 inc dword ptr
[esp
+ mci3110_solnr
]
23046 mov edx
, [ebp
+ mci3110_type
]
23047 mov edx
, [edx
+ ebx
*4]
23048 imul edx
, [ebp
+ mci3110_ntype
]
23050 mov
[esp
+ mci3110_ntia
], edx
23052 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
23053 mov eax
, [ebp
+ mci3110_pos
] ;
# eax = base of pos[]
23054 mov
[esp
+ mci3110_ii3
], ebx
23056 movq mm0
, [eax
+ ebx
*4]
23057 movd mm1
, [eax
+ ebx
*4 + 8]
23058 pfadd mm0
, [esp
+ mci3110_shX
]
23059 pfadd mm1
, [esp
+ mci3110_shZ
]
23060 movq
[esp
+ mci3110_ix
], mm0
23061 movd
[esp
+ mci3110_iz
], mm1
23063 mov ecx
, [esp
+ mci3110_innerjjnr0
]
23064 mov
[esp
+ mci3110_innerjjnr
], ecx
23065 mov edx
, [esp
+ mci3110_innerk0
]
23067 mov
[esp
+ mci3110_innerk
], edx ;
# number of innerloop atoms
23068 jge
.mci3110_unroll_vdw_loop
23069 jmp
.mci3110_finish_vdw_inner
23070 .mci3110_unroll_vdw_loop:
23071 ;
# paired innerloop starts here
23072 mov ecx
, [esp
+ mci3110_innerjjnr
] ;
# pointer to jjnr[k]
23074 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
23075 add dword ptr
[esp
+ mci3110_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
23076 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
23078 mov ecx
, [ebp
+ mci3110_type
]
23079 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
23080 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
23082 mov esi
, [ebp
+ mci3110_nbfp
] ;
# base of nbfp
23085 add edx
, [esp
+ mci3110_ntia
] ;
# tja = ntia + 2*type
23086 add ecx
, [esp
+ mci3110_ntia
]
23088 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
23089 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
23091 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
23092 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
23093 movq
[esp
+ mci3110_c6
], mm5
23094 movq
[esp
+ mci3110_c12
], mm6
23096 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
23097 lea ebx
, [ebx
+ ebx
*2]
23099 mov esi
, [ebp
+ mci3110_pos
]
23101 movq mm0
, [esp
+ mci3110_ix
]
23102 movd mm1
, [esp
+ mci3110_iz
]
23103 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
23104 movd mm5
, [esi
+ eax
*4 + 8]
23105 pfsubr mm4
,mm0 ;
# dr = ir - jr
23107 pfmul mm4
,mm4 ;
# square dx,dy,dz
23109 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
23110 pfacc mm4
, mm5 ;
# first rsq in lower mm4
23112 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
23113 movd mm7
, [esi
+ ebx
*4 + 8]
23115 pfsubr mm6
,mm0 ;
# dr = ir - jr
23117 pfmul mm6
,mm6 ;
# square dx,dy,dz
23119 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
23120 pfacc mm6
, mm7 ;
# second rsq in lower mm6
23122 pfrcp mm0
, mm4 ;
# lookup reciprocal seed
23126 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
23127 ;
# amd 3dnow N-R iteration to get full precision.
23130 ;
# mm4 now contains invsq,
23131 ;
# do potential and fscal
23135 pfmul mm4
, mm0 ;
# mm4=rinvsix
23137 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
23139 pfmul mm5
, [esp
+ mci3110_c12
]
23140 pfmul mm4
, [esp
+ mci3110_c6
]
23141 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
23144 pfadd mm6
, [esp
+ mci3110_vnbtot
] ;
# add the earlier value
23145 movq
[esp
+ mci3110_vnbtot
], mm6 ;
# store the sum
23147 ;
# should we do one more iteration?
23148 sub dword ptr
[esp
+ mci3110_innerk
], 2
23149 jl
.mci3110_finish_vdw_inner
23150 jmp
.mci3110_unroll_vdw_loop
;# Exit of the paired vdw loop: innerk is reduced to its lowest bit. If that bit is set,
;# one more (unpaired) j particle is processed by the single-particle code; otherwise
;# control goes straight to .mci3110_updateouterdata_vdw.
23151 .mci3110_finish_vdw_inner:
23152 and dword ptr
[esp
+ mci3110_innerk
], 1
23153 jnz
.mci3110_single_vdw_inner
23154 jmp
.mci3110_updateouterdata_vdw
23155 .mci3110_single_vdw_inner:
23156 ;
# a single j particle iteration here - compare with the unrolled code for comments.
23157 mov eax
, [esp
+ mci3110_innerjjnr
]
23158 mov eax
, [eax
] ;
# eax=jnr offset
23160 mov esi
, [ebp
+ mci3110_nbfp
]
23161 mov ecx
, [ebp
+ mci3110_type
]
23162 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
23164 add edx
, [esp
+ mci3110_ntia
] ;
# tja = ntia + 2*type
23165 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
23166 movq
[esp
+ mci3110_c6
], mm5
23167 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
23168 movq
[esp
+ mci3110_c12
], mm5
23170 mov esi
, [ebp
+ mci3110_pos
]
23171 lea eax
, [eax
+ eax
*2]
23173 movq mm0
, [esp
+ mci3110_ix
]
23174 movd mm1
, [esp
+ mci3110_iz
]
23175 movq mm4
, [esi
+ eax
*4]
23176 movd mm5
, [esi
+ eax
*4 + 8]
23182 pfacc mm4
, mm5 ;
# mm4=rsq
23186 pfrcpit2 mm4
,mm0 ;
# mm4=invsq
23187 ;
# calculate potentials and scalar force
23191 pfmul mm4
, mm0 ;
# mm4=rinvsix
23193 pfmul mm5
, mm5 ;
# mm5=rinvtwelve
23195 pfmul mm5
, [esp
+ mci3110_c12
]
23196 pfmul mm4
, [esp
+ mci3110_c6
]
23197 movq mm6
, mm5 ;
# mm6 is vnb12-vnb6
23200 pfadd mm6
, [esp
+ mci3110_vnbtot
] ;
# add the earlier value
23201 movq
[esp
+ mci3110_vnbtot
], mm6 ;
# store the sum
;# One atom of the vdw class is finished: decrement the remaining count nsvdw. When it
;# reaches zero continue at .mci3110_last_mno, otherwise run another pass from .mci3110_mno_vdw.
23203 .mci3110_updateouterdata_vdw:
23204 ;
# loop back to mno
23205 dec dword ptr
[esp
+ mci3110_nsvdw
]
23206 jz
.mci3110_last_mno
23207 jmp
.mci3110_mno_vdw
;# End-of-entry bookkeeping: advance the gid pointer, add the two halves of the packed
;# vctot and vnbtot accumulators together (pfacc), and access Vc[] and Vnb[] at dword index edx.
23210 mov edx
, [ebp
+ mci3110_gid
] ;
# get group index for this i particle
23212 add dword ptr
[ebp
+ mci3110_gid
], 4 ;
# advance pointer
23214 movq mm7
, [esp
+ mci3110_vctot
]
23215 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
23217 mov eax
, [ebp
+ mci3110_Vc
]
;# NOTE(review): no instruction adding mm7 to mm6 is visible between this load and the
;# store below - confirm the accumulation is present.
23218 movd mm6
, [eax
+ edx
*4]
23220 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
23222 movq mm7
, [esp
+ mci3110_vnbtot
]
23223 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
23225 mov eax
, [ebp
+ mci3110_Vnb
]
23226 movd mm6
, [eax
+ edx
*4]
23228 movd
[eax
+ edx
*4], mm6 ;
# increment vnb[gid]
;# Reload the outer-loop counter nri and write it back to its slot.
23230 mov ecx
, [ebp
+ mci3110_nri
]
23233 ;
# not last, iterate once more!
23234 mov
[ebp
+ mci3110_nri
], ecx
;# mcinl3120_3dnow is exported under both the plain and the underscore-prefixed symbol name.
;# The mci3120_* constants with values 8..72 are byte offsets that the code adds to ebp
;# (NOTE(review): presumably the caller's arguments - the frame setup is not visible here, confirm).
23251 .globl mcinl3120_3dnow
23252 .globl _mcinl3120_3dnow
23255 .equiv mci3120_nri, 8
23256 .equiv mci3120_iinr, 12
23257 .equiv mci3120_jindex, 16
23258 .equiv mci3120_jjnr, 20
23259 .equiv mci3120_shift, 24
23260 .equiv mci3120_shiftvec, 28
23261 .equiv mci3120_gid, 32
23262 .equiv mci3120_pos, 36
23263 .equiv mci3120_charge, 40
23264 .equiv mci3120_facel, 44
23265 .equiv mci3120_Vc, 48
23266 .equiv mci3120_type, 52
23267 .equiv mci3120_ntype, 56
23268 .equiv mci3120_nbfp, 60
23269 .equiv mci3120_Vnb, 64
23270 .equiv mci3120_tabscale, 68
23271 .equiv mci3120_VFtab, 72
23272 ;
# stack offsets for local variables
;# These offsets are added to esp. The last slot (tmprsqH, 136) is written with movq, so the
;# locals end at byte 144, which is the amount the routine subtracts from esp. The ixH/iyH/izH
;# slots are 8 bytes each (written with movq), while the O coordinate slots are 4 bytes apart.
23273 .equiv mci3120_is3, 0
23274 .equiv mci3120_ii3, 4
23275 .equiv mci3120_ixO, 8
23276 .equiv mci3120_iyO, 12
23277 .equiv mci3120_izO, 16
23278 .equiv mci3120_ixH, 20
23279 .equiv mci3120_iyH, 28
23280 .equiv mci3120_izH, 36
23281 .equiv mci3120_iqO, 44
23282 .equiv mci3120_iqH, 52
23283 .equiv mci3120_qqO, 60
23284 .equiv mci3120_qqH, 68
23285 .equiv mci3120_vctot, 76
23286 .equiv mci3120_vnbtot, 84
23287 .equiv mci3120_c6, 92
23288 .equiv mci3120_c12, 100
23289 .equiv mci3120_n1, 108
23290 .equiv mci3120_tsc, 116
23291 .equiv mci3120_ntia, 124
23292 .equiv mci3120_innerjjnr, 128
23293 .equiv mci3120_innerk, 132
23294 .equiv mci3120_tmprsqH, 136
23303 sub esp
, 144 ;
# local stack space
23306 mov ecx
, [ebp
+ mci3120_iinr
] ;
# ecx = pointer into iinr[]
23307 mov ebx
, [ecx
] ;
# ebx=ii
23309 mov edx
, [ebp
+ mci3120_charge
]
23310 movd mm1
, [ebp
+ mci3120_facel
]
23311 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii0]
23313 movq
[esp
+ mci3120_iqO
], mm2 ;
# iqO = facel*charge[ii]
23315 movd mm2
, [edx
+ ebx
*4 + 4] ;
# mm2=charge[ii0+1]
23317 punpckldq mm2
,mm2 ;
# spread to both halves
23318 movq
[esp
+ mci3120_iqH
], mm2 ;
# iqH = facel*charge[ii0+1]
23320 mov edx
, [ebp
+ mci3120_type
]
23321 mov edx
, [edx
+ ebx
*4]
23324 imul ecx
, [ebp
+ mci3120_ntype
] ;
# ecx = ntia = 2*ntype*type[ii0]
23325 mov
[esp
+ mci3120_ntia
], ecx
23327 movq mm6
, [ebp
+ mci3120_tabscale
]
23328 punpckldq mm6
,mm6 ;
# spread to both halves
23329 movq
[esp
+ mci3120_tsc
], mm6
23330 ;
# assume we have at least one i particle - start directly
23332 mov eax
, [ebp
+ mci3120_shift
] ;
# eax = pointer into shift[]
23333 mov ebx
, [eax
] ;
# ebx=shift[n]
23334 add dword ptr
[ebp
+ mci3120_shift
], 4 ;
# advance pointer one step
23336 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
23337 mov
[esp
+ mci3120_is3
],ebx ;
# store is3
23339 mov eax
, [ebp
+ mci3120_shiftvec
] ;
# eax = base of shiftvec[]
23341 movq mm5
, [eax
+ ebx
*4] ;
# move shX/shY to mm5 and shZ to mm6.
23342 movd mm6
, [eax
+ ebx
*4 + 8]
23346 punpckldq mm0
,mm0 ;
# also expand shX,Y,Z in mm0--mm2.
23350 mov ecx
, [ebp
+ mci3120_iinr
] ;
# ecx = pointer into iinr[]
23351 add dword ptr
[ebp
+ mci3120_iinr
], 4 ;
# advance pointer
23352 mov ebx
, [ecx
] ;
# ebx=ii
23354 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
23355 mov eax
, [ebp
+ mci3120_pos
] ;
# eax = base of pos[]
23357 pfadd mm5
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
23358 movd mm7
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
23359 mov
[esp
+ mci3120_ii3
], ebx ;
# (use mm7 as temp. storage for iz.)
23361 movq
[esp
+ mci3120_ixO
], mm5
23362 movq
[esp
+ mci3120_izO
], mm6
23364 movd mm3
, [eax
+ ebx
*4 + 12]
23365 movd mm4
, [eax
+ ebx
*4 + 16]
23366 movd mm5
, [eax
+ ebx
*4 + 20]
23367 punpckldq mm3
, [eax
+ ebx
*4 + 24]
23368 punpckldq mm4
, [eax
+ ebx
*4 + 28]
23369 punpckldq mm5
, [eax
+ ebx
*4 + 32] ;
# coords of H1 in low mm3-mm5, H2 in high
23374 movq
[esp
+ mci3120_ixH
], mm0
23375 movq
[esp
+ mci3120_iyH
], mm1
23376 movq
[esp
+ mci3120_izH
], mm2
23378 ;
# reset vctot and vnbtot (both are written from mm7; this routine stores no i forces)
;# NOTE(review): the instruction that zeroes mm7 is not visible before these stores - confirm.
23380 movq
[esp
+ mci3120_vctot
], mm7
23381 movq
[esp
+ mci3120_vnbtot
], mm7
;# Inner-loop trip count: difference of two consecutive jindex entries, stored in innerk.
;# The jindex pointer itself is advanced by one dword for the next outer iteration.
23383 mov eax
, [ebp
+ mci3120_jindex
]
23384 mov ecx
, [eax
] ;
# jindex[n]
23385 mov edx
, [eax
+ 4] ;
# jindex[n+1]
23386 add dword ptr
[ebp
+ mci3120_jindex
], 4
23387 sub edx
, ecx ;
# number of innerloop atoms
23388 mov
[esp
+ mci3120_innerk
], edx
;# esi = base of pos[]; it is used as the coordinate base inside the inner loop.
23390 mov esi
, [ebp
+ mci3120_pos
]
23391 mov eax
, [ebp
+ mci3120_jjnr
]
23394 mov
[esp
+ mci3120_innerjjnr
], eax ;
# pointer to jjnr[nj0]
;# Head of the j-particle loop: fetch the next j index through innerjjnr and advance that
;# pointer by one dword (one j particle per iteration).
23395 .mci3120_inner_loop:
23396 ;
# a single j particle iteration here - compare with the unrolled code for comments.
23397 mov eax
, [esp
+ mci3120_innerjjnr
]
23398 mov eax
, [eax
] ;
# eax=jnr offset
23399 add dword ptr
[esp
+ mci3120_innerjjnr
], 4 ;
# advance pointer
;# NOTE(review): ecx is not loaded inside this loop before the prefetch below, so the
;# prefetched address depends on whatever earlier code left in ecx. prefetch is only a
;# cache hint, so results are unaffected - confirm the intended base register.
23400 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
23402 mov ecx
, [ebp
+ mci3120_charge
]
23403 movd mm7
, [ecx
+ eax
*4]
23406 pfmul mm6
, [esp
+ mci3120_iqO
]
23407 pfmul mm7
, [esp
+ mci3120_iqH
] ;
# mm6=qqO, mm7=qqH
23408 movd
[esp
+ mci3120_qqO
], mm6
23409 movq
[esp
+ mci3120_qqH
], mm7
23411 mov ecx
, [ebp
+ mci3120_type
]
23412 mov edx
, [ecx
+ eax
*4] ;
# type [jnr]
23413 mov ecx
, [ebp
+ mci3120_nbfp
]
23415 add edx
, [esp
+ mci3120_ntia
] ;
# tja = ntia + 2*type
23416 movd mm5
, [ecx
+ edx
*4] ;
# mm5 = 1st c6
23417 movq
[esp
+ mci3120_c6
], mm5
23418 movd mm5
, [ecx
+ edx
*4 + 4] ;
# mm5 = 1st c12
23419 movq
[esp
+ mci3120_c12
], mm5
23421 lea eax
, [eax
+ eax
*2]
23423 movq mm0
, [esi
+ eax
*4]
23424 movd mm1
, [esi
+ eax
*4 + 8]
23425 ;
# copy & expand to mm2-mm4 for the H interactions
23433 pfsubr mm0
, [esp
+ mci3120_ixO
]
23434 pfsubr mm1
, [esp
+ mci3120_izO
]
23439 pfadd mm0
, mm1 ;
# mm0=rsqO
23443 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
23444 pfsubr mm2
, [esp
+ mci3120_ixH
]
23445 pfsubr mm3
, [esp
+ mci3120_iyH
]
23446 pfsubr mm4
, [esp
+ mci3120_izH
] ;
# mm2-mm4 is dxH-dzH
23453 pfadd mm3
,mm4 ;
# mm3=rsqH
23454 movq
[esp
+ mci3120_tmprsqH
], mm3
23461 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
23463 pfmul mm0
, mm1 ;
# mm0=r
23465 pfmul mm0
, [esp
+ mci3120_tsc
]
23467 movd
[esp
+ mci3120_n1
], mm4
23469 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
23471 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
23474 mov edx
, [ebp
+ mci3120_VFtab
]
23475 mov ecx
, [esp
+ mci3120_n1
]
23477 ;
# load all values we need
23478 movd mm4
, [edx
+ ecx
*4]
23479 movd mm5
, [edx
+ ecx
*4 + 4]
23480 movd mm6
, [edx
+ ecx
*4 + 8]
23481 movd mm7
, [edx
+ ecx
*4 + 12]
23483 pfmul mm6
, mm0 ;
# mm6 = Geps
23484 pfmul mm7
, mm2 ;
# mm7 = Heps2
23487 pfadd mm5
, mm7 ;
# mm5 = Fp
23489 pfmul mm5
, mm0 ;
# mm5=eps*Fp
23490 pfadd mm5
, mm4 ;
# mm5= VV
23492 pfmul mm5
, [esp
+ mci3120_qqO
] ;
# vcoul=qq*VV
23493 ;
# update vctot directly
23494 pfadd mm5
, [esp
+ mci3120_vctot
]
23495 movq
[esp
+ mci3120_vctot
], mm5
23497 ;
# nontabulated LJ - mm1 is invsqrt. - keep mm1!
23499 pfmul mm0
, mm0 ;
# mm0 is invsq
23502 pfmul mm2
, mm0 ;
# mm2 = rinvsix
23504 pfmul mm4
, mm4 ;
# mm4=rinvtwelve
23506 pfmul mm4
, [esp
+ mci3120_c12
]
23507 pfmul mm2
, [esp
+ mci3120_c6
]
23508 pfsub mm4
, mm2 ;
# mm4=vnb12-vnb6
23511 pfadd mm4
, [esp
+ mci3120_vnbtot
] ;
# add the earlier value
23512 movq
[esp
+ mci3120_vnbtot
], mm4 ;
# store the sum
23514 ;
# now do the two hydrogens.
23515 movq mm0
, [esp
+ mci3120_tmprsqH
] ;
# mm0=rsqH
23521 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
23526 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
23528 pfmul mm0
,mm1 ;
# mm0=r
23529 pfmul mm0
, [esp
+ mci3120_tsc
]
23531 movq
[esp
+ mci3120_n1
], mm4
23533 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
23535 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
23538 mov edx
, [ebp
+ mci3120_VFtab
]
23539 mov ecx
, [esp
+ mci3120_n1
]
23541 ;
# load all values we need
23542 movd mm4
, [edx
+ ecx
*4]
23543 movd mm5
, [edx
+ ecx
*4 + 4]
23544 movd mm6
, [edx
+ ecx
*4 + 8]
23545 movd mm7
, [edx
+ ecx
*4 + 12]
23546 mov ecx
, [esp
+ mci3120_n1
+ 4]
23548 punpckldq mm4
, [edx
+ ecx
*4]
23549 punpckldq mm5
, [edx
+ ecx
*4 + 4]
23550 punpckldq mm6
, [edx
+ ecx
*4 + 8]
23551 punpckldq mm7
, [edx
+ ecx
*4 + 12]
23553 pfmul mm6
, mm0 ;
# mm6 = Geps
23554 pfmul mm7
, mm2 ;
# mm7 = Heps2
23557 pfadd mm5
, mm7 ;
# mm5 = Fp
23560 pfmul mm5
, mm0 ;
# mm5=eps*Fp
23561 pfadd mm5
, mm4 ;
# mm5= VV
23563 pfmul mm5
, [esp
+ mci3120_qqH
] ;
# vcoul=qq*VV
23565 pfadd mm5
, [esp
+ mci3120_vctot
]
23566 movq
[esp
+ mci3120_vctot
], mm5
23568 ;
# done - one more?
23569 dec dword ptr
[esp
+ mci3120_innerk
]
23570 jz
.mci3120_updateouterdata
23571 jmp
.mci3120_inner_loop
;# End-of-entry bookkeeping: advance the gid pointer, add the two halves of the packed
;# vctot and vnbtot accumulators together (pfacc), access Vc[] and Vnb[] at dword index edx,
;# and decrement the outer-loop counter nri.
23572 .mci3120_updateouterdata:
23574 mov edx
, [ebp
+ mci3120_gid
] ;
# get group index for this i particle
23576 add dword ptr
[ebp
+ mci3120_gid
], 4 ;
# advance pointer
23578 movq mm7
, [esp
+ mci3120_vctot
]
23579 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
23581 mov eax
, [ebp
+ mci3120_Vc
]
;# NOTE(review): no instruction adding mm7 to mm6 is visible between this load and the
;# store below - confirm the accumulation is present.
23582 movd mm6
, [eax
+ edx
*4]
23584 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
23586 movq mm7
, [esp
+ mci3120_vnbtot
]
23587 pfacc mm7
,mm7 ;
# same for Vnb
23589 mov eax
, [ebp
+ mci3120_Vnb
]
23590 movd mm6
, [eax
+ edx
*4]
23592 movd
[eax
+ edx
*4], mm6 ;
# increment vnb[gid]
23594 dec dword ptr
[ebp
+ mci3120_nri
]
23596 ;
# not last, iterate once more!
;# mcinl3130_3dnow: symbol exports and offset tables.
;# The routine is exported both with and without a leading underscore.
23614 .globl mcinl3130_3dnow
23615 .globl _mcinl3130_3dnow
;# Byte offsets of the incoming arguments; the code addresses them as
;# [ebp + name]. They are 4 bytes apart, starting at 8.
23618 .equiv mci3130_nri, 8
23619 .equiv mci3130_iinr, 12
23620 .equiv mci3130_jindex, 16
23621 .equiv mci3130_jjnr, 20
23622 .equiv mci3130_shift, 24
23623 .equiv mci3130_shiftvec, 28
23624 .equiv mci3130_gid, 32
23625 .equiv mci3130_pos, 36
23626 .equiv mci3130_charge, 40
23627 .equiv mci3130_facel, 44
23628 .equiv mci3130_Vc, 48
23629 .equiv mci3130_type, 52
23630 .equiv mci3130_ntype, 56
23631 .equiv mci3130_nbfp, 60
23632 .equiv mci3130_Vnb, 64
23633 .equiv mci3130_tabscale, 68
23634 .equiv mci3130_VFtab, 72
23635 ;
# stack offsets for local variables
;# Addressed as [esp + name]. Slots from ixH through vnbtot are spaced 8 bytes
;# apart (they are written with 64-bit movq); the others are 4-byte slots,
;# except tmprsqH, which is also written with movq.
23636 .equiv mci3130_is3, 0
23637 .equiv mci3130_ii3, 4
23638 .equiv mci3130_ixO, 8
23639 .equiv mci3130_iyO, 12
23640 .equiv mci3130_izO, 16
23641 .equiv mci3130_ixH, 20
23642 .equiv mci3130_iyH, 28
23643 .equiv mci3130_izH, 36
23644 .equiv mci3130_qqOO, 44
23645 .equiv mci3130_qqOH, 52
23646 .equiv mci3130_qqHH, 60
23647 .equiv mci3130_c6, 68
23648 .equiv mci3130_c12, 76
23649 .equiv mci3130_n1, 84
23650 .equiv mci3130_tsc, 92
23651 .equiv mci3130_vctot, 100
23652 .equiv mci3130_vnbtot, 108
23653 .equiv mci3130_innerjjnr, 116
23654 .equiv mci3130_innerk, 120
23655 .equiv mci3130_tmprsqH, 124
;# Per-i-water setup for mcinl3130: reserve the local frame, then fill the
;# esp-relative slots that .mci3130_inner_loop reads - charge products
;# (qqOO/qqOH/qqHH), c6/c12, the table scale, the i coordinates, the
;# vctot/vnbtot accumulators, and the j-list pointer and count.
;# NOTE(review): the embedded line numbers skip in several places in this
;# section (e.g. 23674 -> 23680, 23692 -> 23694, 23713 -> 23717,
;# 23740 -> 23745, 23749 -> 23751), so part of the register initialisation is
;# not visible here - confirm against the full source before relying on it.
23664 sub esp
, 132 ;
# local stack space (last slot mci3130_tmprsqH at 124, written with an 8-byte movq)
23666 ;
# assume we have at least one i particle - start directly
23668 mov ecx
, [ebp
+ mci3130_iinr
] ;
# ecx = pointer into iinr[]
23669 mov ebx
, [ecx
] ;
# ebx=ii
23671 mov edx
, [ebp
+ mci3130_charge
]
23672 movd mm1
, [ebp
+ mci3130_facel
] ;
# mm1=facel
23673 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii0] (O)
23674 movd mm3
, [edx
+ ebx
*4 + 4] ;
# mm3=charge[ii0+1] (H)
;# NOTE(review): mm4-mm6 are multiplied below, but no visible instruction
;# loads them first (lines 23675-23679 are absent from this listing).
23680 pfmul mm4
, mm2 ;
# mm4=qqOO*facel
23681 pfmul mm5
, mm3 ;
# mm5=qqOH*facel
23682 pfmul mm6
, mm3 ;
# mm6=qqHH*facel
23683 punpckldq mm5
,mm5 ;
# spread to both halves
23684 punpckldq mm6
,mm6 ;
# spread to both halves
;# qqOO is stored without being spread; qqOH and qqHH hold the same value in both halves
23685 movq
[esp
+ mci3130_qqOO
], mm4
23686 movq
[esp
+ mci3130_qqOH
], mm5
23687 movq
[esp
+ mci3130_qqHH
], mm6
;# look up type[ii] and scale it by ntype to index nbfp
23688 mov edx
, [ebp
+ mci3130_type
]
23689 mov ecx
, [edx
+ ebx
*4]
23692 imul ecx
, [ebp
+ mci3130_ntype
]
23694 mov eax
, [ebp
+ mci3130_nbfp
]
;# NOTE(review): the two loads below index nbfp with edx, which the visible
;# code last set to the type[] base; line 23693 is absent here - confirm.
23695 movd mm0
, [eax
+ edx
*4]
23696 movd mm1
, [eax
+ edx
*4 + 4]
23697 movq
[esp
+ mci3130_c6
], mm0
23698 movq
[esp
+ mci3130_c12
], mm1
;# keep the table scale in a local slot for the inner loop
23699 movd mm5
, [ebp
+ mci3130_tabscale
]
23701 movq
[esp
+ mci3130_tsc
], mm5
23703 mov eax
, [ebp
+ mci3130_shift
] ;
# eax = pointer into shift[]
23704 mov ebx
, [eax
] ;
# ebx=shift[n]
23705 add dword ptr
[ebp
+ mci3130_shift
], 4 ;
# advance pointer one step
23707 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
23708 mov
[esp
+ mci3130_is3
],ebx ;
# store is3
23710 mov eax
, [ebp
+ mci3130_shiftvec
] ;
# eax = base of shiftvec[]
23712 movq mm5
, [eax
+ ebx
*4] ;
# move shX/shY to mm5 and shZ to mm6.
23713 movd mm6
, [eax
+ ebx
*4 + 8]
23717 punpckldq mm0
,mm0 ;
# also expand shX,Y,Z in mm0--mm2.
23721 mov ecx
, [ebp
+ mci3130_iinr
] ;
# ecx = pointer into iinr[]
23722 add dword ptr
[ebp
+ mci3130_iinr
], 4 ;
# advance pointer
23723 mov ebx
, [ecx
] ;
# ebx=ii
23725 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
23726 mov eax
, [ebp
+ mci3130_pos
] ;
# eax = base of pos[]
23728 pfadd mm5
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
23729 movd mm7
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
23730 mov
[esp
+ mci3130_ii3
], ebx ;
# (use mm7 as temp. storage for iz.)
;# the 8-byte store at ixO also fills iyO (offsets 8 and 12); the 8-byte store
;# at izO (offset 16) extends into the first 4 bytes of the ixH slot (offset 20)
23732 movq
[esp
+ mci3130_ixO
], mm5
23733 movq
[esp
+ mci3130_izO
], mm6
;# the three floats at byte offset 12 go to the low halves of mm3-mm5,
;# the three at byte offset 24 to the high halves
23735 movd mm3
, [eax
+ ebx
*4 + 12]
23736 movd mm4
, [eax
+ ebx
*4 + 16]
23737 movd mm5
, [eax
+ ebx
*4 + 20]
23738 punpckldq mm3
, [eax
+ ebx
*4 + 24]
23739 punpckldq mm4
, [eax
+ ebx
*4 + 28]
23740 punpckldq mm5
, [eax
+ ebx
*4 + 32] ;
# coords of H1 in low mm3-mm5, H2 in high
;# NOTE(review): mm0-mm2 are stored as the H coordinates below; the
;# instructions combining them with mm3-mm5 (lines 23741-23744) are not visible.
23745 movq
[esp
+ mci3130_ixH
], mm0
23746 movq
[esp
+ mci3130_iyH
], mm1
23747 movq
[esp
+ mci3130_izH
], mm2
23749 ;
# clear vctot and i forces
;# both accumulators are initialised from mm7 (the zeroing of mm7 is not visible here)
23751 movq
[esp
+ mci3130_vctot
], mm7
23752 movq
[esp
+ mci3130_vnbtot
], mm7
23754 mov eax
, [ebp
+ mci3130_jindex
]
23755 mov ecx
, [eax
] ;
# jindex[n]
23756 mov edx
, [eax
+ 4] ;
# jindex[n+1]
23757 add dword ptr
[ebp
+ mci3130_jindex
], 4
23758 sub edx
, ecx ;
# number of innerloop atoms
23759 mov
[esp
+ mci3130_innerk
], edx ;
# number of innerloop atoms
;# esi = pos[] base for the inner loop; eax = jjnr[] base
23761 mov esi
, [ebp
+ mci3130_pos
]
23762 mov eax
, [ebp
+ mci3130_jjnr
]
23765 mov
[esp
+ mci3130_innerjjnr
], eax ;
# pointer to jjnr[nj0]
;# Inner loop of mcinl3130: one j entry of jjnr per iteration. Each pass forms
;# eax = 3*jnr and then handles, in turn, the j atoms at byte offsets 0, 12
;# (H1) and 24 (H2) from pos + eax*4. For every j atom a tabulated Coulomb
;# potential is evaluated twice: once as a scalar against the i oxygen
;# (ixO/izO slots, one table index) and once as a packed pair against the two
;# i hydrogens (ixH/iyH/izH slots, two table indices). Each result is scaled by
;# the matching qq slot and accumulated into vctot. The first j atom also
;# updates vnbtot using c6/c12.
;# NOTE(review): the embedded line numbers skip throughout this loop, so the
;# distance, reciprocal-square-root and index-conversion arithmetic is only
;# partly visible here - confirm details against the full source.
23766 .mci3130_inner_loop:
23767 ;
# a single j particle iteration here - compare with the unrolled code for comments.
23768 mov eax
, [esp
+ mci3130_innerjjnr
]
23769 mov eax
, [eax
] ;
# eax=jnr offset
23770 add dword ptr
[esp
+ mci3130_innerjjnr
], 4 ;
# advance pointer
;# eax = 3*jnr, so [esi + eax*4] addresses the first float of this j entry
23772 lea eax
, [eax
+ eax
*2]
23774 movq mm0
, [esi
+ eax
*4]
23775 movd mm1
, [esi
+ eax
*4 + 8]
23776 ;
# copy & expand to mm2-mm4 for the H interactions
23784 pfsubr mm0
, [esp
+ mci3130_ixO
]
23785 pfsubr mm1
, [esp
+ mci3130_izO
]
23790 pfadd mm0
, mm1 ;
# mm0=rsqO
23794 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
23795 pfsubr mm2
, [esp
+ mci3130_ixH
]
23796 pfsubr mm3
, [esp
+ mci3130_iyH
]
23797 pfsubr mm4
, [esp
+ mci3130_izH
] ;
# mm2-mm4 is dxH-dzH
23804 pfadd mm3
,mm4 ;
# mm3=rsqH
;# park rsqH on the stack; it is reloaded at "time for hydrogens!" below
23805 movq
[esp
+ mci3130_tmprsqH
], mm3
23812 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
23813 pfmul mm0
, mm1 ;
# mm0=r (rsq*invsqrt)
;# scale by the table scale kept in the tsc slot
23815 pfmul mm0
, [esp
+ mci3130_tsc
]
;# low dword of mm4 goes to the n1 slot and is read back as the table index in ecx
23817 movd
[esp
+ mci3130_n1
], mm4
23819 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
23821 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
23824 mov edx
, [ebp
+ mci3130_VFtab
]
23825 mov ecx
, [esp
+ mci3130_n1
]
23828 ;
# load all values we need
;# four consecutive floats at VFtab + ecx*4 -> mm4..mm7
23829 movd mm4
, [edx
+ ecx
*4]
23830 movd mm5
, [edx
+ ecx
*4 + 4]
23831 movd mm6
, [edx
+ ecx
*4 + 8]
23832 movd mm7
, [edx
+ ecx
*4 + 12]
23834 pfmul mm6
, mm0 ;
# mm6 = Geps
23835 pfmul mm7
, mm2 ;
# mm7 = Heps2
23838 pfadd mm5
, mm7 ;
# mm5 = Fp
23840 pfmul mm5
, mm0 ;
# mm5=eps*Fp
23841 pfadd mm5
, mm4 ;
# mm5= VV
23843 pfmul mm5
, [esp
+ mci3130_qqOO
] ;
# vcoul=qq*VV
23845 ;
# update vctot directly
23846 pfadd mm5
, [esp
+ mci3130_vctot
]
23847 movq
[esp
+ mci3130_vctot
], mm5
;# c6/c12 part for the first j atom; only its tail is visible in this listing
23855 pfmul mm5
,mm5 ;
# mm4=rinvsix, mm5=rinvtwelve
23857 pfmul mm4
, [esp
+ mci3130_c6
]
23858 pfmul mm5
, [esp
+ mci3130_c12
]
23863 pfadd mm6
, [esp
+ mci3130_vnbtot
] ;
# add the earlier value
23864 movq
[esp
+ mci3130_vnbtot
], mm6 ;
# store the sum
23866 ;
# time for hydrogens!
;# packed pass: both i hydrogens against the first j atom
23868 movq mm0
, [esp
+ mci3130_tmprsqH
]
23874 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
23879 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
23881 pfmul mm0
,mm1 ;
# mm0=r
23882 pfmul mm0
, [esp
+ mci3130_tsc
]
;# both halves of mm4 go to the n1 slot: two table indices, at n1 and n1+4
23884 movq
[esp
+ mci3130_n1
], mm4
23886 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
23888 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
23891 mov edx
, [ebp
+ mci3130_VFtab
]
23892 mov ecx
, [esp
+ mci3130_n1
]
23894 ;
# load all values we need
;# table entry for the first index -> low halves of mm4..mm7
23895 movd mm4
, [edx
+ ecx
*4]
23896 movd mm5
, [edx
+ ecx
*4 + 4]
23897 movd mm6
, [edx
+ ecx
*4 + 8]
23898 movd mm7
, [edx
+ ecx
*4 + 12]
;# table entry for the second index -> high halves of mm4..mm7
23899 mov ecx
, [esp
+ mci3130_n1
+ 4]
23901 punpckldq mm4
, [edx
+ ecx
*4]
23902 punpckldq mm5
, [edx
+ ecx
*4 + 4]
23903 punpckldq mm6
, [edx
+ ecx
*4 + 8]
23904 punpckldq mm7
, [edx
+ ecx
*4 + 12]
23906 pfmul mm6
, mm0 ;
# mm6 = Geps
23907 pfmul mm7
, mm2 ;
# mm7 = Heps2
23910 pfadd mm5
, mm7 ;
# mm5 = Fp
23912 pfmul mm5
, mm0 ;
# mm5=eps*Fp
23913 pfadd mm5
, mm4 ;
# mm5= VV
23915 pfmul mm5
, [esp
+ mci3130_qqOH
] ;
# vcoul=qq*VV
23917 pfadd mm5
, [esp
+ mci3130_vctot
]
23918 movq
[esp
+ mci3130_vctot
], mm5
23920 ;
# interactions with j H1
;# second j atom: x,y at byte offset 12, z at byte offset 20
23922 movq mm0
, [esi
+ eax
*4 + 12]
23923 movd mm1
, [esi
+ eax
*4 + 20]
23924 ;
# copy & expand to mm2-mm4 for the H interactions
23932 pfsubr mm0
, [esp
+ mci3130_ixO
]
23933 pfsubr mm1
, [esp
+ mci3130_izO
]
23938 pfadd mm0
, mm1 ;
# mm0=rsqO
23942 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
23943 pfsubr mm2
, [esp
+ mci3130_ixH
]
23944 pfsubr mm3
, [esp
+ mci3130_iyH
]
23945 pfsubr mm4
, [esp
+ mci3130_izH
] ;
# mm2-mm4 is dxH-dzH
23952 pfadd mm3
,mm4 ;
# mm3=rsqH
23953 movq
[esp
+ mci3130_tmprsqH
], mm3
23960 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
23961 pfmul mm0
, mm1 ;
# mm0=r (rsq*invsqrt)
23963 pfmul mm0
, [esp
+ mci3130_tsc
]
23965 movd
[esp
+ mci3130_n1
], mm4
23967 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
23969 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
23972 mov edx
, [ebp
+ mci3130_VFtab
]
23973 mov ecx
, [esp
+ mci3130_n1
]
23976 ;
# load all values we need
23977 movd mm4
, [edx
+ ecx
*4]
23978 movd mm5
, [edx
+ ecx
*4 + 4]
23979 movd mm6
, [edx
+ ecx
*4 + 8]
23980 movd mm7
, [edx
+ ecx
*4 + 12]
23982 pfmul mm6
, mm0 ;
# mm6 = Geps
23983 pfmul mm7
, mm2 ;
# mm7 = Heps2
23986 pfadd mm5
, mm7 ;
# mm5 = Fp
23988 pfmul mm5
, mm0 ;
# mm5=eps*Fp
23989 pfadd mm5
, mm4 ;
# mm5= VV
;# scalar result (i oxygen against j H1) is scaled by qqOH
23991 pfmul mm5
, [esp
+ mci3130_qqOH
] ;
# vcoul=qq*VV
23993 ;
# update vctot directly, force is moved to mm3
23994 pfadd mm5
, [esp
+ mci3130_vctot
]
23995 movq
[esp
+ mci3130_vctot
], mm5
;# packed pass: both i hydrogens against j H1
23997 movq mm0
, [esp
+ mci3130_tmprsqH
]
24003 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
24008 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
24010 pfmul mm0
,mm1 ;
# mm0=r
24011 pfmul mm0
, [esp
+ mci3130_tsc
]
24013 movq
[esp
+ mci3130_n1
], mm4
24015 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
24017 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
24020 mov edx
, [ebp
+ mci3130_VFtab
]
24021 mov ecx
, [esp
+ mci3130_n1
]
24023 ;
# load all values we need
24024 movd mm4
, [edx
+ ecx
*4]
24025 movd mm5
, [edx
+ ecx
*4 + 4]
24026 movd mm6
, [edx
+ ecx
*4 + 8]
24027 movd mm7
, [edx
+ ecx
*4 + 12]
24028 mov ecx
, [esp
+ mci3130_n1
+ 4]
24030 punpckldq mm4
, [edx
+ ecx
*4]
24031 punpckldq mm5
, [edx
+ ecx
*4 + 4]
24032 punpckldq mm6
, [edx
+ ecx
*4 + 8]
24033 punpckldq mm7
, [edx
+ ecx
*4 + 12]
24036 pfmul mm6
, mm0 ;
# mm6 = Geps
24037 pfmul mm7
, mm2 ;
# mm7 = Heps2
24040 pfadd mm5
, mm7 ;
# mm5 = Fp
24042 pfmul mm5
, mm0 ;
# mm5=eps*Fp
24043 pfadd mm5
, mm4 ;
# mm5= VV
24045 pfmul mm5
, [esp
+ mci3130_qqHH
] ;
# vcoul=qq*VV
24047 pfadd mm5
, [esp
+ mci3130_vctot
]
24048 movq
[esp
+ mci3130_vctot
], mm5
24050 ;
# interactions with j H2
;# third j atom: x,y at byte offset 24, z at byte offset 32
24051 movq mm0
, [esi
+ eax
*4 + 24]
24052 movd mm1
, [esi
+ eax
*4 + 32]
24053 ;
# copy & expand to mm2-mm4 for the H interactions
24061 pfsubr mm0
, [esp
+ mci3130_ixO
]
24062 pfsubr mm1
, [esp
+ mci3130_izO
]
24067 pfadd mm0
, mm1 ;
# mm0=rsqO
24071 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
24072 pfsubr mm2
, [esp
+ mci3130_ixH
]
24073 pfsubr mm3
, [esp
+ mci3130_iyH
]
24074 pfsubr mm4
, [esp
+ mci3130_izH
] ;
# mm2-mm4 is dxH-dzH
24081 pfadd mm3
,mm4 ;
# mm3=rsqH
24082 movq
[esp
+ mci3130_tmprsqH
], mm3
24089 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
;# NOTE(review): unlike the two earlier passes, no "pfmul mm0, mm1" is visible
;# here before the scale by tsc (line 24090 is absent from this listing).
24092 pfmul mm0
, [esp
+ mci3130_tsc
]
24094 movd
[esp
+ mci3130_n1
], mm4
24096 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
24098 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
24101 mov edx
, [ebp
+ mci3130_VFtab
]
24102 mov ecx
, [esp
+ mci3130_n1
]
24105 ;
# load all values we need
24106 movd mm4
, [edx
+ ecx
*4]
24107 movd mm5
, [edx
+ ecx
*4 + 4]
24108 movd mm6
, [edx
+ ecx
*4 + 8]
24109 movd mm7
, [edx
+ ecx
*4 + 12]
24111 pfmul mm6
, mm0 ;
# mm6 = Geps
24112 pfmul mm7
, mm2 ;
# mm7 = Heps2
24115 pfadd mm5
, mm7 ;
# mm5 = Fp
24117 pfmul mm5
, mm0 ;
# mm5=eps*Fp
24118 pfadd mm5
, mm4 ;
# mm5= VV
;# scalar result (i oxygen against j H2) is scaled by qqOH
24120 pfmul mm5
, [esp
+ mci3130_qqOH
] ;
# vcoul=qq*VV
24122 ;
# update vctot directly
24123 pfadd mm5
, [esp
+ mci3130_vctot
]
24124 movq
[esp
+ mci3130_vctot
], mm5
;# packed pass: both i hydrogens against j H2
24126 movq mm0
, [esp
+ mci3130_tmprsqH
]
24132 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
24137 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
24139 pfmul mm0
,mm1 ;
# mm0=r
24140 pfmul mm0
, [esp
+ mci3130_tsc
]
24142 movq
[esp
+ mci3130_n1
], mm4
24144 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
24146 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
24149 mov edx
, [ebp
+ mci3130_VFtab
]
24150 mov ecx
, [esp
+ mci3130_n1
]
24152 ;
# load all values we need
24153 movd mm4
, [edx
+ ecx
*4]
24154 movd mm5
, [edx
+ ecx
*4 + 4]
24155 movd mm6
, [edx
+ ecx
*4 + 8]
24156 movd mm7
, [edx
+ ecx
*4 + 12]
24157 mov ecx
, [esp
+ mci3130_n1
+ 4]
24159 punpckldq mm4
, [edx
+ ecx
*4]
24160 punpckldq mm5
, [edx
+ ecx
*4 + 4]
24161 punpckldq mm6
, [edx
+ ecx
*4 + 8]
24162 punpckldq mm7
, [edx
+ ecx
*4 + 12]
24165 pfmul mm6
, mm0 ;
# mm6 = Geps
24166 pfmul mm7
, mm2 ;
# mm7 = Heps2
24169 pfadd mm5
, mm7 ;
# mm5 = Fp
24171 pfmul mm5
, mm0 ;
# mm5=eps*Fp
24172 pfadd mm5
, mm4 ;
# mm5= VV
24174 pfmul mm5
, [esp
+ mci3130_qqHH
] ;
# vcoul=qq*VV
24176 pfadd mm5
, [esp
+ mci3130_vctot
]
24177 movq
[esp
+ mci3130_vctot
], mm5
24179 ;
# done - one more?
;# count down innerk: leave the loop when it reaches zero, otherwise repeat
24180 dec dword ptr
[esp
+ mci3130_innerk
]
24181 jz
.mci3130_updateouterdata
24182 jmp
.mci3130_inner_loop
;# Outer-loop bookkeeping for one i water of mcinl3130: advance the gid pointer
;# argument, fold the two packed halves of vctot and vnbtot with pfacc, access
;# Vc[] and Vnb[] at index edx, and decrement the remaining-i counter nri.
;# NOTE(review): in this listing nothing adds mm7 into mm6 between each load
;# and the matching store (lines 24193 and 24201 are absent), edx is used as an
;# index straight after being loaded from the gid argument (24185 is absent),
;# and nothing follows the nri decrement - confirm against the full source.
24183 .mci3130_updateouterdata:
24184 mov edx
, [ebp
+ mci3130_gid
] ;
# get group index for this i particle
24186 add dword ptr
[ebp
+ mci3130_gid
], 4 ;
# advance pointer
24188 movq mm7
, [esp
+ mci3130_vctot
]
24189 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
24191 mov eax
, [ebp
+ mci3130_Vc
]
24192 movd mm6
, [eax
+ edx
*4]
24194 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
24196 movq mm7
, [esp
+ mci3130_vnbtot
]
24197 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
24199 mov eax
, [ebp
+ mci3130_Vnb
]
24200 movd mm6
, [eax
+ edx
*4]
24202 movd
[eax
+ edx
*4], mm6 ;
# increment vnbtot[gid]
;# one i particle done: count it off
24204 dec dword ptr
[ebp
+ mci3130_nri
]
24206 ;
# not last, iterate once more!
;# mcinl3300_3dnow: symbol exports and offset tables.
;# The routine is exported both with and without a leading underscore.
24221 .globl mcinl3300_3dnow
24222 .globl _mcinl3300_3dnow
;# Byte offsets of the incoming arguments; the code addresses them as
;# [ebp + name]. They are 4 bytes apart, starting at 8.
24225 .equiv mci3300_nri, 8
24226 .equiv mci3300_iinr, 12
24227 .equiv mci3300_jindex, 16
24228 .equiv mci3300_jjnr, 20
24229 .equiv mci3300_shift, 24
24230 .equiv mci3300_shiftvec, 28
24231 .equiv mci3300_gid, 32
24232 .equiv mci3300_pos, 36
24233 .equiv mci3300_charge, 40
24234 .equiv mci3300_facel, 44
24235 .equiv mci3300_Vc, 48
24236 .equiv mci3300_type, 52
24237 .equiv mci3300_ntype, 56
24238 .equiv mci3300_nbfp, 60
24239 .equiv mci3300_Vnb, 64
24240 .equiv mci3300_tabscale, 68
24241 .equiv mci3300_VFtab, 72
24242 ;
# stack offsets for local variables
;# Addressed as [esp + name]. Slots from iq through tsc are spaced 8 bytes
;# apart (they are written with 64-bit movq); the rest are 4-byte slots.
24243 .equiv mci3300_is3, 0
24244 .equiv mci3300_ii3, 4
24245 .equiv mci3300_ix, 8
24246 .equiv mci3300_iy, 12
24247 .equiv mci3300_iz, 16
24248 .equiv mci3300_iq, 20
24249 .equiv mci3300_vctot, 28
24250 .equiv mci3300_vnbtot, 36
24251 .equiv mci3300_c6, 44
24252 .equiv mci3300_c12, 52
24253 .equiv mci3300_n1, 60
24254 .equiv mci3300_tsc, 68
24255 .equiv mci3300_ntia, 76
24256 .equiv mci3300_innerjjnr, 80
24257 .equiv mci3300_innerk, 84
;# Per-i-particle setup for mcinl3300: reserve the local frame, then fill the
;# esp-relative slots used by the inner loops - table scale (tsc), is3, iq
;# (charge[ii] times facel, in both halves), ntia (type[ii] times ntype), ii3,
;# the i coordinates, the vctot/vnbtot accumulators, and the j-list pointer
;# and count - and branch to the paired or the finishing path.
;# NOTE(review): the embedded line numbers skip in several places in this
;# section (e.g. 24269 -> 24271, 24298 -> 24300, 24312 -> 24314,
;# 24327 -> 24329), so part of the arithmetic is not visible here - confirm
;# against the full source.
24266 sub esp
, 88 ;
# local stack space (last slot mci3300_innerk at 84, 4 bytes)
24268 ;
# move data to local stack
24269 movd mm3
, [ebp
+ mci3300_tabscale
]
24271 movq
[esp
+ mci3300_tsc
], mm3
24272 ;
# assume we have at least one i particle - start directly
24274 mov eax
, [ebp
+ mci3300_shift
] ;
# eax = pointer into shift[]
24275 mov ebx
, [eax
] ;
# ebx=shift[n]
24276 add dword ptr
[ebp
+ mci3300_shift
], 4 ;
# advance pointer one step
24278 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
24279 mov
[esp
+ mci3300_is3
],ebx ;
# store is3
24281 mov eax
, [ebp
+ mci3300_shiftvec
] ;
# eax = base of shiftvec[]
24283 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
24284 movd mm1
, [eax
+ ebx
*4 + 8]
24286 mov ecx
, [ebp
+ mci3300_iinr
] ;
# ecx = pointer into iinr[]
24287 add dword ptr
[ebp
+ mci3300_iinr
], 4 ;
# advance pointer
24288 mov ebx
, [ecx
] ;
# ebx=ii
24290 mov edx
, [ebp
+ mci3300_charge
]
24291 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
24292 pfmul mm2
, [ebp
+ mci3300_facel
]
24293 punpckldq mm2
,mm2 ;
# spread to both halves
24294 movq
[esp
+ mci3300_iq
], mm2 ;
# iq =facel*charge[ii]
;# edx = type[ii] * ntype, kept in the ntia slot for the nbfp lookups
24296 mov edx
, [ebp
+ mci3300_type
]
24297 mov edx
, [edx
+ ebx
*4]
24298 imul edx
, [ebp
+ mci3300_ntype
]
24300 mov
[esp
+ mci3300_ntia
], edx
24302 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
24303 mov eax
, [ebp
+ mci3300_pos
] ;
# eax = base of pos[]
24305 pfadd mm0
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
24306 movd mm3
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
24307 mov
[esp
+ mci3300_ii3
], ebx
;# the 8-byte store at ix also fills iy (offsets 8 and 12); iz is a 4-byte store
;# NOTE(review): mm1 is stored as iz, but the instruction adding the z position
;# loaded into mm3 above (line 24308) is not visible in this listing.
24309 movq
[esp
+ mci3300_ix
], mm0
24310 movd
[esp
+ mci3300_iz
], mm1
24312 ;
# clear total potential and i forces
;# both accumulators are initialised from mm7 (the zeroing of mm7 is not visible here)
24314 movq
[esp
+ mci3300_vctot
], mm7
24315 movq
[esp
+ mci3300_vnbtot
], mm7
24317 mov eax
, [ebp
+ mci3300_jindex
]
24318 mov ecx
, [eax
] ;
# jindex[n]
24319 mov edx
, [eax
+ 4] ;
# jindex[n+1]
24320 add dword ptr
[ebp
+ mci3300_jindex
], 4
24321 sub edx
, ecx ;
# number of innerloop atoms
24323 mov esi
, [ebp
+ mci3300_pos
]
24324 mov eax
, [ebp
+ mci3300_jjnr
]
24327 mov
[esp
+ mci3300_innerjjnr
], eax ;
# pointer to jjnr[nj0]
24329 mov
[esp
+ mci3300_innerk
], edx ;
# number of innerloop atoms
;# NOTE(review): jge tests the flags, but the two mov stores above do not set
;# them; the flag-setting instruction (line 24328) is absent from this listing.
24330 jge
.mci3300_unroll_loop
24331 jmp
.mci3300_finish_inner
;# Paired inner loop of mcinl3300: two j particles per iteration, one in each
;# half of the MMX registers. Per pass it advances the jjnr pointer by 8 bytes,
;# builds qq for both pairs from iq and charge[], gathers c6/c12 from nbfp,
;# and then evaluates three table segments per pair from VFtab: the Coulomb
;# entry at byte offset 0 (scaled by qq, added to vctot), the entry at offset
;# 16 (scaled by c6) and the entry at offset 32 (scaled by c12), the last two
;# being added to vnbtot. The table index read from the n1 slot is multiplied
;# by 3 (lea) before being scaled by 4 in the address.
;# NOTE(review): the embedded line numbers skip throughout this loop (e.g.
;# 24334 -> 24336, 24348 -> 24350, 24390 -> 24395, 24396 -> 24402), so the jnr
;# load into eax, the doubling of the type indices and most of the
;# reciprocal-square-root arithmetic are not visible - confirm against the
;# full source.
24332 .mci3300_unroll_loop:
24333 ;
# paired innerloop starts here
24334 mov ecx
, [esp
+ mci3300_innerjjnr
] ;
# pointer to jjnr[k]
24336 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
24337 add dword ptr
[esp
+ mci3300_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
24338 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
24340 mov ecx
, [ebp
+ mci3300_charge
] ;
# base of charge[]
24341 movq mm5
, [esp
+ mci3300_iq
]
24342 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
24343 punpckldq mm3
, [ecx
+ ebx
*4] ;
# move charge 2 to high part of mm3
24344 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
24346 mov ecx
, [ebp
+ mci3300_type
]
24347 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
24348 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
24350 mov esi
, [ebp
+ mci3300_nbfp
] ;
# base of nbfp
24353 add edx
, [esp
+ mci3300_ntia
] ;
# tja = ntia + 2*type
24354 add ecx
, [esp
+ mci3300_ntia
]
24356 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
24357 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
;# NOTE(review): mm6 is the destination of punpckhdq below, but no visible
;# instruction copies mm5 into it first (line 24358 is absent from this listing).
24359 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
24360 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
24361 movq
[esp
+ mci3300_c6
], mm5
24362 movq
[esp
+ mci3300_c12
], mm6
24364 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
24365 lea ebx
, [ebx
+ ebx
*2]
;# esi was reused for nbfp above; point it back at pos[]
24367 mov esi
, [ebp
+ mci3300_pos
]
24369 movq mm0
, [esp
+ mci3300_ix
]
24370 movd mm1
, [esp
+ mci3300_iz
]
24371 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
24372 movd mm5
, [esi
+ eax
*4 + 8]
24373 pfsubr mm4
,mm0 ;
# dr = ir - jr
24375 pfmul mm4
,mm4 ;
# square dx,dy,dz
24377 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
24378 pfacc mm4
, mm5 ;
# first rsq in lower mm4
24380 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
24381 movd mm7
, [esi
+ ebx
*4 + 8]
24383 pfsubr mm6
,mm0 ;
# dr = ir - jr
24385 pfmul mm6
,mm6 ;
# square dx,dy,dz
24387 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
24388 pfacc mm6
, mm7 ;
# second rsq in lower mm6
24390 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
24395 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
24396 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision.
24402 ;
# mm0 is invsqrt, and mm1 r.
24403 ;
# do potential and fscal
24404 pfmul mm1
, [esp
+ mci3300_tsc
] ;
# mm1=rt
;# both halves of mm4 go to the n1 slot: two table indices, at n1 and n1+4
24406 movq
[esp
+ mci3300_n1
], mm4
24408 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
24411 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
24413 mov edx
, [ebp
+ mci3300_VFtab
]
24414 mov ecx
, [esp
+ mci3300_n1
]
;# ecx = 3 * first index; with the *4 address scale that is 12 floats per table point
24415 lea ecx
, [ecx
+ ecx
*2]
24417 ;
# load all the table values we need
;# Coulomb segment (byte offsets 0..12), first pair -> low halves
24418 movd mm4
, [edx
+ ecx
*4]
24419 movd mm5
, [edx
+ ecx
*4 + 4]
24420 movd mm6
, [edx
+ ecx
*4 + 8]
24421 movd mm7
, [edx
+ ecx
*4 + 12]
;# same segment for the second index -> high halves
24422 mov ecx
, [esp
+ mci3300_n1
+ 4]
24423 lea ecx
, [ecx
+ ecx
*2]
24425 punpckldq mm4
, [edx
+ ecx
*4]
24426 punpckldq mm5
, [edx
+ ecx
*4 + 4]
24427 punpckldq mm6
, [edx
+ ecx
*4 + 8]
24428 punpckldq mm7
, [edx
+ ecx
*4 + 12]
24430 pfmul mm6
, mm1 ;
# mm6 = Geps
24431 pfmul mm7
, mm2 ;
# mm7 = Heps2
24434 pfadd mm5
, mm7 ;
# mm5 = Fp
24436 pfmul mm5
, mm1 ;
# mm5=eps*Fp
24437 pfadd mm5
, mm4 ;
# mm5= VV
24439 pfmul mm5
, mm3 ;
# vcoul=qq*VV
24441 ;
# at this point mm5 contains vcoul
24442 ;
# increment vcoul - then we can get rid of mm5
24444 pfadd mm5
, [esp
+ mci3300_vctot
] ;
# add the earlier value
24445 movq
[esp
+ mci3300_vctot
], mm5 ;
# store the sum
24447 ;
# dispersion table
;# second segment (byte offsets 16..28), indices re-read from the n1 slot
24448 mov ecx
, [esp
+ mci3300_n1
]
24449 lea ecx
, [ecx
+ ecx
*2]
24451 ;
# load all the table values we need
24452 movd mm4
, [edx
+ ecx
*4 + 16]
24453 movd mm5
, [edx
+ ecx
*4 + 20]
24454 movd mm6
, [edx
+ ecx
*4 + 24]
24455 movd mm7
, [edx
+ ecx
*4 + 28]
24456 mov ecx
, [esp
+ mci3300_n1
+ 4]
24457 lea ecx
, [ecx
+ ecx
*2]
24459 punpckldq mm4
, [edx
+ ecx
*4 + 16]
24460 punpckldq mm5
, [edx
+ ecx
*4 + 20]
24461 punpckldq mm6
, [edx
+ ecx
*4 + 24]
24462 punpckldq mm7
, [edx
+ ecx
*4 + 28]
24463 pfmul mm6
, mm1 ;
# mm6 = Geps
24464 pfmul mm7
, mm2 ;
# mm7 = Heps2
24466 pfadd mm5
, mm7 ;
# mm5 = Fp
24467 pfmul mm5
, mm1 ;
# mm5=eps*Fp
24468 pfadd mm5
, mm4 ;
# mm5= VV
24470 movq mm4
, [esp
+ mci3300_c6
]
24471 pfmul mm5
, mm4 ;
# vnb6
24472 ;
# update vnbtot to release mm5!
24473 pfadd mm5
, [esp
+ mci3300_vnbtot
] ;
# add the earlier value
24474 movq
[esp
+ mci3300_vnbtot
], mm5 ;
# store the sum
;# third segment (byte offsets 32..44), scaled by c12 below
24477 mov ecx
, [esp
+ mci3300_n1
]
24478 lea ecx
, [ecx
+ ecx
*2]
24480 ;
# load all the table values we need
24481 movd mm4
, [edx
+ ecx
*4 + 32]
24482 movd mm5
, [edx
+ ecx
*4 + 36]
24483 movd mm6
, [edx
+ ecx
*4 + 40]
24484 movd mm7
, [edx
+ ecx
*4 + 44]
24485 mov ecx
, [esp
+ mci3300_n1
+ 4]
24486 lea ecx
, [ecx
+ ecx
*2]
24488 punpckldq mm4
, [edx
+ ecx
*4 + 32]
24489 punpckldq mm5
, [edx
+ ecx
*4 + 36]
24490 punpckldq mm6
, [edx
+ ecx
*4 + 40]
24491 punpckldq mm7
, [edx
+ ecx
*4 + 44]
24493 pfmul mm6
, mm1 ;
# mm6 = Geps
24494 pfmul mm7
, mm2 ;
# mm7 = Heps2
24496 pfadd mm5
, mm7 ;
# mm5 = Fp
24497 pfmul mm5
, mm1 ;
# mm5=eps*Fp
24498 pfadd mm5
, mm4 ;
# mm5= VV
24500 movq mm6
, [esp
+ mci3300_c12
]
24501 pfmul mm5
, mm6 ;
# vnb12
24503 pfadd mm5
, [esp
+ mci3300_vnbtot
] ;
# add the earlier value
24504 movq
[esp
+ mci3300_vnbtot
], mm5 ;
# store the sum
24506 ;
# should we do one more iteration?
;# two j particles consumed: leave when the count goes negative, otherwise repeat
24507 sub dword ptr
[esp
+ mci3300_innerk
], 2
24508 jl
.mci3300_finish_inner
24509 jmp
.mci3300_unroll_loop
;# After the paired loop: keep only bit 0 of innerk. If it is set, one j
;# particle is left over and goes through .mci3300_single_inner; otherwise
;# skip straight to .mci3300_updateouterdata.
24510 .mci3300_finish_inner:
24511 and dword ptr
[esp
+ mci3300_innerk
], 1
24512 jnz
.mci3300_single_inner
24513 jmp
.mci3300_updateouterdata
;# Single-j-particle tail of mcinl3300: the scalar version of the paired loop
;# for one leftover jjnr entry. It builds qq from iq and charge[jnr], reads
;# c6/c12 from nbfp, and evaluates the three table segments (byte offsets 0,
;# 16 and 32 from VFtab + 3*index*4) into vctot and vnbtot, then falls
;# through to .mci3300_updateouterdata. Here ecx is computed once and reused
;# for all three segments.
;# NOTE(review): the embedded line numbers skip in this section (e.g.
;# 24526 -> 24528, 24540 -> 24546, 24546 -> 24552), so the distance and
;# reciprocal-square-root arithmetic is only partly visible - confirm against
;# the full source.
24514 .mci3300_single_inner:
24515 ;
# a single j particle iteration here - compare with the unrolled code for comments.
24516 mov eax
, [esp
+ mci3300_innerjjnr
]
24517 mov eax
, [eax
] ;
# eax=jnr offset
24519 mov ecx
, [ebp
+ mci3300_charge
]
24520 movd mm5
, [esp
+ mci3300_iq
]
24521 movd mm3
, [ecx
+ eax
*4]
24522 pfmul mm3
, mm5 ;
# mm3=qq
24524 mov esi
, [ebp
+ mci3300_nbfp
]
24525 mov ecx
, [ebp
+ mci3300_type
]
24526 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
24528 add edx
, [esp
+ mci3300_ntia
] ;
# tja = ntia + 2*type
;# c6 and c12 are loaded with movd and stored with movq into their 8-byte slots
24529 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
24530 movq
[esp
+ mci3300_c6
], mm5
24531 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
24532 movq
[esp
+ mci3300_c12
], mm5
24534 mov esi
, [ebp
+ mci3300_pos
]
;# eax = 3*jnr, index of the first coordinate of this j particle
24535 lea eax
, [eax
+ eax
*2]
24537 movq mm0
, [esp
+ mci3300_ix
]
24538 movd mm1
, [esp
+ mci3300_iz
]
24539 movq mm4
, [esi
+ eax
*4]
24540 movd mm5
, [esi
+ eax
*4 + 8]
24546 pfacc mm4
, mm5 ;
# mm0=rsq
24552 pfrcpit2 mm0
,mm2 ;
# mm1=invsqrt
24555 ;
# mm0 is invsqrt, and mm1 r.
24557 ;
# calculate potentials and scalar force
24558 pfmul mm1
, [esp
+ mci3300_tsc
] ;
# mm1=rt
;# low dword of mm4 goes to the n1 slot and is read back as the table index in ecx
24560 movd
[esp
+ mci3300_n1
], mm4
24562 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
24565 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
24568 mov edx
, [ebp
+ mci3300_VFtab
]
24569 mov ecx
, [esp
+ mci3300_n1
]
;# ecx = 3 * index; ecx is not modified again before the last table load
24570 lea ecx
, [ecx
+ ecx
*2]
24572 ;
# load all the table values we need
;# Coulomb segment (byte offsets 0..12)
24573 movd mm4
, [edx
+ ecx
*4]
24574 movd mm5
, [edx
+ ecx
*4 + 4]
24575 movd mm6
, [edx
+ ecx
*4 + 8]
24576 movd mm7
, [edx
+ ecx
*4 + 12]
24578 pfmul mm6
, mm1 ;
# mm6 = Geps
24579 pfmul mm7
, mm2 ;
# mm7 = Heps2
24582 pfadd mm5
, mm7 ;
# mm5 = Fp
24584 pfmul mm5
, mm1 ;
# mm5=eps*Fp
24585 pfadd mm5
, mm4 ;
# mm5= VV
24587 pfmul mm5
, mm3 ;
# vcoul=qq*VV
24589 ;
# at this point mm5 contains vcoul
24590 ;
# increment vcoul - then we can get rid of mm5
24592 pfadd mm5
, [esp
+ mci3300_vctot
] ;
# add the earlier value
24593 movq
[esp
+ mci3300_vctot
], mm5 ;
# store the sum
24595 ;
# dispersion table
24596 ;
# load all the table values we need
;# second segment (byte offsets 16..28), scaled by c6 below
24597 movd mm4
, [edx
+ ecx
*4 + 16]
24598 movd mm5
, [edx
+ ecx
*4 + 20]
24599 movd mm6
, [edx
+ ecx
*4 + 24]
24600 movd mm7
, [edx
+ ecx
*4 + 28]
24601 pfmul mm6
, mm1 ;
# mm6 = Geps
24602 pfmul mm7
, mm2 ;
# mm7 = Heps2
24604 pfadd mm5
, mm7 ;
# mm5 = Fp
24605 pfmul mm5
, mm1 ;
# mm5=eps*Fp
24606 pfadd mm5
, mm4 ;
# mm5= VV
24608 movq mm4
, [esp
+ mci3300_c6
]
24609 pfmul mm5
, mm4 ;
# vnb6
24611 ;
# update vnbtot to release mm5!
24612 pfadd mm5
, [esp
+ mci3300_vnbtot
] ;
# add the earlier value
24613 movq
[esp
+ mci3300_vnbtot
], mm5 ;
# store the sum
24616 ;
# load all the table values we need
;# third segment (byte offsets 32..44), scaled by c12 below
24617 movd mm4
, [edx
+ ecx
*4 + 32]
24618 movd mm5
, [edx
+ ecx
*4 + 36]
24619 movd mm6
, [edx
+ ecx
*4 + 40]
24620 movd mm7
, [edx
+ ecx
*4 + 44]
24622 pfmul mm6
, mm1 ;
# mm6 = Geps
24623 pfmul mm7
, mm2 ;
# mm7 = Heps2
24625 pfadd mm5
, mm7 ;
# mm5 = Fp
24626 pfmul mm5
, mm1 ;
# mm5=eps*Fp
24627 pfadd mm5
, mm4 ;
# mm5= VV
24629 movq mm6
, [esp
+ mci3300_c12
]
24630 pfmul mm5
, mm6 ;
# vnb12
24632 pfadd mm5
, [esp
+ mci3300_vnbtot
] ;
# add the earlier value
24633 movq
[esp
+ mci3300_vnbtot
], mm5 ;
# store the sum
;# Outer-loop bookkeeping for one i particle of mcinl3300: advance the gid
;# pointer argument, fold the two packed halves of vctot and vnbtot with pfacc,
;# access Vc[] and Vnb[] at index edx, then read nri into ecx and write it back.
;# NOTE(review): in this listing nothing adds mm7 into mm6 between each load
;# and the matching store (lines 24645 and 24653 are absent), edx is used as an
;# index straight after being loaded from the gid argument (24637 is absent),
;# and nri is written back unchanged (24658-24659 are absent) - confirm
;# against the full source.
24635 .mci3300_updateouterdata:
24636 mov edx
, [ebp
+ mci3300_gid
] ;
# get group index for this i particle
24638 add dword ptr
[ebp
+ mci3300_gid
], 4 ;
# advance pointer
24640 movq mm7
, [esp
+ mci3300_vctot
]
24641 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
24643 mov eax
, [ebp
+ mci3300_Vc
]
24644 movd mm6
, [eax
+ edx
*4]
24646 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
24648 movq mm7
, [esp
+ mci3300_vnbtot
]
24649 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
24651 mov eax
, [ebp
+ mci3300_Vnb
]
24652 movd mm6
, [eax
+ edx
*4]
24654 movd
[eax
+ edx
*4], mm6 ;
# increment vnb[gid]
;# remaining-i counter: loaded into ecx here and stored back below
24657 mov ecx
, [ebp
+ mci3300_nri
]
24660 ;
# not last, iterate once more!
24661 mov
[ebp
+ mci3300_nri
], ecx
;# mcinl3310_3dnow: symbol exports and offset tables.
;# The routine is exported both with and without a leading underscore.
24679 .globl mcinl3310_3dnow
24680 .globl _mcinl3310_3dnow
;# Byte offsets of the incoming arguments; the code addresses them as
;# [ebp + name]. They are 4 bytes apart, starting at 8. Compared with
;# mcinl3300 there is one extra argument, nsatoms, at offset 76.
24683 .equiv mci3310_nri, 8
24684 .equiv mci3310_iinr, 12
24685 .equiv mci3310_jindex, 16
24686 .equiv mci3310_jjnr, 20
24687 .equiv mci3310_shift, 24
24688 .equiv mci3310_shiftvec, 28
24689 .equiv mci3310_gid, 32
24690 .equiv mci3310_pos, 36
24691 .equiv mci3310_charge, 40
24692 .equiv mci3310_facel, 44
24693 .equiv mci3310_Vc, 48
24694 .equiv mci3310_type, 52
24695 .equiv mci3310_ntype, 56
24696 .equiv mci3310_nbfp, 60
24697 .equiv mci3310_Vnb, 64
24698 .equiv mci3310_tabscale, 68
24699 .equiv mci3310_VFtab, 72
24700 .equiv mci3310_nsatoms, 76
24701 ;
# stack offsets for local variables
;# Addressed as [esp + name]. Slots from iq through tsc are spaced 8 bytes
;# apart (they are written with 64-bit movq); the rest are 4-byte slots.
;# innerjjnr0/innerk0 hold the start values that are copied into
;# innerjjnr/innerk before each inner loop.
24702 .equiv mci3310_is3, 0
24703 .equiv mci3310_ii3, 4
24704 .equiv mci3310_shX, 8
24705 .equiv mci3310_shY, 12
24706 .equiv mci3310_shZ, 16
24707 .equiv mci3310_ix, 20
24708 .equiv mci3310_iy, 24
24709 .equiv mci3310_iz, 28
24710 .equiv mci3310_iq, 32
24711 .equiv mci3310_vctot, 40
24712 .equiv mci3310_vnbtot, 48
24713 .equiv mci3310_c6, 56
24714 .equiv mci3310_c12, 64
24715 .equiv mci3310_n1, 72
24716 .equiv mci3310_tsc, 80
24717 .equiv mci3310_ntia, 88
24718 .equiv mci3310_innerjjnr0, 92
24719 .equiv mci3310_innerk0, 96
24720 .equiv mci3310_innerjjnr, 100
24721 .equiv mci3310_innerk, 104
24722 .equiv mci3310_nsvdwc, 108
24723 .equiv mci3310_nscoul, 112
24724 .equiv mci3310_nsvdw, 116
24725 .equiv mci3310_solnr, 120
;# Setup for one i molecule of mcinl3310, followed by the per-atom preamble of
;# the vdw+coulomb path. The first part reserves the local frame and stores
;# the table scale, the shift vector (shX/shY/shZ), the three atom counts
;# (nsvdwc/nscoul/nsvdw), the vctot/vnbtot accumulators, the first atom index
;# (solnr) and the j-list start pointer and count (innerjjnr0/innerk0). The
;# second part (from line 24795) takes the atom index from solnr and
;# increments it, computes iq, ntia and the shifted i coordinates for that
;# atom, and copies innerjjnr0/innerk0 into the working innerjjnr/innerk
;# slots before entering the paired loop.
;# NOTE(review): the embedded line numbers skip in several places in this
;# section (e.g. 24760 -> 24767, 24769 -> 24773, 24790 -> 24792,
;# 24793 -> 24795, 24822 -> 24824), so the loads of the three counts, the flag
;# tests before the branches, and the label targeted by
;# "jnz .mci3310_mno_vdwc" are not visible here - confirm against the full source.
24734 sub esp
, 124 ;
# local stack space (last slot mci3310_solnr at 120, 4 bytes)
24736 movd mm3
, [ebp
+ mci3310_tabscale
]
24738 movq
[esp
+ mci3310_tsc
], mm3
24739 ;
# assume we have at least one i particle - start directly
24741 mov eax
, [ebp
+ mci3310_shift
] ;
# eax = pointer into shift[]
24742 mov ebx
, [eax
] ;
# ebx=shift[n]
24743 add dword ptr
[ebp
+ mci3310_shift
], 4 ;
# advance pointer one step
24745 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
24746 mov
[esp
+ mci3310_is3
],ebx ;
# store is3
24748 mov eax
, [ebp
+ mci3310_shiftvec
] ;
# eax = base of shiftvec[]
24750 movq mm0
, [eax
+ ebx
*4] ;
# move shX/shY to mm0 and shZ to mm1
24751 movd mm1
, [eax
+ ebx
*4 + 8]
;# keep the shift vector on the stack; it is added to each atom's position below
24752 movq
[esp
+ mci3310_shX
], mm0
24753 movd
[esp
+ mci3310_shZ
], mm1
24755 mov ecx
, [ebp
+ mci3310_iinr
] ;
# ecx = pointer into iinr[]
24756 add dword ptr
[ebp
+ mci3310_iinr
], 4 ;
# advance pointer
24757 mov ebx
, [ecx
] ;
# ebx=ii
;# eax = current nsatoms pointer; the argument is advanced by 12 bytes (three dwords)
24759 mov eax
, [ebp
+ mci3310_nsatoms
]
24760 add dword ptr
[ebp
+ mci3310_nsatoms
], 12
;# NOTE(review): edx/eax/ecx are stored as the three counts below, but the
;# loads through the nsatoms pointer (lines 24761-24766) are not visible here.
24767 mov
[esp
+ mci3310_nsvdwc
], edx
24768 mov
[esp
+ mci3310_nscoul
], eax
24769 mov
[esp
+ mci3310_nsvdw
], ecx
;# both accumulators are initialised from mm7 (the zeroing of mm7 is not visible here)
24773 movq
[esp
+ mci3310_vctot
], mm7
24774 movq
[esp
+ mci3310_vnbtot
], mm7
;# solnr starts at ii and is incremented once per atom processed below
24775 mov
[esp
+ mci3310_solnr
], ebx
24777 mov eax
, [ebp
+ mci3310_jindex
]
24778 mov ecx
, [eax
] ;
# jindex[n]
24779 mov edx
, [eax
+ 4] ;
# jindex[n+1]
24780 add dword ptr
[ebp
+ mci3310_jindex
], 4
24781 sub edx
, ecx ;
# number of innerloop atoms
24782 mov eax
, [ebp
+ mci3310_jjnr
]
24785 mov
[esp
+ mci3310_innerjjnr0
], eax ;
# pointer to jjnr[nj0]
24787 mov
[esp
+ mci3310_innerk0
], edx ;
# number of innerloop atoms
24788 mov esi
, [ebp
+ mci3310_pos
]
24790 mov ecx
, [esp
+ mci3310_nsvdwc
]
;# NOTE(review): jnz tests the flags, but the mov above does not set them
;# (line 24791 is absent from this listing).
24792 jnz
.mci3310_mno_vdwc
24793 jmp
.mci3310_testcoul
;# per-atom preamble: take the atom index from solnr and advance it
24795 mov ebx
, [esp
+ mci3310_solnr
]
24796 inc dword ptr
[esp
+ mci3310_solnr
]
24797 mov edx
, [ebp
+ mci3310_charge
]
24798 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
24799 pfmul mm2
, [ebp
+ mci3310_facel
]
24800 punpckldq mm2
,mm2 ;
# spread to both halves
24801 movq
[esp
+ mci3310_iq
], mm2 ;
# iq =facel*charge[ii]
;# edx = type[ii] * ntype, kept in the ntia slot for the nbfp lookups
24803 mov edx
, [ebp
+ mci3310_type
]
24804 mov edx
, [edx
+ ebx
*4]
24805 imul edx
, [ebp
+ mci3310_ntype
]
24807 mov
[esp
+ mci3310_ntia
], edx
24809 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
24810 mov eax
, [ebp
+ mci3310_pos
] ;
# eax = base of pos[]
24811 mov
[esp
+ mci3310_ii3
], ebx
;# i coordinates = position + stored shift; x,y as a pair, z separately
24813 movq mm0
, [eax
+ ebx
*4]
24814 movd mm1
, [eax
+ ebx
*4 + 8]
24815 pfadd mm0
, [esp
+ mci3310_shX
]
24816 pfadd mm1
, [esp
+ mci3310_shZ
]
24817 movq
[esp
+ mci3310_ix
], mm0
24818 movd
[esp
+ mci3310_iz
], mm1
;# restart the j list for this atom from the saved start pointer and count
24820 mov ecx
, [esp
+ mci3310_innerjjnr0
]
24821 mov
[esp
+ mci3310_innerjjnr
], ecx
24822 mov edx
, [esp
+ mci3310_innerk0
]
24824 mov
[esp
+ mci3310_innerk
], edx ;
# number of innerloop atoms
;# NOTE(review): jge tests the flags, but the mov stores above do not set
;# them; the flag-setting instruction (line 24823) is absent from this listing.
24825 jge
.mci3310_unroll_vdwc_loop
24826 jmp
.mci3310_finish_vdwc_inner
24827 .mci3310_unroll_vdwc_loop:
24828 ;
# paired innerloop starts here
24829 mov ecx
, [esp
+ mci3310_innerjjnr
] ;
# pointer to jjnr[k]
24831 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
24832 add dword ptr
[esp
+ mci3310_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
24833 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
24835 mov ecx
, [ebp
+ mci3310_charge
] ;
# base of charge[]
24836 movq mm5
, [esp
+ mci3310_iq
]
24837 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
24838 punpckldq mm3
, [ecx
+ ebx
*4] ;
# move charge 2 to high part of mm3
24839 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
24841 mov ecx
, [ebp
+ mci3310_type
]
24842 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
24843 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
24845 mov esi
, [ebp
+ mci3310_nbfp
] ;
# base of nbfp
24848 add edx
, [esp
+ mci3310_ntia
] ;
# tja = ntia + 2*type
24849 add ecx
, [esp
+ mci3310_ntia
]
24851 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
24852 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
24854 punpckldq mm5
,mm7 ;
# mm5 = 1st c6 / 2nd c6
24855 punpckhdq mm6
,mm7 ;
# mm6 = 1st c12 / 2nd c12
24856 movq
[esp
+ mci3310_c6
], mm5
24857 movq
[esp
+ mci3310_c12
], mm6
24859 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
24860 lea ebx
, [ebx
+ ebx
*2]
24862 mov esi
, [ebp
+ mci3310_pos
]
24864 movq mm0
, [esp
+ mci3310_ix
]
24865 movd mm1
, [esp
+ mci3310_iz
]
24866 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
24867 movd mm5
, [esi
+ eax
*4 + 8]
24868 pfsubr mm4
,mm0 ;
# dr = ir - jr
24870 pfmul mm4
,mm4 ;
# square dx,dy,dz
24872 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
24873 pfacc mm4
, mm5 ;
# first rsq in lower mm4
24875 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
24876 movd mm7
, [esi
+ ebx
*4 + 8]
24878 pfsubr mm6
,mm0 ;
# dr = ir - jr
24880 pfmul mm6
,mm6 ;
# square dx,dy,dz
24882 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
24883 pfacc mm6
, mm7 ;
# second rsq in lower mm6
24885 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
24889 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
24890 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision.
24896 ;
# mm0 is invsqrt, and mm1 r.
24897 ;
# do potential and fscal
24898 pfmul mm1
, [esp
+ mci3310_tsc
] ;
# mm1=rt
24900 movq
[esp
+ mci3310_n1
], mm4
24902 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
24905 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
24907 mov edx
, [ebp
+ mci3310_VFtab
]
24908 mov ecx
, [esp
+ mci3310_n1
]
24909 lea ecx
, [ecx
+ ecx
*2]
24911 ;
# load all the table values we need
24912 movd mm4
, [edx
+ ecx
*4]
24913 movd mm5
, [edx
+ ecx
*4 + 4]
24914 movd mm6
, [edx
+ ecx
*4 + 8]
24915 movd mm7
, [edx
+ ecx
*4 + 12]
24916 mov ecx
, [esp
+ mci3310_n1
+ 4]
24917 lea ecx
, [ecx
+ ecx
*2]
24919 punpckldq mm4
, [edx
+ ecx
*4]
24920 punpckldq mm5
, [edx
+ ecx
*4 + 4]
24921 punpckldq mm6
, [edx
+ ecx
*4 + 8]
24922 punpckldq mm7
, [edx
+ ecx
*4 + 12]
24924 pfmul mm6
, mm1 ;
# mm6 = Geps
24925 pfmul mm7
, mm2 ;
# mm7 = Heps2
24928 pfadd mm5
, mm7 ;
# mm5 = Fp
24930 pfmul mm5
, mm1 ;
# mm5=eps*Fp
24931 pfadd mm5
, mm4 ;
# mm5= VV
24933 pfmul mm5
, mm3 ;
# vcoul=qq*VV
24935 ;
# at this point mm5 contains vcoul
24936 ;
# increment vcoul - then we can get rid of mm5
24938 pfadd mm5
, [esp
+ mci3310_vctot
] ;
# add the earlier value
24939 movq
[esp
+ mci3310_vctot
], mm5 ;
# store the sum
24941 ;
# dispersion table
24942 mov ecx
, [esp
+ mci3310_n1
]
24943 lea ecx
, [ecx
+ ecx
*2]
24945 ;
# load all the table values we need
24946 movd mm4
, [edx
+ ecx
*4 + 16]
24947 movd mm5
, [edx
+ ecx
*4 + 20]
24948 movd mm6
, [edx
+ ecx
*4 + 24]
24949 movd mm7
, [edx
+ ecx
*4 + 28]
24950 mov ecx
, [esp
+ mci3310_n1
+ 4]
24951 lea ecx
, [ecx
+ ecx
*2]
24953 punpckldq mm4
, [edx
+ ecx
*4 + 16]
24954 punpckldq mm5
, [edx
+ ecx
*4 + 20]
24955 punpckldq mm6
, [edx
+ ecx
*4 + 24]
24956 punpckldq mm7
, [edx
+ ecx
*4 + 28]
24957 pfmul mm6
, mm1 ;
# mm6 = Geps
24958 pfmul mm7
, mm2 ;
# mm7 = Heps2
24960 pfadd mm5
, mm7 ;
# mm5 = Fp
24961 pfmul mm5
, mm1 ;
# mm5=eps*Fp
24962 pfadd mm5
, mm4 ;
# mm5= VV
24964 movq mm4
, [esp
+ mci3310_c6
]
24965 pfmul mm5
, mm4 ;
# vnb6
24967 ;
# update vnbtot to release mm5!
24968 pfadd mm5
, [esp
+ mci3310_vnbtot
] ;
# add the earlier value
24969 movq
[esp
+ mci3310_vnbtot
], mm5 ;
# store the sum
24972 mov ecx
, [esp
+ mci3310_n1
]
24973 lea ecx
, [ecx
+ ecx
*2]
24975 ;
# load all the table values we need
24976 movd mm4
, [edx
+ ecx
*4 + 32]
24977 movd mm5
, [edx
+ ecx
*4 + 36]
24978 movd mm6
, [edx
+ ecx
*4 + 40]
24979 movd mm7
, [edx
+ ecx
*4 + 44]
24980 mov ecx
, [esp
+ mci3310_n1
+ 4]
24981 lea ecx
, [ecx
+ ecx
*2]
24983 punpckldq mm4
, [edx
+ ecx
*4 + 32]
24984 punpckldq mm5
, [edx
+ ecx
*4 + 36]
24985 punpckldq mm6
, [edx
+ ecx
*4 + 40]
24986 punpckldq mm7
, [edx
+ ecx
*4 + 44]
24988 pfmul mm6
, mm1 ;
# mm6 = Geps
24989 pfmul mm7
, mm2 ;
# mm7 = Heps2
24991 pfadd mm5
, mm7 ;
# mm5 = Fp
24992 pfmul mm5
, mm1 ;
# mm5=eps*Fp
24993 pfadd mm5
, mm4 ;
# mm5= VV
24995 movq mm6
, [esp
+ mci3310_c12
]
24996 pfmul mm5
, mm6 ;
# vnb12
24998 pfadd mm5
, [esp
+ mci3310_vnbtot
] ;
# add the earlier value
24999 movq
[esp
+ mci3310_vnbtot
], mm5 ;
# store the sum
25001 ;
# should we do one more iteration?
25002 sub dword ptr
[esp
+ mci3310_innerk
], 2
25003 jl
.mci3310_finish_vdwc_inner
25004 jmp
.mci3310_unroll_vdwc_loop
25005 .mci3310_finish_vdwc_inner:
25006 and dword ptr
[esp
+ mci3310_innerk
], 1
25007 jnz
.mci3310_single_vdwc_inner
25008 jmp
.mci3310_updateouterdata_vdwc
25009 .mci3310_single_vdwc_inner:
25010 ;
# a single j particle iteration here - compare with the unrolled code for comments.
25011 mov eax
, [esp
+ mci3310_innerjjnr
]
25012 mov eax
, [eax
] ;
# eax=jnr offset
25014 mov ecx
, [ebp
+ mci3310_charge
]
25015 movd mm5
, [esp
+ mci3310_iq
]
25016 movd mm3
, [ecx
+ eax
*4]
25017 pfmul mm3
, mm5 ;
# mm3=qq
25019 mov esi
, [ebp
+ mci3310_nbfp
]
25020 mov ecx
, [ebp
+ mci3310_type
]
25021 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
25023 add edx
, [esp
+ mci3310_ntia
] ;
# tja = ntia + 2*type
25024 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
25025 movq
[esp
+ mci3310_c6
], mm5
25026 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
25027 movq
[esp
+ mci3310_c12
], mm5
25029 mov esi
, [ebp
+ mci3310_pos
]
25030 lea eax
, [eax
+ eax
*2]
25032 movq mm0
, [esp
+ mci3310_ix
]
25033 movd mm1
, [esp
+ mci3310_iz
]
25034 movq mm4
, [esi
+ eax
*4]
25035 movd mm5
, [esi
+ eax
*4 + 8]
25041 pfacc mm4
, mm5 ;
# mm0=rsq
25047 pfrcpit2 mm0
,mm2 ;
# mm0=invsqrt
25050 ;
# mm0 is invsqrt, and mm1 r.
25052 ;
# calculate potentials and scalar force
25053 pfmul mm1
, [esp
+ mci3310_tsc
] ;
# mm1=rt
25055 movd
[esp
+ mci3310_n1
], mm4
25057 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
25060 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
25063 mov edx
, [ebp
+ mci3310_VFtab
]
25064 mov ecx
, [esp
+ mci3310_n1
]
25065 lea ecx
, [ecx
+ ecx
*2]
25067 ;
# load all the table values we need
25068 movd mm4
, [edx
+ ecx
*4]
25069 movd mm5
, [edx
+ ecx
*4 + 4]
25070 movd mm6
, [edx
+ ecx
*4 + 8]
25071 movd mm7
, [edx
+ ecx
*4 + 12]
25073 pfmul mm6
, mm1 ;
# mm6 = Geps
25074 pfmul mm7
, mm2 ;
# mm7 = Heps2
25077 pfadd mm5
, mm7 ;
# mm5 = Fp
25079 pfmul mm5
, mm1 ;
# mm5=eps*Fp
25080 pfadd mm5
, mm4 ;
# mm5= VV
25082 pfmul mm5
, mm3 ;
# vcoul=qq*VV
25084 ;
# at this point mm5 contains vcoul
25085 ;
# increment vcoul - then we can get rid of mm5
25087 pfadd mm5
, [esp
+ mci3310_vctot
] ;
# add the earlier value
25088 movq
[esp
+ mci3310_vctot
], mm5 ;
# store the sum
25090 ;
# dispersion table
25091 ;
# load all the table values we need
25092 movd mm4
, [edx
+ ecx
*4 + 16]
25093 movd mm5
, [edx
+ ecx
*4 + 20]
25094 movd mm6
, [edx
+ ecx
*4 + 24]
25095 movd mm7
, [edx
+ ecx
*4 + 28]
25096 pfmul mm6
, mm1 ;
# mm6 = Geps
25097 pfmul mm7
, mm2 ;
# mm7 = Heps2
25099 pfadd mm5
, mm7 ;
# mm5 = Fp
25100 pfmul mm5
, mm1 ;
# mm5=eps*Fp
25101 pfadd mm5
, mm4 ;
# mm5= VV
25103 movq mm4
, [esp
+ mci3310_c6
]
25104 pfmul mm5
, mm4 ;
# vnb6
25105 ;
# update vnbtot to release mm5!
25106 pfadd mm5
, [esp
+ mci3310_vnbtot
] ;
# add the earlier value
25107 movq
[esp
+ mci3310_vnbtot
], mm5 ;
# store the sum
25110 ;
# load all the table values we need
25111 movd mm4
, [edx
+ ecx
*4 + 32]
25112 movd mm5
, [edx
+ ecx
*4 + 36]
25113 movd mm6
, [edx
+ ecx
*4 + 40]
25114 movd mm7
, [edx
+ ecx
*4 + 44]
25116 pfmul mm6
, mm1 ;
# mm6 = Geps
25117 pfmul mm7
, mm2 ;
# mm7 = Heps2
25119 pfadd mm5
, mm7 ;
# mm5 = Fp
25120 pfmul mm5
, mm1 ;
# mm5=eps*Fp
25121 pfadd mm5
, mm4 ;
# mm5= VV
25123 movq mm6
, [esp
+ mci3310_c12
]
25124 pfmul mm5
, mm6 ;
# vnb12
25126 ;
# change sign of mm3
25129 pfmul mm0
, [esp
+ mci3310_tsc
]
25130 pfmul mm0
, mm1 ;
# mm0 is total fscal now
25133 pfadd mm5
, [esp
+ mci3310_vnbtot
] ;
# add the earlier value
25134 movq
[esp
+ mci3310_vnbtot
], mm5 ;
# store the sum
25136 .mci3310_updateouterdata_vdwc:
25137 ;
# loop back to mno
25138 dec dword ptr
[esp
+ mci3310_nsvdwc
]
25139 jz
.mci3310_testcoul
25140 jmp
.mci3310_mno_vdwc
25142 mov ecx
, [esp
+ mci3310_nscoul
]
25144 jnz
.mci3310_mno_coul
25145 jmp
.mci3310_testvdw
25147 mov ebx
, [esp
+ mci3310_solnr
]
25148 inc dword ptr
[esp
+ mci3310_solnr
]
25149 mov edx
, [ebp
+ mci3310_charge
]
25150 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii]
25151 pfmul mm2
, [ebp
+ mci3310_facel
]
25152 punpckldq mm2
,mm2 ;
# spread to both halves
25153 movq
[esp
+ mci3310_iq
], mm2 ;
# iq =facel*charge[ii]
25155 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
25156 mov eax
, [ebp
+ mci3310_pos
] ;
# eax = base of pos[]
25157 mov
[esp
+ mci3310_ii3
], ebx
25159 movq mm0
, [eax
+ ebx
*4]
25160 movd mm1
, [eax
+ ebx
*4 + 8]
25161 pfadd mm0
, [esp
+ mci3310_shX
]
25162 pfadd mm1
, [esp
+ mci3310_shZ
]
25163 movq
[esp
+ mci3310_ix
], mm0
25164 movd
[esp
+ mci3310_iz
], mm1
25166 mov ecx
, [esp
+ mci3310_innerjjnr0
]
25167 mov
[esp
+ mci3310_innerjjnr
], ecx
25168 mov edx
, [esp
+ mci3310_innerk0
]
25170 mov
[esp
+ mci3310_innerk
], edx ;
# number of innerloop atoms
25171 jge
.mci3310_unroll_coul_loop
25172 jmp
.mci3310_finish_coul_inner
25173 .mci3310_unroll_coul_loop:
25174 ;
# paired innerloop starts here
25175 mov ecx
, [esp
+ mci3310_innerjjnr
] ;
# pointer to jjnr[k]
25177 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
25178 add dword ptr
[esp
+ mci3310_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
25179 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
25181 mov ecx
, [ebp
+ mci3310_charge
] ;
# base of charge[]
25182 movq mm5
, [esp
+ mci3310_iq
]
25183 movd mm3
, [ecx
+ eax
*4] ;
# charge[jnr1]
25184 punpckldq mm3
, [ecx
+ ebx
*4] ;
# move charge 2 to high part of mm3
25185 pfmul mm3
,mm5 ;
# mm3 now has qq for both particles
25187 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
25188 lea ebx
, [ebx
+ ebx
*2]
25190 mov esi
, [ebp
+ mci3310_pos
]
25192 movq mm0
, [esp
+ mci3310_ix
]
25193 movd mm1
, [esp
+ mci3310_iz
]
25194 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
25195 movd mm5
, [esi
+ eax
*4 + 8]
25196 pfsubr mm4
,mm0 ;
# dr = ir - jr
25198 pfmul mm4
,mm4 ;
# square dx,dy,dz
25200 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
25201 pfacc mm4
, mm5 ;
# first rsq in lower mm4
25203 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
25204 movd mm7
, [esi
+ ebx
*4 + 8]
25206 pfsubr mm6
,mm0 ;
# dr = ir - jr
25208 pfmul mm6
,mm6 ;
# square dx,dy,dz
25210 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
25211 pfacc mm6
, mm7 ;
# second rsq in lower mm6
25213 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
25218 punpckldq mm4
,mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
25219 movq mm2
,mm0 ;
# amd 3dnow N-R iteration to get full precision.
25225 ;
# mm0 is invsqrt, and mm1 r.
25226 ;
# do potential and fscal
25227 pfmul mm1
, [esp
+ mci3310_tsc
] ;
# mm1=rt
25229 movq
[esp
+ mci3310_n1
], mm4
25231 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
25234 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
25236 mov edx
, [ebp
+ mci3310_VFtab
]
25237 mov ecx
, [esp
+ mci3310_n1
]
25238 lea ecx
, [ecx
+ ecx
*2]
25241 ;
# load all the table values we need
25242 movd mm4
, [edx
+ ecx
*4]
25243 movd mm5
, [edx
+ ecx
*4 + 4]
25244 movd mm6
, [edx
+ ecx
*4 + 8]
25245 movd mm7
, [edx
+ ecx
*4 + 12]
25246 mov ecx
, [esp
+ mci3310_n1
+ 4]
25247 lea ecx
, [ecx
+ ecx
*2]
25249 punpckldq mm4
, [edx
+ ecx
*4]
25250 punpckldq mm5
, [edx
+ ecx
*4 + 4]
25251 punpckldq mm6
, [edx
+ ecx
*4 + 8]
25252 punpckldq mm7
, [edx
+ ecx
*4 + 12]
25254 pfmul mm6
, mm1 ;
# mm6 = Geps
25255 pfmul mm7
, mm2 ;
# mm7 = Heps2
25258 pfadd mm5
, mm7 ;
# mm5 = Fp
25260 pfmul mm5
, mm1 ;
# mm5=eps*Fp
25261 pfadd mm5
, mm4 ;
# mm5= VV
25263 pfmul mm5
, mm3 ;
# vcoul=qq*VV
25265 ;
# at this point mm5 contains vcoul
25266 ;
# increment vcoul - then we can get rid of mm5
25268 pfadd mm5
, [esp
+ mci3310_vctot
] ;
# add the earlier value
25269 movq
[esp
+ mci3310_vctot
], mm5 ;
# store the sum
25271 ;
# should we do one more iteration?
25272 sub dword ptr
[esp
+ mci3310_innerk
], 2
25273 jl
.mci3310_finish_coul_inner
25274 jmp
.mci3310_unroll_coul_loop
25275 .mci3310_finish_coul_inner:
25276 and dword ptr
[esp
+ mci3310_innerk
], 1
25277 jnz
.mci3310_single_coul_inner
25278 jmp
.mci3310_updateouterdata_coul
25279 .mci3310_single_coul_inner:
25280 ;
# a single j particle iteration here - compare with the unrolled code for comments.
25281 mov eax
, [esp
+ mci3310_innerjjnr
]
25282 mov eax
, [eax
] ;
# eax=jnr offset
25284 mov ecx
, [ebp
+ mci3310_charge
]
25285 movd mm5
, [esp
+ mci3310_iq
]
25286 movd mm3
, [ecx
+ eax
*4]
25287 pfmul mm3
, mm5 ;
# mm3=qq
25289 mov esi
, [ebp
+ mci3310_pos
]
25290 lea eax
, [eax
+ eax
*2]
25292 movq mm0
, [esp
+ mci3310_ix
]
25293 movd mm1
, [esp
+ mci3310_iz
]
25294 movq mm4
, [esi
+ eax
*4]
25295 movd mm5
, [esi
+ eax
*4 + 8]
25301 pfacc mm4
, mm5 ;
# mm0=rsq
25307 pfrcpit2 mm0
,mm2 ;
# mm0=invsqrt
25310 ;
# mm0 is invsqrt, and mm1 r.
25312 ;
# calculate potentials and scalar force
25313 pfmul mm1
, [esp
+ mci3310_tsc
] ;
# mm1=rt
25315 movd
[esp
+ mci3310_n1
], mm4
25317 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
25320 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
25323 mov edx
, [ebp
+ mci3310_VFtab
]
25324 mov ecx
, [esp
+ mci3310_n1
]
25325 lea ecx
, [ecx
+ ecx
*2]
25327 ;
# load all the table values we need
25328 movd mm4
, [edx
+ ecx
*4]
25329 movd mm5
, [edx
+ ecx
*4 + 4]
25330 movd mm6
, [edx
+ ecx
*4 + 8]
25331 movd mm7
, [edx
+ ecx
*4 + 12]
25333 pfmul mm6
, mm1 ;
# mm6 = Geps
25334 pfmul mm7
, mm2 ;
# mm7 = Heps2
25337 pfadd mm5
, mm7 ;
# mm5 = Fp
25339 pfmul mm5
, mm1 ;
# mm5=eps*Fp
25340 pfadd mm5
, mm4 ;
# mm5= VV
25342 pfmul mm5
, mm3 ;
# vcoul=qq*VV
25344 ;
# at this point mm5 contains vcoul
25345 ;
# increment vcoul - then we can get rid of mm5
25347 pfadd mm5
, [esp
+ mci3310_vctot
] ;
# add the earlier value
25348 movq
[esp
+ mci3310_vctot
], mm5 ;
# store the sum
25350 .mci3310_updateouterdata_coul:
25351 ;
# loop back to mno
25352 dec dword ptr
[esp
+ mci3310_nscoul
]
25353 jz
.mci3310_testvdw
25354 jmp
.mci3310_mno_coul
25356 mov ecx
, [esp
+ mci3310_nsvdw
]
25358 jnz
.mci3310_mno_vdw
25359 jmp
.mci3310_last_mno
25361 mov ebx
, [esp
+ mci3310_solnr
]
25362 inc dword ptr
[esp
+ mci3310_solnr
]
25364 mov edx
, [ebp
+ mci3310_type
]
25365 mov edx
, [edx
+ ebx
*4]
25366 imul edx
, [ebp
+ mci3310_ntype
]
25368 mov
[esp
+ mci3310_ntia
], edx
25370 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
25371 mov eax
, [ebp
+ mci3310_pos
] ;
# eax = base of pos[]
25372 mov
[esp
+ mci3310_ii3
], ebx
25374 movq mm0
, [eax
+ ebx
*4]
25375 movd mm1
, [eax
+ ebx
*4 + 8]
25376 pfadd mm0
, [esp
+ mci3310_shX
]
25377 pfadd mm1
, [esp
+ mci3310_shZ
]
25378 movq
[esp
+ mci3310_ix
], mm0
25379 movd
[esp
+ mci3310_iz
], mm1
25381 mov ecx
, [esp
+ mci3310_innerjjnr0
]
25382 mov
[esp
+ mci3310_innerjjnr
], ecx
25383 mov edx
, [esp
+ mci3310_innerk0
]
25385 mov
[esp
+ mci3310_innerk
], edx ;
# number of innerloop atoms
25386 jge
.mci3310_unroll_vdw_loop
25387 jmp
.mci3310_finish_vdw_inner
25388 .mci3310_unroll_vdw_loop:
25389 ;
# paired innerloop starts here
25390 mov ecx
, [esp
+ mci3310_innerjjnr
] ;
# pointer to jjnr[k]
25392 mov ebx
, [ecx
+ 4] ;
# eax/ebx=jnr
25393 add dword ptr
[esp
+ mci3310_innerjjnr
], 8 ;
# advance pointer (unrolled 2)
25394 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
25396 mov ecx
, [ebp
+ mci3310_type
]
25397 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
25398 mov ecx
, [ecx
+ ebx
*4] ;
# type [jnr2]
25400 mov esi
, [ebp
+ mci3310_nbfp
] ;
# base of nbfp
25403 add edx
, [esp
+ mci3310_ntia
] ;
# tja = ntia + 2*type
25404 add ecx
, [esp
+ mci3310_ntia
]
25406 movq mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6 / c12
25407 movq mm7
, [esi
+ ecx
*4] ;
# mm7 = 2nd c6 / c12
25409 punpckldq mm5
, mm7 ;
# mm5 = 1st c6 / 2nd c6
25410 punpckhdq mm6
, mm7 ;
# mm6 = 1st c12 / 2nd c12
25411 movq
[esp
+ mci3310_c6
], mm5
25412 movq
[esp
+ mci3310_c12
], mm6
25414 lea eax
, [eax
+ eax
*2] ;
# replace jnr with j3
25415 lea ebx
, [ebx
+ ebx
*2]
25417 mov esi
, [ebp
+ mci3310_pos
]
25419 movq mm0
, [esp
+ mci3310_ix
]
25420 movd mm1
, [esp
+ mci3310_iz
]
25421 movq mm4
, [esi
+ eax
*4] ;
# fetch first j coordinates
25422 movd mm5
, [esi
+ eax
*4 + 8]
25423 pfsubr mm4
,mm0 ;
# dr = ir - jr
25425 pfmul mm4
,mm4 ;
# square dx,dy,dz
25427 pfacc mm4
, mm5 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
25428 pfacc mm4
, mm5 ;
# first rsq in lower mm4
25430 movq mm6
, [esi
+ ebx
*4] ;
# fetch second j coordinates
25431 movd mm7
, [esi
+ ebx
*4 + 8]
25433 pfsubr mm6
, mm0 ;
# dr = ir - jr
25435 pfmul mm6
, mm6 ;
# square dx,dy,dz
25437 pfacc mm6
, mm7 ;
# accumulate to get dx*dx+ dy*dy+ dz*dz
25438 pfacc mm6
, mm7 ;
# second rsq in lower mm6
25440 pfrsqrt mm0
, mm4 ;
# lookup inverse square root seed
25445 punpckldq mm4
, mm6 ;
# now 4 has rsq and 0 the seed for both pairs.
25446 movq mm2
, mm0 ;
# amd 3dnow N-R iteration to get full precision.
25452 ;
# mm0 is invsqrt, and mm1 r.
25453 ;
# do potential and fscal
25454 pfmul mm1
, [esp
+ mci3310_tsc
] ;
# mm1=rt
25456 movq
[esp
+ mci3310_n1
], mm4
25458 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 is n0
25461 pfmul mm2
, mm2 ;
# mm1 is eps, mm2 is eps2
25463 mov edx
, [ebp
+ mci3310_VFtab
]
25464 ;
# dispersion table
25465 mov ecx
, [esp
+ mci3310_n1
]
25466 lea ecx
, [ecx
+ ecx
*2]
25468 ;
# load all the table values we need
25469 movd mm4
, [edx
+ ecx
*4]
25470 movd mm5
, [edx
+ ecx
*4 + 4]
25471 movd mm6
, [edx
+ ecx
*4 + 8]
25472 movd mm7
, [edx
+ ecx
*4 + 12]
25473 mov ecx
, [esp
+ mci3310_n1
+ 4]
25474 lea ecx
, [ecx
+ ecx
*2]
25476 punpckldq mm4
, [edx
+ ecx
*4]
25477 punpckldq mm5
, [edx
+ ecx
*4 + 4]
25478 punpckldq mm6
, [edx
+ ecx
*4 + 8]
25479 punpckldq mm7
, [edx
+ ecx
*4 + 12]
25480 pfmul mm6
, mm1 ;
# mm6 = Geps
25481 pfmul mm7
, mm2 ;
# mm7 = Heps2
25483 pfadd mm5
, mm7 ;
# mm5 = Fp
25484 pfmul mm5
, mm1 ;
# mm5=eps*Fp
25485 pfadd mm5
, mm4 ;
# mm5= VV
25487 movq mm4
, [esp
+ mci3310_c6
]
25488 pfmul mm5
, mm4 ;
# vnb6
25490 ;
# update vnbtot to release mm5!
25491 pfadd mm5
, [esp
+ mci3310_vnbtot
] ;
# add the earlier value
25492 movq
[esp
+ mci3310_vnbtot
], mm5 ;
# store the sum
25495 mov ecx
, [esp
+ mci3310_n1
]
25496 lea ecx
, [ecx
+ ecx
*2]
25498 ;
# load all the table values we need
25499 movd mm4
, [edx
+ ecx
*4 + 16]
25500 movd mm5
, [edx
+ ecx
*4 + 20]
25501 movd mm6
, [edx
+ ecx
*4 + 24]
25502 movd mm7
, [edx
+ ecx
*4 + 28]
25503 mov ecx
, [esp
+ mci3310_n1
+ 4]
25504 lea ecx
, [ecx
+ ecx
*2]
25506 punpckldq mm4
, [edx
+ ecx
*4 + 16]
25507 punpckldq mm5
, [edx
+ ecx
*4 + 20]
25508 punpckldq mm6
, [edx
+ ecx
*4 + 24]
25509 punpckldq mm7
, [edx
+ ecx
*4 + 28]
25511 pfmul mm6
, mm1 ;
# mm6 = Geps
25512 pfmul mm7
, mm2 ;
# mm7 = Heps2
25514 pfadd mm5
, mm7 ;
# mm5 = Fp
25515 pfmul mm5
, mm1 ;
# mm5=eps*Fp
25516 pfadd mm5
, mm4 ;
# mm5= VV
25518 movq mm6
, [esp
+ mci3310_c12
]
25519 pfmul mm5
, mm6 ;
# vnb12
25521 pfadd mm5
, [esp
+ mci3310_vnbtot
] ;
# add the earlier value
25522 movq
[esp
+ mci3310_vnbtot
], mm5 ;
# store the sum
25524 ;
# should we do one more iteration?
25525 sub dword ptr
[esp
+ mci3310_innerk
], 2
25526 jl
.mci3310_finish_vdw_inner
25527 jmp
.mci3310_unroll_vdw_loop
25528 .mci3310_finish_vdw_inner:
25529 and dword ptr
[esp
+ mci3310_innerk
], 1
25530 jnz
.mci3310_single_vdw_inner
25531 jmp
.mci3310_updateouterdata_vdw
25532 .mci3310_single_vdw_inner:
25533 ;
# a single j particle iteration here - compare with the unrolled code for comments.
25534 mov eax
, [esp
+ mci3310_innerjjnr
]
25535 mov eax
, [eax
] ;
# eax=jnr offset
25537 mov esi
, [ebp
+ mci3310_nbfp
]
25538 mov ecx
, [ebp
+ mci3310_type
]
25539 mov edx
, [ecx
+ eax
*4] ;
# type [jnr1]
25541 add edx
, [esp
+ mci3310_ntia
] ;
# tja = ntia + 2*type
25542 movd mm5
, [esi
+ edx
*4] ;
# mm5 = 1st c6
25543 movq
[esp
+ mci3310_c6
], mm5
25544 movd mm5
, [esi
+ edx
*4 + 4] ;
# mm5 = 1st c12
25545 movq
[esp
+ mci3310_c12
], mm5
25547 mov esi
, [ebp
+ mci3310_pos
]
25548 lea eax
, [eax
+ eax
*2]
25550 movq mm0
, [esp
+ mci3310_ix
]
25551 movd mm1
, [esp
+ mci3310_iz
]
25552 movq mm4
, [esi
+ eax
*4]
25553 movd mm5
, [esi
+ eax
*4 + 8]
25559 pfacc mm4
, mm5 ;
# mm0=rsq
25565 pfrcpit2 mm0
,mm2 ;
# mm0=invsqrt
25568 ;
# mm0 is invsqrt, and mm1 r.
25570 ;
# calculate potentials and scalar force
25571 pfmul mm1
, [esp
+ mci3310_tsc
] ;
# mm1=rt
25573 movd
[esp
+ mci3310_n1
], mm4
25575 pfsub mm1
, mm4 ;
# now mm1 is eps and mm4 n0.
25578 pfmul mm2
,mm2 ;
# mm1 is eps, mm2 is eps2
25580 mov edx
, [ebp
+ mci3310_VFtab
]
25581 mov ecx
, [esp
+ mci3310_n1
]
25582 lea ecx
, [ecx
+ ecx
*2]
25584 ;
# dispersion table
25585 ;
# load all the table values we need
25587 movd mm4
, [edx
+ ecx
*4]
25588 movd mm5
, [edx
+ ecx
*4 + 4]
25589 movd mm6
, [edx
+ ecx
*4 + 8]
25590 movd mm7
, [edx
+ ecx
*4 + 12]
25591 pfmul mm6
, mm1 ;
# mm6 = Geps
25592 pfmul mm7
, mm2 ;
# mm7 = Heps2
25594 pfadd mm5
, mm7 ;
# mm5 = Fp
25595 pfmul mm5
, mm1 ;
# mm5=eps*Fp
25596 pfadd mm5
, mm4 ;
# mm5= VV
25598 movq mm4
, [esp
+ mci3310_c6
]
25599 pfmul mm5
, mm4 ;
# vnb6
25600 ;
# update vnbtot to release mm5!
25601 pfadd mm5
, [esp
+ mci3310_vnbtot
] ;
# add the earlier value
25602 movq
[esp
+ mci3310_vnbtot
], mm5 ;
# store the sum
25605 ;
# load all the table values we need
25607 movd mm4
, [edx
+ ecx
*4 + 16]
25608 movd mm5
, [edx
+ ecx
*4 + 20]
25609 movd mm6
, [edx
+ ecx
*4 + 24]
25610 movd mm7
, [edx
+ ecx
*4 + 28]
25612 pfmul mm6
, mm1 ;
# mm6 = Geps
25613 pfmul mm7
, mm2 ;
# mm7 = Heps2
25615 pfadd mm5
, mm7 ;
# mm5 = Fp
25616 pfmul mm5
, mm1 ;
# mm5=eps*Fp
25617 pfadd mm5
, mm4 ;
# mm5= VV
25619 movq mm6
, [esp
+ mci3310_c12
]
25620 pfmul mm5
, mm6 ;
# vnb12
25622 pfadd mm5
, [esp
+ mci3310_vnbtot
] ;
# add the earlier value
25623 movq
[esp
+ mci3310_vnbtot
], mm5 ;
# store the sum
25625 .mci3310_updateouterdata_vdw:
25626 ;
# loop back to mno
25627 dec dword ptr
[esp
+ mci3310_nsvdw
]
25628 jz
.mci3310_last_mno
25629 jmp
.mci3310_mno_vdw
25632 mov edx
, [ebp
+ mci3310_gid
] ;
# get group index for this i particle
25634 add dword ptr
[ebp
+ mci3310_gid
], 4 ;
# advance pointer
25636 movq mm7
, [esp
+ mci3310_vctot
]
25637 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
25639 mov eax
, [ebp
+ mci3310_Vc
]
25640 movd mm6
, [eax
+ edx
*4]
25642 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
25644 movq mm7
, [esp
+ mci3310_vnbtot
]
25645 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
25647 mov eax
, [ebp
+ mci3310_Vnb
]
25648 movd mm6
, [eax
+ edx
*4]
25650 movd
[eax
+ edx
*4], mm6 ;
# increment vnb[gid]
25652 mov ecx
, [ebp
+ mci3310_nri
]
25655 ;
# not last, iterate once more!
25656 mov
[ebp
+ mci3310_nri
], ecx
# mcinl3320_3dnow: symbol exports and the named offsets used by the kernel body.
# The entry point is exported under both its plain and underscore-prefixed name.
25671 .globl mcinl3320_3dnow
25672 .globl _mcinl3320_3dnow
# Argument offsets. The kernel body addresses each of these as [ebp + name].
# NOTE(review): the prologue that establishes ebp is not visible in this listing.
# Starting at 8 presumably skips a saved ebp and the return address - confirm.
25675 .equiv mci3320_nri, 8
25676 .equiv mci3320_iinr, 12
25677 .equiv mci3320_jindex, 16
25678 .equiv mci3320_jjnr, 20
25679 .equiv mci3320_shift, 24
25680 .equiv mci3320_shiftvec, 28
25681 .equiv mci3320_gid, 32
25682 .equiv mci3320_pos, 36
25683 .equiv mci3320_charge, 40
25684 .equiv mci3320_facel, 44
25685 .equiv mci3320_Vc, 48
25686 .equiv mci3320_type, 52
25687 .equiv mci3320_ntype, 56
25688 .equiv mci3320_nbfp, 60
25689 .equiv mci3320_Vnb, 64
25690 .equiv mci3320_tabscale, 68
25691 .equiv mci3320_VFtab, 72
25692 ;
# stack offsets for local variables
# The kernel body addresses each of these as [esp + name] after "sub esp, 144".
# is3, ii3, ixO, iyO, izO, ntia, innerjjnr and innerk are 4 bytes apart.
# ixH through tsc, and tmprsqH, are 8-byte slots. The last slot ends at 136 + 8 = 144.
25693 .equiv mci3320_is3, 0
25694 .equiv mci3320_ii3, 4
25695 .equiv mci3320_ixO, 8
25696 .equiv mci3320_iyO, 12
25697 .equiv mci3320_izO, 16
25698 .equiv mci3320_ixH, 20
25699 .equiv mci3320_iyH, 28
25700 .equiv mci3320_izH, 36
25701 .equiv mci3320_iqO, 44
25702 .equiv mci3320_iqH, 52
25703 .equiv mci3320_qqO, 60
25704 .equiv mci3320_qqH, 68
25705 .equiv mci3320_vctot, 76
25706 .equiv mci3320_vnbtot, 84
25707 .equiv mci3320_c6, 92
25708 .equiv mci3320_c12, 100
25709 .equiv mci3320_n1, 108
25710 .equiv mci3320_tsc, 116
25711 .equiv mci3320_ntia, 124
25712 .equiv mci3320_innerjjnr, 128
25713 .equiv mci3320_innerk, 132
25714 .equiv mci3320_tmprsqH, 136
# Set-up section of mcinl3320_3dnow. It reserves the local frame, caches the i
# charges, the type offset and the table scale, loads the shift vector and the
# three i-site coordinates, and initialises the j-list pointer and count.
# NOTE(review): the listing numbers that start each statement are not contiguous.
# Where they skip, instructions may be absent from this view, so some existing
# comments describe values that no visible line produces. Such spots are marked.
25723 sub esp
, 144 ;
# local stack space
25726 mov ecx
, [ebp
+ mci3320_iinr
] ;
# ecx = pointer into iinr[]
25727 mov ebx
, [ecx
] ;
# ebx=ii
25729 mov edx
, [ebp
+ mci3320_charge
]
# mm1 = facel (one dword from the argument area)
25730 movd mm1
, [ebp
+ mci3320_facel
]
25731 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii0]
# NOTE(review): no multiply of mm2 by facel (mm1) is visible before this store
# (listing skips 25732) - confirm against the full source.
25733 movq
[esp
+ mci3320_iqO
], mm2 ;
# iqO = facel*charge[ii]
25735 movd mm2
, [edx
+ ebx
*4 + 4] ;
# mm2=charge[ii0+1]
# NOTE(review): as above, no multiply by facel is visible here (listing skips 25736).
25737 punpckldq mm2
,mm2 ;
# spread to both halves
25738 movq
[esp
+ mci3320_iqH
], mm2 ;
# iqH = facel*charge[ii0+1]
25740 mov edx
, [ebp
+ mci3320_type
]
# ecx = type[ii]
25741 mov ecx
, [edx
+ ebx
*4]
# NOTE(review): the visible code multiplies type[ii] by ntype only. The factor 2
# named in the comment below has no visible instruction (listing skips 25742).
25743 imul ecx
, [ebp
+ mci3320_ntype
] ;
# ecx = ntia = 2*ntype*type[ii0]
25744 mov
[esp
+ mci3320_ntia
], ecx
# movq reads 8 bytes at ebp+68, i.e. tabscale plus the VFtab slot at 72.
# punpckldq then replaces the high dword with the low one, so only tabscale is kept.
25746 movq mm4
, [ebp
+ mci3320_tabscale
]
25747 punpckldq mm4
,mm4 ;
# spread to both halves
25748 movq
[esp
+ mci3320_tsc
], mm4
25749 ;
# assume we have at least one i particle - start directly
25751 mov eax
, [ebp
+ mci3320_shift
] ;
# eax = pointer into shift[]
25752 mov ebx
, [eax
] ;
# ebx=shift[n]
25753 add dword ptr
[ebp
+ mci3320_shift
], 4 ;
# advance pointer one step
25755 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
25756 mov
[esp
+ mci3320_is3
],ebx ;
# store is3
25758 mov eax
, [ebp
+ mci3320_shiftvec
] ;
# eax = base of shiftvec[]
25760 movq mm5
, [eax
+ ebx
*4] ;
# move shX/shY to mm5 and shZ to mm6.
25761 movd mm6
, [eax
+ ebx
*4 + 8]
# NOTE(review): mm0 is not loaded by any visible line before this point
# (listing skips 25762-25764) - confirm.
25765 punpckldq mm0
,mm0 ;
# also expand shX,Y,Z in mm0--mm2.
25769 mov ecx
, [ebp
+ mci3320_iinr
] ;
# ecx = pointer into iinr[]
25770 add dword ptr
[ebp
+ mci3320_iinr
], 4 ;
# advance pointer
25771 mov ebx
, [ecx
] ;
# ebx=ii
25773 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
25774 mov eax
, [ebp
+ mci3320_pos
] ;
# eax = base of pos[]
25776 pfadd mm5
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
25777 movd mm7
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
25778 mov
[esp
+ mci3320_ii3
], ebx ;
# (use mm7 as temp. storage for iz.)
# NOTE(review): no visible line adds the z coordinate held in mm7 to mm6
# (listing skips 25779) - confirm.
25780 movq
[esp
+ mci3320_ixO
], mm5
# This movq writes 8 bytes at izO (offset 16), so it also covers the first dword
# of ixH (offset 20). ixH is stored again further down.
25781 movq
[esp
+ mci3320_izO
], mm6
# Floats 3..5 after pos[ii3] go to the low halves of mm3-mm5, floats 6..8 to the high halves.
25783 movd mm3
, [eax
+ ebx
*4 + 12]
25784 movd mm4
, [eax
+ ebx
*4 + 16]
25785 movd mm5
, [eax
+ ebx
*4 + 20]
25786 punpckldq mm3
, [eax
+ ebx
*4 + 24]
25787 punpckldq mm4
, [eax
+ ebx
*4 + 28]
25788 punpckldq mm5
, [eax
+ ebx
*4 + 32] ;
# coords of H1 in low mm3-mm5, H2 in high
# NOTE(review): the H coordinates were loaded into mm3-mm5 but mm0-mm2 are
# stored here. The lines combining them are not visible (listing skips
# 25789-25792) - confirm.
25793 movq
[esp
+ mci3320_ixH
], mm0
25794 movq
[esp
+ mci3320_iyH
], mm1
25795 movq
[esp
+ mci3320_izH
], mm2
25797 ;
# clear vctot and i forces
# NOTE(review): no visible line zeroes mm7 before these two stores
# (listing skips 25798) - confirm.
25799 movq
[esp
+ mci3320_vctot
], mm7
25800 movq
[esp
+ mci3320_vnbtot
], mm7
25802 mov eax
, [ebp
+ mci3320_jindex
]
25803 mov ecx
, [eax
] ;
# jindex[n]
25804 mov edx
, [eax
+ 4] ;
# jindex[n+1]
25805 add dword ptr
[ebp
+ mci3320_jindex
], 4
25806 sub edx
, ecx ;
# number of innerloop atoms
# innerk = jindex[n+1] - jindex[n]
25807 mov
[esp
+ mci3320_innerk
], edx
# esi = base of pos[], used by the inner loop for the j coordinates
25809 mov esi
, [ebp
+ mci3320_pos
]
25810 mov eax
, [ebp
+ mci3320_jjnr
]
# NOTE(review): eax holds the jjnr base here. No visible line adds the jindex[n]
# offset held in ecx (listing skips 25811-25812) - confirm.
25813 mov
[esp
+ mci3320_innerjjnr
], eax ;
# pointer to jjnr[nj0]
# Inner loop of mcinl3320_3dnow, one j particle per pass. Visible work per pass:
#   - read the next j index and advance innerjjnr by one dword
#   - form the charge products qqO/qqH and fetch c6/c12 from nbfp
#   - load the j coordinates and subtract them from the stored i-site coordinates
#   - O site: three table reads through VFtab (byte offsets 0, 16 and 32),
#     accumulated into vctot (times qqO) and vnbtot (times c6, then times c12)
#   - H sites: one table read per hydrogen (offset 0), accumulated into vctot
#   - decrement innerk and repeat until it reaches zero
# NOTE(review): the listing numbers that start each statement are not contiguous.
# Where they skip, instructions may be absent from this view, so several existing
# register comments are not backed by a visible instruction.
25814 .mci3320_inner_loop:
25815 ;
# a single j particle iteration here - compare with the unrolled code for comments.
25816 mov eax
, [esp
+ mci3320_innerjjnr
]
25817 mov eax
, [eax
] ;
# eax=jnr offset
25818 add dword ptr
[esp
+ mci3320_innerjjnr
], 4 ;
# advance pointer
# NOTE(review): this loop keeps the jjnr pointer in eax, not ecx, so the prefetch
# address is relative to whatever ecx last held - confirm intent.
25819 prefetch
[ecx
+ 16] ;
# prefetch data - trial and error says 16 is best
25821 mov ecx
, [ebp
+ mci3320_charge
]
# mm7 = charge[jnr]
25822 movd mm7
, [ecx
+ eax
*4]
# NOTE(review): mm6 is multiplied below but no visible line copies the j charge
# into it (listing skips 25823-25824) - confirm.
25825 pfmul mm6
, [esp
+ mci3320_iqO
]
25826 pfmul mm7
, [esp
+ mci3320_iqH
] ;
# mm6=qqO, mm7=qqH
# qqO is stored as a single dword (movd), qqH as a full 8-byte pair (movq).
25827 movd
[esp
+ mci3320_qqO
], mm6
25828 movq
[esp
+ mci3320_qqH
], mm7
25830 mov ecx
, [ebp
+ mci3320_type
]
25831 mov edx
, [ecx
+ eax
*4] ;
# type [jnr]
25832 mov ecx
, [ebp
+ mci3320_nbfp
]
# NOTE(review): the visible code adds type[jnr] to ntia unscaled. The doubling
# named in the comment below is not visible (listing skips 25833) - confirm.
25834 add edx
, [esp
+ mci3320_ntia
] ;
# tja = ntia + 2*type
25835 movd mm5
, [ecx
+ edx
*4] ;
# mm5 = 1st c6
25836 movq
[esp
+ mci3320_c6
], mm5
25837 movd mm5
, [ecx
+ edx
*4 + 4] ;
# mm5 = 1st c12
25838 movq
[esp
+ mci3320_c12
], mm5
# eax = 3*jnr, the dword index of this j particle's coordinates in pos[]
25840 lea eax
, [eax
+ eax
*2]
# mm0 = two floats at pos[3*jnr], mm1 = the third float
25842 movq mm0
, [esi
+ eax
*4]
25843 movd mm1
, [esi
+ eax
*4 + 8]
25844 ;
# copy & expand to mm2-mm4 for the H interactions
# pfsubr is the reversed subtract: destination = memory operand - destination.
25852 pfsubr mm0
, [esp
+ mci3320_ixO
]
25853 pfsubr mm1
, [esp
+ mci3320_izO
]
# NOTE(review): the squaring and accumulate steps that would turn the differences
# into rsq are not visible (listing skips 25854-25857) - confirm.
25858 pfadd mm0
, mm1 ;
# mm0=rsqO
25862 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
25863 pfsubr mm2
, [esp
+ mci3320_ixH
]
25864 pfsubr mm3
, [esp
+ mci3320_iyH
]
25865 pfsubr mm4
, [esp
+ mci3320_izH
] ;
# mm2-mm4 is dxH-dzH
25872 pfadd mm3
,mm4 ;
# mm3=rsqH
# Park rsqH on the stack. It is reloaded after the O-site work below.
25873 movq
[esp
+ mci3320_tmprsqH
], mm3
# NOTE(review): the seed lookup and first iteration step that normally precede
# pfrcpit2 are not visible (listing skips 25874-25879) - confirm.
25880 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
25882 pfmul mm0
, mm1 ;
# mm0=r
# scale r by tsc (the value loaded from the tabscale argument during set-up)
25884 pfmul mm0
, [esp
+ mci3320_tsc
]
# n1 = low dword of mm4. It is read back below as an integer table index.
# NOTE(review): the float-to-integer conversion producing mm4 is not visible
# (listing skips 25885) - confirm.
25886 movd
[esp
+ mci3320_n1
], mm4
25888 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
25890 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
25893 mov edx
, [ebp
+ mci3320_VFtab
]
25894 mov ecx
, [esp
+ mci3320_n1
]
# ecx = 3*n1. Table entries are then read at byte address VFtab + 4*ecx + k.
25895 lea ecx
, [ecx
+ ecx
*2]
25897 ;
# load all values we need
25898 movd mm4
, [edx
+ ecx
*4]
25899 movd mm5
, [edx
+ ecx
*4 + 4]
25900 movd mm6
, [edx
+ ecx
*4 + 8]
25901 movd mm7
, [edx
+ ecx
*4 + 12]
25903 pfmul mm6
, mm0 ;
# mm6 = Geps
25904 pfmul mm7
, mm2 ;
# mm7 = Heps2
25907 pfadd mm5
, mm7 ;
# mm5 = Fp
25909 pfmul mm5
, mm0 ;
# mm5=eps*Fp
25910 pfadd mm5
, mm4 ;
# mm5= VV
25912 pfmul mm5
, [esp
+ mci3320_qqO
] ;
# vcoul=qq*VV
25914 ;
# update vctot directly
25915 pfadd mm5
, [esp
+ mci3320_vctot
]
25916 movq
[esp
+ mci3320_vctot
], mm5
25918 ;
# dispersion table
25919 ;
# load all the table values we need
# Same index as above, next group of four floats (byte offsets 16..28).
25920 movd mm4
, [edx
+ ecx
*4 + 16]
25921 movd mm5
, [edx
+ ecx
*4 + 20]
25922 movd mm6
, [edx
+ ecx
*4 + 24]
25923 movd mm7
, [edx
+ ecx
*4 + 28]
25924 pfmul mm6
, mm0 ;
# mm6 = Geps
25925 pfmul mm7
, mm2 ;
# mm7 = Heps2
25927 pfadd mm5
, mm7 ;
# mm5 = Fp
25928 pfmul mm5
, mm0 ;
# mm5=eps*Fp
25929 pfadd mm5
, mm4 ;
# mm5= VV
25931 movq mm4
, [esp
+ mci3320_c6
]
25932 pfmul mm5
, mm4 ;
# vnb6
25933 ;
# update vnbtot to release mm5!
25934 pfadd mm5
, [esp
+ mci3320_vnbtot
] ;
# add the earlier value
25935 movq
[esp
+ mci3320_vnbtot
], mm5 ;
# store the sum
25938 ;
# load all the table values we need
# Third group of four floats (byte offsets 32..44), scaled by c12 below.
25939 movd mm4
, [edx
+ ecx
*4 + 32]
25940 movd mm5
, [edx
+ ecx
*4 + 36]
25941 movd mm6
, [edx
+ ecx
*4 + 40]
25942 movd mm7
, [edx
+ ecx
*4 + 44]
25944 pfmul mm6
, mm0 ;
# mm6 = Geps
25945 pfmul mm7
, mm2 ;
# mm7 = Heps2
25947 pfadd mm5
, mm7 ;
# mm5 = Fp
25948 pfmul mm5
, mm0 ;
# mm5=eps*Fp
25949 pfadd mm5
, mm4 ;
# mm5= VV
25951 movq mm6
, [esp
+ mci3320_c12
]
25952 pfmul mm5
, mm6 ;
# vnb12
25954 pfadd mm5
, [esp
+ mci3320_vnbtot
] ;
# add the earlier value
25955 movq
[esp
+ mci3320_vnbtot
], mm5 ;
# store the sum
25957 ;
# now do the two hydrogens.
25958 movq mm0
, [esp
+ mci3320_tmprsqH
] ;
# mm0=rsqH
# NOTE(review): the seed lookups feeding mm1/mm2 are not visible
# (listing skips 25959-25963) - confirm.
25964 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
25969 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
25971 pfmul mm0
,mm1 ;
# mm0=r
25972 pfmul mm0
, [esp
+ mci3320_tsc
]
# Both dwords of mm4 are stored. n1 and n1+4 are read back below as two indices.
25974 movq
[esp
+ mci3320_n1
], mm4
25976 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
25978 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
25981 mov edx
, [ebp
+ mci3320_VFtab
]
25982 mov ecx
, [esp
+ mci3320_n1
]
25983 lea ecx
, [ecx
+ ecx
*2]
25985 ;
# load all values we need
# Entries for the first index go to the low halves of mm4-mm7.
25986 movd mm4
, [edx
+ ecx
*4]
25987 movd mm5
, [edx
+ ecx
*4 + 4]
25988 movd mm6
, [edx
+ ecx
*4 + 8]
25989 movd mm7
, [edx
+ ecx
*4 + 12]
# Second index (upper dword of n1): its entries go to the high halves.
25990 mov ecx
, [esp
+ mci3320_n1
+ 4]
25991 lea ecx
, [ecx
+ ecx
*2]
25993 punpckldq mm4
, [edx
+ ecx
*4]
25994 punpckldq mm5
, [edx
+ ecx
*4 + 4]
25995 punpckldq mm6
, [edx
+ ecx
*4 + 8]
25996 punpckldq mm7
, [edx
+ ecx
*4 + 12]
25999 pfmul mm6
, mm0 ;
# mm6 = Geps
26000 pfmul mm7
, mm2 ;
# mm7 = Heps2
26003 pfadd mm5
, mm7 ;
# mm5 = Fp
26005 pfmul mm5
, mm0 ;
# mm5=eps*Fp
26006 pfadd mm5
, mm4 ;
# mm5= VV
26008 pfmul mm5
, [esp
+ mci3320_qqH
] ;
# vcoul=qq*VV
26010 pfadd mm5
, [esp
+ mci3320_vctot
]
26011 movq
[esp
+ mci3320_vctot
], mm5
26013 ;
# done - one more?
# innerk counts the remaining j particles. Leave the loop when it reaches zero.
26014 dec dword ptr
[esp
+ mci3320_innerk
]
26015 jz
.mci3320_updateouterdata
26016 jmp
.mci3320_inner_loop
# Outer-loop epilogue of mcinl3320_3dnow, run once per i molecule after the inner
# loop. It steps the gid pointer, sums the two halves of vctot and of vnbtot,
# touches Vc[] and Vnb[] at the group index, and decrements the nri counter.
# NOTE(review): the listing numbers that start each statement are not contiguous.
# Where they skip, instructions may be absent from this view.
26017 .mci3320_updateouterdata:
# NOTE(review): edx receives the gid pointer itself. The dereference implied by
# the comment below is not visible (listing skips 26019) - confirm.
26018 mov edx
, [ebp
+ mci3320_gid
] ;
# get group index for this i particle
26020 add dword ptr
[ebp
+ mci3320_gid
], 4 ;
# advance pointer
26022 movq mm7
, [esp
+ mci3320_vctot
]
26023 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
26025 mov eax
, [ebp
+ mci3320_Vc
]
26026 movd mm6
, [eax
+ edx
*4]
# NOTE(review): no visible line adds the sum in mm7 into mm6 between the load
# above and this store (listing skips 26027) - confirm.
26028 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
26030 movq mm7
, [esp
+ mci3320_vnbtot
]
26031 pfacc mm7
,mm7 ;
# same for Vnb
26033 mov eax
, [ebp
+ mci3320_Vnb
]
26034 movd mm6
, [eax
+ edx
*4]
# NOTE(review): as for Vc, the add of mm7 into mm6 is not visible
# (listing skips 26035) - confirm.
26036 movd
[eax
+ edx
*4], mm6 ;
# increment vnb[gid]
# Count down the remaining i molecules in the nri argument slot.
# NOTE(review): the branch that consumes the flags from this dec, and the
# function exit, are not visible in this listing.
26038 dec dword ptr
[ebp
+ mci3320_nri
]
26040 ;
# not last, iterate once more!
26056 .globl mcinl3330_3dnow
26057 .globl _mcinl3330_3dnow
26060 .equiv mci3330_nri, 8
26061 .equiv mci3330_iinr, 12
26062 .equiv mci3330_jindex, 16
26063 .equiv mci3330_jjnr, 20
26064 .equiv mci3330_shift, 24
26065 .equiv mci3330_shiftvec, 28
26066 .equiv mci3330_gid, 32
26067 .equiv mci3330_pos, 36
26068 .equiv mci3330_charge, 40
26069 .equiv mci3330_facel, 44
26070 .equiv mci3330_Vc, 48
26071 .equiv mci3330_type, 52
26072 .equiv mci3330_ntype, 56
26073 .equiv mci3330_nbfp, 60
26074 .equiv mci3330_Vnb, 64
26075 .equiv mci3330_tabscale, 68
26076 .equiv mci3330_VFtab, 72
26077 ;
# stack offsets for local variables
26078 .equiv mci3330_is3, 0
26079 .equiv mci3330_ii3, 4
26080 .equiv mci3330_ixO, 8
26081 .equiv mci3330_iyO, 12
26082 .equiv mci3330_izO, 16
26083 .equiv mci3330_ixH, 20
26084 .equiv mci3330_iyH, 28
26085 .equiv mci3330_izH, 36
26086 .equiv mci3330_qqOO, 44
26087 .equiv mci3330_qqOH, 52
26088 .equiv mci3330_qqHH, 60
26089 .equiv mci3330_c6, 68
26090 .equiv mci3330_c12, 76
26091 .equiv mci3330_n1, 84
26092 .equiv mci3330_tsc, 92
26093 .equiv mci3330_vctot, 100
26094 .equiv mci3330_vnbtot, 108
26095 .equiv mci3330_innerjjnr, 116
26096 .equiv mci3330_innerk, 120
26097 .equiv mci3330_tmprsqH, 124
26106 sub esp
, 132 ;
# local stack space
26108 ;
# assume we have at least one i particle - start directly
26110 mov ecx
, [ebp
+ mci3330_iinr
] ;
# ecx = pointer into iinr[]
26111 mov ebx
, [ecx
] ;
# ebx=ii
26113 mov edx
, [ebp
+ mci3330_charge
]
26114 movd mm1
, [ebp
+ mci3330_facel
] ;
# mm1=facel
26115 movd mm2
, [edx
+ ebx
*4] ;
# mm2=charge[ii0] (O)
26116 movd mm3
, [edx
+ ebx
*4 + 4] ;
# mm3=charge[ii0+1] (H)
26122 pfmul mm4
, mm2 ;
# mm4=qqOO*facel
26123 pfmul mm5
, mm3 ;
# mm5=qqOH*facel
26124 pfmul mm6
, mm3 ;
# mm6=qqHH*facel
26125 punpckldq mm5
,mm5 ;
# spread to both halves
26126 punpckldq mm6
,mm6 ;
# spread to both halves
26127 movq
[esp
+ mci3330_qqOO
], mm4
26128 movq
[esp
+ mci3330_qqOH
], mm5
26129 movq
[esp
+ mci3330_qqHH
], mm6
26130 mov edx
, [ebp
+ mci3330_type
]
26131 mov ecx
, [edx
+ ebx
*4]
26134 imul ecx
, [ebp
+ mci3330_ntype
]
26136 mov eax
, [ebp
+ mci3330_nbfp
]
26137 movd mm0
, [eax
+ edx
*4]
26138 movd mm1
, [eax
+ edx
*4 + 4]
26139 movq
[esp
+ mci3330_c6
], mm0
26140 movq
[esp
+ mci3330_c12
], mm1
26141 movd mm3
, [ebp
+ mci3330_tabscale
]
26143 movq
[esp
+ mci3330_tsc
], mm3
26145 mov eax
, [ebp
+ mci3330_shift
] ;
# eax = pointer into shift[]
26146 mov ebx
, [eax
] ;
# ebx=shift[n]
26147 add dword ptr
[ebp
+ mci3330_shift
], 4 ;
# advance pointer one step
26149 lea ebx
, [ebx
+ ebx
*2] ;
# ebx=3*is
26150 mov
[esp
+ mci3330_is3
],ebx ;
# store is3
26152 mov eax
, [ebp
+ mci3330_shiftvec
] ;
# eax = base of shiftvec[]
26154 movq mm5
, [eax
+ ebx
*4] ;
# move shX/shY to mm5 and shZ to mm6.
26155 movd mm6
, [eax
+ ebx
*4 + 8]
26159 punpckldq mm0
,mm0 ;
# also expand shX,Y,Z in mm0--mm2.
26163 mov ecx
, [ebp
+ mci3330_iinr
] ;
# ecx = pointer into iinr[]
26164 add dword ptr
[ebp
+ mci3330_iinr
], 4 ;
# advance pointer
26165 mov ebx
, [ecx
] ;
# ebx=ii
26167 lea ebx
, [ebx
+ ebx
*2] ;
# ebx = 3*ii=ii3
26168 mov eax
, [ebp
+ mci3330_pos
] ;
# eax = base of pos[]
26170 pfadd mm5
, [eax
+ ebx
*4] ;
# ix = shX + posX (and iy too)
26171 movd mm7
, [eax
+ ebx
*4 + 8] ;
# cant use direct memory add for 4 bytes (iz)
26172 mov
[esp
+ mci3330_ii3
], ebx ;
# (use mm7 as temp. storage for iz.)
26174 movq
[esp
+ mci3330_ixO
], mm5
26175 movq
[esp
+ mci3330_izO
], mm6
26177 movd mm3
, [eax
+ ebx
*4 + 12]
26178 movd mm4
, [eax
+ ebx
*4 + 16]
26179 movd mm5
, [eax
+ ebx
*4 + 20]
26180 punpckldq mm3
, [eax
+ ebx
*4 + 24]
26181 punpckldq mm4
, [eax
+ ebx
*4 + 28]
26182 punpckldq mm5
, [eax
+ ebx
*4 + 32] ;
# coords of H1 in low mm3-mm5, H2 in high
26187 movq
[esp
+ mci3330_ixH
], mm0
26188 movq
[esp
+ mci3330_iyH
], mm1
26189 movq
[esp
+ mci3330_izH
], mm2
26191 ;
# clear vctot and vnbtot
26193 movq
[esp
+ mci3330_vctot
], mm7
26194 movq
[esp
+ mci3330_vnbtot
], mm7
26196 mov eax
, [ebp
+ mci3330_jindex
]
26197 mov ecx
, [eax
] ;
# jindex[n]
26198 mov edx
, [eax
+ 4] ;
# jindex[n+1]
26199 add dword ptr
[ebp
+ mci3330_jindex
], 4
26200 sub edx
, ecx ;
# number of innerloop atoms
26201 mov
[esp
+ mci3330_innerk
], edx
26203 mov esi
, [ebp
+ mci3330_pos
]
26204 mov eax
, [ebp
+ mci3330_jjnr
]
26207 mov
[esp
+ mci3330_innerjjnr
], eax ;
# pointer to jjnr[nj0]
26208 .mci3330_inner_loop:
26209 ;
# a single j particle iteration here - compare with the unrolled code for comments.
26210 mov eax
, [esp
+ mci3330_innerjjnr
]
26211 mov eax
, [eax
] ;
# eax=jnr offset
26212 add dword ptr
[esp
+ mci3330_innerjjnr
], 4 ;
# advance pointer
26214 lea eax
, [eax
+ eax
*2]
26216 movq mm0
, [esi
+ eax
*4]
26217 movd mm1
, [esi
+ eax
*4 + 8]
26218 ;
# copy & expand to mm2-mm4 for the H interactions
26226 pfsubr mm0
, [esp
+ mci3330_ixO
]
26227 pfsubr mm1
, [esp
+ mci3330_izO
]
26232 pfadd mm0
, mm1 ;
# mm0=rsqO
26236 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
26237 pfsubr mm2
, [esp
+ mci3330_ixH
]
26238 pfsubr mm3
, [esp
+ mci3330_iyH
]
26239 pfsubr mm4
, [esp
+ mci3330_izH
] ;
# mm2-mm4 is dxH-dzH
26246 pfadd mm3
,mm4 ;
# mm3=rsqH
26247 movq
[esp
+ mci3330_tmprsqH
], mm3
26254 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
26255 pfmul mm0
, mm1 ;
# mm0=r
26257 pfmul mm0
, [esp
+ mci3330_tsc
]
26259 movd
[esp
+ mci3330_n1
], mm4
26261 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
26263 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
26266 mov edx
, [ebp
+ mci3330_VFtab
]
26267 mov ecx
, [esp
+ mci3330_n1
]
26268 lea ecx
, [ecx
+ ecx
*2]
26271 ;
# load all values we need
26272 movd mm4
, [edx
+ ecx
*4]
26273 movd mm5
, [edx
+ ecx
*4 + 4]
26274 movd mm6
, [edx
+ ecx
*4 + 8]
26275 movd mm7
, [edx
+ ecx
*4 + 12]
26277 pfmul mm6
, mm0 ;
# mm6 = Geps
26278 pfmul mm7
, mm2 ;
# mm7 = Heps2
26281 pfadd mm5
, mm7 ;
# mm5 = Fp
26283 pfmul mm5
, mm0 ;
# mm5=eps*Fp
26284 pfadd mm5
, mm4 ;
# mm5= VV
26286 pfmul mm5
, [esp
+ mci3330_qqOO
] ;
# vcoul=qq*VV
26287 ;
# update vctot directly
26288 pfadd mm5
, [esp
+ mci3330_vctot
]
26289 movq
[esp
+ mci3330_vctot
], mm5
26291 ;
# dispersion table
26292 ;
# load all the table values we need
26293 movd mm4
, [edx
+ ecx
*4 + 16]
26294 movd mm5
, [edx
+ ecx
*4 + 20]
26295 movd mm6
, [edx
+ ecx
*4 + 24]
26296 movd mm7
, [edx
+ ecx
*4 + 28]
26297 pfmul mm6
, mm0 ;
# mm6 = Geps
26298 pfmul mm7
, mm2 ;
# mm7 = Heps2
26300 pfadd mm5
, mm7 ;
# mm5 = Fp
26301 pfmul mm5
, mm0 ;
# mm5=eps*Fp
26302 pfadd mm5
, mm4 ;
# mm5= VV
26304 movq mm4
, [esp
+ mci3330_c6
]
26305 pfmul mm5
, mm4 ;
# vnb6
26306 ;
# update vnbtot to release mm5!
26307 pfadd mm5
, [esp
+ mci3330_vnbtot
] ;
# add the earlier value
26308 movq
[esp
+ mci3330_vnbtot
], mm5 ;
# store the sum
26311 ;
# load all the table values we need
26312 movd mm4
, [edx
+ ecx
*4 + 32]
26313 movd mm5
, [edx
+ ecx
*4 + 36]
26314 movd mm6
, [edx
+ ecx
*4 + 40]
26315 movd mm7
, [edx
+ ecx
*4 + 44]
26317 pfmul mm6
, mm0 ;
# mm6 = Geps
26318 pfmul mm7
, mm2 ;
# mm7 = Heps2
26320 pfadd mm5
, mm7 ;
# mm5 = Fp
26321 pfmul mm5
, mm0 ;
# mm5=eps*Fp
26322 pfadd mm5
, mm4 ;
# mm5= VV
26324 movq mm6
, [esp
+ mci3330_c12
]
26325 pfmul mm5
, mm6 ;
# vnb12
26326 ;
# add vnb12 to vnbtot
26328 pfadd mm5
, [esp
+ mci3330_vnbtot
] ;
# add the earlier value
26329 movq
[esp
+ mci3330_vnbtot
], mm5 ;
# store the sum
26331 ;
# Ready with the oxygen - time for hydrogens
26333 movq mm0
, [esp
+ mci3330_tmprsqH
]
26339 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
26344 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
26346 pfmul mm0
,mm1 ;
# mm0=r
26347 pfmul mm0
, [esp
+ mci3330_tsc
]
26349 movq
[esp
+ mci3330_n1
], mm4
26351 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
26353 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
26356 mov edx
, [ebp
+ mci3330_VFtab
]
26357 mov ecx
, [esp
+ mci3330_n1
]
26358 lea ecx
, [ecx
+ ecx
*2]
26360 ;
# load all values we need
26361 movd mm4
, [edx
+ ecx
*4]
26362 movd mm5
, [edx
+ ecx
*4 + 4]
26363 movd mm6
, [edx
+ ecx
*4 + 8]
26364 movd mm7
, [edx
+ ecx
*4 + 12]
26365 mov ecx
, [esp
+ mci3330_n1
+ 4]
26366 lea ecx
, [ecx
+ ecx
*2]
26368 punpckldq mm4
, [edx
+ ecx
*4]
26369 punpckldq mm5
, [edx
+ ecx
*4 + 4]
26370 punpckldq mm6
, [edx
+ ecx
*4 + 8]
26371 punpckldq mm7
, [edx
+ ecx
*4 + 12]
26373 pfmul mm6
, mm0 ;
# mm6 = Geps
26374 pfmul mm7
, mm2 ;
# mm7 = Heps2
26377 pfadd mm5
, mm7 ;
# mm5 = Fp
26379 pfmul mm5
, mm0 ;
# mm5=eps*Fp
26380 pfadd mm5
, mm4 ;
# mm5= VV
26382 pfmul mm5
, [esp
+ mci3330_qqOH
] ;
# vcoul=qq*VV
26384 pfadd mm5
, [esp
+ mci3330_vctot
]
26385 movq
[esp
+ mci3330_vctot
], mm5
26387 ;
# interactions with j H1
26388 movq mm0
, [esi
+ eax
*4 + 12]
26389 movd mm1
, [esi
+ eax
*4 + 20]
26390 ;
# copy & expand to mm2-mm4 for the H interactions
26398 pfsubr mm0
, [esp
+ mci3330_ixO
]
26399 pfsubr mm1
, [esp
+ mci3330_izO
]
26404 pfadd mm0
, mm1 ;
# mm0=rsqO
26408 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
26409 pfsubr mm2
, [esp
+ mci3330_ixH
]
26410 pfsubr mm3
, [esp
+ mci3330_iyH
]
26411 pfsubr mm4
, [esp
+ mci3330_izH
] ;
# mm2-mm4 is dxH-dzH
26418 pfadd mm3
,mm4 ;
# mm3=rsqH
26419 movq
[esp
+ mci3330_tmprsqH
], mm3
26426 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
26427 pfmul mm0
, mm1 ;
# mm0=r
26429 pfmul mm0
, [esp
+ mci3330_tsc
]
26431 movd
[esp
+ mci3330_n1
], mm4
26433 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
26435 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
26438 mov edx
, [ebp
+ mci3330_VFtab
]
26439 mov ecx
, [esp
+ mci3330_n1
]
26440 lea ecx
, [ecx
+ ecx
*2]
26443 ;
# load all values we need
26444 movd mm4
, [edx
+ ecx
*4]
26445 movd mm5
, [edx
+ ecx
*4 + 4]
26446 movd mm6
, [edx
+ ecx
*4 + 8]
26447 movd mm7
, [edx
+ ecx
*4 + 12]
26449 pfmul mm6
, mm0 ;
# mm6 = Geps
26450 pfmul mm7
, mm2 ;
# mm7 = Heps2
26453 pfadd mm5
, mm7 ;
# mm5 = Fp
26455 pfmul mm5
, mm0 ;
# mm5=eps*Fp
26456 pfadd mm5
, mm4 ;
# mm5= VV
26458 pfmul mm5
, [esp
+ mci3330_qqOH
] ;
# vcoul=qq*VV
26459 ;
# update vctot directly
26460 pfadd mm5
, [esp
+ mci3330_vctot
]
26461 movq
[esp
+ mci3330_vctot
], mm5
26463 movq mm0
, [esp
+ mci3330_tmprsqH
]
26468 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
26473 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
26475 pfmul mm0
,mm1 ;
# mm0=r
26476 pfmul mm0
, [esp
+ mci3330_tsc
]
26478 movq
[esp
+ mci3330_n1
], mm4
26480 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
26482 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
26485 mov edx
, [ebp
+ mci3330_VFtab
]
26486 mov ecx
, [esp
+ mci3330_n1
]
26487 lea ecx
, [ecx
+ ecx
*2]
26489 ;
# load all values we need
26490 movd mm4
, [edx
+ ecx
*4]
26491 movd mm5
, [edx
+ ecx
*4 + 4]
26492 movd mm6
, [edx
+ ecx
*4 + 8]
26493 movd mm7
, [edx
+ ecx
*4 + 12]
26494 mov ecx
, [esp
+ mci3330_n1
+ 4]
26495 lea ecx
, [ecx
+ ecx
*2]
26497 punpckldq mm4
, [edx
+ ecx
*4]
26498 punpckldq mm5
, [edx
+ ecx
*4 + 4]
26499 punpckldq mm6
, [edx
+ ecx
*4 + 8]
26500 punpckldq mm7
, [edx
+ ecx
*4 + 12]
26503 pfmul mm6
, mm0 ;
# mm6 = Geps
26504 pfmul mm7
, mm2 ;
# mm7 = Heps2
26507 pfadd mm5
, mm7 ;
# mm5 = Fp
26509 pfmul mm5
, mm0 ;
# mm5=eps*Fp
26510 pfadd mm5
, mm4 ;
# mm5= VV
26512 pfmul mm5
, [esp
+ mci3330_qqHH
] ;
# vcoul=qq*VV
26514 pfadd mm5
, [esp
+ mci3330_vctot
]
26515 movq
[esp
+ mci3330_vctot
], mm5
26517 ;
# interactions with j H2
26518 movq mm0
, [esi
+ eax
*4 + 24]
26519 movd mm1
, [esi
+ eax
*4 + 32]
26520 ;
# copy & expand to mm2-mm4 for the H interactions
26528 pfsubr mm0
, [esp
+ mci3330_ixO
]
26529 pfsubr mm1
, [esp
+ mci3330_izO
]
26534 pfadd mm0
, mm1 ;
# mm0=rsqO
26538 punpckldq mm4
, mm4 ;
# mm2-mm4 is jx-jz
26539 pfsubr mm2
, [esp
+ mci3330_ixH
]
26540 pfsubr mm3
, [esp
+ mci3330_iyH
]
26541 pfsubr mm4
, [esp
+ mci3330_izH
] ;
# mm2-mm4 is dxH-dzH
26548 pfadd mm3
,mm4 ;
# mm3=rsqH
26549 movq
[esp
+ mci3330_tmprsqH
], mm3
26556 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
26559 pfmul mm0
, [esp
+ mci3330_tsc
]
26561 movd
[esp
+ mci3330_n1
], mm4
26563 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
26565 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
26568 mov edx
, [ebp
+ mci3330_VFtab
]
26569 mov ecx
, [esp
+ mci3330_n1
]
26570 lea ecx
, [ecx
+ ecx
*2]
26573 ;
# load all values we need
26574 movd mm4
, [edx
+ ecx
*4]
26575 movd mm5
, [edx
+ ecx
*4 + 4]
26576 movd mm6
, [edx
+ ecx
*4 + 8]
26577 movd mm7
, [edx
+ ecx
*4 + 12]
26579 pfmul mm6
, mm0 ;
# mm6 = Geps
26580 pfmul mm7
, mm2 ;
# mm7 = Heps2
26583 pfadd mm5
, mm7 ;
# mm5 = Fp
26585 pfmul mm5
, mm0 ;
# mm5=eps*Fp
26586 pfadd mm5
, mm4 ;
# mm5= VV
26588 pfmul mm5
, [esp
+ mci3330_qqOH
] ;
# vcoul=qq*VV
26589 ;
# update vctot directly
26590 pfadd mm5
, [esp
+ mci3330_vctot
]
26591 movq
[esp
+ mci3330_vctot
], mm5
26593 movq mm0
, [esp
+ mci3330_tmprsqH
]
26598 punpckldq mm1
,mm2 ;
# seeds are in mm1 now, and rsq in mm0.
26603 pfrcpit2 mm1
,mm2 ;
# mm1=invsqrt
26605 pfmul mm0
,mm1 ;
# mm0=r
26606 pfmul mm0
, [esp
+ mci3330_tsc
]
26608 movq
[esp
+ mci3330_n1
], mm4
26610 pfsub mm0
, mm4 ;
# now mm0 is eps and mm4 n0
26612 pfmul mm2
, mm2 ;
# mm0 is eps, mm2 eps2
26615 mov edx
, [ebp
+ mci3330_VFtab
]
26616 mov ecx
, [esp
+ mci3330_n1
]
26617 lea ecx
, [ecx
+ ecx
*2]
26619 ;
# load all values we need
26620 movd mm4
, [edx
+ ecx
*4]
26621 movd mm5
, [edx
+ ecx
*4 + 4]
26622 movd mm6
, [edx
+ ecx
*4 + 8]
26623 movd mm7
, [edx
+ ecx
*4 + 12]
26624 mov ecx
, [esp
+ mci3330_n1
+ 4] ;
# ecx = high dword stored at n1 (second table index)
26625 lea ecx
, [ecx
+ ecx
*2]
26627 punpckldq mm4
, [edx
+ ecx
*4]
26628 punpckldq mm5
, [edx
+ ecx
*4 + 4]
26629 punpckldq mm6
, [edx
+ ecx
*4 + 8]
26630 punpckldq mm7
, [edx
+ ecx
*4 + 12]
26633 pfmul mm6
, mm0 ;
# mm6 = Geps
26634 pfmul mm7
, mm2 ;
# mm7 = Heps2
26637 pfadd mm5
, mm7 ;
# mm5 = Fp
26639 pfmul mm5
, mm0 ;
# mm5=eps*Fp
26640 pfadd mm5
, mm4 ;
# mm5= VV
26642 pfmul mm5
, [esp
+ mci3330_qqHH
] ;
# vcoul=qq*VV
26644 pfadd mm5
, [esp
+ mci3330_vctot
]
26645 movq
[esp
+ mci3330_vctot
], mm5
26647 ;
# done - one more?
26648 dec dword ptr
[esp
+ mci3330_innerk
]
26649 jz
.mci3330_updateouterdata
26650 jmp
.mci3330_inner_loop
26651 .mci3330_updateouterdata:
26652 mov edx
, [ebp
+ mci3330_gid
] ;
# get group index for this i particle
26654 add dword ptr
[ebp
+ mci3330_gid
], 4 ;
# advance pointer
26656 movq mm7
, [esp
+ mci3330_vctot
]
26657 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
26659 mov eax
, [ebp
+ mci3330_Vc
]
26660 movd mm6
, [eax
+ edx
*4]
26662 movd
[eax
+ edx
*4], mm6 ;
# increment vc[gid]
26664 movq mm7
, [esp
+ mci3330_vnbtot
]
26665 pfacc mm7
,mm7 ;
# get and sum the two parts of total potential
26667 mov eax
, [ebp
+ mci3330_Vnb
]
26668 movd mm6
, [eax
+ edx
*4]
26670 movd
[eax
+ edx
*4], mm6 ;
# increment vnbtot[gid]
26672 dec dword ptr
[ebp
+ mci3330_nri
]
26674 ;
# not last, iterate once more!