Added conditional inclusion of config.h to source files
[gromacs.git] / src / gmxlib / x86_3dnow.s
blob7861c69c88687eabb84d096b1eb1864e78f8746d
1 ;#
2 ;# $Id$
3 ;#
4 ;# This source code is part of
5 ;#
6 ;# G R O M A C S
7 ;#
8 ;# GROningen MAchine for Chemical Simulations
9 ;#
10 ;# VERSION 3.1
11 ;# Copyright (c) 1991-2001, University of Groningen, The Netherlands
12 ;# This program is free software; you can redistribute it and/or
13 ;# modify it under the terms of the GNU General Public License
14 ;# as published by the Free Software Foundation; either version 2
15 ;# of the License, or (at your option) any later version.
16 ;#
17 ;# If you want to redistribute modifications, please consider that
18 ;# scientific software is very special. Version control is crucial -
19 ;# bugs must be traceable. We will be happy to consider code for
20 ;# inclusion in the official distribution, but derived work must not
21 ;# be called official GROMACS. Details are found in the README & COPYING
22 ;# files - if they are missing, get the official version at www.gromacs.org.
23 ;#
24 ;# To help us fund GROMACS development, we humbly ask that you cite
25 ;# the papers on the package - you can find them in the top README file.
26 ;#
27 ;# For more info, check our website at http://www.gromacs.org
28 ;#
29 ;# And Hey:
30 ;# Gnomes, ROck Monsters And Chili Sauce
33 ;# This file contains a subset of the gromacs innerloops
34 ;# manually written in assembly to optimize performance
35 ;# on AMD extended 3DNow-enabled processors like Athlon
36 ;# and later generations.
37 ;# Erik Lindahl, 2000-2001, erik@theophys.kth.se
40 ;# These files require GNU binutils 2.10 or later, since we
41 ;# use intel syntax for portability, or a recent version
42 ;# of NASM that understands Extended 3DNow and SSE2 instructions.
43 ;# (NASM is normally only used with MS Visual C++).
45 ;# Since NASM and gnu as disagree on some definitions and use
46 ;# completely different preprocessing options I have to introduce a
47 ;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
48 ;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
49 ;# reason why all comments need both symbols...
50 ;# The source is written for GNU as, with intel syntax. When you use
51 ;# NASM we redefine a couple of things. The false if-statement around
52 ;# the following code is seen by GNU as (NASM doesn't understand this
53 ;# if syntax), but NASM doesn't see it, so the code inside is only
54 ;# read by NASM (NASM doesn't understand .if):
;# Compatibility shim: maps the GNU as directives used in this file onto
;# their NASM spellings. GNU as skips everything between the '.if 0' and
;# '.endif' lines below; NASM treats those two lines as comments and so
;# processes the %define/%macro lines in between.
56 ; .if 0 # block below only read by NASM
57 %define .section section
58 %define .long dd
59 %define .align align
60 %define .globl global
61 ;# NASM wants 'dword' only, not 'dword ptr'.
62 %define ptr
;# '.equiv name, value' is rewritten to 'name equ value' for NASM.
63 %macro .equiv 2
64 %1 equ %2
65 %endmacro
66 ; .endif # End of NASM-specific block
;# GNU as treats the leading ';' as a statement separator and executes the
;# directive that follows; for NASM the whole line is a comment.
68 ; .intel_syntax noprefix # Line only read by gnu as
71 .section .text
;# Floating-point constants. Each one is stored as two identical 32-bit
;# words so that a single 64-bit movq load places the same value in both
;# halves of an MMX register. The words are IEEE-754 single-precision
;# bit patterns.
73 mm_two:
74 .long 0x40000000 ;# 2.0
75 .long 0x40000000
76 mm_six:
77 .long 0x40c00000 ;# 6.0
78 .long 0x40c00000
79 mm_twelve:
80 .long 0x41400000 ;# 12.0
81 .long 0x41400000
;# alignment directive placed ahead of the code that follows
83 .align 4
;# check3dnow: executes a single Extended 3DNow instruction (pswapd),
;# bracketed by femms, and returns. It reads no arguments and writes no
;# result register. Exported both with and without a leading underscore.
;# NOTE(review): presumably the caller detects an unsupported CPU by
;# catching the illegal-instruction fault - confirm against the call site.
85 .globl check3dnow ;# try to issue an Extended 3DNow instruction
86 .globl _check3dnow
87 check3dnow:
88 _check3dnow:
89 femms
90 pswapd mm0,mm0 ;# the probe instruction
91 femms
92 ret
95 .globl vecrecip_3dnow
96 .globl _vecrecip_3dnow
;# vecrecip_3dnow: element-wise reciprocal of a vector of 32-bit floats.
;# Stack arguments (read relative to ebp after the prologue):
;#   [ebp + 8]   pointer the input values are loaded from
;#   [ebp + 12]  pointer the results are stored to
;#   [ebp + 16]  number of elements
;# Four elements are handled per main-loop iteration; the remaining
;# (count & 3) elements are handled one at a time in the tail loop.
;# eax, ebx, ecx and edx are saved and restored. The MMX registers
;# mm0-mm2 and mm4-mm6 are overwritten, and emms is issued before returning.
97 vecrecip_3dnow:
98 _vecrecip_3dnow:
99 push ebp
100 mov ebp,esp
101 push eax
102 push ebx
103 push ecx
104 push edx
106 mov eax, [ebp + 8] ;# eax = input pointer
107 mov ebx, [ebp + 12] ;# ebx = output pointer
108 mov ecx, [ebp + 16] ;# ecx = element count
109 mov edx, ecx ;# keep the full count for the tail
110 shr ecx, 2 ;# ecx = number of 4-element iterations
111 jecxz .vecrecip_tail
112 emms
113 .vecrecip_mainloop:
114 movq mm0,[eax]
115 add eax, 8
116 pfrcp mm1,mm0 ;# seed for the low element
117 movq mm4,[eax]
118 pswapd mm0,mm0 ;# bring the high element down for its own seed
119 add eax, 8
120 pfrcp mm2,mm0
121 pswapd mm0,mm0 ;# swap back to the original order
122 pfrcp mm5,mm4
123 pswapd mm4,mm4
124 punpckldq mm1,mm2 ;# mm1 = both seeds for the first pair
125 pfrcp mm6,mm4
126 pswapd mm4,mm4
127 pfrcpit1 mm0,mm1 ;# refinement steps on the seeds
128 punpckldq mm5,mm6 ;# mm5 = both seeds for the second pair
129 pfrcpit2 mm0,mm1
130 movq [ebx],mm0
131 pfrcpit1 mm4,mm5
132 add ebx, 8
133 pfrcpit2 mm4,mm5
134 movq [ebx],mm4
135 add ebx, 8
136 dec ecx
137 jecxz .vecrecip_tail
138 jmp short .vecrecip_mainloop
139 .vecrecip_tail:
140 mov ecx,edx
141 and ecx,3 ;# elements left over after the 4-wide loop
142 jecxz .vecrecip_end
143 .vecrecip_tailloop:
144 movd mm0,[eax]
145 add eax, 4
146 pfrcp mm1,mm0
147 pfrcpit1 mm0,mm1
148 pfrcpit2 mm0,mm1
149 movd [ebx],mm0
150 add ebx, 4
151 dec ecx
152 jecxz .vecrecip_end
153 jmp short .vecrecip_tailloop
154 .vecrecip_end:
155 emms
156 pop edx
157 pop ecx
158 pop ebx
159 pop eax
160 leave
161 ret ;# without this, execution runs on into the next function
164 .globl vecinvsqrt_3dnow
165 .globl _vecinvsqrt_3dnow
;# vecinvsqrt_3dnow: element-wise reciprocal square root of a vector of
;# 32-bit floats.
;# Stack arguments (read relative to ebp after the prologue):
;#   [ebp + 8]   pointer the input values are loaded from
;#   [ebp + 12]  pointer the results are stored to
;#   [ebp + 16]  number of elements
;# Four elements are handled per main-loop iteration; the remaining
;# (count & 3) elements are handled one at a time in the tail loop.
;# eax, ebx, ecx and edx are saved and restored. The MMX registers
;# mm0-mm7 are overwritten, and emms is issued before returning.
166 vecinvsqrt_3dnow:
167 _vecinvsqrt_3dnow:
168 push ebp
169 mov ebp,esp
170 push eax
171 push ebx
172 push ecx
173 push edx
175 mov eax, [ebp + 8] ;# eax = input pointer
176 mov ebx, [ebp + 12] ;# ebx = output pointer
177 mov ecx, [ebp + 16] ;# ecx = element count
178 mov edx, ecx ;# keep the full count for the tail
179 shr ecx, 2 ;# ecx = number of 4-element iterations
180 jecxz .vecinvsqrt_tail
181 emms
182 .vecinvsqrt_mainloop:
183 movq mm0,[eax]
184 add eax, 8
185 pfrsqrt mm1,mm0 ;# seed for the low element
186 movq mm4,[eax]
187 pswapd mm0,mm0 ;# bring the high element down for its own seed
188 add eax, 8
189 pfrsqrt mm2,mm0
190 pswapd mm0,mm0 ;# swap back to the original order
191 pfrsqrt mm5,mm4
192 pswapd mm4,mm4
193 punpckldq mm1,mm2 ;# mm1 = both seeds for the first pair
194 pfrsqrt mm6,mm4
195 movq mm3,mm1 ;# keep the seeds; mm1 is squared next
196 pswapd mm4,mm4
197 pfmul mm1,mm1 ;# seed*seed feeds pfrsqit1
198 punpckldq mm5,mm6 ;# mm5 = both seeds for the second pair
199 pfrsqit1 mm1,mm0
200 movq mm7,mm5
201 pfrcpit2 mm1,mm3
202 pfmul mm5,mm5
203 movq [ebx],mm1
204 pfrsqit1 mm5,mm4
205 add ebx, 8
206 pfrcpit2 mm5,mm7
207 movq [ebx],mm5
208 add ebx, 8
209 dec ecx
210 jecxz .vecinvsqrt_tail
211 jmp short .vecinvsqrt_mainloop
212 .vecinvsqrt_tail:
213 mov ecx,edx
214 and ecx,3 ;# elements left over after the 4-wide loop
215 jecxz .vecinvsqrt_end
216 .vecinvsqrt_tailloop:
217 movd mm0,[eax]
218 add eax, 4
219 pfrsqrt mm1,mm0
220 movq mm2,mm1
221 pfmul mm1,mm1
222 pfrsqit1 mm1,mm0
223 pfrcpit2 mm1,mm2
224 movd [ebx],mm1
225 add ebx, 4
226 dec ecx
227 jecxz .vecinvsqrt_end
228 jmp short .vecinvsqrt_tailloop
229 .vecinvsqrt_end:
230 emms
231 pop edx
232 pop ecx
233 pop ebx
234 pop eax
235 leave
236 ret ;# without this, execution runs on into the next function
239 .globl inl0100_3dnow
240 .globl _inl0100_3dnow
;# inl0100_3dnow: nonbonded inner loop that uses only the c6/c12 pair
;# parameters read from nbfp.
;# For each of the nri outer entries n it forms the i position as
;# shiftvec[3*shift[n]] + pos[3*iinr[n]], then for every j atom in
;# jjnr[jindex[n]] .. jjnr[jindex[n+1]-1] computes, with rinvsq = 1/r^2
;# and c6/c12 taken from nbfp at 2*ntype*type[i] + 2*type[j]:
;#   vnb   = c12*rinvsq^6 - c6*rinvsq^3
;#   fscal = (12*c12*rinvsq^6 - 6*c6*rinvsq^3) * rinvsq
;# fscal*(ri - rj) is added to the i force and subtracted from faction[j].
;# After the j loop the i force is added to faction[i] and to
;# fshift[3*shift[n]], and the summed vnb is added to Vnb[gid[n]].
;# j atoms are processed two at a time, followed by a single-atom pass when
;# the count is odd.
;# Arguments are read from the stack at the ebp offsets defined below; the
;# pointer arguments shift, iinr, jindex and gid, and the counter nri, are
;# advanced/decremented in place in the caller's argument slots.
;# eax, ebx, ecx, edx, esi and edi are saved and restored; mm0-mm7 are
;# overwritten and femms is issued on entry and exit.
;# nri must be at least 1: the first outer iteration runs unconditionally.
241 inl0100_3dnow:
242 _inl0100_3dnow:
;# stack offsets (relative to ebp) of the arguments
243 .equiv i0100_nri, 8
244 .equiv i0100_iinr, 12
245 .equiv i0100_jindex, 16
246 .equiv i0100_jjnr, 20
247 .equiv i0100_shift, 24
248 .equiv i0100_shiftvec, 28
249 .equiv i0100_fshift, 32
250 .equiv i0100_gid, 36
251 .equiv i0100_pos, 40
252 .equiv i0100_faction, 44
253 .equiv i0100_type, 48
254 .equiv i0100_ntype, 52
255 .equiv i0100_nbfp, 56
256 .equiv i0100_Vnb, 60
257 ;# stack offsets for local variables
258 .equiv i0100_is3, 0
259 .equiv i0100_ii3, 4
260 .equiv i0100_ix, 8
261 .equiv i0100_iy, 12
262 .equiv i0100_iz, 16
263 .equiv i0100_vnbtot, 20
264 .equiv i0100_c6, 28
265 .equiv i0100_c12, 36
266 .equiv i0100_six, 44
267 .equiv i0100_twelve, 52
268 .equiv i0100_ntia, 60
269 .equiv i0100_innerjjnr, 64
270 .equiv i0100_innerk, 68
271 .equiv i0100_fix, 72
272 .equiv i0100_fiy, 76
273 .equiv i0100_fiz, 80
274 .equiv i0100_dx1, 84
275 .equiv i0100_dy1, 88
276 .equiv i0100_dz1, 92
277 .equiv i0100_dx2, 96
278 .equiv i0100_dy2, 100
279 .equiv i0100_dz2, 104
280 push ebp
281 mov ebp,esp
282 push eax
283 push ebx
284 push ecx
285 push edx
286 push esi
287 push edi
288 sub esp, 108 ;# local stack space
289 femms
290 ;# move data to local stack
291 movq mm0, [mm_six]
292 movq mm1, [mm_twelve]
293 movq [esp + i0100_six ], mm0
294 movq [esp + i0100_twelve ], mm1
295 ;# assume we have at least one i particle - start directly
296 .i0100_outer:
297 mov eax, [ebp + i0100_shift] ;# eax = pointer into shift[]
298 mov ebx, [eax] ;# ebx=shift[n]
299 add dword ptr [ebp + i0100_shift], 4 ;# advance pointer one step
301 lea ebx, [ebx + ebx*2] ;# ebx=3*is
302 mov [esp + i0100_is3],ebx ;# store is3
304 mov eax, [ebp + i0100_shiftvec] ;# eax = base of shiftvec[]
306 movq mm0, [eax + ebx*4] ;# move shX/shY to mm0 and shZ to mm1.
307 movd mm1, [eax + ebx*4 + 8]
309 mov ecx, [ebp + i0100_iinr] ;# ecx = pointer into iinr[]
310 add dword ptr [ebp + i0100_iinr], 4 ;# advance pointer
311 mov ebx, [ecx] ;# ebx =ii
313 mov edx, [ebp + i0100_type]
314 mov edx, [edx + ebx*4]
315 imul edx, [ebp + i0100_ntype]
316 shl edx, 1
317 mov [esp + i0100_ntia], edx ;# ntia = 2*ntype*type[ii]
319 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
320 mov eax, [ebp + i0100_pos] ;# eax = base of pos[]
322 pfadd mm0, [eax + ebx*4] ;# ix = shX + posX (and iy too)
323 movd mm3, [eax + ebx*4 + 8] ;# cant use direct memory add for 4 bytes (iz)
324 mov [esp + i0100_ii3], ebx
325 pfadd mm1, mm3
326 movq [esp + i0100_ix], mm0
327 movd [esp + i0100_iz], mm1
329 ;# clear total potential and i forces
330 pxor mm7,mm7
331 movq [esp + i0100_vnbtot], mm7
332 movq [esp + i0100_fix], mm7
333 movd [esp + i0100_fiz], mm7
335 mov eax, [ebp + i0100_jindex]
336 mov ecx, [eax] ;# jindex[n]
337 mov edx, [eax + 4] ;# jindex[n+1]
338 add dword ptr [ebp + i0100_jindex], 4
339 sub edx, ecx ;# number of innerloop atoms
341 mov esi, [ebp + i0100_pos]
342 mov edi, [ebp + i0100_faction]
343 mov eax, [ebp + i0100_jjnr]
344 shl ecx, 2
345 add eax, ecx
346 mov [esp + i0100_innerjjnr], eax ;# pointer to jjnr[nj0]
347 sub edx, 2
348 mov [esp + i0100_innerk], edx ;# number of innerloop atoms
349 jge .i0100_unroll_loop
350 jmp .i0100_finish_inner
351 .i0100_unroll_loop:
352 ;# paired innerloop starts here
353 mov ecx, [esp + i0100_innerjjnr] ;# pointer to jjnr[k]
354 mov eax, [ecx]
355 mov ebx, [ecx + 4] ;# eax/ebx=jnr
356 add dword ptr [esp + i0100_innerjjnr], 8 ;# advance pointer (unrolled 2)
357 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
359 mov ecx, [ebp + i0100_type]
360 mov edx, [ecx + eax*4] ;# type [jnr1]
361 mov ecx, [ecx + ebx*4] ;# type [jnr2]
363 mov esi, [ebp + i0100_nbfp] ;# base of nbfp
364 shl edx, 1
365 shl ecx, 1
366 add edx, [esp + i0100_ntia] ;# tja = ntia + 2*type
367 add ecx, [esp + i0100_ntia]
369 movq mm5, [esi + edx*4] ;# mm5 = 1st c6 / c12
370 movq mm7, [esi + ecx*4] ;# mm7 = 2nd c6 / c12
371 movq mm6,mm5
372 punpckldq mm5,mm7 ;# mm5 = 1st c6 / 2nd c6
373 punpckhdq mm6,mm7 ;# mm6 = 1st c12 / 2nd c12
374 movq [esp + i0100_c6], mm5
375 movq [esp + i0100_c12], mm6
377 lea eax, [eax + eax*2] ;# replace jnr with j3
378 lea ebx, [ebx + ebx*2]
380 mov esi, [ebp + i0100_pos]
382 movq mm0, [esp + i0100_ix]
383 movd mm1, [esp + i0100_iz]
384 movq mm4, [esi + eax*4] ;# fetch first j coordinates
385 movd mm5, [esi + eax*4 + 8]
386 pfsubr mm4,mm0 ;# dr = ir - jr
387 pfsubr mm5,mm1
388 movq [esp + i0100_dx1], mm4 ;# store dr
389 movd [esp + i0100_dz1], mm5
390 pfmul mm4,mm4 ;# square dx,dy,dz
391 pfmul mm5,mm5
392 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
393 pfacc mm4, mm5 ;# first rsq in lower mm4
395 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
396 movd mm7, [esi + ebx*4 + 8]
398 pfsubr mm6,mm0 ;# dr = ir - jr
399 pfsubr mm7,mm1
400 movq [esp + i0100_dx2], mm6 ;# store dr
401 movd [esp + i0100_dz2], mm7
402 pfmul mm6,mm6 ;# square dx,dy,dz
403 pfmul mm7,mm7
404 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
405 pfacc mm6, mm7 ;# second rsq in lower mm6
407 pfrcp mm0, mm4 ;# lookup reciprocal seed
408 pfrcp mm1, mm6
410 punpckldq mm0,mm1
411 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs.
412 ;# amd 3dnow N-R iteration to get full precision.
413 pfrcpit1 mm4,mm0
414 pfrcpit2 mm4,mm0
415 ;# mm4 now contains invsq,
416 ;# do potential and fscal
418 movq mm0, mm4
419 pfmul mm4, mm0
420 pfmul mm4, mm0 ;# mm4=rinvsix
421 movq mm5, mm4
422 pfmul mm5, mm5 ;# mm5=rinvtwelve
424 pfmul mm5, [esp + i0100_c12]
425 pfmul mm4, [esp + i0100_c6]
426 movq mm6, mm5 ;# mm6 is vnb12-vnb6
427 pfsub mm6, mm4
429 pfmul mm4, [esp + i0100_six]
431 pfmul mm5, [esp + i0100_twelve]
432 pfsub mm5,mm4
433 pfmul mm0, mm5 ;# mm0 is total fscal now
435 prefetchw [esp + i0100_dx1] ;# prefetch i forces to cache
437 ;# spread fscalar to both positions
438 movq mm1,mm0
439 punpckldq mm0,mm0
440 punpckhdq mm1,mm1
442 ;# calc vector force
443 prefetchw [edi + eax*4] ;# prefetch the 1st faction to cache
444 movq mm2, [esp + i0100_dx1] ;# fetch dr
445 movd mm3, [esp + i0100_dz1]
447 ;# update vnbtot
448 pfadd mm6, [esp + i0100_vnbtot] ;# add the earlier value
449 movq [esp + i0100_vnbtot], mm6 ;# store the sum
451 prefetchw [edi + ebx*4] ;# prefetch the 2nd faction to cache
452 pfmul mm2, mm0 ;# mult by fs
453 pfmul mm3, mm0
455 movq mm4, [esp + i0100_dx2] ;# fetch dr
456 movd mm5, [esp + i0100_dz2]
457 pfmul mm4, mm1 ;# mult by fs
458 pfmul mm5, mm1
459 ;# update i forces
461 movq mm0, [esp + i0100_fix]
462 movd mm1, [esp + i0100_fiz]
463 pfadd mm0, mm2
464 pfadd mm1, mm3
466 pfadd mm0, mm4
467 pfadd mm1, mm5
468 movq [esp + i0100_fix], mm0
469 movd [esp + i0100_fiz], mm1
470 ;# update j forces
472 movq mm0, [edi + eax*4]
473 movd mm1, [edi + eax*4 + 8]
474 movq mm6, [edi + ebx*4]
475 movd mm7, [edi + ebx*4 + 8]
477 pfsub mm0, mm2
478 pfsub mm1, mm3
479 pfsub mm6, mm4
480 pfsub mm7, mm5
482 movq [edi + eax*4], mm0
483 movd [edi + eax*4 +8], mm1
484 movq [edi + ebx*4], mm6
485 movd [edi + ebx*4 + 8], mm7
487 ;# should we do one more iteration?
488 sub dword ptr [esp + i0100_innerk], 2
489 jl .i0100_finish_inner
490 jmp .i0100_unroll_loop
491 .i0100_finish_inner:
;# innerk is -2 or -1 here; its low bit says whether one j atom is left
492 and dword ptr [esp + i0100_innerk], 1
493 jnz .i0100_single_inner
494 jmp .i0100_updateouterdata
495 .i0100_single_inner:
496 ;# a single j particle iteration here - compare with the unrolled code for comments
497 mov eax, [esp + i0100_innerjjnr]
498 mov eax, [eax] ;# eax=jnr offset
500 mov esi, [ebp + i0100_nbfp]
501 mov ecx, [ebp + i0100_type]
502 mov edx, [ecx + eax*4] ;# type [jnr1]
503 shl edx, 1
504 add edx, [esp + i0100_ntia] ;# tja = ntia + 2*type
505 movd mm5, [esi + edx*4] ;# mm5 = 1st c6
506 movq [esp + i0100_c6], mm5
507 movd mm5, [esi + edx*4 + 4] ;# mm5 = 1st c12
508 movq [esp + i0100_c12], mm5
510 mov esi, [ebp + i0100_pos]
511 lea eax, [eax + eax*2]
513 movq mm0, [esp + i0100_ix]
514 movd mm1, [esp + i0100_iz]
515 movq mm4, [esi + eax*4]
516 movd mm5, [esi + eax*4 + 8]
517 pfsubr mm4, mm0
518 pfsubr mm5, mm1
519 movq [esp + i0100_dx1], mm4
520 pfmul mm4,mm4
521 movd [esp + i0100_dz1], mm5
522 pfmul mm5,mm5
523 pfacc mm4, mm5
524 pfacc mm4, mm5 ;# mm4=rsq
526 pfrcp mm0,mm4
527 pfrcpit1 mm4,mm0
528 pfrcpit2 mm4,mm0 ;# mm4=invsq
529 ;# calculate potentials and scalar force
530 movq mm0, mm4
532 pfmul mm4, mm0
533 pfmul mm4, mm0 ;# mm4=rinvsix
534 movq mm5, mm4
535 pfmul mm5, mm5 ;# mm5=rinvtwelve
537 pfmul mm5, [esp + i0100_c12]
538 pfmul mm4, [esp + i0100_c6]
539 movq mm6, mm5 ;# mm6 is vnb12-vnb6
540 pfsub mm6, mm4
542 pfmul mm4, [esp + i0100_six]
544 pfmul mm5, [esp + i0100_twelve]
545 pfsub mm5, mm4
546 pfmul mm0, mm5 ;# mm0 is total fscal now
548 ;# update vnbtot
549 pfadd mm6, [esp + i0100_vnbtot] ;# add the earlier value
550 movq [esp + i0100_vnbtot], mm6 ;# store the sum
552 ;# spread fscalar to both positions
553 punpckldq mm0,mm0
554 ;# calc vectorial force
555 prefetchw [edi + eax*4] ;# prefetch faction to cache
556 movq mm2, [esp + i0100_dx1]
557 movd mm3, [esp + i0100_dz1]
559 pfmul mm2, mm0
560 pfmul mm3, mm0
562 ;# update i particle force
563 movq mm0, [esp + i0100_fix]
564 movd mm1, [esp + i0100_fiz]
565 pfadd mm0, mm2
566 pfadd mm1, mm3
567 movq [esp + i0100_fix], mm0
568 movd [esp + i0100_fiz], mm1
569 ;# update j particle force
570 movq mm0, [edi + eax*4]
571 movd mm1, [edi + eax *4+ 8]
572 pfsub mm0, mm2
573 pfsub mm1, mm3
574 movq [edi + eax*4], mm0
575 movd [edi + eax*4 +8], mm1
576 ;# done!
577 .i0100_updateouterdata:
578 mov ecx, [esp + i0100_ii3]
580 movq mm6, [edi + ecx*4] ;# increment i force
581 movd mm7, [edi + ecx*4 + 8]
582 pfadd mm6, [esp + i0100_fix]
583 pfadd mm7, [esp + i0100_fiz] ;# 64-bit operand; only the low half is stored below
584 movq [edi + ecx*4], mm6
585 movd [edi + ecx*4 +8], mm7
587 mov ebx, [ebp + i0100_fshift] ;# increment fshift force
588 mov edx, [esp + i0100_is3]
590 movq mm6, [ebx + edx*4]
591 movd mm7, [ebx + edx*4 + 8]
592 pfadd mm6, [esp + i0100_fix]
593 pfadd mm7, [esp + i0100_fiz]
594 movq [ebx + edx*4], mm6
595 movd [ebx + edx*4 + 8], mm7
597 mov edx, [ebp + i0100_gid] ;# get group index for this i particle
598 mov edx, [edx]
599 add dword ptr [ebp + i0100_gid], 4 ;# advance pointer
601 movq mm7, [esp + i0100_vnbtot]
602 pfacc mm7,mm7 ;# get and sum the two parts of total potential
604 mov eax, [ebp + i0100_Vnb]
605 movd mm6, [eax + edx*4]
606 pfadd mm6, mm7
607 movd [eax + edx*4], mm6 ;# increment vnb[gid]
609 ;# finish if last
610 mov ecx, [ebp + i0100_nri]
611 dec ecx
612 jecxz .i0100_end
613 ;# not last, iterate once more!
614 mov [ebp + i0100_nri], ecx
615 jmp .i0100_outer
616 .i0100_end:
617 femms
618 add esp, 108
619 pop edi
620 pop esi
621 pop edx
622 pop ecx
623 pop ebx
624 pop eax
625 leave
626 ret ;# without this, execution runs on into the next function
633 .globl inl0110_3dnow
634 .globl _inl0110_3dnow
635 inl0110_3dnow:
636 _inl0110_3dnow:
637 .equiv i0110_nri, 8
638 .equiv i0110_iinr, 12
639 .equiv i0110_jindex, 16
640 .equiv i0110_jjnr, 20
641 .equiv i0110_shift, 24
642 .equiv i0110_shiftvec, 28
643 .equiv i0110_fshift, 32
644 .equiv i0110_gid, 36
645 .equiv i0110_pos, 40
646 .equiv i0110_faction, 44
647 .equiv i0110_type, 48
648 .equiv i0110_ntype, 52
649 .equiv i0110_nbfp, 56
650 .equiv i0110_Vnb, 60
651 .equiv i0110_nsatoms, 64
652 ;# stack offsets for local variables
653 .equiv i0110_is3, 0
654 .equiv i0110_ii3, 4
655 .equiv i0110_shX, 8
656 .equiv i0110_shY, 12
657 .equiv i0110_shZ, 16
658 .equiv i0110_ix, 20
659 .equiv i0110_iy, 24
660 .equiv i0110_iz, 28
661 .equiv i0110_vnbtot, 32
662 .equiv i0110_c6, 40
663 .equiv i0110_c12, 48
664 .equiv i0110_six, 56
665 .equiv i0110_twelve, 64
666 .equiv i0110_ntia, 72
667 .equiv i0110_innerjjnr0, 76
668 .equiv i0110_innerk0, 80
669 .equiv i0110_innerjjnr, 84
670 .equiv i0110_innerk, 88
671 .equiv i0110_fix, 92
672 .equiv i0110_fiy, 96
673 .equiv i0110_fiz, 100
674 .equiv i0110_dx1, 104
675 .equiv i0110_dy1, 108
676 .equiv i0110_dz1, 112
677 .equiv i0110_dx2, 116
678 .equiv i0110_dy2, 120
679 .equiv i0110_dz2, 124
680 .equiv i0110_nsvdwc, 128
681 .equiv i0110_nscoul, 132
682 .equiv i0110_nsvdw, 136
683 .equiv i0110_solnr, 140
684 push ebp
685 mov ebp,esp
686 push eax
687 push ebx
688 push ecx
689 push edx
690 push esi
691 push edi
692 sub esp, 144 ;# local stack space
693 femms
694 movq mm0, [mm_six]
695 movq mm1, [mm_twelve]
696 movq [esp + i0110_six], mm0
697 movq [esp + i0110_twelve], mm1
698 ;# assume we have at least one i particle - start directly
699 .i0110_outer:
700 mov eax, [ebp + i0110_shift] ;# eax = pointer into shift[]
701 mov ebx, [eax] ;# ebx=shift[n]
702 add dword ptr [ebp + i0110_shift], 4 ;# advance pointer one step
704 lea ebx, [ebx + ebx*2] ;# ebx=3*is
705 mov [esp + i0110_is3],ebx ;# store is3
707 mov eax, [ebp + i0110_shiftvec] ;# eax = base of shiftvec[]
709 movq mm0, [eax + ebx*4] ;# move shX/shY to mm0 and shZ to mm1
710 movd mm1, [eax + ebx*4 + 8]
711 movq [esp + i0110_shX], mm0
712 movd [esp + i0110_shZ], mm1
714 mov ecx, [ebp + i0110_iinr] ;# ecx = pointer into iinr[]
715 add dword ptr [ebp + i0110_iinr], 4 ;# advance pointer
716 mov ebx, [ecx] ;# ebx=ii
718 mov eax, [ebp + i0110_nsatoms]
719 add dword ptr [ebp + i0110_nsatoms], 12
720 mov ecx, [eax]
721 mov edx, [eax + 4]
722 mov eax, [eax + 8]
723 sub ecx, eax
724 sub eax, edx
726 mov [esp + i0110_nsvdwc], edx
727 mov [esp + i0110_nscoul], eax
728 mov [esp + i0110_nsvdw], ecx
730 ;# clear potential
731 pxor mm7,mm7
732 movq [esp + i0110_vnbtot], mm7
733 mov [esp + i0110_solnr], ebx
735 mov eax, [ebp + i0110_jindex]
736 mov ecx, [eax] ;# jindex[n]
737 mov edx, [eax + 4] ;# jindex[n+1]
738 add dword ptr [ebp + i0110_jindex], 4
739 sub edx, ecx ;# number of innerloop atoms
740 mov eax, [ebp + i0110_jjnr]
741 shl ecx, 2
742 add eax, ecx
743 mov [esp + i0110_innerjjnr0], eax ;# pointer to jjnr[nj0]
745 mov [esp + i0110_innerk0], edx ;# number of innerloop atoms
746 mov esi, [ebp + i0110_pos]
747 mov edi, [ebp + i0110_faction]
749 mov ecx, [esp + i0110_nsvdwc]
750 cmp ecx, 0
751 jnz .i0110_mno_vdwc
752 jmp .i0110_testvdw
753 .i0110_mno_vdwc:
754 mov ebx, [esp + i0110_solnr]
755 inc dword ptr [esp + i0110_solnr]
757 mov edx, [ebp + i0110_type]
758 mov edx, [edx + ebx*4]
759 imul edx, [ebp + i0110_ntype]
760 shl edx, 1
761 mov [esp + i0110_ntia], edx
763 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
764 mov eax, [ebp + i0110_pos] ;# eax = base of pos[]
765 mov [esp + i0110_ii3], ebx
767 movq mm0, [eax + ebx*4]
768 movd mm1, [eax + ebx*4 + 8]
769 pfadd mm0, [esp + i0110_shX]
770 pfadd mm1, [esp + i0110_shZ]
771 movq [esp + i0110_ix], mm0
772 movd [esp + i0110_iz], mm1
774 ;# clear forces
775 pxor mm7,mm7
776 movq [esp + i0110_fix], mm7
777 movd [esp + i0110_fiz], mm7
779 mov ecx, [esp + i0110_innerjjnr0]
780 mov [esp + i0110_innerjjnr], ecx
781 mov edx, [esp + i0110_innerk0]
782 sub edx, 2
783 mov [esp + i0110_innerk], edx ;# number of innerloop atoms
784 jge .i0110_unroll_vdwc_loop
785 jmp .i0110_finish_vdwc_inner
786 .i0110_unroll_vdwc_loop:
787 ;# paired innerloop starts here
788 mov ecx, [esp + i0110_innerjjnr] ;# pointer to jjnr[k]
789 mov eax, [ecx]
790 mov ebx, [ecx + 4] ;# eax/ebx=jnr
791 add dword ptr [esp + i0110_innerjjnr], 8 ;# advance pointer (unrolled 2)
792 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
794 mov ecx, [ebp + i0110_type]
795 mov edx, [ecx + eax*4] ;# type [jnr1]
796 mov ecx, [ecx + ebx*4] ;# type [jnr2]
798 mov esi, [ebp + i0110_nbfp] ;# base of nbfp
799 shl edx, 1
800 shl ecx, 1
801 add edx, [esp + i0110_ntia] ;# tja = ntia + 2*type
802 add ecx, [esp + i0110_ntia]
804 movq mm5, [esi + edx*4] ;# mm5 = 1st c6 / c12
805 movq mm7, [esi + ecx*4] ;# mm7 = 2nd c6 / c12
806 movq mm6,mm5
807 punpckldq mm5,mm7 ;# mm5 = 1st c6 / 2nd c6
808 punpckhdq mm6,mm7 ;# mm6 = 1st c12 / 2nd c12
809 movq [esp + i0110_c6], mm5
810 movq [esp + i0110_c12], mm6
812 lea eax, [eax + eax*2] ;# replace jnr with j3
813 lea ebx, [ebx + ebx*2]
815 mov esi, [ebp + i0110_pos]
817 movq mm0, [esp + i0110_ix]
818 movd mm1, [esp + i0110_iz]
819 movq mm4, [esi + eax*4] ;# fetch first j coordinates
820 movd mm5, [esi + eax*4 + 8]
821 pfsubr mm4,mm0 ;# dr = ir - jr
822 pfsubr mm5,mm1
823 movq [esp + i0110_dx1], mm4 ;# store dr
824 movd [esp + i0110_dz1], mm5
825 pfmul mm4,mm4 ;# square dx,dy,dz
826 pfmul mm5,mm5
827 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
828 pfacc mm4, mm5 ;# first rsq in lower mm4
830 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
831 movd mm7, [esi + ebx*4 + 8]
833 pfsubr mm6,mm0 ;# dr = ir - jr
834 pfsubr mm7,mm1
835 movq [esp + i0110_dx2], mm6 ;# store dr
836 movd [esp + i0110_dz2], mm7
837 pfmul mm6,mm6 ;# square dx,dy,dz
838 pfmul mm7,mm7
839 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
840 pfacc mm6, mm7 ;# second rsq in lower mm6
842 pfrcp mm0, mm4 ;# lookup reciprocal seed
843 pfrcp mm1, mm6
845 punpckldq mm0,mm1
846 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs
847 ;# amd 3dnow N-R iteration to get full precision
848 pfrcpit1 mm4,mm0
849 pfrcpit2 mm4,mm0
850 ;# mm4 now contains invsq,
851 ;# do potential and fscal
853 movq mm0, mm4
854 pfmul mm4, mm0
855 pfmul mm4, mm0 ;# mm4=rinvsix
856 movq mm5, mm4
857 pfmul mm5, mm5 ;# mm5=rinvtwelve
859 pfmul mm5, [esp + i0110_c12]
860 pfmul mm4, [esp + i0110_c6]
861 movq mm6, mm5 ;# mm6 is vnb12-vnb6
862 pfsub mm6, mm4
864 pfmul mm4, [esp + i0110_six]
866 pfmul mm5, [esp + i0110_twelve]
867 pfsub mm5,mm4
868 pfmul mm0, mm5 ;# mm0 is total fscal now
870 prefetchw [esp + i0110_dx1] ;# prefetch i forces to cache
872 ;# spread fscalar to both positions
873 movq mm1,mm0
874 punpckldq mm0,mm0
875 punpckhdq mm1,mm1
877 ;# calc vector force
878 prefetchw [edi + eax*4] ;# prefetch the 1st faction to cache
879 movq mm2, [esp + i0110_dx1] ;# fetch dr
880 movd mm3, [esp + i0110_dz1]
882 ;# update vnbtot
883 pfadd mm6, [esp + i0110_vnbtot] ;# add the earlier value
884 movq [esp + i0110_vnbtot], mm6 ;# store the sum
886 prefetchw [edi + ebx*4] ;# prefetch the 2nd faction to cache
887 pfmul mm2, mm0 ;# mult by fs
888 pfmul mm3, mm0
890 movq mm4, [esp + i0110_dx2] ;# fetch dr
891 movd mm5, [esp + i0110_dz2]
892 pfmul mm4, mm1 ;# mult by fs
893 pfmul mm5, mm1
894 ;# update i forces
896 movq mm0, [esp + i0110_fix]
897 movd mm1, [esp + i0110_fiz]
898 pfadd mm0, mm2
899 pfadd mm1, mm3
901 pfadd mm0, mm4
902 pfadd mm1, mm5
903 movq [esp + i0110_fix], mm0
904 movd [esp + i0110_fiz], mm1
905 ;# update j forces
907 movq mm0, [edi + eax*4]
908 movd mm1, [edi + eax*4 + 8]
909 movq mm6, [edi + ebx*4]
910 movd mm7, [edi + ebx*4 + 8]
912 pfsub mm0, mm2
913 pfsub mm1, mm3
914 pfsub mm6, mm4
915 pfsub mm7, mm5
917 movq [edi + eax*4], mm0
918 movd [edi + eax*4 +8], mm1
919 movq [edi + ebx*4], mm6
920 movd [edi + ebx*4 + 8], mm7
922 ;# should we do one more iteration?
923 sub dword ptr [esp + i0110_innerk], 2
924 jl .i0110_finish_vdwc_inner
925 jmp .i0110_unroll_vdwc_loop
926 .i0110_finish_vdwc_inner:
927 and dword ptr [esp + i0110_innerk], 1
928 jnz .i0110_single_vdwc_inner
929 jmp .i0110_updateouterdata_vdwc
930 .i0110_single_vdwc_inner:
931 ;# a single j particle iteration here - compare with the unrolled code for comments
932 mov eax, [esp + i0110_innerjjnr]
933 mov eax, [eax] ;# eax=jnr offset
935 mov esi, [ebp + i0110_nbfp]
936 mov ecx, [ebp + i0110_type]
937 mov edx, [ecx + eax*4] ;# type [jnr1]
938 shl edx, 1
939 add edx, [esp + i0110_ntia] ;# tja = ntia + 2*type
940 movd mm5, [esi + edx*4] ;# mm5 = 1st c6
941 movq [esp + i0110_c6], mm5
942 movd mm5, [esi + edx*4 + 4] ;# mm5 = 1st c12
943 movq [esp + i0110_c12], mm5
945 mov esi, [ebp + i0110_pos]
946 lea eax, [eax + eax*2]
948 movq mm0, [esp + i0110_ix]
949 movd mm1, [esp + i0110_iz]
950 movq mm4, [esi + eax*4]
951 movd mm5, [esi + eax*4 + 8]
952 pfsubr mm4, mm0
953 pfsubr mm5, mm1
954 movq [esp + i0110_dx1], mm4
955 pfmul mm4,mm4
956 movd [esp + i0110_dz1], mm5
957 pfmul mm5,mm5
958 pfacc mm4, mm5
959 pfacc mm4, mm5 ;# mm4=rsq
961 pfrcp mm0,mm4
962 pfrcpit1 mm4,mm0
963 pfrcpit2 mm4,mm0 ;# mm4=invsq
964 ;# calculate potentials and scalar force
965 movq mm0, mm4
967 pfmul mm4, mm0
968 pfmul mm4, mm0 ;# mm4=rinvsix
969 movq mm5, mm4
970 pfmul mm5, mm5 ;# mm5=rinvtwelve
972 pfmul mm5, [esp + i0110_c12]
973 pfmul mm4, [esp + i0110_c6]
974 movq mm6, mm5 ;# mm6 is vnb12-vnb6
975 pfsub mm6, mm4
977 pfmul mm4, [esp + i0110_six]
979 pfmul mm5, [esp + i0110_twelve]
980 pfsub mm5, mm4
981 pfmul mm0, mm5 ;# mm0 is total fscal now
983 ;# update vnbtot
984 pfadd mm6, [esp + i0110_vnbtot] ;# add the earlier value
985 movq [esp + i0110_vnbtot], mm6 ;# store the sum
987 ;# spread fscalar to both positions
988 punpckldq mm0,mm0
989 ;# calc vectorial force
990 prefetchw [edi + eax*4] ;# prefetch faction to cache
991 movq mm2, [esp + i0110_dx1]
992 movd mm3, [esp + i0110_dz1]
994 pfmul mm2, mm0
995 pfmul mm3, mm0
997 ;# update i particle force
998 movq mm0, [esp + i0110_fix]
999 movd mm1, [esp + i0110_fiz]
1000 pfadd mm0, mm2
1001 pfadd mm1, mm3
1002 movq [esp + i0110_fix], mm0
1003 movd [esp + i0110_fiz], mm1
1004 ;# update j particle force
1005 movq mm0, [edi + eax*4]
1006 movd mm1, [edi + eax *4+ 8]
1007 pfsub mm0, mm2
1008 pfsub mm1, mm3
1009 movq [edi + eax*4], mm0
1010 movd [edi + eax*4 +8], mm1
1011 ;# done!
1012 .i0110_updateouterdata_vdwc:
1013 mov ecx, [esp + i0110_ii3]
1015 movq mm6, [edi + ecx*4] ;# increment i force
1016 movd mm7, [edi + ecx*4 + 8]
1017 pfadd mm6, [esp + i0110_fix]
1018 pfadd mm7, [esp + i0110_fiz]
1019 movq [edi + ecx*4], mm6
1020 movd [edi + ecx*4 +8], mm7
1022 mov ebx, [ebp + i0110_fshift] ;# increment fshift force
1023 mov edx, [esp + i0110_is3]
1025 movq mm6, [ebx + edx*4]
1026 movd mm7, [ebx + edx*4 + 8]
1027 pfadd mm6, [esp + i0110_fix]
1028 pfadd mm7, [esp + i0110_fiz]
1029 movq [ebx + edx*4], mm6
1030 movd [ebx + edx*4 + 8], mm7
1032 ;# loop back to mno
1033 dec dword ptr [esp + i0110_nsvdwc]
1034 jz .i0110_testvdw
1035 jmp .i0110_mno_vdwc
1036 .i0110_testvdw:
1037 mov ebx, [esp + i0110_nscoul]
1038 add [esp + i0110_solnr], ebx
1040 mov ecx, [esp + i0110_nsvdw]
1041 cmp ecx, 0
1042 jnz .i0110_mno_vdw
1043 jmp .i0110_last_mno
1044 .i0110_mno_vdw:
1045 mov ebx, [esp + i0110_solnr]
1046 inc dword ptr [esp + i0110_solnr]
1048 mov edx, [ebp + i0110_type]
1049 mov edx, [edx + ebx*4]
1050 imul edx, [ebp + i0110_ntype]
1051 shl edx, 1
1052 mov [esp + i0110_ntia], edx
1054 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
1055 mov eax, [ebp + i0110_pos] ;# eax = base of pos[]
1056 mov [esp + i0110_ii3], ebx
1058 movq mm0, [eax + ebx*4]
1059 movd mm1, [eax + ebx*4 + 8]
1060 pfadd mm0, [esp + i0110_shX]
1061 pfadd mm1, [esp + i0110_shZ]
1062 movq [esp + i0110_ix], mm0
1063 movd [esp + i0110_iz], mm1
1065 ;# clear forces
1066 pxor mm7,mm7
1067 movq [esp + i0110_fix], mm7
1068 movd [esp + i0110_fiz], mm7
1070 mov ecx, [esp + i0110_innerjjnr0]
1071 mov [esp + i0110_innerjjnr], ecx
1072 mov edx, [esp + i0110_innerk0]
1073 sub edx, 2
1074 mov [esp + i0110_innerk], edx ;# number of innerloop atoms
1075 jge .i0110_unroll_vdw_loop
1076 jmp .i0110_finish_vdw_inner
1077 .i0110_unroll_vdw_loop:
1078 ;# paired innerloop starts here
1079 mov ecx, [esp + i0110_innerjjnr] ;# pointer to jjnr[k]
1080 mov eax, [ecx]
1081 mov ebx, [ecx + 4] ;# eax/ebx=jnr
1082 add dword ptr [esp + i0110_innerjjnr], 8 ;# advance pointer (unrolled 2)
1083 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
1085 mov ecx, [ebp + i0110_type]
1086 mov edx, [ecx + eax*4] ;# type [jnr1]
1087 mov ecx, [ecx + ebx*4] ;# type [jnr2]
1089 mov esi, [ebp + i0110_nbfp] ;# base of nbfp
1090 shl edx, 1
1091 shl ecx, 1
1092 add edx, [esp + i0110_ntia] ;# tja = ntia + 2*type
1093 add ecx, [esp + i0110_ntia]
1095 movq mm5, [esi + edx*4] ;# mm5 = 1st c6 / c12
1096 movq mm7, [esi + ecx*4] ;# mm7 = 2nd c6 / c12
1097 movq mm6,mm5
1098 punpckldq mm5,mm7 ;# mm5 = 1st c6 / 2nd c6
1099 punpckhdq mm6,mm7 ;# mm6 = 1st c12 / 2nd c12
1100 movq [esp + i0110_c6], mm5
1101 movq [esp + i0110_c12], mm6
1103 lea eax, [eax + eax*2] ;# replace jnr with j3
1104 lea ebx, [ebx + ebx*2]
1106 mov esi, [ebp + i0110_pos]
1108 movq mm0, [esp + i0110_ix]
1109 movd mm1, [esp + i0110_iz]
1110 movq mm4, [esi + eax*4] ;# fetch first j coordinates
1111 movd mm5, [esi + eax*4 + 8]
1112 pfsubr mm4,mm0 ;# dr = ir - jr
1113 pfsubr mm5,mm1
1114 movq [esp + i0110_dx1], mm4 ;# store dr
1115 movd [esp + i0110_dz1], mm5
1116 pfmul mm4,mm4 ;# square dx,dy,dz
1117 pfmul mm5,mm5
1118 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
1119 pfacc mm4, mm5 ;# first rsq in lower mm4
1121 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
1122 movd mm7, [esi + ebx*4 + 8]
1124 pfsubr mm6,mm0 ;# dr = ir - jr
1125 pfsubr mm7,mm1
1126 movq [esp + i0110_dx2], mm6 ;# store dr
1127 movd [esp + i0110_dz2], mm7
1128 pfmul mm6,mm6 ;# square dx,dy,dz
1129 pfmul mm7,mm7
1130 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
1131 pfacc mm6, mm7 ;# second rsq in lower mm6
1133 pfrcp mm0, mm4 ;# lookup reciprocal seed
1134 pfrcp mm1, mm6
1136 punpckldq mm0,mm1
1137 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs
1138 ;# amd 3dnow N-R iteration to get full precision
1139 pfrcpit1 mm4,mm0
1140 pfrcpit2 mm4,mm0
1141 ;# mm4 now contains invsq,
1142 ;# do potential and fscal
1144 movq mm0, mm4
1145 pfmul mm4, mm0
1146 pfmul mm4, mm0 ;# mm4=rinvsix
1147 movq mm5, mm4
1148 pfmul mm5, mm5 ;# mm5=rinvtwelve
1150 pfmul mm5, [esp + i0110_c12]
1151 pfmul mm4, [esp + i0110_c6]
1152 movq mm6, mm5 ;# mm6 is vnb12-vnb6
1153 pfsub mm6, mm4
1155 pfmul mm4, [esp + i0110_six]
1157 pfmul mm5, [esp + i0110_twelve]
1158 pfsub mm5,mm4
1159 pfmul mm0, mm5 ;# mm0 is total fscal now
1161 prefetchw [esp + i0110_dx1] ;# prefetch i forces to cache
1163 ;# spread fscalar to both positions
1164 movq mm1,mm0
1165 punpckldq mm0,mm0
1166 punpckhdq mm1,mm1
1168 ;# calc vector force
1169 prefetchw [edi + eax*4] ;# prefetch the 1st faction to cache
1170 movq mm2, [esp + i0110_dx1] ;# fetch dr
1171 movd mm3, [esp + i0110_dz1]
1173 ;# update vnbtot
1174 pfadd mm6, [esp + i0110_vnbtot] ;# add the earlier value
1175 movq [esp + i0110_vnbtot], mm6 ;# store the sum
1177 prefetchw [edi + ebx*4] ;# prefetch the 2nd faction to cache
1178 pfmul mm2, mm0 ;# mult by fs
1179 pfmul mm3, mm0
1181 movq mm4, [esp + i0110_dx2] ;# fetch dr
1182 movd mm5, [esp + i0110_dz2]
1183 pfmul mm4, mm1 ;# mult by fs
1184 pfmul mm5, mm1
1185 ;# update i forces
1187 movq mm0, [esp + i0110_fix]
1188 movd mm1, [esp + i0110_fiz]
1189 pfadd mm0, mm2
1190 pfadd mm1, mm3
1192 pfadd mm0, mm4
1193 pfadd mm1, mm5
1194 movq [esp + i0110_fix], mm0
1195 movd [esp + i0110_fiz], mm1
1196 ;# update j forces
1198 movq mm0, [edi + eax*4]
1199 movd mm1, [edi + eax*4 + 8]
1200 movq mm6, [edi + ebx*4]
1201 movd mm7, [edi + ebx*4 + 8]
1203 pfsub mm0, mm2
1204 pfsub mm1, mm3
1205 pfsub mm6, mm4
1206 pfsub mm7, mm5
1208 movq [edi + eax*4], mm0
1209 movd [edi + eax*4 +8], mm1
1210 movq [edi + ebx*4], mm6
1211 movd [edi + ebx*4 + 8], mm7
1212 ;# should we do one more iteration?
1213 sub dword ptr [esp + i0110_innerk], 2
1214 jl .i0110_finish_vdw_inner
1215 jmp .i0110_unroll_vdw_loop
1216 .i0110_finish_vdw_inner:
1217 and dword ptr [esp + i0110_innerk], 1
1218 jnz .i0110_single_vdw_inner
1219 jmp .i0110_updateouterdata_vdw
1220 .i0110_single_vdw_inner:
1221 ;# a single j particle iteration here - compare with the unrolled code for comments
1222 mov eax, [esp + i0110_innerjjnr]
1223 mov eax, [eax] ;# eax=jnr offset
1225 mov esi, [ebp + i0110_nbfp]
1226 mov ecx, [ebp + i0110_type]
1227 mov edx, [ecx + eax*4] ;# type [jnr1]
1228 shl edx, 1
1229 add edx, [esp + i0110_ntia] ;# tja = ntia + 2*type
1230 movd mm5, [esi + edx*4] ;# mm5 = 1st c6
1231 movq [esp + i0110_c6], mm5
1232 movd mm5, [esi + edx*4 + 4] ;# mm5 = 1st c12
1233 movq [esp + i0110_c12], mm5
1235 mov esi, [ebp + i0110_pos]
1236 lea eax, [eax + eax*2]
1238 movq mm0, [esp + i0110_ix]
1239 movd mm1, [esp + i0110_iz]
1240 movq mm4, [esi + eax*4]
1241 movd mm5, [esi + eax*4 + 8]
1242 pfsubr mm4, mm0
1243 pfsubr mm5, mm1
1244 movq [esp + i0110_dx1], mm4
1245 pfmul mm4,mm4
1246 movd [esp + i0110_dz1], mm5
1247 pfmul mm5,mm5
1248 pfacc mm4, mm5
1249 pfacc mm4, mm5 ;# mm4=rsq
1251 pfrcp mm0,mm4
1252 pfrcpit1 mm4,mm0
1253 pfrcpit2 mm4,mm0 ;# mm4=invsq
1254 ;# calculate potentials and scalar force
1255 movq mm0, mm4
1257 pfmul mm4, mm0
1258 pfmul mm4, mm0 ;# mm4=rinvsix
1259 movq mm5, mm4
1260 pfmul mm5, mm5 ;# mm5=rinvtwelve
1262 pfmul mm5, [esp + i0110_c12]
1263 pfmul mm4, [esp + i0110_c6]
1264 movq mm6, mm5 ;# mm6 is vnb12-vnb6
1265 pfsub mm6, mm4
1267 pfmul mm4, [esp + i0110_six]
1269 pfmul mm5, [esp + i0110_twelve]
1270 pfsub mm5, mm4
1271 pfmul mm0, mm5 ;# mm0 is total fscal now
1273 ;# update vnbtot
1274 pfadd mm6, [esp + i0110_vnbtot] ;# add the earlier value
1275 movq [esp + i0110_vnbtot], mm6 ;# store the sum
1277 ;# spread fscalar to both positions
1278 punpckldq mm0,mm0
1279 ;# calc vectorial force
1280 prefetchw [edi + eax*4] ;# prefetch faction to cache
1281 movq mm2, [esp + i0110_dx1]
1282 movd mm3, [esp + i0110_dz1]
1284 pfmul mm2, mm0
1285 pfmul mm3, mm0
1287 ;# update i particle force
1288 movq mm0, [esp + i0110_fix]
1289 movd mm1, [esp + i0110_fiz]
1290 pfadd mm0, mm2
1291 pfadd mm1, mm3
1292 movq [esp + i0110_fix], mm0
1293 movd [esp + i0110_fiz], mm1
1294 ;# update j particle force
1295 movq mm0, [edi + eax*4]
1296 movd mm1, [edi + eax *4+ 8]
1297 pfsub mm0, mm2
1298 pfsub mm1, mm3
1299 movq [edi + eax*4], mm0
1300 movd [edi + eax*4 +8], mm1
1301 ;# done!
1302 .i0110_updateouterdata_vdw:
1303 mov ecx, [esp + i0110_ii3]
1305 movq mm6, [edi + ecx*4] ;# increment i force
1306 movd mm7, [edi + ecx*4 + 8]
1307 pfadd mm6, [esp + i0110_fix]
1308 pfadd mm7, [esp + i0110_fiz]
1309 movq [edi + ecx*4], mm6
1310 movd [edi + ecx*4 +8], mm7
1312 mov ebx, [ebp + i0110_fshift] ;# increment fshift force
1313 mov edx, [esp + i0110_is3]
1315 movq mm6, [ebx + edx*4]
1316 movd mm7, [ebx + edx*4 + 8]
1317 pfadd mm6, [esp + i0110_fix]
1318 pfadd mm7, [esp + i0110_fiz]
1319 movq [ebx + edx*4], mm6
1320 movd [ebx + edx*4 + 8], mm7
1322 ;# loop back to mno
1323 dec dword ptr [esp + i0110_nsvdw]
1324 jz .i0110_last_mno
1325 jmp .i0110_mno_vdw
1327 .i0110_last_mno:
1328 mov edx, [ebp + i0110_gid] ;# get group index for this i particle
1329 mov edx, [edx]
1330 add dword ptr [ebp + i0110_gid], 4 ;# advance pointer
1332 movq mm7, [esp + i0110_vnbtot]
1333 pfacc mm7,mm7 ;# get and sum the two parts of total potential
1335 mov eax, [ebp + i0110_Vnb]
1336 movd mm6, [eax + edx*4]
1337 pfadd mm6, mm7
1338 movd [eax + edx*4], mm6 ;# increment vc[gid]
1339 ;# finish if last
1340 mov ecx, [ebp + i0110_nri]
1341 dec ecx
1342 jecxz .i0110_end
1343 ;# not last, iterate once more!
1344 mov [ebp + i0110_nri], ecx
1345 jmp .i0110_outer
1346 .i0110_end:
1347 femms
1348 add esp, 144
1349 pop edi
1350 pop esi
1351 pop edx
1352 pop ecx
1353 pop ebx
1354 pop eax
1355 leave
;# ----------------------------------------------------------------------
;# inl0300_3dnow / _inl0300_3dnow
;#
;# Nonbonded inner loop: tabulated dispersion + tabulated repulsion,
;# no Coulomb term. Uses AMD 3DNow! and extended 3DNow! (pf2iw).
;#
;# Calling convention: 32-bit, every argument on the stack, read through
;# ebp after a standard "push ebp / mov ebp,esp" frame:
;#   [ebp+ 8] nri       number of i particles (by value, must be >= 1)
;#   [ebp+12] iinr      -> index of each i particle
;#   [ebp+16] jindex    -> start of each i particle's range in jjnr
;#   [ebp+20] jjnr      -> neighbour (j particle) indices
;#   [ebp+24] shift     -> shift-vector index per i particle
;#   [ebp+28] shiftvec  -> shift vectors, 3 floats each
;#   [ebp+32] fshift    -> shift forces, 3 floats each (accumulated into)
;#   [ebp+36] gid       -> energy-group index per i particle
;#   [ebp+40] pos       -> coordinates, 3 floats per particle
;#   [ebp+44] faction   -> forces, 3 floats per particle (accumulated into)
;#   [ebp+48] type      -> atom type per particle
;#   [ebp+52] ntype     number of atom types (by value)
;#   [ebp+56] nbfp      -> c6/c12 pairs, indexed 2*(ntype*type_i + type_j)
;#   [ebp+60] Vnb       -> potential per energy group (accumulated into)
;#   [ebp+64] tabscale  table scale factor (float, by value)
;#   [ebp+68] VFtab     -> table; 8 floats per point: 4 dispersion
;#                         values followed by 4 repulsion values
;#
;# Side effects on the argument area: nri is counted down and the shift,
;# iinr, jindex and gid pointers are advanced in place.
;#
;# eax, ebx, ecx, edx, esi, edi and ebp are saved and restored, so no
;# value is returned in a register. MMX state is reset with femms on
;# entry and again before returning.
;#
;# References the external qword mm_two (defined elsewhere in this file),
;# used as the factor in "two*Heps2".
;#
;# Per pair the code evaluates, with rt = r*tabscale, n0 = int(rt),
;# eps = rt - n0 and the four table values T0,F,G,H at point n0:
;#   Fp = F + G*eps + H*eps^2
;#   VV = T0 + eps*Fp
;#   FF = Fp + G*eps + 2*H*eps^2
;#   vnb   += c6*VV_disp + c12*VV_rep
;#   fscal  = -(c6*FF_disp + c12*FF_rep) * tabscale * (1/r)
;#   f_i   += fscal*dr ,  f_j -= fscal*dr      (dr = r_i - r_j)
;# ----------------------------------------------------------------------
.globl inl0300_3dnow
.globl _inl0300_3dnow
inl0300_3dnow:
_inl0300_3dnow:
	;# stack offsets for the arguments (relative to ebp)
.equiv		i0300_nri,		8
.equiv		i0300_iinr,		12
.equiv		i0300_jindex,		16
.equiv		i0300_jjnr,		20
.equiv		i0300_shift,		24
.equiv		i0300_shiftvec,		28
.equiv		i0300_fshift,		32
.equiv		i0300_gid,		36
.equiv		i0300_pos,		40
.equiv		i0300_faction,		44
.equiv		i0300_type,		48
.equiv		i0300_ntype,		52
.equiv		i0300_nbfp,		56
.equiv		i0300_Vnb,		60
.equiv		i0300_tabscale,		64
.equiv		i0300_VFtab,		68
	;# stack offsets for local variables (relative to esp, 116 bytes total)
	;# qword slots hold one value per unrolled pair (low half = pair 1)
.equiv		i0300_is3,		0
.equiv		i0300_ii3,		4
.equiv		i0300_ix,		8
.equiv		i0300_iy,		12
.equiv		i0300_iz,		16
.equiv		i0300_vnbtot,		20
.equiv		i0300_c6,		28
.equiv		i0300_c12,		36
.equiv		i0300_two,		44
.equiv		i0300_n1,		52
.equiv		i0300_tsc,		60
.equiv		i0300_ntia,		68
.equiv		i0300_innerjjnr,	72
.equiv		i0300_innerk,		76
.equiv		i0300_fix,		80
.equiv		i0300_fiy,		84
.equiv		i0300_fiz,		88
.equiv		i0300_dx1,		92
.equiv		i0300_dy1,		96
.equiv		i0300_dz1,		100
.equiv		i0300_dx2,		104
.equiv		i0300_dy2,		108
.equiv		i0300_dz2,		112
	push ebp
	mov ebp,esp
	push eax
	push ebx
	push ecx
	push edx
	push esi
	push edi
	sub esp, 116		;# local stack space
	femms
	;# move data to local stack
	movq mm0, [mm_two]
	movd mm3, [ebp + i0300_tabscale]
	movq [esp + i0300_two], mm0
	punpckldq mm3,mm3	;# tabscale in both halves
	movq [esp + i0300_tsc], mm3
	;# assume we have at least one i particle - start directly
.i0300_outer:
	mov eax, [ebp + i0300_shift]	;# eax = pointer into shift[]
	mov ebx, [eax]			;# ebx=shift[n]
	add dword ptr [ebp + i0300_shift], 4	;# advance pointer one step

	lea ebx, [ebx + ebx*2]		;# ebx=3*is
	mov [esp + i0300_is3],ebx	;# store is3

	mov eax, [ebp + i0300_shiftvec]	;# eax = base of shiftvec[]

	movq mm0, [eax + ebx*4]		;# move shX/shY to mm0 and shZ to mm1
	movd mm1, [eax + ebx*4 + 8]

	mov ecx, [ebp + i0300_iinr]	;# ecx = pointer into iinr[]
	add dword ptr [ebp + i0300_iinr], 4	;# advance pointer
	mov ebx, [ecx]			;# ebx=ii

	mov edx, [ebp + i0300_type]
	mov edx, [edx + ebx*4]
	imul edx, [ebp + i0300_ntype]
	shl edx, 1
	mov [esp + i0300_ntia], edx	;# ntia = 2*ntype*type[ii]

	lea ebx, [ebx + ebx*2]		;# ebx = 3*ii=ii3
	mov eax, [ebp + i0300_pos]	;# eax = base of pos[]

	pfadd mm0, [eax + ebx*4]	;# ix = shX + posX (and iy too)
	movd mm3, [eax + ebx*4 + 8]	;# cant use direct memory add for 4 bytes (iz)
	mov [esp + i0300_ii3], ebx
	pfadd mm1, mm3
	movq [esp + i0300_ix], mm0
	movd [esp + i0300_iz], mm1

	;# clear total potential and i forces
	pxor mm7,mm7
	movq [esp + i0300_vnbtot], mm7
	movq [esp + i0300_fix], mm7
	movd [esp + i0300_fiz], mm7

	mov eax, [ebp + i0300_jindex]
	mov ecx, [eax]			;# jindex[n]
	mov edx, [eax + 4]		;# jindex[n+1]
	add dword ptr [ebp + i0300_jindex], 4
	sub edx, ecx			;# number of innerloop atoms

	mov esi, [ebp + i0300_pos]
	mov edi, [ebp + i0300_faction]
	mov eax, [ebp + i0300_jjnr]
	shl ecx, 2
	add eax, ecx
	mov [esp + i0300_innerjjnr], eax	;# pointer to jjnr[nj0]
	sub edx, 2
	mov [esp + i0300_innerk], edx	;# innerloop atoms minus 2 (mov keeps the flags of sub)
	jge .i0300_unroll_loop		;# at least two j particles left
	jmp .i0300_finish_inner
.i0300_unroll_loop:
	;# paired innerloop starts here
	mov ecx, [esp + i0300_innerjjnr]	;# pointer to jjnr[k]
	mov eax, [ecx]
	mov ebx, [ecx + 4]		;# eax/ebx=jnr
	add dword ptr [esp + i0300_innerjjnr], 8	;# advance pointer (unrolled 2)
	prefetch [ecx + 16]		;# prefetch data - trial and error says 16 is best

	mov ecx, [ebp + i0300_type]
	mov edx, [ecx + eax*4]		;# type [jnr1]
	mov ecx, [ecx + ebx*4]		;# type [jnr2]

	mov esi, [ebp + i0300_nbfp]	;# base of nbfp
	shl edx, 1
	shl ecx, 1
	add edx, [esp + i0300_ntia]	;# tja = ntia + 2*type
	add ecx, [esp + i0300_ntia]

	movq mm5, [esi + edx*4]		;# mm5 = 1st c6 / c12
	movq mm7, [esi + ecx*4]		;# mm7 = 2nd c6 / c12
	movq mm6,mm5
	punpckldq mm5,mm7		;# mm5 = 1st c6 / 2nd c6
	punpckhdq mm6,mm7		;# mm6 = 1st c12 / 2nd c12
	movq [esp + i0300_c6], mm5
	movq [esp + i0300_c12], mm6

	lea eax, [eax + eax*2]		;# replace jnr with j3
	lea ebx, [ebx + ebx*2]

	mov esi, [ebp + i0300_pos]

	movq mm0, [esp + i0300_ix]
	movd mm1, [esp + i0300_iz]
	movq mm4, [esi + eax*4]		;# fetch first j coordinates
	movd mm5, [esi + eax*4 + 8]
	pfsubr mm4,mm0			;# dr = ir - jr
	pfsubr mm5,mm1
	movq [esp + i0300_dx1], mm4	;# store dr
	movd [esp + i0300_dz1], mm5
	pfmul mm4,mm4			;# square dx,dy,dz
	pfmul mm5,mm5
	pfacc mm4, mm5			;# accumulate to get dx*dx+ dy*dy+ dz*dz
	pfacc mm4, mm5			;# first rsq in lower mm4

	movq mm6, [esi + ebx*4]		;# fetch second j coordinates
	movd mm7, [esi + ebx*4 + 8]

	pfsubr mm6,mm0			;# dr = ir - jr
	pfsubr mm7,mm1
	movq [esp + i0300_dx2], mm6	;# store dr
	movd [esp + i0300_dz2], mm7
	pfmul mm6,mm6			;# square dx,dy,dz
	pfmul mm7,mm7
	pfacc mm6, mm7			;# accumulate to get dx*dx+ dy*dy+ dz*dz
	pfacc mm6, mm7			;# second rsq in lower mm6

	pfrsqrt mm0, mm4		;# lookup inverse square root seed
	pfrsqrt mm1, mm6

	punpckldq mm0,mm1
	punpckldq mm4,mm6		;# now 4 has rsq and 0 the seed for both pairs
	movq mm2,mm0			;# amd 3dnow N-R iteration to get full precision
	pfmul mm0,mm0
	pfrsqit1 mm0,mm4
	pfrcpit2 mm0,mm2
	pfmul mm4, mm0			;# r = rsq * invsqrt
	movq mm1, mm4
	;# mm0 is invsqrt, and mm1 r
	;# do potential and fscal
	pfmul mm1, [esp + i0300_tsc]	;# mm1=rt
	pf2iw mm4,mm1			;# integer table index for both pairs
	movq [esp + i0300_n1], mm4
	pi2fd mm4,mm4
	pfsub mm1, mm4			;# now mm1 is eps and mm4 is n0

	movq mm2,mm1
	pfmul mm2,mm2			;# mm1 is eps, mm2 is eps2

	mov edx, [ebp + i0300_VFtab]
	;# dispersion table
	mov ecx, [esp + i0300_n1]
	shl ecx, 3			;# 8 floats per table point
	;# load all the table values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]
	mov ecx, [esp + i0300_n1 + 4]	;# table index of the second pair
	shl ecx, 3
	punpckldq mm4, [edx + ecx*4]
	punpckldq mm5, [edx + ecx*4 + 4]
	punpckldq mm6, [edx + ecx*4 + 8]
	punpckldq mm7, [edx + ecx*4 + 12]
	pfmul mm6, mm1			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp
	pfmul mm7, [esp + i0300_two]	;# two*Heps2
	pfadd mm7, mm6
	pfadd mm7, mm5			;# mm7=FF
	pfmul mm5, mm1			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	movq mm4, [esp + i0300_c6]
	pfmul mm7, mm4			;# fijD
	pfmul mm5, mm4			;# vnb6
	movq mm3, mm7			;# add to fscal

	;# update vnbtot to release mm5!
	pfadd mm5, [esp + i0300_vnbtot]	;# add the earlier value
	movq [esp + i0300_vnbtot], mm5	;# store the sum

	;# repulsion table
	mov ecx, [esp + i0300_n1]
	shl ecx, 3
	;# load all the table values we need
	movd mm4, [edx + ecx*4 + 16]
	movd mm5, [edx + ecx*4 + 20]
	movd mm6, [edx + ecx*4 + 24]
	movd mm7, [edx + ecx*4 + 28]
	mov ecx, [esp + i0300_n1 + 4]
	shl ecx, 3
	punpckldq mm4, [edx + ecx*4 + 16]
	punpckldq mm5, [edx + ecx*4 + 20]
	punpckldq mm6, [edx + ecx*4 + 24]
	punpckldq mm7, [edx + ecx*4 + 28]

	pfmul mm6, mm1			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp
	pfmul mm7, [esp + i0300_two]	;# two*Heps2
	pfadd mm7, mm6
	pfadd mm7, mm5			;# mm7=FF
	pfmul mm5, mm1			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	movq mm6, [esp + i0300_c12]
	pfmul mm7, mm6			;# fijR
	pfmul mm5, mm6			;# vnb12
	pfadd mm3, mm7			;# total fscal fijD+ fijR

	;# change sign of mm3
	pxor mm1,mm1
	pfsub mm1, mm3
	pfmul mm1, [esp + i0300_tsc]
	pfmul mm0, mm1			;# mm0 is total fscal now

	prefetchw [esp + i0300_dx1]	;# prefetch (for write) at the dx1 slot of the local frame

	;# spread fscalar to both positions
	movq mm1,mm0
	punpckldq mm0,mm0		;# mm0 = fscal of pair 1 in both halves
	punpckhdq mm1,mm1		;# mm1 = fscal of pair 2 in both halves

	;# calc vector force
	prefetchw [edi + eax*4]		;# prefetch the 1st faction to cache
	movq mm2, [esp + i0300_dx1]	;# fetch dr
	movd mm3, [esp + i0300_dz1]

	;# update vnbtot
	pfadd mm5, [esp + i0300_vnbtot]	;# add the earlier value
	movq [esp + i0300_vnbtot], mm5	;# store the sum

	prefetchw [edi + ebx*4]		;# prefetch the 2nd faction to cache
	pfmul mm2, mm0			;# mult by fs
	pfmul mm3, mm0

	movq mm4, [esp + i0300_dx2]	;# fetch dr
	movd mm5, [esp + i0300_dz2]
	pfmul mm4, mm1			;# mult by fs
	pfmul mm5, mm1
	;# update i forces

	movq mm0, [esp + i0300_fix]
	movd mm1, [esp + i0300_fiz]
	pfadd mm0, mm2
	pfadd mm1, mm3

	pfadd mm0, mm4
	pfadd mm1, mm5
	movq [esp + i0300_fix], mm0
	movd [esp + i0300_fiz], mm1
	;# update j forces

	movq mm0, [edi + eax*4]
	movd mm1, [edi + eax*4 + 8]
	movq mm6, [edi + ebx*4]
	movd mm7, [edi + ebx*4 + 8]

	pfsub mm0, mm2
	pfsub mm1, mm3
	pfsub mm6, mm4
	pfsub mm7, mm5

	movq [edi + eax*4], mm0
	movd [edi + eax*4 +8], mm1
	movq [edi + ebx*4], mm6
	movd [edi + ebx*4 + 8], mm7

	;# should we do one more iteration?
	sub dword ptr [esp + i0300_innerk], 2
	jl .i0300_finish_inner
	jmp .i0300_unroll_loop
.i0300_finish_inner:
	;# innerk is -1 here if one j particle is left over, -2 if none;
	;# its lowest bit tells the two cases apart.
	and dword ptr [esp + i0300_innerk], 1
	jnz .i0300_single_inner
	jmp .i0300_updateouterdata
.i0300_single_inner:
	;# a single j particle iteration here - compare with the unrolled code for comments
	mov eax, [esp + i0300_innerjjnr]
	mov eax, [eax]			;# eax=jnr offset

	mov esi, [ebp + i0300_nbfp]
	mov ecx, [ebp + i0300_type]
	mov edx, [ecx + eax*4]		;# type [jnr1]
	shl edx, 1
	add edx, [esp + i0300_ntia]	;# tja = ntia + 2*type
	movd mm5, [esi + edx*4]		;# mm5 = 1st c6
	movq [esp + i0300_c6], mm5
	movd mm5, [esi + edx*4 + 4]	;# mm5 = 1st c12
	movq [esp + i0300_c12], mm5

	mov esi, [ebp + i0300_pos]
	lea eax, [eax + eax*2]

	movq mm0, [esp + i0300_ix]
	movd mm1, [esp + i0300_iz]
	movq mm4, [esi + eax*4]
	movd mm5, [esi + eax*4 + 8]
	pfsubr mm4, mm0
	pfsubr mm5, mm1
	movq [esp + i0300_dx1], mm4
	pfmul mm4,mm4
	movd [esp + i0300_dz1], mm5
	pfmul mm5,mm5
	pfacc mm4, mm5
	pfacc mm4, mm5			;# mm4=rsq (lower half)

	pfrsqrt mm0,mm4
	movq mm2,mm0
	pfmul mm0,mm0
	pfrsqit1 mm0,mm4
	pfrcpit2 mm0,mm2		;# mm0=invsqrt
	pfmul mm4, mm0
	movq mm1, mm4
	;# mm0 is invsqrt, and mm1 r

	;# calculate potentials and scalar force
	pfmul mm1, [esp + i0300_tsc]	;# mm1=rt
	pf2iw mm4,mm1
	movd [esp + i0300_n1], mm4
	pi2fd mm4,mm4
	pfsub mm1, mm4			;# now mm1 is eps and mm4 is n0

	movq mm2,mm1
	pfmul mm2,mm2			;# mm1 is eps, mm2 is eps2

	mov edx, [ebp + i0300_VFtab]
	mov ecx, [esp + i0300_n1]
	shl ecx, 3
	;# dispersion table
	;# load all the table values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]
	pfmul mm6, mm1			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp
	pfmul mm7, [esp + i0300_two]	;# two*Heps2
	pfadd mm7, mm6
	pfadd mm7, mm5			;# mm7=FF
	pfmul mm5, mm1			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	movq mm4, [esp + i0300_c6]
	pfmul mm7, mm4			;# fijD
	pfmul mm5, mm4			;# vnb6
	movq mm3, mm7			;# add to fscal

	;# update vnbtot to release mm5!
	pfadd mm5, [esp + i0300_vnbtot]	;# add the earlier value
	movq [esp + i0300_vnbtot], mm5	;# store the sum

	;# repulsion table
	;# load all the table values we need
	movd mm4, [edx + ecx*4 + 16]
	movd mm5, [edx + ecx*4 + 20]
	movd mm6, [edx + ecx*4 + 24]
	movd mm7, [edx + ecx*4 + 28]

	pfmul mm6, mm1			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp
	pfmul mm7, [esp + i0300_two]	;# two*Heps2
	pfadd mm7, mm6
	pfadd mm7, mm5			;# mm7=FF
	pfmul mm5, mm1			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	movq mm6, [esp + i0300_c12]
	pfmul mm7, mm6			;# fijR
	pfmul mm5, mm6			;# vnb12
	pfadd mm3, mm7			;# total fscal fijD+ fijR (no coulomb in this loop)

	;# change sign of mm3
	pxor mm1,mm1
	pfsub mm1, mm3
	pfmul mm0, [esp + i0300_tsc]	;# here tabscale is folded into invsqrt instead of mm1
	pfmul mm0, mm1			;# mm0 is total fscal now

	;# update vnbtot
	pfadd mm5, [esp + i0300_vnbtot]	;# add the earlier value
	movq [esp + i0300_vnbtot], mm5	;# store the sum

	;# spread fscalar to both positions
	punpckldq mm0,mm0
	;# calc vectorial force
	prefetchw [edi + eax*4]		;# prefetch faction to cache
	movq mm2, [esp + i0300_dx1]
	movd mm3, [esp + i0300_dz1]

	pfmul mm2, mm0
	pfmul mm3, mm0

	;# update i particle force
	movq mm0, [esp + i0300_fix]
	movd mm1, [esp + i0300_fiz]
	pfadd mm0, mm2
	pfadd mm1, mm3
	movq [esp + i0300_fix], mm0
	movd [esp + i0300_fiz], mm1
	;# update j particle force
	movq mm0, [edi + eax*4]
	movd mm1, [edi + eax *4+ 8]
	pfsub mm0, mm2
	pfsub mm1, mm3
	movq [edi + eax*4], mm0
	movd [edi + eax*4 +8], mm1
	;# done!
.i0300_updateouterdata:
	mov ecx, [esp + i0300_ii3]

	movq mm6, [edi + ecx*4]		;# increment i force
	movd mm7, [edi + ecx*4 + 8]
	pfadd mm6, [esp + i0300_fix]
	pfadd mm7, [esp + i0300_fiz]	;# qword read; only the low half (fiz) is stored below
	movq [edi + ecx*4], mm6
	movd [edi + ecx*4 +8], mm7

	mov ebx, [ebp + i0300_fshift]	;# increment fshift force
	mov edx, [esp + i0300_is3]

	movq mm6, [ebx + edx*4]
	movd mm7, [ebx + edx*4 + 8]
	pfadd mm6, [esp + i0300_fix]
	pfadd mm7, [esp + i0300_fiz]
	movq [ebx + edx*4], mm6
	movd [ebx + edx*4 + 8], mm7

	mov edx, [ebp + i0300_gid]	;# get group index for this i particle
	mov edx, [edx]
	add dword ptr [ebp + i0300_gid], 4	;# advance pointer

	movq mm7, [esp + i0300_vnbtot]
	pfacc mm7,mm7			;# get and sum the two parts of total potential

	mov eax, [ebp + i0300_Vnb]
	movd mm6, [eax + edx*4]
	pfadd mm6, mm7
	movd [eax + edx*4], mm6		;# increment vnb[gid]

	;# finish if last
	mov ecx, [ebp + i0300_nri]
	dec ecx
	jecxz .i0300_end
	;# not last, iterate once more!
	mov [ebp + i0300_nri], ecx
	jmp .i0300_outer
.i0300_end:
	femms
	add esp, 116			;# release local stack space
	pop edi
	pop esi
	pop edx
	pop ecx
	pop ebx
	pop eax
	leave
	ret				;# return to caller; without this, execution falls through into inl0310_3dnow
1874 .globl inl0310_3dnow
1875 .globl _inl0310_3dnow
1876 inl0310_3dnow:
1877 _inl0310_3dnow:
1878 .equiv i0310_nri, 8
1879 .equiv i0310_iinr, 12
1880 .equiv i0310_jindex, 16
1881 .equiv i0310_jjnr, 20
1882 .equiv i0310_shift, 24
1883 .equiv i0310_shiftvec, 28
1884 .equiv i0310_fshift, 32
1885 .equiv i0310_gid, 36
1886 .equiv i0310_pos, 40
1887 .equiv i0310_faction, 44
1888 .equiv i0310_type, 48
1889 .equiv i0310_ntype, 52
1890 .equiv i0310_nbfp, 56
1891 .equiv i0310_Vnb, 60
1892 .equiv i0310_tabscale, 64
1893 .equiv i0310_VFtab, 68
1894 .equiv i0310_nsatoms, 72
1895 ;# stack offsets for local variables
1896 .equiv i0310_is3, 0
1897 .equiv i0310_ii3, 4
1898 .equiv i0310_shX, 8
1899 .equiv i0310_shY, 12
1900 .equiv i0310_shZ, 16
1901 .equiv i0310_ix, 20
1902 .equiv i0310_iy, 24
1903 .equiv i0310_iz, 28
1904 .equiv i0310_vnbtot, 32
1905 .equiv i0310_c6, 40
1906 .equiv i0310_c12, 48
1907 .equiv i0310_two, 56
1908 .equiv i0310_n1, 64
1909 .equiv i0310_tsc, 72
1910 .equiv i0310_ntia, 80
1911 .equiv i0310_innerjjnr0, 84
1912 .equiv i0310_innerk0, 88
1913 .equiv i0310_innerjjnr, 92
1914 .equiv i0310_innerk, 96
1915 .equiv i0310_fix, 100
1916 .equiv i0310_fiy, 104
1917 .equiv i0310_fiz, 108
1918 .equiv i0310_dx1, 112
1919 .equiv i0310_dy1, 116
1920 .equiv i0310_dz1, 120
1921 .equiv i0310_dx2, 124
1922 .equiv i0310_dy2, 128
1923 .equiv i0310_dz2, 132
1924 .equiv i0310_nsvdwc, 136
1925 .equiv i0310_nscoul, 140
1926 .equiv i0310_nsvdw, 144
1927 .equiv i0310_solnr, 148
1928 push ebp
1929 mov ebp,esp
1930 push eax
1931 push ebx
1932 push ecx
1933 push edx
1934 push esi
1935 push edi
1936 sub esp, 152 ;# local stack space
1937 femms
1938 movq mm0, [mm_two]
1939 movd mm3, [ebp + i0310_tabscale]
1940 movq [esp + i0310_two], mm0
1941 punpckldq mm3,mm3
1942 movq [esp + i0310_tsc], mm3
1944 ;# assume we have at least one i particle - start directly
1945 .i0310_outer:
1946 mov eax, [ebp + i0310_shift] ;# eax = pointer into shift[]
1947 mov ebx, [eax] ;# ebx=shift[n]
1948 add dword ptr [ebp + i0310_shift], 4 ;# advance pointer one step
1950 lea ebx, [ebx + ebx*2] ;# ebx=3*is
1951 mov [esp + i0310_is3],ebx ;# store is3
1953 mov eax, [ebp + i0310_shiftvec] ;# eax = base of shiftvec[]
1955 movq mm0, [eax + ebx*4] ;# move shX/shY to mm0 and shZ to mm1
1956 movd mm1, [eax + ebx*4 + 8]
1957 movq [esp + i0310_shX], mm0
1958 movd [esp + i0310_shZ], mm1
1960 mov ecx, [ebp + i0310_iinr] ;# ecx = pointer into iinr[]
1961 add dword ptr [ebp + i0310_iinr], 4 ;# advance pointer
1962 mov ebx, [ecx] ;# ebx=ii
1964 mov eax, [ebp + i0310_nsatoms]
1965 add dword ptr [ebp + i0310_nsatoms], 12
1966 mov ecx, [eax]
1967 mov edx, [eax + 4]
1968 mov eax, [eax + 8]
1969 sub ecx, eax
1970 sub eax, edx
1972 mov [esp + i0310_nsvdwc], edx
1973 mov [esp + i0310_nscoul], eax
1974 mov [esp + i0310_nsvdw], ecx
1976 ;# clear potential
1977 pxor mm7,mm7
1978 movq [esp + i0310_vnbtot], mm7
1979 mov [esp + i0310_solnr], ebx
1981 mov eax, [ebp + i0310_jindex]
1982 mov ecx, [eax] ;# jindex[n]
1983 mov edx, [eax + 4] ;# jindex[n+1]
1984 add dword ptr [ebp + i0310_jindex], 4
1985 sub edx, ecx ;# number of innerloop atoms
1986 mov eax, [ebp + i0310_jjnr]
1987 shl ecx, 2
1988 add eax, ecx
1989 mov [esp + i0310_innerjjnr0], eax ;# pointer to jjnr[nj0]
1991 mov [esp + i0310_innerk0], edx ;# number of innerloop atoms
1992 mov esi, [ebp + i0310_pos]
1993 mov edi, [ebp + i0310_faction]
1995 mov ecx, [esp + i0310_nsvdwc]
1996 cmp ecx, 0
1997 jnz .i0310_mno_vdwc
1998 jmp .i0310_testvdw
1999 .i0310_mno_vdwc:
2000 mov ebx, [esp + i0310_solnr]
2001 inc dword ptr [esp + i0310_solnr]
2003 mov edx, [ebp + i0310_type]
2004 mov edx, [edx + ebx*4]
2005 imul edx, [ebp + i0310_ntype]
2006 shl edx, 1
2007 mov [esp + i0310_ntia], edx
2009 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
2010 mov eax, [ebp + i0310_pos] ;# eax = base of pos[]
2011 mov [esp + i0310_ii3], ebx
2013 movq mm0, [eax + ebx*4]
2014 movd mm1, [eax + ebx*4 + 8]
2015 pfadd mm0, [esp + i0310_shX]
2016 pfadd mm1, [esp + i0310_shZ]
2017 movq [esp + i0310_ix], mm0
2018 movd [esp + i0310_iz], mm1
2020 ;# clear forces
2021 pxor mm7,mm7
2022 movq [esp + i0310_fix], mm7
2023 movd [esp + i0310_fiz], mm7
2025 mov ecx, [esp + i0310_innerjjnr0]
2026 mov [esp + i0310_innerjjnr], ecx
2027 mov edx, [esp + i0310_innerk0]
2028 sub edx, 2
2029 mov [esp + i0310_innerk], edx ;# number of innerloop atoms
2030 jge .i0310_unroll_vdwc_loop
2031 jmp .i0310_finish_vdwc_inner
2032 .i0310_unroll_vdwc_loop:
2033 ;# paired innerloop starts here
2034 mov ecx, [esp + i0310_innerjjnr] ;# pointer to jjnr[k]
2035 mov eax, [ecx]
2036 mov ebx, [ecx + 4] ;# eax/ebx=jnr
2037 add dword ptr [esp + i0310_innerjjnr], 8 ;# advance pointer (unrolled 2)
2038 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
2040 mov ecx, [ebp + i0310_type]
2041 mov edx, [ecx + eax*4] ;# type [jnr1]
2042 mov ecx, [ecx + ebx*4] ;# type [jnr2]
2044 mov esi, [ebp + i0310_nbfp] ;# base of nbfp
2045 shl edx, 1
2046 shl ecx, 1
2047 add edx, [esp + i0310_ntia] ;# tja = ntia + 2*type
2048 add ecx, [esp + i0310_ntia]
2050 movq mm5, [esi + edx*4] ;# mm5 = 1st c6 / c12
2051 movq mm7, [esi + ecx*4] ;# mm7 = 2nd c6 / c12
2052 movq mm6,mm5
2053 punpckldq mm5,mm7 ;# mm5 = 1st c6 / 2nd c6
2054 punpckhdq mm6,mm7 ;# mm6 = 1st c12 / 2nd c12
2055 movq [esp + i0310_c6], mm5
2056 movq [esp + i0310_c12], mm6
2058 lea eax, [eax + eax*2] ;# replace jnr with j3
2059 lea ebx, [ebx + ebx*2]
2061 mov esi, [ebp + i0310_pos]
2063 movq mm0, [esp + i0310_ix]
2064 movd mm1, [esp + i0310_iz]
2065 movq mm4, [esi + eax*4] ;# fetch first j coordinates
2066 movd mm5, [esi + eax*4 + 8]
2067 pfsubr mm4,mm0 ;# dr = ir - jr
2068 pfsubr mm5,mm1
2069 movq [esp + i0310_dx1], mm4 ;# store dr
2070 movd [esp + i0310_dz1], mm5
2071 pfmul mm4,mm4 ;# square dx,dy,dz
2072 pfmul mm5,mm5
2073 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
2074 pfacc mm4, mm5 ;# first rsq in lower mm4
2076 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
2077 movd mm7, [esi + ebx*4 + 8]
2079 pfsubr mm6,mm0 ;# dr = ir - jr
2080 pfsubr mm7,mm1
2081 movq [esp + i0310_dx2], mm6 ;# store dr
2082 movd [esp + i0310_dz2], mm7
2083 pfmul mm6,mm6 ;# square dx,dy,dz
2084 pfmul mm7,mm7
2085 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
2086 pfacc mm6, mm7 ;# second rsq in lower mm6
2088 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
2089 pfrsqrt mm1, mm6
2092 punpckldq mm0,mm1
2093 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs.
2094 movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision
2095 pfmul mm0,mm0
2096 pfrsqit1 mm0,mm4
2097 pfrcpit2 mm0,mm2
2098 pfmul mm4, mm0
2099 movq mm1, mm4
2100 ;# mm0 is invsqrt, and mm1 r
2101 ;# do potential and fscal
2102 pfmul mm1, [esp + i0310_tsc] ;# mm1=rt
2103 pf2iw mm4,mm1
2104 movq [esp + i0310_n1], mm4
2105 pi2fd mm4,mm4
2106 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
2108 movq mm2,mm1
2109 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
2111 mov edx, [ebp + i0310_VFtab]
2112 ;# dispersion table
2113 mov ecx, [esp + i0310_n1]
2114 shl ecx, 3
2115 ;# load all the table values we need
2116 movd mm4, [edx + ecx*4]
2117 movd mm5, [edx + ecx*4 + 4]
2118 movd mm6, [edx + ecx*4 + 8]
2119 movd mm7, [edx + ecx*4 + 12]
2120 mov ecx, [esp + i0310_n1 + 4]
2121 shl ecx, 3
2122 punpckldq mm4, [edx + ecx*4]
2123 punpckldq mm5, [edx + ecx*4 + 4]
2124 punpckldq mm6, [edx + ecx*4 + 8]
2125 punpckldq mm7, [edx + ecx*4 + 12]
2126 pfmul mm6, mm1 ;# mm6 = Geps
2127 pfmul mm7, mm2 ;# mm7 = Heps2
2128 pfadd mm5, mm6
2129 pfadd mm5, mm7 ;# mm5 = Fp
2130 pfmul mm7, [esp + i0310_two] ;# two*Heps2
2131 pfadd mm7, mm6
2132 pfadd mm7, mm5 ;# mm7=FF
2133 pfmul mm5, mm1 ;# mm5=eps*Fp
2134 pfadd mm5, mm4 ;# mm5= VV
2136 movq mm4, [esp + i0310_c6]
2137 pfmul mm7, mm4 ;# fijD
2138 pfmul mm5, mm4 ;# vnb6
2139 movq mm3, mm7 ;# add to fscal
2141 ;# update vnbtot to release mm5!
2142 pfadd mm5, [esp + i0310_vnbtot] ;# add the earlier value
2143 movq [esp + i0310_vnbtot], mm5 ;# store the sum
2145 ;# repulsion table
2146 mov ecx, [esp + i0310_n1]
2147 shl ecx, 3
2148 ;# load all the table values we need
2149 movd mm4, [edx + ecx*4 + 16]
2150 movd mm5, [edx + ecx*4 + 20]
2151 movd mm6, [edx + ecx*4 + 24]
2152 movd mm7, [edx + ecx*4 + 28]
2153 mov ecx, [esp + i0310_n1 + 4]
2154 shl ecx, 3
2155 punpckldq mm4, [edx + ecx*4 + 16]
2156 punpckldq mm5, [edx + ecx*4 + 20]
2157 punpckldq mm6, [edx + ecx*4 + 24]
2158 punpckldq mm7, [edx + ecx*4 + 28]
2160 pfmul mm6, mm1 ;# mm6 = Geps
2161 pfmul mm7, mm2 ;# mm7 = Heps2
2162 pfadd mm5, mm6
2163 pfadd mm5, mm7 ;# mm5 = Fp
2164 pfmul mm7, [esp + i0310_two] ;# two*Heps2
2165 pfadd mm7, mm6
2166 pfadd mm7, mm5 ;# mm7=FF
2167 pfmul mm5, mm1 ;# mm5=eps*Fp
2168 pfadd mm5, mm4 ;# mm5= VV
2170 movq mm6, [esp + i0310_c12]
2171 pfmul mm7, mm6 ;# fijR
2172 pfmul mm5, mm6 ;# vnb12
2173 pfadd mm3, mm7 ;# total fscal fijD+ fijR
2175 ;# change sign of mm3
2176 pxor mm1,mm1
2177 pfsub mm1, mm3
2178 pfmul mm1, [esp + i0310_tsc]
2179 pfmul mm0, mm1 ;# mm0 is total fscal now
2181 prefetchw [esp + i0310_dx1] ;# prefetch i forces to cache
2183 ;# spread fscalar to both positions
2184 movq mm1,mm0
2185 punpckldq mm0,mm0
2186 punpckhdq mm1,mm1
2188 ;# calc vector force
2189 prefetchw [edi + eax*4] ;# prefetch the 1st faction to cache
2190 movq mm2, [esp + i0310_dx1] ;# fetch dr
2191 movd mm3, [esp + i0310_dz1]
2193 ;# update vnbtot
2194 pfadd mm5, [esp + i0310_vnbtot] ;# add the earlier value
2195 movq [esp + i0310_vnbtot], mm5 ;# store the sum
2197 prefetchw [edi + ebx*4] ;# prefetch the 2nd faction to cache
2198 pfmul mm2, mm0 ;# mult by fs
2199 pfmul mm3, mm0
2201 movq mm4, [esp + i0310_dx2] ;# fetch dr
2202 movd mm5, [esp + i0310_dz2]
2203 pfmul mm4, mm1 ;# mult by fs
2204 pfmul mm5, mm1
2205 ;# update i forces
2207 movq mm0, [esp + i0310_fix]
2208 movd mm1, [esp + i0310_fiz]
2209 pfadd mm0, mm2
2210 pfadd mm1, mm3
2212 pfadd mm0, mm4
2213 pfadd mm1, mm5
2214 movq [esp + i0310_fix], mm0
2215 movd [esp + i0310_fiz], mm1
2216 ;# update j forces
2218 movq mm0, [edi + eax*4]
2219 movd mm1, [edi + eax*4 + 8]
2220 movq mm6, [edi + ebx*4]
2221 movd mm7, [edi + ebx*4 + 8]
2223 pfsub mm0, mm2
2224 pfsub mm1, mm3
2225 pfsub mm6, mm4
2226 pfsub mm7, mm5
2228 movq [edi + eax*4], mm0
2229 movd [edi + eax*4 +8], mm1
2230 movq [edi + ebx*4], mm6
2231 movd [edi + ebx*4 + 8], mm7
2233 ;# should we do one more iteration?
2234 sub dword ptr [esp + i0310_innerk], 2
2235 jl .i0310_finish_vdwc_inner
2236 jmp .i0310_unroll_vdwc_loop
2237 .i0310_finish_vdwc_inner:
2238 and dword ptr [esp + i0310_innerk], 1
2239 jnz .i0310_single_vdwc_inner
2240 jmp .i0310_updateouterdata_vdwc
2241 .i0310_single_vdwc_inner:
2242 ;# a single j particle iteration here - compare with the unrolled code for comments
2243 mov eax, [esp + i0310_innerjjnr]
2244 mov eax, [eax] ;# eax=jnr offset
2246 mov esi, [ebp + i0310_nbfp]
2247 mov ecx, [ebp + i0310_type]
2248 mov edx, [ecx + eax*4] ;# type [jnr1]
2249 shl edx, 1
2250 add edx, [esp + i0310_ntia] ;# tja = ntia + 2*type
2251 movd mm5, [esi + edx*4] ;# mm5 = 1st c6
2252 movq [esp + i0310_c6], mm5
2253 movd mm5, [esi + edx*4 + 4] ;# mm5 = 1st c12
2254 movq [esp + i0310_c12], mm5
2256 mov esi, [ebp + i0310_pos]
2257 lea eax, [eax + eax*2]
2259 movq mm0, [esp + i0310_ix]
2260 movd mm1, [esp + i0310_iz]
2261 movq mm4, [esi + eax*4]
2262 movd mm5, [esi + eax*4 + 8]
2263 pfsubr mm4, mm0
2264 pfsubr mm5, mm1
2265 movq [esp + i0310_dx1], mm4
2266 pfmul mm4,mm4
2267 movd [esp + i0310_dz1], mm5
2268 pfmul mm5,mm5
2269 pfacc mm4, mm5
2270 pfacc mm4, mm5 ;# mm0=rsq
2272 pfrsqrt mm0,mm4
2273 movq mm2,mm0
2274 pfmul mm0,mm0
2275 pfrsqit1 mm0,mm4
2276 pfrcpit2 mm0,mm2 ;# mm1=invsqrt
2277 pfmul mm4, mm0
2278 movq mm1, mm4
2279 ;# mm0 is invsqrt, and mm1 r
2281 ;# calculate potentials and scalar force
2282 pfmul mm1, [esp + i0310_tsc] ;# mm1=rt
2283 pf2iw mm4,mm1
2284 movd [esp + i0310_n1], mm4
2285 pi2fd mm4,mm4
2286 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
2288 movq mm2,mm1
2289 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
2291 mov edx, [ebp + i0310_VFtab]
2292 mov ecx, [esp + i0310_n1]
2293 shl ecx, 3
2294 ;# dispersion table
2295 ;# load all the table values we need
2296 movd mm4, [edx + ecx*4]
2297 movd mm5, [edx + ecx*4 + 4]
2298 movd mm6, [edx + ecx*4 + 8]
2299 movd mm7, [edx + ecx*4 + 12]
2300 pfmul mm6, mm1 ;# mm6 = Geps
2301 pfmul mm7, mm2 ;# mm7 = Heps2
2302 pfadd mm5, mm6
2303 pfadd mm5, mm7 ;# mm5 = Fp
2304 pfmul mm7, [esp + i0310_two] ;# two*Heps2
2305 pfadd mm7, mm6
2306 pfadd mm7, mm5 ;# mm7=FF
2307 pfmul mm5, mm1 ;# mm5=eps*Fp
2308 pfadd mm5, mm4 ;# mm5= VV
2310 movq mm4, [esp + i0310_c6]
2311 pfmul mm7, mm4 ;# fijD
2312 pfmul mm5, mm4 ;# vnb6
2313 movq mm3, mm7 ;# add to fscal
2315 ;# update vnbtot to release mm5!
2316 pfadd mm5, [esp + i0310_vnbtot] ;# add the earlier value
2317 movq [esp + i0310_vnbtot], mm5 ;# store the sum
2319 ;# repulsion table
2320 ;# load all the table values we need
2321 movd mm4, [edx + ecx*4 + 16]
2322 movd mm5, [edx + ecx*4 + 20]
2323 movd mm6, [edx + ecx*4 + 24]
2324 movd mm7, [edx + ecx*4 + 28]
2326 pfmul mm6, mm1 ;# mm6 = Geps
2327 pfmul mm7, mm2 ;# mm7 = Heps2
2328 pfadd mm5, mm6
2329 pfadd mm5, mm7 ;# mm5 = Fp
2330 pfmul mm7, [esp + i0310_two] ;# two*Heps2
2331 pfadd mm7, mm6
2332 pfadd mm7, mm5 ;# mm7=FF
2333 pfmul mm5, mm1 ;# mm5=eps*Fp
2334 pfadd mm5, mm4 ;# mm5= VV
2336 movq mm6, [esp + i0310_c12]
2337 pfmul mm7, mm6 ;# fijR
2338 pfmul mm5, mm6 ;# vnb12
2339 pfadd mm3, mm7 ;# total fscal fijC+ fijD+ fijR
2341 ;# change sign of mm3
2342 pxor mm1,mm1
2343 pfsub mm1, mm3
2344 pfmul mm0, [esp + i0310_tsc]
2345 pfmul mm0, mm1 ;# mm0 is total fscal now
2347 ;# update vnbtot
2348 pfadd mm5, [esp + i0310_vnbtot] ;# add the earlier value
2349 movq [esp + i0310_vnbtot], mm5 ;# store the sum
2351 ;# spread fscalar to both positions
2352 punpckldq mm0,mm0
2353 ;# calc vectorial force
2354 prefetchw [edi + eax*4] ;# prefetch faction to cache
2355 movq mm2, [esp + i0310_dx1]
2356 movd mm3, [esp + i0310_dz1]
2358 pfmul mm2, mm0
2359 pfmul mm3, mm0
2361 ;# update i particle force
2362 movq mm0, [esp + i0310_fix]
2363 movd mm1, [esp + i0310_fiz]
2364 pfadd mm0, mm2
2365 pfadd mm1, mm3
2366 movq [esp + i0310_fix], mm0
2367 movd [esp + i0310_fiz], mm1
2368 ;# update j particle force
2369 movq mm0, [edi + eax*4]
2370 movd mm1, [edi + eax *4+ 8]
2371 pfsub mm0, mm2
2372 pfsub mm1, mm3
2373 movq [edi + eax*4], mm0
2374 movd [edi + eax*4 +8], mm1
2375 ;# done!
2376 .i0310_updateouterdata_vdwc:
2377 mov ecx, [esp + i0310_ii3]
2379 movq mm6, [edi + ecx*4] ;# increment i force
2380 movd mm7, [edi + ecx*4 + 8]
2381 pfadd mm6, [esp + i0310_fix]
2382 pfadd mm7, [esp + i0310_fiz]
2383 movq [edi + ecx*4], mm6
2384 movd [edi + ecx*4 +8], mm7
2386 mov ebx, [ebp + i0310_fshift] ;# increment fshift force
2387 mov edx, [esp + i0310_is3]
2389 movq mm6, [ebx + edx*4]
2390 movd mm7, [ebx + edx*4 + 8]
2391 pfadd mm6, [esp + i0310_fix]
2392 pfadd mm7, [esp + i0310_fiz]
2393 movq [ebx + edx*4], mm6
2394 movd [ebx + edx*4 + 8], mm7
2396 ;# loop back to mno
2397 dec dword ptr [esp + i0310_nsvdwc]
2398 jz .i0310_testvdw
2399 jmp .i0310_mno_vdwc
2400 .i0310_testvdw:
2401 mov ebx, [esp + i0310_nscoul]
2402 add [esp + i0310_solnr], ebx
2404 mov ecx, [esp + i0310_nsvdw]
2405 cmp ecx, 0
2406 jnz .i0310_mno_vdw
2407 jmp .i0310_last_mno
2408 .i0310_mno_vdw:
2409 mov ebx, [esp + i0310_solnr]
2410 inc dword ptr [esp + i0310_solnr]
2412 mov edx, [ebp + i0310_type]
2413 mov edx, [edx + ebx*4]
2414 imul edx, [ebp + i0310_ntype]
2415 shl edx, 1
2416 mov [esp + i0310_ntia], edx
2418 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
2419 mov eax, [ebp + i0310_pos] ;# eax = base of pos[]
2420 mov [esp + i0310_ii3], ebx
2422 movq mm0, [eax + ebx*4]
2423 movd mm1, [eax + ebx*4 + 8]
2424 pfadd mm0, [esp + i0310_shX]
2425 pfadd mm1, [esp + i0310_shZ]
2426 movq [esp + i0310_ix], mm0
2427 movd [esp + i0310_iz], mm1
2429 ;# clear forces
2430 pxor mm7,mm7
2431 movq [esp + i0310_fix], mm7
2432 movd [esp + i0310_fiz], mm7
2434 mov ecx, [esp + i0310_innerjjnr0]
2435 mov [esp + i0310_innerjjnr], ecx
2436 mov edx, [esp + i0310_innerk0]
2437 sub edx, 2
2438 mov [esp + i0310_innerk], edx ;# number of innerloop atoms
2439 jge .i0310_unroll_vdw_loop
2440 jmp .i0310_finish_vdw_inner
2441 .i0310_unroll_vdw_loop:
2442 ;# paired innerloop starts here
2443 mov ecx, [esp + i0310_innerjjnr] ;# pointer to jjnr[k]
2444 mov eax, [ecx]
2445 mov ebx, [ecx + 4] ;# eax/ebx=jnr
2446 add dword ptr [esp + i0310_innerjjnr], 8 ;# advance pointer (unrolled 2)
2447 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
2449 mov ecx, [ebp + i0310_type]
2450 mov edx, [ecx + eax*4] ;# type [jnr1]
2451 mov ecx, [ecx + ebx*4] ;# type [jnr2]
2453 mov esi, [ebp + i0310_nbfp] ;# base of nbfp
2454 shl edx, 1
2455 shl ecx, 1
2456 add edx, [esp + i0310_ntia] ;# tja = ntia + 2*type
2457 add ecx, [esp + i0310_ntia]
2459 movq mm5, [esi + edx*4] ;# mm5 = 1st c6 / c12
2460 movq mm7, [esi + ecx*4] ;# mm7 = 2nd c6 / c12
2461 movq mm6,mm5
2462 punpckldq mm5,mm7 ;# mm5 = 1st c6 / 2nd c6
2463 punpckhdq mm6,mm7 ;# mm6 = 1st c12 / 2nd c12
2464 movq [esp + i0310_c6], mm5
2465 movq [esp + i0310_c12], mm6
2467 lea eax, [eax + eax*2] ;# replace jnr with j3
2468 lea ebx, [ebx + ebx*2]
2470 mov esi, [ebp + i0310_pos]
2472 movq mm0, [esp + i0310_ix]
2473 movd mm1, [esp + i0310_iz]
2474 movq mm4, [esi + eax*4] ;# fetch first j coordinates
2475 movd mm5, [esi + eax*4 + 8]
2476 pfsubr mm4,mm0 ;# dr = ir - jr
2477 pfsubr mm5,mm1
2478 movq [esp + i0310_dx1], mm4 ;# store dr
2479 movd [esp + i0310_dz1], mm5
2480 pfmul mm4,mm4 ;# square dx,dy,dz
2481 pfmul mm5,mm5
2482 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
2483 pfacc mm4, mm5 ;# first rsq in lower mm4
2485 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
2486 movd mm7, [esi + ebx*4 + 8]
2488 pfsubr mm6,mm0 ;# dr = ir - jr
2489 pfsubr mm7,mm1
2490 movq [esp + i0310_dx2], mm6 ;# store dr
2491 movd [esp + i0310_dz2], mm7
2492 pfmul mm6,mm6 ;# square dx,dy,dz
2493 pfmul mm7,mm7
2494 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
2495 pfacc mm6, mm7 ;# second rsq in lower mm6
2497 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
2498 pfrsqrt mm1, mm6
2501 punpckldq mm0,mm1
2502 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs
2503 movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision
2504 pfmul mm0,mm0
2505 pfrsqit1 mm0,mm4
2506 pfrcpit2 mm0,mm2
2507 pfmul mm4, mm0
2508 movq mm1, mm4
2509 ;# mm0 is invsqrt, and mm1 r
2510 ;# do potential and fscal
2511 pfmul mm1, [esp + i0310_tsc] ;# mm1=rt
2512 pf2iw mm4,mm1
2513 movq [esp + i0310_n1], mm4
2514 pi2fd mm4,mm4
2515 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
2517 movq mm2,mm1
2518 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
2520 mov edx, [ebp + i0310_VFtab]
2521 ;# dispersion table
2522 mov ecx, [esp + i0310_n1]
2523 shl ecx, 3
2524 ;# load all the table values we need
2525 movd mm4, [edx + ecx*4]
2526 movd mm5, [edx + ecx*4 + 4]
2527 movd mm6, [edx + ecx*4 + 8]
2528 movd mm7, [edx + ecx*4 + 12]
2529 mov ecx, [esp + i0310_n1 + 4]
2530 shl ecx, 3
2531 punpckldq mm4, [edx + ecx*4]
2532 punpckldq mm5, [edx + ecx*4 + 4]
2533 punpckldq mm6, [edx + ecx*4 + 8]
2534 punpckldq mm7, [edx + ecx*4 + 12]
2535 pfmul mm6, mm1 ;# mm6 = Geps
2536 pfmul mm7, mm2 ;# mm7 = Heps2
2537 pfadd mm5, mm6
2538 pfadd mm5, mm7 ;# mm5 = Fp
2539 pfmul mm7, [esp + i0310_two] ;# two*Heps2
2540 pfadd mm7, mm6
2541 pfadd mm7, mm5 ;# mm7=FF
2542 pfmul mm5, mm1 ;# mm5=eps*Fp
2543 pfadd mm5, mm4 ;# mm5= VV
2545 movq mm4, [esp + i0310_c6]
2546 pfmul mm7, mm4 ;# fijD
2547 pfmul mm5, mm4 ;# vnb6
2548 movq mm3, mm7 ;# add to fscal
2550 ;# update vnbtot to release mm5!
2551 pfadd mm5, [esp + i0310_vnbtot] ;# add the earlier value
2552 movq [esp + i0310_vnbtot], mm5 ;# store the sum
2554 ;# repulsion table
2555 mov ecx, [esp + i0310_n1]
2556 shl ecx, 3
2557 ;# load all the table values we need
2558 movd mm4, [edx + ecx*4 + 16]
2559 movd mm5, [edx + ecx*4 + 20]
2560 movd mm6, [edx + ecx*4 + 24]
2561 movd mm7, [edx + ecx*4 + 28]
2562 mov ecx, [esp + i0310_n1 + 4]
2563 shl ecx, 3
2564 punpckldq mm4, [edx + ecx*4 + 16]
2565 punpckldq mm5, [edx + ecx*4 + 20]
2566 punpckldq mm6, [edx + ecx*4 + 24]
2567 punpckldq mm7, [edx + ecx*4 + 28]
2569 pfmul mm6, mm1 ;# mm6 = Geps
2570 pfmul mm7, mm2 ;# mm7 = Heps2
2571 pfadd mm5, mm6
2572 pfadd mm5, mm7 ;# mm5 = Fp
2573 pfmul mm7, [esp + i0310_two] ;# two*Heps2
2574 pfadd mm7, mm6
2575 pfadd mm7, mm5 ;# mm7=FF
2576 pfmul mm5, mm1 ;# mm5=eps*Fp
2577 pfadd mm5, mm4 ;# mm5= VV
2579 movq mm6, [esp + i0310_c12]
2580 pfmul mm7, mm6 ;# fijR
2581 pfmul mm5, mm6 ;# vnb12
2582 pfadd mm3, mm7 ;# total fscal fijD+ fijR
2584 ;# change sign of mm3
2585 pxor mm1,mm1
2586 pfsub mm1, mm3
2587 pfmul mm1, [esp + i0310_tsc]
2588 pfmul mm0, mm1 ;# mm0 is total fscal now
2590 prefetchw [esp + i0310_dx1] ;# prefetch i forces to cache
2592 ;# spread fscalar to both positions
2593 movq mm1,mm0
2594 punpckldq mm0,mm0
2595 punpckhdq mm1,mm1
2597 ;# calc vector force
2598 prefetchw [edi + eax*4] ;# prefetch the 1st faction to cache
2599 movq mm2, [esp + i0310_dx1] ;# fetch dr
2600 movd mm3, [esp + i0310_dz1]
2602 ;# update vnbtot
2603 pfadd mm5, [esp + i0310_vnbtot] ;# add the earlier value
2604 movq [esp + i0310_vnbtot], mm5 ;# store the sum
2606 prefetchw [edi + ebx*4] ;# prefetch the 2nd faction to cache
2607 pfmul mm2, mm0 ;# mult by fs
2608 pfmul mm3, mm0
2610 movq mm4, [esp + i0310_dx2] ;# fetch dr
2611 movd mm5, [esp + i0310_dz2]
2612 pfmul mm4, mm1 ;# mult by fs
2613 pfmul mm5, mm1
2614 ;# update i forces
2616 movq mm0, [esp + i0310_fix]
2617 movd mm1, [esp + i0310_fiz]
2618 pfadd mm0, mm2
2619 pfadd mm1, mm3
2621 pfadd mm0, mm4
2622 pfadd mm1, mm5
2623 movq [esp + i0310_fix], mm0
2624 movd [esp + i0310_fiz], mm1
2625 ;# update j forces
2627 movq mm0, [edi + eax*4]
2628 movd mm1, [edi + eax*4 + 8]
2629 movq mm6, [edi + ebx*4]
2630 movd mm7, [edi + ebx*4 + 8]
2632 pfsub mm0, mm2
2633 pfsub mm1, mm3
2634 pfsub mm6, mm4
2635 pfsub mm7, mm5
2637 movq [edi + eax*4], mm0
2638 movd [edi + eax*4 +8], mm1
2639 movq [edi + ebx*4], mm6
2640 movd [edi + ebx*4 + 8], mm7
2642 ;# should we do one more iteration?
2643 sub dword ptr [esp + i0310_innerk], 2
2644 jl .i0310_finish_vdw_inner
2645 jmp .i0310_unroll_vdw_loop
2646 .i0310_finish_vdw_inner:
2647 and dword ptr [esp + i0310_innerk], 1
2648 jnz .i0310_single_vdw_inner
2649 jmp .i0310_updateouterdata_vdw
2650 .i0310_single_vdw_inner:
2651 ;# a single j particle iteration here - compare with the unrolled code for comments
2652 mov eax, [esp + i0310_innerjjnr]
2653 mov eax, [eax] ;# eax=jnr offset
2655 mov esi, [ebp + i0310_nbfp]
2656 mov ecx, [ebp + i0310_type]
2657 mov edx, [ecx + eax*4] ;# type [jnr1]
2658 shl edx, 1
2659 add edx, [esp + i0310_ntia] ;# tja = ntia + 2*type
2660 movd mm5, [esi + edx*4] ;# mm5 = 1st c6
2661 movq [esp + i0310_c6], mm5
2662 movd mm5, [esi + edx*4 + 4] ;# mm5 = 1st c12
2663 movq [esp + i0310_c12], mm5
2665 mov esi, [ebp + i0310_pos]
2666 lea eax, [eax + eax*2]
2668 movq mm0, [esp + i0310_ix]
2669 movd mm1, [esp + i0310_iz]
2670 movq mm4, [esi + eax*4]
2671 movd mm5, [esi + eax*4 + 8]
2672 pfsubr mm4, mm0
2673 pfsubr mm5, mm1
2674 movq [esp + i0310_dx1], mm4
2675 pfmul mm4,mm4
2676 movd [esp + i0310_dz1], mm5
2677 pfmul mm5,mm5
2678 pfacc mm4, mm5
2679 pfacc mm4, mm5 ;# mm0=rsq
2681 pfrsqrt mm0,mm4
2682 movq mm2,mm0
2683 pfmul mm0,mm0
2684 pfrsqit1 mm0,mm4
2685 pfrcpit2 mm0,mm2 ;# mm1=invsqrt
2686 pfmul mm4, mm0
2687 movq mm1, mm4
2688 ;# mm0 is invsqrt, and mm1 r
2690 ;# calculate potentials and scalar force
2691 pfmul mm1, [esp + i0310_tsc] ;# mm1=rt
2692 pf2iw mm4,mm1
2693 movd [esp + i0310_n1], mm4
2694 pi2fd mm4,mm4
2695 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
2697 movq mm2,mm1
2698 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
2700 mov edx, [ebp + i0310_VFtab]
2701 mov ecx, [esp + i0310_n1]
2702 shl ecx, 3
2703 ;# dispersion table
2704 ;# load all the table values we need
2705 movd mm4, [edx + ecx*4]
2706 movd mm5, [edx + ecx*4 + 4]
2707 movd mm6, [edx + ecx*4 + 8]
2708 movd mm7, [edx + ecx*4 + 12]
2709 pfmul mm6, mm1 ;# mm6 = Geps
2710 pfmul mm7, mm2 ;# mm7 = Heps2
2711 pfadd mm5, mm6
2712 pfadd mm5, mm7 ;# mm5 = Fp
2713 pfmul mm7, [esp + i0310_two] ;# two*Heps2
2714 pfadd mm7, mm6
2715 pfadd mm7, mm5 ;# mm7=FF
2716 pfmul mm5, mm1 ;# mm5=eps*Fp
2717 pfadd mm5, mm4 ;# mm5= VV
2719 movq mm4, [esp + i0310_c6]
2720 pfmul mm7, mm4 ;# fijD
2721 pfmul mm5, mm4 ;# vnb6
2722 movq mm3, mm7 ;# add to fscal
2724 ;# update vnbtot to release mm5!
2725 pfadd mm5, [esp + i0310_vnbtot] ;# add the earlier value
2726 movq [esp + i0310_vnbtot], mm5 ;# store the sum
2728 ;# repulsion table
2729 ;# load all the table values we need
2730 movd mm4, [edx + ecx*4 + 16]
2731 movd mm5, [edx + ecx*4 + 20]
2732 movd mm6, [edx + ecx*4 + 24]
2733 movd mm7, [edx + ecx*4 + 28]
2735 pfmul mm6, mm1 ;# mm6 = Geps
2736 pfmul mm7, mm2 ;# mm7 = Heps2
2737 pfadd mm5, mm6
2738 pfadd mm5, mm7 ;# mm5 = Fp
2739 pfmul mm7, [esp + i0310_two] ;# two*Heps2
2740 pfadd mm7, mm6
2741 pfadd mm7, mm5 ;# mm7=FF
2742 pfmul mm5, mm1 ;# mm5=eps*Fp
2743 pfadd mm5, mm4 ;# mm5= VV
2745 movq mm6, [esp + i0310_c12]
2746 pfmul mm7, mm6 ;# fijR
2747 pfmul mm5, mm6 ;# vnb12
2748 pfadd mm3, mm7 ;# total fscal fijC+ fijD+ fijR
2750 ;# change sign of mm3
2751 pxor mm1,mm1
2752 pfsub mm1, mm3
2753 pfmul mm0, [esp + i0310_tsc]
2754 pfmul mm0, mm1 ;# mm0 is total fscal now
2756 ;# update vnbtot
2757 pfadd mm5, [esp + i0310_vnbtot] ;# add the earlier value
2758 movq [esp + i0310_vnbtot], mm5 ;# store the sum
2760 ;# spread fscalar to both positions
2761 punpckldq mm0,mm0
2762 ;# calc vectorial force
2763 prefetchw [edi + eax*4] ;# prefetch faction to cache
2764 movq mm2, [esp + i0310_dx1]
2765 movd mm3, [esp + i0310_dz1]
2767 pfmul mm2, mm0
2768 pfmul mm3, mm0
2770 ;# update i particle force
2771 movq mm0, [esp + i0310_fix]
2772 movd mm1, [esp + i0310_fiz]
2773 pfadd mm0, mm2
2774 pfadd mm1, mm3
2775 movq [esp + i0310_fix], mm0
2776 movd [esp + i0310_fiz], mm1
2777 ;# update j particle force
2778 movq mm0, [edi + eax*4]
2779 movd mm1, [edi + eax *4+ 8]
2780 pfsub mm0, mm2
2781 pfsub mm1, mm3
2782 movq [edi + eax*4], mm0
2783 movd [edi + eax*4 +8], mm1
2784 ;# done!
2785 .i0310_updateouterdata_vdw:
2786 mov ecx, [esp + i0310_ii3]
2788 movq mm6, [edi + ecx*4] ;# increment i force
2789 movd mm7, [edi + ecx*4 + 8]
2790 pfadd mm6, [esp + i0310_fix]
2791 pfadd mm7, [esp + i0310_fiz]
2792 movq [edi + ecx*4], mm6
2793 movd [edi + ecx*4 +8], mm7
2795 mov ebx, [ebp + i0310_fshift] ;# increment fshift force
2796 mov edx, [esp + i0310_is3]
2798 movq mm6, [ebx + edx*4]
2799 movd mm7, [ebx + edx*4 + 8]
2800 pfadd mm6, [esp + i0310_fix]
2801 pfadd mm7, [esp + i0310_fiz]
2802 movq [ebx + edx*4], mm6
2803 movd [ebx + edx*4 + 8], mm7
2805 ;# loop back to mno
2806 dec dword ptr [esp + i0310_nsvdw]
2807 jz .i0310_last_mno
2808 jmp .i0310_mno_vdw
2810 .i0310_last_mno:
2811 mov edx, [ebp + i0310_gid] ;# get group index for this i particle
2812 mov edx, [edx]
2813 add dword ptr [ebp + i0310_gid], 4 ;# advance pointer
2815 movq mm7, [esp + i0310_vnbtot]
2816 pfacc mm7,mm7 ;# get and sum the two parts of total potential
2818 mov eax, [ebp + i0310_Vnb]
2819 movd mm6, [eax + edx*4]
2820 pfadd mm6, mm7
2821 movd [eax + edx*4], mm6 ;# increment vc[gid]
2822 ;# finish if last
2823 mov ecx, [ebp + i0310_nri]
2824 dec ecx
2825 jecxz .i0310_end
2826 ;# not last, iterate once more!
2827 mov [ebp + i0310_nri], ecx
2828 jmp .i0310_outer
2829 .i0310_end:
2830 femms
2831 add esp, 152
2832 pop edi
2833 pop esi
2834 pop edx
2835 pop ecx
2836 pop ebx
2837 pop eax
2838 leave
;# ---------------------------------------------------------------------
;# inl1000_3dnow / _inl1000_3dnow
;#
;# Nonbonded inner loop that evaluates a plain 1/r electrostatic
;# interaction (vcoul = qq/r, fscal = vcoul/r^2) between each i particle
;# and the j particles of its neighbour list, using 3DNow packed floats.
;#
;# Arguments are read from the stack through ebp (offsets in the first
;# .equiv table below):
;#   nri       number of outer (i particle) iterations, assumed >= 1
;#   iinr      pointer to 32-bit i particle indices, one per iteration
;#   jindex    pointer to 32-bit start offsets into jjnr; entry n and
;#             entry n+1 delimit the j list of iteration n
;#   jjnr      pointer to 32-bit j particle indices
;#   shift     pointer to 32-bit shift indices, one per iteration
;#   shiftvec  base of the shift vectors, 3 floats per shift index
;#   fshift    base of the shift forces, 3 floats per shift index
;#   gid       pointer to 32-bit group indices, one per iteration
;#   pos       base of the coordinates, 3 floats per particle
;#   faction   base of the forces, 3 floats per particle
;#   charge    base of the charges, 1 float per particle
;#   facel     float passed by value, multiplied into charge[ii]
;#   Vc        base of the potential accumulators, indexed by gid
;#
;# Side effects visible to the caller:
;#   - faction[], fshift[] and Vc[] are updated in memory.
;#   - The argument slots nri, iinr, jindex, shift and gid are used as
;#     loop state and are modified on the caller stack.
;#
;# Register use: all six general registers that are touched are pushed
;# on entry and popped on exit.  esi = pos and edi = faction during the
;# inner loops.  femms is issued on entry and before returning.
;# ---------------------------------------------------------------------
.globl inl1000_3dnow
.globl _inl1000_3dnow
inl1000_3dnow:
_inl1000_3dnow:
	;# argument offsets relative to ebp
.equiv i1000_nri, 8
.equiv i1000_iinr, 12
.equiv i1000_jindex, 16
.equiv i1000_jjnr, 20
.equiv i1000_shift, 24
.equiv i1000_shiftvec, 28
.equiv i1000_fshift, 32
.equiv i1000_gid, 36
.equiv i1000_pos, 40
.equiv i1000_faction, 44
.equiv i1000_charge, 48
.equiv i1000_facel, 52
.equiv i1000_Vc, 56
	;# local variable offsets relative to esp (80 bytes in total)
.equiv i1000_is3, 0			;# 3*shift index
.equiv i1000_ii3, 4			;# 3*i particle index
.equiv i1000_ix, 8			;# shifted i coordinates: x,y as a pair ...
.equiv i1000_iy, 12
.equiv i1000_iz, 16			;# ... and z on its own
.equiv i1000_iq, 20			;# facel*charge[ii], same value in both halves
.equiv i1000_vctot, 28			;# two partial potential sums
.equiv i1000_innerjjnr, 36		;# running pointer into jjnr
.equiv i1000_innerk, 40			;# remaining j count minus 2
.equiv i1000_fix, 44			;# accumulated i force: x,y pair ...
.equiv i1000_fiy, 48
.equiv i1000_fiz, 52			;# ... and z
.equiv i1000_dx1, 56			;# dr to the first j particle
.equiv i1000_dy1, 60
.equiv i1000_dz1, 64
.equiv i1000_dx2, 68			;# dr to the second j particle
.equiv i1000_dy2, 72
.equiv i1000_dz2, 76

	;# prologue: frame pointer, save registers, reserve locals
	push ebp
	mov ebp,esp
	push eax
	push ebx
	push ecx
	push edx
	push esi
	push edi
	sub esp, 80			;# room for the locals listed above
	femms
	;# no check of nri here - the first iteration always runs

.i1000_outer:
	;# ---- per i particle setup ----
	mov eax, [ebp + i1000_shift]	;# eax = current position in shift[]
	mov ebx, [eax]			;# ebx = shift index
	add dword ptr [ebp + i1000_shift], 4	;# step to the next entry

	lea ebx, [ebx + ebx*2]		;# ebx = 3*shift index
	mov [esp + i1000_is3],ebx	;# kept for the fshift update

	mov eax, [ebp + i1000_shiftvec]

	movq mm0, [eax + ebx*4]		;# mm0 = shX,shY
	movd mm1, [eax + ebx*4 + 8]	;# mm1 = shZ

	mov ecx, [ebp + i1000_iinr]	;# ecx = current position in iinr[]
	add dword ptr [ebp + i1000_iinr], 4	;# step to the next entry
	mov ebx, [ecx]			;# ebx = ii

	mov edx, [ebp + i1000_charge]
	movd mm2, [edx + ebx*4]		;# mm2 low = charge[ii], high = 0
	;# The 64-bit memory operand also reads the Vc argument slot into
	;# the high lane; that lane is overwritten by the punpckldq below.
	pfmul mm2, [ebp + i1000_facel]
	punpckldq mm2,mm2		;# copy the low lane into both lanes
	movq [esp + i1000_iq], mm2	;# iq = facel*charge[ii]

	lea ebx, [ebx + ebx*2]		;# ebx = ii3 = 3*ii
	mov eax, [ebp + i1000_pos]

	pfadd mm0, [eax + ebx*4]	;# ix,iy = shift + position
	movd mm3, [eax + ebx*4 + 8]	;# z goes through a register: only 4 bytes wanted
	mov [esp + i1000_ii3], ebx
	pfadd mm1, mm3			;# iz = shZ + posZ
	movq [esp + i1000_ix], mm0
	movd [esp + i1000_iz], mm1

	;# zero the potential and i force accumulators
	pxor mm7,mm7
	movq [esp + i1000_vctot], mm7
	movq [esp + i1000_fix], mm7
	movd [esp + i1000_fiz], mm7

	mov eax, [ebp + i1000_jindex]
	mov ecx, [eax]			;# ecx = jindex[n]
	mov edx, [eax + 4]		;# edx = jindex[n+1]
	add dword ptr [ebp + i1000_jindex], 4
	sub edx, ecx			;# edx = number of j particles

	mov esi, [ebp + i1000_pos]	;# esi = pos for the inner loops
	mov edi, [ebp + i1000_faction]	;# edi = faction for the inner loops
	mov eax, [ebp + i1000_jjnr]
	shl ecx, 2			;# byte offset of the first j index
	add eax, ecx
	mov [esp + i1000_innerjjnr], eax
	sub edx, 2			;# sets the flags tested by jge below
	mov [esp + i1000_innerk], edx	;# mov leaves the flags untouched
	jge .i1000_unroll_loop		;# at least two j particles available
	jmp .i1000_finish_inner

.i1000_unroll_loop:
	;# ---- two j particles per pass, one per mmx lane ----
	mov ecx, [esp + i1000_innerjjnr]
	mov eax, [ecx]			;# eax = first jnr
	mov ebx, [ecx + 4]		;# ebx = second jnr
	add dword ptr [esp + i1000_innerjjnr], 8	;# consumed two entries
	prefetch [ecx + 16]		;# look ahead in the j index list

	mov ecx, [ebp + i1000_charge]
	movq mm5, [esp + i1000_iq]
	movd mm3, [ecx + eax*4]		;# charge of first j
	movd mm7, [ecx + ebx*4]		;# charge of second j
	punpckldq mm3,mm7		;# mm3 = charge1,charge2
	pfmul mm3,mm5			;# mm3 = qq for both pairs

	lea eax, [eax + eax*2]		;# eax = 3*jnr1
	lea ebx, [ebx + ebx*2]		;# ebx = 3*jnr2

	movq mm0, [esp + i1000_ix]
	movd mm1, [esp + i1000_iz]
	movq mm4, [esi + eax*4]		;# x,y of first j
	movd mm5, [esi + eax*4 + 8]	;# z of first j
	pfsubr mm4,mm0			;# reversed subtract: dr = i - j
	pfsubr mm5,mm1
	movq [esp + i1000_dx1], mm4	;# dr is needed again for the force
	movd [esp + i1000_dz1], mm5
	pfmul mm4,mm4			;# squares of the components
	pfmul mm5,mm5
	pfacc mm4, mm5			;# low = dx2+dy2, high = dz2
	pfacc mm4, mm5			;# low lane of mm4 = rsq of pair 1

	movq mm6, [esi + ebx*4]		;# x,y of second j
	movd mm7, [esi + ebx*4 + 8]	;# z of second j

	pfsubr mm6,mm0			;# dr = i - j
	pfsubr mm7,mm1
	movq [esp + i1000_dx2], mm6
	movd [esp + i1000_dz2], mm7
	pfmul mm6,mm6
	pfmul mm7,mm7
	pfacc mm6, mm7
	pfacc mm6, mm7			;# low lane of mm6 = rsq of pair 2

	pfrsqrt mm0, mm4		;# approximate 1/sqrt(rsq), pair 1
	pfrsqrt mm1, mm6		;# approximate 1/sqrt(rsq), pair 2

	punpckldq mm0,mm1		;# mm0 = both seeds
	punpckldq mm4,mm6		;# mm4 = both rsq
	movq mm2,mm0			;# keep the seed for pfrcpit2
	pfmul mm0,mm0			;# refinement sequence: seed squared,
	pfrsqit1 mm0,mm4		;# first iteration step,
	pfrcpit2 mm0,mm2		;# second step gives the refined invsqrt
	movq mm1,mm0			;# mm1 = invsqrt
	pfmul mm0,mm0			;# mm0 = invsq

	prefetchw [esp + i1000_dx1]	;# warm the local scratch area for writing

	pfmul mm3,mm1			;# mm3 = vcoul = qq*invsqrt
	pfmul mm0,mm3			;# mm0 = fscal = vcoul*invsq

	pfadd mm3, [esp + i1000_vctot]	;# accumulate the potential per lane
	movq [esp + i1000_vctot], mm3

	movq mm1,mm0
	punpckldq mm0,mm0		;# mm0 = fscal of pair 1 in both lanes
	punpckhdq mm1,mm1		;# mm1 = fscal of pair 2 in both lanes

	prefetchw [edi + eax*4]		;# force of first j will be written
	movq mm2, [esp + i1000_dx1]
	movd mm3, [esp + i1000_dz1]
	prefetchw [edi + ebx*4]		;# force of second j will be written
	pfmul mm2, mm0			;# mm2,mm3 = force vector of pair 1
	pfmul mm3, mm0

	movq mm4, [esp + i1000_dx2]
	movd mm5, [esp + i1000_dz2]
	pfmul mm4, mm1			;# mm4,mm5 = force vector of pair 2
	pfmul mm5, mm1

	;# add both pair forces to the i accumulator
	movq mm0, [esp + i1000_fix]
	movd mm1, [esp + i1000_fiz]
	pfadd mm0, mm2
	pfadd mm1, mm3

	pfadd mm0, mm4
	pfadd mm1, mm5
	movq [esp + i1000_fix], mm0
	movd [esp + i1000_fiz], mm1

	;# subtract the pair forces from the two j particles
	movq mm0, [edi + eax*4]
	movd mm1, [edi + eax*4 + 8]
	movq mm6, [edi + ebx*4]
	movd mm7, [edi + ebx*4 + 8]

	pfsub mm0, mm2
	pfsub mm1, mm3
	pfsub mm6, mm4
	pfsub mm7, mm5

	movq [edi + eax*4], mm0
	movd [edi + eax*4 +8], mm1
	movq [edi + ebx*4], mm6
	movd [edi + ebx*4 + 8], mm7

	;# another full pair left?
	sub dword ptr [esp + i1000_innerk], 2
	jl .i1000_finish_inner
	jmp .i1000_unroll_loop

.i1000_finish_inner:
	;# innerk is now -1 when one j particle remains and -2 when none
	;# does, so bit 0 tells whether the single pass is needed.
	and dword ptr [esp + i1000_innerk], 1
	jnz .i1000_single_inner
	jmp .i1000_updateouterdata

.i1000_single_inner:
	;# ---- leftover j particle, low lane only ----
	mov eax, [esp + i1000_innerjjnr]
	mov eax, [eax]			;# eax = jnr

	mov ecx, [ebp + i1000_charge]
	movd mm6, [esp + i1000_iq]
	movd mm7, [ecx + eax*4]
	pfmul mm6, mm7			;# mm6 = qq

	lea eax, [eax + eax*2]		;# eax = 3*jnr

	movq mm0, [esp + i1000_ix]
	movd mm1, [esp + i1000_iz]
	movq mm2, [esi + eax*4]
	movd mm3, [esi + eax*4 + 8]
	pfsub mm0, mm2			;# dr = i - j
	pfsub mm1, mm3
	movq [esp + i1000_dx1], mm0
	pfmul mm0,mm0
	movd [esp + i1000_dz1], mm1
	pfmul mm1,mm1
	pfacc mm0, mm1
	pfacc mm0, mm1			;# low lane of mm0 = rsq

	pfrsqrt mm1,mm0			;# seed
	movq mm2,mm1
	pfmul mm1,mm1
	pfrsqit1 mm1,mm0
	pfrcpit2 mm1,mm2		;# mm1 = invsqrt
	movq mm4, mm1
	pfmul mm4, mm4			;# mm4 = invsq

	pfmul mm6, mm1			;# mm6 = vcoul
	pfmul mm4, mm6			;# mm4 = fscal

	pfadd mm6, [esp + i1000_vctot]	;# accumulate the potential
	movq [esp + i1000_vctot], mm6

	punpckldq mm4,mm4		;# fscal in both lanes

	prefetchw [edi + eax*4]		;# force of j will be written
	movq mm0, [esp + i1000_dx1]
	movd mm1, [esp + i1000_dz1]
	pfmul mm0, mm4			;# mm0,mm1 = force vector
	pfmul mm1, mm4

	;# add to the i accumulator
	movq mm2, [esp + i1000_fix]
	movd mm3, [esp + i1000_fiz]
	pfadd mm2, mm0
	pfadd mm3, mm1
	movq [esp + i1000_fix], mm2
	movd [esp + i1000_fiz], mm3

	;# subtract from the j particle
	movq mm2, [edi + eax*4]
	movd mm3, [edi + eax *4+ 8]
	pfsub mm2, mm0
	pfsub mm3, mm1
	movq [edi + eax*4], mm2
	movd [edi + eax*4 +8], mm3

.i1000_updateouterdata:
	;# ---- write back the results of this i particle ----
	mov ecx, [esp + i1000_ii3]

	movq mm6, [edi + ecx*4]		;# faction[ii3] += fi
	movd mm7, [edi + ecx*4 + 8]
	pfadd mm6, [esp + i1000_fix]
	;# 64-bit read: the high lane picks up dx1, but only the low lane
	;# is stored by the movd below.
	pfadd mm7, [esp + i1000_fiz]
	movq [edi + ecx*4], mm6
	movd [edi + ecx*4 +8], mm7

	mov ebx, [ebp + i1000_fshift]
	mov edx, [esp + i1000_is3]

	movq mm6, [ebx + edx*4]		;# fshift[is3] += fi
	movd mm7, [ebx + edx*4 + 8]
	pfadd mm6, [esp + i1000_fix]
	pfadd mm7, [esp + i1000_fiz]	;# same over-read as above, low lane stored
	movq [ebx + edx*4], mm6
	movd [ebx + edx*4 + 8], mm7

	mov edx, [ebp + i1000_gid]	;# edx = current position in gid[]
	mov edx, [edx]			;# edx = group index
	add dword ptr [ebp + i1000_gid], 4	;# step to the next entry

	movq mm7, [esp + i1000_vctot]
	pfacc mm7,mm7			;# add the two partial sums

	mov eax, [ebp + i1000_Vc]
	movd mm6, [eax + edx*4]
	pfadd mm6, mm7
	movd [eax + edx*4], mm6		;# Vc[gid] += vctot

	;# ---- outer loop control ----
	mov ecx, [ebp + i1000_nri]
	dec ecx
	jecxz .i1000_end		;# count exhausted
	mov [ebp + i1000_nri], ecx	;# store the remaining count
	jmp .i1000_outer

.i1000_end:
	;# epilogue: leave mmx state, drop locals, restore registers
	femms
	add esp, 80
	pop edi
	pop esi
	pop edx
	pop ecx
	pop ebx
	pop eax
	leave
	ret				;# return to the caller instead of running into the next routine
3174 .globl inl1010_3dnow
3175 .globl _inl1010_3dnow
3176 inl1010_3dnow:
3177 _inl1010_3dnow:
3178 .equiv i1010_nri, 8
3179 .equiv i1010_iinr, 12
3180 .equiv i1010_jindex, 16
3181 .equiv i1010_jjnr, 20
3182 .equiv i1010_shift, 24
3183 .equiv i1010_shiftvec, 28
3184 .equiv i1010_fshift, 32
3185 .equiv i1010_gid, 36
3186 .equiv i1010_pos, 40
3187 .equiv i1010_faction, 44
3188 .equiv i1010_charge, 48
3189 .equiv i1010_facel, 52
3190 .equiv i1010_Vc, 56
3191 .equiv i1010_nsatoms, 60
3192 ;# stack offsets for local variables
3193 .equiv i1010_is3, 0
3194 .equiv i1010_ii3, 4
3195 .equiv i1010_shX, 8
3196 .equiv i1010_shY, 12
3197 .equiv i1010_shZ, 16
3198 .equiv i1010_ix, 20
3199 .equiv i1010_iy, 24
3200 .equiv i1010_iz, 28
3201 .equiv i1010_iq, 32
3202 .equiv i1010_vctot, 40
3203 .equiv i1010_innerjjnr0, 48
3204 .equiv i1010_innerk0, 52
3205 .equiv i1010_innerjjnr, 56
3206 .equiv i1010_innerk, 60
3207 .equiv i1010_fix, 64
3208 .equiv i1010_fiy, 68
3209 .equiv i1010_fiz, 72
3210 .equiv i1010_dx1, 76
3211 .equiv i1010_dy1, 80
3212 .equiv i1010_dz1, 84
3213 .equiv i1010_dx2, 88
3214 .equiv i1010_dy2, 92
3215 .equiv i1010_dz2, 96
3216 .equiv i1010_nscoul, 100
3217 .equiv i1010_solnr, 104
3218 push ebp
3219 mov ebp,esp
3220 push eax
3221 push ebx
3222 push ecx
3223 push edx
3224 push esi
3225 push edi
3226 sub esp, 108 ;# local stack space
3227 femms
3228 ;# assume we have at least one i particle - start directly
3229 add dword ptr [ebp + i1010_nsatoms], 8
3231 .i1010_outer:
3232 mov eax, [ebp + i1010_shift] ;# eax = pointer into shift[]
3233 mov ebx, [eax] ;# ebx=shift[n]
3234 add dword ptr [ebp + i1010_shift], 4 ;# advance pointer one step
3236 lea ebx, [ebx + ebx*2] ;# ebx=3*is
3237 mov [esp + i1010_is3],ebx ;# store is3
3239 mov eax, [ebp + i1010_shiftvec] ;# eax = base of shiftvec[]
3241 movq mm0, [eax + ebx*4] ;# move shX/shY to mm0 and shZ to mm1
3242 movd mm1, [eax + ebx*4 + 8]
3243 movq [esp + i1010_shX], mm0
3244 movd [esp + i1010_shZ], mm1
3246 mov ecx, [ebp + i1010_iinr] ;# ecx = pointer into iinr[]
3247 add dword ptr [ebp + i1010_iinr], 4 ;# advance pointer
3248 mov ebx, [ecx] ;# ebx=ii
3250 mov eax, [ebp + i1010_nsatoms]
3251 mov ecx, [eax]
3252 add dword ptr [ebp + i1010_nsatoms], 12
3253 mov [esp + i1010_nscoul], ecx
3255 ;# clear potential
3256 pxor mm7,mm7
3257 movq [esp + i1010_vctot], mm7
3258 mov [esp + i1010_solnr], ebx
3260 mov eax, [ebp + i1010_jindex] ;# current pointer to jindex list
3261 mov ecx, [eax] ;# jindex[n]
3262 mov edx, [eax + 4] ;# jindex[n+1]
3263 add dword ptr [ebp + i1010_jindex], 4 ;# advance pointer
3264 sub edx, ecx ;# number of innerloop atoms
3265 mov eax, [ebp + i1010_jjnr]
3266 shl ecx, 2
3267 add eax, ecx ;# pointer to index of the first j atom
3268 mov [esp + i1010_innerjjnr0], eax ;# save pointer to jjnr[nj0]
3270 mov [esp + i1010_innerk0], edx ;# number of innerloop atoms
3271 mov esi, [ebp + i1010_pos]
3272 mov edi, [ebp + i1010_faction]
3274 mov ecx, [esp + i1010_nscoul]
3275 cmp ecx, 0
3276 jnz .i1010_mno_coul
3277 jmp .i1010_last_mno
3278 .i1010_mno_coul:
3279 mov ebx, [esp + i1010_solnr]
3280 inc dword ptr [esp + i1010_solnr]
3281 mov edx, [ebp + i1010_charge]
3282 movd mm2, [edx + ebx*4] ;# mm2=charge[ii]
3283 pfmul mm2, [ebp + i1010_facel]
3284 punpckldq mm2,mm2 ;# spread to both halves
3285 movq [esp + i1010_iq], mm2 ;# iq =facel*charge[ii]
3287 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
3288 mov eax, [ebp + i1010_pos] ;# eax = base pointer of pos[]
3289 mov [esp + i1010_ii3], ebx ;# store ii3
3291 movq mm0, [eax + ebx*4] ;# load x and y coords to mm0
3292 movd mm1, [eax + ebx*4 + 8] ;# load z coord to mm1
3293 pfadd mm0, [esp + i1010_shX] ;# add shift vector
3294 pfadd mm1, [esp + i1010_shZ]
3295 movq [esp + i1010_ix], mm0 ;# store shifted coords
3296 movd [esp + i1010_iz], mm1
3298 ;# clear forces
3299 pxor mm7,mm7
3300 movq [esp + i1010_fix], mm7
3301 movd [esp + i1010_fiz], mm7
3303 mov ecx, [esp + i1010_innerjjnr0]
3304 mov [esp + i1010_innerjjnr], ecx
3305 mov edx, [esp + i1010_innerk0]
3306 sub edx, 2
3307 mov [esp + i1010_innerk], edx ;# number of innerloop atoms
3308 jge .i1010_unroll_coul_loop
3309 jmp .i1010_finish_coul_inner
3310 .i1010_unroll_coul_loop:
3311 ;# paired innerloop starts here
3312 mov ecx, [esp + i1010_innerjjnr] ;# pointer to jjnr[k]
3313 mov eax, [ecx]
3314 mov ebx, [ecx + 4] ;# eax/ebx=jnr
3315 add dword ptr [esp + i1010_innerjjnr], 8 ;# advance pointer (unrolled 2)
3316 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
3318 mov ecx, [ebp + i1010_charge] ;# base of charge[]
3319 movq mm5, [esp + i1010_iq]
3320 movd mm3, [ecx + eax*4] ;# charge[jnr1]
3321 movd mm7, [ecx + ebx*4] ;# charge[jnr2]
3322 punpckldq mm3,mm7 ;# move charge 2 to high part of mm3
3323 pfmul mm3,mm5 ;# mm3 now has qq for both particles
3325 lea eax, [eax + eax*2] ;# replace jnr with j3
3326 lea ebx, [ebx + ebx*2]
3328 movq mm0, [esp + i1010_ix]
3329 movd mm1, [esp + i1010_iz]
3330 movq mm4, [esi + eax*4] ;# fetch first j coordinates
3331 movd mm5, [esi + eax*4 + 8]
3332 pfsubr mm4,mm0 ;# dr = ir - jr
3333 pfsubr mm5,mm1
3334 movq [esp + i1010_dx1], mm4 ;# store dr
3335 movd [esp + i1010_dz1], mm5
3336 pfmul mm4,mm4 ;# square dx,dy,dz
3337 pfmul mm5,mm5
3338 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
3339 pfacc mm4, mm5 ;# first rsq in lower mm4
3341 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
3342 movd mm7, [esi + ebx*4 + 8]
3344 pfsubr mm6,mm0 ;# dr = ir - jr
3345 pfsubr mm7,mm1
3346 movq [esp + i1010_dx2], mm6 ;# store dr
3347 movd [esp + i1010_dz2], mm7
3348 pfmul mm6,mm6 ;# square dx,dy,dz
3349 pfmul mm7,mm7
3350 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
3351 pfacc mm6, mm7 ;# second rsq in lower mm6
3353 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
3354 pfrsqrt mm1, mm6
3356 punpckldq mm0,mm1
3357 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs
3358 movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision
3359 pfmul mm0,mm0
3360 pfrsqit1 mm0,mm4
3361 pfrcpit2 mm0,mm2
3362 movq mm1,mm0
3363 pfmul mm0,mm0
3364 ;# mm0 now contains invsq, and mm1 invsqrt
3365 ;# do potential and fscal
3366 prefetchw [esp + i1010_dx1] ;# prefetch i forces to cache
3368 pfmul mm3,mm1 ;# 3 has both vcoul
3369 pfmul mm0,mm3 ;# 0 has both fscal
3371 ;# update vctot
3373 pfadd mm3, [esp + i1010_vctot] ;# add the earlier value
3374 movq [esp + i1010_vctot], mm3 ;# store the sum
3375 ;# spread fscalar to both positions
3376 movq mm1,mm0
3377 punpckldq mm0,mm0
3378 punpckhdq mm1,mm1
3379 ;# calc vector force
3380 prefetchw [edi + eax*4] ;# prefetch the 1st faction to cache
3381 movq mm2, [esp + i1010_dx1] ;# fetch dr
3382 movd mm3, [esp + i1010_dz1]
3383 prefetchw [edi + ebx*4] ;# prefetch the 2nd faction to cache
3384 pfmul mm2, mm0 ;# mult by fs
3385 pfmul mm3, mm0
3387 movq mm4, [esp + i1010_dx2] ;# fetch dr
3388 movd mm5, [esp + i1010_dz2]
3389 pfmul mm4, mm1 ;# mult by fs
3390 pfmul mm5, mm1
3391 ;# update i forces
3393 movq mm0, [esp + i1010_fix]
3394 movd mm1, [esp + i1010_fiz]
3395 pfadd mm0, mm2
3396 pfadd mm1, mm3
3398 pfadd mm0, mm4
3399 pfadd mm1, mm5
3400 movq [esp + i1010_fix], mm0
3401 movd [esp + i1010_fiz], mm1
3402 ;# update j forces
3404 movq mm0, [edi + eax*4]
3405 movd mm1, [edi + eax*4 + 8]
3406 movq mm6, [edi + ebx*4]
3407 movd mm7, [edi + ebx*4 + 8]
3409 pfsub mm0, mm2
3410 pfsub mm1, mm3
3411 pfsub mm6, mm4
3412 pfsub mm7, mm5
3414 movq [edi + eax*4], mm0
3415 movd [edi + eax*4 +8], mm1
3416 movq [edi + ebx*4], mm6
3417 movd [edi + ebx*4 + 8], mm7
3419 ;# should we do one more iteration?
3420 sub dword ptr [esp + i1010_innerk], 2
3421 jl .i1010_finish_coul_inner
3422 jmp .i1010_unroll_coul_loop
3423 .i1010_finish_coul_inner:
3424 and dword ptr [esp + i1010_innerk], 1
3425 jnz .i1010_single_coul_inner
3426 jmp .i1010_updateouterdata_coul
3427 .i1010_single_coul_inner:
3428 ;# a single j particle iteration here - compare with the unrolled code for comments
3429 mov eax, [esp + i1010_innerjjnr]
3430 mov eax, [eax] ;# eax=jnr offset
3432 mov ecx, [ebp + i1010_charge]
3433 movd mm6, [esp + i1010_iq]
3434 movd mm7, [ecx + eax*4]
3435 pfmul mm6, mm7 ;# mm6=qq
3437 lea eax, [eax + eax*2]
3439 movq mm0, [esp + i1010_ix]
3440 movd mm1, [esp + i1010_iz]
3441 movq mm2, [esi + eax*4]
3442 movd mm3, [esi + eax*4 + 8]
3443 pfsub mm0, mm2
3444 pfsub mm1, mm3
3445 movq [esp + i1010_dx1], mm0
3446 pfmul mm0,mm0
3447 movd [esp + i1010_dz1], mm1
3448 pfmul mm1,mm1
3449 pfacc mm0, mm1
3450 pfacc mm0, mm1 ;# mm0=rsq
3452 pfrsqrt mm1,mm0
3453 movq mm2,mm1
3454 pfmul mm1,mm1
3455 pfrsqit1 mm1,mm0
3456 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
3457 movq mm4, mm1
3458 pfmul mm4, mm4 ;# mm4=invsq
3459 ;# calculate potential and scalar force
3460 pfmul mm6, mm1 ;# mm6=vcoul
3461 pfmul mm4, mm6 ;# mm4=fscalar
3462 ;# update vctot
3463 pfadd mm6, [esp + i1010_vctot]
3464 movq [esp + i1010_vctot], mm6
3465 ;# spread fscalar to both positions
3466 punpckldq mm4,mm4
3467 ;# calc vectorial force
3468 prefetchw [edi + eax*4] ;# prefetch faction to cache
3469 movq mm0, [esp + i1010_dx1]
3470 movd mm1, [esp + i1010_dz1]
3471 pfmul mm0, mm4
3472 pfmul mm1, mm4
3473 ;# update i particle force
3474 movq mm2, [esp + i1010_fix]
3475 movd mm3, [esp + i1010_fiz]
3476 pfadd mm2, mm0
3477 pfadd mm3, mm1
3478 movq [esp + i1010_fix], mm2
3479 movd [esp + i1010_fiz], mm3
3480 ;# update j particle force
3481 movq mm2, [edi + eax*4]
3482 movd mm3, [edi + eax *4+ 8]
3483 pfsub mm2, mm0
3484 pfsub mm3, mm1
3485 movq [edi + eax*4], mm2
3486 movd [edi + eax*4 +8], mm3
3487 ;# done!
3488 .i1010_updateouterdata_coul:
3489 mov ecx, [esp + i1010_ii3]
3491 movq mm6, [edi + ecx*4] ;# increment i force
3492 movd mm7, [edi + ecx*4 + 8]
3493 pfadd mm6, [esp + i1010_fix]
3494 pfadd mm7, [esp + i1010_fiz]
3495 movq [edi + ecx*4], mm6
3496 movd [edi + ecx*4 +8], mm7
3498 mov ebx, [ebp + i1010_fshift] ;# increment fshift force
3499 mov edx, [esp + i1010_is3]
3501 movq mm6, [ebx + edx*4]
3502 movd mm7, [ebx + edx*4 + 8]
3503 pfadd mm6, [esp + i1010_fix]
3504 pfadd mm7, [esp + i1010_fiz]
3505 movq [ebx + edx*4], mm6
3506 movd [ebx + edx*4 + 8], mm7
3508 ;# loop back to mno
3509 dec dword ptr [esp + i1010_nscoul]
3510 jz .i1010_last_mno
3511 jmp .i1010_mno_coul
3512 .i1010_last_mno:
3513 mov edx, [ebp + i1010_gid] ;# get group index for this i particle
3514 mov edx, [edx]
3515 add dword ptr [ebp + i1010_gid], 4 ;# advance pointer
3517 movq mm7, [esp + i1010_vctot]
3518 pfacc mm7,mm7 ;# get and sum the two parts of total potential
3520 mov eax, [ebp + i1010_Vc]
3521 movd mm6, [eax + edx*4]
3522 pfadd mm6, mm7
3523 movd [eax + edx*4], mm6 ;# increment vc[gid]
3524 ;# finish if last
3525 mov ecx, [ebp + i1010_nri]
3526 dec ecx
3527 jecxz .i1010_end
3528 ;# not last, iterate once more!
3529 mov [ebp + i1010_nri], ecx
3530 jmp .i1010_outer
3531 .i1010_end:
3532 femms
3533 add esp, 108
3534 pop edi
3535 pop esi
3536 pop edx
3537 pop ecx
3538 pop ebx
3539 pop eax
3540 leave
;# ---------------------------------------------------------------------
;# inl1020_3dnow
;#
;# Coulomb-only inner loop for a three-site i molecule (sites called O,
;# H1 and H2 here, stored at consecutive indices ii, ii+1, ii+2) against
;# single j atoms.
;#
;# Arguments are read from the caller's stack through ebp (offsets
;# i1020_nri .. i1020_Vc below).  Several of these argument slots are
;# used as running cursors and are modified in place.
;#
;# eax, ebx, ecx, edx, esi and edi are saved on entry and restored on
;# exit.  The MMX/3DNow state is reset with femms on entry and again
;# before returning.
;#
;# Notes derived from the code below:
;#  - iqO/iqH are computed once, from iinr[0], before the outer loop and
;#    are reused for every i entry.
;#  - Both loops test their counter only after the first pass, so the
;#    routine needs nri >= 1 and at least one j atom per i entry.
;#  - In the paired registers the low half belongs to H1 and the high
;#    half to H2.
;# ---------------------------------------------------------------------
.globl inl1020_3dnow
.globl _inl1020_3dnow
inl1020_3dnow:
_inl1020_3dnow:
;# argument offsets relative to ebp
.equiv i1020_nri, 8                     ;# i entries left (decremented in place)
.equiv i1020_iinr, 12                   ;# cursor into iinr[] (advanced in place)
.equiv i1020_jindex, 16                 ;# cursor into jindex[] (advanced in place)
.equiv i1020_jjnr, 20                   ;# base of jjnr[]
.equiv i1020_shift, 24                  ;# cursor into shift[] (advanced in place)
.equiv i1020_shiftvec, 28               ;# base of shiftvec[]
.equiv i1020_fshift, 32                 ;# base of fshift[]
.equiv i1020_gid, 36                    ;# cursor into gid[] (advanced in place)
.equiv i1020_pos, 40                    ;# base of pos[]
.equiv i1020_faction, 44                ;# base of faction[]
.equiv i1020_charge, 48                 ;# base of charge[]
.equiv i1020_facel, 52                  ;# scalar factor multiplied into the i charges
.equiv i1020_Vc, 56                     ;# base of Vc[], indexed by gid
;# stack offsets for local variables (148 bytes in total)
.equiv i1020_is3, 0                     ;# 3*shift index
.equiv i1020_ii3, 4                     ;# 3*ii
.equiv i1020_ixO, 8                     ;# shifted O position x (y and z follow)
.equiv i1020_iyO, 12
.equiv i1020_izO, 16
.equiv i1020_ixH, 20                    ;# 8 bytes each: low=H1, high=H2
.equiv i1020_iyH, 28
.equiv i1020_izH, 36
.equiv i1020_iqO, 44                    ;# facel*charge[ii] in the low half
.equiv i1020_iqH, 52                    ;# facel*charge[ii+1] in both halves
.equiv i1020_vctot, 60                  ;# two partial potential sums
.equiv i1020_innerjjnr, 68              ;# cursor into jjnr[]
.equiv i1020_innerk, 72                 ;# j atoms left for this i entry
.equiv i1020_fixO, 76                   ;# accumulated force on O (x, then y, z)
.equiv i1020_fiyO, 80
.equiv i1020_fizO, 84
.equiv i1020_fixH, 88                   ;# 8 bytes each: low=H1, high=H2
.equiv i1020_fiyH, 96
.equiv i1020_fizH, 104
.equiv i1020_dxO, 112                   ;# O-j distance vector
.equiv i1020_dyO, 116
.equiv i1020_dzO, 120
.equiv i1020_dxH, 124                   ;# 8 bytes each: low=H1-j, high=H2-j
.equiv i1020_dyH, 132
.equiv i1020_dzH, 140
        push    ebp
        mov     ebp, esp
        push    eax
        push    ebx
        push    ecx
        push    edx
        push    esi
        push    edi
        sub     esp, 148                ;# local stack space
        femms
        ;# assume we have at least one i particle - start directly

        ;# charges of the i molecule, taken from the first iinr[] entry
        mov     ecx, [ebp + i1020_iinr] ;# ecx = pointer into iinr[]
        mov     ebx, [ecx]              ;# ebx = ii

        mov     edx, [ebp + i1020_charge]
        movd    mm1, [ebp + i1020_facel]
        movd    mm2, [edx + ebx*4]      ;# mm2 = charge[ii]
        pfmul   mm2, mm1
        movq    [esp + i1020_iqO], mm2  ;# iqO = facel*charge[ii] (high half is 0)

        movd    mm2, [edx + ebx*4 + 4]  ;# mm2 = charge[ii+1]
        pfmul   mm2, mm1
        punpckldq mm2, mm2              ;# spread to both halves
        movq    [esp + i1020_iqH], mm2  ;# iqH = facel*charge[ii+1]

.i1020_outer:
        mov     eax, [ebp + i1020_shift]        ;# eax = pointer into shift[]
        mov     ebx, [eax]                      ;# ebx = shift[n]
        add     dword ptr [ebp + i1020_shift], 4 ;# advance pointer one step

        lea     ebx, [ebx + ebx*2]      ;# ebx = 3*is
        mov     [esp + i1020_is3], ebx  ;# store is3

        mov     eax, [ebp + i1020_shiftvec]     ;# eax = base of shiftvec[]

        movq    mm5, [eax + ebx*4]      ;# mm5 = shX/shY
        movd    mm6, [eax + ebx*4 + 8]  ;# mm6 = shZ
        movq    mm0, mm5
        movq    mm1, mm5
        movq    mm2, mm6
        punpckldq mm0, mm0              ;# mm0 = shX in both halves
        punpckhdq mm1, mm1              ;# mm1 = shY in both halves
        punpckldq mm2, mm2              ;# mm2 = shZ in both halves

        mov     ecx, [ebp + i1020_iinr] ;# ecx = pointer into iinr[]
        add     dword ptr [ebp + i1020_iinr], 4 ;# advance pointer
        mov     ebx, [ecx]              ;# ebx = ii

        lea     ebx, [ebx + ebx*2]      ;# ebx = 3*ii = ii3
        mov     eax, [ebp + i1020_pos]  ;# eax = base of pos[]

        pfadd   mm5, [eax + ebx*4]      ;# ixO/iyO = shX/shY + pos
        movd    mm7, [eax + ebx*4 + 8]  ;# a 4-byte operand cannot be added from memory
        mov     [esp + i1020_ii3], ebx
        pfadd   mm6, mm7                ;# izO = shZ + posZ
        movq    [esp + i1020_ixO], mm5
        movq    [esp + i1020_izO], mm6  ;# 8-byte store; the upper dword lands in ixH,
                                        ;# which is written a few lines below

        movd    mm3, [eax + ebx*4 + 12]
        movd    mm4, [eax + ebx*4 + 16]
        movd    mm5, [eax + ebx*4 + 20]
        punpckldq mm3, [eax + ebx*4 + 24]
        punpckldq mm4, [eax + ebx*4 + 28]
        punpckldq mm5, [eax + ebx*4 + 32] ;# coords of H1 in low mm3-mm5, H2 in high

        pfadd   mm0, mm3
        pfadd   mm1, mm4
        pfadd   mm2, mm5
        movq    [esp + i1020_ixH], mm0
        movq    [esp + i1020_iyH], mm1
        movq    [esp + i1020_izH], mm2

        ;# clear vctot and i forces
        pxor    mm7, mm7
        movq    [esp + i1020_vctot], mm7
        movq    [esp + i1020_fixO], mm7
        movd    [esp + i1020_fizO], mm7
        movq    [esp + i1020_fixH], mm7
        movq    [esp + i1020_fiyH], mm7
        movq    [esp + i1020_fizH], mm7

        mov     eax, [ebp + i1020_jindex]
        mov     ecx, [eax]              ;# jindex[n]
        mov     edx, [eax + 4]          ;# jindex[n+1]
        add     dword ptr [ebp + i1020_jindex], 4
        sub     edx, ecx                ;# number of innerloop atoms
        mov     [esp + i1020_innerk], edx

        mov     esi, [ebp + i1020_pos]
        mov     edi, [ebp + i1020_faction]
        mov     eax, [ebp + i1020_jjnr]
        shl     ecx, 2
        add     eax, ecx
        mov     [esp + i1020_innerjjnr], eax    ;# pointer to jjnr[nj0]

.i1020_inner_loop:
        ;# one j atom per pass
        mov     ecx, [esp + i1020_innerjjnr]    ;# ecx = pointer to jjnr[k]
        mov     eax, [ecx]                      ;# eax = jnr
        add     dword ptr [esp + i1020_innerjjnr], 4 ;# advance pointer
        prefetch [ecx + 16]             ;# prefetch upcoming jjnr[] entries

        mov     ecx, [ebp + i1020_charge]
        movd    mm7, [ecx + eax*4]
        punpckldq mm7, mm7              ;# charge[jnr] in both halves
        movq    mm6, mm7
        pfmul   mm6, [esp + i1020_iqO]
        pfmul   mm7, [esp + i1020_iqH]  ;# mm6 = qqO, mm7 = qqH

        lea     eax, [eax + eax*2]      ;# eax = 3*jnr

        movq    mm0, [esi + eax*4]      ;# mm0 = jx/jy
        movd    mm1, [esi + eax*4 + 8]  ;# mm1 = jz
        ;# copy & expand to mm2-mm4 for the H interactions
        movq    mm2, mm0
        movq    mm3, mm0
        movq    mm4, mm1
        punpckldq mm2, mm2              ;# jx in both halves
        punpckhdq mm3, mm3              ;# jy in both halves
        punpckldq mm4, mm4              ;# jz in both halves

        pfsubr  mm0, [esp + i1020_ixO]  ;# dxO/dyO = iO - j
        pfsubr  mm1, [esp + i1020_izO]  ;# dzO in the low half (high half unused)

        movq    [esp + i1020_dxO], mm0
        pfmul   mm0, mm0
        movd    [esp + i1020_dzO], mm1
        pfmul   mm1, mm1
        pfacc   mm0, mm1
        pfadd   mm0, mm1                ;# low half of mm0 = rsqO

        pfsubr  mm2, [esp + i1020_ixH]
        pfsubr  mm3, [esp + i1020_iyH]
        pfsubr  mm4, [esp + i1020_izH]  ;# mm2-mm4 = dxH-dzH

        movq    [esp + i1020_dxH], mm2
        movq    [esp + i1020_dyH], mm3
        movq    [esp + i1020_dzH], mm4
        pfmul   mm2, mm2
        pfmul   mm3, mm3
        pfmul   mm4, mm4

        pfadd   mm3, mm2
        pfadd   mm3, mm4                ;# mm3 = rsqH (H1 low, H2 high)

        ;# 1/sqrt(rsqO): table seed plus one Newton-Raphson step
        pfrsqrt mm1, mm0

        movq    mm2, mm1
        pfmul   mm1, mm1
        pfrsqit1 mm1, mm0
        pfrcpit2 mm1, mm2               ;# mm1 = invsqrt
        movq    mm4, mm1
        pfmul   mm4, mm4                ;# mm4 = invsq
        ;# calculate potential and scalar force
        pfmul   mm6, mm1                ;# mm6 = vcoul
        pfmul   mm4, mm6                ;# mm4 = fscalar

        ;# pfrsqrt only reads the low half, so seed H1 and H2 separately
        pfrsqrt mm5, mm3
        pswapd  mm3, mm3
        pfrsqrt mm2, mm3
        pswapd  mm3, mm3                ;# restore H1 low / H2 high
        punpckldq mm5, mm2              ;# seeds are in mm5 now, and rsq in mm3

        movq    mm2, mm5
        pfmul   mm5, mm5
        pfrsqit1 mm5, mm3
        pfrcpit2 mm5, mm2               ;# mm5 = invsqrt
        movq    mm3, mm5
        pfmul   mm3, mm3                ;# mm3 = invsq
        pfmul   mm7, mm5                ;# mm7 = vcoul
        pfmul   mm3, mm7                ;# mm3 = fscal for the two H's

        ;# update vctot
        pfadd   mm7, mm6
        pfadd   mm7, [esp + i1020_vctot]
        movq    [esp + i1020_vctot], mm7

        ;# spread oxygen fscalar to both positions
        punpckldq mm4, mm4
        ;# calc vectorial force for O
        prefetchw [edi + eax*4]         ;# prefetch faction to cache
        movq    mm0, [esp + i1020_dxO]
        movd    mm1, [esp + i1020_dzO]
        pfmul   mm0, mm4
        pfmul   mm1, mm4

        ;# calc vectorial force for H's
        movq    mm5, [esp + i1020_dxH]
        movq    mm6, [esp + i1020_dyH]
        movq    mm7, [esp + i1020_dzH]
        pfmul   mm5, mm3
        pfmul   mm6, mm3
        pfmul   mm7, mm3

        ;# update iO particle force
        movq    mm2, [esp + i1020_fixO]
        movd    mm3, [esp + i1020_fizO]
        pfadd   mm2, mm0
        pfadd   mm3, mm1
        movq    [esp + i1020_fixO], mm2
        movd    [esp + i1020_fizO], mm3

        ;# update iH forces
        movq    mm2, [esp + i1020_fixH]
        movq    mm3, [esp + i1020_fiyH]
        movq    mm4, [esp + i1020_fizH]
        pfadd   mm2, mm5
        pfadd   mm3, mm6
        pfadd   mm4, mm7
        movq    [esp + i1020_fixH], mm2
        movq    [esp + i1020_fiyH], mm3
        movq    [esp + i1020_fizH], mm4

        ;# pack j forces from H in the same form as the oxygen force
        pfacc   mm5, mm6                ;# mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
        pfacc   mm7, mm7                ;# mm7(l)=fjz(H1+H2)

        pfadd   mm0, mm5                ;# add up total force on j particle
        pfadd   mm1, mm7

        ;# update j particle force
        movq    mm2, [edi + eax*4]
        movd    mm3, [edi + eax*4 + 8]
        pfsub   mm2, mm0
        pfsub   mm3, mm1
        movq    [edi + eax*4], mm2
        movd    [edi + eax*4 + 8], mm3

        ;# done - one more?
        dec     dword ptr [esp + i1020_innerk]
        jz      .i1020_updateouterdata
        jmp     .i1020_inner_loop

.i1020_updateouterdata:
        mov     ecx, [esp + i1020_ii3]

        movq    mm6, [edi + ecx*4]      ;# increment iO force
        movd    mm7, [edi + ecx*4 + 8]
        pfadd   mm6, [esp + i1020_fixO]
        pfadd   mm7, [esp + i1020_fizO] ;# only the low half is stored
        movq    [edi + ecx*4], mm6
        movd    [edi + ecx*4 + 8], mm7

        ;# regroup the per-component H sums into per-atom x/y pairs
        movq    mm0, [esp + i1020_fixH]
        movq    mm3, [esp + i1020_fiyH]
        movq    mm1, [esp + i1020_fizH]
        movq    mm2, mm0
        punpckldq mm0, mm3              ;# mm0(l)=fxH1, mm0(h)=fyH1
        punpckhdq mm2, mm3              ;# mm2(l)=fxH2, mm2(h)=fyH2
        movq    mm3, mm1
        pswapd  mm3, mm3
        ;# low half of mm1 is fzH1
        ;# low half of mm3 is fzH2

        movq    mm6, [edi + ecx*4 + 12] ;# increment iH1 force
        movd    mm7, [edi + ecx*4 + 20]
        pfadd   mm6, mm0
        pfadd   mm7, mm1
        movq    [edi + ecx*4 + 12], mm6
        movd    [edi + ecx*4 + 20], mm7

        movq    mm6, [edi + ecx*4 + 24] ;# increment iH2 force
        movd    mm7, [edi + ecx*4 + 32]
        pfadd   mm6, mm2
        pfadd   mm7, mm3
        movq    [edi + ecx*4 + 24], mm6
        movd    [edi + ecx*4 + 32], mm7

        mov     ebx, [ebp + i1020_fshift] ;# increment fshift force
        mov     edx, [esp + i1020_is3]

        movq    mm6, [ebx + edx*4]
        movd    mm7, [ebx + edx*4 + 8]
        pfadd   mm6, [esp + i1020_fixO]
        pfadd   mm7, [esp + i1020_fizO]
        pfadd   mm6, mm0
        pfadd   mm7, mm1
        pfadd   mm6, mm2
        pfadd   mm7, mm3
        movq    [ebx + edx*4], mm6
        movd    [ebx + edx*4 + 8], mm7

        mov     edx, [ebp + i1020_gid]  ;# get group index for this i particle
        mov     edx, [edx]
        add     dword ptr [ebp + i1020_gid], 4 ;# advance pointer

        movq    mm7, [esp + i1020_vctot]
        pfacc   mm7, mm7                ;# sum the two parts of total potential

        mov     eax, [ebp + i1020_Vc]
        movd    mm6, [eax + edx*4]
        pfadd   mm6, mm7
        movd    [eax + edx*4], mm6      ;# increment Vc[gid]

        ;# finish if last
        dec     dword ptr [ebp + i1020_nri]
        jz      .i1020_end
        ;# not last, iterate once more!
        jmp     .i1020_outer

.i1020_end:
        femms
        add     esp, 148
        pop     edi
        pop     esi
        pop     edx
        pop     ecx
        pop     ebx
        pop     eax
        leave
        ret                             ;# return to caller instead of running on into the next routine
;# ---------------------------------------------------------------------
;# inl1030_3dnow
;#
;# Coulomb-only inner loop for a three-site i molecule (sites called O,
;# H1 and H2 here) against three-site j molecules with the same layout:
;# each pass of the inner loop handles j sites at 3*jnr, 3*jnr+3 and
;# 3*jnr+6 floats into pos[]/faction[].
;#
;# Arguments are read from the caller's stack through ebp (offsets
;# i1030_nri .. i1030_Vc below).  Several of these argument slots are
;# used as running cursors and are modified in place.
;#
;# eax, ebx, ecx, edx, esi and edi are saved on entry and restored on
;# exit.  The MMX/3DNow state is reset with femms on entry and again
;# before returning.
;#
;# Notes derived from the code below:
;#  - The charge products qqOO/qqOH/qqHH are computed once, from
;#    charge[iinr[0]] and charge[iinr[0]+1], and are used for every
;#    pair; no j charge is loaded inside the loop.
;#  - Both loops test their counter only after the first pass, so the
;#    routine needs nri >= 1 and at least one j entry per i entry.
;#  - In the paired registers the low half belongs to i-H1 and the high
;#    half to i-H2.
;# ---------------------------------------------------------------------
.globl inl1030_3dnow
.globl _inl1030_3dnow
inl1030_3dnow:
_inl1030_3dnow:
;# argument offsets relative to ebp
.equiv i1030_nri, 8                     ;# i entries left (decremented in place)
.equiv i1030_iinr, 12                   ;# cursor into iinr[] (advanced in place)
.equiv i1030_jindex, 16                 ;# cursor into jindex[] (advanced in place)
.equiv i1030_jjnr, 20                   ;# base of jjnr[]
.equiv i1030_shift, 24                  ;# cursor into shift[] (advanced in place)
.equiv i1030_shiftvec, 28               ;# base of shiftvec[]
.equiv i1030_fshift, 32                 ;# base of fshift[]
.equiv i1030_gid, 36                    ;# cursor into gid[] (advanced in place)
.equiv i1030_pos, 40                    ;# base of pos[]
.equiv i1030_faction, 44                ;# base of faction[]
.equiv i1030_charge, 48                 ;# base of charge[]
.equiv i1030_facel, 52                  ;# scalar factor multiplied into the charge products
.equiv i1030_Vc, 56                     ;# base of Vc[], indexed by gid
;# stack offsets for local variables (156 bytes in total)
.equiv i1030_is3, 0                     ;# 3*shift index
.equiv i1030_ii3, 4                     ;# 3*ii
.equiv i1030_ixO, 8                     ;# shifted i-O position x (y and z follow)
.equiv i1030_iyO, 12
.equiv i1030_izO, 16
.equiv i1030_ixH, 20                    ;# 8 bytes each: low=H1, high=H2
.equiv i1030_iyH, 28
.equiv i1030_izH, 36
.equiv i1030_qqOO, 44                   ;# facel*qO*qO in the low half
.equiv i1030_qqOH, 52                   ;# facel*qO*qH in both halves
.equiv i1030_qqHH, 60                   ;# facel*qH*qH in both halves
.equiv i1030_vctot, 68                  ;# two partial potential sums
.equiv i1030_innerjjnr, 76              ;# cursor into jjnr[]
.equiv i1030_innerk, 80                 ;# j entries left for this i entry
.equiv i1030_fixO, 84                   ;# accumulated force on i-O (x, then y, z)
.equiv i1030_fiyO, 88
.equiv i1030_fizO, 92
.equiv i1030_fixH, 96                   ;# 8 bytes each: low=H1, high=H2
.equiv i1030_fiyH, 104
.equiv i1030_fizH, 112
.equiv i1030_dxO, 120                   ;# i-O to current j site
.equiv i1030_dyO, 124
.equiv i1030_dzO, 128
.equiv i1030_dxH, 132                   ;# 8 bytes each: low=H1-j, high=H2-j
.equiv i1030_dyH, 140
.equiv i1030_dzH, 148
        push    ebp
        mov     ebp, esp
        push    eax
        push    ebx
        push    ecx
        push    edx
        push    esi
        push    edi
        sub     esp, 156                ;# local stack space
        femms
        ;# assume we have at least one i particle - start directly

        ;# charge products, taken from the first iinr[] entry
        mov     ecx, [ebp + i1030_iinr] ;# ecx = pointer into iinr[]
        mov     ebx, [ecx]              ;# ebx = ii

        mov     edx, [ebp + i1030_charge]
        movd    mm1, [ebp + i1030_facel]        ;# mm1 = facel
        movd    mm2, [edx + ebx*4]              ;# mm2 = charge[ii]   (O)
        movd    mm3, [edx + ebx*4 + 4]          ;# mm3 = charge[ii+1] (H)
        movq    mm4, mm2
        pfmul   mm4, mm1                ;# mm4 = facel*qO
        movq    mm6, mm3
        pfmul   mm6, mm1                ;# mm6 = facel*qH
        movq    mm5, mm4
        pfmul   mm4, mm2                ;# mm4 = qqOO*facel
        pfmul   mm5, mm3                ;# mm5 = qqOH*facel
        pfmul   mm6, mm3                ;# mm6 = qqHH*facel
        punpckldq mm5, mm5              ;# spread to both halves
        punpckldq mm6, mm6              ;# spread to both halves
        movq    [esp + i1030_qqOO], mm4
        movq    [esp + i1030_qqOH], mm5
        movq    [esp + i1030_qqHH], mm6

.i1030_outer:
        mov     eax, [ebp + i1030_shift]        ;# eax = pointer into shift[]
        mov     ebx, [eax]                      ;# ebx = shift[n]
        add     dword ptr [ebp + i1030_shift], 4 ;# advance pointer one step

        lea     ebx, [ebx + ebx*2]      ;# ebx = 3*is
        mov     [esp + i1030_is3], ebx  ;# store is3

        mov     eax, [ebp + i1030_shiftvec]     ;# eax = base of shiftvec[]

        movq    mm5, [eax + ebx*4]      ;# mm5 = shX/shY
        movd    mm6, [eax + ebx*4 + 8]  ;# mm6 = shZ
        movq    mm0, mm5
        movq    mm1, mm5
        movq    mm2, mm6
        punpckldq mm0, mm0              ;# mm0 = shX in both halves
        punpckhdq mm1, mm1              ;# mm1 = shY in both halves
        punpckldq mm2, mm2              ;# mm2 = shZ in both halves

        mov     ecx, [ebp + i1030_iinr] ;# ecx = pointer into iinr[]
        add     dword ptr [ebp + i1030_iinr], 4 ;# advance pointer
        mov     ebx, [ecx]              ;# ebx = ii

        lea     ebx, [ebx + ebx*2]      ;# ebx = 3*ii = ii3
        mov     eax, [ebp + i1030_pos]  ;# eax = base of pos[]

        pfadd   mm5, [eax + ebx*4]      ;# ixO/iyO = shX/shY + pos
        movd    mm7, [eax + ebx*4 + 8]  ;# a 4-byte operand cannot be added from memory
        mov     [esp + i1030_ii3], ebx
        pfadd   mm6, mm7                ;# izO = shZ + posZ
        movq    [esp + i1030_ixO], mm5
        movq    [esp + i1030_izO], mm6  ;# 8-byte store; the upper dword lands in ixH,
                                        ;# which is written a few lines below

        movd    mm3, [eax + ebx*4 + 12]
        movd    mm4, [eax + ebx*4 + 16]
        movd    mm5, [eax + ebx*4 + 20]
        punpckldq mm3, [eax + ebx*4 + 24]
        punpckldq mm4, [eax + ebx*4 + 28]
        punpckldq mm5, [eax + ebx*4 + 32] ;# coords of H1 in low mm3-mm5, H2 in high

        pfadd   mm0, mm3
        pfadd   mm1, mm4
        pfadd   mm2, mm5
        movq    [esp + i1030_ixH], mm0
        movq    [esp + i1030_iyH], mm1
        movq    [esp + i1030_izH], mm2

        ;# clear vctot and i forces
        pxor    mm7, mm7
        movq    [esp + i1030_vctot], mm7
        movq    [esp + i1030_fixO], mm7
        movq    [esp + i1030_fizO], mm7 ;# 8-byte store; also clears the low dword of fixH
        movq    [esp + i1030_fixH], mm7
        movq    [esp + i1030_fiyH], mm7
        movq    [esp + i1030_fizH], mm7

        mov     eax, [ebp + i1030_jindex]
        mov     ecx, [eax]              ;# jindex[n]
        mov     edx, [eax + 4]          ;# jindex[n+1]
        add     dword ptr [ebp + i1030_jindex], 4
        sub     edx, ecx                ;# number of innerloop atoms
        mov     [esp + i1030_innerk], edx

        mov     esi, [ebp + i1030_pos]
        mov     edi, [ebp + i1030_faction]
        mov     eax, [ebp + i1030_jjnr]
        shl     ecx, 2
        add     eax, ecx
        mov     [esp + i1030_innerjjnr], eax    ;# pointer to jjnr[nj0]

.i1030_inner_loop:
        ;# one j molecule per pass: j O, then j H1, then j H2
        mov     eax, [esp + i1030_innerjjnr]
        mov     eax, [eax]              ;# eax = jnr
        add     dword ptr [esp + i1030_innerjjnr], 4 ;# advance pointer

        ;# ---- interactions with j O ----
        movd    mm6, [esp + i1030_qqOO] ;# i-O  with j-O
        movq    mm7, [esp + i1030_qqOH] ;# i-H's with j-O

        lea     eax, [eax + eax*2]      ;# eax = 3*jnr
        movq    mm0, [esi + eax*4]      ;# mm0 = jx/jy
        movd    mm1, [esi + eax*4 + 8]  ;# mm1 = jz
        ;# copy & expand to mm2-mm4 for the H interactions
        movq    mm2, mm0
        movq    mm3, mm0
        movq    mm4, mm1
        punpckldq mm2, mm2              ;# jx in both halves
        punpckhdq mm3, mm3              ;# jy in both halves
        punpckldq mm4, mm4              ;# jz in both halves

        pfsubr  mm0, [esp + i1030_ixO]  ;# dxO/dyO = iO - j
        pfsubr  mm1, [esp + i1030_izO]  ;# dzO in the low half (high half unused)

        movq    [esp + i1030_dxO], mm0
        pfmul   mm0, mm0
        movd    [esp + i1030_dzO], mm1
        pfmul   mm1, mm1
        pfacc   mm0, mm0
        pfadd   mm0, mm1                ;# low half of mm0 = rsqO

        pfsubr  mm2, [esp + i1030_ixH]
        pfsubr  mm3, [esp + i1030_iyH]
        pfsubr  mm4, [esp + i1030_izH]  ;# mm2-mm4 = dxH-dzH

        movq    [esp + i1030_dxH], mm2
        movq    [esp + i1030_dyH], mm3
        movq    [esp + i1030_dzH], mm4
        pfmul   mm2, mm2
        pfmul   mm3, mm3
        pfmul   mm4, mm4

        pfadd   mm3, mm2
        pfadd   mm3, mm4                ;# mm3 = rsqH (H1 low, H2 high)

        ;# 1/sqrt(rsqO): table seed plus one Newton-Raphson step
        pfrsqrt mm1, mm0

        movq    mm2, mm1
        pfmul   mm1, mm1
        pfrsqit1 mm1, mm0
        pfrcpit2 mm1, mm2               ;# mm1 = invsqrt
        movq    mm4, mm1
        pfmul   mm4, mm4                ;# mm4 = invsq
        ;# calculate potential and scalar force
        pfmul   mm6, mm1                ;# mm6 = vcoul
        pfmul   mm4, mm6                ;# mm4 = fscalar

        ;# pfrsqrt only reads the low half, so seed H1 and H2 separately
        pfrsqrt mm5, mm3
        pswapd  mm3, mm3
        pfrsqrt mm2, mm3
        pswapd  mm3, mm3                ;# restore H1 low / H2 high
        punpckldq mm5, mm2              ;# seeds are in mm5 now, and rsq in mm3

        movq    mm2, mm5
        pfmul   mm5, mm5
        pfrsqit1 mm5, mm3
        pfrcpit2 mm5, mm2               ;# mm5 = invsqrt
        movq    mm3, mm5
        pfmul   mm3, mm3                ;# mm3 = invsq
        pfmul   mm7, mm5                ;# mm7 = vcoul
        pfmul   mm3, mm7                ;# mm3 = fscal for the two H's

        ;# update vctot
        pfadd   mm7, mm6
        pfadd   mm7, [esp + i1030_vctot]
        movq    [esp + i1030_vctot], mm7

        ;# spread oxygen fscalar to both positions
        punpckldq mm4, mm4
        ;# calc vectorial force for O
        movq    mm0, [esp + i1030_dxO]
        movd    mm1, [esp + i1030_dzO]
        pfmul   mm0, mm4
        pfmul   mm1, mm4

        ;# calc vectorial force for H's
        movq    mm5, [esp + i1030_dxH]
        movq    mm6, [esp + i1030_dyH]
        movq    mm7, [esp + i1030_dzH]
        pfmul   mm5, mm3
        pfmul   mm6, mm3
        pfmul   mm7, mm3

        ;# update iO particle force
        movq    mm2, [esp + i1030_fixO]
        movd    mm3, [esp + i1030_fizO]
        pfadd   mm2, mm0
        pfadd   mm3, mm1
        movq    [esp + i1030_fixO], mm2
        movd    [esp + i1030_fizO], mm3

        ;# update iH forces
        movq    mm2, [esp + i1030_fixH]
        movq    mm3, [esp + i1030_fiyH]
        movq    mm4, [esp + i1030_fizH]
        pfadd   mm2, mm5
        pfadd   mm3, mm6
        pfadd   mm4, mm7
        movq    [esp + i1030_fixH], mm2
        movq    [esp + i1030_fiyH], mm3
        movq    [esp + i1030_fizH], mm4

        ;# pack j forces from H in the same form as the oxygen force
        pfacc   mm5, mm6                ;# mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
        pfacc   mm7, mm7                ;# mm7(l)=fjz(H1+H2)

        pfadd   mm0, mm5                ;# add up total force on j particle
        pfadd   mm1, mm7

        ;# update j particle force
        movq    mm2, [edi + eax*4]
        movd    mm3, [edi + eax*4 + 8]
        pfsub   mm2, mm0
        pfsub   mm3, mm1
        movq    [edi + eax*4], mm2
        movd    [edi + eax*4 + 8], mm3

        ;# ---- interactions with j H1 ----
        movq    mm0, [esi + eax*4 + 12] ;# mm0 = jx/jy
        movd    mm1, [esi + eax*4 + 20] ;# mm1 = jz
        ;# copy & expand to mm2-mm4 for the H interactions
        movq    mm2, mm0
        movq    mm3, mm0
        movq    mm4, mm1
        punpckldq mm2, mm2              ;# jx in both halves
        punpckhdq mm3, mm3              ;# jy in both halves
        punpckldq mm4, mm4              ;# jz in both halves

        movd    mm6, [esp + i1030_qqOH] ;# i-O  with j-H
        movq    mm7, [esp + i1030_qqHH] ;# i-H's with j-H

        pfsubr  mm0, [esp + i1030_ixO]
        pfsubr  mm1, [esp + i1030_izO]

        movq    [esp + i1030_dxO], mm0
        pfmul   mm0, mm0
        movd    [esp + i1030_dzO], mm1
        pfmul   mm1, mm1
        pfacc   mm0, mm1
        pfadd   mm0, mm1                ;# low half of mm0 = rsqO

        pfsubr  mm2, [esp + i1030_ixH]
        pfsubr  mm3, [esp + i1030_iyH]
        pfsubr  mm4, [esp + i1030_izH]  ;# mm2-mm4 = dxH-dzH

        movq    [esp + i1030_dxH], mm2
        movq    [esp + i1030_dyH], mm3
        movq    [esp + i1030_dzH], mm4
        pfmul   mm2, mm2
        pfmul   mm3, mm3
        pfmul   mm4, mm4

        pfadd   mm3, mm2
        pfadd   mm3, mm4                ;# mm3 = rsqH

        pfrsqrt mm1, mm0

        movq    mm2, mm1
        pfmul   mm1, mm1
        pfrsqit1 mm1, mm0
        pfrcpit2 mm1, mm2               ;# mm1 = invsqrt
        movq    mm4, mm1
        pfmul   mm4, mm4                ;# mm4 = invsq
        ;# calculate potential and scalar force
        pfmul   mm6, mm1                ;# mm6 = vcoul
        pfmul   mm4, mm6                ;# mm4 = fscalar

        pfrsqrt mm5, mm3
        pswapd  mm3, mm3
        pfrsqrt mm2, mm3
        pswapd  mm3, mm3
        punpckldq mm5, mm2              ;# seeds are in mm5 now, and rsq in mm3

        movq    mm2, mm5
        pfmul   mm5, mm5
        pfrsqit1 mm5, mm3
        pfrcpit2 mm5, mm2               ;# mm5 = invsqrt
        movq    mm3, mm5
        pfmul   mm3, mm3                ;# mm3 = invsq
        pfmul   mm7, mm5                ;# mm7 = vcoul
        pfmul   mm3, mm7                ;# mm3 = fscal for the two H's

        ;# update vctot
        pfadd   mm7, mm6
        pfadd   mm7, [esp + i1030_vctot]
        movq    [esp + i1030_vctot], mm7

        ;# spread oxygen fscalar to both positions
        punpckldq mm4, mm4
        ;# calc vectorial force for O
        movq    mm0, [esp + i1030_dxO]
        movd    mm1, [esp + i1030_dzO]
        pfmul   mm0, mm4
        pfmul   mm1, mm4

        ;# calc vectorial force for H's
        movq    mm5, [esp + i1030_dxH]
        movq    mm6, [esp + i1030_dyH]
        movq    mm7, [esp + i1030_dzH]
        pfmul   mm5, mm3
        pfmul   mm6, mm3
        pfmul   mm7, mm3

        ;# update iO particle force
        movq    mm2, [esp + i1030_fixO]
        movd    mm3, [esp + i1030_fizO]
        pfadd   mm2, mm0
        pfadd   mm3, mm1
        movq    [esp + i1030_fixO], mm2
        movd    [esp + i1030_fizO], mm3

        ;# update iH forces
        movq    mm2, [esp + i1030_fixH]
        movq    mm3, [esp + i1030_fiyH]
        movq    mm4, [esp + i1030_fizH]
        pfadd   mm2, mm5
        pfadd   mm3, mm6
        pfadd   mm4, mm7
        movq    [esp + i1030_fixH], mm2
        movq    [esp + i1030_fiyH], mm3
        movq    [esp + i1030_fizH], mm4

        ;# pack j forces from H in the same form as the oxygen force
        pfacc   mm5, mm6                ;# mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
        pfacc   mm7, mm7                ;# mm7(l)=fjz(H1+H2)

        pfadd   mm0, mm5                ;# add up total force on j particle
        pfadd   mm1, mm7

        ;# update j particle force
        movq    mm2, [edi + eax*4 + 12]
        movd    mm3, [edi + eax*4 + 20]
        pfsub   mm2, mm0
        pfsub   mm3, mm1
        movq    [edi + eax*4 + 12], mm2
        movd    [edi + eax*4 + 20], mm3

        ;# ---- interactions with j H2 ----
        movq    mm0, [esi + eax*4 + 24] ;# mm0 = jx/jy
        movd    mm1, [esi + eax*4 + 32] ;# mm1 = jz
        ;# copy & expand to mm2-mm4 for the H interactions
        movq    mm2, mm0
        movq    mm3, mm0
        movq    mm4, mm1
        punpckldq mm2, mm2              ;# jx in both halves
        punpckhdq mm3, mm3              ;# jy in both halves
        punpckldq mm4, mm4              ;# jz in both halves

        movd    mm6, [esp + i1030_qqOH] ;# i-O  with j-H
        movq    mm7, [esp + i1030_qqHH] ;# i-H's with j-H

        pfsubr  mm0, [esp + i1030_ixO]
        pfsubr  mm1, [esp + i1030_izO]

        movq    [esp + i1030_dxO], mm0
        pfmul   mm0, mm0
        movd    [esp + i1030_dzO], mm1
        pfmul   mm1, mm1
        pfacc   mm0, mm1
        pfadd   mm0, mm1                ;# low half of mm0 = rsqO

        pfsubr  mm2, [esp + i1030_ixH]
        pfsubr  mm3, [esp + i1030_iyH]
        pfsubr  mm4, [esp + i1030_izH]  ;# mm2-mm4 = dxH-dzH

        movq    [esp + i1030_dxH], mm2
        movq    [esp + i1030_dyH], mm3
        movq    [esp + i1030_dzH], mm4
        pfmul   mm2, mm2
        pfmul   mm3, mm3
        pfmul   mm4, mm4

        pfadd   mm3, mm2
        pfadd   mm3, mm4                ;# mm3 = rsqH

        pfrsqrt mm1, mm0

        movq    mm2, mm1
        pfmul   mm1, mm1
        pfrsqit1 mm1, mm0
        pfrcpit2 mm1, mm2               ;# mm1 = invsqrt
        movq    mm4, mm1
        pfmul   mm4, mm4                ;# mm4 = invsq
        ;# calculate potential and scalar force
        pfmul   mm6, mm1                ;# mm6 = vcoul
        pfmul   mm4, mm6                ;# mm4 = fscalar

        pfrsqrt mm5, mm3
        pswapd  mm3, mm3
        pfrsqrt mm2, mm3
        pswapd  mm3, mm3
        punpckldq mm5, mm2              ;# seeds are in mm5 now, and rsq in mm3

        movq    mm2, mm5
        pfmul   mm5, mm5
        pfrsqit1 mm5, mm3
        pfrcpit2 mm5, mm2               ;# mm5 = invsqrt
        movq    mm3, mm5
        pfmul   mm3, mm3                ;# mm3 = invsq
        pfmul   mm7, mm5                ;# mm7 = vcoul
        pfmul   mm3, mm7                ;# mm3 = fscal for the two H's

        ;# update vctot
        pfadd   mm7, mm6
        pfadd   mm7, [esp + i1030_vctot]
        movq    [esp + i1030_vctot], mm7

        ;# spread oxygen fscalar to both positions
        punpckldq mm4, mm4
        ;# calc vectorial force for O
        movq    mm0, [esp + i1030_dxO]
        movd    mm1, [esp + i1030_dzO]
        pfmul   mm0, mm4
        pfmul   mm1, mm4

        ;# calc vectorial force for H's
        movq    mm5, [esp + i1030_dxH]
        movq    mm6, [esp + i1030_dyH]
        movq    mm7, [esp + i1030_dzH]
        pfmul   mm5, mm3
        pfmul   mm6, mm3
        pfmul   mm7, mm3

        ;# update iO particle force
        movq    mm2, [esp + i1030_fixO]
        movd    mm3, [esp + i1030_fizO]
        pfadd   mm2, mm0
        pfadd   mm3, mm1
        movq    [esp + i1030_fixO], mm2
        movd    [esp + i1030_fizO], mm3

        ;# update iH forces
        movq    mm2, [esp + i1030_fixH]
        movq    mm3, [esp + i1030_fiyH]
        movq    mm4, [esp + i1030_fizH]
        pfadd   mm2, mm5
        pfadd   mm3, mm6
        pfadd   mm4, mm7
        movq    [esp + i1030_fixH], mm2
        movq    [esp + i1030_fiyH], mm3
        movq    [esp + i1030_fizH], mm4

        ;# pack j forces from H in the same form as the oxygen force
        pfacc   mm5, mm6                ;# mm5(l)=fjx(H1+H2) mm5(h)=fjy(H1+H2)
        pfacc   mm7, mm7                ;# mm7(l)=fjz(H1+H2)

        pfadd   mm0, mm5                ;# add up total force on j particle
        pfadd   mm1, mm7

        ;# update j particle force
        movq    mm2, [edi + eax*4 + 24]
        movd    mm3, [edi + eax*4 + 32]
        pfsub   mm2, mm0
        pfsub   mm3, mm1
        movq    [edi + eax*4 + 24], mm2
        movd    [edi + eax*4 + 32], mm3

        ;# done - one more?
        dec     dword ptr [esp + i1030_innerk]
        jz      .i1030_updateouterdata
        jmp     .i1030_inner_loop

.i1030_updateouterdata:
        mov     ecx, [esp + i1030_ii3]

        movq    mm6, [edi + ecx*4]      ;# increment iO force
        movd    mm7, [edi + ecx*4 + 8]
        pfadd   mm6, [esp + i1030_fixO]
        pfadd   mm7, [esp + i1030_fizO] ;# only the low half is stored
        movq    [edi + ecx*4], mm6
        movd    [edi + ecx*4 + 8], mm7

        ;# regroup the per-component H sums into per-atom x/y pairs
        movq    mm0, [esp + i1030_fixH]
        movq    mm3, [esp + i1030_fiyH]
        movq    mm1, [esp + i1030_fizH]
        movq    mm2, mm0
        punpckldq mm0, mm3              ;# mm0(l)=fxH1, mm0(h)=fyH1
        punpckhdq mm2, mm3              ;# mm2(l)=fxH2, mm2(h)=fyH2
        movq    mm3, mm1
        pswapd  mm3, mm3
        ;# low half of mm1 is fzH1
        ;# low half of mm3 is fzH2

        movq    mm6, [edi + ecx*4 + 12] ;# increment iH1 force
        movd    mm7, [edi + ecx*4 + 20]
        pfadd   mm6, mm0
        pfadd   mm7, mm1
        movq    [edi + ecx*4 + 12], mm6
        movd    [edi + ecx*4 + 20], mm7

        movq    mm6, [edi + ecx*4 + 24] ;# increment iH2 force
        movd    mm7, [edi + ecx*4 + 32]
        pfadd   mm6, mm2
        pfadd   mm7, mm3
        movq    [edi + ecx*4 + 24], mm6
        movd    [edi + ecx*4 + 32], mm7

        mov     ebx, [ebp + i1030_fshift] ;# increment fshift force
        mov     edx, [esp + i1030_is3]

        movq    mm6, [ebx + edx*4]
        movd    mm7, [ebx + edx*4 + 8]
        pfadd   mm6, [esp + i1030_fixO]
        pfadd   mm7, [esp + i1030_fizO]
        pfadd   mm6, mm0
        pfadd   mm7, mm1
        pfadd   mm6, mm2
        pfadd   mm7, mm3
        movq    [ebx + edx*4], mm6
        movd    [ebx + edx*4 + 8], mm7

        mov     edx, [ebp + i1030_gid]  ;# get group index for this i particle
        mov     edx, [edx]
        add     dword ptr [ebp + i1030_gid], 4 ;# advance pointer

        movq    mm7, [esp + i1030_vctot]
        pfacc   mm7, mm7                ;# sum the two parts of total potential

        mov     eax, [ebp + i1030_Vc]
        movd    mm6, [eax + edx*4]
        pfadd   mm6, mm7
        movd    [eax + edx*4], mm6      ;# increment Vc[gid]

        ;# finish if last
        dec     dword ptr [ebp + i1030_nri]
        jz      .i1030_end
        ;# not last, iterate once more!
        jmp     .i1030_outer

.i1030_end:
        femms
        add     esp, 156
        pop     edi
        pop     esi
        pop     edx
        pop     ecx
        pop     ebx
        pop     eax
        leave
        ret                             ;# return to caller instead of running on into the next routine
4505 .globl inl1100_3dnow
4506 .globl _inl1100_3dnow
4507 inl1100_3dnow:
4508 _inl1100_3dnow:
4509 .equiv i1100_nri, 8
4510 .equiv i1100_iinr, 12
4511 .equiv i1100_jindex, 16
4512 .equiv i1100_jjnr, 20
4513 .equiv i1100_shift, 24
4514 .equiv i1100_shiftvec, 28
4515 .equiv i1100_fshift, 32
4516 .equiv i1100_gid, 36
4517 .equiv i1100_pos, 40
4518 .equiv i1100_faction, 44
4519 .equiv i1100_charge, 48
4520 .equiv i1100_facel, 52
4521 .equiv i1100_Vc, 56
4522 .equiv i1100_type, 60
4523 .equiv i1100_ntype, 64
4524 .equiv i1100_nbfp, 68
4525 .equiv i1100_Vnb, 72
4526 ;# stack offsets for local variables
4527 .equiv i1100_is3, 0
4528 .equiv i1100_ii3, 4
4529 .equiv i1100_ix, 8
4530 .equiv i1100_iy, 12
4531 .equiv i1100_iz, 16
4532 .equiv i1100_iq, 20
4533 .equiv i1100_vctot, 28
4534 .equiv i1100_vnbtot, 36
4535 .equiv i1100_c6, 44
4536 .equiv i1100_c12, 52
4537 .equiv i1100_six, 60
4538 .equiv i1100_twelve, 68
4539 .equiv i1100_ntia, 76
4540 .equiv i1100_innerjjnr, 80
4541 .equiv i1100_innerk, 84
4542 .equiv i1100_fix, 88
4543 .equiv i1100_fiy, 92
4544 .equiv i1100_fiz, 96
4545 .equiv i1100_dx1, 100
4546 .equiv i1100_dy1, 104
4547 .equiv i1100_dz1, 108
4548 .equiv i1100_dx2, 112
4549 .equiv i1100_dy2, 116
4550 .equiv i1100_dz2, 120
4551 push ebp
4552 mov ebp,esp
4554 push eax
4555 push ebx
4556 push ecx
4557 push edx
4558 push esi
4559 push edi
4560 sub esp, 124 ;# local stack space
4561 femms
4562 ;# move data to local stack
4563 movq mm0, [mm_six]
4564 movq mm1, [mm_twelve]
4565 movq [esp + i1100_six], mm0
4566 movq [esp + i1100_twelve], mm1
4567 ;# assume we have at least one i particle - start directly
4568 .i1100_outer: ;# outer loop: set up one i particle, then walk its j list
4569 mov eax, [ebp + i1100_shift] ;# eax = pointer into shift[]
4570 mov ebx, [eax] ;# ebx=shift[n]
4571 add dword ptr [ebp + i1100_shift], 4 ;# advance pointer one step (was i1030_shift: wrong kernel's symbol)
4573 lea ebx, [ebx + ebx*2] ;# ebx=3*is
4574 mov [esp + i1100_is3],ebx ;# store is3; .i1100_updateouterdata reads it back via i1100_is3
4576 mov eax, [ebp + i1100_shiftvec] ;# eax = base of shiftvec[]
4578 movq mm0, [eax + ebx*4] ;# move shX/shY to mm0 and shZ to mm1
4579 movd mm1, [eax + ebx*4 + 8]
4581 mov ecx, [ebp + i1100_iinr] ;# ecx = pointer into iinr[]
4582 add dword ptr [ebp + i1100_iinr], 4 ;# advance pointer
4583 mov ebx, [ecx] ;# ebx=ii
4585 mov edx, [ebp + i1100_charge] ;# edx = base of charge[]
4586 movd mm2, [edx + ebx*4] ;# mm2=charge[ii] (high half zeroed by movd)
4587 pfmul mm2, [ebp + i1100_facel] ;# low half = facel*charge[ii]
4588 punpckldq mm2,mm2 ;# spread to both halves
4589 movq [esp + i1100_iq], mm2 ;# iq =facel*charge[ii]
4591 mov edx, [ebp + i1100_type] ;# edx = base of type[]
4592 mov edx, [edx + ebx*4] ;# edx = type[ii]
4593 imul edx, [ebp + i1100_ntype] ;# edx = ntype*type[ii]
4594 shl edx, 1 ;# edx = 2*ntype*type[ii]
4595 mov [esp + i1100_ntia], edx ;# ntia: added to 2*type[jnr] in the inner loops
4597 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
4598 mov eax, [ebp + i1100_pos] ;# eax = base of pos[]
4600 pfadd mm0, [eax + ebx*4] ;# ix = shX + posX (and iy too)
4601 movd mm3, [eax + ebx*4 + 8] ;# can't use direct memory add for 4 bytes (iz)
4602 mov [esp + i1100_ii3], ebx ;# keep ii3 for the i-force update after the inner loop
4603 pfadd mm1, mm3 ;# iz = shZ + posZ
4604 movq [esp + i1100_ix], mm0 ;# 8-byte store: ix and iy
4605 movd [esp + i1100_iz], mm1
4607 ;# clear total potential and i forces
4608 pxor mm7,mm7
4609 movq [esp + i1100_vctot], mm7
4610 movq [esp + i1100_vnbtot], mm7
4611 movq [esp + i1100_fix], mm7 ;# 8-byte store: clears fix and fiy
4612 movd [esp + i1100_fiz], mm7
4614 mov eax, [ebp + i1100_jindex] ;# eax = pointer into jindex[]
4615 mov ecx, [eax] ;# jindex[n]
4616 mov edx, [eax + 4] ;# jindex[n+1]
4617 add dword ptr [ebp + i1100_jindex], 4 ;# advance pointer
4618 sub edx, ecx ;# number of innerloop atoms
4620 mov esi, [ebp + i1100_pos] ;# esi = base of pos[]
4621 mov edi, [ebp + i1100_faction] ;# edi = base of faction[], kept for the whole inner loop
4622 mov eax, [ebp + i1100_jjnr] ;# eax = base of jjnr[]
4623 shl ecx, 2 ;# byte offset of jjnr[nj0]
4624 add eax, ecx
4625 mov [esp + i1100_innerjjnr], eax ;# pointer to jjnr[nj0]
4626 sub edx, 2 ;# sets the flags tested by jge below
4627 mov [esp + i1100_innerk], edx ;# innerk = count-2 (mov leaves the flags untouched)
4628 jge .i1100_unroll_loop ;# at least two j particles: take the paired loop
4629 jmp .i1100_finish_inner ;# fewer than two: go straight to the remainder check
4630 .i1100_unroll_loop:
4631 ;# paired innerloop starts here
4632 mov ecx, [esp + i1100_innerjjnr] ;# pointer to jjnr[k]
4633 mov eax, [ecx]
4634 mov ebx, [ecx + 4] ;# eax/ebx=jnr
4635 add dword ptr [esp + i1100_innerjjnr], 8 ;# advance pointer (unrolled 2)
4636 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
4638 mov ecx, [ebp + i1100_charge] ;# base of charge[]
4639 movq mm5, [esp + i1100_iq]
4640 movd mm3, [ecx + eax*4] ;# charge[jnr1]
4641 punpckldq mm3, [ecx + ebx*4] ;# move charge 2 to high part of mm3
4642 pfmul mm3,mm5 ;# mm3 now has qq for both particles
4644 mov ecx, [ebp + i1100_type]
4645 mov edx, [ecx + eax*4] ;# type [jnr1]
4646 mov ecx, [ecx + ebx*4] ;# type [jnr2]
4648 mov esi, [ebp + i1100_nbfp] ;# base of nbfp
4649 shl edx, 1
4650 shl ecx, 1
4651 add edx, [esp + i1100_ntia] ;# tja = ntia + 2*type
4652 add ecx, [esp + i1100_ntia]
4654 movq mm5, [esi + edx*4] ;# mm5 = 1st c6 / c12
4655 movq mm7, [esi + ecx*4] ;# mm7 = 2nd c6 / c12
4656 movq mm6,mm5
4657 punpckldq mm5,mm7 ;# mm5 = 1st c6 / 2nd c6
4658 punpckhdq mm6,mm7 ;# mm6 = 1st c12 / 2nd c12
4659 movq [esp + i1100_c6], mm5
4660 movq [esp + i1100_c12], mm6
4662 lea eax, [eax + eax*2] ;# replace jnr with j3
4663 lea ebx, [ebx + ebx*2]
4665 mov esi, [ebp + i1100_pos]
4667 movq mm0, [esp + i1100_ix]
4668 movd mm1, [esp + i1100_iz]
4669 movq mm4, [esi + eax*4] ;# fetch first j coordinates
4670 movd mm5, [esi + eax*4 + 8]
4671 pfsubr mm4,mm0 ;# dr = ir - jr
4672 pfsubr mm5,mm1
4673 movq [esp + i1100_dx1], mm4 ;# store dr
4674 movd [esp + i1100_dz1], mm5
4675 pfmul mm4,mm4 ;# square dx,dy,dz
4676 pfmul mm5,mm5
4677 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
4678 pfacc mm4, mm5 ;# first rsq in lower mm4
4680 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
4681 movd mm7, [esi + ebx*4 + 8]
4683 pfsubr mm6,mm0 ;# dr = ir - jr
4684 pfsubr mm7,mm1
4685 movq [esp + i1100_dx2], mm6 ;# store dr
4686 movd [esp + i1100_dz2], mm7
4687 pfmul mm6,mm6 ;# square dx,dy,dz
4688 pfmul mm7,mm7
4689 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
4690 pfacc mm6, mm7 ;# second rsq in lower mm6
4692 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
4693 pfrsqrt mm1, mm6
4695 punpckldq mm0,mm1
4696 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs
4697 movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision
4698 pfmul mm0,mm0
4699 pfrsqit1 mm0,mm4
4700 pfrcpit2 mm0,mm2
4701 movq mm1,mm0
4702 pfmul mm0,mm0
4703 ;# mm0 now contains invsq, and mm1 invsqrt
4704 ;# do potential and fscal
4705 movq mm4, mm0
4706 pfmul mm4, mm0
4707 pfmul mm4, mm0 ;# mm4=rinvsix
4708 movq mm5, mm4
4709 pfmul mm5, mm5 ;# mm5=rinvtwelve
4711 pfmul mm3, mm1 ;# mm3 has vcoul for both interactions
4712 movq mm7, mm3 ;# use mm7 for sum to make fscal
4714 pfmul mm5, [esp + i1100_c12]
4715 pfmul mm4, [esp + i1100_c6]
4716 movq mm6, mm5 ;# mm6 is vnb12-vnb6
4717 pfsub mm6, mm4
4719 pfmul mm4, [esp + i1100_six]
4721 pfmul mm5, [esp + i1100_twelve]
4722 pfsub mm7,mm4
4723 pfadd mm7, mm5
4724 pfmul mm0, mm7 ;# mm0 is total fscal now
4726 prefetchw [esp + i1100_dx1] ;# prefetch i forces to cache
4728 ;# update vctot
4729 pfadd mm3, [esp + i1100_vctot] ;# add the earlier value
4730 movq [esp + i1100_vctot], mm3 ;# store the sum
4732 ;# spread fscalar to both positions
4733 movq mm1,mm0
4734 punpckldq mm0,mm0
4735 punpckhdq mm1,mm1
4737 ;# calc vector force
4738 prefetchw [edi + eax*4] ;# prefetch the 1st faction to cache
4739 movq mm2, [esp + i1100_dx1] ;# fetch dr
4740 movd mm3, [esp + i1100_dz1]
4742 ;# update vnbtot
4743 pfadd mm6, [esp + i1100_vnbtot] ;# add the earlier value
4744 movq [esp + i1100_vnbtot], mm6 ;# store the sum
4746 prefetchw [edi + ebx*4] ;# prefetch the 2nd faction to cache
4747 pfmul mm2, mm0 ;# mult by fs
4748 pfmul mm3, mm0
4750 movq mm4, [esp + i1100_dx2] ;# fetch dr
4751 movd mm5, [esp + i1100_dz2]
4752 pfmul mm4, mm1 ;# mult by fs
4753 pfmul mm5, mm1
4754 ;# update i forces
4756 movq mm0, [esp + i1100_fix]
4757 movd mm1, [esp + i1100_fiz]
4758 pfadd mm0, mm2
4759 pfadd mm1, mm3
4761 pfadd mm0, mm4
4762 pfadd mm1, mm5
4763 movq [esp + i1100_fix], mm0
4764 movd [esp + i1100_fiz], mm1
4765 ;# update j forces
4767 movq mm0, [edi + eax*4]
4768 movd mm1, [edi + eax*4 + 8]
4769 movq mm6, [edi + ebx*4]
4770 movd mm7, [edi + ebx*4 + 8]
4772 pfsub mm0, mm2
4773 pfsub mm1, mm3
4774 pfsub mm6, mm4
4775 pfsub mm7, mm5
4777 movq [edi + eax*4], mm0
4778 movd [edi + eax*4 +8], mm1
4779 movq [edi + ebx*4], mm6
4780 movd [edi + ebx*4 + 8], mm7
4782 ;# should we do one more iteration?
4783 sub dword ptr [esp + i1100_innerk], 2
4784 jl .i1100_finish_inner
4785 jmp .i1100_unroll_loop
4786 .i1100_finish_inner:
4787 and dword ptr [esp + i1100_innerk], 1
4788 jnz .i1100_single_inner
4789 jmp .i1100_updateouterdata
4790 .i1100_single_inner: ;# remainder path: one leftover j particle (only the low MMX halves carry data)
4791 ;# a single j particle iteration here - compare with the unrolled code for comments
4792 mov eax, [esp + i1100_innerjjnr]
4793 mov eax, [eax] ;# eax=jnr offset
4795 mov ecx, [ebp + i1100_charge]
4796 movd mm5, [esp + i1100_iq]
4797 movd mm3, [ecx + eax*4]
4798 pfmul mm3, mm5 ;# mm3=qq
4800 mov esi, [ebp + i1100_nbfp]
4801 mov ecx, [ebp + i1100_type]
4802 mov edx, [ecx + eax*4] ;# type [jnr1]
4803 shl edx, 1
4804 add edx, [esp + i1100_ntia] ;# tja = ntia + 2*type
4805 movd mm5, [esi + edx*4] ;# mm5 = 1st c6
4806 movq [esp + i1100_c6], mm5
4807 movd mm5, [esi + edx*4 + 4] ;# mm5 = 1st c12
4808 movq [esp + i1100_c12], mm5
4811 mov esi, [ebp + i1100_pos]
4812 lea eax, [eax + eax*2] ;# replace jnr with j3
4814 movq mm0, [esp + i1100_ix]
4815 movd mm1, [esp + i1100_iz]
4816 movq mm4, [esi + eax*4]
4817 movd mm5, [esi + eax*4 + 8]
4818 pfsubr mm4, mm0 ;# dr = ir - jr
4819 pfsubr mm5, mm1
4820 movq [esp + i1100_dx1], mm4
4821 pfmul mm4,mm4
4822 movd [esp + i1100_dz1], mm5
4823 pfmul mm5,mm5
4824 pfacc mm4, mm5
4825 pfacc mm4, mm5 ;# low half of mm4=rsq
4827 pfrsqrt mm0,mm4 ;# lookup inverse square root seed
4828 movq mm2,mm0
4829 pfmul mm0,mm0
4830 pfrsqit1 mm0,mm4
4831 pfrcpit2 mm0,mm2 ;# mm0=invsqrt
4832 movq mm1, mm0 ;# mm1=invsqrt (copy kept for vcoul)
4833 pfmul mm0, mm0 ;# mm0=invsq
4834 ;# calculate potentials and scalar force
4835 movq mm4, mm0
4836 pfmul mm4, mm0
4837 pfmul mm4, mm0 ;# mm4=rinvsix
4838 movq mm5, mm4
4839 pfmul mm5, mm5 ;# mm5=rinvtwelve
4841 pfmul mm3, mm1 ;# mm3 has vcoul for this single interaction
4842 movq mm7, mm3 ;# use mm7 for sum to make fscal
4844 pfmul mm5, [esp + i1100_c12]
4845 pfmul mm4, [esp + i1100_c6]
4846 movq mm6, mm5 ;# mm6 is vnb12-vnb6
4847 pfsub mm6, mm4
4849 pfmul mm4, [esp + i1100_six]
4851 pfmul mm5, [esp + i1100_twelve]
4852 pfsub mm7,mm4
4853 pfadd mm7, mm5
4854 pfmul mm0, mm7 ;# mm0 is total fscal now
4856 ;# update vctot
4857 pfadd mm3, [esp + i1100_vctot]
4858 movq [esp + i1100_vctot], mm3
4860 ;# update vnbtot
4861 pfadd mm6, [esp + i1100_vnbtot] ;# add the earlier value
4862 movq [esp + i1100_vnbtot], mm6 ;# store the sum
4864 ;# spread fscalar to both positions
4865 punpckldq mm0,mm0
4866 ;# calc vectorial force
4867 prefetchw [edi + eax*4] ;# prefetch faction to cache
4868 movq mm2, [esp + i1100_dx1]
4869 movd mm3, [esp + i1100_dz1]
4872 pfmul mm2, mm0
4873 pfmul mm3, mm0
4875 ;# update i particle force
4876 movq mm0, [esp + i1100_fix]
4877 movd mm1, [esp + i1100_fiz]
4878 pfadd mm0, mm2
4879 pfadd mm1, mm3
4880 movq [esp + i1100_fix], mm0
4881 movd [esp + i1100_fiz], mm1
4882 ;# update j particle force
4883 movq mm0, [edi + eax*4]
4884 movd mm1, [edi + eax *4+ 8]
4885 pfsub mm0, mm2
4886 pfsub mm1, mm3
4887 movq [edi + eax*4], mm0
4888 movd [edi + eax*4 +8], mm1
4889 ;# done!
4890 .i1100_updateouterdata:
4891 mov ecx, [esp + i1100_ii3]
4893 movq mm6, [edi + ecx*4] ;# increment i force
4894 movd mm7, [edi + ecx*4 + 8]
4895 pfadd mm6, [esp + i1100_fix]
4896 pfadd mm7, [esp + i1100_fiz]
4897 movq [edi + ecx*4], mm6
4898 movd [edi + ecx*4 +8], mm7
4900 mov ebx, [ebp + i1100_fshift] ;# increment fshift force
4901 mov edx, [esp + i1100_is3]
4903 movq mm6, [ebx + edx*4]
4904 movd mm7, [ebx + edx*4 + 8]
4905 pfadd mm6, [esp + i1100_fix]
4906 pfadd mm7, [esp + i1100_fiz]
4907 movq [ebx + edx*4], mm6
4908 movd [ebx + edx*4 + 8], mm7
4910 mov edx, [ebp + i1100_gid] ;# get group index for this i particle
4911 mov edx, [edx]
4912 add dword ptr [ebp + i1100_gid], 4 ;# advance pointer
4914 movq mm7, [esp + i1100_vctot]
4915 pfacc mm7,mm7 ;# get and sum the two parts of total potential
4917 mov eax, [ebp + i1100_Vc]
4918 movd mm6, [eax + edx*4]
4919 pfadd mm6, mm7
4920 movd [eax + edx*4], mm6 ;# increment vc[gid]
4922 movq mm7, [esp + i1100_vnbtot]
4923 pfacc mm7,mm7 ;# get and sum the two parts of total potential
4925 mov eax, [ebp + i1100_Vnb]
4926 movd mm6, [eax + edx*4]
4927 pfadd mm6, mm7
4928 movd [eax + edx*4], mm6 ;# increment vnb[gid]
4930 ;# finish if last
4931 mov ecx, [ebp + i1100_nri]
4932 dec ecx
4933 jecxz .i1100_end
4934 ;# not last, iterate once more!
4935 mov [ebp + i1100_nri], ecx
4936 jmp .i1100_outer
4937 .i1100_end: ;# epilogue: undo the prologue in reverse order and return
4938 femms ;# leave MMX/3DNow state before going back to the caller
4939 add esp, 124 ;# release local stack space (matches sub esp, 124)
4940 pop edi
4941 pop esi
4942 pop edx
4943 pop ecx
4944 pop ebx
4945 pop eax
4946 leave ;# mov esp,ebp / pop ebp
4947 ret ;# without this, execution falls through into inl1110_3dnow
4953 .globl inl1110_3dnow
4954 .globl _inl1110_3dnow
4955 inl1110_3dnow:
4956 _inl1110_3dnow:
4957 .equiv i1110_nri, 8
4958 .equiv i1110_iinr, 12
4959 .equiv i1110_jindex, 16
4960 .equiv i1110_jjnr, 20
4961 .equiv i1110_shift, 24
4962 .equiv i1110_shiftvec, 28
4963 .equiv i1110_fshift, 32
4964 .equiv i1110_gid, 36
4965 .equiv i1110_pos, 40
4966 .equiv i1110_faction, 44
4967 .equiv i1110_charge, 48
4968 .equiv i1110_facel, 52
4969 .equiv i1110_Vc, 56
4970 .equiv i1110_type, 60
4971 .equiv i1110_ntype, 64
4972 .equiv i1110_nbfp, 68
4973 .equiv i1110_Vnb, 72
4974 .equiv i1110_nsatoms, 76
4975 ;# stack offsets for local variables
4976 .equiv i1110_is3, 0
4977 .equiv i1110_ii3, 4
4978 .equiv i1110_shX, 8
4979 .equiv i1110_shY, 12
4980 .equiv i1110_shZ, 16
4981 .equiv i1110_ix, 20
4982 .equiv i1110_iy, 24
4983 .equiv i1110_iz, 28
4984 .equiv i1110_iq, 32
4985 .equiv i1110_vctot, 40
4986 .equiv i1110_vnbtot, 48
4987 .equiv i1110_c6, 56
4988 .equiv i1110_c12, 64
4989 .equiv i1110_six, 72
4990 .equiv i1110_twelve, 80
4991 .equiv i1110_ntia, 88
4992 .equiv i1110_innerjjnr0, 92
4993 .equiv i1110_innerk0, 96
4994 .equiv i1110_innerjjnr, 100
4995 .equiv i1110_innerk, 104
4996 .equiv i1110_fix, 108
4997 .equiv i1110_fiy, 112
4998 .equiv i1110_fiz, 116
4999 .equiv i1110_dx1, 120
5000 .equiv i1110_dy1, 124
5001 .equiv i1110_dz1, 128
5002 .equiv i1110_dx2, 132
5003 .equiv i1110_dy2, 136
5004 .equiv i1110_dz2, 140
5005 .equiv i1110_nsvdwc, 144
5006 .equiv i1110_nscoul, 148
5007 .equiv i1110_nsvdw, 152
5008 .equiv i1110_solnr, 156
5009 push ebp
5010 mov ebp,esp
5011 push eax
5012 push ebx
5013 push ecx
5014 push edx
5015 push esi
5016 push edi
5017 sub esp, 160 ;# local stack space
5018 femms
5019 movq mm0, [mm_six]
5020 movq mm1, [mm_twelve]
5021 movq [esp + i1110_six], mm0
5022 movq [esp + i1110_twelve], mm1
5023 ;# assume we have at least one i particle - start directly
5024 .i1110_outer:
5025 mov eax, [ebp + i1110_shift] ;# eax = pointer into shift[]
5026 mov ebx, [eax] ;# ebx=shift[n]
5027 add dword ptr [ebp + i1110_shift], 4 ;# advance pointer one step
5029 lea ebx, [ebx + ebx*2] ;# ebx=3*is
5030 mov [esp + i1110_is3],ebx ;# store is3
5032 mov eax, [ebp + i1110_shiftvec] ;# eax = base of shiftvec[]
5034 movq mm0, [eax + ebx*4] ;# move shX/shY to mm0 and shZ to mm1
5035 movd mm1, [eax + ebx*4 + 8]
5036 movq [esp + i1110_shX], mm0
5037 movd [esp + i1110_shZ], mm1
5039 mov ecx, [ebp + i1110_iinr] ;# ecx = pointer into iinr[]
5040 add dword ptr [ebp + i1110_iinr], 4 ;# advance pointer
5041 mov ebx, [ecx] ;# ebx=ii
5043 mov eax, [ebp + i1110_nsatoms]
5044 add dword ptr [ebp + i1110_nsatoms], 12
5045 mov ecx, [eax]
5046 mov edx, [eax + 4]
5047 mov eax, [eax + 8]
5048 sub ecx, eax
5049 sub eax, edx
5051 mov [esp + i1110_nsvdwc], edx
5052 mov [esp + i1110_nscoul], eax
5053 mov [esp + i1110_nsvdw], ecx
5055 ;# clear potential
5056 pxor mm7,mm7
5057 movq [esp + i1110_vctot], mm7
5058 movq [esp + i1110_vnbtot], mm7
5059 mov [esp + i1110_solnr], ebx
5061 mov eax, [ebp + i1110_jindex]
5062 mov ecx, [eax] ;# jindex[n]
5063 mov edx, [eax + 4] ;# jindex[n+1]
5064 add dword ptr [ebp + i1110_jindex], 4
5065 sub edx, ecx ;# number of innerloop atoms
5066 mov eax, [ebp + i1110_jjnr]
5067 shl ecx, 2
5068 add eax, ecx
5069 mov [esp + i1110_innerjjnr0], eax ;# pointer to jjnr[nj0]
5071 mov [esp + i1110_innerk0], edx ;# number of innerloop atoms
5072 mov esi, [ebp + i1110_pos]
5073 mov edi, [ebp + i1110_faction]
5075 mov ecx, [esp + i1110_nsvdwc]
5076 cmp ecx, 0
5077 jnz .i1110_mno_vdwc
5078 jmp .i1110_testcoul
5079 .i1110_mno_vdwc:
5080 mov ebx, [esp + i1110_solnr]
5081 inc dword ptr [esp + i1110_solnr]
5082 mov edx, [ebp + i1110_charge]
5083 movd mm2, [edx + ebx*4] ;# mm2=charge[ii]
5084 pfmul mm2, [ebp + i1110_facel]
5085 punpckldq mm2,mm2 ;# spread to both halves
5086 movq [esp + i1110_iq], mm2 ;# iq =facel*charge[ii]
5088 mov edx, [ebp + i1110_type]
5089 mov edx, [edx + ebx*4]
5090 imul edx, [ebp + i1110_ntype]
5091 shl edx, 1
5092 mov [esp + i1110_ntia], edx
5094 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
5095 mov eax, [ebp + i1110_pos] ;# eax = base of pos[]
5096 mov [esp + i1110_ii3], ebx
5098 movq mm0, [eax + ebx*4]
5099 movd mm1, [eax + ebx*4 + 8]
5100 pfadd mm0, [esp + i1110_shX]
5101 pfadd mm1, [esp + i1110_shZ]
5102 movq [esp + i1110_ix], mm0
5103 movd [esp + i1110_iz], mm1
5105 ;# clear forces
5106 pxor mm7,mm7
5107 movq [esp + i1110_fix], mm7
5108 movd [esp + i1110_fiz], mm7
5110 mov ecx, [esp + i1110_innerjjnr0]
5111 mov [esp + i1110_innerjjnr], ecx
5112 mov edx, [esp + i1110_innerk0]
5113 sub edx, 2
5114 mov [esp + i1110_innerk], edx ;# number of innerloop atoms
5115 jge .i1110_unroll_vdwc_loop
5116 jmp .i1110_finish_vdwc_inner
5117 .i1110_unroll_vdwc_loop:
5118 ;# paired innerloop starts here
5119 mov ecx, [esp + i1110_innerjjnr] ;# pointer to jjnr[k]
5120 mov eax, [ecx]
5121 mov ebx, [ecx + 4] ;# eax/ebx=jnr
5122 add dword ptr [esp + i1110_innerjjnr], 8 ;# advance pointer (unrolled 2)
5123 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
5125 mov ecx, [ebp + i1110_charge] ;# base of charge[]
5126 movq mm5, [esp + i1110_iq]
5127 movd mm3, [ecx + eax*4] ;# charge[jnr1]
5128 punpckldq mm3, [ecx + ebx*4] ;# move charge 2 to high part of mm3
5129 pfmul mm3,mm5 ;# mm3 now has qq for both particles
5131 mov ecx, [ebp + i1110_type]
5132 mov edx, [ecx + eax*4] ;# type [jnr1]
5133 mov ecx, [ecx + ebx*4] ;# type [jnr2]
5135 mov esi, [ebp + i1110_nbfp] ;# base of nbfp
5136 shl edx, 1
5137 shl ecx, 1
5138 add edx, [esp + i1110_ntia] ;# tja = ntia + 2*type
5139 add ecx, [esp + i1110_ntia]
5141 movq mm5, [esi + edx*4] ;# mm5 = 1st c6 / c12
5142 movq mm7, [esi + ecx*4] ;# mm7 = 2nd c6 / c12
5143 movq mm6,mm5
5144 punpckldq mm5,mm7 ;# mm5 = 1st c6 / 2nd c6
5145 punpckhdq mm6,mm7 ;# mm6 = 1st c12 / 2nd c12
5146 movq [esp + i1110_c6], mm5
5147 movq [esp + i1110_c12], mm6
5149 lea eax, [eax + eax*2] ;# replace jnr with j3
5150 lea ebx, [ebx + ebx*2]
5152 mov esi, [ebp + i1110_pos]
5154 movq mm0, [esp + i1110_ix]
5155 movd mm1, [esp + i1110_iz]
5156 movq mm4, [esi + eax*4] ;# fetch first j coordinates
5157 movd mm5, [esi + eax*4 + 8]
5158 pfsubr mm4,mm0 ;# dr = ir - jr
5159 pfsubr mm5,mm1
5160 movq [esp + i1110_dx1], mm4 ;# store dr
5161 movd [esp + i1110_dz1], mm5
5162 pfmul mm4,mm4 ;# square dx,dy,dz
5163 pfmul mm5,mm5
5164 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
5165 pfacc mm4, mm5 ;# first rsq in lower mm4
5167 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
5168 movd mm7, [esi + ebx*4 + 8]
5170 pfsubr mm6,mm0 ;# dr = ir - jr
5171 pfsubr mm7,mm1
5172 movq [esp + i1110_dx2], mm6 ;# store dr
5173 movd [esp + i1110_dz2], mm7
5174 pfmul mm6,mm6 ;# square dx,dy,dz
5175 pfmul mm7,mm7
5176 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
5177 pfacc mm6, mm7 ;# second rsq in lower mm6
5179 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
5180 pfrsqrt mm1, mm6
5182 punpckldq mm0,mm1
5183 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs
5184 movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision
5185 pfmul mm0,mm0
5186 pfrsqit1 mm0,mm4
5187 pfrcpit2 mm0,mm2
5188 movq mm1,mm0
5189 pfmul mm0,mm0
5190 ;# mm0 now contains invsq, and mm1 invsqrt
5191 ;# do potential and fscal
5192 movq mm4, mm0
5193 pfmul mm4, mm0
5194 pfmul mm4, mm0 ;# mm4=rinvsix
5195 movq mm5, mm4
5196 pfmul mm5, mm5 ;# mm5=rinvtwelve
5198 pfmul mm3, mm1 ;# mm3 has vcoul for both interactions
5199 movq mm7, mm3 ;# use mm7 for sum to make fscal
5201 pfmul mm5, [esp + i1110_c12]
5202 pfmul mm4, [esp + i1110_c6]
5203 movq mm6, mm5 ;# mm6 is vnb12-vnb6
5204 pfsub mm6, mm4
5206 pfmul mm4, [esp + i1110_six]
5208 pfmul mm5, [esp + i1110_twelve]
5209 pfsub mm7,mm4
5210 pfadd mm7, mm5
5211 pfmul mm0, mm7 ;# mm0 is total fscal now
5213 prefetchw [esp + i1110_dx1] ;# prefetch i forces to cache
5215 ;# update vctot
5216 pfadd mm3, [esp + i1110_vctot] ;# add the earlier value
5217 movq [esp + i1110_vctot], mm3 ;# store the sum
5219 ;# spread fscalar to both positions
5220 movq mm1,mm0
5221 punpckldq mm0,mm0
5222 punpckhdq mm1,mm1
5224 ;# calc vector force
5225 prefetchw [edi + eax*4] ;# prefetch the 1st faction to cache
5226 movq mm2, [esp + i1110_dx1] ;# fetch dr
5227 movd mm3, [esp + i1110_dz1]
5229 ;# update vnbtot
5230 pfadd mm6, [esp + i1110_vnbtot] ;# add the earlier value
5231 movq [esp + i1110_vnbtot], mm6 ;# store the sum
5233 prefetchw [edi + ebx*4] ;# prefetch the 2nd faction to cache
5234 pfmul mm2, mm0 ;# mult by fs
5235 pfmul mm3, mm0
5237 movq mm4, [esp + i1110_dx2] ;# fetch dr
5238 movd mm5, [esp + i1110_dz2]
5239 pfmul mm4, mm1 ;# mult by fs
5240 pfmul mm5, mm1
5241 ;# update i forces
5243 movq mm0, [esp + i1110_fix]
5244 movd mm1, [esp + i1110_fiz]
5245 pfadd mm0, mm2
5246 pfadd mm1, mm3
5248 pfadd mm0, mm4
5249 pfadd mm1, mm5
5250 movq [esp + i1110_fix], mm0
5251 movd [esp + i1110_fiz], mm1
5252 ;# update j forces
5254 movq mm0, [edi + eax*4]
5255 movd mm1, [edi + eax*4 + 8]
5256 movq mm6, [edi + ebx*4]
5257 movd mm7, [edi + ebx*4 + 8]
5259 pfsub mm0, mm2
5260 pfsub mm1, mm3
5261 pfsub mm6, mm4
5262 pfsub mm7, mm5
5264 movq [edi + eax*4], mm0
5265 movd [edi + eax*4 +8], mm1
5266 movq [edi + ebx*4], mm6
5267 movd [edi + ebx*4 + 8], mm7
5269 ;# should we do one more iteration?
5270 sub dword ptr [esp + i1110_innerk], 2
5271 jl .i1110_finish_vdwc_inner
5272 jmp .i1110_unroll_vdwc_loop
5273 .i1110_finish_vdwc_inner:
5274 and dword ptr [esp + i1110_innerk], 1
5275 jnz .i1110_single_vdwc_inner
5276 jmp .i1110_updateouterdata_vdwc
5277 .i1110_single_vdwc_inner: ;# remainder path: one leftover j particle (only the low MMX halves carry data)
5278 ;# a single j particle iteration here - compare with the unrolled code for comments
5279 mov eax, [esp + i1110_innerjjnr]
5280 mov eax, [eax] ;# eax=jnr offset
5282 mov ecx, [ebp + i1110_charge]
5283 movd mm5, [esp + i1110_iq]
5284 movd mm3, [ecx + eax*4]
5285 pfmul mm3, mm5 ;# mm3=qq
5287 mov esi, [ebp + i1110_nbfp]
5288 mov ecx, [ebp + i1110_type]
5289 mov edx, [ecx + eax*4] ;# type [jnr1]
5290 shl edx, 1
5291 add edx, [esp + i1110_ntia] ;# tja = ntia + 2*type
5292 movd mm5, [esi + edx*4] ;# mm5 = 1st c6
5293 movq [esp + i1110_c6], mm5
5294 movd mm5, [esi + edx*4 + 4] ;# mm5 = 1st c12
5295 movq [esp + i1110_c12], mm5
5298 mov esi, [ebp + i1110_pos] ;# esi is restored to pos[] after the nbfp lookups
5299 lea eax, [eax + eax*2] ;# replace jnr with j3
5301 movq mm0, [esp + i1110_ix]
5302 movd mm1, [esp + i1110_iz]
5303 movq mm4, [esi + eax*4]
5304 movd mm5, [esi + eax*4 + 8]
5305 pfsubr mm4, mm0 ;# dr = ir - jr
5306 pfsubr mm5, mm1
5307 movq [esp + i1110_dx1], mm4
5308 pfmul mm4,mm4
5309 movd [esp + i1110_dz1], mm5
5310 pfmul mm5,mm5
5311 pfacc mm4, mm5
5312 pfacc mm4, mm5 ;# low half of mm4=rsq
5314 pfrsqrt mm0,mm4 ;# lookup inverse square root seed
5315 movq mm2,mm0
5316 pfmul mm0,mm0
5317 pfrsqit1 mm0,mm4
5318 pfrcpit2 mm0,mm2 ;# mm0=invsqrt
5319 movq mm1, mm0 ;# mm1=invsqrt (copy kept for vcoul)
5320 pfmul mm0, mm0 ;# mm0=invsq
5321 ;# calculate potentials and scalar force
5322 movq mm4, mm0
5323 pfmul mm4, mm0
5324 pfmul mm4, mm0 ;# mm4=rinvsix
5325 movq mm5, mm4
5326 pfmul mm5, mm5 ;# mm5=rinvtwelve
5328 pfmul mm3, mm1 ;# mm3 has vcoul for this single interaction
5329 movq mm7, mm3 ;# use mm7 for sum to make fscal
5331 pfmul mm5, [esp + i1110_c12]
5332 pfmul mm4, [esp + i1110_c6]
5333 movq mm6, mm5 ;# mm6 is vnb12-vnb6
5334 pfsub mm6, mm4
5336 pfmul mm4, [esp + i1110_six]
5338 pfmul mm5, [esp + i1110_twelve]
5339 pfsub mm7,mm4
5340 pfadd mm7, mm5
5341 pfmul mm0, mm7 ;# mm0 is total fscal now
5343 ;# update vctot
5344 pfadd mm3, [esp + i1110_vctot]
5345 movq [esp + i1110_vctot], mm3
5347 ;# update vnbtot
5348 pfadd mm6, [esp + i1110_vnbtot] ;# add the earlier value
5349 movq [esp + i1110_vnbtot], mm6 ;# store the sum
5351 ;# spread fscalar to both positions
5352 punpckldq mm0,mm0
5353 ;# calc vectorial force
5354 prefetchw [edi + eax*4] ;# prefetch faction to cache
5355 movq mm2, [esp + i1110_dx1]
5356 movd mm3, [esp + i1110_dz1]
5359 pfmul mm2, mm0
5360 pfmul mm3, mm0
5362 ;# update i particle force
5363 movq mm0, [esp + i1110_fix]
5364 movd mm1, [esp + i1110_fiz]
5365 pfadd mm0, mm2
5366 pfadd mm1, mm3
5367 movq [esp + i1110_fix], mm0
5368 movd [esp + i1110_fiz], mm1
5369 ;# update j particle force
5370 movq mm0, [edi + eax*4]
5371 movd mm1, [edi + eax *4+ 8]
5372 pfsub mm0, mm2
5373 pfsub mm1, mm3
5374 movq [edi + eax*4], mm0
5375 movd [edi + eax*4 +8], mm1
5376 ;# done!
5377 .i1110_updateouterdata_vdwc:
5378 mov ecx, [esp + i1110_ii3]
5380 movq mm6, [edi + ecx*4] ;# increment i force
5381 movd mm7, [edi + ecx*4 + 8]
5382 pfadd mm6, [esp + i1110_fix]
5383 pfadd mm7, [esp + i1110_fiz]
5384 movq [edi + ecx*4], mm6
5385 movd [edi + ecx*4 +8], mm7
5387 mov ebx, [ebp + i1110_fshift] ;# increment fshift force
5388 mov edx, [esp + i1110_is3]
5390 movq mm6, [ebx + edx*4]
5391 movd mm7, [ebx + edx*4 + 8]
5392 pfadd mm6, [esp + i1110_fix]
5393 pfadd mm7, [esp + i1110_fiz]
5394 movq [ebx + edx*4], mm6
5395 movd [ebx + edx*4 + 8], mm7
5397 ;# loop back to mno
5398 dec dword ptr [esp + i1110_nsvdwc]
5399 jz .i1110_testcoul
5400 jmp .i1110_mno_vdwc
5401 .i1110_testcoul:
5402 mov ecx, [esp + i1110_nscoul]
5403 cmp ecx, 0
5404 jnz .i1110_mno_coul
5405 jmp .i1110_testvdw
5406 .i1110_mno_coul:
5407 mov ebx, [esp + i1110_solnr]
5408 inc dword ptr [esp + i1110_solnr]
5409 mov edx, [ebp + i1110_charge]
5410 movd mm2, [edx + ebx*4] ;# mm2=charge[ii]
5411 pfmul mm2, [ebp + i1110_facel]
5412 punpckldq mm2,mm2 ;# spread to both halves
5413 movq [esp + i1110_iq], mm2 ;# iq =facel*charge[ii]
5415 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
5416 mov eax, [ebp + i1110_pos] ;# eax = base of pos[]
5417 mov [esp + i1110_ii3], ebx
5419 movq mm0, [eax + ebx*4]
5420 movd mm1, [eax + ebx*4 + 8]
5421 pfadd mm0, [esp + i1110_shX]
5422 pfadd mm1, [esp + i1110_shZ]
5423 movq [esp + i1110_ix], mm0
5424 movd [esp + i1110_iz], mm1
5426 ;# clear forces
5427 pxor mm7,mm7
5428 movq [esp + i1110_fix], mm7
5429 movd [esp + i1110_fiz], mm7
5431 mov ecx, [esp + i1110_innerjjnr0]
5432 mov [esp + i1110_innerjjnr], ecx
5433 mov edx, [esp + i1110_innerk0]
5434 sub edx, 2
5435 mov [esp + i1110_innerk], edx ;# number of innerloop atoms
5436 jge .i1110_unroll_coul_loop
5437 jmp .i1110_finish_coul_inner
5438 .i1110_unroll_coul_loop:
5439 ;# paired innerloop starts here
5440 mov ecx, [esp + i1110_innerjjnr] ;# pointer to jjnr[k]
5441 mov eax, [ecx]
5442 mov ebx, [ecx + 4] ;# eax/ebx=jnr
5443 add dword ptr [esp + i1110_innerjjnr], 8 ;# advance pointer (unrolled 2)
5444 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
5446 mov ecx, [ebp + i1110_charge] ;# base of charge[]
5447 movq mm5, [esp + i1110_iq]
5448 movd mm3, [ecx + eax*4] ;# charge[jnr1]
5449 movd mm7, [ecx + ebx*4] ;# charge[jnr2]
5450 punpckldq mm3,mm7 ;# move charge 2 to high part of mm3
5451 pfmul mm3,mm5 ;# mm3 now has qq for both particles
5453 lea eax, [eax + eax*2] ;# replace jnr with j3
5454 lea ebx, [ebx + ebx*2]
5456 movq mm0, [esp + i1110_ix]
5457 movd mm1, [esp + i1110_iz]
5458 movq mm4, [esi + eax*4] ;# fetch first j coordinates
5459 movd mm5, [esi + eax*4 + 8]
5460 pfsubr mm4,mm0 ;# dr = ir - jr
5461 pfsubr mm5,mm1
5462 movq [esp + i1110_dx1], mm4 ;# store dr
5463 movd [esp + i1110_dz1], mm5
5464 pfmul mm4,mm4 ;# square dx,dy,dz
5465 pfmul mm5,mm5
5466 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
5467 pfacc mm4, mm5 ;# first rsq in lower mm4
5469 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
5470 movd mm7, [esi + ebx*4 + 8]
5472 pfsubr mm6,mm0 ;# dr = ir - jr
5473 pfsubr mm7,mm1
5474 movq [esp + i1110_dx2], mm6 ;# store dr
5475 movd [esp + i1110_dz2], mm7
5476 pfmul mm6,mm6 ;# square dx,dy,dz
5477 pfmul mm7,mm7
5478 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
5479 pfacc mm6, mm7 ;# second rsq in lower mm6
5481 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
5482 pfrsqrt mm1, mm6
5484 punpckldq mm0,mm1
5485 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs
5486 movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision
5487 pfmul mm0,mm0
5488 pfrsqit1 mm0,mm4
5489 pfrcpit2 mm0,mm2
5490 movq mm1,mm0
5491 pfmul mm0,mm0
5492 ;# mm0 now contains invsq, and mm1 invsqrt
5493 ;# do potential and fscal
5494 prefetchw [esp + i1110_dx1] ;# prefetch i forces to cache
5496 pfmul mm3,mm1 ;# 3 has both vcoul
5497 pfmul mm0,mm3 ;# 0 has both fscal
5499 ;# update vctot
5501 pfadd mm3, [esp + i1110_vctot] ;# add the earlier value
5502 movq [esp + i1110_vctot], mm3 ;# store the sum
5503 ;# spread fscalar to both positions
5504 movq mm1,mm0
5505 punpckldq mm0,mm0
5506 punpckhdq mm1,mm1
5507 ;# calc vector force
5508 prefetchw [edi + eax*4] ;# prefetch the 1st faction to cache
5509 movq mm2, [esp + i1110_dx1] ;# fetch dr
5510 movd mm3, [esp + i1110_dz1]
5511 prefetchw [edi + ebx*4] ;# prefetch the 2nd faction to cache
5512 pfmul mm2, mm0 ;# mult by fs
5513 pfmul mm3, mm0
5515 movq mm4, [esp + i1110_dx2] ;# fetch dr
5516 movd mm5, [esp + i1110_dz2]
5517 pfmul mm4, mm1 ;# mult by fs
5518 pfmul mm5, mm1
5519 ;# update i forces
5521 movq mm0, [esp + i1110_fix]
5522 movd mm1, [esp + i1110_fiz]
5523 pfadd mm0, mm2
5524 pfadd mm1, mm3
5526 pfadd mm0, mm4
5527 pfadd mm1, mm5
5528 movq [esp + i1110_fix], mm0
5529 movd [esp + i1110_fiz], mm1
5530 ;# update j forces
5532 movq mm0, [edi + eax*4]
5533 movd mm1, [edi + eax*4 + 8]
5534 movq mm6, [edi + ebx*4]
5535 movd mm7, [edi + ebx*4 + 8]
5537 pfsub mm0, mm2
5538 pfsub mm1, mm3
5539 pfsub mm6, mm4
5540 pfsub mm7, mm5
5542 movq [edi + eax*4], mm0
5543 movd [edi + eax*4 +8], mm1
5544 movq [edi + ebx*4], mm6
5545 movd [edi + ebx*4 + 8], mm7
5547 ;# should we do one more iteration?
5548 sub dword ptr [esp + i1110_innerk], 2
5549 jl .i1110_finish_coul_inner
5550 jmp .i1110_unroll_coul_loop
5551 .i1110_finish_coul_inner:
5552 and dword ptr [esp + i1110_innerk], 1
5553 jnz .i1110_single_coul_inner
5554 jmp .i1110_updateouterdata_coul
5555 .i1110_single_coul_inner:
5556 ;# a single j particle iteration here - compare with the unrolled code for comments
5557 mov eax, [esp + i1110_innerjjnr]
5558 mov eax, [eax] ;# eax=jnr offset
5560 mov ecx, [ebp + i1110_charge]
5561 movd mm6, [esp + i1110_iq]
5562 movd mm7, [ecx + eax*4]
5563 pfmul mm6, mm7 ;# mm6=qq
5565 lea eax, [eax + eax*2]
5567 movq mm0, [esp + i1110_ix]
5568 movd mm1, [esp + i1110_iz]
5569 movq mm2, [esi + eax*4]
5570 movd mm3, [esi + eax*4 + 8]
5571 pfsub mm0, mm2
5572 pfsub mm1, mm3
5573 movq [esp + i1110_dx1], mm0
5574 pfmul mm0,mm0
5575 movd [esp + i1110_dz1], mm1
5576 pfmul mm1,mm1
5577 pfacc mm0, mm1
5578 pfacc mm0, mm1 ;# mm0=rsq
5580 pfrsqrt mm1,mm0
5581 movq mm2,mm1
5582 pfmul mm1,mm1
5583 pfrsqit1 mm1,mm0
5584 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
5585 movq mm4, mm1
5586 pfmul mm4, mm4 ;# mm4=invsq
5587 ;# calculate potential and scalar force
5588 pfmul mm6, mm1 ;# mm6=vcoul
5589 pfmul mm4, mm6 ;# mm4=fscalar
5590 ;# update vctot
5591 pfadd mm6, [esp + i1110_vctot]
5592 movq [esp + i1110_vctot], mm6
5593 ;# spread fscalar to both positions
5594 punpckldq mm4,mm4
5595 ;# calc vectorial force
5596 prefetchw [edi + eax*4] ;# prefetch faction to cache
5597 movq mm0, [esp + i1110_dx1]
5598 movd mm1, [esp + i1110_dz1]
5599 pfmul mm0, mm4
5600 pfmul mm1, mm4
5601 ;# update i particle force
5602 movq mm2, [esp + i1110_fix]
5603 movd mm3, [esp + i1110_fiz]
5604 pfadd mm2, mm0
5605 pfadd mm3, mm1
5606 movq [esp + i1110_fix], mm2
5607 movd [esp + i1110_fiz], mm3
5608 ;# update j particle force
5609 movq mm2, [edi + eax*4]
5610 movd mm3, [edi + eax *4+ 8]
5611 pfsub mm2, mm0
5612 pfsub mm3, mm1
5613 movq [edi + eax*4], mm2
5614 movd [edi + eax*4 +8], mm3
5615 ;# done!
5616 .i1110_updateouterdata_coul:
5617 mov ecx, [esp + i1110_ii3]
5619 movq mm6, [edi + ecx*4] ;# increment i force
5620 movd mm7, [edi + ecx*4 + 8]
5621 pfadd mm6, [esp + i1110_fix]
5622 pfadd mm7, [esp + i1110_fiz]
5623 movq [edi + ecx*4], mm6
5624 movd [edi + ecx*4 +8], mm7
5626 mov ebx, [ebp + i1110_fshift] ;# increment fshift force
5627 mov edx, [esp + i1110_is3]
5629 movq mm6, [ebx + edx*4]
5630 movd mm7, [ebx + edx*4 + 8]
5631 pfadd mm6, [esp + i1110_fix]
5632 pfadd mm7, [esp + i1110_fiz]
5633 movq [ebx + edx*4], mm6
5634 movd [ebx + edx*4 + 8], mm7
5636 ;# loop back to mno
5637 dec dword ptr [esp + i1110_nscoul]
5638 jz .i1110_testvdw
5639 jmp .i1110_mno_coul
5640 .i1110_testvdw:
5641 mov ecx, [esp + i1110_nsvdw]
5642 cmp ecx, 0
5643 jnz .i1110_mno_vdw
5644 jmp .i1110_last_mno
5645 .i1110_mno_vdw:
5646 mov ebx, [esp + i1110_solnr]
5647 inc dword ptr [esp + i1110_solnr]
5649 mov edx, [ebp + i1110_type]
5650 mov edx, [edx + ebx*4]
5651 imul edx, [ebp + i1110_ntype]
5652 shl edx, 1
5653 mov [esp + i1110_ntia], edx
5655 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
5656 mov eax, [ebp + i1110_pos] ;# eax = base of pos[]
5657 mov [esp + i1110_ii3], ebx
5659 movq mm0, [eax + ebx*4]
5660 movd mm1, [eax + ebx*4 + 8]
5661 pfadd mm0, [esp + i1110_shX]
5662 pfadd mm1, [esp + i1110_shZ]
5663 movq [esp + i1110_ix], mm0
5664 movd [esp + i1110_iz], mm1
5666 ;# clear forces
5667 pxor mm7,mm7
5668 movq [esp + i1110_fix], mm7
5669 movd [esp + i1110_fiz], mm7
5671 mov ecx, [esp + i1110_innerjjnr0]
5672 mov [esp + i1110_innerjjnr], ecx
5673 mov edx, [esp + i1110_innerk0]
5674 sub edx, 2
5675 mov [esp + i1110_innerk], edx ;# number of innerloop atoms
5676 jge .i1110_unroll_vdw_loop
5677 jmp .i1110_finish_vdw_inner
5678 .i1110_unroll_vdw_loop:
5679 ;# paired innerloop starts here
5680 mov ecx, [esp + i1110_innerjjnr] ;# pointer to jjnr[k]
5681 mov eax, [ecx]
5682 mov ebx, [ecx + 4] ;# eax/ebx=jnr
5683 add dword ptr [esp + i1110_innerjjnr], 8 ;# advance pointer (unrolled 2)
5684 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
5686 mov ecx, [ebp + i1110_type]
5687 mov edx, [ecx + eax*4] ;# type [jnr1]
5688 mov ecx, [ecx + ebx*4] ;# type [jnr2]
5690 mov esi, [ebp + i1110_nbfp] ;# base of nbfp
5691 shl edx, 1
5692 shl ecx, 1
5693 add edx, [esp + i1110_ntia] ;# tja = ntia + 2*type
5694 add ecx, [esp + i1110_ntia]
5696 movq mm5, [esi + edx*4] ;# mm5 = 1st c6 / c12
5697 movq mm7, [esi + ecx*4] ;# mm7 = 2nd c6 / c12
5698 movq mm6,mm5
5699 punpckldq mm5,mm7 ;# mm5 = 1st c6 / 2nd c6
5700 punpckhdq mm6,mm7 ;# mm6 = 1st c12 / 2nd c12
5701 movq [esp + i1110_c6], mm5
5702 movq [esp + i1110_c12], mm6
5704 lea eax, [eax + eax*2] ;# replace jnr with j3
5705 lea ebx, [ebx + ebx*2]
5707 mov esi, [ebp + i1110_pos]
5709 movq mm0, [esp + i1110_ix]
5710 movd mm1, [esp + i1110_iz]
5711 movq mm4, [esi + eax*4] ;# fetch first j coordinates
5712 movd mm5, [esi + eax*4 + 8]
5713 pfsubr mm4,mm0 ;# dr = ir - jr
5714 pfsubr mm5,mm1
5715 movq [esp + i1110_dx1], mm4 ;# store dr
5716 movd [esp + i1110_dz1], mm5
5717 pfmul mm4,mm4 ;# square dx,dy,dz
5718 pfmul mm5,mm5
5719 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
5720 pfacc mm4, mm5 ;# first rsq in lower mm4
5722 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
5723 movd mm7, [esi + ebx*4 + 8]
5725 pfsubr mm6,mm0 ;# dr = ir - jr
5726 pfsubr mm7,mm1
5727 movq [esp + i1110_dx2], mm6 ;# store dr
5728 movd [esp + i1110_dz2], mm7
5729 pfmul mm6,mm6 ;# square dx,dy,dz
5730 pfmul mm7,mm7
5731 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
5732 pfacc mm6, mm7 ;# second rsq in lower mm6
5734 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
5735 pfrsqrt mm1, mm6
5737 punpckldq mm0,mm1
5738 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs
5739 movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision
5740 pfmul mm0,mm0
5741 pfrsqit1 mm0,mm4
5742 pfrcpit2 mm0,mm2
5743 movq mm1,mm0
5744 pfmul mm0,mm0
5745 ;# mm0 now contains invsq, and mm1 invsqrt
5746 ;# do potential and fscal
5747 movq mm4, mm0
5748 pfmul mm4, mm0
5749 pfmul mm4, mm0 ;# mm4=rinvsix
5750 movq mm5, mm4
5751 pfmul mm5, mm5 ;# mm5=rinvtwelve
5753 pfmul mm5, [esp + i1110_c12]
5754 pfmul mm4, [esp + i1110_c6]
5755 movq mm6, mm5 ;# mm6 is vnb12-vnb6
5756 pfsub mm6, mm4
5758 pfmul mm4, [esp + i1110_six]
5760 pfmul mm5, [esp + i1110_twelve]
5761 movq mm7, mm5
5762 pfsub mm7,mm4
5763 pfmul mm0, mm7 ;# mm0 is total fscal now
5765 prefetchw [esp + i1110_dx1] ;# prefetch i forces to cache
5767 ;# spread fscalar to both positions
5768 movq mm1,mm0
5769 punpckldq mm0,mm0
5770 punpckhdq mm1,mm1
5772 ;# calc vector force
5773 prefetchw [edi + eax*4] ;# prefetch the 1st faction to cache
5774 movq mm2, [esp + i1110_dx1] ;# fetch dr
5775 movd mm3, [esp + i1110_dz1]
5777 ;# update vnbtot
5778 pfadd mm6, [esp + i1110_vnbtot] ;# add the earlier value
5779 movq [esp + i1110_vnbtot], mm6 ;# store the sum
5781 prefetchw [edi + ebx*4] ;# prefetch the 2nd faction to cache
5782 pfmul mm2, mm0 ;# mult by fs
5783 pfmul mm3, mm0
5785 movq mm4, [esp + i1110_dx2] ;# fetch dr
5786 movd mm5, [esp + i1110_dz2]
5787 pfmul mm4, mm1 ;# mult by fs
5788 pfmul mm5, mm1
5789 ;# update i forces
5791 movq mm0, [esp + i1110_fix]
5792 movd mm1, [esp + i1110_fiz]
5793 pfadd mm0, mm2
5794 pfadd mm1, mm3
5796 pfadd mm0, mm4
5797 pfadd mm1, mm5
5798 movq [esp + i1110_fix], mm0
5799 movd [esp + i1110_fiz], mm1
5800 ;# update j forces
5802 movq mm0, [edi + eax*4]
5803 movd mm1, [edi + eax*4 + 8]
5804 movq mm6, [edi + ebx*4]
5805 movd mm7, [edi + ebx*4 + 8]
5807 pfsub mm0, mm2
5808 pfsub mm1, mm3
5809 pfsub mm6, mm4
5810 pfsub mm7, mm5
5812 movq [edi + eax*4], mm0
5813 movd [edi + eax*4 +8], mm1
5814 movq [edi + ebx*4], mm6
5815 movd [edi + ebx*4 + 8], mm7
5817 ;# should we do one more iteration?
5818 sub dword ptr [esp + i1110_innerk], 2
5819 jl .i1110_finish_vdw_inner
5820 jmp .i1110_unroll_vdw_loop
5821 .i1110_finish_vdw_inner:
5822 and dword ptr [esp + i1110_innerk], 1
5823 jnz .i1110_single_vdw_inner
5824 jmp .i1110_updateouterdata_vdw
5825 .i1110_single_vdw_inner:
5826 ;# a single j particle iteration here - compare with the unrolled code for comments
5827 mov eax, [esp + i1110_innerjjnr]
5828 mov eax, [eax] ;# eax=jnr offset
5830 mov esi, [ebp + i1110_nbfp]
5831 mov ecx, [ebp + i1110_type]
5832 mov edx, [ecx + eax*4] ;# type [jnr1]
5833 shl edx, 1
5834 add edx, [esp + i1110_ntia] ;# tja = ntia + 2*type
5835 movd mm5, [esi + edx*4] ;# mm5 = 1st c6
5836 movq [esp + i1110_c6], mm5
5837 movd mm5, [esi + edx*4 + 4] ;# mm5 = 1st c12
5838 movq [esp + i1110_c12], mm5
5841 mov esi, [ebp + i1110_pos]
5842 lea eax, [eax + eax*2]
5844 movq mm0, [esp + i1110_ix]
5845 movd mm1, [esp + i1110_iz]
5846 movq mm4, [esi + eax*4]
5847 movd mm5, [esi + eax*4 + 8]
5848 pfsubr mm4, mm0
5849 pfsubr mm5, mm1
5850 movq [esp + i1110_dx1], mm4
5851 pfmul mm4,mm4
5852 movd [esp + i1110_dz1], mm5
5853 pfmul mm5,mm5
5854 pfacc mm4, mm5
5855 pfacc mm4, mm5 ;# mm0=rsq
5857 pfrsqrt mm0,mm4
5858 movq mm2,mm0
5859 pfmul mm0,mm0
5860 pfrsqit1 mm0,mm4
5861 pfrcpit2 mm0,mm2 ;# mm1=invsqrt
5862 movq mm1, mm0
5863 pfmul mm0, mm0 ;# mm0=invsq
5864 ;# calculate potentials and scalar force
5865 movq mm4, mm0
5866 pfmul mm4, mm0
5867 pfmul mm4, mm0 ;# mm4=rinvsix
5868 movq mm5, mm4
5869 pfmul mm5, mm5 ;# mm5=rinvtwelve
5871 pfmul mm5, [esp + i1110_c12]
5872 pfmul mm4, [esp + i1110_c6]
5873 movq mm6, mm5 ;# mm6 is vnb12-vnb6
5874 pfsub mm6, mm4
5876 pfmul mm4, [esp + i1110_six]
5878 pfmul mm5, [esp + i1110_twelve]
5879 movq mm7, mm5
5880 pfsub mm7,mm4
5881 pfmul mm0, mm7 ;# mm0 is total fscal now
5883 ;# update vnbtot
5884 pfadd mm6, [esp + i1110_vnbtot] ;# add the earlier value
5885 movq [esp + i1110_vnbtot], mm6 ;# store the sum
5887 ;# spread fscalar to both positions
5888 punpckldq mm0,mm0
5889 ;# calc vectorial force
5890 prefetchw [edi + eax*4] ;# prefetch faction to cache
5891 movq mm2, [esp + i1110_dx1]
5892 movd mm3, [esp + i1110_dz1]
5895 pfmul mm2, mm0
5896 pfmul mm3, mm0
5898 ;# update i particle force
5899 movq mm0, [esp + i1110_fix]
5900 movd mm1, [esp + i1110_fiz]
5901 pfadd mm0, mm2
5902 pfadd mm1, mm3
5903 movq [esp + i1110_fix], mm0
5904 movd [esp + i1110_fiz], mm1
5905 ;# update j particle force
5906 movq mm0, [edi + eax*4]
5907 movd mm1, [edi + eax *4+ 8]
5908 pfsub mm0, mm2
5909 pfsub mm1, mm3
5910 movq [edi + eax*4], mm0
5911 movd [edi + eax*4 +8], mm1
5912 ;# done!
5913 .i1110_updateouterdata_vdw:
5914 mov ecx, [esp + i1110_ii3]
5916 movq mm6, [edi + ecx*4] ;# increment i force
5917 movd mm7, [edi + ecx*4 + 8]
5918 pfadd mm6, [esp + i1110_fix]
5919 pfadd mm7, [esp + i1110_fiz]
5920 movq [edi + ecx*4], mm6
5921 movd [edi + ecx*4 +8], mm7
5923 mov ebx, [ebp + i1110_fshift] ;# increment fshift force
5924 mov edx, [esp + i1110_is3]
5926 movq mm6, [ebx + edx*4]
5927 movd mm7, [ebx + edx*4 + 8]
5928 pfadd mm6, [esp + i1110_fix]
5929 pfadd mm7, [esp + i1110_fiz]
5930 movq [ebx + edx*4], mm6
5931 movd [ebx + edx*4 + 8], mm7
5933 ;# loop back to mno
5934 dec dword ptr [esp + i1110_nsvdw]
5935 jz .i1110_last_mno
5936 jmp .i1110_mno_vdw
5938 .i1110_last_mno:
5939 mov edx, [ebp + i1110_gid] ;# get group index for this i particle
5940 mov edx, [edx]
5941 add dword ptr [ebp + i1110_gid], 4 ;# advance pointer
5943 movq mm7, [esp + i1110_vctot]
5944 pfacc mm7,mm7 ;# get and sum the two parts of total potential
5946 mov eax, [ebp + i1110_Vc]
5947 movd mm6, [eax + edx*4]
5948 pfadd mm6, mm7
5949 movd [eax + edx*4], mm6 ;# increment vc[gid]
5951 movq mm7, [esp + i1110_vnbtot]
5952 pfacc mm7,mm7 ;# get and sum the two parts of total potential
5954 mov eax, [ebp + i1110_Vnb]
5955 movd mm6, [eax + edx*4]
5956 pfadd mm6, mm7
5957 movd [eax + edx*4], mm6 ;# increment vc[gid]
5958 ;# finish if last
5959 mov ecx, [ebp + i1110_nri]
5960 dec ecx
5961 jecxz .i1110_end
5962 ;# not last, iterate once more!
5963 mov [ebp + i1110_nri], ecx
5964 jmp .i1110_outer
5965 .i1110_end:
5966 femms
5967 add esp, 160
5968 pop edi
5969 pop esi
5970 pop edx
5971 pop ecx
5972 pop ebx
5973 pop eax
5974 leave
;# ---------------------------------------------------------------------
;# inl1120_3dnow
;#
;# Coulomb + Lennard-Jones inner loop for a three-atom i particle
;# (O, H1, H2 - stored consecutively in pos[]/faction[]) interacting
;# with single j atoms. Lennard-Jones is evaluated for the O atom only,
;# Coulomb for all three atoms.
;#
;# Arguments are read from the caller's stack frame at [ebp + 8] onwards
;# (see the i1120_* argument offsets below):
;#   nri, ntype         integers passed by value
;#   facel              single precision float passed by value
;#   all others         pointers to int / float arrays
;# The argument slots shift, iinr, jindex, gid and nri are advanced /
;# decremented in place while iterating.
;#
;# Results:
;#   faction[] -= force on each j atom, += force on the i atoms
;#   fshift[3*shift[n]..] += total force on the i particle
;#   Vc[gid[n]]  += Coulomb energy, Vnb[gid[n]] += Lennard-Jones energy
;#
;# Registers eax, ebx, ecx, edx, esi, edi and ebp are saved and restored.
;# femms is issued on entry and exit.
;#
;# Assumptions visible in the code:
;#   * nri >= 1 and every i entry has at least one j atom (both loops
;#     execute their body before testing the counter).
;#   * iqO, iqH and ntia are computed once from iinr[0], so all i
;#     particles presumably share charges and atom type - confirm
;#     against the caller.
;#   * mm_six / mm_twelve are 8-byte constants defined elsewhere in this
;#     file (presumably packed 6.0 / 12.0 - confirm).
;# ---------------------------------------------------------------------
.globl inl1120_3dnow
.globl _inl1120_3dnow
inl1120_3dnow:
_inl1120_3dnow:
;# offsets of the stack arguments relative to ebp
.equiv i1120_nri, 8
.equiv i1120_iinr, 12
.equiv i1120_jindex, 16
.equiv i1120_jjnr, 20
.equiv i1120_shift, 24
.equiv i1120_shiftvec, 28
.equiv i1120_fshift, 32
.equiv i1120_gid, 36
.equiv i1120_pos, 40
.equiv i1120_faction, 44
.equiv i1120_charge, 48
.equiv i1120_facel, 52
.equiv i1120_Vc, 56
.equiv i1120_type, 60
.equiv i1120_ntype, 64
.equiv i1120_nbfp, 68
.equiv i1120_Vnb, 72
;# stack offsets for local variables (relative to esp).
;# The 8-byte *H slots hold two packed floats: H1 in the low half,
;# H2 in the high half.
.equiv i1120_is3, 0
.equiv i1120_ii3, 4
.equiv i1120_ixO, 8
.equiv i1120_iyO, 12
.equiv i1120_izO, 16
.equiv i1120_ixH, 20
.equiv i1120_iyH, 28
.equiv i1120_izH, 36
.equiv i1120_iqO, 44
.equiv i1120_iqH, 52
.equiv i1120_vctot, 60
.equiv i1120_vnbtot, 68
.equiv i1120_c6, 76
.equiv i1120_c12, 84
.equiv i1120_six, 92
.equiv i1120_twelve, 100
.equiv i1120_ntia, 108
.equiv i1120_innerjjnr, 116
.equiv i1120_innerk, 120
.equiv i1120_fixO, 124
.equiv i1120_fiyO, 128
.equiv i1120_fizO, 132
.equiv i1120_fixH, 136
.equiv i1120_fiyH, 144
.equiv i1120_fizH, 152
.equiv i1120_dxO, 160
.equiv i1120_dyO, 164
.equiv i1120_dzO, 168
.equiv i1120_dxH, 172
.equiv i1120_dyH, 180
.equiv i1120_dzH, 188
        ;# prologue: set up frame pointer, save all integer registers used
        push ebp
        mov ebp,esp
        push eax
        push ebx
        push ecx
        push edx
        push esi
        push edi
        sub esp, 196                            ;# local stack space
        femms
        ;# assume we have at least one i particle - start directly

        ;# one-time setup from the first i particle: charges and type row
        mov ecx, [ebp + i1120_iinr]             ;# ecx = pointer into iinr[]
        mov ebx, [ecx]                          ;# ebx=ii

        mov edx, [ebp + i1120_charge]
        movd mm1, [ebp + i1120_facel]
        movd mm2, [edx + ebx*4]                 ;# mm2=charge[ii0]
        pfmul mm2, mm1
        movq [esp + i1120_iqO], mm2             ;# iqO = facel*charge[ii] (low half, high half 0)

        movd mm2, [edx + ebx*4 + 4]             ;# mm2=charge[ii0+1]
        pfmul mm2, mm1
        punpckldq mm2,mm2                       ;# spread to both halves
        movq [esp + i1120_iqH], mm2             ;# iqH = facel*charge[ii0+1]

        mov edx, [ebp + i1120_type]
        mov ecx, [edx + ebx*4]
        shl ecx, 1
        imul ecx, [ebp + i1120_ntype]           ;# ecx = ntia = 2*ntype*type[ii0]
        mov [esp + i1120_ntia], ecx

        ;# keep local copies of the constants used to scale the LJ force terms
        movq mm3, [mm_six]
        movq mm4, [mm_twelve]
        movq [esp + i1120_six], mm3
        movq [esp + i1120_twelve], mm4
.i1120_outer:
        ;# ---- per i particle setup -------------------------------------
        mov eax, [ebp + i1120_shift]            ;# eax = pointer into shift[]
        mov ebx, [eax]                          ;# ebx=shift[n]
        add dword ptr [ebp + i1120_shift], 4    ;# advance pointer one step

        lea ebx, [ebx + ebx*2]                  ;# ebx=3*is
        mov [esp + i1120_is3],ebx               ;# store is3

        mov eax, [ebp + i1120_shiftvec]         ;# eax = base of shiftvec[]

        movq mm5, [eax + ebx*4]                 ;# move shX/shY to mm5 and shZ to mm6.
        movd mm6, [eax + ebx*4 + 8]
        movq mm0, mm5
        movq mm1, mm5
        movq mm2, mm6
        punpckldq mm0,mm0                       ;# also expand shX,Y,Z in mm0--mm2.
        punpckhdq mm1,mm1
        punpckldq mm2,mm2

        mov ecx, [ebp + i1120_iinr]             ;# ecx = pointer into iinr[]
        add dword ptr [ebp + i1120_iinr], 4     ;# advance pointer
        mov ebx, [ecx]                          ;# ebx=ii

        lea ebx, [ebx + ebx*2]                  ;# ebx = 3*ii=ii3
        mov eax, [ebp + i1120_pos]              ;# eax = base of pos[]

        pfadd mm5, [eax + ebx*4]                ;# ix = shX + posX (and iy too)
        movd mm7, [eax + ebx*4 + 8]             ;# can't use direct memory add for 4 bytes (iz)
        mov [esp + i1120_ii3], ebx              ;# (use mm7 as temp. storage for iz.)
        pfadd mm6, mm7
        movq [esp + i1120_ixO], mm5
        movq [esp + i1120_izO], mm6             ;# 8-byte store spills into ixH, which is written below

        movd mm3, [eax + ebx*4 + 12]
        movd mm4, [eax + ebx*4 + 16]
        movd mm5, [eax + ebx*4 + 20]
        punpckldq mm3, [eax + ebx*4 + 24]
        punpckldq mm4, [eax + ebx*4 + 28]
        punpckldq mm5, [eax + ebx*4 + 32]       ;# coords of H1 in low mm3-mm5, H2 in high

        pfadd mm0, mm3
        pfadd mm1, mm4
        pfadd mm2, mm5
        movq [esp + i1120_ixH], mm0
        movq [esp + i1120_iyH], mm1
        movq [esp + i1120_izH], mm2

        ;# clear vctot, vnbtot and i forces
        pxor mm7,mm7
        movq [esp + i1120_vctot], mm7
        movq [esp + i1120_vnbtot], mm7
        movq [esp + i1120_fixO], mm7
        movd [esp + i1120_fizO], mm7
        movq [esp + i1120_fixH], mm7
        movq [esp + i1120_fiyH], mm7
        movq [esp + i1120_fizH], mm7

        mov eax, [ebp + i1120_jindex]
        mov ecx, [eax]                          ;# jindex[n]
        mov edx, [eax + 4]                      ;# jindex[n+1]
        add dword ptr [ebp + i1120_jindex], 4
        sub edx, ecx                            ;# number of innerloop atoms
        mov [esp + i1120_innerk], edx           ;# number of innerloop atoms

        ;# esi / edi stay fixed as pos[] / faction[] bases for the inner loop
        mov esi, [ebp + i1120_pos]
        mov edi, [ebp + i1120_faction]
        mov eax, [ebp + i1120_jjnr]
        shl ecx, 2
        add eax, ecx
        mov [esp + i1120_innerjjnr], eax        ;# pointer to jjnr[nj0]
.i1120_inner_loop:
        ;# ---- a single j particle iteration ----------------------------
        ;# Load the list pointer into ecx first so that the prefetch below
        ;# really targets the upcoming jjnr[] entries (ecx is reloaded
        ;# right afterwards, so nothing else depends on it).
        mov ecx, [esp + i1120_innerjjnr]        ;# ecx = pointer to jjnr[k]
        mov eax, [ecx]                          ;# eax=jnr offset
        add dword ptr [esp + i1120_innerjjnr], 4 ;# advance pointer
        prefetch [ecx + 16]                     ;# prefetch data - trial and error says 16 is best

        mov ecx, [ebp + i1120_charge]
        movd mm7, [ecx + eax*4]
        punpckldq mm7,mm7
        movq mm6,mm7
        pfmul mm6, [esp + i1120_iqO]
        pfmul mm7, [esp + i1120_iqH]            ;# mm6=qqO, mm7=qqH

        mov ecx, [ebp + i1120_type]
        mov edx, [ecx + eax*4]                  ;# type [jnr]
        mov ecx, [ebp + i1120_nbfp]
        shl edx, 1
        add edx, [esp + i1120_ntia]             ;# tja = ntia + 2*type
        movd mm5, [ecx + edx*4]                 ;# mm5 = 1st c6
        movq [esp + i1120_c6], mm5
        movd mm5, [ecx + edx*4 + 4]             ;# mm5 = 1st c12
        movq [esp + i1120_c12], mm5

        lea eax, [eax + eax*2]                  ;# replace jnr with j3

        movq mm0, [esi + eax*4]
        movd mm1, [esi + eax*4 + 8]
        ;# copy & expand to mm2-mm4 for the H interactions
        movq mm2, mm0
        movq mm3, mm0
        movq mm4, mm1
        punpckldq mm2,mm2
        punpckhdq mm3,mm3
        punpckldq mm4,mm4                       ;# mm2-mm4 is jx-jz in both halves

        pfsubr mm0, [esp + i1120_ixO]
        pfsubr mm1, [esp + i1120_izO]           ;# only the low half (dz) is meaningful

        movq [esp + i1120_dxO], mm0
        pfmul mm0,mm0
        movd [esp + i1120_dzO], mm1
        pfmul mm1,mm1
        pfacc mm0, mm1
        pfadd mm0, mm1                          ;# mm0=rsqO (low half)

        pfsubr mm2, [esp + i1120_ixH]
        pfsubr mm3, [esp + i1120_iyH]
        pfsubr mm4, [esp + i1120_izH]           ;# mm2-mm4 is dxH-dzH

        movq [esp + i1120_dxH], mm2
        movq [esp + i1120_dyH], mm3
        movq [esp + i1120_dzH], mm4
        pfmul mm2,mm2
        pfmul mm3,mm3
        pfmul mm4,mm4

        pfadd mm3,mm2
        pfadd mm3,mm4                           ;# mm3=rsqH

        ;# 1/sqrt(rsqO): table seed refined with pfrsqit1/pfrcpit2
        pfrsqrt mm1,mm0

        movq mm2,mm1
        pfmul mm1,mm1
        pfrsqit1 mm1,mm0
        pfrcpit2 mm1,mm2                        ;# mm1=invsqrt
        movq mm4, mm1
        pfmul mm4, mm4                          ;# mm4=invsq

        movq mm0, mm4
        pfmul mm0, mm4
        pfmul mm0, mm4                          ;# mm0=rinvsix
        movq mm2, mm0
        pfmul mm2, mm2                          ;# mm2=rinvtwelve

        ;# calculate potential and scalar force
        pfmul mm6, mm1                          ;# mm6=vcoul
        movq mm1, mm6                           ;# use mm1 for fscal sum

        ;# LJ for the oxygen
        pfmul mm0, [esp + i1120_c6]
        pfmul mm2, [esp + i1120_c12]

        ;# calc nb potential
        movq mm5, mm2
        pfsub mm5, mm0

        ;# calc nb force
        pfmul mm0, [esp + i1120_six]
        pfmul mm2, [esp + i1120_twelve]

        ;# increment scalar force
        pfsub mm1, mm0
        pfadd mm1, mm2
        pfmul mm4, mm1                          ;# total scalar force on oxygen.

        ;# update nb potential
        pfadd mm5, [esp + i1120_vnbtot]
        movq [esp + i1120_vnbtot], mm5

        ;# 1/sqrt(rsqH) for both hydrogens: pfrsqrt only uses the low
        ;# half, so swap halves to get the second seed and swap back.
        pfrsqrt mm5, mm3
        pswapd mm3,mm3
        pfrsqrt mm2, mm3
        pswapd mm3,mm3
        punpckldq mm5,mm2                       ;# seeds are in mm5 now, and rsq in mm3.

        movq mm2, mm5
        pfmul mm5,mm5
        pfrsqit1 mm5,mm3
        pfrcpit2 mm5,mm2                        ;# mm5=invsqrt
        movq mm3,mm5
        pfmul mm3,mm3                           ;# mm3=invsq
        pfmul mm7, mm5                          ;# mm7=vcoul
        pfmul mm3, mm7                          ;# mm3=fscal for the two H's.

        ;# update vctot
        pfadd mm7, mm6
        pfadd mm7, [esp + i1120_vctot]
        movq [esp + i1120_vctot], mm7

        ;# spread oxygen fscalar to both positions
        punpckldq mm4,mm4
        ;# calc vectorial force for O
        prefetchw [edi + eax*4]                 ;# prefetch faction to cache
        movq mm0, [esp + i1120_dxO]
        movd mm1, [esp + i1120_dzO]
        pfmul mm0, mm4
        pfmul mm1, mm4

        ;# calc vectorial force for H's
        movq mm5, [esp + i1120_dxH]
        movq mm6, [esp + i1120_dyH]
        movq mm7, [esp + i1120_dzH]
        pfmul mm5, mm3
        pfmul mm6, mm3
        pfmul mm7, mm3

        ;# update iO particle force
        movq mm2, [esp + i1120_fixO]
        movd mm3, [esp + i1120_fizO]
        pfadd mm2, mm0
        pfadd mm3, mm1
        movq [esp + i1120_fixO], mm2
        movd [esp + i1120_fizO], mm3

        ;# update iH forces
        movq mm2, [esp + i1120_fixH]
        movq mm3, [esp + i1120_fiyH]
        movq mm4, [esp + i1120_fizH]
        pfadd mm2, mm5
        pfadd mm3, mm6
        pfadd mm4, mm7
        movq [esp + i1120_fixH], mm2
        movq [esp + i1120_fiyH], mm3
        movq [esp + i1120_fizH], mm4

        ;# pack j forces from H in the same form as the oxygen force.
        pfacc mm5, mm6                          ;# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
        pfacc mm7, mm7                          ;# mm7(l)=fjz(H1+ h2)

        pfadd mm0, mm5                          ;# add up total force on j particle.
        pfadd mm1, mm7

        ;# update j particle force
        movq mm2, [edi + eax*4]
        movd mm3, [edi + eax*4 + 8]
        pfsub mm2, mm0
        pfsub mm3, mm1
        movq [edi + eax*4], mm2
        movd [edi + eax*4 +8], mm3

        ;# done - one more?
        dec dword ptr [esp + i1120_innerk]
        jz .i1120_updateouterdata
        jmp .i1120_inner_loop
.i1120_updateouterdata:
        ;# ---- write back accumulated i forces and energies --------------
        mov ecx, [esp + i1120_ii3]

        movq mm6, [edi + ecx*4]                 ;# increment iO force
        movd mm7, [edi + ecx*4 + 8]
        pfadd mm6, [esp + i1120_fixO]
        pfadd mm7, [esp + i1120_fizO]           ;# only the low half is stored below
        movq [edi + ecx*4], mm6
        movd [edi + ecx*4 +8], mm7

        ;# repack the H forces from (H1,H2) per component to (x,y) per atom
        movq mm0, [esp + i1120_fixH]
        movq mm3, [esp + i1120_fiyH]
        movq mm1, [esp + i1120_fizH]
        movq mm2, mm0
        punpckldq mm0, mm3                      ;# mm0(l)=fxH1, mm0(h)=fyH1
        punpckhdq mm2, mm3                      ;# mm2(l)=fxH2, mm2(h)=fyH2
        movq mm3, mm1
        pswapd mm3,mm3
        ;# mm1 is fzH1
        ;# mm3 is fzH2

        movq mm6, [edi + ecx*4 + 12]            ;# increment iH1 force
        movd mm7, [edi + ecx*4 + 20]
        pfadd mm6, mm0
        pfadd mm7, mm1
        movq [edi + ecx*4 + 12], mm6
        movd [edi + ecx*4 + 20], mm7

        movq mm6, [edi + ecx*4 + 24]            ;# increment iH2 force
        movd mm7, [edi + ecx*4 + 32]
        pfadd mm6, mm2
        pfadd mm7, mm3
        movq [edi + ecx*4 + 24], mm6
        movd [edi + ecx*4 + 32], mm7

        mov ebx, [ebp + i1120_fshift]           ;# increment fshift force
        mov edx, [esp + i1120_is3]

        movq mm6, [ebx + edx*4]
        movd mm7, [ebx + edx*4 + 8]
        pfadd mm6, [esp + i1120_fixO]
        pfadd mm7, [esp + i1120_fizO]
        pfadd mm6, mm0
        pfadd mm7, mm1
        pfadd mm6, mm2
        pfadd mm7, mm3
        movq [ebx + edx*4], mm6
        movd [ebx + edx*4 + 8], mm7

        mov edx, [ebp + i1120_gid]              ;# get group index for this i particle
        mov edx, [edx]
        add dword ptr [ebp + i1120_gid], 4      ;# advance pointer

        movq mm7, [esp + i1120_vctot]
        pfacc mm7,mm7                           ;# get and sum the two parts of total potential

        mov eax, [ebp + i1120_Vc]
        movd mm6, [eax + edx*4]
        pfadd mm6, mm7
        movd [eax + edx*4], mm6                 ;# increment vc[gid]

        movq mm7, [esp + i1120_vnbtot]
        pfacc mm7,mm7                           ;# same for Vnb

        mov eax, [ebp + i1120_Vnb]
        movd mm6, [eax + edx*4]
        pfadd mm6, mm7
        movd [eax + edx*4], mm6                 ;# increment vnb[gid]
        ;# finish if last
        dec dword ptr [ebp + i1120_nri]
        jz .i1120_end
        ;# not last, iterate once more!
        jmp .i1120_outer
.i1120_end:
        ;# epilogue: leave MMX state clean, release locals, restore registers
        femms
        add esp, 196
        pop edi
        pop esi
        pop edx
        pop ecx
        pop ebx
        pop eax
        leave
        ret                                     ;# return to caller; without it control falls into inl1130_3dnow
6404 .globl inl1130_3dnow
6405 .globl _inl1130_3dnow
6406 inl1130_3dnow:
6407 _inl1130_3dnow:
6408 .equiv i1130_nri, 8
6409 .equiv i1130_iinr, 12
6410 .equiv i1130_jindex, 16
6411 .equiv i1130_jjnr, 20
6412 .equiv i1130_shift, 24
6413 .equiv i1130_shiftvec, 28
6414 .equiv i1130_fshift, 32
6415 .equiv i1130_gid, 36
6416 .equiv i1130_pos, 40
6417 .equiv i1130_faction, 44
6418 .equiv i1130_charge, 48
6419 .equiv i1130_facel, 52
6420 .equiv i1130_Vc, 56
6421 .equiv i1130_type, 60
6422 .equiv i1130_ntype, 64
6423 .equiv i1130_nbfp, 68
6424 .equiv i1130_Vnb, 72
6425 ;# stack offsets for local variables
6426 .equiv i1130_is3, 0
6427 .equiv i1130_ii3, 4
6428 .equiv i1130_ixO, 8
6429 .equiv i1130_iyO, 12
6430 .equiv i1130_izO, 16
6431 .equiv i1130_ixH, 20
6432 .equiv i1130_iyH, 28
6433 .equiv i1130_izH, 36
6434 .equiv i1130_qqOO, 44
6435 .equiv i1130_qqOH, 52
6436 .equiv i1130_qqHH, 60
6437 .equiv i1130_c6, 68
6438 .equiv i1130_c12, 76
6439 .equiv i1130_six, 84
6440 .equiv i1130_twelve, 92
6441 .equiv i1130_vctot, 100
6442 .equiv i1130_vnbtot, 108
6443 .equiv i1130_innerjjnr, 116
6444 .equiv i1130_innerk, 120
6445 .equiv i1130_fixO, 124
6446 .equiv i1130_fiyO, 128
6447 .equiv i1130_fizO, 132
6448 .equiv i1130_fixH, 136
6449 .equiv i1130_fiyH, 144
6450 .equiv i1130_fizH, 152
6451 .equiv i1130_dxO, 160
6452 .equiv i1130_dyO, 164
6453 .equiv i1130_dzO, 168
6454 .equiv i1130_dxH, 172
6455 .equiv i1130_dyH, 180
6456 .equiv i1130_dzH, 188
6457 push ebp
6458 mov ebp,esp
6459 push eax
6460 push ebx
6461 push ecx
6462 push edx
6463 push esi
6464 push edi
6465 sub esp, 196 ;# local stack space
6466 femms
6467 ;# assume we have at least one i particle - start directly
6469 mov ecx, [ebp + i1130_iinr] ;# ecx = pointer into iinr[]
6470 mov ebx, [ecx] ;# ebx=ii
6472 mov edx, [ebp + i1130_charge]
6473 movd mm1, [ebp + i1130_facel] ;# mm1=facel
6474 movd mm2, [edx + ebx*4] ;# mm2=charge[ii0] (O)
6475 movd mm3, [edx + ebx*4 + 4] ;# mm2=charge[ii0+1] (H)
6476 movq mm4, mm2
6477 pfmul mm4, mm1
6478 movq mm6, mm3
6479 pfmul mm6, mm1
6480 movq mm5, mm4
6481 pfmul mm4, mm2 ;# mm4=qqOO*facel
6482 pfmul mm5, mm3 ;# mm5=qqOH*facel
6483 pfmul mm6, mm3 ;# mm6=qqHH*facel
6484 punpckldq mm5,mm5 ;# spread to both halves
6485 punpckldq mm6,mm6 ;# spread to both halves
6486 movq [esp + i1130_qqOO], mm4
6487 movq [esp + i1130_qqOH], mm5
6488 movq [esp + i1130_qqHH], mm6
6489 mov edx, [ebp + i1130_type]
6490 mov ecx, [edx + ebx*4]
6491 shl ecx, 1
6492 mov edx, ecx
6493 imul ecx, [ebp + i1130_ntype]
6494 add edx, ecx
6495 mov eax, [ebp + i1130_nbfp]
6496 movd mm0, [eax + edx*4]
6497 movd mm1, [eax + edx*4 + 4]
6498 movq [esp + i1130_c6], mm0
6499 movq [esp + i1130_c12], mm1
6500 movq mm2, [mm_six]
6501 movq mm3, [mm_twelve]
6502 movq [esp + i1130_six], mm2
6503 movq [esp + i1130_twelve], mm3
6504 .i1130_outer:
6505 mov eax, [ebp + i1130_shift] ;# eax = pointer into shift[]
6506 mov ebx, [eax] ;# ebx=shift[n]
6507 add dword ptr [ebp + i1130_shift], 4 ;# advance pointer one step
6509 lea ebx, [ebx + ebx*2] ;# ebx=3*is
6510 mov [esp + i1130_is3],ebx ;# store is3
6512 mov eax, [ebp + i1130_shiftvec] ;# eax = base of shiftvec[]
6514 movq mm5, [eax + ebx*4] ;# move shX/shY to mm5 and shZ to mm6.
6515 movd mm6, [eax + ebx*4 + 8]
6516 movq mm0, mm5
6517 movq mm1, mm5
6518 movq mm2, mm6
6519 punpckldq mm0,mm0 ;# also expand shX,Y,Z in mm0--mm2.
6520 punpckhdq mm1,mm1
6521 punpckldq mm2,mm2
6523 mov ecx, [ebp + i1130_iinr] ;# ecx = pointer into iinr[]
6524 add dword ptr [ebp + i1130_iinr], 4 ;# advance pointer
6525 mov ebx, [ecx] ;# ebx=ii
6527 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
6528 mov eax, [ebp + i1130_pos] ;# eax = base of pos[]
6530 pfadd mm5, [eax + ebx*4] ;# ix = shX + posX (and iy too)
6531 movd mm7, [eax + ebx*4 + 8] ;# cant use direct memory add for 4 bytes (iz)
6532 mov [esp + i1130_ii3], ebx ;# (use mm7 as temp. storage for iz.)
6533 pfadd mm6, mm7
6534 movq [esp + i1130_ixO], mm5
6535 movq [esp + i1130_izO], mm6
6537 movd mm3, [eax + ebx*4 + 12]
6538 movd mm4, [eax + ebx*4 + 16]
6539 movd mm5, [eax + ebx*4 + 20]
6540 punpckldq mm3, [eax + ebx*4 + 24]
6541 punpckldq mm4, [eax + ebx*4 + 28]
6542 punpckldq mm5, [eax + ebx*4 + 32] ;# coords of H1 in low mm3-mm5, H2 in high
6544 pfadd mm0, mm3
6545 pfadd mm1, mm4
6546 pfadd mm2, mm5
6547 movq [esp + i1130_ixH], mm0
6548 movq [esp + i1130_iyH], mm1
6549 movq [esp + i1130_izH], mm2
6551 ;# clear vctot and i forces
6552 pxor mm7,mm7
6553 movq [esp + i1130_vctot], mm7
6554 movq [esp + i1130_vnbtot], mm7
6555 movq [esp + i1130_fixO], mm7
6556 movq [esp + i1130_fizO], mm7
6557 movq [esp + i1130_fixH], mm7
6558 movq [esp + i1130_fiyH], mm7
6559 movq [esp + i1130_fizH], mm7
6561 mov eax, [ebp + i1130_jindex]
6562 mov ecx, [eax] ;# jindex[n]
6563 mov edx, [eax + 4] ;# jindex[n+1]
6564 add dword ptr [ebp + i1130_jindex], 4
6565 sub edx, ecx ;# number of innerloop atoms
6566 mov [esp + i1130_innerk], edx ;# number of innerloop atoms
6568 mov esi, [ebp + i1130_pos]
6569 mov edi, [ebp + i1130_faction]
6570 mov eax, [ebp + i1130_jjnr]
6571 shl ecx, 2
6572 add eax, ecx
6573 mov [esp + i1130_innerjjnr], eax ;# pointer to jjnr[nj0]
6574 .i1130_inner_loop:
6575 ;# a single j particle iteration here - compare with the unrolled code for comments.
6576 mov eax, [esp + i1130_innerjjnr]
6577 mov eax, [eax] ;# eax=jnr offset
6578 add dword ptr [esp + i1130_innerjjnr], 4 ;# advance pointer
6580 movd mm6, [esp + i1130_qqOO]
6581 movq mm7, [esp + i1130_qqOH]
6583 lea eax, [eax + eax*2]
6584 movq mm0, [esi + eax*4]
6585 movd mm1, [esi + eax*4 + 8]
6586 ;# copy & expand to mm2-mm4 for the H interactions
6587 movq mm2, mm0
6588 movq mm3, mm0
6589 movq mm4, mm1
6590 punpckldq mm2,mm2
6591 punpckhdq mm3,mm3
6592 punpckldq mm4,mm4
6594 pfsubr mm0, [esp + i1130_ixO]
6595 pfsubr mm1, [esp + i1130_izO]
6597 movq [esp + i1130_dxO], mm0
6598 pfmul mm0,mm0
6599 movd [esp + i1130_dzO], mm1
6600 pfmul mm1,mm1
6601 pfacc mm0, mm0
6602 pfadd mm0, mm1 ;# mm0=rsqO
6604 punpckldq mm2, mm2
6605 punpckldq mm3, mm3
6606 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
6607 pfsubr mm2, [esp + i1130_ixH]
6608 pfsubr mm3, [esp + i1130_iyH]
6609 pfsubr mm4, [esp + i1130_izH] ;# mm2-mm4 is dxH-dzH
6611 movq [esp + i1130_dxH], mm2
6612 movq [esp + i1130_dyH], mm3
6613 movq [esp + i1130_dzH], mm4
6614 pfmul mm2,mm2
6615 pfmul mm3,mm3
6616 pfmul mm4,mm4
6618 pfadd mm3,mm2
6619 pfadd mm3,mm4 ;# mm3=rsqH
6621 pfrsqrt mm1,mm0
6623 movq mm2,mm1
6624 pfmul mm1,mm1
6625 pfrsqit1 mm1,mm0
6626 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
6627 movq mm4, mm1
6628 pfmul mm4, mm4 ;# mm4=invsq
6630 movq mm2, mm4
6631 pfmul mm2, mm4
6632 pfmul mm2, mm4
6633 movq mm0, mm2
6634 pfmul mm0,mm0
6635 pfmul mm2, [esp + i1130_c6]
6636 pfmul mm0, [esp + i1130_c12]
6637 movq mm5, mm0
6638 pfsub mm5, mm2 ;# vnb
6640 pfmul mm2, [esp + i1130_six]
6641 pfmul mm0, [esp + i1130_twelve]
6643 pfsub mm0, mm2
6645 ;# calculate potential and scalar force
6646 pfmul mm6, mm1 ;# mm6=vcoul
6647 pfadd mm0, mm6
6648 pfmul mm4, mm0 ;# mm4=fscalar
6650 ;# update nb potential
6651 pfadd mm5, [esp + i1130_vnbtot]
6652 movq [esp + i1130_vnbtot], mm5
6654 pfrsqrt mm5, mm3
6655 pswapd mm3,mm3
6656 pfrsqrt mm2, mm3
6657 pswapd mm3,mm3
6658 punpckldq mm5,mm2 ;# seeds are in mm5 now, and rsq in mm3
6660 movq mm2, mm5
6661 pfmul mm5,mm5
6662 pfrsqit1 mm5,mm3
6663 pfrcpit2 mm5,mm2 ;# mm5=invsqrt
6664 movq mm3,mm5
6665 pfmul mm3,mm3 ;# mm3=invsq
6666 pfmul mm7, mm5 ;# mm7=vcoul
6667 pfmul mm3, mm7 ;# mm3=fscal for the two H's.
6669 ;# update vctot
6670 pfadd mm7, mm6
6671 pfadd mm7, [esp + i1130_vctot]
6672 movq [esp + i1130_vctot], mm7
6674 ;# spread oxygen fscalar to both positions
6675 punpckldq mm4,mm4
6676 ;# calc vectorial force for O
6677 movq mm0, [esp + i1130_dxO]
6678 movd mm1, [esp + i1130_dzO]
6679 pfmul mm0, mm4
6680 pfmul mm1, mm4
6682 ;# calc vectorial force for H's
6683 movq mm5, [esp + i1130_dxH]
6684 movq mm6, [esp + i1130_dyH]
6685 movq mm7, [esp + i1130_dzH]
6686 pfmul mm5, mm3
6687 pfmul mm6, mm3
6688 pfmul mm7, mm3
6690 ;# update iO particle force
6691 movq mm2, [esp + i1130_fixO]
6692 movd mm3, [esp + i1130_fizO]
6693 pfadd mm2, mm0
6694 pfadd mm3, mm1
6695 movq [esp + i1130_fixO], mm2
6696 movd [esp + i1130_fizO], mm3
6698 ;# update iH forces
6699 movq mm2, [esp + i1130_fixH]
6700 movq mm3, [esp + i1130_fiyH]
6701 movq mm4, [esp + i1130_fizH]
6702 pfadd mm2, mm5
6703 pfadd mm3, mm6
6704 pfadd mm4, mm7
6705 movq [esp + i1130_fixH], mm2
6706 movq [esp + i1130_fiyH], mm3
6707 movq [esp + i1130_fizH], mm4
6709 ;# pack j forces from H in the same form as the oxygen force.
6710 pfacc mm5, mm6 ;# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
6711 pfacc mm7, mm7 ;# mm7(l)=fjz(H1+ h2)
6713 pfadd mm0, mm5 ;# add up total force on j particle.
6714 pfadd mm1, mm7
6716 ;# update j particle force
6717 movq mm2, [edi + eax*4]
6718 movd mm3, [edi + eax*4 + 8]
6719 pfsub mm2, mm0
6720 pfsub mm3, mm1
6721 movq [edi + eax*4], mm2
6722 movd [edi + eax*4 +8], mm3
6724 ;# interactions with j H1
6725 movq mm0, [esi + eax*4 + 12]
6726 movd mm1, [esi + eax*4 + 20]
6727 ;# copy & expand to mm2-mm4 for the H interactions
6728 movq mm2, mm0
6729 movq mm3, mm0
6730 movq mm4, mm1
6731 punpckldq mm2,mm2
6732 punpckhdq mm3,mm3
6733 punpckldq mm4,mm4
6735 movd mm6, [esp + i1130_qqOH]
6736 movq mm7, [esp + i1130_qqHH]
6738 pfsubr mm0, [esp + i1130_ixO]
6739 pfsubr mm1, [esp + i1130_izO]
6741 movq [esp + i1130_dxO], mm0
6742 pfmul mm0,mm0
6743 movd [esp + i1130_dzO], mm1
6744 pfmul mm1,mm1
6745 pfacc mm0, mm1
6746 pfadd mm0, mm1 ;# mm0=rsqO
6748 punpckldq mm2, mm2
6749 punpckldq mm3, mm3
6750 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
6751 pfsubr mm2, [esp + i1130_ixH]
6752 pfsubr mm3, [esp + i1130_iyH]
6753 pfsubr mm4, [esp + i1130_izH] ;# mm2-mm4 is dxH-dzH
6755 movq [esp + i1130_dxH], mm2
6756 movq [esp + i1130_dyH], mm3
6757 movq [esp + i1130_dzH], mm4
6758 pfmul mm2,mm2
6759 pfmul mm3,mm3
6760 pfmul mm4,mm4
6762 pfadd mm3,mm2
6763 pfadd mm3,mm4 ;# mm3=rsqH
6765 pfrsqrt mm1,mm0
6767 movq mm2,mm1
6768 pfmul mm1,mm1
6769 pfrsqit1 mm1,mm0
6770 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
6771 movq mm4, mm1
6772 pfmul mm4, mm4 ;# mm4=invsq
6773 ;# calculate potential and scalar force
6774 pfmul mm6, mm1 ;# mm6=vcoul
6775 pfmul mm4, mm6 ;# mm4=fscalar
6777 pfrsqrt mm5, mm3
6778 pswapd mm3,mm3
6779 pfrsqrt mm2, mm3
6780 pswapd mm3,mm3
6781 punpckldq mm5,mm2 ;# seeds are in mm5 now, and rsq in mm3
6783 movq mm2, mm5
6784 pfmul mm5,mm5
6785 pfrsqit1 mm5,mm3
6786 pfrcpit2 mm5,mm2 ;# mm5=invsqrt
6787 movq mm3,mm5
6788 pfmul mm3,mm3 ;# mm3=invsq
6789 pfmul mm7, mm5 ;# mm7=vcoul
6790 pfmul mm3, mm7 ;# mm3=fscal for the two H's.
6792 ;# update vctot
6793 pfadd mm7, mm6
6794 pfadd mm7, [esp + i1130_vctot]
6795 movq [esp + i1130_vctot], mm7
6797 ;# spread oxygen fscalar to both positions
6798 punpckldq mm4,mm4
6799 ;# calc vectorial force for O
6800 movq mm0, [esp + i1130_dxO]
6801 movd mm1, [esp + i1130_dzO]
6802 pfmul mm0, mm4
6803 pfmul mm1, mm4
6805 ;# calc vectorial force for H's
6806 movq mm5, [esp + i1130_dxH]
6807 movq mm6, [esp + i1130_dyH]
6808 movq mm7, [esp + i1130_dzH]
6809 pfmul mm5, mm3
6810 pfmul mm6, mm3
6811 pfmul mm7, mm3
6813 ;# update iO particle force
6814 movq mm2, [esp + i1130_fixO]
6815 movd mm3, [esp + i1130_fizO]
6816 pfadd mm2, mm0
6817 pfadd mm3, mm1
6818 movq [esp + i1130_fixO], mm2
6819 movd [esp + i1130_fizO], mm3
6821 ;# update iH forces
6822 movq mm2, [esp + i1130_fixH]
6823 movq mm3, [esp + i1130_fiyH]
6824 movq mm4, [esp + i1130_fizH]
6825 pfadd mm2, mm5
6826 pfadd mm3, mm6
6827 pfadd mm4, mm7
6828 movq [esp + i1130_fixH], mm2
6829 movq [esp + i1130_fiyH], mm3
6830 movq [esp + i1130_fizH], mm4
6832 ;# pack j forces from H in the same form as the oxygen force.
6833 pfacc mm5, mm6 ;# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
6834 pfacc mm7, mm7 ;# mm7(l)=fjz(H1+ h2)
6836 pfadd mm0, mm5 ;# add up total force on j particle.
6837 pfadd mm1, mm7
6839 ;# update j particle force
6840 movq mm2, [edi + eax*4 + 12]
6841 movd mm3, [edi + eax*4 + 20]
6842 pfsub mm2, mm0
6843 pfsub mm3, mm1
6844 movq [edi + eax*4 + 12], mm2
6845 movd [edi + eax*4 + 20], mm3
6847 ;# interactions with j H2
6848 movq mm0, [esi + eax*4 + 24]
6849 movd mm1, [esi + eax*4 + 32]
6850 ;# copy & expand to mm2-mm4 for the H interactions
6851 movq mm2, mm0
6852 movq mm3, mm0
6853 movq mm4, mm1
6854 punpckldq mm2,mm2
6855 punpckhdq mm3,mm3
6856 punpckldq mm4,mm4
6858 movd mm6, [esp + i1130_qqOH]
6859 movq mm7, [esp + i1130_qqHH]
6861 pfsubr mm0, [esp + i1130_ixO]
6862 pfsubr mm1, [esp + i1130_izO]
6864 movq [esp + i1130_dxO], mm0
6865 pfmul mm0,mm0
6866 movd [esp + i1130_dzO], mm1
6867 pfmul mm1,mm1
6868 pfacc mm0, mm1
6869 pfadd mm0, mm1 ;# mm0=rsqO
6871 punpckldq mm2, mm2
6872 punpckldq mm3, mm3
6873 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
6874 pfsubr mm2, [esp + i1130_ixH]
6875 pfsubr mm3, [esp + i1130_iyH]
6876 pfsubr mm4, [esp + i1130_izH] ;# mm2-mm4 is dxH-dzH
6878 movq [esp + i1130_dxH], mm2
6879 movq [esp + i1130_dyH], mm3
6880 movq [esp + i1130_dzH], mm4
6881 pfmul mm2,mm2
6882 pfmul mm3,mm3
6883 pfmul mm4,mm4
6885 pfadd mm3,mm2
6886 pfadd mm3,mm4 ;# mm3=rsqH
6888 pfrsqrt mm1,mm0
6890 movq mm2,mm1
6891 pfmul mm1,mm1
6892 pfrsqit1 mm1,mm0
6893 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
6894 movq mm4, mm1
6895 pfmul mm4, mm4 ;# mm4=invsq
6896 ;# calculate potential and scalar force
6897 pfmul mm6, mm1 ;# mm6=vcoul
6898 pfmul mm4, mm6 ;# mm4=fscalar
6900 pfrsqrt mm5, mm3
6901 pswapd mm3,mm3
6902 pfrsqrt mm2, mm3
6903 pswapd mm3,mm3
6904 punpckldq mm5,mm2 ;# seeds are in mm5 now, and rsq in mm3.
6906 movq mm2, mm5
6907 pfmul mm5,mm5
6908 pfrsqit1 mm5,mm3
6909 pfrcpit2 mm5,mm2 ;# mm5=invsqrt
6910 movq mm3,mm5
6911 pfmul mm3,mm3 ;# mm3=invsq
6912 pfmul mm7, mm5 ;# mm7=vcoul
6913 pfmul mm3, mm7 ;# mm3=fscal for the two H's.
6915 ;# update vctot
6916 pfadd mm7, mm6
6917 pfadd mm7, [esp + i1130_vctot]
6918 movq [esp + i1130_vctot], mm7
6920 ;# spread oxygen fscalar to both positions
6921 punpckldq mm4,mm4
6922 ;# calc vectorial force for O
6923 movq mm0, [esp + i1130_dxO]
6924 movd mm1, [esp + i1130_dzO]
6925 pfmul mm0, mm4
6926 pfmul mm1, mm4
6928 ;# calc vectorial force for H's
6929 movq mm5, [esp + i1130_dxH]
6930 movq mm6, [esp + i1130_dyH]
6931 movq mm7, [esp + i1130_dzH]
6932 pfmul mm5, mm3
6933 pfmul mm6, mm3
6934 pfmul mm7, mm3
6936 ;# update iO particle force
6937 movq mm2, [esp + i1130_fixO]
6938 movd mm3, [esp + i1130_fizO]
6939 pfadd mm2, mm0
6940 pfadd mm3, mm1
6941 movq [esp + i1130_fixO], mm2
6942 movd [esp + i1130_fizO], mm3
6944 ;# update iH forces
6945 movq mm2, [esp + i1130_fixH]
6946 movq mm3, [esp + i1130_fiyH]
6947 movq mm4, [esp + i1130_fizH]
6948 pfadd mm2, mm5
6949 pfadd mm3, mm6
6950 pfadd mm4, mm7
6951 movq [esp + i1130_fixH], mm2
6952 movq [esp + i1130_fiyH], mm3
6953 movq [esp + i1130_fizH], mm4
6955 ;# pack j forces from H in the same form as the oxygen force.
6956 pfacc mm5, mm6 ;# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
6957 pfacc mm7, mm7 ;# mm7(l)=fjz(H1+ h2)
6959 pfadd mm0, mm5 ;# add up total force on j particle.
6960 pfadd mm1, mm7
6962 ;# update j particle force
6963 movq mm2, [edi + eax*4 + 24]
6964 movd mm3, [edi + eax*4 + 32]
6965 pfsub mm2, mm0
6966 pfsub mm3, mm1
6967 movq [edi + eax*4 + 24], mm2
6968 movd [edi + eax*4 + 32], mm3
6970 ;# done - one more?
6971 dec dword ptr [esp + i1130_innerk]
6972 jz .i1130_updateouterdata
6973 jmp .i1130_inner_loop
6974 .i1130_updateouterdata:
6975 mov ecx, [esp + i1130_ii3]
6977 movq mm6, [edi + ecx*4] ;# increment iO force
6978 movd mm7, [edi + ecx*4 + 8]
6979 pfadd mm6, [esp + i1130_fixO]
6980 pfadd mm7, [esp + i1130_fizO]
6981 movq [edi + ecx*4], mm6
6982 movd [edi + ecx*4 +8], mm7
6984 movq mm0, [esp + i1130_fixH]
6985 movq mm3, [esp + i1130_fiyH]
6986 movq mm1, [esp + i1130_fizH]
6987 movq mm2, mm0
6988 punpckldq mm0, mm3 ;# mm0(l)=fxH1, mm0(h)=fyH1
6989 punpckhdq mm2, mm3 ;# mm2(l)=fxH2, mm2(h)=fyH2
6990 movq mm3, mm1
6991 pswapd mm3,mm3
6992 ;# mm1 is fzH1
6993 ;# mm3 is fzH2
6995 movq mm6, [edi + ecx*4 + 12] ;# increment iH1 force
6996 movd mm7, [edi + ecx*4 + 20]
6997 pfadd mm6, mm0
6998 pfadd mm7, mm1
6999 movq [edi + ecx*4 + 12], mm6
7000 movd [edi + ecx*4 + 20], mm7
7002 movq mm6, [edi + ecx*4 + 24] ;# increment iH2 force
7003 movd mm7, [edi + ecx*4 + 32]
7004 pfadd mm6, mm2
7005 pfadd mm7, mm3
7006 movq [edi + ecx*4 + 24], mm6
7007 movd [edi + ecx*4 + 32], mm7
7010 mov ebx, [ebp + i1130_fshift] ;# increment fshift force
7011 mov edx, [esp + i1130_is3]
7013 movq mm6, [ebx + edx*4]
7014 movd mm7, [ebx + edx*4 + 8]
7015 pfadd mm6, [esp + i1130_fixO]
7016 pfadd mm7, [esp + i1130_fizO]
7017 pfadd mm6, mm0
7018 pfadd mm7, mm1
7019 pfadd mm6, mm2
7020 pfadd mm7, mm3
7021 movq [ebx + edx*4], mm6
7022 movd [ebx + edx*4 + 8], mm7
7024 mov edx, [ebp + i1130_gid] ;# get group index for this i particle
7025 mov edx, [edx]
7026 add dword ptr [ebp + i1130_gid], 4 ;# advance pointer
7028 movq mm7, [esp + i1130_vctot]
7029 pfacc mm7,mm7 ;# get and sum the two parts of total potential
7031 mov eax, [ebp + i1130_Vc]
7032 movd mm6, [eax + edx*4]
7033 pfadd mm6, mm7
7034 movd [eax + edx*4], mm6 ;# increment vc[gid]
7036 movq mm7, [esp + i1130_vnbtot]
7037 pfacc mm7,mm7 ;# get and sum the two parts of total potential
7039 mov eax, [ebp + i1130_Vnb]
7040 movd mm6, [eax + edx*4]
7041 pfadd mm6, mm7
7042 movd [eax + edx*4], mm6 ;# increment vnbtot[gid]
7043 ;# finish if last
7044 dec dword ptr [ebp + i1130_nri]
7045 jz .i1130_end
7046 ;# not last, iterate once more!
7047 jmp .i1130_outer
7048 .i1130_end:
7049 femms
7050 add esp, 196
7051 pop edi
7052 pop esi
7053 pop edx
7054 pop ecx
7055 pop ebx
7056 pop eax
7057 leave
;# inl3000_3dnow - Coulomb-only inner loop that evaluates each pair
;# interaction from the VFtab lookup table with 3DNow! packed-float math.
;# All arguments are read from the stack through ebp (offsets 8-64 below);
;# several of the pointer arguments are advanced in place while the outer
;# loop walks the i particles. Locals live in 108 bytes addressed from esp.
;# The j list of every i particle is handled two entries per iteration,
;# followed by a one-entry tail when the list length is odd.
7063 .globl inl3000_3dnow
7064 .globl _inl3000_3dnow
7065 inl3000_3dnow:
7066 _inl3000_3dnow:
7067 .equiv i3000_nri, 8 ;# number of outer (i particle) iterations, used by value
7068 .equiv i3000_iinr, 12 ;# pointer into iinr[], advanced 4 bytes per i particle
7069 .equiv i3000_jindex, 16 ;# pointer into jindex[]; [n] and [n+1] bound the j list
7070 .equiv i3000_jjnr, 20 ;# base of jjnr[] (j particle numbers)
7071 .equiv i3000_shift, 24 ;# pointer into shift[], advanced 4 bytes per i particle
7072 .equiv i3000_shiftvec, 28 ;# base of shiftvec[], indexed with 3*shift
7073 .equiv i3000_fshift, 32 ;# base of fshift[], indexed with 3*shift
7074 .equiv i3000_gid, 36 ;# pointer into gid[], advanced 4 bytes per i particle
7075 .equiv i3000_pos, 40 ;# base of pos[], 3 floats per particle
7076 .equiv i3000_faction, 44 ;# base of faction[], 3 floats per particle
7077 .equiv i3000_charge, 48 ;# base of charge[]
7078 .equiv i3000_facel, 52 ;# float by value, multiplied into charge[ii]
7079 .equiv i3000_Vc, 56 ;# base of Vc[], indexed with the group index
7080 .equiv i3000_tabscale, 60 ;# float by value; r*tabscale is the table position
7081 .equiv i3000_VFtab, 64 ;# base of the table, 4 floats are read per table point
7082 ;# stack offsets for local variables
7083 .equiv i3000_is3, 0
7084 .equiv i3000_ii3, 4
7085 .equiv i3000_ix, 8 ;# ix and iy are stored together with one 64-bit move
7086 .equiv i3000_iy, 12
7087 .equiv i3000_iz, 16
7088 .equiv i3000_iq, 20 ;# 8 bytes: facel*charge[ii] in both halves
7089 .equiv i3000_vctot, 28 ;# 8 bytes: two partial potential sums
7090 .equiv i3000_two, 36 ;# 8 bytes: copy of mm_two
7091 .equiv i3000_n1, 44 ;# 8 bytes: integer table indices for the two pairs
7092 .equiv i3000_tsc, 52 ;# 8 bytes: tabscale in both halves
7093 .equiv i3000_ntia, 60 ;# not referenced in this routine
7094 .equiv i3000_innerjjnr, 64
7095 .equiv i3000_innerk, 68
7096 .equiv i3000_fix, 72 ;# fix and fiy are accessed together with 64-bit moves
7097 .equiv i3000_fiy, 76
7098 .equiv i3000_fiz, 80
7099 .equiv i3000_dx1, 84 ;# dx1/dy1 and dx2/dy2 are likewise stored as 64-bit pairs
7100 .equiv i3000_dy1, 88
7101 .equiv i3000_dz1, 92
7102 .equiv i3000_dx2, 96
7103 .equiv i3000_dy2, 100
7104 .equiv i3000_dz2, 104
7105 push ebp
7106 mov ebp,esp
7107 push eax ;# save all integer registers used below
7108 push ebx
7109 push ecx
7110 push edx
7111 push esi
7112 push edi
7113 sub esp, 108 ;# local stack space
7114 femms ;# switch to MMX/3DNow! state
7115 ;# move data to local stack
7116 movq mm0, [mm_two] ;# mm_two is defined outside this routine (presumably packed 2.0 - confirm)
7117 movd mm3, [ebp + i3000_tabscale]
7118 movq [esp + i3000_two], mm0
7119 punpckldq mm3,mm3 ;# tabscale in both halves
7120 movq [esp + i3000_tsc], mm3
7121 ;# assume we have at least one i particle - start directly
7122 .i3000_outer:
7123 mov eax, [ebp + i3000_shift] ;# eax = pointer into shift[]
7124 mov ebx, [eax] ;# ebx=shift[n]
7125 add dword ptr [ebp + i3000_shift], 4 ;# advance pointer one step
7127 lea ebx, [ebx + ebx*2] ;# ebx=3*is
7128 mov [esp + i3000_is3],ebx ;# store is3
7130 mov eax, [ebp + i3000_shiftvec] ;# eax = base of shiftvec[]
7132 movq mm0, [eax + ebx*4] ;# move shX/shY to mm0 and shZ to mm1
7133 movd mm1, [eax + ebx*4 + 8]
7135 mov ecx, [ebp + i3000_iinr] ;# ecx = pointer into iinr[]
7136 add dword ptr [ebp + i3000_iinr], 4 ;# advance pointer
7137 mov ebx, [ecx] ;# ebx=ii
7139 mov edx, [ebp + i3000_charge]
7140 movd mm2, [edx + ebx*4] ;# mm2=charge[ii]
7141 pfmul mm2, [ebp + i3000_facel] ;# 64-bit operand: only the low product is kept (see next line)
7142 punpckldq mm2,mm2 ;# spread to both halves
7143 movq [esp + i3000_iq], mm2 ;# iq =facel*charge[ii]
7145 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
7146 mov eax, [ebp + i3000_pos] ;# eax = base of pos[]
7148 pfadd mm0, [eax + ebx*4] ;# ix = shX + posX (and iy too)
7149 movd mm3, [eax + ebx*4 + 8] ;# cant use direct memory add for 4 bytes (iz)
7150 mov [esp + i3000_ii3], ebx
7151 pfadd mm1, mm3
7152 movq [esp + i3000_ix], mm0
7153 movd [esp + i3000_iz], mm1
7155 ;# clear total potential and i forces
7156 pxor mm7,mm7
7157 movq [esp + i3000_vctot], mm7
7158 movq [esp + i3000_fix], mm7
7159 movd [esp + i3000_fiz], mm7
7161 mov eax, [ebp + i3000_jindex]
7162 mov ecx, [eax] ;# jindex[n]
7163 mov edx, [eax + 4] ;# jindex[n+1]
7164 add dword ptr [ebp + i3000_jindex], 4
7165 sub edx, ecx ;# number of innerloop atoms
7167 mov esi, [ebp + i3000_pos]
7168 mov edi, [ebp + i3000_faction]
7169 mov eax, [ebp + i3000_jjnr]
7170 shl ecx, 2 ;# byte offset of jjnr[nj0]
7171 add eax, ecx
7172 mov [esp + i3000_innerjjnr], eax ;# pointer to jjnr[nj0]
7173 sub edx, 2 ;# sets the flags tested by jge below (mov does not change them)
7174 mov [esp + i3000_innerk], edx ;# number of innerloop atoms minus 2
7175 jge .i3000_unroll_loop ;# at least two j particles: take the paired loop
7176 jmp .i3000_finish_inner
7177 .i3000_unroll_loop:
7178 ;# paired innerloop starts here
7179 mov ecx, [esp + i3000_innerjjnr] ;# pointer to jjnr[k]
7180 mov eax, [ecx]
7181 mov ebx, [ecx + 4] ;# eax/ebx=jnr
7182 add dword ptr [esp + i3000_innerjjnr], 8 ;# advance pointer (unrolled 2)
7183 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
7185 mov ecx, [ebp + i3000_charge] ;# base of charge[]
7186 movq mm5, [esp + i3000_iq]
7187 movd mm3, [ecx + eax*4] ;# charge[jnr1]
7188 punpckldq mm3, [ecx + ebx*4] ;# move charge 2 to high part of mm3
7189 pfmul mm3,mm5 ;# mm3 now has qq for both particles
7191 lea eax, [eax + eax*2] ;# replace jnr with j3
7192 lea ebx, [ebx + ebx*2]
7194 mov esi, [ebp + i3000_pos] ;# esi = base of pos[] (same value as loaded before the loop)
7196 movq mm0, [esp + i3000_ix]
7197 movd mm1, [esp + i3000_iz]
7198 movq mm4, [esi + eax*4] ;# fetch first j coordinates
7199 movd mm5, [esi + eax*4 + 8]
7200 pfsubr mm4,mm0 ;# dr = ir - jr
7201 pfsubr mm5,mm1
7202 movq [esp + i3000_dx1], mm4 ;# store dr
7203 movd [esp + i3000_dz1], mm5
7204 pfmul mm4,mm4 ;# square dx,dy,dz
7205 pfmul mm5,mm5
7206 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
7207 pfacc mm4, mm5 ;# first rsq in lower mm4
7209 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
7210 movd mm7, [esi + ebx*4 + 8]
7212 pfsubr mm6,mm0 ;# dr = ir - jr
7213 pfsubr mm7,mm1
7214 movq [esp + i3000_dx2], mm6 ;# store dr
7215 movd [esp + i3000_dz2], mm7
7216 pfmul mm6,mm6 ;# square dx,dy,dz
7217 pfmul mm7,mm7
7218 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
7219 pfacc mm6, mm7 ;# second rsq in lower mm6
7221 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
7222 pfrsqrt mm1, mm6
7225 punpckldq mm0,mm1 ;# low = seed for pair 1, high = seed for pair 2
7226 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs.
7227 movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision.
7228 pfmul mm0,mm0
7229 pfrsqit1 mm0,mm4
7230 pfrcpit2 mm0,mm2
7231 pfmul mm4, mm0 ;# r = rsq * invsqrt
7232 movq mm1, mm4
7233 ;# mm0 is invsqrt, and mm1 r.
7234 ;# do potential and fscal
7235 pfmul mm1, [esp + i3000_tsc] ;# mm1=rt
7236 pf2iw mm4,mm1 ;# integer part of rt for both pairs
7237 movq [esp + i3000_n1], mm4 ;# keep the integer indices for the table lookups
7238 pi2fd mm4,mm4
7239 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
7241 movq mm2,mm1
7242 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
7244 mov edx, [ebp + i3000_VFtab]
7245 mov ecx, [esp + i3000_n1] ;# table index of pair 1
7246 shl ecx, 2 ;# 4 floats per table point
7247 ;# coulomb table
7248 ;# load all the table values we need
7249 movd mm4, [edx + ecx*4] ;# first value, added to eps*Fp below to give VV
7250 movd mm5, [edx + ecx*4 + 4] ;# second value, the constant term of Fp
7251 movd mm6, [edx + ecx*4 + 8] ;# third value, multiplied by eps below
7252 movd mm7, [edx + ecx*4 + 12] ;# fourth value, multiplied by eps2 below
7253 mov ecx, [esp + i3000_n1 + 4] ;# table index of pair 2
7254 shl ecx, 2
7255 punpckldq mm4, [edx + ecx*4] ;# same four values for pair 2 go to the high halves
7256 punpckldq mm5, [edx + ecx*4 + 4]
7257 punpckldq mm6, [edx + ecx*4 + 8]
7258 punpckldq mm7, [edx + ecx*4 + 12]
7260 pfmul mm6, mm1 ;# mm6 = Geps
7261 pfmul mm7, mm2 ;# mm7 = Heps2
7263 pfadd mm5, mm6
7264 pfadd mm5, mm7 ;# mm5 = Fp
7266 pfmul mm7, [esp + i3000_two] ;# two*Heps2
7267 pfadd mm7, mm6
7268 pfadd mm7, mm5 ;# mm7=FF
7270 pfmul mm5, mm1 ;# mm5=eps*Fp
7271 pfadd mm5, mm4 ;# mm5= VV
7273 pfmul mm5, mm3 ;# vcoul=qq*VV
7274 pfmul mm3, mm7 ;# fijC=FF*qq
7276 ;# at this point mm5 contains vcoul and mm3 fijC.
7277 ;# increment vcoul - then we can get rid of mm5.
7278 ;# update vctot
7279 pfadd mm5, [esp + i3000_vctot] ;# add the earlier value
7280 movq [esp + i3000_vctot], mm5 ;# store the sum
7282 ;# change sign of mm3
7283 pxor mm1,mm1
7284 pfsub mm1, mm3 ;# mm1 = -fijC
7285 pfmul mm1, [esp + i3000_tsc]
7286 pfmul mm0, mm1 ;# mm0 is total fscal now
7288 prefetchw [esp + i3000_dx1] ;# prefetch the stack line at dx1 for writing (the i force slots sit just below it)
7290 ;# spread fscalar to both positions
7291 movq mm1,mm0
7292 punpckldq mm0,mm0 ;# mm0 = fscal of pair 1 in both halves
7293 punpckhdq mm1,mm1 ;# mm1 = fscal of pair 2 in both halves
7295 ;# calc vector force
7296 prefetchw [edi + eax*4] ;# prefetch the 1st faction to cache
7297 movq mm2, [esp + i3000_dx1] ;# fetch dr
7298 movd mm3, [esp + i3000_dz1]
7300 prefetchw [edi + ebx*4] ;# prefetch the 2nd faction to cache
7301 pfmul mm2, mm0 ;# mult by fs
7302 pfmul mm3, mm0
7304 movq mm4, [esp + i3000_dx2] ;# fetch dr
7305 movd mm5, [esp + i3000_dz2]
7306 pfmul mm4, mm1 ;# mult by fs
7307 pfmul mm5, mm1
7308 ;# update i forces
7310 movq mm0, [esp + i3000_fix]
7311 movd mm1, [esp + i3000_fiz]
7312 pfadd mm0, mm2
7313 pfadd mm1, mm3
7315 pfadd mm0, mm4
7316 pfadd mm1, mm5
7317 movq [esp + i3000_fix], mm0
7318 movd [esp + i3000_fiz], mm1
7319 ;# update j forces
7321 movq mm0, [edi + eax*4]
7322 movd mm1, [edi + eax*4 + 8]
7323 movq mm6, [edi + ebx*4]
7324 movd mm7, [edi + ebx*4 + 8]
7326 pfsub mm0, mm2 ;# j particles receive the opposite force
7327 pfsub mm1, mm3
7328 pfsub mm6, mm4
7329 pfsub mm7, mm5
7331 movq [edi + eax*4], mm0
7332 movd [edi + eax*4 +8], mm1
7333 movq [edi + ebx*4], mm6
7334 movd [edi + ebx*4 + 8], mm7
7336 ;# should we do one more iteration?
7337 sub dword ptr [esp + i3000_innerk], 2
7338 jl .i3000_finish_inner
7339 jmp .i3000_unroll_loop
7340 .i3000_finish_inner:
7341 and dword ptr [esp + i3000_innerk], 1 ;# low bit is set when one j particle is left over
7342 jnz .i3000_single_inner
7343 jmp .i3000_updateouterdata
7344 .i3000_single_inner:
7345 ;# a single j particle iteration here - compare with the unrolled code for comments.
7346 mov eax, [esp + i3000_innerjjnr]
7347 mov eax, [eax] ;# eax=jnr offset
7349 mov ecx, [ebp + i3000_charge]
7350 movd mm5, [esp + i3000_iq]
7351 movd mm3, [ecx + eax*4]
7352 pfmul mm3, mm5 ;# mm3=qq
7354 mov esi, [ebp + i3000_pos]
7355 lea eax, [eax + eax*2] ;# eax = 3*jnr
7357 movq mm0, [esp + i3000_ix]
7358 movd mm1, [esp + i3000_iz]
7359 movq mm4, [esi + eax*4]
7360 movd mm5, [esi + eax*4 + 8]
7361 pfsubr mm4, mm0
7362 pfsubr mm5, mm1
7363 movq [esp + i3000_dx1], mm4
7364 pfmul mm4,mm4
7365 movd [esp + i3000_dz1], mm5
7366 pfmul mm5,mm5
7367 pfacc mm4, mm5
7368 pfacc mm4, mm5 ;# low half of mm4=rsq
7370 pfrsqrt mm0,mm4
7371 movq mm2,mm0
7372 pfmul mm0,mm0
7373 pfrsqit1 mm0,mm4
7374 pfrcpit2 mm0,mm2 ;# mm0=invsqrt
7375 pfmul mm4, mm0
7376 movq mm1, mm4
7377 ;# mm0 is invsqrt, and mm1 r.
7379 ;# calculate potentials and scalar force
7380 pfmul mm1, [esp + i3000_tsc] ;# mm1=rt
7381 pf2iw mm4,mm1
7382 movd [esp + i3000_n1], mm4
7383 pi2fd mm4,mm4
7384 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
7386 movq mm2,mm1
7387 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
7389 ;# coulomb table
7390 mov edx, [ebp + i3000_VFtab]
7391 mov ecx, [esp + i3000_n1]
7392 shl ecx, 2 ;# 4 floats per table point
7393 ;# load all the table values we need
7394 movd mm4, [edx + ecx*4]
7395 movd mm5, [edx + ecx*4 + 4]
7396 movd mm6, [edx + ecx*4 + 8]
7397 movd mm7, [edx + ecx*4 + 12]
7399 pfmul mm6, mm1 ;# mm6 = Geps
7400 pfmul mm7, mm2 ;# mm7 = Heps2
7402 pfadd mm5, mm6
7403 pfadd mm5, mm7 ;# mm5 = Fp
7405 pfmul mm7, [esp + i3000_two] ;# two*Heps2
7406 pfadd mm7, mm6
7407 pfadd mm7, mm5 ;# mm7=FF
7409 pfmul mm5, mm1 ;# mm5=eps*Fp
7410 pfadd mm5, mm4 ;# mm5= VV
7412 pfmul mm5, mm3 ;# vcoul=qq*VV
7413 pfmul mm3, mm7 ;# fijC=FF*qq
7415 ;# at this point mm5 contains vcoul and mm3 fijC
7416 ;# increment vcoul - then we can get rid of mm5
7417 ;# update vctot
7418 pfadd mm5, [esp + i3000_vctot] ;# add the earlier value
7419 movq [esp + i3000_vctot], mm5 ;# store the sum
7421 ;# change sign of mm3
7422 pxor mm1,mm1
7423 pfsub mm1, mm3
7424 pfmul mm0, [esp + i3000_tsc] ;# here tabscale is applied to invsqrt rather than to -fijC
7425 pfmul mm0, mm1 ;# mm0 is total fscal now
7427 ;# spread fscalar to both positions
7428 punpckldq mm0,mm0
7429 ;# calc vectorial force
7430 prefetchw [edi + eax*4] ;# prefetch faction to cache
7431 movq mm2, [esp + i3000_dx1]
7432 movd mm3, [esp + i3000_dz1]
7435 pfmul mm2, mm0
7436 pfmul mm3, mm0
7438 ;# update i particle force
7439 movq mm0, [esp + i3000_fix]
7440 movd mm1, [esp + i3000_fiz]
7441 pfadd mm0, mm2
7442 pfadd mm1, mm3
7443 movq [esp + i3000_fix], mm0
7444 movd [esp + i3000_fiz], mm1
7445 ;# update j particle force
7446 movq mm0, [edi + eax*4]
7447 movd mm1, [edi + eax *4+ 8]
7448 pfsub mm0, mm2
7449 pfsub mm1, mm3
7450 movq [edi + eax*4], mm0
7451 movd [edi + eax*4 +8], mm1
7452 ;# done!
7453 .i3000_updateouterdata:
7454 mov ecx, [esp + i3000_ii3]
7456 movq mm6, [edi + ecx*4] ;# increment i force
7457 movd mm7, [edi + ecx*4 + 8]
7458 pfadd mm6, [esp + i3000_fix]
7459 pfadd mm7, [esp + i3000_fiz] ;# 64-bit operand, but only the low half is stored below
7460 movq [edi + ecx*4], mm6
7461 movd [edi + ecx*4 +8], mm7
7463 mov ebx, [ebp + i3000_fshift] ;# increment fshift force
7464 mov edx, [esp + i3000_is3]
7466 movq mm6, [ebx + edx*4]
7467 movd mm7, [ebx + edx*4 + 8]
7468 pfadd mm6, [esp + i3000_fix]
7469 pfadd mm7, [esp + i3000_fiz]
7470 movq [ebx + edx*4], mm6
7471 movd [ebx + edx*4 + 8], mm7
7473 mov edx, [ebp + i3000_gid] ;# get group index for this i particle
7474 mov edx, [edx]
7475 add dword ptr [ebp + i3000_gid], 4 ;# advance pointer
7477 movq mm7, [esp + i3000_vctot]
7478 pfacc mm7,mm7 ;# get and sum the two parts of total potential
7480 mov eax, [ebp + i3000_Vc]
7481 movd mm6, [eax + edx*4]
7482 pfadd mm6, mm7
7483 movd [eax + edx*4], mm6 ;# increment vc[gid]
7485 ;# finish if last
7486 mov ecx, [ebp + i3000_nri]
7487 dec ecx
7488 jecxz .i3000_end
7489 ;# not last, iterate once more!
7490 mov [ebp + i3000_nri], ecx ;# write back the remaining count
7491 jmp .i3000_outer
7492 .i3000_end:
7493 femms ;# leave MMX/3DNow! state before returning
7494 add esp, 108 ;# release the local stack space
7495 pop edi
7496 pop esi
7497 pop edx
7498 pop ecx
7499 pop ebx
7500 pop eax
7501 leave ;# NOTE(review): no ret is visible after this line; the listing numbers jump here, so confirm it was only lost in extraction
;# inl3010_3dnow - Coulomb inner loop that evaluates each pair interaction
;# from the VFtab lookup table with 3DNow! packed-float math, for i entries
;# that consist of several consecutive atoms. Every outer iteration reads a
;# count (nscoul) through the nsatoms pointer and runs the complete j list
;# once for each of that many atoms, starting at atom ii and counting upwards.
;# All arguments are read from the stack through ebp (offsets 8-68 below);
;# several pointer arguments are advanced in place. Locals live in 132 bytes
;# addressed from esp.
7507 .globl inl3010_3dnow
7508 .globl _inl3010_3dnow
7509 inl3010_3dnow:
7510 _inl3010_3dnow:
7511 .equiv i3010_nri, 8 ;# number of outer iterations, used by value
7512 .equiv i3010_iinr, 12 ;# pointer into iinr[], advanced 4 bytes per outer iteration
7513 .equiv i3010_jindex, 16 ;# pointer into jindex[]; [n] and [n+1] bound the j list
7514 .equiv i3010_jjnr, 20 ;# base of jjnr[] (j particle numbers)
7515 .equiv i3010_shift, 24 ;# pointer into shift[], advanced 4 bytes per outer iteration
7516 .equiv i3010_shiftvec, 28 ;# base of shiftvec[], indexed with 3*shift
7517 .equiv i3010_fshift, 32 ;# base of fshift[], indexed with 3*shift
7518 .equiv i3010_gid, 36 ;# pointer into gid[], advanced 4 bytes per outer iteration
7519 .equiv i3010_pos, 40 ;# base of pos[], 3 floats per particle
7520 .equiv i3010_faction, 44 ;# base of faction[], 3 floats per particle
7521 .equiv i3010_charge, 48 ;# base of charge[]
7522 .equiv i3010_facel, 52 ;# float by value, multiplied into charge[ii]
7523 .equiv i3010_Vc, 56 ;# base of Vc[], indexed with the group index
7524 .equiv i3010_tabscale, 60 ;# float by value; r*tabscale is the table position
7525 .equiv i3010_VFtab, 64 ;# base of the table, 4 floats are read per table point
7526 .equiv i3010_nsatoms, 68 ;# pointer to 12-byte records; only the third int of each is read
7527 ;# stack offsets for local variables
7528 .equiv i3010_is3, 0
7529 .equiv i3010_ii3, 4
7530 .equiv i3010_shX, 8 ;# shX and shY are stored together with one 64-bit move
7531 .equiv i3010_shY, 12
7532 .equiv i3010_shZ, 16
7533 .equiv i3010_ix, 20 ;# ix and iy are stored together with one 64-bit move
7534 .equiv i3010_iy, 24
7535 .equiv i3010_iz, 28
7536 .equiv i3010_iq, 32 ;# 8 bytes: facel*charge[ii] in both halves
7537 .equiv i3010_vctot, 40 ;# 8 bytes: two partial potential sums
7538 .equiv i3010_two, 48 ;# 8 bytes: copy of mm_two
7539 .equiv i3010_n1, 56 ;# 8 bytes: integer table indices for the two pairs
7540 .equiv i3010_tsc, 64 ;# 8 bytes: tabscale in both halves
7541 .equiv i3010_innerjjnr0, 72 ;# start of the j list, reused for every atom of the entry
7542 .equiv i3010_innerk0, 76 ;# length of the j list, reused for every atom of the entry
7543 .equiv i3010_innerjjnr, 80
7544 .equiv i3010_innerk, 84
7545 .equiv i3010_fix, 88 ;# fix and fiy are accessed together with 64-bit moves
7546 .equiv i3010_fiy, 92
7547 .equiv i3010_fiz, 96
7548 .equiv i3010_dx1, 100
7549 .equiv i3010_dy1, 104
7550 .equiv i3010_dz1, 108
7551 .equiv i3010_dx2, 112
7552 .equiv i3010_dy2, 116
7553 .equiv i3010_dz2, 120
7554 .equiv i3010_nscoul, 124 ;# atoms of the current entry still to be processed
7555 .equiv i3010_solnr, 128 ;# index of the next atom of the current entry
7556 push ebp
7557 mov ebp,esp
7558 push eax ;# save all integer registers used below
7559 push ebx
7560 push ecx
7561 push edx
7562 push esi
7563 push edi
7564 sub esp, 132 ;# local stack space
7565 femms ;# switch to MMX/3DNow! state
7567 add dword ptr [ebp + i3010_nsatoms], 8 ;# point at the third int of the first record
7568 movq mm2, [mm_two] ;# mm_two is defined outside this routine (presumably packed 2.0 - confirm)
7569 movq [esp + i3010_two], mm2
7570 movd mm3, [ebp + i3010_tabscale]
7571 punpckldq mm3,mm3 ;# tabscale in both halves
7572 movq [esp + i3010_tsc], mm3
7574 ;# assume we have at least one i particle - start directly
7575 .i3010_outer:
7576 mov eax, [ebp + i3010_shift] ;# eax = pointer into shift[]
7577 mov ebx, [eax] ;# ebx=shift[n]
7578 add dword ptr [ebp + i3010_shift], 4 ;# advance pointer one step
7580 lea ebx, [ebx + ebx*2] ;# ebx=3*is
7581 mov [esp + i3010_is3],ebx ;# store is3
7583 mov eax, [ebp + i3010_shiftvec] ;# eax = base of shiftvec[]
7585 movq mm0, [eax + ebx*4] ;# move shX/shY to mm0 and shZ to mm1
7586 movd mm1, [eax + ebx*4 + 8]
7587 movq [esp + i3010_shX], mm0 ;# keep the shift vector for every atom of this entry
7588 movd [esp + i3010_shZ], mm1
7590 mov ecx, [ebp + i3010_iinr] ;# ecx = pointer into iinr[]
7591 add dword ptr [ebp + i3010_iinr], 4 ;# advance pointer
7592 mov ebx, [ecx] ;# ebx=ii
7594 mov eax, [ebp + i3010_nsatoms]
7595 mov ecx, [eax] ;# ecx = atom count for this entry
7596 add dword ptr [ebp + i3010_nsatoms], 12 ;# step to the same field of the next record
7597 mov [esp + i3010_nscoul], ecx
7599 ;# clear potential
7600 pxor mm7,mm7
7601 movq [esp + i3010_vctot], mm7
7602 mov [esp + i3010_solnr], ebx ;# first atom of the entry is ii
7604 mov eax, [ebp + i3010_jindex]
7605 mov ecx, [eax] ;# jindex[n]
7606 mov edx, [eax + 4] ;# jindex[n+1]
7607 add dword ptr [ebp + i3010_jindex], 4
7608 sub edx, ecx ;# number of innerloop atoms
7609 mov eax, [ebp + i3010_jjnr]
7610 shl ecx, 2 ;# byte offset of jjnr[nj0]
7611 add eax, ecx
7612 mov [esp + i3010_innerjjnr0], eax ;# pointer to jjnr[nj0]
7614 mov [esp + i3010_innerk0], edx ;# number of innerloop atoms
7615 mov esi, [ebp + i3010_pos]
7616 mov edi, [ebp + i3010_faction]
7617 mov ecx, [esp + i3010_nscoul]
7618 cmp ecx, 0
7619 jnz .i3010_mno_coul
7620 jmp .i3010_last_mno ;# no atoms to process for this entry
7621 .i3010_mno_coul:
7622 mov ebx, [esp + i3010_solnr] ;# ebx = index of the atom handled in this pass
7623 inc dword ptr [esp + i3010_solnr] ;# the next pass takes the following atom
7624 mov edx, [ebp + i3010_charge]
7625 movd mm2, [edx + ebx*4] ;# mm2=charge[ii]
7626 pfmul mm2, [ebp + i3010_facel] ;# 64-bit operand: only the low product is kept (see next line)
7627 punpckldq mm2,mm2 ;# spread to both halves
7628 movq [esp + i3010_iq], mm2 ;# iq =facel*charge[ii]
7630 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
7631 mov eax, [ebp + i3010_pos] ;# eax = base of pos[]
7632 mov [esp + i3010_ii3], ebx
7634 movq mm0, [eax + ebx*4]
7635 movd mm1, [eax + ebx*4 + 8]
7636 pfadd mm0, [esp + i3010_shX] ;# ix,iy = pos + shift
7637 pfadd mm1, [esp + i3010_shZ] ;# 64-bit operand, but only the low half is stored below
7638 movq [esp + i3010_ix], mm0
7639 movd [esp + i3010_iz], mm1
7641 ;# clear forces
7642 pxor mm7,mm7
7643 movq [esp + i3010_fix], mm7
7644 movd [esp + i3010_fiz], mm7
7646 mov ecx, [esp + i3010_innerjjnr0] ;# restart the j list for this atom
7647 mov [esp + i3010_innerjjnr], ecx
7648 mov edx, [esp + i3010_innerk0]
7649 sub edx, 2 ;# sets the flags tested by jge below (mov does not change them)
7650 mov [esp + i3010_innerk], edx ;# number of innerloop atoms minus 2
7651 jge .i3010_unroll_coul_loop ;# at least two j particles: take the paired loop
7652 jmp .i3010_finish_coul_inner
7653 .i3010_unroll_coul_loop:
7654 ;# paired innerloop starts here
7655 mov ecx, [esp + i3010_innerjjnr] ;# pointer to jjnr[k]
7656 mov eax, [ecx]
7657 mov ebx, [ecx + 4] ;# eax/ebx=jnr
7658 add dword ptr [esp + i3010_innerjjnr], 8 ;# advance pointer (unrolled 2)
7659 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
7661 mov ecx, [ebp + i3010_charge] ;# base of charge[]
7662 movq mm5, [esp + i3010_iq]
7663 movd mm3, [ecx + eax*4] ;# charge[jnr1]
7664 punpckldq mm3, [ecx + ebx*4] ;# move charge 2 to high part of mm3
7665 pfmul mm3,mm5 ;# mm3 now has qq for both particles
7667 lea eax, [eax + eax*2] ;# replace jnr with j3
7668 lea ebx, [ebx + ebx*2]
7670 mov esi, [ebp + i3010_pos] ;# esi = base of pos[] (same value as loaded before the loop)
7672 movq mm0, [esp + i3010_ix]
7673 movd mm1, [esp + i3010_iz]
7674 movq mm4, [esi + eax*4] ;# fetch first j coordinates
7675 movd mm5, [esi + eax*4 + 8]
7676 pfsubr mm4,mm0 ;# dr = ir - jr
7677 pfsubr mm5,mm1
7678 movq [esp + i3010_dx1], mm4 ;# store dr
7679 movd [esp + i3010_dz1], mm5
7680 pfmul mm4,mm4 ;# square dx,dy,dz
7681 pfmul mm5,mm5
7682 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
7683 pfacc mm4, mm5 ;# first rsq in lower mm4
7685 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
7686 movd mm7, [esi + ebx*4 + 8]
7688 pfsubr mm6,mm0 ;# dr = ir - jr
7689 pfsubr mm7,mm1
7690 movq [esp + i3010_dx2], mm6 ;# store dr
7691 movd [esp + i3010_dz2], mm7
7692 pfmul mm6,mm6 ;# square dx,dy,dz
7693 pfmul mm7,mm7
7694 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
7695 pfacc mm6, mm7 ;# second rsq in lower mm6
7697 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
7698 pfrsqrt mm1, mm6
7701 punpckldq mm0,mm1 ;# low = seed for pair 1, high = seed for pair 2
7702 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs.
7703 movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision.
7704 pfmul mm0,mm0
7705 pfrsqit1 mm0,mm4
7706 pfrcpit2 mm0,mm2
7707 pfmul mm4, mm0 ;# r = rsq * invsqrt
7708 movq mm1, mm4
7709 ;# mm0 is invsqrt, and mm1 r.
7710 ;# do potential and fscal
7711 pfmul mm1, [esp + i3010_tsc] ;# mm1=rt
7712 pf2iw mm4,mm1 ;# integer part of rt for both pairs
7713 movq [esp + i3010_n1], mm4 ;# keep the integer indices for the table lookups
7714 pi2fd mm4,mm4
7715 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
7717 movq mm2,mm1
7718 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
7720 mov edx, [ebp + i3010_VFtab]
7721 mov ecx, [esp + i3010_n1] ;# table index of pair 1
7722 shl ecx, 2 ;# 4 floats per table point
7723 ;# coulomb table
7724 ;# load all the table values we need
7725 movd mm4, [edx + ecx*4] ;# first value, added to eps*Fp below to give VV
7726 movd mm5, [edx + ecx*4 + 4] ;# second value, the constant term of Fp
7727 movd mm6, [edx + ecx*4 + 8] ;# third value, multiplied by eps below
7728 movd mm7, [edx + ecx*4 + 12] ;# fourth value, multiplied by eps2 below
7729 mov ecx, [esp + i3010_n1 + 4] ;# table index of pair 2
7730 shl ecx, 2
7731 punpckldq mm4, [edx + ecx*4] ;# same four values for pair 2 go to the high halves
7732 punpckldq mm5, [edx + ecx*4 + 4]
7733 punpckldq mm6, [edx + ecx*4 + 8]
7734 punpckldq mm7, [edx + ecx*4 + 12]
7736 pfmul mm6, mm1 ;# mm6 = Geps
7737 pfmul mm7, mm2 ;# mm7 = Heps2
7739 pfadd mm5, mm6
7740 pfadd mm5, mm7 ;# mm5 = Fp
7742 pfmul mm7, [esp + i3010_two] ;# two*Heps2
7743 pfadd mm7, mm6
7744 pfadd mm7, mm5 ;# mm7=FF
7746 pfmul mm5, mm1 ;# mm5=eps*Fp
7747 pfadd mm5, mm4 ;# mm5= VV
7749 pfmul mm5, mm3 ;# vcoul=qq*VV
7750 pfmul mm3, mm7 ;# fijC=FF*qq
7752 ;# at this point mm5 contains vcoul and mm3 fijC
7753 ;# increment vcoul - then we can get rid of mm5
7754 ;# update vctot
7755 pfadd mm5, [esp + i3010_vctot] ;# add the earlier value
7756 movq [esp + i3010_vctot], mm5 ;# store the sum
7758 ;# change sign of mm3
7759 pxor mm1,mm1
7760 pfsub mm1, mm3 ;# mm1 = -fijC
7761 pfmul mm1, [esp + i3010_tsc]
7762 pfmul mm0, mm1 ;# mm0 is total fscal now
7764 prefetchw [esp + i3010_dx1] ;# prefetch the stack line at dx1 for writing (the i force slots sit just below it)
7766 ;# spread fscalar to both positions
7767 movq mm1,mm0
7768 punpckldq mm0,mm0 ;# mm0 = fscal of pair 1 in both halves
7769 punpckhdq mm1,mm1 ;# mm1 = fscal of pair 2 in both halves
7771 ;# calc vector force
7772 prefetchw [edi + eax*4] ;# prefetch the 1st faction to cache
7773 movq mm2, [esp + i3010_dx1] ;# fetch dr
7774 movd mm3, [esp + i3010_dz1]
7776 prefetchw [edi + ebx*4] ;# prefetch the 2nd faction to cache
7777 pfmul mm2, mm0 ;# mult by fs
7778 pfmul mm3, mm0
7780 movq mm4, [esp + i3010_dx2] ;# fetch dr
7781 movd mm5, [esp + i3010_dz2]
7782 pfmul mm4, mm1 ;# mult by fs
7783 pfmul mm5, mm1
7784 ;# update i forces
7786 movq mm0, [esp + i3010_fix]
7787 movd mm1, [esp + i3010_fiz]
7788 pfadd mm0, mm2
7789 pfadd mm1, mm3
7791 pfadd mm0, mm4
7792 pfadd mm1, mm5
7793 movq [esp + i3010_fix], mm0
7794 movd [esp + i3010_fiz], mm1
7795 ;# update j forces
7797 movq mm0, [edi + eax*4]
7798 movd mm1, [edi + eax*4 + 8]
7799 movq mm6, [edi + ebx*4]
7800 movd mm7, [edi + ebx*4 + 8]
7802 pfsub mm0, mm2 ;# j particles receive the opposite force
7803 pfsub mm1, mm3
7804 pfsub mm6, mm4
7805 pfsub mm7, mm5
7807 movq [edi + eax*4], mm0
7808 movd [edi + eax*4 +8], mm1
7809 movq [edi + ebx*4], mm6
7810 movd [edi + ebx*4 + 8], mm7
7812 ;# should we do one more iteration?
7813 sub dword ptr [esp + i3010_innerk], 2
7814 jl .i3010_finish_coul_inner
7815 jmp .i3010_unroll_coul_loop
7816 .i3010_finish_coul_inner:
7817 and dword ptr [esp + i3010_innerk], 1 ;# low bit is set when one j particle is left over
7818 jnz .i3010_single_coul_inner
7819 jmp .i3010_updateouterdata_coul
7820 .i3010_single_coul_inner:
7821 ;# a single j particle iteration here - compare with the unrolled code for comments.
7822 mov eax, [esp + i3010_innerjjnr]
7823 mov eax, [eax] ;# eax=jnr offset
7825 mov ecx, [ebp + i3010_charge]
7826 movd mm5, [esp + i3010_iq]
7827 movd mm3, [ecx + eax*4]
7828 pfmul mm3, mm5 ;# mm3=qq
7830 mov esi, [ebp + i3010_pos]
7831 lea eax, [eax + eax*2] ;# eax = 3*jnr
7833 movq mm0, [esp + i3010_ix]
7834 movd mm1, [esp + i3010_iz]
7835 movq mm4, [esi + eax*4]
7836 movd mm5, [esi + eax*4 + 8]
7837 pfsubr mm4, mm0
7838 pfsubr mm5, mm1
7839 movq [esp + i3010_dx1], mm4
7840 pfmul mm4,mm4
7841 movd [esp + i3010_dz1], mm5
7842 pfmul mm5,mm5
7843 pfacc mm4, mm5
7844 pfacc mm4, mm5 ;# low half of mm4=rsq
7846 pfrsqrt mm0,mm4
7847 movq mm2,mm0
7848 pfmul mm0,mm0
7849 pfrsqit1 mm0,mm4
7850 pfrcpit2 mm0,mm2 ;# mm0=invsqrt
7851 pfmul mm4, mm0
7852 movq mm1, mm4
7853 ;# mm0 is invsqrt, and mm1 r.
7855 ;# calculate potentials and scalar force
7856 pfmul mm1, [esp + i3010_tsc] ;# mm1=rt
7857 pf2iw mm4,mm1
7858 movd [esp + i3010_n1], mm4
7859 pi2fd mm4,mm4
7860 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
7862 movq mm2,mm1
7863 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
7865 ;# coulomb table
7866 mov edx, [ebp + i3010_VFtab]
7867 mov ecx, [esp + i3010_n1]
7868 shl ecx, 2 ;# 4 floats per table point
7869 ;# load all the table values we need
7870 movd mm4, [edx + ecx*4]
7871 movd mm5, [edx + ecx*4 + 4]
7872 movd mm6, [edx + ecx*4 + 8]
7873 movd mm7, [edx + ecx*4 + 12]
7875 pfmul mm6, mm1 ;# mm6 = Geps
7876 pfmul mm7, mm2 ;# mm7 = Heps2
7878 pfadd mm5, mm6
7879 pfadd mm5, mm7 ;# mm5 = Fp
7881 pfmul mm7, [esp + i3010_two] ;# two*Heps2
7882 pfadd mm7, mm6
7883 pfadd mm7, mm5 ;# mm7=FF
7885 pfmul mm5, mm1 ;# mm5=eps*Fp
7886 pfadd mm5, mm4 ;# mm5= VV
7888 pfmul mm5, mm3 ;# vcoul=qq*VV
7889 pfmul mm3, mm7 ;# fijC=FF*qq
7891 ;# at this point mm5 contains vcoul and mm3 fijC
7892 ;# increment vcoul - then we can get rid of mm5
7893 ;# update vctot
7894 pfadd mm5, [esp + i3010_vctot] ;# add the earlier value
7895 movq [esp + i3010_vctot], mm5 ;# store the sum
7897 ;# change sign of mm3
7898 pxor mm1,mm1
7899 pfsub mm1, mm3
7900 pfmul mm0, [esp + i3010_tsc] ;# here tabscale is applied to invsqrt rather than to -fijC
7901 pfmul mm0, mm1 ;# mm0 is total fscal now
7903 ;# spread fscalar to both positions
7904 punpckldq mm0,mm0
7905 ;# calc vectorial force
7906 prefetchw [edi + eax*4] ;# prefetch faction to cache
7907 movq mm2, [esp + i3010_dx1]
7908 movd mm3, [esp + i3010_dz1]
7911 pfmul mm2, mm0
7912 pfmul mm3, mm0
7914 ;# update i particle force
7915 movq mm0, [esp + i3010_fix]
7916 movd mm1, [esp + i3010_fiz]
7917 pfadd mm0, mm2
7918 pfadd mm1, mm3
7919 movq [esp + i3010_fix], mm0
7920 movd [esp + i3010_fiz], mm1
7921 ;# update j particle force
7922 movq mm0, [edi + eax*4]
7923 movd mm1, [edi + eax *4+ 8]
7924 pfsub mm0, mm2
7925 pfsub mm1, mm3
7926 movq [edi + eax*4], mm0
7927 movd [edi + eax*4 +8], mm1
7928 ;# done!
7929 .i3010_updateouterdata_coul:
7930 mov ecx, [esp + i3010_ii3]
7932 movq mm6, [edi + ecx*4] ;# increment i force
7933 movd mm7, [edi + ecx*4 + 8]
7934 pfadd mm6, [esp + i3010_fix]
7935 pfadd mm7, [esp + i3010_fiz] ;# 64-bit operand, but only the low half is stored below
7936 movq [edi + ecx*4], mm6
7937 movd [edi + ecx*4 +8], mm7
7939 mov ebx, [ebp + i3010_fshift] ;# increment fshift force
7940 mov edx, [esp + i3010_is3]
7942 movq mm6, [ebx + edx*4]
7943 movd mm7, [ebx + edx*4 + 8]
7944 pfadd mm6, [esp + i3010_fix]
7945 pfadd mm7, [esp + i3010_fiz]
7946 movq [ebx + edx*4], mm6
7947 movd [ebx + edx*4 + 8], mm7
7949 ;# loop back to mno
7950 dec dword ptr [esp + i3010_nscoul] ;# one atom of the entry is finished
7951 jz .i3010_last_mno
7952 jmp .i3010_mno_coul
7953 .i3010_last_mno:
7954 mov edx, [ebp + i3010_gid] ;# get group index for this i particle
7955 mov edx, [edx]
7956 add dword ptr [ebp + i3010_gid], 4 ;# advance pointer
7958 movq mm7, [esp + i3010_vctot]
7959 pfacc mm7,mm7 ;# get and sum the two parts of total potential
7961 mov eax, [ebp + i3010_Vc]
7962 movd mm6, [eax + edx*4]
7963 pfadd mm6, mm7
7964 movd [eax + edx*4], mm6 ;# increment vc[gid]
7965 ;# finish if last
7966 mov ecx, [ebp + i3010_nri]
7967 dec ecx
7968 jecxz .i3010_end
7969 ;# not last, iterate once more!
7970 mov [ebp + i3010_nri], ecx ;# write back the remaining count
7971 jmp .i3010_outer
7972 .i3010_end:
7973 femms ;# leave MMX/3DNow! state before returning
7974 add esp, 132 ;# release the local stack space
7975 pop edi
7976 pop esi
7977 pop edx
7978 pop ecx
7979 pop ebx
7980 pop eax
7981 leave ;# NOTE(review): no ret is visible after this line; the listing numbers jump here, so confirm it was only lost in extraction
7987 .globl inl3020_3dnow
7988 .globl _inl3020_3dnow
7989 inl3020_3dnow:
7990 _inl3020_3dnow:
7991 .equiv i3020_nri, 8
7992 .equiv i3020_iinr, 12
7993 .equiv i3020_jindex, 16
7994 .equiv i3020_jjnr, 20
7995 .equiv i3020_shift, 24
7996 .equiv i3020_shiftvec, 28
7997 .equiv i3020_fshift, 32
7998 .equiv i3020_gid, 36
7999 .equiv i3020_pos, 40
8000 .equiv i3020_faction, 44
8001 .equiv i3020_charge, 48
8002 .equiv i3020_facel, 52
8003 .equiv i3020_Vc, 56
8004 .equiv i3020_tabscale, 60
8005 .equiv i3020_VFtab, 64
8006 ;# stack offsets for local variables
8007 .equiv i3020_is3, 0
8008 .equiv i3020_ii3, 4
8009 .equiv i3020_ixO, 8
8010 .equiv i3020_iyO, 12
8011 .equiv i3020_izO, 16
8012 .equiv i3020_ixH, 20
8013 .equiv i3020_iyH, 28
8014 .equiv i3020_izH, 36
8015 .equiv i3020_iqO, 44
8016 .equiv i3020_iqH, 52
8017 .equiv i3020_qqO, 60
8018 .equiv i3020_qqH, 68
8019 .equiv i3020_vctot, 76
8020 .equiv i3020_two, 84
8021 .equiv i3020_n1, 92
8022 .equiv i3020_tsc, 100
8023 .equiv i3020_innerjjnr, 108
8024 .equiv i3020_innerk, 112
8025 .equiv i3020_fixO, 116
8026 .equiv i3020_fiyO, 120
8027 .equiv i3020_fizO, 124
8028 .equiv i3020_fixH, 128
8029 .equiv i3020_fiyH, 136
8030 .equiv i3020_fizH, 144
8031 .equiv i3020_dxO, 152
8032 .equiv i3020_dyO, 156
8033 .equiv i3020_dzO, 160
8034 .equiv i3020_dxH, 164
8035 .equiv i3020_dyH, 172
8036 .equiv i3020_dzH, 180
8037 .equiv i3020_tmprsqH, 188
8038 push ebp
8039 mov ebp,esp
8040 push eax
8041 push ebx
8042 push ecx
8043 push edx
8044 push esi
8045 push edi
8046 sub esp, 196 ;# local stack space
8047 femms
8049 mov ecx, [ebp + i3020_iinr] ;# ecx = pointer into iinr[]
8050 mov ebx, [ecx] ;# ebx=ii
8052 mov edx, [ebp + i3020_charge]
8053 movd mm1, [ebp + i3020_facel]
8054 movd mm2, [edx + ebx*4] ;# mm2=charge[ii0]
8055 pfmul mm2, mm1
8056 movq [esp + i3020_iqO], mm2 ;# iqO = facel*charge[ii]
8058 movd mm2, [edx + ebx*4 + 4] ;# mm2=charge[ii0+1]
8059 pfmul mm2, mm1
8060 punpckldq mm2,mm2 ;# spread to both halves
8061 movq [esp + i3020_iqH], mm2 ;# iqH = facel*charge[ii0+1]
8063 movq mm3, [mm_two]
8064 movd mm4, [ebp + i3020_tabscale]
8065 punpckldq mm4,mm4 ;# spread to both halves
8066 movq [esp + i3020_two], mm3
8067 movq [esp + i3020_tsc], mm4
8068 ;# assume we have at least one i particle - start directly
8069 .i3020_outer:
8070 mov eax, [ebp + i3020_shift] ;# eax = pointer into shift[]
8071 mov ebx, [eax] ;# ebx=shift[n]
8072 add dword ptr [ebp + i3020_shift], 4 ;# advance pointer one step
8074 lea ebx, [ebx + ebx*2] ;# ebx=3*is
8075 mov [esp + i3020_is3],ebx ;# store is3
8077 mov eax, [ebp + i3020_shiftvec] ;# eax = base of shiftvec[]
8079 movq mm5, [eax + ebx*4] ;# move shX/shY to mm5 and shZ to mm6.
8080 movd mm6, [eax + ebx*4 + 8]
8081 movq mm0, mm5
8082 movq mm1, mm5
8083 movq mm2, mm6
8084 punpckldq mm0,mm0 ;# also expand shX,Y,Z in mm0--mm2.
8085 punpckhdq mm1,mm1
8086 punpckldq mm2,mm2
8088 mov ecx, [ebp + i3020_iinr] ;# ecx = pointer into iinr[]
8089 add dword ptr [ebp + i3020_iinr], 4 ;# advance pointer
8090 mov ebx, [ecx] ;# ebx=ii
8092 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
8093 mov eax, [ebp + i3020_pos] ;# eax = base of pos[]
8095 pfadd mm5, [eax + ebx*4] ;# ix = shX + posX (and iy too)
8096 movd mm7, [eax + ebx*4 + 8] ;# cant use direct memory add for 4 bytes (iz)
8097 mov [esp + i3020_ii3], ebx ;# (use mm7 as temp. storage for iz.)
8098 pfadd mm6, mm7
8099 movq [esp + i3020_ixO], mm5
8100 movq [esp + i3020_izO], mm6
8102 movd mm3, [eax + ebx*4 + 12]
8103 movd mm4, [eax + ebx*4 + 16]
8104 movd mm5, [eax + ebx*4 + 20]
8105 punpckldq mm3, [eax + ebx*4 + 24]
8106 punpckldq mm4, [eax + ebx*4 + 28]
8107 punpckldq mm5, [eax + ebx*4 + 32] ;# coords of H1 in low mm3-mm5, H2 in high
8109 pfadd mm0, mm3
8110 pfadd mm1, mm4
8111 pfadd mm2, mm5
8112 movq [esp + i3020_ixH], mm0
8113 movq [esp + i3020_iyH], mm1
8114 movq [esp + i3020_izH], mm2
8116 ;# clear vctot and i forces
8117 pxor mm7,mm7
8118 movq [esp + i3020_vctot], mm7
8119 movq [esp + i3020_fixO], mm7
8120 movd [esp + i3020_fizO], mm7
8121 movq [esp + i3020_fixH], mm7
8122 movq [esp + i3020_fiyH], mm7
8123 movq [esp + i3020_fizH], mm7
8125 mov eax, [ebp + i3020_jindex]
8126 mov ecx, [eax] ;# jindex[n]
8127 mov edx, [eax + 4] ;# jindex[n+1]
8128 add dword ptr [ebp + i3020_jindex], 4
8129 sub edx, ecx ;# number of innerloop atoms
8130 mov [esp + i3020_innerk], edx
8132 mov esi, [ebp + i3020_pos]
8133 mov edi, [ebp + i3020_faction]
8134 mov eax, [ebp + i3020_jjnr]
8135 shl ecx, 2
8136 add eax, ecx
8137 mov [esp + i3020_innerjjnr], eax ;# pointer to jjnr[nj0]
8138 .i3020_inner_loop:
8139 ;# a single j particle iteration
8140 mov eax, [esp + i3020_innerjjnr]
8141 mov eax, [eax] ;# eax=jnr offset
8142 add dword ptr [esp + i3020_innerjjnr], 4 ;# advance pointer
8143 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
8145 mov ecx, [ebp + i3020_charge]
8146 movd mm7, [ecx + eax*4]
8147 punpckldq mm7,mm7
8148 movq mm6,mm7
8149 pfmul mm6, [esp + i3020_iqO]
8150 pfmul mm7, [esp + i3020_iqH] ;# mm6=qqO, mm7=qqH
8151 movd [esp + i3020_qqO], mm6
8152 movq [esp + i3020_qqH], mm7
8154 lea eax, [eax + eax*2]
8156 movq mm0, [esi + eax*4]
8157 movd mm1, [esi + eax*4 + 8]
8158 ;# copy & expand to mm2-mm4 for the H interactions
8159 movq mm2, mm0
8160 movq mm3, mm0
8161 movq mm4, mm1
8162 punpckldq mm2,mm2
8163 punpckhdq mm3,mm3
8164 punpckldq mm4,mm4
8166 pfsubr mm0, [esp + i3020_ixO]
8167 pfsubr mm1, [esp + i3020_izO]
8169 movq [esp + i3020_dxO], mm0
8170 pfmul mm0,mm0
8171 movd [esp + i3020_dzO], mm1
8172 pfmul mm1,mm1
8173 pfacc mm0, mm1
8174 pfadd mm0, mm1 ;# mm0=rsqO
8176 punpckldq mm2, mm2
8177 punpckldq mm3, mm3
8178 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
8179 pfsubr mm2, [esp + i3020_ixH]
8180 pfsubr mm3, [esp + i3020_iyH]
8181 pfsubr mm4, [esp + i3020_izH] ;# mm2-mm4 is dxH-dzH
8183 movq [esp + i3020_dxH], mm2
8184 movq [esp + i3020_dyH], mm3
8185 movq [esp + i3020_dzH], mm4
8186 pfmul mm2,mm2
8187 pfmul mm3,mm3
8188 pfmul mm4,mm4
8190 pfadd mm3,mm2
8191 pfadd mm3,mm4 ;# mm3=rsqH
8192 movq [esp + i3020_tmprsqH], mm3
8194 pfrsqrt mm1,mm0
8196 movq mm2,mm1
8197 pfmul mm1,mm1
8198 pfrsqit1 mm1,mm0
8199 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
8201 pfmul mm0, mm1 ;# mm0=r
8203 pfmul mm0, [esp + i3020_tsc]
8204 pf2iw mm4, mm0
8205 movd [esp + i3020_n1], mm4
8206 pi2fd mm4,mm4
8207 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
8208 movq mm2, mm0
8209 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
8211 ;# coulomb table
8212 mov edx, [ebp + i3020_VFtab]
8213 mov ecx, [esp + i3020_n1]
8214 shl ecx, 2
8215 ;# load all values we need
8216 movd mm4, [edx + ecx*4]
8217 movd mm5, [edx + ecx*4 + 4]
8218 movd mm6, [edx + ecx*4 + 8]
8219 movd mm7, [edx + ecx*4 + 12]
8221 pfmul mm6, mm0 ;# mm6 = Geps
8222 pfmul mm7, mm2 ;# mm7 = Heps2
8224 pfadd mm5, mm6
8225 pfadd mm5, mm7 ;# mm5 = Fp
8227 pfmul mm7, [esp + i3020_two] ;# two*Heps2
8228 pfadd mm7, mm6
8229 pfadd mm7, mm5 ;# mm7=FF
8231 pfmul mm5, mm0 ;# mm5=eps*Fp
8232 pfadd mm5, mm4 ;# mm5= VV
8234 pfmul mm5, [esp + i3020_qqO] ;# vcoul=qq*VV
8235 pfmul mm7, [esp + i3020_qqO] ;# fijC=qq*FF
8236 ;# update vctot directly, use mm3 for fscal sum.
8237 pfadd mm5, [esp + i3020_vctot]
8238 movq [esp + i3020_vctot], mm5
8239 movq mm3, mm7
8241 ;# change sign of fscal and multiply with rinv
8242 pxor mm0,mm0
8243 pfsubr mm3, mm0
8244 pfmul mm3, [esp + i3020_tsc]
8245 pfmul mm3, mm1 ;# mm3 is total fscal (for the oxygen) now
8247 ;# Ready with the oxygen - potential is updated, fscal is in mm3.
8248 ;# now do the two hydrogens.
8250 movq mm0, [esp + i3020_tmprsqH] ;# mm0=rsqH
8252 pfrsqrt mm1, mm0
8253 pswapd mm0,mm0
8254 pfrsqrt mm2, mm0
8255 pswapd mm0,mm0
8256 punpckldq mm1,mm2 ;# seeds are in mm1 now, and rsq in mm0.
8258 movq mm2, mm1
8259 pfmul mm1,mm1
8260 pfrsqit1 mm1,mm0
8261 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
8263 pfmul mm0,mm1 ;# mm0=r
8264 pfmul mm0, [esp + i3020_tsc]
8265 pf2iw mm4, mm0
8266 movq [esp + i3020_n1], mm4
8267 pi2fd mm4,mm4
8268 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
8269 movq mm2, mm0
8270 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
8272 ;# coulomb table
8273 mov edx, [ebp + i3020_VFtab]
8274 mov ecx, [esp + i3020_n1]
8275 shl ecx, 2
8276 ;# load all values we need
8277 movd mm4, [edx + ecx*4]
8278 movd mm5, [edx + ecx*4 + 4]
8279 movd mm6, [edx + ecx*4 + 8]
8280 movd mm7, [edx + ecx*4 + 12]
8281 mov ecx, [esp + i3020_n1 + 4]
8282 shl ecx, 2
8283 punpckldq mm4, [edx + ecx*4]
8284 punpckldq mm5, [edx + ecx*4 + 4]
8285 punpckldq mm6, [edx + ecx*4 + 8]
8286 punpckldq mm7, [edx + ecx*4 + 12]
8288 pfmul mm6, mm0 ;# mm6 = Geps
8289 pfmul mm7, mm2 ;# mm7 = Heps2
8291 pfadd mm5, mm6
8292 pfadd mm5, mm7 ;# mm5 = Fp
8294 pfmul mm7, [esp + i3020_two] ;# two*Heps2
8295 pfadd mm7, mm6
8296 pfadd mm7, mm5 ;# mm7=FF
8298 pfmul mm5, mm0 ;# mm5=eps*Fp
8299 pfadd mm5, mm4 ;# mm5= VV
8301 pfmul mm5, [esp + i3020_qqH] ;# vcoul=qq*VV
8302 pfmul mm7, [esp + i3020_qqH] ;# fijC=qq*FF
8304 ;# update vctot
8305 pfadd mm5, [esp + i3020_vctot]
8306 movq [esp + i3020_vctot], mm5
8308 ;# change sign of fijC and multiply by rinv
8309 pxor mm4,mm4
8310 pfsub mm4, mm7
8311 pfmul mm4, [esp + i3020_tsc]
8312 pfmul mm4, mm1 ;# mm4 is total fscal (for the hydrogens) now
8314 ;# spread oxygen fscalar to both positions
8315 punpckldq mm3,mm3
8316 ;# calc vectorial force for O
8317 prefetchw [edi + eax*4] ;# prefetch faction to cache
8318 movq mm0, [esp + i3020_dxO]
8319 movd mm1, [esp + i3020_dzO]
8320 pfmul mm0, mm3
8321 pfmul mm1, mm3
8323 ;# calc vectorial force for H's
8324 movq mm5, [esp + i3020_dxH]
8325 movq mm6, [esp + i3020_dyH]
8326 movq mm7, [esp + i3020_dzH]
8327 pfmul mm5, mm4
8328 pfmul mm6, mm4
8329 pfmul mm7, mm4
8331 ;# update iO particle force
8332 movq mm2, [esp + i3020_fixO]
8333 movd mm3, [esp + i3020_fizO]
8334 pfadd mm2, mm0
8335 pfadd mm3, mm1
8336 movq [esp + i3020_fixO], mm2
8337 movd [esp + i3020_fizO], mm3
8339 ;# update iH forces
8340 movq mm2, [esp + i3020_fixH]
8341 movq mm3, [esp + i3020_fiyH]
8342 movq mm4, [esp + i3020_fizH]
8343 pfadd mm2, mm5
8344 pfadd mm3, mm6
8345 pfadd mm4, mm7
8346 movq [esp + i3020_fixH], mm2
8347 movq [esp + i3020_fiyH], mm3
8348 movq [esp + i3020_fizH], mm4
8350 ;# pack j forces from H in the same form as the oxygen force.
8351 pfacc mm5, mm6 ;# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
8352 pfacc mm7, mm7 ;# mm7(l)=fjz(H1+ h2)
8354 pfadd mm0, mm5 ;# add up total force on j particle.
8355 pfadd mm1, mm7
8357 ;# update j particle force
8358 movq mm2, [edi + eax*4]
8359 movd mm3, [edi + eax*4 + 8]
8360 pfsub mm2, mm0
8361 pfsub mm3, mm1
8362 movq [edi + eax*4], mm2
8363 movd [edi + eax*4 + 8], mm3
8365 ;# done - one more?
8366 dec dword ptr [esp + i3020_innerk]
8367 jz .i3020_updateouterdata
8368 jmp .i3020_inner_loop
8369 .i3020_updateouterdata:
8370 mov ecx, [esp + i3020_ii3]
8372 movq mm6, [edi + ecx*4] ;# increment iO force
8373 movd mm7, [edi + ecx*4 + 8]
8374 pfadd mm6, [esp + i3020_fixO]
8375 pfadd mm7, [esp + i3020_fizO]
8376 movq [edi + ecx*4], mm6
8377 movd [edi + ecx*4 +8], mm7
8379 movq mm0, [esp + i3020_fixH]
8380 movq mm3, [esp + i3020_fiyH]
8381 movq mm1, [esp + i3020_fizH]
8382 movq mm2, mm0
8383 punpckldq mm0, mm3 ;# mm0(l)=fxH1, mm0(h)=fyH1
8384 punpckhdq mm2, mm3 ;# mm2(l)=fxH2, mm2(h)=fyH2
8385 movq mm3, mm1
8386 pswapd mm3, mm3
8387 ;# mm1 is fzH1
8388 ;# mm3 is fzH2
8390 movq mm6, [edi + ecx*4 + 12] ;# increment iH1 force
8391 movd mm7, [edi + ecx*4 + 20]
8392 pfadd mm6, mm0
8393 pfadd mm7, mm1
8394 movq [edi + ecx*4 + 12], mm6
8395 movd [edi + ecx*4 + 20], mm7
8397 movq mm6, [edi + ecx*4 + 24] ;# increment iH2 force
8398 movd mm7, [edi + ecx*4 + 32]
8399 pfadd mm6, mm2
8400 pfadd mm7, mm3
8401 movq [edi + ecx*4 + 24], mm6
8402 movd [edi + ecx*4 + 32], mm7
8405 mov ebx, [ebp + i3020_fshift] ;# increment fshift force
8406 mov edx, [esp + i3020_is3]
8408 movq mm6, [ebx + edx*4]
8409 movd mm7, [ebx + edx*4 + 8]
8410 pfadd mm6, [esp + i3020_fixO]
8411 pfadd mm7, [esp + i3020_fizO]
8412 pfadd mm6, mm0
8413 pfadd mm7, mm1
8414 pfadd mm6, mm2
8415 pfadd mm7, mm3
8416 movq [ebx + edx*4], mm6
8417 movd [ebx + edx*4 + 8], mm7
8419 mov edx, [ebp + i3020_gid] ;# get group index for this i particle
8420 mov edx, [edx]
8421 add dword ptr [ebp + i3020_gid], 4 ;# advance pointer
8423 movq mm7, [esp + i3020_vctot]
8424 pfacc mm7,mm7 ;# get and sum the two parts of total potential
8426 mov eax, [ebp + i3020_Vc]
8427 movd mm6, [eax + edx*4]
8428 pfadd mm6, mm7
8429 movd [eax + edx*4], mm6 ;# increment vc[gid]
8431 ;# finish if last
8432 dec dword ptr [ebp + i3020_nri]
8433 jz .i3020_end
8434 ;# not last, iterate once more!
8435 jmp .i3020_outer
;# Epilogue of inl3020_3dnow: leave MMX/3DNow state, release the 196 bytes of
;# locals, restore the six registers pushed in the prologue (reverse order),
;# tear down the frame and return to the caller.
8436 .i3020_end:
8437 femms ;# clear MMX/3DNow state before returning
8438 add esp, 196 ;# matches "sub esp, 196" in the prologue
8439 pop edi
8440 pop esi
8441 pop edx
8442 pop ecx
8443 pop ebx
8444 pop eax
8445 leave ;# esp = ebp, then pop ebp
8446 ret ;# return to caller - without this, execution falls through into inl3030_3dnow
;# inl3030_3dnow / _inl3030_3dnow (same address, exported with and without a
;# leading underscore): tabulated-coulomb inner loop where both the i entry and
;# every j entry consist of three consecutive sites (O, H1, H2).
;# All arguments are read from the caller's stack at [ebp + 8] .. [ebp + 64];
;# the pointer/counter arguments (nri, iinr, jindex, shift, gid) are advanced
;# or decremented in place in those argument slots while looping.
;# eax, ebx, ecx, edx, esi and edi are saved here and restored in the epilogue;
;# MMX/3DNow state is entered and left with femms.
8450 .globl inl3030_3dnow
8451 .globl _inl3030_3dnow
8452 inl3030_3dnow:
8453 _inl3030_3dnow:
;# argument offsets relative to ebp (after push ebp / mov ebp,esp)
8454 .equiv i3030_nri, 8
8455 .equiv i3030_iinr, 12
8456 .equiv i3030_jindex, 16
8457 .equiv i3030_jjnr, 20
8458 .equiv i3030_shift, 24
8459 .equiv i3030_shiftvec, 28
8460 .equiv i3030_fshift, 32
8461 .equiv i3030_gid, 36
8462 .equiv i3030_pos, 40
8463 .equiv i3030_faction, 44
8464 .equiv i3030_charge, 48
8465 .equiv i3030_facel, 52
8466 .equiv i3030_Vc, 56
8467 .equiv i3030_tabscale, 60
8468 .equiv i3030_VFtab, 64
8469 ;# stack offsets for local variables (relative to esp; 188 bytes in total)
8470 .equiv i3030_is3, 0
8471 .equiv i3030_ii3, 4
8472 .equiv i3030_ixO, 8
8473 .equiv i3030_iyO, 12
8474 .equiv i3030_izO, 16
8475 .equiv i3030_ixH, 20
8476 .equiv i3030_iyH, 28
8477 .equiv i3030_izH, 36
8478 .equiv i3030_qqOO, 44
8479 .equiv i3030_qqOH, 52
8480 .equiv i3030_qqHH, 60
8481 .equiv i3030_two, 68
8482 .equiv i3030_n1, 76
8483 .equiv i3030_tsc, 84
8484 .equiv i3030_vctot, 92
8485 .equiv i3030_innerjjnr, 100
8486 .equiv i3030_innerk, 104
8487 .equiv i3030_fixO, 108
8488 .equiv i3030_fiyO, 112
8489 .equiv i3030_fizO, 116
8490 .equiv i3030_fixH, 120
8491 .equiv i3030_fiyH, 128
8492 .equiv i3030_fizH, 136
8493 .equiv i3030_dxO, 144
8494 .equiv i3030_dyO, 148
8495 .equiv i3030_dzO, 152
8496 .equiv i3030_dxH, 156
8497 .equiv i3030_dyH, 164
8498 .equiv i3030_dzH, 172
8499 .equiv i3030_tmprsqH, 180
8500 push ebp
8501 mov ebp,esp
8502 push eax
8503 push ebx
8504 push ecx
8505 push edx
8506 push esi
8507 push edi
8508 sub esp, 188 ;# local stack space
8509 femms
8510 ;# assume we have at least one i particle - start directly
;# The charge products below are computed once, from the first iinr[] entry,
;# and reused unchanged for every i and j entry handled by this call.
8512 mov ecx, [ebp + i3030_iinr] ;# ecx = pointer into iinr[]
8513 mov ebx, [ecx] ;# ebx=ii
8515 mov edx, [ebp + i3030_charge]
8516 movd mm1, [ebp + i3030_facel] ;# mm1=facel
8517 movd mm2, [edx + ebx*4] ;# mm2=charge[ii0] (O)
8518 movd mm3, [edx + ebx*4 + 4] ;# mm3=charge[ii0+1] (H)
8519 movq mm4, mm2
8520 pfmul mm4, mm1 ;# mm4=facel*qO
8521 movq mm6, mm3
8522 pfmul mm6, mm1 ;# mm6=facel*qH
8523 movq mm5, mm4
8524 pfmul mm4, mm2 ;# mm4=qqOO*facel
8525 pfmul mm5, mm3 ;# mm5=qqOH*facel
8526 pfmul mm6, mm3 ;# mm6=qqHH*facel
8527 punpckldq mm5,mm5 ;# spread to both halves
8528 punpckldq mm6,mm6 ;# spread to both halves
8529 movq [esp + i3030_qqOO], mm4 ;# low half only is meaningful (high half is 0 from the movd loads)
8530 movq [esp + i3030_qqOH], mm5
8531 movq [esp + i3030_qqHH], mm6
8532 movq mm2, [mm_two] ;# constant defined elsewhere in this file
8533 movq [esp + i3030_two], mm2
8534 movd mm3, [ebp + i3030_tabscale]
8535 punpckldq mm3,mm3 ;# tabscale in both halves
8536 movq [esp + i3030_tsc], mm3
;# Outer loop head: one pass per i entry. Loads the shift vector and the three
;# i-site coordinates (shifted), clears the per-i accumulators and sets up the
;# j-index range for the inner loop.
8537 .i3030_outer:
8538 mov eax, [ebp + i3030_shift] ;# eax = pointer into shift[]
8539 mov ebx, [eax] ;# ebx=shift[n]
8540 add dword ptr [ebp + i3030_shift], 4 ;# advance pointer one step
8542 lea ebx, [ebx + ebx*2] ;# ebx=3*is
8543 mov [esp + i3030_is3],ebx ;# store is3
8545 mov eax, [ebp + i3030_shiftvec] ;# eax = base of shiftvec[]
8547 movq mm5, [eax + ebx*4] ;# move shX/shY to mm5 and shZ to mm6.
8548 movd mm6, [eax + ebx*4 + 8]
8549 movq mm0, mm5
8550 movq mm1, mm5
8551 movq mm2, mm6
8552 punpckldq mm0,mm0 ;# also expand shX,Y,Z in mm0--mm2.
8553 punpckhdq mm1,mm1
8554 punpckldq mm2,mm2
8556 mov ecx, [ebp + i3030_iinr] ;# ecx = pointer into iinr[]
8557 add dword ptr [ebp + i3030_iinr], 4 ;# advance pointer
8558 mov ebx, [ecx] ;# ebx=ii
8560 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
8561 mov eax, [ebp + i3030_pos] ;# eax = base of pos[]
8563 pfadd mm5, [eax + ebx*4] ;# ix = shX + posX (and iy too)
8564 movd mm7, [eax + ebx*4 + 8] ;# cant use direct memory add for 4 bytes (iz)
8565 mov [esp + i3030_ii3], ebx ;# (use mm7 as temp. storage for iz.)
8566 pfadd mm6, mm7
8567 movq [esp + i3030_ixO], mm5 ;# fills ixO and iyO
8568 movq [esp + i3030_izO], mm6 ;# 8-byte store: upper half lands on ixH, which is rewritten below
8570 movd mm3, [eax + ebx*4 + 12]
8571 movd mm4, [eax + ebx*4 + 16]
8572 movd mm5, [eax + ebx*4 + 20]
8573 punpckldq mm3, [eax + ebx*4 + 24]
8574 punpckldq mm4, [eax + ebx*4 + 28]
8575 punpckldq mm5, [eax + ebx*4 + 32] ;# coords of H1 in low mm3-mm5, H2 in high
8577 pfadd mm0, mm3 ;# add the shift to x, y, z of both hydrogens
8578 pfadd mm1, mm4
8579 pfadd mm2, mm5
8580 movq [esp + i3030_ixH], mm0
8581 movq [esp + i3030_iyH], mm1
8582 movq [esp + i3030_izH], mm2
8584 ;# clear vctot and i forces
8585 pxor mm7,mm7
8586 movq [esp + i3030_vctot], mm7
8587 movq [esp + i3030_fixO], mm7
8588 movq [esp + i3030_fizO], mm7 ;# 8-byte store also covers the low half of fixH, cleared again on the next line
8589 movq [esp + i3030_fixH], mm7
8590 movq [esp + i3030_fiyH], mm7
8591 movq [esp + i3030_fizH], mm7
8593 mov eax, [ebp + i3030_jindex]
8594 mov ecx, [eax] ;# jindex[n]
8595 mov edx, [eax + 4] ;# jindex[n+1]
8596 add dword ptr [ebp + i3030_jindex], 4
8597 sub edx, ecx ;# number of innerloop atoms
8598 mov [esp + i3030_innerk], edx ;# number of innerloop atoms
8600 mov esi, [ebp + i3030_pos] ;# esi = base of pos[] for the whole inner loop
8601 mov edi, [ebp + i3030_faction] ;# edi = base of faction[] for the inner loop and the update code
8602 mov eax, [ebp + i3030_jjnr]
8603 shl ecx, 2 ;# jindex[n] as a byte offset (4 bytes per entry)
8604 add eax, ecx
8605 mov [esp + i3030_innerjjnr], eax ;# pointer to jjnr[nj0]
;# Inner loop: one j entry (three sites at pos[3*jnr], +12 and +24 bytes) per
;# pass. For each j site the same sequence is run: i-oxygen interaction in the
;# low register half, then both i-hydrogens packed in the two halves.
;# esi = pos[], edi = faction[], eax = 3*jnr throughout the pass.
8606 .i3030_inner_loop:
8607 ;# a single j particle iteration here - compare with the unrolled code for comments.
8608 mov eax, [esp + i3030_innerjjnr]
8609 mov eax, [eax] ;# eax=jnr offset
8610 add dword ptr [esp + i3030_innerjjnr], 4 ;# advance pointer
8612 lea eax, [eax + eax*2] ;# eax = 3*jnr
;# ---- interactions with j O ----
8614 movq mm0, [esi + eax*4]
8615 movd mm1, [esi + eax*4 + 8]
8616 ;# copy & expand to mm2-mm4 for the H interactions
8617 movq mm2, mm0
8618 movq mm3, mm0
8619 movq mm4, mm1
8620 punpckldq mm2,mm2
8621 punpckhdq mm3,mm3
8622 punpckldq mm4,mm4
8624 pfsubr mm0, [esp + i3030_ixO]
8625 pfsubr mm1, [esp + i3030_izO]
8627 movq [esp + i3030_dxO], mm0
8628 pfmul mm0,mm0
8629 movd [esp + i3030_dzO], mm1
8630 pfmul mm1,mm1
8631 pfacc mm0, mm0 ;# low half = dx*dx+dy*dy (same low result as the "pfacc mm0, mm1" form used for j H1/H2)
8632 pfadd mm0, mm1 ;# mm0=rsqO
8634 punpckldq mm2, mm2
8635 punpckldq mm3, mm3
8636 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
8637 pfsubr mm2, [esp + i3030_ixH]
8638 pfsubr mm3, [esp + i3030_iyH]
8639 pfsubr mm4, [esp + i3030_izH] ;# mm2-mm4 is dxH-dzH
8641 movq [esp + i3030_dxH], mm2
8642 movq [esp + i3030_dyH], mm3
8643 movq [esp + i3030_dzH], mm4
8644 pfmul mm2,mm2
8645 pfmul mm3,mm3
8646 pfmul mm4,mm4
8648 pfadd mm3,mm2
8649 pfadd mm3,mm4 ;# mm3=rsqH
8650 movq [esp + i3030_tmprsqH], mm3
8652 pfrsqrt mm1,mm0
8654 movq mm2,mm1
8655 pfmul mm1,mm1
8656 pfrsqit1 mm1,mm0
8657 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
8658 pfmul mm0, mm1 ;# mm0=r (rsq*invsqrt)
8660 pfmul mm0, [esp + i3030_tsc]
8661 pf2iw mm4, mm0
8662 movd [esp + i3030_n1], mm4
8663 pi2fd mm4,mm4
8664 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
8665 movq mm2, mm0
8666 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
8668 ;# coulomb table
8669 mov edx, [ebp + i3030_VFtab]
8670 mov ecx, [esp + i3030_n1]
8671 shl ecx, 2 ;# ecx = 4*n1, so [edx + ecx*4] steps 16 bytes (4 floats) per table point
8673 ;# load all values we need
8674 movd mm4, [edx + ecx*4]
8675 movd mm5, [edx + ecx*4 + 4]
8676 movd mm6, [edx + ecx*4 + 8]
8677 movd mm7, [edx + ecx*4 + 12]
8679 pfmul mm6, mm0 ;# mm6 = Geps
8680 pfmul mm7, mm2 ;# mm7 = Heps2
8682 pfadd mm5, mm6
8683 pfadd mm5, mm7 ;# mm5 = Fp
8685 pfmul mm7, [esp + i3030_two] ;# two*Heps2
8686 pfadd mm7, mm6
8687 pfadd mm7, mm5 ;# mm7=FF
8689 pfmul mm5, mm0 ;# mm5=eps*Fp
8690 pfadd mm5, mm4 ;# mm5= VV
8692 pfmul mm5, [esp + i3030_qqOO] ;# vcoul=qq*VV
8693 pfmul mm7, [esp + i3030_qqOO] ;# fijC=qq*FF
8695 ;# update vctot directly, use mm3 for fscal sum.
8696 pfadd mm5, [esp + i3030_vctot]
8697 movq [esp + i3030_vctot], mm5
8698 movq mm3, mm7
8700 ;# change sign of fscal and multiply with rinv
8701 pxor mm0,mm0
8702 pfsubr mm3, mm0 ;# mm3 = 0 - mm3
8703 pfmul mm3, [esp + i3030_tsc]
8704 pfmul mm3, mm1 ;# mm3 is total fscal (for the oxygen) now
8706 ;# Ready with the oxygen - potential is updated, fscal is in mm3.
8707 ;# time for hydrogens!
8709 movq mm0, [esp + i3030_tmprsqH]
8711 pfrsqrt mm1, mm0
8712 pswapd mm0,mm0
8713 pfrsqrt mm2, mm0
8714 pswapd mm0,mm0
8715 punpckldq mm1,mm2 ;# seeds are in mm1 now, and rsq in mm0.
8717 movq mm2, mm1
8718 pfmul mm1,mm1
8719 pfrsqit1 mm1,mm0
8720 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
8722 pfmul mm0,mm1 ;# mm0=r
8723 pfmul mm0, [esp + i3030_tsc]
8724 pf2iw mm4, mm0
8725 movq [esp + i3030_n1], mm4 ;# two table indices: H1 at n1, H2 at n1+4
8726 pi2fd mm4,mm4
8727 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
8728 movq mm2, mm0
8729 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
8731 ;# coulomb table
8732 mov edx, [ebp + i3030_VFtab]
8733 mov ecx, [esp + i3030_n1]
8734 shl ecx, 2
8735 ;# load all values we need
8736 movd mm4, [edx + ecx*4]
8737 movd mm5, [edx + ecx*4 + 4]
8738 movd mm6, [edx + ecx*4 + 8]
8739 movd mm7, [edx + ecx*4 + 12]
8740 mov ecx, [esp + i3030_n1 + 4] ;# second index (high half of the pair stored above)
8741 shl ecx, 2
8742 punpckldq mm4, [edx + ecx*4]
8743 punpckldq mm5, [edx + ecx*4 + 4]
8744 punpckldq mm6, [edx + ecx*4 + 8]
8745 punpckldq mm7, [edx + ecx*4 + 12]
8747 pfmul mm6, mm0 ;# mm6 = Geps
8748 pfmul mm7, mm2 ;# mm7 = Heps2
8750 pfadd mm5, mm6
8751 pfadd mm5, mm7 ;# mm5 = Fp
8753 pfmul mm7, [esp + i3030_two] ;# two*Heps2
8754 pfadd mm7, mm6
8755 pfadd mm7, mm5 ;# mm7=FF
8757 pfmul mm5, mm0 ;# mm5=eps*Fp
8758 pfadd mm5, mm4 ;# mm5= VV
8760 pfmul mm5, [esp + i3030_qqOH] ;# vcoul=qq*VV
8761 pfmul mm7, [esp + i3030_qqOH] ;# fijC=qq*FF
8762 ;# update vctot
8763 pfadd mm5, [esp + i3030_vctot]
8764 movq [esp + i3030_vctot], mm5
8766 ;# change sign of fijC and multiply by rinv
8767 pxor mm4,mm4
8768 pfsub mm4, mm7
8769 pfmul mm4, [esp + i3030_tsc]
8770 pfmul mm4, mm1 ;# mm4 is total fscal (for the hydrogens) now
8772 ;# spread oxygen fscalar to both positions
8773 punpckldq mm3,mm3
8774 ;# calc vectorial force for O
8775 movq mm0, [esp + i3030_dxO]
8776 movd mm1, [esp + i3030_dzO]
8777 pfmul mm0, mm3
8778 pfmul mm1, mm3
8780 ;# calc vectorial force for H's
8781 movq mm5, [esp + i3030_dxH]
8782 movq mm6, [esp + i3030_dyH]
8783 movq mm7, [esp + i3030_dzH]
8784 pfmul mm5, mm4
8785 pfmul mm6, mm4
8786 pfmul mm7, mm4
8788 ;# update iO particle force
8789 movq mm2, [esp + i3030_fixO]
8790 movd mm3, [esp + i3030_fizO]
8791 pfadd mm2, mm0
8792 pfadd mm3, mm1
8793 movq [esp + i3030_fixO], mm2
8794 movd [esp + i3030_fizO], mm3
8796 ;# update iH forces
8797 movq mm2, [esp + i3030_fixH]
8798 movq mm3, [esp + i3030_fiyH]
8799 movq mm4, [esp + i3030_fizH]
8800 pfadd mm2, mm5
8801 pfadd mm3, mm6
8802 pfadd mm4, mm7
8803 movq [esp + i3030_fixH], mm2
8804 movq [esp + i3030_fiyH], mm3
8805 movq [esp + i3030_fizH], mm4
8807 ;# pack j forces from H in the same form as the oxygen force.
8808 pfacc mm5, mm6 ;# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
8809 pfacc mm7, mm7 ;# mm7(l)=fjz(H1+ h2)
8811 pfadd mm0, mm5 ;# add up total force on j particle.
8812 pfadd mm1, mm7
8814 ;# update j particle force
8815 movq mm2, [edi + eax*4]
8816 movd mm3, [edi + eax*4 + 8]
8817 pfsub mm2, mm0
8818 pfsub mm3, mm1
8819 movq [edi + eax*4], mm2
8820 movd [edi + eax*4 +8], mm3
8822 ;# interactions with j H1
8824 movq mm0, [esi + eax*4 + 12]
8825 movd mm1, [esi + eax*4 + 20]
8826 ;# copy & expand to mm2-mm4 for the H interactions
8827 movq mm2, mm0
8828 movq mm3, mm0
8829 movq mm4, mm1
8830 punpckldq mm2,mm2
8831 punpckhdq mm3,mm3
8832 punpckldq mm4,mm4
8834 pfsubr mm0, [esp + i3030_ixO]
8835 pfsubr mm1, [esp + i3030_izO]
8837 movq [esp + i3030_dxO], mm0
8838 pfmul mm0,mm0
8839 movd [esp + i3030_dzO], mm1
8840 pfmul mm1,mm1
8841 pfacc mm0, mm1
8842 pfadd mm0, mm1 ;# mm0=rsqO (i oxygen to j H1)
8844 punpckldq mm2, mm2
8845 punpckldq mm3, mm3
8846 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
8847 pfsubr mm2, [esp + i3030_ixH]
8848 pfsubr mm3, [esp + i3030_iyH]
8849 pfsubr mm4, [esp + i3030_izH] ;# mm2-mm4 is dxH-dzH
8851 movq [esp + i3030_dxH], mm2
8852 movq [esp + i3030_dyH], mm3
8853 movq [esp + i3030_dzH], mm4
8854 pfmul mm2,mm2
8855 pfmul mm3,mm3
8856 pfmul mm4,mm4
8858 pfadd mm3,mm2
8859 pfadd mm3,mm4 ;# mm3=rsqH
8860 movq [esp + i3030_tmprsqH], mm3
8862 pfrsqrt mm1,mm0
8864 movq mm2,mm1
8865 pfmul mm1,mm1
8866 pfrsqit1 mm1,mm0
8867 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
8868 pfmul mm0, mm1 ;# mm0=r (rsq*invsqrt)
8870 pfmul mm0, [esp + i3030_tsc]
8871 pf2iw mm4, mm0
8872 movd [esp + i3030_n1], mm4
8873 pi2fd mm4,mm4
8874 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
8875 movq mm2, mm0
8876 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
8878 ;# coulomb table
8879 mov edx, [ebp + i3030_VFtab]
8880 mov ecx, [esp + i3030_n1]
8881 shl ecx, 2
8883 ;# load all values we need
8884 movd mm4, [edx + ecx*4]
8885 movd mm5, [edx + ecx*4 + 4]
8886 movd mm6, [edx + ecx*4 + 8]
8887 movd mm7, [edx + ecx*4 + 12]
8889 pfmul mm6, mm0 ;# mm6 = Geps
8890 pfmul mm7, mm2 ;# mm7 = Heps2
8892 pfadd mm5, mm6
8893 pfadd mm5, mm7 ;# mm5 = Fp
8895 pfmul mm7, [esp + i3030_two] ;# two*Heps2
8896 pfadd mm7, mm6
8897 pfadd mm7, mm5 ;# mm7=FF
8899 pfmul mm5, mm0 ;# mm5=eps*Fp
8900 pfadd mm5, mm4 ;# mm5= VV
8902 pfmul mm5, [esp + i3030_qqOH] ;# vcoul=qq*VV
8903 pfmul mm7, [esp + i3030_qqOH] ;# fijC=qq*FF
8905 ;# update vctot directly, force is moved to mm3
8906 pfadd mm5, [esp + i3030_vctot]
8907 movq [esp + i3030_vctot], mm5
8908 pxor mm3, mm3
8909 pfsub mm3, mm7 ;# mm3 = -fijC
8910 pfmul mm3, [esp + i3030_tsc]
8911 pfmul mm3, mm1 ;# mm3 is total fscal (for the oxygen) now
8913 movq mm0, [esp + i3030_tmprsqH]
8915 pfrsqrt mm1, mm0
8916 pswapd mm0,mm0
8917 pfrsqrt mm2, mm0
8918 pswapd mm0,mm0
8919 punpckldq mm1,mm2 ;# seeds are in mm1 now, and rsq in mm0.
8921 movq mm2, mm1
8922 pfmul mm1,mm1
8923 pfrsqit1 mm1,mm0
8924 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
8926 pfmul mm0,mm1 ;# mm0=r
8927 pfmul mm0, [esp + i3030_tsc]
8928 pf2iw mm4, mm0
8929 movq [esp + i3030_n1], mm4
8930 pi2fd mm4,mm4
8931 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
8932 movq mm2, mm0
8933 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
8935 ;# coulomb table
8936 mov edx, [ebp + i3030_VFtab]
8937 mov ecx, [esp + i3030_n1]
8938 shl ecx, 2
8939 ;# load all values we need
8940 movd mm4, [edx + ecx*4]
8941 movd mm5, [edx + ecx*4 + 4]
8942 movd mm6, [edx + ecx*4 + 8]
8943 movd mm7, [edx + ecx*4 + 12]
8944 mov ecx, [esp + i3030_n1 + 4]
8945 shl ecx, 2
8946 punpckldq mm4, [edx + ecx*4]
8947 punpckldq mm5, [edx + ecx*4 + 4]
8948 punpckldq mm6, [edx + ecx*4 + 8]
8949 punpckldq mm7, [edx + ecx*4 + 12]
8952 pfmul mm6, mm0 ;# mm6 = Geps
8953 pfmul mm7, mm2 ;# mm7 = Heps2
8955 pfadd mm5, mm6
8956 pfadd mm5, mm7 ;# mm5 = Fp
8958 pfmul mm7, [esp + i3030_two] ;# two*Heps2
8959 pfadd mm7, mm6
8960 pfadd mm7, mm5 ;# mm7=FF
8962 pfmul mm5, mm0 ;# mm5=eps*Fp
8963 pfadd mm5, mm4 ;# mm5= VV
8965 pfmul mm5, [esp + i3030_qqHH] ;# vcoul=qq*VV
8966 pfmul mm7, [esp + i3030_qqHH] ;# fijC=qq*FF
8967 ;# update vctot
8968 pfadd mm5, [esp + i3030_vctot]
8969 movq [esp + i3030_vctot], mm5
8971 ;# change sign of fijC and multiply by rinv
8972 pxor mm4,mm4
8973 pfsub mm4, mm7
8974 pfmul mm4, [esp + i3030_tsc]
8975 pfmul mm4, mm1 ;# mm4 is total fscal (for the hydrogens) now
8977 ;# spread oxygen fscalar to both positions
8978 punpckldq mm3,mm3
8979 ;# calc vectorial force for O
8980 movq mm0, [esp + i3030_dxO]
8981 movd mm1, [esp + i3030_dzO]
8982 pfmul mm0, mm3
8983 pfmul mm1, mm3
8985 ;# calc vectorial force for H's
8986 movq mm5, [esp + i3030_dxH]
8987 movq mm6, [esp + i3030_dyH]
8988 movq mm7, [esp + i3030_dzH]
8989 pfmul mm5, mm4
8990 pfmul mm6, mm4
8991 pfmul mm7, mm4
8993 ;# update iO particle force
8994 movq mm2, [esp + i3030_fixO]
8995 movd mm3, [esp + i3030_fizO]
8996 pfadd mm2, mm0
8997 pfadd mm3, mm1
8998 movq [esp + i3030_fixO], mm2
8999 movd [esp + i3030_fizO], mm3
9001 ;# update iH forces
9002 movq mm2, [esp + i3030_fixH]
9003 movq mm3, [esp + i3030_fiyH]
9004 movq mm4, [esp + i3030_fizH]
9005 pfadd mm2, mm5
9006 pfadd mm3, mm6
9007 pfadd mm4, mm7
9008 movq [esp + i3030_fixH], mm2
9009 movq [esp + i3030_fiyH], mm3
9010 movq [esp + i3030_fizH], mm4
9012 ;# pack j forces from H in the same form as the oxygen force.
9013 pfacc mm5, mm6 ;# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
9014 pfacc mm7, mm7 ;# mm7(l)=fjz(H1+ h2)
9016 pfadd mm0, mm5 ;# add up total force on j particle.
9017 pfadd mm1, mm7
9019 ;# update j particle force
9020 movq mm2, [edi + eax*4 + 12]
9021 movd mm3, [edi + eax*4 + 20]
9022 pfsub mm2, mm0
9023 pfsub mm3, mm1
9024 movq [edi + eax*4 + 12], mm2
9025 movd [edi + eax*4 + 20], mm3
9027 ;# interactions with j H2
9028 movq mm0, [esi + eax*4 + 24]
9029 movd mm1, [esi + eax*4 + 32]
9030 ;# copy & expand to mm2-mm4 for the H interactions
9031 movq mm2, mm0
9032 movq mm3, mm0
9033 movq mm4, mm1
9034 punpckldq mm2,mm2
9035 punpckhdq mm3,mm3
9036 punpckldq mm4,mm4
9038 pfsubr mm0, [esp + i3030_ixO]
9039 pfsubr mm1, [esp + i3030_izO]
9041 movq [esp + i3030_dxO], mm0
9042 pfmul mm0,mm0
9043 movd [esp + i3030_dzO], mm1
9044 pfmul mm1,mm1
9045 pfacc mm0, mm1
9046 pfadd mm0, mm1 ;# mm0=rsqO (i oxygen to j H2)
9048 punpckldq mm2, mm2
9049 punpckldq mm3, mm3
9050 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
9051 pfsubr mm2, [esp + i3030_ixH]
9052 pfsubr mm3, [esp + i3030_iyH]
9053 pfsubr mm4, [esp + i3030_izH] ;# mm2-mm4 is dxH-dzH
9055 movq [esp + i3030_dxH], mm2
9056 movq [esp + i3030_dyH], mm3
9057 movq [esp + i3030_dzH], mm4
9058 pfmul mm2,mm2
9059 pfmul mm3,mm3
9060 pfmul mm4,mm4
9062 pfadd mm3,mm2
9063 pfadd mm3,mm4 ;# mm3=rsqH
9064 movq [esp + i3030_tmprsqH], mm3
9066 pfrsqrt mm1,mm0
9068 movq mm2,mm1
9069 pfmul mm1,mm1
9070 pfrsqit1 mm1,mm0
9071 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
9072 pfmul mm0, mm1 ;# mm0=r (rsq*invsqrt)
9074 pfmul mm0, [esp + i3030_tsc]
9075 pf2iw mm4, mm0
9076 movd [esp + i3030_n1], mm4
9077 pi2fd mm4,mm4
9078 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
9079 movq mm2, mm0
9080 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
9082 ;# coulomb table
9083 mov edx, [ebp + i3030_VFtab]
9084 mov ecx, [esp + i3030_n1]
9085 shl ecx, 2
9087 ;# load all values we need
9088 movd mm4, [edx + ecx*4]
9089 movd mm5, [edx + ecx*4 + 4]
9090 movd mm6, [edx + ecx*4 + 8]
9091 movd mm7, [edx + ecx*4 + 12]
9093 pfmul mm6, mm0 ;# mm6 = Geps
9094 pfmul mm7, mm2 ;# mm7 = Heps2
9096 pfadd mm5, mm6
9097 pfadd mm5, mm7 ;# mm5 = Fp
9099 pfmul mm7, [esp + i3030_two] ;# two*Heps2
9100 pfadd mm7, mm6
9101 pfadd mm7, mm5 ;# mm7=FF
9103 pfmul mm5, mm0 ;# mm5=eps*Fp
9104 pfadd mm5, mm4 ;# mm5= VV
9106 pfmul mm5, [esp + i3030_qqOH] ;# vcoul=qq*VV
9107 pfmul mm7, [esp + i3030_qqOH] ;# fijC=qq*FF
9109 ;# update vctot directly, use mm3 for fscal sum.
9110 pfadd mm5, [esp + i3030_vctot]
9111 movq [esp + i3030_vctot], mm5
9112 pxor mm3,mm3
9113 pfsub mm3, mm7 ;# mm3 = -fijC
9114 pfmul mm3, [esp + i3030_tsc]
9115 pfmul mm3, mm1 ;# mm3 is total fscal (for the oxygen) now
9117 movq mm0, [esp + i3030_tmprsqH]
9119 pfrsqrt mm1, mm0
9120 pswapd mm0,mm0
9121 pfrsqrt mm2, mm0
9122 pswapd mm0,mm0
9123 punpckldq mm1,mm2 ;# seeds are in mm1 now, and rsq in mm0.
9125 movq mm2, mm1
9126 pfmul mm1,mm1
9127 pfrsqit1 mm1,mm0
9128 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
9130 pfmul mm0,mm1 ;# mm0=r
9131 pfmul mm0, [esp + i3030_tsc]
9132 pf2iw mm4, mm0
9133 movq [esp + i3030_n1], mm4
9134 pi2fd mm4,mm4
9135 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
9136 movq mm2, mm0
9137 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
9139 ;# coulomb table
9140 mov edx, [ebp + i3030_VFtab]
9141 mov ecx, [esp + i3030_n1]
9142 shl ecx, 2
9143 ;# load all values we need
9144 movd mm4, [edx + ecx*4]
9145 movd mm5, [edx + ecx*4 + 4]
9146 movd mm6, [edx + ecx*4 + 8]
9147 movd mm7, [edx + ecx*4 + 12]
9148 mov ecx, [esp + i3030_n1 + 4]
9149 shl ecx, 2
9150 punpckldq mm4, [edx + ecx*4]
9151 punpckldq mm5, [edx + ecx*4 + 4]
9152 punpckldq mm6, [edx + ecx*4 + 8]
9153 punpckldq mm7, [edx + ecx*4 + 12]
9156 pfmul mm6, mm0 ;# mm6 = Geps
9157 pfmul mm7, mm2 ;# mm7 = Heps2
9159 pfadd mm5, mm6
9160 pfadd mm5, mm7 ;# mm5 = Fp
9162 pfmul mm7, [esp + i3030_two] ;# two*Heps2
9163 pfadd mm7, mm6
9164 pfadd mm7, mm5 ;# mm7=FF
9166 pfmul mm5, mm0 ;# mm5=eps*Fp
9167 pfadd mm5, mm4 ;# mm5= VV
9169 pfmul mm5, [esp + i3030_qqHH] ;# vcoul=qq*VV
9170 pfmul mm7, [esp + i3030_qqHH] ;# fijC=qq*FF
9171 ;# update vctot
9172 pfadd mm5, [esp + i3030_vctot]
9173 movq [esp + i3030_vctot], mm5
9175 ;# change sign of fijC and multiply by rinv
9176 pxor mm4,mm4
9177 pfsub mm4, mm7
9178 pfmul mm4, [esp + i3030_tsc]
9179 pfmul mm4, mm1 ;# mm4 is total fscal (for the hydrogens) now
9181 ;# spread oxygen fscalar to both positions
9182 punpckldq mm3,mm3
9183 ;# calc vectorial force for O
9184 movq mm0, [esp + i3030_dxO]
9185 movd mm1, [esp + i3030_dzO]
9186 pfmul mm0, mm3
9187 pfmul mm1, mm3
9189 ;# calc vectorial force for H's
9190 movq mm5, [esp + i3030_dxH]
9191 movq mm6, [esp + i3030_dyH]
9192 movq mm7, [esp + i3030_dzH]
9193 pfmul mm5, mm4
9194 pfmul mm6, mm4
9195 pfmul mm7, mm4
9197 ;# update iO particle force
9198 movq mm2, [esp + i3030_fixO]
9199 movd mm3, [esp + i3030_fizO]
9200 pfadd mm2, mm0
9201 pfadd mm3, mm1
9202 movq [esp + i3030_fixO], mm2
9203 movd [esp + i3030_fizO], mm3
9205 ;# update iH forces
9206 movq mm2, [esp + i3030_fixH]
9207 movq mm3, [esp + i3030_fiyH]
9208 movq mm4, [esp + i3030_fizH]
9209 pfadd mm2, mm5
9210 pfadd mm3, mm6
9211 pfadd mm4, mm7
9212 movq [esp + i3030_fixH], mm2
9213 movq [esp + i3030_fiyH], mm3
9214 movq [esp + i3030_fizH], mm4
9216 ;# pack j forces from H in the same form as the oxygen force.
9217 pfacc mm5, mm6 ;# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
9218 pfacc mm7, mm7 ;# mm7(l)=fjz(H1+ h2)
9220 pfadd mm0, mm5 ;# add up total force on j particle.
9221 pfadd mm1, mm7
9223 ;# update j particle force
9224 movq mm2, [edi + eax*4 + 24]
9225 movd mm3, [edi + eax*4 + 32]
9226 pfsub mm2, mm0
9227 pfsub mm3, mm1
9228 movq [edi + eax*4 + 24], mm2
9229 movd [edi + eax*4 + 32], mm3
9231 ;# done - one more?
9232 dec dword ptr [esp + i3030_innerk]
9233 jz .i3030_updateouterdata
9234 jmp .i3030_inner_loop
;# After the inner loop: add the accumulated i-site forces to faction[] and to
;# fshift[is3], add the accumulated potential to Vc[gid], then either start the
;# next i entry or finish. edi still holds the faction[] base here.
9235 .i3030_updateouterdata:
9236 mov ecx, [esp + i3030_ii3]
9238 movq mm6, [edi + ecx*4] ;# increment iO force
9239 movd mm7, [edi + ecx*4 + 8]
9240 pfadd mm6, [esp + i3030_fixO]
9241 pfadd mm7, [esp + i3030_fizO]
9242 movq [edi + ecx*4], mm6
9243 movd [edi + ecx*4 +8], mm7
;# unpack the hydrogen accumulators (H1 in low halves, H2 in high halves)
9245 movq mm0, [esp + i3030_fixH]
9246 movq mm3, [esp + i3030_fiyH]
9247 movq mm1, [esp + i3030_fizH]
9248 movq mm2, mm0
9249 punpckldq mm0, mm3 ;# mm0(l)=fxH1, mm0(h)=fyH1
9250 punpckhdq mm2, mm3 ;# mm2(l)=fxH2, mm2(h)=fyH2
9251 movq mm3, mm1
9252 pswapd mm3,mm3 ;# swap halves so fzH2 is in the low half
9253 ;# mm1 is fzH1
9254 ;# mm3 is fzH2
9256 movq mm6, [edi + ecx*4 + 12] ;# increment iH1 force
9257 movd mm7, [edi + ecx*4 + 20]
9258 pfadd mm6, mm0
9259 pfadd mm7, mm1
9260 movq [edi + ecx*4 + 12], mm6
9261 movd [edi + ecx*4 + 20], mm7
9263 movq mm6, [edi + ecx*4 + 24] ;# increment iH2 force
9264 movd mm7, [edi + ecx*4 + 32]
9265 pfadd mm6, mm2
9266 pfadd mm7, mm3
9267 movq [edi + ecx*4 + 24], mm6
9268 movd [edi + ecx*4 + 32], mm7
9271 mov ebx, [ebp + i3030_fshift] ;# increment fshift force
9272 mov edx, [esp + i3030_is3]
9274 movq mm6, [ebx + edx*4]
9275 movd mm7, [ebx + edx*4 + 8]
9276 pfadd mm6, [esp + i3030_fixO] ;# fshift gets the sum of the O, H1 and H2 forces
9277 pfadd mm7, [esp + i3030_fizO]
9278 pfadd mm6, mm0
9279 pfadd mm7, mm1
9280 pfadd mm6, mm2
9281 pfadd mm7, mm3
9282 movq [ebx + edx*4], mm6
9283 movd [ebx + edx*4 + 8], mm7
9285 mov edx, [ebp + i3030_gid] ;# get group index for this i particle
9286 mov edx, [edx]
9287 add dword ptr [ebp + i3030_gid], 4 ;# advance pointer
9289 movq mm7, [esp + i3030_vctot]
9290 pfacc mm7,mm7 ;# get and sum the two parts of total potential
9292 mov eax, [ebp + i3030_Vc]
9293 movd mm6, [eax + edx*4]
9294 pfadd mm6, mm7
9295 movd [eax + edx*4], mm6 ;# increment vc[gid]
9297 ;# finish if last
9298 dec dword ptr [ebp + i3030_nri] ;# counts down the caller-supplied nri in its argument slot
9299 jz .i3030_end
9300 ;# not last, iterate once more!
9301 jmp .i3030_outer
9302 .i3030_end:
9303 femms
9304 add esp, 188
9305 pop edi
9306 pop esi
9307 pop edx
9308 pop ecx
9309 pop ebx
9310 pop eax
9311 leave
;# ---------------------------------------------------------------------
;# inl3100_3dnow: nonbonded inner loop combining tabulated Coulomb
;# (VFtab lookup) with Lennard-Jones c6/c12 interactions, 3DNow code.
;#
;# Arguments are read from the stack relative to ebp (offsets below).
;# nri, facel, ntype and tabscale are used by value; all other
;# arguments are dereferenced as pointers. The stack slots of shift,
;# iinr, jindex and gid are advanced in place and nri is counted down
;# in place while iterating.
;#
;# For every i particle the j neighbours jjnr[jindex[n]..jindex[n+1]-1]
;# are processed two at a time, followed by a single-particle pass when
;# one neighbour is left over. Forces are accumulated into faction[] and
;# fshift[], energies into Vc[gid] and Vnb[gid].
;#
;# eax, ebx, ecx, edx, esi and edi are saved and restored; femms is
;# issued on entry and before returning.
;# The outer loop assumes nri >= 1 (there is no check before the first pass).
;# ---------------------------------------------------------------------
9317 .globl inl3100_3dnow
9318 .globl _inl3100_3dnow
9319 inl3100_3dnow:
9320 _inl3100_3dnow:
;# argument offsets relative to ebp
9321 .equiv i3100_nri, 8
9322 .equiv i3100_iinr, 12
9323 .equiv i3100_jindex, 16
9324 .equiv i3100_jjnr, 20
9325 .equiv i3100_shift, 24
9326 .equiv i3100_shiftvec, 28
9327 .equiv i3100_fshift, 32
9328 .equiv i3100_gid, 36
9329 .equiv i3100_pos, 40
9330 .equiv i3100_faction, 44
9331 .equiv i3100_charge, 48
9332 .equiv i3100_facel, 52
9333 .equiv i3100_Vc, 56
9334 .equiv i3100_type, 60
9335 .equiv i3100_ntype, 64
9336 .equiv i3100_nbfp, 68
9337 .equiv i3100_Vnb, 72
9338 .equiv i3100_tabscale, 76
9339 .equiv i3100_VFtab, 80
9340 ;# stack offsets for local variables
9341 .equiv i3100_is3, 0
9342 .equiv i3100_ii3, 4
9343 .equiv i3100_ix, 8
9344 .equiv i3100_iy, 12
9345 .equiv i3100_iz, 16
9346 .equiv i3100_iq, 20
9347 .equiv i3100_vctot, 28
9348 .equiv i3100_vnbtot, 36
9349 .equiv i3100_c6, 44
9350 .equiv i3100_c12, 52
9351 .equiv i3100_six, 60
9352 .equiv i3100_twelve, 68
9353 .equiv i3100_two, 76
9354 .equiv i3100_n1, 84
9355 .equiv i3100_tsc, 92
9356 .equiv i3100_ntia, 100
9357 .equiv i3100_innerjjnr, 104
9358 .equiv i3100_innerk, 108
9359 .equiv i3100_fix, 112
9360 .equiv i3100_fiy, 116
9361 .equiv i3100_fiz, 120
9362 .equiv i3100_dx1, 124
9363 .equiv i3100_dy1, 128
9364 .equiv i3100_dz1, 132
9365 .equiv i3100_dx2, 136
9366 .equiv i3100_dy2, 140
9367 .equiv i3100_dz2, 144
9368 push ebp
9369 mov ebp,esp
9370 push eax
9371 push ebx
9372 push ecx
9373 push edx
9374 push esi
9375 push edi
9376 sub esp, 148 ;# local stack space
9377 femms
9378 ;# move data to local stack
9379 movq mm0, [mm_two]
9380 movq mm1, [mm_six]
9381 movq mm2, [mm_twelve]
9382 movd mm3, [ebp + i3100_tabscale]
9383 movq [esp + i3100_two], mm0
9384 movq [esp + i3100_six], mm1
9385 movq [esp + i3100_twelve], mm2
9386 punpckldq mm3,mm3
9387 movq [esp + i3100_tsc], mm3
9388 ;# assume we have at least one i particle - start directly
9389 .i3100_outer:
9390 mov eax, [ebp + i3100_shift] ;# eax = pointer into shift[]
9391 mov ebx, [eax] ;# ebx=shift[n]
9392 add dword ptr [ebp + i3100_shift], 4 ;# advance pointer one step
9394 lea ebx, [ebx + ebx*2] ;# ebx=3*is
9395 mov [esp + i3100_is3],ebx ;# store is3
9397 mov eax, [ebp + i3100_shiftvec] ;# eax = base of shiftvec[]
9399 movq mm0, [eax + ebx*4] ;# move shX/shY to mm0 and shZ to mm1
9400 movd mm1, [eax + ebx*4 + 8]
9402 mov ecx, [ebp + i3100_iinr] ;# ecx = pointer into iinr[]
9403 add dword ptr [ebp + i3100_iinr], 4 ;# advance pointer
9404 mov ebx, [ecx] ;# ebx=ii
9406 mov edx, [ebp + i3100_charge]
9407 movd mm2, [edx + ebx*4] ;# mm2=charge[ii]
9408 pfmul mm2, [ebp + i3100_facel]
9409 punpckldq mm2,mm2 ;# spread to both halves
9410 movq [esp + i3100_iq], mm2 ;# iq =facel*charge[ii]
9412 mov edx, [ebp + i3100_type]
9413 mov edx, [edx + ebx*4]
9414 imul edx, [ebp + i3100_ntype]
9415 shl edx, 1
9416 mov [esp + i3100_ntia], edx
9418 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
9419 mov eax, [ebp + i3100_pos] ;# eax = base of pos[]
9421 pfadd mm0, [eax + ebx*4] ;# ix = shX + posX (and iy too)
9422 movd mm3, [eax + ebx*4 + 8] ;# cant use direct memory add for 4 bytes (iz)
9423 mov [esp + i3100_ii3], ebx
9424 pfadd mm1, mm3
9425 movq [esp + i3100_ix], mm0
9426 movd [esp + i3100_iz], mm1
9428 ;# clear total potential and i forces
9429 pxor mm7,mm7
9430 movq [esp + i3100_vctot], mm7
9431 movq [esp + i3100_vnbtot], mm7
9432 movq [esp + i3100_fix], mm7
9433 movd [esp + i3100_fiz], mm7
9435 mov eax, [ebp + i3100_jindex]
9436 mov ecx, [eax] ;# jindex[n]
9437 mov edx, [eax + 4] ;# jindex[n+1]
9438 add dword ptr [ebp + i3100_jindex], 4
9439 sub edx, ecx ;# number of innerloop atoms
9441 mov esi, [ebp + i3100_pos]
9442 mov edi, [ebp + i3100_faction]
9443 mov eax, [ebp + i3100_jjnr]
9444 shl ecx, 2
9445 add eax, ecx
9446 mov [esp + i3100_innerjjnr], eax ;# pointer to jjnr[nj0]
9447 sub edx, 2
9448 mov [esp + i3100_innerk], edx ;# innerk = number of innerloop atoms - 2 (negative when fewer than 2 remain)
9449 jge .i3100_unroll_loop
9450 jmp .i3100_finish_inner
9451 .i3100_unroll_loop:
9452 ;# paired innerloop starts here
9453 mov ecx, [esp + i3100_innerjjnr] ;# pointer to jjnr[k]
9454 mov eax, [ecx]
9455 mov ebx, [ecx + 4] ;# eax/ebx=jnr
9456 add dword ptr [esp + i3100_innerjjnr], 8 ;# advance pointer (unrolled 2)
9457 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
9459 mov ecx, [ebp + i3100_charge] ;# base of charge[]
9460 movq mm5, [esp + i3100_iq]
9461 movd mm3, [ecx + eax*4] ;# charge[jnr1]
9462 punpckldq mm3, [ecx + ebx*4] ;# move charge 2 to high part of mm3
9463 pfmul mm3,mm5 ;# mm3 now has qq for both particles
9465 mov ecx, [ebp + i3100_type]
9466 mov edx, [ecx + eax*4] ;# type [jnr1]
9467 mov ecx, [ecx + ebx*4] ;# type [jnr2]
9469 mov esi, [ebp + i3100_nbfp] ;# base of nbfp
9470 shl edx, 1
9471 shl ecx, 1
9472 add edx, [esp + i3100_ntia] ;# tja = ntia + 2*type
9473 add ecx, [esp + i3100_ntia]
9475 movq mm5, [esi + edx*4] ;# mm5 = 1st c6 / c12
9476 movq mm7, [esi + ecx*4] ;# mm7 = 2nd c6 / c12
9477 movq mm6,mm5
9478 punpckldq mm5,mm7 ;# mm5 = 1st c6 / 2nd c6
9479 punpckhdq mm6,mm7 ;# mm6 = 1st c12 / 2nd c12
9480 movq [esp + i3100_c6], mm5
9481 movq [esp + i3100_c12], mm6
9483 lea eax, [eax + eax*2] ;# replace jnr with j3
9484 lea ebx, [ebx + ebx*2]
9486 mov esi, [ebp + i3100_pos]
9488 movq mm0, [esp + i3100_ix]
9489 movd mm1, [esp + i3100_iz]
9490 movq mm4, [esi + eax*4] ;# fetch first j coordinates
9491 movd mm5, [esi + eax*4 + 8]
9492 pfsubr mm4,mm0 ;# dr = ir - jr
9493 pfsubr mm5,mm1
9494 movq [esp + i3100_dx1], mm4 ;# store dr
9495 movd [esp + i3100_dz1], mm5
9496 pfmul mm4,mm4 ;# square dx,dy,dz
9497 pfmul mm5,mm5
9498 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
9499 pfacc mm4, mm5 ;# first rsq in lower mm4
9501 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
9502 movd mm7, [esi + ebx*4 + 8]
9504 pfsubr mm6,mm0 ;# dr = ir - jr
9505 pfsubr mm7,mm1
9506 movq [esp + i3100_dx2], mm6 ;# store dr
9507 movd [esp + i3100_dz2], mm7
9508 pfmul mm6,mm6 ;# square dx,dy,dz
9509 pfmul mm7,mm7
9510 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
9511 pfacc mm6, mm7 ;# second rsq in lower mm6
9513 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
9514 pfrsqrt mm1, mm6
9517 punpckldq mm0,mm1
9518 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs.
9519 movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision.
9520 pfmul mm0,mm0
9521 pfrsqit1 mm0,mm4
9522 pfrcpit2 mm0,mm2
9523 pfmul mm4, mm0
9524 movq mm1, mm4
9525 ;# mm0 is invsqrt, and mm1 r.
9526 ;# do potential and fscal
9527 pfmul mm1, [esp + i3100_tsc] ;# mm1=rt
9528 pf2iw mm4,mm1
9529 movq [esp + i3100_n1], mm4
9530 pi2fd mm4,mm4
9531 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
9533 movq mm2,mm1
9534 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
9536 mov edx, [ebp + i3100_VFtab]
9537 mov ecx, [esp + i3100_n1]
9538 shl ecx, 2 ;# 4 floats (16 bytes) are read per table point
9539 ;# coulomb table
9540 ;# load all the table values we need
9541 movd mm4, [edx + ecx*4]
9542 movd mm5, [edx + ecx*4 + 4]
9543 movd mm6, [edx + ecx*4 + 8]
9544 movd mm7, [edx + ecx*4 + 12]
9545 mov ecx, [esp + i3100_n1 + 4]
9546 shl ecx, 2
9547 punpckldq mm4, [edx + ecx*4]
9548 punpckldq mm5, [edx + ecx*4 + 4]
9549 punpckldq mm6, [edx + ecx*4 + 8]
9550 punpckldq mm7, [edx + ecx*4 + 12]
9552 pfmul mm6, mm1 ;# mm6 = Geps
9553 pfmul mm7, mm2 ;# mm7 = Heps2
9555 pfadd mm5, mm6
9556 pfadd mm5, mm7 ;# mm5 = Fp
9558 pfmul mm7, [esp + i3100_two] ;# two*Heps2
9559 pfadd mm7, mm6
9560 pfadd mm7, mm5 ;# mm7=FF
9562 pfmul mm5, mm1 ;# mm5=eps*Fp
9563 pfadd mm5, mm4 ;# mm5= VV
9565 pfmul mm5, mm3 ;# vcoul=qq*VV
9566 pfmul mm3, mm7 ;# fijC=FF*qq
9568 movq mm1, mm0
9569 pfmul mm1,mm1 ;# mm1=invsq
9570 movq mm2, mm1
9571 pfmul mm2,mm1
9572 pfmul mm2,mm1 ;# mm2=rinvsix
9573 movq mm1,mm2
9574 pfmul mm1,mm1 ;# mm1=rinvtwelve
9576 pfmul mm3, [esp + i3100_tsc]
9578 pfmul mm1, [esp + i3100_c12]
9580 pfmul mm2, [esp + i3100_c6]
9582 movq mm4, mm1
9583 pfsub mm4, mm2 ;# mm4 = vnb12-vnb6
9585 pfmul mm2, [esp + i3100_six]
9586 pfmul mm1, [esp + i3100_twelve]
9588 pfsub mm1, mm2
9589 pfmul mm1, mm0 ;# mm1= (12*vnb12-6*vnb6)*rinv
9591 pfsub mm1, mm3 ;# subtract fijC*tabscale
9593 ;# update vctot
9594 pfadd mm5, [esp + i3100_vctot] ;# add the earlier value
9595 movq [esp + i3100_vctot], mm5 ;# store the sum
9597 pfmul mm0, mm1 ;# mm0 is total fscal now
9599 prefetchw [esp + i3100_dx1] ;# prefetch the stored dr (stack slot next to the i forces) to cache
9601 ;# spread fscalar to both positions
9602 movq mm1,mm0
9603 punpckldq mm0,mm0
9604 punpckhdq mm1,mm1
9606 ;# calc vector force
9607 prefetchw [edi + eax*4] ;# prefetch the 1st faction to cache
9608 movq mm2, [esp + i3100_dx1] ;# fetch dr
9609 movd mm3, [esp + i3100_dz1]
9611 ;# update vnbtot
9612 pfadd mm4, [esp + i3100_vnbtot] ;# add the earlier value
9613 movq [esp + i3100_vnbtot], mm4 ;# store the sum
9615 prefetchw [edi + ebx*4] ;# prefetch the 2nd faction to cache
9616 pfmul mm2, mm0 ;# mult by fs
9617 pfmul mm3, mm0
9619 movq mm4, [esp + i3100_dx2] ;# fetch dr
9620 movd mm5, [esp + i3100_dz2]
9621 pfmul mm4, mm1 ;# mult by fs
9622 pfmul mm5, mm1
9623 ;# update i forces
9625 movq mm0, [esp + i3100_fix]
9626 movd mm1, [esp + i3100_fiz]
9627 pfadd mm0, mm2
9628 pfadd mm1, mm3
9630 pfadd mm0, mm4
9631 pfadd mm1, mm5
9632 movq [esp + i3100_fix], mm0
9633 movd [esp + i3100_fiz], mm1
9634 ;# update j forces
9636 movq mm0, [edi + eax*4]
9637 movd mm1, [edi + eax*4 + 8]
9638 movq mm6, [edi + ebx*4]
9639 movd mm7, [edi + ebx*4 + 8]
9641 pfsub mm0, mm2
9642 pfsub mm1, mm3
9643 pfsub mm6, mm4
9644 pfsub mm7, mm5
9646 movq [edi + eax*4], mm0
9647 movd [edi + eax*4 +8], mm1
9648 movq [edi + ebx*4], mm6
9649 movd [edi + ebx*4 + 8], mm7
9651 ;# should we do one more iteration?
9652 sub dword ptr [esp + i3100_innerk], 2
9653 jl .i3100_finish_inner
9654 jmp .i3100_unroll_loop
9655 .i3100_finish_inner:
;# innerk is -1 when one j particle is left over and -2 when none is; bit 0 tells them apart
9656 and dword ptr [esp + i3100_innerk], 1
9657 jnz .i3100_single_inner
9658 jmp .i3100_updateouterdata
9659 .i3100_single_inner:
9660 ;# a single j particle iteration here - compare with the unrolled code for comments.
9661 mov eax, [esp + i3100_innerjjnr]
9662 mov eax, [eax] ;# eax=jnr offset
9664 mov ecx, [ebp + i3100_charge]
9665 movd mm5, [esp + i3100_iq]
9666 movd mm3, [ecx + eax*4]
9667 pfmul mm3, mm5 ;# mm3=qq
9669 mov esi, [ebp + i3100_nbfp]
9670 mov ecx, [ebp + i3100_type]
9671 mov edx, [ecx + eax*4] ;# type [jnr1]
9672 shl edx, 1
9673 add edx, [esp + i3100_ntia] ;# tja = ntia + 2*type
9674 movd mm5, [esi + edx*4] ;# mm5 = 1st c6
9675 movq [esp + i3100_c6], mm5
9676 movd mm5, [esi + edx*4 + 4] ;# mm5 = 1st c12
9677 movq [esp + i3100_c12], mm5
9680 mov esi, [ebp + i3100_pos]
9681 lea eax, [eax + eax*2]
9683 movq mm0, [esp + i3100_ix]
9684 movd mm1, [esp + i3100_iz]
9685 movq mm4, [esi + eax*4]
9686 movd mm5, [esi + eax*4 + 8]
9687 pfsubr mm4, mm0
9688 pfsubr mm5, mm1
9689 movq [esp + i3100_dx1], mm4
9690 pfmul mm4,mm4
9691 movd [esp + i3100_dz1], mm5
9692 pfmul mm5,mm5
9693 pfacc mm4, mm5
9694 pfacc mm4, mm5 ;# mm4=rsq
9696 pfrsqrt mm0,mm4
9697 movq mm2,mm0
9698 pfmul mm0,mm0
9699 pfrsqit1 mm0,mm4
9700 pfrcpit2 mm0,mm2 ;# mm0=invsqrt
9701 pfmul mm4, mm0
9702 movq mm1, mm4
9703 ;# mm0 is invsqrt, and mm1 r.
9704 ;# calculate potentials and scalar force
9705 pfmul mm1, [esp + i3100_tsc] ;# mm1=rt
9706 pf2iw mm4,mm1
9707 movd [esp + i3100_n1], mm4
9708 pi2fd mm4,mm4
9709 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
9711 movq mm2,mm1
9712 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
9714 ;# coulomb table
9715 mov edx, [ebp + i3100_VFtab]
9716 mov ecx, [esp + i3100_n1]
9717 shl ecx, 2
9718 ;# load all the table values we need
9719 movd mm4, [edx + ecx*4]
9720 movd mm5, [edx + ecx*4 + 4]
9721 movd mm6, [edx + ecx*4 + 8]
9722 movd mm7, [edx + ecx*4 + 12]
9724 pfmul mm6, mm1 ;# mm6 = Geps
9725 pfmul mm7, mm2 ;# mm7 = Heps2
9727 pfadd mm5, mm6
9728 pfadd mm5, mm7 ;# mm5 = Fp
9730 pfmul mm7, [esp + i3100_two] ;# two*Heps2
9731 pfadd mm7, mm6
9732 pfadd mm7, mm5 ;# mm7=FF
9734 pfmul mm5, mm1 ;# mm5=eps*Fp
9735 pfadd mm5, mm4 ;# mm5= VV
9737 pfmul mm5, mm3 ;# vcoul=qq*VV
9738 pfmul mm3, mm7 ;# fijC=FF*qq
9740 ;# at this point mm5 contains vcoul and mm3 fijC
9742 movq mm1, mm0
9743 pfmul mm1,mm1 ;# mm1=invsq
9744 movq mm2, mm1
9745 pfmul mm2,mm1
9746 pfmul mm2,mm1 ;# mm2=rinvsix
9747 movq mm1,mm2
9748 pfmul mm1,mm1 ;# mm1=rinvtwelve
9750 pfmul mm3, [esp + i3100_tsc]
9752 pfmul mm1, [esp + i3100_c12]
9754 pfmul mm2, [esp + i3100_c6]
9756 movq mm4, mm1
9757 pfsub mm4, mm2 ;# mm4 = vnb12-vnb6
9759 pfmul mm2, [esp + i3100_six]
9760 pfmul mm1, [esp + i3100_twelve]
9762 pfsub mm1, mm2
9763 pfmul mm1, mm0 ;# mm1= (12*vnb12-6*vnb6)*rinv
9765 pfsub mm1, mm3 ;# subtract fijC*tabscale
9767 ;# update vctot
9768 pfadd mm5, [esp + i3100_vctot] ;# add the earlier value
9769 movq [esp + i3100_vctot], mm5 ;# store the sum
9771 pfmul mm0, mm1 ;# mm0 is total fscal now
9773 ;# spread fscalar to both positions
9774 punpckldq mm0,mm0
9775 ;# calc vectorial force
9776 prefetchw [edi + eax*4] ;# prefetch faction to cache
9777 movq mm2, [esp + i3100_dx1]
9778 movd mm3, [esp + i3100_dz1]
9780 ;# update vnbtot
9781 pfadd mm4, [esp + i3100_vnbtot] ;# add the earlier value
9782 movq [esp + i3100_vnbtot], mm4 ;# store the sum
9784 pfmul mm2, mm0
9785 pfmul mm3, mm0
9787 ;# update i particle force
9788 movq mm0, [esp + i3100_fix]
9789 movd mm1, [esp + i3100_fiz]
9790 pfadd mm0, mm2
9791 pfadd mm1, mm3
9792 movq [esp + i3100_fix], mm0
9793 movd [esp + i3100_fiz], mm1
9794 ;# update j particle force
9795 movq mm0, [edi + eax*4]
9796 movd mm1, [edi + eax *4+ 8]
9797 pfsub mm0, mm2
9798 pfsub mm1, mm3
9799 movq [edi + eax*4], mm0
9800 movd [edi + eax*4 +8], mm1
9801 ;# done!
9802 .i3100_updateouterdata:
9803 mov ecx, [esp + i3100_ii3]
9805 movq mm6, [edi + ecx*4] ;# increment i force
9806 movd mm7, [edi + ecx*4 + 8]
9807 pfadd mm6, [esp + i3100_fix]
9808 pfadd mm7, [esp + i3100_fiz]
9809 movq [edi + ecx*4], mm6
9810 movd [edi + ecx*4 +8], mm7
9812 mov ebx, [ebp + i3100_fshift] ;# increment fshift force
9813 mov edx, [esp + i3100_is3]
9815 movq mm6, [ebx + edx*4]
9816 movd mm7, [ebx + edx*4 + 8]
9817 pfadd mm6, [esp + i3100_fix]
9818 pfadd mm7, [esp + i3100_fiz]
9819 movq [ebx + edx*4], mm6
9820 movd [ebx + edx*4 + 8], mm7
9822 mov edx, [ebp + i3100_gid] ;# get group index for this i particle
9823 mov edx, [edx]
9824 add dword ptr [ebp + i3100_gid], 4 ;# advance pointer
9826 movq mm7, [esp + i3100_vctot]
9827 pfacc mm7,mm7 ;# get and sum the two parts of total potential
9829 mov eax, [ebp + i3100_Vc]
9830 movd mm6, [eax + edx*4]
9831 pfadd mm6, mm7
9832 movd [eax + edx*4], mm6 ;# increment vc[gid]
9834 movq mm7, [esp + i3100_vnbtot]
9835 pfacc mm7,mm7 ;# get and sum the two parts of total potential
9837 mov eax, [ebp + i3100_Vnb]
9838 movd mm6, [eax + edx*4]
9839 pfadd mm6, mm7
9840 movd [eax + edx*4], mm6 ;# increment vnb[gid]
9842 ;# finish if last
9843 mov ecx, [ebp + i3100_nri]
9844 dec ecx
9845 jecxz .i3100_end
9846 ;# not last, iterate once more!
9847 mov [ebp + i3100_nri], ecx
9848 jmp .i3100_outer
9849 .i3100_end:
9850 femms
9851 add esp, 148
9852 pop edi
9853 pop esi
9854 pop edx
9855 pop ecx
9856 pop ebx
9857 pop eax
9858 leave
;# return to the caller - without this, execution would fall through into inl3110_3dnow
;# NOTE(review): plain ret assumes the caller removes the arguments (cdecl) - confirm
ret
9867 .globl inl3110_3dnow
9868 .globl _inl3110_3dnow
9869 inl3110_3dnow:
9870 _inl3110_3dnow:
9871 .equiv i3110_nri, 8
9872 .equiv i3110_iinr, 12
9873 .equiv i3110_jindex, 16
9874 .equiv i3110_jjnr, 20
9875 .equiv i3110_shift, 24
9876 .equiv i3110_shiftvec, 28
9877 .equiv i3110_fshift, 32
9878 .equiv i3110_gid, 36
9879 .equiv i3110_pos, 40
9880 .equiv i3110_faction, 44
9881 .equiv i3110_charge, 48
9882 .equiv i3110_facel, 52
9883 .equiv i3110_Vc, 56
9884 .equiv i3110_type, 60
9885 .equiv i3110_ntype, 64
9886 .equiv i3110_nbfp, 68
9887 .equiv i3110_Vnb, 72
9888 .equiv i3110_tabscale, 76
9889 .equiv i3110_VFtab, 80
9890 .equiv i3110_nsatoms, 84
9891 ;# stack offsets for local variables
9892 .equiv i3110_is3, 0
9893 .equiv i3110_ii3, 4
9894 .equiv i3110_shX, 8
9895 .equiv i3110_shY, 12
9896 .equiv i3110_shZ, 16
9897 .equiv i3110_ix, 20
9898 .equiv i3110_iy, 24
9899 .equiv i3110_iz, 28
9900 .equiv i3110_iq, 32
9901 .equiv i3110_vctot, 40
9902 .equiv i3110_vnbtot, 48
9903 .equiv i3110_c6, 56
9904 .equiv i3110_c12, 64
9905 .equiv i3110_six, 72
9906 .equiv i3110_twelve, 80
9907 .equiv i3110_two, 88
9908 .equiv i3110_n1, 96
9909 .equiv i3110_tsc, 104
9910 .equiv i3110_ntia, 112
9911 .equiv i3110_innerjjnr0, 116
9912 .equiv i3110_innerk0, 120
9913 .equiv i3110_innerjjnr, 124
9914 .equiv i3110_innerk, 128
9915 .equiv i3110_fix, 132
9916 .equiv i3110_fiy, 136
9917 .equiv i3110_fiz, 140
9918 .equiv i3110_dx1, 144
9919 .equiv i3110_dy1, 148
9920 .equiv i3110_dz1, 152
9921 .equiv i3110_dx2, 156
9922 .equiv i3110_dy2, 160
9923 .equiv i3110_dz2, 164
9924 .equiv i3110_nsvdwc, 168
9925 .equiv i3110_nscoul, 172
9926 .equiv i3110_nsvdw, 176
9927 .equiv i3110_solnr, 180
9928 push ebp
9929 mov ebp,esp
9930 push eax
9931 push ebx
9932 push ecx
9933 push edx
9934 push esi
9935 push edi
9936 sub esp, 184 ;# local stack space
9937 femms
9938 movq mm0, [mm_six]
9939 movq mm1, [mm_twelve]
9940 movq [esp + i3110_six], mm0
9941 movq [esp + i3110_twelve], mm1
9942 movq mm2, [mm_two]
9943 movd mm3, [ebp + i3110_tabscale]
9944 movq [esp + i3110_two], mm2
9945 punpckldq mm3,mm3
9946 movq [esp + i3110_tsc], mm3
9947 ;# assume we have at least one i particle - start directly
9948 .i3110_outer:
9949 mov eax, [ebp + i3110_shift] ;# eax = pointer into shift[]
9950 mov ebx, [eax] ;# ebx=shift[n]
9951 add dword ptr [ebp + i3110_shift], 4 ;# advance pointer one step
9953 lea ebx, [ebx + ebx*2] ;# ebx=3*is
9954 mov [esp + i3110_is3],ebx ;# store is3
9956 mov eax, [ebp + i3110_shiftvec] ;# eax = base of shiftvec[]
9958 movq mm0, [eax + ebx*4] ;# move shX/shY to mm0 and shZ to mm1
9959 movd mm1, [eax + ebx*4 + 8]
9960 movq [esp + i3110_shX], mm0
9961 movd [esp + i3110_shZ], mm1
9963 mov ecx, [ebp + i3110_iinr] ;# ecx = pointer into iinr[]
9964 add dword ptr [ebp + i3110_iinr], 4 ;# advance pointer
9965 mov ebx, [ecx] ;# ebx=ii
9967 mov eax, [ebp + i3110_nsatoms]
9968 add dword ptr [ebp + i3110_nsatoms], 12
9969 mov ecx, [eax]
9970 mov edx, [eax + 4]
9971 mov eax, [eax + 8]
9972 sub ecx, eax
9973 sub eax, edx
9975 mov [esp + i3110_nsvdwc], edx
9976 mov [esp + i3110_nscoul], eax
9977 mov [esp + i3110_nsvdw], ecx
9979 ;# clear potential
9980 pxor mm7,mm7
9981 movq [esp + i3110_vctot], mm7
9982 movq [esp + i3110_vnbtot], mm7
9983 mov [esp + i3110_solnr], ebx
9985 mov eax, [ebp + i3110_jindex]
9986 mov ecx, [eax] ;# jindex[n]
9987 mov edx, [eax + 4] ;# jindex[n+1]
9988 add dword ptr [ebp + i3110_jindex], 4
9989 sub edx, ecx ;# number of innerloop atoms
9990 mov eax, [ebp + i3110_jjnr]
9991 shl ecx, 2
9992 add eax, ecx
9993 mov [esp + i3110_innerjjnr0], eax ;# pointer to jjnr[nj0]
9995 mov [esp + i3110_innerk0], edx ;# number of innerloop atoms
9996 mov esi, [ebp + i3110_pos]
9997 mov edi, [ebp + i3110_faction]
9999 mov ecx, [esp + i3110_nsvdwc]
10000 cmp ecx, 0
10001 jnz .i3110_mno_vdwc
10002 jmp .i3110_testcoul
10003 .i3110_mno_vdwc:
10004 mov ebx, [esp + i3110_solnr]
10005 inc dword ptr [esp + i3110_solnr]
10006 mov edx, [ebp + i3110_charge]
10007 movd mm2, [edx + ebx*4] ;# mm2=charge[ii]
10008 pfmul mm2, [ebp + i3110_facel]
10009 punpckldq mm2,mm2 ;# spread to both halves
10010 movq [esp + i3110_iq], mm2 ;# iq =facel*charge[ii]
10012 mov edx, [ebp + i3110_type]
10013 mov edx, [edx + ebx*4]
10014 imul edx, [ebp + i3110_ntype]
10015 shl edx, 1
10016 mov [esp + i3110_ntia], edx
10018 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
10019 mov eax, [ebp + i3110_pos] ;# eax = base of pos[]
10020 mov [esp + i3110_ii3], ebx
10022 movq mm0, [eax + ebx*4]
10023 movd mm1, [eax + ebx*4 + 8]
10024 pfadd mm0, [esp + i3110_shX]
10025 pfadd mm1, [esp + i3110_shZ]
10026 movq [esp + i3110_ix], mm0
10027 movd [esp + i3110_iz], mm1
10029 ;# clear forces
10030 pxor mm7,mm7
10031 movq [esp + i3110_fix], mm7
10032 movd [esp + i3110_fiz], mm7
10034 mov ecx, [esp + i3110_innerjjnr0]
10035 mov [esp + i3110_innerjjnr], ecx
10036 mov edx, [esp + i3110_innerk0]
10037 sub edx, 2
10038 mov [esp + i3110_innerk], edx ;# number of innerloop atoms
10039 jge .i3110_unroll_vdwc_loop
10040 jmp .i3110_finish_vdwc_inner
10041 .i3110_unroll_vdwc_loop:
10042 ;# paired innerloop starts here
10043 mov ecx, [esp + i3110_innerjjnr] ;# pointer to jjnr[k]
10044 mov eax, [ecx]
10045 mov ebx, [ecx + 4] ;# eax/ebx=jnr
10046 add dword ptr [esp + i3110_innerjjnr], 8 ;# advance pointer (unrolled 2)
10047 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
10049 mov ecx, [ebp + i3110_charge] ;# base of charge[]
10050 movq mm5, [esp + i3110_iq]
10051 movd mm3, [ecx + eax*4] ;# charge[jnr1]
10052 punpckldq mm3, [ecx + ebx*4] ;# move charge 2 to high part of mm3
10053 pfmul mm3,mm5 ;# mm3 now has qq for both particles
10055 mov ecx, [ebp + i3110_type]
10056 mov edx, [ecx + eax*4] ;# type [jnr1]
10057 mov ecx, [ecx + ebx*4] ;# type [jnr2]
10059 mov esi, [ebp + i3110_nbfp] ;# base of nbfp
10060 shl edx, 1
10061 shl ecx, 1
10062 add edx, [esp + i3110_ntia] ;# tja = ntia + 2*type
10063 add ecx, [esp + i3110_ntia]
10065 movq mm5, [esi + edx*4] ;# mm5 = 1st c6 / c12
10066 movq mm7, [esi + ecx*4] ;# mm7 = 2nd c6 / c12
10067 movq mm6,mm5
10068 punpckldq mm5,mm7 ;# mm5 = 1st c6 / 2nd c6
10069 punpckhdq mm6,mm7 ;# mm6 = 1st c12 / 2nd c12
10070 movq [esp + i3110_c6], mm5
10071 movq [esp + i3110_c12], mm6
10073 lea eax, [eax + eax*2] ;# replace jnr with j3
10074 lea ebx, [ebx + ebx*2]
10076 mov esi, [ebp + i3110_pos]
10078 movq mm0, [esp + i3110_ix]
10079 movd mm1, [esp + i3110_iz]
10080 movq mm4, [esi + eax*4] ;# fetch first j coordinates
10081 movd mm5, [esi + eax*4 + 8]
10082 pfsubr mm4,mm0 ;# dr = ir - jr
10083 pfsubr mm5,mm1
10084 movq [esp + i3110_dx1], mm4 ;# store dr
10085 movd [esp + i3110_dz1], mm5
10086 pfmul mm4,mm4 ;# square dx,dy,dz
10087 pfmul mm5,mm5
10088 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
10089 pfacc mm4, mm5 ;# first rsq in lower mm4
10091 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
10092 movd mm7, [esi + ebx*4 + 8]
10094 pfsubr mm6,mm0 ;# dr = ir - jr
10095 pfsubr mm7,mm1
10096 movq [esp + i3110_dx2], mm6 ;# store dr
10097 movd [esp + i3110_dz2], mm7
10098 pfmul mm6,mm6 ;# square dx,dy,dz
10099 pfmul mm7,mm7
10100 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
10101 pfacc mm6, mm7 ;# second rsq in lower mm6
10103 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
10104 pfrsqrt mm1, mm6
10107 punpckldq mm0,mm1
10108 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs.
10109 movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision.
10110 pfmul mm0,mm0
10111 pfrsqit1 mm0,mm4
10112 pfrcpit2 mm0,mm2
10113 pfmul mm4, mm0
10114 movq mm1, mm4
10115 ;# mm0 is invsqrt, and mm1 r.
10116 ;# do potential and fscal
10117 pfmul mm1, [esp + i3110_tsc] ;# mm1=rt
10118 pf2iw mm4,mm1
10119 movq [esp + i3110_n1], mm4
10120 pi2fd mm4,mm4
10121 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
10123 movq mm2,mm1
10124 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
10126 mov edx, [ebp + i3110_VFtab]
10127 mov ecx, [esp + i3110_n1]
10128 shl ecx, 2
10129 ;# coulomb table
10130 ;# load all the table values we need
10131 movd mm4, [edx + ecx*4]
10132 movd mm5, [edx + ecx*4 + 4]
10133 movd mm6, [edx + ecx*4 + 8]
10134 movd mm7, [edx + ecx*4 + 12]
10135 mov ecx, [esp + i3110_n1 + 4]
10136 shl ecx, 2
10137 punpckldq mm4, [edx + ecx*4]
10138 punpckldq mm5, [edx + ecx*4 + 4]
10139 punpckldq mm6, [edx + ecx*4 + 8]
10140 punpckldq mm7, [edx + ecx*4 + 12]
10142 pfmul mm6, mm1 ;# mm6 = Geps
10143 pfmul mm7, mm2 ;# mm7 = Heps2
10145 pfadd mm5, mm6
10146 pfadd mm5, mm7 ;# mm5 = Fp
10148 pfmul mm7, [esp + i3110_two] ;# two*Heps2
10149 pfadd mm7, mm6
10150 pfadd mm7, mm5 ;# mm7=FF
10152 pfmul mm5, mm1 ;# mm5=eps*Fp
10153 pfadd mm5, mm4 ;# mm5= VV
10155 pfmul mm5, mm3 ;# vcoul=qq*VV
10156 pfmul mm3, mm7 ;# fijC=FF*qq
10158 movq mm1, mm0
10159 pfmul mm1,mm1 ;# mm1=invsq
10160 movq mm2, mm1
10161 pfmul mm2,mm1
10162 pfmul mm2,mm1 ;# mm2=rinvsix
10163 movq mm1,mm2
10164 pfmul mm1,mm1 ;# mm1=rinvtwelve
10166 pfmul mm3, [esp + i3110_tsc]
10168 pfmul mm1, [esp + i3110_c12]
10170 pfmul mm2, [esp + i3110_c6]
10172 movq mm4, mm1
10173 pfsub mm4, mm2 ;# mm4 = vnb12-vnb6
10175 pfmul mm2, [esp + i3110_six]
10176 pfmul mm1, [esp + i3110_twelve]
10178 pfsub mm1, mm2
10179 pfmul mm1, mm0 ;# mm1= (12*vnb12-6*vnb6)*rinv11
10181 pfsub mm1, mm3
10183 ;# update vctot
10184 pfadd mm5, [esp + i3110_vctot] ;# add the earlier value
10185 movq [esp + i3110_vctot], mm5 ;# store the sum
10187 pfmul mm0, mm1 ;# mm0 is total fscal now
10189 prefetchw [esp + i3110_dx1] ;# prefetch i forces to cache
10191 ;# spread fscalar to both positions
10192 movq mm1,mm0
10193 punpckldq mm0,mm0
10194 punpckhdq mm1,mm1
10196 ;# calc vector force
10197 prefetchw [edi + eax*4] ;# prefetch the 1st faction to cache
10198 movq mm2, [esp + i3110_dx1] ;# fetch dr
10199 movd mm3, [esp + i3110_dz1]
10201 ;# update vnbtot
10202 pfadd mm4, [esp + i3110_vnbtot] ;# add the earlier value
10203 movq [esp + i3110_vnbtot], mm4 ;# store the sum
10205 prefetchw [edi + ebx*4] ;# prefetch the 2nd faction to cache
10206 pfmul mm2, mm0 ;# mult by fs
10207 pfmul mm3, mm0
10209 movq mm4, [esp + i3110_dx2] ;# fetch dr
10210 movd mm5, [esp + i3110_dz2]
10211 pfmul mm4, mm1 ;# mult by fs
10212 pfmul mm5, mm1
10213 ;# update i forces
10215 movq mm0, [esp + i3110_fix]
10216 movd mm1, [esp + i3110_fiz]
10217 pfadd mm0, mm2
10218 pfadd mm1, mm3
10220 pfadd mm0, mm4
10221 pfadd mm1, mm5
10222 movq [esp + i3110_fix], mm0
10223 movd [esp + i3110_fiz], mm1
10224 ;# update j forces
10226 movq mm0, [edi + eax*4]
10227 movd mm1, [edi + eax*4 + 8]
10228 movq mm6, [edi + ebx*4]
10229 movd mm7, [edi + ebx*4 + 8]
10231 pfsub mm0, mm2
10232 pfsub mm1, mm3
10233 pfsub mm6, mm4
10234 pfsub mm7, mm5
10236 movq [edi + eax*4], mm0
10237 movd [edi + eax*4 +8], mm1
10238 movq [edi + ebx*4], mm6
10239 movd [edi + ebx*4 + 8], mm7
10241 ;# should we do one more iteration?
10242 sub dword ptr [esp + i3110_innerk], 2
10243 jl .i3110_finish_vdwc_inner
10244 jmp .i3110_unroll_vdwc_loop
10245 .i3110_finish_vdwc_inner:
10246 and dword ptr [esp + i3110_innerk], 1
10247 jnz .i3110_single_vdwc_inner
10248 jmp .i3110_updateouterdata_vdwc
10249 .i3110_single_vdwc_inner:
10250 ;# a single j particle iteration here - compare with the unrolled code for comments.
10251 mov eax, [esp + i3110_innerjjnr]
10252 mov eax, [eax] ;# eax=jnr offset
10254 mov ecx, [ebp + i3110_charge]
10255 movd mm5, [esp + i3110_iq]
10256 movd mm3, [ecx + eax*4]
10257 pfmul mm3, mm5 ;# mm3=qq
10259 mov esi, [ebp + i3110_nbfp]
10260 mov ecx, [ebp + i3110_type]
10261 mov edx, [ecx + eax*4] ;# type [jnr1]
10262 shl edx, 1
10263 add edx, [esp + i3110_ntia] ;# tja = ntia + 2*type
10264 movd mm5, [esi + edx*4] ;# mm5 = 1st c6
10265 movq [esp + i3110_c6], mm5
10266 movd mm5, [esi + edx*4 + 4] ;# mm5 = 1st c12
10267 movq [esp + i3110_c12], mm5
10270 mov esi, [ebp + i3110_pos]
10271 lea eax, [eax + eax*2]
10273 movq mm0, [esp + i3110_ix]
10274 movd mm1, [esp + i3110_iz]
10275 movq mm4, [esi + eax*4]
10276 movd mm5, [esi + eax*4 + 8]
10277 pfsubr mm4, mm0
10278 pfsubr mm5, mm1
10279 movq [esp + i3110_dx1], mm4
10280 pfmul mm4,mm4
10281 movd [esp + i3110_dz1], mm5
10282 pfmul mm5,mm5
10283 pfacc mm4, mm5
10284 pfacc mm4, mm5 ;# mm4=rsq
10286 pfrsqrt mm0,mm4
10287 movq mm2,mm0
10288 pfmul mm0,mm0
10289 pfrsqit1 mm0,mm4
10290 pfrcpit2 mm0,mm2 ;# mm1=invsqrt
10291 pfmul mm4, mm0
10292 movq mm1, mm4
10293 ;# mm0 is invsqrt, and mm1 r.
10294 ;# calculate potentials and scalar force
10295 pfmul mm1, [esp + i3110_tsc] ;# mm1=rt
10296 pf2iw mm4,mm1
10297 movd [esp + i3110_n1], mm4
10298 pi2fd mm4,mm4
10299 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
10301 movq mm2,mm1
10302 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
10304 ;# coulomb table
10305 mov edx, [ebp + i3110_VFtab]
10306 mov ecx, [esp + i3110_n1]
10307 shl ecx, 2
10308 ;# load all the table values we need
10309 movd mm4, [edx + ecx*4]
10310 movd mm5, [edx + ecx*4 + 4]
10311 movd mm6, [edx + ecx*4 + 8]
10312 movd mm7, [edx + ecx*4 + 12]
10314 pfmul mm6, mm1 ;# mm6 = Geps
10315 pfmul mm7, mm2 ;# mm7 = Heps2
10317 pfadd mm5, mm6
10318 pfadd mm5, mm7 ;# mm5 = Fp
10320 pfmul mm7, [esp + i3110_two] ;# two*Heps2
10321 pfadd mm7, mm6
10322 pfadd mm7, mm5 ;# mm7=FF
10324 pfmul mm5, mm1 ;# mm5=eps*Fp
10325 pfadd mm5, mm4 ;# mm5= VV
10327 pfmul mm5, mm3 ;# vcoul=qq*VV
10328 pfmul mm3, mm7 ;# fijC=FF*qq
10330 movq mm1, mm0
10331 pfmul mm1,mm1 ;# mm1=invsq
10332 movq mm2, mm1
10333 pfmul mm2,mm1
10334 pfmul mm2,mm1 ;# mm2=rinvsix
10335 movq mm1,mm2
10336 pfmul mm1,mm1 ;# mm1=rinvtwelve
10338 pfmul mm3, [esp + i3110_tsc]
10340 pfmul mm1, [esp + i3110_c12]
10342 pfmul mm2, [esp + i3110_c6]
10344 movq mm4, mm1
10345 pfsub mm4, mm2 ;# mm4 = vnb12-vnb6
10347 pfmul mm2, [esp + i3110_six]
10348 pfmul mm1, [esp + i3110_twelve]
10350 pfsub mm1, mm2
10351 pfmul mm1, mm0 ;# mm1= (12*vnb12-6*vnb6)*rinv11
10353 pfsub mm1, mm3
10355 ;# update vctot
10356 pfadd mm5, [esp + i3110_vctot] ;# add the earlier value
10357 movq [esp + i3110_vctot], mm5 ;# store the sum
10359 pfmul mm0, mm1 ;# mm0 is total fscal now
10361 ;# spread fscalar to both positions
10362 punpckldq mm0,mm0
10363 ;# calc vectorial force
10364 prefetchw [edi + eax*4] ;# prefetch faction to cache
10365 movq mm2, [esp + i3110_dx1]
10366 movd mm3, [esp + i3110_dz1]
10368 ;# update vnbtot
10369 pfadd mm4, [esp + i3110_vnbtot] ;# add the earlier value
10370 movq [esp + i3110_vnbtot], mm4 ;# store the sum
10372 pfmul mm2, mm0
10373 pfmul mm3, mm0
10375 ;# update i particle force
10376 movq mm0, [esp + i3110_fix]
10377 movd mm1, [esp + i3110_fiz]
10378 pfadd mm0, mm2
10379 pfadd mm1, mm3
10380 movq [esp + i3110_fix], mm0
10381 movd [esp + i3110_fiz], mm1
10382 ;# update j particle force
10383 movq mm0, [edi + eax*4]
10384 movd mm1, [edi + eax *4+ 8]
10385 pfsub mm0, mm2
10386 pfsub mm1, mm3
10387 movq [edi + eax*4], mm0
10388 movd [edi + eax*4 +8], mm1
10389 ;# done!
10390 .i3110_updateouterdata_vdwc:
10391 mov ecx, [esp + i3110_ii3]
10393 movq mm6, [edi + ecx*4] ;# increment i force
10394 movd mm7, [edi + ecx*4 + 8]
10395 pfadd mm6, [esp + i3110_fix]
10396 pfadd mm7, [esp + i3110_fiz]
10397 movq [edi + ecx*4], mm6
10398 movd [edi + ecx*4 +8], mm7
10400 mov ebx, [ebp + i3110_fshift] ;# increment fshift force
10401 mov edx, [esp + i3110_is3]
10403 movq mm6, [ebx + edx*4]
10404 movd mm7, [ebx + edx*4 + 8]
10405 pfadd mm6, [esp + i3110_fix]
10406 pfadd mm7, [esp + i3110_fiz]
10407 movq [ebx + edx*4], mm6
10408 movd [ebx + edx*4 + 8], mm7
10410 ;# loop back to mno
10411 dec dword ptr [esp + i3110_nsvdwc]
10412 jz .i3110_testcoul
10413 jmp .i3110_mno_vdwc
10414 .i3110_testcoul:
;# Dispatch: run the coulomb-only atom loop if the nscoul counter is non-zero,
;# otherwise go on to the vdw-only test.
10415 mov ecx, [esp + i3110_nscoul] ;# ecx = remaining coulomb-only atoms
10416 test ecx, ecx ;# sets ZF iff ecx == 0
10417 jnz .i3110_mno_coul ;# near target: conditional jump
10418 jmp .i3110_testvdw ;# far target: unconditional jump
10419 .i3110_mno_coul:
10420 mov ebx, [esp + i3110_solnr]
10421 inc dword ptr [esp + i3110_solnr]
10422 mov edx, [ebp + i3110_charge]
10423 movd mm2, [edx + ebx*4] ;# mm2=charge[ii]
10424 pfmul mm2, [ebp + i3110_facel]
10425 punpckldq mm2,mm2 ;# spread to both halves
10426 movq [esp + i3110_iq], mm2 ;# iq =facel*charge[ii]
10428 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
10429 mov eax, [ebp + i3110_pos] ;# eax = base of pos[]
10430 mov [esp + i3110_ii3], ebx
10432 movq mm0, [eax + ebx*4]
10433 movd mm1, [eax + ebx*4 + 8]
10434 pfadd mm0, [esp + i3110_shX]
10435 pfadd mm1, [esp + i3110_shZ]
10436 movq [esp + i3110_ix], mm0
10437 movd [esp + i3110_iz], mm1
10439 ;# clear forces
10440 pxor mm7,mm7
10441 movq [esp + i3110_fix], mm7
10442 movd [esp + i3110_fiz], mm7
10444 mov ecx, [esp + i3110_innerjjnr0]
10445 mov [esp + i3110_innerjjnr], ecx
10446 mov edx, [esp + i3110_innerk0]
10447 sub edx, 2
10448 mov [esp + i3110_innerk], edx ;# number of innerloop atoms
10449 jge .i3110_unroll_coul_loop
10450 jmp .i3110_finish_coul_inner
10451 .i3110_unroll_coul_loop:
10452 ;# paired innerloop starts here
10453 mov ecx, [esp + i3110_innerjjnr] ;# pointer to jjnr[k]
10454 mov eax, [ecx]
10455 mov ebx, [ecx + 4] ;# eax/ebx=jnr
10456 add dword ptr [esp + i3110_innerjjnr], 8 ;# advance pointer (unrolled 2)
10457 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
10459 mov ecx, [ebp + i3110_charge] ;# base of charge[]
10460 movq mm5, [esp + i3110_iq]
10461 movd mm3, [ecx + eax*4] ;# charge[jnr1]
10462 punpckldq mm3, [ecx + ebx*4] ;# move charge 2 to high part of mm3
10463 pfmul mm3,mm5 ;# mm3 now has qq for both particles
10465 lea eax, [eax + eax*2] ;# replace jnr with j3
10466 lea ebx, [ebx + ebx*2]
10468 mov esi, [ebp + i3110_pos]
10470 movq mm0, [esp + i3110_ix]
10471 movd mm1, [esp + i3110_iz]
10472 movq mm4, [esi + eax*4] ;# fetch first j coordinates
10473 movd mm5, [esi + eax*4 + 8]
10474 pfsubr mm4,mm0 ;# dr = ir - jr
10475 pfsubr mm5,mm1
10476 movq [esp + i3110_dx1], mm4 ;# store dr
10477 movd [esp + i3110_dz1], mm5
10478 pfmul mm4,mm4 ;# square dx,dy,dz
10479 pfmul mm5,mm5
10480 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
10481 pfacc mm4, mm5 ;# first rsq in lower mm4
10483 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
10484 movd mm7, [esi + ebx*4 + 8]
10486 pfsubr mm6,mm0 ;# dr = ir - jr
10487 pfsubr mm7,mm1
10488 movq [esp + i3110_dx2], mm6 ;# store dr
10489 movd [esp + i3110_dz2], mm7
10490 pfmul mm6,mm6 ;# square dx,dy,dz
10491 pfmul mm7,mm7
10492 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
10493 pfacc mm6, mm7 ;# second rsq in lower mm6
10495 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
10496 pfrsqrt mm1, mm6
10499 punpckldq mm0,mm1
10500 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs.
10501 movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision.
10502 pfmul mm0,mm0
10503 pfrsqit1 mm0,mm4
10504 pfrcpit2 mm0,mm2
10505 pfmul mm4, mm0
10506 movq mm1, mm4
10507 ;# mm0 is invsqrt, and mm1 r.
10508 ;# do potential and fscal
10509 pfmul mm1, [esp + i3110_tsc] ;# mm1=rt
10510 pf2iw mm4,mm1
10511 movq [esp + i3110_n1], mm4
10512 pi2fd mm4,mm4
10513 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
10515 movq mm2,mm1
10516 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
10518 mov edx, [ebp + i3110_VFtab]
10519 mov ecx, [esp + i3110_n1]
10520 shl ecx, 2
10521 ;# coulomb table
10522 ;# load all the table values we need
10523 movd mm4, [edx + ecx*4]
10524 movd mm5, [edx + ecx*4 + 4]
10525 movd mm6, [edx + ecx*4 + 8]
10526 movd mm7, [edx + ecx*4 + 12]
10527 mov ecx, [esp + i3110_n1 + 4]
10528 shl ecx, 2
10529 punpckldq mm4, [edx + ecx*4]
10530 punpckldq mm5, [edx + ecx*4 + 4]
10531 punpckldq mm6, [edx + ecx*4 + 8]
10532 punpckldq mm7, [edx + ecx*4 + 12]
10534 pfmul mm6, mm1 ;# mm6 = Geps
10535 pfmul mm7, mm2 ;# mm7 = Heps2
10537 pfadd mm5, mm6
10538 pfadd mm5, mm7 ;# mm5 = Fp
10540 pfmul mm7, [esp + i3110_two] ;# two*Heps2
10541 pfadd mm7, mm6
10542 pfadd mm7, mm5 ;# mm7=FF
10544 pfmul mm5, mm1 ;# mm5=eps*Fp
10545 pfadd mm5, mm4 ;# mm5= VV
10547 pfmul mm5, mm3 ;# vcoul=qq*VV
10548 pfmul mm3, mm7 ;# fijC=FF*qq
10550 ;# at this point mm5 contains vcoul and mm3 fijC
10551 ;# increment vcoul - then we can get rid of mm5
10552 ;# update vctot
10553 pfadd mm5, [esp + i3110_vctot] ;# add the earlier value
10554 movq [esp + i3110_vctot], mm5 ;# store the sum
10556 ;# change sign of mm3
10557 pxor mm1,mm1
10558 pfsub mm1, mm3
10559 pfmul mm1, [esp + i3110_tsc]
10560 pfmul mm0, mm1 ;# mm0 is total fscal now
10562 prefetchw [esp + i3110_dx1] ;# prefetch i forces to cache
10564 ;# spread fscalar to both positions
10565 movq mm1,mm0
10566 punpckldq mm0,mm0
10567 punpckhdq mm1,mm1
10569 ;# calc vector force
10570 prefetchw [edi + eax*4] ;# prefetch the 1st faction to cache
10571 movq mm2, [esp + i3110_dx1] ;# fetch dr
10572 movd mm3, [esp + i3110_dz1]
10574 prefetchw [edi + ebx*4] ;# prefetch the 2nd faction to cache
10575 pfmul mm2, mm0 ;# mult by fs
10576 pfmul mm3, mm0
10578 movq mm4, [esp + i3110_dx2] ;# fetch dr
10579 movd mm5, [esp + i3110_dz2]
10580 pfmul mm4, mm1 ;# mult by fs
10581 pfmul mm5, mm1
10582 ;# update i forces
10584 movq mm0, [esp + i3110_fix]
10585 movd mm1, [esp + i3110_fiz]
10586 pfadd mm0, mm2
10587 pfadd mm1, mm3
10589 pfadd mm0, mm4
10590 pfadd mm1, mm5
10591 movq [esp + i3110_fix], mm0
10592 movd [esp + i3110_fiz], mm1
10593 ;# update j forces
10595 movq mm0, [edi + eax*4]
10596 movd mm1, [edi + eax*4 + 8]
10597 movq mm6, [edi + ebx*4]
10598 movd mm7, [edi + ebx*4 + 8]
10600 pfsub mm0, mm2
10601 pfsub mm1, mm3
10602 pfsub mm6, mm4
10603 pfsub mm7, mm5
10605 movq [edi + eax*4], mm0
10606 movd [edi + eax*4 +8], mm1
10607 movq [edi + ebx*4], mm6
10608 movd [edi + ebx*4 + 8], mm7
10610 ;# should we do one more iteration?
10611 sub dword ptr [esp + i3110_innerk], 2
10612 jl .i3110_finish_coul_inner
10613 jmp .i3110_unroll_coul_loop
10614 .i3110_finish_coul_inner:
10615 and dword ptr [esp + i3110_innerk], 1
10616 jnz .i3110_single_coul_inner
10617 jmp .i3110_updateouterdata_coul
10618 .i3110_single_coul_inner:
;# Leftover (odd) j particle of the coulomb-only loop: tabulated Coulomb
;# interaction between the current i atom and one j atom, no Lennard-Jones.
;# Reads  : innerjjnr, iq, ix/iz, tsc, two, vctot, fix/fiz (stack), charge/pos/VFtab (args).
;# Writes : dx1/dz1, n1, vctot, fix/fiz (stack) and faction[j3..j3+2] via edi.
;# Only the low half of each MMX register carries meaningful data here.
10619 ;# a single j particle iteration here - compare with the unrolled code for comments.
10620 mov eax, [esp + i3110_innerjjnr]
10621 mov eax, [eax] ;# eax=jnr offset
10623 mov ecx, [ebp + i3110_charge]
10624 movd mm5, [esp + i3110_iq] ;# mm5 = iq (low half only)
10625 movd mm3, [ecx + eax*4] ;# mm3 = charge[jnr]
10626 pfmul mm3, mm5 ;# mm3=qq
10628 mov esi, [ebp + i3110_pos]
10629 lea eax, [eax + eax*2] ;# eax = 3*jnr = j3
10631 movq mm0, [esp + i3110_ix] ;# mm0 = ix, iy
10632 movd mm1, [esp + i3110_iz] ;# mm1 = iz
10633 movq mm4, [esi + eax*4] ;# mm4 = jx, jy
10634 movd mm5, [esi + eax*4 + 8] ;# mm5 = jz
10635 pfsubr mm4, mm0 ;# dr = ir - jr
10636 pfsubr mm5, mm1
10637 movq [esp + i3110_dx1], mm4
10638 pfmul mm4,mm4
10639 movd [esp + i3110_dz1], mm5
10640 pfmul mm5,mm5
10641 pfacc mm4, mm5
10642 pfacc mm4, mm5 ;# mm4=rsq (low half)
10644 pfrsqrt mm0,mm4 ;# inverse square root seed
10645 movq mm2,mm0
10646 pfmul mm0,mm0
10647 pfrsqit1 mm0,mm4
10648 pfrcpit2 mm0,mm2 ;# mm0=invsqrt
10649 pfmul mm4, mm0
10650 movq mm1, mm4
10651 ;# mm0 is invsqrt, and mm1 r.
10653 ;# calculate potentials and scalar force
10654 pfmul mm1, [esp + i3110_tsc] ;# mm1=rt
10655 pf2iw mm4,mm1
10656 movd [esp + i3110_n1], mm4
10657 pi2fd mm4,mm4
10658 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
10660 movq mm2,mm1
10661 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
10663 ;# coulomb table
10664 mov edx, [ebp + i3110_VFtab]
10665 mov ecx, [esp + i3110_n1]
10666 shl ecx, 2 ;# 4 floats (16 bytes) per table point
10667 ;# load all the table values we need
10668 movd mm4, [edx + ecx*4] ;# 1st table float (added to eps*Fp below to form VV)
10669 movd mm5, [edx + ecx*4 + 4] ;# 2nd table float (base term of Fp)
10670 movd mm6, [edx + ecx*4 + 8] ;# 3rd table float (G)
10671 movd mm7, [edx + ecx*4 + 12] ;# 4th table float (H)
10673 pfmul mm6, mm1 ;# mm6 = Geps
10674 pfmul mm7, mm2 ;# mm7 = Heps2
10676 pfadd mm5, mm6
10677 pfadd mm5, mm7 ;# mm5 = Fp
10679 pfmul mm7, [esp + i3110_two] ;# two*Heps2
10680 pfadd mm7, mm6
10681 pfadd mm7, mm5 ;# mm7=FF
10683 pfmul mm5, mm1 ;# mm5=eps*Fp
10684 pfadd mm5, mm4 ;# mm5= VV
10686 pfmul mm5, mm3 ;# vcoul=qq*VV
10687 pfmul mm3, mm7 ;# fijC=FF*qq
10689 ;# at this point mm5 contains vcoul and mm3 fijC
10690 ;# increment vcoul - then we can get rid of mm5
10691 ;# update vctot
10692 pfadd mm5, [esp + i3110_vctot] ;# add the earlier value
10693 movq [esp + i3110_vctot], mm5 ;# store the sum
10695 ;# change sign of mm3
10696 pxor mm1,mm1
10697 pfsub mm1, mm3 ;# mm1 = -fijC
10698 pfmul mm0, [esp + i3110_tsc] ;# mm0 = invsqrt*tabscale
10699 pfmul mm0, mm1 ;# mm0 is total fscal now
10701 ;# spread fscalar to both positions
10702 punpckldq mm0,mm0
10703 ;# calc vectorial force
10704 prefetchw [edi + eax*4] ;# prefetch faction to cache
10705 movq mm2, [esp + i3110_dx1]
10706 movd mm3, [esp + i3110_dz1]
10709 pfmul mm2, mm0
10710 pfmul mm3, mm0
10712 ;# update i particle force
10713 movq mm0, [esp + i3110_fix]
10714 movd mm1, [esp + i3110_fiz]
10715 pfadd mm0, mm2
10716 pfadd mm1, mm3
10717 movq [esp + i3110_fix], mm0
10718 movd [esp + i3110_fiz], mm1
10719 ;# update j particle force
10720 movq mm0, [edi + eax*4]
10721 movd mm1, [edi + eax *4+ 8]
10722 pfsub mm0, mm2
10723 pfsub mm1, mm3
10724 movq [edi + eax*4], mm0
10725 movd [edi + eax*4 +8], mm1
10726 ;# done!
10727 .i3110_updateouterdata_coul:
10728 mov ecx, [esp + i3110_ii3]
10730 movq mm6, [edi + ecx*4] ;# increment i force
10731 movd mm7, [edi + ecx*4 + 8]
10732 pfadd mm6, [esp + i3110_fix]
10733 pfadd mm7, [esp + i3110_fiz]
10734 movq [edi + ecx*4], mm6
10735 movd [edi + ecx*4 +8], mm7
10737 mov ebx, [ebp + i3110_fshift] ;# increment fshift force
10738 mov edx, [esp + i3110_is3]
10740 movq mm6, [ebx + edx*4]
10741 movd mm7, [ebx + edx*4 + 8]
10742 pfadd mm6, [esp + i3110_fix]
10743 pfadd mm7, [esp + i3110_fiz]
10744 movq [ebx + edx*4], mm6
10745 movd [ebx + edx*4 + 8], mm7
10747 ;# loop back to mno
10748 dec dword ptr [esp + i3110_nscoul]
10749 jz .i3110_testvdw
10750 jmp .i3110_mno_coul
10751 .i3110_testvdw:
;# Dispatch: run the vdw-only atom loop if the nsvdw counter is non-zero,
;# otherwise jump to the per-molecule wrap-up.
10752 mov ecx, [esp + i3110_nsvdw] ;# ecx = remaining vdw-only atoms
10753 test ecx, ecx ;# sets ZF iff ecx == 0
10754 jnz .i3110_mno_vdw ;# near target: conditional jump
10755 jmp .i3110_last_mno ;# far target: unconditional jump
10756 .i3110_mno_vdw:
10757 mov ebx, [esp + i3110_solnr]
10758 inc dword ptr [esp + i3110_solnr]
10760 mov edx, [ebp + i3110_type]
10761 mov edx, [edx + ebx*4]
10762 imul edx, [ebp + i3110_ntype]
10763 shl edx, 1
10764 mov [esp + i3110_ntia], edx
10766 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
10767 mov eax, [ebp + i3110_pos] ;# eax = base of pos[]
10768 mov [esp + i3110_ii3], ebx
10770 movq mm0, [eax + ebx*4]
10771 movd mm1, [eax + ebx*4 + 8]
10772 pfadd mm0, [esp + i3110_shX]
10773 pfadd mm1, [esp + i3110_shZ]
10774 movq [esp + i3110_ix], mm0
10775 movd [esp + i3110_iz], mm1
10777 ;# clear forces
10778 pxor mm7,mm7
10779 movq [esp + i3110_fix], mm7
10780 movd [esp + i3110_fiz], mm7
10782 mov ecx, [esp + i3110_innerjjnr0]
10783 mov [esp + i3110_innerjjnr], ecx
10784 mov edx, [esp + i3110_innerk0]
10785 sub edx, 2
10786 mov [esp + i3110_innerk], edx ;# number of innerloop atoms
10787 jge .i3110_unroll_vdw_loop
10788 jmp .i3110_finish_vdw_inner
10789 .i3110_unroll_vdw_loop:
10790 ;# paired innerloop starts here
10791 mov ecx, [esp + i3110_innerjjnr] ;# pointer to jjnr[k]
10792 mov eax, [ecx]
10793 mov ebx, [ecx + 4] ;# eax/ebx=jnr
10794 add dword ptr [esp + i3110_innerjjnr], 8 ;# advance pointer (unrolled 2)
10795 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
10797 mov ecx, [ebp + i3110_type]
10798 mov edx, [ecx + eax*4] ;# type [jnr1]
10799 mov ecx, [ecx + ebx*4] ;# type [jnr2]
10801 mov esi, [ebp + i3110_nbfp] ;# base of nbfp
10802 shl edx, 1
10803 shl ecx, 1
10804 add edx, [esp + i3110_ntia] ;# tja = ntia + 2*type
10805 add ecx, [esp + i3110_ntia]
10807 movq mm5, [esi + edx*4] ;# mm5 = 1st c6 / c12
10808 movq mm7, [esi + ecx*4] ;# mm7 = 2nd c6 / c12
10809 movq mm6,mm5
10810 punpckldq mm5,mm7 ;# mm5 = 1st c6 / 2nd c6
10811 punpckhdq mm6,mm7 ;# mm6 = 1st c12 / 2nd c12
10812 movq [esp + i3110_c6], mm5
10813 movq [esp + i3110_c12], mm6
10815 lea eax, [eax + eax*2] ;# replace jnr with j3
10816 lea ebx, [ebx + ebx*2]
10818 mov esi, [ebp + i3110_pos]
10820 movq mm0, [esp + i3110_ix]
10821 movd mm1, [esp + i3110_iz]
10822 movq mm4, [esi + eax*4] ;# fetch first j coordinates
10823 movd mm5, [esi + eax*4 + 8]
10824 pfsubr mm4,mm0 ;# dr = ir - jr
10825 pfsubr mm5,mm1
10826 movq [esp + i3110_dx1], mm4 ;# store dr
10827 movd [esp + i3110_dz1], mm5
10828 pfmul mm4,mm4 ;# square dx,dy,dz
10829 pfmul mm5,mm5
10830 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
10831 pfacc mm4, mm5 ;# first rsq in lower mm4
10833 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
10834 movd mm7, [esi + ebx*4 + 8]
10836 pfsubr mm6,mm0 ;# dr = ir - jr
10837 pfsubr mm7,mm1
10838 movq [esp + i3110_dx2], mm6 ;# store dr
10839 movd [esp + i3110_dz2], mm7
10840 pfmul mm6,mm6 ;# square dx,dy,dz
10841 pfmul mm7,mm7
10842 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
10843 pfacc mm6, mm7 ;# second rsq in lower mm6
10845 pfrcp mm0, mm4 ;# lookup reciprocal seed
10846 pfrcp mm1, mm6
10848 punpckldq mm0,mm1
10849 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs.
10850 ;# amd 3dnow N-R iteration to get full precision.
10851 pfrcpit1 mm4,mm0
10852 pfrcpit2 mm4,mm0
10853 ;# mm4 now contains invsq,
10854 ;# do potential and fscal
10856 movq mm0, mm4
10857 pfmul mm4, mm0
10858 pfmul mm4, mm0 ;# mm4=rinvsix
10859 movq mm5, mm4
10860 pfmul mm5, mm5 ;# mm5=rinvtwelve
10862 pfmul mm5, [esp + i3110_c12]
10863 pfmul mm4, [esp + i3110_c6]
10864 movq mm6, mm5 ;# mm6 is vnb12-vnb6
10865 pfsub mm6, mm4
10867 pfmul mm4, [esp + i3110_six]
10869 pfmul mm5, [esp + i3110_twelve]
10870 pfsub mm5,mm4
10871 pfmul mm0, mm5 ;# mm0 is total fscal now
10873 prefetchw [esp + i3110_dx1] ;# prefetch i forces to cache
10875 ;# spread fscalar to both positions
10876 movq mm1,mm0
10877 punpckldq mm0,mm0
10878 punpckhdq mm1,mm1
10880 ;# calc vector force
10881 prefetchw [edi + eax*4] ;# prefetch the 1st faction to cache
10882 movq mm2, [esp + i3110_dx1] ;# fetch dr
10883 movd mm3, [esp + i3110_dz1]
10885 ;# update vnbtot
10886 pfadd mm6, [esp + i3110_vnbtot] ;# add the earlier value
10887 movq [esp + i3110_vnbtot], mm6 ;# store the sum
10889 prefetchw [edi + ebx*4] ;# prefetch the 2nd faction to cache
10890 pfmul mm2, mm0 ;# mult by fs
10891 pfmul mm3, mm0
10893 movq mm4, [esp + i3110_dx2] ;# fetch dr
10894 movd mm5, [esp + i3110_dz2]
10895 pfmul mm4, mm1 ;# mult by fs
10896 pfmul mm5, mm1
10897 ;# update i forces
10899 movq mm0, [esp + i3110_fix]
10900 movd mm1, [esp + i3110_fiz]
10901 pfadd mm0, mm2
10902 pfadd mm1, mm3
10904 pfadd mm0, mm4
10905 pfadd mm1, mm5
10906 movq [esp + i3110_fix], mm0
10907 movd [esp + i3110_fiz], mm1
10908 ;# update j forces
10910 movq mm0, [edi + eax*4]
10911 movd mm1, [edi + eax*4 + 8]
10912 movq mm6, [edi + ebx*4]
10913 movd mm7, [edi + ebx*4 + 8]
10915 pfsub mm0, mm2
10916 pfsub mm1, mm3
10917 pfsub mm6, mm4
10918 pfsub mm7, mm5
10920 movq [edi + eax*4], mm0
10921 movd [edi + eax*4 +8], mm1
10922 movq [edi + ebx*4], mm6
10923 movd [edi + ebx*4 + 8], mm7
10925 ;# should we do one more iteration?
10926 sub dword ptr [esp + i3110_innerk], 2
10927 jl .i3110_finish_vdw_inner
10928 jmp .i3110_unroll_vdw_loop
10929 .i3110_finish_vdw_inner:
10930 and dword ptr [esp + i3110_innerk], 1
10931 jnz .i3110_single_vdw_inner
10932 jmp .i3110_updateouterdata_vdw
10933 .i3110_single_vdw_inner:
10934 ;# a single j particle iteration here - compare with the unrolled code for comments.
10935 mov eax, [esp + i3110_innerjjnr]
10936 mov eax, [eax] ;# eax=jnr offset
10938 mov esi, [ebp + i3110_nbfp]
10939 mov ecx, [ebp + i3110_type]
10940 mov edx, [ecx + eax*4] ;# type [jnr1]
10941 shl edx, 1
10942 add edx, [esp + i3110_ntia] ;# tja = ntia + 2*type
10943 movd mm5, [esi + edx*4] ;# mm5 = 1st c6
10944 movq [esp + i3110_c6], mm5
10945 movd mm5, [esi + edx*4 + 4] ;# mm5 = 1st c12
10946 movq [esp + i3110_c12], mm5
10948 mov esi, [ebp + i3110_pos]
10949 lea eax, [eax + eax*2]
10951 movq mm0, [esp + i3110_ix]
10952 movd mm1, [esp + i3110_iz]
10953 movq mm4, [esi + eax*4]
10954 movd mm5, [esi + eax*4 + 8]
10955 pfsubr mm4, mm0
10956 pfsubr mm5, mm1
10957 movq [esp + i3110_dx1], mm4
10958 pfmul mm4,mm4
10959 movd [esp + i3110_dz1], mm5
10960 pfmul mm5,mm5
10961 pfacc mm4, mm5
10962 pfacc mm4, mm5 ;# mm4=rsq
10964 pfrcp mm0,mm4
10965 pfrcpit1 mm4,mm0
10966 pfrcpit2 mm4,mm0 ;# mm4=invsq
10967 ;# calculate potentials and scalar force
10968 movq mm0, mm4
10970 pfmul mm4, mm0
10971 pfmul mm4, mm0 ;# mm4=rinvsix
10972 movq mm5, mm4
10973 pfmul mm5, mm5 ;# mm5=rinvtwelve
10975 pfmul mm5, [esp + i3110_c12]
10976 pfmul mm4, [esp + i3110_c6]
10977 movq mm6, mm5 ;# mm6 is vnb12-vnb6
10978 pfsub mm6, mm4
10980 pfmul mm4, [esp + i3110_six]
10982 pfmul mm5, [esp + i3110_twelve]
10983 pfsub mm5, mm4
10984 pfmul mm0, mm5 ;# mm0 is total fscal now
10986 ;# update vnbtot
10987 pfadd mm6, [esp + i3110_vnbtot] ;# add the earlier value
10988 movq [esp + i3110_vnbtot], mm6 ;# store the sum
10990 ;# spread fscalar to both positions
10991 punpckldq mm0,mm0
10992 ;# calc vectorial force
10993 prefetchw [edi + eax*4] ;# prefetch faction to cache
10994 movq mm2, [esp + i3110_dx1]
10995 movd mm3, [esp + i3110_dz1]
10997 pfmul mm2, mm0
10998 pfmul mm3, mm0
11000 ;# update i particle force
11001 movq mm0, [esp + i3110_fix]
11002 movd mm1, [esp + i3110_fiz]
11003 pfadd mm0, mm2
11004 pfadd mm1, mm3
11005 movq [esp + i3110_fix], mm0
11006 movd [esp + i3110_fiz], mm1
11007 ;# update j particle force
11008 movq mm0, [edi + eax*4]
11009 movd mm1, [edi + eax *4+ 8]
11010 pfsub mm0, mm2
11011 pfsub mm1, mm3
11012 movq [edi + eax*4], mm0
11013 movd [edi + eax*4 +8], mm1
11014 ;# done!
11015 .i3110_updateouterdata_vdw:
11016 mov ecx, [esp + i3110_ii3]
11018 movq mm6, [edi + ecx*4] ;# increment i force
11019 movd mm7, [edi + ecx*4 + 8]
11020 pfadd mm6, [esp + i3110_fix]
11021 pfadd mm7, [esp + i3110_fiz]
11022 movq [edi + ecx*4], mm6
11023 movd [edi + ecx*4 +8], mm7
11025 mov ebx, [ebp + i3110_fshift] ;# increment fshift force
11026 mov edx, [esp + i3110_is3]
11028 movq mm6, [ebx + edx*4]
11029 movd mm7, [ebx + edx*4 + 8]
11030 pfadd mm6, [esp + i3110_fix]
11031 pfadd mm7, [esp + i3110_fiz]
11032 movq [ebx + edx*4], mm6
11033 movd [ebx + edx*4 + 8], mm7
11035 ;# loop back to mno
11036 dec dword ptr [esp + i3110_nsvdw]
11037 jz .i3110_last_mno
11038 jmp .i3110_mno_vdw
11040 .i3110_last_mno:
;# Wrap-up for one outer iteration: fold the two packed halves of the
;# accumulated vctot / vnbtot into the Vc[] / Vnb[] entries selected by gid,
;# advance the gid pointer, then decrement nri and loop or exit.
11041 mov edx, [ebp + i3110_gid] ;# get group index for this i particle
11042 mov edx, [edx]
11043 add dword ptr [ebp + i3110_gid], 4 ;# advance pointer
11045 movq mm7, [esp + i3110_vctot]
11046 pfacc mm7,mm7 ;# get and sum the two parts of total potential
11048 mov eax, [ebp + i3110_Vc]
11049 movd mm6, [eax + edx*4]
11050 pfadd mm6, mm7
11051 movd [eax + edx*4], mm6 ;# increment vc[gid]
11053 movq mm7, [esp + i3110_vnbtot]
11054 pfacc mm7,mm7 ;# same for Vnb
11056 mov eax, [ebp + i3110_Vnb]
11057 movd mm6, [eax + edx*4]
11058 pfadd mm6, mm7
11059 movd [eax + edx*4], mm6 ;# increment vnb[gid]
11060 ;# finish if last
11061 mov ecx, [ebp + i3110_nri]
11062 dec ecx
11063 jecxz .i3110_end ;# ecx == 0: that was the last one
11064 ;# not last, iterate once more!
11065 mov [ebp + i3110_nri], ecx ;# write the decremented count back
11066 jmp .i3110_outer
11067 .i3110_end:
11068 femms
11069 add esp, 184
11070 pop edi
11071 pop esi
11072 pop edx
11073 pop ecx
11074 pop ebx
11075 pop eax
11076 leave
11082 .globl inl3120_3dnow
11083 .globl _inl3120_3dnow
11084 inl3120_3dnow:
11085 _inl3120_3dnow:
11086 .equiv i3120_nri, 8
11087 .equiv i3120_iinr, 12
11088 .equiv i3120_jindex, 16
11089 .equiv i3120_jjnr, 20
11090 .equiv i3120_shift, 24
11091 .equiv i3120_shiftvec, 28
11092 .equiv i3120_fshift, 32
11093 .equiv i3120_gid, 36
11094 .equiv i3120_pos, 40
11095 .equiv i3120_faction, 44
11096 .equiv i3120_charge, 48
11097 .equiv i3120_facel, 52
11098 .equiv i3120_Vc, 56
11099 .equiv i3120_type, 60
11100 .equiv i3120_ntype, 64
11101 .equiv i3120_nbfp, 68
11102 .equiv i3120_Vnb, 72
11103 .equiv i3120_tabscale, 76
11104 .equiv i3120_VFtab, 80
11105 ;# stack offsets for local variables
11106 .equiv i3120_is3, 0
11107 .equiv i3120_ii3, 4
11108 .equiv i3120_ixO, 8
11109 .equiv i3120_iyO, 12
11110 .equiv i3120_izO, 16
11111 .equiv i3120_ixH, 20
11112 .equiv i3120_iyH, 28
11113 .equiv i3120_izH, 36
11114 .equiv i3120_iqO, 44
11115 .equiv i3120_iqH, 52
11116 .equiv i3120_qqO, 60
11117 .equiv i3120_qqH, 68
11118 .equiv i3120_vctot, 76
11119 .equiv i3120_vnbtot, 84
11120 .equiv i3120_c6, 92
11121 .equiv i3120_c12, 100
11122 .equiv i3120_six, 108
11123 .equiv i3120_twelve, 116
11124 .equiv i3120_two, 124
11125 .equiv i3120_n1, 132
11126 .equiv i3120_tsc, 140
11127 .equiv i3120_ntia, 148
11128 .equiv i3120_innerjjnr, 156
11129 .equiv i3120_innerk, 160
11130 .equiv i3120_fixO, 164
11131 .equiv i3120_fiyO, 168
11132 .equiv i3120_fizO, 172
11133 .equiv i3120_fixH, 176
11134 .equiv i3120_fiyH, 184
11135 .equiv i3120_fizH, 192
11136 .equiv i3120_dxO, 200
11137 .equiv i3120_dyO, 204
11138 .equiv i3120_dzO, 208
11139 .equiv i3120_dxH, 212
11140 .equiv i3120_dyH, 220
11141 .equiv i3120_dzH, 228
11142 .equiv i3120_tmprsqH, 236
11143 push ebp
11144 mov ebp,esp
11145 push eax
11146 push ebx
11147 push ecx
11148 push edx
11149 push esi
11150 push edi
11151 sub esp, 244 ;# local stack space
11152 femms
11154 mov ecx, [ebp + i3120_iinr] ;# ecx = pointer into iinr[]
11155 mov ebx, [ecx] ;# ebx=ii
11157 mov edx, [ebp + i3120_charge]
11158 movd mm1, [ebp + i3120_facel]
11159 movd mm2, [edx + ebx*4] ;# mm2=charge[ii0]
11160 pfmul mm2, mm1
11161 movq [esp + i3120_iqO], mm2 ;# iqO = facel*charge[ii]
11163 movd mm2, [edx + ebx*4 + 4] ;# mm2=charge[ii0+1]
11164 pfmul mm2, mm1
11165 punpckldq mm2,mm2 ;# spread to both halves
11166 movq [esp + i3120_iqH], mm2 ;# iqH = facel*charge[ii0+1]
11168 mov edx, [ebp + i3120_type]
11169 mov edx, [edx + ebx*4]
11170 shl edx, 1
11171 mov ecx, edx
11172 imul ecx, [ebp + i3120_ntype] ;# ecx = ntia = 2*ntype*type[ii0]
11173 mov [esp + i3120_ntia], ecx
11175 movq mm3, [mm_two]
11176 movq mm4, [mm_six]
11177 movq mm5, [mm_twelve]
11178 movq mm6, [ebp + i3120_tabscale]
11179 punpckldq mm6,mm6 ;# spread to both halves
11180 movq [esp + i3120_two], mm3
11181 movq [esp + i3120_six], mm4
11182 movq [esp + i3120_twelve], mm5
11183 movq [esp + i3120_tsc], mm6
11184 ;# assume we have at least one i particle - start directly
11185 .i3120_outer:
11186 mov eax, [ebp + i3120_shift] ;# eax = pointer into shift[]
11187 mov ebx, [eax] ;# ebx=shift[n]
11188 add dword ptr [ebp + i3120_shift], 4 ;# advance pointer one step
11190 lea ebx, [ebx + ebx*2] ;# ebx=3*is
11191 mov [esp + i3120_is3],ebx ;# store is3
11193 mov eax, [ebp + i3120_shiftvec] ;# eax = base of shiftvec[]
11195 movq mm5, [eax + ebx*4] ;# move shX/shY to mm5 and shZ to mm6.
11196 movd mm6, [eax + ebx*4 + 8]
11197 movq mm0, mm5
11198 movq mm1, mm5
11199 movq mm2, mm6
11200 punpckldq mm0,mm0 ;# also expand shX,Y,Z in mm0--mm2.
11201 punpckhdq mm1,mm1
11202 punpckldq mm2,mm2
11204 mov ecx, [ebp + i3120_iinr] ;# ecx = pointer into iinr[]
11205 add dword ptr [ebp + i3120_iinr], 4 ;# advance pointer
11206 mov ebx, [ecx] ;# ebx=ii
11208 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
11209 mov eax, [ebp + i3120_pos] ;# eax = base of pos[]
11211 pfadd mm5, [eax + ebx*4] ;# ix = shX + posX (and iy too)
11212 movd mm7, [eax + ebx*4 + 8] ;# cant use direct memory add for 4 bytes (iz)
11213 mov [esp + i3120_ii3], ebx ;# (use mm7 as temp. storage for iz.)
11214 pfadd mm6, mm7
11215 movq [esp + i3120_ixO], mm5
11216 movq [esp + i3120_izO], mm6
11218 movd mm3, [eax + ebx*4 + 12]
11219 movd mm4, [eax + ebx*4 + 16]
11220 movd mm5, [eax + ebx*4 + 20]
11221 punpckldq mm3, [eax + ebx*4 + 24]
11222 punpckldq mm4, [eax + ebx*4 + 28]
11223 punpckldq mm5, [eax + ebx*4 + 32] ;# coords of H1 in low mm3-mm5, H2 in high
11225 pfadd mm0, mm3
11226 pfadd mm1, mm4
11227 pfadd mm2, mm5
11228 movq [esp + i3120_ixH], mm0
11229 movq [esp + i3120_iyH], mm1
11230 movq [esp + i3120_izH], mm2
11232 ;# clear vctot and i forces
11233 pxor mm7,mm7
11234 movq [esp + i3120_vctot], mm7
11235 movq [esp + i3120_vnbtot], mm7
11236 movq [esp + i3120_fixO], mm7
11237 movd [esp + i3120_fizO], mm7
11238 movq [esp + i3120_fixH], mm7
11239 movq [esp + i3120_fiyH], mm7
11240 movq [esp + i3120_fizH], mm7
11242 mov eax, [ebp + i3120_jindex]
11243 mov ecx, [eax] ;# jindex[n]
11244 mov edx, [eax + 4] ;# jindex[n+1]
11245 add dword ptr [ebp + i3120_jindex], 4
11246 sub edx, ecx ;# number of innerloop atoms
11247 mov [esp + i3120_innerk], edx
11249 mov esi, [ebp + i3120_pos]
11250 mov edi, [ebp + i3120_faction]
11251 mov eax, [ebp + i3120_jjnr]
11252 shl ecx, 2
11253 add eax, ecx
11254 mov [esp + i3120_innerjjnr], eax ;# pointer to jjnr[nj0]
11255 .i3120_inner_loop:
;# One j particle against the three i sites (O in the low half / scalar path,
;# H1 and H2 packed in the low/high halves of the "H" registers).
;#   O-j : tabulated Coulomb + plain (non-tabulated) Lennard-Jones
;#   H-j : tabulated Coulomb only
;# On entry esi = base of pos[] and edi = base of faction[] (loaded just before
;# the loop). Reads innerjjnr, iqO/iqH, ntia, ixO..izH, tsc, two, six, twelve.
;# Updates vctot, vnbtot, fixO..fizH on the stack and faction[j3..j3+2].
;# Clobbers eax, ecx, edx, mm0-mm7 and the flags.
11256 ;# a single j particle iteration here - compare with the unrolled code for comments.
11257 mov ecx, [esp + i3120_innerjjnr] ;# ecx = pointer to jjnr[k]
11258 mov eax, [ecx] ;# eax=jnr offset
11259 add dword ptr [esp + i3120_innerjjnr], 4 ;# advance pointer
11260 prefetch [ecx + 16] ;# prefetch upcoming jjnr entries (ecx must be the jjnr pointer here)
11262 mov ecx, [ebp + i3120_charge]
11263 movd mm7, [ecx + eax*4] ;# mm7 = charge[jnr]
11264 punpckldq mm7,mm7
11265 movq mm6,mm7
11266 pfmul mm6, [esp + i3120_iqO]
11267 pfmul mm7, [esp + i3120_iqH] ;# mm6=qqO, mm7=qqH
;# qqO is later used as a full 8-byte pfmul operand, so store all 8 bytes;
;# the upper half of mm6 is zero because the upper half of iqO is zero.
11268 movq [esp + i3120_qqO], mm6
11269 movq [esp + i3120_qqH], mm7
11271 mov ecx, [ebp + i3120_type]
11272 mov edx, [ecx + eax*4] ;# type [jnr]
11273 mov ecx, [ebp + i3120_nbfp]
11274 shl edx, 1
11275 add edx, [esp + i3120_ntia] ;# tja = ntia + 2*type
11276 movd mm5, [ecx + edx*4] ;# mm5 = 1st c6
11277 movq [esp + i3120_c6], mm5
11278 movd mm5, [ecx + edx*4 + 4] ;# mm5 = 1st c12
11279 movq [esp + i3120_c12], mm5
11281 lea eax, [eax + eax*2] ;# eax = 3*jnr = j3
11283 movq mm0, [esi + eax*4] ;# mm0 = jx, jy
11284 movd mm1, [esi + eax*4 + 8] ;# mm1 = jz
11285 ;# copy & expand to mm2-mm4 for the H interactions
11286 movq mm2, mm0
11287 movq mm3, mm0
11288 movq mm4, mm1
11289 punpckldq mm2,mm2 ;# mm2 = jx, jx
11290 punpckhdq mm3,mm3 ;# mm3 = jy, jy
11291 punpckldq mm4,mm4 ;# mm4 = jz, jz
11293 pfsubr mm0, [esp + i3120_ixO]
11294 pfsubr mm1, [esp + i3120_izO]
11296 movq [esp + i3120_dxO], mm0
11297 pfmul mm0,mm0
11298 movd [esp + i3120_dzO], mm1
11299 pfmul mm1,mm1
11300 pfacc mm0, mm1
11301 pfadd mm0, mm1 ;# mm0=rsqO
;# mm2-mm4 is jx-jz, already replicated into both halves above.
11306 pfsubr mm2, [esp + i3120_ixH]
11307 pfsubr mm3, [esp + i3120_iyH]
11308 pfsubr mm4, [esp + i3120_izH] ;# mm2-mm4 is dxH-dzH
11310 movq [esp + i3120_dxH], mm2
11311 movq [esp + i3120_dyH], mm3
11312 movq [esp + i3120_dzH], mm4
11313 pfmul mm2,mm2
11314 pfmul mm3,mm3
11315 pfmul mm4,mm4
11317 pfadd mm3,mm2
11318 pfadd mm3,mm4 ;# mm3=rsqH
11319 movq [esp + i3120_tmprsqH], mm3
11321 pfrsqrt mm1,mm0
11323 movq mm2,mm1
11324 pfmul mm1,mm1
11325 pfrsqit1 mm1,mm0
11326 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
11328 pfmul mm0, mm1 ;# mm0=r
11330 pfmul mm0, [esp + i3120_tsc]
11331 pf2iw mm4, mm0
11332 movd [esp + i3120_n1], mm4
11333 pi2fd mm4,mm4
11334 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
11335 movq mm2, mm0
11336 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
11338 ;# coulomb table
11339 mov edx, [ebp + i3120_VFtab]
11340 mov ecx, [esp + i3120_n1]
11341 shl ecx, 2 ;# 4 floats (16 bytes) per table point
11342 ;# load all values we need
11343 movd mm4, [edx + ecx*4]
11344 movd mm5, [edx + ecx*4 + 4]
11345 movd mm6, [edx + ecx*4 + 8]
11346 movd mm7, [edx + ecx*4 + 12]
11348 pfmul mm6, mm0 ;# mm6 = Geps
11349 pfmul mm7, mm2 ;# mm7 = Heps2
11351 pfadd mm5, mm6
11352 pfadd mm5, mm7 ;# mm5 = Fp
11354 pfmul mm7, [esp + i3120_two] ;# two*Heps2
11355 pfadd mm7, mm6
11356 pfadd mm7, mm5 ;# mm7=FF
11358 pfmul mm5, mm0 ;# mm5=eps*Fp
11359 pfadd mm5, mm4 ;# mm5= VV
11361 pfmul mm5, [esp + i3120_qqO] ;# vcoul=qq*VV
11362 pfmul mm7, [esp + i3120_qqO] ;# fijC=qq*FF
11363 ;# update vctot directly, use mm3 for fscal sum.
11364 pfadd mm5, [esp + i3120_vctot]
11365 movq [esp + i3120_vctot], mm5
11367 movq mm3, mm7
11368 pfmul mm3, [esp + i3120_tsc] ;# mm3 = tabscale*fijC
11370 ;# nontabulated LJ - mm1 is invsqrt. - keep mm1!
11371 movq mm0, mm1
11372 pfmul mm0, mm0 ;# mm0 is invsq
11373 movq mm2, mm0
11374 pfmul mm2, mm0
11375 pfmul mm2, mm0 ;# mm2 = rinvsix
11376 movq mm4, mm2
11377 pfmul mm4, mm4 ;# mm4=rinvtwelve
11379 pfmul mm4, [esp + i3120_c12]
11380 pfmul mm2, [esp + i3120_c6]
11381 movq mm5, mm4
11382 pfsub mm5, mm2 ;# mm5=vnb12-vnb6
11384 pfmul mm2, [esp + i3120_six]
11385 pfmul mm4, [esp + i3120_twelve]
11386 pfsub mm4, mm2
11387 pfmul mm4, mm1 ;# mm4=(12*vnb12-6*vnb6)*rinv11
11389 pfsubr mm3, mm4 ;# mm3 = LJ term - tabscale*fijC
11390 pfmul mm3, mm1 ;# mm3 is total fscal (for the oxygen) now
11392 ;# update vnbtot
11393 pfadd mm5, [esp + i3120_vnbtot] ;# add the earlier value
11394 movq [esp + i3120_vnbtot], mm5 ;# store the sum
11396 ;# Ready with the oxygen - potential is updated, fscal is in mm3.
11397 ;# now do the two hydrogens.
11398 movq mm0, [esp + i3120_tmprsqH] ;# mm0=rsqH
11400 pfrsqrt mm1, mm0
11401 pswapd mm0,mm0
11402 pfrsqrt mm2, mm0
11403 pswapd mm0,mm0
11404 punpckldq mm1,mm2 ;# seeds are in mm1 now, and rsq in mm0.
11406 movq mm2, mm1
11407 pfmul mm1,mm1
11408 pfrsqit1 mm1,mm0
11409 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
11411 pfmul mm0,mm1 ;# mm0=r
11412 pfmul mm0, [esp + i3120_tsc]
11413 pf2iw mm4, mm0
11414 movq [esp + i3120_n1], mm4
11415 pi2fd mm4,mm4
11416 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
11417 movq mm2, mm0
11418 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
11420 ;# coulomb table
11421 mov edx, [ebp + i3120_VFtab]
11422 mov ecx, [esp + i3120_n1] ;# table index for H1
11423 shl ecx, 2
11424 ;# load all values we need
11425 movd mm4, [edx + ecx*4]
11426 movd mm5, [edx + ecx*4 + 4]
11427 movd mm6, [edx + ecx*4 + 8]
11428 movd mm7, [edx + ecx*4 + 12]
11429 mov ecx, [esp + i3120_n1 + 4] ;# table index for H2
11430 shl ecx, 2
11431 punpckldq mm4, [edx + ecx*4]
11432 punpckldq mm5, [edx + ecx*4 + 4]
11433 punpckldq mm6, [edx + ecx*4 + 8]
11434 punpckldq mm7, [edx + ecx*4 + 12]
11436 pfmul mm6, mm0 ;# mm6 = Geps
11437 pfmul mm7, mm2 ;# mm7 = Heps2
11439 pfadd mm5, mm6
11440 pfadd mm5, mm7 ;# mm5 = Fp
11442 pfmul mm7, [esp + i3120_two] ;# two*Heps2
11443 pfadd mm7, mm6
11444 pfadd mm7, mm5 ;# mm7=FF
11446 pfmul mm5, mm0 ;# mm5=eps*Fp
11447 pfadd mm5, mm4 ;# mm5= VV
11449 pfmul mm5, [esp + i3120_qqH] ;# vcoul=qq*VV
11450 pfmul mm7, [esp + i3120_qqH] ;# fijC=qq*FF
11451 ;# update vctot
11452 pfadd mm5, [esp + i3120_vctot]
11453 movq [esp + i3120_vctot], mm5
11455 ;# change sign of fijC and multiply by rinv
11456 pxor mm4,mm4
11457 pfsub mm4, mm7
11458 pfmul mm4, [esp + i3120_tsc]
11459 pfmul mm4, mm1 ;# mm4 is total fscal (for the hydrogens) now
11461 ;# spread oxygen fscalar to both positions
11462 punpckldq mm3,mm3
11463 ;# calc vectorial force for O
11464 prefetchw [edi + eax*4] ;# prefetch faction to cache
11465 movq mm0, [esp + i3120_dxO]
11466 movd mm1, [esp + i3120_dzO]
11467 pfmul mm0, mm3
11468 pfmul mm1, mm3
11470 ;# calc vectorial force for H's
11471 movq mm5, [esp + i3120_dxH]
11472 movq mm6, [esp + i3120_dyH]
11473 movq mm7, [esp + i3120_dzH]
11474 pfmul mm5, mm4
11475 pfmul mm6, mm4
11476 pfmul mm7, mm4
11478 ;# update iO particle force
11479 movq mm2, [esp + i3120_fixO]
11480 movd mm3, [esp + i3120_fizO]
11481 pfadd mm2, mm0
11482 pfadd mm3, mm1
11483 movq [esp + i3120_fixO], mm2
11484 movd [esp + i3120_fizO], mm3
11486 ;# update iH forces
11487 movq mm2, [esp + i3120_fixH]
11488 movq mm3, [esp + i3120_fiyH]
11489 movq mm4, [esp + i3120_fizH]
11490 pfadd mm2, mm5
11491 pfadd mm3, mm6
11492 pfadd mm4, mm7
11493 movq [esp + i3120_fixH], mm2
11494 movq [esp + i3120_fiyH], mm3
11495 movq [esp + i3120_fizH], mm4
11497 ;# pack j forces from H in the same form as the oxygen force.
11498 pfacc mm5, mm6 ;# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
11499 pfacc mm7, mm7 ;# mm7(l)=fjz(H1+ h2)
11501 pfadd mm0, mm5 ;# add up total force on j particle.
11502 pfadd mm1, mm7
11504 ;# update j particle force
11505 movq mm2, [edi + eax*4]
11506 movd mm3, [edi + eax*4 + 8]
11507 pfsub mm2, mm0
11508 pfsub mm3, mm1
11509 movq [edi + eax*4], mm2
11510 movd [edi + eax*4 +8], mm3
11512 ;# done - one more?
11513 dec dword ptr [esp + i3120_innerk]
11514 jz .i3120_updateouterdata
11515 jmp .i3120_inner_loop
11516 .i3120_updateouterdata:
11517 mov ecx, [esp + i3120_ii3]
11519 movq mm6, [edi + ecx*4] ;# increment iO force
11520 movd mm7, [edi + ecx*4 + 8]
11521 pfadd mm6, [esp + i3120_fixO]
11522 pfadd mm7, [esp + i3120_fizO]
11523 movq [edi + ecx*4], mm6
11524 movd [edi + ecx*4 +8], mm7
11526 movq mm0, [esp + i3120_fixH]
11527 movq mm3, [esp + i3120_fiyH]
11528 movq mm1, [esp + i3120_fizH]
11529 movq mm2, mm0
11530 punpckldq mm0, mm3 ;# mm0(l)=fxH1, mm0(h)=fyH1
11531 punpckhdq mm2, mm3 ;# mm2(l)=fxH2, mm2(h)=fyH2
11532 movq mm3, mm1
11533 pswapd mm3,mm3
11534 ;# mm1 is fzH1
11535 ;# mm3 is fzH2
11537 movq mm6, [edi + ecx*4 + 12] ;# increment iH1 force
11538 movd mm7, [edi + ecx*4 + 20]
11539 pfadd mm6, mm0
11540 pfadd mm7, mm1
11541 movq [edi + ecx*4 + 12], mm6
11542 movd [edi + ecx*4 + 20], mm7
11544 movq mm6, [edi + ecx*4 + 24] ;# increment iH2 force
11545 movd mm7, [edi + ecx*4 + 32]
11546 pfadd mm6, mm2
11547 pfadd mm7, mm3
11548 movq [edi + ecx*4 + 24], mm6
11549 movd [edi + ecx*4 + 32], mm7
11552 mov ebx, [ebp + i3120_fshift] ;# increment fshift force
11553 mov edx, [esp + i3120_is3]
11555 movq mm6, [ebx + edx*4]
11556 movd mm7, [ebx + edx*4 + 8]
11557 pfadd mm6, [esp + i3120_fixO]
11558 pfadd mm7, [esp + i3120_fizO]
11559 pfadd mm6, mm0
11560 pfadd mm7, mm1
11561 pfadd mm6, mm2
11562 pfadd mm7, mm3
11563 movq [ebx + edx*4], mm6
11564 movd [ebx + edx*4 + 8], mm7
11566 mov edx, [ebp + i3120_gid] ;# get group index for this i particle
11567 mov edx, [edx]
11568 add dword ptr [ebp + i3120_gid], 4 ;# advance pointer
11570 movq mm7, [esp + i3120_vctot]
11571 pfacc mm7,mm7 ;# get and sum the two parts of total potential
11573 mov eax, [ebp + i3120_Vc]
11574 movd mm6, [eax + edx*4]
11575 pfadd mm6, mm7
11576 movd [eax + edx*4], mm6 ;# increment vc[gid]
11578 movq mm7, [esp + i3120_vnbtot]
11579 pfacc mm7,mm7 ;# same for Vnb
11581 mov eax, [ebp + i3120_Vnb]
11582 movd mm6, [eax + edx*4]
11583 pfadd mm6, mm7
11584 movd [eax + edx*4], mm6 ;# increment vnb[gid]
11585 ;# finish if last
11586 dec dword ptr [ebp + i3120_nri]
11587 jz .i3120_end
11588 ;# not last, iterate once more!
11589 jmp .i3120_outer
11590 .i3120_end:
11591 femms
11592 add esp, 244
11593 pop edi
11594 pop esi
11595 pop edx
11596 pop ecx
11597 pop ebx
11598 pop eax
11599 leave
;# ---------------------------------------------------------------------
;# inl3130_3dnow / _inl3130_3dnow
;#
;# Inner loop with tabulated coulomb (VFtab) plus Lennard-Jones (c6/c12)
;# between a three-site i molecule (O, H1, H2) and three-site j molecules.
;# For every j entry the loop handles j-O, j-H1 and j-H2 in turn; each
;# of those is split into a scalar part (against i-O) and a packed part
;# (against i-H1 in the low half and i-H2 in the high half of the regs).
;# Lennard-Jones is only evaluated for the O-O pair.
;#
;# Syntax:  GNU as, intel syntax (dword ptr), 32-bit x86, 3DNow!/Ext 3DNow!
;# In:      all arguments are read from the stack at [ebp + 8 .. 80],
;#          see the i3130_* argument offsets below.  nri, ntype, facel
;#          and tabscale are read by value, the rest are used as pointers.
;# Out:     faction[], fshift[], Vc[gid], Vnb[gid] are incremented in
;#          memory; no register return value is produced.
;# Side:    the argument slots for nri, iinr, jindex, shift and gid are
;#          modified in place while iterating.
;# Clobb:   eax-edx, esi, edi are saved/restored; mm0-mm7 are used and
;#          the MMX state is cleared with femms on entry and exit.
;# Stack:   232 bytes of locals below the six saved registers.
;# Note:    the code assumes nri >= 1 and at least one j entry per i
;#          entry (both loops test their counters at the bottom).
;# ---------------------------------------------------------------------
.globl inl3130_3dnow
.globl _inl3130_3dnow
inl3130_3dnow:
_inl3130_3dnow:
	;# stack offsets (from ebp) for the arguments
.equiv i3130_nri, 8
.equiv i3130_iinr, 12
.equiv i3130_jindex, 16
.equiv i3130_jjnr, 20
.equiv i3130_shift, 24
.equiv i3130_shiftvec, 28
.equiv i3130_fshift, 32
.equiv i3130_gid, 36
.equiv i3130_pos, 40
.equiv i3130_faction, 44
.equiv i3130_charge, 48
.equiv i3130_facel, 52
.equiv i3130_Vc, 56
.equiv i3130_type, 60
.equiv i3130_ntype, 64
.equiv i3130_nbfp, 68
.equiv i3130_Vnb, 72
.equiv i3130_tabscale, 76
.equiv i3130_VFtab, 80
	;# stack offsets (from esp) for local variables.
	;# Slots that are 4 bytes apart (ixO/iyO/izO, fixO/fiyO/fizO,
	;# dxO/dyO/dzO) hold single floats; the 8-byte slots hold a pair
	;# (H1 in the low dword, H2 in the high dword) or a value that is
	;# accessed with movq.
.equiv i3130_is3, 0
.equiv i3130_ii3, 4
.equiv i3130_ixO, 8
.equiv i3130_iyO, 12
.equiv i3130_izO, 16
.equiv i3130_ixH, 20
.equiv i3130_iyH, 28
.equiv i3130_izH, 36
.equiv i3130_qqOO, 44
.equiv i3130_qqOH, 52
.equiv i3130_qqHH, 60
.equiv i3130_c6, 68
.equiv i3130_c12, 76
.equiv i3130_six, 84
.equiv i3130_twelve, 92
.equiv i3130_two, 100
.equiv i3130_n1, 108
.equiv i3130_tsc, 116
.equiv i3130_vctot, 124
.equiv i3130_vnbtot, 132
.equiv i3130_innerjjnr, 140
.equiv i3130_innerk, 144
.equiv i3130_fixO, 148
.equiv i3130_fiyO, 152
.equiv i3130_fizO, 156
.equiv i3130_fixH, 160
.equiv i3130_fiyH, 168
.equiv i3130_fizH, 176
.equiv i3130_dxO, 184
.equiv i3130_dyO, 188
.equiv i3130_dzO, 192
.equiv i3130_dxH, 200
.equiv i3130_dyH, 208
.equiv i3130_dzH, 216
.equiv i3130_tmprsqH, 224
	push ebp
	mov ebp,esp
	push eax
	push ebx
	push ecx
	push edx
	push esi
	push edi
	sub esp, 232		;# local stack space (tmprsqH at 224 + 8 bytes)
	femms
	;# assume we have at least one i particle - start directly

	;# Charge products and LJ parameters are computed once, from the
	;# first entry of iinr[], and reused for every i molecule.
	mov ecx, [ebp + i3130_iinr]	;# ecx = pointer into iinr[]
	mov ebx, [ecx]			;# ebx=ii

	mov edx, [ebp + i3130_charge]
	movd mm1, [ebp + i3130_facel]	;# mm1=facel
	movd mm2, [edx + ebx*4]		;# mm2=charge[ii0] (O)
	movd mm3, [edx + ebx*4 + 4]	;# mm3=charge[ii0+1] (H)
	movq mm4, mm2
	pfmul mm4, mm1			;# mm4=qO*facel
	movq mm6, mm3
	pfmul mm6, mm1			;# mm6=qH*facel
	movq mm5, mm4
	pfmul mm4, mm2			;# mm4=qqOO*facel
	pfmul mm5, mm3			;# mm5=qqOH*facel
	pfmul mm6, mm3			;# mm6=qqHH*facel
	punpckldq mm5,mm5		;# spread to both halves
	punpckldq mm6,mm6		;# spread to both halves
	movq [esp + i3130_qqOO], mm4	;# high half is zero (came from movd)
	movq [esp + i3130_qqOH], mm5
	movq [esp + i3130_qqHH], mm6
	mov edx, [ebp + i3130_type]
	mov ecx, [edx + ebx*4]		;# ecx=type[ii]
	shl ecx, 1
	mov edx, ecx
	imul ecx, [ebp + i3130_ntype]
	add edx, ecx			;# edx=2*type*(ntype+1), index into nbfp[]
	mov eax, [ebp + i3130_nbfp]
	movd mm0, [eax + edx*4]		;# c6
	movd mm1, [eax + edx*4 + 4]	;# c12
	movq [esp + i3130_c6], mm0
	movq [esp + i3130_c12], mm1
	movq mm2, [mm_two]		;# constants defined elsewhere in this file
	movq mm3, [mm_six]
	movq mm4, [mm_twelve]
	movq [esp + i3130_two], mm2
	movq [esp + i3130_six], mm3
	movq [esp + i3130_twelve], mm4
	movd mm5, [ebp + i3130_tabscale]
	punpckldq mm5,mm5		;# tabscale in both halves
	movq [esp + i3130_tsc], mm5
.i3130_outer:
	mov eax, [ebp + i3130_shift]	;# eax = pointer into shift[]
	mov ebx, [eax]			;# ebx=shift[n]
	add dword ptr [ebp + i3130_shift], 4	;# advance pointer one step

	lea ebx, [ebx + ebx*2]		;# ebx=3*is
	mov [esp + i3130_is3],ebx	;# store is3

	mov eax, [ebp + i3130_shiftvec]	;# eax = base of shiftvec[]

	movq mm5, [eax + ebx*4]		;# move shX/shY to mm5 and shZ to mm6.
	movd mm6, [eax + ebx*4 + 8]
	movq mm0, mm5
	movq mm1, mm5
	movq mm2, mm6
	punpckldq mm0,mm0		;# also expand shX,Y,Z in mm0--mm2.
	punpckhdq mm1,mm1
	punpckldq mm2,mm2

	mov ecx, [ebp + i3130_iinr]	;# ecx = pointer into iinr[]
	add dword ptr [ebp + i3130_iinr], 4	;# advance pointer
	mov ebx, [ecx]			;# ebx=ii

	lea ebx, [ebx + ebx*2]		;# ebx = 3*ii=ii3
	mov eax, [ebp + i3130_pos]	;# eax = base of pos[]

	pfadd mm5, [eax + ebx*4]	;# ix = shX + posX (and iy too)
	movd mm7, [eax + ebx*4 + 8]	;# cant use direct memory add for 4 bytes (iz)
	mov [esp + i3130_ii3], ebx	;# (use mm7 as temp. storage for iz.)
	pfadd mm6, mm7
	movq [esp + i3130_ixO], mm5	;# writes ixO and iyO
	movd [esp + i3130_izO], mm6	;# izO is a 4-byte slot; ixH follows it

	movd mm3, [eax + ebx*4 + 12]
	movd mm4, [eax + ebx*4 + 16]
	movd mm5, [eax + ebx*4 + 20]
	punpckldq mm3, [eax + ebx*4 + 24]
	punpckldq mm4, [eax + ebx*4 + 28]
	punpckldq mm5, [eax + ebx*4 + 32]	;# coords of H1 in low mm3-mm5, H2 in high

	pfadd mm0, mm3
	pfadd mm1, mm4
	pfadd mm2, mm5
	movq [esp + i3130_ixH], mm0
	movq [esp + i3130_iyH], mm1
	movq [esp + i3130_izH], mm2

	;# clear vctot, vnbtot and i forces
	pxor mm7,mm7
	movq [esp + i3130_vctot], mm7
	movq [esp + i3130_vnbtot], mm7
	movq [esp + i3130_fixO], mm7	;# clears fixO and fiyO
	movd [esp + i3130_fizO], mm7	;# fizO is a 4-byte slot; fixH follows it
	movq [esp + i3130_fixH], mm7
	movq [esp + i3130_fiyH], mm7
	movq [esp + i3130_fizH], mm7

	mov eax, [ebp + i3130_jindex]
	mov ecx, [eax]			;# jindex[n]
	mov edx, [eax + 4]		;# jindex[n+1]
	add dword ptr [ebp + i3130_jindex], 4
	sub edx, ecx			;# number of innerloop atoms
	mov [esp + i3130_innerk], edx	;# number of innerloop atoms

	mov esi, [ebp + i3130_pos]
	mov edi, [ebp + i3130_faction]
	mov eax, [ebp + i3130_jjnr]
	shl ecx, 2
	add eax, ecx
	mov [esp + i3130_innerjjnr], eax	;# pointer to jjnr[nj0]
.i3130_inner_loop:
	;# a single j particle iteration here - compare with the unrolled code for comments.
	;# Invariants inside the loop: esi = pos[], edi = faction[],
	;# eax = 3*jnr (kept for all three j sites of this iteration).
	mov eax, [esp + i3130_innerjjnr]
	mov eax, [eax]			;# eax=jnr offset
	add dword ptr [esp + i3130_innerjjnr], 4	;# advance pointer

	lea eax, [eax + eax*2]		;# eax=3*jnr

	;# ---------------- interactions with j O ----------------
	movq mm0, [esi + eax*4]
	movd mm1, [esi + eax*4 + 8]
	;# copy & expand to mm2-mm4 for the H interactions
	movq mm2, mm0
	movq mm3, mm0
	movq mm4, mm1
	punpckldq mm2,mm2
	punpckhdq mm3,mm3
	punpckldq mm4,mm4		;# mm2-mm4 is jx-jz (in both halves)

	pfsubr mm0, [esp + i3130_ixO]
	pfsubr mm1, [esp + i3130_izO]	;# only the low half of mm1 is used below

	movq [esp + i3130_dxO], mm0
	pfmul mm0,mm0
	movd [esp + i3130_dzO], mm1
	pfmul mm1,mm1
	pfacc mm0, mm0
	pfadd mm0, mm1			;# mm0=rsqO (low half)

	pfsubr mm2, [esp + i3130_ixH]
	pfsubr mm3, [esp + i3130_iyH]
	pfsubr mm4, [esp + i3130_izH]	;# mm2-mm4 is dxH-dzH

	movq [esp + i3130_dxH], mm2
	movq [esp + i3130_dyH], mm3
	movq [esp + i3130_dzH], mm4
	pfmul mm2,mm2
	pfmul mm3,mm3
	pfmul mm4,mm4

	pfadd mm3,mm2
	pfadd mm3,mm4			;# mm3=rsqH
	movq [esp + i3130_tmprsqH], mm3

	pfrsqrt mm1,mm0			;# seed, then one N-R iteration

	movq mm2,mm1
	pfmul mm1,mm1
	pfrsqit1 mm1,mm0
	pfrcpit2 mm1,mm2		;# mm1=invsqrt
	pfmul mm0, mm1			;# mm0=r (rsq*invsqrt)

	pfmul mm0, [esp + i3130_tsc]	;# mm0=rt=r*tabscale
	pf2iw mm4, mm0
	movd [esp + i3130_n1], mm4
	pi2fd mm4,mm4
	pfsub mm0, mm4			;# now mm0 is eps and mm4 n0
	movq mm2, mm0
	pfmul mm2, mm2			;# mm0 is eps, mm2 eps2

	;# coulomb table
	mov edx, [ebp + i3130_VFtab]
	mov ecx, [esp + i3130_n1]
	shl ecx, 2			;# four floats per table point

	;# load all values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm0			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp

	pfmul mm7, [esp + i3130_two]	;# two*Heps2
	pfadd mm7, mm6
	pfadd mm7, mm5			;# mm7=FF

	pfmul mm5, mm0			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	pfmul mm5, [esp + i3130_qqOO]	;# vcoul=qq*VV
	pfmul mm7, [esp + i3130_qqOO]	;# fijC=qq*FF

	;# update vctot directly, use mm3 for fscal sum.
	pfadd mm5, [esp + i3130_vctot]
	movq [esp + i3130_vctot], mm5
	movq mm3, mm7
	pfmul mm3, [esp + i3130_tsc]	;# mm3=fijC*tabscale

	;# Lennard-Jones for the O-O pair
	movq mm5, mm1
	pfmul mm5,mm5			;# mm5=rinvsq
	movq mm4, mm5
	pfmul mm4,mm5
	pfmul mm4,mm5
	movq mm5, mm4
	pfmul mm5,mm5			;# mm4=rinvsix, mm5=rinvtwelve

	pfmul mm4, [esp + i3130_c6]
	pfmul mm5, [esp + i3130_c12]
	movq mm6,mm5
	pfsub mm6,mm4			;# mm6=vnb12-vnb6

	pfmul mm4, [esp + i3130_six]
	pfmul mm5, [esp + i3130_twelve]
	pfsub mm5,mm4
	pfmul mm5, mm1
	pfsubr mm3, mm5			;# mm3 = mm5 - mm3 (LJ part minus coulomb table part)

	pfmul mm3, mm1			;# mm3 is total fscal (for the oxygen) now

	;# update vnbtot
	pfadd mm6, [esp + i3130_vnbtot]	;# add the earlier value
	movq [esp + i3130_vnbtot], mm6	;# store the sum

	;# Ready with the oxygen - potential is updated, fscal is in mm3.
	;# time for hydrogens!

	movq mm0, [esp + i3130_tmprsqH]

	pfrsqrt mm1, mm0
	pswapd mm0,mm0
	pfrsqrt mm2, mm0
	pswapd mm0,mm0
	punpckldq mm1,mm2		;# seeds are in mm1 now, and rsq in mm0.

	movq mm2, mm1
	pfmul mm1,mm1
	pfrsqit1 mm1,mm0
	pfrcpit2 mm1,mm2		;# mm1=invsqrt

	pfmul mm0,mm1			;# mm0=r
	pfmul mm0, [esp + i3130_tsc]
	pf2iw mm4, mm0
	movq [esp + i3130_n1], mm4	;# both table indices (n1 and n1+4)
	pi2fd mm4,mm4
	pfsub mm0, mm4			;# now mm0 is eps and mm4 n0
	movq mm2, mm0
	pfmul mm2, mm2			;# mm0 is eps, mm2 eps2

	;# coulomb table
	mov edx, [ebp + i3130_VFtab]
	mov ecx, [esp + i3130_n1]
	shl ecx, 2
	;# load all values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]
	mov ecx, [esp + i3130_n1 + 4]
	shl ecx, 2
	punpckldq mm4, [edx + ecx*4]
	punpckldq mm5, [edx + ecx*4 + 4]
	punpckldq mm6, [edx + ecx*4 + 8]
	punpckldq mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm0			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp

	pfmul mm7, [esp + i3130_two]	;# two*Heps2
	pfadd mm7, mm6
	pfadd mm7, mm5			;# mm7=FF

	pfmul mm5, mm0			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	pfmul mm5, [esp + i3130_qqOH]	;# vcoul=qq*VV
	pfmul mm7, [esp + i3130_qqOH]	;# fijC=qq*FF
	;# update vctot
	pfadd mm5, [esp + i3130_vctot]
	movq [esp + i3130_vctot], mm5

	;# change sign of fijC and multiply by rinv
	pxor mm4,mm4
	pfsub mm4, mm7
	pfmul mm4, [esp + i3130_tsc]
	pfmul mm4, mm1			;# mm4 is total fscal (for the hydrogens) now

	;# spread oxygen fscalar to both positions
	punpckldq mm3,mm3
	;# calc vectorial force for O
	movq mm0, [esp + i3130_dxO]
	movd mm1, [esp + i3130_dzO]
	pfmul mm0, mm3
	pfmul mm1, mm3

	;# calc vectorial force for H's
	movq mm5, [esp + i3130_dxH]
	movq mm6, [esp + i3130_dyH]
	movq mm7, [esp + i3130_dzH]
	pfmul mm5, mm4
	pfmul mm6, mm4
	pfmul mm7, mm4

	;# update iO particle force
	movq mm2, [esp + i3130_fixO]
	movd mm3, [esp + i3130_fizO]
	pfadd mm2, mm0
	pfadd mm3, mm1
	movq [esp + i3130_fixO], mm2
	movd [esp + i3130_fizO], mm3

	;# update iH forces
	movq mm2, [esp + i3130_fixH]
	movq mm3, [esp + i3130_fiyH]
	movq mm4, [esp + i3130_fizH]
	pfadd mm2, mm5
	pfadd mm3, mm6
	pfadd mm4, mm7
	movq [esp + i3130_fixH], mm2
	movq [esp + i3130_fiyH], mm3
	movq [esp + i3130_fizH], mm4

	;# pack j forces from H in the same form as the oxygen force.
	pfacc mm5, mm6			;# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
	pfacc mm7, mm7			;# mm7(l)=fjz(H1+ h2)

	pfadd mm0, mm5			;# add up total force on j particle.
	pfadd mm1, mm7

	;# update j particle force
	movq mm2, [edi + eax*4]
	movd mm3, [edi + eax*4 + 8]
	pfsub mm2, mm0
	pfsub mm3, mm1
	movq [edi + eax*4], mm2
	movd [edi + eax*4 + 8], mm3

	;# ---------------- interactions with j H1 ----------------

	movq mm0, [esi + eax*4 + 12]
	movd mm1, [esi + eax*4 + 20]
	;# copy & expand to mm2-mm4 for the H interactions
	movq mm2, mm0
	movq mm3, mm0
	movq mm4, mm1
	punpckldq mm2,mm2
	punpckhdq mm3,mm3
	punpckldq mm4,mm4		;# mm2-mm4 is jx-jz (in both halves)

	pfsubr mm0, [esp + i3130_ixO]
	pfsubr mm1, [esp + i3130_izO]

	movq [esp + i3130_dxO], mm0
	pfmul mm0,mm0
	movd [esp + i3130_dzO], mm1
	pfmul mm1,mm1
	pfacc mm0, mm1
	pfadd mm0, mm1			;# mm0=rsqO (low half)

	pfsubr mm2, [esp + i3130_ixH]
	pfsubr mm3, [esp + i3130_iyH]
	pfsubr mm4, [esp + i3130_izH]	;# mm2-mm4 is dxH-dzH

	movq [esp + i3130_dxH], mm2
	movq [esp + i3130_dyH], mm3
	movq [esp + i3130_dzH], mm4
	pfmul mm2,mm2
	pfmul mm3,mm3
	pfmul mm4,mm4

	pfadd mm3,mm2
	pfadd mm3,mm4			;# mm3=rsqH
	movq [esp + i3130_tmprsqH], mm3

	pfrsqrt mm1,mm0

	movq mm2,mm1
	pfmul mm1,mm1
	pfrsqit1 mm1,mm0
	pfrcpit2 mm1,mm2		;# mm1=invsqrt
	pfmul mm0, mm1			;# mm0=r (rsq*invsqrt)

	pfmul mm0, [esp + i3130_tsc]
	pf2iw mm4, mm0
	movd [esp + i3130_n1], mm4
	pi2fd mm4,mm4
	pfsub mm0, mm4			;# now mm0 is eps and mm4 n0
	movq mm2, mm0
	pfmul mm2, mm2			;# mm0 is eps, mm2 eps2

	;# coulomb table
	mov edx, [ebp + i3130_VFtab]
	mov ecx, [esp + i3130_n1]
	shl ecx, 2

	;# load all values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm0			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp

	pfmul mm7, [esp + i3130_two]	;# two*Heps2
	pfadd mm7, mm6
	pfadd mm7, mm5			;# mm7=FF

	pfmul mm5, mm0			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	pfmul mm5, [esp + i3130_qqOH]	;# vcoul=qq*VV
	pfmul mm7, [esp + i3130_qqOH]	;# fijC=qq*FF

	;# update vctot directly, force is moved to mm3
	pfadd mm5, [esp + i3130_vctot]
	movq [esp + i3130_vctot], mm5
	pxor mm3, mm3
	pfsub mm3, mm7
	pfmul mm3, [esp + i3130_tsc]
	pfmul mm3, mm1			;# mm3 is total fscal (for the oxygen) now

	movq mm0, [esp + i3130_tmprsqH]

	pfrsqrt mm1, mm0
	pswapd mm0,mm0
	pfrsqrt mm2, mm0
	pswapd mm0,mm0
	punpckldq mm1,mm2		;# seeds are in mm1 now, and rsq in mm0.

	movq mm2, mm1
	pfmul mm1,mm1
	pfrsqit1 mm1,mm0
	pfrcpit2 mm1,mm2		;# mm1=invsqrt

	pfmul mm0,mm1			;# mm0=r
	pfmul mm0, [esp + i3130_tsc]
	pf2iw mm4, mm0
	movq [esp + i3130_n1], mm4
	pi2fd mm4,mm4
	pfsub mm0, mm4			;# now mm0 is eps and mm4 n0
	movq mm2, mm0
	pfmul mm2, mm2			;# mm0 is eps, mm2 eps2

	;# coulomb table
	mov edx, [ebp + i3130_VFtab]
	mov ecx, [esp + i3130_n1]
	shl ecx, 2
	;# load all values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]
	mov ecx, [esp + i3130_n1 + 4]
	shl ecx, 2
	punpckldq mm4, [edx + ecx*4]
	punpckldq mm5, [edx + ecx*4 + 4]
	punpckldq mm6, [edx + ecx*4 + 8]
	punpckldq mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm0			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp

	pfmul mm7, [esp + i3130_two]	;# two*Heps2
	pfadd mm7, mm6
	pfadd mm7, mm5			;# mm7=FF

	pfmul mm5, mm0			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	pfmul mm5, [esp + i3130_qqHH]	;# vcoul=qq*VV
	pfmul mm7, [esp + i3130_qqHH]	;# fijC=qq*FF
	;# update vctot
	pfadd mm5, [esp + i3130_vctot]
	movq [esp + i3130_vctot], mm5

	;# change sign of fijC and multiply by rinv
	pxor mm4,mm4
	pfsub mm4, mm7
	pfmul mm4, [esp + i3130_tsc]
	pfmul mm4, mm1			;# mm4 is total fscal (for the hydrogens) now

	;# spread oxygen fscalar to both positions
	punpckldq mm3,mm3
	;# calc vectorial force for O
	movq mm0, [esp + i3130_dxO]
	movd mm1, [esp + i3130_dzO]
	pfmul mm0, mm3
	pfmul mm1, mm3

	;# calc vectorial force for H's
	movq mm5, [esp + i3130_dxH]
	movq mm6, [esp + i3130_dyH]
	movq mm7, [esp + i3130_dzH]
	pfmul mm5, mm4
	pfmul mm6, mm4
	pfmul mm7, mm4

	;# update iO particle force
	movq mm2, [esp + i3130_fixO]
	movd mm3, [esp + i3130_fizO]
	pfadd mm2, mm0
	pfadd mm3, mm1
	movq [esp + i3130_fixO], mm2
	movd [esp + i3130_fizO], mm3

	;# update iH forces
	movq mm2, [esp + i3130_fixH]
	movq mm3, [esp + i3130_fiyH]
	movq mm4, [esp + i3130_fizH]
	pfadd mm2, mm5
	pfadd mm3, mm6
	pfadd mm4, mm7
	movq [esp + i3130_fixH], mm2
	movq [esp + i3130_fiyH], mm3
	movq [esp + i3130_fizH], mm4

	;# pack j forces from H in the same form as the oxygen force.
	pfacc mm5, mm6			;# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
	pfacc mm7, mm7			;# mm7(l)=fjz(H1+ h2)

	pfadd mm0, mm5			;# add up total force on j particle.
	pfadd mm1, mm7

	;# update j particle force
	movq mm2, [edi + eax*4 + 12]
	movd mm3, [edi + eax*4 + 20]
	pfsub mm2, mm0
	pfsub mm3, mm1
	movq [edi + eax*4 + 12], mm2
	movd [edi + eax*4 + 20], mm3

	;# ---------------- interactions with j H2 ----------------
	movq mm0, [esi + eax*4 + 24]
	movd mm1, [esi + eax*4 + 32]
	;# copy & expand to mm2-mm4 for the H interactions
	movq mm2, mm0
	movq mm3, mm0
	movq mm4, mm1
	punpckldq mm2,mm2
	punpckhdq mm3,mm3
	punpckldq mm4,mm4		;# mm2-mm4 is jx-jz (in both halves)

	pfsubr mm0, [esp + i3130_ixO]
	pfsubr mm1, [esp + i3130_izO]

	movq [esp + i3130_dxO], mm0
	pfmul mm0,mm0
	movd [esp + i3130_dzO], mm1
	pfmul mm1,mm1
	pfacc mm0, mm1
	pfadd mm0, mm1			;# mm0=rsqO (low half)

	pfsubr mm2, [esp + i3130_ixH]
	pfsubr mm3, [esp + i3130_iyH]
	pfsubr mm4, [esp + i3130_izH]	;# mm2-mm4 is dxH-dzH

	movq [esp + i3130_dxH], mm2
	movq [esp + i3130_dyH], mm3
	movq [esp + i3130_dzH], mm4
	pfmul mm2,mm2
	pfmul mm3,mm3
	pfmul mm4,mm4

	pfadd mm3,mm2
	pfadd mm3,mm4			;# mm3=rsqH
	movq [esp + i3130_tmprsqH], mm3

	pfrsqrt mm1,mm0

	movq mm2,mm1
	pfmul mm1,mm1
	pfrsqit1 mm1,mm0
	pfrcpit2 mm1,mm2		;# mm1=invsqrt
	pfmul mm0, mm1			;# mm0=r (rsq*invsqrt)

	pfmul mm0, [esp + i3130_tsc]
	pf2iw mm4, mm0
	movd [esp + i3130_n1], mm4
	pi2fd mm4,mm4
	pfsub mm0, mm4			;# now mm0 is eps and mm4 n0
	movq mm2, mm0
	pfmul mm2, mm2			;# mm0 is eps, mm2 eps2

	;# coulomb table
	mov edx, [ebp + i3130_VFtab]
	mov ecx, [esp + i3130_n1]
	shl ecx, 2

	;# load all values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm0			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp

	pfmul mm7, [esp + i3130_two]	;# two*Heps2
	pfadd mm7, mm6
	pfadd mm7, mm5			;# mm7=FF

	pfmul mm5, mm0			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	pfmul mm5, [esp + i3130_qqOH]	;# vcoul=qq*VV
	pfmul mm7, [esp + i3130_qqOH]	;# fijC=qq*FF

	;# update vctot directly, use mm3 for fscal sum.
	pfadd mm5, [esp + i3130_vctot]
	movq [esp + i3130_vctot], mm5
	pxor mm3,mm3
	pfsub mm3, mm7
	pfmul mm3, [esp + i3130_tsc]
	pfmul mm3, mm1			;# mm3 is total fscal (for the oxygen) now

	movq mm0, [esp + i3130_tmprsqH]

	pfrsqrt mm1, mm0
	pswapd mm0,mm0
	pfrsqrt mm2, mm0
	pswapd mm0,mm0
	punpckldq mm1,mm2		;# seeds are in mm1 now, and rsq in mm0.

	movq mm2, mm1
	pfmul mm1,mm1
	pfrsqit1 mm1,mm0
	pfrcpit2 mm1,mm2		;# mm1=invsqrt

	pfmul mm0,mm1			;# mm0=r
	pfmul mm0, [esp + i3130_tsc]
	pf2iw mm4, mm0
	movq [esp + i3130_n1], mm4
	pi2fd mm4,mm4
	pfsub mm0, mm4			;# now mm0 is eps and mm4 n0
	movq mm2, mm0
	pfmul mm2, mm2			;# mm0 is eps, mm2 eps2

	;# coulomb table
	mov edx, [ebp + i3130_VFtab]
	mov ecx, [esp + i3130_n1]
	shl ecx, 2
	;# load all values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]
	mov ecx, [esp + i3130_n1 + 4]
	shl ecx, 2
	punpckldq mm4, [edx + ecx*4]
	punpckldq mm5, [edx + ecx*4 + 4]
	punpckldq mm6, [edx + ecx*4 + 8]
	punpckldq mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm0			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp

	pfmul mm7, [esp + i3130_two]	;# two*Heps2
	pfadd mm7, mm6
	pfadd mm7, mm5			;# mm7=FF

	pfmul mm5, mm0			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	pfmul mm5, [esp + i3130_qqHH]	;# vcoul=qq*VV
	pfmul mm7, [esp + i3130_qqHH]	;# fijC=qq*FF
	;# update vctot
	pfadd mm5, [esp + i3130_vctot]
	movq [esp + i3130_vctot], mm5

	;# change sign of fijC and multiply by rinv
	pxor mm4,mm4
	pfsub mm4, mm7
	pfmul mm4, [esp + i3130_tsc]
	pfmul mm4, mm1			;# mm4 is total fscal (for the hydrogens) now

	;# spread oxygen fscalar to both positions
	punpckldq mm3,mm3
	;# calc vectorial force for O
	movq mm0, [esp + i3130_dxO]
	movd mm1, [esp + i3130_dzO]
	pfmul mm0, mm3
	pfmul mm1, mm3

	;# calc vectorial force for H's
	movq mm5, [esp + i3130_dxH]
	movq mm6, [esp + i3130_dyH]
	movq mm7, [esp + i3130_dzH]
	pfmul mm5, mm4
	pfmul mm6, mm4
	pfmul mm7, mm4

	;# update iO particle force
	movq mm2, [esp + i3130_fixO]
	movd mm3, [esp + i3130_fizO]
	pfadd mm2, mm0
	pfadd mm3, mm1
	movq [esp + i3130_fixO], mm2
	movd [esp + i3130_fizO], mm3

	;# update iH forces
	movq mm2, [esp + i3130_fixH]
	movq mm3, [esp + i3130_fiyH]
	movq mm4, [esp + i3130_fizH]
	pfadd mm2, mm5
	pfadd mm3, mm6
	pfadd mm4, mm7
	movq [esp + i3130_fixH], mm2
	movq [esp + i3130_fiyH], mm3
	movq [esp + i3130_fizH], mm4

	;# pack j forces from H in the same form as the oxygen force.
	pfacc mm5, mm6			;# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
	pfacc mm7, mm7			;# mm7(l)=fjz(H1+ h2)

	pfadd mm0, mm5			;# add up total force on j particle.
	pfadd mm1, mm7

	;# update j particle force
	movq mm2, [edi + eax*4 + 24]
	movd mm3, [edi + eax*4 + 32]
	pfsub mm2, mm0
	pfsub mm3, mm1
	movq [edi + eax*4 + 24], mm2
	movd [edi + eax*4 + 32], mm3

	;# done - one more?
	;# (jz over an unconditional jmp is kept on purpose: the loop body is
	;# long, presumably too long for a short conditional branch on
	;# assemblers that do not relax it automatically.)
	dec dword ptr [esp + i3130_innerk]
	jz .i3130_updateouterdata
	jmp .i3130_inner_loop
.i3130_updateouterdata:
	mov ecx, [esp + i3130_ii3]

	movq mm6, [edi + ecx*4]		;# increment iO force
	movd mm7, [edi + ecx*4 + 8]
	pfadd mm6, [esp + i3130_fixO]
	pfadd mm7, [esp + i3130_fizO]	;# only the low half is stored below
	movq [edi + ecx*4], mm6
	movd [edi + ecx*4 + 8], mm7

	movq mm0, [esp + i3130_fixH]
	movq mm3, [esp + i3130_fiyH]
	movq mm1, [esp + i3130_fizH]
	movq mm2, mm0
	punpckldq mm0, mm3		;# mm0(l)=fxH1, mm0(h)=fyH1
	punpckhdq mm2, mm3		;# mm2(l)=fxH2, mm2(h)=fyH2
	movq mm3, mm1
	pswapd mm3,mm3
	;# mm1 is fzH1
	;# mm3 is fzH2

	movq mm6, [edi + ecx*4 + 12]	;# increment iH1 force
	movd mm7, [edi + ecx*4 + 20]
	pfadd mm6, mm0
	pfadd mm7, mm1
	movq [edi + ecx*4 + 12], mm6
	movd [edi + ecx*4 + 20], mm7

	movq mm6, [edi + ecx*4 + 24]	;# increment iH2 force
	movd mm7, [edi + ecx*4 + 32]
	pfadd mm6, mm2
	pfadd mm7, mm3
	movq [edi + ecx*4 + 24], mm6
	movd [edi + ecx*4 + 32], mm7

	mov ebx, [ebp + i3130_fshift]	;# increment fshift force
	mov edx, [esp + i3130_is3]

	movq mm6, [ebx + edx*4]
	movd mm7, [ebx + edx*4 + 8]
	pfadd mm6, [esp + i3130_fixO]
	pfadd mm7, [esp + i3130_fizO]
	pfadd mm6, mm0
	pfadd mm7, mm1
	pfadd mm6, mm2
	pfadd mm7, mm3
	movq [ebx + edx*4], mm6
	movd [ebx + edx*4 + 8], mm7

	mov edx, [ebp + i3130_gid]	;# get group index for this i particle
	mov edx, [edx]
	add dword ptr [ebp + i3130_gid], 4	;# advance pointer

	movq mm7, [esp + i3130_vctot]
	pfacc mm7,mm7			;# get and sum the two parts of total potential

	mov eax, [ebp + i3130_Vc]
	movd mm6, [eax + edx*4]
	pfadd mm6, mm7
	movd [eax + edx*4], mm6		;# increment vc[gid]

	movq mm7, [esp + i3130_vnbtot]
	pfacc mm7,mm7			;# same for Vnb

	mov eax, [ebp + i3130_Vnb]
	movd mm6, [eax + edx*4]
	pfadd mm6, mm7
	movd [eax + edx*4], mm6		;# increment vnb[gid]
	;# finish if last
	dec dword ptr [ebp + i3130_nri]
	jz .i3130_end
	;# not last, iterate once more!
	jmp .i3130_outer
.i3130_end:
	femms
	add esp, 232			;# must match the sub esp in the prologue
	pop edi
	pop esi
	pop edx
	pop ecx
	pop ebx
	pop eax
	leave
	ret				;# return to caller instead of falling through
12523 .globl inl3300_3dnow
12524 .globl _inl3300_3dnow
12525 inl3300_3dnow:
12526 _inl3300_3dnow:
12527 .equiv i3300_nri, 8
12528 .equiv i3300_iinr, 12
12529 .equiv i3300_jindex, 16
12530 .equiv i3300_jjnr, 20
12531 .equiv i3300_shift, 24
12532 .equiv i3300_shiftvec, 28
12533 .equiv i3300_fshift, 32
12534 .equiv i3300_gid, 36
12535 .equiv i3300_pos, 40
12536 .equiv i3300_faction, 44
12537 .equiv i3300_charge, 48
12538 .equiv i3300_facel, 52
12539 .equiv i3300_Vc, 56
12540 .equiv i3300_type, 60
12541 .equiv i3300_ntype, 64
12542 .equiv i3300_nbfp, 68
12543 .equiv i3300_Vnb, 72
12544 .equiv i3300_tabscale, 76
12545 .equiv i3300_VFtab, 80
12546 ;# stack offsets for local variables
12547 .equiv i3300_is3, 0
12548 .equiv i3300_ii3, 4
12549 .equiv i3300_ix, 8
12550 .equiv i3300_iy, 12
12551 .equiv i3300_iz, 16
12552 .equiv i3300_iq, 20
12553 .equiv i3300_vctot, 28
12554 .equiv i3300_vnbtot, 36
12555 .equiv i3300_c6, 44
12556 .equiv i3300_c12, 52
12557 .equiv i3300_two, 60
12558 .equiv i3300_n1, 68
12559 .equiv i3300_tsc, 76
12560 .equiv i3300_ntia, 84
12561 .equiv i3300_innerjjnr, 88
12562 .equiv i3300_innerk, 92
12563 .equiv i3300_fix, 96
12564 .equiv i3300_fiy, 100
12565 .equiv i3300_fiz, 104
12566 .equiv i3300_dx1, 108
12567 .equiv i3300_dy1, 112
12568 .equiv i3300_dz1, 116
12569 .equiv i3300_dx2, 120
12570 .equiv i3300_dy2, 124
12571 .equiv i3300_dz2, 128
12572 push ebp
12573 mov ebp,esp
12574 push eax
12575 push ebx
12576 push ecx
12577 push edx
12578 push esi
12579 push edi
12580 sub esp, 132 ;# local stack space
12581 femms
12582 ;# move data to local stack
12583 movq mm0, [mm_two]
12584 movd mm3, [ebp + i3300_tabscale]
12585 movq [esp + i3300_two], mm0
12586 punpckldq mm3,mm3
12587 movq [esp + i3300_tsc], mm3
12588 ;# assume we have at least one i particle - start directly
12589 .i3300_outer:
12590 mov eax, [ebp + i3300_shift] ;# eax = pointer into shift[]
12591 mov ebx, [eax] ;# ebx=shift[n]
12592 add dword ptr [ebp + i3300_shift], 4 ;# advance pointer one step
12594 lea ebx, [ebx + ebx*2] ;# ebx=3*is
12595 mov [esp + i3300_is3],ebx ;# store is3
12597 mov eax, [ebp + i3300_shiftvec] ;# eax = base of shiftvec[]
12599 movq mm0, [eax + ebx*4] ;# move shX/shY to mm0 and shZ to mm1
12600 movd mm1, [eax + ebx*4 + 8]
12602 mov ecx, [ebp + i3300_iinr] ;# ecx = pointer into iinr[]
12603 add dword ptr [ebp + i3300_iinr], 4 ;# advance pointer
12604 mov ebx, [ecx] ;# ebx=ii
12606 mov edx, [ebp + i3300_charge]
12607 movd mm2, [edx + ebx*4] ;# mm2=charge[ii]
12608 pfmul mm2, [ebp + i3300_facel]
12609 punpckldq mm2,mm2 ;# spread to both halves
12610 movq [esp + i3300_iq], mm2 ;# iq =facel*charge[ii]
12612 mov edx, [ebp + i3300_type]
12613 mov edx, [edx + ebx*4]
12614 imul edx, [ebp + i3300_ntype]
12615 shl edx, 1
12616 mov [esp + i3300_ntia], edx
12618 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
12619 mov eax, [ebp + i3300_pos] ;# eax = base of pos[]
12621 pfadd mm0, [eax + ebx*4] ;# ix = shX + posX (and iy too)
12622 movd mm3, [eax + ebx*4 + 8] ;# cant use direct memory add for 4 bytes (iz)
12623 mov [esp + i3300_ii3], ebx
12624 pfadd mm1, mm3
12625 movq [esp + i3300_ix], mm0
12626 movd [esp + i3300_iz], mm1
12628 ;# clear total potential and i forces
12629 pxor mm7,mm7
12630 movq [esp + i3300_vctot], mm7
12631 movq [esp + i3300_vnbtot], mm7
12632 movq [esp + i3300_fix], mm7
12633 movd [esp + i3300_fiz], mm7
12635 mov eax, [ebp + i3300_jindex]
12636 mov ecx, [eax] ;# jindex[n]
12637 mov edx, [eax + 4] ;# jindex[n+1]
12638 add dword ptr [ebp + i3300_jindex], 4
12639 sub edx, ecx ;# number of innerloop atoms
12641 mov esi, [ebp + i3300_pos]
12642 mov edi, [ebp + i3300_faction]
12643 mov eax, [ebp + i3300_jjnr]
12644 shl ecx, 2
12645 add eax, ecx
12646 mov [esp + i3300_innerjjnr], eax ;# pointer to jjnr[nj0]
12647 sub edx, 2
12648 mov [esp + i3300_innerk], edx ;# number of innerloop atoms
12649 jge .i3300_unroll_loop
12650 jmp .i3300_finish_inner
;# Paired inner loop: every pass handles two j atoms (jnr1 in eax, jnr2 in ebx)
;# against the current i atom. The low half of each MMX register carries the
;# values for pair 1 and the high half those for pair 2.
;# Coulomb, dispersion and repulsion are all evaluated from VFtab, which is
;# read here as 12 floats per table point (4 coefficients per interaction).
;# edi is used as the faction base throughout and is not reloaded in the loop.
12651 .i3300_unroll_loop:
12652 ;# paired innerloop starts here
12653 mov ecx, [esp + i3300_innerjjnr] ;# pointer to jjnr[k]
12654 mov eax, [ecx]
12655 mov ebx, [ecx + 4] ;# eax/ebx=jnr
12656 add dword ptr [esp + i3300_innerjjnr], 8 ;# advance pointer (unrolled 2)
12657 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
;# qq = iq * charge[jnr] for both pairs
12659 mov ecx, [ebp + i3300_charge] ;# base of charge[]
12660 movq mm5, [esp + i3300_iq]
12661 movd mm3, [ecx + eax*4] ;# charge[jnr1]
12662 punpckldq mm3, [ecx + ebx*4] ;# move charge 2 to high part of mm3
12663 pfmul mm3,mm5 ;# mm3 now has qq for both particles
;# fetch the c6/c12 pair of each j atom from nbfp[ntia + 2*type[jnr]]
12665 mov ecx, [ebp + i3300_type]
12666 mov edx, [ecx + eax*4] ;# type [jnr1]
12667 mov ecx, [ecx + ebx*4] ;# type [jnr2]
12669 mov esi, [ebp + i3300_nbfp] ;# base of nbfp
12670 shl edx, 1
12671 shl ecx, 1
12672 add edx, [esp + i3300_ntia] ;# tja = ntia + 2*type
12673 add ecx, [esp + i3300_ntia]
12675 movq mm5, [esi + edx*4] ;# mm5 = 1st c6 / c12
12676 movq mm7, [esi + ecx*4] ;# mm7 = 2nd c6 / c12
12677 movq mm6,mm5
12678 punpckldq mm5,mm7 ;# mm5 = 1st c6 / 2nd c6
12679 punpckhdq mm6,mm7 ;# mm6 = 1st c12 / 2nd c12
12680 movq [esp + i3300_c6], mm5
12681 movq [esp + i3300_c12], mm6
12683 lea eax, [eax + eax*2] ;# replace jnr with j3
12684 lea ebx, [ebx + ebx*2]
;# esi was used for nbfp above, so reload it with the coordinate base
12686 mov esi, [ebp + i3300_pos]
12688 movq mm0, [esp + i3300_ix]
12689 movd mm1, [esp + i3300_iz]
12690 movq mm4, [esi + eax*4] ;# fetch first j coordinates
12691 movd mm5, [esi + eax*4 + 8]
12692 pfsubr mm4,mm0 ;# dr = ir - jr
12693 pfsubr mm5,mm1
12694 movq [esp + i3300_dx1], mm4 ;# store dr
12695 movd [esp + i3300_dz1], mm5
12696 pfmul mm4,mm4 ;# square dx,dy,dz
12697 pfmul mm5,mm5
12698 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
12699 pfacc mm4, mm5 ;# first rsq in lower mm4
12701 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
12702 movd mm7, [esi + ebx*4 + 8]
12704 pfsubr mm6,mm0 ;# dr = ir - jr
12705 pfsubr mm7,mm1
12706 movq [esp + i3300_dx2], mm6 ;# store dr
12707 movd [esp + i3300_dz2], mm7
12708 pfmul mm6,mm6 ;# square dx,dy,dz
12709 pfmul mm7,mm7
12710 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
12711 pfacc mm6, mm7 ;# second rsq in lower mm6
12713 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
12714 pfrsqrt mm1, mm6
12717 punpckldq mm0,mm1
12718 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs.
12719 movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision.
12720 pfmul mm0,mm0
12721 pfrsqit1 mm0,mm4
12722 pfrcpit2 mm0,mm2
;# r = rsq * invsqrt
12723 pfmul mm4, mm0
12724 movq mm1, mm4
12725 ;# mm0 is invsqrt, and mm1 r.
12726 ;# do potential and fscal
12727 pfmul mm1, [esp + i3300_tsc] ;# mm1=rt
12728 pf2iw mm4,mm1
;# keep both integer table indices on the stack so they can be loaded into ecx
12729 movq [esp + i3300_n1], mm4
12730 pi2fd mm4,mm4
12731 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
12733 movq mm2,mm1
12734 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
;# coulomb table: ecx = 12*n0, so [edx + ecx*4] is the entry for table point n0
12736 mov edx, [ebp + i3300_VFtab]
12737 mov ecx, [esp + i3300_n1]
12738 lea ecx, [ecx + ecx*2]
12739 shl ecx, 2
12740 ;# load all the table values we need
12741 movd mm4, [edx + ecx*4]
12742 movd mm5, [edx + ecx*4 + 4]
12743 movd mm6, [edx + ecx*4 + 8]
12744 movd mm7, [edx + ecx*4 + 12]
;# same four coefficients for the second pair go into the high halves
12745 mov ecx, [esp + i3300_n1 + 4]
12746 lea ecx, [ecx + ecx*2]
12747 shl ecx, 2
12748 punpckldq mm4, [edx + ecx*4]
12749 punpckldq mm5, [edx + ecx*4 + 4]
12750 punpckldq mm6, [edx + ecx*4 + 8]
12751 punpckldq mm7, [edx + ecx*4 + 12]
12753 pfmul mm6, mm1 ;# mm6 = Geps
12754 pfmul mm7, mm2 ;# mm7 = Heps2
12756 pfadd mm5, mm6
12757 pfadd mm5, mm7 ;# mm5 = Fp
12759 pfmul mm7, [esp + i3300_two] ;# two*Heps2
12760 pfadd mm7, mm6
12761 pfadd mm7, mm5 ;# mm7=FF
12763 pfmul mm5, mm1 ;# mm5=eps*Fp
12764 pfadd mm5, mm4 ;# mm5= VV
12766 pfmul mm5, mm3 ;# vcoul=qq*VV
12767 pfmul mm3, mm7 ;# fijC=FF*qq
12769 ;# at this point mm5 contains vcoul and mm3 fijC
12770 ;# increment vcoul - then we can get rid of mm5
12771 ;# update vctot
12772 pfadd mm5, [esp + i3300_vctot] ;# add the earlier value
12773 movq [esp + i3300_vctot], mm5 ;# store the sum
12775 ;# dispersion table (byte offsets 16..28 within the table entry)
12776 mov ecx, [esp + i3300_n1]
12777 lea ecx, [ecx + ecx*2]
12778 shl ecx, 2
12779 ;# load all the table values we need
12780 movd mm4, [edx + ecx*4 + 16]
12781 movd mm5, [edx + ecx*4 + 20]
12782 movd mm6, [edx + ecx*4 + 24]
12783 movd mm7, [edx + ecx*4 + 28]
12784 mov ecx, [esp + i3300_n1 + 4]
12785 lea ecx, [ecx + ecx*2]
12786 shl ecx, 2
12787 punpckldq mm4, [edx + ecx*4 + 16]
12788 punpckldq mm5, [edx + ecx*4 + 20]
12789 punpckldq mm6, [edx + ecx*4 + 24]
12790 punpckldq mm7, [edx + ecx*4 + 28]
12791 pfmul mm6, mm1 ;# mm6 = Geps
12792 pfmul mm7, mm2 ;# mm7 = Heps2
12793 pfadd mm5, mm6
12794 pfadd mm5, mm7 ;# mm5 = Fp
12795 pfmul mm7, [esp + i3300_two] ;# two*Heps2
12796 pfadd mm7, mm6
12797 pfadd mm7, mm5 ;# mm7=FF
12798 pfmul mm5, mm1 ;# mm5=eps*Fp
12799 pfadd mm5, mm4 ;# mm5= VV
12801 movq mm4, [esp + i3300_c6]
12802 pfmul mm7, mm4 ;# fijD
12803 pfmul mm5, mm4 ;# vnb6
12804 pfadd mm3, mm7 ;# add to fscal
12806 ;# update vnbtot to release mm5!
12807 pfadd mm5, [esp + i3300_vnbtot] ;# add the earlier value
12808 movq [esp + i3300_vnbtot], mm5 ;# store the sum
12810 ;# repulsion table (byte offsets 32..44 within the table entry)
12811 mov ecx, [esp + i3300_n1]
12812 lea ecx, [ecx + ecx*2]
12813 shl ecx, 2
12814 ;# load all the table values we need
12815 movd mm4, [edx + ecx*4 + 32]
12816 movd mm5, [edx + ecx*4 + 36]
12817 movd mm6, [edx + ecx*4 + 40]
12818 movd mm7, [edx + ecx*4 + 44]
12819 mov ecx, [esp + i3300_n1 + 4]
12820 lea ecx, [ecx + ecx*2]
12821 shl ecx, 2
12822 punpckldq mm4, [edx + ecx*4 + 32]
12823 punpckldq mm5, [edx + ecx*4 + 36]
12824 punpckldq mm6, [edx + ecx*4 + 40]
12825 punpckldq mm7, [edx + ecx*4 + 44]
12827 pfmul mm6, mm1 ;# mm6 = Geps
12828 pfmul mm7, mm2 ;# mm7 = Heps2
12829 pfadd mm5, mm6
12830 pfadd mm5, mm7 ;# mm5 = Fp
12831 pfmul mm7, [esp + i3300_two] ;# two*Heps2
12832 pfadd mm7, mm6
12833 pfadd mm7, mm5 ;# mm7=FF
12834 pfmul mm5, mm1 ;# mm5=eps*Fp
12835 pfadd mm5, mm4 ;# mm5= VV
12837 movq mm6, [esp + i3300_c12]
12838 pfmul mm7, mm6 ;# fijR
12839 pfmul mm5, mm6 ;# vnb12
12840 pfadd mm3, mm7 ;# total fscal fijC+ fijD+ fijR
12842 ;# change sign of mm3
12843 pxor mm1,mm1
12844 pfsub mm1, mm3
;# fscal = -(fijC+fijD+fijR) * tabscale * invsqrt
12845 pfmul mm0, [esp + i3300_tsc]
12846 pfmul mm0, mm1 ;# mm0 is total fscal now
12848 prefetchw [esp + i3300_dx1] ;# prefetch the stack line at dx1 for writing (original note said "i forces" - presumably the same cache line; confirm)
12850 ;# spread fscalar to both positions
12851 movq mm1,mm0
12852 punpckldq mm0,mm0
12853 punpckhdq mm1,mm1
12855 ;# calc vector force
12856 prefetchw [edi + eax*4] ;# prefetch the 1st faction to cache
12857 movq mm2, [esp + i3300_dx1] ;# fetch dr
12858 movd mm3, [esp + i3300_dz1]
12860 ;# update vnbtot with vnb12, which is still held in mm5
12861 pfadd mm5, [esp + i3300_vnbtot] ;# add the earlier value
12862 movq [esp + i3300_vnbtot], mm5 ;# store the sum
12864 prefetchw [edi + ebx*4] ;# prefetch the 2nd faction to cache
12865 pfmul mm2, mm0 ;# mult by fs
12866 pfmul mm3, mm0
12868 movq mm4, [esp + i3300_dx2] ;# fetch dr
12869 movd mm5, [esp + i3300_dz2]
12870 pfmul mm4, mm1 ;# mult by fs
12871 pfmul mm5, mm1
12872 ;# update i forces: both pair forces are added to the stack accumulators
12874 movq mm0, [esp + i3300_fix]
12875 movd mm1, [esp + i3300_fiz]
12876 pfadd mm0, mm2
12877 pfadd mm1, mm3
12879 pfadd mm0, mm4
12880 pfadd mm1, mm5
12881 movq [esp + i3300_fix], mm0
12882 movd [esp + i3300_fiz], mm1
12883 ;# update j forces: the same vectors are subtracted from faction[j3]
12885 movq mm0, [edi + eax*4]
12886 movd mm1, [edi + eax*4 + 8]
12887 movq mm6, [edi + ebx*4]
12888 movd mm7, [edi + ebx*4 + 8]
12890 pfsub mm0, mm2
12891 pfsub mm1, mm3
12892 pfsub mm6, mm4
12893 pfsub mm7, mm5
12895 movq [edi + eax*4], mm0
12896 movd [edi + eax*4 +8], mm1
12897 movq [edi + ebx*4], mm6
12898 movd [edi + ebx*4 + 8], mm7
12900 ;# should we do one more iteration?
;# innerk goes negative once fewer than two j atoms remain
12901 sub dword ptr [esp + i3300_innerk], 2
12902 jl .i3300_finish_inner
12903 jmp .i3300_unroll_loop
;# Decide whether one unpaired j atom is left. innerk is negative when control
;# arrives here (-1 or -2), and its lowest bit is set only when an odd atom
;# remains. Note that the and overwrites innerk in memory.
12904 .i3300_finish_inner:
12905 and dword ptr [esp + i3300_innerk], 1
12906 jnz .i3300_single_inner
12907 jmp .i3300_updateouterdata
;# Single-atom version of the paired loop for the last unpaired j atom.
;# Only the low halves of the MMX registers carry meaningful data here.
;# The table index in ecx is computed once and reused for all three tables.
12908 .i3300_single_inner:
12909 ;# a single j particle iteration here - compare with the unrolled code for comments.
12910 mov eax, [esp + i3300_innerjjnr]
12911 mov eax, [eax] ;# eax=jnr offset
12913 mov ecx, [ebp + i3300_charge]
12914 movd mm5, [esp + i3300_iq]
12915 movd mm3, [ecx + eax*4]
12916 pfmul mm3, mm5 ;# mm3=qq
12918 mov esi, [ebp + i3300_nbfp]
12919 mov ecx, [ebp + i3300_type]
12920 mov edx, [ecx + eax*4] ;# type [jnr1]
12921 shl edx, 1
12922 add edx, [esp + i3300_ntia] ;# tja = ntia + 2*type
12923 movd mm5, [esi + edx*4] ;# mm5 = 1st c6
12924 movq [esp + i3300_c6], mm5
12925 movd mm5, [esi + edx*4 + 4] ;# mm5 = 1st c12
12926 movq [esp + i3300_c12], mm5
12928 mov esi, [ebp + i3300_pos]
12929 lea eax, [eax + eax*2]
12931 movq mm0, [esp + i3300_ix]
12932 movd mm1, [esp + i3300_iz]
12933 movq mm4, [esi + eax*4]
12934 movd mm5, [esi + eax*4 + 8]
12935 pfsubr mm4, mm0
12936 pfsubr mm5, mm1
12937 movq [esp + i3300_dx1], mm4
12938 pfmul mm4,mm4
12939 movd [esp + i3300_dz1], mm5
12940 pfmul mm5,mm5
12941 pfacc mm4, mm5
12942 pfacc mm4, mm5 ;# mm4=rsq (low half)
12944 pfrsqrt mm0,mm4
12945 movq mm2,mm0
12946 pfmul mm0,mm0
12947 pfrsqit1 mm0,mm4
12948 pfrcpit2 mm0,mm2 ;# mm0=invsqrt
12949 pfmul mm4, mm0
12950 movq mm1, mm4
12951 ;# mm0 is invsqrt, and mm1 r.
12953 ;# calculate potentials and scalar force
12954 pfmul mm1, [esp + i3300_tsc] ;# mm1=rt
12955 pf2iw mm4,mm1
12956 movd [esp + i3300_n1], mm4
12957 pi2fd mm4,mm4
12958 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
12960 movq mm2,mm1
12961 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
12963 ;# coulomb table
12964 mov edx, [ebp + i3300_VFtab]
12965 mov ecx, [esp + i3300_n1]
12966 lea ecx, [ecx + ecx*2]
12967 shl ecx, 2
12968 ;# load all the table values we need
12969 movd mm4, [edx + ecx*4]
12970 movd mm5, [edx + ecx*4 + 4]
12971 movd mm6, [edx + ecx*4 + 8]
12972 movd mm7, [edx + ecx*4 + 12]
12974 pfmul mm6, mm1 ;# mm6 = Geps
12975 pfmul mm7, mm2 ;# mm7 = Heps2
12977 pfadd mm5, mm6
12978 pfadd mm5, mm7 ;# mm5 = Fp
12980 pfmul mm7, [esp + i3300_two] ;# two*Heps2
12981 pfadd mm7, mm6
12982 pfadd mm7, mm5 ;# mm7=FF
12984 pfmul mm5, mm1 ;# mm5=eps*Fp
12985 pfadd mm5, mm4 ;# mm5= VV
12987 pfmul mm5, mm3 ;# vcoul=qq*VV
12988 pfmul mm3, mm7 ;# fijC=FF*qq
12990 ;# at this point mm5 contains vcoul and mm3 fijC
12991 ;# increment vcoul - then we can get rid of mm5
12992 ;# update vctot
12993 pfadd mm5, [esp + i3300_vctot] ;# add the earlier value
12994 movq [esp + i3300_vctot], mm5 ;# store the sum
12996 ;# dispersion table (ecx still holds the index computed for the coulomb table)
12997 ;# load all the table values we need
12998 movd mm4, [edx + ecx*4 + 16]
12999 movd mm5, [edx + ecx*4 + 20]
13000 movd mm6, [edx + ecx*4 + 24]
13001 movd mm7, [edx + ecx*4 + 28]
13002 pfmul mm6, mm1 ;# mm6 = Geps
13003 pfmul mm7, mm2 ;# mm7 = Heps2
13004 pfadd mm5, mm6
13005 pfadd mm5, mm7 ;# mm5 = Fp
13006 pfmul mm7, [esp + i3300_two] ;# two*Heps2
13007 pfadd mm7, mm6
13008 pfadd mm7, mm5 ;# mm7=FF
13009 pfmul mm5, mm1 ;# mm5=eps*Fp
13010 pfadd mm5, mm4 ;# mm5= VV
13012 movq mm4, [esp + i3300_c6]
13013 pfmul mm7, mm4 ;# fijD
13014 pfmul mm5, mm4 ;# vnb6
13015 pfadd mm3, mm7 ;# add to fscal
13017 ;# update vnbtot to release mm5!
13018 pfadd mm5, [esp + i3300_vnbtot] ;# add the earlier value
13019 movq [esp + i3300_vnbtot], mm5 ;# store the sum
13021 ;# repulsion table
13022 ;# load all the table values we need
13023 movd mm4, [edx + ecx*4 + 32]
13024 movd mm5, [edx + ecx*4 + 36]
13025 movd mm6, [edx + ecx*4 + 40]
13026 movd mm7, [edx + ecx*4 + 44]
13028 pfmul mm6, mm1 ;# mm6 = Geps
13029 pfmul mm7, mm2 ;# mm7 = Heps2
13030 pfadd mm5, mm6
13031 pfadd mm5, mm7 ;# mm5 = Fp
13032 pfmul mm7, [esp + i3300_two] ;# two*Heps2
13033 pfadd mm7, mm6
13034 pfadd mm7, mm5 ;# mm7=FF
13035 pfmul mm5, mm1 ;# mm5=eps*Fp
13036 pfadd mm5, mm4 ;# mm5= VV
13038 movq mm6, [esp + i3300_c12]
13039 pfmul mm7, mm6 ;# fijR
13040 pfmul mm5, mm6 ;# vnb12
13041 pfadd mm3, mm7 ;# total fscal fijC+ fijD+ fijR
13043 ;# change sign of mm3
13044 pxor mm1,mm1
13045 pfsub mm1, mm3
13046 pfmul mm0, [esp + i3300_tsc]
13047 pfmul mm0, mm1 ;# mm0 is total fscal now
13049 ;# update vnbtot
13050 pfadd mm5, [esp + i3300_vnbtot] ;# add the earlier value
13051 movq [esp + i3300_vnbtot], mm5 ;# store the sum
13053 ;# spread fscalar to both positions
13054 punpckldq mm0,mm0
13055 ;# calc vectorial force
13056 prefetchw [edi + eax*4] ;# prefetch faction to cache
13057 movq mm2, [esp + i3300_dx1]
13058 movd mm3, [esp + i3300_dz1]
13061 pfmul mm2, mm0
13062 pfmul mm3, mm0
13064 ;# update i particle force
13065 movq mm0, [esp + i3300_fix]
13066 movd mm1, [esp + i3300_fiz]
13067 pfadd mm0, mm2
13068 pfadd mm1, mm3
13069 movq [esp + i3300_fix], mm0
13070 movd [esp + i3300_fiz], mm1
13071 ;# update j particle force
13072 movq mm0, [edi + eax*4]
13073 movd mm1, [edi + eax *4+ 8]
13074 pfsub mm0, mm2
13075 pfsub mm1, mm3
13076 movq [edi + eax*4], mm0
13077 movd [edi + eax*4 +8], mm1
13078 ;# done!
;# End of the j list for this i atom: add the accumulated i force to
;# faction[ii3] and fshift[is3], add the potentials to Vc[gid] and Vnb[gid],
;# then either loop back for the next i atom or leave.
13079 .i3300_updateouterdata:
13080 mov ecx, [esp + i3300_ii3]
13082 movq mm6, [edi + ecx*4] ;# increment i force
13083 movd mm7, [edi + ecx*4 + 8]
13084 pfadd mm6, [esp + i3300_fix]
;# 64-bit memory operand: only the low half (fiz) is stored by the movd below
13085 pfadd mm7, [esp + i3300_fiz]
13086 movq [edi + ecx*4], mm6
13087 movd [edi + ecx*4 +8], mm7
13089 mov ebx, [ebp + i3300_fshift] ;# increment fshift force
13090 mov edx, [esp + i3300_is3]
13092 movq mm6, [ebx + edx*4]
13093 movd mm7, [ebx + edx*4 + 8]
13094 pfadd mm6, [esp + i3300_fix]
13095 pfadd mm7, [esp + i3300_fiz]
13096 movq [ebx + edx*4], mm6
13097 movd [ebx + edx*4 + 8], mm7
13099 mov edx, [ebp + i3300_gid] ;# get group index for this i particle
13100 mov edx, [edx]
13101 add dword ptr [ebp + i3300_gid], 4 ;# advance pointer
13103 movq mm7, [esp + i3300_vctot]
13104 pfacc mm7,mm7 ;# get and sum the two parts of total potential
13106 mov eax, [ebp + i3300_Vc]
13107 movd mm6, [eax + edx*4]
13108 pfadd mm6, mm7
13109 movd [eax + edx*4], mm6 ;# increment vc[gid]
13111 movq mm7, [esp + i3300_vnbtot]
13112 pfacc mm7,mm7 ;# get and sum the two parts of total potential
13114 mov eax, [ebp + i3300_Vnb]
13115 movd mm6, [eax + edx*4]
13116 pfadd mm6, mm7
13117 movd [eax + edx*4], mm6 ;# increment vnb[gid]
13119 ;# finish if last (the nri argument slot is used as the countdown)
13120 mov ecx, [ebp + i3300_nri]
13121 dec ecx
13122 jecxz .i3300_end
13123 ;# not last, iterate once more!
13124 mov [ebp + i3300_nri], ecx
13125 jmp .i3300_outer
;# Epilogue: leave MMX/3DNow state, release the 132 bytes of locals and
;# restore the saved registers in reverse order.
;# NOTE(review): no ret is visible after leave in this listing - confirm that
;# the routine returns before the next .globl.
13126 .i3300_end:
13127 femms
13128 add esp, 132
13129 pop edi
13130 pop esi
13131 pop edx
13132 pop ecx
13133 pop ebx
13134 pop eax
13135 leave
;# inl3310_3dnow: entry point, exported with and without a leading underscore.
;# The .equiv symbols below name the argument offsets (relative to ebp) and the
;# local variable offsets (relative to esp) used by the i3310 code.
;# The prologue saves eax, ebx, ecx, edx, esi and edi and reserves 168 bytes.
13142 .globl inl3310_3dnow
13143 .globl _inl3310_3dnow
13144 inl3310_3dnow:
13145 _inl3310_3dnow:
;# argument offsets from ebp (the first argument is at ebp+8)
13146 .equiv i3310_nri, 8
13147 .equiv i3310_iinr, 12
13148 .equiv i3310_jindex, 16
13149 .equiv i3310_jjnr, 20
13150 .equiv i3310_shift, 24
13151 .equiv i3310_shiftvec, 28
13152 .equiv i3310_fshift, 32
13153 .equiv i3310_gid, 36
13154 .equiv i3310_pos, 40
13155 .equiv i3310_faction, 44
13156 .equiv i3310_charge, 48
13157 .equiv i3310_facel, 52
13158 .equiv i3310_Vc, 56
13159 .equiv i3310_type, 60
13160 .equiv i3310_ntype, 64
13161 .equiv i3310_nbfp, 68
13162 .equiv i3310_Vnb, 72
13163 .equiv i3310_tabscale, 76
13164 .equiv i3310_VFtab, 80
13165 .equiv i3310_nsatoms, 84
13166 ;# stack offsets for local variables
;# slots from iq through tsc are 8 bytes apart and are accessed with movq
13167 .equiv i3310_is3, 0
13168 .equiv i3310_ii3, 4
13169 .equiv i3310_shX, 8
13170 .equiv i3310_shY, 12
13171 .equiv i3310_shZ, 16
13172 .equiv i3310_ix, 20
13173 .equiv i3310_iy, 24
13174 .equiv i3310_iz, 28
13175 .equiv i3310_iq, 32
13176 .equiv i3310_vctot, 40
13177 .equiv i3310_vnbtot, 48
13178 .equiv i3310_c6, 56
13179 .equiv i3310_c12, 64
13180 .equiv i3310_two, 72
13181 .equiv i3310_n1, 80
13182 .equiv i3310_tsc, 88
13183 .equiv i3310_ntia, 96
13184 .equiv i3310_innerjjnr0, 100
13185 .equiv i3310_innerk0, 104
13186 .equiv i3310_innerjjnr, 108
13187 .equiv i3310_innerk, 112
13188 .equiv i3310_fix, 116
13189 .equiv i3310_fiy, 120
13190 .equiv i3310_fiz, 124
13191 .equiv i3310_dx1, 128
13192 .equiv i3310_dy1, 132
13193 .equiv i3310_dz1, 136
13194 .equiv i3310_dx2, 140
13195 .equiv i3310_dy2, 144
13196 .equiv i3310_dz2, 148
13197 .equiv i3310_nsvdwc, 152
13198 .equiv i3310_nscoul, 156
13199 .equiv i3310_nsvdw, 160
13200 .equiv i3310_solnr, 164
13201 push ebp
13202 mov ebp,esp
13203 push eax
13204 push ebx
13205 push ecx
13206 push edx
13207 push esi
13208 push edi
13209 sub esp, 168 ;# local stack space
13210 femms
;# mm_two is defined outside this section; presumably a packed pair of 2.0 - confirm
13211 movq mm0, [mm_two]
13212 movd mm3, [ebp + i3310_tabscale]
13213 movq [esp + i3310_two], mm0
;# duplicate tabscale into both halves so it can scale two pairs at once
13214 punpckldq mm3,mm3
13215 movq [esp + i3310_tsc], mm3
13216 ;# assume we have at least one i particle - start directly
;# Outer loop body, run once per entry of the i list: fetch the shift vector,
;# the first atom index and the three nsatoms counts, clear the potentials and
;# set up the j-list pointer and length shared by the per-atom loops below.
13217 .i3310_outer:
13218 mov eax, [ebp + i3310_shift] ;# eax = pointer into shift[]
13219 mov ebx, [eax] ;# ebx=shift[n]
13220 add dword ptr [ebp + i3310_shift], 4 ;# advance pointer one step
13222 lea ebx, [ebx + ebx*2] ;# ebx=3*is
13223 mov [esp + i3310_is3],ebx ;# store is3
13225 mov eax, [ebp + i3310_shiftvec] ;# eax = base of shiftvec[]
13227 movq mm0, [eax + ebx*4] ;# move shX/shY to mm0 and shZ to mm1
13228 movd mm1, [eax + ebx*4 + 8]
13229 movq [esp + i3310_shX], mm0
13230 movd [esp + i3310_shZ], mm1
13232 mov ecx, [ebp + i3310_iinr] ;# ecx = pointer into iinr[]
13233 add dword ptr [ebp + i3310_iinr], 4 ;# advance pointer
13234 mov ebx, [ecx] ;# ebx=ii
;# read three consecutive ints from nsatoms and advance the pointer past them
13236 mov eax, [ebp + i3310_nsatoms]
13237 add dword ptr [ebp + i3310_nsatoms], 12
13238 mov ecx, [eax]
13239 mov edx, [eax + 4]
13240 mov eax, [eax + 8]
;# ecx = nsatoms[0] - nsatoms[2], eax = nsatoms[2] - nsatoms[1]
13241 sub ecx, eax
13242 sub eax, edx
;# nsvdwc = nsatoms[1]; nscoul and nsvdw get the two differences
;# (presumably atoms with both interactions, coulomb only, vdw only - confirm)
13244 mov [esp + i3310_nsvdwc], edx
13245 mov [esp + i3310_nscoul], eax
13246 mov [esp + i3310_nsvdw], ecx
13248 ;# clear potential
13249 pxor mm7,mm7
13250 movq [esp + i3310_vctot], mm7
13251 movq [esp + i3310_vnbtot], mm7
;# solnr starts at ii and is incremented once per atom handled below
13252 mov [esp + i3310_solnr], ebx
13254 mov eax, [ebp + i3310_jindex]
13255 mov ecx, [eax] ;# jindex[n]
13256 mov edx, [eax + 4] ;# jindex[n+1]
13257 add dword ptr [ebp + i3310_jindex], 4
13258 sub edx, ecx ;# number of innerloop atoms
13259 mov eax, [ebp + i3310_jjnr]
13260 shl ecx, 2
13261 add eax, ecx
13262 mov [esp + i3310_innerjjnr0], eax ;# pointer to jjnr[nj0]
13264 mov [esp + i3310_innerk0], edx ;# number of innerloop atoms
13265 mov esi, [ebp + i3310_pos]
13266 mov edi, [ebp + i3310_faction]
;# skip the vdw+coulomb atom loop when nsvdwc is zero
13268 mov ecx, [esp + i3310_nsvdwc]
13269 cmp ecx, 0
13270 jnz .i3310_mno_vdwc
13271 jmp .i3310_testcoul
;# Per-atom setup for the loop that computes both coulomb and vdw terms:
;# take the next atom index from solnr, compute iq, ntia and the shifted i
;# coordinates, clear the force accumulators and rewind the j list.
13272 .i3310_mno_vdwc:
13273 mov ebx, [esp + i3310_solnr]
13274 inc dword ptr [esp + i3310_solnr]
13275 mov edx, [ebp + i3310_charge]
13276 movd mm2, [edx + ebx*4] ;# mm2=charge[ii]
;# 64-bit memory operand: only the low half (facel) is kept by the unpack below
13277 pfmul mm2, [ebp + i3310_facel]
13278 punpckldq mm2,mm2 ;# spread to both halves
13279 movq [esp + i3310_iq], mm2 ;# iq =facel*charge[ii]
;# ntia = 2 * ntype * type[ii]
13281 mov edx, [ebp + i3310_type]
13282 mov edx, [edx + ebx*4]
13283 imul edx, [ebp + i3310_ntype]
13284 shl edx, 1
13285 mov [esp + i3310_ntia], edx
13287 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
13288 mov eax, [ebp + i3310_pos] ;# eax = base of pos[]
13289 mov [esp + i3310_ii3], ebx
;# i coordinates = pos[ii3] + shift vector
13291 movq mm0, [eax + ebx*4]
13292 movd mm1, [eax + ebx*4 + 8]
13293 pfadd mm0, [esp + i3310_shX]
13294 pfadd mm1, [esp + i3310_shZ]
13295 movq [esp + i3310_ix], mm0
13296 movd [esp + i3310_iz], mm1
13298 ;# clear forces
13299 pxor mm7,mm7
13300 movq [esp + i3310_fix], mm7
13301 movd [esp + i3310_fiz], mm7
;# restart the j list from its saved beginning for this atom
13303 mov ecx, [esp + i3310_innerjjnr0]
13304 mov [esp + i3310_innerjjnr], ecx
13305 mov edx, [esp + i3310_innerk0]
13306 sub edx, 2
13307 mov [esp + i3310_innerk], edx ;# innerk = number of innerloop atoms - 2
;# jge tests the flags of the sub above; the mov in between leaves them intact
13308 jge .i3310_unroll_vdwc_loop
13309 jmp .i3310_finish_vdwc_inner
;# Paired inner loop (coulomb + vdw): every pass handles two j atoms (jnr1 in
;# eax, jnr2 in ebx) against the current i atom. The low half of each MMX
;# register carries pair 1 and the high half pair 2.
;# All three interactions are evaluated from VFtab, which is read here as
;# 12 floats per table point. edi is used as the faction base throughout.
13310 .i3310_unroll_vdwc_loop:
13311 ;# paired innerloop starts here
13312 mov ecx, [esp + i3310_innerjjnr] ;# pointer to jjnr[k]
13313 mov eax, [ecx]
13314 mov ebx, [ecx + 4] ;# eax/ebx=jnr
13315 add dword ptr [esp + i3310_innerjjnr], 8 ;# advance pointer (unrolled 2)
13316 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
;# qq = iq * charge[jnr] for both pairs
13318 mov ecx, [ebp + i3310_charge] ;# base of charge[]
13319 movq mm5, [esp + i3310_iq]
13320 movd mm3, [ecx + eax*4] ;# charge[jnr1]
13321 punpckldq mm3, [ecx + ebx*4] ;# move charge 2 to high part of mm3
13322 pfmul mm3,mm5 ;# mm3 now has qq for both particles
;# fetch the c6/c12 pair of each j atom from nbfp[ntia + 2*type[jnr]]
13324 mov ecx, [ebp + i3310_type]
13325 mov edx, [ecx + eax*4] ;# type [jnr1]
13326 mov ecx, [ecx + ebx*4] ;# type [jnr2]
13328 mov esi, [ebp + i3310_nbfp] ;# base of nbfp
13329 shl edx, 1
13330 shl ecx, 1
13331 add edx, [esp + i3310_ntia] ;# tja = ntia + 2*type
13332 add ecx, [esp + i3310_ntia]
13334 movq mm5, [esi + edx*4] ;# mm5 = 1st c6 / c12
13335 movq mm7, [esi + ecx*4] ;# mm7 = 2nd c6 / c12
13336 movq mm6,mm5
13337 punpckldq mm5,mm7 ;# mm5 = 1st c6 / 2nd c6
13338 punpckhdq mm6,mm7 ;# mm6 = 1st c12 / 2nd c12
13339 movq [esp + i3310_c6], mm5
13340 movq [esp + i3310_c12], mm6
13342 lea eax, [eax + eax*2] ;# replace jnr with j3
13343 lea ebx, [ebx + ebx*2]
;# esi was used for nbfp above, so reload it with the coordinate base
13345 mov esi, [ebp + i3310_pos]
13347 movq mm0, [esp + i3310_ix]
13348 movd mm1, [esp + i3310_iz]
13349 movq mm4, [esi + eax*4] ;# fetch first j coordinates
13350 movd mm5, [esi + eax*4 + 8]
13351 pfsubr mm4,mm0 ;# dr = ir - jr
13352 pfsubr mm5,mm1
13353 movq [esp + i3310_dx1], mm4 ;# store dr
13354 movd [esp + i3310_dz1], mm5
13355 pfmul mm4,mm4 ;# square dx,dy,dz
13356 pfmul mm5,mm5
13357 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
13358 pfacc mm4, mm5 ;# first rsq in lower mm4
13360 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
13361 movd mm7, [esi + ebx*4 + 8]
13363 pfsubr mm6,mm0 ;# dr = ir - jr
13364 pfsubr mm7,mm1
13365 movq [esp + i3310_dx2], mm6 ;# store dr
13366 movd [esp + i3310_dz2], mm7
13367 pfmul mm6,mm6 ;# square dx,dy,dz
13368 pfmul mm7,mm7
13369 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
13370 pfacc mm6, mm7 ;# second rsq in lower mm6
13372 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
13373 pfrsqrt mm1, mm6
13376 punpckldq mm0,mm1
13377 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs.
13378 movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision.
13379 pfmul mm0,mm0
13380 pfrsqit1 mm0,mm4
13381 pfrcpit2 mm0,mm2
;# r = rsq * invsqrt
13382 pfmul mm4, mm0
13383 movq mm1, mm4
13384 ;# mm0 is invsqrt, and mm1 r.
13385 ;# do potential and fscal
13386 pfmul mm1, [esp + i3310_tsc] ;# mm1=rt
13387 pf2iw mm4,mm1
;# keep both integer table indices on the stack so they can be loaded into ecx
13388 movq [esp + i3310_n1], mm4
13389 pi2fd mm4,mm4
13390 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
13392 movq mm2,mm1
13393 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
;# coulomb table: ecx = 12*n0, so [edx + ecx*4] is the entry for table point n0
13395 mov edx, [ebp + i3310_VFtab]
13396 mov ecx, [esp + i3310_n1]
13397 lea ecx, [ecx + ecx*2]
13398 shl ecx, 2
13399 ;# load all the table values we need
13400 movd mm4, [edx + ecx*4]
13401 movd mm5, [edx + ecx*4 + 4]
13402 movd mm6, [edx + ecx*4 + 8]
13403 movd mm7, [edx + ecx*4 + 12]
;# same four coefficients for the second pair go into the high halves
13404 mov ecx, [esp + i3310_n1 + 4]
13405 lea ecx, [ecx + ecx*2]
13406 shl ecx, 2
13407 punpckldq mm4, [edx + ecx*4]
13408 punpckldq mm5, [edx + ecx*4 + 4]
13409 punpckldq mm6, [edx + ecx*4 + 8]
13410 punpckldq mm7, [edx + ecx*4 + 12]
13412 pfmul mm6, mm1 ;# mm6 = Geps
13413 pfmul mm7, mm2 ;# mm7 = Heps2
13415 pfadd mm5, mm6
13416 pfadd mm5, mm7 ;# mm5 = Fp
13418 pfmul mm7, [esp + i3310_two] ;# two*Heps2
13419 pfadd mm7, mm6
13420 pfadd mm7, mm5 ;# mm7=FF
13422 pfmul mm5, mm1 ;# mm5=eps*Fp
13423 pfadd mm5, mm4 ;# mm5= VV
13425 pfmul mm5, mm3 ;# vcoul=qq*VV
13426 pfmul mm3, mm7 ;# fijC=FF*qq
13428 ;# at this point mm5 contains vcoul and mm3 fijC
13429 ;# increment vcoul - then we can get rid of mm5
13430 ;# update vctot
13431 pfadd mm5, [esp + i3310_vctot] ;# add the earlier value
13432 movq [esp + i3310_vctot], mm5 ;# store the sum
13434 ;# dispersion table (byte offsets 16..28 within the table entry)
13435 mov ecx, [esp + i3310_n1]
13436 lea ecx, [ecx + ecx*2]
13437 shl ecx, 2
13438 ;# load all the table values we need
13439 movd mm4, [edx + ecx*4 + 16]
13440 movd mm5, [edx + ecx*4 + 20]
13441 movd mm6, [edx + ecx*4 + 24]
13442 movd mm7, [edx + ecx*4 + 28]
13443 mov ecx, [esp + i3310_n1 + 4]
13444 lea ecx, [ecx + ecx*2]
13445 shl ecx, 2
13446 punpckldq mm4, [edx + ecx*4 + 16]
13447 punpckldq mm5, [edx + ecx*4 + 20]
13448 punpckldq mm6, [edx + ecx*4 + 24]
13449 punpckldq mm7, [edx + ecx*4 + 28]
13450 pfmul mm6, mm1 ;# mm6 = Geps
13451 pfmul mm7, mm2 ;# mm7 = Heps2
13452 pfadd mm5, mm6
13453 pfadd mm5, mm7 ;# mm5 = Fp
13454 pfmul mm7, [esp + i3310_two] ;# two*Heps2
13455 pfadd mm7, mm6
13456 pfadd mm7, mm5 ;# mm7=FF
13457 pfmul mm5, mm1 ;# mm5=eps*Fp
13458 pfadd mm5, mm4 ;# mm5= VV
13460 movq mm4, [esp + i3310_c6]
13461 pfmul mm7, mm4 ;# fijD
13462 pfmul mm5, mm4 ;# vnb6
13463 pfadd mm3, mm7 ;# add to fscal
13465 ;# update vnbtot to release mm5!
13466 pfadd mm5, [esp + i3310_vnbtot] ;# add the earlier value
13467 movq [esp + i3310_vnbtot], mm5 ;# store the sum
13469 ;# repulsion table (byte offsets 32..44 within the table entry)
13470 mov ecx, [esp + i3310_n1]
13471 lea ecx, [ecx + ecx*2]
13472 shl ecx, 2
13473 ;# load all the table values we need
13474 movd mm4, [edx + ecx*4 + 32]
13475 movd mm5, [edx + ecx*4 + 36]
13476 movd mm6, [edx + ecx*4 + 40]
13477 movd mm7, [edx + ecx*4 + 44]
13478 mov ecx, [esp + i3310_n1 + 4]
13479 lea ecx, [ecx + ecx*2]
13480 shl ecx, 2
13481 punpckldq mm4, [edx + ecx*4 + 32]
13482 punpckldq mm5, [edx + ecx*4 + 36]
13483 punpckldq mm6, [edx + ecx*4 + 40]
13484 punpckldq mm7, [edx + ecx*4 + 44]
13486 pfmul mm6, mm1 ;# mm6 = Geps
13487 pfmul mm7, mm2 ;# mm7 = Heps2
13488 pfadd mm5, mm6
13489 pfadd mm5, mm7 ;# mm5 = Fp
13490 pfmul mm7, [esp + i3310_two] ;# two*Heps2
13491 pfadd mm7, mm6
13492 pfadd mm7, mm5 ;# mm7=FF
13493 pfmul mm5, mm1 ;# mm5=eps*Fp
13494 pfadd mm5, mm4 ;# mm5= VV
13496 movq mm6, [esp + i3310_c12]
13497 pfmul mm7, mm6 ;# fijR
13498 pfmul mm5, mm6 ;# vnb12
13499 pfadd mm3, mm7 ;# total fscal fijC+ fijD+ fijR
13501 ;# change sign of mm3
13502 pxor mm1,mm1
13503 pfsub mm1, mm3
;# fscal = -(fijC+fijD+fijR) * tabscale * invsqrt
13504 pfmul mm0, [esp + i3310_tsc]
13505 pfmul mm0, mm1 ;# mm0 is total fscal now
13507 prefetchw [esp + i3310_dx1] ;# prefetch the stack line at dx1 for writing (original note said "i forces" - presumably the same cache line; confirm)
13509 ;# spread fscalar to both positions
13510 movq mm1,mm0
13511 punpckldq mm0,mm0
13512 punpckhdq mm1,mm1
13514 ;# calc vector force
13515 prefetchw [edi + eax*4] ;# prefetch the 1st faction to cache
13516 movq mm2, [esp + i3310_dx1] ;# fetch dr
13517 movd mm3, [esp + i3310_dz1]
13519 ;# update vnbtot with vnb12, which is still held in mm5
13520 pfadd mm5, [esp + i3310_vnbtot] ;# add the earlier value
13521 movq [esp + i3310_vnbtot], mm5 ;# store the sum
13523 prefetchw [edi + ebx*4] ;# prefetch the 2nd faction to cache
13524 pfmul mm2, mm0 ;# mult by fs
13525 pfmul mm3, mm0
13527 movq mm4, [esp + i3310_dx2] ;# fetch dr
13528 movd mm5, [esp + i3310_dz2]
13529 pfmul mm4, mm1 ;# mult by fs
13530 pfmul mm5, mm1
13531 ;# update i forces: both pair forces are added to the stack accumulators
13533 movq mm0, [esp + i3310_fix]
13534 movd mm1, [esp + i3310_fiz]
13535 pfadd mm0, mm2
13536 pfadd mm1, mm3
13538 pfadd mm0, mm4
13539 pfadd mm1, mm5
13540 movq [esp + i3310_fix], mm0
13541 movd [esp + i3310_fiz], mm1
13542 ;# update j forces: the same vectors are subtracted from faction[j3]
13544 movq mm0, [edi + eax*4]
13545 movd mm1, [edi + eax*4 + 8]
13546 movq mm6, [edi + ebx*4]
13547 movd mm7, [edi + ebx*4 + 8]
13549 pfsub mm0, mm2
13550 pfsub mm1, mm3
13551 pfsub mm6, mm4
13552 pfsub mm7, mm5
13554 movq [edi + eax*4], mm0
13555 movd [edi + eax*4 +8], mm1
13556 movq [edi + ebx*4], mm6
13557 movd [edi + ebx*4 + 8], mm7
13559 ;# should we do one more iteration?
;# innerk goes negative once fewer than two j atoms remain
13560 sub dword ptr [esp + i3310_innerk], 2
13561 jl .i3310_finish_vdwc_inner
13562 jmp .i3310_unroll_vdwc_loop
;# Decide whether one unpaired j atom is left. innerk is negative when control
;# arrives here (-1 or -2), and its lowest bit is set only when an odd atom
;# remains. Note that the and overwrites innerk in memory.
13563 .i3310_finish_vdwc_inner:
13564 and dword ptr [esp + i3310_innerk], 1
13565 jnz .i3310_single_vdwc_inner
13566 jmp .i3310_updateouterdata_vdwc
;# Single-atom version of the paired vdw+coulomb loop for the last unpaired
;# j atom. Only the low halves of the MMX registers carry meaningful data.
;# The table index in ecx is computed once and reused for all three tables.
13567 .i3310_single_vdwc_inner:
13568 ;# a single j particle iteration here - compare with the unrolled code for comments.
13569 mov eax, [esp + i3310_innerjjnr]
13570 mov eax, [eax] ;# eax=jnr offset
13572 mov ecx, [ebp + i3310_charge]
13573 movd mm5, [esp + i3310_iq]
13574 movd mm3, [ecx + eax*4]
13575 pfmul mm3, mm5 ;# mm3=qq
13577 mov esi, [ebp + i3310_nbfp]
13578 mov ecx, [ebp + i3310_type]
13579 mov edx, [ecx + eax*4] ;# type [jnr1]
13580 shl edx, 1
13581 add edx, [esp + i3310_ntia] ;# tja = ntia + 2*type
13582 movd mm5, [esi + edx*4] ;# mm5 = 1st c6
13583 movq [esp + i3310_c6], mm5
13584 movd mm5, [esi + edx*4 + 4] ;# mm5 = 1st c12
13585 movq [esp + i3310_c12], mm5
13587 mov esi, [ebp + i3310_pos]
13588 lea eax, [eax + eax*2]
13590 movq mm0, [esp + i3310_ix]
13591 movd mm1, [esp + i3310_iz]
13592 movq mm4, [esi + eax*4]
13593 movd mm5, [esi + eax*4 + 8]
13594 pfsubr mm4, mm0
13595 pfsubr mm5, mm1
13596 movq [esp + i3310_dx1], mm4
13597 pfmul mm4,mm4
13598 movd [esp + i3310_dz1], mm5
13599 pfmul mm5,mm5
13600 pfacc mm4, mm5
13601 pfacc mm4, mm5 ;# mm4=rsq (low half)
13603 pfrsqrt mm0,mm4
13604 movq mm2,mm0
13605 pfmul mm0,mm0
13606 pfrsqit1 mm0,mm4
13607 pfrcpit2 mm0,mm2 ;# mm0=invsqrt
13608 pfmul mm4, mm0
13609 movq mm1, mm4
13610 ;# mm0 is invsqrt, and mm1 r.
13612 ;# calculate potentials and scalar force
13613 pfmul mm1, [esp + i3310_tsc] ;# mm1=rt
13614 pf2iw mm4,mm1
13615 movd [esp + i3310_n1], mm4
13616 pi2fd mm4,mm4
13617 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
13619 movq mm2,mm1
13620 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
13622 ;# coulomb table
13623 mov edx, [ebp + i3310_VFtab]
13624 mov ecx, [esp + i3310_n1]
13625 lea ecx, [ecx + ecx*2]
13626 shl ecx, 2
13627 ;# load all the table values we need
13628 movd mm4, [edx + ecx*4]
13629 movd mm5, [edx + ecx*4 + 4]
13630 movd mm6, [edx + ecx*4 + 8]
13631 movd mm7, [edx + ecx*4 + 12]
13633 pfmul mm6, mm1 ;# mm6 = Geps
13634 pfmul mm7, mm2 ;# mm7 = Heps2
13636 pfadd mm5, mm6
13637 pfadd mm5, mm7 ;# mm5 = Fp
13639 pfmul mm7, [esp + i3310_two] ;# two*Heps2
13640 pfadd mm7, mm6
13641 pfadd mm7, mm5 ;# mm7=FF
13643 pfmul mm5, mm1 ;# mm5=eps*Fp
13644 pfadd mm5, mm4 ;# mm5= VV
13646 pfmul mm5, mm3 ;# vcoul=qq*VV
13647 pfmul mm3, mm7 ;# fijC=FF*qq
13649 ;# at this point mm5 contains vcoul and mm3 fijC
13650 ;# increment vcoul - then we can get rid of mm5
13651 ;# update vctot
13652 pfadd mm5, [esp + i3310_vctot] ;# add the earlier value
13653 movq [esp + i3310_vctot], mm5 ;# store the sum
13655 ;# dispersion table (ecx still holds the index computed for the coulomb table)
13656 ;# load all the table values we need
13657 movd mm4, [edx + ecx*4 + 16]
13658 movd mm5, [edx + ecx*4 + 20]
13659 movd mm6, [edx + ecx*4 + 24]
13660 movd mm7, [edx + ecx*4 + 28]
13661 pfmul mm6, mm1 ;# mm6 = Geps
13662 pfmul mm7, mm2 ;# mm7 = Heps2
13663 pfadd mm5, mm6
13664 pfadd mm5, mm7 ;# mm5 = Fp
13665 pfmul mm7, [esp + i3310_two] ;# two*Heps2
13666 pfadd mm7, mm6
13667 pfadd mm7, mm5 ;# mm7=FF
13668 pfmul mm5, mm1 ;# mm5=eps*Fp
13669 pfadd mm5, mm4 ;# mm5= VV
13671 movq mm4, [esp + i3310_c6]
13672 pfmul mm7, mm4 ;# fijD
13673 pfmul mm5, mm4 ;# vnb6
13674 pfadd mm3, mm7 ;# add to fscal
13676 ;# update vnbtot to release mm5!
13677 pfadd mm5, [esp + i3310_vnbtot] ;# add the earlier value
13678 movq [esp + i3310_vnbtot], mm5 ;# store the sum
13680 ;# repulsion table
13681 ;# load all the table values we need
13682 movd mm4, [edx + ecx*4 + 32]
13683 movd mm5, [edx + ecx*4 + 36]
13684 movd mm6, [edx + ecx*4 + 40]
13685 movd mm7, [edx + ecx*4 + 44]
13687 pfmul mm6, mm1 ;# mm6 = Geps
13688 pfmul mm7, mm2 ;# mm7 = Heps2
13689 pfadd mm5, mm6
13690 pfadd mm5, mm7 ;# mm5 = Fp
13691 pfmul mm7, [esp + i3310_two] ;# two*Heps2
13692 pfadd mm7, mm6
13693 pfadd mm7, mm5 ;# mm7=FF
13694 pfmul mm5, mm1 ;# mm5=eps*Fp
13695 pfadd mm5, mm4 ;# mm5= VV
13697 movq mm6, [esp + i3310_c12]
13698 pfmul mm7, mm6 ;# fijR
13699 pfmul mm5, mm6 ;# vnb12
13700 pfadd mm3, mm7 ;# total fscal fijC+ fijD+ fijR
13702 ;# change sign of mm3
13703 pxor mm1,mm1
13704 pfsub mm1, mm3
13705 pfmul mm0, [esp + i3310_tsc]
13706 pfmul mm0, mm1 ;# mm0 is total fscal now
13708 ;# update vnbtot
13709 pfadd mm5, [esp + i3310_vnbtot] ;# add the earlier value
13710 movq [esp + i3310_vnbtot], mm5 ;# store the sum
13712 ;# spread fscalar to both positions
13713 punpckldq mm0,mm0
13714 ;# calc vectorial force
13715 prefetchw [edi + eax*4] ;# prefetch faction to cache
13716 movq mm2, [esp + i3310_dx1]
13717 movd mm3, [esp + i3310_dz1]
13720 pfmul mm2, mm0
13721 pfmul mm3, mm0
13723 ;# update i particle force
13724 movq mm0, [esp + i3310_fix]
13725 movd mm1, [esp + i3310_fiz]
13726 pfadd mm0, mm2
13727 pfadd mm1, mm3
13728 movq [esp + i3310_fix], mm0
13729 movd [esp + i3310_fiz], mm1
13730 ;# update j particle force
13731 movq mm0, [edi + eax*4]
13732 movd mm1, [edi + eax *4+ 8]
13733 pfsub mm0, mm2
13734 pfsub mm1, mm3
13735 movq [edi + eax*4], mm0
13736 movd [edi + eax*4 +8], mm1
13737 ;# done!
;# End of the j list for this atom: add the accumulated i force to
;# faction[ii3] and fshift[is3], then repeat for the next atom until the
;# nsvdwc counter reaches zero. The potentials are not written out here.
13738 .i3310_updateouterdata_vdwc:
13739 mov ecx, [esp + i3310_ii3]
13741 movq mm6, [edi + ecx*4] ;# increment i force
13742 movd mm7, [edi + ecx*4 + 8]
13743 pfadd mm6, [esp + i3310_fix]
;# 64-bit memory operand: only the low half (fiz) is stored by the movd below
13744 pfadd mm7, [esp + i3310_fiz]
13745 movq [edi + ecx*4], mm6
13746 movd [edi + ecx*4 +8], mm7
13748 mov ebx, [ebp + i3310_fshift] ;# increment fshift force
13749 mov edx, [esp + i3310_is3]
13751 movq mm6, [ebx + edx*4]
13752 movd mm7, [ebx + edx*4 + 8]
13753 pfadd mm6, [esp + i3310_fix]
13754 pfadd mm7, [esp + i3310_fiz]
13755 movq [ebx + edx*4], mm6
13756 movd [ebx + edx*4 + 8], mm7
13758 ;# loop back to mno
13759 dec dword ptr [esp + i3310_nsvdwc]
13760 jz .i3310_testcoul
13761 jmp .i3310_mno_vdwc
;# Run the coulomb-only atom loop if nscoul is non-zero, otherwise go on to
;# the vdw test.
13762 .i3310_testcoul:
13763 mov ecx, [esp + i3310_nscoul]
13764 cmp ecx, 0
13765 jnz .i3310_mno_coul
13766 jmp .i3310_testvdw
;# Per-atom setup for the coulomb-only loop: same as the vdw+coulomb setup
;# except that no type/ntia lookup is done here.
13767 .i3310_mno_coul:
13768 mov ebx, [esp + i3310_solnr]
13769 inc dword ptr [esp + i3310_solnr]
13770 mov edx, [ebp + i3310_charge]
13771 movd mm2, [edx + ebx*4] ;# mm2=charge[ii]
;# 64-bit memory operand: only the low half (facel) is kept by the unpack below
13772 pfmul mm2, [ebp + i3310_facel]
13773 punpckldq mm2,mm2 ;# spread to both halves
13774 movq [esp + i3310_iq], mm2 ;# iq =facel*charge[ii]
13776 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
13777 mov eax, [ebp + i3310_pos] ;# eax = base of pos[]
13778 mov [esp + i3310_ii3], ebx
;# i coordinates = pos[ii3] + shift vector
13780 movq mm0, [eax + ebx*4]
13781 movd mm1, [eax + ebx*4 + 8]
13782 pfadd mm0, [esp + i3310_shX]
13783 pfadd mm1, [esp + i3310_shZ]
13784 movq [esp + i3310_ix], mm0
13785 movd [esp + i3310_iz], mm1
13787 ;# clear forces
13788 pxor mm7,mm7
13789 movq [esp + i3310_fix], mm7
13790 movd [esp + i3310_fiz], mm7
;# restart the j list from its saved beginning for this atom
13792 mov ecx, [esp + i3310_innerjjnr0]
13793 mov [esp + i3310_innerjjnr], ecx
13794 mov edx, [esp + i3310_innerk0]
13795 sub edx, 2
13796 mov [esp + i3310_innerk], edx ;# innerk = number of innerloop atoms - 2
;# jge tests the flags of the sub above; the mov in between leaves them intact
13797 jge .i3310_unroll_coul_loop
13798 jmp .i3310_finish_coul_inner
;# Unrolled Coulomb-only inner loop: two j atoms per pass, one in each half of
;# the MMX registers. Per pair it computes rsq, 1/r and r, forms rt = r*tsc,
;# splits it into table index n0 and fraction eps, reads four floats from VFtab
;# at 12 floats per table point, evaluates VV and FF, adds qq*VV to vctot and
;# applies the resulting force to the i accumulators and to both j atoms.
13799 .i3310_unroll_coul_loop:
13800 ;# paired innerloop starts here
13801 mov ecx, [esp + i3310_innerjjnr] ;# pointer to jjnr[k]
13802 mov eax, [ecx]
13803 mov ebx, [ecx + 4] ;# eax/ebx=jnr
13804 add dword ptr [esp + i3310_innerjjnr], 8 ;# advance pointer (unrolled 2)
13805 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
13807 mov ecx, [ebp + i3310_charge] ;# base of charge[]
13808 movq mm5, [esp + i3310_iq]
13809 movd mm3, [ecx + eax*4] ;# charge[jnr1]
13810 punpckldq mm3, [ecx + ebx*4] ;# move charge 2 to high part of mm3
13811 pfmul mm3,mm5 ;# mm3 now has qq for both particles
13813 lea eax, [eax + eax*2] ;# replace jnr with j3
13814 lea ebx, [ebx + ebx*2]
13816 mov esi, [ebp + i3310_pos]
13818 movq mm0, [esp + i3310_ix]
13819 movd mm1, [esp + i3310_iz]
13820 movq mm4, [esi + eax*4] ;# fetch first j coordinates
13821 movd mm5, [esi + eax*4 + 8]
13822 pfsubr mm4,mm0 ;# dr = ir - jr
13823 pfsubr mm5,mm1
13824 movq [esp + i3310_dx1], mm4 ;# store dr
13825 movd [esp + i3310_dz1], mm5
13826 pfmul mm4,mm4 ;# square dx,dy,dz
13827 pfmul mm5,mm5
13828 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
13829 pfacc mm4, mm5 ;# first rsq in lower mm4
13831 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
13832 movd mm7, [esi + ebx*4 + 8]
13834 pfsubr mm6,mm0 ;# dr = ir - jr
13835 pfsubr mm7,mm1
13836 movq [esp + i3310_dx2], mm6 ;# store dr
13837 movd [esp + i3310_dz2], mm7
13838 pfmul mm6,mm6 ;# square dx,dy,dz
13839 pfmul mm7,mm7
13840 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
13841 pfacc mm6, mm7 ;# second rsq in lower mm6
13843 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
13844 pfrsqrt mm1, mm6
13847 punpckldq mm0,mm1
13848 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs.
13849 movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision.
13850 pfmul mm0,mm0
13851 pfrsqit1 mm0,mm4
13852 pfrcpit2 mm0,mm2
13853 pfmul mm4, mm0 ;# rsq * invsqrt = r
13854 movq mm1, mm4
13855 ;# mm0 is invsqrt, and mm1 r.
13856 ;# do potential and fscal
13857 pfmul mm1, [esp + i3310_tsc] ;# mm1=rt
13858 pf2iw mm4,mm1 ;# integer part of rt for both pairs
13859 movq [esp + i3310_n1], mm4 ;# keep both indices on the stack for the integer unit
13860 pi2fd mm4,mm4
13861 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
13863 movq mm2,mm1
13864 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
13866 mov edx, [ebp + i3310_VFtab]
13867 mov ecx, [esp + i3310_n1] ;# index of the first pair
13868 lea ecx, [ecx + ecx*2]
13869 shl ecx, 2 ;# ecx = 12*n (12 floats per table point)
13870 ;# coulomb table
13871 ;# load all the table values we need
13872 movd mm4, [edx + ecx*4]
13873 movd mm5, [edx + ecx*4 + 4]
13874 movd mm6, [edx + ecx*4 + 8]
13875 movd mm7, [edx + ecx*4 + 12]
13876 mov ecx, [esp + i3310_n1 + 4] ;# index of the second pair
13877 lea ecx, [ecx + ecx*2]
13878 shl ecx, 2
13879 punpckldq mm4, [edx + ecx*4] ;# second pair's values go to the high halves
13880 punpckldq mm5, [edx + ecx*4 + 4]
13881 punpckldq mm6, [edx + ecx*4 + 8]
13882 punpckldq mm7, [edx + ecx*4 + 12]
13884 pfmul mm6, mm1 ;# mm6 = Geps
13885 pfmul mm7, mm2 ;# mm7 = Heps2
13887 pfadd mm5, mm6
13888 pfadd mm5, mm7 ;# mm5 = Fp
13890 pfmul mm7, [esp + i3310_two] ;# two*Heps2
13891 pfadd mm7, mm6
13892 pfadd mm7, mm5 ;# mm7=FF
13894 pfmul mm5, mm1 ;# mm5=eps*Fp
13895 pfadd mm5, mm4 ;# mm5= VV
13897 pfmul mm5, mm3 ;# vcoul=qq*VV
13898 pfmul mm3, mm7 ;# fijC=FF*qq
13900 ;# at this point mm5 contains vcoul and mm3 fijC
13901 ;# increment vcoul - then we can get rid of mm5
13902 ;# update vctot
13903 pfadd mm5, [esp + i3310_vctot] ;# add the earlier value
13904 movq [esp + i3310_vctot], mm5 ;# store the sum
13906 ;# change sign of mm3
13907 pxor mm1,mm1
13908 pfsub mm1, mm3 ;# mm1 = -fijC
13909 pfmul mm1, [esp + i3310_tsc]
13910 pfmul mm0, mm1 ;# mm0 is total fscal now
13912 prefetchw [esp + i3310_dx1] ;# prefetch-for-write of the stack slot at dx1 (original note: i forces)
13914 ;# spread fscalar to both positions
13915 movq mm1,mm0
13916 punpckldq mm0,mm0 ;# mm0 = fscal of the first pair in both halves
13917 punpckhdq mm1,mm1 ;# mm1 = fscal of the second pair in both halves
13919 ;# calc vector force
13920 prefetchw [edi + eax*4] ;# prefetch the 1st faction to cache
13921 movq mm2, [esp + i3310_dx1] ;# fetch dr
13922 movd mm3, [esp + i3310_dz1]
13924 prefetchw [edi + ebx*4] ;# prefetch the 2nd faction to cache
13925 pfmul mm2, mm0 ;# mult by fs
13926 pfmul mm3, mm0
13928 movq mm4, [esp + i3310_dx2] ;# fetch dr
13929 movd mm5, [esp + i3310_dz2]
13930 pfmul mm4, mm1 ;# mult by fs
13931 pfmul mm5, mm1
13932 ;# update i forces
13934 movq mm0, [esp + i3310_fix]
13935 movd mm1, [esp + i3310_fiz]
13936 pfadd mm0, mm2
13937 pfadd mm1, mm3
13939 pfadd mm0, mm4
13940 pfadd mm1, mm5
13941 movq [esp + i3310_fix], mm0
13942 movd [esp + i3310_fiz], mm1
13943 ;# update j forces
13945 movq mm0, [edi + eax*4]
13946 movd mm1, [edi + eax*4 + 8]
13947 movq mm6, [edi + ebx*4]
13948 movd mm7, [edi + ebx*4 + 8]
13950 pfsub mm0, mm2
13951 pfsub mm1, mm3
13952 pfsub mm6, mm4
13953 pfsub mm7, mm5
13955 movq [edi + eax*4], mm0
13956 movd [edi + eax*4 +8], mm1
13957 movq [edi + ebx*4], mm6
13958 movd [edi + ebx*4 + 8], mm7
13960 ;# should we do one more iteration?
13961 sub dword ptr [esp + i3310_innerk], 2
13962 jl .i3310_finish_coul_inner ;# fewer than two j atoms left
13963 jmp .i3310_unroll_coul_loop
;# Tail of the Coulomb-only inner loop. innerk is negative when control gets
;# here (both entry paths branch on a negative result of the subtraction by 2);
;# its lowest bit tells whether one unpaired j atom is still left to process.
13964 .i3310_finish_coul_inner:
13965 and dword ptr [esp + i3310_innerk], 1
13966 jnz .i3310_single_coul_inner
13967 jmp .i3310_updateouterdata_coul
13968 .i3310_single_coul_inner:
13969 ;# a single j particle iteration here - compare with the unrolled code for comments.
13970 mov eax, [esp + i3310_innerjjnr]
13971 mov eax, [eax] ;# eax=jnr offset
13973 mov ecx, [ebp + i3310_charge]
13974 movd mm5, [esp + i3310_iq]
13975 movd mm3, [ecx + eax*4]
13976 pfmul mm3, mm5 ;# mm3=qq
13978 mov esi, [ebp + i3310_pos]
13979 lea eax, [eax + eax*2] ;# eax = j3
13981 movq mm0, [esp + i3310_ix]
13982 movd mm1, [esp + i3310_iz]
13983 movq mm4, [esi + eax*4]
13984 movd mm5, [esi + eax*4 + 8]
13985 pfsubr mm4, mm0 ;# dr = ir - jr
13986 pfsubr mm5, mm1
13987 movq [esp + i3310_dx1], mm4
13988 pfmul mm4,mm4
13989 movd [esp + i3310_dz1], mm5
13990 pfmul mm5,mm5
13991 pfacc mm4, mm5
13992 pfacc mm4, mm5 ;# rsq in lower mm4
13994 pfrsqrt mm0,mm4
13995 movq mm2,mm0
13996 pfmul mm0,mm0
13997 pfrsqit1 mm0,mm4
13998 pfrcpit2 mm0,mm2 ;# mm0=invsqrt
13999 pfmul mm4, mm0
14000 movq mm1, mm4
14001 ;# mm0 is invsqrt, and mm1 r.
14003 ;# calculate potentials and scalar force
14004 pfmul mm1, [esp + i3310_tsc] ;# mm1=rt
14005 pf2iw mm4,mm1
14006 movd [esp + i3310_n1], mm4
14007 pi2fd mm4,mm4
14008 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
14010 movq mm2,mm1
14011 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
14013 ;# coulomb table
14014 mov edx, [ebp + i3310_VFtab]
14015 mov ecx, [esp + i3310_n1]
14016 lea ecx, [ecx + ecx*2]
14017 shl ecx, 2 ;# ecx = 12*n (12 floats per table point)
14018 ;# load all the table values we need
14019 movd mm4, [edx + ecx*4]
14020 movd mm5, [edx + ecx*4 + 4]
14021 movd mm6, [edx + ecx*4 + 8]
14022 movd mm7, [edx + ecx*4 + 12]
14024 pfmul mm6, mm1 ;# mm6 = Geps
14025 pfmul mm7, mm2 ;# mm7 = Heps2
14027 pfadd mm5, mm6
14028 pfadd mm5, mm7 ;# mm5 = Fp
14030 pfmul mm7, [esp + i3310_two] ;# two*Heps2
14031 pfadd mm7, mm6
14032 pfadd mm7, mm5 ;# mm7=FF
14034 pfmul mm5, mm1 ;# mm5=eps*Fp
14035 pfadd mm5, mm4 ;# mm5= VV
14037 pfmul mm5, mm3 ;# vcoul=qq*VV
14038 pfmul mm3, mm7 ;# fijC=FF*qq
14040 ;# at this point mm5 contains vcoul and mm3 fijC
14041 ;# increment vcoul - then we can get rid of mm5
14042 ;# update vctot
14043 pfadd mm5, [esp + i3310_vctot] ;# add the earlier value
14044 movq [esp + i3310_vctot], mm5 ;# store the sum
14046 ;# change sign of mm3
14047 pxor mm1,mm1
14048 pfsub mm1, mm3 ;# mm1 = -fijC
14049 pfmul mm0, [esp + i3310_tsc]
14050 pfmul mm0, mm1 ;# mm0 is total fscal now
14052 ;# spread fscalar to both positions
14053 punpckldq mm0,mm0
14054 ;# calc vectorial force
14055 prefetchw [edi + eax*4] ;# prefetch faction to cache
14056 movq mm2, [esp + i3310_dx1]
14057 movd mm3, [esp + i3310_dz1]
14060 pfmul mm2, mm0
14061 pfmul mm3, mm0
14063 ;# update i particle force
14064 movq mm0, [esp + i3310_fix]
14065 movd mm1, [esp + i3310_fiz]
14066 pfadd mm0, mm2
14067 pfadd mm1, mm3
14068 movq [esp + i3310_fix], mm0
14069 movd [esp + i3310_fiz], mm1
14070 ;# update j particle force
14071 movq mm0, [edi + eax*4]
14072 movd mm1, [edi + eax *4+ 8]
14073 pfsub mm0, mm2
14074 pfsub mm1, mm3
14075 movq [edi + eax*4], mm0
14076 movd [edi + eax*4 +8], mm1
14077 ;# done!
;# Outer-loop bookkeeping for one coulomb-only i atom: add the accumulated
;# i force (fix/fiy/fiz stack slots) to the force array addressed through edi
;# at index ii3 and to fshift[] at index is3, then decrement nscoul and either
;# set up the next coulomb-only atom or continue with the vdw-only test.
14078 .i3310_updateouterdata_coul:
14079 mov ecx, [esp + i3310_ii3] ;# ecx = ii3
14081 movq mm6, [edi + ecx*4] ;# increment i force (x,y in one quadword)
14082 movd mm7, [edi + ecx*4 + 8] ;# z component
14083 pfadd mm6, [esp + i3310_fix]
14084 pfadd mm7, [esp + i3310_fiz] ;# 64-bit operand; only the low dword of mm7 is stored below
14085 movq [edi + ecx*4], mm6
14086 movd [edi + ecx*4 +8], mm7
14088 mov ebx, [ebp + i3310_fshift] ;# increment fshift force
14089 mov edx, [esp + i3310_is3] ;# edx = is3
14091 movq mm6, [ebx + edx*4]
14092 movd mm7, [ebx + edx*4 + 8]
14093 pfadd mm6, [esp + i3310_fix]
14094 pfadd mm7, [esp + i3310_fiz]
14095 movq [ebx + edx*4], mm6
14096 movd [ebx + edx*4 + 8], mm7
14098 ;# loop back to mno
14099 dec dword ptr [esp + i3310_nscoul] ;# one coulomb-only atom done
14100 jz .i3310_testvdw ;# counter reached zero - continue with the vdw-only group
14101 jmp .i3310_mno_coul
;# Vdw-only i atoms: the section below runs once per atom while nsvdw is nonzero.
14102 .i3310_testvdw:
14103 mov ecx, [esp + i3310_nsvdw]
14104 cmp ecx, 0
14105 jnz .i3310_mno_vdw
14106 jmp .i3310_last_mno ;# nsvdw is zero - finish this outer iteration
;# Set up one vdw-only i atom: ntia (row offset into nbfp), shifted coordinates,
;# zeroed force accumulators, and the j-list pointer/counter rewound to their
;# saved start values.
14107 .i3310_mno_vdw:
14108 mov ebx, [esp + i3310_solnr] ;# ebx = ii (current atom index)
14109 inc dword ptr [esp + i3310_solnr] ;# the next pass takes the following atom
14111 mov edx, [ebp + i3310_type]
14112 mov edx, [edx + ebx*4] ;# edx = type[ii]
14113 imul edx, [ebp + i3310_ntype]
14114 shl edx, 1
14115 mov [esp + i3310_ntia], edx ;# ntia = 2*ntype*type[ii]
14117 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
14118 mov eax, [ebp + i3310_pos] ;# eax = base of pos[]
14119 mov [esp + i3310_ii3], ebx
14121 movq mm0, [eax + ebx*4] ;# x,y of the i atom
14122 movd mm1, [eax + ebx*4 + 8] ;# z of the i atom
14123 pfadd mm0, [esp + i3310_shX] ;# add the shift vector
14124 pfadd mm1, [esp + i3310_shZ]
14125 movq [esp + i3310_ix], mm0
14126 movd [esp + i3310_iz], mm1
14128 ;# clear forces
14129 pxor mm7,mm7
14130 movq [esp + i3310_fix], mm7
14131 movd [esp + i3310_fiz], mm7
14133 mov ecx, [esp + i3310_innerjjnr0] ;# restart the j list from its saved beginning
14134 mov [esp + i3310_innerjjnr], ecx
14135 mov edx, [esp + i3310_innerk0]
14136 sub edx, 2
14137 mov [esp + i3310_innerk], edx ;# innerk = innerk0 - 2 (mov leaves the flags from sub intact)
14138 jge .i3310_unroll_vdw_loop ;# at least two j atoms - use the paired loop
14139 jmp .i3310_finish_vdw_inner
;# Unrolled vdw-only inner loop: two j atoms per pass, one in each half of the
;# MMX registers. Per pair it fetches c6/c12 from nbfp, computes rsq, 1/r and r,
;# forms rt = r*tsc, splits it into table index n0 and fraction eps, evaluates
;# the tabulated dispersion and repulsion terms, adds both energies to vnbtot
;# and applies the resulting force to the i accumulators and to both j atoms.
;#
;# Table layout: VFtab is indexed with a stride of 12 floats per point. The
;# Coulomb-only loop of this routine reads its four values at byte offsets
;# 0..12 of a point, so the dispersion values are taken from +16..+28 and the
;# repulsion values from +32..+44. (This section previously read +0..+12 and
;# +16..+28, i.e. the Coulomb and dispersion entries.)
14140 .i3310_unroll_vdw_loop:
14141 ;# paired innerloop starts here
14142 mov ecx, [esp + i3310_innerjjnr] ;# pointer to jjnr[k]
14143 mov eax, [ecx]
14144 mov ebx, [ecx + 4] ;# eax/ebx=jnr
14145 add dword ptr [esp + i3310_innerjjnr], 8 ;# advance pointer (unrolled 2)
14146 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
14148 mov ecx, [ebp + i3310_type]
14149 mov edx, [ecx + eax*4] ;# type [jnr1]
14150 mov ecx, [ecx + ebx*4] ;# type [jnr2]
14152 mov esi, [ebp + i3310_nbfp] ;# base of nbfp
14153 shl edx, 1
14154 shl ecx, 1
14155 add edx, [esp + i3310_ntia] ;# tja = ntia + 2*type
14156 add ecx, [esp + i3310_ntia]
14158 movq mm5, [esi + edx*4] ;# mm5 = 1st c6 / c12
14159 movq mm7, [esi + ecx*4] ;# mm7 = 2nd c6 / c12
14160 movq mm6, mm5
14161 punpckldq mm5, mm7 ;# mm5 = 1st c6 / 2nd c6
14162 punpckhdq mm6, mm7 ;# mm6 = 1st c12 / 2nd c12
14163 movq [esp + i3310_c6], mm5
14164 movq [esp + i3310_c12], mm6
14166 lea eax, [eax + eax*2] ;# replace jnr with j3
14167 lea ebx, [ebx + ebx*2]
14169 mov esi, [ebp + i3310_pos]
14171 movq mm0, [esp + i3310_ix]
14172 movd mm1, [esp + i3310_iz]
14173 movq mm4, [esi + eax*4] ;# fetch first j coordinates
14174 movd mm5, [esi + eax*4 + 8]
14175 pfsubr mm4,mm0 ;# dr = ir - jr
14176 pfsubr mm5,mm1
14177 movq [esp + i3310_dx1], mm4 ;# store dr
14178 movd [esp + i3310_dz1], mm5
14179 pfmul mm4,mm4 ;# square dx,dy,dz
14180 pfmul mm5,mm5
14181 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
14182 pfacc mm4, mm5 ;# first rsq in lower mm4
14184 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
14185 movd mm7, [esi + ebx*4 + 8]
14187 pfsubr mm6, mm0 ;# dr = ir - jr
14188 pfsubr mm7, mm1
14189 movq [esp + i3310_dx2], mm6 ;# store dr
14190 movd [esp + i3310_dz2], mm7
14191 pfmul mm6, mm6 ;# square dx,dy,dz
14192 pfmul mm7, mm7
14193 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
14194 pfacc mm6, mm7 ;# second rsq in lower mm6
14196 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
14197 pfrsqrt mm1, mm6
14200 punpckldq mm0, mm1
14201 punpckldq mm4, mm6 ;# now 4 has rsq and 0 the seed for both pairs.
14202 movq mm2, mm0 ;# amd 3dnow N-R iteration to get full precision.
14203 pfmul mm0, mm0
14204 pfrsqit1 mm0, mm4
14205 pfrcpit2 mm0, mm2
14206 pfmul mm4, mm0 ;# rsq * invsqrt = r
14207 movq mm1, mm4
14208 ;# mm0 is invsqrt, and mm1 r.
14209 ;# do potential and fscal
14210 pfmul mm1, [esp + i3310_tsc] ;# mm1=rt
14211 pf2iw mm4, mm1 ;# integer part of rt for both pairs
14212 movq [esp + i3310_n1], mm4 ;# keep both indices on the stack for the integer unit
14213 pi2fd mm4, mm4
14214 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
14216 movq mm2, mm1
14217 pfmul mm2, mm2 ;# mm1 is eps, mm2 is eps2
14219 mov edx, [ebp + i3310_VFtab]
14220 ;# dispersion table (byte offsets 16..28 within a 12-float table point)
14221 mov ecx, [esp + i3310_n1] ;# index of the first pair
14222 lea ecx, [ecx + ecx*2]
14223 shl ecx, 2 ;# ecx = 12*n
14224 ;# load all the table values we need
14225 movd mm4, [edx + ecx*4 + 16]
14226 movd mm5, [edx + ecx*4 + 20]
14227 movd mm6, [edx + ecx*4 + 24]
14228 movd mm7, [edx + ecx*4 + 28]
14229 mov ecx, [esp + i3310_n1 + 4] ;# index of the second pair
14230 lea ecx, [ecx + ecx*2]
14231 shl ecx, 2
14232 punpckldq mm4, [edx + ecx*4 + 16] ;# second pair's values go to the high halves
14233 punpckldq mm5, [edx + ecx*4 + 20]
14234 punpckldq mm6, [edx + ecx*4 + 24]
14235 punpckldq mm7, [edx + ecx*4 + 28]
14236 pfmul mm6, mm1 ;# mm6 = Geps
14237 pfmul mm7, mm2 ;# mm7 = Heps2
14238 pfadd mm5, mm6
14239 pfadd mm5, mm7 ;# mm5 = Fp
14240 pfmul mm7, [esp + i3310_two] ;# two*Heps2
14241 pfadd mm7, mm6
14242 pfadd mm7, mm5 ;# mm7=FF
14243 pfmul mm5, mm1 ;# mm5=eps*Fp
14244 pfadd mm5, mm4 ;# mm5= VV
14246 movq mm4, [esp + i3310_c6]
14247 pfmul mm7, mm4 ;# fijD
14248 pfmul mm5, mm4 ;# vnb6
14249 movq mm3, mm7 ;# fscal starts with fijD
14251 ;# update vnbtot to release mm5!
14252 pfadd mm5, [esp + i3310_vnbtot] ;# add the earlier value
14253 movq [esp + i3310_vnbtot], mm5 ;# store the sum
14255 ;# repulsion table (byte offsets 32..44 within a 12-float table point)
14256 mov ecx, [esp + i3310_n1] ;# index of the first pair again
14257 lea ecx, [ecx + ecx*2]
14258 shl ecx, 2
14259 ;# load all the table values we need
14260 movd mm4, [edx + ecx*4 + 32]
14261 movd mm5, [edx + ecx*4 + 36]
14262 movd mm6, [edx + ecx*4 + 40]
14263 movd mm7, [edx + ecx*4 + 44]
14264 mov ecx, [esp + i3310_n1 + 4] ;# index of the second pair
14265 lea ecx, [ecx + ecx*2]
14266 shl ecx, 2
14267 punpckldq mm4, [edx + ecx*4 + 32]
14268 punpckldq mm5, [edx + ecx*4 + 36]
14269 punpckldq mm6, [edx + ecx*4 + 40]
14270 punpckldq mm7, [edx + ecx*4 + 44]
14272 pfmul mm6, mm1 ;# mm6 = Geps
14273 pfmul mm7, mm2 ;# mm7 = Heps2
14274 pfadd mm5, mm6
14275 pfadd mm5, mm7 ;# mm5 = Fp
14276 pfmul mm7, [esp + i3310_two] ;# two*Heps2
14277 pfadd mm7, mm6
14278 pfadd mm7, mm5 ;# mm7=FF
14279 pfmul mm5, mm1 ;# mm5=eps*Fp
14280 pfadd mm5, mm4 ;# mm5= VV
14282 movq mm6, [esp + i3310_c12]
14283 pfmul mm7, mm6 ;# fijR
14284 pfmul mm5, mm6 ;# vnb12
14285 pfadd mm3, mm7 ;# total fscal fijD+ fijR
14287 ;# change sign of mm3
14288 pxor mm1,mm1
14289 pfsub mm1, mm3 ;# mm1 = -(fijD+fijR)
14290 pfmul mm1, [esp + i3310_tsc]
14291 pfmul mm0, mm1 ;# mm0 is total fscal now
14293 prefetchw [esp + i3310_dx1] ;# prefetch-for-write of the stack slot at dx1 (original note: i forces)
14295 ;# spread fscalar to both positions
14296 movq mm1,mm0
14297 punpckldq mm0,mm0 ;# mm0 = fscal of the first pair in both halves
14298 punpckhdq mm1,mm1 ;# mm1 = fscal of the second pair in both halves
14300 ;# calc vector force
14301 prefetchw [edi + eax*4] ;# prefetch the 1st faction to cache
14302 movq mm2, [esp + i3310_dx1] ;# fetch dr
14303 movd mm3, [esp + i3310_dz1]
14305 ;# update vnbtot
14306 pfadd mm5, [esp + i3310_vnbtot] ;# add the earlier value
14307 movq [esp + i3310_vnbtot], mm5 ;# store the sum
14309 prefetchw [edi + ebx*4] ;# prefetch the 2nd faction to cache
14310 pfmul mm2, mm0 ;# mult by fs
14311 pfmul mm3, mm0
14313 movq mm4, [esp + i3310_dx2] ;# fetch dr
14314 movd mm5, [esp + i3310_dz2]
14315 pfmul mm4, mm1 ;# mult by fs
14316 pfmul mm5, mm1
14317 ;# update i forces
14319 movq mm0, [esp + i3310_fix]
14320 movd mm1, [esp + i3310_fiz]
14321 pfadd mm0, mm2
14322 pfadd mm1, mm3
14324 pfadd mm0, mm4
14325 pfadd mm1, mm5
14326 movq [esp + i3310_fix], mm0
14327 movd [esp + i3310_fiz], mm1
14328 ;# update j forces
14330 movq mm0, [edi + eax*4]
14331 movd mm1, [edi + eax*4 + 8]
14332 movq mm6, [edi + ebx*4]
14333 movd mm7, [edi + ebx*4 + 8]
14335 pfsub mm0, mm2
14336 pfsub mm1, mm3
14337 pfsub mm6, mm4
14338 pfsub mm7, mm5
14340 movq [edi + eax*4], mm0
14341 movd [edi + eax*4 +8], mm1
14342 movq [edi + ebx*4], mm6
14343 movd [edi + ebx*4 + 8], mm7
14345 ;# should we do one more iteration?
14346 sub dword ptr [esp + i3310_innerk], 2
14347 jl .i3310_finish_vdw_inner ;# fewer than two j atoms left
14348 jmp .i3310_unroll_vdw_loop
;# Tail of the vdw-only inner loop. innerk is negative when control gets here
;# (both entry paths branch on a negative result of the subtraction by 2); its
;# lowest bit tells whether one unpaired j atom is still left to process.
;#
;# Table layout: VFtab is indexed with a stride of 12 floats per point. The
;# Coulomb-only loop of this routine reads its four values at byte offsets
;# 0..12 of a point, so the dispersion values are taken from +16..+28 and the
;# repulsion values from +32..+44. (This section previously read +0..+12 and
;# +16..+28, i.e. the Coulomb and dispersion entries.)
14349 .i3310_finish_vdw_inner:
14350 and dword ptr [esp + i3310_innerk], 1
14351 jnz .i3310_single_vdw_inner
14352 jmp .i3310_updateouterdata_vdw
14353 .i3310_single_vdw_inner:
14354 ;# a single j particle iteration here - compare with the unrolled code for comments.
14355 mov eax, [esp + i3310_innerjjnr]
14356 mov eax, [eax] ;# eax=jnr offset
14358 mov esi, [ebp + i3310_nbfp]
14359 mov ecx, [ebp + i3310_type]
14360 mov edx, [ecx + eax*4] ;# type [jnr1]
14361 shl edx, 1
14362 add edx, [esp + i3310_ntia] ;# tja = ntia + 2*type
14363 movd mm5, [esi + edx*4] ;# mm5 = 1st c6
14364 movq [esp + i3310_c6], mm5
14365 movd mm5, [esi + edx*4 + 4] ;# mm5 = 1st c12
14366 movq [esp + i3310_c12], mm5
14368 mov esi, [ebp + i3310_pos]
14369 lea eax, [eax + eax*2] ;# eax = j3
14371 movq mm0, [esp + i3310_ix]
14372 movd mm1, [esp + i3310_iz]
14373 movq mm4, [esi + eax*4]
14374 movd mm5, [esi + eax*4 + 8]
14375 pfsubr mm4, mm0 ;# dr = ir - jr
14376 pfsubr mm5, mm1
14377 movq [esp + i3310_dx1], mm4
14378 pfmul mm4,mm4
14379 movd [esp + i3310_dz1], mm5
14380 pfmul mm5,mm5
14381 pfacc mm4, mm5
14382 pfacc mm4, mm5 ;# rsq in lower mm4
14384 pfrsqrt mm0,mm4
14385 movq mm2,mm0
14386 pfmul mm0,mm0
14387 pfrsqit1 mm0,mm4
14388 pfrcpit2 mm0,mm2 ;# mm0=invsqrt
14389 pfmul mm4, mm0
14390 movq mm1, mm4
14391 ;# mm0 is invsqrt, and mm1 r.
14393 ;# calculate potentials and scalar force
14394 pfmul mm1, [esp + i3310_tsc] ;# mm1=rt
14395 pf2iw mm4,mm1
14396 movd [esp + i3310_n1], mm4
14397 pi2fd mm4,mm4
14398 pfsub mm1, mm4 ;# now mm1 is eps and mm4 n0.
14400 movq mm2,mm1
14401 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
14403 mov edx, [ebp + i3310_VFtab]
14404 mov ecx, [esp + i3310_n1]
14405 lea ecx, [ecx + ecx*2]
14406 shl ecx, 2 ;# ecx = 12*n (12 floats per table point)
14407 ;# dispersion table (byte offsets 16..28 within a table point)
14408 ;# load all the table values we need
14410 movd mm4, [edx + ecx*4 + 16]
14411 movd mm5, [edx + ecx*4 + 20]
14412 movd mm6, [edx + ecx*4 + 24]
14413 movd mm7, [edx + ecx*4 + 28]
14414 pfmul mm6, mm1 ;# mm6 = Geps
14415 pfmul mm7, mm2 ;# mm7 = Heps2
14416 pfadd mm5, mm6
14417 pfadd mm5, mm7 ;# mm5 = Fp
14418 pfmul mm7, [esp + i3310_two] ;# two*Heps2
14419 pfadd mm7, mm6
14420 pfadd mm7, mm5 ;# mm7=FF
14421 pfmul mm5, mm1 ;# mm5=eps*Fp
14422 pfadd mm5, mm4 ;# mm5= VV
14424 movq mm4, [esp + i3310_c6]
14425 pfmul mm7, mm4 ;# fijD
14426 pfmul mm5, mm4 ;# vnb6
14427 movq mm3, mm7 ;# fscal starts with fijD
14429 ;# update vnbtot to release mm5!
14430 pfadd mm5, [esp + i3310_vnbtot] ;# add the earlier value
14431 movq [esp + i3310_vnbtot], mm5 ;# store the sum
14433 ;# repulsion table (byte offsets 32..44 within a table point)
14434 ;# load all the table values we need
14436 movd mm4, [edx + ecx*4 + 32]
14437 movd mm5, [edx + ecx*4 + 36]
14438 movd mm6, [edx + ecx*4 + 40]
14439 movd mm7, [edx + ecx*4 + 44]
14441 pfmul mm6, mm1 ;# mm6 = Geps
14442 pfmul mm7, mm2 ;# mm7 = Heps2
14443 pfadd mm5, mm6
14444 pfadd mm5, mm7 ;# mm5 = Fp
14445 pfmul mm7, [esp + i3310_two] ;# two*Heps2
14446 pfadd mm7, mm6
14447 pfadd mm7, mm5 ;# mm7=FF
14448 pfmul mm5, mm1 ;# mm5=eps*Fp
14449 pfadd mm5, mm4 ;# mm5= VV
14451 movq mm6, [esp + i3310_c12]
14452 pfmul mm7, mm6 ;# fijR
14453 pfmul mm5, mm6 ;# vnb12
14454 pfadd mm3, mm7 ;# total fscal fijD+ fijR
14456 ;# change sign of mm3
14457 pxor mm1,mm1
14458 pfsub mm1, mm3 ;# mm1 = -(fijD+fijR)
14459 pfmul mm0, [esp + i3310_tsc]
14460 pfmul mm0, mm1 ;# mm0 is total fscal now
14462 ;# update vnbtot
14463 pfadd mm5, [esp + i3310_vnbtot] ;# add the earlier value
14464 movq [esp + i3310_vnbtot], mm5 ;# store the sum
14466 ;# spread fscalar to both positions
14467 punpckldq mm0,mm0
14468 ;# calc vectorial force
14469 prefetchw [edi + eax*4] ;# prefetch faction to cache
14470 movq mm2, [esp + i3310_dx1]
14471 movd mm3, [esp + i3310_dz1]
14473 pfmul mm2, mm0
14474 pfmul mm3, mm0
14476 ;# update i particle force
14477 movq mm0, [esp + i3310_fix]
14478 movd mm1, [esp + i3310_fiz]
14479 pfadd mm0, mm2
14480 pfadd mm1, mm3
14481 movq [esp + i3310_fix], mm0
14482 movd [esp + i3310_fiz], mm1
14483 ;# update j particle force
14484 movq mm0, [edi + eax*4]
14485 movd mm1, [edi + eax *4+ 8]
14486 pfsub mm0, mm2
14487 pfsub mm1, mm3
14488 movq [edi + eax*4], mm0
14489 movd [edi + eax*4 +8], mm1
14490 ;# done!
;# Outer-loop bookkeeping for one vdw-only i atom: add the accumulated i force
;# (fix/fiy/fiz stack slots) to the force array addressed through edi at index
;# ii3 and to fshift[] at index is3, then decrement nsvdw and either set up the
;# next vdw-only atom or finish this outer iteration.
14491 .i3310_updateouterdata_vdw:
14492 mov ecx, [esp + i3310_ii3] ;# ecx = ii3
14494 movq mm6, [edi + ecx*4] ;# increment i force (x,y in one quadword)
14495 movd mm7, [edi + ecx*4 + 8] ;# z component
14496 pfadd mm6, [esp + i3310_fix]
14497 pfadd mm7, [esp + i3310_fiz] ;# 64-bit operand; only the low dword of mm7 is stored below
14498 movq [edi + ecx*4], mm6
14499 movd [edi + ecx*4 +8], mm7
14501 mov ebx, [ebp + i3310_fshift] ;# increment fshift force
14502 mov edx, [esp + i3310_is3] ;# edx = is3
14504 movq mm6, [ebx + edx*4]
14505 movd mm7, [ebx + edx*4 + 8]
14506 pfadd mm6, [esp + i3310_fix]
14507 pfadd mm7, [esp + i3310_fiz]
14508 movq [ebx + edx*4], mm6
14509 movd [ebx + edx*4 + 8], mm7
14511 ;# loop back to mno
14512 dec dword ptr [esp + i3310_nsvdw] ;# one vdw-only atom done
14513 jz .i3310_last_mno ;# counter reached zero - all groups handled
14514 jmp .i3310_mno_vdw
;# End of one outer iteration: fold the two halves of vctot and vnbtot and add
;# them to Vc[gid] and Vnb[gid], advance the gid pointer, then decrement nri
;# and either leave the routine or start the next outer iteration.
14516 .i3310_last_mno:
14517 mov edx, [ebp + i3310_gid] ;# get group index for this i particle
14518 mov edx, [edx] ;# edx = gid value
14519 add dword ptr [ebp + i3310_gid], 4 ;# advance pointer
14521 movq mm7, [esp + i3310_vctot]
14522 pfacc mm7,mm7 ;# get and sum the two parts of total potential
14524 mov eax, [ebp + i3310_Vc]
14525 movd mm6, [eax + edx*4]
14526 pfadd mm6, mm7
14527 movd [eax + edx*4], mm6 ;# increment vc[gid]
14529 movq mm7, [esp + i3310_vnbtot]
14530 pfacc mm7,mm7 ;# get and sum the two parts of total potential
14532 mov eax, [ebp + i3310_Vnb]
14533 movd mm6, [eax + edx*4]
14534 pfadd mm6, mm7
14535 movd [eax + edx*4], mm6 ;# increment vnb[gid]
14536 ;# finish if last
14537 mov ecx, [ebp + i3310_nri]
14538 dec ecx
14539 jecxz .i3310_end ;# jecxz tests ecx itself, not the flags
14540 ;# not last, iterate once more!
14541 mov [ebp + i3310_nri], ecx ;# write the decremented count back to the argument slot
14542 jmp .i3310_outer
;# Common exit: leave the MMX/3DNow state, release the local stack area and
;# restore the saved registers in reverse order.
14543 .i3310_end:
14544 femms ;# fast exit from MMX state before returning to non-MMX code
14545 add esp, 168 ;# drop the locals (presumably the size reserved in the prologue, which is not shown here)
14546 pop edi
14547 pop esi
14548 pop edx
14549 pop ecx
14550 pop ebx
14551 pop eax
14552 leave
;# NOTE(review): no ret is visible after leave in this listing (original line
;# numbers 14553-14555 are absent) - confirm the return instruction is present
;# in the real file, otherwise execution falls through into inl3320_3dnow.
;# inl3320_3dnow: entry point, frame layout and one-time setup.
;# The first .equiv group names the stack arguments (offsets from ebp once the
;# frame is built), the second group names the locals (offsets from esp after
;# 228 bytes have been reserved). The setup code then caches iqO/iqH, ntia,
;# the constant two and the table scale before the outer loop starts; charge
;# and type are read for the first entry of iinr[] only.
14556 .globl inl3320_3dnow
14557 .globl _inl3320_3dnow
14558 inl3320_3dnow:
14559 _inl3320_3dnow:
14560 .equiv i3320_nri, 8
14561 .equiv i3320_iinr, 12
14562 .equiv i3320_jindex, 16
14563 .equiv i3320_jjnr, 20
14564 .equiv i3320_shift, 24
14565 .equiv i3320_shiftvec, 28
14566 .equiv i3320_fshift, 32
14567 .equiv i3320_gid, 36
14568 .equiv i3320_pos, 40
14569 .equiv i3320_faction, 44
14570 .equiv i3320_charge, 48
14571 .equiv i3320_facel, 52
14572 .equiv i3320_Vc, 56
14573 .equiv i3320_type, 60
14574 .equiv i3320_ntype, 64
14575 .equiv i3320_nbfp, 68
14576 .equiv i3320_Vnb, 72
14577 .equiv i3320_tabscale, 76
14578 .equiv i3320_VFtab, 80
14579 ;# stack offsets for local variables
14580 .equiv i3320_is3, 0
14581 .equiv i3320_ii3, 4
14582 .equiv i3320_ixO, 8
14583 .equiv i3320_iyO, 12
14584 .equiv i3320_izO, 16
14585 .equiv i3320_ixH, 20
14586 .equiv i3320_iyH, 28
14587 .equiv i3320_izH, 36
14588 .equiv i3320_iqO, 44
14589 .equiv i3320_iqH, 52
14590 .equiv i3320_qqO, 60
14591 .equiv i3320_qqH, 68
14592 .equiv i3320_vctot, 76
14593 .equiv i3320_vnbtot, 84
14594 .equiv i3320_c6, 92
14595 .equiv i3320_c12, 100
14596 .equiv i3320_two, 108
14597 .equiv i3320_n1, 116
14598 .equiv i3320_tsc, 124
14599 .equiv i3320_ntia, 132
14600 .equiv i3320_innerjjnr, 140
14601 .equiv i3320_innerk, 144
14602 .equiv i3320_fixO, 148
14603 .equiv i3320_fiyO, 152
14604 .equiv i3320_fizO, 156
14605 .equiv i3320_fixH, 160
14606 .equiv i3320_fiyH, 168
14607 .equiv i3320_fizH, 176
14608 .equiv i3320_dxO, 184
14609 .equiv i3320_dyO, 188
14610 .equiv i3320_dzO, 192
14611 .equiv i3320_dxH, 196
14612 .equiv i3320_dyH, 204
14613 .equiv i3320_dzH, 212
14614 .equiv i3320_tmprsqH, 220
14615 push ebp
14616 mov ebp,esp ;# arguments are now at [ebp + 8] and up
14617 push eax
14618 push ebx
14619 push ecx
14620 push edx
14621 push esi
14622 push edi
14623 sub esp, 228 ;# local stack space
14624 femms
14626 mov ecx, [ebp + i3320_iinr] ;# ecx = pointer into iinr[]
14627 mov ebx, [ecx] ;# ebx=ii
14629 mov edx, [ebp + i3320_charge]
14630 movd mm1, [ebp + i3320_facel]
14631 movd mm2, [edx + ebx*4] ;# mm2=charge[ii0]
14632 pfmul mm2, mm1
14633 movq [esp + i3320_iqO], mm2 ;# iqO = facel*charge[ii]
14635 movd mm2, [edx + ebx*4 + 4] ;# mm2=charge[ii0+1]
14636 pfmul mm2, mm1
14637 punpckldq mm2,mm2 ;# spread to both halves
14638 movq [esp + i3320_iqH], mm2 ;# iqH = facel*charge[ii0+1]
14640 mov edx, [ebp + i3320_type]
14641 mov ecx, [edx + ebx*4]
14642 shl ecx, 1
14643 imul ecx, [ebp + i3320_ntype] ;# ecx = ntia = 2*ntype*type[ii0]
14644 mov [esp + i3320_ntia], ecx
14646 movq mm3, [mm_two] ;# mm_two is defined elsewhere in the file (not shown here)
14647 movq mm4, [ebp + i3320_tabscale] ;# 8-byte load also picks up the VFtab argument in the high half
14648 punpckldq mm4,mm4 ;# spread to both halves (the high half loaded above is discarded)
14649 movq [esp + i3320_two], mm3
14650 movq [esp + i3320_tsc], mm4
14651 ;# assume we have at least one i particle - start directly
;# Outer loop body of inl3320: for the next i molecule, fetch the shift index
;# and shift vector, build shifted coordinates for the first atom (O slots) and
;# for the two following atoms (H slots, one per register half), clear the
;# energy and force accumulators, and set up the j-list pointer and counter.
;# The shift, iinr and jindex argument pointers are advanced in place.
14652 .i3320_outer:
14653 mov eax, [ebp + i3320_shift] ;# eax = pointer into shift[]
14654 mov ebx, [eax] ;# ebx=shift[n]
14655 add dword ptr [ebp + i3320_shift], 4 ;# advance pointer one step
14657 lea ebx, [ebx + ebx*2] ;# ebx=3*is
14658 mov [esp + i3320_is3],ebx ;# store is3
14660 mov eax, [ebp + i3320_shiftvec] ;# eax = base of shiftvec[]
14662 movq mm5, [eax + ebx*4] ;# move shX/shY to mm5 and shZ to mm6.
14663 movd mm6, [eax + ebx*4 + 8]
14664 movq mm0, mm5
14665 movq mm1, mm5
14666 movq mm2, mm6
14667 punpckldq mm0,mm0 ;# also expand shX,Y,Z in mm0--mm2.
14668 punpckhdq mm1,mm1
14669 punpckldq mm2,mm2
14671 mov ecx, [ebp + i3320_iinr] ;# ecx = pointer into iinr[]
14672 add dword ptr [ebp + i3320_iinr], 4 ;# advance pointer
14673 mov ebx, [ecx] ;# ebx=ii
14675 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
14676 mov eax, [ebp + i3320_pos] ;# eax = base of pos[]
14678 pfadd mm5, [eax + ebx*4] ;# ix = shX + posX (and iy too)
14679 movd mm7, [eax + ebx*4 + 8] ;# cant use direct memory add for 4 bytes (iz)
14680 mov [esp + i3320_ii3], ebx ;# (use mm7 as temp. storage for iz.)
14681 pfadd mm6, mm7
14682 movq [esp + i3320_ixO], mm5
14683 movq [esp + i3320_izO], mm6 ;# 8-byte store also covers the first dword of the ixH slot, which is written below
14685 movd mm3, [eax + ebx*4 + 12]
14686 movd mm4, [eax + ebx*4 + 16]
14687 movd mm5, [eax + ebx*4 + 20]
14688 punpckldq mm3, [eax + ebx*4 + 24]
14689 punpckldq mm4, [eax + ebx*4 + 28]
14690 punpckldq mm5, [eax + ebx*4 + 32] ;# coords of H1 in low mm3-mm5, H2 in high
14692 pfadd mm0, mm3
14693 pfadd mm1, mm4
14694 pfadd mm2, mm5
14695 movq [esp + i3320_ixH], mm0 ;# must come after the izO store above
14696 movq [esp + i3320_iyH], mm1
14697 movq [esp + i3320_izH], mm2
14699 ;# clear vctot and i forces
14700 pxor mm7,mm7
14701 movq [esp + i3320_vctot], mm7
14702 movq [esp + i3320_vnbtot], mm7
14703 movq [esp + i3320_fixO], mm7 ;# clears fixO and fiyO together
14704 movd [esp + i3320_fizO], mm7 ;# 4-byte store so the fixH slot is not touched
14705 movq [esp + i3320_fixH], mm7
14706 movq [esp + i3320_fiyH], mm7
14707 movq [esp + i3320_fizH], mm7
14709 mov eax, [ebp + i3320_jindex]
14710 mov ecx, [eax] ;# jindex[n]
14711 mov edx, [eax + 4] ;# jindex[n+1]
14712 add dword ptr [ebp + i3320_jindex], 4
14713 sub edx, ecx ;# number of innerloop atoms
14714 mov [esp + i3320_innerk], edx
14716 mov esi, [ebp + i3320_pos] ;# esi = base of pos[] for the inner loop
14717 mov edi, [ebp + i3320_faction] ;# edi = base of faction[] for the inner loop
14718 mov eax, [ebp + i3320_jjnr]
14719 shl ecx, 2 ;# byte offset of jjnr[nj0]
14720 add eax, ecx
14721 mov [esp + i3320_innerjjnr], eax ;# pointer to jjnr[nj0]
14722 .i3320_inner_loop:
14723 ;# a single j particle iteration here - compare with the unrolled code for comments.
14724 mov eax, [esp + i3320_innerjjnr]
14725 mov eax, [eax] ;# eax=jnr offset
14726 add dword ptr [esp + i3320_innerjjnr], 4 ;# advance pointer
14727 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
14729 mov ecx, [ebp + i3320_charge]
14730 movd mm7, [ecx + eax*4]
14731 punpckldq mm7,mm7
14732 movq mm6,mm7
14733 pfmul mm6, [esp + i3320_iqO]
14734 pfmul mm7, [esp + i3320_iqH] ;# mm6=qqO, mm7=qqH
14735 movd [esp + i3320_qqO], mm6
14736 movq [esp + i3320_qqH], mm7
14738 mov ecx, [ebp + i3320_type]
14739 mov edx, [ecx + eax*4] ;# type [jnr]
14740 mov ecx, [ebp + i3320_nbfp]
14741 shl edx, 1
14742 add edx, [esp + i3320_ntia] ;# tja = ntia + 2*type
14743 movd mm5, [ecx + edx*4] ;# mm5 = 1st c6
14744 movq [esp + i3320_c6], mm5
14745 movd mm5, [ecx + edx*4 + 4] ;# mm5 = 1st c12
14746 movq [esp + i3320_c12], mm5
14748 lea eax, [eax + eax*2]
14750 movq mm0, [esi + eax*4]
14751 movd mm1, [esi + eax*4 + 8]
14752 ;# copy & expand to mm2-mm4 for the H interactions
14753 movq mm2, mm0
14754 movq mm3, mm0
14755 movq mm4, mm1
14756 punpckldq mm2,mm2
14757 punpckhdq mm3,mm3
14758 punpckldq mm4,mm4
14760 pfsubr mm0, [esp + i3320_ixO]
14761 pfsubr mm1, [esp + i3320_izO]
14763 movq [esp + i3320_dxO], mm0
14764 pfmul mm0,mm0
14765 movd [esp + i3320_dzO], mm1
14766 pfmul mm1,mm1
14767 pfacc mm0, mm1
14768 pfadd mm0, mm1 ;# mm0=rsqO
14770 punpckldq mm2, mm2
14771 punpckldq mm3, mm3
14772 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
14773 pfsubr mm2, [esp + i3320_ixH]
14774 pfsubr mm3, [esp + i3320_iyH]
14775 pfsubr mm4, [esp + i3320_izH] ;# mm2-mm4 is dxH-dzH
14777 movq [esp + i3320_dxH], mm2
14778 movq [esp + i3320_dyH], mm3
14779 movq [esp + i3320_dzH], mm4
14780 pfmul mm2,mm2
14781 pfmul mm3,mm3
14782 pfmul mm4,mm4
14784 pfadd mm3,mm2
14785 pfadd mm3,mm4 ;# mm3=rsqH
14786 movq [esp + i3320_tmprsqH], mm3
14788 pfrsqrt mm1,mm0
14790 movq mm2,mm1
14791 pfmul mm1,mm1
14792 pfrsqit1 mm1,mm0
14793 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
14795 pfmul mm0, mm1 ;# mm0=r
14797 pfmul mm0, [esp + i3320_tsc]
14798 pf2iw mm4, mm0
14799 movd [esp + i3320_n1], mm4
14800 pi2fd mm4,mm4
14801 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
14802 movq mm2, mm0
14803 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
14805 ;# coulomb table
14806 mov edx, [ebp + i3320_VFtab]
14807 mov ecx, [esp + i3320_n1]
14808 lea ecx, [ecx + ecx*2]
14809 shl ecx, 2
14810 ;# load all values we need
14811 movd mm4, [edx + ecx*4]
14812 movd mm5, [edx + ecx*4 + 4]
14813 movd mm6, [edx + ecx*4 + 8]
14814 movd mm7, [edx + ecx*4 + 12]
14816 pfmul mm6, mm0 ;# mm6 = Geps
14817 pfmul mm7, mm2 ;# mm7 = Heps2
14819 pfadd mm5, mm6
14820 pfadd mm5, mm7 ;# mm5 = Fp
14822 pfmul mm7, [esp + i3320_two] ;# two*Heps2
14823 pfadd mm7, mm6
14824 pfadd mm7, mm5 ;# mm7=FF
14826 pfmul mm5, mm0 ;# mm5=eps*Fp
14827 pfadd mm5, mm4 ;# mm5= VV
14829 pfmul mm5, [esp + i3320_qqO] ;# vcoul=qq*VV
14830 pfmul mm7, [esp + i3320_qqO] ;# fijC=qq*FF
14832 ;# update vctot directly, use mm3 for fscal sum.
14833 pfadd mm5, [esp + i3320_vctot]
14834 movq [esp + i3320_vctot], mm5
14835 movq mm3, mm7
14837 ;# dispersion table
14838 ;# load all the table values we need
14839 movd mm4, [edx + ecx*4 + 16]
14840 movd mm5, [edx + ecx*4 + 20]
14841 movd mm6, [edx + ecx*4 + 24]
14842 movd mm7, [edx + ecx*4 + 28]
14843 pfmul mm6, mm0 ;# mm6 = Geps
14844 pfmul mm7, mm2 ;# mm7 = Heps2
14845 pfadd mm5, mm6
14846 pfadd mm5, mm7 ;# mm5 = Fp
14847 pfmul mm7, [esp + i3320_two] ;# two*Heps2
14848 pfadd mm7, mm6
14849 pfadd mm7, mm5 ;# mm7=FF
14850 pfmul mm5, mm0 ;# mm5=eps*Fp
14851 pfadd mm5, mm4 ;# mm5= VV
14853 movq mm4, [esp + i3320_c6]
14854 pfmul mm7, mm4 ;# fijD
14855 pfmul mm5, mm4 ;# vnb6
14856 pfadd mm3, mm7 ;# add to fscal
14858 ;# update vnbtot to release mm5!
14859 pfadd mm5, [esp + i3320_vnbtot] ;# add the earlier value
14860 movq [esp + i3320_vnbtot], mm5 ;# store the sum
14862 ;# repulsion table
14863 ;# load all the table values we need
14864 movd mm4, [edx + ecx*4 + 32]
14865 movd mm5, [edx + ecx*4 + 36]
14866 movd mm6, [edx + ecx*4 + 40]
14867 movd mm7, [edx + ecx*4 + 44]
14869 pfmul mm6, mm0 ;# mm6 = Geps
14870 pfmul mm7, mm2 ;# mm7 = Heps2
14871 pfadd mm5, mm6
14872 pfadd mm5, mm7 ;# mm5 = Fp
14873 pfmul mm7, [esp + i3320_two] ;# two*Heps2
14874 pfadd mm7, mm6
14875 pfadd mm7, mm5 ;# mm7=FF
14876 pfmul mm5, mm0 ;# mm5=eps*Fp
14877 pfadd mm5, mm4 ;# mm5= VV
14879 movq mm6, [esp + i3320_c12]
14880 pfmul mm7, mm6 ;# fijR
14881 pfmul mm5, mm6 ;# vnb12
14882 pfadd mm3, mm7 ;# total fscal fijC+ fijD+ fijR
14884 ;# change sign of fscal and multiply with rinv
14885 pxor mm0,mm0
14886 pfsubr mm3, mm0
14887 pfmul mm3, [esp + i3320_tsc]
14888 pfmul mm3, mm1 ;# mm3 is total fscal (for the oxygen) now
14890 ;# update vnbtot
14891 pfadd mm5, [esp + i3320_vnbtot] ;# add the earlier value
14892 movq [esp + i3320_vnbtot], mm5 ;# store the sum
14894 ;# Ready with the oxygen - potential is updated, fscal is in mm3.
14895 ;# now do the two hydrogens.
14896 movq mm0, [esp + i3320_tmprsqH] ;# mm0=rsqH
14898 pfrsqrt mm1, mm0
14899 pswapd mm0,mm0
14900 pfrsqrt mm2, mm0
14901 pswapd mm0,mm0
14902 punpckldq mm1,mm2 ;# seeds are in mm1 now, and rsq in mm0.
14904 movq mm2, mm1
14905 pfmul mm1,mm1
14906 pfrsqit1 mm1,mm0
14907 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
14909 pfmul mm0,mm1 ;# mm0=r
14910 pfmul mm0, [esp + i3320_tsc]
14911 pf2iw mm4, mm0
14912 movq [esp + i3320_n1], mm4
14913 pi2fd mm4,mm4
14914 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
14915 movq mm2, mm0
14916 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
14918 ;# coulomb table
14919 mov edx, [ebp + i3320_VFtab]
14920 mov ecx, [esp + i3320_n1]
14921 lea ecx, [ecx + ecx*2]
14922 shl ecx, 2
14923 ;# load all values we need
14924 movd mm4, [edx + ecx*4]
14925 movd mm5, [edx + ecx*4 + 4]
14926 movd mm6, [edx + ecx*4 + 8]
14927 movd mm7, [edx + ecx*4 + 12]
14928 mov ecx, [esp + i3320_n1 + 4]
14929 lea ecx, [ecx + ecx*2]
14930 shl ecx, 2
14931 punpckldq mm4, [edx + ecx*4]
14932 punpckldq mm5, [edx + ecx*4 + 4]
14933 punpckldq mm6, [edx + ecx*4 + 8]
14934 punpckldq mm7, [edx + ecx*4 + 12]
14937 pfmul mm6, mm0 ;# mm6 = Geps
14938 pfmul mm7, mm2 ;# mm7 = Heps2
14940 pfadd mm5, mm6
14941 pfadd mm5, mm7 ;# mm5 = Fp
14943 pfmul mm7, [esp + i3320_two] ;# two*Heps2
14944 pfadd mm7, mm6
14945 pfadd mm7, mm5 ;# mm7=FF
14947 pfmul mm5, mm0 ;# mm5=eps*Fp
14948 pfadd mm5, mm4 ;# mm5= VV
14950 pfmul mm5, [esp + i3320_qqH] ;# vcoul=qq*VV
14951 pfmul mm7, [esp + i3320_qqH] ;# fijC=qq*FF
14952 ;# update vctot
14953 pfadd mm5, [esp + i3320_vctot]
14954 movq [esp + i3320_vctot], mm5
14956 ;# change sign of fijC and multiply by rinv
14957 pxor mm4,mm4
14958 pfsub mm4, mm7
14959 pfmul mm4, [esp + i3320_tsc]
14960 pfmul mm4, mm1 ;# mm4 is total fscal (for the hydrogens) now
14962 ;# spread oxygen fscalar to both positions
14963 punpckldq mm3,mm3
14964 ;# calc vectorial force for O
14965 prefetchw [edi + eax*4] ;# prefetch faction to cache
14966 movq mm0, [esp + i3320_dxO]
14967 movd mm1, [esp + i3320_dzO]
14968 pfmul mm0, mm3
14969 pfmul mm1, mm3
14971 ;# calc vectorial force for H's
14972 movq mm5, [esp + i3320_dxH]
14973 movq mm6, [esp + i3320_dyH]
14974 movq mm7, [esp + i3320_dzH]
14975 pfmul mm5, mm4
14976 pfmul mm6, mm4
14977 pfmul mm7, mm4
14979 ;# update iO particle force
14980 movq mm2, [esp + i3320_fixO]
14981 movd mm3, [esp + i3320_fizO]
14982 pfadd mm2, mm0
14983 pfadd mm3, mm1
14984 movq [esp + i3320_fixO], mm2
14985 movd [esp + i3320_fizO], mm3
14987 ;# update iH forces
14988 movq mm2, [esp + i3320_fixH]
14989 movq mm3, [esp + i3320_fiyH]
14990 movq mm4, [esp + i3320_fizH]
14991 pfadd mm2, mm5
14992 pfadd mm3, mm6
14993 pfadd mm4, mm7
14994 movq [esp + i3320_fixH], mm2
14995 movq [esp + i3320_fiyH], mm3
14996 movq [esp + i3320_fizH], mm4
14998 ;# pack j forces from H in the same form as the oxygen force.
14999 pfacc mm5, mm6 ;# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
15000 pfacc mm7, mm7 ;# mm7(l)=fjz(H1+ h2)
15002 pfadd mm0, mm5 ;# add up total force on j particle.
15003 pfadd mm1, mm7
15005 ;# update j particle force
15006 movq mm2, [edi + eax*4]
15007 movd mm3, [edi + eax*4 + 8]
15008 pfsub mm2, mm0
15009 pfsub mm3, mm1
15010 movq [edi + eax*4], mm2
15011 movd [edi + eax*4 +8], mm3
15013 ;# done - one more?
15014 dec dword ptr [esp + i3320_innerk]
15015 jz .i3320_updateouterdata
15016 jmp .i3320_inner_loop
15017 .i3320_updateouterdata:
15018 mov ecx, [esp + i3320_ii3]
15020 movq mm6, [edi + ecx*4] ;# increment iO force
15021 movd mm7, [edi + ecx*4 + 8]
15022 pfadd mm6, [esp + i3320_fixO]
15023 pfadd mm7, [esp + i3320_fizO]
15024 movq [edi + ecx*4], mm6
15025 movd [edi + ecx*4 +8], mm7
15027 movq mm0, [esp + i3320_fixH]
15028 movq mm3, [esp + i3320_fiyH]
15029 movq mm1, [esp + i3320_fizH]
15030 movq mm2, mm0
15031 punpckldq mm0, mm3 ;# mm0(l)=fxH1, mm0(h)=fyH1
15032 punpckhdq mm2, mm3 ;# mm2(l)=fxH2, mm2(h)=fyH2
15033 movq mm3, mm1
15034 pswapd mm3,mm3
15035 ;# mm1 is fzH1
15036 ;# mm3 is fzH2
15038 movq mm6, [edi + ecx*4 + 12] ;# increment iH1 force
15039 movd mm7, [edi + ecx*4 + 20]
15040 pfadd mm6, mm0
15041 pfadd mm7, mm1
15042 movq [edi + ecx*4 + 12], mm6
15043 movd [edi + ecx*4 + 20], mm7
15045 movq mm6, [edi + ecx*4 + 24] ;# increment iH2 force
15046 movd mm7, [edi + ecx*4 + 32]
15047 pfadd mm6, mm2
15048 pfadd mm7, mm3
15049 movq [edi + ecx*4 + 24], mm6
15050 movd [edi + ecx*4 + 32], mm7
15053 mov ebx, [ebp + i3320_fshift] ;# increment fshift force
15054 mov edx, [esp + i3320_is3]
15056 movq mm6, [ebx + edx*4]
15057 movd mm7, [ebx + edx*4 + 8]
15058 pfadd mm6, [esp + i3320_fixO]
15059 pfadd mm7, [esp + i3320_fizO]
15060 pfadd mm6, mm0
15061 pfadd mm7, mm1
15062 pfadd mm6, mm2
15063 pfadd mm7, mm3
15064 movq [ebx + edx*4], mm6
15065 movd [ebx + edx*4 + 8], mm7
15067 mov edx, [ebp + i3320_gid] ;# get group index for this i particle
15068 mov edx, [edx]
15069 add dword ptr [ebp + i3320_gid], 4 ;# advance pointer
15071 movq mm7, [esp + i3320_vctot]
15072 pfacc mm7,mm7 ;# get and sum the two parts of total potential
15074 mov eax, [ebp + i3320_Vc]
15075 movd mm6, [eax + edx*4]
15076 pfadd mm6, mm7
15077 movd [eax + edx*4], mm6 ;# increment vc[gid]
15079 movq mm7, [esp + i3320_vnbtot]
15080 pfacc mm7,mm7 ;# same for Vnb
15082 mov eax, [ebp + i3320_Vnb]
15083 movd mm6, [eax + edx*4]
15084 pfadd mm6, mm7
15085 movd [eax + edx*4], mm6 ;# increment vnb[gid]
15086 ;# finish if last
15087 dec dword ptr [ebp + i3320_nri]
15088 jz .i3320_end
15089 ;# not last, iterate once more!
15090 jmp .i3320_outer
15091 .i3320_end:
15092 femms
15093 add esp, 228
15094 pop edi
15095 pop esi
15096 pop edx
15097 pop ecx
15098 pop ebx
15099 pop eax
15100 leave
;# ---------------------------------------------------------------------
;# inl3330_3dnow
;#
;# Neighbour-list inner loop in which both the i entry and every j entry
;# are three-atom groups laid out as O, H1, H2 (coordinates and forces at
;# byte offsets 0, 12 and 24 from the group's first atom).
;# All nine atom pairs get a tabulated Coulomb term; the O-O pair also
;# gets tabulated dispersion and repulsion terms scaled by c6 and c12.
;#
;# Arguments are read from the stack relative to ebp (i3330_nri through
;# i3330_VFtab below). The argument slots nri, iinr, jindex, shift and
;# gid are modified in place (decremented / advanced) while looping.
;#
;# Table layout as indexed here: 12 floats (48 bytes) per table point,
;# Coulomb at +0, dispersion at +16, repulsion at +32. With the four
;# values of one set called Y,F,G,H the code evaluates
;#   Fp = F + G*eps + H*eps2,  VV = Y + eps*Fp,  FF = Fp + G*eps + 2*H*eps2
;#
;# The charge products and c6/c12 are computed once, from the first entry
;# of iinr[], before the outer loop starts.
;#
;# Results: faction[] is updated for the i and j atoms, fshift[is3],
;# Vc[gid] and Vnb[gid] are incremented.
;# eax, ebx, ecx, edx, esi and edi are saved on entry and restored on
;# exit; femms is issued on entry and before leaving.
;# ---------------------------------------------------------------------
15105 .globl inl3330_3dnow
15106 .globl _inl3330_3dnow
15107 inl3330_3dnow:
15108 _inl3330_3dnow:
;# argument offsets relative to ebp
15109 .equiv i3330_nri, 8
15110 .equiv i3330_iinr, 12
15111 .equiv i3330_jindex, 16
15112 .equiv i3330_jjnr, 20
15113 .equiv i3330_shift, 24
15114 .equiv i3330_shiftvec, 28
15115 .equiv i3330_fshift, 32
15116 .equiv i3330_gid, 36
15117 .equiv i3330_pos, 40
15118 .equiv i3330_faction, 44
15119 .equiv i3330_charge, 48
15120 .equiv i3330_facel, 52
15121 .equiv i3330_Vc, 56
15122 .equiv i3330_type, 60
15123 .equiv i3330_ntype, 64
15124 .equiv i3330_nbfp, 68
15125 .equiv i3330_Vnb, 72
15126 .equiv i3330_tabscale, 76
15127 .equiv i3330_VFtab, 80
15128 ;# stack offsets for local variables
;# (entries spaced 8 bytes apart hold two packed floats: H1 low, H2 high)
15129 .equiv i3330_is3, 0
15130 .equiv i3330_ii3, 4
15131 .equiv i3330_ixO, 8
15132 .equiv i3330_iyO, 12
15133 .equiv i3330_izO, 16
15134 .equiv i3330_ixH, 20
15135 .equiv i3330_iyH, 28
15136 .equiv i3330_izH, 36
15137 .equiv i3330_qqOO, 44
15138 .equiv i3330_qqOH, 52
15139 .equiv i3330_qqHH, 60
15140 .equiv i3330_c6, 68
15141 .equiv i3330_c12, 76
15142 .equiv i3330_two, 84
15143 .equiv i3330_n1, 92
15144 .equiv i3330_tsc, 100
15145 .equiv i3330_vctot, 108
15146 .equiv i3330_vnbtot, 116
15147 .equiv i3330_innerjjnr, 124
15148 .equiv i3330_innerk, 128
15149 .equiv i3330_fixO, 132
15150 .equiv i3330_fiyO, 136
15151 .equiv i3330_fizO, 140
15152 .equiv i3330_fixH, 144
15153 .equiv i3330_fiyH, 152
15154 .equiv i3330_fizH, 160
15155 .equiv i3330_dxO, 168
15156 .equiv i3330_dyO, 172
15157 .equiv i3330_dzO, 176
15158 .equiv i3330_dxH, 180
15159 .equiv i3330_dyH, 188
15160 .equiv i3330_dzH, 196
15161 .equiv i3330_tmprsqH, 204
;# prologue: set up frame pointer, save the integer registers used below
15162 push ebp
15163 mov ebp,esp
15164 push eax
15165 push ebx
15166 push ecx
15167 push edx
15168 push esi
15169 push edi
15170 sub esp, 212 ;# local stack space
15171 femms
15172 ;# assume we have at least one i particle - start directly
15174 mov ecx, [ebp + i3330_iinr] ;# ecx = pointer into iinr[]
15175 mov ebx, [ecx] ;# ebx=ii
;# charge products, computed once from the first i entry
15177 mov edx, [ebp + i3330_charge]
15178 movd mm1, [ebp + i3330_facel] ;# mm1=facel
15179 movd mm2, [edx + ebx*4] ;# mm2=charge[ii0] (O)
15180 movd mm3, [edx + ebx*4 + 4] ;# mm3=charge[ii0+1] (H)
15181 movq mm4, mm2
15182 pfmul mm4, mm1
15183 movq mm6, mm3
15184 pfmul mm6, mm1
15185 movq mm5, mm4
15186 pfmul mm4, mm2 ;# mm4=qqOO*facel
15187 pfmul mm5, mm3 ;# mm5=qqOH*facel
15188 pfmul mm6, mm3 ;# mm6=qqHH*facel
15189 punpckldq mm5,mm5 ;# spread to both halves
15190 punpckldq mm6,mm6 ;# spread to both halves
15191 movq [esp + i3330_qqOO], mm4
15192 movq [esp + i3330_qqOH], mm5
15193 movq [esp + i3330_qqHH], mm6
;# c6/c12 lookup: edx = 2*type*(ntype+1), used as a float index into nbfp[]
15194 mov edx, [ebp + i3330_type]
15195 mov ecx, [edx + ebx*4]
15196 shl ecx, 1
15197 mov edx, ecx
15198 imul ecx, [ebp + i3330_ntype]
15199 add edx, ecx
15200 mov eax, [ebp + i3330_nbfp]
15201 movd mm0, [eax + edx*4]
15202 movd mm1, [eax + edx*4 + 4]
15203 movq [esp + i3330_c6], mm0
15204 movq [esp + i3330_c12], mm1
;# keep local copies of the constant two and of tabscale (in both halves)
15205 movq mm2, [mm_two]
15206 movq [esp + i3330_two], mm2
15207 movd mm3, [ebp + i3330_tabscale]
15208 punpckldq mm3,mm3
15209 movq [esp + i3330_tsc], mm3
;# ---- outer loop: one iteration per i entry ----
15210 .i3330_outer:
15211 mov eax, [ebp + i3330_shift] ;# eax = pointer into shift[]
15212 mov ebx, [eax] ;# ebx=shift[n]
15213 add dword ptr [ebp + i3330_shift], 4 ;# advance pointer one step
15215 lea ebx, [ebx + ebx*2] ;# ebx=3*is
15216 mov [esp + i3330_is3],ebx ;# store is3
15218 mov eax, [ebp + i3330_shiftvec] ;# eax = base of shiftvec[]
15220 movq mm5, [eax + ebx*4] ;# move shX/shY to mm5 and shZ to mm6.
15221 movd mm6, [eax + ebx*4 + 8]
15222 movq mm0, mm5
15223 movq mm1, mm5
15224 movq mm2, mm6
15225 punpckldq mm0,mm0 ;# also expand shX,Y,Z in mm0--mm2.
15226 punpckhdq mm1,mm1
15227 punpckldq mm2,mm2
15229 mov ecx, [ebp + i3330_iinr] ;# ecx = pointer into iinr[]
15230 add dword ptr [ebp + i3330_iinr], 4 ;# advance pointer
15231 mov ebx, [ecx] ;# ebx=ii
15233 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
15234 mov eax, [ebp + i3330_pos] ;# eax = base of pos[]
15236 pfadd mm5, [eax + ebx*4] ;# ix = shX + posX (and iy too)
15237 movd mm7, [eax + ebx*4 + 8] ;# can't use direct memory add for 4 bytes (iz)
15238 mov [esp + i3330_ii3], ebx ;# (use mm7 as temp. storage for iz.)
15239 pfadd mm6, mm7
15240 movq [esp + i3330_ixO], mm5
;# the 8-byte store below also covers the first 4 bytes of ixH, which is
;# written further down.
15241 movq [esp + i3330_izO], mm6
15243 movd mm3, [eax + ebx*4 + 12]
15244 movd mm4, [eax + ebx*4 + 16]
15245 movd mm5, [eax + ebx*4 + 20]
15246 punpckldq mm3, [eax + ebx*4 + 24]
15247 punpckldq mm4, [eax + ebx*4 + 28]
15248 punpckldq mm5, [eax + ebx*4 + 32] ;# coords of H1 in low mm3-mm5, H2 in high
15250 pfadd mm0, mm3
15251 pfadd mm1, mm4
15252 pfadd mm2, mm5
15253 movq [esp + i3330_ixH], mm0
15254 movq [esp + i3330_iyH], mm1
15255 movq [esp + i3330_izH], mm2
15257 ;# clear vctot and i forces
15258 pxor mm7,mm7
15259 movq [esp + i3330_vctot], mm7
15260 movq [esp + i3330_vnbtot], mm7
15261 movq [esp + i3330_fixO], mm7
;# 8-byte store at fizO also touches the start of fixH, which is cleared next anyway
15262 movq [esp + i3330_fizO], mm7
15263 movq [esp + i3330_fixH], mm7
15264 movq [esp + i3330_fiyH], mm7
15265 movq [esp + i3330_fizH], mm7
15267 mov eax, [ebp + i3330_jindex]
15268 mov ecx, [eax] ;# jindex[n]
15269 mov edx, [eax + 4] ;# jindex[n+1]
15270 add dword ptr [ebp + i3330_jindex], 4
15271 sub edx, ecx ;# number of innerloop atoms
15272 mov [esp + i3330_innerk], edx
15274 mov esi, [ebp + i3330_pos]
15275 mov edi, [ebp + i3330_faction]
15276 mov eax, [ebp + i3330_jjnr]
15277 shl ecx, 2
15278 add eax, ecx
15279 mov [esp + i3330_innerjjnr], eax ;# pointer to jjnr[nj0]
;# ---- inner loop: one j entry per iteration, three passes (jO, jH1, jH2) ----
15280 .i3330_inner_loop:
15281 ;# a single j particle iteration here - compare with the unrolled code for comments.
15282 mov eax, [esp + i3330_innerjjnr]
15283 mov eax, [eax] ;# eax=jnr offset
15284 add dword ptr [esp + i3330_innerjjnr], 4 ;# advance pointer
15286 lea eax, [eax + eax*2]
;# interactions with j O (atom at byte offset 0)
15288 movq mm0, [esi + eax*4]
15289 movd mm1, [esi + eax*4 + 8]
15290 ;# copy & expand to mm2-mm4 for the H interactions
15291 movq mm2, mm0
15292 movq mm3, mm0
15293 movq mm4, mm1
15294 punpckldq mm2,mm2
15295 punpckhdq mm3,mm3
15296 punpckldq mm4,mm4
;# pfsubr computes memory minus register, i.e. d = i - j
15298 pfsubr mm0, [esp + i3330_ixO]
15299 pfsubr mm1, [esp + i3330_izO]
15301 movq [esp + i3330_dxO], mm0
15302 pfmul mm0,mm0
15303 movd [esp + i3330_dzO], mm1
15304 pfmul mm1,mm1
15305 pfacc mm0, mm0 ;# low half = dx*dx + dy*dy
15306 pfadd mm0, mm1 ;# mm0=rsqO
15308 punpckldq mm2, mm2
15309 punpckldq mm3, mm3
15310 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
15311 pfsubr mm2, [esp + i3330_ixH]
15312 pfsubr mm3, [esp + i3330_iyH]
15313 pfsubr mm4, [esp + i3330_izH] ;# mm2-mm4 is dxH-dzH
15315 movq [esp + i3330_dxH], mm2
15316 movq [esp + i3330_dyH], mm3
15317 movq [esp + i3330_dzH], mm4
15318 pfmul mm2,mm2
15319 pfmul mm3,mm3
15320 pfmul mm4,mm4
15322 pfadd mm3,mm2
15323 pfadd mm3,mm4 ;# mm3=rsqH
15324 movq [esp + i3330_tmprsqH], mm3
;# reciprocal square root of rsqO: seed plus one refinement step
15326 pfrsqrt mm1,mm0
15328 movq mm2,mm1
15329 pfmul mm1,mm1
15330 pfrsqit1 mm1,mm0
15331 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
15332 pfmul mm0, mm1 ;# mm0=r
15334 pfmul mm0, [esp + i3330_tsc]
15335 pf2iw mm4, mm0
15336 movd [esp + i3330_n1], mm4
15337 pi2fd mm4,mm4
15338 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
15339 movq mm2, mm0
15340 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
15342 ;# coulomb table
15343 mov edx, [ebp + i3330_VFtab]
15344 mov ecx, [esp + i3330_n1]
15345 lea ecx, [ecx + ecx*2]
15346 shl ecx, 2 ;# ecx = 12*n0 (floats per table point)
15348 ;# load all values we need
15349 movd mm4, [edx + ecx*4]
15350 movd mm5, [edx + ecx*4 + 4]
15351 movd mm6, [edx + ecx*4 + 8]
15352 movd mm7, [edx + ecx*4 + 12]
15354 pfmul mm6, mm0 ;# mm6 = Geps
15355 pfmul mm7, mm2 ;# mm7 = Heps2
15357 pfadd mm5, mm6
15358 pfadd mm5, mm7 ;# mm5 = Fp
15360 pfmul mm7, [esp + i3330_two] ;# two*Heps2
15361 pfadd mm7, mm6
15362 pfadd mm7, mm5 ;# mm7=FF
15364 pfmul mm5, mm0 ;# mm5=eps*Fp
15365 pfadd mm5, mm4 ;# mm5= VV
15367 pfmul mm5, [esp + i3330_qqOO] ;# vcoul=qq*VV
15368 pfmul mm7, [esp + i3330_qqOO] ;# fijC=qq*FF
15370 ;# update vctot directly, use mm3 for fscal sum.
15371 pfadd mm5, [esp + i3330_vctot]
15372 movq [esp + i3330_vctot], mm5
15373 movq mm3, mm7
15375 ;# dispersion table
15376 ;# load all the table values we need
15377 movd mm4, [edx + ecx*4 + 16]
15378 movd mm5, [edx + ecx*4 + 20]
15379 movd mm6, [edx + ecx*4 + 24]
15380 movd mm7, [edx + ecx*4 + 28]
15381 pfmul mm6, mm0 ;# mm6 = Geps
15382 pfmul mm7, mm2 ;# mm7 = Heps2
15383 pfadd mm5, mm6
15384 pfadd mm5, mm7 ;# mm5 = Fp
15385 pfmul mm7, [esp + i3330_two] ;# two*Heps2
15386 pfadd mm7, mm6
15387 pfadd mm7, mm5 ;# mm7=FF
15388 pfmul mm5, mm0 ;# mm5=eps*Fp
15389 pfadd mm5, mm4 ;# mm5= VV
15391 movq mm4, [esp + i3330_c6]
15392 pfmul mm7, mm4 ;# fijD
15393 pfmul mm5, mm4 ;# vnb6
15394 pfadd mm3, mm7 ;# add to fscal
15396 ;# update vnbtot to release mm5!
15397 pfadd mm5, [esp + i3330_vnbtot] ;# add the earlier value
15398 movq [esp + i3330_vnbtot], mm5 ;# store the sum
15400 ;# repulsion table
15401 ;# load all the table values we need
15402 movd mm4, [edx + ecx*4 + 32]
15403 movd mm5, [edx + ecx*4 + 36]
15404 movd mm6, [edx + ecx*4 + 40]
15405 movd mm7, [edx + ecx*4 + 44]
15407 pfmul mm6, mm0 ;# mm6 = Geps
15408 pfmul mm7, mm2 ;# mm7 = Heps2
15409 pfadd mm5, mm6
15410 pfadd mm5, mm7 ;# mm5 = Fp
15411 pfmul mm7, [esp + i3330_two] ;# two*Heps2
15412 pfadd mm7, mm6
15413 pfadd mm7, mm5 ;# mm7=FF
15414 pfmul mm5, mm0 ;# mm5=eps*Fp
15415 pfadd mm5, mm4 ;# mm5= VV
15417 movq mm6, [esp + i3330_c12]
15418 pfmul mm7, mm6 ;# fijR
15419 pfmul mm5, mm6 ;# vnb12
15420 pfadd mm3, mm7 ;# total fscal fijC+ fijD+ fijR
15422 ;# change sign of fscal and multiply with rinv
15423 pxor mm0,mm0
15424 pfsubr mm3, mm0
15425 pfmul mm3, [esp + i3330_tsc]
15426 pfmul mm3, mm1 ;# mm3 is total fscal (for the oxygen) now
15428 ;# update vnbtot
15429 pfadd mm5, [esp + i3330_vnbtot] ;# add the earlier value
15430 movq [esp + i3330_vnbtot], mm5 ;# store the sum
15432 ;# Ready with the oxygen - potential is updated, fscal is in mm3.
15433 ;# time for hydrogens!
15436 movq mm0, [esp + i3330_tmprsqH]
;# pfrsqrt is applied to each half separately (pswapd exchanges the halves)
15438 pfrsqrt mm1, mm0
15439 pswapd mm0,mm0
15440 pfrsqrt mm2, mm0
15441 pswapd mm0,mm0
15442 punpckldq mm1,mm2 ;# seeds are in mm1 now, and rsq in mm0.
15444 movq mm2, mm1
15445 pfmul mm1,mm1
15446 pfrsqit1 mm1,mm0
15447 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
15449 pfmul mm0,mm1 ;# mm0=r
15450 pfmul mm0, [esp + i3330_tsc]
15451 pf2iw mm4, mm0
15452 movq [esp + i3330_n1], mm4
15453 pi2fd mm4,mm4
15454 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
15455 movq mm2, mm0
15456 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
15458 ;# coulomb table
15459 mov edx, [ebp + i3330_VFtab]
15460 mov ecx, [esp + i3330_n1]
15461 lea ecx, [ecx + ecx*2]
15462 shl ecx, 2
15463 ;# load all values we need
15464 movd mm4, [edx + ecx*4]
15465 movd mm5, [edx + ecx*4 + 4]
15466 movd mm6, [edx + ecx*4 + 8]
15467 movd mm7, [edx + ecx*4 + 12]
15468 mov ecx, [esp + i3330_n1 + 4] ;# table index for the second hydrogen
15469 lea ecx, [ecx + ecx*2]
15470 shl ecx, 2
15471 punpckldq mm4, [edx + ecx*4]
15472 punpckldq mm5, [edx + ecx*4 + 4]
15473 punpckldq mm6, [edx + ecx*4 + 8]
15474 punpckldq mm7, [edx + ecx*4 + 12]
15476 pfmul mm6, mm0 ;# mm6 = Geps
15477 pfmul mm7, mm2 ;# mm7 = Heps2
15479 pfadd mm5, mm6
15480 pfadd mm5, mm7 ;# mm5 = Fp
15482 pfmul mm7, [esp + i3330_two] ;# two*Heps2
15483 pfadd mm7, mm6
15484 pfadd mm7, mm5 ;# mm7=FF
15486 pfmul mm5, mm0 ;# mm5=eps*Fp
15487 pfadd mm5, mm4 ;# mm5= VV
15489 pfmul mm5, [esp + i3330_qqOH] ;# vcoul=qq*VV
15490 pfmul mm7, [esp + i3330_qqOH] ;# fijC=qq*FF
15491 ;# update vctot
15492 pfadd mm5, [esp + i3330_vctot]
15493 movq [esp + i3330_vctot], mm5
15495 ;# change sign of fijC and multiply by rinv
15496 pxor mm4,mm4
15497 pfsub mm4, mm7
15498 pfmul mm4, [esp + i3330_tsc]
15499 pfmul mm4, mm1 ;# mm4 is total fscal (for the hydrogens) now
15501 ;# spread oxygen fscalar to both positions
15502 punpckldq mm3,mm3
15503 ;# calc vectorial force for O
15504 movq mm0, [esp + i3330_dxO]
15505 movd mm1, [esp + i3330_dzO]
15506 pfmul mm0, mm3
15507 pfmul mm1, mm3
15509 ;# calc vectorial force for H's
15510 movq mm5, [esp + i3330_dxH]
15511 movq mm6, [esp + i3330_dyH]
15512 movq mm7, [esp + i3330_dzH]
15513 pfmul mm5, mm4
15514 pfmul mm6, mm4
15515 pfmul mm7, mm4
15517 ;# update iO particle force
15518 movq mm2, [esp + i3330_fixO]
15519 movd mm3, [esp + i3330_fizO]
15520 pfadd mm2, mm0
15521 pfadd mm3, mm1
15522 movq [esp + i3330_fixO], mm2
15523 movd [esp + i3330_fizO], mm3
15525 ;# update iH forces
15526 movq mm2, [esp + i3330_fixH]
15527 movq mm3, [esp + i3330_fiyH]
15528 movq mm4, [esp + i3330_fizH]
15529 pfadd mm2, mm5
15530 pfadd mm3, mm6
15531 pfadd mm4, mm7
15532 movq [esp + i3330_fixH], mm2
15533 movq [esp + i3330_fiyH], mm3
15534 movq [esp + i3330_fizH], mm4
15536 ;# pack j forces from H in the same form as the oxygen force.
15537 pfacc mm5, mm6 ;# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
15538 pfacc mm7, mm7 ;# mm7(l)=fjz(H1+ h2)
15540 pfadd mm0, mm5 ;# add up total force on j particle.
15541 pfadd mm1, mm7
15543 ;# update j particle force
15544 movq mm2, [edi + eax*4]
15545 movd mm3, [edi + eax*4 + 8]
15546 pfsub mm2, mm0
15547 pfsub mm3, mm1
15548 movq [edi + eax*4], mm2
15549 movd [edi + eax*4 +8], mm3
15551 ;# interactions with j H1
15553 movq mm0, [esi + eax*4 + 12]
15554 movd mm1, [esi + eax*4 + 20]
15555 ;# copy & expand to mm2-mm4 for the H interactions
15556 movq mm2, mm0
15557 movq mm3, mm0
15558 movq mm4, mm1
15559 punpckldq mm2,mm2
15560 punpckhdq mm3,mm3
15561 punpckldq mm4,mm4
15563 pfsubr mm0, [esp + i3330_ixO]
15564 pfsubr mm1, [esp + i3330_izO]
15566 movq [esp + i3330_dxO], mm0
15567 pfmul mm0,mm0
15568 movd [esp + i3330_dzO], mm1
15569 pfmul mm1,mm1
;# only the low half of mm0 is meaningful after the next two instructions
15570 pfacc mm0, mm1
15571 pfadd mm0, mm1 ;# mm0=rsqO
15573 punpckldq mm2, mm2
15574 punpckldq mm3, mm3
15575 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
15576 pfsubr mm2, [esp + i3330_ixH]
15577 pfsubr mm3, [esp + i3330_iyH]
15578 pfsubr mm4, [esp + i3330_izH] ;# mm2-mm4 is dxH-dzH
15580 movq [esp + i3330_dxH], mm2
15581 movq [esp + i3330_dyH], mm3
15582 movq [esp + i3330_dzH], mm4
15583 pfmul mm2,mm2
15584 pfmul mm3,mm3
15585 pfmul mm4,mm4
15587 pfadd mm3,mm2
15588 pfadd mm3,mm4 ;# mm3=rsqH
15589 movq [esp + i3330_tmprsqH], mm3
15591 pfrsqrt mm1,mm0
15593 movq mm2,mm1
15594 pfmul mm1,mm1
15595 pfrsqit1 mm1,mm0
15596 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
15597 pfmul mm0, mm1 ;# mm0=r
15599 pfmul mm0, [esp + i3330_tsc]
15600 pf2iw mm4, mm0
15601 movd [esp + i3330_n1], mm4
15602 pi2fd mm4,mm4
15603 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
15604 movq mm2, mm0
15605 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
15607 ;# coulomb table
15608 mov edx, [ebp + i3330_VFtab]
15609 mov ecx, [esp + i3330_n1]
15610 lea ecx, [ecx + ecx*2]
15611 shl ecx, 2
15613 ;# load all values we need
15614 movd mm4, [edx + ecx*4]
15615 movd mm5, [edx + ecx*4 + 4]
15616 movd mm6, [edx + ecx*4 + 8]
15617 movd mm7, [edx + ecx*4 + 12]
15619 pfmul mm6, mm0 ;# mm6 = Geps
15620 pfmul mm7, mm2 ;# mm7 = Heps2
15622 pfadd mm5, mm6
15623 pfadd mm5, mm7 ;# mm5 = Fp
15625 pfmul mm7, [esp + i3330_two] ;# two*Heps2
15626 pfadd mm7, mm6
15627 pfadd mm7, mm5 ;# mm7=FF
15629 pfmul mm5, mm0 ;# mm5=eps*Fp
15630 pfadd mm5, mm4 ;# mm5= VV
15632 pfmul mm5, [esp + i3330_qqOH] ;# vcoul=qq*VV
15633 pfmul mm7, [esp + i3330_qqOH] ;# fijC=qq*FF
15635 ;# update vctot directly, force is moved to mm3.
15636 pfadd mm5, [esp + i3330_vctot]
15637 movq [esp + i3330_vctot], mm5
15638 pxor mm3, mm3
15639 pfsub mm3, mm7
15640 pfmul mm3, [esp + i3330_tsc]
15641 pfmul mm3, mm1 ;# mm3 is total fscal (for the i oxygen) now
;# now the two i hydrogens against j H1
15643 movq mm0, [esp + i3330_tmprsqH]
15645 pfrsqrt mm1, mm0
15646 pswapd mm0,mm0
15647 pfrsqrt mm2, mm0
15648 pswapd mm0,mm0
15649 punpckldq mm1,mm2 ;# seeds are in mm1 now, and rsq in mm0.
15651 movq mm2, mm1
15652 pfmul mm1,mm1
15653 pfrsqit1 mm1,mm0
15654 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
15656 pfmul mm0,mm1 ;# mm0=r
15657 pfmul mm0, [esp + i3330_tsc]
15658 pf2iw mm4, mm0
15659 movq [esp + i3330_n1], mm4
15660 pi2fd mm4,mm4
15661 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
15662 movq mm2, mm0
15663 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
15665 ;# coulomb table
15666 mov edx, [ebp + i3330_VFtab]
15667 mov ecx, [esp + i3330_n1]
15668 lea ecx, [ecx + ecx*2]
15669 shl ecx, 2
15670 ;# load all values we need
15671 movd mm4, [edx + ecx*4]
15672 movd mm5, [edx + ecx*4 + 4]
15673 movd mm6, [edx + ecx*4 + 8]
15674 movd mm7, [edx + ecx*4 + 12]
15675 mov ecx, [esp + i3330_n1 + 4] ;# table index for the second hydrogen
15676 lea ecx, [ecx + ecx*2]
15677 shl ecx, 2
15678 punpckldq mm4, [edx + ecx*4]
15679 punpckldq mm5, [edx + ecx*4 + 4]
15680 punpckldq mm6, [edx + ecx*4 + 8]
15681 punpckldq mm7, [edx + ecx*4 + 12]
15684 pfmul mm6, mm0 ;# mm6 = Geps
15685 pfmul mm7, mm2 ;# mm7 = Heps2
15687 pfadd mm5, mm6
15688 pfadd mm5, mm7 ;# mm5 = Fp
15690 pfmul mm7, [esp + i3330_two] ;# two*Heps2
15691 pfadd mm7, mm6
15692 pfadd mm7, mm5 ;# mm7=FF
15694 pfmul mm5, mm0 ;# mm5=eps*Fp
15695 pfadd mm5, mm4 ;# mm5= VV
15697 pfmul mm5, [esp + i3330_qqHH] ;# vcoul=qq*VV
15698 pfmul mm7, [esp + i3330_qqHH] ;# fijC=qq*FF
15699 ;# update vctot
15700 pfadd mm5, [esp + i3330_vctot]
15701 movq [esp + i3330_vctot], mm5
15703 ;# change sign of fijC and multiply by rinv
15704 pxor mm4,mm4
15705 pfsub mm4, mm7
15706 pfmul mm4, [esp + i3330_tsc]
15707 pfmul mm4, mm1 ;# mm4 is total fscal (for the hydrogens) now
15709 ;# spread oxygen fscalar to both positions
15710 punpckldq mm3,mm3
15711 ;# calc vectorial force for O
15712 movq mm0, [esp + i3330_dxO]
15713 movd mm1, [esp + i3330_dzO]
15714 pfmul mm0, mm3
15715 pfmul mm1, mm3
15717 ;# calc vectorial force for H's
15718 movq mm5, [esp + i3330_dxH]
15719 movq mm6, [esp + i3330_dyH]
15720 movq mm7, [esp + i3330_dzH]
15721 pfmul mm5, mm4
15722 pfmul mm6, mm4
15723 pfmul mm7, mm4
15725 ;# update iO particle force
15726 movq mm2, [esp + i3330_fixO]
15727 movd mm3, [esp + i3330_fizO]
15728 pfadd mm2, mm0
15729 pfadd mm3, mm1
15730 movq [esp + i3330_fixO], mm2
15731 movd [esp + i3330_fizO], mm3
15733 ;# update iH forces
15734 movq mm2, [esp + i3330_fixH]
15735 movq mm3, [esp + i3330_fiyH]
15736 movq mm4, [esp + i3330_fizH]
15737 pfadd mm2, mm5
15738 pfadd mm3, mm6
15739 pfadd mm4, mm7
15740 movq [esp + i3330_fixH], mm2
15741 movq [esp + i3330_fiyH], mm3
15742 movq [esp + i3330_fizH], mm4
15744 ;# pack j forces from H in the same form as the oxygen force.
15745 pfacc mm5, mm6 ;# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
15746 pfacc mm7, mm7 ;# mm7(l)=fjz(H1+ h2)
15748 pfadd mm0, mm5 ;# add up total force on j particle.
15749 pfadd mm1, mm7
15751 ;# update j particle force
15752 movq mm2, [edi + eax*4 + 12]
15753 movd mm3, [edi + eax*4 + 20]
15754 pfsub mm2, mm0
15755 pfsub mm3, mm1
15756 movq [edi + eax*4 + 12], mm2
15757 movd [edi + eax*4 + 20], mm3
15759 ;# interactions with j H2
15760 movq mm0, [esi + eax*4 + 24]
15761 movd mm1, [esi + eax*4 + 32]
15762 ;# copy & expand to mm2-mm4 for the H interactions
15763 movq mm2, mm0
15764 movq mm3, mm0
15765 movq mm4, mm1
15766 punpckldq mm2,mm2
15767 punpckhdq mm3,mm3
15768 punpckldq mm4,mm4
15770 pfsubr mm0, [esp + i3330_ixO]
15771 pfsubr mm1, [esp + i3330_izO]
15773 movq [esp + i3330_dxO], mm0
15774 pfmul mm0,mm0
15775 movd [esp + i3330_dzO], mm1
15776 pfmul mm1,mm1
;# only the low half of mm0 is meaningful after the next two instructions
15777 pfacc mm0, mm1
15778 pfadd mm0, mm1 ;# mm0=rsqO
15780 punpckldq mm2, mm2
15781 punpckldq mm3, mm3
15782 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
15783 pfsubr mm2, [esp + i3330_ixH]
15784 pfsubr mm3, [esp + i3330_iyH]
15785 pfsubr mm4, [esp + i3330_izH] ;# mm2-mm4 is dxH-dzH
15787 movq [esp + i3330_dxH], mm2
15788 movq [esp + i3330_dyH], mm3
15789 movq [esp + i3330_dzH], mm4
15790 pfmul mm2,mm2
15791 pfmul mm3,mm3
15792 pfmul mm4,mm4
15794 pfadd mm3,mm2
15795 pfadd mm3,mm4 ;# mm3=rsqH
15796 movq [esp + i3330_tmprsqH], mm3
15798 pfrsqrt mm1,mm0
15800 movq mm2,mm1
15801 pfmul mm1,mm1
15802 pfrsqit1 mm1,mm0
15803 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
15804 pfmul mm0, mm1 ;# mm0=r
15806 pfmul mm0, [esp + i3330_tsc]
15807 pf2iw mm4, mm0
15808 movd [esp + i3330_n1], mm4
15809 pi2fd mm4,mm4
15810 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
15811 movq mm2, mm0
15812 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
15814 ;# coulomb table
15815 mov edx, [ebp + i3330_VFtab]
15816 mov ecx, [esp + i3330_n1]
15817 lea ecx, [ecx + ecx*2]
15818 shl ecx, 2
15820 ;# load all values we need
15821 movd mm4, [edx + ecx*4]
15822 movd mm5, [edx + ecx*4 + 4]
15823 movd mm6, [edx + ecx*4 + 8]
15824 movd mm7, [edx + ecx*4 + 12]
15826 pfmul mm6, mm0 ;# mm6 = Geps
15827 pfmul mm7, mm2 ;# mm7 = Heps2
15829 pfadd mm5, mm6
15830 pfadd mm5, mm7 ;# mm5 = Fp
15832 pfmul mm7, [esp + i3330_two] ;# two*Heps2
15833 pfadd mm7, mm6
15834 pfadd mm7, mm5 ;# mm7=FF
15836 pfmul mm5, mm0 ;# mm5=eps*Fp
15837 pfadd mm5, mm4 ;# mm5= VV
15839 pfmul mm5, [esp + i3330_qqOH] ;# vcoul=qq*VV
15840 pfmul mm7, [esp + i3330_qqOH] ;# fijC=qq*FF
15842 ;# update vctot directly, use mm3 for fscal sum
15843 pfadd mm5, [esp + i3330_vctot]
15844 movq [esp + i3330_vctot], mm5
15845 pxor mm3,mm3
15846 pfsub mm3, mm7
15847 pfmul mm3, [esp + i3330_tsc]
15848 pfmul mm3, mm1 ;# mm3 is total fscal (for the i oxygen) now
;# now the two i hydrogens against j H2
15850 movq mm0, [esp + i3330_tmprsqH]
15852 pfrsqrt mm1, mm0
15853 pswapd mm0,mm0
15854 pfrsqrt mm2, mm0
15855 pswapd mm0,mm0
15856 punpckldq mm1,mm2 ;# seeds are in mm1 now, and rsq in mm0.
15858 movq mm2, mm1
15859 pfmul mm1,mm1
15860 pfrsqit1 mm1,mm0
15861 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
15863 pfmul mm0,mm1 ;# mm0=r
15864 pfmul mm0, [esp + i3330_tsc]
15865 pf2iw mm4, mm0
15866 movq [esp + i3330_n1], mm4
15867 pi2fd mm4,mm4
15868 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
15869 movq mm2, mm0
15870 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
15872 ;# coulomb table
15873 mov edx, [ebp + i3330_VFtab]
15874 mov ecx, [esp + i3330_n1]
15875 lea ecx, [ecx + ecx*2]
15876 shl ecx, 2
15877 ;# load all values we need
15878 movd mm4, [edx + ecx*4]
15879 movd mm5, [edx + ecx*4 + 4]
15880 movd mm6, [edx + ecx*4 + 8]
15881 movd mm7, [edx + ecx*4 + 12]
15882 mov ecx, [esp + i3330_n1 + 4];# table index for the second hydrogen
15883 lea ecx, [ecx + ecx*2]
15884 shl ecx, 2
15885 punpckldq mm4, [edx + ecx*4]
15886 punpckldq mm5, [edx + ecx*4 + 4]
15887 punpckldq mm6, [edx + ecx*4 + 8]
15888 punpckldq mm7, [edx + ecx*4 + 12]
15891 pfmul mm6, mm0 ;# mm6 = Geps
15892 pfmul mm7, mm2 ;# mm7 = Heps2
15894 pfadd mm5, mm6
15895 pfadd mm5, mm7 ;# mm5 = Fp
15897 pfmul mm7, [esp + i3330_two] ;# two*Heps2
15898 pfadd mm7, mm6
15899 pfadd mm7, mm5 ;# mm7=FF
15901 pfmul mm5, mm0 ;# mm5=eps*Fp
15902 pfadd mm5, mm4 ;# mm5= VV
15904 pfmul mm5, [esp + i3330_qqHH] ;# vcoul=qq*VV
15905 pfmul mm7, [esp + i3330_qqHH] ;# fijC=qq*FF
15906 ;# update vctot
15907 pfadd mm5, [esp + i3330_vctot]
15908 movq [esp + i3330_vctot], mm5
15910 ;# change sign of fijC and multiply by rinv
15911 pxor mm4,mm4
15912 pfsub mm4, mm7
15913 pfmul mm4, [esp + i3330_tsc]
15914 pfmul mm4, mm1 ;# mm4 is total fscal (for the hydrogens) now
15916 ;# spread oxygen fscalar to both positions
15917 punpckldq mm3,mm3
15918 ;# calc vectorial force for O
15919 movq mm0, [esp + i3330_dxO]
15920 movd mm1, [esp + i3330_dzO]
15921 pfmul mm0, mm3
15922 pfmul mm1, mm3
15924 ;# calc vectorial force for H's
15925 movq mm5, [esp + i3330_dxH]
15926 movq mm6, [esp + i3330_dyH]
15927 movq mm7, [esp + i3330_dzH]
15928 pfmul mm5, mm4
15929 pfmul mm6, mm4
15930 pfmul mm7, mm4
15932 ;# update iO particle force
15933 movq mm2, [esp + i3330_fixO]
15934 movd mm3, [esp + i3330_fizO]
15935 pfadd mm2, mm0
15936 pfadd mm3, mm1
15937 movq [esp + i3330_fixO], mm2
15938 movd [esp + i3330_fizO], mm3
15940 ;# update iH forces
15941 movq mm2, [esp + i3330_fixH]
15942 movq mm3, [esp + i3330_fiyH]
15943 movq mm4, [esp + i3330_fizH]
15944 pfadd mm2, mm5
15945 pfadd mm3, mm6
15946 pfadd mm4, mm7
15947 movq [esp + i3330_fixH], mm2
15948 movq [esp + i3330_fiyH], mm3
15949 movq [esp + i3330_fizH], mm4
15951 ;# pack j forces from H in the same form as the oxygen force.
15952 pfacc mm5, mm6 ;# mm5(l)=fjx(H1+ h2) mm5(h)=fjy(H1+ h2)
15953 pfacc mm7, mm7 ;# mm7(l)=fjz(H1+ h2)
15955 pfadd mm0, mm5 ;# add up total force on j particle.
15956 pfadd mm1, mm7
15958 ;# update j particle force
15959 movq mm2, [edi + eax*4 + 24]
15960 movd mm3, [edi + eax*4 + 32]
15961 pfsub mm2, mm0
15962 pfsub mm3, mm1
15963 movq [edi + eax*4 + 24], mm2
15964 movd [edi + eax*4 + 32], mm3
15966 ;# done - one more?
15967 dec dword ptr [esp + i3330_innerk]
15968 jz .i3330_updateouterdata
15969 jmp .i3330_inner_loop
;# ---- inner loop finished: flush the i accumulators to memory ----
15970 .i3330_updateouterdata:
15971 mov ecx, [esp + i3330_ii3]
15973 movq mm6, [edi + ecx*4] ;# increment iO force
15974 movd mm7, [edi + ecx*4 + 8]
15975 pfadd mm6, [esp + i3330_fixO]
15976 pfadd mm7, [esp + i3330_fizO]
15977 movq [edi + ecx*4], mm6
15978 movd [edi + ecx*4 +8], mm7
;# unpack the packed (H1,H2) accumulators into per-atom x/y and z values
15980 movq mm0, [esp + i3330_fixH]
15981 movq mm3, [esp + i3330_fiyH]
15982 movq mm1, [esp + i3330_fizH]
15983 movq mm2, mm0
15984 punpckldq mm0, mm3 ;# mm0(l)=fxH1, mm0(h)=fyH1
15985 punpckhdq mm2, mm3 ;# mm2(l)=fxH2, mm2(h)=fyH2
15986 movq mm3, mm1
15987 pswapd mm3,mm3
15988 ;# mm1 is fzH1
15989 ;# mm3 is fzH2
15991 movq mm6, [edi + ecx*4 + 12] ;# increment iH1 force
15992 movd mm7, [edi + ecx*4 + 20]
15993 pfadd mm6, mm0
15994 pfadd mm7, mm1
15995 movq [edi + ecx*4 + 12], mm6
15996 movd [edi + ecx*4 + 20], mm7
15998 movq mm6, [edi + ecx*4 + 24] ;# increment iH2 force
15999 movd mm7, [edi + ecx*4 + 32]
16000 pfadd mm6, mm2
16001 pfadd mm7, mm3
16002 movq [edi + ecx*4 + 24], mm6
16003 movd [edi + ecx*4 + 32], mm7
16006 mov ebx, [ebp + i3330_fshift] ;# increment fshift force
16007 mov edx, [esp + i3330_is3]
;# fshift[is3] receives the sum of the O, H1 and H2 forces
16009 movq mm6, [ebx + edx*4]
16010 movd mm7, [ebx + edx*4 + 8]
16011 pfadd mm6, [esp + i3330_fixO]
16012 pfadd mm7, [esp + i3330_fizO]
16013 pfadd mm6, mm0
16014 pfadd mm7, mm1
16015 pfadd mm6, mm2
16016 pfadd mm7, mm3
16017 movq [ebx + edx*4], mm6
16018 movd [ebx + edx*4 + 8], mm7
16020 mov edx, [ebp + i3330_gid] ;# get group index for this i particle
16021 mov edx, [edx]
16022 add dword ptr [ebp + i3330_gid], 4 ;# advance pointer
16024 movq mm7, [esp + i3330_vctot]
16025 pfacc mm7,mm7 ;# get and sum the two parts of total potential
16027 mov eax, [ebp + i3330_Vc]
16028 movd mm6, [eax + edx*4]
16029 pfadd mm6, mm7
16030 movd [eax + edx*4], mm6 ;# increment vc[gid]
16032 movq mm7, [esp + i3330_vnbtot]
16033 pfacc mm7,mm7 ;# get and sum the two parts of total potential
16035 mov eax, [ebp + i3330_Vnb]
16036 movd mm6, [eax + edx*4]
16037 pfadd mm6, mm7
16038 movd [eax + edx*4], mm6 ;# increment vnb[gid]
16039 ;# finish if last
16040 dec dword ptr [ebp + i3330_nri]
16041 jz .i3330_end
16042 ;# not last, iterate once more!
16043 jmp .i3330_outer
;# epilogue: leave MMX state, release locals, restore saved registers
16044 .i3330_end:
16045 femms
16046 add esp, 212
16047 pop edi
16048 pop esi
16049 pop edx
16050 pop ecx
16051 pop ebx
16052 pop eax
16053 leave
;# NOTE(review): no ret is visible after leave in this view - confirm the
;# return instruction is present in the file, otherwise execution would
;# fall through into mcinl0100_3dnow.
;# mcinl0100_3dnow: Lennard-Jones potential-only inner loop (3DNow!).
;# For every i-particle entry n (nri of them) it sums
;#   c12*rinv^12 - c6*rinv^6
;# over the j particles jjnr[jindex[n]] .. jjnr[jindex[n+1]-1] and adds the
;# result to Vnb[gid[n]]. No force array is read or written here.
;# Arguments are read from the stack at [ebp+8] .. [ebp+52]. The argument
;# slots nri, iinr, jindex, shift and gid are used as loop cursors and are
;# modified in place. eax, ebx, ecx, edx, esi and edi are pushed on entry and
;# popped on exit; femms is issued both on entry and before leaving.
16061 .globl mcinl0100_3dnow
16062 .globl _mcinl0100_3dnow
16063 mcinl0100_3dnow:
16064 _mcinl0100_3dnow:
;# argument offsets relative to ebp (after push ebp / mov ebp,esp)
16065 .equiv mci0100_nri, 8
16066 .equiv mci0100_iinr, 12
16067 .equiv mci0100_jindex, 16
16068 .equiv mci0100_jjnr, 20
16069 .equiv mci0100_shift, 24
16070 .equiv mci0100_shiftvec, 28
16071 .equiv mci0100_gid, 32
16072 .equiv mci0100_pos, 36
16073 .equiv mci0100_type, 40
16074 .equiv mci0100_ntype, 44
16075 .equiv mci0100_nbfp, 48
16076 .equiv mci0100_Vnb, 52
16077 ;# stack offsets for local variables
;# (ix/iy, vnbtot, c6 and c12 are 8-byte slots holding two packed floats)
16078 .equiv mci0100_is3, 0
16079 .equiv mci0100_ii3, 4
16080 .equiv mci0100_ix, 8
16081 .equiv mci0100_iy, 12
16082 .equiv mci0100_iz, 16
16083 .equiv mci0100_vnbtot, 20
16084 .equiv mci0100_c6, 28
16085 .equiv mci0100_c12, 36
16086 .equiv mci0100_ntia, 44
16087 .equiv mci0100_innerjjnr, 48
16088 .equiv mci0100_innerk, 52
16089 push ebp
16090 mov ebp,esp
16091 push eax
16092 push ebx
16093 push ecx
16094 push edx
16095 push esi
16096 push edi
16097 sub esp, 56 ;# local stack space
16098 femms
16099 ;# assume we have at least one i particle - start directly
;# (nri is only tested after the first pass, at the bottom of the outer loop)
16100 .mci0100_outer:
16101 mov eax, [ebp + mci0100_shift] ;# eax = pointer into shift[]
16102 mov ebx, [eax] ;# ebx=shift[n]
16103 add dword ptr [ebp + mci0100_shift], 4 ;# advance pointer one step
16105 lea ebx, [ebx + ebx*2] ;# ebx=3*is
16106 mov [esp + mci0100_is3],ebx ;# store is3
16108 mov eax, [ebp + mci0100_shiftvec] ;# eax = base of shiftvec[]
16110 movq mm0, [eax + ebx*4] ;# move shX/shY to mm0 and shZ to mm1.
16111 movd mm1, [eax + ebx*4 + 8]
16113 mov ecx, [ebp + mci0100_iinr] ;# ecx = pointer into iinr[]
16114 add dword ptr [ebp + mci0100_iinr], 4 ;# advance pointer
16115 mov ebx, [ecx] ;# ebx =ii
;# ntia = 2*ntype*type[ii]: first index of this i type's c6/c12 pairs in nbfp
16117 mov edx, [ebp + mci0100_type]
16118 mov edx, [edx + ebx*4]
16119 imul edx, [ebp + mci0100_ntype]
16120 shl edx, 1
16121 mov [esp + mci0100_ntia], edx
16123 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
16124 mov eax, [ebp + mci0100_pos] ;# eax = base of pos[]
16126 pfadd mm0, [eax + ebx*4] ;# ix = shX + posX (and iy too)
16127 movd mm3, [eax + ebx*4 + 8] ;# cant use direct memory add for 4 bytes (iz)
16128 mov [esp + mci0100_ii3], ebx
16129 pfadd mm1, mm3
16130 movq [esp + mci0100_ix], mm0
16131 movd [esp + mci0100_iz], mm1
16133 ;# clear total potential
16134 pxor mm7,mm7
16135 movq [esp + mci0100_vnbtot], mm7
16137 mov eax, [ebp + mci0100_jindex]
16138 mov ecx, [eax] ;# jindex[n]
16139 mov edx, [eax + 4] ;# jindex[n+1]
16140 add dword ptr [ebp + mci0100_jindex], 4
16141 sub edx, ecx ;# number of innerloop atoms
;# esi is loaded here but reloaded before its first use in both inner paths
16143 mov esi, [ebp + mci0100_pos]
16144 mov eax, [ebp + mci0100_jjnr]
16145 shl ecx, 2
16146 add eax, ecx
16147 mov [esp + mci0100_innerjjnr], eax ;# pointer to jjnr[nj0]
;# innerk = (number of j atoms) - 2; the paired loop runs while it is >= 0.
;# mov does not alter flags, so jge below still tests the result of the sub.
16148 sub edx, 2
16149 mov [esp + mci0100_innerk], edx ;# number of innerloop atoms
16150 jge .mci0100_unroll_loop
16151 jmp .mci0100_finish_inner
16152 .mci0100_unroll_loop:
16153 ;# paired innerloop starts here
;# two j particles per pass: pair 1 lives in the low float of each
;# register, pair 2 in the high float
16154 mov ecx, [esp + mci0100_innerjjnr] ;# pointer to jjnr[k]
16155 mov eax, [ecx]
16156 mov ebx, [ecx + 4] ;# eax/ebx=jnr
16157 add dword ptr [esp + mci0100_innerjjnr], 8 ;# advance pointer (unrolled 2)
16158 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
16160 mov ecx, [ebp + mci0100_type]
16161 mov edx, [ecx + eax*4] ;# type [jnr1]
16162 mov ecx, [ecx + ebx*4] ;# type [jnr2]
16164 mov esi, [ebp + mci0100_nbfp] ;# base of nbfp
16165 shl edx, 1
16166 shl ecx, 1
16167 add edx, [esp + mci0100_ntia] ;# tja = ntia + 2*type
16168 add ecx, [esp + mci0100_ntia]
;# nbfp[tja] is c6 and nbfp[tja+1] is c12, fetched together as one qword
16170 movq mm5, [esi + edx*4] ;# mm5 = 1st c6 / c12
16171 movq mm7, [esi + ecx*4] ;# mm7 = 2nd c6 / c12
16172 movq mm6,mm5
16173 punpckldq mm5,mm7 ;# mm5 = 1st c6 / 2nd c6
16174 punpckhdq mm6,mm7 ;# mm6 = 1st c12 / 2nd c12
16175 movq [esp + mci0100_c6], mm5
16176 movq [esp + mci0100_c12], mm6
16178 lea eax, [eax + eax*2] ;# replace jnr with j3
16179 lea ebx, [ebx + ebx*2]
16181 mov esi, [ebp + mci0100_pos]
16183 movq mm0, [esp + mci0100_ix]
16184 movd mm1, [esp + mci0100_iz]
16185 movq mm4, [esi + eax*4] ;# fetch first j coordinates
16186 movd mm5, [esi + eax*4 + 8]
16187 pfsubr mm4,mm0 ;# dr = ir - jr
16188 pfsubr mm5,mm1
16189 pfmul mm4,mm4 ;# square dx,dy,dz
16190 pfmul mm5,mm5
;# first pfacc gives (dx*dx+dy*dy, dz*dz); the second adds those two halves
16191 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
16192 pfacc mm4, mm5 ;# first rsq in lower mm4
16194 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
16195 movd mm7, [esi + ebx*4 + 8]
16197 pfsubr mm6,mm0 ;# dr = ir - jr
16198 pfsubr mm7,mm1
16199 pfmul mm6,mm6 ;# square dx,dy,dz
16200 pfmul mm7,mm7
16201 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
16202 pfacc mm6, mm7 ;# second rsq in lower mm6
16204 pfrcp mm0, mm4 ;# lookup reciprocal seed
16205 pfrcp mm1, mm6
16207 punpckldq mm0,mm1
16208 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs.
16209 ;# amd 3dnow N-R iteration to get full precision.
16210 pfrcpit1 mm4,mm0
16211 pfrcpit2 mm4,mm0
16212 ;# mm4 now contains invsq,
16213 ;# do potential and fscal
;# (only the potential is formed in this routine; no scalar force is computed)
16215 movq mm0, mm4
16216 pfmul mm4, mm0
16217 pfmul mm4, mm0 ;# mm4=rinvsix
16218 movq mm5, mm4
16219 pfmul mm5, mm5 ;# mm5=rinvtwelve
16221 pfmul mm5, [esp + mci0100_c12]
16222 pfmul mm4, [esp + mci0100_c6]
16223 movq mm6, mm5 ;# mm6 is vnb12-vnb6
16224 pfsub mm6, mm4
16225 ;# update vnbtot
16226 pfadd mm6, [esp + mci0100_vnbtot] ;# add the earlier value
16227 movq [esp + mci0100_vnbtot], mm6 ;# store the sum
16229 ;# should we do one more iteration?
16230 sub dword ptr [esp + mci0100_innerk], 2
16231 jl .mci0100_finish_inner
16232 jmp .mci0100_unroll_loop
16233 .mci0100_finish_inner:
;# innerk is negative here (-2 or -1 for a non-negative j count); its low
;# bit is set exactly when one unpaired j particle is left over
16234 and dword ptr [esp + mci0100_innerk], 1
16235 jnz .mci0100_single_inner
16236 jmp .mci0100_updateouterdata
16237 .mci0100_single_inner:
16238 ;# a single j particle iteration here - compare with the unrolled code for comments
16239 mov eax, [esp + mci0100_innerjjnr]
16240 mov eax, [eax] ;# eax=jnr offset
16242 mov esi, [ebp + mci0100_nbfp]
16243 mov ecx, [ebp + mci0100_type]
16244 mov edx, [ecx + eax*4] ;# type [jnr1]
16245 shl edx, 1
16246 add edx, [esp + mci0100_ntia] ;# tja = ntia + 2*type
;# movd clears the upper half of mm5, so the stored c6/c12 slots have a zero
;# high float and the unused high lane is multiplied by zero below
16247 movd mm5, [esi + edx*4] ;# mm5 = 1st c6
16248 movq [esp + mci0100_c6], mm5
16249 movd mm5, [esi + edx*4 + 4] ;# mm5 = 1st c12
16250 movq [esp + mci0100_c12], mm5
16252 mov esi, [ebp + mci0100_pos]
16253 lea eax, [eax + eax*2]
16255 movq mm0, [esp + mci0100_ix]
16256 movd mm1, [esp + mci0100_iz]
16257 movq mm4, [esi + eax*4]
16258 movd mm5, [esi + eax*4 + 8]
16259 pfsubr mm4, mm0
16260 pfsubr mm5, mm1
16261 pfmul mm4,mm4
16262 pfmul mm5,mm5
16263 pfacc mm4, mm5
16264 pfacc mm4, mm5 ;# mm4=rsq
16266 pfrcp mm0,mm4
16267 pfrcpit1 mm4,mm0
16268 pfrcpit2 mm4,mm0 ;# mm4=invsq
16269 ;# calculate potentials and scalar force
16270 movq mm0, mm4
16272 pfmul mm4, mm0
16273 pfmul mm4, mm0 ;# mm4=rinvsix
16274 movq mm5, mm4
16275 pfmul mm5, mm5 ;# mm5=rinvtwelve
16277 pfmul mm5, [esp + mci0100_c12]
16278 pfmul mm4, [esp + mci0100_c6]
16279 movq mm6, mm5 ;# mm6 is vnb12-vnb6
16280 pfsub mm6, mm4
16281 ;# update vnbtot
16282 pfadd mm6, [esp + mci0100_vnbtot] ;# add the earlier value
16283 movq [esp + mci0100_vnbtot], mm6 ;# store the sum
16285 .mci0100_updateouterdata:
16286 mov edx, [ebp + mci0100_gid] ;# get group index for this i particle
16287 mov edx, [edx]
16288 add dword ptr [ebp + mci0100_gid], 4 ;# advance pointer
16290 movq mm7, [esp + mci0100_vnbtot]
16291 pfacc mm7,mm7 ;# get and sum the two parts of total potential
16293 mov eax, [ebp + mci0100_Vnb]
16294 movd mm6, [eax + edx*4]
16295 pfadd mm6, mm7
16296 movd [eax + edx*4], mm6 ;# increment vnb[gid]
16298 ;# finish if last
;# nri is counted down in ecx; it is written back only when another pass follows
16299 mov ecx, [ebp + mci0100_nri]
16300 dec ecx
16301 jecxz .mci0100_end
16302 ;# not last, iterate once more!
16303 mov [ebp + mci0100_nri], ecx
16304 jmp .mci0100_outer
16305 .mci0100_end:
;# leave MMX state, release the 56 bytes of locals, restore saved registers
16306 femms
16307 add esp, 56
16308 pop edi
16309 pop esi
16310 pop edx
16311 pop ecx
16312 pop ebx
16313 pop eax
16314 leave
;# NOTE(review): no ret is visible after leave in this excerpt - confirm the
;# routine returns here instead of falling through into the next symbol.
;# mcinl0110_3dnow: Lennard-Jones potential-only inner loop for multi-atom
;# i entries (3DNow!). For every i entry n (nri of them) three counters are
;# read from nsatoms and split into nsvdwc, nscoul and nsvdw. Starting at
;# atom iinr[n], the first nsvdwc atoms and, after skipping nscoul atoms, the
;# next nsvdw atoms are each run against the whole j list
;# jjnr[jindex[n]] .. jjnr[jindex[n+1]-1], accumulating
;#   c12*rinv^12 - c6*rinv^6
;# into one total that is finally added to Vnb[gid[n]]. Nothing is computed
;# for the nscoul atoms and no force array is read or written here.
;# Arguments are read from the stack at [ebp+8] .. [ebp+56]. The argument
;# slots nri, iinr, jindex, shift, gid and nsatoms are used as loop cursors
;# and are modified in place. eax, ebx, ecx, edx, esi and edi are pushed on
;# entry and popped on exit; femms is issued on entry and before leaving.
16322 .globl mcinl0110_3dnow
16323 .globl _mcinl0110_3dnow
16324 mcinl0110_3dnow:
16325 _mcinl0110_3dnow:
;# argument offsets relative to ebp (after push ebp / mov ebp,esp)
16326 .equiv mci0110_nri, 8
16327 .equiv mci0110_iinr, 12
16328 .equiv mci0110_jindex, 16
16329 .equiv mci0110_jjnr, 20
16330 .equiv mci0110_shift, 24
16331 .equiv mci0110_shiftvec, 28
16332 .equiv mci0110_gid, 32
16333 .equiv mci0110_pos, 36
16334 .equiv mci0110_type, 40
16335 .equiv mci0110_ntype, 44
16336 .equiv mci0110_nbfp, 48
16337 .equiv mci0110_Vnb, 52
16338 .equiv mci0110_nsatoms, 56
16339 ;# stack offsets for local variables
;# (shX/shY, ix/iy, vnbtot, c6 and c12 are 8-byte slots of two packed floats;
;# innerjjnr0/innerk0 keep the start of the j list so that every i atom of
;# the entry can restart it)
16340 .equiv mci0110_is3, 0
16341 .equiv mci0110_ii3, 4
16342 .equiv mci0110_shX, 8
16343 .equiv mci0110_shY, 12
16344 .equiv mci0110_shZ, 16
16345 .equiv mci0110_ix, 20
16346 .equiv mci0110_iy, 24
16347 .equiv mci0110_iz, 28
16348 .equiv mci0110_vnbtot, 32
16349 .equiv mci0110_c6, 40
16350 .equiv mci0110_c12, 48
16351 .equiv mci0110_ntia, 56
16352 .equiv mci0110_innerjjnr0, 60
16353 .equiv mci0110_innerk0, 64
16354 .equiv mci0110_innerjjnr, 68
16355 .equiv mci0110_innerk, 72
16356 .equiv mci0110_nsvdwc, 76
16357 .equiv mci0110_nscoul, 80
16358 .equiv mci0110_nsvdw, 84
16359 .equiv mci0110_solnr, 88
16360 push ebp
16361 mov ebp,esp
16362 push eax
16363 push ebx
16364 push ecx
16365 push edx
16366 push esi
16367 push edi
16368 sub esp, 92 ;# local stack space
16369 femms
16371 ;# assume we have at least one i particle - start directly
;# (nri is only tested after the first pass, at the bottom of the outer loop)
16372 .mci0110_outer:
16373 mov eax, [ebp + mci0110_shift] ;# eax = pointer into shift[]
16374 mov ebx, [eax] ;# ebx=shift[n]
16375 add dword ptr [ebp + mci0110_shift], 4 ;# advance pointer one step
16377 lea ebx, [ebx + ebx*2] ;# ebx=3*is
16378 mov [esp + mci0110_is3],ebx ;# store is3
16380 mov eax, [ebp + mci0110_shiftvec] ;# eax = base of shiftvec[]
;# the shift vector is kept on the stack because it is re-added for every
;# i atom of this entry
16382 movq mm0, [eax + ebx*4] ;# move shX/shY to mm0 and shZ to mm1
16383 movd mm1, [eax + ebx*4 + 8]
16384 movq [esp + mci0110_shX], mm0
16385 movd [esp + mci0110_shZ], mm1
16387 mov ecx, [ebp + mci0110_iinr] ;# ecx = pointer into iinr[]
16388 add dword ptr [ebp + mci0110_iinr], 4 ;# advance pointer
16389 mov ebx, [ecx] ;# ebx=ii
;# read the three nsatoms counters of this entry and advance the pointer:
;# nsvdwc = nsatoms[1], nscoul = nsatoms[2]-nsatoms[1],
;# nsvdw = nsatoms[0]-nsatoms[2]
16391 mov eax, [ebp + mci0110_nsatoms]
16392 add dword ptr [ebp + mci0110_nsatoms], 12
16393 mov ecx, [eax]
16394 mov edx, [eax + 4]
16395 mov eax, [eax + 8]
16396 sub ecx, eax
16397 sub eax, edx
16399 mov [esp + mci0110_nsvdwc], edx
16400 mov [esp + mci0110_nscoul], eax
16401 mov [esp + mci0110_nsvdw], ecx
16403 ;# clear potential
16404 pxor mm7,mm7
16405 movq [esp + mci0110_vnbtot], mm7
16406 mov [esp + mci0110_solnr], ebx
16408 mov eax, [ebp + mci0110_jindex]
16409 mov ecx, [eax] ;# jindex[n]
16410 mov edx, [eax + 4] ;# jindex[n+1]
16411 add dword ptr [ebp + mci0110_jindex], 4
16412 sub edx, ecx ;# number of innerloop atoms
16413 mov eax, [ebp + mci0110_jjnr]
16414 shl ecx, 2
16415 add eax, ecx
16416 mov [esp + mci0110_innerjjnr0], eax ;# pointer to jjnr[nj0]
16418 mov [esp + mci0110_innerk0], edx ;# number of innerloop atoms
;# esi is loaded here but reloaded before its first use in every inner path
16419 mov esi, [ebp + mci0110_pos]
;# skip the first atom loop entirely when nsvdwc is zero
16421 mov ecx, [esp + mci0110_nsvdwc]
16422 cmp ecx, 0
16423 jnz .mci0110_mno_vdwc
16424 jmp .mci0110_testvdw
16425 .mci0110_mno_vdwc:
;# one pass per i atom: take the current atom index and step solnr
16426 mov ebx, [esp + mci0110_solnr]
16427 inc dword ptr [esp + mci0110_solnr]
;# ntia = 2*ntype*type[ii]: first index of this i type's c6/c12 pairs in nbfp
16429 mov edx, [ebp + mci0110_type]
16430 mov edx, [edx + ebx*4]
16431 imul edx, [ebp + mci0110_ntype]
16432 shl edx, 1
16433 mov [esp + mci0110_ntia], edx
16435 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
16436 mov eax, [ebp + mci0110_pos] ;# eax = base of pos[]
16437 mov [esp + mci0110_ii3], ebx
;# i coordinates = pos[ii3..] + shift vector
16439 movq mm0, [eax + ebx*4]
16440 movd mm1, [eax + ebx*4 + 8]
16441 pfadd mm0, [esp + mci0110_shX]
16442 pfadd mm1, [esp + mci0110_shZ]
16443 movq [esp + mci0110_ix], mm0
16444 movd [esp + mci0110_iz], mm1
;# restart the j list for this i atom; innerk = (number of j atoms) - 2.
;# mov does not alter flags, so jge below still tests the result of the sub.
16446 mov ecx, [esp + mci0110_innerjjnr0]
16447 mov [esp + mci0110_innerjjnr], ecx
16448 mov edx, [esp + mci0110_innerk0]
16449 sub edx, 2
16450 mov [esp + mci0110_innerk], edx ;# number of innerloop atoms
16451 jge .mci0110_unroll_vdwc_loop
16452 jmp .mci0110_finish_vdwc_inner
16453 .mci0110_unroll_vdwc_loop:
16454 ;# paired innerloop starts here
;# two j particles per pass: pair 1 lives in the low float of each
;# register, pair 2 in the high float
16455 mov ecx, [esp + mci0110_innerjjnr] ;# pointer to jjnr[k]
16456 mov eax, [ecx]
16457 mov ebx, [ecx + 4] ;# eax/ebx=jnr
16458 add dword ptr [esp + mci0110_innerjjnr], 8 ;# advance pointer (unrolled 2)
16459 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
16461 mov ecx, [ebp + mci0110_type]
16462 mov edx, [ecx + eax*4] ;# type [jnr1]
16463 mov ecx, [ecx + ebx*4] ;# type [jnr2]
16465 mov esi, [ebp + mci0110_nbfp] ;# base of nbfp
16466 shl edx, 1
16467 shl ecx, 1
16468 add edx, [esp + mci0110_ntia] ;# tja = ntia + 2*type
16469 add ecx, [esp + mci0110_ntia]
;# nbfp[tja] is c6 and nbfp[tja+1] is c12, fetched together as one qword
16471 movq mm5, [esi + edx*4] ;# mm5 = 1st c6 / c12
16472 movq mm7, [esi + ecx*4] ;# mm7 = 2nd c6 / c12
16473 movq mm6,mm5
16474 punpckldq mm5,mm7 ;# mm5 = 1st c6 / 2nd c6
16475 punpckhdq mm6,mm7 ;# mm6 = 1st c12 / 2nd c12
16476 movq [esp + mci0110_c6], mm5
16477 movq [esp + mci0110_c12], mm6
16479 lea eax, [eax + eax*2] ;# replace jnr with j3
16480 lea ebx, [ebx + ebx*2]
16482 mov esi, [ebp + mci0110_pos]
16484 movq mm0, [esp + mci0110_ix]
16485 movd mm1, [esp + mci0110_iz]
16486 movq mm4, [esi + eax*4] ;# fetch first j coordinates
16487 movd mm5, [esi + eax*4 + 8]
16488 pfsubr mm4,mm0 ;# dr = ir - jr
16489 pfsubr mm5,mm1
16490 pfmul mm4,mm4 ;# square dx,dy,dz
16491 pfmul mm5,mm5
;# first pfacc gives (dx*dx+dy*dy, dz*dz); the second adds those two halves
16492 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
16493 pfacc mm4, mm5 ;# first rsq in lower mm4
16495 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
16496 movd mm7, [esi + ebx*4 + 8]
16498 pfsubr mm6,mm0 ;# dr = ir - jr
16499 pfsubr mm7,mm1
16500 pfmul mm6,mm6 ;# square dx,dy,dz
16501 pfmul mm7,mm7
16502 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
16503 pfacc mm6, mm7 ;# second rsq in lower mm6
16505 pfrcp mm0, mm4 ;# lookup reciprocal seed
16506 pfrcp mm1, mm6
16508 punpckldq mm0,mm1
16509 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs
16510 ;# amd 3dnow N-R iteration to get full precision
16511 pfrcpit1 mm4,mm0
16512 pfrcpit2 mm4,mm0
16513 ;# mm4 now contains invsq,
16514 ;# do potential and fscal
;# (only the potential is formed in this routine; no scalar force is computed)
16516 movq mm0, mm4
16517 pfmul mm4, mm0
16518 pfmul mm4, mm0 ;# mm4=rinvsix
16519 movq mm5, mm4
16520 pfmul mm5, mm5 ;# mm5=rinvtwelve
16522 pfmul mm5, [esp + mci0110_c12]
16523 pfmul mm4, [esp + mci0110_c6]
16524 movq mm6, mm5 ;# mm6 is vnb12-vnb6
16525 pfsub mm6, mm4
16526 ;# update vnbtot
16527 pfadd mm6, [esp + mci0110_vnbtot] ;# add the earlier value
16528 movq [esp + mci0110_vnbtot], mm6 ;# store the sum
16530 ;# should we do one more iteration?
16531 sub dword ptr [esp + mci0110_innerk], 2
16532 jl .mci0110_finish_vdwc_inner
16533 jmp .mci0110_unroll_vdwc_loop
16534 .mci0110_finish_vdwc_inner:
;# innerk is negative here (-2 or -1 for a non-negative j count); its low
;# bit is set exactly when one unpaired j particle is left over
16535 and dword ptr [esp + mci0110_innerk], 1
16536 jnz .mci0110_single_vdwc_inner
16537 jmp .mci0110_updateouterdata_vdwc
16538 .mci0110_single_vdwc_inner:
16539 ;# a single j particle iteration here - compare with the unrolled code for comments
16540 mov eax, [esp + mci0110_innerjjnr]
16541 mov eax, [eax] ;# eax=jnr offset
16543 mov esi, [ebp + mci0110_nbfp]
16544 mov ecx, [ebp + mci0110_type]
16545 mov edx, [ecx + eax*4] ;# type [jnr1]
16546 shl edx, 1
16547 add edx, [esp + mci0110_ntia] ;# tja = ntia + 2*type
;# movd clears the upper half of mm5, so the stored c6/c12 slots have a zero
;# high float and the unused high lane is multiplied by zero below
16548 movd mm5, [esi + edx*4] ;# mm5 = 1st c6
16549 movq [esp + mci0110_c6], mm5
16550 movd mm5, [esi + edx*4 + 4] ;# mm5 = 1st c12
16551 movq [esp + mci0110_c12], mm5
16553 mov esi, [ebp + mci0110_pos]
16554 lea eax, [eax + eax*2]
16556 movq mm0, [esp + mci0110_ix]
16557 movd mm1, [esp + mci0110_iz]
16558 movq mm4, [esi + eax*4]
16559 movd mm5, [esi + eax*4 + 8]
16560 pfsubr mm4, mm0
16561 pfsubr mm5, mm1
16562 pfmul mm4,mm4
16563 pfmul mm5,mm5
16564 pfacc mm4, mm5
16565 pfacc mm4, mm5 ;# mm4=rsq
16567 pfrcp mm0,mm4
16568 pfrcpit1 mm4,mm0
16569 pfrcpit2 mm4,mm0 ;# mm4=invsq
16570 ;# calculate potentials and scalar force
16571 movq mm0, mm4
16573 pfmul mm4, mm0
16574 pfmul mm4, mm0 ;# mm4=rinvsix
16575 movq mm5, mm4
16576 pfmul mm5, mm5 ;# mm5=rinvtwelve
16578 pfmul mm5, [esp + mci0110_c12]
16579 pfmul mm4, [esp + mci0110_c6]
16580 movq mm6, mm5 ;# mm6 is vnb12-vnb6
16581 pfsub mm6, mm4
16582 ;# update vnbtot
16583 pfadd mm6, [esp + mci0110_vnbtot] ;# add the earlier value
16584 movq [esp + mci0110_vnbtot], mm6 ;# store the sum
16586 .mci0110_updateouterdata_vdwc:
16587 ;# loop back to mno
;# nsvdwc counts the i atoms still to do in this first group
16588 dec dword ptr [esp + mci0110_nsvdwc]
16589 jz .mci0110_testvdw
16590 jmp .mci0110_mno_vdwc
16591 .mci0110_testvdw:
;# step solnr past the nscoul atoms; nothing is evaluated for them here
16592 mov ebx, [esp + mci0110_nscoul]
16593 add [esp + mci0110_solnr], ebx
;# skip the second atom loop entirely when nsvdw is zero
16595 mov ecx, [esp + mci0110_nsvdw]
16596 cmp ecx, 0
16597 jnz .mci0110_mno_vdw
16598 jmp .mci0110_last_mno
16599 .mci0110_mno_vdw:
;# second group of i atoms: same per-atom setup and inner loops as above
16600 mov ebx, [esp + mci0110_solnr]
16601 inc dword ptr [esp + mci0110_solnr]
16603 mov edx, [ebp + mci0110_type]
16604 mov edx, [edx + ebx*4]
16605 imul edx, [ebp + mci0110_ntype]
16606 shl edx, 1
16607 mov [esp + mci0110_ntia], edx
16609 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
16610 mov eax, [ebp + mci0110_pos] ;# eax = base of pos[]
16611 mov [esp + mci0110_ii3], ebx
16613 movq mm0, [eax + ebx*4]
16614 movd mm1, [eax + ebx*4 + 8]
16615 pfadd mm0, [esp + mci0110_shX]
16616 pfadd mm1, [esp + mci0110_shZ]
16617 movq [esp + mci0110_ix], mm0
16618 movd [esp + mci0110_iz], mm1
16620 mov ecx, [esp + mci0110_innerjjnr0]
16621 mov [esp + mci0110_innerjjnr], ecx
16622 mov edx, [esp + mci0110_innerk0]
16623 sub edx, 2
16624 mov [esp + mci0110_innerk], edx ;# number of innerloop atoms
16625 jge .mci0110_unroll_vdw_loop
16626 jmp .mci0110_finish_vdw_inner
16627 .mci0110_unroll_vdw_loop:
16628 ;# paired innerloop starts here
16629 mov ecx, [esp + mci0110_innerjjnr] ;# pointer to jjnr[k]
16630 mov eax, [ecx]
16631 mov ebx, [ecx + 4] ;# eax/ebx=jnr
16632 add dword ptr [esp + mci0110_innerjjnr], 8 ;# advance pointer (unrolled 2)
16633 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
16635 mov ecx, [ebp + mci0110_type]
16636 mov edx, [ecx + eax*4] ;# type [jnr1]
16637 mov ecx, [ecx + ebx*4] ;# type [jnr2]
16639 mov esi, [ebp + mci0110_nbfp] ;# base of nbfp
16640 shl edx, 1
16641 shl ecx, 1
16642 add edx, [esp + mci0110_ntia] ;# tja = ntia + 2*type
16643 add ecx, [esp + mci0110_ntia]
16645 movq mm5, [esi + edx*4] ;# mm5 = 1st c6 / c12
16646 movq mm7, [esi + ecx*4] ;# mm7 = 2nd c6 / c12
16647 movq mm6,mm5
16648 punpckldq mm5,mm7 ;# mm5 = 1st c6 / 2nd c6
16649 punpckhdq mm6,mm7 ;# mm6 = 1st c12 / 2nd c12
16650 movq [esp + mci0110_c6], mm5
16651 movq [esp + mci0110_c12], mm6
16653 lea eax, [eax + eax*2] ;# replace jnr with j3
16654 lea ebx, [ebx + ebx*2]
16656 mov esi, [ebp + mci0110_pos]
16658 movq mm0, [esp + mci0110_ix]
16659 movd mm1, [esp + mci0110_iz]
16660 movq mm4, [esi + eax*4] ;# fetch first j coordinates
16661 movd mm5, [esi + eax*4 + 8]
16662 pfsubr mm4,mm0 ;# dr = ir - jr
16663 pfsubr mm5,mm1
16664 pfmul mm4,mm4 ;# square dx,dy,dz
16665 pfmul mm5,mm5
16666 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
16667 pfacc mm4, mm5 ;# first rsq in lower mm4
16669 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
16670 movd mm7, [esi + ebx*4 + 8]
16672 pfsubr mm6,mm0 ;# dr = ir - jr
16673 pfsubr mm7,mm1
16674 pfmul mm6,mm6 ;# square dx,dy,dz
16675 pfmul mm7,mm7
16676 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
16677 pfacc mm6, mm7 ;# second rsq in lower mm6
16679 pfrcp mm0, mm4 ;# lookup reciprocal seed
16680 pfrcp mm1, mm6
16682 punpckldq mm0,mm1
16683 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs
16684 ;# amd 3dnow N-R iteration to get full precision
16685 pfrcpit1 mm4,mm0
16686 pfrcpit2 mm4,mm0
16687 ;# mm4 now contains invsq,
16688 ;# do potential and fscal
16690 movq mm0, mm4
16691 pfmul mm4, mm0
16692 pfmul mm4, mm0 ;# mm4=rinvsix
16693 movq mm5, mm4
16694 pfmul mm5, mm5 ;# mm5=rinvtwelve
16696 pfmul mm5, [esp + mci0110_c12]
16697 pfmul mm4, [esp + mci0110_c6]
16698 movq mm6, mm5 ;# mm6 is vnb12-vnb6
16699 pfsub mm6, mm4
16700 ;# update vnbtot
16701 pfadd mm6, [esp + mci0110_vnbtot] ;# add the earlier value
16702 movq [esp + mci0110_vnbtot], mm6 ;# store the sum
16704 ;# should we do one more iteration?
16705 sub dword ptr [esp + mci0110_innerk], 2
16706 jl .mci0110_finish_vdw_inner
16707 jmp .mci0110_unroll_vdw_loop
16708 .mci0110_finish_vdw_inner:
;# low bit of the (negative) innerk tells whether one j particle is left
16709 and dword ptr [esp + mci0110_innerk], 1
16710 jnz .mci0110_single_vdw_inner
16711 jmp .mci0110_updateouterdata_vdw
16712 .mci0110_single_vdw_inner:
16713 ;# a single j particle iteration here - compare with the unrolled code for comments
16714 mov eax, [esp + mci0110_innerjjnr]
16715 mov eax, [eax] ;# eax=jnr offset
16717 mov esi, [ebp + mci0110_nbfp]
16718 mov ecx, [ebp + mci0110_type]
16719 mov edx, [ecx + eax*4] ;# type [jnr1]
16720 shl edx, 1
16721 add edx, [esp + mci0110_ntia] ;# tja = ntia + 2*type
16722 movd mm5, [esi + edx*4] ;# mm5 = 1st c6
16723 movq [esp + mci0110_c6], mm5
16724 movd mm5, [esi + edx*4 + 4] ;# mm5 = 1st c12
16725 movq [esp + mci0110_c12], mm5
16727 mov esi, [ebp + mci0110_pos]
16728 lea eax, [eax + eax*2]
16730 movq mm0, [esp + mci0110_ix]
16731 movd mm1, [esp + mci0110_iz]
16732 movq mm4, [esi + eax*4]
16733 movd mm5, [esi + eax*4 + 8]
16734 pfsubr mm4, mm0
16735 pfsubr mm5, mm1
16736 pfmul mm4,mm4
16737 pfmul mm5,mm5
16738 pfacc mm4, mm5
16739 pfacc mm4, mm5 ;# mm4=rsq
16741 pfrcp mm0,mm4
16742 pfrcpit1 mm4,mm0
16743 pfrcpit2 mm4,mm0 ;# mm4=invsq
16744 ;# calculate potentials and scalar force
16745 movq mm0, mm4
16747 pfmul mm4, mm0
16748 pfmul mm4, mm0 ;# mm4=rinvsix
16749 movq mm5, mm4
16750 pfmul mm5, mm5 ;# mm5=rinvtwelve
16752 pfmul mm5, [esp + mci0110_c12]
16753 pfmul mm4, [esp + mci0110_c6]
16754 movq mm6, mm5 ;# mm6 is vnb12-vnb6
16755 pfsub mm6, mm4
16756 ;# update vnbtot
16757 pfadd mm6, [esp + mci0110_vnbtot] ;# add the earlier value
16758 movq [esp + mci0110_vnbtot], mm6 ;# store the sum
16760 .mci0110_updateouterdata_vdw:
16761 ;# loop back to mno
;# nsvdw counts the i atoms still to do in this second group
16762 dec dword ptr [esp + mci0110_nsvdw]
16763 jz .mci0110_last_mno
16764 jmp .mci0110_mno_vdw
16766 .mci0110_last_mno:
;# all i atoms of this entry are done: fold the total into Vnb[gid]
16767 mov edx, [ebp + mci0110_gid] ;# get group index for this i particle
16768 mov edx, [edx]
16769 add dword ptr [ebp + mci0110_gid], 4 ;# advance pointer
16771 movq mm7, [esp + mci0110_vnbtot]
16772 pfacc mm7,mm7 ;# get and sum the two parts of total potential
16774 mov eax, [ebp + mci0110_Vnb]
16775 movd mm6, [eax + edx*4]
16776 pfadd mm6, mm7
16777 movd [eax + edx*4], mm6 ;# increment vnb[gid]
16778 ;# finish if last
;# nri is counted down in ecx; it is written back only when another pass follows
16779 mov ecx, [ebp + mci0110_nri]
16780 dec ecx
16781 jecxz .mci0110_end
16782 ;# not last, iterate once more!
16783 mov [ebp + mci0110_nri], ecx
16784 jmp .mci0110_outer
16785 .mci0110_end:
;# leave MMX state, release the 92 bytes of locals, restore saved registers
16786 femms
16787 add esp, 92
16788 pop edi
16789 pop esi
16790 pop edx
16791 pop ecx
16792 pop ebx
16793 pop eax
16794 leave
;# NOTE(review): no ret is visible after leave in this excerpt - confirm the
;# routine returns here instead of falling through into the next symbol.
16799 .globl mcinl0300_3dnow
16800 .globl _mcinl0300_3dnow
16801 mcinl0300_3dnow:
16802 _mcinl0300_3dnow:
16803 .equiv mci0300_nri, 8
16804 .equiv mci0300_iinr, 12
16805 .equiv mci0300_jindex, 16
16806 .equiv mci0300_jjnr, 20
16807 .equiv mci0300_shift, 24
16808 .equiv mci0300_shiftvec, 28
16809 .equiv mci0300_gid, 32
16810 .equiv mci0300_pos, 36
16811 .equiv mci0300_type, 40
16812 .equiv mci0300_ntype, 44
16813 .equiv mci0300_nbfp, 48
16814 .equiv mci0300_Vnb, 52
16815 .equiv mci0300_tabscale, 56
16816 .equiv mci0300_VFtab, 60
16817 ;# stack offsets for local variables
16818 .equiv mci0300_is3, 0
16819 .equiv mci0300_ii3, 4
16820 .equiv mci0300_ix, 8
16821 .equiv mci0300_iy, 12
16822 .equiv mci0300_iz, 16
16823 .equiv mci0300_vnbtot, 20
16824 .equiv mci0300_c6, 28
16825 .equiv mci0300_c12, 36
16826 .equiv mci0300_n1, 44
16827 .equiv mci0300_tsc, 52
16828 .equiv mci0300_ntia, 60
16829 .equiv mci0300_innerjjnr, 64
16830 .equiv mci0300_innerk, 68
16831 push ebp
16832 mov ebp,esp
16833 push eax
16834 push ebx
16835 push ecx
16836 push edx
16837 push esi
16838 push edi
16839 sub esp, 72 ;# local stack space
16840 femms
16841 ;# move data to local stack
16842 movd mm3, [ebp + mci0300_tabscale]
16843 punpckldq mm3,mm3
16844 movq [esp + mci0300_tsc], mm3
16845 ;# assume we have at least one i particle - start directly
16846 .mci0300_outer:
16847 mov eax, [ebp + mci0300_shift] ;# eax = pointer into shift[]
16848 mov ebx, [eax] ;# ebx=shift[n]
16849 add dword ptr [ebp + mci0300_shift], 4 ;# advance pointer one step
16851 lea ebx, [ebx + ebx*2] ;# ebx=3*is
16852 mov [esp + mci0300_is3],ebx ;# store is3
16854 mov eax, [ebp + mci0300_shiftvec] ;# eax = base of shiftvec[]
16856 movq mm0, [eax + ebx*4] ;# move shX/shY to mm0 and shZ to mm1
16857 movd mm1, [eax + ebx*4 + 8]
16859 mov ecx, [ebp + mci0300_iinr] ;# ecx = pointer into iinr[]
16860 add dword ptr [ebp + mci0300_iinr], 4 ;# advance pointer
16861 mov ebx, [ecx] ;# ebx=ii
16863 mov edx, [ebp + mci0300_type]
16864 mov edx, [edx + ebx*4]
16865 imul edx, [ebp + mci0300_ntype]
16866 shl edx, 1
16867 mov [esp + mci0300_ntia], edx
16869 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
16870 mov eax, [ebp + mci0300_pos] ;# eax = base of pos[]
16872 pfadd mm0, [eax + ebx*4] ;# ix = shX + posX (and iy too)
16873 movd mm3, [eax + ebx*4 + 8] ;# cant use direct memory add for 4 bytes (iz)
16874 mov [esp + mci0300_ii3], ebx
16875 pfadd mm1, mm3
16876 movq [esp + mci0300_ix], mm0
16877 movd [esp + mci0300_iz], mm1
16879 ;# clear total potential
16880 pxor mm7,mm7
16881 movq [esp + mci0300_vnbtot], mm7
16883 mov eax, [ebp + mci0300_jindex]
16884 mov ecx, [eax] ;# jindex[n]
16885 mov edx, [eax + 4] ;# jindex[n+1]
16886 add dword ptr [ebp + mci0300_jindex], 4
16887 sub edx, ecx ;# number of innerloop atoms
16889 mov esi, [ebp + mci0300_pos]
16890 mov eax, [ebp + mci0300_jjnr]
16891 shl ecx, 2
16892 add eax, ecx
16893 mov [esp + mci0300_innerjjnr], eax ;# pointer to jjnr[nj0]
16894 sub edx, 2
16895 mov [esp + mci0300_innerk], edx ;# number of innerloop atoms
16896 jge .mci0300_unroll_loop
16897 jmp .mci0300_finish_inner
16898 .mci0300_unroll_loop:
16899 ;# paired innerloop starts here
16900 mov ecx, [esp + mci0300_innerjjnr] ;# pointer to jjnr[k]
16901 mov eax, [ecx]
16902 mov ebx, [ecx + 4] ;# eax/ebx=jnr
16903 add dword ptr [esp + mci0300_innerjjnr], 8 ;# advance pointer (unrolled 2)
16904 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
16906 mov ecx, [ebp + mci0300_type]
16907 mov edx, [ecx + eax*4] ;# type [jnr1]
16908 mov ecx, [ecx + ebx*4] ;# type [jnr2]
16910 mov esi, [ebp + mci0300_nbfp] ;# base of nbfp
16911 shl edx, 1
16912 shl ecx, 1
16913 add edx, [esp + mci0300_ntia] ;# tja = ntia + 2*type
16914 add ecx, [esp + mci0300_ntia]
16916 movq mm5, [esi + edx*4] ;# mm5 = 1st c6 / c12
16917 movq mm7, [esi + ecx*4] ;# mm7 = 2nd c6 / c12
16918 movq mm6,mm5
16919 punpckldq mm5,mm7 ;# mm5 = 1st c6 / 2nd c6
16920 punpckhdq mm6,mm7 ;# mm6 = 1st c12 / 2nd c12
16921 movq [esp + mci0300_c6], mm5
16922 movq [esp + mci0300_c12], mm6
16924 lea eax, [eax + eax*2] ;# replace jnr with j3
16925 lea ebx, [ebx + ebx*2]
16927 mov esi, [ebp + mci0300_pos]
16929 movq mm0, [esp + mci0300_ix]
16930 movd mm1, [esp + mci0300_iz]
16931 movq mm4, [esi + eax*4] ;# fetch first j coordinates
16932 movd mm5, [esi + eax*4 + 8]
16933 pfsubr mm4,mm0 ;# dr = ir - jr
16934 pfsubr mm5,mm1
16935 pfmul mm4,mm4 ;# square dx,dy,dz
16936 pfmul mm5,mm5
16937 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
16938 pfacc mm4, mm5 ;# first rsq in lower mm4
16940 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
16941 movd mm7, [esi + ebx*4 + 8]
16943 pfsubr mm6,mm0 ;# dr = ir - jr
16944 pfsubr mm7,mm1
16945 pfmul mm6,mm6 ;# square dx,dy,dz
16946 pfmul mm7,mm7
16947 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
16948 pfacc mm6, mm7 ;# second rsq in lower mm6
16950 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
16951 pfrsqrt mm1, mm6
16954 punpckldq mm0,mm1
16955 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs
16956 movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision
16957 pfmul mm0,mm0
16958 pfrsqit1 mm0,mm4
16959 pfrcpit2 mm0,mm2
16960 pfmul mm4, mm0
16961 movq mm1, mm4
16962 ;# mm0 is invsqrt, and mm1 r
16963 ;# do potential and fscal
16964 pfmul mm1, [esp + mci0300_tsc] ;# mm1=rt
16965 pf2iw mm4,mm1
16966 movq [esp + mci0300_n1], mm4
16967 pi2fd mm4,mm4
16968 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
16970 movq mm2,mm1
16971 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
16973 mov edx, [ebp + mci0300_VFtab]
16974 ;# dispersion table
16975 mov ecx, [esp + mci0300_n1]
16976 shl ecx, 3
16977 ;# load all the table values we need
16978 movd mm4, [edx + ecx*4]
16979 movd mm5, [edx + ecx*4 + 4]
16980 movd mm6, [edx + ecx*4 + 8]
16981 movd mm7, [edx + ecx*4 + 12]
16982 mov ecx, [esp + mci0300_n1 + 4]
16983 shl ecx, 3
16984 punpckldq mm4, [edx + ecx*4]
16985 punpckldq mm5, [edx + ecx*4 + 4]
16986 punpckldq mm6, [edx + ecx*4 + 8]
16987 punpckldq mm7, [edx + ecx*4 + 12]
16988 pfmul mm6, mm1 ;# mm6 = Geps
16989 pfmul mm7, mm2 ;# mm7 = Heps2
16990 pfadd mm5, mm6
16991 pfadd mm5, mm7 ;# mm5 = Fp
16992 pfmul mm5, mm1 ;# mm5=eps*Fp
16993 pfadd mm5, mm4 ;# mm5= VV
16995 movq mm4, [esp + mci0300_c6]
16996 pfmul mm5, mm4 ;# vnb6
16997 ;# update vnbtot to release mm5!
16998 pfadd mm5, [esp + mci0300_vnbtot] ;# add the earlier value
16999 movq [esp + mci0300_vnbtot], mm5 ;# store the sum
17001 ;# repulsion table
17002 mov ecx, [esp + mci0300_n1]
17003 shl ecx, 3
17004 ;# load all the table values we need
17005 movd mm4, [edx + ecx*4 + 16]
17006 movd mm5, [edx + ecx*4 + 20]
17007 movd mm6, [edx + ecx*4 + 24]
17008 movd mm7, [edx + ecx*4 + 28]
17009 mov ecx, [esp + mci0300_n1 + 4]
17010 shl ecx, 3
17011 punpckldq mm4, [edx + ecx*4 + 16]
17012 punpckldq mm5, [edx + ecx*4 + 20]
17013 punpckldq mm6, [edx + ecx*4 + 24]
17014 punpckldq mm7, [edx + ecx*4 + 28]
17016 pfmul mm6, mm1 ;# mm6 = Geps
17017 pfmul mm7, mm2 ;# mm7 = Heps2
17018 pfadd mm5, mm6
17019 pfadd mm5, mm7 ;# mm5 = Fp
17020 pfmul mm5, mm1 ;# mm5=eps*Fp
17021 pfadd mm5, mm4 ;# mm5= VV
17023 movq mm6, [esp + mci0300_c12]
17024 pfmul mm5, mm6 ;# vnb12
17025 ;# update vnbtot
17026 pfadd mm5, [esp + mci0300_vnbtot] ;# add the earlier value
17027 movq [esp + mci0300_vnbtot], mm5 ;# store the sum
17029 ;# should we do one more iteration?
17030 sub dword ptr [esp + mci0300_innerk], 2
17031 jl .mci0300_finish_inner
17032 jmp .mci0300_unroll_loop
17033 .mci0300_finish_inner:
17034 and dword ptr [esp + mci0300_innerk], 1
17035 jnz .mci0300_single_inner
17036 jmp .mci0300_updateouterdata
17037 .mci0300_single_inner:
17038 ;# a single j particle iteration here - compare with the unrolled code for comments
17039 mov eax, [esp + mci0300_innerjjnr]
17040 mov eax, [eax] ;# eax=jnr offset
17042 mov esi, [ebp + mci0300_nbfp]
17043 mov ecx, [ebp + mci0300_type]
17044 mov edx, [ecx + eax*4] ;# type [jnr1]
17045 shl edx, 1
17046 add edx, [esp + mci0300_ntia] ;# tja = ntia + 2*type
17047 movd mm5, [esi + edx*4] ;# mm5 = 1st c6
17048 movq [esp + mci0300_c6], mm5
17049 movd mm5, [esi + edx*4 + 4] ;# mm5 = 1st c12
17050 movq [esp + mci0300_c12], mm5
17052 mov esi, [ebp + mci0300_pos]
17053 lea eax, [eax + eax*2]
17055 movq mm0, [esp + mci0300_ix]
17056 movd mm1, [esp + mci0300_iz]
17057 movq mm4, [esi + eax*4]
17058 movd mm5, [esi + eax*4 + 8]
17059 pfsubr mm4, mm0
17060 pfsubr mm5, mm1
17061 pfmul mm4,mm4
17062 pfmul mm5,mm5
17063 pfacc mm4, mm5
17064 pfacc mm4, mm5 ;# mm0=rsq
17066 pfrsqrt mm0,mm4
17067 movq mm2,mm0
17068 pfmul mm0,mm0
17069 pfrsqit1 mm0,mm4
17070 pfrcpit2 mm0,mm2 ;# mm1=invsqrt
17071 pfmul mm4, mm0
17072 movq mm1, mm4
17073 ;# mm0 is invsqrt, and mm1 r
17075 ;# calculate potentials and scalar force
17076 pfmul mm1, [esp + mci0300_tsc] ;# mm1=rt
17077 pf2iw mm4,mm1
17078 movd [esp + mci0300_n1], mm4
17079 pi2fd mm4,mm4
17080 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
17082 movq mm2,mm1
17083 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
17085 mov edx, [ebp + mci0300_VFtab]
17086 mov ecx, [esp + mci0300_n1]
17087 shl ecx, 3
17088 ;# dispersion table
17089 ;# load all the table values we need
17090 movd mm4, [edx + ecx*4]
17091 movd mm5, [edx + ecx*4 + 4]
17092 movd mm6, [edx + ecx*4 + 8]
17093 movd mm7, [edx + ecx*4 + 12]
17094 pfmul mm6, mm1 ;# mm6 = Geps
17095 pfmul mm7, mm2 ;# mm7 = Heps2
17096 pfadd mm5, mm6
17097 pfadd mm5, mm7 ;# mm5 = Fp
17098 pfmul mm5, mm1 ;# mm5=eps*Fp
17099 pfadd mm5, mm4 ;# mm5= VV
17101 movq mm4, [esp + mci0300_c6]
17102 pfmul mm5, mm4 ;# vnb6
17103 ;# update vnbtot to release mm5!
17104 pfadd mm5, [esp + mci0300_vnbtot] ;# add the earlier value
17105 movq [esp + mci0300_vnbtot], mm5 ;# store the sum
17107 ;# repulsion table
17108 ;# load all the table values we need
17109 movd mm4, [edx + ecx*4 + 16]
17110 movd mm5, [edx + ecx*4 + 20]
17111 movd mm6, [edx + ecx*4 + 24]
17112 movd mm7, [edx + ecx*4 + 28]
17114 pfmul mm6, mm1 ;# mm6 = Geps
17115 pfmul mm7, mm2 ;# mm7 = Heps2
17116 pfadd mm5, mm6
17117 pfadd mm5, mm7 ;# mm5 = Fp
17118 pfmul mm5, mm1 ;# mm5=eps*Fp
17119 pfadd mm5, mm4 ;# mm5= VV
17121 movq mm6, [esp + mci0300_c12]
17122 pfmul mm5, mm6 ;# vnb12
17123 ;# update vnbtot
17124 pfadd mm5, [esp + mci0300_vnbtot] ;# add the earlier value
17125 movq [esp + mci0300_vnbtot], mm5 ;# store the sum
17127 .mci0300_updateouterdata:
17128 mov edx, [ebp + mci0300_gid] ;# get group index for this i particle
17129 mov edx, [edx]
17130 add dword ptr [ebp + mci0300_gid], 4 ;# advance pointer
17132 movq mm7, [esp + mci0300_vnbtot]
17133 pfacc mm7,mm7 ;# get and sum the two parts of total potential
17135 mov eax, [ebp + mci0300_Vnb]
17136 movd mm6, [eax + edx*4]
17137 pfadd mm6, mm7
17138 movd [eax + edx*4], mm6 ;# increment vnb[gid]
17140 ;# finish if last
17141 mov ecx, [ebp + mci0300_nri]
17142 dec ecx
17143 jecxz .mci0300_end
17144 ;# not last, iterate once more!
17145 mov [ebp + mci0300_nri], ecx
17146 jmp .mci0300_outer
17147 .mci0300_end:
17148 femms
17149 add esp, 72
17150 pop edi
17151 pop esi
17152 pop edx
17153 pop ecx
17154 pop ebx
17155 pop eax
17156 leave
;# ---------------------------------------------------------------------
;# mcinl0310_3dnow / _mcinl0310_3dnow
;#
;# Potential-only inner loop with tabulated dispersion and repulsion.
;# For every i entry (nri of them) the routine walks the atoms selected
;# through nsatoms[], and for each of them runs over the j atoms listed
;# in jjnr[jindex[n] .. jindex[n+1]-1], accumulating
;#     c6*VV_dispersion(r) + c12*VV_repulsion(r)
;# into vnbtot, which is finally added to Vnb[gid[n]].
;# The only stores are to the local stack frame and to Vnb[];
;# no forces are computed here.
;#
;# Arguments are read from the stack relative to ebp (first one at
;# [ebp + 8]).  The pointer arguments shift, iinr, nsatoms, jindex and
;# gid are advanced in place and nri is counted down in place.
;# nsatoms holds three integers per i entry; they are used as
;#     nsvdwc = nsatoms[1]
;#     nscoul = nsatoms[2] - nsatoms[1]   (atoms skipped by this loop)
;#     nsvdw  = nsatoms[0] - nsatoms[2]
;# VFtab is indexed with 8 floats per table point: the four dispersion
;# coefficients at byte offsets 0..12, the four repulsion ones at 16..28.
;#
;# eax, ebx, ecx, edx, esi and edi are saved on entry and restored on
;# exit.  The MMX/3DNow state is cleared with femms on entry and exit.
;# The code assumes nri >= 1 (the outer loop is entered unconditionally).
;# ---------------------------------------------------------------------
.globl mcinl0310_3dnow
.globl _mcinl0310_3dnow
mcinl0310_3dnow:
_mcinl0310_3dnow:
	;# offsets of the arguments relative to ebp
.equiv		mci0310_nri,		8
.equiv		mci0310_iinr,		12
.equiv		mci0310_jindex,		16
.equiv		mci0310_jjnr,		20
.equiv		mci0310_shift,		24
.equiv		mci0310_shiftvec,	28
.equiv		mci0310_gid,		32
.equiv		mci0310_pos,		36
.equiv		mci0310_type,		40
.equiv		mci0310_ntype,		44
.equiv		mci0310_nbfp,		48
.equiv		mci0310_Vnb,		52
.equiv		mci0310_tabscale,	56
.equiv		mci0310_VFtab,		60
.equiv		mci0310_nsatoms,	64
	;# stack offsets for local variables
.equiv		mci0310_is3,		0
.equiv		mci0310_ii3,		4
.equiv		mci0310_shX,		8
.equiv		mci0310_shY,		12
.equiv		mci0310_shZ,		16
.equiv		mci0310_ix,		20
.equiv		mci0310_iy,		24
.equiv		mci0310_iz,		28
.equiv		mci0310_vnbtot,		32
.equiv		mci0310_c6,		40
.equiv		mci0310_c12,		48
.equiv		mci0310_n1,		56
.equiv		mci0310_tsc,		64
.equiv		mci0310_ntia,		72
.equiv		mci0310_innerjjnr0,	76
.equiv		mci0310_innerk0,	80
.equiv		mci0310_innerjjnr,	84
.equiv		mci0310_innerk,		88
.equiv		mci0310_nsvdwc,		92
.equiv		mci0310_nscoul,		96
.equiv		mci0310_nsvdw,		100
.equiv		mci0310_solnr,		104
	push ebp
	mov ebp,esp
	push eax
	push ebx
	push ecx
	push edx
	push esi
	push edi
	sub esp, 108			;# local stack space (last local, solnr, ends at 108)
	femms
	movd mm3, [ebp + mci0310_tabscale]
	punpckldq mm3,mm3		;# tabscale in both halves
	movq [esp + mci0310_tsc], mm3

	;# assume we have at least one i particle - start directly
.mci0310_outer:
	mov eax, [ebp + mci0310_shift]	;# eax = pointer into shift[]
	mov ebx, [eax]			;# ebx=shift[n]
	add dword ptr [ebp + mci0310_shift], 4	;# advance pointer one step

	lea ebx, [ebx + ebx*2]		;# ebx=3*is
	mov [esp + mci0310_is3],ebx	;# store is3

	mov eax, [ebp + mci0310_shiftvec]	;# eax = base of shiftvec[]

	movq mm0, [eax + ebx*4]		;# move shX/shY to mm0 and shZ to mm1
	movd mm1, [eax + ebx*4 + 8]
	movq [esp + mci0310_shX], mm0
	movd [esp + mci0310_shZ], mm1

	mov ecx, [ebp + mci0310_iinr]	;# ecx = pointer into iinr[]
	add dword ptr [ebp + mci0310_iinr], 4	;# advance pointer
	mov ebx, [ecx]			;# ebx=ii

	;# split the three nsatoms entries into the three per-class counters
	mov eax, [ebp + mci0310_nsatoms]
	add dword ptr [ebp + mci0310_nsatoms], 12	;# three ints per i entry
	mov ecx, [eax]
	mov edx, [eax + 4]
	mov eax, [eax + 8]
	sub ecx, eax			;# ecx = nsatoms[0] - nsatoms[2]
	sub eax, edx			;# eax = nsatoms[2] - nsatoms[1]

	mov [esp + mci0310_nsvdwc], edx
	mov [esp + mci0310_nscoul], eax
	mov [esp + mci0310_nsvdw], ecx

	;# clear potential
	pxor mm7,mm7
	movq [esp + mci0310_vnbtot], mm7
	mov [esp + mci0310_solnr], ebx	;# first atom to process is ii

	mov eax, [ebp + mci0310_jindex]
	mov ecx, [eax]			;# jindex[n]
	mov edx, [eax + 4]		;# jindex[n+1]
	add dword ptr [ebp + mci0310_jindex], 4
	sub edx, ecx			;# number of innerloop atoms
	mov eax, [ebp + mci0310_jjnr]
	shl ecx, 2
	add eax, ecx
	mov [esp + mci0310_innerjjnr0], eax	;# pointer to jjnr[nj0]

	mov [esp + mci0310_innerk0], edx	;# number of innerloop atoms
	mov esi, [ebp + mci0310_pos]

	mov ecx, [esp + mci0310_nsvdwc]
	cmp ecx, 0
	jnz .mci0310_mno_vdwc
	jmp .mci0310_testvdw

	;# ---- first class of atoms (nsvdwc of them) ----
.mci0310_mno_vdwc:
	mov ebx, [esp + mci0310_solnr]
	inc dword ptr [esp + mci0310_solnr]

	mov edx, [ebp + mci0310_type]
	mov edx, [edx + ebx*4]
	imul edx, [ebp + mci0310_ntype]
	shl edx, 1
	mov [esp + mci0310_ntia], edx	;# ntia = 2*ntype*type[atom]

	lea ebx, [ebx + ebx*2]		;# ebx = 3*ii=ii3
	mov eax, [ebp + mci0310_pos]	;# eax = base of pos[]
	mov [esp + mci0310_ii3], ebx

	movq mm0, [eax + ebx*4]
	movd mm1, [eax + ebx*4 + 8]
	pfadd mm0, [esp + mci0310_shX]
	pfadd mm1, [esp + mci0310_shZ]	;# 8-byte read; only the low half is stored below
	movq [esp + mci0310_ix], mm0
	movd [esp + mci0310_iz], mm1

	;# restart the neighbour list for this atom
	mov ecx, [esp + mci0310_innerjjnr0]
	mov [esp + mci0310_innerjjnr], ecx
	mov edx, [esp + mci0310_innerk0]
	sub edx, 2
	mov [esp + mci0310_innerk], edx	;# number of innerloop atoms
	jge .mci0310_unroll_vdwc_loop
	jmp .mci0310_finish_vdwc_inner
.mci0310_unroll_vdwc_loop:
	;# paired innerloop starts here
	mov ecx, [esp + mci0310_innerjjnr]	;# pointer to jjnr[k]
	mov eax, [ecx]
	mov ebx, [ecx + 4]		;# eax/ebx=jnr
	add dword ptr [esp + mci0310_innerjjnr], 8	;# advance pointer (unrolled 2)
	prefetch [ecx + 16]		;# prefetch data - trial and error says 16 is best

	mov ecx, [ebp + mci0310_type]
	mov edx, [ecx + eax*4]		;# type [jnr1]
	mov ecx, [ecx + ebx*4]		;# type [jnr2]

	mov esi, [ebp + mci0310_nbfp]	;# base of nbfp
	shl edx, 1
	shl ecx, 1
	add edx, [esp + mci0310_ntia]	;# tja = ntia + 2*type
	add ecx, [esp + mci0310_ntia]

	movq mm5, [esi + edx*4]		;# mm5 = 1st c6 / c12
	movq mm7, [esi + ecx*4]		;# mm7 = 2nd c6 / c12
	movq mm6,mm5
	punpckldq mm5,mm7		;# mm5 = 1st c6 / 2nd c6
	punpckhdq mm6,mm7		;# mm6 = 1st c12 / 2nd c12
	movq [esp + mci0310_c6], mm5
	movq [esp + mci0310_c12], mm6

	lea eax, [eax + eax*2]		;# replace jnr with j3
	lea ebx, [ebx + ebx*2]

	mov esi, [ebp + mci0310_pos]

	movq mm0, [esp + mci0310_ix]
	movd mm1, [esp + mci0310_iz]
	movq mm4, [esi + eax*4]		;# fetch first j coordinates
	movd mm5, [esi + eax*4 + 8]
	pfsubr mm4,mm0			;# dr = ir - jr
	pfsubr mm5,mm1
	pfmul mm4,mm4			;# square dx,dy,dz
	pfmul mm5,mm5
	pfacc mm4, mm5			;# accumulate to get dx*dx+ dy*dy+ dz*dz
	pfacc mm4, mm5			;# first rsq in lower mm4

	movq mm6, [esi + ebx*4]		;# fetch second j coordinates
	movd mm7, [esi + ebx*4 + 8]

	pfsubr mm6,mm0			;# dr = ir - jr
	pfsubr mm7,mm1
	pfmul mm6,mm6			;# square dx,dy,dz
	pfmul mm7,mm7
	pfacc mm6, mm7			;# accumulate to get dx*dx+ dy*dy+ dz*dz
	pfacc mm6, mm7			;# second rsq in lower mm6

	pfrsqrt mm0, mm4		;# lookup inverse square root seed
	pfrsqrt mm1, mm6

	punpckldq mm0,mm1
	punpckldq mm4,mm6		;# now 4 has rsq and 0 the seed for both pairs.
	movq mm2,mm0			;# amd 3dnow N-R iteration to get full precision
	pfmul mm0,mm0
	pfrsqit1 mm0,mm4
	pfrcpit2 mm0,mm2
	pfmul mm4, mm0
	movq mm1, mm4
	;# mm0 is invsqrt, and mm1 r
	;# do potential (no force is computed in this routine)
	pfmul mm1, [esp + mci0310_tsc]	;# mm1=rt
	pf2iw mm4,mm1
	movq [esp + mci0310_n1], mm4	;# both table indices to memory
	pi2fd mm4,mm4
	pfsub mm1, mm4			;# now mm1 is eps and mm4 is n0

	movq mm2,mm1
	pfmul mm2,mm2			;# mm1 is eps, mm2 is eps2

	mov edx, [ebp + mci0310_VFtab]
	;# dispersion table
	mov ecx, [esp + mci0310_n1]
	shl ecx, 3			;# 8 floats per table point
	;# load all the table values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]
	mov ecx, [esp + mci0310_n1 + 4]	;# index for the second pair
	shl ecx, 3
	punpckldq mm4, [edx + ecx*4]
	punpckldq mm5, [edx + ecx*4 + 4]
	punpckldq mm6, [edx + ecx*4 + 8]
	punpckldq mm7, [edx + ecx*4 + 12]
	pfmul mm6, mm1			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp
	pfmul mm5, mm1			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	movq mm4, [esp + mci0310_c6]
	pfmul mm5, mm4			;# vnb6
	;# update vnbtot to release mm5!
	pfadd mm5, [esp + mci0310_vnbtot]	;# add the earlier value
	movq [esp + mci0310_vnbtot], mm5	;# store the sum

	;# repulsion table
	mov ecx, [esp + mci0310_n1]
	shl ecx, 3
	;# load all the table values we need
	movd mm4, [edx + ecx*4 + 16]
	movd mm5, [edx + ecx*4 + 20]
	movd mm6, [edx + ecx*4 + 24]
	movd mm7, [edx + ecx*4 + 28]
	mov ecx, [esp + mci0310_n1 + 4]
	shl ecx, 3
	punpckldq mm4, [edx + ecx*4 + 16]
	punpckldq mm5, [edx + ecx*4 + 20]
	punpckldq mm6, [edx + ecx*4 + 24]
	punpckldq mm7, [edx + ecx*4 + 28]

	pfmul mm6, mm1			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp
	pfmul mm5, mm1			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	movq mm6, [esp + mci0310_c12]
	pfmul mm5, mm6			;# vnb12
	pfadd mm5, [esp + mci0310_vnbtot]	;# add the earlier value
	movq [esp + mci0310_vnbtot], mm5	;# store the sum

	;# should we do one more iteration?
	sub dword ptr [esp + mci0310_innerk], 2
	jl .mci0310_finish_vdwc_inner
	jmp .mci0310_unroll_vdwc_loop
.mci0310_finish_vdwc_inner:
	and dword ptr [esp + mci0310_innerk], 1	;# odd count -> one j atom left
	jnz .mci0310_single_vdwc_inner
	jmp .mci0310_updateouterdata_vdwc
.mci0310_single_vdwc_inner:
	;# a single j particle iteration here - compare with the unrolled code for comments
	mov eax, [esp + mci0310_innerjjnr]
	mov eax, [eax]			;# eax=jnr offset

	mov esi, [ebp + mci0310_nbfp]
	mov ecx, [ebp + mci0310_type]
	mov edx, [ecx + eax*4]		;# type [jnr1]
	shl edx, 1
	add edx, [esp + mci0310_ntia]	;# tja = ntia + 2*type
	movd mm5, [esi + edx*4]		;# mm5 = 1st c6
	movq [esp + mci0310_c6], mm5
	movd mm5, [esi + edx*4 + 4]	;# mm5 = 1st c12
	movq [esp + mci0310_c12], mm5

	mov esi, [ebp + mci0310_pos]
	lea eax, [eax + eax*2]

	movq mm0, [esp + mci0310_ix]
	movd mm1, [esp + mci0310_iz]
	movq mm4, [esi + eax*4]
	movd mm5, [esi + eax*4 + 8]
	pfsubr mm4, mm0
	pfsubr mm5, mm1
	pfmul mm4,mm4
	pfmul mm5,mm5
	pfacc mm4, mm5
	pfacc mm4, mm5			;# mm4=rsq (low half)

	pfrsqrt mm0,mm4
	movq mm2,mm0
	pfmul mm0,mm0
	pfrsqit1 mm0,mm4
	pfrcpit2 mm0,mm2		;# mm0=invsqrt
	pfmul mm4, mm0
	movq mm1, mm4
	;# mm0 is invsqrt, and mm1 r

	;# calculate potential
	pfmul mm1, [esp + mci0310_tsc]	;# mm1=rt
	pf2iw mm4,mm1
	movd [esp + mci0310_n1], mm4
	pi2fd mm4,mm4
	pfsub mm1, mm4			;# now mm1 is eps and mm4 is n0

	movq mm2,mm1
	pfmul mm2,mm2			;# mm1 is eps, mm2 is eps2

	mov edx, [ebp + mci0310_VFtab]
	mov ecx, [esp + mci0310_n1]
	shl ecx, 3
	;# dispersion table
	;# load all the table values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]
	pfmul mm6, mm1			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp
	pfmul mm5, mm1			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	movq mm4, [esp + mci0310_c6]
	pfmul mm5, mm4			;# vnb6
	;# update vnbtot to release mm5!
	pfadd mm5, [esp + mci0310_vnbtot]	;# add the earlier value
	movq [esp + mci0310_vnbtot], mm5	;# store the sum

	;# repulsion table
	;# load all the table values we need
	movd mm4, [edx + ecx*4 + 16]
	movd mm5, [edx + ecx*4 + 20]
	movd mm6, [edx + ecx*4 + 24]
	movd mm7, [edx + ecx*4 + 28]

	pfmul mm6, mm1			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp
	pfmul mm5, mm1			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	movq mm6, [esp + mci0310_c12]
	pfmul mm5, mm6			;# vnb12
	;# update vnbtot
	pfadd mm5, [esp + mci0310_vnbtot]	;# add the earlier value
	movq [esp + mci0310_vnbtot], mm5	;# store the sum

.mci0310_updateouterdata_vdwc:
	;# loop back to mno
	dec dword ptr [esp + mci0310_nsvdwc]
	jz .mci0310_testvdw
	jmp .mci0310_mno_vdwc

	;# ---- skip the nscoul atoms, then handle the nsvdw atoms ----
.mci0310_testvdw:
	mov ebx, [esp + mci0310_nscoul]
	add [esp + mci0310_solnr], ebx	;# step solnr past the atoms not handled here

	mov ecx, [esp + mci0310_nsvdw]
	cmp ecx, 0
	jnz .mci0310_mno_vdw
	jmp .mci0310_last_mno
.mci0310_mno_vdw:
	mov ebx, [esp + mci0310_solnr]
	inc dword ptr [esp + mci0310_solnr]

	mov edx, [ebp + mci0310_type]
	mov edx, [edx + ebx*4]
	imul edx, [ebp + mci0310_ntype]
	shl edx, 1
	mov [esp + mci0310_ntia], edx	;# ntia = 2*ntype*type[atom]

	lea ebx, [ebx + ebx*2]		;# ebx = 3*ii=ii3
	mov eax, [ebp + mci0310_pos]	;# eax = base of pos[]
	mov [esp + mci0310_ii3], ebx

	movq mm0, [eax + ebx*4]
	movd mm1, [eax + ebx*4 + 8]
	pfadd mm0, [esp + mci0310_shX]
	pfadd mm1, [esp + mci0310_shZ]
	movq [esp + mci0310_ix], mm0
	movd [esp + mci0310_iz], mm1

	mov ecx, [esp + mci0310_innerjjnr0]
	mov [esp + mci0310_innerjjnr], ecx
	mov edx, [esp + mci0310_innerk0]
	sub edx, 2
	mov [esp + mci0310_innerk], edx	;# number of innerloop atoms
	jge .mci0310_unroll_vdw_loop
	jmp .mci0310_finish_vdw_inner
.mci0310_unroll_vdw_loop:
	;# paired innerloop starts here
	mov ecx, [esp + mci0310_innerjjnr]	;# pointer to jjnr[k]
	mov eax, [ecx]
	mov ebx, [ecx + 4]		;# eax/ebx=jnr
	add dword ptr [esp + mci0310_innerjjnr], 8	;# advance pointer (unrolled 2)
	prefetch [ecx + 16]		;# prefetch data - trial and error says 16 is best

	mov ecx, [ebp + mci0310_type]
	mov edx, [ecx + eax*4]		;# type [jnr1]
	mov ecx, [ecx + ebx*4]		;# type [jnr2]

	mov esi, [ebp + mci0310_nbfp]	;# base of nbfp
	shl edx, 1
	shl ecx, 1
	add edx, [esp + mci0310_ntia]	;# tja = ntia + 2*type
	add ecx, [esp + mci0310_ntia]

	movq mm5, [esi + edx*4]		;# mm5 = 1st c6 / c12
	movq mm7, [esi + ecx*4]		;# mm7 = 2nd c6 / c12
	movq mm6,mm5
	punpckldq mm5,mm7		;# mm5 = 1st c6 / 2nd c6
	punpckhdq mm6,mm7		;# mm6 = 1st c12 / 2nd c12
	movq [esp + mci0310_c6], mm5
	movq [esp + mci0310_c12], mm6

	lea eax, [eax + eax*2]		;# replace jnr with j3
	lea ebx, [ebx + ebx*2]

	mov esi, [ebp + mci0310_pos]

	movq mm0, [esp + mci0310_ix]
	movd mm1, [esp + mci0310_iz]
	movq mm4, [esi + eax*4]		;# fetch first j coordinates
	movd mm5, [esi + eax*4 + 8]
	pfsubr mm4,mm0			;# dr = ir - jr
	pfsubr mm5,mm1
	pfmul mm4,mm4			;# square dx,dy,dz
	pfmul mm5,mm5
	pfacc mm4, mm5			;# accumulate to get dx*dx+ dy*dy+ dz*dz
	pfacc mm4, mm5			;# first rsq in lower mm4

	movq mm6, [esi + ebx*4]		;# fetch second j coordinates
	movd mm7, [esi + ebx*4 + 8]

	pfsubr mm6,mm0			;# dr = ir - jr
	pfsubr mm7,mm1
	pfmul mm6,mm6			;# square dx,dy,dz
	pfmul mm7,mm7
	pfacc mm6, mm7			;# accumulate to get dx*dx+ dy*dy+ dz*dz
	pfacc mm6, mm7			;# second rsq in lower mm6

	pfrsqrt mm0, mm4		;# lookup inverse square root seed
	pfrsqrt mm1, mm6

	punpckldq mm0,mm1
	punpckldq mm4,mm6		;# now 4 has rsq and 0 the seed for both pairs
	movq mm2,mm0			;# amd 3dnow N-R iteration to get full precision
	pfmul mm0,mm0
	pfrsqit1 mm0,mm4
	pfrcpit2 mm0,mm2
	pfmul mm4, mm0
	movq mm1, mm4
	;# mm0 is invsqrt, and mm1 r
	;# do potential (no force is computed in this routine)
	pfmul mm1, [esp + mci0310_tsc]	;# mm1=rt
	pf2iw mm4,mm1
	movq [esp + mci0310_n1], mm4	;# both table indices to memory
	pi2fd mm4,mm4
	pfsub mm1, mm4			;# now mm1 is eps and mm4 is n0

	movq mm2,mm1
	pfmul mm2,mm2			;# mm1 is eps, mm2 is eps2

	mov edx, [ebp + mci0310_VFtab]
	;# dispersion table
	mov ecx, [esp + mci0310_n1]
	shl ecx, 3
	;# load all the table values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]
	mov ecx, [esp + mci0310_n1 + 4]
	shl ecx, 3
	punpckldq mm4, [edx + ecx*4]
	punpckldq mm5, [edx + ecx*4 + 4]
	punpckldq mm6, [edx + ecx*4 + 8]
	punpckldq mm7, [edx + ecx*4 + 12]
	pfmul mm6, mm1			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp
	pfmul mm5, mm1			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	movq mm4, [esp + mci0310_c6]
	pfmul mm5, mm4			;# vnb6
	;# update vnbtot to release mm5!
	pfadd mm5, [esp + mci0310_vnbtot]	;# add the earlier value
	movq [esp + mci0310_vnbtot], mm5	;# store the sum

	;# repulsion table
	mov ecx, [esp + mci0310_n1]
	shl ecx, 3
	;# load all the table values we need
	movd mm4, [edx + ecx*4 + 16]
	movd mm5, [edx + ecx*4 + 20]
	movd mm6, [edx + ecx*4 + 24]
	movd mm7, [edx + ecx*4 + 28]
	mov ecx, [esp + mci0310_n1 + 4]
	shl ecx, 3
	punpckldq mm4, [edx + ecx*4 + 16]
	punpckldq mm5, [edx + ecx*4 + 20]
	punpckldq mm6, [edx + ecx*4 + 24]
	punpckldq mm7, [edx + ecx*4 + 28]

	pfmul mm6, mm1			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp
	pfmul mm5, mm1			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	movq mm6, [esp + mci0310_c12]
	pfmul mm5, mm6			;# vnb12
	;# update vnbtot
	pfadd mm5, [esp + mci0310_vnbtot]	;# add the earlier value
	movq [esp + mci0310_vnbtot], mm5	;# store the sum

	;# should we do one more iteration?
	sub dword ptr [esp + mci0310_innerk], 2
	jl .mci0310_finish_vdw_inner
	jmp .mci0310_unroll_vdw_loop
.mci0310_finish_vdw_inner:
	and dword ptr [esp + mci0310_innerk], 1	;# odd count -> one j atom left
	jnz .mci0310_single_vdw_inner
	jmp .mci0310_updateouterdata_vdw
.mci0310_single_vdw_inner:
	;# a single j particle iteration here - compare with the unrolled code for comments
	mov eax, [esp + mci0310_innerjjnr]
	mov eax, [eax]			;# eax=jnr offset

	mov esi, [ebp + mci0310_nbfp]
	mov ecx, [ebp + mci0310_type]
	mov edx, [ecx + eax*4]		;# type [jnr1]
	shl edx, 1
	add edx, [esp + mci0310_ntia]	;# tja = ntia + 2*type
	movd mm5, [esi + edx*4]		;# mm5 = 1st c6
	movq [esp + mci0310_c6], mm5
	movd mm5, [esi + edx*4 + 4]	;# mm5 = 1st c12
	movq [esp + mci0310_c12], mm5

	mov esi, [ebp + mci0310_pos]
	lea eax, [eax + eax*2]

	movq mm0, [esp + mci0310_ix]
	movd mm1, [esp + mci0310_iz]
	movq mm4, [esi + eax*4]
	movd mm5, [esi + eax*4 + 8]
	pfsubr mm4, mm0
	pfsubr mm5, mm1
	pfmul mm4,mm4
	pfmul mm5,mm5
	pfacc mm4, mm5
	pfacc mm4, mm5			;# mm4=rsq (low half)

	pfrsqrt mm0,mm4
	movq mm2,mm0
	pfmul mm0,mm0
	pfrsqit1 mm0,mm4
	pfrcpit2 mm0,mm2		;# mm0=invsqrt
	pfmul mm4, mm0
	movq mm1, mm4
	;# mm0 is invsqrt, and mm1 r

	;# calculate potential
	pfmul mm1, [esp + mci0310_tsc]	;# mm1=rt
	pf2iw mm4,mm1
	movd [esp + mci0310_n1], mm4
	pi2fd mm4,mm4
	pfsub mm1, mm4			;# now mm1 is eps and mm4 is n0

	movq mm2,mm1
	pfmul mm2,mm2			;# mm1 is eps, mm2 is eps2

	mov edx, [ebp + mci0310_VFtab]
	mov ecx, [esp + mci0310_n1]
	shl ecx, 3
	;# dispersion table
	;# load all the table values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]
	pfmul mm6, mm1			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp
	pfmul mm5, mm1			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	movq mm4, [esp + mci0310_c6]
	pfmul mm5, mm4			;# vnb6
	;# update vnbtot to release mm5!
	pfadd mm5, [esp + mci0310_vnbtot]	;# add the earlier value
	movq [esp + mci0310_vnbtot], mm5	;# store the sum

	;# repulsion table
	;# load all the table values we need
	movd mm4, [edx + ecx*4 + 16]
	movd mm5, [edx + ecx*4 + 20]
	movd mm6, [edx + ecx*4 + 24]
	movd mm7, [edx + ecx*4 + 28]

	pfmul mm6, mm1			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp
	pfmul mm5, mm1			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	movq mm6, [esp + mci0310_c12]
	pfmul mm5, mm6			;# vnb12
	;# update vnbtot
	pfadd mm5, [esp + mci0310_vnbtot]	;# add the earlier value
	movq [esp + mci0310_vnbtot], mm5	;# store the sum

.mci0310_updateouterdata_vdw:
	;# loop back to mno
	dec dword ptr [esp + mci0310_nsvdw]
	jz .mci0310_last_mno
	jmp .mci0310_mno_vdw

	;# ---- all atoms of this i entry done: flush the energy ----
.mci0310_last_mno:
	mov edx, [ebp + mci0310_gid]	;# get group index for this i particle
	mov edx, [edx]
	add dword ptr [ebp + mci0310_gid], 4	;# advance pointer

	movq mm7, [esp + mci0310_vnbtot]
	pfacc mm7,mm7			;# get and sum the two parts of total potential

	mov eax, [ebp + mci0310_Vnb]
	movd mm6, [eax + edx*4]
	pfadd mm6, mm7
	movd [eax + edx*4], mm6		;# increment vnb[gid]
	;# finish if last
	mov ecx, [ebp + mci0310_nri]
	dec ecx
	jecxz .mci0310_end
	;# not last, iterate once more!
	mov [ebp + mci0310_nri], ecx
	jmp .mci0310_outer
.mci0310_end:
	femms
	add esp, 108			;# release the local frame
	pop edi
	pop esi
	pop edx
	pop ecx
	pop ebx
	pop eax
	leave
	ret				;# return to caller; without it execution runs into the next routine
;# ---------------------------------------------------------------------
;# mcinl1000_3dnow / _mcinl1000_3dnow
;#
;# Potential-only Coulomb inner loop.  For every i entry (nri of them):
;#     iq    = facel * charge[ii]
;#     vctot = sum over j in jjnr[jindex[n] .. jindex[n+1]-1] of
;#             iq * charge[j] * 1/sqrt(rsq(i,j))
;# and vctot is then added to Vc[gid[n]].  The i position is pos[ii]
;# plus shiftvec[shift[n]].  The only stores are to the local stack
;# frame and to Vc[]; no forces are computed here.
;#
;# Arguments are read from the stack relative to ebp (first one at
;# [ebp + 8]).  The pointer arguments shift, iinr, jindex and gid are
;# advanced in place and nri is counted down in place.
;#
;# eax, ebx, ecx, edx, esi and edi are saved on entry and restored on
;# exit.  The MMX/3DNow state is cleared with femms on entry and exit.
;# The code assumes nri >= 1 (the outer loop is entered unconditionally).
;# ---------------------------------------------------------------------
.globl mcinl1000_3dnow
.globl _mcinl1000_3dnow
mcinl1000_3dnow:
_mcinl1000_3dnow:
	;# offsets of the arguments relative to ebp
.equiv		mci1000_nri,		8
.equiv		mci1000_iinr,		12
.equiv		mci1000_jindex,		16
.equiv		mci1000_jjnr,		20
.equiv		mci1000_shift,		24
.equiv		mci1000_shiftvec,	28
.equiv		mci1000_gid,		32
.equiv		mci1000_pos,		36
.equiv		mci1000_charge,		40
.equiv		mci1000_facel,		44
.equiv		mci1000_Vc,		48
	;# stack offsets for local variables
.equiv		mci1000_is3,		0
.equiv		mci1000_ii3,		4
.equiv		mci1000_ix,		8
.equiv		mci1000_iy,		12
.equiv		mci1000_iz,		16
.equiv		mci1000_iq,		20
.equiv		mci1000_vctot,		28
.equiv		mci1000_innerjjnr,	36
.equiv		mci1000_innerk,		40
	push ebp
	mov ebp,esp
	push eax
	push ebx
	push ecx
	push edx
	push esi
	push edi
	sub esp, 44			;# 44 bytes local stack space (last local, innerk, ends at 44)
	femms
	;# assume we have at least one i particle - start directly
.mci1000_outer:
	mov eax, [ebp + mci1000_shift]	;# eax = pointer into shift[]
	mov ebx, [eax]			;# ebx=shift[n]
	add dword ptr [ebp + mci1000_shift], 4	;# advance pointer one step

	lea ebx, [ebx + ebx*2]		;# ebx=3*is
	mov [esp + mci1000_is3],ebx	;# store is3

	mov eax, [ebp + mci1000_shiftvec]	;# eax = base of shiftvec[]

	movq mm0, [eax + ebx*4]		;# move shX/shY to mm0 and shZ to mm1
	movd mm1, [eax + ebx*4 + 8]

	mov ecx, [ebp + mci1000_iinr]	;# ecx = pointer into iinr[]
	add dword ptr [ebp + mci1000_iinr], 4	;# advance pointer
	mov ebx, [ecx]			;# ebx=ii

	mov edx, [ebp + mci1000_charge]
	movd mm2, [edx + ebx*4]		;# mm2=charge[ii]
	;# 8-byte memory operand: only the low product (facel*charge) is kept,
	;# the high half is overwritten by the punpckldq that follows
	pfmul mm2, [ebp + mci1000_facel]
	punpckldq mm2,mm2		;# spread to both halves
	movq [esp + mci1000_iq], mm2	;# iq =facel*charge[ii]

	lea ebx, [ebx + ebx*2]		;# ebx = 3*ii=ii3
	mov eax, [ebp + mci1000_pos]	;# eax = base of pos[]

	pfadd mm0, [eax + ebx*4]	;# ix = shX + posX (and iy too)
	movd mm3, [eax + ebx*4 + 8]	;# cant use direct memory add for 4 bytes (iz)
	mov [esp + mci1000_ii3], ebx
	pfadd mm1, mm3
	movq [esp + mci1000_ix], mm0
	movd [esp + mci1000_iz], mm1

	;# clear vctot
	pxor mm7,mm7
	movq [esp + mci1000_vctot], mm7

	mov eax, [ebp + mci1000_jindex]
	mov ecx, [eax]			;# jindex[n]
	mov edx, [eax + 4]		;# jindex[n+1]
	add dword ptr [ebp + mci1000_jindex], 4
	sub edx, ecx			;# number of innerloop atoms

	mov esi, [ebp + mci1000_pos]	;# esi stays the pos[] base for the inner loops
	mov eax, [ebp + mci1000_jjnr]
	shl ecx, 2
	add eax, ecx
	mov [esp + mci1000_innerjjnr], eax	;# pointer to jjnr[nj0]
	sub edx, 2
	mov [esp + mci1000_innerk], edx	;# number of innerloop atoms
	jge .mci1000_unroll_loop
	jmp .mci1000_finish_inner
.mci1000_unroll_loop:
	;# paired innerloop starts here
	mov ecx, [esp + mci1000_innerjjnr]	;# pointer to jjnr[k]
	mov eax, [ecx]
	mov ebx, [ecx + 4]		;# eax/ebx=jnr
	add dword ptr [esp + mci1000_innerjjnr], 8	;# advance pointer (unrolled 2)
	prefetch [ecx + 16]		;# prefetch data - trial and error says 16 is best

	mov ecx, [ebp + mci1000_charge]	;# base of charge[]
	movq mm5, [esp + mci1000_iq]
	movd mm3, [ecx + eax*4]		;# charge[jnr1]
	movd mm7, [ecx + ebx*4]		;# charge[jnr2]
	punpckldq mm3,mm7		;# move charge 2 to high part of mm3
	pfmul mm3,mm5			;# mm3 now has qq for both particles

	lea eax, [eax + eax*2]		;# replace jnr with j3
	lea ebx, [ebx + ebx*2]

	movq mm0, [esp + mci1000_ix]
	movd mm1, [esp + mci1000_iz]
	movq mm4, [esi + eax*4]		;# fetch first j coordinates
	movd mm5, [esi + eax*4 + 8]
	pfsubr mm4,mm0			;# dr = ir - jr
	pfsubr mm5,mm1
	pfmul mm4,mm4			;# square dx,dy,dz
	pfmul mm5,mm5
	pfacc mm4, mm5			;# accumulate to get dx*dx+ dy*dy+ dz*dz
	pfacc mm4, mm5			;# first rsq in lower mm4

	movq mm6, [esi + ebx*4]		;# fetch second j coordinates
	movd mm7, [esi + ebx*4 + 8]

	pfsubr mm6,mm0			;# dr = ir - jr
	pfsubr mm7,mm1
	pfmul mm6,mm6			;# square dx,dy,dz
	pfmul mm7,mm7
	pfacc mm6, mm7			;# accumulate to get dx*dx+ dy*dy+ dz*dz
	pfacc mm6, mm7			;# second rsq in lower mm6

	pfrsqrt mm0, mm4		;# lookup inverse square root seed
	pfrsqrt mm1, mm6

	punpckldq mm0,mm1
	punpckldq mm4,mm6		;# now 4 has rsq and 0 the seed for both pairs
	movq mm2,mm0			;# amd 3dnow N-R iteration to get full precision
	pfmul mm0,mm0
	pfrsqit1 mm0,mm4
	pfrcpit2 mm0,mm2
	movq mm1,mm0
	;# mm1=invsqrt
	;# do potential (no force is computed in this routine)

	pfmul mm3,mm1			;# 3 has both vcoul
	pfadd mm3, [esp + mci1000_vctot]	;# add the earlier value
	movq [esp + mci1000_vctot], mm3	;# store the sum

	;# should we do one more iteration?
	sub dword ptr [esp + mci1000_innerk], 2
	jl .mci1000_finish_inner
	jmp .mci1000_unroll_loop
.mci1000_finish_inner:
	and dword ptr [esp + mci1000_innerk], 1	;# odd count -> one j atom left
	jnz .mci1000_single_inner
	jmp .mci1000_updateouterdata
.mci1000_single_inner:
	;# a single j particle iteration here - compare with the unrolled code for comments
	mov eax, [esp + mci1000_innerjjnr]
	mov eax, [eax]			;# eax=jnr offset

	mov ecx, [ebp + mci1000_charge]
	movd mm6, [esp + mci1000_iq]
	movd mm7, [ecx + eax*4]
	pfmul mm6, mm7			;# mm6=qq

	lea eax, [eax + eax*2]

	movq mm0, [esp + mci1000_ix]
	movd mm1, [esp + mci1000_iz]
	movq mm2, [esi + eax*4]
	movd mm3, [esi + eax*4 + 8]
	pfsub mm0, mm2
	pfsub mm1, mm3
	pfmul mm0,mm0
	pfmul mm1,mm1
	pfacc mm0, mm1
	pfacc mm0, mm1			;# mm0=rsq

	pfrsqrt mm1,mm0
	movq mm2,mm1
	pfmul mm1,mm1
	pfrsqit1 mm1,mm0
	pfrcpit2 mm1,mm2		;# mm1=invsqrt
	;# calculate potential
	pfmul mm6, mm1			;# mm6=vcoul
	pfadd mm6, [esp + mci1000_vctot]
	movq [esp + mci1000_vctot], mm6

.mci1000_updateouterdata:
	mov edx, [ebp + mci1000_gid]	;# get group index for this i particle
	mov edx, [edx]
	add dword ptr [ebp + mci1000_gid], 4	;# advance pointer

	movq mm7, [esp + mci1000_vctot]
	pfacc mm7,mm7			;# get and sum the two parts of total potential

	mov eax, [ebp + mci1000_Vc]
	movd mm6, [eax + edx*4]
	pfadd mm6, mm7
	movd [eax + edx*4], mm6		;# increment vc[gid]
	;# finish if last
	mov ecx, [ebp + mci1000_nri]
	dec ecx
	jecxz .mci1000_end
	;# not last, iterate once more!
	mov [ebp + mci1000_nri], ecx
	jmp .mci1000_outer
.mci1000_end:
	femms
	add esp, 44			;# release the local frame
	pop edi
	pop esi
	pop edx
	pop ecx
	pop ebx
	pop eax
	leave
	ret				;# return to caller; without it execution runs into the next routine
;# ---------------------------------------------------------------------
;# mcinl1010_3dnow
;#
;# Coulomb-only inner loop that accumulates potential energy; no forces
;# are written anywhere in this routine.
;#
;# Each outer iteration consumes one element of shift[], iinr[], jindex[]
;# and gid[], and one 12-byte record of nsatoms[] (only the third dword
;# of each record, the Coulomb atom count, is read). That many consecutive
;# i atoms, starting at iinr[n], are each run against the same j list
;# jjnr[jindex[n] .. jindex[n+1]-1]. The j list is processed two atoms at
;# a time, with one optional single-atom step for an odd count.
;# The accumulated energy is added to Vc[gid[n]].
;#
;# Arguments are read from the stack relative to ebp (offsets below).
;# The argument slots for nri, iinr, jindex, shift, gid and nsatoms are
;# updated in place while iterating.
;# eax, ebx, ecx, edx, esi and edi are saved and restored. femms is issued
;# on entry and on exit.
;# The outer loop is bottom-tested, so at least one i entry is assumed.
;# ---------------------------------------------------------------------
18054 .globl mcinl1010_3dnow
18055 .globl _mcinl1010_3dnow
18056 mcinl1010_3dnow:
18057 _mcinl1010_3dnow:
;# argument offsets relative to ebp (valid after push ebp / mov ebp,esp)
18058 .equiv mci1010_nri, 8
18059 .equiv mci1010_iinr, 12
18060 .equiv mci1010_jindex, 16
18061 .equiv mci1010_jjnr, 20
18062 .equiv mci1010_shift, 24
18063 .equiv mci1010_shiftvec, 28
18064 .equiv mci1010_gid, 32
18065 .equiv mci1010_pos, 36
18066 .equiv mci1010_charge, 40
18067 .equiv mci1010_facel, 44
18068 .equiv mci1010_Vc, 48
18069 .equiv mci1010_nsatoms, 52
18070 ;# stack offsets for local variables
18071 .equiv mci1010_is3, 0
18072 .equiv mci1010_ii3, 4
18073 .equiv mci1010_shX, 8
18074 .equiv mci1010_shY, 12
18075 .equiv mci1010_shZ, 16
18076 .equiv mci1010_ix, 20
18077 .equiv mci1010_iy, 24
18078 .equiv mci1010_iz, 28
18079 .equiv mci1010_iq, 32
18080 .equiv mci1010_vctot, 40
18081 .equiv mci1010_innerjjnr0, 48
18082 .equiv mci1010_innerk0, 52
18083 .equiv mci1010_innerjjnr, 56
18084 .equiv mci1010_innerk, 60
18085 .equiv mci1010_nscoul, 64
18086 .equiv mci1010_solnr, 68
18087 push ebp
18088 mov ebp,esp
18089 push eax
18090 push ebx
18091 push ecx
18092 push edx
18093 push esi
18094 push edi
18095 sub esp, 72 ;# local stack space
18096 femms
18097 ;# assume we have at least one i particle - start directly
;# step the nsatoms pointer to the third dword of the first record;
;# the outer loop then advances it by a whole 12-byte record each time
18098 add dword ptr [ebp + mci1010_nsatoms], 8
18100 .mci1010_outer:
18101 mov eax, [ebp + mci1010_shift] ;# eax = pointer into shift[]
18102 mov ebx, [eax] ;# ebx=shift[n]
18103 add dword ptr [ebp + mci1010_shift], 4 ;# advance pointer one step
18105 lea ebx, [ebx + ebx*2] ;# ebx=3*is
18106 mov [esp + mci1010_is3],ebx ;# store is3
18108 mov eax, [ebp + mci1010_shiftvec] ;# eax = base of shiftvec[]
18110 movq mm0, [eax + ebx*4] ;# move shX/shY to mm0 and shZ to mm1
18111 movd mm1, [eax + ebx*4 + 8]
18112 movq [esp + mci1010_shX], mm0
18113 movd [esp + mci1010_shZ], mm1
18115 mov ecx, [ebp + mci1010_iinr] ;# ecx = pointer into iinr[]
18116 add dword ptr [ebp + mci1010_iinr], 4 ;# advance pointer
18117 mov ebx, [ecx] ;# ebx=ii
18119 mov eax, [ebp + mci1010_nsatoms]
18120 mov ecx, [eax] ;# ecx = number of i atoms to process for this entry
18121 add dword ptr [ebp + mci1010_nsatoms], 12
18122 mov [esp + mci1010_nscoul], ecx
18124 ;# clear potential
18125 pxor mm7,mm7
18126 movq [esp + mci1010_vctot], mm7
18127 mov [esp + mci1010_solnr], ebx ;# solnr = index of the first i atom
18129 mov eax, [ebp + mci1010_jindex]
18130 mov ecx, [eax] ;# jindex[n]
18131 mov edx, [eax + 4] ;# jindex[n+1]
18132 add dword ptr [ebp + mci1010_jindex], 4
18133 sub edx, ecx ;# number of innerloop atoms
18134 mov eax, [ebp + mci1010_jjnr]
18135 shl ecx, 2
18136 add eax, ecx
18137 mov [esp + mci1010_innerjjnr0], eax ;# pointer to jjnr[nj0]
18139 mov [esp + mci1010_innerk0], edx ;# number of innerloop atoms
18140 mov esi, [ebp + mci1010_pos] ;# esi = base of pos[] for the j loops
18142 mov ecx, [esp + mci1010_nscoul]
18143 cmp ecx, 0
18144 jnz .mci1010_mno_coul
18145 jmp .mci1010_last_mno ;# no i atoms for this entry: vctot stays zero
18146 .mci1010_mno_coul:
;# one pass per i atom; the j list is restarted from innerjjnr0/innerk0
18147 mov ebx, [esp + mci1010_solnr]
18148 inc dword ptr [esp + mci1010_solnr]
18149 mov edx, [ebp + mci1010_charge]
18150 movd mm2, [edx + ebx*4] ;# mm2=charge[ii]
;# 64-bit memory operand: only the low dword (facel) matters, the high
;# half of mm2 is replaced by the punpckldq that follows
18151 pfmul mm2, [ebp + mci1010_facel]
18152 punpckldq mm2,mm2 ;# spread to both halves
18153 movq [esp + mci1010_iq], mm2 ;# iq =facel*charge[ii]
18155 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
18156 mov eax, [ebp + mci1010_pos] ;# eax = base of pos[]
18157 mov [esp + mci1010_ii3], ebx
18159 movq mm0, [eax + ebx*4]
18160 movd mm1, [eax + ebx*4 + 8]
18161 pfadd mm0, [esp + mci1010_shX]
18162 pfadd mm1, [esp + mci1010_shZ]
18163 movq [esp + mci1010_ix], mm0 ;# ix/iy = pos + shift
18164 movd [esp + mci1010_iz], mm1
18166 mov ecx, [esp + mci1010_innerjjnr0]
18167 mov [esp + mci1010_innerjjnr], ecx
18168 mov edx, [esp + mci1010_innerk0]
18169 sub edx, 2
18170 mov [esp + mci1010_innerk], edx ;# number of innerloop atoms
18171 jge .mci1010_unroll_coul_loop
18172 jmp .mci1010_finish_coul_inner
18173 .mci1010_unroll_coul_loop:
18174 ;# paired innerloop starts here
18175 mov ecx, [esp + mci1010_innerjjnr] ;# pointer to jjnr[k]
18176 mov eax, [ecx]
18177 mov ebx, [ecx + 4] ;# eax/ebx=jnr
18178 add dword ptr [esp + mci1010_innerjjnr], 8 ;# advance pointer (unrolled 2)
18179 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
18181 mov ecx, [ebp + mci1010_charge] ;# base of charge[]
18182 movq mm5, [esp + mci1010_iq]
18183 movd mm3, [ecx + eax*4] ;# charge[jnr1]
18184 movd mm7, [ecx + ebx*4] ;# charge[jnr2]
18185 punpckldq mm3,mm7 ;# move charge 2 to high part of mm3
18186 pfmul mm3,mm5 ;# mm3 now has qq for both particles
18188 lea eax, [eax + eax*2] ;# replace jnr with j3
18189 lea ebx, [ebx + ebx*2]
18191 movq mm0, [esp + mci1010_ix]
18192 movd mm1, [esp + mci1010_iz]
18193 movq mm4, [esi + eax*4] ;# fetch first j coordinates
18194 movd mm5, [esi + eax*4 + 8]
18195 pfsubr mm4,mm0 ;# dr = ir - jr
18196 pfsubr mm5,mm1
18197 pfmul mm4,mm4 ;# square dx,dy,dz
18198 pfmul mm5,mm5
18199 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
18200 pfacc mm4, mm5 ;# first rsq in lower mm4
18202 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
18203 movd mm7, [esi + ebx*4 + 8]
18205 pfsubr mm6,mm0 ;# dr = ir - jr
18206 pfsubr mm7,mm1
18207 pfmul mm6,mm6 ;# square dx,dy,dz
18208 pfmul mm7,mm7
18209 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
18210 pfacc mm6, mm7 ;# second rsq in lower mm6
18212 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
18213 pfrsqrt mm1, mm6
18215 punpckldq mm0,mm1
18216 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs
18217 movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision
18218 pfmul mm0,mm0
18219 pfrsqit1 mm0,mm4
18220 pfrcpit2 mm0,mm2
18221 movq mm1,mm0
18222 ;# mm1=invsqrt
18223 ;# do potential
18225 pfmul mm3,mm1 ;# 3 has both vcoul
18226 pfadd mm3, [esp + mci1010_vctot] ;# add the earlier value
18227 movq [esp + mci1010_vctot], mm3 ;# store the sum
18229 ;# should we do one more iteration?
18230 sub dword ptr [esp + mci1010_innerk], 2
18231 jl .mci1010_finish_coul_inner
18232 jmp .mci1010_unroll_coul_loop
18233 .mci1010_finish_coul_inner:
;# innerk is -1 here when one j atom is left over and -2 when none is,
;# so bit 0 tells whether the single-atom step is needed
18234 and dword ptr [esp + mci1010_innerk], 1
18235 jnz .mci1010_single_coul_inner
18236 jmp .mci1010_updateouterdata_coul
18237 .mci1010_single_coul_inner:
18238 ;# a single j particle iteration here - compare with the unrolled code for comments
18239 mov eax, [esp + mci1010_innerjjnr]
18240 mov eax, [eax] ;# eax=jnr offset
18242 mov ecx, [ebp + mci1010_charge]
18243 movd mm6, [esp + mci1010_iq]
18244 movd mm7, [ecx + eax*4]
18245 pfmul mm6, mm7 ;# mm6=qq
18247 lea eax, [eax + eax*2]
18249 movq mm0, [esp + mci1010_ix]
18250 movd mm1, [esp + mci1010_iz]
18251 movq mm2, [esi + eax*4]
18252 movd mm3, [esi + eax*4 + 8]
18253 pfsub mm0, mm2
18254 pfsub mm1, mm3
18255 pfmul mm0,mm0
18256 pfmul mm1,mm1
18257 pfacc mm0, mm1
18258 pfacc mm0, mm1 ;# mm0=rsq
18260 pfrsqrt mm1,mm0
18261 movq mm2,mm1
18262 pfmul mm1,mm1
18263 pfrsqit1 mm1,mm0
18264 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
18265 ;# calculate potential (energy only, no force in this routine)
18266 pfmul mm6, mm1 ;# mm6=vcoul
18267 pfadd mm6, [esp + mci1010_vctot]
18268 movq [esp + mci1010_vctot], mm6
18270 .mci1010_updateouterdata_coul:
18271 ;# loop back to mno
18272 dec dword ptr [esp + mci1010_nscoul]
18273 jz .mci1010_last_mno
18274 jmp .mci1010_mno_coul
18275 .mci1010_last_mno:
18276 mov edx, [ebp + mci1010_gid] ;# get group index for this i particle
18277 mov edx, [edx]
18278 add dword ptr [ebp + mci1010_gid], 4 ;# advance pointer
18280 movq mm7, [esp + mci1010_vctot]
18281 pfacc mm7,mm7 ;# get and sum the two parts of total potential
18283 mov eax, [ebp + mci1010_Vc]
18284 movd mm6, [eax + edx*4]
18285 pfadd mm6, mm7
18286 movd [eax + edx*4], mm6 ;# increment vc[gid]
18287 ;# finish if last
18288 mov ecx, [ebp + mci1010_nri]
18289 dec ecx
18290 jecxz .mci1010_end
18291 ;# not last, iterate once more!
18292 mov [ebp + mci1010_nri], ecx
18293 jmp .mci1010_outer
18294 .mci1010_end:
18295 femms
18296 add esp, 72
18297 pop edi
18298 pop esi
18299 pop edx
18300 pop ecx
18301 pop ebx
18302 pop eax
18303 leave
;# leave only restores esp/ebp; without ret execution would run on into
;# the routine that follows
18304 ret
;# ---------------------------------------------------------------------
;# mcinl1020_3dnow
;#
;# Coulomb-only, energy-only inner loop for a three-atom i group: the
;# atom at iinr[n] (named O in the locals) and the two atoms stored right
;# after it in pos[] (H1 and H2). Each j atom from
;# jjnr[jindex[n] .. jindex[n+1]-1] interacts with all three i atoms.
;# The accumulated energy is added to Vc[gid[n]]; no forces are written.
;#
;# iqO = facel*charge[ii] and iqH = facel*charge[ii+1] are computed once,
;# from the first iinr entry, and reused for every outer iteration.
;#
;# Arguments are read from the stack relative to ebp (offsets below).
;# The argument slots for nri, iinr, jindex, shift and gid are updated in
;# place while iterating.
;# eax, ebx, ecx, edx, esi and edi are saved and restored. femms is issued
;# on entry and on exit.
;# Both loops are bottom-tested: at least one i entry, and at least one
;# j atom per i entry, are assumed.
;# ---------------------------------------------------------------------
18307 .globl mcinl1020_3dnow
18308 .globl _mcinl1020_3dnow
18309 mcinl1020_3dnow:
18310 _mcinl1020_3dnow:
;# argument offsets relative to ebp (valid after push ebp / mov ebp,esp)
18311 .equiv mci1020_nri, 8
18312 .equiv mci1020_iinr, 12
18313 .equiv mci1020_jindex, 16
18314 .equiv mci1020_jjnr, 20
18315 .equiv mci1020_shift, 24
18316 .equiv mci1020_shiftvec, 28
18317 .equiv mci1020_gid, 32
18318 .equiv mci1020_pos, 36
18319 .equiv mci1020_charge, 40
18320 .equiv mci1020_facel, 44
18321 .equiv mci1020_Vc, 48
18322 ;# stack offsets for local variables
18323 .equiv mci1020_is3, 0
18324 .equiv mci1020_ii3, 4
18325 .equiv mci1020_ixO, 8
18326 .equiv mci1020_iyO, 12
18327 .equiv mci1020_izO, 16
18328 .equiv mci1020_ixH, 20
18329 .equiv mci1020_iyH, 28
18330 .equiv mci1020_izH, 36
18331 .equiv mci1020_iqO, 44
18332 .equiv mci1020_iqH, 52
18333 .equiv mci1020_vctot, 60
18334 .equiv mci1020_innerjjnr, 68
18335 .equiv mci1020_innerk, 72
18336 push ebp
18337 mov ebp,esp
18338 push eax
18339 push ebx
18340 push ecx
18341 push edx
18342 push esi
18343 push edi
18344 sub esp, 76 ;# local stack space
18345 femms
18346 ;# assume we have at least one i particle - start directly
;# the iinr pointer is only peeked at here; it is advanced in the outer loop
18348 mov ecx, [ebp + mci1020_iinr] ;# ecx = pointer into iinr[]
18349 mov ebx, [ecx] ;# ebx=ii
18351 mov edx, [ebp + mci1020_charge]
18352 movd mm1, [ebp + mci1020_facel]
18353 movd mm2, [edx + ebx*4] ;# mm2=charge[ii]
18354 pfmul mm2, mm1
18355 movq [esp + mci1020_iqO], mm2 ;# iqO = facel*charge[ii] (low half, high half zero)
18357 movd mm2, [edx + ebx*4 + 4] ;# mm2=charge[ii0+1]
18358 pfmul mm2, mm1
18359 punpckldq mm2,mm2 ;# spread to both halves
18360 movq [esp + mci1020_iqH], mm2 ;# iqH = facel*charge[ii0+1]
18361 .mci1020_outer:
18362 mov eax, [ebp + mci1020_shift] ;# eax = pointer into shift[]
18363 mov ebx, [eax] ;# ebx=shift[n]
18364 add dword ptr [ebp + mci1020_shift], 4 ;# advance pointer one step
18366 lea ebx, [ebx + ebx*2] ;# ebx=3*is
18367 mov [esp + mci1020_is3],ebx ;# store is3
18369 mov eax, [ebp + mci1020_shiftvec] ;# eax = base of shiftvec[]
18371 movq mm5, [eax + ebx*4] ;# move shX/shY to mm5 and shZ to mm6
18372 movd mm6, [eax + ebx*4 + 8]
18373 movq mm0, mm5
18374 movq mm1, mm5
18375 movq mm2, mm6
18376 punpckldq mm0,mm0 ;# also expand shX,Y,Z in mm0--mm2
18377 punpckhdq mm1,mm1
18378 punpckldq mm2,mm2
18380 mov ecx, [ebp + mci1020_iinr] ;# ecx = pointer into iinr[]
18381 add dword ptr [ebp + mci1020_iinr], 4 ;# advance pointer
18382 mov ebx, [ecx] ;# ebx=ii
18384 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
18385 mov eax, [ebp + mci1020_pos] ;# eax = base of pos[]
18387 pfadd mm5, [eax + ebx*4] ;# ix = shX + posX (and iy too)
18388 movd mm7, [eax + ebx*4 + 8] ;# cant use direct memory add for 4 bytes (iz)
18389 mov [esp + mci1020_ii3], ebx ;# (use mm7 as temp storage for iz)
18390 pfadd mm6, mm7
18391 movq [esp + mci1020_ixO], mm5
;# 8-byte store: the upper dword lands on the ixH slot, which is
;# rewritten by the movq to ixH a few lines below
18392 movq [esp + mci1020_izO], mm6
18394 movd mm3, [eax + ebx*4 + 12]
18395 movd mm4, [eax + ebx*4 + 16]
18396 movd mm5, [eax + ebx*4 + 20]
18397 punpckldq mm3, [eax + ebx*4 + 24]
18398 punpckldq mm4, [eax + ebx*4 + 28]
18399 punpckldq mm5, [eax + ebx*4 + 32] ;# coords of H1 in low mm3-mm5, H2 in high
18401 pfadd mm0, mm3
18402 pfadd mm1, mm4
18403 pfadd mm2, mm5
18404 movq [esp + mci1020_ixH], mm0
18405 movq [esp + mci1020_iyH], mm1
18406 movq [esp + mci1020_izH], mm2
18408 ;# clear vctot
18409 pxor mm7,mm7
18410 movq [esp + mci1020_vctot], mm7
18412 mov eax, [ebp + mci1020_jindex]
18413 mov ecx, [eax] ;# jindex[n]
18414 mov edx, [eax + 4] ;# jindex[n+1]
18415 add dword ptr [ebp + mci1020_jindex], 4
18416 sub edx, ecx ;# number of innerloop atoms
18417 mov [esp + mci1020_innerk], edx ;# number of innerloop atoms
18419 mov esi, [ebp + mci1020_pos]
18420 mov eax, [ebp + mci1020_jjnr]
18421 shl ecx, 2
18422 add eax, ecx
18423 mov [esp + mci1020_innerjjnr], eax ;# pointer to jjnr[nj0]
18424 .mci1020_inner_loop:
18425 ;# a single j particle iteration here - compare with the unrolled code for comments
;# Load the jjnr pointer through ecx so that the prefetch below targets
;# the j index list. ecx previously held jindex[n]*4 or the charge base
;# at this point, so the hint did not address jjnr[]. ecx is reloaded
;# with the charge base right after the prefetch.
18426 mov ecx, [esp + mci1020_innerjjnr] ;# ecx = pointer to jjnr[k]
18427 mov eax, [ecx] ;# eax=jnr offset
18428 add dword ptr [esp + mci1020_innerjjnr], 4 ;# advance pointer
18429 prefetch [ecx + 16] ;# prefetch upcoming jjnr entries
18431 mov ecx, [ebp + mci1020_charge]
18432 movd mm7, [ecx + eax*4]
18433 punpckldq mm7,mm7
18434 movq mm6,mm7
18435 pfmul mm6, [esp + mci1020_iqO]
18436 pfmul mm7, [esp + mci1020_iqH] ;# mm6=qqO, mm7=qqH
18438 lea eax, [eax + eax*2]
18440 movq mm0, [esi + eax*4]
18441 movd mm1, [esi + eax*4 + 8]
18442 ;# copy & expand to mm2-mm4 for the H interactions
18443 movq mm2, mm0
18444 movq mm3, mm0
18445 movq mm4, mm1
18446 punpckldq mm2,mm2
18447 punpckhdq mm3,mm3
18448 punpckldq mm4,mm4
18450 pfsubr mm0, [esp + mci1020_ixO]
18451 pfsubr mm1, [esp + mci1020_izO]
18453 pfmul mm0,mm0
18454 pfmul mm1,mm1
;# only the low half of mm0 is meaningful from here on (rsqO)
18455 pfacc mm0, mm1
18456 pfadd mm0, mm1 ;# mm0=rsqO
18458 punpckldq mm2, mm2
18459 punpckldq mm3, mm3
18460 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
18461 pfsubr mm2, [esp + mci1020_ixH]
18462 pfsubr mm3, [esp + mci1020_iyH]
18463 pfsubr mm4, [esp + mci1020_izH] ;# mm2-mm4 is dxH-dzH
18465 pfmul mm2,mm2
18466 pfmul mm3,mm3
18467 pfmul mm4,mm4
18469 pfadd mm3,mm2
18470 pfadd mm3,mm4 ;# mm3=rsqH
18472 pfrsqrt mm1,mm0
18474 movq mm2,mm1
18475 pfmul mm1,mm1
18476 pfrsqit1 mm1,mm0
18477 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
18478 ;# calculate potential (energy only, no force in this routine)
18479 pfmul mm6, mm1 ;# mm6=vcoul
;# pfrsqrt only looks at the low half, so swap halves to seed H2 as well
18481 pfrsqrt mm5, mm3
18482 pswapd mm3,mm3
18483 pfrsqrt mm2, mm3
18484 pswapd mm3,mm3
18485 punpckldq mm5,mm2 ;# seeds are in mm5 now, and rsq in mm3
18487 movq mm2, mm5
18488 pfmul mm5,mm5
18489 pfrsqit1 mm5,mm3
18490 pfrcpit2 mm5,mm2 ;# mm5=invsqrt
18491 pfmul mm7, mm5 ;# mm7=vcoul
18492 ;# update vctot
18493 pfadd mm7, mm6
18494 pfadd mm7, [esp + mci1020_vctot]
18495 movq [esp + mci1020_vctot], mm7
18497 ;# done - one more?
18498 dec dword ptr [esp + mci1020_innerk]
18499 jz .mci1020_updateouterdata
18500 jmp .mci1020_inner_loop
18501 .mci1020_updateouterdata:
18502 mov edx, [ebp + mci1020_gid] ;# get group index for this i particle
18503 mov edx, [edx]
18504 add dword ptr [ebp + mci1020_gid], 4 ;# advance pointer
18506 movq mm7, [esp + mci1020_vctot]
18507 pfacc mm7,mm7 ;# get and sum the two parts of total potential
18509 mov eax, [ebp + mci1020_Vc]
18510 movd mm6, [eax + edx*4]
18511 pfadd mm6, mm7
18512 movd [eax + edx*4], mm6 ;# increment vc[gid]
18514 ;# finish if last
18515 dec dword ptr [ebp + mci1020_nri]
18516 jz .mci1020_end
18517 ;# not last, iterate once more!
18518 jmp .mci1020_outer
18519 .mci1020_end:
18520 femms
18521 add esp, 76
18522 pop edi
18523 pop esi
18524 pop edx
18525 pop ecx
18526 pop ebx
18527 pop eax
18528 leave
;# leave only restores esp/ebp; without ret execution would run on into
;# the routine that follows
18529 ret
;# ---------------------------------------------------------------------
;# mcinl1030_3dnow
;#
;# Coulomb-only, energy-only inner loop where both the i entry and every
;# j entry are three-atom groups (named O, H1, H2 in the locals) stored
;# contiguously in pos[]. For each j index from
;# jjnr[jindex[n] .. jindex[n+1]-1] the three j atoms are handled in turn
;# (j O, j H1, j H2), each against i O (low lane of mm0/mm1) and against
;# i H1/H2 (two lanes of mm2-mm4). The accumulated energy is added to
;# Vc[gid[n]]; no forces are written.
;#
;# The charge products qqOO, qqOH and qqHH (each scaled by facel) are
;# computed once from charge[ii] and charge[ii+1] of the first iinr entry
;# and reused for every i/j pair.
;#
;# Arguments are read from the stack relative to ebp (offsets below).
;# The argument slots for nri, iinr, jindex, shift and gid are updated in
;# place while iterating.
;# eax, ebx, ecx, edx, esi and edi are saved and restored. femms is issued
;# on entry and on exit.
;# Both loops are bottom-tested: at least one i entry, and at least one
;# j entry per i entry, are assumed.
;# ---------------------------------------------------------------------
18532 .globl mcinl1030_3dnow
18533 .globl _mcinl1030_3dnow
18534 mcinl1030_3dnow:
18535 _mcinl1030_3dnow:
;# argument offsets relative to ebp (valid after push ebp / mov ebp,esp)
18536 .equiv mci1030_nri, 8
18537 .equiv mci1030_iinr, 12
18538 .equiv mci1030_jindex, 16
18539 .equiv mci1030_jjnr, 20
18540 .equiv mci1030_shift, 24
18541 .equiv mci1030_shiftvec, 28
18542 .equiv mci1030_gid, 32
18543 .equiv mci1030_pos, 36
18544 .equiv mci1030_charge, 40
18545 .equiv mci1030_facel, 44
18546 .equiv mci1030_Vc, 48
18547 ;# stack offsets for local variables
18548 .equiv mci1030_is3, 0
18549 .equiv mci1030_ii3, 4
18550 .equiv mci1030_ixO, 8
18551 .equiv mci1030_iyO, 12
18552 .equiv mci1030_izO, 16
18553 .equiv mci1030_ixH, 20
18554 .equiv mci1030_iyH, 28
18555 .equiv mci1030_izH, 36
18556 .equiv mci1030_qqOO, 44
18557 .equiv mci1030_qqOH, 52
18558 .equiv mci1030_qqHH, 60
18559 .equiv mci1030_vctot, 68
18560 .equiv mci1030_innerjjnr, 76
18561 .equiv mci1030_innerk, 80
18562 push ebp
18563 mov ebp,esp
18564 push eax
18565 push ebx
18566 push ecx
18567 push edx
18568 push esi
18569 push edi
18570 sub esp, 84 ;# local stack space
18571 femms
18572 ;# assume we have at least one i particle - start directly
;# the iinr pointer is only peeked at here; it is advanced in the outer loop
18574 mov ecx, [ebp + mci1030_iinr] ;# ecx = pointer into iinr[]
18575 mov ebx, [ecx] ;# ebx=ii
18577 mov edx, [ebp + mci1030_charge]
18578 movd mm1, [ebp + mci1030_facel] ;# mm1=facel
18579 movd mm2, [edx + ebx*4] ;# mm2=charge[ii0] (O)
18580 movd mm3, [edx + ebx*4 + 4] ;# mm3=charge[ii0+1] (H)
18581 movq mm4, mm2
18582 pfmul mm4, mm1 ;# mm4=qO*facel
18583 movq mm6, mm3
18584 pfmul mm6, mm1 ;# mm6=qH*facel
18585 movq mm5, mm4
18586 pfmul mm4, mm2 ;# mm4=qqOO*facel
18587 pfmul mm5, mm3 ;# mm5=qqOH*facel
18588 pfmul mm6, mm3 ;# mm6=qqHH*facel
18589 punpckldq mm5,mm5 ;# spread to both halves
18590 punpckldq mm6,mm6 ;# spread to both halves
18591 movq [esp + mci1030_qqOO], mm4
18592 movq [esp + mci1030_qqOH], mm5
18593 movq [esp + mci1030_qqHH], mm6
18594 .mci1030_outer:
18595 mov eax, [ebp + mci1030_shift] ;# eax = pointer into shift[]
18596 mov ebx, [eax] ;# ebx=shift[n]
18597 add dword ptr [ebp + mci1030_shift], 4 ;# advance pointer one step
18599 lea ebx, [ebx + ebx*2] ;# ebx=3*is
18600 mov [esp + mci1030_is3],ebx ;# store is3
18602 mov eax, [ebp + mci1030_shiftvec] ;# eax = base of shiftvec[]
18604 movq mm5, [eax + ebx*4] ;# move shX/shY to mm5 and shZ to mm6
18605 movd mm6, [eax + ebx*4 + 8]
18606 movq mm0, mm5
18607 movq mm1, mm5
18608 movq mm2, mm6
18609 punpckldq mm0,mm0 ;# also expand shX,Y,Z in mm0--mm2
18610 punpckhdq mm1,mm1
18611 punpckldq mm2,mm2
18613 mov ecx, [ebp + mci1030_iinr] ;# ecx = pointer into iinr[]
18614 add dword ptr [ebp + mci1030_iinr], 4 ;# advance pointer
18615 mov ebx, [ecx] ;# ebx=ii
18617 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
18618 mov eax, [ebp + mci1030_pos] ;# eax = base of pos[]
18620 pfadd mm5, [eax + ebx*4] ;# ix = shX + posX (and iy too)
18621 movd mm7, [eax + ebx*4 + 8] ;# cant use direct memory add for 4 bytes (iz)
18622 mov [esp + mci1030_ii3], ebx ;# (use mm7 as temp storage for iz)
18623 pfadd mm6, mm7
18624 movq [esp + mci1030_ixO], mm5
;# 8-byte store: the upper dword lands on the ixH slot, which is
;# rewritten by the movq to ixH a few lines below
18625 movq [esp + mci1030_izO], mm6
18627 movd mm3, [eax + ebx*4 + 12]
18628 movd mm4, [eax + ebx*4 + 16]
18629 movd mm5, [eax + ebx*4 + 20]
18630 punpckldq mm3, [eax + ebx*4 + 24]
18631 punpckldq mm4, [eax + ebx*4 + 28]
18632 punpckldq mm5, [eax + ebx*4 + 32] ;# coords of H1 in low mm3-mm5, H2 in high
18634 pfadd mm0, mm3
18635 pfadd mm1, mm4
18636 pfadd mm2, mm5
18637 movq [esp + mci1030_ixH], mm0
18638 movq [esp + mci1030_iyH], mm1
18639 movq [esp + mci1030_izH], mm2
18641 ;# clear vctot
18642 pxor mm7,mm7
18643 movq [esp + mci1030_vctot], mm7
18645 mov eax, [ebp + mci1030_jindex]
18646 mov ecx, [eax] ;# jindex[n]
18647 mov edx, [eax + 4] ;# jindex[n+1]
18648 add dword ptr [ebp + mci1030_jindex], 4
18649 sub edx, ecx ;# number of innerloop atoms
18650 mov [esp + mci1030_innerk], edx ;# number of innerloop atoms
18652 mov esi, [ebp + mci1030_pos]
18653 mov eax, [ebp + mci1030_jjnr]
18654 shl ecx, 2
18655 add eax, ecx
18656 mov [esp + mci1030_innerjjnr], eax ;# pointer to jjnr[nj0]
18657 .mci1030_inner_loop:
18658 ;# one j entry per iteration: j O first, then j H1 and j H2 below
18659 mov eax, [esp + mci1030_innerjjnr]
18660 mov eax, [eax] ;# eax=jnr offset
18661 add dword ptr [esp + mci1030_innerjjnr], 4 ;# advance pointer
18663 movd mm6, [esp + mci1030_qqOO]
18664 movq mm7, [esp + mci1030_qqOH]
;# eax = 3*jnr is kept for the whole iteration (used again at +12 and +24)
18666 lea eax, [eax + eax*2]
18667 movq mm0, [esi + eax*4]
18668 movd mm1, [esi + eax*4 + 8]
18669 ;# copy & expand to mm2-mm4 for the H interactions
18670 movq mm2, mm0
18671 movq mm3, mm0
18672 movq mm4, mm1
18673 punpckldq mm2,mm2
18674 punpckhdq mm3,mm3
18675 punpckldq mm4,mm4
18677 pfsubr mm0, [esp + mci1030_ixO]
18678 pfsubr mm1, [esp + mci1030_izO]
18680 pfmul mm0,mm0
18681 pfmul mm1,mm1
;# only the low half of mm0 is meaningful from here on (rsqO)
18682 pfacc mm0, mm0
18683 pfadd mm0, mm1 ;# mm0=rsqO
18685 punpckldq mm2, mm2
18686 punpckldq mm3, mm3
18687 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
18688 pfsubr mm2, [esp + mci1030_ixH]
18689 pfsubr mm3, [esp + mci1030_iyH]
18690 pfsubr mm4, [esp + mci1030_izH] ;# mm2-mm4 is dxH-dzH
18692 pfmul mm2,mm2
18693 pfmul mm3,mm3
18694 pfmul mm4,mm4
18696 pfadd mm3,mm2
18697 pfadd mm3,mm4 ;# mm3=rsqH
18699 pfrsqrt mm1,mm0
18701 movq mm2,mm1
18702 pfmul mm1,mm1
18703 pfrsqit1 mm1,mm0
18704 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
18705 ;# calculate potential (energy only, no force in this routine)
18706 pfmul mm6, mm1 ;# mm6=vcoul
;# pfrsqrt only looks at the low half, so swap halves to seed H2 as well
18708 pfrsqrt mm5, mm3
18709 pswapd mm3,mm3
18710 pfrsqrt mm2, mm3
18711 pswapd mm3,mm3
18712 punpckldq mm5,mm2 ;# seeds are in mm5 now, and rsq in mm3
18714 movq mm2, mm5
18715 pfmul mm5,mm5
18716 pfrsqit1 mm5,mm3
18717 pfrcpit2 mm5,mm2 ;# mm5=invsqrt
18718 pfmul mm7, mm5 ;# mm7=vcoul
18719 ;# update vctot
18720 pfadd mm7, mm6
18721 pfadd mm7, [esp + mci1030_vctot]
18722 movq [esp + mci1030_vctot], mm7
18724 ;# interactions with j H1
18725 movq mm0, [esi + eax*4 + 12]
18726 movd mm1, [esi + eax*4 + 20]
18727 ;# copy & expand to mm2-mm4 for the H interactions
18728 movq mm2, mm0
18729 movq mm3, mm0
18730 movq mm4, mm1
18731 punpckldq mm2,mm2
18732 punpckhdq mm3,mm3
18733 punpckldq mm4,mm4
18735 movd mm6, [esp + mci1030_qqOH]
18736 movq mm7, [esp + mci1030_qqHH]
18738 pfsubr mm0, [esp + mci1030_ixO]
18739 pfsubr mm1, [esp + mci1030_izO]
18741 pfmul mm0,mm0
18742 pfmul mm1,mm1
18743 pfacc mm0, mm1
18744 pfadd mm0, mm1 ;# mm0=rsqO
18746 punpckldq mm2, mm2
18747 punpckldq mm3, mm3
18748 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
18749 pfsubr mm2, [esp + mci1030_ixH]
18750 pfsubr mm3, [esp + mci1030_iyH]
18751 pfsubr mm4, [esp + mci1030_izH] ;# mm2-mm4 is dxH-dzH
18753 pfmul mm2,mm2
18754 pfmul mm3,mm3
18755 pfmul mm4,mm4
18757 pfadd mm3,mm2
18758 pfadd mm3,mm4 ;# mm3=rsqH
18760 pfrsqrt mm1,mm0
18762 movq mm2,mm1
18763 pfmul mm1,mm1
18764 pfrsqit1 mm1,mm0
18765 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
18766 ;# calculate potential (energy only, no force in this routine)
18767 pfmul mm6, mm1 ;# mm6=vcoul
18769 pfrsqrt mm5, mm3
18770 pswapd mm3,mm3
18771 pfrsqrt mm2, mm3
18772 pswapd mm3,mm3
18773 punpckldq mm5,mm2 ;# seeds are in mm5 now, and rsq in mm3
18775 movq mm2, mm5
18776 pfmul mm5,mm5
18777 pfrsqit1 mm5,mm3
18778 pfrcpit2 mm5,mm2 ;# mm5=invsqrt
18779 pfmul mm7, mm5 ;# mm7=vcoul
18780 ;# update vctot
18781 pfadd mm7, mm6
18782 pfadd mm7, [esp + mci1030_vctot]
18783 movq [esp + mci1030_vctot], mm7
18785 ;# interactions with j H2
18786 movq mm0, [esi + eax*4 + 24]
18787 movd mm1, [esi + eax*4 + 32]
18788 ;# copy & expand to mm2-mm4 for the H interactions
18789 movq mm2, mm0
18790 movq mm3, mm0
18791 movq mm4, mm1
18792 punpckldq mm2,mm2
18793 punpckhdq mm3,mm3
18794 punpckldq mm4,mm4
18796 movd mm6, [esp + mci1030_qqOH]
18797 movq mm7, [esp + mci1030_qqHH]
18799 pfsubr mm0, [esp + mci1030_ixO]
18800 pfsubr mm1, [esp + mci1030_izO]
18802 pfmul mm0,mm0
18803 pfmul mm1,mm1
18804 pfacc mm0, mm1
18805 pfadd mm0, mm1 ;# mm0=rsqO
18807 punpckldq mm2, mm2
18808 punpckldq mm3, mm3
18809 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
18810 pfsubr mm2, [esp + mci1030_ixH]
18811 pfsubr mm3, [esp + mci1030_iyH]
18812 pfsubr mm4, [esp + mci1030_izH] ;# mm2-mm4 is dxH-dzH
18814 pfmul mm2,mm2
18815 pfmul mm3,mm3
18816 pfmul mm4,mm4
18818 pfadd mm3,mm2
18819 pfadd mm3,mm4 ;# mm3=rsqH
18821 pfrsqrt mm1,mm0
18823 movq mm2,mm1
18824 pfmul mm1,mm1
18825 pfrsqit1 mm1,mm0
18826 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
18827 ;# calculate potential (energy only, no force in this routine)
18828 pfmul mm6, mm1 ;# mm6=vcoul
18830 pfrsqrt mm5, mm3
18831 pswapd mm3,mm3
18832 pfrsqrt mm2, mm3
18833 pswapd mm3,mm3
18834 punpckldq mm5,mm2 ;# seeds are in mm5 now, and rsq in mm3
18836 movq mm2, mm5
18837 pfmul mm5,mm5
18838 pfrsqit1 mm5,mm3
18839 pfrcpit2 mm5,mm2 ;# mm5=invsqrt
18840 pfmul mm7, mm5 ;# mm7=vcoul
18841 ;# update vctot
18842 pfadd mm7, mm6
18843 pfadd mm7, [esp + mci1030_vctot]
18844 movq [esp + mci1030_vctot], mm7
18846 ;# done - one more?
18847 dec dword ptr [esp + mci1030_innerk]
18848 jz .mci1030_updateouterdata
18849 jmp .mci1030_inner_loop
18850 .mci1030_updateouterdata:
18851 mov edx, [ebp + mci1030_gid] ;# get group index for this i particle
18852 mov edx, [edx]
18853 add dword ptr [ebp + mci1030_gid], 4 ;# advance pointer
18855 movq mm7, [esp + mci1030_vctot]
18856 pfacc mm7,mm7 ;# get and sum the two parts of total potential
18858 mov eax, [ebp + mci1030_Vc]
18859 movd mm6, [eax + edx*4]
18860 pfadd mm6, mm7
18861 movd [eax + edx*4], mm6 ;# increment vc[gid]
18862 ;# finish if last
18863 dec dword ptr [ebp + mci1030_nri]
18864 jz .mci1030_end
18865 ;# not last, iterate once more!
18866 jmp .mci1030_outer
18867 .mci1030_end:
18868 femms
18869 add esp, 84
18870 pop edi
18871 pop esi
18872 pop edx
18873 pop ecx
18874 pop ebx
18875 pop eax
18876 leave
;# leave only restores esp/ebp; without ret execution would run on into
;# the routine that follows
18877 ret
;# ---------------------------------------------------------------------
;# mcinl1100_3dnow
;#
;# Energy-only inner loop with a Coulomb term and a c12/r^12 - c6/r^6
;# term; no forces are written. For each i atom iinr[n] the j list
;# jjnr[jindex[n] .. jindex[n+1]-1] is processed two atoms at a time,
;# with one optional single-atom step for an odd count.
;# The parameter pair for an i/j combination is read from nbfp[] at
;# index tja = 2*ntype*type[ii] + 2*type[jnr] (c6 at tja, c12 at tja+1).
;# The Coulomb sum is added to Vc[gid[n]], the c12/c6 sum to Vnb[gid[n]].
;#
;# Arguments are read from the stack relative to ebp (offsets below).
;# The argument slots for nri, iinr, jindex, shift and gid are updated in
;# place while iterating.
;# eax, ebx, ecx, edx, esi and edi are saved and restored. femms is issued
;# on entry and on exit.
;# The outer loop is bottom-tested, so at least one i entry is assumed.
;# ---------------------------------------------------------------------
18880 .globl mcinl1100_3dnow
18881 .globl _mcinl1100_3dnow
18882 mcinl1100_3dnow:
18883 _mcinl1100_3dnow:
;# argument offsets relative to ebp (valid after push ebp / mov ebp,esp)
18884 .equiv mci1100_nri, 8
18885 .equiv mci1100_iinr, 12
18886 .equiv mci1100_jindex, 16
18887 .equiv mci1100_jjnr, 20
18888 .equiv mci1100_shift, 24
18889 .equiv mci1100_shiftvec, 28
18890 .equiv mci1100_gid, 32
18891 .equiv mci1100_pos, 36
18892 .equiv mci1100_charge, 40
18893 .equiv mci1100_facel, 44
18894 .equiv mci1100_Vc, 48
18895 .equiv mci1100_type, 52
18896 .equiv mci1100_ntype, 56
18897 .equiv mci1100_nbfp, 60
18898 .equiv mci1100_Vnb, 64
18899 ;# stack offsets for local variables
18900 .equiv mci1100_is3, 0
18901 .equiv mci1100_ii3, 4
18902 .equiv mci1100_ix, 8
18903 .equiv mci1100_iy, 12
18904 .equiv mci1100_iz, 16
18905 .equiv mci1100_iq, 20
18906 .equiv mci1100_vctot, 28
18907 .equiv mci1100_vnbtot, 36
18908 .equiv mci1100_c6, 44
18909 .equiv mci1100_c12, 52
18910 .equiv mci1100_ntia, 60
18911 .equiv mci1100_innerjjnr, 64
18912 .equiv mci1100_innerk, 68
18913 push ebp
18914 mov ebp,esp
18916 push eax
18917 push ebx
18918 push ecx
18919 push edx
18920 push esi
18921 push edi
18922 sub esp, 72 ;# local stack space
18923 femms
18924 ;# move data to local stack
18925 ;# assume we have at least one i particle - start directly
18926 .mci1100_outer:
18927 mov eax, [ebp + mci1100_shift] ;# eax = pointer into shift[]
18928 mov ebx, [eax] ;# ebx=shift[n]
18929 add dword ptr [ebp + mci1100_shift], 4 ;# advance pointer one step
18931 lea ebx, [ebx + ebx*2] ;# ebx=3*is
18932 mov [esp + mci1100_is3],ebx ;# store is3
18934 mov eax, [ebp + mci1100_shiftvec] ;# eax = base of shiftvec[]
18936 movq mm0, [eax + ebx*4] ;# move shX/shY to mm0 and shZ to mm1
18937 movd mm1, [eax + ebx*4 + 8]
18939 mov ecx, [ebp + mci1100_iinr] ;# ecx = pointer into iinr[]
18940 add dword ptr [ebp + mci1100_iinr], 4 ;# advance pointer
18941 mov ebx, [ecx] ;# ebx=ii
18943 mov edx, [ebp + mci1100_charge]
18944 movd mm2, [edx + ebx*4] ;# mm2=charge[ii]
;# 64-bit memory operand: only the low dword (facel) matters, the high
;# half of mm2 is replaced by the punpckldq that follows
18945 pfmul mm2, [ebp + mci1100_facel]
18946 punpckldq mm2,mm2 ;# spread to both halves
18947 movq [esp + mci1100_iq], mm2 ;# iq =facel*charge[ii]
18949 mov edx, [ebp + mci1100_type]
18950 mov edx, [edx + ebx*4]
18951 imul edx, [ebp + mci1100_ntype]
18952 shl edx, 1
18953 mov [esp + mci1100_ntia], edx ;# ntia = 2*ntype*type[ii]
18955 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
18956 mov eax, [ebp + mci1100_pos] ;# eax = base of pos[]
18958 pfadd mm0, [eax + ebx*4] ;# ix = shX + posX (and iy too)
18959 movd mm3, [eax + ebx*4 + 8] ;# cant use direct memory add for 4 bytes (iz)
18960 mov [esp + mci1100_ii3], ebx
18961 pfadd mm1, mm3
18962 movq [esp + mci1100_ix], mm0
18963 movd [esp + mci1100_iz], mm1
18965 ;# clear total potentials
18966 pxor mm7,mm7
18967 movq [esp + mci1100_vctot], mm7
18968 movq [esp + mci1100_vnbtot], mm7
18970 mov eax, [ebp + mci1100_jindex]
18971 mov ecx, [eax] ;# jindex[n]
18972 mov edx, [eax + 4] ;# jindex[n+1]
18973 add dword ptr [ebp + mci1100_jindex], 4
18974 sub edx, ecx ;# number of innerloop atoms
18976 mov esi, [ebp + mci1100_pos]
18977 mov eax, [ebp + mci1100_jjnr]
18978 shl ecx, 2
18979 add eax, ecx
18980 mov [esp + mci1100_innerjjnr], eax ;# pointer to jjnr[nj0]
18981 sub edx, 2
18982 mov [esp + mci1100_innerk], edx ;# number of innerloop atoms
18983 jge .mci1100_unroll_loop
18984 jmp .mci1100_finish_inner
18985 .mci1100_unroll_loop:
18986 ;# paired innerloop starts here
18987 mov ecx, [esp + mci1100_innerjjnr] ;# pointer to jjnr[k]
18988 mov eax, [ecx]
18989 mov ebx, [ecx + 4] ;# eax/ebx=jnr
18990 add dword ptr [esp + mci1100_innerjjnr], 8 ;# advance pointer (unrolled 2)
18991 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
18993 mov ecx, [ebp + mci1100_charge] ;# base of charge[]
18994 movq mm5, [esp + mci1100_iq]
18995 movd mm3, [ecx + eax*4] ;# charge[jnr1]
18996 punpckldq mm3, [ecx + ebx*4] ;# move charge 2 to high part of mm3
18997 pfmul mm3,mm5 ;# mm3 now has qq for both particles
18999 mov ecx, [ebp + mci1100_type]
19000 mov edx, [ecx + eax*4] ;# type [jnr1]
19001 mov ecx, [ecx + ebx*4] ;# type [jnr2]
;# esi is borrowed for the nbfp lookup and reloaded with pos below
19003 mov esi, [ebp + mci1100_nbfp] ;# base of nbfp
19004 shl edx, 1
19005 shl ecx, 1
19006 add edx, [esp + mci1100_ntia] ;# tja = ntia + 2*type
19007 add ecx, [esp + mci1100_ntia]
19009 movq mm5, [esi + edx*4] ;# mm5 = 1st c6 / c12
19010 movq mm7, [esi + ecx*4] ;# mm7 = 2nd c6 / c12
19011 movq mm6,mm5
19012 punpckldq mm5,mm7 ;# mm5 = 1st c6 / 2nd c6
19013 punpckhdq mm6,mm7 ;# mm6 = 1st c12 / 2nd c12
19014 movq [esp + mci1100_c6], mm5
19015 movq [esp + mci1100_c12], mm6
19017 lea eax, [eax + eax*2] ;# replace jnr with j3
19018 lea ebx, [ebx + ebx*2]
19020 mov esi, [ebp + mci1100_pos]
19022 movq mm0, [esp + mci1100_ix]
19023 movd mm1, [esp + mci1100_iz]
19024 movq mm4, [esi + eax*4] ;# fetch first j coordinates
19025 movd mm5, [esi + eax*4 + 8]
19026 pfsubr mm4,mm0 ;# dr = ir - jr
19027 pfsubr mm5,mm1
19028 pfmul mm4,mm4 ;# square dx,dy,dz
19029 pfmul mm5,mm5
19030 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
19031 pfacc mm4, mm5 ;# first rsq in lower mm4
19033 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
19034 movd mm7, [esi + ebx*4 + 8]
19036 pfsubr mm6,mm0 ;# dr = ir - jr
19037 pfsubr mm7,mm1
19038 pfmul mm6,mm6 ;# square dx,dy,dz
19039 pfmul mm7,mm7
19040 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
19041 pfacc mm6, mm7 ;# second rsq in lower mm6
19043 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
19044 pfrsqrt mm1, mm6
19046 punpckldq mm0,mm1
19047 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs
19048 movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision
19049 pfmul mm0,mm0
19050 pfrsqit1 mm0,mm4
19051 pfrcpit2 mm0,mm2
19052 movq mm1,mm0
19053 pfmul mm0,mm0
19054 ;# mm0 now contains invsq, and mm1 invsqrt
19055 ;# do potential (energy only, no force in this routine)
19056 movq mm4, mm0
19057 pfmul mm4, mm0
19058 pfmul mm4, mm0 ;# mm4=rinvsix
19059 movq mm5, mm4
19060 pfmul mm5, mm5 ;# mm5=rinvtwelve
19062 pfmul mm3, mm1 ;# mm3 has vcoul for both interactions
19063 pfmul mm5, [esp + mci1100_c12]
19064 pfmul mm4, [esp + mci1100_c6]
19065 movq mm6, mm5 ;# mm6 is vnb12-vnb6
19066 pfsub mm6, mm4
19067 ;# update vctot
19068 pfadd mm3, [esp + mci1100_vctot] ;# add the earlier value
19069 movq [esp + mci1100_vctot], mm3 ;# store the sum
19070 ;# update vnbtot
19071 pfadd mm6, [esp + mci1100_vnbtot] ;# add the earlier value
19072 movq [esp + mci1100_vnbtot], mm6 ;# store the sum
19074 ;# should we do one more iteration?
19075 sub dword ptr [esp + mci1100_innerk], 2
19076 jl .mci1100_finish_inner
19077 jmp .mci1100_unroll_loop
19078 .mci1100_finish_inner:
;# innerk is -1 here when one j atom is left over and -2 when none is,
;# so bit 0 tells whether the single-atom step is needed
19079 and dword ptr [esp + mci1100_innerk], 1
19080 jnz .mci1100_single_inner
19081 jmp .mci1100_updateouterdata
19082 .mci1100_single_inner:
19083 ;# a single j particle iteration here - compare with the unrolled code for comments
19084 mov eax, [esp + mci1100_innerjjnr]
19085 mov eax, [eax] ;# eax=jnr offset
19087 mov ecx, [ebp + mci1100_charge]
19088 movd mm5, [esp + mci1100_iq]
19089 movd mm3, [ecx + eax*4]
19090 pfmul mm3, mm5 ;# mm3=qq
19092 mov esi, [ebp + mci1100_nbfp]
19093 mov ecx, [ebp + mci1100_type]
19094 mov edx, [ecx + eax*4] ;# type [jnr1]
19095 shl edx, 1
19096 add edx, [esp + mci1100_ntia] ;# tja = ntia + 2*type
;# movd clears the upper half of mm5, so the 8-byte stores below leave a
;# zero in the high lane of c6 and c12
19097 movd mm5, [esi + edx*4] ;# mm5 = 1st c6
19098 movq [esp + mci1100_c6], mm5
19099 movd mm5, [esi + edx*4 + 4] ;# mm5 = 1st c12
19100 movq [esp + mci1100_c12], mm5
19103 mov esi, [ebp + mci1100_pos]
19104 lea eax, [eax + eax*2]
19106 movq mm0, [esp + mci1100_ix]
19107 movd mm1, [esp + mci1100_iz]
19108 movq mm4, [esi + eax*4]
19109 movd mm5, [esi + eax*4 + 8]
19110 pfsubr mm4, mm0
19111 pfsubr mm5, mm1
19112 pfmul mm4,mm4
19113 pfmul mm5,mm5
19114 pfacc mm4, mm5
19115 pfacc mm4, mm5 ;# mm4=rsq (low half)
19117 pfrsqrt mm0,mm4
19118 movq mm2,mm0
19119 pfmul mm0,mm0
19120 pfrsqit1 mm0,mm4
19121 pfrcpit2 mm0,mm2 ;# mm0=invsqrt
19122 movq mm1, mm0 ;# keep invsqrt in mm1
19123 pfmul mm0, mm0 ;# mm0=invsq
19124 ;# calculate potentials (energy only, no force in this routine)
19125 movq mm4, mm0
19126 pfmul mm4, mm0
19127 pfmul mm4, mm0 ;# mm4=rinvsix
19128 movq mm5, mm4
19129 pfmul mm5, mm5 ;# mm5=rinvtwelve
19131 pfmul mm3, mm1 ;# mm3=vcoul for this single interaction
19132 pfmul mm5, [esp + mci1100_c12]
19133 pfmul mm4, [esp + mci1100_c6]
19134 movq mm6, mm5 ;# mm6 is vnb12-vnb6
19135 pfsub mm6, mm4
19136 ;# update vctot
19137 pfadd mm3, [esp + mci1100_vctot]
19138 movq [esp + mci1100_vctot], mm3
19139 ;# update vnbtot
19140 pfadd mm6, [esp + mci1100_vnbtot] ;# add the earlier value
19141 movq [esp + mci1100_vnbtot], mm6 ;# store the sum
19143 .mci1100_updateouterdata:
19144 mov edx, [ebp + mci1100_gid] ;# get group index for this i particle
19145 mov edx, [edx]
19146 add dword ptr [ebp + mci1100_gid], 4 ;# advance pointer
19148 movq mm7, [esp + mci1100_vctot]
19149 pfacc mm7,mm7 ;# get and sum the two parts of total potential
19151 mov eax, [ebp + mci1100_Vc]
19152 movd mm6, [eax + edx*4]
19153 pfadd mm6, mm7
19154 movd [eax + edx*4], mm6 ;# increment vc[gid]
19156 movq mm7, [esp + mci1100_vnbtot]
19157 pfacc mm7,mm7 ;# get and sum the two parts of total potential
19159 mov eax, [ebp + mci1100_Vnb]
19160 movd mm6, [eax + edx*4]
19161 pfadd mm6, mm7
19162 movd [eax + edx*4], mm6 ;# increment vnb[gid]
19164 ;# finish if last
19165 mov ecx, [ebp + mci1100_nri]
19166 dec ecx
19167 jecxz .mci1100_end
19168 ;# not last, iterate once more!
19169 mov [ebp + mci1100_nri], ecx
19170 jmp .mci1100_outer
19171 .mci1100_end:
19172 femms
19173 add esp, 72
19174 pop edi
19175 pop esi
19176 pop edx
19177 pop ecx
19178 pop ebx
19179 pop eax
19180 leave
;# leave only restores esp/ebp; without ret execution would run on into
;# the routine that follows
19181 ret
;# ---------------------------------------------------------------------
;# mcinl1110_3dnow
;#
;# Energy-only inner loop: accumulates the Coulomb potential (qq*rinv) and
;# the Lennard-Jones potential (c12*rinv^12 - c6*rinv^6) into Vc[gid] and
;# Vnb[gid]. No forces are computed or stored by this routine.
;#
;# Each outer iteration handles a run of consecutive i atoms starting at
;# iinr[n]. nsatoms[] supplies three ints per outer iteration that split
;# the run into three consecutive groups, processed in this order:
;#   nsvdwc = nsatoms[1]              atoms with LJ + Coulomb
;#   nscoul = nsatoms[2]-nsatoms[1]   atoms with Coulomb only
;#   nsvdw  = nsatoms[0]-nsatoms[2]   atoms with LJ only
;# Every i atom walks the same j list (innerjjnr0 / innerk0), two j atoms
;# per pass in the unrolled loops plus one optional single-j remainder.
;#
;# Arguments are read from the caller's stack frame through ebp:
;#   nri      (ebp+8)  int, number of outer iterations (must be >= 1,
;#                     the loop is entered without testing it)
;#   iinr     (ebp+12) pointer to int, first i atom of each outer entry
;#   jindex   (ebp+16) pointer to int, jindex[n]..jindex[n+1] bound the j list
;#   jjnr     (ebp+20) base of the j atom index list
;#   shift    (ebp+24) pointer to int, shift index of each outer entry
;#   shiftvec (ebp+28) base of the shift vectors, 3 floats each
;#   gid      (ebp+32) pointer to int, energy group of each outer entry
;#   pos      (ebp+36) base of the coordinates, 3 floats per atom
;#   charge   (ebp+40) base of the per-atom charges
;#   facel    (ebp+44) float by value, multiplied into the i charge
;#   Vc       (ebp+48) base of the Coulomb energy sums, indexed by gid
;#   type     (ebp+52) base of the per-atom type indices
;#   ntype    (ebp+56) int by value
;#   nbfp     (ebp+60) base of the (c6,c12) float pairs
;#   Vnb      (ebp+64) base of the LJ energy sums, indexed by gid
;#   nsatoms  (ebp+68) pointer to int triplets, see above
;# The pointer-valued argument slots (shift, iinr, nsatoms, jindex, gid)
;# and nri are updated in place on the stack while iterating.
;#
;# eax, ebx, ecx, edx, esi and edi are saved on entry and restored on exit.
;# MMX/3DNow state is cleared with femms on entry and before returning.
;# ---------------------------------------------------------------------
19187 .globl mcinl1110_3dnow
19188 .globl _mcinl1110_3dnow
19189 mcinl1110_3dnow:
19190 _mcinl1110_3dnow:
;# argument offsets relative to ebp
19191 .equiv mci1110_nri, 8
19192 .equiv mci1110_iinr, 12
19193 .equiv mci1110_jindex, 16
19194 .equiv mci1110_jjnr, 20
19195 .equiv mci1110_shift, 24
19196 .equiv mci1110_shiftvec, 28
19197 .equiv mci1110_gid, 32
19198 .equiv mci1110_pos, 36
19199 .equiv mci1110_charge, 40
19200 .equiv mci1110_facel, 44
19201 .equiv mci1110_Vc, 48
19202 .equiv mci1110_type, 52
19203 .equiv mci1110_ntype, 56
19204 .equiv mci1110_nbfp, 60
19205 .equiv mci1110_Vnb, 64
19206 .equiv mci1110_nsatoms, 68
19207 ;# stack offsets for local variables
;# (relative to esp after the 108-byte allocation; iq, vctot, vnbtot, c6
;#  and c12 are 8-byte slots holding two packed floats)
19208 .equiv mci1110_is3, 0
19209 .equiv mci1110_ii3, 4
19210 .equiv mci1110_shX, 8
19211 .equiv mci1110_shY, 12
19212 .equiv mci1110_shZ, 16
19213 .equiv mci1110_ix, 20
19214 .equiv mci1110_iy, 24
19215 .equiv mci1110_iz, 28
19216 .equiv mci1110_iq, 32
19217 .equiv mci1110_vctot, 40
19218 .equiv mci1110_vnbtot, 48
19219 .equiv mci1110_c6, 56
19220 .equiv mci1110_c12, 64
19221 .equiv mci1110_ntia, 72
19222 .equiv mci1110_innerjjnr0, 76
19223 .equiv mci1110_innerk0, 80
19224 .equiv mci1110_innerjjnr, 84
19225 .equiv mci1110_innerk, 88
19226 .equiv mci1110_nsvdwc, 92
19227 .equiv mci1110_nscoul, 96
19228 .equiv mci1110_nsvdw, 100
19229 .equiv mci1110_solnr, 104
;# prologue: set up the frame pointer and save every register we touch
19230 push ebp
19231 mov ebp,esp
19232 push eax
19233 push ebx
19234 push ecx
19235 push edx
19236 push esi
19237 push edi
19238 sub esp, 108 ;# local stack space
19239 femms
19240 ;# assume we have at least one i particle - start directly
19241 .mci1110_outer:
19242 mov eax, [ebp + mci1110_shift] ;# eax = pointer into shift[]
19243 mov ebx, [eax] ;# ebx=shift[n]
19244 add dword ptr [ebp + mci1110_shift], 4 ;# advance pointer one step
19246 lea ebx, [ebx + ebx*2] ;# ebx=3*is
19247 mov [esp + mci1110_is3],ebx ;# store is3
19249 mov eax, [ebp + mci1110_shiftvec] ;# eax = base of shiftvec[]
19251 movq mm0, [eax + ebx*4] ;# move shX/shY to mm0 and shZ to mm1
19252 movd mm1, [eax + ebx*4 + 8]
19253 movq [esp + mci1110_shX], mm0
19254 movd [esp + mci1110_shZ], mm1
19256 mov ecx, [ebp + mci1110_iinr] ;# ecx = pointer into iinr[]
19257 add dword ptr [ebp + mci1110_iinr], 4 ;# advance pointer
19258 mov ebx, [ecx] ;# ebx=ii
;# split this outer entry into its three atom groups using nsatoms[0..2]
19260 mov eax, [ebp + mci1110_nsatoms]
19261 add dword ptr [ebp + mci1110_nsatoms], 12
19262 mov ecx, [eax]
19263 mov edx, [eax + 4]
19264 mov eax, [eax + 8]
19265 sub ecx, eax
19266 sub eax, edx
19268 mov [esp + mci1110_nsvdwc], edx
19269 mov [esp + mci1110_nscoul], eax
19270 mov [esp + mci1110_nsvdw], ecx
19272 ;# clear potential
19273 pxor mm7,mm7
19274 movq [esp + mci1110_vctot], mm7
19275 movq [esp + mci1110_vnbtot], mm7
19276 mov [esp + mci1110_solnr], ebx
;# locate the j list for this outer entry; it is reused for every i atom
19278 mov eax, [ebp + mci1110_jindex]
19279 mov ecx, [eax] ;# jindex[n]
19280 mov edx, [eax + 4] ;# jindex[n+1]
19281 add dword ptr [ebp + mci1110_jindex], 4
19282 sub edx, ecx ;# number of innerloop atoms
19283 mov eax, [ebp + mci1110_jjnr]
19284 shl ecx, 2
19285 add eax, ecx
19286 mov [esp + mci1110_innerjjnr0], eax ;# pointer to jjnr[nj0]
19288 mov [esp + mci1110_innerk0], edx ;# number of innerloop atoms
;# esi = pos[]; the Coulomb-only loops below rely on esi still holding it
19289 mov esi, [ebp + mci1110_pos]
;# ---- group 1: i atoms with both LJ and Coulomb ----
19291 mov ecx, [esp + mci1110_nsvdwc]
19292 cmp ecx, 0
19293 jnz .mci1110_mno_vdwc
19294 jmp .mci1110_testcoul
19295 .mci1110_mno_vdwc:
19296 mov ebx, [esp + mci1110_solnr]
19297 inc dword ptr [esp + mci1110_solnr]
19298 mov edx, [ebp + mci1110_charge]
19299 movd mm2, [edx + ebx*4] ;# mm2=charge[ii]
19300 pfmul mm2, [ebp + mci1110_facel]
19301 punpckldq mm2,mm2 ;# spread to both halves
19302 movq [esp + mci1110_iq], mm2 ;# iq =facel*charge[ii]
;# ntia = 2*ntype*type[ii], the row offset into nbfp for this i atom
19304 mov edx, [ebp + mci1110_type]
19305 mov edx, [edx + ebx*4]
19306 imul edx, [ebp + mci1110_ntype]
19307 shl edx, 1
19308 mov [esp + mci1110_ntia], edx
19310 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
19311 mov eax, [ebp + mci1110_pos] ;# eax = base of pos[]
19312 mov [esp + mci1110_ii3], ebx
;# shifted i coordinates: ix/iy packed in one quadword, iz separate
19314 movq mm0, [eax + ebx*4]
19315 movd mm1, [eax + ebx*4 + 8]
19316 pfadd mm0, [esp + mci1110_shX]
19317 pfadd mm1, [esp + mci1110_shZ]
19318 movq [esp + mci1110_ix], mm0
19319 movd [esp + mci1110_iz], mm1
;# rewind the j list; innerk = count-2 so the sign tells whether a pair is left
19321 mov ecx, [esp + mci1110_innerjjnr0]
19322 mov [esp + mci1110_innerjjnr], ecx
19323 mov edx, [esp + mci1110_innerk0]
19324 sub edx, 2
19325 mov [esp + mci1110_innerk], edx ;# number of innerloop atoms
19326 jge .mci1110_unroll_vdwc_loop
19327 jmp .mci1110_finish_vdwc_inner
19328 .mci1110_unroll_vdwc_loop:
19329 ;# paired innerloop starts here
19330 mov ecx, [esp + mci1110_innerjjnr] ;# pointer to jjnr[k]
19331 mov eax, [ecx]
19332 mov ebx, [ecx + 4] ;# eax/ebx=jnr
19333 add dword ptr [esp + mci1110_innerjjnr], 8 ;# advance pointer (unrolled 2)
19334 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
19336 mov ecx, [ebp + mci1110_charge] ;# base of charge[]
19337 movq mm5, [esp + mci1110_iq]
19338 movd mm3, [ecx + eax*4] ;# charge[jnr1]
19339 punpckldq mm3, [ecx + ebx*4] ;# move charge 2 to high part of mm3
19340 pfmul mm3,mm5 ;# mm3 now has qq for both particles
19342 mov ecx, [ebp + mci1110_type]
19343 mov edx, [ecx + eax*4] ;# type [jnr1]
19344 mov ecx, [ecx + ebx*4] ;# type [jnr2]
19346 mov esi, [ebp + mci1110_nbfp] ;# base of nbfp
19347 shl edx, 1
19348 shl ecx, 1
19349 add edx, [esp + mci1110_ntia] ;# tja = ntia + 2*type
19350 add ecx, [esp + mci1110_ntia]
19352 movq mm5, [esi + edx*4] ;# mm5 = 1st c6 / c12
19353 movq mm7, [esi + ecx*4] ;# mm7 = 2nd c6 / c12
19354 movq mm6,mm5
19355 punpckldq mm5,mm7 ;# mm5 = 1st c6 / 2nd c6
19356 punpckhdq mm6,mm7 ;# mm6 = 1st c12 / 2nd c12
19357 movq [esp + mci1110_c6], mm5
19358 movq [esp + mci1110_c12], mm6
19360 lea eax, [eax + eax*2] ;# replace jnr with j3
19361 lea ebx, [ebx + ebx*2]
19363 mov esi, [ebp + mci1110_pos]
19365 movq mm0, [esp + mci1110_ix]
19366 movd mm1, [esp + mci1110_iz]
19367 movq mm4, [esi + eax*4] ;# fetch first j coordinates
19368 movd mm5, [esi + eax*4 + 8]
19369 pfsubr mm4,mm0 ;# dr = ir - jr
19370 pfsubr mm5,mm1
19371 pfmul mm4,mm4 ;# square dx,dy,dz
19372 pfmul mm5,mm5
19373 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
19374 pfacc mm4, mm5 ;# first rsq in lower mm4
19376 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
19377 movd mm7, [esi + ebx*4 + 8]
19379 pfsubr mm6,mm0 ;# dr = ir - jr
19380 pfsubr mm7,mm1
19381 pfmul mm6,mm6 ;# square dx,dy,dz
19382 pfmul mm7,mm7
19383 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
19384 pfacc mm6, mm7 ;# second rsq in lower mm6
19386 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
19387 pfrsqrt mm1, mm6
19389 punpckldq mm0,mm1
19390 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs
19391 movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision
19392 pfmul mm0,mm0
19393 pfrsqit1 mm0,mm4
19394 pfrcpit2 mm0,mm2
19395 movq mm1,mm0
19396 pfmul mm0,mm0
19397 ;# mm0 now contains invsq, and mm1 invsqrt
19398 ;# do potential and fscal
19399 movq mm4, mm0
19400 pfmul mm4, mm0
19401 pfmul mm4, mm0 ;# mm4=rinvsix
19402 movq mm5, mm4
19403 pfmul mm5, mm5 ;# mm5=rinvtwelve
19405 pfmul mm3, mm1 ;# mm3 has vcoul for both interactions
19406 pfmul mm5, [esp + mci1110_c12]
19407 pfmul mm4, [esp + mci1110_c6]
19408 movq mm6, mm5 ;# mm6 is vnb12-vnb6
19409 pfsub mm6, mm4
19410 ;# update vctot
19411 pfadd mm3, [esp + mci1110_vctot] ;# add the earlier value
19412 movq [esp + mci1110_vctot], mm3 ;# store the sum
19413 ;# update vnbtot
19414 pfadd mm6, [esp + mci1110_vnbtot] ;# add the earlier value
19415 movq [esp + mci1110_vnbtot], mm6 ;# store the sum
19417 ;# should we do one more iteration?
19418 sub dword ptr [esp + mci1110_innerk], 2
19419 jl .mci1110_finish_vdwc_inner
19420 jmp .mci1110_unroll_vdwc_loop
19421 .mci1110_finish_vdwc_inner:
;# innerk is -2 or -1 here; bit 0 set means one unpaired j atom remains
19422 and dword ptr [esp + mci1110_innerk], 1
19423 jnz .mci1110_single_vdwc_inner
19424 jmp .mci1110_updateouterdata_vdwc
19425 .mci1110_single_vdwc_inner:
19426 ;# a single j particle iteration here - compare with the unrolled code for comments
19427 mov eax, [esp + mci1110_innerjjnr]
19428 mov eax, [eax] ;# eax=jnr offset
19430 mov ecx, [ebp + mci1110_charge]
19431 movd mm5, [esp + mci1110_iq]
19432 movd mm3, [ecx + eax*4]
19433 pfmul mm3, mm5 ;# mm3=qq
19435 mov esi, [ebp + mci1110_nbfp]
19436 mov ecx, [ebp + mci1110_type]
19437 mov edx, [ecx + eax*4] ;# type [jnr1]
19438 shl edx, 1
19439 add edx, [esp + mci1110_ntia] ;# tja = ntia + 2*type
19440 movd mm5, [esi + edx*4] ;# mm5 = 1st c6
19441 movq [esp + mci1110_c6], mm5
19442 movd mm5, [esi + edx*4 + 4] ;# mm5 = 1st c12
19443 movq [esp + mci1110_c12], mm5
19446 mov esi, [ebp + mci1110_pos]
19447 lea eax, [eax + eax*2]
19449 movq mm0, [esp + mci1110_ix]
19450 movd mm1, [esp + mci1110_iz]
19451 movq mm4, [esi + eax*4]
19452 movd mm5, [esi + eax*4 + 8]
19453 pfsubr mm4, mm0
19454 pfsubr mm5, mm1
19455 pfmul mm4,mm4
19456 pfmul mm5,mm5
19457 pfacc mm4, mm5
19458 pfacc mm4, mm5 ;# mm4=rsq (low half)
19460 pfrsqrt mm0,mm4
19461 movq mm2,mm0
19462 pfmul mm0,mm0
19463 pfrsqit1 mm0,mm4
19464 pfrcpit2 mm0,mm2 ;# mm0=invsqrt (copied to mm1 next)
19465 movq mm1, mm0
19466 pfmul mm0, mm0 ;# mm0=invsq
19467 ;# calculate potentials and scalar force
19468 movq mm4, mm0
19469 pfmul mm4, mm0
19470 pfmul mm4, mm0 ;# mm4=rinvsix
19471 movq mm5, mm4
19472 pfmul mm5, mm5 ;# mm5=rinvtwelve
19474 pfmul mm3, mm1 ;# mm3 = vcoul for the single interaction (low half)
19476 pfmul mm5, [esp + mci1110_c12]
19477 pfmul mm4, [esp + mci1110_c6]
19478 movq mm6, mm5 ;# mm6 is vnb12-vnb6
19479 pfsub mm6, mm4
19480 ;# update vctot
19481 pfadd mm3, [esp + mci1110_vctot]
19482 movq [esp + mci1110_vctot], mm3
19483 ;# update vnbtot
19484 pfadd mm6, [esp + mci1110_vnbtot] ;# add the earlier value
19485 movq [esp + mci1110_vnbtot], mm6 ;# store the sum
19486 .mci1110_updateouterdata_vdwc:
19487 ;# loop back to mno
19488 dec dword ptr [esp + mci1110_nsvdwc]
19489 jz .mci1110_testcoul
19490 jmp .mci1110_mno_vdwc
;# ---- group 2: i atoms with Coulomb only ----
19491 .mci1110_testcoul:
19492 mov ecx, [esp + mci1110_nscoul]
19493 cmp ecx, 0
19494 jnz .mci1110_mno_coul
19495 jmp .mci1110_testvdw
19496 .mci1110_mno_coul:
19497 mov ebx, [esp + mci1110_solnr]
19498 inc dword ptr [esp + mci1110_solnr]
19499 mov edx, [ebp + mci1110_charge]
19500 movd mm2, [edx + ebx*4] ;# mm2=charge[ii]
19501 pfmul mm2, [ebp + mci1110_facel]
19502 punpckldq mm2,mm2 ;# spread to both halves
19503 movq [esp + mci1110_iq], mm2 ;# iq =facel*charge[ii]
19505 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
19506 mov eax, [ebp + mci1110_pos] ;# eax = base of pos[]
19507 mov [esp + mci1110_ii3], ebx
19509 movq mm0, [eax + ebx*4]
19510 movd mm1, [eax + ebx*4 + 8]
19511 pfadd mm0, [esp + mci1110_shX]
19512 pfadd mm1, [esp + mci1110_shZ]
19513 movq [esp + mci1110_ix], mm0
19514 movd [esp + mci1110_iz], mm1
19516 mov ecx, [esp + mci1110_innerjjnr0]
19517 mov [esp + mci1110_innerjjnr], ecx
19518 mov edx, [esp + mci1110_innerk0]
19519 sub edx, 2
19520 mov [esp + mci1110_innerk], edx ;# number of innerloop atoms
19521 jge .mci1110_unroll_coul_loop
19522 jmp .mci1110_finish_coul_inner
19523 .mci1110_unroll_coul_loop:
19524 ;# paired innerloop starts here
19525 mov ecx, [esp + mci1110_innerjjnr] ;# pointer to jjnr[k]
19526 mov eax, [ecx]
19527 mov ebx, [ecx + 4] ;# eax/ebx=jnr
19528 add dword ptr [esp + mci1110_innerjjnr], 8 ;# advance pointer (unrolled 2)
19529 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
19531 mov ecx, [ebp + mci1110_charge] ;# base of charge[]
19532 movq mm5, [esp + mci1110_iq]
19533 movd mm3, [ecx + eax*4] ;# charge[jnr1]
19534 movd mm7, [ecx + ebx*4] ;# charge[jnr2]
19535 punpckldq mm3,mm7 ;# move charge 2 to high part of mm3
19536 pfmul mm3,mm5 ;# mm3 now has qq for both particles
19538 lea eax, [eax + eax*2] ;# replace jnr with j3
19539 lea ebx, [ebx + ebx*2]
;# esi is not reloaded in this loop: it still holds pos[] from above
19541 movq mm0, [esp + mci1110_ix]
19542 movd mm1, [esp + mci1110_iz]
19543 movq mm4, [esi + eax*4] ;# fetch first j coordinates
19544 movd mm5, [esi + eax*4 + 8]
19545 pfsubr mm4,mm0 ;# dr = ir - jr
19546 pfsubr mm5,mm1
19547 pfmul mm4,mm4 ;# square dx,dy,dz
19548 pfmul mm5,mm5
19549 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
19550 pfacc mm4, mm5 ;# first rsq in lower mm4
19552 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
19553 movd mm7, [esi + ebx*4 + 8]
19555 pfsubr mm6,mm0 ;# dr = ir - jr
19556 pfsubr mm7,mm1
19557 pfmul mm6,mm6 ;# square dx,dy,dz
19558 pfmul mm7,mm7
19559 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
19560 pfacc mm6, mm7 ;# second rsq in lower mm6
19562 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
19563 pfrsqrt mm1, mm6
19565 punpckldq mm0,mm1
19566 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs
19567 movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision
19568 pfmul mm0,mm0
19569 pfrsqit1 mm0,mm4
19570 pfrcpit2 mm0,mm2
19571 movq mm1,mm0
19572 ;# mm1 is invsqrt
19573 ;# do potential and fscal
19574 pfmul mm3,mm1 ;# 3 has both vcoul
19575 pfadd mm3, [esp + mci1110_vctot] ;# add the earlier value
19576 movq [esp + mci1110_vctot], mm3 ;# store the sum
19578 ;# should we do one more iteration?
19579 sub dword ptr [esp + mci1110_innerk], 2
19580 jl .mci1110_finish_coul_inner
19581 jmp .mci1110_unroll_coul_loop
19582 .mci1110_finish_coul_inner:
19583 and dword ptr [esp + mci1110_innerk], 1
19584 jnz .mci1110_single_coul_inner
19585 jmp .mci1110_updateouterdata_coul
19586 .mci1110_single_coul_inner:
19587 ;# a single j particle iteration here - compare with the unrolled code for comments
19588 mov eax, [esp + mci1110_innerjjnr]
19589 mov eax, [eax] ;# eax=jnr offset
19591 mov ecx, [ebp + mci1110_charge]
19592 movd mm6, [esp + mci1110_iq]
19593 movd mm7, [ecx + eax*4]
19594 pfmul mm6, mm7 ;# mm6=qq
19596 lea eax, [eax + eax*2]
19598 movq mm0, [esp + mci1110_ix]
19599 movd mm1, [esp + mci1110_iz]
19600 movq mm2, [esi + eax*4]
19601 movd mm3, [esi + eax*4 + 8]
19602 pfsub mm0, mm2
19603 pfsub mm1, mm3
19604 pfmul mm0,mm0
19605 pfmul mm1,mm1
19606 pfacc mm0, mm1
19607 pfacc mm0, mm1 ;# mm0=rsq
19609 pfrsqrt mm1,mm0
19610 movq mm2,mm1
19611 pfmul mm1,mm1
19612 pfrsqit1 mm1,mm0
19613 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
19614 ;# calculate potential and scalar force
19615 pfmul mm6, mm1 ;# mm6=vcoul
19616 ;# update vctot
19617 pfadd mm6, [esp + mci1110_vctot]
19618 movq [esp + mci1110_vctot], mm6
19620 .mci1110_updateouterdata_coul:
19621 ;# loop back to mno
19622 dec dword ptr [esp + mci1110_nscoul]
19623 jz .mci1110_testvdw
19624 jmp .mci1110_mno_coul
;# ---- group 3: i atoms with LJ only ----
19625 .mci1110_testvdw:
19626 mov ecx, [esp + mci1110_nsvdw]
19627 cmp ecx, 0
19628 jnz .mci1110_mno_vdw
19629 jmp .mci1110_last_mno
19630 .mci1110_mno_vdw:
19631 mov ebx, [esp + mci1110_solnr]
19632 inc dword ptr [esp + mci1110_solnr]
19634 mov edx, [ebp + mci1110_type]
19635 mov edx, [edx + ebx*4]
19636 imul edx, [ebp + mci1110_ntype]
19637 shl edx, 1
19638 mov [esp + mci1110_ntia], edx
19640 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
19641 mov eax, [ebp + mci1110_pos] ;# eax = base of pos[]
19642 mov [esp + mci1110_ii3], ebx
19644 movq mm0, [eax + ebx*4]
19645 movd mm1, [eax + ebx*4 + 8]
19646 pfadd mm0, [esp + mci1110_shX]
19647 pfadd mm1, [esp + mci1110_shZ]
19648 movq [esp + mci1110_ix], mm0
19649 movd [esp + mci1110_iz], mm1
19651 mov ecx, [esp + mci1110_innerjjnr0]
19652 mov [esp + mci1110_innerjjnr], ecx
19653 mov edx, [esp + mci1110_innerk0]
19654 sub edx, 2
19655 mov [esp + mci1110_innerk], edx ;# number of innerloop atoms
19656 jge .mci1110_unroll_vdw_loop
19657 jmp .mci1110_finish_vdw_inner
19658 .mci1110_unroll_vdw_loop:
19659 ;# paired innerloop starts here
19660 mov ecx, [esp + mci1110_innerjjnr] ;# pointer to jjnr[k]
19661 mov eax, [ecx]
19662 mov ebx, [ecx + 4] ;# eax/ebx=jnr
19663 add dword ptr [esp + mci1110_innerjjnr], 8 ;# advance pointer (unrolled 2)
19664 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
19666 mov ecx, [ebp + mci1110_type]
19667 mov edx, [ecx + eax*4] ;# type [jnr1]
19668 mov ecx, [ecx + ebx*4] ;# type [jnr2]
19670 mov esi, [ebp + mci1110_nbfp] ;# base of nbfp
19671 shl edx, 1
19672 shl ecx, 1
19673 add edx, [esp + mci1110_ntia] ;# tja = ntia + 2*type
19674 add ecx, [esp + mci1110_ntia]
19676 movq mm5, [esi + edx*4] ;# mm5 = 1st c6 / c12
19677 movq mm7, [esi + ecx*4] ;# mm7 = 2nd c6 / c12
19678 movq mm6,mm5
19679 punpckldq mm5,mm7 ;# mm5 = 1st c6 / 2nd c6
19680 punpckhdq mm6,mm7 ;# mm6 = 1st c12 / 2nd c12
19681 movq [esp + mci1110_c6], mm5
19682 movq [esp + mci1110_c12], mm6
19684 lea eax, [eax + eax*2] ;# replace jnr with j3
19685 lea ebx, [ebx + ebx*2]
19687 mov esi, [ebp + mci1110_pos]
19689 movq mm0, [esp + mci1110_ix]
19690 movd mm1, [esp + mci1110_iz]
19691 movq mm4, [esi + eax*4] ;# fetch first j coordinates
19692 movd mm5, [esi + eax*4 + 8]
19693 pfsubr mm4,mm0 ;# dr = ir - jr
19694 pfsubr mm5,mm1
19695 pfmul mm4,mm4 ;# square dx,dy,dz
19696 pfmul mm5,mm5
19697 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
19698 pfacc mm4, mm5 ;# first rsq in lower mm4
19700 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
19701 movd mm7, [esi + ebx*4 + 8]
19703 pfsubr mm6,mm0 ;# dr = ir - jr
19704 pfsubr mm7,mm1
19705 pfmul mm6,mm6 ;# square dx,dy,dz
19706 pfmul mm7,mm7
19707 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
19708 pfacc mm6, mm7 ;# second rsq in lower mm6
19710 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
19711 pfrsqrt mm1, mm6
19713 punpckldq mm0,mm1
19714 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs
19715 movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision
19716 pfmul mm0,mm0
19717 pfrsqit1 mm0,mm4
19718 pfrcpit2 mm0,mm2
19719 movq mm1,mm0
19720 pfmul mm0,mm0
19721 ;# mm0 now contains invsq, and mm1 invsqrt
19722 ;# do potential and fscal
19723 movq mm4, mm0
19724 pfmul mm4, mm0
19725 pfmul mm4, mm0 ;# mm4=rinvsix
19726 movq mm5, mm4
19727 pfmul mm5, mm5 ;# mm5=rinvtwelve
19729 pfmul mm5, [esp + mci1110_c12]
19730 pfmul mm4, [esp + mci1110_c6]
19731 movq mm6, mm5 ;# mm6 is vnb12-vnb6
19732 pfsub mm6, mm4
19733 ;# update vnbtot
19734 pfadd mm6, [esp + mci1110_vnbtot] ;# add the earlier value
19735 movq [esp + mci1110_vnbtot], mm6 ;# store the sum
19737 ;# should we do one more iteration?
19738 sub dword ptr [esp + mci1110_innerk], 2
19739 jl .mci1110_finish_vdw_inner
19740 jmp .mci1110_unroll_vdw_loop
19741 .mci1110_finish_vdw_inner:
19742 and dword ptr [esp + mci1110_innerk], 1
19743 jnz .mci1110_single_vdw_inner
19744 jmp .mci1110_updateouterdata_vdw
19745 .mci1110_single_vdw_inner:
19746 ;# a single j particle iteration here - compare with the unrolled code for comments
19747 mov eax, [esp + mci1110_innerjjnr]
19748 mov eax, [eax] ;# eax=jnr offset
19750 mov esi, [ebp + mci1110_nbfp]
19751 mov ecx, [ebp + mci1110_type]
19752 mov edx, [ecx + eax*4] ;# type [jnr1]
19753 shl edx, 1
19754 add edx, [esp + mci1110_ntia] ;# tja = ntia + 2*type
19755 movd mm5, [esi + edx*4] ;# mm5 = 1st c6
19756 movq [esp + mci1110_c6], mm5
19757 movd mm5, [esi + edx*4 + 4] ;# mm5 = 1st c12
19758 movq [esp + mci1110_c12], mm5
19761 mov esi, [ebp + mci1110_pos]
19762 lea eax, [eax + eax*2]
19764 movq mm0, [esp + mci1110_ix]
19765 movd mm1, [esp + mci1110_iz]
19766 movq mm4, [esi + eax*4]
19767 movd mm5, [esi + eax*4 + 8]
19768 pfsubr mm4, mm0
19769 pfsubr mm5, mm1
19770 pfmul mm4,mm4
19771 pfmul mm5,mm5
19772 pfacc mm4, mm5
19773 pfacc mm4, mm5 ;# mm4=rsq (low half)
19775 pfrsqrt mm0,mm4
19776 movq mm2,mm0
19777 pfmul mm0,mm0
19778 pfrsqit1 mm0,mm4
19779 pfrcpit2 mm0,mm2 ;# mm0=invsqrt (copied to mm1 next)
19780 movq mm1, mm0
19781 pfmul mm0, mm0 ;# mm0=invsq
19782 ;# calculate potentials and scalar force
19783 movq mm4, mm0
19784 pfmul mm4, mm0
19785 pfmul mm4, mm0 ;# mm4=rinvsix
19786 movq mm5, mm4
19787 pfmul mm5, mm5 ;# mm5=rinvtwelve
19789 pfmul mm5, [esp + mci1110_c12]
19790 pfmul mm4, [esp + mci1110_c6]
19791 movq mm6, mm5 ;# mm6 is vnb12-vnb6
19792 pfsub mm6, mm4
19793 ;# update vnbtot
19794 pfadd mm6, [esp + mci1110_vnbtot] ;# add the earlier value
19795 movq [esp + mci1110_vnbtot], mm6 ;# store the sum
19797 .mci1110_updateouterdata_vdw:
19798 ;# loop back to mno
19799 dec dword ptr [esp + mci1110_nsvdw]
19800 jz .mci1110_last_mno
19801 jmp .mci1110_mno_vdw
;# ---- all three groups done: fold the packed sums into Vc/Vnb[gid] ----
19803 .mci1110_last_mno:
19804 mov edx, [ebp + mci1110_gid] ;# get group index for this i particle
19805 mov edx, [edx]
19806 add dword ptr [ebp + mci1110_gid], 4 ;# advance pointer
19808 movq mm7, [esp + mci1110_vctot]
19809 pfacc mm7,mm7 ;# get and sum the two parts of total potential
19811 mov eax, [ebp + mci1110_Vc]
19812 movd mm6, [eax + edx*4]
19813 pfadd mm6, mm7
19814 movd [eax + edx*4], mm6 ;# increment vc[gid]
19816 movq mm7, [esp + mci1110_vnbtot]
19817 pfacc mm7,mm7 ;# get and sum the two parts of total potential
19819 mov eax, [ebp + mci1110_Vnb]
19820 movd mm6, [eax + edx*4]
19821 pfadd mm6, mm7
19822 movd [eax + edx*4], mm6 ;# increment vnb[gid]
19823 ;# finish if last
19824 mov ecx, [ebp + mci1110_nri]
19825 dec ecx
19826 jecxz .mci1110_end
19827 ;# not last, iterate once more!
19828 mov [ebp + mci1110_nri], ecx
19829 jmp .mci1110_outer
19830 .mci1110_end:
;# epilogue: leave MMX state, drop the locals, restore registers in
;# reverse push order
19831 femms
19832 add esp, 108
19833 pop edi
19834 pop esi
19835 pop edx
19836 pop ecx
19837 pop ebx
19838 pop eax
19839 leave
19840 ret ;# return to caller; without it execution runs on into mcinl1120_3dnow
;# ---------------------------------------------------------------------
;# mcinl1120_3dnow
;#
;# Energy-only inner loop for a three-site i molecule (labelled O, H1, H2
;# in the code) against single j atoms. For every j atom it accumulates
;#   - Coulomb potential for all three sites (qq*rinv), and
;#   - Lennard-Jones potential for the first (O) site only,
;# then adds the totals to Vc[gid] and Vnb[gid]. No forces are computed.
;#
;# The i charges (iqO, iqH) and the nbfp row offset (ntia) are read once,
;# from the atom at iinr[0] and the one following it, before the outer
;# loop, and are reused unchanged for every outer iteration.
;#
;# Arguments are read from the caller's stack frame through ebp:
;#   nri      (ebp+8)  int, number of outer iterations (must be >= 1)
;#   iinr     (ebp+12) pointer to int, first atom of each i molecule
;#   jindex   (ebp+16) pointer to int, jindex[n]..jindex[n+1] bound the j list
;#   jjnr     (ebp+20) base of the j atom index list
;#   shift    (ebp+24) pointer to int, shift index of each outer entry
;#   shiftvec (ebp+28) base of the shift vectors, 3 floats each
;#   gid      (ebp+32) pointer to int, energy group of each outer entry
;#   pos      (ebp+36) base of the coordinates, 3 floats per atom
;#   charge   (ebp+40) base of the per-atom charges
;#   facel    (ebp+44) float by value, multiplied into the i charges
;#   Vc       (ebp+48) base of the Coulomb energy sums, indexed by gid
;#   type     (ebp+52) base of the per-atom type indices
;#   ntype    (ebp+56) int by value
;#   nbfp     (ebp+60) base of the (c6,c12) float pairs
;#   Vnb      (ebp+64) base of the LJ energy sums, indexed by gid
;# The shift, iinr, jindex and gid slots and nri are updated in place.
;# The inner loop runs at least once per outer entry (the count is only
;# tested after the first pass).
;#
;# eax, ebx, ecx, edx, esi and edi are saved on entry and restored on exit.
;# MMX/3DNow state is cleared with femms on entry and before returning.
;# ---------------------------------------------------------------------
19844 .globl mcinl1120_3dnow
19845 .globl _mcinl1120_3dnow
19846 mcinl1120_3dnow:
19847 _mcinl1120_3dnow:
;# argument offsets relative to ebp
19848 .equiv mci1120_nri, 8
19849 .equiv mci1120_iinr, 12
19850 .equiv mci1120_jindex, 16
19851 .equiv mci1120_jjnr, 20
19852 .equiv mci1120_shift, 24
19853 .equiv mci1120_shiftvec, 28
19854 .equiv mci1120_gid, 32
19855 .equiv mci1120_pos, 36
19856 .equiv mci1120_charge, 40
19857 .equiv mci1120_facel, 44
19858 .equiv mci1120_Vc, 48
19859 .equiv mci1120_type, 52
19860 .equiv mci1120_ntype, 56
19861 .equiv mci1120_nbfp, 60
19862 .equiv mci1120_Vnb, 64
19863 ;# stack offsets for local variables
;# (ixH/iyH/izH are 8-byte slots holding the H1 value low and H2 high.
;#  izO is stored as 8 bytes at offset 16, so its upper half shares
;#  storage with the low half of ixH; only its low 4 bytes are used.)
19864 .equiv mci1120_is3, 0
19865 .equiv mci1120_ii3, 4
19866 .equiv mci1120_ixO, 8
19867 .equiv mci1120_iyO, 12
19868 .equiv mci1120_izO, 16
19869 .equiv mci1120_ixH, 20
19870 .equiv mci1120_iyH, 28
19871 .equiv mci1120_izH, 36
19872 .equiv mci1120_iqO, 44
19873 .equiv mci1120_iqH, 52
19874 .equiv mci1120_vctot, 60
19875 .equiv mci1120_vnbtot, 68
19876 .equiv mci1120_c6, 76
19877 .equiv mci1120_c12, 84
19878 .equiv mci1120_ntia, 92
19879 .equiv mci1120_innerjjnr, 96
19880 .equiv mci1120_innerk, 100
;# prologue: set up the frame pointer and save every register we touch
19881 push ebp
19882 mov ebp,esp
19883 push eax
19884 push ebx
19885 push ecx
19886 push edx
19887 push esi
19888 push edi
19889 sub esp, 104 ;# local stack space
19890 femms
19891 ;# assume we have at least one i particle - start directly
;# one-time setup from the first i molecule: charges and LJ row offset
19893 mov ecx, [ebp + mci1120_iinr] ;# ecx = pointer into iinr[]
19894 mov ebx, [ecx] ;# ebx=ii
19896 mov edx, [ebp + mci1120_charge]
19897 movd mm1, [ebp + mci1120_facel]
19898 movd mm2, [edx + ebx*4] ;# mm2=charge[ii0]
19899 pfmul mm2, mm1
19900 movq [esp + mci1120_iqO], mm2 ;# iqO = facel*charge[ii]
19902 movd mm2, [edx + ebx*4 + 4] ;# mm2=charge[ii0+1]
19903 pfmul mm2, mm1
19904 punpckldq mm2,mm2 ;# spread to both halves
19905 movq [esp + mci1120_iqH], mm2 ;# iqH = facel*charge[ii0+1]
19907 mov edx, [ebp + mci1120_type]
19908 mov ecx, [edx + ebx*4]
19909 shl ecx, 1
19910 imul ecx, [ebp + mci1120_ntype] ;# ecx = ntia = 2*ntype*type[ii0]
19911 mov [esp + mci1120_ntia], ecx
19913 .mci1120_outer:
19914 mov eax, [ebp + mci1120_shift] ;# eax = pointer into shift[]
19915 mov ebx, [eax] ;# ebx=shift[n]
19916 add dword ptr [ebp + mci1120_shift], 4 ;# advance pointer one step
19918 lea ebx, [ebx + ebx*2] ;# ebx=3*is
19919 mov [esp + mci1120_is3],ebx ;# store is3
19921 mov eax, [ebp + mci1120_shiftvec] ;# eax = base of shiftvec[]
19923 movq mm5, [eax + ebx*4] ;# move shX/shY to mm5 and shZ to mm6.
19924 movd mm6, [eax + ebx*4 + 8]
19925 movq mm0, mm5
19926 movq mm1, mm5
19927 movq mm2, mm6
19928 punpckldq mm0,mm0 ;# also expand shX,Y,Z in mm0--mm2.
19929 punpckhdq mm1,mm1
19930 punpckldq mm2,mm2
19932 mov ecx, [ebp + mci1120_iinr] ;# ecx = pointer into iinr[]
19933 add dword ptr [ebp + mci1120_iinr], 4 ;# advance pointer
19934 mov ebx, [ecx] ;# ebx=ii
19936 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
19937 mov eax, [ebp + mci1120_pos] ;# eax = base of pos[]
19939 pfadd mm5, [eax + ebx*4] ;# ix = shX + posX (and iy too)
19940 movd mm7, [eax + ebx*4 + 8] ;# cant use direct memory add for 4 bytes (iz)
19941 mov [esp + mci1120_ii3], ebx ;# (use mm7 as temp. storage for iz.)
19942 pfadd mm6, mm7
19943 movq [esp + mci1120_ixO], mm5
19944 movq [esp + mci1120_izO], mm6
;# the two H sites follow the O site in pos[]: floats 3..5 and 6..8
19946 movd mm3, [eax + ebx*4 + 12]
19947 movd mm4, [eax + ebx*4 + 16]
19948 movd mm5, [eax + ebx*4 + 20]
19949 punpckldq mm3, [eax + ebx*4 + 24]
19950 punpckldq mm4, [eax + ebx*4 + 28]
19951 punpckldq mm5, [eax + ebx*4 + 32] ;# coords of H1 in low mm3-mm5, H2 in high
19953 pfadd mm0, mm3
19954 pfadd mm1, mm4
19955 pfadd mm2, mm5
19956 movq [esp + mci1120_ixH], mm0
19957 movq [esp + mci1120_iyH], mm1
19958 movq [esp + mci1120_izH], mm2
19960 ;# clear vctot and vnbtot
19961 pxor mm7,mm7
19962 movq [esp + mci1120_vctot], mm7
19963 movq [esp + mci1120_vnbtot], mm7
19965 mov eax, [ebp + mci1120_jindex]
19966 mov ecx, [eax] ;# jindex[n]
19967 mov edx, [eax + 4] ;# jindex[n+1]
19968 add dword ptr [ebp + mci1120_jindex], 4
19969 sub edx, ecx ;# number of innerloop atoms
19970 mov [esp + mci1120_innerk], edx ;# number of innerloop atoms
19972 mov esi, [ebp + mci1120_pos]
19973 mov eax, [ebp + mci1120_jjnr]
19974 shl ecx, 2
19975 add eax, ecx
19976 mov [esp + mci1120_innerjjnr], eax ;# pointer to jjnr[nj0]
19977 .mci1120_inner_loop:
19978 ;# a single j particle iteration here - compare with the unrolled code for comments.
;# Load the jjnr pointer into ecx (a scratch register here, reloaded just
;# below) so that the prefetch addresses the j list. Previously ecx held
;# an unrelated value at this point (a byte offset or the nbfp base).
19979 mov ecx, [esp + mci1120_innerjjnr] ;# ecx = pointer to jjnr[k]
19980 mov eax, [ecx] ;# eax=jnr offset
19981 add dword ptr [esp + mci1120_innerjjnr], 4 ;# advance pointer
19982 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
19984 mov ecx, [ebp + mci1120_charge]
19985 movd mm7, [ecx + eax*4]
19986 punpckldq mm7,mm7
19987 movq mm6,mm7
19988 pfmul mm6, [esp + mci1120_iqO]
19989 pfmul mm7, [esp + mci1120_iqH] ;# mm6=qqO, mm7=qqH
19991 mov ecx, [ebp + mci1120_type]
19992 mov edx, [ecx + eax*4] ;# type [jnr]
19993 mov ecx, [ebp + mci1120_nbfp]
19994 shl edx, 1
19995 add edx, [esp + mci1120_ntia] ;# tja = ntia + 2*type
19996 movd mm5, [ecx + edx*4] ;# mm5 = 1st c6
19997 movq [esp + mci1120_c6], mm5
19998 movd mm5, [ecx + edx*4 + 4] ;# mm5 = 1st c12
19999 movq [esp + mci1120_c12], mm5
20001 lea eax, [eax + eax*2]
20003 movq mm0, [esi + eax*4]
20004 movd mm1, [esi + eax*4 + 8]
20005 ;# copy & expand to mm2-mm4 for the H interactions
20006 movq mm2, mm0
20007 movq mm3, mm0
20008 movq mm4, mm1
20009 punpckldq mm2,mm2
20010 punpckhdq mm3,mm3
20011 punpckldq mm4,mm4
20013 pfsubr mm0, [esp + mci1120_ixO]
20014 pfsubr mm1, [esp + mci1120_izO]
20016 pfmul mm0,mm0
20017 pfmul mm1,mm1
20018 pfacc mm0, mm1
20019 pfadd mm0, mm1 ;# mm0=rsqO
;# mm2-mm4 were already spread to both halves above, so these three
;# punpckldq leave their contents unchanged
20021 punpckldq mm2, mm2
20022 punpckldq mm3, mm3
20023 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
20024 pfsubr mm2, [esp + mci1120_ixH]
20025 pfsubr mm3, [esp + mci1120_iyH]
20026 pfsubr mm4, [esp + mci1120_izH] ;# mm2-mm4 is dxH-dzH
20028 pfmul mm2,mm2
20029 pfmul mm3,mm3
20030 pfmul mm4,mm4
20032 pfadd mm3,mm2
20033 pfadd mm3,mm4 ;# mm3=rsqH
;# O site: reciprocal square root seed plus one refinement step
20035 pfrsqrt mm1,mm0
20037 movq mm2,mm1
20038 pfmul mm1,mm1
20039 pfrsqit1 mm1,mm0
20040 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
20041 movq mm4, mm1
20042 pfmul mm4, mm4 ;# mm4=invsq
20044 movq mm0, mm4
20045 pfmul mm0, mm4
20046 pfmul mm0, mm4 ;# mm0=rinvsix
20047 movq mm2, mm0
20048 pfmul mm2, mm2 ;# mm2=rinvtwelve
20050 ;# calculate potential and scalar force
20051 pfmul mm6, mm1 ;# mm6=vcoul
20052 movq mm1, mm6 ;# use mm1 for fscal sum
20054 ;# LJ for the oxygen
20055 pfmul mm0, [esp + mci1120_c6]
20056 pfmul mm2, [esp + mci1120_c12]
20058 ;# calc nb potential
20059 pfsub mm2, mm0
20060 ;# update nb potential
20061 pfadd mm2, [esp + mci1120_vnbtot]
20062 movq [esp + mci1120_vnbtot], mm2
;# H sites: pfrsqrt only reads the low half, so swap the halves of mm3
;# to seed H2, then swap back
20064 pfrsqrt mm5, mm3
20065 pswapd mm3,mm3
20066 pfrsqrt mm2, mm3
20067 pswapd mm3,mm3
20068 punpckldq mm5,mm2 ;# seeds are in mm5 now, and rsq in mm3.
20070 movq mm2, mm5
20071 pfmul mm5,mm5
20072 pfrsqit1 mm5,mm3
20073 pfrcpit2 mm5,mm2 ;# mm5=invsqrt
20074 pfmul mm7, mm5 ;# mm7=vcoul
20075 ;# update vctot
20076 pfadd mm7, mm6
20077 pfadd mm7, [esp + mci1120_vctot]
20078 movq [esp + mci1120_vctot], mm7
20080 ;# done - one more?
20081 dec dword ptr [esp + mci1120_innerk]
20082 jz .mci1120_updateouterdata
20083 jmp .mci1120_inner_loop
20084 .mci1120_updateouterdata:
20085 mov edx, [ebp + mci1120_gid] ;# get group index for this i particle
20086 mov edx, [edx]
20087 add dword ptr [ebp + mci1120_gid], 4 ;# advance pointer
20089 movq mm7, [esp + mci1120_vctot]
20090 pfacc mm7,mm7 ;# get and sum the two parts of total potential
20092 mov eax, [ebp + mci1120_Vc]
20093 movd mm6, [eax + edx*4]
20094 pfadd mm6, mm7
20095 movd [eax + edx*4], mm6 ;# increment vc[gid]
20097 movq mm7, [esp + mci1120_vnbtot]
20098 pfacc mm7,mm7 ;# same for Vnb
20100 mov eax, [ebp + mci1120_Vnb]
20101 movd mm6, [eax + edx*4]
20102 pfadd mm6, mm7
20103 movd [eax + edx*4], mm6 ;# increment vnb[gid]
20104 ;# finish if last
20105 dec dword ptr [ebp + mci1120_nri]
20106 jz .mci1120_end
20107 ;# not last, iterate once more!
20108 jmp .mci1120_outer
20109 .mci1120_end:
;# epilogue: leave MMX state, drop the locals, restore registers in
;# reverse push order
20110 femms
20111 add esp, 104
20112 pop edi
20113 pop esi
20114 pop edx
20115 pop ecx
20116 pop ebx
20117 pop eax
20118 leave
20119 ret ;# return to caller; without it execution runs on into mcinl1130_3dnow
20123 .globl mcinl1130_3dnow
20124 .globl _mcinl1130_3dnow
20125 mcinl1130_3dnow:
20126 _mcinl1130_3dnow:
20127 .equiv mci1130_nri, 8
20128 .equiv mci1130_iinr, 12
20129 .equiv mci1130_jindex, 16
20130 .equiv mci1130_jjnr, 20
20131 .equiv mci1130_shift, 24
20132 .equiv mci1130_shiftvec, 28
20133 .equiv mci1130_gid, 32
20134 .equiv mci1130_pos, 36
20135 .equiv mci1130_charge, 40
20136 .equiv mci1130_facel, 44
20137 .equiv mci1130_Vc, 48
20138 .equiv mci1130_type, 52
20139 .equiv mci1130_ntype, 56
20140 .equiv mci1130_nbfp, 60
20141 .equiv mci1130_Vnb, 64
20142 ;# stack offsets for local variables
20143 .equiv mci1130_is3, 0
20144 .equiv mci1130_ii3, 4
20145 .equiv mci1130_ixO, 8
20146 .equiv mci1130_iyO, 12
20147 .equiv mci1130_izO, 16
20148 .equiv mci1130_ixH, 20
20149 .equiv mci1130_iyH, 28
20150 .equiv mci1130_izH, 36
20151 .equiv mci1130_qqOO, 44
20152 .equiv mci1130_qqOH, 52
20153 .equiv mci1130_qqHH, 60
20154 .equiv mci1130_c6, 68
20155 .equiv mci1130_c12, 76
20156 .equiv mci1130_vctot, 84
20157 .equiv mci1130_vnbtot, 92
20158 .equiv mci1130_innerjjnr, 100
20159 .equiv mci1130_innerk, 104
20160 push ebp
20161 mov ebp,esp
20162 push eax
20163 push ebx
20164 push ecx
20165 push edx
20166 push esi
20167 push edi
20168 sub esp, 108 ;# local stack space
20169 femms
20170 ;# assume we have at least one i particle - start directly
20172 mov ecx, [ebp + mci1130_iinr] ;# ecx = pointer into iinr[]
20173 mov ebx, [ecx] ;# ebx=ii
20175 mov edx, [ebp + mci1130_charge]
20176 movd mm1, [ebp + mci1130_facel] ;# mm1=facel
20177 movd mm2, [edx + ebx*4] ;# mm2=charge[ii0] (O)
20178 movd mm3, [edx + ebx*4 + 4] ;# mm2=charge[ii0+1] (H)
20179 movq mm4, mm2
20180 pfmul mm4, mm1
20181 movq mm6, mm3
20182 pfmul mm6, mm1
20183 movq mm5, mm4
20184 pfmul mm4, mm2 ;# mm4=qqOO*facel
20185 pfmul mm5, mm3 ;# mm5=qqOH*facel
20186 pfmul mm6, mm3 ;# mm6=qqHH*facel
20187 punpckldq mm5,mm5 ;# spread to both halves
20188 punpckldq mm6,mm6 ;# spread to both halves
20189 movq [esp + mci1130_qqOO], mm4
20190 movq [esp + mci1130_qqOH], mm5
20191 movq [esp + mci1130_qqHH], mm6
20192 mov edx, [ebp + mci1130_type]
20193 mov ecx, [edx + ebx*4]
20194 shl ecx, 1
20195 mov edx, ecx
20196 imul ecx, [ebp + mci1130_ntype]
20197 add edx, ecx
20198 mov eax, [ebp + mci1130_nbfp]
20199 movd mm0, [eax + edx*4]
20200 movd mm1, [eax + edx*4 + 4]
20201 movq [esp + mci1130_c6], mm0
20202 movq [esp + mci1130_c12], mm1
20204 .mci1130_outer:
20205 mov eax, [ebp + mci1130_shift] ;# eax = pointer into shift[]
20206 mov ebx, [eax] ;# ebx=shift[n]
20207 add dword ptr [ebp + mci1130_shift], 4 ;# advance pointer one step
20209 lea ebx, [ebx + ebx*2] ;# ebx=3*is
20210 mov [esp + mci1130_is3],ebx ;# store is3
20212 mov eax, [ebp + mci1130_shiftvec] ;# eax = base of shiftvec[]
20214 movq mm5, [eax + ebx*4] ;# move shX/shY to mm5 and shZ to mm6.
20215 movd mm6, [eax + ebx*4 + 8]
20216 movq mm0, mm5
20217 movq mm1, mm5
20218 movq mm2, mm6
20219 punpckldq mm0,mm0 ;# also expand shX,Y,Z in mm0--mm2.
20220 punpckhdq mm1,mm1
20221 punpckldq mm2,mm2
20223 mov ecx, [ebp + mci1130_iinr] ;# ecx = pointer into iinr[]
20224 add dword ptr [ebp + mci1130_iinr], 4 ;# advance pointer
20225 mov ebx, [ecx] ;# ebx=ii
20227 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
20228 mov eax, [ebp + mci1130_pos] ;# eax = base of pos[]
20230 pfadd mm5, [eax + ebx*4] ;# ix = shX + posX (and iy too)
20231 movd mm7, [eax + ebx*4 + 8] ;# cant use direct memory add for 4 bytes (iz)
20232 mov [esp + mci1130_ii3], ebx ;# (use mm7 as temp. storage for iz.)
20233 pfadd mm6, mm7
20234 movq [esp + mci1130_ixO], mm5
20235 movq [esp + mci1130_izO], mm6
20237 movd mm3, [eax + ebx*4 + 12]
20238 movd mm4, [eax + ebx*4 + 16]
20239 movd mm5, [eax + ebx*4 + 20]
20240 punpckldq mm3, [eax + ebx*4 + 24]
20241 punpckldq mm4, [eax + ebx*4 + 28]
20242 punpckldq mm5, [eax + ebx*4 + 32] ;# coords of H1 in low mm3-mm5, H2 in high
20244 pfadd mm0, mm3
20245 pfadd mm1, mm4
20246 pfadd mm2, mm5
20247 movq [esp + mci1130_ixH], mm0
20248 movq [esp + mci1130_iyH], mm1
20249 movq [esp + mci1130_izH], mm2
20251 ;# clear vctot and i forces
20252 pxor mm7,mm7
20253 movq [esp + mci1130_vctot], mm7
20254 movq [esp + mci1130_vnbtot], mm7
20256 mov eax, [ebp + mci1130_jindex]
20257 mov ecx, [eax] ;# jindex[n]
20258 mov edx, [eax + 4] ;# jindex[n+1]
20259 add dword ptr [ebp + mci1130_jindex], 4
20260 sub edx, ecx ;# number of innerloop atoms
20261 mov [esp + mci1130_innerk], edx ;# number of innerloop atoms
20263 mov esi, [ebp + mci1130_pos]
20264 mov eax, [ebp + mci1130_jjnr]
20265 shl ecx, 2
20266 add eax, ecx
20267 mov [esp + mci1130_innerjjnr], eax ;# pointer to jjnr[nj0]
20268 .mci1130_inner_loop:
20269 ;# a single j particle iteration here - compare with the unrolled code for comments.
20270 mov eax, [esp + mci1130_innerjjnr]
20271 mov eax, [eax] ;# eax=jnr offset
20272 add dword ptr [esp + mci1130_innerjjnr], 4 ;# advance pointer
20274 movd mm6, [esp + mci1130_qqOO]
20275 movq mm7, [esp + mci1130_qqOH]
20277 lea eax, [eax + eax*2]
20278 movq mm0, [esi + eax*4]
20279 movd mm1, [esi + eax*4 + 8]
20280 ;# copy & expand to mm2-mm4 for the H interactions
20281 movq mm2, mm0
20282 movq mm3, mm0
20283 movq mm4, mm1
20284 punpckldq mm2,mm2
20285 punpckhdq mm3,mm3
20286 punpckldq mm4,mm4
20288 pfsubr mm0, [esp + mci1130_ixO]
20289 pfsubr mm1, [esp + mci1130_izO]
20291 pfmul mm0,mm0
20292 pfmul mm1,mm1
20293 pfacc mm0, mm0
20294 pfadd mm0, mm1 ;# mm0=rsqO
20296 punpckldq mm2, mm2
20297 punpckldq mm3, mm3
20298 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
20299 pfsubr mm2, [esp + mci1130_ixH]
20300 pfsubr mm3, [esp + mci1130_iyH]
20301 pfsubr mm4, [esp + mci1130_izH] ;# mm2-mm4 is dxH-dzH
20303 pfmul mm2,mm2
20304 pfmul mm3,mm3
20305 pfmul mm4,mm4
20307 pfadd mm3,mm2
20308 pfadd mm3,mm4 ;# mm3=rsqH
20310 pfrsqrt mm1,mm0
20312 movq mm2,mm1
20313 pfmul mm1,mm1
20314 pfrsqit1 mm1,mm0
20315 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
20316 movq mm4, mm1
20317 pfmul mm4, mm4 ;# mm4=invsq
20319 movq mm2, mm4
20320 pfmul mm2, mm4
20321 pfmul mm2, mm4
20322 movq mm0, mm2
20323 pfmul mm0,mm0
20324 pfmul mm2, [esp + mci1130_c6]
20325 pfmul mm0, [esp + mci1130_c12]
20326 movq mm5, mm0
20327 pfsub mm5, mm2 ;# vnb
20329 ;# calculate potential and scalar force
20330 pfmul mm6, mm1 ;# mm6=vcoul
20331 ;# update nb potential
20332 pfadd mm5, [esp + mci1130_vnbtot]
20333 movq [esp + mci1130_vnbtot], mm5
20335 pfrsqrt mm5, mm3
20336 pswapd mm3,mm3
20337 pfrsqrt mm2, mm3
20338 pswapd mm3,mm3
20339 punpckldq mm5,mm2 ;# seeds are in mm5 now, and rsq in mm3
20341 movq mm2, mm5
20342 pfmul mm5,mm5
20343 pfrsqit1 mm5,mm3
20344 pfrcpit2 mm5,mm2 ;# mm5=invsqrt
20345 pfmul mm7, mm5 ;# mm7=vcoul
20346 ;# update vctot
20347 pfadd mm7, mm6
20348 pfadd mm7, [esp + mci1130_vctot]
20349 movq [esp + mci1130_vctot], mm7
20351 ;# interactions with j H1
20352 movq mm0, [esi + eax*4 + 12]
20353 movd mm1, [esi + eax*4 + 20]
20354 ;# copy & expand to mm2-mm4 for the H interactions
20355 movq mm2, mm0
20356 movq mm3, mm0
20357 movq mm4, mm1
20358 punpckldq mm2,mm2
20359 punpckhdq mm3,mm3
20360 punpckldq mm4,mm4
20362 movd mm6, [esp + mci1130_qqOH]
20363 movq mm7, [esp + mci1130_qqHH]
20365 pfsubr mm0, [esp + mci1130_ixO]
20366 pfsubr mm1, [esp + mci1130_izO]
20368 pfmul mm0,mm0
20369 pfmul mm1,mm1
20370 pfacc mm0, mm1
20371 pfadd mm0, mm1 ;# mm0=rsqO
20373 punpckldq mm2, mm2
20374 punpckldq mm3, mm3
20375 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
20376 pfsubr mm2, [esp + mci1130_ixH]
20377 pfsubr mm3, [esp + mci1130_iyH]
20378 pfsubr mm4, [esp + mci1130_izH] ;# mm2-mm4 is dxH-dzH
20380 pfmul mm2,mm2
20381 pfmul mm3,mm3
20382 pfmul mm4,mm4
20384 pfadd mm3,mm2
20385 pfadd mm3,mm4 ;# mm3=rsqH
20387 pfrsqrt mm1,mm0
20389 movq mm2,mm1
20390 pfmul mm1,mm1
20391 pfrsqit1 mm1,mm0
20392 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
20393 ;# calculate potential and scalar force
20394 pfmul mm6, mm1 ;# mm6=vcoul
20396 pfrsqrt mm5, mm3
20397 pswapd mm3,mm3
20398 pfrsqrt mm2, mm3
20399 pswapd mm3,mm3
20400 punpckldq mm5,mm2 ;# seeds are in mm5 now, and rsq in mm3
20402 movq mm2, mm5
20403 pfmul mm5,mm5
20404 pfrsqit1 mm5,mm3
20405 pfrcpit2 mm5,mm2 ;# mm5=invsqrt
20406 pfmul mm7, mm5 ;# mm7=vcoul
20407 ;# update vctot
20408 pfadd mm7, mm6
20409 pfadd mm7, [esp + mci1130_vctot]
20410 movq [esp + mci1130_vctot], mm7
20412 ;# interactions with j H2
20413 movq mm0, [esi + eax*4 + 24]
20414 movd mm1, [esi + eax*4 + 32]
20415 ;# copy & expand to mm2-mm4 for the H interactions
20416 movq mm2, mm0
20417 movq mm3, mm0
20418 movq mm4, mm1
20419 punpckldq mm2,mm2
20420 punpckhdq mm3,mm3
20421 punpckldq mm4,mm4
20423 movd mm6, [esp + mci1130_qqOH]
20424 movq mm7, [esp + mci1130_qqHH]
20426 pfsubr mm0, [esp + mci1130_ixO]
20427 pfsubr mm1, [esp + mci1130_izO]
20429 pfmul mm0,mm0
20430 pfmul mm1,mm1
20431 pfacc mm0, mm1
20432 pfadd mm0, mm1 ;# mm0=rsqO
20434 punpckldq mm2, mm2
20435 punpckldq mm3, mm3
20436 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
20437 pfsubr mm2, [esp + mci1130_ixH]
20438 pfsubr mm3, [esp + mci1130_iyH]
20439 pfsubr mm4, [esp + mci1130_izH] ;# mm2-mm4 is dxH-dzH
20441 pfmul mm2,mm2
20442 pfmul mm3,mm3
20443 pfmul mm4,mm4
20445 pfadd mm3,mm2
20446 pfadd mm3,mm4 ;# mm3=rsqH
20448 pfrsqrt mm1,mm0
20450 movq mm2,mm1
20451 pfmul mm1,mm1
20452 pfrsqit1 mm1,mm0
20453 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
20454 ;# calculate potential and scalar force
20455 pfmul mm6, mm1 ;# mm6=vcoul
20457 pfrsqrt mm5, mm3
20458 pswapd mm3,mm3
20459 pfrsqrt mm2, mm3
20460 pswapd mm3,mm3
20461 punpckldq mm5,mm2 ;# seeds are in mm5 now, and rsq in mm3.
20463 movq mm2, mm5
20464 pfmul mm5,mm5
20465 pfrsqit1 mm5,mm3
20466 pfrcpit2 mm5,mm2 ;# mm5=invsqrt
20467 pfmul mm7, mm5 ;# mm7=vcoul
20469 ;# update vctot
20470 pfadd mm7, mm6
20471 pfadd mm7, [esp + mci1130_vctot]
20472 movq [esp + mci1130_vctot], mm7
20474 ;# done - one more?
20475 dec dword ptr [esp + mci1130_innerk]
20476 jz .mci1130_updateouterdata
20477 jmp .mci1130_inner_loop
20478 .mci1130_updateouterdata:
20479 mov edx, [ebp + mci1130_gid] ;# get group index for this i particle
20480 mov edx, [edx]
20481 add dword ptr [ebp + mci1130_gid], 4 ;# advance pointer
20483 movq mm7, [esp + mci1130_vctot]
20484 pfacc mm7,mm7 ;# get and sum the two parts of total potential
20486 mov eax, [ebp + mci1130_Vc]
20487 movd mm6, [eax + edx*4]
20488 pfadd mm6, mm7
20489 movd [eax + edx*4], mm6 ;# increment vc[gid]
20491 movq mm7, [esp + mci1130_vnbtot]
20492 pfacc mm7,mm7 ;# get and sum the two parts of total potential
20494 mov eax, [ebp + mci1130_Vnb]
20495 movd mm6, [eax + edx*4]
20496 pfadd mm6, mm7
20497 movd [eax + edx*4], mm6 ;# increment vnbtot[gid]
20498 ;# finish if last
20499 dec dword ptr [ebp + mci1130_nri]
20500 jz .mci1130_end
20501 ;# not last, iterate once more!
20502 jmp .mci1130_outer
20503 .mci1130_end:
20504 femms
20505 add esp, 108
20506 pop edi
20507 pop esi
20508 pop edx
20509 pop ecx
20510 pop ebx
20511 pop eax
20512 leave
;# ----------------------------------------------------------------------
;# mcinl3000_3dnow / _mcinl3000_3dnow
;#
;# Energy-only inner loop with a tabulated Coulomb interaction.
;# For each of the nri i particles it walks that particle's neighbour
;# list (jjnr[jindex[n]] .. jjnr[jindex[n+1]-1]), looks up the potential
;# in VFtab at r*tabscale, multiplies by facel*charge[i]*charge[j] and
;# adds the sum to Vc[gid[n]]. No force is computed or stored.
;#
;# All arguments are read from the stack relative to ebp; several of the
;# pointer/count argument slots are modified in place while iterating.
;# eax, ebx, ecx, edx, esi and edi are pushed on entry and popped on exit.
;# The outer loop body runs before nri is tested, so nri must be >= 1.
;# The j loop is unrolled by two, with a single-particle tail.
;# ----------------------------------------------------------------------
20518 .globl mcinl3000_3dnow
20519 .globl _mcinl3000_3dnow
20520 mcinl3000_3dnow:
20521 _mcinl3000_3dnow:
;# stack offsets for the arguments, relative to ebp
20522 .equiv mci3000_nri, 8 ;# number of outer iterations; counted down in place
20523 .equiv mci3000_iinr, 12 ;# pointer into iinr[]; advanced in place
20524 .equiv mci3000_jindex, 16 ;# pointer into jindex[]; advanced in place
20525 .equiv mci3000_jjnr, 20 ;# base of jjnr[] (neighbour indices)
20526 .equiv mci3000_shift, 24 ;# pointer into shift[]; advanced in place
20527 .equiv mci3000_shiftvec, 28 ;# base of shiftvec[], three floats per entry
20528 .equiv mci3000_gid, 32 ;# pointer into gid[]; advanced in place
20529 .equiv mci3000_pos, 36 ;# base of pos[], three floats per particle
20530 .equiv mci3000_charge, 40 ;# base of charge[]
20531 .equiv mci3000_facel, 44 ;# float factor multiplied into charge[ii]
20532 .equiv mci3000_Vc, 48 ;# base of Vc[], indexed by the gid value
20533 .equiv mci3000_tabscale, 52 ;# float: r*tabscale is the table coordinate
20534 .equiv mci3000_VFtab, 56 ;# base of the potential table
20535 ;# stack offsets for local variables
20536 .equiv mci3000_is3, 0 ;# 3*shift index; written but not read back here
20537 .equiv mci3000_ii3, 4 ;# 3*ii; written but not read back here
20538 .equiv mci3000_ix, 8 ;# ix and iy are stored/loaded as one 64-bit pair
20539 .equiv mci3000_iy, 12
20540 .equiv mci3000_iz, 16
20541 .equiv mci3000_iq, 20 ;# 8 bytes: facel*charge[ii] in both halves
20542 .equiv mci3000_vctot, 28 ;# 8 bytes: two partial potential sums
20543 .equiv mci3000_n1, 36 ;# 8 bytes: integer table indices for the pair
20544 .equiv mci3000_tsc, 44 ;# 8 bytes: tabscale in both halves
20545 .equiv mci3000_ntia, 52 ;# not referenced in this routine
20546 .equiv mci3000_innerjjnr, 56 ;# running pointer into jjnr[]
20547 .equiv mci3000_innerk, 60 ;# remaining j count minus two (see below)
;# prologue: set up the frame, save the integer registers, reserve locals
20548 push ebp
20549 mov ebp,esp
20550 push eax
20551 push ebx
20552 push ecx
20553 push edx
20554 push esi
20555 push edi
20556 sub esp, 64 ;# local stack space
20557 femms
20558 ;# move data to local stack
20559 movd mm3, [ebp + mci3000_tabscale]
20560 punpckldq mm3,mm3
20561 movq [esp + mci3000_tsc], mm3
20562 ;# assume we have at least one i particle - start directly
20563 .mci3000_outer:
20564 mov eax, [ebp + mci3000_shift] ;# eax = pointer into shift[]
20565 mov ebx, [eax] ;# ebx=shift[n]
20566 add dword ptr [ebp + mci3000_shift], 4 ;# advance pointer one step
20568 lea ebx, [ebx + ebx*2] ;# ebx=3*is
20569 mov [esp + mci3000_is3],ebx ;# store is3
20571 mov eax, [ebp + mci3000_shiftvec] ;# eax = base of shiftvec[]
20573 movq mm0, [eax + ebx*4] ;# move shX/shY to mm0 and shZ to mm1
20574 movd mm1, [eax + ebx*4 + 8]
20576 mov ecx, [ebp + mci3000_iinr] ;# ecx = pointer into iinr[]
20577 add dword ptr [ebp + mci3000_iinr], 4 ;# advance pointer
20578 mov ebx, [ecx] ;# ebx=ii
20580 mov edx, [ebp + mci3000_charge]
20581 movd mm2, [edx + ebx*4] ;# mm2=charge[ii]
;# the memory operand is 64 bits wide, so the high half is multiplied by
;# the adjacent argument slot; only the low half survives the punpckldq.
20582 pfmul mm2, [ebp + mci3000_facel]
20583 punpckldq mm2,mm2 ;# spread to both halves
20584 movq [esp + mci3000_iq], mm2 ;# iq =facel*charge[ii]
20586 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
20587 mov eax, [ebp + mci3000_pos] ;# eax = base of pos[]
20589 pfadd mm0, [eax + ebx*4] ;# ix = shX + posX (and iy too)
20590 movd mm3, [eax + ebx*4 + 8] ;# cant use direct memory add for 4 bytes (iz)
20591 mov [esp + mci3000_ii3], ebx
20592 pfadd mm1, mm3
20593 movq [esp + mci3000_ix], mm0
20594 movd [esp + mci3000_iz], mm1
20596 ;# clear the total potential (no forces are kept in this routine)
20597 pxor mm7,mm7
20598 movq [esp + mci3000_vctot], mm7
20600 mov eax, [ebp + mci3000_jindex]
20601 mov ecx, [eax] ;# jindex[n]
20602 mov edx, [eax + 4] ;# jindex[n+1]
20603 add dword ptr [ebp + mci3000_jindex], 4
20604 sub edx, ecx ;# number of innerloop atoms
20606 mov esi, [ebp + mci3000_pos]
20607 mov eax, [ebp + mci3000_jjnr]
20608 shl ecx, 2
20609 add eax, ecx
20610 mov [esp + mci3000_innerjjnr], eax ;# pointer to jjnr[nj0]
;# innerk holds (remaining j particles - 2); the flags from the sub are
;# still valid at the jge because mov does not modify them.
20611 sub edx, 2
20612 mov [esp + mci3000_innerk], edx ;# innerk = number of innerloop atoms - 2
20613 jge .mci3000_unroll_loop
20614 jmp .mci3000_finish_inner
20615 .mci3000_unroll_loop:
20616 ;# paired innerloop starts here
20617 mov ecx, [esp + mci3000_innerjjnr] ;# pointer to jjnr[k]
20618 mov eax, [ecx]
20619 mov ebx, [ecx + 4] ;# eax/ebx=jnr
20620 add dword ptr [esp + mci3000_innerjjnr], 8 ;# advance pointer (unrolled 2)
20621 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
20623 mov ecx, [ebp + mci3000_charge] ;# base of charge[]
20624 movq mm5, [esp + mci3000_iq]
20625 movd mm3, [ecx + eax*4] ;# charge[jnr1]
20626 punpckldq mm3, [ecx + ebx*4] ;# move charge 2 to high part of mm3
20627 pfmul mm3,mm5 ;# mm3 now has qq for both particles
20629 lea eax, [eax + eax*2] ;# replace jnr with j3
20630 lea ebx, [ebx + ebx*2]
;# NOTE(review): esi already holds pos from before the loop and nothing
;# in the loop changes it, so this reload looks redundant.
20632 mov esi, [ebp + mci3000_pos]
20634 movq mm0, [esp + mci3000_ix]
20635 movd mm1, [esp + mci3000_iz]
20636 movq mm4, [esi + eax*4] ;# fetch first j coordinates
20637 movd mm5, [esi + eax*4 + 8]
20638 pfsubr mm4,mm0 ;# dr = ir - jr
20639 pfsubr mm5,mm1
20640 pfmul mm4,mm4 ;# square dx,dy,dz
20641 pfmul mm5,mm5
20642 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
20643 pfacc mm4, mm5 ;# first rsq in lower mm4
20645 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
20646 movd mm7, [esi + ebx*4 + 8]
20648 pfsubr mm6,mm0 ;# dr = ir - jr
20649 pfsubr mm7,mm1
20650 pfmul mm6,mm6 ;# square dx,dy,dz
20651 pfmul mm7,mm7
20652 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
20653 pfacc mm6, mm7 ;# second rsq in lower mm6
20655 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
20656 pfrsqrt mm1, mm6
20659 punpckldq mm0,mm1
20660 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs.
20661 movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision.
20662 pfmul mm0,mm0
20663 pfrsqit1 mm0,mm4
20664 pfrcpit2 mm0,mm2
20665 pfmul mm4, mm0 ;# r = rsq * invsqrt
20666 movq mm1, mm4
20667 ;# mm0 is invsqrt, and mm1 r.
20668 ;# do the potential lookup (no scalar force is formed here)
20669 pfmul mm1, [esp + mci3000_tsc] ;# mm1=rt
;# NOTE(review): pf2iw produces a 16-bit integer sign-extended to 32 bits,
;# so the table index range is limited accordingly - confirm table size.
20670 pf2iw mm4,mm1
20671 movq [esp + mci3000_n1], mm4
20672 pi2fd mm4,mm4
20673 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
20675 movq mm2,mm1
20676 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
20678 mov edx, [ebp + mci3000_VFtab]
20679 mov ecx, [esp + mci3000_n1]
20680 shl ecx, 2 ;# four floats per table point: byte offset is 16*n0
20681 ;# coulomb table
20682 ;# load all the table values we need
20683 movd mm4, [edx + ecx*4]
20684 movd mm5, [edx + ecx*4 + 4]
20685 movd mm6, [edx + ecx*4 + 8]
20686 movd mm7, [edx + ecx*4 + 12]
20687 mov ecx, [esp + mci3000_n1 + 4] ;# table index of the second j particle
20688 shl ecx, 2
20689 punpckldq mm4, [edx + ecx*4]
20690 punpckldq mm5, [edx + ecx*4 + 4]
20691 punpckldq mm6, [edx + ecx*4 + 8]
20692 punpckldq mm7, [edx + ecx*4 + 12]
;# VV = mm4 + eps*(mm5 + mm6*eps + mm7*eps2), evaluated for both pairs
20694 pfmul mm6, mm1 ;# mm6 = Geps
20695 pfmul mm7, mm2 ;# mm7 = Heps2
20697 pfadd mm5, mm6
20698 pfadd mm5, mm7 ;# mm5 = Fp
20700 pfmul mm5, mm1 ;# mm5=eps*Fp
20701 pfadd mm5, mm4 ;# mm5= VV
20703 pfmul mm5, mm3 ;# vcoul=qq*VV
20704 ;# at this point mm5 contains vcoul
20705 ;# increment vcoul - then we can get rid of mm5.
20706 ;# update vctot
20707 pfadd mm5, [esp + mci3000_vctot] ;# add the earlier value
20708 movq [esp + mci3000_vctot], mm5 ;# store the sum
20710 ;# should we do one more iteration?
20711 sub dword ptr [esp + mci3000_innerk], 2
20712 jl .mci3000_finish_inner
20713 jmp .mci3000_unroll_loop
20714 .mci3000_finish_inner:
;# innerk is now -1 when one j particle is left and -2 when none is left;
;# bit 0 tells the two cases apart.
20715 and dword ptr [esp + mci3000_innerk], 1
20716 jnz .mci3000_single_inner
20717 jmp .mci3000_updateouterdata
20718 .mci3000_single_inner:
20719 ;# a single j particle iteration here - compare with the unrolled code for comments.
20720 mov eax, [esp + mci3000_innerjjnr]
20721 mov eax, [eax] ;# eax=jnr offset
20723 mov ecx, [ebp + mci3000_charge]
20724 movd mm5, [esp + mci3000_iq]
20725 movd mm3, [ecx + eax*4]
20726 pfmul mm3, mm5 ;# mm3=qq
20728 mov esi, [ebp + mci3000_pos]
20729 lea eax, [eax + eax*2]
20731 movq mm0, [esp + mci3000_ix]
20732 movd mm1, [esp + mci3000_iz]
20733 movq mm4, [esi + eax*4]
20734 movd mm5, [esi + eax*4 + 8]
20735 pfsubr mm4, mm0
20736 pfsubr mm5, mm1
20737 pfmul mm4,mm4
20738 pfmul mm5,mm5
20739 pfacc mm4, mm5
20740 pfacc mm4, mm5 ;# mm4=rsq (low half)
20742 pfrsqrt mm0,mm4
20743 movq mm2,mm0
20744 pfmul mm0,mm0
20745 pfrsqit1 mm0,mm4
20746 pfrcpit2 mm0,mm2 ;# mm0=invsqrt
20747 pfmul mm4, mm0
20748 movq mm1, mm4
20749 ;# mm0 is invsqrt, and mm1 r.
20751 ;# calculate the potential (only the low half is meaningful here)
20752 pfmul mm1, [esp + mci3000_tsc] ;# mm1=rt
20753 pf2iw mm4,mm1
20754 movd [esp + mci3000_n1], mm4
20755 pi2fd mm4,mm4
20756 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
20758 movq mm2,mm1
20759 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
20761 ;# coulomb table
20762 mov edx, [ebp + mci3000_VFtab]
20763 mov ecx, [esp + mci3000_n1]
20764 shl ecx, 2
20765 ;# load all the table values we need
20766 movd mm4, [edx + ecx*4]
20767 movd mm5, [edx + ecx*4 + 4]
20768 movd mm6, [edx + ecx*4 + 8]
20769 movd mm7, [edx + ecx*4 + 12]
20771 pfmul mm6, mm1 ;# mm6 = Geps
20772 pfmul mm7, mm2 ;# mm7 = Heps2
20774 pfadd mm5, mm6
20775 pfadd mm5, mm7 ;# mm5 = Fp
20777 pfmul mm5, mm1 ;# mm5=eps*Fp
20778 pfadd mm5, mm4 ;# mm5= VV
20780 pfmul mm5, mm3 ;# vcoul=qq*VV
20782 ;# at this point mm5 contains vcoul
20783 ;# increment vcoul - then we can get rid of mm5
20784 ;# update vctot
20785 pfadd mm5, [esp + mci3000_vctot] ;# add the earlier value
20786 movq [esp + mci3000_vctot], mm5 ;# store the sum
20788 .mci3000_updateouterdata:
20789 mov edx, [ebp + mci3000_gid] ;# get group index for this i particle
20790 mov edx, [edx]
20791 add dword ptr [ebp + mci3000_gid], 4 ;# advance pointer
20793 movq mm7, [esp + mci3000_vctot]
20794 pfacc mm7,mm7 ;# get and sum the two parts of total potential
20796 mov eax, [ebp + mci3000_Vc]
20797 movd mm6, [eax + edx*4]
20798 pfadd mm6, mm7
20799 movd [eax + edx*4], mm6 ;# increment vc[gid]
20801 ;# finish if last
20802 mov ecx, [ebp + mci3000_nri]
20803 dec ecx
20804 jecxz .mci3000_end
20805 ;# not last, iterate once more!
20806 mov [ebp + mci3000_nri], ecx ;# the decremented count is written back only when looping
20807 jmp .mci3000_outer
20808 .mci3000_end:
;# epilogue: leave MMX state, release locals, restore saved registers
20809 femms
20810 add esp, 64
20811 pop edi
20812 pop esi
20813 pop edx
20814 pop ecx
20815 pop ebx
20816 pop eax
;# NOTE(review): no ret is visible after leave in this listing; as shown,
;# control would fall through into the following routine - confirm
;# against the upstream file.
20817 leave
;# ----------------------------------------------------------------------
;# mcinl3010_3dnow / _mcinl3010_3dnow
;#
;# Energy-only inner loop with a tabulated Coulomb interaction, where one
;# outer iteration covers several consecutive i atoms. For outer
;# iteration n the atom count is read from the nsatoms argument and the
;# atoms are ii, ii+1, ... starting at ii = iinr[n]. Each of them is run
;# against the same neighbour list (jjnr[jindex[n]] .. jjnr[jindex[n+1]-1])
;# and all contributions are summed into Vc[gid[n]]. No force is computed
;# or stored.
;#
;# All arguments are read from the stack relative to ebp; several of the
;# pointer/count argument slots are modified in place while iterating.
;# eax, ebx, ecx, edx, esi and edi are pushed on entry and popped on exit.
;# The outer loop body runs before nri is tested, so nri must be >= 1.
;# ----------------------------------------------------------------------
20823 .globl mcinl3010_3dnow
20824 .globl _mcinl3010_3dnow
20825 mcinl3010_3dnow:
20826 _mcinl3010_3dnow:
;# stack offsets for the arguments, relative to ebp
20827 .equiv mci3010_nri, 8 ;# number of outer iterations; counted down in place
20828 .equiv mci3010_iinr, 12 ;# pointer into iinr[]; advanced in place
20829 .equiv mci3010_jindex, 16 ;# pointer into jindex[]; advanced in place
20830 .equiv mci3010_jjnr, 20 ;# base of jjnr[] (neighbour indices)
20831 .equiv mci3010_shift, 24 ;# pointer into shift[]; advanced in place
20832 .equiv mci3010_shiftvec, 28 ;# base of shiftvec[], three floats per entry
20833 .equiv mci3010_gid, 32 ;# pointer into gid[]; advanced in place
20834 .equiv mci3010_pos, 36 ;# base of pos[], three floats per particle
20835 .equiv mci3010_charge, 40 ;# base of charge[]
20836 .equiv mci3010_facel, 44 ;# float factor multiplied into charge[ii]
20837 .equiv mci3010_Vc, 48 ;# base of Vc[], indexed by the gid value
20838 .equiv mci3010_tabscale, 52 ;# float: r*tabscale is the table coordinate
20839 .equiv mci3010_VFtab, 56 ;# base of the potential table
20840 .equiv mci3010_nsatoms, 60 ;# pointer into nsatoms[]; advanced in place
20841 ;# stack offsets for local variables
20842 .equiv mci3010_is3, 0 ;# 3*shift index; written but not read back here
20843 .equiv mci3010_ii3, 4 ;# 3*ii; written but not read back here
20844 .equiv mci3010_shX, 8 ;# shX and shY are stored/loaded as one 64-bit pair
20845 .equiv mci3010_shY, 12
20846 .equiv mci3010_shZ, 16
20847 .equiv mci3010_ix, 20 ;# ix and iy are stored/loaded as one 64-bit pair
20848 .equiv mci3010_iy, 24
20849 .equiv mci3010_iz, 28
20850 .equiv mci3010_iq, 32 ;# 8 bytes: facel*charge[ii] in both halves
20851 .equiv mci3010_vctot, 40 ;# 8 bytes: two partial potential sums
20852 .equiv mci3010_n1, 48 ;# 8 bytes: integer table indices for the pair
20853 .equiv mci3010_tsc, 56 ;# 8 bytes: tabscale in both halves
20854 .equiv mci3010_innerjjnr0, 64 ;# start of this iteration's neighbour list
20855 .equiv mci3010_innerk0, 68 ;# length of this iteration's neighbour list
20856 .equiv mci3010_innerjjnr, 72 ;# running pointer into jjnr[]
20857 .equiv mci3010_innerk, 76 ;# remaining j count minus two (see below)
20858 .equiv mci3010_nscoul, 80 ;# i atoms still to process in this iteration
20859 .equiv mci3010_solnr, 84 ;# index of the next i atom to process
;# prologue: set up the frame, save the integer registers, reserve locals
20860 push ebp
20861 mov ebp,esp
20862 push eax
20863 push ebx
20864 push ecx
20865 push edx
20866 push esi
20867 push edi
20868 sub esp, 88 ;# local stack space
20869 femms
;# bias the nsatoms pointer by two ints; together with the 12-byte step
;# in the outer loop, every read below picks the third int of a
;# three-int record.
20871 add dword ptr [ebp + mci3010_nsatoms], 8
20872 movd mm3, [ebp + mci3010_tabscale]
20873 punpckldq mm3,mm3
20874 movq [esp + mci3010_tsc], mm3
20876 ;# assume we have at least one i particle - start directly
20877 .mci3010_outer:
20878 mov eax, [ebp + mci3010_shift] ;# eax = pointer into shift[]
20879 mov ebx, [eax] ;# ebx=shift[n]
20880 add dword ptr [ebp + mci3010_shift], 4 ;# advance pointer one step
20882 lea ebx, [ebx + ebx*2] ;# ebx=3*is
20883 mov [esp + mci3010_is3],ebx ;# store is3
20885 mov eax, [ebp + mci3010_shiftvec] ;# eax = base of shiftvec[]
20887 movq mm0, [eax + ebx*4] ;# move shX/shY to mm0 and shZ to mm1
20888 movd mm1, [eax + ebx*4 + 8]
20889 movq [esp + mci3010_shX], mm0
20890 movd [esp + mci3010_shZ], mm1
20892 mov ecx, [ebp + mci3010_iinr] ;# ecx = pointer into iinr[]
20893 add dword ptr [ebp + mci3010_iinr], 4 ;# advance pointer
20894 mov ebx, [ecx] ;# ebx=ii
20896 mov eax, [ebp + mci3010_nsatoms]
20897 mov ecx, [eax] ;# ecx = atom count for this outer iteration
20898 add dword ptr [ebp + mci3010_nsatoms], 12 ;# step to the next three-int record
20899 mov [esp + mci3010_nscoul], ecx
20901 ;# clear potential
20902 pxor mm7,mm7
20903 movq [esp + mci3010_vctot], mm7
20904 mov [esp + mci3010_solnr], ebx ;# first i atom of this iteration
20906 mov eax, [ebp + mci3010_jindex]
20907 mov ecx, [eax] ;# jindex[n]
20908 mov edx, [eax + 4] ;# jindex[n+1]
20909 add dword ptr [ebp + mci3010_jindex], 4
20910 sub edx, ecx ;# number of innerloop atoms
20911 mov eax, [ebp + mci3010_jjnr]
20912 shl ecx, 2
20913 add eax, ecx
20914 mov [esp + mci3010_innerjjnr0], eax ;# pointer to jjnr[nj0]
20916 mov [esp + mci3010_innerk0], edx ;# number of innerloop atoms
20917 mov esi, [ebp + mci3010_pos]
;# with a zero atom count, skip straight to the Vc update
20918 mov ecx, [esp + mci3010_nscoul]
20919 cmp ecx, 0
20920 jnz .mci3010_mno_coul
20921 jmp .mci3010_last_mno
20922 .mci3010_mno_coul:
;# per-atom setup: take the next i atom, form iq and the shifted position,
;# then rewind the neighbour list to its start for this atom.
20923 mov ebx, [esp + mci3010_solnr]
20924 inc dword ptr [esp + mci3010_solnr]
20925 mov edx, [ebp + mci3010_charge]
20926 movd mm2, [edx + ebx*4] ;# mm2=charge[ii]
;# the memory operand is 64 bits wide, so the high half is multiplied by
;# the adjacent argument slot; only the low half survives the punpckldq.
20927 pfmul mm2, [ebp + mci3010_facel]
20928 punpckldq mm2,mm2 ;# spread to both halves
20929 movq [esp + mci3010_iq], mm2 ;# iq =facel*charge[ii]
20931 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
20932 mov eax, [ebp + mci3010_pos] ;# eax = base of pos[]
20933 mov [esp + mci3010_ii3], ebx
20935 movq mm0, [eax + ebx*4]
20936 movd mm1, [eax + ebx*4 + 8]
20937 pfadd mm0, [esp + mci3010_shX]
;# 64-bit operand: the high half also covers the ix slot, but only the
;# low dword of mm1 is stored by the movd below.
20938 pfadd mm1, [esp + mci3010_shZ]
20939 movq [esp + mci3010_ix], mm0
20940 movd [esp + mci3010_iz], mm1
20942 mov ecx, [esp + mci3010_innerjjnr0]
20943 mov [esp + mci3010_innerjjnr], ecx
;# innerk holds (remaining j particles - 2); the flags from the sub are
;# still valid at the jge because mov does not modify them.
20944 mov edx, [esp + mci3010_innerk0]
20945 sub edx, 2
20946 mov [esp + mci3010_innerk], edx ;# innerk = number of innerloop atoms - 2
20947 jge .mci3010_unroll_coul_loop
20948 jmp .mci3010_finish_coul_inner
20949 .mci3010_unroll_coul_loop:
20950 ;# paired innerloop starts here
20951 mov ecx, [esp + mci3010_innerjjnr] ;# pointer to jjnr[k]
20952 mov eax, [ecx]
20953 mov ebx, [ecx + 4] ;# eax/ebx=jnr
20954 add dword ptr [esp + mci3010_innerjjnr], 8 ;# advance pointer (unrolled 2)
20955 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
20957 mov ecx, [ebp + mci3010_charge] ;# base of charge[]
20958 movq mm5, [esp + mci3010_iq]
20959 movd mm3, [ecx + eax*4] ;# charge[jnr1]
20960 punpckldq mm3, [ecx + ebx*4] ;# move charge 2 to high part of mm3
20961 pfmul mm3,mm5 ;# mm3 now has qq for both particles
20963 lea eax, [eax + eax*2] ;# replace jnr with j3
20964 lea ebx, [ebx + ebx*2]
;# NOTE(review): esi already holds pos from before the loop and nothing
;# in the loop changes it, so this reload looks redundant.
20966 mov esi, [ebp + mci3010_pos]
20968 movq mm0, [esp + mci3010_ix]
20969 movd mm1, [esp + mci3010_iz]
20970 movq mm4, [esi + eax*4] ;# fetch first j coordinates
20971 movd mm5, [esi + eax*4 + 8]
20972 pfsubr mm4,mm0 ;# dr = ir - jr
20973 pfsubr mm5,mm1
20974 pfmul mm4,mm4 ;# square dx,dy,dz
20975 pfmul mm5,mm5
20976 pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
20977 pfacc mm4, mm5 ;# first rsq in lower mm4
20979 movq mm6, [esi + ebx*4] ;# fetch second j coordinates
20980 movd mm7, [esi + ebx*4 + 8]
20982 pfsubr mm6,mm0 ;# dr = ir - jr
20983 pfsubr mm7,mm1
20984 pfmul mm6,mm6 ;# square dx,dy,dz
20985 pfmul mm7,mm7
20986 pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
20987 pfacc mm6, mm7 ;# second rsq in lower mm6
20989 pfrsqrt mm0, mm4 ;# lookup inverse square root seed
20990 pfrsqrt mm1, mm6
20993 punpckldq mm0,mm1
20994 punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs.
20995 movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision.
20996 pfmul mm0,mm0
20997 pfrsqit1 mm0,mm4
20998 pfrcpit2 mm0,mm2
20999 pfmul mm4, mm0 ;# r = rsq * invsqrt
21000 movq mm1, mm4
21001 ;# mm0 is invsqrt, and mm1 r.
21002 ;# do the potential lookup (no scalar force is formed here)
21003 pfmul mm1, [esp + mci3010_tsc] ;# mm1=rt
;# NOTE(review): pf2iw produces a 16-bit integer sign-extended to 32 bits,
;# so the table index range is limited accordingly - confirm table size.
21004 pf2iw mm4,mm1
21005 movq [esp + mci3010_n1], mm4
21006 pi2fd mm4,mm4
21007 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
21009 movq mm2,mm1
21010 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
21012 mov edx, [ebp + mci3010_VFtab]
21013 mov ecx, [esp + mci3010_n1]
21014 shl ecx, 2 ;# four floats per table point: byte offset is 16*n0
21015 ;# coulomb table
21016 ;# load all the table values we need
21017 movd mm4, [edx + ecx*4]
21018 movd mm5, [edx + ecx*4 + 4]
21019 movd mm6, [edx + ecx*4 + 8]
21020 movd mm7, [edx + ecx*4 + 12]
21021 mov ecx, [esp + mci3010_n1 + 4] ;# table index of the second j particle
21022 shl ecx, 2
21023 punpckldq mm4, [edx + ecx*4]
21024 punpckldq mm5, [edx + ecx*4 + 4]
21025 punpckldq mm6, [edx + ecx*4 + 8]
21026 punpckldq mm7, [edx + ecx*4 + 12]
;# VV = mm4 + eps*(mm5 + mm6*eps + mm7*eps2), evaluated for both pairs
21028 pfmul mm6, mm1 ;# mm6 = Geps
21029 pfmul mm7, mm2 ;# mm7 = Heps2
21031 pfadd mm5, mm6
21032 pfadd mm5, mm7 ;# mm5 = Fp
21034 pfmul mm5, mm1 ;# mm5=eps*Fp
21035 pfadd mm5, mm4 ;# mm5= VV
21037 pfmul mm5, mm3 ;# vcoul=qq*VV
21039 ;# at this point mm5 contains vcoul
21040 ;# increment vcoul - then we can get rid of mm5
21041 ;# update vctot
21042 pfadd mm5, [esp + mci3010_vctot] ;# add the earlier value
21043 movq [esp + mci3010_vctot], mm5 ;# store the sum
21045 ;# should we do one more iteration?
21046 sub dword ptr [esp + mci3010_innerk], 2
21047 jl .mci3010_finish_coul_inner
21048 jmp .mci3010_unroll_coul_loop
21049 .mci3010_finish_coul_inner:
;# innerk is now -1 when one j particle is left and -2 when none is left;
;# bit 0 tells the two cases apart.
21050 and dword ptr [esp + mci3010_innerk], 1
21051 jnz .mci3010_single_coul_inner
21052 jmp .mci3010_updateouterdata_coul
21053 .mci3010_single_coul_inner:
21054 ;# a single j particle iteration here - compare with the unrolled code for comments.
21055 mov eax, [esp + mci3010_innerjjnr]
21056 mov eax, [eax] ;# eax=jnr offset
21058 mov ecx, [ebp + mci3010_charge]
21059 movd mm5, [esp + mci3010_iq]
21060 movd mm3, [ecx + eax*4]
21061 pfmul mm3, mm5 ;# mm3=qq
21063 mov esi, [ebp + mci3010_pos]
21064 lea eax, [eax + eax*2]
21066 movq mm0, [esp + mci3010_ix]
21067 movd mm1, [esp + mci3010_iz]
21068 movq mm4, [esi + eax*4]
21069 movd mm5, [esi + eax*4 + 8]
21070 pfsubr mm4, mm0
21071 pfsubr mm5, mm1
21072 pfmul mm4,mm4
21073 pfmul mm5,mm5
21074 pfacc mm4, mm5
21075 pfacc mm4, mm5 ;# mm4=rsq (low half)
21077 pfrsqrt mm0,mm4
21078 movq mm2,mm0
21079 pfmul mm0,mm0
21080 pfrsqit1 mm0,mm4
21081 pfrcpit2 mm0,mm2 ;# mm0=invsqrt
21082 pfmul mm4, mm0
21083 movq mm1, mm4
21084 ;# mm0 is invsqrt, and mm1 r.
21086 ;# calculate the potential (only the low half is meaningful here)
21087 pfmul mm1, [esp + mci3010_tsc] ;# mm1=rt
21088 pf2iw mm4,mm1
21089 movd [esp + mci3010_n1], mm4
21090 pi2fd mm4,mm4
21091 pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0
21093 movq mm2,mm1
21094 pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2
21096 ;# coulomb table
21097 mov edx, [ebp + mci3010_VFtab]
21098 mov ecx, [esp + mci3010_n1]
21099 shl ecx, 2
21100 ;# load all the table values we need
21101 movd mm4, [edx + ecx*4]
21102 movd mm5, [edx + ecx*4 + 4]
21103 movd mm6, [edx + ecx*4 + 8]
21104 movd mm7, [edx + ecx*4 + 12]
21106 pfmul mm6, mm1 ;# mm6 = Geps
21107 pfmul mm7, mm2 ;# mm7 = Heps2
21109 pfadd mm5, mm6
21110 pfadd mm5, mm7 ;# mm5 = Fp
21112 pfmul mm5, mm1 ;# mm5=eps*Fp
21113 pfadd mm5, mm4 ;# mm5= VV
21115 pfmul mm5, mm3 ;# vcoul=qq*VV
21117 ;# at this point mm5 contains vcoul
21118 ;# increment vcoul - then we can get rid of mm5
21119 ;# update vctot
21120 pfadd mm5, [esp + mci3010_vctot] ;# add the earlier value
21121 movq [esp + mci3010_vctot], mm5 ;# store the sum
21123 .mci3010_updateouterdata_coul:
21124 ;# loop back to mno while i atoms of this iteration remain
21125 dec dword ptr [esp + mci3010_nscoul]
21126 jz .mci3010_last_mno
21127 jmp .mci3010_mno_coul
21128 .mci3010_last_mno:
21129 mov edx, [ebp + mci3010_gid] ;# get group index for this i particle
21130 mov edx, [edx]
21131 add dword ptr [ebp + mci3010_gid], 4 ;# advance pointer
21133 movq mm7, [esp + mci3010_vctot]
21134 pfacc mm7,mm7 ;# get and sum the two parts of total potential
21136 mov eax, [ebp + mci3010_Vc]
21137 movd mm6, [eax + edx*4]
21138 pfadd mm6, mm7
21139 movd [eax + edx*4], mm6 ;# increment vc[gid]
21140 ;# finish if last
21141 mov ecx, [ebp + mci3010_nri]
21142 dec ecx
21143 jecxz .mci3010_end
21144 ;# not last, iterate once more!
21145 mov [ebp + mci3010_nri], ecx ;# the decremented count is written back only when looping
21146 jmp .mci3010_outer
21147 .mci3010_end:
;# epilogue: leave MMX state, release locals, restore saved registers
21148 femms
21149 add esp, 88
21150 pop edi
21151 pop esi
21152 pop edx
21153 pop ecx
21154 pop ebx
21155 pop eax
;# NOTE(review): no ret is visible after leave in this listing; as shown,
;# control would fall through into the following routine - confirm
;# against the upstream file.
21156 leave
21162 .globl mcinl3020_3dnow
21163 .globl _mcinl3020_3dnow
21164 mcinl3020_3dnow:
21165 _mcinl3020_3dnow:
21166 .equiv mci3020_nri, 8
21167 .equiv mci3020_iinr, 12
21168 .equiv mci3020_jindex, 16
21169 .equiv mci3020_jjnr, 20
21170 .equiv mci3020_shift, 24
21171 .equiv mci3020_shiftvec, 28
21172 .equiv mci3020_gid, 32
21173 .equiv mci3020_pos, 36
21174 .equiv mci3020_charge, 40
21175 .equiv mci3020_facel, 44
21176 .equiv mci3020_Vc, 48
21177 .equiv mci3020_tabscale, 52
21178 .equiv mci3020_VFtab, 56
21179 ;# stack offsets for local variables
21180 .equiv mci3020_is3, 0
21181 .equiv mci3020_ii3, 4
21182 .equiv mci3020_ixO, 8
21183 .equiv mci3020_iyO, 12
21184 .equiv mci3020_izO, 16
21185 .equiv mci3020_ixH, 20
21186 .equiv mci3020_iyH, 28
21187 .equiv mci3020_izH, 36
21188 .equiv mci3020_iqO, 44
21189 .equiv mci3020_iqH, 52
21190 .equiv mci3020_qqO, 60
21191 .equiv mci3020_qqH, 68
21192 .equiv mci3020_vctot, 76
21193 .equiv mci3020_n1, 84
21194 .equiv mci3020_tsc, 92
21195 .equiv mci3020_innerjjnr, 100
21196 .equiv mci3020_innerk, 104
21197 .equiv mci3020_tmprsqH, 108
21198 push ebp
21199 mov ebp,esp
21200 push eax
21201 push ebx
21202 push ecx
21203 push edx
21204 push esi
21205 push edi
21206 sub esp, 116 ;# local stack space
21207 femms
21209 mov ecx, [ebp + mci3020_iinr] ;# ecx = pointer into iinr[]
21210 mov ebx, [ecx] ;# ebx=ii
21212 mov edx, [ebp + mci3020_charge]
21213 movd mm1, [ebp + mci3020_facel]
21214 movd mm2, [edx + ebx*4] ;# mm2=charge[ii0]
21215 pfmul mm2, mm1
21216 movq [esp + mci3020_iqO], mm2 ;# iqO = facel*charge[ii]
21218 movd mm2, [edx + ebx*4 + 4] ;# mm2=charge[ii0+1]
21219 pfmul mm2, mm1
21220 punpckldq mm2,mm2 ;# spread to both halves
21221 movq [esp + mci3020_iqH], mm2 ;# iqH = facel*charge[ii0+1]
21223 movd mm4, [ebp + mci3020_tabscale]
21224 punpckldq mm4,mm4 ;# spread to both halves
21225 movq [esp + mci3020_tsc], mm4
21226 ;# assume we have at least one i particle - start directly
21227 .mci3020_outer:
21228 mov eax, [ebp + mci3020_shift] ;# eax = pointer into shift[]
21229 mov ebx, [eax] ;# ebx=shift[n]
21230 add dword ptr [ebp + mci3020_shift], 4 ;# advance pointer one step
21232 lea ebx, [ebx + ebx*2] ;# ebx=3*is
21233 mov [esp + mci3020_is3],ebx ;# store is3
21235 mov eax, [ebp + mci3020_shiftvec] ;# eax = base of shiftvec[]
21237 movq mm5, [eax + ebx*4] ;# move shX/shY to mm5 and shZ to mm6.
21238 movd mm6, [eax + ebx*4 + 8]
21239 movq mm0, mm5
21240 movq mm1, mm5
21241 movq mm2, mm6
21242 punpckldq mm0,mm0 ;# also expand shX,Y,Z in mm0--mm2.
21243 punpckhdq mm1,mm1
21244 punpckldq mm2,mm2
21246 mov ecx, [ebp + mci3020_iinr] ;# ecx = pointer into iinr[]
21247 add dword ptr [ebp + mci3020_iinr], 4 ;# advance pointer
21248 mov ebx, [ecx] ;# ebx=ii
21250 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
21251 mov eax, [ebp + mci3020_pos] ;# eax = base of pos[]
21253 pfadd mm5, [eax + ebx*4] ;# ix = shX + posX (and iy too)
21254 movd mm7, [eax + ebx*4 + 8] ;# cant use direct memory add for 4 bytes (iz)
21255 mov [esp + mci3020_ii3], ebx ;# (use mm7 as temp. storage for iz.)
21256 pfadd mm6, mm7
21257 movq [esp + mci3020_ixO], mm5
21258 movq [esp + mci3020_izO], mm6
21260 movd mm3, [eax + ebx*4 + 12]
21261 movd mm4, [eax + ebx*4 + 16]
21262 movd mm5, [eax + ebx*4 + 20]
21263 punpckldq mm3, [eax + ebx*4 + 24]
21264 punpckldq mm4, [eax + ebx*4 + 28]
21265 punpckldq mm5, [eax + ebx*4 + 32] ;# coords of H1 in low mm3-mm5, H2 in high
21267 pfadd mm0, mm3
21268 pfadd mm1, mm4
21269 pfadd mm2, mm5
21270 movq [esp + mci3020_ixH], mm0
21271 movq [esp + mci3020_iyH], mm1
21272 movq [esp + mci3020_izH], mm2
21274 ;# clear vctot and i forces
21275 pxor mm7,mm7
21276 movq [esp + mci3020_vctot], mm7
21278 mov eax, [ebp + mci3020_jindex]
21279 mov ecx, [eax] ;# jindex[n]
21280 mov edx, [eax + 4] ;# jindex[n+1]
21281 add dword ptr [ebp + mci3020_jindex], 4
21282 sub edx, ecx ;# number of innerloop atoms
21283 mov [esp + mci3020_innerk], edx
21285 mov esi, [ebp + mci3020_pos]
21286 mov eax, [ebp + mci3020_jjnr]
21287 shl ecx, 2
21288 add eax, ecx
21289 mov [esp + mci3020_innerjjnr], eax ;# pointer to jjnr[nj0]
21290 .mci3020_inner_loop:
21291 ;# a single j particle iteration
21292 mov eax, [esp + mci3020_innerjjnr]
21293 mov eax, [eax] ;# eax=jnr offset
21294 add dword ptr [esp + mci3020_innerjjnr], 4 ;# advance pointer
21295 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
21297 mov ecx, [ebp + mci3020_charge]
21298 movd mm7, [ecx + eax*4]
21299 punpckldq mm7,mm7
21300 movq mm6,mm7
21301 pfmul mm6, [esp + mci3020_iqO]
21302 pfmul mm7, [esp + mci3020_iqH] ;# mm6=qqO, mm7=qqH
21303 movd [esp + mci3020_qqO], mm6
21304 movq [esp + mci3020_qqH], mm7
21306 lea eax, [eax + eax*2]
21308 movq mm0, [esi + eax*4]
21309 movd mm1, [esi + eax*4 + 8]
21310 ;# copy & expand to mm2-mm4 for the H interactions
21311 movq mm2, mm0
21312 movq mm3, mm0
21313 movq mm4, mm1
21314 punpckldq mm2,mm2
21315 punpckhdq mm3,mm3
21316 punpckldq mm4,mm4
21318 pfsubr mm0, [esp + mci3020_ixO]
21319 pfsubr mm1, [esp + mci3020_izO]
21321 pfmul mm0,mm0
21322 pfmul mm1,mm1
21323 pfacc mm0, mm1
21324 pfadd mm0, mm1 ;# mm0=rsqO
21326 punpckldq mm2, mm2
21327 punpckldq mm3, mm3
21328 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
21329 pfsubr mm2, [esp + mci3020_ixH]
21330 pfsubr mm3, [esp + mci3020_iyH]
21331 pfsubr mm4, [esp + mci3020_izH] ;# mm2-mm4 is dxH-dzH
21333 pfmul mm2,mm2
21334 pfmul mm3,mm3
21335 pfmul mm4,mm4
21337 pfadd mm3,mm2
21338 pfadd mm3,mm4 ;# mm3=rsqH
21339 movq [esp + mci3020_tmprsqH], mm3
21341 pfrsqrt mm1,mm0
21343 movq mm2,mm1
21344 pfmul mm1,mm1
21345 pfrsqit1 mm1,mm0
21346 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
21348 pfmul mm0, mm1 ;# mm0=r
21350 pfmul mm0, [esp + mci3020_tsc]
21351 pf2iw mm4, mm0
21352 movd [esp + mci3020_n1], mm4
21353 pi2fd mm4,mm4
21354 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
21355 movq mm2, mm0
21356 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
21358 ;# coulomb table
21359 mov edx, [ebp + mci3020_VFtab]
21360 mov ecx, [esp + mci3020_n1]
21361 shl ecx, 2
21362 ;# load all values we need
21363 movd mm4, [edx + ecx*4]
21364 movd mm5, [edx + ecx*4 + 4]
21365 movd mm6, [edx + ecx*4 + 8]
21366 movd mm7, [edx + ecx*4 + 12]
21368 pfmul mm6, mm0 ;# mm6 = Geps
21369 pfmul mm7, mm2 ;# mm7 = Heps2
21371 pfadd mm5, mm6
21372 pfadd mm5, mm7 ;# mm5 = Fp
21374 pfmul mm5, mm0 ;# mm5=eps*Fp
21375 pfadd mm5, mm4 ;# mm5= VV
21377 pfmul mm5, [esp + mci3020_qqO] ;# vcoul=qq*VV
21378 ;# update vctot directly
21379 pfadd mm5, [esp + mci3020_vctot]
21380 movq [esp + mci3020_vctot], mm5
21382 ;# now do the two hydrogens.
21383 movq mm0, [esp + mci3020_tmprsqH] ;# mm0=rsqH
21385 pfrsqrt mm1, mm0
21386 pswapd mm0,mm0
21387 pfrsqrt mm2, mm0
21388 pswapd mm0,mm0
21389 punpckldq mm1,mm2 ;# seeds are in mm1 now, and rsq in mm0.
21391 movq mm2, mm1
21392 pfmul mm1,mm1
21393 pfrsqit1 mm1,mm0
21394 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
21396 pfmul mm0,mm1 ;# mm0=r
21397 pfmul mm0, [esp + mci3020_tsc]
21398 pf2iw mm4, mm0
21399 movq [esp + mci3020_n1], mm4
21400 pi2fd mm4,mm4
21401 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
21402 movq mm2, mm0
21403 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
21405 ;# coulomb table
21406 mov edx, [ebp + mci3020_VFtab]
21407 mov ecx, [esp + mci3020_n1]
21408 shl ecx, 2
21409 ;# load all values we need
21410 movd mm4, [edx + ecx*4]
21411 movd mm5, [edx + ecx*4 + 4]
21412 movd mm6, [edx + ecx*4 + 8]
21413 movd mm7, [edx + ecx*4 + 12]
21414 mov ecx, [esp + mci3020_n1 + 4]
21415 shl ecx, 2
21416 punpckldq mm4, [edx + ecx*4]
21417 punpckldq mm5, [edx + ecx*4 + 4]
21418 punpckldq mm6, [edx + ecx*4 + 8]
21419 punpckldq mm7, [edx + ecx*4 + 12]
21421 pfmul mm6, mm0 ;# mm6 = Geps
21422 pfmul mm7, mm2 ;# mm7 = Heps2
21424 pfadd mm5, mm6
21425 pfadd mm5, mm7 ;# mm5 = Fp
21427 pfmul mm5, mm0 ;# mm5=eps*Fp
21428 pfadd mm5, mm4 ;# mm5= VV
21430 pfmul mm5, [esp + mci3020_qqH] ;# vcoul=qq*VV
21432 ;# update vctot
21433 pfadd mm5, [esp + mci3020_vctot]
21434 movq [esp + mci3020_vctot], mm5
21436 ;# done - one more?
21437 dec dword ptr [esp + mci3020_innerk]
21438 jz .mci3020_updateouterdata
21439 jmp .mci3020_inner_loop
21440 .mci3020_updateouterdata:
21441 mov edx, [ebp + mci3020_gid] ;# get group index for this i particle
21442 mov edx, [edx]
21443 add dword ptr [ebp + mci3020_gid], 4 ;# advance pointer
21445 movq mm7, [esp + mci3020_vctot]
21446 pfacc mm7,mm7 ;# get and sum the two parts of total potential
21448 mov eax, [ebp + mci3020_Vc]
21449 movd mm6, [eax + edx*4]
21450 pfadd mm6, mm7
21451 movd [eax + edx*4], mm6 ;# increment vc[gid]
21453 ;# finish if last
21454 dec dword ptr [ebp + mci3020_nri]
21455 jz .mci3020_end
21456 ;# not last, iterate once more!
21457 jmp .mci3020_outer
21458 .mci3020_end:
21459 femms
21460 add esp, 116
21461 pop edi
21462 pop esi
21463 pop edx
21464 pop ecx
21465 pop ebx
21466 pop eax
21467 leave
;# ---------------------------------------------------------------------
;# mcinl3030_3dnow
;#
;# Energy-only inner loop with a tabulated Coulomb interaction between
;# three-site molecules (the sites are called O, H1 and H2 below).
;# For each i molecule listed in iinr[] every j molecule in its
;# jjnr[] range is visited, the nine site-site potentials are
;# interpolated from VFtab (four floats per table point) and the sum
;# is added to Vc[gid]. No forces are computed or stored.
;#
;# Arguments are read from the stack relative to ebp, at the offsets
;# listed below. The iinr, shift, jindex and gid pointers and the nri
;# counter are updated in place in the argument area.
;# eax, ebx, ecx, edx, esi and edi are saved on entry and restored on
;# exit, and femms is issued on entry and before returning.
;#
;# Assumptions made by the code:
;#  - nri is at least 1 and every i molecule has at least one j entry
;#    (both counters are decremented before they are tested).
;#  - the charge products are taken from the first i molecule only and
;#    reused for all molecules.
;# ---------------------------------------------------------------------
.globl mcinl3030_3dnow
.globl _mcinl3030_3dnow
mcinl3030_3dnow:
_mcinl3030_3dnow:
;# argument offsets, relative to ebp
.equiv mci3030_nri, 8
.equiv mci3030_iinr, 12
.equiv mci3030_jindex, 16
.equiv mci3030_jjnr, 20
.equiv mci3030_shift, 24
.equiv mci3030_shiftvec, 28
.equiv mci3030_gid, 32
.equiv mci3030_pos, 36
.equiv mci3030_charge, 40
.equiv mci3030_facel, 44
.equiv mci3030_Vc, 48
.equiv mci3030_tabscale, 52
.equiv mci3030_VFtab, 56
	;# stack offsets for local variables
.equiv mci3030_is3, 0
.equiv mci3030_ii3, 4
.equiv mci3030_ixO, 8
.equiv mci3030_iyO, 12
.equiv mci3030_izO, 16
.equiv mci3030_ixH, 20
.equiv mci3030_iyH, 28
.equiv mci3030_izH, 36
.equiv mci3030_qqOO, 44
.equiv mci3030_qqOH, 52
.equiv mci3030_qqHH, 60
.equiv mci3030_n1, 68
.equiv mci3030_tsc, 76
.equiv mci3030_vctot, 84
.equiv mci3030_innerjjnr, 92
.equiv mci3030_innerk, 96
.equiv mci3030_tmprsqH, 100
	push ebp
	mov ebp,esp
	push eax
	push ebx
	push ecx
	push edx
	push esi
	push edi
	sub esp, 108 ;# local stack space (last local, tmprsqH, ends at 100+8)
	femms
	;# assume we have at least one i particle - start directly

	;# charge products, computed once from the first i molecule
	mov ecx, [ebp + mci3030_iinr] ;# ecx = pointer into iinr[]
	mov ebx, [ecx] ;# ebx=ii

	mov edx, [ebp + mci3030_charge]
	movd mm1, [ebp + mci3030_facel] ;# mm1=facel
	movd mm2, [edx + ebx*4] ;# mm2=charge[ii0] (O)
	movd mm3, [edx + ebx*4 + 4] ;# mm3=charge[ii0+1] (H)
	movq mm4, mm2
	pfmul mm4, mm1 ;# mm4=facel*qO
	movq mm6, mm3
	pfmul mm6, mm1 ;# mm6=facel*qH
	movq mm5, mm4
	pfmul mm4, mm2 ;# mm4=qqOO*facel
	pfmul mm5, mm3 ;# mm5=qqOH*facel
	pfmul mm6, mm3 ;# mm6=qqHH*facel
	punpckldq mm5,mm5 ;# spread to both halves
	punpckldq mm6,mm6 ;# spread to both halves
	movq [esp + mci3030_qqOO], mm4
	movq [esp + mci3030_qqOH], mm5
	movq [esp + mci3030_qqHH], mm6
	movd mm3, [ebp + mci3030_tabscale]
	punpckldq mm3,mm3
	movq [esp + mci3030_tsc], mm3 ;# tabscale in both halves
.mci3030_outer:
	mov eax, [ebp + mci3030_shift] ;# eax = pointer into shift[]
	mov ebx, [eax] ;# ebx=shift[n]
	add dword ptr [ebp + mci3030_shift], 4 ;# advance pointer one step

	lea ebx, [ebx + ebx*2] ;# ebx=3*is
	mov [esp + mci3030_is3],ebx ;# store is3

	mov eax, [ebp + mci3030_shiftvec] ;# eax = base of shiftvec[]

	movq mm5, [eax + ebx*4] ;# move shX/shY to mm5 and shZ to mm6.
	movd mm6, [eax + ebx*4 + 8]
	movq mm0, mm5
	movq mm1, mm5
	movq mm2, mm6
	punpckldq mm0,mm0 ;# also expand shX,Y,Z in mm0--mm2.
	punpckhdq mm1,mm1
	punpckldq mm2,mm2

	mov ecx, [ebp + mci3030_iinr] ;# ecx = pointer into iinr[]
	add dword ptr [ebp + mci3030_iinr], 4 ;# advance pointer
	mov ebx, [ecx] ;# ebx=ii

	lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
	mov eax, [ebp + mci3030_pos] ;# eax = base of pos[]

	pfadd mm5, [eax + ebx*4] ;# ix = shX + posX (and iy too)
	movd mm7, [eax + ebx*4 + 8] ;# cant use direct memory add for 4 bytes (iz)
	mov [esp + mci3030_ii3], ebx ;# (use mm7 as temp. storage for iz.)
	pfadd mm6, mm7
	movq [esp + mci3030_ixO], mm5
	movq [esp + mci3030_izO], mm6 ;# 8-byte store - upper 4 bytes are overwritten by ixH below

	movd mm3, [eax + ebx*4 + 12]
	movd mm4, [eax + ebx*4 + 16]
	movd mm5, [eax + ebx*4 + 20]
	punpckldq mm3, [eax + ebx*4 + 24]
	punpckldq mm4, [eax + ebx*4 + 28]
	punpckldq mm5, [eax + ebx*4 + 32] ;# coords of H1 in low mm3-mm5, H2 in high

	pfadd mm0, mm3
	pfadd mm1, mm4
	pfadd mm2, mm5
	movq [esp + mci3030_ixH], mm0
	movq [esp + mci3030_iyH], mm1
	movq [esp + mci3030_izH], mm2

	;# clear vctot (this loop accumulates no forces)
	pxor mm7,mm7
	movq [esp + mci3030_vctot], mm7

	mov eax, [ebp + mci3030_jindex]
	mov ecx, [eax] ;# jindex[n]
	mov edx, [eax + 4] ;# jindex[n+1]
	add dword ptr [ebp + mci3030_jindex], 4
	sub edx, ecx ;# number of innerloop atoms
	mov [esp + mci3030_innerk], edx

	mov esi, [ebp + mci3030_pos]
	mov eax, [ebp + mci3030_jjnr]
	shl ecx, 2
	add eax, ecx
	mov [esp + mci3030_innerjjnr], eax ;# pointer to jjnr[nj0]
.mci3030_inner_loop:
	;# One j molecule per iteration. Each j site (O, H1, H2) interacts
	;# with i O using the low half of the registers, and with i H1/H2
	;# as a packed pair.
	mov eax, [esp + mci3030_innerjjnr]
	mov eax, [eax] ;# eax=jnr offset
	add dword ptr [esp + mci3030_innerjjnr], 4 ;# advance pointer

	lea eax, [eax + eax*2] ;# eax=3*jnr

	;# interactions with j O
	movq mm0, [esi + eax*4]
	movd mm1, [esi + eax*4 + 8]
	;# copy & expand to mm2-mm4 for the H interactions
	movq mm2, mm0
	movq mm3, mm0
	movq mm4, mm1
	punpckldq mm2,mm2
	punpckhdq mm3,mm3
	punpckldq mm4,mm4

	pfsubr mm0, [esp + mci3030_ixO] ;# dx,dy = i - j
	pfsubr mm1, [esp + mci3030_izO] ;# dz in the low half

	pfmul mm0,mm0
	pfmul mm1,mm1
	pfacc mm0, mm0
	pfadd mm0, mm1 ;# mm0=rsqO (low half)

	punpckldq mm2, mm2
	punpckldq mm3, mm3
	punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
	pfsubr mm2, [esp + mci3030_ixH]
	pfsubr mm3, [esp + mci3030_iyH]
	pfsubr mm4, [esp + mci3030_izH] ;# mm2-mm4 is dxH-dzH

	pfmul mm2,mm2
	pfmul mm3,mm3
	pfmul mm4,mm4

	pfadd mm3,mm2
	pfadd mm3,mm4 ;# mm3=rsqH
	movq [esp + mci3030_tmprsqH], mm3 ;# saved until the O part is done

	pfrsqrt mm1,mm0 ;# inverse square root seed

	movq mm2,mm1
	pfmul mm1,mm1
	pfrsqit1 mm1,mm0
	pfrcpit2 mm1,mm2 ;# mm1=invsqrt
	pfmul mm0, mm1 ;# mm0=r (rsq*invsqrt)

	pfmul mm0, [esp + mci3030_tsc] ;# mm0=r*tabscale
	pf2iw mm4, mm0
	movd [esp + mci3030_n1], mm4 ;# integer table index
	pi2fd mm4,mm4
	pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
	movq mm2, mm0
	pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2

	;# coulomb table
	mov edx, [ebp + mci3030_VFtab]
	mov ecx, [esp + mci3030_n1]
	shl ecx, 2 ;# four floats per table point

	;# load all values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm0 ;# mm6 = Geps
	pfmul mm7, mm2 ;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7 ;# mm5 = Fp

	pfmul mm5, mm0 ;# mm5=eps*Fp
	pfadd mm5, mm4 ;# mm5= VV

	pfmul mm5, [esp + mci3030_qqOO] ;# vcoul=qq*VV
	;# update vctot directly
	pfadd mm5, [esp + mci3030_vctot]
	movq [esp + mci3030_vctot], mm5

	;# time for hydrogens!

	movq mm0, [esp + mci3030_tmprsqH]

	pfrsqrt mm1, mm0
	pswapd mm0,mm0
	pfrsqrt mm2, mm0
	pswapd mm0,mm0
	punpckldq mm1,mm2 ;# seeds are in mm1 now, and rsq in mm0.

	movq mm2, mm1
	pfmul mm1,mm1
	pfrsqit1 mm1,mm0
	pfrcpit2 mm1,mm2 ;# mm1=invsqrt

	pfmul mm0,mm1 ;# mm0=r
	pfmul mm0, [esp + mci3030_tsc]
	pf2iw mm4, mm0
	movq [esp + mci3030_n1], mm4 ;# two table indices
	pi2fd mm4,mm4
	pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
	movq mm2, mm0
	pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2

	;# coulomb table
	mov edx, [ebp + mci3030_VFtab]
	mov ecx, [esp + mci3030_n1]
	shl ecx, 2
	;# load all values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]
	mov ecx, [esp + mci3030_n1 + 4]
	shl ecx, 2
	punpckldq mm4, [edx + ecx*4]
	punpckldq mm5, [edx + ecx*4 + 4]
	punpckldq mm6, [edx + ecx*4 + 8]
	punpckldq mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm0 ;# mm6 = Geps
	pfmul mm7, mm2 ;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7 ;# mm5 = Fp

	pfmul mm5, mm0 ;# mm5=eps*Fp
	pfadd mm5, mm4 ;# mm5= VV

	pfmul mm5, [esp + mci3030_qqOH] ;# vcoul=qq*VV
	;# update vctot
	pfadd mm5, [esp + mci3030_vctot]
	movq [esp + mci3030_vctot], mm5

	;# interactions with j H1

	movq mm0, [esi + eax*4 + 12]
	movd mm1, [esi + eax*4 + 20]
	;# copy & expand to mm2-mm4 for the H interactions
	movq mm2, mm0
	movq mm3, mm0
	movq mm4, mm1
	punpckldq mm2,mm2
	punpckhdq mm3,mm3
	punpckldq mm4,mm4

	pfsubr mm0, [esp + mci3030_ixO]
	pfsubr mm1, [esp + mci3030_izO]

	pfmul mm0,mm0
	pfmul mm1,mm1
	pfacc mm0, mm1
	pfadd mm0, mm1 ;# mm0=rsqO (i O to j H1, low half)

	punpckldq mm2, mm2
	punpckldq mm3, mm3
	punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
	pfsubr mm2, [esp + mci3030_ixH]
	pfsubr mm3, [esp + mci3030_iyH]
	pfsubr mm4, [esp + mci3030_izH] ;# mm2-mm4 is dxH-dzH

	pfmul mm2,mm2
	pfmul mm3,mm3
	pfmul mm4,mm4

	pfadd mm3,mm2
	pfadd mm3,mm4 ;# mm3=rsqH
	movq [esp + mci3030_tmprsqH], mm3

	pfrsqrt mm1,mm0

	movq mm2,mm1
	pfmul mm1,mm1
	pfrsqit1 mm1,mm0
	pfrcpit2 mm1,mm2 ;# mm1=invsqrt
	pfmul mm0, mm1 ;# mm0=r (rsq*invsqrt)

	pfmul mm0, [esp + mci3030_tsc]
	pf2iw mm4, mm0
	movd [esp + mci3030_n1], mm4
	pi2fd mm4,mm4
	pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
	movq mm2, mm0
	pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2

	;# coulomb table
	mov edx, [ebp + mci3030_VFtab]
	mov ecx, [esp + mci3030_n1]
	shl ecx, 2

	;# load all values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm0 ;# mm6 = Geps
	pfmul mm7, mm2 ;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7 ;# mm5 = Fp

	pfmul mm5, mm0 ;# mm5=eps*Fp
	pfadd mm5, mm4 ;# mm5= VV

	pfmul mm5, [esp + mci3030_qqOH] ;# vcoul=qq*VV

	;# update vctot directly
	pfadd mm5, [esp + mci3030_vctot]
	movq [esp + mci3030_vctot], mm5

	movq mm0, [esp + mci3030_tmprsqH]

	pfrsqrt mm1, mm0
	pswapd mm0,mm0
	pfrsqrt mm2, mm0
	pswapd mm0,mm0
	punpckldq mm1,mm2 ;# seeds are in mm1 now, and rsq in mm0.

	movq mm2, mm1
	pfmul mm1,mm1
	pfrsqit1 mm1,mm0
	pfrcpit2 mm1,mm2 ;# mm1=invsqrt

	pfmul mm0,mm1 ;# mm0=r
	pfmul mm0, [esp + mci3030_tsc]
	pf2iw mm4, mm0
	movq [esp + mci3030_n1], mm4
	pi2fd mm4,mm4
	pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
	movq mm2, mm0
	pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2

	;# coulomb table
	mov edx, [ebp + mci3030_VFtab]
	mov ecx, [esp + mci3030_n1]
	shl ecx, 2
	;# load all values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]
	mov ecx, [esp + mci3030_n1 + 4]
	shl ecx, 2
	punpckldq mm4, [edx + ecx*4]
	punpckldq mm5, [edx + ecx*4 + 4]
	punpckldq mm6, [edx + ecx*4 + 8]
	punpckldq mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm0 ;# mm6 = Geps
	pfmul mm7, mm2 ;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7 ;# mm5 = Fp

	pfmul mm5, mm0 ;# mm5=eps*Fp
	pfadd mm5, mm4 ;# mm5= VV

	pfmul mm5, [esp + mci3030_qqHH] ;# vcoul=qq*VV
	;# update vctot
	pfadd mm5, [esp + mci3030_vctot]
	movq [esp + mci3030_vctot], mm5

	;# interactions with j H2
	movq mm0, [esi + eax*4 + 24]
	movd mm1, [esi + eax*4 + 32]
	;# copy & expand to mm2-mm4 for the H interactions
	movq mm2, mm0
	movq mm3, mm0
	movq mm4, mm1
	punpckldq mm2,mm2
	punpckhdq mm3,mm3
	punpckldq mm4,mm4

	pfsubr mm0, [esp + mci3030_ixO]
	pfsubr mm1, [esp + mci3030_izO]

	pfmul mm0,mm0
	pfmul mm1,mm1
	pfacc mm0, mm1
	pfadd mm0, mm1 ;# mm0=rsqO (i O to j H2, low half)

	punpckldq mm2, mm2
	punpckldq mm3, mm3
	punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
	pfsubr mm2, [esp + mci3030_ixH]
	pfsubr mm3, [esp + mci3030_iyH]
	pfsubr mm4, [esp + mci3030_izH] ;# mm2-mm4 is dxH-dzH

	pfmul mm2,mm2
	pfmul mm3,mm3
	pfmul mm4,mm4

	pfadd mm3,mm2
	pfadd mm3,mm4 ;# mm3=rsqH
	movq [esp + mci3030_tmprsqH], mm3

	pfrsqrt mm1,mm0

	movq mm2,mm1
	pfmul mm1,mm1
	pfrsqit1 mm1,mm0
	pfrcpit2 mm1,mm2 ;# mm1=invsqrt
	pfmul mm0, mm1 ;# mm0=r (rsq*invsqrt)

	pfmul mm0, [esp + mci3030_tsc]
	pf2iw mm4, mm0
	movd [esp + mci3030_n1], mm4
	pi2fd mm4,mm4
	pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
	movq mm2, mm0
	pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2

	;# coulomb table
	mov edx, [ebp + mci3030_VFtab]
	mov ecx, [esp + mci3030_n1]
	shl ecx, 2

	;# load all values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm0 ;# mm6 = Geps
	pfmul mm7, mm2 ;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7 ;# mm5 = Fp

	pfmul mm5, mm0 ;# mm5=eps*Fp
	pfadd mm5, mm4 ;# mm5= VV

	pfmul mm5, [esp + mci3030_qqOH] ;# vcoul=qq*VV

	;# update vctot directly
	pfadd mm5, [esp + mci3030_vctot]
	movq [esp + mci3030_vctot], mm5

	movq mm0, [esp + mci3030_tmprsqH]

	pfrsqrt mm1, mm0
	pswapd mm0,mm0
	pfrsqrt mm2, mm0
	pswapd mm0,mm0
	punpckldq mm1,mm2 ;# seeds are in mm1 now, and rsq in mm0.

	movq mm2, mm1
	pfmul mm1,mm1
	pfrsqit1 mm1,mm0
	pfrcpit2 mm1,mm2 ;# mm1=invsqrt

	pfmul mm0,mm1 ;# mm0=r
	pfmul mm0, [esp + mci3030_tsc]
	pf2iw mm4, mm0
	movq [esp + mci3030_n1], mm4
	pi2fd mm4,mm4
	pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
	movq mm2, mm0
	pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2

	;# coulomb table
	mov edx, [ebp + mci3030_VFtab]
	mov ecx, [esp + mci3030_n1]
	shl ecx, 2
	;# load all values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]
	mov ecx, [esp + mci3030_n1 + 4]
	shl ecx, 2
	punpckldq mm4, [edx + ecx*4]
	punpckldq mm5, [edx + ecx*4 + 4]
	punpckldq mm6, [edx + ecx*4 + 8]
	punpckldq mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm0 ;# mm6 = Geps
	pfmul mm7, mm2 ;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7 ;# mm5 = Fp

	pfmul mm5, mm0 ;# mm5=eps*Fp
	pfadd mm5, mm4 ;# mm5= VV

	pfmul mm5, [esp + mci3030_qqHH] ;# vcoul=qq*VV
	;# update vctot
	pfadd mm5, [esp + mci3030_vctot]
	movq [esp + mci3030_vctot], mm5

	;# done - one more?
	dec dword ptr [esp + mci3030_innerk]
	jz .mci3030_updateouterdata
	jmp .mci3030_inner_loop
.mci3030_updateouterdata:
	mov edx, [ebp + mci3030_gid] ;# get group index for this i particle
	mov edx, [edx]
	add dword ptr [ebp + mci3030_gid], 4 ;# advance pointer

	movq mm7, [esp + mci3030_vctot]
	pfacc mm7,mm7 ;# get and sum the two parts of total potential

	mov eax, [ebp + mci3030_Vc]
	movd mm6, [eax + edx*4]
	pfadd mm6, mm7
	movd [eax + edx*4], mm6 ;# increment vc[gid]

	;# finish if last
	dec dword ptr [ebp + mci3030_nri]
	jz .mci3030_end
	;# not last, iterate once more!
	jmp .mci3030_outer
.mci3030_end:
	femms
	add esp, 108 ;# release the local stack space
	pop edi
	pop esi
	pop edx
	pop ecx
	pop ebx
	pop eax
	leave
	ret ;# return to the caller instead of falling through into the next routine
;# ---------------------------------------------------------------------
;# mcinl3100_3dnow
;#
;# Energy-only inner loop with a tabulated Coulomb interaction plus a
;# c12/r^12 - c6/r^6 term whose c6/c12 pair is read from nbfp[].
;# For each i particle in iinr[] the j particles in its jjnr[] range
;# are processed two at a time (one per register half), with a final
;# single-particle pass when the count is odd. The summed potentials
;# are added to Vc[gid] and Vnb[gid]. No forces are computed or stored.
;#
;# Arguments are read from the stack relative to ebp, at the offsets
;# listed below. The iinr, shift, jindex and gid pointers and the nri
;# counter are updated in place in the argument area.
;# eax, ebx, ecx, edx, esi and edi are saved on entry and restored on
;# exit, and femms is issued on entry and before returning.
;#
;# The code assumes nri is at least 1 (it is decremented before it is
;# tested). An i particle with an empty j range is handled.
;# ---------------------------------------------------------------------
.globl mcinl3100_3dnow
.globl _mcinl3100_3dnow
mcinl3100_3dnow:
_mcinl3100_3dnow:
;# argument offsets, relative to ebp
.equiv mci3100_nri, 8
.equiv mci3100_iinr, 12
.equiv mci3100_jindex, 16
.equiv mci3100_jjnr, 20
.equiv mci3100_shift, 24
.equiv mci3100_shiftvec, 28
.equiv mci3100_gid, 32
.equiv mci3100_pos, 36
.equiv mci3100_charge, 40
.equiv mci3100_facel, 44
.equiv mci3100_Vc, 48
.equiv mci3100_type, 52
.equiv mci3100_ntype, 56
.equiv mci3100_nbfp, 60
.equiv mci3100_Vnb, 64
.equiv mci3100_tabscale, 68
.equiv mci3100_VFtab, 72
	;# stack offsets for local variables
.equiv mci3100_is3, 0
.equiv mci3100_ii3, 4
.equiv mci3100_ix, 8
.equiv mci3100_iy, 12
.equiv mci3100_iz, 16
.equiv mci3100_iq, 20
.equiv mci3100_vctot, 28
.equiv mci3100_vnbtot, 36
.equiv mci3100_c6, 44
.equiv mci3100_c12, 52
.equiv mci3100_n1, 60
.equiv mci3100_tsc, 68
.equiv mci3100_ntia, 76
.equiv mci3100_innerjjnr, 80
.equiv mci3100_innerk, 84
	push ebp
	mov ebp,esp
	push eax
	push ebx
	push ecx
	push edx
	push esi
	push edi
	sub esp, 88 ;# local stack space (last local, innerk, ends at 84+4)
	femms
	;# move data to local stack
	movd mm3, [ebp + mci3100_tabscale]
	punpckldq mm3,mm3
	movq [esp + mci3100_tsc], mm3 ;# tabscale in both halves
	;# assume we have at least one i particle - start directly
.mci3100_outer:
	mov eax, [ebp + mci3100_shift] ;# eax = pointer into shift[]
	mov ebx, [eax] ;# ebx=shift[n]
	add dword ptr [ebp + mci3100_shift], 4 ;# advance pointer one step

	lea ebx, [ebx + ebx*2] ;# ebx=3*is
	mov [esp + mci3100_is3],ebx ;# store is3

	mov eax, [ebp + mci3100_shiftvec] ;# eax = base of shiftvec[]

	movq mm0, [eax + ebx*4] ;# move shX/shY to mm0 and shZ to mm1
	movd mm1, [eax + ebx*4 + 8]

	mov ecx, [ebp + mci3100_iinr] ;# ecx = pointer into iinr[]
	add dword ptr [ebp + mci3100_iinr], 4 ;# advance pointer
	mov ebx, [ecx] ;# ebx=ii

	mov edx, [ebp + mci3100_charge]
	movd mm2, [edx + ebx*4] ;# mm2=charge[ii]
	pfmul mm2, [ebp + mci3100_facel]
	punpckldq mm2,mm2 ;# spread to both halves
	movq [esp + mci3100_iq], mm2 ;# iq =facel*charge[ii]

	mov edx, [ebp + mci3100_type]
	mov edx, [edx + ebx*4]
	imul edx, [ebp + mci3100_ntype]
	shl edx, 1
	mov [esp + mci3100_ntia], edx ;# ntia = 2*ntype*type[ii]

	lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
	mov eax, [ebp + mci3100_pos] ;# eax = base of pos[]

	pfadd mm0, [eax + ebx*4] ;# ix = shX + posX (and iy too)
	movd mm3, [eax + ebx*4 + 8] ;# cant use direct memory add for 4 bytes (iz)
	mov [esp + mci3100_ii3], ebx
	pfadd mm1, mm3
	movq [esp + mci3100_ix], mm0
	movd [esp + mci3100_iz], mm1

	;# clear total potentials (this loop accumulates no forces)
	pxor mm7,mm7
	movq [esp + mci3100_vctot], mm7
	movq [esp + mci3100_vnbtot], mm7

	mov eax, [ebp + mci3100_jindex]
	mov ecx, [eax] ;# jindex[n]
	mov edx, [eax + 4] ;# jindex[n+1]
	add dword ptr [ebp + mci3100_jindex], 4
	sub edx, ecx ;# number of innerloop atoms

	mov esi, [ebp + mci3100_pos]
	mov eax, [ebp + mci3100_jjnr]
	shl ecx, 2
	add eax, ecx
	mov [esp + mci3100_innerjjnr], eax ;# pointer to jjnr[nj0]
	sub edx, 2
	mov [esp + mci3100_innerk], edx ;# atoms left after the first pair (mov keeps the flags from sub)
	jge .mci3100_unroll_loop
	jmp .mci3100_finish_inner
.mci3100_unroll_loop:
	;# paired innerloop starts here
	mov ecx, [esp + mci3100_innerjjnr] ;# pointer to jjnr[k]
	mov eax, [ecx]
	mov ebx, [ecx + 4] ;# eax/ebx=jnr
	add dword ptr [esp + mci3100_innerjjnr], 8 ;# advance pointer (unrolled 2)
	prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best

	mov ecx, [ebp + mci3100_charge] ;# base of charge[]
	movq mm5, [esp + mci3100_iq]
	movd mm3, [ecx + eax*4] ;# charge[jnr1]
	punpckldq mm3, [ecx + ebx*4] ;# move charge 2 to high part of mm3
	pfmul mm3,mm5 ;# mm3 now has qq for both particles

	mov ecx, [ebp + mci3100_type]
	mov edx, [ecx + eax*4] ;# type [jnr1]
	mov ecx, [ecx + ebx*4] ;# type [jnr2]

	mov esi, [ebp + mci3100_nbfp] ;# base of nbfp
	shl edx, 1
	shl ecx, 1
	add edx, [esp + mci3100_ntia] ;# tja = ntia + 2*type
	add ecx, [esp + mci3100_ntia]

	movq mm5, [esi + edx*4] ;# mm5 = 1st c6 / c12
	movq mm7, [esi + ecx*4] ;# mm7 = 2nd c6 / c12
	movq mm6,mm5
	punpckldq mm5,mm7 ;# mm5 = 1st c6 / 2nd c6
	punpckhdq mm6,mm7 ;# mm6 = 1st c12 / 2nd c12
	movq [esp + mci3100_c6], mm5
	movq [esp + mci3100_c12], mm6

	lea eax, [eax + eax*2] ;# replace jnr with j3
	lea ebx, [ebx + ebx*2]

	mov esi, [ebp + mci3100_pos]

	movq mm0, [esp + mci3100_ix]
	movd mm1, [esp + mci3100_iz]
	movq mm4, [esi + eax*4] ;# fetch first j coordinates
	movd mm5, [esi + eax*4 + 8]
	pfsubr mm4,mm0 ;# dr = ir - jr
	pfsubr mm5,mm1
	pfmul mm4,mm4 ;# square dx,dy,dz
	pfmul mm5,mm5
	pfacc mm4, mm5 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
	pfacc mm4, mm5 ;# first rsq in lower mm4

	movq mm6, [esi + ebx*4] ;# fetch second j coordinates
	movd mm7, [esi + ebx*4 + 8]

	pfsubr mm6,mm0 ;# dr = ir - jr
	pfsubr mm7,mm1
	pfmul mm6,mm6 ;# square dx,dy,dz
	pfmul mm7,mm7
	pfacc mm6, mm7 ;# accumulate to get dx*dx+ dy*dy+ dz*dz
	pfacc mm6, mm7 ;# second rsq in lower mm6

	pfrsqrt mm0, mm4 ;# lookup inverse square root seed
	pfrsqrt mm1, mm6

	punpckldq mm0,mm1
	punpckldq mm4,mm6 ;# now 4 has rsq and 0 the seed for both pairs.
	movq mm2,mm0 ;# amd 3dnow N-R iteration to get full precision.
	pfmul mm0,mm0
	pfrsqit1 mm0,mm4
	pfrcpit2 mm0,mm2
	pfmul mm4, mm0
	movq mm1, mm4
	;# mm0 is invsqrt, and mm1 r.
	;# do the potentials
	pfmul mm1, [esp + mci3100_tsc] ;# mm1=rt
	pf2iw mm4,mm1
	movq [esp + mci3100_n1], mm4 ;# two table indices
	pi2fd mm4,mm4
	pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0

	movq mm2,mm1
	pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2

	mov edx, [ebp + mci3100_VFtab]
	mov ecx, [esp + mci3100_n1]
	shl ecx, 2 ;# four floats per table point
	;# coulomb table
	;# load all the table values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]
	mov ecx, [esp + mci3100_n1 + 4]
	shl ecx, 2
	punpckldq mm4, [edx + ecx*4]
	punpckldq mm5, [edx + ecx*4 + 4]
	punpckldq mm6, [edx + ecx*4 + 8]
	punpckldq mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm1 ;# mm6 = Geps
	pfmul mm7, mm2 ;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7 ;# mm5 = Fp

	pfmul mm5, mm1 ;# mm5=eps*Fp
	pfadd mm5, mm4 ;# mm5= VV

	pfmul mm5, mm3 ;# vcoul=qq*VV

	movq mm1, mm0
	pfmul mm1,mm1 ;# mm1=invsq
	movq mm2, mm1
	pfmul mm2,mm1
	pfmul mm2,mm1 ;# mm2=rinvsix
	movq mm1,mm2
	pfmul mm1,mm1 ;# mm1=rinvtwelve

	pfmul mm1, [esp + mci3100_c12]

	pfmul mm2, [esp + mci3100_c6]

	movq mm4, mm1
	pfsub mm4, mm2 ;# mm4 = vnb12-vnb6
	;# update vctot
	pfadd mm5, [esp + mci3100_vctot] ;# add the earlier value
	movq [esp + mci3100_vctot], mm5 ;# store the sum
	;# update vnbtot
	pfadd mm4, [esp + mci3100_vnbtot] ;# add the earlier value
	movq [esp + mci3100_vnbtot], mm4 ;# store the sum

	;# should we do one more iteration?
	sub dword ptr [esp + mci3100_innerk], 2
	jl .mci3100_finish_inner
	jmp .mci3100_unroll_loop
.mci3100_finish_inner:
	;# innerk is -1 here when one j particle is left and -2 when none is
	and dword ptr [esp + mci3100_innerk], 1
	jnz .mci3100_single_inner
	jmp .mci3100_updateouterdata
.mci3100_single_inner:
	;# a single j particle iteration here - compare with the unrolled code for comments.
	mov eax, [esp + mci3100_innerjjnr]
	mov eax, [eax] ;# eax=jnr offset

	mov ecx, [ebp + mci3100_charge]
	movd mm5, [esp + mci3100_iq]
	movd mm3, [ecx + eax*4]
	pfmul mm3, mm5 ;# mm3=qq

	mov esi, [ebp + mci3100_nbfp]
	mov ecx, [ebp + mci3100_type]
	mov edx, [ecx + eax*4] ;# type [jnr1]
	shl edx, 1
	add edx, [esp + mci3100_ntia] ;# tja = ntia + 2*type
	movd mm5, [esi + edx*4] ;# mm5 = 1st c6
	movq [esp + mci3100_c6], mm5
	movd mm5, [esi + edx*4 + 4] ;# mm5 = 1st c12
	movq [esp + mci3100_c12], mm5

	mov esi, [ebp + mci3100_pos]
	lea eax, [eax + eax*2]

	movq mm0, [esp + mci3100_ix]
	movd mm1, [esp + mci3100_iz]
	movq mm4, [esi + eax*4]
	movd mm5, [esi + eax*4 + 8]
	pfsubr mm4, mm0
	pfsubr mm5, mm1
	pfmul mm4,mm4
	pfmul mm5,mm5
	pfacc mm4, mm5
	pfacc mm4, mm5 ;# mm4=rsq

	pfrsqrt mm0,mm4
	movq mm2,mm0
	pfmul mm0,mm0
	pfrsqit1 mm0,mm4
	pfrcpit2 mm0,mm2 ;# mm0=invsqrt
	pfmul mm4, mm0
	movq mm1, mm4
	;# mm0 is invsqrt, and mm1 r.
	;# calculate the potentials
	pfmul mm1, [esp + mci3100_tsc] ;# mm1=rt
	pf2iw mm4,mm1
	movd [esp + mci3100_n1], mm4
	pi2fd mm4,mm4
	pfsub mm1, mm4 ;# now mm1 is eps and mm4 is n0

	movq mm2,mm1
	pfmul mm2,mm2 ;# mm1 is eps, mm2 is eps2

	;# coulomb table
	mov edx, [ebp + mci3100_VFtab]
	mov ecx, [esp + mci3100_n1]
	shl ecx, 2
	;# load all the table values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm1 ;# mm6 = Geps
	pfmul mm7, mm2 ;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7 ;# mm5 = Fp

	pfmul mm5, mm1 ;# mm5=eps*Fp
	pfadd mm5, mm4 ;# mm5= VV

	pfmul mm5, mm3 ;# vcoul=qq*VV
	;# at this point mm5 contains vcoul

	movq mm1, mm0
	pfmul mm1,mm1 ;# mm1=invsq
	movq mm2, mm1
	pfmul mm2,mm1
	pfmul mm2,mm1 ;# mm2=rinvsix
	movq mm1,mm2
	pfmul mm1,mm1 ;# mm1=rinvtwelve

	pfmul mm1, [esp + mci3100_c12]

	pfmul mm2, [esp + mci3100_c6]

	movq mm4, mm1
	pfsub mm4, mm2 ;# mm4 = vnb12-vnb6
	;# update vctot
	pfadd mm5, [esp + mci3100_vctot] ;# add the earlier value
	movq [esp + mci3100_vctot], mm5 ;# store the sum
	;# update vnbtot
	pfadd mm4, [esp + mci3100_vnbtot] ;# add the earlier value
	movq [esp + mci3100_vnbtot], mm4 ;# store the sum

.mci3100_updateouterdata:
	mov edx, [ebp + mci3100_gid] ;# get group index for this i particle
	mov edx, [edx]
	add dword ptr [ebp + mci3100_gid], 4 ;# advance pointer

	movq mm7, [esp + mci3100_vctot]
	pfacc mm7,mm7 ;# get and sum the two parts of total potential

	mov eax, [ebp + mci3100_Vc]
	movd mm6, [eax + edx*4]
	pfadd mm6, mm7
	movd [eax + edx*4], mm6 ;# increment vc[gid]

	movq mm7, [esp + mci3100_vnbtot]
	pfacc mm7,mm7 ;# get and sum the two parts of total potential

	mov eax, [ebp + mci3100_Vnb]
	movd mm6, [eax + edx*4]
	pfadd mm6, mm7
	movd [eax + edx*4], mm6 ;# increment vnb[gid]

	;# finish if last
	mov ecx, [ebp + mci3100_nri]
	dec ecx
	jecxz .mci3100_end
	;# not last, iterate once more!
	mov [ebp + mci3100_nri], ecx
	jmp .mci3100_outer
.mci3100_end:
	femms
	add esp, 88 ;# release the local stack space
	pop edi
	pop esi
	pop edx
	pop ecx
	pop ebx
	pop eax
	leave
	ret ;# return to the caller instead of falling through into the next routine
;# ---------------------------------------------------------------------
;# mcinl3110_3dnow
;#
;# Energy-only inner loop (no forces are written): tabulated Coulomb from
;# VFtab plus plain (non-tabulated) Lennard-Jones c12/r^12 - c6/r^6.
;# For every outer iteration the i atoms are processed in three groups
;# whose sizes are derived from three consecutive ints in nsatoms[]:
;#   nsvdwc = nsatoms[1]               LJ + Coulomb
;#   nscoul = nsatoms[2] - nsatoms[1]  Coulomb only
;#   nsvdw  = nsatoms[0] - nsatoms[2]  LJ only
;# Each i atom is run against the same neighbour range
;# jjnr[jindex[n]] .. jjnr[jindex[n+1]-1], two j atoms per iteration
;# with a single-atom remainder step.
;#
;# Arguments are read from the stack relative to ebp (offsets below).
;# nri, iinr, jindex, shift, gid and nsatoms are advanced/decremented in
;# place in their argument slots.
;# Results: Vc[gid] and Vnb[gid] are incremented once per outer iteration.
;# eax, ebx, ecx, edx, esi, edi are saved and restored; femms is issued on
;# entry and exit. At least one outer iteration is always executed.
;#
;# The "jcc over jmp" branch pairs are kept from the original code;
;# presumably they keep conditional jumps short for assemblers that do
;# not auto-relax them - confirm before simplifying.
;# ---------------------------------------------------------------------
.globl mcinl3110_3dnow
.globl _mcinl3110_3dnow
mcinl3110_3dnow:
_mcinl3110_3dnow:
;# stack offsets for the arguments (relative to ebp)
.equiv mci3110_nri, 8
.equiv mci3110_iinr, 12
.equiv mci3110_jindex, 16
.equiv mci3110_jjnr, 20
.equiv mci3110_shift, 24
.equiv mci3110_shiftvec, 28
.equiv mci3110_gid, 32
.equiv mci3110_pos, 36
.equiv mci3110_charge, 40
.equiv mci3110_facel, 44
.equiv mci3110_Vc, 48
.equiv mci3110_type, 52
.equiv mci3110_ntype, 56
.equiv mci3110_nbfp, 60
.equiv mci3110_Vnb, 64
.equiv mci3110_tabscale, 68
.equiv mci3110_VFtab, 72
.equiv mci3110_nsatoms, 76
;# stack offsets for local variables (relative to esp)
.equiv mci3110_is3, 0
.equiv mci3110_ii3, 4
.equiv mci3110_shX, 8
.equiv mci3110_shY, 12
.equiv mci3110_shZ, 16
.equiv mci3110_ix, 20
.equiv mci3110_iy, 24
.equiv mci3110_iz, 28
.equiv mci3110_iq, 32
.equiv mci3110_vctot, 40
.equiv mci3110_vnbtot, 48
.equiv mci3110_c6, 56
.equiv mci3110_c12, 64
.equiv mci3110_two, 72                  ;# not used by this energy-only loop; slot kept so the frame layout is unchanged
.equiv mci3110_n1, 80
.equiv mci3110_tsc, 88
.equiv mci3110_ntia, 96
.equiv mci3110_innerjjnr0, 104
.equiv mci3110_innerk0, 108
.equiv mci3110_innerjjnr, 112
.equiv mci3110_innerk, 116
.equiv mci3110_nsvdwc, 120
.equiv mci3110_nscoul, 124
.equiv mci3110_nsvdw, 128
.equiv mci3110_solnr, 132
        push ebp
        mov ebp,esp
        push eax
        push ebx
        push ecx
        push edx
        push esi
        push edi
        sub esp, 136                    ;# local stack space
        femms
        movd mm3, [ebp + mci3110_tabscale]
        punpckldq mm3,mm3               ;# tabscale in both halves
        movq [esp + mci3110_tsc], mm3
        ;# assume we have at least one i particle - start directly
.mci3110_outer:
        mov eax, [ebp + mci3110_shift]  ;# eax = pointer into shift[]
        mov ebx, [eax]                  ;# ebx=shift[n]
        add dword ptr [ebp + mci3110_shift], 4 ;# advance pointer one step

        lea ebx, [ebx + ebx*2]          ;# ebx=3*is
        mov [esp + mci3110_is3],ebx     ;# store is3

        mov eax, [ebp + mci3110_shiftvec] ;# eax = base of shiftvec[]

        movq mm0, [eax + ebx*4]         ;# move shX/shY to mm0 and shZ to mm1
        movd mm1, [eax + ebx*4 + 8]
        movq [esp + mci3110_shX], mm0
        movd [esp + mci3110_shZ], mm1

        mov ecx, [ebp + mci3110_iinr]   ;# ecx = pointer into iinr[]
        add dword ptr [ebp + mci3110_iinr], 4 ;# advance pointer
        mov ebx, [ecx]                  ;# ebx=ii

        ;# split the i atoms into the three interaction groups
        mov eax, [ebp + mci3110_nsatoms]
        add dword ptr [ebp + mci3110_nsatoms], 12
        mov ecx, [eax]
        mov edx, [eax + 4]
        mov eax, [eax + 8]
        sub ecx, eax                    ;# ecx = nsatoms[0]-nsatoms[2]
        sub eax, edx                    ;# eax = nsatoms[2]-nsatoms[1]

        mov [esp + mci3110_nsvdwc], edx
        mov [esp + mci3110_nscoul], eax
        mov [esp + mci3110_nsvdw], ecx

        ;# clear potential
        pxor mm7,mm7
        movq [esp + mci3110_vctot], mm7
        movq [esp + mci3110_vnbtot], mm7
        mov [esp + mci3110_solnr], ebx  ;# first i atom index for this entry

        mov eax, [ebp + mci3110_jindex]
        mov ecx, [eax]                  ;# jindex[n]
        mov edx, [eax + 4]              ;# jindex[n+1]
        add dword ptr [ebp + mci3110_jindex], 4
        sub edx, ecx                    ;# number of innerloop atoms
        mov eax, [ebp + mci3110_jjnr]
        shl ecx, 2
        add eax, ecx
        mov [esp + mci3110_innerjjnr0], eax ;# pointer to jjnr[nj0]

        mov [esp + mci3110_innerk0], edx ;# number of innerloop atoms
        mov esi, [ebp + mci3110_pos]    ;# (esi is reloaded before each use below)

        mov ecx, [esp + mci3110_nsvdwc]
        test ecx, ecx
        jnz .mci3110_mno_vdwc
        jmp .mci3110_testcoul

;# ------------------------- group 1: LJ + Coulomb -------------------------
.mci3110_mno_vdwc:
        mov ebx, [esp + mci3110_solnr]
        inc dword ptr [esp + mci3110_solnr]
        mov edx, [ebp + mci3110_charge]
        movd mm2, [edx + ebx*4]         ;# mm2=charge[ii]
        pfmul mm2, [ebp + mci3110_facel] ;# only the low half of the product is used
        punpckldq mm2,mm2               ;# spread to both halves
        movq [esp + mci3110_iq], mm2    ;# iq =facel*charge[ii]

        mov edx, [ebp + mci3110_type]
        mov edx, [edx + ebx*4]
        imul edx, [ebp + mci3110_ntype]
        shl edx, 1
        mov [esp + mci3110_ntia], edx   ;# ntia = 2*ntype*type[ii]

        lea ebx, [ebx + ebx*2]          ;# ebx = 3*ii=ii3
        mov eax, [ebp + mci3110_pos]    ;# eax = base of pos[]
        mov [esp + mci3110_ii3], ebx

        movq mm0, [eax + ebx*4]
        movd mm1, [eax + ebx*4 + 8]
        pfadd mm0, [esp + mci3110_shX]
        pfadd mm1, [esp + mci3110_shZ]
        movq [esp + mci3110_ix], mm0    ;# shifted i coordinates
        movd [esp + mci3110_iz], mm1

        mov ecx, [esp + mci3110_innerjjnr0]
        mov [esp + mci3110_innerjjnr], ecx
        mov edx, [esp + mci3110_innerk0]
        sub edx, 2
        mov [esp + mci3110_innerk], edx ;# number of innerloop atoms
        jge .mci3110_unroll_vdwc_loop
        jmp .mci3110_finish_vdwc_inner
.mci3110_unroll_vdwc_loop:
        ;# paired innerloop starts here
        mov ecx, [esp + mci3110_innerjjnr] ;# pointer to jjnr[k]
        mov eax, [ecx]
        mov ebx, [ecx + 4]              ;# eax/ebx=jnr
        add dword ptr [esp + mci3110_innerjjnr], 8 ;# advance pointer (unrolled 2)
        prefetch [ecx + 16]             ;# prefetch data - trial and error says 16 is best

        mov ecx, [ebp + mci3110_charge] ;# base of charge[]
        movq mm5, [esp + mci3110_iq]
        movd mm3, [ecx + eax*4]         ;# charge[jnr1]
        punpckldq mm3, [ecx + ebx*4]    ;# move charge 2 to high part of mm3
        pfmul mm3,mm5                   ;# mm3 now has qq for both particles

        mov ecx, [ebp + mci3110_type]
        mov edx, [ecx + eax*4]          ;# type [jnr1]
        mov ecx, [ecx + ebx*4]          ;# type [jnr2]

        mov esi, [ebp + mci3110_nbfp]   ;# base of nbfp
        shl edx, 1
        shl ecx, 1
        add edx, [esp + mci3110_ntia]   ;# tja = ntia + 2*type
        add ecx, [esp + mci3110_ntia]

        movq mm5, [esi + edx*4]         ;# mm5 = 1st c6 / c12
        movq mm7, [esi + ecx*4]         ;# mm7 = 2nd c6 / c12
        movq mm6,mm5
        punpckldq mm5,mm7               ;# mm5 = 1st c6 / 2nd c6
        punpckhdq mm6,mm7               ;# mm6 = 1st c12 / 2nd c12
        movq [esp + mci3110_c6], mm5
        movq [esp + mci3110_c12], mm6

        lea eax, [eax + eax*2]          ;# replace jnr with j3
        lea ebx, [ebx + ebx*2]

        mov esi, [ebp + mci3110_pos]

        movq mm0, [esp + mci3110_ix]
        movd mm1, [esp + mci3110_iz]
        movq mm4, [esi + eax*4]         ;# fetch first j coordinates
        movd mm5, [esi + eax*4 + 8]
        pfsubr mm4,mm0                  ;# dr = ir - jr
        pfsubr mm5,mm1
        pfmul mm4,mm4                   ;# square dx,dy,dz
        pfmul mm5,mm5
        pfacc mm4, mm5                  ;# accumulate to get dx*dx+ dy*dy+ dz*dz
        pfacc mm4, mm5                  ;# first rsq in lower mm4

        movq mm6, [esi + ebx*4]         ;# fetch second j coordinates
        movd mm7, [esi + ebx*4 + 8]

        pfsubr mm6,mm0                  ;# dr = ir - jr
        pfsubr mm7,mm1
        pfmul mm6,mm6                   ;# square dx,dy,dz
        pfmul mm7,mm7
        pfacc mm6, mm7                  ;# accumulate to get dx*dx+ dy*dy+ dz*dz
        pfacc mm6, mm7                  ;# second rsq in lower mm6

        pfrsqrt mm0, mm4                ;# lookup inverse square root seed
        pfrsqrt mm1, mm6

        punpckldq mm0,mm1
        punpckldq mm4,mm6               ;# now 4 has rsq and 0 the seed for both pairs.
        movq mm2,mm0                    ;# amd 3dnow N-R iteration to get full precision.
        pfmul mm0,mm0
        pfrsqit1 mm0,mm4
        pfrcpit2 mm0,mm2
        pfmul mm4, mm0
        movq mm1, mm4
        ;# mm0 is invsqrt, and mm1 r.
        ;# tabulated coulomb potential
        pfmul mm1, [esp + mci3110_tsc]  ;# mm1=rt
        pf2iw mm4,mm1
        movq [esp + mci3110_n1], mm4    ;# table indices for both pairs
        pi2fd mm4,mm4
        pfsub mm1, mm4                  ;# now mm1 is eps and mm4 is n0

        movq mm2,mm1
        pfmul mm2,mm2                   ;# mm1 is eps, mm2 is eps2

        mov edx, [ebp + mci3110_VFtab]
        mov ecx, [esp + mci3110_n1]
        shl ecx, 2                      ;# four floats per table point
        ;# coulomb table
        ;# load all the table values we need
        movd mm4, [edx + ecx*4]
        movd mm5, [edx + ecx*4 + 4]
        movd mm6, [edx + ecx*4 + 8]
        movd mm7, [edx + ecx*4 + 12]
        mov ecx, [esp + mci3110_n1 + 4]
        shl ecx, 2
        punpckldq mm4, [edx + ecx*4]
        punpckldq mm5, [edx + ecx*4 + 4]
        punpckldq mm6, [edx + ecx*4 + 8]
        punpckldq mm7, [edx + ecx*4 + 12]

        pfmul mm6, mm1                  ;# mm6 = Geps
        pfmul mm7, mm2                  ;# mm7 = Heps2

        pfadd mm5, mm6
        pfadd mm5, mm7                  ;# mm5 = Fp

        pfmul mm5, mm1                  ;# mm5=eps*Fp
        pfadd mm5, mm4                  ;# mm5= VV

        pfmul mm5, mm3                  ;# vcoul=qq*VV

        ;# plain Lennard-Jones from invsqrt
        movq mm1, mm0
        pfmul mm1,mm1                   ;# mm1=invsq
        movq mm2, mm1
        pfmul mm2,mm1
        pfmul mm2,mm1                   ;# mm2=rinvsix
        movq mm1,mm2
        pfmul mm1,mm1                   ;# mm1=rinvtwelve

        pfmul mm1, [esp + mci3110_c12]

        pfmul mm2, [esp + mci3110_c6]

        movq mm4, mm1
        pfsub mm4, mm2                  ;# mm4 = vnb12-vnb6
        ;# update vctot
        pfadd mm5, [esp + mci3110_vctot] ;# add the earlier value
        movq [esp + mci3110_vctot], mm5 ;# store the sum
        ;# update vnbtot
        pfadd mm4, [esp + mci3110_vnbtot] ;# add the earlier value
        movq [esp + mci3110_vnbtot], mm4 ;# store the sum

        ;# should we do one more iteration?
        sub dword ptr [esp + mci3110_innerk], 2
        jl .mci3110_finish_vdwc_inner
        jmp .mci3110_unroll_vdwc_loop
.mci3110_finish_vdwc_inner:
        and dword ptr [esp + mci3110_innerk], 1 ;# odd count -> one j atom left
        jnz .mci3110_single_vdwc_inner
        jmp .mci3110_updateouterdata_vdwc
.mci3110_single_vdwc_inner:
        ;# a single j particle iteration here - compare with the unrolled code for comments.
        mov eax, [esp + mci3110_innerjjnr]
        mov eax, [eax]                  ;# eax=jnr offset

        mov ecx, [ebp + mci3110_charge]
        movd mm5, [esp + mci3110_iq]
        movd mm3, [ecx + eax*4]
        pfmul mm3, mm5                  ;# mm3=qq

        mov esi, [ebp + mci3110_nbfp]
        mov ecx, [ebp + mci3110_type]
        mov edx, [ecx + eax*4]          ;# type [jnr1]
        shl edx, 1
        add edx, [esp + mci3110_ntia]   ;# tja = ntia + 2*type
        movd mm5, [esi + edx*4]         ;# mm5 = 1st c6
        movq [esp + mci3110_c6], mm5
        movd mm5, [esi + edx*4 + 4]     ;# mm5 = 1st c12
        movq [esp + mci3110_c12], mm5

        mov esi, [ebp + mci3110_pos]
        lea eax, [eax + eax*2]

        movq mm0, [esp + mci3110_ix]
        movd mm1, [esp + mci3110_iz]
        movq mm4, [esi + eax*4]
        movd mm5, [esi + eax*4 + 8]
        pfsubr mm4, mm0
        pfsubr mm5, mm1
        pfmul mm4,mm4
        pfmul mm5,mm5
        pfacc mm4, mm5
        pfacc mm4, mm5                  ;# mm4=rsq

        pfrsqrt mm0,mm4
        movq mm2,mm0
        pfmul mm0,mm0
        pfrsqit1 mm0,mm4
        pfrcpit2 mm0,mm2                ;# mm0=invsqrt
        pfmul mm4, mm0
        movq mm1, mm4
        ;# mm0 is invsqrt, and mm1 r.
        ;# tabulated coulomb potential
        pfmul mm1, [esp + mci3110_tsc]  ;# mm1=rt
        pf2iw mm4,mm1
        movd [esp + mci3110_n1], mm4
        pi2fd mm4,mm4
        pfsub mm1, mm4                  ;# now mm1 is eps and mm4 is n0

        movq mm2,mm1
        pfmul mm2,mm2                   ;# mm1 is eps, mm2 is eps2

        ;# coulomb table
        mov edx, [ebp + mci3110_VFtab]
        mov ecx, [esp + mci3110_n1]
        shl ecx, 2
        ;# load all the table values we need
        movd mm4, [edx + ecx*4]
        movd mm5, [edx + ecx*4 + 4]
        movd mm6, [edx + ecx*4 + 8]
        movd mm7, [edx + ecx*4 + 12]

        pfmul mm6, mm1                  ;# mm6 = Geps
        pfmul mm7, mm2                  ;# mm7 = Heps2

        pfadd mm5, mm6
        pfadd mm5, mm7                  ;# mm5 = Fp

        pfmul mm5, mm1                  ;# mm5=eps*Fp
        pfadd mm5, mm4                  ;# mm5= VV

        pfmul mm5, mm3                  ;# vcoul=qq*VV

        movq mm1, mm0
        pfmul mm1,mm1                   ;# mm1=invsq
        movq mm2, mm1
        pfmul mm2,mm1
        pfmul mm2,mm1                   ;# mm2=rinvsix
        movq mm1,mm2
        pfmul mm1,mm1                   ;# mm1=rinvtwelve

        pfmul mm1, [esp + mci3110_c12]

        pfmul mm2, [esp + mci3110_c6]

        movq mm4, mm1
        pfsub mm4, mm2                  ;# mm4 = vnb12-vnb6
        ;# update vctot
        pfadd mm5, [esp + mci3110_vctot] ;# add the earlier value
        movq [esp + mci3110_vctot], mm5 ;# store the sum
        ;# update vnbtot
        pfadd mm4, [esp + mci3110_vnbtot] ;# add the earlier value
        movq [esp + mci3110_vnbtot], mm4 ;# store the sum

.mci3110_updateouterdata_vdwc:
        ;# loop back to mno
        dec dword ptr [esp + mci3110_nsvdwc]
        jz .mci3110_testcoul
        jmp .mci3110_mno_vdwc

;# ------------------------- group 2: Coulomb only -------------------------
.mci3110_testcoul:
        mov ecx, [esp + mci3110_nscoul]
        test ecx, ecx
        jnz .mci3110_mno_coul
        jmp .mci3110_testvdw
.mci3110_mno_coul:
        mov ebx, [esp + mci3110_solnr]
        inc dword ptr [esp + mci3110_solnr]
        mov edx, [ebp + mci3110_charge]
        movd mm2, [edx + ebx*4]         ;# mm2=charge[ii]
        pfmul mm2, [ebp + mci3110_facel] ;# only the low half of the product is used
        punpckldq mm2,mm2               ;# spread to both halves
        movq [esp + mci3110_iq], mm2    ;# iq =facel*charge[ii]

        lea ebx, [ebx + ebx*2]          ;# ebx = 3*ii=ii3
        mov eax, [ebp + mci3110_pos]    ;# eax = base of pos[]
        mov [esp + mci3110_ii3], ebx

        movq mm0, [eax + ebx*4]
        movd mm1, [eax + ebx*4 + 8]
        pfadd mm0, [esp + mci3110_shX]
        pfadd mm1, [esp + mci3110_shZ]
        movq [esp + mci3110_ix], mm0
        movd [esp + mci3110_iz], mm1

        mov ecx, [esp + mci3110_innerjjnr0]
        mov [esp + mci3110_innerjjnr], ecx
        mov edx, [esp + mci3110_innerk0]
        sub edx, 2
        mov [esp + mci3110_innerk], edx ;# number of innerloop atoms
        jge .mci3110_unroll_coul_loop
        jmp .mci3110_finish_coul_inner
.mci3110_unroll_coul_loop:
        ;# paired innerloop starts here
        mov ecx, [esp + mci3110_innerjjnr] ;# pointer to jjnr[k]
        mov eax, [ecx]
        mov ebx, [ecx + 4]              ;# eax/ebx=jnr
        add dword ptr [esp + mci3110_innerjjnr], 8 ;# advance pointer (unrolled 2)
        prefetch [ecx + 16]             ;# prefetch data - trial and error says 16 is best

        mov ecx, [ebp + mci3110_charge] ;# base of charge[]
        movq mm5, [esp + mci3110_iq]
        movd mm3, [ecx + eax*4]         ;# charge[jnr1]
        punpckldq mm3, [ecx + ebx*4]    ;# move charge 2 to high part of mm3
        pfmul mm3,mm5                   ;# mm3 now has qq for both particles

        lea eax, [eax + eax*2]          ;# replace jnr with j3
        lea ebx, [ebx + ebx*2]

        mov esi, [ebp + mci3110_pos]

        movq mm0, [esp + mci3110_ix]
        movd mm1, [esp + mci3110_iz]
        movq mm4, [esi + eax*4]         ;# fetch first j coordinates
        movd mm5, [esi + eax*4 + 8]
        pfsubr mm4,mm0                  ;# dr = ir - jr
        pfsubr mm5,mm1
        pfmul mm4,mm4                   ;# square dx,dy,dz
        pfmul mm5,mm5
        pfacc mm4, mm5                  ;# accumulate to get dx*dx+ dy*dy+ dz*dz
        pfacc mm4, mm5                  ;# first rsq in lower mm4

        movq mm6, [esi + ebx*4]         ;# fetch second j coordinates
        movd mm7, [esi + ebx*4 + 8]

        pfsubr mm6,mm0                  ;# dr = ir - jr
        pfsubr mm7,mm1
        pfmul mm6,mm6                   ;# square dx,dy,dz
        pfmul mm7,mm7
        pfacc mm6, mm7                  ;# accumulate to get dx*dx+ dy*dy+ dz*dz
        pfacc mm6, mm7                  ;# second rsq in lower mm6

        pfrsqrt mm0, mm4                ;# lookup inverse square root seed
        pfrsqrt mm1, mm6

        punpckldq mm0,mm1
        punpckldq mm4,mm6               ;# now 4 has rsq and 0 the seed for both pairs.
        movq mm2,mm0                    ;# amd 3dnow N-R iteration to get full precision.
        pfmul mm0,mm0
        pfrsqit1 mm0,mm4
        pfrcpit2 mm0,mm2
        pfmul mm4, mm0
        movq mm1, mm4
        ;# mm0 is invsqrt, and mm1 r.
        ;# tabulated coulomb potential
        pfmul mm1, [esp + mci3110_tsc]  ;# mm1=rt
        pf2iw mm4,mm1
        movq [esp + mci3110_n1], mm4
        pi2fd mm4,mm4
        pfsub mm1, mm4                  ;# now mm1 is eps and mm4 is n0

        movq mm2,mm1
        pfmul mm2,mm2                   ;# mm1 is eps, mm2 is eps2

        mov edx, [ebp + mci3110_VFtab]
        mov ecx, [esp + mci3110_n1]
        shl ecx, 2
        ;# coulomb table
        ;# load all the table values we need
        movd mm4, [edx + ecx*4]
        movd mm5, [edx + ecx*4 + 4]
        movd mm6, [edx + ecx*4 + 8]
        movd mm7, [edx + ecx*4 + 12]
        mov ecx, [esp + mci3110_n1 + 4]
        shl ecx, 2
        punpckldq mm4, [edx + ecx*4]
        punpckldq mm5, [edx + ecx*4 + 4]
        punpckldq mm6, [edx + ecx*4 + 8]
        punpckldq mm7, [edx + ecx*4 + 12]

        pfmul mm6, mm1                  ;# mm6 = Geps
        pfmul mm7, mm2                  ;# mm7 = Heps2

        pfadd mm5, mm6
        pfadd mm5, mm7                  ;# mm5 = Fp

        pfmul mm5, mm1                  ;# mm5=eps*Fp
        pfadd mm5, mm4                  ;# mm5= VV

        pfmul mm5, mm3                  ;# vcoul=qq*VV

        ;# at this point mm5 contains vcoul
        ;# update vctot
        pfadd mm5, [esp + mci3110_vctot] ;# add the earlier value
        movq [esp + mci3110_vctot], mm5 ;# store the sum

        ;# should we do one more iteration?
        sub dword ptr [esp + mci3110_innerk], 2
        jl .mci3110_finish_coul_inner
        jmp .mci3110_unroll_coul_loop
.mci3110_finish_coul_inner:
        and dword ptr [esp + mci3110_innerk], 1 ;# odd count -> one j atom left
        jnz .mci3110_single_coul_inner
        jmp .mci3110_updateouterdata_coul
.mci3110_single_coul_inner:
        ;# a single j particle iteration here - compare with the unrolled code for comments.
        mov eax, [esp + mci3110_innerjjnr]
        mov eax, [eax]                  ;# eax=jnr offset

        mov ecx, [ebp + mci3110_charge]
        movd mm5, [esp + mci3110_iq]
        movd mm3, [ecx + eax*4]
        pfmul mm3, mm5                  ;# mm3=qq

        mov esi, [ebp + mci3110_pos]
        lea eax, [eax + eax*2]

        movq mm0, [esp + mci3110_ix]
        movd mm1, [esp + mci3110_iz]
        movq mm4, [esi + eax*4]
        movd mm5, [esi + eax*4 + 8]
        pfsubr mm4, mm0
        pfsubr mm5, mm1
        pfmul mm4,mm4
        pfmul mm5,mm5
        pfacc mm4, mm5
        pfacc mm4, mm5                  ;# mm4=rsq

        pfrsqrt mm0,mm4
        movq mm2,mm0
        pfmul mm0,mm0
        pfrsqit1 mm0,mm4
        pfrcpit2 mm0,mm2                ;# mm0=invsqrt
        pfmul mm4, mm0
        movq mm1, mm4
        ;# mm0 is invsqrt, and mm1 r.

        ;# tabulated coulomb potential
        pfmul mm1, [esp + mci3110_tsc]  ;# mm1=rt
        pf2iw mm4,mm1
        movd [esp + mci3110_n1], mm4
        pi2fd mm4,mm4
        pfsub mm1, mm4                  ;# now mm1 is eps and mm4 is n0

        movq mm2,mm1
        pfmul mm2,mm2                   ;# mm1 is eps, mm2 is eps2

        ;# coulomb table
        mov edx, [ebp + mci3110_VFtab]
        mov ecx, [esp + mci3110_n1]
        shl ecx, 2
        ;# load all the table values we need
        movd mm4, [edx + ecx*4]
        movd mm5, [edx + ecx*4 + 4]
        movd mm6, [edx + ecx*4 + 8]
        movd mm7, [edx + ecx*4 + 12]

        pfmul mm6, mm1                  ;# mm6 = Geps
        pfmul mm7, mm2                  ;# mm7 = Heps2

        pfadd mm5, mm6
        pfadd mm5, mm7                  ;# mm5 = Fp

        pfmul mm5, mm1                  ;# mm5=eps*Fp
        pfadd mm5, mm4                  ;# mm5= VV

        pfmul mm5, mm3                  ;# vcoul=qq*VV

        ;# at this point mm5 contains vcoul
        ;# update vctot
        pfadd mm5, [esp + mci3110_vctot] ;# add the earlier value
        movq [esp + mci3110_vctot], mm5 ;# store the sum

.mci3110_updateouterdata_coul:
        ;# loop back to mno
        dec dword ptr [esp + mci3110_nscoul]
        jz .mci3110_testvdw
        jmp .mci3110_mno_coul

;# ------------------------- group 3: LJ only -------------------------
.mci3110_testvdw:
        mov ecx, [esp + mci3110_nsvdw]
        test ecx, ecx
        jnz .mci3110_mno_vdw
        jmp .mci3110_last_mno
.mci3110_mno_vdw:
        mov ebx, [esp + mci3110_solnr]
        inc dword ptr [esp + mci3110_solnr]

        mov edx, [ebp + mci3110_type]
        mov edx, [edx + ebx*4]
        imul edx, [ebp + mci3110_ntype]
        shl edx, 1
        mov [esp + mci3110_ntia], edx   ;# ntia = 2*ntype*type[ii]

        lea ebx, [ebx + ebx*2]          ;# ebx = 3*ii=ii3
        mov eax, [ebp + mci3110_pos]    ;# eax = base of pos[]
        mov [esp + mci3110_ii3], ebx

        movq mm0, [eax + ebx*4]
        movd mm1, [eax + ebx*4 + 8]
        pfadd mm0, [esp + mci3110_shX]
        pfadd mm1, [esp + mci3110_shZ]
        movq [esp + mci3110_ix], mm0
        movd [esp + mci3110_iz], mm1

        mov ecx, [esp + mci3110_innerjjnr0]
        mov [esp + mci3110_innerjjnr], ecx
        mov edx, [esp + mci3110_innerk0]
        sub edx, 2
        mov [esp + mci3110_innerk], edx ;# number of innerloop atoms
        jge .mci3110_unroll_vdw_loop
        jmp .mci3110_finish_vdw_inner
.mci3110_unroll_vdw_loop:
        ;# paired innerloop starts here
        mov ecx, [esp + mci3110_innerjjnr] ;# pointer to jjnr[k]
        mov eax, [ecx]
        mov ebx, [ecx + 4]              ;# eax/ebx=jnr
        add dword ptr [esp + mci3110_innerjjnr], 8 ;# advance pointer (unrolled 2)
        prefetch [ecx + 16]             ;# prefetch data - trial and error says 16 is best

        mov ecx, [ebp + mci3110_type]
        mov edx, [ecx + eax*4]          ;# type [jnr1]
        mov ecx, [ecx + ebx*4]          ;# type [jnr2]

        mov esi, [ebp + mci3110_nbfp]   ;# base of nbfp
        shl edx, 1
        shl ecx, 1
        add edx, [esp + mci3110_ntia]   ;# tja = ntia + 2*type
        add ecx, [esp + mci3110_ntia]

        movq mm5, [esi + edx*4]         ;# mm5 = 1st c6 / c12
        movq mm7, [esi + ecx*4]         ;# mm7 = 2nd c6 / c12
        movq mm6,mm5
        punpckldq mm5,mm7               ;# mm5 = 1st c6 / 2nd c6
        punpckhdq mm6,mm7               ;# mm6 = 1st c12 / 2nd c12
        movq [esp + mci3110_c6], mm5
        movq [esp + mci3110_c12], mm6

        lea eax, [eax + eax*2]          ;# replace jnr with j3
        lea ebx, [ebx + ebx*2]

        mov esi, [ebp + mci3110_pos]

        movq mm0, [esp + mci3110_ix]
        movd mm1, [esp + mci3110_iz]
        movq mm4, [esi + eax*4]         ;# fetch first j coordinates
        movd mm5, [esi + eax*4 + 8]
        pfsubr mm4,mm0                  ;# dr = ir - jr
        pfsubr mm5,mm1
        pfmul mm4,mm4                   ;# square dx,dy,dz
        pfmul mm5,mm5
        pfacc mm4, mm5                  ;# accumulate to get dx*dx+ dy*dy+ dz*dz
        pfacc mm4, mm5                  ;# first rsq in lower mm4

        movq mm6, [esi + ebx*4]         ;# fetch second j coordinates
        movd mm7, [esi + ebx*4 + 8]

        pfsubr mm6,mm0                  ;# dr = ir - jr
        pfsubr mm7,mm1
        pfmul mm6,mm6                   ;# square dx,dy,dz
        pfmul mm7,mm7
        pfacc mm6, mm7                  ;# accumulate to get dx*dx+ dy*dy+ dz*dz
        pfacc mm6, mm7                  ;# second rsq in lower mm6

        pfrcp mm0, mm4                  ;# lookup reciprocal seed
        pfrcp mm1, mm6

        punpckldq mm0,mm1
        punpckldq mm4,mm6               ;# now 4 has rsq and 0 the seed for both pairs.
        ;# amd 3dnow N-R iteration to get full precision.
        pfrcpit1 mm4,mm0
        pfrcpit2 mm4,mm0
        ;# mm4 now contains invsq,
        ;# do the LJ potential

        movq mm0, mm4
        pfmul mm4, mm0
        pfmul mm4, mm0                  ;# mm4=rinvsix
        movq mm5, mm4
        pfmul mm5, mm5                  ;# mm5=rinvtwelve

        pfmul mm5, [esp + mci3110_c12]
        pfmul mm4, [esp + mci3110_c6]
        movq mm6, mm5                   ;# mm6 is vnb12-vnb6
        pfsub mm6, mm4
        ;# update vnbtot
        pfadd mm6, [esp + mci3110_vnbtot] ;# add the earlier value
        movq [esp + mci3110_vnbtot], mm6 ;# store the sum

        ;# should we do one more iteration?
        sub dword ptr [esp + mci3110_innerk], 2
        jl .mci3110_finish_vdw_inner
        jmp .mci3110_unroll_vdw_loop
.mci3110_finish_vdw_inner:
        and dword ptr [esp + mci3110_innerk], 1 ;# odd count -> one j atom left
        jnz .mci3110_single_vdw_inner
        jmp .mci3110_updateouterdata_vdw
.mci3110_single_vdw_inner:
        ;# a single j particle iteration here - compare with the unrolled code for comments.
        mov eax, [esp + mci3110_innerjjnr]
        mov eax, [eax]                  ;# eax=jnr offset

        mov esi, [ebp + mci3110_nbfp]
        mov ecx, [ebp + mci3110_type]
        mov edx, [ecx + eax*4]          ;# type [jnr1]
        shl edx, 1
        add edx, [esp + mci3110_ntia]   ;# tja = ntia + 2*type
        movd mm5, [esi + edx*4]         ;# mm5 = 1st c6
        movq [esp + mci3110_c6], mm5
        movd mm5, [esi + edx*4 + 4]     ;# mm5 = 1st c12
        movq [esp + mci3110_c12], mm5

        mov esi, [ebp + mci3110_pos]
        lea eax, [eax + eax*2]

        movq mm0, [esp + mci3110_ix]
        movd mm1, [esp + mci3110_iz]
        movq mm4, [esi + eax*4]
        movd mm5, [esi + eax*4 + 8]
        pfsubr mm4, mm0
        pfsubr mm5, mm1
        pfmul mm4,mm4
        pfmul mm5,mm5
        pfacc mm4, mm5
        pfacc mm4, mm5                  ;# mm4=rsq

        pfrcp mm0,mm4
        pfrcpit1 mm4,mm0
        pfrcpit2 mm4,mm0                ;# mm4=invsq
        ;# calculate the LJ potential
        movq mm0, mm4

        pfmul mm4, mm0
        pfmul mm4, mm0                  ;# mm4=rinvsix
        movq mm5, mm4
        pfmul mm5, mm5                  ;# mm5=rinvtwelve

        pfmul mm5, [esp + mci3110_c12]
        pfmul mm4, [esp + mci3110_c6]
        movq mm6, mm5                   ;# mm6 is vnb12-vnb6
        pfsub mm6, mm4
        ;# update vnbtot
        pfadd mm6, [esp + mci3110_vnbtot] ;# add the earlier value
        movq [esp + mci3110_vnbtot], mm6 ;# store the sum

.mci3110_updateouterdata_vdw:
        ;# loop back to mno
        dec dword ptr [esp + mci3110_nsvdw]
        jz .mci3110_last_mno
        jmp .mci3110_mno_vdw

;# ------------------------- write back energies -------------------------
.mci3110_last_mno:
        mov edx, [ebp + mci3110_gid]    ;# get group index for this i particle
        mov edx, [edx]
        add dword ptr [ebp + mci3110_gid], 4 ;# advance pointer

        movq mm7, [esp + mci3110_vctot]
        pfacc mm7,mm7                   ;# get and sum the two parts of total potential

        mov eax, [ebp + mci3110_Vc]
        movd mm6, [eax + edx*4]
        pfadd mm6, mm7
        movd [eax + edx*4], mm6         ;# increment vc[gid]

        movq mm7, [esp + mci3110_vnbtot]
        pfacc mm7,mm7                   ;# get and sum the two parts of total potential

        mov eax, [ebp + mci3110_Vnb]
        movd mm6, [eax + edx*4]
        pfadd mm6, mm7
        movd [eax + edx*4], mm6         ;# increment vnb[gid]
        ;# finish if last
        mov ecx, [ebp + mci3110_nri]
        dec ecx
        jecxz .mci3110_end
        ;# not last, iterate once more!
        mov [ebp + mci3110_nri], ecx
        jmp .mci3110_outer
.mci3110_end:
        femms
        add esp, 136                    ;# release local stack space
        pop edi
        pop esi
        pop edx
        pop ecx
        pop ebx
        pop eax
        leave
        ret                             ;# return to caller; without it control runs on into the next routine
23251 .globl mcinl3120_3dnow
23252 .globl _mcinl3120_3dnow
23253 mcinl3120_3dnow:
23254 _mcinl3120_3dnow:
23255 .equiv mci3120_nri, 8
23256 .equiv mci3120_iinr, 12
23257 .equiv mci3120_jindex, 16
23258 .equiv mci3120_jjnr, 20
23259 .equiv mci3120_shift, 24
23260 .equiv mci3120_shiftvec, 28
23261 .equiv mci3120_gid, 32
23262 .equiv mci3120_pos, 36
23263 .equiv mci3120_charge, 40
23264 .equiv mci3120_facel, 44
23265 .equiv mci3120_Vc, 48
23266 .equiv mci3120_type, 52
23267 .equiv mci3120_ntype, 56
23268 .equiv mci3120_nbfp, 60
23269 .equiv mci3120_Vnb, 64
23270 .equiv mci3120_tabscale, 68
23271 .equiv mci3120_VFtab, 72
23272 ;# stack offsets for local variables
23273 .equiv mci3120_is3, 0
23274 .equiv mci3120_ii3, 4
23275 .equiv mci3120_ixO, 8
23276 .equiv mci3120_iyO, 12
23277 .equiv mci3120_izO, 16
23278 .equiv mci3120_ixH, 20
23279 .equiv mci3120_iyH, 28
23280 .equiv mci3120_izH, 36
23281 .equiv mci3120_iqO, 44
23282 .equiv mci3120_iqH, 52
23283 .equiv mci3120_qqO, 60
23284 .equiv mci3120_qqH, 68
23285 .equiv mci3120_vctot, 76
23286 .equiv mci3120_vnbtot, 84
23287 .equiv mci3120_c6, 92
23288 .equiv mci3120_c12, 100
23289 .equiv mci3120_n1, 108
23290 .equiv mci3120_tsc, 116
23291 .equiv mci3120_ntia, 124
23292 .equiv mci3120_innerjjnr, 128
23293 .equiv mci3120_innerk, 132
23294 .equiv mci3120_tmprsqH, 136
23295 push ebp
23296 mov ebp,esp
23297 push eax
23298 push ebx
23299 push ecx
23300 push edx
23301 push esi
23302 push edi
23303 sub esp, 144 ;# local stack space
23304 femms
23306 mov ecx, [ebp + mci3120_iinr] ;# ecx = pointer into iinr[]
23307 mov ebx, [ecx] ;# ebx=ii
23309 mov edx, [ebp + mci3120_charge]
23310 movd mm1, [ebp + mci3120_facel]
23311 movd mm2, [edx + ebx*4] ;# mm2=charge[ii0]
23312 pfmul mm2, mm1
23313 movq [esp + mci3120_iqO], mm2 ;# iqO = facel*charge[ii]
23315 movd mm2, [edx + ebx*4 + 4] ;# mm2=charge[ii0+1]
23316 pfmul mm2, mm1
23317 punpckldq mm2,mm2 ;# spread to both halves
23318 movq [esp + mci3120_iqH], mm2 ;# iqH = facel*charge[ii0+1]
23320 mov edx, [ebp + mci3120_type]
23321 mov edx, [edx + ebx*4]
23322 shl edx, 1
23323 mov ecx, edx
23324 imul ecx, [ebp + mci3120_ntype] ;# ecx = ntia = 2*ntype*type[ii0]
23325 mov [esp + mci3120_ntia], ecx
23327 movq mm6, [ebp + mci3120_tabscale]
23328 punpckldq mm6,mm6 ;# spread to both halves
23329 movq [esp + mci3120_tsc], mm6
23330 ;# assume we have at least one i particle - start directly
23331 .mci3120_outer:
23332 mov eax, [ebp + mci3120_shift] ;# eax = pointer into shift[]
23333 mov ebx, [eax] ;# ebx=shift[n]
23334 add dword ptr [ebp + mci3120_shift], 4 ;# advance pointer one step
23336 lea ebx, [ebx + ebx*2] ;# ebx=3*is
23337 mov [esp + mci3120_is3],ebx ;# store is3
23339 mov eax, [ebp + mci3120_shiftvec] ;# eax = base of shiftvec[]
23341 movq mm5, [eax + ebx*4] ;# move shX/shY to mm5 and shZ to mm6.
23342 movd mm6, [eax + ebx*4 + 8]
23343 movq mm0, mm5
23344 movq mm1, mm5
23345 movq mm2, mm6
23346 punpckldq mm0,mm0 ;# also expand shX,Y,Z in mm0--mm2.
23347 punpckhdq mm1,mm1
23348 punpckldq mm2,mm2
23350 mov ecx, [ebp + mci3120_iinr] ;# ecx = pointer into iinr[]
23351 add dword ptr [ebp + mci3120_iinr], 4 ;# advance pointer
23352 mov ebx, [ecx] ;# ebx=ii
23354 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
23355 mov eax, [ebp + mci3120_pos] ;# eax = base of pos[]
23357 pfadd mm5, [eax + ebx*4] ;# ix = shX + posX (and iy too)
23358 movd mm7, [eax + ebx*4 + 8] ;# cant use direct memory add for 4 bytes (iz)
23359 mov [esp + mci3120_ii3], ebx ;# (use mm7 as temp. storage for iz.)
23360 pfadd mm6, mm7
23361 movq [esp + mci3120_ixO], mm5
23362 movq [esp + mci3120_izO], mm6
23364 movd mm3, [eax + ebx*4 + 12]
23365 movd mm4, [eax + ebx*4 + 16]
23366 movd mm5, [eax + ebx*4 + 20]
23367 punpckldq mm3, [eax + ebx*4 + 24]
23368 punpckldq mm4, [eax + ebx*4 + 28]
23369 punpckldq mm5, [eax + ebx*4 + 32] ;# coords of H1 in low mm3-mm5, H2 in high
23371 pfadd mm0, mm3
23372 pfadd mm1, mm4
23373 pfadd mm2, mm5
23374 movq [esp + mci3120_ixH], mm0
23375 movq [esp + mci3120_iyH], mm1
23376 movq [esp + mci3120_izH], mm2
23378 ;# clear vctot and i forces
23379 pxor mm7,mm7
23380 movq [esp + mci3120_vctot], mm7
23381 movq [esp + mci3120_vnbtot], mm7
23383 mov eax, [ebp + mci3120_jindex]
23384 mov ecx, [eax] ;# jindex[n]
23385 mov edx, [eax + 4] ;# jindex[n+1]
23386 add dword ptr [ebp + mci3120_jindex], 4
23387 sub edx, ecx ;# number of innerloop atoms
23388 mov [esp + mci3120_innerk], edx
23390 mov esi, [ebp + mci3120_pos]
23391 mov eax, [ebp + mci3120_jjnr]
23392 shl ecx, 2
23393 add eax, ecx
23394 mov [esp + mci3120_innerjjnr], eax ;# pointer to jjnr[nj0]
23395 .mci3120_inner_loop:
23396 ;# a single j particle iteration here - compare with the unrolled code for comments.
23397 mov eax, [esp + mci3120_innerjjnr]
23398 mov eax, [eax] ;# eax=jnr offset
23399 add dword ptr [esp + mci3120_innerjjnr], 4 ;# advance pointer
23400 prefetch [ecx + 16] ;# prefetch data - trial and error says 16 is best
23402 mov ecx, [ebp + mci3120_charge]
23403 movd mm7, [ecx + eax*4]
23404 punpckldq mm7,mm7
23405 movq mm6,mm7
23406 pfmul mm6, [esp + mci3120_iqO]
23407 pfmul mm7, [esp + mci3120_iqH] ;# mm6=qqO, mm7=qqH
23408 movd [esp + mci3120_qqO], mm6
23409 movq [esp + mci3120_qqH], mm7
23411 mov ecx, [ebp + mci3120_type]
23412 mov edx, [ecx + eax*4] ;# type [jnr]
23413 mov ecx, [ebp + mci3120_nbfp]
23414 shl edx, 1
23415 add edx, [esp + mci3120_ntia] ;# tja = ntia + 2*type
23416 movd mm5, [ecx + edx*4] ;# mm5 = 1st c6
23417 movq [esp + mci3120_c6], mm5
23418 movd mm5, [ecx + edx*4 + 4] ;# mm5 = 1st c12
23419 movq [esp + mci3120_c12], mm5
23421 lea eax, [eax + eax*2]
23423 movq mm0, [esi + eax*4]
23424 movd mm1, [esi + eax*4 + 8]
23425 ;# copy & expand to mm2-mm4 for the H interactions
23426 movq mm2, mm0
23427 movq mm3, mm0
23428 movq mm4, mm1
23429 punpckldq mm2,mm2
23430 punpckhdq mm3,mm3
23431 punpckldq mm4,mm4
23433 pfsubr mm0, [esp + mci3120_ixO]
23434 pfsubr mm1, [esp + mci3120_izO]
23436 pfmul mm0,mm0
23437 pfmul mm1,mm1
23438 pfacc mm0, mm1
23439 pfadd mm0, mm1 ;# mm0=rsqO
23441 punpckldq mm2, mm2
23442 punpckldq mm3, mm3
23443 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
23444 pfsubr mm2, [esp + mci3120_ixH]
23445 pfsubr mm3, [esp + mci3120_iyH]
23446 pfsubr mm4, [esp + mci3120_izH] ;# mm2-mm4 is dxH-dzH
23448 pfmul mm2,mm2
23449 pfmul mm3,mm3
23450 pfmul mm4,mm4
23452 pfadd mm3,mm2
23453 pfadd mm3,mm4 ;# mm3=rsqH
23454 movq [esp + mci3120_tmprsqH], mm3
23456 pfrsqrt mm1,mm0
23458 movq mm2,mm1
23459 pfmul mm1,mm1
23460 pfrsqit1 mm1,mm0
23461 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
23463 pfmul mm0, mm1 ;# mm0=r
23465 pfmul mm0, [esp + mci3120_tsc]
23466 pf2iw mm4, mm0
23467 movd [esp + mci3120_n1], mm4
23468 pi2fd mm4,mm4
23469 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
23470 movq mm2, mm0
23471 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
23473 ;# coulomb table
23474 mov edx, [ebp + mci3120_VFtab]
23475 mov ecx, [esp + mci3120_n1]
23476 shl ecx, 2
23477 ;# load all values we need
23478 movd mm4, [edx + ecx*4]
23479 movd mm5, [edx + ecx*4 + 4]
23480 movd mm6, [edx + ecx*4 + 8]
23481 movd mm7, [edx + ecx*4 + 12]
23483 pfmul mm6, mm0 ;# mm6 = Geps
23484 pfmul mm7, mm2 ;# mm7 = Heps2
23486 pfadd mm5, mm6
23487 pfadd mm5, mm7 ;# mm5 = Fp
23489 pfmul mm5, mm0 ;# mm5=eps*Fp
23490 pfadd mm5, mm4 ;# mm5= VV
23492 pfmul mm5, [esp + mci3120_qqO] ;# vcoul=qq*VV
23493 ;# update vctot directly
23494 pfadd mm5, [esp + mci3120_vctot]
23495 movq [esp + mci3120_vctot], mm5
23497 ;# nontabulated LJ - mm1 is invsqrt. - keep mm1!
23498 movq mm0, mm1
23499 pfmul mm0, mm0 ;# mm0 is invsq
23500 movq mm2, mm0
23501 pfmul mm2, mm0
23502 pfmul mm2, mm0 ;# mm2 = rinvsix
23503 movq mm4, mm2
23504 pfmul mm4, mm4 ;# mm4=rinvtwelve
23506 pfmul mm4, [esp + mci3120_c12]
23507 pfmul mm2, [esp + mci3120_c6]
23508 pfsub mm4, mm2 ;# mm4=vnb12-vnb6
23510 ;# update vnbtot
23511 pfadd mm4, [esp + mci3120_vnbtot] ;# add the earlier value
23512 movq [esp + mci3120_vnbtot], mm4 ;# store the sum
23514 ;# now do the two hydrogens.
23515 movq mm0, [esp + mci3120_tmprsqH] ;# mm0=rsqH
23517 pfrsqrt mm1, mm0
23518 pswapd mm0,mm0
23519 pfrsqrt mm2, mm0
23520 pswapd mm0,mm0
23521 punpckldq mm1,mm2 ;# seeds are in mm1 now, and rsq in mm0.
23523 movq mm2, mm1
23524 pfmul mm1,mm1
23525 pfrsqit1 mm1,mm0
23526 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
23528 pfmul mm0,mm1 ;# mm0=r
23529 pfmul mm0, [esp + mci3120_tsc]
23530 pf2iw mm4, mm0
23531 movq [esp + mci3120_n1], mm4
23532 pi2fd mm4,mm4
23533 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
23534 movq mm2, mm0
23535 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
23537 ;# coulomb table
23538 mov edx, [ebp + mci3120_VFtab]
23539 mov ecx, [esp + mci3120_n1]
23540 shl ecx, 2
23541 ;# load all values we need
23542 movd mm4, [edx + ecx*4]
23543 movd mm5, [edx + ecx*4 + 4]
23544 movd mm6, [edx + ecx*4 + 8]
23545 movd mm7, [edx + ecx*4 + 12]
23546 mov ecx, [esp + mci3120_n1 + 4]
23547 shl ecx, 2
23548 punpckldq mm4, [edx + ecx*4]
23549 punpckldq mm5, [edx + ecx*4 + 4]
23550 punpckldq mm6, [edx + ecx*4 + 8]
23551 punpckldq mm7, [edx + ecx*4 + 12]
23553 pfmul mm6, mm0 ;# mm6 = Geps
23554 pfmul mm7, mm2 ;# mm7 = Heps2
23556 pfadd mm5, mm6
23557 pfadd mm5, mm7 ;# mm5 = Fp
23560 pfmul mm5, mm0 ;# mm5=eps*Fp
23561 pfadd mm5, mm4 ;# mm5= VV
23563 pfmul mm5, [esp + mci3120_qqH] ;# vcoul=qq*VV
23564 ;# update vctot
23565 pfadd mm5, [esp + mci3120_vctot]
23566 movq [esp + mci3120_vctot], mm5
23568 ;# done - one more?
23569 dec dword ptr [esp + mci3120_innerk]
23570 jz .mci3120_updateouterdata
23571 jmp .mci3120_inner_loop
23572 .mci3120_updateouterdata:
23574 mov edx, [ebp + mci3120_gid] ;# get group index for this i particle
23575 mov edx, [edx]
23576 add dword ptr [ebp + mci3120_gid], 4 ;# advance pointer
23578 movq mm7, [esp + mci3120_vctot]
23579 pfacc mm7,mm7 ;# get and sum the two parts of total potential
23581 mov eax, [ebp + mci3120_Vc]
23582 movd mm6, [eax + edx*4]
23583 pfadd mm6, mm7
23584 movd [eax + edx*4], mm6 ;# increment vc[gid]
23586 movq mm7, [esp + mci3120_vnbtot]
23587 pfacc mm7,mm7 ;# same for Vnb
23589 mov eax, [ebp + mci3120_Vnb]
23590 movd mm6, [eax + edx*4]
23591 pfadd mm6, mm7
23592 movd [eax + edx*4], mm6 ;# increment vnb[gid]
23593 ;# finish if last
23594 dec dword ptr [ebp + mci3120_nri]
23595 jz .mci3120_end
23596 ;# not last, iterate once more!
23597 jmp .mci3120_outer
23598 .mci3120_end:
23599 femms
23600 add esp, 144
23601 pop edi
23602 pop esi
23603 pop edx
23604 pop ecx
23605 pop ebx
23606 pop eax
23607 leave
;# ---------------------------------------------------------------------
;# mcinl3130_3dnow
;#
;# Energy-only water-water inner loop: tabulated Coulomb for all nine
;# atom pairs of every i/j water pair, and plain (non-tabulated)
;# Lennard-Jones for the O-O pair only. No forces are computed.
;#
;# Arguments are read from the caller's stack at the ebp offsets below:
;#   nri        number of i waters (the code assumes at least one)
;#   iinr       int index of the first atom (O) of each i water
;#   jindex     jindex[n]..jindex[n+1] is the jjnr range of i water n
;#   jjnr       int index of the first atom (O) of each j water
;#   shift      per-i index into shiftvec (three floats per entry)
;#   shiftvec   shift vectors added to the i coordinates
;#   gid        per-i index into Vc / Vnb
;#   pos        xyz coordinates, three floats per atom, O,H,H adjacent
;#   charge     only charge[ii] and charge[ii+1] of the FIRST i water
;#              are read; all charge products are derived from them
;#   facel      float factor multiplied into every charge product
;#   Vc, Vnb    float accumulators, incremented at index gid
;#   type, ntype, nbfp
;#              only type[ii] of the first i water is read;
;#              c6 = nbfp[2*type*(ntype+1)], c12 = the following float
;#   tabscale   float; table index is r*tabscale converted by pf2iw
;#   VFtab      Coulomb table, four floats (Y,F,G,H) per point
;#
;# eax,ebx,ecx,edx,esi,edi are saved and restored. The MMX registers
;# are clobbered (femms on entry and exit). The argument slots nri,
;# iinr, jindex, shift and gid are modified in place.
;# Uses the extended 3DNow instructions pf2iw and pswapd.
;# ---------------------------------------------------------------------
.globl mcinl3130_3dnow
.globl _mcinl3130_3dnow
mcinl3130_3dnow:
_mcinl3130_3dnow:
;# stack offsets (from ebp) of the arguments
.equiv mci3130_nri, 8
.equiv mci3130_iinr, 12
.equiv mci3130_jindex, 16
.equiv mci3130_jjnr, 20
.equiv mci3130_shift, 24
.equiv mci3130_shiftvec, 28
.equiv mci3130_gid, 32
.equiv mci3130_pos, 36
.equiv mci3130_charge, 40
.equiv mci3130_facel, 44
.equiv mci3130_Vc, 48
.equiv mci3130_type, 52
.equiv mci3130_ntype, 56
.equiv mci3130_nbfp, 60
.equiv mci3130_Vnb, 64
.equiv mci3130_tabscale, 68
.equiv mci3130_VFtab, 72
;# stack offsets (from esp) for local variables
.equiv mci3130_is3, 0
.equiv mci3130_ii3, 4
.equiv mci3130_ixO, 8
.equiv mci3130_iyO, 12
.equiv mci3130_izO, 16
.equiv mci3130_ixH, 20
.equiv mci3130_iyH, 28
.equiv mci3130_izH, 36
.equiv mci3130_qqOO, 44
.equiv mci3130_qqOH, 52
.equiv mci3130_qqHH, 60
.equiv mci3130_c6, 68
.equiv mci3130_c12, 76
.equiv mci3130_n1, 84
.equiv mci3130_tsc, 92
.equiv mci3130_vctot, 100
.equiv mci3130_vnbtot, 108
.equiv mci3130_innerjjnr, 116
.equiv mci3130_innerk, 120
.equiv mci3130_tmprsqH, 124
	push ebp
	mov ebp,esp
	push eax
	push ebx
	push ecx
	push edx
	push esi
	push edi
	sub esp, 132		;# local stack space
	femms
	;# assume we have at least one i particle - start directly

	;# charge products and LJ parameters are set up once, from the
	;# first i water only, and reused for every i/j pair.
	mov ecx, [ebp + mci3130_iinr]	;# ecx = pointer into iinr[]
	mov ebx, [ecx]			;# ebx=ii

	mov edx, [ebp + mci3130_charge]
	movd mm1, [ebp + mci3130_facel]	;# mm1=facel
	movd mm2, [edx + ebx*4]		;# mm2=charge[ii0] (O)
	movd mm3, [edx + ebx*4 + 4]	;# mm3=charge[ii0+1] (H)
	movq mm4, mm2
	pfmul mm4, mm1			;# mm4=qO*facel
	movq mm6, mm3
	pfmul mm6, mm1			;# mm6=qH*facel
	movq mm5, mm4
	pfmul mm4, mm2			;# mm4=qqOO*facel
	pfmul mm5, mm3			;# mm5=qqOH*facel
	pfmul mm6, mm3			;# mm6=qqHH*facel
	punpckldq mm5,mm5		;# spread to both halves
	punpckldq mm6,mm6		;# spread to both halves
	movq [esp + mci3130_qqOO], mm4	;# upper half stays zero (movd loads)
	movq [esp + mci3130_qqOH], mm5
	movq [esp + mci3130_qqHH], mm6
	mov edx, [ebp + mci3130_type]
	mov ecx, [edx + ebx*4]
	shl ecx, 1
	mov edx, ecx
	imul ecx, [ebp + mci3130_ntype]
	add edx, ecx			;# edx = 2*type*(ntype+1)
	mov eax, [ebp + mci3130_nbfp]
	movd mm0, [eax + edx*4]
	movd mm1, [eax + edx*4 + 4]
	movq [esp + mci3130_c6], mm0	;# c6 in low half, zero in high half
	movq [esp + mci3130_c12], mm1	;# c12 in low half, zero in high half
	movd mm5, [ebp + mci3130_tabscale]
	punpckldq mm5,mm5
	movq [esp + mci3130_tsc], mm5	;# tabscale in both halves
.mci3130_outer:
	mov eax, [ebp + mci3130_shift]	;# eax = pointer into shift[]
	mov ebx, [eax]			;# ebx=shift[n]
	add dword ptr [ebp + mci3130_shift], 4	;# advance pointer one step

	lea ebx, [ebx + ebx*2]		;# ebx=3*is
	mov [esp + mci3130_is3],ebx	;# store is3

	mov eax, [ebp + mci3130_shiftvec]	;# eax = base of shiftvec[]

	movq mm5, [eax + ebx*4]		;# move shX/shY to mm5 and shZ to mm6.
	movd mm6, [eax + ebx*4 + 8]
	movq mm0, mm5
	movq mm1, mm5
	movq mm2, mm6
	punpckldq mm0,mm0		;# also expand shX,Y,Z in mm0--mm2.
	punpckhdq mm1,mm1
	punpckldq mm2,mm2

	mov ecx, [ebp + mci3130_iinr]	;# ecx = pointer into iinr[]
	add dword ptr [ebp + mci3130_iinr], 4	;# advance pointer
	mov ebx, [ecx]			;# ebx=ii

	lea ebx, [ebx + ebx*2]		;# ebx = 3*ii=ii3
	mov eax, [ebp + mci3130_pos]	;# eax = base of pos[]

	pfadd mm5, [eax + ebx*4]	;# ix = shX + posX (and iy too)
	movd mm7, [eax + ebx*4 + 8]	;# cant use direct memory add for 4 bytes (iz)
	mov [esp + mci3130_ii3], ebx	;# (use mm7 as temp. storage for iz.)
	pfadd mm6, mm7
	movq [esp + mci3130_ixO], mm5	;# writes ixO and iyO
	movq [esp + mci3130_izO], mm6	;# izO in low half, zero in high half

	movd mm3, [eax + ebx*4 + 12]
	movd mm4, [eax + ebx*4 + 16]
	movd mm5, [eax + ebx*4 + 20]
	punpckldq mm3, [eax + ebx*4 + 24]
	punpckldq mm4, [eax + ebx*4 + 28]
	punpckldq mm5, [eax + ebx*4 + 32]	;# coords of H1 in low mm3-mm5, H2 in high

	pfadd mm0, mm3
	pfadd mm1, mm4
	pfadd mm2, mm5
	movq [esp + mci3130_ixH], mm0
	movq [esp + mci3130_iyH], mm1
	movq [esp + mci3130_izH], mm2

	;# clear vctot and vnbtot for this i water
	pxor mm7,mm7
	movq [esp + mci3130_vctot], mm7
	movq [esp + mci3130_vnbtot], mm7

	mov eax, [ebp + mci3130_jindex]
	mov ecx, [eax]			;# jindex[n]
	mov edx, [eax + 4]		;# jindex[n+1]
	add dword ptr [ebp + mci3130_jindex], 4
	sub edx, ecx			;# number of innerloop atoms
	mov [esp + mci3130_innerk], edx	;# number of innerloop atoms

	mov esi, [ebp + mci3130_pos]
	mov eax, [ebp + mci3130_jjnr]
	shl ecx, 2
	add eax, ecx
	mov [esp + mci3130_innerjjnr], eax	;# pointer to jjnr[nj0]
.mci3130_inner_loop:
	;# one j water per iteration; eax = 3*jnr and esi = pos[] stay
	;# live for the whole iteration.
	mov eax, [esp + mci3130_innerjjnr]
	mov eax, [eax]			;# eax=jnr offset
	add dword ptr [esp + mci3130_innerjjnr], 4	;# advance pointer

	lea eax, [eax + eax*2]

	;# ---- interactions with j O ----
	movq mm0, [esi + eax*4]
	movd mm1, [esi + eax*4 + 8]
	;# copy & expand to mm2-mm4 for the H interactions
	movq mm2, mm0
	movq mm3, mm0
	movq mm4, mm1
	punpckldq mm2,mm2
	punpckhdq mm3,mm3
	punpckldq mm4,mm4

	pfsubr mm0, [esp + mci3130_ixO]	;# pfsubr: dest = mem - dest
	pfsubr mm1, [esp + mci3130_izO]

	pfmul mm0,mm0
	pfmul mm1,mm1
	pfacc mm0, mm0
	pfadd mm0, mm1			;# low mm0 = rsq(i O - j O)

	punpckldq mm2, mm2		;# (halves are already equal here)
	punpckldq mm3, mm3
	punpckldq mm4, mm4		;# mm2-mm4 is jx-jz
	pfsubr mm2, [esp + mci3130_ixH]
	pfsubr mm3, [esp + mci3130_iyH]
	pfsubr mm4, [esp + mci3130_izH]	;# mm2-mm4 is dxH-dzH

	pfmul mm2,mm2
	pfmul mm3,mm3
	pfmul mm4,mm4

	pfadd mm3,mm2
	pfadd mm3,mm4			;# mm3=rsqH (H1 low, H2 high)
	movq [esp + mci3130_tmprsqH], mm3

	pfrsqrt mm1,mm0			;# seed, then one N-R step

	movq mm2,mm1
	pfmul mm1,mm1
	pfrsqit1 mm1,mm0
	pfrcpit2 mm1,mm2		;# mm1=invsqrt
	pfmul mm0, mm1			;# mm0=r (rsq*invsqrt)

	pfmul mm0, [esp + mci3130_tsc]
	pf2iw mm4, mm0
	movd [esp + mci3130_n1], mm4
	pi2fd mm4,mm4
	pfsub mm0, mm4			;# now mm0 is eps and mm4 n0
	movq mm2, mm0
	pfmul mm2, mm2			;# mm0 is eps, mm2 eps2

	;# coulomb table
	mov edx, [ebp + mci3130_VFtab]
	mov ecx, [esp + mci3130_n1]
	shl ecx, 2			;# four floats per table point

	;# load all values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm0			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp

	pfmul mm5, mm0			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	pfmul mm5, [esp + mci3130_qqOO]	;# vcoul=qq*VV

	;# update vctot directly
	pfadd mm5, [esp + mci3130_vctot]
	movq [esp + mci3130_vctot], mm5

	;# nontabulated LJ for the O-O pair, from mm1=invsqrt
	movq mm5, mm1
	pfmul mm5,mm5			;# mm5=invsq
	movq mm4, mm5
	pfmul mm4,mm5
	pfmul mm4,mm5
	movq mm5, mm4
	pfmul mm5,mm5			;# mm4=rinvsix, mm5=rinvtwelve

	pfmul mm4, [esp + mci3130_c6]
	pfmul mm5, [esp + mci3130_c12]
	movq mm6,mm5
	pfsub mm6,mm4			;# mm6=vnb12-vnb6

	;# update vnbtot
	pfadd mm6, [esp + mci3130_vnbtot]	;# add the earlier value
	movq [esp + mci3130_vnbtot], mm6	;# store the sum

	;# time for hydrogens! (i H1/H2 with j O, both halves at once)

	movq mm0, [esp + mci3130_tmprsqH]

	pfrsqrt mm1, mm0
	pswapd mm0,mm0
	pfrsqrt mm2, mm0
	pswapd mm0,mm0
	punpckldq mm1,mm2		;# seeds are in mm1 now, and rsq in mm0.

	movq mm2, mm1
	pfmul mm1,mm1
	pfrsqit1 mm1,mm0
	pfrcpit2 mm1,mm2		;# mm1=invsqrt

	pfmul mm0,mm1			;# mm0=r
	pfmul mm0, [esp + mci3130_tsc]
	pf2iw mm4, mm0
	movq [esp + mci3130_n1], mm4	;# two table indices: n1 and n1+4
	pi2fd mm4,mm4
	pfsub mm0, mm4			;# now mm0 is eps and mm4 n0
	movq mm2, mm0
	pfmul mm2, mm2			;# mm0 is eps, mm2 eps2

	;# coulomb table
	mov edx, [ebp + mci3130_VFtab]
	mov ecx, [esp + mci3130_n1]
	shl ecx, 2
	;# load all values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]
	mov ecx, [esp + mci3130_n1 + 4]
	shl ecx, 2
	punpckldq mm4, [edx + ecx*4]
	punpckldq mm5, [edx + ecx*4 + 4]
	punpckldq mm6, [edx + ecx*4 + 8]
	punpckldq mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm0			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp

	pfmul mm5, mm0			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	pfmul mm5, [esp + mci3130_qqOH]	;# vcoul=qq*VV
	;# update vctot
	pfadd mm5, [esp + mci3130_vctot]
	movq [esp + mci3130_vctot], mm5

	;# ---- interactions with j H1 ----

	movq mm0, [esi + eax*4 + 12]
	movd mm1, [esi + eax*4 + 20]
	;# copy & expand to mm2-mm4 for the H interactions
	movq mm2, mm0
	movq mm3, mm0
	movq mm4, mm1
	punpckldq mm2,mm2
	punpckhdq mm3,mm3
	punpckldq mm4,mm4

	pfsubr mm0, [esp + mci3130_ixO]
	pfsubr mm1, [esp + mci3130_izO]

	pfmul mm0,mm0
	pfmul mm1,mm1
	pfacc mm0, mm1
	pfadd mm0, mm1			;# low mm0 = rsq(i O - j H1)

	punpckldq mm2, mm2
	punpckldq mm3, mm3
	punpckldq mm4, mm4		;# mm2-mm4 is jx-jz
	pfsubr mm2, [esp + mci3130_ixH]
	pfsubr mm3, [esp + mci3130_iyH]
	pfsubr mm4, [esp + mci3130_izH]	;# mm2-mm4 is dxH-dzH

	pfmul mm2,mm2
	pfmul mm3,mm3
	pfmul mm4,mm4

	pfadd mm3,mm2
	pfadd mm3,mm4			;# mm3=rsqH
	movq [esp + mci3130_tmprsqH], mm3

	pfrsqrt mm1,mm0

	movq mm2,mm1
	pfmul mm1,mm1
	pfrsqit1 mm1,mm0
	pfrcpit2 mm1,mm2		;# mm1=invsqrt
	pfmul mm0, mm1			;# mm0=r

	pfmul mm0, [esp + mci3130_tsc]
	pf2iw mm4, mm0
	movd [esp + mci3130_n1], mm4
	pi2fd mm4,mm4
	pfsub mm0, mm4			;# now mm0 is eps and mm4 n0
	movq mm2, mm0
	pfmul mm2, mm2			;# mm0 is eps, mm2 eps2

	;# coulomb table
	mov edx, [ebp + mci3130_VFtab]
	mov ecx, [esp + mci3130_n1]
	shl ecx, 2

	;# load all values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm0			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp

	pfmul mm5, mm0			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	pfmul mm5, [esp + mci3130_qqOH]	;# vcoul=qq*VV

	;# update vctot directly
	pfadd mm5, [esp + mci3130_vctot]
	movq [esp + mci3130_vctot], mm5

	;# i H1/H2 with j H1
	movq mm0, [esp + mci3130_tmprsqH]

	pfrsqrt mm1, mm0
	pswapd mm0,mm0
	pfrsqrt mm2, mm0
	pswapd mm0,mm0
	punpckldq mm1,mm2		;# seeds are in mm1 now, and rsq in mm0.

	movq mm2, mm1
	pfmul mm1,mm1
	pfrsqit1 mm1,mm0
	pfrcpit2 mm1,mm2		;# mm1=invsqrt

	pfmul mm0,mm1			;# mm0=r
	pfmul mm0, [esp + mci3130_tsc]
	pf2iw mm4, mm0
	movq [esp + mci3130_n1], mm4
	pi2fd mm4,mm4
	pfsub mm0, mm4			;# now mm0 is eps and mm4 n0
	movq mm2, mm0
	pfmul mm2, mm2			;# mm0 is eps, mm2 eps2

	;# coulomb table
	mov edx, [ebp + mci3130_VFtab]
	mov ecx, [esp + mci3130_n1]
	shl ecx, 2
	;# load all values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]
	mov ecx, [esp + mci3130_n1 + 4]
	shl ecx, 2
	punpckldq mm4, [edx + ecx*4]
	punpckldq mm5, [edx + ecx*4 + 4]
	punpckldq mm6, [edx + ecx*4 + 8]
	punpckldq mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm0			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp

	pfmul mm5, mm0			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	pfmul mm5, [esp + mci3130_qqHH]	;# vcoul=qq*VV
	;# update vctot
	pfadd mm5, [esp + mci3130_vctot]
	movq [esp + mci3130_vctot], mm5

	;# ---- interactions with j H2 ----
	movq mm0, [esi + eax*4 + 24]
	movd mm1, [esi + eax*4 + 32]
	;# copy & expand to mm2-mm4 for the H interactions
	movq mm2, mm0
	movq mm3, mm0
	movq mm4, mm1
	punpckldq mm2,mm2
	punpckhdq mm3,mm3
	punpckldq mm4,mm4

	pfsubr mm0, [esp + mci3130_ixO]
	pfsubr mm1, [esp + mci3130_izO]

	pfmul mm0,mm0
	pfmul mm1,mm1
	pfacc mm0, mm1
	pfadd mm0, mm1			;# low mm0 = rsq(i O - j H2)

	punpckldq mm2, mm2
	punpckldq mm3, mm3
	punpckldq mm4, mm4		;# mm2-mm4 is jx-jz
	pfsubr mm2, [esp + mci3130_ixH]
	pfsubr mm3, [esp + mci3130_iyH]
	pfsubr mm4, [esp + mci3130_izH]	;# mm2-mm4 is dxH-dzH

	pfmul mm2,mm2
	pfmul mm3,mm3
	pfmul mm4,mm4

	pfadd mm3,mm2
	pfadd mm3,mm4			;# mm3=rsqH
	movq [esp + mci3130_tmprsqH], mm3

	pfrsqrt mm1,mm0

	movq mm2,mm1
	pfmul mm1,mm1
	pfrsqit1 mm1,mm0
	pfrcpit2 mm1,mm2		;# mm1=invsqrt
	pfmul mm0, mm1			;# mm0=r

	pfmul mm0, [esp + mci3130_tsc]
	pf2iw mm4, mm0
	movd [esp + mci3130_n1], mm4
	pi2fd mm4,mm4
	pfsub mm0, mm4			;# now mm0 is eps and mm4 n0
	movq mm2, mm0
	pfmul mm2, mm2			;# mm0 is eps, mm2 eps2

	;# coulomb table
	mov edx, [ebp + mci3130_VFtab]
	mov ecx, [esp + mci3130_n1]
	shl ecx, 2

	;# load all values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm0			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp

	pfmul mm5, mm0			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	pfmul mm5, [esp + mci3130_qqOH]	;# vcoul=qq*VV

	;# update vctot directly
	pfadd mm5, [esp + mci3130_vctot]
	movq [esp + mci3130_vctot], mm5

	;# i H1/H2 with j H2
	movq mm0, [esp + mci3130_tmprsqH]

	pfrsqrt mm1, mm0
	pswapd mm0,mm0
	pfrsqrt mm2, mm0
	pswapd mm0,mm0
	punpckldq mm1,mm2		;# seeds are in mm1 now, and rsq in mm0.

	movq mm2, mm1
	pfmul mm1,mm1
	pfrsqit1 mm1,mm0
	pfrcpit2 mm1,mm2		;# mm1=invsqrt

	pfmul mm0,mm1			;# mm0=r
	pfmul mm0, [esp + mci3130_tsc]
	pf2iw mm4, mm0
	movq [esp + mci3130_n1], mm4
	pi2fd mm4,mm4
	pfsub mm0, mm4			;# now mm0 is eps and mm4 n0
	movq mm2, mm0
	pfmul mm2, mm2			;# mm0 is eps, mm2 eps2

	;# coulomb table
	mov edx, [ebp + mci3130_VFtab]
	mov ecx, [esp + mci3130_n1]
	shl ecx, 2
	;# load all values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]
	mov ecx, [esp + mci3130_n1 + 4]
	shl ecx, 2
	punpckldq mm4, [edx + ecx*4]
	punpckldq mm5, [edx + ecx*4 + 4]
	punpckldq mm6, [edx + ecx*4 + 8]
	punpckldq mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm0			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp

	pfmul mm5, mm0			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	pfmul mm5, [esp + mci3130_qqHH]	;# vcoul=qq*VV
	;# update vctot
	pfadd mm5, [esp + mci3130_vctot]
	movq [esp + mci3130_vctot], mm5

	;# done - one more?
	dec dword ptr [esp + mci3130_innerk]
	jz .mci3130_updateouterdata
	jmp .mci3130_inner_loop
.mci3130_updateouterdata:
	mov edx, [ebp + mci3130_gid]	;# get group index for this i particle
	mov edx, [edx]
	add dword ptr [ebp + mci3130_gid], 4	;# advance pointer

	movq mm7, [esp + mci3130_vctot]
	pfacc mm7,mm7			;# get and sum the two parts of total potential

	mov eax, [ebp + mci3130_Vc]
	movd mm6, [eax + edx*4]
	pfadd mm6, mm7
	movd [eax + edx*4], mm6		;# increment vc[gid]

	movq mm7, [esp + mci3130_vnbtot]
	pfacc mm7,mm7			;# same for Vnb

	mov eax, [ebp + mci3130_Vnb]
	movd mm6, [eax + edx*4]
	pfadd mm6, mm7
	movd [eax + edx*4], mm6		;# increment vnb[gid]
	;# finish if last
	dec dword ptr [ebp + mci3130_nri]
	jz .mci3130_end
	;# not last, iterate once more!
	jmp .mci3130_outer
.mci3130_end:
	femms
	add esp, 132
	pop edi
	pop esi
	pop edx
	pop ecx
	pop ebx
	pop eax
	leave
	ret				;# return to caller; without this the code runs on into the next routine
;# ---------------------------------------------------------------------
;# mcinl3300_3dnow
;#
;# Energy-only inner loop over single atoms with tabulated Coulomb,
;# dispersion and repulsion. No forces are computed. The j loop is
;# unrolled by two, with a one-atom remainder iteration.
;#
;# Arguments are read from the caller's stack at the ebp offsets below:
;#   nri        number of i atoms (the code assumes at least one)
;#   iinr       int index of each i atom
;#   jindex     jindex[n]..jindex[n+1] is the jjnr range of i atom n
;#   jjnr       int indices of the j atoms
;#   shift      per-i index into shiftvec (three floats per entry)
;#   shiftvec   shift vectors added to the i coordinates
;#   gid        per-i index into Vc / Vnb
;#   pos        xyz coordinates, three floats per atom
;#   charge     per-atom charges
;#   facel      float factor multiplied into every charge product
;#   Vc, Vnb    float accumulators, incremented at index gid
;#   type, ntype, nbfp
;#              c6 = nbfp[2*(ntype*type[i] + type[j])], c12 = next float
;#   tabscale   float; table index is r*tabscale converted by pf2iw
;#   VFtab      table with twelve floats per point: Y,F,G,H for
;#              Coulomb, then for dispersion, then for repulsion
;#
;# eax,ebx,ecx,edx,esi,edi are saved and restored. The MMX registers
;# are clobbered (femms on entry and exit). The argument slots nri,
;# iinr, jindex, shift and gid are modified in place.
;# Uses the extended 3DNow instruction pf2iw.
;# ---------------------------------------------------------------------
.globl mcinl3300_3dnow
.globl _mcinl3300_3dnow
mcinl3300_3dnow:
_mcinl3300_3dnow:
;# stack offsets (from ebp) of the arguments
.equiv mci3300_nri, 8
.equiv mci3300_iinr, 12
.equiv mci3300_jindex, 16
.equiv mci3300_jjnr, 20
.equiv mci3300_shift, 24
.equiv mci3300_shiftvec, 28
.equiv mci3300_gid, 32
.equiv mci3300_pos, 36
.equiv mci3300_charge, 40
.equiv mci3300_facel, 44
.equiv mci3300_Vc, 48
.equiv mci3300_type, 52
.equiv mci3300_ntype, 56
.equiv mci3300_nbfp, 60
.equiv mci3300_Vnb, 64
.equiv mci3300_tabscale, 68
.equiv mci3300_VFtab, 72
;# stack offsets (from esp) for local variables
.equiv mci3300_is3, 0
.equiv mci3300_ii3, 4
.equiv mci3300_ix, 8
.equiv mci3300_iy, 12
.equiv mci3300_iz, 16
.equiv mci3300_iq, 20
.equiv mci3300_vctot, 28
.equiv mci3300_vnbtot, 36
.equiv mci3300_c6, 44
.equiv mci3300_c12, 52
.equiv mci3300_n1, 60
.equiv mci3300_tsc, 68
.equiv mci3300_ntia, 76
.equiv mci3300_innerjjnr, 80
.equiv mci3300_innerk, 84
	push ebp
	mov ebp,esp
	push eax
	push ebx
	push ecx
	push edx
	push esi
	push edi
	sub esp, 88		;# local stack space
	femms
	;# move data to local stack
	movd mm3, [ebp + mci3300_tabscale]
	punpckldq mm3,mm3
	movq [esp + mci3300_tsc], mm3	;# tabscale in both halves
	;# assume we have at least one i particle - start directly
.mci3300_outer:
	mov eax, [ebp + mci3300_shift]	;# eax = pointer into shift[]
	mov ebx, [eax]			;# ebx=shift[n]
	add dword ptr [ebp + mci3300_shift], 4	;# advance pointer one step

	lea ebx, [ebx + ebx*2]		;# ebx=3*is
	mov [esp + mci3300_is3],ebx	;# store is3

	mov eax, [ebp + mci3300_shiftvec]	;# eax = base of shiftvec[]

	movq mm0, [eax + ebx*4]		;# move shX/shY to mm0 and shZ to mm1
	movd mm1, [eax + ebx*4 + 8]

	mov ecx, [ebp + mci3300_iinr]	;# ecx = pointer into iinr[]
	add dword ptr [ebp + mci3300_iinr], 4	;# advance pointer
	mov ebx, [ecx]			;# ebx=ii

	mov edx, [ebp + mci3300_charge]
	movd mm2, [edx + ebx*4]		;# mm2=charge[ii]
	pfmul mm2, [ebp + mci3300_facel]	;# only the low product is used
	punpckldq mm2,mm2		;# spread to both halves
	movq [esp + mci3300_iq], mm2	;# iq =facel*charge[ii]

	mov edx, [ebp + mci3300_type]
	mov edx, [edx + ebx*4]
	imul edx, [ebp + mci3300_ntype]
	shl edx, 1
	mov [esp + mci3300_ntia], edx	;# ntia = 2*ntype*type[ii]

	lea ebx, [ebx + ebx*2]		;# ebx = 3*ii=ii3
	mov eax, [ebp + mci3300_pos]	;# eax = base of pos[]

	pfadd mm0, [eax + ebx*4]	;# ix = shX + posX (and iy too)
	movd mm3, [eax + ebx*4 + 8]	;# cant use direct memory add for 4 bytes (iz)
	mov [esp + mci3300_ii3], ebx
	pfadd mm1, mm3
	movq [esp + mci3300_ix], mm0	;# writes ix and iy
	movd [esp + mci3300_iz], mm1

	;# clear the potential accumulators for this i atom
	pxor mm7,mm7
	movq [esp + mci3300_vctot], mm7
	movq [esp + mci3300_vnbtot], mm7

	mov eax, [ebp + mci3300_jindex]
	mov ecx, [eax]			;# jindex[n]
	mov edx, [eax + 4]		;# jindex[n+1]
	add dword ptr [ebp + mci3300_jindex], 4
	sub edx, ecx			;# number of innerloop atoms

	mov esi, [ebp + mci3300_pos]
	mov eax, [ebp + mci3300_jjnr]
	shl ecx, 2
	add eax, ecx
	mov [esp + mci3300_innerjjnr], eax	;# pointer to jjnr[nj0]
	sub edx, 2
	mov [esp + mci3300_innerk], edx	;# innerloop atoms left minus 2 (mov keeps flags)
	jge .mci3300_unroll_loop	;# at least two j atoms left
	jmp .mci3300_finish_inner
.mci3300_unroll_loop:
	;# paired innerloop starts here
	mov ecx, [esp + mci3300_innerjjnr]	;# pointer to jjnr[k]
	mov eax, [ecx]
	mov ebx, [ecx + 4]		;# eax/ebx=jnr
	add dword ptr [esp + mci3300_innerjjnr], 8	;# advance pointer (unrolled 2)
	prefetch [ecx + 16]		;# prefetch data - trial and error says 16 is best

	mov ecx, [ebp + mci3300_charge]	;# base of charge[]
	movq mm5, [esp + mci3300_iq]
	movd mm3, [ecx + eax*4]		;# charge[jnr1]
	punpckldq mm3, [ecx + ebx*4]	;# move charge 2 to high part of mm3
	pfmul mm3,mm5			;# mm3 now has qq for both particles

	mov ecx, [ebp + mci3300_type]
	mov edx, [ecx + eax*4]		;# type [jnr1]
	mov ecx, [ecx + ebx*4]		;# type [jnr2]

	mov esi, [ebp + mci3300_nbfp]	;# base of nbfp
	shl edx, 1
	shl ecx, 1
	add edx, [esp + mci3300_ntia]	;# tja = ntia + 2*type
	add ecx, [esp + mci3300_ntia]

	movq mm5, [esi + edx*4]		;# mm5 = 1st c6 / c12
	movq mm7, [esi + ecx*4]		;# mm7 = 2nd c6 / c12
	movq mm6,mm5
	punpckldq mm5,mm7		;# mm5 = 1st c6 / 2nd c6
	punpckhdq mm6,mm7		;# mm6 = 1st c12 / 2nd c12
	movq [esp + mci3300_c6], mm5
	movq [esp + mci3300_c12], mm6

	lea eax, [eax + eax*2]		;# replace jnr with j3
	lea ebx, [ebx + ebx*2]

	mov esi, [ebp + mci3300_pos]

	movq mm0, [esp + mci3300_ix]
	movd mm1, [esp + mci3300_iz]
	movq mm4, [esi + eax*4]		;# fetch first j coordinates
	movd mm5, [esi + eax*4 + 8]
	pfsubr mm4,mm0			;# dr = ir - jr
	pfsubr mm5,mm1
	pfmul mm4,mm4			;# square dx,dy,dz
	pfmul mm5,mm5
	pfacc mm4, mm5			;# accumulate to get dx*dx+ dy*dy+ dz*dz
	pfacc mm4, mm5			;# first rsq in lower mm4

	movq mm6, [esi + ebx*4]		;# fetch second j coordinates
	movd mm7, [esi + ebx*4 + 8]

	pfsubr mm6,mm0			;# dr = ir - jr
	pfsubr mm7,mm1
	pfmul mm6,mm6			;# square dx,dy,dz
	pfmul mm7,mm7
	pfacc mm6, mm7			;# accumulate to get dx*dx+ dy*dy+ dz*dz
	pfacc mm6, mm7			;# second rsq in lower mm6

	pfrsqrt mm0, mm4		;# lookup inverse square root seed
	pfrsqrt mm1, mm6

	punpckldq mm0,mm1
	punpckldq mm4,mm6		;# now 4 has rsq and 0 the seed for both pairs.
	movq mm2,mm0			;# amd 3dnow N-R iteration to get full precision.
	pfmul mm0,mm0
	pfrsqit1 mm0,mm4
	pfrcpit2 mm0,mm2
	pfmul mm4, mm0
	movq mm1, mm4
	;# mm0 is invsqrt, and mm1 r.
	;# tabulated potentials only - no scalar force in this loop
	pfmul mm1, [esp + mci3300_tsc]	;# mm1=rt
	pf2iw mm4,mm1
	movq [esp + mci3300_n1], mm4	;# two table indices: n1 and n1+4
	pi2fd mm4,mm4
	pfsub mm1, mm4			;# now mm1 is eps and mm4 is n0

	movq mm2,mm1
	pfmul mm2,mm2			;# mm1 is eps, mm2 is eps2

	;# coulomb table (edx = VFtab stays live for all three tables)
	mov edx, [ebp + mci3300_VFtab]
	mov ecx, [esp + mci3300_n1]
	lea ecx, [ecx + ecx*2]
	shl ecx, 2			;# twelve floats per table point
	;# load all the table values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]
	mov ecx, [esp + mci3300_n1 + 4]
	lea ecx, [ecx + ecx*2]
	shl ecx, 2
	punpckldq mm4, [edx + ecx*4]
	punpckldq mm5, [edx + ecx*4 + 4]
	punpckldq mm6, [edx + ecx*4 + 8]
	punpckldq mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm1			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp

	pfmul mm5, mm1			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	pfmul mm5, mm3			;# vcoul=qq*VV

	;# at this point mm5 contains vcoul
	;# increment vcoul - then we can get rid of mm5
	;# update vctot
	pfadd mm5, [esp + mci3300_vctot]	;# add the earlier value
	movq [esp + mci3300_vctot], mm5	;# store the sum

	;# dispersion table
	mov ecx, [esp + mci3300_n1]
	lea ecx, [ecx + ecx*2]
	shl ecx, 2
	;# load all the table values we need
	movd mm4, [edx + ecx*4 + 16]
	movd mm5, [edx + ecx*4 + 20]
	movd mm6, [edx + ecx*4 + 24]
	movd mm7, [edx + ecx*4 + 28]
	mov ecx, [esp + mci3300_n1 + 4]
	lea ecx, [ecx + ecx*2]
	shl ecx, 2
	punpckldq mm4, [edx + ecx*4 + 16]
	punpckldq mm5, [edx + ecx*4 + 20]
	punpckldq mm6, [edx + ecx*4 + 24]
	punpckldq mm7, [edx + ecx*4 + 28]
	pfmul mm6, mm1			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp
	pfmul mm5, mm1			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	movq mm4, [esp + mci3300_c6]
	pfmul mm5, mm4			;# vnb6
	;# update vnbtot to release mm5!
	pfadd mm5, [esp + mci3300_vnbtot]	;# add the earlier value
	movq [esp + mci3300_vnbtot], mm5	;# store the sum

	;# repulsion table
	mov ecx, [esp + mci3300_n1]
	lea ecx, [ecx + ecx*2]
	shl ecx, 2
	;# load all the table values we need
	movd mm4, [edx + ecx*4 + 32]
	movd mm5, [edx + ecx*4 + 36]
	movd mm6, [edx + ecx*4 + 40]
	movd mm7, [edx + ecx*4 + 44]
	mov ecx, [esp + mci3300_n1 + 4]
	lea ecx, [ecx + ecx*2]
	shl ecx, 2
	punpckldq mm4, [edx + ecx*4 + 32]
	punpckldq mm5, [edx + ecx*4 + 36]
	punpckldq mm6, [edx + ecx*4 + 40]
	punpckldq mm7, [edx + ecx*4 + 44]

	pfmul mm6, mm1			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp
	pfmul mm5, mm1			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	movq mm6, [esp + mci3300_c12]
	pfmul mm5, mm6			;# vnb12
	;# update vnbtot
	pfadd mm5, [esp + mci3300_vnbtot]	;# add the earlier value
	movq [esp + mci3300_vnbtot], mm5	;# store the sum

	;# should we do one more iteration?
	sub dword ptr [esp + mci3300_innerk], 2
	jl .mci3300_finish_inner
	jmp .mci3300_unroll_loop
.mci3300_finish_inner:
	;# innerk is -2 (no atom left) or -1 (one left); bit 0 tells which
	and dword ptr [esp + mci3300_innerk], 1
	jnz .mci3300_single_inner
	jmp .mci3300_updateouterdata
.mci3300_single_inner:
	;# a single j particle iteration here - compare with the unrolled code for comments.
	mov eax, [esp + mci3300_innerjjnr]
	mov eax, [eax]			;# eax=jnr offset

	mov ecx, [ebp + mci3300_charge]
	movd mm5, [esp + mci3300_iq]
	movd mm3, [ecx + eax*4]
	pfmul mm3, mm5			;# mm3=qq

	mov esi, [ebp + mci3300_nbfp]
	mov ecx, [ebp + mci3300_type]
	mov edx, [ecx + eax*4]		;# type [jnr1]
	shl edx, 1
	add edx, [esp + mci3300_ntia]	;# tja = ntia + 2*type
	movd mm5, [esi + edx*4]		;# mm5 = 1st c6
	movq [esp + mci3300_c6], mm5
	movd mm5, [esi + edx*4 + 4]	;# mm5 = 1st c12
	movq [esp + mci3300_c12], mm5

	mov esi, [ebp + mci3300_pos]
	lea eax, [eax + eax*2]

	movq mm0, [esp + mci3300_ix]
	movd mm1, [esp + mci3300_iz]
	movq mm4, [esi + eax*4]
	movd mm5, [esi + eax*4 + 8]
	pfsubr mm4, mm0
	pfsubr mm5, mm1
	pfmul mm4,mm4
	pfmul mm5,mm5
	pfacc mm4, mm5
	pfacc mm4, mm5			;# low mm4=rsq

	pfrsqrt mm0,mm4
	movq mm2,mm0
	pfmul mm0,mm0
	pfrsqit1 mm0,mm4
	pfrcpit2 mm0,mm2		;# mm0=invsqrt
	pfmul mm4, mm0
	movq mm1, mm4
	;# mm0 is invsqrt, and mm1 r.

	;# calculate the tabulated potentials
	pfmul mm1, [esp + mci3300_tsc]	;# mm1=rt
	pf2iw mm4,mm1
	movd [esp + mci3300_n1], mm4
	pi2fd mm4,mm4
	pfsub mm1, mm4			;# now mm1 is eps and mm4 is n0

	movq mm2,mm1
	pfmul mm2,mm2			;# mm1 is eps, mm2 is eps2

	;# coulomb table (edx, ecx stay live for the next two tables)
	mov edx, [ebp + mci3300_VFtab]
	mov ecx, [esp + mci3300_n1]
	lea ecx, [ecx + ecx*2]
	shl ecx, 2
	;# load all the table values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm1			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp

	pfmul mm5, mm1			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	pfmul mm5, mm3			;# vcoul=qq*VV

	;# at this point mm5 contains vcoul
	;# increment vcoul - then we can get rid of mm5
	;# update vctot
	pfadd mm5, [esp + mci3300_vctot]	;# add the earlier value
	movq [esp + mci3300_vctot], mm5	;# store the sum

	;# dispersion table
	;# load all the table values we need
	movd mm4, [edx + ecx*4 + 16]
	movd mm5, [edx + ecx*4 + 20]
	movd mm6, [edx + ecx*4 + 24]
	movd mm7, [edx + ecx*4 + 28]
	pfmul mm6, mm1			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp
	pfmul mm5, mm1			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	movq mm4, [esp + mci3300_c6]
	pfmul mm5, mm4			;# vnb6

	;# update vnbtot to release mm5!
	pfadd mm5, [esp + mci3300_vnbtot]	;# add the earlier value
	movq [esp + mci3300_vnbtot], mm5	;# store the sum

	;# repulsion table
	;# load all the table values we need
	movd mm4, [edx + ecx*4 + 32]
	movd mm5, [edx + ecx*4 + 36]
	movd mm6, [edx + ecx*4 + 40]
	movd mm7, [edx + ecx*4 + 44]

	pfmul mm6, mm1			;# mm6 = Geps
	pfmul mm7, mm2			;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7			;# mm5 = Fp
	pfmul mm5, mm1			;# mm5=eps*Fp
	pfadd mm5, mm4			;# mm5= VV

	movq mm6, [esp + mci3300_c12]
	pfmul mm5, mm6			;# vnb12
	;# update vnbtot
	pfadd mm5, [esp + mci3300_vnbtot]	;# add the earlier value
	movq [esp + mci3300_vnbtot], mm5	;# store the sum

.mci3300_updateouterdata:
	mov edx, [ebp + mci3300_gid]	;# get group index for this i particle
	mov edx, [edx]
	add dword ptr [ebp + mci3300_gid], 4	;# advance pointer

	movq mm7, [esp + mci3300_vctot]
	pfacc mm7,mm7			;# get and sum the two parts of total potential

	mov eax, [ebp + mci3300_Vc]
	movd mm6, [eax + edx*4]
	pfadd mm6, mm7
	movd [eax + edx*4], mm6		;# increment vc[gid]

	movq mm7, [esp + mci3300_vnbtot]
	pfacc mm7,mm7			;# get and sum the two parts of total potential

	mov eax, [ebp + mci3300_Vnb]
	movd mm6, [eax + edx*4]
	pfadd mm6, mm7
	movd [eax + edx*4], mm6		;# increment vnb[gid]

	;# finish if last
	mov ecx, [ebp + mci3300_nri]
	dec ecx
	jecxz .mci3300_end
	;# not last, iterate once more!
	mov [ebp + mci3300_nri], ecx
	jmp .mci3300_outer
.mci3300_end:
	femms
	add esp, 88
	pop edi
	pop esi
	pop edx
	pop ecx
	pop ebx
	pop eax
	leave
	ret				;# return to caller; without this the code runs on into the next routine
;# ---------------------------------------------------------------------------
;# mcinl3310_3dnow / _mcinl3310_3dnow
;#
;# Energy-only inner loop with tabulated Coulomb and tabulated dispersion /
;# repulsion, handling several i atoms per neighbour-list entry. Only the
;# potentials Vc[gid] and Vnb[gid] are updated; no forces are computed or
;# stored by this routine.
;#
;# Calling convention: 32-bit, every argument is read from the stack through
;# ebp at the offsets listed below. eax, ebx, ecx, edx, esi, edi and ebp are
;# saved on entry and restored on exit. The MMX/3DNow registers are clobbered
;# and femms is executed both on entry and before returning.
;#
;# nsatoms supplies three ints (a0, a1, a2) per outer iteration:
;#   nsvdwc = a1       i atoms treated with Coulomb + VdW
;#   nscoul = a2 - a1  i atoms treated with Coulomb only
;#   nsvdw  = a0 - a2  i atoms treated with VdW only
;# The i atoms are consecutive, starting at the index read from iinr[].
;#
;# VFtab is indexed with a stride of 12 floats (48 bytes) per table point:
;#   byte offset  0..12 : Coulomb    Y, F, G, H
;#   byte offset 16..28 : dispersion Y, F, G, H
;#   byte offset 32..44 : repulsion  Y, F, G, H
;# and each potential is evaluated as VV = Y + eps*(F + G*eps + H*eps^2).
;# The table index comes from pf2iw, i.e. it is a signed 16-bit value.
;#
;# The outer loop is entered unconditionally, so the caller must pass nri >= 1.
;# ---------------------------------------------------------------------------
.globl mcinl3310_3dnow
.globl _mcinl3310_3dnow
mcinl3310_3dnow:
_mcinl3310_3dnow:
;# stack offsets (from ebp) for the arguments
.equiv		mci3310_nri,		8
.equiv		mci3310_iinr,		12
.equiv		mci3310_jindex,		16
.equiv		mci3310_jjnr,		20
.equiv		mci3310_shift,		24
.equiv		mci3310_shiftvec,	28
.equiv		mci3310_gid,		32
.equiv		mci3310_pos,		36
.equiv		mci3310_charge,		40
.equiv		mci3310_facel,		44
.equiv		mci3310_Vc,		48
.equiv		mci3310_type,		52
.equiv		mci3310_ntype,		56
.equiv		mci3310_nbfp,		60
.equiv		mci3310_Vnb,		64
.equiv		mci3310_tabscale,	68
.equiv		mci3310_VFtab,		72
.equiv		mci3310_nsatoms,	76
	;# stack offsets for local variables
.equiv		mci3310_is3,		0
.equiv		mci3310_ii3,		4
.equiv		mci3310_shX,		8
.equiv		mci3310_shY,		12
.equiv		mci3310_shZ,		16
.equiv		mci3310_ix,		20
.equiv		mci3310_iy,		24
.equiv		mci3310_iz,		28
.equiv		mci3310_iq,		32
.equiv		mci3310_vctot,		40
.equiv		mci3310_vnbtot,		48
.equiv		mci3310_c6,		56
.equiv		mci3310_c12,		64
.equiv		mci3310_n1,		72
.equiv		mci3310_tsc,		80
.equiv		mci3310_ntia,		88
.equiv		mci3310_innerjjnr0,	92
.equiv		mci3310_innerk0,	96
.equiv		mci3310_innerjjnr,	100
.equiv		mci3310_innerk,		104
.equiv		mci3310_nsvdwc,		108
.equiv		mci3310_nscoul,		112
.equiv		mci3310_nsvdw,		116
.equiv		mci3310_solnr,		120
	push ebp
	mov ebp,esp
	push eax
	push ebx
	push ecx
	push edx
	push esi
	push edi
	sub esp, 124  ;# local stack space
	femms
	movd mm3, [ebp + mci3310_tabscale]
	punpckldq mm3,mm3  ;# tabscale in both halves
	movq [esp + mci3310_tsc], mm3
	;# assume we have at least one i particle - start directly
.mci3310_outer:
	mov eax, [ebp + mci3310_shift]  ;# eax = pointer into shift[]
	mov ebx, [eax]  ;# ebx=shift[n]
	add dword ptr [ebp + mci3310_shift], 4  ;# advance pointer one step

	lea ebx, [ebx + ebx*2]  ;# ebx=3*is
	mov [esp + mci3310_is3],ebx  ;# store is3

	mov eax, [ebp + mci3310_shiftvec]  ;# eax = base of shiftvec[]

	movq mm0, [eax + ebx*4]  ;# move shX/shY to mm0 and shZ to mm1
	movd mm1, [eax + ebx*4 + 8]
	movq [esp + mci3310_shX], mm0
	movd [esp + mci3310_shZ], mm1

	mov ecx, [ebp + mci3310_iinr]  ;# ecx = pointer into iinr[]
	add dword ptr [ebp + mci3310_iinr], 4  ;# advance pointer
	mov ebx, [ecx]  ;# ebx=ii

	;# split the i atoms of this entry into the three phases
	mov eax, [ebp + mci3310_nsatoms]
	add dword ptr [ebp + mci3310_nsatoms], 12  ;# three ints per entry
	mov ecx, [eax]  ;# a0
	mov edx, [eax + 4]  ;# a1
	mov eax, [eax + 8]  ;# a2
	sub ecx, eax  ;# ecx = a0 - a2
	sub eax, edx  ;# eax = a2 - a1

	mov [esp + mci3310_nsvdwc], edx
	mov [esp + mci3310_nscoul], eax
	mov [esp + mci3310_nsvdw], ecx

	;# clear potential
	pxor mm7,mm7
	movq [esp + mci3310_vctot], mm7
	movq [esp + mci3310_vnbtot], mm7
	mov [esp + mci3310_solnr], ebx  ;# first i atom of this entry

	mov eax, [ebp + mci3310_jindex]
	mov ecx, [eax]  ;# jindex[n]
	mov edx, [eax + 4]  ;# jindex[n+1]
	add dword ptr [ebp + mci3310_jindex], 4
	sub edx, ecx  ;# number of innerloop atoms
	mov eax, [ebp + mci3310_jjnr]
	shl ecx, 2
	add eax, ecx
	mov [esp + mci3310_innerjjnr0], eax  ;# pointer to jjnr[nj0]

	mov [esp + mci3310_innerk0], edx  ;# number of innerloop atoms
	mov esi, [ebp + mci3310_pos]

	mov ecx, [esp + mci3310_nsvdwc]
	cmp ecx, 0
	jnz .mci3310_mno_vdwc
	jmp .mci3310_testcoul

	;# ------------------------------------------------------------------
	;# Phase 1: i atoms with both Coulomb and VdW interactions
	;# ------------------------------------------------------------------
.mci3310_mno_vdwc:
	mov ebx, [esp + mci3310_solnr]
	inc dword ptr [esp + mci3310_solnr]
	mov edx, [ebp + mci3310_charge]
	movd mm2, [edx + ebx*4]  ;# mm2=charge[ii]
	pfmul mm2, [ebp + mci3310_facel]  ;# only the low half is used below
	punpckldq mm2,mm2  ;# spread to both halves
	movq [esp + mci3310_iq], mm2  ;# iq =facel*charge[ii]

	mov edx, [ebp + mci3310_type]
	mov edx, [edx + ebx*4]
	imul edx, [ebp + mci3310_ntype]
	shl edx, 1
	mov [esp + mci3310_ntia], edx  ;# ntia = 2*ntype*type[ii]

	lea ebx, [ebx + ebx*2]  ;# ebx = 3*ii=ii3
	mov eax, [ebp + mci3310_pos]  ;# eax = base of pos[]
	mov [esp + mci3310_ii3], ebx

	movq mm0, [eax + ebx*4]
	movd mm1, [eax + ebx*4 + 8]
	pfadd mm0, [esp + mci3310_shX]
	pfadd mm1, [esp + mci3310_shZ]
	movq [esp + mci3310_ix], mm0  ;# shifted i coordinates
	movd [esp + mci3310_iz], mm1

	mov ecx, [esp + mci3310_innerjjnr0]
	mov [esp + mci3310_innerjjnr], ecx
	mov edx, [esp + mci3310_innerk0]
	sub edx, 2
	mov [esp + mci3310_innerk], edx  ;# number of innerloop atoms
	jge .mci3310_unroll_vdwc_loop
	jmp .mci3310_finish_vdwc_inner
.mci3310_unroll_vdwc_loop:
	;# paired innerloop starts here
	mov ecx, [esp + mci3310_innerjjnr]  ;# pointer to jjnr[k]
	mov eax, [ecx]
	mov ebx, [ecx + 4]  ;# eax/ebx=jnr
	add dword ptr [esp + mci3310_innerjjnr], 8  ;# advance pointer (unrolled 2)
	prefetch [ecx + 16]  ;# prefetch data - trial and error says 16 is best

	mov ecx, [ebp + mci3310_charge]  ;# base of charge[]
	movq mm5, [esp + mci3310_iq]
	movd mm3, [ecx + eax*4]  ;# charge[jnr1]
	punpckldq mm3, [ecx + ebx*4]  ;# move charge 2 to high part of mm3
	pfmul mm3,mm5  ;# mm3 now has qq for both particles

	mov ecx, [ebp + mci3310_type]
	mov edx, [ecx + eax*4]  ;# type [jnr1]
	mov ecx, [ecx + ebx*4]  ;# type [jnr2]

	mov esi, [ebp + mci3310_nbfp]  ;# base of nbfp
	shl edx, 1
	shl ecx, 1
	add edx, [esp + mci3310_ntia]  ;# tja = ntia + 2*type
	add ecx, [esp + mci3310_ntia]

	movq mm5, [esi + edx*4]  ;# mm5 = 1st c6 / c12
	movq mm7, [esi + ecx*4]  ;# mm7 = 2nd c6 / c12
	movq mm6,mm5
	punpckldq mm5,mm7  ;# mm5 = 1st c6 / 2nd c6
	punpckhdq mm6,mm7  ;# mm6 = 1st c12 / 2nd c12
	movq [esp + mci3310_c6], mm5
	movq [esp + mci3310_c12], mm6

	lea eax, [eax + eax*2]  ;# replace jnr with j3
	lea ebx, [ebx + ebx*2]

	mov esi, [ebp + mci3310_pos]

	movq mm0, [esp + mci3310_ix]
	movd mm1, [esp + mci3310_iz]
	movq mm4, [esi + eax*4]  ;# fetch first j coordinates
	movd mm5, [esi + eax*4 + 8]
	pfsubr mm4,mm0  ;# dr = ir - jr
	pfsubr mm5,mm1
	pfmul mm4,mm4  ;# square dx,dy,dz
	pfmul mm5,mm5
	pfacc mm4, mm5  ;# accumulate to get dx*dx+ dy*dy+ dz*dz
	pfacc mm4, mm5  ;# first rsq in lower mm4

	movq mm6, [esi + ebx*4]  ;# fetch second j coordinates
	movd mm7, [esi + ebx*4 + 8]

	pfsubr mm6,mm0  ;# dr = ir - jr
	pfsubr mm7,mm1
	pfmul mm6,mm6  ;# square dx,dy,dz
	pfmul mm7,mm7
	pfacc mm6, mm7  ;# accumulate to get dx*dx+ dy*dy+ dz*dz
	pfacc mm6, mm7  ;# second rsq in lower mm6

	pfrsqrt mm0, mm4  ;# lookup inverse square root seed
	pfrsqrt mm1, mm6

	punpckldq mm0,mm1
	punpckldq mm4,mm6  ;# now 4 has rsq and 0 the seed for both pairs.
	movq mm2,mm0  ;# amd 3dnow N-R iteration to get full precision.
	pfmul mm0,mm0
	pfrsqit1 mm0,mm4
	pfrcpit2 mm0,mm2
	pfmul mm4, mm0  ;# r = rsq * invsqrt
	movq mm1, mm4
	;# mm0 is invsqrt, and mm1 r.
	;# table lookup for the potentials
	pfmul mm1, [esp + mci3310_tsc]  ;# mm1=rt
	pf2iw mm4,mm1
	movq [esp + mci3310_n1], mm4  ;# integer table indices for both pairs
	pi2fd mm4,mm4
	pfsub mm1, mm4  ;# now mm1 is eps and mm4 is n0

	movq mm2,mm1
	pfmul mm2,mm2  ;# mm1 is eps, mm2 is eps2

	;# coulomb table
	mov edx, [ebp + mci3310_VFtab]
	mov ecx, [esp + mci3310_n1]
	lea ecx, [ecx + ecx*2]
	shl ecx, 2  ;# ecx = 12*n1 (floats per table point)
	;# load all the table values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]
	mov ecx, [esp + mci3310_n1 + 4]
	lea ecx, [ecx + ecx*2]
	shl ecx, 2
	punpckldq mm4, [edx + ecx*4]
	punpckldq mm5, [edx + ecx*4 + 4]
	punpckldq mm6, [edx + ecx*4 + 8]
	punpckldq mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm1  ;# mm6 = Geps
	pfmul mm7, mm2  ;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7  ;# mm5 = Fp

	pfmul mm5, mm1  ;# mm5=eps*Fp
	pfadd mm5, mm4  ;# mm5= VV

	pfmul mm5, mm3  ;# vcoul=qq*VV

	;# at this point mm5 contains vcoul
	;# increment vcoul - then we can get rid of mm5
	;# update vctot
	pfadd mm5, [esp + mci3310_vctot]  ;# add the earlier value
	movq [esp + mci3310_vctot], mm5  ;# store the sum

	;# dispersion table
	mov ecx, [esp + mci3310_n1]
	lea ecx, [ecx + ecx*2]
	shl ecx, 2
	;# load all the table values we need
	movd mm4, [edx + ecx*4 + 16]
	movd mm5, [edx + ecx*4 + 20]
	movd mm6, [edx + ecx*4 + 24]
	movd mm7, [edx + ecx*4 + 28]
	mov ecx, [esp + mci3310_n1 + 4]
	lea ecx, [ecx + ecx*2]
	shl ecx, 2
	punpckldq mm4, [edx + ecx*4 + 16]
	punpckldq mm5, [edx + ecx*4 + 20]
	punpckldq mm6, [edx + ecx*4 + 24]
	punpckldq mm7, [edx + ecx*4 + 28]
	pfmul mm6, mm1  ;# mm6 = Geps
	pfmul mm7, mm2  ;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7  ;# mm5 = Fp
	pfmul mm5, mm1  ;# mm5=eps*Fp
	pfadd mm5, mm4  ;# mm5= VV

	movq mm4, [esp + mci3310_c6]
	pfmul mm5, mm4  ;# vnb6

	;# update vnbtot to release mm5!
	pfadd mm5, [esp + mci3310_vnbtot]  ;# add the earlier value
	movq [esp + mci3310_vnbtot], mm5  ;# store the sum

	;# repulsion table
	mov ecx, [esp + mci3310_n1]
	lea ecx, [ecx + ecx*2]
	shl ecx, 2
	;# load all the table values we need
	movd mm4, [edx + ecx*4 + 32]
	movd mm5, [edx + ecx*4 + 36]
	movd mm6, [edx + ecx*4 + 40]
	movd mm7, [edx + ecx*4 + 44]
	mov ecx, [esp + mci3310_n1 + 4]
	lea ecx, [ecx + ecx*2]
	shl ecx, 2
	punpckldq mm4, [edx + ecx*4 + 32]
	punpckldq mm5, [edx + ecx*4 + 36]
	punpckldq mm6, [edx + ecx*4 + 40]
	punpckldq mm7, [edx + ecx*4 + 44]

	pfmul mm6, mm1  ;# mm6 = Geps
	pfmul mm7, mm2  ;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7  ;# mm5 = Fp
	pfmul mm5, mm1  ;# mm5=eps*Fp
	pfadd mm5, mm4  ;# mm5= VV

	movq mm6, [esp + mci3310_c12]
	pfmul mm5, mm6  ;# vnb12
	;# update vnbtot
	pfadd mm5, [esp + mci3310_vnbtot]  ;# add the earlier value
	movq [esp + mci3310_vnbtot], mm5  ;# store the sum

	;# should we do one more iteration?
	sub dword ptr [esp + mci3310_innerk], 2
	jl .mci3310_finish_vdwc_inner
	jmp .mci3310_unroll_vdwc_loop
.mci3310_finish_vdwc_inner:
	;# innerk is -1 when one j atom is left over and -2 when none is
	and dword ptr [esp + mci3310_innerk], 1
	jnz .mci3310_single_vdwc_inner
	jmp .mci3310_updateouterdata_vdwc
.mci3310_single_vdwc_inner:
	;# a single j particle iteration here - compare with the unrolled code for comments.
	mov eax, [esp + mci3310_innerjjnr]
	mov eax, [eax]  ;# eax=jnr offset

	mov ecx, [ebp + mci3310_charge]
	movd mm5, [esp + mci3310_iq]
	movd mm3, [ecx + eax*4]
	pfmul mm3, mm5  ;# mm3=qq

	mov esi, [ebp + mci3310_nbfp]
	mov ecx, [ebp + mci3310_type]
	mov edx, [ecx + eax*4]  ;# type [jnr1]
	shl edx, 1
	add edx, [esp + mci3310_ntia]  ;# tja = ntia + 2*type
	movd mm5, [esi + edx*4]  ;# mm5 = 1st c6
	movq [esp + mci3310_c6], mm5
	movd mm5, [esi + edx*4 + 4]  ;# mm5 = 1st c12
	movq [esp + mci3310_c12], mm5

	mov esi, [ebp + mci3310_pos]
	lea eax, [eax + eax*2]

	movq mm0, [esp + mci3310_ix]
	movd mm1, [esp + mci3310_iz]
	movq mm4, [esi + eax*4]
	movd mm5, [esi + eax*4 + 8]
	pfsubr mm4, mm0
	pfsubr mm5, mm1
	pfmul mm4,mm4
	pfmul mm5,mm5
	pfacc mm4, mm5
	pfacc mm4, mm5  ;# mm4=rsq

	pfrsqrt mm0,mm4
	movq mm2,mm0
	pfmul mm0,mm0
	pfrsqit1 mm0,mm4
	pfrcpit2 mm0,mm2  ;# mm0=invsqrt
	pfmul mm4, mm0
	movq mm1, mm4
	;# mm0 is invsqrt, and mm1 r.

	;# calculate potentials
	pfmul mm1, [esp + mci3310_tsc]  ;# mm1=rt
	pf2iw mm4,mm1
	movd [esp + mci3310_n1], mm4
	pi2fd mm4,mm4
	pfsub mm1, mm4  ;# now mm1 is eps and mm4 is n0

	movq mm2,mm1
	pfmul mm2,mm2  ;# mm1 is eps, mm2 is eps2

	;# coulomb table
	mov edx, [ebp + mci3310_VFtab]
	mov ecx, [esp + mci3310_n1]
	lea ecx, [ecx + ecx*2]
	shl ecx, 2
	;# load all the table values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm1  ;# mm6 = Geps
	pfmul mm7, mm2  ;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7  ;# mm5 = Fp

	pfmul mm5, mm1  ;# mm5=eps*Fp
	pfadd mm5, mm4  ;# mm5= VV

	pfmul mm5, mm3  ;# vcoul=qq*VV

	;# at this point mm5 contains vcoul
	;# increment vcoul - then we can get rid of mm5
	;# update vctot
	pfadd mm5, [esp + mci3310_vctot]  ;# add the earlier value
	movq [esp + mci3310_vctot], mm5  ;# store the sum

	;# dispersion table
	;# load all the table values we need
	movd mm4, [edx + ecx*4 + 16]
	movd mm5, [edx + ecx*4 + 20]
	movd mm6, [edx + ecx*4 + 24]
	movd mm7, [edx + ecx*4 + 28]
	pfmul mm6, mm1  ;# mm6 = Geps
	pfmul mm7, mm2  ;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7  ;# mm5 = Fp
	pfmul mm5, mm1  ;# mm5=eps*Fp
	pfadd mm5, mm4  ;# mm5= VV

	movq mm4, [esp + mci3310_c6]
	pfmul mm5, mm4  ;# vnb6
	;# update vnbtot to release mm5!
	pfadd mm5, [esp + mci3310_vnbtot]  ;# add the earlier value
	movq [esp + mci3310_vnbtot], mm5  ;# store the sum

	;# repulsion table
	;# load all the table values we need
	movd mm4, [edx + ecx*4 + 32]
	movd mm5, [edx + ecx*4 + 36]
	movd mm6, [edx + ecx*4 + 40]
	movd mm7, [edx + ecx*4 + 44]

	pfmul mm6, mm1  ;# mm6 = Geps
	pfmul mm7, mm2  ;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7  ;# mm5 = Fp
	pfmul mm5, mm1  ;# mm5=eps*Fp
	pfadd mm5, mm4  ;# mm5= VV

	movq mm6, [esp + mci3310_c12]
	pfmul mm5, mm6  ;# vnb12

	;# (no scalar force is needed here: this routine only accumulates
	;# energies, and mm0/mm1 are reloaded before their next use)

	;# update vnbtot
	pfadd mm5, [esp + mci3310_vnbtot]  ;# add the earlier value
	movq [esp + mci3310_vnbtot], mm5  ;# store the sum

.mci3310_updateouterdata_vdwc:
	;# loop back to mno
	dec dword ptr [esp + mci3310_nsvdwc]
	jz .mci3310_testcoul
	jmp .mci3310_mno_vdwc

	;# ------------------------------------------------------------------
	;# Phase 2: i atoms with Coulomb interactions only
	;# ------------------------------------------------------------------
.mci3310_testcoul:
	mov ecx, [esp + mci3310_nscoul]
	cmp ecx, 0
	jnz .mci3310_mno_coul
	jmp .mci3310_testvdw
.mci3310_mno_coul:
	mov ebx, [esp + mci3310_solnr]
	inc dword ptr [esp + mci3310_solnr]
	mov edx, [ebp + mci3310_charge]
	movd mm2, [edx + ebx*4]  ;# mm2=charge[ii]
	pfmul mm2, [ebp + mci3310_facel]  ;# only the low half is used below
	punpckldq mm2,mm2  ;# spread to both halves
	movq [esp + mci3310_iq], mm2  ;# iq =facel*charge[ii]

	lea ebx, [ebx + ebx*2]  ;# ebx = 3*ii=ii3
	mov eax, [ebp + mci3310_pos]  ;# eax = base of pos[]
	mov [esp + mci3310_ii3], ebx

	movq mm0, [eax + ebx*4]
	movd mm1, [eax + ebx*4 + 8]
	pfadd mm0, [esp + mci3310_shX]
	pfadd mm1, [esp + mci3310_shZ]
	movq [esp + mci3310_ix], mm0  ;# shifted i coordinates
	movd [esp + mci3310_iz], mm1

	mov ecx, [esp + mci3310_innerjjnr0]
	mov [esp + mci3310_innerjjnr], ecx
	mov edx, [esp + mci3310_innerk0]
	sub edx, 2
	mov [esp + mci3310_innerk], edx  ;# number of innerloop atoms
	jge .mci3310_unroll_coul_loop
	jmp .mci3310_finish_coul_inner
.mci3310_unroll_coul_loop:
	;# paired innerloop starts here
	mov ecx, [esp + mci3310_innerjjnr]  ;# pointer to jjnr[k]
	mov eax, [ecx]
	mov ebx, [ecx + 4]  ;# eax/ebx=jnr
	add dword ptr [esp + mci3310_innerjjnr], 8  ;# advance pointer (unrolled 2)
	prefetch [ecx + 16]  ;# prefetch data - trial and error says 16 is best

	mov ecx, [ebp + mci3310_charge]  ;# base of charge[]
	movq mm5, [esp + mci3310_iq]
	movd mm3, [ecx + eax*4]  ;# charge[jnr1]
	punpckldq mm3, [ecx + ebx*4]  ;# move charge 2 to high part of mm3
	pfmul mm3,mm5  ;# mm3 now has qq for both particles

	lea eax, [eax + eax*2]  ;# replace jnr with j3
	lea ebx, [ebx + ebx*2]

	mov esi, [ebp + mci3310_pos]

	movq mm0, [esp + mci3310_ix]
	movd mm1, [esp + mci3310_iz]
	movq mm4, [esi + eax*4]  ;# fetch first j coordinates
	movd mm5, [esi + eax*4 + 8]
	pfsubr mm4,mm0  ;# dr = ir - jr
	pfsubr mm5,mm1
	pfmul mm4,mm4  ;# square dx,dy,dz
	pfmul mm5,mm5
	pfacc mm4, mm5  ;# accumulate to get dx*dx+ dy*dy+ dz*dz
	pfacc mm4, mm5  ;# first rsq in lower mm4

	movq mm6, [esi + ebx*4]  ;# fetch second j coordinates
	movd mm7, [esi + ebx*4 + 8]

	pfsubr mm6,mm0  ;# dr = ir - jr
	pfsubr mm7,mm1
	pfmul mm6,mm6  ;# square dx,dy,dz
	pfmul mm7,mm7
	pfacc mm6, mm7  ;# accumulate to get dx*dx+ dy*dy+ dz*dz
	pfacc mm6, mm7  ;# second rsq in lower mm6

	pfrsqrt mm0, mm4  ;# lookup inverse square root seed
	pfrsqrt mm1, mm6

	punpckldq mm0,mm1
	punpckldq mm4,mm6  ;# now 4 has rsq and 0 the seed for both pairs.
	movq mm2,mm0  ;# amd 3dnow N-R iteration to get full precision.
	pfmul mm0,mm0
	pfrsqit1 mm0,mm4
	pfrcpit2 mm0,mm2
	pfmul mm4, mm0  ;# r = rsq * invsqrt
	movq mm1, mm4
	;# mm0 is invsqrt, and mm1 r.
	;# table lookup for the potential
	pfmul mm1, [esp + mci3310_tsc]  ;# mm1=rt
	pf2iw mm4,mm1
	movq [esp + mci3310_n1], mm4  ;# integer table indices for both pairs
	pi2fd mm4,mm4
	pfsub mm1, mm4  ;# now mm1 is eps and mm4 is n0

	movq mm2,mm1
	pfmul mm2,mm2  ;# mm1 is eps, mm2 is eps2

	mov edx, [ebp + mci3310_VFtab]
	mov ecx, [esp + mci3310_n1]
	lea ecx, [ecx + ecx*2]
	shl ecx, 2  ;# ecx = 12*n1 (floats per table point)
	;# coulomb table
	;# load all the table values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]
	mov ecx, [esp + mci3310_n1 + 4]
	lea ecx, [ecx + ecx*2]
	shl ecx, 2
	punpckldq mm4, [edx + ecx*4]
	punpckldq mm5, [edx + ecx*4 + 4]
	punpckldq mm6, [edx + ecx*4 + 8]
	punpckldq mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm1  ;# mm6 = Geps
	pfmul mm7, mm2  ;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7  ;# mm5 = Fp

	pfmul mm5, mm1  ;# mm5=eps*Fp
	pfadd mm5, mm4  ;# mm5= VV

	pfmul mm5, mm3  ;# vcoul=qq*VV

	;# at this point mm5 contains vcoul
	;# increment vcoul - then we can get rid of mm5
	;# update vctot
	pfadd mm5, [esp + mci3310_vctot]  ;# add the earlier value
	movq [esp + mci3310_vctot], mm5  ;# store the sum

	;# should we do one more iteration?
	sub dword ptr [esp + mci3310_innerk], 2
	jl .mci3310_finish_coul_inner
	jmp .mci3310_unroll_coul_loop
.mci3310_finish_coul_inner:
	;# innerk is -1 when one j atom is left over and -2 when none is
	and dword ptr [esp + mci3310_innerk], 1
	jnz .mci3310_single_coul_inner
	jmp .mci3310_updateouterdata_coul
.mci3310_single_coul_inner:
	;# a single j particle iteration here - compare with the unrolled code for comments.
	mov eax, [esp + mci3310_innerjjnr]
	mov eax, [eax]  ;# eax=jnr offset

	mov ecx, [ebp + mci3310_charge]
	movd mm5, [esp + mci3310_iq]
	movd mm3, [ecx + eax*4]
	pfmul mm3, mm5  ;# mm3=qq

	mov esi, [ebp + mci3310_pos]
	lea eax, [eax + eax*2]

	movq mm0, [esp + mci3310_ix]
	movd mm1, [esp + mci3310_iz]
	movq mm4, [esi + eax*4]
	movd mm5, [esi + eax*4 + 8]
	pfsubr mm4, mm0
	pfsubr mm5, mm1
	pfmul mm4,mm4
	pfmul mm5,mm5
	pfacc mm4, mm5
	pfacc mm4, mm5  ;# mm4=rsq

	pfrsqrt mm0,mm4
	movq mm2,mm0
	pfmul mm0,mm0
	pfrsqit1 mm0,mm4
	pfrcpit2 mm0,mm2  ;# mm0=invsqrt
	pfmul mm4, mm0
	movq mm1, mm4
	;# mm0 is invsqrt, and mm1 r.

	;# calculate potential
	pfmul mm1, [esp + mci3310_tsc]  ;# mm1=rt
	pf2iw mm4,mm1
	movd [esp + mci3310_n1], mm4
	pi2fd mm4,mm4
	pfsub mm1, mm4  ;# now mm1 is eps and mm4 is n0

	movq mm2,mm1
	pfmul mm2,mm2  ;# mm1 is eps, mm2 is eps2

	;# coulomb table
	mov edx, [ebp + mci3310_VFtab]
	mov ecx, [esp + mci3310_n1]
	lea ecx, [ecx + ecx*2]
	shl ecx, 2
	;# load all the table values we need
	movd mm4, [edx + ecx*4]
	movd mm5, [edx + ecx*4 + 4]
	movd mm6, [edx + ecx*4 + 8]
	movd mm7, [edx + ecx*4 + 12]

	pfmul mm6, mm1  ;# mm6 = Geps
	pfmul mm7, mm2  ;# mm7 = Heps2

	pfadd mm5, mm6
	pfadd mm5, mm7  ;# mm5 = Fp

	pfmul mm5, mm1  ;# mm5=eps*Fp
	pfadd mm5, mm4  ;# mm5= VV

	pfmul mm5, mm3  ;# vcoul=qq*VV

	;# at this point mm5 contains vcoul
	;# increment vcoul - then we can get rid of mm5
	;# update vctot
	pfadd mm5, [esp + mci3310_vctot]  ;# add the earlier value
	movq [esp + mci3310_vctot], mm5  ;# store the sum

.mci3310_updateouterdata_coul:
	;# loop back to mno
	dec dword ptr [esp + mci3310_nscoul]
	jz .mci3310_testvdw
	jmp .mci3310_mno_coul

	;# ------------------------------------------------------------------
	;# Phase 3: i atoms with VdW interactions only
	;# ------------------------------------------------------------------
.mci3310_testvdw:
	mov ecx, [esp + mci3310_nsvdw]
	cmp ecx, 0
	jnz .mci3310_mno_vdw
	jmp .mci3310_last_mno
.mci3310_mno_vdw:
	mov ebx, [esp + mci3310_solnr]
	inc dword ptr [esp + mci3310_solnr]

	mov edx, [ebp + mci3310_type]
	mov edx, [edx + ebx*4]
	imul edx, [ebp + mci3310_ntype]
	shl edx, 1
	mov [esp + mci3310_ntia], edx  ;# ntia = 2*ntype*type[ii]

	lea ebx, [ebx + ebx*2]  ;# ebx = 3*ii=ii3
	mov eax, [ebp + mci3310_pos]  ;# eax = base of pos[]
	mov [esp + mci3310_ii3], ebx

	movq mm0, [eax + ebx*4]
	movd mm1, [eax + ebx*4 + 8]
	pfadd mm0, [esp + mci3310_shX]
	pfadd mm1, [esp + mci3310_shZ]
	movq [esp + mci3310_ix], mm0  ;# shifted i coordinates
	movd [esp + mci3310_iz], mm1

	mov ecx, [esp + mci3310_innerjjnr0]
	mov [esp + mci3310_innerjjnr], ecx
	mov edx, [esp + mci3310_innerk0]
	sub edx, 2
	mov [esp + mci3310_innerk], edx  ;# number of innerloop atoms
	jge .mci3310_unroll_vdw_loop
	jmp .mci3310_finish_vdw_inner
.mci3310_unroll_vdw_loop:
	;# paired innerloop starts here
	mov ecx, [esp + mci3310_innerjjnr]  ;# pointer to jjnr[k]
	mov eax, [ecx]
	mov ebx, [ecx + 4]  ;# eax/ebx=jnr
	add dword ptr [esp + mci3310_innerjjnr], 8  ;# advance pointer (unrolled 2)
	prefetch [ecx + 16]  ;# prefetch data - trial and error says 16 is best

	mov ecx, [ebp + mci3310_type]
	mov edx, [ecx + eax*4]  ;# type [jnr1]
	mov ecx, [ecx + ebx*4]  ;# type [jnr2]

	mov esi, [ebp + mci3310_nbfp]  ;# base of nbfp
	shl edx, 1
	shl ecx, 1
	add edx, [esp + mci3310_ntia]  ;# tja = ntia + 2*type
	add ecx, [esp + mci3310_ntia]

	movq mm5, [esi + edx*4]  ;# mm5 = 1st c6 / c12
	movq mm7, [esi + ecx*4]  ;# mm7 = 2nd c6 / c12
	movq mm6, mm5
	punpckldq mm5, mm7  ;# mm5 = 1st c6 / 2nd c6
	punpckhdq mm6, mm7  ;# mm6 = 1st c12 / 2nd c12
	movq [esp + mci3310_c6], mm5
	movq [esp + mci3310_c12], mm6

	lea eax, [eax + eax*2]  ;# replace jnr with j3
	lea ebx, [ebx + ebx*2]

	mov esi, [ebp + mci3310_pos]

	movq mm0, [esp + mci3310_ix]
	movd mm1, [esp + mci3310_iz]
	movq mm4, [esi + eax*4]  ;# fetch first j coordinates
	movd mm5, [esi + eax*4 + 8]
	pfsubr mm4,mm0  ;# dr = ir - jr
	pfsubr mm5,mm1
	pfmul mm4,mm4  ;# square dx,dy,dz
	pfmul mm5,mm5
	pfacc mm4, mm5  ;# accumulate to get dx*dx+ dy*dy+ dz*dz
	pfacc mm4, mm5  ;# first rsq in lower mm4

	movq mm6, [esi + ebx*4]  ;# fetch second j coordinates
	movd mm7, [esi + ebx*4 + 8]

	pfsubr mm6, mm0  ;# dr = ir - jr
	pfsubr mm7, mm1
	pfmul mm6, mm6  ;# square dx,dy,dz
	pfmul mm7, mm7
	pfacc mm6, mm7  ;# accumulate to get dx*dx+ dy*dy+ dz*dz
	pfacc mm6, mm7  ;# second rsq in lower mm6

	pfrsqrt mm0, mm4  ;# lookup inverse square root seed
	pfrsqrt mm1, mm6

	punpckldq mm0, mm1
	punpckldq mm4, mm6  ;# now 4 has rsq and 0 the seed for both pairs.
	movq mm2, mm0  ;# amd 3dnow N-R iteration to get full precision.
	pfmul mm0, mm0
	pfrsqit1 mm0, mm4
	pfrcpit2 mm0, mm2
	pfmul mm4, mm0  ;# r = rsq * invsqrt
	movq mm1, mm4
	;# mm0 is invsqrt, and mm1 r.
	;# table lookup for the potentials
	pfmul mm1, [esp + mci3310_tsc]  ;# mm1=rt
	pf2iw mm4, mm1
	movq [esp + mci3310_n1], mm4  ;# integer table indices for both pairs
	pi2fd mm4, mm4
	pfsub mm1, mm4  ;# now mm1 is eps and mm4 is n0

	movq mm2, mm1
	pfmul mm2, mm2  ;# mm1 is eps, mm2 is eps2

	mov edx, [ebp + mci3310_VFtab]
	;# dispersion table
	;# The table point stride is 12 floats, so the dispersion values sit
	;# at byte offset 16 and the repulsion values at 32, exactly as in the
	;# Coulomb+VdW loop; offset 0 holds the Coulomb values.
	mov ecx, [esp + mci3310_n1]
	lea ecx, [ecx + ecx*2]
	shl ecx, 2
	;# load all the table values we need
	movd mm4, [edx + ecx*4 + 16]
	movd mm5, [edx + ecx*4 + 20]
	movd mm6, [edx + ecx*4 + 24]
	movd mm7, [edx + ecx*4 + 28]
	mov ecx, [esp + mci3310_n1 + 4]
	lea ecx, [ecx + ecx*2]
	shl ecx, 2
	punpckldq mm4, [edx + ecx*4 + 16]
	punpckldq mm5, [edx + ecx*4 + 20]
	punpckldq mm6, [edx + ecx*4 + 24]
	punpckldq mm7, [edx + ecx*4 + 28]
	pfmul mm6, mm1  ;# mm6 = Geps
	pfmul mm7, mm2  ;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7  ;# mm5 = Fp
	pfmul mm5, mm1  ;# mm5=eps*Fp
	pfadd mm5, mm4  ;# mm5= VV

	movq mm4, [esp + mci3310_c6]
	pfmul mm5, mm4  ;# vnb6

	;# update vnbtot to release mm5!
	pfadd mm5, [esp + mci3310_vnbtot]  ;# add the earlier value
	movq [esp + mci3310_vnbtot], mm5  ;# store the sum

	;# repulsion table
	mov ecx, [esp + mci3310_n1]
	lea ecx, [ecx + ecx*2]
	shl ecx, 2
	;# load all the table values we need
	movd mm4, [edx + ecx*4 + 32]
	movd mm5, [edx + ecx*4 + 36]
	movd mm6, [edx + ecx*4 + 40]
	movd mm7, [edx + ecx*4 + 44]
	mov ecx, [esp + mci3310_n1 + 4]
	lea ecx, [ecx + ecx*2]
	shl ecx, 2
	punpckldq mm4, [edx + ecx*4 + 32]
	punpckldq mm5, [edx + ecx*4 + 36]
	punpckldq mm6, [edx + ecx*4 + 40]
	punpckldq mm7, [edx + ecx*4 + 44]

	pfmul mm6, mm1  ;# mm6 = Geps
	pfmul mm7, mm2  ;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7  ;# mm5 = Fp
	pfmul mm5, mm1  ;# mm5=eps*Fp
	pfadd mm5, mm4  ;# mm5= VV

	movq mm6, [esp + mci3310_c12]
	pfmul mm5, mm6  ;# vnb12
	;# update vnbtot
	pfadd mm5, [esp + mci3310_vnbtot]  ;# add the earlier value
	movq [esp + mci3310_vnbtot], mm5  ;# store the sum

	;# should we do one more iteration?
	sub dword ptr [esp + mci3310_innerk], 2
	jl .mci3310_finish_vdw_inner
	jmp .mci3310_unroll_vdw_loop
.mci3310_finish_vdw_inner:
	;# innerk is -1 when one j atom is left over and -2 when none is
	and dword ptr [esp + mci3310_innerk], 1
	jnz .mci3310_single_vdw_inner
	jmp .mci3310_updateouterdata_vdw
.mci3310_single_vdw_inner:
	;# a single j particle iteration here - compare with the unrolled code for comments.
	mov eax, [esp + mci3310_innerjjnr]
	mov eax, [eax]  ;# eax=jnr offset

	mov esi, [ebp + mci3310_nbfp]
	mov ecx, [ebp + mci3310_type]
	mov edx, [ecx + eax*4]  ;# type [jnr1]
	shl edx, 1
	add edx, [esp + mci3310_ntia]  ;# tja = ntia + 2*type
	movd mm5, [esi + edx*4]  ;# mm5 = 1st c6
	movq [esp + mci3310_c6], mm5
	movd mm5, [esi + edx*4 + 4]  ;# mm5 = 1st c12
	movq [esp + mci3310_c12], mm5

	mov esi, [ebp + mci3310_pos]
	lea eax, [eax + eax*2]

	movq mm0, [esp + mci3310_ix]
	movd mm1, [esp + mci3310_iz]
	movq mm4, [esi + eax*4]
	movd mm5, [esi + eax*4 + 8]
	pfsubr mm4, mm0
	pfsubr mm5, mm1
	pfmul mm4,mm4
	pfmul mm5,mm5
	pfacc mm4, mm5
	pfacc mm4, mm5  ;# mm4=rsq

	pfrsqrt mm0,mm4
	movq mm2,mm0
	pfmul mm0,mm0
	pfrsqit1 mm0,mm4
	pfrcpit2 mm0,mm2  ;# mm0=invsqrt
	pfmul mm4, mm0
	movq mm1, mm4
	;# mm0 is invsqrt, and mm1 r.

	;# calculate potentials
	pfmul mm1, [esp + mci3310_tsc]  ;# mm1=rt
	pf2iw mm4,mm1
	movd [esp + mci3310_n1], mm4
	pi2fd mm4,mm4
	pfsub mm1, mm4  ;# now mm1 is eps and mm4 n0.

	movq mm2,mm1
	pfmul mm2,mm2  ;# mm1 is eps, mm2 is eps2

	mov edx, [ebp + mci3310_VFtab]
	mov ecx, [esp + mci3310_n1]
	lea ecx, [ecx + ecx*2]
	shl ecx, 2  ;# ecx = 12*n1 (floats per table point)
	;# dispersion table (byte offset 16 within the table point)
	;# load all the table values we need
	movd mm4, [edx + ecx*4 + 16]
	movd mm5, [edx + ecx*4 + 20]
	movd mm6, [edx + ecx*4 + 24]
	movd mm7, [edx + ecx*4 + 28]
	pfmul mm6, mm1  ;# mm6 = Geps
	pfmul mm7, mm2  ;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7  ;# mm5 = Fp
	pfmul mm5, mm1  ;# mm5=eps*Fp
	pfadd mm5, mm4  ;# mm5= VV

	movq mm4, [esp + mci3310_c6]
	pfmul mm5, mm4  ;# vnb6
	;# update vnbtot to release mm5!
	pfadd mm5, [esp + mci3310_vnbtot]  ;# add the earlier value
	movq [esp + mci3310_vnbtot], mm5  ;# store the sum

	;# repulsion table (byte offset 32 within the table point)
	;# load all the table values we need
	movd mm4, [edx + ecx*4 + 32]
	movd mm5, [edx + ecx*4 + 36]
	movd mm6, [edx + ecx*4 + 40]
	movd mm7, [edx + ecx*4 + 44]

	pfmul mm6, mm1  ;# mm6 = Geps
	pfmul mm7, mm2  ;# mm7 = Heps2
	pfadd mm5, mm6
	pfadd mm5, mm7  ;# mm5 = Fp
	pfmul mm5, mm1  ;# mm5=eps*Fp
	pfadd mm5, mm4  ;# mm5= VV

	movq mm6, [esp + mci3310_c12]
	pfmul mm5, mm6  ;# vnb12
	;# update vnbtot
	pfadd mm5, [esp + mci3310_vnbtot]  ;# add the earlier value
	movq [esp + mci3310_vnbtot], mm5  ;# store the sum

.mci3310_updateouterdata_vdw:
	;# loop back to mno
	dec dword ptr [esp + mci3310_nsvdw]
	jz .mci3310_last_mno
	jmp .mci3310_mno_vdw

	;# ------------------------------------------------------------------
	;# All i atoms of this entry done: add the energies to the group totals
	;# ------------------------------------------------------------------
.mci3310_last_mno:
	mov edx, [ebp + mci3310_gid]  ;# get group index for this i particle
	mov edx, [edx]
	add dword ptr [ebp + mci3310_gid], 4  ;# advance pointer

	movq mm7, [esp + mci3310_vctot]
	pfacc mm7,mm7  ;# get and sum the two parts of total potential

	mov eax, [ebp + mci3310_Vc]
	movd mm6, [eax + edx*4]
	pfadd mm6, mm7
	movd [eax + edx*4], mm6  ;# increment vc[gid]

	movq mm7, [esp + mci3310_vnbtot]
	pfacc mm7,mm7  ;# get and sum the two parts of total potential

	mov eax, [ebp + mci3310_Vnb]
	movd mm6, [eax + edx*4]
	pfadd mm6, mm7
	movd [eax + edx*4], mm6  ;# increment vnb[gid]
	;# finish if last
	mov ecx, [ebp + mci3310_nri]
	dec ecx
	jecxz .mci3310_end
	;# not last, iterate once more!
	mov [ebp + mci3310_nri], ecx
	jmp .mci3310_outer
.mci3310_end:
	femms
	add esp, 124  ;# release local stack space
	pop edi
	pop esi
	pop edx
	pop ecx
	pop ebx
	pop eax
	leave
	ret  ;# return to caller instead of falling through into the next routine

;# ---------------------------------------------------------------------
;# mcinl3320_3dnow
;#
;# Energy-only inner loop: a three-site i molecule (O, H1, H2 stored
;# consecutively in pos[]) interacts with single j atoms.
;#   O - j : tabulated coulomb + tabulated dispersion (c6) + repulsion (c12)
;#   H - j : tabulated coulomb only, H1 in the low and H2 in the high
;#           MMX lane
;# Only Vc[gid] and Vnb[gid] are written; no forces are produced.
;#
;# Arguments are 32-bit values on the stack, addressed through ebp with
;# the .equiv offsets below. nri and ntype are used as integers, facel
;# and tabscale as single-precision floats, everything else is
;# dereferenced as a pointer. The nri, iinr, jindex, shift and gid
;# argument slots are used as running counters/pointers and are modified
;# in place.
;#
;# The i charges (charge[ii], charge[ii+1]) and the i atom type are read
;# once, from the first iinr[] entry, before the outer loop starts - every
;# i molecule is therefore treated as having the same charges and type.
;# The loops are entered without a count check: at least one i molecule
;# and at least one j atom per i molecule are assumed.
;#
;# VFtab layout used here: 12 floats per table point - 4 for coulomb
;# (byte offsets 0..12), 4 for dispersion (16..28), 4 for repulsion (32..44).
;#
;# Registers: eax, ebx, ecx, edx, esi, edi are saved in the prologue and
;# restored in the epilogue. mm0-mm7 are used freely; femms is issued on
;# entry and on exit.
;# ---------------------------------------------------------------------
25671 .globl mcinl3320_3dnow
25672 .globl _mcinl3320_3dnow
25673 mcinl3320_3dnow:
25674 _mcinl3320_3dnow:
;# offsets of the stack arguments relative to ebp
25675 .equiv mci3320_nri, 8
25676 .equiv mci3320_iinr, 12
25677 .equiv mci3320_jindex, 16
25678 .equiv mci3320_jjnr, 20
25679 .equiv mci3320_shift, 24
25680 .equiv mci3320_shiftvec, 28
25681 .equiv mci3320_gid, 32
25682 .equiv mci3320_pos, 36
25683 .equiv mci3320_charge, 40
25684 .equiv mci3320_facel, 44
25685 .equiv mci3320_Vc, 48
25686 .equiv mci3320_type, 52
25687 .equiv mci3320_ntype, 56
25688 .equiv mci3320_nbfp, 60
25689 .equiv mci3320_Vnb, 64
25690 .equiv mci3320_tabscale, 68
25691 .equiv mci3320_VFtab, 72
25692 ;# stack offsets for local variables (relative to esp, 144 bytes in total)
25693 .equiv mci3320_is3, 0
25694 .equiv mci3320_ii3, 4
25695 .equiv mci3320_ixO, 8
25696 .equiv mci3320_iyO, 12
25697 .equiv mci3320_izO, 16
25698 .equiv mci3320_ixH, 20
25699 .equiv mci3320_iyH, 28
25700 .equiv mci3320_izH, 36
25701 .equiv mci3320_iqO, 44
25702 .equiv mci3320_iqH, 52
25703 .equiv mci3320_qqO, 60
25704 .equiv mci3320_qqH, 68
25705 .equiv mci3320_vctot, 76
25706 .equiv mci3320_vnbtot, 84
25707 .equiv mci3320_c6, 92
25708 .equiv mci3320_c12, 100
25709 .equiv mci3320_n1, 108
25710 .equiv mci3320_tsc, 116
25711 .equiv mci3320_ntia, 124
25712 .equiv mci3320_innerjjnr, 128
25713 .equiv mci3320_innerk, 132
25714 .equiv mci3320_tmprsqH, 136
25715 push ebp
25716 mov ebp,esp
25717 push eax
25718 push ebx
25719 push ecx
25720 push edx
25721 push esi
25722 push edi
25723 sub esp, 144 ;# local stack space
25724 femms
;# one-time setup from the first i molecule: scaled charges, type row, table scale
25726 mov ecx, [ebp + mci3320_iinr] ;# ecx = pointer into iinr[]
25727 mov ebx, [ecx] ;# ebx=ii
25729 mov edx, [ebp + mci3320_charge]
25730 movd mm1, [ebp + mci3320_facel]
25731 movd mm2, [edx + ebx*4] ;# mm2=charge[ii0]
25732 pfmul mm2, mm1
25733 movq [esp + mci3320_iqO], mm2 ;# iqO = facel*charge[ii] (low lane), high lane is 0
25735 movd mm2, [edx + ebx*4 + 4] ;# mm2=charge[ii0+1]
25736 pfmul mm2, mm1
25737 punpckldq mm2,mm2 ;# spread to both halves
25738 movq [esp + mci3320_iqH], mm2 ;# iqH = facel*charge[ii0+1]
25740 mov edx, [ebp + mci3320_type]
25741 mov ecx, [edx + ebx*4]
25742 shl ecx, 1
25743 imul ecx, [ebp + mci3320_ntype] ;# ecx = ntia = 2*ntype*type[ii0]
25744 mov [esp + mci3320_ntia], ecx
25746 movd mm4, [ebp + mci3320_tabscale] ;# tabscale is one float - a 32-bit load is all that is needed
25747 punpckldq mm4,mm4 ;# spread to both halves
25748 movq [esp + mci3320_tsc], mm4
25749 ;# assume we have at least one i particle - start directly
25750 .mci3320_outer:
25751 mov eax, [ebp + mci3320_shift] ;# eax = pointer into shift[]
25752 mov ebx, [eax] ;# ebx=shift[n]
25753 add dword ptr [ebp + mci3320_shift], 4 ;# advance pointer one step
25755 lea ebx, [ebx + ebx*2] ;# ebx=3*is
25756 mov [esp + mci3320_is3],ebx ;# store is3
25758 mov eax, [ebp + mci3320_shiftvec] ;# eax = base of shiftvec[]
25760 movq mm5, [eax + ebx*4] ;# move shX/shY to mm5 and shZ to mm6.
25761 movd mm6, [eax + ebx*4 + 8]
25762 movq mm0, mm5
25763 movq mm1, mm5
25764 movq mm2, mm6
25765 punpckldq mm0,mm0 ;# also expand shX,Y,Z in mm0--mm2.
25766 punpckhdq mm1,mm1
25767 punpckldq mm2,mm2
25769 mov ecx, [ebp + mci3320_iinr] ;# ecx = pointer into iinr[]
25770 add dword ptr [ebp + mci3320_iinr], 4 ;# advance pointer
25771 mov ebx, [ecx] ;# ebx=ii
25773 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
25774 mov eax, [ebp + mci3320_pos] ;# eax = base of pos[]
25776 pfadd mm5, [eax + ebx*4] ;# ix = shX + posX (and iy too)
25777 movd mm7, [eax + ebx*4 + 8] ;# cant use direct memory add for 4 bytes (iz)
25778 mov [esp + mci3320_ii3], ebx ;# (use mm7 as temp. storage for iz.)
25779 pfadd mm6, mm7
25780 movq [esp + mci3320_ixO], mm5
25781 movq [esp + mci3320_izO], mm6 ;# upper 4 bytes land in the ixH slot, which is rewritten below
25783 movd mm3, [eax + ebx*4 + 12]
25784 movd mm4, [eax + ebx*4 + 16]
25785 movd mm5, [eax + ebx*4 + 20]
25786 punpckldq mm3, [eax + ebx*4 + 24]
25787 punpckldq mm4, [eax + ebx*4 + 28]
25788 punpckldq mm5, [eax + ebx*4 + 32] ;# coords of H1 in low mm3-mm5, H2 in high
25790 pfadd mm0, mm3
25791 pfadd mm1, mm4
25792 pfadd mm2, mm5
25793 movq [esp + mci3320_ixH], mm0
25794 movq [esp + mci3320_iyH], mm1
25795 movq [esp + mci3320_izH], mm2
25797 ;# clear the vctot and vnbtot accumulators for this i molecule
25798 pxor mm7,mm7
25799 movq [esp + mci3320_vctot], mm7
25800 movq [esp + mci3320_vnbtot], mm7
25802 mov eax, [ebp + mci3320_jindex]
25803 mov ecx, [eax] ;# jindex[n]
25804 mov edx, [eax + 4] ;# jindex[n+1]
25805 add dword ptr [ebp + mci3320_jindex], 4
25806 sub edx, ecx ;# number of innerloop atoms
25807 mov [esp + mci3320_innerk], edx
25809 mov esi, [ebp + mci3320_pos]
25810 mov eax, [ebp + mci3320_jjnr]
25811 shl ecx, 2
25812 add eax, ecx
25813 mov [esp + mci3320_innerjjnr], eax ;# pointer to jjnr[nj0]
25814 .mci3320_inner_loop:
25815 ;# a single j particle iteration here - compare with the unrolled code for comments.
25816 mov ecx, [esp + mci3320_innerjjnr] ;# ecx = pointer to the current jjnr entry
25817 mov eax, [ecx] ;# eax=jnr offset
25818 add dword ptr [esp + mci3320_innerjjnr], 4 ;# advance pointer
25819 prefetch [ecx + 16] ;# prefetch upcoming jjnr data - trial and error says 16 is best
25821 mov ecx, [ebp + mci3320_charge]
25822 movd mm7, [ecx + eax*4]
25823 punpckldq mm7,mm7
25824 movq mm6,mm7
25825 pfmul mm6, [esp + mci3320_iqO]
25826 pfmul mm7, [esp + mci3320_iqH] ;# mm6=qqO, mm7=qqH
25827 movq [esp + mci3320_qqO], mm6 ;# store all 8 bytes (high lane is 0): the slot is read back as a qword
25828 movq [esp + mci3320_qqH], mm7
25830 mov ecx, [ebp + mci3320_type]
25831 mov edx, [ecx + eax*4] ;# type [jnr]
25832 mov ecx, [ebp + mci3320_nbfp]
25833 shl edx, 1
25834 add edx, [esp + mci3320_ntia] ;# tja = ntia + 2*type
25835 movd mm5, [ecx + edx*4] ;# mm5 = 1st c6
25836 movq [esp + mci3320_c6], mm5
25837 movd mm5, [ecx + edx*4 + 4] ;# mm5 = 1st c12
25838 movq [esp + mci3320_c12], mm5
25840 lea eax, [eax + eax*2] ;# eax = 3*jnr
25842 movq mm0, [esi + eax*4]
25843 movd mm1, [esi + eax*4 + 8]
25844 ;# copy & expand to mm2-mm4 for the H interactions
25845 movq mm2, mm0
25846 movq mm3, mm0
25847 movq mm4, mm1
25848 punpckldq mm2,mm2
25849 punpckhdq mm3,mm3
25850 punpckldq mm4,mm4
25852 pfsubr mm0, [esp + mci3320_ixO]
25853 pfsubr mm1, [esp + mci3320_izO]
25855 pfmul mm0,mm0
25856 pfmul mm1,mm1
25857 pfacc mm0, mm1
25858 pfadd mm0, mm1 ;# mm0=rsqO (low lane)
25860 punpckldq mm2, mm2
25861 punpckldq mm3, mm3
25862 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
25863 pfsubr mm2, [esp + mci3320_ixH]
25864 pfsubr mm3, [esp + mci3320_iyH]
25865 pfsubr mm4, [esp + mci3320_izH] ;# mm2-mm4 is dxH-dzH
25867 pfmul mm2,mm2
25868 pfmul mm3,mm3
25869 pfmul mm4,mm4
25871 pfadd mm3,mm2
25872 pfadd mm3,mm4 ;# mm3=rsqH
25873 movq [esp + mci3320_tmprsqH], mm3 ;# park rsqH until the oxygen is done
25875 pfrsqrt mm1,mm0
25877 movq mm2,mm1
25878 pfmul mm1,mm1
25879 pfrsqit1 mm1,mm0
25880 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
25882 pfmul mm0, mm1 ;# mm0=r
25884 pfmul mm0, [esp + mci3320_tsc]
25885 pf2iw mm4, mm0
25886 movd [esp + mci3320_n1], mm4
25887 pi2fd mm4,mm4
25888 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
25889 movq mm2, mm0
25890 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
25892 ;# coulomb table
25893 mov edx, [ebp + mci3320_VFtab]
25894 mov ecx, [esp + mci3320_n1]
25895 lea ecx, [ecx + ecx*2]
25896 shl ecx, 2 ;# ecx = 12*n0, index of the first float of this table point
25897 ;# load all values we need
25898 movd mm4, [edx + ecx*4]
25899 movd mm5, [edx + ecx*4 + 4]
25900 movd mm6, [edx + ecx*4 + 8]
25901 movd mm7, [edx + ecx*4 + 12]
25903 pfmul mm6, mm0 ;# mm6 = Geps
25904 pfmul mm7, mm2 ;# mm7 = Heps2
25906 pfadd mm5, mm6
25907 pfadd mm5, mm7 ;# mm5 = Fp
25909 pfmul mm5, mm0 ;# mm5=eps*Fp
25910 pfadd mm5, mm4 ;# mm5= VV
25912 pfmul mm5, [esp + mci3320_qqO] ;# vcoul=qq*VV
25914 ;# update vctot directly
25915 pfadd mm5, [esp + mci3320_vctot]
25916 movq [esp + mci3320_vctot], mm5
25918 ;# dispersion table
25919 ;# load all the table values we need
25920 movd mm4, [edx + ecx*4 + 16]
25921 movd mm5, [edx + ecx*4 + 20]
25922 movd mm6, [edx + ecx*4 + 24]
25923 movd mm7, [edx + ecx*4 + 28]
25924 pfmul mm6, mm0 ;# mm6 = Geps
25925 pfmul mm7, mm2 ;# mm7 = Heps2
25926 pfadd mm5, mm6
25927 pfadd mm5, mm7 ;# mm5 = Fp
25928 pfmul mm5, mm0 ;# mm5=eps*Fp
25929 pfadd mm5, mm4 ;# mm5= VV
25931 movq mm4, [esp + mci3320_c6]
25932 pfmul mm5, mm4 ;# vnb6
25933 ;# update vnbtot to release mm5!
25934 pfadd mm5, [esp + mci3320_vnbtot] ;# add the earlier value
25935 movq [esp + mci3320_vnbtot], mm5 ;# store the sum
25937 ;# repulsion table
25938 ;# load all the table values we need
25939 movd mm4, [edx + ecx*4 + 32]
25940 movd mm5, [edx + ecx*4 + 36]
25941 movd mm6, [edx + ecx*4 + 40]
25942 movd mm7, [edx + ecx*4 + 44]
25944 pfmul mm6, mm0 ;# mm6 = Geps
25945 pfmul mm7, mm2 ;# mm7 = Heps2
25946 pfadd mm5, mm6
25947 pfadd mm5, mm7 ;# mm5 = Fp
25948 pfmul mm5, mm0 ;# mm5=eps*Fp
25949 pfadd mm5, mm4 ;# mm5= VV
25951 movq mm6, [esp + mci3320_c12]
25952 pfmul mm5, mm6 ;# vnb12
25953 ;# update vnbtot
25954 pfadd mm5, [esp + mci3320_vnbtot] ;# add the earlier value
25955 movq [esp + mci3320_vnbtot], mm5 ;# store the sum
25957 ;# now do the two hydrogens.
25958 movq mm0, [esp + mci3320_tmprsqH] ;# mm0=rsqH
25960 pfrsqrt mm1, mm0
25961 pswapd mm0,mm0
25962 pfrsqrt mm2, mm0
25963 pswapd mm0,mm0
25964 punpckldq mm1,mm2 ;# seeds are in mm1 now, and rsq in mm0.
25966 movq mm2, mm1
25967 pfmul mm1,mm1
25968 pfrsqit1 mm1,mm0
25969 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
25971 pfmul mm0,mm1 ;# mm0=r
25972 pfmul mm0, [esp + mci3320_tsc]
25973 pf2iw mm4, mm0
25974 movq [esp + mci3320_n1], mm4 ;# two table indices: H1 at n1, H2 at n1+4
25975 pi2fd mm4,mm4
25976 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
25977 movq mm2, mm0
25978 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
25980 ;# coulomb table
25981 mov edx, [ebp + mci3320_VFtab]
25982 mov ecx, [esp + mci3320_n1]
25983 lea ecx, [ecx + ecx*2]
25984 shl ecx, 2
25985 ;# load all values we need
25986 movd mm4, [edx + ecx*4]
25987 movd mm5, [edx + ecx*4 + 4]
25988 movd mm6, [edx + ecx*4 + 8]
25989 movd mm7, [edx + ecx*4 + 12]
25990 mov ecx, [esp + mci3320_n1 + 4]
25991 lea ecx, [ecx + ecx*2]
25992 shl ecx, 2
25993 punpckldq mm4, [edx + ecx*4]
25994 punpckldq mm5, [edx + ecx*4 + 4]
25995 punpckldq mm6, [edx + ecx*4 + 8]
25996 punpckldq mm7, [edx + ecx*4 + 12]
25999 pfmul mm6, mm0 ;# mm6 = Geps
26000 pfmul mm7, mm2 ;# mm7 = Heps2
26002 pfadd mm5, mm6
26003 pfadd mm5, mm7 ;# mm5 = Fp
26005 pfmul mm5, mm0 ;# mm5=eps*Fp
26006 pfadd mm5, mm4 ;# mm5= VV
26008 pfmul mm5, [esp + mci3320_qqH] ;# vcoul=qq*VV
26009 ;# update vctot
26010 pfadd mm5, [esp + mci3320_vctot]
26011 movq [esp + mci3320_vctot], mm5
26013 ;# done - one more?
26014 dec dword ptr [esp + mci3320_innerk]
26015 jz .mci3320_updateouterdata
26016 jmp .mci3320_inner_loop
26017 .mci3320_updateouterdata:
26018 mov edx, [ebp + mci3320_gid] ;# get group index for this i particle
26019 mov edx, [edx]
26020 add dword ptr [ebp + mci3320_gid], 4 ;# advance pointer
26022 movq mm7, [esp + mci3320_vctot]
26023 pfacc mm7,mm7 ;# get and sum the two parts of total potential
26025 mov eax, [ebp + mci3320_Vc]
26026 movd mm6, [eax + edx*4]
26027 pfadd mm6, mm7
26028 movd [eax + edx*4], mm6 ;# increment vc[gid]
26030 movq mm7, [esp + mci3320_vnbtot]
26031 pfacc mm7,mm7 ;# same for Vnb
26033 mov eax, [ebp + mci3320_Vnb]
26034 movd mm6, [eax + edx*4]
26035 pfadd mm6, mm7
26036 movd [eax + edx*4], mm6 ;# increment vnb[gid]
26037 ;# finish if last
26038 dec dword ptr [ebp + mci3320_nri]
26039 jz .mci3320_end
26040 ;# not last, iterate once more!
26041 jmp .mci3320_outer
26042 .mci3320_end:
26043 femms
26044 add esp, 144 ;# release the locals reserved in the prologue
26045 pop edi
26046 pop esi
26047 pop edx
26048 pop ecx
26049 pop ebx
26050 pop eax
26051 leave
26052 ret ;# return to the caller - without it execution runs on into the code that follows
;# ---------------------------------------------------------------------
;# mcinl3330_3dnow
;#
;# Energy-only inner loop for pairs of three-site molecules (O, H1, H2
;# stored consecutively in pos[]). For every i/j molecule pair:
;#   O(i) - O(j)      : tabulated coulomb (qqOO) + dispersion (c6) + repulsion (c12)
;#   H(i) - O(j)      : tabulated coulomb (qqOH), H1/H2 in the low/high lane
;#   O(i) - H1/H2(j)  : tabulated coulomb (qqOH)
;#   H(i) - H1/H2(j)  : tabulated coulomb (qqHH), H1/H2 of i in the low/high lane
;# Only Vc[gid] and Vnb[gid] are written; no forces are produced.
;#
;# Arguments are 32-bit values on the stack, addressed through ebp with
;# the .equiv offsets below. The nri, iinr, jindex, shift and gid argument
;# slots are used as running counters/pointers and are modified in place.
;#
;# The charge products and c6/c12 are computed once, before the outer
;# loop, from the first iinr[] entry only (charge[ii], charge[ii+1],
;# type[ii]); they are then reused for every i and j molecule.
;# The loops are entered without a count check.
;#
;# VFtab layout used here: 12 floats per table point - coulomb at byte
;# offsets 0..12, dispersion at 16..28, repulsion at 32..44.
;# ---------------------------------------------------------------------
26056 .globl mcinl3330_3dnow
26057 .globl _mcinl3330_3dnow
26058 mcinl3330_3dnow:
26059 _mcinl3330_3dnow:
;# offsets of the stack arguments relative to ebp
26060 .equiv mci3330_nri, 8
26061 .equiv mci3330_iinr, 12
26062 .equiv mci3330_jindex, 16
26063 .equiv mci3330_jjnr, 20
26064 .equiv mci3330_shift, 24
26065 .equiv mci3330_shiftvec, 28
26066 .equiv mci3330_gid, 32
26067 .equiv mci3330_pos, 36
26068 .equiv mci3330_charge, 40
26069 .equiv mci3330_facel, 44
26070 .equiv mci3330_Vc, 48
26071 .equiv mci3330_type, 52
26072 .equiv mci3330_ntype, 56
26073 .equiv mci3330_nbfp, 60
26074 .equiv mci3330_Vnb, 64
26075 .equiv mci3330_tabscale, 68
26076 .equiv mci3330_VFtab, 72
26077 ;# stack offsets for local variables (relative to esp, 132 bytes in total)
26078 .equiv mci3330_is3, 0
26079 .equiv mci3330_ii3, 4
26080 .equiv mci3330_ixO, 8
26081 .equiv mci3330_iyO, 12
26082 .equiv mci3330_izO, 16
26083 .equiv mci3330_ixH, 20
26084 .equiv mci3330_iyH, 28
26085 .equiv mci3330_izH, 36
26086 .equiv mci3330_qqOO, 44
26087 .equiv mci3330_qqOH, 52
26088 .equiv mci3330_qqHH, 60
26089 .equiv mci3330_c6, 68
26090 .equiv mci3330_c12, 76
26091 .equiv mci3330_n1, 84
26092 .equiv mci3330_tsc, 92
26093 .equiv mci3330_vctot, 100
26094 .equiv mci3330_vnbtot, 108
26095 .equiv mci3330_innerjjnr, 116
26096 .equiv mci3330_innerk, 120
26097 .equiv mci3330_tmprsqH, 124
26098 push ebp
26099 mov ebp,esp
26100 push eax
26101 push ebx
26102 push ecx
26103 push edx
26104 push esi
26105 push edi
26106 sub esp, 132 ;# local stack space
26107 femms
26108 ;# assume we have at least one i particle - start directly
;# one-time setup: charge products and c6/c12 from the first i molecule
26110 mov ecx, [ebp + mci3330_iinr] ;# ecx = pointer into iinr[]
26111 mov ebx, [ecx] ;# ebx=ii
26113 mov edx, [ebp + mci3330_charge]
26114 movd mm1, [ebp + mci3330_facel] ;# mm1=facel
26115 movd mm2, [edx + ebx*4] ;# mm2=charge[ii0] (O)
26116 movd mm3, [edx + ebx*4 + 4] ;# mm3=charge[ii0+1] (H)
26117 movq mm4, mm2
26118 pfmul mm4, mm1 ;# mm4=facel*qO
26119 movq mm6, mm3
26120 pfmul mm6, mm1 ;# mm6=facel*qH
26121 movq mm5, mm4
26122 pfmul mm4, mm2 ;# mm4=qqOO*facel
26123 pfmul mm5, mm3 ;# mm5=qqOH*facel
26124 pfmul mm6, mm3 ;# mm6=qqHH*facel
26125 punpckldq mm5,mm5 ;# spread to both halves
26126 punpckldq mm6,mm6 ;# spread to both halves
26127 movq [esp + mci3330_qqOO], mm4 ;# low lane only, high lane is 0
26128 movq [esp + mci3330_qqOH], mm5
26129 movq [esp + mci3330_qqHH], mm6
26130 mov edx, [ebp + mci3330_type]
26131 mov ecx, [edx + ebx*4]
26132 shl ecx, 1
26133 mov edx, ecx
26134 imul ecx, [ebp + mci3330_ntype]
26135 add edx, ecx ;# edx = 2*type[ii]*(ntype+1): nbfp entry for the type paired with itself
26136 mov eax, [ebp + mci3330_nbfp]
26137 movd mm0, [eax + edx*4] ;# mm0 = c6
26138 movd mm1, [eax + edx*4 + 4] ;# mm1 = c12
26139 movq [esp + mci3330_c6], mm0
26140 movq [esp + mci3330_c12], mm1
26141 movd mm3, [ebp + mci3330_tabscale]
26142 punpckldq mm3,mm3 ;# tabscale in both halves
26143 movq [esp + mci3330_tsc], mm3
26144 .mci3330_outer:
26145 mov eax, [ebp + mci3330_shift] ;# eax = pointer into shift[]
26146 mov ebx, [eax] ;# ebx=shift[n]
26147 add dword ptr [ebp + mci3330_shift], 4 ;# advance pointer one step
26149 lea ebx, [ebx + ebx*2] ;# ebx=3*is
26150 mov [esp + mci3330_is3],ebx ;# store is3
26152 mov eax, [ebp + mci3330_shiftvec] ;# eax = base of shiftvec[]
26154 movq mm5, [eax + ebx*4] ;# move shX/shY to mm5 and shZ to mm6.
26155 movd mm6, [eax + ebx*4 + 8]
26156 movq mm0, mm5
26157 movq mm1, mm5
26158 movq mm2, mm6
26159 punpckldq mm0,mm0 ;# also expand shX,Y,Z in mm0--mm2.
26160 punpckhdq mm1,mm1
26161 punpckldq mm2,mm2
26163 mov ecx, [ebp + mci3330_iinr] ;# ecx = pointer into iinr[]
26164 add dword ptr [ebp + mci3330_iinr], 4 ;# advance pointer
26165 mov ebx, [ecx] ;# ebx=ii
26167 lea ebx, [ebx + ebx*2] ;# ebx = 3*ii=ii3
26168 mov eax, [ebp + mci3330_pos] ;# eax = base of pos[]
26170 pfadd mm5, [eax + ebx*4] ;# ix = shX + posX (and iy too)
26171 movd mm7, [eax + ebx*4 + 8] ;# cant use direct memory add for 4 bytes (iz)
26172 mov [esp + mci3330_ii3], ebx ;# (use mm7 as temp. storage for iz.)
26173 pfadd mm6, mm7
26174 movq [esp + mci3330_ixO], mm5
26175 movq [esp + mci3330_izO], mm6 ;# upper 4 bytes land in the ixH slot, which is rewritten below
26177 movd mm3, [eax + ebx*4 + 12]
26178 movd mm4, [eax + ebx*4 + 16]
26179 movd mm5, [eax + ebx*4 + 20]
26180 punpckldq mm3, [eax + ebx*4 + 24]
26181 punpckldq mm4, [eax + ebx*4 + 28]
26182 punpckldq mm5, [eax + ebx*4 + 32] ;# coords of H1 in low mm3-mm5, H2 in high
26184 pfadd mm0, mm3
26185 pfadd mm1, mm4
26186 pfadd mm2, mm5
26187 movq [esp + mci3330_ixH], mm0
26188 movq [esp + mci3330_iyH], mm1
26189 movq [esp + mci3330_izH], mm2
26191 ;# clear the vctot and vnbtot accumulators for this i molecule
26192 pxor mm7,mm7
26193 movq [esp + mci3330_vctot], mm7
26194 movq [esp + mci3330_vnbtot], mm7
26196 mov eax, [ebp + mci3330_jindex]
26197 mov ecx, [eax] ;# jindex[n]
26198 mov edx, [eax + 4] ;# jindex[n+1]
26199 add dword ptr [ebp + mci3330_jindex], 4
26200 sub edx, ecx ;# number of innerloop atoms
26201 mov [esp + mci3330_innerk], edx
26203 mov esi, [ebp + mci3330_pos]
26204 mov eax, [ebp + mci3330_jjnr]
26205 shl ecx, 2
26206 add eax, ecx
26207 mov [esp + mci3330_innerjjnr], eax ;# pointer to jjnr[nj0]
26208 .mci3330_inner_loop:
26209 ;# a single j particle iteration here - compare with the unrolled code for comments.
26210 mov eax, [esp + mci3330_innerjjnr]
26211 mov eax, [eax] ;# eax=jnr offset
26212 add dword ptr [esp + mci3330_innerjjnr], 4 ;# advance pointer
26214 lea eax, [eax + eax*2] ;# eax = 3*jnr, kept for all three j sites below
;# --- j oxygen ---
26216 movq mm0, [esi + eax*4]
26217 movd mm1, [esi + eax*4 + 8]
26218 ;# copy & expand to mm2-mm4 for the H interactions
26219 movq mm2, mm0
26220 movq mm3, mm0
26221 movq mm4, mm1
26222 punpckldq mm2,mm2
26223 punpckhdq mm3,mm3
26224 punpckldq mm4,mm4
26226 pfsubr mm0, [esp + mci3330_ixO]
26227 pfsubr mm1, [esp + mci3330_izO]
26229 pfmul mm0,mm0
26230 pfmul mm1,mm1
26231 pfacc mm0, mm0 ;# low lane = dx2+dy2
26232 pfadd mm0, mm1 ;# mm0=rsqO (low lane)
26234 punpckldq mm2, mm2
26235 punpckldq mm3, mm3
26236 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
26237 pfsubr mm2, [esp + mci3330_ixH]
26238 pfsubr mm3, [esp + mci3330_iyH]
26239 pfsubr mm4, [esp + mci3330_izH] ;# mm2-mm4 is dxH-dzH
26241 pfmul mm2,mm2
26242 pfmul mm3,mm3
26243 pfmul mm4,mm4
26245 pfadd mm3,mm2
26246 pfadd mm3,mm4 ;# mm3=rsqH
26247 movq [esp + mci3330_tmprsqH], mm3 ;# park rsqH until the oxygen is done
26249 pfrsqrt mm1,mm0
26251 movq mm2,mm1
26252 pfmul mm1,mm1
26253 pfrsqit1 mm1,mm0
26254 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
26255 pfmul mm0, mm1 ;# mm0=r
26257 pfmul mm0, [esp + mci3330_tsc]
26258 pf2iw mm4, mm0
26259 movd [esp + mci3330_n1], mm4
26260 pi2fd mm4,mm4
26261 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
26262 movq mm2, mm0
26263 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
26265 ;# coulomb table
26266 mov edx, [ebp + mci3330_VFtab]
26267 mov ecx, [esp + mci3330_n1]
26268 lea ecx, [ecx + ecx*2]
26269 shl ecx, 2 ;# ecx = 12*n0, index of the first float of this table point
26271 ;# load all values we need
26272 movd mm4, [edx + ecx*4]
26273 movd mm5, [edx + ecx*4 + 4]
26274 movd mm6, [edx + ecx*4 + 8]
26275 movd mm7, [edx + ecx*4 + 12]
26277 pfmul mm6, mm0 ;# mm6 = Geps
26278 pfmul mm7, mm2 ;# mm7 = Heps2
26280 pfadd mm5, mm6
26281 pfadd mm5, mm7 ;# mm5 = Fp
26283 pfmul mm5, mm0 ;# mm5=eps*Fp
26284 pfadd mm5, mm4 ;# mm5= VV
26286 pfmul mm5, [esp + mci3330_qqOO] ;# vcoul=qq*VV
26287 ;# update vctot directly
26288 pfadd mm5, [esp + mci3330_vctot]
26289 movq [esp + mci3330_vctot], mm5
26291 ;# dispersion table
26292 ;# load all the table values we need
26293 movd mm4, [edx + ecx*4 + 16]
26294 movd mm5, [edx + ecx*4 + 20]
26295 movd mm6, [edx + ecx*4 + 24]
26296 movd mm7, [edx + ecx*4 + 28]
26297 pfmul mm6, mm0 ;# mm6 = Geps
26298 pfmul mm7, mm2 ;# mm7 = Heps2
26299 pfadd mm5, mm6
26300 pfadd mm5, mm7 ;# mm5 = Fp
26301 pfmul mm5, mm0 ;# mm5=eps*Fp
26302 pfadd mm5, mm4 ;# mm5= VV
26304 movq mm4, [esp + mci3330_c6]
26305 pfmul mm5, mm4 ;# vnb6
26306 ;# update vnbtot to release mm5!
26307 pfadd mm5, [esp + mci3330_vnbtot] ;# add the earlier value
26308 movq [esp + mci3330_vnbtot], mm5 ;# store the sum
26310 ;# repulsion table
26311 ;# load all the table values we need
26312 movd mm4, [edx + ecx*4 + 32]
26313 movd mm5, [edx + ecx*4 + 36]
26314 movd mm6, [edx + ecx*4 + 40]
26315 movd mm7, [edx + ecx*4 + 44]
26317 pfmul mm6, mm0 ;# mm6 = Geps
26318 pfmul mm7, mm2 ;# mm7 = Heps2
26319 pfadd mm5, mm6
26320 pfadd mm5, mm7 ;# mm5 = Fp
26321 pfmul mm5, mm0 ;# mm5=eps*Fp
26322 pfadd mm5, mm4 ;# mm5= VV
26324 movq mm6, [esp + mci3330_c12]
26325 pfmul mm5, mm6 ;# vnb12
26326 ;# (energy only - no force scalar is formed in this loop)
26327 ;# update vnbtot
26328 pfadd mm5, [esp + mci3330_vnbtot] ;# add the earlier value
26329 movq [esp + mci3330_vnbtot], mm5 ;# store the sum
26331 ;# Ready with the oxygen - time for hydrogens
26333 movq mm0, [esp + mci3330_tmprsqH]
26335 pfrsqrt mm1, mm0
26336 pswapd mm0,mm0
26337 pfrsqrt mm2, mm0
26338 pswapd mm0,mm0
26339 punpckldq mm1,mm2 ;# seeds are in mm1 now, and rsq in mm0.
26341 movq mm2, mm1
26342 pfmul mm1,mm1
26343 pfrsqit1 mm1,mm0
26344 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
26346 pfmul mm0,mm1 ;# mm0=r
26347 pfmul mm0, [esp + mci3330_tsc]
26348 pf2iw mm4, mm0
26349 movq [esp + mci3330_n1], mm4 ;# two table indices: H1 at n1, H2 at n1+4
26350 pi2fd mm4,mm4
26351 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
26352 movq mm2, mm0
26353 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
26355 ;# coulomb table
26356 mov edx, [ebp + mci3330_VFtab]
26357 mov ecx, [esp + mci3330_n1]
26358 lea ecx, [ecx + ecx*2]
26359 shl ecx, 2
26360 ;# load all values we need
26361 movd mm4, [edx + ecx*4]
26362 movd mm5, [edx + ecx*4 + 4]
26363 movd mm6, [edx + ecx*4 + 8]
26364 movd mm7, [edx + ecx*4 + 12]
26365 mov ecx, [esp + mci3330_n1 + 4]
26366 lea ecx, [ecx + ecx*2]
26367 shl ecx, 2
26368 punpckldq mm4, [edx + ecx*4]
26369 punpckldq mm5, [edx + ecx*4 + 4]
26370 punpckldq mm6, [edx + ecx*4 + 8]
26371 punpckldq mm7, [edx + ecx*4 + 12]
26373 pfmul mm6, mm0 ;# mm6 = Geps
26374 pfmul mm7, mm2 ;# mm7 = Heps2
26376 pfadd mm5, mm6
26377 pfadd mm5, mm7 ;# mm5 = Fp
26379 pfmul mm5, mm0 ;# mm5=eps*Fp
26380 pfadd mm5, mm4 ;# mm5= VV
26382 pfmul mm5, [esp + mci3330_qqOH] ;# vcoul=qq*VV
26383 ;# update vctot
26384 pfadd mm5, [esp + mci3330_vctot]
26385 movq [esp + mci3330_vctot], mm5
26387 ;# interactions with j H1
26388 movq mm0, [esi + eax*4 + 12]
26389 movd mm1, [esi + eax*4 + 20]
26390 ;# copy & expand to mm2-mm4 for the H interactions
26391 movq mm2, mm0
26392 movq mm3, mm0
26393 movq mm4, mm1
26394 punpckldq mm2,mm2
26395 punpckhdq mm3,mm3
26396 punpckldq mm4,mm4
26398 pfsubr mm0, [esp + mci3330_ixO]
26399 pfsubr mm1, [esp + mci3330_izO]
26401 pfmul mm0,mm0
26402 pfmul mm1,mm1
26403 pfacc mm0, mm1
26404 pfadd mm0, mm1 ;# mm0=rsq between i O and j H1 (low lane)
26406 punpckldq mm2, mm2
26407 punpckldq mm3, mm3
26408 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
26409 pfsubr mm2, [esp + mci3330_ixH]
26410 pfsubr mm3, [esp + mci3330_iyH]
26411 pfsubr mm4, [esp + mci3330_izH] ;# mm2-mm4 is dxH-dzH
26413 pfmul mm2,mm2
26414 pfmul mm3,mm3
26415 pfmul mm4,mm4
26417 pfadd mm3,mm2
26418 pfadd mm3,mm4 ;# mm3=rsqH
26419 movq [esp + mci3330_tmprsqH], mm3
26421 pfrsqrt mm1,mm0
26423 movq mm2,mm1
26424 pfmul mm1,mm1
26425 pfrsqit1 mm1,mm0
26426 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
26427 pfmul mm0, mm1 ;# mm0=r
26429 pfmul mm0, [esp + mci3330_tsc]
26430 pf2iw mm4, mm0
26431 movd [esp + mci3330_n1], mm4
26432 pi2fd mm4,mm4
26433 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
26434 movq mm2, mm0
26435 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
26437 ;# coulomb table
26438 mov edx, [ebp + mci3330_VFtab]
26439 mov ecx, [esp + mci3330_n1]
26440 lea ecx, [ecx + ecx*2]
26441 shl ecx, 2
26443 ;# load all values we need
26444 movd mm4, [edx + ecx*4]
26445 movd mm5, [edx + ecx*4 + 4]
26446 movd mm6, [edx + ecx*4 + 8]
26447 movd mm7, [edx + ecx*4 + 12]
26449 pfmul mm6, mm0 ;# mm6 = Geps
26450 pfmul mm7, mm2 ;# mm7 = Heps2
26452 pfadd mm5, mm6
26453 pfadd mm5, mm7 ;# mm5 = Fp
26455 pfmul mm5, mm0 ;# mm5=eps*Fp
26456 pfadd mm5, mm4 ;# mm5= VV
26458 pfmul mm5, [esp + mci3330_qqOH] ;# vcoul=qq*VV
26459 ;# update vctot directly
26460 pfadd mm5, [esp + mci3330_vctot]
26461 movq [esp + mci3330_vctot], mm5
;# i hydrogens against j H1
26463 movq mm0, [esp + mci3330_tmprsqH]
26464 pfrsqrt mm1, mm0
26465 pswapd mm0,mm0
26466 pfrsqrt mm2, mm0
26467 pswapd mm0,mm0
26468 punpckldq mm1,mm2 ;# seeds are in mm1 now, and rsq in mm0.
26470 movq mm2, mm1
26471 pfmul mm1,mm1
26472 pfrsqit1 mm1,mm0
26473 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
26475 pfmul mm0,mm1 ;# mm0=r
26476 pfmul mm0, [esp + mci3330_tsc]
26477 pf2iw mm4, mm0
26478 movq [esp + mci3330_n1], mm4
26479 pi2fd mm4,mm4
26480 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
26481 movq mm2, mm0
26482 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
26484 ;# coulomb table
26485 mov edx, [ebp + mci3330_VFtab]
26486 mov ecx, [esp + mci3330_n1]
26487 lea ecx, [ecx + ecx*2]
26488 shl ecx, 2
26489 ;# load all values we need
26490 movd mm4, [edx + ecx*4]
26491 movd mm5, [edx + ecx*4 + 4]
26492 movd mm6, [edx + ecx*4 + 8]
26493 movd mm7, [edx + ecx*4 + 12]
26494 mov ecx, [esp + mci3330_n1 + 4]
26495 lea ecx, [ecx + ecx*2]
26496 shl ecx, 2
26497 punpckldq mm4, [edx + ecx*4]
26498 punpckldq mm5, [edx + ecx*4 + 4]
26499 punpckldq mm6, [edx + ecx*4 + 8]
26500 punpckldq mm7, [edx + ecx*4 + 12]
26503 pfmul mm6, mm0 ;# mm6 = Geps
26504 pfmul mm7, mm2 ;# mm7 = Heps2
26506 pfadd mm5, mm6
26507 pfadd mm5, mm7 ;# mm5 = Fp
26509 pfmul mm5, mm0 ;# mm5=eps*Fp
26510 pfadd mm5, mm4 ;# mm5= VV
26512 pfmul mm5, [esp + mci3330_qqHH] ;# vcoul=qq*VV
26513 ;# update vctot
26514 pfadd mm5, [esp + mci3330_vctot]
26515 movq [esp + mci3330_vctot], mm5
26517 ;# interactions with j H2
26518 movq mm0, [esi + eax*4 + 24]
26519 movd mm1, [esi + eax*4 + 32]
26520 ;# copy & expand to mm2-mm4 for the H interactions
26521 movq mm2, mm0
26522 movq mm3, mm0
26523 movq mm4, mm1
26524 punpckldq mm2,mm2
26525 punpckhdq mm3,mm3
26526 punpckldq mm4,mm4
26528 pfsubr mm0, [esp + mci3330_ixO]
26529 pfsubr mm1, [esp + mci3330_izO]
26531 pfmul mm0,mm0
26532 pfmul mm1,mm1
26533 pfacc mm0, mm1
26534 pfadd mm0, mm1 ;# mm0=rsq between i O and j H2 (low lane)
26536 punpckldq mm2, mm2
26537 punpckldq mm3, mm3
26538 punpckldq mm4, mm4 ;# mm2-mm4 is jx-jz
26539 pfsubr mm2, [esp + mci3330_ixH]
26540 pfsubr mm3, [esp + mci3330_iyH]
26541 pfsubr mm4, [esp + mci3330_izH] ;# mm2-mm4 is dxH-dzH
26543 pfmul mm2,mm2
26544 pfmul mm3,mm3
26545 pfmul mm4,mm4
26547 pfadd mm3,mm2
26548 pfadd mm3,mm4 ;# mm3=rsqH
26549 movq [esp + mci3330_tmprsqH], mm3
26551 pfrsqrt mm1,mm0
26553 movq mm2,mm1
26554 pfmul mm1,mm1
26555 pfrsqit1 mm1,mm0
26556 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
26557 pfmul mm0, mm1 ;# mm0=r
26559 pfmul mm0, [esp + mci3330_tsc]
26560 pf2iw mm4, mm0
26561 movd [esp + mci3330_n1], mm4
26562 pi2fd mm4,mm4
26563 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
26564 movq mm2, mm0
26565 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
26567 ;# coulomb table
26568 mov edx, [ebp + mci3330_VFtab]
26569 mov ecx, [esp + mci3330_n1]
26570 lea ecx, [ecx + ecx*2]
26571 shl ecx, 2
26573 ;# load all values we need
26574 movd mm4, [edx + ecx*4]
26575 movd mm5, [edx + ecx*4 + 4]
26576 movd mm6, [edx + ecx*4 + 8]
26577 movd mm7, [edx + ecx*4 + 12]
26579 pfmul mm6, mm0 ;# mm6 = Geps
26580 pfmul mm7, mm2 ;# mm7 = Heps2
26582 pfadd mm5, mm6
26583 pfadd mm5, mm7 ;# mm5 = Fp
26585 pfmul mm5, mm0 ;# mm5=eps*Fp
26586 pfadd mm5, mm4 ;# mm5= VV
26588 pfmul mm5, [esp + mci3330_qqOH] ;# vcoul=qq*VV
26589 ;# update vctot directly
26590 pfadd mm5, [esp + mci3330_vctot]
26591 movq [esp + mci3330_vctot], mm5
;# i hydrogens against j H2
26593 movq mm0, [esp + mci3330_tmprsqH]
26594 pfrsqrt mm1, mm0
26595 pswapd mm0,mm0
26596 pfrsqrt mm2, mm0
26597 pswapd mm0,mm0
26598 punpckldq mm1,mm2 ;# seeds are in mm1 now, and rsq in mm0.
26600 movq mm2, mm1
26601 pfmul mm1,mm1
26602 pfrsqit1 mm1,mm0
26603 pfrcpit2 mm1,mm2 ;# mm1=invsqrt
26605 pfmul mm0,mm1 ;# mm0=r
26606 pfmul mm0, [esp + mci3330_tsc]
26607 pf2iw mm4, mm0
26608 movq [esp + mci3330_n1], mm4
26609 pi2fd mm4,mm4
26610 pfsub mm0, mm4 ;# now mm0 is eps and mm4 n0
26611 movq mm2, mm0
26612 pfmul mm2, mm2 ;# mm0 is eps, mm2 eps2
26614 ;# coulomb table
26615 mov edx, [ebp + mci3330_VFtab]
26616 mov ecx, [esp + mci3330_n1]
26617 lea ecx, [ecx + ecx*2]
26618 shl ecx, 2
26619 ;# load all values we need
26620 movd mm4, [edx + ecx*4]
26621 movd mm5, [edx + ecx*4 + 4]
26622 movd mm6, [edx + ecx*4 + 8]
26623 movd mm7, [edx + ecx*4 + 12]
26624 mov ecx, [esp + mci3330_n1 + 4] ;# table index for the second i hydrogen
26625 lea ecx, [ecx + ecx*2]
26626 shl ecx, 2
26627 punpckldq mm4, [edx + ecx*4]
26628 punpckldq mm5, [edx + ecx*4 + 4]
26629 punpckldq mm6, [edx + ecx*4 + 8]
26630 punpckldq mm7, [edx + ecx*4 + 12]
26633 pfmul mm6, mm0 ;# mm6 = Geps
26634 pfmul mm7, mm2 ;# mm7 = Heps2
26636 pfadd mm5, mm6
26637 pfadd mm5, mm7 ;# mm5 = Fp
26639 pfmul mm5, mm0 ;# mm5=eps*Fp
26640 pfadd mm5, mm4 ;# mm5= VV
26642 pfmul mm5, [esp + mci3330_qqHH] ;# vcoul=qq*VV
26643 ;# update vctot
26644 pfadd mm5, [esp + mci3330_vctot]
26645 movq [esp + mci3330_vctot], mm5
26647 ;# done - one more?
26648 dec dword ptr [esp + mci3330_innerk]
26649 jz .mci3330_updateouterdata
26650 jmp .mci3330_inner_loop
26651 .mci3330_updateouterdata:
26652 mov edx, [ebp + mci3330_gid] ;# get group index for this i particle
26653 mov edx, [edx]
26654 add dword ptr [ebp + mci3330_gid], 4 ;# advance pointer
26656 movq mm7, [esp + mci3330_vctot]
26657 pfacc mm7,mm7 ;# get and sum the two parts of total potential
26659 mov eax, [ebp + mci3330_Vc]
26660 movd mm6, [eax + edx*4]
26661 pfadd mm6, mm7
26662 movd [eax + edx*4], mm6 ;# increment vc[gid]
26664 movq mm7, [esp + mci3330_vnbtot]
26665 pfacc mm7,mm7 ;# same for Vnb
26667 mov eax, [ebp + mci3330_Vnb]
26668 movd mm6, [eax + edx*4]
26669 pfadd mm6, mm7
26670 movd [eax + edx*4], mm6 ;# increment vnb[gid]
26671 ;# finish if last
26672 dec dword ptr [ebp + mci3330_nri]
26673 jz .mci3330_end
26674 ;# not last, iterate once more!
26675 jmp .mci3330_outer
26676 .mci3330_end:
26677 femms
26678 add esp, 132 ;# release the locals reserved in the prologue
26679 pop edi
26680 pop esi
26681 pop edx
26682 pop ecx
26683 pop ebx
26684 pop eax
26685 leave
;# NOTE(review): no ret is visible directly after this leave - confirm that
;# a return instruction follows, otherwise execution runs on into the next code.