Renamed intel syntax assembly files to avoid double extensions
[gromacs.git] / src / gmxlib / nonbonded / nb_kernel_x86_64_sse2 / nb_kernel400_x86_64_sse2.intel_syntax.s
blobcdc2d9f689c939856f4c915d43da8f82939a09cb
1 ;#
2 ;#
3 ;# Gromacs 4.0 Copyright (c) 1991-2003
4 ;# David van der Spoel, Erik Lindahl
5 ;#
6 ;# This program is free software; you can redistribute it and/or
7 ;# modify it under the terms of the GNU General Public License
8 ;# as published by the Free Software Foundation; either version 2
9 ;# of the License, or (at your option) any later version.
11 ;# To help us fund GROMACS development, we humbly ask that you cite
12 ;# the research papers on the package. Check out http://www.gromacs.org
13 ;#
14 ;# And Hey:
15 ;# Gnomes, ROck Monsters And Chili Sauce
18 ;# These files require GNU binutils 2.10 or later, since we
19 ;# use intel syntax for portability, or a recent version
20 ;# of NASM that understands Extended 3DNow and SSE2 instructions.
21 ;# (NASM is normally only used with MS Visual C++).
22 ;# Since NASM and gnu as disagree on some definitions and use
23 ;# completely different preprocessing options I have to introduce a
24 ;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
25 ;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
26 ;# reason why all comments need both symbols...
27 ;# The source is written for GNU as, with intel syntax. When you use
28 ;# NASM we redefine a couple of things. The false if-statement around
29 ;# the following code is seen by GNU as, but NASM doesn't see it, so
30 ;# the code inside is read by NASM but not gcc.
;# ---- dual-assembler compatibility shim ----
;# Only NASM reads the %define/%macro lines below: the leading ';' hides
;# '.if 0' from NASM, while gnu as sees '.if 0' (';' is a line break for
;# gas) and skips the whole block up to '.endif'.
31 ; .if 0 # block below only read by NASM
32 %define .section section
33 %define .long dd
34 %define .align align
35 %define .globl global
36 ;# NASM only wants 'dword', not 'dword ptr'.
37 %define ptr
38 %macro .equiv 2
39 %1 equ %2
40 %endmacro
41 ; .endif # End of NASM-specific block
42 ; .intel_syntax noprefix # Line only read by gnu as
;#----------------------------------------------------------------------
;# nb_kernel400_x86_64_sse2 — Generalized-Born Coulomb nonbonded kernel,
;# double precision, SSE2, 2-way unrolled, WITH forces.
;# System V AMD64 entry: the first six arguments arrive in
;# rdi,rsi,rdx,rcx,r8,r9 (spilled to locals in the prologue below);
;# argument seven onward is read rbp-relative at the offsets that follow.
;# The leading-underscore alias covers Mach-O style name mangling.
;#----------------------------------------------------------------------
47 .globl nb_kernel400_x86_64_sse2
48 .globl _nb_kernel400_x86_64_sse2
49 nb_kernel400_x86_64_sse2:
50 _nb_kernel400_x86_64_sse2:
51 ;# Room for return address and rbp (16 bytes)
;# rbp-relative offsets of the stack-passed arguments (args 7+):
52 .equiv nb400_fshift, 16
53 .equiv nb400_gid, 24
54 .equiv nb400_pos, 32
55 .equiv nb400_faction, 40
56 .equiv nb400_charge, 48
57 .equiv nb400_p_facel, 56
58 .equiv nb400_argkrf, 64
59 .equiv nb400_argcrf, 72
60 .equiv nb400_Vc, 80
61 .equiv nb400_type, 88
62 .equiv nb400_p_ntype, 96
63 .equiv nb400_vdwparam, 104
64 .equiv nb400_Vvdw, 112
65 .equiv nb400_p_tabscale, 120
66 .equiv nb400_VFtab, 128
67 .equiv nb400_invsqrta, 136
68 .equiv nb400_dvda, 144
69 .equiv nb400_p_gbtabscale, 152
70 .equiv nb400_GBtab, 160
71 .equiv nb400_p_nthreads, 168
72 .equiv nb400_count, 176
73 .equiv nb400_mtx, 184
74 .equiv nb400_outeriter, 192
75 .equiv nb400_inneriter, 200
76 .equiv nb400_work, 208
77 ;# stack offsets for local variables
78 ;# bottom of stack is cache-aligned for sse2 use
;# (offsets are rsp-relative; 16-byte slots hold splatted xmm pairs)
79 .equiv nb400_ix, 0
80 .equiv nb400_iy, 16
81 .equiv nb400_iz, 32
82 .equiv nb400_iq, 48
83 .equiv nb400_dx, 64
84 .equiv nb400_dy, 80
85 .equiv nb400_dz, 96
86 .equiv nb400_two, 112
87 .equiv nb400_gbtsc, 128
88 .equiv nb400_qq, 144
89 .equiv nb400_r, 160
90 .equiv nb400_vctot, 176
91 .equiv nb400_fix, 192
92 .equiv nb400_fiy, 208
93 .equiv nb400_fiz, 224
94 .equiv nb400_half, 240
95 .equiv nb400_three, 256
96 .equiv nb400_isai, 272
97 .equiv nb400_isaprod, 288
98 .equiv nb400_dvdasum, 304
99 .equiv nb400_gbscale, 320
100 .equiv nb400_nri, 336
101 .equiv nb400_iinr, 344
102 .equiv nb400_jindex, 352
103 .equiv nb400_jjnr, 360
104 .equiv nb400_shift, 368
105 .equiv nb400_shiftvec, 376
106 .equiv nb400_facel, 384
107 .equiv nb400_innerjjnr, 392
108 .equiv nb400_is3, 400
109 .equiv nb400_ii3, 404
110 .equiv nb400_ii, 408
111 .equiv nb400_innerk, 412
112 .equiv nb400_n, 416
113 .equiv nb400_nn1, 420
114 .equiv nb400_nouter, 424
115 .equiv nb400_ninner, 428
;# Prologue: save callee-saved registers. After the 5 pushes plus the
;# 440-byte (16n+8) frame, rsp is 16-byte aligned as required by the
;# movapd accesses to the locals above.
116 push rbp
117 mov rbp, rsp
118 push rbx
121 emms
123 push r12
124 push r13
125 push r14
126 push r15
128 sub rsp, 440 ;# local variable stack space (n*16+8)
130 ;# zero 32-bit iteration counters
131 mov eax, 0
132 mov [rsp + nb400_nouter], eax
133 mov [rsp + nb400_ninner], eax
;# spill the six register arguments: rdi=p_nri (dereferenced), then
;# iinr, jindex, jjnr, shift, shiftvec pointers
135 mov edi, [rdi]
136 mov [rsp + nb400_nri], edi
137 mov [rsp + nb400_iinr], rsi
138 mov [rsp + nb400_jindex], rdx
139 mov [rsp + nb400_jjnr], rcx
140 mov [rsp + nb400_shift], r8
141 mov [rsp + nb400_shiftvec], r9
142 mov rsi, [rbp + nb400_p_facel]
143 movsd xmm0, [rsi]
144 movsd [rsp + nb400_facel], xmm0
;# splat the GB table scale into both lanes for packed use
146 mov rbx, [rbp + nb400_p_gbtabscale]
147 movsd xmm4, [rbx]
148 shufpd xmm4, xmm4, 0
149 movapd [rsp + nb400_gbtsc], xmm4
151 ;# create constant floating-point factors on stack
;# 0.5 is assembled from its IEEE-754 double bit pattern (0x3fe00000
;# upper word); 1.0, 2.0, 3.0 are then derived by doubling/adding.
152 mov eax, 0x00000000 ;# lower half of double half IEEE (hex)
153 mov ebx, 0x3fe00000
154 mov [rsp + nb400_half], eax
155 mov [rsp + nb400_half+4], ebx
156 movsd xmm1, [rsp + nb400_half]
157 shufpd xmm1, xmm1, 0 ;# splat to all elements
158 movapd xmm3, xmm1
159 addpd xmm3, xmm3 ;# one
160 movapd xmm2, xmm3
161 addpd xmm2, xmm2 ;# two
162 addpd xmm3, xmm2 ;# three
163 movapd [rsp + nb400_half], xmm1
164 movapd [rsp + nb400_two], xmm2
165 movapd [rsp + nb400_three], xmm3
;# Work-sharing spinlock: atomically claim outer-list index nn0 and
;# advance the shared counter to nn1 = nn0+1.
;# BUGFIX: cmpxchg must address through the full 64-bit rsi; the old
;# '[esi]' form truncated the counter pointer to its low 32 bits.
167 .nb400_threadloop:
168 mov rsi, [rbp + nb400_count] ;# pointer to sync counter
169 mov eax, [rsi]
170 .nb400_spinlock:
171 mov ebx, eax ;# ebx=*count=nn0
172 add ebx, 1 ;# ebx=nn1=nn0+1
173 lock
174 cmpxchg [rsi], ebx ;# write nn1 to *counter,
175 ;# if it hasnt changed.
176 ;# or reread *counter to eax.
177 pause ;# -> better p4 performance
178 jnz .nb400_spinlock
180 ;# if(nn1>nri) nn1=nri
181 mov ecx, [rsp + nb400_nri]
182 mov edx, ecx
183 sub ecx, ebx
184 cmovle ebx, edx ;# if(nn1>nri) nn1=nri
185 ;# Cleared the spinlock if we got here.
186 ;# eax contains nn0, ebx contains nn1.
187 mov [rsp + nb400_n], eax
188 mov [rsp + nb400_nn1], ebx
189 sub ebx, eax ;# calc number of outer lists
190 mov esi, eax ;# copy n to esi
;# jg tests the flags from the sub above: skip to end if no lists left
191 jg .nb400_outerstart
192 jmp .nb400_end
194 .nb400_outerstart:
195 ;# ebx contains number of outer iterations
196 add ebx, [rsp + nb400_nouter]
197 mov [rsp + nb400_nouter], ebx
;# ---- outer loop over i particles (one neighborlist per pass) ----
;# Loads the periodic shift vector, i charge (scaled by facel), GB
;# invsqrta[ii], and the shifted i coordinates; splats them to both
;# xmm lanes; clears the per-i accumulators (xmm12=vctot, xmm8=dvdasum,
;# xmm13-15 = i forces) and sets up the inner-loop jjnr pointer.
199 .nb400_outer:
200 mov rax, [rsp + nb400_shift] ;# rax = pointer into shift[]
201 mov ebx, [rax+rsi*4] ;# rbx=shift[n]
203 lea rbx, [rbx + rbx*2] ;# rbx=3*is
204 mov [rsp + nb400_is3],ebx ;# store is3
206 mov rax, [rsp + nb400_shiftvec] ;# rax = base of shiftvec[]
208 movsd xmm0, [rax + rbx*8]
209 movsd xmm1, [rax + rbx*8 + 8]
210 movsd xmm2, [rax + rbx*8 + 16]
212 mov rcx, [rsp + nb400_iinr] ;# rcx = pointer into iinr[]
213 mov ebx, [rcx+rsi*4] ;# ebx =ii
214 mov [rsp + nb400_ii], ebx
216 mov rdx, [rbp + nb400_charge]
217 movsd xmm3, [rdx + rbx*8]
218 mulsd xmm3, [rsp + nb400_facel]
219 shufpd xmm3, xmm3, 0
221 mov rdx, [rbp + nb400_invsqrta] ;# load invsqrta[ii]
222 movsd xmm4, [rdx + rbx*8]
223 shufpd xmm4, xmm4, 0
225 lea rbx, [rbx + rbx*2] ;# rbx = 3*ii=ii3
226 mov rax, [rbp + nb400_pos] ;# rax = base of pos[]
;# i coordinates = pos[ii3] + shiftvec[is3]
228 addsd xmm0, [rax + rbx*8]
229 addsd xmm1, [rax + rbx*8 + 8]
230 addsd xmm2, [rax + rbx*8 + 16]
232 movapd [rsp + nb400_iq], xmm3
233 movapd [rsp + nb400_isai], xmm4
235 shufpd xmm0, xmm0, 0
236 shufpd xmm1, xmm1, 0
237 shufpd xmm2, xmm2, 0
239 movapd [rsp + nb400_ix], xmm0
240 movapd [rsp + nb400_iy], xmm1
241 movapd [rsp + nb400_iz], xmm2
243 mov [rsp + nb400_ii3], ebx
245 ;# clear vctot and i forces
246 xorpd xmm4, xmm4
247 movapd xmm8, xmm4
248 movapd xmm12, xmm4
249 movapd xmm13, xmm4
250 movapd xmm14, xmm4
251 movapd xmm15, xmm4
253 mov rax, [rsp + nb400_jindex]
254 mov ecx, [rax + rsi*4] ;# jindex[n]
255 mov edx, [rax + rsi*4 + 4] ;# jindex[n+1]
256 sub edx, ecx ;# number of innerloop atoms
258 mov rsi, [rbp + nb400_pos]
259 mov rdi, [rbp + nb400_faction]
260 mov rax, [rsp + nb400_jjnr]
261 shl ecx, 2
262 add rax, rcx
263 mov [rsp + nb400_innerjjnr], rax ;# pointer to jjnr[nj0]
264 mov ecx, edx
265 sub edx, 2
266 add ecx, [rsp + nb400_ninner]
267 mov [rsp + nb400_ninner], ecx
268 add edx, 0
;# innerk = natoms-2; the jge below enters the 2-way unrolled loop
;# only when at least 2 atoms remain
269 mov [rsp + nb400_innerk], edx ;# number of innerloop atoms
270 jge .nb400_unroll_loop
271 jmp .nb400_checksingle
;# ---- inner loop, 2-way unrolled: two j atoms per pass ----
;# Register roles: r12/r13 = jnr pair, r8/r9 = 3*jnr coordinate indices,
;# xmm9-11 = dx,dy,dz, xmm12 = vctot accumulator, xmm8 = dvdasum
;# accumulator, xmm13-15 = i-force accumulators.
272 .nb400_unroll_loop:
273 ;# twice unrolled innerloop here
274 mov rdx, [rsp + nb400_innerjjnr] ;# pointer to jjnr[k]
275 mov r12d, [rdx]
276 mov r13d, [rdx + 4]
277 add qword ptr [rsp + nb400_innerjjnr], 8 ;# advance pointer (unrolled 2)
279 mov rsi, [rbp + nb400_pos] ;# base of pos[]
281 lea r8, [r12 + r12*2] ;# j3
282 lea r9, [r13 + r13*2]
284 ;# move two coordinates to xmm4-xmm6
285 movlpd xmm4, [rsi + r8*8]
286 movlpd xmm5, [rsi + r8*8 + 8]
287 movlpd xmm6, [rsi + r8*8 + 16]
288 movhpd xmm4, [rsi + r9*8]
289 movhpd xmm5, [rsi + r9*8 + 8]
290 movhpd xmm6, [rsi + r9*8 + 16]
292 ;# calc dr
;# note: dr = xj - xi, so the force along dr acts on j; the i force
;# is applied with the opposite sign in updateouterdata
293 subpd xmm4, [rsp + nb400_ix]
294 subpd xmm5, [rsp + nb400_iy]
295 subpd xmm6, [rsp + nb400_iz]
298 ;# store dr
299 movapd xmm9, xmm4
300 movapd xmm10, xmm5
301 movapd xmm11, xmm6
303 ;# square it
304 mulpd xmm4,xmm4
305 mulpd xmm5,xmm5
306 mulpd xmm6,xmm6
307 addpd xmm4, xmm5
308 addpd xmm4, xmm6
309 ;# rsq in xmm4
311 mov rsi, [rbp + nb400_invsqrta]
312 movlpd xmm3, [rsi + r12*8]
;# rinv = 1/sqrt(rsq): single-precision rsqrtps seed refined by two
;# Newton-Raphson iterations (lu' = lu*(3-rsq*lu*lu)*0.5) to double prec
314 cvtpd2ps xmm5, xmm4
315 rsqrtps xmm5, xmm5
316 cvtps2pd xmm2, xmm5 ;# lu in low xmm2
;# meanwhile: isaprod = isai*isaj, gbscale = isaprod*gbtabscale
318 movhpd xmm3, [rsi + r13*8]
319 mulpd xmm3, [rsp + nb400_isai]
320 movapd [rsp + nb400_isaprod], xmm3
321 movapd xmm6, xmm3
322 mulpd xmm3, [rsp + nb400_gbtsc]
323 movapd [rsp + nb400_gbscale], xmm3
325 ;# lookup seed in xmm2
326 movapd xmm5, xmm2 ;# copy of lu
327 mulpd xmm2, xmm2 ;# lu*lu
328 movapd xmm1, [rsp + nb400_three]
329 mulpd xmm2, xmm4 ;# rsq*lu*lu
330 movapd xmm0, [rsp + nb400_half]
331 subpd xmm1, xmm2 ;# 30-rsq*lu*lu
332 mulpd xmm1, xmm5
333 mulpd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu)
335 mov rsi, [rbp + nb400_charge] ;# base of charge[]
336 movlpd xmm3, [rsi + r12*8]
338 movapd xmm5, xmm1 ;# copy of lu
339 mulpd xmm1, xmm1 ;# lu*lu
340 movapd xmm2, [rsp + nb400_three]
341 mulpd xmm1, xmm4 ;# rsq*lu*lu
342 movapd xmm0, [rsp + nb400_half]
343 subpd xmm2, xmm1 ;# 30-rsq*lu*lu
344 mulpd xmm2, xmm5
345 mulpd xmm0, xmm2 ;# xmm0=iter2 of rinv (new lu)
346 mulpd xmm4, xmm0 ;# xmm4=r
;# qq = qi*iq(=facel*qi already) * isaprod-screened qj
348 mulpd xmm6, [rsp + nb400_iq]
349 movhpd xmm3, [rsi + r13*8]
350 mulpd xmm3, xmm6
351 movapd [rsp + nb400_qq], xmm3
;# table index = int(r*gbscale); eps = fractional remainder
354 movapd [rsp + nb400_r], xmm4
355 mulpd xmm4, [rsp + nb400_gbscale]
357 cvttpd2pi mm6, xmm4 ;# mm6 = lu idx
358 cvtpi2pd xmm5, mm6
359 subpd xmm4, xmm5
360 movapd xmm1, xmm4 ;# xmm1=eps
362 pslld mm6, 2 ;# idx *= 4
364 mov rsi, [rbp + nb400_GBtab]
365 movd r10d, mm6
366 psrlq mm6, 32
367 movd r11d, mm6 ;# indices in r10/r11
;# gather the cubic-spline table rows (Y,F,G,H) for both atoms and
;# transpose into xmm4=Y, xmm5=F, xmm6=G, xmm7=H
369 movapd xmm4, [rsi + r10*8] ;# Y1 F1
370 movapd xmm3, [rsi + r11*8] ;# Y2 F2
371 movapd xmm5, xmm4
372 unpcklpd xmm4, xmm3 ;# Y1 Y2
373 unpckhpd xmm5, xmm3 ;# F1 F2
375 movapd xmm6, [rsi + r10*8 + 16] ;# G1 H1
376 movapd xmm3, [rsi + r11*8 + 16] ;# G2 H2
377 movapd xmm7, xmm6
378 unpcklpd xmm6, xmm3 ;# G1 G2
379 unpckhpd xmm7, xmm3 ;# H1 H2
380 ;# coulomb table ready, in xmm4-xmm7
382 mulpd xmm7, xmm1 ;# xmm7=Heps
383 mulpd xmm6, xmm1 ;# xmm6=Geps
384 mulpd xmm7, xmm1 ;# xmm7=Heps2
385 addpd xmm5, xmm6
386 addpd xmm5, xmm7 ;# xmm5=Fp
387 addpd xmm7, xmm7 ;# two*Heps2
388 movapd xmm3, [rsp + nb400_qq]
389 addpd xmm7, xmm6
390 addpd xmm7, xmm5 ;# xmm7=FF
391 mulpd xmm5, xmm1 ;# xmm5=eps*Fp
392 addpd xmm5, xmm4 ;# xmm5=VV
393 mulpd xmm5, xmm3 ;# vcoul=qq*VV
394 mulpd xmm3, xmm7 ;# fijC=FF*qq
396 mov rsi, [rbp + nb400_dvda]
398 ;# Calculate dVda
399 xorpd xmm7, xmm7
400 mulpd xmm3, [rsp + nb400_gbscale]
401 movapd xmm6, xmm3
402 mulpd xmm6, [rsp + nb400_r]
403 addpd xmm6, xmm5
405 ;# update vctot
406 addpd xmm12, xmm5
408 ;# xmm6=(vcoul+fijC*r)
409 subpd xmm7, xmm6
410 movapd xmm6, xmm7
412 ;# update dvdasum
413 addpd xmm8, xmm7
415 ;# update j atoms dvdaj
;# scalar read-modify-write of dvda[j] for each of the two j atoms
416 movhlps xmm7, xmm6
417 addsd xmm6, [rsi + r12*8]
418 addsd xmm7, [rsi + r13*8]
419 movsd [rsi + r12*8], xmm6
420 movsd [rsi + r13*8], xmm7
422 ;# the fj's - start by accumulating forces from memory
423 mov rdi, [rbp + nb400_faction]
424 movlpd xmm5, [rdi + r8*8]
425 movlpd xmm6, [rdi + r8*8 + 8]
426 movlpd xmm7, [rdi + r8*8 + 16]
427 movhpd xmm5, [rdi + r9*8]
428 movhpd xmm6, [rdi + r9*8 + 8]
429 movhpd xmm7, [rdi + r9*8 + 16]
;# fscal = -fijC*rinv; scale the stored dr components by it
431 xorpd xmm4, xmm4
433 mulpd xmm3, xmm0
434 subpd xmm4, xmm3
436 mov rdi, [rbp + nb400_faction]
437 mulpd xmm9, xmm4
438 mulpd xmm10, xmm4
439 mulpd xmm11, xmm4
441 addpd xmm5, xmm9
442 addpd xmm6, xmm10
443 addpd xmm7, xmm11
445 ;# now update f_i
446 addpd xmm13, xmm9
447 addpd xmm14, xmm10
448 addpd xmm15, xmm11
450 movlpd [rdi + r8*8], xmm5
451 movlpd [rdi + r8*8 + 8], xmm6
452 movlpd [rdi + r8*8 + 16], xmm7
453 movhpd [rdi + r9*8], xmm5
454 movhpd [rdi + r9*8 + 8], xmm6
455 movhpd [rdi + r9*8 + 16], xmm7
457 ;# should we do one more iteration?
458 sub dword ptr [rsp + nb400_innerk], 2
459 jl .nb400_checksingle
460 jmp .nb400_unroll_loop
;# One leftover j atom remains iff the original inner count was odd.
461 .nb400_checksingle:
462 mov edx, [rsp + nb400_innerk]
463 and edx, 1
464 jnz .nb400_dosingle
465 jmp .nb400_updateouterdata
;# ---- scalar tail: same computation as the unrolled loop but for a
;# single j atom, using the sd (scalar double) instruction forms ----
466 .nb400_dosingle:
467 mov rsi, [rbp + nb400_charge]
468 mov rdx, [rbp + nb400_invsqrta]
469 mov rdi, [rbp + nb400_pos]
470 mov rcx, [rsp + nb400_innerjjnr]
471 mov eax, [rcx]
473 ;# load isaj
474 mov rsi, [rbp + nb400_invsqrta]
475 movsd xmm2, [rsi + rax*8]
476 mulsd xmm2, [rsp + nb400_isai]
477 movapd [rsp + nb400_isaprod], xmm2
478 movapd xmm1, xmm2
479 mulsd xmm1, [rsp + nb400_gbtsc]
480 movapd [rsp + nb400_gbscale], xmm1
482 mulsd xmm2, [rsp + nb400_iq]
483 mov rsi, [rbp + nb400_charge] ;# base of charge[]
484 movsd xmm3, [rsi + rax*8]
485 mulsd xmm3, xmm2
486 movapd [rsp + nb400_qq], xmm3
488 mov rsi, [rbp + nb400_pos] ;# base of pos[]
490 lea r8, [rax + rax*2] ;# j3
492 ;# move coordinate to xmm4-xmm6
493 movsd xmm4, [rsi + r8*8]
494 movsd xmm5, [rsi + r8*8 + 8]
495 movsd xmm6, [rsi + r8*8 + 16]
497 mov rdi, [rbp + nb400_faction]
499 ;# calc dr
500 subsd xmm4, [rsp + nb400_ix]
501 subsd xmm5, [rsp + nb400_iy]
502 subsd xmm6, [rsp + nb400_iz]
504 ;# store dr
505 movapd xmm9, xmm4
506 movapd xmm10, xmm5
507 movapd xmm11, xmm6
509 ;# square it
510 mulsd xmm4,xmm4
511 mulsd xmm5,xmm5
512 mulsd xmm6,xmm6
513 addsd xmm4, xmm5
514 addsd xmm4, xmm6
515 ;# rsq in xmm4
;# scalar rsqrt seed + two Newton-Raphson refinements, as in the
;# packed loop above
517 cvtsd2ss xmm5, xmm4
518 rsqrtss xmm5, xmm5
519 cvtss2sd xmm2, xmm5 ;# lu in low xmm2
521 ;# lookup seed in xmm2
522 movapd xmm5, xmm2 ;# copy of lu
523 mulsd xmm2, xmm2 ;# lu*lu
524 movapd xmm1, [rsp + nb400_three]
525 mulsd xmm2, xmm4 ;# rsq*lu*lu
526 movapd xmm0, [rsp + nb400_half]
527 subsd xmm1, xmm2 ;# 30-rsq*lu*lu
528 mulsd xmm1, xmm5
529 mulsd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu)
531 movapd xmm5, xmm1 ;# copy of lu
532 mulsd xmm1, xmm1 ;# lu*lu
533 movapd xmm2, [rsp + nb400_three]
534 mulsd xmm1, xmm4 ;# rsq*lu*lu
535 movapd xmm0, [rsp + nb400_half]
536 subsd xmm2, xmm1 ;# 30-rsq*lu*lu
537 mulsd xmm2, xmm5
538 mulsd xmm0, xmm2 ;# xmm0=iter2 of rinv (new lu)
539 mulsd xmm4, xmm0 ;# xmm4=r
541 movapd [rsp + nb400_r], xmm4
542 mulsd xmm4, [rsp + nb400_gbscale]
544 cvttsd2si r10d, xmm4 ;# mm6 = lu idx
545 cvtsi2sd xmm5, r10d
546 subsd xmm4, xmm5
547 movapd xmm1, xmm4 ;# xmm1=eps
549 shl r10d, 2 ;# idx *= 4
551 mov rsi, [rbp + nb400_GBtab]
;# single table row: Y,F in xmm4/xmm5 and G,H in xmm6/xmm7
553 movapd xmm4, [rsi + r10*8] ;# Y1 F1
554 movhlps xmm5, xmm4
555 movapd xmm6, [rsi + r10*8 + 16] ;# G1 H1
556 movhlps xmm7, xmm6
557 ;# coulomb table ready, in xmm4-xmm7
559 mulsd xmm7, xmm1 ;# xmm7=Heps
560 mulsd xmm6, xmm1 ;# xmm6=Geps
561 mulsd xmm7, xmm1 ;# xmm7=Heps2
562 addsd xmm5, xmm6
563 addsd xmm5, xmm7 ;# xmm5=Fp
564 addsd xmm7, xmm7 ;# two*Heps2
565 movapd xmm3, [rsp + nb400_qq]
566 addsd xmm7, xmm6
567 addsd xmm7, xmm5 ;# xmm7=FF
568 mulsd xmm5, xmm1 ;# xmm5=eps*Fp
569 addsd xmm5, xmm4 ;# xmm5=VV
570 mulsd xmm5, xmm3 ;# vcoul=qq*VV
571 mulsd xmm3, xmm7 ;# fijC=FF*qq
573 mov rsi, [rbp + nb400_dvda]
575 ;# Calculate dVda
576 xorpd xmm7, xmm7
577 mulsd xmm3, [rsp + nb400_gbscale]
578 movapd xmm6, xmm3
579 mulsd xmm6, [rsp + nb400_r]
580 addsd xmm6, xmm5
582 ;# update vctot
583 addsd xmm12, xmm5
585 ;# xmm6=(vcoul+fijC*r)
586 subsd xmm7, xmm6
587 movapd xmm6, xmm7
589 ;# update dvdasum
590 addsd xmm8, xmm7
592 ;# update j atoms dvdaj
593 addsd xmm6, [rsi + rax*8]
594 movsd [rsi + rax*8], xmm6
;# fscal = -fijC*rinv, applied to the stored dr components
596 xorpd xmm4, xmm4
598 mulsd xmm3, xmm0
599 subsd xmm4, xmm3
601 mov rdi, [rbp + nb400_faction]
602 mulsd xmm9, xmm4
603 mulsd xmm10, xmm4
604 mulsd xmm11, xmm4
606 ;# now update f_i
607 addsd xmm13, xmm9
608 addsd xmm14, xmm10
609 addsd xmm15, xmm11
611 ;# the fj's - start by accumulating forces from memory
612 mov rdi, [rbp + nb400_faction]
613 addsd xmm9, [rdi + r8*8]
614 addsd xmm10, [rdi + r8*8 + 8]
615 addsd xmm11, [rdi + r8*8 + 16]
616 movsd [rdi + r8*8], xmm9
617 movsd [rdi + r8*8 + 8], xmm10
618 movsd [rdi + r8*8 + 16], xmm11
;# ---- per-i reduction and writeback, outer-loop bookkeeping ----
620 .nb400_updateouterdata:
621 mov ecx, [rsp + nb400_ii3]
622 mov rdi, [rbp + nb400_faction]
623 mov rsi, [rbp + nb400_fshift]
624 mov edx, [rsp + nb400_is3]
626 ;# accumulate i forces in xmm13, xmm14, xmm15
627 movhlps xmm3, xmm13
628 movhlps xmm4, xmm14
629 movhlps xmm5, xmm15
630 addsd xmm13, xmm3
631 addsd xmm14, xmm4
632 addsd xmm15, xmm5 ;# sum is in low xmm13-xmm15
634 ;# increment i force
;# xmm13-15 accumulated the force along dr=xj-xi (the j direction),
;# so the i-atom force and fshift are applied with subsd
635 movsd xmm3, [rdi + rcx*8]
636 movsd xmm4, [rdi + rcx*8 + 8]
637 movsd xmm5, [rdi + rcx*8 + 16]
638 subsd xmm3, xmm13
639 subsd xmm4, xmm14
640 subsd xmm5, xmm15
641 movsd [rdi + rcx*8], xmm3
642 movsd [rdi + rcx*8 + 8], xmm4
643 movsd [rdi + rcx*8 + 16], xmm5
645 ;# increment fshift force
646 movsd xmm3, [rsi + rdx*8]
647 movsd xmm4, [rsi + rdx*8 + 8]
648 movsd xmm5, [rsi + rdx*8 + 16]
649 subsd xmm3, xmm13
650 subsd xmm4, xmm14
651 subsd xmm5, xmm15
652 movsd [rsi + rdx*8], xmm3
653 movsd [rsi + rdx*8 + 8], xmm4
654 movsd [rsi + rdx*8 + 16], xmm5
656 ;# get n from stack
657 mov esi, [rsp + nb400_n]
658 ;# get group index for i particle
659 mov rdx, [rbp + nb400_gid] ;# base of gid[]
660 mov edx, [rdx + rsi*4] ;# ggid=gid[n]
662 ;# accumulate total coulomb energy and update it
663 movhlps xmm6, xmm12
664 addsd xmm12, xmm6 ;# low xmm12 have the sum now
666 ;# add earlier value from mem
667 mov rax, [rbp + nb400_Vc]
668 addsd xmm12, [rax + rdx*8]
669 ;# move back to mem
670 movsd [rax + rdx*8], xmm12
672 ;# accumulate dVda and update it
673 movhlps xmm6, xmm8
674 addsd xmm8, xmm6 ;# low xmm8 has the sum now
676 mov edx, [rsp + nb400_ii]
677 mov rax, [rbp + nb400_dvda]
678 addsd xmm8, [rax + rdx*8]
679 movsd [rax + rdx*8], xmm8
681 ;# finish if last
682 mov ecx, [rsp + nb400_nn1]
683 ;# esi already loaded with n
684 inc esi
685 sub ecx, esi
686 jz .nb400_outerend
688 ;# not last, iterate outer loop once more!
689 mov [rsp + nb400_n], esi
690 jmp .nb400_outer
691 .nb400_outerend:
692 ;# check if more outer neighborlists remain
693 mov ecx, [rsp + nb400_nri]
694 ;# esi already loaded with n above
695 sub ecx, esi
696 jz .nb400_end
697 ;# non-zero, do one more workunit
698 jmp .nb400_threadloop
699 .nb400_end:
;# write back iteration statistics, restore callee-saved regs
700 mov eax, [rsp + nb400_nouter]
701 mov ebx, [rsp + nb400_ninner]
702 mov rcx, [rbp + nb400_outeriter]
703 mov rdx, [rbp + nb400_inneriter]
704 mov [rcx], eax
705 mov [rdx], ebx
707 add rsp, 440
708 emms
711 pop r15
712 pop r14
713 pop r13
714 pop r12
716 pop rbx
717 pop rbp
;# NOTE(review): the fused source numbering jumps 717 -> 726 here; the
;# function's closing 'ret' line appears to have been dropped by this
;# extract — confirm against the original file before assembling.
;#----------------------------------------------------------------------
;# nb_kernel400nf_x86_64_sse2 — "nf" (no-force) variant of the kernel
;# above: identical Generalized-Born table lookup but it accumulates
;# only the Coulomb energy (Vc); no forces, fshift or dvda are written.
;# Same argument layout as nb_kernel400_x86_64_sse2.
;#----------------------------------------------------------------------
726 .globl nb_kernel400nf_x86_64_sse2
727 .globl _nb_kernel400nf_x86_64_sse2
728 nb_kernel400nf_x86_64_sse2:
729 _nb_kernel400nf_x86_64_sse2:
;# rbp-relative offsets of the stack-passed arguments (args 7+):
730 .equiv nb400nf_fshift, 16
731 .equiv nb400nf_gid, 24
732 .equiv nb400nf_pos, 32
733 .equiv nb400nf_faction, 40
734 .equiv nb400nf_charge, 48
735 .equiv nb400nf_p_facel, 56
736 .equiv nb400nf_argkrf, 64
737 .equiv nb400nf_argcrf, 72
738 .equiv nb400nf_Vc, 80
739 .equiv nb400nf_type, 88
740 .equiv nb400nf_p_ntype, 96
741 .equiv nb400nf_vdwparam, 104
742 .equiv nb400nf_Vvdw, 112
743 .equiv nb400nf_p_tabscale, 120
744 .equiv nb400nf_VFtab, 128
745 .equiv nb400nf_invsqrta, 136
746 .equiv nb400nf_dvda, 144
747 .equiv nb400nf_p_gbtabscale, 152
748 .equiv nb400nf_GBtab, 160
749 .equiv nb400nf_p_nthreads, 168
750 .equiv nb400nf_count, 176
751 .equiv nb400nf_mtx, 184
752 .equiv nb400nf_outeriter, 192
753 .equiv nb400nf_inneriter, 200
754 .equiv nb400nf_work, 208
755 ;# stack offsets for local variables
756 ;# bottom of stack is cache-aligned for sse2 use
757 .equiv nb400nf_ix, 0
758 .equiv nb400nf_iy, 16
759 .equiv nb400nf_iz, 32
760 .equiv nb400nf_iq, 48
761 .equiv nb400nf_gbtsc, 64
762 .equiv nb400nf_qq, 80
763 .equiv nb400nf_vctot, 96
764 .equiv nb400nf_half, 112
765 .equiv nb400nf_three, 128
766 .equiv nb400nf_isai, 144
767 .equiv nb400nf_isaprod, 160
768 .equiv nb400nf_gbscale, 176
769 .equiv nb400nf_nri, 192
770 .equiv nb400nf_iinr, 200
771 .equiv nb400nf_jindex, 208
772 .equiv nb400nf_jjnr, 216
773 .equiv nb400nf_shift, 224
774 .equiv nb400nf_shiftvec, 232
775 .equiv nb400nf_facel, 240
776 .equiv nb400nf_innerjjnr, 248
777 .equiv nb400nf_is3, 256
778 .equiv nb400nf_ii3, 260
779 .equiv nb400nf_innerk, 264
780 .equiv nb400nf_n, 268
781 .equiv nb400nf_nn1, 272
782 .equiv nb400nf_nouter, 276
783 .equiv nb400nf_ninner, 280
;# Prologue: mirrors the force kernel; 296 = 16n+8 keeps rsp 16-byte
;# aligned after the five pushes for the movapd accesses to locals.
784 push rbp
785 mov rbp, rsp
786 push rbx
789 emms
791 push r12
792 push r13
793 push r14
794 push r15
796 sub rsp, 296 ;# local variable stack space (n*16+8)
798 ;# zero 32-bit iteration counters
799 mov eax, 0
800 mov [rsp + nb400nf_nouter], eax
801 mov [rsp + nb400nf_ninner], eax
;# spill the six register arguments (SysV rdi,rsi,rdx,rcx,r8,r9)
803 mov edi, [rdi]
804 mov [rsp + nb400nf_nri], edi
805 mov [rsp + nb400nf_iinr], rsi
806 mov [rsp + nb400nf_jindex], rdx
807 mov [rsp + nb400nf_jjnr], rcx
808 mov [rsp + nb400nf_shift], r8
809 mov [rsp + nb400nf_shiftvec], r9
810 mov rsi, [rbp + nb400nf_p_facel]
811 movsd xmm0, [rsi]
812 movsd [rsp + nb400nf_facel], xmm0
814 mov rbx, [rbp + nb400nf_p_gbtabscale]
815 movsd xmm4, [rbx]
816 shufpd xmm4, xmm4, 0
817 movapd [rsp + nb400nf_gbtsc], xmm4
819 ;# create constant floating-point factors on stack
;# 0.5 built from its IEEE-754 bit pattern; 3.0 derived by doubling
820 mov eax, 0x00000000 ;# lower half of double half IEEE (hex)
821 mov ebx, 0x3fe00000
822 mov [rsp + nb400nf_half], eax
823 mov [rsp + nb400nf_half+4], ebx
824 movsd xmm1, [rsp + nb400nf_half]
825 shufpd xmm1, xmm1, 0 ;# splat to all elements
826 movapd xmm3, xmm1
827 addpd xmm3, xmm3 ;# one
828 movapd xmm2, xmm3
829 addpd xmm2, xmm2 ;# two
830 addpd xmm3, xmm2 ;# three
831 movapd [rsp + nb400nf_half], xmm1
832 movapd [rsp + nb400nf_three], xmm3
;# Work-sharing spinlock: atomically claim outer-list index nn0 and
;# advance the shared counter to nn1 = nn0+1.
;# BUGFIX: cmpxchg must address through the full 64-bit rsi; the old
;# '[esi]' form truncated the counter pointer to its low 32 bits.
834 .nb400nf_threadloop:
835 mov rsi, [rbp + nb400nf_count] ;# pointer to sync counter
836 mov eax, [rsi]
837 .nb400nf_spinlock:
838 mov ebx, eax ;# ebx=*count=nn0
839 add ebx, 1 ;# ebx=nn1=nn0+1
840 lock
841 cmpxchg [rsi], ebx ;# write nn1 to *counter,
842 ;# if it hasnt changed.
843 ;# or reread *counter to eax.
844 pause ;# -> better p4 performance
845 jnz .nb400nf_spinlock
847 ;# if(nn1>nri) nn1=nri
848 mov ecx, [rsp + nb400nf_nri]
849 mov edx, ecx
850 sub ecx, ebx
851 cmovle ebx, edx ;# if(nn1>nri) nn1=nri
852 ;# Cleared the spinlock if we got here.
853 ;# eax contains nn0, ebx contains nn1.
854 mov [rsp + nb400nf_n], eax
855 mov [rsp + nb400nf_nn1], ebx
856 sub ebx, eax ;# calc number of outer lists
857 mov esi, eax ;# copy n to esi
;# jg tests the flags from the sub above: skip to end if no lists left
858 jg .nb400nf_outerstart
859 jmp .nb400nf_end
861 .nb400nf_outerstart:
862 ;# ebx contains number of outer iterations
863 add ebx, [rsp + nb400nf_nouter]
864 mov [rsp + nb400nf_nouter], ebx
;# ---- outer loop: load shift vector, i charge/invsqrta, shifted i
;# coordinates; clear vctot; set up the inner-loop jjnr pointer ----
866 .nb400nf_outer:
867 mov rax, [rsp + nb400nf_shift] ;# rax = pointer into shift[]
868 mov ebx, [rax+rsi*4] ;# rbx=shift[n]
870 lea rbx, [rbx + rbx*2] ;# rbx=3*is
871 mov [rsp + nb400nf_is3],ebx ;# store is3
873 mov rax, [rsp + nb400nf_shiftvec] ;# rax = base of shiftvec[]
875 movsd xmm0, [rax + rbx*8]
876 movsd xmm1, [rax + rbx*8 + 8]
877 movsd xmm2, [rax + rbx*8 + 16]
879 mov rcx, [rsp + nb400nf_iinr] ;# rcx = pointer into iinr[]
880 mov ebx, [rcx+rsi*4] ;# ebx =ii
882 mov rdx, [rbp + nb400nf_charge]
883 movsd xmm3, [rdx + rbx*8]
884 mulsd xmm3, [rsp + nb400nf_facel]
885 shufpd xmm3, xmm3, 0
887 mov rdx, [rbp + nb400nf_invsqrta] ;# load invsqrta[ii]
888 movsd xmm4, [rdx + rbx*8]
889 shufpd xmm4, xmm4, 0
891 lea rbx, [rbx + rbx*2] ;# rbx = 3*ii=ii3
892 mov rax, [rbp + nb400nf_pos] ;# rax = base of pos[]
;# i coordinates = pos[ii3] + shiftvec[is3]
894 addsd xmm0, [rax + rbx*8]
895 addsd xmm1, [rax + rbx*8 + 8]
896 addsd xmm2, [rax + rbx*8 + 16]
898 movapd [rsp + nb400nf_iq], xmm3
899 movapd [rsp + nb400nf_isai], xmm4
901 shufpd xmm0, xmm0, 0
902 shufpd xmm1, xmm1, 0
903 shufpd xmm2, xmm2, 0
905 movapd [rsp + nb400nf_ix], xmm0
906 movapd [rsp + nb400nf_iy], xmm1
907 movapd [rsp + nb400nf_iz], xmm2
909 mov [rsp + nb400nf_ii3], ebx
911 ;# clear vctot
912 xorpd xmm4, xmm4
913 movapd [rsp + nb400nf_vctot], xmm4
915 mov rax, [rsp + nb400nf_jindex]
916 mov ecx, [rax + rsi*4] ;# jindex[n]
917 mov edx, [rax + rsi*4 + 4] ;# jindex[n+1]
918 sub edx, ecx ;# number of innerloop atoms
920 mov rsi, [rbp + nb400nf_pos]
921 mov rdi, [rbp + nb400nf_faction]
922 mov rax, [rsp + nb400nf_jjnr]
923 shl ecx, 2
924 add rax, rcx
925 mov [rsp + nb400nf_innerjjnr], rax ;# pointer to jjnr[nj0]
926 mov ecx, edx
927 sub edx, 2
928 add ecx, [rsp + nb400nf_ninner]
929 mov [rsp + nb400nf_ninner], ecx
930 add edx, 0
;# innerk = natoms-2; enter the 2-way loop only with >=2 atoms left
931 mov [rsp + nb400nf_innerk], edx ;# number of innerloop atoms
932 jge .nb400nf_unroll_loop
933 jmp .nb400nf_checksingle
;# ---- inner loop, 2-way unrolled, energy only: GB table lookup of VV
;# (no FF/force evaluation, no dvda updates) ----
934 .nb400nf_unroll_loop:
935 ;# twice unrolled innerloop here
936 mov rdx, [rsp + nb400nf_innerjjnr] ;# pointer to jjnr[k]
937 mov eax, [rdx]
938 mov ebx, [rdx + 4]
939 add qword ptr [rsp + nb400nf_innerjjnr], 8 ;# advance pointer (unrolled 2)
941 ;# load isa2
942 mov rsi, [rbp + nb400nf_invsqrta]
943 movlpd xmm2, [rsi + rax*8]
944 movhpd xmm2, [rsi + rbx*8]
945 mulpd xmm2, [rsp + nb400nf_isai]
946 movapd [rsp + nb400nf_isaprod], xmm2
947 movapd xmm1, xmm2
948 mulpd xmm1, [rsp + nb400nf_gbtsc]
949 movapd [rsp + nb400nf_gbscale], xmm1
951 mov rsi, [rbp + nb400nf_charge] ;# base of charge[]
952 movlpd xmm3, [rsi + rax*8]
953 movhpd xmm3, [rsi + rbx*8]
955 mulpd xmm2, [rsp + nb400nf_iq]
956 mulpd xmm3, xmm2
957 movapd [rsp + nb400nf_qq], xmm3
959 mov rsi, [rbp + nb400nf_pos] ;# base of pos[]
961 lea rax, [rax + rax*2] ;# replace jnr with j3
962 lea rbx, [rbx + rbx*2]
964 ;# move two coordinates to xmm0-xmm2
965 movlpd xmm0, [rsi + rax*8]
966 movlpd xmm1, [rsi + rax*8 + 8]
967 movlpd xmm2, [rsi + rax*8 + 16]
968 movhpd xmm0, [rsi + rbx*8]
969 movhpd xmm1, [rsi + rbx*8 + 8]
970 movhpd xmm2, [rsi + rbx*8 + 16]
972 mov rdi, [rbp + nb400nf_faction]
974 ;# move nb400nf_ix-iz to xmm4-xmm6
975 movapd xmm4, [rsp + nb400nf_ix]
976 movapd xmm5, [rsp + nb400nf_iy]
977 movapd xmm6, [rsp + nb400nf_iz]
979 ;# calc dr
980 subpd xmm4, xmm0
981 subpd xmm5, xmm1
982 subpd xmm6, xmm2
984 ;# square it
985 mulpd xmm4,xmm4
986 mulpd xmm5,xmm5
987 mulpd xmm6,xmm6
988 addpd xmm4, xmm5
989 addpd xmm4, xmm6
990 ;# rsq in xmm4
;# rinv = 1/sqrt(rsq): rsqrtps seed + two Newton-Raphson refinements
992 cvtpd2ps xmm5, xmm4
993 rsqrtps xmm5, xmm5
994 cvtps2pd xmm2, xmm5 ;# lu in low xmm2
996 ;# lookup seed in xmm2
997 movapd xmm5, xmm2 ;# copy of lu
998 mulpd xmm2, xmm2 ;# lu*lu
999 movapd xmm1, [rsp + nb400nf_three]
1000 mulpd xmm2, xmm4 ;# rsq*lu*lu
1001 movapd xmm0, [rsp + nb400nf_half]
1002 subpd xmm1, xmm2 ;# 30-rsq*lu*lu
1003 mulpd xmm1, xmm5
1004 mulpd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu)
1006 movapd xmm5, xmm1 ;# copy of lu
1007 mulpd xmm1, xmm1 ;# lu*lu
1008 movapd xmm2, [rsp + nb400nf_three]
1009 mulpd xmm1, xmm4 ;# rsq*lu*lu
1010 movapd xmm0, [rsp + nb400nf_half]
1011 subpd xmm2, xmm1 ;# 30-rsq*lu*lu
1012 mulpd xmm2, xmm5
1013 mulpd xmm0, xmm2 ;# xmm0=iter2 of rinv (new lu)
1014 mulpd xmm4, xmm0 ;# xmm4=r
1015 mulpd xmm4, [rsp + nb400nf_gbscale]
;# table index = int(r*gbscale); eps = fractional remainder
1017 cvttpd2pi mm6, xmm4 ;# mm6 = lu idx
1018 cvtpi2pd xmm5, mm6
1019 subpd xmm4, xmm5
1020 movapd xmm1, xmm4 ;# xmm1=eps
1021 movapd xmm2, xmm1
1022 mulpd xmm2, xmm2 ;# xmm2=eps2
1024 pslld mm6, 2 ;# idx *= 4
1026 movd mm0, eax
1027 movd mm1, ebx
1029 mov rsi, [rbp + nb400nf_GBtab]
1030 movd eax, mm6
1031 psrlq mm6, 32
1032 movd ebx, mm6 ;# indices in eax/ebx
;# gather and transpose both table rows into xmm4=Y xmm5=F xmm6=G xmm7=H
1034 movapd xmm4, [rsi + rax*8] ;# Y1 F1
1035 movapd xmm3, [rsi + rbx*8] ;# Y2 F2
1036 movapd xmm5, xmm4
1037 unpcklpd xmm4, xmm3 ;# Y1 Y2
1038 unpckhpd xmm5, xmm3 ;# F1 F2
1040 movapd xmm6, [rsi + rax*8 + 16] ;# G1 H1
1041 movapd xmm3, [rsi + rbx*8 + 16] ;# G2 H2
1042 movapd xmm7, xmm6
1043 unpcklpd xmm6, xmm3 ;# G1 G2
1044 unpckhpd xmm7, xmm3 ;# H1 H2
1045 ;# coulomb table ready, in xmm4-xmm7
1046 mulpd xmm6, xmm1 ;# xmm6=Geps
1047 mulpd xmm7, xmm2 ;# xmm7=Heps2
1048 addpd xmm5, xmm6
1049 addpd xmm5, xmm7 ;# xmm5=Fp
1050 movapd xmm3, [rsp + nb400nf_qq]
1051 mulpd xmm5, xmm1 ;# xmm5=eps*Fp
1052 addpd xmm5, xmm4 ;# xmm5=VV
1053 mulpd xmm5, xmm3 ;# vcoul=qq*VV
1054 addpd xmm5, [rsp + nb400nf_vctot]
1055 movapd [rsp + nb400nf_vctot], xmm5
1057 ;# should we do one more iteration?
1058 sub dword ptr [rsp + nb400nf_innerk], 2
1059 jl .nb400nf_checksingle
1060 jmp .nb400nf_unroll_loop
;# One leftover j atom remains iff the original inner count was odd.
1061 .nb400nf_checksingle:
1062 mov edx, [rsp + nb400nf_innerk]
1063 and edx, 1
1064 jnz .nb400nf_dosingle
1065 jmp .nb400nf_updateouterdata
;# ---- scalar tail: energy-only evaluation of the single leftover j ----
1066 .nb400nf_dosingle:
1067 mov rsi, [rbp + nb400nf_charge]
1068 mov rdx, [rbp + nb400nf_invsqrta]
1069 mov rdi, [rbp + nb400nf_pos]
1070 mov rcx, [rsp + nb400nf_innerjjnr]
1071 mov eax, [rcx]
1072 xorpd xmm6, xmm6
1073 movapd xmm7, xmm6
1074 movsd xmm7, [rdx + rax*8]
1075 movlpd xmm6, [rsi + rax*8] ;# xmm6(0) has the charge
1076 mulsd xmm7, [rsp + nb400nf_isai]
1077 movapd [rsp + nb400nf_isaprod], xmm7
1078 movapd xmm1, xmm7
1079 mulpd xmm1, [rsp + nb400nf_gbtsc]
1080 movapd [rsp + nb400nf_gbscale], xmm1
1082 mulsd xmm7, [rsp + nb400nf_iq]
1083 mulsd xmm6, xmm7
1084 movapd [rsp + nb400nf_qq], xmm6
1086 lea rax, [rax + rax*2]
1088 ;# move coordinates to xmm0-xmm2
1089 movlpd xmm0, [rdi + rax*8]
1090 movlpd xmm1, [rdi + rax*8 + 8]
1091 movlpd xmm2, [rdi + rax*8 + 16]
1093 ;# move nb400nf_ix-iz to xmm4-xmm6
1094 movapd xmm4, [rsp + nb400nf_ix]
1095 movapd xmm5, [rsp + nb400nf_iy]
1096 movapd xmm6, [rsp + nb400nf_iz]
1098 ;# calc dr
1099 subsd xmm4, xmm0
1100 subsd xmm5, xmm1
1101 subsd xmm6, xmm2
1103 ;# square it
1104 mulsd xmm4,xmm4
1105 mulsd xmm5,xmm5
1106 mulsd xmm6,xmm6
1107 addsd xmm4, xmm5
1108 addsd xmm4, xmm6
1109 ;# rsq in xmm4
;# scalar rsqrt seed + two Newton-Raphson refinements
1111 cvtsd2ss xmm5, xmm4
1112 rsqrtss xmm5, xmm5
1113 cvtss2sd xmm2, xmm5 ;# lu in low xmm2
1115 ;# lookup seed in xmm2
1116 movapd xmm5, xmm2 ;# copy of lu
1117 mulsd xmm2, xmm2 ;# lu*lu
1118 movapd xmm1, [rsp + nb400nf_three]
1119 mulsd xmm2, xmm4 ;# rsq*lu*lu
1120 movapd xmm0, [rsp + nb400nf_half]
1121 subsd xmm1, xmm2 ;# 30-rsq*lu*lu
1122 mulsd xmm1, xmm5
1123 mulsd xmm1, xmm0 ;# xmm0=iter1 of rinv (new lu)
1125 movapd xmm5, xmm1 ;# copy of lu
1126 mulsd xmm1, xmm1 ;# lu*lu
1127 movapd xmm2, [rsp + nb400nf_three]
1128 mulsd xmm1, xmm4 ;# rsq*lu*lu
1129 movapd xmm0, [rsp + nb400nf_half]
1130 subsd xmm2, xmm1 ;# 30-rsq*lu*lu
1131 mulsd xmm2, xmm5
1132 mulsd xmm0, xmm2 ;# xmm0=iter2 of rinv (new lu)
1134 mulsd xmm4, xmm0 ;# xmm4=r
1135 mulsd xmm4, [rsp + nb400nf_gbscale]
1137 movd mm0, eax
1139 cvttsd2si eax, xmm4 ;# mm6 = lu idx
1140 cvtsi2sd xmm5, eax
1141 subsd xmm4, xmm5
1142 movapd xmm1, xmm4 ;# xmm1=eps
1143 movapd xmm2, xmm1
1144 mulsd xmm2, xmm2 ;# xmm2=eps2
1146 shl eax, 2 ;# idx *= 4
1148 mov rsi, [rbp + nb400nf_GBtab]
1150 ;# Coulomb
;# single table row unpacked against zero: xmm4=Y xmm5=F xmm6=G xmm7=H
1151 movapd xmm4, [rsi + rax*8] ;# Y1 F1
1152 xorpd xmm3, xmm3
1153 movapd xmm5, xmm4
1154 unpcklpd xmm4, xmm3 ;# Y1
1155 unpckhpd xmm5, xmm3 ;# F1
1157 movapd xmm6, [rsi + rax*8 + 16] ;# G1 H1
1158 xorpd xmm3, xmm3
1159 movapd xmm7, xmm6
1160 unpcklpd xmm6, xmm3 ;# G1
1161 unpckhpd xmm7, xmm3 ;# H1
1162 ;# table ready in xmm4-xmm7
1164 mulsd xmm6, xmm1 ;# xmm6=Geps
1165 mulsd xmm7, xmm2 ;# xmm7=Heps2
1166 addsd xmm5, xmm6
1167 addsd xmm5, xmm7 ;# xmm5=Fp
1168 movapd xmm3, [rsp + nb400nf_qq]
1169 mulsd xmm5, xmm1 ;# xmm5=eps*Fp
1170 addsd xmm5, xmm4 ;# xmm5=VV
1171 mulsd xmm5, xmm3 ;# vcoul=qq*VV
1172 addsd xmm5, [rsp + nb400nf_vctot]
1173 movsd [rsp + nb400nf_vctot], xmm5
;# ---- per-i energy reduction and writeback, outer-loop bookkeeping ----
1175 .nb400nf_updateouterdata:
1176 ;# get n from stack
1177 mov esi, [rsp + nb400nf_n]
1178 ;# get group index for i particle
1179 mov rdx, [rbp + nb400nf_gid] ;# base of gid[]
1180 mov edx, [rdx + rsi*4] ;# ggid=gid[n]
1182 ;# accumulate total potential energy and update it
1183 movapd xmm7, [rsp + nb400nf_vctot]
1184 ;# accumulate
1185 movhlps xmm6, xmm7
1186 addsd xmm7, xmm6 ;# low xmm7 has the sum now
1188 ;# add earlier value from mem
1189 mov rax, [rbp + nb400nf_Vc]
1190 addsd xmm7, [rax + rdx*8]
1191 ;# move back to mem
1192 movsd [rax + rdx*8], xmm7
1194 ;# finish if last
1195 mov ecx, [rsp + nb400nf_nn1]
1196 ;# esi already loaded with n
1197 inc esi
1198 sub ecx, esi
1199 jz .nb400nf_outerend
1201 ;# not last, iterate outer loop once more!
1202 mov [rsp + nb400nf_n], esi
1203 jmp .nb400nf_outer
1204 .nb400nf_outerend:
1205 ;# check if more outer neighborlists remain
1206 mov ecx, [rsp + nb400nf_nri]
1207 ;# esi already loaded with n above
1208 sub ecx, esi
1209 jz .nb400nf_end
1210 ;# non-zero, do one more workunit
1211 jmp .nb400nf_threadloop
1212 .nb400nf_end:
;# write back iteration statistics, restore callee-saved regs
1214 mov eax, [rsp + nb400nf_nouter]
1215 mov ebx, [rsp + nb400nf_ninner]
1216 mov rcx, [rbp + nb400nf_outeriter]
1217 mov rdx, [rbp + nb400nf_inneriter]
1218 mov [rcx], eax
1219 mov [rdx], ebx
1221 add rsp, 296
1222 emms
1225 pop r15
1226 pop r14
1227 pop r13
1228 pop r12
1230 pop rbx
1231 pop rbp
;# NOTE(review): this extract ends here; the function's closing 'ret'
;# (original line 1232) is past the visible range — confirm against the
;# original file before assembling.