Renamed intel syntax assembly files to avoid double extensions
[gromacs.git] / src / gmxlib / nonbonded / nb_kernel_x86_64_sse / nb_kernel430_x86_64_sse.intel_syntax.s
blobe3ee63bb60d2994d4a90f754fb5ca9b0b4d1f6ac
1 ;#
2 ;#
3 ;# Gromacs 4.0 Copyright (c) 1991-2003
4 ;# David van der Spoel, Erik Lindahl
5 ;#
6 ;# This program is free software; you can redistribute it and/or
7 ;# modify it under the terms of the GNU General Public License
8 ;# as published by the Free Software Foundation; either version 2
9 ;# of the License, or (at your option) any later version.
11 ;# To help us fund GROMACS development, we humbly ask that you cite
12 ;# the research papers on the package. Check out http://www.gromacs.org
13 ;#
14 ;# And Hey:
15 ;# Gnomes, ROck Monsters And Chili Sauce
18 ;# These files require GNU binutils 2.10 or later, since we
19 ;# use intel syntax for portability, or a recent version
20 ;# of NASM that understands Extended 3DNow and SSE2 instructions.
21 ;# (NASM is normally only used with MS Visual C++).
22 ;# Since NASM and gnu as disagree on some definitions and use
23 ;# completely different preprocessing options I have to introduce a
24 ;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
25 ;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
26 ;# reason why all comments need both symbols...
27 ;# The source is written for GNU as, with intel syntax. When you use
28 ;# NASM we redefine a couple of things. The false if-statement around
29 ;# the following code is seen by GNU as, but NASM doesn't see it, so
30 ;# the code inside is read by NASM but not gcc.
;# NASM / GNU as compatibility shim.
;# The three lines that start with ';' are comments for NASM, while GNU as
;# treats ';' as a statement separator and therefore executes what follows
;# it (.if 0, .endif, .intel_syntax). The %define/%macro lines in between
;# are thus skipped by GNU as (inside .if 0) and only acted on by NASM,
;# where they map the GNU as directive names used in this file onto the
;# NASM equivalents.
32 ; .if 0 # block below only read by NASM
33 %define .section section
34 %define .long dd
35 %define .align align
36 %define .globl global
37 ;# NASM only wants 'dword', not 'dword ptr'.
;# Defining 'ptr' as empty makes 'dword ptr' / 'qword ptr' collapse to the NASM form.
38 %define ptr
;# Emulate GNU as '.equiv name, value' with NASM's 'name equ value'.
39 %macro .equiv 2
40 %1 equ %2
41 %endmacro
42 ; .endif # End of NASM-specific block
43 ; .intel_syntax noprefix # Line only read by gnu as
;# Entry point of the nb430 kernel and its frame layout.
;# The symbol is exported both with and without a leading underscore
;# (presumably to cover platforms with either C symbol naming - confirm).
;# The first six arguments arrive in registers (the prologue reads them
;# from rdi, rsi, rdx, rcx, r8, r9 as p_nri, iinr, jindex, jjnr, shift,
;# shiftvec); the remaining arguments are read from the caller's stack
;# relative to rbp using the first group of offsets below.
49 .globl nb_kernel430_x86_64_sse
50 .globl _nb_kernel430_x86_64_sse
51 nb_kernel430_x86_64_sse:
52 _nb_kernel430_x86_64_sse:
53 ;# Room for return address and rbp (16 bytes)
;# Stack-passed arguments, addressed as [rbp + offset]; 8 bytes per slot.
54 .equiv nb430_fshift, 16
55 .equiv nb430_gid, 24
56 .equiv nb430_pos, 32
57 .equiv nb430_faction, 40
58 .equiv nb430_charge, 48
59 .equiv nb430_p_facel, 56
60 .equiv nb430_argkrf, 64
61 .equiv nb430_argcrf, 72
62 .equiv nb430_Vc, 80
63 .equiv nb430_type, 88
64 .equiv nb430_p_ntype, 96
65 .equiv nb430_vdwparam, 104
66 .equiv nb430_Vvdw, 112
67 .equiv nb430_p_tabscale, 120
68 .equiv nb430_VFtab, 128
69 .equiv nb430_invsqrta, 136
70 .equiv nb430_dvda, 144
71 .equiv nb430_p_gbtabscale, 152
72 .equiv nb430_GBtab, 160
73 .equiv nb430_p_nthreads, 168
74 .equiv nb430_count, 176
75 .equiv nb430_mtx, 184
76 .equiv nb430_outeriter, 192
77 .equiv nb430_inneriter, 200
78 .equiv nb430_work, 208
79 ;# stack offsets for local variables
80 ;# bottom of stack is cache-aligned for sse use
;# Locals are addressed as [rsp + offset].
;# Offsets 0..416: 16-byte slots, each holding one 4-float SSE vector
;# (they are accessed with movaps, so rsp must be 16-byte aligned).
81 .equiv nb430_ix, 0
82 .equiv nb430_iy, 16
83 .equiv nb430_iz, 32
84 .equiv nb430_iq, 48
85 .equiv nb430_dx, 64
86 .equiv nb430_dy, 80
87 .equiv nb430_dz, 96
88 .equiv nb430_eps, 112
89 .equiv nb430_gbtsc, 128
90 .equiv nb430_tsc, 144
91 .equiv nb430_qq, 160
92 .equiv nb430_c6, 176
93 .equiv nb430_c12, 192
94 .equiv nb430_epsgb, 208
95 .equiv nb430_vctot, 224
96 .equiv nb430_Vvdwtot, 240
97 .equiv nb430_fix, 256
98 .equiv nb430_fiy, 272
99 .equiv nb430_fiz, 288
100 .equiv nb430_half, 304
101 .equiv nb430_three, 320
102 .equiv nb430_r, 336
103 .equiv nb430_isai, 352
104 .equiv nb430_isaprod, 368
105 .equiv nb430_dvdasum, 384
106 .equiv nb430_gbscale, 400
107 .equiv nb430_rinv, 416
;# Offsets 432..488: 8-byte slots, mostly saved pointers (nri and facel
;# are written as 32-bit values into their slots by the prologue).
108 .equiv nb430_nri, 432
109 .equiv nb430_iinr, 440
110 .equiv nb430_jindex, 448
111 .equiv nb430_jjnr, 456
112 .equiv nb430_shift, 464
113 .equiv nb430_shiftvec, 472
114 .equiv nb430_facel, 480
115 .equiv nb430_innerjjnr, 488
;# Offsets 496..532: 4-byte integer slots (indices and counters).
116 .equiv nb430_ii, 496
117 .equiv nb430_is3, 500
118 .equiv nb430_ii3, 504
119 .equiv nb430_ntia, 508
120 .equiv nb430_innerk, 512
121 .equiv nb430_n, 516
122 .equiv nb430_nn1, 520
123 .equiv nb430_ntype, 524
124 .equiv nb430_nouter, 528
125 .equiv nb430_ninner, 532
;# Prologue: set up the frame, save registers, spill the register
;# arguments to locals and build the constants used by the inner loops.
;# rbp becomes the frame pointer used to reach the stack-passed arguments.
127 push rbp
128 mov rbp, rsp
129 push rbx
;# emms resets the MMX/x87 tag state; mm0-mm3 are used as scratch
;# storage for j indices inside the unrolled inner loop.
132 emms
;# rbx and r12-r15 are callee-saved in the System V AMD64 ABI and are
;# all used as scratch below.
134 push r12
135 push r13
136 push r14
137 push r15
;# Six 8-byte pushes (48 bytes) plus 552 bytes of locals: if rsp was
;# 8 mod 16 on entry (ABI-conforming call) it is 16-byte aligned here,
;# which the movaps accesses to the local vector slots require.
139 sub rsp, 552 ;# local variable stack space (n*16+8)
141 ;# zero 32-bit iteration counters
142 mov eax, 0
143 mov [rsp + nb430_nouter], eax
144 mov [rsp + nb430_ninner], eax
;# rdi holds a pointer to nri; dereference it and keep the value.
148 mov edi, [rdi]
149 mov [rsp + nb430_nri], edi
;# Spill the remaining register arguments (pointers) to local slots.
150 mov [rsp + nb430_iinr], rsi
151 mov [rsp + nb430_jindex], rdx
152 mov [rsp + nb430_jjnr], rcx
153 mov [rsp + nb430_shift], r8
154 mov [rsp + nb430_shiftvec], r9
;# Scalars passed by pointer on the stack: ntype and facel.
155 mov rdi, [rbp + nb430_p_ntype]
156 mov edi, [rdi]
157 mov [rsp + nb430_ntype], edi
158 mov rsi, [rbp + nb430_p_facel]
159 movss xmm0, [rsi]
160 movss [rsp + nb430_facel], xmm0
;# Broadcast the LJ table scale to all four lanes -> tsc.
162 mov rax, [rbp + nb430_p_tabscale]
163 movss xmm3, [rax]
164 shufps xmm3, xmm3, 0
165 movaps [rsp + nb430_tsc], xmm3
;# Broadcast the GB table scale to all four lanes -> gbtsc.
167 mov rbx, [rbp + nb430_p_gbtabscale]
168 movss xmm4, [rbx]
169 shufps xmm4, xmm4, 0
170 movaps [rsp + nb430_gbtsc], xmm4
173 ;# create constant floating-point factors on stack
;# 0.5 is written as a bit pattern, splatted, then 1.0 = 0.5+0.5,
;# 2.0 = 1.0+1.0 and 3.0 = 1.0+2.0 are derived by addition.
174 mov eax, 0x3f000000 ;# half in IEEE (hex)
175 mov [rsp + nb430_half], eax
176 movss xmm1, [rsp + nb430_half]
177 shufps xmm1, xmm1, 0 ;# splat to all elements
178 movaps xmm2, xmm1
179 addps xmm2, xmm2 ;# one
180 movaps xmm3, xmm2
181 addps xmm2, xmm2 ;# two
182 addps xmm3, xmm2 ;# three
183 movaps [rsp + nb430_half], xmm1
184 movaps [rsp + nb430_three], xmm3
;# Work distribution: atomically claim the next outer-list index from the
;# counter that the 'count' argument points to.
;# Each attempt reads nn0 = *count into eax and tries to publish
;# nn1 = nn0+1 with a locked compare-exchange. If *count changed in the
;# meantime, cmpxchg reloads eax with the current value and clears ZF, so
;# we retry from .nb430_spinlock.
;# The memory operand must use the full 64-bit pointer in rsi: a 32-bit
;# [esi] address would truncate the pointer in 64-bit mode.
;# On exit: eax = esi = nn0, [nb430_n] = nn0, [nb430_nn1] = min(nn1, nri),
;# ebx = nn1-nn0. Control goes to .nb430_outerstart if ebx > 0, otherwise
;# to .nb430_end.
186 .nb430_threadloop:
187 mov rsi, [rbp + nb430_count] ;# pointer to sync counter
188 mov eax, [rsi]
189 .nb430_spinlock:
190 mov ebx, eax ;# ebx=*count=nn0
191 add ebx, 1 ;# ebx=nn1=nn0+1
192 lock
193 cmpxchg [rsi], ebx ;# write nn1 to *counter,
194 ;# if it hasnt changed.
195 ;# or reread *counter to eax.
196 pause ;# -> better p4 performance (does not touch the flags set by cmpxchg)
197 jnz .nb430_spinlock
199 ;# if(nn1>nri) nn1=nri
200 mov ecx, [rsp + nb430_nri]
201 mov edx, ecx
202 sub ecx, ebx ;# ecx = nri-nn1, only the flags are used
203 cmovle ebx, edx ;# if(nn1>nri) nn1=nri
204 ;# Cleared the spinlock if we got here.
205 ;# eax contains nn0, ebx contains nn1.
206 mov [rsp + nb430_n], eax
207 mov [rsp + nb430_nn1], ebx
208 sub ebx, eax ;# calc number of outer lists
209 mov esi, eax ;# copy n to esi (mov leaves the flags from sub intact)
210 jg .nb430_outerstart
211 jmp .nb430_end
;# Outer loop over i atoms.
;# .nb430_outerstart adds the number of claimed outer lists (ebx) to the
;# nouter counter. .nb430_outer then sets up one i atom, with rsi = n:
;#   is3  = 3*shift[n], ii = iinr[n], ii3 = 3*ii
;#   iq   = facel*charge[ii], isai = invsqrta[ii] (both splatted)
;#   ntia = 2*ntype*type[ii]
;#   ix/iy/iz = shiftvec[is3..] + pos[ii3..] (splatted)
;# and prepares the inner-loop range jindex[n]..jindex[n+1].
213 .nb430_outerstart:
214 ;# ebx contains number of outer iterations
215 add ebx, [rsp + nb430_nouter]
216 mov [rsp + nb430_nouter], ebx
218 .nb430_outer:
219 mov rax, [rsp + nb430_shift] ;# rax = pointer into shift[]
220 mov ebx, [rax + rsi*4] ;# ebx=shift[n]
222 lea rbx, [rbx + rbx*2] ;# rbx=3*is
223 mov [rsp + nb430_is3],ebx ;# store is3
225 mov rax, [rsp + nb430_shiftvec] ;# rax = base of shiftvec[]
;# xmm0-xmm2 = x/y/z components of the shift vector
227 movss xmm0, [rax + rbx*4]
228 movss xmm1, [rax + rbx*4 + 4]
229 movss xmm2, [rax + rbx*4 + 8]
231 mov rcx, [rsp + nb430_iinr] ;# rcx = pointer into iinr[]
232 mov ebx, [rcx + rsi*4] ;# ebx =ii
233 mov [rsp + nb430_ii], ebx
;# xmm3 = iq = facel*charge[ii], splatted to all lanes
235 mov rdx, [rbp + nb430_charge]
236 movss xmm3, [rdx + rbx*4]
237 mulss xmm3, [rsp + nb430_facel]
238 shufps xmm3, xmm3, 0
240 mov rdx, [rbp + nb430_invsqrta] ;# load invsqrta[ii]
241 movss xmm4, [rdx + rbx*4]
242 shufps xmm4, xmm4, 0
;# ntia = 2*ntype*type[ii]; used as base index into vdwparam[]
244 mov rdx, [rbp + nb430_type]
245 mov edx, [rdx + rbx*4]
246 imul edx, [rsp + nb430_ntype]
247 shl edx, 1
248 mov [rsp + nb430_ntia], edx
250 lea rbx, [rbx + rbx*2] ;# rbx = 3*ii=ii3
251 mov rax, [rbp + nb430_pos] ;# rax = base of pos[]
;# add the i atom coordinates to the shift vector
253 addss xmm0, [rax + rbx*4]
254 addss xmm1, [rax + rbx*4 + 4]
255 addss xmm2, [rax + rbx*4 + 8]
257 movaps [rsp + nb430_iq], xmm3
258 movaps [rsp + nb430_isai], xmm4
;# splat the shifted i coordinates to all four lanes
260 shufps xmm0, xmm0, 0
261 shufps xmm1, xmm1, 0
262 shufps xmm2, xmm2, 0
264 movaps [rsp + nb430_ix], xmm0
265 movaps [rsp + nb430_iy], xmm1
266 movaps [rsp + nb430_iz], xmm2
268 mov [rsp + nb430_ii3], ebx
270 ;# clear vctot, Vvdwtot, dvdasum and i forces
271 xorps xmm4, xmm4
272 movaps [rsp + nb430_vctot], xmm4
273 movaps [rsp + nb430_Vvdwtot], xmm4
274 movaps [rsp + nb430_dvdasum], xmm4
275 movaps [rsp + nb430_fix], xmm4
276 movaps [rsp + nb430_fiy], xmm4
277 movaps [rsp + nb430_fiz], xmm4
279 mov rax, [rsp + nb430_jindex]
280 mov ecx, [rax + rsi*4] ;# jindex[n]
281 mov edx, [rax + rsi*4 + 4] ;# jindex[n+1]
282 sub edx, ecx ;# number of innerloop atoms
284 mov rsi, [rbp + nb430_pos]
285 mov rdi, [rbp + nb430_faction]
286 mov rax, [rsp + nb430_jjnr]
287 shl ecx, 2 ;# byte offset of jjnr[nj0] (4-byte entries)
288 add rax, rcx
289 mov [rsp + nb430_innerjjnr], rax ;# pointer to jjnr[nj0]
290 mov ecx, edx
291 sub edx, 4
292 add ecx, [rsp + nb430_ninner]
293 mov [rsp + nb430_ninner], ecx
294 add edx, 0 ;# re-set the flags on edx for the jge below
295 mov [rsp + nb430_innerk], edx ;# innerk = number of innerloop atoms - 4
;# at least four j atoms -> unrolled loop, otherwise handle the remainder
297 jge .nb430_unroll_loop
298 jmp .nb430_finish_inner
;# Inner loop, unrolled four times: one j atom per SSE lane.
;# Per iteration:
;#   isaprod = isai*isaj, gbscale = isaprod*gbtsc, qq = iq*isaprod*charge[j]
;#   c6/c12 from vdwparam[ntia + 2*type[j]]
;#   dr = pos[j] - (ix,iy,iz); rinv from rsqrtps plus one Newton-Raphson
;#   step; r = rsq*rinv
;#   cubic-spline lookup in GBtab (index r*gbscale) and VFtab (index
;#   r*tsc; dispersion and repulsion entries)
;#   accumulation of vctot, Vvdwtot, dvdasum, dvda[j], i forces and
;#   faction[j].
;# Register use: eax/ebx/ecx/edx = jnr1-4 for the whole iteration;
;# r8-r11 hold j3 = 3*jnr, are parked in mm0-mm3 while r8-r11 serve as GB
;# table indices, and are restored before the force update.
299 .nb430_unroll_loop:
300 ;# quad-unroll innerloop here
301 mov rdx, [rsp + nb430_innerjjnr] ;# pointer to jjnr[k]
302 mov eax, [rdx]
303 mov ebx, [rdx + 4]
304 mov ecx, [rdx + 8]
305 mov edx, [rdx + 12] ;# eax-edx=jnr1-4
307 add qword ptr [rsp + nb430_innerjjnr], 16 ;# advance pointer (unrolled 4)
309 ;# load isaj
310 mov rsi, [rbp + nb430_invsqrta]
311 movss xmm3, [rsi + rax*4]
312 movss xmm4, [rsi + rcx*4]
313 movss xmm6, [rsi + rbx*4]
314 movss xmm7, [rsi + rdx*4]
315 movaps xmm2, [rsp + nb430_isai]
316 shufps xmm3, xmm6, 0
317 shufps xmm4, xmm7, 0
318 shufps xmm3, xmm4, 136 ;# 10001000 ;# all isaj in xmm3
319 mulps xmm2, xmm3
321 movaps [rsp + nb430_isaprod], xmm2
322 movaps xmm1, xmm2
323 mulps xmm1, [rsp + nb430_gbtsc]
324 movaps [rsp + nb430_gbscale], xmm1
326 mov rsi, [rbp + nb430_charge] ;# base of charge[]
328 movss xmm3, [rsi + rax*4]
329 movss xmm4, [rsi + rcx*4]
330 movss xmm6, [rsi + rbx*4]
331 movss xmm7, [rsi + rdx*4]
333 mulps xmm2, [rsp + nb430_iq]
334 shufps xmm3, xmm6, 0
335 shufps xmm4, xmm7, 0
336 shufps xmm3, xmm4, 136 ;# 10001000 ;# all charges in xmm3
337 mulps xmm3, xmm2
338 movaps [rsp + nb430_qq], xmm3
340 ;# vdw parameters
;# r12-r15 = ntia + 2*type[j]: index of the (c6,c12) pair in vdwparam[]
341 mov rsi, [rbp + nb430_type]
342 mov r12d, [rsi + rax*4]
343 mov r13d, [rsi + rbx*4]
344 mov r14d, [rsi + rcx*4]
345 mov r15d, [rsi + rdx*4]
346 shl r12d, 1
347 shl r13d, 1
348 shl r14d, 1
349 shl r15d, 1
350 mov edi, [rsp + nb430_ntia]
351 add r12d, edi
352 add r13d, edi
353 add r14d, edi
354 add r15d, edi
356 mov rsi, [rbp + nb430_vdwparam]
357 movlps xmm3, [rsi + r12*4]
358 movlps xmm7, [rsi + r14*4]
359 movhps xmm3, [rsi + r13*4]
360 movhps xmm7, [rsi + r15*4]
;# transpose the four (c6,c12) pairs into one c6 and one c12 vector
362 movaps xmm0, xmm3
363 shufps xmm0, xmm7, 136 ;# 10001000
364 shufps xmm3, xmm7, 221 ;# 11011101
366 movaps [rsp + nb430_c6], xmm0
367 movaps [rsp + nb430_c12], xmm3
369 mov rsi, [rbp + nb430_pos] ;# base of pos[]
371 lea r8, [rax + rax*2] ;# r8-r11 = 3*jnr (j3)
372 lea r9, [rbx + rbx*2]
373 lea r10, [rcx + rcx*2]
374 lea r11, [rdx + rdx*2]
376 ;# move four coordinates to xmm0-xmm2
377 movlps xmm4, [rsi + r8*4]
378 movlps xmm5, [rsi + r10*4]
379 movss xmm2, [rsi + r8*4 + 8]
380 movss xmm6, [rsi + r10*4 + 8]
382 movhps xmm4, [rsi + r9*4]
383 movhps xmm5, [rsi + r11*4]
385 movss xmm0, [rsi + r9*4 + 8]
386 movss xmm1, [rsi + r11*4 + 8]
388 shufps xmm2, xmm0, 0
389 shufps xmm6, xmm1, 0
391 movaps xmm0, xmm4
392 movaps xmm1, xmm4
394 shufps xmm2, xmm6, 136 ;# 10001000
396 shufps xmm0, xmm5, 136 ;# 10001000
397 shufps xmm1, xmm5, 221 ;# 11011101
399 ;# calc dr
400 subps xmm0, [rsp + nb430_ix]
401 subps xmm1, [rsp + nb430_iy]
402 subps xmm2, [rsp + nb430_iz]
404 ;# store dr
405 movaps [rsp + nb430_dx], xmm0
406 movaps [rsp + nb430_dy], xmm1
407 movaps [rsp + nb430_dz], xmm2
409 movd mm0, r8 ;# store j3
410 movd mm1, r9
411 movd mm2, r10
412 movd mm3, r11
414 ;# square it
415 mulps xmm0,xmm0
416 mulps xmm1,xmm1
417 mulps xmm2,xmm2
418 addps xmm0, xmm1
419 addps xmm0, xmm2
420 movaps xmm4, xmm0
421 ;# rsq in xmm4
;# rinv = 0.5*lu*(3.0 - rsq*lu*lu): one Newton-Raphson step on the
;# rsqrtps estimate lu
423 rsqrtps xmm5, xmm4
424 ;# lookup seed in xmm5
425 movaps xmm2, xmm5
426 mulps xmm5, xmm5
427 movaps xmm1, [rsp + nb430_three]
428 mulps xmm5, xmm4 ;# rsq*lu*lu
429 movaps xmm0, [rsp + nb430_half]
430 subps xmm1, xmm5 ;# 3.0-rsq*lu*lu
431 mulps xmm1, xmm2
432 mulps xmm0, xmm1 ;# xmm0=rinv
433 mulps xmm4, xmm0 ;# xmm4=r
434 movaps [rsp + nb430_r], xmm4
435 movaps [rsp + nb430_rinv], xmm0
437 movaps xmm8, xmm4 ;# r
438 mulps xmm4, [rsp + nb430_gbscale] ;# rgbtab
439 mulps xmm8, [rsp + nb430_tsc] ;# rtab
441 ;# truncate and convert to integers
442 cvttps2dq xmm5, xmm4 ;# gb
443 cvttps2dq xmm9, xmm8 ;# lj
445 ;# convert back to float
446 cvtdq2ps xmm6, xmm5 ;# gb
447 cvtdq2ps xmm10, xmm9 ;# lj
449 ;# multiply by 4 and 8, respectively
;# (floats per table point: 4 in GBtab, 8 in VFtab as read below)
450 pslld xmm5, 2 ;# gb
451 pslld xmm9, 3 ;# lj
453 ;# move to integer registers
454 movhlps xmm7, xmm5 ;# gb
455 movhlps xmm11, xmm9 ;# lj
456 movd r8d, xmm5 ;# gb
457 movd r12d, xmm9 ;# lj
458 movd r10d, xmm7 ;# gb
459 movd r14d, xmm11 ;# lj
460 pshufd xmm5, xmm5, 1 ;# gb
461 pshufd xmm9, xmm9, 1 ;# lj
462 pshufd xmm7, xmm7, 1 ;# gb
463 pshufd xmm11, xmm11, 1 ;# lj
464 movd r9d, xmm5 ;# gb
465 movd r13d, xmm9 ;# lj
466 movd r11d, xmm7 ;# gb
467 movd r15d, xmm11 ;# lj
468 ;# GB indices: r8-r11 LJ indices: r12-r15
470 ;# calculate eps
;# eps = fractional part of the scaled distance
471 subps xmm4, xmm6 ;# gb
472 subps xmm8, xmm10 ;# lj
473 movaps [rsp + nb430_epsgb], xmm4 ;# gb eps
474 movaps [rsp + nb430_eps], xmm8 ;# lj eps
476 mov rsi, [rbp + nb430_GBtab]
477 mov rdi, [rbp + nb430_VFtab]
479 ;# load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11
;# (repulsion entries sit 16 bytes after the dispersion entries)
480 movlps xmm1, [rsi + r8*4] ;# Y1c F1c
481 movlps xmm5, [rdi + r12*4] ;# Y1d F1d
482 movlps xmm9, [rdi + r12*4 + 16] ;# Y1r F1r
484 movlps xmm3, [rsi + r10*4] ;# Y3c F3c
485 movlps xmm7, [rdi + r14*4] ;# Y3d F3d
486 movlps xmm11, [rdi + r14*4 + 16] ;# Y3r F3r
488 movhps xmm1, [rsi + r9*4] ;# Y1c F1c Y2c F2c
489 movhps xmm5, [rdi + r13*4] ;# Y1d F1d Y2d F2d
490 movhps xmm9, [rdi + r13*4 + 16] ;# Y1r F1r Y2r F2r
492 movhps xmm3, [rsi + r11*4] ;# Y3c F3c Y4c F4c
493 movhps xmm7, [rdi + r15*4] ;# Y3d F3d Y4d F4d
494 movhps xmm11, [rdi + r15*4 + 16] ;# Y3r F3r Y4r F4r
496 movaps xmm0, xmm1
497 movaps xmm4, xmm5
498 movaps xmm8, xmm9
499 shufps xmm0, xmm3, 136 ;# 10001000 => Y1c Y2c Y3c Y4c
500 shufps xmm4, xmm7, 136 ;# 10001000 => Y1d Y2d Y3d Y4d
501 shufps xmm8, xmm11, 136 ;# 10001000 => Y1r Y2r Y3r Y4r
502 shufps xmm1, xmm3, 221 ;# 11011101 => F1c F2c F3c F4c
503 shufps xmm5, xmm7, 221 ;# 11011101 => F1d F2d F3d F4d
504 shufps xmm9, xmm11, 221 ;# 11011101 => F1r F2r F3r F4r
506 movlps xmm3, [rsi + r8*4 + 8] ;# G1c H1c
507 movlps xmm7, [rdi + r12*4 + 8] ;# G1d H1d
508 movlps xmm11, [rdi + r12*4 + 24] ;# G1r H1r
510 movlps xmm12, [rsi + r10*4 + 8] ;# G3c H3c
511 movlps xmm13, [rdi + r14*4 + 8] ;# G3d H3d
512 movlps xmm14, [rdi + r14*4 + 24] ;# G3r H3r
514 movhps xmm3, [rsi + r9*4 + 8] ;# G1c H1c G2c H2c
515 movhps xmm7, [rdi + r13*4 + 8] ;# G1d H1d G2d H2d
516 movhps xmm11, [rdi + r13*4 + 24] ;# G1r H1r G2r H2r
518 movhps xmm12, [rsi + r11*4 + 8] ;# G3c H3c G4c H4c
519 movhps xmm13, [rdi + r15*4 + 8] ;# G3d H3d G4d H4d
520 movhps xmm14, [rdi + r15*4 + 24] ;# G3r H3r G4r H4r
521 movaps xmm2, xmm3
522 movaps xmm6, xmm7
523 movaps xmm10, xmm11
525 shufps xmm2, xmm12, 136 ;# 10001000 => G1c G2c G3c G4c
526 shufps xmm6, xmm13, 136 ;# 10001000 => G1d G2d G3d G4d
527 shufps xmm10, xmm14, 136 ;# 10001000 => G1r G2r G3r G4r
528 shufps xmm3, xmm12, 221 ;# 11011101 => H1c H2c H3c H4c
529 shufps xmm7, xmm13, 221 ;# 11011101 => H1d H2d H3d H4d
530 shufps xmm11, xmm14, 221 ;# 11011101 => H1r H2r H3r H4r
531 ;# table data ready. Coul in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11
;# Spline evaluation, three tables in parallel:
;#   VV = Y + eps*(F + G*eps + H*eps^2),  FF = F + 2*G*eps + 3*H*eps^2
533 movaps xmm12, [rsp + nb430_epsgb]
534 movaps xmm13, [rsp + nb430_eps]
536 mulps xmm3, xmm12 ;# Heps
537 mulps xmm7, xmm13
538 mulps xmm11, xmm13
539 mulps xmm2, xmm12 ;# Geps
540 mulps xmm6, xmm13
541 mulps xmm10, xmm13
542 mulps xmm3, xmm12 ;# Heps2
543 mulps xmm7, xmm13
544 mulps xmm11, xmm13
546 addps xmm1, xmm2 ;# F+Geps
547 addps xmm5, xmm6
548 addps xmm9, xmm10
549 addps xmm1, xmm3 ;# F+Geps+Heps2 = Fp
550 addps xmm5, xmm7
551 addps xmm9, xmm11
552 addps xmm3, xmm3 ;# 2*Heps2
553 addps xmm7, xmm7
554 addps xmm11, xmm11
555 addps xmm3, xmm2 ;# 2*Heps2+Geps
556 addps xmm7, xmm6
557 addps xmm11, xmm10
558 addps xmm3, xmm1 ;# FF = Fp + 2*Heps2 + Geps
559 addps xmm7, xmm5
560 addps xmm11, xmm9
561 mulps xmm1, xmm12 ;# eps*Fp
562 mulps xmm5, xmm13
563 mulps xmm9, xmm13
564 addps xmm1, xmm0 ;# VV
565 addps xmm5, xmm4
566 addps xmm9, xmm8
567 mulps xmm1, [rsp + nb430_qq] ;# VV*qq = vcoul
568 mulps xmm5, [rsp + nb430_c6] ;# vnb6
569 mulps xmm9, [rsp + nb430_c12] ;# vnb12
570 mulps xmm3, [rsp + nb430_qq] ;# FF*qq = fij
571 mulps xmm7, [rsp + nb430_c6] ;# fijD
572 mulps xmm11, [rsp + nb430_c12] ;#fijR
574 addps xmm11, xmm7 ;# fijD+fijR
575 mulps xmm11, [rsp + nb430_tsc] ;# (fijD+fijR)*tabscale
577 ;# accumulate Vvdwtot
578 addps xmm5, [rsp + nb430_Vvdwtot]
579 addps xmm5, xmm9
580 movaps [rsp + nb430_Vvdwtot], xmm5
582 mov rsi, [rbp + nb430_dvda]
584 ;# Calculate dVda
585 mulps xmm3, [rsp + nb430_gbscale] ;# fijC=qq*FF*gbscale
586 movaps xmm6, xmm3
587 mulps xmm6, [rsp + nb430_r]
588 addps xmm6, xmm1 ;# vcoul+fijC*r
590 addps xmm3, xmm11 ;# fijC+fijD+fijR
592 ;# increment vctot
593 addps xmm1, [rsp + nb430_vctot]
594 movaps [rsp + nb430_vctot], xmm1
596 ;# xmm6=(vcoul+fijC*r)
;# negate it: xmm6 = xmm7 = -(vcoul+fijC*r)
597 xorps xmm7, xmm7
598 subps xmm7, xmm6
599 movaps xmm6, xmm7
601 ;# update dvdasum
602 addps xmm7, [rsp + nb430_dvdasum]
603 movaps [rsp + nb430_dvdasum], xmm7
605 ;# update j atoms dvdaj
;# spread the four lanes of xmm6 into the low element of four registers
606 movhlps xmm7, xmm6
607 movaps xmm5, xmm6
608 movaps xmm4, xmm7
609 shufps xmm5, xmm5, 0x1
610 shufps xmm4, xmm4, 0x1
612 ;# xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4
613 addss xmm6, [rsi + rax*4]
614 addss xmm5, [rsi + rbx*4]
615 addss xmm7, [rsi + rcx*4]
616 addss xmm4, [rsi + rdx*4]
617 movss [rsi + rax*4], xmm6
618 movss [rsi + rbx*4], xmm5
619 movss [rsi + rcx*4], xmm7
620 movss [rsi + rdx*4], xmm4
;# xmm4 = fscal = -(fijC+fijD+fijR)*rinv
622 xorps xmm4, xmm4
623 mulps xmm3, [rsp + nb430_rinv]
624 subps xmm4, xmm3
626 movd r8, mm0 ;# fetch j3
627 movd r9, mm1
628 movd r10, mm2
629 movd r11, mm3
631 movaps xmm9, xmm4
632 movaps xmm10, xmm4
633 movaps xmm11, xmm4
;# xmm9-xmm11 = fscal*dx, fscal*dy, fscal*dz
635 mulps xmm9, [rsp + nb430_dx]
636 mulps xmm10, [rsp + nb430_dy]
637 mulps xmm11, [rsp + nb430_dz]
639 ;# accumulate i forces
640 movaps xmm12, [rsp + nb430_fix]
641 movaps xmm13, [rsp + nb430_fiy]
642 movaps xmm14, [rsp + nb430_fiz]
643 addps xmm12, xmm9
644 addps xmm13, xmm10
645 addps xmm14, xmm11
646 movaps [rsp + nb430_fix], xmm12
647 movaps [rsp + nb430_fiy], xmm13
648 movaps [rsp + nb430_fiz], xmm14
650 mov rsi, [rbp + nb430_faction]
651 ;# the fj's - start by accumulating x & y forces from memory
652 movlps xmm0, [rsi + r8*4] ;# x1 y1 - -
653 movlps xmm1, [rsi + r10*4] ;# x3 y3 - -
654 movhps xmm0, [rsi + r9*4] ;# x1 y1 x2 y2
655 movhps xmm1, [rsi + r11*4] ;# x3 y3 x4 y4
657 movaps xmm8, xmm9
658 unpcklps xmm9, xmm10 ;# x1 y1 x2 y2
659 unpckhps xmm8, xmm10 ;# x3 y3 x4 y4
661 ;# update fjx and fjy
662 addps xmm0, xmm9
663 addps xmm1, xmm8
665 movlps [rsi + r8*4], xmm0
666 movlps [rsi + r10*4], xmm1
667 movhps [rsi + r9*4], xmm0
668 movhps [rsi + r11*4], xmm1
670 ;# xmm11: fjz1 fjz2 fjz3 fjz4
671 pshufd xmm10, xmm11, 1 ;# fjz2 - - -
672 movhlps xmm9, xmm11 ;# fjz3 - - -
673 pshufd xmm8, xmm11, 3 ;# fjz4 - - -
675 addss xmm11, [rsi + r8*4 + 8]
676 addss xmm10, [rsi + r9*4 + 8]
677 addss xmm9, [rsi + r10*4 + 8]
678 addss xmm8, [rsi + r11*4 + 8]
679 movss [rsi + r8*4 + 8], xmm11
680 movss [rsi + r9*4 + 8], xmm10
681 movss [rsi + r10*4 + 8], xmm9
682 movss [rsi + r11*4 + 8], xmm8
684 ;# should we do one more iteration?
;# innerk -= 4; keep looping while the result is still >= 0
685 sub dword ptr [rsp + nb430_innerk], 4
686 jl .nb430_finish_inner
687 jmp .nb430_unroll_loop
;# Remainder handling, part 1.
;# .nb430_finish_inner adds 4 back to innerk so that it holds the number
;# of j atoms left over from the unrolled loop; if bit 1 is set,
;# .nb430_dopair processes two of them, otherwise control goes to
;# .nb430_checksingle.
;# .nb430_dopair performs the same computation as the unrolled loop with
;# the two j atoms in the low two lanes. The accumulators (Vvdwtot, vctot,
;# dvdasum, fix/fiy/fiz) are written back with movlps so that only those
;# two lanes are updated. j3 stays in r8/r9; GB and LJ table indices use
;# r12-r15.
688 .nb430_finish_inner:
689 ;# check if at least two particles remain
690 add dword ptr [rsp + nb430_innerk], 4
691 mov edx, [rsp + nb430_innerk]
692 and edx, 2
693 jnz .nb430_dopair
694 jmp .nb430_checksingle
695 .nb430_dopair:
696 mov rcx, [rsp + nb430_innerjjnr]
698 mov eax, [rcx]
699 mov ebx, [rcx + 4]
700 add qword ptr [rsp + nb430_innerjjnr], 8
702 ;# load isaj
703 mov rsi, [rbp + nb430_invsqrta]
704 movss xmm3, [rsi + rax*4]
705 movss xmm6, [rsi + rbx*4]
706 movaps xmm2, [rsp + nb430_isai]
707 unpcklps xmm3, xmm6
708 mulps xmm2, xmm3
709 movaps [rsp + nb430_isaprod], xmm2
711 movaps xmm1, xmm2
712 mulps xmm1, [rsp + nb430_gbtsc]
713 movaps [rsp + nb430_gbscale], xmm1
715 mov rsi, [rbp + nb430_charge] ;# base of charge[]
717 movss xmm3, [rsi + rax*4]
718 movss xmm6, [rsi + rbx*4]
719 unpcklps xmm3, xmm6
720 mulps xmm2, [rsp + nb430_iq]
721 mulps xmm3, xmm2
722 movaps [rsp + nb430_qq], xmm3
724 ;# vdw parameters
725 mov rsi, [rbp + nb430_type]
726 mov r12d, [rsi + rax*4]
727 mov r13d, [rsi + rbx*4]
728 shl r12d, 1
729 shl r13d, 1
730 mov edi, [rsp + nb430_ntia]
731 add r12d, edi
732 add r13d, edi
734 mov rsi, [rbp + nb430_vdwparam]
735 movlps xmm3, [rsi + r12*4]
736 movhps xmm3, [rsi + r13*4]
;# split into c6 and c12 vectors; the upper two lanes are zero
738 xorps xmm7, xmm7
739 movaps xmm0, xmm3
740 shufps xmm0, xmm7, 136 ;# 10001000
741 shufps xmm3, xmm7, 221 ;# 11011101
743 movaps [rsp + nb430_c6], xmm0
744 movaps [rsp + nb430_c12], xmm3
746 mov rsi, [rbp + nb430_pos] ;# base of pos[]
748 lea r8, [rax + rax*2] ;# j3
749 lea r9, [rbx + rbx*2]
751 ;# move two coordinates to xmm0-xmm2
752 movlps xmm0, [rsi + r8*4] ;# x1 y1 - -
753 movlps xmm1, [rsi + r9*4] ;# x2 y2 - -
755 movss xmm2, [rsi + r8*4 + 8] ;# z1 - - -
756 movss xmm7, [rsi + r9*4 + 8] ;# z2 - - -
758 unpcklps xmm0, xmm1 ;# x1 x2 y1 y2
759 movhlps xmm1, xmm0 ;# y1 y2 - -
760 unpcklps xmm2, xmm7 ;# z1 z2 - -
762 ;# calc dr
763 subps xmm0, [rsp + nb430_ix]
764 subps xmm1, [rsp + nb430_iy]
765 subps xmm2, [rsp + nb430_iz]
767 ;# store dr
768 movaps [rsp + nb430_dx], xmm0
769 movaps [rsp + nb430_dy], xmm1
770 movaps [rsp + nb430_dz], xmm2
772 ;# square it
773 mulps xmm0,xmm0
774 mulps xmm1,xmm1
775 mulps xmm2,xmm2
776 addps xmm0, xmm1
777 addps xmm0, xmm2
778 movaps xmm4, xmm0
779 ;# rsq in xmm4
781 rsqrtps xmm5, xmm4
782 ;# lookup seed in xmm5
783 movaps xmm2, xmm5
784 mulps xmm5, xmm5
785 movaps xmm1, [rsp + nb430_three]
786 mulps xmm5, xmm4 ;# rsq*lu*lu
787 movaps xmm0, [rsp + nb430_half]
788 subps xmm1, xmm5 ;# 3.0-rsq*lu*lu
789 mulps xmm1, xmm2
790 mulps xmm0, xmm1 ;# xmm0=rinv
791 mulps xmm4, xmm0 ;# xmm4=r
792 movaps [rsp + nb430_r], xmm4
793 movaps [rsp + nb430_rinv], xmm0
795 movaps xmm8, xmm4 ;# r
796 mulps xmm4, [rsp + nb430_gbscale] ;# rgbtab
797 mulps xmm8, [rsp + nb430_tsc] ;# rtab
799 ;# truncate and convert to integers
800 cvttps2dq xmm5, xmm4 ;# gb
801 cvttps2dq xmm9, xmm8 ;# lj
803 ;# convert back to float
804 cvtdq2ps xmm6, xmm5 ;# gb
805 cvtdq2ps xmm10, xmm9 ;# lj
807 ;# multiply by 4 and 8, respectively
808 pslld xmm5, 2 ;# gb
809 pslld xmm9, 3 ;# lj
811 ;# move to integer registers
812 movd r12d, xmm5 ;# gb
813 movd r14d, xmm9 ;# lj
814 pshufd xmm5, xmm5, 1 ;# gb
815 pshufd xmm9, xmm9, 1 ;# lj
816 movd r13d, xmm5 ;# gb
817 movd r15d, xmm9 ;# lj
818 ;# GB indices: r12-r13 LJ indices: r14-r15
820 ;# calculate eps
821 subps xmm4, xmm6 ;# gb
822 subps xmm8, xmm10 ;# lj
823 movaps [rsp + nb430_epsgb], xmm4 ;# gb eps
824 movaps [rsp + nb430_eps], xmm8 ;# lj eps
826 mov rsi, [rbp + nb430_GBtab]
827 mov rdi, [rbp + nb430_VFtab]
829 ;# load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11
830 movlps xmm0, [rsi + r12*4] ;# Y1c F1c
831 movlps xmm1, [rsi + r13*4] ;# Y2c F2c
832 movlps xmm4, [rdi + r14*4] ;# Y1d F1d
833 movlps xmm5, [rdi + r15*4] ;# Y2d F2d
834 movlps xmm8, [rdi + r14*4 + 16] ;# Y1r F1r
835 movlps xmm9, [rdi + r15*4 + 16] ;# Y2r F2r
;# interleave so that Y1 Y2 end up in the even register and F1 F2 in the odd one
837 unpcklps xmm0, xmm1
838 movhlps xmm1, xmm0
839 unpcklps xmm4, xmm5
840 movhlps xmm5, xmm4
841 unpcklps xmm8, xmm9
842 movhlps xmm9, xmm8
843 movlps xmm2, [rsi + r12*4 + 8] ;# G1c H1c
844 movlps xmm3, [rsi + r13*4 + 8] ;# G2c H2c
845 movlps xmm6, [rdi + r14*4 + 8] ;# G1d H1d
846 movlps xmm7, [rdi + r15*4 + 8] ;# G2d H2d
847 movlps xmm10, [rdi + r14*4 + 24] ;# G1r H1r
848 movlps xmm11, [rdi + r15*4 + 24] ;# G2r H2r
849 unpcklps xmm2, xmm3
850 movhlps xmm3, xmm2
851 unpcklps xmm6, xmm7
852 movhlps xmm7, xmm6
853 unpcklps xmm10, xmm11
854 movhlps xmm11, xmm10
855 ;# table data ready. Coul in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11
857 movaps xmm12, [rsp + nb430_epsgb]
858 movaps xmm13, [rsp + nb430_eps]
860 mulps xmm3, xmm12 ;# Heps
861 mulps xmm7, xmm13
862 mulps xmm11, xmm13
863 mulps xmm2, xmm12 ;# Geps
864 mulps xmm6, xmm13
865 mulps xmm10, xmm13
866 mulps xmm3, xmm12 ;# Heps2
867 mulps xmm7, xmm13
868 mulps xmm11, xmm13
870 addps xmm1, xmm2 ;# F+Geps
871 addps xmm5, xmm6
872 addps xmm9, xmm10
873 addps xmm1, xmm3 ;# F+Geps+Heps2 = Fp
874 addps xmm5, xmm7
875 addps xmm9, xmm11
876 addps xmm3, xmm3 ;# 2*Heps2
877 addps xmm7, xmm7
878 addps xmm11, xmm11
879 addps xmm3, xmm2 ;# 2*Heps2+Geps
880 addps xmm7, xmm6
881 addps xmm11, xmm10
882 addps xmm3, xmm1 ;# FF = Fp + 2*Heps2 + Geps
883 addps xmm7, xmm5
884 addps xmm11, xmm9
885 mulps xmm1, xmm12 ;# eps*Fp
886 mulps xmm5, xmm13
887 mulps xmm9, xmm13
888 addps xmm1, xmm0 ;# VV
889 addps xmm5, xmm4
890 addps xmm9, xmm8
891 mulps xmm1, [rsp + nb430_qq] ;# VV*qq = vcoul
892 mulps xmm5, [rsp + nb430_c6] ;# vnb6
893 mulps xmm9, [rsp + nb430_c12] ;# vnb12
894 mulps xmm3, [rsp + nb430_qq] ;# FF*qq = fij
895 mulps xmm7, [rsp + nb430_c6] ;# fijD
896 mulps xmm11, [rsp + nb430_c12] ;#fijR
898 addps xmm11, xmm7 ;# fijD+fijR
899 mulps xmm11, [rsp + nb430_tsc] ;# (fijD+fijR)*tabscale
901 ;# accumulate Vvdwtot
902 addps xmm5, [rsp + nb430_Vvdwtot]
903 addps xmm5, xmm9
904 movlps [rsp + nb430_Vvdwtot], xmm5
906 mov rsi, [rbp + nb430_dvda]
908 ;# Calculate dVda
909 mulps xmm3, [rsp + nb430_gbscale] ;# fijC=qq*FF*gbscale
910 movaps xmm6, xmm3
911 mulps xmm6, [rsp + nb430_r]
912 addps xmm6, xmm1 ;# vcoul+fijC*r
914 addps xmm3, xmm11 ;# fijC+fijD+fijR
916 ;# increment vctot
917 addps xmm1, [rsp + nb430_vctot]
918 movlps [rsp + nb430_vctot], xmm1
920 ;# xmm6=(vcoul+fijC*r)
921 xorps xmm7, xmm7
922 subps xmm7, xmm6
923 movaps xmm6, xmm7
925 ;# update dvdasum
926 addps xmm7, [rsp + nb430_dvdasum]
927 movlps [rsp + nb430_dvdasum], xmm7
929 ;# update j atoms dvdaj
930 movaps xmm5, xmm6
931 shufps xmm5, xmm5, 0x1
933 ;# xmm6=dvdaj1 xmm5=dvdaj2
934 addss xmm6, [rsi + rax*4]
935 addss xmm5, [rsi + rbx*4]
936 movss [rsi + rax*4], xmm6
937 movss [rsi + rbx*4], xmm5
;# xmm4 = fscal = -(fijC+fijD+fijR)*rinv
939 xorps xmm4, xmm4
940 mulps xmm3, [rsp + nb430_rinv]
941 subps xmm4, xmm3
943 movaps xmm9, xmm4
944 movaps xmm10, xmm4
945 movaps xmm11, xmm4
947 mulps xmm9, [rsp + nb430_dx]
948 mulps xmm10, [rsp + nb430_dy]
949 mulps xmm11, [rsp + nb430_dz]
952 ;# accumulate i forces
953 movaps xmm12, [rsp + nb430_fix]
954 movaps xmm13, [rsp + nb430_fiy]
955 movaps xmm14, [rsp + nb430_fiz]
956 addps xmm12, xmm9
957 addps xmm13, xmm10
958 addps xmm14, xmm11
959 movlps [rsp + nb430_fix], xmm12
960 movlps [rsp + nb430_fiy], xmm13
961 movlps [rsp + nb430_fiz], xmm14
963 mov rsi, [rbp + nb430_faction]
964 ;# the fj's - start by accumulating x & y forces from memory
965 movlps xmm0, [rsi + r8*4] ;# x1 y1 - -
966 movhps xmm0, [rsi + r9*4] ;# x1 y1 x2 y2
968 unpcklps xmm9, xmm10 ;# x1 y1 x2 y2
969 addps xmm0, xmm9
971 movlps [rsi + r8*4], xmm0
972 movhps [rsi + r9*4], xmm0
974 ;# z forces
975 pshufd xmm8, xmm11, 1
976 addss xmm11, [rsi + r8*4 + 8]
977 addss xmm8, [rsi + r9*4 + 8]
978 movss [rsi + r8*4 + 8], xmm11
979 movss [rsi + r9*4 + 8], xmm8
;# Remainder handling, part 2.
;# .nb430_checksingle tests bit 0 of innerk (number of leftover j atoms);
;# if it is set, .nb430_dosingle processes the last j atom, otherwise
;# control goes to .nb430_updateouterdata.
;# .nb430_dosingle performs the same computation as the unrolled loop using
;# scalar (ss) instructions on lane 0 only. The accumulators are written
;# back with movss so that only lane 0 is updated.
;# Register use: eax = jnr, r8 = j3 = 3*jnr, r12 = GB table index,
;# r14 = LJ table index.
981 .nb430_checksingle:
982 mov edx, [rsp + nb430_innerk]
983 and edx, 1
984 jnz .nb430_dosingle
985 jmp .nb430_updateouterdata
986 .nb430_dosingle:
990 mov rcx, [rsp + nb430_innerjjnr]
991 mov eax, [rcx]
993 ;# load isaj
994 mov rsi, [rbp + nb430_invsqrta]
995 movss xmm3, [rsi + rax*4]
996 movaps xmm2, [rsp + nb430_isai]
997 mulss xmm2, xmm3
998 movaps [rsp + nb430_isaprod], xmm2
1000 movaps xmm1, xmm2
1001 mulss xmm1, [rsp + nb430_gbtsc]
1002 movaps [rsp + nb430_gbscale], xmm1
1004 mov rsi, [rbp + nb430_charge] ;# base of charge[]
1006 movss xmm3, [rsi + rax*4]
1007 mulss xmm2, [rsp + nb430_iq]
1008 mulss xmm3, xmm2
1009 movaps [rsp + nb430_qq], xmm3
1011 ;# vdw parameters
1012 mov rsi, [rbp + nb430_type]
1013 mov r12d, [rsi + rax*4]
1014 shl r12d, 1
1015 mov edi, [rsp + nb430_ntia]
1016 add r12d, edi
1018 mov rsi, [rbp + nb430_vdwparam]
1019 movss xmm0, [rsi + r12*4]
1020 movss xmm3, [rsi + r12*4 + 4]
1021 movaps [rsp + nb430_c6], xmm0
1022 movaps [rsp + nb430_c12], xmm3
1024 mov rsi, [rbp + nb430_pos] ;# base of pos[]
1026 lea r8, [rax + rax*2] ;# j3
1028 ;# move one coordinate to xmm0-xmm2
1029 movss xmm0, [rsi + r8*4]
1030 movss xmm1, [rsi + r8*4 + 4]
1031 movss xmm2, [rsi + r8*4 + 8]
1033 ;# calc dr
1034 subss xmm0, [rsp + nb430_ix]
1035 subss xmm1, [rsp + nb430_iy]
1036 subss xmm2, [rsp + nb430_iz]
1038 ;# store dr
1039 movaps [rsp + nb430_dx], xmm0
1040 movaps [rsp + nb430_dy], xmm1
1041 movaps [rsp + nb430_dz], xmm2
1043 ;# square it
1044 mulss xmm0,xmm0
1045 mulss xmm1,xmm1
1046 mulss xmm2,xmm2
1047 addss xmm0, xmm1
1048 addss xmm0, xmm2
1049 movaps xmm4, xmm0
1050 ;# rsq in xmm4
;# rinv = 0.5*lu*(3.0 - rsq*lu*lu): one Newton-Raphson step on the
;# rsqrtss estimate lu
1052 rsqrtss xmm5, xmm4
1053 ;# lookup seed in xmm5
1054 movaps xmm2, xmm5
1055 mulss xmm5, xmm5
1056 movaps xmm1, [rsp + nb430_three]
1057 mulss xmm5, xmm4 ;# rsq*lu*lu
1058 movaps xmm0, [rsp + nb430_half]
1059 subss xmm1, xmm5 ;# 3.0-rsq*lu*lu
1060 mulss xmm1, xmm2
1061 mulss xmm0, xmm1 ;# xmm0=rinv
1062 mulss xmm4, xmm0 ;# xmm4=r
1063 movaps [rsp + nb430_r], xmm4
1064 movaps [rsp + nb430_rinv], xmm0
1066 movaps xmm8, xmm4 ;# r
1067 mulss xmm4, [rsp + nb430_gbscale] ;# rgbtab
1068 mulss xmm8, [rsp + nb430_tsc] ;# rtab
1070 ;# truncate and convert to integers
1071 cvttss2si r12d, xmm4 ;# gb
1072 cvttss2si r14d, xmm8 ;# lj
1074 ;# convert back to float
1075 cvtsi2ss xmm6, r12d ;# gb
1076 cvtsi2ss xmm10, r14d ;# lj
1078 ;# multiply by 4 and 8, respectively
1079 shl r12d, 2 ;# gb
1080 shl r14d, 3 ;# lj
1082 ;# GB index: r12 LJ index: r14
1084 ;# calculate eps
1085 subss xmm4, xmm6 ;# gb
1086 subss xmm8, xmm10 ;# lj
1087 movaps [rsp + nb430_epsgb], xmm4 ;# gb eps
1088 movaps [rsp + nb430_eps], xmm8 ;# lj eps
1090 mov rsi, [rbp + nb430_GBtab]
1091 mov rdi, [rbp + nb430_VFtab]
1093 ;# load GB table data to xmm0-xmm3, disp to xmm4-xmm7, rep. to xmm8-xmm11
;# each group is Y, F, G, H for one table point
1094 movss xmm0, [rsi + r12*4]
1095 movss xmm1, [rsi + r12*4 + 4]
1096 movss xmm2, [rsi + r12*4 + 8]
1097 movss xmm3, [rsi + r12*4 + 12]
1098 movss xmm4, [rdi + r14*4]
1099 movss xmm5, [rdi + r14*4 + 4]
1100 movss xmm6, [rdi + r14*4 + 8]
1101 movss xmm7, [rdi + r14*4 + 12]
1102 movss xmm8, [rdi + r14*4 + 16]
1103 movss xmm9, [rdi + r14*4 + 20]
1104 movss xmm10, [rdi + r14*4 + 24]
1105 movss xmm11, [rdi + r14*4 + 28]
1106 ;# table data ready. Coul in xmm0-xmm3 , disp in xmm4-xmm7 , rep. in xmm8-xmm11
;# Spline evaluation, three tables in parallel:
;#   VV = Y + eps*(F + G*eps + H*eps^2),  FF = F + 2*G*eps + 3*H*eps^2
1108 movaps xmm12, [rsp + nb430_epsgb]
1109 movaps xmm13, [rsp + nb430_eps]
1111 mulss xmm3, xmm12 ;# Heps
1112 mulss xmm7, xmm13
1113 mulss xmm11, xmm13
1114 mulss xmm2, xmm12 ;# Geps
1115 mulss xmm6, xmm13
1116 mulss xmm10, xmm13
1117 mulss xmm3, xmm12 ;# Heps2
1118 mulss xmm7, xmm13
1119 mulss xmm11, xmm13
1121 addss xmm1, xmm2 ;# F+Geps
1122 addss xmm5, xmm6
1123 addss xmm9, xmm10
1124 addss xmm1, xmm3 ;# F+Geps+Heps2 = Fp
1125 addss xmm5, xmm7
1126 addss xmm9, xmm11
1127 addss xmm3, xmm3 ;# 2*Heps2
1128 addss xmm7, xmm7
1129 addss xmm11, xmm11
1130 addss xmm3, xmm2 ;# 2*Heps2+Geps
1131 addss xmm7, xmm6
1132 addss xmm11, xmm10
1133 addss xmm3, xmm1 ;# FF = Fp + 2*Heps2 + Geps
1134 addss xmm7, xmm5
1135 addss xmm11, xmm9
1136 mulss xmm1, xmm12 ;# eps*Fp
1137 mulss xmm5, xmm13
1138 mulss xmm9, xmm13
1139 addss xmm1, xmm0 ;# VV
1140 addss xmm5, xmm4
1141 addss xmm9, xmm8
1142 mulss xmm1, [rsp + nb430_qq] ;# VV*qq = vcoul
1143 mulss xmm5, [rsp + nb430_c6] ;# vnb6
1144 mulss xmm9, [rsp + nb430_c12] ;# vnb12
1145 mulss xmm3, [rsp + nb430_qq] ;# FF*qq = fij
1146 mulss xmm7, [rsp + nb430_c6] ;# fijD
1147 mulss xmm11, [rsp + nb430_c12] ;#fijR
1149 addss xmm11, xmm7 ;# fijD+fijR
1150 mulss xmm11, [rsp + nb430_tsc] ;# (fijD+fijR)*tabscale
1152 ;# accumulate Vvdwtot
1153 addss xmm5, [rsp + nb430_Vvdwtot]
1154 addss xmm5, xmm9
1155 movss [rsp + nb430_Vvdwtot], xmm5
1157 mov rsi, [rbp + nb430_dvda]
1159 ;# Calculate dVda
1160 mulss xmm3, [rsp + nb430_gbscale] ;# fijC=qq*FF*gbscale
1161 movaps xmm6, xmm3
1162 mulss xmm6, [rsp + nb430_r]
1163 addss xmm6, xmm1 ;# vcoul+fijC*r
1165 addss xmm3, xmm11 ;# fijC+fijD+fijR
1167 ;# increment vctot
1168 addss xmm1, [rsp + nb430_vctot]
1169 movss [rsp + nb430_vctot], xmm1
1171 ;# xmm6=(vcoul+fijC*r)
;# negate it: xmm6 = xmm7 = -(vcoul+fijC*r)
1172 xorps xmm7, xmm7
1173 subss xmm7, xmm6
1174 movaps xmm6, xmm7
1176 ;# update dvdasum
1177 addss xmm7, [rsp + nb430_dvdasum]
1178 movss [rsp + nb430_dvdasum], xmm7
1180 ;# update j atoms dvdaj
1182 ;# xmm6=dvdaj1
1183 addss xmm6, [rsi + rax*4]
1184 movss [rsi + rax*4], xmm6
;# xmm4 = fscal = -(fijC+fijD+fijR)*rinv
1186 xorps xmm4, xmm4
1187 mulss xmm3, [rsp + nb430_rinv]
1188 subss xmm4, xmm3
1190 movss xmm9, xmm4
1191 movss xmm10, xmm4
1192 movss xmm11, xmm4
1194 mulss xmm9, [rsp + nb430_dx]
1195 mulss xmm10, [rsp + nb430_dy]
1196 mulss xmm11, [rsp + nb430_dz]
1198 ;# accumulate i forces
1199 movaps xmm12, [rsp + nb430_fix]
1200 movaps xmm13, [rsp + nb430_fiy]
1201 movaps xmm14, [rsp + nb430_fiz]
1202 addss xmm12, xmm9
1203 addss xmm13, xmm10
1204 addss xmm14, xmm11
1205 movss [rsp + nb430_fix], xmm12
1206 movss [rsp + nb430_fiy], xmm13
1207 movss [rsp + nb430_fiz], xmm14
1209 mov rsi, [rbp + nb430_faction]
1210 ;# add to j forces
1211 addss xmm9, [rsi + r8*4]
1212 addss xmm10, [rsi + r8*4 + 4]
1213 addss xmm11, [rsi + r8*4 + 8]
1214 movss [rsi + r8*4], xmm9
1215 movss [rsi + r8*4 + 4], xmm10
1216 movss [rsi + r8*4 + 8], xmm11
1218 .nb430_updateouterdata:
1219 mov ecx, [rsp + nb430_ii3]
1220 mov rdi, [rbp + nb430_faction]
1221 mov rsi, [rbp + nb430_fshift]
1222 mov edx, [rsp + nb430_is3]
1224 ;# accumulate i forces in xmm0, xmm1, xmm2
1225 movaps xmm0, [rsp + nb430_fix]
1226 movaps xmm1, [rsp + nb430_fiy]
1227 movaps xmm2, [rsp + nb430_fiz]
1229 movhlps xmm3, xmm0
1230 movhlps xmm4, xmm1
1231 movhlps xmm5, xmm2
1232 addps xmm0, xmm3
1233 addps xmm1, xmm4
1234 addps xmm2, xmm5 ;# sum is in 1/2 in xmm0-xmm2
1236 movaps xmm3, xmm0
1237 movaps xmm4, xmm1
1238 movaps xmm5, xmm2
1240 shufps xmm3, xmm3, 1
1241 shufps xmm4, xmm4, 1
1242 shufps xmm5, xmm5, 1
1243 addss xmm0, xmm3
1244 addss xmm1, xmm4
1245 addss xmm2, xmm5 ;# xmm0-xmm2 has single force in pos0
1247 ;# increment i force
1248 movss xmm3, [rdi + rcx*4]
1249 movss xmm4, [rdi + rcx*4 + 4]
1250 movss xmm5, [rdi + rcx*4 + 8]
1251 subss xmm3, xmm0
1252 subss xmm4, xmm1
1253 subss xmm5, xmm2
1254 movss [rdi + rcx*4], xmm3
1255 movss [rdi + rcx*4 + 4], xmm4
1256 movss [rdi + rcx*4 + 8], xmm5
1258 ;# increment fshift force
1259 movss xmm3, [rsi + rdx*4]
1260 movss xmm4, [rsi + rdx*4 + 4]
1261 movss xmm5, [rsi + rdx*4 + 8]
1262 subss xmm3, xmm0
1263 subss xmm4, xmm1
1264 subss xmm5, xmm2
1265 movss [rsi + rdx*4], xmm3
1266 movss [rsi + rdx*4 + 4], xmm4
1267 movss [rsi + rdx*4 + 8], xmm5
1269 ;# get n from stack
1270 mov esi, [rsp + nb430_n]
1271 ;# get group index for i particle
1272 mov rdx, [rbp + nb430_gid] ;# base of gid[]
1273 mov edx, [rdx + rsi*4] ;# ggid=gid[n]
1275 ;# accumulate total potential energy and update it
1276 movaps xmm7, [rsp + nb430_vctot]
1277 ;# accumulate
1278 movhlps xmm6, xmm7
1279 addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now
1280 movaps xmm6, xmm7
1281 shufps xmm6, xmm6, 1
1282 addss xmm7, xmm6
1284 ;# add earlier value from mem
1285 mov rax, [rbp + nb430_Vc]
1286 addss xmm7, [rax + rdx*4]
1287 ;# move back to mem
1288 movss [rax + rdx*4], xmm7
1290 ;# accumulate total lj energy and update it
1291 movaps xmm7, [rsp + nb430_Vvdwtot]
1292 ;# accumulate
1293 movhlps xmm6, xmm7
1294 addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now
1295 movaps xmm6, xmm7
1296 shufps xmm6, xmm6, 1
1297 addss xmm7, xmm6
1299 ;# add earlier value from mem
1300 mov rax, [rbp + nb430_Vvdw]
1301 addss xmm7, [rax + rdx*4]
1302 ;# move back to mem
1303 movss [rax + rdx*4], xmm7
1305 ;# accumulate dVda and update it
1306 movaps xmm7, [rsp + nb430_dvdasum]
1307 ;# accumulate
1308 movhlps xmm6, xmm7
1309 addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now
1310 movaps xmm6, xmm7
1311 shufps xmm6, xmm6, 1
1312 addss xmm7, xmm6
1314 mov edx, [rsp + nb430_ii]
1315 mov rax, [rbp + nb430_dvda]
1316 addss xmm7, [rax + rdx*4]
1317 movss [rax + rdx*4], xmm7
1319 ;# finish if last
1320 mov ecx, [rsp + nb430_nn1]
1321 ;# esi already loaded with n
1322 inc esi
1323 sub ecx, esi
1324 jz .nb430_outerend
1326 ;# not last, iterate outer loop once more!
1327 mov [rsp + nb430_n], esi
1328 jmp .nb430_outer
1329 .nb430_outerend:
1330 ;# check if more outer neighborlists remain
1331 mov ecx, [rsp + nb430_nri]
1332 ;# esi already loaded with n above
1333 sub ecx, esi
1334 jz .nb430_end
1335 ;# non-zero, do one more workunit
1336 jmp .nb430_threadloop
1337 .nb430_end:
1338 mov eax, [rsp + nb430_nouter]
1339 mov ebx, [rsp + nb430_ninner]
1340 mov rcx, [rbp + nb430_outeriter]
1341 mov rdx, [rbp + nb430_inneriter]
1342 mov [rcx], eax
1343 mov [rdx], ebx
1345 add rsp, 552
1346 emms
1349 pop r15
1350 pop r14
1351 pop r13
1352 pop r12
1354 pop rbx
1355 pop rbp
;# nb_kernel430nf_x86_64_sse -- entry point: symbol export, argument/local offsets, prologue and one-time setup.
;# The register arguments rdi, rsi, rdx, rcx, r8, r9 are copied to the local slots nri (after a dereference),
;# iinr, jindex, jjnr, shift and shiftvec; every other argument is read from the caller's frame through rbp.
;# Both the plain and the underscore-prefixed symbol are exported and label the same address.
1362 .globl nb_kernel430nf_x86_64_sse
1363 .globl _nb_kernel430nf_x86_64_sse
1364 nb_kernel430nf_x86_64_sse:
1365 _nb_kernel430nf_x86_64_sse:
1366 ;# Room for return address and rbp (16 bytes)
;# Offsets below are relative to rbp (arguments passed on the caller's stack).
1367 .equiv nb430nf_fshift, 16
1368 .equiv nb430nf_gid, 24
1369 .equiv nb430nf_pos, 32
1370 .equiv nb430nf_faction, 40
1371 .equiv nb430nf_charge, 48
1372 .equiv nb430nf_p_facel, 56
1373 .equiv nb430nf_argkrf, 64
1374 .equiv nb430nf_argcrf, 72
1375 .equiv nb430nf_Vc, 80
1376 .equiv nb430nf_type, 88
1377 .equiv nb430nf_p_ntype, 96
1378 .equiv nb430nf_vdwparam, 104
1379 .equiv nb430nf_Vvdw, 112
1380 .equiv nb430nf_p_tabscale, 120
1381 .equiv nb430nf_VFtab, 128
1382 .equiv nb430nf_invsqrta, 136
1383 .equiv nb430nf_dvda, 144
1384 .equiv nb430nf_p_gbtabscale, 152
1385 .equiv nb430nf_GBtab, 160
1386 .equiv nb430nf_p_nthreads, 168
1387 .equiv nb430nf_count, 176
1388 .equiv nb430nf_mtx, 184
1389 .equiv nb430nf_outeriter, 192
1390 .equiv nb430nf_inneriter, 200
1391 .equiv nb430nf_work, 208
1392 ;# stack offsets for local variables
1393 ;# bottom of stack is cache-aligned for sse use
;# Offsets 0..256 are 16-byte slots accessed with movaps; 272..328 are 8-byte slots; 336..368 are 4-byte slots.
1394 .equiv nb430nf_ix, 0
1395 .equiv nb430nf_iy, 16
1396 .equiv nb430nf_iz, 32
1397 .equiv nb430nf_iq, 48
1398 .equiv nb430nf_gbtsc, 64
1399 .equiv nb430nf_tsc, 80
1400 .equiv nb430nf_qq, 96
1401 .equiv nb430nf_c6, 112
1402 .equiv nb430nf_c12, 128
1403 .equiv nb430nf_vctot, 144
1404 .equiv nb430nf_Vvdwtot, 160
1405 .equiv nb430nf_half, 176
1406 .equiv nb430nf_three, 192
1407 .equiv nb430nf_isai, 208
1408 .equiv nb430nf_isaprod, 224
1409 .equiv nb430nf_gbscale, 240
1410 .equiv nb430nf_r, 256
1411 .equiv nb430nf_nri, 272
1412 .equiv nb430nf_iinr, 280
1413 .equiv nb430nf_jindex, 288
1414 .equiv nb430nf_jjnr, 296
1415 .equiv nb430nf_shift, 304
1416 .equiv nb430nf_shiftvec, 312
1417 .equiv nb430nf_facel, 320
1418 .equiv nb430nf_innerjjnr, 328
1419 .equiv nb430nf_is3, 336
1420 .equiv nb430nf_ii3, 340
1421 .equiv nb430nf_ntia, 344
1422 .equiv nb430nf_innerk, 348
1423 .equiv nb430nf_n, 352
1424 .equiv nb430nf_nn1, 356
1425 .equiv nb430nf_ntype, 360
1426 .equiv nb430nf_nouter, 364
1427 .equiv nb430nf_ninner, 368
1429 push rbp
1430 mov rbp, rsp ;# rbp now addresses the stack arguments via the offsets above
1431 push rbx
1434 emms ;# MMX registers are used as scratch storage inside the inner loops
1436 push r12
1437 push r13
1438 push r14
1439 push r15
;# Return address + six pushes + 392 bytes = 448 = 28*16, so rsp is 16-byte aligned here
;# provided the caller had rsp 16-byte aligned at the call (SysV AMD64 rule -- assumed, not checked).
1441 sub rsp, 392 ;# local variable stack space (n*16+8)
1443 ;# zero 32-bit iteration counters
1444 mov eax, 0
1445 mov [rsp + nb430nf_nouter], eax
1446 mov [rsp + nb430nf_ninner], eax
1448 mov edi, [rdi] ;# first register argument is a pointer; its 32-bit target becomes nri
1449 mov [rsp + nb430nf_nri], edi
1450 mov [rsp + nb430nf_iinr], rsi
1451 mov [rsp + nb430nf_jindex], rdx
1452 mov [rsp + nb430nf_jjnr], rcx
1453 mov [rsp + nb430nf_shift], r8
1454 mov [rsp + nb430nf_shiftvec], r9
1455 mov rdi, [rbp + nb430nf_p_ntype]
1456 mov edi, [rdi] ;# ntype = *p_ntype
1457 mov [rsp + nb430nf_ntype], edi
1458 mov rsi, [rbp + nb430nf_p_facel]
1459 movss xmm0, [rsi] ;# facel = *p_facel (scalar, stored unsplatted)
1460 movss [rsp + nb430nf_facel], xmm0
1462 mov rax, [rbp + nb430nf_p_tabscale]
1463 movss xmm3, [rax]
1464 shufps xmm3, xmm3, 0 ;# splat *p_tabscale to all four lanes
1465 movaps [rsp + nb430nf_tsc], xmm3
1467 mov rbx, [rbp + nb430nf_p_gbtabscale]
1468 movss xmm4, [rbx]
1469 shufps xmm4, xmm4, 0 ;# splat *p_gbtabscale to all four lanes
1470 movaps [rsp + nb430nf_gbtsc], xmm4
1472 ;# create constant floating-point factors on stack
1473 mov eax, 0x3f000000 ;# half in IEEE (hex)
1474 mov [rsp + nb430nf_half], eax
1475 movss xmm1, [rsp + nb430nf_half]
1476 shufps xmm1, xmm1, 0 ;# splat to all elements
1477 movaps xmm2, xmm1
1478 addps xmm2, xmm2 ;# one
1479 movaps xmm3, xmm2
1480 addps xmm2, xmm2 ;# two
1481 addps xmm3, xmm2 ;# three
1482 movaps [rsp + nb430nf_half], xmm1
1483 movaps [rsp + nb430nf_three], xmm3
;# Work-unit dispatch: atomically claim the next outer-list index from the shared counter
;# whose address is the `count` argument. On exit from the spin loop eax = nn0 (first index
;# claimed) and ebx = nn1 (one past the last, clamped to nri). Control continues at
;# .nb430nf_outerstart when nn1-nn0 > 0, otherwise at .nb430nf_end.
;# The compare-exchange must address the counter through the full 64-bit rsi: using esi
;# would add an address-size override and truncate the pointer to 32 bits.
1485 .nb430nf_threadloop:
1486 mov rsi, [rbp + nb430nf_count] ;# pointer to sync counter
1487 mov eax, [rsi]
1488 .nb430nf_spinlock:
1489 mov ebx, eax ;# ebx=*count=nn0
1490 add ebx, 1 ;# ebx=nn1=nn0+1
1491 lock
1492 cmpxchg [rsi], ebx ;# write nn1 to *counter,
1493 ;# if it hasnt changed.
1494 ;# or reread *counter to eax.
1495 pause ;# -> better p4 performance
1496 jnz .nb430nf_spinlock ;# ZF still comes from cmpxchg (pause leaves flags alone)
1498 ;# if(nn1>nri) nn1=nri
1499 mov ecx, [rsp + nb430nf_nri]
1500 mov edx, ecx
1501 sub ecx, ebx
1502 cmovle ebx, edx ;# if(nn1>nri) nn1=nri
1503 ;# Cleared the spinlock if we got here.
1504 ;# eax contains nn0, ebx contains nn1.
1505 mov [rsp + nb430nf_n], eax
1506 mov [rsp + nb430nf_nn1], ebx
1507 sub ebx, eax ;# calc number of outer lists
1508 mov esi, eax ;# copy n to esi
1509 jg .nb430nf_outerstart ;# flags are from the sub above (mov does not change them)
1510 jmp .nb430nf_end
;# Outer-loop setup for list entry n (index in esi on entry).
;# Loads the shift vector and the i-particle data (charge*facel, invsqrta, type, position),
;# splats them into the 16-byte local slots, clears the two energy accumulators and
;# prepares innerjjnr / innerk for the inner loops.
1512 .nb430nf_outerstart:
1513 ;# ebx contains number of outer iterations
1514 add ebx, [rsp + nb430nf_nouter]
1515 mov [rsp + nb430nf_nouter], ebx
1517 .nb430nf_outer:
1518 mov rax, [rsp + nb430nf_shift] ;# rax = pointer into shift[]
1519 mov ebx, [rax + rsi*4] ;# ebx=shift[n]
1521 lea rbx, [rbx + rbx*2] ;# rbx=3*is
1522 mov [rsp + nb430nf_is3],ebx ;# store is3
1524 mov rax, [rsp + nb430nf_shiftvec] ;# rax = base of shiftvec[]
1526 movss xmm0, [rax + rbx*4] ;# xmm0-xmm2 = the three floats at shiftvec[is3..is3+2]
1527 movss xmm1, [rax + rbx*4 + 4]
1528 movss xmm2, [rax + rbx*4 + 8]
1530 mov rcx, [rsp + nb430nf_iinr] ;# rcx = pointer into iinr[]
1531 mov ebx, [rcx + rsi*4] ;# ebx =ii
1533 mov rdx, [rbp + nb430nf_charge]
1534 movss xmm3, [rdx + rbx*4]
1535 mulss xmm3, [rsp + nb430nf_facel] ;# iq = facel*charge[ii]
1536 shufps xmm3, xmm3, 0
1538 mov rdx, [rbp + nb430nf_invsqrta] ;# load invsqrta[ii]
1539 movss xmm4, [rdx + rbx*4]
1540 shufps xmm4, xmm4, 0
1542 mov rdx, [rbp + nb430nf_type]
1543 mov edx, [rdx + rbx*4]
1544 imul edx, [rsp + nb430nf_ntype]
1545 shl edx, 1 ;# ntia = 2*ntype*type[ii]
1546 mov [rsp + nb430nf_ntia], edx
1548 lea rbx, [rbx + rbx*2] ;# rbx = 3*ii=ii3
1549 mov rax, [rbp + nb430nf_pos] ;# rax = base of pos[]
1551 addss xmm0, [rax + rbx*4] ;# shift vector + i position, one component per register
1552 addss xmm1, [rax + rbx*4 + 4]
1553 addss xmm2, [rax + rbx*4 + 8]
1555 movaps [rsp + nb430nf_iq], xmm3
1556 movaps [rsp + nb430nf_isai], xmm4
1558 shufps xmm0, xmm0, 0
1559 shufps xmm1, xmm1, 0
1560 shufps xmm2, xmm2, 0
1562 movaps [rsp + nb430nf_ix], xmm0
1563 movaps [rsp + nb430nf_iy], xmm1
1564 movaps [rsp + nb430nf_iz], xmm2
1566 mov [rsp + nb430nf_ii3], ebx
1568 ;# clear vctot and Vvdwtot
1569 xorps xmm4, xmm4
1570 movaps [rsp + nb430nf_vctot], xmm4
1571 movaps [rsp + nb430nf_Vvdwtot], xmm4
1573 mov rax, [rsp + nb430nf_jindex]
1574 mov ecx, [rax + rsi*4] ;# jindex[n]
1575 mov edx, [rax + rsi*4 + 4] ;# jindex[n+1]
1576 sub edx, ecx ;# number of innerloop atoms
1578 mov rsi, [rbp + nb430nf_pos]
1579 mov rdi, [rbp + nb430nf_faction]
1580 mov rax, [rsp + nb430nf_jjnr]
1581 shl ecx, 2 ;# byte offset of jjnr[nj0] (4-byte entries)
1582 add rax, rcx
1583 mov [rsp + nb430nf_innerjjnr], rax ;# pointer to jjnr[nj0]
1584 mov ecx, edx
1585 sub edx, 4
1586 add ecx, [rsp + nb430nf_ninner]
1587 mov [rsp + nb430nf_ninner], ecx
1588 add edx, 0 ;# regenerate flags for edx; the add to ecx above overwrote them
1589 mov [rsp + nb430nf_innerk], edx ;# number of innerloop atoms minus 4
1590 jge .nb430nf_unroll_loop
1591 jmp .nb430nf_finish_inner
;# Inner loop, four j particles per iteration (indices read from innerjjnr).
;# For each quad: builds isaprod/gbscale and qq, gathers c6/c12, computes r from the
;# coordinates, then does three table lookups (GBtab for Coulomb; VFtab dispersion and
;# repulsion) and accumulates the energies into vctot and Vvdwtot. No forces are written.
1592 .nb430nf_unroll_loop:
1593 ;# quad-unroll innerloop here
1594 mov rdx, [rsp + nb430nf_innerjjnr] ;# pointer to jjnr[k]
1595 mov eax, [rdx]
1596 mov ebx, [rdx + 4]
1597 mov ecx, [rdx + 8]
1598 mov edx, [rdx + 12] ;# eax-edx=jnr1-4
1599 add qword ptr [rsp + nb430nf_innerjjnr], 16 ;# advance pointer (unrolled 4)
1601 ;# load isa2
1602 mov rsi, [rbp + nb430nf_invsqrta]
1603 movss xmm3, [rsi + rax*4]
1604 movss xmm4, [rsi + rcx*4]
1605 movss xmm6, [rsi + rbx*4]
1606 movss xmm7, [rsi + rdx*4]
1607 movaps xmm2, [rsp + nb430nf_isai]
1608 shufps xmm3, xmm6, 0
1609 shufps xmm4, xmm7, 0
1610 shufps xmm3, xmm4, 136 ;# 10001000 ;# all four invsqrta[j] values in xmm3
1611 mulps xmm2, xmm3 ;# isaprod = isai*isaj
1613 movaps [rsp + nb430nf_isaprod], xmm2
1614 movaps xmm1, xmm2
1615 mulps xmm1, [rsp + nb430nf_gbtsc] ;# gbscale = isaprod*gbtabscale
1616 movaps [rsp + nb430nf_gbscale], xmm1
1618 mov rsi, [rbp + nb430nf_charge] ;# base of charge[]
1620 movss xmm3, [rsi + rax*4]
1621 movss xmm4, [rsi + rcx*4]
1622 movss xmm6, [rsi + rbx*4]
1623 movss xmm7, [rsi + rdx*4]
1625 mulps xmm2, [rsp + nb430nf_iq] ;# isaprod*iq
1626 shufps xmm3, xmm6, 0
1627 shufps xmm4, xmm7, 0
1628 shufps xmm3, xmm4, 136 ;# 10001000 ;# all charges in xmm3
1629 mulps xmm3, xmm2 ;# qq = charge[j]*iq*isaprod
1630 movaps [rsp + nb430nf_qq], xmm3
1632 movd mm0, eax ;# use mmx registers as temp storage
1633 movd mm1, ebx
1634 movd mm2, ecx
1635 movd mm3, edx
1637 mov rsi, [rbp + nb430nf_type]
1638 mov eax, [rsi + rax*4]
1639 mov ebx, [rsi + rbx*4]
1640 mov ecx, [rsi + rcx*4]
1641 mov edx, [rsi + rdx*4]
1642 mov rsi, [rbp + nb430nf_vdwparam]
1643 shl eax, 1
1644 shl ebx, 1
1645 shl ecx, 1
1646 shl edx, 1
1647 mov edi, [rsp + nb430nf_ntia]
1648 add eax, edi ;# index into vdwparam = ntia + 2*type[j]
1649 add ebx, edi
1650 add ecx, edi
1651 add edx, edi
1653 movlps xmm6, [rsi + rax*4] ;# each 8-byte load fetches one adjacent (c6,c12) pair
1654 movlps xmm7, [rsi + rcx*4]
1655 movhps xmm6, [rsi + rbx*4]
1656 movhps xmm7, [rsi + rdx*4]
1658 movaps xmm4, xmm6
1659 shufps xmm4, xmm7, 136 ;# 10001000 ;# even elements -> c6 for the four j
1660 shufps xmm6, xmm7, 221 ;# 11011101 ;# odd elements -> c12 for the four j
1662 movd eax, mm0 ;# restore the four j indices
1663 movd ebx, mm1
1664 movd ecx, mm2
1665 movd edx, mm3
1667 movaps [rsp + nb430nf_c6], xmm4
1668 movaps [rsp + nb430nf_c12], xmm6
1670 mov rsi, [rbp + nb430nf_pos] ;# base of pos[]
1672 lea rax, [rax + rax*2] ;# replace jnr with j3
1673 lea rbx, [rbx + rbx*2]
1675 lea rcx, [rcx + rcx*2] ;# replace jnr with j3
1676 lea rdx, [rdx + rdx*2]
1678 ;# move four coordinates to xmm0-xmm2
1680 movlps xmm4, [rsi + rax*4]
1681 movlps xmm5, [rsi + rcx*4]
1682 movss xmm2, [rsi + rax*4 + 8]
1683 movss xmm6, [rsi + rcx*4 + 8]
1685 movhps xmm4, [rsi + rbx*4]
1686 movhps xmm5, [rsi + rdx*4]
1688 movss xmm0, [rsi + rbx*4 + 8]
1689 movss xmm1, [rsi + rdx*4 + 8]
1691 shufps xmm2, xmm0, 0
1692 shufps xmm6, xmm1, 0
1694 movaps xmm0, xmm4
1695 movaps xmm1, xmm4
1697 shufps xmm2, xmm6, 136 ;# 10001000
1699 shufps xmm0, xmm5, 136 ;# 10001000
1700 shufps xmm1, xmm5, 221 ;# 11011101
1702 ;# move ix-iz to xmm4-xmm6
1703 movaps xmm4, [rsp + nb430nf_ix]
1704 movaps xmm5, [rsp + nb430nf_iy]
1705 movaps xmm6, [rsp + nb430nf_iz]
1707 ;# calc dr
1708 subps xmm4, xmm0
1709 subps xmm5, xmm1
1710 subps xmm6, xmm2
1712 ;# square it
1713 mulps xmm4,xmm4
1714 mulps xmm5,xmm5
1715 mulps xmm6,xmm6
1716 addps xmm4, xmm5
1717 addps xmm4, xmm6
1718 ;# rsq in xmm4
1720 rsqrtps xmm5, xmm4
1721 ;# lookup seed in xmm5
;# one refinement step: rinv = 0.5*lu*(3.0 - rsq*lu*lu)
1722 movaps xmm2, xmm5
1723 mulps xmm5, xmm5
1724 movaps xmm1, [rsp + nb430nf_three]
1725 mulps xmm5, xmm4 ;# rsq*lu*lu
1726 movaps xmm0, [rsp + nb430nf_half]
1727 subps xmm1, xmm5 ;# 3.0-rsq*lu*lu
1728 mulps xmm1, xmm2
1729 mulps xmm0, xmm1 ;# xmm0=rinv
1730 mulps xmm4, xmm0 ;# xmm4=r
1731 movaps [rsp + nb430nf_r], xmm4
1732 mulps xmm4, [rsp + nb430nf_gbscale] ;# r*gbscale = table coordinate for GBtab
1734 movhlps xmm5, xmm4
1735 cvttps2pi mm6, xmm4
1736 cvttps2pi mm7, xmm5 ;# mm6/mm7 contain lu indices
1737 cvtpi2ps xmm6, mm6
1738 cvtpi2ps xmm5, mm7
1739 movlhps xmm6, xmm5
1740 subps xmm4, xmm6 ;# fractional part = coordinate - truncated index
1741 movaps xmm1, xmm4 ;# xmm1=eps
1742 movaps xmm2, xmm1
1743 mulps xmm2, xmm2 ;# xmm2=eps2
1744 pslld mm6, 2 ;# index*4: four floats per GBtab point
1745 pslld mm7, 2
1747 movd mm0, eax ;# NOTE(review): these saved j3 values are not read back anywhere in this loop
1748 movd mm1, ebx
1749 movd mm2, ecx
1750 movd mm3, edx
1752 mov rsi, [rbp + nb430nf_GBtab]
1753 movd eax, mm6
1754 psrlq mm6, 32
1755 movd ecx, mm7
1756 psrlq mm7, 32
1757 movd ebx, mm6
1758 movd edx, mm7
1760 ;# load coulomb table
;# movaps requires each table point to be 16-byte aligned -- presumably guaranteed by the caller; verify
1761 movaps xmm4, [rsi + rax*4]
1762 movaps xmm5, [rsi + rbx*4]
1763 movaps xmm6, [rsi + rcx*4]
1764 movaps xmm7, [rsi + rdx*4]
1765 ;# transpose, using xmm3 for scratch
1766 movaps xmm3, xmm6
1767 shufps xmm3, xmm7, 0xEE
1768 shufps xmm6, xmm7, 0x44
1769 movaps xmm7, xmm4
1770 shufps xmm7, xmm5, 0xEE
1771 shufps xmm4, xmm5, 0x44
1772 movaps xmm5, xmm4
1773 shufps xmm5, xmm6, 0xDD
1774 shufps xmm4, xmm6, 0x88
1775 movaps xmm6, xmm7
1776 shufps xmm6, xmm3, 0x88
1777 shufps xmm7, xmm3, 0xDD
1778 ;# coulomb table ready, in xmm4-xmm7
1780 mulps xmm6, xmm1 ;# xmm6=Geps
1781 mulps xmm7, xmm2 ;# xmm7=Heps2
1782 addps xmm5, xmm6
1783 addps xmm5, xmm7 ;# xmm5=Fp
1784 movaps xmm3, [rsp + nb430nf_qq]
1785 mulps xmm5, xmm1 ;# xmm5=eps*Fp
1786 addps xmm5, xmm4 ;# xmm5=VV
1787 mulps xmm5, xmm3 ;# vcoul=qq*VV
1788 addps xmm5, [rsp + nb430nf_vctot]
1789 movaps [rsp + nb430nf_vctot], xmm5
1792 movaps xmm4, [rsp + nb430nf_r]
1793 mulps xmm4, [rsp + nb430nf_tsc] ;# r*tabscale = table coordinate for VFtab
1795 movhlps xmm5, xmm4
1796 cvttps2pi mm6, xmm4
1797 cvttps2pi mm7, xmm5 ;# mm6/mm7 contain lu indices
1798 cvtpi2ps xmm6, mm6
1799 cvtpi2ps xmm5, mm7
1800 movlhps xmm6, xmm5
1801 subps xmm4, xmm6
1802 movaps xmm1, xmm4 ;# xmm1=eps
1803 movaps xmm2, xmm1
1804 mulps xmm2, xmm2 ;# xmm2=eps2
1805 pslld mm6, 3 ;# index*8: eight floats per VFtab point (dispersion at +0, repulsion at +16 bytes)
1806 pslld mm7, 3
1808 mov rsi, [rbp + nb430nf_VFtab]
1809 movd eax, mm6
1810 psrlq mm6, 32
1811 movd ecx, mm7
1812 psrlq mm7, 32
1813 movd ebx, mm6
1814 movd edx, mm7
1816 ;# dispersion
1817 movaps xmm4, [rsi + rax*4]
1818 movaps xmm5, [rsi + rbx*4]
1819 movaps xmm6, [rsi + rcx*4]
1820 movaps xmm7, [rsi + rdx*4]
1821 ;# transpose, using xmm3 for scratch
1822 movaps xmm3, xmm6
1823 shufps xmm3, xmm7, 0xEE
1824 shufps xmm6, xmm7, 0x44
1825 movaps xmm7, xmm4
1826 shufps xmm7, xmm5, 0xEE
1827 shufps xmm4, xmm5, 0x44
1828 movaps xmm5, xmm4
1829 shufps xmm5, xmm6, 0xDD
1830 shufps xmm4, xmm6, 0x88
1831 movaps xmm6, xmm7
1832 shufps xmm6, xmm3, 0x88
1833 shufps xmm7, xmm3, 0xDD
1834 ;# dispersion table ready, in xmm4-xmm7
1835 mulps xmm6, xmm1 ;# xmm6=Geps
1836 mulps xmm7, xmm2 ;# xmm7=Heps2
1837 addps xmm5, xmm6
1838 addps xmm5, xmm7 ;# xmm5=Fp
1839 mulps xmm5, xmm1 ;# xmm5=eps*Fp
1840 addps xmm5, xmm4 ;# xmm5=VV
1841 mulps xmm5, [rsp + nb430nf_c6] ;# Vvdw6
1842 addps xmm5, [rsp + nb430nf_Vvdwtot]
1843 movaps [rsp + nb430nf_Vvdwtot], xmm5
1845 ;# repulsion
1846 movaps xmm4, [rsi + rax*4 + 16]
1847 movaps xmm5, [rsi + rbx*4 + 16]
1848 movaps xmm6, [rsi + rcx*4 + 16]
1849 movaps xmm7, [rsi + rdx*4 + 16]
1850 ;# transpose, using xmm3 for scratch
1851 movaps xmm3, xmm6
1852 shufps xmm3, xmm7, 0xEE
1853 shufps xmm6, xmm7, 0x44
1854 movaps xmm7, xmm4
1855 shufps xmm7, xmm5, 0xEE
1856 shufps xmm4, xmm5, 0x44
1857 movaps xmm5, xmm4
1858 shufps xmm5, xmm6, 0xDD
1859 shufps xmm4, xmm6, 0x88
1860 movaps xmm6, xmm7
1861 shufps xmm6, xmm3, 0x88
1862 shufps xmm7, xmm3, 0xDD
1863 ;# table ready, in xmm4-xmm7
1864 mulps xmm6, xmm1 ;# xmm6=Geps
1865 mulps xmm7, xmm2 ;# xmm7=Heps2
1866 addps xmm5, xmm6
1867 addps xmm5, xmm7 ;# xmm5=Fp
1868 mulps xmm5, xmm1 ;# xmm5=eps*Fp
1869 addps xmm5, xmm4 ;# xmm5=VV
1871 mulps xmm5, [rsp + nb430nf_c12] ;# Vvdw12
1872 addps xmm5, [rsp + nb430nf_Vvdwtot]
1873 movaps [rsp + nb430nf_Vvdwtot], xmm5
1875 ;# should we do one more iteration?
1876 sub dword ptr [rsp + nb430nf_innerk], 4
1877 jl .nb430nf_finish_inner
1878 jmp .nb430nf_unroll_loop
;# Remainder dispatch after the quad-unrolled loop: innerk is 4 too low on entry,
;# so add 4 back to get the 0..3 leftover j particles, then test bit 1 for a pair.
1879 .nb430nf_finish_inner:
1880 ;# check if at least two particles remain
1881 add dword ptr [rsp + nb430nf_innerk], 4
1882 mov edx, [rsp + nb430nf_innerk]
1883 and edx, 2
1884 jnz .nb430nf_dopair
1885 jmp .nb430nf_checksingle
;# Remainder path for two j particles. Same arithmetic as the quad loop, but only lanes
;# 0-1 carry real data. The packed results are added to vctot/Vvdwtot in all four lanes and
;# those accumulators are later summed over all four lanes, so the per-pair factors
;# (qq, c6, c12) must be exactly zero in lanes 2-3.
;# qq gets zero upper lanes from the movss loads. c6/c12 get them from xmm7 via movlhps,
;# so xmm7 is zeroed first thing here; previously it still held stale data at that point
;# and was only cleared further down.
1886 .nb430nf_dopair:
1888 mov rcx, [rsp + nb430nf_innerjjnr]
1890 mov eax, [rcx]
1891 mov ebx, [rcx + 4]
1892 add qword ptr [rsp + nb430nf_innerjjnr], 8
1894 xorps xmm7, xmm7 ;# xmm7=0: source of the zero upper halves of c6/c12 below
1895 movaps xmm6, xmm7 ;# (xmm2 and xmm6 are reloaded with movss from memory, which clears their upper lanes)
1897 ;# load isa2
1898 mov rsi, [rbp + nb430nf_invsqrta]
1899 movss xmm2, [rsi + rax*4]
1900 movss xmm3, [rsi + rbx*4]
1901 unpcklps xmm2, xmm3 ;# isa2 in xmm2(0,1), lanes 2-3 zero
1902 mulps xmm2, [rsp + nb430nf_isai]
1903 movaps [rsp + nb430nf_isaprod], xmm2
1904 movaps xmm1, xmm2
1905 mulps xmm1, [rsp + nb430nf_gbtsc]
1906 movaps [rsp + nb430nf_gbscale], xmm1
1908 mov rsi, [rbp + nb430nf_charge] ;# base of charge[]
1909 movss xmm3, [rsi + rax*4]
1910 movss xmm6, [rsi + rbx*4]
1911 unpcklps xmm3, xmm6 ;# xmm3(0,1) has the charges, lanes 2-3 zero
1913 mulps xmm2, [rsp + nb430nf_iq]
1914 mulps xmm3, xmm2
1915 movaps [rsp + nb430nf_qq], xmm3
1917 mov rsi, [rbp + nb430nf_type]
1918 mov ecx, eax
1919 mov edx, ebx
1920 mov ecx, [rsi + rcx*4]
1921 mov edx, [rsi + rdx*4]
1922 mov rsi, [rbp + nb430nf_vdwparam]
1923 shl ecx, 1
1924 shl edx, 1
1925 mov edi, [rsp + nb430nf_ntia]
1926 add ecx, edi
1927 add edx, edi
1928 movlps xmm6, [rsi + rcx*4]
1929 movhps xmm6, [rsi + rdx*4]
1930 mov rdi, [rbp + nb430nf_pos]
1932 movaps xmm4, xmm6
1933 shufps xmm4, xmm4, 8 ;# 00001000
1934 shufps xmm6, xmm6, 13 ;# 00001101
1935 movlhps xmm4, xmm7 ;# c6 lanes 2-3 = 0 (xmm7 is still zero here)
1936 movlhps xmm6, xmm7 ;# c12 lanes 2-3 = 0
1938 movaps [rsp + nb430nf_c6], xmm4
1939 movaps [rsp + nb430nf_c12], xmm6
1941 lea rax, [rax + rax*2]
1942 lea rbx, [rbx + rbx*2]
1943 ;# move coordinates to xmm0-xmm2
1944 movlps xmm1, [rdi + rax*4]
1945 movss xmm2, [rdi + rax*4 + 8]
1946 movhps xmm1, [rdi + rbx*4]
1947 movss xmm0, [rdi + rbx*4 + 8]
1949 movlhps xmm3, xmm7
1951 shufps xmm2, xmm0, 0
1953 movaps xmm0, xmm1
1955 shufps xmm2, xmm2, 136 ;# 10001000
1957 shufps xmm0, xmm0, 136 ;# 10001000
1958 shufps xmm1, xmm1, 221 ;# 11011101
1960 mov rdi, [rbp + nb430nf_faction]
1961 ;# move ix-iz to xmm4-xmm6
1962 xorps xmm7, xmm7
1964 movaps xmm4, [rsp + nb430nf_ix]
1965 movaps xmm5, [rsp + nb430nf_iy]
1966 movaps xmm6, [rsp + nb430nf_iz]
1968 ;# calc dr
1969 subps xmm4, xmm0
1970 subps xmm5, xmm1
1971 subps xmm6, xmm2
1973 ;# square it
1974 mulps xmm4,xmm4
1975 mulps xmm5,xmm5
1976 mulps xmm6,xmm6
1977 addps xmm4, xmm5
1978 addps xmm4, xmm6
1979 ;# rsq in xmm4
1981 rsqrtps xmm5, xmm4
1982 ;# lookup seed in xmm5
1983 movaps xmm2, xmm5
1984 mulps xmm5, xmm5
1985 movaps xmm1, [rsp + nb430nf_three]
1986 mulps xmm5, xmm4 ;# rsq*lu*lu
1987 movaps xmm0, [rsp + nb430nf_half]
1988 subps xmm1, xmm5 ;# 3.0-rsq*lu*lu
1989 mulps xmm1, xmm2
1990 mulps xmm0, xmm1 ;# xmm0=rinv
1991 mulps xmm4, xmm0 ;# xmm4=r
1992 movaps [rsp + nb430nf_r], xmm4
1993 mulps xmm4, [rsp + nb430nf_gbscale]
1995 cvttps2pi mm6, xmm4 ;# mm6 contain lu indices
1996 cvtpi2ps xmm6, mm6
1997 subps xmm4, xmm6
1998 movaps xmm1, xmm4 ;# xmm1=eps
1999 movaps xmm2, xmm1
2000 mulps xmm2, xmm2 ;# xmm2=eps2
2002 pslld mm6, 2
2004 mov rsi, [rbp + nb430nf_GBtab]
2005 movd ecx, mm6
2006 psrlq mm6, 32
2007 movd edx, mm6
2009 ;# load coulomb table
2010 movaps xmm4, [rsi + rcx*4]
2011 movaps xmm7, [rsi + rdx*4]
2012 ;# transpose, using xmm3 for scratch
2013 movaps xmm6, xmm4
2014 unpcklps xmm4, xmm7 ;# Y1 Y2 F1 F2
2015 unpckhps xmm6, xmm7 ;# G1 G2 H1 H2
2016 movhlps xmm5, xmm4 ;# F1 F2
2017 movhlps xmm7, xmm6 ;# H1 H2
2018 ;# coulomb table ready, in xmm4-xmm7
2020 mulps xmm6, xmm1 ;# xmm6=Geps
2021 mulps xmm7, xmm2 ;# xmm7=Heps2
2022 addps xmm5, xmm6
2023 addps xmm5, xmm7 ;# xmm5=Fp
2024 movaps xmm3, [rsp + nb430nf_qq]
2025 mulps xmm5, xmm1 ;# xmm5=eps*Fp
2026 addps xmm5, xmm4 ;# xmm5=VV
2027 mulps xmm5, xmm3 ;# vcoul=qq*VV
2028 addps xmm5, [rsp + nb430nf_vctot]
2029 movaps [rsp + nb430nf_vctot], xmm5
2031 movaps xmm4, [rsp + nb430nf_r]
2032 mulps xmm4, [rsp + nb430nf_tsc]
2034 cvttps2pi mm6, xmm4
2035 cvtpi2ps xmm6, mm6
2036 subps xmm4, xmm6
2037 movaps xmm1, xmm4 ;# xmm1=eps
2038 movaps xmm2, xmm1
2039 mulps xmm2, xmm2 ;# xmm2=eps2
2040 pslld mm6, 3
2042 mov rsi, [rbp + nb430nf_VFtab]
2043 movd ecx, mm6
2044 psrlq mm6, 32
2045 movd edx, mm6
2047 ;# dispersion
2048 movaps xmm4, [rsi + rcx*4]
2049 movaps xmm7, [rsi + rdx*4]
2050 ;# transpose, using xmm3 for scratch
2051 movaps xmm6, xmm4
2052 unpcklps xmm4, xmm7 ;# Y1 Y2 F1 F2
2053 unpckhps xmm6, xmm7 ;# G1 G2 H1 H2
2054 movhlps xmm5, xmm4 ;# F1 F2
2055 movhlps xmm7, xmm6 ;# H1 H2
2056 ;# dispersion table ready, in xmm4-xmm7
2057 mulps xmm6, xmm1 ;# xmm6=Geps
2058 mulps xmm7, xmm2 ;# xmm7=Heps2
2059 addps xmm5, xmm6
2060 addps xmm5, xmm7 ;# xmm5=Fp
2061 mulps xmm5, xmm1 ;# xmm5=eps*Fp
2062 addps xmm5, xmm4 ;# xmm5=VV
2064 mulps xmm5, [rsp + nb430nf_c6] ;# Vvdw6
2065 addps xmm5, [rsp + nb430nf_Vvdwtot]
2066 movaps [rsp + nb430nf_Vvdwtot], xmm5
2068 ;# repulsion
2069 movaps xmm4, [rsi + rcx*4 + 16]
2070 movaps xmm7, [rsi + rdx*4 + 16]
2071 ;# transpose, using xmm3 for scratch
2072 movaps xmm6, xmm4
2073 unpcklps xmm4, xmm7 ;# Y1 Y2 F1 F2
2074 unpckhps xmm6, xmm7 ;# G1 G2 H1 H2
2075 movhlps xmm5, xmm4 ;# F1 F2
2076 movhlps xmm7, xmm6 ;# H1 H2
2077 ;# table ready, in xmm4-xmm7
2078 mulps xmm6, xmm1 ;# xmm6=Geps
2079 mulps xmm7, xmm2 ;# xmm7=Heps2
2080 addps xmm5, xmm6
2081 addps xmm5, xmm7 ;# xmm5=Fp
2082 mulps xmm5, xmm1 ;# xmm5=eps*Fp
2083 addps xmm5, xmm4 ;# xmm5=VV
2085 mulps xmm5, [rsp + nb430nf_c12] ;# Vvdw12
2087 addps xmm5, [rsp + nb430nf_Vvdwtot]
2088 movaps [rsp + nb430nf_Vvdwtot], xmm5
;# Remainder dispatch: bit 0 of innerk set means one last j particle is left to process.
2089 .nb430nf_checksingle:
2090 mov edx, [rsp + nb430nf_innerk]
2091 and edx, 1
2092 jnz .nb430nf_dosingle
2093 jmp .nb430nf_updateouterdata
;# Remainder path for a single j particle. Uses scalar (ss) arithmetic and scalar stores,
;# so only lane 0 of qq/c6/c12/gbscale/isaprod and of the vctot/Vvdwtot accumulators is
;# modified; lanes 1-3 of those slots keep whatever they held before.
2094 .nb430nf_dosingle:
2095 mov rsi, [rbp + nb430nf_charge]
2096 mov rdx, [rbp + nb430nf_invsqrta]
2097 mov rdi, [rbp + nb430nf_pos]
2098 mov rcx, [rsp + nb430nf_innerjjnr]
2099 mov eax, [rcx] ;# eax = index of the last j particle
2100 xorps xmm2, xmm2
2101 movaps xmm6, xmm2
2102 movss xmm2, [rdx + rax*4] ;# isa2
2103 mulss xmm2, [rsp + nb430nf_isai]
2104 movss [rsp + nb430nf_isaprod], xmm2
2105 movss xmm1, xmm2
2106 mulss xmm1, [rsp + nb430nf_gbtsc]
2107 movss [rsp + nb430nf_gbscale], xmm1
2109 mulss xmm2, [rsp + nb430nf_iq]
2110 movss xmm6, [rsi + rax*4] ;# xmm6(0) has the charge
2111 mulss xmm6, xmm2
2112 movss [rsp + nb430nf_qq], xmm6
2114 mov rsi, [rbp + nb430nf_type]
2115 mov ecx, eax
2116 mov ecx, [rsi + rcx*4]
2117 mov rsi, [rbp + nb430nf_vdwparam]
2118 shl ecx, 1
2119 add ecx, [rsp + nb430nf_ntia]
2120 movlps xmm6, [rsi + rcx*4] ;# the adjacent (c6,c12) pair in lanes 0-1
2121 movaps xmm4, xmm6
2122 shufps xmm4, xmm4, 252 ;# 11111100 ;# lane 0 = c6
2123 shufps xmm6, xmm6, 253 ;# 11111101 ;# lane 0 = c12
2125 movss [rsp + nb430nf_c6], xmm4
2126 movss [rsp + nb430nf_c12], xmm6
2128 lea rax, [rax + rax*2]
2130 ;# move coordinates to xmm0-xmm2
2131 movss xmm0, [rdi + rax*4]
2132 movss xmm1, [rdi + rax*4 + 4]
2133 movss xmm2, [rdi + rax*4 + 8]
2135 movss xmm4, [rsp + nb430nf_ix]
2136 movss xmm5, [rsp + nb430nf_iy]
2137 movss xmm6, [rsp + nb430nf_iz]
2139 ;# calc dr
2140 subss xmm4, xmm0
2141 subss xmm5, xmm1
2142 subss xmm6, xmm2
2144 ;# square it
2145 mulss xmm4,xmm4
2146 mulss xmm5,xmm5
2147 mulss xmm6,xmm6
2148 addss xmm4, xmm5
2149 addss xmm4, xmm6
2150 ;# rsq in xmm4
2152 rsqrtss xmm5, xmm4
2153 ;# lookup seed in xmm5
2154 movaps xmm2, xmm5
2155 mulss xmm5, xmm5
2156 movss xmm1, [rsp + nb430nf_three]
2157 mulss xmm5, xmm4 ;# rsq*lu*lu
2158 movss xmm0, [rsp + nb430nf_half]
2159 subss xmm1, xmm5 ;# 3.0-rsq*lu*lu
2160 mulss xmm1, xmm2
2161 mulss xmm0, xmm1 ;# xmm0=rinv
2163 mulss xmm4, xmm0 ;# xmm4=r
2164 movaps [rsp + nb430nf_r], xmm4
2165 mulss xmm4, [rsp + nb430nf_gbscale]
2167 cvttss2si ebx, xmm4 ;# ebx contains the lu index
2168 cvtsi2ss xmm6, ebx
2169 subss xmm4, xmm6
2170 movaps xmm1, xmm4 ;# xmm1=eps
2171 movaps xmm2, xmm1
2172 mulss xmm2, xmm2 ;# xmm2=eps2
2174 shl ebx, 2
2176 mov rsi, [rbp + nb430nf_GBtab]
2178 movaps xmm4, [rsi + rbx*4] ;# one table point: lanes 0-3 are used as Y F G H
2179 movhlps xmm6, xmm4
2180 movaps xmm5, xmm4
2181 movaps xmm7, xmm6
2182 shufps xmm5, xmm5, 1
2183 shufps xmm7, xmm7, 1
2184 ;# table ready in xmm4-xmm7
2186 mulss xmm6, xmm1 ;# xmm6=Geps
2187 mulss xmm7, xmm2 ;# xmm7=Heps2
2188 addss xmm5, xmm6
2189 addss xmm5, xmm7 ;# xmm5=Fp
2190 movss xmm3, [rsp + nb430nf_qq]
2191 mulss xmm5, xmm1 ;# xmm5=eps*Fp
2192 addss xmm5, xmm4 ;# xmm5=VV
2193 mulss xmm5, xmm3 ;# vcoul=qq*VV
2194 addss xmm5, [rsp + nb430nf_vctot]
2195 movss [rsp + nb430nf_vctot], xmm5
2197 movss xmm4, [rsp + nb430nf_r]
2198 mulps xmm4, [rsp + nb430nf_tsc] ;# NOTE(review): packed multiply in a scalar path; only lane 0 is used afterwards
2200 cvttss2si ebx, xmm4
2201 cvtsi2ss xmm6, ebx
2202 subss xmm4, xmm6
2203 movss xmm1, xmm4 ;# xmm1=eps
2204 movss xmm2, xmm1
2205 mulss xmm2, xmm2 ;# xmm2=eps2
2207 shl ebx, 3
2208 mov rsi, [rbp + nb430nf_VFtab]
2210 ;# dispersion
2211 movaps xmm4, [rsi + rbx*4]
2212 movhlps xmm6, xmm4
2213 movaps xmm5, xmm4
2214 movaps xmm7, xmm6
2215 shufps xmm5, xmm5, 1
2216 shufps xmm7, xmm7, 1
2217 ;# table ready in xmm4-xmm7
2219 mulss xmm6, xmm1 ;# xmm6=Geps
2220 mulss xmm7, xmm2 ;# xmm7=Heps2
2221 addss xmm5, xmm6
2222 addss xmm5, xmm7 ;# xmm5=Fp
2223 mulss xmm5, xmm1 ;# xmm5=eps*Fp
2224 addss xmm5, xmm4 ;# xmm5=VV
2225 mulss xmm5, [rsp + nb430nf_c6] ;# Vvdw6
2226 addss xmm5, [rsp + nb430nf_Vvdwtot]
2227 movss [rsp + nb430nf_Vvdwtot], xmm5
2229 ;# repulsion
2230 movaps xmm4, [rsi + rbx*4 + 16]
2231 movhlps xmm6, xmm4
2232 movaps xmm5, xmm4
2233 movaps xmm7, xmm6
2234 shufps xmm5, xmm5, 1
2235 shufps xmm7, xmm7, 1
2236 ;# table ready in xmm4-xmm7
2238 mulss xmm6, xmm1 ;# xmm6=Geps
2239 mulss xmm7, xmm2 ;# xmm7=Heps2
2240 addss xmm5, xmm6
2241 addss xmm5, xmm7 ;# xmm5=Fp
2242 mulss xmm5, xmm1 ;# xmm5=eps*Fp
2243 addss xmm5, xmm4 ;# xmm5=VV
2245 mulss xmm5, [rsp + nb430nf_c12] ;# Vvdw12
2247 addss xmm5, [rsp + nb430nf_Vvdwtot]
2248 movss [rsp + nb430nf_Vvdwtot], xmm5
;# End of one outer iteration: horizontally sum the four lanes of vctot and Vvdwtot,
;# add each sum to Vc[gid[n]] and Vvdw[gid[n]] in memory, then either advance n and
;# loop back to .nb430nf_outer or fall out to .nb430nf_outerend when n+1 == nn1.
2250 .nb430nf_updateouterdata:
2251 ;# get n from stack
2252 mov esi, [rsp + nb430nf_n]
2253 ;# get group index for i particle
2254 mov rdx, [rbp + nb430nf_gid] ;# base of gid[]
2255 mov edx, [rdx + rsi*4] ;# ggid=gid[n]
2257 ;# accumulate total potential energy and update it
2258 movaps xmm7, [rsp + nb430nf_vctot]
2259 ;# accumulate
2260 movhlps xmm6, xmm7
2261 addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now
2262 movaps xmm6, xmm7
2263 shufps xmm6, xmm6, 1
2264 addss xmm7, xmm6 ;# lane 0 = sum of all four lanes
2266 ;# add earlier value from mem
2267 mov rax, [rbp + nb430nf_Vc]
2268 addss xmm7, [rax + rdx*4]
2269 ;# move back to mem
2270 movss [rax + rdx*4], xmm7
2272 ;# accumulate total lj energy and update it
2273 movaps xmm7, [rsp + nb430nf_Vvdwtot]
2274 ;# accumulate
2275 movhlps xmm6, xmm7
2276 addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now
2277 movaps xmm6, xmm7
2278 shufps xmm6, xmm6, 1
2279 addss xmm7, xmm6 ;# lane 0 = sum of all four lanes
2281 ;# add earlier value from mem
2282 mov rax, [rbp + nb430nf_Vvdw]
2283 addss xmm7, [rax + rdx*4]
2284 ;# move back to mem
2285 movss [rax + rdx*4], xmm7
2287 ;# finish if last
2288 mov ecx, [rsp + nb430nf_nn1]
2289 ;# esi already loaded with n
2290 inc esi
2291 sub ecx, esi ;# nn1 - (n+1)
2292 jz .nb430nf_outerend
2294 ;# not last, iterate outer loop once more!
2295 mov [rsp + nb430nf_n], esi
2296 jmp .nb430nf_outer
;# This work unit is finished (esi == nn1). If esi has reached nri there is nothing
;# left; otherwise go back and claim another work unit from the shared counter.
2297 .nb430nf_outerend:
2298 ;# check if more outer neighborlists remain
2299 mov ecx, [rsp + nb430nf_nri]
2300 ;# esi already loaded with n above
2301 sub ecx, esi
2302 jz .nb430nf_end
2303 ;# non-zero, do one more workunit
2304 jmp .nb430nf_threadloop
;# Epilogue: report the iteration counters through the outeriter/inneriter pointers,
;# release the 392-byte local area, clear MMX state and restore the saved registers
;# in the reverse order of the prologue pushes.
;# NOTE(review): no return instruction is visible after the final pop in this excerpt -- confirm it follows.
2305 .nb430nf_end:
2307 mov eax, [rsp + nb430nf_nouter]
2308 mov ebx, [rsp + nb430nf_ninner]
2309 mov rcx, [rbp + nb430nf_outeriter]
2310 mov rdx, [rbp + nb430nf_inneriter]
2311 mov [rcx], eax ;# *outeriter = nouter
2312 mov [rdx], ebx ;# *inneriter = ninner
2314 add rsp, 392
2315 emms ;# leave MMX state so x87 code can run after the MMX scratch use above
2318 pop r15
2319 pop r14
2320 pop r13
2321 pop r12
2323 pop rbx
2324 pop rbp