Updated intel syntax x86-64 asm files to also support MS win64 call convention (ifdef...
[gromacs/rigid-bodies.git] / src / gmxlib / nonbonded / nb_kernel_x86_64_sse / nb_kernel130_x86_64_sse_intel_syntax.s
blobc25aa752275ea74ec3a3c283bec779c9e7d0061e
1 ;#
2 ;#
3 ;# Gromacs 4.0 Copyright (c) 1991-2003
4 ;# David van der Spoel, Erik Lindahl
5 ;#
6 ;# This program is free software; you can redistribute it and/or
7 ;# modify it under the terms of the GNU General Public License
8 ;# as published by the Free Software Foundation; either version 2
9 ;# of the License, or (at your option) any later version.
11 ;# To help us fund GROMACS development, we humbly ask that you cite
12 ;# the research papers on the package. Check out http://www.gromacs.org
13 ;#
14 ;# And Hey:
15 ;# Gnomes, ROck Monsters And Chili Sauce
18 ;# These files require GNU binutils 2.10 or later, since we
19 ;# use intel syntax for portability, or a recent version
20 ;# of NASM that understands Extended 3DNow and SSE2 instructions.
21 ;# (NASM is normally only used with MS Visual C++).
22 ;# Since NASM and gnu as disagree on some definitions and use
23 ;# completely different preprocessing options I have to introduce a
24 ;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
25 ;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
26 ;# reason why all comments need both symbols...
27 ;# The source is written for GNU as, with intel syntax. When you use
28 ;# NASM we redefine a couple of things. The false if-statement around
29 ;# the following code is seen by GNU as, but NASM doesn't see it, so
30 ;# the code inside is read by NASM but not gcc.
;# Assembler-compatibility prelude.
;# GNU as reads the next line as '.if 0' and skips everything up to the
;# matching '.endif'; NASM treats both of those lines as comments and
;# therefore processes the %define/%macro lines in between, which map the
;# gas directives used in this file onto their NASM spellings.
32 ; .if 0 # block below only read by NASM
33 %define .section section
34 %define .long dd
35 %define .align align
36 %define .globl global
37 ;# NASM only wants 'dword', not 'dword ptr'.
38 %define ptr
;# '.equiv name, value' (gas) becomes 'name equ value' under NASM.
39 %macro .equiv 2
40 %1 equ %2
41 %endmacro
42 ; .endif # End of NASM-specific block
43 ; .intel_syntax noprefix # Line only read by gnu as
45 .section .text
;# =====================================================================
;# nb_kernel130_x86_64_sse
;#
;# Non-bonded kernel: plain Coulomb (vcoul = qq*rinv) plus Lennard-Jones
;# dispersion/repulsion interpolated from VFtab. Forces are accumulated
;# into faction[] and fshift[], energies into Vc[] and Vvdw[].
;#
;# Arguments 1-6 arrive in registers and are spilled to locals below:
;#   rdi = &nri, rsi = iinr, rdx = jindex, rcx = jjnr,
;#   r8 = shift, r9 = shiftvec
;# (NASM/win64 builds first shuffle rcx,rdx,r8,r9 + two stack slots
;# into that layout.) All further arguments are read from the caller's
;# stack through rbp with the nb130_* argument offsets defined here.
;# =====================================================================
49 .globl nb_kernel130_x86_64_sse
50 .globl _nb_kernel130_x86_64_sse
51 nb_kernel130_x86_64_sse:
52 _nb_kernel130_x86_64_sse:
;# Stack-passed arguments, addressed as [rbp + offset].
53 ;# Room for return address and rbp (16 bytes)
54 .equiv nb130_fshift, 16
55 .equiv nb130_gid, 24
56 .equiv nb130_pos, 32
57 .equiv nb130_faction, 40
58 .equiv nb130_charge, 48
59 .equiv nb130_p_facel, 56
60 .equiv nb130_argkrf, 64
61 .equiv nb130_argcrf, 72
62 .equiv nb130_Vc, 80
63 .equiv nb130_type, 88
64 .equiv nb130_p_ntype, 96
65 .equiv nb130_vdwparam, 104
66 .equiv nb130_Vvdw, 112
67 .equiv nb130_p_tabscale, 120
68 .equiv nb130_VFtab, 128
69 .equiv nb130_invsqrta, 136
70 .equiv nb130_dvda, 144
71 .equiv nb130_p_gbtabscale, 152
72 .equiv nb130_GBtab, 160
73 .equiv nb130_p_nthreads, 168
74 .equiv nb130_count, 176
75 .equiv nb130_mtx, 184
76 .equiv nb130_outeriter, 192
77 .equiv nb130_inneriter, 200
78 .equiv nb130_work, 208
79 ;# stack offsets for local variables
80 ;# bottom of stack is cache-aligned for sse use
;# Locals are addressed as [rsp + offset] after 'sub rsp, 432' below, so
;# every local must end at or before offset 432. 16-byte entries are
;# accessed with movaps and must stay on 16-byte offsets.
81 .equiv nb130_ix, 0
82 .equiv nb130_iy, 16
83 .equiv nb130_iz, 32
84 .equiv nb130_iq, 48
85 .equiv nb130_dx, 64
86 .equiv nb130_dy, 80
87 .equiv nb130_dz, 96
88 .equiv nb130_c6, 112
89 .equiv nb130_c12, 128
90 .equiv nb130_tsc, 144
91 .equiv nb130_qq, 160
92 .equiv nb130_vctot, 176
93 .equiv nb130_Vvdwtot, 192
94 .equiv nb130_fix, 208
95 .equiv nb130_fiy, 224
96 .equiv nb130_fiz, 240
97 .equiv nb130_half, 256
98 .equiv nb130_three, 272
99 .equiv nb130_two, 288
100 .equiv nb130_nri, 336
101 .equiv nb130_iinr, 344
102 .equiv nb130_jindex, 352
103 .equiv nb130_jjnr, 360
104 .equiv nb130_shift, 368
105 .equiv nb130_shiftvec, 376
106 .equiv nb130_facel, 384
107 .equiv nb130_innerjjnr, 392
108 .equiv nb130_is3, 400
109 .equiv nb130_ii3, 404
110 .equiv nb130_ntia, 408
111 .equiv nb130_innerk, 412
112 .equiv nb130_n, 416
113 .equiv nb130_nn1, 420
114 .equiv nb130_ntype, 424
115 .equiv nb130_nouter, 428
;# The local frame is 432 bytes and the saved xmm6 sits directly above it
;# at [rsp + 432]. ninner must therefore not live at offset 432: it would
;# overwrite the low dword of the xmm6 save slot, and xmm6 is callee-saved
;# under the win64 convention. It is placed in the otherwise unused gap
;# between nb130_two (288..303) and nb130_nri (336).
116 .equiv nb130_ninner, 304
118 push rbp
119 mov rbp, rsp
121 ;# Push integer registers on stack
122 push rbx
123 push rsi
124 push rdi
125 push r12
126 push r13
127 push r14
128 push r15
130 ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
;# plus 8 bytes of padding: after the 8 pushes above, and assuming the
;# standard ABI entry state (rsp = 8 mod 16), this leaves rsp 16-byte
;# aligned, which the movaps stores below require.
131 sub rsp, 168
133 ;# Save xmm registers to stack
134 movaps [rsp ], xmm6
135 movaps [rsp + 16 ], xmm7
136 movaps [rsp + 32 ], xmm8
137 movaps [rsp + 48 ], xmm9
138 movaps [rsp + 64 ], xmm10
139 movaps [rsp + 80 ], xmm11
140 movaps [rsp + 96 ], xmm12
141 movaps [rsp + 112], xmm13
142 movaps [rsp + 128], xmm14
143 movaps [rsp + 144], xmm15
145 emms
146 sub rsp, 432 ;# local variable stack space (27*16, keeps rsp 16-byte aligned)
147 ; .if 0 # block below only read by NASM - special calling convention on win64
148 %ifidn __OUTPUT_FORMAT__, win64
149 ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
150 add rbp, 48
151 ;# Adjust stack pointer for different alignment
152 ;# Move around arguments to fit AMD64 convention below
153 ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
154 ;# win64 passes args in: rcx,rdx,r8,r9 + stack
155 mov rdi, rcx
156 mov rsi, rdx
157 mov rdx, r8
158 mov rcx, r9
;# after the rbp adjustment, args 5 and 6 sit just below the nb130_* offsets
159 mov r8, [rbp]
160 mov r9, [rbp + 8]
161 %endif
162 ; .endif # end NASM- and win64-specific block
164 ;# zero 32-bit iteration counters
165 mov eax, 0
166 mov [rsp + nb130_nouter], eax
167 mov [rsp + nb130_ninner], eax
;# spill the six register arguments into the local frame
169 mov edi, [rdi]
170 mov [rsp + nb130_nri], edi
171 mov [rsp + nb130_iinr], rsi
172 mov [rsp + nb130_jindex], rdx
173 mov [rsp + nb130_jjnr], rcx
174 mov [rsp + nb130_shift], r8
175 mov [rsp + nb130_shiftvec], r9
;# dereference the by-pointer scalars: ntype, facel, tabscale
176 mov rdi, [rbp + nb130_p_ntype]
177 mov edi, [rdi]
178 mov [rsp + nb130_ntype], edi
179 mov rsi, [rbp + nb130_p_facel]
180 movss xmm0, [rsi]
181 movss [rsp + nb130_facel], xmm0
183 mov rax, [rbp + nb130_p_tabscale]
184 movss xmm3, [rax]
185 shufps xmm3, xmm3, 0
186 movaps [rsp + nb130_tsc], xmm3
188 ;# create constant floating-point factors on stack
189 mov eax, 0x3f000000 ;# half in IEEE (hex)
190 mov [rsp + nb130_half], eax
191 movss xmm1, [rsp + nb130_half]
192 shufps xmm1, xmm1, 0 ;# splat to all elements
193 movaps xmm2, xmm1
194 addps xmm2, xmm2 ;# one
195 movaps xmm3, xmm2
196 addps xmm2, xmm2 ;# two
197 addps xmm3, xmm2 ;# three
198 movaps [rsp + nb130_half], xmm1
199 movaps [rsp + nb130_two], xmm2
200 movaps [rsp + nb130_three], xmm3
;# Work distribution: atomically claim the next outer-list index from the
;# shared counter *count. On exit eax = nn0 (first index claimed) and
;# ebx = nn1 (one past the last), clamped to nri.
202 .nb130_threadloop:
203 mov rsi, [rbp + nb130_count] ;# pointer to sync counter
204 mov eax, [rsi]
205 .nb130_spinlock:
206 mov ebx, eax ;# ebx=*count=nn0
207 add ebx, 1 ;# ebx=nn1=nn0+1
208 lock
209 cmpxchg [rsi], ebx ;# write nn1 to *counter,
210 ;# if it hasnt changed.
211 ;# or reread *counter to eax.
212 pause ;# -> better p4 performance
;# retry while the cmpxchg above failed (ZF clear)
213 jnz .nb130_spinlock
215 ;# if(nn1>nri) nn1=nri
216 mov ecx, [rsp + nb130_nri]
217 mov edx, ecx
218 sub ecx, ebx
219 cmovle ebx, edx ;# if(nn1>nri) nn1=nri
220 ;# Cleared the spinlock if we got here.
221 ;# eax contains nn0, ebx contains nn1.
222 mov [rsp + nb130_n], eax
223 mov [rsp + nb130_nn1], ebx
224 sub ebx, eax ;# calc number of outer lists
225 mov esi, eax ;# copy n to esi
;# the flags tested here are still those of 'sub ebx, eax' (mov leaves them alone)
226 jg .nb130_outerstart
227 jmp .nb130_end
229 .nb130_outerstart:
230 ;# ebx contains number of outer iterations
231 add ebx, [rsp + nb130_nouter]
232 mov [rsp + nb130_nouter], ebx
;# Outer-loop body for list n (esi = n on entry): loads the shifted i-atom
;# coordinates, its scaled charge and LJ type offset, clears the per-i
;# accumulators and sets up the j-atom pointer and counter.
234 .nb130_outer:
235 mov rax, [rsp + nb130_shift] ;# eax = pointer into shift[]
236 mov ebx, [rax + rsi*4] ;# ebx=shift[n]
238 lea rbx, [rbx + rbx*2] ;# rbx=3*is
239 mov [rsp + nb130_is3],ebx ;# store is3
241 mov rax, [rsp + nb130_shiftvec] ;# eax = base of shiftvec[]
;# xmm0-2 = shift vector x,y,z
243 movss xmm0, [rax + rbx*4]
244 movss xmm1, [rax + rbx*4 + 4]
245 movss xmm2, [rax + rbx*4 + 8]
247 mov rcx, [rsp + nb130_iinr] ;# ecx = pointer into iinr[]
248 mov ebx, [rcx + rsi*4] ;# ebx =ii
;# iq = facel*charge[ii], broadcast to all four lanes
250 mov rdx, [rbp + nb130_charge]
251 movss xmm3, [rdx + rbx*4]
252 mulss xmm3, [rsp + nb130_facel]
253 shufps xmm3, xmm3, 0
;# ntia = 2*ntype*type[ii]
255 mov rdx, [rbp + nb130_type]
256 mov edx, [rdx + rbx*4]
257 imul edx, [rsp + nb130_ntype]
258 shl edx, 1
259 mov [rsp + nb130_ntia], edx
261 lea rbx, [rbx + rbx*2] ;# rbx = 3*ii=ii3
262 mov rax, [rbp + nb130_pos] ;# eax = base of pos[]
;# i position = shift vector + pos[ii3..ii3+2]
264 addss xmm0, [rax + rbx*4]
265 addss xmm1, [rax + rbx*4 + 4]
266 addss xmm2, [rax + rbx*4 + 8]
268 movaps [rsp + nb130_iq], xmm3
270 shufps xmm0, xmm0, 0
271 shufps xmm1, xmm1, 0
272 shufps xmm2, xmm2, 0
274 movaps [rsp + nb130_ix], xmm0
275 movaps [rsp + nb130_iy], xmm1
276 movaps [rsp + nb130_iz], xmm2
278 mov [rsp + nb130_ii3], ebx
280 ;# clear vctot and i forces
281 xorps xmm4, xmm4
282 movaps [rsp + nb130_vctot], xmm4
283 movaps [rsp + nb130_Vvdwtot], xmm4
284 movaps [rsp + nb130_fix], xmm4
285 movaps [rsp + nb130_fiy], xmm4
286 movaps [rsp + nb130_fiz], xmm4
288 mov rax, [rsp + nb130_jindex]
289 mov ecx, [rax + rsi*4] ;# jindex[n]
290 mov edx, [rax + rsi*4 + 4] ;# jindex[n+1]
291 sub edx, ecx ;# number of innerloop atoms
293 mov rax, [rsp + nb130_jjnr]
294 shl ecx, 2
295 add rax, rcx
296 mov [rsp + nb130_innerjjnr], rax ;# pointer to jjnr[nj0]
297 mov ecx, edx
298 sub edx, 4
;# running total of inner iterations
299 add ecx, [rsp + nb130_ninner]
300 mov [rsp + nb130_ninner], ecx
;# 'add edx, 0' re-derives the flags from edx (j atoms - 4) for the jge below
301 add edx, 0
302 mov [rsp + nb130_innerk], edx ;# innerk = number of innerloop atoms - 4
303 jge .nb130_unroll_loop
304 jmp .nb130_finish_inner
;# Inner loop, four j atoms (a,b,c,d) per pass, one per SSE lane.
;# Register roles: rax,rbx,rcx,rdx = jnr of a,b,c,d (later 3*jnr);
;# r12-r15 = vdwparam indices; r8-r11 = VFtab indices of a,b,c,d.
305 .nb130_unroll_loop:
306 ;# quad-unroll innerloop here
307 mov rdx, [rsp + nb130_innerjjnr] ;# pointer to jjnr[k]
308 mov eax, [rdx]
309 mov ebx, [rdx + 4]
310 mov ecx, [rdx + 8]
311 mov edx, [rdx + 12] ;# eax-edx=jnr1-4
312 add qword ptr [rsp + nb130_innerjjnr], 16 ;# advance pointer (unrolled 4)
314 mov rsi, [rbp + nb130_charge]
315 movss xmm0, [rsi + rax*4]
316 movss xmm1, [rsi + rcx*4]
317 movss xmm2, [rsi + rbx*4]
318 movss xmm3, [rsi + rdx*4]
320 unpcklps xmm0, xmm1 ;# jqa jqc - -
321 unpcklps xmm2, xmm3 ;# jqb jqd - -
322 unpcklps xmm0, xmm2 ;# jqa jqb jqc jqd
323 mulps xmm0, [rsp + nb130_iq]
324 movaps [rsp + nb130_qq], xmm0
326 ;# vdw parameters
;# parameter index = 2*type[j] + ntia; each entry is a (c6,c12) float pair
327 mov rsi, [rbp + nb130_type]
328 mov r12d, [rsi + rax*4]
329 mov r13d, [rsi + rbx*4]
330 mov r14d, [rsi + rcx*4]
331 mov r15d, [rsi + rdx*4]
332 shl r12d, 1
333 shl r13d, 1
334 shl r14d, 1
335 shl r15d, 1
336 mov edi, [rsp + nb130_ntia]
337 add r12d, edi
338 add r13d, edi
339 add r14d, edi
340 add r15d, edi
342 mov rsi, [rbp + nb130_vdwparam]
343 movlps xmm3, [rsi + r12*4]
344 movlps xmm7, [rsi + r14*4]
345 movhps xmm3, [rsi + r13*4]
346 movhps xmm7, [rsi + r15*4]
;# de-interleave the pairs: xmm0 = c6 a..d, xmm3 = c12 a..d
348 movaps xmm0, xmm3
349 shufps xmm0, xmm7, 136 ;# 10001000
350 shufps xmm3, xmm7, 221 ;# 11011101
352 movaps [rsp + nb130_c6], xmm0
353 movaps [rsp + nb130_c12], xmm3
355 lea rax, [rax + rax*2] ;# replace jnr with j3
356 lea rbx, [rbx + rbx*2]
357 lea rcx, [rcx + rcx*2]
358 lea rdx, [rdx + rdx*2]
360 mov rdi, [rbp + nb130_pos]
361 ;# load coordinates
362 movlps xmm1, [rdi + rax*4] ;# x1 y1 - -
363 movlps xmm2, [rdi + rcx*4] ;# x3 y3 - -
364 movhps xmm1, [rdi + rbx*4] ;# x1 y1 x2 y2
365 movhps xmm2, [rdi + rdx*4] ;# x3 y3 x4 y4
367 movss xmm5, [rdi + rax*4 + 8] ;# z1 - - -
368 movss xmm6, [rdi + rcx*4 + 8] ;# z3 - - -
369 movss xmm7, [rdi + rbx*4 + 8] ;# z2 - - -
370 movss xmm8, [rdi + rdx*4 + 8] ;# z4 - - -
371 movlhps xmm5, xmm7 ;# jza - jzb -
372 movlhps xmm6, xmm8 ;# jzc - jzd -
374 movaps xmm4, xmm1
375 unpcklps xmm1, xmm2 ;# jxa jxc jya jyc
376 unpckhps xmm4, xmm2 ;# jxb jxd jyb jyd
377 movaps xmm2, xmm1
378 unpcklps xmm1, xmm4 ;# x
379 unpckhps xmm2, xmm4 ;# y
380 shufps xmm5, xmm6, 136 ;# 10001000 => jza jzb jzc jzd
382 ;# calc dr
383 subps xmm1, [rsp + nb130_ix]
384 subps xmm2, [rsp + nb130_iy]
385 subps xmm5, [rsp + nb130_iz]
387 ;# store dr
388 movaps [rsp + nb130_dx], xmm1
389 movaps [rsp + nb130_dy], xmm2
390 movaps [rsp + nb130_dz], xmm5
392 ;# square it
393 mulps xmm1,xmm1
394 mulps xmm2,xmm2
395 mulps xmm5,xmm5
396 addps xmm1, xmm2
397 addps xmm1, xmm5
399 ;# rsq in xmm1
401 ;# calculate rinv=1/sqrt(rsq)
;# rsqrtps estimate lu refined by one Newton-Raphson step:
;# rinv = 0.5*lu*(3.0 - rsq*lu*lu)
402 rsqrtps xmm5, xmm1
403 movaps xmm2, xmm5
404 mulps xmm5, xmm5
405 movaps xmm4, [rsp + nb130_three]
406 mulps xmm5, xmm1 ;# rsq*lu*lu
407 subps xmm4, xmm5 ;# 3.0-rsq*lu*lu
408 mulps xmm4, xmm2
409 mulps xmm4, [rsp + nb130_half]
410 movaps xmm2, xmm4
411 mulps xmm1, xmm4
412 ;# xmm2=rinv
413 ;# xmm1=r
415 mulps xmm1, [rsp + nb130_tsc] ;# rtab
417 ;# truncate and convert to integers
418 cvttps2dq xmm5, xmm1
420 ;# convert back to float
421 cvtdq2ps xmm4, xmm5
423 ;# multiply by 8
;# (the loads below read 8 floats per table point: 4 at +0..+12, 4 at +16..+28)
424 pslld xmm5, 3
426 ;# calculate eps
427 subps xmm1, xmm4
429 ;# move to integer registers
;# r8,r9,r10,r11 = table index of lanes 0,1,2,3
430 movhlps xmm6, xmm5
431 movd r8d, xmm5
432 movd r10d, xmm6
433 pshufd xmm5, xmm5, 1
434 pshufd xmm6, xmm6, 1
435 movd r9d, xmm5
436 movd r11d, xmm6
438 ;# xmm1=eps
439 ;# xmm2=rinv
441 mov rsi, [rbp + nb130_VFtab]
442 ;# calculate LJ table
;# the table gathers are interleaved with the Coulomb arithmetic
443 movlps xmm5, [rsi + r8*4]
444 movlps xmm9, [rsi + r8*4 + 16]
446 movlps xmm7, [rsi + r10*4]
447 movlps xmm11, [rsi + r10*4 + 16]
449 movaps xmm0, xmm2 ;# rinv
450 mulps xmm2, [rsp + nb130_qq] ;# vcoul=rinv*qq
451 movaps xmm3, xmm2 ;# copy of vcoul (to calc fscal)
452 mulps xmm3, xmm0 ;# vcoul*rinv
454 movhps xmm5, [rsi + r9*4]
455 movhps xmm9, [rsi + r9*4 + 16]
457 addps xmm2, [rsp + nb130_vctot]
458 movaps [rsp + nb130_vctot], xmm2
460 movhps xmm7, [rsi + r11*4]
461 movhps xmm11, [rsi + r11*4 + 16]
;# transpose: xmm4/xmm8 = first table float (Y), xmm5/xmm9 = second (F)
463 movaps xmm4, xmm5
464 movaps xmm8, xmm9
465 shufps xmm4, xmm7, 136 ;# 10001000
466 shufps xmm8, xmm11, 136 ;# 10001000
467 shufps xmm5, xmm7, 221 ;# 11011101
468 shufps xmm9, xmm11, 221 ;# 11011101
470 movlps xmm7, [rsi + r8*4 + 8]
471 movlps xmm11, [rsi + r8*4 + 24]
473 movlps xmm13, [rsi + r10*4 + 8]
474 movlps xmm14, [rsi + r10*4 + 24]
476 movhps xmm7, [rsi + r9*4 + 8]
477 movhps xmm11, [rsi + r9*4 + 24]
479 movhps xmm13, [rsi + r11*4 + 8]
480 movhps xmm14, [rsi + r11*4 + 24]
;# transpose: xmm6/xmm10 = third table float (G), xmm7/xmm11 = fourth (H)
482 movaps xmm6, xmm7
483 movaps xmm10, xmm11
485 shufps xmm6, xmm13, 136 ;# 10001000
486 shufps xmm10, xmm14, 136 ;# 10001000
487 shufps xmm7, xmm13, 221 ;# 11011101
488 shufps xmm11, xmm14, 221 ;# 11011101
489 ;# dispersion table in xmm4-xmm7, repulsion table in xmm8-xmm11
491 mulps xmm7, xmm1 ;# Heps
492 mulps xmm11, xmm1
493 mulps xmm6, xmm1 ;# Geps
494 mulps xmm10, xmm1
495 mulps xmm7, xmm1 ;# Heps2
496 mulps xmm11, xmm1
497 addps xmm5, xmm6 ;# F+Geps
498 addps xmm9, xmm10
499 addps xmm5, xmm7 ;# F+Geps+Heps2 = Fp
500 addps xmm9, xmm11
501 addps xmm7, xmm7 ;# 2*Heps2
502 addps xmm11, xmm11
503 addps xmm7, xmm6 ;# 2*Heps2+Geps
504 addps xmm11, xmm10
506 addps xmm7, xmm5 ;# FF = Fp + 2*Heps2 + Geps
507 addps xmm11, xmm9
508 mulps xmm5, xmm1 ;# eps*Fp
509 mulps xmm9, xmm1
510 movaps xmm12, [rsp + nb130_c6]
511 movaps xmm13, [rsp + nb130_c12]
512 addps xmm5, xmm4 ;# VV
513 addps xmm9, xmm8
515 mulps xmm5, xmm12 ;# VV*c6 = vnb6
516 mulps xmm9, xmm13 ;# VV*c12 = vnb12
517 addps xmm5, xmm9
518 addps xmm5, [rsp + nb130_Vvdwtot]
519 movaps [rsp + nb130_Vvdwtot], xmm5
521 mulps xmm7, xmm12 ;# FF*c6 = fnb6
522 mulps xmm11, xmm13 ;# FF*c12 = fnb12
523 addps xmm7, xmm11
;# fscal = (vcoul*rinv - tabscale*(fnb6+fnb12)) * rinv
525 mulps xmm7, [rsp + nb130_tsc]
526 subps xmm3, xmm7
527 mulps xmm3, xmm0 ;# fscal
529 movaps xmm9, xmm3
530 movaps xmm10, xmm3
531 movaps xmm11, xmm3
533 movaps xmm12, [rsp + nb130_fix]
534 movaps xmm13, [rsp + nb130_fiy]
535 movaps xmm14, [rsp + nb130_fiz]
;# xmm9,xmm10,xmm11 = fscal*dx, fscal*dy, fscal*dz
537 mulps xmm9, [rsp + nb130_dx]
538 mulps xmm10, [rsp + nb130_dy]
539 mulps xmm11, [rsp + nb130_dz]
541 ;# accumulate i forces
542 addps xmm12, xmm9
543 addps xmm13, xmm10
544 addps xmm14, xmm11
545 movaps [rsp + nb130_fix], xmm12
546 movaps [rsp + nb130_fiy], xmm13
547 movaps [rsp + nb130_fiz], xmm14
549 mov rsi, [rbp + nb130_faction]
550 ;# the fj's - start by accumulating x & y forces from memory
551 movlps xmm0, [rsi + rax*4] ;# x1 y1 - -
552 movlps xmm1, [rsi + rcx*4] ;# x3 y3 - -
553 movhps xmm0, [rsi + rbx*4] ;# x1 y1 x2 y2
554 movhps xmm1, [rsi + rdx*4] ;# x3 y3 x4 y4
556 movaps xmm8, xmm9
557 unpcklps xmm9, xmm10 ;# x1 y1 x2 y2
558 unpckhps xmm8, xmm10 ;# x3 y3 x4 y4
560 ;# update fjx and fjy
561 addps xmm0, xmm9
562 addps xmm1, xmm8
564 movlps [rsi + rax*4], xmm0
565 movlps [rsi + rcx*4], xmm1
566 movhps [rsi + rbx*4], xmm0
567 movhps [rsi + rdx*4], xmm1
569 ;# xmm11: fjz1 fjz2 fjz3 fjz4
570 pshufd xmm10, xmm11, 1 ;# fjz2 - - -
571 movhlps xmm9, xmm11 ;# fjz3 - - -
572 pshufd xmm8, xmm11, 3 ;# fjz4 - - -
574 addss xmm11, [rsi + rax*4 + 8]
575 addss xmm10, [rsi + rbx*4 + 8]
576 addss xmm9, [rsi + rcx*4 + 8]
577 addss xmm8, [rsi + rdx*4 + 8]
578 movss [rsi + rax*4 + 8], xmm11
579 movss [rsi + rbx*4 + 8], xmm10
580 movss [rsi + rcx*4 + 8], xmm9
581 movss [rsi + rdx*4 + 8], xmm8
583 ;# should we do one more iteration?
584 sub dword ptr [rsp + nb130_innerk], 4
585 jl .nb130_finish_inner
586 jmp .nb130_unroll_loop
;# Remainder handling: innerk is negative here (it was biased by -4), so
;# add the 4 back to get the number of j atoms left (0..3), then test bit 1.
587 .nb130_finish_inner:
588 ;# check if at least two particles remain
589 add dword ptr [rsp + nb130_innerk], 4
590 mov edx, [rsp + nb130_innerk]
591 and edx, 2
592 jnz .nb130_dopair
593 jmp .nb130_checksingle
;# Two remaining j atoms (a,b) in lanes 0 and 1. Lanes 2 and 3 carry no
;# valid data, so energies are stored with movlps and the i-force
;# contributions have their upper half zeroed before accumulation.
594 .nb130_dopair:
595 mov rcx, [rsp + nb130_innerjjnr]
597 mov eax, [rcx]
598 mov ebx, [rcx + 4]
599 add qword ptr [rsp + nb130_innerjjnr], 8
601 mov rsi, [rbp + nb130_charge]
602 movss xmm0, [rsi + rax*4]
603 movss xmm2, [rsi + rbx*4]
605 unpcklps xmm0, xmm2 ;# jqa jqb
606 mulps xmm0, [rsp + nb130_iq]
607 movaps [rsp + nb130_qq], xmm0
609 mov rsi, [rbp + nb130_type]
610 ;# vdw parameters
611 mov r12d, [rsi + rax*4]
612 mov r13d, [rsi + rbx*4]
613 shl r12d, 1
614 shl r13d, 1
615 mov edi, [rsp + nb130_ntia]
616 add r12d, edi
617 add r13d, edi
619 mov rsi, [rbp + nb130_vdwparam]
620 movlps xmm3, [rsi + r12*4]
621 movhps xmm3, [rsi + r13*4]
;# xmm0 = c6a c6b 0 0, xmm3 = c12a c12b 0 0
623 xorps xmm7, xmm7
624 movaps xmm0, xmm3
625 shufps xmm0, xmm7, 136 ;# 10001000
626 shufps xmm3, xmm7, 221 ;# 11011101
628 movaps [rsp + nb130_c6], xmm0
629 movaps [rsp + nb130_c12], xmm3
631 lea rax, [rax + rax*2] ;# replace jnr with j3
632 lea rbx, [rbx + rbx*2]
634 ;# load coordinates
635 mov rdi, [rbp + nb130_pos]
637 movlps xmm1, [rdi + rax*4] ;# x1 y1 - -
638 movlps xmm2, [rdi + rbx*4] ;# x2 y2 - -
640 movss xmm5, [rdi + rax*4 + 8] ;# z1 - - -
641 movss xmm6, [rdi + rbx*4 + 8] ;# z2 - - -
643 unpcklps xmm1, xmm2 ;# x1 x2 y1 y2
644 movhlps xmm2, xmm1 ;# y1 y2 - -
645 unpcklps xmm5, xmm6 ;# z1 z2 - -
647 ;# calc dr
648 subps xmm1, [rsp + nb130_ix]
649 subps xmm2, [rsp + nb130_iy]
650 subps xmm5, [rsp + nb130_iz]
652 ;# store dr
653 movaps [rsp + nb130_dx], xmm1
654 movaps [rsp + nb130_dy], xmm2
655 movaps [rsp + nb130_dz], xmm5
657 ;# square it
658 mulps xmm1,xmm1
659 mulps xmm2,xmm2
660 mulps xmm5,xmm5
661 addps xmm1, xmm2
662 addps xmm1, xmm5
664 ;# rsq in xmm1
666 ;# calculate rinv=1/sqrt(rsq)
;# rsqrtps estimate lu refined by one Newton-Raphson step:
;# rinv = 0.5*lu*(3.0 - rsq*lu*lu)
667 rsqrtps xmm5, xmm1
668 movaps xmm2, xmm5
669 mulps xmm5, xmm5
670 movaps xmm4, [rsp + nb130_three]
671 mulps xmm5, xmm1 ;# rsq*lu*lu
672 subps xmm4, xmm5 ;# 3.0-rsq*lu*lu
673 mulps xmm4, xmm2
674 mulps xmm4, [rsp + nb130_half]
675 movaps xmm2, xmm4
676 mulps xmm1, xmm4
677 ;# xmm2=rinv
678 ;# xmm1=r
680 mulps xmm1, [rsp + nb130_tsc] ;# rtab
682 ;# truncate and convert to integers
683 cvttps2dq xmm5, xmm1
685 ;# convert back to float
686 cvtdq2ps xmm4, xmm5
688 ;# multiply by 8
689 pslld xmm5, 3
691 ;# calculate eps
692 subps xmm1, xmm4
694 ;# move to integer registers
;# r8, r9 = table index of lanes 0 and 1
695 movd r8d, xmm5
696 pshufd xmm5, xmm5, 1
697 movd r9d, xmm5
699 ;# xmm1=eps
700 ;# xmm2=rinv
702 mov rsi, [rbp + nb130_VFtab]
703 ;# calculate LJ table
;# each unpcklps/movhlps pair below transposes two 2-float loads so that
;# the even register holds the first float of a,b and the odd the second
704 movlps xmm4, [rsi + r8*4]
705 movlps xmm5, [rsi + r9*4]
707 unpcklps xmm4, xmm5
708 movhlps xmm5, xmm4
710 movlps xmm6, [rsi + r8*4 + 8]
711 movlps xmm7, [rsi + r9*4 + 8]
713 movaps xmm0, xmm2 ;# rinv
714 mulps xmm2, [rsp + nb130_qq] ;# vcoul=rinv*qq
715 movaps xmm3, xmm2 ;# copy of vcoul (to calc fscal)
716 mulps xmm3, xmm0 ;# vcoul*rinv
718 unpcklps xmm6, xmm7
719 movhlps xmm7, xmm6
721 movlps xmm8, [rsi + r8*4 + 16]
722 movlps xmm9, [rsi + r9*4 + 16]
724 unpcklps xmm8, xmm9
725 movhlps xmm9, xmm8
727 addps xmm2, [rsp + nb130_vctot]
;# only lanes 0-1 are written back; lanes 2-3 of vctot stay untouched
728 movlps [rsp + nb130_vctot], xmm2
730 movlps xmm10, [rsi + r8*4 + 24]
731 movlps xmm11, [rsi + r9*4 + 24]
733 unpcklps xmm10, xmm11
734 movhlps xmm11, xmm10
735 ;# dispersion table in xmm4-xmm7, repulsion table in xmm8-xmm11
737 mulps xmm7, xmm1 ;# Heps
738 mulps xmm11, xmm1
739 mulps xmm6, xmm1 ;# Geps
740 mulps xmm10, xmm1
741 mulps xmm7, xmm1 ;# Heps2
742 mulps xmm11, xmm1
743 addps xmm5, xmm6 ;# F+Geps
744 addps xmm9, xmm10
745 addps xmm5, xmm7 ;# F+Geps+Heps2 = Fp
746 addps xmm9, xmm11
747 addps xmm7, xmm7 ;# 2*Heps2
748 addps xmm11, xmm11
749 addps xmm7, xmm6 ;# 2*Heps2+Geps
750 addps xmm11, xmm10
752 addps xmm7, xmm5 ;# FF = Fp + 2*Heps2 + Geps
753 addps xmm11, xmm9
754 mulps xmm5, xmm1 ;# eps*Fp
755 mulps xmm9, xmm1
756 movaps xmm12, [rsp + nb130_c6]
757 movaps xmm13, [rsp + nb130_c12]
758 addps xmm5, xmm4 ;# VV
759 addps xmm9, xmm8
761 mulps xmm5, xmm12 ;# VV*c6 = vnb6
762 mulps xmm9, xmm13 ;# VV*c12 = vnb12
763 addps xmm5, xmm9
764 addps xmm5, [rsp + nb130_Vvdwtot]
;# only lanes 0-1 are written back
765 movlps [rsp + nb130_Vvdwtot], xmm5
767 mulps xmm7, xmm12 ;# FF*c6 = fnb6
768 mulps xmm11, xmm13 ;# FF*c12 = fnb12
769 addps xmm7, xmm11
771 mulps xmm7, [rsp + nb130_tsc]
772 subps xmm3, xmm7
773 mulps xmm3, xmm0 ;# fscal
775 movaps xmm9, xmm3
776 movaps xmm10, xmm3
777 movaps xmm11, xmm3
779 xorps xmm8, xmm8
781 movaps xmm12, [rsp + nb130_fix]
782 movaps xmm13, [rsp + nb130_fiy]
783 movaps xmm14, [rsp + nb130_fiz]
785 mulps xmm9, [rsp + nb130_dx]
786 mulps xmm10, [rsp + nb130_dy]
787 mulps xmm11, [rsp + nb130_dz]
;# zero lanes 2-3 so they add nothing to the i force
789 movlhps xmm9, xmm8
790 movlhps xmm10, xmm8
791 movlhps xmm11, xmm8
793 ;# accumulate i forces
794 addps xmm12, xmm9
795 addps xmm13, xmm10
796 addps xmm14, xmm11
797 movaps [rsp + nb130_fix], xmm12
798 movaps [rsp + nb130_fiy], xmm13
799 movaps [rsp + nb130_fiz], xmm14
801 mov rsi, [rbp + nb130_faction]
802 ;# the fj's - start by accumulating x & y forces from memory
803 movlps xmm0, [rsi + rax*4] ;# x1 y1 - -
804 movhps xmm0, [rsi + rbx*4] ;# x1 y1 x2 y2
806 unpcklps xmm9, xmm10 ;# x1 y1 x2 y2
807 addps xmm0, xmm9
809 movlps [rsi + rax*4], xmm0
810 movhps [rsi + rbx*4], xmm0
812 ;# z forces
813 pshufd xmm8, xmm11, 1
814 addss xmm11, [rsi + rax*4 + 8]
815 addss xmm8, [rsi + rbx*4 + 8]
816 movss [rsi + rax*4 + 8], xmm11
817 movss [rsi + rbx*4 + 8], xmm8
;# Bit 0 of the remaining-atom count: one last j atom still to process?
819 .nb130_checksingle:
820 mov edx, [rsp + nb130_innerk]
821 and edx, 1
822 jnz .nb130_dosingle
823 jmp .nb130_updateouterdata
;# Final single j atom: same computation as the unrolled loop, done with
;# scalar (ss) instructions on lane 0 only.
824 .nb130_dosingle:
;# NOTE(review): this load of pos is overwritten by 'mov edi, ...ntia' below
;# and pos is reloaded before the coordinates are read.
825 mov rdi, [rbp + nb130_pos]
826 mov rcx, [rsp + nb130_innerjjnr]
827 mov eax, [rcx]
829 mov rsi, [rbp + nb130_charge]
830 movss xmm0, [rsi + rax*4]
832 mulss xmm0, [rsp + nb130_iq]
833 movaps [rsp + nb130_qq], xmm0
835 mov rsi, [rbp + nb130_type]
836 ;# vdw parameters
837 mov r12d, [rsi + rax*4]
838 shl r12d, 1
839 mov edi, [rsp + nb130_ntia]
840 add r12d, edi
842 mov rsi, [rbp + nb130_vdwparam]
843 movss xmm0, [rsi + r12*4]
844 movss xmm3, [rsi + r12*4 + 4]
846 movaps [rsp + nb130_c6], xmm0
847 movaps [rsp + nb130_c12], xmm3
849 lea rax, [rax + rax*2] ;# replace jnr with j3
851 mov rdi, [rbp + nb130_pos]
852 ;# load coordinates
853 movss xmm1, [rdi + rax*4]
854 movss xmm2, [rdi + rax*4 + 4]
855 movss xmm5, [rdi + rax*4 + 8]
857 ;# calc dr
858 subss xmm1, [rsp + nb130_ix]
859 subss xmm2, [rsp + nb130_iy]
860 subss xmm5, [rsp + nb130_iz]
862 ;# store dr
863 movaps [rsp + nb130_dx], xmm1
864 movaps [rsp + nb130_dy], xmm2
865 movaps [rsp + nb130_dz], xmm5
867 ;# square it
868 mulss xmm1,xmm1
869 mulss xmm2,xmm2
870 mulss xmm5,xmm5
871 addss xmm1, xmm2
872 addss xmm1, xmm5
874 ;# rsq in xmm1
876 ;# calculate rinv=1/sqrt(rsq)
;# rsqrtss estimate lu refined by one Newton-Raphson step:
;# rinv = 0.5*lu*(3.0 - rsq*lu*lu)
877 rsqrtss xmm5, xmm1
878 movaps xmm2, xmm5
879 mulss xmm5, xmm5
880 movaps xmm4, [rsp + nb130_three]
881 mulss xmm5, xmm1 ;# rsq*lu*lu
882 subss xmm4, xmm5 ;# 3.0-rsq*lu*lu
883 mulss xmm4, xmm2
884 mulss xmm4, [rsp + nb130_half]
885 movaps xmm2, xmm4
886 mulss xmm1, xmm4
887 ;# xmm2=rinv
888 ;# xmm1=r
890 mulss xmm1, [rsp + nb130_tsc] ;# rtab
892 ;# truncate and convert to integers
893 cvttss2si r8d, xmm1
895 ;# convert back to float
896 cvtsi2ss xmm4, r8d
898 ;# multiply by 8
899 shl r8d, 3
901 ;# calculate eps
902 subss xmm1, xmm4
904 ;# xmm1=eps
905 ;# xmm2=rinv
907 mov rsi, [rbp + nb130_VFtab]
908 ;# calculate LJ table
909 movss xmm4, [rsi + r8*4]
910 movss xmm5, [rsi + r8*4 + 4]
911 movss xmm6, [rsi + r8*4 + 8]
912 movss xmm7, [rsi + r8*4 + 12]
913 movss xmm8, [rsi + r8*4 + 16]
914 movss xmm9, [rsi + r8*4 + 20]
915 movss xmm10, [rsi + r8*4 + 24]
916 movss xmm11, [rsi + r8*4 + 28]
917 ;# dispersion table in xmm4-xmm7, repulsion table in xmm8-xmm11
919 ;# coulomb interaction
920 movaps xmm0, xmm2 ;# rinv
921 mulss xmm2, [rsp + nb130_qq] ;# vcoul=rinv*qq
922 movaps xmm3, xmm2 ;# copy of vcoul (to calc fscal)
923 mulss xmm3, xmm0 ;# vcoul*rinv
925 addss xmm2, [rsp + nb130_vctot]
926 movss [rsp + nb130_vctot], xmm2
928 ;# calculate table interaction
929 mulss xmm7, xmm1 ;# Heps
930 mulss xmm11, xmm1
931 mulss xmm6, xmm1 ;# Geps
932 mulss xmm10, xmm1
933 mulss xmm7, xmm1 ;# Heps2
934 mulss xmm11, xmm1
935 addss xmm5, xmm6 ;# F+Geps
936 addss xmm9, xmm10
937 addss xmm5, xmm7 ;# F+Geps+Heps2 = Fp
938 addss xmm9, xmm11
939 addss xmm7, xmm7 ;# 2*Heps2
940 addss xmm11, xmm11
941 addss xmm7, xmm6 ;# 2*Heps2+Geps
942 addss xmm11, xmm10
944 addss xmm7, xmm5 ;# FF = Fp + 2*Heps2 + Geps
945 addss xmm11, xmm9
946 mulss xmm5, xmm1 ;# eps*Fp
947 mulss xmm9, xmm1
948 movaps xmm12, [rsp + nb130_c6]
949 movaps xmm13, [rsp + nb130_c12]
950 addss xmm5, xmm4 ;# VV
951 addss xmm9, xmm8
953 mulss xmm5, xmm12 ;# VV*c6 = vnb6
954 mulss xmm9, xmm13 ;# VV*c12 = vnb12
955 addss xmm5, xmm9
956 addss xmm5, [rsp + nb130_Vvdwtot]
957 movss [rsp + nb130_Vvdwtot], xmm5
959 mulss xmm7, xmm12 ;# FF*c6 = fnb6
960 mulss xmm11, xmm13 ;# FF*c12 = fnb12
961 addss xmm7, xmm11
963 mulss xmm7, [rsp + nb130_tsc]
964 subss xmm3, xmm7
965 mulss xmm3, xmm0 ;# fscal
967 movaps xmm9, xmm3
968 movaps xmm10, xmm3
969 movaps xmm11, xmm3
971 movaps xmm12, [rsp + nb130_fix]
972 movaps xmm13, [rsp + nb130_fiy]
973 movaps xmm14, [rsp + nb130_fiz]
975 mulss xmm9, [rsp + nb130_dx]
976 mulss xmm10, [rsp + nb130_dy]
977 mulss xmm11, [rsp + nb130_dz]
979 ;# accumulate i forces
;# scalar adds touch lane 0 only; lanes 1-3 of fix/fiy/fiz pass through
980 addss xmm12, xmm9
981 addss xmm13, xmm10
982 addss xmm14, xmm11
983 movaps [rsp + nb130_fix], xmm12
984 movaps [rsp + nb130_fiy], xmm13
985 movaps [rsp + nb130_fiz], xmm14
987 mov rsi, [rbp + nb130_faction]
988 ;# add to j forces
989 addss xmm9, [rsi + rax*4]
990 addss xmm10, [rsi + rax*4 + 4]
991 addss xmm11, [rsi + rax*4 + 8]
992 movss [rsi + rax*4], xmm9
993 movss [rsi + rax*4 + 4], xmm10
994 movss [rsi + rax*4 + 8], xmm11
;# End of one outer list: reduce the four-lane accumulators to scalars,
;# apply the i force to faction[ii3] and fshift[is3], add the energies to
;# Vc[ggid] and Vvdw[ggid], then move on to list n+1 or leave the loop.
996 .nb130_updateouterdata:
997 mov ecx, [rsp + nb130_ii3]
998 mov rdi, [rbp + nb130_faction]
999 mov rsi, [rbp + nb130_fshift]
1000 mov edx, [rsp + nb130_is3]
1002 ;# accumulate i forces in xmm0, xmm1, xmm2
1003 movaps xmm0, [rsp + nb130_fix]
1004 movaps xmm1, [rsp + nb130_fiy]
1005 movaps xmm2, [rsp + nb130_fiz]
1007 movhlps xmm3, xmm0
1008 movhlps xmm4, xmm1
1009 movhlps xmm5, xmm2
1010 addps xmm0, xmm3
1011 addps xmm1, xmm4
1012 addps xmm2, xmm5 ;# sum is in 1/2 in xmm0-xmm2
1014 movaps xmm3, xmm0
1015 movaps xmm4, xmm1
1016 movaps xmm5, xmm2
1018 shufps xmm3, xmm3, 1
1019 shufps xmm4, xmm4, 1
1020 shufps xmm5, xmm5, 1
1021 addss xmm0, xmm3
1022 addss xmm1, xmm4
1023 addss xmm2, xmm5 ;# xmm0-xmm2 has single force in pos0
1025 ;# increment i force
;# the accumulated sum is fscal*(xj - xi), i.e. the force on the j atoms,
;# so it is subtracted from the i atom and from fshift
1026 movss xmm3, [rdi + rcx*4]
1027 movss xmm4, [rdi + rcx*4 + 4]
1028 movss xmm5, [rdi + rcx*4 + 8]
1029 subss xmm3, xmm0
1030 subss xmm4, xmm1
1031 subss xmm5, xmm2
1032 movss [rdi + rcx*4], xmm3
1033 movss [rdi + rcx*4 + 4], xmm4
1034 movss [rdi + rcx*4 + 8], xmm5
1036 ;# increment fshift force
1037 movss xmm3, [rsi + rdx*4]
1038 movss xmm4, [rsi + rdx*4 + 4]
1039 movss xmm5, [rsi + rdx*4 + 8]
1040 subss xmm3, xmm0
1041 subss xmm4, xmm1
1042 subss xmm5, xmm2
1043 movss [rsi + rdx*4], xmm3
1044 movss [rsi + rdx*4 + 4], xmm4
1045 movss [rsi + rdx*4 + 8], xmm5
1047 ;# get n from stack
1048 mov esi, [rsp + nb130_n]
1049 ;# get group index for i particle
1050 mov rdx, [rbp + nb130_gid] ;# base of gid[]
1051 mov edx, [rdx + rsi*4] ;# ggid=gid[n]
1053 ;# accumulate total potential energy and update it
1054 movaps xmm7, [rsp + nb130_vctot]
1055 ;# accumulate
1056 movhlps xmm6, xmm7
1057 addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now
1058 movaps xmm6, xmm7
1059 shufps xmm6, xmm6, 1
1060 addss xmm7, xmm6
1062 ;# add earlier value from mem
1063 mov rax, [rbp + nb130_Vc]
1064 addss xmm7, [rax + rdx*4]
1065 ;# move back to mem
1066 movss [rax + rdx*4], xmm7
1068 ;# accumulate total lj energy and update it
1069 movaps xmm7, [rsp + nb130_Vvdwtot]
1070 ;# accumulate
1071 movhlps xmm6, xmm7
1072 addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now
1073 movaps xmm6, xmm7
1074 shufps xmm6, xmm6, 1
1075 addss xmm7, xmm6
1077 ;# add earlier value from mem
1078 mov rax, [rbp + nb130_Vvdw]
1079 addss xmm7, [rax + rdx*4]
1080 ;# move back to mem
1081 movss [rax + rdx*4], xmm7
1083 ;# finish if last
1084 mov ecx, [rsp + nb130_nn1]
1085 ;# esi already loaded with n
1086 inc esi
1087 sub ecx, esi
;# nn1 == n+1: this work unit is finished
1088 jz .nb130_outerend
1090 ;# not last, iterate outer loop once more!
1091 mov [rsp + nb130_n], esi
1092 jmp .nb130_outer
;# Work unit done (esi = n+1 = nn1). If that equals nri, every outer list
;# has been handled; otherwise go back and claim another unit.
1093 .nb130_outerend:
1094 ;# check if more outer neighborlists remain
1095 mov ecx, [rsp + nb130_nri]
1096 ;# esi already loaded with n above
1097 sub ecx, esi
1098 jz .nb130_end
1099 ;# non-zero, do one more workunit
1100 jmp .nb130_threadloop
;# Epilogue: report the iteration counters through *outeriter and
;# *inneriter, then unwind the frame in exact reverse order of the
;# prologue and return to the caller.
1101 .nb130_end:
1102 mov eax, [rsp + nb130_nouter]
1103 mov ebx, [rsp + nb130_ninner]
1104 mov rcx, [rbp + nb130_outeriter]
1105 mov rdx, [rbp + nb130_inneriter]
1106 mov [rcx], eax
1107 mov [rdx], ebx
;# release the local variable area; rsp now points at the xmm save area
1109 add rsp, 432
1110 emms
1112 ;# Restore xmm registers from stack
1113 movaps xmm6, [rsp ]
1114 movaps xmm7, [rsp + 16 ]
1115 movaps xmm8, [rsp + 32 ]
1116 movaps xmm9, [rsp + 48 ]
1117 movaps xmm10, [rsp + 64 ]
1118 movaps xmm11, [rsp + 80 ]
1119 movaps xmm12, [rsp + 96 ]
1120 movaps xmm13, [rsp + 112]
1121 movaps xmm14, [rsp + 128]
1122 movaps xmm15, [rsp + 144]
1124 ;# Reset pointers after restoring xmm6-15
1125 add rsp, 168
;# pop the integer registers in reverse push order
1127 pop r15
1128 pop r14
1129 pop r13
1130 pop r12
1131 pop rdi
1132 pop rsi
1133 pop rbx
1135 pop rbp
;# Return to the caller. Without this, execution would run straight on
;# into the next routine, whose entry label follows immediately.
1136 ret
1143 .globl nb_kernel130nf_x86_64_sse
1144 .globl _nb_kernel130nf_x86_64_sse
1145 nb_kernel130nf_x86_64_sse:
1146 _nb_kernel130nf_x86_64_sse:
1147 ;# Room for return address and rbp (16 bytes)
1148 .equiv nb130nf_fshift, 16
1149 .equiv nb130nf_gid, 24
1150 .equiv nb130nf_pos, 32
1151 .equiv nb130nf_faction, 40
1152 .equiv nb130nf_charge, 48
1153 .equiv nb130nf_p_facel, 56
1154 .equiv nb130nf_argkrf, 64
1155 .equiv nb130nf_argcrf, 72
1156 .equiv nb130nf_Vc, 80
1157 .equiv nb130nf_type, 88
1158 .equiv nb130nf_p_ntype, 96
1159 .equiv nb130nf_vdwparam, 104
1160 .equiv nb130nf_Vvdw, 112
1161 .equiv nb130nf_p_tabscale, 120
1162 .equiv nb130nf_VFtab, 128
1163 .equiv nb130nf_invsqrta, 136
1164 .equiv nb130nf_dvda, 144
1165 .equiv nb130nf_p_gbtabscale, 152
1166 .equiv nb130nf_GBtab, 160
1167 .equiv nb130nf_p_nthreads, 168
1168 .equiv nb130nf_count, 176
1169 .equiv nb130nf_mtx, 184
1170 .equiv nb130nf_outeriter, 192
1171 .equiv nb130nf_inneriter, 200
1172 .equiv nb130nf_work, 208
1173 ;# stack offsets for local variables
1174 ;# bottom of stack is cache-aligned for sse use
1175 .equiv nb130nf_ix, 0
1176 .equiv nb130nf_iy, 16
1177 .equiv nb130nf_iz, 32
1178 .equiv nb130nf_iq, 48
1179 .equiv nb130nf_c6, 64
1180 .equiv nb130nf_c12, 80
1181 .equiv nb130nf_vctot, 96
1182 .equiv nb130nf_Vvdwtot, 112
1183 .equiv nb130nf_half, 128
1184 .equiv nb130nf_three, 144
1185 .equiv nb130nf_krf, 160
1186 .equiv nb130nf_crf, 176
1187 .equiv nb130nf_tsc, 192
1188 .equiv nb130nf_nri, 208
1189 .equiv nb130nf_iinr, 216
1190 .equiv nb130nf_jindex, 224
1191 .equiv nb130nf_jjnr, 232
1192 .equiv nb130nf_shift, 240
1193 .equiv nb130nf_shiftvec, 248
1194 .equiv nb130nf_facel, 256
1195 .equiv nb130nf_innerjjnr, 264
1196 .equiv nb130nf_is3, 272
1197 .equiv nb130nf_ii3, 280
1198 .equiv nb130nf_ntia, 284
1199 .equiv nb130nf_innerk, 288
1200 .equiv nb130nf_n, 292
1201 .equiv nb130nf_nn1, 296
1202 .equiv nb130nf_ntype, 300
1203 .equiv nb130nf_nouter, 304
1204 .equiv nb130nf_ninner, 308
1206 push rbp
1207 mov rbp, rsp
1209 ;# Push integer registers on stack
1210 push rbx
1211 push rsi
1212 push rdi
1213 push r12
1214 push r13
1215 push r14
1216 push r15
1218 ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes)
1219 sub rsp, 168
1221 ;# Save xmm registers to stack
1222 movaps [rsp ], xmm6
1223 movaps [rsp + 16 ], xmm7
1224 movaps [rsp + 32 ], xmm8
1225 movaps [rsp + 48 ], xmm9
1226 movaps [rsp + 64 ], xmm10
1227 movaps [rsp + 80 ], xmm11
1228 movaps [rsp + 96 ], xmm12
1229 movaps [rsp + 112], xmm13
1230 movaps [rsp + 128], xmm14
1231 movaps [rsp + 144], xmm15
1233 emms
1234 sub rsp, 320 ;# local variable stack space (20*16 bytes; offsets 0-311 are used)
1235 ; .if 0 # block below only read by NASM - special calling convention on win64
1236 %ifidn __OUTPUT_FORMAT__, win64
1237 ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
1238 add rbp, 48
1239 ;# Adjust stack pointer for different alignment
1240 ;# Move around arguments to fit AMD64 convention below
1241 ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
1242 ;# win64 passes args in: rcx,rdx,r8,r9 + stack
1243 mov rdi, rcx
1244 mov rsi, rdx
1245 mov rdx, r8
1246 mov rcx, r9
1247 mov r8, [rbp]
1248 mov r9, [rbp + 8]
1249 %endif
1250 ; .endif # end NASM- and win64-specific block
1252 ;# zero 32-bit iteration counters
1253 mov eax, 0
1254 mov [rsp + nb130nf_nouter], eax
1255 mov [rsp + nb130nf_ninner], eax
1257 mov edi, [rdi]
1258 mov [rsp + nb130nf_nri], edi
1259 mov [rsp + nb130nf_iinr], rsi
1260 mov [rsp + nb130nf_jindex], rdx
1261 mov [rsp + nb130nf_jjnr], rcx
1262 mov [rsp + nb130nf_shift], r8
1263 mov [rsp + nb130nf_shiftvec], r9
1264 mov rdi, [rbp + nb130nf_p_ntype]
1265 mov edi, [rdi]
1266 mov [rsp + nb130nf_ntype], edi
1267 mov rsi, [rbp + nb130nf_p_facel]
1268 movss xmm0, [rsi]
1269 movss [rsp + nb130nf_facel], xmm0
1271 mov rax, [rbp + nb130nf_p_tabscale]
1272 movss xmm3, [rax]
1273 shufps xmm3, xmm3, 0
1274 movaps [rsp + nb130nf_tsc], xmm3
1276 ;# create constant floating-point factors on stack
1277 mov eax, 0x3f000000 ;# half in IEEE (hex)
1278 mov [rsp + nb130nf_half], eax
1279 movss xmm1, [rsp + nb130nf_half]
1280 shufps xmm1, xmm1, 0 ;# splat to all elements
1281 movaps xmm2, xmm1
1282 addps xmm2, xmm2 ;# one
1283 movaps xmm3, xmm2
1284 addps xmm2, xmm2 ;# two
1285 addps xmm3, xmm2 ;# three
1286 movaps [rsp + nb130nf_half], xmm1
1287 movaps [rsp + nb130nf_three], xmm3
1290 .nb130nf_threadloop:
1291 mov rsi, [rbp + nb130nf_count] ;# pointer to sync counter
1292 mov eax, [rsi]
1293 .nb130nf_spinlock:
1294 mov ebx, eax ;# ebx=*count=nn0
1295 add ebx, 1 ;# ebx=nn1=nn0+1
1296 lock
1297 cmpxchg [rsi], ebx ;# write nn1 to *counter,
1298 ;# if it hasnt changed.
1299 ;# or reread *counter to eax.
1300 pause ;# -> better p4 performance
1301 jnz .nb130nf_spinlock
1303 ;# if(nn1>nri) nn1=nri
1304 mov ecx, [rsp + nb130nf_nri]
1305 mov edx, ecx
1306 sub ecx, ebx
1307 cmovle ebx, edx ;# if(nn1>nri) nn1=nri
1308 ;# Cleared the spinlock if we got here.
1309 ;# eax contains nn0, ebx contains nn1.
1310 mov [rsp + nb130nf_n], eax
1311 mov [rsp + nb130nf_nn1], ebx
1312 sub ebx, eax ;# calc number of outer lists
1313 mov esi, eax ;# copy n to esi
1314 jg .nb130nf_outerstart
1315 jmp .nb130nf_end
1317 .nb130nf_outerstart:
1318 ;# ebx contains number of outer iterations
1319 add ebx, [rsp + nb130nf_nouter]
1320 mov [rsp + nb130nf_nouter], ebx
1322 .nb130nf_outer:
1323 mov rax, [rsp + nb130nf_shift] ;# rax = pointer into shift[]
1324 mov ebx, [rax + rsi*4] ;# ebx=shift[n]
1326 lea rbx, [rbx + rbx*2] ;# rbx=3*is
1327 mov [rsp + nb130nf_is3],ebx ;# store is3
1329 mov rax, [rsp + nb130nf_shiftvec] ;# rax = base of shiftvec[]
1331 movss xmm0, [rax + rbx*4]
1332 movss xmm1, [rax + rbx*4 + 4]
1333 movss xmm2, [rax + rbx*4 + 8]
1335 mov rcx, [rsp + nb130nf_iinr] ;# rcx = pointer into iinr[]
1336 mov ebx, [rcx + rsi*4] ;# ebx =ii
1338 mov rdx, [rbp + nb130nf_charge]
1339 movss xmm3, [rdx + rbx*4]
1340 mulss xmm3, [rsp + nb130nf_facel]
1341 shufps xmm3, xmm3, 0
1343 mov rdx, [rbp + nb130nf_type]
1344 mov edx, [rdx + rbx*4]
1345 imul edx, [rsp + nb130nf_ntype]
1346 shl edx, 1
1347 mov [rsp + nb130nf_ntia], edx
1349 lea rbx, [rbx + rbx*2] ;# rbx = 3*ii=ii3
1350 mov rax, [rbp + nb130nf_pos] ;# rax = base of pos[]
1352 addss xmm0, [rax + rbx*4]
1353 addss xmm1, [rax + rbx*4 + 4]
1354 addss xmm2, [rax + rbx*4 + 8]
1356 movaps [rsp + nb130nf_iq], xmm3
1358 shufps xmm0, xmm0, 0
1359 shufps xmm1, xmm1, 0
1360 shufps xmm2, xmm2, 0
1362 movaps [rsp + nb130nf_ix], xmm0
1363 movaps [rsp + nb130nf_iy], xmm1
1364 movaps [rsp + nb130nf_iz], xmm2
1366 mov [rsp + nb130nf_ii3], ebx
1368 ;# clear the vctot and Vvdwtot accumulators
1369 xorps xmm4, xmm4
1370 movaps [rsp + nb130nf_vctot], xmm4
1371 movaps [rsp + nb130nf_Vvdwtot], xmm4
1373 mov rax, [rsp + nb130nf_jindex]
1374 mov ecx, [rax + rsi*4] ;# jindex[n]
1375 mov edx, [rax + rsi*4 + 4] ;# jindex[n+1]
1376 sub edx, ecx ;# number of innerloop atoms
1378 mov rsi, [rbp + nb130nf_pos]
1379 mov rax, [rsp + nb130nf_jjnr]
1380 shl ecx, 2
1381 add rax, rcx
1382 mov [rsp + nb130nf_innerjjnr], rax ;# pointer to jjnr[nj0]
1383 mov ecx, edx
1384 sub edx, 4
1385 add ecx, [rsp + nb130nf_ninner]
1386 mov [rsp + nb130nf_ninner], ecx
1387 add edx, 0
1388 mov [rsp + nb130nf_innerk], edx ;# number of innerloop atoms
1389 jge .nb130nf_unroll_loop
1390 jmp .nb130nf_finish_inner
1391 .nb130nf_unroll_loop:
1392 ;# quad-unroll innerloop here
1393 mov rdx, [rsp + nb130nf_innerjjnr] ;# pointer to jjnr[k]
1394 mov eax, [rdx]
1395 mov ebx, [rdx + 4]
1396 mov ecx, [rdx + 8]
1397 mov edx, [rdx + 12] ;# eax-edx=jnr1-4
1398 add qword ptr [rsp + nb130nf_innerjjnr], 16 ;# advance pointer (unrolled 4)
1400 mov rsi, [rbp + nb130nf_charge] ;# base of charge[]
1402 movss xmm3, [rsi + rax*4]
1403 movss xmm4, [rsi + rcx*4]
1404 movss xmm6, [rsi + rbx*4]
1405 movss xmm7, [rsi + rdx*4]
1407 movaps xmm2, [rsp + nb130nf_iq]
1408 shufps xmm3, xmm6, 0
1409 shufps xmm4, xmm7, 0
1410 shufps xmm3, xmm4, 136 ;# constant 10001000 ;# all charges in xmm3
1411 movd mm0, eax ;# use mmx registers as temp storage
1412 movd mm1, ebx
1413 movd mm2, ecx
1414 movd mm3, edx
1416 mov rsi, [rbp + nb130nf_type]
1417 mov eax, [rsi + rax*4]
1418 mov ebx, [rsi + rbx*4]
1419 mov ecx, [rsi + rcx*4]
1420 mov edx, [rsi + rdx*4]
1421 mov rsi, [rbp + nb130nf_vdwparam]
1422 shl eax, 1
1423 shl ebx, 1
1424 shl ecx, 1
1425 shl edx, 1
1426 mov edi, [rsp + nb130nf_ntia]
1427 add eax, edi
1428 add ebx, edi
1429 add ecx, edi
1430 add edx, edi
1432 movlps xmm6, [rsi + rax*4]
1433 movlps xmm7, [rsi + rcx*4]
1434 movhps xmm6, [rsi + rbx*4]
1435 movhps xmm7, [rsi + rdx*4]
1437 movaps xmm4, xmm6
1438 shufps xmm4, xmm7, 136 ;# constant 10001000
1439 shufps xmm6, xmm7, 221 ;# constant 11011101
1441 movd eax, mm0
1442 movd ebx, mm1
1443 movd ecx, mm2
1444 movd edx, mm3
1446 movaps [rsp + nb130nf_c6], xmm4
1447 movaps [rsp + nb130nf_c12], xmm6
1449 mov rsi, [rbp + nb130nf_pos] ;# base of pos[]
1451 lea rax, [rax + rax*2] ;# replace jnr with j3
1452 lea rbx, [rbx + rbx*2]
1454 mulps xmm3, xmm2
1455 lea rcx, [rcx + rcx*2] ;# replace jnr with j3
1456 lea rdx, [rdx + rdx*2]
1458 ;# move four coordinates to xmm0-xmm2
1460 movlps xmm4, [rsi + rax*4]
1461 movlps xmm5, [rsi + rcx*4]
1462 movss xmm2, [rsi + rax*4 + 8]
1463 movss xmm6, [rsi + rcx*4 + 8]
1465 movhps xmm4, [rsi + rbx*4]
1466 movhps xmm5, [rsi + rdx*4]
1468 movss xmm0, [rsi + rbx*4 + 8]
1469 movss xmm1, [rsi + rdx*4 + 8]
1471 shufps xmm2, xmm0, 0
1472 shufps xmm6, xmm1, 0
1474 movaps xmm0, xmm4
1475 movaps xmm1, xmm4
1477 shufps xmm2, xmm6, 136 ;# constant 10001000
1479 shufps xmm0, xmm5, 136 ;# constant 10001000
1480 shufps xmm1, xmm5, 221 ;# constant 11011101
1482 ;# move ix-iz to xmm4-xmm6
1483 movaps xmm4, [rsp + nb130nf_ix]
1484 movaps xmm5, [rsp + nb130nf_iy]
1485 movaps xmm6, [rsp + nb130nf_iz]
1487 ;# calc dr
1488 subps xmm4, xmm0
1489 subps xmm5, xmm1
1490 subps xmm6, xmm2
1492 ;# square it
1493 mulps xmm4,xmm4
1494 mulps xmm5,xmm5
1495 mulps xmm6,xmm6
1496 addps xmm4, xmm5
1497 addps xmm4, xmm6
1498 ;# rsq in xmm4
1500 rsqrtps xmm5, xmm4
1501 ;# lookup seed in xmm5
1502 movaps xmm2, xmm5
1503 mulps xmm5, xmm5
1504 movaps xmm1, [rsp + nb130nf_three]
1505 mulps xmm5, xmm4 ;# rsq*lu*lu
1506 movaps xmm0, [rsp + nb130nf_half]
1507 subps xmm1, xmm5 ;# 3.0-rsq*lu*lu
1508 mulps xmm1, xmm2
1509 mulps xmm0, xmm1 ;# xmm0=rinv
1510 movaps xmm1, xmm0
1511 mulps xmm3, xmm0
1512 addps xmm3, [rsp + nb130nf_vctot]
1513 movaps [rsp + nb130nf_vctot], xmm3
1515 ;# LJ table
1516 mulps xmm4, xmm1 ;# r
1517 mulps xmm4, [rsp + nb130nf_tsc] ;# rtab
1519 movaps xmm0, xmm1 ;# copy of rinv
1520 movhlps xmm5, xmm4
1521 cvttps2pi mm6, xmm4
1522 cvttps2pi mm7, xmm5 ;# mm6/mm7 contain lu indices
1523 cvtpi2ps xmm6, mm6
1524 cvtpi2ps xmm5, mm7
1525 movlhps xmm6, xmm5
1526 subps xmm4, xmm6
1527 movaps xmm1, xmm4 ;# xmm1=eps
1528 movaps xmm2, xmm1
1529 mulps xmm2, xmm2 ;# xmm2=eps2
1530 pslld mm6, 3
1531 pslld mm7, 3
1533 mov rsi, [rbp + nb130nf_VFtab]
1534 movd eax, mm6
1535 psrlq mm6, 32
1536 movd ecx, mm7
1537 psrlq mm7, 32
1538 movd ebx, mm6
1539 movd edx, mm7
1541 ;# dispersion
1542 movlps xmm5, [rsi + rax*4]
1543 movlps xmm7, [rsi + rcx*4]
1544 movhps xmm5, [rsi + rbx*4]
1545 movhps xmm7, [rsi + rdx*4] ;# got half dispersion table
1546 movaps xmm4, xmm5
1547 shufps xmm4, xmm7, 136 ;# constant 10001000
1548 shufps xmm5, xmm7, 221 ;# constant 11011101
1550 movlps xmm7, [rsi + rax*4 + 8]
1551 movlps xmm3, [rsi + rcx*4 + 8]
1552 movhps xmm7, [rsi + rbx*4 + 8]
1553 movhps xmm3, [rsi + rdx*4 + 8] ;# other half of dispersion table
1554 movaps xmm6, xmm7
1555 shufps xmm6, xmm3, 136 ;# constant 10001000
1556 shufps xmm7, xmm3, 221 ;# constant 11011101
1557 ;# dispersion table ready, in xmm4-xmm7
1559 mulps xmm6, xmm1 ;# xmm6=Geps
1560 mulps xmm7, xmm2 ;# xmm7=Heps2
1561 addps xmm5, xmm6
1562 addps xmm5, xmm7 ;# xmm5=Fp
1563 mulps xmm5, xmm1 ;# xmm5=eps*Fp
1564 addps xmm5, xmm4 ;# xmm5=VV
1566 movaps xmm4, [rsp + nb130nf_c6]
1567 mulps xmm5, xmm4 ;# Vvdw6
1569 ;# Update Vvdwtot directly
1570 addps xmm5, [rsp + nb130nf_Vvdwtot]
1571 movaps [rsp + nb130nf_Vvdwtot], xmm5
1573 ;# repulsion
1574 movlps xmm5, [rsi + rax*4 + 16]
1575 movlps xmm7, [rsi + rcx*4 + 16]
1576 movhps xmm5, [rsi + rbx*4 + 16]
1577 movhps xmm7, [rsi + rdx*4 + 16] ;# got half repulsion table
1578 movaps xmm4, xmm5
1579 shufps xmm4, xmm7, 136 ;# constant 10001000
1580 shufps xmm5, xmm7, 221 ;# constant 11011101
1582 movlps xmm7, [rsi + rax*4 + 24]
1583 movlps xmm3, [rsi + rcx*4 + 24]
1584 movhps xmm7, [rsi + rbx*4 + 24]
1585 movhps xmm3, [rsi + rdx*4 + 24] ;# other half of repulsion table
1586 movaps xmm6, xmm7
1587 shufps xmm6, xmm3, 136 ;# constant 10001000
1588 shufps xmm7, xmm3, 221 ;# constant 11011101
1589 ;# table ready, in xmm4-xmm7
1590 mulps xmm6, xmm1 ;# xmm6=Geps
1591 mulps xmm7, xmm2 ;# xmm7=Heps2
1592 addps xmm5, xmm6
1593 addps xmm5, xmm7 ;# xmm5=Fp
1594 mulps xmm5, xmm1 ;# xmm5=eps*Fp
1595 addps xmm5, xmm4 ;# xmm5=VV
1597 movaps xmm4, [rsp + nb130nf_c12]
1598 mulps xmm5, xmm4 ;# Vvdw12
1600 addps xmm5, [rsp + nb130nf_Vvdwtot]
1601 movaps [rsp + nb130nf_Vvdwtot], xmm5
1603 ;# should we do one more iteration?
1604 sub dword ptr [rsp + nb130nf_innerk], 4
1605 jl .nb130nf_finish_inner
1606 jmp .nb130nf_unroll_loop
1607 .nb130nf_finish_inner:
1608 ;# check if at least two particles remain
1609 add dword ptr [rsp + nb130nf_innerk], 4
1610 mov edx, [rsp + nb130nf_innerk]
1611 and edx, 2
1612 jnz .nb130nf_dopair
1613 jmp .nb130nf_checksingle
1614 .nb130nf_dopair:
1615 mov rsi, [rbp + nb130nf_charge]
1617 mov rcx, [rsp + nb130nf_innerjjnr]
1619 mov eax, [rcx]
1620 mov ebx, [rcx + 4]
1621 add qword ptr [rsp + nb130nf_innerjjnr], 8
1623 xorps xmm3, xmm3
1624 movss xmm3, [rsi + rax*4]
1625 movss xmm6, [rsi + rbx*4]
1626 shufps xmm3, xmm6, 12 ;# constant 00001100
1627 shufps xmm3, xmm3, 88 ;# constant 01011000 ;# xmm3(0,1) has the charges
1629 mov rsi, [rbp + nb130nf_type]
1630 mov ecx, eax
1631 mov edx, ebx
1632 mov ecx, [rsi + rcx*4]
1633 mov edx, [rsi + rdx*4]
1634 mov rsi, [rbp + nb130nf_vdwparam]
1635 shl ecx, 1
1636 shl edx, 1
1637 mov edi, [rsp + nb130nf_ntia]
1638 add ecx, edi
1639 add edx, edi
1640 movlps xmm6, [rsi + rcx*4]
1641 movhps xmm6, [rsi + rdx*4]
1642 mov rdi, [rbp + nb130nf_pos]
1643 xorps xmm7,xmm7
1644 movaps xmm4, xmm6
1645 shufps xmm4, xmm4, 8 ;# constant 00001000
1646 shufps xmm6, xmm6, 13 ;# constant 00001101
1647 movlhps xmm4, xmm7
1648 movlhps xmm6, xmm7
1650 movaps [rsp + nb130nf_c6], xmm4
1651 movaps [rsp + nb130nf_c12], xmm6
1653 lea rax, [rax + rax*2]
1654 lea rbx, [rbx + rbx*2]
1655 ;# move coordinates to xmm0-xmm2
1656 movlps xmm1, [rdi + rax*4]
1657 movss xmm2, [rdi + rax*4 + 8]
1658 movhps xmm1, [rdi + rbx*4]
1659 movss xmm0, [rdi + rbx*4 + 8]
1661 mulps xmm3, [rsp + nb130nf_iq]
1663 movlhps xmm3, xmm7
1665 shufps xmm2, xmm0, 0
1667 movaps xmm0, xmm1
1669 shufps xmm2, xmm2, 136 ;# constant 10001000
1671 shufps xmm0, xmm0, 136 ;# constant 10001000
1672 shufps xmm1, xmm1, 221 ;# constant 11011101
1674 ;# move ix-iz to xmm4-xmm6
1675 xorps xmm7, xmm7
1677 movaps xmm4, [rsp + nb130nf_ix]
1678 movaps xmm5, [rsp + nb130nf_iy]
1679 movaps xmm6, [rsp + nb130nf_iz]
1681 ;# calc dr
1682 subps xmm4, xmm0
1683 subps xmm5, xmm1
1684 subps xmm6, xmm2
1686 ;# square it
1687 mulps xmm4,xmm4
1688 mulps xmm5,xmm5
1689 mulps xmm6,xmm6
1690 addps xmm4, xmm5
1691 addps xmm4, xmm6
1692 ;# rsq in xmm4
1694 rsqrtps xmm5, xmm4
1695 ;# lookup seed in xmm5
1696 movaps xmm2, xmm5
1697 mulps xmm5, xmm5
1698 movaps xmm1, [rsp + nb130nf_three]
1699 mulps xmm5, xmm4 ;# rsq*lu*lu
1700 movaps xmm0, [rsp + nb130nf_half]
1701 subps xmm1, xmm5 ;# 3.0-rsq*lu*lu
1702 mulps xmm1, xmm2
1703 mulps xmm0, xmm1 ;# xmm0=rinv
1704 movaps xmm1, xmm0
1705 mulps xmm3, xmm0
1706 addps xmm3, [rsp + nb130nf_vctot]
1707 movaps [rsp + nb130nf_vctot], xmm3
1709 ;# LJ table
1710 mulps xmm4, xmm1 ;# r
1711 mulps xmm4, [rsp + nb130nf_tsc] ;# rtab
1713 movaps xmm0, xmm1 ;# copy of rinv
1714 cvttps2pi mm6, xmm4
1715 cvtpi2ps xmm6, mm6
1716 subps xmm4, xmm6
1717 movaps xmm1, xmm4 ;# xmm1=eps
1718 movaps xmm2, xmm1
1719 mulps xmm2, xmm2 ;# xmm2=eps2
1720 pslld mm6, 3
1722 mov rsi, [rbp + nb130nf_VFtab]
1723 movd eax, mm6
1724 psrlq mm6, 32
1725 movd ebx, mm6
1727 ;# dispersion
1728 movlps xmm5, [rsi + rax*4]
1729 movhps xmm5, [rsi + rbx*4]
1730 movaps xmm4, xmm5
1731 shufps xmm4, xmm7, 136 ;# constant 10001000
1732 shufps xmm5, xmm7, 221 ;# constant 11011101
1734 movlps xmm7, [rsi + rax*4 + 8]
1735 movhps xmm7, [rsi + rbx*4 + 8]
1736 movaps xmm6, xmm7
1737 shufps xmm6, xmm3, 136 ;# constant 10001000
1738 shufps xmm7, xmm3, 221 ;# constant 11011101
1739 ;# dispersion table ready, in xmm4-xmm7
1741 mulps xmm6, xmm1 ;# xmm6=Geps
1742 mulps xmm7, xmm2 ;# xmm7=Heps2
1743 addps xmm5, xmm6
1744 addps xmm5, xmm7 ;# xmm5=Fp
1745 mulps xmm5, xmm1 ;# xmm5=eps*Fp
1746 addps xmm5, xmm4 ;# xmm5=VV
1748 movaps xmm4, [rsp + nb130nf_c6]
1749 mulps xmm5, xmm4 ;# Vvdw6
1751 ;# Update Vvdwtot directly
1752 addps xmm5, [rsp + nb130nf_Vvdwtot]
1753 movaps [rsp + nb130nf_Vvdwtot], xmm5
1755 ;# repulsion
1756 movlps xmm5, [rsi + rax*4 + 16]
1757 movhps xmm5, [rsi + rbx*4 + 16]
1758 movaps xmm4, xmm5
1759 shufps xmm4, xmm7, 136 ;# constant 10001000
1760 shufps xmm5, xmm7, 221 ;# constant 11011101
1762 movlps xmm7, [rsi + rax*4 + 24]
1763 movhps xmm7, [rsi + rbx*4 + 24]
1764 movaps xmm6, xmm7
1765 shufps xmm6, xmm3, 136 ;# constant 10001000
1766 shufps xmm7, xmm3, 221 ;# constant 11011101
1767 ;# table ready, in xmm4-xmm7
1768 mulps xmm6, xmm1 ;# xmm6=Geps
1769 mulps xmm7, xmm2 ;# xmm7=Heps2
1770 addps xmm5, xmm6
1771 addps xmm5, xmm7 ;# xmm5=Fp
1772 mulps xmm5, xmm1 ;# xmm5=eps*Fp
1773 addps xmm5, xmm4 ;# xmm5=VV
1775 movaps xmm4, [rsp + nb130nf_c12]
1776 mulps xmm5, xmm4 ;# Vvdw12
1778 addps xmm5, [rsp + nb130nf_Vvdwtot]
1779 movaps [rsp + nb130nf_Vvdwtot], xmm5
1781 .nb130nf_checksingle:
1782 mov edx, [rsp + nb130nf_innerk]
1783 and edx, 1
1784 jnz .nb130nf_dosingle
1785 jmp .nb130nf_updateouterdata
1786 .nb130nf_dosingle:
1787 mov rsi, [rbp + nb130nf_charge]
1788 mov rdi, [rbp + nb130nf_pos]
1789 mov rcx, [rsp + nb130nf_innerjjnr]
1790 xorps xmm3, xmm3
1791 mov eax, [rcx]
1792 movss xmm3, [rsi + rax*4] ;# xmm3(0) has the charge
1794 mov rsi, [rbp + nb130nf_type]
1795 mov ecx, eax
1796 mov ecx, [rsi + rcx*4]
1797 mov rsi, [rbp + nb130nf_vdwparam]
1798 shl ecx, 1
1799 add ecx, [rsp + nb130nf_ntia]
1800 xorps xmm6, xmm6
1801 movlps xmm6, [rsi + rcx*4]
1802 movaps xmm4, xmm6
1803 shufps xmm4, xmm4, 252 ;# constant 11111100
1804 shufps xmm6, xmm6, 253 ;# constant 11111101
1806 movaps [rsp + nb130nf_c6], xmm4
1807 movaps [rsp + nb130nf_c12], xmm6
1809 lea rax, [rax + rax*2]
1811 ;# move coordinates to xmm0-xmm2
1812 movss xmm0, [rdi + rax*4]
1813 movss xmm1, [rdi + rax*4 + 4]
1814 movss xmm2, [rdi + rax*4 + 8]
1816 mulps xmm3, [rsp + nb130nf_iq]
1818 xorps xmm7, xmm7
1820 movaps xmm4, [rsp + nb130nf_ix]
1821 movaps xmm5, [rsp + nb130nf_iy]
1822 movaps xmm6, [rsp + nb130nf_iz]
1824 ;# calc dr
1825 subps xmm4, xmm0
1826 subps xmm5, xmm1
1827 subps xmm6, xmm2
1829 ;# square it
1830 mulps xmm4,xmm4
1831 mulps xmm5,xmm5
1832 mulps xmm6,xmm6
1833 addps xmm4, xmm5
1834 addps xmm4, xmm6
1835 ;# rsq in xmm4
1837 rsqrtss xmm5, xmm4
1838 ;# lookup seed in xmm5
1839 movss xmm2, xmm5
1840 mulss xmm5, xmm5
1841 movss xmm1, [rsp + nb130nf_three]
1842 mulss xmm5, xmm4 ;# rsq*lu*lu
1843 movss xmm0, [rsp + nb130nf_half]
1844 subss xmm1, xmm5 ;# 3.0-rsq*lu*lu
1845 mulss xmm1, xmm2
1846 mulss xmm0, xmm1 ;# xmm0=rinv
1847 movaps xmm1, xmm0
1848 mulss xmm3, xmm0
1849 addss xmm3, [rsp + nb130nf_vctot]
1850 movss [rsp + nb130nf_vctot], xmm3
1852 ;# LJ table
1853 mulss xmm4, xmm1 ;# r
1854 mulss xmm4, [rsp + nb130nf_tsc] ;# rtab
1856 movaps xmm0, xmm1 ;# copy of rinv
1857 cvttps2pi mm6, xmm4
1858 cvtpi2ps xmm6, mm6
1859 subss xmm4, xmm6
1860 movss xmm1, xmm4 ;# xmm1=eps
1861 movss xmm2, xmm1
1862 mulss xmm2, xmm2 ;# xmm2=eps2
1863 pslld mm6, 3
1865 movd mm0, eax
1867 mov rsi, [rbp + nb130nf_VFtab]
1868 movd eax, mm6
1870 ;# dispersion
1871 movlps xmm5, [rsi + rax*4]
1872 movaps xmm4, xmm5
1873 shufps xmm4, xmm7, 136 ;# constant 10001000
1874 shufps xmm5, xmm7, 221 ;# constant 11011101
1876 movlps xmm7, [rsi + rax*4 + 8]
1877 movaps xmm6, xmm7
1878 shufps xmm6, xmm3, 136 ;# constant 10001000
1879 shufps xmm7, xmm3, 221 ;# constant 11011101
1880 ;# dispersion table ready, in xmm4-xmm7
1882 mulss xmm6, xmm1 ;# xmm6=Geps
1883 mulss xmm7, xmm2 ;# xmm7=Heps2
1884 addss xmm5, xmm6
1885 addss xmm5, xmm7 ;# xmm5=Fp
1886 mulss xmm5, xmm1 ;# xmm5=eps*Fp
1887 addss xmm5, xmm4 ;# xmm5=VV
1889 movss xmm4, [rsp + nb130nf_c6]
1890 mulss xmm5, xmm4 ;# Vvdw6
1892 ;# Update Vvdwtot directly
1893 addss xmm5, [rsp + nb130nf_Vvdwtot]
1894 movss [rsp + nb130nf_Vvdwtot], xmm5
1896 ;# repulsion
1897 movlps xmm5, [rsi + rax*4 + 16]
1898 movaps xmm4, xmm5
1899 shufps xmm4, xmm7, 136 ;# constant 10001000
1900 shufps xmm5, xmm7, 221 ;# constant 11011101
1902 movlps xmm7, [rsi + rax*4 + 24]
1903 movaps xmm6, xmm7
1904 shufps xmm6, xmm3, 136 ;# constant 10001000
1905 shufps xmm7, xmm3, 221 ;# constant 11011101
1906 ;# table ready, in xmm4-xmm7
1907 mulss xmm6, xmm1 ;# xmm6=Geps
1908 mulss xmm7, xmm2 ;# xmm7=Heps2
1909 addss xmm5, xmm6
1910 addss xmm5, xmm7 ;# xmm5=Fp
1911 mulss xmm5, xmm1 ;# xmm5=eps*Fp
1912 addss xmm5, xmm4 ;# xmm5=VV
1914 movss xmm4, [rsp + nb130nf_c12]
1915 mulss xmm5, xmm4 ;# Vvdw12
1917 addss xmm5, [rsp + nb130nf_Vvdwtot]
1918 movss [rsp + nb130nf_Vvdwtot], xmm5
1921 .nb130nf_updateouterdata:
1923 ;# get n from stack
1924 mov esi, [rsp + nb130nf_n]
1925 ;# get group index for i particle
1926 mov rdx, [rbp + nb130nf_gid] ;# base of gid[]
1927 mov edx, [rdx + rsi*4] ;# ggid=gid[n]
1929 ;# accumulate total potential energy and update it
1930 movaps xmm7, [rsp + nb130nf_vctot]
1931 ;# accumulate
1932 movhlps xmm6, xmm7
1933 addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now
1934 movaps xmm6, xmm7
1935 shufps xmm6, xmm6, 1
1936 addss xmm7, xmm6
1938 ;# add earlier value from mem
1939 mov rax, [rbp + nb130nf_Vc]
1940 addss xmm7, [rax + rdx*4]
1941 ;# move back to mem
1942 movss [rax + rdx*4], xmm7
1944 ;# accumulate total lj energy and update it
1945 movaps xmm7, [rsp + nb130nf_Vvdwtot]
1946 ;# accumulate
1947 movhlps xmm6, xmm7
1948 addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now
1949 movaps xmm6, xmm7
1950 shufps xmm6, xmm6, 1
1951 addss xmm7, xmm6
1953 ;# add earlier value from mem
1954 mov rax, [rbp + nb130nf_Vvdw]
1955 addss xmm7, [rax + rdx*4]
1956 ;# move back to mem
1957 movss [rax + rdx*4], xmm7
1959 ;# finish if last
1960 mov ecx, [rsp + nb130nf_nn1]
1961 ;# esi already loaded with n
1962 inc esi
1963 sub ecx, esi
1964 jz .nb130nf_outerend
1966 ;# not last, iterate outer loop once more!
1967 mov [rsp + nb130nf_n], esi
1968 jmp .nb130nf_outer
1969 .nb130nf_outerend:
1970 ;# check if more outer neighborlists remain
1971 mov ecx, [rsp + nb130nf_nri]
1972 ;# esi already loaded with n above
1973 sub ecx, esi
1974 jz .nb130nf_end
1975 ;# non-zero, do one more workunit
1976 jmp .nb130nf_threadloop
1977 .nb130nf_end:
1979 mov eax, [rsp + nb130nf_nouter]
1980 mov ebx, [rsp + nb130nf_ninner]
1981 mov rcx, [rbp + nb130nf_outeriter]
1982 mov rdx, [rbp + nb130nf_inneriter]
1983 mov [rcx], eax
1984 mov [rdx], ebx
1986 add rsp, 320
1987 emms
1989 ;# Restore xmm registers from stack
1990 movaps xmm6, [rsp ]
1991 movaps xmm7, [rsp + 16 ]
1992 movaps xmm8, [rsp + 32 ]
1993 movaps xmm9, [rsp + 48 ]
1994 movaps xmm10, [rsp + 64 ]
1995 movaps xmm11, [rsp + 80 ]
1996 movaps xmm12, [rsp + 96 ]
1997 movaps xmm13, [rsp + 112]
1998 movaps xmm14, [rsp + 128]
1999 movaps xmm15, [rsp + 144]
2001 ;# Reset pointers after restoring xmm6-15
2002 add rsp, 168
2004 pop r15
2005 pop r14
2006 pop r13
2007 pop r12
2008 pop rdi
2009 pop rsi
2010 pop rbx
2012 pop rbp