Updated intel syntax x86-64 asm files to also support MS win64 call convention (ifdef...
[gromacs/rigid-bodies.git] / src / gmxlib / nonbonded / nb_kernel_x86_64_sse / nb_kernel133_x86_64_sse_intel_syntax.s
blob90a0e5ac57714d5b9a2f416f80678e8ced5a7df2
1 ;#
2 ;#
3 ;# Gromacs 4.0 Copyright (c) 1991-2003
4 ;# David van der Spoel, Erik Lindahl
5 ;#
6 ;# This program is free software; you can redistribute it and/or
7 ;# modify it under the terms of the GNU General Public License
8 ;# as published by the Free Software Foundation; either version 2
9 ;# of the License, or (at your option) any later version.
11 ;# To help us fund GROMACS development, we humbly ask that you cite
12 ;# the research papers on the package. Check out http://www.gromacs.org
13 ;#
14 ;# And Hey:
15 ;# Gnomes, ROck Monsters And Chili Sauce
18 ;# These files require GNU binutils 2.10 or later, since we
19 ;# use intel syntax for portability, or a recent version
20 ;# of NASM that understands Extended 3DNow and SSE2 instructions.
21 ;# (NASM is normally only used with MS Visual C++).
22 ;# Since NASM and gnu as disagree on some definitions and use
23 ;# completely different preprocessing options I have to introduce a
24 ;# trick: NASM uses ';' for comments, while gnu as uses '#' on x86.
25 ;# Gnu as treats ';' as a line break, i.e. ignores it. This is the
26 ;# reason why all comments need both symbols...
27 ;# The source is written for GNU as, with intel syntax. When you use
28 ;# NASM we redefine a couple of things. The false if-statement around
29 ;# the following code is seen by GNU as, but NASM doesn't see it, so
30 ;# the code inside is read by NASM but not gcc.
32 ; .if 0 # block below only read by NASM
;# NASM-only compatibility shims: the rest of the file uses GNU as
;# directive spellings, which are mapped here onto NASM equivalents.
33 %define .section section
34 %define .long dd
35 %define .align align
36 %define .globl global
37 ;# NASM only wants 'dword', not 'dword ptr'.
;# Defining 'ptr' as empty turns 'dword ptr' into plain 'dword'.
38 %define ptr
;# Emulate '.equiv name, value' with NASM's 'name equ value'.
39 %macro .equiv 2
40 %1 equ %2
41 %endmacro
42 ; .endif # End of NASM-specific block
43 ; .intel_syntax noprefix # Line only read by gnu as
45 .section .text
48 .globl nb_kernel133_x86_64_sse
49 .globl _nb_kernel133_x86_64_sse
50 nb_kernel133_x86_64_sse:
51 _nb_kernel133_x86_64_sse:
52 ;# Room for return address and rbp (16 bytes)
53 .equiv nb133_fshift, 16
54 .equiv nb133_gid, 24
55 .equiv nb133_pos, 32
56 .equiv nb133_faction, 40
57 .equiv nb133_charge, 48
58 .equiv nb133_p_facel, 56
59 .equiv nb133_argkrf, 64
60 .equiv nb133_argcrf, 72
61 .equiv nb133_Vc, 80
62 .equiv nb133_type, 88
63 .equiv nb133_p_ntype, 96
64 .equiv nb133_vdwparam, 104
65 .equiv nb133_Vvdw, 112
66 .equiv nb133_p_tabscale, 120
67 .equiv nb133_VFtab, 128
68 .equiv nb133_invsqrta, 136
69 .equiv nb133_dvda, 144
70 .equiv nb133_p_gbtabscale, 152
71 .equiv nb133_GBtab, 160
72 .equiv nb133_p_nthreads, 168
73 .equiv nb133_count, 176
74 .equiv nb133_mtx, 184
75 .equiv nb133_outeriter, 192
76 .equiv nb133_inneriter, 200
77 .equiv nb133_work, 208
78 ;# stack offsets for local variables
79 ;# bottom of stack is cache-aligned for sse use
80 .equiv nb133_ixO, 0
81 .equiv nb133_iyO, 16
82 .equiv nb133_izO, 32
83 .equiv nb133_ixH1, 48
84 .equiv nb133_iyH1, 64
85 .equiv nb133_izH1, 80
86 .equiv nb133_ixH2, 96
87 .equiv nb133_iyH2, 112
88 .equiv nb133_izH2, 128
89 .equiv nb133_ixM, 144
90 .equiv nb133_iyM, 160
91 .equiv nb133_izM, 176
92 .equiv nb133_iqM, 192
93 .equiv nb133_iqH, 208
94 .equiv nb133_dxO, 224
95 .equiv nb133_dyO, 240
96 .equiv nb133_dzO, 256
97 .equiv nb133_dxH1, 272
98 .equiv nb133_dyH1, 288
99 .equiv nb133_dzH1, 304
100 .equiv nb133_dxH2, 320
101 .equiv nb133_dyH2, 336
102 .equiv nb133_dzH2, 352
103 .equiv nb133_dxM, 368
104 .equiv nb133_dyM, 384
105 .equiv nb133_dzM, 400
106 .equiv nb133_qqM, 416
107 .equiv nb133_qqH, 432
108 .equiv nb133_rinvH1, 448
109 .equiv nb133_rinvH2, 464
110 .equiv nb133_rinvM, 480
111 .equiv nb133_two, 496
112 .equiv nb133_c6, 512
113 .equiv nb133_c12, 528
114 .equiv nb133_tsc, 544
115 .equiv nb133_fstmp, 560
116 .equiv nb133_krf, 576
117 .equiv nb133_crf, 592
118 .equiv nb133_krsqH1, 608
119 .equiv nb133_krsqH2, 624
120 .equiv nb133_krsqM, 640
121 .equiv nb133_vctot, 656
122 .equiv nb133_Vvdwtot, 672
123 .equiv nb133_fixO, 688
124 .equiv nb133_fiyO, 704
125 .equiv nb133_fizO, 720
126 .equiv nb133_fixH1, 736
127 .equiv nb133_fiyH1, 752
128 .equiv nb133_fizH1, 768
129 .equiv nb133_fixH2, 784
130 .equiv nb133_fiyH2, 800
131 .equiv nb133_fizH2, 816
132 .equiv nb133_fixM, 832
133 .equiv nb133_fiyM, 848
134 .equiv nb133_fizM, 864
135 .equiv nb133_fjx, 880
136 .equiv nb133_fjy, 896
137 .equiv nb133_fjz, 912
138 .equiv nb133_half, 928
139 .equiv nb133_three, 944
140 .equiv nb133_rsqOO, 960
141 .equiv nb133_facel, 976
142 .equiv nb133_iinr, 984
143 .equiv nb133_jindex, 992
144 .equiv nb133_jjnr, 1000
145 .equiv nb133_shift, 1008
146 .equiv nb133_shiftvec, 1016
147 .equiv nb133_innerjjnr, 1024
148 .equiv nb133_is3, 1032
149 .equiv nb133_ii3, 1036
150 .equiv nb133_nri, 1040
151 .equiv nb133_ntia, 1044
152 .equiv nb133_innerk, 1048
153 .equiv nb133_n, 1052
154 .equiv nb133_nn1, 1056
155 .equiv nb133_nouter, 1060
156 .equiv nb133_ninner, 1064
158 push rbp
159 mov rbp, rsp
161 ;# Push integer registers on stack
162 push rbx
163 push rsi
164 push rdi
165 push r12
166 push r13
167 push r14
168 push r15
170 ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes) plus 8 bytes of padding to keep rsp 16-byte aligned for movaps
171 sub rsp, 168
173 ;# Save xmm registers to stack
174 movaps [rsp ], xmm6
175 movaps [rsp + 16 ], xmm7
176 movaps [rsp + 32 ], xmm8
177 movaps [rsp + 48 ], xmm9
178 movaps [rsp + 64 ], xmm10
179 movaps [rsp + 80 ], xmm11
180 movaps [rsp + 96 ], xmm12
181 movaps [rsp + 112], xmm13
182 movaps [rsp + 128], xmm14
183 movaps [rsp + 144], xmm15
185 emms
186 sub rsp, 1072 ;# local variable stack space (67*16 bytes, keeps rsp 16-byte aligned)
187 ; .if 0 # block below only read by NASM - special calling convention on win64
188 %ifidn __OUTPUT_FORMAT__, win64
189 ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
190 add rbp, 48
191 ;# Adjust stack pointer for different alignment
192 ;# Move around arguments to fit AMD64 convention below
193 ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
194 ;# win64 passes args in: rcx,rdx,r8,r9 + stack
195 mov rdi, rcx
196 mov rsi, rdx
197 mov rdx, r8
198 mov rcx, r9
199 mov r8, [rbp]
200 mov r9, [rbp + 8]
201 %endif
202 ; .endif # end NASM- and win64-specific block
204 ;# zero 32-bit iteration counters
205 mov eax, 0
206 mov [rsp + nb133_nouter], eax
207 mov [rsp + nb133_ninner], eax
209 mov edi, [rdi]
210 mov [rsp + nb133_nri], edi
211 mov [rsp + nb133_iinr], rsi
212 mov [rsp + nb133_jindex], rdx
213 mov [rsp + nb133_jjnr], rcx
214 mov [rsp + nb133_shift], r8
215 mov [rsp + nb133_shiftvec], r9
216 mov rsi, [rbp + nb133_p_facel]
217 movss xmm0, [rsi]
218 movss [rsp + nb133_facel], xmm0
220 ;# create constant floating-point factors on stack
221 mov eax, 0x3f000000 ;# half in IEEE (hex)
222 mov [rsp + nb133_half], eax
223 movss xmm1, [rsp + nb133_half]
224 shufps xmm1, xmm1, 0 ;# splat to all elements
225 movaps xmm2, xmm1
226 addps xmm2, xmm2 ;# one
227 movaps xmm3, xmm2
228 addps xmm2, xmm2 ;# two
229 addps xmm3, xmm2 ;# three
230 movaps [rsp + nb133_half], xmm1
231 movaps [rsp + nb133_two], xmm2
232 movaps [rsp + nb133_three], xmm3
234 mov rax, [rbp + nb133_p_tabscale]
235 movss xmm3, [rax]
236 shufps xmm3, xmm3, 0
237 movaps [rsp + nb133_tsc], xmm3
239 ;# assume we have at least one i particle - start directly
240 mov rcx, [rsp + nb133_iinr] ;# rcx = pointer into iinr[]
241 mov ebx, [rcx] ;# ebx =ii
243 mov rdx, [rbp + nb133_charge]
244 movss xmm4, [rdx + rbx*4 + 4]
245 movss xmm3, [rdx + rbx*4 + 12]
246 mov rsi, [rbp + nb133_p_facel]
247 movss xmm0, [rsi]
248 movss xmm5, [rsp + nb133_facel]
249 mulss xmm3, xmm5
250 mulss xmm4, xmm5
252 shufps xmm3, xmm3, 0
253 shufps xmm4, xmm4, 0
254 movaps [rsp + nb133_iqM], xmm3
255 movaps [rsp + nb133_iqH], xmm4
257 mov rdx, [rbp + nb133_type]
258 mov ecx, [rdx + rbx*4]
259 shl ecx, 1
260 mov rdi, [rbp + nb133_p_ntype]
261 imul ecx, [rdi] ;# rcx = ntia = 2*ntype*type[ii0]
262 mov [rsp + nb133_ntia], ecx
263 .nb133_threadloop:
264 mov rsi, [rbp + nb133_count] ;# pointer to sync counter
265 mov eax, [rsi]
266 .nb133_spinlock:
267 mov ebx, eax ;# ebx=*count=nn0
268 add ebx, 1 ;# ebx=nn1=nn0+1
269 lock
270 cmpxchg [rsi], ebx ;# write nn1 to *counter,
271 ;# if it hasnt changed.
272 ;# or reread *counter to eax.
273 pause ;# -> better p4 performance
274 jnz .nb133_spinlock
276 ;# if(nn1>nri) nn1=nri
277 mov ecx, [rsp + nb133_nri]
278 mov edx, ecx
279 sub ecx, ebx
280 cmovle ebx, edx ;# if(nn1>nri) nn1=nri
281 ;# Cleared the spinlock if we got here.
282 ;# eax contains nn0, ebx contains nn1.
283 mov [rsp + nb133_n], eax
284 mov [rsp + nb133_nn1], ebx
285 sub ebx, eax ;# calc number of outer lists
286 mov esi, eax ;# copy n to esi
287 jg .nb133_outerstart
288 jmp .nb133_end
290 .nb133_outerstart:
291 ;# ebx contains number of outer iterations
292 add ebx, [rsp + nb133_nouter]
293 mov [rsp + nb133_nouter], ebx
295 .nb133_outer:
296 mov rax, [rsp + nb133_shift] ;# eax = pointer into shift[]
297 mov ebx, [rax + rsi*4] ;# ebx=shift[n]
299 lea rbx, [rbx + rbx*2] ;# rbx=3*is
300 mov [rsp + nb133_is3],ebx ;# store is3
302 mov rax, [rsp + nb133_shiftvec] ;# eax = base of shiftvec[]
304 movss xmm0, [rax + rbx*4]
305 movss xmm1, [rax + rbx*4 + 4]
306 movss xmm2, [rax + rbx*4 + 8]
308 mov rcx, [rsp + nb133_iinr] ;# ecx = pointer into iinr[]
309 mov ebx, [rcx + rsi*4] ;# ebx =ii
311 movaps xmm3, xmm0
312 movaps xmm4, xmm1
313 movaps xmm5, xmm2
314 movaps xmm6, xmm0
315 movaps xmm7, xmm1
317 lea rbx, [rbx + rbx*2] ;# rbx = 3*ii=ii3
318 mov rax, [rbp + nb133_pos] ;# eax = base of pos[]
319 mov [rsp + nb133_ii3], ebx
321 addss xmm3, [rax + rbx*4] ;# ox
322 addss xmm4, [rax + rbx*4 + 4] ;# oy
323 addss xmm5, [rax + rbx*4 + 8] ;# oz
324 addss xmm6, [rax + rbx*4 + 12] ;# h1x
325 addss xmm7, [rax + rbx*4 + 16] ;# h1y
326 shufps xmm3, xmm3, 0
327 shufps xmm4, xmm4, 0
328 shufps xmm5, xmm5, 0
329 shufps xmm6, xmm6, 0
330 shufps xmm7, xmm7, 0
331 movaps [rsp + nb133_ixO], xmm3
332 movaps [rsp + nb133_iyO], xmm4
333 movaps [rsp + nb133_izO], xmm5
334 movaps [rsp + nb133_ixH1], xmm6
335 movaps [rsp + nb133_iyH1], xmm7
337 movss xmm6, xmm2
338 movss xmm3, xmm0
339 movss xmm4, xmm1
340 movss xmm5, xmm2
341 addss xmm6, [rax + rbx*4 + 20] ;# h1z
342 addss xmm0, [rax + rbx*4 + 24] ;# h2x
343 addss xmm1, [rax + rbx*4 + 28] ;# h2y
344 addss xmm2, [rax + rbx*4 + 32] ;# h2z
345 addss xmm3, [rax + rbx*4 + 36] ;# mx
346 addss xmm4, [rax + rbx*4 + 40] ;# my
347 addss xmm5, [rax + rbx*4 + 44] ;# mz
349 shufps xmm6, xmm6, 0
350 shufps xmm0, xmm0, 0
351 shufps xmm1, xmm1, 0
352 shufps xmm2, xmm2, 0
353 shufps xmm3, xmm3, 0
354 shufps xmm4, xmm4, 0
355 shufps xmm5, xmm5, 0
356 movaps [rsp + nb133_izH1], xmm6
357 movaps [rsp + nb133_ixH2], xmm0
358 movaps [rsp + nb133_iyH2], xmm1
359 movaps [rsp + nb133_izH2], xmm2
360 movaps [rsp + nb133_ixM], xmm3
361 movaps [rsp + nb133_iyM], xmm4
362 movaps [rsp + nb133_izM], xmm5
364 ;# clear vctot and i forces
365 xorps xmm4, xmm4
366 movaps [rsp + nb133_vctot], xmm4
367 movaps [rsp + nb133_Vvdwtot], xmm4
368 movaps [rsp + nb133_fixO], xmm4
369 movaps [rsp + nb133_fiyO], xmm4
370 movaps [rsp + nb133_fizO], xmm4
371 movaps [rsp + nb133_fixH1], xmm4
372 movaps [rsp + nb133_fiyH1], xmm4
373 movaps [rsp + nb133_fizH1], xmm4
374 movaps [rsp + nb133_fixH2], xmm4
375 movaps [rsp + nb133_fiyH2], xmm4
376 movaps [rsp + nb133_fizH2], xmm4
377 movaps [rsp + nb133_fixM], xmm4
378 movaps [rsp + nb133_fiyM], xmm4
379 movaps [rsp + nb133_fizM], xmm4
381 mov rax, [rsp + nb133_jindex]
382 mov ecx, [rax + rsi*4] ;# jindex[n]
383 mov edx, [rax + rsi*4 + 4] ;# jindex[n+1]
384 sub edx, ecx ;# number of innerloop atoms
386 mov rsi, [rbp + nb133_pos]
387 mov rdi, [rbp + nb133_faction]
388 mov rax, [rsp + nb133_jjnr]
389 shl ecx, 2
390 add rax, rcx
391 mov [rsp + nb133_innerjjnr], rax ;# pointer to jjnr[nj0]
392 mov ecx, edx
393 sub edx, 4
394 add ecx, [rsp + nb133_ninner]
395 mov [rsp + nb133_ninner], ecx
396 add edx, 0
397 mov [rsp + nb133_innerk], edx ;# number of innerloop atoms
398 jge .nb133_unroll_loop
399 jmp .nb133_odd_inner
400 .nb133_unroll_loop:
401 ;# quad-unroll innerloop here
402 mov rdx, [rsp + nb133_innerjjnr] ;# pointer to jjnr[k]
403 mov eax, [rdx]
404 mov ebx, [rdx + 4]
405 mov ecx, [rdx + 8]
406 mov edx, [rdx + 12] ;# eax-edx=jnr1-4
408 add qword ptr [rsp + nb133_innerjjnr], 16 ;# advance pointer (unrolled 4)
410 mov rsi, [rbp + nb133_charge] ;# base of charge[]
412 movss xmm3, [rsi + rax*4]
413 movss xmm4, [rsi + rcx*4]
414 movss xmm6, [rsi + rbx*4]
415 movss xmm7, [rsi + rdx*4]
417 shufps xmm3, xmm6, 0
418 shufps xmm4, xmm7, 0
419 shufps xmm3, xmm4, 136 ;# constant 10001000 ;# all charges in xmm3
420 movaps xmm4, xmm3 ;# and in xmm4
421 mulps xmm3, [rsp + nb133_iqM]
422 mulps xmm4, [rsp + nb133_iqH]
424 movaps [rsp + nb133_qqM], xmm3
425 movaps [rsp + nb133_qqH], xmm4
427 mov rsi, [rbp + nb133_type]
428 mov r8d, [rsi + rax*4]
429 mov r9d, [rsi + rbx*4]
430 mov r10d, [rsi + rcx*4]
431 mov r11d, [rsi + rdx*4]
432 mov rsi, [rbp + nb133_vdwparam]
433 shl r8d, 1
434 shl r9d, 1
435 shl r10d, 1
436 shl r11d, 1
437 mov edi, [rsp + nb133_ntia]
438 add r8d, edi
439 add r9d, edi
440 add r10d, edi
441 add r11d, edi
443 movlps xmm6, [rsi + r8*4]
444 movlps xmm7, [rsi + r10*4]
445 movhps xmm6, [rsi + r9*4]
446 movhps xmm7, [rsi + r11*4]
448 movaps xmm4, xmm6
449 shufps xmm4, xmm7, 136 ;# constant 10001000
450 shufps xmm6, xmm7, 221 ;# constant 11011101
452 movaps [rsp + nb133_c6], xmm4
453 movaps [rsp + nb133_c12], xmm6
455 mov rsi, [rbp + nb133_pos] ;# base of pos[]
457 lea rax, [rax + rax*2] ;# replace jnr with j3
458 lea rbx, [rbx + rbx*2]
459 lea rcx, [rcx + rcx*2] ;# replace jnr with j3
460 lea rdx, [rdx + rdx*2]
462 ;# move four coordinates to xmm0-xmm2
463 movlps xmm4, [rsi + rax*4]
464 movlps xmm5, [rsi + rcx*4]
465 movss xmm2, [rsi + rax*4 + 8]
466 movss xmm6, [rsi + rcx*4 + 8]
468 movhps xmm4, [rsi + rbx*4]
469 movhps xmm5, [rsi + rdx*4]
471 movss xmm0, [rsi + rbx*4 + 8]
472 movss xmm1, [rsi + rdx*4 + 8]
474 shufps xmm2, xmm0, 0
475 shufps xmm6, xmm1, 0
477 movaps xmm0, xmm4
478 movaps xmm1, xmm4
480 shufps xmm2, xmm6, 136 ;# constant 10001000
481 shufps xmm0, xmm5, 136 ;# constant 10001000
482 shufps xmm1, xmm5, 221 ;# constant 11011101
484 ;# xmm0 = jx
485 ;# xmm1 = jy
486 ;# xmm2 = jz
488 ;# O interaction
489 ;# copy to xmm3-xmm5
490 movaps xmm3, xmm0
491 movaps xmm4, xmm1
492 movaps xmm5, xmm2
494 subps xmm3, [rsp + nb133_ixO]
495 subps xmm4, [rsp + nb133_iyO]
496 subps xmm5, [rsp + nb133_izO]
498 movaps [rsp + nb133_dxO], xmm3
499 movaps [rsp + nb133_dyO], xmm4
500 movaps [rsp + nb133_dzO], xmm5
502 mulps xmm3, xmm3
503 mulps xmm4, xmm4
504 mulps xmm5, xmm5
506 addps xmm3, xmm4
507 addps xmm3, xmm5
508 ;# xmm3=rsq
510 ;# calculate rinv=1/sqrt(rsq)
511 rsqrtps xmm5, xmm3
512 movaps xmm15, xmm5
513 mulps xmm5, xmm5
514 movaps xmm4, [rsp + nb133_three]
515 mulps xmm5, xmm3 ;# rsq*lu*lu
516 subps xmm4, xmm5 ;# 3-rsq*lu*lu
517 mulps xmm4, xmm15
518 mulps xmm4, [rsp + nb133_half]
519 movaps xmm15, xmm4
520 mulps xmm3, xmm4
521 ;# xmm15=rinv
522 ;# xmm3=r
524 mulps xmm3, [rsp + nb133_tsc] ;# rtab
526 ;# truncate and convert to integers
527 cvttps2dq xmm5, xmm3
529 ;# convert back to float
530 cvtdq2ps xmm4, xmm5
532 ;# multiply by 8
533 pslld xmm5, 3
535 ;# calculate eps
536 subps xmm3, xmm4 ;# xmm3=eps
538 ;# move to integer registers
539 movhlps xmm6, xmm5
540 movd r8d, xmm5
541 movd r10d, xmm6
542 pshufd xmm5, xmm5, 1
543 pshufd xmm6, xmm6, 1
544 movd r9d, xmm5
545 movd r11d, xmm6
546 ;# xmm3=eps
547 ;# xmm15=rinv
549 mov rsi, [rbp + nb133_VFtab]
550 ;# calculate LJ table
551 movlps xmm5, [rsi + r8*4]
552 movlps xmm9, [rsi + r8*4 + 16]
554 movlps xmm7, [rsi + r10*4]
555 movlps xmm11, [rsi + r10*4 + 16]
557 movhps xmm5, [rsi + r9*4]
558 movhps xmm9, [rsi + r9*4 + 16]
560 movhps xmm7, [rsi + r11*4]
561 movhps xmm11, [rsi + r11*4 + 16]
563 movaps xmm4, xmm5
564 movaps xmm8, xmm9
565 shufps xmm4, xmm7, 136 ;# 10001000
566 shufps xmm8, xmm11, 136 ;# 10001000
567 shufps xmm5, xmm7, 221 ;# 11011101
568 shufps xmm9, xmm11, 221 ;# 11011101
570 movlps xmm7, [rsi + r8*4 + 8]
571 movlps xmm11, [rsi + r8*4 + 24]
573 movlps xmm13, [rsi + r10*4 + 8]
574 movlps xmm14, [rsi + r10*4 + 24]
576 movhps xmm7, [rsi + r9*4 + 8]
577 movhps xmm11, [rsi + r9*4 + 24]
579 movhps xmm13, [rsi + r11*4 + 8]
580 movhps xmm14, [rsi + r11*4 + 24]
582 movaps xmm6, xmm7
583 movaps xmm10, xmm11
585 shufps xmm6, xmm13, 136 ;# 10001000
586 shufps xmm10, xmm14, 136 ;# 10001000
587 shufps xmm7, xmm13, 221 ;# 11011101
588 shufps xmm11, xmm14, 221 ;# 11011101
589 ;# dispersion table in xmm4-xmm7, repulsion table in xmm8-xmm11
591 mulps xmm7, xmm3 ;# Heps
592 mulps xmm11, xmm3
593 mulps xmm6, xmm3 ;# Geps
594 mulps xmm10, xmm3
595 mulps xmm7, xmm3 ;# Heps2
596 mulps xmm11, xmm3
597 addps xmm5, xmm6 ;# F+Geps
598 addps xmm9, xmm10
599 addps xmm5, xmm7 ;# F+Geps+Heps2 = Fp
600 addps xmm9, xmm11
601 addps xmm7, xmm7 ;# 2*Heps2
602 addps xmm11, xmm11
603 addps xmm7, xmm6 ;# 2*Heps2+Geps
604 addps xmm11, xmm10
606 addps xmm7, xmm5 ;# FF = Fp + 2*Heps2 + Geps
607 addps xmm11, xmm9
608 mulps xmm5, xmm3 ;# eps*Fp
609 mulps xmm9, xmm3
610 movaps xmm12, [rsp + nb133_c6]
611 movaps xmm13, [rsp + nb133_c12]
612 addps xmm5, xmm4 ;# VV
613 addps xmm9, xmm8
615 mulps xmm5, xmm12 ;# VV*c6 = vnb6
616 mulps xmm9, xmm13 ;# VV*c12 = vnb12
617 addps xmm5, xmm9
618 addps xmm5, [rsp + nb133_Vvdwtot]
619 movaps [rsp + nb133_Vvdwtot], xmm5
621 mulps xmm7, xmm12 ;# FF*c6 = fnb6
622 mulps xmm11, xmm13 ;# FF*c12 = fnb12
623 addps xmm7, xmm11
625 mulps xmm7, [rsp + nb133_tsc]
626 mulps xmm7, xmm15 ;# -fscal
627 xorps xmm9, xmm9
629 subps xmm9, xmm7 ;# fscal
630 movaps xmm10, xmm9
631 movaps xmm11, xmm9
633 mulps xmm9, [rsp + nb133_dxO] ;# fx/fy/fz
634 mulps xmm10, [rsp + nb133_dyO]
635 mulps xmm11, [rsp + nb133_dzO]
637 ;# save j force temporarily
638 movaps [rsp + nb133_fjx], xmm9
639 movaps [rsp + nb133_fjy], xmm10
640 movaps [rsp + nb133_fjz], xmm11
642 ;# increment i O force
643 addps xmm9, [rsp + nb133_fixO]
644 addps xmm10, [rsp + nb133_fiyO]
645 addps xmm11, [rsp + nb133_fizO]
646 movaps [rsp + nb133_fixO], xmm9
647 movaps [rsp + nb133_fiyO], xmm10
648 movaps [rsp + nb133_fizO], xmm11
649 ;# finished O LJ interaction.
651 ;# do H1, H2, and M interactions in parallel.
652 ;# xmm0-xmm2 still contain j coordinates.
653 movaps xmm3, xmm0
654 movaps xmm4, xmm1
655 movaps xmm5, xmm2
656 movaps xmm6, xmm0
657 movaps xmm7, xmm1
658 movaps xmm8, xmm2
660 subps xmm0, [rsp + nb133_ixH1]
661 subps xmm1, [rsp + nb133_iyH1]
662 subps xmm2, [rsp + nb133_izH1]
663 subps xmm3, [rsp + nb133_ixH2]
664 subps xmm4, [rsp + nb133_iyH2]
665 subps xmm5, [rsp + nb133_izH2]
666 subps xmm6, [rsp + nb133_ixM]
667 subps xmm7, [rsp + nb133_iyM]
668 subps xmm8, [rsp + nb133_izM]
670 movaps [rsp + nb133_dxH1], xmm0
671 movaps [rsp + nb133_dyH1], xmm1
672 movaps [rsp + nb133_dzH1], xmm2
673 mulps xmm0, xmm0
674 mulps xmm1, xmm1
675 mulps xmm2, xmm2
676 movaps [rsp + nb133_dxH2], xmm3
677 movaps [rsp + nb133_dyH2], xmm4
678 movaps [rsp + nb133_dzH2], xmm5
679 mulps xmm3, xmm3
680 mulps xmm4, xmm4
681 mulps xmm5, xmm5
682 movaps [rsp + nb133_dxM], xmm6
683 movaps [rsp + nb133_dyM], xmm7
684 movaps [rsp + nb133_dzM], xmm8
685 mulps xmm6, xmm6
686 mulps xmm7, xmm7
687 mulps xmm8, xmm8
688 addps xmm0, xmm1
689 addps xmm0, xmm2
690 addps xmm3, xmm4
691 addps xmm3, xmm5
692 addps xmm6, xmm7
693 addps xmm6, xmm8
695 ;# start doing invsqrt for j atoms
696 rsqrtps xmm1, xmm0
697 rsqrtps xmm4, xmm3
698 rsqrtps xmm7, xmm6
700 movaps xmm2, xmm1
701 movaps xmm5, xmm4
702 movaps xmm8, xmm7
704 mulps xmm1, xmm1 ;# lu*lu
705 mulps xmm4, xmm4 ;# lu*lu
706 mulps xmm7, xmm7 ;# lu*lu
708 movaps xmm9, [rsp + nb133_three]
709 movaps xmm10, xmm9
710 movaps xmm11, xmm9
712 mulps xmm1, xmm0 ;# rsq*lu*lu
713 mulps xmm4, xmm3 ;# rsq*lu*lu
714 mulps xmm7, xmm6 ;# rsq*lu*lu
716 subps xmm9, xmm1
717 subps xmm10, xmm4
718 subps xmm11, xmm7 ;# 3-rsq*lu*lu
720 mulps xmm9, xmm2
721 mulps xmm10, xmm5
722 mulps xmm11, xmm8 ;# lu*(3-rsq*lu*lu)
724 movaps xmm0, [rsp + nb133_half]
725 mulps xmm9, xmm0 ;# rinvH1
726 mulps xmm10, xmm0 ;# rinvH2
727 mulps xmm11, xmm0 ;# rinvM
729 ;# interactions
730 movaps xmm0, xmm9 ;# rinv
731 movaps xmm1, xmm10
732 movaps xmm2, xmm11
733 mulps xmm9, xmm9 ;# rinvsq
734 mulps xmm10, xmm10
735 mulps xmm11, xmm11
736 mulps xmm0, [rsp + nb133_qqH]
737 mulps xmm1, [rsp + nb133_qqH]
738 mulps xmm2, [rsp + nb133_qqM]
739 mulps xmm9, xmm0
740 mulps xmm10, xmm1
741 mulps xmm11, xmm2
743 addps xmm0, [rsp + nb133_vctot]
744 addps xmm1, xmm2
745 addps xmm0, xmm1
746 movaps [rsp + nb133_vctot], xmm0
748 ;# move j forces to local temp variables
749 mov rdi, [rbp + nb133_faction]
750 movlps xmm0, [rdi + rax*4] ;# jxa jya - -
751 movlps xmm1, [rdi + rcx*4] ;# jxc jyc - -
752 movhps xmm0, [rdi + rbx*4] ;# jxa jya jxb jyb
753 movhps xmm1, [rdi + rdx*4] ;# jxc jyc jxd jyd
755 movss xmm2, [rdi + rax*4 + 8] ;# jza - - -
756 movss xmm3, [rdi + rcx*4 + 8] ;# jzc - - -
757 movss xmm5, [rdi + rbx*4 + 8] ;# jzb - - -
758 movss xmm6, [rdi + rdx*4 + 8] ;# jzd - - -
759 movlhps xmm2, xmm5
760 movlhps xmm3, xmm6
762 shufps xmm2, xmm3, 136 ;# 10001000 => jza jzb jzc jzd
764 ;# xmm0: jxa jya jxb jyb
765 ;# xmm1: jxc jyc jxd jyd
766 ;# xmm2: jza jzb jzc jzd
768 movaps xmm7, xmm9
769 movaps xmm8, xmm9
770 movaps xmm13, xmm11
771 movaps xmm14, xmm11
772 movaps xmm15, xmm11
773 movaps xmm11, xmm10
774 movaps xmm12, xmm10
776 mulps xmm7, [rsp + nb133_dxH1]
777 mulps xmm8, [rsp + nb133_dyH1]
778 mulps xmm9, [rsp + nb133_dzH1]
779 mulps xmm10, [rsp + nb133_dxH2]
780 mulps xmm11, [rsp + nb133_dyH2]
781 mulps xmm12, [rsp + nb133_dzH2]
782 mulps xmm13, [rsp + nb133_dxM]
783 mulps xmm14, [rsp + nb133_dyM]
784 mulps xmm15, [rsp + nb133_dzM]
786 ;# fetch forces from O interaction
787 movaps xmm3, [rsp + nb133_fjx]
788 movaps xmm4, [rsp + nb133_fjy]
789 addps xmm2, [rsp + nb133_fjz]
791 addps xmm3, xmm7
792 addps xmm4, xmm8
793 addps xmm2, xmm9
794 addps xmm7, [rsp + nb133_fixH1]
795 addps xmm8, [rsp + nb133_fiyH1]
796 addps xmm9, [rsp + nb133_fizH1]
798 addps xmm3, xmm10
799 addps xmm4, xmm11
800 addps xmm2, xmm12
801 addps xmm10, [rsp + nb133_fixH2]
802 addps xmm11, [rsp + nb133_fiyH2]
803 addps xmm12, [rsp + nb133_fizH2]
805 addps xmm3, xmm13
806 addps xmm4, xmm14
807 addps xmm2, xmm15
808 addps xmm13, [rsp + nb133_fixM]
809 addps xmm14, [rsp + nb133_fiyM]
810 addps xmm15, [rsp + nb133_fizM]
812 movaps [rsp + nb133_fixH1], xmm7
813 movaps [rsp + nb133_fiyH1], xmm8
814 movaps [rsp + nb133_fizH1], xmm9
815 movaps [rsp + nb133_fixH2], xmm10
816 movaps [rsp + nb133_fiyH2], xmm11
817 movaps [rsp + nb133_fizH2], xmm12
818 movaps [rsp + nb133_fixM], xmm13
819 movaps [rsp + nb133_fiyM], xmm14
820 movaps [rsp + nb133_fizM], xmm15
822 ;# xmm3 = fjx , xmm4 = fjy , xmm2=fjz, already updated.
823 movaps xmm5, xmm3
824 unpcklps xmm3, xmm4 ;# fjx1 fjy1 fjx2 fjy2
825 unpckhps xmm5, xmm4 ;# fjx3 fjy3 fjx4 fjy4
827 addps xmm0, xmm3
828 addps xmm1, xmm5
829 movhlps xmm3, xmm2 ;# fjzc fjzd
831 movlps [rdi + rax*4], xmm0
832 movhps [rdi + rbx*4], xmm0
833 movlps [rdi + rcx*4], xmm1
834 movhps [rdi + rdx*4], xmm1
835 movss [rdi + rax*4 + 8], xmm2
836 movss [rdi + rcx*4 + 8], xmm3
837 shufps xmm2, xmm2, 1
838 shufps xmm3, xmm3, 1
839 movss [rdi + rbx*4 + 8], xmm2
840 movss [rdi + rdx*4 + 8], xmm3
842 ;# should we do one more iteration?
843 sub dword ptr [rsp + nb133_innerk], 4
844 jl .nb133_odd_inner
845 jmp .nb133_unroll_loop
846 .nb133_odd_inner:
847 add dword ptr [rsp + nb133_innerk], 4
848 jnz .nb133_odd_loop
849 jmp .nb133_updateouterdata
850 .nb133_odd_loop:
851 mov rdx, [rsp + nb133_innerjjnr] ;# pointer to jjnr[k]
852 mov eax, [rdx]
853 add qword ptr [rsp + nb133_innerjjnr], 4
855 xorps xmm4, xmm4 ;# clear reg.
856 movss xmm4, [rsp + nb133_iqM]
857 mov rsi, [rbp + nb133_charge]
858 movhps xmm4, [rsp + nb133_iqH] ;# [qM 0 qH qH]
859 shufps xmm4, xmm4, 41 ;# [0 qH qH qM]
861 movss xmm3, [rsi + rax*4] ;# charge in xmm3
862 shufps xmm3, xmm3, 0
863 mulps xmm3, xmm4
864 movaps [rsp + nb133_qqM], xmm3 ;# use dummy qq for storage
866 xorps xmm6, xmm6
867 mov rsi, [rbp + nb133_type]
868 mov ebx, [rsi + rax*4]
869 mov rsi, [rbp + nb133_vdwparam]
870 shl ebx, 1
871 add ebx, [rsp + nb133_ntia]
872 movlps xmm6, [rsi + rbx*4]
873 movaps xmm7, xmm6
874 shufps xmm6, xmm6, 252 ;# constant 11111100
875 shufps xmm7, xmm7, 253 ;# constant 11111101
876 movaps [rsp + nb133_c6], xmm6
877 movaps [rsp + nb133_c12], xmm7
879 mov rsi, [rbp + nb133_pos]
880 lea rax, [rax + rax*2]
882 movss xmm0, [rsp + nb133_ixO]
883 movss xmm1, [rsp + nb133_iyO]
884 movss xmm2, [rsp + nb133_izO]
885 movss xmm3, [rsp + nb133_ixH1]
886 movss xmm4, [rsp + nb133_iyH1]
887 movss xmm5, [rsp + nb133_izH1]
888 unpcklps xmm0, [rsp + nb133_ixH2] ;# ixO ixH2 - -
889 unpcklps xmm1, [rsp + nb133_iyH2] ;# iyO iyH2 - -
890 unpcklps xmm2, [rsp + nb133_izH2] ;# izO izH2 - -
891 unpcklps xmm3, [rsp + nb133_ixM] ;# ixH1 ixM - -
892 unpcklps xmm4, [rsp + nb133_iyM] ;# iyH1 iyM - -
893 unpcklps xmm5, [rsp + nb133_izM] ;# izH1 izM - -
894 unpcklps xmm0, xmm3 ;# ixO ixH1 ixH2 ixM
895 unpcklps xmm1, xmm4 ;# same for y
896 unpcklps xmm2, xmm5 ;# same for z
898 ;# broadcast j coords into xmm3-xmm5 (xmm0-xmm2 hold the i coords)
899 movss xmm3, [rsi + rax*4]
900 movss xmm4, [rsi + rax*4 + 4]
901 movss xmm5, [rsi + rax*4 + 8]
902 shufps xmm3, xmm3, 0
903 shufps xmm4, xmm4, 0
904 shufps xmm5, xmm5, 0
906 subps xmm3, xmm0
907 subps xmm4, xmm1
908 subps xmm5, xmm2
910 ;# use O distances for storage
911 movaps [rsp + nb133_dxO], xmm3
912 movaps [rsp + nb133_dyO], xmm4
913 movaps [rsp + nb133_dzO], xmm5
915 mulps xmm3, xmm3
916 mulps xmm4, xmm4
917 mulps xmm5, xmm5
919 addps xmm4, xmm3
920 addps xmm4, xmm5
921 ;# rsq in xmm4
923 rsqrtps xmm5, xmm4
924 ;# lookup seed in xmm5
925 movaps xmm2, xmm5
926 mulps xmm5, xmm5
927 movaps xmm1, [rsp + nb133_three]
928 mulps xmm5, xmm4 ;# rsq*lu*lu
929 movaps xmm0, [rsp + nb133_half]
930 subps xmm1, xmm5 ;# 3-rsq*lu*lu
931 mulps xmm1, xmm2
932 mulps xmm0, xmm1 ;# xmm0=rinv, xmm4=rsq
934 ;# LJ table interaction
935 mulps xmm4, xmm0
936 mulps xmm4, [rsp + nb133_tsc] ;# rtab
938 cvttps2pi mm6, xmm4
939 cvtpi2ps xmm6, mm6
940 subss xmm4, xmm6
941 movss xmm1, xmm4 ;# xmm1=eps
942 movss xmm2, xmm1
943 mulss xmm2, xmm2 ;# xmm2=eps2
944 pslld mm6, 3
946 movd mm0, eax
948 mov rsi, [rbp + nb133_VFtab]
949 movd eax, mm6
951 ;# dispersion
952 movlps xmm5, [rsi + rax*4]
953 movaps xmm4, xmm5
954 shufps xmm4, xmm7, 136 ;# constant 10001000
955 shufps xmm5, xmm7, 221 ;# constant 11011101
957 movlps xmm7, [rsi + rax*4 + 8]
958 movaps xmm6, xmm7
959 shufps xmm6, xmm3, 136 ;# constant 10001000
960 shufps xmm7, xmm3, 221 ;# constant 11011101
961 ;# dispersion table ready, in xmm4-xmm7
963 mulss xmm6, xmm1 ;# xmm6=Geps
964 mulss xmm7, xmm2 ;# xmm7=Heps2
965 addss xmm5, xmm6
966 addss xmm5, xmm7 ;# xmm5=Fp
967 mulss xmm7, [rsp + nb133_two] ;# two*Heps2
968 addss xmm7, xmm6
969 addss xmm7, xmm5 ;# xmm7=FF
970 mulss xmm5, xmm1 ;# xmm5=eps*Fp
971 addss xmm5, xmm4 ;# xmm5=VV
973 movss xmm4, [rsp + nb133_c6]
974 mulss xmm7, xmm4 ;# fijD
975 mulss xmm5, xmm4 ;# Vvdw6
976 mulss xmm7, [rsp + nb133_tsc]
977 ;# put scalar force on stack Update Vvdwtot directly
978 addss xmm5, [rsp + nb133_Vvdwtot]
979 movss [rsp + nb133_fstmp], xmm7
980 movss [rsp + nb133_Vvdwtot], xmm5
982 ;# repulsion
983 movlps xmm5, [rsi + rax*4 + 16]
984 movaps xmm4, xmm5
985 shufps xmm4, xmm7, 136 ;# constant 10001000
986 shufps xmm5, xmm7, 221 ;# constant 11011101
988 movlps xmm7, [rsi + rax*4 + 24]
989 movaps xmm6, xmm7
990 shufps xmm6, xmm3, 136 ;# constant 10001000
991 shufps xmm7, xmm3, 221 ;# constant 11011101
992 ;# table ready, in xmm4-xmm7
993 mulss xmm6, xmm1 ;# xmm6=Geps
994 mulss xmm7, xmm2 ;# xmm7=Heps2
995 addss xmm5, xmm6
996 addss xmm5, xmm7 ;# xmm5=Fp
997 mulss xmm7, [rsp + nb133_two] ;# two*Heps2
998 addss xmm7, xmm6
999 addss xmm7, xmm5 ;# xmm7=FF
1000 mulss xmm5, xmm1 ;# xmm5=eps*Fp
1001 addss xmm5, xmm4 ;# xmm5=VV
1003 movss xmm4, [rsp + nb133_c12]
1004 mulss xmm7, xmm4 ;# fijR
1005 mulss xmm5, xmm4 ;# Vvdw12
1006 mulss xmm7, [rsp + nb133_tsc]
1007 addss xmm7, [rsp + nb133_fstmp]
1008 movss [rsp + nb133_fstmp], xmm7
1009 addss xmm5, [rsp + nb133_Vvdwtot]
1010 movss [rsp + nb133_Vvdwtot], xmm5
1012 movd eax, mm0
1014 movaps xmm4, xmm0
1015 mulps xmm4, [rsp + nb133_qqM]
1016 movaps xmm2, xmm4
1017 mulps xmm4, xmm0
1018 subss xmm4, [rsp + nb133_fstmp]
1019 mulps xmm4, xmm0
1021 addps xmm2, [rsp + nb133_vctot]
1022 movaps [rsp + nb133_vctot], xmm2
1024 movaps xmm0, [rsp + nb133_dxO]
1025 movaps xmm1, [rsp + nb133_dyO]
1026 movaps xmm2, [rsp + nb133_dzO]
1028 mulps xmm0, xmm4
1029 mulps xmm1, xmm4
1030 mulps xmm2, xmm4 ;# xmm0-xmm2 now contains tx-tz (partial force)
1032 movss xmm3, [rsp + nb133_fixO]
1033 movss xmm4, [rsp + nb133_fiyO]
1034 movss xmm5, [rsp + nb133_fizO]
1035 addss xmm3, xmm0
1036 addss xmm4, xmm1
1037 addss xmm5, xmm2
1038 movss [rsp + nb133_fixO], xmm3
1039 movss [rsp + nb133_fiyO], xmm4
1040 movss [rsp + nb133_fizO], xmm5 ;# updated the O force now do the H's
1042 movaps xmm3, xmm0
1043 movaps xmm4, xmm1
1044 movaps xmm5, xmm2
1045 shufps xmm3, xmm3, 0x39 ;# shift right
1046 shufps xmm4, xmm4, 0x39
1047 shufps xmm5, xmm5, 0x39
1048 addss xmm3, [rsp + nb133_fixH1]
1049 addss xmm4, [rsp + nb133_fiyH1]
1050 addss xmm5, [rsp + nb133_fizH1]
1051 movss [rsp + nb133_fixH1], xmm3
1052 movss [rsp + nb133_fiyH1], xmm4
1053 movss [rsp + nb133_fizH1], xmm5 ;# updated the H1 force
1055 shufps xmm3, xmm3, 0x39
1056 shufps xmm4, xmm4, 0x39
1057 shufps xmm5, xmm5, 0x39
1058 addss xmm3, [rsp + nb133_fixH2]
1059 addss xmm4, [rsp + nb133_fiyH2]
1060 addss xmm5, [rsp + nb133_fizH2]
1061 movss [rsp + nb133_fixH2], xmm3
1062 movss [rsp + nb133_fiyH2], xmm4
1063 movss [rsp + nb133_fizH2], xmm5 ;# updated the H2 force
1065 mov rdi, [rbp + nb133_faction]
1066 shufps xmm3, xmm3, 0x39
1067 shufps xmm4, xmm4, 0x39
1068 shufps xmm5, xmm5, 0x39
1069 addss xmm3, [rsp + nb133_fixM]
1070 addss xmm4, [rsp + nb133_fiyM]
1071 addss xmm5, [rsp + nb133_fizM]
1072 movss [rsp + nb133_fixM], xmm3
1073 movss [rsp + nb133_fiyM], xmm4
1074 movss [rsp + nb133_fizM], xmm5 ;# updated the M force
1076 ;# the fj's - load from memory, then accumulate the four tx/ty/tz lanes into single sums
1077 movlps xmm6, [rdi + rax*4]
1078 movss xmm7, [rdi + rax*4 + 8]
1080 movhlps xmm3, xmm0
1081 movhlps xmm4, xmm1
1082 movhlps xmm5, xmm2
1083 addps xmm3, xmm0
1084 addps xmm4, xmm1
1085 addps xmm5, xmm2
1086 movaps xmm0, xmm3
1087 movaps xmm1, xmm4
1088 movaps xmm2, xmm5
1090 shufps xmm3, xmm3, 0x39 ;# shift right
1091 shufps xmm4, xmm4, 0x39
1092 shufps xmm5, xmm5, 0x39
1093 addss xmm0, xmm3
1094 addss xmm1, xmm4
1095 addss xmm2, xmm5
1096 unpcklps xmm0, xmm1 ;# x,y sum in xmm0, z sum in xmm2
1098 addps xmm6, xmm0
1099 addss xmm7, xmm2
1101 movlps [rdi + rax*4], xmm6
1102 movss [rdi + rax*4 + 8], xmm7
1104 dec dword ptr [rsp + nb133_innerk]
1105 jz .nb133_updateouterdata
1106 jmp .nb133_odd_loop
;# Outer-loop epilogue for one i molecule (sites O, H1, H2, M).
;# For every site the four lanes of the fix/fiy/fiz stack accumulators are
;# summed horizontally and the sum is SUBTRACTED from faction[ii3 + 3*site].
;# The per-site sums are also collected in xmm6 (x,y) / xmm7 (z) and
;# subtracted from fshift[is3]. Afterwards the lanes of vctot and Vvdwtot
;# are summed and added to Vc[gid[n]] and Vvdw[gid[n]], and n is advanced.
1107 .nb133_updateouterdata:
1108 mov ecx, [rsp + nb133_ii3] ;# ecx = ii3, float index of the O site in faction[]
1109 mov rdi, [rbp + nb133_faction] ;# rdi = base of faction[]
1110 mov rsi, [rbp + nb133_fshift] ;# rsi = base of fshift[]
1111 mov edx, [rsp + nb133_is3] ;# edx = is3, float index into fshift[]
1113 ;# accumulate Oi forces in xmm0, xmm1, xmm2
1114 movaps xmm0, [rsp + nb133_fixO]
1115 movaps xmm1, [rsp + nb133_fiyO]
1116 movaps xmm2, [rsp + nb133_fizO]
1118 movhlps xmm3, xmm0 ;# lanes 2-3 -> lanes 0-1 of the temporaries
1119 movhlps xmm4, xmm1
1120 movhlps xmm5, xmm2
1121 addps xmm0, xmm3
1122 addps xmm1, xmm4
1123 addps xmm2, xmm5 ;# partial sums are now in lanes 0-1 of xmm0-xmm2
1125 movaps xmm3, xmm0
1126 movaps xmm4, xmm1
1127 movaps xmm5, xmm2
1129 shufps xmm3, xmm3, 1 ;# lane 0 <- lane 1
1130 shufps xmm4, xmm4, 1
1131 shufps xmm5, xmm5, 1
1132 addss xmm0, xmm3
1133 addss xmm1, xmm4
1134 addss xmm2, xmm5 ;# xmm0-xmm2 has single force in pos0
1136 ;# update i force: faction[ii3 + 0..2] -= summed O force
1137 movss xmm3, [rdi + rcx*4]
1138 movss xmm4, [rdi + rcx*4 + 4]
1139 movss xmm5, [rdi + rcx*4 + 8]
1140 subss xmm3, xmm0
1141 subss xmm4, xmm1
1142 subss xmm5, xmm2
1143 movss [rdi + rcx*4], xmm3
1144 movss [rdi + rcx*4 + 4], xmm4
1145 movss [rdi + rcx*4 + 8], xmm5
1147 ;# start the fshift sum: x,y go to lanes 0-1 of xmm6, z to lane 0 of xmm7
1148 movaps xmm6, xmm0
1149 movss xmm7, xmm2
1150 movlhps xmm6, xmm1 ;# xmm6 = [fx ? fy ?]
1151 shufps xmm6, xmm6, 8 ;# constant 00001000 -> lanes 0-1 = fx, fy
1153 ;# accumulate H1i forces in xmm0, xmm1, xmm2
1154 movaps xmm0, [rsp + nb133_fixH1]
1155 movaps xmm1, [rsp + nb133_fiyH1]
1156 movaps xmm2, [rsp + nb133_fizH1]
1158 movhlps xmm3, xmm0
1159 movhlps xmm4, xmm1
1160 movhlps xmm5, xmm2
1161 addps xmm0, xmm3
1162 addps xmm1, xmm4
1163 addps xmm2, xmm5 ;# partial sums are now in lanes 0-1 of xmm0-xmm2
1165 movaps xmm3, xmm0
1166 movaps xmm4, xmm1
1167 movaps xmm5, xmm2
1169 shufps xmm3, xmm3, 1 ;# lane 0 <- lane 1
1170 shufps xmm4, xmm4, 1
1171 shufps xmm5, xmm5, 1
1172 addss xmm0, xmm3
1173 addss xmm1, xmm4
1174 addss xmm2, xmm5 ;# xmm0-xmm2 has single force in pos0
1176 ;# update i force: faction[ii3 + 3..5] -= summed H1 force
1177 movss xmm3, [rdi + rcx*4 + 12]
1178 movss xmm4, [rdi + rcx*4 + 16]
1179 movss xmm5, [rdi + rcx*4 + 20]
1180 subss xmm3, xmm0
1181 subss xmm4, xmm1
1182 subss xmm5, xmm2
1183 movss [rdi + rcx*4 + 12], xmm3
1184 movss [rdi + rcx*4 + 16], xmm4
1185 movss [rdi + rcx*4 + 20], xmm5
1187 ;# add the H1 sum to the fshift accumulators xmm6/xmm7
1188 addss xmm7, xmm2
1189 movlhps xmm0, xmm1
1190 shufps xmm0, xmm0, 8 ;# constant 00001000
1191 addps xmm6, xmm0
1193 ;# accumulate H2i forces in xmm0, xmm1, xmm2
1194 movaps xmm0, [rsp + nb133_fixH2]
1195 movaps xmm1, [rsp + nb133_fiyH2]
1196 movaps xmm2, [rsp + nb133_fizH2]
1198 movhlps xmm3, xmm0
1199 movhlps xmm4, xmm1
1200 movhlps xmm5, xmm2
1201 addps xmm0, xmm3
1202 addps xmm1, xmm4
1203 addps xmm2, xmm5 ;# partial sums are now in lanes 0-1 of xmm0-xmm2
1205 movaps xmm3, xmm0
1206 movaps xmm4, xmm1
1207 movaps xmm5, xmm2
1209 shufps xmm3, xmm3, 1 ;# lane 0 <- lane 1
1210 shufps xmm4, xmm4, 1
1211 shufps xmm5, xmm5, 1
1212 addss xmm0, xmm3
1213 addss xmm1, xmm4
1214 addss xmm2, xmm5 ;# xmm0-xmm2 has single force in pos0
1216 ;# update i force: faction[ii3 + 6..8] -= summed H2 force
1217 movss xmm3, [rdi + rcx*4 + 24]
1218 movss xmm4, [rdi + rcx*4 + 28]
1219 movss xmm5, [rdi + rcx*4 + 32]
1220 subss xmm3, xmm0
1221 subss xmm4, xmm1
1222 subss xmm5, xmm2
1223 movss [rdi + rcx*4 + 24], xmm3
1224 movss [rdi + rcx*4 + 28], xmm4
1225 movss [rdi + rcx*4 + 32], xmm5
1227 ;# add the H2 sum to the fshift accumulators xmm6/xmm7
1228 addss xmm7, xmm2
1229 movlhps xmm0, xmm1
1230 shufps xmm0, xmm0, 8 ;# constant 00001000
1231 addps xmm6, xmm0
1233 ;# accumulate Mi forces in xmm0, xmm1, xmm2
1234 movaps xmm0, [rsp + nb133_fixM]
1235 movaps xmm1, [rsp + nb133_fiyM]
1236 movaps xmm2, [rsp + nb133_fizM]
1238 movhlps xmm3, xmm0
1239 movhlps xmm4, xmm1
1240 movhlps xmm5, xmm2
1241 addps xmm0, xmm3
1242 addps xmm1, xmm4
1243 addps xmm2, xmm5 ;# partial sums are now in lanes 0-1 of xmm0-xmm2
1245 movaps xmm3, xmm0
1246 movaps xmm4, xmm1
1247 movaps xmm5, xmm2
1249 shufps xmm3, xmm3, 1 ;# lane 0 <- lane 1
1250 shufps xmm4, xmm4, 1
1251 shufps xmm5, xmm5, 1
1252 addss xmm0, xmm3
1253 addss xmm1, xmm4
1254 addss xmm2, xmm5 ;# xmm0-xmm2 has single force in pos0
1256 ;# update i force: faction[ii3 + 9..11] -= summed M force
1257 movss xmm3, [rdi + rcx*4 + 36]
1258 movss xmm4, [rdi + rcx*4 + 40]
1259 movss xmm5, [rdi + rcx*4 + 44]
1260 subss xmm3, xmm0
1261 subss xmm4, xmm1
1262 subss xmm5, xmm2
1263 movss [rdi + rcx*4 + 36], xmm3
1264 movss [rdi + rcx*4 + 40], xmm4
1265 movss [rdi + rcx*4 + 44], xmm5
1267 ;# add the M sum to the fshift accumulators xmm6/xmm7
1268 addss xmm7, xmm2
1269 movlhps xmm0, xmm1
1270 shufps xmm0, xmm0, 8 ;# constant 00001000
1271 addps xmm6, xmm0
1273 ;# update fshift: fshift[is3 + 0..2] -= sum over the four sites
1274 movlps xmm3, [rsi + rdx*4]
1275 movss xmm4, [rsi + rdx*4 + 8]
1276 subps xmm3, xmm6
1277 subss xmm4, xmm7
1278 movlps [rsi + rdx*4], xmm3
1279 movss [rsi + rdx*4 + 8], xmm4
1281 ;# get n from stack (rsi no longer needed as fshift base)
1282 mov esi, [rsp + nb133_n]
1283 ;# get group index for i particle
1284 mov rdx, [rbp + nb133_gid] ;# base of gid[]
1285 mov edx, [rdx + rsi*4] ;# ggid=gid[n]
1287 ;# accumulate total potential energy and update it
1288 movaps xmm7, [rsp + nb133_vctot]
1289 ;# horizontal sum of the four lanes
1290 movhlps xmm6, xmm7
1291 addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now
1292 movaps xmm6, xmm7
1293 shufps xmm6, xmm6, 1
1294 addss xmm7, xmm6
1296 ;# add earlier value from mem
1297 mov rax, [rbp + nb133_Vc]
1298 addss xmm7, [rax + rdx*4]
1299 ;# move back to mem
1300 movss [rax + rdx*4], xmm7
1302 ;# accumulate total lj energy and update it
1303 movaps xmm7, [rsp + nb133_Vvdwtot]
1304 ;# horizontal sum of the four lanes
1305 movhlps xmm6, xmm7
1306 addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now
1307 movaps xmm6, xmm7
1308 shufps xmm6, xmm6, 1
1309 addss xmm7, xmm6
1311 ;# add earlier value from mem
1312 mov rax, [rbp + nb133_Vvdw]
1313 addss xmm7, [rax + rdx*4]
1314 ;# move back to mem
1315 movss [rax + rdx*4], xmm7
1317 ;# finish if last
1318 mov ecx, [rsp + nb133_nn1]
1319 ;# esi already loaded with n
1320 inc esi
1321 sub ecx, esi ;# zero when the incremented n has reached nn1
1322 jz .nb133_outerend
1324 ;# not last, iterate outer loop once more!
1325 mov [rsp + nb133_n], esi
1326 jmp .nb133_outer
;# End of one work unit: esi holds the next outer index n.
;# Leave the kernel when n equals nri, otherwise fetch another work unit.
;# NOTE(review): the conditional jump targets the label that follows
;# directly and the long backward jump is an unconditional jmp - presumably
;# to stay within short-jump range on assemblers that do not relax
;# conditional jumps. Confirm before restructuring.
1327 .nb133_outerend:
1328 ;# check if more outer neighborlists remain
1329 mov ecx, [rsp + nb133_nri]
1330 ;# esi already loaded with n above
1331 sub ecx, esi ;# ecx = nri - n
1332 jz .nb133_end
1333 ;# non-zero, do one more workunit
1334 jmp .nb133_threadloop
;# Kernel exit.
;# Stores the accumulated iteration counters through the outeriter and
;# inneriter pointer arguments, releases the local stack area, restores
;# xmm6-xmm15 and the pushed integer registers, and returns to the caller.
1335 .nb133_end:
1336 mov eax, [rsp + nb133_nouter]
1337 mov ebx, [rsp + nb133_ninner]
1338 mov rcx, [rbp + nb133_outeriter]
1339 mov rdx, [rbp + nb133_inneriter]
1340 mov [rcx], eax ;# *outeriter = nouter
1341 mov [rdx], ebx ;# *inneriter = ninner
1343 add rsp, 1072 ;# release the local variable area
1344 emms ;# leave MMX state clean for x87 code in the caller
1346 ;# Restore xmm registers from stack
1347 movaps xmm6, [rsp ]
1348 movaps xmm7, [rsp + 16 ]
1349 movaps xmm8, [rsp + 32 ]
1350 movaps xmm9, [rsp + 48 ]
1351 movaps xmm10, [rsp + 64 ]
1352 movaps xmm11, [rsp + 80 ]
1353 movaps xmm12, [rsp + 96 ]
1354 movaps xmm13, [rsp + 112]
1355 movaps xmm14, [rsp + 128]
1356 movaps xmm15, [rsp + 144]
1358 ;# Reset pointers after restoring xmm6-15
1359 add rsp, 168
1361 pop r15
1362 pop r14
1363 pop r13
1364 pop r12
1365 pop rdi
1366 pop rsi
1367 pop rbx
1369 pop rbp
;# rsp now points at the return address again - return to the caller
;# instead of running on into whatever code follows this routine.
ret
;# nb_kernel133nf_x86_64_sse - entry point, argument layout and one-time setup.
;#
;# The code of this routine only accumulates the Coulomb and LJ energies
;# into Vc[] and Vvdw[]. faction and fshift are part of the argument list
;# but are never written by this routine.
;#
;# Register arguments (AMD64 order, the win64 block below remaps into it):
;#   rdi = pointer to nri, rsi = iinr, rdx = jindex, rcx = jjnr,
;#   r8 = shift, r9 = shiftvec
;# Remaining arguments are read relative to rbp using the offsets below.
;# Locals live in a 592-byte area addressed from rsp.
;# Uses the MMX registers as scratch. emms is issued on entry and exit.
1377 .globl nb_kernel133nf_x86_64_sse
1378 .globl _nb_kernel133nf_x86_64_sse
1379 nb_kernel133nf_x86_64_sse:
1380 _nb_kernel133nf_x86_64_sse:
1381 ;# Stack arguments, as offsets from rbp.
1381 ;# Room for return address and rbp (16 bytes)
1382 .equiv nb133nf_fshift, 16
1383 .equiv nb133nf_gid, 24
1384 .equiv nb133nf_pos, 32
1385 .equiv nb133nf_faction, 40
1386 .equiv nb133nf_charge, 48
1387 .equiv nb133nf_p_facel, 56
1388 .equiv nb133nf_argkrf, 64
1389 .equiv nb133nf_argcrf, 72
1390 .equiv nb133nf_Vc, 80
1391 .equiv nb133nf_type, 88
1392 .equiv nb133nf_p_ntype, 96
1393 .equiv nb133nf_vdwparam, 104
1394 .equiv nb133nf_Vvdw, 112
1395 .equiv nb133nf_p_tabscale, 120
1396 .equiv nb133nf_VFtab, 128
1397 .equiv nb133nf_invsqrta, 136
1398 .equiv nb133nf_dvda, 144
1399 .equiv nb133nf_p_gbtabscale, 152
1400 .equiv nb133nf_GBtab, 160
1401 .equiv nb133nf_p_nthreads, 168
1402 .equiv nb133nf_count, 176
1403 .equiv nb133nf_mtx, 184
1404 .equiv nb133nf_outeriter, 192
1405 .equiv nb133nf_inneriter, 200
1406 .equiv nb133nf_work, 208
1407 ;# stack offsets for local variables (relative to rsp)
1408 ;# bottom of stack is cache-aligned for sse use
1408 ;# offsets 0-480 are 16-byte vector slots, the rest are scalars/pointers
1409 .equiv nb133nf_ixO, 0
1410 .equiv nb133nf_iyO, 16
1411 .equiv nb133nf_izO, 32
1412 .equiv nb133nf_ixH1, 48
1413 .equiv nb133nf_iyH1, 64
1414 .equiv nb133nf_izH1, 80
1415 .equiv nb133nf_ixH2, 96
1416 .equiv nb133nf_iyH2, 112
1417 .equiv nb133nf_izH2, 128
1418 .equiv nb133nf_ixM, 144
1419 .equiv nb133nf_iyM, 160
1420 .equiv nb133nf_izM, 176
1421 .equiv nb133nf_iqM, 192
1422 .equiv nb133nf_iqH, 208
1423 .equiv nb133nf_qqM, 224
1424 .equiv nb133nf_qqH, 240
1425 .equiv nb133nf_rinvH1, 256
1426 .equiv nb133nf_rinvH2, 272
1427 .equiv nb133nf_rinvM, 288
1428 .equiv nb133nf_tsc, 304
1429 .equiv nb133nf_c6, 320
1430 .equiv nb133nf_c12, 336
1431 .equiv nb133nf_krf, 352
1432 .equiv nb133nf_crf, 368
1433 .equiv nb133nf_krsqH1, 384
1434 .equiv nb133nf_krsqH2, 400
1435 .equiv nb133nf_krsqM, 416
1436 .equiv nb133nf_vctot, 432
1437 .equiv nb133nf_Vvdwtot, 448
1438 .equiv nb133nf_half, 464
1439 .equiv nb133nf_three, 480
1440 .equiv nb133nf_nri, 496
1441 .equiv nb133nf_iinr, 504
1442 .equiv nb133nf_jindex, 512
1443 .equiv nb133nf_jjnr, 520
1444 .equiv nb133nf_shift, 528
1445 .equiv nb133nf_shiftvec, 536
1446 .equiv nb133nf_facel, 544
1447 .equiv nb133nf_innerjjnr, 552
1448 .equiv nb133nf_is3, 560
1449 .equiv nb133nf_ii3, 564
1450 .equiv nb133nf_ntia, 568
1451 .equiv nb133nf_innerk, 572
1452 .equiv nb133nf_n, 576
1453 .equiv nb133nf_nn1, 580
1454 .equiv nb133nf_nouter, 584
1455 .equiv nb133nf_ninner, 588
1457 push rbp
1458 mov rbp, rsp ;# rbp is the base for all stack-argument offsets
1460 ;# Push integer registers on stack
1461 push rbx
1462 push rsi
1463 push rdi
1464 push r12
1465 push r13
1466 push r14
1467 push r15
1469 ;# Make room for registers xmm6-xmm15 (10 registers=160 bytes).
1469 ;# 168 = 160 + 8 bytes of padding. Together with the return address and
1469 ;# the eight pushes above this presumably leaves rsp 16-byte aligned for
1469 ;# the movaps stores (assumes ABI-conforming alignment at the call).
1470 sub rsp, 168
1472 ;# Save xmm registers to stack
1473 movaps [rsp ], xmm6
1474 movaps [rsp + 16 ], xmm7
1475 movaps [rsp + 32 ], xmm8
1476 movaps [rsp + 48 ], xmm9
1477 movaps [rsp + 64 ], xmm10
1478 movaps [rsp + 80 ], xmm11
1479 movaps [rsp + 96 ], xmm12
1480 movaps [rsp + 112], xmm13
1481 movaps [rsp + 128], xmm14
1482 movaps [rsp + 144], xmm15
1484 emms
1485 sub rsp, 592 ;# local variable stack space (592 = 37*16, covers the offsets defined above)
1486 ; .if 0 # block below only read by NASM - special calling convention on win64
1487 %ifidn __OUTPUT_FORMAT__, win64
1488 ;# Adjust rbp to account for shadow space (32) & two extra args (2*8) on stack
1488 ;# After this, [rbp] and [rbp + 8] are the 5th and 6th arguments and the
1488 ;# rbp offsets defined above line up with the remaining stack arguments.
1489 add rbp, 48
1490 ;# Adjust stack pointer for different alignment
1491 ;# Move around arguments to fit AMD64 convention below
1492 ;# AMD64 passes args in: rdi,rsi,rdx,rcx,r8,r9 + stack
1493 ;# win64 passes args in: rcx,rdx,r8,r9 + stack
1494 mov rdi, rcx
1495 mov rsi, rdx
1496 mov rdx, r8
1497 mov rcx, r9
1498 mov r8, [rbp]
1499 mov r9, [rbp + 8]
1500 %endif
1501 ; .endif # end NASM- and win64-specific block
1503 ;# zero 32-bit iteration counters
1504 mov eax, 0
1505 mov [rsp + nb133nf_nouter], eax
1506 mov [rsp + nb133nf_ninner], eax
1507 ;# copy the register arguments to the local stack area
1508 mov edi, [rdi] ;# edi = nri (first argument is a pointer)
1509 mov [rsp + nb133nf_nri], edi
1510 mov [rsp + nb133nf_iinr], rsi
1511 mov [rsp + nb133nf_jindex], rdx
1512 mov [rsp + nb133nf_jjnr], rcx
1513 mov [rsp + nb133nf_shift], r8
1514 mov [rsp + nb133nf_shiftvec], r9
1515 mov rsi, [rbp + nb133nf_p_facel]
1516 movss xmm0, [rsi]
1517 movss [rsp + nb133nf_facel], xmm0 ;# local copy of *p_facel
1519 mov rax, [rbp + nb133nf_p_tabscale]
1520 movss xmm3, [rax]
1521 shufps xmm3, xmm3, 0 ;# splat *p_tabscale to all four lanes
1522 movaps [rsp + nb133nf_tsc], xmm3
1524 ;# create constant floating-point factors on stack
1525 mov eax, 0x3f000000 ;# half in IEEE (hex)
1526 mov [rsp + nb133nf_half], eax
1527 movss xmm1, [rsp + nb133nf_half]
1528 shufps xmm1, xmm1, 0 ;# splat to all elements
1529 movaps xmm2, xmm1
1530 addps xmm2, xmm2 ;# one
1531 movaps xmm3, xmm2
1532 addps xmm2, xmm2 ;# two
1533 addps xmm3, xmm2 ;# three
1534 movaps [rsp + nb133nf_half], xmm1
1535 movaps [rsp + nb133nf_three], xmm3
1537 ;# assume we have at least one i particle - start directly
1537 ;# The charges and type of the first i entry (ii = iinr[0]) are used to
1537 ;# set up iqH, iqM and ntia once, before the thread loop.
1538 mov rcx, [rsp + nb133nf_iinr] ;# rcx = pointer into iinr[]
1539 mov ebx, [rcx] ;# ebx =ii
1541 mov rdx, [rbp + nb133nf_charge]
1542 movss xmm4, [rdx + rbx*4 + 4] ;# charge[ii+1], stored below as iqH
1543 movss xmm3, [rdx + rbx*4 + 12] ;# charge[ii+3], stored below as iqM
1546 movss xmm5, [rsp + nb133nf_facel]
1547 mulss xmm3, xmm5
1548 mulss xmm4, xmm5
1550 shufps xmm3, xmm3, 0
1551 shufps xmm4, xmm4, 0
1552 movaps [rsp + nb133nf_iqM], xmm3 ;# iqM = facel*charge[ii+3] in all lanes
1553 movaps [rsp + nb133nf_iqH], xmm4 ;# iqH = facel*charge[ii+1] in all lanes
1555 mov rdx, [rbp + nb133nf_type]
1556 mov ecx, [rdx + rbx*4]
1557 shl ecx, 1
1558 mov rdi, [rbp + nb133nf_p_ntype]
1559 imul ecx, [rdi] ;# rcx = ntia = 2*ntype*type[ii0]
1560 mov [rsp + nb133nf_ntia], ecx
;# Work distribution. The shared counter at *count is advanced atomically
;# by one with lock cmpxchg. The value read (nn0) is the first outer index
;# of this work unit and nn1 = min(nn0 + 1, nri) is its end. If the unit
;# is empty the kernel exits, otherwise nouter is increased by nn1 - nn0.
1562 .nb133nf_threadloop:
1563 mov rsi, [rbp + nb133nf_count] ;# pointer to sync counter
1564 mov eax, [rsi]
1565 .nb133nf_spinlock:
1566 mov ebx, eax ;# ebx=*count=nn0
1567 add ebx, 1 ;# ebx=nn1=nn0+1
1568 lock
1569 cmpxchg [rsi], ebx ;# write nn1 to *counter,
1570 ;# if it hasnt changed.
1571 ;# or reread *counter to eax.
1572 pause ;# -> better p4 performance
1573 jnz .nb133nf_spinlock ;# ZF clear: the exchange did not happen, retry
1575 ;# if(nn1>nri) nn1=nri
1576 mov ecx, [rsp + nb133nf_nri]
1577 mov edx, ecx
1578 sub ecx, ebx ;# flags from nri - nn1
1579 cmovle ebx, edx ;# if(nn1>nri) nn1=nri
1580 ;# Cleared the spinlock if we got here.
1581 ;# eax contains nn0, ebx contains nn1.
1582 mov [rsp + nb133nf_n], eax
1583 mov [rsp + nb133nf_nn1], ebx
1584 sub ebx, eax ;# calc number of outer lists
1585 mov esi, eax ;# copy n to esi (mov leaves the flags of the sub intact)
1586 jg .nb133nf_outerstart ;# nn1 - nn0 > 0: there is work to do
1587 jmp .nb133nf_end
1589 .nb133nf_outerstart:
1590 ;# ebx contains number of outer iterations
1591 add ebx, [rsp + nb133nf_nouter]
1592 mov [rsp + nb133nf_nouter], ebx
;# Start of one outer iteration. On entry esi = n.
;# Loads the shift vector for shift[n], adds it to the twelve coordinates
;# of the i sites O, H1, H2 and M starting at pos[3*iinr[n]], and stores
;# each shifted coordinate splatted over four lanes. Then clears the energy
;# accumulators, sets innerjjnr to &jjnr[jindex[n]], adds the neighbour
;# count to ninner and stores count - 4 in innerk.
1594 .nb133nf_outer:
1595 mov rax, [rsp + nb133nf_shift] ;# rax = pointer into shift[]
1596 mov ebx, [rax + rsi*4] ;# ebx=shift[n]
1598 lea rbx, [rbx + rbx*2] ;# rbx=3*is
1599 mov [rsp + nb133nf_is3],ebx ;# store is3
1601 mov rax, [rsp + nb133nf_shiftvec] ;# rax = base of shiftvec[]
1603 movss xmm0, [rax + rbx*4] ;# shift x
1604 movss xmm1, [rax + rbx*4 + 4] ;# shift y
1605 movss xmm2, [rax + rbx*4 + 8] ;# shift z
1607 mov rcx, [rsp + nb133nf_iinr] ;# rcx = pointer into iinr[]
1608 mov ebx, [rcx + rsi*4] ;# ebx =ii
1610 movaps xmm3, xmm0
1611 movaps xmm4, xmm1
1612 movaps xmm5, xmm2
1613 movaps xmm6, xmm0
1614 movaps xmm7, xmm1
1616 lea rbx, [rbx + rbx*2] ;# rbx = 3*ii=ii3
1617 mov rax, [rbp + nb133nf_pos] ;# rax = base of pos[]
1618 mov [rsp + nb133nf_ii3], ebx
1620 addss xmm3, [rax + rbx*4] ;# ox
1621 addss xmm4, [rax + rbx*4 + 4] ;# oy
1622 addss xmm5, [rax + rbx*4 + 8] ;# oz
1623 addss xmm6, [rax + rbx*4 + 12] ;# h1x
1624 addss xmm7, [rax + rbx*4 + 16] ;# h1y
1625 shufps xmm3, xmm3, 0 ;# splat lane 0 to all lanes
1626 shufps xmm4, xmm4, 0
1627 shufps xmm5, xmm5, 0
1628 shufps xmm6, xmm6, 0
1629 shufps xmm7, xmm7, 0
1630 movaps [rsp + nb133nf_ixO], xmm3
1631 movaps [rsp + nb133nf_iyO], xmm4
1632 movaps [rsp + nb133nf_izO], xmm5
1633 movaps [rsp + nb133nf_ixH1], xmm6
1634 movaps [rsp + nb133nf_iyH1], xmm7
1635 ;# reload lane 0 of the temporaries with the shift vector (register
1635 ;# movss only replaces lane 0, the upper lanes are overwritten by the
1635 ;# splats below)
1636 movss xmm6, xmm2
1637 movss xmm3, xmm0
1638 movss xmm4, xmm1
1639 movss xmm5, xmm2
1640 addss xmm6, [rax + rbx*4 + 20] ;# h1z
1641 addss xmm0, [rax + rbx*4 + 24] ;# h2x
1642 addss xmm1, [rax + rbx*4 + 28] ;# h2y
1643 addss xmm2, [rax + rbx*4 + 32] ;# h2z
1644 addss xmm3, [rax + rbx*4 + 36] ;# mx
1645 addss xmm4, [rax + rbx*4 + 40] ;# my
1646 addss xmm5, [rax + rbx*4 + 44] ;# mz
1648 shufps xmm6, xmm6, 0
1649 shufps xmm0, xmm0, 0
1650 shufps xmm1, xmm1, 0
1651 shufps xmm2, xmm2, 0
1652 shufps xmm3, xmm3, 0
1653 shufps xmm4, xmm4, 0
1654 shufps xmm5, xmm5, 0
1655 movaps [rsp + nb133nf_izH1], xmm6
1656 movaps [rsp + nb133nf_ixH2], xmm0
1657 movaps [rsp + nb133nf_iyH2], xmm1
1658 movaps [rsp + nb133nf_izH2], xmm2
1659 movaps [rsp + nb133nf_ixM], xmm3
1660 movaps [rsp + nb133nf_iyM], xmm4
1661 movaps [rsp + nb133nf_izM], xmm5
1663 ;# clear vctot and Vvdwtot
1664 xorps xmm4, xmm4
1665 movaps [rsp + nb133nf_vctot], xmm4
1666 movaps [rsp + nb133nf_Vvdwtot], xmm4
1668 mov rax, [rsp + nb133nf_jindex]
1669 mov ecx, [rax + rsi*4] ;# jindex[n]
1670 mov edx, [rax + rsi*4 + 4] ;# jindex[n+1]
1671 sub edx, ecx ;# number of innerloop atoms
1673 mov rsi, [rbp + nb133nf_pos] ;# rsi no longer holds n from here on
1674 mov rax, [rsp + nb133nf_jjnr]
1675 shl ecx, 2 ;# byte offset of jjnr[nj0]
1676 add rax, rcx
1677 mov [rsp + nb133nf_innerjjnr], rax ;# pointer to jjnr[nj0]
1678 mov ecx, edx
1679 sub edx, 4
1680 add ecx, [rsp + nb133nf_ninner]
1681 mov [rsp + nb133nf_ninner], ecx
1682 add edx, 0 ;# set the flags from edx again (the add to ecx above changed them)
1683 mov [rsp + nb133nf_innerk], edx ;# number of innerloop atoms minus 4
1684 jge .nb133nf_unroll_loop ;# at least four j atoms: run the unrolled loop
1685 jmp .nb133nf_odd_inner
;# Inner loop, four j atoms per pass, energies only.
;# Lanes 0-3 of every vector belong to jnr1-jnr4 (eax, ebx, ecx, edx).
;# Per pass: gather j charges, c6/c12 and coordinates, compute the squared
;# distances to the O, H1, H2 and M i sites, get 1/r from rsqrtps plus one
;# Newton-Raphson step, evaluate the tabulated dispersion and repulsion
;# for the O site and add qq/r for the H1, H2 and M sites.
1686 .nb133nf_unroll_loop:
1687 ;# quad-unroll innerloop here
1688 mov rdx, [rsp + nb133nf_innerjjnr] ;# pointer to jjnr[k]
1689 mov eax, [rdx]
1690 mov ebx, [rdx + 4]
1691 mov ecx, [rdx + 8]
1692 mov edx, [rdx + 12] ;# eax-edx=jnr1-4
1694 add qword ptr [rsp + nb133nf_innerjjnr], 16 ;# advance pointer (unrolled 4)
1696 mov rsi, [rbp + nb133nf_charge] ;# base of charge[]
1698 movss xmm3, [rsi + rax*4]
1699 movss xmm4, [rsi + rcx*4]
1700 movss xmm6, [rsi + rbx*4]
1701 movss xmm7, [rsi + rdx*4]
1703 shufps xmm3, xmm6, 0
1704 shufps xmm4, xmm7, 0
1705 shufps xmm3, xmm4, 136 ;# constant 10001000 ;# all charges in xmm3
1706 movaps xmm4, xmm3 ;# and in xmm4
1707 mulps xmm3, [rsp + nb133nf_iqM]
1708 mulps xmm4, [rsp + nb133nf_iqH]
1710 movd mm0, eax ;# use mmx registers as temp storage
1711 movd mm1, ebx
1712 movd mm2, ecx
1713 movd mm3, edx
1715 movaps [rsp + nb133nf_qqM], xmm3
1716 movaps [rsp + nb133nf_qqH], xmm4
1718 mov rsi, [rbp + nb133nf_type]
1719 mov eax, [rsi + rax*4]
1720 mov ebx, [rsi + rbx*4]
1721 mov ecx, [rsi + rcx*4]
1722 mov edx, [rsi + rdx*4]
1723 mov rsi, [rbp + nb133nf_vdwparam]
1724 shl eax, 1
1725 shl ebx, 1
1726 shl ecx, 1
1727 shl edx, 1
1728 mov edi, [rsp + nb133nf_ntia]
1729 add eax, edi ;# index of the (c6,c12) pair = ntia + 2*type[jnr]
1730 add ebx, edi
1731 add ecx, edi
1732 add edx, edi
1734 movlps xmm6, [rsi + rax*4]
1735 movlps xmm7, [rsi + rcx*4]
1736 movhps xmm6, [rsi + rbx*4]
1737 movhps xmm7, [rsi + rdx*4]
1739 movaps xmm4, xmm6
1740 shufps xmm4, xmm7, 136 ;# constant 10001000, first element of each pair (c6)
1741 shufps xmm6, xmm7, 221 ;# constant 11011101, second element of each pair (c12)
1743 movd eax, mm0 ;# get jnr1-4 back from the mmx registers
1744 movd ebx, mm1
1745 movd ecx, mm2
1746 movd edx, mm3
1748 movaps [rsp + nb133nf_c6], xmm4
1749 movaps [rsp + nb133nf_c12], xmm6
1751 mov rsi, [rbp + nb133nf_pos] ;# base of pos[]
1753 lea rax, [rax + rax*2] ;# replace jnr with j3
1754 lea rbx, [rbx + rbx*2]
1755 lea rcx, [rcx + rcx*2] ;# replace jnr with j3
1756 lea rdx, [rdx + rdx*2]
1758 ;# move four coordinates to xmm0-xmm2
1759 movlps xmm4, [rsi + rax*4]
1760 movlps xmm5, [rsi + rcx*4]
1761 movss xmm2, [rsi + rax*4 + 8]
1762 movss xmm6, [rsi + rcx*4 + 8]
1764 movhps xmm4, [rsi + rbx*4]
1765 movhps xmm5, [rsi + rdx*4]
1767 movss xmm0, [rsi + rbx*4 + 8]
1768 movss xmm1, [rsi + rdx*4 + 8]
1770 shufps xmm2, xmm0, 0
1771 shufps xmm6, xmm1, 0
1773 movaps xmm0, xmm4
1774 movaps xmm1, xmm4
1776 shufps xmm2, xmm6, 136 ;# constant 10001000, xmm2 = jz for the four atoms
1778 shufps xmm0, xmm5, 136 ;# constant 10001000, xmm0 = jx
1779 shufps xmm1, xmm5, 221 ;# constant 11011101, xmm1 = jy
1781 ;# move ixO-izO to xmm4-xmm6
1782 movaps xmm4, [rsp + nb133nf_ixO]
1783 movaps xmm5, [rsp + nb133nf_iyO]
1784 movaps xmm6, [rsp + nb133nf_izO]
1786 ;# calc dr
1787 subps xmm4, xmm0
1788 subps xmm5, xmm1
1789 subps xmm6, xmm2
1791 ;# square it
1792 mulps xmm4,xmm4
1793 mulps xmm5,xmm5
1794 mulps xmm6,xmm6
1795 addps xmm4, xmm5
1796 addps xmm4, xmm6
1797 movaps xmm7, xmm4
1798 ;# rsqO in xmm7
1800 ;# move ixH1-izH1 to xmm4-xmm6
1801 movaps xmm4, [rsp + nb133nf_ixH1]
1802 movaps xmm5, [rsp + nb133nf_iyH1]
1803 movaps xmm6, [rsp + nb133nf_izH1]
1805 ;# calc dr
1806 subps xmm4, xmm0
1807 subps xmm5, xmm1
1808 subps xmm6, xmm2
1810 ;# square it
1811 mulps xmm4,xmm4
1812 mulps xmm5,xmm5
1813 mulps xmm6,xmm6
1814 addps xmm6, xmm5
1815 addps xmm6, xmm4
1816 ;# rsqH1 in xmm6
1818 ;# move ixH2-izH2 to xmm3-xmm5
1819 movaps xmm3, [rsp + nb133nf_ixH2]
1820 movaps xmm4, [rsp + nb133nf_iyH2]
1821 movaps xmm5, [rsp + nb133nf_izH2]
1823 ;# calc dr
1824 subps xmm3, xmm0
1825 subps xmm4, xmm1
1826 subps xmm5, xmm2
1828 ;# square it
1829 mulps xmm3,xmm3
1830 mulps xmm4,xmm4
1831 mulps xmm5,xmm5
1832 addps xmm5, xmm4
1833 addps xmm5, xmm3
1834 ;# rsqH2 in xmm5
1835 ;# move ixM-izM to xmm2-xmm4 (ixM is loaded last because xmm2 still holds jz)
1836 movaps xmm3, [rsp + nb133nf_iyM]
1837 movaps xmm4, [rsp + nb133nf_izM]
1838 subps xmm3, xmm1
1839 subps xmm4, xmm2
1840 movaps xmm2, [rsp + nb133nf_ixM]
1841 subps xmm2, xmm0
1843 ;# square it
1844 mulps xmm2,xmm2
1845 mulps xmm3,xmm3
1846 mulps xmm4,xmm4
1847 addps xmm4, xmm3
1848 addps xmm4, xmm2
1849 ;# rsqM in xmm4, rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7
1851 ;# rsqH1 - seed in xmm2
1851 ;# rinv = 0.5*lu*(3 - rsq*lu*lu), lu being the rsqrtps estimate
1852 rsqrtps xmm2, xmm6
1853 movaps xmm3, xmm2
1854 mulps xmm2, xmm2
1855 movaps xmm0, [rsp + nb133nf_three]
1856 mulps xmm2, xmm6 ;# rsq*lu*lu
1857 subps xmm0, xmm2 ;# 3-rsq*lu*lu
1858 mulps xmm0, xmm3 ;# lu*(3-rsq*lu*lu)
1859 mulps xmm0, [rsp + nb133nf_half]
1860 movaps [rsp + nb133nf_rinvH1], xmm0 ;# rinvH1
1862 ;# rsqH2 - seed to xmm2
1863 rsqrtps xmm2, xmm5
1864 movaps xmm3, xmm2
1865 mulps xmm2, xmm2
1866 movaps xmm0, [rsp + nb133nf_three]
1867 mulps xmm2, xmm5 ;# rsq*lu*lu
1868 subps xmm0, xmm2 ;# 3-rsq*lu*lu
1869 mulps xmm0, xmm3 ;# lu*(3-rsq*lu*lu)
1870 mulps xmm0, [rsp + nb133nf_half]
1871 movaps [rsp + nb133nf_rinvH2], xmm0 ;# rinvH2
1873 ;# rsqM - seed to xmm2
1874 rsqrtps xmm2, xmm4
1875 movaps xmm3, xmm2
1876 mulps xmm2, xmm2
1877 movaps xmm0, [rsp + nb133nf_three]
1878 mulps xmm2, xmm4 ;# rsq*lu*lu
1879 subps xmm0, xmm2 ;# 3-rsq*lu*lu
1880 mulps xmm0, xmm3 ;# lu*(3-rsq*lu*lu)
1881 mulps xmm0, [rsp + nb133nf_half]
1882 movaps [rsp + nb133nf_rinvM], xmm0 ;# rinvM
1884 ;# Do the O LJ-only interaction directly.
1885 ;# rsqO is in xmm7
1886 rsqrtps xmm2, xmm7
1887 movaps xmm3, xmm2
1888 mulps xmm2, xmm2
1889 movaps xmm4, [rsp + nb133nf_three]
1890 mulps xmm2, xmm7 ;# rsq*lu*lu
1891 subps xmm4, xmm2 ;# 3-rsq*lu*lu
1892 mulps xmm4, xmm3 ;# lu*(3-rsq*lu*lu)
1893 mulps xmm4, [rsp + nb133nf_half]
1894 movaps xmm0, xmm4
1895 ;# xmm0=rinvO
1897 mulps xmm7, xmm0 ;# rsq*rinv = r
1898 mulps xmm7, [rsp + nb133nf_tsc] ;# rtab
1900 movhlps xmm5, xmm7
1901 cvttps2pi mm6, xmm7
1902 cvttps2pi mm7, xmm5 ;# mm6/mm7 contain lu indices
1903 cvtpi2ps xmm6, mm6
1904 cvtpi2ps xmm5, mm7
1905 movlhps xmm6, xmm5
1906 subps xmm7, xmm6 ;# rtab minus its truncated value
1907 movaps xmm1, xmm7 ;# xmm1=eps
1908 movaps xmm2, xmm1
1909 mulps xmm2, xmm2 ;# xmm2=eps2
1910 pslld mm6, 3 ;# table index * 8
1911 pslld mm7, 3
1913 mov rsi, [rbp + nb133nf_VFtab]
1914 movd eax, mm6
1915 psrlq mm6, 32
1916 movd ecx, mm7
1917 psrlq mm7, 32
1918 movd ebx, mm6
1919 movd edx, mm7 ;# eax, ebx, ecx, edx = table offsets for lanes 0-3
1921 ;# dispersion
1922 movlps xmm5, [rsi + rax*4]
1923 movlps xmm7, [rsi + rcx*4]
1924 movhps xmm5, [rsi + rbx*4]
1925 movhps xmm7, [rsi + rdx*4] ;# got half dispersion table
1926 movaps xmm4, xmm5
1927 shufps xmm4, xmm7, 136 ;# constant 10001000
1928 shufps xmm5, xmm7, 221 ;# constant 11011101
1930 movlps xmm7, [rsi + rax*4 + 8]
1931 movlps xmm3, [rsi + rcx*4 + 8]
1932 movhps xmm7, [rsi + rbx*4 + 8]
1933 movhps xmm3, [rsi + rdx*4 + 8] ;# other half of dispersion table
1934 movaps xmm6, xmm7
1935 shufps xmm6, xmm3, 136 ;# constant 10001000
1936 shufps xmm7, xmm3, 221 ;# constant 11011101
1937 ;# dispersion table ready, in xmm4-xmm7
1939 mulps xmm6, xmm1 ;# xmm6=Geps
1940 mulps xmm7, xmm2 ;# xmm7=Heps2
1941 addps xmm5, xmm6
1942 addps xmm5, xmm7 ;# xmm5=Fp
1943 mulps xmm5, xmm1 ;# xmm5=eps*Fp
1944 addps xmm5, xmm4 ;# xmm5=VV
1946 movaps xmm4, [rsp + nb133nf_c6]
1947 mulps xmm5, xmm4 ;# Vvdw6
1949 addps xmm5, [rsp + nb133nf_Vvdwtot]
1950 movaps [rsp + nb133nf_Vvdwtot], xmm5
1952 ;# repulsion
1953 movlps xmm5, [rsi + rax*4 + 16]
1954 movlps xmm7, [rsi + rcx*4 + 16]
1955 movhps xmm5, [rsi + rbx*4 + 16]
1956 movhps xmm7, [rsi + rdx*4 + 16] ;# got half repulsion table
1957 movaps xmm4, xmm5
1958 shufps xmm4, xmm7, 136 ;# constant 10001000
1959 shufps xmm5, xmm7, 221 ;# constant 11011101
1961 movlps xmm7, [rsi + rax*4 + 24]
1962 movlps xmm3, [rsi + rcx*4 + 24]
1963 movhps xmm7, [rsi + rbx*4 + 24]
1964 movhps xmm3, [rsi + rdx*4 + 24] ;# other half of repulsion table
1965 movaps xmm6, xmm7
1966 shufps xmm6, xmm3, 136 ;# constant 10001000
1967 shufps xmm7, xmm3, 221 ;# constant 11011101
1968 ;# table ready, in xmm4-xmm7
1969 mulps xmm6, xmm1 ;# xmm6=Geps
1970 mulps xmm7, xmm2 ;# xmm7=Heps2
1971 addps xmm5, xmm6
1972 addps xmm5, xmm7 ;# xmm5=Fp
1973 mulps xmm5, xmm1 ;# xmm5=eps*Fp
1974 addps xmm5, xmm4 ;# xmm5=VV
1976 movaps xmm4, [rsp + nb133nf_c12]
1977 mulps xmm5, xmm4 ;# Vvdw12
1979 addps xmm5, [rsp + nb133nf_Vvdwtot]
1980 movaps [rsp + nb133nf_Vvdwtot], xmm5
1982 ;# Do H1-H2-M interactions: vctot += qqH*(rinvH1+rinvH2) + qqM*rinvM
1983 movaps xmm7, [rsp + nb133nf_rinvH1]
1984 addps xmm7, [rsp + nb133nf_rinvH2]
1985 movaps xmm6, [rsp + nb133nf_rinvM]
1987 mulps xmm7, [rsp + nb133nf_qqH]
1988 mulps xmm6, [rsp + nb133nf_qqM]
1989 addps xmm7, xmm6
1991 addps xmm7, [rsp + nb133nf_vctot]
1992 movaps [rsp + nb133nf_vctot], xmm7
1994 ;# should we do one more iteration?
1995 sub dword ptr [rsp + nb133nf_innerk], 4
1996 jl .nb133nf_odd_inner ;# fewer than four j atoms left
1997 jmp .nb133nf_unroll_loop
;# innerk is negative here (remaining j atoms minus 4). Adding 4 gives the
;# number of leftover j atoms. If there are any, handle them one at a time.
1998 .nb133nf_odd_inner:
1999 add dword ptr [rsp + nb133nf_innerk], 4
2000 jnz .nb133nf_odd_loop
2001 jmp .nb133nf_updateouterdata
;# Remainder loop: one j atom per pass, energies only.
;# Here the four lanes hold the four i sites in the order [O H1 H2 M].
;# The charge product vector is [0 qH qH qM]*qj, so lane 0 (O) adds nothing
;# to the Coulomb sum. The tabulated LJ term is evaluated with scalar
;# instructions on lane 0 only.
2002 .nb133nf_odd_loop:
2003 mov rdx, [rsp + nb133nf_innerjjnr] ;# pointer to jjnr[k]
2004 mov eax, [rdx]
2005 add qword ptr [rsp + nb133nf_innerjjnr], 4 ;# advance pointer by one entry
2007 xorps xmm4, xmm4 ;# clear reg.
2008 movss xmm4, [rsp + nb133nf_iqM]
2009 mov rsi, [rbp + nb133nf_charge]
2010 movhps xmm4, [rsp + nb133nf_iqH] ;# [qM 0 qH qH]
2011 shufps xmm4, xmm4, 41 ;# [0 qH qH qM]
2013 movss xmm3, [rsi + rax*4] ;# charge in xmm3
2014 shufps xmm3, xmm3, 0
2015 mulps xmm3, xmm4
2016 movaps [rsp + nb133nf_qqM], xmm3 ;# use dummy qq for storage
2018 xorps xmm6, xmm6
2019 mov rsi, [rbp + nb133nf_type]
2020 mov ebx, [rsi + rax*4]
2021 mov rsi, [rbp + nb133nf_vdwparam]
2022 shl ebx, 1
2023 add ebx, [rsp + nb133nf_ntia]
2024 movlps xmm6, [rsi + rbx*4] ;# xmm6 = [c6 c12 0 0]
2025 movaps xmm7, xmm6
2026 shufps xmm6, xmm6, 252 ;# constant 11111100 -> [c6 0 0 0]
2027 shufps xmm7, xmm7, 253 ;# constant 11111101 -> [c12 0 0 0]
2028 movaps [rsp + nb133nf_c6], xmm6
2029 movaps [rsp + nb133nf_c12], xmm7
2031 mov rsi, [rbp + nb133nf_pos]
2032 lea rax, [rax + rax*2] ;# replace jnr with j3
2033 ;# gather the i coordinates as [O H1 H2 M] in xmm3 (x), xmm4 (y), xmm5 (z)
2034 movss xmm3, [rsp + nb133nf_ixO]
2035 movss xmm4, [rsp + nb133nf_iyO]
2036 movss xmm5, [rsp + nb133nf_izO]
2037 movss xmm0, [rsp + nb133nf_ixH1]
2038 movss xmm1, [rsp + nb133nf_iyH1]
2039 movss xmm2, [rsp + nb133nf_izH1]
2040 unpcklps xmm3, [rsp + nb133nf_ixH2] ;# ixO ixH2 - -
2041 unpcklps xmm4, [rsp + nb133nf_iyH2] ;# iyO iyH2 - -
2042 unpcklps xmm5, [rsp + nb133nf_izH2] ;# izO izH2 - -
2043 unpcklps xmm0, [rsp + nb133nf_ixM] ;# ixH1 ixM - -
2044 unpcklps xmm1, [rsp + nb133nf_iyM] ;# iyH1 iyM - -
2045 unpcklps xmm2, [rsp + nb133nf_izM] ;# izH1 izM - -
2046 unpcklps xmm3, xmm0 ;# ixO ixH1 ixH2 ixM
2047 unpcklps xmm4, xmm1 ;# same for y
2048 unpcklps xmm5, xmm2 ;# same for z
2050 ;# move j coords to xmm0-xmm2
2051 movss xmm0, [rsi + rax*4]
2052 movss xmm1, [rsi + rax*4 + 4]
2053 movss xmm2, [rsi + rax*4 + 8]
2054 shufps xmm0, xmm0, 0
2055 shufps xmm1, xmm1, 0
2056 shufps xmm2, xmm2, 0
2058 subps xmm3, xmm0
2059 subps xmm4, xmm1
2060 subps xmm5, xmm2
2062 mulps xmm3, xmm3
2063 mulps xmm4, xmm4
2064 mulps xmm5, xmm5
2066 addps xmm4, xmm3
2067 addps xmm4, xmm5
2068 ;# rsq in xmm4
2070 rsqrtps xmm5, xmm4
2071 ;# lookup seed in xmm5
2072 movaps xmm2, xmm5
2073 mulps xmm5, xmm5
2074 movaps xmm1, [rsp + nb133nf_three]
2075 mulps xmm5, xmm4 ;# rsq*lu*lu
2076 movaps xmm0, [rsp + nb133nf_half]
2077 subps xmm1, xmm5 ;# 3-rsq*lu*lu
2078 mulps xmm1, xmm2
2079 mulps xmm0, xmm1 ;# xmm0=rinv, xmm4=rsq
2081 ;# LJ table interaction (only lane 0, the O site, is used below)
2082 mulps xmm4, xmm0
2083 mulps xmm4, [rsp + nb133nf_tsc] ;# rtab
2085 cvttps2pi mm6, xmm4
2086 cvtpi2ps xmm6, mm6
2087 subss xmm4, xmm6
2088 movss xmm1, xmm4 ;# xmm1=eps
2089 movss xmm2, xmm1
2090 mulss xmm2, xmm2 ;# xmm2=eps2
2091 pslld mm6, 3 ;# table index * 8
2093 mov rsi, [rbp + nb133nf_VFtab]
2094 movd eax, mm6
2096 ;# dispersion
2096 ;# The shuffles below take their upper lanes from whatever xmm7/xmm3
2096 ;# hold. Only lane 0 of the results is used by the scalar arithmetic.
2097 movlps xmm5, [rsi + rax*4]
2098 movaps xmm4, xmm5
2099 shufps xmm4, xmm7, 136 ;# constant 10001000
2100 shufps xmm5, xmm7, 221 ;# constant 11011101
2102 movlps xmm7, [rsi + rax*4 + 8]
2103 movaps xmm6, xmm7
2104 shufps xmm6, xmm3, 136 ;# constant 10001000
2105 shufps xmm7, xmm3, 221 ;# constant 11011101
2106 ;# dispersion table ready, in xmm4-xmm7
2108 mulss xmm6, xmm1 ;# xmm6=Geps
2109 mulss xmm7, xmm2 ;# xmm7=Heps2
2110 addss xmm5, xmm6
2111 addss xmm5, xmm7 ;# xmm5=Fp
2112 mulss xmm5, xmm1 ;# xmm5=eps*Fp
2113 addss xmm5, xmm4 ;# xmm5=VV
2115 movss xmm4, [rsp + nb133nf_c6]
2116 mulss xmm5, xmm4 ;# Vvdw6
2118 ;# Update lane 0 of Vvdwtot directly
2119 addss xmm5, [rsp + nb133nf_Vvdwtot]
2120 movss [rsp + nb133nf_Vvdwtot], xmm5
2122 ;# repulsion
2123 movlps xmm5, [rsi + rax*4 + 16]
2124 movaps xmm4, xmm5
2125 shufps xmm4, xmm7, 136 ;# constant 10001000
2126 shufps xmm5, xmm7, 221 ;# constant 11011101
2128 movlps xmm7, [rsi + rax*4 + 24]
2129 movaps xmm6, xmm7
2130 shufps xmm6, xmm3, 136 ;# constant 10001000
2131 shufps xmm7, xmm3, 221 ;# constant 11011101
2132 ;# table ready, in xmm4-xmm7
2133 mulss xmm6, xmm1 ;# xmm6=Geps
2134 mulss xmm7, xmm2 ;# xmm7=Heps2
2135 addss xmm5, xmm6
2136 addss xmm5, xmm7 ;# xmm5=Fp
2137 mulss xmm5, xmm1 ;# xmm5=eps*Fp
2138 addss xmm5, xmm4 ;# xmm5=VV
2140 movss xmm4, [rsp + nb133nf_c12]
2141 mulss xmm5, xmm4 ;# Vvdw12
2143 addss xmm5, [rsp + nb133nf_Vvdwtot]
2144 movss [rsp + nb133nf_Vvdwtot], xmm5
2146 mulps xmm0, [rsp + nb133nf_qqM] ;# xmm0=vcoul
2148 addps xmm0, [rsp + nb133nf_vctot]
2149 movaps [rsp + nb133nf_vctot], xmm0
2151 dec dword ptr [rsp + nb133nf_innerk]
2152 jz .nb133nf_updateouterdata ;# no leftover j atoms remain
2153 jmp .nb133nf_odd_loop
;# Outer-loop epilogue (energy only). Sums the four lanes of vctot and
;# Vvdwtot, adds the results to Vc[gid[n]] and Vvdw[gid[n]], then advances
;# n and either starts the next outer iteration or ends the work unit.
2154 .nb133nf_updateouterdata:
2155 ;# get n from stack
2156 mov esi, [rsp + nb133nf_n]
2157 ;# get group index for i particle
2158 mov rdx, [rbp + nb133nf_gid] ;# base of gid[]
2159 mov edx, [rdx + rsi*4] ;# ggid=gid[n]
2161 ;# accumulate total potential energy and update it
2162 movaps xmm7, [rsp + nb133nf_vctot]
2163 ;# horizontal sum of the four lanes
2164 movhlps xmm6, xmm7
2165 addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now
2166 movaps xmm6, xmm7
2167 shufps xmm6, xmm6, 1 ;# lane 0 <- lane 1
2168 addss xmm7, xmm6
2170 ;# add earlier value from mem
2171 mov rax, [rbp + nb133nf_Vc]
2172 addss xmm7, [rax + rdx*4]
2173 ;# move back to mem
2174 movss [rax + rdx*4], xmm7
2176 ;# accumulate total lj energy and update it
2177 movaps xmm7, [rsp + nb133nf_Vvdwtot]
2178 ;# horizontal sum of the four lanes
2179 movhlps xmm6, xmm7
2180 addps xmm7, xmm6 ;# pos 0-1 in xmm7 have the sum now
2181 movaps xmm6, xmm7
2182 shufps xmm6, xmm6, 1 ;# lane 0 <- lane 1
2183 addss xmm7, xmm6
2185 ;# add earlier value from mem
2186 mov rax, [rbp + nb133nf_Vvdw]
2187 addss xmm7, [rax + rdx*4]
2188 ;# move back to mem
2189 movss [rax + rdx*4], xmm7
2191 ;# finish if last
2192 mov ecx, [rsp + nb133nf_nn1]
2193 ;# esi already loaded with n
2194 inc esi
2195 sub ecx, esi ;# zero when the incremented n has reached nn1
2196 jz .nb133nf_outerend
2198 ;# not last, iterate outer loop once more!
2199 mov [rsp + nb133nf_n], esi
2200 jmp .nb133nf_outer
;# End of one work unit: esi holds the next outer index n.
;# Leave the kernel when n equals nri, otherwise fetch another work unit.
2201 .nb133nf_outerend:
2202 ;# check if more outer neighborlists remain
2203 mov ecx, [rsp + nb133nf_nri]
2204 ;# esi already loaded with n above
2205 sub ecx, esi ;# ecx = nri - n
2206 jz .nb133nf_end
2207 ;# non-zero, do one more workunit
2208 jmp .nb133nf_threadloop
;# Kernel exit. Stores the accumulated iteration counters through the
;# outeriter and inneriter pointer arguments, releases the 592-byte local
;# area, and restores xmm6-xmm15 and the pushed integer registers in the
;# reverse order of the prologue.
2209 .nb133nf_end:
2211 mov eax, [rsp + nb133nf_nouter]
2212 mov ebx, [rsp + nb133nf_ninner]
2213 mov rcx, [rbp + nb133nf_outeriter]
2214 mov rdx, [rbp + nb133nf_inneriter]
2215 mov [rcx], eax ;# *outeriter = nouter
2216 mov [rdx], ebx ;# *inneriter = ninner
2218 add rsp, 592 ;# release the local variable area
2219 emms
2221 ;# Restore xmm registers from stack
2222 movaps xmm6, [rsp ]
2223 movaps xmm7, [rsp + 16 ]
2224 movaps xmm8, [rsp + 32 ]
2225 movaps xmm9, [rsp + 48 ]
2226 movaps xmm10, [rsp + 64 ]
2227 movaps xmm11, [rsp + 80 ]
2228 movaps xmm12, [rsp + 96 ]
2229 movaps xmm13, [rsp + 112]
2230 movaps xmm14, [rsp + 128]
2231 movaps xmm15, [rsp + 144]
2233 ;# Reset pointers after restoring xmm6-15
2234 add rsp, 168
2236 pop r15
2237 pop r14
2238 pop r13
2239 pop r12
2240 pop rdi
2241 pop rsi
2242 pop rbx
2244 pop rbp
;# NOTE(review): rsp points at the return address here. A ret has to follow
;# this pop for the routine to return - confirm that it is present.