##
##
## Gromacs 4.0                    Copyright (c) 1991-2003
## David van der Spoel, Erik Lindahl
##
## This program is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License
## as published by the Free Software Foundation; either version 2
## of the License, or (at your option) any later version.
##
## To help us fund GROMACS development, we humbly ask that you cite
## the research papers on the package. Check out http://www.gromacs.org
##
## And Hey:
## Gnomes, ROck Monsters And Chili Sauce

## nb010 - forces are calculated
.globl nb_kernel010_x86_64_sse
.globl _nb_kernel010_x86_64_sse
nb_kernel010_x86_64_sse:
_nb_kernel010_x86_64_sse:
## Room for return address and rbp (16 bytes)
.set nb010_fshift, 16
.set nb010_gid, 24
.set nb010_pos, 32
.set nb010_faction, 40
.set nb010_charge, 48
.set nb010_p_facel, 56
.set nb010_argkrf, 64
.set nb010_argcrf, 72
.set nb010_Vc, 80
.set nb010_type, 88
.set nb010_p_ntype, 96
.set nb010_vdwparam, 104
.set nb010_Vvdw, 112
.set nb010_p_tabscale, 120
.set nb010_VFtab, 128
.set nb010_invsqrta, 136
.set nb010_dvda, 144
.set nb010_p_gbtabscale, 152
.set nb010_GBtab, 160
.set nb010_p_nthreads, 168
.set nb010_count, 176
.set nb010_mtx, 184
.set nb010_outeriter, 192
.set nb010_inneriter, 200
.set nb010_work, 208
## The mutex (last arg) is not used in assembly.
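
## Argument note (System V AMD64 ABI): the first six integer args arrive in
## rdi, rsi, rdx, rcx, r8 and r9; the rest are on the caller's stack. After
## "push %rbp; movq %rsp,%rbp" the saved rbp and return address occupy 16
## bytes, so the 7th argument (fshift) sits at 16(%rbp), the 8th (gid) at
## 24(%rbp), and so on -- which is exactly what the offsets above encode.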
## stack offsets for local variables
## bottom of stack is cache-aligned for sse use
.set nb010_ix, 0
.set nb010_iy, 16
.set nb010_iz, 32
.set nb010_dx, 48
.set nb010_dy, 64
.set nb010_dz, 80
.set nb010_two, 96
.set nb010_c6, 112
.set nb010_c12, 128
.set nb010_six, 144
.set nb010_twelve, 160
.set nb010_Vvdwtot, 176
.set nb010_fix, 192
.set nb010_fiy, 208
.set nb010_fiz, 224
.set nb010_half, 240
.set nb010_three, 256
.set nb010_nri, 272
.set nb010_iinr, 280
.set nb010_jindex, 288
.set nb010_jjnr, 296
.set nb010_shift, 304
.set nb010_shiftvec, 312
.set nb010_facel, 320
.set nb010_innerjjnr, 328
.set nb010_is3, 336
.set nb010_ii3, 340
.set nb010_ntia, 344
.set nb010_innerk, 348
.set nb010_n, 352
.set nb010_nn1, 356
.set nb010_ntype, 360
.set nb010_nouter, 364
.set nb010_ninner, 368

        push %rbp
        movq %rsp,%rbp
        push %rbx

        push %r12
        push %r13
        push %r14
        push %r15

        emms
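
## emms empties the x87/MMX state; the nf kernel further down parks integer
## temporaries in mm0-mm3, so these kernels bracket their work with emms to
## keep the FPU tag word clean.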
        subq $392,%rsp          ## local variable stack space (n*16+8)
## zero 32-bit iteration counters
        movl $0,%eax
        movl %eax,nb010_nouter(%rsp)
        movl %eax,nb010_ninner(%rsp)

        movl (%rdi),%edi
        movl %edi,nb010_nri(%rsp)
        movq %rsi,nb010_iinr(%rsp)
        movq %rdx,nb010_jindex(%rsp)
        movq %rcx,nb010_jjnr(%rsp)
        movq %r8,nb010_shift(%rsp)
        movq %r9,nb010_shiftvec(%rsp)
        movq nb010_p_ntype(%rbp),%rdi
        movl (%rdi),%edi
        movl %edi,nb010_ntype(%rsp)

## create constant floating-point factors on stack
        movl $0x40000000,%eax   ## 2.0 in IEEE (hex)
        movl %eax,nb010_two(%rsp)
        movss nb010_two(%rsp),%xmm1
        shufps $0,%xmm1,%xmm1   ## splat to all elements
        movaps %xmm1,%xmm2
        addps %xmm1,%xmm2       ## 4.0
        addps %xmm1,%xmm2       ## 6.0
        movaps %xmm2,%xmm3
        addps %xmm3,%xmm3       ## 12.0
        movaps %xmm1,nb010_two(%rsp)
        movaps %xmm2,nb010_six(%rsp)
        movaps %xmm3,nb010_twelve(%rsp)
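
## Constant-construction note: 0x40000000 is the IEEE-754 single-precision
## bit pattern of 2.0f. Writing it as an integer and splatting with
## shufps $0 gives {2,2,2,2}; 6.0 and 12.0 are then built with packed adds,
## avoiding any loads from a .data section.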

_nb_kernel010_x86_64_sse.nb010_threadloop:
        movq nb010_count(%rbp),%rsi        ## pointer to sync counter
        movl (%rsi),%eax
_nb_kernel010_x86_64_sse.nb010_spinlock:
        movl %eax,%ebx                     ## ebx=*count=nn0
        addl $1,%ebx                       ## ebx=nn1=nn0+1
        lock
        cmpxchgl %ebx,(%rsi)               ## write nn1 to *counter,
                                           ## if it hasn't changed,
                                           ## or reread *counter to eax.
        pause                              ## -> better p4 performance
        jnz _nb_kernel010_x86_64_sse.nb010_spinlock
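
## The loop above is an atomic fetch-and-increment; in C it is roughly
##   do { nn0 = *count; } while (!CAS(count, nn0, nn0+1));
## (CAS standing in for lock cmpxchg), so each thread claims one outer
## list index at a time.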

## if(nn1>nri) nn1=nri
        movl nb010_nri(%rsp),%ecx
        movl %ecx,%edx
        subl %ebx,%ecx
        cmovlel %edx,%ebx                  ## if(nn1>nri) nn1=nri
## Cleared the spinlock if we got here.
## eax contains nn0, ebx contains nn1.
        movl %eax,nb010_n(%rsp)
        movl %ebx,nb010_nn1(%rsp)
        subl %eax,%ebx                     ## calc number of outer lists
        movl %eax,%esi                     ## copy n to esi
        jg _nb_kernel010_x86_64_sse.nb010_outerstart
        jmp _nb_kernel010_x86_64_sse.nb010_end

_nb_kernel010_x86_64_sse.nb010_outerstart:
## ebx contains number of outer iterations
        addl nb010_nouter(%rsp),%ebx
        movl %ebx,nb010_nouter(%rsp)

_nb_kernel010_x86_64_sse.nb010_outer:
        movq nb010_shift(%rsp),%rax        ## rax = base of shift[]
        movl (%rax,%rsi,4),%ebx            ## ebx=shift[n]

        lea (%rbx,%rbx,2),%rbx             ## rbx=3*is
        movl %ebx,nb010_is3(%rsp)          ## store is3

        movq nb010_shiftvec(%rsp),%rax     ## rax = base of shiftvec[]

        movss (%rax,%rbx,4),%xmm10
        movss 4(%rax,%rbx,4),%xmm11
        movss 8(%rax,%rbx,4),%xmm12

        movq nb010_iinr(%rsp),%rcx         ## rcx = base of iinr[]
        movl (%rcx,%rsi,4),%ebx            ## ebx = ii

        movq nb010_type(%rbp),%rdx
        movl (%rdx,%rbx,4),%edx
        imull nb010_ntype(%rsp),%edx
        shll %edx
        movl %edx,nb010_ntia(%rsp)

        lea (%rbx,%rbx,2),%rbx             ## rbx = 3*ii=ii3
        movq nb010_pos(%rbp),%rax          ## rax = base of pos[]

        addss (%rax,%rbx,4),%xmm10
        addss 4(%rax,%rbx,4),%xmm11
        addss 8(%rax,%rbx,4),%xmm12

        shufps $0,%xmm10,%xmm10
        shufps $0,%xmm11,%xmm11
        shufps $0,%xmm12,%xmm12

        movaps %xmm10,nb010_ix(%rsp)
        movaps %xmm11,nb010_iy(%rsp)
        movaps %xmm12,nb010_iz(%rsp)

        movl %ebx,nb010_ii3(%rsp)

## clear vvdwtot (xmm12) and i forces (xmm13-xmm15)
        xorps %xmm12,%xmm12
        movaps %xmm12,%xmm13
        movaps %xmm12,%xmm14
        movaps %xmm12,%xmm15

        movq nb010_jindex(%rsp),%rax
        movl (%rax,%rsi,4),%ecx            ## jindex[n]
        movl 4(%rax,%rsi,4),%edx           ## jindex[n+1]
        subl %ecx,%edx                     ## number of innerloop atoms

        movq nb010_jjnr(%rsp),%rax
        shll $2,%ecx
        addq %rcx,%rax
        movq %rax,nb010_innerjjnr(%rsp)    ## pointer to jjnr[nj0]
        movl %edx,%ecx
        subl $4,%edx
        addl nb010_ninner(%rsp),%ecx
        movl %ecx,nb010_ninner(%rsp)
        addl $0,%edx
        movl %edx,nb010_innerk(%rsp)       ## number of innerloop atoms

        jge _nb_kernel010_x86_64_sse.nb010_unroll_loop
        jmp _nb_kernel010_x86_64_sse.nb010_finish_inner
_nb_kernel010_x86_64_sse.nb010_unroll_loop:
## quad-unrolled innerloop here
        movq nb010_innerjjnr(%rsp),%rdx    ## pointer to jjnr[k]
        movl (%rdx),%eax
        movl 4(%rdx),%ebx
        movl 8(%rdx),%ecx
        movl 12(%rdx),%edx                 ## eax-edx=jnr1-4

        addq $16,nb010_innerjjnr(%rsp)     ## advance pointer (unrolled 4)

        lea (%rax,%rax,2),%r8              ## replace jnr with j3
        lea (%rbx,%rbx,2),%r9
        lea (%rcx,%rcx,2),%r10
        lea (%rdx,%rdx,2),%r11

        movq nb010_pos(%rbp),%rdi
## load coordinates
        movlps (%rdi,%r8,4),%xmm1          ## x1 y1 - -
        movlps (%rdi,%r10,4),%xmm2         ## x3 y3 - -
        movhps (%rdi,%r9,4),%xmm1          ## x1 y1 x2 y2
        movhps (%rdi,%r11,4),%xmm2         ## x3 y3 x4 y4

        movss 8(%rdi,%r8,4),%xmm5          ## z1 - - -
        movss 8(%rdi,%r10,4),%xmm6         ## z3 - - -
        movss 8(%rdi,%r9,4),%xmm7          ## z2 - - -
        movss 8(%rdi,%r11,4),%xmm8         ## z4 - - -
        movlhps %xmm7,%xmm5                ## z1 - z2 -
        movlhps %xmm8,%xmm6                ## z3 - z4 -

        movq nb010_type(%rbp),%rsi

        movaps %xmm1,%xmm4
        unpcklps %xmm2,%xmm1    ## x1 x3 y1 y3
        unpckhps %xmm2,%xmm4    ## x2 x4 y2 y4
        movaps %xmm1,%xmm2
        unpcklps %xmm4,%xmm1    ## x1 x2 x3 x4
        unpckhps %xmm4,%xmm2    ## y1 y2 y3 y4
        shufps $136,%xmm6,%xmm5 ## 10001000 => z1 z2 z3 z4
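
## The unpck/shuf sequence above is the usual 4x4 AoS->SoA transpose:
## xmm1 = {x1 x2 x3 x4}, xmm2 = {y1 y2 y3 y4}, xmm5 = {z1 z2 z3 z4},
## so each SSE op below handles the same component of all four j atoms.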

## load vdw types
        movl (%rsi,%rax,4),%r12d
        movl (%rsi,%rbx,4),%r13d
        movl (%rsi,%rcx,4),%r14d
        movl (%rsi,%rdx,4),%r15d

## calc dr
        subps nb010_ix(%rsp),%xmm1
        subps nb010_iy(%rsp),%xmm2
        subps nb010_iz(%rsp),%xmm5

## store dr in xmm9-xmm11
        movaps %xmm1,%xmm9
        movaps %xmm2,%xmm10
        movaps %xmm5,%xmm11

## type *= 2
        shll %r12d
        shll %r13d
        shll %r14d
        shll %r15d

## square it
        mulps %xmm1,%xmm1
        mulps %xmm2,%xmm2
        mulps %xmm5,%xmm5
        addps %xmm2,%xmm1
        addps %xmm5,%xmm1
## rsq in xmm1

## 2*type + ntia = index into vdwparam
        movl nb010_ntia(%rsp),%edi
        addl %edi,%r12d
        addl %edi,%r13d
        addl %edi,%r14d
        addl %edi,%r15d

        movq nb010_vdwparam(%rbp),%rsi
## the c6/c12 pairs are gathered into xmm7/xmm8 below,
## then shuffled so that xmm5=c6 and xmm7=c12

        rcpps %xmm1,%xmm5
## 1/x lookup seed in xmm5
        movaps nb010_two(%rsp),%xmm6
        mulps %xmm5,%xmm1
## load c6/c12
        movlps (%rsi,%r12,4),%xmm7
        movlps (%rsi,%r14,4),%xmm8

        subps %xmm1,%xmm6
        mulps %xmm5,%xmm6       ## xmm6=rinvsq
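
## Newton-Raphson step for the reciprocal, interleaved with the c6/c12
## loads: with the ~12-bit seed s = rcpps(rsq), computing s*(2 - rsq*s)
## roughly doubles the accuracy of 1/rsq. That is the mulps/subps/mulps
## sequence above, using the 2.0 constant stored earlier.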

        movaps %xmm6,%xmm4      ## rinvsq

        movhps (%rsi,%r13,4),%xmm7
        movhps (%rsi,%r15,4),%xmm8

        movaps %xmm6,%xmm1
        mulps %xmm6,%xmm1       ## rinv4
        mulps %xmm6,%xmm1       ## rinv6
        movaps %xmm1,%xmm2
        mulps %xmm2,%xmm2       ## xmm2=rinv12

## shuffle c6/c12
        movaps %xmm7,%xmm5
        shufps $136,%xmm8,%xmm5 ## 10001000
        shufps $221,%xmm8,%xmm7 ## 11011101

        movq nb010_faction(%rbp),%rsi

        mulps %xmm5,%xmm1       ## c6*rinv6
        mulps %xmm7,%xmm2       ## c12*rinv12
        movaps %xmm2,%xmm5
        subps %xmm1,%xmm5       ## Vvdw=Vvdw12-Vvdw6
        mulps nb010_six(%rsp),%xmm1
        mulps nb010_twelve(%rsp),%xmm2
        subps %xmm1,%xmm2
        mulps %xmm2,%xmm4       ## xmm4=total fscal
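
## LJ recap (what the block above computes per pair):
##   Vvdw  = c12*rinv12 - c6*rinv6
##   fscal = (12*c12*rinv12 - 6*c6*rinv6)*rinvsq
## so fscal*dx etc. below gives the force components along rj-ri.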

## the fj's - start by accumulating x & y forces from memory
        movlps (%rsi,%r8,4),%xmm0          ## x1 y1 - -
        movlps (%rsi,%r10,4),%xmm1         ## x3 y3 - -
        movhps (%rsi,%r9,4),%xmm0          ## x1 y1 x2 y2
        movhps (%rsi,%r11,4),%xmm1         ## x3 y3 x4 y4

## add potential to Vvdwtot (sum in xmm12)
        addps %xmm5,%xmm12

## calculate scalar force by multiplying dx/dy/dz with fscal
        mulps %xmm4,%xmm9
        mulps %xmm4,%xmm10
        mulps %xmm4,%xmm11

## xmm9-xmm11 contain tx-tz (partial force)
## accumulate i forces
        addps %xmm9,%xmm13
        addps %xmm10,%xmm14
        addps %xmm11,%xmm15

## permute local forces
        movaps %xmm9,%xmm8
        unpcklps %xmm10,%xmm9   ## x1 y1 x2 y2
        unpckhps %xmm10,%xmm8   ## x3 y3 x4 y4

## xmm11: fjz1 fjz2 fjz3 fjz4
        pshufd $1,%xmm11,%xmm5  ## fjz2 - - -
        movhlps %xmm11,%xmm4    ## fjz3 - - -
        pshufd $3,%xmm11,%xmm3  ## fjz4 - - -

## update fjx and fjy
        addps %xmm9,%xmm0
        addps %xmm8,%xmm1

        movlps %xmm0,(%rsi,%r8,4)
        movlps %xmm1,(%rsi,%r10,4)
        movhps %xmm0,(%rsi,%r9,4)
        movhps %xmm1,(%rsi,%r11,4)

        addss 8(%rsi,%r8,4),%xmm11
        addss 8(%rsi,%r9,4),%xmm5
        addss 8(%rsi,%r10,4),%xmm4
        addss 8(%rsi,%r11,4),%xmm3
        movss %xmm11,8(%rsi,%r8,4)
        movss %xmm5,8(%rsi,%r9,4)
        movss %xmm4,8(%rsi,%r10,4)
        movss %xmm3,8(%rsi,%r11,4)

## should we do one more iteration?
        subl $4,nb010_innerk(%rsp)
        jl _nb_kernel010_x86_64_sse.nb010_finish_inner
        jmp _nb_kernel010_x86_64_sse.nb010_unroll_loop
_nb_kernel010_x86_64_sse.nb010_finish_inner:
## check if at least two particles remain
        addl $4,nb010_innerk(%rsp)
        movl nb010_innerk(%rsp),%edx
        andl $2,%edx
        jnz _nb_kernel010_x86_64_sse.nb010_dopair
        jmp _nb_kernel010_x86_64_sse.nb010_checksingle
_nb_kernel010_x86_64_sse.nb010_dopair:
## twice-unrolled innerloop here
        movq nb010_innerjjnr(%rsp),%rdx    ## pointer to jjnr[k]
        movl (%rdx),%eax
        movl 4(%rdx),%ebx

        addq $8,nb010_innerjjnr(%rsp)      ## advance pointer (unrolled 2)

        movq nb010_type(%rbp),%rsi
        movl (%rsi,%rax,4),%r12d
        movl (%rsi,%rbx,4),%r13d
        shll %r12d
        shll %r13d
        movl nb010_ntia(%rsp),%edi
        addl %edi,%r12d
        addl %edi,%r13d

        movq nb010_vdwparam(%rbp),%rsi
        movlps (%rsi,%r12,4),%xmm3
        movhps (%rsi,%r13,4),%xmm3

        xorps %xmm7,%xmm7
        movaps %xmm3,%xmm0
        shufps $136,%xmm7,%xmm0 ## 10001000
        shufps $221,%xmm7,%xmm3 ## 11011101

## xmm0=c6
## xmm3=c12
        lea (%rax,%rax,2),%rax  ## replace jnr with j3
        lea (%rbx,%rbx,2),%rbx

        movq nb010_pos(%rbp),%rdi
## load coordinates
        movlps (%rdi,%rax,4),%xmm1 ## x1 y1 - -
        movlps (%rdi,%rbx,4),%xmm4 ## x2 y2 - -

        movss 8(%rdi,%rax,4),%xmm5 ## z1 - - -
        movss 8(%rdi,%rbx,4),%xmm7 ## z2 - - -

        unpcklps %xmm4,%xmm1    ## x1 x2 y1 y2
        movhlps %xmm1,%xmm2     ## y1 y2 - -
        unpcklps %xmm7,%xmm5    ## z1 z2 - -

## calc dr
        subps nb010_ix(%rsp),%xmm1
        subps nb010_iy(%rsp),%xmm2
        subps nb010_iz(%rsp),%xmm5

## store dr in xmm9-xmm11
        movaps %xmm1,%xmm9
        movaps %xmm2,%xmm10
        movaps %xmm5,%xmm11

## square it
        mulps %xmm1,%xmm1
        mulps %xmm2,%xmm2
        mulps %xmm5,%xmm5
        addps %xmm2,%xmm1
        addps %xmm5,%xmm1
## rsq in xmm1

        rcpps %xmm1,%xmm5
## 1/x lookup seed in xmm5
        movaps nb010_two(%rsp),%xmm6
        mulps %xmm5,%xmm1
        subps %xmm1,%xmm6
        mulps %xmm5,%xmm6       ## xmm6=rinvsq

        movaps %xmm6,%xmm4      ## rinvsq

        movaps %xmm6,%xmm1
        mulps %xmm6,%xmm1       ## rinv4
        mulps %xmm6,%xmm1       ## rinv6
        movaps %xmm1,%xmm2
        mulps %xmm2,%xmm2       ## xmm2=rinv12

        mulps %xmm0,%xmm1
        mulps %xmm3,%xmm2
        movaps %xmm2,%xmm5
        subps %xmm1,%xmm5       ## Vvdw=Vvdw12-Vvdw6
        mulps nb010_six(%rsp),%xmm1
        mulps nb010_twelve(%rsp),%xmm2
        subps %xmm1,%xmm2
        mulps %xmm2,%xmm4       ## xmm4=total fscal

        xorps %xmm7,%xmm7
        movlhps %xmm7,%xmm5

## add potential to Vvdwtot (sum in xmm12)
        addps %xmm5,%xmm12

## calculate scalar force by multiplying dx/dy/dz with fscal
        mulps %xmm4,%xmm9
        mulps %xmm4,%xmm10
        mulps %xmm4,%xmm11

        movlhps %xmm7,%xmm9
        movlhps %xmm7,%xmm10
        movlhps %xmm7,%xmm11
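
## Only two j atoms are valid here, so movlhps with the zeroed xmm7 clears
## the upper two lanes of the potential and of tx-tz before they enter the
## packed accumulators, keeping garbage lanes out of the sums.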

## xmm9-xmm11 contain tx-tz (partial force)
## accumulate i forces
        addps %xmm9,%xmm13
        addps %xmm10,%xmm14
        addps %xmm11,%xmm15

        movq nb010_faction(%rbp),%rsi
## the fj's - start by accumulating x & y forces from memory
        movlps (%rsi,%rax,4),%xmm0 ## x1 y1 - -
        movhps (%rsi,%rbx,4),%xmm0 ## x1 y1 x2 y2

        unpcklps %xmm10,%xmm9   ## x1 y1 x2 y2
        addps %xmm9,%xmm0

        movlps %xmm0,(%rsi,%rax,4)
        movhps %xmm0,(%rsi,%rbx,4)

## z forces
        pshufd $1,%xmm11,%xmm8
        addss 8(%rsi,%rax,4),%xmm11
        addss 8(%rsi,%rbx,4),%xmm8
        movss %xmm11,8(%rsi,%rax,4)
        movss %xmm8,8(%rsi,%rbx,4)

_nb_kernel010_x86_64_sse.nb010_checksingle:
        movl nb010_innerk(%rsp),%edx
        andl $1,%edx
        jnz _nb_kernel010_x86_64_sse.nb010_dosingle
        jmp _nb_kernel010_x86_64_sse.nb010_updateouterdata

_nb_kernel010_x86_64_sse.nb010_dosingle:
        movq nb010_innerjjnr(%rsp),%rcx
        movl (%rcx),%eax

        movq nb010_type(%rbp),%rsi
        movl (%rsi,%rax,4),%r12d
        shll %r12d
        movl nb010_ntia(%rsp),%edi
        addl %edi,%r12d

        movq nb010_vdwparam(%rbp),%rsi
        movss (%rsi,%r12,4),%xmm0
        movss 4(%rsi,%r12,4),%xmm3

## xmm0=c6
## xmm3=c12
        lea (%rax,%rax,2),%rax  ## replace jnr with j3

        movq nb010_pos(%rbp),%rdi
## load coordinates
        movss (%rdi,%rax,4),%xmm1
        movss 4(%rdi,%rax,4),%xmm2
        movss 8(%rdi,%rax,4),%xmm5

## calc dr
        subss nb010_ix(%rsp),%xmm1
        subss nb010_iy(%rsp),%xmm2
        subss nb010_iz(%rsp),%xmm5

## store dr in xmm9-xmm11
        movaps %xmm1,%xmm9
        movaps %xmm2,%xmm10
        movaps %xmm5,%xmm11

## square it
        mulss %xmm1,%xmm1
        mulss %xmm2,%xmm2
        mulss %xmm5,%xmm5
        addss %xmm2,%xmm1
        addss %xmm5,%xmm1
## rsq in xmm1

        rcpss %xmm1,%xmm5
## 1/x lookup seed in xmm5
        movaps nb010_two(%rsp),%xmm6
        mulss %xmm5,%xmm1
        subss %xmm1,%xmm6
        mulss %xmm5,%xmm6       ## xmm6=rinvsq

        movaps %xmm6,%xmm4      ## rinvsq

        movaps %xmm6,%xmm1
        mulss %xmm6,%xmm1       ## rinv4
        mulss %xmm6,%xmm1       ## rinv6
        movaps %xmm1,%xmm2
        mulss %xmm2,%xmm2       ## xmm2=rinv12

        mulss %xmm0,%xmm1
        mulss %xmm3,%xmm2
        movaps %xmm2,%xmm5
        subss %xmm1,%xmm5       ## Vvdw=Vvdw12-Vvdw6
        mulss nb010_six(%rsp),%xmm1
        mulss nb010_twelve(%rsp),%xmm2
        subss %xmm1,%xmm2
        mulss %xmm2,%xmm4       ## xmm4=total fscal

## add potential to Vvdwtot (sum in xmm12)
        addss %xmm5,%xmm12

## calculate scalar force by multiplying dx/dy/dz with fscal
        mulss %xmm4,%xmm9
        mulss %xmm4,%xmm10
        mulss %xmm4,%xmm11

## xmm9-xmm11 contain tx-tz (partial force)
## accumulate i forces
        addss %xmm9,%xmm13
        addss %xmm10,%xmm14
        addss %xmm11,%xmm15

        movq nb010_faction(%rbp),%rsi
## add to j forces
        addss (%rsi,%rax,4),%xmm9
        addss 4(%rsi,%rax,4),%xmm10
        addss 8(%rsi,%rax,4),%xmm11
        movss %xmm9,(%rsi,%rax,4)
        movss %xmm10,4(%rsi,%rax,4)
        movss %xmm11,8(%rsi,%rax,4)

_nb_kernel010_x86_64_sse.nb010_updateouterdata:
        movl nb010_ii3(%rsp),%ecx
        movq nb010_faction(%rbp),%rdi
        movq nb010_fshift(%rbp),%rsi
        movl nb010_is3(%rsp),%edx

## accumulate i forces in xmm13, xmm14, xmm15
        movhlps %xmm13,%xmm0
        movhlps %xmm14,%xmm1
        movhlps %xmm15,%xmm2
        addps %xmm13,%xmm0
        addps %xmm14,%xmm1
        addps %xmm15,%xmm2
        movaps %xmm0,%xmm3
        movaps %xmm1,%xmm4
        movaps %xmm2,%xmm5
        shufps $1,%xmm3,%xmm3
        shufps $1,%xmm4,%xmm4
        shufps $1,%xmm5,%xmm5
        addss %xmm3,%xmm0
        addss %xmm4,%xmm1
        addss %xmm5,%xmm2       ## xmm0-xmm2 has single force in pos0
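
## Horizontal-reduction pattern: movhlps folds lanes 2-3 onto lanes 0-1,
## shufps $1 then exposes lane 1, so two adds collapse four packed floats
## into one scalar in lane 0 of each register.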

## update i force (dr was computed as rj-ri, so the sum is subtracted)
        movss (%rdi,%rcx,4),%xmm3
        movss 4(%rdi,%rcx,4),%xmm4
        movss 8(%rdi,%rcx,4),%xmm5
        subss %xmm0,%xmm3
        subss %xmm1,%xmm4
        subss %xmm2,%xmm5
        movss %xmm3,(%rdi,%rcx,4)
        movss %xmm4,4(%rdi,%rcx,4)
        movss %xmm5,8(%rdi,%rcx,4)

## update fshift force
        movss (%rsi,%rdx,4),%xmm3
        movss 4(%rsi,%rdx,4),%xmm4
        movss 8(%rsi,%rdx,4),%xmm5
        subss %xmm0,%xmm3
        subss %xmm1,%xmm4
        subss %xmm2,%xmm5
        movss %xmm3,(%rsi,%rdx,4)
        movss %xmm4,4(%rsi,%rdx,4)
        movss %xmm5,8(%rsi,%rdx,4)

## get n from stack
        movl nb010_n(%rsp),%esi
## get group index for i particle
        movq nb010_gid(%rbp),%rdx          ## base of gid[]
        movl (%rdx,%rsi,4),%edx            ## ggid=gid[n]

## accumulate total potential energy and update it
        movhlps %xmm12,%xmm6
        addps %xmm6,%xmm12      ## pos 0-1 in xmm12 have the sum now
        movaps %xmm12,%xmm6
        shufps $1,%xmm6,%xmm6
        addss %xmm6,%xmm12

## add earlier value from mem
        movq nb010_Vvdw(%rbp),%rax
        addss (%rax,%rdx,4),%xmm12
## move back to mem
        movss %xmm12,(%rax,%rdx,4)

## finish if last
        movl nb010_nn1(%rsp),%ecx
## esi already loaded with n
        incl %esi
        subl %esi,%ecx
        jz _nb_kernel010_x86_64_sse.nb010_outerend

## not last, iterate outer loop once more!
        movl %esi,nb010_n(%rsp)
        jmp _nb_kernel010_x86_64_sse.nb010_outer
_nb_kernel010_x86_64_sse.nb010_outerend:
## check if more outer neighborlists remain
        movl nb010_nri(%rsp),%ecx
## esi already loaded with n above
        subl %esi,%ecx
        jz _nb_kernel010_x86_64_sse.nb010_end
## non-zero, do one more workunit
        jmp _nb_kernel010_x86_64_sse.nb010_threadloop
_nb_kernel010_x86_64_sse.nb010_end:
        emms

        movl nb010_nouter(%rsp),%eax
        movl nb010_ninner(%rsp),%ebx
        movq nb010_outeriter(%rbp),%rcx
        movq nb010_inneriter(%rbp),%rdx
        movl %eax,(%rcx)
        movl %ebx,(%rdx)

        addq $392,%rsp

        pop %r15
        pop %r14
        pop %r13
        pop %r12

        pop %rbx
        pop %rbp
        ret


## nb010nf - forces are not calculated
.globl nb_kernel010nf_x86_64_sse
.globl _nb_kernel010nf_x86_64_sse
nb_kernel010nf_x86_64_sse:
_nb_kernel010nf_x86_64_sse:
## Room for return address and rbp (16 bytes)
.set nb010nf_fshift, 16
.set nb010nf_gid, 24
.set nb010nf_pos, 32
.set nb010nf_faction, 40
.set nb010nf_charge, 48
.set nb010nf_p_facel, 56
.set nb010nf_argkrf, 64
.set nb010nf_argcrf, 72
.set nb010nf_Vc, 80
.set nb010nf_type, 88
.set nb010nf_p_ntype, 96
.set nb010nf_vdwparam, 104
.set nb010nf_Vvdw, 112
.set nb010nf_p_tabscale, 120
.set nb010nf_VFtab, 128
.set nb010nf_invsqrta, 136
.set nb010nf_dvda, 144
.set nb010nf_p_gbtabscale, 152
.set nb010nf_GBtab, 160
.set nb010nf_p_nthreads, 168
.set nb010nf_count, 176
.set nb010nf_mtx, 184
.set nb010nf_outeriter, 192
.set nb010nf_inneriter, 200
.set nb010nf_work, 208
## The mutex (last arg) is not used in assembly.
## stack offsets for local variables
## bottom of stack is cache-aligned for sse use
.set nb010nf_ix, 0
.set nb010nf_iy, 16
.set nb010nf_iz, 32
.set nb010nf_two, 48
.set nb010nf_c6, 64
.set nb010nf_c12, 80
.set nb010nf_Vvdwtot, 96
.set nb010nf_half, 112
.set nb010nf_three, 128
.set nb010nf_nri, 144
.set nb010nf_iinr, 152
.set nb010nf_jindex, 160
.set nb010nf_jjnr, 168
.set nb010nf_shift, 176
.set nb010nf_shiftvec, 184
.set nb010nf_innerjjnr, 192
.set nb010nf_facel, 200
.set nb010nf_ntia, 208
.set nb010nf_innerk, 216
.set nb010nf_is3, 220
.set nb010nf_ii3, 224
.set nb010nf_n, 228
.set nb010nf_nn1, 232
.set nb010nf_ntype, 236
.set nb010nf_nouter, 240
.set nb010nf_ninner, 244

        push %rbp
        movq %rsp,%rbp
        push %rbx

        subq $264,%rsp          ## local variable stack space (n*16+8)
        emms

## zero 32-bit iteration counters
        movl $0,%eax
        movl %eax,nb010nf_nouter(%rsp)
        movl %eax,nb010nf_ninner(%rsp)

        movl (%rdi),%edi
        movl %edi,nb010nf_nri(%rsp)
        movq %rsi,nb010nf_iinr(%rsp)
        movq %rdx,nb010nf_jindex(%rsp)
        movq %rcx,nb010nf_jjnr(%rsp)
        movq %r8,nb010nf_shift(%rsp)
        movq %r9,nb010nf_shiftvec(%rsp)
        movq nb010nf_p_ntype(%rbp),%rdi
        movl (%rdi),%edi
        movl %edi,nb010nf_ntype(%rsp)

## create constant floating-point factors on stack
        movl $0x40000000,%eax   ## 2.0 in IEEE (hex)
        movl %eax,nb010nf_two(%rsp)
        movss nb010nf_two(%rsp),%xmm1
        shufps $0,%xmm1,%xmm1   ## splat to all elements
        movaps %xmm1,nb010nf_two(%rsp)

_nb_kernel010nf_x86_64_sse.nb010nf_threadloop:
        movq nb010nf_count(%rbp),%rsi      ## pointer to sync counter
        movl (%rsi),%eax
_nb_kernel010nf_x86_64_sse.nb010nf_spinlock:
        movl %eax,%ebx                     ## ebx=*count=nn0
        addl $1,%ebx                       ## ebx=nn1=nn0+1
        lock
        cmpxchgl %ebx,(%rsi)               ## write nn1 to *counter,
                                           ## if it hasn't changed,
                                           ## or reread *counter to eax.
        pause                              ## -> better p4 performance
        jnz _nb_kernel010nf_x86_64_sse.nb010nf_spinlock

## if(nn1>nri) nn1=nri
        movl nb010nf_nri(%rsp),%ecx
        movl %ecx,%edx
        subl %ebx,%ecx
        cmovlel %edx,%ebx                  ## if(nn1>nri) nn1=nri
## Cleared the spinlock if we got here.
## eax contains nn0, ebx contains nn1.
        movl %eax,nb010nf_n(%rsp)
        movl %ebx,nb010nf_nn1(%rsp)
        subl %eax,%ebx                     ## calc number of outer lists
        movl %eax,%esi                     ## copy n to esi
        jg _nb_kernel010nf_x86_64_sse.nb010nf_outerstart
        jmp _nb_kernel010nf_x86_64_sse.nb010nf_end

_nb_kernel010nf_x86_64_sse.nb010nf_outerstart:
## ebx contains number of outer iterations
        addl nb010nf_nouter(%rsp),%ebx
        movl %ebx,nb010nf_nouter(%rsp)

_nb_kernel010nf_x86_64_sse.nb010nf_outer:
        movq nb010nf_shift(%rsp),%rax      ## rax = base of shift[]
        movl (%rax,%rsi,4),%ebx            ## ebx=shift[n]

        lea (%rbx,%rbx,2),%rbx             ## rbx=3*is
        movl %ebx,nb010nf_is3(%rsp)        ## store is3

        movq nb010nf_shiftvec(%rsp),%rax   ## rax = base of shiftvec[]

        movss (%rax,%rbx,4),%xmm0
        movss 4(%rax,%rbx,4),%xmm1
        movss 8(%rax,%rbx,4),%xmm2

        movq nb010nf_iinr(%rsp),%rcx       ## rcx = base of iinr[]
        movl (%rcx,%rsi,4),%ebx            ## ebx = ii

        movq nb010nf_type(%rbp),%rdx
        movl (%rdx,%rbx,4),%edx
        imull nb010nf_ntype(%rsp),%edx
        shll %edx
        movl %edx,nb010nf_ntia(%rsp)

        lea (%rbx,%rbx,2),%rbx             ## rbx = 3*ii=ii3
        movq nb010nf_pos(%rbp),%rax        ## rax = base of pos[]

        addss (%rax,%rbx,4),%xmm0
        addss 4(%rax,%rbx,4),%xmm1
        addss 8(%rax,%rbx,4),%xmm2

        shufps $0,%xmm0,%xmm0
        shufps $0,%xmm1,%xmm1
        shufps $0,%xmm2,%xmm2

        movaps %xmm0,nb010nf_ix(%rsp)
        movaps %xmm1,nb010nf_iy(%rsp)
        movaps %xmm2,nb010nf_iz(%rsp)

        movl %ebx,nb010nf_ii3(%rsp)

## clear Vvdwtot (no forces in this kernel)
        xorps %xmm4,%xmm4
        movaps %xmm4,nb010nf_Vvdwtot(%rsp)

        movq nb010nf_jindex(%rsp),%rax
        movl (%rax,%rsi,4),%ecx            ## jindex[n]
        movl 4(%rax,%rsi,4),%edx           ## jindex[n+1]
        subl %ecx,%edx                     ## number of innerloop atoms

        movq nb010nf_pos(%rbp),%rsi
        movq nb010nf_jjnr(%rsp),%rax
        shll $2,%ecx
        addq %rcx,%rax
        movq %rax,nb010nf_innerjjnr(%rsp)  ## pointer to jjnr[nj0]
        movl %edx,%ecx
        subl $4,%edx
        addl nb010nf_ninner(%rsp),%ecx
        movl %ecx,nb010nf_ninner(%rsp)
        addl $0,%edx
        movl %edx,nb010nf_innerk(%rsp)     ## number of innerloop atoms

        jge _nb_kernel010nf_x86_64_sse.nb010nf_unroll_loop
        jmp _nb_kernel010nf_x86_64_sse.nb010nf_finish_inner
_nb_kernel010nf_x86_64_sse.nb010nf_unroll_loop:
## quad-unrolled innerloop here
        movq nb010nf_innerjjnr(%rsp),%rdx  ## pointer to jjnr[k]
        movl (%rdx),%eax
        movl 4(%rdx),%ebx
        movl 8(%rdx),%ecx
        movl 12(%rdx),%edx                 ## eax-edx=jnr1-4
## advance pointer (unrolled 4)
        addq $16,nb010nf_innerjjnr(%rsp)

        movd %eax,%mm0                     ## use mmx registers as temp storage
        movd %ebx,%mm1
        movd %ecx,%mm2
        movd %edx,%mm3
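
## The four j indices are parked in mm0-mm3 while eax-edx are reused for
## the type/vdwparam lookups; this nf kernel only saves rbx, so r12-r15
## are not free scratch here (unlike the force kernel above).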

        movq nb010nf_type(%rbp),%rsi
        movl (%rsi,%rax,4),%eax
        movl (%rsi,%rbx,4),%ebx
        movl (%rsi,%rcx,4),%ecx
        movl (%rsi,%rdx,4),%edx
        movq nb010nf_vdwparam(%rbp),%rsi
        shll %eax
        shll %ebx
        shll %ecx
        shll %edx
        movl nb010nf_ntia(%rsp),%edi
        addl %edi,%eax
        addl %edi,%ebx
        addl %edi,%ecx
        addl %edi,%edx

        movlps (%rsi,%rax,4),%xmm6
        movlps (%rsi,%rcx,4),%xmm7
        movhps (%rsi,%rbx,4),%xmm6
        movhps (%rsi,%rdx,4),%xmm7

        movaps %xmm6,%xmm4
        shufps $136,%xmm7,%xmm4 ## 10001000
        shufps $221,%xmm7,%xmm6 ## 11011101

        movd %mm0,%eax
        movd %mm1,%ebx
        movd %mm2,%ecx
        movd %mm3,%edx

        movaps %xmm4,nb010nf_c6(%rsp)
        movaps %xmm6,nb010nf_c12(%rsp)

        movq nb010nf_pos(%rbp),%rsi        ## base of pos[]

        lea (%rax,%rax,2),%rax             ## replace jnr with j3
        lea (%rbx,%rbx,2),%rbx

        mulps %xmm2,%xmm3
        lea (%rcx,%rcx,2),%rcx             ## replace jnr with j3
        lea (%rdx,%rdx,2),%rdx

## move four coordinates to xmm0-xmm2
        movlps (%rsi,%rax,4),%xmm4
        movlps (%rsi,%rcx,4),%xmm5
        movss 8(%rsi,%rax,4),%xmm2
        movss 8(%rsi,%rcx,4),%xmm6

        movhps (%rsi,%rbx,4),%xmm4
        movhps (%rsi,%rdx,4),%xmm5

        movss 8(%rsi,%rbx,4),%xmm0
        movss 8(%rsi,%rdx,4),%xmm1

        shufps $0,%xmm0,%xmm2
        shufps $0,%xmm1,%xmm6

        movaps %xmm4,%xmm0
        movaps %xmm4,%xmm1

        shufps $136,%xmm6,%xmm2 ## 10001000

        shufps $136,%xmm5,%xmm0 ## 10001000
        shufps $221,%xmm5,%xmm1 ## 11011101

## move ix-iz to xmm4-xmm6
        movaps nb010nf_ix(%rsp),%xmm4
        movaps nb010nf_iy(%rsp),%xmm5
        movaps nb010nf_iz(%rsp),%xmm6

## calc dr
        subps %xmm0,%xmm4
        subps %xmm1,%xmm5
        subps %xmm2,%xmm6

## square it
        mulps %xmm4,%xmm4
        mulps %xmm5,%xmm5
        mulps %xmm6,%xmm6
        addps %xmm5,%xmm4
        addps %xmm6,%xmm4
## rsq in xmm4

        rcpps %xmm4,%xmm5
## 1/x lookup seed in xmm5
        movaps nb010nf_two(%rsp),%xmm0
        mulps %xmm5,%xmm4
        subps %xmm4,%xmm0
        mulps %xmm5,%xmm0       ## xmm0=rinvsq
        movaps %xmm0,%xmm4

        movaps %xmm0,%xmm1
        mulps %xmm0,%xmm1
        mulps %xmm0,%xmm1       ## xmm1=rinvsix
        movaps %xmm1,%xmm2
        mulps %xmm2,%xmm2       ## xmm2=rinvtwelve

        mulps nb010nf_c6(%rsp),%xmm1
        mulps nb010nf_c12(%rsp),%xmm2
        movaps %xmm2,%xmm5
        subps %xmm1,%xmm5       ## Vvdw=Vvdw12-Vvdw6
        addps nb010nf_Vvdwtot(%rsp),%xmm5
        movaps %xmm5,nb010nf_Vvdwtot(%rsp)

## should we do one more iteration?
        subl $4,nb010nf_innerk(%rsp)
        jl _nb_kernel010nf_x86_64_sse.nb010nf_finish_inner
        jmp _nb_kernel010nf_x86_64_sse.nb010nf_unroll_loop
_nb_kernel010nf_x86_64_sse.nb010nf_finish_inner:
## check if at least two particles remain
        addl $4,nb010nf_innerk(%rsp)
        movl nb010nf_innerk(%rsp),%edx
        andl $2,%edx
        jnz _nb_kernel010nf_x86_64_sse.nb010nf_dopair
        jmp _nb_kernel010nf_x86_64_sse.nb010nf_checksingle
_nb_kernel010nf_x86_64_sse.nb010nf_dopair:
        movq nb010nf_innerjjnr(%rsp),%rcx

        movl (%rcx),%eax
        movl 4(%rcx),%ebx
        addq $8,nb010nf_innerjjnr(%rsp)

        movq nb010nf_type(%rbp),%rsi
        movl %eax,%ecx
        movl %ebx,%edx
        movl (%rsi,%rcx,4),%ecx
        movl (%rsi,%rdx,4),%edx
        movq nb010nf_vdwparam(%rbp),%rsi
        shll %ecx
        shll %edx
        movl nb010nf_ntia(%rsp),%edi
        addl %edi,%ecx
        addl %edi,%edx
        movlps (%rsi,%rcx,4),%xmm6
        movhps (%rsi,%rdx,4),%xmm6
        movq nb010nf_pos(%rbp),%rdi
        xorps %xmm7,%xmm7
        movaps %xmm6,%xmm4
        shufps $8,%xmm4,%xmm4   ## 00001000
        shufps $13,%xmm6,%xmm6  ## 00001101
        movlhps %xmm7,%xmm4
        movlhps %xmm7,%xmm6

        movaps %xmm4,nb010nf_c6(%rsp)
        movaps %xmm6,nb010nf_c12(%rsp)

        lea (%rax,%rax,2),%rax
        lea (%rbx,%rbx,2),%rbx
## move coordinates to xmm0-xmm2
        movlps (%rdi,%rax,4),%xmm1
        movss 8(%rdi,%rax,4),%xmm2
        movhps (%rdi,%rbx,4),%xmm1
        movss 8(%rdi,%rbx,4),%xmm0

        movlhps %xmm7,%xmm3

        shufps $0,%xmm0,%xmm2

        movaps %xmm1,%xmm0

        shufps $136,%xmm2,%xmm2 ## 10001000

        shufps $136,%xmm0,%xmm0 ## 10001000
        shufps $221,%xmm1,%xmm1 ## 11011101

## move nb010nf_ix-iz to xmm4-xmm6
        xorps %xmm7,%xmm7

        movaps nb010nf_ix(%rsp),%xmm4
        movaps nb010nf_iy(%rsp),%xmm5
        movaps nb010nf_iz(%rsp),%xmm6

## calc dr
        subps %xmm0,%xmm4
        subps %xmm1,%xmm5
        subps %xmm2,%xmm6

## square it
        mulps %xmm4,%xmm4
        mulps %xmm5,%xmm5
        mulps %xmm6,%xmm6
        addps %xmm5,%xmm4
        addps %xmm6,%xmm4
## rsq in xmm4

        rcpps %xmm4,%xmm5
## 1/x lookup seed in xmm5
        movaps nb010nf_two(%rsp),%xmm0
        mulps %xmm5,%xmm4
        subps %xmm4,%xmm0
        mulps %xmm5,%xmm0       ## xmm0=rinvsq
        movaps %xmm0,%xmm4

        movaps %xmm0,%xmm1
        mulps %xmm0,%xmm1
        mulps %xmm0,%xmm1       ## xmm1=rinvsix
        movaps %xmm1,%xmm2
        mulps %xmm2,%xmm2       ## xmm2=rinvtwelve

        mulps nb010nf_c6(%rsp),%xmm1
        mulps nb010nf_c12(%rsp),%xmm2
        movaps %xmm2,%xmm5
        subps %xmm1,%xmm5       ## Vvdw=Vvdw12-Vvdw6
        addps nb010nf_Vvdwtot(%rsp),%xmm5
        movaps %xmm5,nb010nf_Vvdwtot(%rsp)

_nb_kernel010nf_x86_64_sse.nb010nf_checksingle:
        movl nb010nf_innerk(%rsp),%edx
        andl $1,%edx
        jnz _nb_kernel010nf_x86_64_sse.nb010nf_dosingle
        jmp _nb_kernel010nf_x86_64_sse.nb010nf_updateouterdata
_nb_kernel010nf_x86_64_sse.nb010nf_dosingle:
        movq nb010nf_pos(%rbp),%rdi
        movq nb010nf_innerjjnr(%rsp),%rcx
        movl (%rcx),%eax

        movq nb010nf_type(%rbp),%rsi
        movl %eax,%ecx
        movl (%rsi,%rcx,4),%ecx
        movq nb010nf_vdwparam(%rbp),%rsi
        shll %ecx
        addl nb010nf_ntia(%rsp),%ecx
        xorps %xmm6,%xmm6
        movlps (%rsi,%rcx,4),%xmm6
        movaps %xmm6,%xmm4
        shufps $252,%xmm4,%xmm4 ## 11111100
        shufps $253,%xmm6,%xmm6 ## 11111101

        movaps %xmm4,nb010nf_c6(%rsp)
        movaps %xmm6,nb010nf_c12(%rsp)

        lea (%rax,%rax,2),%rax

## move coordinates to xmm0-xmm2
        movss (%rdi,%rax,4),%xmm0
        movss 4(%rdi,%rax,4),%xmm1
        movss 8(%rdi,%rax,4),%xmm2

        xorps %xmm7,%xmm7

        movaps nb010nf_ix(%rsp),%xmm4
        movaps nb010nf_iy(%rsp),%xmm5
        movaps nb010nf_iz(%rsp),%xmm6

## calc dr
        subps %xmm0,%xmm4
        subps %xmm1,%xmm5
        subps %xmm2,%xmm6

## square it
        mulps %xmm4,%xmm4
        mulps %xmm5,%xmm5
        mulps %xmm6,%xmm6
        addps %xmm5,%xmm4
        addps %xmm6,%xmm4
## rsq in xmm4

        rcpps %xmm4,%xmm5
## 1/x lookup seed in xmm5
        movaps nb010nf_two(%rsp),%xmm0
        mulps %xmm5,%xmm4
        subps %xmm4,%xmm0
        mulps %xmm5,%xmm0       ## xmm0=rinvsq
        movaps %xmm0,%xmm4

        movaps %xmm0,%xmm1
        mulps %xmm0,%xmm1
        mulps %xmm0,%xmm1       ## xmm1=rinvsix
        movaps %xmm1,%xmm2
        mulps %xmm2,%xmm2       ## xmm2=rinvtwelve

        mulps nb010nf_c6(%rsp),%xmm1
        mulps nb010nf_c12(%rsp),%xmm2
        movaps %xmm2,%xmm5
        subps %xmm1,%xmm5       ## Vvdw=Vvdw12-Vvdw6
        addss nb010nf_Vvdwtot(%rsp),%xmm5
        movss %xmm5,nb010nf_Vvdwtot(%rsp)

_nb_kernel010nf_x86_64_sse.nb010nf_updateouterdata:
## get n from stack
        movl nb010nf_n(%rsp),%esi
## get group index for i particle
        movq nb010nf_gid(%rbp),%rdx        ## base of gid[]
        movl (%rdx,%rsi,4),%edx            ## ggid=gid[n]

## accumulate total lj energy and update it
        movaps nb010nf_Vvdwtot(%rsp),%xmm7
## accumulate
        movhlps %xmm7,%xmm6
        addps %xmm6,%xmm7       ## pos 0-1 in xmm7 have the sum now
        movaps %xmm7,%xmm6
        shufps $1,%xmm6,%xmm6
        addss %xmm6,%xmm7

## add earlier value from mem
        movq nb010nf_Vvdw(%rbp),%rax
        addss (%rax,%rdx,4),%xmm7
## move back to mem
        movss %xmm7,(%rax,%rdx,4)

## finish if last
        movl nb010nf_nn1(%rsp),%ecx
## esi already loaded with n
        incl %esi
        subl %esi,%ecx
        jz _nb_kernel010nf_x86_64_sse.nb010nf_outerend

## not last, iterate outer loop once more!
        movl %esi,nb010nf_n(%rsp)
        jmp _nb_kernel010nf_x86_64_sse.nb010nf_outer
_nb_kernel010nf_x86_64_sse.nb010nf_outerend:
## check if more outer neighborlists remain
        movl nb010nf_nri(%rsp),%ecx
## esi already loaded with n above
        subl %esi,%ecx
        jz _nb_kernel010nf_x86_64_sse.nb010nf_end
## non-zero, do one more workunit
        jmp _nb_kernel010nf_x86_64_sse.nb010nf_threadloop
_nb_kernel010nf_x86_64_sse.nb010nf_end:

        movl nb010nf_nouter(%rsp),%eax
        movl nb010nf_ninner(%rsp),%ebx
        movq nb010nf_outeriter(%rbp),%rcx
        movq nb010nf_inneriter(%rbp),%rdx
        movl %eax,(%rcx)
        movl %ebx,(%rdx)

        addq $264,%rsp
        emms

        pop %rbx
        pop %rbp
        ret