##
##
## Gromacs 4.0                         Copyright (c) 1991-2003
## David van der Spoel, Erik Lindahl
##
## This program is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License
## as published by the Free Software Foundation; either version 2
## of the License, or (at your option) any later version.
##
## To help us fund GROMACS development, we humbly ask that you cite
## the research papers on the package. Check out http://www.gromacs.org
##
## And Hey:
## Gnomes, ROck Monsters And Chili Sauce
.globl nb_kernel310_x86_64_sse
.globl _nb_kernel310_x86_64_sse
nb_kernel310_x86_64_sse:
_nb_kernel310_x86_64_sse:
## Room for return address and rbp (16 bytes)
.set nb310_fshift, 16
.set nb310_gid, 24
.set nb310_pos, 32
.set nb310_faction, 40
.set nb310_charge, 48
.set nb310_p_facel, 56
.set nb310_argkrf, 64
.set nb310_argcrf, 72
.set nb310_Vc, 80
.set nb310_type, 88
.set nb310_p_ntype, 96
.set nb310_vdwparam, 104
.set nb310_Vvdw, 112
.set nb310_p_tabscale, 120
.set nb310_VFtab, 128
.set nb310_invsqrta, 136
.set nb310_dvda, 144
.set nb310_p_gbtabscale, 152
.set nb310_GBtab, 160
.set nb310_p_nthreads, 168
.set nb310_count, 176
.set nb310_mtx, 184
.set nb310_outeriter, 192
.set nb310_inneriter, 200
.set nb310_work, 208
## stack offsets for local variables
## bottom of stack is cache-aligned for sse use
.set nb310_ix, 0
.set nb310_iy, 16
.set nb310_iz, 32
.set nb310_iq, 48
.set nb310_dx, 64
.set nb310_dy, 80
.set nb310_dz, 96
.set nb310_two, 112
.set nb310_six, 128
.set nb310_twelve, 144
.set nb310_tsc, 160
.set nb310_qq, 176
.set nb310_c6, 192
.set nb310_c12, 208
.set nb310_fscal, 224
.set nb310_vctot, 240
.set nb310_Vvdwtot, 256
.set nb310_fix, 272
.set nb310_fiy, 288
.set nb310_fiz, 304
.set nb310_half, 320
.set nb310_three, 336
.set nb310_nri, 352
.set nb310_iinr, 360
.set nb310_jindex, 368
.set nb310_jjnr, 376
.set nb310_shift, 384
.set nb310_shiftvec, 392
.set nb310_facel, 400
.set nb310_innerjjnr, 408
.set nb310_is3, 416
.set nb310_ii3, 420
.set nb310_ntia, 424
.set nb310_innerk, 428
.set nb310_n, 432
.set nb310_nn1, 436
.set nb310_ntype, 440
.set nb310_nouter, 444
.set nb310_ninner, 448

        push %rbp
        movq %rsp,%rbp
        push %rbx

        emms

        push %r12
        push %r13
        push %r14
        push %r15

        subq $472,%rsp          ## local variable stack space (n*16+8)

        ## zero 32-bit iteration counters
        movl $0,%eax
        movl %eax,nb310_nouter(%rsp)
        movl %eax,nb310_ninner(%rsp)

        movl (%rdi),%edi
        movl %edi,nb310_nri(%rsp)
        movq %rsi,nb310_iinr(%rsp)
        movq %rdx,nb310_jindex(%rsp)
        movq %rcx,nb310_jjnr(%rsp)
        movq %r8,nb310_shift(%rsp)
        movq %r9,nb310_shiftvec(%rsp)
        movq nb310_p_ntype(%rbp),%rdi
        movl (%rdi),%edi
        movl %edi,nb310_ntype(%rsp)
        movq nb310_p_facel(%rbp),%rsi
        movss (%rsi),%xmm0
        movss %xmm0,nb310_facel(%rsp)

        movq nb310_p_tabscale(%rbp),%rax
        movss (%rax),%xmm3
        shufps $0,%xmm3,%xmm3
        movaps %xmm3,nb310_tsc(%rsp)

        ## create constant floating-point factors on stack
        movl $0x3f000000,%eax   ## half in IEEE (hex)
        movl %eax,nb310_half(%rsp)
        movss nb310_half(%rsp),%xmm1
        shufps $0,%xmm1,%xmm1   ## splat to all elements
        movaps %xmm1,%xmm2
        addps %xmm2,%xmm2       ## one
        movaps %xmm2,%xmm3
        addps %xmm2,%xmm2       ## two
        addps %xmm2,%xmm3       ## three
        movaps %xmm3,%xmm4
        addps %xmm4,%xmm4       ## six
        movaps %xmm4,%xmm5
        addps %xmm5,%xmm5       ## twelve
        movaps %xmm1,nb310_half(%rsp)
        movaps %xmm2,nb310_two(%rsp)
        movaps %xmm3,nb310_three(%rsp)
        movaps %xmm4,nb310_six(%rsp)
        movaps %xmm5,nb310_twelve(%rsp)
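
## For reference, a minimal C-intrinsics sketch of the constant setup above
## (illustrative only; the variable names are not from the generated source):
##
##     #include <xmmintrin.h>
##     __m128 half   = _mm_set1_ps(0.5f);        /* splat 0x3f000000 */
##     __m128 one    = _mm_add_ps(half, half);
##     __m128 two    = _mm_add_ps(one, one);
##     __m128 three  = _mm_add_ps(one, two);
##     __m128 six    = _mm_add_ps(three, three);
##     __m128 twelve = _mm_add_ps(six, six);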
_nb_kernel310_x86_64_sse.nb310_threadloop:
        movq nb310_count(%rbp),%rsi     ## pointer to sync counter
        movl (%rsi),%eax
_nb_kernel310_x86_64_sse.nb310_spinlock:
        movl %eax,%ebx                  ## ebx=*count=nn0
        addl $1,%ebx                    ## ebx=nn1=nn0+1
        lock
        cmpxchgl %ebx,(%rsi)            ## write nn1 to *counter,
                                        ## if it hasn't changed,
                                        ## or reread *counter to eax.
        pause                           ## -> better p4 performance
        jnz _nb_kernel310_x86_64_sse.nb310_spinlock

        ## if(nn1>nri) nn1=nri
        movl nb310_nri(%rsp),%ecx
        movl %ecx,%edx
        subl %ebx,%ecx
        cmovlel %edx,%ebx               ## if(nn1>nri) nn1=nri
        ## Cleared the spinlock if we got here.
        ## eax contains nn0, ebx contains nn1.
        movl %eax,nb310_n(%rsp)
        movl %ebx,nb310_nn1(%rsp)
        subl %eax,%ebx                  ## calc number of outer lists
        movl %eax,%esi                  ## copy n to esi
        jg _nb_kernel310_x86_64_sse.nb310_outerstart
        jmp _nb_kernel310_x86_64_sse.nb310_end
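
## A rough C equivalent of the work-stealing synchronization above, using the
## GCC builtin (a sketch; the counter type and names are assumptions):
##
##     int nn0, nn1;
##     do {
##         nn0 = *count;                /* read the shared counter      */
##         nn1 = nn0 + 1;
##     } while (__sync_val_compare_and_swap(count, nn0, nn1) != nn0);
##     if (nn1 > nri) nn1 = nri;        /* clamp to the number of lists */
##     /* this thread now owns outer-loop indices [nn0, nn1)            */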
_nb_kernel310_x86_64_sse.nb310_outerstart:
        ## ebx contains number of outer iterations
        addl nb310_nouter(%rsp),%ebx
        movl %ebx,nb310_nouter(%rsp)

_nb_kernel310_x86_64_sse.nb310_outer:
        movq nb310_shift(%rsp),%rax     ## rax = pointer into shift[]
        movl (%rax,%rsi,4),%ebx         ## ebx=shift[n]

        lea (%rbx,%rbx,2),%rbx          ## rbx=3*is
        movl %ebx,nb310_is3(%rsp)       ## store is3

        movq nb310_shiftvec(%rsp),%rax  ## rax = base of shiftvec[]

        movss (%rax,%rbx,4),%xmm0
        movss 4(%rax,%rbx,4),%xmm1
        movss 8(%rax,%rbx,4),%xmm2

        movq nb310_iinr(%rsp),%rcx      ## rcx = pointer into iinr[]
        movl (%rcx,%rsi,4),%ebx         ## ebx =ii

        movq nb310_charge(%rbp),%rdx
        movss (%rdx,%rbx,4),%xmm3
        mulss nb310_facel(%rsp),%xmm3
        shufps $0,%xmm3,%xmm3

        movq nb310_type(%rbp),%rdx
        movl (%rdx,%rbx,4),%edx
        imull nb310_ntype(%rsp),%edx
        shll %edx
        movl %edx,nb310_ntia(%rsp)
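
## In C terms, the i-particle setup above amounts to (sketch; names are
## illustrative, not from the generated source):
##
##     float iq   = facel * charge[ii];     /* prescaled i charge       */
##     int   ntia = 2 * ntype * type[ii];   /* row offset into vdwparam */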
        lea (%rbx,%rbx,2),%rbx          ## rbx = 3*ii=ii3
        movq nb310_pos(%rbp),%rax       ## rax = base of pos[]

        addss (%rax,%rbx,4),%xmm0
        addss 4(%rax,%rbx,4),%xmm1
        addss 8(%rax,%rbx,4),%xmm2

        movaps %xmm3,nb310_iq(%rsp)

        shufps $0,%xmm0,%xmm0
        shufps $0,%xmm1,%xmm1
        shufps $0,%xmm2,%xmm2

        movaps %xmm0,nb310_ix(%rsp)
        movaps %xmm1,nb310_iy(%rsp)
        movaps %xmm2,nb310_iz(%rsp)

        movl %ebx,nb310_ii3(%rsp)

        ## clear vctot and i forces
        xorps %xmm15,%xmm15
        movaps %xmm15,nb310_vctot(%rsp)
        movaps %xmm15,nb310_Vvdwtot(%rsp)
        movaps %xmm15,%xmm14
        movaps %xmm15,%xmm13

        movq nb310_jindex(%rsp),%rax
        movl (%rax,%rsi,4),%ecx         ## jindex[n]
        movl 4(%rax,%rsi,4),%edx        ## jindex[n+1]
        subl %ecx,%edx                  ## number of innerloop atoms

        movq nb310_pos(%rbp),%rsi
        movq nb310_faction(%rbp),%rdi
        movq nb310_jjnr(%rsp),%rax
        shll $2,%ecx
        addq %rcx,%rax
        movq %rax,nb310_innerjjnr(%rsp) ## pointer to jjnr[nj0]
        movl %edx,%ecx
        subl $4,%edx
        addl nb310_ninner(%rsp),%ecx
        movl %ecx,nb310_ninner(%rsp)
        addl $0,%edx
        movl %edx,nb310_innerk(%rsp)    ## number of innerloop atoms
        jge _nb_kernel310_x86_64_sse.nb310_unroll_loop
        jmp _nb_kernel310_x86_64_sse.nb310_finish_inner
_nb_kernel310_x86_64_sse.nb310_unroll_loop:
        ## quad-unrolled innerloop here
        movq nb310_innerjjnr(%rsp),%rdx ## pointer to jjnr[k]
        movl (%rdx),%r8d
        movl 4(%rdx),%r9d
        movl 8(%rdx),%r10d
        movl 12(%rdx),%r11d             ## r8d-r11d=jnr1-4
        addq $16,nb310_innerjjnr(%rsp)  ## advance pointer (unrolled 4)

        lea (%r8,%r8,2),%rax            ## replace jnr with j3
        lea (%r9,%r9,2),%rbx

        lea (%r10,%r10,2),%rcx          ## replace jnr with j3
        lea (%r11,%r11,2),%rdx

        ## load coordinates
        movq nb310_pos(%rbp),%rdi

        movlps (%rdi,%rax,4),%xmm1      ## x1 y1 - -
        movlps (%rdi,%rbx,4),%xmm2      ## x2 y2 - -
        movlps (%rdi,%rcx,4),%xmm3      ## x3 y3 - -
        movlps (%rdi,%rdx,4),%xmm4      ## x4 y4 - -

        movss 8(%rdi,%rax,4),%xmm5      ## z1 - - -
        movss 8(%rdi,%rbx,4),%xmm6      ## z2 - - -
        movss 8(%rdi,%rcx,4),%xmm7      ## z3 - - -
        movss 8(%rdi,%rdx,4),%xmm8      ## z4 - - -

        unpcklps %xmm3,%xmm1            ## x1 x3 y1 y3
        unpcklps %xmm4,%xmm2            ## x2 x4 y2 y4
        unpcklps %xmm7,%xmm5            ## z1 z3 - -
        unpcklps %xmm8,%xmm6            ## z2 z4 - -
        movq nb310_charge(%rbp),%rsi

        movaps %xmm1,%xmm3

        unpcklps %xmm2,%xmm1            ## x1 x2 x3 x4
        unpckhps %xmm2,%xmm3            ## y1 y2 y3 y4
        unpcklps %xmm6,%xmm5            ## z1 z2 z3 z4
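
## The loads and unpacks above gather four j atoms from array-of-structures
## memory into structure-of-arrays registers; a scalar C sketch of the same
## gather (j3[k] = 3*jnr[k]; names illustrative):
##
##     for (int k = 0; k < 4; k++) {
##         jx[k] = pos[j3[k]];
##         jy[k] = pos[j3[k] + 1];
##         jz[k] = pos[j3[k] + 2];
##     }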
        ## calc dr
        subps nb310_ix(%rsp),%xmm1
        subps nb310_iy(%rsp),%xmm3
        subps nb310_iz(%rsp),%xmm5

        ## store dr in xmm9-xmm11
        movaps %xmm1,%xmm9
        movaps %xmm3,%xmm10
        movaps %xmm5,%xmm11

        movss (%rsi,%r8,4),%xmm0
        movss (%rsi,%r10,4),%xmm2
        movss (%rsi,%r9,4),%xmm6
        movss (%rsi,%r11,4),%xmm8

        ## square it
        mulps %xmm1,%xmm1
        mulps %xmm3,%xmm3
        mulps %xmm5,%xmm5
        addps %xmm1,%xmm3
        addps %xmm5,%xmm3
        ## rsq in xmm3
        movq nb310_type(%rbp),%rsi

        unpcklps %xmm2,%xmm0
        unpcklps %xmm8,%xmm6

        unpcklps %xmm6,%xmm0

        ## calculate rinv=1/sqrt(rsq)
        rsqrtps %xmm3,%xmm5
        movaps %xmm5,%xmm2
        mulps %xmm5,%xmm5
        movaps nb310_three(%rsp),%xmm1
        mulps %xmm3,%xmm5       ## rsq*lu*lu
        subps %xmm5,%xmm1       ## 3.0-rsq*lu*lu
        mulps %xmm2,%xmm1
        mulps nb310_half(%rsp),%xmm1
        ## xmm1=rinv
        ## xmm3=rsq
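
## The block above is one Newton-Raphson step on the rsqrtps seed, refining
## the ~12-bit estimate to ~24-bit 1/sqrt(rsq). A C-intrinsics sketch:
##
##     #include <xmmintrin.h>
##     __m128 lu   = _mm_rsqrt_ps(rsq);                 /* ~12-bit seed */
##     __m128 rinv = _mm_mul_ps(_mm_mul_ps(half, lu),
##                              _mm_sub_ps(three,
##                                  _mm_mul_ps(rsq, _mm_mul_ps(lu, lu))));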
        mulps nb310_iq(%rsp),%xmm0

        ## vdw types
        movl (%rsi,%r8,4),%r8d
        movl (%rsi,%r9,4),%r9d
        movl (%rsi,%r10,4),%r10d
        movl (%rsi,%r11,4),%r11d

        mulps %xmm1,%xmm3               ## r
        mulps nb310_tsc(%rsp),%xmm3     ## rtab
        movaps %xmm0,nb310_qq(%rsp)

        ## truncate and convert to integers
        cvttps2dq %xmm3,%xmm2

        shll %r8d
        shll %r9d
        shll %r10d
        shll %r11d

        ## convert back to float
        cvtdq2ps %xmm2,%xmm0

        movl nb310_ntia(%rsp),%edi
        addl %edi,%r8d
        addl %edi,%r9d
        addl %edi,%r10d
        addl %edi,%r11d

        ## multiply by 4
        pslld $2,%xmm2

        ## move to integer registers
        movhlps %xmm2,%xmm7
        movd %xmm2,%r12d
        movd %xmm7,%r14d
        pshufd $1,%xmm2,%xmm2
        pshufd $1,%xmm7,%xmm7
        movd %xmm2,%r13d
        movd %xmm7,%r15d

        ## calculate eps
        subps %xmm0,%xmm3
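
## The table-coordinate math above, in C terms (sketch): r*tabscale is split
## into an integer bin and a fractional part, with four floats per table point:
##
##     float rt  = r * tabscale;
##     int   n   = (int)rt;     /* cvttps2dq truncates toward zero */
##     float eps = rt - n;      /* 0 <= eps < 1                    */
##     int   ofs = 4 * n;       /* each point stores Y,F,G,H       */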
        movq nb310_vdwparam(%rbp),%rsi
        movlps (%rsi,%r8,4),%xmm7
        movlps (%rsi,%r10,4),%xmm8
        movhps (%rsi,%r9,4),%xmm7
        movhps (%rsi,%r11,4),%xmm8

        movaps %xmm7,%xmm12
        shufps $136,%xmm8,%xmm12        ## 10001000
        shufps $221,%xmm8,%xmm7         ## 11011101

        movaps %xmm12,nb310_c6(%rsp)
        movaps %xmm7,nb310_c12(%rsp)
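
## The vdw-parameter fetch above, written as C (sketch; names illustrative):
##
##     int   tj  = ntia + 2 * type[jnr];   /* pair index into vdwparam */
##     float c6  = vdwparam[tj];
##     float c12 = vdwparam[tj + 1];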
        movq nb310_VFtab(%rbp),%rsi
        ## load table data
        movlps (%rsi,%r12,4),%xmm5
        movlps (%rsi,%r14,4),%xmm7
        movhps (%rsi,%r13,4),%xmm5
        movhps (%rsi,%r15,4),%xmm7

        movaps %xmm5,%xmm4
        shufps $136,%xmm7,%xmm4         ## 10001000
        shufps $221,%xmm7,%xmm5         ## 11011101

        movaps %xmm1,%xmm0      ## rinv
        mulps %xmm0,%xmm0       ## rinvsq
        movaps %xmm0,%xmm2      ## rinvsq
        mulps %xmm2,%xmm2       ## rinv4
        mulps %xmm0,%xmm2       ## rinv6
        movaps %xmm2,%xmm12
        mulps %xmm12,%xmm12     ## rinv12

        movlps 8(%rsi,%r12,4),%xmm7
        movlps 8(%rsi,%r14,4),%xmm8
        movhps 8(%rsi,%r13,4),%xmm7
        movhps 8(%rsi,%r15,4),%xmm8

        movaps %xmm7,%xmm6

        mulps nb310_c6(%rsp),%xmm2      ## vvdw6=c6*rinv6
        mulps nb310_c12(%rsp),%xmm12    ## vvdw12=c12*rinv12

        movaps %xmm12,%xmm0
        subps %xmm2,%xmm12      ## Vvdw=Vvdw12-Vvdw6

        ## add potential to vvdwtot
        addps nb310_Vvdwtot(%rsp),%xmm12
        movaps %xmm12,nb310_Vvdwtot(%rsp)

        shufps $136,%xmm8,%xmm6         ## 10001000
        shufps $221,%xmm8,%xmm7         ## 11011101
        ## table data ready in xmm4-xmm7

        mulps %xmm3,%xmm7       ## Heps
        mulps %xmm3,%xmm6       ## Geps
        mulps %xmm3,%xmm7       ## Heps2

        addps %xmm6,%xmm5       ## F+Geps
        addps %xmm7,%xmm5       ## F+Geps+Heps2 = Fp
        addps %xmm7,%xmm7       ## 2*Heps2
        addps %xmm6,%xmm7       ## 2*Heps2+Geps
        addps %xmm5,%xmm7       ## FF = Fp + 2*Heps2 + Geps
        mulps %xmm3,%xmm5       ## eps*Fp
        addps %xmm4,%xmm5       ## VV
        mulps nb310_qq(%rsp),%xmm5      ## VV*qq=vcoul
        mulps nb310_qq(%rsp),%xmm7      ## FF*qq=fijC
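
## The cubic-spline evaluation above, written out in C (sketch): with table
## entries Y,F,G,H at offset ofs and fractional coordinate eps,
##
##     float Geps  = G * eps;
##     float Heps2 = H * eps * eps;
##     float Fp    = F + Geps + Heps2;
##     float VV    = Y + eps * Fp;              /* potential */
##     float FF    = Fp + Geps + 2.0f * Heps2;  /* dV/d(eps) */
##     float vcoul = qq * VV;
##     float fijC  = qq * FF;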
        ## LJ forces
        mulps nb310_six(%rsp),%xmm2
        mulps nb310_twelve(%rsp),%xmm0
        subps %xmm2,%xmm0
        mulps %xmm1,%xmm0       ## (12*vnb12-6*vnb6)*rinv

        ## add potential to vctot
        addps nb310_vctot(%rsp),%xmm5
        movaps %xmm5,nb310_vctot(%rsp)

        mulps nb310_tsc(%rsp),%xmm7
        subps %xmm7,%xmm0

        mulps %xmm1,%xmm0       ## fscal
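
## Combining the tabulated Coulomb force with the analytical LJ force, the
## scalar force factor computed above is, in C terms (sketch):
##
##     float fscal = ((12.0f * Vvdw12 - 6.0f * Vvdw6) * rinv
##                    - tabscale * fijC) * rinv;
##     /* then fjx += fscal*dx, fjy += fscal*dy, fjz += fscal*dz */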
        ## calculate scalar force by multiplying dx/dy/dz with fscal
        mulps %xmm0,%xmm9
        mulps %xmm0,%xmm10
        mulps %xmm0,%xmm11

        movq nb310_faction(%rbp),%rsi
        ## the fj's - start by accumulating x & y forces from memory
        movlps (%rsi,%rax,4),%xmm0      ## x1 y1 - -
        movlps (%rsi,%rcx,4),%xmm1      ## x3 y3 - -
        movhps (%rsi,%rbx,4),%xmm0      ## x1 y1 x2 y2
        movhps (%rsi,%rdx,4),%xmm1      ## x3 y3 x4 y4

        ## xmm0-xmm2 contains tx-tz (partial force)
        ## accumulate i forces
        addps %xmm9,%xmm13
        addps %xmm10,%xmm14
        addps %xmm11,%xmm15

        movaps %xmm9,%xmm8
        unpcklps %xmm10,%xmm9   ## x1 y1 x2 y2
        unpckhps %xmm10,%xmm8   ## x3 y3 x4 y4

        ## update fjx and fjy
        addps %xmm9,%xmm0
        addps %xmm8,%xmm1

        movlps %xmm0,(%rsi,%rax,4)
        movlps %xmm1,(%rsi,%rcx,4)
        movhps %xmm0,(%rsi,%rbx,4)
        movhps %xmm1,(%rsi,%rdx,4)

        ## xmm11: fjz1 fjz2 fjz3 fjz4
        pshufd $1,%xmm11,%xmm10         ## fjz2 - - -
        movhlps %xmm11,%xmm9            ## fjz3 - - -
        pshufd $3,%xmm11,%xmm8          ## fjz4 - - -

        addss 8(%rsi,%rax,4),%xmm11
        addss 8(%rsi,%rbx,4),%xmm10
        addss 8(%rsi,%rcx,4),%xmm9
        addss 8(%rsi,%rdx,4),%xmm8
        movss %xmm11,8(%rsi,%rax,4)
        movss %xmm10,8(%rsi,%rbx,4)
        movss %xmm9,8(%rsi,%rcx,4)
        movss %xmm8,8(%rsi,%rdx,4)

        ## should we do one more iteration?
        subl $4,nb310_innerk(%rsp)
        jl _nb_kernel310_x86_64_sse.nb310_finish_inner
        jmp _nb_kernel310_x86_64_sse.nb310_unroll_loop
_nb_kernel310_x86_64_sse.nb310_finish_inner:
        ## check if at least two particles remain
        addl $4,nb310_innerk(%rsp)
        movl nb310_innerk(%rsp),%edx
        andl $2,%edx
        jnz _nb_kernel310_x86_64_sse.nb310_dopair
        jmp _nb_kernel310_x86_64_sse.nb310_checksingle
_nb_kernel310_x86_64_sse.nb310_dopair:
        ## twice-unrolled innerloop here
        movq nb310_innerjjnr(%rsp),%rdx ## pointer to jjnr[k]
        movl (%rdx),%eax
        movl 4(%rdx),%ebx
        addq $8,nb310_innerjjnr(%rsp)   ## advance pointer (unrolled 2)

        movq nb310_charge(%rbp),%rsi
        movss (%rsi,%rax,4),%xmm0
        movss (%rsi,%rbx,4),%xmm2

        unpcklps %xmm2,%xmm0    ## jqa jqb
        mulps nb310_iq(%rsp),%xmm0
        movaps %xmm0,nb310_qq(%rsp)

        movq nb310_type(%rbp),%rsi
        ## vdw parameters
        movl (%rsi,%rax,4),%r12d
        movl (%rsi,%rbx,4),%r13d
        shll %r12d
        shll %r13d
        movl nb310_ntia(%rsp),%edi
        addl %edi,%r12d
        addl %edi,%r13d

        movq nb310_vdwparam(%rbp),%rsi
        movlps (%rsi,%r12,4),%xmm3
        movhps (%rsi,%r13,4),%xmm3

        xorps %xmm7,%xmm7
        movaps %xmm3,%xmm0
        shufps $136,%xmm7,%xmm0         ## 10001000
        shufps $221,%xmm7,%xmm3         ## 11011101

        movaps %xmm0,nb310_c6(%rsp)
        movaps %xmm3,nb310_c12(%rsp)

        lea (%rax,%rax,2),%rax          ## replace jnr with j3
        lea (%rbx,%rbx,2),%rbx

        ## load coordinates
        movq nb310_pos(%rbp),%rdi

        movlps (%rdi,%rax,4),%xmm4      ## x1 y1 - -
        movlps (%rdi,%rbx,4),%xmm5      ## x2 y2 - -

        movss 8(%rdi,%rax,4),%xmm6      ## z1 - - -
        movss 8(%rdi,%rbx,4),%xmm7      ## z2 - - -

        unpcklps %xmm5,%xmm4            ## x1 x2 y1 y2
        movhlps %xmm4,%xmm5             ## y1 y2 - -
        unpcklps %xmm7,%xmm6            ## z1 z2 - -

        ## calc dr
        subps nb310_ix(%rsp),%xmm4
        subps nb310_iy(%rsp),%xmm5
        subps nb310_iz(%rsp),%xmm6

        ## store dr in xmm9-xmm11
        movaps %xmm4,%xmm9
        movaps %xmm5,%xmm10
        movaps %xmm6,%xmm11

        ## square it
        mulps %xmm4,%xmm4
        mulps %xmm5,%xmm5
        mulps %xmm6,%xmm6
        addps %xmm5,%xmm4
        addps %xmm6,%xmm4
        ## rsq in xmm4

        ## calculate rinv=1/sqrt(rsq)
        rsqrtps %xmm4,%xmm5
        movaps %xmm5,%xmm2
        mulps %xmm5,%xmm5
        movaps nb310_three(%rsp),%xmm1
        mulps %xmm4,%xmm5       ## rsq*lu*lu
        subps %xmm5,%xmm1       ## 3.0-rsq*lu*lu
        mulps %xmm2,%xmm1
        mulps nb310_half(%rsp),%xmm1
        ## xmm1=rinv
        movaps %xmm4,%xmm3
        ## xmm3=rsq

        mulps %xmm1,%xmm3               ## r
        mulps nb310_tsc(%rsp),%xmm3     ## rtab

        ## truncate and convert to integers
        cvttps2dq %xmm3,%xmm2

        ## convert back to float
        cvtdq2ps %xmm2,%xmm0

        ## multiply by 4
        pslld $2,%xmm2

        ## move to integer registers
        movd %xmm2,%r12d
        pshufd $1,%xmm2,%xmm2
        movd %xmm2,%r13d

        ## calculate eps
        subps %xmm0,%xmm3

        movq nb310_VFtab(%rbp),%rsi
        ## load table data
        movlps (%rsi,%r12,4),%xmm4
        movlps (%rsi,%r13,4),%xmm5
        unpcklps %xmm5,%xmm4
        movhlps %xmm4,%xmm5

        movaps %xmm1,%xmm0      ## rinv
        mulps %xmm0,%xmm0       ## rinvsq
        movaps %xmm0,%xmm2      ## rinvsq
        mulps %xmm2,%xmm2       ## rinv4
        mulps %xmm0,%xmm2       ## rinv6
        movaps %xmm2,%xmm12
        mulps %xmm12,%xmm12     ## rinv12

        movlps 8(%rsi,%r12,4),%xmm6
        movlps 8(%rsi,%r13,4),%xmm7
        unpcklps %xmm7,%xmm6
        movhlps %xmm6,%xmm7
        ## table data ready in xmm4-xmm7

        mulps nb310_c6(%rsp),%xmm2      ## vvdw6=c6*rinv6
        mulps nb310_c12(%rsp),%xmm12    ## vvdw12=c12*rinv12

        movaps %xmm12,%xmm0
        subps %xmm2,%xmm12      ## Vvdw=Vvdw12-Vvdw6

        ## add potential to vvdwtot
        addps nb310_Vvdwtot(%rsp),%xmm12
        movlps %xmm12,nb310_Vvdwtot(%rsp)

        mulps %xmm3,%xmm7       ## Heps
        mulps %xmm3,%xmm6       ## Geps
        mulps %xmm3,%xmm7       ## Heps2

        addps %xmm6,%xmm5       ## F+Geps
        addps %xmm7,%xmm5       ## F+Geps+Heps2 = Fp
        addps %xmm7,%xmm7       ## 2*Heps2
        addps %xmm6,%xmm7       ## 2*Heps2+Geps
        addps %xmm5,%xmm7       ## FF = Fp + 2*Heps2 + Geps
        mulps %xmm3,%xmm5       ## eps*Fp
        addps %xmm4,%xmm5       ## VV
        mulps nb310_qq(%rsp),%xmm5      ## VV*qq=vcoul
        mulps nb310_qq(%rsp),%xmm7      ## FF*qq=fijC

        ## LJ forces
        mulps nb310_six(%rsp),%xmm2
        mulps nb310_twelve(%rsp),%xmm0
        subps %xmm2,%xmm0
        mulps %xmm1,%xmm0       ## (12*vnb12-6*vnb6)*rinv

        ## add potential to vctot
        addps nb310_vctot(%rsp),%xmm5
        movlps %xmm5,nb310_vctot(%rsp)

        xorps %xmm8,%xmm8

        mulps nb310_tsc(%rsp),%xmm7
        subps %xmm7,%xmm0

        mulps %xmm1,%xmm0       ## fscal

        ## calculate scalar force by multiplying dx/dy/dz with fscal
        mulps %xmm0,%xmm9
        mulps %xmm0,%xmm10
        mulps %xmm0,%xmm11

        movlhps %xmm8,%xmm9
        movlhps %xmm8,%xmm10
        movlhps %xmm8,%xmm11

        ## accumulate i forces
        addps %xmm9,%xmm13
        addps %xmm10,%xmm14
        addps %xmm11,%xmm15

        movq nb310_faction(%rbp),%rsi
        ## the fj's - start by accumulating x & y forces from memory
        movlps (%rsi,%rax,4),%xmm0      ## x1 y1 - -
        movhps (%rsi,%rbx,4),%xmm0      ## x1 y1 x2 y2

        unpcklps %xmm10,%xmm9   ## x1 y1 x2 y2
        addps %xmm9,%xmm0

        movlps %xmm0,(%rsi,%rax,4)
        movhps %xmm0,(%rsi,%rbx,4)

        ## z forces
        pshufd $1,%xmm11,%xmm8
        addss 8(%rsi,%rax,4),%xmm11
        addss 8(%rsi,%rbx,4),%xmm8
        movss %xmm11,8(%rsi,%rax,4)
        movss %xmm8,8(%rsi,%rbx,4)
_nb_kernel310_x86_64_sse.nb310_checksingle:
        movl nb310_innerk(%rsp),%edx
        andl $1,%edx
        jnz _nb_kernel310_x86_64_sse.nb310_dosingle
        jmp _nb_kernel310_x86_64_sse.nb310_updateouterdata

_nb_kernel310_x86_64_sse.nb310_dosingle:
        movq nb310_innerjjnr(%rsp),%rcx
        movl (%rcx),%eax

        movq nb310_charge(%rbp),%rsi
        movss (%rsi,%rax,4),%xmm0

        mulss nb310_iq(%rsp),%xmm0
        movaps %xmm0,nb310_qq(%rsp)

        movq nb310_type(%rbp),%rsi
        ## vdw parameters
        movl (%rsi,%rax,4),%r12d
        shll %r12d
        movl nb310_ntia(%rsp),%edi
        addl %edi,%r12d

        movq nb310_vdwparam(%rbp),%rsi
        movss (%rsi,%r12,4),%xmm0
        movss 4(%rsi,%r12,4),%xmm3

        movaps %xmm0,nb310_c6(%rsp)
        movaps %xmm3,nb310_c12(%rsp)

        lea (%rax,%rax,2),%rax          ## replace jnr with j3

        movq nb310_pos(%rbp),%rdi
        movss (%rdi,%rax,4),%xmm4       ## x1 - - -
        movss 4(%rdi,%rax,4),%xmm5      ## y1 - - -
        movss 8(%rdi,%rax,4),%xmm6      ## z1 - - -
        ## calc dr
        subss nb310_ix(%rsp),%xmm4
        subss nb310_iy(%rsp),%xmm5
        subss nb310_iz(%rsp),%xmm6

        ## store dr in xmm9-xmm11
        movaps %xmm4,%xmm9
        movaps %xmm5,%xmm10
        movaps %xmm6,%xmm11

        ## square it
        mulss %xmm4,%xmm4
        mulss %xmm5,%xmm5
        mulss %xmm6,%xmm6
        addss %xmm5,%xmm4
        addss %xmm6,%xmm4
        ## rsq in xmm4

        ## calculate rinv=1/sqrt(rsq)
        rsqrtss %xmm4,%xmm5
        movaps %xmm5,%xmm2
        mulss %xmm5,%xmm5
        movaps nb310_three(%rsp),%xmm1
        mulss %xmm4,%xmm5       ## rsq*lu*lu
        subss %xmm5,%xmm1       ## 3.0-rsq*lu*lu
        mulss %xmm2,%xmm1
        mulss nb310_half(%rsp),%xmm1
        ## xmm1=rinv
        movaps %xmm4,%xmm3
        ## xmm3=rsq

        mulss %xmm1,%xmm3               ## r
        mulss nb310_tsc(%rsp),%xmm3     ## rtab

        ## truncate and convert to integers
        cvttss2si %xmm3,%r12d

        ## convert back to float
        cvtsi2ss %r12d,%xmm0

        ## multiply by 4
        shll $2,%r12d

        ## calculate eps
        subss %xmm0,%xmm3

        movq nb310_VFtab(%rbp),%rsi

        movaps %xmm1,%xmm0      ## rinv
        mulss %xmm0,%xmm0       ## rinvsq
        movaps %xmm0,%xmm2      ## rinvsq
        mulss %xmm2,%xmm2       ## rinv4
        mulss %xmm0,%xmm2       ## rinv6
        movaps %xmm2,%xmm12
        mulss %xmm12,%xmm12     ## rinv12

        ## load table data
        movss (%rsi,%r12,4),%xmm4
        movss 4(%rsi,%r12,4),%xmm5
        movss 8(%rsi,%r12,4),%xmm6
        movss 12(%rsi,%r12,4),%xmm7
        ## table data ready in xmm4-xmm7

        mulss nb310_c6(%rsp),%xmm2      ## vvdw6=c6*rinv6
        mulss nb310_c12(%rsp),%xmm12    ## vvdw12=c12*rinv12

        movaps %xmm12,%xmm0
        subss %xmm2,%xmm12      ## Vvdw=Vvdw12-Vvdw6

        ## add potential to vvdwtot
        addss nb310_Vvdwtot(%rsp),%xmm12
        movss %xmm12,nb310_Vvdwtot(%rsp)

        mulss %xmm3,%xmm7       ## Heps
        mulss %xmm3,%xmm6       ## Geps
        mulss %xmm3,%xmm7       ## Heps2

        addss %xmm6,%xmm5       ## F+Geps
        addss %xmm7,%xmm5       ## F+Geps+Heps2 = Fp
        addss %xmm7,%xmm7       ## 2*Heps2
        addss %xmm6,%xmm7       ## 2*Heps2+Geps
        addss %xmm5,%xmm7       ## FF = Fp + 2*Heps2 + Geps
        mulss %xmm3,%xmm5       ## eps*Fp
        addss %xmm4,%xmm5       ## VV
        mulss nb310_qq(%rsp),%xmm5      ## VV*qq=vcoul
        mulss nb310_qq(%rsp),%xmm7      ## FF*qq=fijC

        ## LJ forces
        mulss nb310_six(%rsp),%xmm2
        mulss nb310_twelve(%rsp),%xmm0
        subss %xmm2,%xmm0
        mulss %xmm1,%xmm0       ## (12*vnb12-6*vnb6)*rinv

        ## add potential to vctot
        addss nb310_vctot(%rsp),%xmm5
        movss %xmm5,nb310_vctot(%rsp)

        mulss nb310_tsc(%rsp),%xmm7
        subss %xmm7,%xmm0

        mulss %xmm1,%xmm0       ## fscal

        ## calculate scalar force by multiplying dx/dy/dz with fscal
        mulss %xmm0,%xmm9
        mulss %xmm0,%xmm10
        mulss %xmm0,%xmm11

        ## accumulate i forces
        addss %xmm9,%xmm13
        addss %xmm10,%xmm14
        addss %xmm11,%xmm15

        movq nb310_faction(%rbp),%rsi
        ## add to j forces
        addss (%rsi,%rax,4),%xmm9
        addss 4(%rsi,%rax,4),%xmm10
        addss 8(%rsi,%rax,4),%xmm11
        movss %xmm9,(%rsi,%rax,4)
        movss %xmm10,4(%rsi,%rax,4)
        movss %xmm11,8(%rsi,%rax,4)

_nb_kernel310_x86_64_sse.nb310_updateouterdata:
        movl nb310_ii3(%rsp),%ecx
        movq nb310_faction(%rbp),%rdi
        movq nb310_fshift(%rbp),%rsi
        movl nb310_is3(%rsp),%edx

        ## accumulate i forces in xmm13, xmm14, xmm15
        movhlps %xmm13,%xmm0
        movhlps %xmm14,%xmm1
        movhlps %xmm15,%xmm2
        addps %xmm13,%xmm0
        addps %xmm14,%xmm1
        addps %xmm15,%xmm2
        movaps %xmm0,%xmm3
        movaps %xmm1,%xmm4
        movaps %xmm2,%xmm5
        shufps $1,%xmm3,%xmm3
        shufps $1,%xmm4,%xmm4
        shufps $1,%xmm5,%xmm5
        addss %xmm3,%xmm0
        addss %xmm4,%xmm1
        addss %xmm5,%xmm2       ## xmm0-xmm2 has single force in pos0
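
## The shuffles above horizontally sum the four lanes of each accumulator.
## A C sketch of the reduction and the i-force/fshift update (names are
## illustrative; dr was computed as j-i, hence the subtraction for atom i):
##
##     float fix = fx[0] + fx[1] + fx[2] + fx[3];  /* same for fiy, fiz */
##     faction[ii3] -= fix;
##     fshift[is3]  -= fix;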
        ## increment i force
        movss (%rdi,%rcx,4),%xmm3
        movss 4(%rdi,%rcx,4),%xmm4
        movss 8(%rdi,%rcx,4),%xmm5
        subss %xmm0,%xmm3
        subss %xmm1,%xmm4
        subss %xmm2,%xmm5
        movss %xmm3,(%rdi,%rcx,4)
        movss %xmm4,4(%rdi,%rcx,4)
        movss %xmm5,8(%rdi,%rcx,4)

        ## increment fshift force
        movss (%rsi,%rdx,4),%xmm3
        movss 4(%rsi,%rdx,4),%xmm4
        movss 8(%rsi,%rdx,4),%xmm5
        subss %xmm0,%xmm3
        subss %xmm1,%xmm4
        subss %xmm2,%xmm5
        movss %xmm3,(%rsi,%rdx,4)
        movss %xmm4,4(%rsi,%rdx,4)
        movss %xmm5,8(%rsi,%rdx,4)

        ## get n from stack
        movl nb310_n(%rsp),%esi
        ## get group index for i particle
        movq nb310_gid(%rbp),%rdx       ## base of gid[]
        movl (%rdx,%rsi,4),%edx         ## ggid=gid[n]

        ## accumulate total potential energy and update it
        movaps nb310_vctot(%rsp),%xmm7
        ## accumulate
        movhlps %xmm7,%xmm6
        addps %xmm6,%xmm7       ## pos 0-1 in xmm7 have the sum now
        movaps %xmm7,%xmm6
        shufps $1,%xmm6,%xmm6
        addss %xmm6,%xmm7

        ## add earlier value from mem
        movq nb310_Vc(%rbp),%rax
        addss (%rax,%rdx,4),%xmm7
        ## move back to mem
        movss %xmm7,(%rax,%rdx,4)

        ## accumulate total lj energy and update it
        movaps nb310_Vvdwtot(%rsp),%xmm7
        ## accumulate
        movhlps %xmm7,%xmm6
        addps %xmm6,%xmm7       ## pos 0-1 in xmm7 have the sum now
        movaps %xmm7,%xmm6
        shufps $1,%xmm6,%xmm6
        addss %xmm6,%xmm7

        ## add earlier value from mem
        movq nb310_Vvdw(%rbp),%rax
        addss (%rax,%rdx,4),%xmm7
        ## move back to mem
        movss %xmm7,(%rax,%rdx,4)

        ## finish if last
        movl nb310_nn1(%rsp),%ecx
        ## esi already loaded with n
        incl %esi
        subl %esi,%ecx
        jz _nb_kernel310_x86_64_sse.nb310_outerend

        ## not last, iterate outer loop once more!
        movl %esi,nb310_n(%rsp)
        jmp _nb_kernel310_x86_64_sse.nb310_outer
_nb_kernel310_x86_64_sse.nb310_outerend:
        ## check if more outer neighborlists remain
        movl nb310_nri(%rsp),%ecx
        ## esi already loaded with n above
        subl %esi,%ecx
        jz _nb_kernel310_x86_64_sse.nb310_end
        ## non-zero, do one more workunit
        jmp _nb_kernel310_x86_64_sse.nb310_threadloop
_nb_kernel310_x86_64_sse.nb310_end:

        movl nb310_nouter(%rsp),%eax
        movl nb310_ninner(%rsp),%ebx
        movq nb310_outeriter(%rbp),%rcx
        movq nb310_inneriter(%rbp),%rdx
        movl %eax,(%rcx)
        movl %ebx,(%rdx)

        addq $472,%rsp
        emms

        pop %r15
        pop %r14
        pop %r13
        pop %r12

        pop %rbx
        pop %rbp
        ret
.globl nb_kernel310nf_x86_64_sse
.globl _nb_kernel310nf_x86_64_sse
nb_kernel310nf_x86_64_sse:
_nb_kernel310nf_x86_64_sse:
## Room for return address and rbp (16 bytes)
.set nb310nf_fshift, 16
.set nb310nf_gid, 24
.set nb310nf_pos, 32
.set nb310nf_faction, 40
.set nb310nf_charge, 48
.set nb310nf_p_facel, 56
.set nb310nf_argkrf, 64
.set nb310nf_argcrf, 72
.set nb310nf_Vc, 80
.set nb310nf_type, 88
.set nb310nf_p_ntype, 96
.set nb310nf_vdwparam, 104
.set nb310nf_Vvdw, 112
.set nb310nf_p_tabscale, 120
.set nb310nf_VFtab, 128
.set nb310nf_invsqrta, 136
.set nb310nf_dvda, 144
.set nb310nf_p_gbtabscale, 152
.set nb310nf_GBtab, 160
.set nb310nf_p_nthreads, 168
.set nb310nf_count, 176
.set nb310nf_mtx, 184
.set nb310nf_outeriter, 192
.set nb310nf_inneriter, 200
.set nb310nf_work, 208
## stack offsets for local variables
## bottom of stack is cache-aligned for sse use
.set nb310nf_ix, 0
.set nb310nf_iy, 16
.set nb310nf_iz, 32
.set nb310nf_iq, 48
.set nb310nf_tsc, 64
.set nb310nf_qq, 80
.set nb310nf_c6, 96
.set nb310nf_c12, 112
.set nb310nf_vctot, 128
.set nb310nf_Vvdwtot, 144
.set nb310nf_half, 160
.set nb310nf_three, 176
.set nb310nf_nri, 192
.set nb310nf_iinr, 200
.set nb310nf_jindex, 208
.set nb310nf_jjnr, 216
.set nb310nf_shift, 224
.set nb310nf_shiftvec, 232
.set nb310nf_facel, 240
.set nb310nf_innerjjnr, 248
.set nb310nf_is3, 256
.set nb310nf_ii3, 260
.set nb310nf_ntia, 264
.set nb310nf_innerk, 268
.set nb310nf_n, 272
.set nb310nf_nn1, 276
.set nb310nf_ntype, 280
.set nb310nf_nouter, 284
.set nb310nf_ninner, 288

        push %rbp
        movq %rsp,%rbp
        push %rbx

        emms

        push %r12
        push %r13
        push %r14
        push %r15

        subq $312,%rsp          ## local variable stack space (n*16+8)

        ## zero 32-bit iteration counters
        movl $0,%eax
        movl %eax,nb310nf_nouter(%rsp)
        movl %eax,nb310nf_ninner(%rsp)

        movl (%rdi),%edi
        movl %edi,nb310nf_nri(%rsp)
        movq %rsi,nb310nf_iinr(%rsp)
        movq %rdx,nb310nf_jindex(%rsp)
        movq %rcx,nb310nf_jjnr(%rsp)
        movq %r8,nb310nf_shift(%rsp)
        movq %r9,nb310nf_shiftvec(%rsp)
        movq nb310nf_p_ntype(%rbp),%rdi
        movl (%rdi),%edi
        movl %edi,nb310nf_ntype(%rsp)
        movq nb310nf_p_facel(%rbp),%rsi
        movss (%rsi),%xmm0
        movss %xmm0,nb310nf_facel(%rsp)

        movq nb310nf_p_tabscale(%rbp),%rax
        movss (%rax),%xmm3
        shufps $0,%xmm3,%xmm3
        movaps %xmm3,nb310nf_tsc(%rsp)

        ## create constant floating-point factors on stack
        movl $0x3f000000,%eax   ## half in IEEE (hex)
        movl %eax,nb310nf_half(%rsp)
        movss nb310nf_half(%rsp),%xmm1
        shufps $0,%xmm1,%xmm1   ## splat to all elements
        movaps %xmm1,%xmm2
        addps %xmm2,%xmm2       ## one
        movaps %xmm2,%xmm3
        addps %xmm2,%xmm2       ## two
        addps %xmm2,%xmm3       ## three
        movaps %xmm1,nb310nf_half(%rsp)
        movaps %xmm3,nb310nf_three(%rsp)
_nb_kernel310nf_x86_64_sse.nb310nf_threadloop:
        movq nb310nf_count(%rbp),%rsi   ## pointer to sync counter
        movl (%rsi),%eax
_nb_kernel310nf_x86_64_sse.nb310nf_spinlock:
        movl %eax,%ebx                  ## ebx=*count=nn0
        addl $1,%ebx                    ## ebx=nn1=nn0+1
        lock
        cmpxchgl %ebx,(%rsi)            ## write nn1 to *counter,
                                        ## if it hasn't changed,
                                        ## or reread *counter to eax.
        pause                           ## -> better p4 performance
        jnz _nb_kernel310nf_x86_64_sse.nb310nf_spinlock

        ## if(nn1>nri) nn1=nri
        movl nb310nf_nri(%rsp),%ecx
        movl %ecx,%edx
        subl %ebx,%ecx
        cmovlel %edx,%ebx               ## if(nn1>nri) nn1=nri
        ## Cleared the spinlock if we got here.
        ## eax contains nn0, ebx contains nn1.
        movl %eax,nb310nf_n(%rsp)
        movl %ebx,nb310nf_nn1(%rsp)
        subl %eax,%ebx                  ## calc number of outer lists
        movl %eax,%esi                  ## copy n to esi
        jg _nb_kernel310nf_x86_64_sse.nb310nf_outerstart
        jmp _nb_kernel310nf_x86_64_sse.nb310nf_end
_nb_kernel310nf_x86_64_sse.nb310nf_outerstart:
        ## ebx contains number of outer iterations
        addl nb310nf_nouter(%rsp),%ebx
        movl %ebx,nb310nf_nouter(%rsp)

_nb_kernel310nf_x86_64_sse.nb310nf_outer:
        movq nb310nf_shift(%rsp),%rax   ## rax = pointer into shift[]
        movl (%rax,%rsi,4),%ebx         ## ebx=shift[n]

        lea (%rbx,%rbx,2),%rbx          ## rbx=3*is
        movl %ebx,nb310nf_is3(%rsp)     ## store is3

        movq nb310nf_shiftvec(%rsp),%rax        ## rax = base of shiftvec[]

        movss (%rax,%rbx,4),%xmm0
        movss 4(%rax,%rbx,4),%xmm1
        movss 8(%rax,%rbx,4),%xmm2

        movq nb310nf_iinr(%rsp),%rcx    ## rcx = pointer into iinr[]
        movl (%rcx,%rsi,4),%ebx         ## ebx =ii

        movq nb310nf_charge(%rbp),%rdx
        movss (%rdx,%rbx,4),%xmm3
        mulss nb310nf_facel(%rsp),%xmm3
        shufps $0,%xmm3,%xmm3

        movq nb310nf_type(%rbp),%rdx
        movl (%rdx,%rbx,4),%edx
        imull nb310nf_ntype(%rsp),%edx
        shll %edx
        movl %edx,nb310nf_ntia(%rsp)

        lea (%rbx,%rbx,2),%rbx          ## rbx = 3*ii=ii3
        movq nb310nf_pos(%rbp),%rax     ## rax = base of pos[]

        addss (%rax,%rbx,4),%xmm0
        addss 4(%rax,%rbx,4),%xmm1
        addss 8(%rax,%rbx,4),%xmm2

        movaps %xmm3,nb310nf_iq(%rsp)

        shufps $0,%xmm0,%xmm0
        shufps $0,%xmm1,%xmm1
        shufps $0,%xmm2,%xmm2

        movaps %xmm0,nb310nf_ix(%rsp)
        movaps %xmm1,nb310nf_iy(%rsp)
        movaps %xmm2,nb310nf_iz(%rsp)

        movl %ebx,nb310nf_ii3(%rsp)

        ## clear vctot and Vvdwtot
        xorps %xmm4,%xmm4
        movaps %xmm4,nb310nf_vctot(%rsp)
        movaps %xmm4,nb310nf_Vvdwtot(%rsp)

        movq nb310nf_jindex(%rsp),%rax
        movl (%rax,%rsi,4),%ecx         ## jindex[n]
        movl 4(%rax,%rsi,4),%edx        ## jindex[n+1]
        subl %ecx,%edx                  ## number of innerloop atoms

        movq nb310nf_pos(%rbp),%rsi
        movq nb310nf_jjnr(%rsp),%rax
        shll $2,%ecx
        addq %rcx,%rax
        movq %rax,nb310nf_innerjjnr(%rsp)       ## pointer to jjnr[nj0]
        movl %edx,%ecx
        subl $4,%edx
        addl nb310nf_ninner(%rsp),%ecx
        movl %ecx,nb310nf_ninner(%rsp)
        addl $0,%edx
        movl %edx,nb310nf_innerk(%rsp)  ## number of innerloop atoms
        jge _nb_kernel310nf_x86_64_sse.nb310nf_unroll_loop
        jmp _nb_kernel310nf_x86_64_sse.nb310nf_finish_inner
_nb_kernel310nf_x86_64_sse.nb310nf_unroll_loop:
        ## quad-unrolled innerloop here
        movq nb310nf_innerjjnr(%rsp),%rdx       ## pointer to jjnr[k]
        movl (%rdx),%eax
        movl 4(%rdx),%ebx
        movl 8(%rdx),%ecx
        movl 12(%rdx),%edx              ## eax-edx=jnr1-4
        addq $16,nb310nf_innerjjnr(%rsp)        ## advance pointer (unrolled 4)

        movq nb310nf_charge(%rbp),%rsi  ## base of charge[]

        movss (%rsi,%rax,4),%xmm3
        movss (%rsi,%rcx,4),%xmm4
        movss (%rsi,%rbx,4),%xmm6
        movss (%rsi,%rdx,4),%xmm7

        movaps nb310nf_iq(%rsp),%xmm2
        shufps $0,%xmm6,%xmm3
        shufps $0,%xmm7,%xmm4
        shufps $136,%xmm4,%xmm3         ## 10001000 ## all charges in xmm3
        movd %eax,%mm0                  ## use mmx registers as temp storage
        movd %ebx,%mm1
        mulps %xmm2,%xmm3
        movd %ecx,%mm2
        movd %edx,%mm3

        movaps %xmm3,nb310nf_qq(%rsp)

        movq nb310nf_type(%rbp),%rsi
        movl (%rsi,%rax,4),%eax
        movl (%rsi,%rbx,4),%ebx
        movl (%rsi,%rcx,4),%ecx
        movl (%rsi,%rdx,4),%edx
        movq nb310nf_vdwparam(%rbp),%rsi
        shll %eax
        shll %ebx
        shll %ecx
        shll %edx
        movl nb310nf_ntia(%rsp),%edi
        addl %edi,%eax
        addl %edi,%ebx
        addl %edi,%ecx
        addl %edi,%edx

        movlps (%rsi,%rax,4),%xmm6
        movlps (%rsi,%rcx,4),%xmm7
        movhps (%rsi,%rbx,4),%xmm6
        movhps (%rsi,%rdx,4),%xmm7

        movaps %xmm6,%xmm4
        shufps $136,%xmm7,%xmm4         ## 10001000
        shufps $221,%xmm7,%xmm6         ## 11011101

        movd %mm0,%eax
        movd %mm1,%ebx
        movd %mm2,%ecx
        movd %mm3,%edx

        movaps %xmm4,nb310nf_c6(%rsp)
        movaps %xmm6,nb310nf_c12(%rsp)

        movq nb310nf_pos(%rbp),%rsi     ## base of pos[]

        lea (%rax,%rax,2),%rax          ## replace jnr with j3
        lea (%rbx,%rbx,2),%rbx

        lea (%rcx,%rcx,2),%rcx          ## replace jnr with j3
        lea (%rdx,%rdx,2),%rdx

        ## move four coordinates to xmm0-xmm2

        movlps (%rsi,%rax,4),%xmm4
        movlps (%rsi,%rcx,4),%xmm5
        movss 8(%rsi,%rax,4),%xmm2
        movss 8(%rsi,%rcx,4),%xmm6

        movhps (%rsi,%rbx,4),%xmm4
        movhps (%rsi,%rdx,4),%xmm5

        movss 8(%rsi,%rbx,4),%xmm0
        movss 8(%rsi,%rdx,4),%xmm1

        shufps $0,%xmm0,%xmm2
        shufps $0,%xmm1,%xmm6

        movaps %xmm4,%xmm0
        movaps %xmm4,%xmm1

        shufps $136,%xmm6,%xmm2         ## 10001000

        shufps $136,%xmm5,%xmm0         ## 10001000
        shufps $221,%xmm5,%xmm1         ## 11011101

        ## move ix-iz to xmm4-xmm6
        movaps nb310nf_ix(%rsp),%xmm4
        movaps nb310nf_iy(%rsp),%xmm5
        movaps nb310nf_iz(%rsp),%xmm6

        ## calc dr
        subps %xmm0,%xmm4
        subps %xmm1,%xmm5
        subps %xmm2,%xmm6

        ## square it
        mulps %xmm4,%xmm4
        mulps %xmm5,%xmm5
        mulps %xmm6,%xmm6
        addps %xmm5,%xmm4
        addps %xmm6,%xmm4
        ## rsq in xmm4

        rsqrtps %xmm4,%xmm5
        ## lookup seed in xmm5
        movaps %xmm5,%xmm2
        mulps %xmm5,%xmm5
        movaps nb310nf_three(%rsp),%xmm1
        mulps %xmm4,%xmm5       ## rsq*lu*lu
        movaps nb310nf_half(%rsp),%xmm0
        subps %xmm5,%xmm1       ## 3.0-rsq*lu*lu
        mulps %xmm2,%xmm1
        mulps %xmm1,%xmm0       ## xmm0=rinv
        mulps %xmm0,%xmm4       ## xmm4=r
        mulps nb310nf_tsc(%rsp),%xmm4

        movhlps %xmm4,%xmm5
        cvttps2pi %xmm4,%mm6
        cvttps2pi %xmm5,%mm7    ## mm6/mm7 contain lu indices
        cvtpi2ps %mm6,%xmm6
        cvtpi2ps %mm7,%xmm5
        movlhps %xmm5,%xmm6
        subps %xmm6,%xmm4
        movaps %xmm4,%xmm1      ## xmm1=eps
        movaps %xmm1,%xmm2
        mulps %xmm2,%xmm2       ## xmm2=eps2
        pslld $2,%mm6
        pslld $2,%mm7

        movd %eax,%mm0
        movd %ebx,%mm1
        movd %ecx,%mm2
        movd %edx,%mm3

        movq nb310nf_VFtab(%rbp),%rsi
        movd %mm6,%eax
        psrlq $32,%mm6
        movd %mm7,%ecx
        psrlq $32,%mm7
        movd %mm6,%ebx
        movd %mm7,%edx

        movlps (%rsi,%rax,4),%xmm5
        movlps (%rsi,%rcx,4),%xmm7
        movhps (%rsi,%rbx,4),%xmm5
        movhps (%rsi,%rdx,4),%xmm7      ## got half coulomb table

        movaps %xmm5,%xmm4
        shufps $136,%xmm7,%xmm4         ## 10001000
        shufps $221,%xmm7,%xmm5         ## 11011101

        movlps 8(%rsi,%rax,4),%xmm7
        movlps 8(%rsi,%rcx,4),%xmm3
        movhps 8(%rsi,%rbx,4),%xmm7
        movhps 8(%rsi,%rdx,4),%xmm3     ## other half of coulomb table
        movaps %xmm7,%xmm6
        shufps $136,%xmm3,%xmm6         ## 10001000
        shufps $221,%xmm3,%xmm7         ## 11011101
        ## coulomb table ready, in xmm4-xmm7

        mulps %xmm1,%xmm6       ## xmm6=Geps
        mulps %xmm2,%xmm7       ## xmm7=Heps2
        addps %xmm6,%xmm5
        addps %xmm7,%xmm5       ## xmm5=Fp
        movaps nb310nf_qq(%rsp),%xmm3
        mulps %xmm1,%xmm5       ## xmm5=eps*Fp
        addps %xmm4,%xmm5       ## xmm5=VV
        mulps %xmm3,%xmm5       ## vcoul=qq*VV
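
## The nf kernel needs only the energy, so the spline evaluation stops at VV;
## in C terms (sketch):
##
##     float Fp    = F + G * eps + H * eps * eps;
##     float vcoul = qq * (Y + eps * Fp);   /* no FF/force term here */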
        ## L-J
        movaps %xmm0,%xmm4
        mulps %xmm0,%xmm4       ## xmm4=rinvsq

        ## at this point xmm5 contains vcoul
        ## increment vcoul - then we can get rid of xmm5
        ## update vctot
        addps nb310nf_vctot(%rsp),%xmm5
        movaps %xmm4,%xmm6
        mulps %xmm4,%xmm6
        movaps %xmm5,nb310nf_vctot(%rsp)

        mulps %xmm4,%xmm6       ## xmm6=rinvsix
        movaps %xmm6,%xmm4
        mulps %xmm4,%xmm4       ## xmm4=rinvtwelve
        mulps nb310nf_c6(%rsp),%xmm6
        mulps nb310nf_c12(%rsp),%xmm4
        movaps nb310nf_Vvdwtot(%rsp),%xmm7
        addps %xmm4,%xmm7
        subps %xmm6,%xmm7
        movaps %xmm7,nb310nf_Vvdwtot(%rsp)
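
## Likewise the LJ part reduces to the plain 12-6 energy, as C (sketch):
##
##     float rinvsix = rinvsq * rinvsq * rinvsq;
##     Vvdwtot += c12 * rinvsix * rinvsix - c6 * rinvsix;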
        ## should we do one more iteration?
        subl $4,nb310nf_innerk(%rsp)
        jl _nb_kernel310nf_x86_64_sse.nb310nf_finish_inner
        jmp _nb_kernel310nf_x86_64_sse.nb310nf_unroll_loop
_nb_kernel310nf_x86_64_sse.nb310nf_finish_inner:
        ## check if at least two particles remain
        addl $4,nb310nf_innerk(%rsp)
        movl nb310nf_innerk(%rsp),%edx
        andl $2,%edx
        jnz _nb_kernel310nf_x86_64_sse.nb310nf_dopair
        jmp _nb_kernel310nf_x86_64_sse.nb310nf_checksingle
_nb_kernel310nf_x86_64_sse.nb310nf_dopair:
        movq nb310nf_charge(%rbp),%rsi
        movq nb310nf_innerjjnr(%rsp),%rcx
        movl (%rcx),%eax
        movl 4(%rcx),%ebx
        addq $8,nb310nf_innerjjnr(%rsp)
        xorps %xmm7,%xmm7
        movss (%rsi,%rax,4),%xmm3
        movss (%rsi,%rbx,4),%xmm6
        shufps $0,%xmm6,%xmm3
        shufps $8,%xmm3,%xmm3           ## 00001000 ## xmm3(0,1) has the charges

        mulps nb310nf_iq(%rsp),%xmm3
        movlhps %xmm7,%xmm3
        movaps %xmm3,nb310nf_qq(%rsp)

        movq nb310nf_type(%rbp),%rsi
        movl %eax,%ecx
        movl %ebx,%edx
        movl (%rsi,%rcx,4),%ecx
        movl (%rsi,%rdx,4),%edx
        movq nb310nf_vdwparam(%rbp),%rsi
        shll %ecx
        shll %edx
        movl nb310nf_ntia(%rsp),%edi
        addl %edi,%ecx
        addl %edi,%edx
        movlps (%rsi,%rcx,4),%xmm6
        movhps (%rsi,%rdx,4),%xmm6
        movq nb310nf_pos(%rbp),%rdi

        movaps %xmm6,%xmm4
        shufps $8,%xmm4,%xmm4           ## 00001000
        shufps $13,%xmm6,%xmm6          ## 00001101
        movlhps %xmm7,%xmm4
        movlhps %xmm7,%xmm6

        movaps %xmm4,nb310nf_c6(%rsp)
        movaps %xmm6,nb310nf_c12(%rsp)

        lea (%rax,%rax,2),%rax
        lea (%rbx,%rbx,2),%rbx
        ## move coordinates to xmm0-xmm2
        movlps (%rdi,%rax,4),%xmm1
        movss 8(%rdi,%rax,4),%xmm2
        movhps (%rdi,%rbx,4),%xmm1
        movss 8(%rdi,%rbx,4),%xmm0

        movlhps %xmm7,%xmm3

        shufps $0,%xmm0,%xmm2

        movaps %xmm1,%xmm0

        shufps $136,%xmm2,%xmm2         ## 10001000

        shufps $136,%xmm0,%xmm0         ## 10001000
        shufps $221,%xmm1,%xmm1         ## 11011101

        ## move ix-iz to xmm4-xmm6
        xorps %xmm7,%xmm7

        movaps nb310nf_ix(%rsp),%xmm4
        movaps nb310nf_iy(%rsp),%xmm5
        movaps nb310nf_iz(%rsp),%xmm6

        ## calc dr
        subps %xmm0,%xmm4
        subps %xmm1,%xmm5
        subps %xmm2,%xmm6

        ## square it
        mulps %xmm4,%xmm4
        mulps %xmm5,%xmm5
        mulps %xmm6,%xmm6
        addps %xmm5,%xmm4
        addps %xmm6,%xmm4
        ## rsq in xmm4

        rsqrtps %xmm4,%xmm5
        ## lookup seed in xmm5
        movaps %xmm5,%xmm2
        mulps %xmm5,%xmm5
        movaps nb310nf_three(%rsp),%xmm1
        mulps %xmm4,%xmm5       ## rsq*lu*lu
        movaps nb310nf_half(%rsp),%xmm0
        subps %xmm5,%xmm1       ## 3.0-rsq*lu*lu
        mulps %xmm2,%xmm1
        mulps %xmm1,%xmm0       ## xmm0=rinv
        mulps %xmm0,%xmm4       ## xmm4=r
        mulps nb310nf_tsc(%rsp),%xmm4

        cvttps2pi %xmm4,%mm6    ## mm6 contain lu indices
        cvtpi2ps %mm6,%xmm6
        subps %xmm6,%xmm4
        movaps %xmm4,%xmm1      ## xmm1=eps
        movaps %xmm1,%xmm2
        mulps %xmm2,%xmm2       ## xmm2=eps2

        pslld $2,%mm6

        movq nb310nf_VFtab(%rbp),%rsi
        movd %mm6,%ecx
        psrlq $32,%mm6
        movd %mm6,%edx

        movlps (%rsi,%rcx,4),%xmm5
        movhps (%rsi,%rdx,4),%xmm5      ## got half coulomb table
        movaps %xmm5,%xmm4
        shufps $136,%xmm4,%xmm4         ## 10001000
        shufps $221,%xmm7,%xmm5         ## 11011101

        movlps 8(%rsi,%rcx,4),%xmm7
        movhps 8(%rsi,%rdx,4),%xmm7
        movaps %xmm7,%xmm6
        shufps $136,%xmm6,%xmm6         ## 10001000
        shufps $221,%xmm7,%xmm7         ## 11011101
        ## table ready in xmm4-xmm7

        mulps %xmm1,%xmm6       ## xmm6=Geps
        mulps %xmm2,%xmm7       ## xmm7=Heps2
        addps %xmm6,%xmm5
        addps %xmm7,%xmm5       ## xmm5=Fp
        movaps nb310nf_qq(%rsp),%xmm3
        mulps %xmm1,%xmm5       ## xmm5=eps*Fp
        addps %xmm4,%xmm5       ## xmm5=VV
        mulps %xmm3,%xmm5       ## vcoul=qq*VV
        ## L-J
        movaps %xmm0,%xmm4
        mulps %xmm0,%xmm4       ## xmm4=rinvsq

        ## at this point xmm5 contains vcoul
        ## increment vcoul - then we can get rid of xmm5
        ## update vctot
        addps nb310nf_vctot(%rsp),%xmm5

        movaps %xmm4,%xmm6
        mulps %xmm4,%xmm6

        movaps %xmm5,nb310nf_vctot(%rsp)

        mulps %xmm4,%xmm6       ## xmm6=rinvsix
        movaps %xmm6,%xmm4
        mulps %xmm4,%xmm4       ## xmm4=rinvtwelve
        mulps nb310nf_c6(%rsp),%xmm6
        mulps nb310nf_c12(%rsp),%xmm4
        movaps nb310nf_Vvdwtot(%rsp),%xmm7
        addps %xmm4,%xmm7
        subps %xmm6,%xmm7
        movaps %xmm7,nb310nf_Vvdwtot(%rsp)

_nb_kernel310nf_x86_64_sse.nb310nf_checksingle:
        movl nb310nf_innerk(%rsp),%edx
        andl $1,%edx
        jnz _nb_kernel310nf_x86_64_sse.nb310nf_dosingle
        jmp _nb_kernel310nf_x86_64_sse.nb310nf_updateouterdata

_nb_kernel310nf_x86_64_sse.nb310nf_dosingle:
        movq nb310nf_charge(%rbp),%rsi
        movq nb310nf_pos(%rbp),%rdi
        movq nb310nf_innerjjnr(%rsp),%rcx
        movl (%rcx),%eax
        xorps %xmm6,%xmm6
        movss (%rsi,%rax,4),%xmm6       ## xmm6(0) has the charge
        mulps nb310nf_iq(%rsp),%xmm6
        movaps %xmm6,nb310nf_qq(%rsp)

        movq nb310nf_type(%rbp),%rsi
        movl %eax,%ecx
        movl (%rsi,%rcx,4),%ecx
        movq nb310nf_vdwparam(%rbp),%rsi
        shll %ecx
        addl nb310nf_ntia(%rsp),%ecx
        movlps (%rsi,%rcx,4),%xmm6
        movaps %xmm6,%xmm4
        shufps $252,%xmm4,%xmm4         ## 11111100
        shufps $253,%xmm6,%xmm6         ## 11111101

        movaps %xmm4,nb310nf_c6(%rsp)
        movaps %xmm6,nb310nf_c12(%rsp)

        lea (%rax,%rax,2),%rax

        ## move coordinates to xmm0-xmm2
        movss (%rdi,%rax,4),%xmm0
        movss 4(%rdi,%rax,4),%xmm1
        movss 8(%rdi,%rax,4),%xmm2

        movaps nb310nf_ix(%rsp),%xmm4
        movaps nb310nf_iy(%rsp),%xmm5
        movaps nb310nf_iz(%rsp),%xmm6

        ## calc dr
        subps %xmm0,%xmm4
        subps %xmm1,%xmm5
        subps %xmm2,%xmm6

        ## square it
        mulps %xmm4,%xmm4
        mulps %xmm5,%xmm5
        mulps %xmm6,%xmm6
        addps %xmm5,%xmm4
        addps %xmm6,%xmm4
        ## rsq in xmm4

        rsqrtps %xmm4,%xmm5
        ## lookup seed in xmm5
        movaps %xmm5,%xmm2
        mulps %xmm5,%xmm5
        movaps nb310nf_three(%rsp),%xmm1
        mulps %xmm4,%xmm5       ## rsq*lu*lu
        movaps nb310nf_half(%rsp),%xmm0
        subps %xmm5,%xmm1       ## 3.0-rsq*lu*lu
        mulps %xmm2,%xmm1
        mulps %xmm1,%xmm0       ## xmm0=rinv

        mulps %xmm0,%xmm4       ## xmm4=r
        mulps nb310nf_tsc(%rsp),%xmm4

        cvttps2pi %xmm4,%mm6    ## mm6 contain lu indices
        cvtpi2ps %mm6,%xmm6
        subps %xmm6,%xmm4
        movaps %xmm4,%xmm1      ## xmm1=eps
        movaps %xmm1,%xmm2
        mulps %xmm2,%xmm2       ## xmm2=eps2

        pslld $2,%mm6

        movq nb310nf_VFtab(%rbp),%rsi
        movd %mm6,%ebx

        movlps (%rsi,%rbx,4),%xmm4
        movlps 8(%rsi,%rbx,4),%xmm6
        movaps %xmm4,%xmm5
        movaps %xmm6,%xmm7
        shufps $1,%xmm5,%xmm5
        shufps $1,%xmm7,%xmm7
        ## table ready in xmm4-xmm7

        mulps %xmm1,%xmm6       ## xmm6=Geps
        mulps %xmm2,%xmm7       ## xmm7=Heps2
        addps %xmm6,%xmm5
        addps %xmm7,%xmm5       ## xmm5=Fp
        movaps nb310nf_qq(%rsp),%xmm3
        mulps %xmm1,%xmm5       ## xmm5=eps*Fp
        addps %xmm4,%xmm5       ## xmm5=VV
        mulps %xmm3,%xmm5       ## vcoul=qq*VV
        ## L-J
        movaps %xmm0,%xmm4
        mulps %xmm0,%xmm4       ## xmm4=rinvsq

        ## at this point xmm5 contains vcoul
        ## increment vcoul - then we can get rid of xmm5
        ## update vctot
        addss nb310nf_vctot(%rsp),%xmm5

        movaps %xmm4,%xmm6
        mulps %xmm4,%xmm6

        movss %xmm5,nb310nf_vctot(%rsp)

        mulps %xmm4,%xmm6       ## xmm6=rinvsix
        movaps %xmm6,%xmm4
        mulps %xmm4,%xmm4       ## xmm4=rinvtwelve
        mulps nb310nf_c6(%rsp),%xmm6
        mulps nb310nf_c12(%rsp),%xmm4
        movss nb310nf_Vvdwtot(%rsp),%xmm7
        addps %xmm4,%xmm7
        subps %xmm6,%xmm7
        movss %xmm7,nb310nf_Vvdwtot(%rsp)

_nb_kernel310nf_x86_64_sse.nb310nf_updateouterdata:
        ## get n from stack
        movl nb310nf_n(%rsp),%esi
        ## get group index for i particle
        movq nb310nf_gid(%rbp),%rdx     ## base of gid[]
        movl (%rdx,%rsi,4),%edx         ## ggid=gid[n]

        ## accumulate total potential energy and update it
        movaps nb310nf_vctot(%rsp),%xmm7
        ## accumulate
        movhlps %xmm7,%xmm6
        addps %xmm6,%xmm7       ## pos 0-1 in xmm7 have the sum now
        movaps %xmm7,%xmm6
        shufps $1,%xmm6,%xmm6
        addss %xmm6,%xmm7

        ## add earlier value from mem
        movq nb310nf_Vc(%rbp),%rax
        addss (%rax,%rdx,4),%xmm7
        ## move back to mem
        movss %xmm7,(%rax,%rdx,4)

        ## accumulate total lj energy and update it
        movaps nb310nf_Vvdwtot(%rsp),%xmm7
        ## accumulate
        movhlps %xmm7,%xmm6
        addps %xmm6,%xmm7       ## pos 0-1 in xmm7 have the sum now
        movaps %xmm7,%xmm6
        shufps $1,%xmm6,%xmm6
        addss %xmm6,%xmm7

        ## add earlier value from mem
        movq nb310nf_Vvdw(%rbp),%rax
        addss (%rax,%rdx,4),%xmm7
        ## move back to mem
        movss %xmm7,(%rax,%rdx,4)

        ## finish if last
        movl nb310nf_nn1(%rsp),%ecx
        ## esi already loaded with n
        incl %esi
        subl %esi,%ecx
        jz _nb_kernel310nf_x86_64_sse.nb310nf_outerend

        ## not last, iterate outer loop once more!
        movl %esi,nb310nf_n(%rsp)
        jmp _nb_kernel310nf_x86_64_sse.nb310nf_outer
_nb_kernel310nf_x86_64_sse.nb310nf_outerend:
        ## check if more outer neighborlists remain
        movl nb310nf_nri(%rsp),%ecx
        ## esi already loaded with n above
        subl %esi,%ecx
        jz _nb_kernel310nf_x86_64_sse.nb310nf_end
        ## non-zero, do one more workunit
        jmp _nb_kernel310nf_x86_64_sse.nb310nf_threadloop
_nb_kernel310nf_x86_64_sse.nb310nf_end:

        movl nb310nf_nouter(%rsp),%eax
        movl nb310nf_ninner(%rsp),%ebx
        movq nb310nf_outeriter(%rbp),%rcx
        movq nb310nf_inneriter(%rbp),%rdx
        movl %eax,(%rcx)
        movl %ebx,(%rdx)

        addq $312,%rsp
        emms

        pop %r15
        pop %r14
        pop %r13
        pop %r12

        pop %rbx
        pop %rbp
        ret