## src/gmxlib/nonbonded/nb_kernel_x86_64_sse/nb_kernel410_x86_64_sse.s
1 ##
2 ##
3 ## Gromacs 4.0 Copyright (c) 1991-2003
4 ## David van der Spoel, Erik Lindahl
5 ##
6 ## This program is free software; you can redistribute it and/or
7 ## modify it under the terms of the GNU General Public License
8 ## as published by the Free Software Foundation; either version 2
9 ## of the License, or (at your option) any later version.
11 ## To help us fund GROMACS development, we humbly ask that you cite
12 ## the research papers on the package. Check out http://www.gromacs.org
13 ##
14 ## And Hey:
15 ## Gnomes, ROck Monsters And Chili Sauce
23 .globl nb_kernel410_x86_64_sse
24 .globl _nb_kernel410_x86_64_sse
25 nb_kernel410_x86_64_sse:
26 _nb_kernel410_x86_64_sse:
27 ## Room for return address and rbp (16 bytes)
28 .set nb410_fshift, 16
29 .set nb410_gid, 24
30 .set nb410_pos, 32
31 .set nb410_faction, 40
32 .set nb410_charge, 48
33 .set nb410_p_facel, 56
34 .set nb410_argkrf, 64
35 .set nb410_argcrf, 72
36 .set nb410_Vc, 80
37 .set nb410_type, 88
38 .set nb410_p_ntype, 96
39 .set nb410_vdwparam, 104
40 .set nb410_Vvdw, 112
41 .set nb410_p_tabscale, 120
42 .set nb410_VFtab, 128
43 .set nb410_invsqrta, 136
44 .set nb410_dvda, 144
45 .set nb410_p_gbtabscale, 152
46 .set nb410_GBtab, 160
47 .set nb410_p_nthreads, 168
48 .set nb410_count, 176
49 .set nb410_mtx, 184
50 .set nb410_outeriter, 192
51 .set nb410_inneriter, 200
52 .set nb410_work, 208
53 ## stack offsets for local variables
54 ## bottom of stack is 16-byte aligned for SSE use
55 .set nb410_ix, 0
56 .set nb410_iy, 16
57 .set nb410_iz, 32
58 .set nb410_iq, 48
59 .set nb410_dx, 64
60 .set nb410_dy, 80
61 .set nb410_dz, 96
62 .set nb410_two, 112
63 .set nb410_six, 128
64 .set nb410_twelve, 144
65 .set nb410_gbtsc, 160
66 .set nb410_qq, 176
67 .set nb410_c6, 192
68 .set nb410_c12, 208
69 .set nb410_fscal, 224
70 .set nb410_vctot, 240
71 .set nb410_Vvdwtot, 256
72 .set nb410_fix, 272
73 .set nb410_fiy, 288
74 .set nb410_fiz, 304
75 .set nb410_half, 320
76 .set nb410_three, 336
77 .set nb410_r, 352
78 .set nb410_isai, 368
79 .set nb410_isaprod, 384
80 .set nb410_dvdasum, 400
81 .set nb410_gbscale, 416
82 .set nb410_nri, 432
83 .set nb410_iinr, 440
84 .set nb410_jindex, 448
85 .set nb410_jjnr, 456
86 .set nb410_shift, 464
87 .set nb410_shiftvec, 472
88 .set nb410_facel, 480
89 .set nb410_innerjjnr, 488
90 .set nb410_is3, 496
91 .set nb410_ii3, 500
92 .set nb410_ii, 504
93 .set nb410_ntia, 508
94 .set nb410_innerk, 512
95 .set nb410_n, 516
96 .set nb410_nn1, 520
97 .set nb410_ntype, 524
98 .set nb410_nouter, 528
99 .set nb410_ninner, 532
100 .set nb410_jnra, 536
101 .set nb410_jnrb, 540
102 .set nb410_jnrc, 544
103 .set nb410_jnrd, 548
105 push %rbp
106 movq %rsp,%rbp
107 push %rbx
110 emms
112 push %r12
113 push %r13
114 push %r14
115 push %r15
117 subq $568,%rsp ## local variable stack space (n*16+8)
119 ## zero 32-bit iteration counters
120 movl $0,%eax
121 movl %eax,nb410_nouter(%rsp)
122 movl %eax,nb410_ninner(%rsp)
124 movl (%rdi),%edi
125 movl %edi,nb410_nri(%rsp)
126 movq %rsi,nb410_iinr(%rsp)
127 movq %rdx,nb410_jindex(%rsp)
128 movq %rcx,nb410_jjnr(%rsp)
129 movq %r8,nb410_shift(%rsp)
130 movq %r9,nb410_shiftvec(%rsp)
131 movq nb410_p_ntype(%rbp),%rdi
132 movl (%rdi),%edi
133 movl %edi,nb410_ntype(%rsp)
134 movq nb410_p_facel(%rbp),%rsi
135 movss (%rsi),%xmm0
136 movss %xmm0,nb410_facel(%rsp)
138 movq nb410_p_gbtabscale(%rbp),%rbx
139 movss (%rbx),%xmm4
140 shufps $0,%xmm4,%xmm4
141 movaps %xmm4,nb410_gbtsc(%rsp)
144 ## create constant floating-point factors on stack
145 movl $0x3f000000,%eax ## 0.5 in IEEE-754 single precision (hex)
146 movl %eax,nb410_half(%rsp)
147 movss nb410_half(%rsp),%xmm1
148 shufps $0,%xmm1,%xmm1 ## splat to all elements
149 movaps %xmm1,%xmm2
150 addps %xmm2,%xmm2 ## one
151 movaps %xmm2,%xmm3
152 addps %xmm2,%xmm2 ## two
153 addps %xmm2,%xmm3 ## three
154 movaps %xmm3,%xmm4
155 addps %xmm4,%xmm4 ## six
156 movaps %xmm4,%xmm5
157 addps %xmm5,%xmm5 ## twelve
158 movaps %xmm1,nb410_half(%rsp)
159 movaps %xmm2,nb410_two(%rsp)
160 movaps %xmm3,nb410_three(%rsp)
161 movaps %xmm4,nb410_six(%rsp)
162 movaps %xmm5,nb410_twelve(%rsp)
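## the factors two/three/six/twelve above are built from 0.5 by repeated additions,
## each splatted across all four SSE lanes before being stored on the stack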
164 _nb_kernel410_x86_64_sse.nb410_threadloop:
165 movq nb410_count(%rbp),%rsi ## pointer to sync counter
166 movl (%rsi),%eax
167 _nb_kernel410_x86_64_sse.nb410_spinlock:
168 movl %eax,%ebx ## ebx=*count=nn0
169 addl $1,%ebx ## ebx=nn1=nn0+1
170 lock
171 cmpxchgl %ebx,(%rsi) ## write nn1 to *counter,
172 ## if it hasn't changed.
173 ## or reread *counter to eax.
174 pause ## -> better p4 performance
175 jnz _nb_kernel410_x86_64_sse.nb410_spinlock
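## work distribution: lock cmpxchg atomically advances the shared counter by one;
## if another thread changed it first, eax is reloaded with the current value and
## the claim is retried, so each thread takes one outer-list index (nn0) at a time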
177 ## if(nn1>nri) nn1=nri
178 movl nb410_nri(%rsp),%ecx
179 movl %ecx,%edx
180 subl %ebx,%ecx
181 cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri
182 ## Cleared the spinlock if we got here.
183 ## eax contains nn0, ebx contains nn1.
184 movl %eax,nb410_n(%rsp)
185 movl %ebx,nb410_nn1(%rsp)
186 subl %eax,%ebx ## calc number of outer lists
187 movl %eax,%esi ## copy n to esi
188 jg _nb_kernel410_x86_64_sse.nb410_outerstart
189 jmp _nb_kernel410_x86_64_sse.nb410_end
191 _nb_kernel410_x86_64_sse.nb410_outerstart:
192 ## ebx contains number of outer iterations
193 addl nb410_nouter(%rsp),%ebx
194 movl %ebx,nb410_nouter(%rsp)
196 _nb_kernel410_x86_64_sse.nb410_outer:
197 movq nb410_shift(%rsp),%rax ## rax = pointer into shift[]
198 movl (%rax,%rsi,4),%ebx ## ebx=shift[n]
200 lea (%rbx,%rbx,2),%rbx ## rbx=3*is
201 movl %ebx,nb410_is3(%rsp) ## store is3
203 movq nb410_shiftvec(%rsp),%rax ## rax = base of shiftvec[]
205 movss (%rax,%rbx,4),%xmm0
206 movss 4(%rax,%rbx,4),%xmm1
207 movss 8(%rax,%rbx,4),%xmm2
209 movq nb410_iinr(%rsp),%rcx ## rcx = pointer into iinr[]
210 movl (%rcx,%rsi,4),%ebx ## ebx =ii
211 movl %ebx,nb410_ii(%rsp)
213 movq nb410_charge(%rbp),%rdx
214 movss (%rdx,%rbx,4),%xmm3
215 mulss nb410_facel(%rsp),%xmm3
216 shufps $0,%xmm3,%xmm3
218 movq nb410_invsqrta(%rbp),%rdx ## load invsqrta[ii]
219 movss (%rdx,%rbx,4),%xmm4
220 shufps $0,%xmm4,%xmm4
222 movq nb410_type(%rbp),%rdx
223 movl (%rdx,%rbx,4),%edx
224 imull nb410_ntype(%rsp),%edx
225 shll %edx
226 movl %edx,nb410_ntia(%rsp)
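## ntia = 2*ntype*type[ii]: row offset into vdwparam[], which holds a c6,c12 pair
## per (itype,jtype) combination; 2*type[j] is added per j atom in the inner loop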
228 lea (%rbx,%rbx,2),%rbx ## rbx = 3*ii=ii3
229 movq nb410_pos(%rbp),%rax ## rax = base of pos[]
231 addss (%rax,%rbx,4),%xmm0
232 addss 4(%rax,%rbx,4),%xmm1
233 addss 8(%rax,%rbx,4),%xmm2
235 movaps %xmm3,nb410_iq(%rsp)
236 movaps %xmm4,nb410_isai(%rsp)
238 shufps $0,%xmm0,%xmm0
239 shufps $0,%xmm1,%xmm1
240 shufps $0,%xmm2,%xmm2
242 movaps %xmm0,nb410_ix(%rsp)
243 movaps %xmm1,nb410_iy(%rsp)
244 movaps %xmm2,nb410_iz(%rsp)
246 movl %ebx,nb410_ii3(%rsp)
248 ## clear vctot and i forces
249 xorps %xmm13,%xmm13
250 movaps %xmm13,%xmm12
251 movaps %xmm13,nb410_Vvdwtot(%rsp)
252 movaps %xmm13,nb410_dvdasum(%rsp)
253 movaps %xmm13,%xmm14
254 movaps %xmm13,%xmm15
256 movq nb410_jindex(%rsp),%rax
257 movl (%rax,%rsi,4),%ecx ## jindex[n]
258 movl 4(%rax,%rsi,4),%edx ## jindex[n+1]
259 subl %ecx,%edx ## number of innerloop atoms
261 movq nb410_pos(%rbp),%rsi
262 movq nb410_faction(%rbp),%rdi
263 movq nb410_jjnr(%rsp),%rax
264 shll $2,%ecx
265 addq %rcx,%rax
266 movq %rax,nb410_innerjjnr(%rsp) ## pointer to jjnr[nj0]
267 movl %edx,%ecx
268 subl $4,%edx
269 addl nb410_ninner(%rsp),%ecx
270 movl %ecx,nb410_ninner(%rsp)
271 addl $0,%edx
272 movl %edx,nb410_innerk(%rsp) ## number of innerloop atoms
273 jge _nb_kernel410_x86_64_sse.nb410_unroll_loop
274 jmp _nb_kernel410_x86_64_sse.nb410_finish_inner
275 _nb_kernel410_x86_64_sse.nb410_unroll_loop:
276 ## quad-unroll innerloop here
277 movq nb410_innerjjnr(%rsp),%rdx ## pointer to jjnr[k]
278 movl (%rdx),%eax
279 movl 4(%rdx),%ebx
280 movl 8(%rdx),%ecx
281 movl 12(%rdx),%edx ## eax-edx=jnr1-4
283 addq $16,nb410_innerjjnr(%rsp) ## advance pointer (unrolled 4)
285 ## load isaj
286 movq nb410_invsqrta(%rbp),%rsi
287 movss (%rsi,%rax,4),%xmm3
288 movss (%rsi,%rcx,4),%xmm4
289 movss (%rsi,%rbx,4),%xmm6
290 movss (%rsi,%rdx,4),%xmm7
291 movaps nb410_isai(%rsp),%xmm2
292 shufps $0,%xmm6,%xmm3
293 shufps $0,%xmm7,%xmm4
294 shufps $136,%xmm4,%xmm3 ## 10001000 ;# all isaj in xmm3
295 mulps %xmm3,%xmm2
297 movaps %xmm2,nb410_isaprod(%rsp)
298 movaps %xmm2,%xmm1
299 mulps nb410_gbtsc(%rsp),%xmm1
300 movaps %xmm1,nb410_gbscale(%rsp)
302 movq nb410_charge(%rbp),%rsi ## base of charge[]
304 movss (%rsi,%rax,4),%xmm3
305 movss (%rsi,%rcx,4),%xmm4
306 movss (%rsi,%rbx,4),%xmm6
307 movss (%rsi,%rdx,4),%xmm7
309 mulps nb410_iq(%rsp),%xmm2
310 shufps $0,%xmm6,%xmm3
311 shufps $0,%xmm7,%xmm4
312 shufps $136,%xmm4,%xmm3 ## 10001000 ;# all charges in xmm3
313 mulps %xmm2,%xmm3
314 movaps %xmm3,nb410_qq(%rsp)
316 ## vdw parameters
317 movq nb410_type(%rbp),%rsi
318 movl (%rsi,%rax,4),%r12d
319 movl (%rsi,%rbx,4),%r13d
320 movl (%rsi,%rcx,4),%r14d
321 movl (%rsi,%rdx,4),%r15d
322 shll %r12d
323 shll %r13d
324 shll %r14d
325 shll %r15d
326 movl nb410_ntia(%rsp),%edi
327 addl %edi,%r12d
328 addl %edi,%r13d
329 addl %edi,%r14d
330 addl %edi,%r15d
332 movq nb410_vdwparam(%rbp),%rsi
333 movlps (%rsi,%r12,4),%xmm3
334 movlps (%rsi,%r14,4),%xmm7
335 movhps (%rsi,%r13,4),%xmm3
336 movhps (%rsi,%r15,4),%xmm7
338 movaps %xmm3,%xmm0
339 shufps $136,%xmm7,%xmm0 ## 10001000
340 shufps $221,%xmm7,%xmm3 ## 11011101
342 movaps %xmm0,nb410_c6(%rsp)
343 movaps %xmm3,nb410_c12(%rsp)
345 movq nb410_pos(%rbp),%rsi ## base of pos[]
347 lea (%rax,%rax,2),%r8 ## j3 = 3*jnr
348 lea (%rbx,%rbx,2),%r9
349 lea (%rcx,%rcx,2),%r10
350 lea (%rdx,%rdx,2),%r11
352 ## move four coordinates to xmm0-xmm2
353 movlps (%rsi,%r8,4),%xmm4
354 movlps (%rsi,%r10,4),%xmm5
355 movss 8(%rsi,%r8,4),%xmm2
356 movss 8(%rsi,%r10,4),%xmm6
358 movhps (%rsi,%r9,4),%xmm4
359 movhps (%rsi,%r11,4),%xmm5
361 movss 8(%rsi,%r9,4),%xmm0
362 movss 8(%rsi,%r11,4),%xmm1
364 shufps $0,%xmm0,%xmm2
365 shufps $0,%xmm1,%xmm6
367 movaps %xmm4,%xmm0
368 movaps %xmm4,%xmm1
370 shufps $136,%xmm6,%xmm2 ## 10001000
372 shufps $136,%xmm5,%xmm0 ## 10001000
373 shufps $221,%xmm5,%xmm1 ## 11011101
375 ## calc dr
376 subps nb410_ix(%rsp),%xmm0
377 subps nb410_iy(%rsp),%xmm1
378 subps nb410_iz(%rsp),%xmm2
380 ## store dr
381 movaps %xmm0,nb410_dx(%rsp)
382 movaps %xmm1,nb410_dy(%rsp)
383 movaps %xmm2,nb410_dz(%rsp)
385 ## square it
386 mulps %xmm0,%xmm0
387 mulps %xmm1,%xmm1
388 mulps %xmm2,%xmm2
389 addps %xmm1,%xmm0
390 addps %xmm2,%xmm0
391 movaps %xmm0,%xmm4
392 ## rsq in xmm4
394 rsqrtps %xmm4,%xmm5
395 ## lookup seed in xmm5
396 movaps %xmm5,%xmm2
397 mulps %xmm5,%xmm5
398 movaps nb410_three(%rsp),%xmm1
399 mulps %xmm4,%xmm5 ## rsq*lu*lu
400 movaps nb410_half(%rsp),%xmm0
401 subps %xmm5,%xmm1 ## 3.0-rsq*lu*lu
402 mulps %xmm2,%xmm1
403 mulps %xmm1,%xmm0 ## xmm0=rinv
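## one Newton-Raphson refinement of the rsqrtps seed lu: rinv = 0.5*lu*(3.0 - rsq*lu*lu)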
404 mulps %xmm0,%xmm4 ## xmm4=r
405 movaps %xmm4,nb410_r(%rsp)
406 mulps nb410_gbscale(%rsp),%xmm4
408 ## truncate and convert to integers
409 cvttps2dq %xmm4,%xmm5
411 ## convert back to float
412 cvtdq2ps %xmm5,%xmm6
414 ## multiply by 4
415 pslld $2,%xmm5
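## each GB table point stores 4 floats (Y,F,G,H), so the integer index is
## multiplied by 4 to address the start of the entry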
417 ## move to integer registers
418 movhlps %xmm5,%xmm7
419 movd %xmm5,%r12d
420 movd %xmm7,%r14d
421 pshufd $1,%xmm5,%xmm5
422 pshufd $1,%xmm7,%xmm7
423 movd %xmm5,%r13d
424 movd %xmm7,%r15d
426 ## calculate eps
427 subps %xmm6,%xmm4
428 movaps %xmm4,%xmm1 ##eps
430 movq nb410_GBtab(%rbp),%rsi
432 movaps %xmm0,%xmm9 ## rinv
433 mulps %xmm9,%xmm9 ## rinvsq
434 movaps %xmm9,%xmm10 ## rinvsq
435 mulps %xmm10,%xmm10 ## rinv4
436 mulps %xmm9,%xmm10 ## rinv6
437 movaps %xmm10,%xmm11
438 mulps %xmm11,%xmm11 ## rinv12
440 ## load table data
441 movlps (%rsi,%r12,4),%xmm5
442 movlps (%rsi,%r14,4),%xmm7
443 movhps (%rsi,%r13,4),%xmm5
444 movhps (%rsi,%r15,4),%xmm7
446 movaps %xmm5,%xmm4
447 shufps $136,%xmm7,%xmm4 ## 10001000
448 shufps $221,%xmm7,%xmm5 ## 11011101
450 mulps nb410_c6(%rsp),%xmm10 ## vvdw6=c6*rinv6
451 mulps nb410_c12(%rsp),%xmm11 ## vvdw12=c12*rinv12
453 movaps %xmm11,%xmm9
454 subps %xmm10,%xmm11 ## Vvdw=Vvdw12-Vvdw6
456 ## add potential to vvdwtot
457 addps nb410_Vvdwtot(%rsp),%xmm11
458 movaps %xmm11,nb410_Vvdwtot(%rsp)
460 movlps 8(%rsi,%r12,4),%xmm7
461 movlps 8(%rsi,%r14,4),%xmm8
462 movhps 8(%rsi,%r13,4),%xmm7
463 movhps 8(%rsi,%r15,4),%xmm8
465 movaps %xmm7,%xmm6
467 shufps $136,%xmm8,%xmm6 ## 10001000
468 shufps $221,%xmm8,%xmm7 ## 11011101
469 ## table data ready in xmm4-xmm7
471 mulps %xmm1,%xmm7 ## Heps
472 mulps %xmm1,%xmm6 ## xmm6=Geps
473 mulps %xmm1,%xmm7 ## Heps2
474 addps %xmm6,%xmm5
475 addps %xmm7,%xmm5 ## xmm5=Fp
476 addps %xmm7,%xmm7 ## two*Heps2
477 movaps nb410_qq(%rsp),%xmm3
478 addps %xmm6,%xmm7
479 addps %xmm5,%xmm7 ## xmm7=FF
480 mulps %xmm1,%xmm5 ## xmm5=eps*Fp
481 addps %xmm4,%xmm5 ## xmm5=VV
482 mulps %xmm3,%xmm5 ## vcoul=qq*VV
483 mulps %xmm7,%xmm3 ## fijC=FF*qq
484 ## at this point xmm5 contains vcoul and xmm3 fijC
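## table interpolation (cubic spline): with eps the table fraction,
##   Fp = F + G*eps + H*eps^2,  VV = Y + eps*Fp,  FF = F + 2*G*eps + 3*H*eps^2
## giving the potential vcoul = qq*VV and the table force derivative fijC = qq*FF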
486 ## LJ forces
487 mulps nb410_six(%rsp),%xmm10
488 mulps nb410_twelve(%rsp),%xmm9
489 subps %xmm10,%xmm9
490 mulps %xmm0,%xmm9 ## (12*vnb12-6*vnb6)*rinv
492 movq nb410_dvda(%rbp),%rsi
494 ## Calculate dVda
495 xorps %xmm7,%xmm7
496 mulps nb410_gbscale(%rsp),%xmm3
497 movaps %xmm3,%xmm6
498 mulps nb410_r(%rsp),%xmm6
499 addps %xmm5,%xmm6
501 ## increment vctot (sum in xmm12)
502 addps %xmm5,%xmm12
504 ## xmm6=(vcoul+fijC*r)
505 subps %xmm6,%xmm7
506 movaps %xmm7,%xmm6
508 ## update dvdasum
509 addps nb410_dvdasum(%rsp),%xmm7
510 movaps %xmm7,nb410_dvdasum(%rsp)
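## Generalized Born: each pair contributes -(vcoul + fijC*r) to dvda; it is
## accumulated for the i atom in dvdasum here and scattered to dvda[j] below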
512 ## update j atoms dvdaj
513 movhlps %xmm6,%xmm7
514 movaps %xmm6,%xmm5
515 movaps %xmm7,%xmm4
516 shufps $0x1,%xmm5,%xmm5
517 shufps $0x1,%xmm4,%xmm4
519 ## xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4
520 addss (%rsi,%rax,4),%xmm6
521 addss (%rsi,%rbx,4),%xmm5
522 addss (%rsi,%rcx,4),%xmm7
523 addss (%rsi,%rdx,4),%xmm4
524 movss %xmm6,(%rsi,%rax,4)
525 movss %xmm5,(%rsi,%rbx,4)
526 movss %xmm7,(%rsi,%rcx,4)
527 movss %xmm4,(%rsi,%rdx,4)
529 subps %xmm3,%xmm9
530 mulps %xmm0,%xmm9 ## fscal
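## total scalar force: fscal = ((12*Vvdw12 - 6*Vvdw6)*rinv - fijC)*rinv,
## multiplied by dx/dy/dz below to give the Cartesian force components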
532 movaps %xmm9,%xmm10
533 movaps %xmm9,%xmm11
535 mulps nb410_dx(%rsp),%xmm9
536 mulps nb410_dy(%rsp),%xmm10
537 mulps nb410_dz(%rsp),%xmm11
539 ## accumulate i forces
540 addps %xmm9,%xmm13
541 addps %xmm10,%xmm14
542 addps %xmm11,%xmm15
544 movq nb410_faction(%rbp),%rsi
545 ## the fj's - start by accumulating x & y forces from memory
546 movlps (%rsi,%r8,4),%xmm0 ## x1 y1 - -
547 movlps (%rsi,%r10,4),%xmm1 ## x3 y3 - -
548 movhps (%rsi,%r9,4),%xmm0 ## x1 y1 x2 y2
549 movhps (%rsi,%r11,4),%xmm1 ## x3 y3 x4 y4
551 movaps %xmm9,%xmm8
552 unpcklps %xmm10,%xmm9 ## x1 y1 x2 y2
553 unpckhps %xmm10,%xmm8 ## x3 y3 x4 y4
555 ## update fjx and fjy
556 addps %xmm9,%xmm0
557 addps %xmm8,%xmm1
559 movlps %xmm0,(%rsi,%r8,4)
560 movlps %xmm1,(%rsi,%r10,4)
561 movhps %xmm0,(%rsi,%r9,4)
562 movhps %xmm1,(%rsi,%r11,4)
564 ## xmm11: fjz1 fjz2 fjz3 fjz4
565 pshufd $1,%xmm11,%xmm10 ## fjz2 - - -
566 movhlps %xmm11,%xmm9 ## fjz3 - - -
567 pshufd $3,%xmm11,%xmm8 ## fjz4 - - -
569 addss 8(%rsi,%r8,4),%xmm11
570 addss 8(%rsi,%r9,4),%xmm10
571 addss 8(%rsi,%r10,4),%xmm9
572 addss 8(%rsi,%r11,4),%xmm8
573 movss %xmm11,8(%rsi,%r8,4)
574 movss %xmm10,8(%rsi,%r9,4)
575 movss %xmm9,8(%rsi,%r10,4)
576 movss %xmm8,8(%rsi,%r11,4)
578 ## should we do one more iteration?
579 subl $4,nb410_innerk(%rsp)
580 jl _nb_kernel410_x86_64_sse.nb410_finish_inner
581 jmp _nb_kernel410_x86_64_sse.nb410_unroll_loop
582 _nb_kernel410_x86_64_sse.nb410_finish_inner:
583 ## check if at least two particles remain
584 addl $4,nb410_innerk(%rsp)
585 movl nb410_innerk(%rsp),%edx
586 andl $2,%edx
587 jnz _nb_kernel410_x86_64_sse.nb410_dopair
588 jmp _nb_kernel410_x86_64_sse.nb410_checksingle
589 _nb_kernel410_x86_64_sse.nb410_dopair:
590 movq nb410_innerjjnr(%rsp),%rcx
592 movl (%rcx),%eax
593 movl 4(%rcx),%ebx
594 addq $8,nb410_innerjjnr(%rsp)
596 ## load isaj
597 movq nb410_invsqrta(%rbp),%rsi
598 movss (%rsi,%rax,4),%xmm2
599 movss (%rsi,%rbx,4),%xmm6
600 unpcklps %xmm6,%xmm2
602 mulps nb410_isai(%rsp),%xmm2
604 movaps %xmm2,nb410_isaprod(%rsp)
605 movaps %xmm2,%xmm1
606 mulps nb410_gbtsc(%rsp),%xmm1
607 movaps %xmm1,nb410_gbscale(%rsp)
609 mulps nb410_iq(%rsp),%xmm2
610 movq nb410_charge(%rbp),%rsi ## base of charge[]
611 movss (%rsi,%rax,4),%xmm3
612 movss (%rsi,%rbx,4),%xmm6
613 unpcklps %xmm6,%xmm3
616 mulps %xmm2,%xmm3
617 movaps %xmm3,nb410_qq(%rsp)
619 ## vdw parameters
620 movq nb410_type(%rbp),%rsi
621 movl (%rsi,%rax,4),%r12d
622 movl (%rsi,%rbx,4),%r13d
623 shll %r12d
624 shll %r13d
625 movl nb410_ntia(%rsp),%edi
626 addl %edi,%r12d
627 addl %edi,%r13d
629 movq nb410_vdwparam(%rbp),%rsi
630 movlps (%rsi,%r12,4),%xmm3
631 movhps (%rsi,%r13,4),%xmm3
633 xorps %xmm7,%xmm7
634 movaps %xmm3,%xmm0
635 shufps $136,%xmm7,%xmm0 ## 10001000
636 shufps $221,%xmm7,%xmm3 ## 11011101
638 movaps %xmm0,nb410_c6(%rsp)
639 movaps %xmm3,nb410_c12(%rsp)
641 movq nb410_pos(%rbp),%rsi ## base of pos[]
643 lea (%rax,%rax,2),%r8 ## j3
644 lea (%rbx,%rbx,2),%r9
646 ## move two j coordinates to xmm4-xmm6
647 movlps (%rsi,%r8,4),%xmm4 ## x1 y1 - -
648 movlps (%rsi,%r9,4),%xmm5 ## x2 y2 - -
650 movss 8(%rsi,%r8,4),%xmm6 ## z1 - - -
651 movss 8(%rsi,%r9,4),%xmm7 ## z2 - - -
653 unpcklps %xmm5,%xmm4 ## x1 x2 y1 y2
654 movhlps %xmm4,%xmm5 ## y1 y2 - -
655 unpcklps %xmm7,%xmm6 ## z1 z2 - -
657 ## calc dr
658 subps nb410_ix(%rsp),%xmm4
659 subps nb410_iy(%rsp),%xmm5
660 subps nb410_iz(%rsp),%xmm6
662 ## store dr
663 movaps %xmm4,nb410_dx(%rsp)
664 movaps %xmm5,nb410_dy(%rsp)
665 movaps %xmm6,nb410_dz(%rsp)
667 ## square it
668 mulps %xmm4,%xmm4
669 mulps %xmm5,%xmm5
670 mulps %xmm6,%xmm6
671 addps %xmm5,%xmm4
672 addps %xmm6,%xmm4
673 ## rsq in xmm4
675 rsqrtps %xmm4,%xmm5
676 ## lookup seed in xmm5
677 movaps %xmm5,%xmm2
678 mulps %xmm5,%xmm5
679 movaps nb410_three(%rsp),%xmm1
680 mulps %xmm4,%xmm5 ## rsq*lu*lu
681 movaps nb410_half(%rsp),%xmm0
682 subps %xmm5,%xmm1 ## 3.0-rsq*lu*lu
683 mulps %xmm2,%xmm1
684 mulps %xmm1,%xmm0 ## xmm0=rinv
685 mulps %xmm0,%xmm4 ## xmm4=r
686 movaps %xmm4,nb410_r(%rsp)
687 mulps nb410_gbscale(%rsp),%xmm4
689 ## truncate and convert to integers
690 cvttps2dq %xmm4,%xmm5
692 ## convert back to float
693 cvtdq2ps %xmm5,%xmm6
695 ## multiply by 4
696 pslld $2,%xmm5
698 ## move to integer registers
699 movd %xmm5,%r12d
700 pshufd $1,%xmm5,%xmm5
701 movd %xmm5,%r13d
703 ## calculate eps
704 subps %xmm6,%xmm4
705 movaps %xmm4,%xmm1 ##eps
707 movq nb410_GBtab(%rbp),%rsi
709 movaps %xmm0,%xmm9 ## rinv
710 mulps %xmm9,%xmm9 ## rinvsq
711 movaps %xmm9,%xmm10 ## rinvsq
712 mulps %xmm10,%xmm10 ## rinv4
713 mulps %xmm9,%xmm10 ## rinv6
714 movaps %xmm10,%xmm11
715 mulps %xmm11,%xmm11 ## rinv12
717 ## load table data
718 movlps (%rsi,%r12,4),%xmm4 ## Y1 F1
719 movlps (%rsi,%r13,4),%xmm5 ## Y2 F2
720 unpcklps %xmm5,%xmm4 ## Y1 Y2 F1 F2
721 movhlps %xmm4,%xmm5 ## F1 F2
723 mulps nb410_c6(%rsp),%xmm10 ## vvdw6=c6*rinv6
724 mulps nb410_c12(%rsp),%xmm11 ## vvdw12=c12*rinv12
726 movaps %xmm11,%xmm9
727 subps %xmm10,%xmm11 ## Vvdw=Vvdw12-Vvdw6
729 ## add potential to vvdwtot
730 addps nb410_Vvdwtot(%rsp),%xmm11
731 movlps %xmm11,nb410_Vvdwtot(%rsp)
733 movlps 8(%rsi,%r12,4),%xmm6 ## G1 H1
734 movlps 8(%rsi,%r13,4),%xmm7 ## G2 H2
735 unpcklps %xmm7,%xmm6 ## G1 G2
736 movhlps %xmm6,%xmm7 ## H1 H2
737 ## table data ready in xmm4-xmm7
739 mulps %xmm1,%xmm7 ## Heps
740 mulps %xmm1,%xmm6 ## xmm6=Geps
741 mulps %xmm1,%xmm7 ## Heps2
742 addps %xmm6,%xmm5
743 addps %xmm7,%xmm5 ## xmm5=Fp
744 addps %xmm7,%xmm7 ## two*Heps2
745 movaps nb410_qq(%rsp),%xmm3
747 addps %xmm6,%xmm7
748 addps %xmm5,%xmm7 ## xmm7=FF
749 mulps %xmm1,%xmm5 ## xmm5=eps*Fp
750 addps %xmm4,%xmm5 ## xmm5=VV
751 mulps %xmm3,%xmm5 ## vcoul=qq*VV
752 mulps %xmm7,%xmm3 ## fijC=FF*qq
753 ## at this point xmm5 contains vcoul and xmm3 fijC
755 ## LJ forces
756 mulps nb410_six(%rsp),%xmm10
757 mulps nb410_twelve(%rsp),%xmm9
758 subps %xmm10,%xmm9
759 mulps %xmm0,%xmm9 ## (12*vnb12-6*vnb6)*rinv
761 ## zero upper part of vcoul
762 xorps %xmm2,%xmm2
763 movlhps %xmm2,%xmm5
765 movq nb410_dvda(%rbp),%rsi
767 ## Calculate dVda
768 xorps %xmm7,%xmm7
769 mulps nb410_gbscale(%rsp),%xmm3
770 movaps %xmm3,%xmm6
771 mulps nb410_r(%rsp),%xmm6
772 addps %xmm5,%xmm6
774 xorps %xmm4,%xmm4
775 ## increment vctot (sum in xmm12)
776 addps %xmm5,%xmm12
778 ## xmm6=(vcoul+fijC*r)
779 subps %xmm6,%xmm7
780 movaps %xmm7,%xmm6
782 ## zero upper half of dvda
783 movlhps %xmm4,%xmm7
785 ## update dvdasum
786 addps nb410_dvdasum(%rsp),%xmm7
787 movaps %xmm7,nb410_dvdasum(%rsp)
789 ## update j atoms dvdaj
790 movaps %xmm6,%xmm5
791 shufps $0x1,%xmm5,%xmm5
793 ## xmm6=dvdaj1 xmm5=dvdaj2
794 addss (%rsi,%rax,4),%xmm6
795 addss (%rsi,%rbx,4),%xmm5
796 movss %xmm6,(%rsi,%rax,4)
797 movss %xmm5,(%rsi,%rbx,4)
799 xorps %xmm7,%xmm7
801 subps %xmm3,%xmm9
802 mulps %xmm0,%xmm9 ## fscal
804 movaps %xmm9,%xmm10
805 movaps %xmm9,%xmm11
807 mulps nb410_dx(%rsp),%xmm9
808 mulps nb410_dy(%rsp),%xmm10
809 mulps nb410_dz(%rsp),%xmm11
811 movlhps %xmm7,%xmm9
812 movlhps %xmm7,%xmm10
813 movlhps %xmm7,%xmm11
815 ## accumulate i forces
816 addps %xmm9,%xmm13
817 addps %xmm10,%xmm14
818 addps %xmm11,%xmm15
820 movq nb410_faction(%rbp),%rsi
821 ## the fj's - start by accumulating x & y forces from memory
822 movlps (%rsi,%r8,4),%xmm0 ## x1 y1 - -
823 movhps (%rsi,%r9,4),%xmm0 ## x1 y1 x2 y2
825 unpcklps %xmm10,%xmm9 ## x1 y1 x2 y2
826 addps %xmm9,%xmm0
828 movlps %xmm0,(%rsi,%r8,4)
829 movhps %xmm0,(%rsi,%r9,4)
831 ## z forces
832 pshufd $1,%xmm11,%xmm8
833 addss 8(%rsi,%r8,4),%xmm11
834 addss 8(%rsi,%r9,4),%xmm8
835 movss %xmm11,8(%rsi,%r8,4)
836 movss %xmm8,8(%rsi,%r9,4)
838 _nb_kernel410_x86_64_sse.nb410_checksingle:
839 movl nb410_innerk(%rsp),%edx
840 andl $1,%edx
841 jnz _nb_kernel410_x86_64_sse.nb410_dosingle
842 jmp _nb_kernel410_x86_64_sse.nb410_updateouterdata
843 _nb_kernel410_x86_64_sse.nb410_dosingle:
844 movq nb410_charge(%rbp),%rsi
845 movq nb410_invsqrta(%rbp),%rdx
846 movq nb410_pos(%rbp),%rdi
847 movq nb410_innerjjnr(%rsp),%rcx
848 movl (%rcx),%eax
850 ## load isaj
851 movq nb410_invsqrta(%rbp),%rsi
852 movss (%rsi,%rax,4),%xmm3
853 movaps nb410_isai(%rsp),%xmm2
854 mulss %xmm3,%xmm2
856 movss %xmm2,nb410_isaprod(%rsp)
857 movaps %xmm2,%xmm1
858 mulss nb410_gbtsc(%rsp),%xmm1
859 movss %xmm1,nb410_gbscale(%rsp)
861 mulss nb410_iq(%rsp),%xmm2
862 movq nb410_charge(%rbp),%rsi ## base of charge[]
864 movss (%rsi,%rax,4),%xmm3
865 mulss %xmm2,%xmm3
866 movss %xmm3,nb410_qq(%rsp)
868 ## vdw parameters
869 movq nb410_type(%rbp),%rsi
870 movl (%rsi,%rax,4),%r12d
871 shll %r12d
872 movl nb410_ntia(%rsp),%edi
873 addl %edi,%r12d
875 movq nb410_vdwparam(%rbp),%rsi
876 movss (%rsi,%r12,4),%xmm0
877 movss 4(%rsi,%r12,4),%xmm3
878 movaps %xmm0,nb410_c6(%rsp)
879 movaps %xmm3,nb410_c12(%rsp)
881 movq nb410_pos(%rbp),%rsi ## base of pos[]
883 lea (%rax,%rax,2),%r8 ## j3 = 3*jnr
885 ## move j coordinates to xmm4-xmm6
886 movss (%rsi,%r8,4),%xmm4
887 movss 4(%rsi,%r8,4),%xmm5
888 movss 8(%rsi,%r8,4),%xmm6
890 ## calc dr
891 subss nb410_ix(%rsp),%xmm4
892 subss nb410_iy(%rsp),%xmm5
893 subss nb410_iz(%rsp),%xmm6
895 ## store dr
896 movaps %xmm4,nb410_dx(%rsp)
897 movaps %xmm5,nb410_dy(%rsp)
898 movaps %xmm6,nb410_dz(%rsp)
900 ## square it
901 mulss %xmm4,%xmm4
902 mulss %xmm5,%xmm5
903 mulss %xmm6,%xmm6
904 addss %xmm5,%xmm4
905 addss %xmm6,%xmm4
906 ## rsq in xmm4
908 rsqrtss %xmm4,%xmm5
909 ## lookup seed in xmm5
910 movaps %xmm5,%xmm2
911 mulss %xmm5,%xmm5
912 movaps nb410_three(%rsp),%xmm1
913 mulss %xmm4,%xmm5 ## rsq*lu*lu
914 movaps nb410_half(%rsp),%xmm0
915 subss %xmm5,%xmm1 ## 3.0-rsq*lu*lu
916 mulss %xmm2,%xmm1
917 mulss %xmm1,%xmm0 ## xmm0=rinv
918 mulss %xmm0,%xmm4 ## xmm4=r
919 movaps %xmm4,nb410_r(%rsp)
920 mulss nb410_gbscale(%rsp),%xmm4
922 ## truncate and convert to integers
923 cvttss2si %xmm4,%r12d
925 ## convert back to float
926 cvtsi2ss %r12d,%xmm6
928 ## multiply by 4
929 shll $2,%r12d
931 ## calculate eps
932 subss %xmm6,%xmm4
933 movaps %xmm4,%xmm1 ##eps
935 movq nb410_GBtab(%rbp),%rsi
937 movaps %xmm0,%xmm9 ## rinv
938 mulss %xmm9,%xmm9 ## rinvsq
939 movaps %xmm9,%xmm10 ## rinvsq
940 mulss %xmm10,%xmm10 ## rinv4
941 mulss %xmm9,%xmm10 ## rinv6
942 movaps %xmm10,%xmm11
943 mulss %xmm11,%xmm11 ## rinv12
945 ## load table data
946 movss (%rsi,%r12,4),%xmm4
947 movss 4(%rsi,%r12,4),%xmm5
948 movss 8(%rsi,%r12,4),%xmm6
949 movss 12(%rsi,%r12,4),%xmm7
950 ## table data ready in xmm4-xmm7
952 mulss nb410_c6(%rsp),%xmm10 ## vvdw6=c6*rinv6
953 mulss nb410_c12(%rsp),%xmm11 ## vvdw12=c12*rinv12
955 movaps %xmm11,%xmm9
956 subss %xmm10,%xmm11 ## Vvdw=Vvdw12-Vvdw6
958 ## add potential to vvdwtot
959 addss nb410_Vvdwtot(%rsp),%xmm11
960 movss %xmm11,nb410_Vvdwtot(%rsp)
962 mulss %xmm1,%xmm7 ## Heps
963 mulss %xmm1,%xmm6 ## xmm6=Geps
964 mulss %xmm1,%xmm7 ## Heps2
965 addss %xmm6,%xmm5
966 addss %xmm7,%xmm5 ## xmm5=Fp
967 addss %xmm7,%xmm7 ## two*Heps2
968 movss nb410_qq(%rsp),%xmm3
969 addss %xmm6,%xmm7
970 addss %xmm5,%xmm7 ## xmm7=FF
971 mulss %xmm1,%xmm5 ## xmm5=eps*Fp
972 addss %xmm4,%xmm5 ## xmm5=VV
973 mulss %xmm3,%xmm5 ## vcoul=qq*VV
974 mulss %xmm7,%xmm3 ## fijC=FF*qq
975 ## at this point xmm5 contains vcoul and xmm3 fijC
977 ## LJ forces
978 mulss nb410_six(%rsp),%xmm10
979 mulss nb410_twelve(%rsp),%xmm9
980 subss %xmm10,%xmm9
981 mulss %xmm0,%xmm9 ## (12*vnb12-6*vnb6)*rinv
983 movq nb410_dvda(%rbp),%rsi
985 ## Calculate dVda
986 xorps %xmm7,%xmm7
987 mulss nb410_gbscale(%rsp),%xmm3
988 movaps %xmm3,%xmm6
989 mulss nb410_r(%rsp),%xmm6
990 addss %xmm5,%xmm6
992 ## increment vctot (sum in xmm12)
993 addss %xmm5,%xmm12
995 ## xmm6=(vcoul+fijC*r)
996 subss %xmm6,%xmm7
997 movaps %xmm7,%xmm6
999 ## update dvdasum
1000 addss nb410_dvdasum(%rsp),%xmm7
1001 movss %xmm7,nb410_dvdasum(%rsp)
1003 ## update j atoms dvdaj
1004 addss (%rsi,%rax,4),%xmm6
1005 movss %xmm6,(%rsi,%rax,4)
1007 subss %xmm3,%xmm9
1008 mulss %xmm0,%xmm9 ## fscal
1010 movaps %xmm9,%xmm10
1011 movaps %xmm9,%xmm11
1013 mulss nb410_dx(%rsp),%xmm9
1014 mulss nb410_dy(%rsp),%xmm10
1015 mulss nb410_dz(%rsp),%xmm11
1017 ## accumulate i forces
1018 addss %xmm9,%xmm13
1019 addss %xmm10,%xmm14
1020 addss %xmm11,%xmm15
1022 movq nb410_faction(%rbp),%rsi
1023 ## add to j forces
1024 addss (%rsi,%r8,4),%xmm9
1025 addss 4(%rsi,%r8,4),%xmm10
1026 addss 8(%rsi,%r8,4),%xmm11
1027 movss %xmm9,(%rsi,%r8,4)
1028 movss %xmm10,4(%rsi,%r8,4)
1029 movss %xmm11,8(%rsi,%r8,4)
1031 _nb_kernel410_x86_64_sse.nb410_updateouterdata:
1032 movl nb410_ii3(%rsp),%ecx
1033 movq nb410_faction(%rbp),%rdi
1034 movq nb410_fshift(%rbp),%rsi
1035 movl nb410_is3(%rsp),%edx
1037 ## accumulate i forces in xmm13, xmm14, xmm15
1038 movhlps %xmm13,%xmm0
1039 movhlps %xmm14,%xmm1
1040 movhlps %xmm15,%xmm2
1041 addps %xmm13,%xmm0
1042 addps %xmm14,%xmm1
1043 addps %xmm15,%xmm2
1044 movaps %xmm0,%xmm3
1045 movaps %xmm1,%xmm4
1046 movaps %xmm2,%xmm5
1047 shufps $1,%xmm3,%xmm3
1048 shufps $1,%xmm4,%xmm4
1049 shufps $1,%xmm5,%xmm5
1050 addss %xmm3,%xmm0
1051 addss %xmm4,%xmm1
1052 addss %xmm5,%xmm2 ## xmm0-xmm2 has single force in pos0
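## horizontal reduction: movhlps folds the upper two SSE lanes onto the lower two,
## and shufps $1 folds lane 1 onto lane 0, leaving the summed i force components
## as scalars in xmm0-xmm2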
1055 ## increment i force
1056 movss (%rdi,%rcx,4),%xmm3
1057 movss 4(%rdi,%rcx,4),%xmm4
1058 movss 8(%rdi,%rcx,4),%xmm5
1059 subss %xmm0,%xmm3
1060 subss %xmm1,%xmm4
1061 subss %xmm2,%xmm5
1062 movss %xmm3,(%rdi,%rcx,4)
1063 movss %xmm4,4(%rdi,%rcx,4)
1064 movss %xmm5,8(%rdi,%rcx,4)
1066 ## increment fshift force
1067 movss (%rsi,%rdx,4),%xmm3
1068 movss 4(%rsi,%rdx,4),%xmm4
1069 movss 8(%rsi,%rdx,4),%xmm5
1070 subss %xmm0,%xmm3
1071 subss %xmm1,%xmm4
1072 subss %xmm2,%xmm5
1073 movss %xmm3,(%rsi,%rdx,4)
1074 movss %xmm4,4(%rsi,%rdx,4)
1075 movss %xmm5,8(%rsi,%rdx,4)
1077 ## get n from stack
1078 movl nb410_n(%rsp),%esi
1079 ## get group index for i particle
1080 movq nb410_gid(%rbp),%rdx ## base of gid[]
1081 movl (%rdx,%rsi,4),%edx ## ggid=gid[n]
1083 ## accumulate total potential energy and update it
1084 ## accumulate
1085 movhlps %xmm12,%xmm6
1086 addps %xmm6,%xmm12 ## pos 0-1 in xmm12 have the sum now
1087 movaps %xmm12,%xmm6
1088 shufps $1,%xmm6,%xmm6
1089 addss %xmm6,%xmm12
1091 ## add earlier value from mem
1092 movq nb410_Vc(%rbp),%rax
1093 addss (%rax,%rdx,4),%xmm12
1094 ## move back to mem
1095 movss %xmm12,(%rax,%rdx,4)
1097 ## accumulate total lj energy and update it
1098 movaps nb410_Vvdwtot(%rsp),%xmm7
1099 ## accumulate
1100 movhlps %xmm7,%xmm6
1101 addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now
1102 movaps %xmm7,%xmm6
1103 shufps $1,%xmm6,%xmm6
1104 addss %xmm6,%xmm7
1106 ## add earlier value from mem
1107 movq nb410_Vvdw(%rbp),%rax
1108 addss (%rax,%rdx,4),%xmm7
1109 ## move back to mem
1110 movss %xmm7,(%rax,%rdx,4)
1112 ## accumulate dVda and update it
1113 movaps nb410_dvdasum(%rsp),%xmm7
1114 ## accumulate
1115 movhlps %xmm7,%xmm6
1116 addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now
1117 movaps %xmm7,%xmm6
1118 shufps $1,%xmm6,%xmm6
1119 addss %xmm6,%xmm7
1121 movl nb410_ii(%rsp),%edx
1122 movq nb410_dvda(%rbp),%rax
1123 addss (%rax,%rdx,4),%xmm7
1124 movss %xmm7,(%rax,%rdx,4)
1126 ## finish if last
1127 movl nb410_nn1(%rsp),%ecx
1128 ## esi already loaded with n
1129 incl %esi
1130 subl %esi,%ecx
1131 jz _nb_kernel410_x86_64_sse.nb410_outerend
1133 ## not last, iterate outer loop once more!
1134 movl %esi,nb410_n(%rsp)
1135 jmp _nb_kernel410_x86_64_sse.nb410_outer
1136 _nb_kernel410_x86_64_sse.nb410_outerend:
1137 ## check if more outer neighborlists remain
1138 movl nb410_nri(%rsp),%ecx
1139 ## esi already loaded with n above
1140 subl %esi,%ecx
1141 jz _nb_kernel410_x86_64_sse.nb410_end
1142 ## non-zero, do one more workunit
1143 jmp _nb_kernel410_x86_64_sse.nb410_threadloop
1144 _nb_kernel410_x86_64_sse.nb410_end:
1146 movl nb410_nouter(%rsp),%eax
1147 movl nb410_ninner(%rsp),%ebx
1148 movq nb410_outeriter(%rbp),%rcx
1149 movq nb410_inneriter(%rbp),%rdx
1150 movl %eax,(%rcx)
1151 movl %ebx,(%rdx)
1153 addq $568,%rsp
1154 emms
1157 pop %r15
1158 pop %r14
1159 pop %r13
1160 pop %r12
1162 pop %rbx
1163 pop %rbp
1168 .globl nb_kernel410nf_x86_64_sse
1169 .globl _nb_kernel410nf_x86_64_sse
1170 nb_kernel410nf_x86_64_sse:
1171 _nb_kernel410nf_x86_64_sse:
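## nb410nf is the energy-only ('no force') variant of kernel 410: it accumulates
## the Coulomb and LJ energies but does not update forces, fshift or dvda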
1172 ## Room for return address and rbp (16 bytes)
1173 .set nb410nf_fshift, 16
1174 .set nb410nf_gid, 24
1175 .set nb410nf_pos, 32
1176 .set nb410nf_faction, 40
1177 .set nb410nf_charge, 48
1178 .set nb410nf_p_facel, 56
1179 .set nb410nf_argkrf, 64
1180 .set nb410nf_argcrf, 72
1181 .set nb410nf_Vc, 80
1182 .set nb410nf_type, 88
1183 .set nb410nf_p_ntype, 96
1184 .set nb410nf_vdwparam, 104
1185 .set nb410nf_Vvdw, 112
1186 .set nb410nf_p_tabscale, 120
1187 .set nb410nf_VFtab, 128
1188 .set nb410nf_invsqrta, 136
1189 .set nb410nf_dvda, 144
1190 .set nb410nf_p_gbtabscale, 152
1191 .set nb410nf_GBtab, 160
1192 .set nb410nf_p_nthreads, 168
1193 .set nb410nf_count, 176
1194 .set nb410nf_mtx, 184
1195 .set nb410nf_outeriter, 192
1196 .set nb410nf_inneriter, 200
1197 .set nb410nf_work, 208
1198 ## stack offsets for local variables
1199 ## bottom of stack is 16-byte aligned for SSE use
1200 .set nb410nf_ix, 0
1201 .set nb410nf_iy, 16
1202 .set nb410nf_iz, 32
1203 .set nb410nf_iq, 48
1204 .set nb410nf_gbtsc, 64
1205 .set nb410nf_qq, 80
1206 .set nb410nf_c6, 96
1207 .set nb410nf_c12, 112
1208 .set nb410nf_vctot, 128
1209 .set nb410nf_Vvdwtot, 144
1210 .set nb410nf_half, 160
1211 .set nb410nf_three, 176
1212 .set nb410nf_isai, 192
1213 .set nb410nf_isaprod, 208
1214 .set nb410nf_gbscale, 224
1215 .set nb410nf_nri, 240
1216 .set nb410nf_iinr, 248
1217 .set nb410nf_jindex, 256
1218 .set nb410nf_jjnr, 264
1219 .set nb410nf_shift, 272
1220 .set nb410nf_shiftvec, 280
1221 .set nb410nf_facel, 288
1222 .set nb410nf_innerjjnr, 296
1223 .set nb410nf_is3, 304
1224 .set nb410nf_ii3, 308
1225 .set nb410nf_ntia, 312
1226 .set nb410nf_innerk, 316
1227 .set nb410nf_n, 320
1228 .set nb410nf_nn1, 324
1229 .set nb410nf_ntype, 328
1230 .set nb410nf_nouter, 332
1231 .set nb410nf_ninner, 336
1233 push %rbp
1234 movq %rsp,%rbp
1235 push %rbx
1238 emms
1240 push %r12
1241 push %r13
1242 push %r14
1243 push %r15
1245 subq $360,%rsp ## local variable stack space (n*16+8)
1247 ## zero 32-bit iteration counters
1248 movl $0,%eax
1249 movl %eax,nb410nf_nouter(%rsp)
1250 movl %eax,nb410nf_ninner(%rsp)
1252 movl (%rdi),%edi
1253 movl %edi,nb410nf_nri(%rsp)
1254 movq %rsi,nb410nf_iinr(%rsp)
1255 movq %rdx,nb410nf_jindex(%rsp)
1256 movq %rcx,nb410nf_jjnr(%rsp)
1257 movq %r8,nb410nf_shift(%rsp)
1258 movq %r9,nb410nf_shiftvec(%rsp)
1259 movq nb410nf_p_ntype(%rbp),%rdi
1260 movl (%rdi),%edi
1261 movl %edi,nb410nf_ntype(%rsp)
1262 movq nb410nf_p_facel(%rbp),%rsi
1263 movss (%rsi),%xmm0
1264 movss %xmm0,nb410nf_facel(%rsp)
1266 movq nb410nf_p_gbtabscale(%rbp),%rbx
1267 movss (%rbx),%xmm4
1268 shufps $0,%xmm4,%xmm4
1269 movaps %xmm4,nb410nf_gbtsc(%rsp)
1272 ## create constant floating-point factors on stack
1273 movl $0x3f000000,%eax ## 0.5 in IEEE-754 single precision (hex)
1274 movl %eax,nb410nf_half(%rsp)
1275 movss nb410nf_half(%rsp),%xmm1
1276 shufps $0,%xmm1,%xmm1 ## splat to all elements
1277 movaps %xmm1,%xmm2
1278 addps %xmm2,%xmm2 ## one
1279 movaps %xmm2,%xmm3
1280 addps %xmm2,%xmm2 ## two
1281 addps %xmm2,%xmm3 ## three
1282 movaps %xmm1,nb410nf_half(%rsp)
1283 movaps %xmm3,nb410nf_three(%rsp)
1285 _nb_kernel410nf_x86_64_sse.nb410nf_threadloop:
1286 movq nb410nf_count(%rbp),%rsi ## pointer to sync counter
1287 movl (%rsi),%eax
1288 _nb_kernel410nf_x86_64_sse.nb410nf_spinlock:
1289 movl %eax,%ebx ## ebx=*count=nn0
1290 addl $1,%ebx ## ebx=nn1=nn0+1
1291 lock
1292 cmpxchgl %ebx,(%rsi) ## write nn1 to *counter,
1293 ## if it hasn't changed.
1294 ## or reread *counter to eax.
1295 pause ## -> better p4 performance
1296 jnz _nb_kernel410nf_x86_64_sse.nb410nf_spinlock
1298 ## if(nn1>nri) nn1=nri
1299 movl nb410nf_nri(%rsp),%ecx
1300 movl %ecx,%edx
1301 subl %ebx,%ecx
1302 cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri
1303 ## Cleared the spinlock if we got here.
1304 ## eax contains nn0, ebx contains nn1.
1305 movl %eax,nb410nf_n(%rsp)
1306 movl %ebx,nb410nf_nn1(%rsp)
1307 subl %eax,%ebx ## calc number of outer lists
1308 movl %eax,%esi ## copy n to esi
1309 jg _nb_kernel410nf_x86_64_sse.nb410nf_outerstart
1310 jmp _nb_kernel410nf_x86_64_sse.nb410nf_end
1312 _nb_kernel410nf_x86_64_sse.nb410nf_outerstart:
1313 ## ebx contains number of outer iterations
1314 addl nb410nf_nouter(%rsp),%ebx
1315 movl %ebx,nb410nf_nouter(%rsp)
1317 _nb_kernel410nf_x86_64_sse.nb410nf_outer:
1318 movq nb410nf_shift(%rsp),%rax ## rax = pointer into shift[]
1319 movl (%rax,%rsi,4),%ebx ## ebx=shift[n]
1321 lea (%rbx,%rbx,2),%rbx ## rbx=3*is
1322 movl %ebx,nb410nf_is3(%rsp) ## store is3
1324 movq nb410nf_shiftvec(%rsp),%rax ## rax = base of shiftvec[]
1326 movss (%rax,%rbx,4),%xmm0
1327 movss 4(%rax,%rbx,4),%xmm1
1328 movss 8(%rax,%rbx,4),%xmm2
1330 movq nb410nf_iinr(%rsp),%rcx ## rcx = pointer into iinr[]
1331 movl (%rcx,%rsi,4),%ebx ## ebx =ii
1333 movq nb410nf_charge(%rbp),%rdx
1334 movss (%rdx,%rbx,4),%xmm3
1335 mulss nb410nf_facel(%rsp),%xmm3
1336 shufps $0,%xmm3,%xmm3
1338 movq nb410nf_invsqrta(%rbp),%rdx ## load invsqrta[ii]
1339 movss (%rdx,%rbx,4),%xmm4
1340 shufps $0,%xmm4,%xmm4
1342 movq nb410nf_type(%rbp),%rdx
1343 movl (%rdx,%rbx,4),%edx
1344 imull nb410nf_ntype(%rsp),%edx
1345 shll %edx
1346 movl %edx,nb410nf_ntia(%rsp)
1348 lea (%rbx,%rbx,2),%rbx ## rbx = 3*ii=ii3
1349 movq nb410nf_pos(%rbp),%rax ## rax = base of pos[]
1351 addss (%rax,%rbx,4),%xmm0
1352 addss 4(%rax,%rbx,4),%xmm1
1353 addss 8(%rax,%rbx,4),%xmm2
1355 movaps %xmm3,nb410nf_iq(%rsp)
1356 movaps %xmm4,nb410nf_isai(%rsp)
1358 shufps $0,%xmm0,%xmm0
1359 shufps $0,%xmm1,%xmm1
1360 shufps $0,%xmm2,%xmm2
1362 movaps %xmm0,nb410nf_ix(%rsp)
1363 movaps %xmm1,nb410nf_iy(%rsp)
1364 movaps %xmm2,nb410nf_iz(%rsp)
1366 movl %ebx,nb410nf_ii3(%rsp)
1368 ## clear vctot
1369 xorps %xmm4,%xmm4
1370 movaps %xmm4,nb410nf_vctot(%rsp)
1371 movaps %xmm4,nb410nf_Vvdwtot(%rsp)
1373 movq nb410nf_jindex(%rsp),%rax
1374 movl (%rax,%rsi,4),%ecx ## jindex[n]
1375 movl 4(%rax,%rsi,4),%edx ## jindex[n+1]
1376 subl %ecx,%edx ## number of innerloop atoms
1378 movq nb410nf_pos(%rbp),%rsi
1379 movq nb410nf_faction(%rbp),%rdi
1380 movq nb410nf_jjnr(%rsp),%rax
1381 shll $2,%ecx
1382 addq %rcx,%rax
1383 movq %rax,nb410nf_innerjjnr(%rsp) ## pointer to jjnr[nj0]
1384 movl %edx,%ecx
1385 subl $4,%edx
1386 addl nb410nf_ninner(%rsp),%ecx
1387 movl %ecx,nb410nf_ninner(%rsp)
1388 addl $0,%edx
1389 movl %edx,nb410nf_innerk(%rsp) ## number of innerloop atoms
1390 jge _nb_kernel410nf_x86_64_sse.nb410nf_unroll_loop
1391 jmp _nb_kernel410nf_x86_64_sse.nb410nf_finish_inner
1392 _nb_kernel410nf_x86_64_sse.nb410nf_unroll_loop:
1393 ## quad-unroll innerloop here
1394 movq nb410nf_innerjjnr(%rsp),%rdx ## pointer to jjnr[k]
1395 movl (%rdx),%eax
1396 movl 4(%rdx),%ebx
1397 movl 8(%rdx),%ecx
1398 movl 12(%rdx),%edx ## eax-edx=jnr1-4
1399 addq $16,nb410nf_innerjjnr(%rsp) ## advance pointer (unrolled 4)
1401 ## load isa2
1402 movq nb410nf_invsqrta(%rbp),%rsi
1403 movss (%rsi,%rax,4),%xmm3
1404 movss (%rsi,%rcx,4),%xmm4
1405 movss (%rsi,%rbx,4),%xmm6
1406 movss (%rsi,%rdx,4),%xmm7
1407 movaps nb410nf_isai(%rsp),%xmm2
1408 shufps $0,%xmm6,%xmm3
1409 shufps $0,%xmm7,%xmm4
1410 shufps $136,%xmm4,%xmm3 ## 10001000 ;# all isa2 in xmm3
1411 mulps %xmm3,%xmm2
1413 movaps %xmm2,nb410nf_isaprod(%rsp)
1414 movaps %xmm2,%xmm1
1415 mulps nb410nf_gbtsc(%rsp),%xmm1
1416 movaps %xmm1,nb410nf_gbscale(%rsp)
1418 movq nb410nf_charge(%rbp),%rsi ## base of charge[]
1420 movss (%rsi,%rax,4),%xmm3
1421 movss (%rsi,%rcx,4),%xmm4
1422 movss (%rsi,%rbx,4),%xmm6
1423 movss (%rsi,%rdx,4),%xmm7
1425 mulps nb410nf_iq(%rsp),%xmm2
1426 shufps $0,%xmm6,%xmm3
1427 shufps $0,%xmm7,%xmm4
1428 shufps $136,%xmm4,%xmm3 ## 10001000 ;# all charges in xmm3
1429 mulps %xmm2,%xmm3
1430 movaps %xmm3,nb410nf_qq(%rsp)
1432 movd %eax,%mm0
1433 movd %ebx,%mm1
1434 movd %ecx,%mm2
1435 movd %edx,%mm3
1437 movq nb410nf_type(%rbp),%rsi
1438 movl (%rsi,%rax,4),%eax
1439 movl (%rsi,%rbx,4),%ebx
1440 movl (%rsi,%rcx,4),%ecx
1441 movl (%rsi,%rdx,4),%edx
1442 movq nb410nf_vdwparam(%rbp),%rsi
1443 shll %eax
1444 shll %ebx
1445 shll %ecx
1446 shll %edx
1447 movl nb410nf_ntia(%rsp),%edi
1448 addl %edi,%eax
1449 addl %edi,%ebx
1450 addl %edi,%ecx
1451 addl %edi,%edx
1453 movlps (%rsi,%rax,4),%xmm6
1454 movlps (%rsi,%rcx,4),%xmm7
1455 movhps (%rsi,%rbx,4),%xmm6
1456 movhps (%rsi,%rdx,4),%xmm7
1458 movaps %xmm6,%xmm4
1459 shufps $136,%xmm7,%xmm4 ## 10001000
1460 shufps $221,%xmm7,%xmm6 ## 11011101
1462 movd %mm0,%eax
1463 movd %mm1,%ebx
1464 movd %mm2,%ecx
1465 movd %mm3,%edx
1467 movaps %xmm4,nb410nf_c6(%rsp)
1468 movaps %xmm6,nb410nf_c12(%rsp)
1470 movq nb410nf_pos(%rbp),%rsi ## base of pos[]
1472 lea (%rax,%rax,2),%rax ## replace jnr with j3
1473 lea (%rbx,%rbx,2),%rbx
1475 lea (%rcx,%rcx,2),%rcx ## replace jnr with j3
1476 lea (%rdx,%rdx,2),%rdx
1478 ## move four coordinates to xmm0-xmm2
1480 movlps (%rsi,%rax,4),%xmm4
1481 movlps (%rsi,%rcx,4),%xmm5
1482 movss 8(%rsi,%rax,4),%xmm2
1483 movss 8(%rsi,%rcx,4),%xmm6
1485 movhps (%rsi,%rbx,4),%xmm4
1486 movhps (%rsi,%rdx,4),%xmm5
1488 movss 8(%rsi,%rbx,4),%xmm0
1489 movss 8(%rsi,%rdx,4),%xmm1
1491 shufps $0,%xmm0,%xmm2
1492 shufps $0,%xmm1,%xmm6
1494 movaps %xmm4,%xmm0
1495 movaps %xmm4,%xmm1
1497 shufps $136,%xmm6,%xmm2 ## 10001000
1499 shufps $136,%xmm5,%xmm0 ## 10001000
1500 shufps $221,%xmm5,%xmm1 ## 11011101
1502 ## move ix-iz to xmm4-xmm6
1503 movaps nb410nf_ix(%rsp),%xmm4
1504 movaps nb410nf_iy(%rsp),%xmm5
1505 movaps nb410nf_iz(%rsp),%xmm6
1507 ## calc dr
1508 subps %xmm0,%xmm4
1509 subps %xmm1,%xmm5
1510 subps %xmm2,%xmm6
1512 ## square it
1513 mulps %xmm4,%xmm4
1514 mulps %xmm5,%xmm5
1515 mulps %xmm6,%xmm6
1516 addps %xmm5,%xmm4
1517 addps %xmm6,%xmm4
1518 ## rsq in xmm4
1520 rsqrtps %xmm4,%xmm5
1521 ## lookup seed in xmm5
1522 movaps %xmm5,%xmm2
1523 mulps %xmm5,%xmm5
1524 movaps nb410nf_three(%rsp),%xmm1
1525 mulps %xmm4,%xmm5 ## rsq*lu*lu
1526 movaps nb410nf_half(%rsp),%xmm0
1527 subps %xmm5,%xmm1 ## 3.0-rsq*lu*lu
1528 mulps %xmm2,%xmm1
1529 mulps %xmm1,%xmm0 ## xmm0=rinv
1530 mulps %xmm0,%xmm4 ## xmm4=r
1531 mulps nb410nf_gbscale(%rsp),%xmm4
1533 movhlps %xmm4,%xmm5
1534 cvttps2pi %xmm4,%mm6
1535 cvttps2pi %xmm5,%mm7 ## mm6/mm7 contain lu indices
1536 cvtpi2ps %mm6,%xmm6
1537 cvtpi2ps %mm7,%xmm5
1538 movlhps %xmm5,%xmm6
1539 subps %xmm6,%xmm4
1540 movaps %xmm4,%xmm1 ## xmm1=eps
1541 movaps %xmm1,%xmm2
1542 mulps %xmm2,%xmm2 ## xmm2=eps2
1543 pslld $2,%mm6
1544 pslld $2,%mm7
1546 movd %eax,%mm0
1547 movd %ebx,%mm1
1548 movd %ecx,%mm2
1549 movd %edx,%mm3
1551 movq nb410nf_GBtab(%rbp),%rsi
1552 movd %mm6,%eax
1553 psrlq $32,%mm6
1554 movd %mm7,%ecx
1555 psrlq $32,%mm7
1556 movd %mm6,%ebx
1557 movd %mm7,%edx
1559 ## load coulomb table
1560 movaps (%rsi,%rax,4),%xmm4
1561 movaps (%rsi,%rbx,4),%xmm5
1562 movaps (%rsi,%rcx,4),%xmm6
1563 movaps (%rsi,%rdx,4),%xmm7
1564 ## transpose, using xmm3 for scratch
1565 movaps %xmm6,%xmm3
1566 shufps $0xEE,%xmm7,%xmm3
1567 shufps $0x44,%xmm7,%xmm6
1568 movaps %xmm4,%xmm7
1569 shufps $0xEE,%xmm5,%xmm7
1570 shufps $0x44,%xmm5,%xmm4
1571 movaps %xmm4,%xmm5
1572 shufps $0xDD,%xmm6,%xmm5
1573 shufps $0x88,%xmm6,%xmm4
1574 movaps %xmm7,%xmm6
1575 shufps $0x88,%xmm3,%xmm6
1576 shufps $0xDD,%xmm3,%xmm7
1577 ## coulomb table ready, in xmm4-xmm7
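## the four 16-byte table entries were transposed with shufps so that
## xmm4 = Y1..Y4, xmm5 = F1..F4, xmm6 = G1..G4, xmm7 = H1..H4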
1578 mulps %xmm1,%xmm6 ## xmm6=Geps
1579 mulps %xmm2,%xmm7 ## xmm7=Heps2
1581 addps %xmm6,%xmm5
1582 addps %xmm7,%xmm5 ## xmm5=Fp
1583 movaps nb410nf_qq(%rsp),%xmm3
1584 mulps %xmm1,%xmm5 ## xmm5=eps*Fp
1585 addps %xmm4,%xmm5 ## xmm5=VV
1586 mulps %xmm3,%xmm5 ## vcoul=qq*VV
1587 ## update vctot
1588 addps nb410nf_vctot(%rsp),%xmm5
1589 movaps %xmm5,nb410nf_vctot(%rsp)
1591 ## L-J
1592 movaps %xmm0,%xmm4
1593 mulps %xmm0,%xmm4 ## xmm4=rinvsq
1595 movaps %xmm4,%xmm6
1596 mulps %xmm4,%xmm6
1598 mulps %xmm4,%xmm6 ## xmm6=rinvsix
1599 movaps %xmm6,%xmm4
1600 mulps %xmm4,%xmm4 ## xmm4=rinvtwelve
1601 mulps nb410nf_c6(%rsp),%xmm6
1602 mulps nb410nf_c12(%rsp),%xmm4
1603 movaps nb410nf_Vvdwtot(%rsp),%xmm7
1604 addps %xmm4,%xmm7
1605 subps %xmm6,%xmm7
1606 movaps %xmm7,nb410nf_Vvdwtot(%rsp)
1608 ## should we do one more iteration?
1609 subl $4,nb410nf_innerk(%rsp)
1610 jl _nb_kernel410nf_x86_64_sse.nb410nf_finish_inner
1611 jmp _nb_kernel410nf_x86_64_sse.nb410nf_unroll_loop
1612 _nb_kernel410nf_x86_64_sse.nb410nf_finish_inner:
1613 ## check if at least two particles remain
1614 addl $4,nb410nf_innerk(%rsp)
1615 movl nb410nf_innerk(%rsp),%edx
1616 andl $2,%edx
1617 jnz _nb_kernel410nf_x86_64_sse.nb410nf_dopair
1618 jmp _nb_kernel410nf_x86_64_sse.nb410nf_checksingle
1619 _nb_kernel410nf_x86_64_sse.nb410nf_dopair:
1620 movq nb410nf_innerjjnr(%rsp),%rcx
1621 movl (%rcx),%eax
1622 movl 4(%rcx),%ebx
1623 addq $8,nb410nf_innerjjnr(%rsp)
1625 xorps %xmm2,%xmm2
1626 movaps %xmm2,%xmm6
1628 ## load isa2
1629 movq nb410nf_invsqrta(%rbp),%rsi
1630 movss (%rsi,%rax,4),%xmm2
1631 movss (%rsi,%rbx,4),%xmm3
1632 unpcklps %xmm3,%xmm2 ## isa2 in xmm2(0,1)
1633 mulps nb410nf_isai(%rsp),%xmm2
1634 movaps %xmm2,nb410nf_isaprod(%rsp)
1635 movaps %xmm2,%xmm1
1636 mulps nb410nf_gbtsc(%rsp),%xmm1
1637 movaps %xmm1,nb410nf_gbscale(%rsp)
1639 movq nb410nf_charge(%rbp),%rsi ## base of charge[]
1640 movss (%rsi,%rax,4),%xmm3
1641 movss (%rsi,%rbx,4),%xmm6
1642 unpcklps %xmm6,%xmm3 ## 00001000 ;# xmm3(0,1) has the charges
1644 mulps nb410nf_iq(%rsp),%xmm2
1645 mulps %xmm2,%xmm3
1646 movaps %xmm3,nb410nf_qq(%rsp)
1648 movq nb410nf_type(%rbp),%rsi
1649 movl %eax,%ecx
1650 movl %ebx,%edx
1651 movl (%rsi,%rcx,4),%ecx
1652 movl (%rsi,%rdx,4),%edx
1653 movq nb410nf_vdwparam(%rbp),%rsi
1654 shll %ecx
1655 shll %edx
1656 movl nb410nf_ntia(%rsp),%edi
1657 addl %edi,%ecx
1658 addl %edi,%edx
1659 movlps (%rsi,%rcx,4),%xmm6
1660 movhps (%rsi,%rdx,4),%xmm6
1661 movq nb410nf_pos(%rbp),%rdi
1663 movaps %xmm6,%xmm4
1664 shufps $8,%xmm4,%xmm4 ## 00001000
1665 shufps $13,%xmm6,%xmm6 ## 00001101
1666 movlhps %xmm7,%xmm4
1667 movlhps %xmm7,%xmm6
1669 movaps %xmm4,nb410nf_c6(%rsp)
1670 movaps %xmm6,nb410nf_c12(%rsp)
1672 lea (%rax,%rax,2),%rax
1673 lea (%rbx,%rbx,2),%rbx
1674 ## move coordinates to xmm0-xmm2
1675 movlps (%rdi,%rax,4),%xmm1
1676 movss 8(%rdi,%rax,4),%xmm2
1677 movhps (%rdi,%rbx,4),%xmm1
1678 movss 8(%rdi,%rbx,4),%xmm0
1680 movlhps %xmm7,%xmm3
1682 shufps $0,%xmm0,%xmm2
1684 movaps %xmm1,%xmm0
1686 shufps $136,%xmm2,%xmm2 ## 10001000
1688 shufps $136,%xmm0,%xmm0 ## 10001000
1689 shufps $221,%xmm1,%xmm1 ## 11011101
1691 movq nb410nf_faction(%rbp),%rdi
1692 ## move ix-iz to xmm4-xmm6
1693 xorps %xmm7,%xmm7
1695 movaps nb410nf_ix(%rsp),%xmm4
1696 movaps nb410nf_iy(%rsp),%xmm5
1697 movaps nb410nf_iz(%rsp),%xmm6
1699 ## calc dr
1700 subps %xmm0,%xmm4
1701 subps %xmm1,%xmm5
1702 subps %xmm2,%xmm6
1704 ## square it
1705 mulps %xmm4,%xmm4
1706 mulps %xmm5,%xmm5
1707 mulps %xmm6,%xmm6
1708 addps %xmm5,%xmm4
1709 addps %xmm6,%xmm4
1710 ## rsq in xmm4
1712 rsqrtps %xmm4,%xmm5
1713 ## lookup seed in xmm5
1714 movaps %xmm5,%xmm2
1715 mulps %xmm5,%xmm5
1716 movaps nb410nf_three(%rsp),%xmm1
1717 mulps %xmm4,%xmm5 ## rsq*lu*lu
1718 movaps nb410nf_half(%rsp),%xmm0
1719 subps %xmm5,%xmm1 ## 3.0-rsq*lu*lu
1720 mulps %xmm2,%xmm1
1721 mulps %xmm1,%xmm0 ## xmm0=rinv
1722 mulps %xmm0,%xmm4 ## xmm4=r
1723 mulps nb410nf_gbscale(%rsp),%xmm4
1725 cvttps2pi %xmm4,%mm6 ## mm6 contain lu indices
1726 cvtpi2ps %mm6,%xmm6
1727 subps %xmm6,%xmm4
1728 movaps %xmm4,%xmm1 ## xmm1=eps
1729 movaps %xmm1,%xmm2
1730 mulps %xmm2,%xmm2 ## xmm2=eps2
1732 pslld $2,%mm6
1734 movq nb410nf_GBtab(%rbp),%rsi
1735 movd %mm6,%ecx
1736 psrlq $32,%mm6
1737 movd %mm6,%edx
1739 ## load coulomb table
1740 movaps (%rsi,%rcx,4),%xmm4
1741 movaps (%rsi,%rdx,4),%xmm7
1742 ## transpose, using xmm3 for scratch
1743 movaps %xmm4,%xmm6
1744 unpcklps %xmm7,%xmm4 ## Y1 Y2 F1 F2
1745 unpckhps %xmm7,%xmm6 ## G1 G2 H1 H2
1746 movhlps %xmm4,%xmm5 ## F1 F2
1747 movhlps %xmm6,%xmm7 ## H1 H2
1748 ## coulomb table ready, in xmm4-xmm7
1750 mulps %xmm1,%xmm6 ## xmm6=Geps
1751 mulps %xmm2,%xmm7 ## xmm7=Heps2
1752 addps %xmm6,%xmm5
1753 addps %xmm7,%xmm5 ## xmm5=Fp
1754 movaps nb410nf_qq(%rsp),%xmm3
1755 mulps %xmm1,%xmm5 ## xmm5=eps*Fp
1756 addps %xmm4,%xmm5 ## xmm5=VV
1757 mulps %xmm3,%xmm5 ## vcoul=qq*VV
1759 addps nb410nf_vctot(%rsp),%xmm5
1760 movaps %xmm5,nb410nf_vctot(%rsp)
1762 ## L-J
1763 movaps %xmm0,%xmm4
1764 mulps %xmm0,%xmm4 ## xmm4=rinvsq
1766 ## vcoul has already been added to vctot above; only the LJ energy remains
1770 movaps %xmm4,%xmm6
1771 mulps %xmm4,%xmm6
1773 mulps %xmm4,%xmm6 ## xmm6=rinvsix
1774 movaps %xmm6,%xmm4
1775 mulps %xmm4,%xmm4 ## xmm4=rinvtwelve
1776 mulps nb410nf_c6(%rsp),%xmm6
1777 mulps nb410nf_c12(%rsp),%xmm4
1778 movaps nb410nf_Vvdwtot(%rsp),%xmm7
1779 addps %xmm4,%xmm7
1780 subps %xmm6,%xmm7
1781 movaps %xmm7,nb410nf_Vvdwtot(%rsp)
1783 _nb_kernel410nf_x86_64_sse.nb410nf_checksingle:
1784 movl nb410nf_innerk(%rsp),%edx
1785 andl $1,%edx
1786 jnz _nb_kernel410nf_x86_64_sse.nb410nf_dosingle
1787 jmp _nb_kernel410nf_x86_64_sse.nb410nf_updateouterdata
1788 _nb_kernel410nf_x86_64_sse.nb410nf_dosingle:
1789 movq nb410nf_charge(%rbp),%rsi
1790 movq nb410nf_invsqrta(%rbp),%rdx
1791 movq nb410nf_pos(%rbp),%rdi
1792 movq nb410nf_innerjjnr(%rsp),%rcx
1793 movl (%rcx),%eax
1794 xorps %xmm2,%xmm2
1795 movaps %xmm2,%xmm6
1796 movss (%rdx,%rax,4),%xmm2 ## isa2
1797 mulss nb410nf_isai(%rsp),%xmm2
1798 movss %xmm2,nb410nf_isaprod(%rsp)
1799 movss %xmm2,%xmm1
1800 mulss nb410nf_gbtsc(%rsp),%xmm1
1801 movss %xmm1,nb410nf_gbscale(%rsp)
1803 mulss nb410nf_iq(%rsp),%xmm2
1804 movss (%rsi,%rax,4),%xmm6 ## xmm6(0) has the charge
1805 mulss %xmm2,%xmm6
1806 movss %xmm6,nb410nf_qq(%rsp)
1808 movq nb410nf_type(%rbp),%rsi
1809 movl %eax,%ecx
1810 movl (%rsi,%rcx,4),%ecx
1811 movq nb410nf_vdwparam(%rbp),%rsi
1812 shll %ecx
1813 addl nb410nf_ntia(%rsp),%ecx
1814 movlps (%rsi,%rcx,4),%xmm6
1815 movaps %xmm6,%xmm4
1816 shufps $252,%xmm4,%xmm4 ## 11111100
1817 shufps $253,%xmm6,%xmm6 ## 11111101
1819 movaps %xmm4,nb410nf_c6(%rsp)
1820 movaps %xmm6,nb410nf_c12(%rsp)
1822 lea (%rax,%rax,2),%rax
1824 ## move coordinates to xmm0-xmm2
1825 movss (%rdi,%rax,4),%xmm0
1826 movss 4(%rdi,%rax,4),%xmm1
1827 movss 8(%rdi,%rax,4),%xmm2
1829 movaps nb410nf_ix(%rsp),%xmm4
1830 movaps nb410nf_iy(%rsp),%xmm5
1831 movaps nb410nf_iz(%rsp),%xmm6
1833 ## calc dr
1834 subss %xmm0,%xmm4
1835 subss %xmm1,%xmm5
1836 subss %xmm2,%xmm6
1838 ## square it
1839 mulss %xmm4,%xmm4
1840 mulss %xmm5,%xmm5
1841 mulss %xmm6,%xmm6
1842 addss %xmm5,%xmm4
1843 addss %xmm6,%xmm4
1844 ## rsq in xmm4
1846 rsqrtss %xmm4,%xmm5
1847 ## lookup seed in xmm5
1848 movaps %xmm5,%xmm2
1849 mulss %xmm5,%xmm5
1850 movss nb410nf_three(%rsp),%xmm1
1851 mulss %xmm4,%xmm5 ## rsq*lu*lu
1852 movss nb410nf_half(%rsp),%xmm0
1853 subss %xmm5,%xmm1 ## 3.0-rsq*lu*lu
1854 mulss %xmm2,%xmm1
1855 mulss %xmm1,%xmm0 ## xmm0=rinv
1857 mulss %xmm0,%xmm4 ## xmm4=r
1858 mulss nb410nf_gbscale(%rsp),%xmm4
1860 cvttss2si %xmm4,%ebx ## ebx contains lu index
1861 cvtsi2ss %ebx,%xmm6
1862 subss %xmm6,%xmm4
1863 movaps %xmm4,%xmm1 ## xmm1=eps
1864 movaps %xmm1,%xmm2
1865 mulss %xmm2,%xmm2 ## xmm2=eps2
1867 shll $2,%ebx
1868 movq nb410nf_GBtab(%rbp),%rsi
1870 movaps (%rsi,%rbx,4),%xmm4
1871 movhlps %xmm4,%xmm6
1872 movaps %xmm4,%xmm5
1873 movaps %xmm6,%xmm7
1874 shufps $1,%xmm5,%xmm5
1875 shufps $1,%xmm7,%xmm7
1876 ## table ready in xmm4-xmm7
1878 mulss %xmm1,%xmm6 ## xmm6=Geps
1879 mulss %xmm2,%xmm7 ## xmm7=Heps2
1880 addss %xmm6,%xmm5
1881 addss %xmm7,%xmm5 ## xmm5=Fp
1882 movss nb410nf_qq(%rsp),%xmm3
1883 mulss %xmm1,%xmm5 ## xmm5=eps*Fp
1884 addss %xmm4,%xmm5 ## xmm5=VV
1885 mulss %xmm3,%xmm5 ## vcoul=qq*VV
1886 addss nb410nf_vctot(%rsp),%xmm5
1887 movss %xmm5,nb410nf_vctot(%rsp)
1889 ## L-J
1890 movaps %xmm0,%xmm4
1891 mulss %xmm0,%xmm4 ## xmm4=rinvsq
1893 movaps %xmm4,%xmm6
1894 mulss %xmm4,%xmm6
1896 mulss %xmm4,%xmm6 ## xmm6=rinvsix
1897 movaps %xmm6,%xmm4
1898 mulss %xmm4,%xmm4 ## xmm4=rinvtwelve
1899 mulss nb410nf_c6(%rsp),%xmm6
1900 mulss nb410nf_c12(%rsp),%xmm4
1901 movss nb410nf_Vvdwtot(%rsp),%xmm7
1902 addps %xmm4,%xmm7
1903 subps %xmm6,%xmm7
1904 movss %xmm7,nb410nf_Vvdwtot(%rsp)
1906 _nb_kernel410nf_x86_64_sse.nb410nf_updateouterdata:
1907 ## get n from stack
1908 movl nb410nf_n(%rsp),%esi
1909 ## get group index for i particle
1910 movq nb410nf_gid(%rbp),%rdx ## base of gid[]
1911 movl (%rdx,%rsi,4),%edx ## ggid=gid[n]
1913 ## accumulate total potential energy and update it
1914 movaps nb410nf_vctot(%rsp),%xmm7
1915 ## accumulate
1916 movhlps %xmm7,%xmm6
1917 addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now
1918 movaps %xmm7,%xmm6
1919 shufps $1,%xmm6,%xmm6
1920 addss %xmm6,%xmm7
1922 ## add earlier value from mem
1923 movq nb410nf_Vc(%rbp),%rax
1924 addss (%rax,%rdx,4),%xmm7
1925 ## move back to mem
1926 movss %xmm7,(%rax,%rdx,4)
1928 ## accumulate total lj energy and update it
1929 movaps nb410nf_Vvdwtot(%rsp),%xmm7
1930 ## accumulate
1931 movhlps %xmm7,%xmm6
1932 addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now
1933 movaps %xmm7,%xmm6
1934 shufps $1,%xmm6,%xmm6
1935 addss %xmm6,%xmm7
1937 ## add earlier value from mem
1938 movq nb410nf_Vvdw(%rbp),%rax
1939 addss (%rax,%rdx,4),%xmm7
1940 ## move back to mem
1941 movss %xmm7,(%rax,%rdx,4)
1943 ## finish if last
1944 movl nb410nf_nn1(%rsp),%ecx
1945 ## esi already loaded with n
1946 incl %esi
1947 subl %esi,%ecx
1948 jz _nb_kernel410nf_x86_64_sse.nb410nf_outerend
1950 ## not last, iterate outer loop once more!
1951 movl %esi,nb410nf_n(%rsp)
1952 jmp _nb_kernel410nf_x86_64_sse.nb410nf_outer
1953 _nb_kernel410nf_x86_64_sse.nb410nf_outerend:
1954 ## check if more outer neighborlists remain
1955 movl nb410nf_nri(%rsp),%ecx
1956 ## esi already loaded with n above
1957 subl %esi,%ecx
1958 jz _nb_kernel410nf_x86_64_sse.nb410nf_end
1959 ## non-zero, do one more workunit
1960 jmp _nb_kernel410nf_x86_64_sse.nb410nf_threadloop
1961 _nb_kernel410nf_x86_64_sse.nb410nf_end:
1963 movl nb410nf_nouter(%rsp),%eax
1964 movl nb410nf_ninner(%rsp),%ebx
1965 movq nb410nf_outeriter(%rbp),%rcx
1966 movq nb410nf_inneriter(%rbp),%rdx
1967 movl %eax,(%rcx)
1968 movl %ebx,(%rdx)
1970 addq $360,%rsp
1971 emms
1974 pop %r15
1975 pop %r14
1976 pop %r13
1977 pop %r12
1979 pop %rbx
1980 pop %rbp