Renamed intel syntax assembly files to avoid double extensions
[gromacs.git] / src / gmxlib / nonbonded / nb_kernel_ia32_sse / nb_kernel410_ia32_sse.s
blob0c05ad5c914734317f7f8f1e9ddc0436a34f3bb0
1 ##
2 ##
3 ## Gromacs 4.0 Copyright (c) 1991-2003
4 ## David van der Spoel, Erik Lindahl
5 ##
6 ## This program is free software; you can redistribute it and/or
7 ## modify it under the terms of the GNU General Public License
8 ## as published by the Free Software Foundation; either version 2
9 ## of the License, or (at your option) any later version.
11 ## To help us fund GROMACS development, we humbly ask that you cite
12 ## the research papers on the package. Check out http://www.gromacs.org
13 ##
14 ## And Hey:
15 ## Gnomes, ROck Monsters And Chili Sauce
20 .globl nb_kernel410_ia32_sse
21 .globl _nb_kernel410_ia32_sse
22 nb_kernel410_ia32_sse:
23 _nb_kernel410_ia32_sse:
## Entry point of the nb410 kernel, exported with and without a leading
## underscore. ia32, AT&T syntax. Arguments are read from the caller's
## stack through %ebp; locals live in a 16-byte aligned frame at %esp.
## The inner loops below combine a tabulated Coulomb term (GBtab) with a
## Lennard-Jones term (c6/c12) and accumulate forces, energies and dvda.
##
## Offsets of the stack arguments relative to %ebp (first argument at 8).
## The nb410_p_* arguments used in this prologue are pointers that are
## dereferenced once and copied into the local frame.
24 .set nb410_p_nri, 8
25 .set nb410_iinr, 12
26 .set nb410_jindex, 16
27 .set nb410_jjnr, 20
28 .set nb410_shift, 24
29 .set nb410_shiftvec, 28
30 .set nb410_fshift, 32
31 .set nb410_gid, 36
32 .set nb410_pos, 40
33 .set nb410_faction, 44
34 .set nb410_charge, 48
35 .set nb410_p_facel, 52
36 .set nb410_argkrf, 56
37 .set nb410_argcrf, 60
38 .set nb410_Vc, 64
39 .set nb410_type, 68
40 .set nb410_p_ntype, 72
41 .set nb410_vdwparam, 76
42 .set nb410_Vvdw, 80
43 .set nb410_p_tabscale, 84
44 .set nb410_VFtab, 88
45 .set nb410_invsqrta, 92
46 .set nb410_dvda, 96
47 .set nb410_p_gbtabscale, 100
48 .set nb410_GBtab, 104
49 .set nb410_p_nthreads, 108
50 .set nb410_count, 112
51 .set nb410_mtx, 116
52 .set nb410_outeriter, 120
53 .set nb410_inneriter, 124
54 .set nb410_work, 128
55 ## stack offsets for local variables (relative to the aligned %esp)
56 ## %esp is rounded down to a 16-byte boundary in the prologue so that movaps can be used on the 16-byte slots (offsets 0-416)
57 .set nb410_ix, 0
58 .set nb410_iy, 16
59 .set nb410_iz, 32
60 .set nb410_iq, 48
61 .set nb410_dx, 64
62 .set nb410_dy, 80
63 .set nb410_dz, 96
64 .set nb410_two, 112
65 .set nb410_six, 128
66 .set nb410_twelve, 144
67 .set nb410_gbtsc, 160
68 .set nb410_qq, 176
69 .set nb410_c6, 192
70 .set nb410_c12, 208
71 .set nb410_fscal, 224
72 .set nb410_vctot, 240
73 .set nb410_Vvdwtot, 256
74 .set nb410_fix, 272
75 .set nb410_fiy, 288
76 .set nb410_fiz, 304
77 .set nb410_half, 320
78 .set nb410_three, 336
79 .set nb410_r, 352
80 .set nb410_isai, 368
81 .set nb410_isaprod, 384
82 .set nb410_dvdasum, 400
83 .set nb410_gbscale, 416
## 4-byte scalar slots start here
84 .set nb410_is3, 432
85 .set nb410_ii3, 436
86 .set nb410_ii, 440
87 .set nb410_ntia, 444
88 .set nb410_innerjjnr, 448
89 .set nb410_innerk, 452
90 .set nb410_n, 456
91 .set nb410_nn1, 460
92 .set nb410_jnra, 464
93 .set nb410_jnrb, 468
94 .set nb410_jnrc, 472
95 .set nb410_jnrd, 476
96 .set nb410_nri, 480
97 .set nb410_facel, 484
98 .set nb410_ntype, 488
99 .set nb410_nouter, 492
100 .set nb410_ninner, 496
101 .set nb410_salign, 500
## Prologue: set up %ebp as frame pointer and save all six general
## registers; nb410_end pops them in the reverse order.
102 pushl %ebp
103 movl %esp,%ebp
104 pushl %eax
105 pushl %ebx
106 pushl %ecx
107 pushl %edx
108 pushl %esi
109 pushl %edi
110 subl $504,%esp ## local stack space
111 movl %esp,%eax
112 andl $0xf,%eax ## eax = misalignment of esp (0-15 bytes)
113 subl %eax,%esp ## round esp down to a multiple of 16
114 movl %eax,nb410_salign(%esp) ## remember the adjustment; the epilogue adds it back
116 emms ## reset MMX state; mm0-mm7 are used as scratch in the inner loops
118 ## Move args passed by reference to stack
119 movl nb410_p_nri(%ebp),%ecx
120 movl nb410_p_facel(%ebp),%esi
121 movl nb410_p_ntype(%ebp),%edi
122 movl (%ecx),%ecx
123 movl (%esi),%esi ## 32-bit pattern of *p_facel, copied as-is (used later by mulss)
124 movl (%edi),%edi
125 movl %ecx,nb410_nri(%esp)
126 movl %esi,nb410_facel(%esp)
127 movl %edi,nb410_ntype(%esp)
129 ## zero iteration counters
130 movl $0,%eax
131 movl %eax,nb410_nouter(%esp)
132 movl %eax,nb410_ninner(%esp)
## broadcast *p_gbtabscale to all four lanes of the gbtsc slot
135 movl nb410_p_gbtabscale(%ebp),%eax
136 movss (%eax),%xmm5
137 shufps $0,%xmm5,%xmm5
138 movaps %xmm5,nb410_gbtsc(%esp)
140 ## create constant floating-point factors on stack
141 movl $0x3f000000,%eax ## constant 0.5 in IEEE (hex)
142 movl %eax,nb410_half(%esp)
143 movss nb410_half(%esp),%xmm1
144 shufps $0,%xmm1,%xmm1 ## splat to all elements
145 movaps %xmm1,%xmm2
146 addps %xmm2,%xmm2 ## constant 1.0
147 movaps %xmm2,%xmm3
148 addps %xmm2,%xmm2 ## constant 2.0
149 addps %xmm2,%xmm3 ## constant 3.0
150 movaps %xmm3,%xmm4
151 addps %xmm4,%xmm4 ## 6.0
152 movaps %xmm4,%xmm5
153 addps %xmm5,%xmm5 ## constant 12.0
154 movaps %xmm1,nb410_half(%esp)
155 movaps %xmm2,nb410_two(%esp)
156 movaps %xmm3,nb410_three(%esp)
157 movaps %xmm4,nb410_six(%esp)
158 movaps %xmm5,nb410_twelve(%esp)
160 _nb_kernel410_ia32_sse.nb410_threadloop:
## Claim the next unit of outer-loop work by atomically incrementing the
## counter that the count argument points to.
161 movl nb410_count(%ebp),%esi ## pointer to sync counter
162 movl (%esi),%eax ## eax = current *count (candidate nn0)
163 _nb_kernel410_ia32_sse.nb410_spinlock:
164 movl %eax,%ebx ## ebx=*count=nn0
165 addl $1,%ebx ## ebx=nn1=nn0+1
166 lock
167 cmpxchgl %ebx,(%esi) ## write nn1 to *counter,
168 ## if it hasnt changed.
169 ## or reread *counter to eax.
170 pause ## -> better p4 performance
171 jnz _nb_kernel410_ia32_sse.nb410_spinlock ## ZF clear: *count changed under us, retry with the value now in eax
173 ## if(nn1>nri) nn1=nri
174 movl nb410_nri(%esp),%ecx
175 movl %ecx,%edx
176 subl %ebx,%ecx ## flags from nri-nn1
177 cmovlel %edx,%ebx ## if(nn1>nri) nn1=nri
178 ## Cleared the spinlock if we got here.
179 ## eax contains nn0, ebx contains nn1.
180 movl %eax,nb410_n(%esp)
181 movl %ebx,nb410_nn1(%esp)
182 subl %eax,%ebx ## calc number of outer lists
183 movl %eax,%esi ## copy n to esi
184 jg _nb_kernel410_ia32_sse.nb410_outerstart ## movl leaves the flags of the subl intact: nn1-nn0 > 0
185 jmp _nb_kernel410_ia32_sse.nb410_end ## nothing left to do for this thread
187 _nb_kernel410_ia32_sse.nb410_outerstart:
188 ## ebx contains number of outer iterations
189 addl nb410_nouter(%esp),%ebx
190 movl %ebx,nb410_nouter(%esp)
192 _nb_kernel410_ia32_sse.nb410_outer:
## Per-i-atom setup. On entry %esi = n, the index of the outer list entry.
## Stores the shifted i coordinates, scaled i charge and invsqrta[ii]
## broadcast to all four lanes, clears the per-i accumulators and sets up
## the pointer and counter for the inner loop over j atoms.
193 movl nb410_shift(%ebp),%eax ## eax = pointer into shift[]
194 movl (%eax,%esi,4),%ebx ## ebx=shift[n]
196 leal (%ebx,%ebx,2),%ebx ## ebx=3*is
197 movl %ebx,nb410_is3(%esp) ## store is3
199 movl nb410_shiftvec(%ebp),%eax ## eax = base of shiftvec[]
201 movss (%eax,%ebx,4),%xmm0 ## xmm0-xmm2 = shift vector x,y,z
202 movss 4(%eax,%ebx,4),%xmm1
203 movss 8(%eax,%ebx,4),%xmm2
205 movl nb410_iinr(%ebp),%ecx ## ecx = pointer into iinr[]
206 movl (%ecx,%esi,4),%ebx ## ebx =ii
207 movl %ebx,nb410_ii(%esp)
209 movl nb410_charge(%ebp),%edx
210 movss (%edx,%ebx,4),%xmm3
211 mulss nb410_facel(%esp),%xmm3 ## iq = facel*charge[ii]
212 shufps $0,%xmm3,%xmm3
214 movl nb410_invsqrta(%ebp),%edx ## load invsqrta[ii]
215 movss (%edx,%ebx,4),%xmm4
216 shufps $0,%xmm4,%xmm4
218 movl nb410_type(%ebp),%edx
219 movl (%edx,%ebx,4),%edx
220 imull nb410_ntype(%esp),%edx
221 shll %edx ## ntia = 2*ntype*type[ii]
222 movl %edx,nb410_ntia(%esp)
224 leal (%ebx,%ebx,2),%ebx ## ebx = 3*ii=ii3
225 movl nb410_pos(%ebp),%eax ## eax = base of pos[]
227 addss (%eax,%ebx,4),%xmm0 ## i position plus shift vector
228 addss 4(%eax,%ebx,4),%xmm1
229 addss 8(%eax,%ebx,4),%xmm2
231 movaps %xmm3,nb410_iq(%esp)
232 movaps %xmm4,nb410_isai(%esp)
234 shufps $0,%xmm0,%xmm0
235 shufps $0,%xmm1,%xmm1
236 shufps $0,%xmm2,%xmm2
238 movaps %xmm0,nb410_ix(%esp)
239 movaps %xmm1,nb410_iy(%esp)
240 movaps %xmm2,nb410_iz(%esp)
242 movl %ebx,nb410_ii3(%esp)
244 ## clear vctot, Vvdwtot, dvdasum and i forces
245 xorps %xmm4,%xmm4
246 movaps %xmm4,nb410_vctot(%esp)
247 movaps %xmm4,nb410_Vvdwtot(%esp)
248 movaps %xmm4,nb410_dvdasum(%esp)
249 movaps %xmm4,nb410_fix(%esp)
250 movaps %xmm4,nb410_fiy(%esp)
251 movaps %xmm4,nb410_fiz(%esp)
253 movl nb410_jindex(%ebp),%eax
254 movl (%eax,%esi,4),%ecx ## jindex[n]
255 movl 4(%eax,%esi,4),%edx ## jindex[n+1]
256 subl %ecx,%edx ## number of innerloop atoms
258 movl nb410_pos(%ebp),%esi
259 movl nb410_faction(%ebp),%edi
260 movl nb410_jjnr(%ebp),%eax
261 shll $2,%ecx ## byte offset of jjnr[nj0]
262 addl %ecx,%eax
263 movl %eax,nb410_innerjjnr(%esp) ## pointer to jjnr[nj0]
264 movl %edx,%ecx
265 subl $4,%edx ## bias the count by -4 for the 4-way unrolled loop
266 addl nb410_ninner(%esp),%ecx
267 movl %ecx,nb410_ninner(%esp)
268 addl $0,%edx ## the addl above clobbered the flags; set them again from edx
269 movl %edx,nb410_innerk(%esp) ## number of innerloop atoms, minus 4
270 jge _nb_kernel410_ia32_sse.nb410_unroll_loop ## at least four j atoms
271 jmp _nb_kernel410_ia32_sse.nb410_finish_inner
272 _nb_kernel410_ia32_sse.nb410_unroll_loop:
## Inner loop body for four j atoms at a time. SSE lanes 0-3 hold the
## atoms whose indices are loaded into eax, ebx, ecx and edx. mm0-mm3 are
## used to park those registers while they are needed for other lookups.
273 ## quad-unroll innerloop here
274 movl nb410_innerjjnr(%esp),%edx ## pointer to jjnr[k]
275 movl (%edx),%eax
276 movl 4(%edx),%ebx
277 movl 8(%edx),%ecx
278 movl 12(%edx),%edx ## eax-edx=jnr1-4
279 addl $16,nb410_innerjjnr(%esp) ## advance pointer (unrolled 4)
281 ## load isaj
282 movl nb410_invsqrta(%ebp),%esi
283 movss (%esi,%eax,4),%xmm3
284 movss (%esi,%ecx,4),%xmm4
285 movss (%esi,%ebx,4),%xmm6
286 movss (%esi,%edx,4),%xmm7
287 movaps nb410_isai(%esp),%xmm2
288 shufps $0,%xmm6,%xmm3
289 shufps $0,%xmm7,%xmm4
290 shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all isaj in xmm3
291 mulps %xmm3,%xmm2 ## isaprod = isai*isaj
293 movaps %xmm2,nb410_isaprod(%esp)
294 movaps %xmm2,%xmm1
295 mulps nb410_gbtsc(%esp),%xmm1 ## gbscale = isaprod*gbtabscale
296 movaps %xmm1,nb410_gbscale(%esp)
298 movl nb410_charge(%ebp),%esi ## base of charge[]
300 movss (%esi,%eax,4),%xmm3
301 movss (%esi,%ecx,4),%xmm4
302 movss (%esi,%ebx,4),%xmm6
303 movss (%esi,%edx,4),%xmm7
305 mulps nb410_iq(%esp),%xmm2 ## xmm2 = isaprod*iq
306 shufps $0,%xmm6,%xmm3
307 shufps $0,%xmm7,%xmm4
308 shufps $136,%xmm4,%xmm3 ## constant 10001000 ;# all charges in xmm3
309 mulps %xmm2,%xmm3 ## qq = iq*isaprod*jq
310 movaps %xmm3,nb410_qq(%esp)
## park jnr1-4 in mm0-mm3 while eax-edx are reused for the type lookup
312 movd %eax,%mm0
313 movd %ebx,%mm1
314 movd %ecx,%mm2
315 movd %edx,%mm3
317 movl nb410_type(%ebp),%esi
318 movl (%esi,%eax,4),%eax
319 movl (%esi,%ebx,4),%ebx
320 movl (%esi,%ecx,4),%ecx
321 movl (%esi,%edx,4),%edx
322 movl nb410_vdwparam(%ebp),%esi
323 shll %eax
324 shll %ebx
325 shll %ecx
326 shll %edx
327 movl nb410_ntia(%esp),%edi
328 addl %edi,%eax ## index into vdwparam = ntia + 2*type[jnr]
329 addl %edi,%ebx
330 addl %edi,%ecx
331 addl %edi,%edx
## each vdwparam entry is a pair of floats; the shuffles below split the pairs into c6 and c12
333 movlps (%esi,%eax,4),%xmm6
334 movlps (%esi,%ecx,4),%xmm7
335 movhps (%esi,%ebx,4),%xmm6
336 movhps (%esi,%edx,4),%xmm7
338 movaps %xmm6,%xmm4
339 shufps $136,%xmm7,%xmm4 ## constant 10001000
340 shufps $221,%xmm7,%xmm6 ## constant 11011101
342 movd %mm0,%eax ## restore jnr1-4
343 movd %mm1,%ebx
344 movd %mm2,%ecx
345 movd %mm3,%edx
347 movaps %xmm4,nb410_c6(%esp)
348 movaps %xmm6,nb410_c12(%esp)
350 movl nb410_pos(%ebp),%esi ## base of pos[]
352 movl %eax,nb410_jnra(%esp)
353 movl %ebx,nb410_jnrb(%esp)
354 movl %ecx,nb410_jnrc(%esp)
355 movl %edx,nb410_jnrd(%esp)
357 leal (%eax,%eax,2),%eax ## replace jnr with j3
358 leal (%ebx,%ebx,2),%ebx
360 leal (%ecx,%ecx,2),%ecx ## replace jnr with j3
361 leal (%edx,%edx,2),%edx
363 ## move four coordinates to xmm0-xmm2
365 movlps (%esi,%eax,4),%xmm4
366 movlps (%esi,%ecx,4),%xmm5
367 movss 8(%esi,%eax,4),%xmm2
368 movss 8(%esi,%ecx,4),%xmm6
370 movhps (%esi,%ebx,4),%xmm4
371 movhps (%esi,%edx,4),%xmm5
373 movss 8(%esi,%ebx,4),%xmm0
374 movss 8(%esi,%edx,4),%xmm1
376 shufps $0,%xmm0,%xmm2
377 shufps $0,%xmm1,%xmm6
379 movaps %xmm4,%xmm0
380 movaps %xmm4,%xmm1
382 shufps $136,%xmm6,%xmm2 ## constant 10001000
384 shufps $136,%xmm5,%xmm0 ## constant 10001000
385 shufps $221,%xmm5,%xmm1 ## constant 11011101
387 ## move ix-iz to xmm4-xmm6
388 movaps nb410_ix(%esp),%xmm4
389 movaps nb410_iy(%esp),%xmm5
390 movaps nb410_iz(%esp),%xmm6
392 ## calc dr
393 subps %xmm0,%xmm4
394 subps %xmm1,%xmm5
395 subps %xmm2,%xmm6
397 ## store dr
398 movaps %xmm4,nb410_dx(%esp)
399 movaps %xmm5,nb410_dy(%esp)
400 movaps %xmm6,nb410_dz(%esp)
401 ## square it
402 mulps %xmm4,%xmm4
403 mulps %xmm5,%xmm5
404 mulps %xmm6,%xmm6
405 addps %xmm5,%xmm4
406 addps %xmm6,%xmm4
407 ## rsq in xmm4
## rinv = 0.5*lu*(3.0-rsq*lu*lu): one refinement step on the rsqrtps estimate lu
409 rsqrtps %xmm4,%xmm5
410 ## lookup seed in xmm5
411 movaps %xmm5,%xmm2
412 mulps %xmm5,%xmm5
413 movaps nb410_three(%esp),%xmm1
414 mulps %xmm4,%xmm5 ## rsq*lu*lu
415 movaps nb410_half(%esp),%xmm0
416 subps %xmm5,%xmm1 ## 3.0-rsq*lu*lu
417 mulps %xmm2,%xmm1
418 mulps %xmm1,%xmm0 ## xmm0=rinv
419 mulps %xmm0,%xmm4 ## xmm4=r
420 movaps %xmm4,nb410_r(%esp)
421 mulps nb410_gbscale(%esp),%xmm4 ## xmm4 = r*gbscale, the table coordinate
423 movhlps %xmm4,%xmm5
424 cvttps2pi %xmm4,%mm6
425 cvttps2pi %xmm5,%mm7 ## mm6/mm7 contain lu indices
426 cvtpi2ps %mm6,%xmm6
427 cvtpi2ps %mm7,%xmm5
428 movlhps %xmm5,%xmm6
429 subps %xmm6,%xmm4
430 movaps %xmm4,%xmm1 ## xmm1=eps
431 movaps %xmm1,%xmm2
432 mulps %xmm2,%xmm2 ## xmm2=eps2
433 pslld $2,%mm6 ## index*4: four floats per table point
434 pslld $2,%mm7
## park j3 (3*jnr) in mm0-mm3 while eax-edx hold the table offsets
436 movd %eax,%mm0
437 movd %ebx,%mm1
438 movd %ecx,%mm2
439 movd %edx,%mm3
441 movl nb410_GBtab(%ebp),%esi
442 movd %mm6,%eax
443 psrlq $32,%mm6
444 movd %mm7,%ecx
445 psrlq $32,%mm7
446 movd %mm6,%ebx
447 movd %mm7,%edx
449 ## load coulomb table
450 movaps (%esi,%eax,4),%xmm4
451 movaps (%esi,%ebx,4),%xmm5
452 movaps (%esi,%ecx,4),%xmm6
453 movaps (%esi,%edx,4),%xmm7
454 ## transpose, using xmm3 for scratch
455 movaps %xmm6,%xmm3
456 shufps $0xEE,%xmm7,%xmm3
457 shufps $0x44,%xmm7,%xmm6
458 movaps %xmm4,%xmm7
459 shufps $0xEE,%xmm5,%xmm7
460 shufps $0x44,%xmm5,%xmm4
461 movaps %xmm4,%xmm5
462 shufps $0xDD,%xmm6,%xmm5
463 shufps $0x88,%xmm6,%xmm4
464 movaps %xmm7,%xmm6
465 shufps $0x88,%xmm3,%xmm6
466 shufps $0xDD,%xmm3,%xmm7
467 ## coulomb table ready, in xmm4-xmm7
468 mulps %xmm1,%xmm6 ## xmm6=Geps
469 mulps %xmm2,%xmm7 ## xmm7=Heps2
471 addps %xmm6,%xmm5
472 addps %xmm7,%xmm5 ## xmm5=Fp
473 mulps nb410_two(%esp),%xmm7 ## two*Heps2
474 movaps nb410_qq(%esp),%xmm3
475 addps %xmm6,%xmm7
476 addps %xmm5,%xmm7 ## xmm7=FF
477 mulps %xmm1,%xmm5 ## xmm5=eps*Fp
478 addps %xmm4,%xmm5 ## xmm5=VV
479 mulps %xmm3,%xmm5 ## vcoul=qq*VV
480 mulps %xmm7,%xmm3 ## fijC=FF*qq
481 ## get jnr from stack
482 movl nb410_jnra(%esp),%eax
483 movl nb410_jnrb(%esp),%ebx
484 movl nb410_jnrc(%esp),%ecx
485 movl nb410_jnrd(%esp),%edx
487 movl nb410_dvda(%ebp),%esi
489 ## Calculate dVda
490 xorps %xmm7,%xmm7
491 mulps nb410_gbscale(%esp),%xmm3 ## xmm3 = fijC*gbscale, kept for the force below
492 movaps %xmm3,%xmm6
493 mulps nb410_r(%esp),%xmm6
494 addps %xmm5,%xmm6
495 addps nb410_vctot(%esp),%xmm5
496 movaps %xmm5,nb410_vctot(%esp)
498 ## xmm6=(vcoul+fijC*r)
499 subps %xmm6,%xmm7 ## xmm7 = -(vcoul+fijC*r)
500 movaps %xmm7,%xmm6
502 ## update dvdasum
503 addps nb410_dvdasum(%esp),%xmm7
504 movaps %xmm7,nb410_dvdasum(%esp)
506 ## update j atoms dvdaj
507 movhlps %xmm6,%xmm7
508 movaps %xmm6,%xmm5
509 movaps %xmm7,%xmm4
510 shufps $0x1,%xmm5,%xmm5
511 shufps $0x1,%xmm4,%xmm4
512 ## xmm6=dvdaj1 xmm5=dvdaj2 xmm7=dvdaj3 xmm4=dvdaj4
513 addss (%esi,%eax,4),%xmm6
514 addss (%esi,%ebx,4),%xmm5
515 addss (%esi,%ecx,4),%xmm7
516 addss (%esi,%edx,4),%xmm4
517 movss %xmm6,(%esi,%eax,4)
518 movss %xmm5,(%esi,%ebx,4)
519 movss %xmm7,(%esi,%ecx,4)
520 movss %xmm4,(%esi,%edx,4)
522 ## L-J
523 movaps %xmm0,%xmm4
524 mulps %xmm0,%xmm4 ## xmm4=rinvsq
526 movaps %xmm4,%xmm6
527 mulps %xmm4,%xmm6
529 mulps %xmm4,%xmm6 ## xmm6=rinvsix
530 movaps %xmm6,%xmm4
531 mulps %xmm4,%xmm4 ## xmm4=rinvtwelve
532 mulps nb410_c6(%esp),%xmm6
533 mulps nb410_c12(%esp),%xmm4
534 movaps nb410_Vvdwtot(%esp),%xmm7
535 addps %xmm4,%xmm7
536 mulps nb410_twelve(%esp),%xmm4
537 subps %xmm6,%xmm7 ## Vvdwtot += c12*rinvtwelve - c6*rinvsix
538 mulps nb410_six(%esp),%xmm6
539 movaps %xmm7,nb410_Vvdwtot(%esp)
540 subps %xmm6,%xmm4
541 mulps %xmm0,%xmm4
542 subps %xmm3,%xmm4 ## subtract the Coulomb part fijC*gbscale
543 mulps %xmm0,%xmm4 ## xmm4 = scalar force factor applied to dx,dy,dz
545 movaps nb410_dx(%esp),%xmm0
546 movaps nb410_dy(%esp),%xmm1
547 movaps nb410_dz(%esp),%xmm2
549 movd %mm0,%eax ## restore j3 for the four j atoms
550 movd %mm1,%ebx
551 movd %mm2,%ecx
552 movd %mm3,%edx
554 movl nb410_faction(%ebp),%edi
555 mulps %xmm4,%xmm0
556 mulps %xmm4,%xmm1
557 mulps %xmm4,%xmm2
558 ## xmm0-xmm2 contains tx-tz (partial force)
559 ## now update f_i
560 movaps nb410_fix(%esp),%xmm3
561 movaps nb410_fiy(%esp),%xmm4
562 movaps nb410_fiz(%esp),%xmm5
563 addps %xmm0,%xmm3
564 addps %xmm1,%xmm4
565 addps %xmm2,%xmm5
566 movaps %xmm3,nb410_fix(%esp)
567 movaps %xmm4,nb410_fiy(%esp)
568 movaps %xmm5,nb410_fiz(%esp)
569 ## the fj's - start by accumulating x & y forces from memory
570 movlps (%edi,%eax,4),%xmm4
571 movlps (%edi,%ecx,4),%xmm6
572 movhps (%edi,%ebx,4),%xmm4
573 movhps (%edi,%edx,4),%xmm6
575 movaps %xmm4,%xmm3
576 shufps $136,%xmm6,%xmm3 ## constant 10001000
577 shufps $221,%xmm6,%xmm4 ## constant 11011101
579 ## now xmm3 and xmm4 contain fjx and fjy; fjz is handled separately below
580 subps %xmm0,%xmm3
581 subps %xmm1,%xmm4
583 ## unpack them back so we can store them - first x & y in xmm3/xmm4
585 movaps %xmm3,%xmm6
586 unpcklps %xmm4,%xmm6
587 unpckhps %xmm4,%xmm3
588 ## xmm6(l)=x & y for j1, (h) for j2
589 ## xmm3(l)=x & y for j3, (h) for j4
590 movlps %xmm6,(%edi,%eax,4)
591 movlps %xmm3,(%edi,%ecx,4)
593 movhps %xmm6,(%edi,%ebx,4)
594 movhps %xmm3,(%edi,%edx,4)
596 ## and the z forces
597 movss 8(%edi,%eax,4),%xmm4
598 movss 8(%edi,%ebx,4),%xmm5
599 movss 8(%edi,%ecx,4),%xmm6
600 movss 8(%edi,%edx,4),%xmm7
601 subss %xmm2,%xmm4
602 shufps $229,%xmm2,%xmm2 ## constant 11100101
603 subss %xmm2,%xmm5
604 shufps $234,%xmm2,%xmm2 ## constant 11101010
605 subss %xmm2,%xmm6
606 shufps $255,%xmm2,%xmm2 ## constant 11111111
607 subss %xmm2,%xmm7
608 movss %xmm4,8(%edi,%eax,4)
609 movss %xmm5,8(%edi,%ebx,4)
610 movss %xmm6,8(%edi,%ecx,4)
611 movss %xmm7,8(%edi,%edx,4)
613 ## should we do one more iteration?
614 subl $4,nb410_innerk(%esp)
615 jl _nb_kernel410_ia32_sse.nb410_finish_inner ## fewer than four j atoms left
616 jmp _nb_kernel410_ia32_sse.nb410_unroll_loop
617 _nb_kernel410_ia32_sse.nb410_finish_inner:
## innerk was biased by -4 for the unrolled loop; adding 4 back leaves the
## number of j atoms (0-3) that still have to be processed.
618 ## check if at least two particles remain
619 addl $4,nb410_innerk(%esp)
620 movl nb410_innerk(%esp),%edx
621 andl $2,%edx ## bit 1 set: two or three atoms remain
622 jnz _nb_kernel410_ia32_sse.nb410_dopair
623 jmp _nb_kernel410_ia32_sse.nb410_checksingle
624 _nb_kernel410_ia32_sse.nb410_dopair:
## Remainder path for two j atoms (indices in eax and ebx). Packed
## instructions are still used, so lanes 2 and 3 carry duplicates of the
## pair's coordinates; their contributions must be cancelled by keeping
## lanes 2 and 3 of qq, c6 and c12 at zero.
625 movl nb410_innerjjnr(%esp),%ecx
626 movl (%ecx),%eax
627 movl 4(%ecx),%ebx
628 addl $8,nb410_innerjjnr(%esp)
630 xorps %xmm2,%xmm2
631 movaps %xmm2,%xmm6
633 ## load isaj
634 movl nb410_invsqrta(%ebp),%esi
635 movss (%esi,%eax,4),%xmm2
636 movss (%esi,%ebx,4),%xmm3
637 unpcklps %xmm3,%xmm2 ## isaj in xmm2(0,1)
638 mulps nb410_isai(%esp),%xmm2
639 movaps %xmm2,nb410_isaprod(%esp)
640 movaps %xmm2,%xmm1
641 mulps nb410_gbtsc(%esp),%xmm1
642 movaps %xmm1,nb410_gbscale(%esp)
644 movl nb410_charge(%ebp),%esi ## base of charge[]
645 movss (%esi,%eax,4),%xmm3
646 movss (%esi,%ebx,4),%xmm6
647 unpcklps %xmm6,%xmm3 ## xmm3(0,1) has the charges, lanes 2,3 are zero
649 mulps nb410_iq(%esp),%xmm2
650 mulps %xmm2,%xmm3
651 movaps %xmm3,nb410_qq(%esp)
653 movl nb410_type(%ebp),%esi
654 movl %eax,%ecx
655 movl %ebx,%edx
656 movl (%esi,%ecx,4),%ecx
657 movl (%esi,%edx,4),%edx
658 movl nb410_vdwparam(%ebp),%esi
659 shll %ecx
660 shll %edx
661 movl nb410_ntia(%esp),%edi
662 addl %edi,%ecx
663 addl %edi,%edx
664 movlps (%esi,%ecx,4),%xmm6
665 movhps (%esi,%edx,4),%xmm6
666 movl nb410_pos(%ebp),%edi
667 xorps %xmm7,%xmm7 ## xmm7 still holds leftovers from earlier code; it must be zero before it is copied into lanes 2,3 of c6/c12
668 movaps %xmm6,%xmm4
669 shufps $8,%xmm4,%xmm4 ## constant 00001000
670 shufps $13,%xmm6,%xmm6 ## constant 00001101
671 movlhps %xmm7,%xmm4 ## c6 = (c6a, c6b, 0, 0)
672 movlhps %xmm7,%xmm6 ## c12 = (c12a, c12b, 0, 0)
674 movaps %xmm4,nb410_c6(%esp)
675 movaps %xmm6,nb410_c12(%esp)
## keep jnr in mm0/mm1; eax/ebx become j3 = 3*jnr
677 movd %eax,%mm0
678 movd %ebx,%mm1
680 leal (%eax,%eax,2),%eax
681 leal (%ebx,%ebx,2),%ebx
682 ## move coordinates to xmm0-xmm2
683 movlps (%edi,%eax,4),%xmm1
684 movss 8(%edi,%eax,4),%xmm2
685 movhps (%edi,%ebx,4),%xmm1
686 movss 8(%edi,%ebx,4),%xmm0
688 movlhps %xmm7,%xmm3
690 shufps $0,%xmm0,%xmm2
692 movaps %xmm1,%xmm0
694 shufps $136,%xmm2,%xmm2 ## constant 10001000
696 shufps $136,%xmm0,%xmm0 ## constant 10001000
697 shufps $221,%xmm1,%xmm1 ## constant 11011101
699 movl nb410_faction(%ebp),%edi
700 ## move ix-iz to xmm4-xmm6
701 xorps %xmm7,%xmm7
703 movaps nb410_ix(%esp),%xmm4
704 movaps nb410_iy(%esp),%xmm5
705 movaps nb410_iz(%esp),%xmm6
707 ## calc dr
708 subps %xmm0,%xmm4
709 subps %xmm1,%xmm5
710 subps %xmm2,%xmm6
712 ## store dr
713 movaps %xmm4,nb410_dx(%esp)
714 movaps %xmm5,nb410_dy(%esp)
715 movaps %xmm6,nb410_dz(%esp)
716 ## square it
717 mulps %xmm4,%xmm4
718 mulps %xmm5,%xmm5
719 mulps %xmm6,%xmm6
720 addps %xmm5,%xmm4
721 addps %xmm6,%xmm4
722 ## rsq in xmm4
## rinv = 0.5*lu*(3.0-rsq*lu*lu): one refinement step on the rsqrtps estimate lu
724 rsqrtps %xmm4,%xmm5
725 ## lookup seed in xmm5
726 movaps %xmm5,%xmm2
727 mulps %xmm5,%xmm5
728 movaps nb410_three(%esp),%xmm1
729 mulps %xmm4,%xmm5 ## rsq*lu*lu
730 movaps nb410_half(%esp),%xmm0
731 subps %xmm5,%xmm1 ## 3.0-rsq*lu*lu
732 mulps %xmm2,%xmm1
733 mulps %xmm1,%xmm0 ## xmm0=rinv
734 mulps %xmm0,%xmm4 ## xmm4=r
735 movaps %xmm4,nb410_r(%esp)
736 mulps nb410_gbscale(%esp),%xmm4
738 cvttps2pi %xmm4,%mm6 ## mm6 contain lu indices
739 cvtpi2ps %mm6,%xmm6
740 subps %xmm6,%xmm4
741 movaps %xmm4,%xmm1 ## xmm1=eps
742 movaps %xmm1,%xmm2
743 mulps %xmm2,%xmm2 ## xmm2=eps2
745 pslld $2,%mm6 ## index*4: four floats per table point
747 movl nb410_GBtab(%ebp),%esi
748 movd %mm6,%ecx
749 psrlq $32,%mm6
750 movd %mm6,%edx
752 ## load coulomb table
753 movaps (%esi,%ecx,4),%xmm4
754 movaps (%esi,%edx,4),%xmm7
755 ## transpose the two table points
756 movaps %xmm4,%xmm6
757 unpcklps %xmm7,%xmm4 ## Y1 Y2 F1 F2
758 unpckhps %xmm7,%xmm6 ## G1 G2 H1 H2
759 movhlps %xmm4,%xmm5 ## F1 F2
760 movhlps %xmm6,%xmm7 ## H1 H2
761 ## coulomb table ready, in xmm4-xmm7
763 mulps %xmm1,%xmm6 ## xmm6=Geps
764 mulps %xmm2,%xmm7 ## xmm7=Heps2
765 addps %xmm6,%xmm5
766 addps %xmm7,%xmm5 ## xmm5=Fp
767 mulps nb410_two(%esp),%xmm7 ## two*Heps2
768 movaps nb410_qq(%esp),%xmm3
769 addps %xmm6,%xmm7
770 addps %xmm5,%xmm7 ## xmm7=FF
771 mulps %xmm1,%xmm5 ## xmm5=eps*Fp
772 addps %xmm4,%xmm5 ## xmm5=VV
773 mulps %xmm3,%xmm5 ## vcoul=qq*VV
774 mulps %xmm7,%xmm3 ## fijC=FF*qq
775 ## get jnr from regs
776 movd %mm0,%ecx
777 movd %mm1,%edx
779 movl nb410_dvda(%ebp),%esi
780 ## Calculate dVda
781 xorps %xmm7,%xmm7
782 mulps nb410_gbscale(%esp),%xmm3 ## xmm3 = fijC*gbscale, kept for the force below
783 movaps %xmm3,%xmm6
784 mulps nb410_r(%esp),%xmm6
785 addps %xmm5,%xmm6
786 addps nb410_vctot(%esp),%xmm5
787 movaps %xmm5,nb410_vctot(%esp)
789 ## xmm6=(vcoul+fijC*r)
790 subps %xmm6,%xmm7 ## xmm7 = -(vcoul+fijC*r)
791 movaps %xmm7,%xmm6
793 ## update dvdasum
794 addps nb410_dvdasum(%esp),%xmm7
795 movaps %xmm7,nb410_dvdasum(%esp)
797 ## update j atoms dvdaj
798 movaps %xmm6,%xmm7
799 shufps $0x1,%xmm7,%xmm7
800 addss (%esi,%ecx,4),%xmm6
801 addss (%esi,%edx,4),%xmm7
802 movss %xmm6,(%esi,%ecx,4)
803 movss %xmm7,(%esi,%edx,4)
805 ## L-J
806 movaps %xmm0,%xmm4
807 mulps %xmm0,%xmm4 ## xmm4=rinvsq
809 ## vctot has already been updated above;
810 ## xmm3 still holds fijC*gbscale and is
811 ## subtracted from the L-J force below
813 movaps %xmm4,%xmm6
814 mulps %xmm4,%xmm6
816 mulps %xmm4,%xmm6 ## xmm6=rinvsix
817 movaps %xmm6,%xmm4
818 mulps %xmm4,%xmm4 ## xmm4=rinvtwelve
819 mulps nb410_c6(%esp),%xmm6
820 mulps nb410_c12(%esp),%xmm4
821 movaps nb410_Vvdwtot(%esp),%xmm7
822 addps %xmm4,%xmm7
823 mulps nb410_twelve(%esp),%xmm4
824 subps %xmm6,%xmm7
825 mulps nb410_six(%esp),%xmm6
826 movaps %xmm7,nb410_Vvdwtot(%esp)
827 subps %xmm6,%xmm4
828 mulps %xmm0,%xmm4
829 subps %xmm3,%xmm4
830 mulps %xmm0,%xmm4
832 movaps nb410_dx(%esp),%xmm0
833 movaps nb410_dy(%esp),%xmm1
834 movaps nb410_dz(%esp),%xmm2
836 mulps %xmm4,%xmm0
837 mulps %xmm4,%xmm1
838 mulps %xmm4,%xmm2
839 ## xmm0-xmm2 contains tx-tz (partial force)
840 ## now update f_i
841 movaps nb410_fix(%esp),%xmm3
842 movaps nb410_fiy(%esp),%xmm4
843 movaps nb410_fiz(%esp),%xmm5
844 addps %xmm0,%xmm3
845 addps %xmm1,%xmm4
846 addps %xmm2,%xmm5
847 movaps %xmm3,nb410_fix(%esp)
848 movaps %xmm4,nb410_fiy(%esp)
849 movaps %xmm5,nb410_fiz(%esp)
850 ## update the fj's (first j atom, lane 0)
851 movss (%edi,%eax,4),%xmm3
852 movss 4(%edi,%eax,4),%xmm4
853 movss 8(%edi,%eax,4),%xmm5
854 subss %xmm0,%xmm3
855 subss %xmm1,%xmm4
856 subss %xmm2,%xmm5
857 movss %xmm3,(%edi,%eax,4)
858 movss %xmm4,4(%edi,%eax,4)
859 movss %xmm5,8(%edi,%eax,4)
## swap lanes 0 and 1 so the second j atom's force is in lane 0
861 shufps $225,%xmm0,%xmm0 ## constant 11100001
862 shufps $225,%xmm1,%xmm1 ## constant 11100001
863 shufps $225,%xmm2,%xmm2 ## constant 11100001
865 movss (%edi,%ebx,4),%xmm3
866 movss 4(%edi,%ebx,4),%xmm4
867 movss 8(%edi,%ebx,4),%xmm5
868 subss %xmm0,%xmm3
869 subss %xmm1,%xmm4
870 subss %xmm2,%xmm5
871 movss %xmm3,(%edi,%ebx,4)
872 movss %xmm4,4(%edi,%ebx,4)
873 movss %xmm5,8(%edi,%ebx,4)
875 _nb_kernel410_ia32_sse.nb410_checksingle:
## innerk holds the remainder count (0-3) at this point; bit 0 says
## whether one last j atom is left.
876 movl nb410_innerk(%esp),%edx
877 andl $1,%edx
878 jnz _nb_kernel410_ia32_sse.nb410_dosingle
879 jmp _nb_kernel410_ia32_sse.nb410_updateouterdata
880 _nb_kernel410_ia32_sse.nb410_dosingle:
## Remainder path for a single j atom (index in eax). Mostly scalar
## (ss) instructions, so only lane 0 of the stack slots is touched,
## except for dvdasum which is updated with packed instructions.
881 movl nb410_charge(%ebp),%esi
882 movl nb410_invsqrta(%ebp),%edx
883 movl nb410_pos(%ebp),%edi
884 movl nb410_innerjjnr(%esp),%ecx
885 movl (%ecx),%eax
886 xorps %xmm2,%xmm2
887 movaps %xmm2,%xmm6
888 movss (%edx,%eax,4),%xmm2 ## isaj
889 mulss nb410_isai(%esp),%xmm2
890 movss %xmm2,nb410_isaprod(%esp)
891 movss %xmm2,%xmm1
892 mulss nb410_gbtsc(%esp),%xmm1
893 movss %xmm1,nb410_gbscale(%esp)
895 mulss nb410_iq(%esp),%xmm2
896 movss (%esi,%eax,4),%xmm6 ## xmm6(0) has the charge
897 mulss %xmm2,%xmm6
898 movss %xmm6,nb410_qq(%esp)
900 movl nb410_type(%ebp),%esi
901 movl %eax,%ecx
902 movl (%esi,%ecx,4),%ecx
903 movl nb410_vdwparam(%ebp),%esi
904 shll %ecx
905 addl nb410_ntia(%esp),%ecx
906 movlps (%esi,%ecx,4),%xmm6
907 movaps %xmm6,%xmm4
908 shufps $252,%xmm4,%xmm4 ## constant 11111100
909 shufps $253,%xmm6,%xmm6 ## constant 11111101
911 movaps %xmm4,nb410_c6(%esp)
912 movaps %xmm6,nb410_c12(%esp)
914 movd %eax,%mm0 ## keep jnr in mm0; eax becomes j3 = 3*jnr
915 leal (%eax,%eax,2),%eax
917 ## move coordinates to xmm0-xmm2
918 movss (%edi,%eax,4),%xmm0
919 movss 4(%edi,%eax,4),%xmm1
920 movss 8(%edi,%eax,4),%xmm2
922 movaps nb410_ix(%esp),%xmm4
923 movaps nb410_iy(%esp),%xmm5
924 movaps nb410_iz(%esp),%xmm6
926 ## calc dr
927 subss %xmm0,%xmm4
928 subss %xmm1,%xmm5
929 subss %xmm2,%xmm6
931 ## store dr
932 movss %xmm4,nb410_dx(%esp)
933 movss %xmm5,nb410_dy(%esp)
934 movss %xmm6,nb410_dz(%esp)
935 ## square it
936 mulss %xmm4,%xmm4
937 mulss %xmm5,%xmm5
938 mulss %xmm6,%xmm6
939 addss %xmm5,%xmm4
940 addss %xmm6,%xmm4
941 ## rsq in xmm4
## rinv = 0.5*lu*(3.0-rsq*lu*lu): one refinement step on the rsqrtss estimate lu
943 rsqrtss %xmm4,%xmm5
944 ## lookup seed in xmm5
945 movaps %xmm5,%xmm2
946 mulss %xmm5,%xmm5
947 movss nb410_three(%esp),%xmm1
948 mulss %xmm4,%xmm5 ## rsq*lu*lu
949 movss nb410_half(%esp),%xmm0
950 subss %xmm5,%xmm1 ## 3.0-rsq*lu*lu
951 mulss %xmm2,%xmm1
952 mulss %xmm1,%xmm0 ## xmm0=rinv
954 mulss %xmm0,%xmm4 ## xmm4=r
955 movss %xmm4,nb410_r(%esp)
956 mulss nb410_gbscale(%esp),%xmm4
958 cvttss2si %xmm4,%ebx ## ebx = table index (truncated r*gbscale)
959 cvtsi2ss %ebx,%xmm6
960 subss %xmm6,%xmm4
961 movaps %xmm4,%xmm1 ## xmm1=eps
962 movaps %xmm1,%xmm2
963 mulss %xmm2,%xmm2 ## xmm2=eps2
965 shll $2,%ebx ## index*4: four floats per table point
966 movl nb410_GBtab(%ebp),%esi
## spread the four values of the table point over lane 0 of xmm4-xmm7
968 movaps (%esi,%ebx,4),%xmm4
969 movhlps %xmm4,%xmm6
970 movaps %xmm4,%xmm5
971 movaps %xmm6,%xmm7
972 shufps $1,%xmm5,%xmm5
973 shufps $1,%xmm7,%xmm7
974 ## table ready in xmm4-xmm7
976 mulss %xmm1,%xmm6 ## xmm6=Geps
977 mulss %xmm2,%xmm7 ## xmm7=Heps2
978 addss %xmm6,%xmm5
979 addss %xmm7,%xmm5 ## xmm5=Fp
980 mulss nb410_two(%esp),%xmm7 ## two*Heps2
981 movss nb410_qq(%esp),%xmm3
982 addss %xmm6,%xmm7
983 addss %xmm5,%xmm7 ## xmm7=FF
984 mulss %xmm1,%xmm5 ## xmm5=eps*Fp
985 addss %xmm4,%xmm5 ## xmm5=VV
986 mulss %xmm3,%xmm5 ## vcoul=qq*VV
987 mulss %xmm7,%xmm3 ## fijC=FF*qq
989 movd %mm0,%ebx ## ebx = jnr again
990 movl nb410_dvda(%ebp),%esi
992 ## Calculate dVda
993 xorps %xmm7,%xmm7
994 mulss nb410_gbscale(%esp),%xmm3 ## xmm3 = fijC*gbscale, kept for the force below
995 movaps %xmm3,%xmm6
996 mulss nb410_r(%esp),%xmm6
997 addss %xmm5,%xmm6
998 addss nb410_vctot(%esp),%xmm5
999 movss %xmm5,nb410_vctot(%esp)
1001 ## xmm6=(vcoul+fijC*r)
1002 subps %xmm6,%xmm7 ## xmm7 = -(vcoul+fijC*r)
1003 movaps %xmm7,%xmm6
1005 ## update dvdasum
1006 addps nb410_dvdasum(%esp),%xmm7
1007 movaps %xmm7,nb410_dvdasum(%esp)
1009 ## update j atoms dvdaj
1010 addss (%esi,%ebx,4),%xmm6
1011 movss %xmm6,(%esi,%ebx,4)
1013 ## L-J
1014 movaps %xmm0,%xmm4
1015 mulss %xmm0,%xmm4 ## xmm4=rinvsq
1017 movaps %xmm4,%xmm6
1018 mulss %xmm4,%xmm6
1020 mulss %xmm4,%xmm6 ## xmm6=rinvsix
1021 movaps %xmm6,%xmm4
1022 mulss %xmm4,%xmm4 ## xmm4=rinvtwelve
1023 mulss nb410_c6(%esp),%xmm6
1024 mulss nb410_c12(%esp),%xmm4
1025 movss nb410_Vvdwtot(%esp),%xmm7
1026 addss %xmm4,%xmm7
1027 mulss nb410_twelve(%esp),%xmm4
1028 subss %xmm6,%xmm7
1029 mulss nb410_six(%esp),%xmm6
1030 movss %xmm7,nb410_Vvdwtot(%esp)
1031 subss %xmm6,%xmm4
1032 mulss %xmm0,%xmm4
1033 subss %xmm3,%xmm4
1034 mulss %xmm0,%xmm4
1036 movss nb410_dx(%esp),%xmm0
1037 movss nb410_dy(%esp),%xmm1
1038 movss nb410_dz(%esp),%xmm2
1040 movl nb410_faction(%ebp),%edi
1041 mulss %xmm4,%xmm0
1042 mulss %xmm4,%xmm1
1043 mulss %xmm4,%xmm2
1044 ## xmm0-xmm2 contains tx-tz (partial force)
1045 ## now update f_i
1046 movss nb410_fix(%esp),%xmm3
1047 movss nb410_fiy(%esp),%xmm4
1048 movss nb410_fiz(%esp),%xmm5
1049 addss %xmm0,%xmm3
1050 addss %xmm1,%xmm4
1051 addss %xmm2,%xmm5
1052 movss %xmm3,nb410_fix(%esp)
1053 movss %xmm4,nb410_fiy(%esp)
1054 movss %xmm5,nb410_fiz(%esp)
1055 ## update fj
1057 movss (%edi,%eax,4),%xmm3
1058 movss 4(%edi,%eax,4),%xmm4
1059 movss 8(%edi,%eax,4),%xmm5
1060 subss %xmm0,%xmm3
1061 subss %xmm1,%xmm4
1062 subss %xmm2,%xmm5
1063 movss %xmm3,(%edi,%eax,4)
1064 movss %xmm4,4(%edi,%eax,4)
1065 movss %xmm5,8(%edi,%eax,4)
1066 _nb_kernel410_ia32_sse.nb410_updateouterdata:
## End of the inner loop for one i atom: reduce the four-lane
## accumulators to scalars and add them to faction, fshift, Vc, Vvdw and
## dvda, then advance n and either loop or leave the outer loop.
1067 movl nb410_ii3(%esp),%ecx
1068 movl nb410_faction(%ebp),%edi
1069 movl nb410_fshift(%ebp),%esi
1070 movl nb410_is3(%esp),%edx
1072 ## accumulate i forces in xmm0, xmm1, xmm2
1073 movaps nb410_fix(%esp),%xmm0
1074 movaps nb410_fiy(%esp),%xmm1
1075 movaps nb410_fiz(%esp),%xmm2
1077 movhlps %xmm0,%xmm3
1078 movhlps %xmm1,%xmm4
1079 movhlps %xmm2,%xmm5
1080 addps %xmm3,%xmm0
1081 addps %xmm4,%xmm1
1082 addps %xmm5,%xmm2 ## sum is in 1/2 in xmm0-xmm2
1084 movaps %xmm0,%xmm3
1085 movaps %xmm1,%xmm4
1086 movaps %xmm2,%xmm5
1088 shufps $1,%xmm3,%xmm3
1089 shufps $1,%xmm4,%xmm4
1090 shufps $1,%xmm5,%xmm5
1091 addss %xmm3,%xmm0
1092 addss %xmm4,%xmm1
1093 addss %xmm5,%xmm2 ## xmm0-xmm2 has single force in pos0
1095 ## increment i force
1096 movss (%edi,%ecx,4),%xmm3
1097 movss 4(%edi,%ecx,4),%xmm4
1098 movss 8(%edi,%ecx,4),%xmm5
1099 addss %xmm0,%xmm3
1100 addss %xmm1,%xmm4
1101 addss %xmm2,%xmm5
1102 movss %xmm3,(%edi,%ecx,4)
1103 movss %xmm4,4(%edi,%ecx,4)
1104 movss %xmm5,8(%edi,%ecx,4)
1106 ## increment fshift force
1107 movss (%esi,%edx,4),%xmm3
1108 movss 4(%esi,%edx,4),%xmm4
1109 movss 8(%esi,%edx,4),%xmm5
1110 addss %xmm0,%xmm3
1111 addss %xmm1,%xmm4
1112 addss %xmm2,%xmm5
1113 movss %xmm3,(%esi,%edx,4)
1114 movss %xmm4,4(%esi,%edx,4)
1115 movss %xmm5,8(%esi,%edx,4)
1117 ## get n from stack
1118 movl nb410_n(%esp),%esi
1119 ## get group index for i particle
1120 movl nb410_gid(%ebp),%edx ## base of gid[]
1121 movl (%edx,%esi,4),%edx ## ggid=gid[n]
1123 ## accumulate total potential energy and update it
1124 movaps nb410_vctot(%esp),%xmm7
1125 ## accumulate
1126 movhlps %xmm7,%xmm6
1127 addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now
1128 movaps %xmm7,%xmm6
1129 shufps $1,%xmm6,%xmm6
1130 addss %xmm6,%xmm7
1132 ## add earlier value from mem
1133 movl nb410_Vc(%ebp),%eax
1134 addss (%eax,%edx,4),%xmm7
1135 ## move back to mem
1136 movss %xmm7,(%eax,%edx,4)
1138 ## accumulate total lj energy and update it
1139 movaps nb410_Vvdwtot(%esp),%xmm7
1140 ## accumulate
1141 movhlps %xmm7,%xmm6
1142 addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now
1143 movaps %xmm7,%xmm6
1144 shufps $1,%xmm6,%xmm6
1145 addss %xmm6,%xmm7
1147 ## add earlier value from mem
1148 movl nb410_Vvdw(%ebp),%eax
1149 addss (%eax,%edx,4),%xmm7
1150 ## move back to mem
1151 movss %xmm7,(%eax,%edx,4)
1153 ## accumulate dVda and update it
1154 movaps nb410_dvdasum(%esp),%xmm7
1155 ## accumulate
1156 movhlps %xmm7,%xmm6
1157 addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now
1158 movaps %xmm7,%xmm6
1159 shufps $1,%xmm6,%xmm6
1160 addss %xmm6,%xmm7
1162 movl nb410_ii(%esp),%edx
1163 movl nb410_dvda(%ebp),%eax
1164 addss (%eax,%edx,4),%xmm7 ## dvda[ii] += summed dvdasum
1165 movss %xmm7,(%eax,%edx,4)
1167 ## finish if last
1168 movl nb410_nn1(%esp),%ecx
1169 ## esi already loaded with n
1170 incl %esi
1171 subl %esi,%ecx ## nn1-(n+1): zero when this work unit is done
1172 jz _nb_kernel410_ia32_sse.nb410_outerend
1174 ## not last, iterate outer loop once more!
1175 movl %esi,nb410_n(%esp)
1176 jmp _nb_kernel410_ia32_sse.nb410_outer
1177 _nb_kernel410_ia32_sse.nb410_outerend:
## The claimed work unit is finished; %esi holds the incremented n.
1178 ## check if more outer neighborlists remain
1179 movl nb410_nri(%esp),%ecx
1180 ## esi already loaded with n above
1181 subl %esi,%ecx ## nri-n: zero when every outer list has been handled
1182 jz _nb_kernel410_ia32_sse.nb410_end
1183 ## non-zero, do one more workunit
1184 jmp _nb_kernel410_ia32_sse.nb410_threadloop
1185 _nb_kernel410_ia32_sse.nb410_end:
## Common exit path: report the iteration counts through the outeriter
## and inneriter pointers, release the aligned local frame, restore the
## registers saved by the prologue and return to the caller.
1186 emms ## leave MMX state clean after the mm0-mm7 scratch use
1188 movl nb410_nouter(%esp),%eax
1189 movl nb410_ninner(%esp),%ebx
1190 movl nb410_outeriter(%ebp),%ecx
1191 movl nb410_inneriter(%ebp),%edx
1192 movl %eax,(%ecx) ## *outeriter = nouter
1193 movl %ebx,(%edx) ## *inneriter = ninner
1195 movl nb410_salign(%esp),%eax
1196 addl %eax,%esp ## undo the 16-byte alignment adjustment
1197 addl $504,%esp ## release the local stack space
1198 popl %edi ## restore registers in reverse push order
1199 popl %esi
1200 popl %edx
1201 popl %ecx
1202 popl %ebx
1203 popl %eax
1204 leave
1205 ret ## return to the caller; without this, execution falls through into nb_kernel410nf_ia32_sse
## nb_kernel410nf_ia32_sse: energy-only companion of nb_kernel410_ia32_sse
## (AT&T syntax, 32-bit x86, SSE with MMX used for table-index conversion).
## The code below accumulates a tabulated Coulomb-type term (GBtab) and a
## Lennard-Jones term per outer list and adds them into Vc[] and Vvdw[];
## it never stores through the faction pointer.
## All arguments are read from the caller's stack relative to %ebp.
## Arguments whose offset name starts with p_ are pointers; nri, facel,
## ntype and gbtabscale are dereferenced once in the prologue.
1209 .globl nb_kernel410nf_ia32_sse
1210 .globl _nb_kernel410nf_ia32_sse
1211 nb_kernel410nf_ia32_sse:
1212 _nb_kernel410nf_ia32_sse:
## Argument offsets from %ebp (0 = saved %ebp, 4 = return address).
1213 .set nb410nf_p_nri, 8
1214 .set nb410nf_iinr, 12
1215 .set nb410nf_jindex, 16
1216 .set nb410nf_jjnr, 20
1217 .set nb410nf_shift, 24
1218 .set nb410nf_shiftvec, 28
1219 .set nb410nf_fshift, 32
1220 .set nb410nf_gid, 36
1221 .set nb410nf_pos, 40
1222 .set nb410nf_faction, 44
1223 .set nb410nf_charge, 48
1224 .set nb410nf_p_facel, 52
1225 .set nb410nf_argkrf, 56
1226 .set nb410nf_argcrf, 60
1227 .set nb410nf_Vc, 64
1228 .set nb410nf_type, 68
1229 .set nb410nf_p_ntype, 72
1230 .set nb410nf_vdwparam, 76
1231 .set nb410nf_Vvdw, 80
1232 .set nb410nf_p_tabscale, 84
1233 .set nb410nf_VFtab, 88
1234 .set nb410nf_invsqrta, 92
1235 .set nb410nf_dvda, 96
1236 .set nb410nf_p_gbtabscale, 100
1237 .set nb410nf_GBtab, 104
1238 .set nb410nf_p_nthreads, 108
1239 .set nb410nf_count, 112
1240 .set nb410nf_mtx, 116
1241 .set nb410nf_outeriter, 120
1242 .set nb410nf_inneriter, 124
1243 .set nb410nf_work, 128
1244 ## stack offsets for local variables, relative to the aligned %esp
1245 ## %esp is rounded down to a 16-byte boundary below so movaps can be used on these slots
## Offsets 0..239 are 16-byte vector slots; 240..288 are 4-byte scalars.
1246 .set nb410nf_ix, 0
1247 .set nb410nf_iy, 16
1248 .set nb410nf_iz, 32
1249 .set nb410nf_iq, 48
1250 .set nb410nf_gbtsc, 64
1251 .set nb410nf_qq, 80
1252 .set nb410nf_c6, 96
1253 .set nb410nf_c12, 112
1254 .set nb410nf_vctot, 128
1255 .set nb410nf_Vvdwtot, 144
1256 .set nb410nf_half, 160
1257 .set nb410nf_three, 176
1258 .set nb410nf_isai, 192
1259 .set nb410nf_isaprod, 208
1260 .set nb410nf_gbscale, 224
1261 .set nb410nf_is3, 240
1262 .set nb410nf_ii3, 244
1263 .set nb410nf_ntia, 248
1264 .set nb410nf_innerjjnr, 252
1265 .set nb410nf_innerk, 256
1266 .set nb410nf_n, 260
1267 .set nb410nf_nn1, 264
1268 .set nb410nf_nri, 268
1269 .set nb410nf_facel, 272
1270 .set nb410nf_ntype, 276
1271 .set nb410nf_nouter, 280
1272 .set nb410nf_ninner, 284
1273 .set nb410nf_salign, 288
## Prologue: set up the %ebp frame and save the six general registers
## that the kernel uses.
1274 pushl %ebp
1275 movl %esp,%ebp
1276 pushl %eax
1277 pushl %ebx
1278 pushl %ecx
1279 pushl %edx
1280 pushl %esi
1281 pushl %edi
1282 subl $292,%esp ## local stack space
## Round %esp down to a multiple of 16 and remember the padding in salign
## so the epilogue can undo it.
1283 movl %esp,%eax
1284 andl $0xf,%eax
1285 subl %eax,%esp
1286 movl %eax,nb410nf_salign(%esp)
1288 emms
1290 ## Move args passed by reference to stack
1291 movl nb410nf_p_nri(%ebp),%ecx
1292 movl nb410nf_p_facel(%ebp),%esi
1293 movl nb410nf_p_ntype(%ebp),%edi
1294 movl (%ecx),%ecx
1295 movl (%esi),%esi
1296 movl (%edi),%edi
1297 movl %ecx,nb410nf_nri(%esp)
1298 movl %esi,nb410nf_facel(%esp) ## copied as raw 32 bits; later read with mulss
1299 movl %edi,nb410nf_ntype(%esp)
1301 ## zero iteration counters
1302 movl $0,%eax
1303 movl %eax,nb410nf_nouter(%esp)
1304 movl %eax,nb410nf_ninner(%esp)
## broadcast *p_gbtabscale to all four lanes of gbtsc
1307 movl nb410nf_p_gbtabscale(%ebp),%eax
1308 movss (%eax),%xmm5
1309 shufps $0,%xmm5,%xmm5
1310 movaps %xmm5,nb410nf_gbtsc(%esp)
1312 ## create constant floating-point factors on stack
1313 movl $0x3f000000,%eax ## constant 0.5 in IEEE (hex)
1314 movl %eax,nb410nf_half(%esp)
1315 movss nb410nf_half(%esp),%xmm1
1316 shufps $0,%xmm1,%xmm1 ## splat to all elements
1317 movaps %xmm1,%xmm2
1318 addps %xmm2,%xmm2 ## constant 1.0
1319 movaps %xmm2,%xmm3
1320 addps %xmm2,%xmm2 ## constant 2.0
1321 addps %xmm2,%xmm3 ## constant 3.0
1322 movaps %xmm1,nb410nf_half(%esp)
1323 movaps %xmm3,nb410nf_three(%esp)
## Work-unit dispatcher. Atomically advances the shared counter at *count
## by one with lock cmpxchg, retrying until the exchange succeeds, and
## uses the old/new counter values as the half-open range [nn0, nn1) of
## outer lists to process.
1325 _nb_kernel410nf_ia32_sse.nb410nf_threadloop:
1326 movl nb410nf_count(%ebp),%esi ## pointer to sync counter
1327 movl (%esi),%eax
1328 _nb_kernel410nf_ia32_sse.nb410nf_spinlock:
1329 movl %eax,%ebx ## ebx=*count=nn0
1330 addl $1,%ebx ## ebx=nn1=nn0+1
1331 lock
1332 cmpxchgl %ebx,(%esi) ## write nn1 to *counter,
1333 ## if it hasnt changed.
1334 ## or reread *counter to eax.
1335 pause ## -> better p4 performance (leaves ZF from cmpxchg untouched)
1336 jnz _nb_kernel410nf_ia32_sse.nb410nf_spinlock
1338 ## clamp: if(nn1>=nri) nn1=nri
1339 movl nb410nf_nri(%esp),%ecx
1340 movl %ecx,%edx
1341 subl %ebx,%ecx
1342 cmovlel %edx,%ebx ## taken when nri-nn1 <= 0
1343 ## Cleared the spinlock if we got here.
1344 ## eax contains nn0, ebx contains nn1.
1345 movl %eax,nb410nf_n(%esp)
1346 movl %ebx,nb410nf_nn1(%esp)
1347 subl %eax,%ebx ## calc number of outer lists
1348 movl %eax,%esi ## copy n to esi (movl keeps the flags from subl)
1349 jg _nb_kernel410nf_ia32_sse.nb410nf_outerstart
1350 jmp _nb_kernel410nf_ia32_sse.nb410nf_end
## Adds the size of the work unit just claimed to the running nouter
## total, then falls through into the outer loop.
1352 _nb_kernel410nf_ia32_sse.nb410nf_outerstart:
1353 ## ebx contains number of outer iterations
1354 addl nb410nf_nouter(%esp),%ebx
1355 movl %ebx,nb410nf_nouter(%esp)
## Outer-loop body for list n (n in esi). Loads the shift vector and the
## i-particle data, broadcasts them into the stack slots ix/iy/iz, iq and
## isai, clears the two energy accumulators, and prepares the inner-loop
## pointer and counter before dispatching to the unrolled or remainder path.
1357 _nb_kernel410nf_ia32_sse.nb410nf_outer:
1358 movl nb410nf_shift(%ebp),%eax ## eax = pointer into shift[]
1359 movl (%eax,%esi,4),%ebx ## ebx=shift[n]
1361 leal (%ebx,%ebx,2),%ebx ## ebx=3*is
1362 movl %ebx,nb410nf_is3(%esp) ## store is3
1364 movl nb410nf_shiftvec(%ebp),%eax ## eax = base of shiftvec[]
## xmm0-xmm2 = shiftvec[is3 .. is3+2]
1366 movss (%eax,%ebx,4),%xmm0
1367 movss 4(%eax,%ebx,4),%xmm1
1368 movss 8(%eax,%ebx,4),%xmm2
1370 movl nb410nf_iinr(%ebp),%ecx ## ecx = pointer into iinr[]
1371 movl (%ecx,%esi,4),%ebx ## ebx =ii
## iq = facel*charge[ii], broadcast to four lanes
1373 movl nb410nf_charge(%ebp),%edx
1374 movss (%edx,%ebx,4),%xmm3
1375 mulss nb410nf_facel(%esp),%xmm3
1376 shufps $0,%xmm3,%xmm3
1378 movl nb410nf_invsqrta(%ebp),%edx ## load invsqrta[ii]
1379 movss (%edx,%ebx,4),%xmm4
1380 shufps $0,%xmm4,%xmm4
## ntia = 2*ntype*type[ii]
1382 movl nb410nf_type(%ebp),%edx
1383 movl (%edx,%ebx,4),%edx
1384 imull nb410nf_ntype(%esp),%edx
1385 shll %edx
1386 movl %edx,nb410nf_ntia(%esp)
1388 leal (%ebx,%ebx,2),%ebx ## ebx = 3*ii=ii3
1389 movl nb410nf_pos(%ebp),%eax ## eax = base of pos[]
## i coordinates = shift vector + pos[ii3 .. ii3+2]
1391 addss (%eax,%ebx,4),%xmm0
1392 addss 4(%eax,%ebx,4),%xmm1
1393 addss 8(%eax,%ebx,4),%xmm2
1395 movaps %xmm3,nb410nf_iq(%esp)
1396 movaps %xmm4,nb410nf_isai(%esp)
1398 shufps $0,%xmm0,%xmm0
1399 shufps $0,%xmm1,%xmm1
1400 shufps $0,%xmm2,%xmm2
1402 movaps %xmm0,nb410nf_ix(%esp)
1403 movaps %xmm1,nb410nf_iy(%esp)
1404 movaps %xmm2,nb410nf_iz(%esp)
1406 movl %ebx,nb410nf_ii3(%esp)
1408 ## clear vctot and Vvdwtot
1409 xorps %xmm4,%xmm4
1410 movaps %xmm4,nb410nf_vctot(%esp)
1411 movaps %xmm4,nb410nf_Vvdwtot(%esp)
1413 movl nb410nf_jindex(%ebp),%eax
1414 movl (%eax,%esi,4),%ecx ## jindex[n]
1415 movl 4(%eax,%esi,4),%edx ## jindex[n+1]
1416 subl %ecx,%edx ## number of innerloop atoms
1418 movl nb410nf_pos(%ebp),%esi
1419 movl nb410nf_faction(%ebp),%edi ## pointer loaded; this kernel never stores through it
1420 movl nb410nf_jjnr(%ebp),%eax
1421 shll $2,%ecx
1422 addl %ecx,%eax
1423 movl %eax,nb410nf_innerjjnr(%esp) ## pointer to jjnr[nj0]
1424 movl %edx,%ecx
1425 subl $4,%edx
1426 addl nb410nf_ninner(%esp),%ecx
1427 movl %ecx,nb410nf_ninner(%esp)
1428 addl $0,%edx ## re-derive the flags for edx; the addl into ecx above overwrote them
1429 movl %edx,nb410nf_innerk(%esp) ## number of innerloop atoms minus 4
1430 jge _nb_kernel410nf_ia32_sse.nb410nf_unroll_loop
1431 jmp _nb_kernel410nf_ia32_sse.nb410nf_finish_inner
## Four-way unrolled inner loop. Each pass reads four j indices from
## jjnr, gathers their invsqrta, charge, type and position entries, and
## adds four lanes of tabulated Coulomb-type energy to vctot and four
## lanes of Lennard-Jones energy to Vvdwtot. No forces are produced.
1432 _nb_kernel410nf_ia32_sse.nb410nf_unroll_loop:
1433 ## quad-unroll innerloop here
1434 movl nb410nf_innerjjnr(%esp),%edx ## pointer to jjnr[k]
1435 movl (%edx),%eax
1436 movl 4(%edx),%ebx
1437 movl 8(%edx),%ecx
1438 movl 12(%edx),%edx ## eax-edx=jnr1-4
1439 addl $16,nb410nf_innerjjnr(%esp) ## advance pointer (unrolled 4)
1441 ## load isa2
1442 movl nb410nf_invsqrta(%ebp),%esi
1443 movss (%esi,%eax,4),%xmm3
1444 movss (%esi,%ecx,4),%xmm4
1445 movss (%esi,%ebx,4),%xmm6
1446 movss (%esi,%edx,4),%xmm7
1447 movaps nb410nf_isai(%esp),%xmm2
1448 shufps $0,%xmm6,%xmm3
1449 shufps $0,%xmm7,%xmm4
1450 shufps $136,%xmm4,%xmm3 ## mask 10001000b: xmm3 = invsqrta of j1..j4
1451 mulps %xmm3,%xmm2
1453 movaps %xmm2,nb410nf_isaprod(%esp)
1454 movaps %xmm2,%xmm1
1455 mulps nb410nf_gbtsc(%esp),%xmm1
1456 movaps %xmm1,nb410nf_gbscale(%esp)
1458 movl nb410nf_charge(%ebp),%esi ## base of charge[]
1460 movss (%esi,%eax,4),%xmm3
1461 movss (%esi,%ecx,4),%xmm4
1462 movss (%esi,%ebx,4),%xmm6
1463 movss (%esi,%edx,4),%xmm7
1465 mulps nb410nf_iq(%esp),%xmm2
1466 shufps $0,%xmm6,%xmm3
1467 shufps $0,%xmm7,%xmm4
1468 shufps $136,%xmm4,%xmm3 ## mask 10001000b: all charges in xmm3
1469 mulps %xmm2,%xmm3
1470 movaps %xmm3,nb410nf_qq(%esp)
## park jnr1-4 in mm0-mm3 while eax-edx are reused for the type lookup
1472 movd %eax,%mm0
1473 movd %ebx,%mm1
1474 movd %ecx,%mm2
1475 movd %edx,%mm3
1477 movl nb410nf_type(%ebp),%esi
1478 movl (%esi,%eax,4),%eax
1479 movl (%esi,%ebx,4),%ebx
1480 movl (%esi,%ecx,4),%ecx
1481 movl (%esi,%edx,4),%edx
1482 movl nb410nf_vdwparam(%ebp),%esi
1483 shll %eax
1484 shll %ebx
1485 shll %ecx
1486 shll %edx
1487 movl nb410nf_ntia(%esp),%edi
1488 addl %edi,%eax
1489 addl %edi,%ebx
1490 addl %edi,%ecx
1491 addl %edi,%edx
## each vdwparam entry is a pair of floats; the shuffles below split the
## four pairs into one vector of first elements (stored as c6) and one of
## second elements (stored as c12)
1493 movlps (%esi,%eax,4),%xmm6
1494 movlps (%esi,%ecx,4),%xmm7
1495 movhps (%esi,%ebx,4),%xmm6
1496 movhps (%esi,%edx,4),%xmm7
1498 movaps %xmm6,%xmm4
1499 shufps $136,%xmm7,%xmm4 ## mask 10001000b
1500 shufps $221,%xmm7,%xmm6 ## mask 11011101b
## restore jnr1-4
1502 movd %mm0,%eax
1503 movd %mm1,%ebx
1504 movd %mm2,%ecx
1505 movd %mm3,%edx
1507 movaps %xmm4,nb410nf_c6(%esp)
1508 movaps %xmm6,nb410nf_c12(%esp)
1510 movl nb410nf_pos(%ebp),%esi ## base of pos[]
1512 leal (%eax,%eax,2),%eax ## replace jnr with j3
1513 leal (%ebx,%ebx,2),%ebx
1515 leal (%ecx,%ecx,2),%ecx ## replace jnr with j3
1516 leal (%edx,%edx,2),%edx
1518 ## move four coordinates to xmm0-xmm2
1520 movlps (%esi,%eax,4),%xmm4
1521 movlps (%esi,%ecx,4),%xmm5
1522 movss 8(%esi,%eax,4),%xmm2
1523 movss 8(%esi,%ecx,4),%xmm6
1525 movhps (%esi,%ebx,4),%xmm4
1526 movhps (%esi,%edx,4),%xmm5
1528 movss 8(%esi,%ebx,4),%xmm0
1529 movss 8(%esi,%edx,4),%xmm1
1531 shufps $0,%xmm0,%xmm2
1532 shufps $0,%xmm1,%xmm6
1534 movaps %xmm4,%xmm0
1535 movaps %xmm4,%xmm1
1537 shufps $136,%xmm6,%xmm2 ## mask 10001000b: xmm2 = z of j1..j4
1539 shufps $136,%xmm5,%xmm0 ## mask 10001000b: xmm0 = x of j1..j4
1540 shufps $221,%xmm5,%xmm1 ## mask 11011101b: xmm1 = y of j1..j4
1542 ## move ix-iz to xmm4-xmm6
1543 movaps nb410nf_ix(%esp),%xmm4
1544 movaps nb410nf_iy(%esp),%xmm5
1545 movaps nb410nf_iz(%esp),%xmm6
1547 ## calc dr
1548 subps %xmm0,%xmm4
1549 subps %xmm1,%xmm5
1550 subps %xmm2,%xmm6
1552 ## square it
1553 mulps %xmm4,%xmm4
1554 mulps %xmm5,%xmm5
1555 mulps %xmm6,%xmm6
1556 addps %xmm5,%xmm4
1557 addps %xmm6,%xmm4
1558 ## rsq in xmm4
## rinv = 0.5*lu*(3.0 - rsq*lu*lu): the rsqrtps estimate lu followed by
## one Newton-Raphson refinement step
1560 rsqrtps %xmm4,%xmm5
1561 ## lookup seed in xmm5
1562 movaps %xmm5,%xmm2
1563 mulps %xmm5,%xmm5
1564 movaps nb410nf_three(%esp),%xmm1
1565 mulps %xmm4,%xmm5 ## rsq*lu*lu
1566 movaps nb410nf_half(%esp),%xmm0
1567 subps %xmm5,%xmm1 ## 3.0-rsq*lu*lu
1568 mulps %xmm2,%xmm1
1569 mulps %xmm1,%xmm0 ## xmm0=rinv
1570 mulps %xmm0,%xmm4 ## xmm4=r
1571 mulps nb410nf_gbscale(%esp),%xmm4
## split the scaled distance into an integer table index (truncated, in
## mm6/mm7) and the fractional remainder eps
1573 movhlps %xmm4,%xmm5
1574 cvttps2pi %xmm4,%mm6
1575 cvttps2pi %xmm5,%mm7 ## mm6/mm7 contain lu indices
1576 cvtpi2ps %mm6,%xmm6
1577 cvtpi2ps %mm7,%xmm5
1578 movlhps %xmm5,%xmm6
1579 subps %xmm6,%xmm4
1580 movaps %xmm4,%xmm1 ## xmm1=eps
1581 movaps %xmm1,%xmm2
1582 mulps %xmm2,%xmm2 ## xmm2=eps2
1583 pslld $2,%mm6 ## index*4: four floats per table point
1584 pslld $2,%mm7
## eax-edx (now j3 offsets) are copied to mm0-mm3 but are not read back
## within this block
1586 movd %eax,%mm0
1587 movd %ebx,%mm1
1588 movd %ecx,%mm2
1589 movd %edx,%mm3
1591 movl nb410nf_GBtab(%ebp),%esi
1592 movd %mm6,%eax
1593 psrlq $32,%mm6
1594 movd %mm7,%ecx
1595 psrlq $32,%mm7
1596 movd %mm6,%ebx
1597 movd %mm7,%edx
1599 ## load coulomb table
## movaps is used, so the table entries must be 16-byte aligned
1600 movaps (%esi,%eax,4),%xmm4
1601 movaps (%esi,%ebx,4),%xmm5
1602 movaps (%esi,%ecx,4),%xmm6
1603 movaps (%esi,%edx,4),%xmm7
1604 ## transpose, using xmm3 for scratch
1605 movaps %xmm6,%xmm3
1606 shufps $0xEE,%xmm7,%xmm3
1607 shufps $0x44,%xmm7,%xmm6
1608 movaps %xmm4,%xmm7
1609 shufps $0xEE,%xmm5,%xmm7
1610 shufps $0x44,%xmm5,%xmm4
1611 movaps %xmm4,%xmm5
1612 shufps $0xDD,%xmm6,%xmm5
1613 shufps $0x88,%xmm6,%xmm4
1614 movaps %xmm7,%xmm6
1615 shufps $0x88,%xmm3,%xmm6
1616 shufps $0xDD,%xmm3,%xmm7
1617 ## coulomb table ready, in xmm4-xmm7
## VV = Y + eps*(F + G*eps + H*eps2), with Y,F,G,H in xmm4..xmm7
1618 mulps %xmm1,%xmm6 ## xmm6=Geps
1619 mulps %xmm2,%xmm7 ## xmm7=Heps2
1621 addps %xmm6,%xmm5
1622 addps %xmm7,%xmm5 ## xmm5=Fp
1623 movaps nb410nf_qq(%esp),%xmm3
1624 mulps %xmm1,%xmm5 ## xmm5=eps*Fp
1625 addps %xmm4,%xmm5 ## xmm5=VV
1626 mulps %xmm3,%xmm5 ## vcoul=qq*VV
1627 ## update vctot
1628 addps nb410nf_vctot(%esp),%xmm5
1629 movaps %xmm5,nb410nf_vctot(%esp)
1631 ## L-J
1632 movaps %xmm0,%xmm4
1633 mulps %xmm0,%xmm4 ## xmm4=rinvsq
1635 movaps %xmm4,%xmm6
1636 mulps %xmm4,%xmm6
1638 mulps %xmm4,%xmm6 ## xmm6=rinvsix
1639 movaps %xmm6,%xmm4
1640 mulps %xmm4,%xmm4 ## xmm4=rinvtwelve
1641 mulps nb410nf_c6(%esp),%xmm6
1642 mulps nb410nf_c12(%esp),%xmm4
## Vvdwtot += c12*rinvtwelve - c6*rinvsix
1643 movaps nb410nf_Vvdwtot(%esp),%xmm7
1644 addps %xmm4,%xmm7
1645 subps %xmm6,%xmm7
1646 movaps %xmm7,nb410nf_Vvdwtot(%esp)
1648 ## should we do one more iteration?
1649 subl $4,nb410nf_innerk(%esp)
1650 jl _nb_kernel410nf_ia32_sse.nb410nf_finish_inner
1651 jmp _nb_kernel410nf_ia32_sse.nb410nf_unroll_loop
_nb_kernel410nf_ia32_sse.nb410nf_finish_inner:
        ## innerk arrives biased by -4. Remove the bias so it holds the
        ## number of j particles still to do, then test bit 1 to see
        ## whether a pair must be processed.
        movl    nb410nf_innerk(%esp),%edx
        addl    $4,%edx
        movl    %edx,nb410nf_innerk(%esp)
        andl    $2,%edx
        ## No pair: go check for a single leftover. Otherwise fall
        ## through into nb410nf_dopair, which must stay directly below.
        jz      _nb_kernel410nf_ia32_sse.nb410nf_checksingle
_nb_kernel410nf_ia32_sse.nb410nf_dopair:
        ## Two-particle remainder of the inner loop. Uses the same packed
        ## arithmetic as the unrolled loop, but only lanes 0 and 1 carry
        ## real j particles. Lanes 2 and 3 must contribute exactly zero,
        ## because vctot and Vvdwtot are later summed over all four lanes.
        movl    nb410nf_innerjjnr(%esp),%ecx
        movl    (%ecx),%eax                     ## eax = jnr1
        movl    4(%ecx),%ebx                    ## ebx = jnr2
        addl    $8,nb410nf_innerjjnr(%esp)      ## advance past two jjnr entries

        xorps   %xmm2,%xmm2
        movaps  %xmm2,%xmm6

        ## load isa2
        movl    nb410nf_invsqrta(%ebp),%esi
        movss   (%esi,%eax,4),%xmm2
        movss   (%esi,%ebx,4),%xmm3
        unpcklps %xmm3,%xmm2                    ## isa2 in xmm2(0,1); lanes 2,3 are zero
        mulps   nb410nf_isai(%esp),%xmm2
        movaps  %xmm2,nb410nf_isaprod(%esp)
        movaps  %xmm2,%xmm1
        mulps   nb410nf_gbtsc(%esp),%xmm1
        movaps  %xmm1,nb410nf_gbscale(%esp)

        movl    nb410nf_charge(%ebp),%esi       ## base of charge[]
        movss   (%esi,%eax,4),%xmm3
        movss   (%esi,%ebx,4),%xmm6
        unpcklps %xmm6,%xmm3                    ## xmm3(0,1) has the charges; lanes 2,3 are zero

        mulps   nb410nf_iq(%esp),%xmm2
        mulps   %xmm2,%xmm3
        movaps  %xmm3,nb410nf_qq(%esp)

        movl    nb410nf_type(%ebp),%esi
        movl    %eax,%ecx
        movl    %ebx,%edx
        movl    (%esi,%ecx,4),%ecx
        movl    (%esi,%edx,4),%edx
        movl    nb410nf_vdwparam(%ebp),%esi
        shll    %ecx
        shll    %edx
        movl    nb410nf_ntia(%esp),%edi
        addl    %edi,%ecx
        addl    %edi,%edx
        movlps  (%esi,%ecx,4),%xmm6
        movhps  (%esi,%edx,4),%xmm6
        movl    nb410nf_pos(%ebp),%edi

        ## xmm7 supplies the upper halves of c6/c12 below and must be
        ## zero. It is otherwise stale here (for example the Vvdwtot value
        ## left by the unrolled loop). Lanes 2,3 see duplicated
        ## coordinates, so a non-zero c6/c12 there would add spurious
        ## energy to Vvdwtot.
        xorps   %xmm7,%xmm7
        movaps  %xmm6,%xmm4
        shufps  $8,%xmm4,%xmm4                  ## mask 00001000b: first pair elements (c6) in lanes 0,1
        shufps  $13,%xmm6,%xmm6                 ## mask 00001101b: second pair elements (c12) in lanes 0,1
        movlhps %xmm7,%xmm4                     ## zero lanes 2,3 of c6
        movlhps %xmm7,%xmm6                     ## zero lanes 2,3 of c12

        movaps  %xmm4,nb410nf_c6(%esp)
        movaps  %xmm6,nb410nf_c12(%esp)

        leal    (%eax,%eax,2),%eax              ## j3 = 3*jnr
        leal    (%ebx,%ebx,2),%ebx
        ## move coordinates to xmm0-xmm2
        movlps  (%edi,%eax,4),%xmm1
        movss   8(%edi,%eax,4),%xmm2
        movhps  (%edi,%ebx,4),%xmm1
        movss   8(%edi,%ebx,4),%xmm0

        movlhps %xmm7,%xmm3                     ## xmm3 is reloaded from qq before its next use

        shufps  $0,%xmm0,%xmm2

        movaps  %xmm1,%xmm0

        shufps  $136,%xmm2,%xmm2                ## mask 10001000b: z1 z2 z1 z2

        shufps  $136,%xmm0,%xmm0                ## mask 10001000b: x1 x2 x1 x2
        shufps  $221,%xmm1,%xmm1                ## mask 11011101b: y1 y2 y1 y2

        movl    nb410nf_faction(%ebp),%edi      ## edi is not read again in this block
        ## move ix-iz to xmm4-xmm6
        movaps  nb410nf_ix(%esp),%xmm4
        movaps  nb410nf_iy(%esp),%xmm5
        movaps  nb410nf_iz(%esp),%xmm6

        ## calc dr
        subps   %xmm0,%xmm4
        subps   %xmm1,%xmm5
        subps   %xmm2,%xmm6

        ## square it
        mulps   %xmm4,%xmm4
        mulps   %xmm5,%xmm5
        mulps   %xmm6,%xmm6
        addps   %xmm5,%xmm4
        addps   %xmm6,%xmm4
        ## rsq in xmm4

        ## rinv = 0.5*lu*(3.0 - rsq*lu*lu), one Newton-Raphson step on rsqrtps
        rsqrtps %xmm4,%xmm5
        ## lookup seed in xmm5
        movaps  %xmm5,%xmm2
        mulps   %xmm5,%xmm5
        movaps  nb410nf_three(%esp),%xmm1
        mulps   %xmm4,%xmm5                     ## rsq*lu*lu
        movaps  nb410nf_half(%esp),%xmm0
        subps   %xmm5,%xmm1                     ## 3.0-rsq*lu*lu
        mulps   %xmm2,%xmm1
        mulps   %xmm1,%xmm0                     ## xmm0=rinv
        mulps   %xmm0,%xmm4                     ## xmm4=r
        mulps   nb410nf_gbscale(%esp),%xmm4

        cvttps2pi %xmm4,%mm6                    ## mm6 contain lu indices
        cvtpi2ps %mm6,%xmm6
        subps   %xmm6,%xmm4
        movaps  %xmm4,%xmm1                     ## xmm1=eps
        movaps  %xmm1,%xmm2
        mulps   %xmm2,%xmm2                     ## xmm2=eps2

        pslld   $2,%mm6                         ## index*4: four floats per table point

        movl    nb410nf_GBtab(%ebp),%esi
        movd    %mm6,%ecx
        psrlq   $32,%mm6
        movd    %mm6,%edx

        ## load coulomb table
        movaps  (%esi,%ecx,4),%xmm4
        movaps  (%esi,%edx,4),%xmm7
        ## transpose the two table points
        movaps  %xmm4,%xmm6
        unpcklps %xmm7,%xmm4                    ## Y1 Y2 F1 F2
        unpckhps %xmm7,%xmm6                    ## G1 G2 H1 H2
        movhlps %xmm4,%xmm5                     ## F1 F2
        movhlps %xmm6,%xmm7                     ## H1 H2
        ## coulomb table ready, in xmm4-xmm7

        mulps   %xmm1,%xmm6                     ## xmm6=Geps
        mulps   %xmm2,%xmm7                     ## xmm7=Heps2
        addps   %xmm6,%xmm5
        addps   %xmm7,%xmm5                     ## xmm5=Fp
        movaps  nb410nf_qq(%esp),%xmm3
        mulps   %xmm1,%xmm5                     ## xmm5=eps*Fp
        addps   %xmm4,%xmm5                     ## xmm5=VV
        mulps   %xmm3,%xmm5                     ## vcoul=qq*VV (qq is zero in lanes 2,3)

        ## update vctot
        addps   nb410nf_vctot(%esp),%xmm5
        movaps  %xmm5,nb410nf_vctot(%esp)

        ## L-J
        movaps  %xmm0,%xmm4
        mulps   %xmm0,%xmm4                     ## xmm4=rinvsq

        movaps  %xmm4,%xmm6
        mulps   %xmm4,%xmm6

        mulps   %xmm4,%xmm6                     ## xmm6=rinvsix
        movaps  %xmm6,%xmm4
        mulps   %xmm4,%xmm4                     ## xmm4=rinvtwelve
        mulps   nb410nf_c6(%esp),%xmm6
        mulps   nb410nf_c12(%esp),%xmm4
        ## Vvdwtot += c12*rinvtwelve - c6*rinvsix
        movaps  nb410nf_Vvdwtot(%esp),%xmm7
        addps   %xmm4,%xmm7
        subps   %xmm6,%xmm7
        movaps  %xmm7,nb410nf_Vvdwtot(%esp)
_nb_kernel410nf_ia32_sse.nb410nf_checksingle:
        ## Bit 0 of innerk tells whether one last j particle is left.
        movl    nb410nf_innerk(%esp),%edx
        andl    $1,%edx
        ## Nothing left: finish this outer list. Otherwise fall through
        ## into nb410nf_dosingle, which must stay directly below.
        jz      _nb_kernel410nf_ia32_sse.nb410nf_updateouterdata
_nb_kernel410nf_ia32_sse.nb410nf_dosingle:
        ## Single-particle remainder of the inner loop. Everything is done
        ## with scalar (ss) arithmetic, and only lane 0 of vctot and
        ## Vvdwtot is updated.
        movl    nb410nf_charge(%ebp),%esi
        movl    nb410nf_invsqrta(%ebp),%edx
        movl    nb410nf_pos(%ebp),%edi
        movl    nb410nf_innerjjnr(%esp),%ecx
        movl    (%ecx),%eax                     ## eax = jnr
        xorps   %xmm2,%xmm2
        movaps  %xmm2,%xmm6
        movss   (%edx,%eax,4),%xmm2             ## isa2
        mulss   nb410nf_isai(%esp),%xmm2
        movss   %xmm2,nb410nf_isaprod(%esp)
        movss   %xmm2,%xmm1
        mulss   nb410nf_gbtsc(%esp),%xmm1
        movss   %xmm1,nb410nf_gbscale(%esp)

        mulss   nb410nf_iq(%esp),%xmm2
        movss   (%esi,%eax,4),%xmm6             ## xmm6(0) has the charge
        mulss   %xmm2,%xmm6
        movss   %xmm6,nb410nf_qq(%esp)

        movl    nb410nf_type(%ebp),%esi
        movl    %eax,%ecx
        movl    (%esi,%ecx,4),%ecx
        movl    nb410nf_vdwparam(%ebp),%esi
        shll    %ecx
        addl    nb410nf_ntia(%esp),%ecx
        movlps  (%esi,%ecx,4),%xmm6             ## lanes 0,1 = the parameter pair
        movaps  %xmm6,%xmm4
        shufps  $252,%xmm4,%xmm4                ## mask 11111100b: first element (c6) in lane 0
        shufps  $253,%xmm6,%xmm6                ## mask 11111101b: second element (c12) in lane 0

        movaps  %xmm4,nb410nf_c6(%esp)
        movaps  %xmm6,nb410nf_c12(%esp)

        leal    (%eax,%eax,2),%eax              ## j3 = 3*jnr

        ## move coordinates to xmm0-xmm2
        movss   (%edi,%eax,4),%xmm0
        movss   4(%edi,%eax,4),%xmm1
        movss   8(%edi,%eax,4),%xmm2

        movaps  nb410nf_ix(%esp),%xmm4
        movaps  nb410nf_iy(%esp),%xmm5
        movaps  nb410nf_iz(%esp),%xmm6

        ## calc dr
        subss   %xmm0,%xmm4
        subss   %xmm1,%xmm5
        subss   %xmm2,%xmm6

        ## square it
        mulss   %xmm4,%xmm4
        mulss   %xmm5,%xmm5
        mulss   %xmm6,%xmm6
        addss   %xmm5,%xmm4
        addss   %xmm6,%xmm4
        ## rsq in xmm4

        ## rinv = 0.5*lu*(3.0 - rsq*lu*lu), one Newton-Raphson step on rsqrtss
        rsqrtss %xmm4,%xmm5
        ## lookup seed in xmm5
        movaps  %xmm5,%xmm2
        mulss   %xmm5,%xmm5
        movss   nb410nf_three(%esp),%xmm1
        mulss   %xmm4,%xmm5                     ## rsq*lu*lu
        movss   nb410nf_half(%esp),%xmm0
        subss   %xmm5,%xmm1                     ## 3.0-rsq*lu*lu
        mulss   %xmm2,%xmm1
        mulss   %xmm1,%xmm0                     ## xmm0=rinv

        mulss   %xmm0,%xmm4                     ## xmm4=r
        mulss   nb410nf_gbscale(%esp),%xmm4

        cvttss2si %xmm4,%ebx                    ## ebx = truncated table index
        cvtsi2ss %ebx,%xmm6
        subss   %xmm6,%xmm4
        movaps  %xmm4,%xmm1                     ## xmm1=eps
        movaps  %xmm1,%xmm2
        mulss   %xmm2,%xmm2                     ## xmm2=eps2

        shll    $2,%ebx                         ## index*4: four floats per table point
        movl    nb410nf_GBtab(%ebp),%esi

        ## spread the table point Y F G H over lane 0 of xmm4..xmm7
        movaps  (%esi,%ebx,4),%xmm4
        movhlps %xmm4,%xmm6
        movaps  %xmm4,%xmm5
        movaps  %xmm6,%xmm7
        shufps  $1,%xmm5,%xmm5
        shufps  $1,%xmm7,%xmm7
        ## table ready in xmm4-xmm7

        mulss   %xmm1,%xmm6                     ## xmm6=Geps
        mulss   %xmm2,%xmm7                     ## xmm7=Heps2
        addss   %xmm6,%xmm5
        addss   %xmm7,%xmm5                     ## xmm5=Fp
        movss   nb410nf_qq(%esp),%xmm3
        mulss   %xmm1,%xmm5                     ## xmm5=eps*Fp
        addss   %xmm4,%xmm5                     ## xmm5=VV
        mulss   %xmm3,%xmm5                     ## vcoul=qq*VV
        addss   nb410nf_vctot(%esp),%xmm5
        movss   %xmm5,nb410nf_vctot(%esp)

        ## L-J
        movaps  %xmm0,%xmm4
        mulss   %xmm0,%xmm4                     ## xmm4=rinvsq

        movaps  %xmm4,%xmm6
        mulss   %xmm4,%xmm6

        mulss   %xmm4,%xmm6                     ## xmm6=rinvsix
        movaps  %xmm6,%xmm4
        mulss   %xmm4,%xmm4                     ## xmm4=rinvtwelve
        mulss   nb410nf_c6(%esp),%xmm6
        mulss   nb410nf_c12(%esp),%xmm4
        ## Vvdwtot(0) += c12*rinvtwelve - c6*rinvsix. Scalar add/sub is
        ## used because only lane 0 is loaded and stored here.
        movss   nb410nf_Vvdwtot(%esp),%xmm7
        addss   %xmm4,%xmm7
        subss   %xmm6,%xmm7
        movss   %xmm7,nb410nf_Vvdwtot(%esp)
## End of one outer list: horizontally sums the four lanes of vctot and
## Vvdwtot, adds the sums into Vc[gid[n]] and Vvdw[gid[n]], then either
## moves on to the next list of this work unit or leaves via outerend.
1946 _nb_kernel410nf_ia32_sse.nb410nf_updateouterdata:
1947 ## get n from stack
1948 movl nb410nf_n(%esp),%esi
1949 ## get group index for i particle
1950 movl nb410nf_gid(%ebp),%edx ## base of gid[]
1951 movl (%edx,%esi,4),%edx ## ggid=gid[n]
1953 ## accumulate total potential energy and update it
1954 movaps nb410nf_vctot(%esp),%xmm7
1955 ## accumulate
1956 movhlps %xmm7,%xmm6
1957 addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now
1958 movaps %xmm7,%xmm6
1959 shufps $1,%xmm6,%xmm6
1960 addss %xmm6,%xmm7 ## lane 0 = sum of all four lanes
1962 ## add earlier value from mem
1963 movl nb410nf_Vc(%ebp),%eax
1964 addss (%eax,%edx,4),%xmm7
1965 ## move back to mem
1966 movss %xmm7,(%eax,%edx,4)
1968 ## accumulate total lj energy and update it
1969 movaps nb410nf_Vvdwtot(%esp),%xmm7
1970 ## accumulate
1971 movhlps %xmm7,%xmm6
1972 addps %xmm6,%xmm7 ## pos 0-1 in xmm7 have the sum now
1973 movaps %xmm7,%xmm6
1974 shufps $1,%xmm6,%xmm6
1975 addss %xmm6,%xmm7 ## lane 0 = sum of all four lanes
1977 ## add earlier value from mem
1978 movl nb410nf_Vvdw(%ebp),%eax
1979 addss (%eax,%edx,4),%xmm7
1980 ## move back to mem
1981 movss %xmm7,(%eax,%edx,4)
1983 ## finish if last
1984 movl nb410nf_nn1(%esp),%ecx
1985 ## esi already loaded with n
1986 incl %esi
1987 subl %esi,%ecx ## zero when n+1 == nn1, i.e. the work unit is done
1988 jz _nb_kernel410nf_ia32_sse.nb410nf_outerend
1990 ## not last, iterate outer loop once more!
1991 movl %esi,nb410nf_n(%esp)
1992 jmp _nb_kernel410nf_ia32_sse.nb410nf_outer
_nb_kernel410nf_ia32_sse.nb410nf_outerend:
        ## This work unit is finished. esi still holds the incremented
        ## outer-list counter, which equals nn1 when control arrives here.
        movl    nb410nf_nri(%esp),%ecx
        subl    %esi,%ecx                       ## ecx = nri - counter
        ## Lists remain: go claim another work unit. Otherwise fall
        ## through into nb410nf_end, which must stay directly below.
        jnz     _nb_kernel410nf_ia32_sse.nb410nf_threadloop
## Kernel epilogue: resets MMX state, writes the iteration counters back
## through the outeriter/inneriter pointers, releases the aligned local
## frame and restores the registers saved by the prologue.
2001 _nb_kernel410nf_ia32_sse.nb410nf_end:
2002 emms
2004 movl nb410nf_nouter(%esp),%eax
2005 movl nb410nf_ninner(%esp),%ebx
2006 movl nb410nf_outeriter(%ebp),%ecx
2007 movl nb410nf_inneriter(%ebp),%edx
2008 movl %eax,(%ecx)
2009 movl %ebx,(%edx)
## undo the alignment padding recorded in salign, then the 292-byte frame
2011 movl nb410nf_salign(%esp),%eax
2012 addl %eax,%esp
2013 addl $292,%esp
## pops mirror the prologue pushes in reverse order
2014 popl %edi
2015 popl %esi
2016 popl %edx
2017 popl %ecx
2018 popl %ebx
2019 popl %eax
## NOTE(review): no ret is visible after leave within this excerpt; confirm
## that a ret follows, otherwise execution runs on past the function end.
2020 leave