beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / ia64 / mul_2.asm
blob5343f64427e480dc26b85401200a1e415ec5f6c4
1 dnl IA-64 mpn_mul_2 -- Multiply an n-limb number with a 2-limb number and
2 dnl store the result to a (n+1)-limb number.
4 dnl Contributed to the GNU project by Torbjorn Granlund.
6 dnl Copyright 2004, 2011 Free Software Foundation, Inc.
8 dnl This file is part of the GNU MP Library.
9 dnl
10 dnl The GNU MP Library is free software; you can redistribute it and/or modify
11 dnl it under the terms of either:
12 dnl
13 dnl * the GNU Lesser General Public License as published by the Free
14 dnl Software Foundation; either version 3 of the License, or (at your
15 dnl option) any later version.
16 dnl
17 dnl or
18 dnl
19 dnl * the GNU General Public License as published by the Free Software
20 dnl Foundation; either version 2 of the License, or (at your option) any
21 dnl later version.
22 dnl
23 dnl or both in parallel, as here.
24 dnl
25 dnl The GNU MP Library is distributed in the hope that it will be useful, but
26 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
28 dnl for more details.
29 dnl
30 dnl You should have received copies of the GNU General Public License and the
31 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
32 dnl see https://www.gnu.org/licenses/.
34 include(`../config.m4')
36 C cycles/limb
37 C Itanium: ?
38 C Itanium 2: 1.5
40 C TODO
41 C * Clean up variable names, and try to decrease the number of distinct
42 C registers used.
43 C * Clean up feed-in code to not require zeroing several registers.
44 C * Make sure we don't depend on uninitialized predicate registers.
45 C * Could perhaps save a few cycles by using 1 c/l carry propagation in
46 C wind-down code.
47 C * Ultimately rewrite. The problem with this code is that it first uses a
48 C loaded u value in one xma pair, then leaves it live over several unrelated
49 C xma pairs, before it uses it again. It should actually be quite possible
50 C to just swap some aligned xma pairs around. But we should then schedule
51 C u loads further from the first use.
53 C INPUT PARAMETERS
C rp, up, n and vp name the argument registers r32-r35.
54 define(`rp',`r32')
55 define(`up',`r33')
56 define(`n',`r34')
57 define(`vp',`r35')
C NOTE(review): srp is defined but not referenced in the code shown in this file.
59 define(`srp',`r3')
C v0 and v1 receive the two limbs loaded from [vp] by the entry code.
61 define(`v0',`f6')
62 define(`v1',`f7')
C s0 is the sum limb that gets stored to [rp]; acc0 is the accumulator it is
C formed from.  r14 is also used under its raw name for n mod 4 in the entry code.
64 define(`s0',`r14')
65 define(`acc0',`r15')
C pr0_k: integer copy (getfsig) of fp0b_k.
67 define(`pr0_0',`r16') define(`pr0_1',`r17')
68 define(`pr0_2',`r18') define(`pr0_3',`r19')
C pr1_k: integer copy (getfsig) of fp1b_k.
70 define(`pr1_0',`r20') define(`pr1_1',`r21')
71 define(`pr1_2',`r22') define(`pr1_3',`r23')
C acc1_k: integer copy (getfsig) of fp2a_k.
73 define(`acc1_0',`r24') define(`acc1_1',`r25')
74 define(`acc1_2',`r26') define(`acc1_3',`r27')
C Unnamed placeholders for r28-r31.  r31 is still used under its raw name as a
C scratch register in the n = 2 path.
76 dnl define(`',`r28')
77 dnl define(`',`r29')
78 dnl define(`',`r30')
79 dnl define(`',`r31')
C fp0b_k: result of xma.l   u, v0, f0      (low half of u * v0).
81 define(`fp0b_0',`f8') define(`fp0b_1',`f9')
82 define(`fp0b_2',`f10') define(`fp0b_3',`f11')
C fp1a_k: result of xma.hu  u, v0, f0      (high half of u * v0).
84 define(`fp1a_0',`f12') define(`fp1a_1',`f13')
85 define(`fp1a_2',`f14') define(`fp1a_3',`f15')
C fp1b_k: result of xma.l   u, v1, fp1a_k  (low half of u * v1 + fp1a_k).
87 define(`fp1b_0',`f32') define(`fp1b_1',`f33')
88 define(`fp1b_2',`f34') define(`fp1b_3',`f35')
C fp2a_k: result of xma.hu  u, v1, fp1a_k  (high half of u * v1 + fp1a_k).
90 define(`fp2a_0',`f36') define(`fp2a_1',`f37')
91 define(`fp2a_2',`f38') define(`fp2a_3',`f39')
C u_0..u_3: u limbs loaded from [up] after the first two.
93 define(`u_0',`f44') define(`u_1',`f45')
94 define(`u_2',`f46') define(`u_3',`f47')
C ux and uy: the first and second u limb, loaded by the entry code.
96 define(`ux',`f49')
97 define(`uy',`f51')
C Entry code: load the first two u limbs and both v limbs, save ar.lc in r2,
C and dispatch on n mod 4 to one of four feed-in sequences.
C NOTE(review): in this copy the lines that the embedded numbering skips
C (for instance 112-114, 121, 128, 135-136) are absent, so some bundle
C closers and instruction-group stops may be missing; compare with upstream
C before assembling.
99 ASM_START()
100 PROLOGUE(mpn_mul_2)
101 .prologue
C Unwind annotation matching the mov r2 = ar.lc below.
102 .save ar.lc, r2
103 .body
C With HAVE_ABI_32 defined: addp4 on the three pointers and zxt4 on n.
105 ifdef(`HAVE_ABI_32',`
106 {.mmi; addp4 rp = 0, rp C M I
107 addp4 up = 0, up C M I
108 addp4 vp = 0, vp C M I
109 }{.mmi; nop 1
110 nop 1
111 zxt4 n = n C I
C ux = first u limb, v0 = first v limb; r2 keeps the incoming value of ar.lc.
115 {.mmi; ldf8 ux = [up], 8 C M
116 ldf8 v0 = [vp], 8 C M
117 mov r2 = ar.lc C I0
C r14 = n mod 4 picks the feed-in path below.
118 }{.mmi; nop 1 C M
119 and r14 = 3, n C M I
120 add n = -2, n C M I
C uy = second u limb, v1 = second v limb; n becomes (n - 2) / 4, the loop count.
122 }{.mmi; ldf8 uy = [up], 8 C M
123 ldf8 v1 = [vp] C M
124 shr.u n = n, 2 C I0
C p10, p11 and p12 flag n mod 4 equal to 1, 2 and 3 respectively.
125 }{.mmi; nop 1 C M
126 cmp.eq p10, p0 = 1, r14 C M I
127 cmp.eq p11, p0 = 2, r14 C M I
129 }{.mmi; nop 1 C M
130 cmp.eq p12, p0 = 3, r14 C M I
131 mov ar.lc = n C I0
C Remainder 0 takes none of these branches and falls through.
132 }{.bbb; (p10) br.dptk L(b01) C B
133 (p11) br.dptk L(b10) C B
134 (p12) br.dptk L(b11) C B
C Feed-in for n mod 4 = 0 (reached by falling through the dispatch branches).
C acc1_2, pr1_2 and pr0_3 are zeroed because the code joined at cj4 or 00
C reads them before any product has been moved into them.  cmp.ne of r0 with
C itself is false, so p8 and p12 are cleared and p9 and p13 are set.
137 ALIGN(32)
138 L(b00): ldf8 u_1 = [up], 8
139 mov acc1_2 = 0
140 mov pr1_2 = 0
141 mov pr0_3 = 0
142 cmp.ne p8, p9 = r0, r0
144 xma.l fp0b_3 = ux, v0, f0
145 cmp.ne p12, p13 = r0, r0
146 ldf8 u_2 = [up], 8
147 xma.hu fp1a_3 = ux, v0, f0
C Nonzero loop count: take the variant that keeps loading u limbs.
148 br.cloop.dptk L(gt4)
C Loop count zero (n = 4): no further loads, all four limbs are in registers.
150 xma.l fp0b_0 = uy, v0, f0
151 xma.hu fp1a_0 = uy, v0, f0
153 getfsig acc0 = fp0b_3
154 xma.l fp1b_3 = ux, v1, fp1a_3
155 xma.hu fp2a_3 = ux, v1, fp1a_3
157 xma.l fp0b_1 = u_1, v0, f0
158 xma.hu fp1a_1 = u_1, v0, f0
160 getfsig pr0_0 = fp0b_0
161 xma.l fp1b_0 = uy, v1, fp1a_0
162 xma.hu fp2a_0 = uy, v1, fp1a_0
164 getfsig pr1_3 = fp1b_3
165 getfsig acc1_3 = fp2a_3
166 xma.l fp0b_2 = u_2, v0, f0
167 xma.hu fp1a_2 = u_2, v0, f0
C Continue in the wind-down code at its cj4 entry.
168 br L(cj4)
C Longer operand: the same products with loads of u_3 and u_0 interleaved,
C then enter the main loop at its 00 entry.
170 L(gt4): xma.l fp0b_0 = uy, v0, f0
171 xma.hu fp1a_0 = uy, v0, f0
173 getfsig acc0 = fp0b_3
174 xma.l fp1b_3 = ux, v1, fp1a_3
175 ldf8 u_3 = [up], 8
176 xma.hu fp2a_3 = ux, v1, fp1a_3
178 xma.l fp0b_1 = u_1, v0, f0
179 xma.hu fp1a_1 = u_1, v0, f0
181 getfsig pr0_0 = fp0b_0
182 xma.l fp1b_0 = uy, v1, fp1a_0
183 xma.hu fp2a_0 = uy, v1, fp1a_0
185 ldf8 u_0 = [up], 8
186 getfsig pr1_3 = fp1b_3
187 xma.l fp0b_2 = u_2, v0, f0
189 getfsig acc1_3 = fp2a_3
190 xma.hu fp1a_2 = u_2, v0, f0
191 br L(00)
C Feed-in for n mod 4 = 1 (p10 set by the dispatch code).
C acc1_1, pr1_1 and pr0_2 are zeroed because the code joined at cj5 or 01
C reads them before any product has been moved into them.  The two cmp.ne of
C r0 with itself clear p6 and p10 and set p7 and p11.
194 ALIGN(32)
195 L(b01): ldf8 u_0 = [up], 8 C M
196 mov acc1_1 = 0 C M I
197 mov pr1_1 = 0 C M I
198 mov pr0_2 = 0 C M I
199 cmp.ne p6, p7 = r0, r0 C M I
201 xma.l fp0b_2 = ux, v0, f0 C F
202 cmp.ne p10, p11 = r0, r0 C M I
203 ldf8 u_1 = [up], 8 C M
204 xma.hu fp1a_2 = ux, v0, f0 C F
206 xma.l fp0b_3 = uy, v0, f0 C F
207 xma.hu fp1a_3 = uy, v0, f0 C F
209 getfsig acc0 = fp0b_2 C M
210 xma.l fp1b_2 = ux, v1,fp1a_2 C F
211 ldf8 u_2 = [up], 8 C M
212 xma.hu fp2a_2 = ux, v1,fp1a_2 C F
C Nonzero loop count: take the variant that loads one more u limb.
213 br.cloop.dptk L(gt5)
C Loop count zero (n = 5): all five limbs are already loaded.
215 xma.l fp0b_0 = u_0, v0, f0 C F
216 xma.hu fp1a_0 = u_0, v0, f0 C F
218 getfsig pr0_3 = fp0b_3 C M
219 xma.l fp1b_3 = uy, v1,fp1a_3 C F
220 xma.hu fp2a_3 = uy, v1,fp1a_3 C F
222 getfsig pr1_2 = fp1b_2 C M
223 getfsig acc1_2 = fp2a_2 C M
224 xma.l fp0b_1 = u_1, v0, f0 C F
225 xma.hu fp1a_1 = u_1, v0, f0 C F
C Continue in the wind-down code at its cj5 entry.
226 br L(cj5)
C Longer operand: the same products with a load of u_3 interleaved, then
C enter the main loop at its 01 entry.
228 L(gt5): xma.l fp0b_0 = u_0, v0, f0
229 xma.hu fp1a_0 = u_0, v0, f0
231 getfsig pr0_3 = fp0b_3
232 xma.l fp1b_3 = uy, v1, fp1a_3
233 xma.hu fp2a_3 = uy, v1, fp1a_3
235 ldf8 u_3 = [up], 8
236 getfsig pr1_2 = fp1b_2
237 xma.l fp0b_1 = u_1, v0, f0
239 getfsig acc1_2 = fp2a_2
240 xma.hu fp1a_1 = u_1, v0, f0
241 br L(01)
C Feed-in for n mod 4 = 2 (p11 set by the dispatch code).
C With a zero loop count (n = 2) the whole product is computed right here
C and the routine returns without touching the main loop or the wind-down.
244 ALIGN(32)
245 L(b10): br.cloop.dptk L(gt2)
246 xma.l fp0b_1 = ux, v0, f0
247 xma.hu fp1a_1 = ux, v0, f0
249 xma.l fp0b_2 = uy, v0, f0
250 xma.hu fp1a_2 = uy, v0, f0
C The lowest result limb goes to memory straight from the FP register.
252 stf8 [rp] = fp0b_1, 8
253 xma.l fp1b_1 = ux, v1, fp1a_1
254 xma.hu fp2a_1 = ux, v1, fp1a_1
256 getfsig acc0 = fp0b_2
257 xma.l fp1b_2 = uy, v1, fp1a_2
258 xma.hu fp2a_2 = uy, v1, fp1a_2
260 getfsig pr1_1 = fp1b_1
261 getfsig acc1_1 = fp2a_1
C Restore the saved loop counter before returning.
262 mov ar.lc = r2
263 getfsig pr1_2 = fp1b_2
C r8 starts as the top product half; a carry may be added to it below.
264 getfsig r8 = fp2a_2
C Second result limb; p8 is set when the addition wrapped around.
266 add s0 = pr1_1, acc0
268 st8 [rp] = s0, 8
269 cmp.ltu p8, p9 = s0, pr1_1
C r31 = -1 - acc1_1, used to detect a carry out of pr1_2 + acc1_1 (+ 1).
270 sub r31 = -1, acc1_1
C Third result limb, with or without the carry-in; p10 flags the carry-out
C (leu when a carry-in was added, ltu otherwise).
272 .pred.rel "mutex", p8, p9
273 (p8) add acc0 = pr1_2, acc1_1, 1
274 (p9) add acc0 = pr1_2, acc1_1
275 (p8) cmp.leu p10, p0 = r31, pr1_2
276 (p9) cmp.ltu p10, p0 = r31, pr1_2
278 st8 [rp] = acc0, 8
279 (p10) add r8 = 1, r8
280 br.ret.sptk.many b0
C Longer operand: zero acc1_0, pr1_0 and pr0_1 (read by the loop code at its
C 10 entry before any product has been moved into them), start the products
C with loads of u_3, u_0, u_1 and u_2 interleaved, clear p8 and p12, and
C enter the main loop at its 10 entry.
282 L(gt2): ldf8 u_3 = [up], 8
283 mov acc1_0 = 0
284 mov pr1_0 = 0
286 mov pr0_1 = 0
287 xma.l fp0b_1 = ux, v0, f0
288 ldf8 u_0 = [up], 8
289 xma.hu fp1a_1 = ux, v0, f0
291 xma.l fp0b_2 = uy, v0, f0
292 xma.hu fp1a_2 = uy, v0, f0
294 getfsig acc0 = fp0b_1
295 xma.l fp1b_1 = ux, v1, fp1a_1
296 xma.hu fp2a_1 = ux, v1, fp1a_1
298 ldf8 u_1 = [up], 8
299 xma.l fp0b_3 = u_3, v0, f0
300 xma.hu fp1a_3 = u_3, v0, f0
302 getfsig pr0_2 = fp0b_2
303 xma.l fp1b_2 = uy, v1, fp1a_2
304 xma.hu fp2a_2 = uy, v1, fp1a_2
306 ldf8 u_2 = [up], 8
307 getfsig pr1_1 = fp1b_1
309 {.mfi; getfsig acc1_1 = fp2a_1
310 xma.l fp0b_0 = u_0, v0, f0
311 cmp.ne p8, p9 = r0, r0
312 }{.mfb; cmp.ne p12, p13 = r0, r0
313 xma.hu fp1a_0 = u_0, v0, f0
314 br L(10)
C Feed-in for n mod 4 = 3 (p12 set by the dispatch code).
C acc1_3, pr1_3 and pr0_0 are zeroed because the code joined at cj3 or 11
C reads them before any product has been moved into them.  The cmp.ne of r0
C with itself clear p6 and p10 and set p7 and p11.
317 ALIGN(32)
318 L(b11): mov acc1_3 = 0
319 mov pr1_3 = 0
320 mov pr0_0 = 0
321 ldf8 u_2 = [up], 8
322 cmp.ne p6, p7 = r0, r0
C Nonzero loop count: take the variant that keeps loading u limbs.
323 br.cloop.dptk L(gt3)
C Loop count zero (n = 3): all three limbs are already loaded.
325 xma.l fp0b_0 = ux, v0, f0
326 xma.hu fp1a_0 = ux, v0, f0
328 cmp.ne p10, p11 = r0, r0
329 xma.l fp0b_1 = uy, v0, f0
330 xma.hu fp1a_1 = uy, v0, f0
332 getfsig acc0 = fp0b_0
333 xma.l fp1b_0 = ux, v1, fp1a_0
334 xma.hu fp2a_0 = ux, v1, fp1a_0
336 xma.l fp0b_2 = u_2, v0, f0
337 xma.hu fp1a_2 = u_2, v0, f0
339 getfsig pr0_1 = fp0b_1
340 xma.l fp1b_1 = uy, v1, fp1a_1
341 xma.hu fp2a_1 = uy, v1, fp1a_1
343 getfsig pr1_0 = fp1b_0
344 getfsig acc1_0 = fp2a_0
C Continue in the wind-down code at its cj3 entry.
345 br L(cj3)
C Longer operand: the same products with loads of u_3, u_0 and u_1
C interleaved, then enter the main loop at its 11 entry.
347 L(gt3): xma.l fp0b_0 = ux, v0, f0
348 cmp.ne p10, p11 = r0, r0
349 ldf8 u_3 = [up], 8
350 xma.hu fp1a_0 = ux, v0, f0
352 xma.l fp0b_1 = uy, v0, f0
353 xma.hu fp1a_1 = uy, v0, f0
355 getfsig acc0 = fp0b_0
356 xma.l fp1b_0 = ux, v1, fp1a_0
357 ldf8 u_0 = [up], 8
358 xma.hu fp2a_0 = ux, v1, fp1a_0
360 xma.l fp0b_2 = u_2, v0, f0
361 xma.hu fp1a_2 = u_2, v0, f0
363 getfsig pr0_1 = fp0b_1
364 xma.l fp1b_1 = uy, v1, fp1a_1
365 xma.hu fp2a_1 = uy, v1, fp1a_1
367 ldf8 u_1 = [up], 8
368 getfsig pr1_0 = fp1b_0
370 getfsig acc1_0 = fp2a_0
371 xma.l fp0b_3 = u_3, v0, f0
372 xma.hu fp1a_3 = u_3, v0, f0
373 br L(11)
376 C *** MAIN LOOP START ***
C One pass loads four u limbs and stores four result limbs to [rp].
C The labels 01, 00, 11 and 10 are the entry points used by the feed-in code.
C The numbers after C on the stop lines count the instruction groups of a pass.
C Each quarter of the pass repeats the same pattern on rotated registers:
C   acc0 = pr0_j + acc1_k (+ 1 on carry)   then   s0 = pr1_m + acc0 (+ 1 on carry)
C Carries of the acc0 additions alternate between p6/p7 and p8/p9, carries of
C the s0 additions between p10/p11 and p12/p13.  The carry-out test is leu
C when a carry-in was added and ltu when it was not.
377 ALIGN(32)
378 L(top): C 00
379 .pred.rel "mutex", p8, p9
380 .pred.rel "mutex", p12, p13
381 ldf8 u_3 = [up], 8
382 getfsig pr1_2 = fp1b_2
383 (p8) cmp.leu p6, p7 = acc0, pr0_1
384 (p9) cmp.ltu p6, p7 = acc0, pr0_1
385 (p12) cmp.leu p10, p11 = s0, pr1_0
386 (p13) cmp.ltu p10, p11 = s0, pr1_0
387 ;; C 01
388 .pred.rel "mutex", p6, p7
389 getfsig acc1_2 = fp2a_2
390 st8 [rp] = s0, 8
391 xma.l fp0b_1 = u_1, v0, f0
392 (p6) add acc0 = pr0_2, acc1_0, 1
393 (p7) add acc0 = pr0_2, acc1_0
394 xma.hu fp1a_1 = u_1, v0, f0
395 ;; C 02
C Entry point from the n mod 4 = 1 feed-in.
396 L(01):
397 .pred.rel "mutex", p10, p11
398 getfsig pr0_0 = fp0b_0
399 xma.l fp1b_0 = u_0, v1, fp1a_0
400 (p10) add s0 = pr1_1, acc0, 1
401 (p11) add s0 = pr1_1, acc0
402 xma.hu fp2a_0 = u_0, v1, fp1a_0
403 nop 1
404 ;; C 03
405 .pred.rel "mutex", p6, p7
406 .pred.rel "mutex", p10, p11
407 ldf8 u_0 = [up], 8
408 getfsig pr1_3 = fp1b_3
409 (p6) cmp.leu p8, p9 = acc0, pr0_2
410 (p7) cmp.ltu p8, p9 = acc0, pr0_2
411 (p10) cmp.leu p12, p13 = s0, pr1_1
412 (p11) cmp.ltu p12, p13 = s0, pr1_1
413 ;; C 04
414 .pred.rel "mutex", p8, p9
415 getfsig acc1_3 = fp2a_3
416 st8 [rp] = s0, 8
417 xma.l fp0b_2 = u_2, v0, f0
418 (p8) add acc0 = pr0_3, acc1_1, 1
419 (p9) add acc0 = pr0_3, acc1_1
420 xma.hu fp1a_2 = u_2, v0, f0
421 ;; C 05
C Entry point from the n mod 4 = 0 feed-in.
422 L(00):
423 .pred.rel "mutex", p12, p13
424 getfsig pr0_1 = fp0b_1
425 xma.l fp1b_1 = u_1, v1, fp1a_1
426 (p12) add s0 = pr1_2, acc0, 1
427 (p13) add s0 = pr1_2, acc0
428 xma.hu fp2a_1 = u_1, v1, fp1a_1
429 nop 1
430 ;; C 06
431 .pred.rel "mutex", p8, p9
432 .pred.rel "mutex", p12, p13
433 ldf8 u_1 = [up], 8
434 getfsig pr1_0 = fp1b_0
435 (p8) cmp.leu p6, p7 = acc0, pr0_3
436 (p9) cmp.ltu p6, p7 = acc0, pr0_3
437 (p12) cmp.leu p10, p11 = s0, pr1_2
438 (p13) cmp.ltu p10, p11 = s0, pr1_2
439 ;; C 07
440 .pred.rel "mutex", p6, p7
441 getfsig acc1_0 = fp2a_0
442 st8 [rp] = s0, 8
443 xma.l fp0b_3 = u_3, v0, f0
444 (p6) add acc0 = pr0_0, acc1_2, 1
445 (p7) add acc0 = pr0_0, acc1_2
446 xma.hu fp1a_3 = u_3, v0, f0
447 ;; C 08
C Entry point from the n mod 4 = 3 feed-in.
448 L(11):
449 .pred.rel "mutex", p10, p11
450 getfsig pr0_2 = fp0b_2
451 xma.l fp1b_2 = u_2, v1, fp1a_2
452 (p10) add s0 = pr1_3, acc0, 1
453 (p11) add s0 = pr1_3, acc0
454 xma.hu fp2a_2 = u_2, v1, fp1a_2
455 nop 1
456 ;; C 09
457 .pred.rel "mutex", p6, p7
458 .pred.rel "mutex", p10, p11
459 ldf8 u_2 = [up], 8
460 getfsig pr1_1 = fp1b_1
461 (p6) cmp.leu p8, p9 = acc0, pr0_0
462 (p7) cmp.ltu p8, p9 = acc0, pr0_0
463 (p10) cmp.leu p12, p13 = s0, pr1_3
464 (p11) cmp.ltu p12, p13 = s0, pr1_3
465 ;; C 10
466 .pred.rel "mutex", p8, p9
467 getfsig acc1_1 = fp2a_1
468 st8 [rp] = s0, 8
469 xma.l fp0b_0 = u_0, v0, f0
470 (p8) add acc0 = pr0_1, acc1_3, 1
471 (p9) add acc0 = pr0_1, acc1_3
472 xma.hu fp1a_0 = u_0, v0, f0
473 ;; C 11
C Entry point from the n mod 4 = 2 feed-in.
474 L(10):
475 .pred.rel "mutex", p12, p13
476 getfsig pr0_3 = fp0b_3
477 xma.l fp1b_3 = u_3, v1, fp1a_3
478 (p12) add s0 = pr1_0, acc0, 1
479 (p13) add s0 = pr1_0, acc0
480 xma.hu fp2a_3 = u_3, v1, fp1a_3
C Counted loop branch on ar.lc; falls into the wind-down code when done.
481 br.cloop.dptk L(top)
483 C *** MAIN LOOP END ***
C Wind-down: finish the products and carry chains still in flight after the
C last loop pass.  No u limbs are loaded from here on.  The labels cj5, cj4
C and cj3 are entry points for the feed-in paths whose loop count was zero.
485 .pred.rel "mutex", p8, p9
486 .pred.rel "mutex", p12, p13
487 {.mmi; getfsig pr1_2 = fp1b_2
488 st8 [rp] = s0, 8
489 (p8) cmp.leu p6, p7 = acc0, pr0_1
490 }{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1
491 (p12) cmp.leu p10, p11 = s0, pr1_0
492 (p13) cmp.ltu p10, p11 = s0, pr1_0
494 } .pred.rel "mutex", p6, p7
495 {.mfi; getfsig acc1_2 = fp2a_2
496 xma.l fp0b_1 = u_1, v0, f0
497 nop 1
498 }{.mmf; (p6) add acc0 = pr0_2, acc1_0, 1
499 (p7) add acc0 = pr0_2, acc1_0
500 xma.hu fp1a_1 = u_1, v0, f0
C Entry point from the n = 5 feed-in.
503 L(cj5):
504 .pred.rel "mutex", p10, p11
505 {.mfi; getfsig pr0_0 = fp0b_0
506 xma.l fp1b_0 = u_0, v1, fp1a_0
507 (p10) add s0 = pr1_1, acc0, 1
508 }{.mfi; (p11) add s0 = pr1_1, acc0
509 xma.hu fp2a_0 = u_0, v1, fp1a_0
510 nop 1
512 } .pred.rel "mutex", p6, p7
513 .pred.rel "mutex", p10, p11
514 {.mmi; getfsig pr1_3 = fp1b_3
515 st8 [rp] = s0, 8
516 (p6) cmp.leu p8, p9 = acc0, pr0_2
517 }{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2
518 (p10) cmp.leu p12, p13 = s0, pr1_1
519 (p11) cmp.ltu p12, p13 = s0, pr1_1
521 } .pred.rel "mutex", p8, p9
522 {.mfi; getfsig acc1_3 = fp2a_3
523 xma.l fp0b_2 = u_2, v0, f0
524 nop 1
525 }{.mmf; (p8) add acc0 = pr0_3, acc1_1, 1
526 (p9) add acc0 = pr0_3, acc1_1
527 xma.hu fp1a_2 = u_2, v0, f0
C Entry point from the n = 4 feed-in.
530 L(cj4):
531 .pred.rel "mutex", p12, p13
532 {.mfi; getfsig pr0_1 = fp0b_1
533 xma.l fp1b_1 = u_1, v1, fp1a_1
534 (p12) add s0 = pr1_2, acc0, 1
535 }{.mfi; (p13) add s0 = pr1_2, acc0
536 xma.hu fp2a_1 = u_1, v1, fp1a_1
537 nop 1
539 } .pred.rel "mutex", p8, p9
540 .pred.rel "mutex", p12, p13
541 {.mmi; getfsig pr1_0 = fp1b_0
542 st8 [rp] = s0, 8
543 (p8) cmp.leu p6, p7 = acc0, pr0_3
544 }{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_3
545 (p12) cmp.leu p10, p11 = s0, pr1_2
546 (p13) cmp.ltu p10, p11 = s0, pr1_2
548 } .pred.rel "mutex", p6, p7
549 {.mmi; getfsig acc1_0 = fp2a_0
550 (p6) add acc0 = pr0_0, acc1_2, 1
551 (p7) add acc0 = pr0_0, acc1_2
C Entry point from the n = 3 feed-in.  The last two products are started here.
554 L(cj3):
555 .pred.rel "mutex", p10, p11
556 {.mfi; getfsig pr0_2 = fp0b_2
557 xma.l fp1b_2 = u_2, v1, fp1a_2
558 (p10) add s0 = pr1_3, acc0, 1
559 }{.mfi; (p11) add s0 = pr1_3, acc0
560 xma.hu fp2a_2 = u_2, v1, fp1a_2
561 nop 1
563 } .pred.rel "mutex", p6, p7
564 .pred.rel "mutex", p10, p11
565 {.mmi; getfsig pr1_1 = fp1b_1
566 st8 [rp] = s0, 8
567 (p6) cmp.leu p8, p9 = acc0, pr0_0
568 }{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_0
569 (p10) cmp.leu p12, p13 = s0, pr1_3
570 (p11) cmp.ltu p12, p13 = s0, pr1_3
572 } .pred.rel "mutex", p8, p9
573 {.mmi; getfsig acc1_1 = fp2a_1
574 (p8) add acc0 = pr0_1, acc1_3, 1
575 (p9) add acc0 = pr0_1, acc1_3
577 } .pred.rel "mutex", p12, p13
578 {.mmi; (p12) add s0 = pr1_0, acc0, 1
579 (p13) add s0 = pr1_0, acc0
580 nop 1
582 } .pred.rel "mutex", p8, p9
583 .pred.rel "mutex", p12, p13
584 {.mmi; getfsig pr1_2 = fp1b_2
585 st8 [rp] = s0, 8
586 (p8) cmp.leu p6, p7 = acc0, pr0_1
587 }{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1
588 (p12) cmp.leu p10, p11 = s0, pr1_0
589 (p13) cmp.ltu p10, p11 = s0, pr1_0
C r8 starts as the top product half (fp2a_2); carries are added to it below.
591 } .pred.rel "mutex", p6, p7
592 {.mmi; getfsig r8 = fp2a_2
593 (p6) add acc0 = pr0_2, acc1_0, 1
594 (p7) add acc0 = pr0_2, acc1_0
596 } .pred.rel "mutex", p10, p11
597 {.mmi; (p10) add s0 = pr1_1, acc0, 1
598 (p11) add s0 = pr1_1, acc0
599 (p6) cmp.leu p8, p9 = acc0, pr0_2
601 } .pred.rel "mutex", p10, p11
602 {.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2
603 (p10) cmp.leu p12, p13 = s0, pr1_1
604 (p11) cmp.ltu p12, p13 = s0, pr1_1
C Last limb written to [rp]: pr1_2 + acc1_1 plus the carry in p8, then plus
C the carry in p12.  p10 collects a carry-out from either addition.
606 } .pred.rel "mutex", p8, p9
607 {.mmi; st8 [rp] = s0, 8
608 (p8) add acc0 = pr1_2, acc1_1, 1
609 (p9) add acc0 = pr1_2, acc1_1
611 } .pred.rel "mutex", p8, p9
612 {.mmi; (p8) cmp.leu p10, p11 = acc0, pr1_2
613 (p9) cmp.ltu p10, p11 = acc0, pr1_2
614 (p12) add acc0 = 1, acc0
C If adding the p12 carry made acc0 wrap to zero, that is a carry-out as well.
616 }{.mmi; st8 [rp] = acc0, 8
617 (p12) cmpeqor p10, p0 = 0, acc0
618 nop 1
C Fold the final carry into r8, restore ar.lc from r2 and return.
620 }{.mib; (p10) add r8 = 1, r8
621 mov ar.lc = r2
622 br.ret.sptk.many b0
624 EPILOGUE()
625 ASM_END()