beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / ia64 / addmul_2.asm
blob86e8de4051a1e972364bc64b4984a2d293eaca3c
1 dnl IA-64 mpn_addmul_2 -- Multiply an n-limb number with a 2-limb number and
2 dnl add the result to an (n+1)-limb number.
4 dnl Contributed to the GNU project by Torbjorn Granlund.
6 dnl Copyright 2004, 2005, 2011 Free Software Foundation, Inc.
8 dnl This file is part of the GNU MP Library.
9 dnl
10 dnl The GNU MP Library is free software; you can redistribute it and/or modify
11 dnl it under the terms of either:
12 dnl
13 dnl * the GNU Lesser General Public License as published by the Free
14 dnl Software Foundation; either version 3 of the License, or (at your
15 dnl option) any later version.
16 dnl
17 dnl or
18 dnl
19 dnl * the GNU General Public License as published by the Free Software
20 dnl Foundation; either version 2 of the License, or (at your option) any
21 dnl later version.
22 dnl
23 dnl or both in parallel, as here.
24 dnl
25 dnl The GNU MP Library is distributed in the hope that it will be useful, but
26 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
28 dnl for more details.
29 dnl
30 dnl You should have received copies of the GNU General Public License and the
31 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
32 dnl see https://www.gnu.org/licenses/.
34 include(`../config.m4')
36 C cycles/limb
37 C Itanium: 3.65
38 C Itanium 2: 1.625
40 C TODO
41 C * Clean up variable names, and try to decrease the number of distinct
42 C registers used.
43 C * Clean up feed-in code to not require zeroing several registers.
44 C * Make sure we don't depend on uninitialised predicate registers.
45 C * Could perhaps save a few cycles by using 1 c/l carry propagation in
46 C wind-down code.
47 C * Ultimately rewrite. The problem with this code is that it first uses a
48 C loaded u value in one xma pair, then leaves it live over several unrelated
49 C xma pairs, before it uses it again. It should actually be quite possible
50 C to just swap some aligned xma pairs around. But we should then schedule
51 C u loads further from the first use.
53 C INPUT PARAMETERS
C Names used in this file for the four incoming argument registers.
54 define(`rp',`r32')
55 define(`up',`r33')
56 define(`n',`r34')
57 define(`vp',`r35')
C srp is a second pointer into the rp operand.  The entry code sets it to
C rp + 16 and the r_N limbs are loaded through it; stores go through rp.
59 define(`srp',`r3')
C v0 and v1 receive the two limbs loaded through vp.
61 define(`v0',`f6')
62 define(`v1',`f7')
C s0 receives pr1_k + acc0 plus carry and is the value the loop stores.
C s0 is r14, which the entry code also uses literally for n & 3.
C acc0 is the running accumulator, pr0_k + acc1_k plus carry.
64 define(`s0',`r14')
65 define(`acc0',`r15')
C Integer registers filled by getfsig from the xma results, four rotating
C copies of each:  pr0_k from fp0b_k, pr1_k from fp1b_k, acc1_k from fp2a_k.
67 define(`pr0_0',`r16') define(`pr0_1',`r17')
68 define(`pr0_2',`r18') define(`pr0_3',`r19')
70 define(`pr1_0',`r20') define(`pr1_1',`r21')
71 define(`pr1_2',`r22') define(`pr1_3',`r23')
73 define(`acc1_0',`r24') define(`acc1_1',`r25')
74 define(`acc1_2',`r26') define(`acc1_3',`r27')
C r28..r31 have no alias.  r31 is used directly as a scratch register in
C the L(b10) short path.
76 dnl define(`',`r28')
77 dnl define(`',`r29')
78 dnl define(`',`r30')
79 dnl define(`',`r31')
C Floating-point registers holding xma results.  In the main loop:
C   fp0b_k = xma.l  of u_k * v0 + r_k
C   fp1a_k = xma.hu of u_k * v0 + r_k
C   fp1b_k = xma.l  of u_k * v1 + fp1a_k
C   fp2a_k = xma.hu of u_k * v1 + fp1a_k
81 define(`fp0b_0',`f8') define(`fp0b_1',`f9')
82 define(`fp0b_2',`f10') define(`fp0b_3',`f11')
84 define(`fp1a_0',`f12') define(`fp1a_1',`f13')
85 define(`fp1a_2',`f14') define(`fp1a_3',`f15')
87 define(`fp1b_0',`f32') define(`fp1b_1',`f33')
88 define(`fp1b_2',`f34') define(`fp1b_3',`f35')
90 define(`fp2a_0',`f36') define(`fp2a_1',`f37')
91 define(`fp2a_2',`f38') define(`fp2a_3',`f39')
C r_k are limbs loaded through srp, u_k are limbs loaded through up.
93 define(`r_0',`f40') define(`r_1',`f41')
94 define(`r_2',`f42') define(`r_3',`f43')
96 define(`u_0',`f44') define(`u_1',`f45')
97 define(`u_2',`f46') define(`u_3',`f47')
C rx, ry and ux, uy hold the first two limbs loaded through rp and up by
C the entry code.
99 define(`rx',`f48')
100 define(`ux',`f49')
101 define(`ry',`f50')
102 define(`uy',`f51')
104 ASM_START()
C mpn_addmul_2s -- second entry point that shares the body of mpn_addmul_2.
C The setup below mirrors the mpn_addmul_2 entry, but the dispatch goes
C through the L(x00)..L(x11) stubs.  Each stub clears the predicate that
C guards the first ux * v1 xma pair on its path, zeroes the matching fp2a
C register, and branches into the shared feed-in code.
105 PROLOGUE(mpn_addmul_2s)
106 .prologue
107 .save ar.lc, r2
108 .body
C 32-bit ABI only: addp4 the three pointer arguments and zxt4 the count.
C NOTE(review): the close of this ifdef is not visible in this copy of the
C file -- confirm against upstream.
110 ifdef(`HAVE_ABI_32',`
111 {.mmi; addp4 rp = 0, rp C M I
112 addp4 up = 0, up C M I
113 addp4 vp = 0, vp C M I
114 }{.mmi; nop 1
115 nop 1
116 zxt4 n = n C I
C Load ux and uy through up, v0 and v1 through vp, rx and ry through rp.
C The second rp load post-increments by -8, undoing the first +8.
C ar.lc is saved in r2.  r14 = n & 3 selects the feed-in path and
C n = (n - 2) >> 2 becomes the loop count in ar.lc.
120 {.mmi; ldf8 ux = [up], 8 C M
121 ldf8 v0 = [vp], 8 C M
122 mov r2 = ar.lc C I0
123 }{.mmi; ldf8 rx = [rp], 8 C M
124 and r14 = 3, n C M I
125 add n = -2, n C M I
127 }{.mmi; ldf8 uy = [up], 8 C M
128 ldf8 v1 = [vp] C M
129 shr.u n = n, 2 C I0
130 }{.mmi; ldf8 ry = [rp], -8 C M
131 cmp.eq p14, p0 = 1, r14 C M I
132 cmp.eq p11, p0 = 2, r14 C M I
C srp = rp + 16.  p14, p11, p15 are set for r14 = 1, 2, 3 respectively;
C r14 = 0 falls through to L(x00).
134 }{.mmi; add srp = 16, rp C M I
135 cmp.eq p15, p0 = 3, r14 C M I
136 mov ar.lc = n C I0
137 }{.bbb; (p14) br.dptk L(x01) C B
138 (p11) br.dptk L(x10) C B
139 (p15) br.dptk L(x11) C B
C Stubs.  cmp.ne of r0 against itself is never true, so the named predicate
C is cleared.  That predicate guards the first ux * v1 xma pair in the
C corresponding L(bNN) code, so the pair is skipped and fp2a_k stays zero.
142 L(x00): cmp.ne p6, p0 = r0, r0 C suppress initial xma pair
143 mov fp2a_3 = f0
144 br L(b00)
145 L(x01): cmp.ne p14, p0 = r0, r0 C suppress initial xma pair
146 mov fp2a_2 = f0
147 br L(b01)
148 L(x10): cmp.ne p11, p0 = r0, r0 C suppress initial xma pair
149 mov fp2a_1 = f0
150 br L(b10)
151 L(x11): cmp.ne p15, p0 = r0, r0 C suppress initial xma pair
152 mov fp2a_0 = f0
153 br L(b11)
155 EPILOGUE()
C mpn_addmul_2 -- main entry point.  Loads the first limbs, derives the
C loop count and dispatches on n & 3 straight into the feed-in code at
C L(b00), L(b01), L(b10) or L(b11).
157 PROLOGUE(mpn_addmul_2)
158 .prologue
159 .save ar.lc, r2
160 .body
C 32-bit ABI only: addp4 the three pointer arguments and zxt4 the count.
C NOTE(review): the close of this ifdef is not visible in this copy of the
C file -- confirm against upstream.
162 ifdef(`HAVE_ABI_32',`
163 {.mmi; addp4 rp = 0, rp C M I
164 addp4 up = 0, up C M I
165 addp4 vp = 0, vp C M I
166 }{.mmi; nop 1
167 nop 1
168 zxt4 n = n C I
C Load ux and uy through up, v0 and v1 through vp, rx and ry through rp.
C The second rp load post-increments by -8, undoing the first +8.
C ar.lc is saved in r2.  r14 = n & 3 selects the feed-in path and
C n = (n - 2) >> 2 becomes the loop count in ar.lc.
172 {.mmi; ldf8 ux = [up], 8 C M
173 ldf8 v0 = [vp], 8 C M
174 mov r2 = ar.lc C I0
175 }{.mmi; ldf8 rx = [rp], 8 C M
176 and r14 = 3, n C M I
177 add n = -2, n C M I
179 }{.mmi; ldf8 uy = [up], 8 C M
180 ldf8 v1 = [vp] C M
181 shr.u n = n, 2 C I0
182 }{.mmi; ldf8 ry = [rp], -8 C M
183 cmp.eq p14, p0 = 1, r14 C M I
184 cmp.eq p11, p0 = 2, r14 C M I
C srp = rp + 16.  Unlike the mpn_addmul_2s entry, the compare against 3
C also writes p6 as the complement of p15, so p6 is set whenever the code
C falls through to L(b00).  p6, p14, p11 and p15 guard the first ux * v1
C xma pair on their respective paths, so here that pair is executed.
186 }{.mmi; add srp = 16, rp C M I
187 cmp.eq p15, p6 = 3, r14 C M I
188 mov ar.lc = n C I0
189 }{.bbb; (p14) br.dptk L(b01) C B
190 (p11) br.dptk L(b10) C B
191 (p15) br.dptk L(b11) C B
C L(b00) -- feed-in for n & 3 = 0, reached by fall-through from either
C entry.  Primes the software pipeline and joins the unrolled loop at
C L(00), or the wind-down code at L(cj4) when the loop count is zero.
194 ALIGN(32)
195 L(b00):
C Start loading the next limbs.  acc1_2, pr1_2 and pr0_3 are zeroed
C because the carry chain at L(00) and L(cj4) reads them before any real
C value has been produced.  p8 and p12 are cleared, p9 and p13 set.
196 {.mmi; ldf8 r_1 = [srp], 8
197 ldf8 u_1 = [up], 8
198 mov acc1_2 = 0
199 }{.mmi; mov pr1_2 = 0
200 mov pr0_3 = 0
201 cmp.ne p8, p9 = r0, r0
C First product ux * v0 + rx.  Its high part goes to fp1b_3, which is
C then used as the addend of the guarded ux * v1 pair below.
203 }{.mfi; ldf8 r_2 = [srp], 8
204 xma.l fp0b_3 = ux, v0, rx
205 cmp.ne p12, p13 = r0, r0
206 }{.mfb; ldf8 u_2 = [up], 8
207 xma.hu fp1b_3 = ux, v0, rx
208 br.cloop.dptk L(gt4)
C Short path, taken when br.cloop falls through.  No further loads are
C issued; the remaining products are started and control joins the
C wind-down code at L(cj4).
210 xma.l fp0b_0 = uy, v0, ry
211 xma.hu fp1a_0 = uy, v0, ry
213 getfsig acc0 = fp0b_3
C The p6 pair is skipped for mpn_addmul_2s, where L(x00) cleared p6 and
C zeroed fp2a_3.  In that case fp1b_3 keeps the xma.hu result from above.
214 (p6) xma.hu fp2a_3 = ux, v1, fp1b_3 C suppressed for addmul_2s
215 (p6) xma.l fp1b_3 = ux, v1, fp1b_3 C suppressed for addmul_2s
217 xma.l fp0b_1 = u_1, v0, r_1
218 xma.hu fp1a_1 = u_1, v0, r_1
220 getfsig pr0_0 = fp0b_0
221 xma.l fp1b_0 = uy, v1, fp1a_0
222 xma.hu fp2a_0 = uy, v1, fp1a_0
224 getfsig pr1_3 = fp1b_3
225 getfsig acc1_3 = fp2a_3
226 xma.l fp0b_2 = u_2, v0, r_2
227 xma.hu fp1a_2 = u_2, v0, r_2
228 br L(cj4)
C Long path.  Same arithmetic as the short path, interleaved with the
C loads of r_3, u_3, r_0 and u_0 that the loop body expects to be in
C flight when entered at L(00).
230 L(gt4): xma.l fp0b_0 = uy, v0, ry
231 xma.hu fp1a_0 = uy, v0, ry
233 ldf8 r_3 = [srp], 8
234 getfsig acc0 = fp0b_3
235 (p6) xma.hu fp2a_3 = ux, v1, fp1b_3 C suppressed for addmul_2s
236 ldf8 u_3 = [up], 8
237 (p6) xma.l fp1b_3 = ux, v1, fp1b_3 C suppressed for addmul_2s
239 xma.l fp0b_1 = u_1, v0, r_1
240 xma.hu fp1a_1 = u_1, v0, r_1
242 ldf8 r_0 = [srp], 8
243 getfsig pr0_0 = fp0b_0
244 xma.l fp1b_0 = uy, v1, fp1a_0
245 xma.hu fp2a_0 = uy, v1, fp1a_0
247 ldf8 u_0 = [up], 8
248 getfsig pr1_3 = fp1b_3
249 xma.l fp0b_2 = u_2, v0, r_2
251 getfsig acc1_3 = fp2a_3
252 xma.hu fp1a_2 = u_2, v0, r_2
253 br L(00)
C L(b01) -- feed-in for n & 3 = 1.  Primes the software pipeline and joins
C the unrolled loop at L(01), or the wind-down code at L(cj5) when the
C loop count is zero.
256 ALIGN(32)
257 L(b01):
C Start loading the next limbs.  acc1_1, pr1_1 and pr0_2 are zeroed
C because the carry chain at L(01) and L(cj5) reads them before any real
C value has been produced.  p6 and p10 are cleared, p7 and p11 set.
258 {.mmi; ldf8 r_0 = [srp], 8 C M
259 ldf8 u_0 = [up], 8 C M
260 mov acc1_1 = 0 C M I
261 }{.mmi; mov pr1_1 = 0 C M I
262 mov pr0_2 = 0 C M I
263 cmp.ne p6, p7 = r0, r0 C M I
C First product ux * v0 + rx.  Its high part goes to fp1b_2, which is
C then used as the addend of the guarded ux * v1 pair below.
265 }{.mfi; ldf8 r_1 = [srp], 8 C M
266 xma.l fp0b_2 = ux, v0, rx C F
267 cmp.ne p10, p11 = r0, r0 C M I
268 }{.mfi; ldf8 u_1 = [up], 8 C M
269 xma.hu fp1b_2 = ux, v0, rx C F
270 nop 1
272 } xma.l fp0b_3 = uy, v0, ry C F
273 xma.hu fp1a_3 = uy, v0, ry C F
C The p14 pair is skipped for mpn_addmul_2s, where L(x01) cleared p14 and
C zeroed fp2a_2.  In that case fp1b_2 keeps the xma.hu result from above.
275 {.mmf; getfsig acc0 = fp0b_2 C M
276 ldf8 r_2 = [srp], 8 C M
277 (p14) xma.hu fp2a_2 = ux, v1,fp1b_2 C F suppressed for addmul_2s
278 }{.mfb; ldf8 u_2 = [up], 8 C M
279 (p14) xma.l fp1b_2 = ux, v1,fp1b_2 C F suppressed for addmul_2s
280 br.cloop.dptk L(gt5)
C Short path, taken when br.cloop falls through.  No further loads are
C issued; control joins the wind-down code at L(cj5).
282 xma.l fp0b_0 = u_0, v0, r_0 C F
283 xma.hu fp1a_0 = u_0, v0, r_0 C F
285 getfsig pr0_3 = fp0b_3 C M
286 xma.l fp1b_3 = uy, v1,fp1a_3 C F
287 xma.hu fp2a_3 = uy, v1,fp1a_3 C F
289 getfsig pr1_2 = fp1b_2 C M
290 getfsig acc1_2 = fp2a_2 C M
291 xma.l fp0b_1 = u_1, v0, r_1 C F
292 xma.hu fp1a_1 = u_1, v0, r_1 C F
293 br L(cj5)
C Long path.  Same arithmetic as the short path, interleaved with the
C loads of r_3 and u_3 that the loop body expects to be in flight when
C entered at L(01).
295 L(gt5): xma.l fp0b_0 = u_0, v0, r_0
296 xma.hu fp1a_0 = u_0, v0, r_0
298 getfsig pr0_3 = fp0b_3
299 ldf8 r_3 = [srp], 8
300 xma.l fp1b_3 = uy, v1, fp1a_3
301 xma.hu fp2a_3 = uy, v1, fp1a_3
303 ldf8 u_3 = [up], 8
304 getfsig pr1_2 = fp1b_2
305 xma.l fp0b_1 = u_1, v0, r_1
307 getfsig acc1_2 = fp2a_2
308 xma.hu fp1a_1 = u_1, v0, r_1
309 br L(01)
C L(b10) -- feed-in for n & 3 = 2.  When the loop count is zero the whole
C operation is finished right here and the function returns.  Otherwise
C L(gt2) primes the pipeline and enters the loop at L(top), or goes to
C L(end) if the second br.cloop also falls through.
312 ALIGN(32)
313 L(b10): br.cloop.dptk L(gt2)
C Complete short path: only ux, uy, rx, ry, v0, v1 are involved.
314 xma.l fp0b_1 = ux, v0, rx
315 xma.hu fp1b_1 = ux, v0, rx
317 xma.l fp0b_2 = uy, v0, ry
318 xma.hu fp1a_2 = uy, v0, ry
C The lowest result limb is stored straight from the FP register.
C The p11 pair is skipped for mpn_addmul_2s, where L(x10) cleared p11 and
C zeroed fp2a_1.  In that case fp1b_1 keeps the xma.hu result from above.
320 stf8 [rp] = fp0b_1, 8
321 (p11) xma.hu fp2a_1 = ux, v1, fp1b_1 C suppressed for addmul_2s
322 (p11) xma.l fp1b_1 = ux, v1, fp1b_1 C suppressed for addmul_2s
324 getfsig acc0 = fp0b_2
325 xma.l fp1b_2 = uy, v1, fp1a_2
326 xma.hu fp2a_2 = uy, v1, fp1a_2
C Move the partial products to integer registers and restore ar.lc from
C r2.  r8 starts as the high part of the last product.
328 getfsig pr1_1 = fp1b_1
329 getfsig acc1_1 = fp2a_1
330 mov ar.lc = r2
331 getfsig pr1_2 = fp1b_2
332 getfsig r8 = fp2a_2
C Second limb: s0 = pr1_1 + acc0, with p8 set on carry out and p9 otherwise.
334 add s0 = pr1_1, acc0
336 st8 [rp] = s0, 8
337 cmp.ltu p8, p9 = s0, pr1_1
C r31 = -1 - acc1_1, which is the bitwise complement of acc1_1.  Comparing
C it against pr1_2 detects carry out of pr1_2 + acc1_1 (+1 under p8)
C without needing the sum itself.
338 sub r31 = -1, acc1_1
C Third limb: acc0 = pr1_2 + acc1_1 + carry.  p10 is set on carry out.
340 .pred.rel "mutex", p8, p9
341 (p8) add acc0 = pr1_2, acc1_1, 1
342 (p9) add acc0 = pr1_2, acc1_1
343 (p8) cmp.leu p10, p0 = r31, pr1_2
344 (p9) cmp.ltu p10, p0 = r31, pr1_2
C Store the third limb, fold the final carry into r8 and return.
346 st8 [rp] = acc0, 8
347 (p10) add r8 = 1, r8
348 br.ret.sptk.many b0
C Long path.  acc1_0, pr1_0 and pr0_1 are zeroed because the carry chain
C at L(top) and L(end) reads them before any real value has been produced.
351 L(gt2):
352 {.mmi; ldf8 r_3 = [srp], 8
353 ldf8 u_3 = [up], 8
354 mov acc1_0 = 0
356 }{.mfi; ldf8 r_0 = [srp], 8
357 xma.l fp0b_1 = ux, v0, rx
358 mov pr1_0 = 0
359 }{.mfi; ldf8 u_0 = [up], 8
360 xma.hu fp1b_1 = ux, v0, rx
361 mov pr0_1 = 0
363 } xma.l fp0b_2 = uy, v0, ry
364 xma.hu fp1a_2 = uy, v0, ry
366 getfsig acc0 = fp0b_1
367 ldf8 r_1 = [srp], 8
368 (p11) xma.hu fp2a_1 = ux, v1, fp1b_1 C suppressed for addmul_2s
369 (p11) xma.l fp1b_1 = ux, v1, fp1b_1 C suppressed for addmul_2s
371 ldf8 u_1 = [up], 8
372 xma.l fp0b_3 = u_3, v0, r_3
373 xma.hu fp1a_3 = u_3, v0, r_3
375 getfsig pr0_2 = fp0b_2
376 ldf8 r_2 = [srp], 8
377 xma.l fp1b_2 = uy, v1, fp1a_2
378 xma.hu fp2a_2 = uy, v1, fp1a_2
380 ldf8 u_2 = [up], 8
381 getfsig pr1_1 = fp1b_1
C Clear the carry predicates p8 and p12 (p9 and p13 set) before the first
C carry-chain stage, then either enter the loop or go to the wind-down.
383 {.mfi; getfsig acc1_1 = fp2a_1
384 xma.l fp0b_0 = u_0, v0, r_0
385 cmp.ne p8, p9 = r0, r0
386 }{.mfb; cmp.ne p12, p13 = r0, r0
387 xma.hu fp1a_0 = u_0, v0, r_0
388 br.cloop.sptk.clr L(top)
390 br.many L(end)
C L(b11) -- feed-in for n & 3 = 3.  Primes the software pipeline and joins
C the unrolled loop at L(11), or the wind-down code at L(cj3) when the
C loop count is zero.
393 ALIGN(32)
C pr1_3, pr0_0 and acc1_3 are zeroed because the carry chain at L(11) and
C L(cj3) reads them before any real value has been produced.
394 L(b11): ldf8 r_2 = [srp], 8
395 mov pr1_3 = 0
396 mov pr0_0 = 0
398 ldf8 u_2 = [up], 8
399 mov acc1_3 = 0
400 br.cloop.dptk L(gt3)
C Short path, taken when br.cloop falls through.  p6 and p10 are cleared,
C p7 and p11 set.
402 cmp.ne p6, p7 = r0, r0
403 xma.l fp0b_0 = ux, v0, rx
404 xma.hu fp1b_0 = ux, v0, rx
406 cmp.ne p10, p11 = r0, r0
407 xma.l fp0b_1 = uy, v0, ry
408 xma.hu fp1a_1 = uy, v0, ry
C The p15 pair is skipped for mpn_addmul_2s, where L(x11) cleared p15 and
C zeroed fp2a_0.  In that case fp1b_0 keeps the xma.hu result from above.
410 getfsig acc0 = fp0b_0
411 (p15) xma.hu fp2a_0 = ux, v1, fp1b_0 C suppressed for addmul_2s
412 (p15) xma.l fp1b_0 = ux, v1, fp1b_0 C suppressed for addmul_2s
C NOTE(review): this short path adds r_2 to uy * v1 and fp1a_1 to u_2 * v0,
C the opposite pairing from L(gt3) below.  Both products appear to belong
C to the same limb position, so the totals should agree -- confirm against
C upstream before changing either side.
414 xma.l fp0b_2 = uy, v1, r_2
415 xma.hu fp1a_2 = uy, v1, r_2
417 getfsig pr0_1 = fp0b_1
418 xma.l fp1b_1 = u_2, v0, fp1a_1
419 xma.hu fp2a_1 = u_2, v0, fp1a_1
421 getfsig pr1_0 = fp1b_0
422 getfsig acc1_0 = fp2a_0
423 br L(cj3)
C Long path.  Interleaves the loads of r_3, u_3, r_0, u_0, r_1 and u_1
C that the loop body expects to be in flight when entered at L(11).
425 L(gt3): ldf8 r_3 = [srp], 8
426 xma.l fp0b_0 = ux, v0, rx
427 cmp.ne p10, p11 = r0, r0
428 ldf8 u_3 = [up], 8
429 xma.hu fp1b_0 = ux, v0, rx
430 cmp.ne p6, p7 = r0, r0
432 xma.l fp0b_1 = uy, v0, ry
433 xma.hu fp1a_1 = uy, v0, ry
435 getfsig acc0 = fp0b_0
436 ldf8 r_0 = [srp], 8
437 (p15) xma.hu fp2a_0 = ux, v1, fp1b_0 C suppressed for addmul_2s
438 ldf8 u_0 = [up], 8
439 (p15) xma.l fp1b_0 = ux, v1, fp1b_0 C suppressed for addmul_2s
441 xma.l fp0b_2 = u_2, v0, r_2
442 xma.hu fp1a_2 = u_2, v0, r_2
444 getfsig pr0_1 = fp0b_1
445 ldf8 r_1 = [srp], 8
446 xma.l fp1b_1 = uy, v1, fp1a_1
447 xma.hu fp2a_1 = uy, v1, fp1a_1
449 ldf8 u_1 = [up], 8
450 getfsig pr1_0 = fp1b_0
452 getfsig acc1_0 = fp2a_0
453 xma.l fp0b_3 = u_3, v0, r_3
454 xma.hu fp1a_3 = u_3, v0, r_3
455 br L(11)
458 C *** MAIN LOOP START ***
C Four-way unrolled, software-pipelined loop.  One pass stores four limbs
C through rp.  The pass is made of four identical stages of three
C instruction groups each (numbered 00..11 in the right-hand comments),
C with the register suffixes _0.._3 rotated from stage to stage.  The
C feed-in code enters at L(01), L(00) or L(11), or at L(top) itself.
C
C Within one stage:
C   group 1: s0 = pr1_k + acc0 (+1 under the carry predicate), and the
C            xma pair for u_k * v1 + fp1a_k is issued.
C   group 2: the carry predicates are recomputed.  After an add with
C            carry-in the test is cmp.leu, without carry-in it is cmp.ltu.
C   group 3: s0 is stored, acc0 = pr0_k + acc1_k (+1 under carry), and the
C            xma pair for u_k * v0 + r_k is issued.
C Each group also moves one earlier xma result to an integer register with
C getfsig and issues a load of a later r or u limb.
459 ALIGN(32)
460 L(top): C 00
461 .pred.rel "mutex", p12, p13
462 getfsig pr0_3 = fp0b_3
463 ldf8 r_3 = [srp], 8
464 xma.l fp1b_3 = u_3, v1, fp1a_3
465 (p12) add s0 = pr1_0, acc0, 1
466 (p13) add s0 = pr1_0, acc0
467 xma.hu fp2a_3 = u_3, v1, fp1a_3
468 ;; C 01
469 .pred.rel "mutex", p8, p9
470 .pred.rel "mutex", p12, p13
471 ldf8 u_3 = [up], 8
472 getfsig pr1_2 = fp1b_2
C p6/p7: carry out of the add that produced acc0.
C p10/p11: carry out of the add that produced s0.
473 (p8) cmp.leu p6, p7 = acc0, pr0_1
474 (p9) cmp.ltu p6, p7 = acc0, pr0_1
475 (p12) cmp.leu p10, p11 = s0, pr1_0
476 (p13) cmp.ltu p10, p11 = s0, pr1_0
477 ;; C 02
478 .pred.rel "mutex", p6, p7
479 getfsig acc1_2 = fp2a_2
480 st8 [rp] = s0, 8
481 xma.l fp0b_1 = u_1, v0, r_1
482 (p6) add acc0 = pr0_2, acc1_0, 1
483 (p7) add acc0 = pr0_2, acc1_0
484 xma.hu fp1a_1 = u_1, v0, r_1
485 ;; C 03
C Second stage; entry point from L(b01) via L(gt5).
486 L(01):
487 .pred.rel "mutex", p10, p11
488 getfsig pr0_0 = fp0b_0
489 ldf8 r_0 = [srp], 8
490 xma.l fp1b_0 = u_0, v1, fp1a_0
491 (p10) add s0 = pr1_1, acc0, 1
492 (p11) add s0 = pr1_1, acc0
493 xma.hu fp2a_0 = u_0, v1, fp1a_0
494 ;; C 04
495 .pred.rel "mutex", p6, p7
496 .pred.rel "mutex", p10, p11
497 ldf8 u_0 = [up], 8
498 getfsig pr1_3 = fp1b_3
499 (p6) cmp.leu p8, p9 = acc0, pr0_2
500 (p7) cmp.ltu p8, p9 = acc0, pr0_2
501 (p10) cmp.leu p12, p13 = s0, pr1_1
502 (p11) cmp.ltu p12, p13 = s0, pr1_1
503 ;; C 05
504 .pred.rel "mutex", p8, p9
505 getfsig acc1_3 = fp2a_3
506 st8 [rp] = s0, 8
507 xma.l fp0b_2 = u_2, v0, r_2
508 (p8) add acc0 = pr0_3, acc1_1, 1
509 (p9) add acc0 = pr0_3, acc1_1
510 xma.hu fp1a_2 = u_2, v0, r_2
511 ;; C 06
C Third stage; entry point from L(b00) via L(gt4).
512 L(00):
513 .pred.rel "mutex", p12, p13
514 getfsig pr0_1 = fp0b_1
515 ldf8 r_1 = [srp], 8
516 xma.l fp1b_1 = u_1, v1, fp1a_1
517 (p12) add s0 = pr1_2, acc0, 1
518 (p13) add s0 = pr1_2, acc0
519 xma.hu fp2a_1 = u_1, v1, fp1a_1
520 ;; C 07
521 .pred.rel "mutex", p8, p9
522 .pred.rel "mutex", p12, p13
523 ldf8 u_1 = [up], 8
524 getfsig pr1_0 = fp1b_0
525 (p8) cmp.leu p6, p7 = acc0, pr0_3
526 (p9) cmp.ltu p6, p7 = acc0, pr0_3
527 (p12) cmp.leu p10, p11 = s0, pr1_2
528 (p13) cmp.ltu p10, p11 = s0, pr1_2
529 ;; C 08
530 .pred.rel "mutex", p6, p7
531 getfsig acc1_0 = fp2a_0
532 st8 [rp] = s0, 8
533 xma.l fp0b_3 = u_3, v0, r_3
534 (p6) add acc0 = pr0_0, acc1_2, 1
535 (p7) add acc0 = pr0_0, acc1_2
536 xma.hu fp1a_3 = u_3, v0, r_3
537 ;; C 09
C Fourth stage; entry point from L(b11) via L(gt3).
538 L(11):
539 .pred.rel "mutex", p10, p11
540 getfsig pr0_2 = fp0b_2
541 ldf8 r_2 = [srp], 8
542 xma.l fp1b_2 = u_2, v1, fp1a_2
543 (p10) add s0 = pr1_3, acc0, 1
544 (p11) add s0 = pr1_3, acc0
545 xma.hu fp2a_2 = u_2, v1, fp1a_2
546 ;; C 10
547 .pred.rel "mutex", p6, p7
548 .pred.rel "mutex", p10, p11
549 ldf8 u_2 = [up], 8
550 getfsig pr1_1 = fp1b_1
551 (p6) cmp.leu p8, p9 = acc0, pr0_0
552 (p7) cmp.ltu p8, p9 = acc0, pr0_0
553 (p10) cmp.leu p12, p13 = s0, pr1_3
554 (p11) cmp.ltu p12, p13 = s0, pr1_3
555 ;; C 11
556 .pred.rel "mutex", p8, p9
557 getfsig acc1_1 = fp2a_1
558 st8 [rp] = s0, 8
559 xma.l fp0b_0 = u_0, v0, r_0
560 (p8) add acc0 = pr0_1, acc1_3, 1
561 (p9) add acc0 = pr0_1, acc1_3
562 xma.hu fp1a_0 = u_0, v0, r_0
C Counted loop branch on ar.lc.  Nothing in this file branches to L(10).
563 L(10): br.cloop.sptk.clr L(top) C 12
565 C *** MAIN LOOP END ***
C Wind-down.  Drains the software pipeline with the same stages as the
C main loop but without issuing further loads.  L(end) is reached from the
C loop and from L(gt2); L(cj5), L(cj4) and L(cj3) are the joining points
C for the short feed-in paths of L(b01), L(b00) and L(b11).  The code
C finishes by storing the last two limbs through rp, folding the final
C carries into r8, restoring ar.lc from r2 and returning.
C NOTE(review): r8 is presumably the return value register under the
C IA-64 calling convention -- confirm.
566 L(end):
567 .pred.rel "mutex", p12, p13
568 {.mfi; getfsig pr0_3 = fp0b_3
569 xma.l fp1b_3 = u_3, v1, fp1a_3
570 (p12) add s0 = pr1_0, acc0, 1
571 }{.mfi; (p13) add s0 = pr1_0, acc0
572 xma.hu fp2a_3 = u_3, v1, fp1a_3
573 nop 1
575 } .pred.rel "mutex", p8, p9
576 .pred.rel "mutex", p12, p13
577 {.mmi; getfsig pr1_2 = fp1b_2
578 st8 [rp] = s0, 8
579 (p8) cmp.leu p6, p7 = acc0, pr0_1
580 }{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1
581 (p12) cmp.leu p10, p11 = s0, pr1_0
582 (p13) cmp.ltu p10, p11 = s0, pr1_0
584 } .pred.rel "mutex", p6, p7
585 {.mfi; getfsig acc1_2 = fp2a_2
586 xma.l fp0b_1 = u_1, v0, r_1
587 nop 1
588 }{.mmf; (p6) add acc0 = pr0_2, acc1_0, 1
589 (p7) add acc0 = pr0_2, acc1_0
590 xma.hu fp1a_1 = u_1, v0, r_1
C Joining point for the L(b01) short path.
593 L(cj5):
594 .pred.rel "mutex", p10, p11
595 {.mfi; getfsig pr0_0 = fp0b_0
596 xma.l fp1b_0 = u_0, v1, fp1a_0
597 (p10) add s0 = pr1_1, acc0, 1
598 }{.mfi; (p11) add s0 = pr1_1, acc0
599 xma.hu fp2a_0 = u_0, v1, fp1a_0
600 nop 1
602 } .pred.rel "mutex", p6, p7
603 .pred.rel "mutex", p10, p11
604 {.mmi; getfsig pr1_3 = fp1b_3
605 st8 [rp] = s0, 8
606 (p6) cmp.leu p8, p9 = acc0, pr0_2
607 }{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2
608 (p10) cmp.leu p12, p13 = s0, pr1_1
609 (p11) cmp.ltu p12, p13 = s0, pr1_1
611 } .pred.rel "mutex", p8, p9
612 {.mfi; getfsig acc1_3 = fp2a_3
613 xma.l fp0b_2 = u_2, v0, r_2
614 nop 1
615 }{.mmf; (p8) add acc0 = pr0_3, acc1_1, 1
616 (p9) add acc0 = pr0_3, acc1_1
617 xma.hu fp1a_2 = u_2, v0, r_2
C Joining point for the L(b00) short path.  From here on no new
C u * v0 products are started.
620 L(cj4):
621 .pred.rel "mutex", p12, p13
622 {.mfi; getfsig pr0_1 = fp0b_1
623 xma.l fp1b_1 = u_1, v1, fp1a_1
624 (p12) add s0 = pr1_2, acc0, 1
625 }{.mfi; (p13) add s0 = pr1_2, acc0
626 xma.hu fp2a_1 = u_1, v1, fp1a_1
627 nop 1
629 } .pred.rel "mutex", p8, p9
630 .pred.rel "mutex", p12, p13
631 {.mmi; getfsig pr1_0 = fp1b_0
632 st8 [rp] = s0, 8
633 (p8) cmp.leu p6, p7 = acc0, pr0_3
634 }{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_3
635 (p12) cmp.leu p10, p11 = s0, pr1_2
636 (p13) cmp.ltu p10, p11 = s0, pr1_2
638 } .pred.rel "mutex", p6, p7
639 {.mmi; getfsig acc1_0 = fp2a_0
640 (p6) add acc0 = pr0_0, acc1_2, 1
641 (p7) add acc0 = pr0_0, acc1_2
C Joining point for the L(b11) short path.  The last xma pair of the whole
C operation, u_2 * v1 + fp1a_2, is issued here.
644 L(cj3):
645 .pred.rel "mutex", p10, p11
646 {.mfi; getfsig pr0_2 = fp0b_2
647 xma.l fp1b_2 = u_2, v1, fp1a_2
648 (p10) add s0 = pr1_3, acc0, 1
649 }{.mfi; (p11) add s0 = pr1_3, acc0
650 xma.hu fp2a_2 = u_2, v1, fp1a_2
651 nop 1
653 } .pred.rel "mutex", p6, p7
654 .pred.rel "mutex", p10, p11
655 {.mmi; getfsig pr1_1 = fp1b_1
656 st8 [rp] = s0, 8
657 (p6) cmp.leu p8, p9 = acc0, pr0_0
658 }{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_0
659 (p10) cmp.leu p12, p13 = s0, pr1_3
660 (p11) cmp.ltu p12, p13 = s0, pr1_3
662 } .pred.rel "mutex", p8, p9
663 {.mmi; getfsig acc1_1 = fp2a_1
664 (p8) add acc0 = pr0_1, acc1_3, 1
665 (p9) add acc0 = pr0_1, acc1_3
C Pure carry propagation from here: only integer adds, compares, stores
C and the remaining getfsig moves.
667 } .pred.rel "mutex", p12, p13
668 {.mmi; (p12) add s0 = pr1_0, acc0, 1
669 (p13) add s0 = pr1_0, acc0
670 nop 1
672 } .pred.rel "mutex", p8, p9
673 .pred.rel "mutex", p12, p13
674 {.mmi; getfsig pr1_2 = fp1b_2
675 st8 [rp] = s0, 8
676 (p8) cmp.leu p6, p7 = acc0, pr0_1
677 }{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1
678 (p12) cmp.leu p10, p11 = s0, pr1_0
679 (p13) cmp.ltu p10, p11 = s0, pr1_0
C r8 starts as the high part of the last product.
681 } .pred.rel "mutex", p6, p7
682 {.mmi; getfsig r8 = fp2a_2
683 (p6) add acc0 = pr0_2, acc1_0, 1
684 (p7) add acc0 = pr0_2, acc1_0
686 } .pred.rel "mutex", p10, p11
687 {.mmi; (p10) add s0 = pr1_1, acc0, 1
688 (p11) add s0 = pr1_1, acc0
689 (p6) cmp.leu p8, p9 = acc0, pr0_2
691 } .pred.rel "mutex", p10, p11
692 {.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2
693 (p10) cmp.leu p12, p13 = s0, pr1_1
694 (p11) cmp.ltu p12, p13 = s0, pr1_1
C Last limb: acc0 = pr1_2 + acc1_1 plus the carry in p8, then plus the
C carry in p12.  p10 collects the carry out of either addition; the second
C one carries out exactly when the incremented acc0 is zero.
696 } .pred.rel "mutex", p8, p9
697 {.mmi; st8 [rp] = s0, 8
698 (p8) add acc0 = pr1_2, acc1_1, 1
699 (p9) add acc0 = pr1_2, acc1_1
701 } .pred.rel "mutex", p8, p9
702 {.mmi; (p8) cmp.leu p10, p11 = acc0, pr1_2
703 (p9) cmp.ltu p10, p11 = acc0, pr1_2
704 (p12) add acc0 = 1, acc0
706 }{.mmi; st8 [rp] = acc0, 8
707 (p12) cmpeqor p10, p0 = 0, acc0
708 nop 1
C Fold the final carry into r8, restore the saved ar.lc and return.
710 }{.mib; (p10) add r8 = 1, r8
711 mov ar.lc = r2
712 br.ret.sptk.many b0
714 EPILOGUE()
715 ASM_END()