beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / ia64 / bdiv_dbm1c.asm
blob47e4553cda5dd7a42d63fdf4b899603ab9145ec6
1 dnl IA-64 mpn_bdiv_dbm1.
3 dnl Contributed to the GNU project by Torbjorn Granlund.
5 dnl Copyright 2008, 2009 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb
36 C Itanium: 4
37 C Itanium 2: 2
39 C TODO
40 C * Optimize feed-in and wind-down code, both for speed and code size.
42 C INPUT PARAMETERS
C Register aliases for the incoming arguments, as they are used by the body:
C   rp (r32)  address the result limbs are stored to, advanced by 8 per store
C   up (r33)  address the source limbs are loaded from, advanced by 8 per load
C   n  (r34)  limb count; n & 3 picks the feed-in path, (n-1) >> 2 goes to ar.lc
C   bd (r35)  multiplier, moved to f6 and used in every xma.l / xma.hu
C A fifth incoming value in r36 has no alias; it is copied to r15 on entry.
43 define(`rp', `r32')
44 define(`up', `r33')
45 define(`n', `r34')
46 define(`bd', `r35')
48 ASM_START()
C mpn_bdiv_dbm1c
C
C Register roles visible in this routine:
C   r32 = rp, r33 = up, r34 = n, r35 = bd, r36 = initial running value
C   r15      running value h, seeded from r36
C   f6       bd as a floating-point significand, the common multiplier
C   f9-f13   source limbs loaded from up
C   f32-f39  products: even register = low half (xma.l), odd = high half (xma.hu)
C   r20-r27  the same products moved to integer registers with getf.sig
C   r16-r19  result limbs waiting to be stored
C   r2       saved copy of ar.lc, restored just before the return
C   r8       final running value, written immediately before br.ret
C
C Per-limb step, as performed in the loop and in the wind-down code:
C   lo:hi  = limb * bd
C   result = h - lo                     stored to rp
C   h      = result - hi - borrow       borrow = 1 when h < lo, unsigned
C The borrow is applied by the predicated pair: the p6 form subtracts the
C extra 1, the p7 form does not.
49 PROLOGUE(mpn_bdiv_dbm1c)
50 .prologue
51 .save ar.lc, r2
52 .body
54 ifdef(`HAVE_ABI_32',
55 ` addp4 rp = 0, rp C M I
56 addp4 up = 0, up C M I
57 zxt4 n = n C I
60 {.mmb
61 mov r15 = r36 C M I
62 ldf8 f9 = [up], 8 C M
63 nop.b 0 C B
65 .Lcommon:
C r16 = n - 1, r14 = n & 3, and ar.lc is saved in r2 for the epilogue.
66 {.mii
67 adds r16 = -1, n C M I
68 mov r2 = ar.lc C I0
69 and r14 = 3, n C M I
C r31 = (n - 1) >> 2 becomes the loop count; p10/p11/p12 select the
C feed-in path for n & 3 equal to 0, 2 and 3 respectively.
72 {.mii
73 setf.sig f6 = bd C M2 M3
74 shr.u r31 = r16, 2 C I0
75 cmp.eq p10, p0 = 0, r14 C M I
77 {.mii
78 nop.m 0 C M
79 cmp.eq p11, p0 = 2, r14 C M I
80 cmp.eq p12, p0 = 3, r14 C M I
C r0 != r0 is never true, so these two compares clear p6 and p8 and set
C p7 and p9.
83 {.mii
84 cmp.ne p6, p7 = r0, r0 C M I
85 mov.i ar.lc = r31 C I0
86 cmp.ne p8, p9 = r0, r0 C M I
C Three-way dispatch; falling through means n & 3 == 1.
88 {.bbb
89 (p10) br.dptk .Lb00 C B
90 (p11) br.dptk .Lb10 C B
91 (p12) br.dptk .Lb11 C B
C Feed-in for n & 3 == 1.  Each br.cloop below consumes one count from
C ar.lc; when it is not taken the code that follows handles a short
C operand and jumps straight into the wind-down chain.
95 .Lb01: br.cloop.dptk .grt1
C Not taken: a single limb, finished at .Lcj1.
97 xma.l f38 = f9, f6, f0
98 xma.hu f39 = f9, f6, f0
100 getf.sig r26 = f38
101 getf.sig r27 = f39
102 br .Lcj1
104 .grt1: ldf8 f10 = [r33], 8
106 ldf8 f11 = [r33], 8
108 ldf8 f12 = [r33], 8
110 xma.l f38 = f9, f6, f0
111 xma.hu f39 = f9, f6, f0
113 ldf8 f13 = [r33], 8
115 xma.l f32 = f10, f6, f0
116 xma.hu f33 = f10, f6, f0
117 br.cloop.dptk .grt5
C Not taken: five limbs, finished at .Lcj5.
120 getf.sig r26 = f38
121 xma.l f34 = f11, f6, f0
122 xma.hu f35 = f11, f6, f0
124 getf.sig r27 = f39
126 getf.sig r20 = f32
127 xma.l f36 = f12, f6, f0
128 xma.hu f37 = f12, f6, f0
130 getf.sig r21 = f33
132 getf.sig r22 = f34
133 xma.l f38 = f13, f6, f0
134 xma.hu f39 = f13, f6, f0
135 br .Lcj5
C More than five limbs: keep loading ahead and enter the loop at .LL01.
137 .grt5: ldf8 f10 = [r33], 8
139 getf.sig r26 = f38
140 xma.l f34 = f11, f6, f0
141 xma.hu f35 = f11, f6, f0
143 getf.sig r27 = f39
144 ldf8 f11 = [r33], 8
146 getf.sig r20 = f32
147 xma.l f36 = f12, f6, f0
148 xma.hu f37 = f12, f6, f0
150 getf.sig r21 = f33
151 ldf8 f12 = [r33], 8
153 getf.sig r22 = f34
154 xma.l f38 = f13, f6, f0
155 xma.hu f39 = f13, f6, f0
156 br .LL01
C Feed-in for n & 3 == 2.
158 .Lb10: ldf8 f13 = [r33], 8
159 br.cloop.dptk .grt2
C Not taken: two limbs, finished at .Lcj2.
162 xma.l f36 = f9, f6, f0
163 xma.hu f37 = f9, f6, f0
165 xma.l f38 = f13, f6, f0
166 xma.hu f39 = f13, f6, f0
168 getf.sig r24 = f36
170 getf.sig r25 = f37
172 getf.sig r26 = f38
174 getf.sig r27 = f39
175 br .Lcj2
177 .grt2: ldf8 f10 = [r33], 8
179 ldf8 f11 = [r33], 8
181 xma.l f36 = f9, f6, f0
182 xma.hu f37 = f9, f6, f0
184 ldf8 f12 = [r33], 8
186 xma.l f38 = f13, f6, f0
187 xma.hu f39 = f13, f6, f0
189 ldf8 f13 = [r33], 8
191 getf.sig r24 = f36
192 xma.l f32 = f10, f6, f0
193 xma.hu f33 = f10, f6, f0
194 br.cloop.dptk .grt6
C Not taken: six limbs, finished at .Lcj6.
196 getf.sig r25 = f37
198 getf.sig r26 = f38
199 xma.l f34 = f11, f6, f0
200 xma.hu f35 = f11, f6, f0
202 getf.sig r27 = f39
204 getf.sig r20 = f32
205 xma.l f36 = f12, f6, f0
206 xma.hu f37 = f12, f6, f0
207 br .Lcj6
C More than six limbs: enter the loop at .LL10.
209 .grt6: getf.sig r25 = f37
210 ldf8 f10 = [r33], 8
212 getf.sig r26 = f38
213 xma.l f34 = f11, f6, f0
214 xma.hu f35 = f11, f6, f0
216 getf.sig r27 = f39
217 ldf8 f11 = [r33], 8
219 getf.sig r20 = f32
220 xma.l f36 = f12, f6, f0
221 xma.hu f37 = f12, f6, f0
222 br .LL10
C Feed-in for n & 3 == 3.
225 .Lb11: ldf8 f12 = [r33], 8
227 ldf8 f13 = [r33], 8
228 br.cloop.dptk .grt3
C Not taken: three limbs, finished at .Lcj3.
231 xma.l f34 = f9, f6, f0
232 xma.hu f35 = f9, f6, f0
234 xma.l f36 = f12, f6, f0
235 xma.hu f37 = f12, f6, f0
237 getf.sig r22 = f34
238 xma.l f38 = f13, f6, f0
239 xma.hu f39 = f13, f6, f0
241 getf.sig r23 = f35
243 getf.sig r24 = f36
245 getf.sig r25 = f37
247 getf.sig r26 = f38
248 br .Lcj3
250 .grt3: ldf8 f10 = [r33], 8
252 xma.l f34 = f9, f6, f0
253 xma.hu f35 = f9, f6, f0
255 ldf8 f11 = [r33], 8
257 xma.l f36 = f12, f6, f0
258 xma.hu f37 = f12, f6, f0
260 ldf8 f12 = [r33], 8
262 getf.sig r22 = f34
263 xma.l f38 = f13, f6, f0
264 xma.hu f39 = f13, f6, f0
266 getf.sig r23 = f35
267 ldf8 f13 = [r33], 8
269 getf.sig r24 = f36
270 xma.l f32 = f10, f6, f0
271 xma.hu f33 = f10, f6, f0
272 br.cloop.dptk .grt7
C Not taken: seven limbs, finished at .Lcj7.
274 getf.sig r25 = f37
276 getf.sig r26 = f38
277 xma.l f34 = f11, f6, f0
278 xma.hu f35 = f11, f6, f0
279 br .Lcj7
C More than seven limbs: enter the loop at .LL11.
281 .grt7: getf.sig r25 = f37
282 ldf8 f10 = [r33], 8
284 getf.sig r26 = f38
285 xma.l f34 = f11, f6, f0
286 xma.hu f35 = f11, f6, f0
287 br .LL11
C Feed-in for n & 3 == 0.
290 .Lb00: ldf8 f11 = [r33], 8
292 ldf8 f12 = [r33], 8
294 ldf8 f13 = [r33], 8
295 br.cloop.dptk .grt4
C Not taken: four limbs, finished at .Lcj4.
298 xma.l f32 = f9, f6, f0
299 xma.hu f33 = f9, f6, f0
301 xma.l f34 = f11, f6, f0
302 xma.hu f35 = f11, f6, f0
304 getf.sig r20 = f32
305 xma.l f36 = f12, f6, f0
306 xma.hu f37 = f12, f6, f0
308 getf.sig r21 = f33
310 getf.sig r22 = f34
311 xma.l f38 = f13, f6, f0
312 xma.hu f39 = f13, f6, f0
314 getf.sig r23 = f35
316 getf.sig r24 = f36
317 br .Lcj4
319 .grt4: xma.l f32 = f9, f6, f0
320 xma.hu f33 = f9, f6, f0
322 ldf8 f10 = [r33], 8
324 xma.l f34 = f11, f6, f0
325 xma.hu f35 = f11, f6, f0
327 ldf8 f11 = [r33], 8
329 getf.sig r20 = f32
330 xma.l f36 = f12, f6, f0
331 xma.hu f37 = f12, f6, f0
333 getf.sig r21 = f33
334 ldf8 f12 = [r33], 8
336 getf.sig r22 = f34
337 xma.l f38 = f13, f6, f0
338 xma.hu f39 = f13, f6, f0
340 getf.sig r23 = f35
341 ldf8 f13 = [r33], 8
343 getf.sig r24 = f36
344 xma.l f32 = f10, f6, f0
345 xma.hu f33 = f10, f6, f0
C Taken: more than eight limbs, enter the loop at .LL00.
C Not taken: exactly eight limbs, finished at .Lcj8.
346 br.cloop.dptk .LL00
347 br .Lcj8
C The loop below handles four limbs per iteration.  It is entered at .LL00,
C .LL11, .LL10 or .LL01 depending on the feed-in path, and each quarter has
C the same shape: fetch a product half, start the next multiply, compare
C and subtract the low half, store the result limb, subtract the high half
C with or without the borrow, and load a limb for a later iteration.
349 C *** MAIN LOOP START ***
350 ALIGN(32)
351 .Ltop:
352 .pred.rel "mutex",p6,p7
353 C .mfi
354 getf.sig r24 = f36
355 xma.l f32 = f10, f6, f0
356 (p6) sub r15 = r19, r27, 1
357 C .mfi
358 st8 [r32] = r19, 8
359 xma.hu f33 = f10, f6, f0
360 (p7) sub r15 = r19, r27
362 .LL00:
363 C .mfi
364 getf.sig r25 = f37
365 nop.f 0
366 cmp.ltu p6, p7 = r15, r20
367 C .mib
368 ldf8 f10 = [r33], 8
369 sub r16 = r15, r20
370 nop.b 0
373 C .mfi
374 getf.sig r26 = f38
375 xma.l f34 = f11, f6, f0
376 (p6) sub r15 = r16, r21, 1
377 C .mfi
378 st8 [r32] = r16, 8
379 xma.hu f35 = f11, f6, f0
380 (p7) sub r15 = r16, r21
382 .LL11:
383 C .mfi
384 getf.sig r27 = f39
385 nop.f 0
386 cmp.ltu p6, p7 = r15, r22
387 C .mib
388 ldf8 f11 = [r33], 8
389 sub r17 = r15, r22
390 nop.b 0
393 C .mfi
394 getf.sig r20 = f32
395 xma.l f36 = f12, f6, f0
396 (p6) sub r15 = r17, r23, 1
397 C .mfi
398 st8 [r32] = r17, 8
399 xma.hu f37 = f12, f6, f0
400 (p7) sub r15 = r17, r23
402 .LL10:
403 C .mfi
404 getf.sig r21 = f33
405 nop.f 0
406 cmp.ltu p6, p7 = r15, r24
407 C .mib
408 ldf8 f12 = [r33], 8
409 sub r18 = r15, r24
410 nop.b 0
413 C .mfi
414 getf.sig r22 = f34
415 xma.l f38 = f13, f6, f0
416 (p6) sub r15 = r18, r25, 1
417 C .mfi
418 st8 [r32] = r18, 8
419 xma.hu f39 = f13, f6, f0
420 (p7) sub r15 = r18, r25
422 .LL01:
423 C .mfi
424 getf.sig r23 = f35
425 nop.f 0
426 cmp.ltu p6, p7 = r15, r26
427 C .mib
428 ldf8 f13 = [r33], 8
429 sub r19 = r15, r26
430 br.cloop.sptk.few .Ltop
431 C *** MAIN LOOP END ***
C Wind-down.  Falling out of the loop drains the limbs already loaded; no
C further ldf8 is issued from here on.  The .LcjK labels form a single
C fall-through chain, and entering at .LcjK performs the last K stores.
434 getf.sig r24 = f36
435 xma.l f32 = f10, f6, f0
436 (p6) sub r15 = r19, r27, 1
437 st8 [r32] = r19, 8
438 xma.hu f33 = f10, f6, f0
439 (p7) sub r15 = r19, r27
441 .Lcj8: getf.sig r25 = f37
442 cmp.ltu p6, p7 = r15, r20
443 sub r16 = r15, r20
445 getf.sig r26 = f38
446 xma.l f34 = f11, f6, f0
447 (p6) sub r15 = r16, r21, 1
448 st8 [r32] = r16, 8
449 xma.hu f35 = f11, f6, f0
450 (p7) sub r15 = r16, r21
452 .Lcj7: getf.sig r27 = f39
453 cmp.ltu p6, p7 = r15, r22
454 sub r17 = r15, r22
456 getf.sig r20 = f32
457 xma.l f36 = f12, f6, f0
458 (p6) sub r15 = r17, r23, 1
459 st8 [r32] = r17, 8
460 xma.hu f37 = f12, f6, f0
461 (p7) sub r15 = r17, r23
463 .Lcj6: getf.sig r21 = f33
464 cmp.ltu p6, p7 = r15, r24
465 sub r18 = r15, r24
467 getf.sig r22 = f34
468 xma.l f38 = f13, f6, f0
469 (p6) sub r15 = r18, r25, 1
470 st8 [r32] = r18, 8
471 xma.hu f39 = f13, f6, f0
472 (p7) sub r15 = r18, r25
C From .Lcj5 onward no new multiplies are started; only products that are
C already in flight are fetched and consumed.
474 .Lcj5: getf.sig r23 = f35
475 cmp.ltu p6, p7 = r15, r26
476 sub r19 = r15, r26
478 getf.sig r24 = f36
479 (p6) sub r15 = r19, r27, 1
480 st8 [r32] = r19, 8
481 (p7) sub r15 = r19, r27
483 .Lcj4: getf.sig r25 = f37
484 cmp.ltu p6, p7 = r15, r20
485 sub r16 = r15, r20
487 getf.sig r26 = f38
488 (p6) sub r15 = r16, r21, 1
489 st8 [r32] = r16, 8
490 (p7) sub r15 = r16, r21
492 .Lcj3: getf.sig r27 = f39
493 cmp.ltu p6, p7 = r15, r22
494 sub r17 = r15, r22
496 (p6) sub r15 = r17, r23, 1
497 st8 [r32] = r17, 8
498 (p7) sub r15 = r17, r23
500 .Lcj2: cmp.ltu p6, p7 = r15, r24
501 sub r18 = r15, r24
503 (p6) sub r15 = r18, r25, 1
504 st8 [r32] = r18, 8
505 (p7) sub r15 = r18, r25
C Last limb: the store has no post-increment, and the final running value
C goes to r8 instead of r15.  ar.lc is restored from r2 before returning.
507 .Lcj1: cmp.ltu p6, p7 = r15, r26
508 sub r19 = r15, r26
510 (p6) sub r8 = r19, r27, 1
511 st8 [r32] = r19
512 (p7) sub r8 = r19, r27
513 mov ar.lc = r2
514 br.ret.sptk.many b0
515 EPILOGUE()
516 ASM_END()