beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / ia64 / divrem_1.asm
blobe8878209db7fc8d28664bb5ee2e668375c092a1c
1 dnl IA-64 mpn_divrem_1 and mpn_preinv_divrem_1 -- Divide an mpn number by an
2 dnl unnormalized limb.
4 dnl Contributed to the GNU project by Torbjorn Granlund.
6 dnl Copyright 2002, 2004, 2005 Free Software Foundation, Inc.
8 dnl This file is part of the GNU MP Library.
9 dnl
10 dnl The GNU MP Library is free software; you can redistribute it and/or modify
11 dnl it under the terms of either:
12 dnl
13 dnl * the GNU Lesser General Public License as published by the Free
14 dnl Software Foundation; either version 3 of the License, or (at your
15 dnl option) any later version.
16 dnl
17 dnl or
18 dnl
19 dnl * the GNU General Public License as published by the Free Software
20 dnl Foundation; either version 2 of the License, or (at your option) any
21 dnl later version.
22 dnl
23 dnl or both in parallel, as here.
24 dnl
25 dnl The GNU MP Library is distributed in the hope that it will be useful, but
26 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
28 dnl for more details.
29 dnl
30 dnl You should have received copies of the GNU General Public License and the
31 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
32 dnl see https://www.gnu.org/licenses/.
34 include(`../config.m4')
37 C cycles/limb
38 C Itanium: 40-42
39 C Itanium 2: 29-30
41 C This was generated by gcc, then the loops were optimized. The preinv entry
42 C point was shoehorned into the file. Lots of things outside the loops could
43 C be streamlined. It would probably be a good idea to merge the loops for
44 C normalized and unnormalized divisor, since the shifting stuff is done for
45 C free in parallel with other operations. It would even be possible to merge
46 C all loops, if the ld8 were made conditional.
48 C TODO
49 C * Consider delaying inversion for normalized mpn_divrem_1 entry till after
50 C computing leading limb.
51 C * Inline and interleave limb inversion code with loop setup code.
53 ASM_START()
55 C HP's assembler requires these declarations for importing mpn_invert_limb
56 .global mpn_invert_limb
57 .type mpn_invert_limb,@function
59 C INPUT PARAMETERS
60 C rp = r32
61 C qxn = r33
62 C up = r34
63 C n = r35
64 C vl = r36
65 C vlinv = r37 (preinv only)
66 C cnt = r38 (preinv only)
C mpn_preinv_divrem_1 -- entry point that takes a caller-supplied inverse and
C shift count instead of computing them.
C Inputs: r32 = rp, r33 = qxn, r34 = up, r35 = n, r36 = vl, r37 = vlinv,
C r38 = cnt.
C This entry only performs setup: it consumes the most significant dividend
C limb and then branches to .Lpn, .Lpu or .L435, which this listing shows
C inside mpn_divrem_1. The quotient loops and the return happen there.
C NOTE(review): vl is shifted left by cnt on the unnormalized path below, so
C presumably vlinv is the inverse of the shifted divisor -- confirm against
C the caller contract.
C NOTE(review): no instruction-group stop lines are visible in this block and
C the embedded numbering has gaps (for example 85 to 87). Presumably stop-only
C lines were dropped from this listing -- confirm against upstream before
C assembling.
68 PROLOGUE(mpn_preinv_divrem_1)
69 .prologue
70 .save ar.pfs, r42
C Register frame: 7 inputs, 8 locals, 1 output, no rotating registers.
71 alloc r42 = ar.pfs, 7, 8, 1, 0
72 .save ar.lc, r44
C r44 keeps the incoming ar.lc and r41 the return address b0. These are the
C same registers mpn_divrem_1 uses, so the shared .Lret can restore them.
73 mov r44 = ar.lc
74 .save rp, r41
75 mov r41 = b0
76 .body
C 32-bit ABI only: widen the pointer arguments rp and up with addp4 and
C sign-extend the size arguments qxn and n.
C NOTE(review): the closing quote and parenthesis of this ifdef are not
C visible in this listing (numbering jumps from 81 to 84) -- confirm upstream.
77 ifdef(`HAVE_ABI_32',
78 ` addp4 r32 = 0, r32
79 sxt4 r33 = r33
80 addp4 r34 = 0, r34
81 sxt4 r35 = r35
C r40 = cnt. The copy is needed because r38 is reused below as the running
C remainder.
84 mov r40 = r38
C r34 = up + 8*n, then back one limb so it addresses the most significant limb.
85 shladd r34 = r35, 3, r34
87 adds r34 = -8, r34
C r39 = most significant dividend limb. r34 is post-decremented to the next
C lower limb.
89 ld8 r39 = [r34], -8
C r15 = n + qxn, the total number of quotient limbs.
92 add r15 = r35, r33
C r8 = vlinv. The shared code moves r8 into f6 as the divisor inverse.
94 mov r8 = r37
95 shladd r32 = r15, 3, r32 C r32 = rp + n + qxn
C p8 = signed 0 <= vl, i.e. the top bit of vl is clear and the divisor is
C not normalized.
96 cmp.le p8, p0 = 0, r36
98 adds r32 = -8, r32 C r32 = rp + n + qxn - 1
C p6 = unsigned vl <= high limb, p7 = the opposite.
99 cmp.leu p6, p7 = r36, r39
100 (p8) br.cond.dpnt .Lpunnorm
C Normalized divisor: the top quotient limb is 1 when the high limb is >= vl
C and 0 otherwise. The remainder r38 starts as the high limb, reduced by vl
C in the first case.
103 (p6) addl r15 = 1, r0
104 (p7) mov r15 = r0
106 (p6) sub r38 = r39, r36
107 (p7) mov r38 = r39
C Store the top quotient limb and step rp down by one limb.
108 st8 [r32] = r15, -8
109 adds r35 = -2, r35 C un -= 2
C Continue in the shared normalized-divisor setup.
110 br .Lpn
C Unnormalized divisor.
112 .Lpunnorm:
C If vl <= high limb, undo the post-decrement so that the load at .Lpu
C fetches the high limb again.
113 (p6) add r34 = 8, r34
114 mov r38 = 0 C r = 0
C Shift the divisor left by cnt.
115 shl r36 = r36, r40
116 (p6) br.cond.dptk .Lpu
C High limb < vl: it becomes the initial remainder and the top quotient limb
C is stored as 0.
118 shl r38 = r39, r40 C r = ahigh << cnt
C p8 = n != 1, i.e. more integer limbs remain after this one.
119 cmp.ne p8, p0 = 1, r35
120 st8 [r32] = r0, -8
121 adds r35 = -1, r35 C un--
122 (p8) br.cond.dpnt .Lpu
C n was 1: no integer limbs are left. Load f6 = inverse and f12 = 1 the way
C the quotient loops expect them and go straight to the fraction limbs.
124 mov r23 = 1
126 setf.sig f6 = r8
127 setf.sig f12 = r23
128 br .L435
129 EPILOGUE()
C mpn_divrem_1 -- divide the n-limb number at up by the single limb vl,
C developing qxn additional fraction quotient limbs.
C Inputs: r32 = rp, r33 = qxn, r34 = up, r35 = n, r36 = vl.
C Quotient limbs are stored downwards starting at rp + n + qxn - 1, and the
C remainder is left in r8 before the return at .Lret.
C The divisor inverse is obtained by calling mpn_invert_limb. The labels
C .Lpn, .Lpu and .L435 are also entered from mpn_preinv_divrem_1.
C Register roles in the loops: r36 = divisor, r38 = running remainder,
C r40 = normalization shift count, f6 = inverse, f10 = divisor, f12 = 1,
C f7 = running remainder.
C NOTE(review): instruction-group stop lines are visible only in .Loop1, and
C the embedded numbering has gaps elsewhere. Presumably stop-only lines were
C dropped from this listing -- confirm against upstream before assembling.
132 PROLOGUE(mpn_divrem_1)
133 .prologue
134 .save ar.pfs, r42
C Register frame: 5 inputs, 8 locals, 1 output, no rotating registers.
C r45 is the output register used to pass the divisor to mpn_invert_limb.
135 alloc r42 = ar.pfs, 5, 8, 1, 0
136 .save ar.lc, r44
C r44 keeps the incoming ar.lc and r41 the return address b0. Both are
C restored at .Lret.
137 mov r44 = ar.lc
138 .save rp, r41
139 mov r41 = b0
140 .body
C 32-bit ABI only: widen the pointer arguments rp and up with addp4 and
C sign-extend the size arguments qxn and n.
C NOTE(review): the closing quote and parenthesis of this ifdef are not
C visible in this listing (numbering jumps from 145 to 148) -- confirm upstream.
141 ifdef(`HAVE_ABI_32',
142 ` addp4 r32 = 0, r32
143 sxt4 r33 = r33
144 addp4 r34 = 0, r34
145 sxt4 r35 = r35
C r38 = running remainder, initially 0.
148 mov r38 = r0
C r15 = n + qxn, the total number of quotient limbs.
149 add r15 = r35, r33
C Nothing to do when n + qxn is 0: return 0.
151 cmp.ne p6, p7 = 0, r15
153 (p7) mov r8 = r0
154 (p7) br.cond.dpnt .Lret
155 shladd r14 = r15, 3, r32 C r14 = rp + n + qxn
C p6 = signed 0 <= vl, i.e. the top bit of vl is clear and the divisor is
C not normalized.
156 cmp.le p6, p7 = 0, r36
158 adds r32 = -8, r14 C r32 = rp + n + qxn - 1
159 (p6) br.cond.dpnt .Lunnorm
C Normalized divisor. With n == 0 there is no high limb to handle.
160 cmp.eq p6, p7 = 0, r35
161 (p6) br.cond.dpnt .L179
C r14 = address of the most significant dividend limb.
162 shladd r14 = r35, 3, r34
164 adds r14 = -8, r14
165 adds r35 = -1, r35
167 ld8 r38 = [r14]
C The top quotient limb is 1 when the high limb is >= vl and 0 otherwise.
C In the first case the remainder is reduced by vl.
169 cmp.leu p6, p7 = r36, r38
171 (p6) addl r15 = 1, r0
172 (p7) mov r15 = r0
174 st8 [r32] = r15, -8
175 (p6) sub r38 = r38, r36
177 .L179:
C Pass vl in the output register and fetch its inverse. The result is read
C from r8 at .Lpn.
178 mov r45 = r36
179 adds r35 = -1, r35
180 br.call.sptk.many b0 = mpn_invert_limb
C r34 = up + 8*r35, the address of the next dividend limb to consume.
182 shladd r34 = r35, 3, r34
C Shared setup for the normalized-divisor loop. Also entered from
C mpn_preinv_divrem_1.
183 .Lpn:
184 mov r23 = 1
C f6 = inverse, f12 = 1.
186 setf.sig f6 = r8
187 setf.sig f12 = r23
C p7 = r35 < 0: no integer limbs remain, skip to the fraction limbs.
188 cmp.le p6, p7 = 0, r35
C Shift count 0: the divisor is already normalized.
189 mov r40 = 0
190 (p7) br.cond.dpnt .L435
191 setf.sig f10 = r36
C ar.lc = r35, so the counted loop body runs r35 + 1 times.
192 mov ar.lc = r35
193 setf.sig f7 = r38
C NOTE(review): r28 is overwritten inside .Loop1 before any read visible in
C this listing, so this value looks unused -- confirm.
195 sub r28 = -1, r36
196 C Develop quotient limbs for normalized divisor
197 .Loop1: C 00 C q=r18 nh=r38/f7
C r20 = next dividend limb. f11 = high half of remainder * inverse.
198 ld8 r20 = [r34], -8
199 xma.hu f11 = f7, f6, f0
200 ;; C 04
201 xma.l f8 = f11, f12, f7 C q = q + nh
202 ;; C 08
C r18 = quotient estimate. f9 and f8 = high and low halves of q * divisor.
203 getf.sig r18 = f8
204 xma.hu f9 = f8, f10, f0
205 xma.l f8 = f8, f10, f0
206 ;; C 12
207 getf.sig r16 = f9
208 C 13
209 getf.sig r15 = f8
210 ;; C 18
C Two-limb subtraction of q * divisor from remainder:limb.
C p6 = borrow out of the low limb, r15 = low difference,
C r16 = high difference before the borrow is applied.
211 cmp.ltu p6, p7 = r20, r15
212 sub r15 = r20, r15
213 sub r16 = r38, r16
214 ;; C 19
215 (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0?
216 (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0?
217 (p6) add r16 = -1, r16
C Default when no adjustment follows: p6 = 0 and p7 = 1, meaning no borrow.
218 (p0) cmp.ne.unc p6, p7 = r0, r0
219 ;; C 20
C First adjustment, taken while the high difference is nonzero.
220 (p8) cmp.ltu p6, p7 = r15, r36
221 (p8) sub r15 = r15, r36
222 (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
223 ;; C 21
224 .pred.rel "mutex",p6,p7
225 (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0 still?
226 (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0 still?
227 cmp.ltu p6, p7 = r15, r36 C speculative
228 sub r28 = r15, r36 C speculative, just for cmp
229 ;; C 22
230 (p8) cmp.ltu p6, p7 = r28, r36 C redo last cmp if needed
231 (p8) mov r15 = r28
232 (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
233 ;; C 23
C p6 = remainder already below the divisor. p7 = one more subtraction needed.
234 (p6) setf.sig f7 = r15
235 (p7) sub r15 = r15, r36
236 (p7) add r18 = 1, r18 C q = q + 1; done if: rH > 0
237 ;; C 24
238 (p7) setf.sig f7 = r15
C Store the quotient limb and carry the remainder into the next iteration.
239 st8 [r32] = r18, -8
240 mov r38 = r15
241 br.cloop.dptk .Loop1
242 C 29/30
243 br.sptk .L435
C Unnormalized divisor: handle the high limb, then compute the shift count.
245 .Lunnorm:
C r16 = vl with its bytes reversed, used at .L322 to locate the top set bit.
246 mux1 r16 = r36, @rev
247 cmp.eq p6, p7 = 0, r35
248 (p6) br.cond.dpnt .L322
C r34 = address of the most significant dividend limb, r39 = that limb.
249 shladd r34 = r35, 3, r34
251 adds r34 = -8, r34
253 ld8 r39 = [r34]
C If vl <= high limb, keep r34 on the high limb and the remainder at 0.
255 cmp.leu p6, p7 = r36, r39
256 (p6) br.cond.dptk .L322
C High limb < vl: it becomes the remainder and the top quotient limb is 0.
257 adds r34 = -8, r34
259 mov r38 = r39
C p7 = n + qxn == 1: that was the only quotient limb, so return the high
C limb as the remainder.
261 cmp.ne p6, p7 = 1, r15
262 st8 [r32] = r0, -8
264 (p7) mov r8 = r38
265 (p7) br.cond.dpnt .Lret
266 adds r35 = -1, r35
C Derive the normalization shift r40 from vl. The sequence narrows down the
C position of the most significant set bit, first per byte with czx1.l and
C then in steps of 4, 2 and 1 bits.
C NOTE(review): exact czx1.l indexing was not verified here. Presumably r40
C ends up as the number of leading zero bits of vl -- confirm.
267 .L322:
268 sub r14 = r0, r16
270 or r14 = r16, r14
272 mov r16 = -8
273 czx1.l r14 = r14
275 shladd r16 = r14, 3, r16
277 shr.u r14 = r36, r16
279 cmp.geu p6, p7 = 15, r14
281 (p7) shr.u r14 = r14, 4
282 (p7) adds r16 = 4, r16
284 cmp.geu p6, p7 = 3, r14
286 (p7) shr.u r14 = r14, 2
287 (p7) adds r16 = 2, r16
289 tbit.nz p6, p7 = r14, 1
291 .pred.rel "mutex",p6,p7
292 (p6) sub r40 = 62, r16
293 (p7) sub r40 = 63, r16
C Shift divisor and remainder left by r40. The shifted divisor is also
C passed in r45 to mpn_invert_limb, whose result is read from r8 at .Lpu.
295 shl r45 = r36, r40
296 shl r36 = r36, r40
297 shl r38 = r38, r40
298 br.call.sptk.many b0 = mpn_invert_limb
C Shared setup for the unnormalized-divisor loop. Also entered from
C mpn_preinv_divrem_1.
300 .Lpu:
301 mov r23 = 1
C f6 = inverse, f12 = 1.
303 setf.sig f6 = r8
304 setf.sig f12 = r23
C p6 = n == 0: no integer limbs remain, skip to the fraction limbs.
305 cmp.eq p6, p7 = 0, r35
306 (p6) br.cond.dpnt .L435
C r16 = 64 - shift count, the right shift that extracts the bits of a limb
C which spill into the limb above it.
307 sub r16 = 64, r40
308 adds r35 = -2, r35
C r39 = next dividend limb. r34 is post-decremented.
310 ld8 r39 = [r34], -8
C p7 = r35 < 0: this was the last integer limb, handled at .Lend3.
311 cmp.le p6, p7 = 0, r35
C Merge the top bits of that limb into the remainder.
313 shr.u r14 = r39, r16
315 or r38 = r14, r38
316 (p7) br.cond.dpnt .Lend3
318 mov r22 = r16
319 setf.sig f10 = r36
320 setf.sig f7 = r38
C ar.lc = r35, so the counted loop body runs r35 + 1 times.
321 mov ar.lc = r35
323 C Develop quotient limbs for unnormalized divisor
324 .Loop3:
C r14 = following dividend limb. f11 = high half of remainder * inverse.
325 ld8 r14 = [r34], -8
326 xma.hu f11 = f7, f6, f0
328 xma.l f8 = f11, f12, f7 C q = q + nh
330 getf.sig r18 = f8
331 xma.hu f9 = f8, f10, f0
C r20 = shifted dividend limb: low bits of r39 moved up by the shift count,
C combined with the top bits of the following limb r14.
332 shl r20 = r39, r40
333 xma.l f8 = f8, f10, f0
334 shr.u r24 = r14, r22
336 getf.sig r16 = f9
337 getf.sig r15 = f8
338 or r20 = r24, r20
C Same two-limb subtraction and quotient adjustment as in .Loop1.
340 cmp.ltu p6, p7 = r20, r15
341 sub r15 = r20, r15
342 sub r16 = r38, r16
344 (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0?
345 (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0?
346 (p6) add r16 = -1, r16
347 (p0) cmp.ne.unc p6, p7 = r0, r0
349 (p8) cmp.ltu p6, p7 = r15, r36
350 (p8) sub r15 = r15, r36
351 (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
353 .pred.rel "mutex",p6,p7
354 (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0 still?
355 (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0 still?
356 cmp.ltu p6, p7 = r15, r36 C speculative
357 sub r28 = r15, r36 C speculative, just for cmp
359 (p8) cmp.ltu p6, p7 = r28, r36 C redo last cmp if needed
360 (p8) mov r15 = r28
361 (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
363 (p6) setf.sig f7 = r15
364 (p7) sub r15 = r15, r36
365 (p7) add r18 = 1, r18 C q = q + 1; done if: rH > 0
367 (p7) setf.sig f7 = r15
C Store the quotient limb. The limb just loaded becomes the current limb and
C the remainder is carried into the next iteration.
368 st8 [r32] = r18, -8
369 mov r39 = r14
370 mov r38 = r15
371 br.cloop.dptk .Loop3
C Last integer quotient limb for the unnormalized case. Same step as .Loop3,
C but there is no lower limb to shift in, so the dividend limb is r39 shifted
C left by the shift count.
373 .Lend3:
374 setf.sig f10 = r36
375 setf.sig f7 = r38
377 xma.hu f11 = f7, f6, f0
379 xma.l f8 = f11, f12, f7 C q = q + nh
381 getf.sig r18 = f8
382 xma.hu f9 = f8, f10, f0
383 shl r20 = r39, r40
384 xma.l f8 = f8, f10, f0
386 getf.sig r16 = f9
387 getf.sig r15 = f8
389 cmp.ltu p6, p7 = r20, r15
390 sub r15 = r20, r15
391 sub r16 = r38, r16
393 (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0?
394 (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0?
395 (p6) add r16 = -1, r16
396 (p0) cmp.ne.unc p6, p7 = r0, r0
398 (p8) cmp.ltu p6, p7 = r15, r36
399 (p8) sub r15 = r15, r36
400 (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
402 .pred.rel "mutex",p6,p7
403 (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0 still?
404 (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0 still?
406 (p8) sub r15 = r15, r36
407 (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
C Final check: subtract once more while the remainder is not below the divisor.
409 cmp.ltu p6, p7 = r15, r36
411 (p7) sub r15 = r15, r36
412 (p7) add r18 = 1, r18 C q = q + 1; done if: rH > 0
414 st8 [r32] = r18, -8
415 mov r38 = r15
C Fraction part: develop qxn further quotient limbs using zero dividend
C limbs. Reached from every integer path and from mpn_preinv_divrem_1.
416 .L435:
C r35 = qxn - 1 for ar.lc. p7 = qxn < 1: no fraction limbs requested.
417 adds r35 = -1, r33
418 cmp.le p6, p7 = 1, r33
419 (p7) br.cond.dpnt .Lend4
421 setf.sig f7 = r38
422 setf.sig f10 = r36
423 mov ar.lc = r35
425 .Loop4:
426 xma.hu f11 = f7, f6, f0
428 xma.l f8 = f11, f12, f7 C q = q + nh
430 getf.sig r18 = f8
431 xma.hu f9 = f8, f10, f0
432 xma.l f8 = f8, f10, f0
434 getf.sig r16 = f9
435 getf.sig r15 = f8
C As in .Loop1 but the dividend limb is 0: p6 = borrow, which occurs whenever
C the low product is nonzero, and r15 = 0 - low product.
437 cmp.ltu p6, p7 = 0, r15
438 sub r15 = 0, r15
439 sub r16 = r38, r16
441 (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0?
442 (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0?
443 (p6) add r16 = -1, r16
444 (p0) cmp.ne.unc p6, p7 = r0, r0
446 (p8) cmp.ltu p6, p7 = r15, r36
447 (p8) sub r15 = r15, r36
448 (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
450 .pred.rel "mutex",p6,p7
451 (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0 still?
452 (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0 still?
453 cmp.ltu p6, p7 = r15, r36 C speculative
454 sub r28 = r15, r36 C speculative, just for cmp
456 (p8) cmp.ltu p6, p7 = r28, r36 C redo last cmp if needed
457 (p8) mov r15 = r28
458 (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0
460 (p6) setf.sig f7 = r15
461 (p7) sub r15 = r15, r36
462 (p7) add r18 = 1, r18 C q = q + 1; done if: rH > 0
464 (p7) setf.sig f7 = r15
465 st8 [r32] = r18, -8
466 mov r38 = r15
467 br.cloop.dptk .Loop4
C Result: the remainder shifted back right by the normalization count r40,
C which is 0 on the normalized path.
469 .Lend4:
470 shr.u r8 = r38, r40
C Common exit: restore ar.pfs, ar.lc and b0 from r42, r44 and r41, then return.
471 .Lret:
472 mov ar.pfs = r42
473 mov ar.lc = r44
474 mov b0 = r41
475 br.ret.sptk.many b0
476 EPILOGUE()
477 ASM_END()