dnl  IA-64 mpn_divrem_1 and mpn_preinv_divrem_1 -- Divide an mpn number by an
dnl  unnormalized limb.
dnl  Contributed to the GNU project by Torbjorn Granlund.
dnl  Copyright 2002, 2004, 2005 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
34 include(`..
/config.m4
')
C This was generated by gcc, then the loops were optimized.  The preinv entry
C point was shoehorned into the file.  Lots of things outside the loops could
C be streamlined.  It would probably be a good idea to merge the loops for
C normalized and unnormalized divisor, since the shifting stuff is done for
C free in parallel with other operations.  It would even be possible to merge
C all loops, if the ld8 were made conditional.
C  * Consider delaying inversion for normalized mpn_divrem_1 entry till after
C    computing leading limb.
C  * Inline and interleave limb inversion code with loop setup code.
C HP's assembler requires these declarations for importing mpn_invert_limb
56 .
global mpn_invert_limb
57 .
type mpn_invert_limb
,@function
C vlinv = r37  (preinv only)
C cnt   = r38  (preinv only)
C Entry point mpn_preinv_divrem_1.
C NOTE(review): only part of this routine's instruction stream is visible in
C this listing (stop bits, labels and several register moves are absent), so
C the notes below describe each visible instruction on its own terms; register
C roles beyond what the existing comments state are assumptions to confirm.
68 PROLOGUE
(mpn_preinv_divrem_1
)
C Allocate the register stack frame (7 inputs, 8 locals, 1 output, 0 rotating)
C and keep the caller's ar.pfs in r42.
71 alloc r42
= ar.pfs
, 7, 8, 1, 0
C r34 = r34 + 8*r35.  r35 is the count called "un" in the comments below;
C NOTE(review): r34 is presumably the source limb pointer -- confirm.
85 shladd r34
= r35
, 3, r34
C r32 = r32 + 8*r15.  r15 is written by an instruction not shown here.
95 shladd r32
= r15
, 3, r32 C r32
= rp
+ n
+ qxn
C p8 = (0 <= r36) as a signed compare, i.e. the top bit of r36 is clear.
C That case is sent to .Lpunnorm by the predicated branch further down.
96 cmp.
le p8
, p0
= 0, r36
C Step r32 back by 8 bytes.
98 adds r32
= -8, r32 C r32
= rp
+ n
+ qxn
- 1
C p6 = (r36 <= r39) as an unsigned compare; p7 is its complement.
99 cmp.leu p6
, p7
= r36
, r39
C Top bit of r36 clear: leave for the .Lpunnorm path.
100 (p8
) br.cond.dpnt .Lpunnorm
C Only when r36 <= r39 (p6): r15 = 1.
103 (p6
) addl r15
= 1, r0
C Only when r36 <= r39 (p6): r38 = r39 - r36.
106 (p6
) sub r38
= r39
, r36
109 adds r35
= -2, r35 C un
-= 2
C Only when r36 <= r39 (p6): advance r34 by 8 bytes.
113 (p6
) add r34
= 8, r34
C Only when r36 <= r39 (p6): continue at .Lpu.
116 (p6
) br.cond.dptk .Lpu
C Fall-through path (p6 false): r38 = r39 << r40.
118 shl r38
= r39
, r40 C r
= ahigh
<< cnt
C p8 = (r35 != 1).
119 cmp.ne p8
, p0
= 1, r35
121 adds r35
= -1, r35 C un
--
C r35 was not 1 (p8): continue at .Lpu.
122 (p8
) br.cond.dpnt .Lpu
132 PROLOGUE
(mpn_divrem_1
)
135 alloc r42
= ar.pfs
, 5, 8, 1, 0
151 cmp.ne p6
, p7
= 0, r15
154 (p7
) br.cond.dpnt .Lret
155 shladd r14
= r15
, 3, r32 C r14
= rp
+ n
+ qxn
156 cmp.
le p6
, p7
= 0, r36
158 adds r32
= -8, r14 C r32
= rp
+ n
+ qxn
- 1
159 (p6
) br.cond.dpnt .Lunnorm
160 cmp.
eq p6
, p7
= 0, r35
161 (p6
) br.cond.dpnt .L179
162 shladd r14
= r35
, 3, r34
169 cmp.leu p6
, p7
= r36
, r38
171 (p6
) addl r15
= 1, r0
175 (p6
) sub r38
= r38
, r36
180 br.
call.sptk.many b0
= mpn_invert_limb
182 shladd r34
= r35
, 3, r34
188 cmp.
le p6
, p7
= 0, r35
190 (p7
) br.cond.dpnt .L435
196 C Develop quotient limbs for normalized divisor
197 .
Loop1: C
00 C q
=r18 nh
=r38
/f7
199 xma.hu f11
= f7
, f6
, f0
201 xma.l f8
= f11
, f12
, f7 C q
= q
+ nh
204 xma.hu f9
= f8
, f10
, f0
205 xma.l f8
= f8
, f10
, f0
211 cmp.ltu p6
, p7
= r20
, r15
215 (p6
) cmp.ne p8
, p9
= 1, r16 C is rH
!= 0?
216 (p7
) cmp.ne p8
, p9
= 0, r16 C is rH
!= 0?
217 (p6
) add r16
= -1, r16
218 (p0
) cmp.ne.unc p6
, p7
= r0
, r0
220 (p8
) cmp.ltu p6
, p7
= r15
, r36
221 (p8
) sub r15
= r15
, r36
222 (p8
) add r18
= 1, r18 C q
= q
+ 1; done if: rH > 0
224 .pred.rel
"mutex",p6
,p7
225 (p6
) cmp.ne p8
, p9
= 1, r16 C is rH
!= 0 still
?
226 (p7
) cmp.ne p8
, p9
= 0, r16 C is rH
!= 0 still
?
227 cmp.ltu p6
, p7
= r15
, r36 C speculative
228 sub r28
= r15
, r36 C speculative
, just for
cmp
230 (p8
) cmp.ltu p6
, p7
= r28
, r36 C redo last
cmp if needed
232 (p8
) add r18
= 1, r18 C q
= q
+ 1; done if: rH > 0
234 (p6
) setf.sig f7
= r15
235 (p7
) sub r15
= r15
, r36
236 (p7
) add r18
= 1, r18 C q
= q
+ 1; done if: rH > 0
238 (p7
) setf.sig f7
= r15
247 cmp.
eq p6
, p7
= 0, r35
248 (p6
) br.cond.dpnt .L322
249 shladd r34
= r35
, 3, r34
255 cmp.leu p6
, p7
= r36
, r39
256 (p6
) br.cond.dptk .L322
261 cmp.ne p6
, p7
= 1, r15
265 (p7
) br.cond.dpnt .Lret
275 shladd r16
= r14
, 3, r16
279 cmp.geu p6
, p7
= 15, r14
281 (p7
) shr.u r14
= r14
, 4
282 (p7
) adds r16
= 4, r16
284 cmp.geu p6
, p7
= 3, r14
286 (p7
) shr.u r14
= r14
, 2
287 (p7
) adds r16
= 2, r16
289 tbit.nz p6
, p7
= r14
, 1
291 .pred.rel
"mutex",p6
,p7
292 (p6
) sub r40
= 62, r16
293 (p7
) sub r40
= 63, r16
298 br.
call.sptk.many b0
= mpn_invert_limb
305 cmp.
eq p6
, p7
= 0, r35
306 (p6
) br.cond.dpnt .L435
311 cmp.
le p6
, p7
= 0, r35
316 (p7
) br.cond.dpnt .Lend3
323 C Develop quotient limbs for unnormalized divisor
326 xma.hu f11
= f7
, f6
, f0
328 xma.l f8
= f11
, f12
, f7 C q
= q
+ nh
331 xma.hu f9
= f8
, f10
, f0
333 xma.l f8
= f8
, f10
, f0
340 cmp.ltu p6
, p7
= r20
, r15
344 (p6
) cmp.ne p8
, p9
= 1, r16 C is rH
!= 0?
345 (p7
) cmp.ne p8
, p9
= 0, r16 C is rH
!= 0?
346 (p6
) add r16
= -1, r16
347 (p0
) cmp.ne.unc p6
, p7
= r0
, r0
349 (p8
) cmp.ltu p6
, p7
= r15
, r36
350 (p8
) sub r15
= r15
, r36
351 (p8
) add r18
= 1, r18 C q
= q
+ 1; done if: rH > 0
353 .pred.rel
"mutex",p6
,p7
354 (p6
) cmp.ne p8
, p9
= 1, r16 C is rH
!= 0 still
?
355 (p7
) cmp.ne p8
, p9
= 0, r16 C is rH
!= 0 still
?
356 cmp.ltu p6
, p7
= r15
, r36 C speculative
357 sub r28
= r15
, r36 C speculative
, just for
cmp
359 (p8
) cmp.ltu p6
, p7
= r28
, r36 C redo last
cmp if needed
361 (p8
) add r18
= 1, r18 C q
= q
+ 1; done if: rH > 0
363 (p6
) setf.sig f7
= r15
364 (p7
) sub r15
= r15
, r36
365 (p7
) add r18
= 1, r18 C q
= q
+ 1; done if: rH > 0
367 (p7
) setf.sig f7
= r15
377 xma.hu f11
= f7
, f6
, f0
379 xma.l f8
= f11
, f12
, f7 C q
= q
+ nh
382 xma.hu f9
= f8
, f10
, f0
384 xma.l f8
= f8
, f10
, f0
389 cmp.ltu p6
, p7
= r20
, r15
393 (p6
) cmp.ne p8
, p9
= 1, r16 C is rH
!= 0?
394 (p7
) cmp.ne p8
, p9
= 0, r16 C is rH
!= 0?
395 (p6
) add r16
= -1, r16
396 (p0
) cmp.ne.unc p6
, p7
= r0
, r0
398 (p8
) cmp.ltu p6
, p7
= r15
, r36
399 (p8
) sub r15
= r15
, r36
400 (p8
) add r18
= 1, r18 C q
= q
+ 1; done if: rH > 0
402 .pred.rel
"mutex",p6
,p7
403 (p6
) cmp.ne p8
, p9
= 1, r16 C is rH
!= 0 still
?
404 (p7
) cmp.ne p8
, p9
= 0, r16 C is rH
!= 0 still
?
406 (p8
) sub r15
= r15
, r36
407 (p8
) add r18
= 1, r18 C q
= q
+ 1; done if: rH > 0
409 cmp.ltu p6
, p7
= r15
, r36
411 (p7
) sub r15
= r15
, r36
412 (p7
) add r18
= 1, r18 C q
= q
+ 1; done if: rH > 0
418 cmp.
le p6
, p7
= 1, r33
419 (p7
) br.cond.dpnt .Lend4
426 xma.hu f11
= f7
, f6
, f0
428 xma.l f8
= f11
, f12
, f7 C q
= q
+ nh
431 xma.hu f9
= f8
, f10
, f0
432 xma.l f8
= f8
, f10
, f0
437 cmp.ltu p6
, p7
= 0, r15
441 (p6
) cmp.ne p8
, p9
= 1, r16 C is rH
!= 0?
442 (p7
) cmp.ne p8
, p9
= 0, r16 C is rH
!= 0?
443 (p6
) add r16
= -1, r16
444 (p0
) cmp.ne.unc p6
, p7
= r0
, r0
446 (p8
) cmp.ltu p6
, p7
= r15
, r36
447 (p8
) sub r15
= r15
, r36
448 (p8
) add r18
= 1, r18 C q
= q
+ 1; done if: rH > 0
450 .pred.rel
"mutex",p6
,p7
451 (p6
) cmp.ne p8
, p9
= 1, r16 C is rH
!= 0 still
?
452 (p7
) cmp.ne p8
, p9
= 0, r16 C is rH
!= 0 still
?
453 cmp.ltu p6
, p7
= r15
, r36 C speculative
454 sub r28
= r15
, r36 C speculative
, just for
cmp
456 (p8
) cmp.ltu p6
, p7
= r28
, r36 C redo last
cmp if needed
458 (p8
) add r18
= 1, r18 C q
= q
+ 1; done if: rH > 0
460 (p6
) setf.sig f7
= r15
461 (p7
) sub r15
= r15
, r36
462 (p7
) add r18
= 1, r18 C q
= q
+ 1; done if: rH > 0
464 (p7
) setf.sig f7
= r15