dnl  IA-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
dnl  Copyright 2010, 2013 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
C  Pull in the build-time m4 configuration (ABI, PROLOGUE/EPILOGUE macros,
C  L() local-label helper, etc.) generated by GMP's configure.
include(`../config.m4')

C  TODO items left by the original author:
C  * Inline and interleave limb inversion code with loop setup code.
C  * We should use explicit bundling in much of the code, since it typically
C    cuts some cycles with the GNU assembler.

C  HP's assembler requires these declarations for importing mpn_invert_limb
C  (the divrem code below calls it to compute the pseudo-inverse of the
C  high divisor limb).
	.global	mpn_invert_limb
	.type	mpn_invert_limb,@function
60 PROLOGUE
(mpn_divrem_2
)
63 ` addp4 r32 = 0, r32 C M I
64 addp4 r34 = 0, r34 C M I
66 addp4 r36 = 0, r36 C M I
72 alloc r42
= ar.pfs
, 5, 9, 1, 0
73 shladd r34
= r35
, 3, r34
92 cmp.gtu p6
, p7
= r39
, r38
95 cmp.leu p8
, p9
= r36
, r37
96 cmp.geu p6
, p7
= r39
, r38
98 (p8
) cmp4.ne.
and.orcm p6
, p7
= 0, r0
99 (p7
) br.cond.dptk .L51
101 add r14
= r33
, r35
// un
+ fn
102 mov r46
= r39
// argument to mpn_invert_limb
106 cmp.
gt p12
, p0
= r0
, r35
107 (p12
) br.cond.dpnt L
(end)
108 br.
call.sptk.many b0
= mpn_invert_limb
110 setf.sig f11
= r8
// di (non
-final
)
111 setf.sig f34
= r39
// d1
112 setf.sig f33
= r36
// d0
116 setf.sig f9
= r38
// n2
117 xma.l f6
= f11
, f34
, f0
// t0
= LO
(di * d1
)
119 setf.sig f10
= r37
// n1
120 setf.sig f15
= r17
// 1
121 xma.hu f8
= f11
, f33
, f0
// s0
= HI
(di * d0
)
127 sub r18
= r0
, r39
// -d1
130 setf.sig f14
= r18
// -d1
131 cmp.leu p8
, p9
= r17
, r14
134 (p9
) adds r19
= 0, r0
135 (p8
) adds r19
= -1, r0
136 cmp.gtu p6
, p7
= r14
, r16
138 (p6
) adds r19
= 1, r19
141 cmp.
gt p7
, p6
= r0
, r19
143 (p6
) adds r8
= -1, r8
// di--
144 (p6
) sub r14
= r16
, r39
// t0
-= d1
145 (p6
) cmp.ltu p6
, p7
= r16
, r39
// cy
for: t0
- d1
147 (p6
) cmp.
gt p9
, p8
= 1, r19
148 (p7
) cmp.
gt p9
, p8
= 0, r19
149 (p6
) adds r19
= -1, r19
// t1
-= cy
152 (p8
) adds r8
= -1, r8
// di--
153 (p8
) sub r14
= r16
, r39
// t0
-= d1
154 (p8
) cmp.ltu p8
, p9
= r16
, r39
// cy
for: t0
- d1
156 (p8
) cmp.
gt p7
, p6
= 1, r19
157 (p9
) cmp.
gt p7
, p6
= 0, r19
158 (p8
) adds r19
= -1, r19
// t1
-= cy
161 (p6
) adds r8
= -1, r8
// di--
162 (p6
) sub r14
= r16
, r39
// t0
-= d1
163 (p6
) cmp.ltu p6
, p7
= r16
, r39
// cy
for: t0
- d1
165 (p6
) cmp.
gt p9
, p8
= 1, r19
166 (p7
) cmp.
gt p9
, p8
= 0, r19
167 (p6
) adds r19
= -1, r19
// t1
-= cy
170 (p8
) adds r8
= -1, r8
// di--
171 (p8
) sub r14
= r16
, r39
// t0
-= d1
172 (p8
) cmp.ltu p8
, p9
= r16
, r39
// cy
for: t0
- d1
174 (p8
) adds r19
= -1, r19
// t1
-= cy
177 cmp.gt p8, p9 = r0, r19
178 (p8) br.cond.dpnt .L46
180 cmp.leu p6, p7 = r39, r16
184 (p7) adds r19 = -1, r19
187 (p7) cmp.gt p8, p9 = r0, r19
188 (p9) br.cond.dptk .L52
191 setf.sig f32
= r8
// di
192 shladd r32
= r35
, 3, r32
198 cmp.
gt p8
, p9
= r33
, r35
201 (p9
) ld8 r37
= [r34
], -8
202 xma.hu f8
= f9
, f32
, f10
// 0,29
203 xma.l f12
= f9
, f32
, f10
// 0
205 getf.sig r20
= f12
// q0
4
206 xma.l f13
= f15
, f8
, f9
// q
+= n2
4
207 sub r8
= -1, r36
// bitnot d0
209 getf.sig r18
= f13
// 8
210 xma.l f7
= f14
, f13
, f10
// 8
211 xma.l f6
= f33
, f13
, f33
// t0
= LO
(d0
*q
+d0
) 8
212 xma.hu f9
= f33
, f13
, f33
// t1
= HI
(d0
*q
+d0
) 9
214 getf.sig r38
= f7
// n1
12
215 getf.sig r16
= f6
// 13
216 getf.sig r19
= f9
// 14
218 sub r38
= r38
, r39
// n1
-= d1
17
220 cmp.ne p9
, p0
= r0
, r0
// clear p9
221 cmp.leu p10
, p11
= r16
, r37
// cy
for: n0
- t0
18
223 sub r37
= r37
, r16
// n0
-= t0
19
224 (p11
) sub r38
= r38
, r19
, 1 // n1
-= t1
- cy
19
225 (p10
) sub r38
= r38
, r19
// n1
-= t1
19
227 cmp.gtu p6
, p7
= r20
, r38
// n1
>= q0
20
229 (p7
) cmp.ltu p9
, p0
= r8
, r37
// 21
230 (p6
) add r18
= 1, r18
//
231 (p7
) add r37
= r37
, r36
// 21
232 (p7
) add r38
= r38
, r39
// 21
234 setf.sig f10
= r37
// n1
22
235 (p9
) add r38
= 1, r38
// 22
237 setf.sig f9
= r38
// n2
23
238 cmp.gtu p6
, p7
= r39
, r38
// 23
239 (p7
) br.cond.spnt L
(fix
)
240 L
(bck
): st8
[r32
] = r18
, -8
242 br.cloop.sptk.few L
(top
)
245 L
(end): add r14
= 8, r34
257 .pred.rel
"mutex", p8
, p9
259 (p9
) sub r38
= r38
, r39
, 1
260 (p8
) sub r38
= r38
, r39
265 L
(fix
): cmp.geu p6
, p7
= r39
, r38
266 cmp.leu p8
, p9
= r36
, r37
268 (p8
) cmp4.ne.
and.orcm p6
, p7
= 0, r0
269 (p6
) br.cond.dptk L
(bck
)
271 (p9
) sub r38
= r38
, r39
, 1
272 (p8
) sub r38
= r38
, r39
275 setf.sig f9
= r38
// n2
276 setf.sig f10
= r37
// n1