beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / ia64 / divrem_2.asm
blob9864311278c9782ac96adb235555ca4c902f7239
1 dnl IA-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
3 dnl Copyright 2010, 2013 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
33 C norm frac
34 C itanium 1
35 C itanium 2 29 29
38 C TODO
39 C * Inline and interleave limb inversion code with loop setup code.
40 C * We should use explicit bundling in much of the code, since it typically
41 C cuts some cycles with the GNU assembler.
// File preamble: assembler start macro, the import declarations for the
// external routine mpn_invert_limb (called once by the function below),
// the argument register list, and one m4 register alias.
44 ASM_START()
46 C HP's assembler requires these declarations for importing mpn_invert_limb
47 .global mpn_invert_limb
48 .type mpn_invert_limb,@function
50 C INPUT PARAMETERS
51 C qp = r32
52 C fn = r33
53 C np = r34
54 C nn = r35
55 C dp = r36
// qp, np and dp are used as addresses by the function below (stores go
// through r32, loads through r34 and r36); fn and nn are added together
// to form its loop count.
// The next line makes f0x1 an m4 alias for f15. The code in view never
// uses the alias; it writes f15 directly and loads it with the constant 1.
57 define(`f0x1', `f15')
// mpn_divrem_2: divide the number at np (nn limbs, extended by fn zero
// fraction limbs) by the normalized 2-limb divisor at dp.
// Register arguments: r32 = qp, r33 = fn, r34 = np, r35 = nn, r36 = dp.
// Quotient limbs are stored through r32 from the most significant end
// downwards, the two remainder limbs are stored back through r34, and
// r8 returns r40, which is 0 unless the initial subtraction at .L51 ran.
//
// Working registers after setup:
//   r39 = d1, r36 = d0 (divisor limbs; r36 is reused once dp is loaded)
//   r38 = upper remainder limb, r37 = lower remainder limb
//   r35 = loop counter, starts at nn + fn - 3 and counts down
//   f32 = di (adjusted inverse), f33 = d0, f34 = d1, f14 = -d1, f15 = 1
// Saved state: r41 = b0, r42 = ar.pfs, r43 = r1, r45 = ar.lc.
//
// NOTE(review): this copy of the file shows no instruction-group stops and
// its embedded line numbers skip values, so some lines may be missing.
// Compare against the upstream file before assembling.
59 ASM_START()
60 PROLOGUE(mpn_divrem_2)
61 .prologue
// 32-bit pointer build only: widen the three pointer arguments with addp4
// and zero-extend the two counts with zxt4.
62 ifdef(`HAVE_ABI_32',
63 ` addp4 r32 = 0, r32 C M I
64 addp4 r34 = 0, r34 C M I
65 zxt4 r35 = r35 C I
66 addp4 r36 = 0, r36 C M I
67 nop.m 0
68 zxt4 r33 = r33 C I
// NOTE(review): the terminator of the conditional block above is not
// visible in this copy - confirm against upstream.
// Register frame: 5 inputs (r32-r36), 9 locals (r37-r45), 1 output (r46).
71 .save ar.pfs, r42
72 alloc r42 = ar.pfs, 5, 9, 1, 0
// r34 = np + 8*nn, one limb past the top of the dividend.
73 shladd r34 = r35, 3, r34
// r14 = address of dp[1].
74 adds r14 = 8, r36
// Keep r1 in r43; it is restored after the call to mpn_invert_limb.
75 mov r43 = r1
// r15 = address of np[nn-1]; r39 = d1 = dp[1].
77 adds r15 = -8, r34
78 ld8 r39 = [r14]
// ar.lc is saved because the quotient loop uses it as its counter.
79 .save ar.lc, r45
80 mov r45 = ar.lc
// r14 = address of np[nn-2]; r40 = 0 is the default return value;
// r34 = address of np[nn-3], the next dividend limb the loop will fetch.
81 adds r14 = -16, r34
82 mov r40 = r0
83 adds r34 = -24, r34
// r38 = np[nn-1], the top dividend limb.
85 ld8 r38 = [r15]
86 .save rp, r41
87 mov r41 = b0
88 .body
// r36 = d0 = dp[0] (the pointer is no longer needed); r37 = np[nn-2].
89 ld8 r36 = [r36]
90 ld8 r37 = [r14]
// Decide whether the top two dividend limbs r38:r37 are >= d1:d0.
// d1 > r38: they are smaller, skip the initial subtraction.
92 cmp.gtu p6, p7 = r39, r38
93 (p6) br.cond.dptk .L8
// p8 = d0 <= r37. p7 starts out as d1 < r38. When p8 holds, the parallel
// compare (0 != 0 is false) clears p6 and sets p7, so .L51 is taken when
// d1 < r38, or when d1 == r38 and d0 <= r37.
95 cmp.leu p8, p9 = r36, r37
96 cmp.geu p6, p7 = r39, r38
98 (p8) cmp4.ne.and.orcm p6, p7 = 0, r0
99 (p7) br.cond.dptk .L51
100 .L8:
101 add r14 = r33, r35 // un + fn
102 mov r46 = r39 // argument to mpn_invert_limb
// r35 = nn + fn - 3 becomes the loop counter. A negative value means there
// are no quotient limbs to produce, so control goes straight to the end
// label without calling mpn_invert_limb.
104 adds r35 = -3, r14
106 cmp.gt p12, p0 = r0, r35
107 (p12) br.cond.dpnt L(end)
108 br.call.sptk.many b0 = mpn_invert_limb
// r8 now holds the value returned by mpn_invert_limb for d1. The code up
// to .L46 lowers it step by step; presumably this turns the inverse of d1
// into an inverse for the limb pair d1:d0 - TODO confirm against the
// algorithm description.
110 setf.sig f11 = r8 // di (non-final)
111 setf.sig f34 = r39 // d1
112 setf.sig f33 = r36 // d0
113 mov r1 = r43
115 mov r17 = 1
116 setf.sig f9 = r38 // n2
117 xma.l f6 = f11, f34, f0 // t0 = LO(di * d1)
119 setf.sig f10 = r37 // n1
120 setf.sig f15 = r17 // 1
121 xma.hu f8 = f11, f33, f0 // s0 = HI(di * d0)
123 getf.sig r17 = f6
124 getf.sig r16 = f8
125 mov ar.lc = r35
// r14 = LO(di*d1) + d0; p9 marks a carry out of that addition.
// r16 = r14 + HI(di*d0); p6 below marks a carry out of this second one.
// r19 (t1) = -1 plus the two carries; r16 (t0) is the low word.
127 sub r18 = r0, r39 // -d1
128 add r14 = r17, r36
130 setf.sig f14 = r18 // -d1
131 cmp.leu p8, p9 = r17, r14
132 add r16 = r14, r16
134 (p9) adds r19 = 0, r0
135 (p8) adds r19 = -1, r0
136 cmp.gtu p6, p7 = r14, r16
138 (p6) adds r19 = 1, r19
// The m4 conditional below compares 1 with 1, so its body is always kept.
140 ifelse(1,1,`
// Four unrolled correction steps. A step that runs decrements di,
// subtracts d1 from t0 and takes the borrow out of t1. The predicate pair
// alternates between p6 and p8 from step to step, and each step sets up
// the next one so that it runs only if t1 is still non-negative.
// Step 1, enabled by p6 = t1 >= 0.
141 cmp.gt p7, p6 = r0, r19
143 (p6) adds r8 = -1, r8 // di--
144 (p6) sub r14 = r16, r39 // t0 -= d1
145 (p6) cmp.ltu p6, p7 = r16, r39 // cy for: t0 - d1
147 (p6) cmp.gt p9, p8 = 1, r19
148 (p7) cmp.gt p9, p8 = 0, r19
149 (p6) adds r19 = -1, r19 // t1 -= cy
150 mov r16 = r14
// Step 2, enabled by p8.
152 (p8) adds r8 = -1, r8 // di--
153 (p8) sub r14 = r16, r39 // t0 -= d1
154 (p8) cmp.ltu p8, p9 = r16, r39 // cy for: t0 - d1
156 (p8) cmp.gt p7, p6 = 1, r19
157 (p9) cmp.gt p7, p6 = 0, r19
158 (p8) adds r19 = -1, r19 // t1 -= cy
159 mov r16 = r14
// Step 3, enabled by p6.
161 (p6) adds r8 = -1, r8 // di--
162 (p6) sub r14 = r16, r39 // t0 -= d1
163 (p6) cmp.ltu p6, p7 = r16, r39 // cy for: t0 - d1
165 (p6) cmp.gt p9, p8 = 1, r19
166 (p7) cmp.gt p9, p8 = 0, r19
167 (p6) adds r19 = -1, r19 // t1 -= cy
168 mov r16 = r14
// Step 4, enabled by p8; it is followed by a plain sign test of t1.
170 (p8) adds r8 = -1, r8 // di--
171 (p8) sub r14 = r16, r39 // t0 -= d1
172 (p8) cmp.ltu p8, p9 = r16, r39 // cy for: t0 - d1
174 (p8) adds r19 = -1, r19 // t1 -= cy
175 mov r16 = r14
// t1 < 0: the adjustment is finished. Otherwise fall into the loop at
// .L52, which repeats the same step until t1 turns negative.
177 cmp.gt p8, p9 = r0, r19
178 (p8) br.cond.dpnt .L46
179 .L52:
// p7 = d1 > t0, that is, the subtraction below borrows.
180 cmp.leu p6, p7 = r39, r16
181 sub r14 = r16, r39
182 adds r8 = -1, r8
184 (p7) adds r19 = -1, r19
185 mov r16 = r14
// p8/p9 are refreshed only after a borrow; without one p9 keeps its
// previous value (t1 >= 0) and the loop continues.
187 (p7) cmp.gt p8, p9 = r0, r19
188 (p9) br.cond.dptk .L52
189 .L46:
// NOTE(review): the terminator of the m4 conditional opened above is not
// visible in this copy - confirm against upstream.
// f32 = final di. r32 = qp + 8*r35, the slot of the most significant
// quotient limb the loop will store.
191 setf.sig f32 = r8 // di
192 shladd r32 = r35, 3, r32
// Main loop, one quotient limb per pass.
// On entry f9 and f10 hold the upper and lower remainder limbs.
// On exit r18 is the quotient limb and r38:r37 the new remainder.
195 ALIGN(16)
196 L(top): nop 0
197 nop 0
// p8 = fn > r35: the counter is inside the fraction part, so the incoming
// low limb is zero. Otherwise fetch the next dividend limb and step r34
// down by one limb.
198 cmp.gt p8, p9 = r33, r35
200 (p8) mov r37 = r0
201 (p9) ld8 r37 = [r34], -8
// f8:f12 = f9*di + f10 (high:low). r20 = low half (q0).
// f13 = high half + f9, the candidate quotient limb (r18).
202 xma.hu f8 = f9, f32, f10 // 0,29
203 xma.l f12 = f9, f32, f10 // 0
205 getf.sig r20 = f12 // q0 4
206 xma.l f13 = f15, f8, f9 // q += n2 4
// r8 = ~d0, used further down to detect a carry out of r37 + d0.
207 sub r8 = -1, r36 // bitnot d0
// f7 = f10 - d1*q. f6 and f9 are the low and high halves of d0*q + d0.
// Together with the subtraction of d1 from r38 below, this evaluates the
// remainder for q + 1 rather than q.
209 getf.sig r18 = f13 // 8
210 xma.l f7 = f14, f13, f10 // 8
211 xma.l f6 = f33, f13, f33 // t0 = LO(d0*q+d0) 8
212 xma.hu f9 = f33, f13, f33 // t1 = HI(d0*q+d0) 9
214 getf.sig r38 = f7 // n1 12
215 getf.sig r16 = f6 // 13
216 getf.sig r19 = f9 // 14
218 sub r38 = r38, r39 // n1 -= d1 17
// p10 = t0 <= r37 (no borrow from the low subtraction), p11 = borrow.
220 cmp.ne p9, p0 = r0, r0 // clear p9
221 cmp.leu p10, p11 = r16, r37 // cy for: n0 - t0 18
223 sub r37 = r37, r16 // n0 -= t0 19
224 (p11) sub r38 = r38, r19, 1 // n1 -= t1 - cy 19
225 (p10) sub r38 = r38, r19 // n1 -= t1 19
// p6 = q0 > r38: keep q + 1 as the quotient limb.
// p7 = otherwise: keep q and add the divisor back to r38:r37, with p9
// carrying from the low limb into the high limb.
227 cmp.gtu p6, p7 = r20, r38 // n1 >= q0 20
229 (p7) cmp.ltu p9, p0 = r8, r37 // 21
230 (p6) add r18 = 1, r18 //
231 (p7) add r37 = r37, r36 // 21
232 (p7) add r38 = r38, r39 // 21
234 setf.sig f10 = r37 // n1 22
235 (p9) add r38 = 1, r38 // 22
// p7 = r38 >= d1: the remainder may still be >= the divisor, so take the
// out-of-line fix path (predicted not taken).
237 setf.sig f9 = r38 // n2 23
238 cmp.gtu p6, p7 = r39, r38 // 23
239 (p7) br.cond.spnt L(fix)
// Store the quotient limb, step the quotient pointer and the counter
// down, and loop while ar.lc is non-zero.
240 L(bck): st8 [r32] = r18, -8
241 adds r35 = -1, r35
242 br.cloop.sptk.few L(top)
// Exit: store the remainder (r37 at r34+8, r38 at r34+16), restore b0,
// ar.pfs and ar.lc, and return r40 in r8.
245 L(end): add r14 = 8, r34
246 add r15 = 16, r34
247 mov b0 = r41
249 st8 [r14] = r37
250 st8 [r15] = r38
251 mov ar.pfs = r42
252 mov r8 = r40
253 mov ar.lc = r45
254 br.ret.sptk.many b0
// Initial subtraction, reached when the top two dividend limbs are >= the
// divisor: r38:r37 -= d1:d0 and the return value r40 becomes 1.
// p8/p9 still come from the d0 <= r37 compare before the branch;
// p9 (d0 > r37) means the low subtraction borrows.
256 .L51:
257 .pred.rel "mutex", p8, p9
258 sub r37 = r37, r36
259 (p9) sub r38 = r38, r39, 1
260 (p8) sub r38 = r38, r39
261 adds r40 = 1, r0
262 br .L8
// Fix path, reached from the loop when r38 >= d1. Same two-limb test as at
// function entry: p6 survives only when d1 >= r38 and d0 > r37; in that
// case nothing is changed and control returns to the store. Otherwise the
// divisor is subtracted once more and the quotient limb is incremented.
265 L(fix): cmp.geu p6, p7 = r39, r38
266 cmp.leu p8, p9 = r36, r37
268 (p8) cmp4.ne.and.orcm p6, p7 = 0, r0
269 (p6) br.cond.dptk L(bck)
270 sub r37 = r37, r36
271 (p9) sub r38 = r38, r39, 1
272 (p8) sub r38 = r38, r39
273 adds r18 = 1, r18
// Refresh the floating-point copies of the remainder for the next pass.
275 setf.sig f9 = r38 // n2
276 setf.sig f10 = r37 // n1
277 br L(bck)
279 EPILOGUE()
280 ASM_END()