dnl  Imported from luatex.git: source/libs/gmp/gmp-src/mpn/powerpc64/mode64/divrem_1.asm
dnl  (gitweb viewer header removed: beta-0.89.2, blob b283877006437007d7eccc59430a6bdf3c1bc2f9)
dnl  PowerPC-64 mpn_divrem_1 -- Divide an mpn number by an unnormalized limb.

dnl  Copyright 2003-2005, 2007, 2008, 2010, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C Per-limb cycle counts (norm = normalized d, unorm = unnormalized d,
C frac = fraction-limb loop).
C                  cycles/limb
C                  norm   unorm   frac
C POWER3/PPC630   16-34   16-34   ~11     outdated figures
C POWER4/PPC970    28      28      19
C POWER5           29      29     ~19
C POWER6           49      59     ~42
C POWER7           24.5    23     ~14

C INPUT PARAMETERS
C qp  = r3
C fn  = r4
C up  = r5
C un  = r6
C d   = r7

C We use a not very predictable branch in the frac code, therefore the cycle
C count wobbles somewhat.  With the alternative branch-free code, things run
C considerably slower on POWER4/PPC970 and POWER5.

C TODO
C  * Add preinv entry point.
ASM_START()

EXTERN_FUNC(mpn_invert_limb)

C mpn_divrem_1(qp, fn, up, un, d) -- divide the un-limb number {up,un}
C (extended by fn zero fraction limbs) by the single limb d, storing un+fn
C quotient limbs at qp (most significant limb highest) and returning the
C remainder in r3.  Division is done by multiplying with a precomputed
C inverse of d (mpn_invert_limb) rather than with hardware divide.
C
C Callee-saved register roles across the mpn_invert_limb call:
C   r25 = fn        r26 = up        r28 = un (then loop count)
C   r29 = qp (running pointer, walks downward from the top limb)
C   r30 = d, shifted left so its high bit is set
C   r31 = running (shifted) remainder     r27 = normalization shift count
C
C NOTE(review): reconstructed from a line-numbered scrape.  Blank/nop lines
C dropped by the scrape (original lines 126, 151, 177) are omitted here;
C verify instruction spacing against upstream GMP.

PROLOGUE(mpn_divrem_1,toc)
	mfcr	r12			C save caller's CR (cr2/cr3/cr4 are nonvolatile)
	add.	r10, r6, r4		C r10 = un + fn = total quotient limbs
	std	r25, -56(r1)		C save callee-saved GPRs in the red zone
	mr	r25, r4			C r25 = fn
	mflr	r0
	std	r26, -48(r1)
	mr	r26, r5			C r26 = up
	std	r28, -32(r1)
	mr	r28, r6			C r28 = un
	std	r29, -24(r1)
	mr	r29, r3			C r29 = qp
	li	r3, 0			C default return value (remainder) = 0
	std	r30, -16(r1)
	mr	r30, r7			C r30 = d
	std	r31, -8(r1)
	li	r31, 0			C remainder accumulator = 0
	std	r27, -40(r1)
	std	r0, 16(r1)		C save LR in caller's frame
	stw	r12, 8(r1)		C save CR in caller's frame
	stdu	r1, -176(r1)
	beq-	cr0, L(1)		C un + fn == 0: nothing to do
	cmpdi	cr7, r7, 0
	sldi	r0, r10, 3
	add	r11, r0, r29
	addi	r29, r11, -8		C r29 = &qp[un+fn-1]
	blt-	cr7, L(162)		C d has its high bit set: normalized path
	cmpdi	cr4, r6, 0
	beq+	cr4, L(71)		C un == 0: only fraction limbs to produce
L(163):
	sldi	r9, r6, 3
	add	r9, r9, r5
	ld	r7, -8(r9)		C r7 = up[un-1]
	cmpld	cr7, r7, r30
	bge-	cr7, L(71)		C high limb >= d: needs a full division step
	cmpdi	cr7, r10, 1
	li	r0, 0
	mr	r31, r7			C high limb < d: it becomes the remainder...
	std	r0, -8(r11)		C ...and the top quotient limb is 0
	addi	r29, r29, -8
	mr	r3, r7
	beq-	cr7, L(1)		C that was the only limb: done
	addi	r28, r6, -1		C one integer limb consumed
	cmpdi	cr4, r28, 0
L(71):
	cntlzd	r27, r30		C r27 = shift needed to normalize d
	sld	r30, r30, r27		C normalize d (high bit set)
	sld	r31, r31, r27		C keep remainder in the shifted domain
	mr	r3, r30
	CALL(	mpn_invert_limb)	C r3 = floor((B^2-1)/d) - B, B = 2^64
	beq-	cr4, L(110)		C no integer limbs left: fraction part
	sldi	r9, r28, 3
	addic.	r6, r28, -2
	add	r9, r9, r26
	subfic	r5, r27, 64		C r5 = 64 - shift count
	ld	r8, -8(r9)		C r8 = next limb of up
	srd	r0, r8, r5
	or	r31, r31, r0		C fold its high bits into the remainder
	sld	r7, r8, r27		C r7 = its low bits, pending for next step
	blt-	cr0, L(154)		C just one limb: skip the loop
	addi	r28, r28, -1
	mtctr	r28
	sldi	r6, r6, 3
	ALIGN(16)
C Unnormalized main loop.  Each iteration divides the two-limb shifted value
C r31:nl by d via the inverse: qh = hi(r31*inv) + r31 + 1 (candidate), then
C r = nl - qh*d, with at most two conditional corrections of qh and r.
L(uloop):
	ldx	r8, r26, r6
	mulld	r0, r31, r3
	mulhdu	r10, r31, r3
	addi	r11, r31, 1
	srd	r9, r8, r5
	addi	r6, r6, -8
	or	r9, r7, r9		C r9 = nl, next shifted dividend limb
	addc	r0, r0, r9
	adde	r10, r10, r11		C r10 = candidate quotient limb qh
	mulld	r31, r10, r30
	subf	r31, r31, r9		C r31 = nl - qh * d
	subfc	r0, r31, r0		C r <= ql
	subfe	r0, r0, r0		C r0 = -(r <= ql)
	and	r9, r30, r0
	add	r31, r31, r9		C conditionally add d back to the remainder
	add	r10, r0, r10		C qh -= (r >= ql)
	cmpld	cr7, r31, r30
	bge-	cr7, L(164)		C rare second correction step
L(123):
	std	r10, 0(r29)
	addi	r29, r29, -8
	sld	r7, r8, r27		C pending low bits for the next iteration
	bdnz	L(uloop)
L(154):
C Last integer limb: same division step, with no incoming high bits.
	addi	r11, r31, 1
	mulld	r0, r31, r3
	mulhdu	r8, r31, r3
	addc	r0, r0, r7
	adde	r8, r8, r11		C r8 = candidate quotient limb qh
	mulld	r31, r8, r30
	subf	r31, r31, r7
	subfc	r0, r0, r31		C r >= ql
	subfe	r0, r0, r0		C r0 = -(r >= ql)
	not	r7, r0
	add	r8, r7, r8		C qh -= (r >= ql)
	andc	r0, r30, r0
	add	r31, r31, r0
	cmpld	cr7, r31, r30
	bge-	cr7, L(165)
L(134):
	std	r8, 0(r29)
	addi	r29, r29, -8
L(110):
C Fraction part: produce fn more quotient limbs from the remainder alone
C (dividend limbs are implicitly zero).
	addic.	r0, r25, -1
	blt-	cr0, L(156)		C fn == 0
	mtctr	r25
	neg	r9, r30			C r9 = -d
	ALIGN(16)
L(ufloop):
	addi	r11, r31, 1
	mulld	r0, r3, r31
	mulhdu	r10, r3, r31
	add	r10, r10, r11		C candidate quotient limb
	mulld	r31, r9, r10		C r31 = -qh * d = candidate remainder
ifelse(0,1,`
C Branch-free correction variant, disabled: measurably slower on
C POWER4/PPC970 and POWER5 (see header comment).
	subfc	r0, r0, r31
	subfe	r0, r0, r0		C r0 = -(r >= ql)
	not	r7, r0
	add	r10, r7, r10		C qh -= (r >= ql)
	andc	r0, r30, r0
	add	r31, r31, r0
',`
	cmpld	cr7, r31, r0
	blt	cr7, L(29)
	add	r31, r30, r31		C add d back...
	addi	r10, r10, -1		C ...and decrement the quotient limb
L(29):
')
	std	r10, 0(r29)
	addi	r29, r29, -8
	bdnz	L(ufloop)
L(156):
	srd	r3, r31, r27		C undo normalization shift for the return value
L(1):
C Common exit: restore callee-saved state; remainder is already in r3.
	addi	r1, r1, 176
	ld	r0, 16(r1)
	lwz	r12, 8(r1)
	mtlr	r0
	ld	r25, -56(r1)
	ld	r26, -48(r1)
	mtcrf	8, r12			C restore nonvolatile CR field
	ld	r27, -40(r1)
	ld	r28, -32(r1)
	ld	r29, -24(r1)
	ld	r30, -16(r1)
	ld	r31, -8(r1)
	blr				C NOTE(review): restored -- the scrape dropped
					C this return (original line 214); without it
					C control would fall through into L(162)

C Entry for d already normalized (high bit set): no shifting anywhere.
L(162):
	cmpdi	cr7, r6, 0
	beq-	cr7, L(8)
	sldi	r9, r6, 3
	addi	r29, r29, -8
	add	r9, r9, r5
	addi	r28, r6, -1
	ld	r31, -8(r9)		C r31 = up[un-1]
	subfc	r9, r7, r31		C CA = (up[un-1] >= d)
	li	r9, 0
	adde	r9, r9, r9		C r9 = CA = top quotient limb (0 or 1)
	neg	r0, r9
	std	r9, -8(r11)
	and	r0, r0, r7
	subf	r31, r0, r31		C conditionally subtract d from the remainder
L(8):
	mr	r3, r30
	CALL(	mpn_invert_limb)
	li	r27, 0			C shift count 0 in the normalized path
	addic.	r6, r28, -1
	blt-	cr0, L(110)		C no limbs left: fraction part
	mtctr	r28
	sldi	r6, r6, 3
	ALIGN(16)
C Normalized main loop: same inverse-based step as L(uloop), no bit shuffling.
L(nloop):
	addi	r11, r31, 1
	ldx	r8, r26, r6
	mulld	r0, r31, r3
	mulhdu	r10, r31, r3
	addi	r6, r6, -8
	addc	r0, r0, r8
	adde	r10, r10, r11		C candidate quotient limb qh
	mulld	r31, r10, r30
	subf	r31, r31, r8		C r = nl - qh * d
	subfc	r0, r31, r0		C r <= ql
	subfe	r0, r0, r0		C r0 = -(r <= ql)
	and	r9, r30, r0
	add	r31, r31, r9
	add	r10, r0, r10		C qh -= (r >= ql)
	cmpld	cr7, r31, r30
	bge-	cr7, L(167)		C rare second correction step
L(51):
	std	r10, 0(r29)
	addi	r29, r29, -8
	bdnz	L(nloop)
	b	L(110)

C Out-of-line second corrections (remainder still >= d: subtract d once more
C and bump the quotient limb), kept off the hot paths.
L(164):
	subf	r31, r30, r31
	addi	r10, r10, 1
	b	L(123)
L(167):
	subf	r31, r30, r31
	addi	r10, r10, 1
	b	L(51)
L(165):
	subf	r31, r30, r31
	addi	r8, r8, 1
	b	L(134)
EPILOGUE()