beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / alpha / bdiv_dbm1c.asm
blob472966ca98e4159170edf3dabb915bc4e172a425
1 dnl Alpha mpn_bdiv_dbm1c.
3 dnl Copyright 2008 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
33 C cycles/limb
34 C EV4: 42
35 C EV5: 18
36 C EV6: 3
38 C TODO
39 C * Try less unrolling, 2-way should give the same performance.
40 C * Optimize feed-in and wind-down code, for speed, and perhaps further for
41 C code size.
42 C * This runs optimally given the algorithm; r8 is on a 3-operation recurrence
43 C path. We have not tried very hard to find a better algorithm. Perhaps
44 C it would be a good task for the GNU superoptimizer.
46 C INPUT PARAMETERS
C Symbolic names for the argument registers.  The code below spells the
C registers out directly, so these names serve as documentation of the roles.
C The fifth input is read from r20: the entry code copies r20 into r8, the
C running value, while r19 is the multiplier used by every mulq/umulh.
47 define(`rp', `r16')
48 define(`up', `r17')
49 define(`n', `r18')
50 define(`bd', `r19')
51 define(`cy', `r20')
54 ASM_START()
C mpn_bdiv_dbm1c
C
C Register roles, as read from the code below:
C   r16  destination pointer; one limb is stored per source limb
C   r17  source pointer
C   r18  limb count
C   r19  multiplier limb, second operand of every mulq/umulh
C   r20  initial value of the running value h (read once at entry;
C        presumably the carry-in argument)
C   r0   return value: the final h
C
C For each source limb u, from the lowest address upwards, with hi:lo the
C two halves of the product u * r19:
C     borrow = (h < lo)            unsigned compare
C     h      = h - lo
C     store h to the next destination limb
C     h      = h - (hi + borrow)
C
C Working registers: r8 = h; r24 and r0-r3 hold source limbs; r4-r7 hold low
C product halves and r20-r23 the matching high halves (r4/r20, r5/r21,
C r6/r22, r7/r23); r28 holds the hi + borrow term not yet subtracted from h.
C r16, r17 and r18 are modified.  No stack is used.
C
C The loop handles four limbs per pass and is software pipelined: products
C are started several steps before they are consumed.  The feed-in code
C selected by the limb count mod 4 primes the pipeline and jumps into the
C loop or, for short operands, directly into the wind-down code.
C
C NOTE(review): a limb count of 0 would take the b0 path and read four source
C limbs; presumably callers guarantee a count of at least 1 -- confirm.
55 PROLOGUE(mpn_bdiv_dbm1c)
56 mov r20, r8 C r8 = h, seeded from r20
C Load the first limb and dispatch on the limb count mod 4.
58 ldq r24, 0(r17) C r24 = first source limb
59 and r18, 3, r28 C r28 = count mod 4
60 lda r18, -4(r18) C r18 = count - 4
61 beq r28, L(b0) C count mod 4 == 0
62 cmpeq r28, 1, r21
63 bne r21, L(b1) C count mod 4 == 1
64 cmpeq r28, 2, r21
65 bne r21, L(b2) C count mod 4 == 2
C Fall through: count mod 4 == 3.  r24, r2, r3 = source limbs 0..2.
68 L(b3): ldq r2, 8(r17)
69 ldq r3, 16(r17)
70 bgt r18, L(gt3) C count > 4, i.e. at least 7 limbs
C Exactly 3 limbs: form all three products, then run the last three
C wind-down steps.
72 mulq r24, r19, r5 C U1
73 umulh r24, r19, r21 C U1
74 mulq r2, r19, r6 C U1
75 umulh r2, r19, r22 C U1
76 mulq r3, r19, r7 C U1
77 umulh r3, r19, r23 C U1
78 lda r16, -32(r16) C bias so the 32/40/48(r16) stores from cj3 on hit limbs 0..2
79 br L(cj3)
C At least 7 limbs: limbs 0..2 become the pending products r5/r21, r6/r22,
C r7/r23, and r0-r3 are loaded with limbs 3..6.
81 L(gt3): ldq r0, 24(r17)
82 mulq r24, r19, r5 C U1
83 umulh r24, r19, r21 C U1
84 ldq r1, 32(r17)
85 mulq r2, r19, r6 C U1
86 umulh r2, r19, r22 C U1
87 ldq r2, 40(r17)
88 mulq r3, r19, r7 C U1
89 umulh r3, r19, r23 C U1
90 ldq r3, 48(r17)
91 lda r18, -4(r18) C r18 = count - 8
92 lda r17, 56(r17) C source pointer now at limb 7
93 mulq r0, r19, r4 C U1
94 bgt r18, L(L3) C four more limbs to load: enter the loop
96 br L(cj7) C exactly 7 limbs: straight to the wind-down
C count mod 4 == 2.  r24, r3 = source limbs 0..1.
99 L(b2): ldq r3, 8(r17)
100 bgt r18, L(gt2) C count > 4, i.e. at least 6 limbs
C Exactly 2 limbs.
102 mulq r24, r19, r6 C U1
103 umulh r24, r19, r22 C U1
104 mulq r3, r19, r7 C U1
105 umulh r3, r19, r23 C U1
106 lda r16, -40(r16) C bias so the 40/48(r16) stores from cj2 on hit limbs 0..1
107 br L(cj2)
C At least 6 limbs: pending products are r6/r22 and r7/r23 (limbs 0..1) and
C r4/r20 (limb 2); r1-r3 hold limbs 3..5.
109 L(gt2): ldq r0, 16(r17)
110 ldq r1, 24(r17)
111 mulq r24, r19, r6 C U1
112 umulh r24, r19, r22 C U1
113 ldq r2, 32(r17)
114 mulq r3, r19, r7 C U1
115 umulh r3, r19, r23 C U1
116 ldq r3, 40(r17)
117 lda r18, -4(r18) C r18 = count - 8
118 lda r17, 48(r17) C source pointer now at limb 6
119 mulq r0, r19, r4 C U1
120 umulh r0, r19, r20 C U1
121 lda r16, -8(r16) C bias so the 8(r16) store after L2/cj6 hits limb 0
122 bgt r18, L(gt6) C four more limbs to load: go via gt6 into the loop
124 mulq r1, r19, r5 C U1
125 br L(cj6) C exactly 6 limbs: straight to the wind-down
127 L(gt6): ldq r0, 0(r17) C r0 = limb 6
128 mulq r1, r19, r5 C U1
129 br L(L2)
C count mod 4 == 1.  r24 = source limb 0.
132 L(b1): bgt r18, L(gt1) C count > 4, i.e. at least 5 limbs
C Exactly 1 limb.
134 mulq r24, r19, r7 C U1
135 umulh r24, r19, r23 C U1
136 lda r16, -48(r16) C bias so the 48(r16) store at cj1 hits limb 0
137 br L(cj1)
C At least 5 limbs: pending products are r7/r23 (limb 0) and r4/r20 (limb 1);
C r1-r3 hold limbs 2..4.
139 L(gt1): ldq r0, 8(r17)
140 ldq r1, 16(r17)
141 ldq r2, 24(r17)
142 mulq r24, r19, r7 C U1
143 umulh r24, r19, r23 C U1
144 ldq r3, 32(r17)
145 lda r18, -4(r18) C r18 = count - 8
146 lda r17, 40(r17) C source pointer now at limb 5
147 mulq r0, r19, r4 C U1
148 umulh r0, r19, r20 C U1
149 lda r16, -16(r16) C bias so the 16(r16) store after L1/cj5 hits limb 0
150 bgt r18, L(gt5) C four more limbs to load: go via gt5 into the loop
152 mulq r1, r19, r5 C U1
153 umulh r1, r19, r21 C U1
154 mulq r2, r19, r6 C U1
155 br L(cj5) C exactly 5 limbs: straight to the wind-down
157 L(gt5): ldq r0, 0(r17) C r0 = limb 5
158 mulq r1, r19, r5 C U1
159 umulh r1, r19, r21 C U1
160 ldq r1, 8(r17) C r1 = limb 6
161 mulq r2, r19, r6 C U1
162 br L(L1)
C count mod 4 == 0.  r24, r1-r3 = source limbs 0..3; the pending product
C r4/r20 is limb 0.
165 L(b0): ldq r1, 8(r17)
166 ldq r2, 16(r17)
167 ldq r3, 24(r17)
168 lda r17, 32(r17) C source pointer now at limb 4
169 lda r16, -24(r16) C bias so the 24(r16) store after L0/cj4 hits limb 0
170 mulq r24, r19, r4 C U1
171 umulh r24, r19, r20 C U1
172 bgt r18, L(gt4) C count > 4: four more limbs to load
C Exactly 4 limbs.
174 mulq r1, r19, r5 C U1
175 umulh r1, r19, r21 C U1
176 mulq r2, r19, r6 C U1
177 umulh r2, r19, r22 C U1
178 mulq r3, r19, r7 C U1
179 br L(cj4)
181 L(gt4): ldq r0, 0(r17) C r0 = limb 4
182 mulq r1, r19, r5 C U1
183 umulh r1, r19, r21 C U1
184 ldq r1, 8(r17) C r1 = limb 5
185 mulq r2, r19, r6 C U1
186 umulh r2, r19, r22 C U1
187 ldq r2, 16(r17) C r2 = limb 6
188 mulq r3, r19, r7 C U1
189 br L(L0)
191 C *** MAIN LOOP START ***
C Four stanzas per pass.  Each stanza retires one pending product (low half
C in r5, r6, r7, r4 in turn; high half in r21, r22, r23, r20) and starts the
C product of a limb loaded earlier, reusing the registers just consumed, so
C the instruction order within a stanza matters.  L3, L2, L1 and L0 are the
C entry points used by the feed-in code; each skips the leading subq because
C no hi + borrow term is pending yet.
192 ALIGN(16)
193 L(top): mulq r0, r19, r4 C U1
194 subq r8, r28, r8 C h -= hi + borrow of the previous limb
195 L(L3): umulh r0, r19, r20 C U1
196 cmpult r8, r5, r28 C r28 = borrow out of h - lo
197 ldq r0, 0(r17) C load a limb for the next pass
198 subq r8, r5, r8 C h -= lo
199 addq r21, r28, r28 C r28 = hi + borrow
200 stq r8, 0(r16) C store the result limb
202 mulq r1, r19, r5 C U1
203 subq r8, r28, r8 C h -= hi + borrow
204 L(L2): umulh r1, r19, r21 C U1
205 cmpult r8, r6, r28
206 ldq r1, 8(r17)
207 subq r8, r6, r8
208 addq r22, r28, r28
209 stq r8, 8(r16)
211 mulq r2, r19, r6 C U1
212 subq r8, r28, r8
213 L(L1): umulh r2, r19, r22 C U1
214 cmpult r8, r7, r28
215 ldq r2, 16(r17)
216 subq r8, r7, r8
217 addq r23, r28, r28
218 stq r8, 16(r16)
220 mulq r3, r19, r7 C U1
221 subq r8, r28, r8
222 L(L0): umulh r3, r19, r23 C U1
223 cmpult r8, r4, r28
224 ldq r3, 24(r17)
225 subq r8, r4, r8
226 addq r20, r28, r28
227 stq r8, 24(r16)
229 lda r18, -4(r18) C four more limbs have been loaded
230 lda r17, 32(r17)
231 lda r16, 32(r16)
232 bgt r18, L(top) C repeat while limbs remain to be loaded
233 C *** MAIN LOOP END ***
C Wind-down: the same steps as the loop, without further loads.  cj7..cj1
C are entry points for short operands; the number in the label equals the
C number of result limbs still to be stored from that point.
235 mulq r0, r19, r4 C U1
236 subq r8, r28, r8
237 L(cj7): umulh r0, r19, r20 C U1
238 cmpult r8, r5, r28
239 subq r8, r5, r8
240 addq r21, r28, r28
241 stq r8, 0(r16)
242 mulq r1, r19, r5 C U1
243 subq r8, r28, r8
244 L(cj6): umulh r1, r19, r21 C U1
245 cmpult r8, r6, r28
246 subq r8, r6, r8
247 addq r22, r28, r28
248 stq r8, 8(r16)
249 mulq r2, r19, r6 C U1
250 subq r8, r28, r8
251 L(cj5): umulh r2, r19, r22 C U1
252 cmpult r8, r7, r28
253 subq r8, r7, r8
254 addq r23, r28, r28
255 stq r8, 16(r16)
256 mulq r3, r19, r7 C U1
257 subq r8, r28, r8
258 L(cj4): umulh r3, r19, r23 C U1
259 cmpult r8, r4, r28
260 subq r8, r4, r8
261 addq r20, r28, r28
262 stq r8, 24(r16)
263 subq r8, r28, r8
264 L(cj3): cmpult r8, r5, r28
265 subq r8, r5, r8
266 addq r21, r28, r28
267 stq r8, 32(r16)
268 subq r8, r28, r8
269 L(cj2): cmpult r8, r6, r28
270 subq r8, r6, r8
271 addq r22, r28, r28
272 stq r8, 40(r16)
273 subq r8, r28, r8
274 L(cj1): cmpult r8, r7, r28
275 subq r8, r7, r8
276 addq r23, r28, r28
277 stq r8, 48(r16) C last result limb
278 subq r8, r28, r0 C return h - (hi + borrow) of the last limb in r0
279 ret r31, (r26), 1
281 EPILOGUE()
282 ASM_END()