new beta-0.90.0
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / divrem_1.asm
blobd4d61ad9d8fe940c825ea5dbb5bb1f7424e678e1
1 dnl x86-64 mpn_divrem_1 -- mpn by limb division.
3 dnl Copyright 2004, 2005, 2007-2012, 2014 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
C		norm	unorm	frac
C AMD K8,K9	13	13	12
C AMD K10	13	13	12
C Intel P4	43	44	43
C Intel core2	24.5	24.5	19.5
C Intel corei	20.5	19.5	18
C Intel atom	43	46	36
C VIA nano	25.5	25.5	24
43 C mp_limb_t
44 C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
45 C mp_srcptr np, mp_size_t nn, mp_limb_t d)
47 C mp_limb_t
48 C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
49 C mp_srcptr np, mp_size_t nn, mp_limb_t d,
50 C mp_limb_t dinv, int cnt)
C INPUT PARAMETERS
C These are the first six integer argument registers of the STD64 calling
C convention.  NOTE(review): on DOS64 the code relies on FUNC_ENTRY (defined
C elsewhere) to deliver the first four arguments in these registers and
C loads d and dinv from the stack itself -- confirm in x86_64-defs.m4.
define(`qp',		`%rdi')
define(`fn_param',	`%rsi')
define(`up_param',	`%rdx')
define(`un_param',	`%rcx')
define(`d',		`%r8')
define(`dinv',		`%r9')		C only for mpn_preinv_divrem_1
C shift passed on stack			C only for mpn_preinv_divrem_1

C WORKING REGISTERS
C cnt reuses the register of un_param and up reuses the register of
C fn_param, so the incoming values have to be copied to un and fn before
C cnt and up are written.
define(`cnt',		`%rcx')
define(`up',		`%rsi')
define(`fn',		`%r12')
define(`un',		`%rbx')

C Register map for the names defined above:
C   rax rbx rcx rdx rsi rdi rbp r8  r9   r10 r11 r12 r13 r14 r15
C       un  cnt     up  qp      d   dinv         fn

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C CNTOFF(base) addresses the stack-passed shift count of mpn_preinv_divrem_1
C after the four register pushes done at entry.  For STD64 that is the return
C address plus four pushes, 8 + 32 = 40 bytes.
C NOTE(review): the DOS64 value also depends on what FUNC_ENTRY pushes, which
C is not visible in this file.
IFSTD(`define(`CNTOFF',		`40($1)')')
IFDOS(`define(`CNTOFF',		`104($1)')')

ASM_START()
	TEXT
C mp_limb_t mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
C				 mp_srcptr np, mp_size_t nn, mp_limb_t d,
C				 mp_limb_t dinv, int cnt)
C
C Entry point for callers that already hold the inverse of d (dinv) and the
C normalization shift (cnt), so no call to mpn_invert_limb is made here.
C The register setup mirrors mpn_divrem_1, and control then joins the
C division loops of that function at L(nent) or L(uent); the pops and the
C return are the ones at L(ret) there.
C
C NOTE(review): unlike mpn_divrem_1 there is no early exit for nn + fn == 0,
C and L(uent) reads a limb from np without first testing nn.  Presumably
C callers guarantee nn >= 1 -- confirm against the callers.
	ALIGN(16)
PROLOGUE(mpn_preinv_divrem_1)
	FUNC_ENTRY(4)
C On DOS64 the 5th and 6th arguments are fetched from the stack; this has to
C happen before the pushes below move rsp.
IFDOS(`	mov	56(%rsp), %r8	')
IFDOS(`	mov	64(%rsp), %r9	')
	push	%r13
	push	%r12
	push	%rbp
	push	%rbx
	xor	R32(%rax), R32(%rax)	C running remainder starts at 0

	mov	un_param, un		C copy out before rcx is modified
	mov	fn_param, fn		C copy out before rsi becomes up
	add	fn_param, un_param	C rcx = nn + fn
	mov	up_param, up

	lea	-8(qp,un_param,8), qp	C qp -> quotient limb qp[nn+fn-1]

	test	d, d
	js	L(nent)			C top bit of d set: normalized loop

	mov	CNTOFF(%rsp), R8(cnt)	C 7th argument, passed on the stack
	shl	R8(cnt), d		C normalize d
	jmp	L(uent)
EPILOGUE()
C mp_limb_t mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
C			  mp_srcptr np, mp_size_t nn, mp_limb_t d)
C
C Divide the nn limbs at np, followed by fn implicit zero limbs, by the
C single limb d.  One quotient limb is stored for every input and fraction
C limb, starting at qp[nn+fn-1] and moving down, and the remainder is
C returned in rax.  When nn + fn is zero nothing is stored and 0 is returned.
C
C If the top bit of d is clear, d is first shifted left by its number of
C leading zeros (cnt), the dividend limbs are shifted to match on the fly,
C and the remainder is shifted back right by cnt before returning.  The
C inverse of the (shifted) d comes from mpn_invert_limb.
C
C mpn_preinv_divrem_1 enters this code at L(nent) and L(uent) and leaves
C through L(ret), so the register and stack layout at those labels is shared.
	ALIGN(16)
PROLOGUE(mpn_divrem_1)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')
	xor	R32(%rax), R32(%rax)	C return value for the early exit
	push	%r13
	push	%r12
	push	%rbp
	push	%rbx

	mov	fn_param, fn
	mov	un_param, un
	add	fn_param, un_param	C rcx = nn + fn, ZF set if that is 0
	mov	up_param, up		C mov leaves the flags untouched
	je	L(ret)			C nn + fn == 0

	lea	-8(qp,un_param,8), qp	C qp -> quotient limb qp[nn+fn-1]
	xor	R32(%rbp), R32(%rbp)	C remainder so far = 0

	test	d, d
	jns	L(unnormalized)		C top bit of d is clear

C d has its top bit set.  The top limb needs at most one subtraction of d,
C giving a top quotient limb of 0 or 1.
L(normalized):
	test	un, un
	je	L(8)			C un == 0
	mov	-8(up,un,8), %rbp	C top limb
	dec	un
	mov	%rbp, %rax
	sub	d, %rbp			C CF set if top limb < d
	cmovc	%rax, %rbp		C undo the subtraction in that case
	sbb	R32(%rax), R32(%rax)
	inc	R32(%rax)		C rax = 1 - CF
	mov	%rax, (qp)		C top quotient limb
	lea	-8(qp), qp
L(8):
C Get the inverse of d.  The remainder (rbp), un and fn live in callee-saved
C registers; the volatile registers still needed afterwards are saved here.
C On STD64 the return address, the four entry pushes and these three pushes
C add up to 64 bytes, so no extra padding is needed before the call.
IFSTD(`	push	%rdi		')
IFSTD(`	push	%rsi		')
	push	%r8
IFSTD(`	mov	d, %rdi		')
IFDOS(`	sub	$32, %rsp	')
IFDOS(`	mov	d, %rcx		')
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_invert_limb)
IFDOS(`	add	$32, %rsp	')
	pop	%r8
IFSTD(`	pop	%rsi		')
IFSTD(`	pop	%rdi		')

	mov	%rax, dinv
	mov	%rbp, %rax		C rax = remainder so far
	jmp	L(nent)

C Normalized loop.  At L(ntop) rax holds the remainder so far and rbp holds
C that remainder plus 1.  Each pass consumes up[un], stores one quotient
C limb from r13 and leaves the new remainder in rax.
	ALIGN(16)
L(ntop):mov	(up,un,8), %r10		C	    K8-K10  P6-CNR P6-NHM  P4
	mul	dinv			C	      0,13   0,20   0,18   0,45
	add	%r10, %rax		C	       4      8      3     12
	adc	%rbp, %rdx		C	       5      9     10     13
	mov	%rax, %rbp		C	       5      9      4     13
	mov	%rdx, %r13		C	       6     11     12     23
	imul	d, %rdx			C	       6     11     11     23
	sub	%rdx, %r10		C	      10     16     14     33
	mov	d, %rax			C
	add	%r10, %rax		C	      11     17     15     34
	cmp	%rbp, %r10		C	      11     17     15     34
	cmovc	%r10, %rax		C	      12     18     16     35
	adc	$-1, %r13		C
	cmp	d, %rax			C
	jae	L(nfx)			C
L(nok):	mov	%r13, (qp)		C
	sub	$8, qp			C
L(nent):lea	1(%rax), %rbp		C
	dec	un			C
	jns	L(ntop)			C

	xor	R32(%rcx), R32(%rcx)	C cnt = 0: no final shift of the remainder
	jmp	L(frac)

L(nfx):	sub	d, %rax			C remainder still >= d: subtract once more
	inc	%r13			C and bump the quotient limb
	jmp	L(nok)

C d has its top bit clear.  If the top limb is already smaller than d, the
C top quotient limb is 0 (rbp is still zero here) and that limb becomes the
C initial remainder.
L(unnormalized):
	test	un, un
	je	L(44)
	mov	-8(up,un,8), %rax
	cmp	d, %rax
	jae	L(44)
	mov	%rbp, (qp)		C store a zero quotient limb
	mov	%rax, %rbp		C remainder so far = top limb
	lea	-8(qp), qp
C NOTE(review): ZF is still the result of the cmp above (mov and lea do not
C write flags) and rax < d on this path, so this branch looks unreachable.
	je	L(ret)
	dec	un
L(44):
C cnt = number of leading zero bits of d: bsr yields the index of the top set
C bit and the low six bits of its complement are 63 - index.  The shifts use
C only the low six bits of cl.
C NOTE(review): bsr leaves rcx undefined for d == 0; presumably callers never
C pass a zero divisor -- confirm.
	bsr	d, %rcx
	not	R32(%rcx)
	shl	R8(%rcx), d		C normalize d
	shl	R8(%rcx), %rbp		C shift the remainder to match

C Same call as at L(8), with cnt saved as well.  That makes four pushes, so
C STD64 needs 8 bytes of padding to keep rsp 16-byte aligned at the call.
	push	%rcx
IFSTD(`	push	%rdi		')
IFSTD(`	push	%rsi		')
	push	%r8
IFSTD(`	sub	$8, %rsp	')
IFSTD(`	mov	d, %rdi		')
IFDOS(`	sub	$40, %rsp	')
IFDOS(`	mov	d, %rcx		')
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_invert_limb)
IFSTD(`	add	$8, %rsp	')
IFDOS(`	add	$40, %rsp	')
	pop	%r8
IFSTD(`	pop	%rsi		')
IFSTD(`	pop	%rdi		')
	pop	%rcx

	mov	%rax, dinv
	mov	%rbp, %rax		C rax = shifted remainder so far
	test	un, un
	je	L(frac)			C no integer limbs left

C Bring the top cnt bits of the next limb into the remainder.  neg/shr/neg
C shifts right by 64 - cnt and then restores cnt in ecx.
L(uent):dec	un
	mov	(up,un,8), %rbp
	neg	R32(%rcx)
	shr	R8(%rcx), %rbp
	neg	R32(%rcx)
	or	%rbp, %rax
	jmp	L(ent)

C Unnormalized loop.  rbp holds the limb fetched in the previous pass and r10
C the limb below it; together they form the next dividend limb shifted left
C by cnt.  rax is the remainder so far and r11 is that remainder plus 1.
	ALIGN(16)
L(utop):mov	(up,un,8), %r10
	shl	R8(%rcx), %rbp
	neg	R32(%rcx)
	shr	R8(%rcx), %r10
	neg	R32(%rcx)
	or	%r10, %rbp
	mul	dinv
	add	%rbp, %rax
	adc	%r11, %rdx
	mov	%rax, %r11
	mov	%rdx, %r13
	imul	d, %rdx
	sub	%rdx, %rbp
	mov	d, %rax
	add	%rbp, %rax
	cmp	%r11, %rbp
	cmovc	%rbp, %rax
	adc	$-1, %r13
	cmp	d, %rax
	jae	L(ufx)
L(uok):	mov	%r13, (qp)
	sub	$8, qp
L(ent):	mov	(up,un,8), %rbp
	dec	un
	lea	1(%rax), %r11
	jns	L(utop)

C Last integer limb: there is no lower limb, so zeros are shifted in.
L(uend):shl	R8(%rcx), %rbp
	mul	dinv
	add	%rbp, %rax
	adc	%r11, %rdx
	mov	%rax, %r11
	mov	%rdx, %r13
	imul	d, %rdx
	sub	%rdx, %rbp
	mov	d, %rax
	add	%rbp, %rax
	cmp	%r11, %rbp
	cmovc	%rbp, %rax
	adc	$-1, %r13
	cmp	d, %rax
	jae	L(efx)
L(eok):	mov	%r13, (qp)
	sub	$8, qp
	jmp	L(frac)

L(ufx):	sub	d, %rax
	inc	%r13
	jmp	L(uok)
L(efx):	sub	d, %rax
	inc	%r13
	jmp	L(eok)

C Fraction limbs: the dividend limb is zero in every pass, so there is no
C limb to load or add.  rbp = -d lets the imul produce the negated product
C directly.
L(frac):mov	d, %rbp
	neg	%rbp
	jmp	L(fent)

	ALIGN(16)			C	    K8-K10  P6-CNR P6-NHM  P4
L(ftop):mul	dinv			C	      0,12   0,17   0,17
	add	%r11, %rdx		C	       5      8     10
	mov	%rax, %r11		C	       4      8      3
	mov	%rdx, %r13		C	       6      9     11
	imul	%rbp, %rdx		C	       6      9     11
	mov	d, %rax			C
	add	%rdx, %rax		C	      10     14     14
	cmp	%r11, %rdx		C	      10     14     14
	cmovc	%rdx, %rax		C	      11     15     15
	adc	$-1, %r13		C
	mov	%r13, (qp)		C
	sub	$8, qp			C
L(fent):lea	1(%rax), %r11		C
	dec	fn			C
	jns	L(ftop)			C

	shr	R8(%rcx), %rax		C undo the normalization shift (cnt may be 0)
L(ret):	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	FUNC_EXIT()
	ret				C FUNC_EXIT does not return by itself
EPILOGUE()