beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / divrem_1.asm
blob255d4935c389f495a465a3cc8c47799ca03b55c3
1 dnl x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient.
3 dnl Copyright 1999-2003, 2007 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C cycles/limb
35 C 486 approx 43 maybe
36 C P5 44
37 C P6 39
38 C P6MMX 39
39 C K6 22
40 C K7 42
41 C P4 58
44 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
45 C mp_srcptr src, mp_size_t size, mp_limb_t divisor);
46 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
47 C mp_srcptr src, mp_size_t size, mp_limb_t divisor,
48 C mp_limb_t carry);
50 C Divide src,size by divisor and store the quotient in dst+xsize,size.
51 C Extend the division to fractional quotient limbs in dst,xsize. Return the
52 C remainder. Either or both xsize and size can be 0.
54 C mpn_divrem_1c takes a carry parameter which is an initial high limb,
55 C effectively one extra limb at the top of src,size. Must have
56 C carry<divisor.
59 C Essentially the code is the same as the division based part of
60 C mpn/generic/divrem_1.c, but has the advantage that we get the desired divl
61 C instruction even when gcc is not being used (when longlong.h only has the
62 C rather slow generic C udiv_qrnnd()).
64 C A test is done to see if the high limb is less than the divisor, and if so
65 C one less div is done. A div is between 20 and 40 cycles on the various
66 C x86s, so assuming high<divisor about half the time, then this test saves
67 C half that amount. The branch misprediction penalty on each chip is less
68 C than half a div.
71 C Notes for P5:
73 C It might be thought that moving the load down to pair with the store would
74 C save 1 cycle, but that doesn't seem to happen in practice, and in any case
75 C would be a mere 2.2% saving, so it's hardly worth bothering about.
77 C A mul-by-inverse might be a possibility for P5, as done in
78 C mpn/x86/pentium/mod_1.asm. The number of auxiliary instructions required
79 C is a hindrance, but there could be a 10-15% speedup available.
82 C Notes for K6:
84 C K6 has its own version of this code, using loop and paying attention to
85 C cache line boundary crossings. The target 20 c/l can be had with the
86 C decl+jnz of the present code by pairing up the load and store in the
87 C loops. But it's considered easier not to introduce complexity just for
88 C that, but instead let k6 have its own code.
C Stack argument offsets, listed in the same order as the C prototype
C parameters (dst, xsize, src, size, divisor, carry).  PARAM_CARRY is only
C read by the mpn_divrem_1c entry point.

defframe(PARAM_DST,      4)
defframe(PARAM_XSIZE,    8)
defframe(PARAM_SRC,     12)
defframe(PARAM_SIZE,    16)
defframe(PARAM_DIVISOR, 20)
defframe(PARAM_CARRY,   24)

	TEXT
	ALIGN(16)
PROLOGUE(mpn_divrem_1c)
deflit(`FRAME',0)

	C Entry point taking an initial remainder (carry).  It sets up the
	C register state expected by the loops inside mpn_divrem_1 and then
	C jumps into them:
	C
	C   ebx  dst+4*xsize-4 (integer part), recomputed at L(fraction)
	C   ecx  size (loop counter)
	C   edx  carry, used as the initial remainder
	C   esi  divisor
	C   edi  src
	C   ebp  xsize

	C Save registers in the order edi, esi, ebx, ebp; L(done) pops them
	C in the reverse order.
	pushl	%edi		FRAME_pushl()
	pushl	%esi		FRAME_pushl()
	pushl	%ebx		FRAME_pushl()
	pushl	%ebp		FRAME_pushl()

	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %edi
	movl	PARAM_DIVISOR, %esi
	movl	PARAM_DST, %ebx
	movl	PARAM_XSIZE, %ebp
	movl	PARAM_CARRY, %edx

	C No integer limbs: go straight to the fractional quotient limbs.
	testl	%ecx, %ecx
	jz	L(fraction)

	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
	jmp	L(integer_top)

EPILOGUE()
PROLOGUE(mpn_divrem_1)
deflit(`FRAME',0)

	C Divide src,size by divisor, storing integer quotient limbs at
	C dst+xsize,size and fractional quotient limbs at dst,xsize.  The
	C remainder is returned in eax.
	C
	C mpn_divrem_1c jumps into L(integer_top) or L(fraction) with the
	C same four registers pushed (edi, esi, ebx, ebp) and its carry in
	C edx.

	movl	PARAM_SIZE, %ecx
	pushl	%edi		FRAME_pushl()

	movl	PARAM_SRC, %edi
	pushl	%esi		FRAME_pushl()

	movl	PARAM_DIVISOR, %esi
	orl	%ecx,%ecx

	jz	L(size_zero)
	pushl	%ebx		FRAME_pushl()

	movl	-4(%edi,%ecx,4), %eax	C src high limb
	xorl	%edx, %edx

	movl	PARAM_DST, %ebx
	pushl	%ebp		FRAME_pushl()

	movl	PARAM_XSIZE, %ebp
	cmpl	%esi, %eax

	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
	jae	L(integer_entry)


	C high<divisor, so high of dst is zero, and avoid one div

	movl	%edx, (%ebx,%ecx,4)
	decl	%ecx

	movl	%eax, %edx		C high limb becomes the remainder
	jz	L(fraction)


L(integer_top):
	C eax	scratch (quotient)
	C ebx	dst+4*xsize-4
	C ecx	counter
	C edx	scratch (remainder)
	C esi	divisor
	C edi	src
	C ebp	xsize

	movl	-4(%edi,%ecx,4), %eax
L(integer_entry):

	divl	%esi

	movl	%eax, (%ebx,%ecx,4)
	decl	%ecx
	jnz	L(integer_top)


L(fraction):
	C ecx is zero here, so the orl loads xsize into the counter and
	C tests it for zero at the same time.
	orl	%ebp, %ecx
	jz	L(done)

	movl	PARAM_DST, %ebx


L(fraction_top):
	C eax	scratch (quotient)
	C ebx	dst
	C ecx	counter
	C edx	scratch (remainder)
	C esi	divisor
	C edi
	C ebp

	xorl	%eax, %eax		C low dividend limb is zero

	divl	%esi

	movl	%eax, -4(%ebx,%ecx,4)
	decl	%ecx
	jnz	L(fraction_top)


L(done):
	popl	%ebp
	movl	%edx, %eax		C return the remainder
	popl	%ebx
	popl	%esi
	popl	%edi
	ret				C must not fall through into L(size_zero)


L(size_zero):
deflit(`FRAME',8)
	C size==0: only edi and esi have been pushed.  The quotient is
	C xsize zero limbs at dst and the remainder returned is zero.
	movl	PARAM_XSIZE, %ecx
	xorl	%eax, %eax

	movl	PARAM_DST, %edi

	cld	C better safe than sorry, see mpn/x86/README

	rep				C store ecx (xsize) zero limbs, none if xsize==0
	stosl

	popl	%esi
	popl	%edi
	ret

EPILOGUE()