beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k6 / divrem_1.asm
blobb4cea4fa2a74053f95589f4a28821903075370e8
1 dnl AMD K6 mpn_divrem_1 -- mpn by limb division.
3 dnl Copyright 1999-2003, 2007 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C K6: 20 cycles/limb
37 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
38 C mp_srcptr src, mp_size_t size, mp_limb_t divisor);
39 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
40 C mp_srcptr src, mp_size_t size, mp_limb_t divisor,
41 C mp_limb_t carry);
43 C The code here is basically the same as mpn/x86/divrem_1.asm, but uses loop
44 C instead of decl+jnz, since it comes out 2 cycles/limb faster.
46 C A test is done to see if the high limb is less than the divisor, and if so
47 C one less div is done. A div is 20 cycles, so assuming high<divisor about
48 C half the time, then this test saves half that amount. The branch
49 C misprediction penalty is less than that.
51 C Back-to-back div instructions run at 20 cycles, the same as the loop here,
52 C so it seems there's nothing to gain by rearranging the loop. Pairing the
53 C mov and loop instructions was found to gain nothing.
55 C Enhancements:
57 C The low-latency K6 multiply might be thought to suit a mul-by-inverse, but
58 C that algorithm has been found to suffer from the relatively poor carry
59 C handling on K6 and too many auxiliary instructions. The fractional part
60 C however could be done at about 13 c/l, if it mattered enough.
C Stack parameter offsets, relative to the stack pointer on entry (offset 0
C holds the return address).  Listed in the order of the C argument list.
C PARAM_CARRY is read only by mpn_divrem_1c.

defframe(PARAM_DST,      4)
defframe(PARAM_XSIZE,    8)
defframe(PARAM_SRC,     12)
defframe(PARAM_SIZE,    16)
defframe(PARAM_DIVISOR, 20)
defframe(PARAM_CARRY,   24)

	TEXT
71 ALIGN(32)
72 PROLOGUE(mpn_divrem_1c)
73 deflit(`FRAME',0)
75 movl PARAM_SIZE, %ecx
76 pushl %edi FRAME_pushl()
78 movl PARAM_SRC, %edi
79 pushl %esi FRAME_pushl()
81 movl PARAM_DIVISOR, %esi
82 pushl %ebx FRAME_pushl()
84 movl PARAM_DST, %ebx
85 pushl %ebp FRAME_pushl()
87 movl PARAM_XSIZE, %ebp
88 orl %ecx, %ecx C size
90 movl PARAM_CARRY, %edx
91 jz L(fraction) C if size==0
93 leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
94 jmp L(integer_top)
96 EPILOGUE()
99 ALIGN(16)
100 PROLOGUE(mpn_divrem_1)
101 deflit(`FRAME',0)
103 movl PARAM_SIZE, %ecx
104 pushl %edi FRAME_pushl()
106 movl PARAM_SRC, %edi
107 pushl %esi FRAME_pushl()
109 movl PARAM_DIVISOR, %esi
110 orl %ecx,%ecx C size
112 jz L(size_zero)
113 pushl %ebx FRAME_pushl()
115 movl -4(%edi,%ecx,4), %eax C src high limb
116 xorl %edx, %edx
118 movl PARAM_DST, %ebx
119 pushl %ebp FRAME_pushl()
121 movl PARAM_XSIZE, %ebp
122 cmpl %esi, %eax
124 leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
125 jae L(integer_entry)
128 C high<divisor, so high of dst is zero, and avoid one div
130 movl %edx, (%ebx,%ecx,4)
131 decl %ecx
133 movl %eax, %edx
134 jz L(fraction)
137 L(integer_top):
138 C eax scratch (quotient)
139 C ebx dst+4*xsize-4
140 C ecx counter
141 C edx scratch (remainder)
142 C esi divisor
143 C edi src
144 C ebp xsize
146 movl -4(%edi,%ecx,4), %eax
147 L(integer_entry):
149 divl %esi
151 movl %eax, (%ebx,%ecx,4)
152 loop L(integer_top)
155 L(fraction):
156 orl %ebp, %ecx
157 jz L(done)
159 movl PARAM_DST, %ebx
162 L(fraction_top):
163 C eax scratch (quotient)
164 C ebx dst
165 C ecx counter
166 C edx scratch (remainder)
167 C esi divisor
168 C edi
169 C ebp
171 xorl %eax, %eax
173 divl %esi
175 movl %eax, -4(%ebx,%ecx,4)
176 loop L(fraction_top)
179 L(done):
180 popl %ebp
181 movl %edx, %eax
182 popl %ebx
183 popl %esi
184 popl %edi
188 L(size_zero):
189 deflit(`FRAME',8)
190 movl PARAM_XSIZE, %ecx
191 xorl %eax, %eax
193 movl PARAM_DST, %edi
195 cld C better safe than sorry, see mpn/x86/README
198 stosl
200 popl %esi
201 popl %edi
203 EPILOGUE()