beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / pentium / dive_1.asm
blob21b5287ca5fa138602cfbfe9fb706a4284f488dc
1 dnl Intel Pentium mpn_divexact_1 -- mpn by limb exact division.
3 dnl Copyright 2001, 2002, 2014 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
C            divisor
C          odd    even
C P54:    24.5    30.5   cycles/limb
C P55:    23.0    28.0


C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                      mp_limb_t divisor);
C
C Plain divl is used for small sizes, since the inverse takes a while to
C setup.  Multiplying works out faster for size>=3 when the divisor is odd,
C or size>=4 when the divisor is even.  Actually on P55 size==2 for odd or
C size==3 for even are about the same speed for both divl or mul, but the
C former is used since it will use up less code cache.
C
C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
C expected.  On P54 in the even case the shrdl pairing nonsense (see
C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
C further 1.5 slowdown for both odd and even.
C
C Overview of the code below:
C
C   1. Dispatch: (divisor & 1) + size is compared against 4, which selects
C      the multiply-by-inverse path for size>=3 (odd divisor) or size>=4
C      (even divisor), and the plain divl loop otherwise.
C   2. divl loop: limbs are processed from the high end down to the low
C      end, with edx carrying the remainder from one divl to the next.
C   3. mul_by_inverse: trailing zero bits are stripped from the divisor
C      (the count is kept in ecx), an 8-bit inverse of the odd part is
C      read from binvert_limb_table, and two steps of
C      inv = 2*inv - inv*inv*d refine it.  Limbs are then processed from
C      the low end up, in a separate loop for odd and for even divisors.
C
C Stack usage: the arguments are addressed through the PARAM_* offsets
C defined below.  The dst slot is reused to hold the inverse
C (VAR_INVERSE) once dst has been loaded into edi, and on the
C mul_by_inverse path the divisor slot is overwritten with the divisor
C stripped of its twos.
C
C Registers: esi and edi are pushed on entry, ebx and ebp additionally on
C the mul_by_inverse path.  Every exit path pops exactly what it pushed
C and then executes its own ret.

defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC,    8)
defframe(PARAM_DST,    4)

dnl  re-use parameter space
define(VAR_INVERSE,`PARAM_DST')

	TEXT

	ALIGN(32)
PROLOGUE(mpn_divexact_1)
deflit(`FRAME',0)

	movl	PARAM_DIVISOR, %eax
	movl	PARAM_SIZE, %ecx

	pushl	%esi		FRAME_pushl()
	pushl	%edi		FRAME_pushl()

	movl	PARAM_SRC, %esi
	andl	$1, %eax		C 1 if divisor odd, 0 if even

	movl	PARAM_DST, %edi
	addl	%ecx, %eax	C size if even, size+1 if odd

	cmpl	$4, %eax
	jae	L(mul_by_inverse)


C Small sizes: ordinary long division from the high limb downwards.
C ecx counts remaining limbs, edx holds the running remainder.

	xorl	%edx, %edx		C no remainder above the high limb
L(div_top):
	movl	-4(%esi,%ecx,4), %eax	C src limb, index ecx-1

	divl	PARAM_DIVISOR		C edx:eax / d, remainder left in edx

	movl	%eax, -4(%edi,%ecx,4)	C quotient limb to dst
	decl	%ecx

	jnz	L(div_top)

	popl	%edi
	popl	%esi

	ret



L(mul_by_inverse):
	movl	PARAM_DIVISOR, %eax
	movl	$-1, %ecx		C incremented once per shift below

C Shift right until a 1 bit drops into the carry flag.  On exit ecx is
C the number of zero bits stripped and eax is the odd part of d, halved.

L(strip_twos):
	ASSERT(nz, `orl	%eax, %eax')
	shrl	%eax
	incl	%ecx			C shift count

	jnc	L(strip_twos)

	leal	1(%eax,%eax), %edx	C d
	andl	$127, %eax		C d/2, 7 bits

	pushl	%ebx		FRAME_pushl()
	pushl	%ebp		FRAME_pushl()

C Exactly one of the two table loads below is assembled, chosen by PIC.

ifdef(`PIC',`dnl
	LEA(	binvert_limb_table, %ebp)
	movzbl	(%eax,%ebp), %eax		C inv 8 bits
',`
	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
')

C First refinement step of the inverse.

	movl	%eax, %ebp		C inv
	addl	%eax, %eax		C 2*inv

	imull	%ebp, %ebp		C inv*inv

	imull	%edx, %ebp		C inv*inv*d

	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
	movl	PARAM_SIZE, %ebx

C Second refinement step of the inverse.

	movl	%eax, %ebp
	addl	%eax, %eax		C 2*inv

	imull	%ebp, %ebp		C inv*inv

	imull	%edx, %ebp		C inv*inv*d

	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
	movl	%edx, PARAM_DIVISOR	C d without twos

	leal	(%esi,%ebx,4), %esi	C src end
	leal	(%edi,%ebx,4), %edi	C dst end

	negl	%ebx			C -size

	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
	pushl	%eax	FRAME_pushl()
	imull	PARAM_DIVISOR, %eax
	cmpl	$1, %eax
	popl	%eax	FRAME_popl()')

	movl	%eax, VAR_INVERSE	C dst is already in edi, slot is free
	xorl	%ebp, %ebp		C initial carry bit

	movl	(%esi,%ebx,4), %eax	C src low limb
	orl	%ecx, %ecx		C shift

	movl	4(%esi,%ebx,4), %edx	C src second limb (for even)
	jz	L(odd_entry)		C no twos stripped, divisor was odd

	shrdl(	%cl, %edx, %eax)	C low limb with the twos shifted out

	incl	%ebx			C even loop indexes one limb behind
	jmp	L(even_entry)


	ALIGN(8)
L(odd_top):
	C eax	scratch
	C ebx	counter, limbs, negative
	C ecx
	C edx
	C esi	src end
	C edi	dst end
	C ebp	carry bit, 0 or -1

	mull	PARAM_DIVISOR		C edx = high limb of quotient limb * d

	movl	(%esi,%ebx,4), %eax	C next src limb
	subl	%ebp, %edx		C ebp is 0 or -1, so this adds the carry bit

	subl	%edx, %eax		C src limb minus carry limb

	sbbl	%ebp, %ebp		C borrow from that subtract, as 0 or -1

L(odd_entry):
	imull	VAR_INVERSE, %eax	C quotient limb

	movl	%eax, (%edi,%ebx,4)

	incl	%ebx
	jnz	L(odd_top)


	popl	%ebp
	popl	%ebx

	popl	%edi
	popl	%esi

	ret				C must not fall through into L(even_top)


L(even_top):
	C eax	scratch
	C ebx	counter, limbs, negative
	C ecx	twos
	C edx
	C esi	src end
	C edi	dst end
	C ebp	carry bit, 0 or -1

	mull	PARAM_DIVISOR		C edx = high limb of quotient limb * d

	subl	%ebp, %edx		C carry bit
	movl	-4(%esi,%ebx,4), %eax	C src limb

	movl	(%esi,%ebx,4), %ebp	C and one above it

	shrdl(	%cl, %ebp, %eax)	C src limb with the twos shifted out

	subl	%edx, %eax		C carry limb

	sbbl	%ebp, %ebp		C borrow from that subtract, as 0 or -1

L(even_entry):
	imull	VAR_INVERSE, %eax	C quotient limb

	movl	%eax, -4(%edi,%ebx,4)
	incl	%ebx

	jnz	L(even_top)


C High limb: there is no limb above it to shift in, so a plain shrl is
C used instead of shrdl.

	mull	PARAM_DIVISOR

	movl	-4(%esi), %eax		C src high limb
	subl	%ebp, %edx

	shrl	%cl, %eax

	subl	%edx, %eax		C no carry if division is exact

	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi)		C dst high limb
	nop				C protect against cache bank clash

	popl	%ebp
	popl	%ebx

	popl	%edi
	popl	%esi

	ret

EPILOGUE()
ASM_END()