beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / pentium / bdiv_q_1.asm
blobc2c4f58b6d7a92c2e02cec068f30010bbcf5b5c3
1 dnl Intel Pentium mpn_divexact_1 -- mpn by limb exact division.
3 dnl Rearranged from mpn/x86/pentium/dive_1.asm by Marco Bodrato.
5 dnl Copyright 2001, 2002, 2011, 2014 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
C           divisor
C         odd    even
C P54:   24.5   30.5   cycles/limb
C P55:   23.0   28.0   cycles/limb
C Both entry points below are provided by this one file.
MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)

C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
C expected.  On P54 in the even case the shrdl pairing nonsense (see
C mpn/x86/pentium/README) costs 1 cycle, but it is not clear why there is a
C further 1.5 slowdown for both odd and even.

C Stack argument offsets, listed in argument order.  The last two slots
C are only passed to mpn_pi1_bdiv_q_1; mpn_bdiv_q_1 takes the first four.
defframe(PARAM_DST,     4)
defframe(PARAM_SRC,     8)
defframe(PARAM_SIZE,   12)
defframe(PARAM_DIVISOR,16)
defframe(PARAM_INVERSE,20)
defframe(PARAM_SHIFT,  24)

dnl  The dst argument slot is re-used to hold the inverse once dst has been
dnl  loaded into a register.
define(VAR_INVERSE,`PARAM_DST')
58 TEXT
60 ALIGN(32)
61 C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
62 C mp_limb_t divisor);
64 PROLOGUE(mpn_bdiv_q_1)
65 deflit(`FRAME',0)
67 movl $-1, %ecx
68 movl PARAM_DIVISOR, %eax
70 L(strip_twos):
71 ASSERT(nz, `orl %eax, %eax')
72 shrl %eax
73 incl %ecx C shift count
75 jnc L(strip_twos)
77 leal 1(%eax,%eax), %edx C d
78 andl $127, %eax C d/2, 7 bits
80 pushl %ebx FRAME_pushl()
81 pushl %ebp FRAME_pushl()
83 ifdef(`PIC',`
84 ifdef(`DARWIN',`
85 LEA( binvert_limb_table, %ebp)
86 movzbl (%eax,%ebp), %eax
87 ',`
88 call L(here)
89 L(here):
90 popl %ebp C eip
92 addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
93 C AGI
94 movl binvert_limb_table@GOT(%ebp), %ebp
95 C AGI
96 movzbl (%eax,%ebp), %eax
98 ',`
100 dnl non-PIC
101 movzbl binvert_limb_table(%eax), %eax C inv 8 bits
104 movl %eax, %ebp C inv
105 addl %eax, %eax C 2*inv
107 imull %ebp, %ebp C inv*inv
109 imull %edx, %ebp C inv*inv*d
111 subl %ebp, %eax C inv = 2*inv - inv*inv*d
112 movl PARAM_SIZE, %ebx
114 movl %eax, %ebp
115 addl %eax, %eax C 2*inv
117 imull %ebp, %ebp C inv*inv
119 imull %edx, %ebp C inv*inv*d
121 subl %ebp, %eax C inv = 2*inv - inv*inv*d
122 movl %edx, PARAM_DIVISOR C d without twos
124 ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
125 pushl %eax FRAME_pushl()
126 imull PARAM_DIVISOR, %eax
127 cmpl $1, %eax
128 popl %eax FRAME_popl()')
130 jmp L(common)
131 EPILOGUE()
C mp_limb_t
C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C		    mp_limb_t inverse, int shift)
C
C Entry point for callers that already have the inverse and the shift.
C The limb loops multiply by PARAM_DIVISOR as passed and shift the source
C right by the shift argument as it is read.
C NOTE(review): this presumably expects the divisor argument to have its
C low zero bits already removed, as mpn_bdiv_q_1 arranges before jumping
C to L(common) -- confirm against the callers.
C
C State required at L(common), which mpn_bdiv_q_1 also jumps to:
C
C   eax  inverse
C   ebx  size
C   ecx  shift
C   ebx and ebp already pushed, FRAME accounting for those two pushes
C
C A zero shift selects the odd loop, which needs no source shifting.  A
C non-zero shift selects the even loop, which combines each source limb
C with the one above it using shrdl.  The even loop stops one limb early
C and the high limb is finished at L(one) with a plain shrl, since there
C is no limb above it to shift in.

	ALIGN(32)
PROLOGUE(mpn_pi1_bdiv_q_1)
deflit(`FRAME',0)

	movl	PARAM_SHIFT, %ecx

	pushl	%ebx	FRAME_pushl()
	pushl	%ebp	FRAME_pushl()

	movl	PARAM_SIZE, %ebx
	movl	PARAM_INVERSE, %eax

L(common):
	pushl	%esi	FRAME_pushl()
	pushl	%edi	FRAME_pushl()

	movl	PARAM_SRC, %esi
	movl	PARAM_DST, %edi
	movl	%eax, VAR_INVERSE	C overwrites the dst slot, dst is in edi now

	leal	(%esi,%ebx,4), %esi	C src end
	leal	(%edi,%ebx,4), %edi	C dst end

	negl	%ebx			C -size

	xorl	%ebp, %ebp		C initial carry bit

	orl	%ecx, %ecx		C shift, sets the zero flag for the jz below
	movl	(%esi,%ebx,4), %eax	C src low limb, movl leaves the flags alone
	jz	L(odd_entry)

	xorl	%edx, %edx		C initial carry limb (for even, if one)
	incl	%ebx
	jz	L(one)			C size is 1, only the high limb to do

	movl	(%esi,%ebx,4), %edx	C src second limb (for even)
	shrdl(	%cl, %edx, %eax)

	jmp	L(even_entry)


	ALIGN(8)
L(odd_top):
	C eax	scratch
	C ebx	counter, limbs, negative
	C ecx
	C edx
	C esi	src end
	C edi	dst end
	C ebp	carry bit, 0 or -1

	mull	PARAM_DIVISOR		C edx = high limb of previous quotient limb * d

	movl	(%esi,%ebx,4), %eax
	subl	%ebp, %edx		C ebp is 0 or -1, so this adds the carry bit

	subl	%edx, %eax

	sbbl	%ebp, %ebp		C new carry bit, 0 or -1

L(odd_entry):
	imull	VAR_INVERSE, %eax

	movl	%eax, (%edi,%ebx,4)

	incl	%ebx
	jnz	L(odd_top)

	popl	%edi
	popl	%esi

	popl	%ebp
	popl	%ebx

	ret				C must not fall through into the even loop


L(even_top):
	C eax	scratch
	C ebx	counter, limbs, negative
	C ecx	twos
	C edx
	C esi	src end
	C edi	dst end
	C ebp	carry bit, 0 or -1

	mull	PARAM_DIVISOR

	subl	%ebp, %edx		C carry bit
	movl	-4(%esi,%ebx,4), %eax	C src limb

	movl	(%esi,%ebx,4), %ebp	C and one above it

	shrdl(	%cl, %ebp, %eax)

	subl	%edx, %eax		C carry limb

	sbbl	%ebp, %ebp

L(even_entry):
	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi,%ebx,4)
	incl	%ebx

	jnz	L(even_top)

	C ebx is zero here, only the high limb remains.
	mull	PARAM_DIVISOR

	movl	-4(%esi), %eax		C src high limb
	subl	%ebp, %edx

L(one):
	shrl	%cl, %eax

	subl	%edx, %eax		C no carry if division is exact

	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi)		C dst high limb
	nop				C protect against cache bank clash

	popl	%edi
	popl	%esi

	popl	%ebp
	popl	%ebx

	ret

EPILOGUE()
ASM_END()