dnl Intel Pentium mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- mpn by limb exact division.
dnl Rearranged from mpn/x86/pentium/dive_1.asm by Marco Bodrato.
dnl Copyright 2001, 2002, 2011, 2014 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl   * the GNU Lesser General Public License as published by the Free
dnl     Software Foundation; either version 3 of the License, or (at your
dnl     option) any later version.
dnl
dnl or
dnl
dnl   * the GNU General Public License as published by the Free Software
dnl     Foundation; either version 2 of the License, or (at your option) any
dnl     later version.
dnl
dnl or both in parallel, as here.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl for more details.
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl see https://www.gnu.org/licenses/.
33 include(`..
/config.m4
')
C         divisor
C       odd   even
C P54:  24.5  30.5   cycles/limb
C P55:  23.0  28.0   cycles/limb
C Names the two entry points this file provides; each one has its own
C PROLOGUE further down.
41 MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
43 C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
44 C expected. On P54 in the even case the shrdl pairing nonsense (see
45 C mpn/x86/pentium/README) costs 1 cycle, but it's
not clear why there
's a
46 C further 1.5 slowdown for both odd and even.
C Named slots for the six arguments, dst first (4) through shift last (24),
C stepping by 4.  Presumably these are byte offsets from the stack pointer
C at function entry, kept correct across pushes by FRAME_pushl -- TODO
C confirm against the defframe definition, which is not in this file.
C mpn_bdiv_q_1 reads only the first four (dst, src, size, divisor);
C PARAM_INVERSE and PARAM_SHIFT are read only by mpn_pi1_bdiv_q_1.
48 defframe(PARAM_SHIFT, 24)
49 defframe(PARAM_INVERSE,20)
50 defframe(PARAM_DIVISOR,16)
51 defframe(PARAM_SIZE, 12)
52 defframe(PARAM_SRC, 8)
53 defframe(PARAM_DST, 4)
C VAR_INVERSE is an alias for the dst argument slot, not a new stack
C location.  The code below stores the inverse into it
C (movl %eax, VAR_INVERSE) and multiplies from it (imull VAR_INVERSE, %eax),
C so the dst pointer must already be held in a register by then.
55 dnl re-use parameter space
56 define(VAR_INVERSE,`PARAM_DST')
C mpn_bdiv_q_1 -- entry point that takes the raw divisor.
C
C What the visible code does: load the divisor, count a shift in %ecx,
C form the odd value d in %edx, fetch an 8-bit inverse of d from
C binvert_limb_table, refine it with two steps of inv = 2*inv - inv*inv*d,
C and write d (without its factors of two) back into the divisor slot.
C
C Register roles as far as this view shows:
C   eax  divisor, then table index, then inv
C   ebx  size (loaded part way through)
C   ecx  shift count
C   edx  d, the divisor with factors of two removed
C   ebp  table base, then scratch for inv*inv*d
C
C NOTE(review): the embedded original line numbers in this block skip
C (68, 71, 73, 77, ...), so instructions appear to be missing from this
C view.  As shown: %ecx is never initialised before the incl, nothing
C shifts %eax or loops to strip factors of two, three table lookup
C variants run back to back with no conditional around them, the second
C refinement step never copies inv into %ebp, the final ASSERT body has
C no compare, and there is no return, jump or EPILOGUE before the next
C PROLOGUE.  Verify every one of these against a pristine copy before
C relying on this text.
C
C The prototype comment below is cut short after size; the fourth
C argument is the divisor, read from PARAM_DIVISOR.
61 C mp_limb_t mpn_bdiv_q_1
(mp_ptr dst
, mp_srcptr src
, mp_size_t
size,
64 PROLOGUE
(mpn_bdiv_q_1
)
C Fetch the divisor; the ASSERT checks that it is nonzero.
68 movl PARAM_DIVISOR, %eax
71 ASSERT(nz, `orl %eax, %eax')
73 incl
%ecx C shift count
C edx = 2*eax + 1, which is odd by construction.  eax is then cut down to
C its low 7 bits to serve as the index into the 8-bit inverse table.
77 leal
1(%eax,%eax), %edx C d
78 andl
$127, %eax C d
/2, 7 bits
C Save the two registers used below as size counter and scratch.
80 pushl
%ebx FRAME_pushl
()
81 pushl
%ebp FRAME_pushl
()
C Table lookup, first variant: table address obtained through the LEA
C macro.  Presumably one of several build-time alternatives -- TODO
C confirm, the selecting conditionals are not visible here.
85 LEA( binvert_limb_table
, %ebp)
86 movzbl
(%eax,%ebp), %eax
C Table lookup, second variant: table address loaded through the global
C offset table.  L(here) is referenced but not defined in this view.
92 addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
94 movl binvert_limb_table@GOT(%ebp), %ebp
96 movzbl (%eax,%ebp), %eax
C Table lookup, third variant: table addressed directly by symbol.
101 movzbl binvert_limb_table(%eax), %eax C inv 8 bits
C First refinement step: inv = 2*inv - inv*inv*d.  Presumably this takes
C the 8-bit table value to 16 correct bits -- TODO confirm.
104 movl
%eax, %ebp C inv
105 addl
%eax, %eax C
2*inv
107 imull
%ebp, %ebp C inv
*inv
109 imull
%edx, %ebp C inv
*inv
*d
111 subl
%ebp, %eax C inv
= 2*inv
- inv
*inv
*d
C The size load is interleaved between the two refinement steps.
112 movl PARAM_SIZE
, %ebx
C Second refinement step, same formula.  NOTE(review): as shown, %ebp
C still holds inv*inv*d from the first step rather than inv, so the
C inline inv*inv comment does not match the visible instructions.
115 addl
%eax, %eax C
2*inv
117 imull
%ebp, %ebp C inv
*inv
119 imull
%edx, %ebp C inv
*inv
*d
121 subl
%ebp, %eax C inv
= 2*inv
- inv
*inv
*d
C Overwrite the divisor argument slot with the odd value d.
122 movl
%edx, PARAM_DIVISOR C d without twos
C Debug self-check: multiplies inv by the stored d with %eax saved and
C restored around it; the stated expectation is a product of 1 modulo the
C limb size.
124 ASSERT
(e
,` C expect d
*inv
== 1 mod 2^GMP_LIMB_BITS
125 pushl
%eax FRAME_pushl
()
126 imull PARAM_DIVISOR
, %eax
128 popl
%eax FRAME_popl
()')
134 C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
135 C mp_limb_t inverse, int shift)
137 PROLOGUE(mpn_pi1_bdiv_q_1)
140 movl PARAM_SHIFT
, %ecx
142 pushl
%ebx FRAME_pushl
()
143 pushl
%ebp FRAME_pushl
()
145 movl PARAM_SIZE
, %ebx
146 movl PARAM_INVERSE
, %eax
149 pushl
%esi FRAME_pushl
()
150 push %edi FRAME_pushl
()
154 movl
%eax, VAR_INVERSE
156 leal
(%esi,%ebx,4), %esi C src
end
157 leal
(%edi,%ebx,4), %edi C dst
end
161 xorl
%ebp, %ebp C initial carry bit
163 orl
%ecx, %ecx C shift
164 movl
(%esi,%ebx,4), %eax C src
low limb
167 xorl
%edx, %edx C initial carry limb
(for even
, if one
)
171 movl
(%esi,%ebx,4), %edx C src second limb
(for even
)
172 shrdl
( %cl, %edx, %eax)
180 C
ebx counter
, limbs
, negative
185 C
ebp carry bit
, 0 or -1
189 movl
(%esi,%ebx,4), %eax
197 imull VAR_INVERSE
, %eax
199 movl
%eax, (%edi,%ebx,4)
214 C
ebx counter
, limbs
, negative
219 C
ebp carry bit
, 0 or -1
223 subl
%ebp, %edx C carry bit
224 movl
-4(%esi,%ebx,4), %eax C src limb
226 movl
(%esi,%ebx,4), %ebp C
and one above it
228 shrdl
( %cl, %ebp, %eax)
230 subl
%edx, %eax C carry limb
235 imull VAR_INVERSE
, %eax
237 movl
%eax, -4(%edi,%ebx,4)
244 movl
-4(%esi), %eax C src
high limb
250 subl
%edx, %eax C no carry if division is exact
252 imull VAR_INVERSE
, %eax
254 movl
%eax, -4(%edi) C dst
high limb
255 nop C protect against cache bank clash