dnl  AMD K7 mpn_bdiv_q_1 -- mpn by limb exact division.

dnl  Rearranged from mpn/x86/k7/dive_1.asm by Marco Bodrato.

dnl  Copyright 2001, 2002, 2004, 2007, 2011 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')
C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                      mp_limb_t divisor);
C
C The dependent chain is mul+imul+sub for 11 cycles and that speed is
C achieved with no special effort.  The load and shrdl latencies are hidden
C by out of order execution.
C
C It's a touch faster on size==1 to use the mul-by-inverse than divl.
C Stack frame layout shared by the entry points in this file.
C
C Positive offsets name the incoming arguments, in the order of the
C mpn_pi1_bdiv_q_1 prototype: dst, src, size, divisor, inverse, shift.
defframe(PARAM_SHIFT,  24)
defframe(PARAM_INVERSE,20)
defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC,    8)
defframe(PARAM_DST,    4)

C Negative offsets name six 4-byte local slots: four register save slots
C and two working variables.
defframe(SAVE_EBX,     -4)
defframe(SAVE_ESI,     -8)
defframe(SAVE_EDI,    -12)
defframe(SAVE_EBP,    -16)
defframe(VAR_INVERSE, -20)
defframe(VAR_DST_END, -24)

C Bytes subtracted from esp on entry, enough for the six slots above.
deflit(STACK_SPACE, 24)
69 C mpn_pi1_bdiv_q_1
(mp_ptr dst
, mp_srcptr src
, mp_size_t
size, mp_limb_t divisor
,
70 C mp_limb_t inverse
, int shift
)
72 PROLOGUE
(mpn_pi1_bdiv_q_1
)
75 subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE
)
76 movl PARAM_SHIFT
, %ecx C shift count
89 leal
(%esi,%ebp,4), %esi C src
end
90 leal
(%edi,%ebp,4), %edi C dst
end
93 movl PARAM_INVERSE
, %eax C inv
96 movl
%eax, VAR_INVERSE
97 movl
(%esi,%ebp,4), %eax C src
[0]
102 movl
(%esi,%ebp,4), %edx C src
[1]
104 shrdl
( %cl, %edx, %eax)
106 movl
%edi, VAR_DST_END
113 C
ebx carry bit
, 0 or 1
118 C
ebp counter
, limbs
, negative
120 mull PARAM_DIVISOR C carry limb
in edx
122 movl
-4(%esi,%ebp,4), %eax
123 movl
(%esi,%ebp,4), %edi
125 shrdl
( %cl, %edi, %eax)
127 subl
%ebx, %eax C apply carry bit
129 movl VAR_DST_END
, %edi
131 subl
%edx, %eax C apply carry limb
135 imull VAR_INVERSE
, %eax
137 movl
%eax, -4(%edi,%ebp,4)
142 mull PARAM_DIVISOR C carry limb
in edx
144 movl
-4(%esi), %eax C src
high limb
148 subl
%ebx, %eax C apply carry bit
152 subl
%edx, %eax C apply carry limb
154 imull VAR_INVERSE
, %eax
158 addl $STACK_SPACE
, %esp
167 imull VAR_INVERSE
, %eax
173 addl $STACK_SPACE
, %esp
C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                         mp_limb_t divisor);
183 PROLOGUE
(mpn_bdiv_q_1
)
186 movl PARAM_DIVISOR, %eax
187 subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE
)
188 movl $
-1, %ecx C shift count
191 movl PARAM_SIZE
, %ebp
196 C If there
's usually only one or two trailing zero bits then this
197 C should be faster than bsfl.
204 leal 1(%eax,%eax), %ebx C d without twos
205 andl $127, %eax C d/2, 7 bits
208 LEA( binvert_limb_table
, %edx)
209 movzbl
(%eax,%edx), %eax C inv
8 bits
211 movzbl binvert_limb_table(%eax), %eax C inv 8 bits
214 leal
(%eax,%eax), %edx C
2*inv
215 movl
%ebx, PARAM_DIVISOR C d without twos
217 imull
%eax, %eax C inv
*inv
222 imull
%ebx, %eax C inv
*inv
*d
224 subl
%eax, %edx C inv
= 2*inv
- inv
*inv
*d
225 leal
(%edx,%edx), %eax C
2*inv
227 imull
%edx, %edx C inv
*inv
229 leal
(%esi,%ebp,4), %esi C src
end
230 leal
(%edi,%ebp,4), %edi C dst
end
233 imull
%ebx, %edx C inv
*inv
*d
235 subl
%edx, %eax C inv
= 2*inv
- inv
*inv
*d
237 ASSERT
(e
,` C expect d
*inv
== 1 mod 2^GMP_LIMB_BITS
238 pushl
%eax FRAME_pushl
()
239 imull PARAM_DIVISOR
, %eax
241 popl
%eax FRAME_popl
()')