1 dnl Intel Pentium mpn_divexact_1 -- mpn by limb exact division.
3 dnl Copyright 2001, 2002, 2014 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any later version.
20 dnl or both in parallel, as here.
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`..
/config.m4
')
36 C P54: 24.5 (odd divisor), 30.5 (even divisor) cycles/limb
40 C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
43 C Plain divl is used for small sizes, since the inverse takes a while to
44 C set up. Multiplying works out faster for size>=3 when the divisor is odd,
45 C or size>=4 when the divisor is even. Actually on P55 size==2 for odd or
46 C size==3 for even are about the same speed for both divl or mul, but the
47 C former is used since it will use up less code cache.
49 C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
50 C expected. On P54 in the even case the shrdl pairing nonsense (see
51 C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
52 C further 1.5 slowdown for both odd and even.
54 defframe(PARAM_DIVISOR,16)
55 defframe(PARAM_SIZE, 12)
56 defframe(PARAM_SRC, 8)
57 defframe(PARAM_DST, 4)
59 dnl re-use parameter space
60 define(VAR_INVERSE,`PARAM_DST')
65 PROLOGUE
(mpn_divexact_1
)
68 movl PARAM_DIVISOR, %eax
71 pushl %esi FRAME_pushl()
72 push %edi FRAME_pushl()
78 addl %ecx, %eax C size if even, size+1 if odd
86 movl -4(%esi,%ecx,4), %eax
90 movl %eax, -4(%edi,%ecx,4)
103 movl PARAM_DIVISOR, %eax
107 ASSERT(nz, `orl %eax, %eax')
109 incl
%ecx C shift count
113 leal
1(%eax,%eax), %edx C d
114 andl
$127, %eax C d
/2, 7 bits
116 pushl
%ebx FRAME_pushl
()
117 pushl
%ebp FRAME_pushl
()
120 LEA( binvert_limb_table, %ebp)
121 movzbl (%eax,%ebp), %eax C inv 8 bits
123 movzbl binvert_limb_table
(%eax), %eax C inv
8 bits
126 movl %eax, %ebp C inv
127 addl %eax, %eax C 2*inv
129 imull %ebp, %ebp C inv*inv
131 imull %edx, %ebp C inv*inv*d
133 subl %ebp, %eax C inv = 2*inv - inv*inv*d
134 movl PARAM_SIZE, %ebx
137 addl %eax, %eax C 2*inv
139 imull %ebp, %ebp C inv*inv
141 imull %edx, %ebp C inv*inv*d
143 subl %ebp, %eax C inv = 2*inv - inv*inv*d
144 movl %edx, PARAM_DIVISOR C d without twos
146 leal (%esi,%ebx,4), %esi C src end
147 leal (%edi,%ebx,4), %edi C dst end
151 ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
152 pushl %eax FRAME_pushl()
153 imull PARAM_DIVISOR, %eax
155 popl %eax FRAME_popl()')
157 movl
%eax, VAR_INVERSE
158 xorl
%ebp, %ebp C initial carry bit
160 movl
(%esi,%ebx,4), %eax C src
low limb
161 orl
%ecx, %ecx C shift
163 movl
4(%esi,%ebx,4), %edx C src second limb
(for even
)
166 shrdl
( %cl, %edx, %eax)
175 C
ebx counter
, limbs
, negative
180 C
ebp carry bit
, 0 or -1
184 movl
(%esi,%ebx,4), %eax
192 imull VAR_INVERSE
, %eax
194 movl
%eax, (%edi,%ebx,4)
211 C
ebx counter
, limbs
, negative
216 C
ebp carry bit
, 0 or -1
220 subl
%ebp, %edx C carry bit
221 movl
-4(%esi,%ebx,4), %eax C src limb
223 movl
(%esi,%ebx,4), %ebp C
and one above it
225 shrdl
( %cl, %ebp, %eax)
227 subl
%edx, %eax C carry limb
232 imull VAR_INVERSE
, %eax
234 movl
%eax, -4(%edi,%ebx,4)
243 movl
-4(%esi), %eax C src
high limb
248 subl
%edx, %eax C no carry if division is exact
250 imull VAR_INVERSE
, %eax
252 movl
%eax, -4(%edi) C dst
high limb
253 nop C protect against cache bank clash