1 dnl Intel Pentium
-4 mpn_divrem_1
-- mpn by limb division.
3 dnl Copyright
1999-2004 Free Software Foundation
, Inc.
5 dnl
This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
8 dnl it under the terms of
either:
10 dnl
* the GNU Lesser General
Public License as published by the Free
11 dnl Software Foundation
; either version 3 of the License, or (at your
12 dnl option
) any later version.
16 dnl
* the GNU General
Public License as published by the Free Software
17 dnl Foundation
; either version 2 of the License, or (at your option) any
20 dnl
or both
in parallel
, as here.
22 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
23 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
24 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
27 dnl You should have received copies of the GNU General
Public License
and the
28 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
29 dnl see
https://www.gnu.
org/licenses
/.
31 include(`..
/config.m4
')
34 C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part.
37 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
38 C mp_srcptr src, mp_size_t size,
40 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
41 C mp_srcptr src, mp_size_t size,
42 C mp_limb_t divisor, mp_limb_t carry);
43 C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
44 C mp_srcptr src, mp_size_t size,
45 C mp_limb_t divisor, mp_limb_t inverse,
50 C The method and nomenclature follow part 8 of "Division by Invariant
51 C Integers using Multiplication" by Granlund and Montgomery, reference in
54 C "m" is written for what is m' in the paper
, and "d" for d_norm
, which
55 C won
't cause any confusion since it's only the normalized divisor that
's of
56 C any use in the code. "b" is written for 2^N, the size of a limb, N being
59 C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
60 C "n-d - q1*d". This rearrangement gives the same two-limb answer but lets
61 C us have just a psubq on the dependent chain.
63 C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here,
64 C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much.
68 C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
69 C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero
70 C carry
, since
in normal circumstances that will be a very rare event.
72 C The
test for skipping a division is branch free
(once
size>=1 is tested
).
73 C The store to the destination
high limb is
0 when a divide is skipped
, or
74 C if it
's not skipped then a copy of the src high limb is stored. The
75 C latter is in case src==dst.
77 C There's a
small bias towards expecting xsize
==0, by having code for
78 C xsize
==0 in a straight line
and xsize
!=0 under forward jumps.
82 C The
loop measures
32 cycles
, but the dependent chain would suggest it
83 C could be done with
30.
Not sure where to start looking for the extras.
87 C If the divisor is normalized
(high bit set
) then a division step can
88 C always be skipped
, since the
high destination limb is always
0 or 1 in
89 C that case. It doesn
't seem worth checking for this though, since it
90 C probably occurs infrequently.
93 dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
94 dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
96 dnl The inverse takes about 80-90 cycles to calculate, but after that the
97 dnl multiply is 32 c/l versus division at about 58 c/l.
99 dnl At 4 limbs the div is a touch faster than the mul (and of course
100 dnl simpler), so start the mul from 5 limbs.
102 deflit(MUL_THRESHOLD, 5)
105 defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1
106 defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1
107 defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c
108 defframe(PARAM_DIVISOR,20)
109 defframe(PARAM_SIZE, 16)
110 defframe(PARAM_SRC, 12)
111 defframe(PARAM_XSIZE, 8)
112 defframe(PARAM_DST, 4)
114 dnl re-use parameter space
115 define(SAVE_ESI,`PARAM_SIZE')
116 define
(SAVE_EBP
,`PARAM_SRC
')
117 define(SAVE_EDI,`PARAM_DIVISOR')
118 define
(SAVE_EBX
,`PARAM_DST
')
123 PROLOGUE(mpn_preinv_divrem_1)
126 movl PARAM_SIZE
, %ecx
127 xorl
%edx, %edx C carry if can
't skip a div
133 movl PARAM_DIVISOR, %ebp
138 movl -4(%esi,%ecx,4), %eax C src high limb
141 movl PARAM_XSIZE, %ebx
143 movd PARAM_PREINV_INVERSE, %mm4
145 movd PARAM_PREINV_SHIFT, %mm7 C l
146 cmpl %ebp, %eax C high cmp divisor
148 cmovc( %eax, %edx) C high is carry if high<divisor
149 movd %edx, %mm0 C carry
151 movd %edx, %mm1 C carry
155 cmovnc( %eax, %edx) C 0 if skip div, src high if not
156 C (the latter in case src==dst)
157 leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
159 movl %edx, (%edi,%ecx,4) C dst high limb
160 sbbl $0, %ecx C skip one division if high<divisor
163 subl PARAM_PREINV_SHIFT, %eax
164 psllq %mm7, %mm5 C d normalized
165 leal (%edi,%ecx,4), %edi C &dst[xsize+size-1]
166 leal -4(%esi,%ecx,4), %esi C &src[size-1]
168 movd %eax, %mm6 C 32-l
175 PROLOGUE(mpn_divrem_1c)
178 movl PARAM_CARRY
, %edx
180 movl PARAM_SIZE
, %ecx
186 movl PARAM_DIVISOR
, %ebp
192 movl PARAM_XSIZE
, %ebx
194 leal
-4(%edi,%ebx,4), %edi C
&dst
[xsize
-1]
201 PROLOGUE
(mpn_divrem_1
)
204 movl PARAM_SIZE, %ecx
205 xorl %edx, %edx C initial carry (if can't skip a
div)
211 movl PARAM_DIVISOR
, %ebp
217 movl PARAM_XSIZE
, %ebx
218 leal
-4(%edi,%ebx,4), %edi C
&dst
[xsize
-1]
220 orl
%ecx, %ecx C
size
221 jz L
(no_skip_div
) C if
size==0
222 movl
-4(%esi,%ecx,4), %eax C src
high limb
224 cmpl
%ebp, %eax C
high cmp divisor
226 cmovnc
( %eax, %edx) C
0 if skip
div, src
high if
not
227 movl
%edx, (%edi,%ecx,4) C dst
high limb
230 cmovc
( %eax, %edx) C
high is carry if
high<divisor
232 sbbl
$0, %ecx C
size-1 if
high<divisor
245 leal
(%ebx,%ecx), %eax C
size+xsize
246 leal
-4(%esi,%ecx,4), %esi C
&src
[size-1]
247 leal
(%edi,%ecx,4), %edi C
&dst
[size+xsize
-1]
249 cmpl $MUL_THRESHOLD
, %eax
250 jae L
(mul_by_inverse
)
254 jz L
(divide_no_integer
) C if
size==0
257 C
eax scratch
(quotient
)
261 C
esi src
, decrementing
262 C
edi dst
, decrementing
274 jnz L
(divide_integer
)
277 L
(divide_no_integer
):
279 jnz L
(divide_fraction
) C if xsize
!=0
291 C
eax scratch
(quotient
)
296 C
edi dst
, decrementing
307 jnz L
(divide_fraction
)
313 C
-----------------------------------------------------------------------------
321 C
edi &dst
[size+xsize
-1]
324 bsrl
%ebp, %eax C
31-l
325 movd
%edx, %mm0 C carry
326 movd
%edx, %mm1 C carry
327 movl
%ecx, %edx C
size
332 xorl
%eax, %ecx C l
= leading zeros on d
335 shll
%cl, %ebp C d normalized
337 movl
%edx, %ecx C
size
339 movd
%eax, %mm6 C
32-l
345 subl
%ebp, %edx C
(b
-d
)-1 so
edx:eax = b
*(b
-d
)-1
347 divl
%ebp C floor
(b
*(b
-d
)-1 / d
)
361 C
edi &dst
[size+xsize
-1]
372 psllq
%mm7
, %mm0 C n2
= carry
<< l
, for
size==0
377 movd
(%esi), %mm0 C src
high limb
379 psrlq
%mm6
, %mm0 C n2
= high (carry:srchigh
<< l
)
383 C The dependent chain here consists of
386 C
8 pmuludq m
*(n1
+n2
)
387 C
2 paddq
n2:nadj
+ m
*(n1
+n2
)
391 C
2 psrlq
high n
-(q1
+1)*d
mask
393 C
2 paddd n2
+d addback
397 C But it seems to run at
32 cycles
, so presumably there
's something else
404 C ecx counter, size-1 to 0
406 C esi src, decrementing
407 C edi dst, decrementing
420 movd
-4(%esi), %mm1 C next src limbs
425 psrlq
%mm6
, %mm1 C n10
427 movq
%mm1
, %mm2 C n10
428 movq
%mm1
, %mm3 C n10
429 psrad
$31, %mm1 C
-n1
430 pand
%mm5
, %mm1 C
-n1
& d
431 paddd
%mm2
, %mm1 C nadj
= n10
+(-n1
&d
), ignore overflow
434 paddd
%mm0
, %mm2 C n2
+n1
435 punpckldq
%mm0
, %mm1 C
n2:nadj
437 pmuludq
%mm4
, %mm2 C m
*(n2
+n1
)
441 paddq
%mm2
, %mm1 C
n2:nadj
+ m
*(n2
+n1
)
442 pxor
%mm2
, %mm2 C break dependency
, saves
4 cycles
443 pcmpeqd
%mm2
, %mm2 C FF...FF
446 psrlq
$32, %mm1 C q1
= high(n2:nadj
+ m
*(n2
+n1
))
448 paddd
%mm1
, %mm2 C q1
+1
449 pmuludq
%mm5
, %mm1 C q1
*d
451 punpckldq
%mm0
, %mm3 C n
= n2:n10
454 psubq
%mm5
, %mm3 C n
- d
458 psubq
%mm1
, %mm3 C n
- (q1
+1)*d
460 por
%mm3
, %mm0 C copy remainder
-> new n2
461 psrlq
$32, %mm3 C
high n
- (q1
+1)*d
, 0 or -1
469 pand %mm5, %mm3 C mask & d
471 paddd %mm3, %mm0 C addback if necessary
498 movd
(%esi), %mm1 C src
[0]
499 psllq
%mm7
, %mm1 C n10
501 movq
%mm1
, %mm2 C n10
502 movq
%mm1
, %mm3 C n10
503 psrad
$31, %mm1 C
-n1
504 pand
%mm5
, %mm1 C
-n1
& d
505 paddd
%mm2
, %mm1 C nadj
= n10
+(-n1
&d
), ignore overflow
508 paddd
%mm0
, %mm2 C n2
+n1
509 punpckldq
%mm0
, %mm1 C
n2:nadj
511 pmuludq
%mm4
, %mm2 C m
*(n2
+n1
)
515 paddq
%mm2
, %mm1 C
n2:nadj
+ m
*(n2
+n1
)
516 pcmpeqd
%mm2
, %mm2 C FF...FF
519 psrlq
$32, %mm1 C q1
= high(n2:nadj
+ m
*(n2
+n1
))
520 paddd
%mm1
, %mm2 C q1
522 pmuludq
%mm5
, %mm1 C q1
*d
523 punpckldq
%mm0
, %mm3 C n
524 psubq
%mm5
, %mm3 C n
- d
529 psubq
%mm1
, %mm3 C n
- (q1
+1)*d
531 por
%mm3
, %mm0 C remainder
-> n2
532 psrlq
$32, %mm3 C
high n
- (q1
+1)*d
, 0 or -1
540 pand %mm5, %mm3 C mask & d
542 paddd %mm3, %mm0 C addback if necessary
552 jnz L(fraction_some) C if xsize!=0
557 psrld %mm7, %mm0 C remainder
569 C -----------------------------------------------------------------------------
584 C ebx counter, xsize iterations
587 C esi src, decrementing
588 C edi dst, decrementing
602 pmuludq
%mm4
, %mm0 C m
*n2
609 psrlq
$32, %mm0 C
high(m
*n2
)
611 paddd
%mm1
, %mm0 C q1
= high(n2:0 + m
*n2
)
613 paddd
%mm0
, %mm2 C q1
+1
614 pmuludq
%mm5
, %mm0 C q1
*d
616 psllq
$32, %mm1 C n
= n2:0
617 psubq
%mm5
, %mm1 C n
- d
621 psubq
%mm0
, %mm1 C r
= n
- (q1
+1)*d
624 por
%mm1
, %mm0 C r
-> n2
625 psrlq
$32, %mm1 C
high n
- (q1
+1)*d
, 0 or -1
633 pand %mm5, %mm1 C mask & d
635 paddd %mm1, %mm0 C addback if necessary