dnl  AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb
dnl  division.
dnl  Copyright 1999-2002, 2004 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
32 include(`..
/config.m4
')
35 C K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part.
38 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
39 C mp_srcptr src, mp_size_t size,
41 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
42 C mp_srcptr src, mp_size_t size,
43 C mp_limb_t divisor, mp_limb_t carry);
44 C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
45 C mp_srcptr src, mp_size_t size,
46 C mp_limb_t divisor, mp_limb_t inverse,
C The method and nomenclature follow part 8 of "Division by Invariant
C Integers using Multiplication" by Granlund and Montgomery, referenced in
C gmp.texi.
C
C The "and"s shown in the paper are done here with "cmov"s.  "m" is written
C for m', and "d" for d_norm, which won't cause any confusion since it's
C only the normalized divisor that's of any use in the code.  "b" is written
C for 2^N, the size of a limb, N being 32 here.
60 C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
61 C "n-(q1+1)*d"; this rearrangement gives the same two-limb answer. If
62 C q1==0xFFFFFFFF, then q1+1 would overflow. We branch to a special case
63 C "q1_ff" if this occurs. Since the true quotient is either q1 or q1+1 then
64 C if q1==0xFFFFFFFF that must be the right value.
66 C For the last and second last steps q1==0xFFFFFFFF is instead handled by an
67 C sbbl to go back to 0xFFFFFFFF if an overflow occurs when adding 1. This
68 C then goes through as normal, and finding no addback required. sbbl costs
69 C an extra cycle over what the main loop code does, but it keeps code size
70 C and complexity down.
C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
C limb is less than the divisor.  mpn_divrem_1c doesn't check for a zero
C carry, since in normal circumstances that will be a very rare event.
C The test for skipping a division is branch free (once size>=1 is tested).
C The store to the destination high limb is 0 when a divide is skipped, or
C if it's not skipped then a copy of the src high limb is used.  The latter
C is in case src==dst.
C There's a small bias towards expecting xsize==0, by having code for
C xsize==0 in a straight line and xsize!=0 under forward jumps.
C If the divisor is normalized (high bit set) then a division step can
C always be skipped, since the high destination limb is always 0 or 1 in
C that case.  It doesn't seem worth checking for this though, since it
C probably occurs infrequently, in particular note that big_base for a
C decimal mpn_get_str is not normalized in a 32-bit limb.
95 dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
96 dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
98 dnl The inverse takes about 50 cycles to calculate, but after that the
99 dnl multiply is 17 c/l versus division at 42 c/l.
101 dnl At 3 limbs the mul is a touch faster than div on the integer part, and
102 dnl even more so on the fractional part.
104 deflit(MUL_THRESHOLD, 3)
107 defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1
108 defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1
109 defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c
110 defframe(PARAM_DIVISOR,20)
111 defframe(PARAM_SIZE, 16)
112 defframe(PARAM_SRC, 12)
113 defframe(PARAM_XSIZE, 8)
114 defframe(PARAM_DST, 4)
116 defframe(SAVE_EBX, -4)
117 defframe(SAVE_ESI, -8)
118 defframe(SAVE_EDI, -12)
119 defframe(SAVE_EBP, -16)
121 defframe(VAR_NORM, -20)
122 defframe(VAR_INVERSE, -24)
123 defframe(VAR_SRC, -28)
124 defframe(VAR_DST, -32)
125 defframe(VAR_DST_STOP,-36)
127 deflit(STACK_SPACE, 36)
132 PROLOGUE(mpn_preinv_divrem_1)
134 movl PARAM_XSIZE
, %ecx
136 subl $STACK_SPACE
, %esp FRAME_subl_esp
(STACK_SPACE
)
142 movl PARAM_SIZE
, %ebx
144 leal
8(%edx,%ecx,4), %edx C
&dst
[xsize
+2]
146 movl PARAM_DIVISOR
, %ebp
148 movl
%edx, VAR_DST_STOP C
&dst
[xsize
+2]
150 xorl
%edi, %edi C carry
152 movl
-4(%esi,%ebx,4), %eax C src
high limb
159 cmpl
%ebp, %eax C
high cmp divisor
161 cmovc
( %eax, %edi) C
high is carry if
high<divisor
162 cmovnc
( %eax, %ecx) C
0 if skip
div, src
high if
not
163 C
(the latter
in case src
==dst
)
165 movl
%ecx, -12(%edx,%ebx,4) C dst
high limb
166 sbbl
$0, %ebx C skip one division if
high<divisor
167 movl PARAM_PREINV_SHIFT
, %ecx
169 leal
-8(%edx,%ebx,4), %edx C
&dst
[xsize
+size]
172 movl
%edx, VAR_DST C
&dst
[xsize
+size]
174 shll
%cl, %ebp C d normalized
178 movd
%eax, %mm7 C rshift
179 movl PARAM_PREINV_INVERSE
, %eax
187 PROLOGUE
(mpn_divrem_1c
)
189 movl PARAM_CARRY, %edx
190 movl PARAM_SIZE, %ecx
191 subl $STACK_SPACE, %esp
192 deflit(`FRAME',STACK_SPACE
)
195 movl PARAM_XSIZE
, %ebx
201 movl PARAM_DIVISOR
, %ebp
206 leal
-4(%edi,%ebx,4), %edi C
&dst
[xsize
-1]
212 C
offset 0xa1, close enough to aligned
213 PROLOGUE
(mpn_divrem_1
)
216 movl PARAM_SIZE, %ecx
217 movl $0, %edx C initial carry (if can't skip a
div)
218 subl $STACK_SPACE
, %esp
219 deflit
(`FRAME
',STACK_SPACE)
225 movl PARAM_XSIZE, %ebx
228 movl PARAM_DIVISOR, %ebp
229 orl %ecx, %ecx C size
233 leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
235 jz L(no_skip_div) C if size==0
236 movl -4(%esi,%ecx,4), %eax C src high limb
239 cmpl %ebp, %eax C high cmp divisor
241 cmovc( %eax, %edx) C high is carry if high<divisor
242 cmovnc( %eax, %esi) C 0 if skip div, src high if not
244 movl %esi, (%edi,%ecx,4) C dst high limb
245 sbbl $0, %ecx C size-1 if high<divisor
246 movl PARAM_SRC, %esi C reload
259 leal (%ebx,%ecx), %eax C size+xsize
260 cmpl $MUL_THRESHOLD, %eax
261 jae L(mul_by_inverse)
C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
C It'd be possible to write them out without the looping, but no speedup
C would be expected.
C
C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
C integer part, but curiously not on the fractional part, where %ebp is a
C (fixed) couple of cycles faster.
273 jz L
(divide_no_integer
)
276 C
eax scratch
(quotient
)
279 C
edx scratch
(remainder
)
284 movl
-4(%esi,%ecx,4), %eax
288 movl
%eax, (%edi,%ecx,4)
290 jnz L
(divide_integer
)
293 L
(divide_no_integer
):
296 jnz L
(divide_fraction
)
305 addl $STACK_SPACE
, %esp
311 C
eax scratch
(quotient
)
314 C
edx scratch
(remainder
)
323 movl
%eax, -4(%edi,%ebx,4)
325 jnz L
(divide_fraction
)
331 C
-----------------------------------------------------------------------------
342 bsrl
%ebp, %eax C
31-l
344 leal
12(%edi), %ebx C
&dst
[xsize
+2], loop dst stop
345 leal
4(%edi,%ecx,4), %edi C
&dst
[xsize
+size]
348 movl
%ebx, VAR_DST_STOP
350 movl
%ecx, %ebx C
size
353 movl
%edx, %edi C carry
361 shll
%cl, %ebp C d normalized
367 subl
%ebp, %edx C
(b
-d
)-1 giving
edx:eax = b
*(b
-d
)-1
369 divl
%ebp C floor
(b
*(b
-d
)-1) / d
382 orl
%ebx, %ebx C
size
383 movl
%eax, VAR_INVERSE
384 leal
-12(%esi,%ebx,4), %eax C
&src
[size-3]
390 movl
8(%eax), %esi C src
high limb
393 L
(start_two_or_more
):
394 movl
4(%eax), %edx C src second highest limb
396 shldl
( %cl, %esi, %edi) C n2
= carry
,high << l
398 shldl
( %cl, %edx, %esi) C n10
= high,second
<< l
401 je L
(integer_two_left
)
406 shldl
( %cl, %esi, %edi) C n2
= carry
,high << l
408 shll
%cl, %esi C n10
= high << l
410 jmp L
(integer_one_left
)
414 C Can be here with xsize
==0 if mpn_preinv_divrem_1 had
size==1 and
415 C skipped a division.
417 shll
%cl, %edi C n2
= carry
<< l
418 movl
%edi, %eax C return value for zero_done
426 C
-----------------------------------------------------------------------------
C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
C execution.  The instruction scheduling is important, with various
C apparently equivalent forms running 1 to 5 cycles slower.
C
C A lower bound for the time would seem to be 16 cycles, based on the
C following successive dependencies.
C This chain is what the loop has already, but 16 cycles isn't achieved.
446 C K7 has enough decode, and probably enough execute (depending maybe on what
447 C a mul actually consumes), but nothing running under 17 has been found.
449 C In theory n2+n1 could be done in the sub and addback stages (by
450 C calculating both n2 and n2+n1 there), but lack of registers makes this an
451 C unlikely proposition.
453 C The jz in the loop keeps the q1+1 stage to 1 cycle. Handling an overflow
454 C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
455 C chain, and nothing better than 18 cycles has been found when using it.
456 C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
457 C be an extremely rare event.
459 C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
460 C if some special data is coming out with this always, the q1_ff special
461 C case actually runs at 15 c/l. 0x2FFF...FFFD divided by 3 is a good way to
462 C induce the q1_ff case, for speed measurements or testing. Note that
463 C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
C The instruction groupings and empty comments show the cycles for a naive
C in-order view of the code (conveniently ignoring the load latency on
C VAR_INVERSE).  This shows some of where the time is going, but is nonsense
C to the extent that out-of-order execution rearranges it.  In this case
C there's 19 cycles shown, but it executes at 17.
474 C ebx scratch (nadj, q1)
475 C ecx scratch (src, dst)
481 C mm0 scratch (src qword)
482 C mm7 rshift for normalization
484 cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
488 leal (%ebp,%esi), %ebx
489 cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
490 sbbl $-1, %eax C n2+n1
492 mull VAR_INVERSE C m*(n2+n1)
494 movq (%ecx), %mm0 C next limb and the one below it
501 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
502 leal 1(%edi), %ebx C n2+1
507 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
520 movl VAR_DST_STOP, %eax
524 sbbl %edx, %edi C n - (q1+1)*d
525 movl %esi, %edi C remainder -> n2
526 leal (%ebp,%esi), %edx
530 cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
539 L(integer_loop_done):
542 C -----------------------------------------------------------------------------
544 C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
545 C q1_ff special case. This make the code a bit smaller and simpler, and
546 C costs only 1 cycle (each).
550 C ebx scratch (nadj, q1)
551 C ecx scratch (src, dst)
559 cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
563 leal (%ebp,%esi), %ebx
564 cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
565 sbbl $-1, %eax C n2+n1
567 mull VAR_INVERSE C m*(n2+n1)
569 movd (%ecx), %mm0 C src low limb
571 movl VAR_DST_STOP, %ecx
575 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
576 leal 1(%edi), %ebx C n2+1
579 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
595 sbbl %edx, %edi C n - (q1+1)*d
596 movl %esi, %edi C remainder -> n2
597 leal (%ebp,%esi), %edx
601 cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
607 C -----------------------------------------------------------------------------
610 C ebx scratch (nadj, q1)
619 movl VAR_DST_STOP, %ecx
620 cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
623 leal (%ebp,%esi), %ebx
624 cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
625 sbbl $-1, %eax C n2+n1
627 mull VAR_INVERSE C m*(n2+n1)
635 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
636 leal 1(%edi), %ebx C n2+1
641 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
643 sbbl $0, %ebx C q1 if q1+1 overflowed
657 sbbl %edx, %edi C n - (q1+1)*d
658 movl %esi, %edi C remainder -> n2
659 leal (%ebp,%esi), %edx
661 cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
683 addl $STACK_SPACE, %esp
691 C -----------------------------------------------------------------------------
693 C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
694 C of q*d is simply -d and the remainder n-q*d = n10+d
706 movl VAR_DST_STOP, %edx
710 leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
713 movd %mm0, %esi C next n10
719 jmp L(integer_loop_done)
723 C -----------------------------------------------------------------------------
725 C Being the fractional part, the "source" limbs are all zero, meaning
726 C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
728 C The loop runs at 15 cycles. The dependent chain is the same as the
729 C general case above, but without the n2+n1 stage (due to n1==0), so 15
730 C would seem to be the lower bound.
C A not entirely obvious simplification is that q1+1 never overflows a limb,
C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
C rnd() means rounding down to a multiple of d.
C
C	m*n2 + b*n2 <= m*(d-1) + b*(d-1)
C	             = m*d + b*d - m - b
C	             = floor((b(b-d)-1)/d)*d + b*d - m - b
C	             = rnd(b(b-d)-1) + b*d - m - b
C	             = rnd(b(b-d)-1 + b*d) - m - b
C	             = rnd(b*b-1) - m - b
C Unchanged from the general case is that the final quotient limb q can be
C either q1 or q1+1, and the q1+1 case occurs often.  This can be seen from
C equation 8.4 of the paper which simplifies as follows when n1==0 and
C q0==0 (the fraction limbs being zero).
C
C	n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
C As before, the instruction groupings and empty comments show a naive
C in-order view of the code, which is made a nonsense by out of order
C execution.  There's 17 cycles shown, but it executes at 15.
756 C Rotating the store q and remainder->n2 instructions up to the top of the
757 C loop gets the run time down from 16 to 15.
770 movl VAR_DST_STOP, %ecx C &dst[xsize+2]
773 subl $8, %ecx C &dst[xsize]
774 jmp L(fraction_entry)
779 C eax n2 carry, then scratch
780 C ebx scratch (nadj, q1)
781 C ecx dst, decrementing
787 movl %ebx, (%ecx) C previous q
788 movl %eax, %edi C remainder->n2
791 mull VAR_INVERSE C m*n2
805 addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1
815 negl %eax C low of n - (q1+1)*d
819 sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry
820 leal (%ebp,%eax), %edx
822 cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1