dnl  Intel Pentium-II mpn_divrem_1 -- mpn by limb division.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
dnl  Pull in the GMP build-time m4 configuration (defframe, deflit,
dnl  PROLOGUE/EPILOGUE, cmov macros, etc.).  The extraction had broken this
dnl  directive across three lines with a stray line number fused in front,
dnl  which would make m4 emit garbage into the assembler output.
include(`../config.m4')
C P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.


C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                         mp_srcptr src, mp_size_t size,
C                         mp_limb_t divisor);
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
C                          mp_srcptr src, mp_size_t size,
C                          mp_limb_t divisor, mp_limb_t carry);
C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                                mp_srcptr src, mp_size_t size,
C                                mp_limb_t divisor, mp_limb_t inverse,
C                                unsigned shift);
C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm,
C see that file for some comments.  It's possible what's here can be
C improved.


dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
dnl
dnl  The different speeds of the integer and fraction parts means that using
dnl  xsize+size isn't quite right.  The threshold wants to be a bit higher
dnl  for the integer part and a bit lower for the fraction part.  (Or what's
dnl  really wanted is to speed up the integer part!)
dnl
dnl  The threshold is set to make the integer part right.  At 4 limbs the
dnl  div and mul are about the same there, but on the fractional part the
dnl  mul is much faster.
dnl  Switch from divl to multiply-by-inverse at xsize+size >= 4 limbs.
deflit(MUL_THRESHOLD, 4)


dnl  Incoming stack parameters (byte offsets from the entry %esp, set up by
dnl  defframe from config.m4).  PARAM_CARRY overlaps PARAM_PREINV_INVERSE at
dnl  offset 24 because mpn_divrem_1c and mpn_preinv_divrem_1 take different
dnl  trailing arguments in the same slot.
defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
defframe(PARAM_DIVISOR,20)
defframe(PARAM_SIZE,   16)
defframe(PARAM_SRC,    12)
defframe(PARAM_XSIZE,  8)
defframe(PARAM_DST,    4)

dnl  Locals below the entry %esp: saved callee-preserved registers first,
dnl  then scratch variables for the multiply-by-inverse path.
defframe(SAVE_EBX,    -4)
defframe(SAVE_ESI,    -8)
defframe(SAVE_EDI,    -12)
defframe(SAVE_EBP,    -16)

defframe(VAR_NORM,    -20)
defframe(VAR_INVERSE, -24)
defframe(VAR_SRC,     -28)
defframe(VAR_DST,     -32)
defframe(VAR_DST_STOP,-36)

dnl  Total bytes subtracted from %esp to cover all SAVE_*/VAR_* slots.
deflit(STACK_SPACE, 36)
dnl  NOTE(review): this extraction is lossy -- the number fused at the start
dnl  of each line is the original source line number; those numbers jump
dnl  (94, 95, 101, 104, ...) and single instructions are split across lines,
dnl  so statements in the gaps are missing (e.g. the loads of PARAM_SRC/
dnl  PARAM_DST into %esi/%edx and the register saves).  Do not assemble
dnl  as-is; recover the pristine file before editing.
92 PROLOGUE(mpn_preinv_divrem_1)
C Entry for the preinverted variant: the caller passes a precomputed
C inverse (PARAM_PREINV_INVERSE) and normalization shift
C (PARAM_PREINV_SHIFT), so no divl is needed to set them up here.
94 movl PARAM_XSIZE
, %ecx
95 subl $STACK_SPACE
, %esp FRAME_subl_esp
(STACK_SPACE
)
C Frame now covers the SAVE_*/VAR_* slots declared above.
101 movl PARAM_SIZE
, %ebx
104 movl PARAM_DIVISOR
, %ebp
109 movl
-4(%esi,%ebx,4), %eax C src
high limb
110 xorl
%edi, %edi C initial carry
(if can
't skip a div)
114 leal 8(%edx,%ecx,4), %edx C &dst[xsize+2]
117 movl %edx, VAR_DST_STOP C &dst[xsize+2]
C When the src high limb is below the divisor one division is skipped:
C the high limb becomes the initial carry (cmovc) and the effective
C size is reduced by the sbbl below.
118 cmpl %ebp, %eax C high cmp divisor
120 cmovc( %eax, %edi) C high is carry if high<divisor
122 cmovnc( %eax, %ecx) C 0 if skip div, src high if not
123 C (the latter in case src==dst)
125 movl %ecx, -12(%edx,%ebx,4) C dst high limb
127 sbbl $0, %ebx C skip one division if high<divisor
128 movl PARAM_PREINV_SHIFT, %ecx
130 leal -8(%edx,%ebx,4), %edx C &dst[xsize+size]
133 movl %edx, VAR_DST C &dst[xsize+size]
C Normalize the divisor by the caller-supplied shift; %mm7 keeps the
C right-shift count used later when unnormalizing.
135 shll %cl, %ebp C d normalized
139 movd %eax, %mm7 C rshift
140 movl PARAM_PREINV_INVERSE, %eax
dnl  NOTE(review): lossy extraction here too -- the fused original line
dnl  numbers jump (149, 151, 153, ...), so instructions in the gaps are
dnl  missing from this view.
149 PROLOGUE(mpn_divrem_1c)
C Same as mpn_divrem_1 but with an explicit initial carry argument;
C PARAM_CARRY is loaded into %edx as the starting remainder before
C falling into the common code.
151 movl PARAM_CARRY
, %edx
153 movl PARAM_SIZE
, %ecx
154 subl $STACK_SPACE
, %esp
155 deflit
(`FRAME
',STACK_SPACE)
158 movl PARAM_XSIZE, %ebx
164 movl PARAM_DIVISOR, %ebp
C NOTE(review): %edi is used below before any visible load of it --
C presumably a movl PARAM_DST, %edi sits in one of the missing lines;
C verify against the pristine source.
169 leal -4(%edi,%ebx,4), %edi
175 C offset 0x31, close enough to aligned
176 PROLOGUE(mpn_divrem_1)
179 movl PARAM_SIZE
, %ecx
180 movl
$0, %edx C initial carry
(if can
't skip a div)
181 subl $STACK_SPACE, %esp
182 deflit(`FRAME',STACK_SPACE
)
185 movl PARAM_DIVISOR
, %ebp
188 movl PARAM_XSIZE
, %ebx
192 orl
%ecx, %ecx C
size
197 leal
-4(%edi,%ebx,4), %edi C
&dst
[xsize
-1]
198 jz L
(no_skip_div
) C if
size==0
200 movl
-4(%esi,%ecx,4), %eax C src
high limb
202 cmpl
%ebp, %eax C
high cmp divisor
204 cmovc
( %eax, %edx) C
high is carry if
high<divisor
206 cmovnc
( %eax, %esi) C
0 if skip
div, src
high if
not
207 C
(the latter
in case src
==dst
)
209 movl
%esi, (%edi,%ecx,4) C dst
high limb
211 sbbl
$0, %ecx C
size-1 if
high<divisor
212 movl PARAM_SRC
, %esi C reload
225 leal
(%ebx,%ecx), %eax C
size+xsize
226 cmpl $MUL_THRESHOLD
, %eax
227 jae L
(mul_by_inverse
)
230 jz L
(divide_no_integer
)
233 C
eax scratch
(quotient
)
236 C
edx scratch
(remainder
)
241 movl
-4(%esi,%ecx,4), %eax
245 movl
%eax, (%edi,%ecx,4)
247 jnz L
(divide_integer
)
250 L
(divide_no_integer
):
253 jnz L
(divide_fraction
)
264 addl $STACK_SPACE
, %esp
270 C
eax scratch
(quotient
)
273 C
edx scratch
(remainder
)
282 movl
%eax, -4(%edi,%ebx,4)
284 jnz L
(divide_fraction
)
290 C
-----------------------------------------------------------------------------
301 leal
12(%edi), %ebx C
&dst
[xsize
+2], loop dst stop
303 movl
%ebx, VAR_DST_STOP
304 leal
4(%edi,%ecx,4), %edi C
&dst
[xsize
+size]
307 movl
%ecx, %ebx C
size
309 bsrl
%ebp, %ecx C
31-l
310 movl
%edx, %edi C carry
312 leal
1(%ecx), %eax C
32-l
318 shll
%cl, %ebp C d normalized
322 subl
%ebp, %edx C
(b
-d
)-1 giving
edx:eax = b
*(b
-d
)-1
324 divl
%ebp C floor
(b
*(b
-d
)-1) / d
337 movl
%eax, VAR_INVERSE
338 orl
%ebx, %ebx C
size
339 leal
-12(%esi,%ebx,4), %eax C
&src
[size-3]
344 movl
8(%eax), %esi C src
high limb
348 L
(start_two_or_more
):
349 movl
4(%eax), %edx C src second highest limb
351 shldl
( %cl, %esi, %edi) C n2
= carry
,high << l
353 shldl
( %cl, %edx, %esi) C n10
= high,second
<< l
356 je L
(integer_two_left
)
361 shldl
( %cl, %esi, %edi) C n2
= carry
,high << l
363 shll
%cl, %esi C n10
= high << l
364 jmp L
(integer_one_left
)
368 C Can be here with xsize
==0 if mpn_preinv_divrem_1 had
size==1 and
369 C skipped a division.
371 shll
%cl, %edi C n2
= carry
<< l
372 movl
%edi, %eax C return value for zero_done
380 C
-----------------------------------------------------------------------------
382 C
This loop runs at about
25 cycles
, which is probably
sub-optimal
, and
383 C certainly more than the dependent chain would suggest. A better
loop, or
384 C a better rough analysis of what
's possible, would be welcomed.
386 C In the current implementation, the following successively dependent
387 C micro-ops seem to exist.
399 C Lack of registers hinders explicit scheduling and it might be that the
400 C normal out of order execution isn't able to hide enough under the
mul
403 C Using sarl
/negl to pick
out n1 for the n2
+n1 stage is a touch faster than
404 C cmov
(and takes one uop off the dependent chain
). A sarl
/andl
/addl
405 C combination was tried for the addback
(despite the fact it would lengthen
406 C the dependent chain
) but found to be no faster.
412 C
ebx scratch
(nadj
, q1
)
413 C
ecx scratch
(src
, dst
)
419 C mm0 scratch
(src
qword)
420 C mm7 rshift for normalization
428 andl
%eax, %ebx C
-n1
& d
431 addl
%esi, %ebx C nadj
= n10
+ (-n1
& d
), ignoring overflow
432 addl
%edi, %eax C n2
+n1
433 movq
(%ecx), %mm0 C next src limb
and the one below it
435 mull VAR_INVERSE C m
*(n2
+n1
)
445 addl
%ebx, %eax C m
*(n2
+n1
) + nadj
, low giving carry flag
447 leal
1(%edi), %ebx C n2
+1
449 adcl
%edx, %ebx C
1 + high(n2
<<32 + m
*(n2
+n1
) + nadj
) = q1
+1
464 movl VAR_DST_STOP
, %eax
466 sbbl
%edx, %edi C n
- (q1
+1)*d
467 movl
%esi, %edi C remainder
-> n2
468 leal
(%ebp,%esi), %edx
470 cmovc
( %edx, %edi) C n
- q1
*d if underflow from using q1
+1
483 L
(integer_loop_done
):
486 C
-----------------------------------------------------------------------------
488 C Here
, and in integer_one_left below
, an sbbl
$0 is used rather than a
jz
489 C q1_ff special case.
This make the code a bit smaller
and simpler
, and
490 C costs only
2 cycles
(each
).
494 C
ebx scratch
(nadj
, q1
)
495 C
ecx scratch
(src
, dst
)
510 andl
%eax, %ebx C
-n1
& d
513 addl
%esi, %ebx C nadj
= n10
+ (-n1
& d
), ignoring overflow
514 addl
%edi, %eax C n2
+n1
516 mull VAR_INVERSE C m
*(n2
+n1
)
518 movd
(%ecx), %mm0 C src
low limb
520 movl VAR_DST_STOP
, %ecx
526 addl
%ebx, %eax C m
*(n2
+n1
) + nadj
, low giving carry flag
527 leal
1(%edi), %ebx C n2
+1
530 adcl
%edx, %ebx C
1 + high(n2
<<32 + m
*(n2
+n1
) + nadj
) = q1
+1
546 sbbl
%edx, %edi C n
- (q1
+1)*d
547 movl
%esi, %edi C remainder
-> n2
548 leal
(%ebp,%esi), %edx
550 cmovc
( %edx, %edi) C n
- q1
*d if underflow from using q1
+1
558 C
-----------------------------------------------------------------------------
561 C
ebx scratch
(nadj
, q1
)
575 movl VAR_DST_STOP
, %ecx
577 andl
%eax, %ebx C
-n1
& d
580 addl
%esi, %ebx C nadj
= n10
+ (-n1
& d
), ignoring overflow
581 addl
%edi, %eax C n2
+n1
583 mull VAR_INVERSE C m
*(n2
+n1
)
591 addl
%ebx, %eax C m
*(n2
+n1
) + nadj
, low giving carry flag
592 leal
1(%edi), %ebx C n2
+1
597 adcl
%edx, %ebx C
1 + high(n2
<<32 + m
*(n2
+n1
) + nadj
) = q1
+1
599 sbbl
$0, %ebx C q1 if q1
+1 overflowed
612 movl PARAM_XSIZE
, %eax
614 sbbl
%edx, %edi C n
- (q1
+1)*d
615 movl
%esi, %edi C remainder
-> n2
616 leal
(%ebp,%esi), %edx
618 cmovc
( %edx, %edi) C n
- q1
*d if underflow from using q1
+1
627 orl
%eax, %eax C xsize
641 addl $STACK_SPACE
, %esp
649 C
-----------------------------------------------------------------------------
651 C Special case for q1
=0xFFFFFFFF, giving q
=0xFFFFFFFF meaning the
low dword
652 C of q
*d is simply
-d
and the remainder n
-q
*d
= n10
+d
664 movl VAR_DST_STOP
, %edx
669 leal
(%ebp,%esi), %edi C n
-q
*d remainder
-> next n2
672 movd
%mm0
, %esi C next n10
677 jmp L
(integer_loop_done
)
681 C
-----------------------------------------------------------------------------
683 C
In the current implementation
, the following successively dependent
684 C micro
-ops seem to exist.
695 C The
loop in fact runs at about
17.5 cycles. Using a sarl
/andl
/addl for
696 C the addback was found to be a touch slower.
710 movl VAR_DST_STOP
, %ecx C
&dst
[xsize
+2]
713 subl
$8, %ecx C
&dst
[xsize
]
718 C
eax n2
, then scratch
719 C
ebx scratch
(nadj
, q1
)
720 C
ecx dst
, decrementing
726 mull VAR_INVERSE C m
*n2
738 addl
%edx, %ebx C
1 + high(n2
<<32 + m
*n2
) = q1
+1
750 negl
%eax C
low of n
- (q1
+1)*d
752 sbbl
%edx, %edi C
high of n
- (q1
+1)*d
, caring only about carry
753 leal
(%ebp,%eax), %edx
755 cmovc
( %edx, %eax) C n
- q1
*d if underflow from using q1
+1
758 movl
%eax, %edi C remainder
->n2
761 movl
%ebx, (%ecx) C previous q