1 dnl AMD K6 mpn_gcd_1
-- mpn by
1 gcd.
3 dnl Copyright
2000-2002, 2004, 2014 Free Software Foundation
, Inc.
5 dnl
This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
8 dnl it under the terms of
either:
10 dnl
* the GNU Lesser General
Public License as published by the Free
11 dnl Software Foundation
; either version 3 of the License, or (at your
12 dnl option
) any later version.
16 dnl
* the GNU General
Public License as published by the Free Software
17 dnl Foundation
; either version 2 of the License, or (at your option) any
20 dnl
or both
in parallel
, as here.
22 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
23 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
24 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
27 dnl You should have received copies of the GNU General
Public License
and the
28 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
29 dnl see
https://www.gnu.
org/licenses
/.
31 include(`..
/config.m4
')
34 C K6: 9.5 cycles/bit (approx) 1x1 gcd
35 C 11.0 cycles/limb Nx1 reduction (modexact_1_odd)
38 C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t y);
40 C This code is nothing very special, but offers a speedup over what gcc 2.95
41 C can do with mpn/generic/gcd_1.c.
45 C Using a lookup table to count trailing zeros seems a touch quicker, but
46 C after a slightly longer startup. Might be worthwhile if an mpn_gcd_2 used
50 dnl If size==1 and x (the larger operand) is more than DIV_THRESHOLD bits
51 dnl bigger than y, then a division x%y is done to reduce it.
53 dnl A divl is 20 cycles and the loop runs at about 9.5 cycles/bitpair so
54 dnl there should be an advantage in the divl at about 4 or 5 bits, which is
57 deflit
(DIV_THRESHOLD
, 5)
60 defframe
(PARAM_LIMB
, 12)
61 defframe
(PARAM_SIZE
, 8)
62 defframe
(PARAM_SRC
, 4)
70 ASSERT(ne, `cmpl $0, PARAM_LIMB')
71 ASSERT
(ae
, `cmpl
$1, PARAM_SIZE
')
75 pushl %ebx FRAME_pushl()
80 movl (%eax), %ebx C src low limb
82 movl %ebx, %eax C src low limb
89 jnc L(common_twos) C 1/4 chance on random data
93 ja L(size_two_or_more)
96 ASSERT(nz, `orl %eax, %eax') C should have src limb
!= 0
101 C Swap if necessary to make x
>=y. Measures a touch quicker as a
102 C jump than a branch free calculation.
120 C See if it
's worth reducing x with a divl.
127 shrl $DIV_THRESHOLD, %ebx
165 leal 1(%edx,%edx), %edx
166 movl %ecx, %ebx C common twos
172 C Calculating a %cl shift based on the low bit 0 or 1 avoids doing a branch
173 C on a 50/50 chance of 0 or 1. The chance of the next bit also being 0 is
176 C A second computed %cl shift was tried, but that measured a touch slower
177 C than branching back.
179 C A branch-free abs(x-y) and min(x,y) calculation was tried, but that
180 C measured about 1 cycle/bit slower.
189 addl %eax, %edx C x-y+y = x
190 negl %eax C -(x-y) = y-x
193 shrl %eax C odd-odd = even, so always one to strip
200 andl $1, %ecx C (x^1)&1
202 shrl %cl, %eax C shift if x even
207 ASSERT(nz,`testl $1, %eax') C x
, y odd
208 ASSERT
(nz
,`testl
$1, %edx')
225 C -----------------------------------------------------------------------------
228 C x={src,size} is reduced modulo y using either a plain mod_1 style
229 C remainder, or a modexact_1 style exact division.
231 deflit(MODEXACT_THRESHOLD, ifdef(`PIC', 4, 4))
238 C
edx y
, without common twos
243 deflit
(FRAME_TWO_OR_MORE
, FRAME
)
245 pushl
%edi defframe_pushl
(SAVE_EDI
)
252 movl
%ecx, %edi C common twos
253 movl PARAM_SIZE
, %ecx
255 pushl
%esi defframe_pushl
(SAVE_ESI
)
256 leal
1(%edx,%edx), %esi C y
(odd
)
258 movl
-4(%ebx,%ecx,4), %eax C src
high limb
260 cmpl
%edx, %eax C carry if
high<divisor
262 sbbl
%edx, %edx C
-1 if
high<divisor
264 addl
%edx, %ecx C skip one limb if
high<divisor
267 cmpl $MODEXACT_THRESHOLD
, %ecx
272 C
eax scratch
(quotient
)
274 C
ecx counter
, size-1 to
1
275 C
edx carry
(remainder
)
280 movl
-4(%ebx,%ecx,4), %eax
286 movl
%esi, %edx C y
(odd
)
288 movl
%edi, %ebx C common twos
317 movl PARAM_SIZE
, %eax
318 pushl
%esi FRAME_pushl
()
320 pushl
%eax FRAME_pushl
()
322 pushl
%ebx FRAME_pushl
()
324 ifdef
(`PIC_WITH_EBX
',`
327 add $_GLOBAL_OFFSET_TABLE_, %ebx
329 CALL( mpn_modexact_1_odd
)
331 movl
%esi, %edx C y odd
334 movl
%edi, %ebx C common twos
337 addl
$eval
(FRAME
- FRAME_TWO_OR_MORE
), %esp
353 ifdef
(`PIC_WITH_EBX
',`