1 dnl x86 mpn_gcd_1 optimised for AMD K7.
3 dnl Contributed to the GNU project by Kevin Ryde. Rehacked by Torbjorn
6 dnl Copyright
2000-2002, 2005, 2009, 2011, 2012, 2014, 2015 Free Software
9 dnl
This file is part of the GNU MP Library.
11 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
12 dnl it under the terms of
either:
14 dnl
* the GNU Lesser General
Public License as published by the Free
15 dnl Software Foundation
; either version 3 of the License, or (at your
16 dnl option
) any later version.
20 dnl
* the GNU General
Public License as published by the Free Software
21 dnl Foundation
; either version 2 of the License, or (at your option) any
24 dnl
or both
in parallel
, as here.
26 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
27 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
28 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
31 dnl You should have received copies of the GNU General
Public License
and the
32 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
33 dnl see
https://www.gnu.
org/licenses
/.
35 include(`..
/config.m4
')
52 C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
55 C * Tune overhead; this takes 2-3 cycles more than the old code when v0 is tiny.
56 C * Stream things better through registers, avoiding some copying.
57 C * For ELF, avoid putting GOT base in both ebx and esi. Needs special
58 C LEA/LEAL or else discrete code here.
60 C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
63 deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
65 DEF_OBJECT(ctz_table,64)
68 ` .byte m4_count_trailing_zeros(i)
72 C Threshold of when to
call bmod when U is one limb. Should be about
73 C
(time_in_cycles
(bmod_1
,1) + call_overhead
) / (cycles
/bit
).
74 define
(`DIV_THRES_LOG2
', 7)
93 mov (up), %eax C U low limb
103 mov %ecx, %eax C common twos
107 jnc L(divide_strip_y)
116 C Both U and V are single limbs, reduce with bmod if u0 >> v0.
119 shr $DIV_THRES_LOG2, %ecx
130 ifdef(`PIC_WITH_EBX',`dnl
134 add $_GLOBAL_OFFSET_TABLE_
, %ebx
139 cmp $BMOD_1_TO_MOD_1_THRESHOLD, n
144 CALL( mpn_modexact_1_odd)
147 ifdef(`PIC_WITH_EBX',`dnl
148 add $16, %esp C deallocate params
151 add $12, %esp C deallocate params
156 LEAL
( ctz_table
, %esi)
162 ALIGN(16) C K8 BC P4 NHM SBR
163 L
(top
): cmovc
( %ecx, %eax) C if x
-y
< 0 0
164 cmovc
( %edi, %edx) C use x
,y
-x
0
165 L
(mid
): and $
MASK, %ecx C
0
166 movzbl
(%esi,%ecx), %ecx C
1
187 ifdef
(`PIC_WITH_EBX
',`dnl