beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k7 / gcd_1.asm
blob833d05d934cba01cff2b71ede5ee8d79171209d2
1 dnl x86 mpn_gcd_1 optimised for AMD K7.
3 dnl Contributed to the GNU project by Kevin Ryde. Rehacked by Torbjorn
4 dnl Granlund.
6 dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2014, 2015 Free Software
7 dnl Foundation, Inc.
9 dnl This file is part of the GNU MP Library.
10 dnl
11 dnl The GNU MP Library is free software; you can redistribute it and/or modify
12 dnl it under the terms of either:
13 dnl
14 dnl * the GNU Lesser General Public License as published by the Free
15 dnl Software Foundation; either version 3 of the License, or (at your
16 dnl option) any later version.
17 dnl
18 dnl or
19 dnl
20 dnl * the GNU General Public License as published by the Free Software
21 dnl Foundation; either version 2 of the License, or (at your option) any
22 dnl later version.
23 dnl
24 dnl or both in parallel, as here.
25 dnl
26 dnl The GNU MP Library is distributed in the hope that it will be useful, but
27 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
28 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
29 dnl for more details.
30 dnl
31 dnl You should have received copies of the GNU General Public License and the
32 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
33 dnl see https://www.gnu.org/licenses/.
35 include(`../config.m4')
38 C cycles/bit (approx)
39 C AMD K7 5.31
40 C AMD K8,K9 5.33
41 C AMD K10 5.30
42 C AMD bd1 ?
43 C AMD bobcat 7.02
44 C Intel P4-2 10.1
45 C Intel P4-3/4 10.0
46 C Intel P6/13 5.88
47 C Intel core2 6.26
48 C Intel NHM 6.83
49 C Intel SBR 8.50
50 C Intel atom 8.90
51 C VIA nano ?
52 C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
54 C TODO
55 C * Tune overhead, this takes 2-3 cycles more than old code when v0 is tiny.
56 C * Stream things better through registers, avoiding some copying.
57 C * For ELF, avoid putting GOT base in both ebx and esi. Needs special
58 C LEA/LEAL or else discrete code here.
C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
C
C The table is indexed with the low MAXSHIFT bits of a value, i.e. with
C value & MASK.  It holds one byte for index 0 followed by one byte for each
C index 1..MASK.  An index of 0 means the low MAXSHIFT bits are all zero, so
C the stored count MAXSHIFT is only a lower bound in that case and the user
C of the table has to shift by MAXSHIFT and look again.

deflit(MAXSHIFT, 6)
deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))

DEF_OBJECT(ctz_table,64)
	.byte	MAXSHIFT
forloop(i,1,MASK,
`	.byte	m4_count_trailing_zeros(i)
')
END_OBJECT(ctz_table)
C Threshold of when to call bmod when U is one limb.  Should be about
C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
C
C It is used as a shift count: in the single-limb case U is reduced modulo V
C only when V <= (U >> DIV_THRES_LOG2).
define(`DIV_THRES_LOG2', 7)


C Register names for the three incoming arguments.
define(`up',    `%edi')
define(`n',     `%esi')
define(`v0',    `%edx')


C mpn_gcd_1 -- gcd of the n-limb number at up and the single limb v0.
C
C Syntax:  AT&T operand order, run through m4 with the GMP config.m4 macros.
C Target:  32-bit x86, all arguments passed on the stack.
C
C In:      4(%esp)  up   pointer to the limbs of U, low limb first
C          8(%esp)  n    number of limbs in U
C          12(%esp) v0   the single limb V
C          The offsets above are at function entry.  After the two pushes in
C          the prologue they become 12, 16 and 20.
C Out:     %eax = gcd
C Saved:   %edi and %esi are pushed and restored.  %ebx is pushed and
C          restored around the call when PIC_WITH_EBX is defined.
C Clobb:   %ecx, %edx, flags, plus whatever the called routine clobbers.
C
C Preconditions visible from the code: v0 must be nonzero, otherwise the
C L(divide_strip_y) loop never sees a set carry and does not terminate.
C NOTE(review): the low limb of U is presumably also required to be nonzero
C by the caller contract -- confirm against the GMP documentation.
C
C Outline:
C   1. Count the twos common to the low limb of U and V, and make V odd.
C   2. Reduce U to a single limb modulo V: by div when n is 1 and U is much
C      larger than V, by mpn_mod_1 or mpn_modexact_1_odd when n is above 1.
C   3. Run a binary gcd loop on the two single limbs.
C   4. Shift the odd result left by the common twos.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_gcd_1)
	push	%edi
	push	%esi

	mov	12(%esp), up
	mov	16(%esp), n
	mov	20(%esp), v0

	mov	(up), %eax	C U low limb
	or	v0, %eax	C x | y
	mov	$-1, %ecx

C Count trailing zeros of u0 | v0.  On exit ecx is the number of twos common
C to both operands.
L(twos):
	inc	%ecx
	shr	%eax
	jnc	L(twos)

	shr	%cl, v0
	mov	%ecx, %eax	C common twos

C Strip the remaining twos from v0.  The loop shifts out the lowest set bit
C as well, so the adc puts it back: v0 = 2*v0 + 1, which is odd.
L(divide_strip_y):
	shr	v0
	jnc	L(divide_strip_y)
	adc	v0, v0

C Keep both values on the stack: edx is reused below and is not preserved
C across the call.
	push	%eax		C common twos, popped at L(end)
	push	v0		C odd v0, popped at L(reduced)

	cmp	$1, n
	jnz	L(reduce_nby1)

C Both U and V are single limbs, reduce with a division if u0 >> v0.
	mov	(up), %ecx
	mov	%ecx, %eax	C eax = u0, the value handed to L(reduced)
	shr	$DIV_THRES_LOG2, %ecx
	cmp	%ecx, v0
	ja	L(reduced)	C v0 above u0 >> DIV_THRES_LOG2, skip the div

	mov	v0, %esi	C divisor must leave edx, n is dead here
	xor	%edx, %edx	C high half of the dividend
	div	%esi
	mov	%edx, %eax	C eax = u0 mod v0
	jmp	L(reduced)

C U has several limbs, reduce it to one limb with a library call.  The
C result comes back in eax.
L(reduce_nby1):
ifdef(`PIC_WITH_EBX',`dnl
	push	%ebx
	add	$-4, %esp	C 4 bytes of padding, released with the params
	call	L(movl_eip_ebx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
')
	push	v0		C param 3
	push	n		C param 2
	push	up		C param 1
	cmp	$BMOD_1_TO_MOD_1_THRESHOLD, n
	jl	L(bmod)		C signed compare, n presumably positive
	CALL(	mpn_mod_1)
	jmp	L(called)
L(bmod):
	CALL(	mpn_modexact_1_odd)

L(called):
ifdef(`PIC_WITH_EBX',`dnl
	add	$16, %esp	C deallocate params and padding
	pop	%ebx
',`dnl
	add	$12, %esp	C deallocate params
')
C Here eax = reduced U, called x below, and the top of the stack is odd v0.
L(reduced):
	pop	%edx		C edx = y = odd v0

	LEAL(	ctz_table, %esi)
	test	%eax, %eax
	mov	%eax, %ecx
	jnz	L(mid)
	jmp	L(end)		C x is zero, the odd part of the gcd is y

C Binary gcd loop.  Invariants at L(mid): edx = y is odd, eax = x is nonzero,
C and ecx has the same trailing zero count as eax.  At L(top) the carry flag
C comes from the subtraction x - y, so the conditional moves leave
C eax = |x-y| and edx = min(x,y).  ecx still holds y - x, which has the same
C trailing zeros as x - y, so the table lookup need not wait for the cmov.
	ALIGN(16)		C               K8    BC    P4    NHM   SBR
L(top):	cmovc(	%ecx, %eax)	C if x-y < 0	0
	cmovc(	%edi, %edx)	C use x,y-x	0
L(mid):	and	$MASK, %ecx	C		0
	movzbl	(%esi,%ecx), %ecx C		1
	jz	L(shift_alot)	C		1
	shr	%cl, %eax	C		3
	mov	%eax, %edi	C		4
	mov	%edx, %ecx	C		3
	sub	%eax, %ecx	C		4
	sub	%edx, %eax	C		4
	jnz	L(top)		C		5

C x equals y, so edx is the odd part of the gcd.  Put the common twos back.
L(end):	pop	%ecx		C common twos
	mov	%edx, %eax
	shl	%cl, %eax
	pop	%esi
	pop	%edi
	ret

C The low MAXSHIFT bits of x are all zero.  Shift them out and look up again.
L(shift_alot):
	shr	$MAXSHIFT, %eax
	mov	%eax, %ecx
	jmp	L(mid)

C Helper for PIC builds: returns its own return address in ebx.
ifdef(`PIC_WITH_EBX',`dnl
L(movl_eip_ebx):
	mov	(%esp), %ebx
	ret
')
EPILOGUE()
ASM_END()