beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k6 / gcd_1.asm
bloba45774d37219c8db6cc3d207bc096236f4f99a46
1 dnl AMD K6 mpn_gcd_1 -- mpn by 1 gcd.
3 dnl Copyright 2000-2002, 2004, 2014 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C K6: 9.5 cycles/bit (approx) 1x1 gcd
35 C 11.0 cycles/limb Nx1 reduction (modexact_1_odd)
38 C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t y);
40 C This code is nothing very special, but offers a speedup over what gcc 2.95
41 C can do with mpn/generic/gcd_1.c.
43 C Future:
45 C Using a lookup table to count trailing zeros seems a touch quicker, but
46 C after a slightly longer startup. Might be worthwhile if an mpn_gcd_2 used
47 C it too.
50 dnl If size==1 and x (the larger operand) is more than DIV_THRESHOLD bits
51 dnl bigger than y, then a division x%y is done to reduce it.
52 dnl
53 dnl A divl is 20 cycles and the loop runs at about 9.5 cycles/bitpair so
54 dnl there should be an advantage in the divl at about 4 or 5 bits, which is
55 dnl what's found.
57 deflit(DIV_THRESHOLD, 5)
60 defframe(PARAM_LIMB, 12)
61 defframe(PARAM_SIZE, 8)
62 defframe(PARAM_SRC, 4)
64 TEXT
65 ALIGN(16)
67 PROLOGUE(mpn_gcd_1)
68 deflit(`FRAME',0)
70 ASSERT(ne, `cmpl $0, PARAM_LIMB')
71 ASSERT(ae, `cmpl $1, PARAM_SIZE')
74 movl PARAM_SRC, %eax
75 pushl %ebx FRAME_pushl()
77 movl PARAM_LIMB, %edx
78 movl $-1, %ecx
80 movl (%eax), %ebx C src low limb
82 movl %ebx, %eax C src low limb
83 orl %edx, %ebx
85 L(common_twos):
86 shrl %ebx
87 incl %ecx
89 jnc L(common_twos) C 1/4 chance on random data
90 shrl %cl, %edx C y
92 cmpl $1, PARAM_SIZE
93 ja L(size_two_or_more)
96 ASSERT(nz, `orl %eax, %eax') C should have src limb != 0
98 shrl %cl, %eax C x
101 C Swap if necessary to make x>=y. Measures a touch quicker as a
102 C jump than a branch free calculation.
104 C eax x
105 C ebx
106 C ecx common twos
107 C edx y
109 movl %eax, %ebx
110 cmpl %eax, %edx
112 jb L(noswap)
113 movl %edx, %eax
115 movl %ebx, %edx
116 movl %eax, %ebx
117 L(noswap):
120 C See if it's worth reducing x with a divl.
122 C eax x
123 C ebx x
124 C ecx common twos
125 C edx y
127 shrl $DIV_THRESHOLD, %ebx
129 cmpl %ebx, %edx
130 ja L(nodiv)
133 C Reduce x to x%y.
135 C eax x
136 C ebx
137 C ecx common twos
138 C edx y
140 movl %edx, %ebx
141 xorl %edx, %edx
143 divl %ebx
145 orl %edx, %edx C y
146 nop C code alignment
148 movl %ebx, %eax C x
149 jz L(done_shll)
150 L(nodiv):
153 C eax x
154 C ebx
155 C ecx common twos
156 C edx y
157 C esi
158 C edi
159 C ebp
161 L(strip_y):
162 shrl %edx
163 jnc L(strip_y)
165 leal 1(%edx,%edx), %edx
166 movl %ecx, %ebx C common twos
168 leal 1(%eax), %ecx
169 jmp L(strip_x_and)
172 C Calculating a %cl shift based on the low bit 0 or 1 avoids doing a branch
173 C on a 50/50 chance of 0 or 1. The chance of the next bit also being 0 is
174 C only 1/4.
176 C A second computed %cl shift was tried, but that measured a touch slower
177 C than branching back.
179 C A branch-free abs(x-y) and min(x,y) calculation was tried, but that
180 C measured about 1 cycle/bit slower.
182 C eax x
183 C ebx common twos
184 C ecx scratch
185 C edx y
187 ALIGN(4)
188 L(swap):
189 addl %eax, %edx C x-y+y = x
190 negl %eax C -(x-y) = y-x
192 L(strip_x):
193 shrl %eax C odd-odd = even, so always one to strip
194 ASSERT(nz)
196 L(strip_x_leal):
197 leal 1(%eax), %ecx
199 L(strip_x_and):
200 andl $1, %ecx C (x^1)&1
202 shrl %cl, %eax C shift if x even
204 testb $1, %al
205 jz L(strip_x)
207 ASSERT(nz,`testl $1, %eax') C x, y odd
208 ASSERT(nz,`testl $1, %edx')
210 subl %edx, %eax
211 jb L(swap)
212 ja L(strip_x)
215 movl %edx, %eax
216 movl %ebx, %ecx
218 L(done_shll):
219 shll %cl, %eax
220 popl %ebx
225 C -----------------------------------------------------------------------------
226 C Two or more limbs.
228 C x={src,size} is reduced modulo y using either a plain mod_1 style
229 C remainder, or a modexact_1 style exact division.
231 deflit(MODEXACT_THRESHOLD, ifdef(`PIC', 4, 4))
233 ALIGN(8)
234 L(size_two_or_more):
235 C eax
236 C ebx
237 C ecx common twos
238 C edx y, without common twos
239 C esi
240 C edi
241 C ebp
243 deflit(FRAME_TWO_OR_MORE, FRAME)
245 pushl %edi defframe_pushl(SAVE_EDI)
246 movl PARAM_SRC, %ebx
248 L(y_twos):
249 shrl %edx
250 jnc L(y_twos)
252 movl %ecx, %edi C common twos
253 movl PARAM_SIZE, %ecx
255 pushl %esi defframe_pushl(SAVE_ESI)
256 leal 1(%edx,%edx), %esi C y (odd)
258 movl -4(%ebx,%ecx,4), %eax C src high limb
260 cmpl %edx, %eax C carry if high<divisor
262 sbbl %edx, %edx C -1 if high<divisor
264 addl %edx, %ecx C skip one limb if high<divisor
265 andl %eax, %edx
267 cmpl $MODEXACT_THRESHOLD, %ecx
268 jae L(modexact)
271 L(divide_top):
272 C eax scratch (quotient)
273 C ebx src
274 C ecx counter, size-1 to 1
275 C edx carry (remainder)
276 C esi divisor (odd)
277 C edi
278 C ebp
280 movl -4(%ebx,%ecx,4), %eax
281 divl %esi
282 loop L(divide_top)
285 movl %edx, %eax C x
286 movl %esi, %edx C y (odd)
288 movl %edi, %ebx C common twos
289 popl %esi
291 popl %edi
292 leal 1(%eax), %ecx
294 orl %eax, %eax
295 jnz L(strip_x_and)
298 movl %ebx, %ecx
299 movl %edx, %eax
301 shll %cl, %eax
302 popl %ebx
307 ALIGN(8)
308 L(modexact):
309 C eax
310 C ebx src ptr
311 C ecx size or size-1
312 C edx
313 C esi y odd
314 C edi common twos
315 C ebp
317 movl PARAM_SIZE, %eax
318 pushl %esi FRAME_pushl()
320 pushl %eax FRAME_pushl()
322 pushl %ebx FRAME_pushl()
324 ifdef(`PIC_WITH_EBX',`
325 nop C code alignment
326 call L(movl_eip_ebx)
327 add $_GLOBAL_OFFSET_TABLE_, %ebx
329 CALL( mpn_modexact_1_odd)
331 movl %esi, %edx C y odd
332 movl SAVE_ESI, %esi
334 movl %edi, %ebx C common twos
335 movl SAVE_EDI, %edi
337 addl $eval(FRAME - FRAME_TWO_OR_MORE), %esp
338 orl %eax, %eax
340 leal 1(%eax), %ecx
341 jnz L(strip_x_and)
344 movl %ebx, %ecx
345 movl %edx, %eax
347 shll %cl, %eax
348 popl %ebx
353 ifdef(`PIC_WITH_EBX',`
354 L(movl_eip_ebx):
355 movl (%esp), %ebx
356 ret_internal
359 EPILOGUE()