beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / alpha / ev67 / gcd_1.asm
blob55fa7d3673112671a6bdc81faf19da25237affc3
1 dnl Alpha ev67 mpn_gcd_1 -- Nx1 greatest common divisor.
3 dnl Copyright 2003, 2004 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C ev67: 3.4 cycles/bitpair for 1x1 part
37 C mp_limb_t mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y);
39 C In the 1x1 part, the algorithm is to change x,y to abs(x-y),min(x,y) and
40 C strip trailing zeros from abs(x-y) to maintain x and y both odd.
42 C The trailing zeros are calculated from just x-y, since in twos-complement
43 C there's the same number of trailing zeros on d or -d. This means the cttz
44 C runs in parallel with abs(x-y).
46 C The loop takes 5 cycles, and at 0.68 iterations per bit for two N-bit
47 C operands with this algorithm gives the measured 3.4 c/l.
49 C The slottings shown are for SVR4 style systems, Unicos differs in the
50 C initial gp setup and the LEA.
52 C Enhancement:
54 C On the jsr, !lituse_jsr! (when available) would allow the linker to relax
55 C it to a bsr, but probably only in a static binary. Plain "jsr foo" gives
56 C the right object code for relaxation, and ought to be available
57 C everywhere, but we prefer to schedule the GOT ldq (LEA) back earlier, for
58 C the usual case of running in a shared library.
60 C bsr could perhaps be used explicitly anyway. We should be able to assume
61 C modexact is in the same module as us (ie. shared library or mainline).
62 C Would there be any worries about the size of the displacement? Could
63 C always put modexact and gcd_1 in the same .o to be certain.
65 ASM_START()
66 PROLOGUE(mpn_gcd_1, gp)
68 C r16 xp
69 C r17 size
70 C r18 y
72 C ldah C l
73 C lda C u
75 ldq r0, 0(r16) C L x = xp[0]
76 lda r30, -32(r30) C u alloc stack
78 LEA( r27, mpn_modexact_1c_odd) C L modexact addr, ldq (gp)
79 stq r10, 16(r30) C L save r10
80 cttz r18, r10 C U0 y twos
81 cmpeq r17, 1, r5 C u test size==1
83 stq r9, 8(r30) C L save r9
84 clr r19 C u zero c for modexact
85 unop
86 unop
88 cttz r0, r6 C U0 x twos
89 stq r26, 0(r30) C L save ra
91 srl r18, r10, r18 C U y odd
93 mov r18, r9 C l hold y across call
95 cmpult r6, r10, r2 C u test x_twos < y_twos
97 cmovne r2, r6, r10 C l common_twos = min(x_twos,y_twos)
98 bne r5, L(one) C U no modexact if size==1
99 jsr r26, (r27), mpn_modexact_1c_odd C L0
101 LDGP( r29, 0(r26)) C u,l ldah,lda
102 cttz r0, r6 C U0 new x twos
103 ldq r26, 0(r30) C L restore ra
105 L(one):
106 mov r9, r1 C u y
107 ldq r9, 8(r30) C L restore r9
108 mov r10, r2 C u common twos
109 ldq r10, 16(r30) C L restore r10
111 lda r30, 32(r30) C l free stack
112 beq r0, L(done) C U return y if x%y==0
114 srl r0, r6, r0 C U x odd
115 unop
117 ALIGN(16)
118 L(top):
119 C r0 x
120 C r1 y
121 C r2 common twos, for use at end
123 subq r0, r1, r7 C l0 d = x - y
124 cmpult r0, r1, r16 C u0 test x >= y
126 subq r1, r0, r4 C l0 new_x = y - x
127 cttz r7, r8 C U0 d twos
129 cmoveq r16, r7, r4 C l0 new_x = d if x>=y
130 cmovne r16, r0, r1 C u0 y = x if x<y
131 unop C l \ force cmoveq into l0
132 unop C u /
134 C C cmoveq2 L0, cmovne2 U0
136 srl r4, r8, r0 C U0 x = new_x >> twos
137 bne r7, L(top) C U1 stop when d==0
140 L(done):
141 sll r1, r2, r0 C U0 return y << common_twos
142 ret r31, (r26), 1 C L0
144 EPILOGUE()
145 ASM_END()