1 dnl Alpha ev67 mpn_gcd_1
-- Nx1 greatest common divisor.
3 dnl Copyright
2003, 2004 Free Software Foundation
, Inc.
5 dnl
This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
8 dnl it under the terms of
either:
10 dnl
* the GNU Lesser General
Public License as published by the Free
11 dnl Software Foundation
; either version 3 of the License, or (at your
12 dnl option
) any later version.
16 dnl
* the GNU General
Public License as published by the Free Software
17 dnl Foundation
; either version 2 of the License, or (at your option) any
20 dnl
or both
in parallel
, as here.
22 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
23 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
24 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
27 dnl You should have received copies of the GNU General
Public License
and the
28 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
29 dnl see
https://www.gnu.
org/licenses
/.
31 include(`..
/config.m4
')
34 C ev67: 3.4 cycles/bitpair for 1x1 part
37 C mp_limb_t mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y);
39 C In the 1x1 part, the algorithm is to change x,y to abs(x-y),min(x,y) and
40 C strip trailing zeros from abs(x-y) to maintain x and y both odd.
42 C The trailing zeros are calculated from just x-y, since in twos-complement
43 C there's the same number of trailing zeros on d
or -d.
This means the cttz
44 C runs
in parallel with abs
(x
-y
).
46 C The
loop takes
5 cycles
, and at
0.68 iterations per bit for two N
-bit
47 C operands with
this algorithm gives the measured
3.4 c
/l.
49 C The slottings shown are for SVR4 style systems
, Unicos differs
in the
50 C initial gp setup
and the
LEA.
54 C On the jsr
, !lituse_jsr
! (when available
) would allow the linker to relax
55 C it to a
bsr, but probably only
in a static binary. Plain
"jsr foo" gives
56 C the right object code for relaxation
, and ought to be available
57 C everywhere
, but we prefer to schedule the GOT ldq
(LEA) back earlier
, for
58 C the usual case of running
in a shared library.
60 C
bsr could perhaps be used explicitly anyway. We should be able to
assume
61 C modexact is
in the same module as us
(ie. shared library
or mainline
).
62 C Would there be any worries about the
size of the displacement
? Could
63 C always put modexact
and gcd_1
in the same .o to be certain.
66 PROLOGUE
(mpn_gcd_1
, gp
)
75 ldq r0
, 0(r16
) C L x
= xp
[0]
76 lda r30
, -32(r30
) C u alloc stack
78 LEA( r27
, mpn_modexact_1c_odd
) C L modexact addr
, ldq
(gp
)
79 stq r10
, 16(r30
) C L save r10
80 cttz r18
, r10 C U0 y twos
81 cmpeq r17
, 1, r5 C u
test size==1
83 stq r9
, 8(r30
) C L save r9
84 clr r19 C u zero c for modexact
88 cttz r0
, r6 C U0 x twos
89 stq r26
, 0(r30
) C L save ra
91 srl r18
, r10
, r18 C U y odd
93 mov r18
, r9 C l hold y across
call
95 cmpult r6
, r10
, r2 C u
test x_twos
< y_twos
97 cmovne r2
, r6
, r10 C l common_twos
= min
(x_twos
,y_twos
)
98 bne r5
, L
(one
) C U no modexact if
size==1
99 jsr r26
, (r27
), mpn_modexact_1c_odd C L0
101 LDGP
( r29
, 0(r26
)) C u
,l ldah
,lda
102 cttz r0
, r6 C U0 new x twos
103 ldq r26
, 0(r30
) C L restore ra
107 ldq r9
, 8(r30
) C L restore r9
108 mov r10
, r2 C u common twos
109 ldq r10
, 16(r30
) C L restore r10
111 lda r30
, 32(r30
) C l free stack
112 beq r0
, L
(done
) C U return y if x
%y
==0
114 srl r0
, r6
, r0 C U x odd
121 C r2 common twos
, for use at
end
123 subq r0
, r1
, r7 C l0 d
= x
- y
124 cmpult r0
, r1
, r16 C u0
test x
>= y
126 subq r1
, r0
, r4 C l0 new_x
= y
- x
127 cttz r7
, r8 C U0 d twos
129 cmoveq r16
, r7
, r4 C l0 new_x
= d if x
>=y
130 cmovne r16
, r0
, r1 C u0 y
= x if x
<y
131 unop C l \ force cmoveq
into l0
134 C C cmoveq2 L0
, cmovne2 U0
136 srl r4
, r8
, r0 C U0 x
= new_x
>> twos
137 bne r7
, L
(top
) C U1 stop when d
==0
141 sll r1
, r2
, r0 C U0 return y
<< common_twos
142 ret r31
, (r26
), 1 C L0