1 dnl Itanium
-2 mpn_gcd_1
-- mpn by
1 gcd.
3 dnl Contributed to the GNU project by Kevin Ryde
, innerloop by Torbjorn
6 dnl Copyright
2002-2005, 2012, 2013, 2015 Free Software Foundation
, Inc.
8 dnl
This file is part of the GNU MP Library.
10 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
11 dnl it under the terms of
either:
13 dnl
* the GNU Lesser General
Public License as published by the Free
14 dnl Software Foundation
; either version 3 of the License, or (at your
15 dnl option
) any later version.
19 dnl
* the GNU General
Public License as published by the Free Software
20 dnl Foundation
; either version 2 of the License, or (at your option) any
23 dnl
or both
in parallel
, as here.
25 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
26 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
27 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
30 dnl You should have received copies of the GNU General
Public License
and the
31 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
32 dnl see
https://www.gnu.
org/licenses
/.
34 include(`..
/config.m4
')
37 C cycles/bitpair (1x1 gcd)
42 C mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y);
44 C The entry sequence is designed to expect xsize>1 and hence a modexact
45 C call. This ought to be more common than a 1x1 operation. Our critical
46 C path is thus stripping factors of 2 from y, calling modexact, then
47 C stripping factors of 2 from the x remainder returned.
49 C The common factors of 2 between x and y must be determined using the
50 C original x, not the remainder from the modexact. This is done with
51 C x_orig which is xp[0]. There's plenty of time to do
this while the rest
52 C of the modexact etc is happening.
54 C It
's possible xp[0] is zero. In this case the trailing zeros calculation
55 C popc((x-1)&~x) gives 63, and that's clearly no less than what y will
56 C have
, making min
(x_twos
,y_twos
) == y_twos.
58 C The main
loop consists of transforming x
,y to abs
(x
-y
),min
(x
,y
), and then
59 C stripping factors of
2 from abs
(x
-y
). Those factors of two are
60 C determined from just y
-x
, without the abs
(), since there
's the same
61 C number of trailing zeros on n or -n in twos complement. That makes the
62 C dependent chain 8 cycles deep.
64 C The selection of x-y versus y-x for abs(x-y), and the selection of the
65 C minimum of x and y, is done in parallel with the critical path.
67 C The algorithm takes about 0.68 iterations per bit (two N bit operands) on
68 C average, hence the final 5.8 cycles/bitpair.
72 C An alternate algorithm which didn't strip all twos
, but instead applied
73 C tbit
and predicated extr on x
, and then y
, was attempted. The
loop was
6
74 C cycles
, but the algorithm is an average
1.25 iterations per bitpair for a
75 C total
7.25 c
/bp, which is slower than the current approach.
79 C Perhaps we could do something tricky by extracting a few
high bits
and a
80 C few
low bits from the operands
, and looking up a table which would give a
81 C set of predicates to control some shifts
or subtracts
or whatever. That
82 C could knock off multiple bits per iteration.
84 C The right shifts are a bit of a bottleneck
(shr at
2 or 3 cycles
, or extr
85 C only going down I0
), perhaps it
'd be possible to shift left instead,
86 C using add. That would mean keeping track of the lowest not-yet-zeroed
87 C bit, using some sort of mask.
90 C * Once mod_1_N exists in assembly for Itanium, add conditional calls.
91 C * Call bmod_1 even for n=1 when up[0] >> v0 (like other gcd_1 impls).
92 C * Probably avoid popcnt also outside of loop, instead use ctz_table.
95 .explicit C What does this mean?
97 C HP's assembler requires these declarations for importing mpn_modexact_1c_odd
98 .
global mpn_modexact_1c_odd
99 .
type mpn_modexact_1c_odd
,@function
101 C ctz_table
[n
] is the number of trailing zeros on n
, or MAXSHIFT if n
==0.
104 deflit
(MASK, eval
((m4_lshift
(1,MAXSHIFT
))-1))
108 ALIGN(m4_lshift
(1,MAXSHIFT
)) C
align table to allow using dep
112 ` data1 m4_count_trailing_zeros
(i
)
124 define(y, r34) define(inputs, 3)
126 define(save_pfs, r36)
128 define(x_orig_one, r38)
129 define(y_twos, r39) define(locals, 5)
131 define(out_xsize, r41)
132 define(out_divisor, r42)
133 define(out_carry, r43) define(outputs, 4)
138 ` addp4 r9
= 0, xp_orig define
(xp
,r9
)', C M0
139 ` define(xp,xp_orig)')
140 .save ar.pfs
, save_pfs
141 alloc save_pfs
= ar.pfs
, inputs
, locals
, outputs
, 0 C M2
143 mov save_rp
= b0 C I0
145 add r10 = -1, y C M3 y-1
150 }{.mmi; ld8 x = [xp] C M0 x = xp[0] if no modexact
151 ld8 x_orig = [xp] C M1 orig x for common twos
152 cmp.ne p6,p0 = 1, xsize C I0
153 }{.mmi; andcm y_twos = r10, y C M2 (y-1)&~y
154 mov out_xp = xp_orig C M3
155 mov out_xsize = xsize C I1
157 }{.mmi; mov out_carry = 0 C M0
159 popcnt y_twos = y_twos C I0 y twos
161 }{.mmi; add x_orig_one = -1, x_orig C M0 orig x-1
163 shr.u out_divisor = y, y_twos C I0 y without twos
165 shr.u y = y, y_twos C I1 y without twos
166 (p6) br.call.sptk.many b0 = mpn_modexact_1c_odd C if xsize>1
169 C modexact can
leave x
==0
170 {.mmi; cmp.eq p6,p0 = 0, x C M0 if {xp,xsize} % y
== 0
171 andcm x_orig
= x_orig_one
, x_orig C M1 orig
(x
-1)&~x
172 add r9
= -1, x C I0 x
-1
174 }
{.mmi; andcm r9 = r9, x C M0 (x-1)&~x
176 mov b0 = save_rp C I0
179 popcnt x_orig = x_orig C I0 orig x twos
180 popcnt r9 = r9 C I0 x twos
182 }{.mmi; cmp.lt p7,p0 = x_orig, y_twos C M0 orig x_twos < y_twos
183 addl r22 = @ltoff(ctz_table), r1
184 shr.u x = x, r9 C I0 x odd
187 (p7) mov y_twos = x_orig C M0 common twos
188 add r10 = -1, y C I0 y-1
189 (p6) br.dpnt.few L(done_y) C B0 x%y==0 then result y
192 mov r25
= m4_lshift
(MASK, MAXSHIFT
)
199 .pred.rel
"mutex", p6
,p7
200 {.mmi; (p7) mov y = x
202 dep r21 = r19, r22, 0, MAXSHIFT C concat(table,lowbits)
203 }{.mmi; and r20 = MASK, r19
209 {.mmb; ld1 r16 = [r21]
210 cmp.eq p10,p0 = 0, r20
211 (p10) br.spnt.few.clr L(shift_alot)
219 {.mmi; sub r19 = y, x
224 (p8) br.sptk.few.clr L(top)
227 L
(done_y
): C result is y
228 mov ar.pfs
= save_pfs C I0
229 shl r8
= y
, y_twos C I common factors of
2
234 shr.u x
= x
, MAXSHIFT
236 dep r21
= x
, r22
, 0, MAXSHIFT