beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / ia64 / gcd_1.asm
blob3afabd706fa0bc6c221134cce7f256cc825674be
1 dnl Itanium-2 mpn_gcd_1 -- mpn by 1 gcd.
3 dnl Contributed to the GNU project by Kevin Ryde, innerloop by Torbjorn
4 dnl Granlund.
6 dnl Copyright 2002-2005, 2012, 2013, 2015 Free Software Foundation, Inc.
8 dnl This file is part of the GNU MP Library.
9 dnl
10 dnl The GNU MP Library is free software; you can redistribute it and/or modify
11 dnl it under the terms of either:
12 dnl
13 dnl * the GNU Lesser General Public License as published by the Free
14 dnl Software Foundation; either version 3 of the License, or (at your
15 dnl option) any later version.
16 dnl
17 dnl or
18 dnl
19 dnl * the GNU General Public License as published by the Free Software
20 dnl Foundation; either version 2 of the License, or (at your option) any
21 dnl later version.
22 dnl
23 dnl or both in parallel, as here.
24 dnl
25 dnl The GNU MP Library is distributed in the hope that it will be useful, but
26 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
28 dnl for more details.
29 dnl
30 dnl You should have received copies of the GNU General Public License and the
31 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
32 dnl see https://www.gnu.org/licenses/.
34 include(`../config.m4')
37 C cycles/bitpair (1x1 gcd)
38 C Itanium: ?
39 C Itanium 2: 5.1
42 C mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y);
44 C The entry sequence is designed to expect xsize>1 and hence a modexact
45 C call. This ought to be more common than a 1x1 operation. Our critical
46 C path is thus stripping factors of 2 from y, calling modexact, then
47 C stripping factors of 2 from the x remainder returned.
49 C The common factors of 2 between x and y must be determined using the
50 C original x, not the remainder from the modexact. This is done with
51 C x_orig which is xp[0]. There's plenty of time to do this while the rest
52 C of the modexact etc is happening.
54 C It's possible xp[0] is zero. In this case the trailing zeros calculation
55 C popc((x-1)&~x) gives 63, and that's clearly no less than what y will
56 C have, making min(x_twos,y_twos) == y_twos.
58 C The main loop consists of transforming x,y to abs(x-y),min(x,y), and then
59 C stripping factors of 2 from abs(x-y). Those factors of two are
60 C determined from just y-x, without the abs(), since there's the same
61 C number of trailing zeros on n or -n in twos complement. That makes the
62 C dependent chain 8 cycles deep.
64 C The selection of x-y versus y-x for abs(x-y), and the selection of the
65 C minimum of x and y, is done in parallel with the critical path.
67 C The algorithm takes about 0.68 iterations per bit (two N bit operands) on
68 C average, hence the final 5.8 cycles/bitpair.
70 C Not done:
72 C An alternate algorithm which didn't strip all twos, but instead applied
73 C tbit and predicated extr on x, and then y, was attempted. The loop was 6
74 C cycles, but the algorithm is an average 1.25 iterations per bitpair for a
75 C total 7.25 c/bp, which is slower than the current approach.
77 C Alternatives:
79 C Perhaps we could do something tricky by extracting a few high bits and a
80 C few low bits from the operands, and looking up a table which would give a
81 C set of predicates to control some shifts or subtracts or whatever. That
82 C could knock off multiple bits per iteration.
84 C The right shifts are a bit of a bottleneck (shr at 2 or 3 cycles, or extr
85 C only going down I0), perhaps it'd be possible to shift left instead,
86 C using add. That would mean keeping track of the lowest not-yet-zeroed
87 C bit, using some sort of mask.
89 C TODO:
90 C * Once mod_1_N exists in assembly for Itanium, add conditional calls.
91 C * Call bmod_1 even for n=1 when up[0] >> v0 (like other gcd_1 impls).
92 C * Probably avoid popcnt also outside of loop, instead use ctz_table.
94 ASM_START()
95 .explicit C What does this mean?
97 C HP's assembler requires these declarations for importing mpn_modexact_1c_odd
98 .global mpn_modexact_1c_odd
99 .type mpn_modexact_1c_odd,@function
101 C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
103 deflit(MAXSHIFT, 7)
104 deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
106 C .section ".rodata"
107 .rodata
108 ALIGN(m4_lshift(1,MAXSHIFT)) C align table to allow using dep
109 ctz_table:
110 data1 MAXSHIFT
111 forloop(i,1,MASK,
112 ` data1 m4_count_trailing_zeros(i)
115 PROLOGUE(mpn_gcd_1)
117 C r32 xp
118 C r33 xsize
119 C r34 y
121 define(x, r8)
122 define(xp_orig, r32)
123 define(xsize, r33)
124 define(y, r34) define(inputs, 3)
125 define(save_rp, r35)
126 define(save_pfs, r36)
127 define(x_orig, r37)
128 define(x_orig_one, r38)
129 define(y_twos, r39) define(locals, 5)
130 define(out_xp, r40)
131 define(out_xsize, r41)
132 define(out_divisor, r42)
133 define(out_carry, r43) define(outputs, 4)
135 .prologue
136 {.mmi;
137 ifdef(`HAVE_ABI_32',
138 ` addp4 r9 = 0, xp_orig define(xp,r9)', C M0
139 ` define(xp,xp_orig)')
140 .save ar.pfs, save_pfs
141 alloc save_pfs = ar.pfs, inputs, locals, outputs, 0 C M2
142 .save rp, save_rp
143 mov save_rp = b0 C I0
144 }{.mbb; .body
145 add r10 = -1, y C M3 y-1
146 nop.b 0 C B0
147 nop.b 0 C B1
150 }{.mmi; ld8 x = [xp] C M0 x = xp[0] if no modexact
151 ld8 x_orig = [xp] C M1 orig x for common twos
152 cmp.ne p6,p0 = 1, xsize C I0
153 }{.mmi; andcm y_twos = r10, y C M2 (y-1)&~y
154 mov out_xp = xp_orig C M3
155 mov out_xsize = xsize C I1
157 }{.mmi; mov out_carry = 0 C M0
158 nop.m 0 C M1
159 popcnt y_twos = y_twos C I0 y twos
161 }{.mmi; add x_orig_one = -1, x_orig C M0 orig x-1
162 nop.m 0 C M1
163 shr.u out_divisor = y, y_twos C I0 y without twos
164 }{.mib; nop.m 0 C M2
165 shr.u y = y, y_twos C I1 y without twos
166 (p6) br.call.sptk.many b0 = mpn_modexact_1c_odd C if xsize>1
169 C modexact can leave x==0
170 {.mmi; cmp.eq p6,p0 = 0, x C M0 if {xp,xsize} % y == 0
171 andcm x_orig = x_orig_one, x_orig C M1 orig (x-1)&~x
172 add r9 = -1, x C I0 x-1
174 }{.mmi; andcm r9 = r9, x C M0 (x-1)&~x
175 nop.m 0 C M1
176 mov b0 = save_rp C I0
178 }{.mii; nop.m 0 C M0
179 popcnt x_orig = x_orig C I0 orig x twos
180 popcnt r9 = r9 C I0 x twos
182 }{.mmi; cmp.lt p7,p0 = x_orig, y_twos C M0 orig x_twos < y_twos
183 addl r22 = @ltoff(ctz_table), r1
184 shr.u x = x, r9 C I0 x odd
186 }{.mib;
187 (p7) mov y_twos = x_orig C M0 common twos
188 add r10 = -1, y C I0 y-1
189 (p6) br.dpnt.few L(done_y) C B0 x%y==0 then result y
192 mov r25 = m4_lshift(MASK, MAXSHIFT)
193 ld8 r22 = [r22]
194 br L(ent)
197 ALIGN(32)
198 L(top):
199 .pred.rel "mutex", p6,p7
200 {.mmi; (p7) mov y = x
201 (p6) sub x = x, y
202 dep r21 = r19, r22, 0, MAXSHIFT C concat(table,lowbits)
203 }{.mmi; and r20 = MASK, r19
204 (p7) mov x = r19
205 nop 0
208 L(mid):
209 {.mmb; ld1 r16 = [r21]
210 cmp.eq p10,p0 = 0, r20
211 (p10) br.spnt.few.clr L(shift_alot)
213 }{.mmi; nop 0
214 nop 0
215 shr.u x = x, r16
218 L(ent):
219 {.mmi; sub r19 = y, x
220 cmp.gtu p6,p7 = x, y
221 cmp.ne p8,p0 = x, y
222 }{.mmb; nop 0
223 nop 0
224 (p8) br.sptk.few.clr L(top)
227 L(done_y): C result is y
228 mov ar.pfs = save_pfs C I0
229 shl r8 = y, y_twos C I common factors of 2
230 br.ret.sptk.many b0
232 L(shift_alot):
233 and r20 = x, r25
234 shr.u x = x, MAXSHIFT
236 dep r21 = x, r22, 0, MAXSHIFT
237 br L(mid)
238 EPILOGUE()