/* Extracted from the LuaTeX source tree:
   source/libs/gmp/gmp-src/mpn/sparc64/mod_1.c  */
/* UltraSPARC 64 mpn_mod_1 -- mpn by limb remainder.

Copyright 1991, 1993, 1994, 1999-2001, 2003, 2010 Free Software Foundation,
Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#include "mpn/sparc64/sparc64.h"
/*                 64-bit divisor   32-bit divisor
                    cycles/limb      cycles/limb
                     (approx)         (approx)
   Ultrasparc 2i:      160              120
*/
/* 32-bit divisors are treated in special case code.  This requires 4 mulx
   per limb instead of 8 in the general case.

   For big endian systems we need HALF_ENDIAN_ADJ included in the src[i]
   addressing, to get the two halves of each limb read in the correct order.
   This is kept in an adj variable.  Doing that measures about 6 c/l faster
   than just writing HALF_ENDIAN_ADJ(i) in the loop.  The latter shouldn't
   be 6 cycles worth of work, but perhaps it doesn't schedule well (on gcc
   3.2.1 at least).

   A simple udivx/umulx loop for the 32-bit case was attempted for small
   sizes, but at size==2 it was only about the same speed and at size==3 was
   slower.  */
60 static mp_limb_t
61 mpn_mod_1_anynorm (mp_srcptr src_limbptr, mp_size_t size_limbs, mp_limb_t d_limb)
63 int norm, norm_rshift;
64 mp_limb_t src_high_limb;
65 mp_size_t i;
67 ASSERT (size_limbs >= 0);
68 ASSERT (d_limb != 0);
70 if (UNLIKELY (size_limbs == 0))
71 return 0;
73 src_high_limb = src_limbptr[size_limbs-1];
75 /* udivx is good for size==1, and no need to bother checking limb<divisor,
76 since if that's likely the caller should check */
77 if (UNLIKELY (size_limbs == 1))
78 return src_high_limb % d_limb;
80 if (d_limb <= CNST_LIMB(0xFFFFFFFF))
82 unsigned *src, n1, n0, r, dummy_q, nshift, norm_rmask;
83 mp_size_t size, adj;
84 mp_limb_t dinv_limb;
86 size = 2 * size_limbs; /* halfwords */
87 src = (unsigned *) src_limbptr;
89 /* prospective initial remainder, if < d */
90 r = src_high_limb >> 32;
92 /* If the length of the source is uniformly distributed, then there's
93 a 50% chance of the high 32-bits being zero, which we can skip. */
94 if (r == 0)
96 r = (unsigned) src_high_limb;
97 size--;
98 ASSERT (size > 0); /* because always even */
101 /* Skip a division if high < divisor. Having the test here before
102 normalizing will still skip as often as possible. */
103 if (r < d_limb)
105 size--;
106 ASSERT (size > 0); /* because size==1 handled above */
108 else
109 r = 0;
111 count_leading_zeros_32 (norm, d_limb);
112 norm -= 32;
113 d_limb <<= norm;
115 norm_rshift = 32 - norm;
116 norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF);
117 i = size-1;
118 adj = HALF_ENDIAN_ADJ (i);
119 n1 = src [i + adj];
120 r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask);
122 invert_half_limb (dinv_limb, d_limb);
123 adj = -adj;
125 for (i--; i >= 0; i--)
127 n0 = src [i + adj];
128 adj = -adj;
129 nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
130 udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb);
131 n1 = n0;
134 /* same as loop, but without n0 */
135 nshift = n1 << norm;
136 udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb);
138 ASSERT ((r & ((1 << norm) - 1)) == 0);
139 return r >> norm;
141 else
143 mp_srcptr src;
144 mp_size_t size;
145 mp_limb_t n1, n0, r, dinv, dummy_q, nshift, norm_rmask;
147 src = src_limbptr;
148 size = size_limbs;
149 r = src_high_limb; /* initial remainder */
151 /* Skip a division if high < divisor. Having the test here before
152 normalizing will still skip as often as possible. */
153 if (r < d_limb)
155 size--;
156 ASSERT (size > 0); /* because size==1 handled above */
158 else
159 r = 0;
161 count_leading_zeros (norm, d_limb);
162 d_limb <<= norm;
164 norm_rshift = GMP_LIMB_BITS - norm;
165 norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF);
167 src += size;
168 n1 = *--src;
169 r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask);
171 invert_limb (dinv, d_limb);
173 for (i = size-2; i >= 0; i--)
175 n0 = *--src;
176 nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
177 udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv);
178 n1 = n0;
181 /* same as loop, but without n0 */
182 nshift = n1 << norm;
183 udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv);
185 ASSERT ((r & ((CNST_LIMB(1) << norm) - 1)) == 0);
186 return r >> norm;
190 mp_limb_t
191 mpn_mod_1 (mp_srcptr ap, mp_size_t n, mp_limb_t b)
193 ASSERT (n >= 0);
194 ASSERT (b != 0);
196 /* Should this be handled at all? Rely on callers? Note un==0 is currently
197 required by mpz/fdiv_r_ui.c and possibly other places. */
198 if (n == 0)
199 return 0;
201 if (UNLIKELY ((b & GMP_NUMB_HIGHBIT) != 0))
203 if (BELOW_THRESHOLD (n, MOD_1N_TO_MOD_1_1_THRESHOLD))
205 return mpn_mod_1_anynorm (ap, n, b);
207 else
209 mp_limb_t pre[4];
210 mpn_mod_1_1p_cps (pre, b);
211 return mpn_mod_1_1p (ap, n, b, pre);
214 else
216 if (BELOW_THRESHOLD (n, MOD_1U_TO_MOD_1_1_THRESHOLD))
218 return mpn_mod_1_anynorm (ap, n, b);
220 else if (BELOW_THRESHOLD (n, MOD_1_1_TO_MOD_1_2_THRESHOLD))
222 mp_limb_t pre[4];
223 mpn_mod_1_1p_cps (pre, b);
224 return mpn_mod_1_1p (ap, n, b << pre[1], pre);
226 else if (BELOW_THRESHOLD (n, MOD_1_2_TO_MOD_1_4_THRESHOLD) || UNLIKELY (b > GMP_NUMB_MASK / 4))
228 mp_limb_t pre[5];
229 mpn_mod_1s_2p_cps (pre, b);
230 return mpn_mod_1s_2p (ap, n, b << pre[1], pre);
232 else
234 mp_limb_t pre[7];
235 mpn_mod_1s_4p_cps (pre, b);
236 return mpn_mod_1s_4p (ap, n, b << pre[1], pre);