/* mpz_n_pow_ui -- mpn raised to ulong.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2001, 2002, 2005, 2012 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"


/* Change this to "#define TRACE(x) x" for some traces. */
#define TRACE(x)

/* Use this to test the mul_2 code on a CPU without a native version of that
   routine.  */
#if 0
#define mpn_mul_2  refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2  1
#endif

/* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
   ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
   bsize==2 or >2, but separating that isn't easy because there's shared
   code both before and after (the size calculations and the powers of 2
   handling).

   Alternatives:

   It would work to just use the mpn_mul powering loop for 1 and 2 limb
   bases, but the current separate loop allows mul_1 and mul_2 to be done
   in-place, which might help cache locality a bit.  If mpn_mul was relaxed
   to allow source==dest when vn==1 or 2 then some pointer twiddling might
   let us get the same effect in one loop.

   The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
   form the biggest possible power of b that fits, only the biggest
   power-of-2 power, ie. b^(2^n).  It'd be possible to choose a bigger
   power, perhaps using mp_bases[b].big_base for small b, and thereby get
   better value from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's
   felt that doing so would be more complicated than it's worth, and could
   well end up being a slowdown for small e.  For big e on the other hand
   the algorithm is dominated by mpn_sqr, so there wouldn't be much of a
   saving.  The current code can be viewed as simply doing the first few
   steps of the powering in a single or double limb where possible.

   If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
   copy made of b is unnecessary.  We could just use the old alloc'ed block
   and free it at the end.  But arranging this seems like a lot more trouble
   than it's worth.  */
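
/* Not part of the build: a minimal sketch of the left-to-right binary
   powering scheme that the loops further down implement, written in plain
   unsigned arithmetic and assuming b^e fits in an unsigned long.  The real
   code below replaces the two multiplies with mpn_sqr and
   mpn_mul_1/mpn_mul_2/mpn_mul, and tracks limb sizes as it goes.  */
#if 0
static unsigned long
n_pow_ui_sketch (unsigned long b, unsigned long e)
{
  unsigned long  r = b;
  int            cnt, i;
  ASSERT (e != 0);
  count_leading_zeros (cnt, (mp_limb_t) e);
  for (i = GMP_LIMB_BITS - cnt - 2; i >= 0; i--)
    {
      r *= r;                   /* square for every bit of e */
      if ((e >> i) & 1)
        r *= b;                 /* extra multiply where the bit is 1 */
    }
  return r;
}
#endif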

/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd.  */

#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
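/* For example (illustrative, nail-free 32-bit limb): GMP_NUMB_HALFMAX is
   0xFFFF, and 0xFFFF*0xFFFF = 0xFFFE0001 still fits in a single limb.  */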

/* The following are for convenience, they update the size and check the
   alloc.  */

#define MPN_SQR(dst, alloc, src, size)          \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr (dst, src, size);                   \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)

#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t  cy;                                      \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)

#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)

/* ralloc and talloc are only wanted for ASSERTs, after the initial space
   allocations.  Avoid writing values to them in a normal build, to ensure
   the compiler lets them go dead.  gcc already figures this out itself
   actually.  */

#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)


void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr       rp;
  mp_size_t    rtwos_limbs, ralloc, rsize;
  int          rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t    blimb, rl;
  mp_bitcnt_t  rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t    blimb_low, rl_high;
#else
  mp_limb_t    b_twolimbs[2];
#endif
  TMP_DECL;

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
                 PTR(r), bp, bsize, e, e);
         mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ALLOC(r), bp, ABS(bsize)));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      PTR(r)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b. */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--;  ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b. */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;
  rtwos_bits = e * btwos;
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
                 btwos, rtwos_limbs, rtwos_bits));
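
  /* Illustrative example: b = 0x50 = 5*2^4 with e = 3 gives btwos = 4, so
     rtwos_bits = 12 and (for a limb of 32 or more bits) rtwos_limbs stays 0;
     the factor 2^12 is later either merged into rl or applied by the final
     MPN_LSHIFT.  */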

  TMP_MARK;

  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
         but if e is small then we might reach e==0 and have the whole b^e
         in rl.  Notice this code works when blimb==1 too, reaching e==0.  */

      while (blimb <= GMP_NUMB_HALFMAX)
        {
          TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
                         e, blimb, rl));
          ASSERT (e != 0);
          if ((e & 1) != 0)
            rl *= blimb;
          e >>= 1;
          if (e == 0)
            goto got_rl;
          blimb *= blimb;
        }

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
        {
          umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
          rl >>= GMP_NAIL_BITS;
        }
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
                     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
         final mul_1 or mul_2 rather than a separate lshift.
         - rl_high:rl mustn't be 1 (since then there's no final mul)
         - rl_high mustn't overflow
         - rl_high mustn't change to non-zero, since mul_1+lshift is
           probably faster than mul_2 (FIXME: is this true?)  */

      if (rtwos_bits != 0
          && ! (rl_high == 0 && rl == 1)
          && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
            | (rl >> (GMP_NUMB_BITS-rtwos_bits));
          if (! (rl_high == 0 && new_rl_high != 0))
            {
              rl_high = new_rl_high;
              rl <<= rtwos_bits;
              rtwos_bits = 0;
              TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
                             rl_high, rl));
            }
        }
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
         mul_1 rather than a separate lshift.
         - rl mustn't be 1 (since then there's no final mul)
         - rl mustn't overflow  */

      if (rtwos_bits != 0
          && rl != 1
          && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          rl <<= rtwos_bits;
          rtwos_bits = 0;
          TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
        }
#endif
    }
  else if (bsize == 2)
    {
      mp_limb_t  bsecond = bp[1];
      if (btwos != 0)
        blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
        {
          /* Two limbs became one after rshift. */
          bsize = 1;
          goto bsize_1;
        }

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      if (r_bp_overlap || btwos != 0)
        {
          mp_ptr  tp = TMP_ALLOC_LIMBS (bsize);
          MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
          bp = tp;
          TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
        }
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld ", bsize);
             mpn_trace ("b", bp, bsize));
    }

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */

  ASSERT (blimb != 0);
  count_leading_zeros (cnt, blimb);

  ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
                 ralloc, bsize, blimb, cnt));
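
  /* Illustrative example (nail-free 64-bit limb): bsize==1, blimb==5
     (cnt==61, so 3 significant bits) and e==100 give
     ralloc = 3*100/64 + 5 = 9 limbs, comfortably above the 4 limbs that
     5^100 (about 233 bits) actually needs.  */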
  rp = MPZ_REALLOC (r, ralloc + rtwos_limbs);

  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
         start. */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr     tp;
      mp_size_t  talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
         low bit of e is zero, tp only has to hold the second last power
         step, which is half the size of the final result.  There's no need
         to round up the divide by 2, since ralloc includes a +2 for rl
         which is not needed by tp.  In the mpn_mul loop when the low bit of
         e is 1, tp must hold nearly the full result, so just size it the
         same as rp.  */

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
        talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
        talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing at
         the bit below the highest 1 (which will mean i==-1 if e==1).  */
      count_leading_zeros (cnt, (mp_limb_t) e);
      i = GMP_LIMB_BITS - cnt - 2;
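
      /* E.g. e==6 (binary 110) gives i==1: starting from the base, the loops
         below square and multiply over bits 1 and 0, computing
         (b^2 * b)^2 = b^6.  */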

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
        {
          mp_limb_t  mult[2];

          /* Any bsize==1 will have been powered above to be two limbs. */
          ASSERT (bsize == 2);
          ASSERT (blimb != 0);

          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb_low;
          rp[1] = blimb;
          rsize = 2;

          mult[0] = blimb_low;
          mult[1] = blimb;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_2 (rp, rsize, ralloc, mult);
            }

          TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
          if (rl_high != 0)
            {
              mult[0] = rl;
              mult[1] = rl_high;
              MPN_MUL_2 (rp, rsize, ralloc, mult);
            }
          else if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#else
      if (bsize == 1)
        {
          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb;
          rsize = 1;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_1 (rp, rsize, ralloc, blimb);
            }

          TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
          if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#endif
      else
        {
          int  parity;

          /* Arrange the final result ends up in r, not in the temp space */
          ULONG_PARITY (parity, e);
          if (((parity ^ i) & 1) != 0)
            SWAP_RP_TP;

          MPN_COPY (rp, bp, bsize);
          rsize = bsize;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                {
                  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
                  SWAP_RP_TP;
                }
            }
        }
    }

  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE;

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}