/* mpz_n_pow_ui -- mpn raised to ulong.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2001, 2002, 2005, 2012 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */
#include "gmp-impl.h"
#include "longlong.h"


/* Change this to "#define TRACE(x) x" for some traces. */
#define TRACE(x)
/* Use this to test the mul_2 code on a CPU without a native version of
   that routine.  */
#if 0
#define mpn_mul_2  refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2  1
#endif
/* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
   ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
   bsize==2 or >2, but separating that isn't easy because there's shared
   code both before and after (the size calculations and the powers of 2
   handling).

   Alternatives:

   It would work to just use the mpn_mul powering loop for 1 and 2 limb
   bases, but the current separate loop allows mul_1 and mul_2 to be done
   in-place, which might help cache locality a bit.  If mpn_mul was relaxed
   to allow source==dest when vn==1 or 2 then some pointer twiddling might
   let us get the same effect in one loop.

   The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
   form the biggest possible power of b that fits, only the biggest
   power-of-2 power, ie. b^(2^n).  It'd be possible to choose a bigger
   power, perhaps using mp_bases[b].big_base for small b, and thereby get
   better value from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's
   felt that doing so would be more complicated than it's worth, and could
   well end up being a slowdown for small e.  For big e on the other hand
   the algorithm is dominated by mpn_sqr so there wouldn't be much of a
   saving.  The current code can be viewed as simply doing the first few
   steps of the powering in a single or double limb where possible.

   If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
   copy made of b is unnecessary.  We could just use the old alloc'ed block
   and free it at the end.  But arranging this seems like a lot more
   trouble than it's worth.  */
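/* For orientation: the powering loops below all do plain left-to-right
   binary exponentiation.  A minimal single-limb sketch of the same scheme
   (illustrative only, kept disabled like the test block above; the name
   ref_pow_limb and its arguments are hypothetical):  */
#if 0
static mp_limb_t
ref_pow_limb (mp_limb_t x, unsigned long n)
{
  mp_limb_t  r = x;   /* the highest 1 bit of n is covered by this */
  int        cnt, i;
  ASSERT (n != 0);
  count_leading_zeros (cnt, (mp_limb_t) n);
  /* i starts at the bit below the highest 1, as in the loops below */
  for (i = GMP_LIMB_BITS - cnt - 2; i >= 0; i--)
    {
      r *= r;                   /* square at every bit */
      if ((n >> i) & 1)
        r *= x;                 /* extra multiply where the bit is 1 */
    }
  return r;                     /* truncated to one limb, illustration only */
}
#endif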
/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd. */

#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
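/* For example (illustrative, assuming GMP_NUMB_BITS == 32 and no nails):
   GMP_NUMB_HALFMAX is 0xFFFF, and 0xFFFF * 0xFFFF = 0xFFFE0001 still fits
   in one limb, whereas squaring 0x10000 would need 33 bits.  */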
/* The following are for convenience; they update the size and check the
   alloc.  */

#define MPN_SQR(dst, alloc, src, size)          \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr (dst, src, size);                   \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)
#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t  cy;                                      \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)
#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)
#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)
#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)
#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)
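/* A minimal usage sketch for the macros above (illustrative only, not part
   of the build; up, un, scratch and scratch_alloc are hypothetical names).
   Each macro leaves its size argument equal to the true result size, so
   calls can be chained without re-normalizing:  */
#if 0
  mp_size_t  n = un;
  MPN_SQR (scratch, scratch_alloc, up, n);   /* n becomes 2*un or 2*un-1 */
  MPN_MUL_1 (scratch, n, scratch_alloc, CNST_LIMB(3));  /* n grows on carry */
#endif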
/* ralloc and talloc are only wanted for ASSERTs, after the initial space
   allocations.  Avoid writing values to them in a normal build, to ensure
   the compiler lets them go dead.  gcc already figures this out itself
   actually.  */

#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)
void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr       rp;
  mp_size_t    rtwos_limbs, ralloc, rsize;
  int          rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t    blimb, rl;
  mp_bitcnt_t  rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t    blimb_low, rl_high;
#else
  mp_limb_t    b_twolimbs[2];
#endif
  TMP_DECL;
  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
                 PTR(r), bp, bsize, e, e);
         mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ALLOC(r), bp, ABS(bsize)));
  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      PTR(r)[0] = 1;
      SIZ(r) = 1;
      return;
    }
  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }
  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));
  r_bp_overlap = (PTR(r) == bp);
  /* Strip low zero limbs from b. */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));
  /* Strip low zero bits from b. */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;

  rtwos_bits = e * btwos;
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
                 btwos, rtwos_limbs, rtwos_bits));
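  /* Worked example (illustrative, assuming GMP_NUMB_BITS == 32): for
     b == 0b1100 we get btwos == 2, so e == 20 contributes e*btwos == 40
     low zero bits to the result: 1 whole limb added to rtwos_limbs and
     rtwos_bits == 8 left for the final lshift or merged multiplier.  */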
  TMP_MARK;

  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
         but if e is small then we might reach e==0 and the whole b^e in rl.
         Notice this code works when blimb==1 too, reaching e==0.  */
      while (blimb <= GMP_NUMB_HALFMAX)
        {
          TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
                         e, blimb, rl));
          ASSERT (e != 0);
          if ((e & 1) != 0)
            rl *= blimb;
          e >>= 1;
          if (e == 0)
            goto got_rl;
          blimb *= blimb;
        }
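      /* For example (illustrative, 64-bit limb): b == 3, e == 10 walks
         blimb through 3, 9, 81, 6561 while rl collects the factors for
         the 1 bits of e, reaching e == 0 with rl == 3^10 == 59049.  */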
#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
        {
          umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
          rl >>= GMP_NAIL_BITS;
        }
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
                     e, blimb, blimb_low, rl_high, rl));
      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
         final mul_1 or mul_2 rather than a separate lshift.
         - rl_high:rl mustn't be 1 (since then there's no final mul)
         - rl_high mustn't overflow
         - rl_high mustn't change to non-zero, since mul_1+lshift is
           probably faster than mul_2 (FIXME: is this true?)  */
      if (rtwos_bits != 0
          && ! (rl_high == 0 && rl == 1)
          && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
            | (rl >> (GMP_NUMB_BITS-rtwos_bits));
          if (! (rl_high == 0 && new_rl_high != 0))
            {
              rl_high = new_rl_high;
              rl <<= rtwos_bits;
              rtwos_bits = 0;
              TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
                             rl_high, rl));
            }
        }
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
                     e, blimb, rl));
      /* Combine left-over rtwos_bits into rl to be handled by the final
         mul_1 rather than a separate lshift.
         - rl mustn't be 1 (since then there's no final mul)
         - rl mustn't overflow  */
      if (rtwos_bits != 0
          && rl != 1
          && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          rl <<= rtwos_bits;
          rtwos_bits = 0;
          TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
        }
#endif
    }
  else if (bsize == 2)
    {
      mp_limb_t  bsecond = bp[1];
      if (btwos != 0)
        blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
        {
          /* Two limbs became one after rshift. */
          bsize = 1;
          goto bsize_1;
        }

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      if (r_bp_overlap || btwos != 0)
        {
          mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
          MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
          bp = tp;
          TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
        }
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld  ", bsize);
             mpn_trace ("b", bp, bsize));
    }
  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */
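  /* Worked instance of the formula below (illustrative, assuming
     GMP_NUMB_BITS == 64 and no nails): bsize == 3, cnt == 10, e == 100
     gives (3*64 - 10 + 0) * 100 / 64 + 5 == 284 + 5 == 289 limbs.  */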
  count_leading_zeros (cnt, blimb);
  ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
                 ralloc, bsize, blimb, cnt));
  rp = MPZ_REALLOC (r, ralloc + rtwos_limbs);
  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;
  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
         start. */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr     tp;
      mp_size_t  talloc;
      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
         low bit of e is zero, tp only has to hold the second last power
         step, which is half the size of the final result.  There's no need
         to round up the divide by 2, since ralloc includes a +2 for rl
         which is not needed by tp.  In the mpn_mul loop when the low bit of
         e is 1, tp must hold nearly the full result, so just size it the
         same as rp.  */
      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
        talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
        talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);
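      /* For instance (illustrative): with e == 6 the low bit is 0, so the
         final step is a squaring of the second-last power and tp gets away
         with ralloc/2 limbs; with bsize > 2 and e == 5 the last step is an
         mpn_mul into tp, so tp is sized like rp.  */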
      /* Go from high to low over the bits of e, starting with i pointing at
         the bit below the highest 1 (which will mean i==-1 if e==1).  */
      count_leading_zeros (cnt, (mp_limb_t) e);
      i = GMP_LIMB_BITS - cnt - 2;
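      /* For example (illustrative): e == 13 is 0b1101, so the highest 1 is
         bit 3 and i starts at 2; the loops below square for bits 2,1,0 and
         multiply in b for the set bits 2 and 0, giving b^13.  */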
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
        {
          mp_limb_t  mult[2];

          /* Any bsize==1 will have been powered above to be two limbs. */
          ASSERT (bsize == 2);
          ASSERT (blimb != 0);

          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb_low;
          rp[1] = blimb;
          rsize = 2;

          mult[0] = blimb_low;
          mult[1] = blimb;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_2 (rp, rsize, ralloc, mult);
            }

          TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
          if (rl_high != 0)
            {
              mult[0] = rl;
              mult[1] = rl_high;
              MPN_MUL_2 (rp, rsize, ralloc, mult);
            }
          else if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#else
      if (bsize == 1)
        {
          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb;
          rsize = 1;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_1 (rp, rsize, ralloc, blimb);
            }

          TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
          if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#endif
      else
        {
          int  parity;
          /* Arrange the final result ends up in r, not in the temp space */
          ULONG_PARITY (parity, e);
          if (((parity ^ i) & 1) != 0)
            SWAP_RP_TP;

          MPN_COPY (rp, bp, bsize);
          rsize = bsize;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                {
                  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
                  SWAP_RP_TP;
                }
            }
        }
    }
  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE;
  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }
  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}