1 /* mpz_bin_uiui - compute n over k.
3 Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
5 Copyright 2010-2012 Free Software Foundation, Inc.
7 This file is part of the GNU MP Library.
9 The GNU MP Library is free software; you can redistribute it and/or modify
10 it under the terms of either:
12 * the GNU Lesser General Public License as published by the Free
13 Software Foundation; either version 3 of the License, or (at your
14 option) any later version.
18 * the GNU General Public License as published by the Free Software
19 Foundation; either version 2 of the License, or (at your option) any
22 or both in parallel, as here.
24 The GNU MP Library is distributed in the hope that it will be useful, but
25 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
29 You should have received copies of the GNU General Public License and the
30 GNU Lesser General Public License along with the GNU MP Library. If not,
31 see https://www.gnu.org/licenses/. */
37 #ifndef BIN_GOETGHELUCK_THRESHOLD
38 #define BIN_GOETGHELUCK_THRESHOLD 1000
40 #ifndef BIN_UIUI_ENABLE_SMALLDC
41 #define BIN_UIUI_ENABLE_SMALLDC 1
43 #ifndef BIN_UIUI_RECURSIVE_SMALLDC
44 #define BIN_UIUI_RECURSIVE_SMALLDC (GMP_NUMB_BITS > 32)
49 Accumulate chunks of factors first limb-by-limb (using one of mul0-mul8)
50 which are then accumulated into mpn numbers. The first inner loop
51 accumulates divisor factors, the 2nd inner loop accumulates exactly the same
52 number of dividend factors. We avoid accumulating more for the divisor,
53 even with its smaller factors, since we else cannot guarantee divisibility.
55 Since we know each division will yield an integer, we compute the quotient
56 using Hensel norm: If the quotient is limited by 2^t, we compute A / B mod
61 (1) An obvious improvement to this code would be to compute mod 2^t
62 everywhere. Unfortunately, we cannot determine t beforehand, unless we
63 invoke some approximation, such as Stirling's formula. Of course, we don't
64 need t to be tight. However, it is not clear that this would help much,
65 our numbers are kept reasonably small already.
67 (2) Compute nmax/kmax semi-accurately, without scalar division or a loop.
68 Extracting the 3 msb, then doing a table lookup using cnt*8+msb as index,
69 would make it both reasonably accurate and fast. (We could use a table
70 stored into a limb, perhaps.) The table should take the removed factors of
71 2 into account (those done on-the-fly in mulN).
73 (3) The first time in the loop we compute the odd part of a
74 factorial in kp, we might use oddfac_1 for this task.
77 /* This threshold determines how large divisor to accumulate before we call
78 bdiv. Perhaps we should never call bdiv, and accumulate all we are told,
79 since we are just basecase code anyway? Presumably, this depends on the
80 relative speed of the asymptotically fast code and this code. */
81 #define SOME_THRESHOLD 20
83 /* Multiply-into-limb functions. These remove factors of 2 on-the-fly. FIXME:
84 All versions of MAXFACS don't take this 2 removal into account now, meaning
85 that then, shifting just adds some overhead. (We remove factors from the
86 completed limb anyway.) */
97 /* We need to shift before multiplying, to avoid an overflow. */
98 mp_limb_t m01
= (m
| 1) * ((m
+ 1) >> 1);
105 mp_limb_t m01
= (m
+ 0) * (m
+ 1) >> 1;
106 mp_limb_t m2
= (m
+ 2);
113 mp_limb_t m01
= (m
+ 0) * (m
+ 1) >> 1;
114 mp_limb_t m23
= (m
+ 2) * (m
+ 3) >> 1;
121 mp_limb_t m012
= (m
+ 0) * (m
+ 1) * (m
+ 2) >> 1;
122 mp_limb_t m34
= (m
+ 3) * (m
+ 4) >> 1;
129 mp_limb_t m01
= (m
+ 0) * (m
+ 1);
130 mp_limb_t m23
= (m
+ 2) * (m
+ 3);
131 mp_limb_t m45
= (m
+ 4) * (m
+ 5) >> 1;
132 mp_limb_t m0123
= m01
* m23
>> 3;
139 mp_limb_t m01
= (m
+ 0) * (m
+ 1);
140 mp_limb_t m23
= (m
+ 2) * (m
+ 3);
141 mp_limb_t m456
= (m
+ 4) * (m
+ 5) * (m
+ 6) >> 1;
142 mp_limb_t m0123
= m01
* m23
>> 3;
149 mp_limb_t m01
= (m
+ 0) * (m
+ 1);
150 mp_limb_t m23
= (m
+ 2) * (m
+ 3);
151 mp_limb_t m45
= (m
+ 4) * (m
+ 5);
152 mp_limb_t m67
= (m
+ 6) * (m
+ 7);
153 mp_limb_t m0123
= m01
* m23
>> 3;
154 mp_limb_t m4567
= m45
* m67
>> 3;
155 return m0123
* m4567
;
158 typedef mp_limb_t (* mulfunc_t
) (mp_limb_t
);
160 static const mulfunc_t mulfunc
[] = {mul1
,mul2
,mul3
,mul4
,mul5
,mul6
,mul7
,mul8
};
161 #define M (numberof(mulfunc))
163 /* Number of factors-of-2 removed by the corresponding mulN function. */
164 static const unsigned char tcnttab
[] = {0, 1, 1, 2, 2, 4, 4, 6};
167 /* This variant is inaccurate but share the code with other functions. */
168 #define MAXFACS(max,l) \
170 (max) = log_n_max (l); \
174 /* This variant is exact(?) but uses a loop. It takes the 2 removal
175 of mulN into account. */
176 static const unsigned long ftab
[] =
177 #if GMP_NUMB_BITS == 64
178 /* 1 to 8 factors per iteration */
179 {CNST_LIMB(0xffffffffffffffff),CNST_LIMB(0x100000000),0x32cbfe,0x16a0b,0x24c4,0xa16,0x34b,0x1b2 /*,0xdf,0x8d */};
181 #if GMP_NUMB_BITS == 32
182 /* 1 to 7 factors per iteration */
183 {0xffffffff,0x10000,0x801,0x16b,0x71,0x42,0x26 /* ,0x1e */};
186 #define MAXFACS(max,l) \
189 for (__i = numberof (ftab) - 1; l > ftab[__i]; __i--) \
195 /* Entry i contains (i!/2^t)^(-1) where t is chosen such that the parenthesis
196 is an odd integer. */
197 static const mp_limb_t facinv
[] = { ONE_LIMB_ODD_FACTORIAL_INVERSES_TABLE
};
200 mpz_bdiv_bin_uiui (mpz_ptr r
, unsigned long int n
, unsigned long int k
)
202 int nmax
, kmax
, nmaxnow
, numfac
;
204 mp_size_t nn
, kn
, alloc
;
205 mp_limb_t i
, j
, t
, iii
, jjj
, cy
, dinv
;
206 mp_bitcnt_t i2cnt
, j2cnt
;
211 ASSERT (k
> ODD_FACTORIAL_TABLE_LIMIT
);
214 maxn
= 1 + n
/ GMP_NUMB_BITS
; /* absolutely largest result size (limbs) */
216 /* FIXME: This allocation might be insufficient, but is usually way too
218 alloc
= SOME_THRESHOLD
- 1 + MAX (3 * maxn
/ 2, SOME_THRESHOLD
);
219 alloc
= MIN (alloc
, k
) + 1;
220 np
= TMP_ALLOC_LIMBS (alloc
);
221 kp
= TMP_ALLOC_LIMBS (SOME_THRESHOLD
+ 1);
233 i2cnt
= 0; /* total low zeros in dividend */
234 j2cnt
= __gmp_fac2cnt_table
[ODD_FACTORIAL_TABLE_LIMIT
/ 2 - 1];
235 /* total low zeros in divisor */
238 j
= ODD_FACTORIAL_TABLE_LIMIT
+ 1;
239 jjj
= ODD_FACTORIAL_TABLE_MAX
;
240 ASSERT (__gmp_oddfac_table
[ODD_FACTORIAL_TABLE_LIMIT
] == ODD_FACTORIAL_TABLE_MAX
);
244 kp
[0] = jjj
; /* store new factors */
247 kmax
= MIN (kmax
, t
);
249 while (kmax
!= 0 && kn
< SOME_THRESHOLD
)
251 jjj
= mulfunc
[kmax
- 1] (j
);
252 j
+= kmax
; /* number of factors used */
253 count_trailing_zeros (cnt
, jjj
); /* count low zeros */
254 jjj
>>= cnt
; /* remove remaining low zeros */
255 j2cnt
+= tcnttab
[kmax
- 1] + cnt
; /* update low zeros count */
256 cy
= mpn_mul_1 (kp
, kp
, kn
, jjj
); /* accumulate new factors */
260 kmax
= MIN (kmax
, t
);
266 nmaxnow
= MIN (nmax
, numfac
);
267 iii
= mulfunc
[nmaxnow
- 1] (i
);
268 i
+= nmaxnow
; /* number of factors used */
269 count_trailing_zeros (cnt
, iii
); /* count low zeros */
270 iii
>>= cnt
; /* remove remaining low zeros */
271 i2cnt
+= tcnttab
[nmaxnow
- 1] + cnt
; /* update low zeros count */
272 cy
= mpn_mul_1 (np
, np
, nn
, iii
); /* accumulate new factors */
280 binvert_limb (dinv
, kp
[0]);
281 nn
+= (np
[nn
- 1] >= kp
[kn
- 1]);
283 mpn_sbpi1_bdiv_q (np
, np
, nn
, kp
, MIN(kn
,nn
), -dinv
);
289 jjj
= mulfunc
[kmax
- 1] (j
);
290 j
+= kmax
; /* number of factors used */
291 count_trailing_zeros (cnt
, jjj
); /* count low zeros */
292 jjj
>>= cnt
; /* remove remaining low zeros */
293 j2cnt
+= tcnttab
[kmax
- 1] + cnt
; /* update low zeros count */
296 /* Put back the right number of factors of 2. */
300 ASSERT (cnt
< GMP_NUMB_BITS
); /* can happen, but not for intended use */
301 cy
= mpn_lshift (np
, np
, nn
, cnt
);
306 nn
-= np
[nn
- 1] == 0; /* normalisation */
308 kp
= MPZ_NEWALLOC (r
, nn
);
310 MPN_COPY (kp
, np
, nn
);
315 mpz_smallk_bin_uiui (mpz_ptr r
, unsigned long int n
, unsigned long int k
)
320 mp_limb_t i
, iii
, cy
;
321 mp_bitcnt_t i2cnt
, cnt
;
323 count_leading_zeros (cnt
, (mp_limb_t
) n
);
324 cnt
= GMP_LIMB_BITS
- cnt
;
325 alloc
= cnt
* k
/ GMP_NUMB_BITS
+ 3; /* FIXME: ensure rounding is enough. */
326 rp
= MPZ_NEWALLOC (r
, alloc
);
329 nmax
= MIN (nmax
, M
);
333 nmax
= MIN (nmax
, k
);
334 rp
[0] = mulfunc
[nmax
- 1] (i
);
336 i
+= nmax
; /* number of factors used */
337 i2cnt
= tcnttab
[nmax
- 1]; /* low zeros count */
341 nmax
= MIN (nmax
, numfac
);
342 iii
= mulfunc
[nmax
- 1] (i
);
343 i
+= nmax
; /* number of factors used */
344 i2cnt
+= tcnttab
[nmax
- 1]; /* update low zeros count */
345 cy
= mpn_mul_1 (rp
, rp
, rn
, iii
); /* accumulate new factors */
353 mpn_pi1_bdiv_q_1 (rp
, rp
, rn
, __gmp_oddfac_table
[k
], facinv
[k
- 2],
354 __gmp_fac2cnt_table
[k
/ 2 - 1] - i2cnt
);
355 /* A two-fold, branch-free normalisation is possible :*/
356 /* rn -= rp[rn - 1] == 0; */
357 /* rn -= rp[rn - 1] == 0; */
358 MPN_NORMALIZE_NOT_ZERO (rp
, rn
);
365 Plain and simply multiply things together.
367 We tabulate factorials (k!/2^t)^(-1) mod B (where t is chosen such
373 bc_bin_uiui (unsigned int n
, unsigned int k
)
375 return ((__gmp_oddfac_table
[n
] * facinv
[k
- 2] * facinv
[n
- k
- 2])
376 << (__gmp_fac2cnt_table
[n
/ 2 - 1] - __gmp_fac2cnt_table
[k
/ 2 - 1] - __gmp_fac2cnt_table
[(n
-k
) / 2 - 1]))
382 Recursively exploit the relation
383 bin(n,k) = bin(n,k>>1)*bin(n-k>>1,k-k>>1)/bin(k,k>>1) .
385 Values for binomial(k,k>>1) that fit in a limb are precomputed
389 /* bin2kk[i - ODD_CENTRAL_BINOMIAL_OFFSET] =
390 binomial(i*2,i)/2^t (where t is chosen so that it is odd). */
391 static const mp_limb_t bin2kk
[] = { ONE_LIMB_ODD_CENTRAL_BINOMIAL_TABLE
};
393 /* bin2kkinv[i] = bin2kk[i]^-1 mod B */
394 static const mp_limb_t bin2kkinv
[] = { ONE_LIMB_ODD_CENTRAL_BINOMIAL_INVERSE_TABLE
};
396 /* bin2kk[i] = binomial((i+MIN_S)*2,i+MIN_S)/2^t. This table contains the t values. */
397 static const unsigned char fac2bin
[] = { CENTRAL_BINOMIAL_2FAC_TABLE
};
400 mpz_smallkdc_bin_uiui (mpz_ptr r
, unsigned long int n
, unsigned long int k
)
404 unsigned long int hk
;
408 if ((! BIN_UIUI_RECURSIVE_SMALLDC
) || hk
<= ODD_FACTORIAL_TABLE_LIMIT
)
409 mpz_smallk_bin_uiui (r
, n
, hk
);
411 mpz_smallkdc_bin_uiui (r
, n
, hk
);
414 if (n
<= ODD_FACTORIAL_EXTTABLE_LIMIT
) {
417 rp
= MPZ_REALLOC (r
, rn
+ 1);
418 cy
= mpn_mul_1 (rp
, rp
, rn
, bc_bin_uiui (n
, k
));
422 mp_limb_t buffer
[ODD_CENTRAL_BINOMIAL_TABLE_LIMIT
+ 3];
425 ALLOC (t
) = ODD_CENTRAL_BINOMIAL_TABLE_LIMIT
+ 3;
427 if ((! BIN_UIUI_RECURSIVE_SMALLDC
) || k
<= ODD_FACTORIAL_TABLE_LIMIT
)
428 mpz_smallk_bin_uiui (t
, n
, k
);
430 mpz_smallkdc_bin_uiui (t
, n
, k
);
436 mpn_pi1_bdiv_q_1 (rp
, rp
, rn
, bin2kk
[k
- ODD_CENTRAL_BINOMIAL_OFFSET
],
437 bin2kkinv
[k
- ODD_CENTRAL_BINOMIAL_OFFSET
],
438 fac2bin
[k
- ODD_CENTRAL_BINOMIAL_OFFSET
] - (k
!= hk
));
439 /* A two-fold, branch-free normalisation is possible :*/
440 /* rn -= rp[rn - 1] == 0; */
441 /* rn -= rp[rn - 1] == 0; */
442 MPN_NORMALIZE_NOT_ZERO (rp
, rn
);
447 /* mpz_goetgheluck_bin_uiui(RESULT, N, K) -- Set RESULT to binomial(N,K).
449 * Contributed to the GNU project by Marco Bodrato.
451 * Implementation of the algorithm by P. Goetgheluck, "Computing
452 * Binomial Coefficients", The American Mathematical Monthly, Vol. 94,
453 * No. 4 (April 1987), pp. 360-365.
455 * Acknowledgment: Peter Luschny did spot the slowness of the previous
456 * code and suggested the reference.
459 /* TODO: Remove duplicated constants / macros / static functions...
462 /*************************************************************/
463 /* Section macros: common macros, for swing/fac/bin (&sieve) */
464 /*************************************************************/
466 #define FACTOR_LIST_APPEND(PR, MAX_PR, VEC, I) \
467 if ((PR) > (MAX_PR)) { \
468 (VEC)[(I)++] = (PR); \
472 #define FACTOR_LIST_STORE(P, PR, MAX_PR, VEC, I) \
474 if ((PR) > (MAX_PR)) { \
475 (VEC)[(I)++] = (PR); \
481 #define LOOP_ON_SIEVE_CONTINUE(prime,end,sieve) \
486 if (((sieve)[__index] & __mask) == 0) \
488 (prime) = id_to_n(__i)
490 #define LOOP_ON_SIEVE_BEGIN(prime,start,end,off,sieve) \
492 mp_limb_t __mask, __index, __max_i, __i; \
494 __i = (start)-(off); \
495 __index = __i / GMP_LIMB_BITS; \
496 __mask = CNST_LIMB(1) << (__i % GMP_LIMB_BITS); \
499 LOOP_ON_SIEVE_CONTINUE(prime,end,sieve)
501 #define LOOP_ON_SIEVE_STOP \
503 __mask = __mask << 1 | __mask >> (GMP_LIMB_BITS-1); \
504 __index += __mask & 1; \
505 } while (__i <= __max_i) \
507 #define LOOP_ON_SIEVE_END \
508 LOOP_ON_SIEVE_STOP; \
511 /*********************************************************/
512 /* Section sieve: sieving functions and tools for primes */
513 /*********************************************************/
517 bit_to_n (mp_limb_t bit
) { return (bit
*3+4)|1; }
520 /* id_to_n (x) = bit_to_n (x-1) = (id*3+1)|1*/
522 id_to_n (mp_limb_t id
) { return id
*3+1+(id
&1); }
524 /* n_to_bit (n) = ((n-1)&(-CNST_LIMB(2)))/3U-1 */
526 n_to_bit (mp_limb_t n
) { return ((n
-5)|1)/3U; }
529 primesieve_size (mp_limb_t n
) { return n_to_bit(n
) / GMP_LIMB_BITS
+ 1; }
531 /*********************************************************/
532 /* Section binomial: fast binomial implementation */
533 /*********************************************************/
535 #define COUNT_A_PRIME(P, N, K, PR, MAX_PR, VEC, I) \
537 mp_limb_t __a, __b, __prime, __ma,__mb; \
539 __a = (N); __b = (K); __mb = 0; \
540 FACTOR_LIST_APPEND(PR, MAX_PR, VEC, I); \
542 __mb += __b % __prime; __b /= __prime; \
543 __ma = __a % __prime; __a /= __prime; \
545 __mb = 1; (PR) *= __prime; \
547 } while (__a >= __prime); \
550 #define SH_COUNT_A_PRIME(P, N, K, PR, MAX_PR, VEC, I) \
554 if (((N) % __prime) < ((K) % __prime)) { \
555 FACTOR_LIST_STORE (__prime, PR, MAX_PR, VEC, I); \
559 /* Returns an approximation of the sqare root of x. *
560 * It gives: x <= limb_apprsqrt (x) ^ 2 < x * 9/4 */
562 limb_apprsqrt (mp_limb_t x
)
567 count_leading_zeros (s
, x
- 1);
568 s
= GMP_LIMB_BITS
- 1 - s
;
569 return (CNST_LIMB(1) << (s
>> 1)) + (CNST_LIMB(1) << ((s
- 1) >> 1));
573 mpz_goetgheluck_bin_uiui (mpz_ptr r
, unsigned long int n
, unsigned long int k
)
575 mp_limb_t
*sieve
, *factors
, count
;
576 mp_limb_t prod
, max_prod
, j
;
579 ASSERT (BIN_GOETGHELUCK_THRESHOLD
>= 13);
583 sieve
= TMP_ALLOC_LIMBS (primesieve_size (n
));
585 count
= gmp_primesieve (sieve
, n
) + 1;
586 factors
= TMP_ALLOC_LIMBS (count
/ log_n_max (n
) + 1);
588 max_prod
= GMP_NUMB_MAX
/ n
;
590 /* Handle primes = 2, 3 separately. */
591 popc_limb (count
, n
- k
);
596 prod
= CNST_LIMB(1) << count
;
599 COUNT_A_PRIME (3, n
, k
, prod
, max_prod
, factors
, j
);
601 /* Accumulate prime factors from 5 to n/2 */
607 s
= limb_apprsqrt(n
);
609 LOOP_ON_SIEVE_BEGIN (prime
, n_to_bit (5), s
, 0,sieve
);
610 COUNT_A_PRIME (prime
, n
, k
, prod
, max_prod
, factors
, j
);
615 ASSERT (max_prod
<= GMP_NUMB_MAX
/ 2);
617 ASSERT (bit_to_n (s
) * bit_to_n (s
) > n
);
618 ASSERT (s
<= n_to_bit (n
>> 1));
622 LOOP_ON_SIEVE_BEGIN (prime
, s
, n_to_bit (n
>> 1), 0,sieve
);
623 SH_COUNT_A_PRIME (prime
, n
, k
, prod
, max_prod
, factors
, j
);
629 /* Store primes from (n-k)+1 to n */
630 ASSERT (n_to_bit (n
- k
) < n_to_bit (n
));
633 LOOP_ON_SIEVE_BEGIN (prime
, n_to_bit (n
- k
) + 1, n_to_bit (n
), 0,sieve
);
634 FACTOR_LIST_STORE (prime
, prod
, max_prod
, factors
, j
);
641 mpz_prodlimbs (r
, factors
, j
);
652 #undef SH_COUNT_A_PRIME
653 #undef LOOP_ON_SIEVE_END
654 #undef LOOP_ON_SIEVE_STOP
655 #undef LOOP_ON_SIEVE_BEGIN
656 #undef LOOP_ON_SIEVE_CONTINUE
658 /*********************************************************/
659 /* End of implementation of Goetgheluck's algorithm */
660 /*********************************************************/
663 mpz_bin_uiui (mpz_ptr r
, unsigned long int n
, unsigned long int k
)
665 if (UNLIKELY (n
< k
)) {
667 #if BITS_PER_ULONG > GMP_NUMB_BITS
668 } else if (UNLIKELY (n
> GMP_NUMB_MAX
)) {
671 mpz_init_set_ui (tmp
, n
);
672 mpz_bin_ui (r
, tmp
, k
);
676 ASSERT (n
<= GMP_NUMB_MAX
);
677 /* Rewrite bin(n,k) as bin(n,n-k) if that is smaller. */
680 PTR(r
)[0] = k
? n
: 1; /* 1 + ((-k) & (n-1)); */
682 } else if (n
<= ODD_FACTORIAL_EXTTABLE_LIMIT
) { /* k >= 2, n >= 4 */
683 PTR(r
)[0] = bc_bin_uiui (n
, k
);
685 } else if (k
<= ODD_FACTORIAL_TABLE_LIMIT
)
686 mpz_smallk_bin_uiui (r
, n
, k
);
687 else if (BIN_UIUI_ENABLE_SMALLDC
&&
688 k
<= (BIN_UIUI_RECURSIVE_SMALLDC
? ODD_CENTRAL_BINOMIAL_TABLE_LIMIT
: ODD_FACTORIAL_TABLE_LIMIT
)* 2)
689 mpz_smallkdc_bin_uiui (r
, n
, k
);
690 else if (ABOVE_THRESHOLD (k
, BIN_GOETGHELUCK_THRESHOLD
) &&
691 k
> (n
>> 4)) /* k > ODD_FACTORIAL_TABLE_LIMIT */
692 mpz_goetgheluck_bin_uiui (r
, n
, k
);
694 mpz_bdiv_bin_uiui (r
, n
, k
);