source/libs/gmp/gmp-src/mpz/bin_uiui.c

   1 /* mpz_bin_uiui - compute n over k.
   2
   3 Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
   4
   5 Copyright 2010-2012 Free Software Foundation, Inc.
   6
   7 This file is part of the GNU MP Library.
   8
   9 The GNU MP Library is free software; you can redistribute it and/or modify
  10 it under the terms of either:
  11
  12   * the GNU Lesser General Public License as published by the Free
  13     Software Foundation; either version 3 of the License, or (at your
  14     option) any later version.
  15
  16 or
  17
  18   * the GNU General Public License as published by the Free Software
  19     Foundation; either version 2 of the License, or (at your option) any
  20     later version.
  21
  22 or both in parallel, as here.
  23
  24 The GNU MP Library is distributed in the hope that it will be useful, but
  25 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  26 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  27 for more details.
  28
  29 You should have received copies of the GNU General Public License and the
  30 GNU Lesser General Public License along with the GNU MP Library.  If not,
  31 see https://www.gnu.org/licenses/.  */
  32
  33 #include "gmp.h"
  34 #include "gmp-impl.h"
  35 #include "longlong.h"
  36
  37 #ifndef BIN_GOETGHELUCK_THRESHOLD
  38 #define BIN_GOETGHELUCK_THRESHOLD  1000
  39 #endif
  40 #ifndef BIN_UIUI_ENABLE_SMALLDC
  41 #define BIN_UIUI_ENABLE_SMALLDC    1
  42 #endif
  43 #ifndef BIN_UIUI_RECURSIVE_SMALLDC
  44 #define BIN_UIUI_RECURSIVE_SMALLDC (GMP_NUMB_BITS > 32)
  45 #endif
  46
  47 /* Algorithm:
  48
  49    Accumulate chunks of factors first limb-by-limb (using one of mul0-mul8)
  50    which are then accumulated into mpn numbers.  The first inner loop
  51    accumulates divisor factors, the 2nd inner loop accumulates exactly the same
  52    number of dividend factors.  We avoid accumulating more for the divisor,
  53    even with its smaller factors, since otherwise we cannot guarantee divisibility.
  54
  55    Since we know each division will yield an integer, we compute the quotient
  56    using Hensel norm: If the quotient is limited by 2^t, we compute A / B mod
  57    2^t.
  58
  59    Improvements:
  60
  61    (1) An obvious improvement to this code would be to compute mod 2^t
  62    everywhere.  Unfortunately, we cannot determine t beforehand, unless we
  63    invoke some approximation, such as Stirling's formula.  Of course, we don't
  64    need t to be tight.  However, it is not clear that this would help much,
  65    our numbers are kept reasonably small already.
  66
  67    (2) Compute nmax/kmax semi-accurately, without scalar division or a loop.
  68    Extracting the 3 msb, then doing a table lookup using cnt*8+msb as index,
  69    would make it both reasonably accurate and fast.  (We could use a table
  70    stored into a limb, perhaps.)  The table should take the removed factors of
  71    2 into account (those done on-the-fly in mulN).
  72
  73    (3) The first time in the loop we compute the odd part of a
  74    factorial in kp, we might use oddfac_1 for this task.
  75  */
  76
  77 /* This threshold determines how large divisor to accumulate before we call
  78    bdiv.  Perhaps we should never call bdiv, and accumulate all we are told,
  79    since we are just basecase code anyway?  Presumably, this depends on the
  80    relative speed of the asymptotically fast code and this code.  */
  81 #define SOME_THRESHOLD 20
  82
  83 /* Multiply-into-limb functions.  These remove factors of 2 on-the-fly.  FIXME:
  84    All versions of MAXFACS don't take this 2 removal into account now, meaning
  85    that then, shifting just adds some overhead.  (We remove factors from the
  86    completed limb anyway.)  */
  87
  88 static mp_limb_t
  89 mul1 (mp_limb_t m)
  90 {
  91   return m;
  92 }
  93
  94 static mp_limb_t
  95 mul2 (mp_limb_t m)
  96 {
  97   /* We need to shift before multiplying, to avoid an overflow. */
  98   mp_limb_t m01 = (m | 1) * ((m + 1) >> 1);
  99   return m01;
 100 }
 101
 102 static mp_limb_t
 103 mul3 (mp_limb_t m)
 104 {
 105   mp_limb_t m01 = (m + 0) * (m + 1) >> 1;
 106   mp_limb_t m2 = (m + 2);
 107   return m01 * m2;
 108 }
 109
 110 static mp_limb_t
 111 mul4 (mp_limb_t m)
 112 {
 113   mp_limb_t m01 = (m + 0) * (m + 1) >> 1;
 114   mp_limb_t m23 = (m + 2) * (m + 3) >> 1;
 115   return m01 * m23;
 116 }
 117
 118 static mp_limb_t
 119 mul5 (mp_limb_t m)
 120 {
 121   mp_limb_t m012 = (m + 0) * (m + 1) * (m + 2) >> 1;
 122   mp_limb_t m34 = (m + 3) * (m + 4) >> 1;
 123   return m012 * m34;
 124 }
 125
 126 static mp_limb_t
 127 mul6 (mp_limb_t m)
 128 {
 129   mp_limb_t m01 = (m + 0) * (m + 1);
 130   mp_limb_t m23 = (m + 2) * (m + 3);
 131   mp_limb_t m45 = (m + 4) * (m + 5) >> 1;
 132   mp_limb_t m0123 = m01 * m23 >> 3;
 133   return m0123 * m45;
 134 }
 135
 136 static mp_limb_t
 137 mul7 (mp_limb_t m)
 138 {
 139   mp_limb_t m01 = (m + 0) * (m + 1);
 140   mp_limb_t m23 = (m + 2) * (m + 3);
 141   mp_limb_t m456 = (m + 4) * (m + 5) * (m + 6) >> 1;
 142   mp_limb_t m0123 = m01 * m23 >> 3;
 143   return m0123 * m456;
 144 }
 145
 146 static mp_limb_t
 147 mul8 (mp_limb_t m)
 148 {
 149   mp_limb_t m01 = (m + 0) * (m + 1);
 150   mp_limb_t m23 = (m + 2) * (m + 3);
 151   mp_limb_t m45 = (m + 4) * (m + 5);
 152   mp_limb_t m67 = (m + 6) * (m + 7);
 153   mp_limb_t m0123 = m01 * m23 >> 3;
 154   mp_limb_t m4567 = m45 * m67 >> 3;
 155   return m0123 * m4567;
 156 }
 157
 158 typedef mp_limb_t (* mulfunc_t) (mp_limb_t);
 159
 160 static const mulfunc_t mulfunc[] = {mul1,mul2,mul3,mul4,mul5,mul6,mul7,mul8};
 161 #define M (numberof(mulfunc))
 162
 163 /* Number of factors-of-2 removed by the corresponding mulN function.  */
 164 static const unsigned char tcnttab[] = {0, 1, 1, 2, 2, 4, 4, 6};
 165
 166 #if 1
 167 /* This variant is inaccurate but share the code with other functions.  */
 168 #define MAXFACS(max,l)                                                  \
 169   do {                                                                  \
 170     (max) = log_n_max (l);                                              \
 171   } while (0)
 172 #else
 173
 174 /* This variant is exact(?) but uses a loop.  It takes the 2 removal
 175  of mulN into account.  */
 176 static const unsigned long ftab[] =
 177 #if GMP_NUMB_BITS == 64
 178   /* 1 to 8 factors per iteration */
 179   {CNST_LIMB(0xffffffffffffffff),CNST_LIMB(0x100000000),0x32cbfe,0x16a0b,0x24c4,0xa16,0x34b,0x1b2 /*,0xdf,0x8d */};
 180 #endif
 181 #if GMP_NUMB_BITS == 32
 182   /* 1 to 7 factors per iteration */
 183   {0xffffffff,0x10000,0x801,0x16b,0x71,0x42,0x26 /* ,0x1e */};
 184 #endif
 185
 186 #define MAXFACS(max,l)                                                  \
 187   do {                                                                  \
 188     int __i;                                                            \
 189     for (__i = numberof (ftab) - 1; l > ftab[__i]; __i--)               \
 190       ;                                                                 \
 191     (max) = __i + 1;                                                    \
 192   } while (0)
 193 #endif
 194
 195 /* Entry i contains (i!/2^t)^(-1) where t is chosen such that the parenthesis
 196    is an odd integer. */
 197 static const mp_limb_t facinv[] = { ONE_LIMB_ODD_FACTORIAL_INVERSES_TABLE };
 198
 199 static void
 200 mpz_bdiv_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
 201 {
 202   int nmax, kmax, nmaxnow, numfac;
 203   mp_ptr np, kp;
 204   mp_size_t nn, kn, alloc;
 205   mp_limb_t i, j, t, iii, jjj, cy, dinv;
 206   mp_bitcnt_t i2cnt, j2cnt;
 207   int cnt;
 208   mp_size_t maxn;
 209   TMP_DECL;
 210
 211   ASSERT (k > ODD_FACTORIAL_TABLE_LIMIT);
 212   TMP_MARK;
 213
 214   maxn = 1 + n / GMP_NUMB_BITS;    /* absolutely largest result size (limbs) */
 215
 216   /* FIXME: This allocation might be insufficient, but is usually way too
 217      large.  */
 218   alloc = SOME_THRESHOLD - 1 + MAX (3 * maxn / 2, SOME_THRESHOLD);
 219   alloc = MIN (alloc, k) + 1;
 220   np = TMP_ALLOC_LIMBS (alloc);
 221   kp = TMP_ALLOC_LIMBS (SOME_THRESHOLD + 1);
 222
 223   MAXFACS (nmax, n);
 224   ASSERT (nmax <= M);
 225   MAXFACS (kmax, k);
 226   ASSERT (kmax <= M);
 227   ASSERT (k >= M);
 228
 229   i = n - k + 1;
 230
 231   np[0] = 1; nn = 1;
 232
 233   i2cnt = 0;                            /* total low zeros in dividend */
 234   j2cnt = __gmp_fac2cnt_table[ODD_FACTORIAL_TABLE_LIMIT / 2 - 1];
 235                                         /* total low zeros in divisor */
 236
 237   numfac = 1;
 238   j = ODD_FACTORIAL_TABLE_LIMIT + 1;
 239   jjj = ODD_FACTORIAL_TABLE_MAX;
 240   ASSERT (__gmp_oddfac_table[ODD_FACTORIAL_TABLE_LIMIT] == ODD_FACTORIAL_TABLE_MAX);
 241
 242   while (1)
 243     {
 244       kp[0] = jjj;                              /* store new factors */
 245       kn = 1;
 246       t = k - j + 1;
 247       kmax = MIN (kmax, t);
 248
 249       while (kmax != 0 && kn < SOME_THRESHOLD)
 250         {
 251           jjj = mulfunc[kmax - 1] (j);
 252           j += kmax;                            /* number of factors used */
 253           count_trailing_zeros (cnt, jjj);      /* count low zeros */
 254           jjj >>= cnt;                          /* remove remaining low zeros */
 255           j2cnt += tcnttab[kmax - 1] + cnt;     /* update low zeros count */
 256           cy = mpn_mul_1 (kp, kp, kn, jjj);     /* accumulate new factors */
 257           kp[kn] = cy;
 258           kn += cy != 0;
 259           t = k - j + 1;
 260           kmax = MIN (kmax, t);
 261         }
 262       numfac = j - numfac;
 263
 264       while (numfac != 0)
 265         {
 266           nmaxnow = MIN (nmax, numfac);
 267           iii = mulfunc[nmaxnow - 1] (i);
 268           i += nmaxnow;                         /* number of factors used */
 269           count_trailing_zeros (cnt, iii);      /* count low zeros */
 270           iii >>= cnt;                          /* remove remaining low zeros */
 271           i2cnt += tcnttab[nmaxnow - 1] + cnt;  /* update low zeros count */
 272           cy = mpn_mul_1 (np, np, nn, iii);     /* accumulate new factors */
 273           np[nn] = cy;
 274           nn += cy != 0;
 275           numfac -= nmaxnow;
 276         }
 277
 278       ASSERT (nn < alloc);
 279
 280       binvert_limb (dinv, kp[0]);
 281       nn += (np[nn - 1] >= kp[kn - 1]);
 282       nn -= kn;
 283       mpn_sbpi1_bdiv_q (np, np, nn, kp, MIN(kn,nn), -dinv);
 284
 285       if (kmax == 0)
 286         break;
 287       numfac = j;
 288
 289       jjj = mulfunc[kmax - 1] (j);
 290       j += kmax;                                /* number of factors used */
 291       count_trailing_zeros (cnt, jjj);          /* count low zeros */
 292       jjj >>= cnt;                              /* remove remaining low zeros */
 293       j2cnt += tcnttab[kmax - 1] + cnt;         /* update low zeros count */
 294     }
 295
 296   /* Put back the right number of factors of 2.  */
 297   cnt = i2cnt - j2cnt;
 298   if (cnt != 0)
 299     {
 300       ASSERT (cnt < GMP_NUMB_BITS); /* can happen, but not for intended use */
 301       cy = mpn_lshift (np, np, nn, cnt);
 302       np[nn] = cy;
 303       nn += cy != 0;
 304     }
 305
 306   nn -= np[nn - 1] == 0;        /* normalisation */
 307
 308   kp = MPZ_NEWALLOC (r, nn);
 309   SIZ(r) = nn;
 310   MPN_COPY (kp, np, nn);
 311   TMP_FREE;
 312 }
 313
 314 static void
 315 mpz_smallk_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
 316 {
 317   int nmax, numfac;
 318   mp_ptr rp;
 319   mp_size_t rn, alloc;
 320   mp_limb_t i, iii, cy;
 321   mp_bitcnt_t i2cnt, cnt;
 322
 323   count_leading_zeros (cnt, (mp_limb_t) n);
 324   cnt = GMP_LIMB_BITS - cnt;
 325   alloc = cnt * k / GMP_NUMB_BITS + 3;  /* FIXME: ensure rounding is enough. */
 326   rp = MPZ_NEWALLOC (r, alloc);
 327
 328   MAXFACS (nmax, n);
 329   nmax = MIN (nmax, M);
 330
 331   i = n - k + 1;
 332
 333   nmax = MIN (nmax, k);
 334   rp[0] = mulfunc[nmax - 1] (i);
 335   rn = 1;
 336   i += nmax;                            /* number of factors used */
 337   i2cnt = tcnttab[nmax - 1];            /* low zeros count */
 338   numfac = k - nmax;
 339   while (numfac != 0)
 340     {
 341       nmax = MIN (nmax, numfac);
 342       iii = mulfunc[nmax - 1] (i);
 343       i += nmax;                        /* number of factors used */
 344       i2cnt += tcnttab[nmax - 1];       /* update low zeros count */
 345       cy = mpn_mul_1 (rp, rp, rn, iii); /* accumulate new factors */
 346       rp[rn] = cy;
 347       rn += cy != 0;
 348       numfac -= nmax;
 349     }
 350
 351   ASSERT (rn < alloc);
 352
 353   mpn_pi1_bdiv_q_1 (rp, rp, rn, __gmp_oddfac_table[k], facinv[k - 2],
 354                     __gmp_fac2cnt_table[k / 2 - 1] - i2cnt);
 355   /* A two-fold, branch-free normalisation is possible :*/
 356   /* rn -= rp[rn - 1] == 0; */
 357   /* rn -= rp[rn - 1] == 0; */
 358   MPN_NORMALIZE_NOT_ZERO (rp, rn);
 359
 360   SIZ(r) = rn;
 361 }
 362
 363 /* Algorithm:
 364
 365    Plain and simply multiply things together.
 366
 367    We tabulate factorials (k!/2^t)^(-1) mod B (where t is chosen such
 368    that k!/2^t is odd).
 369
 370 */
 371
 372 static mp_limb_t
 373 bc_bin_uiui (unsigned int n, unsigned int k)
 374 {
 375   return ((__gmp_oddfac_table[n] * facinv[k - 2] * facinv[n - k - 2])
 376     << (__gmp_fac2cnt_table[n / 2 - 1] - __gmp_fac2cnt_table[k / 2 - 1] - __gmp_fac2cnt_table[(n-k) / 2 - 1]))
 377     & GMP_NUMB_MASK;
 378 }
 379
 380 /* Algorithm:
 381
 382    Recursively exploit the relation
 383    bin(n,k) = bin(n,k>>1)*bin(n-(k>>1),k-(k>>1))/bin(k,k>>1) .
 384
 385    Values for binomial(k,k>>1) that fit in a limb are precomputed
 386    (with inverses).
 387 */
 388
 389 /* bin2kk[i - ODD_CENTRAL_BINOMIAL_OFFSET] =
 390    binomial(i*2,i)/2^t (where t is chosen so that it is odd). */
 391 static const mp_limb_t bin2kk[] = { ONE_LIMB_ODD_CENTRAL_BINOMIAL_TABLE };
 392
 393 /* bin2kkinv[i] = bin2kk[i]^-1 mod B */
 394 static const mp_limb_t bin2kkinv[] = { ONE_LIMB_ODD_CENTRAL_BINOMIAL_INVERSE_TABLE };
 395
 396 /* bin2kk[i] = binomial((i+MIN_S)*2,i+MIN_S)/2^t. This table contains the t values. */
 397 static const unsigned char fac2bin[] = { CENTRAL_BINOMIAL_2FAC_TABLE };
 398
 399 static void
 400 mpz_smallkdc_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
 401 {
 402   mp_ptr rp;
 403   mp_size_t rn;
 404   unsigned long int hk;
 405
 406   hk = k >> 1;
 407
 408   if ((! BIN_UIUI_RECURSIVE_SMALLDC) || hk <= ODD_FACTORIAL_TABLE_LIMIT)
 409     mpz_smallk_bin_uiui (r, n, hk);
 410   else
 411     mpz_smallkdc_bin_uiui (r, n, hk);
 412   k -= hk;
 413   n -= hk;
 414   if (n <= ODD_FACTORIAL_EXTTABLE_LIMIT) {
 415     mp_limb_t cy;
 416     rn = SIZ (r);
 417     rp = MPZ_REALLOC (r, rn + 1);
 418     cy = mpn_mul_1 (rp, rp, rn, bc_bin_uiui (n, k));
 419     rp [rn] = cy;
 420     rn += cy != 0;
 421   } else {
 422     mp_limb_t buffer[ODD_CENTRAL_BINOMIAL_TABLE_LIMIT + 3];
 423     mpz_t t;
 424
 425     ALLOC (t) = ODD_CENTRAL_BINOMIAL_TABLE_LIMIT + 3;
 426     PTR (t) = buffer;
 427     if ((! BIN_UIUI_RECURSIVE_SMALLDC) || k <= ODD_FACTORIAL_TABLE_LIMIT)
 428       mpz_smallk_bin_uiui (t, n, k);
 429     else
 430       mpz_smallkdc_bin_uiui (t, n, k);
 431     mpz_mul (r, r, t);
 432     rp = PTR (r);
 433     rn = SIZ (r);
 434   }
 435
 436   mpn_pi1_bdiv_q_1 (rp, rp, rn, bin2kk[k - ODD_CENTRAL_BINOMIAL_OFFSET],
 437                     bin2kkinv[k - ODD_CENTRAL_BINOMIAL_OFFSET],
 438                     fac2bin[k - ODD_CENTRAL_BINOMIAL_OFFSET] - (k != hk));
 439   /* A two-fold, branch-free normalisation is possible :*/
 440   /* rn -= rp[rn - 1] == 0; */
 441   /* rn -= rp[rn - 1] == 0; */
 442   MPN_NORMALIZE_NOT_ZERO (rp, rn);
 443
 444   SIZ(r) = rn;
 445 }
 446
 447 /* mpz_goetgheluck_bin_uiui(RESULT, N, K) -- Set RESULT to binomial(N,K).
 448  *
 449  * Contributed to the GNU project by Marco Bodrato.
 450  *
 451  * Implementation of the algorithm by P. Goetgheluck, "Computing
 452  * Binomial Coefficients", The American Mathematical Monthly, Vol. 94,
 453  * No. 4 (April 1987), pp. 360-365.
 454  *
 455  * Acknowledgment: Peter Luschny did spot the slowness of the previous
 456  * code and suggested the reference.
 457  */
 458
 459 /* TODO: Remove duplicated constants / macros / static functions...
 460  */
 461
 462 /*************************************************************/
 463 /* Section macros: common macros, for swing/fac/bin (&sieve) */
 464 /*************************************************************/
 465
 466 #define FACTOR_LIST_APPEND(PR, MAX_PR, VEC, I)                  \
 467   if ((PR) > (MAX_PR)) {                                        \
 468     (VEC)[(I)++] = (PR);                                        \
 469     (PR) = 1;                                                   \
 470   }
 471
 472 #define FACTOR_LIST_STORE(P, PR, MAX_PR, VEC, I)                \
 473   do {                                                          \
 474     if ((PR) > (MAX_PR)) {                                      \
 475       (VEC)[(I)++] = (PR);                                      \
 476       (PR) = (P);                                               \
 477     } else                                                      \
 478       (PR) *= (P);                                              \
 479   } while (0)
 480
 481 #define LOOP_ON_SIEVE_CONTINUE(prime,end,sieve)                 \
 482     __max_i = (end);                                            \
 483                                                                 \
 484     do {                                                        \
 485       ++__i;                                                    \
 486       if (((sieve)[__index] & __mask) == 0)                     \
 487         {                                                       \
 488           (prime) = id_to_n(__i)
 489
 490 #define LOOP_ON_SIEVE_BEGIN(prime,start,end,off,sieve)          \
 491   do {                                                          \
 492     mp_limb_t __mask, __index, __max_i, __i;                    \
 493                                                                 \
 494     __i = (start)-(off);                                        \
 495     __index = __i / GMP_LIMB_BITS;                              \
 496     __mask = CNST_LIMB(1) << (__i % GMP_LIMB_BITS);             \
 497     __i += (off);                                               \
 498                                                                 \
 499     LOOP_ON_SIEVE_CONTINUE(prime,end,sieve)
 500
 501 #define LOOP_ON_SIEVE_STOP                                      \
 502         }                                                       \
 503       __mask = __mask << 1 | __mask >> (GMP_LIMB_BITS-1);       \
 504       __index += __mask & 1;                                    \
 505     }  while (__i <= __max_i)                                   \
 506
 507 #define LOOP_ON_SIEVE_END                                       \
 508     LOOP_ON_SIEVE_STOP;                                         \
 509   } while (0)
 510
 511 /*********************************************************/
 512 /* Section sieve: sieving functions and tools for primes */
 513 /*********************************************************/
 514
 515 #if WANT_ASSERT
 516 static mp_limb_t
 517 bit_to_n (mp_limb_t bit) { return (bit*3+4)|1; }
 518 #endif
 519
 520 /* id_to_n (x) = bit_to_n (x-1) = (id*3+1)|1*/
 521 static mp_limb_t
 522 id_to_n  (mp_limb_t id)  { return id*3+1+(id&1); }
 523
 524 /* n_to_bit (n) = ((n-1)&(-CNST_LIMB(2)))/3U-1 */
 525 static mp_limb_t
 526 n_to_bit (mp_limb_t n) { return ((n-5)|1)/3U; }
 527
 528 static mp_size_t
 529 primesieve_size (mp_limb_t n) { return n_to_bit(n) / GMP_LIMB_BITS + 1; }
 530
 531 /*********************************************************/
 532 /* Section binomial: fast binomial implementation        */
 533 /*********************************************************/
 534
 535 #define COUNT_A_PRIME(P, N, K, PR, MAX_PR, VEC, I)      \
 536   do {                                                  \
 537     mp_limb_t __a, __b, __prime, __ma,__mb;             \
 538     __prime = (P);                                      \
 539     __a = (N); __b = (K); __mb = 0;                     \
 540     FACTOR_LIST_APPEND(PR, MAX_PR, VEC, I);             \
 541     do {                                                \
 542       __mb += __b % __prime; __b /= __prime;            \
 543       __ma = __a % __prime; __a /= __prime;             \
 544       if (__ma < __mb) {                                \
 545         __mb = 1; (PR) *= __prime;                      \
 546       } else  __mb = 0;                                 \
 547     } while (__a >= __prime);                           \
 548   } while (0)
 549
 550 #define SH_COUNT_A_PRIME(P, N, K, PR, MAX_PR, VEC, I)   \
 551   do {                                                  \
 552     mp_limb_t __prime;                                  \
 553     __prime = (P);                                      \
 554     if (((N) % __prime) < ((K) % __prime)) {            \
 555       FACTOR_LIST_STORE (__prime, PR, MAX_PR, VEC, I);  \
 556     }                                                   \
 557   } while (0)
 558
 559 /* Returns an approximation of the sqare root of x.  *
 560  * It gives: x <= limb_apprsqrt (x) ^ 2 < x * 9/4    */
 561 static mp_limb_t
 562 limb_apprsqrt (mp_limb_t x)
 563 {
 564   int s;
 565
 566   ASSERT (x > 2);
 567   count_leading_zeros (s, x - 1);
 568   s = GMP_LIMB_BITS - 1 - s;
 569   return (CNST_LIMB(1) << (s >> 1)) + (CNST_LIMB(1) << ((s - 1) >> 1));
 570 }
 571
 572 static void
 573 mpz_goetgheluck_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
 574 {
 575   mp_limb_t *sieve, *factors, count;
 576   mp_limb_t prod, max_prod, j;
 577   TMP_DECL;
 578
 579   ASSERT (BIN_GOETGHELUCK_THRESHOLD >= 13);
 580   ASSERT (n >= 25);
 581
 582   TMP_MARK;
 583   sieve = TMP_ALLOC_LIMBS (primesieve_size (n));
 584
 585   count = gmp_primesieve (sieve, n) + 1;
 586   factors = TMP_ALLOC_LIMBS (count / log_n_max (n) + 1);
 587
 588   max_prod = GMP_NUMB_MAX / n;
 589
 590   /* Handle primes = 2, 3 separately. */
 591   popc_limb (count, n - k);
 592   popc_limb (j, k);
 593   count += j;
 594   popc_limb (j, n);
 595   count -= j;
 596   prod = CNST_LIMB(1) << count;
 597
 598   j = 0;
 599   COUNT_A_PRIME (3, n, k, prod, max_prod, factors, j);
 600
 601   /* Accumulate prime factors from 5 to n/2 */
 602     {
 603       mp_limb_t s;
 604
 605       {
 606         mp_limb_t prime;
 607         s = limb_apprsqrt(n);
 608         s = n_to_bit (s);
 609         LOOP_ON_SIEVE_BEGIN (prime, n_to_bit (5), s, 0,sieve);
 610         COUNT_A_PRIME (prime, n, k, prod, max_prod, factors, j);
 611         LOOP_ON_SIEVE_END;
 612         s++;
 613       }
 614
 615       ASSERT (max_prod <= GMP_NUMB_MAX / 2);
 616       max_prod <<= 1;
 617       ASSERT (bit_to_n (s) * bit_to_n (s) > n);
 618       ASSERT (s <= n_to_bit (n >> 1));
 619       {
 620         mp_limb_t prime;
 621
 622         LOOP_ON_SIEVE_BEGIN (prime, s, n_to_bit (n >> 1), 0,sieve);
 623         SH_COUNT_A_PRIME (prime, n, k, prod, max_prod, factors, j);
 624         LOOP_ON_SIEVE_END;
 625       }
 626       max_prod >>= 1;
 627     }
 628
 629   /* Store primes from (n-k)+1 to n */
 630   ASSERT (n_to_bit (n - k) < n_to_bit (n));
 631     {
 632       mp_limb_t prime;
 633       LOOP_ON_SIEVE_BEGIN (prime, n_to_bit (n - k) + 1, n_to_bit (n), 0,sieve);
 634       FACTOR_LIST_STORE (prime, prod, max_prod, factors, j);
 635       LOOP_ON_SIEVE_END;
 636     }
 637
 638   if (LIKELY (j != 0))
 639     {
 640       factors[j++] = prod;
 641       mpz_prodlimbs (r, factors, j);
 642     }
 643   else
 644     {
 645       PTR (r)[0] = prod;
 646       SIZ (r) = 1;
 647     }
 648   TMP_FREE;
 649 }
 650
 651 #undef COUNT_A_PRIME
 652 #undef SH_COUNT_A_PRIME
 653 #undef LOOP_ON_SIEVE_END
 654 #undef LOOP_ON_SIEVE_STOP
 655 #undef LOOP_ON_SIEVE_BEGIN
 656 #undef LOOP_ON_SIEVE_CONTINUE
 657
 658 /*********************************************************/
 659 /* End of implementation of Goetgheluck's algorithm      */
 660 /*********************************************************/
 661
 662 void
 663 mpz_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
 664 {
 665   if (UNLIKELY (n < k)) {
 666     SIZ (r) = 0;
 667 #if BITS_PER_ULONG > GMP_NUMB_BITS
 668   } else if (UNLIKELY (n > GMP_NUMB_MAX)) {
 669     mpz_t tmp;
 670
 671     mpz_init_set_ui (tmp, n);
 672     mpz_bin_ui (r, tmp, k);
 673     mpz_clear (tmp);
 674 #endif
 675   } else {
 676     ASSERT (n <= GMP_NUMB_MAX);
 677     /* Rewrite bin(n,k) as bin(n,n-k) if that is smaller. */
 678     k = MIN (k, n - k);
 679     if (k < 2) {
 680       PTR(r)[0] = k ? n : 1; /* 1 + ((-k) & (n-1)); */
 681       SIZ(r) = 1;
 682     } else if (n <= ODD_FACTORIAL_EXTTABLE_LIMIT) { /* k >= 2, n >= 4 */
 683       PTR(r)[0] = bc_bin_uiui (n, k);
 684       SIZ(r) = 1;
 685     } else if (k <= ODD_FACTORIAL_TABLE_LIMIT)
 686       mpz_smallk_bin_uiui (r, n, k);
 687     else if (BIN_UIUI_ENABLE_SMALLDC &&
 688              k <= (BIN_UIUI_RECURSIVE_SMALLDC ? ODD_CENTRAL_BINOMIAL_TABLE_LIMIT : ODD_FACTORIAL_TABLE_LIMIT)* 2)
 689       mpz_smallkdc_bin_uiui (r, n, k);
 690     else if (ABOVE_THRESHOLD (k, BIN_GOETGHELUCK_THRESHOLD) &&
 691              k > (n >> 4)) /* k > ODD_FACTORIAL_TABLE_LIMIT */
 692       mpz_goetgheluck_bin_uiui (r, n, k);
 693     else
 694       mpz_bdiv_bin_uiui (r, n, k);
 695   }
 696 }