dlls/rsaenh/mpi.c

   1 /*
   2  * dlls/rsaenh/mpi.c
   3  * Multi Precision Integer functions
   4  *
   5  * Copyright 2004 Michael Jung
   6  * Based on public domain code by Tom St Denis (tomstdenis@iahu.ca)
   7  *
   8  * This library is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * This library is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with this library; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
  21  */
  22
  23 /*
  24  * This file contains code from the LibTomCrypt cryptographic
  25  * library written by Tom St Denis (tomstdenis@iahu.ca). LibTomCrypt
  26  * is in the public domain. The code in this file is tailored to
  27  * special requirements. Take a look at http://libtomcrypt.org for the
  28  * original version.
  29  */
  30
  31 #include <stdarg.h>
  32
  33 #include "windef.h"
  34 #include "winbase.h"
  35 #include "tomcrypt.h"
  36
  37 /* Known optimal configurations
  38  CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
  39 -------------------------------------------------------------
  40  Intel P4 Northwood     /GCC v3.4.1   /        88/       128/LTM 0.32 ;-)
  41 */
  42 static const int KARATSUBA_MUL_CUTOFF = 88,  /* Min. number of digits before Karatsuba multiplication is used. */
  43                  KARATSUBA_SQR_CUTOFF = 128; /* Min. number of digits before Karatsuba squaring is used. */
  44
  45
  46 /* trim unused digits */
  47 static void mp_clamp(mp_int *a);
  48
  49 /* compare |a| to |b| */
  50 static int mp_cmp_mag(const mp_int *a, const mp_int *b);
  51
  52 /* Counts the number of lsbs which are zero before the first zero bit */
  53 static int mp_cnt_lsb(const mp_int *a);
  54
  55 /* computes a = B**n mod b without division or multiplication useful for
  56  * normalizing numbers in a Montgomery system.
  57  */
  58 static int mp_montgomery_calc_normalization(mp_int *a, const mp_int *b);
  59
  60 /* computes x/R == x (mod N) via Montgomery Reduction */
  61 static int mp_montgomery_reduce(mp_int *a, const mp_int *m, mp_digit mp);
  62
  63 /* setups the montgomery reduction */
  64 static int mp_montgomery_setup(const mp_int *a, mp_digit *mp);
  65
  66 /* Barrett Reduction, computes a (mod b) with a precomputed value c
  67  *
  68  * Assumes that 0 < a <= b*b, note if 0 > a > -(b*b) then you can merely
  69  * compute the reduction as -1 * mp_reduce(mp_abs(a)) [pseudo code].
  70  */
  71 static int mp_reduce(mp_int *a, const mp_int *b, const mp_int *c);
  72
  73 /* reduces a modulo b where b is of the form 2**p - k [0 <= a] */
  74 static int mp_reduce_2k(mp_int *a, const mp_int *n, mp_digit d);
  75
  76 /* determines k value for 2k reduction */
  77 static int mp_reduce_2k_setup(const mp_int *a, mp_digit *d);
  78
  79 /* used to setup the Barrett reduction for a given modulus b */
  80 static int mp_reduce_setup(mp_int *a, const mp_int *b);
  81
  82 /* set to a digit */
  83 static void mp_set(mp_int *a, mp_digit b);
  84
  85 /* b = a*a  */
  86 static int mp_sqr(const mp_int *a, mp_int *b);
  87
  88 /* c = a * a (mod b) */
  89 static int mp_sqrmod(const mp_int *a, mp_int *b, mp_int *c);
  90
  91
  92 static void bn_reverse(unsigned char *s, int len);
  93 static int s_mp_add(mp_int *a, mp_int *b, mp_int *c);
  94 static int s_mp_exptmod (const mp_int * G, const mp_int * X, mp_int * P, mp_int * Y);
  95 #define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1)
  96 static int s_mp_mul_digs(const mp_int *a, const mp_int *b, mp_int *c, int digs);
  97 static int s_mp_mul_high_digs(const mp_int *a, const mp_int *b, mp_int *c, int digs);
  98 static int s_mp_sqr(const mp_int *a, mp_int *b);
  99 static int s_mp_sub(const mp_int *a, const mp_int *b, mp_int *c);
 100 static int mp_exptmod_fast(const mp_int *G, const mp_int *X, mp_int *P, mp_int *Y, int mode);
 101 static int mp_invmod_slow (const mp_int * a, mp_int * b, mp_int * c);
 102 static int mp_karatsuba_mul(const mp_int *a, const mp_int *b, mp_int *c);
 103 static int mp_karatsuba_sqr(const mp_int *a, mp_int *b);
 104
 105 /* grow as required */
 106 static int mp_grow (mp_int * a, int size)
 107 {
 108   int     i;
 109   mp_digit *tmp;
 110
 111   /* if the alloc size is smaller alloc more ram */
 112   if (a->alloc < size) {
 113     /* ensure there are always at least MP_PREC digits extra on top */
 114     size += (MP_PREC * 2) - (size % MP_PREC);
 115
 116     /* reallocate the array a->dp
 117      *
 118      * We store the return in a temporary variable
 119      * in case the operation failed we don't want
 120      * to overwrite the dp member of a.
 121      */
 122     tmp = HeapReAlloc(GetProcessHeap(), 0, a->dp, sizeof (mp_digit) * size);
 123     if (tmp == NULL) {
 124       /* reallocation failed but "a" is still valid [can be freed] */
 125       return MP_MEM;
 126     }
 127
 128     /* reallocation succeeded so set a->dp */
 129     a->dp = tmp;
 130
 131     /* zero excess digits */
 132     i        = a->alloc;
 133     a->alloc = size;
 134     for (; i < a->alloc; i++) {
 135       a->dp[i] = 0;
 136     }
 137   }
 138   return MP_OKAY;
 139 }
 140
 141 /* b = a/2 */
 142 static int mp_div_2(const mp_int * a, mp_int * b)
 143 {
 144   int     x, res, oldused;
 145
 146   /* copy */
 147   if (b->alloc < a->used) {
 148     if ((res = mp_grow (b, a->used)) != MP_OKAY) {
 149       return res;
 150     }
 151   }
 152
 153   oldused = b->used;
 154   b->used = a->used;
 155   {
 156     register mp_digit r, rr, *tmpa, *tmpb;
 157
 158     /* source alias */
 159     tmpa = a->dp + b->used - 1;
 160
 161     /* dest alias */
 162     tmpb = b->dp + b->used - 1;
 163
 164     /* carry */
 165     r = 0;
 166     for (x = b->used - 1; x >= 0; x--) {
 167       /* get the carry for the next iteration */
 168       rr = *tmpa & 1;
 169
 170       /* shift the current digit, add in carry and store */
 171       *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
 172
 173       /* forward carry to next iteration */
 174       r = rr;
 175     }
 176
 177     /* zero excess digits */
 178     tmpb = b->dp + b->used;
 179     for (x = b->used; x < oldused; x++) {
 180       *tmpb++ = 0;
 181     }
 182   }
 183   b->sign = a->sign;
 184   mp_clamp (b);
 185   return MP_OKAY;
 186 }
 187
 188 /* swap the elements of two integers, for cases where you can't simply swap the
 189  * mp_int pointers around
 190  */
 191 static void
 192 mp_exch (mp_int * a, mp_int * b)
 193 {
 194   mp_int  t;
 195
 196   t  = *a;
 197   *a = *b;
 198   *b = t;
 199 }
 200
 201 /* init a new mp_int */
 202 static int mp_init (mp_int * a)
 203 {
 204   int i;
 205
 206   /* allocate memory required and clear it */
 207   a->dp = HeapAlloc(GetProcessHeap(), 0, sizeof (mp_digit) * MP_PREC);
 208   if (a->dp == NULL) {
 209     return MP_MEM;
 210   }
 211
 212   /* set the digits to zero */
 213   for (i = 0; i < MP_PREC; i++) {
 214       a->dp[i] = 0;
 215   }
 216
 217   /* set the used to zero, allocated digits to the default precision
 218    * and sign to positive */
 219   a->used  = 0;
 220   a->alloc = MP_PREC;
 221   a->sign  = MP_ZPOS;
 222
 223   return MP_OKAY;
 224 }
 225
 226 /* init an mp_init for a given size */
 227 static int mp_init_size (mp_int * a, int size)
 228 {
 229   int x;
 230
 231   /* pad size so there are always extra digits */
 232   size += (MP_PREC * 2) - (size % MP_PREC);
 233
 234   /* alloc mem */
 235   a->dp = HeapAlloc(GetProcessHeap(), 0, sizeof (mp_digit) * size);
 236   if (a->dp == NULL) {
 237     return MP_MEM;
 238   }
 239
 240   /* set the members */
 241   a->used  = 0;
 242   a->alloc = size;
 243   a->sign  = MP_ZPOS;
 244
 245   /* zero the digits */
 246   for (x = 0; x < size; x++) {
 247       a->dp[x] = 0;
 248   }
 249
 250   return MP_OKAY;
 251 }
 252
 253 /* clear one (frees)  */
 254 static void
 255 mp_clear (mp_int * a)
 256 {
 257   int i;
 258
 259   /* only do anything if a hasn't been freed previously */
 260   if (a->dp != NULL) {
 261     /* first zero the digits */
 262     for (i = 0; i < a->used; i++) {
 263         a->dp[i] = 0;
 264     }
 265
 266     /* free ram */
 267     HeapFree(GetProcessHeap(), 0, a->dp);
 268
 269     /* reset members to make debugging easier */
 270     a->dp    = NULL;
 271     a->alloc = a->used = 0;
 272     a->sign  = MP_ZPOS;
 273   }
 274 }
 275
 276 /* set to zero */
 277 static void
 278 mp_zero (mp_int * a)
 279 {
 280   a->sign = MP_ZPOS;
 281   a->used = 0;
 282   memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
 283 }
 284
 285 /* b = |a|
 286  *
 287  * Simple function copies the input and fixes the sign to positive
 288  */
 289 static int
 290 mp_abs (const mp_int * a, mp_int * b)
 291 {
 292   int     res;
 293
 294   /* copy a to b */
 295   if (a != b) {
 296      if ((res = mp_copy (a, b)) != MP_OKAY) {
 297        return res;
 298      }
 299   }
 300
 301   /* force the sign of b to positive */
 302   b->sign = MP_ZPOS;
 303
 304   return MP_OKAY;
 305 }
 306
 307 /* computes the modular inverse via binary extended euclidean algorithm,
 308  * that is c = 1/a mod b
 309  *
 310  * Based on slow invmod except this is optimized for the case where b is
 311  * odd as per HAC Note 14.64 on pp. 610
 312  */
 313 static int
 314 fast_mp_invmod (const mp_int * a, mp_int * b, mp_int * c)
 315 {
 316   mp_int  x, y, u, v, B, D;
 317   int     res, neg;
 318
 319   /* 2. [modified] b must be odd   */
 320   if (mp_iseven (b) == 1) {
 321     return MP_VAL;
 322   }
 323
 324   /* init all our temps */
 325   if ((res = mp_init_multi(&x, &y, &u, &v, &B, &D, NULL)) != MP_OKAY) {
 326      return res;
 327   }
 328
 329   /* x == modulus, y == value to invert */
 330   if ((res = mp_copy (b, &x)) != MP_OKAY) {
 331     goto __ERR;
 332   }
 333
 334   /* we need y = |a| */
 335   if ((res = mp_abs (a, &y)) != MP_OKAY) {
 336     goto __ERR;
 337   }
 338
 339   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
 340   if ((res = mp_copy (&x, &u)) != MP_OKAY) {
 341     goto __ERR;
 342   }
 343   if ((res = mp_copy (&y, &v)) != MP_OKAY) {
 344     goto __ERR;
 345   }
 346   mp_set (&D, 1);
 347
 348 top:
 349   /* 4.  while u is even do */
 350   while (mp_iseven (&u) == 1) {
 351     /* 4.1 u = u/2 */
 352     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
 353       goto __ERR;
 354     }
 355     /* 4.2 if B is odd then */
 356     if (mp_isodd (&B) == 1) {
 357       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
 358         goto __ERR;
 359       }
 360     }
 361     /* B = B/2 */
 362     if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
 363       goto __ERR;
 364     }
 365   }
 366
 367   /* 5.  while v is even do */
 368   while (mp_iseven (&v) == 1) {
 369     /* 5.1 v = v/2 */
 370     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
 371       goto __ERR;
 372     }
 373     /* 5.2 if D is odd then */
 374     if (mp_isodd (&D) == 1) {
 375       /* D = (D-x)/2 */
 376       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
 377         goto __ERR;
 378       }
 379     }
 380     /* D = D/2 */
 381     if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
 382       goto __ERR;
 383     }
 384   }
 385
 386   /* 6.  if u >= v then */
 387   if (mp_cmp (&u, &v) != MP_LT) {
 388     /* u = u - v, B = B - D */
 389     if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
 390       goto __ERR;
 391     }
 392
 393     if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
 394       goto __ERR;
 395     }
 396   } else {
 397     /* v - v - u, D = D - B */
 398     if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
 399       goto __ERR;
 400     }
 401
 402     if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
 403       goto __ERR;
 404     }
 405   }
 406
 407   /* if not zero goto step 4 */
 408   if (mp_iszero (&u) == 0) {
 409     goto top;
 410   }
 411
 412   /* now a = C, b = D, gcd == g*v */
 413
 414   /* if v != 1 then there is no inverse */
 415   if (mp_cmp_d (&v, 1) != MP_EQ) {
 416     res = MP_VAL;
 417     goto __ERR;
 418   }
 419
 420   /* b is now the inverse */
 421   neg = a->sign;
 422   while (D.sign == MP_NEG) {
 423     if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
 424       goto __ERR;
 425     }
 426   }
 427   mp_exch (&D, c);
 428   c->sign = neg;
 429   res = MP_OKAY;
 430
 431 __ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
 432   return res;
 433 }
 434
 435 /* computes xR**-1 == x (mod N) via Montgomery Reduction
 436  *
 437  * This is an optimized implementation of montgomery_reduce
 438  * which uses the comba method to quickly calculate the columns of the
 439  * reduction.
 440  *
 441  * Based on Algorithm 14.32 on pp.601 of HAC.
 442 */
 443 static int
 444 fast_mp_montgomery_reduce (mp_int * x, const mp_int * n, mp_digit rho)
 445 {
 446   int     ix, res, olduse;
 447   mp_word W[MP_WARRAY];
 448
 449   /* get old used count */
 450   olduse = x->used;
 451
 452   /* grow a as required */
 453   if (x->alloc < n->used + 1) {
 454     if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) {
 455       return res;
 456     }
 457   }
 458
 459   /* first we have to get the digits of the input into
 460    * an array of double precision words W[...]
 461    */
 462   {
 463     register mp_word *_W;
 464     register mp_digit *tmpx;
 465
 466     /* alias for the W[] array */
 467     _W   = W;
 468
 469     /* alias for the digits of  x*/
 470     tmpx = x->dp;
 471
 472     /* copy the digits of a into W[0..a->used-1] */
 473     for (ix = 0; ix < x->used; ix++) {
 474       *_W++ = *tmpx++;
 475     }
 476
 477     /* zero the high words of W[a->used..m->used*2] */
 478     for (; ix < n->used * 2 + 1; ix++) {
 479       *_W++ = 0;
 480     }
 481   }
 482
 483   /* now we proceed to zero successive digits
 484    * from the least significant upwards
 485    */
 486   for (ix = 0; ix < n->used; ix++) {
 487     /* mu = ai * m' mod b
 488      *
 489      * We avoid a double precision multiplication (which isn't required)
 490      * by casting the value down to a mp_digit.  Note this requires
 491      * that W[ix-1] have  the carry cleared (see after the inner loop)
 492      */
 493     register mp_digit mu;
 494     mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
 495
 496     /* a = a + mu * m * b**i
 497      *
 498      * This is computed in place and on the fly.  The multiplication
 499      * by b**i is handled by offsetting which columns the results
 500      * are added to.
 501      *
 502      * Note the comba method normally doesn't handle carries in the
 503      * inner loop In this case we fix the carry from the previous
 504      * column since the Montgomery reduction requires digits of the
 505      * result (so far) [see above] to work.  This is
 506      * handled by fixing up one carry after the inner loop.  The
 507      * carry fixups are done in order so after these loops the
 508      * first m->used words of W[] have the carries fixed
 509      */
 510     {
 511       register int iy;
 512       register mp_digit *tmpn;
 513       register mp_word *_W;
 514
 515       /* alias for the digits of the modulus */
 516       tmpn = n->dp;
 517
 518       /* Alias for the columns set by an offset of ix */
 519       _W = W + ix;
 520
 521       /* inner loop */
 522       for (iy = 0; iy < n->used; iy++) {
 523           *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
 524       }
 525     }
 526
 527     /* now fix carry for next digit, W[ix+1] */
 528     W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
 529   }
 530
 531   /* now we have to propagate the carries and
 532    * shift the words downward [all those least
 533    * significant digits we zeroed].
 534    */
 535   {
 536     register mp_digit *tmpx;
 537     register mp_word *_W, *_W1;
 538
 539     /* nox fix rest of carries */
 540
 541     /* alias for current word */
 542     _W1 = W + ix;
 543
 544     /* alias for next word, where the carry goes */
 545     _W = W + ++ix;
 546
 547     for (; ix <= n->used * 2 + 1; ix++) {
 548       *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
 549     }
 550
 551     /* copy out, A = A/b**n
 552      *
 553      * The result is A/b**n but instead of converting from an
 554      * array of mp_word to mp_digit than calling mp_rshd
 555      * we just copy them in the right order
 556      */
 557
 558     /* alias for destination word */
 559     tmpx = x->dp;
 560
 561     /* alias for shifted double precision result */
 562     _W = W + n->used;
 563
 564     for (ix = 0; ix < n->used + 1; ix++) {
 565       *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
 566     }
 567
 568     /* zero oldused digits, if the input a was larger than
 569      * m->used+1 we'll have to clear the digits
 570      */
 571     for (; ix < olduse; ix++) {
 572       *tmpx++ = 0;
 573     }
 574   }
 575
 576   /* set the max used and clamp */
 577   x->used = n->used + 1;
 578   mp_clamp (x);
 579
 580   /* if A >= m then A = A - m */
 581   if (mp_cmp_mag (x, n) != MP_LT) {
 582     return s_mp_sub (x, n, x);
 583   }
 584   return MP_OKAY;
 585 }
 586
 587 /* Fast (comba) multiplier
 588  *
 589  * This is the fast column-array [comba] multiplier.  It is
 590  * designed to compute the columns of the product first
 591  * then handle the carries afterwards.  This has the effect
 592  * of making the nested loops that compute the columns very
 593  * simple and schedulable on super-scalar processors.
 594  *
 595  * This has been modified to produce a variable number of
 596  * digits of output so if say only a half-product is required
 597  * you don't have to compute the upper half (a feature
 598  * required for fast Barrett reduction).
 599  *
 600  * Based on Algorithm 14.12 on pp.595 of HAC.
 601  *
 602  */
 603 static int
 604 fast_s_mp_mul_digs (const mp_int * a, const mp_int * b, mp_int * c, int digs)
 605 {
 606   int     olduse, res, pa, ix, iz;
 607   mp_digit W[MP_WARRAY];
 608   register mp_word  _W;
 609
 610   /* grow the destination as required */
 611   if (c->alloc < digs) {
 612     if ((res = mp_grow (c, digs)) != MP_OKAY) {
 613       return res;
 614     }
 615   }
 616
 617   /* number of output digits to produce */
 618   pa = MIN(digs, a->used + b->used);
 619
 620   /* clear the carry */
 621   _W = 0;
 622   for (ix = 0; ix <= pa; ix++) {
 623       int      tx, ty;
 624       int      iy;
 625       mp_digit *tmpx, *tmpy;
 626
 627       /* get offsets into the two bignums */
 628       ty = MIN(b->used-1, ix);
 629       tx = ix - ty;
 630
 631       /* setup temp aliases */
 632       tmpx = a->dp + tx;
 633       tmpy = b->dp + ty;
 634
 635       /* This is the number of times the loop will iterate, essentially it's
 636          while (tx++ < a->used && ty-- >= 0) { ... }
 637        */
 638       iy = MIN(a->used-tx, ty+1);
 639
 640       /* execute loop */
 641       for (iz = 0; iz < iy; ++iz) {
 642          _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
 643       }
 644
 645       /* store term */
 646       W[ix] = ((mp_digit)_W) & MP_MASK;
 647
 648       /* make next carry */
 649       _W = _W >> ((mp_word)DIGIT_BIT);
 650   }
 651
 652   /* setup dest */
 653   olduse  = c->used;
 654   c->used = digs;
 655
 656   {
 657     register mp_digit *tmpc;
 658     tmpc = c->dp;
 659     for (ix = 0; ix < digs; ix++) {
 660       /* now extract the previous digit [below the carry] */
 661       *tmpc++ = W[ix];
 662     }
 663
 664     /* clear unused digits [that existed in the old copy of c] */
 665     for (; ix < olduse; ix++) {
 666       *tmpc++ = 0;
 667     }
 668   }
 669   mp_clamp (c);
 670   return MP_OKAY;
 671 }
 672
 673 /* this is a modified version of fast_s_mul_digs that only produces
 674  * output digits *above* digs.  See the comments for fast_s_mul_digs
 675  * to see how it works.
 676  *
 677  * This is used in the Barrett reduction since for one of the multiplications
 678  * only the higher digits were needed.  This essentially halves the work.
 679  *
 680  * Based on Algorithm 14.12 on pp.595 of HAC.
 681  */
 682 static int
 683 fast_s_mp_mul_high_digs (const mp_int * a, const mp_int * b, mp_int * c, int digs)
 684 {
 685   int     olduse, res, pa, ix, iz;
 686   mp_digit W[MP_WARRAY];
 687   mp_word  _W;
 688
 689   /* grow the destination as required */
 690   pa = a->used + b->used;
 691   if (c->alloc < pa) {
 692     if ((res = mp_grow (c, pa)) != MP_OKAY) {
 693       return res;
 694     }
 695   }
 696
 697   /* number of output digits to produce */
 698   pa = a->used + b->used;
 699   _W = 0;
 700   for (ix = digs; ix <= pa; ix++) {
 701       int      tx, ty, iy;
 702       mp_digit *tmpx, *tmpy;
 703
 704       /* get offsets into the two bignums */
 705       ty = MIN(b->used-1, ix);
 706       tx = ix - ty;
 707
 708       /* setup temp aliases */
 709       tmpx = a->dp + tx;
 710       tmpy = b->dp + ty;
 711
 712       /* This is the number of times the loop will iterate, essentially it's
 713          while (tx++ < a->used && ty-- >= 0) { ... }
 714        */
 715       iy = MIN(a->used-tx, ty+1);
 716
 717       /* execute loop */
 718       for (iz = 0; iz < iy; iz++) {
 719          _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
 720       }
 721
 722       /* store term */
 723       W[ix] = ((mp_digit)_W) & MP_MASK;
 724
 725       /* make next carry */
 726       _W = _W >> ((mp_word)DIGIT_BIT);
 727   }
 728
 729   /* setup dest */
 730   olduse  = c->used;
 731   c->used = pa;
 732
 733   {
 734     register mp_digit *tmpc;
 735
 736     tmpc = c->dp + digs;
 737     for (ix = digs; ix <= pa; ix++) {
 738       /* now extract the previous digit [below the carry] */
 739       *tmpc++ = W[ix];
 740     }
 741
 742     /* clear unused digits [that existed in the old copy of c] */
 743     for (; ix < olduse; ix++) {
 744       *tmpc++ = 0;
 745     }
 746   }
 747   mp_clamp (c);
 748   return MP_OKAY;
 749 }
 750
 751 /* fast squaring
 752  *
 753  * This is the comba method where the columns of the product
 754  * are computed first then the carries are computed.  This
 755  * has the effect of making a very simple inner loop that
 756  * is executed the most
 757  *
 758  * W2 represents the outer products and W the inner.
 759  *
 760  * A further optimizations is made because the inner
 761  * products are of the form "A * B * 2".  The *2 part does
 762  * not need to be computed until the end which is good
 763  * because 64-bit shifts are slow!
 764  *
 765  * Based on Algorithm 14.16 on pp.597 of HAC.
 766  *
 767  */
 768 /* the jist of squaring...
 769
 770 you do like mult except the offset of the tmpx [one that starts closer to zero]
 771 can't equal the offset of tmpy.  So basically you set up iy like before then you min it with
 772 (ty-tx) so that it never happens.  You double all those you add in the inner loop
 773
 774 After that loop you do the squares and add them in.
 775
 776 Remove W2 and don't memset W
 777
 778 */
 779
 780 static int fast_s_mp_sqr (const mp_int * a, mp_int * b)
 781 {
 782   int       olduse, res, pa, ix, iz;
 783   mp_digit   W[MP_WARRAY], *tmpx;
 784   mp_word   W1;
 785
 786   /* grow the destination as required */
 787   pa = a->used + a->used;
 788   if (b->alloc < pa) {
 789     if ((res = mp_grow (b, pa)) != MP_OKAY) {
 790       return res;
 791     }
 792   }
 793
 794   /* number of output digits to produce */
 795   W1 = 0;
 796   for (ix = 0; ix <= pa; ix++) {
 797       int      tx, ty, iy;
 798       mp_word  _W;
 799       mp_digit *tmpy;
 800
 801       /* clear counter */
 802       _W = 0;
 803
 804       /* get offsets into the two bignums */
 805       ty = MIN(a->used-1, ix);
 806       tx = ix - ty;
 807
 808       /* setup temp aliases */
 809       tmpx = a->dp + tx;
 810       tmpy = a->dp + ty;
 811
 812       /* This is the number of times the loop will iterate, essentially it's
 813          while (tx++ < a->used && ty-- >= 0) { ... }
 814        */
 815       iy = MIN(a->used-tx, ty+1);
 816
 817       /* now for squaring tx can never equal ty
 818        * we halve the distance since they approach at a rate of 2x
 819        * and we have to round because odd cases need to be executed
 820        */
 821       iy = MIN(iy, (ty-tx+1)>>1);
 822
 823       /* execute loop */
 824       for (iz = 0; iz < iy; iz++) {
 825          _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
 826       }
 827
 828       /* double the inner product and add carry */
 829       _W = _W + _W + W1;
 830
 831       /* even columns have the square term in them */
 832       if ((ix&1) == 0) {
 833          _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
 834       }
 835
 836       /* store it */
 837       W[ix] = _W;
 838
 839       /* make next carry */
 840       W1 = _W >> ((mp_word)DIGIT_BIT);
 841   }
 842
 843   /* setup dest */
 844   olduse  = b->used;
 845   b->used = a->used+a->used;
 846
 847   {
 848     mp_digit *tmpb;
 849     tmpb = b->dp;
 850     for (ix = 0; ix < pa; ix++) {
 851       *tmpb++ = W[ix] & MP_MASK;
 852     }
 853
 854     /* clear unused digits [that existed in the old copy of c] */
 855     for (; ix < olduse; ix++) {
 856       *tmpb++ = 0;
 857     }
 858   }
 859   mp_clamp (b);
 860   return MP_OKAY;
 861 }
 862
 863 /* computes a = 2**b
 864  *
 865  * Simple algorithm which zeroes the int, grows it then just sets one bit
 866  * as required.
 867  */
 868 static int
 869 mp_2expt (mp_int * a, int b)
 870 {
 871   int     res;
 872
 873   /* zero a as per default */
 874   mp_zero (a);
 875
 876   /* grow a to accommodate the single bit */
 877   if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) {
 878     return res;
 879   }
 880
 881   /* set the used count of where the bit will go */
 882   a->used = b / DIGIT_BIT + 1;
 883
 884   /* put the single bit in its place */
 885   a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
 886
 887   return MP_OKAY;
 888 }
 889
 890 /* high level addition (handles signs) */
 891 int mp_add (mp_int * a, mp_int * b, mp_int * c)
 892 {
 893   int     sa, sb, res;
 894
 895   /* get sign of both inputs */
 896   sa = a->sign;
 897   sb = b->sign;
 898
 899   /* handle two cases, not four */
 900   if (sa == sb) {
 901     /* both positive or both negative */
 902     /* add their magnitudes, copy the sign */
 903     c->sign = sa;
 904     res = s_mp_add (a, b, c);
 905   } else {
 906     /* one positive, the other negative */
 907     /* subtract the one with the greater magnitude from */
 908     /* the one of the lesser magnitude.  The result gets */
 909     /* the sign of the one with the greater magnitude. */
 910     if (mp_cmp_mag (a, b) == MP_LT) {
 911       c->sign = sb;
 912       res = s_mp_sub (b, a, c);
 913     } else {
 914       c->sign = sa;
 915       res = s_mp_sub (a, b, c);
 916     }
 917   }
 918   return res;
 919 }
 920
 921
 922 /* single digit addition */
 923 static int
 924 mp_add_d (mp_int * a, mp_digit b, mp_int * c)
 925 {
 926   int     res, ix, oldused;
 927   mp_digit *tmpa, *tmpc, mu;
 928
 929   /* grow c as required */
 930   if (c->alloc < a->used + 1) {
 931      if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
 932         return res;
 933      }
 934   }
 935
 936   /* if a is negative and |a| >= b, call c = |a| - b */
 937   if (a->sign == MP_NEG && (a->used > 1 || a->dp[0] >= b)) {
 938      /* temporarily fix sign of a */
 939      a->sign = MP_ZPOS;
 940
 941      /* c = |a| - b */
 942      res = mp_sub_d(a, b, c);
 943
 944      /* fix sign  */
 945      a->sign = c->sign = MP_NEG;
 946
 947      return res;
 948   }
 949
 950   /* old number of used digits in c */
 951   oldused = c->used;
 952
 953   /* sign always positive */
 954   c->sign = MP_ZPOS;
 955
 956   /* source alias */
 957   tmpa    = a->dp;
 958
 959   /* destination alias */
 960   tmpc    = c->dp;
 961
 962   /* if a is positive */
 963   if (a->sign == MP_ZPOS) {
 964      /* add digit, after this we're propagating
 965       * the carry.
 966       */
 967      *tmpc   = *tmpa++ + b;
 968      mu      = *tmpc >> DIGIT_BIT;
 969      *tmpc++ &= MP_MASK;
 970
 971      /* now handle rest of the digits */
 972      for (ix = 1; ix < a->used; ix++) {
 973         *tmpc   = *tmpa++ + mu;
 974         mu      = *tmpc >> DIGIT_BIT;
 975         *tmpc++ &= MP_MASK;
 976      }
 977      /* set final carry */
 978      ix++;
 979      *tmpc++  = mu;
 980
 981      /* setup size */
 982      c->used = a->used + 1;
 983   } else {
 984      /* a was negative and |a| < b */
 985      c->used  = 1;
 986
 987      /* the result is a single digit */
 988      if (a->used == 1) {
 989         *tmpc++  =  b - a->dp[0];
 990      } else {
 991         *tmpc++  =  b;
 992      }
 993
 994      /* setup count so the clearing of oldused
 995       * can fall through correctly
 996       */
 997      ix       = 1;
 998   }
 999
1000   /* now zero to oldused */
1001   while (ix++ < oldused) {
1002      *tmpc++ = 0;
1003   }
1004   mp_clamp(c);
1005
1006   return MP_OKAY;
1007 }
1008
1009 /* trim unused digits
1010  *
1011  * This is used to ensure that leading zero digits are
1012  * trimed and the leading "used" digit will be non-zero
1013  * Typically very fast.  Also fixes the sign if there
1014  * are no more leading digits
1015  */
1016 void
1017 mp_clamp (mp_int * a)
1018 {
1019   /* decrease used while the most significant digit is
1020    * zero.
1021    */
1022   while (a->used > 0 && a->dp[a->used - 1] == 0) {
1023     --(a->used);
1024   }
1025
1026   /* reset the sign flag if used == 0 */
1027   if (a->used == 0) {
1028     a->sign = MP_ZPOS;
1029   }
1030 }
1031
1032 void mp_clear_multi(mp_int *mp, ...)
1033 {
1034     mp_int* next_mp = mp;
1035     va_list args;
1036     va_start(args, mp);
1037     while (next_mp != NULL) {
1038         mp_clear(next_mp);
1039         next_mp = va_arg(args, mp_int*);
1040     }
1041     va_end(args);
1042 }
1043
1044 /* compare two ints (signed)*/
1045 int
1046 mp_cmp (const mp_int * a, const mp_int * b)
1047 {
1048   /* compare based on sign */
1049   if (a->sign != b->sign) {
1050      if (a->sign == MP_NEG) {
1051         return MP_LT;
1052      } else {
1053         return MP_GT;
1054      }
1055   }
1056
1057   /* compare digits */
1058   if (a->sign == MP_NEG) {
1059      /* if negative compare opposite direction */
1060      return mp_cmp_mag(b, a);
1061   } else {
1062      return mp_cmp_mag(a, b);
1063   }
1064 }
1065
1066 /* compare a digit */
1067 int mp_cmp_d(const mp_int * a, mp_digit b)
1068 {
1069   /* compare based on sign */
1070   if (a->sign == MP_NEG) {
1071     return MP_LT;
1072   }
1073
1074   /* compare based on magnitude */
1075   if (a->used > 1) {
1076     return MP_GT;
1077   }
1078
1079   /* compare the only digit of a to b */
1080   if (a->dp[0] > b) {
1081     return MP_GT;
1082   } else if (a->dp[0] < b) {
1083     return MP_LT;
1084   } else {
1085     return MP_EQ;
1086   }
1087 }
1088
1089 /* compare maginitude of two ints (unsigned) */
1090 int mp_cmp_mag (const mp_int * a, const mp_int * b)
1091 {
1092   int     n;
1093   mp_digit *tmpa, *tmpb;
1094
1095   /* compare based on # of non-zero digits */
1096   if (a->used > b->used) {
1097     return MP_GT;
1098   }
1099
1100   if (a->used < b->used) {
1101     return MP_LT;
1102   }
1103
1104   /* alias for a */
1105   tmpa = a->dp + (a->used - 1);
1106
1107   /* alias for b */
1108   tmpb = b->dp + (a->used - 1);
1109
1110   /* compare based on digits  */
1111   for (n = 0; n < a->used; ++n, --tmpa, --tmpb) {
1112     if (*tmpa > *tmpb) {
1113       return MP_GT;
1114     }
1115
1116     if (*tmpa < *tmpb) {
1117       return MP_LT;
1118     }
1119   }
1120   return MP_EQ;
1121 }
1122
1123 static const int lnz[16] = {
1124    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
1125 };
1126
1127 /* Counts the number of lsbs which are zero before the first zero bit */
1128 int mp_cnt_lsb(const mp_int *a)
1129 {
1130    int x;
1131    mp_digit q, qq;
1132
1133    /* easy out */
1134    if (mp_iszero(a) == 1) {
1135       return 0;
1136    }
1137
1138    /* scan lower digits until non-zero */
1139    for (x = 0; x < a->used && a->dp[x] == 0; x++);
1140    q = a->dp[x];
1141    x *= DIGIT_BIT;
1142
1143    /* now scan this digit until a 1 is found */
1144    if ((q & 1) == 0) {
1145       do {
1146          qq  = q & 15;
1147          x  += lnz[qq];
1148          q >>= 4;
1149       } while (qq == 0);
1150    }
1151    return x;
1152 }
1153
1154 /* copy, b = a */
1155 int
1156 mp_copy (const mp_int * a, mp_int * b)
1157 {
1158   int     res, n;
1159
1160   /* if dst == src do nothing */
1161   if (a == b) {
1162     return MP_OKAY;
1163   }
1164
1165   /* grow dest */
1166   if (b->alloc < a->used) {
1167      if ((res = mp_grow (b, a->used)) != MP_OKAY) {
1168         return res;
1169      }
1170   }
1171
1172   /* zero b and copy the parameters over */
1173   {
1174     register mp_digit *tmpa, *tmpb;
1175
1176     /* pointer aliases */
1177
1178     /* source */
1179     tmpa = a->dp;
1180
1181     /* destination */
1182     tmpb = b->dp;
1183
1184     /* copy all the digits */
1185     for (n = 0; n < a->used; n++) {
1186       *tmpb++ = *tmpa++;
1187     }
1188
1189     /* clear high digits */
1190     for (; n < b->used; n++) {
1191       *tmpb++ = 0;
1192     }
1193   }
1194
1195   /* copy used count and sign */
1196   b->used = a->used;
1197   b->sign = a->sign;
1198   return MP_OKAY;
1199 }
1200
1201 /* returns the number of bits in an int */
1202 int
1203 mp_count_bits (const mp_int * a)
1204 {
1205   int     r;
1206   mp_digit q;
1207
1208   /* shortcut */
1209   if (a->used == 0) {
1210     return 0;
1211   }
1212
1213   /* get number of digits and add that */
1214   r = (a->used - 1) * DIGIT_BIT;
1215
1216   /* take the last digit and count the bits in it */
1217   q = a->dp[a->used - 1];
1218   while (q > 0) {
1219     ++r;
1220     q >>= ((mp_digit) 1);
1221   }
1222   return r;
1223 }
1224
1225 /* calc a value mod 2**b */
1226 static int
1227 mp_mod_2d (const mp_int * a, int b, mp_int * c)
1228 {
1229   int     x, res;
1230
1231   /* if b is <= 0 then zero the int */
1232   if (b <= 0) {
1233     mp_zero (c);
1234     return MP_OKAY;
1235   }
1236
1237   /* if the modulus is larger than the value than return */
1238   if (b > a->used * DIGIT_BIT) {
1239     res = mp_copy (a, c);
1240     return res;
1241   }
1242
1243   /* copy */
1244   if ((res = mp_copy (a, c)) != MP_OKAY) {
1245     return res;
1246   }
1247
1248   /* zero digits above the last digit of the modulus */
1249   for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
1250     c->dp[x] = 0;
1251   }
1252   /* clear the digit that is not completely outside/inside the modulus */
1253   c->dp[b / DIGIT_BIT] &= (1 << ((mp_digit)b % DIGIT_BIT)) - 1;
1254   mp_clamp (c);
1255   return MP_OKAY;
1256 }
1257
1258 /* shift right a certain amount of digits */
1259 static void mp_rshd (mp_int * a, int b)
1260 {
1261   int     x;
1262
1263   /* if b <= 0 then ignore it */
1264   if (b <= 0) {
1265     return;
1266   }
1267
1268   /* if b > used then simply zero it and return */
1269   if (a->used <= b) {
1270     mp_zero (a);
1271     return;
1272   }
1273
1274   {
1275     register mp_digit *bottom, *top;
1276
1277     /* shift the digits down */
1278
1279     /* bottom */
1280     bottom = a->dp;
1281
1282     /* top [offset into digits] */
1283     top = a->dp + b;
1284
1285     /* this is implemented as a sliding window where
1286      * the window is b-digits long and digits from
1287      * the top of the window are copied to the bottom
1288      *
1289      * e.g.
1290
1291      b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
1292                  /\                   |      ---->
1293                   \-------------------/      ---->
1294      */
1295     for (x = 0; x < (a->used - b); x++) {
1296       *bottom++ = *top++;
1297     }
1298
1299     /* zero the top digits */
1300     for (; x < a->used; x++) {
1301       *bottom++ = 0;
1302     }
1303   }
1304
1305   /* remove excess digits */
1306   a->used -= b;
1307 }
1308
1309 /* shift right by a certain bit count (store quotient in c, optional remainder in d) */
1310 static int mp_div_2d (const mp_int * a, int b, mp_int * c, mp_int * d)
1311 {
1312   mp_digit D, r, rr;
1313   int     x, res;
1314   mp_int  t;
1315
1316
1317   /* if the shift count is <= 0 then we do no work */
1318   if (b <= 0) {
1319     res = mp_copy (a, c);
1320     if (d != NULL) {
1321       mp_zero (d);
1322     }
1323     return res;
1324   }
1325
1326   if ((res = mp_init (&t)) != MP_OKAY) {
1327     return res;
1328   }
1329
1330   /* get the remainder */
1331   if (d != NULL) {
1332     if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) {
1333       mp_clear (&t);
1334       return res;
1335     }
1336   }
1337
1338   /* copy */
1339   if ((res = mp_copy (a, c)) != MP_OKAY) {
1340     mp_clear (&t);
1341     return res;
1342   }
1343
1344   /* shift by as many digits in the bit count */
1345   if (b >= DIGIT_BIT) {
1346     mp_rshd (c, b / DIGIT_BIT);
1347   }
1348
1349   /* shift any bit count < DIGIT_BIT */
1350   D = (mp_digit) (b % DIGIT_BIT);
1351   if (D != 0) {
1352     register mp_digit *tmpc, mask, shift;
1353
1354     /* mask */
1355     mask = (((mp_digit)1) << D) - 1;
1356
1357     /* shift for lsb */
1358     shift = DIGIT_BIT - D;
1359
1360     /* alias */
1361     tmpc = c->dp + (c->used - 1);
1362
1363     /* carry */
1364     r = 0;
1365     for (x = c->used - 1; x >= 0; x--) {
1366       /* get the lower  bits of this word in a temp */
1367       rr = *tmpc & mask;
1368
1369       /* shift the current word and mix in the carry bits from the previous word */
1370       *tmpc = (*tmpc >> D) | (r << shift);
1371       --tmpc;
1372
1373       /* set the carry to the carry bits of the current word found above */
1374       r = rr;
1375     }
1376   }
1377   mp_clamp (c);
1378   if (d != NULL) {
1379     mp_exch (&t, d);
1380   }
1381   mp_clear (&t);
1382   return MP_OKAY;
1383 }
1384
1385 /* shift left a certain amount of digits */
1386 static int mp_lshd (mp_int * a, int b)
1387 {
1388   int     x, res;
1389
1390   /* if it's less than zero return */
1391   if (b <= 0) {
1392     return MP_OKAY;
1393   }
1394
1395   /* grow to fit the new digits */
1396   if (a->alloc < a->used + b) {
1397      if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
1398        return res;
1399      }
1400   }
1401
1402   {
1403     register mp_digit *top, *bottom;
1404
1405     /* increment the used by the shift amount then copy upwards */
1406     a->used += b;
1407
1408     /* top */
1409     top = a->dp + a->used - 1;
1410
1411     /* base */
1412     bottom = a->dp + a->used - 1 - b;
1413
1414     /* much like mp_rshd this is implemented using a sliding window
1415      * except the window goes the other way around.  Copying from
1416      * the bottom to the top.  see bn_mp_rshd.c for more info.
1417      */
1418     for (x = a->used - 1; x >= b; x--) {
1419       *top-- = *bottom--;
1420     }
1421
1422     /* zero the lower digits */
1423     top = a->dp;
1424     for (x = 0; x < b; x++) {
1425       *top++ = 0;
1426     }
1427   }
1428   return MP_OKAY;
1429 }
1430
1431 /* shift left by a certain bit count */
1432 static int mp_mul_2d (const mp_int * a, int b, mp_int * c)
1433 {
1434   mp_digit d;
1435   int      res;
1436
1437   /* copy */
1438   if (a != c) {
1439      if ((res = mp_copy (a, c)) != MP_OKAY) {
1440        return res;
1441      }
1442   }
1443
1444   if (c->alloc < c->used + b/DIGIT_BIT + 1) {
1445      if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
1446        return res;
1447      }
1448   }
1449
1450   /* shift by as many digits in the bit count */
1451   if (b >= DIGIT_BIT) {
1452     if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) {
1453       return res;
1454     }
1455   }
1456
1457   /* shift any bit count < DIGIT_BIT */
1458   d = (mp_digit) (b % DIGIT_BIT);
1459   if (d != 0) {
1460     register mp_digit *tmpc, shift, mask, r, rr;
1461     register int x;
1462
1463     /* bitmask for carries */
1464     mask = (((mp_digit)1) << d) - 1;
1465
1466     /* shift for msbs */
1467     shift = DIGIT_BIT - d;
1468
1469     /* alias */
1470     tmpc = c->dp;
1471
1472     /* carry */
1473     r    = 0;
1474     for (x = 0; x < c->used; x++) {
1475       /* get the higher bits of the current word */
1476       rr = (*tmpc >> shift) & mask;
1477
1478       /* shift the current word and OR in the carry */
1479       *tmpc = ((*tmpc << d) | r) & MP_MASK;
1480       ++tmpc;
1481
1482       /* set the carry to the carry bits of the current word */
1483       r = rr;
1484     }
1485
1486     /* set final carry */
1487     if (r != 0) {
1488        c->dp[(c->used)++] = r;
1489     }
1490   }
1491   mp_clamp (c);
1492   return MP_OKAY;
1493 }
1494
1495 /* multiply by a digit */
1496 static int
1497 mp_mul_d (const mp_int * a, mp_digit b, mp_int * c)
1498 {
1499   mp_digit u, *tmpa, *tmpc;
1500   mp_word  r;
1501   int      ix, res, olduse;
1502
1503   /* make sure c is big enough to hold a*b */
1504   if (c->alloc < a->used + 1) {
1505     if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) {
1506       return res;
1507     }
1508   }
1509
1510   /* get the original destinations used count */
1511   olduse = c->used;
1512
1513   /* set the sign */
1514   c->sign = a->sign;
1515
1516   /* alias for a->dp [source] */
1517   tmpa = a->dp;
1518
1519   /* alias for c->dp [dest] */
1520   tmpc = c->dp;
1521
1522   /* zero carry */
1523   u = 0;
1524
1525   /* compute columns */
1526   for (ix = 0; ix < a->used; ix++) {
1527     /* compute product and carry sum for this term */
1528     r       = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
1529
1530     /* mask off higher bits to get a single digit */
1531     *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
1532
1533     /* send carry into next iteration */
1534     u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
1535   }
1536
1537   /* store final carry [if any] */
1538   *tmpc++ = u;
1539
1540   /* now zero digits above the top */
1541   while (ix++ < olduse) {
1542      *tmpc++ = 0;
1543   }
1544
1545   /* set used count */
1546   c->used = a->used + 1;
1547   mp_clamp(c);
1548
1549   return MP_OKAY;
1550 }
1551
1552 /* integer signed division.
1553  * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
1554  * HAC pp.598 Algorithm 14.20
1555  *
1556  * Note that the description in HAC is horribly
1557  * incomplete.  For example, it doesn't consider
1558  * the case where digits are removed from 'x' in
1559  * the inner loop.  It also doesn't consider the
1560  * case that y has fewer than three digits, etc..
1561  *
1562  * The overall algorithm is as described as
1563  * 14.20 from HAC but fixed to treat these cases.
1564 */
1565 static int mp_div (const mp_int * a, const mp_int * b, mp_int * c, mp_int * d)
1566 {
1567   mp_int  q, x, y, t1, t2;
1568   int     res, n, t, i, norm, neg;
1569
1570   /* is divisor zero ? */
1571   if (mp_iszero (b) == 1) {
1572     return MP_VAL;
1573   }
1574
1575   /* if a < b then q=0, r = a */
1576   if (mp_cmp_mag (a, b) == MP_LT) {
1577     if (d != NULL) {
1578       res = mp_copy (a, d);
1579     } else {
1580       res = MP_OKAY;
1581     }
1582     if (c != NULL) {
1583       mp_zero (c);
1584     }
1585     return res;
1586   }
1587
1588   if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
1589     return res;
1590   }
1591   q.used = a->used + 2;
1592
1593   if ((res = mp_init (&t1)) != MP_OKAY) {
1594     goto __Q;
1595   }
1596
1597   if ((res = mp_init (&t2)) != MP_OKAY) {
1598     goto __T1;
1599   }
1600
1601   if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
1602     goto __T2;
1603   }
1604
1605   if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
1606     goto __X;
1607   }
1608
1609   /* fix the sign */
1610   neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
1611   x.sign = y.sign = MP_ZPOS;
1612
1613   /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
1614   norm = mp_count_bits(&y) % DIGIT_BIT;
1615   if (norm < DIGIT_BIT-1) {
1616      norm = (DIGIT_BIT-1) - norm;
1617      if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
1618        goto __Y;
1619      }
1620      if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
1621        goto __Y;
1622      }
1623   } else {
1624      norm = 0;
1625   }
1626
1627   /* note hac does 0 based, so if used==5 then it's 0,1,2,3,4, e.g. use 4 */
1628   n = x.used - 1;
1629   t = y.used - 1;
1630
1631   /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
1632   if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
1633     goto __Y;
1634   }
1635
1636   while (mp_cmp (&x, &y) != MP_LT) {
1637     ++(q.dp[n - t]);
1638     if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
1639       goto __Y;
1640     }
1641   }
1642
1643   /* reset y by shifting it back down */
1644   mp_rshd (&y, n - t);
1645
1646   /* step 3. for i from n down to (t + 1) */
1647   for (i = n; i >= (t + 1); i--) {
1648     if (i > x.used) {
1649       continue;
1650     }
1651
1652     /* step 3.1 if xi == yt then set q{i-t-1} to b-1,
1653      * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
1654     if (x.dp[i] == y.dp[t]) {
1655       q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
1656     } else {
1657       mp_word tmp;
1658       tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
1659       tmp |= ((mp_word) x.dp[i - 1]);
1660       tmp /= ((mp_word) y.dp[t]);
1661       if (tmp > (mp_word) MP_MASK)
1662         tmp = MP_MASK;
1663       q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
1664     }
1665
1666     /* while (q{i-t-1} * (yt * b + y{t-1})) >
1667              xi * b**2 + xi-1 * b + xi-2
1668
1669        do q{i-t-1} -= 1;
1670     */
1671     q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
1672     do {
1673       q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
1674
1675       /* find left hand */
1676       mp_zero (&t1);
1677       t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
1678       t1.dp[1] = y.dp[t];
1679       t1.used = 2;
1680       if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
1681         goto __Y;
1682       }
1683
1684       /* find right hand */
1685       t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
1686       t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
1687       t2.dp[2] = x.dp[i];
1688       t2.used = 3;
1689     } while (mp_cmp_mag(&t1, &t2) == MP_GT);
1690
1691     /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
1692     if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
1693       goto __Y;
1694     }
1695
1696     if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
1697       goto __Y;
1698     }
1699
1700     if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
1701       goto __Y;
1702     }
1703
1704     /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
1705     if (x.sign == MP_NEG) {
1706       if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
1707         goto __Y;
1708       }
1709       if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
1710         goto __Y;
1711       }
1712       if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
1713         goto __Y;
1714       }
1715
1716       q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
1717     }
1718   }
1719
1720   /* now q is the quotient and x is the remainder
1721    * [which we have to normalize]
1722    */
1723
1724   /* get sign before writing to c */
1725   x.sign = x.used == 0 ? MP_ZPOS : a->sign;
1726
1727   if (c != NULL) {
1728     mp_clamp (&q);
1729     mp_exch (&q, c);
1730     c->sign = neg;
1731   }
1732
1733   if (d != NULL) {
1734     mp_div_2d (&x, norm, &x, NULL);
1735     mp_exch (&x, d);
1736   }
1737
1738   res = MP_OKAY;
1739
1740 __Y:mp_clear (&y);
1741 __X:mp_clear (&x);
1742 __T2:mp_clear (&t2);
1743 __T1:mp_clear (&t1);
1744 __Q:mp_clear (&q);
1745   return res;
1746 }
1747
1748 static BOOL s_is_power_of_two(mp_digit b, int *p)
1749 {
1750    int x;
1751
1752    for (x = 1; x < DIGIT_BIT; x++) {
1753       if (b == (((mp_digit)1)<<x)) {
1754          *p = x;
1755          return TRUE;
1756       }
1757    }
1758    return FALSE;
1759 }
1760
1761 /* single digit division (based on routine from MPI) */
1762 static int mp_div_d (const mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
1763 {
1764   mp_int  q;
1765   mp_word w;
1766   mp_digit t;
1767   int     res, ix;
1768
1769   /* cannot divide by zero */
1770   if (b == 0) {
1771      return MP_VAL;
1772   }
1773
1774   /* quick outs */
1775   if (b == 1 || mp_iszero(a) == 1) {
1776      if (d != NULL) {
1777         *d = 0;
1778      }
1779      if (c != NULL) {
1780         return mp_copy(a, c);
1781      }
1782      return MP_OKAY;
1783   }
1784
1785   /* power of two ? */
1786   if (s_is_power_of_two(b, &ix)) {
1787      if (d != NULL) {
1788         *d = a->dp[0] & ((((mp_digit)1)<<ix) - 1);
1789      }
1790      if (c != NULL) {
1791         return mp_div_2d(a, ix, c, NULL);
1792      }
1793      return MP_OKAY;
1794   }
1795
1796   /* no easy answer [c'est la vie].  Just division */
1797   if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
1798      return res;
1799   }
1800
1801   q.used = a->used;
1802   q.sign = a->sign;
1803   w = 0;
1804   for (ix = a->used - 1; ix >= 0; ix--) {
1805      w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
1806
1807      if (w >= b) {
1808         t = (mp_digit)(w / b);
1809         w -= ((mp_word)t) * ((mp_word)b);
1810       } else {
1811         t = 0;
1812       }
1813       q.dp[ix] = t;
1814   }
1815
1816   if (d != NULL) {
1817      *d = (mp_digit)w;
1818   }
1819
1820   if (c != NULL) {
1821      mp_clamp(&q);
1822      mp_exch(&q, c);
1823   }
1824   mp_clear(&q);
1825
1826   return res;
1827 }
1828
1829 /* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
1830  *
1831  * Based on algorithm from the paper
1832  *
1833  * "Generating Efficient Primes for Discrete Log Cryptosystems"
1834  *                 Chae Hoon Lim, Pil Loong Lee,
1835  *          POSTECH Information Research Laboratories
1836  *
1837  * The modulus must be of a special format [see manual]
1838  *
1839  * Has been modified to use algorithm 7.10 from the LTM book instead
1840  *
1841  * Input x must be in the range 0 <= x <= (n-1)**2
1842  */
1843 static int
1844 mp_dr_reduce (mp_int * x, const mp_int * n, mp_digit k)
1845 {
1846   int      err, i, m;
1847   mp_word  r;
1848   mp_digit mu, *tmpx1, *tmpx2;
1849
1850   /* m = digits in modulus */
1851   m = n->used;
1852
1853   /* ensure that "x" has at least 2m digits */
1854   if (x->alloc < m + m) {
1855     if ((err = mp_grow (x, m + m)) != MP_OKAY) {
1856       return err;
1857     }
1858   }
1859
1860 /* top of loop, this is where the code resumes if
1861  * another reduction pass is required.
1862  */
1863 top:
1864   /* aliases for digits */
1865   /* alias for lower half of x */
1866   tmpx1 = x->dp;
1867
1868   /* alias for upper half of x, or x/B**m */
1869   tmpx2 = x->dp + m;
1870
1871   /* set carry to zero */
1872   mu = 0;
1873
1874   /* compute (x mod B**m) + k * [x/B**m] inline and inplace */
1875   for (i = 0; i < m; i++) {
1876       r         = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu;
1877       *tmpx1++  = (mp_digit)(r & MP_MASK);
1878       mu        = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
1879   }
1880
1881   /* set final carry */
1882   *tmpx1++ = mu;
1883
1884   /* zero words above m */
1885   for (i = m + 1; i < x->used; i++) {
1886       *tmpx1++ = 0;
1887   }
1888
1889   /* clamp, sub and return */
1890   mp_clamp (x);
1891
1892   /* if x >= n then subtract and reduce again
1893    * Each successive "recursion" makes the input smaller and smaller.
1894    */
1895   if (mp_cmp_mag (x, n) != MP_LT) {
1896     s_mp_sub(x, n, x);
1897     goto top;
1898   }
1899   return MP_OKAY;
1900 }
1901
1902 /* sets the value of "d" required for mp_dr_reduce */
1903 static void mp_dr_setup(const mp_int *a, mp_digit *d)
1904 {
1905    /* the casts are required if DIGIT_BIT is one less than
1906     * the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
1907     */
1908    *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) -
1909         ((mp_word)a->dp[0]));
1910 }
1911
1912 /* this is a shell function that calls either the normal or Montgomery
1913  * exptmod functions.  Originally the call to the montgomery code was
1914  * embedded in the normal function but that wasted a lot of stack space
1915  * for nothing (since 99% of the time the Montgomery code would be called)
1916  */
1917 int mp_exptmod (const mp_int * G, const mp_int * X, mp_int * P, mp_int * Y)
1918 {
1919   int dr;
1920
1921   /* modulus P must be positive */
1922   if (P->sign == MP_NEG) {
1923      return MP_VAL;
1924   }
1925
1926   /* if exponent X is negative we have to recurse */
1927   if (X->sign == MP_NEG) {
1928      mp_int tmpG, tmpX;
1929      int err;
1930
1931      /* first compute 1/G mod P */
1932      if ((err = mp_init(&tmpG)) != MP_OKAY) {
1933         return err;
1934      }
1935      if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) {
1936         mp_clear(&tmpG);
1937         return err;
1938      }
1939
1940      /* now get |X| */
1941      if ((err = mp_init(&tmpX)) != MP_OKAY) {
1942         mp_clear(&tmpG);
1943         return err;
1944      }
1945      if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
1946         mp_clear_multi(&tmpG, &tmpX, NULL);
1947         return err;
1948      }
1949
1950      /* and now compute (1/G)**|X| instead of G**X [X < 0] */
1951      err = mp_exptmod(&tmpG, &tmpX, P, Y);
1952      mp_clear_multi(&tmpG, &tmpX, NULL);
1953      return err;
1954   }
1955
1956   dr = 0;
1957
1958   /* if the modulus is odd use the fast method */
1959   if (mp_isodd (P) == 1) {
1960     return mp_exptmod_fast (G, X, P, Y, dr);
1961   } else {
1962     /* otherwise use the generic Barrett reduction technique */
1963     return s_mp_exptmod (G, X, P, Y);
1964   }
1965 }
1966
1967 /* computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
1968  *
1969  * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
1970  * The value of k changes based on the size of the exponent.
1971  *
1972  * Uses Montgomery or Diminished Radix reduction [whichever appropriate]
1973  */
1974
1975 int
1976 mp_exptmod_fast (const mp_int * G, const mp_int * X, mp_int * P, mp_int * Y, int redmode)
1977 {
1978   mp_int  M[256], res;
1979   mp_digit buf, mp;
1980   int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
1981
1982   /* use a pointer to the reduction algorithm.  This allows us to use
1983    * one of many reduction algorithms without modding the guts of
1984    * the code with if statements everywhere.
1985    */
1986   int     (*redux)(mp_int*,const mp_int*,mp_digit);
1987
1988   /* find window size */
1989   x = mp_count_bits (X);
1990   if (x <= 7) {
1991     winsize = 2;
1992   } else if (x <= 36) {
1993     winsize = 3;
1994   } else if (x <= 140) {
1995     winsize = 4;
1996   } else if (x <= 450) {
1997     winsize = 5;
1998   } else if (x <= 1303) {
1999     winsize = 6;
2000   } else if (x <= 3529) {
2001     winsize = 7;
2002   } else {
2003     winsize = 8;
2004   }
2005
2006   /* init M array */
2007   /* init first cell */
2008   if ((err = mp_init(&M[1])) != MP_OKAY) {
2009      return err;
2010   }
2011
2012   /* now init the second half of the array */
2013   for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
2014     if ((err = mp_init(&M[x])) != MP_OKAY) {
2015       for (y = 1<<(winsize-1); y < x; y++) {
2016         mp_clear (&M[y]);
2017       }
2018       mp_clear(&M[1]);
2019       return err;
2020     }
2021   }
2022
2023   /* determine and setup reduction code */
2024   if (redmode == 0) {
2025      /* now setup montgomery  */
2026      if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
2027         goto __M;
2028      }
2029
2030      /* automatically pick the comba one if available (saves quite a few calls/ifs) */
2031      if (((P->used * 2 + 1) < MP_WARRAY) &&
2032           P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
2033         redux = fast_mp_montgomery_reduce;
2034      } else {
2035         /* use slower baseline Montgomery method */
2036         redux = mp_montgomery_reduce;
2037      }
2038   } else if (redmode == 1) {
2039      /* setup DR reduction for moduli of the form B**k - b */
2040      mp_dr_setup(P, &mp);
2041      redux = mp_dr_reduce;
2042   } else {
2043      /* setup DR reduction for moduli of the form 2**k - b */
2044      if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
2045         goto __M;
2046      }
2047      redux = mp_reduce_2k;
2048   }
2049
2050   /* setup result */
2051   if ((err = mp_init (&res)) != MP_OKAY) {
2052     goto __M;
2053   }
2054
2055   /* create M table
2056    *
2057
2058    *
2059    * The first half of the table is not computed though accept for M[0] and M[1]
2060    */
2061
2062   if (redmode == 0) {
2063      /* now we need R mod m */
2064      if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
2065        goto __RES;
2066      }
2067
2068      /* now set M[1] to G * R mod m */
2069      if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
2070        goto __RES;
2071      }
2072   } else {
2073      mp_set(&res, 1);
2074      if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
2075         goto __RES;
2076      }
2077   }
2078
2079   /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
2080   if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
2081     goto __RES;
2082   }
2083
2084   for (x = 0; x < (winsize - 1); x++) {
2085     if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
2086       goto __RES;
2087     }
2088     if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
2089       goto __RES;
2090     }
2091   }
2092
2093   /* create upper table */
2094   for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
2095     if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
2096       goto __RES;
2097     }
2098     if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
2099       goto __RES;
2100     }
2101   }
2102
2103   /* set initial mode and bit cnt */
2104   mode   = 0;
2105   bitcnt = 1;
2106   buf    = 0;
2107   digidx = X->used - 1;
2108   bitcpy = 0;
2109   bitbuf = 0;
2110
2111   for (;;) {
2112     /* grab next digit as required */
2113     if (--bitcnt == 0) {
2114       /* if digidx == -1 we are out of digits so break */
2115       if (digidx == -1) {
2116         break;
2117       }
2118       /* read next digit and reset bitcnt */
2119       buf    = X->dp[digidx--];
2120       bitcnt = DIGIT_BIT;
2121     }
2122
2123     /* grab the next msb from the exponent */
2124     y     = (buf >> (DIGIT_BIT - 1)) & 1;
2125     buf <<= (mp_digit)1;
2126
2127     /* if the bit is zero and mode == 0 then we ignore it
2128      * These represent the leading zero bits before the first 1 bit
2129      * in the exponent.  Technically this opt is not required but it
2130      * does lower the # of trivial squaring/reductions used
2131      */
2132     if (mode == 0 && y == 0) {
2133       continue;
2134     }
2135
2136     /* if the bit is zero and mode == 1 then we square */
2137     if (mode == 1 && y == 0) {
2138       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
2139         goto __RES;
2140       }
2141       if ((err = redux (&res, P, mp)) != MP_OKAY) {
2142         goto __RES;
2143       }
2144       continue;
2145     }
2146
2147     /* else we add it to the window */
2148     bitbuf |= (y << (winsize - ++bitcpy));
2149     mode    = 2;
2150
2151     if (bitcpy == winsize) {
2152       /* ok window is filled so square as required and multiply  */
2153       /* square first */
2154       for (x = 0; x < winsize; x++) {
2155         if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
2156           goto __RES;
2157         }
2158         if ((err = redux (&res, P, mp)) != MP_OKAY) {
2159           goto __RES;
2160         }
2161       }
2162
2163       /* then multiply */
2164       if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
2165         goto __RES;
2166       }
2167       if ((err = redux (&res, P, mp)) != MP_OKAY) {
2168         goto __RES;
2169       }
2170
2171       /* empty window and reset */
2172       bitcpy = 0;
2173       bitbuf = 0;
2174       mode   = 1;
2175     }
2176   }
2177
2178   /* if bits remain then square/multiply */
2179   if (mode == 2 && bitcpy > 0) {
2180     /* square then multiply if the bit is set */
2181     for (x = 0; x < bitcpy; x++) {
2182       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
2183         goto __RES;
2184       }
2185       if ((err = redux (&res, P, mp)) != MP_OKAY) {
2186         goto __RES;
2187       }
2188
2189       /* get next bit of the window */
2190       bitbuf <<= 1;
2191       if ((bitbuf & (1 << winsize)) != 0) {
2192         /* then multiply */
2193         if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
2194           goto __RES;
2195         }
2196         if ((err = redux (&res, P, mp)) != MP_OKAY) {
2197           goto __RES;
2198         }
2199       }
2200     }
2201   }
2202
2203   if (redmode == 0) {
2204      /* fixup result if Montgomery reduction is used
2205       * recall that any value in a Montgomery system is
2206       * actually multiplied by R mod n.  So we have
2207       * to reduce one more time to cancel out the factor
2208       * of R.
2209       */
2210      if ((err = redux(&res, P, mp)) != MP_OKAY) {
2211        goto __RES;
2212      }
2213   }
2214
2215   /* swap res with Y */
2216   mp_exch (&res, Y);
2217   err = MP_OKAY;
2218 __RES:mp_clear (&res);
2219 __M:
2220   mp_clear(&M[1]);
2221   for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
2222     mp_clear (&M[x]);
2223   }
2224   return err;
2225 }
2226
2227 /* Greatest Common Divisor using the binary method */
2228 int mp_gcd (const mp_int * a, const mp_int * b, mp_int * c)
2229 {
2230   mp_int  u, v;
2231   int     k, u_lsb, v_lsb, res;
2232
2233   /* either zero than gcd is the largest */
2234   if (mp_iszero (a) == 1 && mp_iszero (b) == 0) {
2235     return mp_abs (b, c);
2236   }
2237   if (mp_iszero (a) == 0 && mp_iszero (b) == 1) {
2238     return mp_abs (a, c);
2239   }
2240
2241   /* optimized.  At this point if a == 0 then
2242    * b must equal zero too
2243    */
2244   if (mp_iszero (a) == 1) {
2245     mp_zero(c);
2246     return MP_OKAY;
2247   }
2248
2249   /* get copies of a and b we can modify */
2250   if ((res = mp_init_copy (&u, a)) != MP_OKAY) {
2251     return res;
2252   }
2253
2254   if ((res = mp_init_copy (&v, b)) != MP_OKAY) {
2255     goto __U;
2256   }
2257
2258   /* must be positive for the remainder of the algorithm */
2259   u.sign = v.sign = MP_ZPOS;
2260
2261   /* B1.  Find the common power of two for u and v */
2262   u_lsb = mp_cnt_lsb(&u);
2263   v_lsb = mp_cnt_lsb(&v);
2264   k     = MIN(u_lsb, v_lsb);
2265
2266   if (k > 0) {
2267      /* divide the power of two out */
2268      if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) {
2269         goto __V;
2270      }
2271
2272      if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) {
2273         goto __V;
2274      }
2275   }
2276
2277   /* divide any remaining factors of two out */
2278   if (u_lsb != k) {
2279      if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) {
2280         goto __V;
2281      }
2282   }
2283
2284   if (v_lsb != k) {
2285      if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) {
2286         goto __V;
2287      }
2288   }
2289
2290   while (mp_iszero(&v) == 0) {
2291      /* make sure v is the largest */
2292      if (mp_cmp_mag(&u, &v) == MP_GT) {
2293         /* swap u and v to make sure v is >= u */
2294         mp_exch(&u, &v);
2295      }
2296
2297      /* subtract smallest from largest */
2298      if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) {
2299         goto __V;
2300      }
2301
2302      /* Divide out all factors of two */
2303      if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) {
2304         goto __V;
2305      }
2306   }
2307
2308   /* multiply by 2**k which we divided out at the beginning */
2309   if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) {
2310      goto __V;
2311   }
2312   c->sign = MP_ZPOS;
2313   res = MP_OKAY;
2314 __V:mp_clear (&u);
2315 __U:mp_clear (&v);
2316   return res;
2317 }
2318
2319 /* get the lower 32-bits of an mp_int */
2320 unsigned long mp_get_int(const mp_int * a)
2321 {
2322   int i;
2323   unsigned long res;
2324
2325   if (a->used == 0) {
2326      return 0;
2327   }
2328
2329   /* get number of digits of the lsb we have to read */
2330   i = MIN(a->used,(int)((sizeof(unsigned long)*CHAR_BIT+DIGIT_BIT-1)/DIGIT_BIT))-1;
2331
2332   /* get most significant digit of result */
2333   res = DIGIT(a,i);
2334
2335   while (--i >= 0) {
2336     res = (res << DIGIT_BIT) | DIGIT(a,i);
2337   }
2338
2339   /* force result to 32-bits always so it is consistent on non 32-bit platforms */
2340   return res & 0xFFFFFFFFUL;
2341 }
2342
2343 /* creates "a" then copies b into it */
2344 int mp_init_copy (mp_int * a, const mp_int * b)
2345 {
2346   int     res;
2347
2348   if ((res = mp_init (a)) != MP_OKAY) {
2349     return res;
2350   }
2351   return mp_copy (b, a);
2352 }
2353
2354 int mp_init_multi(mp_int *mp, ...)
2355 {
2356     mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
2357     int n = 0;                 /* Number of ok inits */
2358     mp_int* cur_arg = mp;
2359     va_list args;
2360
2361     va_start(args, mp);        /* init args to next argument from caller */
2362     while (cur_arg != NULL) {
2363         if (mp_init(cur_arg) != MP_OKAY) {
2364             /* Oops - error! Back-track and mp_clear what we already
2365                succeeded in init-ing, then return error.
2366             */
2367             va_list clean_args;
2368
2369             /* now start cleaning up */
2370             cur_arg = mp;
2371             va_start(clean_args, mp);
2372             while (n--) {
2373                 mp_clear(cur_arg);
2374                 cur_arg = va_arg(clean_args, mp_int*);
2375             }
2376             va_end(clean_args);
2377             res = MP_MEM;
2378             break;
2379         }
2380         n++;
2381         cur_arg = va_arg(args, mp_int*);
2382     }
2383     va_end(args);
2384     return res;                /* Assumed ok, if error flagged above. */
2385 }
2386
2387 /* hac 14.61, pp608 */
2388 int mp_invmod (const mp_int * a, mp_int * b, mp_int * c)
2389 {
2390   /* b cannot be negative */
2391   if (b->sign == MP_NEG || mp_iszero(b) == 1) {
2392     return MP_VAL;
2393   }
2394
2395   /* if the modulus is odd we can use a faster routine instead */
2396   if (mp_isodd (b) == 1) {
2397     return fast_mp_invmod (a, b, c);
2398   }
2399
2400   return mp_invmod_slow(a, b, c);
2401 }
2402
2403 /* hac 14.61, pp608 */
2404 int mp_invmod_slow (const mp_int * a, mp_int * b, mp_int * c)
2405 {
2406   mp_int  x, y, u, v, A, B, C, D;
2407   int     res;
2408
2409   /* b cannot be negative */
2410   if (b->sign == MP_NEG || mp_iszero(b) == 1) {
2411     return MP_VAL;
2412   }
2413
2414   /* init temps */
2415   if ((res = mp_init_multi(&x, &y, &u, &v,
2416                            &A, &B, &C, &D, NULL)) != MP_OKAY) {
2417      return res;
2418   }
2419
2420   /* x = a, y = b */
2421   if ((res = mp_copy (a, &x)) != MP_OKAY) {
2422     goto __ERR;
2423   }
2424   if ((res = mp_copy (b, &y)) != MP_OKAY) {
2425     goto __ERR;
2426   }
2427
2428   /* 2. [modified] if x,y are both even then return an error! */
2429   if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
2430     res = MP_VAL;
2431     goto __ERR;
2432   }
2433
2434   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
2435   if ((res = mp_copy (&x, &u)) != MP_OKAY) {
2436     goto __ERR;
2437   }
2438   if ((res = mp_copy (&y, &v)) != MP_OKAY) {
2439     goto __ERR;
2440   }
2441   mp_set (&A, 1);
2442   mp_set (&D, 1);
2443
2444 top:
2445   /* 4.  while u is even do */
2446   while (mp_iseven (&u) == 1) {
2447     /* 4.1 u = u/2 */
2448     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
2449       goto __ERR;
2450     }
2451     /* 4.2 if A or B is odd then */
2452     if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) {
2453       /* A = (A+y)/2, B = (B-x)/2 */
2454       if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
2455          goto __ERR;
2456       }
2457       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
2458          goto __ERR;
2459       }
2460     }
2461     /* A = A/2, B = B/2 */
2462     if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
2463       goto __ERR;
2464     }
2465     if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
2466       goto __ERR;
2467     }
2468   }
2469
2470   /* 5.  while v is even do */
2471   while (mp_iseven (&v) == 1) {
2472     /* 5.1 v = v/2 */
2473     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
2474       goto __ERR;
2475     }
2476     /* 5.2 if C or D is odd then */
2477     if (mp_isodd (&C) == 1 || mp_isodd (&D) == 1) {
2478       /* C = (C+y)/2, D = (D-x)/2 */
2479       if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
2480          goto __ERR;
2481       }
2482       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
2483          goto __ERR;
2484       }
2485     }
2486     /* C = C/2, D = D/2 */
2487     if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
2488       goto __ERR;
2489     }
2490     if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
2491       goto __ERR;
2492     }
2493   }
2494
2495   /* 6.  if u >= v then */
2496   if (mp_cmp (&u, &v) != MP_LT) {
2497     /* u = u - v, A = A - C, B = B - D */
2498     if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
2499       goto __ERR;
2500     }
2501
2502     if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
2503       goto __ERR;
2504     }
2505
2506     if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
2507       goto __ERR;
2508     }
2509   } else {
2510     /* v - v - u, C = C - A, D = D - B */
2511     if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
2512       goto __ERR;
2513     }
2514
2515     if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
2516       goto __ERR;
2517     }
2518
2519     if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
2520       goto __ERR;
2521     }
2522   }
2523
2524   /* if not zero goto step 4 */
2525   if (mp_iszero (&u) == 0)
2526     goto top;
2527
2528   /* now a = C, b = D, gcd == g*v */
2529
2530   /* if v != 1 then there is no inverse */
2531   if (mp_cmp_d (&v, 1) != MP_EQ) {
2532     res = MP_VAL;
2533     goto __ERR;
2534   }
2535
2536   /* if it's too low */
2537   while (mp_cmp_d(&C, 0) == MP_LT) {
2538       if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
2539          goto __ERR;
2540       }
2541   }
2542
2543   /* too big */
2544   while (mp_cmp_mag(&C, b) != MP_LT) {
2545       if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
2546          goto __ERR;
2547       }
2548   }
2549
2550   /* C is now the inverse */
2551   mp_exch (&C, c);
2552   res = MP_OKAY;
2553 __ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
2554   return res;
2555 }
2556
2557 /* c = |a| * |b| using Karatsuba Multiplication using
2558  * three half size multiplications
2559  *
2560  * Let B represent the radix [e.g. 2**DIGIT_BIT] and
2561  * let n represent half of the number of digits in
2562  * the min(a,b)
2563  *
2564  * a = a1 * B**n + a0
2565  * b = b1 * B**n + b0
2566  *
2567  * Then, a * b =>
2568    a1b1 * B**2n + ((a1 - a0)(b1 - b0) + a0b0 + a1b1) * B + a0b0
2569  *
2570  * Note that a1b1 and a0b0 are used twice and only need to be
2571  * computed once.  So in total three half size (half # of
2572  * digit) multiplications are performed, a0b0, a1b1 and
2573  * (a1-b1)(a0-b0)
2574  *
2575  * Note that a multiplication of half the digits requires
2576  * 1/4th the number of single precision multiplications so in
2577  * total after one call 25% of the single precision multiplications
2578  * are saved.  Note also that the call to mp_mul can end up back
2579  * in this function if the a0, a1, b0, or b1 are above the threshold.
2580  * This is known as divide-and-conquer and leads to the famous
2581  * O(N**lg(3)) or O(N**1.584) work which is asymptotically lower than
2582  * the standard O(N**2) that the baseline/comba methods use.
2583  * Generally though the overhead of this method doesn't pay off
2584  * until a certain size (N ~ 80) is reached.
2585  */
2586 int mp_karatsuba_mul (const mp_int * a, const mp_int * b, mp_int * c)
2587 {
2588   mp_int  x0, x1, y0, y1, t1, x0y0, x1y1;
2589   int     B, err;
2590
2591   /* default the return code to an error */
2592   err = MP_MEM;
2593
2594   /* min # of digits */
2595   B = MIN (a->used, b->used);
2596
2597   /* now divide in two */
2598   B = B >> 1;
2599
2600   /* init copy all the temps */
2601   if (mp_init_size (&x0, B) != MP_OKAY)
2602     goto ERR;
2603   if (mp_init_size (&x1, a->used - B) != MP_OKAY)
2604     goto X0;
2605   if (mp_init_size (&y0, B) != MP_OKAY)
2606     goto X1;
2607   if (mp_init_size (&y1, b->used - B) != MP_OKAY)
2608     goto Y0;
2609
2610   /* init temps */
2611   if (mp_init_size (&t1, B * 2) != MP_OKAY)
2612     goto Y1;
2613   if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
2614     goto T1;
2615   if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
2616     goto X0Y0;
2617
2618   /* now shift the digits */
2619   x0.used = y0.used = B;
2620   x1.used = a->used - B;
2621   y1.used = b->used - B;
2622
2623   {
2624     register int x;
2625     register mp_digit *tmpa, *tmpb, *tmpx, *tmpy;
2626
2627     /* we copy the digits directly instead of using higher level functions
2628      * since we also need to shift the digits
2629      */
2630     tmpa = a->dp;
2631     tmpb = b->dp;
2632
2633     tmpx = x0.dp;
2634     tmpy = y0.dp;
2635     for (x = 0; x < B; x++) {
2636       *tmpx++ = *tmpa++;
2637       *tmpy++ = *tmpb++;
2638     }
2639
2640     tmpx = x1.dp;
2641     for (x = B; x < a->used; x++) {
2642       *tmpx++ = *tmpa++;
2643     }
2644
2645     tmpy = y1.dp;
2646     for (x = B; x < b->used; x++) {
2647       *tmpy++ = *tmpb++;
2648     }
2649   }
2650
2651   /* only need to clamp the lower words since by definition the
2652    * upper words x1/y1 must have a known number of digits
2653    */
2654   mp_clamp (&x0);
2655   mp_clamp (&y0);
2656
2657   /* now calc the products x0y0 and x1y1 */
2658   /* after this x0 is no longer required, free temp [x0==t2]! */
2659   if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)
2660     goto X1Y1;          /* x0y0 = x0*y0 */
2661   if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
2662     goto X1Y1;          /* x1y1 = x1*y1 */
2663
2664   /* now calc x1-x0 and y1-y0 */
2665   if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
2666     goto X1Y1;          /* t1 = x1 - x0 */
2667   if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
2668     goto X1Y1;          /* t2 = y1 - y0 */
2669   if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
2670     goto X1Y1;          /* t1 = (x1 - x0) * (y1 - y0) */
2671
2672   /* add x0y0 */
2673   if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
2674     goto X1Y1;          /* t2 = x0y0 + x1y1 */
2675   if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
2676     goto X1Y1;          /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
2677
2678   /* shift by B */
2679   if (mp_lshd (&t1, B) != MP_OKAY)
2680     goto X1Y1;          /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
2681   if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
2682     goto X1Y1;          /* x1y1 = x1y1 << 2*B */
2683
2684   if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
2685     goto X1Y1;          /* t1 = x0y0 + t1 */
2686   if (mp_add (&t1, &x1y1, c) != MP_OKAY)
2687     goto X1Y1;          /* t1 = x0y0 + t1 + x1y1 */
2688
2689   /* Algorithm succeeded set the return code to MP_OKAY */
2690   err = MP_OKAY;
2691
2692 X1Y1:mp_clear (&x1y1);
2693 X0Y0:mp_clear (&x0y0);
2694 T1:mp_clear (&t1);
2695 Y1:mp_clear (&y1);
2696 Y0:mp_clear (&y0);
2697 X1:mp_clear (&x1);
2698 X0:mp_clear (&x0);
2699 ERR:
2700   return err;
2701 }
2702
2703 /* Karatsuba squaring, computes b = a*a using three
2704  * half size squarings
2705  *
2706  * See comments of karatsuba_mul for details.  It
2707  * is essentially the same algorithm but merely
2708  * tuned to perform recursive squarings.
2709  */
2710 int mp_karatsuba_sqr (const mp_int * a, mp_int * b)
2711 {
2712   mp_int  x0, x1, t1, t2, x0x0, x1x1;
2713   int     B, err;
2714
2715   err = MP_MEM;
2716
2717   /* min # of digits */
2718   B = a->used;
2719
2720   /* now divide in two */
2721   B = B >> 1;
2722
2723   /* init copy all the temps */
2724   if (mp_init_size (&x0, B) != MP_OKAY)
2725     goto ERR;
2726   if (mp_init_size (&x1, a->used - B) != MP_OKAY)
2727     goto X0;
2728
2729   /* init temps */
2730   if (mp_init_size (&t1, a->used * 2) != MP_OKAY)
2731     goto X1;
2732   if (mp_init_size (&t2, a->used * 2) != MP_OKAY)
2733     goto T1;
2734   if (mp_init_size (&x0x0, B * 2) != MP_OKAY)
2735     goto T2;
2736   if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY)
2737     goto X0X0;
2738
2739   {
2740     register int x;
2741     register mp_digit *dst, *src;
2742
2743     src = a->dp;
2744
2745     /* now shift the digits */
2746     dst = x0.dp;
2747     for (x = 0; x < B; x++) {
2748       *dst++ = *src++;
2749     }
2750
2751     dst = x1.dp;
2752     for (x = B; x < a->used; x++) {
2753       *dst++ = *src++;
2754     }
2755   }
2756
2757   x0.used = B;
2758   x1.used = a->used - B;
2759
2760   mp_clamp (&x0);
2761
2762   /* now calc the products x0*x0 and x1*x1 */
2763   if (mp_sqr (&x0, &x0x0) != MP_OKAY)
2764     goto X1X1;           /* x0x0 = x0*x0 */
2765   if (mp_sqr (&x1, &x1x1) != MP_OKAY)
2766     goto X1X1;           /* x1x1 = x1*x1 */
2767
2768   /* now calc (x1-x0)**2 */
2769   if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
2770     goto X1X1;           /* t1 = x1 - x0 */
2771   if (mp_sqr (&t1, &t1) != MP_OKAY)
2772     goto X1X1;           /* t1 = (x1 - x0) * (x1 - x0) */
2773
2774   /* add x0y0 */
2775   if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
2776     goto X1X1;           /* t2 = x0x0 + x1x1 */
2777   if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
2778     goto X1X1;           /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */
2779
2780   /* shift by B */
2781   if (mp_lshd (&t1, B) != MP_OKAY)
2782     goto X1X1;           /* t1 = (x0x0 + x1x1 - (x1-x0)*(x1-x0))<<B */
2783   if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
2784     goto X1X1;           /* x1x1 = x1x1 << 2*B */
2785
2786   if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
2787     goto X1X1;           /* t1 = x0x0 + t1 */
2788   if (mp_add (&t1, &x1x1, b) != MP_OKAY)
2789     goto X1X1;           /* t1 = x0x0 + t1 + x1x1 */
2790
2791   err = MP_OKAY;
2792
2793 X1X1:mp_clear (&x1x1);
2794 X0X0:mp_clear (&x0x0);
2795 T2:mp_clear (&t2);
2796 T1:mp_clear (&t1);
2797 X1:mp_clear (&x1);
2798 X0:mp_clear (&x0);
2799 ERR:
2800   return err;
2801 }
2802
2803 /* computes least common multiple as |a*b|/(a, b) */
2804 int mp_lcm (const mp_int * a, const mp_int * b, mp_int * c)
2805 {
2806   int     res;
2807   mp_int  t1, t2;
2808
2809
2810   if ((res = mp_init_multi (&t1, &t2, NULL)) != MP_OKAY) {
2811     return res;
2812   }
2813
2814   /* t1 = get the GCD of the two inputs */
2815   if ((res = mp_gcd (a, b, &t1)) != MP_OKAY) {
2816     goto __T;
2817   }
2818
2819   /* divide the smallest by the GCD */
2820   if (mp_cmp_mag(a, b) == MP_LT) {
2821      /* store quotient in t2 so that t2 * b is the LCM */
2822      if ((res = mp_div(a, &t1, &t2, NULL)) != MP_OKAY) {
2823         goto __T;
2824      }
2825      res = mp_mul(b, &t2, c);
2826   } else {
2827      /* store quotient in t2 so that t2 * a is the LCM */
2828      if ((res = mp_div(b, &t1, &t2, NULL)) != MP_OKAY) {
2829         goto __T;
2830      }
2831      res = mp_mul(a, &t2, c);
2832   }
2833
2834   /* fix the sign to positive */
2835   c->sign = MP_ZPOS;
2836
2837 __T:
2838   mp_clear_multi (&t1, &t2, NULL);
2839   return res;
2840 }
2841
2842 /* c = a mod b, 0 <= c < b */
2843 int
2844 mp_mod (const mp_int * a, mp_int * b, mp_int * c)
2845 {
2846   mp_int  t;
2847   int     res;
2848
2849   if ((res = mp_init (&t)) != MP_OKAY) {
2850     return res;
2851   }
2852
2853   if ((res = mp_div (a, b, NULL, &t)) != MP_OKAY) {
2854     mp_clear (&t);
2855     return res;
2856   }
2857
2858   if (t.sign != b->sign) {
2859     res = mp_add (b, &t, c);
2860   } else {
2861     res = MP_OKAY;
2862     mp_exch (&t, c);
2863   }
2864
2865   mp_clear (&t);
2866   return res;
2867 }
2868
2869 static int
2870 mp_mod_d (const mp_int * a, mp_digit b, mp_digit * c)
2871 {
2872   return mp_div_d(a, b, NULL, c);
2873 }
2874
2875 /* b = a*2 */
2876 static int mp_mul_2(const mp_int * a, mp_int * b)
2877 {
2878   int     x, res, oldused;
2879
2880   /* grow to accommodate result */
2881   if (b->alloc < a->used + 1) {
2882     if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) {
2883       return res;
2884     }
2885   }
2886
2887   oldused = b->used;
2888   b->used = a->used;
2889
2890   {
2891     register mp_digit r, rr, *tmpa, *tmpb;
2892
2893     /* alias for source */
2894     tmpa = a->dp;
2895
2896     /* alias for dest */
2897     tmpb = b->dp;
2898
2899     /* carry */
2900     r = 0;
2901     for (x = 0; x < a->used; x++) {
2902
2903       /* get what will be the *next* carry bit from the
2904        * MSB of the current digit
2905        */
2906       rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
2907
2908       /* now shift up this digit, add in the carry [from the previous] */
2909       *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
2910
2911       /* copy the carry that would be from the source
2912        * digit into the next iteration
2913        */
2914       r = rr;
2915     }
2916
2917     /* new leading digit? */
2918     if (r != 0) {
2919       /* add a MSB which is always 1 at this point */
2920       *tmpb = 1;
2921       ++(b->used);
2922     }
2923
2924     /* now zero any excess digits on the destination
2925      * that we didn't write to
2926      */
2927     tmpb = b->dp + b->used;
2928     for (x = b->used; x < oldused; x++) {
2929       *tmpb++ = 0;
2930     }
2931   }
2932   b->sign = a->sign;
2933   return MP_OKAY;
2934 }
2935
2936 /*
2937  * shifts with subtractions when the result is greater than b.
2938  *
2939  * The method is slightly modified to shift B unconditionally up to just under
2940  * the leading bit of b.  This saves a lot of multiple precision shifting.
2941  */
2942 int mp_montgomery_calc_normalization (mp_int * a, const mp_int * b)
2943 {
2944   int     x, bits, res;
2945
2946   /* how many bits of last digit does b use */
2947   bits = mp_count_bits (b) % DIGIT_BIT;
2948
2949
2950   if (b->used > 1) {
2951      if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
2952         return res;
2953      }
2954   } else {
2955      mp_set(a, 1);
2956      bits = 1;
2957   }
2958
2959
2960   /* now compute C = A * B mod b */
2961   for (x = bits - 1; x < DIGIT_BIT; x++) {
2962     if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
2963       return res;
2964     }
2965     if (mp_cmp_mag (a, b) != MP_LT) {
2966       if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
2967         return res;
2968       }
2969     }
2970   }
2971
2972   return MP_OKAY;
2973 }
2974
2975 /* computes xR**-1 == x (mod N) via Montgomery Reduction */
2976 int
2977 mp_montgomery_reduce (mp_int * x, const mp_int * n, mp_digit rho)
2978 {
2979   int     ix, res, digs;
2980   mp_digit mu;
2981
2982   /* can the fast reduction [comba] method be used?
2983    *
2984    * Note that unlike in mul you're safely allowed *less*
2985    * than the available columns [255 per default] since carries
2986    * are fixed up in the inner loop.
2987    */
2988   digs = n->used * 2 + 1;
2989   if ((digs < MP_WARRAY) &&
2990       n->used <
2991       (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
2992     return fast_mp_montgomery_reduce (x, n, rho);
2993   }
2994
2995   /* grow the input as required */
2996   if (x->alloc < digs) {
2997     if ((res = mp_grow (x, digs)) != MP_OKAY) {
2998       return res;
2999     }
3000   }
3001   x->used = digs;
3002
3003   for (ix = 0; ix < n->used; ix++) {
3004     /* mu = ai * rho mod b
3005      *
3006      * The value of rho must be precalculated via
3007      * montgomery_setup() such that
3008      * it equals -1/n0 mod b this allows the
3009      * following inner loop to reduce the
3010      * input one digit at a time
3011      */
3012     mu = (mp_digit) (((mp_word)x->dp[ix]) * ((mp_word)rho) & MP_MASK);
3013
3014     /* a = a + mu * m * b**i */
3015     {
3016       register int iy;
3017       register mp_digit *tmpn, *tmpx, u;
3018       register mp_word r;
3019
3020       /* alias for digits of the modulus */
3021       tmpn = n->dp;
3022
3023       /* alias for the digits of x [the input] */
3024       tmpx = x->dp + ix;
3025
3026       /* set the carry to zero */
3027       u = 0;
3028
3029       /* Multiply and add in place */
3030       for (iy = 0; iy < n->used; iy++) {
3031         /* compute product and sum */
3032         r       = ((mp_word)mu) * ((mp_word)*tmpn++) +
3033                   ((mp_word) u) + ((mp_word) * tmpx);
3034
3035         /* get carry */
3036         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
3037
3038         /* fix digit */
3039         *tmpx++ = (mp_digit)(r & ((mp_word) MP_MASK));
3040       }
3041       /* At this point the ix'th digit of x should be zero */
3042
3043
3044       /* propagate carries upwards as required*/
3045       while (u) {
3046         *tmpx   += u;
3047         u        = *tmpx >> DIGIT_BIT;
3048         *tmpx++ &= MP_MASK;
3049       }
3050     }
3051   }
3052
3053   /* at this point the n.used'th least
3054    * significant digits of x are all zero
3055    * which means we can shift x to the
3056    * right by n.used digits and the
3057    * residue is unchanged.
3058    */
3059
3060   /* x = x/b**n.used */
3061   mp_clamp(x);
3062   mp_rshd (x, n->used);
3063
3064   /* if x >= n then x = x - n */
3065   if (mp_cmp_mag (x, n) != MP_LT) {
3066     return s_mp_sub (x, n, x);
3067   }
3068
3069   return MP_OKAY;
3070 }
3071
3072 /* setups the montgomery reduction stuff */
3073 int
3074 mp_montgomery_setup (const mp_int * n, mp_digit * rho)
3075 {
3076   mp_digit x, b;
3077
3078 /* fast inversion mod 2**k
3079  *
3080  * Based on the fact that
3081  *
3082  * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
3083  *                    =>  2*X*A - X*X*A*A = 1
3084  *                    =>  2*(1) - (1)     = 1
3085  */
3086   b = n->dp[0];
3087
3088   if ((b & 1) == 0) {
3089     return MP_VAL;
3090   }
3091
3092   x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
3093   x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
3094   x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
3095   x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
3096
3097   /* rho = -1/m mod b */
3098   *rho = (((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
3099
3100   return MP_OKAY;
3101 }
3102
3103 /* high level multiplication (handles sign) */
3104 int mp_mul (const mp_int * a, const mp_int * b, mp_int * c)
3105 {
3106   int     res, neg;
3107   neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
3108
3109   /* use Karatsuba? */
3110   if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) {
3111     res = mp_karatsuba_mul (a, b, c);
3112   } else
3113   {
3114     /* can we use the fast multiplier?
3115      *
3116      * The fast multiplier can be used if the output will
3117      * have less than MP_WARRAY digits and the number of
3118      * digits won't affect carry propagation
3119      */
3120     int     digs = a->used + b->used + 1;
3121
3122     if ((digs < MP_WARRAY) &&
3123         MIN(a->used, b->used) <=
3124         (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
3125       res = fast_s_mp_mul_digs (a, b, c, digs);
3126     } else
3127       res = s_mp_mul (a, b, c); /* uses s_mp_mul_digs */
3128   }
3129   c->sign = (c->used > 0) ? neg : MP_ZPOS;
3130   return res;
3131 }
3132
3133 /* d = a * b (mod c) */
3134 int
3135 mp_mulmod (const mp_int * a, const mp_int * b, mp_int * c, mp_int * d)
3136 {
3137   int     res;
3138   mp_int  t;
3139
3140   if ((res = mp_init (&t)) != MP_OKAY) {
3141     return res;
3142   }
3143
3144   if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
3145     mp_clear (&t);
3146     return res;
3147   }
3148   res = mp_mod (&t, c, d);
3149   mp_clear (&t);
3150   return res;
3151 }
3152
3153 /* table of first PRIME_SIZE primes */
3154 static const mp_digit __prime_tab[] = {
3155   0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
3156   0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
3157   0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
3158   0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
3159   0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
3160   0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
3161   0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
3162   0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
3163
3164   0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
3165   0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
3166   0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
3167   0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
3168   0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
3169   0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
3170   0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
3171   0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
3172
3173   0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
3174   0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
3175   0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
3176   0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
3177   0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
3178   0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
3179   0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
3180   0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
3181
3182   0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
3183   0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
3184   0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
3185   0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
3186   0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
3187   0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
3188   0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
3189   0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
3190 };
3191
3192 /* determines if an integers is divisible by one
3193  * of the first PRIME_SIZE primes or not
3194  *
3195  * sets result to 0 if not, 1 if yes
3196  */
3197 static int mp_prime_is_divisible (const mp_int * a, int *result)
3198 {
3199   int     err, ix;
3200   mp_digit res;
3201
3202   /* default to not */
3203   *result = MP_NO;
3204
3205   for (ix = 0; ix < PRIME_SIZE; ix++) {
3206     /* what is a mod __prime_tab[ix] */
3207     if ((err = mp_mod_d (a, __prime_tab[ix], &res)) != MP_OKAY) {
3208       return err;
3209     }
3210
3211     /* is the residue zero? */
3212     if (res == 0) {
3213       *result = MP_YES;
3214       return MP_OKAY;
3215     }
3216   }
3217
3218   return MP_OKAY;
3219 }
3220
3221 /* Miller-Rabin test of "a" to the base of "b" as described in
3222  * HAC pp. 139 Algorithm 4.24
3223  *
3224  * Sets result to 0 if definitely composite or 1 if probably prime.
3225  * Randomly the chance of error is no more than 1/4 and often
3226  * very much lower.
3227  */
3228 static int mp_prime_miller_rabin (mp_int * a, const mp_int * b, int *result)
3229 {
3230   mp_int  n1, y, r;
3231   int     s, j, err;
3232
3233   /* default */
3234   *result = MP_NO;
3235
3236   /* ensure b > 1 */
3237   if (mp_cmp_d(b, 1) != MP_GT) {
3238      return MP_VAL;
3239   }
3240
3241   /* get n1 = a - 1 */
3242   if ((err = mp_init_copy (&n1, a)) != MP_OKAY) {
3243     return err;
3244   }
3245   if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) {
3246     goto __N1;
3247   }
3248
3249   /* set 2**s * r = n1 */
3250   if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
3251     goto __N1;
3252   }
3253
3254   /* count the number of least significant bits
3255    * which are zero
3256    */
3257   s = mp_cnt_lsb(&r);
3258
3259   /* now divide n - 1 by 2**s */
3260   if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) {
3261     goto __R;
3262   }
3263
3264   /* compute y = b**r mod a */
3265   if ((err = mp_init (&y)) != MP_OKAY) {
3266     goto __R;
3267   }
3268   if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) {
3269     goto __Y;
3270   }
3271
3272   /* if y != 1 and y != n1 do */
3273   if (mp_cmp_d (&y, 1) != MP_EQ && mp_cmp (&y, &n1) != MP_EQ) {
3274     j = 1;
3275     /* while j <= s-1 and y != n1 */
3276     while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) {
3277       if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) {
3278          goto __Y;
3279       }
3280
3281       /* if y == 1 then composite */
3282       if (mp_cmp_d (&y, 1) == MP_EQ) {
3283          goto __Y;
3284       }
3285
3286       ++j;
3287     }
3288
3289     /* if y != n1 then composite */
3290     if (mp_cmp (&y, &n1) != MP_EQ) {
3291       goto __Y;
3292     }
3293   }
3294
3295   /* probably prime now */
3296   *result = MP_YES;
3297 __Y:mp_clear (&y);
3298 __R:mp_clear (&r);
3299 __N1:mp_clear (&n1);
3300   return err;
3301 }
3302
3303 /* performs a variable number of rounds of Miller-Rabin
3304  *
3305  * Probability of error after t rounds is no more than
3306
3307  *
3308  * Sets result to 1 if probably prime, 0 otherwise
3309  */
3310 static int mp_prime_is_prime (mp_int * a, int t, int *result)
3311 {
3312   mp_int  b;
3313   int     ix, err, res;
3314
3315   /* default to no */
3316   *result = MP_NO;
3317
3318   /* valid value of t? */
3319   if (t <= 0 || t > PRIME_SIZE) {
3320     return MP_VAL;
3321   }
3322
3323   /* is the input equal to one of the primes in the table? */
3324   for (ix = 0; ix < PRIME_SIZE; ix++) {
3325       if (mp_cmp_d(a, __prime_tab[ix]) == MP_EQ) {
3326          *result = 1;
3327          return MP_OKAY;
3328       }
3329   }
3330
3331   /* first perform trial division */
3332   if ((err = mp_prime_is_divisible (a, &res)) != MP_OKAY) {
3333     return err;
3334   }
3335
3336   /* return if it was trivially divisible */
3337   if (res == MP_YES) {
3338     return MP_OKAY;
3339   }
3340
3341   /* now perform the miller-rabin rounds */
3342   if ((err = mp_init (&b)) != MP_OKAY) {
3343     return err;
3344   }
3345
3346   for (ix = 0; ix < t; ix++) {
3347     /* set the prime */
3348     mp_set (&b, __prime_tab[ix]);
3349
3350     if ((err = mp_prime_miller_rabin (a, &b, &res)) != MP_OKAY) {
3351       goto __B;
3352     }
3353
3354     if (res == MP_NO) {
3355       goto __B;
3356     }
3357   }
3358
3359   /* passed the test */
3360   *result = MP_YES;
3361 __B:mp_clear (&b);
3362   return err;
3363 }
3364
3365 static const struct {
3366    int k, t;
3367 } sizes[] = {
3368 {   128,    28 },
3369 {   256,    16 },
3370 {   384,    10 },
3371 {   512,     7 },
3372 {   640,     6 },
3373 {   768,     5 },
3374 {   896,     4 },
3375 {  1024,     4 }
3376 };
3377
3378 /* returns # of RM trials required for a given bit size */
3379 int mp_prime_rabin_miller_trials(int size)
3380 {
3381    int x;
3382
3383    for (x = 0; x < (int)(sizeof(sizes)/(sizeof(sizes[0]))); x++) {
3384        if (sizes[x].k == size) {
3385           return sizes[x].t;
3386        } else if (sizes[x].k > size) {
3387           return (x == 0) ? sizes[0].t : sizes[x - 1].t;
3388        }
3389    }
3390    return sizes[x-1].t + 1;
3391 }
3392
3393 /* makes a truly random prime of a given size (bits),
3394  *
3395  * Flags are as follows:
3396  *
3397  *   LTM_PRIME_BBS      - make prime congruent to 3 mod 4
3398  *   LTM_PRIME_SAFE     - make sure (p-1)/2 is prime as well (implies LTM_PRIME_BBS)
3399  *   LTM_PRIME_2MSB_OFF - make the 2nd highest bit zero
3400  *   LTM_PRIME_2MSB_ON  - make the 2nd highest bit one
3401  *
3402  * You have to supply a callback which fills in a buffer with random bytes.  "dat" is a parameter you can
3403  * have passed to the callback (e.g. a state or something).  This function doesn't use "dat" itself
3404  * so it can be NULL
3405  *
3406  */
3407
3408 /* This is possibly the mother of all prime generation functions, muahahahahaha! */
3409 int mp_prime_random_ex(mp_int *a, int t, int size, int flags, ltm_prime_callback cb, void *dat)
3410 {
3411    unsigned char *tmp, maskAND, maskOR_msb, maskOR_lsb;
3412    int res, err, bsize, maskOR_msb_offset;
3413
3414    /* sanity check the input */
3415    if (size <= 1 || t <= 0) {
3416       return MP_VAL;
3417    }
3418
3419    /* LTM_PRIME_SAFE implies LTM_PRIME_BBS */
3420    if (flags & LTM_PRIME_SAFE) {
3421       flags |= LTM_PRIME_BBS;
3422    }
3423
3424    /* calc the byte size */
3425    bsize = (size>>3)+((size&7)?1:0);
3426
3427    /* we need a buffer of bsize bytes */
3428    tmp = HeapAlloc(GetProcessHeap(), 0, bsize);
3429    if (tmp == NULL) {
3430       return MP_MEM;
3431    }
3432
3433    /* calc the maskAND value for the MSbyte*/
3434    maskAND = ((size&7) == 0) ? 0xFF : (0xFF >> (8 - (size & 7)));
3435
3436    /* calc the maskOR_msb */
3437    maskOR_msb        = 0;
3438    maskOR_msb_offset = ((size & 7) == 1) ? 1 : 0;
3439    if (flags & LTM_PRIME_2MSB_ON) {
3440       maskOR_msb     |= 1 << ((size - 2) & 7);
3441    } else if (flags & LTM_PRIME_2MSB_OFF) {
3442       maskAND        &= ~(1 << ((size - 2) & 7));
3443    }
3444
3445    /* get the maskOR_lsb */
3446    maskOR_lsb         = 0;
3447    if (flags & LTM_PRIME_BBS) {
3448       maskOR_lsb     |= 3;
3449    }
3450
3451    do {
3452       /* read the bytes */
3453       if (cb(tmp, bsize, dat) != bsize) {
3454          err = MP_VAL;
3455          goto error;
3456       }
3457
3458       /* work over the MSbyte */
3459       tmp[0]    &= maskAND;
3460       tmp[0]    |= 1 << ((size - 1) & 7);
3461
3462       /* mix in the maskORs */
3463       tmp[maskOR_msb_offset]   |= maskOR_msb;
3464       tmp[bsize-1]             |= maskOR_lsb;
3465
3466       /* read it in */
3467       if ((err = mp_read_unsigned_bin(a, tmp, bsize)) != MP_OKAY)     { goto error; }
3468
3469       /* is it prime? */
3470       if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY)           { goto error; }
3471       if (res == MP_NO) {
3472          continue;
3473       }
3474
3475       if (flags & LTM_PRIME_SAFE) {
3476          /* see if (a-1)/2 is prime */
3477          if ((err = mp_sub_d(a, 1, a)) != MP_OKAY)                    { goto error; }
3478          if ((err = mp_div_2(a, a)) != MP_OKAY)                       { goto error; }
3479
3480          /* is it prime? */
3481          if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY)        { goto error; }
3482       }
3483    } while (res == MP_NO);
3484
3485    if (flags & LTM_PRIME_SAFE) {
3486       /* restore a to the original value */
3487       if ((err = mp_mul_2(a, a)) != MP_OKAY)                          { goto error; }
3488       if ((err = mp_add_d(a, 1, a)) != MP_OKAY)                       { goto error; }
3489    }
3490
3491    err = MP_OKAY;
3492 error:
3493    HeapFree(GetProcessHeap(), 0, tmp);
3494    return err;
3495 }
3496
3497 /* reads an unsigned char array, assumes the msb is stored first [big endian] */
3498 int
3499 mp_read_unsigned_bin (mp_int * a, const unsigned char *b, int c)
3500 {
3501   int     res;
3502
3503   /* make sure there are at least two digits */
3504   if (a->alloc < 2) {
3505      if ((res = mp_grow(a, 2)) != MP_OKAY) {
3506         return res;
3507      }
3508   }
3509
3510   /* zero the int */
3511   mp_zero (a);
3512
3513   /* read the bytes in */
3514   while (c-- > 0) {
3515     if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
3516       return res;
3517     }
3518
3519     a->dp[0] |= *b++;
3520     a->used += 1;
3521   }
3522   mp_clamp (a);
3523   return MP_OKAY;
3524 }
3525
3526 /* reduces x mod m, assumes 0 < x < m**2, mu is
3527  * precomputed via mp_reduce_setup.
3528  * From HAC pp.604 Algorithm 14.42
3529  */
3530 int
3531 mp_reduce (mp_int * x, const mp_int * m, const mp_int * mu)
3532 {
3533   mp_int  q;
3534   int     res, um = m->used;
3535
3536   /* q = x */
3537   if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
3538     return res;
3539   }
3540
3541   /* q1 = x / b**(k-1)  */
3542   mp_rshd (&q, um - 1);
3543
3544   /* according to HAC this optimization is ok */
3545   if (((unsigned long) um) > (((mp_digit)1) << (DIGIT_BIT - 1))) {
3546     if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) {
3547       goto CLEANUP;
3548     }
3549   } else {
3550     if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
3551       goto CLEANUP;
3552     }
3553   }
3554
3555   /* q3 = q2 / b**(k+1) */
3556   mp_rshd (&q, um + 1);
3557
3558   /* x = x mod b**(k+1), quick (no division) */
3559   if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
3560     goto CLEANUP;
3561   }
3562
3563   /* q = q * m mod b**(k+1), quick (no division) */
3564   if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) {
3565     goto CLEANUP;
3566   }
3567
3568   /* x = x - q */
3569   if ((res = mp_sub (x, &q, x)) != MP_OKAY) {
3570     goto CLEANUP;
3571   }
3572
3573   /* If x < 0, add b**(k+1) to it */
3574   if (mp_cmp_d (x, 0) == MP_LT) {
3575     mp_set (&q, 1);
3576     if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
3577       goto CLEANUP;
3578     if ((res = mp_add (x, &q, x)) != MP_OKAY)
3579       goto CLEANUP;
3580   }
3581
3582   /* Back off if it's too big */
3583   while (mp_cmp (x, m) != MP_LT) {
3584     if ((res = s_mp_sub (x, m, x)) != MP_OKAY) {
3585       goto CLEANUP;
3586     }
3587   }
3588
3589 CLEANUP:
3590   mp_clear (&q);
3591
3592   return res;
3593 }
3594
3595 /* reduces a modulo n where n is of the form 2**p - d */
3596 int
3597 mp_reduce_2k(mp_int *a, const mp_int *n, mp_digit d)
3598 {
3599    mp_int q;
3600    int    p, res;
3601
3602    if ((res = mp_init(&q)) != MP_OKAY) {
3603       return res;
3604    }
3605
3606    p = mp_count_bits(n);
3607 top:
3608    /* q = a/2**p, a = a mod 2**p */
3609    if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
3610       goto ERR;
3611    }
3612
3613    if (d != 1) {
3614       /* q = q * d */
3615       if ((res = mp_mul_d(&q, d, &q)) != MP_OKAY) {
3616          goto ERR;
3617       }
3618    }
3619
3620    /* a = a + q */
3621    if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
3622       goto ERR;
3623    }
3624
3625    if (mp_cmp_mag(a, n) != MP_LT) {
3626       s_mp_sub(a, n, a);
3627       goto top;
3628    }
3629
3630 ERR:
3631    mp_clear(&q);
3632    return res;
3633 }
3634
3635 /* determines the setup value */
3636 static int
3637 mp_reduce_2k_setup(const mp_int *a, mp_digit *d)
3638 {
3639    int res, p;
3640    mp_int tmp;
3641
3642    if ((res = mp_init(&tmp)) != MP_OKAY) {
3643       return res;
3644    }
3645
3646    p = mp_count_bits(a);
3647    if ((res = mp_2expt(&tmp, p)) != MP_OKAY) {
3648       mp_clear(&tmp);
3649       return res;
3650    }
3651
3652    if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) {
3653       mp_clear(&tmp);
3654       return res;
3655    }
3656
3657    *d = tmp.dp[0];
3658    mp_clear(&tmp);
3659    return MP_OKAY;
3660 }
3661
3662 /* pre-calculate the value required for Barrett reduction
3663  * For a given modulus "b" it calculates the value required in "a"
3664  */
3665 int mp_reduce_setup (mp_int * a, const mp_int * b)
3666 {
3667   int     res;
3668
3669   if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
3670     return res;
3671   }
3672   return mp_div (a, b, a, NULL);
3673 }
3674
3675 /* set to a digit */
3676 void mp_set (mp_int * a, mp_digit b)
3677 {
3678   mp_zero (a);
3679   a->dp[0] = b & MP_MASK;
3680   a->used  = (a->dp[0] != 0) ? 1 : 0;
3681 }
3682
3683 /* set a 32-bit const */
3684 int mp_set_int (mp_int * a, unsigned long b)
3685 {
3686   int     x, res;
3687
3688   mp_zero (a);
3689
3690   /* set four bits at a time */
3691   for (x = 0; x < 8; x++) {
3692     /* shift the number up four bits */
3693     if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) {
3694       return res;
3695     }
3696
3697     /* OR in the top four bits of the source */
3698     a->dp[0] |= (b >> 28) & 15;
3699
3700     /* shift the source up to the next four bits */
3701     b <<= 4;
3702
3703     /* ensure that digits are not clamped off */
3704     a->used += 1;
3705   }
3706   mp_clamp (a);
3707   return MP_OKAY;
3708 }
3709
3710 /* shrink a bignum */
3711 int mp_shrink (mp_int * a)
3712 {
3713   mp_digit *tmp;
3714   if (a->alloc != a->used && a->used > 0) {
3715     if ((tmp = HeapReAlloc(GetProcessHeap(), 0, a->dp, sizeof (mp_digit) * a->used)) == NULL) {
3716       return MP_MEM;
3717     }
3718     a->dp    = tmp;
3719     a->alloc = a->used;
3720   }
3721   return MP_OKAY;
3722 }
3723
3724 /* computes b = a*a */
3725 int
3726 mp_sqr (const mp_int * a, mp_int * b)
3727 {
3728   int     res;
3729
3730 if (a->used >= KARATSUBA_SQR_CUTOFF) {
3731     res = mp_karatsuba_sqr (a, b);
3732   } else
3733   {
3734     /* can we use the fast comba multiplier? */
3735     if ((a->used * 2 + 1) < MP_WARRAY &&
3736          a->used <
3737          (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
3738       res = fast_s_mp_sqr (a, b);
3739     } else
3740       res = s_mp_sqr (a, b);
3741   }
3742   b->sign = MP_ZPOS;
3743   return res;
3744 }
3745
3746 /* c = a * a (mod b) */
3747 int
3748 mp_sqrmod (const mp_int * a, mp_int * b, mp_int * c)
3749 {
3750   int     res;
3751   mp_int  t;
3752
3753   if ((res = mp_init (&t)) != MP_OKAY) {
3754     return res;
3755   }
3756
3757   if ((res = mp_sqr (a, &t)) != MP_OKAY) {
3758     mp_clear (&t);
3759     return res;
3760   }
3761   res = mp_mod (&t, b, c);
3762   mp_clear (&t);
3763   return res;
3764 }
3765
3766 /* high level subtraction (handles signs) */
3767 int
3768 mp_sub (mp_int * a, mp_int * b, mp_int * c)
3769 {
3770   int     sa, sb, res;
3771
3772   sa = a->sign;
3773   sb = b->sign;
3774
3775   if (sa != sb) {
3776     /* subtract a negative from a positive, OR */
3777     /* subtract a positive from a negative. */
3778     /* In either case, ADD their magnitudes, */
3779     /* and use the sign of the first number. */
3780     c->sign = sa;
3781     res = s_mp_add (a, b, c);
3782   } else {
3783     /* subtract a positive from a positive, OR */
3784     /* subtract a negative from a negative. */
3785     /* First, take the difference between their */
3786     /* magnitudes, then... */
3787     if (mp_cmp_mag (a, b) != MP_LT) {
3788       /* Copy the sign from the first */
3789       c->sign = sa;
3790       /* The first has a larger or equal magnitude */
3791       res = s_mp_sub (a, b, c);
3792     } else {
3793       /* The result has the *opposite* sign from */
3794       /* the first number. */
3795       c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
3796       /* The second has a larger magnitude */
3797       res = s_mp_sub (b, a, c);
3798     }
3799   }
3800   return res;
3801 }
3802
3803 /* single digit subtraction */
3804 int
3805 mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
3806 {
3807   mp_digit *tmpa, *tmpc, mu;
3808   int       res, ix, oldused;
3809
3810   /* grow c as required */
3811   if (c->alloc < a->used + 1) {
3812      if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
3813         return res;
3814      }
3815   }
3816
3817   /* if a is negative just do an unsigned
3818    * addition [with fudged signs]
3819    */
3820   if (a->sign == MP_NEG) {
3821      a->sign = MP_ZPOS;
3822      res     = mp_add_d(a, b, c);
3823      a->sign = c->sign = MP_NEG;
3824      return res;
3825   }
3826
3827   /* setup regs */
3828   oldused = c->used;
3829   tmpa    = a->dp;
3830   tmpc    = c->dp;
3831
3832   /* if a <= b simply fix the single digit */
3833   if ((a->used == 1 && a->dp[0] <= b) || a->used == 0) {
3834      if (a->used == 1) {
3835         *tmpc++ = b - *tmpa;
3836      } else {
3837         *tmpc++ = b;
3838      }
3839      ix      = 1;
3840
3841      /* negative/1digit */
3842      c->sign = MP_NEG;
3843      c->used = 1;
3844   } else {
3845      /* positive/size */
3846      c->sign = MP_ZPOS;
3847      c->used = a->used;
3848
3849      /* subtract first digit */
3850      *tmpc    = *tmpa++ - b;
3851      mu       = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
3852      *tmpc++ &= MP_MASK;
3853
3854      /* handle rest of the digits */
3855      for (ix = 1; ix < a->used; ix++) {
3856         *tmpc    = *tmpa++ - mu;
3857         mu       = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
3858         *tmpc++ &= MP_MASK;
3859      }
3860   }
3861
3862   /* zero excess digits */
3863   while (ix++ < oldused) {
3864      *tmpc++ = 0;
3865   }
3866   mp_clamp(c);
3867   return MP_OKAY;
3868 }
3869
3870 /* store in unsigned [big endian] format */
3871 int
3872 mp_to_unsigned_bin (const mp_int * a, unsigned char *b)
3873 {
3874   int     x, res;
3875   mp_int  t;
3876
3877   if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
3878     return res;
3879   }
3880
3881   x = 0;
3882   while (mp_iszero (&t) == 0) {
3883     b[x++] = (unsigned char) (t.dp[0] & 255);
3884     if ((res = mp_div_2d (&t, 8, &t, NULL)) != MP_OKAY) {
3885       mp_clear (&t);
3886       return res;
3887     }
3888   }
3889   bn_reverse (b, x);
3890   mp_clear (&t);
3891   return MP_OKAY;
3892 }
3893
3894 /* get the size for an unsigned equivalent */
3895 int
3896 mp_unsigned_bin_size (const mp_int * a)
3897 {
3898   int     size = mp_count_bits (a);
3899   return (size / 8 + ((size & 7) != 0 ? 1 : 0));
3900 }
3901
3902 /* reverse an array, used for radix code */
3903 static void
3904 bn_reverse (unsigned char *s, int len)
3905 {
3906   int     ix, iy;
3907   unsigned char t;
3908
3909   ix = 0;
3910   iy = len - 1;
3911   while (ix < iy) {
3912     t     = s[ix];
3913     s[ix] = s[iy];
3914     s[iy] = t;
3915     ++ix;
3916     --iy;
3917   }
3918 }
3919
3920 /* low level addition, based on HAC pp.594, Algorithm 14.7 */
3921 static int
3922 s_mp_add (mp_int * a, mp_int * b, mp_int * c)
3923 {
3924   mp_int *x;
3925   int     olduse, res, min, max;
3926
3927   /* find sizes, we let |a| <= |b| which means we have to sort
3928    * them.  "x" will point to the input with the most digits
3929    */
3930   if (a->used > b->used) {
3931     min = b->used;
3932     max = a->used;
3933     x = a;
3934   } else {
3935     min = a->used;
3936     max = b->used;
3937     x = b;
3938   }
3939
3940   /* init result */
3941   if (c->alloc < max + 1) {
3942     if ((res = mp_grow (c, max + 1)) != MP_OKAY) {
3943       return res;
3944     }
3945   }
3946
3947   /* get old used digit count and set new one */
3948   olduse = c->used;
3949   c->used = max + 1;
3950
3951   {
3952     register mp_digit u, *tmpa, *tmpb, *tmpc;
3953     register int i;
3954
3955     /* alias for digit pointers */
3956
3957     /* first input */
3958     tmpa = a->dp;
3959
3960     /* second input */
3961     tmpb = b->dp;
3962
3963     /* destination */
3964     tmpc = c->dp;
3965
3966     /* zero the carry */
3967     u = 0;
3968     for (i = 0; i < min; i++) {
3969       /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
3970       *tmpc = *tmpa++ + *tmpb++ + u;
3971
3972       /* U = carry bit of T[i] */
3973       u = *tmpc >> ((mp_digit)DIGIT_BIT);
3974
3975       /* take away carry bit from T[i] */
3976       *tmpc++ &= MP_MASK;
3977     }
3978
3979     /* now copy higher words if any, that is in A+B
3980      * if A or B has more digits add those in
3981      */
3982     if (min != max) {
3983       for (; i < max; i++) {
3984         /* T[i] = X[i] + U */
3985         *tmpc = x->dp[i] + u;
3986
3987         /* U = carry bit of T[i] */
3988         u = *tmpc >> ((mp_digit)DIGIT_BIT);
3989
3990         /* take away carry bit from T[i] */
3991         *tmpc++ &= MP_MASK;
3992       }
3993     }
3994
3995     /* add carry */
3996     *tmpc++ = u;
3997
3998     /* clear digits above oldused */
3999     for (i = c->used; i < olduse; i++) {
4000       *tmpc++ = 0;
4001     }
4002   }
4003
4004   mp_clamp (c);
4005   return MP_OKAY;
4006 }
4007
4008 static int s_mp_exptmod (const mp_int * G, const mp_int * X, mp_int * P, mp_int * Y)
4009 {
4010   mp_int  M[256], res, mu;
4011   mp_digit buf;
4012   int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
4013
4014   /* find window size */
4015   x = mp_count_bits (X);
4016   if (x <= 7) {
4017     winsize = 2;
4018   } else if (x <= 36) {
4019     winsize = 3;
4020   } else if (x <= 140) {
4021     winsize = 4;
4022   } else if (x <= 450) {
4023     winsize = 5;
4024   } else if (x <= 1303) {
4025     winsize = 6;
4026   } else if (x <= 3529) {
4027     winsize = 7;
4028   } else {
4029     winsize = 8;
4030   }
4031
4032   /* init M array */
4033   /* init first cell */
4034   if ((err = mp_init(&M[1])) != MP_OKAY) {
4035      return err;
4036   }
4037
4038   /* now init the second half of the array */
4039   for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
4040     if ((err = mp_init(&M[x])) != MP_OKAY) {
4041       for (y = 1<<(winsize-1); y < x; y++) {
4042         mp_clear (&M[y]);
4043       }
4044       mp_clear(&M[1]);
4045       return err;
4046     }
4047   }
4048
4049   /* create mu, used for Barrett reduction */
4050   if ((err = mp_init (&mu)) != MP_OKAY) {
4051     goto __M;
4052   }
4053   if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
4054     goto __MU;
4055   }
4056
4057   /* create M table
4058    *
4059    * The M table contains powers of the base,
4060    * e.g. M[x] = G**x mod P
4061    *
4062    * The first half of the table is not
4063    * computed though accept for M[0] and M[1]
4064    */
4065   if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
4066     goto __MU;
4067   }
4068
4069   /* compute the value at M[1<<(winsize-1)] by squaring
4070    * M[1] (winsize-1) times
4071    */
4072   if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
4073     goto __MU;
4074   }
4075
4076   for (x = 0; x < (winsize - 1); x++) {
4077     if ((err = mp_sqr (&M[1 << (winsize - 1)],
4078                        &M[1 << (winsize - 1)])) != MP_OKAY) {
4079       goto __MU;
4080     }
4081     if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
4082       goto __MU;
4083     }
4084   }
4085
4086   /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
4087    * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
4088    */
4089   for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
4090     if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
4091       goto __MU;
4092     }
4093     if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
4094       goto __MU;
4095     }
4096   }
4097
4098   /* setup result */
4099   if ((err = mp_init (&res)) != MP_OKAY) {
4100     goto __MU;
4101   }
4102   mp_set (&res, 1);
4103
4104   /* set initial mode and bit cnt */
4105   mode   = 0;
4106   bitcnt = 1;
4107   buf    = 0;
4108   digidx = X->used - 1;
4109   bitcpy = 0;
4110   bitbuf = 0;
4111
4112   for (;;) {
4113     /* grab next digit as required */
4114     if (--bitcnt == 0) {
4115       /* if digidx == -1 we are out of digits */
4116       if (digidx == -1) {
4117         break;
4118       }
4119       /* read next digit and reset the bitcnt */
4120       buf    = X->dp[digidx--];
4121       bitcnt = DIGIT_BIT;
4122     }
4123
4124     /* grab the next msb from the exponent */
4125     y     = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
4126     buf <<= (mp_digit)1;
4127
4128     /* if the bit is zero and mode == 0 then we ignore it
4129      * These represent the leading zero bits before the first 1 bit
4130      * in the exponent.  Technically this opt is not required but it
4131      * does lower the # of trivial squaring/reductions used
4132      */
4133     if (mode == 0 && y == 0) {
4134       continue;
4135     }
4136
4137     /* if the bit is zero and mode == 1 then we square */
4138     if (mode == 1 && y == 0) {
4139       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
4140         goto __RES;
4141       }
4142       if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4143         goto __RES;
4144       }
4145       continue;
4146     }
4147
4148     /* else we add it to the window */
4149     bitbuf |= (y << (winsize - ++bitcpy));
4150     mode    = 2;
4151
4152     if (bitcpy == winsize) {
4153       /* ok window is filled so square as required and multiply  */
4154       /* square first */
4155       for (x = 0; x < winsize; x++) {
4156         if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
4157           goto __RES;
4158         }
4159         if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4160           goto __RES;
4161         }
4162       }
4163
4164       /* then multiply */
4165       if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
4166         goto __RES;
4167       }
4168       if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4169         goto __RES;
4170       }
4171
4172       /* empty window and reset */
4173       bitcpy = 0;
4174       bitbuf = 0;
4175       mode   = 1;
4176     }
4177   }
4178
4179   /* if bits remain then square/multiply */
4180   if (mode == 2 && bitcpy > 0) {
4181     /* square then multiply if the bit is set */
4182     for (x = 0; x < bitcpy; x++) {
4183       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
4184         goto __RES;
4185       }
4186       if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4187         goto __RES;
4188       }
4189
4190       bitbuf <<= 1;
4191       if ((bitbuf & (1 << winsize)) != 0) {
4192         /* then multiply */
4193         if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
4194           goto __RES;
4195         }
4196         if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4197           goto __RES;
4198         }
4199       }
4200     }
4201   }
4202
4203   mp_exch (&res, Y);
4204   err = MP_OKAY;
4205 __RES:mp_clear (&res);
4206 __MU:mp_clear (&mu);
4207 __M:
4208   mp_clear(&M[1]);
4209   for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
4210     mp_clear (&M[x]);
4211   }
4212   return err;
4213 }
4214
4215 /* multiplies |a| * |b| and only computes up to digs digits of result
4216  * HAC pp. 595, Algorithm 14.12  Modified so you can control how
4217  * many digits of output are created.
4218  */
4219 static int
4220 s_mp_mul_digs (const mp_int * a, const mp_int * b, mp_int * c, int digs)
4221 {
4222   mp_int  t;
4223   int     res, pa, pb, ix, iy;
4224   mp_digit u;
4225   mp_word r;
4226   mp_digit tmpx, *tmpt, *tmpy;
4227
4228   /* can we use the fast multiplier? */
4229   if (((digs) < MP_WARRAY) &&
4230       MIN (a->used, b->used) <
4231           (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
4232     return fast_s_mp_mul_digs (a, b, c, digs);
4233   }
4234
4235   if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
4236     return res;
4237   }
4238   t.used = digs;
4239
4240   /* compute the digits of the product directly */
4241   pa = a->used;
4242   for (ix = 0; ix < pa; ix++) {
4243     /* set the carry to zero */
4244     u = 0;
4245
4246     /* limit ourselves to making digs digits of output */
4247     pb = MIN (b->used, digs - ix);
4248
4249     /* setup some aliases */
4250     /* copy of the digit from a used within the nested loop */
4251     tmpx = a->dp[ix];
4252
4253     /* an alias for the destination shifted ix places */
4254     tmpt = t.dp + ix;
4255
4256     /* an alias for the digits of b */
4257     tmpy = b->dp;
4258
4259     /* compute the columns of the output and propagate the carry */
4260     for (iy = 0; iy < pb; iy++) {
4261       /* compute the column as a mp_word */
4262       r       = ((mp_word)*tmpt) +
4263                 ((mp_word)tmpx) * ((mp_word)*tmpy++) +
4264                 ((mp_word) u);
4265
4266       /* the new column is the lower part of the result */
4267       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
4268
4269       /* get the carry word from the result */
4270       u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
4271     }
4272     /* set carry if it is placed below digs */
4273     if (ix + iy < digs) {
4274       *tmpt = u;
4275     }
4276   }
4277
4278   mp_clamp (&t);
4279   mp_exch (&t, c);
4280
4281   mp_clear (&t);
4282   return MP_OKAY;
4283 }
4284
4285 /* multiplies |a| * |b| and does not compute the lower digs digits
4286  * [meant to get the higher part of the product]
4287  */
4288 static int
4289 s_mp_mul_high_digs (const mp_int * a, const mp_int * b, mp_int * c, int digs)
4290 {
4291   mp_int  t;
4292   int     res, pa, pb, ix, iy;
4293   mp_digit u;
4294   mp_word r;
4295   mp_digit tmpx, *tmpt, *tmpy;
4296
4297   /* can we use the fast multiplier? */
4298   if (((a->used + b->used + 1) < MP_WARRAY)
4299       && MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
4300     return fast_s_mp_mul_high_digs (a, b, c, digs);
4301   }
4302
4303   if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
4304     return res;
4305   }
4306   t.used = a->used + b->used + 1;
4307
4308   pa = a->used;
4309   pb = b->used;
4310   for (ix = 0; ix < pa; ix++) {
4311     /* clear the carry */
4312     u = 0;
4313
4314     /* left hand side of A[ix] * B[iy] */
4315     tmpx = a->dp[ix];
4316
4317     /* alias to the address of where the digits will be stored */
4318     tmpt = &(t.dp[digs]);
4319
4320     /* alias for where to read the right hand side from */
4321     tmpy = b->dp + (digs - ix);
4322
4323     for (iy = digs - ix; iy < pb; iy++) {
4324       /* calculate the double precision result */
4325       r       = ((mp_word)*tmpt) +
4326                 ((mp_word)tmpx) * ((mp_word)*tmpy++) +
4327                 ((mp_word) u);
4328
4329       /* get the lower part */
4330       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
4331
4332       /* carry the carry */
4333       u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
4334     }
4335     *tmpt = u;
4336   }
4337   mp_clamp (&t);
4338   mp_exch (&t, c);
4339   mp_clear (&t);
4340   return MP_OKAY;
4341 }
4342
4343 /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
4344 static int
4345 s_mp_sqr (const mp_int * a, mp_int * b)
4346 {
4347   mp_int  t;
4348   int     res, ix, iy, pa;
4349   mp_word r;
4350   mp_digit u, tmpx, *tmpt;
4351
4352   pa = a->used;
4353   if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) {
4354     return res;
4355   }
4356
4357   /* default used is maximum possible size */
4358   t.used = 2*pa + 1;
4359
4360   for (ix = 0; ix < pa; ix++) {
4361     /* first calculate the digit at 2*ix */
4362     /* calculate double precision result */
4363     r = ((mp_word) t.dp[2*ix]) +
4364         ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
4365
4366     /* store lower part in result */
4367     t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
4368
4369     /* get the carry */
4370     u           = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
4371
4372     /* left hand side of A[ix] * A[iy] */
4373     tmpx        = a->dp[ix];
4374
4375     /* alias for where to store the results */
4376     tmpt        = t.dp + (2*ix + 1);
4377
4378     for (iy = ix + 1; iy < pa; iy++) {
4379       /* first calculate the product */
4380       r       = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
4381
4382       /* now calculate the double precision result, note we use
4383        * addition instead of *2 since it's easier to optimize
4384        */
4385       r       = ((mp_word) *tmpt) + r + r + ((mp_word) u);
4386
4387       /* store lower part */
4388       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
4389
4390       /* get carry */
4391       u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
4392     }
4393     /* propagate upwards */
4394     while (u != 0) {
4395       r       = ((mp_word) *tmpt) + ((mp_word) u);
4396       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
4397       u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
4398     }
4399   }
4400
4401   mp_clamp (&t);
4402   mp_exch (&t, b);
4403   mp_clear (&t);
4404   return MP_OKAY;
4405 }
4406
4407 /* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
4408 int
4409 s_mp_sub (const mp_int * a, const mp_int * b, mp_int * c)
4410 {
4411   int     olduse, res, min, max;
4412
4413   /* find sizes */
4414   min = b->used;
4415   max = a->used;
4416
4417   /* init result */
4418   if (c->alloc < max) {
4419     if ((res = mp_grow (c, max)) != MP_OKAY) {
4420       return res;
4421     }
4422   }
4423   olduse = c->used;
4424   c->used = max;
4425
4426   {
4427     register mp_digit u, *tmpa, *tmpb, *tmpc;
4428     register int i;
4429
4430     /* alias for digit pointers */
4431     tmpa = a->dp;
4432     tmpb = b->dp;
4433     tmpc = c->dp;
4434
4435     /* set carry to zero */
4436     u = 0;
4437     for (i = 0; i < min; i++) {
4438       /* T[i] = A[i] - B[i] - U */
4439       *tmpc = *tmpa++ - *tmpb++ - u;
4440
4441       /* U = carry bit of T[i]
4442        * Note this saves performing an AND operation since
4443        * if a carry does occur it will propagate all the way to the
4444        * MSB.  As a result a single shift is enough to get the carry
4445        */
4446       u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
4447
4448       /* Clear carry from T[i] */
4449       *tmpc++ &= MP_MASK;
4450     }
4451
4452     /* now copy higher words if any, e.g. if A has more digits than B  */
4453     for (; i < max; i++) {
4454       /* T[i] = A[i] - U */
4455       *tmpc = *tmpa++ - u;
4456
4457       /* U = carry bit of T[i] */
4458       u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
4459
4460       /* Clear carry from T[i] */
4461       *tmpc++ &= MP_MASK;
4462     }
4463
4464     /* clear digits above used (since we may not have grown result above) */
4465     for (i = c->used; i < olduse; i++) {
4466       *tmpc++ = 0;
4467     }
4468   }
4469
4470   mp_clamp (c);
4471   return MP_OKAY;
4472 }