release/src/router/matrixssl/src/crypto/peersec/mpi.c

   1 /*
   2  *      mpi.c
   3  *      Release $Name: MATRIXSSL_1_8_8_OPEN $
   4  *
   5  *      multiple-precision integer library
   6  */
   7 /*
   8  *      Copyright (c) PeerSec Networks, 2002-2009. All Rights Reserved.
   9  *      The latest version of this code is available at http://www.matrixssl.org
  10  *
  11  *      This software is open source; you can redistribute it and/or modify
  12  *      it under the terms of the GNU General Public License as published by
  13  *      the Free Software Foundation; either version 2 of the License, or
  14  *      (at your option) any later version.
  15  *
  16  *      This General Public License does NOT permit incorporating this software
  17  *      into proprietary programs.  If you are unable to comply with the GPL, a
  18  *      commercial license for this software may be purchased from PeerSec Networks
  19  *      at http://www.peersec.com
  20  *
  21  *      This program is distributed in WITHOUT ANY WARRANTY; without even the
  22  *      implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  23  *      See the GNU General Public License for more details.
  24  *
  25  *      You should have received a copy of the GNU General Public License
  26  *      along with this program; if not, write to the Free Software
  27  *      Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  28  *      http://www.gnu.org/copyleft/gpl.html
  29  */
  30 /******************************************************************************/
  31
  32 #include "../cryptoLayer.h"
  33 #include <stdarg.h>
  34
  35 #ifndef USE_MPI2
  36
  37 static int32 mp_exptmod_fast (psPool_t *pool, mp_int * G, mp_int * X,
  38                                 mp_int * P, mp_int * Y, int32 redmode);
  39
  40 /******************************************************************************/
  41 /*
  42         FUTURE
  43         1. Convert the mp_init and mp_clear functions to not use malloc + free,
  44         but to use static storage within the bignum variable instead - but
  45         how to handle grow()?  Maybe use a simple memory allocator
  46         2. verify stack usage of all functions and use of MP_LOW_MEM:
  47                 fast_mp_montgomery_reduce
  48                 fast_s_mp_mul_digs
  49                 fast_s_mp_sqr
  50                 fast_s_mp_mul_high_digs
  51         3. HAC stands for Handbook of Applied Cryptography
  52                 http://www.cacr.math.uwaterloo.ca/hac/
  53 */
  54 /******************************************************************************/
  55 /*
  56         Utility functions
  57 */
  58 void psZeromem(void *dst, size_t len)
  59 {
  60         unsigned char *mem = (unsigned char *)dst;
  61
  62         if (dst == NULL) {
  63                 return;
  64         }
  65         while (len-- > 0) {
  66                 *mem++ = 0;
  67         }
  68 }
  69
  70 void psBurnStack(unsigned long len)
  71 {
  72         unsigned char buf[32];
  73
  74         psZeromem(buf, sizeof(buf));
  75         if (len > (unsigned long)sizeof(buf)) {
  76                 psBurnStack(len - sizeof(buf));
  77         }
  78 }
  79
  80 /******************************************************************************/
  81 /*
  82         Multiple precision integer functions
  83         Note: we don't use va_args here to prevent portability issues.
  84 */
  85 int32 _mp_init_multi(psPool_t *pool, mp_int *mp0, mp_int *mp1, mp_int *mp2,
  86                                         mp_int *mp3, mp_int *mp4, mp_int *mp5,
  87                                         mp_int *mp6, mp_int *mp7)
  88 {
  89         mp_err  res             = MP_OKAY;              /* Assume ok until proven otherwise */
  90         int32           n               = 0;                    /* Number of ok inits */
  91         mp_int  *tempArray[9];
  92
  93         tempArray[0] = mp0;
  94         tempArray[1] = mp1;
  95         tempArray[2] = mp2;
  96         tempArray[3] = mp3;
  97         tempArray[4] = mp4;
  98         tempArray[5] = mp5;
  99         tempArray[6] = mp6;
 100         tempArray[7] = mp7;
 101         tempArray[8] = NULL;
 102
 103         while (tempArray[n] != NULL) {
 104                 if (mp_init(pool, tempArray[n]) != MP_OKAY) {
 105                         res = MP_MEM;
 106                         break;
 107                 }
 108                 n++;
 109         }
 110
 111         if (res == MP_MEM) {
 112                 n = 0;
 113                 while (tempArray[n] != NULL) {
 114                         mp_clear(tempArray[n]);
 115                         n++;
 116                 }
 117         }
 118         return res;             /* Assumed ok, if error flagged above. */
 119 }
 120 /******************************************************************************/
 121 /*
 122         Reads a unsigned char array, assumes the msb is stored first [big endian]
 123  */
 124 int32 mp_read_unsigned_bin (mp_int * a, unsigned char *b, int32 c)
 125 {
 126         int32           res;
 127
 128 /*
 129         Make sure there are at least two digits.
 130  */
 131         if (a->alloc < 2) {
 132                 if ((res = mp_grow(a, 2)) != MP_OKAY) {
 133                         return res;
 134                 }
 135         }
 136
 137 /*
 138         Zero the int32.
 139  */
 140         mp_zero (a);
 141
 142 /*
 143         read the bytes in
 144  */
 145         while (c-- > 0) {
 146                 if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
 147                         return res;
 148                 }
 149
 150 #ifndef MP_8BIT
 151                 a->dp[0] |= *b++;
 152                 a->used += 1;
 153 #else
 154                 a->dp[0] = (*b & MP_MASK);
 155                 a->dp[1] |= ((*b++ >> 7U) & 1);
 156                 a->used += 2;
 157 #endif /* MP_8BIT */
 158         }
 159         mp_clamp (a);
 160         return MP_OKAY;
 161 }
 162
 163 /******************************************************************************/
 164 /*
 165         Compare two ints (signed)
 166  */
 167 int32 mp_cmp (mp_int * a, mp_int * b)
 168 {
 169 /*
 170         compare based on sign
 171  */
 172         if (a->sign != b->sign) {
 173                 if (a->sign == MP_NEG) {
 174                         return MP_LT;
 175                 } else {
 176                         return MP_GT;
 177                 }
 178         }
 179
 180 /*
 181         compare digits
 182  */
 183         if (a->sign == MP_NEG) {
 184                 /* if negative compare opposite direction */
 185                 return mp_cmp_mag(b, a);
 186         } else {
 187                 return mp_cmp_mag(a, b);
 188         }
 189 }
 190
 191 /******************************************************************************/
 192 /*
 193         Store in unsigned [big endian] format.
 194 */
 195 int32 mp_to_unsigned_bin(psPool_t *pool, mp_int * a, unsigned char *b)
 196 {
 197         int32                   x, res;
 198         mp_int          t;
 199
 200         if ((res = mp_init_copy(pool, &t, a)) != MP_OKAY) {
 201                 return res;
 202         }
 203
 204         x = 0;
 205         while (mp_iszero (&t) == 0) {
 206 #ifndef MP_8BIT
 207                 b[x++] = (unsigned char) (t.dp[0] & 255);
 208 #else
 209                 b[x++] = (unsigned char) (t.dp[0] | ((t.dp[1] & 0x01) << 7));
 210 #endif /* MP_8BIT */
 211                 if ((res = mp_div_2d (pool, &t, 8, &t, NULL)) != MP_OKAY) {
 212                         mp_clear (&t);
 213                         return res;
 214                 }
 215         }
 216         bn_reverse (b, x);
 217         mp_clear (&t);
 218         return MP_OKAY;
 219 }
 220
 221 void _mp_clear_multi(mp_int *mp0, mp_int *mp1, mp_int *mp2, mp_int *mp3,
 222                                   mp_int *mp4, mp_int *mp5, mp_int *mp6, mp_int *mp7)
 223 {
 224         int32           n               = 0;            /* Number of ok inits */
 225
 226         mp_int  *tempArray[9];
 227
 228         tempArray[0] = mp0;
 229         tempArray[1] = mp1;
 230         tempArray[2] = mp2;
 231         tempArray[3] = mp3;
 232         tempArray[4] = mp4;
 233         tempArray[5] = mp5;
 234         tempArray[6] = mp6;
 235         tempArray[7] = mp7;
 236         tempArray[8] = NULL;
 237
 238         for (n = 0; tempArray[n] != NULL; n++) {
 239                 mp_clear(tempArray[n]);
 240         }
 241 }
 242
 243 /******************************************************************************/
 244 /*
 245         Init a new mp_int.
 246 */
 247 int32 mp_init (psPool_t *pool, mp_int * a)
 248 {
 249         int32           i;
 250 /*
 251         allocate memory required and clear it
 252  */
 253         a->dp = OPT_CAST(mp_digit) psMalloc(pool, sizeof (mp_digit) * MP_PREC);
 254         if (a->dp == NULL) {
 255                 return MP_MEM;
 256         }
 257
 258 /*
 259         set the digits to zero
 260  */
 261         for (i = 0; i < MP_PREC; i++) {
 262                 a->dp[i] = 0;
 263         }
 264 /*
 265         set the used to zero, allocated digits to the default precision and sign
 266         to positive
 267  */
 268         a->used  = 0;
 269         a->alloc = MP_PREC;
 270         a->sign  = MP_ZPOS;
 271
 272         return MP_OKAY;
 273 }
 274
 275 /******************************************************************************/
 276 /*
 277         clear one (frees).
 278  */
 279 void mp_clear (mp_int * a)
 280 {
 281         int32           i;
 282 /*
 283         only do anything if a hasn't been freed previously
 284  */
 285         if (a->dp != NULL) {
 286 /*
 287                 first zero the digits
 288  */
 289                 for (i = 0; i < a->used; i++) {
 290                         a->dp[i] = 0;
 291                 }
 292
 293                 /* free ram */
 294                 psFree (a->dp);
 295
 296 /*
 297                 reset members to make debugging easier
 298  */
 299                 a->dp           = NULL;
 300                 a->alloc        = a->used = 0;
 301                 a->sign         = MP_ZPOS;
 302         }
 303 }
 304
 305 /******************************************************************************/
 306 /*
 307         Get the size for an unsigned equivalent.
 308  */
 309 int32 mp_unsigned_bin_size (mp_int * a)
 310 {
 311         int32           size = mp_count_bits (a);
 312
 313         return  (size / 8 + ((size & 7) != 0 ? 1 : 0));
 314 }
 315
 316 /******************************************************************************/
 317 /*
 318         Trim unused digits
 319
 320         This is used to ensure that leading zero digits are trimed and the
 321         leading "used" digit will be non-zero. Typically very fast.  Also fixes
 322         the sign if there are no more leading digits
 323 */
 324 void mp_clamp (mp_int * a)
 325 {
 326 /*
 327         decrease used while the most significant digit is zero.
 328  */
 329         while (a->used > 0 && a->dp[a->used - 1] == 0) {
 330                 --(a->used);
 331         }
 332
 333 /*
 334         reset the sign flag if used == 0
 335  */
 336         if (a->used == 0) {
 337                 a->sign = MP_ZPOS;
 338         }
 339 }
 340
 341 /******************************************************************************/
 342 /*
 343         Shift left by a certain bit count.
 344  */
 345 int32 mp_mul_2d (mp_int * a, int32 b, mp_int * c)
 346 {
 347         mp_digit        d;
 348         int32                   res;
 349
 350 /*
 351         Copy
 352  */
 353         if (a != c) {
 354                 if ((res = mp_copy (a, c)) != MP_OKAY) {
 355                         return res;
 356                 }
 357         }
 358
 359         if (c->alloc < (int32)(c->used + b/DIGIT_BIT + 1)) {
 360                 if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
 361                         return res;
 362                 }
 363         }
 364
 365 /*
 366         Shift by as many digits in the bit count
 367  */
 368         if (b >= (int32)DIGIT_BIT) {
 369                 if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) {
 370                         return res;
 371                 }
 372         }
 373
 374 /*
 375         shift any bit count < DIGIT_BIT
 376  */
 377         d = (mp_digit) (b % DIGIT_BIT);
 378         if (d != 0) {
 379                 register mp_digit *tmpc, shift, mask, r, rr;
 380                 register int32 x;
 381
 382 /*
 383                 bitmask for carries
 384  */
 385                 mask = (((mp_digit)1) << d) - 1;
 386
 387 /*
 388                 shift for msbs
 389  */
 390                 shift = DIGIT_BIT - d;
 391
 392                 /* alias */
 393                 tmpc = c->dp;
 394
 395                 /* carry */
 396                 r = 0;
 397                 for (x = 0; x < c->used; x++) {
 398 /*
 399                         get the higher bits of the current word
 400  */
 401                         rr = (*tmpc >> shift) & mask;
 402
 403 /*
 404                         shift the current word and OR in the carry
 405  */
 406                         *tmpc = ((*tmpc << d) | r) & MP_MASK;
 407                         ++tmpc;
 408
 409 /*
 410                         set the carry to the carry bits of the current word
 411  */
 412                         r = rr;
 413                 }
 414
 415 /*
 416                 set final carry
 417  */
 418                 if (r != 0) {
 419                         c->dp[(c->used)++] = r;
 420                 }
 421         }
 422         mp_clamp (c);
 423         return MP_OKAY;
 424 }
 425
 426 /******************************************************************************/
 427 /*
 428         Set to zero.
 429  */
 430 void mp_zero (mp_int * a)
 431 {
 432         int                     n;
 433         mp_digit        *tmp;
 434
 435         a->sign = MP_ZPOS;
 436         a->used = 0;
 437
 438         tmp = a->dp;
 439         for (n = 0; n < a->alloc; n++) {
 440                 *tmp++ = 0;
 441         }
 442 }
 443
 444 #ifdef MP_LOW_MEM
 445 #define TAB_SIZE 32
 446 #else
 447 #define TAB_SIZE 256
 448 #endif /* MP_LOW_MEM */
 449
 450 /******************************************************************************/
 451 /*
 452         Compare maginitude of two ints (unsigned).
 453  */
 454 int32 mp_cmp_mag (mp_int * a, mp_int * b)
 455 {
 456         int32                   n;
 457         mp_digit        *tmpa, *tmpb;
 458
 459 /*
 460         compare based on # of non-zero digits
 461  */
 462         if (a->used > b->used) {
 463                 return MP_GT;
 464         }
 465
 466         if (a->used < b->used) {
 467                 return MP_LT;
 468         }
 469
 470         /* alias for a */
 471         tmpa = a->dp + (a->used - 1);
 472
 473         /* alias for b */
 474         tmpb = b->dp + (a->used - 1);
 475
 476 /*
 477         compare based on digits
 478  */
 479         for (n = 0; n < a->used; ++n, --tmpa, --tmpb) {
 480                 if (*tmpa > *tmpb) {
 481                         return MP_GT;
 482                 }
 483
 484                 if (*tmpa < *tmpb) {
 485                         return MP_LT;
 486                 }
 487         }
 488         return MP_EQ;
 489 }
 490
 491 /******************************************************************************/
 492 /*
 493         computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
 494
 495         Uses a left-to-right k-ary sliding window to compute the modular
 496         exponentiation. The value of k changes based on the size of the exponent.
 497
 498         Uses Montgomery or Diminished Radix reduction [whichever appropriate]
 499 */
 500 int32 mp_exptmod(psPool_t *pool, mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
 501 {
 502
 503 /*
 504         modulus P must be positive
 505  */
 506         if (P->sign == MP_NEG) {
 507                 return MP_VAL;
 508         }
 509
 510 /*
 511         if exponent X is negative we have to recurse
 512  */
 513         if (X->sign == MP_NEG) {
 514                 mp_int tmpG, tmpX;
 515                 int32 err;
 516
 517 /*
 518                 first compute 1/G mod P
 519  */
 520                 if ((err = mp_init(pool, &tmpG)) != MP_OKAY) {
 521                         return err;
 522                 }
 523                 if ((err = mp_invmod(pool, G, P, &tmpG)) != MP_OKAY) {
 524                         mp_clear(&tmpG);
 525                         return err;
 526                 }
 527
 528 /*
 529                 now get |X|
 530  */
 531                 if ((err = mp_init(pool, &tmpX)) != MP_OKAY) {
 532                         mp_clear(&tmpG);
 533                         return err;
 534                 }
 535                 if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
 536                         mp_clear(&tmpG);
 537                         mp_clear(&tmpX);
 538                         return err;
 539                 }
 540
 541 /*
 542                 and now compute (1/G)**|X| instead of G**X [X < 0]
 543  */
 544                 err = mp_exptmod(pool, &tmpG, &tmpX, P, Y);
 545                 mp_clear(&tmpG);
 546                 mp_clear(&tmpX);
 547                 return err;
 548         }
 549
 550 /*
 551         if the modulus is odd or dr != 0 use the fast method
 552  */
 553         if (mp_isodd (P) == 1) {
 554                 return mp_exptmod_fast (pool, G, X, P, Y, 0);
 555         } else {
 556 /*
 557                 no exptmod for evens
 558  */
 559                 return MP_VAL;
 560         }
 561 }
 562
 563 /******************************************************************************/
 564 /*
 565         Call only from mp_exptmod to make sure this fast version qualifies
 566 */
 567 static int32 mp_exptmod_fast(psPool_t *pool, mp_int * G, mp_int * X,
 568                                 mp_int * P, mp_int * Y, int32 redmode)
 569 {
 570         mp_int          M[TAB_SIZE], res;
 571         mp_digit        buf, mp;
 572         int32           err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
 573
 574
 575 /*
 576         use a pointer to the reduction algorithm.  This allows us to use
 577         one of many reduction algorithms without modding the guts of
 578         the code with if statements everywhere.
 579  */
 580         int32   (*redux)(mp_int*,mp_int*,mp_digit);
 581
 582 /*
 583         find window size
 584  */
 585         x = mp_count_bits (X);
 586         if (x <= 7) {
 587                 winsize = 2;
 588         } else if (x <= 36) {
 589                 winsize = 3;
 590         } else if (x <= 140) {
 591                 winsize = 4;
 592         } else if (x <= 450) {
 593                 winsize = 5;
 594         } else if (x <= 1303) {
 595                 winsize = 6;
 596         } else if (x <= 3529) {
 597                 winsize = 7;
 598         } else {
 599                 winsize = 8;
 600         }
 601
 602 #ifdef MP_LOW_MEM
 603         if (winsize > 5) {
 604                 winsize = 5;
 605         }
 606 #endif
 607
 608 /*
 609         init M array
 610         init first cell
 611  */
 612         if ((err = mp_init(pool, &M[1])) != MP_OKAY) {
 613                 return err;
 614         }
 615
 616 /*
 617         now init the second half of the array
 618  */
 619         for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
 620                 if ((err = mp_init(pool, &M[x])) != MP_OKAY) {
 621                         for (y = 1<<(winsize-1); y < x; y++) {
 622                                 mp_clear(&M[y]);
 623                         }
 624                         mp_clear(&M[1]);
 625                         return err;
 626                 }
 627         }
 628
 629
 630 /*
 631         now setup montgomery
 632  */
 633         if ((err = mp_montgomery_setup(P, &mp)) != MP_OKAY) {
 634                 goto LBL_M;
 635         }
 636
 637 /*
 638         automatically pick the comba one if available
 639  */
 640         if (((P->used * 2 + 1) < MP_WARRAY) &&
 641                         P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
 642                 redux = fast_mp_montgomery_reduce;
 643         } else {
 644 /*
 645                 use slower baseline Montgomery method
 646  */
 647                 redux = mp_montgomery_reduce;
 648         }
 649
 650 /*
 651         setup result
 652  */
 653         if ((err = mp_init(pool, &res)) != MP_OKAY) {
 654                 goto LBL_M;
 655         }
 656
 657 /*
 658         create M table. The first half of the table is not computed
 659         though accept for M[0] and M[1]
 660 */
 661
 662 /*
 663         now we need R mod m
 664  */
 665         if ((err = mp_montgomery_calc_normalization(&res, P)) != MP_OKAY) {
 666                 goto LBL_RES;
 667         }
 668
 669 /*
 670         now set M[1] to G * R mod m
 671  */
 672         if ((err = mp_mulmod(pool, G, &res, P, &M[1])) != MP_OKAY) {
 673                 goto LBL_RES;
 674         }
 675
 676 /*
 677         compute the value at M[1<<(winsize-1)] by squaring
 678         M[1] (winsize-1) times
 679 */
 680         if ((err = mp_copy(&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
 681                 goto LBL_RES;
 682         }
 683
 684         for (x = 0; x < (winsize - 1); x++) {
 685                 if ((err = mp_sqr(pool, &M[1 << (winsize - 1)],
 686                                 &M[1 << (winsize - 1)])) != MP_OKAY) {
 687                         goto LBL_RES;
 688                 }
 689                 if ((err = redux(&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
 690                         goto LBL_RES;
 691                 }
 692         }
 693
 694 /*
 695         create upper table
 696  */
 697         for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
 698                 if ((err = mp_mul(pool, &M[x - 1], &M[1], &M[x])) != MP_OKAY) {
 699                         goto LBL_RES;
 700                 }
 701                 if ((err = redux(&M[x], P, mp)) != MP_OKAY) {
 702                         goto LBL_RES;
 703                 }
 704         }
 705
 706 /*
 707         set initial mode and bit cnt
 708  */
 709         mode   = 0;
 710         bitcnt = 1;
 711         buf    = 0;
 712         digidx = X->used - 1;
 713         bitcpy = 0;
 714         bitbuf = 0;
 715
 716         for (;;) {
 717 /*
 718                 grab next digit as required
 719  */
 720                 if (--bitcnt == 0) {
 721                         /* if digidx == -1 we are out of digits so break */
 722                         if (digidx == -1) {
 723                                 break;
 724                         }
 725                         /* read next digit and reset bitcnt */
 726                         buf    = X->dp[digidx--];
 727                         bitcnt = (int)DIGIT_BIT;
 728                 }
 729
 730                 /* grab the next msb from the exponent */
 731                 y     = (mp_digit)(buf >> (DIGIT_BIT - 1)) & 1;
 732                 buf <<= (mp_digit)1;
 733
 734 /*
 735                 if the bit is zero and mode == 0 then we ignore it
 736                 These represent the leading zero bits before the first 1 bit
 737                 in the exponent.  Technically this opt is not required but it
 738                 does lower the # of trivial squaring/reductions used
 739 */
 740                 if (mode == 0 && y == 0) {
 741                         continue;
 742                 }
 743
 744 /*
 745                 if the bit is zero and mode == 1 then we square
 746  */
 747                 if (mode == 1 && y == 0) {
 748                         if ((err = mp_sqr (pool, &res, &res)) != MP_OKAY) {
 749                                 goto LBL_RES;
 750                         }
 751                         if ((err = redux (&res, P, mp)) != MP_OKAY) {
 752                                 goto LBL_RES;
 753                         }
 754                         continue;
 755                 }
 756
 757 /*
 758                 else we add it to the window
 759  */
 760                 bitbuf |= (y << (winsize - ++bitcpy));
 761                 mode    = 2;
 762
 763                 if (bitcpy == winsize) {
 764 /*
 765                         ok window is filled so square as required and multiply
 766                         square first
 767  */
 768                         for (x = 0; x < winsize; x++) {
 769                                 if ((err = mp_sqr(pool, &res, &res)) != MP_OKAY) {
 770                                         goto LBL_RES;
 771                                 }
 772                                 if ((err = redux(&res, P, mp)) != MP_OKAY) {
 773                                         goto LBL_RES;
 774                                 }
 775                         }
 776
 777                         /* then multiply */
 778                         if ((err = mp_mul(pool, &res, &M[bitbuf], &res)) != MP_OKAY) {
 779                                 goto LBL_RES;
 780                         }
 781                         if ((err = redux(&res, P, mp)) != MP_OKAY) {
 782                                 goto LBL_RES;
 783                         }
 784
 785 /*
 786                         empty window and reset
 787  */
 788                         bitcpy = 0;
 789                         bitbuf = 0;
 790                         mode   = 1;
 791                 }
 792         }
 793
 794 /*
 795         if bits remain then square/multiply
 796  */
 797         if (mode == 2 && bitcpy > 0) {
 798                 /* square then multiply if the bit is set */
 799                 for (x = 0; x < bitcpy; x++) {
 800                         if ((err = mp_sqr(pool, &res, &res)) != MP_OKAY) {
 801                                 goto LBL_RES;
 802                         }
 803                         if ((err = redux(&res, P, mp)) != MP_OKAY) {
 804                                 goto LBL_RES;
 805                         }
 806
 807 /*
 808                         get next bit of the window
 809  */
 810                         bitbuf <<= 1;
 811                         if ((bitbuf & (1 << winsize)) != 0) {
 812 /*
 813                                 then multiply
 814  */
 815                                 if ((err = mp_mul(pool, &res, &M[1], &res)) != MP_OKAY) {
 816                                         goto LBL_RES;
 817                                 }
 818                                 if ((err = redux(&res, P, mp)) != MP_OKAY) {
 819                                         goto LBL_RES;
 820                                 }
 821                         }
 822                 }
 823         }
 824
 825 /*
 826         fixup result if Montgomery reduction is used
 827         recall that any value in a Montgomery system is
 828         actually multiplied by R mod n.  So we have
 829         to reduce one more time to cancel out the factor of R.
 830 */
 831         if ((err = redux(&res, P, mp)) != MP_OKAY) {
 832                 goto LBL_RES;
 833         }
 834
 835 /*
 836         swap res with Y
 837  */
 838         mp_exch(&res, Y);
 839         err = MP_OKAY;
 840 LBL_RES:mp_clear(&res);
 841 LBL_M:
 842         mp_clear(&M[1]);
 843         for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
 844                 mp_clear(&M[x]);
 845         }
 846         return err;
 847 }
 848
 849 /******************************************************************************/
 850 /*
 851         Grow as required
 852  */
 853 int32 mp_grow (mp_int * a, int32 size)
 854 {
 855         int32                   i;
 856         mp_digit        *tmp;
 857
 858 /*
 859         If the alloc size is smaller alloc more ram.
 860  */
 861         if (a->alloc < size) {
 862 /*
 863                 ensure there are always at least MP_PREC digits extra on top
 864  */
 865                 size += (MP_PREC * 2) - (size % MP_PREC);
 866
 867 /*
 868                 Reallocate the array a->dp
 869
 870                 We store the return in a temporary variable in case the operation
 871                 failed we don't want to overwrite the dp member of a.
 872 */
 873                 tmp = OPT_CAST(mp_digit) psRealloc(a->dp, sizeof (mp_digit) * size);
 874                 if (tmp == NULL) {
 875 /*
 876                         reallocation failed but "a" is still valid [can be freed]
 877  */
 878                         return MP_MEM;
 879                 }
 880
 881 /*
 882                 reallocation succeeded so set a->dp
 883  */
 884                 a->dp = tmp;
 885
 886 /*
 887                 zero excess digits
 888  */
 889                 i                       = a->alloc;
 890                 a->alloc        = size;
 891                 for (; i < a->alloc; i++) {
 892                         a->dp[i] = 0;
 893                 }
 894         }
 895         return MP_OKAY;
 896 }
 897
 898 /******************************************************************************/
 899 /*
 900         b = |a|
 901
 902         Simple function copies the input and fixes the sign to positive
 903 */
 904 int32 mp_abs (mp_int * a, mp_int * b)
 905 {
 906         int32           res;
 907
 908 /*
 909         copy a to b
 910  */
 911         if (a != b) {
 912                 if ((res = mp_copy (a, b)) != MP_OKAY) {
 913                         return res;
 914                 }
 915         }
 916
 917 /*
 918         Force the sign of b to positive
 919  */
 920         b->sign = MP_ZPOS;
 921
 922         return MP_OKAY;
 923 }
 924
 925 /******************************************************************************/
 926 /*
 927         Creates "a" then copies b into it
 928  */
 929 int32 mp_init_copy(psPool_t *pool, mp_int * a, mp_int * b)
 930 {
 931         int32           res;
 932
 933         if ((res = mp_init(pool, a)) != MP_OKAY) {
 934                 return res;
 935         }
 936         return mp_copy (b, a);
 937 }
 938
 939 /******************************************************************************/
 940 /*
 941         Reverse an array, used for radix code
 942  */
 943 void bn_reverse (unsigned char *s, int32 len)
 944 {
 945         int32                           ix, iy;
 946         unsigned char   t;
 947
 948         ix = 0;
 949         iy = len - 1;
 950         while (ix < iy) {
 951                 t               = s[ix];
 952                 s[ix]   = s[iy];
 953                 s[iy]   = t;
 954                 ++ix;
 955                 --iy;
 956         }
 957 }
 958
 959 /******************************************************************************/
 960 /*
 961         Shift right by a certain bit count (store quotient in c, optional
 962         remainder in d)
 963  */
 964 int32 mp_div_2d(psPool_t *pool, mp_int * a, int32 b, mp_int * c, mp_int * d)
 965 {
 966         mp_digit        D, r, rr;
 967         int32                   x, res;
 968         mp_int          t;
 969
 970 /*
 971         If the shift count is <= 0 then we do no work
 972  */
 973         if (b <= 0) {
 974                 res = mp_copy (a, c);
 975                 if (d != NULL) {
 976                         mp_zero (d);
 977                 }
 978                 return res;
 979         }
 980
 981         if ((res = mp_init(pool, &t)) != MP_OKAY) {
 982                 return res;
 983         }
 984
 985 /*
 986         Get the remainder
 987  */
 988         if (d != NULL) {
 989                 if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) {
 990                         mp_clear (&t);
 991                         return res;
 992                 }
 993         }
 994
 995         /* copy */
 996         if ((res = mp_copy (a, c)) != MP_OKAY) {
 997                 mp_clear (&t);
 998                 return res;
 999         }
1000
1001 /*
1002         Shift by as many digits in the bit count
1003  */
1004         if (b >= (int32)DIGIT_BIT) {
1005                 mp_rshd (c, b / DIGIT_BIT);
1006         }
1007
1008         /* shift any bit count < DIGIT_BIT */
1009         D = (mp_digit) (b % DIGIT_BIT);
1010         if (D != 0) {
1011                 register mp_digit *tmpc, mask, shift;
1012
1013                 /* mask */
1014                 mask = (((mp_digit)1) << D) - 1;
1015
1016                 /* shift for lsb */
1017                 shift = DIGIT_BIT - D;
1018
1019                 /* alias */
1020                 tmpc = c->dp + (c->used - 1);
1021
1022                 /* carry */
1023                 r = 0;
1024                 for (x = c->used - 1; x >= 0; x--) {
1025 /*
1026                         Get the lower  bits of this word in a temp.
1027  */
1028                         rr = *tmpc & mask;
1029
1030 /*
1031                         shift the current word and mix in the carry bits from the previous word
1032  */
1033                         *tmpc = (*tmpc >> D) | (r << shift);
1034                         --tmpc;
1035
1036 /*
1037                         set the carry to the carry bits of the current word found above
1038  */
1039                         r = rr;
1040                 }
1041         }
1042         mp_clamp (c);
1043         if (d != NULL) {
1044                 mp_exch (&t, d);
1045         }
1046         mp_clear (&t);
1047         return MP_OKAY;
1048 }
1049
1050 /******************************************************************************/
1051 /*
1052         copy, b = a
1053  */
1054 int32 mp_copy (mp_int * a, mp_int * b)
1055 {
1056         int32           res, n;
1057
1058 /*
1059         If dst == src do nothing
1060  */
1061         if (a == b) {
1062                 return MP_OKAY;
1063         }
1064
1065 /*
1066         Grow dest
1067  */
1068         if (b->alloc < a->used) {
1069                 if ((res = mp_grow (b, a->used)) != MP_OKAY) {
1070                         return res;
1071                 }
1072         }
1073
1074 /*
1075         Zero b and copy the parameters over
1076  */
1077         {
1078                 register mp_digit *tmpa, *tmpb;
1079
1080                 /* pointer aliases */
1081                 /* source */
1082                 tmpa = a->dp;
1083
1084                 /* destination */
1085                 tmpb = b->dp;
1086
1087                 /* copy all the digits */
1088                 for (n = 0; n < a->used; n++) {
1089                         *tmpb++ = *tmpa++;
1090                 }
1091
1092                 /* clear high digits */
1093                 for (; n < b->used; n++) {
1094                         *tmpb++ = 0;
1095                 }
1096         }
1097
1098 /*
1099         copy used count and sign
1100  */
1101         b->used = a->used;
1102         b->sign = a->sign;
1103         return MP_OKAY;
1104 }
1105
1106 /******************************************************************************/
1107 /*
1108         Returns the number of bits in an int32
1109  */
1110 int32 mp_count_bits (mp_int * a)
1111 {
1112         int32                   r;
1113         mp_digit        q;
1114
1115 /*
1116         Shortcut
1117  */
1118         if (a->used == 0) {
1119                 return 0;
1120         }
1121
1122 /*
1123         Get number of digits and add that.
1124  */
1125         r = (a->used - 1) * DIGIT_BIT;
1126
1127 /*
1128         Take the last digit and count the bits in it.
1129  */
1130         q = a->dp[a->used - 1];
1131         while (q > ((mp_digit) 0)) {
1132                 ++r;
1133                 q >>= ((mp_digit) 1);
1134         }
1135         return r;
1136 }
1137
1138 /******************************************************************************/
1139 /*
1140         Shift left a certain amount of digits.
1141  */
1142 int32 mp_lshd (mp_int * a, int32 b)
1143 {
1144         int32           x, res;
1145
1146 /*
1147         If its less than zero return.
1148  */
1149         if (b <= 0) {
1150                 return MP_OKAY;
1151         }
1152
1153 /*
1154         Grow to fit the new digits.
1155  */
1156         if (a->alloc < a->used + b) {
1157                 if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
1158                         return res;
1159                 }
1160         }
1161
1162         {
1163                 register mp_digit *top, *bottom;
1164
1165 /*
1166                 Increment the used by the shift amount then copy upwards.
1167  */
1168                 a->used += b;
1169
1170                 /* top */
1171                 top = a->dp + a->used - 1;
1172
1173                 /* base */
1174                 bottom = a->dp + a->used - 1 - b;
1175
1176 /*
1177                 Much like mp_rshd this is implemented using a sliding window
1178                 except the window goes the otherway around.  Copying from
1179                 the bottom to the top.  see bn_mp_rshd.c for more info.
1180  */
1181                 for (x = a->used - 1; x >= b; x--) {
1182                         *top-- = *bottom--;
1183                 }
1184
1185                 /* zero the lower digits */
1186                 top = a->dp;
1187                 for (x = 0; x < b; x++) {
1188                         *top++ = 0;
1189                 }
1190         }
1191         return MP_OKAY;
1192 }
1193
1194 /******************************************************************************/
1195 /*
1196         Set to a digit.
1197  */
1198 void mp_set (mp_int * a, mp_digit b)
1199 {
1200         mp_zero (a);
1201         a->dp[0] = b & MP_MASK;
1202         a->used  = (a->dp[0] != 0) ? 1 : 0;
1203 }
1204
1205 /******************************************************************************/
1206 /*
1207         Swap the elements of two integers, for cases where you can't simply swap
1208         the     mp_int pointers around
1209 */
1210 void mp_exch (mp_int * a, mp_int * b)
1211 {
1212         mp_int          t;
1213
1214         t       = *a;
1215         *a      = *b;
1216         *b      = t;
1217 }
1218
1219 /******************************************************************************/
1220 /*
1221         High level multiplication (handles sign)
1222  */
1223 int32 mp_mul(psPool_t *pool, mp_int * a, mp_int * b, mp_int * c)
1224 {
1225         int32                   res, neg;
1226         int32                   digs = a->used + b->used + 1;
1227
1228         neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
1229
1230 /*      Can we use the fast multiplier?
1231
1232         The fast multiplier can be used if the output will have less than
1233         MP_WARRAY digits and the number of digits won't affect carry propagation
1234 */
1235         if ((digs < MP_WARRAY) && MIN(a->used, b->used) <=
1236                         (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
1237                 res = fast_s_mp_mul_digs(pool, a, b, c, digs);
1238         } else {
1239                 res = s_mp_mul(pool, a, b, c);
1240         }
1241         c->sign = (c->used > 0) ? neg : MP_ZPOS;
1242         return res;
1243 }
1244
1245 /******************************************************************************/
1246 /*
1247         c = a mod b, 0 <= c < b
1248  */
1249 int32 mp_mod(psPool_t *pool, mp_int * a, mp_int * b, mp_int * c)
1250 {
1251         mp_int          t;
1252         int32                   res;
1253
1254         if ((res = mp_init(pool, &t)) != MP_OKAY) {
1255                 return res;
1256         }
1257
1258         if ((res = mp_div (pool, a, b, NULL, &t)) != MP_OKAY) {
1259                 mp_clear (&t);
1260                 return res;
1261         }
1262
1263         if (t.sign != b->sign) {
1264                 res = mp_add (b, &t, c);
1265         } else {
1266                 res = MP_OKAY;
1267                 mp_exch (&t, c);
1268         }
1269
1270         mp_clear (&t);
1271         return res;
1272 }
1273
1274 /******************************************************************************/
1275 /*
1276         shifts with subtractions when the result is greater than b.
1277
1278         The method is slightly modified to shift B unconditionally upto just under
1279         the leading bit of b.  This saves alot of multiple precision shifting.
1280 */
1281 int32 mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
1282 {
1283         int32           x, bits, res;
1284
1285 /*
1286         How many bits of last digit does b use
1287  */
1288         bits = mp_count_bits (b) % DIGIT_BIT;
1289
1290         if (b->used > 1) {
1291                 if ((res = mp_2expt(a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
1292                         return res;
1293                 }
1294         } else {
1295                 mp_set(a, 1);
1296                 bits = 1;
1297         }
1298
1299 /*
1300         Now compute C = A * B mod b
1301  */
1302         for (x = bits - 1; x < (int32)DIGIT_BIT; x++) {
1303                 if ((res = mp_mul_2(a, a)) != MP_OKAY) {
1304                         return res;
1305                 }
1306                 if (mp_cmp_mag(a, b) != MP_LT) {
1307                         if ((res = s_mp_sub(a, b, a)) != MP_OKAY) {
1308                                 return res;
1309                         }
1310                 }
1311         }
1312
1313         return MP_OKAY;
1314 }
1315
1316 /******************************************************************************/
1317 /*
1318         d = a * b (mod c)
1319  */
1320 int32 mp_mulmod(psPool_t *pool, mp_int * a, mp_int * b, mp_int * c, mp_int * d)
1321 {
1322         int32           res;
1323         mp_int          t;
1324
1325         if ((res = mp_init(pool, &t)) != MP_OKAY) {
1326                 return res;
1327         }
1328
1329         if ((res = mp_mul (pool, a, b, &t)) != MP_OKAY) {
1330                 mp_clear (&t);
1331                 return res;
1332         }
1333         res = mp_mod (pool, &t, c, d);
1334         mp_clear (&t);
1335         return res;
1336 }
1337
1338 /******************************************************************************/
1339 /*
1340         Computes b = a*a
1341  */
1342 #ifdef USE_SMALL_WORD
1343 int32 mp_sqr (psPool_t *pool, mp_int * a, mp_int * b)
1344 {
1345         int32           res;
1346
1347 /*
1348         Can we use the fast comba multiplier?
1349  */
1350         if ((a->used * 2 + 1) < MP_WARRAY && a->used <
1351                         (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
1352                 res = fast_s_mp_sqr (pool, a, b);
1353         } else {
1354                 res = s_mp_sqr (pool, a, b);
1355         }
1356         b->sign = MP_ZPOS;
1357         return res;
1358 }
1359 #endif /* USE_SMALL_WORD */
1360
1361 /******************************************************************************/
1362 /*
1363         Computes xR**-1 == x (mod N) via Montgomery Reduction.
1364
1365         This is an optimized implementation of montgomery_reduce
1366         which uses the comba method to quickly calculate the columns of the
1367         reduction.
1368
1369         Based on Algorithm 14.32 on pp.601 of HAC.
1370 */
1371
1372 int32 fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
1373 {
1374         int32           ix, res, olduse;
1375         mp_word         W[MP_WARRAY];
1376
1377 /*
1378         Get old used count
1379  */
1380         olduse = x->used;
1381
1382 /*
1383         Grow a as required
1384  */
1385         if (x->alloc < n->used + 1) {
1386                 if ((res = mp_grow(x, n->used + 1)) != MP_OKAY) {
1387                         return res;
1388                 }
1389         }
1390
1391 /*
1392         First we have to get the digits of the input into
1393         an array of double precision words W[...]
1394  */
1395         {
1396                 register mp_word                *_W;
1397                 register mp_digit       *tmpx;
1398
1399 /*
1400                 Alias for the W[] array
1401  */
1402                 _W   = W;
1403
1404 /*
1405                 Alias for the digits of  x
1406  */
1407                 tmpx = x->dp;
1408
1409 /*
1410                 Copy the digits of a into W[0..a->used-1]
1411  */
1412                 for (ix = 0; ix < x->used; ix++) {
1413                         *_W++ = *tmpx++;
1414                 }
1415
1416 /*
1417                 Zero the high words of W[a->used..m->used*2]
1418  */
1419                 for (; ix < n->used * 2 + 1; ix++) {
1420                         *_W++ = 0;
1421                 }
1422         }
1423
1424 /*
1425         Now we proceed to zero successive digits from the least
1426         significant upwards.
1427  */
1428         for (ix = 0; ix < n->used; ix++) {
1429 /*
1430                 mu = ai * m' mod b
1431
1432                 We avoid a double precision multiplication (which isn't required) by
1433                 casting the value down to a mp_digit.  Note this requires that
1434                 W[ix-1] have  the carry cleared (see after the inner loop)
1435  */
1436                 register mp_digit mu;
1437                 mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
1438
1439 /*
1440                 a = a + mu * m * b**i
1441
1442                 This is computed in place and on the fly.  The multiplication by b**i
1443                 is handled by offseting which columns the results are added to.
1444
1445                 Note the comba method normally doesn't handle carries in the inner loop
1446                 In this case we fix the carry from the previous column since the
1447                 Montgomery reduction requires digits of the result (so far) [see above]
1448                 to work.  This is handled by fixing up one carry after the inner loop.
1449                 The carry fixups are done in order so after these loops the first
1450                 m->used words of W[] have the carries fixed
1451  */
1452                 {
1453                         register int32          iy;
1454                         register mp_digit       *tmpn;
1455                         register mp_word        *_W;
1456
1457 /*
1458                         Alias for the digits of the modulus
1459  */
1460                         tmpn = n->dp;
1461
1462 /*
1463                         Alias for the columns set by an offset of ix
1464  */
1465                         _W = W + ix;
1466
1467 /*
1468                                 inner loop
1469  */
1470                                 for (iy = 0; iy < n->used; iy++) {
1471                                         *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
1472                                 }
1473                         }
1474
1475 /*
1476                 Now fix carry for next digit, W[ix+1]
1477  */
1478                 W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
1479         }
1480
1481 /*
1482                 Now we have to propagate the carries and shift the words downward [all those
1483                 least significant digits we zeroed].
1484  */
1485         {
1486                 register mp_digit       *tmpx;
1487                 register mp_word        *_W, *_W1;
1488
1489 /*
1490                 Now fix rest of carries
1491  */
1492
1493 /*
1494                 alias for current word
1495  */
1496                 _W1 = W + ix;
1497
1498 /*
1499                 alias for next word, where the carry goes
1500  */
1501                 _W = W + ++ix;
1502
1503                 for (; ix <= n->used * 2 + 1; ix++) {
1504                         *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
1505                 }
1506
1507 /*
1508                 copy out, A = A/b**n
1509
1510                 The result is A/b**n but instead of converting from an
1511                 array of mp_word to mp_digit than calling mp_rshd
1512                 we just copy them in the right order
1513  */
1514
1515 /*
1516                 alias for destination word
1517  */
1518                 tmpx = x->dp;
1519
1520 /*
1521                 alias for shifted double precision result
1522  */
1523                 _W = W + n->used;
1524
1525                 for (ix = 0; ix < n->used + 1; ix++) {
1526                 *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
1527                 }
1528
1529 /*
1530                 zero oldused digits, if the input a was larger than
1531                 m->used+1 we'll have to clear the digits
1532  */
1533                 for (; ix < olduse; ix++) {
1534                         *tmpx++ = 0;
1535                 }
1536         }
1537
1538 /*
1539         Set the max used and clamp
1540  */
1541         x->used = n->used + 1;
1542         mp_clamp(x);
1543
1544 /*
1545         if A >= m then A = A - m
1546  */
1547         if (mp_cmp_mag(x, n) != MP_LT) {
1548                 return s_mp_sub(x, n, x);
1549         }
1550         return MP_OKAY;
1551 }
1552
1553 /******************************************************************************/
1554 /*
1555         High level addition (handles signs)
1556  */
1557 int32 mp_add (mp_int * a, mp_int * b, mp_int * c)
1558 {
1559         int32           sa, sb, res;
1560
1561 /*
1562         Get sign of both inputs
1563  */
1564         sa = a->sign;
1565         sb = b->sign;
1566
1567 /*
1568         Handle two cases, not four.
1569  */
1570         if (sa == sb) {
1571 /*
1572                 Both positive or both negative. Add their magnitudes, copy the sign.
1573  */
1574                 c->sign = sa;
1575                 res = s_mp_add (a, b, c);
1576         } else {
1577 /*
1578                 One positive, the other negative.  Subtract the one with the greater
1579                 magnitude from the one of the lesser magnitude.  The result gets the sign of
1580                 the one with the greater magnitude.
1581  */
1582                 if (mp_cmp_mag (a, b) == MP_LT) {
1583                         c->sign = sb;
1584                         res = s_mp_sub (b, a, c);
1585                 } else {
1586                         c->sign = sa;
1587                         res = s_mp_sub (a, b, c);
1588                 }
1589         }
1590         return res;
1591 }
1592
1593 /******************************************************************************/
1594 /*
1595         Compare a digit.
1596  */
1597 int32 mp_cmp_d (mp_int * a, mp_digit b)
1598 {
1599 /*
1600         Compare based on sign
1601  */
1602         if (a->sign == MP_NEG) {
1603                 return MP_LT;
1604         }
1605
1606 /*
1607         Compare based on magnitude
1608  */
1609         if (a->used > 1) {
1610                 return MP_GT;
1611         }
1612
1613 /*
1614         Compare the only digit of a to b
1615  */
1616         if (a->dp[0] > b) {
1617                 return MP_GT;
1618         } else if (a->dp[0] < b) {
1619                 return MP_LT;
1620         } else {
1621                 return MP_EQ;
1622         }
1623 }
1624
1625 /******************************************************************************/
1626 /*
1627         b = a/2
1628  */
1629 int32 mp_div_2 (mp_int * a, mp_int * b)
1630 {
1631         int32           x, res, oldused;
1632
1633 /*
1634         Copy
1635  */
1636         if (b->alloc < a->used) {
1637                 if ((res = mp_grow (b, a->used)) != MP_OKAY) {
1638                         return res;
1639                 }
1640         }
1641
1642         oldused = b->used;
1643         b->used = a->used;
1644         {
1645                 register mp_digit r, rr, *tmpa, *tmpb;
1646
1647 /*
1648                 Source alias
1649  */
1650                 tmpa = a->dp + b->used - 1;
1651
1652 /*
1653                 dest alias
1654  */
1655                 tmpb = b->dp + b->used - 1;
1656
1657 /*
1658                 carry
1659  */
1660                 r = 0;
1661                 for (x = b->used - 1; x >= 0; x--) {
1662 /*
1663                         Get the carry for the next iteration
1664  */
1665                         rr = *tmpa & 1;
1666
1667 /*
1668                         Shift the current digit, add in carry and store
1669  */
1670                         *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
1671 /*
1672                         Forward carry to next iteration
1673  */
1674                         r = rr;
1675                 }
1676
1677 /*
1678                 Zero excess digits
1679  */
1680                 tmpb = b->dp + b->used;
1681                 for (x = b->used; x < oldused; x++) {
1682                         *tmpb++ = 0;
1683                 }
1684         }
1685         b->sign = a->sign;
1686         mp_clamp (b);
1687         return MP_OKAY;
1688 }
1689
1690 /******************************************************************************/
1691 /*
1692         Computes xR**-1 == x (mod N) via Montgomery Reduction
1693  */
1694 #ifdef USE_SMALL_WORD
1695 int32 mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
1696 {
1697         int32                   ix, res, digs;
1698         mp_digit        mu;
1699
1700 /*      Can the fast reduction [comba] method be used?
1701
1702         Note that unlike in mul you're safely allowed *less* than the available
1703         columns [255 per default] since carries are fixed up in the inner loop.
1704  */
1705         digs = n->used * 2 + 1;
1706         if ((digs < MP_WARRAY) &&
1707                 n->used <
1708                 (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
1709                         return fast_mp_montgomery_reduce (x, n, rho);
1710                 }
1711
1712 /*
1713                 Grow the input as required.
1714  */
1715                 if (x->alloc < digs) {
1716                         if ((res = mp_grow (x, digs)) != MP_OKAY) {
1717                                 return res;
1718                         }
1719                 }
1720                 x->used = digs;
1721
1722                 for (ix = 0; ix < n->used; ix++) {
1723 /*
1724                         mu = ai * rho mod b
1725
1726                         The value of rho must be precalculated via mp_montgomery_setup()
1727                         such that it equals -1/n0 mod b this allows the following inner
1728                         loop to reduce the input one digit at a time
1729  */
1730                         mu = (mp_digit)(((mp_word)x->dp[ix]) * ((mp_word)rho) & MP_MASK);
1731
1732                         /* a = a + mu * m * b**i */
1733                         {
1734                                 register int32 iy;
1735                                 register mp_digit *tmpn, *tmpx, u;
1736                                 register mp_word r;
1737
1738 /*
1739                                 alias for digits of the modulus
1740  */
1741                                 tmpn = n->dp;
1742
1743 /*
1744                                 alias for the digits of x [the input]
1745  */
1746                                 tmpx = x->dp + ix;
1747
1748 /*
1749                                 set the carry to zero
1750  */
1751                                 u = 0;
1752
1753 /*
1754                                 Multiply and add in place
1755  */
1756                                 for (iy = 0; iy < n->used; iy++) {
1757                                         /* compute product and sum */
1758                                         r = ((mp_word)mu) * ((mp_word)*tmpn++) +
1759                                                 ((mp_word) u) + ((mp_word) * tmpx);
1760
1761                                         /* get carry */
1762                                         u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
1763
1764                                         /* fix digit */
1765                                         *tmpx++ = (mp_digit)(r & ((mp_word) MP_MASK));
1766                                 }
1767                                 /* At this point the ix'th digit of x should be zero */
1768
1769
1770 /*
1771                                 propagate carries upwards as required
1772  */
1773                                 while (u) {
1774                                         *tmpx           += u;
1775                                         u                       = *tmpx >> DIGIT_BIT;
1776                                         *tmpx++ &= MP_MASK;
1777                                 }
1778                         }
1779                 }
1780
1781 /*
1782                 At this point the n.used'th least significant digits of x are all zero
1783                 which means we can shift x to the right by n.used digits and the
1784                 residue is unchanged.
1785 */
1786                 /* x = x/b**n.used */
1787                 mp_clamp(x);
1788                 mp_rshd (x, n->used);
1789
1790                 /* if x >= n then x = x - n */
1791                 if (mp_cmp_mag (x, n) != MP_LT) {
1792                         return s_mp_sub (x, n, x);
1793                 }
1794
1795                 return MP_OKAY;
1796 }
1797 #endif /* USE_SMALL_WORD */
1798
1799 /******************************************************************************/
1800 /*
1801         Setups the montgomery reduction stuff.
1802  */
1803 int32 mp_montgomery_setup (mp_int * n, mp_digit * rho)
1804 {
1805         mp_digit x, b;
1806
1807 /*
1808         fast inversion mod 2**k
1809
1810         Based on the fact that
1811
1812         XA = 1 (mod 2**n)       =>  (X(2-XA)) A         = 1 (mod 2**2n)
1813                                                 =>  2*X*A - X*X*A*A     = 1
1814                                                 =>  2*(1) - (1)         = 1
1815 */
1816         b = n->dp[0];
1817
1818         if ((b & 1) == 0) {
1819                 return MP_VAL;
1820         }
1821
1822         x = (((b + 2) & 4) << 1) + b;           /* here x*a==1 mod 2**4 */
1823         x = (x * (2 - b * x)) & MP_MASK;        /* here x*a==1 mod 2**8 */
1824 #if !defined(MP_8BIT)
1825         x = (x * (2 - b * x)) & MP_MASK;        /* here x*a==1 mod 2**8 */
1826 #endif /* MP_8BIT */
1827 #if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT))
1828         x *= 2 - b * x;                                         /* here x*a==1 mod 2**32 */
1829 #endif
1830 #ifdef MP_64BIT
1831         x *= 2 - b * x;                                         /* here x*a==1 mod 2**64 */
1832 #endif /* MP_64BIT */
1833
1834         /* rho = -1/m mod b */
1835         *rho = (((mp_word) 1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
1836
1837         return MP_OKAY;
1838 }
1839
1840 /******************************************************************************/
1841 /*
1842         High level subtraction (handles signs)
1843  */
1844 int32 mp_sub (mp_int * a, mp_int * b, mp_int * c)
1845 {
1846         int32           sa, sb, res;
1847
1848         sa = a->sign;
1849         sb = b->sign;
1850
1851         if (sa != sb) {
1852 /*
1853                 Subtract a negative from a positive, OR subtract a positive from a
1854                 negative.  In either case, ADD their magnitudes, and use the sign of
1855                 the first number.
1856  */
1857                 c->sign = sa;
1858                 res = s_mp_add (a, b, c);
1859         } else {
1860 /*
1861                 Subtract a positive from a positive, OR subtract a negative
1862                 from a negative. First, take the difference between their
1863                 magnitudes, then...
1864  */
1865                 if (mp_cmp_mag (a, b) != MP_LT) {
1866 /*
1867                         Copy the sign from the first
1868  */
1869                         c->sign = sa;
1870                         /* The first has a larger or equal magnitude */
1871                         res = s_mp_sub (a, b, c);
1872                 } else {
1873 /*
1874                         The result has the *opposite* sign from the first number.
1875  */
1876                         c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
1877 /*
1878                         The second has a larger magnitude
1879  */
1880                         res = s_mp_sub (b, a, c);
1881                 }
1882         }
1883         return res;
1884 }
1885
1886 /******************************************************************************/
1887 /*
1888         calc a value mod 2**b
1889  */
1890 int32 mp_mod_2d (mp_int * a, int32 b, mp_int * c)
1891 {
1892         int32           x, res;
1893
1894 /*
1895         if b is <= 0 then zero the int32
1896  */
1897         if (b <= 0) {
1898                 mp_zero (c);
1899                 return MP_OKAY;
1900         }
1901
1902 /*
1903         If the modulus is larger than the value than return
1904  */
1905         if (b >=(int32) (a->used * DIGIT_BIT)) {
1906                 res = mp_copy (a, c);
1907                 return res;
1908         }
1909
1910         /* copy */
1911         if ((res = mp_copy (a, c)) != MP_OKAY) {
1912                 return res;
1913         }
1914
1915 /*
1916         Zero digits above the last digit of the modulus
1917  */
1918         for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
1919                 c->dp[x] = 0;
1920         }
1921 /*
1922         Clear the digit that is not completely outside/inside the modulus
1923  */
1924         c->dp[b / DIGIT_BIT] &=
1925                 (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digit) 1));
1926         mp_clamp (c);
1927         return MP_OKAY;
1928 }
1929
1930 /******************************************************************************/
1931 /*
1932         Shift right a certain amount of digits.
1933  */
1934 void mp_rshd (mp_int * a, int32 b)
1935 {
1936         int32           x;
1937
1938 /*
1939         If b <= 0 then ignore it
1940  */
1941         if (b <= 0) {
1942                 return;
1943         }
1944
1945 /*
1946         If b > used then simply zero it and return.
1947 */
1948         if (a->used <= b) {
1949                 mp_zero (a);
1950                 return;
1951         }
1952
1953         {
1954                 register mp_digit *bottom, *top;
1955
1956 /*
1957                 Shift the digits down
1958  */
1959                 /* bottom */
1960                 bottom = a->dp;
1961
1962                 /* top [offset into digits] */
1963                 top = a->dp + b;
1964
1965 /*
1966                 This is implemented as a sliding window where the window is b-digits long
1967                 and digits from the top of the window are copied to the bottom.
1968
1969                  e.g.
1970
1971                 b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
1972                 /\                   |      ---->
1973                 \-------------------/      ---->
1974  */
1975                 for (x = 0; x < (a->used - b); x++) {
1976                         *bottom++ = *top++;
1977                 }
1978
1979 /*
1980                 Zero the top digits
1981  */
1982                 for (; x < a->used; x++) {
1983                         *bottom++ = 0;
1984                 }
1985         }
1986
1987 /*
1988         Remove excess digits
1989  */
1990         a->used -= b;
1991 }
1992
1993 /******************************************************************************/
1994 /*
1995         Low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9
1996  */
1997 int32 s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
1998 {
1999         int32           olduse, res, min, max;
2000
2001 /*
2002         Find sizes
2003  */
2004         min = b->used;
2005         max = a->used;
2006
2007 /*
2008         init result
2009  */
2010         if (c->alloc < max) {
2011                 if ((res = mp_grow (c, max)) != MP_OKAY) {
2012                         return res;
2013                 }
2014         }
2015         olduse = c->used;
2016         c->used = max;
2017
2018         {
2019                 register mp_digit u, *tmpa, *tmpb, *tmpc;
2020                 register int32 i;
2021
2022 /*
2023                 alias for digit pointers
2024  */
2025                 tmpa = a->dp;
2026                 tmpb = b->dp;
2027                 tmpc = c->dp;
2028
2029 /*
2030                 set carry to zero
2031  */
2032                 u = 0;
2033                 for (i = 0; i < min; i++) {
2034                         /* T[i] = A[i] - B[i] - U */
2035                         *tmpc = *tmpa++ - *tmpb++ - u;
2036
2037 /*
2038                         U = carry bit of T[i]
2039                         Note this saves performing an AND operation since if a carry does occur it
2040                         will propagate all the way to the MSB.  As a result a single shift
2041                         is enough to get the carry
2042  */
2043                         u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
2044
2045                         /* Clear carry from T[i] */
2046                         *tmpc++ &= MP_MASK;
2047                 }
2048
2049 /*
2050                 Now copy higher words if any, e.g. if A has more digits than B
2051  */
2052                 for (; i < max; i++) {
2053                         /* T[i] = A[i] - U */
2054                         *tmpc = *tmpa++ - u;
2055
2056                         /* U = carry bit of T[i] */
2057                         u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
2058
2059                         /* Clear carry from T[i] */
2060                         *tmpc++ &= MP_MASK;
2061                 }
2062
2063 /*
2064                 Clear digits above used (since we may not have grown result above)
2065  */
2066                 for (i = c->used; i < olduse; i++) {
2067                         *tmpc++ = 0;
2068                 }
2069         }
2070
2071         mp_clamp (c);
2072         return MP_OKAY;
2073 }
2074 /******************************************************************************/
2075 /*
2076         integer signed division.
2077
2078         c*b + d == a [e.g. a/b, c=quotient, d=remainder]
2079         HAC pp.598 Algorithm 14.20
2080
2081         Note that the description in HAC is horribly incomplete.  For example,
2082         it doesn't consider the case where digits are removed from 'x' in the inner
2083         loop.  It also doesn't consider the case that y has fewer than three
2084         digits, etc..
2085
2086         The overall algorithm is as described as 14.20 from HAC but fixed to
2087         treat these cases.
2088  */
2089 #ifdef MP_DIV_SMALL
2090 int32 mp_div(psPool_t *pool, mp_int * a, mp_int * b, mp_int * c, mp_int * d)
2091 {
2092         mp_int  ta, tb, tq, q;
2093         int32           res, n, n2;
2094
2095 /*
2096         is divisor zero ?
2097  */
2098         if (mp_iszero (b) == 1) {
2099                 return MP_VAL;
2100         }
2101
2102 /*
2103         if a < b then q=0, r = a
2104  */
2105         if (mp_cmp_mag (a, b) == MP_LT) {
2106                 if (d != NULL) {
2107                         res = mp_copy (a, d);
2108                 } else {
2109                         res = MP_OKAY;
2110                 }
2111                 if (c != NULL) {
2112                         mp_zero (c);
2113                 }
2114                 return res;
2115         }
2116
2117 /*
2118         init our temps
2119  */
2120         if ((res = _mp_init_multi(pool, &ta, &tb, &tq, &q, NULL, NULL, NULL, NULL) != MP_OKAY)) {
2121                 return res;
2122         }
2123
2124 /*
2125         tq = 2^n,  tb == b*2^n
2126  */
2127         mp_set(&tq, 1);
2128         n = mp_count_bits(a) - mp_count_bits(b);
2129         if (((res = mp_abs(a, &ta)) != MP_OKAY) ||
2130                         ((res = mp_abs(b, &tb)) != MP_OKAY) ||
2131                         ((res = mp_mul_2d(&tb, n, &tb)) != MP_OKAY) ||
2132                         ((res = mp_mul_2d(&tq, n, &tq)) != MP_OKAY)) {
2133       goto __ERR;
2134         }
2135 /* old
2136         if (((res = mp_copy(a, &ta)) != MP_OKAY) ||
2137                 ((res = mp_copy(b, &tb)) != MP_OKAY) ||
2138                 ((res = mp_mul_2d(&tb, n, &tb)) != MP_OKAY) ||
2139                 ((res = mp_mul_2d(&tq, n, &tq)) != MP_OKAY)) {
2140                         goto LBL_ERR;
2141         }
2142 */
2143         while (n-- >= 0) {
2144                 if (mp_cmp(&tb, &ta) != MP_GT) {
2145                         if (((res = mp_sub(&ta, &tb, &ta)) != MP_OKAY) ||
2146                                 ((res = mp_add(&q, &tq, &q)) != MP_OKAY)) {
2147                                         goto LBL_ERR;
2148                         }
2149                 }
2150                 if (((res = mp_div_2d(pool, &tb, 1, &tb, NULL)) != MP_OKAY) ||
2151                         ((res = mp_div_2d(pool, &tq, 1, &tq, NULL)) != MP_OKAY)) {
2152                         goto LBL_ERR;
2153                 }
2154         }
2155
2156 /*
2157         now q == quotient and ta == remainder
2158  */
2159         n  = a->sign;
2160         n2 = (a->sign == b->sign ? MP_ZPOS : MP_NEG);
2161         if (c != NULL) {
2162                 mp_exch(c, &q);
2163                 c->sign  = (mp_iszero(c) == MP_YES) ? MP_ZPOS : n2;
2164         }
2165         if (d != NULL) {
2166                 mp_exch(d, &ta);
2167                 d->sign = (mp_iszero(d) == MP_YES) ? MP_ZPOS : n;
2168         }
2169 LBL_ERR:
2170         _mp_clear_multi(&ta, &tb, &tq, &q, NULL, NULL, NULL, NULL);
2171         return res;
2172 }
2173 #else /* MP_DIV_SMALL */
2174
2175 int32 mp_div(psPool_t *pool, mp_int * a, mp_int * b, mp_int * c, mp_int * d)
2176 {
2177         mp_int          q, x, y, t1, t2;
2178         int32           res, n, t, i, norm, neg;
2179
2180 /*
2181         is divisor zero ?
2182  */
2183         if (mp_iszero(b) == 1) {
2184                 return MP_VAL;
2185         }
2186
2187 /*
2188         if a < b then q=0, r = a
2189  */
2190         if (mp_cmp_mag(a, b) == MP_LT) {
2191                 if (d != NULL) {
2192                         res = mp_copy(a, d);
2193                 } else {
2194                         res = MP_OKAY;
2195                 }
2196                 if (c != NULL) {
2197                 mp_zero(c);
2198                 }
2199                 return res;
2200         }
2201
2202         if ((res = mp_init_size(pool, &q, a->used + 2)) != MP_OKAY) {
2203                 return res;
2204         }
2205         q.used = a->used + 2;
2206
2207         if ((res = mp_init(pool, &t1)) != MP_OKAY) {
2208                 goto LBL_Q;
2209         }
2210
2211         if ((res = mp_init(pool, &t2)) != MP_OKAY) {
2212                 goto LBL_T1;
2213         }
2214
2215         if ((res = mp_init_copy(pool, &x, a)) != MP_OKAY) {
2216                 goto LBL_T2;
2217         }
2218
2219         if ((res = mp_init_copy(pool, &y, b)) != MP_OKAY) {
2220                 goto LBL_X;
2221         }
2222
2223 /*
2224         fix the sign
2225  */
2226         neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
2227         x.sign = y.sign = MP_ZPOS;
2228
2229 /*
2230         normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT]
2231  */
2232         norm = mp_count_bits(&y) % DIGIT_BIT;
2233         if (norm < (int32)(DIGIT_BIT-1)) {
2234                 norm = (DIGIT_BIT-1) - norm;
2235                 if ((res = mp_mul_2d(&x, norm, &x)) != MP_OKAY) {
2236                         goto LBL_Y;
2237                 }
2238                 if ((res = mp_mul_2d(&y, norm, &y)) != MP_OKAY) {
2239                         goto LBL_Y;
2240                 }
2241         } else {
2242                 norm = 0;
2243         }
2244
2245 /*
2246         note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4
2247  */
2248         n = x.used - 1;
2249         t = y.used - 1;
2250
2251 /*
2252         while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} }
2253  */
2254         if ((res = mp_lshd(&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
2255                 goto LBL_Y;
2256         }
2257
2258         while (mp_cmp(&x, &y) != MP_LT) {
2259                 ++(q.dp[n - t]);
2260                 if ((res = mp_sub(&x, &y, &x)) != MP_OKAY) {
2261                 goto LBL_Y;
2262                 }
2263         }
2264
2265 /*
2266         reset y by shifting it back down
2267  */
2268         mp_rshd(&y, n - t);
2269
2270 /*
2271         step 3. for i from n down to (t + 1)
2272  */
2273         for (i = n; i >= (t + 1); i--) {
2274                 if (i > x.used) {
2275                         continue;
2276                 }
2277
2278 /*
2279                 step 3.1 if xi == yt then set q{i-t-1} to b-1,
2280                 otherwise set q{i-t-1} to (xi*b + x{i-1})/yt
2281  */
2282                 if (x.dp[i] == y.dp[t]) {
2283                         q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
2284                 } else {
2285                         mp_word tmp;
2286                         tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
2287                         tmp |= ((mp_word) x.dp[i - 1]);
2288                         tmp /= ((mp_word) y.dp[t]);
2289                         if (tmp > (mp_word) MP_MASK) {
2290                                 tmp = MP_MASK;
2291                         }
2292                         q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
2293                 }
2294
2295 /*
2296                 while (q{i-t-1} * (yt * b + y{t-1})) >
2297                                 xi * b**2 + xi-1 * b + xi-2
2298
2299                         do q{i-t-1} -= 1;
2300  */
2301                 q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
2302                 do {
2303                         q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
2304
2305 /*
2306                         find left hand
2307  */
2308                         mp_zero (&t1);
2309                         t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
2310                         t1.dp[1] = y.dp[t];
2311                         t1.used = 2;
2312                         if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
2313                                 goto LBL_Y;
2314                         }
2315
2316 /*
2317                         find right hand
2318  */
2319                         t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
2320                         t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
2321                         t2.dp[2] = x.dp[i];
2322                         t2.used = 3;
2323                 } while (mp_cmp_mag(&t1, &t2) == MP_GT);
2324
2325 /*
2326                 step 3.3 x = x - q{i-t-1} * y * b**{i-t-1}
2327  */
2328                 if ((res = mp_mul_d(&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
2329                         goto LBL_Y;
2330                 }
2331
2332                 if ((res = mp_lshd(&t1, i - t - 1)) != MP_OKAY) {
2333                         goto LBL_Y;
2334                 }
2335
2336                 if ((res = mp_sub(&x, &t1, &x)) != MP_OKAY) {
2337                         goto LBL_Y;
2338                 }
2339
2340 /*
2341         if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; }
2342  */
2343                 if (x.sign == MP_NEG) {
2344                         if ((res = mp_copy(&y, &t1)) != MP_OKAY) {
2345                                 goto LBL_Y;
2346                         }
2347                 if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
2348                         goto LBL_Y;
2349                 }
2350                 if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
2351                         goto LBL_Y;
2352                 }
2353
2354                 q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
2355                 }
2356         }
2357
2358 /*
2359         now q is the quotient and x is the remainder
2360         [which we have to normalize]
2361  */
2362
2363 /*
2364         get sign before writing to c
2365  */
2366         x.sign = x.used == 0 ? MP_ZPOS : a->sign;
2367
2368         if (c != NULL) {
2369                 mp_clamp(&q);
2370                 mp_exch(&q, c);
2371                 c->sign = neg;
2372         }
2373
2374         if (d != NULL) {
2375                 mp_div_2d(pool, &x, norm, &x, NULL);
2376                 mp_exch(&x, d);
2377         }
2378
2379         res = MP_OKAY;
2380
2381 LBL_Y:mp_clear (&y);
2382 LBL_X:mp_clear (&x);
2383 LBL_T2:mp_clear (&t2);
2384 LBL_T1:mp_clear (&t1);
2385 LBL_Q:mp_clear (&q);
2386         return res;
2387 }
2388 #endif /* MP_DIV_SMALL */
2389
2390 /******************************************************************************/
2391 /*
2392         multiplies |a| * |b| and only computes upto digs digits of result
2393         HAC pp. 595, Algorithm 14.12  Modified so you can control how many digits
2394         of output are created.
2395  */
2396 #ifdef USE_SMALL_WORD
2397 int32 s_mp_mul_digs(psPool_t *pool, mp_int * a, mp_int * b, mp_int * c, int32 digs)
2398 {
2399         mp_int          t;
2400         int32                   res, pa, pb, ix, iy;
2401         mp_digit        u;
2402         mp_word         r;
2403         mp_digit        tmpx, *tmpt, *tmpy;
2404
2405 /*
2406         Can we use the fast multiplier?
2407  */
2408         if (((digs) < MP_WARRAY) &&
2409                 MIN (a->used, b->used) <
2410                 (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
2411                         return fast_s_mp_mul_digs (pool, a, b, c, digs);
2412                 }
2413
2414                 if ((res = mp_init_size(pool, &t, digs)) != MP_OKAY) {
2415                         return res;
2416                 }
2417                 t.used = digs;
2418
2419 /*
2420                 Compute the digits of the product directly
2421  */
2422                 pa = a->used;
2423                 for (ix = 0; ix < pa; ix++) {
2424                         /* set the carry to zero */
2425                         u = 0;
2426
2427 /*
2428                         Limit ourselves to making digs digits of output.
2429 */
2430                         pb = MIN (b->used, digs - ix);
2431
2432 /*
2433                         Setup some aliases. Copy of the digit from a used
2434                         within the nested loop
2435  */
2436                         tmpx = a->dp[ix];
2437
2438 /*
2439                         An alias for the destination shifted ix places
2440  */
2441                         tmpt = t.dp + ix;
2442
2443 /*
2444                         An alias for the digits of b
2445  */
2446                         tmpy = b->dp;
2447
2448 /*
2449                         Compute the columns of the output and propagate the carry
2450  */
2451                         for (iy = 0; iy < pb; iy++) {
2452                                 /* compute the column as a mp_word */
2453                                 r       = ((mp_word)*tmpt) +
2454                                         ((mp_word)tmpx) * ((mp_word)*tmpy++) +
2455                                         ((mp_word) u);
2456
2457                                 /* the new column is the lower part of the result */
2458                                 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
2459
2460                                 /* get the carry word from the result */
2461                                 u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
2462                         }
2463 /*
2464                         Set carry if it is placed below digs
2465  */
2466                         if (ix + iy < digs) {
2467                                 *tmpt = u;
2468                         }
2469                 }
2470
2471                 mp_clamp (&t);
2472                 mp_exch (&t, c);
2473
2474                 mp_clear (&t);
2475                 return MP_OKAY;
2476 }
2477 #endif /* USE_SMALL_WORD */
2478
2479 /******************************************************************************/
2480 /*
2481         Fast (comba) multiplier
2482
2483         This is the fast column-array [comba] multiplier.  It is designed to
2484         compute the columns of the product first then handle the carries afterwards.
2485         This has the effect of making the nested loops that compute the columns
2486         very simple and schedulable on super-scalar processors.
2487
2488         This has been modified to produce a variable number of digits of output so
2489         if say only a half-product is required you don't have to compute the upper
2490         half (a feature required for fast Barrett reduction).
2491
2492         Based on Algorithm 14.12 on pp.595 of HAC.
2493 */
2494
2495 int32 fast_s_mp_mul_digs(psPool_t *pool, mp_int * a, mp_int * b, mp_int * c,
2496                                                   int32 digs)
2497 {
2498         int32           olduse, res, pa, ix, iz, neg;
2499         mp_digit        W[MP_WARRAY];
2500         register        mp_word  _W;
2501
2502         neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
2503
2504 /*
2505         grow the destination as required
2506  */
2507         if (c->alloc < digs) {
2508                 if ((res = mp_grow(c, digs)) != MP_OKAY) {
2509                         return res;
2510                 }
2511         }
2512
2513 /*
2514         number of output digits to produce
2515  */
2516         pa = MIN(digs, a->used + b->used);
2517
2518 /*
2519         clear the carry
2520  */
2521         _W = 0;
2522         for (ix = 0; ix < pa; ix++) {
2523                 int32           tx, ty;
2524                 int32           iy;
2525                 mp_digit        *tmpx, *tmpy;
2526
2527 /*
2528                 get offsets into the two bignums
2529  */
2530                 ty = MIN(b->used-1, ix);
2531                 tx = ix - ty;
2532
2533 /*
2534                 setup temp aliases
2535  */
2536                 tmpx = a->dp + tx;
2537                 tmpy = b->dp + ty;
2538
2539 /*
2540                 this is the number of times the loop will iterrate, essentially its
2541                 while (tx++ < a->used && ty-- >= 0) { ... }
2542  */
2543                 iy = MIN(a->used-tx, ty+1);
2544
2545 /*
2546                 execute loop
2547  */
2548                 for (iz = 0; iz < iy; ++iz) {
2549                         _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
2550                 }
2551
2552 /*
2553                 store term
2554  */
2555                 W[ix] = (mp_digit)(_W & MP_MASK);
2556
2557 /*
2558                 make next carry
2559  */
2560                 _W = _W >> ((mp_word)DIGIT_BIT);
2561         }
2562
2563 /*
2564         store final carry
2565  */
2566         W[ix] = (mp_digit)(_W & MP_MASK);
2567
2568 /*
2569         setup dest
2570  */
2571         olduse  = c->used;
2572         c->used = pa;
2573
2574         {
2575                 register mp_digit *tmpc;
2576                 tmpc = c->dp;
2577                 for (ix = 0; ix < pa+1; ix++) {
2578 /*
2579                         now extract the previous digit [below the carry]
2580  */
2581                         *tmpc++ = W[ix];
2582                 }
2583
2584 /*
2585                 clear unused digits [that existed in the old copy of c]
2586  */
2587                 for (; ix < olduse; ix++) {
2588                         *tmpc++ = 0;
2589                 }
2590         }
2591         mp_clamp (c);
2592         c->sign = (c->used > 0) ? neg : MP_ZPOS;
2593         return MP_OKAY;
2594 }
2595
2596 /******************************************************************************/
2597 /*
2598         b = a*2
2599  */
2600 int32 mp_mul_2 (mp_int * a, mp_int * b)
2601 {
2602         int32           x, res, oldused;
2603
2604 /*
2605         grow to accomodate result
2606  */
2607         if (b->alloc < a->used + 1) {
2608                 if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) {
2609                         return res;
2610                 }
2611         }
2612
2613         oldused = b->used;
2614         b->used = a->used;
2615
2616         {
2617                 register mp_digit r, rr, *tmpa, *tmpb;
2618
2619                 /* alias for source */
2620                 tmpa = a->dp;
2621
2622                 /* alias for dest */
2623                 tmpb = b->dp;
2624
2625                 /* carry */
2626                 r = 0;
2627                 for (x = 0; x < a->used; x++) {
2628
2629 /*
2630                         get what will be the *next* carry bit from the MSB of the
2631                         current digit
2632  */
2633                         rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
2634
2635 /*
2636                         now shift up this digit, add in the carry [from the previous]
2637  */
2638                         *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
2639
2640 /*                      copy the carry that would be from the source digit into the next
2641                         iteration
2642  */
2643                         r = rr;
2644                 }
2645
2646 /*
2647                 new leading digit?
2648  */
2649                 if (r != 0) {
2650 /*
2651                         add a MSB which is always 1 at this point
2652  */
2653                         *tmpb = 1;
2654                         ++(b->used);
2655                 }
2656
2657 /*
2658                 now zero any excess digits on the destination that we didn't write to
2659  */
2660                 tmpb = b->dp + b->used;
2661                 for (x = b->used; x < oldused; x++) {
2662                         *tmpb++ = 0;
2663                 }
2664         }
2665         b->sign = a->sign;
2666         return MP_OKAY;
2667 }
2668
2669 /******************************************************************************/
2670 /*
2671         multiply by a digit
2672  */
2673 int32 mp_mul_d(mp_int * a, mp_digit b, mp_int * c)
2674 {
2675         mp_digit        u, *tmpa, *tmpc;
2676         mp_word         r;
2677         int32                   ix, res, olduse;
2678
2679 /*
2680         make sure c is big enough to hold a*b
2681  */
2682         if (c->alloc < a->used + 1) {
2683                 if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) {
2684                         return res;
2685                 }
2686         }
2687
2688 /*
2689         get the original destinations used count
2690  */
2691         olduse = c->used;
2692
2693 /*
2694         set the sign
2695  */
2696         c->sign = a->sign;
2697
2698 /*
2699         alias for a->dp [source]
2700  */
2701         tmpa = a->dp;
2702
2703 /*
2704         alias for c->dp [dest]
2705  */
2706         tmpc = c->dp;
2707
2708         /* zero carry */
2709         u = 0;
2710
2711         /* compute columns */
2712         for (ix = 0; ix < a->used; ix++) {
2713 /*
2714                 compute product and carry sum for this term
2715  */
2716                 r       = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
2717
2718 /*
2719                 mask off higher bits to get a single digit
2720  */
2721                 *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
2722
2723 /*
2724                 send carry into next iteration
2725  */
2726                 u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
2727         }
2728
2729 /*
2730         store final carry [if any] and increment ix offset
2731  */
2732         *tmpc++ = u;
2733         ++ix;
2734
2735 /*
2736         now zero digits above the top
2737  */
2738         while (ix++ < olduse) {
2739                 *tmpc++ = 0;
2740         }
2741
2742         /* set used count */
2743         c->used = a->used + 1;
2744         mp_clamp(c);
2745
2746         return MP_OKAY;
2747 }
2748
2749 /******************************************************************************/
2750 /*
2751         low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16
2752  */
2753 #ifdef USE_SMALL_WORD
2754 int32 s_mp_sqr (psPool_t *pool, mp_int * a, mp_int * b)
2755 {
2756         mp_int          t;
2757         int32                   res, ix, iy, pa;
2758         mp_word         r;
2759         mp_digit        u, tmpx, *tmpt;
2760
2761         pa = a->used;
2762         if ((res = mp_init_size(pool, &t, 2*pa + 1)) != MP_OKAY) {
2763                 return res;
2764         }
2765
2766 /*
2767         default used is maximum possible size
2768  */
2769         t.used = 2*pa + 1;
2770
2771         for (ix = 0; ix < pa; ix++) {
2772 /*
2773                 first calculate the digit at 2*ix
2774                 calculate double precision result
2775  */
2776                 r = ((mp_word) t.dp[2*ix]) +
2777                         ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
2778
2779 /*
2780                 store lower part in result
2781  */
2782                 t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
2783
2784 /*
2785                 get the carry
2786  */
2787                 u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
2788
2789 /*
2790                 left hand side of A[ix] * A[iy]
2791  */
2792                 tmpx = a->dp[ix];
2793
2794 /*
2795                 alias for where to store the results
2796  */
2797                 tmpt = t.dp + (2*ix + 1);
2798
2799                 for (iy = ix + 1; iy < pa; iy++) {
2800 /*
2801                         first calculate the product
2802  */
2803                         r = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
2804
2805 /*
2806                         now calculate the double precision result, note we use addition
2807                         instead of *2 since it's easier to optimize
2808  */
2809                         r       = ((mp_word) *tmpt) + r + r + ((mp_word) u);
2810
2811 /*
2812                         store lower part
2813  */
2814                         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
2815
2816                         /* get carry */
2817                         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
2818                 }
2819                 /* propagate upwards */
2820                 while (u != ((mp_digit) 0)) {
2821                         r       = ((mp_word) *tmpt) + ((mp_word) u);
2822                         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
2823                         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
2824                 }
2825         }
2826
2827         mp_clamp (&t);
2828         mp_exch (&t, b);
2829         mp_clear (&t);
2830         return MP_OKAY;
2831 }
2832 #endif /* USE_SMALL_WORD */
2833
2834 /******************************************************************************/
2835 /*
2836         fast squaring
2837
2838         This is the comba method where the columns of the product are computed
2839         first then the carries are computed.  This has the effect of making a very
2840         simple inner loop that is executed the most
2841
2842         W2 represents the outer products and W the inner.
2843
2844         A further optimizations is made because the inner products are of the
2845         form "A * B * 2".  The *2 part does not need to be computed until the end
2846         which is good because 64-bit shifts are slow!
2847
2848         Based on Algorithm 14.16 on pp.597 of HAC.
2849
2850         This is the 1.0 version, but no SSE stuff
2851 */
2852 int32 fast_s_mp_sqr(psPool_t *pool, mp_int * a, mp_int * b)
2853 {
2854         int32           olduse, res, pa, ix, iz;
2855         mp_digit        W[MP_WARRAY], *tmpx;
2856         mp_word         W1;
2857
2858 /*
2859         grow the destination as required
2860  */
2861         pa = a->used + a->used;
2862         if (b->alloc < pa) {
2863                 if ((res = mp_grow(b, pa)) != MP_OKAY) {
2864                         return res;
2865                 }
2866         }
2867
2868 /*
2869         number of output digits to produce
2870  */
2871         W1 = 0;
2872         for (ix = 0; ix < pa; ix++) {
2873                 int32           tx, ty, iy;
2874                 mp_word         _W;
2875                 mp_digit        *tmpy;
2876
2877 /*
2878                 clear counter
2879  */
2880                 _W = 0;
2881
2882 /*
2883                 get offsets into the two bignums
2884  */
2885                 ty = MIN(a->used-1, ix);
2886                 tx = ix - ty;
2887
2888 /*
2889                 setup temp aliases
2890  */
2891                 tmpx = a->dp + tx;
2892                 tmpy = a->dp + ty;
2893
2894 /*
2895                 this is the number of times the loop will iterrate, essentially
2896                 while (tx++ < a->used && ty-- >= 0) { ... }
2897 */
2898                 iy = MIN(a->used-tx, ty+1);
2899
2900 /*
2901                 now for squaring tx can never equal ty
2902                 we halve the distance since they approach at a rate of 2x
2903                 and we have to round because odd cases need to be executed
2904 */
2905                 iy = MIN(iy, (ty-tx+1)>>1);
2906
2907 /*
2908                 execute loop
2909  */
2910                 for (iz = 0; iz < iy; iz++) {
2911                          _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
2912                 }
2913
2914 /*
2915                 double the inner product and add carry
2916  */
2917                 _W = _W + _W + W1;
2918
2919 /*
2920                 even columns have the square term in them
2921  */
2922                 if ((ix&1) == 0) {
2923                         _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
2924                 }
2925
2926 /*
2927                 store it
2928  */
2929                 W[ix] = (mp_digit)(_W & MP_MASK);
2930
2931 /*
2932                 make next carry
2933  */
2934                 W1 = _W >> ((mp_word)DIGIT_BIT);
2935         }
2936
2937 /*
2938         setup dest
2939  */
2940         olduse  = b->used;
2941         b->used = a->used+a->used;
2942
2943         {
2944                 mp_digit *tmpb;
2945                 tmpb = b->dp;
2946                 for (ix = 0; ix < pa; ix++) {
2947                         *tmpb++ = W[ix] & MP_MASK;
2948                 }
2949
2950 /*
2951                 clear unused digits [that existed in the old copy of c]
2952  */
2953                 for (; ix < olduse; ix++) {
2954                         *tmpb++ = 0;
2955                 }
2956         }
2957         mp_clamp(b);
2958         return MP_OKAY;
2959 }
2960
2961 /******************************************************************************/
2962 /*
2963         computes a = 2**b
2964
2965         Simple algorithm which zeroes the int32, grows it then just sets one bit
2966         as required.
2967  */
2968 int32 mp_2expt (mp_int * a, int32 b)
2969 {
2970         int32           res;
2971
2972 /*
2973         zero a as per default
2974  */
2975         mp_zero (a);
2976
2977 /*
2978         grow a to accomodate the single bit
2979  */
2980         if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) {
2981                 return res;
2982         }
2983
2984 /*
2985         set the used count of where the bit will go
2986  */
2987         a->used = b / DIGIT_BIT + 1;
2988
2989 /*
2990         put the single bit in its place
2991  */
2992         a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
2993
2994         return MP_OKAY;
2995 }
2996
2997 /******************************************************************************/
2998 /*
2999         init an mp_init for a given size
3000  */
3001 int32 mp_init_size(psPool_t *pool, mp_int * a, int32 size)
3002 {
3003         int             x;
3004 /*
3005         pad size so there are always extra digits
3006  */
3007         size += (MP_PREC * 2) - (size % MP_PREC);
3008
3009 /*
3010         alloc mem
3011  */
3012         a->dp = OPT_CAST(mp_digit) psMalloc(pool, sizeof (mp_digit) * size);
3013         if (a->dp == NULL) {
3014                 return MP_MEM;
3015         }
3016         a->used  = 0;
3017         a->alloc = size;
3018         a->sign  = MP_ZPOS;
3019
3020 /*
3021         zero the digits
3022  */
3023         for (x = 0; x < size; x++) {
3024                 a->dp[x] = 0;
3025         }
3026         return MP_OKAY;
3027 }
3028
3029 /******************************************************************************/
3030 /*
3031         low level addition, based on HAC pp.594, Algorithm 14.7
3032  */
3033 int32 s_mp_add (mp_int * a, mp_int * b, mp_int * c)
3034 {
3035         mp_int          *x;
3036         int32                   olduse, res, min, max;
3037
3038 /*
3039         find sizes, we let |a| <= |b| which means we have to sort them.  "x" will
3040         point to the input with the most digits
3041  */
3042         if (a->used > b->used) {
3043                 min = b->used;
3044                 max = a->used;
3045                 x = a;
3046         } else {
3047                 min = a->used;
3048                 max = b->used;
3049                 x = b;
3050         }
3051
3052         /* init result */
3053         if (c->alloc < max + 1) {
3054                 if ((res = mp_grow (c, max + 1)) != MP_OKAY) {
3055                         return res;
3056                 }
3057         }
3058
3059 /*
3060         get old used digit count and set new one
3061  */
3062         olduse = c->used;
3063         c->used = max + 1;
3064
3065         {
3066                 register mp_digit u, *tmpa, *tmpb, *tmpc;
3067                 register int32 i;
3068
3069                 /* alias for digit pointers */
3070
3071                 /* first input */
3072                 tmpa = a->dp;
3073
3074                 /* second input */
3075                 tmpb = b->dp;
3076
3077                 /* destination */
3078                 tmpc = c->dp;
3079
3080                 /* zero the carry */
3081                 u = 0;
3082                 for (i = 0; i < min; i++) {
3083 /*
3084                         Compute the sum at one digit, T[i] = A[i] + B[i] + U
3085  */
3086                         *tmpc = *tmpa++ + *tmpb++ + u;
3087
3088 /*
3089                         U = carry bit of T[i]
3090  */
3091                         u = *tmpc >> ((mp_digit)DIGIT_BIT);
3092
3093 /*
3094                         take away carry bit from T[i]
3095  */
3096                         *tmpc++ &= MP_MASK;
3097                 }
3098
3099 /*
3100                 now copy higher words if any, that is in A+B if A or B has more digits add
3101                 those in
3102  */
3103                 if (min != max) {
3104                         for (; i < max; i++) {
3105                                 /* T[i] = X[i] + U */
3106                                 *tmpc = x->dp[i] + u;
3107
3108                                 /* U = carry bit of T[i] */
3109                                 u = *tmpc >> ((mp_digit)DIGIT_BIT);
3110
3111                                 /* take away carry bit from T[i] */
3112                                 *tmpc++ &= MP_MASK;
3113                         }
3114                 }
3115
3116                 /* add carry */
3117                 *tmpc++ = u;
3118
3119 /*
3120                 clear digits above oldused
3121  */
3122                 for (i = c->used; i < olduse; i++) {
3123                         *tmpc++ = 0;
3124                 }
3125         }
3126
3127         mp_clamp (c);
3128         return MP_OKAY;
3129 }
3130
3131 /******************************************************************************/
3132 /*
3133         FUTURE - this is only needed by the SSH code, SLOW or not, because RSA
3134         exponents are always odd.
3135 */
3136 int32 mp_invmodSSH(psPool_t *pool, mp_int * a, mp_int * b, mp_int * c)
3137 {
3138         mp_int          x, y, u, v, A, B, C, D;
3139         int32                   res;
3140
3141 /*
3142         b cannot be negative
3143  */
3144         if (b->sign == MP_NEG || mp_iszero(b) == 1) {
3145                 return MP_VAL;
3146         }
3147
3148 /*
3149         if the modulus is odd we can use a faster routine instead
3150  */
3151         if (mp_isodd (b) == 1) {
3152                 return fast_mp_invmod(pool, a, b, c);
3153         }
3154
3155 /*
3156         init temps
3157  */
3158         if ((res = _mp_init_multi(pool, &x, &y, &u, &v,
3159                         &A, &B, &C, &D)) != MP_OKAY) {
3160                 return res;
3161         }
3162
3163         /* x = a, y = b */
3164         if ((res = mp_copy(a, &x)) != MP_OKAY) {
3165                 goto LBL_ERR;
3166         }
3167         if ((res = mp_copy(b, &y)) != MP_OKAY) {
3168                 goto LBL_ERR;
3169         }
3170
3171 /*
3172         2. [modified] if x,y are both even then return an error!
3173  */
3174         if (mp_iseven(&x) == 1 && mp_iseven (&y) == 1) {
3175                 res = MP_VAL;
3176                 goto LBL_ERR;
3177         }
3178
3179 /*
3180         3. u=x, v=y, A=1, B=0, C=0,D=1
3181  */
3182         if ((res = mp_copy(&x, &u)) != MP_OKAY) {
3183                 goto LBL_ERR;
3184         }
3185         if ((res = mp_copy(&y, &v)) != MP_OKAY) {
3186                 goto LBL_ERR;
3187         }
3188         mp_set (&A, 1);
3189         mp_set (&D, 1);
3190
3191 top:
3192 /*
3193         4.  while u is even do
3194  */
3195         while (mp_iseven(&u) == 1) {
3196                 /* 4.1 u = u/2 */
3197                 if ((res = mp_div_2(&u, &u)) != MP_OKAY) {
3198                 goto LBL_ERR;
3199                 }
3200                 /* 4.2 if A or B is odd then */
3201                 if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) {
3202                         /* A = (A+y)/2, B = (B-x)/2 */
3203                         if ((res = mp_add(&A, &y, &A)) != MP_OKAY) {
3204                                 goto LBL_ERR;
3205                         }
3206                         if ((res = mp_sub(&B, &x, &B)) != MP_OKAY) {
3207                                 goto LBL_ERR;
3208                         }
3209                 }
3210                 /* A = A/2, B = B/2 */
3211                 if ((res = mp_div_2(&A, &A)) != MP_OKAY) {
3212                         goto LBL_ERR;
3213                 }
3214                 if ((res = mp_div_2(&B, &B)) != MP_OKAY) {
3215                         goto LBL_ERR;
3216                 }
3217         }
3218
3219 /*
3220         5.  while v is even do
3221  */
3222         while (mp_iseven(&v) == 1) {
3223                 /* 5.1 v = v/2 */
3224                 if ((res = mp_div_2(&v, &v)) != MP_OKAY) {
3225                 goto LBL_ERR;
3226                 }
3227                 /* 5.2 if C or D is odd then */
3228                 if (mp_isodd(&C) == 1 || mp_isodd (&D) == 1) {
3229                         /* C = (C+y)/2, D = (D-x)/2 */
3230                         if ((res = mp_add(&C, &y, &C)) != MP_OKAY) {
3231                                 goto LBL_ERR;
3232                         }
3233                         if ((res = mp_sub(&D, &x, &D)) != MP_OKAY) {
3234                                 goto LBL_ERR;
3235                         }
3236                 }
3237                 /* C = C/2, D = D/2 */
3238                 if ((res = mp_div_2(&C, &C)) != MP_OKAY) {
3239                         goto LBL_ERR;
3240                 }
3241                 if ((res = mp_div_2(&D, &D)) != MP_OKAY) {
3242                         goto LBL_ERR;
3243                 }
3244         }
3245
3246 /*
3247         6.  if u >= v then
3248  */
3249         if (mp_cmp(&u, &v) != MP_LT) {
3250                 /* u = u - v, A = A - C, B = B - D */
3251                 if ((res = mp_sub(&u, &v, &u)) != MP_OKAY) {
3252                 goto LBL_ERR;
3253                 }
3254
3255                 if ((res = mp_sub(&A, &C, &A)) != MP_OKAY) {
3256                 goto LBL_ERR;
3257                 }
3258
3259                 if ((res = mp_sub(&B, &D, &B)) != MP_OKAY) {
3260                 goto LBL_ERR;
3261                 }
3262         } else {
3263                 /* v - v - u, C = C - A, D = D - B */
3264                 if ((res = mp_sub(&v, &u, &v)) != MP_OKAY) {
3265                 goto LBL_ERR;
3266                 }
3267
3268                 if ((res = mp_sub(&C, &A, &C)) != MP_OKAY) {
3269                 goto LBL_ERR;
3270                 }
3271
3272                 if ((res = mp_sub(&D, &B, &D)) != MP_OKAY) {
3273                 goto LBL_ERR;
3274                 }
3275         }
3276
3277 /*
3278         if not zero goto step 4
3279  */
3280         if (mp_iszero(&u) == 0)
3281                 goto top;
3282
3283 /*
3284         now a = C, b = D, gcd == g*v
3285  */
3286
3287 /*
3288         if v != 1 then there is no inverse
3289  */
3290         if (mp_cmp_d(&v, 1) != MP_EQ) {
3291                 res = MP_VAL;
3292                 goto LBL_ERR;
3293         }
3294
3295 /*
3296         if its too low
3297  */
3298         while (mp_cmp_d(&C, 0) == MP_LT) {
3299                 if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
3300                         goto LBL_ERR;
3301                 }
3302         }
3303
3304 /*
3305         too big
3306  */
3307         while (mp_cmp_mag(&C, b) != MP_LT) {
3308                 if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
3309                         goto LBL_ERR;
3310                 }
3311         }
3312
3313 /*
3314         C is now the inverse
3315  */
3316         mp_exch(&C, c);
3317         res = MP_OKAY;
3318 LBL_ERR:_mp_clear_multi(&x, &y, &u, &v, &A, &B, &C, &D);
3319         return res;
3320 }
3321
3322 /******************************************************************************/
3323
3324 /*
3325  *      Computes the modular inverse via binary extended euclidean algorithm,
3326  *      that is c = 1/a mod b
3327  *
3328  * Based on slow invmod except this is optimized for the case where b is
3329  * odd as per HAC Note 14.64 on pp. 610
3330  */
3331 int32 fast_mp_invmod(psPool_t *pool, mp_int * a, mp_int * b, mp_int * c)
3332 {
3333         mp_int          x, y, u, v, B, D;
3334         int32                   res, neg;
3335
3336 /*
3337         2. [modified] b must be odd
3338  */
3339         if (mp_iseven (b) == 1) {
3340                 return MP_VAL;
3341         }
3342
3343 /*
3344         init all our temps
3345  */
3346         if ((res = _mp_init_multi(pool, &x, &y, &u, &v, &B, &D, NULL, NULL)) != MP_OKAY) {
3347                 return res;
3348         }
3349
3350 /*
3351         x == modulus, y == value to invert
3352  */
3353         if ((res = mp_copy(b, &x)) != MP_OKAY) {
3354                 goto LBL_ERR;
3355         }
3356
3357 /*
3358         we need y = |a|
3359  */
3360         if ((res = mp_mod(pool, a, b, &y)) != MP_OKAY) {
3361                 goto LBL_ERR;
3362         }
3363
3364 /*
3365         3. u=x, v=y, A=1, B=0, C=0,D=1
3366  */
3367         if ((res = mp_copy(&x, &u)) != MP_OKAY) {
3368                 goto LBL_ERR;
3369         }
3370         if ((res = mp_copy(&y, &v)) != MP_OKAY) {
3371                 goto LBL_ERR;
3372         }
3373         mp_set(&D, 1);
3374
3375 top:
3376 /*
3377         4.  while u is even do
3378 */
3379         while (mp_iseven(&u) == 1) {
3380                 /* 4.1 u = u/2 */
3381                 if ((res = mp_div_2(&u, &u)) != MP_OKAY) {
3382                         goto LBL_ERR;
3383                 }
3384                 /* 4.2 if B is odd then */
3385                 if (mp_isodd(&B) == 1) {
3386                         if ((res = mp_sub(&B, &x, &B)) != MP_OKAY) {
3387                                 goto LBL_ERR;
3388                         }
3389                 }
3390                 /* B = B/2 */
3391                 if ((res = mp_div_2(&B, &B)) != MP_OKAY) {
3392                         goto LBL_ERR;
3393                 }
3394         }
3395
3396 /*
3397         5.  while v is even do
3398  */
3399         while (mp_iseven(&v) == 1) {
3400                 /* 5.1 v = v/2 */
3401                 if ((res = mp_div_2(&v, &v)) != MP_OKAY) {
3402                         goto LBL_ERR;
3403                 }
3404                 /* 5.2 if D is odd then */
3405                 if (mp_isodd(&D) == 1) {
3406                         /* D = (D-x)/2 */
3407                         if ((res = mp_sub(&D, &x, &D)) != MP_OKAY) {
3408                                 goto LBL_ERR;
3409                         }
3410                 }
3411                 /* D = D/2 */
3412                 if ((res = mp_div_2(&D, &D)) != MP_OKAY) {
3413                         goto LBL_ERR;
3414                 }
3415         }
3416
3417 /*
3418         6.  if u >= v then
3419  */
3420         if (mp_cmp(&u, &v) != MP_LT) {
3421                 /* u = u - v, B = B - D */
3422                 if ((res = mp_sub(&u, &v, &u)) != MP_OKAY) {
3423                         goto LBL_ERR;
3424                 }
3425
3426                 if ((res = mp_sub(&B, &D, &B)) != MP_OKAY) {
3427                         goto LBL_ERR;
3428                 }
3429         } else {
3430                 /* v - v - u, D = D - B */
3431                 if ((res = mp_sub(&v, &u, &v)) != MP_OKAY) {
3432                         goto LBL_ERR;
3433                 }
3434
3435                 if ((res = mp_sub(&D, &B, &D)) != MP_OKAY) {
3436                 goto LBL_ERR;
3437                 }
3438         }
3439
3440 /*
3441         if not zero goto step 4
3442  */
3443         if (mp_iszero(&u) == 0) {
3444                 goto top;
3445         }
3446
3447 /*
3448         now a = C, b = D, gcd == g*v
3449  */
3450
3451 /*
3452         if v != 1 then there is no inverse
3453  */
3454         if (mp_cmp_d(&v, 1) != MP_EQ) {
3455                 res = MP_VAL;
3456                 goto LBL_ERR;
3457         }
3458
3459 /*
3460         b is now the inverse
3461  */
3462         neg = a->sign;
3463         while (D.sign == MP_NEG) {
3464                 if ((res = mp_add(&D, b, &D)) != MP_OKAY) {
3465                 goto LBL_ERR;
3466                 }
3467         }
3468         mp_exch(&D, c);
3469         c->sign = neg;
3470         res = MP_OKAY;
3471
3472 LBL_ERR:_mp_clear_multi(&x, &y, &u, &v, &B, &D, NULL, NULL);
3473         return res;
3474 }
3475
3476 /******************************************************************************/
3477 /*
3478         d = a + b (mod c)
3479  */
3480 int32 mp_addmod (psPool_t *pool, mp_int * a, mp_int * b, mp_int * c, mp_int * d)
3481 {
3482         int32                   res;
3483         mp_int          t;
3484
3485         if ((res = mp_init(pool, &t)) != MP_OKAY) {
3486                 return res;
3487         }
3488
3489         if ((res = mp_add (a, b, &t)) != MP_OKAY) {
3490                 mp_clear (&t);
3491                 return res;
3492         }
3493         res = mp_mod (pool, &t, c, d);
3494         mp_clear (&t);
3495         return res;
3496 }
3497
3498 /******************************************************************************/
3499 /*
3500         shrink a bignum
3501  */
3502 int32 mp_shrink (mp_int * a)
3503 {
3504         mp_digit *tmp;
3505
3506         if (a->alloc != a->used && a->used > 0) {
3507                 if ((tmp = psRealloc(a->dp, sizeof (mp_digit) * a->used)) == NULL) {
3508                         return MP_MEM;
3509                 }
3510                 a->dp    = tmp;
3511                 a->alloc = a->used;
3512         }
3513         return MP_OKAY;
3514 }
3515
3516 /* single digit subtraction */
3517 int32 mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
3518 {
3519         mp_digit *tmpa, *tmpc, mu;
3520         int32       res, ix, oldused;
3521
3522         /* grow c as required */
3523         if (c->alloc < a->used + 1) {
3524                 if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
3525                         return res;
3526                 }
3527         }
3528
3529         /* if a is negative just do an unsigned
3530         * addition [with fudged signs]
3531         */
3532         if (a->sign == MP_NEG) {
3533                 a->sign = MP_ZPOS;
3534                 res     = mp_add_d(a, b, c);
3535                 a->sign = c->sign = MP_NEG;
3536                 return res;
3537         }
3538
3539         /* setup regs */
3540         oldused = c->used;
3541         tmpa    = a->dp;
3542         tmpc    = c->dp;
3543
3544         /* if a <= b simply fix the single digit */
3545         if ((a->used == 1 && a->dp[0] <= b) || a->used == 0) {
3546                 if (a->used == 1) {
3547                         *tmpc++ = b - *tmpa;
3548                 } else {
3549                         *tmpc++ = b;
3550                 }
3551                 ix      = 1;
3552
3553                 /* negative/1digit */
3554                 c->sign = MP_NEG;
3555                 c->used = 1;
3556         } else {
3557                 /* positive/size */
3558                 c->sign = MP_ZPOS;
3559                 c->used = a->used;
3560
3561                 /* subtract first digit */
3562                 *tmpc    = *tmpa++ - b;
3563                 mu       = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
3564                 *tmpc++ &= MP_MASK;
3565
3566                 /* handle rest of the digits */
3567                 for (ix = 1; ix < a->used; ix++) {
3568                         *tmpc    = *tmpa++ - mu;
3569                         mu       = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
3570                         *tmpc++ &= MP_MASK;
3571                 }
3572         }
3573
3574         /* zero excess digits */
3575         while (ix++ < oldused) {
3576                 *tmpc++ = 0;
3577         }
3578         mp_clamp(c);
3579         return MP_OKAY;
3580 }
3581
3582 /* single digit addition */
3583 int32 mp_add_d (mp_int * a, mp_digit b, mp_int * c)
3584 {
3585         int32     res, ix, oldused;
3586         mp_digit *tmpa, *tmpc, mu;
3587
3588         /* grow c as required */
3589         if (c->alloc < a->used + 1) {
3590                 if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
3591                         return res;
3592                 }
3593         }
3594
3595         /* if a is negative and |a| >= b, call c = |a| - b */
3596         if (a->sign == MP_NEG && (a->used > 1 || a->dp[0] >= b)) {
3597                 /* temporarily fix sign of a */
3598                 a->sign = MP_ZPOS;
3599
3600                 /* c = |a| - b */
3601                 res = mp_sub_d(a, b, c);
3602
3603                 /* fix sign  */
3604                 a->sign = c->sign = MP_NEG;
3605                 return res;
3606         }
3607
3608         /* old number of used digits in c */
3609         oldused = c->used;
3610
3611         /* sign always positive */
3612         c->sign = MP_ZPOS;
3613
3614         /* source alias */
3615         tmpa    = a->dp;
3616
3617         /* destination alias */
3618         tmpc    = c->dp;
3619
3620         /* if a is positive */
3621         if (a->sign == MP_ZPOS) {
3622                 /* add digit, after this we're propagating the carry */
3623                 *tmpc   = *tmpa++ + b;
3624                 mu      = *tmpc >> DIGIT_BIT;
3625                 *tmpc++ &= MP_MASK;
3626
3627                 /* now handle rest of the digits */
3628                 for (ix = 1; ix < a->used; ix++) {
3629                         *tmpc   = *tmpa++ + mu;
3630                         mu      = *tmpc >> DIGIT_BIT;
3631                         *tmpc++ &= MP_MASK;
3632                 }
3633                 /* set final carry */
3634                 ix++;
3635                 *tmpc++  = mu;
3636
3637                 /* setup size */
3638                 c->used = a->used + 1;
3639         } else {
3640                 /* a was negative and |a| < b */
3641                 c->used  = 1;
3642
3643                 /* the result is a single digit */
3644                 if (a->used == 1) {
3645                         *tmpc++  =  b - a->dp[0];
3646                 } else {
3647                         *tmpc++  =  b;
3648                 }
3649
3650                 /* setup count so the clearing of oldused
3651                 * can fall through correctly
3652                 */
3653                 ix       = 1;
3654         }
3655
3656         /* now zero to oldused */
3657         while (ix++ < oldused) {
3658                 *tmpc++ = 0;
3659         }
3660         mp_clamp(c);
3661         return MP_OKAY;
3662 }
3663
3664
3665 /******************************************************************************/
3666
3667 #endif /* USE_MPI2 */