3 * Release $Name: MATRIXSSL_1_8_8_OPEN $
5 * multiple-precision integer library
8 * Copyright (c) PeerSec Networks, 2002-2009. All Rights Reserved.
9 * The latest version of this code is available at http://www.matrixssl.org
11 * This software is open source; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
16 * This General Public License does NOT permit incorporating this software
17 * into proprietary programs. If you are unable to comply with the GPL, a
18 * commercial license for this software may be purchased from PeerSec Networks
19 * at http://www.peersec.com
21 * This program is distributed in WITHOUT ANY WARRANTY; without even the
22 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
23 * See the GNU General Public License for more details.
25 * You should have received a copy of the GNU General Public License
26 * along with this program; if not, write to the Free Software
27 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
28 * http://www.gnu.org/copyleft/gpl.html
30 /******************************************************************************/
32 #include "../cryptoLayer.h"
37 static int32
mp_exptmod_fast (psPool_t
*pool
, mp_int
* G
, mp_int
* X
,
38 mp_int
* P
, mp_int
* Y
, int32 redmode
);
40 /******************************************************************************/
43 1. Convert the mp_init and mp_clear functions to not use malloc + free,
44 but to use static storage within the bignum variable instead - but
45 how to handle grow()? Maybe use a simple memory allocator
46 2. verify stack usage of all functions and use of MP_LOW_MEM:
47 fast_mp_montgomery_reduce
50 fast_s_mp_mul_high_digs
51 3. HAC stands for Handbook of Applied Cryptography
52 http://www.cacr.math.uwaterloo.ca/hac/
54 /******************************************************************************/
58 void psZeromem(void *dst
, size_t len
)
60 unsigned char *mem
= (unsigned char *)dst
;
70 void psBurnStack(unsigned long len
)
72 unsigned char buf
[32];
74 psZeromem(buf
, sizeof(buf
));
75 if (len
> (unsigned long)sizeof(buf
)) {
76 psBurnStack(len
- sizeof(buf
));
80 /******************************************************************************/
82 Multiple precision integer functions
83 Note: we don't use va_args here to prevent portability issues.
85 int32
_mp_init_multi(psPool_t
*pool
, mp_int
*mp0
, mp_int
*mp1
, mp_int
*mp2
,
86 mp_int
*mp3
, mp_int
*mp4
, mp_int
*mp5
,
87 mp_int
*mp6
, mp_int
*mp7
)
89 mp_err res
= MP_OKAY
; /* Assume ok until proven otherwise */
90 int32 n
= 0; /* Number of ok inits */
103 while (tempArray
[n
] != NULL
) {
104 if (mp_init(pool
, tempArray
[n
]) != MP_OKAY
) {
113 while (tempArray
[n
] != NULL
) {
114 mp_clear(tempArray
[n
]);
118 return res
; /* Assumed ok, if error flagged above. */
120 /******************************************************************************/
122 Reads a unsigned char array, assumes the msb is stored first [big endian]
124 int32
mp_read_unsigned_bin (mp_int
* a
, unsigned char *b
, int32 c
)
129 Make sure there are at least two digits.
132 if ((res
= mp_grow(a
, 2)) != MP_OKAY
) {
146 if ((res
= mp_mul_2d (a
, 8, a
)) != MP_OKAY
) {
154 a
->dp
[0] = (*b
& MP_MASK
);
155 a
->dp
[1] |= ((*b
++ >> 7U) & 1);
163 /******************************************************************************/
165 Compare two ints (signed)
167 int32
mp_cmp (mp_int
* a
, mp_int
* b
)
170 compare based on sign
172 if (a
->sign
!= b
->sign
) {
173 if (a
->sign
== MP_NEG
) {
183 if (a
->sign
== MP_NEG
) {
184 /* if negative compare opposite direction */
185 return mp_cmp_mag(b
, a
);
187 return mp_cmp_mag(a
, b
);
191 /******************************************************************************/
193 Store in unsigned [big endian] format.
195 int32
mp_to_unsigned_bin(psPool_t
*pool
, mp_int
* a
, unsigned char *b
)
200 if ((res
= mp_init_copy(pool
, &t
, a
)) != MP_OKAY
) {
205 while (mp_iszero (&t
) == 0) {
207 b
[x
++] = (unsigned char) (t
.dp
[0] & 255);
209 b
[x
++] = (unsigned char) (t
.dp
[0] | ((t
.dp
[1] & 0x01) << 7));
211 if ((res
= mp_div_2d (pool
, &t
, 8, &t
, NULL
)) != MP_OKAY
) {
221 void _mp_clear_multi(mp_int
*mp0
, mp_int
*mp1
, mp_int
*mp2
, mp_int
*mp3
,
222 mp_int
*mp4
, mp_int
*mp5
, mp_int
*mp6
, mp_int
*mp7
)
224 int32 n
= 0; /* Number of ok inits */
226 mp_int
*tempArray
[9];
238 for (n
= 0; tempArray
[n
] != NULL
; n
++) {
239 mp_clear(tempArray
[n
]);
243 /******************************************************************************/
247 int32
mp_init (psPool_t
*pool
, mp_int
* a
)
251 allocate memory required and clear it
253 a
->dp
= OPT_CAST(mp_digit
) psMalloc(pool
, sizeof (mp_digit
) * MP_PREC
);
259 set the digits to zero
261 for (i
= 0; i
< MP_PREC
; i
++) {
265 set the used to zero, allocated digits to the default precision and sign
275 /******************************************************************************/
279 void mp_clear (mp_int
* a
)
283 only do anything if a hasn't been freed previously
287 first zero the digits
289 for (i
= 0; i
< a
->used
; i
++) {
297 reset members to make debugging easier
300 a
->alloc
= a
->used
= 0;
305 /******************************************************************************/
307 Get the size for an unsigned equivalent.
309 int32
mp_unsigned_bin_size (mp_int
* a
)
311 int32 size
= mp_count_bits (a
);
313 return (size
/ 8 + ((size
& 7) != 0 ? 1 : 0));
316 /******************************************************************************/
320 This is used to ensure that leading zero digits are trimed and the
321 leading "used" digit will be non-zero. Typically very fast. Also fixes
322 the sign if there are no more leading digits
324 void mp_clamp (mp_int
* a
)
327 decrease used while the most significant digit is zero.
329 while (a
->used
> 0 && a
->dp
[a
->used
- 1] == 0) {
334 reset the sign flag if used == 0
341 /******************************************************************************/
343 Shift left by a certain bit count.
345 int32
mp_mul_2d (mp_int
* a
, int32 b
, mp_int
* c
)
354 if ((res
= mp_copy (a
, c
)) != MP_OKAY
) {
359 if (c
->alloc
< (int32
)(c
->used
+ b
/DIGIT_BIT
+ 1)) {
360 if ((res
= mp_grow (c
, c
->used
+ b
/ DIGIT_BIT
+ 1)) != MP_OKAY
) {
366 Shift by as many digits in the bit count
368 if (b
>= (int32
)DIGIT_BIT
) {
369 if ((res
= mp_lshd (c
, b
/ DIGIT_BIT
)) != MP_OKAY
) {
375 shift any bit count < DIGIT_BIT
377 d
= (mp_digit
) (b
% DIGIT_BIT
);
379 register mp_digit
*tmpc
, shift
, mask
, r
, rr
;
385 mask
= (((mp_digit
)1) << d
) - 1;
390 shift
= DIGIT_BIT
- d
;
397 for (x
= 0; x
< c
->used
; x
++) {
399 get the higher bits of the current word
401 rr
= (*tmpc
>> shift
) & mask
;
404 shift the current word and OR in the carry
406 *tmpc
= ((*tmpc
<< d
) | r
) & MP_MASK
;
410 set the carry to the carry bits of the current word
419 c
->dp
[(c
->used
)++] = r
;
426 /******************************************************************************/
430 void mp_zero (mp_int
* a
)
439 for (n
= 0; n
< a
->alloc
; n
++) {
448 #endif /* MP_LOW_MEM */
450 /******************************************************************************/
452 Compare maginitude of two ints (unsigned).
454 int32
mp_cmp_mag (mp_int
* a
, mp_int
* b
)
457 mp_digit
*tmpa
, *tmpb
;
460 compare based on # of non-zero digits
462 if (a
->used
> b
->used
) {
466 if (a
->used
< b
->used
) {
471 tmpa
= a
->dp
+ (a
->used
- 1);
474 tmpb
= b
->dp
+ (a
->used
- 1);
477 compare based on digits
479 for (n
= 0; n
< a
->used
; ++n
, --tmpa
, --tmpb
) {
491 /******************************************************************************/
493 computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
495 Uses a left-to-right k-ary sliding window to compute the modular
496 exponentiation. The value of k changes based on the size of the exponent.
498 Uses Montgomery or Diminished Radix reduction [whichever appropriate]
500 int32
mp_exptmod(psPool_t
*pool
, mp_int
* G
, mp_int
* X
, mp_int
* P
, mp_int
* Y
)
504 modulus P must be positive
506 if (P
->sign
== MP_NEG
) {
511 if exponent X is negative we have to recurse
513 if (X
->sign
== MP_NEG
) {
518 first compute 1/G mod P
520 if ((err
= mp_init(pool
, &tmpG
)) != MP_OKAY
) {
523 if ((err
= mp_invmod(pool
, G
, P
, &tmpG
)) != MP_OKAY
) {
531 if ((err
= mp_init(pool
, &tmpX
)) != MP_OKAY
) {
535 if ((err
= mp_abs(X
, &tmpX
)) != MP_OKAY
) {
542 and now compute (1/G)**|X| instead of G**X [X < 0]
544 err
= mp_exptmod(pool
, &tmpG
, &tmpX
, P
, Y
);
551 if the modulus is odd or dr != 0 use the fast method
553 if (mp_isodd (P
) == 1) {
554 return mp_exptmod_fast (pool
, G
, X
, P
, Y
, 0);
563 /******************************************************************************/
565 Call only from mp_exptmod to make sure this fast version qualifies
567 static int32
mp_exptmod_fast(psPool_t
*pool
, mp_int
* G
, mp_int
* X
,
568 mp_int
* P
, mp_int
* Y
, int32 redmode
)
570 mp_int M
[TAB_SIZE
], res
;
572 int32 err
, bitbuf
, bitcpy
, bitcnt
, mode
, digidx
, x
, y
, winsize
;
576 use a pointer to the reduction algorithm. This allows us to use
577 one of many reduction algorithms without modding the guts of
578 the code with if statements everywhere.
580 int32 (*redux
)(mp_int
*,mp_int
*,mp_digit
);
585 x
= mp_count_bits (X
);
588 } else if (x
<= 36) {
590 } else if (x
<= 140) {
592 } else if (x
<= 450) {
594 } else if (x
<= 1303) {
596 } else if (x
<= 3529) {
612 if ((err
= mp_init(pool
, &M
[1])) != MP_OKAY
) {
617 now init the second half of the array
619 for (x
= 1<<(winsize
-1); x
< (1 << winsize
); x
++) {
620 if ((err
= mp_init(pool
, &M
[x
])) != MP_OKAY
) {
621 for (y
= 1<<(winsize
-1); y
< x
; y
++) {
633 if ((err
= mp_montgomery_setup(P
, &mp
)) != MP_OKAY
) {
638 automatically pick the comba one if available
640 if (((P
->used
* 2 + 1) < MP_WARRAY
) &&
641 P
->used
< (1 << ((CHAR_BIT
* sizeof (mp_word
)) - (2 * DIGIT_BIT
)))) {
642 redux
= fast_mp_montgomery_reduce
;
645 use slower baseline Montgomery method
647 redux
= mp_montgomery_reduce
;
653 if ((err
= mp_init(pool
, &res
)) != MP_OKAY
) {
658 create M table. The first half of the table is not computed
659 though accept for M[0] and M[1]
665 if ((err
= mp_montgomery_calc_normalization(&res
, P
)) != MP_OKAY
) {
670 now set M[1] to G * R mod m
672 if ((err
= mp_mulmod(pool
, G
, &res
, P
, &M
[1])) != MP_OKAY
) {
677 compute the value at M[1<<(winsize-1)] by squaring
678 M[1] (winsize-1) times
680 if ((err
= mp_copy(&M
[1], &M
[1 << (winsize
- 1)])) != MP_OKAY
) {
684 for (x
= 0; x
< (winsize
- 1); x
++) {
685 if ((err
= mp_sqr(pool
, &M
[1 << (winsize
- 1)],
686 &M
[1 << (winsize
- 1)])) != MP_OKAY
) {
689 if ((err
= redux(&M
[1 << (winsize
- 1)], P
, mp
)) != MP_OKAY
) {
697 for (x
= (1 << (winsize
- 1)) + 1; x
< (1 << winsize
); x
++) {
698 if ((err
= mp_mul(pool
, &M
[x
- 1], &M
[1], &M
[x
])) != MP_OKAY
) {
701 if ((err
= redux(&M
[x
], P
, mp
)) != MP_OKAY
) {
707 set initial mode and bit cnt
712 digidx
= X
->used
- 1;
718 grab next digit as required
721 /* if digidx == -1 we are out of digits so break */
725 /* read next digit and reset bitcnt */
726 buf
= X
->dp
[digidx
--];
727 bitcnt
= (int)DIGIT_BIT
;
730 /* grab the next msb from the exponent */
731 y
= (mp_digit
)(buf
>> (DIGIT_BIT
- 1)) & 1;
735 if the bit is zero and mode == 0 then we ignore it
736 These represent the leading zero bits before the first 1 bit
737 in the exponent. Technically this opt is not required but it
738 does lower the # of trivial squaring/reductions used
740 if (mode
== 0 && y
== 0) {
745 if the bit is zero and mode == 1 then we square
747 if (mode
== 1 && y
== 0) {
748 if ((err
= mp_sqr (pool
, &res
, &res
)) != MP_OKAY
) {
751 if ((err
= redux (&res
, P
, mp
)) != MP_OKAY
) {
758 else we add it to the window
760 bitbuf
|= (y
<< (winsize
- ++bitcpy
));
763 if (bitcpy
== winsize
) {
765 ok window is filled so square as required and multiply
768 for (x
= 0; x
< winsize
; x
++) {
769 if ((err
= mp_sqr(pool
, &res
, &res
)) != MP_OKAY
) {
772 if ((err
= redux(&res
, P
, mp
)) != MP_OKAY
) {
778 if ((err
= mp_mul(pool
, &res
, &M
[bitbuf
], &res
)) != MP_OKAY
) {
781 if ((err
= redux(&res
, P
, mp
)) != MP_OKAY
) {
786 empty window and reset
795 if bits remain then square/multiply
797 if (mode
== 2 && bitcpy
> 0) {
798 /* square then multiply if the bit is set */
799 for (x
= 0; x
< bitcpy
; x
++) {
800 if ((err
= mp_sqr(pool
, &res
, &res
)) != MP_OKAY
) {
803 if ((err
= redux(&res
, P
, mp
)) != MP_OKAY
) {
808 get next bit of the window
811 if ((bitbuf
& (1 << winsize
)) != 0) {
815 if ((err
= mp_mul(pool
, &res
, &M
[1], &res
)) != MP_OKAY
) {
818 if ((err
= redux(&res
, P
, mp
)) != MP_OKAY
) {
826 fixup result if Montgomery reduction is used
827 recall that any value in a Montgomery system is
828 actually multiplied by R mod n. So we have
829 to reduce one more time to cancel out the factor of R.
831 if ((err
= redux(&res
, P
, mp
)) != MP_OKAY
) {
840 LBL_RES
:mp_clear(&res
);
843 for (x
= 1<<(winsize
-1); x
< (1 << winsize
); x
++) {
849 /******************************************************************************/
853 int32
mp_grow (mp_int
* a
, int32 size
)
859 If the alloc size is smaller alloc more ram.
861 if (a
->alloc
< size
) {
863 ensure there are always at least MP_PREC digits extra on top
865 size
+= (MP_PREC
* 2) - (size
% MP_PREC
);
868 Reallocate the array a->dp
870 We store the return in a temporary variable in case the operation
871 failed we don't want to overwrite the dp member of a.
873 tmp
= OPT_CAST(mp_digit
) psRealloc(a
->dp
, sizeof (mp_digit
) * size
);
876 reallocation failed but "a" is still valid [can be freed]
882 reallocation succeeded so set a->dp
891 for (; i
< a
->alloc
; i
++) {
898 /******************************************************************************/
902 Simple function copies the input and fixes the sign to positive
904 int32
mp_abs (mp_int
* a
, mp_int
* b
)
912 if ((res
= mp_copy (a
, b
)) != MP_OKAY
) {
918 Force the sign of b to positive
925 /******************************************************************************/
927 Creates "a" then copies b into it
929 int32
mp_init_copy(psPool_t
*pool
, mp_int
* a
, mp_int
* b
)
933 if ((res
= mp_init(pool
, a
)) != MP_OKAY
) {
936 return mp_copy (b
, a
);
939 /******************************************************************************/
941 Reverse an array, used for radix code
943 void bn_reverse (unsigned char *s
, int32 len
)
959 /******************************************************************************/
961 Shift right by a certain bit count (store quotient in c, optional
964 int32
mp_div_2d(psPool_t
*pool
, mp_int
* a
, int32 b
, mp_int
* c
, mp_int
* d
)
971 If the shift count is <= 0 then we do no work
974 res
= mp_copy (a
, c
);
981 if ((res
= mp_init(pool
, &t
)) != MP_OKAY
) {
989 if ((res
= mp_mod_2d (a
, b
, &t
)) != MP_OKAY
) {
996 if ((res
= mp_copy (a
, c
)) != MP_OKAY
) {
1002 Shift by as many digits in the bit count
1004 if (b
>= (int32
)DIGIT_BIT
) {
1005 mp_rshd (c
, b
/ DIGIT_BIT
);
1008 /* shift any bit count < DIGIT_BIT */
1009 D
= (mp_digit
) (b
% DIGIT_BIT
);
1011 register mp_digit
*tmpc
, mask
, shift
;
1014 mask
= (((mp_digit
)1) << D
) - 1;
1017 shift
= DIGIT_BIT
- D
;
1020 tmpc
= c
->dp
+ (c
->used
- 1);
1024 for (x
= c
->used
- 1; x
>= 0; x
--) {
1026 Get the lower bits of this word in a temp.
1031 shift the current word and mix in the carry bits from the previous word
1033 *tmpc
= (*tmpc
>> D
) | (r
<< shift
);
1037 set the carry to the carry bits of the current word found above
1050 /******************************************************************************/
1054 int32
mp_copy (mp_int
* a
, mp_int
* b
)
1059 If dst == src do nothing
1068 if (b
->alloc
< a
->used
) {
1069 if ((res
= mp_grow (b
, a
->used
)) != MP_OKAY
) {
1075 Zero b and copy the parameters over
1078 register mp_digit
*tmpa
, *tmpb
;
1080 /* pointer aliases */
1087 /* copy all the digits */
1088 for (n
= 0; n
< a
->used
; n
++) {
1092 /* clear high digits */
1093 for (; n
< b
->used
; n
++) {
1099 copy used count and sign
1106 /******************************************************************************/
1108 Returns the number of bits in an int32
1110 int32
mp_count_bits (mp_int
* a
)
1123 Get number of digits and add that.
1125 r
= (a
->used
- 1) * DIGIT_BIT
;
1128 Take the last digit and count the bits in it.
1130 q
= a
->dp
[a
->used
- 1];
1131 while (q
> ((mp_digit
) 0)) {
1133 q
>>= ((mp_digit
) 1);
1138 /******************************************************************************/
1140 Shift left a certain amount of digits.
1142 int32
mp_lshd (mp_int
* a
, int32 b
)
1147 If its less than zero return.
1154 Grow to fit the new digits.
1156 if (a
->alloc
< a
->used
+ b
) {
1157 if ((res
= mp_grow (a
, a
->used
+ b
)) != MP_OKAY
) {
1163 register mp_digit
*top
, *bottom
;
1166 Increment the used by the shift amount then copy upwards.
1171 top
= a
->dp
+ a
->used
- 1;
1174 bottom
= a
->dp
+ a
->used
- 1 - b
;
1177 Much like mp_rshd this is implemented using a sliding window
1178 except the window goes the otherway around. Copying from
1179 the bottom to the top. see bn_mp_rshd.c for more info.
1181 for (x
= a
->used
- 1; x
>= b
; x
--) {
1185 /* zero the lower digits */
1187 for (x
= 0; x
< b
; x
++) {
1194 /******************************************************************************/
1198 void mp_set (mp_int
* a
, mp_digit b
)
1201 a
->dp
[0] = b
& MP_MASK
;
1202 a
->used
= (a
->dp
[0] != 0) ? 1 : 0;
1205 /******************************************************************************/
1207 Swap the elements of two integers, for cases where you can't simply swap
1208 the mp_int pointers around
1210 void mp_exch (mp_int
* a
, mp_int
* b
)
1219 /******************************************************************************/
1221 High level multiplication (handles sign)
1223 int32
mp_mul(psPool_t
*pool
, mp_int
* a
, mp_int
* b
, mp_int
* c
)
1226 int32 digs
= a
->used
+ b
->used
+ 1;
1228 neg
= (a
->sign
== b
->sign
) ? MP_ZPOS
: MP_NEG
;
1230 /* Can we use the fast multiplier?
1232 The fast multiplier can be used if the output will have less than
1233 MP_WARRAY digits and the number of digits won't affect carry propagation
1235 if ((digs
< MP_WARRAY
) && MIN(a
->used
, b
->used
) <=
1236 (1 << ((CHAR_BIT
* sizeof (mp_word
)) - (2 * DIGIT_BIT
)))) {
1237 res
= fast_s_mp_mul_digs(pool
, a
, b
, c
, digs
);
1239 res
= s_mp_mul(pool
, a
, b
, c
);
1241 c
->sign
= (c
->used
> 0) ? neg
: MP_ZPOS
;
1245 /******************************************************************************/
1247 c = a mod b, 0 <= c < b
1249 int32
mp_mod(psPool_t
*pool
, mp_int
* a
, mp_int
* b
, mp_int
* c
)
1254 if ((res
= mp_init(pool
, &t
)) != MP_OKAY
) {
1258 if ((res
= mp_div (pool
, a
, b
, NULL
, &t
)) != MP_OKAY
) {
1263 if (t
.sign
!= b
->sign
) {
1264 res
= mp_add (b
, &t
, c
);
1274 /******************************************************************************/
1276 shifts with subtractions when the result is greater than b.
1278 The method is slightly modified to shift B unconditionally upto just under
1279 the leading bit of b. This saves alot of multiple precision shifting.
1281 int32
mp_montgomery_calc_normalization (mp_int
* a
, mp_int
* b
)
1286 How many bits of last digit does b use
1288 bits
= mp_count_bits (b
) % DIGIT_BIT
;
1291 if ((res
= mp_2expt(a
, (b
->used
- 1) * DIGIT_BIT
+ bits
- 1)) != MP_OKAY
) {
1300 Now compute C = A * B mod b
1302 for (x
= bits
- 1; x
< (int32
)DIGIT_BIT
; x
++) {
1303 if ((res
= mp_mul_2(a
, a
)) != MP_OKAY
) {
1306 if (mp_cmp_mag(a
, b
) != MP_LT
) {
1307 if ((res
= s_mp_sub(a
, b
, a
)) != MP_OKAY
) {
1316 /******************************************************************************/
1320 int32
mp_mulmod(psPool_t
*pool
, mp_int
* a
, mp_int
* b
, mp_int
* c
, mp_int
* d
)
1325 if ((res
= mp_init(pool
, &t
)) != MP_OKAY
) {
1329 if ((res
= mp_mul (pool
, a
, b
, &t
)) != MP_OKAY
) {
1333 res
= mp_mod (pool
, &t
, c
, d
);
1338 /******************************************************************************/
1342 #ifdef USE_SMALL_WORD
1343 int32
mp_sqr (psPool_t
*pool
, mp_int
* a
, mp_int
* b
)
1348 Can we use the fast comba multiplier?
1350 if ((a
->used
* 2 + 1) < MP_WARRAY
&& a
->used
<
1351 (1 << (sizeof(mp_word
) * CHAR_BIT
- 2*DIGIT_BIT
- 1))) {
1352 res
= fast_s_mp_sqr (pool
, a
, b
);
1354 res
= s_mp_sqr (pool
, a
, b
);
1359 #endif /* USE_SMALL_WORD */
1361 /******************************************************************************/
1363 Computes xR**-1 == x (mod N) via Montgomery Reduction.
1365 This is an optimized implementation of montgomery_reduce
1366 which uses the comba method to quickly calculate the columns of the
1369 Based on Algorithm 14.32 on pp.601 of HAC.
1372 int32
fast_mp_montgomery_reduce (mp_int
* x
, mp_int
* n
, mp_digit rho
)
1374 int32 ix
, res
, olduse
;
1375 mp_word W
[MP_WARRAY
];
1385 if (x
->alloc
< n
->used
+ 1) {
1386 if ((res
= mp_grow(x
, n
->used
+ 1)) != MP_OKAY
) {
1392 First we have to get the digits of the input into
1393 an array of double precision words W[...]
1396 register mp_word
*_W
;
1397 register mp_digit
*tmpx
;
1400 Alias for the W[] array
1405 Alias for the digits of x
1410 Copy the digits of a into W[0..a->used-1]
1412 for (ix
= 0; ix
< x
->used
; ix
++) {
1417 Zero the high words of W[a->used..m->used*2]
1419 for (; ix
< n
->used
* 2 + 1; ix
++) {
1425 Now we proceed to zero successive digits from the least
1426 significant upwards.
1428 for (ix
= 0; ix
< n
->used
; ix
++) {
1432 We avoid a double precision multiplication (which isn't required) by
1433 casting the value down to a mp_digit. Note this requires that
1434 W[ix-1] have the carry cleared (see after the inner loop)
1436 register mp_digit mu
;
1437 mu
= (mp_digit
) (((W
[ix
] & MP_MASK
) * rho
) & MP_MASK
);
1440 a = a + mu * m * b**i
1442 This is computed in place and on the fly. The multiplication by b**i
1443 is handled by offseting which columns the results are added to.
1445 Note the comba method normally doesn't handle carries in the inner loop
1446 In this case we fix the carry from the previous column since the
1447 Montgomery reduction requires digits of the result (so far) [see above]
1448 to work. This is handled by fixing up one carry after the inner loop.
1449 The carry fixups are done in order so after these loops the first
1450 m->used words of W[] have the carries fixed
1454 register mp_digit
*tmpn
;
1455 register mp_word
*_W
;
1458 Alias for the digits of the modulus
1463 Alias for the columns set by an offset of ix
1470 for (iy
= 0; iy
< n
->used
; iy
++) {
1471 *_W
++ += ((mp_word
)mu
) * ((mp_word
)*tmpn
++);
1476 Now fix carry for next digit, W[ix+1]
1478 W
[ix
+ 1] += W
[ix
] >> ((mp_word
) DIGIT_BIT
);
1482 Now we have to propagate the carries and shift the words downward [all those
1483 least significant digits we zeroed].
1486 register mp_digit
*tmpx
;
1487 register mp_word
*_W
, *_W1
;
1490 Now fix rest of carries
1494 alias for current word
1499 alias for next word, where the carry goes
1503 for (; ix
<= n
->used
* 2 + 1; ix
++) {
1504 *_W
++ += *_W1
++ >> ((mp_word
) DIGIT_BIT
);
1508 copy out, A = A/b**n
1510 The result is A/b**n but instead of converting from an
1511 array of mp_word to mp_digit than calling mp_rshd
1512 we just copy them in the right order
1516 alias for destination word
1521 alias for shifted double precision result
1525 for (ix
= 0; ix
< n
->used
+ 1; ix
++) {
1526 *tmpx
++ = (mp_digit
)(*_W
++ & ((mp_word
) MP_MASK
));
1530 zero oldused digits, if the input a was larger than
1531 m->used+1 we'll have to clear the digits
1533 for (; ix
< olduse
; ix
++) {
1539 Set the max used and clamp
1541 x
->used
= n
->used
+ 1;
1545 if A >= m then A = A - m
1547 if (mp_cmp_mag(x
, n
) != MP_LT
) {
1548 return s_mp_sub(x
, n
, x
);
1553 /******************************************************************************/
1555 High level addition (handles signs)
1557 int32
mp_add (mp_int
* a
, mp_int
* b
, mp_int
* c
)
1562 Get sign of both inputs
1568 Handle two cases, not four.
1572 Both positive or both negative. Add their magnitudes, copy the sign.
1575 res
= s_mp_add (a
, b
, c
);
1578 One positive, the other negative. Subtract the one with the greater
1579 magnitude from the one of the lesser magnitude. The result gets the sign of
1580 the one with the greater magnitude.
1582 if (mp_cmp_mag (a
, b
) == MP_LT
) {
1584 res
= s_mp_sub (b
, a
, c
);
1587 res
= s_mp_sub (a
, b
, c
);
1593 /******************************************************************************/
1597 int32
mp_cmp_d (mp_int
* a
, mp_digit b
)
1600 Compare based on sign
1602 if (a
->sign
== MP_NEG
) {
1607 Compare based on magnitude
1614 Compare the only digit of a to b
1618 } else if (a
->dp
[0] < b
) {
1625 /******************************************************************************/
1629 int32
mp_div_2 (mp_int
* a
, mp_int
* b
)
1631 int32 x
, res
, oldused
;
1636 if (b
->alloc
< a
->used
) {
1637 if ((res
= mp_grow (b
, a
->used
)) != MP_OKAY
) {
1645 register mp_digit r
, rr
, *tmpa
, *tmpb
;
1650 tmpa
= a
->dp
+ b
->used
- 1;
1655 tmpb
= b
->dp
+ b
->used
- 1;
1661 for (x
= b
->used
- 1; x
>= 0; x
--) {
1663 Get the carry for the next iteration
1668 Shift the current digit, add in carry and store
1670 *tmpb
-- = (*tmpa
-- >> 1) | (r
<< (DIGIT_BIT
- 1));
1672 Forward carry to next iteration
1680 tmpb
= b
->dp
+ b
->used
;
1681 for (x
= b
->used
; x
< oldused
; x
++) {
1690 /******************************************************************************/
1692 Computes xR**-1 == x (mod N) via Montgomery Reduction
1694 #ifdef USE_SMALL_WORD
1695 int32
mp_montgomery_reduce (mp_int
* x
, mp_int
* n
, mp_digit rho
)
1697 int32 ix
, res
, digs
;
1700 /* Can the fast reduction [comba] method be used?
1702 Note that unlike in mul you're safely allowed *less* than the available
1703 columns [255 per default] since carries are fixed up in the inner loop.
1705 digs
= n
->used
* 2 + 1;
1706 if ((digs
< MP_WARRAY
) &&
1708 (1 << ((CHAR_BIT
* sizeof (mp_word
)) - (2 * DIGIT_BIT
)))) {
1709 return fast_mp_montgomery_reduce (x
, n
, rho
);
1713 Grow the input as required.
1715 if (x
->alloc
< digs
) {
1716 if ((res
= mp_grow (x
, digs
)) != MP_OKAY
) {
1722 for (ix
= 0; ix
< n
->used
; ix
++) {
1726 The value of rho must be precalculated via mp_montgomery_setup()
1727 such that it equals -1/n0 mod b this allows the following inner
1728 loop to reduce the input one digit at a time
1730 mu
= (mp_digit
)(((mp_word
)x
->dp
[ix
]) * ((mp_word
)rho
) & MP_MASK
);
1732 /* a = a + mu * m * b**i */
1735 register mp_digit
*tmpn
, *tmpx
, u
;
1739 alias for digits of the modulus
1744 alias for the digits of x [the input]
1749 set the carry to zero
1754 Multiply and add in place
1756 for (iy
= 0; iy
< n
->used
; iy
++) {
1757 /* compute product and sum */
1758 r
= ((mp_word
)mu
) * ((mp_word
)*tmpn
++) +
1759 ((mp_word
) u
) + ((mp_word
) * tmpx
);
1762 u
= (mp_digit
)(r
>> ((mp_word
) DIGIT_BIT
));
1765 *tmpx
++ = (mp_digit
)(r
& ((mp_word
) MP_MASK
));
1767 /* At this point the ix'th digit of x should be zero */
1771 propagate carries upwards as required
1775 u
= *tmpx
>> DIGIT_BIT
;
1782 At this point the n.used'th least significant digits of x are all zero
1783 which means we can shift x to the right by n.used digits and the
1784 residue is unchanged.
1786 /* x = x/b**n.used */
1788 mp_rshd (x
, n
->used
);
1790 /* if x >= n then x = x - n */
1791 if (mp_cmp_mag (x
, n
) != MP_LT
) {
1792 return s_mp_sub (x
, n
, x
);
1797 #endif /* USE_SMALL_WORD */
1799 /******************************************************************************/
1801 Setups the montgomery reduction stuff.
1803 int32
mp_montgomery_setup (mp_int
* n
, mp_digit
* rho
)
1808 fast inversion mod 2**k
1810 Based on the fact that
1812 XA = 1 (mod 2**n) => (X(2-XA)) A = 1 (mod 2**2n)
1813 => 2*X*A - X*X*A*A = 1
1822 x
= (((b
+ 2) & 4) << 1) + b
; /* here x*a==1 mod 2**4 */
1823 x
= (x
* (2 - b
* x
)) & MP_MASK
; /* here x*a==1 mod 2**8 */
1824 #if !defined(MP_8BIT)
1825 x
= (x
* (2 - b
* x
)) & MP_MASK
; /* here x*a==1 mod 2**8 */
1826 #endif /* MP_8BIT */
1827 #if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT))
1828 x
*= 2 - b
* x
; /* here x*a==1 mod 2**32 */
1831 x
*= 2 - b
* x
; /* here x*a==1 mod 2**64 */
1832 #endif /* MP_64BIT */
1834 /* rho = -1/m mod b */
1835 *rho
= (((mp_word
) 1 << ((mp_word
) DIGIT_BIT
)) - x
) & MP_MASK
;
1840 /******************************************************************************/
1842 High level subtraction (handles signs)
1844 int32
mp_sub (mp_int
* a
, mp_int
* b
, mp_int
* c
)
1853 Subtract a negative from a positive, OR subtract a positive from a
1854 negative. In either case, ADD their magnitudes, and use the sign of
1858 res
= s_mp_add (a
, b
, c
);
1861 Subtract a positive from a positive, OR subtract a negative
1862 from a negative. First, take the difference between their
1865 if (mp_cmp_mag (a
, b
) != MP_LT
) {
1867 Copy the sign from the first
1870 /* The first has a larger or equal magnitude */
1871 res
= s_mp_sub (a
, b
, c
);
1874 The result has the *opposite* sign from the first number.
1876 c
->sign
= (sa
== MP_ZPOS
) ? MP_NEG
: MP_ZPOS
;
1878 The second has a larger magnitude
1880 res
= s_mp_sub (b
, a
, c
);
1886 /******************************************************************************/
1888 calc a value mod 2**b
1890 int32
mp_mod_2d (mp_int
* a
, int32 b
, mp_int
* c
)
1895 if b is <= 0 then zero the int32
1903 If the modulus is larger than the value than return
1905 if (b
>=(int32
) (a
->used
* DIGIT_BIT
)) {
1906 res
= mp_copy (a
, c
);
1911 if ((res
= mp_copy (a
, c
)) != MP_OKAY
) {
1916 Zero digits above the last digit of the modulus
1918 for (x
= (b
/ DIGIT_BIT
) + ((b
% DIGIT_BIT
) == 0 ? 0 : 1); x
< c
->used
; x
++) {
1922 Clear the digit that is not completely outside/inside the modulus
1924 c
->dp
[b
/ DIGIT_BIT
] &=
1925 (mp_digit
) ((((mp_digit
) 1) << (((mp_digit
) b
) % DIGIT_BIT
)) - ((mp_digit
) 1));
1930 /******************************************************************************/
1932 Shift right a certain amount of digits.
1934 void mp_rshd (mp_int
* a
, int32 b
)
1939 If b <= 0 then ignore it
1946 If b > used then simply zero it and return.
1954 register mp_digit
*bottom
, *top
;
1957 Shift the digits down
1962 /* top [offset into digits] */
1966 This is implemented as a sliding window where the window is b-digits long
1967 and digits from the top of the window are copied to the bottom.
1971 b-2 | b-1 | b0 | b1 | b2 | ... | bb | ---->
1973 \-------------------/ ---->
1975 for (x
= 0; x
< (a
->used
- b
); x
++) {
1982 for (; x
< a
->used
; x
++) {
1988 Remove excess digits
1993 /******************************************************************************/
1995 Low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9
1997 int32
s_mp_sub (mp_int
* a
, mp_int
* b
, mp_int
* c
)
1999 int32 olduse
, res
, min
, max
;
2010 if (c
->alloc
< max
) {
2011 if ((res
= mp_grow (c
, max
)) != MP_OKAY
) {
2019 register mp_digit u
, *tmpa
, *tmpb
, *tmpc
;
2023 alias for digit pointers
2033 for (i
= 0; i
< min
; i
++) {
2034 /* T[i] = A[i] - B[i] - U */
2035 *tmpc
= *tmpa
++ - *tmpb
++ - u
;
2038 U = carry bit of T[i]
2039 Note this saves performing an AND operation since if a carry does occur it
2040 will propagate all the way to the MSB. As a result a single shift
2041 is enough to get the carry
2043 u
= *tmpc
>> ((mp_digit
)(CHAR_BIT
* sizeof (mp_digit
) - 1));
2045 /* Clear carry from T[i] */
2050 Now copy higher words if any, e.g. if A has more digits than B
2052 for (; i
< max
; i
++) {
2053 /* T[i] = A[i] - U */
2054 *tmpc
= *tmpa
++ - u
;
2056 /* U = carry bit of T[i] */
2057 u
= *tmpc
>> ((mp_digit
)(CHAR_BIT
* sizeof (mp_digit
) - 1));
2059 /* Clear carry from T[i] */
2064 Clear digits above used (since we may not have grown result above)
2066 for (i
= c
->used
; i
< olduse
; i
++) {
2074 /******************************************************************************/
2076 integer signed division.
2078 c*b + d == a [e.g. a/b, c=quotient, d=remainder]
2079 HAC pp.598 Algorithm 14.20
2081 Note that the description in HAC is horribly incomplete. For example,
2082 it doesn't consider the case where digits are removed from 'x' in the inner
2083 loop. It also doesn't consider the case that y has fewer than three
2086 The overall algorithm is as described as 14.20 from HAC but fixed to
2090 int32
mp_div(psPool_t
*pool
, mp_int
* a
, mp_int
* b
, mp_int
* c
, mp_int
* d
)
2092 mp_int ta
, tb
, tq
, q
;
2098 if (mp_iszero (b
) == 1) {
2103 if a < b then q=0, r = a
2105 if (mp_cmp_mag (a
, b
) == MP_LT
) {
2107 res
= mp_copy (a
, d
);
2120 if ((res
= _mp_init_multi(pool
, &ta
, &tb
, &tq
, &q
, NULL
, NULL
, NULL
, NULL
) != MP_OKAY
)) {
2125 tq = 2^n, tb == b*2^n
2128 n
= mp_count_bits(a
) - mp_count_bits(b
);
2129 if (((res
= mp_abs(a
, &ta
)) != MP_OKAY
) ||
2130 ((res
= mp_abs(b
, &tb
)) != MP_OKAY
) ||
2131 ((res
= mp_mul_2d(&tb
, n
, &tb
)) != MP_OKAY
) ||
2132 ((res
= mp_mul_2d(&tq
, n
, &tq
)) != MP_OKAY
)) {
2136 if (((res = mp_copy(a, &ta)) != MP_OKAY) ||
2137 ((res = mp_copy(b, &tb)) != MP_OKAY) ||
2138 ((res = mp_mul_2d(&tb, n, &tb)) != MP_OKAY) ||
2139 ((res = mp_mul_2d(&tq, n, &tq)) != MP_OKAY)) {
2144 if (mp_cmp(&tb
, &ta
) != MP_GT
) {
2145 if (((res
= mp_sub(&ta
, &tb
, &ta
)) != MP_OKAY
) ||
2146 ((res
= mp_add(&q
, &tq
, &q
)) != MP_OKAY
)) {
2150 if (((res
= mp_div_2d(pool
, &tb
, 1, &tb
, NULL
)) != MP_OKAY
) ||
2151 ((res
= mp_div_2d(pool
, &tq
, 1, &tq
, NULL
)) != MP_OKAY
)) {
2157 now q == quotient and ta == remainder
2160 n2
= (a
->sign
== b
->sign
? MP_ZPOS
: MP_NEG
);
2163 c
->sign
= (mp_iszero(c
) == MP_YES
) ? MP_ZPOS
: n2
;
2167 d
->sign
= (mp_iszero(d
) == MP_YES
) ? MP_ZPOS
: n
;
2170 _mp_clear_multi(&ta
, &tb
, &tq
, &q
, NULL
, NULL
, NULL
, NULL
);
2173 #else /* MP_DIV_SMALL */
2175 int32
mp_div(psPool_t
*pool
, mp_int
* a
, mp_int
* b
, mp_int
* c
, mp_int
* d
)
2177 mp_int q
, x
, y
, t1
, t2
;
2178 int32 res
, n
, t
, i
, norm
, neg
;
2183 if (mp_iszero(b
) == 1) {
2188 if a < b then q=0, r = a
2190 if (mp_cmp_mag(a
, b
) == MP_LT
) {
2192 res
= mp_copy(a
, d
);
2202 if ((res
= mp_init_size(pool
, &q
, a
->used
+ 2)) != MP_OKAY
) {
2205 q
.used
= a
->used
+ 2;
2207 if ((res
= mp_init(pool
, &t1
)) != MP_OKAY
) {
2211 if ((res
= mp_init(pool
, &t2
)) != MP_OKAY
) {
2215 if ((res
= mp_init_copy(pool
, &x
, a
)) != MP_OKAY
) {
2219 if ((res
= mp_init_copy(pool
, &y
, b
)) != MP_OKAY
) {
2226 neg
= (a
->sign
== b
->sign
) ? MP_ZPOS
: MP_NEG
;
2227 x
.sign
= y
.sign
= MP_ZPOS
;
2230 normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT]
2232 norm
= mp_count_bits(&y
) % DIGIT_BIT
;
2233 if (norm
< (int32
)(DIGIT_BIT
-1)) {
2234 norm
= (DIGIT_BIT
-1) - norm
;
2235 if ((res
= mp_mul_2d(&x
, norm
, &x
)) != MP_OKAY
) {
2238 if ((res
= mp_mul_2d(&y
, norm
, &y
)) != MP_OKAY
) {
2246 note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4
2252 while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} }
2254 if ((res
= mp_lshd(&y
, n
- t
)) != MP_OKAY
) { /* y = y*b**{n-t} */
2258 while (mp_cmp(&x
, &y
) != MP_LT
) {
2260 if ((res
= mp_sub(&x
, &y
, &x
)) != MP_OKAY
) {
2266 reset y by shifting it back down
2271 step 3. for i from n down to (t + 1)
2273 for (i
= n
; i
>= (t
+ 1); i
--) {
2279 step 3.1 if xi == yt then set q{i-t-1} to b-1,
2280 otherwise set q{i-t-1} to (xi*b + x{i-1})/yt
2282 if (x
.dp
[i
] == y
.dp
[t
]) {
2283 q
.dp
[i
- t
- 1] = ((((mp_digit
)1) << DIGIT_BIT
) - 1);
2286 tmp
= ((mp_word
) x
.dp
[i
]) << ((mp_word
) DIGIT_BIT
);
2287 tmp
|= ((mp_word
) x
.dp
[i
- 1]);
2288 tmp
/= ((mp_word
) y
.dp
[t
]);
2289 if (tmp
> (mp_word
) MP_MASK
) {
2292 q
.dp
[i
- t
- 1] = (mp_digit
) (tmp
& (mp_word
) (MP_MASK
));
2296 while (q{i-t-1} * (yt * b + y{t-1})) >
2297 xi * b**2 + xi-1 * b + xi-2
2301 q
.dp
[i
- t
- 1] = (q
.dp
[i
- t
- 1] + 1) & MP_MASK
;
2303 q
.dp
[i
- t
- 1] = (q
.dp
[i
- t
- 1] - 1) & MP_MASK
;
2309 t1
.dp
[0] = (t
- 1 < 0) ? 0 : y
.dp
[t
- 1];
2312 if ((res
= mp_mul_d (&t1
, q
.dp
[i
- t
- 1], &t1
)) != MP_OKAY
) {
2319 t2
.dp
[0] = (i
- 2 < 0) ? 0 : x
.dp
[i
- 2];
2320 t2
.dp
[1] = (i
- 1 < 0) ? 0 : x
.dp
[i
- 1];
2323 } while (mp_cmp_mag(&t1
, &t2
) == MP_GT
);
2326 step 3.3 x = x - q{i-t-1} * y * b**{i-t-1}
2328 if ((res
= mp_mul_d(&y
, q
.dp
[i
- t
- 1], &t1
)) != MP_OKAY
) {
2332 if ((res
= mp_lshd(&t1
, i
- t
- 1)) != MP_OKAY
) {
2336 if ((res
= mp_sub(&x
, &t1
, &x
)) != MP_OKAY
) {
2341 if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; }
2343 if (x
.sign
== MP_NEG
) {
2344 if ((res
= mp_copy(&y
, &t1
)) != MP_OKAY
) {
2347 if ((res
= mp_lshd (&t1
, i
- t
- 1)) != MP_OKAY
) {
2350 if ((res
= mp_add (&x
, &t1
, &x
)) != MP_OKAY
) {
2354 q
.dp
[i
- t
- 1] = (q
.dp
[i
- t
- 1] - 1UL) & MP_MASK
;
2359 now q is the quotient and x is the remainder
2360 [which we have to normalize]
2364 get sign before writing to c
2366 x
.sign
= x
.used
== 0 ? MP_ZPOS
: a
->sign
;
2375 mp_div_2d(pool
, &x
, norm
, &x
, NULL
);
2381 LBL_Y
:mp_clear (&y
);
2382 LBL_X
:mp_clear (&x
);
2383 LBL_T2
:mp_clear (&t2
);
2384 LBL_T1
:mp_clear (&t1
);
2385 LBL_Q
:mp_clear (&q
);
2388 #endif /* MP_DIV_SMALL */
2390 /******************************************************************************/
2392 multiplies |a| * |b| and only computes upto digs digits of result
2393 HAC pp. 595, Algorithm 14.12 Modified so you can control how many digits
2394 of output are created.
2396 #ifdef USE_SMALL_WORD
2397 int32
s_mp_mul_digs(psPool_t
*pool
, mp_int
* a
, mp_int
* b
, mp_int
* c
, int32 digs
)
2400 int32 res
, pa
, pb
, ix
, iy
;
2403 mp_digit tmpx
, *tmpt
, *tmpy
;
2406 Can we use the fast multiplier?
2408 if (((digs
) < MP_WARRAY
) &&
2409 MIN (a
->used
, b
->used
) <
2410 (1 << ((CHAR_BIT
* sizeof (mp_word
)) - (2 * DIGIT_BIT
)))) {
2411 return fast_s_mp_mul_digs (pool
, a
, b
, c
, digs
);
2414 if ((res
= mp_init_size(pool
, &t
, digs
)) != MP_OKAY
) {
2420 Compute the digits of the product directly
2423 for (ix
= 0; ix
< pa
; ix
++) {
2424 /* set the carry to zero */
2428 Limit ourselves to making digs digits of output.
2430 pb
= MIN (b
->used
, digs
- ix
);
2433 Setup some aliases. Copy of the digit from a used
2434 within the nested loop
2439 An alias for the destination shifted ix places
2444 An alias for the digits of b
2449 Compute the columns of the output and propagate the carry
2451 for (iy
= 0; iy
< pb
; iy
++) {
2452 /* compute the column as a mp_word */
2453 r
= ((mp_word
)*tmpt
) +
2454 ((mp_word
)tmpx
) * ((mp_word
)*tmpy
++) +
2457 /* the new column is the lower part of the result */
2458 *tmpt
++ = (mp_digit
) (r
& ((mp_word
) MP_MASK
));
2460 /* get the carry word from the result */
2461 u
= (mp_digit
) (r
>> ((mp_word
) DIGIT_BIT
));
2464 Set carry if it is placed below digs
2466 if (ix
+ iy
< digs
) {
2477 #endif /* USE_SMALL_WORD */
2479 /******************************************************************************/
2481 Fast (comba) multiplier
2483 This is the fast column-array [comba] multiplier. It is designed to
2484 compute the columns of the product first then handle the carries afterwards.
2485 This has the effect of making the nested loops that compute the columns
2486 very simple and schedulable on super-scalar processors.
2488 This has been modified to produce a variable number of digits of output so
2489 if say only a half-product is required you don't have to compute the upper
2490 half (a feature required for fast Barrett reduction).
2492 Based on Algorithm 14.12 on pp.595 of HAC.
2495 int32
fast_s_mp_mul_digs(psPool_t
*pool
, mp_int
* a
, mp_int
* b
, mp_int
* c
,
2498 int32 olduse
, res
, pa
, ix
, iz
, neg
;
2499 mp_digit W
[MP_WARRAY
];
2500 register mp_word _W
;
2502 neg
= (a
->sign
== b
->sign
) ? MP_ZPOS
: MP_NEG
;
2505 grow the destination as required
2507 if (c
->alloc
< digs
) {
2508 if ((res
= mp_grow(c
, digs
)) != MP_OKAY
) {
2514 number of output digits to produce
2516 pa
= MIN(digs
, a
->used
+ b
->used
);
2522 for (ix
= 0; ix
< pa
; ix
++) {
2525 mp_digit
*tmpx
, *tmpy
;
2528 get offsets into the two bignums
2530 ty
= MIN(b
->used
-1, ix
);
2540 this is the number of times the loop will iterrate, essentially its
2541 while (tx++ < a->used && ty-- >= 0) { ... }
2543 iy
= MIN(a
->used
-tx
, ty
+1);
2548 for (iz
= 0; iz
< iy
; ++iz
) {
2549 _W
+= ((mp_word
)*tmpx
++)*((mp_word
)*tmpy
--);
2555 W
[ix
] = (mp_digit
)(_W
& MP_MASK
);
2560 _W
= _W
>> ((mp_word
)DIGIT_BIT
);
2566 W
[ix
] = (mp_digit
)(_W
& MP_MASK
);
2575 register mp_digit
*tmpc
;
2577 for (ix
= 0; ix
< pa
+1; ix
++) {
2579 now extract the previous digit [below the carry]
2585 clear unused digits [that existed in the old copy of c]
2587 for (; ix
< olduse
; ix
++) {
2592 c
->sign
= (c
->used
> 0) ? neg
: MP_ZPOS
;
2596 /******************************************************************************/
2600 int32
mp_mul_2 (mp_int
* a
, mp_int
* b
)
2602 int32 x
, res
, oldused
;
2605 grow to accomodate result
2607 if (b
->alloc
< a
->used
+ 1) {
2608 if ((res
= mp_grow (b
, a
->used
+ 1)) != MP_OKAY
) {
2617 register mp_digit r
, rr
, *tmpa
, *tmpb
;
2619 /* alias for source */
2622 /* alias for dest */
2627 for (x
= 0; x
< a
->used
; x
++) {
2630 get what will be the *next* carry bit from the MSB of the
2633 rr
= *tmpa
>> ((mp_digit
)(DIGIT_BIT
- 1));
2636 now shift up this digit, add in the carry [from the previous]
2638 *tmpb
++ = ((*tmpa
++ << ((mp_digit
)1)) | r
) & MP_MASK
;
2640 /* copy the carry that would be from the source digit into the next
2651 add a MSB which is always 1 at this point
2658 now zero any excess digits on the destination that we didn't write to
2660 tmpb
= b
->dp
+ b
->used
;
2661 for (x
= b
->used
; x
< oldused
; x
++) {
2669 /******************************************************************************/
2673 int32
mp_mul_d(mp_int
* a
, mp_digit b
, mp_int
* c
)
2675 mp_digit u
, *tmpa
, *tmpc
;
2677 int32 ix
, res
, olduse
;
2680 make sure c is big enough to hold a*b
2682 if (c
->alloc
< a
->used
+ 1) {
2683 if ((res
= mp_grow (c
, a
->used
+ 1)) != MP_OKAY
) {
2689 get the original destinations used count
2699 alias for a->dp [source]
2704 alias for c->dp [dest]
2711 /* compute columns */
2712 for (ix
= 0; ix
< a
->used
; ix
++) {
2714 compute product and carry sum for this term
2716 r
= ((mp_word
) u
) + ((mp_word
)*tmpa
++) * ((mp_word
)b
);
2719 mask off higher bits to get a single digit
2721 *tmpc
++ = (mp_digit
) (r
& ((mp_word
) MP_MASK
));
2724 send carry into next iteration
2726 u
= (mp_digit
) (r
>> ((mp_word
) DIGIT_BIT
));
2730 store final carry [if any] and increment ix offset
2736 now zero digits above the top
2738 while (ix
++ < olduse
) {
2742 /* set used count */
2743 c
->used
= a
->used
+ 1;
2749 /******************************************************************************/
2751 low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16
2753 #ifdef USE_SMALL_WORD
2754 int32
s_mp_sqr (psPool_t
*pool
, mp_int
* a
, mp_int
* b
)
2757 int32 res
, ix
, iy
, pa
;
2759 mp_digit u
, tmpx
, *tmpt
;
2762 if ((res
= mp_init_size(pool
, &t
, 2*pa
+ 1)) != MP_OKAY
) {
2767 default used is maximum possible size
2771 for (ix
= 0; ix
< pa
; ix
++) {
2773 first calculate the digit at 2*ix
2774 calculate double precision result
2776 r
= ((mp_word
) t
.dp
[2*ix
]) +
2777 ((mp_word
)a
->dp
[ix
])*((mp_word
)a
->dp
[ix
]);
2780 store lower part in result
2782 t
.dp
[ix
+ix
] = (mp_digit
) (r
& ((mp_word
) MP_MASK
));
2787 u
= (mp_digit
)(r
>> ((mp_word
) DIGIT_BIT
));
2790 left hand side of A[ix] * A[iy]
2795 alias for where to store the results
2797 tmpt
= t
.dp
+ (2*ix
+ 1);
2799 for (iy
= ix
+ 1; iy
< pa
; iy
++) {
2801 first calculate the product
2803 r
= ((mp_word
)tmpx
) * ((mp_word
)a
->dp
[iy
]);
2806 now calculate the double precision result, note we use addition
2807 instead of *2 since it's easier to optimize
2809 r
= ((mp_word
) *tmpt
) + r
+ r
+ ((mp_word
) u
);
2814 *tmpt
++ = (mp_digit
) (r
& ((mp_word
) MP_MASK
));
2817 u
= (mp_digit
)(r
>> ((mp_word
) DIGIT_BIT
));
2819 /* propagate upwards */
2820 while (u
!= ((mp_digit
) 0)) {
2821 r
= ((mp_word
) *tmpt
) + ((mp_word
) u
);
2822 *tmpt
++ = (mp_digit
) (r
& ((mp_word
) MP_MASK
));
2823 u
= (mp_digit
)(r
>> ((mp_word
) DIGIT_BIT
));
2832 #endif /* USE_SMALL_WORD */
2834 /******************************************************************************/
2838 This is the comba method where the columns of the product are computed
2839 first then the carries are computed. This has the effect of making a very
2840 simple inner loop that is executed the most
2842 W2 represents the outer products and W the inner.
2844 A further optimizations is made because the inner products are of the
2845 form "A * B * 2". The *2 part does not need to be computed until the end
2846 which is good because 64-bit shifts are slow!
2848 Based on Algorithm 14.16 on pp.597 of HAC.
2850 This is the 1.0 version, but no SSE stuff
2852 int32
fast_s_mp_sqr(psPool_t
*pool
, mp_int
* a
, mp_int
* b
)
2854 int32 olduse
, res
, pa
, ix
, iz
;
2855 mp_digit W
[MP_WARRAY
], *tmpx
;
2859 grow the destination as required
2861 pa
= a
->used
+ a
->used
;
2862 if (b
->alloc
< pa
) {
2863 if ((res
= mp_grow(b
, pa
)) != MP_OKAY
) {
2869 number of output digits to produce
2872 for (ix
= 0; ix
< pa
; ix
++) {
2883 get offsets into the two bignums
2885 ty
= MIN(a
->used
-1, ix
);
2895 this is the number of times the loop will iterrate, essentially
2896 while (tx++ < a->used && ty-- >= 0) { ... }
2898 iy
= MIN(a
->used
-tx
, ty
+1);
2901 now for squaring tx can never equal ty
2902 we halve the distance since they approach at a rate of 2x
2903 and we have to round because odd cases need to be executed
2905 iy
= MIN(iy
, (ty
-tx
+1)>>1);
2910 for (iz
= 0; iz
< iy
; iz
++) {
2911 _W
+= ((mp_word
)*tmpx
++)*((mp_word
)*tmpy
--);
2915 double the inner product and add carry
2920 even columns have the square term in them
2923 _W
+= ((mp_word
)a
->dp
[ix
>>1])*((mp_word
)a
->dp
[ix
>>1]);
2929 W
[ix
] = (mp_digit
)(_W
& MP_MASK
);
2934 W1
= _W
>> ((mp_word
)DIGIT_BIT
);
2941 b
->used
= a
->used
+a
->used
;
2946 for (ix
= 0; ix
< pa
; ix
++) {
2947 *tmpb
++ = W
[ix
] & MP_MASK
;
2951 clear unused digits [that existed in the old copy of c]
2953 for (; ix
< olduse
; ix
++) {
2961 /******************************************************************************/
2965 Simple algorithm which zeroes the int32, grows it then just sets one bit
2968 int32
mp_2expt (mp_int
* a
, int32 b
)
2973 zero a as per default
2978 grow a to accomodate the single bit
2980 if ((res
= mp_grow (a
, b
/ DIGIT_BIT
+ 1)) != MP_OKAY
) {
2985 set the used count of where the bit will go
2987 a
->used
= b
/ DIGIT_BIT
+ 1;
2990 put the single bit in its place
2992 a
->dp
[b
/ DIGIT_BIT
] = ((mp_digit
)1) << (b
% DIGIT_BIT
);
2997 /******************************************************************************/
2999 init an mp_init for a given size
3001 int32
mp_init_size(psPool_t
*pool
, mp_int
* a
, int32 size
)
3005 pad size so there are always extra digits
3007 size
+= (MP_PREC
* 2) - (size
% MP_PREC
);
3012 a
->dp
= OPT_CAST(mp_digit
) psMalloc(pool
, sizeof (mp_digit
) * size
);
3013 if (a
->dp
== NULL
) {
3023 for (x
= 0; x
< size
; x
++) {
3029 /******************************************************************************/
3031 low level addition, based on HAC pp.594, Algorithm 14.7
3033 int32
s_mp_add (mp_int
* a
, mp_int
* b
, mp_int
* c
)
3036 int32 olduse
, res
, min
, max
;
3039 find sizes, we let |a| <= |b| which means we have to sort them. "x" will
3040 point to the input with the most digits
3042 if (a
->used
> b
->used
) {
3053 if (c
->alloc
< max
+ 1) {
3054 if ((res
= mp_grow (c
, max
+ 1)) != MP_OKAY
) {
3060 get old used digit count and set new one
3066 register mp_digit u
, *tmpa
, *tmpb
, *tmpc
;
3069 /* alias for digit pointers */
3080 /* zero the carry */
3082 for (i
= 0; i
< min
; i
++) {
3084 Compute the sum at one digit, T[i] = A[i] + B[i] + U
3086 *tmpc
= *tmpa
++ + *tmpb
++ + u
;
3089 U = carry bit of T[i]
3091 u
= *tmpc
>> ((mp_digit
)DIGIT_BIT
);
3094 take away carry bit from T[i]
3100 now copy higher words if any, that is in A+B if A or B has more digits add
3104 for (; i
< max
; i
++) {
3105 /* T[i] = X[i] + U */
3106 *tmpc
= x
->dp
[i
] + u
;
3108 /* U = carry bit of T[i] */
3109 u
= *tmpc
>> ((mp_digit
)DIGIT_BIT
);
3111 /* take away carry bit from T[i] */
3120 clear digits above oldused
3122 for (i
= c
->used
; i
< olduse
; i
++) {
3131 /******************************************************************************/
3133 FUTURE - this is only needed by the SSH code, SLOW or not, because RSA
3134 exponents are always odd.
3136 int32
mp_invmodSSH(psPool_t
*pool
, mp_int
* a
, mp_int
* b
, mp_int
* c
)
3138 mp_int x
, y
, u
, v
, A
, B
, C
, D
;
3142 b cannot be negative
3144 if (b
->sign
== MP_NEG
|| mp_iszero(b
) == 1) {
3149 if the modulus is odd we can use a faster routine instead
3151 if (mp_isodd (b
) == 1) {
3152 return fast_mp_invmod(pool
, a
, b
, c
);
3158 if ((res
= _mp_init_multi(pool
, &x
, &y
, &u
, &v
,
3159 &A
, &B
, &C
, &D
)) != MP_OKAY
) {
3164 if ((res
= mp_copy(a
, &x
)) != MP_OKAY
) {
3167 if ((res
= mp_copy(b
, &y
)) != MP_OKAY
) {
3172 2. [modified] if x,y are both even then return an error!
3174 if (mp_iseven(&x
) == 1 && mp_iseven (&y
) == 1) {
3180 3. u=x, v=y, A=1, B=0, C=0,D=1
3182 if ((res
= mp_copy(&x
, &u
)) != MP_OKAY
) {
3185 if ((res
= mp_copy(&y
, &v
)) != MP_OKAY
) {
3193 4. while u is even do
3195 while (mp_iseven(&u
) == 1) {
3197 if ((res
= mp_div_2(&u
, &u
)) != MP_OKAY
) {
3200 /* 4.2 if A or B is odd then */
3201 if (mp_isodd (&A
) == 1 || mp_isodd (&B
) == 1) {
3202 /* A = (A+y)/2, B = (B-x)/2 */
3203 if ((res
= mp_add(&A
, &y
, &A
)) != MP_OKAY
) {
3206 if ((res
= mp_sub(&B
, &x
, &B
)) != MP_OKAY
) {
3210 /* A = A/2, B = B/2 */
3211 if ((res
= mp_div_2(&A
, &A
)) != MP_OKAY
) {
3214 if ((res
= mp_div_2(&B
, &B
)) != MP_OKAY
) {
3220 5. while v is even do
3222 while (mp_iseven(&v
) == 1) {
3224 if ((res
= mp_div_2(&v
, &v
)) != MP_OKAY
) {
3227 /* 5.2 if C or D is odd then */
3228 if (mp_isodd(&C
) == 1 || mp_isodd (&D
) == 1) {
3229 /* C = (C+y)/2, D = (D-x)/2 */
3230 if ((res
= mp_add(&C
, &y
, &C
)) != MP_OKAY
) {
3233 if ((res
= mp_sub(&D
, &x
, &D
)) != MP_OKAY
) {
3237 /* C = C/2, D = D/2 */
3238 if ((res
= mp_div_2(&C
, &C
)) != MP_OKAY
) {
3241 if ((res
= mp_div_2(&D
, &D
)) != MP_OKAY
) {
3249 if (mp_cmp(&u
, &v
) != MP_LT
) {
3250 /* u = u - v, A = A - C, B = B - D */
3251 if ((res
= mp_sub(&u
, &v
, &u
)) != MP_OKAY
) {
3255 if ((res
= mp_sub(&A
, &C
, &A
)) != MP_OKAY
) {
3259 if ((res
= mp_sub(&B
, &D
, &B
)) != MP_OKAY
) {
3263 /* v - v - u, C = C - A, D = D - B */
3264 if ((res
= mp_sub(&v
, &u
, &v
)) != MP_OKAY
) {
3268 if ((res
= mp_sub(&C
, &A
, &C
)) != MP_OKAY
) {
3272 if ((res
= mp_sub(&D
, &B
, &D
)) != MP_OKAY
) {
3278 if not zero goto step 4
3280 if (mp_iszero(&u
) == 0)
3284 now a = C, b = D, gcd == g*v
3288 if v != 1 then there is no inverse
3290 if (mp_cmp_d(&v
, 1) != MP_EQ
) {
3298 while (mp_cmp_d(&C
, 0) == MP_LT
) {
3299 if ((res
= mp_add(&C
, b
, &C
)) != MP_OKAY
) {
3307 while (mp_cmp_mag(&C
, b
) != MP_LT
) {
3308 if ((res
= mp_sub(&C
, b
, &C
)) != MP_OKAY
) {
3314 C is now the inverse
3318 LBL_ERR
:_mp_clear_multi(&x
, &y
, &u
, &v
, &A
, &B
, &C
, &D
);
3322 /******************************************************************************/
3325 * Computes the modular inverse via binary extended euclidean algorithm,
3326 * that is c = 1/a mod b
3328 * Based on slow invmod except this is optimized for the case where b is
3329 * odd as per HAC Note 14.64 on pp. 610
3331 int32
fast_mp_invmod(psPool_t
*pool
, mp_int
* a
, mp_int
* b
, mp_int
* c
)
3333 mp_int x
, y
, u
, v
, B
, D
;
3337 2. [modified] b must be odd
3339 if (mp_iseven (b
) == 1) {
3346 if ((res
= _mp_init_multi(pool
, &x
, &y
, &u
, &v
, &B
, &D
, NULL
, NULL
)) != MP_OKAY
) {
3351 x == modulus, y == value to invert
3353 if ((res
= mp_copy(b
, &x
)) != MP_OKAY
) {
3360 if ((res
= mp_mod(pool
, a
, b
, &y
)) != MP_OKAY
) {
3365 3. u=x, v=y, A=1, B=0, C=0,D=1
3367 if ((res
= mp_copy(&x
, &u
)) != MP_OKAY
) {
3370 if ((res
= mp_copy(&y
, &v
)) != MP_OKAY
) {
3377 4. while u is even do
3379 while (mp_iseven(&u
) == 1) {
3381 if ((res
= mp_div_2(&u
, &u
)) != MP_OKAY
) {
3384 /* 4.2 if B is odd then */
3385 if (mp_isodd(&B
) == 1) {
3386 if ((res
= mp_sub(&B
, &x
, &B
)) != MP_OKAY
) {
3391 if ((res
= mp_div_2(&B
, &B
)) != MP_OKAY
) {
3397 5. while v is even do
3399 while (mp_iseven(&v
) == 1) {
3401 if ((res
= mp_div_2(&v
, &v
)) != MP_OKAY
) {
3404 /* 5.2 if D is odd then */
3405 if (mp_isodd(&D
) == 1) {
3407 if ((res
= mp_sub(&D
, &x
, &D
)) != MP_OKAY
) {
3412 if ((res
= mp_div_2(&D
, &D
)) != MP_OKAY
) {
3420 if (mp_cmp(&u
, &v
) != MP_LT
) {
3421 /* u = u - v, B = B - D */
3422 if ((res
= mp_sub(&u
, &v
, &u
)) != MP_OKAY
) {
3426 if ((res
= mp_sub(&B
, &D
, &B
)) != MP_OKAY
) {
3430 /* v - v - u, D = D - B */
3431 if ((res
= mp_sub(&v
, &u
, &v
)) != MP_OKAY
) {
3435 if ((res
= mp_sub(&D
, &B
, &D
)) != MP_OKAY
) {
3441 if not zero goto step 4
3443 if (mp_iszero(&u
) == 0) {
3448 now a = C, b = D, gcd == g*v
3452 if v != 1 then there is no inverse
3454 if (mp_cmp_d(&v
, 1) != MP_EQ
) {
3460 b is now the inverse
3463 while (D
.sign
== MP_NEG
) {
3464 if ((res
= mp_add(&D
, b
, &D
)) != MP_OKAY
) {
3472 LBL_ERR
:_mp_clear_multi(&x
, &y
, &u
, &v
, &B
, &D
, NULL
, NULL
);
3476 /******************************************************************************/
3480 int32
mp_addmod (psPool_t
*pool
, mp_int
* a
, mp_int
* b
, mp_int
* c
, mp_int
* d
)
3485 if ((res
= mp_init(pool
, &t
)) != MP_OKAY
) {
3489 if ((res
= mp_add (a
, b
, &t
)) != MP_OKAY
) {
3493 res
= mp_mod (pool
, &t
, c
, d
);
3498 /******************************************************************************/
3502 int32
mp_shrink (mp_int
* a
)
3506 if (a
->alloc
!= a
->used
&& a
->used
> 0) {
3507 if ((tmp
= psRealloc(a
->dp
, sizeof (mp_digit
) * a
->used
)) == NULL
) {
3516 /* single digit subtraction */
3517 int32
mp_sub_d (mp_int
* a
, mp_digit b
, mp_int
* c
)
3519 mp_digit
*tmpa
, *tmpc
, mu
;
3520 int32 res
, ix
, oldused
;
3522 /* grow c as required */
3523 if (c
->alloc
< a
->used
+ 1) {
3524 if ((res
= mp_grow(c
, a
->used
+ 1)) != MP_OKAY
) {
3529 /* if a is negative just do an unsigned
3530 * addition [with fudged signs]
3532 if (a
->sign
== MP_NEG
) {
3534 res
= mp_add_d(a
, b
, c
);
3535 a
->sign
= c
->sign
= MP_NEG
;
3544 /* if a <= b simply fix the single digit */
3545 if ((a
->used
== 1 && a
->dp
[0] <= b
) || a
->used
== 0) {
3547 *tmpc
++ = b
- *tmpa
;
3553 /* negative/1digit */
3561 /* subtract first digit */
3562 *tmpc
= *tmpa
++ - b
;
3563 mu
= *tmpc
>> (sizeof(mp_digit
) * CHAR_BIT
- 1);
3566 /* handle rest of the digits */
3567 for (ix
= 1; ix
< a
->used
; ix
++) {
3568 *tmpc
= *tmpa
++ - mu
;
3569 mu
= *tmpc
>> (sizeof(mp_digit
) * CHAR_BIT
- 1);
3574 /* zero excess digits */
3575 while (ix
++ < oldused
) {
3582 /* single digit addition */
3583 int32
mp_add_d (mp_int
* a
, mp_digit b
, mp_int
* c
)
3585 int32 res
, ix
, oldused
;
3586 mp_digit
*tmpa
, *tmpc
, mu
;
3588 /* grow c as required */
3589 if (c
->alloc
< a
->used
+ 1) {
3590 if ((res
= mp_grow(c
, a
->used
+ 1)) != MP_OKAY
) {
3595 /* if a is negative and |a| >= b, call c = |a| - b */
3596 if (a
->sign
== MP_NEG
&& (a
->used
> 1 || a
->dp
[0] >= b
)) {
3597 /* temporarily fix sign of a */
3601 res
= mp_sub_d(a
, b
, c
);
3604 a
->sign
= c
->sign
= MP_NEG
;
3608 /* old number of used digits in c */
3611 /* sign always positive */
3617 /* destination alias */
3620 /* if a is positive */
3621 if (a
->sign
== MP_ZPOS
) {
3622 /* add digit, after this we're propagating the carry */
3623 *tmpc
= *tmpa
++ + b
;
3624 mu
= *tmpc
>> DIGIT_BIT
;
3627 /* now handle rest of the digits */
3628 for (ix
= 1; ix
< a
->used
; ix
++) {
3629 *tmpc
= *tmpa
++ + mu
;
3630 mu
= *tmpc
>> DIGIT_BIT
;
3633 /* set final carry */
3638 c
->used
= a
->used
+ 1;
3640 /* a was negative and |a| < b */
3643 /* the result is a single digit */
3645 *tmpc
++ = b
- a
->dp
[0];
3650 /* setup count so the clearing of oldused
3651 * can fall through correctly
3656 /* now zero to oldused */
3657 while (ix
++ < oldused
) {
3665 /******************************************************************************/
3667 #endif /* USE_MPI2 */