update libressl to v2.7.4
[unleashed.git] / lib / libcrypto / ec / ecp_nistp256.c
blob6f3ec3c25055eb7d79af28a06ba43f2b54b8dc05
1 /* $OpenBSD: ecp_nistp256.c,v 1.18 2017/05/02 03:59:44 deraadt Exp $ */
2 /*
3 * Written by Adam Langley (Google) for the OpenSSL project
4 */
5 /*
6 * Copyright (c) 2011 Google Inc.
8 * Permission to use, copy, modify, and distribute this software for any
9 * purpose with or without fee is hereby granted, provided that the above
10 * copyright notice and this permission notice appear in all copies.
12 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
13 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
14 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
15 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
16 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
17 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
18 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22 * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
24 * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25 * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26 * work which got its smarts from Daniel J. Bernstein's work on the same.
29 #include <stdint.h>
30 #include <string.h>
32 #include <openssl/opensslconf.h>
34 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
36 #include <openssl/err.h>
37 #include "ec_lcl.h"
39 #if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
40 /* even with gcc, the typedef won't work for 32-bit platforms */
41 typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */
42 typedef __int128_t int128_t;
43 #else
44 #error "Need GCC 3.1 or later to define type uint128_t"
45 #endif
47 typedef uint8_t u8;
48 typedef uint32_t u32;
49 typedef uint64_t u64;
50 typedef int64_t s64;
52 /* The underlying field.
54 * P256 operates over GF(2^256-2^224+2^192+2^96-1). We can serialise an element
55 * of this field into 32 bytes. We call this an felem_bytearray. */
57 typedef u8 felem_bytearray[32];
59 /* These are the parameters of P256, taken from FIPS 186-3, page 86. These
60 * values are big-endian. */
61 static const felem_bytearray nistp256_curve_params[5] = {
62 {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
63 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
64 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
65 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
66 {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
67 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
68 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
69 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc}, /* b */
70 {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7,
71 0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
72 0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
73 0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
74 {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
75 0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
76 0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
77 0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
78 {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
79 0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
80 0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
81 0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
84 /* The representation of field elements.
85 * ------------------------------------
87 * We represent field elements with either four 128-bit values, eight 128-bit
88 * values, or four 64-bit values. The field element represented is:
89 * v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192 (mod p)
90 * or:
91 * v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[8]*2^512 (mod p)
93 * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
94 * apart, but are 128-bits wide, the most significant bits of each limb overlap
95 * with the least significant bits of the next.
97 * A field element with four limbs is an 'felem'. One with eight limbs is a
98 * 'longfelem'
100 * A field element with four, 64-bit values is called a 'smallfelem'. Small
101 * values are used as intermediate values before multiplication.
104 #define NLIMBS 4
106 typedef uint128_t limb;
107 typedef limb felem[NLIMBS];
108 typedef limb longfelem[NLIMBS * 2];
109 typedef u64 smallfelem[NLIMBS];
111 /* This is the value of the prime as four 64-bit words, little-endian. */
112 static const u64 kPrime[4] = {0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul};
113 static const limb bottom32bits = 0xffffffff;
114 static const u64 bottom63bits = 0x7ffffffffffffffful;
116 /* bin32_to_felem takes a little-endian byte array and converts it into felem
117 * form. This assumes that the CPU is little-endian. */
118 static void
119 bin32_to_felem(felem out, const u8 in[32])
121 out[0] = *((u64 *) & in[0]);
122 out[1] = *((u64 *) & in[8]);
123 out[2] = *((u64 *) & in[16]);
124 out[3] = *((u64 *) & in[24]);
127 /* smallfelem_to_bin32 takes a smallfelem and serialises into a little endian,
128 * 32 byte array. This assumes that the CPU is little-endian. */
129 static void
130 smallfelem_to_bin32(u8 out[32], const smallfelem in)
132 *((u64 *) & out[0]) = in[0];
133 *((u64 *) & out[8]) = in[1];
134 *((u64 *) & out[16]) = in[2];
135 *((u64 *) & out[24]) = in[3];
138 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
139 static void
140 flip_endian(u8 * out, const u8 * in, unsigned len)
142 unsigned i;
143 for (i = 0; i < len; ++i)
144 out[i] = in[len - 1 - i];
147 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
148 static int
149 BN_to_felem(felem out, const BIGNUM * bn)
151 felem_bytearray b_in;
152 felem_bytearray b_out;
153 unsigned num_bytes;
155 /* BN_bn2bin eats leading zeroes */
156 memset(b_out, 0, sizeof b_out);
157 num_bytes = BN_num_bytes(bn);
158 if (num_bytes > sizeof b_out) {
159 ECerror(EC_R_BIGNUM_OUT_OF_RANGE);
160 return 0;
162 if (BN_is_negative(bn)) {
163 ECerror(EC_R_BIGNUM_OUT_OF_RANGE);
164 return 0;
166 num_bytes = BN_bn2bin(bn, b_in);
167 flip_endian(b_out, b_in, num_bytes);
168 bin32_to_felem(out, b_out);
169 return 1;
172 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
173 static BIGNUM *
174 smallfelem_to_BN(BIGNUM * out, const smallfelem in)
176 felem_bytearray b_in, b_out;
177 smallfelem_to_bin32(b_in, in);
178 flip_endian(b_out, b_in, sizeof b_out);
179 return BN_bin2bn(b_out, sizeof b_out, out);
183 /* Field operations
184 * ---------------- */
186 static void
187 smallfelem_one(smallfelem out)
189 out[0] = 1;
190 out[1] = 0;
191 out[2] = 0;
192 out[3] = 0;
195 static void
196 smallfelem_assign(smallfelem out, const smallfelem in)
198 out[0] = in[0];
199 out[1] = in[1];
200 out[2] = in[2];
201 out[3] = in[3];
204 static void
205 felem_assign(felem out, const felem in)
207 out[0] = in[0];
208 out[1] = in[1];
209 out[2] = in[2];
210 out[3] = in[3];
213 /* felem_sum sets out = out + in. */
214 static void
215 felem_sum(felem out, const felem in)
217 out[0] += in[0];
218 out[1] += in[1];
219 out[2] += in[2];
220 out[3] += in[3];
223 /* felem_small_sum sets out = out + in. */
224 static void
225 felem_small_sum(felem out, const smallfelem in)
227 out[0] += in[0];
228 out[1] += in[1];
229 out[2] += in[2];
230 out[3] += in[3];
233 /* felem_scalar sets out = out * scalar */
234 static void
235 felem_scalar(felem out, const u64 scalar)
237 out[0] *= scalar;
238 out[1] *= scalar;
239 out[2] *= scalar;
240 out[3] *= scalar;
243 /* longfelem_scalar sets out = out * scalar */
244 static void
245 longfelem_scalar(longfelem out, const u64 scalar)
247 out[0] *= scalar;
248 out[1] *= scalar;
249 out[2] *= scalar;
250 out[3] *= scalar;
251 out[4] *= scalar;
252 out[5] *= scalar;
253 out[6] *= scalar;
254 out[7] *= scalar;
257 #define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
258 #define two105 (((limb)1) << 105)
259 #define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
261 /* zero105 is 0 mod p */
262 static const felem zero105 = {two105m41m9, two105, two105m41p9, two105m41p9};
264 /* smallfelem_neg sets |out| to |-small|
265 * On exit:
266 * out[i] < out[i] + 2^105
268 static void
269 smallfelem_neg(felem out, const smallfelem small)
271 /* In order to prevent underflow, we subtract from 0 mod p. */
272 out[0] = zero105[0] - small[0];
273 out[1] = zero105[1] - small[1];
274 out[2] = zero105[2] - small[2];
275 out[3] = zero105[3] - small[3];
278 /* felem_diff subtracts |in| from |out|
279 * On entry:
280 * in[i] < 2^104
281 * On exit:
282 * out[i] < out[i] + 2^105
284 static void
285 felem_diff(felem out, const felem in)
287 /* In order to prevent underflow, we add 0 mod p before subtracting. */
288 out[0] += zero105[0];
289 out[1] += zero105[1];
290 out[2] += zero105[2];
291 out[3] += zero105[3];
293 out[0] -= in[0];
294 out[1] -= in[1];
295 out[2] -= in[2];
296 out[3] -= in[3];
299 #define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
300 #define two107 (((limb)1) << 107)
301 #define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
303 /* zero107 is 0 mod p */
304 static const felem zero107 = {two107m43m11, two107, two107m43p11, two107m43p11};
306 /* An alternative felem_diff for larger inputs |in|
307 * felem_diff_zero107 subtracts |in| from |out|
308 * On entry:
309 * in[i] < 2^106
310 * On exit:
311 * out[i] < out[i] + 2^107
313 static void
314 felem_diff_zero107(felem out, const felem in)
316 /* In order to prevent underflow, we add 0 mod p before subtracting. */
317 out[0] += zero107[0];
318 out[1] += zero107[1];
319 out[2] += zero107[2];
320 out[3] += zero107[3];
322 out[0] -= in[0];
323 out[1] -= in[1];
324 out[2] -= in[2];
325 out[3] -= in[3];
328 /* longfelem_diff subtracts |in| from |out|
329 * On entry:
330 * in[i] < 7*2^67
331 * On exit:
332 * out[i] < out[i] + 2^70 + 2^40
334 static void
335 longfelem_diff(longfelem out, const longfelem in)
337 static const limb two70m8p6 = (((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
338 static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
339 static const limb two70 = (((limb) 1) << 70);
340 static const limb two70m40m38p6 = (((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) + (((limb) 1) << 6);
341 static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
343 /* add 0 mod p to avoid underflow */
344 out[0] += two70m8p6;
345 out[1] += two70p40;
346 out[2] += two70;
347 out[3] += two70m40m38p6;
348 out[4] += two70m6;
349 out[5] += two70m6;
350 out[6] += two70m6;
351 out[7] += two70m6;
353 /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
354 out[0] -= in[0];
355 out[1] -= in[1];
356 out[2] -= in[2];
357 out[3] -= in[3];
358 out[4] -= in[4];
359 out[5] -= in[5];
360 out[6] -= in[6];
361 out[7] -= in[7];
364 #define two64m0 (((limb)1) << 64) - 1
365 #define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
366 #define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
367 #define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
369 /* zero110 is 0 mod p */
370 static const felem zero110 = {two64m0, two110p32m0, two64m46, two64m32};
372 /* felem_shrink converts an felem into a smallfelem. The result isn't quite
373 * minimal as the value may be greater than p.
375 * On entry:
376 * in[i] < 2^109
377 * On exit:
378 * out[i] < 2^64
380 static void
381 felem_shrink(smallfelem out, const felem in)
383 felem tmp;
384 u64 a, b, mask;
385 s64 high, low;
386 static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
388 /* Carry 2->3 */
389 tmp[3] = zero110[3] + in[3] + ((u64) (in[2] >> 64));
390 /* tmp[3] < 2^110 */
392 tmp[2] = zero110[2] + (u64) in[2];
393 tmp[0] = zero110[0] + in[0];
394 tmp[1] = zero110[1] + in[1];
395 /* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */
398 * We perform two partial reductions where we eliminate the high-word
399 * of tmp[3]. We don't update the other words till the end.
401 a = tmp[3] >> 64; /* a < 2^46 */
402 tmp[3] = (u64) tmp[3];
403 tmp[3] -= a;
404 tmp[3] += ((limb) a) << 32;
405 /* tmp[3] < 2^79 */
407 b = a;
408 a = tmp[3] >> 64; /* a < 2^15 */
409 b += a; /* b < 2^46 + 2^15 < 2^47 */
410 tmp[3] = (u64) tmp[3];
411 tmp[3] -= a;
412 tmp[3] += ((limb) a) << 32;
413 /* tmp[3] < 2^64 + 2^47 */
416 * This adjusts the other two words to complete the two partial
417 * reductions.
419 tmp[0] += b;
420 tmp[1] -= (((limb) b) << 32);
423 * In order to make space in tmp[3] for the carry from 2 -> 3, we
424 * conditionally subtract kPrime if tmp[3] is large enough.
426 high = tmp[3] >> 64;
427 /* As tmp[3] < 2^65, high is either 1 or 0 */
428 high <<= 63;
429 high >>= 63;
431 * high is: all ones if the high word of tmp[3] is 1 all zeros if
432 * the high word of tmp[3] if 0
434 low = tmp[3];
435 mask = low >> 63;
437 * mask is: all ones if the MSB of low is 1 all zeros if the MSB
438 * of low if 0
440 low &= bottom63bits;
441 low -= kPrime3Test;
442 /* if low was greater than kPrime3Test then the MSB is zero */
443 low = ~low;
444 low >>= 63;
446 * low is: all ones if low was > kPrime3Test all zeros if low was
447 * <= kPrime3Test
449 mask = (mask & low) | high;
450 tmp[0] -= mask & kPrime[0];
451 tmp[1] -= mask & kPrime[1];
452 /* kPrime[2] is zero, so omitted */
453 tmp[3] -= mask & kPrime[3];
454 /* tmp[3] < 2**64 - 2**32 + 1 */
456 tmp[1] += ((u64) (tmp[0] >> 64));
457 tmp[0] = (u64) tmp[0];
458 tmp[2] += ((u64) (tmp[1] >> 64));
459 tmp[1] = (u64) tmp[1];
460 tmp[3] += ((u64) (tmp[2] >> 64));
461 tmp[2] = (u64) tmp[2];
462 /* tmp[i] < 2^64 */
464 out[0] = tmp[0];
465 out[1] = tmp[1];
466 out[2] = tmp[2];
467 out[3] = tmp[3];
470 /* smallfelem_expand converts a smallfelem to an felem */
471 static void
472 smallfelem_expand(felem out, const smallfelem in)
474 out[0] = in[0];
475 out[1] = in[1];
476 out[2] = in[2];
477 out[3] = in[3];
480 /* smallfelem_square sets |out| = |small|^2
481 * On entry:
482 * small[i] < 2^64
483 * On exit:
484 * out[i] < 7 * 2^64 < 2^67
486 static void
487 smallfelem_square(longfelem out, const smallfelem small)
489 limb a;
490 u64 high, low;
492 a = ((uint128_t) small[0]) * small[0];
493 low = a;
494 high = a >> 64;
495 out[0] = low;
496 out[1] = high;
498 a = ((uint128_t) small[0]) * small[1];
499 low = a;
500 high = a >> 64;
501 out[1] += low;
502 out[1] += low;
503 out[2] = high;
505 a = ((uint128_t) small[0]) * small[2];
506 low = a;
507 high = a >> 64;
508 out[2] += low;
509 out[2] *= 2;
510 out[3] = high;
512 a = ((uint128_t) small[0]) * small[3];
513 low = a;
514 high = a >> 64;
515 out[3] += low;
516 out[4] = high;
518 a = ((uint128_t) small[1]) * small[2];
519 low = a;
520 high = a >> 64;
521 out[3] += low;
522 out[3] *= 2;
523 out[4] += high;
525 a = ((uint128_t) small[1]) * small[1];
526 low = a;
527 high = a >> 64;
528 out[2] += low;
529 out[3] += high;
531 a = ((uint128_t) small[1]) * small[3];
532 low = a;
533 high = a >> 64;
534 out[4] += low;
535 out[4] *= 2;
536 out[5] = high;
538 a = ((uint128_t) small[2]) * small[3];
539 low = a;
540 high = a >> 64;
541 out[5] += low;
542 out[5] *= 2;
543 out[6] = high;
544 out[6] += high;
546 a = ((uint128_t) small[2]) * small[2];
547 low = a;
548 high = a >> 64;
549 out[4] += low;
550 out[5] += high;
552 a = ((uint128_t) small[3]) * small[3];
553 low = a;
554 high = a >> 64;
555 out[6] += low;
556 out[7] = high;
559 /* felem_square sets |out| = |in|^2
560 * On entry:
561 * in[i] < 2^109
562 * On exit:
563 * out[i] < 7 * 2^64 < 2^67
565 static void
566 felem_square(longfelem out, const felem in)
568 u64 small[4];
569 felem_shrink(small, in);
570 smallfelem_square(out, small);
573 /* smallfelem_mul sets |out| = |small1| * |small2|
574 * On entry:
575 * small1[i] < 2^64
576 * small2[i] < 2^64
577 * On exit:
578 * out[i] < 7 * 2^64 < 2^67
580 static void
581 smallfelem_mul(longfelem out, const smallfelem small1, const smallfelem small2)
583 limb a;
584 u64 high, low;
586 a = ((uint128_t) small1[0]) * small2[0];
587 low = a;
588 high = a >> 64;
589 out[0] = low;
590 out[1] = high;
593 a = ((uint128_t) small1[0]) * small2[1];
594 low = a;
595 high = a >> 64;
596 out[1] += low;
597 out[2] = high;
599 a = ((uint128_t) small1[1]) * small2[0];
600 low = a;
601 high = a >> 64;
602 out[1] += low;
603 out[2] += high;
606 a = ((uint128_t) small1[0]) * small2[2];
607 low = a;
608 high = a >> 64;
609 out[2] += low;
610 out[3] = high;
612 a = ((uint128_t) small1[1]) * small2[1];
613 low = a;
614 high = a >> 64;
615 out[2] += low;
616 out[3] += high;
618 a = ((uint128_t) small1[2]) * small2[0];
619 low = a;
620 high = a >> 64;
621 out[2] += low;
622 out[3] += high;
625 a = ((uint128_t) small1[0]) * small2[3];
626 low = a;
627 high = a >> 64;
628 out[3] += low;
629 out[4] = high;
631 a = ((uint128_t) small1[1]) * small2[2];
632 low = a;
633 high = a >> 64;
634 out[3] += low;
635 out[4] += high;
637 a = ((uint128_t) small1[2]) * small2[1];
638 low = a;
639 high = a >> 64;
640 out[3] += low;
641 out[4] += high;
643 a = ((uint128_t) small1[3]) * small2[0];
644 low = a;
645 high = a >> 64;
646 out[3] += low;
647 out[4] += high;
650 a = ((uint128_t) small1[1]) * small2[3];
651 low = a;
652 high = a >> 64;
653 out[4] += low;
654 out[5] = high;
656 a = ((uint128_t) small1[2]) * small2[2];
657 low = a;
658 high = a >> 64;
659 out[4] += low;
660 out[5] += high;
662 a = ((uint128_t) small1[3]) * small2[1];
663 low = a;
664 high = a >> 64;
665 out[4] += low;
666 out[5] += high;
669 a = ((uint128_t) small1[2]) * small2[3];
670 low = a;
671 high = a >> 64;
672 out[5] += low;
673 out[6] = high;
675 a = ((uint128_t) small1[3]) * small2[2];
676 low = a;
677 high = a >> 64;
678 out[5] += low;
679 out[6] += high;
682 a = ((uint128_t) small1[3]) * small2[3];
683 low = a;
684 high = a >> 64;
685 out[6] += low;
686 out[7] = high;
689 /* felem_mul sets |out| = |in1| * |in2|
690 * On entry:
691 * in1[i] < 2^109
692 * in2[i] < 2^109
693 * On exit:
694 * out[i] < 7 * 2^64 < 2^67
696 static void
697 felem_mul(longfelem out, const felem in1, const felem in2)
699 smallfelem small1, small2;
700 felem_shrink(small1, in1);
701 felem_shrink(small2, in2);
702 smallfelem_mul(out, small1, small2);
705 /* felem_small_mul sets |out| = |small1| * |in2|
706 * On entry:
707 * small1[i] < 2^64
708 * in2[i] < 2^109
709 * On exit:
710 * out[i] < 7 * 2^64 < 2^67
712 static void
713 felem_small_mul(longfelem out, const smallfelem small1, const felem in2)
715 smallfelem small2;
716 felem_shrink(small2, in2);
717 smallfelem_mul(out, small1, small2);
720 #define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
721 #define two100 (((limb)1) << 100)
722 #define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
723 /* zero100 is 0 mod p */
724 static const felem zero100 = {two100m36m4, two100, two100m36p4, two100m36p4};
726 /* Internal function for the different flavours of felem_reduce.
727 * felem_reduce_ reduces the higher coefficients in[4]-in[7].
728 * On entry:
729 * out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
730 * out[1] >= in[7] + 2^32*in[4]
731 * out[2] >= in[5] + 2^32*in[5]
732 * out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
733 * On exit:
734 * out[0] <= out[0] + in[4] + 2^32*in[5]
735 * out[1] <= out[1] + in[5] + 2^33*in[6]
736 * out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
737 * out[3] <= out[3] + 2^32*in[4] + 3*in[7]
739 static void
740 felem_reduce_(felem out, const longfelem in)
742 int128_t c;
743 /* combine common terms from below */
744 c = in[4] + (in[5] << 32);
745 out[0] += c;
746 out[3] -= c;
748 c = in[5] - in[7];
749 out[1] += c;
750 out[2] -= c;
752 /* the remaining terms */
753 /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
754 out[1] -= (in[4] << 32);
755 out[3] += (in[4] << 32);
757 /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
758 out[2] -= (in[5] << 32);
760 /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
761 out[0] -= in[6];
762 out[0] -= (in[6] << 32);
763 out[1] += (in[6] << 33);
764 out[2] += (in[6] * 2);
765 out[3] -= (in[6] << 32);
767 /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
768 out[0] -= in[7];
769 out[0] -= (in[7] << 32);
770 out[2] += (in[7] << 33);
771 out[3] += (in[7] * 3);
774 /* felem_reduce converts a longfelem into an felem.
775 * To be called directly after felem_square or felem_mul.
776 * On entry:
777 * in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
778 * in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2*64
779 * On exit:
780 * out[i] < 2^101
782 static void
783 felem_reduce(felem out, const longfelem in)
785 out[0] = zero100[0] + in[0];
786 out[1] = zero100[1] + in[1];
787 out[2] = zero100[2] + in[2];
788 out[3] = zero100[3] + in[3];
790 felem_reduce_(out, in);
793 * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
794 * out[1] > 2^100 - 2^64 - 7*2^96 > 0 out[2] > 2^100 - 2^36 + 2^4 -
795 * 5*2^64 - 5*2^96 > 0 out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96
796 * - 3*2^96 > 0
798 * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101 out[1] < 2^100 +
799 * 3*2^64 + 5*2^64 + 3*2^97 < 2^101 out[2] < 2^100 + 5*2^64 + 2^64 +
800 * 3*2^65 + 2^97 < 2^101 out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 <
801 * 2^101
805 /* felem_reduce_zero105 converts a larger longfelem into an felem.
806 * On entry:
807 * in[0] < 2^71
808 * On exit:
809 * out[i] < 2^106
811 static void
812 felem_reduce_zero105(felem out, const longfelem in)
814 out[0] = zero105[0] + in[0];
815 out[1] = zero105[1] + in[1];
816 out[2] = zero105[2] + in[2];
817 out[3] = zero105[3] + in[3];
819 felem_reduce_(out, in);
822 * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
823 * out[1] > 2^105 - 2^71 - 2^103 > 0 out[2] > 2^105 - 2^41 + 2^9 -
824 * 2^71 - 2^103 > 0 out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 -
825 * 2^103 > 0
827 * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106 out[1] < 2^105 + 2^71 +
828 * 2^71 + 2^103 < 2^106 out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 <
829 * 2^106 out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
833 /* subtract_u64 sets *result = *result - v and *carry to one if the subtraction
834 * underflowed. */
835 static void
836 subtract_u64(u64 * result, u64 * carry, u64 v)
838 uint128_t r = *result;
839 r -= v;
840 *carry = (r >> 64) & 1;
841 *result = (u64) r;
844 /* felem_contract converts |in| to its unique, minimal representation.
845 * On entry:
846 * in[i] < 2^109
848 static void
849 felem_contract(smallfelem out, const felem in)
851 unsigned i;
852 u64 all_equal_so_far = 0, result = 0, carry;
854 felem_shrink(out, in);
855 /* small is minimal except that the value might be > p */
857 all_equal_so_far--;
859 * We are doing a constant time test if out >= kPrime. We need to
860 * compare each u64, from most-significant to least significant. For
861 * each one, if all words so far have been equal (m is all ones) then
862 * a non-equal result is the answer. Otherwise we continue.
864 for (i = 3; i < 4; i--) {
865 u64 equal;
866 uint128_t a = ((uint128_t) kPrime[i]) - out[i];
868 * if out[i] > kPrime[i] then a will underflow and the high
869 * 64-bits will all be set.
871 result |= all_equal_so_far & ((u64) (a >> 64));
874 * if kPrime[i] == out[i] then |equal| will be all zeros and
875 * the decrement will make it all ones.
877 equal = kPrime[i] ^ out[i];
878 equal--;
879 equal &= equal << 32;
880 equal &= equal << 16;
881 equal &= equal << 8;
882 equal &= equal << 4;
883 equal &= equal << 2;
884 equal &= equal << 1;
885 equal = ((s64) equal) >> 63;
887 all_equal_so_far &= equal;
891 * if all_equal_so_far is still all ones then the two values are
892 * equal and so out >= kPrime is true.
894 result |= all_equal_so_far;
896 /* if out >= kPrime then we subtract kPrime. */
897 subtract_u64(&out[0], &carry, result & kPrime[0]);
898 subtract_u64(&out[1], &carry, carry);
899 subtract_u64(&out[2], &carry, carry);
900 subtract_u64(&out[3], &carry, carry);
902 subtract_u64(&out[1], &carry, result & kPrime[1]);
903 subtract_u64(&out[2], &carry, carry);
904 subtract_u64(&out[3], &carry, carry);
906 subtract_u64(&out[2], &carry, result & kPrime[2]);
907 subtract_u64(&out[3], &carry, carry);
909 subtract_u64(&out[3], &carry, result & kPrime[3]);
912 static void
913 smallfelem_square_contract(smallfelem out, const smallfelem in)
915 longfelem longtmp;
916 felem tmp;
918 smallfelem_square(longtmp, in);
919 felem_reduce(tmp, longtmp);
920 felem_contract(out, tmp);
923 static void
924 smallfelem_mul_contract(smallfelem out, const smallfelem in1, const smallfelem in2)
926 longfelem longtmp;
927 felem tmp;
929 smallfelem_mul(longtmp, in1, in2);
930 felem_reduce(tmp, longtmp);
931 felem_contract(out, tmp);
934 /* felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
935 * otherwise.
936 * On entry:
937 * small[i] < 2^64
939 static limb
940 smallfelem_is_zero(const smallfelem small)
942 limb result;
943 u64 is_p;
945 u64 is_zero = small[0] | small[1] | small[2] | small[3];
946 is_zero--;
947 is_zero &= is_zero << 32;
948 is_zero &= is_zero << 16;
949 is_zero &= is_zero << 8;
950 is_zero &= is_zero << 4;
951 is_zero &= is_zero << 2;
952 is_zero &= is_zero << 1;
953 is_zero = ((s64) is_zero) >> 63;
955 is_p = (small[0] ^ kPrime[0]) |
956 (small[1] ^ kPrime[1]) |
957 (small[2] ^ kPrime[2]) |
958 (small[3] ^ kPrime[3]);
959 is_p--;
960 is_p &= is_p << 32;
961 is_p &= is_p << 16;
962 is_p &= is_p << 8;
963 is_p &= is_p << 4;
964 is_p &= is_p << 2;
965 is_p &= is_p << 1;
966 is_p = ((s64) is_p) >> 63;
968 is_zero |= is_p;
970 result = is_zero;
971 result |= ((limb) is_zero) << 64;
972 return result;
975 static int
976 smallfelem_is_zero_int(const smallfelem small)
978 return (int) (smallfelem_is_zero(small) & ((limb) 1));
981 /* felem_inv calculates |out| = |in|^{-1}
983 * Based on Fermat's Little Theorem:
984 * a^p = a (mod p)
985 * a^{p-1} = 1 (mod p)
986 * a^{p-2} = a^{-1} (mod p)
988 static void
989 felem_inv(felem out, const felem in)
991 felem ftmp, ftmp2;
992 /* each e_I will hold |in|^{2^I - 1} */
993 felem e2, e4, e8, e16, e32, e64;
994 longfelem tmp;
995 unsigned i;
997 felem_square(tmp, in);
998 felem_reduce(ftmp, tmp);/* 2^1 */
999 felem_mul(tmp, in, ftmp);
1000 felem_reduce(ftmp, tmp);/* 2^2 - 2^0 */
1001 felem_assign(e2, ftmp);
1002 felem_square(tmp, ftmp);
1003 felem_reduce(ftmp, tmp);/* 2^3 - 2^1 */
1004 felem_square(tmp, ftmp);
1005 felem_reduce(ftmp, tmp);/* 2^4 - 2^2 */
1006 felem_mul(tmp, ftmp, e2);
1007 felem_reduce(ftmp, tmp);/* 2^4 - 2^0 */
1008 felem_assign(e4, ftmp);
1009 felem_square(tmp, ftmp);
1010 felem_reduce(ftmp, tmp);/* 2^5 - 2^1 */
1011 felem_square(tmp, ftmp);
1012 felem_reduce(ftmp, tmp);/* 2^6 - 2^2 */
1013 felem_square(tmp, ftmp);
1014 felem_reduce(ftmp, tmp);/* 2^7 - 2^3 */
1015 felem_square(tmp, ftmp);
1016 felem_reduce(ftmp, tmp);/* 2^8 - 2^4 */
1017 felem_mul(tmp, ftmp, e4);
1018 felem_reduce(ftmp, tmp);/* 2^8 - 2^0 */
1019 felem_assign(e8, ftmp);
1020 for (i = 0; i < 8; i++) {
1021 felem_square(tmp, ftmp);
1022 felem_reduce(ftmp, tmp);
1023 } /* 2^16 - 2^8 */
1024 felem_mul(tmp, ftmp, e8);
1025 felem_reduce(ftmp, tmp);/* 2^16 - 2^0 */
1026 felem_assign(e16, ftmp);
1027 for (i = 0; i < 16; i++) {
1028 felem_square(tmp, ftmp);
1029 felem_reduce(ftmp, tmp);
1030 } /* 2^32 - 2^16 */
1031 felem_mul(tmp, ftmp, e16);
1032 felem_reduce(ftmp, tmp);/* 2^32 - 2^0 */
1033 felem_assign(e32, ftmp);
1034 for (i = 0; i < 32; i++) {
1035 felem_square(tmp, ftmp);
1036 felem_reduce(ftmp, tmp);
1037 } /* 2^64 - 2^32 */
1038 felem_assign(e64, ftmp);
1039 felem_mul(tmp, ftmp, in);
1040 felem_reduce(ftmp, tmp);/* 2^64 - 2^32 + 2^0 */
1041 for (i = 0; i < 192; i++) {
1042 felem_square(tmp, ftmp);
1043 felem_reduce(ftmp, tmp);
1044 } /* 2^256 - 2^224 + 2^192 */
1046 felem_mul(tmp, e64, e32);
1047 felem_reduce(ftmp2, tmp); /* 2^64 - 2^0 */
1048 for (i = 0; i < 16; i++) {
1049 felem_square(tmp, ftmp2);
1050 felem_reduce(ftmp2, tmp);
1051 } /* 2^80 - 2^16 */
1052 felem_mul(tmp, ftmp2, e16);
1053 felem_reduce(ftmp2, tmp); /* 2^80 - 2^0 */
1054 for (i = 0; i < 8; i++) {
1055 felem_square(tmp, ftmp2);
1056 felem_reduce(ftmp2, tmp);
1057 } /* 2^88 - 2^8 */
1058 felem_mul(tmp, ftmp2, e8);
1059 felem_reduce(ftmp2, tmp); /* 2^88 - 2^0 */
1060 for (i = 0; i < 4; i++) {
1061 felem_square(tmp, ftmp2);
1062 felem_reduce(ftmp2, tmp);
1063 } /* 2^92 - 2^4 */
1064 felem_mul(tmp, ftmp2, e4);
1065 felem_reduce(ftmp2, tmp); /* 2^92 - 2^0 */
1066 felem_square(tmp, ftmp2);
1067 felem_reduce(ftmp2, tmp); /* 2^93 - 2^1 */
1068 felem_square(tmp, ftmp2);
1069 felem_reduce(ftmp2, tmp); /* 2^94 - 2^2 */
1070 felem_mul(tmp, ftmp2, e2);
1071 felem_reduce(ftmp2, tmp); /* 2^94 - 2^0 */
1072 felem_square(tmp, ftmp2);
1073 felem_reduce(ftmp2, tmp); /* 2^95 - 2^1 */
1074 felem_square(tmp, ftmp2);
1075 felem_reduce(ftmp2, tmp); /* 2^96 - 2^2 */
1076 felem_mul(tmp, ftmp2, in);
1077 felem_reduce(ftmp2, tmp); /* 2^96 - 3 */
1079 felem_mul(tmp, ftmp2, ftmp);
1080 felem_reduce(out, tmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
1083 static void
1084 smallfelem_inv_contract(smallfelem out, const smallfelem in)
1086 felem tmp;
1088 smallfelem_expand(tmp, in);
1089 felem_inv(tmp, tmp);
1090 felem_contract(out, tmp);
1093 /* Group operations
1094 * ----------------
1096 * Building on top of the field operations we have the operations on the
1097 * elliptic curve group itself. Points on the curve are represented in Jacobian
1098 * coordinates */
1100 /* point_double calculates 2*(x_in, y_in, z_in)
1102 * The method is taken from:
1103 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1105 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1106 * while x_out == y_in is not (maybe this works, but it's not tested). */
1107 static void
1108 point_double(felem x_out, felem y_out, felem z_out,
1109 const felem x_in, const felem y_in, const felem z_in)
1111 longfelem tmp, tmp2;
1112 felem delta, gamma, beta, alpha, ftmp, ftmp2;
1113 smallfelem small1, small2;
1115 felem_assign(ftmp, x_in);
1116 /* ftmp[i] < 2^106 */
1117 felem_assign(ftmp2, x_in);
1118 /* ftmp2[i] < 2^106 */
1120 /* delta = z^2 */
1121 felem_square(tmp, z_in);
1122 felem_reduce(delta, tmp);
1123 /* delta[i] < 2^101 */
1125 /* gamma = y^2 */
1126 felem_square(tmp, y_in);
1127 felem_reduce(gamma, tmp);
1128 /* gamma[i] < 2^101 */
1129 felem_shrink(small1, gamma);
1131 /* beta = x*gamma */
1132 felem_small_mul(tmp, small1, x_in);
1133 felem_reduce(beta, tmp);
1134 /* beta[i] < 2^101 */
1136 /* alpha = 3*(x-delta)*(x+delta) */
1137 felem_diff(ftmp, delta);
1138 /* ftmp[i] < 2^105 + 2^106 < 2^107 */
1139 felem_sum(ftmp2, delta);
1140 /* ftmp2[i] < 2^105 + 2^106 < 2^107 */
1141 felem_scalar(ftmp2, 3);
1142 /* ftmp2[i] < 3 * 2^107 < 2^109 */
1143 felem_mul(tmp, ftmp, ftmp2);
1144 felem_reduce(alpha, tmp);
1145 /* alpha[i] < 2^101 */
1146 felem_shrink(small2, alpha);
1148 /* x' = alpha^2 - 8*beta */
1149 smallfelem_square(tmp, small2);
1150 felem_reduce(x_out, tmp);
1151 felem_assign(ftmp, beta);
1152 felem_scalar(ftmp, 8);
1153 /* ftmp[i] < 8 * 2^101 = 2^104 */
1154 felem_diff(x_out, ftmp);
1155 /* x_out[i] < 2^105 + 2^101 < 2^106 */
1157 /* z' = (y + z)^2 - gamma - delta */
1158 felem_sum(delta, gamma);
1159 /* delta[i] < 2^101 + 2^101 = 2^102 */
1160 felem_assign(ftmp, y_in);
1161 felem_sum(ftmp, z_in);
1162 /* ftmp[i] < 2^106 + 2^106 = 2^107 */
1163 felem_square(tmp, ftmp);
1164 felem_reduce(z_out, tmp);
1165 felem_diff(z_out, delta);
1166 /* z_out[i] < 2^105 + 2^101 < 2^106 */
1168 /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1169 felem_scalar(beta, 4);
1170 /* beta[i] < 4 * 2^101 = 2^103 */
1171 felem_diff_zero107(beta, x_out);
1172 /* beta[i] < 2^107 + 2^103 < 2^108 */
1173 felem_small_mul(tmp, small2, beta);
1174 /* tmp[i] < 7 * 2^64 < 2^67 */
1175 smallfelem_square(tmp2, small1);
1176 /* tmp2[i] < 7 * 2^64 */
1177 longfelem_scalar(tmp2, 8);
1178 /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
1179 longfelem_diff(tmp, tmp2);
1180 /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1181 felem_reduce_zero105(y_out, tmp);
1182 /* y_out[i] < 2^106 */
1185 /* point_double_small is the same as point_double, except that it operates on
1186 * smallfelems */
1187 static void
1188 point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
1189 const smallfelem x_in, const smallfelem y_in, const smallfelem z_in)
1191 felem felem_x_out, felem_y_out, felem_z_out;
1192 felem felem_x_in, felem_y_in, felem_z_in;
1194 smallfelem_expand(felem_x_in, x_in);
1195 smallfelem_expand(felem_y_in, y_in);
1196 smallfelem_expand(felem_z_in, z_in);
1197 point_double(felem_x_out, felem_y_out, felem_z_out,
1198 felem_x_in, felem_y_in, felem_z_in);
1199 felem_shrink(x_out, felem_x_out);
1200 felem_shrink(y_out, felem_y_out);
1201 felem_shrink(z_out, felem_z_out);
1204 /* copy_conditional copies in to out iff mask is all ones. */
1205 static void
1206 copy_conditional(felem out, const felem in, limb mask)
1208 unsigned i;
1209 for (i = 0; i < NLIMBS; ++i) {
1210 const limb tmp = mask & (in[i] ^ out[i]);
1211 out[i] ^= tmp;
1215 /* copy_small_conditional copies in to out iff mask is all ones. */
1216 static void
1217 copy_small_conditional(felem out, const smallfelem in, limb mask)
1219 unsigned i;
1220 const u64 mask64 = mask;
1221 for (i = 0; i < NLIMBS; ++i) {
1222 out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
1226 /* point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1228 * The method is taken from:
1229 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1230 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1232 * This function includes a branch for checking whether the two input points
1233 * are equal, (while not equal to the point at infinity). This case never
1234 * happens during single point multiplication, so there is no timing leak for
1235 * ECDH or ECDSA signing. */
1236 static void
1237 point_add(felem x3, felem y3, felem z3,
1238 const felem x1, const felem y1, const felem z1,
1239 const int mixed, const smallfelem x2, const smallfelem y2, const smallfelem z2)
1241 felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1242 longfelem tmp, tmp2;
1243 smallfelem small1, small2, small3, small4, small5;
1244 limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1246 felem_shrink(small3, z1);
1248 z1_is_zero = smallfelem_is_zero(small3);
1249 z2_is_zero = smallfelem_is_zero(z2);
1251 /* ftmp = z1z1 = z1**2 */
1252 smallfelem_square(tmp, small3);
1253 felem_reduce(ftmp, tmp);
1254 /* ftmp[i] < 2^101 */
1255 felem_shrink(small1, ftmp);
1257 if (!mixed) {
1258 /* ftmp2 = z2z2 = z2**2 */
1259 smallfelem_square(tmp, z2);
1260 felem_reduce(ftmp2, tmp);
1261 /* ftmp2[i] < 2^101 */
1262 felem_shrink(small2, ftmp2);
1264 felem_shrink(small5, x1);
1266 /* u1 = ftmp3 = x1*z2z2 */
1267 smallfelem_mul(tmp, small5, small2);
1268 felem_reduce(ftmp3, tmp);
1269 /* ftmp3[i] < 2^101 */
1271 /* ftmp5 = z1 + z2 */
1272 felem_assign(ftmp5, z1);
1273 felem_small_sum(ftmp5, z2);
1274 /* ftmp5[i] < 2^107 */
1276 /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
1277 felem_square(tmp, ftmp5);
1278 felem_reduce(ftmp5, tmp);
1279 /* ftmp2 = z2z2 + z1z1 */
1280 felem_sum(ftmp2, ftmp);
1281 /* ftmp2[i] < 2^101 + 2^101 = 2^102 */
1282 felem_diff(ftmp5, ftmp2);
1283 /* ftmp5[i] < 2^105 + 2^101 < 2^106 */
1285 /* ftmp2 = z2 * z2z2 */
1286 smallfelem_mul(tmp, small2, z2);
1287 felem_reduce(ftmp2, tmp);
1289 /* s1 = ftmp2 = y1 * z2**3 */
1290 felem_mul(tmp, y1, ftmp2);
1291 felem_reduce(ftmp6, tmp);
1292 /* ftmp6[i] < 2^101 */
1293 } else {
1294 /* We'll assume z2 = 1 (special case z2 = 0 is handled later) */
1296 /* u1 = ftmp3 = x1*z2z2 */
1297 felem_assign(ftmp3, x1);
1298 /* ftmp3[i] < 2^106 */
1300 /* ftmp5 = 2z1z2 */
1301 felem_assign(ftmp5, z1);
1302 felem_scalar(ftmp5, 2);
1303 /* ftmp5[i] < 2*2^106 = 2^107 */
1305 /* s1 = ftmp2 = y1 * z2**3 */
1306 felem_assign(ftmp6, y1);
1307 /* ftmp6[i] < 2^106 */
1310 /* u2 = x2*z1z1 */
1311 smallfelem_mul(tmp, x2, small1);
1312 felem_reduce(ftmp4, tmp);
1314 /* h = ftmp4 = u2 - u1 */
1315 felem_diff_zero107(ftmp4, ftmp3);
1316 /* ftmp4[i] < 2^107 + 2^101 < 2^108 */
1317 felem_shrink(small4, ftmp4);
1319 x_equal = smallfelem_is_zero(small4);
1321 /* z_out = ftmp5 * h */
1322 felem_small_mul(tmp, small4, ftmp5);
1323 felem_reduce(z_out, tmp);
1324 /* z_out[i] < 2^101 */
1326 /* ftmp = z1 * z1z1 */
1327 smallfelem_mul(tmp, small1, small3);
1328 felem_reduce(ftmp, tmp);
1330 /* s2 = tmp = y2 * z1**3 */
1331 felem_small_mul(tmp, y2, ftmp);
1332 felem_reduce(ftmp5, tmp);
1334 /* r = ftmp5 = (s2 - s1)*2 */
1335 felem_diff_zero107(ftmp5, ftmp6);
1336 /* ftmp5[i] < 2^107 + 2^107 = 2^108 */
1337 felem_scalar(ftmp5, 2);
1338 /* ftmp5[i] < 2^109 */
1339 felem_shrink(small1, ftmp5);
1340 y_equal = smallfelem_is_zero(small1);
1342 if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1343 point_double(x3, y3, z3, x1, y1, z1);
1344 return;
1346 /* I = ftmp = (2h)**2 */
1347 felem_assign(ftmp, ftmp4);
1348 felem_scalar(ftmp, 2);
1349 /* ftmp[i] < 2*2^108 = 2^109 */
1350 felem_square(tmp, ftmp);
1351 felem_reduce(ftmp, tmp);
1353 /* J = ftmp2 = h * I */
1354 felem_mul(tmp, ftmp4, ftmp);
1355 felem_reduce(ftmp2, tmp);
1357 /* V = ftmp4 = U1 * I */
1358 felem_mul(tmp, ftmp3, ftmp);
1359 felem_reduce(ftmp4, tmp);
1361 /* x_out = r**2 - J - 2V */
1362 smallfelem_square(tmp, small1);
1363 felem_reduce(x_out, tmp);
1364 felem_assign(ftmp3, ftmp4);
1365 felem_scalar(ftmp4, 2);
1366 felem_sum(ftmp4, ftmp2);
1367 /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
1368 felem_diff(x_out, ftmp4);
1369 /* x_out[i] < 2^105 + 2^101 */
1371 /* y_out = r(V-x_out) - 2 * s1 * J */
1372 felem_diff_zero107(ftmp3, x_out);
1373 /* ftmp3[i] < 2^107 + 2^101 < 2^108 */
1374 felem_small_mul(tmp, small1, ftmp3);
1375 felem_mul(tmp2, ftmp6, ftmp2);
1376 longfelem_scalar(tmp2, 2);
1377 /* tmp2[i] < 2*2^67 = 2^68 */
1378 longfelem_diff(tmp, tmp2);
1379 /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1380 felem_reduce_zero105(y_out, tmp);
1381 /* y_out[i] < 2^106 */
1383 copy_small_conditional(x_out, x2, z1_is_zero);
1384 copy_conditional(x_out, x1, z2_is_zero);
1385 copy_small_conditional(y_out, y2, z1_is_zero);
1386 copy_conditional(y_out, y1, z2_is_zero);
1387 copy_small_conditional(z_out, z2, z1_is_zero);
1388 copy_conditional(z_out, z1, z2_is_zero);
1389 felem_assign(x3, x_out);
1390 felem_assign(y3, y_out);
1391 felem_assign(z3, z_out);
1394 /* point_add_small is the same as point_add, except that it operates on
1395 * smallfelems */
1396 static void
1397 point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
1398 smallfelem x1, smallfelem y1, smallfelem z1,
1399 smallfelem x2, smallfelem y2, smallfelem z2)
1401 felem felem_x3, felem_y3, felem_z3;
1402 felem felem_x1, felem_y1, felem_z1;
1403 smallfelem_expand(felem_x1, x1);
1404 smallfelem_expand(felem_y1, y1);
1405 smallfelem_expand(felem_z1, z1);
1406 point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0, x2, y2, z2);
1407 felem_shrink(x3, felem_x3);
1408 felem_shrink(y3, felem_y3);
1409 felem_shrink(z3, felem_z3);
1412 /* Base point pre computation
1413 * --------------------------
1415 * Two different sorts of precomputed tables are used in the following code.
1416 * Each contain various points on the curve, where each point is three field
1417 * elements (x, y, z).
1419 * For the base point table, z is usually 1 (0 for the point at infinity).
1420 * This table has 2 * 16 elements, starting with the following:
1421 * index | bits | point
1422 * ------+---------+------------------------------
1423 * 0 | 0 0 0 0 | 0G
1424 * 1 | 0 0 0 1 | 1G
1425 * 2 | 0 0 1 0 | 2^64G
1426 * 3 | 0 0 1 1 | (2^64 + 1)G
1427 * 4 | 0 1 0 0 | 2^128G
1428 * 5 | 0 1 0 1 | (2^128 + 1)G
1429 * 6 | 0 1 1 0 | (2^128 + 2^64)G
1430 * 7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
1431 * 8 | 1 0 0 0 | 2^192G
1432 * 9 | 1 0 0 1 | (2^192 + 1)G
1433 * 10 | 1 0 1 0 | (2^192 + 2^64)G
1434 * 11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
1435 * 12 | 1 1 0 0 | (2^192 + 2^128)G
1436 * 13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
1437 * 14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
1438 * 15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
1439 * followed by a copy of this with each element multiplied by 2^32.
1441 * The reason for this is so that we can clock bits into four different
1442 * locations when doing simple scalar multiplies against the base point,
1443 * and then another four locations using the second 16 elements.
1445 * Tables for other points have table[i] = iG for i in 0 .. 16. */
1447 /* gmul is the table of precomputed base points */
1448 static const smallfelem gmul[2][16][3] =
1449 {{{{0, 0, 0, 0},
1450 {0, 0, 0, 0},
1451 {0, 0, 0, 0}},
1452 {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2, 0x6b17d1f2e12c4247},
1453 {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16, 0x4fe342e2fe1a7f9b},
1454 {1, 0, 0, 0}},
1455 {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de, 0x0fa822bc2811aaa5},
1456 {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b, 0xbff44ae8f5dba80d},
1457 {1, 0, 0, 0}},
1458 {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789, 0x300a4bbc89d6726f},
1459 {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f, 0x72aac7e0d09b4644},
1460 {1, 0, 0, 0}},
1461 {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e, 0x447d739beedb5e67},
1462 {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7, 0x2d4825ab834131ee},
1463 {1, 0, 0, 0}},
1464 {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60, 0xef9519328a9c72ff},
1465 {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c, 0x611e9fc37dbb2c9b},
1466 {1, 0, 0, 0}},
1467 {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf, 0x550663797b51f5d8},
1468 {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5, 0x157164848aecb851},
1469 {1, 0, 0, 0}},
1470 {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391, 0xeb5d7745b21141ea},
1471 {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee, 0xeafd72ebdbecc17b},
1472 {1, 0, 0, 0}},
1473 {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5, 0xa6d39677a7849276},
1474 {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf, 0x674f84749b0b8816},
1475 {1, 0, 0, 0}},
1476 {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb, 0x4e769e7672c9ddad},
1477 {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281, 0x42b99082de830663},
1478 {1, 0, 0, 0}},
1479 {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478, 0x78878ef61c6ce04d},
1480 {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def, 0xb6cb3f5d7b72c321},
1481 {1, 0, 0, 0}},
1482 {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae, 0x0c88bc4d716b1287},
1483 {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa, 0xdd5ddea3f3901dc6},
1484 {1, 0, 0, 0}},
1485 {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3, 0x68f344af6b317466},
1486 {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3, 0x31b9c405f8540a20},
1487 {1, 0, 0, 0}},
1488 {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0, 0x4052bf4b6f461db9},
1489 {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8, 0xfecf4d5190b0fc61},
1490 {1, 0, 0, 0}},
1491 {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a, 0x1eddbae2c802e41a},
1492 {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0, 0x43104d86560ebcfc},
1493 {1, 0, 0, 0}},
1494 {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a, 0xb48e26b484f7a21c},
1495 {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668, 0xfac015404d4d3dab},
1496 {1, 0, 0, 0}}},
1497 {{{0, 0, 0, 0},
1498 {0, 0, 0, 0},
1499 {0, 0, 0, 0}},
1500 {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da, 0x7fe36b40af22af89},
1501 {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1, 0xe697d45825b63624},
1502 {1, 0, 0, 0}},
1503 {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902, 0x4a5b506612a677a6},
1504 {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40, 0xeb13461ceac089f1},
1505 {1, 0, 0, 0}},
1506 {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857, 0x0781b8291c6a220a},
1507 {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434, 0x690cde8df0151593},
1508 {1, 0, 0, 0}},
1509 {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326, 0x8a535f566ec73617},
1510 {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf, 0x0455c08468b08bd7},
1511 {1, 0, 0, 0}},
1512 {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279, 0x06bada7ab77f8276},
1513 {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70, 0x5b476dfd0e6cb18a},
1514 {1, 0, 0, 0}},
1515 {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8, 0x3e29864e8a2ec908},
1516 {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed, 0x239b90ea3dc31e7e},
1517 {1, 0, 0, 0}},
1518 {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4, 0x820f4dd949f72ff7},
1519 {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3, 0x140406ec783a05ec},
1520 {1, 0, 0, 0}},
1521 {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe, 0x68f6b8542783dfee},
1522 {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028, 0xcbe1feba92e40ce6},
1523 {1, 0, 0, 0}},
1524 {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927, 0xd0b2f94d2f420109},
1525 {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a, 0x971459828b0719e5},
1526 {1, 0, 0, 0}},
1527 {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687, 0x961610004a866aba},
1528 {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c, 0x7acb9fadcee75e44},
1529 {1, 0, 0, 0}},
1530 {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea, 0x24eb9acca333bf5b},
1531 {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d, 0x69f891c5acd079cc},
1532 {1, 0, 0, 0}},
1533 {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514, 0xe51f547c5972a107},
1534 {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06, 0x1c309a2b25bb1387},
1535 {1, 0, 0, 0}},
1536 {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828, 0x20b87b8aa2c4e503},
1537 {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044, 0xf5c6fa49919776be},
1538 {1, 0, 0, 0}},
1539 {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56, 0x1ed7d1b9332010b9},
1540 {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24, 0x3a2b03f03217257a},
1541 {1, 0, 0, 0}},
1542 {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b, 0x15fee545c78dd9f6},
1543 {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb, 0x4ab5b6b2b8753f81},
1544 {1, 0, 0, 0}}}};
1546 /* select_point selects the |idx|th point from a precomputation table and
1547 * copies it to out. */
1548 static void
1549 select_point(const u64 idx, unsigned int size, const smallfelem pre_comp[16][3], smallfelem out[3])
1551 unsigned i, j;
1552 u64 *outlimbs = &out[0][0];
1553 memset(outlimbs, 0, 3 * sizeof(smallfelem));
1555 for (i = 0; i < size; i++) {
1556 const u64 *inlimbs = (u64 *) & pre_comp[i][0][0];
1557 u64 mask = i ^ idx;
1558 mask |= mask >> 4;
1559 mask |= mask >> 2;
1560 mask |= mask >> 1;
1561 mask &= 1;
1562 mask--;
1563 for (j = 0; j < NLIMBS * 3; j++)
1564 outlimbs[j] |= inlimbs[j] & mask;
1568 /* get_bit returns the |i|th bit in |in| */
1569 static char
1570 get_bit(const felem_bytearray in, int i)
1572 if ((i < 0) || (i >= 256))
1573 return 0;
1574 return (in[i >> 3] >> (i & 7)) & 1;
1577 /* Interleaved point multiplication using precomputed point multiples:
1578 * The small point multiples 0*P, 1*P, ..., 17*P are in pre_comp[],
1579 * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
1580 * of the generator, using certain (large) precomputed multiples in g_pre_comp.
1581 * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
1582 static void
1583 batch_mul(felem x_out, felem y_out, felem z_out,
1584 const felem_bytearray scalars[], const unsigned num_points, const u8 * g_scalar,
1585 const int mixed, const smallfelem pre_comp[][17][3], const smallfelem g_pre_comp[2][16][3])
1587 int i, skip;
1588 unsigned num, gen_mul = (g_scalar != NULL);
1589 felem nq[3], ftmp;
1590 smallfelem tmp[3];
1591 u64 bits;
1592 u8 sign, digit;
1594 /* set nq to the point at infinity */
1595 memset(nq, 0, 3 * sizeof(felem));
1598 * Loop over all scalars msb-to-lsb, interleaving additions of
1599 * multiples of the generator (two in each of the last 32 rounds) and
1600 * additions of other points multiples (every 5th round).
1602 skip = 1; /* save two point operations in the first
1603 * round */
1604 for (i = (num_points ? 255 : 31); i >= 0; --i) {
1605 /* double */
1606 if (!skip)
1607 point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1609 /* add multiples of the generator */
1610 if (gen_mul && (i <= 31)) {
1611 /* first, look 32 bits upwards */
1612 bits = get_bit(g_scalar, i + 224) << 3;
1613 bits |= get_bit(g_scalar, i + 160) << 2;
1614 bits |= get_bit(g_scalar, i + 96) << 1;
1615 bits |= get_bit(g_scalar, i + 32);
1616 /* select the point to add, in constant time */
1617 select_point(bits, 16, g_pre_comp[1], tmp);
1619 if (!skip) {
1620 point_add(nq[0], nq[1], nq[2],
1621 nq[0], nq[1], nq[2],
1622 1 /* mixed */ , tmp[0], tmp[1], tmp[2]);
1623 } else {
1624 smallfelem_expand(nq[0], tmp[0]);
1625 smallfelem_expand(nq[1], tmp[1]);
1626 smallfelem_expand(nq[2], tmp[2]);
1627 skip = 0;
1630 /* second, look at the current position */
1631 bits = get_bit(g_scalar, i + 192) << 3;
1632 bits |= get_bit(g_scalar, i + 128) << 2;
1633 bits |= get_bit(g_scalar, i + 64) << 1;
1634 bits |= get_bit(g_scalar, i);
1635 /* select the point to add, in constant time */
1636 select_point(bits, 16, g_pre_comp[0], tmp);
1637 point_add(nq[0], nq[1], nq[2],
1638 nq[0], nq[1], nq[2],
1639 1 /* mixed */ , tmp[0], tmp[1], tmp[2]);
1641 /* do other additions every 5 doublings */
1642 if (num_points && (i % 5 == 0)) {
1643 /* loop over all scalars */
1644 for (num = 0; num < num_points; ++num) {
1645 bits = get_bit(scalars[num], i + 4) << 5;
1646 bits |= get_bit(scalars[num], i + 3) << 4;
1647 bits |= get_bit(scalars[num], i + 2) << 3;
1648 bits |= get_bit(scalars[num], i + 1) << 2;
1649 bits |= get_bit(scalars[num], i) << 1;
1650 bits |= get_bit(scalars[num], i - 1);
1651 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1654 * select the point to add or subtract, in
1655 * constant time
1657 select_point(digit, 17, pre_comp[num], tmp);
1658 smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the
1659 * negative point */
1660 copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
1661 felem_contract(tmp[1], ftmp);
1663 if (!skip) {
1664 point_add(nq[0], nq[1], nq[2],
1665 nq[0], nq[1], nq[2],
1666 mixed, tmp[0], tmp[1], tmp[2]);
1667 } else {
1668 smallfelem_expand(nq[0], tmp[0]);
1669 smallfelem_expand(nq[1], tmp[1]);
1670 smallfelem_expand(nq[2], tmp[2]);
1671 skip = 0;
1676 felem_assign(x_out, nq[0]);
1677 felem_assign(y_out, nq[1]);
1678 felem_assign(z_out, nq[2]);
1681 /* Precomputation for the group generator. */
1682 typedef struct {
1683 smallfelem g_pre_comp[2][16][3];
1684 int references;
1685 } NISTP256_PRE_COMP;
1687 const EC_METHOD *
1688 EC_GFp_nistp256_method(void)
1690 static const EC_METHOD ret = {
1691 .flags = EC_FLAGS_DEFAULT_OCT,
1692 .field_type = NID_X9_62_prime_field,
1693 .group_init = ec_GFp_nistp256_group_init,
1694 .group_finish = ec_GFp_simple_group_finish,
1695 .group_clear_finish = ec_GFp_simple_group_clear_finish,
1696 .group_copy = ec_GFp_nist_group_copy,
1697 .group_set_curve = ec_GFp_nistp256_group_set_curve,
1698 .group_get_curve = ec_GFp_simple_group_get_curve,
1699 .group_get_degree = ec_GFp_simple_group_get_degree,
1700 .group_check_discriminant =
1701 ec_GFp_simple_group_check_discriminant,
1702 .point_init = ec_GFp_simple_point_init,
1703 .point_finish = ec_GFp_simple_point_finish,
1704 .point_clear_finish = ec_GFp_simple_point_clear_finish,
1705 .point_copy = ec_GFp_simple_point_copy,
1706 .point_set_to_infinity = ec_GFp_simple_point_set_to_infinity,
1707 .point_set_Jprojective_coordinates_GFp =
1708 ec_GFp_simple_set_Jprojective_coordinates_GFp,
1709 .point_get_Jprojective_coordinates_GFp =
1710 ec_GFp_simple_get_Jprojective_coordinates_GFp,
1711 .point_set_affine_coordinates =
1712 ec_GFp_simple_point_set_affine_coordinates,
1713 .point_get_affine_coordinates =
1714 ec_GFp_nistp256_point_get_affine_coordinates,
1715 .add = ec_GFp_simple_add,
1716 .dbl = ec_GFp_simple_dbl,
1717 .invert = ec_GFp_simple_invert,
1718 .is_at_infinity = ec_GFp_simple_is_at_infinity,
1719 .is_on_curve = ec_GFp_simple_is_on_curve,
1720 .point_cmp = ec_GFp_simple_cmp,
1721 .make_affine = ec_GFp_simple_make_affine,
1722 .points_make_affine = ec_GFp_simple_points_make_affine,
1723 .mul = ec_GFp_nistp256_points_mul,
1724 .precompute_mult = ec_GFp_nistp256_precompute_mult,
1725 .have_precompute_mult = ec_GFp_nistp256_have_precompute_mult,
1726 .field_mul = ec_GFp_nist_field_mul,
1727 .field_sqr = ec_GFp_nist_field_sqr
1730 return &ret;
1733 /******************************************************************************/
1734 /* FUNCTIONS TO MANAGE PRECOMPUTATION
1737 static NISTP256_PRE_COMP *
1738 nistp256_pre_comp_new()
1740 NISTP256_PRE_COMP *ret = NULL;
1741 ret = malloc(sizeof *ret);
1742 if (!ret) {
1743 ECerror(ERR_R_MALLOC_FAILURE);
1744 return ret;
1746 memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1747 ret->references = 1;
1748 return ret;
1751 static void *
1752 nistp256_pre_comp_dup(void *src_)
1754 NISTP256_PRE_COMP *src = src_;
1756 /* no need to actually copy, these objects never change! */
1757 CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1759 return src_;
1762 static void
1763 nistp256_pre_comp_free(void *pre_)
1765 int i;
1766 NISTP256_PRE_COMP *pre = pre_;
1768 if (!pre)
1769 return;
1771 i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1772 if (i > 0)
1773 return;
1775 free(pre);
1778 static void
1779 nistp256_pre_comp_clear_free(void *pre_)
1781 int i;
1782 NISTP256_PRE_COMP *pre = pre_;
1784 if (!pre)
1785 return;
1787 i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1788 if (i > 0)
1789 return;
1791 freezero(pre, sizeof *pre);
1794 /******************************************************************************/
1795 /* OPENSSL EC_METHOD FUNCTIONS
1798 int
1799 ec_GFp_nistp256_group_init(EC_GROUP * group)
1801 int ret;
1802 ret = ec_GFp_simple_group_init(group);
1803 group->a_is_minus3 = 1;
1804 return ret;
1807 int
1808 ec_GFp_nistp256_group_set_curve(EC_GROUP * group, const BIGNUM * p,
1809 const BIGNUM * a, const BIGNUM * b, BN_CTX * ctx)
1811 int ret = 0;
1812 BN_CTX *new_ctx = NULL;
1813 BIGNUM *curve_p, *curve_a, *curve_b;
1815 if (ctx == NULL)
1816 if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1817 return 0;
1818 BN_CTX_start(ctx);
1819 if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1820 ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1821 ((curve_b = BN_CTX_get(ctx)) == NULL))
1822 goto err;
1823 BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
1824 BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
1825 BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
1826 if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) ||
1827 (BN_cmp(curve_b, b))) {
1828 ECerror(EC_R_WRONG_CURVE_PARAMETERS);
1829 goto err;
1831 group->field_mod_func = BN_nist_mod_256;
1832 ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1833 err:
1834 BN_CTX_end(ctx);
1835 BN_CTX_free(new_ctx);
1836 return ret;
1839 /* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
1840 * (X', Y') = (X/Z^2, Y/Z^3) */
1841 int
1842 ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP * group,
1843 const EC_POINT * point, BIGNUM * x, BIGNUM * y, BN_CTX * ctx)
1845 felem z1, z2, x_in, y_in;
1846 smallfelem x_out, y_out;
1847 longfelem tmp;
1849 if (EC_POINT_is_at_infinity(group, point) > 0) {
1850 ECerror(EC_R_POINT_AT_INFINITY);
1851 return 0;
1853 if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1854 (!BN_to_felem(z1, &point->Z)))
1855 return 0;
1856 felem_inv(z2, z1);
1857 felem_square(tmp, z2);
1858 felem_reduce(z1, tmp);
1859 felem_mul(tmp, x_in, z1);
1860 felem_reduce(x_in, tmp);
1861 felem_contract(x_out, x_in);
1862 if (x != NULL) {
1863 if (!smallfelem_to_BN(x, x_out)) {
1864 ECerror(ERR_R_BN_LIB);
1865 return 0;
1868 felem_mul(tmp, z1, z2);
1869 felem_reduce(z1, tmp);
1870 felem_mul(tmp, y_in, z1);
1871 felem_reduce(y_in, tmp);
1872 felem_contract(y_out, y_in);
1873 if (y != NULL) {
1874 if (!smallfelem_to_BN(y, y_out)) {
1875 ECerror(ERR_R_BN_LIB);
1876 return 0;
1879 return 1;
1882 static void
1883 make_points_affine(size_t num, smallfelem points[ /* num */ ][3], smallfelem tmp_smallfelems[ /* num+1 */ ])
1886 * Runs in constant time, unless an input is the point at infinity
1887 * (which normally shouldn't happen).
1889 ec_GFp_nistp_points_make_affine_internal(
1890 num,
1891 points,
1892 sizeof(smallfelem),
1893 tmp_smallfelems,
1894 (void (*) (void *)) smallfelem_one,
1895 (int (*) (const void *)) smallfelem_is_zero_int,
1896 (void (*) (void *, const void *)) smallfelem_assign,
1897 (void (*) (void *, const void *)) smallfelem_square_contract,
1898 (void (*) (void *, const void *, const void *)) smallfelem_mul_contract,
1899 (void (*) (void *, const void *)) smallfelem_inv_contract,
1900 (void (*) (void *, const void *)) smallfelem_assign /* nothing to contract */ );
1903 /* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values
1904 * Result is stored in r (r can equal one of the inputs). */
1905 int
1906 ec_GFp_nistp256_points_mul(const EC_GROUP * group, EC_POINT * r,
1907 const BIGNUM * scalar, size_t num, const EC_POINT * points[],
1908 const BIGNUM * scalars[], BN_CTX * ctx)
1910 int ret = 0;
1911 int j;
1912 int mixed = 0;
1913 BN_CTX *new_ctx = NULL;
1914 BIGNUM *x, *y, *z, *tmp_scalar;
1915 felem_bytearray g_secret;
1916 felem_bytearray *secrets = NULL;
1917 smallfelem(*pre_comp)[17][3] = NULL;
1918 smallfelem *tmp_smallfelems = NULL;
1919 felem_bytearray tmp;
1920 unsigned i, num_bytes;
1921 int have_pre_comp = 0;
1922 size_t num_points = num;
1923 smallfelem x_in, y_in, z_in;
1924 felem x_out, y_out, z_out;
1925 NISTP256_PRE_COMP *pre = NULL;
1926 const smallfelem(*g_pre_comp)[16][3] = NULL;
1927 EC_POINT *generator = NULL;
1928 const EC_POINT *p = NULL;
1929 const BIGNUM *p_scalar = NULL;
1931 if (ctx == NULL)
1932 if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1933 return 0;
1934 BN_CTX_start(ctx);
1935 if (((x = BN_CTX_get(ctx)) == NULL) ||
1936 ((y = BN_CTX_get(ctx)) == NULL) ||
1937 ((z = BN_CTX_get(ctx)) == NULL) ||
1938 ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1939 goto err;
1941 if (scalar != NULL) {
1942 pre = EC_EX_DATA_get_data(group->extra_data,
1943 nistp256_pre_comp_dup, nistp256_pre_comp_free,
1944 nistp256_pre_comp_clear_free);
1945 if (pre)
1946 /* we have precomputation, try to use it */
1947 g_pre_comp = (const smallfelem(*)[16][3]) pre->g_pre_comp;
1948 else
1949 /* try to use the standard precomputation */
1950 g_pre_comp = &gmul[0];
1951 generator = EC_POINT_new(group);
1952 if (generator == NULL)
1953 goto err;
1954 /* get the generator from precomputation */
1955 if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
1956 !smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
1957 !smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
1958 ECerror(ERR_R_BN_LIB);
1959 goto err;
1961 if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1962 generator, x, y, z, ctx))
1963 goto err;
1964 if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1965 /* precomputation matches generator */
1966 have_pre_comp = 1;
1967 else
1969 * we don't have valid precomputation: treat the
1970 * generator as a random point
1972 num_points++;
1974 if (num_points > 0) {
1975 if (num_points >= 3) {
1977 * unless we precompute multiples for just one or two
1978 * points, converting those into affine form is time
1979 * well spent
1981 mixed = 1;
1983 secrets = calloc(num_points, sizeof(felem_bytearray));
1984 pre_comp = calloc(num_points, 17 * 3 * sizeof(smallfelem));
1985 if (mixed) {
1986 /* XXX should do more int overflow checking */
1987 tmp_smallfelems = reallocarray(NULL,
1988 (num_points * 17 + 1), sizeof(smallfelem));
1990 if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_smallfelems == NULL))) {
1991 ECerror(ERR_R_MALLOC_FAILURE);
1992 goto err;
1995 * we treat NULL scalars as 0, and NULL points as points at
1996 * infinity, i.e., they contribute nothing to the linear
1997 * combination
1999 for (i = 0; i < num_points; ++i) {
2000 if (i == num)
2002 * we didn't have a valid precomputation, so
2003 * we pick the generator
2006 p = EC_GROUP_get0_generator(group);
2007 p_scalar = scalar;
2008 } else
2009 /* the i^th point */
2011 p = points[i];
2012 p_scalar = scalars[i];
2014 if ((p_scalar != NULL) && (p != NULL)) {
2015 /* reduce scalar to 0 <= scalar < 2^256 */
2016 if ((BN_num_bits(p_scalar) > 256) || (BN_is_negative(p_scalar))) {
2018 * this is an unusual input, and we
2019 * don't guarantee constant-timeness
2021 if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
2022 ECerror(ERR_R_BN_LIB);
2023 goto err;
2025 num_bytes = BN_bn2bin(tmp_scalar, tmp);
2026 } else
2027 num_bytes = BN_bn2bin(p_scalar, tmp);
2028 flip_endian(secrets[i], tmp, num_bytes);
2029 /* precompute multiples */
2030 if ((!BN_to_felem(x_out, &p->X)) ||
2031 (!BN_to_felem(y_out, &p->Y)) ||
2032 (!BN_to_felem(z_out, &p->Z)))
2033 goto err;
2034 felem_shrink(pre_comp[i][1][0], x_out);
2035 felem_shrink(pre_comp[i][1][1], y_out);
2036 felem_shrink(pre_comp[i][1][2], z_out);
2037 for (j = 2; j <= 16; ++j) {
2038 if (j & 1) {
2039 point_add_small(
2040 pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
2041 pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2],
2042 pre_comp[i][j - 1][0], pre_comp[i][j - 1][1], pre_comp[i][j - 1][2]);
2043 } else {
2044 point_double_small(
2045 pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
2046 pre_comp[i][j / 2][0], pre_comp[i][j / 2][1], pre_comp[i][j / 2][2]);
2051 if (mixed)
2052 make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
2054 /* the scalar for the generator */
2055 if ((scalar != NULL) && (have_pre_comp)) {
2056 memset(g_secret, 0, sizeof(g_secret));
2057 /* reduce scalar to 0 <= scalar < 2^256 */
2058 if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) {
2060 * this is an unusual input, and we don't guarantee
2061 * constant-timeness
2063 if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
2064 ECerror(ERR_R_BN_LIB);
2065 goto err;
2067 num_bytes = BN_bn2bin(tmp_scalar, tmp);
2068 } else
2069 num_bytes = BN_bn2bin(scalar, tmp);
2070 flip_endian(g_secret, tmp, num_bytes);
2071 /* do the multiplication with generator precomputation */
2072 batch_mul(x_out, y_out, z_out,
2073 (const felem_bytearray(*)) secrets, num_points,
2074 g_secret,
2075 mixed, (const smallfelem(*)[17][3]) pre_comp,
2076 g_pre_comp);
2077 } else
2078 /* do the multiplication without generator precomputation */
2079 batch_mul(x_out, y_out, z_out,
2080 (const felem_bytearray(*)) secrets, num_points,
2081 NULL, mixed, (const smallfelem(*)[17][3]) pre_comp, NULL);
2082 /* reduce the output to its unique minimal representation */
2083 felem_contract(x_in, x_out);
2084 felem_contract(y_in, y_out);
2085 felem_contract(z_in, z_out);
2086 if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
2087 (!smallfelem_to_BN(z, z_in))) {
2088 ECerror(ERR_R_BN_LIB);
2089 goto err;
2091 ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2093 err:
2094 BN_CTX_end(ctx);
2095 EC_POINT_free(generator);
2096 BN_CTX_free(new_ctx);
2097 free(secrets);
2098 free(pre_comp);
2099 free(tmp_smallfelems);
2100 return ret;
2103 int
2104 ec_GFp_nistp256_precompute_mult(EC_GROUP * group, BN_CTX * ctx)
2106 int ret = 0;
2107 NISTP256_PRE_COMP *pre = NULL;
2108 int i, j;
2109 BN_CTX *new_ctx = NULL;
2110 BIGNUM *x, *y;
2111 EC_POINT *generator = NULL;
2112 smallfelem tmp_smallfelems[32];
2113 felem x_tmp, y_tmp, z_tmp;
2115 /* throw away old precomputation */
2116 EC_EX_DATA_free_data(&group->extra_data, nistp256_pre_comp_dup,
2117 nistp256_pre_comp_free, nistp256_pre_comp_clear_free);
2118 if (ctx == NULL)
2119 if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2120 return 0;
2121 BN_CTX_start(ctx);
2122 if (((x = BN_CTX_get(ctx)) == NULL) ||
2123 ((y = BN_CTX_get(ctx)) == NULL))
2124 goto err;
2125 /* get the generator */
2126 if (group->generator == NULL)
2127 goto err;
2128 generator = EC_POINT_new(group);
2129 if (generator == NULL)
2130 goto err;
2131 BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
2132 BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
2133 if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2134 goto err;
2135 if ((pre = nistp256_pre_comp_new()) == NULL)
2136 goto err;
2137 /* if the generator is the standard one, use built-in precomputation */
2138 if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2139 memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2140 ret = 1;
2141 goto err;
2143 if ((!BN_to_felem(x_tmp, &group->generator->X)) ||
2144 (!BN_to_felem(y_tmp, &group->generator->Y)) ||
2145 (!BN_to_felem(z_tmp, &group->generator->Z)))
2146 goto err;
2147 felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
2148 felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
2149 felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
2151 * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G,
2152 * 2^96*G, 2^160*G, 2^224*G for the second one
2154 for (i = 1; i <= 8; i <<= 1) {
2155 point_double_small(
2156 pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2],
2157 pre->g_pre_comp[0][i][0], pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]);
2158 for (j = 0; j < 31; ++j) {
2159 point_double_small(
2160 pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2],
2161 pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
2163 if (i == 8)
2164 break;
2165 point_double_small(
2166 pre->g_pre_comp[0][2 * i][0], pre->g_pre_comp[0][2 * i][1], pre->g_pre_comp[0][2 * i][2],
2167 pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
2168 for (j = 0; j < 31; ++j) {
2169 point_double_small(
2170 pre->g_pre_comp[0][2 * i][0], pre->g_pre_comp[0][2 * i][1], pre->g_pre_comp[0][2 * i][2],
2171 pre->g_pre_comp[0][2 * i][0], pre->g_pre_comp[0][2 * i][1], pre->g_pre_comp[0][2 * i][2]);
2174 for (i = 0; i < 2; i++) {
2175 /* g_pre_comp[i][0] is the point at infinity */
2176 memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
2177 /* the remaining multiples */
2178 /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
2179 point_add_small(
2180 pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1], pre->g_pre_comp[i][6][2],
2181 pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
2182 pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]);
2183 /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
2184 point_add_small(
2185 pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1], pre->g_pre_comp[i][10][2],
2186 pre->g_pre_comp[i][8][0], pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2187 pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]);
2188 /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
2189 point_add_small(
2190 pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2191 pre->g_pre_comp[i][8][0], pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2192 pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2]);
2194 * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G +
2195 * 2^224*G
2197 point_add_small(
2198 pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1], pre->g_pre_comp[i][14][2],
2199 pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2200 pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]);
2201 for (j = 1; j < 8; ++j) {
2202 /* odd multiples: add G resp. 2^32*G */
2203 point_add_small(
2204 pre->g_pre_comp[i][2 * j + 1][0], pre->g_pre_comp[i][2 * j + 1][1], pre->g_pre_comp[i][2 * j + 1][2],
2205 pre->g_pre_comp[i][2 * j][0], pre->g_pre_comp[i][2 * j][1], pre->g_pre_comp[i][2 * j][2],
2206 pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1], pre->g_pre_comp[i][1][2]);
2209 make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
2211 if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup,
2212 nistp256_pre_comp_free, nistp256_pre_comp_clear_free))
2213 goto err;
2214 ret = 1;
2215 pre = NULL;
2216 err:
2217 BN_CTX_end(ctx);
2218 EC_POINT_free(generator);
2219 BN_CTX_free(new_ctx);
2220 nistp256_pre_comp_free(pre);
2221 return ret;
2224 int
2225 ec_GFp_nistp256_have_precompute_mult(const EC_GROUP * group)
2227 if (EC_EX_DATA_get_data(group->extra_data, nistp256_pre_comp_dup,
2228 nistp256_pre_comp_free, nistp256_pre_comp_clear_free)
2229 != NULL)
2230 return 1;
2231 else
2232 return 0;
2234 #endif