1 /* $OpenBSD: ecp_nistp521.c,v 1.18 2017/01/29 17:49:23 beck Exp $ */
2 /*
3 * Written by Adam Langley (Google) for the OpenSSL project
4 */
5 /*
6 * Copyright (c) 2011 Google Inc.
8 * Permission to use, copy, modify, and distribute this software for any
9 * purpose with or without fee is hereby granted, provided that the above
10 * copyright notice and this permission notice appear in all copies.
12 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
13 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
14 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
15 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
16 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
17 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
18 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22 * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
24 * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25 * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26 * work which got its smarts from Daniel J. Bernstein's work on the same.
29 #include <stdint.h>
30 #include <string.h>
32 #include <openssl/opensslconf.h>
34 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
36 #include <openssl/err.h>
37 #include "ec_lcl.h"
39 #if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
40 /* even with gcc, the typedef won't work for 32-bit platforms */
41 typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */
42 #else
43 #error "Need GCC 3.1 or later to define type uint128_t"
44 #endif
46 typedef uint8_t u8;
47 typedef uint64_t u64;
48 typedef int64_t s64;
50 /* The underlying field.
52 * P521 operates over GF(2^521-1). We can serialise an element of this field
53 * into 66 bytes where the most significant byte contains only a single bit. We
54 * call this an felem_bytearray. */
56 typedef u8 felem_bytearray[66];
58 /* These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
59 * These values are big-endian. */
60 static const felem_bytearray nistp521_curve_params[5] =
62 {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
63 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
64 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
65 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
66 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
67 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
68 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
69 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
70 0xff, 0xff},
71 {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
72 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
73 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
74 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
75 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
76 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
77 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
78 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
79 0xff, 0xfc},
80 {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
81 0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
82 0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
83 0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
84 0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
85 0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
86 0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
87 0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
88 0x3f, 0x00},
89 {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
90 0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
91 0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
92 0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
93 0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
94 0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
95 0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
96 0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
97 0xbd, 0x66},
98 {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
99 0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
100 0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
101 0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
102 0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
103 0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
104 0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
105 0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
106 0x66, 0x50}
109 /* The representation of field elements.
110 * ------------------------------------
112 * We represent field elements with nine values. These values are either 64 or
113 * 128 bits and the field element represented is:
114 * v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464 (mod p)
115 * Each of the nine values is called a 'limb'. Since the limbs are spaced only
116 * 58 bits apart, but are greater than 58 bits in length, the most significant
117 * bits of each limb overlap with the least significant bits of the next.
119 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
120 * 'largefelem' */
122 #define NLIMBS 9
124 typedef uint64_t limb;
125 typedef limb felem[NLIMBS];
126 typedef uint128_t largefelem[NLIMBS];
128 static const limb bottom57bits = 0x1ffffffffffffff;
129 static const limb bottom58bits = 0x3ffffffffffffff;
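/* For example, the field element 2^58 + 5 can be stored as v[0] = 5,
 * v[1] = 1 and v[2..8] = 0, and p itself is eight limbs equal to
 * bottom58bits followed by one limb equal to bottom57bits (see kPrime
 * further down).  Bit 521 of an element is bit 57 of v[8]
 * (8*58 + 57 = 521), and because 2^521 = 1 (mod p) anything that
 * overflows past it can be folded back into v[0]; felem_is_zero and
 * felem_contract below do exactly that with out[0] += out[8] >> 57. */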
131 /* bin66_to_felem takes a little-endian byte array and converts it into felem
132 * form. This assumes that the CPU is little-endian. */
133 static void
134 bin66_to_felem(felem out, const u8 in[66])
136 out[0] = (*((limb *) & in[0])) & bottom58bits;
137 out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
138 out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
139 out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
140 out[4] = (*((limb *) & in[29])) & bottom58bits;
141 out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
142 out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
143 out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
144 out[8] = (*((limb *) & in[58])) & bottom57bits;
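/* The byte offsets and shift amounts above follow directly from the limb
 * positions: limb i starts at bit 58*i of the serialised value, and
 * writing 58*i = 8*q + r gives the (byte q, shift r) pairs
 *   i : 0   1   2   3   4   5   6   7   8
 *   q : 0   7  14  21  29  36  43  50  58
 *   r : 0   2   4   6   0   2   4   6   0
 * which is the (in[q] >> r) pattern used here and the mirrored
 * (out[q] |= limb << r) pattern in felem_to_bin66 below. */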
147 /* felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
148 * array. This assumes that the CPU is little-endian. */
149 static void
150 felem_to_bin66(u8 out[66], const felem in)
152 memset(out, 0, 66);
153 (*((limb *) & out[0])) = in[0];
154 (*((limb *) & out[7])) |= in[1] << 2;
155 (*((limb *) & out[14])) |= in[2] << 4;
156 (*((limb *) & out[21])) |= in[3] << 6;
157 (*((limb *) & out[29])) = in[4];
158 (*((limb *) & out[36])) |= in[5] << 2;
159 (*((limb *) & out[43])) |= in[6] << 4;
160 (*((limb *) & out[50])) |= in[7] << 6;
161 (*((limb *) & out[58])) = in[8];
164 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
165 static void
166 flip_endian(u8 * out, const u8 * in, unsigned len)
168 unsigned i;
169 for (i = 0; i < len; ++i)
170 out[i] = in[len - 1 - i];
173 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
174 static int
175 BN_to_felem(felem out, const BIGNUM * bn)
177 felem_bytearray b_in;
178 felem_bytearray b_out;
179 unsigned num_bytes;
181 /* BN_bn2bin eats leading zeroes */
182 memset(b_out, 0, sizeof b_out);
183 num_bytes = BN_num_bytes(bn);
184 if (num_bytes > sizeof b_out) {
185 ECerror(EC_R_BIGNUM_OUT_OF_RANGE);
186 return 0;
188 if (BN_is_negative(bn)) {
189 ECerror(EC_R_BIGNUM_OUT_OF_RANGE);
190 return 0;
192 num_bytes = BN_bn2bin(bn, b_in);
193 flip_endian(b_out, b_in, num_bytes);
194 bin66_to_felem(out, b_out);
195 return 1;
198 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
199 static BIGNUM *
200 felem_to_BN(BIGNUM * out, const felem in)
202 felem_bytearray b_in, b_out;
203 felem_to_bin66(b_in, in);
204 flip_endian(b_out, b_in, sizeof b_out);
205 return BN_bin2bn(b_out, sizeof b_out, out);
209 /* Field operations
210 * ---------------- */
212 static void
213 felem_one(felem out)
215 out[0] = 1;
216 out[1] = 0;
217 out[2] = 0;
218 out[3] = 0;
219 out[4] = 0;
220 out[5] = 0;
221 out[6] = 0;
222 out[7] = 0;
223 out[8] = 0;
226 static void
227 felem_assign(felem out, const felem in)
229 out[0] = in[0];
230 out[1] = in[1];
231 out[2] = in[2];
232 out[3] = in[3];
233 out[4] = in[4];
234 out[5] = in[5];
235 out[6] = in[6];
236 out[7] = in[7];
237 out[8] = in[8];
240 /* felem_sum64 sets out = out + in. */
241 static void
242 felem_sum64(felem out, const felem in)
244 out[0] += in[0];
245 out[1] += in[1];
246 out[2] += in[2];
247 out[3] += in[3];
248 out[4] += in[4];
249 out[5] += in[5];
250 out[6] += in[6];
251 out[7] += in[7];
252 out[8] += in[8];
255 /* felem_scalar sets out = in * scalar */
256 static void
257 felem_scalar(felem out, const felem in, limb scalar)
259 out[0] = in[0] * scalar;
260 out[1] = in[1] * scalar;
261 out[2] = in[2] * scalar;
262 out[3] = in[3] * scalar;
263 out[4] = in[4] * scalar;
264 out[5] = in[5] * scalar;
265 out[6] = in[6] * scalar;
266 out[7] = in[7] * scalar;
267 out[8] = in[8] * scalar;
270 /* felem_scalar64 sets out = out * scalar */
271 static void
272 felem_scalar64(felem out, limb scalar)
274 out[0] *= scalar;
275 out[1] *= scalar;
276 out[2] *= scalar;
277 out[3] *= scalar;
278 out[4] *= scalar;
279 out[5] *= scalar;
280 out[6] *= scalar;
281 out[7] *= scalar;
282 out[8] *= scalar;
285 /* felem_scalar128 sets out = out * scalar */
286 static void
287 felem_scalar128(largefelem out, limb scalar)
289 out[0] *= scalar;
290 out[1] *= scalar;
291 out[2] *= scalar;
292 out[3] *= scalar;
293 out[4] *= scalar;
294 out[5] *= scalar;
295 out[6] *= scalar;
296 out[7] *= scalar;
297 out[8] *= scalar;
300 /* felem_neg sets |out| to |-in|
301 * On entry:
302 * in[i] < 2^59 + 2^14
303 * On exit:
304 * out[i] < 2^62
306 static void
307 felem_neg(felem out, const felem in)
309 /* In order to prevent underflow, we subtract from 0 mod p. */
310 static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
311 static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
313 out[0] = two62m3 - in[0];
314 out[1] = two62m2 - in[1];
315 out[2] = two62m2 - in[2];
316 out[3] = two62m2 - in[3];
317 out[4] = two62m2 - in[4];
318 out[5] = two62m2 - in[5];
319 out[6] = two62m2 - in[6];
320 out[7] = two62m2 - in[7];
321 out[8] = two62m2 - in[8];
324 /* felem_diff64 subtracts |in| from |out|
325 * On entry:
326 * in[i] < 2^59 + 2^14
327 * On exit:
328 * out[i] < out[i] + 2^62
330 static void
331 felem_diff64(felem out, const felem in)
333 /* In order to prevent underflow, we add 0 mod p before subtracting. */
334 static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
335 static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
337 out[0] += two62m3 - in[0];
338 out[1] += two62m2 - in[1];
339 out[2] += two62m2 - in[2];
340 out[3] += two62m2 - in[3];
341 out[4] += two62m2 - in[4];
342 out[5] += two62m2 - in[5];
343 out[6] += two62m2 - in[6];
344 out[7] += two62m2 - in[7];
345 out[8] += two62m2 - in[8];
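/* One way to check that the constants used by felem_neg and felem_diff64
 * really are 0 mod p: summed at their limb positions they give
 *   (2^62 - 2^5) + sum_{i=1..8} (2^62 - 2^4)*2^(58*i)
 * and each 2^4*2^(58*i) term cancels the 2^62*2^(58*(i-1)) term below it
 * (2^(4+58*i) = 2^(62+58*(i-1))), leaving 2^526 - 2^5 =
 * 2^5 * (2^521 - 1) = 32*p.  felem_diff128 further down adds the
 * analogous encoding of 2^70 * p. */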
348 /* felem_diff_128_64 subtracts |in| from |out|
349 * On entry:
350 * in[i] < 2^62 + 2^17
351 * On exit:
352 * out[i] < out[i] + 2^63
354 static void
355 felem_diff_128_64(largefelem out, const felem in)
357 /* In order to prevent underflow, we add 0 mod p before subtracting. */
 358 static const limb two63m6 = (((limb) 1) << 63) - (((limb) 1) << 6);
 359 static const limb two63m5 = (((limb) 1) << 63) - (((limb) 1) << 5);
361 out[0] += two63m6 - in[0];
362 out[1] += two63m5 - in[1];
363 out[2] += two63m5 - in[2];
364 out[3] += two63m5 - in[3];
365 out[4] += two63m5 - in[4];
366 out[5] += two63m5 - in[5];
367 out[6] += two63m5 - in[6];
368 out[7] += two63m5 - in[7];
369 out[8] += two63m5 - in[8];
 372 /* felem_diff128 subtracts |in| from |out|
373 * On entry:
374 * in[i] < 2^126
375 * On exit:
376 * out[i] < out[i] + 2^127 - 2^69
378 static void
379 felem_diff128(largefelem out, const largefelem in)
381 /* In order to prevent underflow, we add 0 mod p before subtracting. */
382 static const uint128_t two127m70 = (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
383 static const uint128_t two127m69 = (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
385 out[0] += (two127m70 - in[0]);
386 out[1] += (two127m69 - in[1]);
387 out[2] += (two127m69 - in[2]);
388 out[3] += (two127m69 - in[3]);
389 out[4] += (two127m69 - in[4]);
390 out[5] += (two127m69 - in[5]);
391 out[6] += (two127m69 - in[6]);
392 out[7] += (two127m69 - in[7]);
393 out[8] += (two127m69 - in[8]);
396 /* felem_square sets |out| = |in|^2
397 * On entry:
398 * in[i] < 2^62
399 * On exit:
400 * out[i] < 17 * max(in[i]) * max(in[i])
402 static void
403 felem_square(largefelem out, const felem in)
405 felem inx2, inx4;
406 felem_scalar(inx2, in, 2);
407 felem_scalar(inx4, in, 4);
 410 * We have many cases where we want to do in[x] * in[y] + in[y] *
 411 * in[x]. This is obviously just 2 * in[x] * in[y]. However, rather
 412 * than do the doubling on the 128-bit result, we double one of the
 413 * inputs to the multiplication by reading from |inx2|
416 out[0] = ((uint128_t) in[0]) * in[0];
417 out[1] = ((uint128_t) in[0]) * inx2[1];
418 out[2] = ((uint128_t) in[0]) * inx2[2] +
419 ((uint128_t) in[1]) * in[1];
420 out[3] = ((uint128_t) in[0]) * inx2[3] +
421 ((uint128_t) in[1]) * inx2[2];
422 out[4] = ((uint128_t) in[0]) * inx2[4] +
423 ((uint128_t) in[1]) * inx2[3] +
424 ((uint128_t) in[2]) * in[2];
425 out[5] = ((uint128_t) in[0]) * inx2[5] +
426 ((uint128_t) in[1]) * inx2[4] +
427 ((uint128_t) in[2]) * inx2[3];
428 out[6] = ((uint128_t) in[0]) * inx2[6] +
429 ((uint128_t) in[1]) * inx2[5] +
430 ((uint128_t) in[2]) * inx2[4] +
431 ((uint128_t) in[3]) * in[3];
432 out[7] = ((uint128_t) in[0]) * inx2[7] +
433 ((uint128_t) in[1]) * inx2[6] +
434 ((uint128_t) in[2]) * inx2[5] +
435 ((uint128_t) in[3]) * inx2[4];
436 out[8] = ((uint128_t) in[0]) * inx2[8] +
437 ((uint128_t) in[1]) * inx2[7] +
438 ((uint128_t) in[2]) * inx2[6] +
439 ((uint128_t) in[3]) * inx2[5] +
440 ((uint128_t) in[4]) * in[4];
443 * The remaining limbs fall above 2^521, with the first falling at
444 * 2^522. They correspond to locations one bit up from the limbs
445 * produced above so we would have to multiply by two to align them.
446 * Again, rather than operate on the 128-bit result, we double one of
447 * the inputs to the multiplication. If we want to double for both
448 * this reason, and the reason above, then we end up multiplying by
449 * four.
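/* For instance, the i+j == 9 products sit at 2^(58*9) = 2^522, and since
 * 2^521 = 1 (mod p) they are worth twice their value at 2^0, i.e. they
 * belong in out[0] doubled.  Together with the cross-term doubling
 * described above this gives the factor of four (|inx4|) for terms like
 * in[1]*in[8], while diagonal terms such as in[5]*in[5] in the
 * i+j == 10 row only need the single alignment doubling (|inx2|). */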
452 /* 9 */
453 out[0] += ((uint128_t) in[1]) * inx4[8] +
454 ((uint128_t) in[2]) * inx4[7] +
455 ((uint128_t) in[3]) * inx4[6] +
456 ((uint128_t) in[4]) * inx4[5];
458 /* 10 */
459 out[1] += ((uint128_t) in[2]) * inx4[8] +
460 ((uint128_t) in[3]) * inx4[7] +
461 ((uint128_t) in[4]) * inx4[6] +
462 ((uint128_t) in[5]) * inx2[5];
464 /* 11 */
465 out[2] += ((uint128_t) in[3]) * inx4[8] +
466 ((uint128_t) in[4]) * inx4[7] +
467 ((uint128_t) in[5]) * inx4[6];
469 /* 12 */
470 out[3] += ((uint128_t) in[4]) * inx4[8] +
471 ((uint128_t) in[5]) * inx4[7] +
472 ((uint128_t) in[6]) * inx2[6];
474 /* 13 */
475 out[4] += ((uint128_t) in[5]) * inx4[8] +
476 ((uint128_t) in[6]) * inx4[7];
478 /* 14 */
479 out[5] += ((uint128_t) in[6]) * inx4[8] +
480 ((uint128_t) in[7]) * inx2[7];
482 /* 15 */
483 out[6] += ((uint128_t) in[7]) * inx4[8];
485 /* 16 */
486 out[7] += ((uint128_t) in[8]) * inx2[8];
489 /* felem_mul sets |out| = |in1| * |in2|
490 * On entry:
491 * in1[i] < 2^64
492 * in2[i] < 2^63
493 * On exit:
494 * out[i] < 17 * max(in1[i]) * max(in2[i])
496 static void
497 felem_mul(largefelem out, const felem in1, const felem in2)
499 felem in2x2;
500 felem_scalar(in2x2, in2, 2);
502 out[0] = ((uint128_t) in1[0]) * in2[0];
504 out[1] = ((uint128_t) in1[0]) * in2[1] +
505 ((uint128_t) in1[1]) * in2[0];
507 out[2] = ((uint128_t) in1[0]) * in2[2] +
508 ((uint128_t) in1[1]) * in2[1] +
509 ((uint128_t) in1[2]) * in2[0];
511 out[3] = ((uint128_t) in1[0]) * in2[3] +
512 ((uint128_t) in1[1]) * in2[2] +
513 ((uint128_t) in1[2]) * in2[1] +
514 ((uint128_t) in1[3]) * in2[0];
516 out[4] = ((uint128_t) in1[0]) * in2[4] +
517 ((uint128_t) in1[1]) * in2[3] +
518 ((uint128_t) in1[2]) * in2[2] +
519 ((uint128_t) in1[3]) * in2[1] +
520 ((uint128_t) in1[4]) * in2[0];
522 out[5] = ((uint128_t) in1[0]) * in2[5] +
523 ((uint128_t) in1[1]) * in2[4] +
524 ((uint128_t) in1[2]) * in2[3] +
525 ((uint128_t) in1[3]) * in2[2] +
526 ((uint128_t) in1[4]) * in2[1] +
527 ((uint128_t) in1[5]) * in2[0];
529 out[6] = ((uint128_t) in1[0]) * in2[6] +
530 ((uint128_t) in1[1]) * in2[5] +
531 ((uint128_t) in1[2]) * in2[4] +
532 ((uint128_t) in1[3]) * in2[3] +
533 ((uint128_t) in1[4]) * in2[2] +
534 ((uint128_t) in1[5]) * in2[1] +
535 ((uint128_t) in1[6]) * in2[0];
537 out[7] = ((uint128_t) in1[0]) * in2[7] +
538 ((uint128_t) in1[1]) * in2[6] +
539 ((uint128_t) in1[2]) * in2[5] +
540 ((uint128_t) in1[3]) * in2[4] +
541 ((uint128_t) in1[4]) * in2[3] +
542 ((uint128_t) in1[5]) * in2[2] +
543 ((uint128_t) in1[6]) * in2[1] +
544 ((uint128_t) in1[7]) * in2[0];
546 out[8] = ((uint128_t) in1[0]) * in2[8] +
547 ((uint128_t) in1[1]) * in2[7] +
548 ((uint128_t) in1[2]) * in2[6] +
549 ((uint128_t) in1[3]) * in2[5] +
550 ((uint128_t) in1[4]) * in2[4] +
551 ((uint128_t) in1[5]) * in2[3] +
552 ((uint128_t) in1[6]) * in2[2] +
553 ((uint128_t) in1[7]) * in2[1] +
554 ((uint128_t) in1[8]) * in2[0];
556 /* See comment in felem_square about the use of in2x2 here */
558 out[0] += ((uint128_t) in1[1]) * in2x2[8] +
559 ((uint128_t) in1[2]) * in2x2[7] +
560 ((uint128_t) in1[3]) * in2x2[6] +
561 ((uint128_t) in1[4]) * in2x2[5] +
562 ((uint128_t) in1[5]) * in2x2[4] +
563 ((uint128_t) in1[6]) * in2x2[3] +
564 ((uint128_t) in1[7]) * in2x2[2] +
565 ((uint128_t) in1[8]) * in2x2[1];
567 out[1] += ((uint128_t) in1[2]) * in2x2[8] +
568 ((uint128_t) in1[3]) * in2x2[7] +
569 ((uint128_t) in1[4]) * in2x2[6] +
570 ((uint128_t) in1[5]) * in2x2[5] +
571 ((uint128_t) in1[6]) * in2x2[4] +
572 ((uint128_t) in1[7]) * in2x2[3] +
573 ((uint128_t) in1[8]) * in2x2[2];
575 out[2] += ((uint128_t) in1[3]) * in2x2[8] +
576 ((uint128_t) in1[4]) * in2x2[7] +
577 ((uint128_t) in1[5]) * in2x2[6] +
578 ((uint128_t) in1[6]) * in2x2[5] +
579 ((uint128_t) in1[7]) * in2x2[4] +
580 ((uint128_t) in1[8]) * in2x2[3];
582 out[3] += ((uint128_t) in1[4]) * in2x2[8] +
583 ((uint128_t) in1[5]) * in2x2[7] +
584 ((uint128_t) in1[6]) * in2x2[6] +
585 ((uint128_t) in1[7]) * in2x2[5] +
586 ((uint128_t) in1[8]) * in2x2[4];
588 out[4] += ((uint128_t) in1[5]) * in2x2[8] +
589 ((uint128_t) in1[6]) * in2x2[7] +
590 ((uint128_t) in1[7]) * in2x2[6] +
591 ((uint128_t) in1[8]) * in2x2[5];
593 out[5] += ((uint128_t) in1[6]) * in2x2[8] +
594 ((uint128_t) in1[7]) * in2x2[7] +
595 ((uint128_t) in1[8]) * in2x2[6];
597 out[6] += ((uint128_t) in1[7]) * in2x2[8] +
598 ((uint128_t) in1[8]) * in2x2[7];
600 out[7] += ((uint128_t) in1[8]) * in2x2[8];
603 static const limb bottom52bits = 0xfffffffffffff;
605 /* felem_reduce converts a largefelem to an felem.
606 * On entry:
607 * in[i] < 2^128
608 * On exit:
609 * out[i] < 2^59 + 2^14
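/* Roughly, the code below works limb by limb: the low 64 bits of in[i]
 * contribute their bottom 58 bits to out[i] and their top 6 bits to
 * out[i+1]; the high 64 bits of in[i] sit at 2^(64+58*i) =
 * 2^6 * 2^(58*(i+1)), so their bottom 52 bits enter out[i+1] shifted
 * left by 6 and their top 12 bits enter out[i+2].  Contributions that
 * would land at limb positions 9 and 10 (2^522 and 2^580) are doubled
 * and folded back into out[0] and out[1], using 2^521 = 1 (mod p). */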
611 static void
612 felem_reduce(felem out, const largefelem in)
614 u64 overflow1, overflow2;
616 out[0] = ((limb) in[0]) & bottom58bits;
617 out[1] = ((limb) in[1]) & bottom58bits;
618 out[2] = ((limb) in[2]) & bottom58bits;
619 out[3] = ((limb) in[3]) & bottom58bits;
620 out[4] = ((limb) in[4]) & bottom58bits;
621 out[5] = ((limb) in[5]) & bottom58bits;
622 out[6] = ((limb) in[6]) & bottom58bits;
623 out[7] = ((limb) in[7]) & bottom58bits;
624 out[8] = ((limb) in[8]) & bottom58bits;
626 /* out[i] < 2^58 */
628 out[1] += ((limb) in[0]) >> 58;
629 out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
631 * out[1] < 2^58 + 2^6 + 2^58 = 2^59 + 2^6
633 out[2] += ((limb) (in[0] >> 64)) >> 52;
635 out[2] += ((limb) in[1]) >> 58;
636 out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
637 out[3] += ((limb) (in[1] >> 64)) >> 52;
639 out[3] += ((limb) in[2]) >> 58;
640 out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
641 out[4] += ((limb) (in[2] >> 64)) >> 52;
643 out[4] += ((limb) in[3]) >> 58;
644 out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
645 out[5] += ((limb) (in[3] >> 64)) >> 52;
647 out[5] += ((limb) in[4]) >> 58;
648 out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
649 out[6] += ((limb) (in[4] >> 64)) >> 52;
651 out[6] += ((limb) in[5]) >> 58;
652 out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
653 out[7] += ((limb) (in[5] >> 64)) >> 52;
655 out[7] += ((limb) in[6]) >> 58;
656 out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
657 out[8] += ((limb) (in[6] >> 64)) >> 52;
659 out[8] += ((limb) in[7]) >> 58;
660 out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
662 * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12 < 2^59 + 2^13
664 overflow1 = ((limb) (in[7] >> 64)) >> 52;
666 overflow1 += ((limb) in[8]) >> 58;
667 overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
668 overflow2 = ((limb) (in[8] >> 64)) >> 52;
670 overflow1 <<= 1; /* overflow1 < 2^13 + 2^7 + 2^59 */
671 overflow2 <<= 1; /* overflow2 < 2^13 */
673 out[0] += overflow1; /* out[0] < 2^60 */
674 out[1] += overflow2; /* out[1] < 2^59 + 2^6 + 2^13 */
676 out[1] += out[0] >> 58;
677 out[0] &= bottom58bits;
 679 * out[0] < 2^58; out[1] < 2^59 + 2^6 + 2^13 + 2^2 < 2^59 + 2^14
683 static void
684 felem_square_reduce(felem out, const felem in)
686 largefelem tmp;
687 felem_square(tmp, in);
688 felem_reduce(out, tmp);
691 static void
692 felem_mul_reduce(felem out, const felem in1, const felem in2)
694 largefelem tmp;
695 felem_mul(tmp, in1, in2);
696 felem_reduce(out, tmp);
699 /* felem_inv calculates |out| = |in|^{-1}
701 * Based on Fermat's Little Theorem:
702 * a^p = a (mod p)
703 * a^{p-1} = 1 (mod p)
704 * a^{p-2} = a^{-1} (mod p)
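/* The chain below builds a^(2^k - 1) for k = 2, 4, 8, 16, 32, 64, 128,
 * 256 and 512 by a square-and-multiply ladder (going from 2^k - 1 to
 * 2^(2k) - 1 costs k squarings and one multiplication), keeping
 * a^(2^9 - 2^2) in |ftmp4| along the way.  Nine further squarings give
 * a^(2^521 - 2^9), and the final multiplications by |ftmp4| and |in|
 * produce a^(2^521 - 3) = a^(p-2) = a^(-1). */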
706 static void
707 felem_inv(felem out, const felem in)
709 felem ftmp, ftmp2, ftmp3, ftmp4;
710 largefelem tmp;
711 unsigned i;
713 felem_square(tmp, in);
714 felem_reduce(ftmp, tmp);/* 2^1 */
715 felem_mul(tmp, in, ftmp);
716 felem_reduce(ftmp, tmp);/* 2^2 - 2^0 */
717 felem_assign(ftmp2, ftmp);
718 felem_square(tmp, ftmp);
719 felem_reduce(ftmp, tmp);/* 2^3 - 2^1 */
720 felem_mul(tmp, in, ftmp);
721 felem_reduce(ftmp, tmp);/* 2^3 - 2^0 */
722 felem_square(tmp, ftmp);
723 felem_reduce(ftmp, tmp);/* 2^4 - 2^1 */
725 felem_square(tmp, ftmp2);
726 felem_reduce(ftmp3, tmp); /* 2^3 - 2^1 */
727 felem_square(tmp, ftmp3);
728 felem_reduce(ftmp3, tmp); /* 2^4 - 2^2 */
729 felem_mul(tmp, ftmp3, ftmp2);
730 felem_reduce(ftmp3, tmp); /* 2^4 - 2^0 */
732 felem_assign(ftmp2, ftmp3);
733 felem_square(tmp, ftmp3);
734 felem_reduce(ftmp3, tmp); /* 2^5 - 2^1 */
735 felem_square(tmp, ftmp3);
736 felem_reduce(ftmp3, tmp); /* 2^6 - 2^2 */
737 felem_square(tmp, ftmp3);
738 felem_reduce(ftmp3, tmp); /* 2^7 - 2^3 */
739 felem_square(tmp, ftmp3);
740 felem_reduce(ftmp3, tmp); /* 2^8 - 2^4 */
741 felem_assign(ftmp4, ftmp3);
742 felem_mul(tmp, ftmp3, ftmp);
743 felem_reduce(ftmp4, tmp); /* 2^8 - 2^1 */
744 felem_square(tmp, ftmp4);
745 felem_reduce(ftmp4, tmp); /* 2^9 - 2^2 */
746 felem_mul(tmp, ftmp3, ftmp2);
747 felem_reduce(ftmp3, tmp); /* 2^8 - 2^0 */
748 felem_assign(ftmp2, ftmp3);
750 for (i = 0; i < 8; i++) {
751 felem_square(tmp, ftmp3);
752 felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
754 felem_mul(tmp, ftmp3, ftmp2);
755 felem_reduce(ftmp3, tmp); /* 2^16 - 2^0 */
756 felem_assign(ftmp2, ftmp3);
758 for (i = 0; i < 16; i++) {
759 felem_square(tmp, ftmp3);
760 felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
762 felem_mul(tmp, ftmp3, ftmp2);
763 felem_reduce(ftmp3, tmp); /* 2^32 - 2^0 */
764 felem_assign(ftmp2, ftmp3);
766 for (i = 0; i < 32; i++) {
767 felem_square(tmp, ftmp3);
768 felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
770 felem_mul(tmp, ftmp3, ftmp2);
771 felem_reduce(ftmp3, tmp); /* 2^64 - 2^0 */
772 felem_assign(ftmp2, ftmp3);
774 for (i = 0; i < 64; i++) {
775 felem_square(tmp, ftmp3);
776 felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
778 felem_mul(tmp, ftmp3, ftmp2);
779 felem_reduce(ftmp3, tmp); /* 2^128 - 2^0 */
780 felem_assign(ftmp2, ftmp3);
782 for (i = 0; i < 128; i++) {
783 felem_square(tmp, ftmp3);
784 felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
786 felem_mul(tmp, ftmp3, ftmp2);
787 felem_reduce(ftmp3, tmp); /* 2^256 - 2^0 */
788 felem_assign(ftmp2, ftmp3);
790 for (i = 0; i < 256; i++) {
791 felem_square(tmp, ftmp3);
792 felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
794 felem_mul(tmp, ftmp3, ftmp2);
795 felem_reduce(ftmp3, tmp); /* 2^512 - 2^0 */
797 for (i = 0; i < 9; i++) {
798 felem_square(tmp, ftmp3);
799 felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
801 felem_mul(tmp, ftmp3, ftmp4);
 802 felem_reduce(ftmp3, tmp); /* 2^521 - 2^2 */
803 felem_mul(tmp, ftmp3, in);
 804 felem_reduce(out, tmp); /* 2^521 - 3 */
807 /* This is 2^521-1, expressed as an felem */
808 static const felem kPrime =
810 0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
811 0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
812 0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
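/* Eight limbs of 58 bits plus one limb of 57 bits give 8*58 + 57 = 521
 * bits, so the values above are the minimal representation of p. */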
815 /* felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
816 * otherwise.
817 * On entry:
818 * in[i] < 2^59 + 2^14
820 static limb
821 felem_is_zero(const felem in)
823 felem ftmp;
824 limb is_zero, is_p;
825 felem_assign(ftmp, in);
827 ftmp[0] += ftmp[8] >> 57;
828 ftmp[8] &= bottom57bits;
829 /* ftmp[8] < 2^57 */
830 ftmp[1] += ftmp[0] >> 58;
831 ftmp[0] &= bottom58bits;
832 ftmp[2] += ftmp[1] >> 58;
833 ftmp[1] &= bottom58bits;
834 ftmp[3] += ftmp[2] >> 58;
835 ftmp[2] &= bottom58bits;
836 ftmp[4] += ftmp[3] >> 58;
837 ftmp[3] &= bottom58bits;
838 ftmp[5] += ftmp[4] >> 58;
839 ftmp[4] &= bottom58bits;
840 ftmp[6] += ftmp[5] >> 58;
841 ftmp[5] &= bottom58bits;
842 ftmp[7] += ftmp[6] >> 58;
843 ftmp[6] &= bottom58bits;
844 ftmp[8] += ftmp[7] >> 58;
845 ftmp[7] &= bottom58bits;
846 /* ftmp[8] < 2^57 + 4 */
849 * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is
850 * greater than our bound for ftmp[8]. Therefore we only have to
 851 * check if the value is zero or 2^521-1.
854 is_zero = 0;
855 is_zero |= ftmp[0];
856 is_zero |= ftmp[1];
857 is_zero |= ftmp[2];
858 is_zero |= ftmp[3];
859 is_zero |= ftmp[4];
860 is_zero |= ftmp[5];
861 is_zero |= ftmp[6];
862 is_zero |= ftmp[7];
863 is_zero |= ftmp[8];
865 is_zero--;
867 * We know that ftmp[i] < 2^63, therefore the only way that the top
868 * bit can be set is if is_zero was 0 before the decrement.
870 is_zero = ((s64) is_zero) >> 63;
872 is_p = ftmp[0] ^ kPrime[0];
873 is_p |= ftmp[1] ^ kPrime[1];
874 is_p |= ftmp[2] ^ kPrime[2];
875 is_p |= ftmp[3] ^ kPrime[3];
876 is_p |= ftmp[4] ^ kPrime[4];
877 is_p |= ftmp[5] ^ kPrime[5];
878 is_p |= ftmp[6] ^ kPrime[6];
879 is_p |= ftmp[7] ^ kPrime[7];
880 is_p |= ftmp[8] ^ kPrime[8];
882 is_p--;
883 is_p = ((s64) is_p) >> 63;
885 is_zero |= is_p;
886 return is_zero;
889 static int
890 felem_is_zero_int(const felem in)
892 return (int) (felem_is_zero(in) & ((limb) 1));
895 /* felem_contract converts |in| to its unique, minimal representation.
896 * On entry:
897 * in[i] < 2^59 + 2^14
899 static void
900 felem_contract(felem out, const felem in)
902 limb is_p, is_greater, sign;
903 static const limb two58 = ((limb) 1) << 58;
905 felem_assign(out, in);
907 out[0] += out[8] >> 57;
908 out[8] &= bottom57bits;
909 /* out[8] < 2^57 */
910 out[1] += out[0] >> 58;
911 out[0] &= bottom58bits;
912 out[2] += out[1] >> 58;
913 out[1] &= bottom58bits;
914 out[3] += out[2] >> 58;
915 out[2] &= bottom58bits;
916 out[4] += out[3] >> 58;
917 out[3] &= bottom58bits;
918 out[5] += out[4] >> 58;
919 out[4] &= bottom58bits;
920 out[6] += out[5] >> 58;
921 out[5] &= bottom58bits;
922 out[7] += out[6] >> 58;
923 out[6] &= bottom58bits;
924 out[8] += out[7] >> 58;
925 out[7] &= bottom58bits;
926 /* out[8] < 2^57 + 4 */
929 * If the value is greater than 2^521-1 then we have to subtract
930 * 2^521-1 out. See the comments in felem_is_zero regarding why we
931 * don't test for other multiples of the prime.
935 * First, if |out| is equal to 2^521-1, we subtract it out to get
936 * zero.
939 is_p = out[0] ^ kPrime[0];
940 is_p |= out[1] ^ kPrime[1];
941 is_p |= out[2] ^ kPrime[2];
942 is_p |= out[3] ^ kPrime[3];
943 is_p |= out[4] ^ kPrime[4];
944 is_p |= out[5] ^ kPrime[5];
945 is_p |= out[6] ^ kPrime[6];
946 is_p |= out[7] ^ kPrime[7];
947 is_p |= out[8] ^ kPrime[8];
949 is_p--;
950 is_p &= is_p << 32;
951 is_p &= is_p << 16;
952 is_p &= is_p << 8;
953 is_p &= is_p << 4;
954 is_p &= is_p << 2;
955 is_p &= is_p << 1;
956 is_p = ((s64) is_p) >> 63;
957 is_p = ~is_p;
959 /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
961 out[0] &= is_p;
962 out[1] &= is_p;
963 out[2] &= is_p;
964 out[3] &= is_p;
965 out[4] &= is_p;
966 out[5] &= is_p;
967 out[6] &= is_p;
968 out[7] &= is_p;
969 out[8] &= is_p;
972 * In order to test that |out| >= 2^521-1 we need only test if out[8]
973 * >> 57 is greater than zero as (2^521-1) + x >= 2^522
975 is_greater = out[8] >> 57;
976 is_greater |= is_greater << 32;
977 is_greater |= is_greater << 16;
978 is_greater |= is_greater << 8;
979 is_greater |= is_greater << 4;
980 is_greater |= is_greater << 2;
981 is_greater |= is_greater << 1;
982 is_greater = ((s64) is_greater) >> 63;
984 out[0] -= kPrime[0] & is_greater;
985 out[1] -= kPrime[1] & is_greater;
986 out[2] -= kPrime[2] & is_greater;
987 out[3] -= kPrime[3] & is_greater;
988 out[4] -= kPrime[4] & is_greater;
989 out[5] -= kPrime[5] & is_greater;
990 out[6] -= kPrime[6] & is_greater;
991 out[7] -= kPrime[7] & is_greater;
992 out[8] -= kPrime[8] & is_greater;
994 /* Eliminate negative coefficients */
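/* A limb that went negative shows up as an unsigned value with its top
 * bit set, so sign = -(out[i] >> 63) is all ones exactly when out[i]
 * underflowed; in that case 2^58 is added back to out[i] and one is
 * borrowed from out[i+1], without any data-dependent branch. */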
995 sign = -(out[0] >> 63);
996 out[0] += (two58 & sign);
997 out[1] -= (1 & sign);
998 sign = -(out[1] >> 63);
999 out[1] += (two58 & sign);
1000 out[2] -= (1 & sign);
1001 sign = -(out[2] >> 63);
1002 out[2] += (two58 & sign);
1003 out[3] -= (1 & sign);
1004 sign = -(out[3] >> 63);
1005 out[3] += (two58 & sign);
1006 out[4] -= (1 & sign);
1007 sign = -(out[4] >> 63);
1008 out[4] += (two58 & sign);
1009 out[5] -= (1 & sign);
 1010 sign = -(out[5] >> 63);
1011 out[5] += (two58 & sign);
1012 out[6] -= (1 & sign);
1013 sign = -(out[6] >> 63);
1014 out[6] += (two58 & sign);
1015 out[7] -= (1 & sign);
1016 sign = -(out[7] >> 63);
1017 out[7] += (two58 & sign);
1018 out[8] -= (1 & sign);
1019 sign = -(out[5] >> 63);
1020 out[5] += (two58 & sign);
1021 out[6] -= (1 & sign);
1022 sign = -(out[6] >> 63);
1023 out[6] += (two58 & sign);
1024 out[7] -= (1 & sign);
1025 sign = -(out[7] >> 63);
1026 out[7] += (two58 & sign);
1027 out[8] -= (1 & sign);
1030 /* Group operations
1031 * ----------------
1033 * Building on top of the field operations we have the operations on the
1034 * elliptic curve group itself. Points on the curve are represented in Jacobian
1035 * coordinates */
 1037 /* point_double calculates 2*(x_in, y_in, z_in)
1039 * The method is taken from:
1040 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
 1042 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 1043 * while x_out == y_in is not (maybe this works, but it's not tested). */
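/* In the notation of that page the code computes
 * delta = Z1^2, gamma = Y1^2, beta = X1*gamma,
 * alpha = 3*(X1 - delta)*(X1 + delta),
 * X3 = alpha^2 - 8*beta,
 * Z3 = (Y1 + Z1)^2 - gamma - delta,
 * Y3 = alpha*(4*beta - X3) - 8*gamma^2,
 * tracking limb bounds step by step in the comments below. */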
1044 static void
1045 point_double(felem x_out, felem y_out, felem z_out,
1046 const felem x_in, const felem y_in, const felem z_in)
1048 largefelem tmp, tmp2;
1049 felem delta, gamma, beta, alpha, ftmp, ftmp2;
1051 felem_assign(ftmp, x_in);
1052 felem_assign(ftmp2, x_in);
1054 /* delta = z^2 */
1055 felem_square(tmp, z_in);
1056 felem_reduce(delta, tmp); /* delta[i] < 2^59 + 2^14 */
1058 /* gamma = y^2 */
1059 felem_square(tmp, y_in);
1060 felem_reduce(gamma, tmp); /* gamma[i] < 2^59 + 2^14 */
1062 /* beta = x*gamma */
1063 felem_mul(tmp, x_in, gamma);
1064 felem_reduce(beta, tmp);/* beta[i] < 2^59 + 2^14 */
1066 /* alpha = 3*(x-delta)*(x+delta) */
1067 felem_diff64(ftmp, delta);
1068 /* ftmp[i] < 2^61 */
1069 felem_sum64(ftmp2, delta);
1070 /* ftmp2[i] < 2^60 + 2^15 */
1071 felem_scalar64(ftmp2, 3);
1072 /* ftmp2[i] < 3*2^60 + 3*2^15 */
1073 felem_mul(tmp, ftmp, ftmp2);
 1075 * tmp[i] < 17(3*2^121 + 3*2^76) = 51*2^121 + 51*2^76 < 64*2^121 +
 1076 * 64*2^76 = 2^127 + 2^82 < 2^128
1078 felem_reduce(alpha, tmp);
1080 /* x' = alpha^2 - 8*beta */
1081 felem_square(tmp, alpha);
1083 * tmp[i] < 17*2^120 < 2^125
1085 felem_assign(ftmp, beta);
1086 felem_scalar64(ftmp, 8);
1087 /* ftmp[i] < 2^62 + 2^17 */
1088 felem_diff_128_64(tmp, ftmp);
1089 /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1090 felem_reduce(x_out, tmp);
1092 /* z' = (y + z)^2 - gamma - delta */
1093 felem_sum64(delta, gamma);
1094 /* delta[i] < 2^60 + 2^15 */
1095 felem_assign(ftmp, y_in);
1096 felem_sum64(ftmp, z_in);
1097 /* ftmp[i] < 2^60 + 2^15 */
1098 felem_square(tmp, ftmp);
1100 * tmp[i] < 17(2^122) < 2^127
1102 felem_diff_128_64(tmp, delta);
1103 /* tmp[i] < 2^127 + 2^63 */
1104 felem_reduce(z_out, tmp);
1106 /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1107 felem_scalar64(beta, 4);
1108 /* beta[i] < 2^61 + 2^16 */
1109 felem_diff64(beta, x_out);
1110 /* beta[i] < 2^61 + 2^60 + 2^16 */
1111 felem_mul(tmp, alpha, beta);
1113 * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16)) = 17*(2^120 + 2^75
1114 * + 2^119 + 2^74 + 2^75 + 2^30) = 17*(2^120 + 2^119 + 2^76 + 2^74 +
1115 * 2^30) < 2^128
1117 felem_square(tmp2, gamma);
1119 * tmp2[i] < 17*(2^59 + 2^14)^2 = 17*(2^118 + 2^74 + 2^28)
1121 felem_scalar128(tmp2, 8);
1123 * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28) = 2^125 + 2^121 + 2^81 + 2^77
1124 * + 2^35 + 2^31 < 2^126
1126 felem_diff128(tmp, tmp2);
 1128 * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
 1129 * < 2^127 + 2^126
 1130 * < 2^128
1132 felem_reduce(y_out, tmp);
1135 /* copy_conditional copies in to out iff mask is all ones. */
1136 static void
1137 copy_conditional(felem out, const felem in, limb mask)
1139 unsigned i;
1140 for (i = 0; i < NLIMBS; ++i) {
1141 const limb tmp = mask & (in[i] ^ out[i]);
1142 out[i] ^= tmp;
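/* The mask must be 0 or all ones, e.g. the value returned by
 * felem_is_zero() or -(limb)bit for a single bit; the XOR-and-mask form
 * selects between the two felems without a data-dependent branch. */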
 1146 /* point_add calculates (x1, y1, z1) + (x2, y2, z2)
1148 * The method is taken from
1149 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1150 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1152 * This function includes a branch for checking whether the two input points
1153 * are equal (while not equal to the point at infinity). This case never
1154 * happens during single point multiplication, so there is no timing leak for
1155 * ECDH or ECDSA signing. */
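/* In the notation of that page the code computes
 * Z1Z1 = Z1^2, Z2Z2 = Z2^2, U1 = X1*Z2Z2, U2 = X2*Z1Z1,
 * S1 = Y1*Z2^3, S2 = Y2*Z1^3, H = U2 - U1, r = 2*(S2 - S1),
 * I = (2*H)^2, J = H*I, V = U1*I,
 * X3 = r^2 - J - 2*V, Y3 = r*(V - X3) - 2*S1*J, Z3 = 2*Z1*Z2*H,
 * where the mixed case (z2 == 1) skips the Z2Z2 and Z2^3 work. */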
1156 static void
1157 point_add(felem x3, felem y3, felem z3,
1158 const felem x1, const felem y1, const felem z1,
1159 const int mixed, const felem x2, const felem y2, const felem z2)
1161 felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1162 largefelem tmp, tmp2;
1163 limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1165 z1_is_zero = felem_is_zero(z1);
1166 z2_is_zero = felem_is_zero(z2);
1168 /* ftmp = z1z1 = z1**2 */
1169 felem_square(tmp, z1);
1170 felem_reduce(ftmp, tmp);
1172 if (!mixed) {
1173 /* ftmp2 = z2z2 = z2**2 */
1174 felem_square(tmp, z2);
1175 felem_reduce(ftmp2, tmp);
1177 /* u1 = ftmp3 = x1*z2z2 */
1178 felem_mul(tmp, x1, ftmp2);
1179 felem_reduce(ftmp3, tmp);
1181 /* ftmp5 = z1 + z2 */
1182 felem_assign(ftmp5, z1);
1183 felem_sum64(ftmp5, z2);
1184 /* ftmp5[i] < 2^61 */
1186 /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1187 felem_square(tmp, ftmp5);
1188 /* tmp[i] < 17*2^122 */
1189 felem_diff_128_64(tmp, ftmp);
1190 /* tmp[i] < 17*2^122 + 2^63 */
1191 felem_diff_128_64(tmp, ftmp2);
1192 /* tmp[i] < 17*2^122 + 2^64 */
1193 felem_reduce(ftmp5, tmp);
1195 /* ftmp2 = z2 * z2z2 */
1196 felem_mul(tmp, ftmp2, z2);
1197 felem_reduce(ftmp2, tmp);
1199 /* s1 = ftmp6 = y1 * z2**3 */
1200 felem_mul(tmp, y1, ftmp2);
1201 felem_reduce(ftmp6, tmp);
1202 } else {
1203 /* We'll assume z2 = 1 (special case z2 = 0 is handled later) */
1205 /* u1 = ftmp3 = x1*z2z2 */
1206 felem_assign(ftmp3, x1);
1208 /* ftmp5 = 2*z1z2 */
1209 felem_scalar(ftmp5, z1, 2);
1211 /* s1 = ftmp6 = y1 * z2**3 */
1212 felem_assign(ftmp6, y1);
1215 /* u2 = x2*z1z1 */
1216 felem_mul(tmp, x2, ftmp);
1217 /* tmp[i] < 17*2^120 */
1219 /* h = ftmp4 = u2 - u1 */
1220 felem_diff_128_64(tmp, ftmp3);
1221 /* tmp[i] < 17*2^120 + 2^63 */
1222 felem_reduce(ftmp4, tmp);
1224 x_equal = felem_is_zero(ftmp4);
1226 /* z_out = ftmp5 * h */
1227 felem_mul(tmp, ftmp5, ftmp4);
1228 felem_reduce(z_out, tmp);
1230 /* ftmp = z1 * z1z1 */
1231 felem_mul(tmp, ftmp, z1);
1232 felem_reduce(ftmp, tmp);
1234 /* s2 = tmp = y2 * z1**3 */
1235 felem_mul(tmp, y2, ftmp);
1236 /* tmp[i] < 17*2^120 */
1238 /* r = ftmp5 = (s2 - s1)*2 */
1239 felem_diff_128_64(tmp, ftmp6);
1240 /* tmp[i] < 17*2^120 + 2^63 */
1241 felem_reduce(ftmp5, tmp);
1242 y_equal = felem_is_zero(ftmp5);
1243 felem_scalar64(ftmp5, 2);
1244 /* ftmp5[i] < 2^61 */
1246 if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1247 point_double(x3, y3, z3, x1, y1, z1);
1248 return;
1250 /* I = ftmp = (2h)**2 */
1251 felem_assign(ftmp, ftmp4);
1252 felem_scalar64(ftmp, 2);
1253 /* ftmp[i] < 2^61 */
1254 felem_square(tmp, ftmp);
1255 /* tmp[i] < 17*2^122 */
1256 felem_reduce(ftmp, tmp);
1258 /* J = ftmp2 = h * I */
1259 felem_mul(tmp, ftmp4, ftmp);
1260 felem_reduce(ftmp2, tmp);
1262 /* V = ftmp4 = U1 * I */
1263 felem_mul(tmp, ftmp3, ftmp);
1264 felem_reduce(ftmp4, tmp);
1266 /* x_out = r**2 - J - 2V */
1267 felem_square(tmp, ftmp5);
1268 /* tmp[i] < 17*2^122 */
1269 felem_diff_128_64(tmp, ftmp2);
1270 /* tmp[i] < 17*2^122 + 2^63 */
1271 felem_assign(ftmp3, ftmp4);
1272 felem_scalar64(ftmp4, 2);
1273 /* ftmp4[i] < 2^61 */
1274 felem_diff_128_64(tmp, ftmp4);
1275 /* tmp[i] < 17*2^122 + 2^64 */
1276 felem_reduce(x_out, tmp);
1278 /* y_out = r(V-x_out) - 2 * s1 * J */
1279 felem_diff64(ftmp3, x_out);
1281 * ftmp3[i] < 2^60 + 2^60 = 2^61
1283 felem_mul(tmp, ftmp5, ftmp3);
1284 /* tmp[i] < 17*2^122 */
1285 felem_mul(tmp2, ftmp6, ftmp2);
1286 /* tmp2[i] < 17*2^120 */
1287 felem_scalar128(tmp2, 2);
1288 /* tmp2[i] < 17*2^121 */
1289 felem_diff128(tmp, tmp2);
 1291 * tmp[i] < 2^127 - 2^69 + 17*2^122
 1292 * < 2^127 + 2^126 + 2^122 < 2^128
1294 felem_reduce(y_out, tmp);
1296 copy_conditional(x_out, x2, z1_is_zero);
1297 copy_conditional(x_out, x1, z2_is_zero);
1298 copy_conditional(y_out, y2, z1_is_zero);
1299 copy_conditional(y_out, y1, z2_is_zero);
1300 copy_conditional(z_out, z2, z1_is_zero);
1301 copy_conditional(z_out, z1, z2_is_zero);
1302 felem_assign(x3, x_out);
1303 felem_assign(y3, y_out);
1304 felem_assign(z3, z_out);
 1307 /* Base point precomputation
1308 * --------------------------
1310 * Two different sorts of precomputed tables are used in the following code.
 1311 * Each contains various points on the curve, where each point is three field
1312 * elements (x, y, z).
1314 * For the base point table, z is usually 1 (0 for the point at infinity).
1315 * This table has 16 elements:
1316 * index | bits | point
1317 * ------+---------+------------------------------
1318 * 0 | 0 0 0 0 | 0G
1319 * 1 | 0 0 0 1 | 1G
1320 * 2 | 0 0 1 0 | 2^130G
1321 * 3 | 0 0 1 1 | (2^130 + 1)G
1322 * 4 | 0 1 0 0 | 2^260G
1323 * 5 | 0 1 0 1 | (2^260 + 1)G
1324 * 6 | 0 1 1 0 | (2^260 + 2^130)G
1325 * 7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1326 * 8 | 1 0 0 0 | 2^390G
1327 * 9 | 1 0 0 1 | (2^390 + 1)G
1328 * 10 | 1 0 1 0 | (2^390 + 2^130)G
1329 * 11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1330 * 12 | 1 1 0 0 | (2^390 + 2^260)G
1331 * 13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1332 * 14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1333 * 15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1335 * The reason for this is so that we can clock bits into four different
1336 * locations when doing simple scalar multiplies against the base point.
1338 * Tables for other points have table[i] = iG for i in 0 .. 16. */
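/* For example, with this table a base point scalar is handled in 131
 * rounds: in round i of batch_mul below the scalar bits at positions
 * i + 390, i + 260, i + 130 and i (those that exist) are concatenated
 * into an index into this table, so one constant-time lookup and one
 * point addition per round process all four 130-bit "columns" of the
 * scalar at once. */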
1340 /* gmul is the table of precomputed base points */
1341 static const felem gmul[16][3] =
1342 {{{0, 0, 0, 0, 0, 0, 0, 0, 0},
1343 {0, 0, 0, 0, 0, 0, 0, 0, 0},
1344 {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1345 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1346 0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1347 0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1348 {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1349 0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1350 0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1351 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1352 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1353 0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1354 0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1355 {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1356 0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1357 0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1358 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1359 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1360 0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1361 0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1362 {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1363 0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1364 0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1365 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1366 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1367 0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1368 0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1369 {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1370 0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1371 0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1372 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1373 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1374 0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1375 0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1376 {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1377 0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1378 0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1379 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1380 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1381 0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1382 0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1383 {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1384 0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1385 0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1386 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1387 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1388 0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1389 0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1390 {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1391 0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1392 0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1393 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1394 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1395 0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1396 0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1397 {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1398 0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1399 0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1400 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1401 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1402 0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1403 0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1404 {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1405 0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1406 0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1407 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1408 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1409 0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1410 0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1411 {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1412 0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1413 0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1414 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1415 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1416 0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1417 0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1418 {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1419 0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1420 0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1421 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1422 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1423 0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1424 0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1425 {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1426 0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1427 0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1428 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1429 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1430 0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1431 0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1432 {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1433 0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1434 0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1435 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1436 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1437 0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1438 0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1439 {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1440 0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1441 0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1442 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1443 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1444 0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1445 0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1446 {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1447 0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1448 0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1449 {1, 0, 0, 0, 0, 0, 0, 0, 0}}};
1451 /* select_point selects the |idx|th point from a precomputation table and
1452 * copies it to out. */
1453 static void
1454 select_point(const limb idx, unsigned int size, const felem pre_comp[ /* size */ ][3],
1455 felem out[3])
1457 unsigned i, j;
1458 limb *outlimbs = &out[0][0];
1459 memset(outlimbs, 0, 3 * sizeof(felem));
1461 for (i = 0; i < size; i++) {
1462 const limb *inlimbs = &pre_comp[i][0][0];
1463 limb mask = i ^ idx;
1464 mask |= mask >> 4;
1465 mask |= mask >> 2;
1466 mask |= mask >> 1;
1467 mask &= 1;
1468 mask--;
1469 for (j = 0; j < NLIMBS * 3; j++)
1470 outlimbs[j] |= inlimbs[j] & mask;
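/* The mask is built without branches: i ^ idx is zero only for the wanted
 * entry; OR-ing its bits down into bit zero and then subtracting one
 * yields all ones when i == idx and zero otherwise (e.g. for idx = 3,
 * i = 3 gives 0 - 1 = ~0 while i = 5 gives 1 - 1 = 0), so exactly one
 * table entry is OR-ed into |out|. */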
1474 /* get_bit returns the |i|th bit in |in| */
1475 static char
1476 get_bit(const felem_bytearray in, int i)
1478 if (i < 0)
1479 return 0;
1480 return (in[i >> 3] >> (i & 7)) & 1;
1483 /* Interleaved point multiplication using precomputed point multiples:
1484 * The small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[],
1485 * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
1486 * of the generator, using certain (large) precomputed multiples in g_pre_comp.
1487 * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
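/* The generator multiple uses the comb-style table described above (one
 * lookup per round during the last 131 rounds), while each further point
 * uses a signed window: every fifth round six consecutive scalar bits
 * are recoded by ec_GFp_nistp_recode_scalar_bits into a sign and a digit
 * selecting one of the 17 small multiples in constant time, with the
 * sign applied by conditionally negating the y coordinate. */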
1488 static void
1489 batch_mul(felem x_out, felem y_out, felem z_out,
1490 const felem_bytearray scalars[], const unsigned num_points, const u8 * g_scalar,
1491 const int mixed, const felem pre_comp[][17][3], const felem g_pre_comp[16][3])
1493 int i, skip;
1494 unsigned num, gen_mul = (g_scalar != NULL);
1495 felem nq[3], tmp[4];
1496 limb bits;
1497 u8 sign, digit;
1499 /* set nq to the point at infinity */
1500 memset(nq, 0, 3 * sizeof(felem));
1503 * Loop over all scalars msb-to-lsb, interleaving additions of
1504 * multiples of the generator (last quarter of rounds) and additions
 505 * of other points' multiples (every 5th round).
1507 skip = 1; /* save two point operations in the first
1508 * round */
1509 for (i = (num_points ? 520 : 130); i >= 0; --i) {
1510 /* double */
1511 if (!skip)
1512 point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1514 /* add multiples of the generator */
1515 if (gen_mul && (i <= 130)) {
1516 bits = get_bit(g_scalar, i + 390) << 3;
1517 if (i < 130) {
1518 bits |= get_bit(g_scalar, i + 260) << 2;
1519 bits |= get_bit(g_scalar, i + 130) << 1;
1520 bits |= get_bit(g_scalar, i);
1522 /* select the point to add, in constant time */
1523 select_point(bits, 16, g_pre_comp, tmp);
1524 if (!skip) {
1525 point_add(nq[0], nq[1], nq[2],
1526 nq[0], nq[1], nq[2],
1527 1 /* mixed */ , tmp[0], tmp[1], tmp[2]);
1528 } else {
1529 memcpy(nq, tmp, 3 * sizeof(felem));
1530 skip = 0;
1533 /* do other additions every 5 doublings */
1534 if (num_points && (i % 5 == 0)) {
1535 /* loop over all scalars */
1536 for (num = 0; num < num_points; ++num) {
1537 bits = get_bit(scalars[num], i + 4) << 5;
1538 bits |= get_bit(scalars[num], i + 3) << 4;
1539 bits |= get_bit(scalars[num], i + 2) << 3;
1540 bits |= get_bit(scalars[num], i + 1) << 2;
1541 bits |= get_bit(scalars[num], i) << 1;
1542 bits |= get_bit(scalars[num], i - 1);
1543 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1546 * select the point to add or subtract, in
1547 * constant time
1549 select_point(digit, 17, pre_comp[num], tmp);
1550 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the
1551 * negative point */
1552 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1554 if (!skip) {
1555 point_add(nq[0], nq[1], nq[2],
1556 nq[0], nq[1], nq[2],
1557 mixed, tmp[0], tmp[1], tmp[2]);
1558 } else {
1559 memcpy(nq, tmp, 3 * sizeof(felem));
1560 skip = 0;
1565 felem_assign(x_out, nq[0]);
1566 felem_assign(y_out, nq[1]);
1567 felem_assign(z_out, nq[2]);
1571 /* Precomputation for the group generator. */
1572 typedef struct {
1573 felem g_pre_comp[16][3];
1574 int references;
1575 } NISTP521_PRE_COMP;
1577 const EC_METHOD *
1578 EC_GFp_nistp521_method(void)
1580 static const EC_METHOD ret = {
1581 .flags = EC_FLAGS_DEFAULT_OCT,
1582 .field_type = NID_X9_62_prime_field,
1583 .group_init = ec_GFp_nistp521_group_init,
1584 .group_finish = ec_GFp_simple_group_finish,
1585 .group_clear_finish = ec_GFp_simple_group_clear_finish,
1586 .group_copy = ec_GFp_nist_group_copy,
1587 .group_set_curve = ec_GFp_nistp521_group_set_curve,
1588 .group_get_curve = ec_GFp_simple_group_get_curve,
1589 .group_get_degree = ec_GFp_simple_group_get_degree,
1590 .group_check_discriminant =
1591 ec_GFp_simple_group_check_discriminant,
1592 .point_init = ec_GFp_simple_point_init,
1593 .point_finish = ec_GFp_simple_point_finish,
1594 .point_clear_finish = ec_GFp_simple_point_clear_finish,
1595 .point_copy = ec_GFp_simple_point_copy,
1596 .point_set_to_infinity = ec_GFp_simple_point_set_to_infinity,
1597 .point_set_Jprojective_coordinates_GFp =
1598 ec_GFp_simple_set_Jprojective_coordinates_GFp,
1599 .point_get_Jprojective_coordinates_GFp =
1600 ec_GFp_simple_get_Jprojective_coordinates_GFp,
1601 .point_set_affine_coordinates =
1602 ec_GFp_simple_point_set_affine_coordinates,
1603 .point_get_affine_coordinates =
1604 ec_GFp_nistp521_point_get_affine_coordinates,
1605 .add = ec_GFp_simple_add,
1606 .dbl = ec_GFp_simple_dbl,
1607 .invert = ec_GFp_simple_invert,
1608 .is_at_infinity = ec_GFp_simple_is_at_infinity,
1609 .is_on_curve = ec_GFp_simple_is_on_curve,
1610 .point_cmp = ec_GFp_simple_cmp,
1611 .make_affine = ec_GFp_simple_make_affine,
1612 .points_make_affine = ec_GFp_simple_points_make_affine,
1613 .mul = ec_GFp_nistp521_points_mul,
1614 .precompute_mult = ec_GFp_nistp521_precompute_mult,
1615 .have_precompute_mult = ec_GFp_nistp521_have_precompute_mult,
1616 .field_mul = ec_GFp_nist_field_mul,
1617 .field_sqr = ec_GFp_nist_field_sqr
1620 return &ret;
1624 /******************************************************************************/
1625 /* FUNCTIONS TO MANAGE PRECOMPUTATION
1628 static NISTP521_PRE_COMP *
1629 nistp521_pre_comp_new()
1631 NISTP521_PRE_COMP *ret = NULL;
1632 ret = malloc(sizeof(NISTP521_PRE_COMP));
1633 if (!ret) {
1634 ECerror(ERR_R_MALLOC_FAILURE);
1635 return ret;
1637 memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1638 ret->references = 1;
1639 return ret;
1642 static void *
1643 nistp521_pre_comp_dup(void *src_)
1645 NISTP521_PRE_COMP *src = src_;
1647 /* no need to actually copy, these objects never change! */
1648 CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1650 return src_;
1653 static void
1654 nistp521_pre_comp_free(void *pre_)
1656 int i;
1657 NISTP521_PRE_COMP *pre = pre_;
1659 if (!pre)
1660 return;
1662 i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1663 if (i > 0)
1664 return;
1666 free(pre);
1669 static void
1670 nistp521_pre_comp_clear_free(void *pre_)
1672 int i;
1673 NISTP521_PRE_COMP *pre = pre_;
1675 if (!pre)
1676 return;
1678 i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1679 if (i > 0)
1680 return;
1682 explicit_bzero(pre, sizeof(*pre));
1683 free(pre);
1686 /******************************************************************************/
1687 /* OPENSSL EC_METHOD FUNCTIONS
1690 int
1691 ec_GFp_nistp521_group_init(EC_GROUP * group)
1693 int ret;
1694 ret = ec_GFp_simple_group_init(group);
1695 group->a_is_minus3 = 1;
1696 return ret;
1699 int
1700 ec_GFp_nistp521_group_set_curve(EC_GROUP * group, const BIGNUM * p,
1701 const BIGNUM * a, const BIGNUM * b, BN_CTX * ctx)
1703 int ret = 0;
1704 BN_CTX *new_ctx = NULL;
1705 BIGNUM *curve_p, *curve_a, *curve_b;
1707 if (ctx == NULL)
1708 if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1709 return 0;
1710 BN_CTX_start(ctx);
1711 if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1712 ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1713 ((curve_b = BN_CTX_get(ctx)) == NULL))
1714 goto err;
1715 BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1716 BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1717 BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1718 if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) ||
1719 (BN_cmp(curve_b, b))) {
1720 ECerror(EC_R_WRONG_CURVE_PARAMETERS);
1721 goto err;
1723 group->field_mod_func = BN_nist_mod_521;
1724 ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1725 err:
1726 BN_CTX_end(ctx);
1727 BN_CTX_free(new_ctx);
1728 return ret;
1731 /* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
1732 * (X', Y') = (X/Z^2, Y/Z^3) */
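/* Only a single field inversion is needed: z2 = Z^-1 is computed once,
 * squared to get Z^-2 for X' and multiplied by its square to get Z^-3
 * for Y'. */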
1733 int
1734 ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP * group,
1735 const EC_POINT * point, BIGNUM * x, BIGNUM * y, BN_CTX * ctx)
1737 felem z1, z2, x_in, y_in, x_out, y_out;
1738 largefelem tmp;
1740 if (EC_POINT_is_at_infinity(group, point) > 0) {
1741 ECerror(EC_R_POINT_AT_INFINITY);
1742 return 0;
1744 if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1745 (!BN_to_felem(z1, &point->Z)))
1746 return 0;
1747 felem_inv(z2, z1);
1748 felem_square(tmp, z2);
1749 felem_reduce(z1, tmp);
1750 felem_mul(tmp, x_in, z1);
1751 felem_reduce(x_in, tmp);
1752 felem_contract(x_out, x_in);
1753 if (x != NULL) {
1754 if (!felem_to_BN(x, x_out)) {
1755 ECerror(ERR_R_BN_LIB);
1756 return 0;
1759 felem_mul(tmp, z1, z2);
1760 felem_reduce(z1, tmp);
1761 felem_mul(tmp, y_in, z1);
1762 felem_reduce(y_in, tmp);
1763 felem_contract(y_out, y_in);
1764 if (y != NULL) {
1765 if (!felem_to_BN(y, y_out)) {
1766 ECerror(ERR_R_BN_LIB);
1767 return 0;
1770 return 1;
1773 static void
1774 make_points_affine(size_t num, felem points[ /* num */ ][3], felem tmp_felems[ /* num+1 */ ])
1777 * Runs in constant time, unless an input is the point at infinity
1778 * (which normally shouldn't happen).
1780 ec_GFp_nistp_points_make_affine_internal(
1781 num,
1782 points,
1783 sizeof(felem),
1784 tmp_felems,
1785 (void (*) (void *)) felem_one,
1786 (int (*) (const void *)) felem_is_zero_int,
1787 (void (*) (void *, const void *)) felem_assign,
1788 (void (*) (void *, const void *)) felem_square_reduce,
1789 (void (*) (void *, const void *, const void *)) felem_mul_reduce,
1790 (void (*) (void *, const void *)) felem_inv,
1791 (void (*) (void *, const void *)) felem_contract);
/* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values
 * Result is stored in r (r can equal one of the inputs). */
int
ec_GFp_nistp521_points_mul(const EC_GROUP * group, EC_POINT * r,
    const BIGNUM * scalar, size_t num, const EC_POINT * points[],
    const BIGNUM * scalars[], BN_CTX * ctx)
{
    int ret = 0;
    int j;
    int mixed = 0;
    BN_CTX *new_ctx = NULL;
    BIGNUM *x, *y, *z, *tmp_scalar;
    felem_bytearray g_secret;
    felem_bytearray *secrets = NULL;
    felem(*pre_comp)[17][3] = NULL;
    felem *tmp_felems = NULL;
    felem_bytearray tmp;
    unsigned i, num_bytes;
    int have_pre_comp = 0;
    size_t num_points = num;
    felem x_in, y_in, z_in, x_out, y_out, z_out;
    NISTP521_PRE_COMP *pre = NULL;
    felem(*g_pre_comp)[3] = NULL;
    EC_POINT *generator = NULL;
    const EC_POINT *p = NULL;
    const BIGNUM *p_scalar = NULL;

    if (ctx == NULL)
        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
            return 0;
    BN_CTX_start(ctx);
    if (((x = BN_CTX_get(ctx)) == NULL) ||
        ((y = BN_CTX_get(ctx)) == NULL) ||
        ((z = BN_CTX_get(ctx)) == NULL) ||
        ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
        goto err;

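    /*
     * If a generator scalar was supplied, look for usable generator
     * precomputation: either a per-group table stored by
     * ec_GFp_nistp521_precompute_mult(), or the static gmul table.  The
     * table is only trusted if its G entry matches group->generator;
     * otherwise the generator is treated like any other input point.
     */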
    if (scalar != NULL) {
        pre = EC_EX_DATA_get_data(group->extra_data,
            nistp521_pre_comp_dup, nistp521_pre_comp_free,
            nistp521_pre_comp_clear_free);
        if (pre)
            /* we have precomputation, try to use it */
            g_pre_comp = &pre->g_pre_comp[0];
        else
            /* try to use the standard precomputation */
            g_pre_comp = (felem(*)[3]) gmul;
        generator = EC_POINT_new(group);
        if (generator == NULL)
            goto err;
        /* get the generator from precomputation */
        if (!felem_to_BN(x, g_pre_comp[1][0]) ||
            !felem_to_BN(y, g_pre_comp[1][1]) ||
            !felem_to_BN(z, g_pre_comp[1][2])) {
            ECerror(ERR_R_BN_LIB);
            goto err;
        }
        if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
            generator, x, y, z, ctx))
            goto err;
        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
            /* precomputation matches generator */
            have_pre_comp = 1;
        else
            /*
             * we don't have valid precomputation: treat the
             * generator as a random point
             */
            num_points++;
    }
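    /*
     * For every input point P, build a window table pre_comp[i][j] = j*P
     * for j = 0..16 (entry 0 stays the point at infinity from calloc).
     * With two or more points the tables are converted to affine so that
     * batch_mul() can use cheaper mixed additions.
     */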
    if (num_points > 0) {
        if (num_points >= 2) {
            /*
             * unless we precompute multiples for just one point,
             * converting those into affine form is time well
             * spent
             */
            mixed = 1;
        }
        secrets = calloc(num_points, sizeof(felem_bytearray));
        pre_comp = calloc(num_points, 17 * 3 * sizeof(felem));
        if (mixed) {
            /* XXX should do more int overflow checking */
            tmp_felems = reallocarray(NULL,
                (num_points * 17 + 1), sizeof(felem));
        }
        if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_felems == NULL))) {
            ECerror(ERR_R_MALLOC_FAILURE);
            goto err;
        }
        /*
         * we treat NULL scalars as 0, and NULL points as points at
         * infinity, i.e., they contribute nothing to the linear
         * combination
         */
        for (i = 0; i < num_points; ++i) {
            if (i == num)
                /*
                 * we didn't have a valid precomputation, so
                 * we pick the generator
                 */
            {
                p = EC_GROUP_get0_generator(group);
                p_scalar = scalar;
            } else
                /* the i^th point */
            {
                p = points[i];
                p_scalar = scalars[i];
            }
            if ((p_scalar != NULL) && (p != NULL)) {
                /* reduce scalar to 0 <= scalar < 2^521 */
                if ((BN_num_bits(p_scalar) > 521) || (BN_is_negative(p_scalar))) {
                    /*
                     * this is an unusual input, and we
                     * don't guarantee constant-timeness
                     */
                    if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
                        ECerror(ERR_R_BN_LIB);
                        goto err;
                    }
                    num_bytes = BN_bn2bin(tmp_scalar, tmp);
                } else
                    num_bytes = BN_bn2bin(p_scalar, tmp);
                flip_endian(secrets[i], tmp, num_bytes);
                /* precompute multiples */
                if ((!BN_to_felem(x_out, &p->X)) ||
                    (!BN_to_felem(y_out, &p->Y)) ||
                    (!BN_to_felem(z_out, &p->Z)))
                    goto err;
                memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
                memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
                memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
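                /*
                 * Fill in j*P for j = 2..16: even entries by doubling
                 * (j/2)*P, odd entries by adding P to (j-1)*P.
                 */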
                for (j = 2; j <= 16; ++j) {
                    if (j & 1) {
                        point_add(
                            pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
                            pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2],
                            0, pre_comp[i][j - 1][0], pre_comp[i][j - 1][1], pre_comp[i][j - 1][2]);
                    } else {
                        point_double(
                            pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
                            pre_comp[i][j / 2][0], pre_comp[i][j / 2][1], pre_comp[i][j / 2][2]);
                    }
                }
            }
        }
        if (mixed)
            make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
    }
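    /*
     * All tables are in place: batch_mul() performs the interleaved
     * multi-scalar multiplication.  The generator scalar takes the comb
     * table path only when have_pre_comp is set; the Jacobian result is
     * then contracted and converted back to BIGNUMs.
     */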
    /* the scalar for the generator */
    if ((scalar != NULL) && (have_pre_comp)) {
        memset(g_secret, 0, sizeof(g_secret));
        /* reduce scalar to 0 <= scalar < 2^521 */
        if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
            /*
             * this is an unusual input, and we don't guarantee
             * constant-timeness
             */
            if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
                ECerror(ERR_R_BN_LIB);
                goto err;
            }
            num_bytes = BN_bn2bin(tmp_scalar, tmp);
        } else
            num_bytes = BN_bn2bin(scalar, tmp);
        flip_endian(g_secret, tmp, num_bytes);
        /* do the multiplication with generator precomputation */
        batch_mul(x_out, y_out, z_out,
            (const felem_bytearray(*)) secrets, num_points,
            g_secret,
            mixed, (const felem(*)[17][3]) pre_comp,
            (const felem(*)[3]) g_pre_comp);
    } else
        /* do the multiplication without generator precomputation */
        batch_mul(x_out, y_out, z_out,
            (const felem_bytearray(*)) secrets, num_points,
            NULL, mixed, (const felem(*)[17][3]) pre_comp, NULL);
    /* reduce the output to its unique minimal representation */
    felem_contract(x_in, x_out);
    felem_contract(y_in, y_out);
    felem_contract(z_in, z_out);
    if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
        (!felem_to_BN(z, z_in))) {
        ECerror(ERR_R_BN_LIB);
        goto err;
    }
    ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);

err:
    BN_CTX_end(ctx);
    EC_POINT_free(generator);
    BN_CTX_free(new_ctx);
    free(secrets);
    free(pre_comp);
    free(tmp_felems);
    return ret;
}
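/*
 * Build and store generator precomputation for the group: a 16-entry comb
 * table containing every sum of the four points G, 2^130*G, 2^260*G and
 * 2^390*G, in the format batch_mul() expects.
 */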
int
ec_GFp_nistp521_precompute_mult(EC_GROUP * group, BN_CTX * ctx)
{
    int ret = 0;
    NISTP521_PRE_COMP *pre = NULL;
    int i, j;
    BN_CTX *new_ctx = NULL;
    BIGNUM *x, *y;
    EC_POINT *generator = NULL;
    felem tmp_felems[16];

    /* throw away old precomputation */
    EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
        nistp521_pre_comp_free, nistp521_pre_comp_clear_free);
    if (ctx == NULL)
        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
            return 0;
    BN_CTX_start(ctx);
    if (((x = BN_CTX_get(ctx)) == NULL) ||
        ((y = BN_CTX_get(ctx)) == NULL))
        goto err;
    /* get the generator */
    if (group->generator == NULL)
        goto err;
    generator = EC_POINT_new(group);
    if (generator == NULL)
        goto err;
    BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
    BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
    if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
        goto err;
    if ((pre = nistp521_pre_comp_new()) == NULL)
        goto err;
    /* if the generator is the standard one, use built-in precomputation */
    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
        ret = 1;
        goto err;
    }
    if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
        (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
        (!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
        goto err;
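    /*
     * The 130-bit spacing matches how batch_mul() walks the generator
     * scalar: 521 = 4*130 + 1, so bits i, i+130, i+260 and i+390 are
     * looked up together in the comb table.
     */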
    /* compute 2^130*G, 2^260*G, 2^390*G */
    for (i = 1; i <= 4; i <<= 1) {
        point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
            pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
            pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
        for (j = 0; j < 129; ++j) {
            point_double(pre->g_pre_comp[2 * i][0],
                pre->g_pre_comp[2 * i][1],
                pre->g_pre_comp[2 * i][2],
                pre->g_pre_comp[2 * i][0],
                pre->g_pre_comp[2 * i][1],
                pre->g_pre_comp[2 * i][2]);
        }
    }
    /* g_pre_comp[0] is the point at infinity */
    memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
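    /*
     * Table layout: g_pre_comp[i] = b0*G + b1*2^130*G + b2*2^260*G +
     * b3*2^390*G, where b3 b2 b1 b0 is the binary expansion of i.
     */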
    /* the remaining multiples */
    /* 2^130*G + 2^260*G */
    point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
        pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
        pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
        0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
        pre->g_pre_comp[2][2]);
    /* 2^130*G + 2^390*G */
    point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
        pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
        pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
        0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
        pre->g_pre_comp[2][2]);
    /* 2^260*G + 2^390*G */
    point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
        pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
        pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
        0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
        pre->g_pre_comp[4][2]);
    /* 2^130*G + 2^260*G + 2^390*G */
    point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
        pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
        pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
        0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
        pre->g_pre_comp[2][2]);
    for (i = 1; i < 8; ++i) {
        /* odd multiples: add G */
        point_add(pre->g_pre_comp[2 * i + 1][0], pre->g_pre_comp[2 * i + 1][1],
            pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
            pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2],
            0, pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
            pre->g_pre_comp[1][2]);
    }
    make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);

    if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
        nistp521_pre_comp_free, nistp521_pre_comp_clear_free))
        goto err;
    ret = 1;
    pre = NULL;
err:
    BN_CTX_end(ctx);
    EC_POINT_free(generator);
    BN_CTX_free(new_ctx);
    nistp521_pre_comp_free(pre);
    return ret;
}
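/*
 * Reports whether generator precomputation has been stored for this group,
 * i.e. whether ec_GFp_nistp521_precompute_mult() has been run on it.
 */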
int
ec_GFp_nistp521_have_precompute_mult(const EC_GROUP * group)
{
    if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
        nistp521_pre_comp_free, nistp521_pre_comp_clear_free)
        != NULL)
        return 1;
    else
        return 0;
}

#endif