1 /* $OpenBSD: ecp_nistp521.c,v 1.18 2017/01/29 17:49:23 beck Exp $ */
2 /*
3 * Written by Adam Langley (Google) for the OpenSSL project
4 */
5 /*
6 * Copyright (c) 2011 Google Inc.
8 * Permission to use, copy, modify, and distribute this software for any
9 * purpose with or without fee is hereby granted, provided that the above
10 * copyright notice and this permission notice appear in all copies.
12 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
13 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
14 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
15 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
16 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
17 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
18 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22 * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
24 * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25 * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26 * work which got its smarts from Daniel J. Bernstein's work on the same.
29 #include <stdint.h>
30 #include <string.h>
32 #include <openssl/opensslconf.h>
34 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
36 #include <openssl/err.h>
37 #include "ec_lcl.h"
39 #if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
40 /* even with gcc, the typedef won't work for 32-bit platforms */
41 typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */
42 #else
43 #error "Need GCC 3.1 or later to define type uint128_t"
44 #endif
46 typedef uint8_t u8;
47 typedef uint64_t u64;
48 typedef int64_t s64;
50 /* The underlying field.
52 * P521 operates over GF(2^521-1). We can serialise an element of this field
53 * into 66 bytes where the most significant byte contains only a single bit. We
54 * call this an felem_bytearray. */
56 typedef u8 felem_bytearray[66];
58 /* These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
59 * These values are big-endian. */
60 static const felem_bytearray nistp521_curve_params[5] =
62 {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
63 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
64 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
65 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
66 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
67 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
68 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
69 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
70 0xff, 0xff},
71 {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
72 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
73 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
74 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
75 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
76 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
77 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
78 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
79 0xff, 0xfc},
80 {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
81 0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
82 0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
83 0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
84 0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
85 0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
86 0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
87 0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
88 0x3f, 0x00},
89 {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
90 0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
91 0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
92 0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
93 0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
94 0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
95 0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
96 0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
97 0xbd, 0x66},
98 {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
99 0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
100 0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
101 0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
102 0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
103 0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
104 0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
105 0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
106 0x66, 0x50}
109 /* The representation of field elements.
110 * ------------------------------------
112 * We represent field elements with nine values. These values are either 64 or
113 * 128 bits and the field element represented is:
114 * v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464 (mod p)
115 * Each of the nine values is called a 'limb'. Since the limbs are spaced only
116 * 58 bits apart, but are greater than 58 bits in length, the most significant
117 * bits of each limb overlap with the least significant bits of the next.
119 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
120 * 'largefelem' */
122 #define NLIMBS 9
124 typedef uint64_t limb;
125 typedef limb felem[NLIMBS];
126 typedef uint128_t largefelem[NLIMBS];
128 static const limb bottom57bits = 0x1ffffffffffffff;
129 static const limb bottom58bits = 0x3ffffffffffffff;
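/* For example, the field element 2^58 + 5 can be stored as v[0] = 5,
 * v[1] = 1 and v[2..8] = 0, and p itself is eight limbs equal to
 * bottom58bits followed by one limb equal to bottom57bits (see kPrime
 * further down).  Bit 521 of an element is bit 57 of v[8]
 * (8*58 + 57 = 521), and because 2^521 = 1 (mod p) anything that
 * overflows past it can be folded back into v[0]; felem_is_zero and
 * felem_contract below do exactly that with out[0] += out[8] >> 57. */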
131 /* bin66_to_felem takes a little-endian byte array and converts it into felem
132 * form. This assumes that the CPU is little-endian. */
133 static void
134 bin66_to_felem(felem out, const u8 in[66])
136 out[0] = (*((limb *) & in[0])) & bottom58bits;
137 out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
138 out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
139 out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
140 out[4] = (*((limb *) & in[29])) & bottom58bits;
141 out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
142 out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
143 out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
144 out[8] = (*((limb *) & in[58])) & bottom57bits;
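/* The byte offsets and shift amounts above follow directly from the limb
 * positions: limb i starts at bit 58*i of the serialised value, and
 * writing 58*i = 8*q + r gives the (byte q, shift r) pairs
 *   i : 0   1   2   3   4   5   6   7   8
 *   q : 0   7  14  21  29  36  43  50  58
 *   r : 0   2   4   6   0   2   4   6   0
 * which is the (in[q] >> r) pattern used here and the mirrored
 * (out[q] |= limb << r) pattern in felem_to_bin66 below. */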
147 /* felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
148 * array. This assumes that the CPU is little-endian. */
149 static void
150 felem_to_bin66(u8 out[66], const felem in)
152 memset(out, 0, 66);
153 (*((limb *) & out[0])) = in[0];
154 (*((limb *) & out[7])) |= in[1] << 2;
155 (*((limb *) & out[14])) |= in[2] << 4;
156 (*((limb *) & out[21])) |= in[3] << 6;
157 (*((limb *) & out[29])) = in[4];
158 (*((limb *) & out[36])) |= in[5] << 2;
159 (*((limb *) & out[43])) |= in[6] << 4;
160 (*((limb *) & out[50])) |= in[7] << 6;
161 (*((limb *) & out[58])) = in[8];
164 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
165 static void
166 flip_endian(u8 * out, const u8 * in, unsigned len)
168 unsigned i;
169 for (i = 0; i < len; ++i)
170 out[i] = in[len - 1 - i];
173 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
174 static int
175 BN_to_felem(felem out, const BIGNUM * bn)
177 felem_bytearray b_in;
178 felem_bytearray b_out;
179 unsigned num_bytes;
181 /* BN_bn2bin eats leading zeroes */
182 memset(b_out, 0, sizeof b_out);
183 num_bytes = BN_num_bytes(bn);
184 if (num_bytes > sizeof b_out) {
185 ECerror(EC_R_BIGNUM_OUT_OF_RANGE);
186 return 0;
188 if (BN_is_negative(bn)) {
189 ECerror(EC_R_BIGNUM_OUT_OF_RANGE);
190 return 0;
192 num_bytes = BN_bn2bin(bn, b_in);
193 flip_endian(b_out, b_in, num_bytes);
194 bin66_to_felem(out, b_out);
195 return 1;
198 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
199 static BIGNUM *
200 felem_to_BN(BIGNUM * out, const felem in)
202 felem_bytearray b_in, b_out;
203 felem_to_bin66(b_in, in);
204 flip_endian(b_out, b_in, sizeof b_out);
205 return BN_bin2bn(b_out, sizeof b_out, out);
209 /* Field operations
210 * ---------------- */
212 static void
213 felem_one(felem out)
215 out[0] = 1;
216 out[1] = 0;
217 out[2] = 0;
218 out[3] = 0;
219 out[4] = 0;
220 out[5] = 0;
221 out[6] = 0;
222 out[7] = 0;
223 out[8] = 0;
226 static void
227 felem_assign(felem out, const felem in)
229 out[0] = in[0];
230 out[1] = in[1];
231 out[2] = in[2];
232 out[3] = in[3];
233 out[4] = in[4];
234 out[5] = in[5];
235 out[6] = in[6];
236 out[7] = in[7];
237 out[8] = in[8];
240 /* felem_sum64 sets out = out + in. */
241 static void
242 felem_sum64(felem out, const felem in)
244 out[0] += in[0];
245 out[1] += in[1];
246 out[2] += in[2];
247 out[3] += in[3];
248 out[4] += in[4];
249 out[5] += in[5];
250 out[6] += in[6];
251 out[7] += in[7];
252 out[8] += in[8];
255 /* felem_scalar sets out = in * scalar */
256 static void
257 felem_scalar(felem out, const felem in, limb scalar)
259 out[0] = in[0] * scalar;
260 out[1] = in[1] * scalar;
261 out[2] = in[2] * scalar;
262 out[3] = in[3] * scalar;
263 out[4] = in[4] * scalar;
264 out[5] = in[5] * scalar;
265 out[6] = in[6] * scalar;
266 out[7] = in[7] * scalar;
267 out[8] = in[8] * scalar;
270 /* felem_scalar64 sets out = out * scalar */
271 static void
272 felem_scalar64(felem out, limb scalar)
274 out[0] *= scalar;
275 out[1] *= scalar;
276 out[2] *= scalar;
277 out[3] *= scalar;
278 out[4] *= scalar;
279 out[5] *= scalar;
280 out[6] *= scalar;
281 out[7] *= scalar;
282 out[8] *= scalar;
285 /* felem_scalar128 sets out = out * scalar */
286 static void
287 felem_scalar128(largefelem out, limb scalar)
289 out[0] *= scalar;
290 out[1] *= scalar;
291 out[2] *= scalar;
292 out[3] *= scalar;
293 out[4] *= scalar;
294 out[5] *= scalar;
295 out[6] *= scalar;
296 out[7] *= scalar;
297 out[8] *= scalar;
300 /* felem_neg sets |out| to |-in|
301 * On entry:
302 * in[i] < 2^59 + 2^14
303 * On exit:
304 * out[i] < 2^62
306 static void
307 felem_neg(felem out, const felem in)
309 /* In order to prevent underflow, we subtract from 0 mod p. */
310 static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
311 static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
313 out[0] = two62m3 - in[0];
314 out[1] = two62m2 - in[1];
315 out[2] = two62m2 - in[2];
316 out[3] = two62m2 - in[3];
317 out[4] = two62m2 - in[4];
318 out[5] = two62m2 - in[5];
319 out[6] = two62m2 - in[6];
320 out[7] = two62m2 - in[7];
321 out[8] = two62m2 - in[8];
324 /* felem_diff64 subtracts |in| from |out|
325 * On entry:
326 * in[i] < 2^59 + 2^14
327 * On exit:
328 * out[i] < out[i] + 2^62
330 static void
331 felem_diff64(felem out, const felem in)
333 /* In order to prevent underflow, we add 0 mod p before subtracting. */
334 static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
335 static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
337 out[0] += two62m3 - in[0];
338 out[1] += two62m2 - in[1];
339 out[2] += two62m2 - in[2];
340 out[3] += two62m2 - in[3];
341 out[4] += two62m2 - in[4];
342 out[5] += two62m2 - in[5];
343 out[6] += two62m2 - in[6];
344 out[7] += two62m2 - in[7];
345 out[8] += two62m2 - in[8];
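/* One way to check that the constants used by felem_neg and felem_diff64
 * really are 0 mod p: summed at their limb positions they give
 *   (2^62 - 2^5) + sum_{i=1..8} (2^62 - 2^4)*2^(58*i)
 * and each 2^4*2^(58*i) term cancels the 2^62*2^(58*(i-1)) term below it
 * (2^(4+58*i) = 2^(62+58*(i-1))), leaving 2^526 - 2^5 =
 * 2^5 * (2^521 - 1) = 32*p.  felem_diff128 further down adds the
 * analogous encoding of 2^70 * p. */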
348 /* felem_diff_128_64 subtracts |in| from |out|
349 * On entry:
350 * in[i] < 2^62 + 2^17
351 * On exit:
352 * out[i] < out[i] + 2^63
354 static void
355 felem_diff_128_64(largefelem out, const felem in)
357 /* In order to prevent underflow, we add 0 mod p before subtracting. */
 358 static const limb two63m6 = (((limb) 1) << 63) - (((limb) 1) << 6);
 359 static const limb two63m5 = (((limb) 1) << 63) - (((limb) 1) << 5);
361 out[0] += two63m6 - in[0];
362 out[1] += two63m5 - in[1];
363 out[2] += two63m5 - in[2];
364 out[3] += two63m5 - in[3];
365 out[4] += two63m5 - in[4];
366 out[5] += two63m5 - in[5];
367 out[6] += two63m5 - in[6];
368 out[7] += two63m5 - in[7];
369 out[8] += two63m5 - in[8];
 372 /* felem_diff128 subtracts |in| from |out|
373 * On entry:
374 * in[i] < 2^126
375 * On exit:
376 * out[i] < out[i] + 2^127 - 2^69
378 static void
379 felem_diff128(largefelem out, const largefelem in)
381 /* In order to prevent underflow, we add 0 mod p before subtracting. */
382 static const uint128_t two127m70 = (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
383 static const uint128_t two127m69 = (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
385 out[0] += (two127m70 - in[0]);
386 out[1] += (two127m69 - in[1]);
387 out[2] += (two127m69 - in[2]);
388 out[3] += (two127m69 - in[3]);
389 out[4] += (two127m69 - in[4]);
390 out[5] += (two127m69 - in[5]);
391 out[6] += (two127m69 - in[6]);
392 out[7] += (two127m69 - in[7]);
393 out[8] += (two127m69 - in[8]);
396 /* felem_square sets |out| = |in|^2
397 * On entry:
398 * in[i] < 2^62
399 * On exit:
400 * out[i] < 17 * max(in[i]) * max(in[i])
402 static void
403 felem_square(largefelem out, const felem in)
405 felem inx2, inx4;
406 felem_scalar(inx2, in, 2);
407 felem_scalar(inx4, in, 4);
 410 * We have many cases where we want to do in[x] * in[y] + in[y] *
 411 * in[x]. This is obviously just 2 * in[x] * in[y]. However, rather
 412 * than do the doubling on the 128-bit result, we double one of the
 413 * inputs to the multiplication by reading from |inx2|
416 out[0] = ((uint128_t) in[0]) * in[0];
417 out[1] = ((uint128_t) in[0]) * inx2[1];
418 out[2] = ((uint128_t) in[0]) * inx2[2] +
419 ((uint128_t) in[1]) * in[1];
420 out[3] = ((uint128_t) in[0]) * inx2[3] +
421 ((uint128_t) in[1]) * inx2[2];
422 out[4] = ((uint128_t) in[0]) * inx2[4] +
423 ((uint128_t) in[1]) * inx2[3] +
424 ((uint128_t) in[2]) * in[2];
425 out[5] = ((uint128_t) in[0]) * inx2[5] +
426 ((uint128_t) in[1]) * inx2[4] +
427 ((uint128_t) in[2]) * inx2[3];
428 out[6] = ((uint128_t) in[0]) * inx2[6] +
429 ((uint128_t) in[1]) * inx2[5] +
430 ((uint128_t) in[2]) * inx2[4] +
431 ((uint128_t) in[3]) * in[3];
432 out[7] = ((uint128_t) in[0]) * inx2[7] +
433 ((uint128_t) in[1]) * inx2[6] +
434 ((uint128_t) in[2]) * inx2[5] +
435 ((uint128_t) in[3]) * inx2[4];
436 out[8] = ((uint128_t) in[0]) * inx2[8] +
437 ((uint128_t) in[1]) * inx2[7] +
438 ((uint128_t) in[2]) * inx2[6] +
439 ((uint128_t) in[3]) * inx2[5] +
440 ((uint128_t) in[4]) * in[4];
443 * The remaining limbs fall above 2^521, with the first falling at
444 * 2^522. They correspond to locations one bit up from the limbs
445 * produced above so we would have to multiply by two to align them.
446 * Again, rather than operate on the 128-bit result, we double one of
447 * the inputs to the multiplication. If we want to double for both
448 * this reason, and the reason above, then we end up multiplying by
449 * four.
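/* For instance, the i+j == 9 products sit at 2^(58*9) = 2^522, and since
 * 2^521 = 1 (mod p) they are worth twice their value at 2^0, i.e. they
 * belong in out[0] doubled.  Together with the cross-term doubling
 * described above this gives the factor of four (|inx4|) for terms like
 * in[1]*in[8], while diagonal terms such as in[5]*in[5] in the
 * i+j == 10 row only need the single alignment doubling (|inx2|). */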
452 /* 9 */
453 out[0] += ((uint128_t) in[1]) * inx4[8] +
454 ((uint128_t) in[2]) * inx4[7] +
455 ((uint128_t) in[3]) * inx4[6] +
456 ((uint128_t) in[4]) * inx4[5];
458 /* 10 */
459 out[1] += ((uint128_t) in[2]) * inx4[8] +
460 ((uint128_t) in[3]) * inx4[7] +
461 ((uint128_t) in[4]) * inx4[6] +
462 ((uint128_t) in[5]) * inx2[5];
464 /* 11 */
465 out[2] += ((uint128_t) in[3]) * inx4[8] +
466 ((uint128_t) in[4]) * inx4[7] +
467 ((uint128_t) in[5]) * inx4[6];
469 /* 12 */
470 out[3] += ((uint128_t) in[4]) * inx4[8] +
471 ((uint128_t) in[5]) * inx4[7] +
472 ((uint128_t) in[6]) * inx2[6];
474 /* 13 */
475 out[4] += ((uint128_t) in[5]) * inx4[8] +
476 ((uint128_t) in[6]) * inx4[7];
478 /* 14 */
479 out[5] += ((uint128_t) in[6]) * inx4[8] +
480 ((uint128_t) in[7]) * inx2[7];
482 /* 15 */
483 out[6] += ((uint128_t) in[7]) * inx4[8];
485 /* 16 */
486 out[7] += ((uint128_t) in[8]) * inx2[8];
489 /* felem_mul sets |out| = |in1| * |in2|
490 * On entry:
491 * in1[i] < 2^64
492 * in2[i] < 2^63
493 * On exit:
494 * out[i] < 17 * max(in1[i]) * max(in2[i])
496 static void
497 felem_mul(largefelem out, const felem in1, const felem in2)
499 felem in2x2;
500 felem_scalar(in2x2, in2, 2);
502 out[0] = ((uint128_t) in1[0]) * in2[0];
504 out[1] = ((uint128_t) in1[0]) * in2[1] +
505 ((uint128_t) in1[1]) * in2[0];
507 out[2] = ((uint128_t) in1[0]) * in2[2] +
508 ((uint128_t) in1[1]) * in2[1] +
509 ((uint128_t) in1[2]) * in2[0];
511 out[3] = ((uint128_t) in1[0]) * in2[3] +
512 ((uint128_t) in1[1]) * in2[2] +
513 ((uint128_t) in1[2]) * in2[1] +
514 ((uint128_t) in1[3]) * in2[0];
516 out[4] = ((uint128_t) in1[0]) * in2[4] +
517 ((uint128_t) in1[1]) * in2[3] +
518 ((uint128_t) in1[2]) * in2[2] +
519 ((uint128_t) in1[3]) * in2[1] +
520 ((uint128_t) in1[4]) * in2[0];
522 out[5] = ((uint128_t) in1[0]) * in2[5] +
523 ((uint128_t) in1[1]) * in2[4] +
524 ((uint128_t) in1[2]) * in2[3] +
525 ((uint128_t) in1[3]) * in2[2] +
526 ((uint128_t) in1[4]) * in2[1] +
527 ((uint128_t) in1[5]) * in2[0];
529 out[6] = ((uint128_t) in1[0]) * in2[6] +
530 ((uint128_t) in1[1]) * in2[5] +
531 ((uint128_t) in1[2]) * in2[4] +
532 ((uint128_t) in1[3]) * in2[3] +
533 ((uint128_t) in1[4]) * in2[2] +
534 ((uint128_t) in1[5]) * in2[1] +
535 ((uint128_t) in1[6]) * in2[0];
537 out[7] = ((uint128_t) in1[0]) * in2[7] +
538 ((uint128_t) in1[1]) * in2[6] +
539 ((uint128_t) in1[2]) * in2[5] +
540 ((uint128_t) in1[3]) * in2[4] +
541 ((uint128_t) in1[4]) * in2[3] +
542 ((uint128_t) in1[5]) * in2[2] +
543 ((uint128_t) in1[6]) * in2[1] +
544 ((uint128_t) in1[7]) * in2[0];
546 out[8] = ((uint128_t) in1[0]) * in2[8] +
547 ((uint128_t) in1[1]) * in2[7] +
548 ((uint128_t) in1[2]) * in2[6] +
549 ((uint128_t) in1[3]) * in2[5] +
550 ((uint128_t) in1[4]) * in2[4] +
551 ((uint128_t) in1[5]) * in2[3] +
552 ((uint128_t) in1[6]) * in2[2] +
553 ((uint128_t) in1[7]) * in2[1] +
554 ((uint128_t) in1[8]) * in2[0];
556 /* See comment in felem_square about the use of in2x2 here */
558 out[0] += ((uint128_t) in1[1]) * in2x2[8] +
559 ((uint128_t) in1[2]) * in2x2[7] +
560 ((uint128_t) in1[3]) * in2x2[6] +
561 ((uint128_t) in1[4]) * in2x2[5] +
562 ((uint128_t) in1[5]) * in2x2[4] +
563 ((uint128_t) in1[6]) * in2x2[3] +
564 ((uint128_t) in1[7]) * in2x2[2] +
565 ((uint128_t) in1[8]) * in2x2[1];
567 out[1] += ((uint128_t) in1[2]) * in2x2[8] +
568 ((uint128_t) in1[3]) * in2x2[7] +
569 ((uint128_t) in1[4]) * in2x2[6] +
570 ((uint128_t) in1[5]) * in2x2[5] +
571 ((uint128_t) in1[6]) * in2x2[4] +
572 ((uint128_t) in1[7]) * in2x2[3] +
573 ((uint128_t) in1[8]) * in2x2[2];
575 out[2] += ((uint128_t) in1[3]) * in2x2[8] +
576 ((uint128_t) in1[4]) * in2x2[7] +
577 ((uint128_t) in1[5]) * in2x2[6] +
578 ((uint128_t) in1[6]) * in2x2[5] +
579 ((uint128_t) in1[7]) * in2x2[4] +
580 ((uint128_t) in1[8]) * in2x2[3];
582 out[3] += ((uint128_t) in1[4]) * in2x2[8] +
583 ((uint128_t) in1[5]) * in2x2[7] +
584 ((uint128_t) in1[6]) * in2x2[6] +
585 ((uint128_t) in1[7]) * in2x2[5] +
586 ((uint128_t) in1[8]) * in2x2[4];
588 out[4] += ((uint128_t) in1[5]) * in2x2[8] +
589 ((uint128_t) in1[6]) * in2x2[7] +
590 ((uint128_t) in1[7]) * in2x2[6] +
591 ((uint128_t) in1[8]) * in2x2[5];
593 out[5] += ((uint128_t) in1[6]) * in2x2[8] +
594 ((uint128_t) in1[7]) * in2x2[7] +
595 ((uint128_t) in1[8]) * in2x2[6];
597 out[6] += ((uint128_t) in1[7]) * in2x2[8] +
598 ((uint128_t) in1[8]) * in2x2[7];
600 out[7] += ((uint128_t) in1[8]) * in2x2[8];
603 static const limb bottom52bits = 0xfffffffffffff;
605 /* felem_reduce converts a largefelem to an felem.
606 * On entry:
607 * in[i] < 2^128
608 * On exit:
609 * out[i] < 2^59 + 2^14
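/* Roughly, the code below works limb by limb: the low 64 bits of in[i]
 * contribute their bottom 58 bits to out[i] and their top 6 bits to
 * out[i+1]; the high 64 bits of in[i] sit at 2^(64+58*i) =
 * 2^6 * 2^(58*(i+1)), so their bottom 52 bits enter out[i+1] shifted
 * left by 6 and their top 12 bits enter out[i+2].  Contributions that
 * would land at limb positions 9 and 10 (2^522 and 2^580) are doubled
 * and folded back into out[0] and out[1], using 2^521 = 1 (mod p). */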
611 static void
612 felem_reduce(felem out, const largefelem in)
614 u64 overflow1, overflow2;
616 out[0] = ((limb) in[0]) & bottom58bits;
617 out[1] = ((limb) in[1]) & bottom58bits;
618 out[2] = ((limb) in[2]) & bottom58bits;
619 out[3] = ((limb) in[3]) & bottom58bits;
620 out[4] = ((limb) in[4]) & bottom58bits;
621 out[5] = ((limb) in[5]) & bottom58bits;
622 out[6] = ((limb) in[6]) & bottom58bits;
623 out[7] = ((limb) in[7]) & bottom58bits;
624 out[8] = ((limb) in[8]) & bottom58bits;
626 /* out[i] < 2^58 */
628 out[1] += ((limb) in[0]) >> 58;
629 out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
631 * out[1] < 2^58 + 2^6 + 2^58 = 2^59 + 2^6
633 out[2] += ((limb) (in[0] >> 64)) >> 52;
635 out[2] += ((limb) in[1]) >> 58;
636 out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
637 out[3] += ((limb) (in[1] >> 64)) >> 52;
639 out[3] += ((limb) in[2]) >> 58;
640 out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
641 out[4] += ((limb) (in[2] >> 64)) >> 52;
643 out[4] += ((limb) in[3]) >> 58;
644 out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
645 out[5] += ((limb) (in[3] >> 64)) >> 52;
647 out[5] += ((limb) in[4]) >> 58;
648 out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
649 out[6] += ((limb) (in[4] >> 64)) >> 52;
651 out[6] += ((limb) in[5]) >> 58;
652 out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
653 out[7] += ((limb) (in[5] >> 64)) >> 52;
655 out[7] += ((limb) in[6]) >> 58;
656 out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
657 out[8] += ((limb) (in[6] >> 64)) >> 52;
659 out[8] += ((limb) in[7]) >> 58;
660 out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
662 * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12 < 2^59 + 2^13
664 overflow1 = ((limb) (in[7] >> 64)) >> 52;
666 overflow1 += ((limb) in[8]) >> 58;
667 overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
668 overflow2 = ((limb) (in[8] >> 64)) >> 52;
670 overflow1 <<= 1; /* overflow1 < 2^13 + 2^7 + 2^59 */
671 overflow2 <<= 1; /* overflow2 < 2^13 */
673 out[0] += overflow1; /* out[0] < 2^60 */
674 out[1] += overflow2; /* out[1] < 2^59 + 2^6 + 2^13 */
676 out[1] += out[0] >> 58;
677 out[0] &= bottom58bits;
 679 * out[0] < 2^58; out[1] < 2^59 + 2^6 + 2^13 + 2^2 < 2^59 + 2^14
683 static void
684 felem_square_reduce(felem out, const felem in)
686 largefelem tmp;
687 felem_square(tmp, in);
688 felem_reduce(out, tmp);
691 static void
692 felem_mul_reduce(felem out, const felem in1, const felem in2)
694 largefelem tmp;
695 felem_mul(tmp, in1, in2);
696 felem_reduce(out, tmp);
699 /* felem_inv calculates |out| = |in|^{-1}
701 * Based on Fermat's Little Theorem:
702 * a^p = a (mod p)
703 * a^{p-1} = 1 (mod p)
704 * a^{p-2} = a^{-1} (mod p)
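/* The chain below builds a^(2^k - 1) for k = 2, 4, 8, 16, 32, 64, 128,
 * 256 and 512 by a square-and-multiply ladder (going from 2^k - 1 to
 * 2^(2k) - 1 costs k squarings and one multiplication), keeping
 * a^(2^9 - 2^2) in |ftmp4| along the way.  Nine further squarings give
 * a^(2^521 - 2^9), and the final multiplications by |ftmp4| and |in|
 * produce a^(2^521 - 3) = a^(p-2) = a^(-1). */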
706 static void
707 felem_inv(felem out, const felem in)
709 felem ftmp, ftmp2, ftmp3, ftmp4;
710 largefelem tmp;
711 unsigned i;
713 felem_square(tmp, in);
714 felem_reduce(ftmp, tmp);/* 2^1 */
715 felem_mul(tmp, in, ftmp);
716 felem_reduce(ftmp, tmp);/* 2^2 - 2^0 */
717 felem_assign(ftmp2, ftmp);
718 felem_square(tmp, ftmp);
719 felem_reduce(ftmp, tmp);/* 2^3 - 2^1 */
720 felem_mul(tmp, in, ftmp);
721 felem_reduce(ftmp, tmp);/* 2^3 - 2^0 */
722 felem_square(tmp, ftmp);
723 felem_reduce(ftmp, tmp);/* 2^4 - 2^1 */
725 felem_square(tmp, ftmp2);
726 felem_reduce(ftmp3, tmp); /* 2^3 - 2^1 */
727 felem_square(tmp, ftmp3);
728 felem_reduce(ftmp3, tmp); /* 2^4 - 2^2 */
729 felem_mul(tmp, ftmp3, ftmp2);
730 felem_reduce(ftmp3, tmp); /* 2^4 - 2^0 */
732 felem_assign(ftmp2, ftmp3);
733 felem_square(tmp, ftmp3);
734 felem_reduce(ftmp3, tmp); /* 2^5 - 2^1 */
735 felem_square(tmp, ftmp3);
736 felem_reduce(ftmp3, tmp); /* 2^6 - 2^2 */
737 felem_square(tmp, ftmp3);
738 felem_reduce(ftmp3, tmp); /* 2^7 - 2^3 */
739 felem_square(tmp, ftmp3);
740 felem_reduce(ftmp3, tmp); /* 2^8 - 2^4 */
741 felem_assign(ftmp4, ftmp3);
742 felem_mul(tmp, ftmp3, ftmp);
743 felem_reduce(ftmp4, tmp); /* 2^8 - 2^1 */
744 felem_square(tmp, ftmp4);
745 felem_reduce(ftmp4, tmp); /* 2^9 - 2^2 */
746 felem_mul(tmp, ftmp3, ftmp2);
747 felem_reduce(ftmp3, tmp); /* 2^8 - 2^0 */
748 felem_assign(ftmp2, ftmp3);
750 for (i = 0; i < 8; i++) {
751 felem_square(tmp, ftmp3);
752 felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
754 felem_mul(tmp, ftmp3, ftmp2);
755 felem_reduce(ftmp3, tmp); /* 2^16 - 2^0 */
756 felem_assign(ftmp2, ftmp3);
758 for (i = 0; i < 16; i++) {
759 felem_square(tmp, ftmp3);
760 felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
762 felem_mul(tmp, ftmp3, ftmp2);
763 felem_reduce(ftmp3, tmp); /* 2^32 - 2^0 */
764 felem_assign(ftmp2, ftmp3);
766 for (i = 0; i < 32; i++) {
767 felem_square(tmp, ftmp3);
768 felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
770 felem_mul(tmp, ftmp3, ftmp2);
771 felem_reduce(ftmp3, tmp); /* 2^64 - 2^0 */
772 felem_assign(ftmp2, ftmp3);
774 for (i = 0; i < 64; i++) {
775 felem_square(tmp, ftmp3);
776 felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
778 felem_mul(tmp, ftmp3, ftmp2);
779 felem_reduce(ftmp3, tmp); /* 2^128 - 2^0 */
780 felem_assign(ftmp2, ftmp3);
782 for (i = 0; i < 128; i++) {
783 felem_square(tmp, ftmp3);
784 felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
786 felem_mul(tmp, ftmp3, ftmp2);
787 felem_reduce(ftmp3, tmp); /* 2^256 - 2^0 */
788 felem_assign(ftmp2, ftmp3);
790 for (i = 0; i < 256; i++) {
791 felem_square(tmp, ftmp3);
792 felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
794 felem_mul(tmp, ftmp3, ftmp2);
795 felem_reduce(ftmp3, tmp); /* 2^512 - 2^0 */
797 for (i = 0; i < 9; i++) {
798 felem_square(tmp, ftmp3);
799 felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
801 felem_mul(tmp, ftmp3, ftmp4);
 802 felem_reduce(ftmp3, tmp); /* 2^521 - 2^2 */
803 felem_mul(tmp, ftmp3, in);
 804 felem_reduce(out, tmp); /* 2^521 - 3 */
807 /* This is 2^521-1, expressed as an felem */
808 static const felem kPrime =
810 0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
811 0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
812 0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
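/* Eight limbs of 58 bits plus one limb of 57 bits give 8*58 + 57 = 521
 * bits, so the values above are the minimal representation of p. */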
815 /* felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
816 * otherwise.
817 * On entry:
818 * in[i] < 2^59 + 2^14
820 static limb
821 felem_is_zero(const felem in)
823 felem ftmp;
824 limb is_zero, is_p;
825 felem_assign(ftmp, in);
827 ftmp[0] += ftmp[8] >> 57;
828 ftmp[8] &= bottom57bits;
829 /* ftmp[8] < 2^57 */
830 ftmp[1] += ftmp[0] >> 58;
831 ftmp[0] &= bottom58bits;
832 ftmp[2] += ftmp[1] >> 58;
833 ftmp[1] &= bottom58bits;
834 ftmp[3] += ftmp[2] >> 58;
835 ftmp[2] &= bottom58bits;
836 ftmp[4] += ftmp[3] >> 58;
837 ftmp[3] &= bottom58bits;
838 ftmp[5] += ftmp[4] >> 58;
839 ftmp[4] &= bottom58bits;
840 ftmp[6] += ftmp[5] >> 58;
841 ftmp[5] &= bottom58bits;
842 ftmp[7] += ftmp[6] >> 58;
843 ftmp[6] &= bottom58bits;
844 ftmp[8] += ftmp[7] >> 58;
845 ftmp[7] &= bottom58bits;
846 /* ftmp[8] < 2^57 + 4 */
849 * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is
850 * greater than our bound for ftmp[8]. Therefore we only have to
 851 * check if the value is zero or 2^521-1.
854 is_zero = 0;
855 is_zero |= ftmp[0];
856 is_zero |= ftmp[1];
857 is_zero |= ftmp[2];
858 is_zero |= ftmp[3];
859 is_zero |= ftmp[4];
860 is_zero |= ftmp[5];
861 is_zero |= ftmp[6];
862 is_zero |= ftmp[7];
863 is_zero |= ftmp[8];
865 is_zero--;
867 * We know that ftmp[i] < 2^63, therefore the only way that the top
868 * bit can be set is if is_zero was 0 before the decrement.
870 is_zero = ((s64) is_zero) >> 63;
872 is_p = ftmp[0] ^ kPrime[0];
873 is_p |= ftmp[1] ^ kPrime[1];
874 is_p |= ftmp[2] ^ kPrime[2];
875 is_p |= ftmp[3] ^ kPrime[3];
876 is_p |= ftmp[4] ^ kPrime[4];
877 is_p |= ftmp[5] ^ kPrime[5];
878 is_p |= ftmp[6] ^ kPrime[6];
879 is_p |= ftmp[7] ^ kPrime[7];
880 is_p |= ftmp[8] ^ kPrime[8];
882 is_p--;
883 is_p = ((s64) is_p) >> 63;
885 is_zero |= is_p;
886 return is_zero;
889 static int
890 felem_is_zero_int(const felem in)
892 return (int) (felem_is_zero(in) & ((limb) 1));
895 /* felem_contract converts |in| to its unique, minimal representation.
896 * On entry:
897 * in[i] < 2^59 + 2^14
899 static void
900 felem_contract(felem out, const felem in)
902 limb is_p, is_greater, sign;
903 static const limb two58 = ((limb) 1) << 58;
905 felem_assign(out, in);
907 out[0] += out[8] >> 57;
908 out[8] &= bottom57bits;
909 /* out[8] < 2^57 */
910 out[1] += out[0] >> 58;
911 out[0] &= bottom58bits;
912 out[2] += out[1] >> 58;
913 out[1] &= bottom58bits;
914 out[3] += out[2] >> 58;
915 out[2] &= bottom58bits;
916 out[4] += out[3] >> 58;
917 out[3] &= bottom58bits;
918 out[5] += out[4] >> 58;
919 out[4] &= bottom58bits;
920 out[6] += out[5] >> 58;
921 out[5] &= bottom58bits;
922 out[7] += out[6] >> 58;
923 out[6] &= bottom58bits;
924 out[8] += out[7] >> 58;
925 out[7] &= bottom58bits;
926 /* out[8] < 2^57 + 4 */
929 * If the value is greater than 2^521-1 then we have to subtract
930 * 2^521-1 out. See the comments in felem_is_zero regarding why we
931 * don't test for other multiples of the prime.
935 * First, if |out| is equal to 2^521-1, we subtract it out to get
936 * zero.
939 is_p = out[0] ^ kPrime[0];
940 is_p |= out[1] ^ kPrime[1];
941 is_p |= out[2] ^ kPrime[2];
942 is_p |= out[3] ^ kPrime[3];
943 is_p |= out[4] ^ kPrime[4];
944 is_p |= out[5] ^ kPrime[5];
945 is_p |= out[6] ^ kPrime[6];
946 is_p |= out[7] ^ kPrime[7];
947 is_p |= out[8] ^ kPrime[8];
949 is_p--;
950 is_p &= is_p << 32;
951 is_p &= is_p << 16;
952 is_p &= is_p << 8;
953 is_p &= is_p << 4;
954 is_p &= is_p << 2;
955 is_p &= is_p << 1;
956 is_p = ((s64) is_p) >> 63;
957 is_p = ~is_p;
959 /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
961 out[0] &= is_p;
962 out[1] &= is_p;
963 out[2] &= is_p;
964 out[3] &= is_p;
965 out[4] &= is_p;
966 out[5] &= is_p;
967 out[6] &= is_p;
968 out[7] &= is_p;
969 out[8] &= is_p;
972 * In order to test that |out| >= 2^521-1 we need only test if out[8]
973 * >> 57 is greater than zero as (2^521-1) + x >= 2^522
975 is_greater = out[8] >> 57;
976 is_greater |= is_greater << 32;
977 is_greater |= is_greater << 16;
978 is_greater |= is_greater << 8;
979 is_greater |= is_greater << 4;
980 is_greater |= is_greater << 2;
981 is_greater |= is_greater << 1;
982 is_greater = ((s64) is_greater) >> 63;
984 out[0] -= kPrime[0] & is_greater;
985 out[1] -= kPrime[1] & is_greater;
986 out[2] -= kPrime[2] & is_greater;
987 out[3] -= kPrime[3] & is_greater;
988 out[4] -= kPrime[4] & is_greater;
989 out[5] -= kPrime[5] & is_greater;
990 out[6] -= kPrime[6] & is_greater;
991 out[7] -= kPrime[7] & is_greater;
992 out[8] -= kPrime[8] & is_greater;
994 /* Eliminate negative coefficients */
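/* A limb that went negative shows up as an unsigned value with its top
 * bit set, so sign = -(out[i] >> 63) is all ones exactly when out[i]
 * underflowed; in that case 2^58 is added back to out[i] and one is
 * borrowed from out[i+1], without any data-dependent branch. */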
995 sign = -(out[0] >> 63);
996 out[0] += (two58 & sign);
997 out[1] -= (1 & sign);
998 sign = -(out[1] >> 63);
999 out[1] += (two58 & sign);
1000 out[2] -= (1 & sign);
1001 sign = -(out[2] >> 63);
1002 out[2] += (two58 & sign);
1003 out[3] -= (1 & sign);
1004 sign = -(out[3] >> 63);
1005 out[3] += (two58 & sign);
1006 out[4] -= (1 & sign);
1007 sign = -(out[4] >> 63);
1008 out[4] += (two58 & sign);
1009 out[5] -= (1 & sign);
 1010 sign = -(out[5] >> 63);
1011 out[5] += (two58 & sign);
1012 out[6] -= (1 & sign);
1013 sign = -(out[6] >> 63);
1014 out[6] += (two58 & sign);
1015 out[7] -= (1 & sign);
1016 sign = -(out[7] >> 63);
1017 out[7] += (two58 & sign);
1018 out[8] -= (1 & sign);
1019 sign = -(out[5] >> 63);
1020 out[5] += (two58 & sign);
1021 out[6] -= (1 & sign);
1022 sign = -(out[6] >> 63);
1023 out[6] += (two58 & sign);
1024 out[7] -= (1 & sign);
1025 sign = -(out[7] >> 63);
1026 out[7] += (two58 & sign);
1027 out[8] -= (1 & sign);
1030 /* Group operations
1031 * ----------------
1033 * Building on top of the field operations we have the operations on the
1034 * elliptic curve group itself. Points on the curve are represented in Jacobian
1035 * coordinates */
 1037 /* point_double calculates 2*(x_in, y_in, z_in)
1039 * The method is taken from:
1040 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
 1042 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 1043 * while x_out == y_in is not (maybe this works, but it's not tested). */
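/* In the notation of that page the code computes
 * delta = Z1^2, gamma = Y1^2, beta = X1*gamma,
 * alpha = 3*(X1 - delta)*(X1 + delta),
 * X3 = alpha^2 - 8*beta,
 * Z3 = (Y1 + Z1)^2 - gamma - delta,
 * Y3 = alpha*(4*beta - X3) - 8*gamma^2,
 * tracking limb bounds step by step in the comments below. */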
1044 static void
1045 point_double(felem x_out, felem y_out, felem z_out,
1046 const felem x_in, const felem y_in, const felem z_in)
1048 largefelem tmp, tmp2;
1049 felem delta, gamma, beta, alpha, ftmp, ftmp2;
1051 felem_assign(ftmp, x_in);
1052 felem_assign(ftmp2, x_in);
1054 /* delta = z^2 */
1055 felem_square(tmp, z_in);
1056 felem_reduce(delta, tmp); /* delta[i] < 2^59 + 2^14 */
1058 /* gamma = y^2 */
1059 felem_square(tmp, y_in);
1060 felem_reduce(gamma, tmp); /* gamma[i] < 2^59 + 2^14 */
1062 /* beta = x*gamma */
1063 felem_mul(tmp, x_in, gamma);
1064 felem_reduce(beta, tmp);/* beta[i] < 2^59 + 2^14 */
1066 /* alpha = 3*(x-delta)*(x+delta) */
1067 felem_diff64(ftmp, delta);
1068 /* ftmp[i] < 2^61 */
1069 felem_sum64(ftmp2, delta);
1070 /* ftmp2[i] < 2^60 + 2^15 */
1071 felem_scalar64(ftmp2, 3);
1072 /* ftmp2[i] < 3*2^60 + 3*2^15 */
1073 felem_mul(tmp, ftmp, ftmp2);
 1075 * tmp[i] < 17(3*2^121 + 3*2^76) = 51*2^121 + 51*2^76 < 64*2^121 +
 1076 * 64*2^76 = 2^127 + 2^82 < 2^128
1078 felem_reduce(alpha, tmp);
1080 /* x' = alpha^2 - 8*beta */
1081 felem_square(tmp, alpha);
1083 * tmp[i] < 17*2^120 < 2^125
1085 felem_assign(ftmp, beta);
1086 felem_scalar64(ftmp, 8);
1087 /* ftmp[i] < 2^62 + 2^17 */
1088 felem_diff_128_64(tmp, ftmp);
1089 /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1090 felem_reduce(x_out, tmp);
1092 /* z' = (y + z)^2 - gamma - delta */
1093 felem_sum64(delta, gamma);
1094 /* delta[i] < 2^60 + 2^15 */
1095 felem_assign(ftmp, y_in);
1096 felem_sum64(ftmp, z_in);
1097 /* ftmp[i] < 2^60 + 2^15 */
1098 felem_square(tmp, ftmp);
1100 * tmp[i] < 17(2^122) < 2^127
1102 felem_diff_128_64(tmp, delta);
1103 /* tmp[i] < 2^127 + 2^63 */
1104 felem_reduce(z_out, tmp);
1106 /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1107 felem_scalar64(beta, 4);
1108 /* beta[i] < 2^61 + 2^16 */
1109 felem_diff64(beta, x_out);
1110 /* beta[i] < 2^61 + 2^60 + 2^16 */
1111 felem_mul(tmp, alpha, beta);
1113 * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16)) = 17*(2^120 + 2^75
1114 * + 2^119 + 2^74 + 2^75 + 2^30) = 17*(2^120 + 2^119 + 2^76 + 2^74 +
1115 * 2^30) < 2^128
1117 felem_square(tmp2, gamma);
1119 * tmp2[i] < 17*(2^59 + 2^14)^2 = 17*(2^118 + 2^74 + 2^28)
1121 felem_scalar128(tmp2, 8);
1123 * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28) = 2^125 + 2^121 + 2^81 + 2^77
1124 * + 2^35 + 2^31 < 2^126
1126 felem_diff128(tmp, tmp2);
 1128 * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
 1129 * < 2^127 + 2^126
 1130 * < 2^128
1132 felem_reduce(y_out, tmp);
1135 /* copy_conditional copies in to out iff mask is all ones. */
1136 static void
1137 copy_conditional(felem out, const felem in, limb mask)
1139 unsigned i;
1140 for (i = 0; i < NLIMBS; ++i) {
1141 const limb tmp = mask & (in[i] ^ out[i]);
1142 out[i] ^= tmp;
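/* The mask must be 0 or all ones, e.g. the value returned by
 * felem_is_zero() or -(limb)bit for a single bit; the XOR-and-mask form
 * selects between the two felems without a data-dependent branch. */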
 1146 /* point_add calculates (x1, y1, z1) + (x2, y2, z2)
1148 * The method is taken from
1149 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1150 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1152 * This function includes a branch for checking whether the two input points
1153 * are equal (while not equal to the point at infinity). This case never
1154 * happens during single point multiplication, so there is no timing leak for
1155 * ECDH or ECDSA signing. */
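/* In the notation of that page the code computes
 * Z1Z1 = Z1^2, Z2Z2 = Z2^2, U1 = X1*Z2Z2, U2 = X2*Z1Z1,
 * S1 = Y1*Z2^3, S2 = Y2*Z1^3, H = U2 - U1, r = 2*(S2 - S1),
 * I = (2*H)^2, J = H*I, V = U1*I,
 * X3 = r^2 - J - 2*V, Y3 = r*(V - X3) - 2*S1*J, Z3 = 2*Z1*Z2*H,
 * where the mixed case (z2 == 1) skips the Z2Z2 and Z2^3 work. */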
1156 static void
1157 point_add(felem x3, felem y3, felem z3,
1158 const felem x1, const felem y1, const felem z1,
1159 const int mixed, const felem x2, const felem y2, const felem z2)
1161 felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1162 largefelem tmp, tmp2;
1163 limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1165 z1_is_zero = felem_is_zero(z1);
1166 z2_is_zero = felem_is_zero(z2);
1168 /* ftmp = z1z1 = z1**2 */
1169 felem_square(tmp, z1);
1170 felem_reduce(ftmp, tmp);
1172 if (!mixed) {
1173 /* ftmp2 = z2z2 = z2**2 */
1174 felem_square(tmp, z2);
1175 felem_reduce(ftmp2, tmp);
1177 /* u1 = ftmp3 = x1*z2z2 */
1178 felem_mul(tmp, x1, ftmp2);
1179 felem_reduce(ftmp3, tmp);
1181 /* ftmp5 = z1 + z2 */
1182 felem_assign(ftmp5, z1);
1183 felem_sum64(ftmp5, z2);
1184 /* ftmp5[i] < 2^61 */
1186 /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1187 felem_square(tmp, ftmp5);
1188 /* tmp[i] < 17*2^122 */
1189 felem_diff_128_64(tmp, ftmp);
1190 /* tmp[i] < 17*2^122 + 2^63 */
1191 felem_diff_128_64(tmp, ftmp2);
1192 /* tmp[i] < 17*2^122 + 2^64 */
1193 felem_reduce(ftmp5, tmp);
1195 /* ftmp2 = z2 * z2z2 */
1196 felem_mul(tmp, ftmp2, z2);
1197 felem_reduce(ftmp2, tmp);
1199 /* s1 = ftmp6 = y1 * z2**3 */
1200 felem_mul(tmp, y1, ftmp2);
1201 felem_reduce(ftmp6, tmp);
1202 } else {
1203 /* We'll assume z2 = 1 (special case z2 = 0 is handled later) */
1205 /* u1 = ftmp3 = x1*z2z2 */
1206 felem_assign(ftmp3, x1);
1208 /* ftmp5 = 2*z1z2 */
1209 felem_scalar(ftmp5, z1, 2);
1211 /* s1 = ftmp6 = y1 * z2**3 */
1212 felem_assign(ftmp6, y1);
1215 /* u2 = x2*z1z1 */
1216 felem_mul(tmp, x2, ftmp);
1217 /* tmp[i] < 17*2^120 */
1219 /* h = ftmp4 = u2 - u1 */
1220 felem_diff_128_64(tmp, ftmp3);
1221 /* tmp[i] < 17*2^120 + 2^63 */
1222 felem_reduce(ftmp4, tmp);
1224 x_equal = felem_is_zero(ftmp4);
1226 /* z_out = ftmp5 * h */
1227 felem_mul(tmp, ftmp5, ftmp4);
1228 felem_reduce(z_out, tmp);
1230 /* ftmp = z1 * z1z1 */
1231 felem_mul(tmp, ftmp, z1);
1232 felem_reduce(ftmp, tmp);
1234 /* s2 = tmp = y2 * z1**3 */
1235 felem_mul(tmp, y2, ftmp);
1236 /* tmp[i] < 17*2^120 */
1238 /* r = ftmp5 = (s2 - s1)*2 */
1239 felem_diff_128_64(tmp, ftmp6);
1240 /* tmp[i] < 17*2^120 + 2^63 */
1241 felem_reduce(ftmp5, tmp);
1242 y_equal = felem_is_zero(ftmp5);
1243 felem_scalar64(ftmp5, 2);
1244 /* ftmp5[i] < 2^61 */
1246 if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1247 point_double(x3, y3, z3, x1, y1, z1);
1248 return;
1250 /* I = ftmp = (2h)**2 */
1251 felem_assign(ftmp, ftmp4);
1252 felem_scalar64(ftmp, 2);
1253 /* ftmp[i] < 2^61 */
1254 felem_square(tmp, ftmp);
1255 /* tmp[i] < 17*2^122 */
1256 felem_reduce(ftmp, tmp);
1258 /* J = ftmp2 = h * I */
1259 felem_mul(tmp, ftmp4, ftmp);
1260 felem_reduce(ftmp2, tmp);
1262 /* V = ftmp4 = U1 * I */
1263 felem_mul(tmp, ftmp3, ftmp);
1264 felem_reduce(ftmp4, tmp);
1266 /* x_out = r**2 - J - 2V */
1267 felem_square(tmp, ftmp5);
1268 /* tmp[i] < 17*2^122 */
1269 felem_diff_128_64(tmp, ftmp2);
1270 /* tmp[i] < 17*2^122 + 2^63 */
1271 felem_assign(ftmp3, ftmp4);
1272 felem_scalar64(ftmp4, 2);
1273 /* ftmp4[i] < 2^61 */
1274 felem_diff_128_64(tmp, ftmp4);
1275 /* tmp[i] < 17*2^122 + 2^64 */
1276 felem_reduce(x_out, tmp);
1278 /* y_out = r(V-x_out) - 2 * s1 * J */
1279 felem_diff64(ftmp3, x_out);
1281 * ftmp3[i] < 2^60 + 2^60 = 2^61
1283 felem_mul(tmp, ftmp5, ftmp3);
1284 /* tmp[i] < 17*2^122 */
1285 felem_mul(tmp2, ftmp6, ftmp2);
1286 /* tmp2[i] < 17*2^120 */
1287 felem_scalar128(tmp2, 2);
1288 /* tmp2[i] < 17*2^121 */
1289 felem_diff128(tmp, tmp2);
 1291 * tmp[i] < 2^127 - 2^69 + 17*2^122
 1292 * < 2^127 + 2^126 + 2^122 < 2^128
1294 felem_reduce(y_out, tmp);
1296 copy_conditional(x_out, x2, z1_is_zero);
1297 copy_conditional(x_out, x1, z2_is_zero);
1298 copy_conditional(y_out, y2, z1_is_zero);
1299 copy_conditional(y_out, y1, z2_is_zero);
1300 copy_conditional(z_out, z2, z1_is_zero);
1301 copy_conditional(z_out, z1, z2_is_zero);
1302 felem_assign(x3, x_out);
1303 felem_assign(y3, y_out);
1304 felem_assign(z3, z_out);
 1307 /* Base point precomputation
1308 * --------------------------
1310 * Two different sorts of precomputed tables are used in the following code.
 1311 * Each contains various points on the curve, where each point is three field
1312 * elements (x, y, z).
1314 * For the base point table, z is usually 1 (0 for the point at infinity).
1315 * This table has 16 elements:
1316 * index | bits | point
1317 * ------+---------+------------------------------
1318 * 0 | 0 0 0 0 | 0G
1319 * 1 | 0 0 0 1 | 1G
1320 * 2 | 0 0 1 0 | 2^130G
1321 * 3 | 0 0 1 1 | (2^130 + 1)G
1322 * 4 | 0 1 0 0 | 2^260G
1323 * 5 | 0 1 0 1 | (2^260 + 1)G
1324 * 6 | 0 1 1 0 | (2^260 + 2^130)G
1325 * 7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1326 * 8 | 1 0 0 0 | 2^390G
1327 * 9 | 1 0 0 1 | (2^390 + 1)G
1328 * 10 | 1 0 1 0 | (2^390 + 2^130)G
1329 * 11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1330 * 12 | 1 1 0 0 | (2^390 + 2^260)G
1331 * 13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1332 * 14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1333 * 15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1335 * The reason for this is so that we can clock bits into four different
1336 * locations when doing simple scalar multiplies against the base point.
1338 * Tables for other points have table[i] = iG for i in 0 .. 16. */
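/* For example, with this table a base point scalar is handled in 131
 * rounds: in round i of batch_mul below the scalar bits at positions
 * i + 390, i + 260, i + 130 and i (those that exist) are concatenated
 * into an index into this table, so one constant-time lookup and one
 * point addition per round process all four 130-bit "columns" of the
 * scalar at once. */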
1340 /* gmul is the table of precomputed base points */
1341 static const felem gmul[16][3] =
1342 {{{0, 0, 0, 0, 0, 0, 0, 0, 0},
1343 {0, 0, 0, 0, 0, 0, 0, 0, 0},
1344 {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1345 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1346 0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1347 0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1348 {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1349 0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1350 0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1351 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1352 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1353 0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1354 0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1355 {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1356 0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1357 0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1358 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1359 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1360 0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1361 0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1362 {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1363 0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1364 0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1365 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1366 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1367 0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1368 0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1369 {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1370 0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1371 0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1372 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1373 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1374 0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1375 0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1376 {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1377 0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1378 0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1379 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1380 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1381 0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1382 0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1383 {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1384 0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1385 0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1386 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1387 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1388 0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1389 0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1390 {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1391 0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1392 0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1393 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1394 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1395 0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1396 0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1397 {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1398 0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1399 0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1400 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1401 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1402 0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1403 0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1404 {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1405 0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1406 0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1407 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1408 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1409 0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1410 0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1411 {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1412 0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1413 0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1414 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1415 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1416 0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1417 0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1418 {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1419 0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1420 0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1421 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1422 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1423 0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1424 0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1425 {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1426 0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1427 0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1428 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1429 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1430 0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1431 0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1432 {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1433 0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1434 0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1435 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1436 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1437 0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1438 0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1439 {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1440 0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1441 0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1442 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1443 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1444 0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1445 0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1446 {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1447 0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1448 0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1449 {1, 0, 0, 0, 0, 0, 0, 0, 0}}};
1451 /* select_point selects the |idx|th point from a precomputation table and
1452 * copies it to out. */
1453 static void
1454 select_point(const limb idx, unsigned int size, const felem pre_comp[ /* size */ ][3],
1455 felem out[3])
1457 unsigned i, j;
1458 limb *outlimbs = &out[0][0];
1459 memset(outlimbs, 0, 3 * sizeof(felem));
1461 for (i = 0; i < size; i++) {
1462 const limb *inlimbs = &pre_comp[i][0][0];
1463 limb mask = i ^ idx;
1464 mask |= mask >> 4;
1465 mask |= mask >> 2;
1466 mask |= mask >> 1;
1467 mask &= 1;
1468 mask--;
1469 for (j = 0; j < NLIMBS * 3; j++)
1470 outlimbs[j] |= inlimbs[j] & mask;
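/* The mask is built without branches: i ^ idx is zero only for the wanted
 * entry; OR-ing its bits down into bit zero and then subtracting one
 * yields all ones when i == idx and zero otherwise (e.g. for idx = 3,
 * i = 3 gives 0 - 1 = ~0 while i = 5 gives 1 - 1 = 0), so exactly one
 * table entry is OR-ed into |out|. */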
1474 /* get_bit returns the |i|th bit in |in| */
1475 static char
1476 get_bit(const felem_bytearray in, int i)
1478 if (i < 0)
1479 return 0;
1480 return (in[i >> 3] >> (i & 7)) & 1;
1483 /* Interleaved point multiplication using precomputed point multiples:
1484 * The small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[],
1485 * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
1486 * of the generator, using certain (large) precomputed multiples in g_pre_comp.
1487 * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
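/* The generator multiple uses the comb-style table described above (one
 * lookup per round during the last 131 rounds), while each further point
 * uses a signed window: every fifth round six consecutive scalar bits
 * are recoded by ec_GFp_nistp_recode_scalar_bits into a sign and a digit
 * selecting one of the 17 small multiples in constant time, with the
 * sign applied by conditionally negating the y coordinate. */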
1488 static void
1489 batch_mul(felem x_out, felem y_out, felem z_out,
1490 const felem_bytearray scalars[], const unsigned num_points, const u8 * g_scalar,
1491 const int mixed, const felem pre_comp[][17][3], const felem g_pre_comp[16][3])
1493 int i, skip;
1494 unsigned num, gen_mul = (g_scalar != NULL);
1495 felem nq[3], tmp[4];
1496 limb bits;
1497 u8 sign, digit;
1499 /* set nq to the point at infinity */
1500 memset(nq, 0, 3 * sizeof(felem));
1503 * Loop over all scalars msb-to-lsb, interleaving additions of
1504 * multiples of the generator (last quarter of rounds) and additions
 505 * of other points' multiples (every 5th round).
1507 skip = 1; /* save two point operations in the first
1508 * round */
1509 for (i = (num_points ? 520 : 130); i >= 0; --i) {
1510 /* double */
1511 if (!skip)
1512 point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1514 /* add multiples of the generator */
1515 if (gen_mul && (i <= 130)) {
1516 bits = get_bit(g_scalar, i + 390) << 3;
1517 if (i < 130) {
1518 bits |= get_bit(g_scalar, i + 260) << 2;
1519 bits |= get_bit(g_scalar, i + 130) << 1;
1520 bits |= get_bit(g_scalar, i);
1522 /* select the point to add, in constant time */
1523 select_point(bits, 16, g_pre_comp, tmp);
1524 if (!skip) {
1525 point_add(nq[0], nq[1], nq[2],
1526 nq[0], nq[1], nq[2],
1527 1 /* mixed */ , tmp[0], tmp[1], tmp[2]);
1528 } else {
1529 memcpy(nq, tmp, 3 * sizeof(felem));
1530 skip = 0;
1533 /* do other additions every 5 doublings */
1534 if (num_points && (i % 5 == 0)) {
1535 /* loop over all scalars */
1536 for (num = 0; num < num_points; ++num) {
1537 bits = get_bit(scalars[num], i + 4) << 5;
1538 bits |= get_bit(scalars[num], i + 3) << 4;
1539 bits |= get_bit(scalars[num], i + 2) << 3;
1540 bits |= get_bit(scalars[num], i + 1) << 2;
1541 bits |= get_bit(scalars[num], i) << 1;
1542 bits |= get_bit(scalars[num], i - 1);
1543 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1546 * select the point to add or subtract, in
1547 * constant time
1549 select_point(digit, 17, pre_comp[num], tmp);
1550 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the
1551 * negative point */
1552 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1554 if (!skip) {
1555 point_add(nq[0], nq[1], nq[2],
1556 nq[0], nq[1], nq[2],
1557 mixed, tmp[0], tmp[1], tmp[2]);
1558 } else {
1559 memcpy(nq, tmp, 3 * sizeof(felem));
1560 skip = 0;
1565 felem_assign(x_out, nq[0]);
1566 felem_assign(y_out, nq[1]);
1567 felem_assign(z_out, nq[2]);
1571 /* Precomputation for the group generator. */
1572 typedef struct {
1573 felem g_pre_comp[16][3];
1574 int references;
1575 } NISTP521_PRE_COMP;
1577 const EC_METHOD *
1578 EC_GFp_nistp521_method(void)
1580 static const EC_METHOD ret = {
1581 .flags = EC_FLAGS_DEFAULT_OCT,
1582 .field_type = NID_X9_62_prime_field,
1583 .group_init = ec_GFp_nistp521_group_init,
1584 .group_finish = ec_GFp_simple_group_finish,
1585 .group_clear_finish = ec_GFp_simple_group_clear_finish,
1586 .group_copy = ec_GFp_nist_group_copy,
1587 .group_set_curve = ec_GFp_nistp521_group_set_curve,
1588 .group_get_curve = ec_GFp_simple_group_get_curve,
1589 .group_get_degree = ec_GFp_simple_group_get_degree,
1590 .group_check_discriminant =
1591 ec_GFp_simple_group_check_discriminant,
1592 .point_init = ec_GFp_simple_point_init,
1593 .point_finish = ec_GFp_simple_point_finish,
1594 .point_clear_finish = ec_GFp_simple_point_clear_finish,
1595 .point_copy = ec_GFp_simple_point_copy,
1596 .point_set_to_infinity = ec_GFp_simple_point_set_to_infinity,
1597 .point_set_Jprojective_coordinates_GFp =
1598 ec_GFp_simple_set_Jprojective_coordinates_GFp,
1599 .point_get_Jprojective_coordinates_GFp =
1600 ec_GFp_simple_get_Jprojective_coordinates_GFp,
1601 .point_set_affine_coordinates =
1602 ec_GFp_simple_point_set_affine_coordinates,
1603 .point_get_affine_coordinates =
1604 ec_GFp_nistp521_point_get_affine_coordinates,
1605 .add = ec_GFp_simple_add,
1606 .dbl = ec_GFp_simple_dbl,
1607 .invert = ec_GFp_simple_invert,
1608 .is_at_infinity = ec_GFp_simple_is_at_infinity,
1609 .is_on_curve = ec_GFp_simple_is_on_curve,
1610 .point_cmp = ec_GFp_simple_cmp,
1611 .make_affine = ec_GFp_simple_make_affine,
1612 .points_make_affine = ec_GFp_simple_points_make_affine,
1613 .mul = ec_GFp_nistp521_points_mul,
1614 .precompute_mult = ec_GFp_nistp521_precompute_mult,
1615 .have_precompute_mult = ec_GFp_nistp521_have_precompute_mult,
1616 .field_mul = ec_GFp_nist_field_mul,
1617 .field_sqr = ec_GFp_nist_field_sqr
1620 return &ret;
1624 /******************************************************************************/
1625 /* FUNCTIONS TO MANAGE PRECOMPUTATION
1628 static NISTP521_PRE_COMP *
1629 nistp521_pre_comp_new()
1631 NISTP521_PRE_COMP *ret = NULL;
1632 ret = malloc(sizeof(NISTP521_PRE_COMP));
1633 if (!ret) {
1634 ECerror(ERR_R_MALLOC_FAILURE);
1635 return ret;
1637 memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1638 ret->references = 1;
1639 return ret;
1642 static void *
1643 nistp521_pre_comp_dup(void *src_)
1645 NISTP521_PRE_COMP *src = src_;
1647 /* no need to actually copy, these objects never change! */
1648 CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1650 return src_;
1653 static void
1654 nistp521_pre_comp_free(void *pre_)
1656 int i;
1657 NISTP521_PRE_COMP *pre = pre_;
1659 if (!pre)
1660 return;
1662 i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1663 if (i > 0)
1664 return;
1666 free(pre);
1669 static void
1670 nistp521_pre_comp_clear_free(void *pre_)
1672 int i;
1673 NISTP521_PRE_COMP *pre = pre_;
1675 if (!pre)
1676 return;
1678 i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1679 if (i > 0)
1680 return;
1682 explicit_bzero(pre, sizeof(*pre));
1683 free(pre);
1686 /******************************************************************************/
1687 /* OPENSSL EC_METHOD FUNCTIONS
1690 int
1691 ec_GFp_nistp521_group_init(EC_GROUP * group)
1693 int ret;
1694 ret = ec_GFp_simple_group_init(group);
1695 group->a_is_minus3 = 1;
1696 return ret;
1699 int
1700 ec_GFp_nistp521_group_set_curve(EC_GROUP * group, const BIGNUM * p,
1701 const BIGNUM * a, const BIGNUM * b, BN_CTX * ctx)
1703 int ret = 0;
1704 BN_CTX *new_ctx = NULL;
1705 BIGNUM *curve_p, *curve_a, *curve_b;
1707 if (ctx == NULL)
1708 if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1709 return 0;
1710 BN_CTX_start(ctx);
1711 if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1712 ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1713 ((curve_b = BN_CTX_get(ctx)) == NULL))
1714 goto err;
1715 BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1716 BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1717 BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1718 if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) ||
1719 (BN_cmp(curve_b, b))) {
1720 ECerror(EC_R_WRONG_CURVE_PARAMETERS);
1721 goto err;
1723 group->field_mod_func = BN_nist_mod_521;
1724 ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1725 err:
1726 BN_CTX_end(ctx);
1727 BN_CTX_free(new_ctx);
1728 return ret;
1731 /* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
1732 * (X', Y') = (X/Z^2, Y/Z^3) */
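/* Only a single field inversion is needed: z2 = Z^-1 is computed once,
 * squared to get Z^-2 for X' and multiplied by its square to get Z^-3
 * for Y'. */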
1733 int
1734 ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP * group,
1735 const EC_POINT * point, BIGNUM * x, BIGNUM * y, BN_CTX * ctx)
1737 felem z1, z2, x_in, y_in, x_out, y_out;
1738 largefelem tmp;
1740 if (EC_POINT_is_at_infinity(group, point) > 0) {
1741 ECerror(EC_R_POINT_AT_INFINITY);
1742 return 0;
1744 if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1745 (!BN_to_felem(z1, &point->Z)))
1746 return 0;
1747 felem_inv(z2, z1);
1748 felem_square(tmp, z2);
1749 felem_reduce(z1, tmp);
1750 felem_mul(tmp, x_in, z1);
1751 felem_reduce(x_in, tmp);
1752 felem_contract(x_out, x_in);
1753 if (x != NULL) {
1754 if (!felem_to_BN(x, x_out)) {
1755 ECerror(ERR_R_BN_LIB);
1756 return 0;
1759 felem_mul(tmp, z1, z2);
1760 felem_reduce(z1, tmp);
1761 felem_mul(tmp, y_in, z1);
1762 felem_reduce(y_in, tmp);
1763 felem_contract(y_out, y_in);
1764 if (y != NULL) {
1765 if (!felem_to_BN(y, y_out)) {
1766 ECerror(ERR_R_BN_LIB);
1767 return 0;
1770 return 1;
1773 static void
1774 make_points_affine(size_t num, felem points[ /* num */ ][3], felem tmp_felems[ /* num+1 */ ])
1777 * Runs in constant time, unless an input is the point at infinity
1778 * (which normally shouldn't happen).
1780 ec_GFp_nistp_points_make_affine_internal(
1781 num,
1782 points,
1783 sizeof(felem),
1784 tmp_felems,
1785 (void (*) (void *)) felem_one,
1786 (int (*) (const void *)) felem_is_zero_int,
1787 (void (*) (void *, const void *)) felem_assign,
1788 (void (*) (void *, const void *)) felem_square_reduce,
1789 (void (*) (void *, const void *, const void *)) felem_mul_reduce,
1790 (void (*) (void *, const void *)) felem_inv,
1791 (void (*) (void *, const void *)) felem_contract);
/* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values
 * Result is stored in r (r can equal one of the inputs). */
int
ec_GFp_nistp521_points_mul(const EC_GROUP * group, EC_POINT * r,
    const BIGNUM * scalar, size_t num, const EC_POINT * points[],
    const BIGNUM * scalars[], BN_CTX * ctx)
{
    int ret = 0;
    int j;
    int mixed = 0;
    BN_CTX *new_ctx = NULL;
    BIGNUM *x, *y, *z, *tmp_scalar;
    felem_bytearray g_secret;
    felem_bytearray *secrets = NULL;
    felem(*pre_comp)[17][3] = NULL;
    felem *tmp_felems = NULL;
    felem_bytearray tmp;
    unsigned i, num_bytes;
    int have_pre_comp = 0;
    size_t num_points = num;
    felem x_in, y_in, z_in, x_out, y_out, z_out;
    NISTP521_PRE_COMP *pre = NULL;
    felem(*g_pre_comp)[3] = NULL;
    EC_POINT *generator = NULL;
    const EC_POINT *p = NULL;
    const BIGNUM *p_scalar = NULL;

    if (ctx == NULL)
        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
            return 0;
    BN_CTX_start(ctx);
    if (((x = BN_CTX_get(ctx)) == NULL) ||
        ((y = BN_CTX_get(ctx)) == NULL) ||
        ((z = BN_CTX_get(ctx)) == NULL) ||
        ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
        goto err;

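    /*
     * If a generator scalar was supplied, look for usable generator
     * precomputation: either a per-group table stored by
     * ec_GFp_nistp521_precompute_mult(), or the static gmul table.  The
     * table is only trusted if its G entry matches group->generator;
     * otherwise the generator is treated like any other input point.
     */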
    if (scalar != NULL) {
        pre = EC_EX_DATA_get_data(group->extra_data,
            nistp521_pre_comp_dup, nistp521_pre_comp_free,
            nistp521_pre_comp_clear_free);
        if (pre)
            /* we have precomputation, try to use it */
            g_pre_comp = &pre->g_pre_comp[0];
        else
            /* try to use the standard precomputation */
            g_pre_comp = (felem(*)[3]) gmul;
        generator = EC_POINT_new(group);
        if (generator == NULL)
            goto err;
        /* get the generator from precomputation */
        if (!felem_to_BN(x, g_pre_comp[1][0]) ||
            !felem_to_BN(y, g_pre_comp[1][1]) ||
            !felem_to_BN(z, g_pre_comp[1][2])) {
            ECerror(ERR_R_BN_LIB);
            goto err;
        }
        if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
            generator, x, y, z, ctx))
            goto err;
        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
            /* precomputation matches generator */
            have_pre_comp = 1;
        else
            /*
             * we don't have valid precomputation: treat the
             * generator as a random point
             */
            num_points++;
    }
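    /*
     * For every input point P, build a window table pre_comp[i][j] = j*P
     * for j = 0..16 (entry 0 stays the point at infinity from calloc).
     * With two or more points the tables are converted to affine so that
     * batch_mul() can use cheaper mixed additions.
     */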
    if (num_points > 0) {
        if (num_points >= 2) {
            /*
             * unless we precompute multiples for just one point,
             * converting those into affine form is time well
             * spent
             */
            mixed = 1;
        }
        secrets = calloc(num_points, sizeof(felem_bytearray));
        pre_comp = calloc(num_points, 17 * 3 * sizeof(felem));
        if (mixed) {
            /* XXX should do more int overflow checking */
            tmp_felems = reallocarray(NULL,
                (num_points * 17 + 1), sizeof(felem));
        }
        if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_felems == NULL))) {
            ECerror(ERR_R_MALLOC_FAILURE);
            goto err;
        }
        /*
         * we treat NULL scalars as 0, and NULL points as points at
         * infinity, i.e., they contribute nothing to the linear
         * combination
         */
        for (i = 0; i < num_points; ++i) {
            if (i == num)
                /*
                 * we didn't have a valid precomputation, so
                 * we pick the generator
                 */
            {
                p = EC_GROUP_get0_generator(group);
                p_scalar = scalar;
            } else
                /* the i^th point */
            {
                p = points[i];
                p_scalar = scalars[i];
            }
            if ((p_scalar != NULL) && (p != NULL)) {
                /* reduce scalar to 0 <= scalar < 2^521 */
                if ((BN_num_bits(p_scalar) > 521) || (BN_is_negative(p_scalar))) {
                    /*
                     * this is an unusual input, and we
                     * don't guarantee constant-timeness
                     */
                    if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
                        ECerror(ERR_R_BN_LIB);
                        goto err;
                    }
                    num_bytes = BN_bn2bin(tmp_scalar, tmp);
                } else
                    num_bytes = BN_bn2bin(p_scalar, tmp);
                flip_endian(secrets[i], tmp, num_bytes);
                /* precompute multiples */
                if ((!BN_to_felem(x_out, &p->X)) ||
                    (!BN_to_felem(y_out, &p->Y)) ||
                    (!BN_to_felem(z_out, &p->Z)))
                    goto err;
                memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
                memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
                memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
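                /*
                 * Fill in j*P for j = 2..16: even entries by doubling
                 * (j/2)*P, odd entries by adding P to (j-1)*P.
                 */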
                for (j = 2; j <= 16; ++j) {
                    if (j & 1) {
                        point_add(
                            pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
                            pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2],
                            0, pre_comp[i][j - 1][0], pre_comp[i][j - 1][1], pre_comp[i][j - 1][2]);
                    } else {
                        point_double(
                            pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
                            pre_comp[i][j / 2][0], pre_comp[i][j / 2][1], pre_comp[i][j / 2][2]);
                    }
                }
            }
        }
        if (mixed)
            make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
    }
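    /*
     * All tables are in place: batch_mul() performs the interleaved
     * multi-scalar multiplication.  The generator scalar takes the comb
     * table path only when have_pre_comp is set; the Jacobian result is
     * then contracted and converted back to BIGNUMs.
     */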
    /* the scalar for the generator */
    if ((scalar != NULL) && (have_pre_comp)) {
        memset(g_secret, 0, sizeof(g_secret));
        /* reduce scalar to 0 <= scalar < 2^521 */
        if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
            /*
             * this is an unusual input, and we don't guarantee
             * constant-timeness
             */
            if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
                ECerror(ERR_R_BN_LIB);
                goto err;
            }
            num_bytes = BN_bn2bin(tmp_scalar, tmp);
        } else
            num_bytes = BN_bn2bin(scalar, tmp);
        flip_endian(g_secret, tmp, num_bytes);
        /* do the multiplication with generator precomputation */
        batch_mul(x_out, y_out, z_out,
            (const felem_bytearray(*)) secrets, num_points,
            g_secret,
            mixed, (const felem(*)[17][3]) pre_comp,
            (const felem(*)[3]) g_pre_comp);
    } else
        /* do the multiplication without generator precomputation */
        batch_mul(x_out, y_out, z_out,
            (const felem_bytearray(*)) secrets, num_points,
            NULL, mixed, (const felem(*)[17][3]) pre_comp, NULL);
    /* reduce the output to its unique minimal representation */
    felem_contract(x_in, x_out);
    felem_contract(y_in, y_out);
    felem_contract(z_in, z_out);
    if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
        (!felem_to_BN(z, z_in))) {
        ECerror(ERR_R_BN_LIB);
        goto err;
    }
    ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);

err:
    BN_CTX_end(ctx);
    EC_POINT_free(generator);
    BN_CTX_free(new_ctx);
    free(secrets);
    free(pre_comp);
    free(tmp_felems);
    return ret;
}
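/*
 * Build and store generator precomputation for the group: a 16-entry comb
 * table containing every sum of the four points G, 2^130*G, 2^260*G and
 * 2^390*G, in the format batch_mul() expects.
 */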
int
ec_GFp_nistp521_precompute_mult(EC_GROUP * group, BN_CTX * ctx)
{
    int ret = 0;
    NISTP521_PRE_COMP *pre = NULL;
    int i, j;
    BN_CTX *new_ctx = NULL;
    BIGNUM *x, *y;
    EC_POINT *generator = NULL;
    felem tmp_felems[16];

    /* throw away old precomputation */
    EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
        nistp521_pre_comp_free, nistp521_pre_comp_clear_free);
    if (ctx == NULL)
        if ((ctx = new_ctx = BN_CTX_new()) == NULL)
            return 0;
    BN_CTX_start(ctx);
    if (((x = BN_CTX_get(ctx)) == NULL) ||
        ((y = BN_CTX_get(ctx)) == NULL))
        goto err;
    /* get the generator */
    if (group->generator == NULL)
        goto err;
    generator = EC_POINT_new(group);
    if (generator == NULL)
        goto err;
    BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
    BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
    if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
        goto err;
    if ((pre = nistp521_pre_comp_new()) == NULL)
        goto err;
    /* if the generator is the standard one, use built-in precomputation */
    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
        ret = 1;
        goto err;
    }
    if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
        (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
        (!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
        goto err;
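    /*
     * The 130-bit spacing matches how batch_mul() walks the generator
     * scalar: 521 = 4*130 + 1, so bits i, i+130, i+260 and i+390 are
     * looked up together in the comb table.
     */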
    /* compute 2^130*G, 2^260*G, 2^390*G */
    for (i = 1; i <= 4; i <<= 1) {
        point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
            pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
            pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
        for (j = 0; j < 129; ++j) {
            point_double(pre->g_pre_comp[2 * i][0],
                pre->g_pre_comp[2 * i][1],
                pre->g_pre_comp[2 * i][2],
                pre->g_pre_comp[2 * i][0],
                pre->g_pre_comp[2 * i][1],
                pre->g_pre_comp[2 * i][2]);
        }
    }
    /* g_pre_comp[0] is the point at infinity */
    memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
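    /*
     * Table layout: g_pre_comp[i] = b0*G + b1*2^130*G + b2*2^260*G +
     * b3*2^390*G, where b3 b2 b1 b0 is the binary expansion of i.
     */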
    /* the remaining multiples */
    /* 2^130*G + 2^260*G */
    point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
        pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
        pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
        0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
        pre->g_pre_comp[2][2]);
    /* 2^130*G + 2^390*G */
    point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
        pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
        pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
        0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
        pre->g_pre_comp[2][2]);
    /* 2^260*G + 2^390*G */
    point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
        pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
        pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
        0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
        pre->g_pre_comp[4][2]);
    /* 2^130*G + 2^260*G + 2^390*G */
    point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
        pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
        pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
        0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
        pre->g_pre_comp[2][2]);
    for (i = 1; i < 8; ++i) {
        /* odd multiples: add G */
        point_add(pre->g_pre_comp[2 * i + 1][0], pre->g_pre_comp[2 * i + 1][1],
            pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
            pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2],
            0, pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
            pre->g_pre_comp[1][2]);
    }
    make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);

    if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
        nistp521_pre_comp_free, nistp521_pre_comp_clear_free))
        goto err;
    ret = 1;
    pre = NULL;
err:
    BN_CTX_end(ctx);
    EC_POINT_free(generator);
    BN_CTX_free(new_ctx);
    nistp521_pre_comp_free(pre);
    return ret;
}
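/*
 * Reports whether generator precomputation has been stored for this group,
 * i.e. whether ec_GFp_nistp521_precompute_mult() has been run on it.
 */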
int
ec_GFp_nistp521_have_precompute_mult(const EC_GROUP * group)
{
    if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
        nistp521_pre_comp_free, nistp521_pre_comp_clear_free)
        != NULL)
        return 1;
    else
        return 0;
}

#endif