src/ext/curve25519_donna/curve25519-donna.c

   1 /* Copyright 2008, Google Inc.
   2  * All rights reserved.
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions are
   6  * met:
   7  *
   8  *     * Redistributions of source code must retain the above copyright
   9  * notice, this list of conditions and the following disclaimer.
  10  *     * Redistributions in binary form must reproduce the above
  11  * copyright notice, this list of conditions and the following disclaimer
  12  * in the documentation and/or other materials provided with the
  13  * distribution.
  14  *     * Neither the name of Google Inc. nor the names of its
  15  * contributors may be used to endorse or promote products derived from
  16  * this software without specific prior written permission.
  17  *
  18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29  *
  30  * curve25519-donna: Curve25519 elliptic curve, public key function
  31  *
  32  * http://code.google.com/p/curve25519-donna/
  33  *
  34  * Adam Langley <agl@imperialviolet.org>
  35  *
  36  * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
  37  *
  38  * More information about curve25519 can be found here
  39  *   http://cr.yp.to/ecdh.html
  40  *
  41  * djb's sample implementation of curve25519 is written in a special assembly
  42  * language called qhasm and uses the floating point registers.
  43  *
  44  * This is, almost, a clean room reimplementation from the curve25519 paper. It
  45  * uses many of the tricks described therein. Only the crecip function is taken
  46  * from the sample implementation.
  47  */
  48
  49 #include "orconfig.h"
  50
  51 #include <string.h>
  52 #include "torint.h"
  53
  54 typedef uint8_t u8;
  55 typedef int32_t s32;
  56 typedef int64_t limb;
  57
  58 /* Field element representation:
  59  *
  60  * Field elements are written as an array of signed, 64-bit limbs, least
  61  * significant first. The value of the field element is:
  62  *   x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ...
  63  *
  64  * i.e. the limbs are 26, 25, 26, 25, ... bits wide.
  65  */
  66
  67 /* Sum two numbers: output += in */
  68 static void fsum(limb *output, const limb *in) {
  69   unsigned i;
  70   for (i = 0; i < 10; i += 2) {
  71     output[0+i] = (output[0+i] + in[0+i]);
  72     output[1+i] = (output[1+i] + in[1+i]);
  73   }
  74 }
  75
  76 /* Find the difference of two numbers: output = in - output
  77  * (note the order of the arguments!)
  78  */
  79 static void fdifference(limb *output, const limb *in) {
  80   unsigned i;
  81   for (i = 0; i < 10; ++i) {
  82     output[i] = (in[i] - output[i]);
  83   }
  84 }
  85
  86 /* Multiply a number by a scalar: output = in * scalar */
  87 static void fscalar_product(limb *output, const limb *in, const limb scalar) {
  88   unsigned i;
  89   for (i = 0; i < 10; ++i) {
  90     output[i] = in[i] * scalar;
  91   }
  92 }
  93
  94 /* Multiply two numbers: output = in2 * in
  95  *
  96  * output must be distinct to both inputs. The inputs are reduced coefficient
  97  * form, the output is not.
  98  */
  99 static void fproduct(limb *output, const limb *in2, const limb *in) {
 100   output[0] =       ((limb) ((s32) in2[0])) * ((s32) in[0]);
 101   output[1] =       ((limb) ((s32) in2[0])) * ((s32) in[1]) +
 102                     ((limb) ((s32) in2[1])) * ((s32) in[0]);
 103   output[2] =  2 *  ((limb) ((s32) in2[1])) * ((s32) in[1]) +
 104                     ((limb) ((s32) in2[0])) * ((s32) in[2]) +
 105                     ((limb) ((s32) in2[2])) * ((s32) in[0]);
 106   output[3] =       ((limb) ((s32) in2[1])) * ((s32) in[2]) +
 107                     ((limb) ((s32) in2[2])) * ((s32) in[1]) +
 108                     ((limb) ((s32) in2[0])) * ((s32) in[3]) +
 109                     ((limb) ((s32) in2[3])) * ((s32) in[0]);
 110   output[4] =       ((limb) ((s32) in2[2])) * ((s32) in[2]) +
 111                2 * (((limb) ((s32) in2[1])) * ((s32) in[3]) +
 112                     ((limb) ((s32) in2[3])) * ((s32) in[1])) +
 113                     ((limb) ((s32) in2[0])) * ((s32) in[4]) +
 114                     ((limb) ((s32) in2[4])) * ((s32) in[0]);
 115   output[5] =       ((limb) ((s32) in2[2])) * ((s32) in[3]) +
 116                     ((limb) ((s32) in2[3])) * ((s32) in[2]) +
 117                     ((limb) ((s32) in2[1])) * ((s32) in[4]) +
 118                     ((limb) ((s32) in2[4])) * ((s32) in[1]) +
 119                     ((limb) ((s32) in2[0])) * ((s32) in[5]) +
 120                     ((limb) ((s32) in2[5])) * ((s32) in[0]);
 121   output[6] =  2 * (((limb) ((s32) in2[3])) * ((s32) in[3]) +
 122                     ((limb) ((s32) in2[1])) * ((s32) in[5]) +
 123                     ((limb) ((s32) in2[5])) * ((s32) in[1])) +
 124                     ((limb) ((s32) in2[2])) * ((s32) in[4]) +
 125                     ((limb) ((s32) in2[4])) * ((s32) in[2]) +
 126                     ((limb) ((s32) in2[0])) * ((s32) in[6]) +
 127                     ((limb) ((s32) in2[6])) * ((s32) in[0]);
 128   output[7] =       ((limb) ((s32) in2[3])) * ((s32) in[4]) +
 129                     ((limb) ((s32) in2[4])) * ((s32) in[3]) +
 130                     ((limb) ((s32) in2[2])) * ((s32) in[5]) +
 131                     ((limb) ((s32) in2[5])) * ((s32) in[2]) +
 132                     ((limb) ((s32) in2[1])) * ((s32) in[6]) +
 133                     ((limb) ((s32) in2[6])) * ((s32) in[1]) +
 134                     ((limb) ((s32) in2[0])) * ((s32) in[7]) +
 135                     ((limb) ((s32) in2[7])) * ((s32) in[0]);
 136   output[8] =       ((limb) ((s32) in2[4])) * ((s32) in[4]) +
 137                2 * (((limb) ((s32) in2[3])) * ((s32) in[5]) +
 138                     ((limb) ((s32) in2[5])) * ((s32) in[3]) +
 139                     ((limb) ((s32) in2[1])) * ((s32) in[7]) +
 140                     ((limb) ((s32) in2[7])) * ((s32) in[1])) +
 141                     ((limb) ((s32) in2[2])) * ((s32) in[6]) +
 142                     ((limb) ((s32) in2[6])) * ((s32) in[2]) +
 143                     ((limb) ((s32) in2[0])) * ((s32) in[8]) +
 144                     ((limb) ((s32) in2[8])) * ((s32) in[0]);
 145   output[9] =       ((limb) ((s32) in2[4])) * ((s32) in[5]) +
 146                     ((limb) ((s32) in2[5])) * ((s32) in[4]) +
 147                     ((limb) ((s32) in2[3])) * ((s32) in[6]) +
 148                     ((limb) ((s32) in2[6])) * ((s32) in[3]) +
 149                     ((limb) ((s32) in2[2])) * ((s32) in[7]) +
 150                     ((limb) ((s32) in2[7])) * ((s32) in[2]) +
 151                     ((limb) ((s32) in2[1])) * ((s32) in[8]) +
 152                     ((limb) ((s32) in2[8])) * ((s32) in[1]) +
 153                     ((limb) ((s32) in2[0])) * ((s32) in[9]) +
 154                     ((limb) ((s32) in2[9])) * ((s32) in[0]);
 155   output[10] = 2 * (((limb) ((s32) in2[5])) * ((s32) in[5]) +
 156                     ((limb) ((s32) in2[3])) * ((s32) in[7]) +
 157                     ((limb) ((s32) in2[7])) * ((s32) in[3]) +
 158                     ((limb) ((s32) in2[1])) * ((s32) in[9]) +
 159                     ((limb) ((s32) in2[9])) * ((s32) in[1])) +
 160                     ((limb) ((s32) in2[4])) * ((s32) in[6]) +
 161                     ((limb) ((s32) in2[6])) * ((s32) in[4]) +
 162                     ((limb) ((s32) in2[2])) * ((s32) in[8]) +
 163                     ((limb) ((s32) in2[8])) * ((s32) in[2]);
 164   output[11] =      ((limb) ((s32) in2[5])) * ((s32) in[6]) +
 165                     ((limb) ((s32) in2[6])) * ((s32) in[5]) +
 166                     ((limb) ((s32) in2[4])) * ((s32) in[7]) +
 167                     ((limb) ((s32) in2[7])) * ((s32) in[4]) +
 168                     ((limb) ((s32) in2[3])) * ((s32) in[8]) +
 169                     ((limb) ((s32) in2[8])) * ((s32) in[3]) +
 170                     ((limb) ((s32) in2[2])) * ((s32) in[9]) +
 171                     ((limb) ((s32) in2[9])) * ((s32) in[2]);
 172   output[12] =      ((limb) ((s32) in2[6])) * ((s32) in[6]) +
 173                2 * (((limb) ((s32) in2[5])) * ((s32) in[7]) +
 174                     ((limb) ((s32) in2[7])) * ((s32) in[5]) +
 175                     ((limb) ((s32) in2[3])) * ((s32) in[9]) +
 176                     ((limb) ((s32) in2[9])) * ((s32) in[3])) +
 177                     ((limb) ((s32) in2[4])) * ((s32) in[8]) +
 178                     ((limb) ((s32) in2[8])) * ((s32) in[4]);
 179   output[13] =      ((limb) ((s32) in2[6])) * ((s32) in[7]) +
 180                     ((limb) ((s32) in2[7])) * ((s32) in[6]) +
 181                     ((limb) ((s32) in2[5])) * ((s32) in[8]) +
 182                     ((limb) ((s32) in2[8])) * ((s32) in[5]) +
 183                     ((limb) ((s32) in2[4])) * ((s32) in[9]) +
 184                     ((limb) ((s32) in2[9])) * ((s32) in[4]);
 185   output[14] = 2 * (((limb) ((s32) in2[7])) * ((s32) in[7]) +
 186                     ((limb) ((s32) in2[5])) * ((s32) in[9]) +
 187                     ((limb) ((s32) in2[9])) * ((s32) in[5])) +
 188                     ((limb) ((s32) in2[6])) * ((s32) in[8]) +
 189                     ((limb) ((s32) in2[8])) * ((s32) in[6]);
 190   output[15] =      ((limb) ((s32) in2[7])) * ((s32) in[8]) +
 191                     ((limb) ((s32) in2[8])) * ((s32) in[7]) +
 192                     ((limb) ((s32) in2[6])) * ((s32) in[9]) +
 193                     ((limb) ((s32) in2[9])) * ((s32) in[6]);
 194   output[16] =      ((limb) ((s32) in2[8])) * ((s32) in[8]) +
 195                2 * (((limb) ((s32) in2[7])) * ((s32) in[9]) +
 196                     ((limb) ((s32) in2[9])) * ((s32) in[7]));
 197   output[17] =      ((limb) ((s32) in2[8])) * ((s32) in[9]) +
 198                     ((limb) ((s32) in2[9])) * ((s32) in[8]);
 199   output[18] = 2 *  ((limb) ((s32) in2[9])) * ((s32) in[9]);
 200 }
 201
 202 /* Reduce a long form to a short form by taking the input mod 2^255 - 19. */
 203 static void freduce_degree(limb *output) {
 204   /* Each of these shifts and adds ends up multiplying the value by 19. */
 205   output[8] += output[18] << 4;
 206   output[8] += output[18] << 1;
 207   output[8] += output[18];
 208   output[7] += output[17] << 4;
 209   output[7] += output[17] << 1;
 210   output[7] += output[17];
 211   output[6] += output[16] << 4;
 212   output[6] += output[16] << 1;
 213   output[6] += output[16];
 214   output[5] += output[15] << 4;
 215   output[5] += output[15] << 1;
 216   output[5] += output[15];
 217   output[4] += output[14] << 4;
 218   output[4] += output[14] << 1;
 219   output[4] += output[14];
 220   output[3] += output[13] << 4;
 221   output[3] += output[13] << 1;
 222   output[3] += output[13];
 223   output[2] += output[12] << 4;
 224   output[2] += output[12] << 1;
 225   output[2] += output[12];
 226   output[1] += output[11] << 4;
 227   output[1] += output[11] << 1;
 228   output[1] += output[11];
 229   output[0] += output[10] << 4;
 230   output[0] += output[10] << 1;
 231   output[0] += output[10];
 232 }
 233
 234 #if (-1 & 3) != 3
 235 #error "This code only works on a two's complement system"
 236 #endif
 237
 238 /* return v / 2^26, using only shifts and adds. */
 239 static inline limb
 240 div_by_2_26(const limb v)
 241 {
 242   /* High word of v; no shift needed*/
 243   const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
 244   /* Set to all 1s if v was negative; else set to 0s. */
 245   const int32_t sign = ((int32_t) highword) >> 31;
 246   /* Set to 0x3ffffff if v was negative; else set to 0. */
 247   const int32_t roundoff = ((uint32_t) sign) >> 6;
 248   /* Should return v / (1<<26) */
 249   return (v + roundoff) >> 26;
 250 }
 251
 252 /* return v / (2^25), using only shifts and adds. */
 253 static inline limb
 254 div_by_2_25(const limb v)
 255 {
 256   /* High word of v; no shift needed*/
 257   const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
 258   /* Set to all 1s if v was negative; else set to 0s. */
 259   const int32_t sign = ((int32_t) highword) >> 31;
 260   /* Set to 0x1ffffff if v was negative; else set to 0. */
 261   const int32_t roundoff = ((uint32_t) sign) >> 7;
 262   /* Should return v / (1<<25) */
 263   return (v + roundoff) >> 25;
 264 }
 265
 266 static inline s32
 267 div_s32_by_2_25(const s32 v)
 268 {
 269    const s32 roundoff = ((uint32_t)(v >> 31)) >> 7;
 270    return (v + roundoff) >> 25;
 271 }
 272
 273 /* Reduce all coefficients of the short form input so that |x| < 2^26.
 274  *
 275  * On entry: |output[i]| < 2^62
 276  */
 277 static void freduce_coefficients(limb *output) {
 278   unsigned i;
 279
 280   output[10] = 0;
 281
 282   for (i = 0; i < 10; i += 2) {
 283     limb over = div_by_2_26(output[i]);
 284     output[i] -= over << 26;
 285     output[i+1] += over;
 286
 287     over = div_by_2_25(output[i+1]);
 288     output[i+1] -= over << 25;
 289     output[i+2] += over;
 290   }
 291   /* Now |output[10]| < 2 ^ 38 and all other coefficients are reduced. */
 292   output[0] += output[10] << 4;
 293   output[0] += output[10] << 1;
 294   output[0] += output[10];
 295
 296   output[10] = 0;
 297
 298   /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19 * 2^38
 299    * So |over| will be no more than 77825  */
 300   {
 301     limb over = div_by_2_26(output[0]);
 302     output[0] -= over << 26;
 303     output[1] += over;
 304   }
 305
 306   /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 77825
 307    * So |over| will be no more than 1. */
 308   {
 309     /* output[1] fits in 32 bits, so we can use div_s32_by_2_25 here. */
 310     s32 over32 = div_s32_by_2_25((s32) output[1]);
 311     output[1] -= over32 << 25;
 312     output[2] += over32;
 313   }
 314
 315   /* Finally, output[0,1,3..9] are reduced, and output[2] is "nearly reduced":
 316    * we have |output[2]| <= 2^26.  This is good enough for all of our math,
 317    * but it will require an extra freduce_coefficients before fcontract. */
 318 }
 319
 320 /* A helpful wrapper around fproduct: output = in * in2.
 321  *
 322  * output must be distinct to both inputs. The output is reduced degree and
 323  * reduced coefficient.
 324  */
 325 static void
 326 fmul(limb *output, const limb *in, const limb *in2) {
 327   limb t[19];
 328   fproduct(t, in, in2);
 329   freduce_degree(t);
 330   freduce_coefficients(t);
 331   memcpy(output, t, sizeof(limb) * 10);
 332 }
 333
 334 static void fsquare_inner(limb *output, const limb *in) {
 335   output[0] =       ((limb) ((s32) in[0])) * ((s32) in[0]);
 336   output[1] =  2 *  ((limb) ((s32) in[0])) * ((s32) in[1]);
 337   output[2] =  2 * (((limb) ((s32) in[1])) * ((s32) in[1]) +
 338                     ((limb) ((s32) in[0])) * ((s32) in[2]));
 339   output[3] =  2 * (((limb) ((s32) in[1])) * ((s32) in[2]) +
 340                     ((limb) ((s32) in[0])) * ((s32) in[3]));
 341   output[4] =       ((limb) ((s32) in[2])) * ((s32) in[2]) +
 342                4 *  ((limb) ((s32) in[1])) * ((s32) in[3]) +
 343                2 *  ((limb) ((s32) in[0])) * ((s32) in[4]);
 344   output[5] =  2 * (((limb) ((s32) in[2])) * ((s32) in[3]) +
 345                     ((limb) ((s32) in[1])) * ((s32) in[4]) +
 346                     ((limb) ((s32) in[0])) * ((s32) in[5]));
 347   output[6] =  2 * (((limb) ((s32) in[3])) * ((s32) in[3]) +
 348                     ((limb) ((s32) in[2])) * ((s32) in[4]) +
 349                     ((limb) ((s32) in[0])) * ((s32) in[6]) +
 350                2 *  ((limb) ((s32) in[1])) * ((s32) in[5]));
 351   output[7] =  2 * (((limb) ((s32) in[3])) * ((s32) in[4]) +
 352                     ((limb) ((s32) in[2])) * ((s32) in[5]) +
 353                     ((limb) ((s32) in[1])) * ((s32) in[6]) +
 354                     ((limb) ((s32) in[0])) * ((s32) in[7]));
 355   output[8] =       ((limb) ((s32) in[4])) * ((s32) in[4]) +
 356                2 * (((limb) ((s32) in[2])) * ((s32) in[6]) +
 357                     ((limb) ((s32) in[0])) * ((s32) in[8]) +
 358                2 * (((limb) ((s32) in[1])) * ((s32) in[7]) +
 359                     ((limb) ((s32) in[3])) * ((s32) in[5])));
 360   output[9] =  2 * (((limb) ((s32) in[4])) * ((s32) in[5]) +
 361                     ((limb) ((s32) in[3])) * ((s32) in[6]) +
 362                     ((limb) ((s32) in[2])) * ((s32) in[7]) +
 363                     ((limb) ((s32) in[1])) * ((s32) in[8]) +
 364                     ((limb) ((s32) in[0])) * ((s32) in[9]));
 365   output[10] = 2 * (((limb) ((s32) in[5])) * ((s32) in[5]) +
 366                     ((limb) ((s32) in[4])) * ((s32) in[6]) +
 367                     ((limb) ((s32) in[2])) * ((s32) in[8]) +
 368                2 * (((limb) ((s32) in[3])) * ((s32) in[7]) +
 369                     ((limb) ((s32) in[1])) * ((s32) in[9])));
 370   output[11] = 2 * (((limb) ((s32) in[5])) * ((s32) in[6]) +
 371                     ((limb) ((s32) in[4])) * ((s32) in[7]) +
 372                     ((limb) ((s32) in[3])) * ((s32) in[8]) +
 373                     ((limb) ((s32) in[2])) * ((s32) in[9]));
 374   output[12] =      ((limb) ((s32) in[6])) * ((s32) in[6]) +
 375                2 * (((limb) ((s32) in[4])) * ((s32) in[8]) +
 376                2 * (((limb) ((s32) in[5])) * ((s32) in[7]) +
 377                     ((limb) ((s32) in[3])) * ((s32) in[9])));
 378   output[13] = 2 * (((limb) ((s32) in[6])) * ((s32) in[7]) +
 379                     ((limb) ((s32) in[5])) * ((s32) in[8]) +
 380                     ((limb) ((s32) in[4])) * ((s32) in[9]));
 381   output[14] = 2 * (((limb) ((s32) in[7])) * ((s32) in[7]) +
 382                     ((limb) ((s32) in[6])) * ((s32) in[8]) +
 383                2 *  ((limb) ((s32) in[5])) * ((s32) in[9]));
 384   output[15] = 2 * (((limb) ((s32) in[7])) * ((s32) in[8]) +
 385                     ((limb) ((s32) in[6])) * ((s32) in[9]));
 386   output[16] =      ((limb) ((s32) in[8])) * ((s32) in[8]) +
 387                4 *  ((limb) ((s32) in[7])) * ((s32) in[9]);
 388   output[17] = 2 *  ((limb) ((s32) in[8])) * ((s32) in[9]);
 389   output[18] = 2 *  ((limb) ((s32) in[9])) * ((s32) in[9]);
 390 }
 391
 392 static void
 393 fsquare(limb *output, const limb *in) {
 394   limb t[19];
 395   fsquare_inner(t, in);
 396   freduce_degree(t);
 397   freduce_coefficients(t);
 398   memcpy(output, t, sizeof(limb) * 10);
 399 }
 400
 401 /* Take a little-endian, 32-byte number and expand it into polynomial form */
 402 static void
 403 fexpand(limb *output, const u8 *input) {
 404 #define F(n,start,shift,mask) \
 405   output[n] = ((((limb) input[start + 0]) | \
 406                 ((limb) input[start + 1]) << 8 | \
 407                 ((limb) input[start + 2]) << 16 | \
 408                 ((limb) input[start + 3]) << 24) >> shift) & mask;
 409   F(0, 0, 0, 0x3ffffff);
 410   F(1, 3, 2, 0x1ffffff);
 411   F(2, 6, 3, 0x3ffffff);
 412   F(3, 9, 5, 0x1ffffff);
 413   F(4, 12, 6, 0x3ffffff);
 414   F(5, 16, 0, 0x1ffffff);
 415   F(6, 19, 1, 0x3ffffff);
 416   F(7, 22, 3, 0x1ffffff);
 417   F(8, 25, 4, 0x3ffffff);
 418   F(9, 28, 6, 0x1ffffff);
 419 #undef F
 420 }
 421
 422 #if (-32 >> 1) != -16
 423 #error "This code only works when >> does sign-extension on negative numbers"
 424 #endif
 425
 426 /* Take a fully reduced polynomial form number and contract it into a
 427  * little-endian, 32-byte array
 428  */
 429 static void
 430 fcontract(u8 *output, limb *input) {
 431   int i;
 432   int j;
 433
 434   for (j = 0; j < 2; ++j) {
 435     for (i = 0; i < 9; ++i) {
 436       if ((i & 1) == 1) {
 437         /* This calculation is a time-invariant way to make input[i] positive
 438            by borrowing from the next-larger limb.
 439         */
 440         const s32 mask = (s32)(input[i]) >> 31;
 441         const s32 carry = -(((s32)(input[i]) & mask) >> 25);
 442         input[i] = (s32)(input[i]) + (carry << 25);
 443         input[i+1] = (s32)(input[i+1]) - carry;
 444       } else {
 445         const s32 mask = (s32)(input[i]) >> 31;
 446         const s32 carry = -(((s32)(input[i]) & mask) >> 26);
 447         input[i] = (s32)(input[i]) + (carry << 26);
 448         input[i+1] = (s32)(input[i+1]) - carry;
 449       }
 450     }
 451     {
 452       const s32 mask = (s32)(input[9]) >> 31;
 453       const s32 carry = -(((s32)(input[9]) & mask) >> 25);
 454       input[9] = (s32)(input[9]) + (carry << 25);
 455       input[0] = (s32)(input[0]) - (carry * 19);
 456     }
 457   }
 458
 459   /* The first borrow-propagation pass above ended with every limb
 460      except (possibly) input[0] non-negative.
 461
 462      Since each input limb except input[0] is decreased by at most 1
 463      by a borrow-propagation pass, the second borrow-propagation pass
 464      could only have wrapped around to decrease input[0] again if the
 465      first pass left input[0] negative *and* input[1] through input[9]
 466      were all zero.  In that case, input[1] is now 2^25 - 1, and this
 467      last borrow-propagation step will leave input[1] non-negative.
 468   */
 469   {
 470     const s32 mask = (s32)(input[0]) >> 31;
 471     const s32 carry = -(((s32)(input[0]) & mask) >> 26);
 472     input[0] = (s32)(input[0]) + (carry << 26);
 473     input[1] = (s32)(input[1]) - carry;
 474   }
 475
 476   /* Both passes through the above loop, plus the last 0-to-1 step, are
 477      necessary: if input[9] is -1 and input[0] through input[8] are 0,
 478      negative values will remain in the array until the end.
 479    */
 480
 481   input[1] <<= 2;
 482   input[2] <<= 3;
 483   input[3] <<= 5;
 484   input[4] <<= 6;
 485   input[6] <<= 1;
 486   input[7] <<= 3;
 487   input[8] <<= 4;
 488   input[9] <<= 6;
 489 #define F(i, s) \
 490   output[s+0] |=  input[i] & 0xff; \
 491   output[s+1]  = (input[i] >> 8) & 0xff; \
 492   output[s+2]  = (input[i] >> 16) & 0xff; \
 493   output[s+3]  = (input[i] >> 24) & 0xff;
 494   output[0] = 0;
 495   output[16] = 0;
 496   F(0,0);
 497   F(1,3);
 498   F(2,6);
 499   F(3,9);
 500   F(4,12);
 501   F(5,16);
 502   F(6,19);
 503   F(7,22);
 504   F(8,25);
 505   F(9,28);
 506 #undef F
 507 }
 508
 509 /* Input: Q, Q', Q-Q'
 510  * Output: 2Q, Q+Q'
 511  *
 512  *   x2 z3: long form
 513  *   x3 z3: long form
 514  *   x z: short form, destroyed
 515  *   xprime zprime: short form, destroyed
 516  *   qmqp: short form, preserved
 517  */
 518 static void fmonty(limb *x2, limb *z2,  /* output 2Q */
 519                    limb *x3, limb *z3,  /* output Q + Q' */
 520                    limb *x, limb *z,    /* input Q */
 521                    limb *xprime, limb *zprime,  /* input Q' */
 522                    const limb *qmqp /* input Q - Q' */) {
 523   limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19],
 524         zzprime[19], zzzprime[19], xxxprime[19];
 525
 526   memcpy(origx, x, 10 * sizeof(limb));
 527   fsum(x, z);
 528   fdifference(z, origx);  // does x - z
 529
 530   memcpy(origxprime, xprime, sizeof(limb) * 10);
 531   fsum(xprime, zprime);
 532   fdifference(zprime, origxprime);
 533   fproduct(xxprime, xprime, z);
 534   fproduct(zzprime, x, zprime);
 535   freduce_degree(xxprime);
 536   freduce_coefficients(xxprime);
 537   freduce_degree(zzprime);
 538   freduce_coefficients(zzprime);
 539   memcpy(origxprime, xxprime, sizeof(limb) * 10);
 540   fsum(xxprime, zzprime);
 541   fdifference(zzprime, origxprime);
 542   fsquare(xxxprime, xxprime);
 543   fsquare(zzzprime, zzprime);
 544   fproduct(zzprime, zzzprime, qmqp);
 545   freduce_degree(zzprime);
 546   freduce_coefficients(zzprime);
 547   memcpy(x3, xxxprime, sizeof(limb) * 10);
 548   memcpy(z3, zzprime, sizeof(limb) * 10);
 549
 550   fsquare(xx, x);
 551   fsquare(zz, z);
 552   fproduct(x2, xx, zz);
 553   freduce_degree(x2);
 554   freduce_coefficients(x2);
 555   fdifference(zz, xx);  // does zz = xx - zz
 556   memset(zzz + 10, 0, sizeof(limb) * 9);
 557   fscalar_product(zzz, zz, 121665);
 558   /* No need to call freduce_degree here:
 559      fscalar_product doesn't increase the degree of its input. */
 560   freduce_coefficients(zzz);
 561   fsum(zzz, xx);
 562   fproduct(z2, zz, zzz);
 563   freduce_degree(z2);
 564   freduce_coefficients(z2);
 565 }
 566
 567 /* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave
 568  * them unchanged if 'iswap' is 0.  Runs in data-invariant time to avoid
 569  * side-channel attacks.
 570  *
 571  * NOTE that this function requires that 'iswap' be 1 or 0; other values give
 572  * wrong results.  Also, the two limb arrays must be in reduced-coefficient,
 573  * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped,
 574  * and all all values in a[0..9],b[0..9] must have magnitude less than
 575  * INT32_MAX.
 576  */
 577 static void
 578 swap_conditional(limb a[19], limb b[19], limb iswap) {
 579   unsigned i;
 580   const s32 swap = (s32) -iswap;
 581
 582   for (i = 0; i < 10; ++i) {
 583     const s32 x = swap & ( ((s32)a[i]) ^ ((s32)b[i]) );
 584     a[i] = ((s32)a[i]) ^ x;
 585     b[i] = ((s32)b[i]) ^ x;
 586   }
 587 }
 588
 589 /* Calculates nQ where Q is the x-coordinate of a point on the curve
 590  *
 591  *   resultx/resultz: the x coordinate of the resulting curve point (short form)
 592  *   n: a little endian, 32-byte number
 593  *   q: a point of the curve (short form)
 594  */
 595 static void
 596 cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) {
 597   limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0};
 598   limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t;
 599   limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1};
 600   limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h;
 601
 602   unsigned i, j;
 603
 604   memcpy(nqpqx, q, sizeof(limb) * 10);
 605
 606   for (i = 0; i < 32; ++i) {
 607     u8 byte = n[31 - i];
 608     for (j = 0; j < 8; ++j) {
 609       const limb bit = byte >> 7;
 610
 611       swap_conditional(nqx, nqpqx, bit);
 612       swap_conditional(nqz, nqpqz, bit);
 613       fmonty(nqx2, nqz2,
 614              nqpqx2, nqpqz2,
 615              nqx, nqz,
 616              nqpqx, nqpqz,
 617              q);
 618       swap_conditional(nqx2, nqpqx2, bit);
 619       swap_conditional(nqz2, nqpqz2, bit);
 620
 621       t = nqx;
 622       nqx = nqx2;
 623       nqx2 = t;
 624       t = nqz;
 625       nqz = nqz2;
 626       nqz2 = t;
 627       t = nqpqx;
 628       nqpqx = nqpqx2;
 629       nqpqx2 = t;
 630       t = nqpqz;
 631       nqpqz = nqpqz2;
 632       nqpqz2 = t;
 633
 634       byte <<= 1;
 635     }
 636   }
 637
 638   memcpy(resultx, nqx, sizeof(limb) * 10);
 639   memcpy(resultz, nqz, sizeof(limb) * 10);
 640 }
 641
 642 // -----------------------------------------------------------------------------
 643 // Shamelessly copied from djb's code
 644 // -----------------------------------------------------------------------------
 645 static void
 646 crecip(limb *out, const limb *z) {
 647   limb z2[10];
 648   limb z9[10];
 649   limb z11[10];
 650   limb z2_5_0[10];
 651   limb z2_10_0[10];
 652   limb z2_20_0[10];
 653   limb z2_50_0[10];
 654   limb z2_100_0[10];
 655   limb t0[10];
 656   limb t1[10];
 657   int i;
 658
 659   /* 2 */ fsquare(z2,z);
 660   /* 4 */ fsquare(t1,z2);
 661   /* 8 */ fsquare(t0,t1);
 662   /* 9 */ fmul(z9,t0,z);
 663   /* 11 */ fmul(z11,z9,z2);
 664   /* 22 */ fsquare(t0,z11);
 665   /* 2^5 - 2^0 = 31 */ fmul(z2_5_0,t0,z9);
 666
 667   /* 2^6 - 2^1 */ fsquare(t0,z2_5_0);
 668   /* 2^7 - 2^2 */ fsquare(t1,t0);
 669   /* 2^8 - 2^3 */ fsquare(t0,t1);
 670   /* 2^9 - 2^4 */ fsquare(t1,t0);
 671   /* 2^10 - 2^5 */ fsquare(t0,t1);
 672   /* 2^10 - 2^0 */ fmul(z2_10_0,t0,z2_5_0);
 673
 674   /* 2^11 - 2^1 */ fsquare(t0,z2_10_0);
 675   /* 2^12 - 2^2 */ fsquare(t1,t0);
 676   /* 2^20 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
 677   /* 2^20 - 2^0 */ fmul(z2_20_0,t1,z2_10_0);
 678
 679   /* 2^21 - 2^1 */ fsquare(t0,z2_20_0);
 680   /* 2^22 - 2^2 */ fsquare(t1,t0);
 681   /* 2^40 - 2^20 */ for (i = 2;i < 20;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
 682   /* 2^40 - 2^0 */ fmul(t0,t1,z2_20_0);
 683
 684   /* 2^41 - 2^1 */ fsquare(t1,t0);
 685   /* 2^42 - 2^2 */ fsquare(t0,t1);
 686   /* 2^50 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
 687   /* 2^50 - 2^0 */ fmul(z2_50_0,t0,z2_10_0);
 688
 689   /* 2^51 - 2^1 */ fsquare(t0,z2_50_0);
 690   /* 2^52 - 2^2 */ fsquare(t1,t0);
 691   /* 2^100 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
 692   /* 2^100 - 2^0 */ fmul(z2_100_0,t1,z2_50_0);
 693
 694   /* 2^101 - 2^1 */ fsquare(t1,z2_100_0);
 695   /* 2^102 - 2^2 */ fsquare(t0,t1);
 696   /* 2^200 - 2^100 */ for (i = 2;i < 100;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
 697   /* 2^200 - 2^0 */ fmul(t1,t0,z2_100_0);
 698
 699   /* 2^201 - 2^1 */ fsquare(t0,t1);
 700   /* 2^202 - 2^2 */ fsquare(t1,t0);
 701   /* 2^250 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
 702   /* 2^250 - 2^0 */ fmul(t0,t1,z2_50_0);
 703
 704   /* 2^251 - 2^1 */ fsquare(t1,t0);
 705   /* 2^252 - 2^2 */ fsquare(t0,t1);
 706   /* 2^253 - 2^3 */ fsquare(t1,t0);
 707   /* 2^254 - 2^4 */ fsquare(t0,t1);
 708   /* 2^255 - 2^5 */ fsquare(t1,t0);
 709   /* 2^255 - 21 */ fmul(out,t1,z11);
 710 }
 711
 712 int curve25519_donna(u8 *, const u8 *, const u8 *);
 713
 714 int
 715 curve25519_donna(u8 *mypublic, const u8 *secret, const u8 *basepoint) {
 716   limb bp[10], x[10], z[11], zmone[10];
 717   uint8_t e[32];
 718   int i;
 719
 720   for (i = 0; i < 32; ++i) e[i] = secret[i];
 721   e[0] &= 248;
 722   e[31] &= 127;
 723   e[31] |= 64;
 724
 725   fexpand(bp, basepoint);
 726   cmult(x, z, e, bp);
 727   crecip(zmone, z);
 728   fmul(z, x, zmone);
 729   freduce_coefficients(z);
 730   fcontract(mypublic, z);
 731   return 0;
 732 }