libgo/go/crypto/elliptic/p256.go

   1 // Copyright 2013 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // -build !amd64
   6
   7 package elliptic
   8
   9 // This file contains a constant-time, 32-bit implementation of P256.
  10
  11 import (
  12         "math/big"
  13 )
  14
// p256Curve embeds *CurveParams and carries the P-256 specific Params,
// ScalarBaseMult and ScalarMult methods defined in this file.
type p256Curve struct {
	*CurveParams
}
  18
var (
	// p256Params holds the P-256 curve parameters. It is nil until initP256
	// has run.
	p256Params *CurveParams

	// p256RInverse contains 1/R mod p - the inverse of the Montgomery constant
	// (2**257). It is nil until initP256 has run.
	p256RInverse *big.Int
)
  26
  27 func initP256() {
  28         // See FIPS 186-3, section D.2.3
  29         p256Params = &CurveParams{Name: "P-256"}
  30         p256Params.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10)
  31         p256Params.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10)
  32         p256Params.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16)
  33         p256Params.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16)
  34         p256Params.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16)
  35         p256Params.BitSize = 256
  36
  37         p256RInverse, _ = new(big.Int).SetString("7fffffff00000001fffffffe8000000100000000ffffffff0000000180000000", 16)
  38
  39         // Arch-specific initialization, i.e. let a platform dynamically pick a P256 implementation
  40         initP256Arch()
  41 }
  42
  43 func (curve p256Curve) Params() *CurveParams {
  44         return curve.CurveParams
  45 }
  46
  47 // p256GetScalar endian-swaps the big-endian scalar value from in and writes it
  48 // to out. If the scalar is equal or greater than the order of the group, it's
  49 // reduced modulo that order.
  50 func p256GetScalar(out *[32]byte, in []byte) {
  51         n := new(big.Int).SetBytes(in)
  52         var scalarBytes []byte
  53
  54         if n.Cmp(p256Params.N) >= 0 {
  55                 n.Mod(n, p256Params.N)
  56                 scalarBytes = n.Bytes()
  57         } else {
  58                 scalarBytes = in
  59         }
  60
  61         for i, v := range scalarBytes {
  62                 out[len(scalarBytes)-(1+i)] = v
  63         }
  64 }
  65
  66 func (p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
  67         var scalarReversed [32]byte
  68         p256GetScalar(&scalarReversed, scalar)
  69
  70         var x1, y1, z1 [p256Limbs]uint32
  71         p256ScalarBaseMult(&x1, &y1, &z1, &scalarReversed)
  72         return p256ToAffine(&x1, &y1, &z1)
  73 }
  74
  75 func (p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
  76         var scalarReversed [32]byte
  77         p256GetScalar(&scalarReversed, scalar)
  78
  79         var px, py, x1, y1, z1 [p256Limbs]uint32
  80         p256FromBig(&px, bigX)
  81         p256FromBig(&py, bigY)
  82         p256ScalarMult(&x1, &y1, &z1, &px, &py, &scalarReversed)
  83         return p256ToAffine(&x1, &y1, &z1)
  84 }
  85
  86 // Field elements are represented as nine, unsigned 32-bit words.
  87 //
// The value of a field element is:
  89 //   x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
  90 //
  91 // That is, each limb is alternately 29 or 28-bits wide in little-endian
  92 // order.
  93 //
  94 // This means that a field element hits 2**257, rather than 2**256 as we would
  95 // like. A 28, 29, ... pattern would cause us to hit 2**256, but that causes
  96 // problems when multiplying as terms end up one bit short of a limb which
  97 // would require much bit-shifting to correct.
  98 //
  99 // Finally, the values stored in a field element are in Montgomery form. So the
 100 // value |y| is stored as (y*R) mod p, where p is the P-256 prime and R is
 101 // 2**257.
 102
 103 const (
 104         p256Limbs    = 9
 105         bottom29Bits = 0x1fffffff
 106 )
 107
var (
	// p256One is the number 1 as a field element, and p256Zero is the number
	// 0. Field elements are held in Montgomery form, so p256One holds the
	// limbs of R mod p rather than a literal 1.
	p256One  = [p256Limbs]uint32{2, 0, 0, 0xffff800, 0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff, 0}
	p256Zero = [p256Limbs]uint32{0, 0, 0, 0, 0, 0, 0, 0, 0}
	// p256P is the prime modulus as a field element.
	p256P = [p256Limbs]uint32{0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff, 0, 0, 0x200000, 0xf000000, 0xfffffff}
	// p2562P is twice the prime modulus as a field element.
	p2562P = [p256Limbs]uint32{0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff, 0, 0, 0x400000, 0xe000000, 0x1fffffff}
)
 117
 118 // p256Precomputed contains precomputed values to aid the calculation of scalar
 119 // multiples of the base point, G. It's actually two, equal length, tables
 120 // concatenated.
 121 //
// The first table contains (x,y) field element pairs for the multiples of the
// base point, G, listed below. Index 0 (0G) is not stored, so each table holds
// 15 pairs.
 124 //
 125 //   Index  |  Index (binary) | Value
 126 //       0  |           0000  | 0G (all zeros, omitted)
 127 //       1  |           0001  | G
 128 //       2  |           0010  | 2**64G
 129 //       3  |           0011  | 2**64G + G
 130 //       4  |           0100  | 2**128G
 131 //       5  |           0101  | 2**128G + G
 132 //       6  |           0110  | 2**128G + 2**64G
 133 //       7  |           0111  | 2**128G + 2**64G + G
 134 //       8  |           1000  | 2**192G
 135 //       9  |           1001  | 2**192G + G
 136 //      10  |           1010  | 2**192G + 2**64G
 137 //      11  |           1011  | 2**192G + 2**64G + G
 138 //      12  |           1100  | 2**192G + 2**128G
 139 //      13  |           1101  | 2**192G + 2**128G + G
 140 //      14  |           1110  | 2**192G + 2**128G + 2**64G
 141 //      15  |           1111  | 2**192G + 2**128G + 2**64G + G
 142 //
 143 // The second table follows the same style, but the terms are 2**32G,
 144 // 2**96G, 2**160G, 2**224G.
 145 //
 146 // This is ~2KB of data.
 147 var p256Precomputed = [p256Limbs * 2 * 15 * 2]uint32{
 148         0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee,
 149         0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3,
 150         0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c,
 151         0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22,
 152         0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050,
 153         0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b,
 154         0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa,
 155         0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2,
 156         0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609,
 157         0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581,
 158         0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca,
 159         0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33,
 160         0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6,
 161         0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd,
 162         0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
 163         0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881,
 164         0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a,
 165         0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26,
 166         0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b,
 167         0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023,
 168         0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133,
 169         0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa,
 170         0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
 171         0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc,
 172         0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8,
 173         0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59,
 174         0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39,
 175         0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689,
 176         0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa,
 177         0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3,
 178         0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1,
 179         0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f,
 180         0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
 181         0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d,
 182         0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b,
 183         0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
 184         0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a,
 185         0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f,
 186         0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb,
 187         0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc,
 188         0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9,
 189         0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce,
 190         0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2,
 191         0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca,
 192         0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229,
 193         0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57,
 194         0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c,
 195         0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa,
 196         0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651,
 197         0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
 198         0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7,
 199         0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c,
 200         0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927,
 201         0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298,
 202         0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8,
 203         0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2,
 204         0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d,
 205         0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4,
 206         0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8,
 207         0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78,
 208 }
 209
 210 // Field element operations:
 211
 212 // nonZeroToAllOnes returns:
 213 //   0xffffffff for 0 < x <= 2**31
 214 //   0 for x == 0 or x > 2**31.
 215 func nonZeroToAllOnes(x uint32) uint32 {
 216         return ((x - 1) >> 31) - 1
 217 }
 218
 219 // p256ReduceCarry adds a multiple of p in order to cancel |carry|,
 220 // which is a term at 2**257.
 221 //
 222 // On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
 223 // On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29.
 224 func p256ReduceCarry(inout *[p256Limbs]uint32, carry uint32) {
 225         carry_mask := nonZeroToAllOnes(carry)
 226
 227         inout[0] += carry << 1
 228         inout[3] += 0x10000000 & carry_mask
 229         // carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
 230         // previous line therefore this doesn't underflow.
 231         inout[3] -= carry << 11
 232         inout[4] += (0x20000000 - 1) & carry_mask
 233         inout[5] += (0x10000000 - 1) & carry_mask
 234         inout[6] += (0x20000000 - 1) & carry_mask
 235         inout[6] -= carry << 22
 236         // This may underflow if carry is non-zero but, if so, we'll fix it in the
 237         // next line.
 238         inout[7] -= 1 & carry_mask
 239         inout[7] += carry << 25
 240 }
 241
 242 // p256Sum sets out = in+in2.
 243 //
 244 // On entry, in[i]+in2[i] must not overflow a 32-bit word.
 245 // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
 246 func p256Sum(out, in, in2 *[p256Limbs]uint32) {
 247         carry := uint32(0)
 248         for i := 0; ; i++ {
 249                 out[i] = in[i] + in2[i]
 250                 out[i] += carry
 251                 carry = out[i] >> 29
 252                 out[i] &= bottom29Bits
 253
 254                 i++
 255                 if i == p256Limbs {
 256                         break
 257                 }
 258
 259                 out[i] = in[i] + in2[i]
 260                 out[i] += carry
 261                 carry = out[i] >> 28
 262                 out[i] &= bottom28Bits
 263         }
 264
 265         p256ReduceCarry(out, carry)
 266 }
 267
 268 const (
 269         two30m2    = 1<<30 - 1<<2
 270         two30p13m2 = 1<<30 + 1<<13 - 1<<2
 271         two31m2    = 1<<31 - 1<<2
 272         two31p24m2 = 1<<31 + 1<<24 - 1<<2
 273         two30m27m2 = 1<<30 - 1<<27 - 1<<2
 274 )
 275
 276 // p256Zero31 is 0 mod p.
 277 var p256Zero31 = [p256Limbs]uint32{two31m3, two30m2, two31m2, two30p13m2, two31m2, two30m2, two31p24m2, two30m27m2, two31m2}
 278
 279 // p256Diff sets out = in-in2.
 280 //
 281 // On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
 282 //           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
 283 // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 284 func p256Diff(out, in, in2 *[p256Limbs]uint32) {
 285         var carry uint32
 286
 287         for i := 0; ; i++ {
 288                 out[i] = in[i] - in2[i]
 289                 out[i] += p256Zero31[i]
 290                 out[i] += carry
 291                 carry = out[i] >> 29
 292                 out[i] &= bottom29Bits
 293
 294                 i++
 295                 if i == p256Limbs {
 296                         break
 297                 }
 298
 299                 out[i] = in[i] - in2[i]
 300                 out[i] += p256Zero31[i]
 301                 out[i] += carry
 302                 carry = out[i] >> 28
 303                 out[i] &= bottom28Bits
 304         }
 305
 306         p256ReduceCarry(out, carry)
 307 }
 308
// p256ReduceDegree sets out = tmp/R mod p where tmp contains 64-bit words with
// the same 29,28,... bit positions as a field element.
//
// The values in field elements are in Montgomery form: x*R mod p where R =
// 2**257. Since we just multiplied two Montgomery values together, the result
// is x*y*R*R mod p. We wish to divide by R in order for the result also to be
// in Montgomery form.
//
// The function works in three stages: it first splits the 64-bit words of tmp
// into 18 non-overlapping 29/28-bit limbs (tmp2), then clears the lowest nine
// limbs by adding multiples of p, and finally copies the upper limbs down
// into out, passing the last carry to p256ReduceCarry.
//
// On entry: tmp[i] < 2**64
// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
	// The following table may be helpful when reading this code:
	//
	// Limb number:   0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
	// Width (bits):  29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
	// Start bit:     0 | 29| 57| 86|114|143|171|200|228|257|285
	//   (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285
	var tmp2 [18]uint32
	var carry, x, xMask uint32

	// tmp contains 64-bit words with the same 29,28,29-bit positions as a
	// field element. So the top of an element of tmp might overlap with
	// another element two positions down. The following loop eliminates
	// this overlap.
	tmp2[0] = uint32(tmp[0]) & bottom29Bits

	// Limb 1 collects bits 29 and up of tmp[0] (three bits from the low word,
	// the rest from the high word) plus the low 28 bits of tmp[1].
	tmp2[1] = uint32(tmp[0]) >> 29
	tmp2[1] |= (uint32(tmp[0]>>32) << 3) & bottom28Bits
	tmp2[1] += uint32(tmp[1]) & bottom28Bits
	carry = tmp2[1] >> 28
	tmp2[1] &= bottom28Bits

	// Each pass handles one even (29-bit) limb and then one odd (28-bit)
	// limb; i is advanced inside the body as well as by the loop header.
	for i := 2; i < 17; i++ {
		tmp2[i] = (uint32(tmp[i-2] >> 32)) >> 25
		tmp2[i] += (uint32(tmp[i-1])) >> 28
		tmp2[i] += (uint32(tmp[i-1]>>32) << 4) & bottom29Bits
		tmp2[i] += uint32(tmp[i]) & bottom29Bits
		tmp2[i] += carry
		carry = tmp2[i] >> 29
		tmp2[i] &= bottom29Bits

		i++
		if i == 17 {
			break
		}
		tmp2[i] = uint32(tmp[i-2]>>32) >> 25
		tmp2[i] += uint32(tmp[i-1]) >> 29
		tmp2[i] += ((uint32(tmp[i-1] >> 32)) << 3) & bottom28Bits
		tmp2[i] += uint32(tmp[i]) & bottom28Bits
		tmp2[i] += carry
		carry = tmp2[i] >> 28
		tmp2[i] &= bottom28Bits
	}

	// The top limb takes whatever is left of tmp[15] and tmp[16]; it is not
	// masked here.
	tmp2[17] = uint32(tmp[15]>>32) >> 25
	tmp2[17] += uint32(tmp[16]) >> 29
	tmp2[17] += uint32(tmp[16]>>32) << 3
	tmp2[17] += carry

	// Montgomery elimination of terms:
	//
	// Since R is 2**257, we can divide by R with a bitwise shift if we can
	// ensure that the right-most 257 bits are all zero. We can make that true
	// by adding multiples of p without affecting the value.
	//
	// So we eliminate limbs from right to left. Since the bottom 29 bits of p
	// are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
	// We can do that for 8 further limbs and then right shift to eliminate the
	// extra factor of R.
	for i := 0; ; i += 2 {
		tmp2[i+1] += tmp2[i] >> 29
		x = tmp2[i] & bottom29Bits
		xMask = nonZeroToAllOnes(x)
		tmp2[i] = 0

		// The bounds calculations for this loop are tricky. Each iteration of
		// the loop eliminates two words by adding values to words to their
		// right.
		//
		// The following table contains the amounts added to each word (as an
		// offset from the value of i at the top of the loop). The amounts are
		// accounted for from the first and second half of the loop separately
		// and are written as, for example, 28 to mean a value <2**28.
		//
		// Word:                   3   4   5   6   7   8   9   10
		// Added in top half:     28  11      29  21  29  28
		//                                        28  29
		//                                            29
		// Added in bottom half:      29  10      28  21  28   28
		//                                            29
		//
		// The value that is currently offset 7 will be offset 5 for the next
		// iteration and then offset 3 for the iteration after that. Therefore
		// the total value added will be the values added at 7, 5 and 3.
		//
		// The following table accumulates these values. The sums at the bottom
		// are written as, for example, 29+28, to mean a value < 2**29+2**28.
		//
		// Word:                   3   4   5   6   7   8   9  10  11  12  13
		//                        28  11  10  29  21  29  28  28  28  28  28
		//                            29  28  11  28  29  28  29  28  29  28
		//                                    29  28  21  21  29  21  29  21
		//                                        10  29  28  21  28  21  28
		//                                        28  29  28  29  28  29  28
		//                                            11  10  29  10  29  10
		//                                            29  28  11  28  11
		//                                                    29      29
		//                        --------------------------------------------
		//                                                30+ 31+ 30+ 31+ 30+
		//                                                28+ 29+ 28+ 29+ 21+
		//                                                21+ 28+ 21+ 28+ 10
		//                                                10  21+ 10  21+
		//                                                    11      11
		//
		// So the greatest amount is added to tmp2[10] and tmp2[12]. If
		// tmp2[10/12] has an initial value of <2**29, then the maximum value
		// will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
		// as required.
		tmp2[i+3] += (x << 10) & bottom28Bits
		tmp2[i+4] += (x >> 18)

		tmp2[i+6] += (x << 21) & bottom29Bits
		tmp2[i+7] += x >> 8

		// At position 200, which is the starting bit position for word 7, we
		// have a factor of 0xf000000 = 2**28 - 2**24.
		tmp2[i+7] += 0x10000000 & xMask
		tmp2[i+8] += (x - 1) & xMask
		tmp2[i+7] -= (x << 24) & bottom28Bits
		tmp2[i+8] -= x >> 4

		tmp2[i+8] += 0x20000000 & xMask
		tmp2[i+8] -= x
		tmp2[i+8] += (x << 28) & bottom29Bits
		tmp2[i+9] += ((x >> 1) - 1) & xMask

		// Nine limbs are eliminated in total, so the last pass (i == 8) only
		// runs the even-limb half above.
		if i+1 == p256Limbs {
			break
		}
		tmp2[i+2] += tmp2[i+1] >> 28
		x = tmp2[i+1] & bottom28Bits
		xMask = nonZeroToAllOnes(x)
		tmp2[i+1] = 0

		tmp2[i+4] += (x << 11) & bottom29Bits
		tmp2[i+5] += (x >> 18)

		tmp2[i+7] += (x << 21) & bottom28Bits
		tmp2[i+8] += x >> 7

		// At position 199, which is the starting bit of the 8th word when
		// dealing with a context starting on an odd word, we have a factor of
		// 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
		// word from i+1 is i+8.
		tmp2[i+8] += 0x20000000 & xMask
		tmp2[i+9] += (x - 1) & xMask
		tmp2[i+8] -= (x << 25) & bottom29Bits
		tmp2[i+9] -= x >> 4

		tmp2[i+9] += 0x10000000 & xMask
		tmp2[i+9] -= x
		tmp2[i+10] += (x - 1) & xMask
	}

	// We merge the right shift with a carry chain. The words above 2**257 have
	// widths of 28,29,... which we need to correct when copying them down.
	carry = 0
	for i := 0; i < 8; i++ {
		// The maximum value of tmp2[i + 9] occurs on the first iteration and
		// is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
		// therefore safe.
		out[i] = tmp2[i+9]
		out[i] += carry
		out[i] += (tmp2[i+10] << 28) & bottom29Bits
		carry = out[i] >> 29
		out[i] &= bottom29Bits

		// The lowest bit of the next word was consumed just above, hence the
		// shift by one before it is copied down.
		i++
		out[i] = tmp2[i+9] >> 1
		out[i] += carry
		carry = out[i] >> 28
		out[i] &= bottom28Bits
	}

	out[8] = tmp2[17]
	out[8] += carry
	carry = out[8] >> 29
	out[8] &= bottom29Bits

	p256ReduceCarry(out, carry)
}
 500
// p256Square sets out=in*in.
//
// The 17 partial-product columns are accumulated in 64-bit words and then
// handed to p256ReduceDegree, which writes the reduced result to out.
//
// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
func p256Square(out, in *[p256Limbs]uint32) {
	var tmp [17]uint64

	// Column k sums in[i]*in[j] over i+j == k. Two kinds of doubling appear
	// below:
	//   - a cross term (i != j) occurs twice in the square, so it is written
	//     once and shifted left by one;
	//   - when i and j are both odd, the start bits of the two limbs add up
	//     to one more than the start bit of limb i+j (e.g. 29+29 = 57+1), so
	//     the product is shifted left by one more.
	tmp[0] = uint64(in[0]) * uint64(in[0])
	tmp[1] = uint64(in[0]) * (uint64(in[1]) << 1)
	tmp[2] = uint64(in[0])*(uint64(in[2])<<1) +
		uint64(in[1])*(uint64(in[1])<<1)
	tmp[3] = uint64(in[0])*(uint64(in[3])<<1) +
		uint64(in[1])*(uint64(in[2])<<1)
	tmp[4] = uint64(in[0])*(uint64(in[4])<<1) +
		uint64(in[1])*(uint64(in[3])<<2) +
		uint64(in[2])*uint64(in[2])
	tmp[5] = uint64(in[0])*(uint64(in[5])<<1) +
		uint64(in[1])*(uint64(in[4])<<1) +
		uint64(in[2])*(uint64(in[3])<<1)
	tmp[6] = uint64(in[0])*(uint64(in[6])<<1) +
		uint64(in[1])*(uint64(in[5])<<2) +
		uint64(in[2])*(uint64(in[4])<<1) +
		uint64(in[3])*(uint64(in[3])<<1)
	tmp[7] = uint64(in[0])*(uint64(in[7])<<1) +
		uint64(in[1])*(uint64(in[6])<<1) +
		uint64(in[2])*(uint64(in[5])<<1) +
		uint64(in[3])*(uint64(in[4])<<1)
	// tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
	// which is < 2**64 as required.
	tmp[8] = uint64(in[0])*(uint64(in[8])<<1) +
		uint64(in[1])*(uint64(in[7])<<2) +
		uint64(in[2])*(uint64(in[6])<<1) +
		uint64(in[3])*(uint64(in[5])<<2) +
		uint64(in[4])*uint64(in[4])
	tmp[9] = uint64(in[1])*(uint64(in[8])<<1) +
		uint64(in[2])*(uint64(in[7])<<1) +
		uint64(in[3])*(uint64(in[6])<<1) +
		uint64(in[4])*(uint64(in[5])<<1)
	tmp[10] = uint64(in[2])*(uint64(in[8])<<1) +
		uint64(in[3])*(uint64(in[7])<<2) +
		uint64(in[4])*(uint64(in[6])<<1) +
		uint64(in[5])*(uint64(in[5])<<1)
	tmp[11] = uint64(in[3])*(uint64(in[8])<<1) +
		uint64(in[4])*(uint64(in[7])<<1) +
		uint64(in[5])*(uint64(in[6])<<1)
	tmp[12] = uint64(in[4])*(uint64(in[8])<<1) +
		uint64(in[5])*(uint64(in[7])<<2) +
		uint64(in[6])*uint64(in[6])
	tmp[13] = uint64(in[5])*(uint64(in[8])<<1) +
		uint64(in[6])*(uint64(in[7])<<1)
	tmp[14] = uint64(in[6])*(uint64(in[8])<<1) +
		uint64(in[7])*(uint64(in[7])<<1)
	tmp[15] = uint64(in[7]) * (uint64(in[8]) << 1)
	tmp[16] = uint64(in[8]) * uint64(in[8])

	p256ReduceDegree(out, tmp)
}
 558
 559 // p256Mul sets out=in*in2.
 560 //
 561 // On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
 562 //           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
 563 // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 564 func p256Mul(out, in, in2 *[p256Limbs]uint32) {
 565         var tmp [17]uint64
 566
 567         tmp[0] = uint64(in[0]) * uint64(in2[0])
 568         tmp[1] = uint64(in[0])*(uint64(in2[1])<<0) +
 569                 uint64(in[1])*(uint64(in2[0])<<0)
 570         tmp[2] = uint64(in[0])*(uint64(in2[2])<<0) +
 571                 uint64(in[1])*(uint64(in2[1])<<1) +
 572                 uint64(in[2])*(uint64(in2[0])<<0)
 573         tmp[3] = uint64(in[0])*(uint64(in2[3])<<0) +
 574                 uint64(in[1])*(uint64(in2[2])<<0) +
 575                 uint64(in[2])*(uint64(in2[1])<<0) +
 576                 uint64(in[3])*(uint64(in2[0])<<0)
 577         tmp[4] = uint64(in[0])*(uint64(in2[4])<<0) +
 578                 uint64(in[1])*(uint64(in2[3])<<1) +
 579                 uint64(in[2])*(uint64(in2[2])<<0) +
 580                 uint64(in[3])*(uint64(in2[1])<<1) +
 581                 uint64(in[4])*(uint64(in2[0])<<0)
 582         tmp[5] = uint64(in[0])*(uint64(in2[5])<<0) +
 583                 uint64(in[1])*(uint64(in2[4])<<0) +
 584                 uint64(in[2])*(uint64(in2[3])<<0) +
 585                 uint64(in[3])*(uint64(in2[2])<<0) +
 586                 uint64(in[4])*(uint64(in2[1])<<0) +
 587                 uint64(in[5])*(uint64(in2[0])<<0)
 588         tmp[6] = uint64(in[0])*(uint64(in2[6])<<0) +
 589                 uint64(in[1])*(uint64(in2[5])<<1) +
 590                 uint64(in[2])*(uint64(in2[4])<<0) +
 591                 uint64(in[3])*(uint64(in2[3])<<1) +
 592                 uint64(in[4])*(uint64(in2[2])<<0) +
 593                 uint64(in[5])*(uint64(in2[1])<<1) +
 594                 uint64(in[6])*(uint64(in2[0])<<0)
 595         tmp[7] = uint64(in[0])*(uint64(in2[7])<<0) +
 596                 uint64(in[1])*(uint64(in2[6])<<0) +
 597                 uint64(in[2])*(uint64(in2[5])<<0) +
 598                 uint64(in[3])*(uint64(in2[4])<<0) +
 599                 uint64(in[4])*(uint64(in2[3])<<0) +
 600                 uint64(in[5])*(uint64(in2[2])<<0) +
 601                 uint64(in[6])*(uint64(in2[1])<<0) +
 602                 uint64(in[7])*(uint64(in2[0])<<0)
 603         // tmp[8] has the greatest value but doesn't overflow. See logic in
 604         // p256Square.
 605         tmp[8] = uint64(in[0])*(uint64(in2[8])<<0) +
 606                 uint64(in[1])*(uint64(in2[7])<<1) +
 607                 uint64(in[2])*(uint64(in2[6])<<0) +
 608                 uint64(in[3])*(uint64(in2[5])<<1) +
 609                 uint64(in[4])*(uint64(in2[4])<<0) +
 610                 uint64(in[5])*(uint64(in2[3])<<1) +
 611                 uint64(in[6])*(uint64(in2[2])<<0) +
 612                 uint64(in[7])*(uint64(in2[1])<<1) +
 613                 uint64(in[8])*(uint64(in2[0])<<0)
 614         tmp[9] = uint64(in[1])*(uint64(in2[8])<<0) +
 615                 uint64(in[2])*(uint64(in2[7])<<0) +
 616                 uint64(in[3])*(uint64(in2[6])<<0) +
 617                 uint64(in[4])*(uint64(in2[5])<<0) +
 618                 uint64(in[5])*(uint64(in2[4])<<0) +
 619                 uint64(in[6])*(uint64(in2[3])<<0) +
 620                 uint64(in[7])*(uint64(in2[2])<<0) +
 621                 uint64(in[8])*(uint64(in2[1])<<0)
 622         tmp[10] = uint64(in[2])*(uint64(in2[8])<<0) +
 623                 uint64(in[3])*(uint64(in2[7])<<1) +
 624                 uint64(in[4])*(uint64(in2[6])<<0) +
 625                 uint64(in[5])*(uint64(in2[5])<<1) +
 626                 uint64(in[6])*(uint64(in2[4])<<0) +
 627                 uint64(in[7])*(uint64(in2[3])<<1) +
 628                 uint64(in[8])*(uint64(in2[2])<<0)
 629         tmp[11] = uint64(in[3])*(uint64(in2[8])<<0) +
 630                 uint64(in[4])*(uint64(in2[7])<<0) +
 631                 uint64(in[5])*(uint64(in2[6])<<0) +
 632                 uint64(in[6])*(uint64(in2[5])<<0) +
 633                 uint64(in[7])*(uint64(in2[4])<<0) +
 634                 uint64(in[8])*(uint64(in2[3])<<0)
 635         tmp[12] = uint64(in[4])*(uint64(in2[8])<<0) +
 636                 uint64(in[5])*(uint64(in2[7])<<1) +
 637                 uint64(in[6])*(uint64(in2[6])<<0) +
 638                 uint64(in[7])*(uint64(in2[5])<<1) +
 639                 uint64(in[8])*(uint64(in2[4])<<0)
 640         tmp[13] = uint64(in[5])*(uint64(in2[8])<<0) +
 641                 uint64(in[6])*(uint64(in2[7])<<0) +
 642                 uint64(in[7])*(uint64(in2[6])<<0) +
 643                 uint64(in[8])*(uint64(in2[5])<<0)
 644         tmp[14] = uint64(in[6])*(uint64(in2[8])<<0) +
 645                 uint64(in[7])*(uint64(in2[7])<<1) +
 646                 uint64(in[8])*(uint64(in2[6])<<0)
 647         tmp[15] = uint64(in[7])*(uint64(in2[8])<<0) +
 648                 uint64(in[8])*(uint64(in2[7])<<0)
 649         tmp[16] = uint64(in[8]) * (uint64(in2[8]) << 0)
 650
 651         p256ReduceDegree(out, tmp)
 652 }
 653
 654 func p256Assign(out, in *[p256Limbs]uint32) {
 655         *out = *in
 656 }
 657
// p256Invert calculates |out| = |in|^{-1}
//
// Based on Fermat's Little Theorem:
//   a^p = a (mod p)
//   a^{p-1} = 1 (mod p)
//   a^{p-2} = a^{-1} (mod p)
//
// The exponentiation is a fixed chain of squarings and multiplications: the
// loop bounds are constants and there is no branch on the value of |in|. The
// trailing comment on each step gives the exponent of |in| that the
// destination holds once that step has run.
func p256Invert(out, in *[p256Limbs]uint32) {
	var ftmp, ftmp2 [p256Limbs]uint32

	// each e_I holds |in|^{2^I - 1}, except e64, which is captured before the
	// final multiplication and therefore holds |in|^{2^64 - 2^32}.
	var e2, e4, e8, e16, e32, e64 [p256Limbs]uint32

	p256Square(&ftmp, in)     // 2^1
	p256Mul(&ftmp, in, &ftmp) // 2^2 - 2^0
	p256Assign(&e2, &ftmp)
	p256Square(&ftmp, &ftmp)   // 2^3 - 2^1
	p256Square(&ftmp, &ftmp)   // 2^4 - 2^2
	p256Mul(&ftmp, &ftmp, &e2) // 2^4 - 2^0
	p256Assign(&e4, &ftmp)
	p256Square(&ftmp, &ftmp)   // 2^5 - 2^1
	p256Square(&ftmp, &ftmp)   // 2^6 - 2^2
	p256Square(&ftmp, &ftmp)   // 2^7 - 2^3
	p256Square(&ftmp, &ftmp)   // 2^8 - 2^4
	p256Mul(&ftmp, &ftmp, &e4) // 2^8 - 2^0
	p256Assign(&e8, &ftmp)
	for i := 0; i < 8; i++ {
		p256Square(&ftmp, &ftmp)
	} // 2^16 - 2^8
	p256Mul(&ftmp, &ftmp, &e8) // 2^16 - 2^0
	p256Assign(&e16, &ftmp)
	for i := 0; i < 16; i++ {
		p256Square(&ftmp, &ftmp)
	} // 2^32 - 2^16
	p256Mul(&ftmp, &ftmp, &e16) // 2^32 - 2^0
	p256Assign(&e32, &ftmp)
	for i := 0; i < 32; i++ {
		p256Square(&ftmp, &ftmp)
	} // 2^64 - 2^32
	p256Assign(&e64, &ftmp)
	p256Mul(&ftmp, &ftmp, in) // 2^64 - 2^32 + 2^0
	for i := 0; i < 192; i++ {
		p256Square(&ftmp, &ftmp)
	} // 2^256 - 2^224 + 2^192

	// ftmp now carries the high part of the exponent. ftmp2 is built up
	// separately to carry the low part, 2^96 - 3, reusing the saved powers.
	p256Mul(&ftmp2, &e64, &e32) // 2^64 - 2^0
	for i := 0; i < 16; i++ {
		p256Square(&ftmp2, &ftmp2)
	} // 2^80 - 2^16
	p256Mul(&ftmp2, &ftmp2, &e16) // 2^80 - 2^0
	for i := 0; i < 8; i++ {
		p256Square(&ftmp2, &ftmp2)
	} // 2^88 - 2^8
	p256Mul(&ftmp2, &ftmp2, &e8) // 2^88 - 2^0
	for i := 0; i < 4; i++ {
		p256Square(&ftmp2, &ftmp2)
	} // 2^92 - 2^4
	p256Mul(&ftmp2, &ftmp2, &e4) // 2^92 - 2^0
	p256Square(&ftmp2, &ftmp2)   // 2^93 - 2^1
	p256Square(&ftmp2, &ftmp2)   // 2^94 - 2^2
	p256Mul(&ftmp2, &ftmp2, &e2) // 2^94 - 2^0
	p256Square(&ftmp2, &ftmp2)   // 2^95 - 2^1
	p256Square(&ftmp2, &ftmp2)   // 2^96 - 2^2
	p256Mul(&ftmp2, &ftmp2, in)  // 2^96 - 3

	// Multiplying the two halves adds their exponents.
	p256Mul(out, &ftmp2, &ftmp) // 2^256 - 2^224 + 2^192 + 2^96 - 3
}
 724
 725 // p256Scalar3 sets out=3*out.
 726 //
 727 // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 728 // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 729 func p256Scalar3(out *[p256Limbs]uint32) {
 730         var carry uint32
 731
 732         for i := 0; ; i++ {
 733                 out[i] *= 3
 734                 out[i] += carry
 735                 carry = out[i] >> 29
 736                 out[i] &= bottom29Bits
 737
 738                 i++
 739                 if i == p256Limbs {
 740                         break
 741                 }
 742
 743                 out[i] *= 3
 744                 out[i] += carry
 745                 carry = out[i] >> 28
 746                 out[i] &= bottom28Bits
 747         }
 748
 749         p256ReduceCarry(out, carry)
 750 }
 751
 752 // p256Scalar4 sets out=4*out.
 753 //
 754 // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 755 // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 756 func p256Scalar4(out *[p256Limbs]uint32) {
 757         var carry, nextCarry uint32
 758
 759         for i := 0; ; i++ {
 760                 nextCarry = out[i] >> 27
 761                 out[i] <<= 2
 762                 out[i] &= bottom29Bits
 763                 out[i] += carry
 764                 carry = nextCarry + (out[i] >> 29)
 765                 out[i] &= bottom29Bits
 766
 767                 i++
 768                 if i == p256Limbs {
 769                         break
 770                 }
 771                 nextCarry = out[i] >> 26
 772                 out[i] <<= 2
 773                 out[i] &= bottom28Bits
 774                 out[i] += carry
 775                 carry = nextCarry + (out[i] >> 28)
 776                 out[i] &= bottom28Bits
 777         }
 778
 779         p256ReduceCarry(out, carry)
 780 }
 781
 782 // p256Scalar8 sets out=8*out.
 783 //
 784 // On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 785 // On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 786 func p256Scalar8(out *[p256Limbs]uint32) {
 787         var carry, nextCarry uint32
 788
 789         for i := 0; ; i++ {
 790                 nextCarry = out[i] >> 26
 791                 out[i] <<= 3
 792                 out[i] &= bottom29Bits
 793                 out[i] += carry
 794                 carry = nextCarry + (out[i] >> 29)
 795                 out[i] &= bottom29Bits
 796
 797                 i++
 798                 if i == p256Limbs {
 799                         break
 800                 }
 801                 nextCarry = out[i] >> 25
 802                 out[i] <<= 3
 803                 out[i] &= bottom28Bits
 804                 out[i] += carry
 805                 carry = nextCarry + (out[i] >> 28)
 806                 out[i] &= bottom28Bits
 807         }
 808
 809         p256ReduceCarry(out, carry)
 810 }
 811
 812 // Group operations:
 813 //
 814 // Elements of the elliptic curve group are represented in Jacobian
 815 // coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in
 816 // Jacobian form.
 817
// p256PointDouble sets {xOut,yOut,zOut} = 2*{x,y,z}.
//
// See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
//
// x, y and z are all read for the last time before the first of xOut, yOut
// and zOut is written; the scalar multiplication routines in this file call
// this function with the same arrays as input and output.
//
// NOTE(review): the step comments below assume that p256Sum, p256Diff,
// p256Mul and p256Square (defined outside this section) compute the field
// sum, difference, product and square that their names suggest.
func p256PointDouble(xOut, yOut, zOut, x, y, z *[p256Limbs]uint32) {
	var delta, gamma, alpha, beta, tmp, tmp2 [p256Limbs]uint32

	// delta = z^2, gamma = y^2, beta = x*gamma
	p256Square(&delta, z)
	p256Square(&gamma, y)
	p256Mul(&beta, x, &gamma)

	// alpha = 3*(x+delta)*(x-delta)
	p256Sum(&tmp, x, &delta)
	p256Diff(&tmp2, x, &delta)
	p256Mul(&alpha, &tmp, &tmp2)
	p256Scalar3(&alpha)

	// zOut = (y+z)^2 - gamma - delta
	p256Sum(&tmp, y, z)
	p256Square(&tmp, &tmp)
	p256Diff(&tmp, &tmp, &gamma)
	p256Diff(zOut, &tmp, &delta)

	// xOut = alpha^2 - 8*beta: beta is scaled to 4*beta and subtracted twice.
	p256Scalar4(&beta)
	p256Square(xOut, &alpha)
	p256Diff(xOut, xOut, &beta)
	p256Diff(xOut, xOut, &beta)

	// yOut = alpha*(4*beta - xOut) - 8*gamma^2; beta already holds 4*beta here.
	p256Diff(&tmp, &beta, xOut)
	p256Mul(&tmp, &alpha, &tmp)
	p256Square(&tmp2, &gamma)
	p256Scalar8(&tmp2)
	p256Diff(yOut, &tmp, &tmp2)
}
 849
// p256PointAddMixed sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,1}.
// (i.e. the second point is affine.)
//
// See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
//
// Note that this function does not handle P+P, infinity+P nor P+infinity
// correctly.
//
// y1 is read again after xOut, yOut and zOut have been written, so none of
// the outputs may share storage with y1. The callers visible in this file
// always pass output arrays that are distinct from the inputs.
//
// NOTE(review): the step comments below assume that p256Sum, p256Diff,
// p256Mul and p256Square (defined outside this section) compute the field
// sum, difference, product and square that their names suggest.
func p256PointAddMixed(xOut, yOut, zOut, x1, y1, z1, x2, y2 *[p256Limbs]uint32) {
	var z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32

	// z1z1 = z1^2, tmp = 2*z1
	p256Square(&z1z1, z1)
	p256Sum(&tmp, z1, z1)

	// u2 = x2*z1^2, s2 = y2*z1^3
	p256Mul(&u2, x2, &z1z1)
	p256Mul(&z1z1z1, z1, &z1z1)
	p256Mul(&s2, y2, &z1z1z1)
	// h = u2 - x1, i = (2*h)^2, j = h*i
	p256Diff(&h, &u2, x1)
	p256Sum(&i, &h, &h)
	p256Square(&i, &i)
	p256Mul(&j, &h, &i)
	// r = 2*(s2 - y1), v = x1*i
	p256Diff(&r, &s2, y1)
	p256Sum(&r, &r, &r)
	p256Mul(&v, x1, &i)

	// zOut = 2*z1*h; xOut = r^2 - j - 2*v
	p256Mul(zOut, &tmp, &h)
	p256Square(&rr, &r)
	p256Diff(xOut, &rr, &j)
	p256Diff(xOut, xOut, &v)
	p256Diff(xOut, xOut, &v)

	// yOut = r*(v - xOut) - 2*y1*j; the product y1*j is subtracted twice.
	p256Diff(&tmp, &v, xOut)
	p256Mul(yOut, &tmp, &r)
	p256Mul(&tmp, y1, &j)
	p256Diff(yOut, yOut, &tmp)
	p256Diff(yOut, yOut, &tmp)
}
 886
// p256PointAdd sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,z2}.
//
// See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
//
// Note that this function does not handle P+P, infinity+P nor P+infinity
// correctly.
//
// All six input coordinates are read for the last time before the first
// output (zOut) is written; everything after that works on local temporaries
// and the outputs.
//
// NOTE(review): the step comments below assume that p256Sum, p256Diff,
// p256Mul and p256Square (defined outside this section) compute the field
// sum, difference, product and square that their names suggest.
func p256PointAdd(xOut, yOut, zOut, x1, y1, z1, x2, y2, z2 *[p256Limbs]uint32) {
	var z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32

	// z1z1 = z1^2, z2z2 = z2^2, u1 = x1*z2^2
	p256Square(&z1z1, z1)
	p256Square(&z2z2, z2)
	p256Mul(&u1, x1, &z2z2)

	// tmp = (z1+z2)^2 - z1^2 - z2^2
	p256Sum(&tmp, z1, z2)
	p256Square(&tmp, &tmp)
	p256Diff(&tmp, &tmp, &z1z1)
	p256Diff(&tmp, &tmp, &z2z2)

	// s1 = y1*z2^3
	p256Mul(&z2z2z2, z2, &z2z2)
	p256Mul(&s1, y1, &z2z2z2)

	// u2 = x2*z1^2, s2 = y2*z1^3
	p256Mul(&u2, x2, &z1z1)
	p256Mul(&z1z1z1, z1, &z1z1)
	p256Mul(&s2, y2, &z1z1z1)
	// h = u2 - u1, i = (2*h)^2, j = h*i
	p256Diff(&h, &u2, &u1)
	p256Sum(&i, &h, &h)
	p256Square(&i, &i)
	p256Mul(&j, &h, &i)
	// r = 2*(s2 - s1), v = u1*i
	p256Diff(&r, &s2, &s1)
	p256Sum(&r, &r, &r)
	p256Mul(&v, &u1, &i)

	// zOut = tmp*h; xOut = r^2 - j - 2*v
	p256Mul(zOut, &tmp, &h)
	p256Square(&rr, &r)
	p256Diff(xOut, &rr, &j)
	p256Diff(xOut, xOut, &v)
	p256Diff(xOut, xOut, &v)

	// yOut = r*(v - xOut) - 2*s1*j; the product s1*j is subtracted twice.
	p256Diff(&tmp, &v, xOut)
	p256Mul(yOut, &tmp, &r)
	p256Mul(&tmp, &s1, &j)
	p256Diff(yOut, yOut, &tmp)
	p256Diff(yOut, yOut, &tmp)
}
 931
 932 // p256CopyConditional sets out=in if mask = 0xffffffff in constant time.
 933 //
 934 // On entry: mask is either 0 or 0xffffffff.
 935 func p256CopyConditional(out, in *[p256Limbs]uint32, mask uint32) {
 936         for i := 0; i < p256Limbs; i++ {
 937                 tmp := mask & (in[i] ^ out[i])
 938                 out[i] ^= tmp
 939         }
 940 }
 941
 942 // p256SelectAffinePoint sets {out_x,out_y} to the index'th entry of table.
 943 // On entry: index < 16, table[0] must be zero.
 944 func p256SelectAffinePoint(xOut, yOut *[p256Limbs]uint32, table []uint32, index uint32) {
 945         for i := range xOut {
 946                 xOut[i] = 0
 947         }
 948         for i := range yOut {
 949                 yOut[i] = 0
 950         }
 951
 952         for i := uint32(1); i < 16; i++ {
 953                 mask := i ^ index
 954                 mask |= mask >> 2
 955                 mask |= mask >> 1
 956                 mask &= 1
 957                 mask--
 958                 for j := range xOut {
 959                         xOut[j] |= table[0] & mask
 960                         table = table[1:]
 961                 }
 962                 for j := range yOut {
 963                         yOut[j] |= table[0] & mask
 964                         table = table[1:]
 965                 }
 966         }
 967 }
 968
 969 // p256SelectJacobianPoint sets {out_x,out_y,out_z} to the index'th entry of
 970 // table.
 971 // On entry: index < 16, table[0] must be zero.
 972 func p256SelectJacobianPoint(xOut, yOut, zOut *[p256Limbs]uint32, table *[16][3][p256Limbs]uint32, index uint32) {
 973         for i := range xOut {
 974                 xOut[i] = 0
 975         }
 976         for i := range yOut {
 977                 yOut[i] = 0
 978         }
 979         for i := range zOut {
 980                 zOut[i] = 0
 981         }
 982
 983         // The implicit value at index 0 is all zero. We don't need to perform that
 984         // iteration of the loop because we already set out_* to zero.
 985         for i := uint32(1); i < 16; i++ {
 986                 mask := i ^ index
 987                 mask |= mask >> 2
 988                 mask |= mask >> 1
 989                 mask &= 1
 990                 mask--
 991                 for j := range xOut {
 992                         xOut[j] |= table[i][0][j] & mask
 993                 }
 994                 for j := range yOut {
 995                         yOut[j] |= table[i][1][j] & mask
 996                 }
 997                 for j := range zOut {
 998                         zOut[j] |= table[i][2][j] & mask
 999                 }
1000         }
1001 }
1002
// p256GetBit returns the bit'th bit of scalar as 0 or 1. Bit 0 is the least
// significant bit of scalar[0].
func p256GetBit(scalar *[32]uint8, bit uint) uint32 {
	byteIndex, shift := bit/8, bit%8
	return uint32(scalar[byteIndex]>>shift) & 1
}
1007
1008 // p256ScalarBaseMult sets {xOut,yOut,zOut} = scalar*G where scalar is a
1009 // little-endian number. Note that the value of scalar must be less than the
1010 // order of the group.
1011 func p256ScalarBaseMult(xOut, yOut, zOut *[p256Limbs]uint32, scalar *[32]uint8) {
1012         nIsInfinityMask := ^uint32(0)
1013         var pIsNoninfiniteMask, mask, tableOffset uint32
1014         var px, py, tx, ty, tz [p256Limbs]uint32
1015
1016         for i := range xOut {
1017                 xOut[i] = 0
1018         }
1019         for i := range yOut {
1020                 yOut[i] = 0
1021         }
1022         for i := range zOut {
1023                 zOut[i] = 0
1024         }
1025
1026         // The loop adds bits at positions 0, 64, 128 and 192, followed by
1027         // positions 32,96,160 and 224 and does this 32 times.
1028         for i := uint(0); i < 32; i++ {
1029                 if i != 0 {
1030                         p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
1031                 }
1032                 tableOffset = 0
1033                 for j := uint(0); j <= 32; j += 32 {
1034                         bit0 := p256GetBit(scalar, 31-i+j)
1035                         bit1 := p256GetBit(scalar, 95-i+j)
1036                         bit2 := p256GetBit(scalar, 159-i+j)
1037                         bit3 := p256GetBit(scalar, 223-i+j)
1038                         index := bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3)
1039
1040                         p256SelectAffinePoint(&px, &py, p256Precomputed[tableOffset:], index)
1041                         tableOffset += 30 * p256Limbs
1042
1043                         // Since scalar is less than the order of the group, we know that
1044                         // {xOut,yOut,zOut} != {px,py,1}, unless both are zero, which we handle
1045                         // below.
1046                         p256PointAddMixed(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py)
1047                         // The result of pointAddMixed is incorrect if {xOut,yOut,zOut} is zero
1048                         // (a.k.a.  the point at infinity). We handle that situation by
1049                         // copying the point from the table.
1050                         p256CopyConditional(xOut, &px, nIsInfinityMask)
1051                         p256CopyConditional(yOut, &py, nIsInfinityMask)
1052                         p256CopyConditional(zOut, &p256One, nIsInfinityMask)
1053
1054                         // Equally, the result is also wrong if the point from the table is
1055                         // zero, which happens when the index is zero. We handle that by
1056                         // only copying from {tx,ty,tz} to {xOut,yOut,zOut} if index != 0.
1057                         pIsNoninfiniteMask = nonZeroToAllOnes(index)
1058                         mask = pIsNoninfiniteMask & ^nIsInfinityMask
1059                         p256CopyConditional(xOut, &tx, mask)
1060                         p256CopyConditional(yOut, &ty, mask)
1061                         p256CopyConditional(zOut, &tz, mask)
1062                         // If p was not zero, then n is now non-zero.
1063                         nIsInfinityMask &^= pIsNoninfiniteMask
1064                 }
1065         }
1066 }
1067
1068 // p256PointToAffine converts a Jacobian point to an affine point. If the input
1069 // is the point at infinity then it returns (0, 0) in constant time.
1070 func p256PointToAffine(xOut, yOut, x, y, z *[p256Limbs]uint32) {
1071         var zInv, zInvSq [p256Limbs]uint32
1072
1073         p256Invert(&zInv, z)
1074         p256Square(&zInvSq, &zInv)
1075         p256Mul(xOut, x, &zInvSq)
1076         p256Mul(&zInv, &zInv, &zInvSq)
1077         p256Mul(yOut, y, &zInv)
1078 }
1079
1080 // p256ToAffine returns a pair of *big.Int containing the affine representation
1081 // of {x,y,z}.
1082 func p256ToAffine(x, y, z *[p256Limbs]uint32) (xOut, yOut *big.Int) {
1083         var xx, yy [p256Limbs]uint32
1084         p256PointToAffine(&xx, &yy, x, y, z)
1085         return p256ToBig(&xx), p256ToBig(&yy)
1086 }
1087
1088 // p256ScalarMult sets {xOut,yOut,zOut} = scalar*{x,y}.
1089 func p256ScalarMult(xOut, yOut, zOut, x, y *[p256Limbs]uint32, scalar *[32]uint8) {
1090         var px, py, pz, tx, ty, tz [p256Limbs]uint32
1091         var precomp [16][3][p256Limbs]uint32
1092         var nIsInfinityMask, index, pIsNoninfiniteMask, mask uint32
1093
1094         // We precompute 0,1,2,... times {x,y}.
1095         precomp[1][0] = *x
1096         precomp[1][1] = *y
1097         precomp[1][2] = p256One
1098
1099         for i := 2; i < 16; i += 2 {
1100                 p256PointDouble(&precomp[i][0], &precomp[i][1], &precomp[i][2], &precomp[i/2][0], &precomp[i/2][1], &precomp[i/2][2])
1101                 p256PointAddMixed(&precomp[i+1][0], &precomp[i+1][1], &precomp[i+1][2], &precomp[i][0], &precomp[i][1], &precomp[i][2], x, y)
1102         }
1103
1104         for i := range xOut {
1105                 xOut[i] = 0
1106         }
1107         for i := range yOut {
1108                 yOut[i] = 0
1109         }
1110         for i := range zOut {
1111                 zOut[i] = 0
1112         }
1113         nIsInfinityMask = ^uint32(0)
1114
1115         // We add in a window of four bits each iteration and do this 64 times.
1116         for i := 0; i < 64; i++ {
1117                 if i != 0 {
1118                         p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
1119                         p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
1120                         p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
1121                         p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
1122                 }
1123
1124                 index = uint32(scalar[31-i/2])
1125                 if (i & 1) == 1 {
1126                         index &= 15
1127                 } else {
1128                         index >>= 4
1129                 }
1130
1131                 // See the comments in scalarBaseMult about handling infinities.
1132                 p256SelectJacobianPoint(&px, &py, &pz, &precomp, index)
1133                 p256PointAdd(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py, &pz)
1134                 p256CopyConditional(xOut, &px, nIsInfinityMask)
1135                 p256CopyConditional(yOut, &py, nIsInfinityMask)
1136                 p256CopyConditional(zOut, &pz, nIsInfinityMask)
1137
1138                 pIsNoninfiniteMask = nonZeroToAllOnes(index)
1139                 mask = pIsNoninfiniteMask & ^nIsInfinityMask
1140                 p256CopyConditional(xOut, &tx, mask)
1141                 p256CopyConditional(yOut, &ty, mask)
1142                 p256CopyConditional(zOut, &tz, mask)
1143                 nIsInfinityMask &^= pIsNoninfiniteMask
1144         }
1145 }
1146
1147 // p256FromBig sets out = R*in.
1148 func p256FromBig(out *[p256Limbs]uint32, in *big.Int) {
1149         tmp := new(big.Int).Lsh(in, 257)
1150         tmp.Mod(tmp, p256Params.P)
1151
1152         for i := 0; i < p256Limbs; i++ {
1153                 if bits := tmp.Bits(); len(bits) > 0 {
1154                         out[i] = uint32(bits[0]) & bottom29Bits
1155                 } else {
1156                         out[i] = 0
1157                 }
1158                 tmp.Rsh(tmp, 29)
1159
1160                 i++
1161                 if i == p256Limbs {
1162                         break
1163                 }
1164
1165                 if bits := tmp.Bits(); len(bits) > 0 {
1166                         out[i] = uint32(bits[0]) & bottom28Bits
1167                 } else {
1168                         out[i] = 0
1169                 }
1170                 tmp.Rsh(tmp, 28)
1171         }
1172 }
1173
1174 // p256ToBig returns a *big.Int containing the value of in.
1175 func p256ToBig(in *[p256Limbs]uint32) *big.Int {
1176         result, tmp := new(big.Int), new(big.Int)
1177
1178         result.SetInt64(int64(in[p256Limbs-1]))
1179         for i := p256Limbs - 2; i >= 0; i-- {
1180                 if (i & 1) == 0 {
1181                         result.Lsh(result, 29)
1182                 } else {
1183                         result.Lsh(result, 28)
1184                 }
1185                 tmp.SetInt64(int64(in[i]))
1186                 result.Add(result, tmp)
1187         }
1188
1189         result.Mul(result, p256RInverse)
1190         result.Mod(result, p256Params.P)
1191         return result
1192 }